From 8798ea68b0a34e7ecae43f4ccaf7a446697c51c8 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Wed, 10 Oct 2018 13:39:59 +0000
Subject: [PATCH 0001/1116] [x86] allow single source horizontal op matching
 (PR39195)

This is intended to restore horizontal codegen to what it looked like before IR demanded elements improved in:
rL343727

As noted in PR39195:
https://bugs.llvm.org/show_bug.cgi?id=39195
...horizontal ops can be worse for performance than a shuffle+regular binop, so I've added a TODO. Ideally, we'd
solve that in a machine instruction pass, but a quicker solution will be adding a 'HasFastHorizontalOp' feature
bit to deal with it here in the DAG.

Differential Revision: https://reviews.llvm.org/D52997


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344141 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp            |  8 +-
 .../X86/avx512-intrinsics-fast-isel.ll        | 12 +--
 test/CodeGen/X86/haddsub-undef.ll             | 71 ++++----------
 test/CodeGen/X86/phaddsub.ll                  | 96 +++++--------------
 test/CodeGen/X86/vector-shuffle-combining.ll  | 39 +++++---
 5 files changed, 79 insertions(+), 147 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 4c18c5a84c2..67f98d8ee72 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -37026,9 +37026,13 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
         continue;
 
       // The  low half of the 128-bit result must choose from A.
-      // The high half of the 128-bit result must choose from B.
+      // The high half of the 128-bit result must choose from B,
+      // unless B is undef. In that case, we are always choosing from A.
+      // TODO: Using a horizontal op on a single input is likely worse for
+      // performance on many CPUs, so this should be limited here or reversed
+      // in a later pass.
       unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
-      unsigned Src = i >= NumEltsPer64BitChunk;
+      unsigned Src = B.getNode() ? i >= NumEltsPer64BitChunk : 0;
 
       // Check that successive elements are being operated on. If not, this is
       // not a horizontal operation.
diff --git a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
index f889bb90550..20c509732c8 100644
--- a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -7210,8 +7210,7 @@ define double @test_mm512_reduce_add_pd(<8 x double> %__W) {
 ; X86-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X86-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
-; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; X86-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; X86-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; X86-NEXT:    vmovlpd %xmm0, (%esp)
 ; X86-NEXT:    fldl (%esp)
 ; X86-NEXT:    movl %ebp, %esp
@@ -7226,8 +7225,7 @@ define double @test_mm512_reduce_add_pd(<8 x double> %__W) {
 ; X64-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
-; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; X64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; X64-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
 entry:
@@ -7407,8 +7405,7 @@ define double @test_mm512_mask_reduce_add_pd(i8 zeroext %__M, <8 x double> %__W)
 ; X86-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X86-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
-; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; X86-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; X86-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; X86-NEXT:    vmovlpd %xmm0, (%esp)
 ; X86-NEXT:    fldl (%esp)
 ; X86-NEXT:    movl %ebp, %esp
@@ -7425,8 +7422,7 @@ define double @test_mm512_mask_reduce_add_pd(i8 zeroext %__M, <8 x double> %__W)
 ; X64-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
-; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; X64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; X64-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
 entry:
diff --git a/test/CodeGen/X86/haddsub-undef.ll b/test/CodeGen/X86/haddsub-undef.ll
index 84decabcbce..d7c0936a474 100644
--- a/test/CodeGen/X86/haddsub-undef.ll
+++ b/test/CodeGen/X86/haddsub-undef.ll
@@ -453,14 +453,12 @@ define <8 x i32> @test17_undef(<8 x i32> %a, <8 x i32> %b) {
 define <2 x double> @add_pd_003(<2 x double> %x) {
 ; SSE-LABEL: add_pd_003:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movddup {{.*#+}} xmm1 = xmm0[0,0]
-; SSE-NEXT:    addpd %xmm1, %xmm0
+; SSE-NEXT:    haddpd %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: add_pd_003:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
-; AVX-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 undef, i32 0>
   %add = fadd <2 x double> %l, %x
@@ -472,16 +470,12 @@ define <2 x double> @add_pd_003(<2 x double> %x) {
 define <2 x double> @add_pd_003_2(<2 x double> %x) {
 ; SSE-LABEL: add_pd_003_2:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movapd %xmm0, %xmm1
-; SSE-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
-; SSE-NEXT:    addpd %xmm0, %xmm1
-; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    haddpd %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: add_pd_003_2:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 0>
   %add = fadd <2 x double> %l, %x
@@ -491,16 +485,12 @@ define <2 x double> @add_pd_003_2(<2 x double> %x) {
 define <2 x double> @add_pd_010(<2 x double> %x) {
 ; SSE-LABEL: add_pd_010:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movddup {{.*#+}} xmm1 = xmm0[0,0]
-; SSE-NEXT:    addpd %xmm0, %xmm1
-; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
-; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    haddpd %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: add_pd_010:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
-; AVX-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX-NEXT:    retq
   %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 undef, i32 0>
@@ -512,17 +502,12 @@ define <2 x double> @add_pd_010(<2 x double> %x) {
 define <4 x float> @add_ps_007(<4 x float> %x) {
 ; SSE-LABEL: add_ps_007:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movaps %xmm0, %xmm1
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSE-NEXT:    addps %xmm1, %xmm0
+; SSE-NEXT:    haddps %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: add_ps_007:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,2]
-; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
   %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
@@ -533,18 +518,13 @@ define <4 x float> @add_ps_007(<4 x float> %x) {
 define <4 x float> @add_ps_030(<4 x float> %x) {
 ; SSE-LABEL: add_ps_030:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movaps %xmm0, %xmm1
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSE-NEXT:    addps %xmm1, %xmm0
+; SSE-NEXT:    haddps %xmm0, %xmm0
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,2,3]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: add_ps_030:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,2]
-; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,2,3]
 ; AVX-NEXT:    retq
   %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
@@ -557,16 +537,12 @@ define <4 x float> @add_ps_030(<4 x float> %x) {
 define <4 x float> @add_ps_007_2(<4 x float> %x) {
 ; SSE-LABEL: add_ps_007_2:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movddup {{.*#+}} xmm1 = xmm0[0,0]
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSE-NEXT:    addps %xmm1, %xmm0
+; SSE-NEXT:    haddps %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: add_ps_007_2:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
-; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
   %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
@@ -577,14 +553,12 @@ define <4 x float> @add_ps_007_2(<4 x float> %x) {
 define <4 x float> @add_ps_008(<4 x float> %x) {
 ; SSE-LABEL: add_ps_008:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE-NEXT:    addps %xmm1, %xmm0
+; SSE-NEXT:    haddps %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: add_ps_008:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
   %add = fadd <4 x float> %l, %x
@@ -594,16 +568,13 @@ define <4 x float> @add_ps_008(<4 x float> %x) {
 define <4 x float> @add_ps_017(<4 x float> %x) {
 ; SSE-LABEL: add_ps_017:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE-NEXT:    addps %xmm0, %xmm1
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    haddps %xmm0, %xmm0
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: add_ps_017:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX-NEXT:    retq
   %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
@@ -615,17 +586,13 @@ define <4 x float> @add_ps_017(<4 x float> %x) {
 define <4 x float> @add_ps_018(<4 x float> %x) {
 ; SSE-LABEL: add_ps_018:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movddup {{.*#+}} xmm1 = xmm0[0,0]
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSE-NEXT:    addps %xmm1, %xmm0
+; SSE-NEXT:    haddps %xmm0, %xmm0
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: add_ps_018:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
-; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; AVX-NEXT:    retq
   %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
diff --git a/test/CodeGen/X86/phaddsub.ll b/test/CodeGen/X86/phaddsub.ll
index 5d7c77b9a81..7b3f8db76c4 100644
--- a/test/CodeGen/X86/phaddsub.ll
+++ b/test/CodeGen/X86/phaddsub.ll
@@ -286,16 +286,12 @@ define <4 x i32> @phsubd1_reverse(<4 x i32> %x, <4 x i32> %y) {
 define <4 x i32> @phaddd_single_source1(<4 x i32> %x) {
 ; SSSE3-LABEL: phaddd_single_source1:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,0,2]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSSE3-NEXT:    paddd %xmm1, %xmm0
+; SSSE3-NEXT:    phaddd %xmm0, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: phaddd_single_source1:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,2]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
   %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
@@ -306,17 +302,13 @@ define <4 x i32> @phaddd_single_source1(<4 x i32> %x) {
 define <4 x i32> @phaddd_single_source2(<4 x i32> %x) {
 ; SSSE3-LABEL: phaddd_single_source2:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,0,2]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSSE3-NEXT:    paddd %xmm1, %xmm0
+; SSSE3-NEXT:    phaddd %xmm0, %xmm0
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: phaddd_single_source2:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,2]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
 ; AVX-NEXT:    retq
   %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
@@ -329,16 +321,12 @@ define <4 x i32> @phaddd_single_source2(<4 x i32> %x) {
 define <4 x i32> @phaddd_single_source3(<4 x i32> %x) {
 ; SSSE3-LABEL: phaddd_single_source3:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSSE3-NEXT:    paddd %xmm1, %xmm0
+; SSSE3-NEXT:    phaddd %xmm0, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: phaddd_single_source3:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
-; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
   %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
@@ -349,14 +337,12 @@ define <4 x i32> @phaddd_single_source3(<4 x i32> %x) {
 define <4 x i32> @phaddd_single_source4(<4 x i32> %x) {
 ; SSSE3-LABEL: phaddd_single_source4:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,2,2]
-; SSSE3-NEXT:    paddd %xmm1, %xmm0
+; SSSE3-NEXT:    phaddd %xmm0, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: phaddd_single_source4:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,2,2]
-; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
   %add = add <4 x i32> %l, %x
@@ -366,15 +352,13 @@ define <4 x i32> @phaddd_single_source4(<4 x i32> %x) {
 define <4 x i32> @phaddd_single_source5(<4 x i32> %x) {
 ; SSSE3-LABEL: phaddd_single_source5:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,2,2]
-; SSSE3-NEXT:    paddd %xmm0, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
+; SSSE3-NEXT:    phaddd %xmm0, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: phaddd_single_source5:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,2,2]
-; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX-NEXT:    retq
   %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
@@ -386,17 +370,13 @@ define <4 x i32> @phaddd_single_source5(<4 x i32> %x) {
 define <4 x i32> @phaddd_single_source6(<4 x i32> %x) {
 ; SSSE3-LABEL: phaddd_single_source6:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSSE3-NEXT:    paddd %xmm1, %xmm0
+; SSSE3-NEXT:    phaddd %xmm0, %xmm0
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: phaddd_single_source6:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
-; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
 ; AVX-NEXT:    retq
   %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
@@ -409,17 +389,12 @@ define <4 x i32> @phaddd_single_source6(<4 x i32> %x) {
 define <8 x i16> @phaddw_single_source1(<8 x i16> %x) {
 ; SSSE3-LABEL: phaddw_single_source1:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13]
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15]
-; SSSE3-NEXT:    paddw %xmm1, %xmm0
+; SSSE3-NEXT:    phaddw %xmm0, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: phaddw_single_source1:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15]
-; AVX-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 4, i32 6>
   %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7>
@@ -430,22 +405,14 @@ define <8 x i16> @phaddw_single_source1(<8 x i16> %x) {
 define <8 x i16> @phaddw_single_source2(<8 x i16> %x) {
 ; SSSE3-LABEL: phaddw_single_source2:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
-; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; SSSE3-NEXT:    paddw %xmm1, %xmm0
+; SSSE3-NEXT:    phaddw %xmm0, %xmm0
 ; SSSE3-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: phaddw_single_source2:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; AVX-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
 ; AVX-NEXT:    retq
@@ -459,20 +426,12 @@ define <8 x i16> @phaddw_single_source2(<8 x i16> %x) {
 define <8 x i16> @phaddw_single_source3(<8 x i16> %x) {
 ; SSSE3-LABEL: phaddw_single_source3:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
-; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; SSSE3-NEXT:    paddw %xmm1, %xmm0
+; SSSE3-NEXT:    phaddw %xmm0, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: phaddw_single_source3:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; AVX-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 undef, i32 undef>
   %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 undef, i32 undef>
@@ -483,16 +442,12 @@ define <8 x i16> @phaddw_single_source3(<8 x i16> %x) {
 define <8 x i16> @phaddw_single_source4(<8 x i16> %x) {
 ; SSSE3-LABEL: phaddw_single_source4:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSSE3-NEXT:    pslld $16, %xmm1
-; SSSE3-NEXT:    paddw %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    phaddw %xmm0, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: phaddw_single_source4:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpslld $16, %xmm0, %xmm1
-; AVX-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 6>
   %add = add <8 x i16> %l, %x
@@ -502,18 +457,13 @@ define <8 x i16> @phaddw_single_source4(<8 x i16> %x) {
 define <8 x i16> @phaddw_single_source6(<8 x i16> %x) {
 ; SSSE3-LABEL: phaddw_single_source6:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; SSSE3-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
-; SSSE3-NEXT:    paddw %xmm1, %xmm0
+; SSSE3-NEXT:    phaddw %xmm0, %xmm0
 ; SSSE3-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: phaddw_single_source6:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
-; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX-NEXT:    retq
   %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 undef, i32 undef, i32 undef>
diff --git a/test/CodeGen/X86/vector-shuffle-combining.ll b/test/CodeGen/X86/vector-shuffle-combining.ll
index 2eb9362947e..5c0a223d496 100644
--- a/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -2700,21 +2700,36 @@ define <4 x i32> @combine_constant_insertion_v4i32(i32 %f) {
 }
 
 define <4 x float> @PR22377(<4 x float> %a, <4 x float> %b) {
-; SSE-LABEL: PR22377:
-; SSE:       # %bb.0: # %entry
-; SSE-NEXT:    movaps %xmm0, %xmm1
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[1,3]
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
-; SSE-NEXT:    addps %xmm0, %xmm1
-; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT:    retq
+; SSE2-LABEL: PR22377:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[1,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
+; SSE2-NEXT:    addps %xmm0, %xmm1
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: PR22377:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movaps %xmm0, %xmm1
+; SSSE3-NEXT:    haddps %xmm0, %xmm1
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: PR22377:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    movaps %xmm0, %xmm1
+; SSE41-NEXT:    haddps %xmm0, %xmm1
+; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
+; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: PR22377:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,3,1,3]
-; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2]
-; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm1
-; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm1
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; AVX-NEXT:    retq
 entry:
   %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 1, i32 3>
-- 
GitLab


From df002d74e3e6588a8ffb2aa8459369696b2611c4 Mon Sep 17 00:00:00 2001
From: Nirav Dave <niravd@google.com>
Date: Wed, 10 Oct 2018 14:15:52 +0000
Subject: [PATCH 0002/1116] [DAGCombine] Improve Load-Store Forwarding

Summary:
Extend analysis forwarding loads from preceeding stores to work with
extended loads and truncated stores to the same address so long as the
load is fully subsumed by the store.

Hexagon's swp-epilog-phis.ll and swp-memrefs-epilog1.ll test are
deleted as they've no longer seem to be relevant.

Reviewers: RKSimon, rnk, kparzysz, javed.absar

Subscribers: sdardis, nemanjai, hiraditya, atanasyan, llvm-commits

Differential Revision: https://reviews.llvm.org/D49200

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344142 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp      | 145 ++++++++++++++++--
 test/CodeGen/AArch64/arm64-ld-from-st.ll      |  28 ++--
 test/CodeGen/AArch64/regress-tblgen-chains.ll |   4 +-
 test/CodeGen/Hexagon/clr_set_toggle.ll        |   2 +-
 test/CodeGen/Hexagon/swp-epilog-phis.ll       |  55 -------
 test/CodeGen/Hexagon/swp-memrefs-epilog1.ll   |  90 -----------
 test/CodeGen/Mips/cconv/vector.ll             |  22 +--
 .../Mips/indirect-jump-hazard/jumptables.ll   |  22 ++-
 test/CodeGen/Mips/o32_cc_byval.ll             |   8 +-
 test/CodeGen/Mips/o32_cc_vararg.ll            |  10 +-
 test/CodeGen/PowerPC/addi-offset-fold.ll      |   5 +-
 .../SystemZ/store_nonbytesized_vecs.ll        |   3 +-
 test/CodeGen/X86/i386-shrink-wrapping.ll      |   2 +-
 test/CodeGen/X86/pr32108.ll                   |   1 -
 test/CodeGen/X86/pr38533.ll                   |   6 -
 test/CodeGen/X86/win64_vararg.ll              |   5 +-
 16 files changed, 184 insertions(+), 224 deletions(-)
 delete mode 100644 test/CodeGen/Hexagon/swp-epilog-phis.ll
 delete mode 100644 test/CodeGen/Hexagon/swp-memrefs-epilog1.ll

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 29adcad22e1..eca5d8369eb 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -250,6 +250,11 @@ namespace {
     SDValue SplitIndexingFromLoad(LoadSDNode *LD);
     bool SliceUpLoad(SDNode *N);
 
+    // Scalars have size 0 to distinguish from singleton vectors.
+    SDValue ForwardStoreValueToDirectLoad(LoadSDNode *LD);
+    bool getTruncatedStoreValue(StoreSDNode *ST, SDValue &Val);
+    bool extendLoadedValueToExtension(LoadSDNode *LD, SDValue &Val);
+
     /// Replace an ISD::EXTRACT_VECTOR_ELT of a load with a narrowed
     ///   load.
     ///
@@ -12762,6 +12767,133 @@ SDValue DAGCombiner::SplitIndexingFromLoad(LoadSDNode *LD) {
   return DAG.getNode(Opc, SDLoc(LD), BP.getSimpleValueType(), BP, Inc);
 }
 
+static inline int numVectorEltsOrZero(EVT T) {
+  return T.isVector() ? T.getVectorNumElements() : 0;
+}
+
+bool DAGCombiner::getTruncatedStoreValue(StoreSDNode *ST, SDValue &Val) {
+  Val = ST->getValue();
+  EVT STType = Val.getValueType();
+  EVT STMemType = ST->getMemoryVT();
+  if (STType == STMemType)
+    return true;
+  if (isTypeLegal(STMemType))
+    return false; // fail.
+  if (STType.isFloatingPoint() && STMemType.isFloatingPoint() &&
+      TLI.isOperationLegal(ISD::FTRUNC, STMemType)) {
+    Val = DAG.getNode(ISD::FTRUNC, SDLoc(ST), STMemType, Val);
+    return true;
+  }
+  if (numVectorEltsOrZero(STType) == numVectorEltsOrZero(STMemType) &&
+      STType.isInteger() && STMemType.isInteger()) {
+    Val = DAG.getNode(ISD::TRUNCATE, SDLoc(ST), STMemType, Val);
+    return true;
+  }
+  if (STType.getSizeInBits() == STMemType.getSizeInBits()) {
+    Val = DAG.getBitcast(STMemType, Val);
+    return true;
+  }
+  return false; // fail.
+}
+
+bool DAGCombiner::extendLoadedValueToExtension(LoadSDNode *LD, SDValue &Val) {
+  EVT LDMemType = LD->getMemoryVT();
+  EVT LDType = LD->getValueType(0);
+  assert(Val.getValueType() == LDMemType &&
+         "Attempting to extend value of non-matching type");
+  if (LDType == LDMemType)
+    return true;
+  if (LDMemType.isInteger() && LDType.isInteger()) {
+    switch (LD->getExtensionType()) {
+    case ISD::NON_EXTLOAD:
+      Val = DAG.getBitcast(LDType, Val);
+      return true;
+    case ISD::EXTLOAD:
+      Val = DAG.getNode(ISD::ANY_EXTEND, SDLoc(LD), LDType, Val);
+      return true;
+    case ISD::SEXTLOAD:
+      Val = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(LD), LDType, Val);
+      return true;
+    case ISD::ZEXTLOAD:
+      Val = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(LD), LDType, Val);
+      return true;
+    }
+  }
+  return false;
+}
+
+SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) {
+  if (OptLevel == CodeGenOpt::None || LD->isVolatile())
+    return SDValue();
+  SDValue Chain = LD->getOperand(0);
+  StoreSDNode *ST = dyn_cast<StoreSDNode>(Chain.getNode());
+  if (!ST || ST->isVolatile())
+    return SDValue();
+
+  EVT LDType = LD->getValueType(0);
+  EVT LDMemType = LD->getMemoryVT();
+  EVT STMemType = ST->getMemoryVT();
+  EVT STType = ST->getValue().getValueType();
+
+  BaseIndexOffset BasePtrLD = BaseIndexOffset::match(LD, DAG);
+  BaseIndexOffset BasePtrST = BaseIndexOffset::match(ST, DAG);
+  int64_t Offset;
+
+  bool STCoversLD =
+      BasePtrST.equalBaseIndex(BasePtrLD, DAG, Offset) && (Offset >= 0) &&
+      (Offset * 8 <= LDMemType.getSizeInBits()) &&
+      (Offset * 8 + LDMemType.getSizeInBits() <= STMemType.getSizeInBits());
+
+  if (!STCoversLD)
+    return SDValue();
+
+  // Memory as copy space (potentially masked).
+  if (Offset == 0 && LDType == STType && STMemType == LDMemType) {
+    // Simple case: Direct non-truncating forwarding
+    if (LDType.getSizeInBits() == LDMemType.getSizeInBits())
+      return CombineTo(LD, ST->getValue(), Chain);
+    // Can we model the truncate and extension with an and mask?
+    if (STType.isInteger() && LDMemType.isInteger() && !STType.isVector() &&
+        !LDMemType.isVector() && LD->getExtensionType() != ISD::SEXTLOAD) {
+      // Mask to size of LDMemType
+      auto Mask =
+          DAG.getConstant(APInt::getLowBitsSet(STType.getSizeInBits(),
+                                               STMemType.getSizeInBits()),
+                          SDLoc(ST), STType);
+      auto Val = DAG.getNode(ISD::AND, SDLoc(LD), LDType, ST->getValue(), Mask);
+      return CombineTo(LD, Val, Chain);
+    }
+  }
+
+  // TODO: Deal with nonzero offset.
+  if (LD->getBasePtr().isUndef() || Offset != 0)
+    return SDValue();
+  // Model necessary truncations / extenstions.
+  SDValue Val;
+  // Truncate Value To Stored Memory Size.
+  do {
+    if (!getTruncatedStoreValue(ST, Val))
+      continue;
+    if (!isTypeLegal(LDMemType))
+      continue;
+    if (STMemType != LDMemType) {
+      if (numVectorEltsOrZero(STMemType) == numVectorEltsOrZero(LDMemType) &&
+          STMemType.isInteger() && LDMemType.isInteger())
+        Val = DAG.getNode(ISD::TRUNCATE, SDLoc(LD), LDMemType, Val);
+      else
+        continue;
+    }
+    if (!extendLoadedValueToExtension(LD, Val))
+      continue;
+    return CombineTo(LD, Val, Chain);
+  } while (false);
+
+  // On failure, cleanup dead nodes we may have created.
+  if (Val->use_empty())
+    deleteAndRecombine(Val.getNode());
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitLOAD(SDNode *N) {
   LoadSDNode *LD  = cast<LoadSDNode>(N);
   SDValue Chain = LD->getChain();
@@ -12828,17 +12960,8 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
 
   // If this load is directly stored, replace the load value with the stored
   // value.
-  // TODO: Handle store large -> read small portion.
-  // TODO: Handle TRUNCSTORE/LOADEXT
-  if (OptLevel != CodeGenOpt::None &&
-      ISD::isNormalLoad(N) && !LD->isVolatile()) {
-    if (ISD::isNON_TRUNCStore(Chain.getNode())) {
-      StoreSDNode *PrevST = cast<StoreSDNode>(Chain);
-      if (PrevST->getBasePtr() == Ptr &&
-          PrevST->getValue().getValueType() == N->getValueType(0))
-        return CombineTo(N, PrevST->getOperand(1), Chain);
-    }
-  }
+  if (auto V = ForwardStoreValueToDirectLoad(LD))
+    return V;
 
   // Try to infer better alignment information than the load already has.
   if (OptLevel != CodeGenOpt::None && LD->isUnindexed()) {
diff --git a/test/CodeGen/AArch64/arm64-ld-from-st.ll b/test/CodeGen/AArch64/arm64-ld-from-st.ll
index dd8add70cdb..5488c21fa29 100644
--- a/test/CodeGen/AArch64/arm64-ld-from-st.ll
+++ b/test/CodeGen/AArch64/arm64-ld-from-st.ll
@@ -13,7 +13,7 @@ entry:
 }
 
 ; CHECK-LABEL: Str64Ldr32_0
-; CHECK: and x0, x1, #0xffffffff
+; CHECK: mov w0, w1
 define i32 @Str64Ldr32_0(i64* nocapture %P, i64 %v, i64 %n) {
 entry:
   %0 = bitcast i64* %P to i32*
@@ -37,7 +37,7 @@ entry:
 }
 
 ; CHECK-LABEL: Str64Ldr16_0
-; CHECK: and x0, x1, #0xffff
+; CHECK: mov w0, w1
 define i16 @Str64Ldr16_0(i64* nocapture %P, i64 %v, i64 %n) {
 entry:
   %0 = bitcast i64* %P to i16*
@@ -85,7 +85,7 @@ entry:
 }
 
 ; CHECK-LABEL: Str64Ldr8_0
-; CHECK: and x0, x1, #0xff
+; CHECK: mov w0, w1
 define i8 @Str64Ldr8_0(i64* nocapture %P, i64 %v, i64 %n) {
 entry:
   %0 = bitcast i64* %P to i8*
@@ -193,7 +193,7 @@ entry:
 }
 
 ; CHECK-LABEL: Str32Ldr16_0
-; CHECK: and w0, w1, #0xffff
+; CHECK: mov w0, w1
 define i16 @Str32Ldr16_0(i32* nocapture %P, i32 %v, i64 %n) {
 entry:
   %0 = bitcast i32* %P to i16*
@@ -217,7 +217,7 @@ entry:
 }
 
 ; CHECK-LABEL: Str32Ldr8_0
-; CHECK: and w0, w1, #0xff
+; CHECK: mov w0, w1
 define i8 @Str32Ldr8_0(i32* nocapture %P, i32 %v, i64 %n) {
 entry:
   %0 = bitcast i32* %P to i8*
@@ -265,7 +265,7 @@ entry:
 }
 
 ; CHECK-LABEL: Str16Ldr16
-; CHECK: and w0, w1, #0xffff
+; CHECK: mov w0, w1
 define i16 @Str16Ldr16(i16* nocapture %P, i16 %v, i64 %n) {
 entry:
   %0 = bitcast i16* %P to i16*
@@ -277,7 +277,7 @@ entry:
 }
 
 ; CHECK-LABEL: Str16Ldr8_0
-; CHECK: and w0, w1, #0xff
+; CHECK: mov w0, w1
 define i8 @Str16Ldr8_0(i16* nocapture %P, i16 %v, i64 %n) {
 entry:
   %0 = bitcast i16* %P to i8*
@@ -314,7 +314,7 @@ entry:
 }
 
 ; CHECK-LABEL: Unscaled_Str64Ldr32_0
-; CHECK: and x0, x1, #0xffffffff
+; CHECK: mov w0, w1
 define i32 @Unscaled_Str64Ldr32_0(i64* nocapture %P, i64 %v, i64 %n) {
 entry:
   %0 = bitcast i64* %P to i32*
@@ -338,7 +338,7 @@ entry:
 }
 
 ; CHECK-LABEL: Unscaled_Str64Ldr16_0
-; CHECK: and x0, x1, #0xffff
+; CHECK: mov w0, w1
 define i16 @Unscaled_Str64Ldr16_0(i64* nocapture %P, i64 %v, i64 %n) {
 entry:
   %0 = bitcast i64* %P to i16*
@@ -386,7 +386,7 @@ entry:
 }
 
 ; CHECK-LABEL: Unscaled_Str64Ldr8_0
-; CHECK: and x0, x1, #0xff
+; CHECK: mov w0, w1
 define i8 @Unscaled_Str64Ldr8_0(i64* nocapture %P, i64 %v, i64 %n) {
 entry:
   %0 = bitcast i64* %P to i8*
@@ -494,7 +494,7 @@ entry:
 }
 
 ; CHECK-LABEL: Unscaled_Str32Ldr16_0
-; CHECK: and w0, w1, #0xffff
+; CHECK: mov w0, w1
 define i16 @Unscaled_Str32Ldr16_0(i32* nocapture %P, i32 %v, i64 %n) {
 entry:
   %0 = bitcast i32* %P to i16*
@@ -518,7 +518,7 @@ entry:
 }
 
 ; CHECK-LABEL: Unscaled_Str32Ldr8_0
-; CHECK: and w0, w1, #0xff
+; CHECK: mov w0, w1
 define i8 @Unscaled_Str32Ldr8_0(i32* nocapture %P, i32 %v, i64 %n) {
 entry:
   %0 = bitcast i32* %P to i8*
@@ -566,7 +566,7 @@ entry:
 }
 
 ; CHECK-LABEL: Unscaled_Str16Ldr16
-; CHECK: and w0, w1, #0xffff
+; CHECK: mov w0, w1
 define i16 @Unscaled_Str16Ldr16(i16* nocapture %P, i16 %v, i64 %n) {
 entry:
   %0 = bitcast i16* %P to i16*
@@ -578,7 +578,7 @@ entry:
 }
 
 ; CHECK-LABEL: Unscaled_Str16Ldr8_0
-; CHECK: and w0, w1, #0xff
+; CHECK: mov w0, w1
 define i8 @Unscaled_Str16Ldr8_0(i16* nocapture %P, i16 %v, i64 %n) {
 entry:
   %0 = bitcast i16* %P to i8*
diff --git a/test/CodeGen/AArch64/regress-tblgen-chains.ll b/test/CodeGen/AArch64/regress-tblgen-chains.ll
index 24038cda507..50da7d139f1 100644
--- a/test/CodeGen/AArch64/regress-tblgen-chains.ll
+++ b/test/CodeGen/AArch64/regress-tblgen-chains.ll
@@ -26,9 +26,9 @@ define i64 @test_chains() {
   store i8 %inc.4, i8* %locvar
 
 ; CHECK: ldurb {{w[0-9]+}}, [x29, [[LOCADDR:#-?[0-9]+]]]
-; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, #1
+; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #1
 ; CHECK: sturb w[[STRVAL:[0-9]+]], [x29, [[LOCADDR]]]
-; CHECK: and w0, w[[STRVAL]], #0xff
+; CHECK: and x0, x[[STRVAL]], #0xff
 
   %ret.1 = load i8, i8* %locvar
   %ret.2 = zext i8 %ret.1 to i64
diff --git a/test/CodeGen/Hexagon/clr_set_toggle.ll b/test/CodeGen/Hexagon/clr_set_toggle.ll
index 9318f2d8a6b..43c866c7b76 100644
--- a/test/CodeGen/Hexagon/clr_set_toggle.ll
+++ b/test/CodeGen/Hexagon/clr_set_toggle.ll
@@ -70,7 +70,7 @@ entry:
 define zeroext i16 @my_setbit(i16 zeroext %crc) nounwind {
 entry:
 ; CHECK-LABEL: my_setbit
-; CHECK: memh(r{{[0-9]+}}+#{{[0-9]+}}) = setbit(#15)
+; CHECK: r{{[0-9]+}} = setbit(r{{[0-9]+}},#15)
   %crc.addr = alloca i16, align 2
   store i16 %crc, i16* %crc.addr, align 2
   %0 = load i16, i16* %crc.addr, align 2
diff --git a/test/CodeGen/Hexagon/swp-epilog-phis.ll b/test/CodeGen/Hexagon/swp-epilog-phis.ll
deleted file mode 100644
index 1073f1c46b1..00000000000
--- a/test/CodeGen/Hexagon/swp-epilog-phis.ll
+++ /dev/null
@@ -1,55 +0,0 @@
-; RUN: llc -march=hexagon -enable-pipeliner -pipeliner-max-stages=2 \
-; RUN:     -pipeliner-ignore-recmii -disable-hexagon-nv-schedule \
-; RUN:     -hexagon-initial-cfg-cleanup=0 -stats -o /dev/null \
-; RUN:     -enable-aa-sched-mi < %s 2>&1 | FileCheck %s --check-prefix=STATS
-; REQUIRES: asserts
-;
-; Test that we generate the correct phis in the last epilog block when
-; allowing multiple stages.
-;
-; STATS: 1 pipeliner        - Number of loops software pipelined
-
-; Function Attrs: nounwind
-define void @f0() #0 {
-b0:
-  br i1 undef, label %b6, label %b1
-
-b1:                                               ; preds = %b0
-  br i1 undef, label %b6, label %b2
-
-b2:                                               ; preds = %b1
-  br label %b4
-
-b3:                                               ; preds = %b4, %b3
-  %v0 = add nsw i32 0, 57344
-  %v1 = trunc i32 %v0 to i16
-  store i16 %v1, i16* null, align 2, !tbaa !0
-  %v2 = getelementptr inbounds i8, i8* null, i32 undef
-  %v3 = load i8, i8* %v2, align 1, !tbaa !4
-  %v4 = zext i8 %v3 to i32
-  %v5 = shl nuw nsw i32 %v4, 6
-  %v6 = add nsw i32 %v5, 57344
-  %v7 = trunc i32 %v6 to i16
-  store i16 %v7, i16* undef, align 2, !tbaa !0
-  br i1 undef, label %b5, label %b3
-
-b4:                                               ; preds = %b5, %b2
-  %v8 = phi i32 [ 0, %b2 ], [ %v9, %b5 ]
-  br label %b3
-
-b5:                                               ; preds = %b3
-  %v9 = add i32 %v8, 1
-  %v10 = icmp eq i32 %v9, undef
-  br i1 %v10, label %b6, label %b4
-
-b6:                                               ; preds = %b5, %b1, %b0
-  ret void
-}
-
-attributes #0 = { nounwind "target-cpu"="hexagonv55" }
-
-!0 = !{!1, !1, i64 0}
-!1 = !{!"short", !2}
-!2 = !{!"omnipotent char", !3}
-!3 = !{!"Simple C/C++ TBAA"}
-!4 = !{!2, !2, i64 0}
diff --git a/test/CodeGen/Hexagon/swp-memrefs-epilog1.ll b/test/CodeGen/Hexagon/swp-memrefs-epilog1.ll
deleted file mode 100644
index bb45eeac140..00000000000
--- a/test/CodeGen/Hexagon/swp-memrefs-epilog1.ll
+++ /dev/null
@@ -1,90 +0,0 @@
-; RUN: llc -march=hexagon -enable-pipeliner < %s | FileCheck %s
-
-; Test that a store and load, that alias, are not put in the same packet. The
-; pipeliner altered the size of the memrefs for these instructions, which
-; resulted in no order dependence between the instructions in the DAG. No order
-; dependence was added since the size was set to UINT_MAX, but there is a
-; computation using the size that overflowed.
-
-; CHECK: endloop0
-; CHECK: memh([[REG:r([0-9]+)]]+#0) =
-; CHECK: = memh([[REG]]++#2)
-
-; Function Attrs: nounwind
-define signext i16 @f0(i16* nocapture readonly %a0, i16* nocapture readonly %a1) local_unnamed_addr #0 {
-b0:
-  %v0 = alloca [40 x i16], align 8
-  %v1 = bitcast [40 x i16]* %v0 to i8*
-  call void @llvm.lifetime.start.p0i8(i64 80, i8* nonnull %v1) #2
-  %v2 = getelementptr inbounds [40 x i16], [40 x i16]* %v0, i32 0, i32 0
-  br label %b1
-
-b1:                                               ; preds = %b1, %b0
-  %v3 = phi i16* [ %a1, %b0 ], [ %v24, %b1 ]
-  %v4 = phi i16* [ %v2, %b0 ], [ %v25, %b1 ]
-  %v5 = phi i32 [ 0, %b0 ], [ %v14, %b1 ]
-  %v6 = phi i32 [ 1, %b0 ], [ %v22, %b1 ]
-  %v7 = phi i32 [ 0, %b0 ], [ %v23, %b1 ]
-  %v8 = load i16, i16* %v3, align 2
-  %v9 = sext i16 %v8 to i32
-  %v10 = tail call i32 @llvm.hexagon.A2.aslh(i32 %v9)
-  %v11 = tail call i32 @llvm.hexagon.S2.asr.r.r.sat(i32 %v10, i32 1)
-  %v12 = tail call i32 @llvm.hexagon.A2.asrh(i32 %v11)
-  %v13 = trunc i32 %v12 to i16
-  store i16 %v13, i16* %v4, align 2
-  %v14 = add nuw nsw i32 %v5, 1
-  %v15 = icmp eq i32 %v14, 40
-  %v16 = getelementptr inbounds i16, i16* %a0, i32 %v7
-  %v17 = load i16, i16* %v16, align 2
-  %v18 = sext i16 %v17 to i32
-  %v19 = getelementptr inbounds [40 x i16], [40 x i16]* %v0, i32 0, i32 %v7
-  %v20 = load i16, i16* %v19, align 2
-  %v21 = sext i16 %v20 to i32
-  %v22 = tail call i32 @llvm.hexagon.M2.mpy.acc.sat.ll.s1(i32 %v6, i32 %v18, i32 %v21)
-  %v23 = add nuw nsw i32 %v7, 1
-  %v24 = getelementptr i16, i16* %v3, i32 1
-  %v25 = getelementptr i16, i16* %v4, i32 1
-  br i1 %v15, label %b2, label %b1
-
-b2:                                               ; preds = %b1
-  %v26 = tail call signext i16 @f1(i32 %v22) #0
-  %v27 = sext i16 %v26 to i32
-  %v28 = tail call i32 @llvm.hexagon.S2.asl.r.r.sat(i32 %v22, i32 %v27)
-  %v29 = tail call i32 @llvm.hexagon.A2.asrh(i32 %v28)
-  %v30 = shl i32 %v29, 16
-  %v31 = ashr exact i32 %v30, 16
-  %v32 = icmp slt i32 %v30, 65536
-  br label %b3
-
-b3:                                               ; preds = %b2
-  call void @llvm.lifetime.end.p0i8(i64 80, i8* nonnull %v1) #2
-  ret i16 0
-}
-
-; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.hexagon.S2.asr.r.r.sat(i32, i32) #2
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.hexagon.A2.aslh(i32) #2
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.hexagon.A2.asrh(i32) #2
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.hexagon.M2.mpy.acc.sat.ll.s1(i32, i32, i32) #2
-
-; Function Attrs: nounwind
-declare signext i16 @f1(i32) local_unnamed_addr #0
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.hexagon.S2.asl.r.r.sat(i32, i32) #2
-
-; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
-
-attributes #0 = { nounwind }
-attributes #1 = { argmemonly nounwind }
-attributes #2 = { nounwind readnone }
diff --git a/test/CodeGen/Mips/cconv/vector.ll b/test/CodeGen/Mips/cconv/vector.ll
index b580d2a338c..8cec16683ca 100644
--- a/test/CodeGen/Mips/cconv/vector.ll
+++ b/test/CodeGen/Mips/cconv/vector.ll
@@ -2053,12 +2053,10 @@ define <2 x i32> @i32_2(<2 x i32> %a, <2 x i32> %b) {
 ; MIPS64R5-NEXT:    sd $4, 24($sp)
 ; MIPS64R5-NEXT:    ldi.b $w0, 0
 ; MIPS64R5-NEXT:    lw $1, 20($sp)
-; MIPS64R5-NEXT:    lw $2, 16($sp)
 ; MIPS64R5-NEXT:    move.v $w1, $w0
-; MIPS64R5-NEXT:    insert.d $w1[0], $2
+; MIPS64R5-NEXT:    insert.d $w1[0], $5
 ; MIPS64R5-NEXT:    insert.d $w1[1], $1
-; MIPS64R5-NEXT:    lw $1, 24($sp)
-; MIPS64R5-NEXT:    insert.d $w0[0], $1
+; MIPS64R5-NEXT:    insert.d $w0[0], $4
 ; MIPS64R5-NEXT:    lw $1, 28($sp)
 ; MIPS64R5-NEXT:    insert.d $w0[1], $1
 ; MIPS64R5-NEXT:    addv.d $w0, $w0, $w1
@@ -3533,12 +3531,8 @@ define void @call_i8_2() {
 ; MIPS32R5EB-NEXT:    .cfi_def_cfa_offset 32
 ; MIPS32R5EB-NEXT:    sw $ra, 28($sp) # 4-byte Folded Spill
 ; MIPS32R5EB-NEXT:    .cfi_offset 31, -4
-; MIPS32R5EB-NEXT:    addiu $1, $zero, 1543
-; MIPS32R5EB-NEXT:    sh $1, 20($sp)
-; MIPS32R5EB-NEXT:    addiu $1, $zero, 3080
-; MIPS32R5EB-NEXT:    sh $1, 24($sp)
-; MIPS32R5EB-NEXT:    lhu $4, 20($sp)
-; MIPS32R5EB-NEXT:    lhu $5, 24($sp)
+; MIPS32R5EB-NEXT:    addiu $4, $zero, 1543
+; MIPS32R5EB-NEXT:    addiu $5, $zero, 3080
 ; MIPS32R5EB-NEXT:    jal i8_2
 ; MIPS32R5EB-NEXT:    nop
 ; MIPS32R5EB-NEXT:    sw $2, 16($sp)
@@ -3645,12 +3639,8 @@ define void @call_i8_2() {
 ; MIPS32R5EL-NEXT:    .cfi_def_cfa_offset 32
 ; MIPS32R5EL-NEXT:    sw $ra, 28($sp) # 4-byte Folded Spill
 ; MIPS32R5EL-NEXT:    .cfi_offset 31, -4
-; MIPS32R5EL-NEXT:    addiu $1, $zero, 1798
-; MIPS32R5EL-NEXT:    sh $1, 20($sp)
-; MIPS32R5EL-NEXT:    addiu $1, $zero, 2060
-; MIPS32R5EL-NEXT:    sh $1, 24($sp)
-; MIPS32R5EL-NEXT:    lhu $4, 20($sp)
-; MIPS32R5EL-NEXT:    lhu $5, 24($sp)
+; MIPS32R5EL-NEXT:    addiu $4, $zero, 1798
+; MIPS32R5EL-NEXT:    addiu $5, $zero, 2060
 ; MIPS32R5EL-NEXT:    jal i8_2
 ; MIPS32R5EL-NEXT:    nop
 ; MIPS32R5EL-NEXT:    sw $2, 16($sp)
diff --git a/test/CodeGen/Mips/indirect-jump-hazard/jumptables.ll b/test/CodeGen/Mips/indirect-jump-hazard/jumptables.ll
index 4f2339d18c3..efa07590900 100644
--- a/test/CodeGen/Mips/indirect-jump-hazard/jumptables.ll
+++ b/test/CodeGen/Mips/indirect-jump-hazard/jumptables.ll
@@ -155,11 +155,10 @@ define i8* @_Z3fooi(i32 signext %Letter) {
 ; MIPS64R2:       # %bb.0: # %entry
 ; MIPS64R2-NEXT:    daddiu $sp, $sp, -16
 ; MIPS64R2-NEXT:    .cfi_def_cfa_offset 16
-; MIPS64R2-NEXT:    sw $4, 4($sp)
-; MIPS64R2-NEXT:    lwu $2, 4($sp)
+; MIPS64R2-NEXT:    dext $2, $4, 0, 32
 ; MIPS64R2-NEXT:    sltiu $1, $2, 7
 ; MIPS64R2-NEXT:    beqz $1, .LBB0_3
-; MIPS64R2-NEXT:    nop
+; MIPS64R2-NEXT:    sw $4, 4($sp)
 ; MIPS64R2-NEXT:  .LBB0_1: # %entry
 ; MIPS64R2-NEXT:    dsll $1, $2, 3
 ; MIPS64R2-NEXT:    lui $2, %highest(.LJTI0_0)
@@ -251,10 +250,10 @@ define i8* @_Z3fooi(i32 signext %Letter) {
 ; MIPS64R6:       # %bb.0: # %entry
 ; MIPS64R6-NEXT:    daddiu $sp, $sp, -16
 ; MIPS64R6-NEXT:    .cfi_def_cfa_offset 16
-; MIPS64R6-NEXT:    sw $4, 4($sp)
-; MIPS64R6-NEXT:    lwu $2, 4($sp)
+; MIPS64R6-NEXT:    dext $2, $4, 0, 32
 ; MIPS64R6-NEXT:    sltiu $1, $2, 7
-; MIPS64R6-NEXT:    beqzc $1, .LBB0_3
+; MIPS64R6-NEXT:    beqz $1, .LBB0_3
+; MIPS64R6-NEXT:    sw $4, 4($sp)
 ; MIPS64R6-NEXT:  .LBB0_1: # %entry
 ; MIPS64R6-NEXT:    dsll $1, $2, 3
 ; MIPS64R6-NEXT:    lui $2, %highest(.LJTI0_0)
@@ -473,11 +472,10 @@ define i8* @_Z3fooi(i32 signext %Letter) {
 ; PIC-MIPS64R2-NEXT:    lui $1, %hi(%neg(%gp_rel(_Z3fooi)))
 ; PIC-MIPS64R2-NEXT:    daddu $1, $1, $25
 ; PIC-MIPS64R2-NEXT:    daddiu $2, $1, %lo(%neg(%gp_rel(_Z3fooi)))
-; PIC-MIPS64R2-NEXT:    sw $4, 4($sp)
-; PIC-MIPS64R2-NEXT:    lwu $3, 4($sp)
+; PIC-MIPS64R2-NEXT:    dext $3, $4, 0, 32
 ; PIC-MIPS64R2-NEXT:    sltiu $1, $3, 7
 ; PIC-MIPS64R2-NEXT:    beqz $1, .LBB0_3
-; PIC-MIPS64R2-NEXT:    nop
+; PIC-MIPS64R2-NEXT:    sw $4, 4($sp)
 ; PIC-MIPS64R2-NEXT:  .LBB0_1: # %entry
 ; PIC-MIPS64R2-NEXT:    dsll $1, $3, 3
 ; PIC-MIPS64R2-NEXT:    ld $3, %got_page(.LJTI0_0)($2)
@@ -537,10 +535,10 @@ define i8* @_Z3fooi(i32 signext %Letter) {
 ; PIC-MIPS64R6-NEXT:    lui $1, %hi(%neg(%gp_rel(_Z3fooi)))
 ; PIC-MIPS64R6-NEXT:    daddu $1, $1, $25
 ; PIC-MIPS64R6-NEXT:    daddiu $2, $1, %lo(%neg(%gp_rel(_Z3fooi)))
-; PIC-MIPS64R6-NEXT:    sw $4, 4($sp)
-; PIC-MIPS64R6-NEXT:    lwu $3, 4($sp)
+; PIC-MIPS64R6-NEXT:    dext $3, $4, 0, 32
 ; PIC-MIPS64R6-NEXT:    sltiu $1, $3, 7
-; PIC-MIPS64R6-NEXT:    beqzc $1, .LBB0_3
+; PIC-MIPS64R6-NEXT:    beqz $1, .LBB0_3
+; PIC-MIPS64R6-NEXT:    sw $4, 4($sp)
 ; PIC-MIPS64R6-NEXT:  .LBB0_1: # %entry
 ; PIC-MIPS64R6-NEXT:    dsll $1, $3, 3
 ; PIC-MIPS64R6-NEXT:    ld $3, %got_page(.LJTI0_0)($2)
diff --git a/test/CodeGen/Mips/o32_cc_byval.ll b/test/CodeGen/Mips/o32_cc_byval.ll
index d61f05dc868..19eb80b79ba 100644
--- a/test/CodeGen/Mips/o32_cc_byval.ll
+++ b/test/CodeGen/Mips/o32_cc_byval.ll
@@ -109,7 +109,8 @@ define void @f2(float %f, %struct.S1* nocapture byval %s1) nounwind {
 ; CHECK-NEXT:    lw $1, 64($sp)
 ; CHECK-NEXT:    lw $2, 68($sp)
 ; CHECK-NEXT:    lh $3, 58($sp)
-; CHECK-NEXT:    lb $5, 56($sp)
+; CHECK-NEXT:    sll $5, $6, 24
+; CHECK-NEXT:    sra $5, $5, 24
 ; CHECK-NEXT:    swc1 $f12, 36($sp)
 ; CHECK-NEXT:    sw $5, 32($sp)
 ; CHECK-NEXT:    sw $3, 28($sp)
@@ -191,11 +192,12 @@ define void @f4(float %f, %struct.S3* nocapture byval %s3, %struct.S1* nocapture
 ; CHECK-NEXT:    sw $ra, 44($sp) # 4-byte Folded Spill
 ; CHECK-NEXT:    addu $gp, $2, $25
 ; CHECK-NEXT:    move $4, $7
-; CHECK-NEXT:    sw $5, 52($sp)
 ; CHECK-NEXT:    sw $6, 56($sp)
+; CHECK-NEXT:    sw $5, 52($sp)
 ; CHECK-NEXT:    sw $7, 60($sp)
 ; CHECK-NEXT:    lw $1, 80($sp)
-; CHECK-NEXT:    lb $2, 52($sp)
+; CHECK-NEXT:    sll $2, $5, 24
+; CHECK-NEXT:    sra $2, $2, 24
 ; CHECK-NEXT:    addiu $3, $zero, 4
 ; CHECK-NEXT:    lui $5, 16576
 ; CHECK-NEXT:    sw $5, 36($sp)
diff --git a/test/CodeGen/Mips/o32_cc_vararg.ll b/test/CodeGen/Mips/o32_cc_vararg.ll
index 73aad48b73e..27d454f31d9 100644
--- a/test/CodeGen/Mips/o32_cc_vararg.ll
+++ b/test/CodeGen/Mips/o32_cc_vararg.ll
@@ -29,10 +29,10 @@ entry:
 
 ; CHECK-LABEL: va1:
 ; CHECK: addiu   $sp, $sp, -16
-; CHECK: sw      $5, 20($sp)
 ; CHECK: sw      $7, 28($sp)
 ; CHECK: sw      $6, 24($sp)
-; CHECK: lw      $2, 20($sp)
+; CHECK: sw      $5, 20($sp)
+; CHECK: move    $2, $5
 }
 
 ; check whether the variable double argument will be accessed from the 8-byte
@@ -83,9 +83,9 @@ entry:
 
 ; CHECK-LABEL: va3:
 ; CHECK: addiu   $sp, $sp, -16
-; CHECK: sw      $6, 24($sp)
 ; CHECK: sw      $7, 28($sp)
-; CHECK: lw      $2, 24($sp)
+; CHECK: sw      $6, 24($sp)
+; CHECK: move    $2, $6
 }
 
 ; double
@@ -135,7 +135,7 @@ entry:
 ; CHECK-LABEL: va5:
 ; CHECK: addiu   $sp, $sp, -24
 ; CHECK: sw      $7, 36($sp)
-; CHECK: lw      $2, 36($sp)
+; CHECK: move    $2, $7
 }
 
 ; double
diff --git a/test/CodeGen/PowerPC/addi-offset-fold.ll b/test/CodeGen/PowerPC/addi-offset-fold.ll
index ab00a4dab3a..7af99203694 100644
--- a/test/CodeGen/PowerPC/addi-offset-fold.ll
+++ b/test/CodeGen/PowerPC/addi-offset-fold.ll
@@ -24,12 +24,11 @@ entry:
   ret i32 %bf.cast
 
 ; CHECK-LABEL: @foo
-; FIXME: We don't need to do these stores/loads at all.
+; FIXME: We don't need to do these stores at all.
 ; CHECK-DAG: std 3, -24(1)
 ; CHECK-DAG: stb 4, -16(1)
-; CHECK-DAG: lbz [[REG1:[0-9]+]], -16(1)
+; CHECK-DAG: sldi [[REG3:[0-9]+]], 4, 32
 ; CHECK-DAG: lwz [[REG2:[0-9]+]], -20(1)
-; CHECK-DAG: sldi [[REG3:[0-9]+]], [[REG1]], 32
 ; CHECK-DAG: or [[REG4:[0-9]+]], [[REG2]], [[REG3]]
 ; CHECK: rldicl 3, [[REG4]], 33, 57
 ; CHECK: blr
diff --git a/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll b/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll
index 8b7184f38e8..60a6a180467 100644
--- a/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll
+++ b/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll
@@ -60,8 +60,7 @@ define i16 @fun1(<16 x i1> %src)
 ; CHECK-NEXT:    rosbg %r0, %r1, 62, 62, 1
 ; CHECK-NEXT:    vlgvb %r1, %v24, 15
 ; CHECK-NEXT:    rosbg %r0, %r1, 63, 63, 0
-; CHECK-NEXT:    sth %r0, 160(%r15)
-; CHECK-NEXT:    lh %r2, 160(%r15)
+; CHECK-NEXT:    llhr %r2, %r0
 ; CHECK-NEXT:    aghi %r15, 168
 ; CHECK-NEXT:    br %r14
 {
diff --git a/test/CodeGen/X86/i386-shrink-wrapping.ll b/test/CodeGen/X86/i386-shrink-wrapping.ll
index 8a5b92a82fb..495ead223b2 100644
--- a/test/CodeGen/X86/i386-shrink-wrapping.ll
+++ b/test/CodeGen/X86/i386-shrink-wrapping.ll
@@ -56,7 +56,7 @@ target triple = "i386-apple-macosx10.5"
 ;
 ; CHECK-NEXT: L_e$non_lazy_ptr, [[E:%[a-z]+]]
 ; CHECK-NEXT: movb %dl, ([[E]])
-; CHECK-NEXT: movsbl ([[E]]), [[CONV:%[a-z]+]]
+; CHECK-NEXT: movzbl %dl, [[CONV:%[a-z]+]]
 ; CHECK-NEXT: movl $6, [[CONV:%[a-z]+]]
 ; The eflags is used in the next instruction.
 ; If that instruction disappear, we are not exercising the bug
diff --git a/test/CodeGen/X86/pr32108.ll b/test/CodeGen/X86/pr32108.ll
index bde5daff285..dc14746440a 100644
--- a/test/CodeGen/X86/pr32108.ll
+++ b/test/CodeGen/X86/pr32108.ll
@@ -4,7 +4,6 @@
 define void @pr32108() {
 ; CHECK-LABEL: pr32108:
 ; CHECK:       # %bb.0: # %BB
-; CHECK-NEXT:    movb $0, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB0_1: # %CF244
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
diff --git a/test/CodeGen/X86/pr38533.ll b/test/CodeGen/X86/pr38533.ll
index 96d003ba1a8..59c67acc9be 100644
--- a/test/CodeGen/X86/pr38533.ll
+++ b/test/CodeGen/X86/pr38533.ll
@@ -19,8 +19,6 @@ define void @pr38533_2(half %x) {
 ; SSE-NEXT:    pushq %rax
 ; SSE-NEXT:    .cfi_def_cfa_offset 16
 ; SSE-NEXT:    callq __gnu_f2h_ieee
-; SSE-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
 ; SSE-NEXT:    movw %ax, (%rax)
 ; SSE-NEXT:    popq %rax
 ; SSE-NEXT:    .cfi_def_cfa_offset 8
@@ -30,8 +28,6 @@ define void @pr38533_2(half %x) {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
-; AVX512-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT:    movzwl -{{[0-9]+}}(%rsp), %eax
 ; AVX512-NEXT:    movw %ax, (%rax)
 ; AVX512-NEXT:    retq
   %a = bitcast half %x to <4 x i4>
@@ -46,8 +42,6 @@ define void @pr38533_3(half %x) {
 ; SSE-NEXT:    pushq %rax
 ; SSE-NEXT:    .cfi_def_cfa_offset 16
 ; SSE-NEXT:    callq __gnu_f2h_ieee
-; SSE-NEXT:    movw %ax, (%rsp)
-; SSE-NEXT:    movzwl (%rsp), %eax
 ; SSE-NEXT:    movw %ax, (%rax)
 ; SSE-NEXT:    popq %rax
 ; SSE-NEXT:    .cfi_def_cfa_offset 8
diff --git a/test/CodeGen/X86/win64_vararg.ll b/test/CodeGen/X86/win64_vararg.ll
index fc9a10ed586..f0aff6f89bc 100644
--- a/test/CodeGen/X86/win64_vararg.ll
+++ b/test/CodeGen/X86/win64_vararg.ll
@@ -124,7 +124,8 @@ entry:
 ; CHECK: movq %rcx, %rax
 ; CHECK-DAG: movq %r9, 40(%rsp)
 ; CHECK-DAG: movq %r8, 32(%rsp)
-; CHECK: movl 32(%rsp), %[[tmp:[^ ]*]]
-; CHECK: movl %[[tmp]], (%rax)
+; CHECK-DAG: leaq 36(%rsp), %[[sret:[^ ]*]]
+; CHECK-DAG: movl %r8d, (%rax)
+; CHECK-DAG: movq %[[sret]], (%rsp)
 ; CHECK: popq
 ; CHECK: retq
-- 
GitLab


From 444acbd06151972689d80403f079bbcae562d11e Mon Sep 17 00:00:00 2001
From: Guillaume Chatelet <gchatelet@google.com>
Date: Wed, 10 Oct 2018 14:22:48 +0000
Subject: [PATCH 0003/1116] [llvm-exegesis][NFC] Code simplification

Summary: Simplify code by having LLVMState hold the RegisterAliasingTrackerCache.

Reviewers: courbet

Subscribers: tschuett, llvm-commits

Differential Revision: https://reviews.llvm.org/D53078

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344143 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-exegesis/lib/Latency.cpp            |  5 ++---
 tools/llvm-exegesis/lib/LlvmState.cpp          |  2 ++
 tools/llvm-exegesis/lib/LlvmState.h            |  3 +++
 tools/llvm-exegesis/lib/MCInstrDescView.cpp    | 18 +++++++++---------
 tools/llvm-exegesis/lib/MCInstrDescView.h      |  4 ++--
 tools/llvm-exegesis/lib/SnippetGenerator.cpp   |  7 +++----
 tools/llvm-exegesis/lib/SnippetGenerator.h     |  1 -
 tools/llvm-exegesis/lib/Uops.cpp               |  9 +++++----
 tools/llvm-exegesis/lib/X86/Target.cpp         |  6 +++---
 .../llvm-exegesis/X86/SnippetGeneratorTest.cpp |  2 +-
 10 files changed, 30 insertions(+), 27 deletions(-)

diff --git a/tools/llvm-exegesis/lib/Latency.cpp b/tools/llvm-exegesis/lib/Latency.cpp
index 4173cf3f9a1..ea646f4f261 100644
--- a/tools/llvm-exegesis/lib/Latency.cpp
+++ b/tools/llvm-exegesis/lib/Latency.cpp
@@ -32,8 +32,7 @@ LatencySnippetGenerator::generateTwoInstructionPrototype(
   for (const unsigned OtherOpcode : Opcodes) {
     if (OtherOpcode == Instr.Description->Opcode)
       continue;
-    const auto &OtherInstrDesc = State.getInstrInfo().get(OtherOpcode);
-    const Instruction OtherInstr(OtherInstrDesc, RATC);
+    const Instruction OtherInstr(State, OtherOpcode);
     if (OtherInstr.hasMemoryOperands())
       continue;
     const AliasingConfigurations Forward(Instr, OtherInstr);
@@ -59,7 +58,7 @@ LatencySnippetGenerator::generateTwoInstructionPrototype(
 
 llvm::Expected<CodeTemplate>
 LatencySnippetGenerator::generateCodeTemplate(unsigned Opcode) const {
-  const Instruction Instr(State.getInstrInfo().get(Opcode), RATC);
+  const Instruction Instr(State, Opcode);
   if (Instr.hasMemoryOperands())
     return llvm::make_error<BenchmarkFailure>(
         "Infeasible : has memory operands");
diff --git a/tools/llvm-exegesis/lib/LlvmState.cpp b/tools/llvm-exegesis/lib/LlvmState.cpp
index 9ff42ca71fd..279792e9031 100644
--- a/tools/llvm-exegesis/lib/LlvmState.cpp
+++ b/tools/llvm-exegesis/lib/LlvmState.cpp
@@ -35,6 +35,8 @@ LLVMState::LLVMState(const std::string &Triple, const std::string &CpuName) {
     llvm::errs() << "no exegesis target for " << Triple << ", using default\n";
     TheExegesisTarget = &ExegesisTarget::getDefault();
   }
+  RATC.reset(new RegisterAliasingTrackerCache(
+      getRegInfo(), getFunctionReservedRegs(getTargetMachine())));
 }
 
 LLVMState::LLVMState()
diff --git a/tools/llvm-exegesis/lib/LlvmState.h b/tools/llvm-exegesis/lib/LlvmState.h
index c84db300841..aa7705a36a6 100644
--- a/tools/llvm-exegesis/lib/LlvmState.h
+++ b/tools/llvm-exegesis/lib/LlvmState.h
@@ -15,6 +15,7 @@
 #ifndef LLVM_TOOLS_LLVM_EXEGESIS_LLVMSTATE_H
 #define LLVM_TOOLS_LLVM_EXEGESIS_LLVMSTATE_H
 
+#include "RegisterAliasing.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrInfo.h"
@@ -54,10 +55,12 @@ public:
   const llvm::MCSubtargetInfo &getSubtargetInfo() const {
     return *TargetMachine->getMCSubtargetInfo();
   }
+  const RegisterAliasingTrackerCache &getRATC() const { return *RATC; }
 
 private:
   const ExegesisTarget *TheExegesisTarget;
   std::unique_ptr<const llvm::TargetMachine> TargetMachine;
+  std::unique_ptr<const RegisterAliasingTrackerCache> RATC;
 };
 
 } // namespace exegesis
diff --git a/tools/llvm-exegesis/lib/MCInstrDescView.cpp b/tools/llvm-exegesis/lib/MCInstrDescView.cpp
index 75d85873146..d54f3ca2a45 100644
--- a/tools/llvm-exegesis/lib/MCInstrDescView.cpp
+++ b/tools/llvm-exegesis/lib/MCInstrDescView.cpp
@@ -87,24 +87,24 @@ const llvm::MCOperandInfo &Operand::getExplicitOperandInfo() const {
   return *Info;
 }
 
-Instruction::Instruction(const llvm::MCInstrDesc &MCInstrDesc,
-                         const RegisterAliasingTrackerCache &RATC)
-    : Description(&MCInstrDesc) {
+Instruction::Instruction(const LLVMState &State, unsigned Opcode)
+    : Description(&State.getInstrInfo().get(Opcode)) {
+  const auto &RATC = State.getRATC();
   unsigned OpIndex = 0;
-  for (; OpIndex < MCInstrDesc.getNumOperands(); ++OpIndex) {
-    const auto &OpInfo = MCInstrDesc.opInfo_begin()[OpIndex];
+  for (; OpIndex < Description->getNumOperands(); ++OpIndex) {
+    const auto &OpInfo = Description->opInfo_begin()[OpIndex];
     Operand Operand;
     Operand.Index = OpIndex;
-    Operand.IsDef = (OpIndex < MCInstrDesc.getNumDefs());
+    Operand.IsDef = (OpIndex < Description->getNumDefs());
     // TODO(gchatelet): Handle isLookupPtrRegClass.
     if (OpInfo.RegClass >= 0)
       Operand.Tracker = &RATC.getRegisterClass(OpInfo.RegClass);
     Operand.TiedToIndex =
-        MCInstrDesc.getOperandConstraint(OpIndex, llvm::MCOI::TIED_TO);
+        Description->getOperandConstraint(OpIndex, llvm::MCOI::TIED_TO);
     Operand.Info = &OpInfo;
     Operands.push_back(Operand);
   }
-  for (const llvm::MCPhysReg *MCPhysReg = MCInstrDesc.getImplicitDefs();
+  for (const llvm::MCPhysReg *MCPhysReg = Description->getImplicitDefs();
        MCPhysReg && *MCPhysReg; ++MCPhysReg, ++OpIndex) {
     Operand Operand;
     Operand.Index = OpIndex;
@@ -113,7 +113,7 @@ Instruction::Instruction(const llvm::MCInstrDesc &MCInstrDesc,
     Operand.ImplicitReg = MCPhysReg;
     Operands.push_back(Operand);
   }
-  for (const llvm::MCPhysReg *MCPhysReg = MCInstrDesc.getImplicitUses();
+  for (const llvm::MCPhysReg *MCPhysReg = Description->getImplicitUses();
        MCPhysReg && *MCPhysReg; ++MCPhysReg, ++OpIndex) {
     Operand Operand;
     Operand.Index = OpIndex;
diff --git a/tools/llvm-exegesis/lib/MCInstrDescView.h b/tools/llvm-exegesis/lib/MCInstrDescView.h
index 39e5c4a5f5b..914bf51a22b 100644
--- a/tools/llvm-exegesis/lib/MCInstrDescView.h
+++ b/tools/llvm-exegesis/lib/MCInstrDescView.h
@@ -21,6 +21,7 @@
 
 #include <random>
 
+#include "LlvmState.h"
 #include "RegisterAliasing.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/Optional.h"
@@ -92,8 +93,7 @@ struct Operand {
 // A view over an MCInstrDesc offering a convenient interface to compute
 // Register aliasing.
 struct Instruction {
-  Instruction(const llvm::MCInstrDesc &MCInstrDesc,
-              const RegisterAliasingTrackerCache &ATC);
+  Instruction(const LLVMState &State, unsigned Opcode);
 
   // Returns the Operand linked to this Variable.
   // In case the Variable is tied, the primary (i.e. Def) Operand is returned.
diff --git a/tools/llvm-exegesis/lib/SnippetGenerator.cpp b/tools/llvm-exegesis/lib/SnippetGenerator.cpp
index 3765776f724..16dbd214e95 100644
--- a/tools/llvm-exegesis/lib/SnippetGenerator.cpp
+++ b/tools/llvm-exegesis/lib/SnippetGenerator.cpp
@@ -25,9 +25,7 @@ namespace exegesis {
 SnippetGeneratorFailure::SnippetGeneratorFailure(const llvm::Twine &S)
     : llvm::StringError(S, llvm::inconvertibleErrorCode()) {}
 
-SnippetGenerator::SnippetGenerator(const LLVMState &State)
-    : State(State), RATC(State.getRegInfo(),
-                         getFunctionReservedRegs(State.getTargetMachine())) {}
+SnippetGenerator::SnippetGenerator(const LLVMState &State) : State(State) {}
 
 SnippetGenerator::~SnippetGenerator() = default;
 
@@ -35,6 +33,7 @@ llvm::Expected<std::vector<BenchmarkCode>>
 SnippetGenerator::generateConfigurations(unsigned Opcode) const {
   if (auto E = generateCodeTemplate(Opcode)) {
     CodeTemplate &CT = E.get();
+    const auto &RATC = State.getRATC();
     const llvm::BitVector &ForbiddenRegs =
         CT.ScratchSpacePointerInReg
             ? RATC.getRegister(CT.ScratchSpacePointerInReg).aliasedBits()
@@ -64,7 +63,7 @@ std::vector<RegisterValue> SnippetGenerator::computeRegisterInitialValues(
   // Ignore memory operands which are handled separately.
   // Loop invariant: DefinedRegs[i] is true iif it has been set at least once
   // before the current instruction.
-  llvm::BitVector DefinedRegs = RATC.emptyRegisters();
+  llvm::BitVector DefinedRegs = State.getRATC().emptyRegisters();
   std::vector<RegisterValue> RIV;
   for (const InstructionTemplate &IT : Instructions) {
     // Returns the register that this Operand sets or uses, or 0 if this is not
diff --git a/tools/llvm-exegesis/lib/SnippetGenerator.h b/tools/llvm-exegesis/lib/SnippetGenerator.h
index 9493c584816..24afe95fda0 100644
--- a/tools/llvm-exegesis/lib/SnippetGenerator.h
+++ b/tools/llvm-exegesis/lib/SnippetGenerator.h
@@ -54,7 +54,6 @@ public:
 
 protected:
   const LLVMState &State;
-  const RegisterAliasingTrackerCache RATC;
 
   // Generates a single code template that has a self-dependency.
   llvm::Expected<CodeTemplate>
diff --git a/tools/llvm-exegesis/lib/Uops.cpp b/tools/llvm-exegesis/lib/Uops.cpp
index 2208e2a3821..fdb6a27ab59 100644
--- a/tools/llvm-exegesis/lib/Uops.cpp
+++ b/tools/llvm-exegesis/lib/Uops.cpp
@@ -130,7 +130,7 @@ UopsSnippetGenerator::generateCodeTemplate(unsigned Opcode) const {
   CodeTemplate CT;
 
   const llvm::BitVector *ScratchSpaceAliasedRegs = nullptr;
-  const Instruction Instr(State.getInstrInfo().get(Opcode), RATC);
+  const Instruction Instr(State, Opcode);
   if (Instr.hasMemoryOperands()) {
     CT.ScratchSpacePointerInReg =
         ET.getScratchMemoryRegister(State.getTargetMachine().getTargetTriple());
@@ -138,7 +138,7 @@ UopsSnippetGenerator::generateCodeTemplate(unsigned Opcode) const {
       return llvm::make_error<BenchmarkFailure>(
           "Infeasible : target does not support memory instructions");
     ScratchSpaceAliasedRegs =
-        &RATC.getRegister(CT.ScratchSpacePointerInReg).aliasedBits();
+        &State.getRATC().getRegister(CT.ScratchSpacePointerInReg).aliasedBits();
     // If the instruction implicitly writes to ScratchSpacePointerInReg , abort.
     // FIXME: We could make a copy of the scratch register.
     for (const auto &Op : Instr.Operands) {
@@ -185,12 +185,13 @@ UopsSnippetGenerator::generateCodeTemplate(unsigned Opcode) const {
     instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
     return std::move(CT);
   }
+  const auto &ReservedRegisters = State.getRATC().reservedRegisters();
   // No tied variables, we pick random values for defs.
   llvm::BitVector Defs(State.getRegInfo().getNumRegs());
   for (const auto &Op : Instr.Operands) {
     if (Op.isReg() && Op.isExplicit() && Op.isDef() && !Op.isMemory()) {
       auto PossibleRegisters = Op.getRegisterAliasing().sourceBits();
-      remove(PossibleRegisters, RATC.reservedRegisters());
+      remove(PossibleRegisters, ReservedRegisters);
       // Do not use the scratch memory address register.
       if (ScratchSpaceAliasedRegs)
         remove(PossibleRegisters, *ScratchSpaceAliasedRegs);
@@ -205,7 +206,7 @@ UopsSnippetGenerator::generateCodeTemplate(unsigned Opcode) const {
   for (const auto &Op : Instr.Operands) {
     if (Op.isReg() && Op.isExplicit() && Op.isUse() && !Op.isMemory()) {
       auto PossibleRegisters = Op.getRegisterAliasing().sourceBits();
-      remove(PossibleRegisters, RATC.reservedRegisters());
+      remove(PossibleRegisters, ReservedRegisters);
       // Do not use the scratch memory address register.
       if (ScratchSpaceAliasedRegs)
         remove(PossibleRegisters, *ScratchSpaceAliasedRegs);
diff --git a/tools/llvm-exegesis/lib/X86/Target.cpp b/tools/llvm-exegesis/lib/X86/Target.cpp
index 4a9cb08e27a..8c03f1ac826 100644
--- a/tools/llvm-exegesis/lib/X86/Target.cpp
+++ b/tools/llvm-exegesis/lib/X86/Target.cpp
@@ -37,9 +37,9 @@ template <typename Impl> class X86SnippetGenerator : public Impl {
     }
 
     // Handle X87.
-    const auto &InstrDesc = InstrInfo.get(Opcode);
-    const unsigned FPInstClass = InstrDesc.TSFlags & llvm::X86II::FPTypeMask;
-    const Instruction Instr(InstrDesc, this->RATC);
+    const unsigned FPInstClass =
+        InstrInfo.get(Opcode).TSFlags & llvm::X86II::FPTypeMask;
+    const Instruction Instr(this->State, Opcode);
     switch (FPInstClass) {
     case llvm::X86II::NotFP:
       break;
diff --git a/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp b/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp
index 9685c730b8b..f2539aaea18 100644
--- a/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp
+++ b/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp
@@ -248,7 +248,7 @@ public:
   FakeSnippetGenerator(const LLVMState &State) : SnippetGenerator(State) {}
 
   Instruction createInstruction(unsigned Opcode) {
-    return Instruction(State.getInstrInfo().get(Opcode), RATC);
+    return Instruction(State, Opcode);
   }
 
 private:
-- 
GitLab


From e144c76bb4fb9fb8bdfcbaf53b1b353da0cdbe05 Mon Sep 17 00:00:00 2001
From: Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>
Date: Wed, 10 Oct 2018 14:46:54 +0000
Subject: [PATCH 0004/1116] [llvm-mca][BtVer2] Add two more move-elimination
 tests. NFC

These should test all the optimizable moves on Jaguar.
A follow-up patch will teach how to recognize these optimizable register moves.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344144 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../X86/BtVer2/reg-move-elimination-2.s       | 137 ++++++++++++++++++
 .../X86/BtVer2/reg-move-elimination-3.s       | 122 ++++++++++++++++
 2 files changed, 259 insertions(+)
 create mode 100644 test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-2.s
 create mode 100644 test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-3.s

diff --git a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-2.s b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-2.s
new file mode 100644
index 00000000000..33cd3972194
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-2.s
@@ -0,0 +1,137 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=3 -timeline -register-file-stats < %s | FileCheck %s
+
+pxor %mm0, %mm0
+movq %mm0, %mm1
+
+xorps %xmm0, %xmm0
+movaps %xmm0, %xmm1
+movups %xmm1, %xmm2
+movapd %xmm2, %xmm3
+movupd %xmm3, %xmm4
+movdqa %xmm4, %xmm5
+movdqu %xmm5, %xmm0
+
+# CHECK:      Iterations:        3
+# CHECK-NEXT: Instructions:      27
+# CHECK-NEXT: Total Cycles:      19
+# CHECK-NEXT: Total uOps:        27
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    1.42
+# CHECK-NEXT: IPC:               1.42
+# CHECK-NEXT: Block RThroughput: 4.5
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      0     0.50                        pxor	%mm0, %mm0
+# CHECK-NEXT:  1      1     0.50                        movq	%mm0, %mm1
+# CHECK-NEXT:  1      0     0.50                        xorps	%xmm0, %xmm0
+# CHECK-NEXT:  1      1     0.50                        movaps	%xmm0, %xmm1
+# CHECK-NEXT:  1      1     0.50                        movups	%xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        movapd	%xmm2, %xmm3
+# CHECK-NEXT:  1      1     0.50                        movupd	%xmm3, %xmm4
+# CHECK-NEXT:  1      1     0.50                        movdqa	%xmm4, %xmm5
+# CHECK-NEXT:  1      1     0.50                        movdqu	%xmm5, %xmm0
+
+# CHECK:      Register File statistics:
+# CHECK-NEXT: Total number of mappings created:    21
+# CHECK-NEXT: Max number of mappings used:         8
+
+# CHECK:      *  Register File #1 -- JFpuPRF:
+# CHECK-NEXT:    Number of physical registers:     72
+# CHECK-NEXT:    Total number of mappings created: 21
+# CHECK-NEXT:    Max number of mappings used:      8
+
+# CHECK:      *  Register File #2 -- JIntegerPRF:
+# CHECK-NEXT:    Number of physical registers:     64
+# CHECK-NEXT:    Total number of mappings created: 0
+# CHECK-NEXT:    Max number of mappings used:      0
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - JALU0
+# CHECK-NEXT: [1]   - JALU1
+# CHECK-NEXT: [2]   - JDiv
+# CHECK-NEXT: [3]   - JFPA
+# CHECK-NEXT: [4]   - JFPM
+# CHECK-NEXT: [5]   - JFPU0
+# CHECK-NEXT: [6]   - JFPU1
+# CHECK-NEXT: [7]   - JLAGU
+# CHECK-NEXT: [8]   - JMul
+# CHECK-NEXT: [9]   - JSAGU
+# CHECK-NEXT: [10]  - JSTC
+# CHECK-NEXT: [11]  - JVALU0
+# CHECK-NEXT: [12]  - JVALU1
+# CHECK-NEXT: [13]  - JVIMUL
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]
+# CHECK-NEXT:  -      -      -     2.00   2.00   3.33   3.67    -      -      -      -     1.33   1.67    -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     pxor	%mm0, %mm0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -     1.00    -     movq	%mm0, %mm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorps	%xmm0, %xmm0
+# CHECK-NEXT:  -      -      -      -     1.00   0.33   0.67    -      -      -      -      -      -      -     movaps	%xmm0, %xmm1
+# CHECK-NEXT:  -      -      -     1.00    -     0.33   0.67    -      -      -      -      -      -      -     movups	%xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -     1.00   0.67   0.33    -      -      -      -      -      -      -     movapd	%xmm2, %xmm3
+# CHECK-NEXT:  -      -      -     1.00    -     0.33   0.67    -      -      -      -      -      -      -     movupd	%xmm3, %xmm4
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -     1.00    -      -     movdqa	%xmm4, %xmm5
+# CHECK-NEXT:  -      -      -      -      -     0.67   0.33    -      -      -      -     0.33   0.67    -     movdqu	%xmm5, %xmm0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     012345678
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DR   .    .    .  .   pxor	%mm0, %mm0
+# CHECK-NEXT: [0,1]     DeER .    .    .  .   movq	%mm0, %mm1
+# CHECK-NEXT: [0,2]     .D-R .    .    .  .   xorps	%xmm0, %xmm0
+# CHECK-NEXT: [0,3]     .DeER.    .    .  .   movaps	%xmm0, %xmm1
+# CHECK-NEXT: [0,4]     . DeER    .    .  .   movups	%xmm1, %xmm2
+# CHECK-NEXT: [0,5]     . D=eER   .    .  .   movapd	%xmm2, %xmm3
+# CHECK-NEXT: [0,6]     .  D=eER  .    .  .   movupd	%xmm3, %xmm4
+# CHECK-NEXT: [0,7]     .  D==eER .    .  .   movdqa	%xmm4, %xmm5
+# CHECK-NEXT: [0,8]     .   D==eER.    .  .   movdqu	%xmm5, %xmm0
+# CHECK-NEXT: [1,0]     .   D----R.    .  .   pxor	%mm0, %mm0
+# CHECK-NEXT: [1,1]     .    DeE--R    .  .   movq	%mm0, %mm1
+# CHECK-NEXT: [1,2]     .    D----R    .  .   xorps	%xmm0, %xmm0
+# CHECK-NEXT: [1,3]     .    .DeE--R   .  .   movaps	%xmm0, %xmm1
+# CHECK-NEXT: [1,4]     .    .D=eE-R   .  .   movups	%xmm1, %xmm2
+# CHECK-NEXT: [1,5]     .    . D=eE-R  .  .   movapd	%xmm2, %xmm3
+# CHECK-NEXT: [1,6]     .    . D==eER  .  .   movupd	%xmm3, %xmm4
+# CHECK-NEXT: [1,7]     .    .  D==eER .  .   movdqa	%xmm4, %xmm5
+# CHECK-NEXT: [1,8]     .    .  D===eER.  .   movdqu	%xmm5, %xmm0
+# CHECK-NEXT: [2,0]     .    .   D----R.  .   pxor	%mm0, %mm0
+# CHECK-NEXT: [2,1]     .    .   DeE---R  .   movq	%mm0, %mm1
+# CHECK-NEXT: [2,2]     .    .    D----R  .   xorps	%xmm0, %xmm0
+# CHECK-NEXT: [2,3]     .    .    DeE---R .   movaps	%xmm0, %xmm1
+# CHECK-NEXT: [2,4]     .    .    .DeE--R .   movups	%xmm1, %xmm2
+# CHECK-NEXT: [2,5]     .    .    .D=eE--R.   movapd	%xmm2, %xmm3
+# CHECK-NEXT: [2,6]     .    .    . D=eE-R.   movupd	%xmm3, %xmm4
+# CHECK-NEXT: [2,7]     .    .    . D==eE-R   movdqa	%xmm4, %xmm5
+# CHECK-NEXT: [2,8]     .    .    .  D==eER   movdqu	%xmm5, %xmm0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     0.0    0.0    2.7       pxor	%mm0, %mm0
+# CHECK-NEXT: 1.     3     1.0    1.0    1.7       movq	%mm0, %mm1
+# CHECK-NEXT: 2.     3     0.0    0.0    3.0       xorps	%xmm0, %xmm0
+# CHECK-NEXT: 3.     3     1.0    1.0    1.7       movaps	%xmm0, %xmm1
+# CHECK-NEXT: 4.     3     1.3    0.0    1.0       movups	%xmm1, %xmm2
+# CHECK-NEXT: 5.     3     2.0    0.0    1.0       movapd	%xmm2, %xmm3
+# CHECK-NEXT: 6.     3     2.3    0.0    0.3       movupd	%xmm3, %xmm4
+# CHECK-NEXT: 7.     3     3.0    0.0    0.3       movdqa	%xmm4, %xmm5
+# CHECK-NEXT: 8.     3     3.3    0.0    0.0       movdqu	%xmm5, %xmm0
diff --git a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-3.s b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-3.s
new file mode 100644
index 00000000000..e3e0abc75e7
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-3.s
@@ -0,0 +1,122 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=3 -timeline -register-file-stats < %s | FileCheck %s
+
+vxorps  %xmm0, %xmm0, %xmm0
+vmovaps %xmm0, %xmm1
+vmovups %xmm1, %xmm2
+vmovapd %xmm2, %xmm3
+vmovupd %xmm3, %xmm4
+vmovdqa %xmm4, %xmm5
+vmovdqu %xmm5, %xmm0
+
+# CHECK:      Iterations:        3
+# CHECK-NEXT: Instructions:      21
+# CHECK-NEXT: Total Cycles:      16
+# CHECK-NEXT: Total uOps:        21
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    1.31
+# CHECK-NEXT: IPC:               1.31
+# CHECK-NEXT: Block RThroughput: 3.5
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      0     0.50                        vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT:  1      1     0.50                        vmovaps	%xmm0, %xmm1
+# CHECK-NEXT:  1      1     0.50                        vmovups	%xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vmovapd	%xmm2, %xmm3
+# CHECK-NEXT:  1      1     0.50                        vmovupd	%xmm3, %xmm4
+# CHECK-NEXT:  1      1     0.50                        vmovdqa	%xmm4, %xmm5
+# CHECK-NEXT:  1      1     0.50                        vmovdqu	%xmm5, %xmm0
+
+# CHECK:      Register File statistics:
+# CHECK-NEXT: Total number of mappings created:    18
+# CHECK-NEXT: Max number of mappings used:         9
+
+# CHECK:      *  Register File #1 -- JFpuPRF:
+# CHECK-NEXT:    Number of physical registers:     72
+# CHECK-NEXT:    Total number of mappings created: 18
+# CHECK-NEXT:    Max number of mappings used:      9
+
+# CHECK:      *  Register File #2 -- JIntegerPRF:
+# CHECK-NEXT:    Number of physical registers:     64
+# CHECK-NEXT:    Total number of mappings created: 0
+# CHECK-NEXT:    Max number of mappings used:      0
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - JALU0
+# CHECK-NEXT: [1]   - JALU1
+# CHECK-NEXT: [2]   - JDiv
+# CHECK-NEXT: [3]   - JFPA
+# CHECK-NEXT: [4]   - JFPM
+# CHECK-NEXT: [5]   - JFPU0
+# CHECK-NEXT: [6]   - JFPU1
+# CHECK-NEXT: [7]   - JLAGU
+# CHECK-NEXT: [8]   - JMul
+# CHECK-NEXT: [9]   - JSAGU
+# CHECK-NEXT: [10]  - JSTC
+# CHECK-NEXT: [11]  - JVALU0
+# CHECK-NEXT: [12]  - JVALU1
+# CHECK-NEXT: [13]  - JVIMUL
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]
+# CHECK-NEXT:  -      -      -     2.00   2.00   3.00   3.00    -      -      -      -     1.00   1.00    -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT:  -      -      -      -     1.00   0.33   0.67    -      -      -      -      -      -      -     vmovaps	%xmm0, %xmm1
+# CHECK-NEXT:  -      -      -     1.00    -     0.67   0.33    -      -      -      -      -      -      -     vmovups	%xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     vmovapd	%xmm2, %xmm3
+# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     vmovupd	%xmm3, %xmm4
+# CHECK-NEXT:  -      -      -      -      -     0.33   0.67    -      -      -      -      -     1.00    -     vmovdqa	%xmm4, %xmm5
+# CHECK-NEXT:  -      -      -      -      -     0.67   0.33    -      -      -      -     1.00    -      -     vmovdqu	%xmm5, %xmm0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     012345
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DR   .    .    .   vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [0,1]     DeER .    .    .   vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: [0,2]     .DeER.    .    .   vmovups	%xmm1, %xmm2
+# CHECK-NEXT: [0,3]     .D=eER    .    .   vmovapd	%xmm2, %xmm3
+# CHECK-NEXT: [0,4]     . D=eER   .    .   vmovupd	%xmm3, %xmm4
+# CHECK-NEXT: [0,5]     . D==eER  .    .   vmovdqa	%xmm4, %xmm5
+# CHECK-NEXT: [0,6]     .  D==eER .    .   vmovdqu	%xmm5, %xmm0
+# CHECK-NEXT: [1,0]     .  D----R .    .   vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [1,1]     .   DeE--R.    .   vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: [1,2]     .   D=eE-R.    .   vmovups	%xmm1, %xmm2
+# CHECK-NEXT: [1,3]     .    D=eE-R    .   vmovapd	%xmm2, %xmm3
+# CHECK-NEXT: [1,4]     .    D==eER    .   vmovupd	%xmm3, %xmm4
+# CHECK-NEXT: [1,5]     .    .D==eER   .   vmovdqa	%xmm4, %xmm5
+# CHECK-NEXT: [1,6]     .    .D===eER  .   vmovdqu	%xmm5, %xmm0
+# CHECK-NEXT: [2,0]     .    . D----R  .   vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [2,1]     .    . DeE---R .   vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: [2,2]     .    .  DeE--R .   vmovups	%xmm1, %xmm2
+# CHECK-NEXT: [2,3]     .    .  D=eE--R.   vmovapd	%xmm2, %xmm3
+# CHECK-NEXT: [2,4]     .    .   D=eE-R.   vmovupd	%xmm3, %xmm4
+# CHECK-NEXT: [2,5]     .    .   D==eE-R   vmovdqa	%xmm4, %xmm5
+# CHECK-NEXT: [2,6]     .    .    D==eER   vmovdqu	%xmm5, %xmm0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     0.0    0.0    2.7       vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: 1.     3     1.0    1.0    1.7       vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: 2.     3     1.3    0.0    1.0       vmovups	%xmm1, %xmm2
+# CHECK-NEXT: 3.     3     2.0    0.0    1.0       vmovapd	%xmm2, %xmm3
+# CHECK-NEXT: 4.     3     2.3    0.0    0.3       vmovupd	%xmm3, %xmm4
+# CHECK-NEXT: 5.     3     3.0    0.0    0.3       vmovdqa	%xmm4, %xmm5
+# CHECK-NEXT: 6.     3     3.3    0.0    0.0       vmovdqu	%xmm5, %xmm0
-- 
GitLab


From 09f76c80ba8d7572c8121d173b32d24f7259a795 Mon Sep 17 00:00:00 2001
From: Guillaume Chatelet <gchatelet@google.com>
Date: Wed, 10 Oct 2018 14:57:32 +0000
Subject: [PATCH 0005/1116] [llvm-exegesis][NFC] Pass Instruction instead of
 bare Opcode

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344145 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-exegesis/lib/Latency.cpp               |  3 +--
 tools/llvm-exegesis/lib/Latency.h                 |  2 +-
 tools/llvm-exegesis/lib/MCInstrDescView.cpp       | 15 +++++++++++----
 tools/llvm-exegesis/lib/MCInstrDescView.h         |  1 +
 tools/llvm-exegesis/lib/SnippetGenerator.cpp      |  4 ++--
 tools/llvm-exegesis/lib/SnippetGenerator.h        |  4 ++--
 tools/llvm-exegesis/lib/Uops.cpp                  |  6 ++----
 tools/llvm-exegesis/lib/Uops.h                    |  2 +-
 tools/llvm-exegesis/lib/X86/Target.cpp            | 10 ++++------
 tools/llvm-exegesis/llvm-exegesis.cpp             | 14 +++++++-------
 .../llvm-exegesis/X86/SnippetGeneratorTest.cpp    |  8 +++++---
 11 files changed, 37 insertions(+), 32 deletions(-)

diff --git a/tools/llvm-exegesis/lib/Latency.cpp b/tools/llvm-exegesis/lib/Latency.cpp
index ea646f4f261..f6786b123ad 100644
--- a/tools/llvm-exegesis/lib/Latency.cpp
+++ b/tools/llvm-exegesis/lib/Latency.cpp
@@ -57,8 +57,7 @@ LatencySnippetGenerator::generateTwoInstructionPrototype(
 }
 
 llvm::Expected<CodeTemplate>
-LatencySnippetGenerator::generateCodeTemplate(unsigned Opcode) const {
-  const Instruction Instr(State, Opcode);
+LatencySnippetGenerator::generateCodeTemplate(const Instruction &Instr) const {
   if (Instr.hasMemoryOperands())
     return llvm::make_error<BenchmarkFailure>(
         "Infeasible : has memory operands");
diff --git a/tools/llvm-exegesis/lib/Latency.h b/tools/llvm-exegesis/lib/Latency.h
index 37feb62e3dc..83c798f60f3 100644
--- a/tools/llvm-exegesis/lib/Latency.h
+++ b/tools/llvm-exegesis/lib/Latency.h
@@ -27,7 +27,7 @@ public:
   ~LatencySnippetGenerator() override;
 
   llvm::Expected<CodeTemplate>
-  generateCodeTemplate(unsigned Opcode) const override;
+  generateCodeTemplate(const Instruction &Instr) const override;
 
 private:
   llvm::Expected<CodeTemplate>
diff --git a/tools/llvm-exegesis/lib/MCInstrDescView.cpp b/tools/llvm-exegesis/lib/MCInstrDescView.cpp
index d54f3ca2a45..fa9378856f4 100644
--- a/tools/llvm-exegesis/lib/MCInstrDescView.cpp
+++ b/tools/llvm-exegesis/lib/MCInstrDescView.cpp
@@ -88,7 +88,8 @@ const llvm::MCOperandInfo &Operand::getExplicitOperandInfo() const {
 }
 
 Instruction::Instruction(const LLVMState &State, unsigned Opcode)
-    : Description(&State.getInstrInfo().get(Opcode)) {
+    : Description(&State.getInstrInfo().get(Opcode)),
+      Name(State.getInstrInfo().getName(Opcode)) {
   const auto &RATC = State.getRATC();
   unsigned OpIndex = 0;
   for (; OpIndex < Description->getNumOperands(); ++OpIndex) {
@@ -198,6 +199,7 @@ bool Instruction::hasAliasingRegisters() const {
 
 void Instruction::dump(const llvm::MCRegisterInfo &RegInfo,
                        llvm::raw_ostream &Stream) const {
+  Stream << "- " << Name << "\n";
   for (const auto &Op : Operands) {
     Stream << "- Op" << Op.getIndex();
     if (Op.isExplicit())
@@ -227,10 +229,15 @@ void Instruction::dump(const llvm::MCRegisterInfo &RegInfo,
   }
   for (const auto &Var : Variables) {
     Stream << "- Var" << Var.getIndex();
-    Stream << " (";
-    for (auto OperandIndex : Var.TiedOperands)
+    Stream << " [";
+    bool IsFirst = true;
+    for (auto OperandIndex : Var.TiedOperands) {
+      if (!IsFirst)
+        Stream << ",";
       Stream << "Op" << OperandIndex;
-    Stream << ")";
+      IsFirst = false;
+    }
+    Stream << "]";
     Stream << "\n";
   }
   if (hasMemoryOperands())
diff --git a/tools/llvm-exegesis/lib/MCInstrDescView.h b/tools/llvm-exegesis/lib/MCInstrDescView.h
index 914bf51a22b..265476ae125 100644
--- a/tools/llvm-exegesis/lib/MCInstrDescView.h
+++ b/tools/llvm-exegesis/lib/MCInstrDescView.h
@@ -130,6 +130,7 @@ struct Instruction {
             llvm::raw_ostream &Stream) const;
 
   const llvm::MCInstrDesc *Description; // Never nullptr.
+  llvm::StringRef Name;                 // The name of this instruction.
   llvm::SmallVector<Operand, 8> Operands;
   llvm::SmallVector<Variable, 4> Variables;
   llvm::BitVector ImplDefRegs; // The set of aliased implicit def registers.
diff --git a/tools/llvm-exegesis/lib/SnippetGenerator.cpp b/tools/llvm-exegesis/lib/SnippetGenerator.cpp
index 16dbd214e95..f7a76d88ccf 100644
--- a/tools/llvm-exegesis/lib/SnippetGenerator.cpp
+++ b/tools/llvm-exegesis/lib/SnippetGenerator.cpp
@@ -30,8 +30,8 @@ SnippetGenerator::SnippetGenerator(const LLVMState &State) : State(State) {}
 SnippetGenerator::~SnippetGenerator() = default;
 
 llvm::Expected<std::vector<BenchmarkCode>>
-SnippetGenerator::generateConfigurations(unsigned Opcode) const {
-  if (auto E = generateCodeTemplate(Opcode)) {
+SnippetGenerator::generateConfigurations(const Instruction &Instr) const {
+  if (auto E = generateCodeTemplate(Instr)) {
     CodeTemplate &CT = E.get();
     const auto &RATC = State.getRATC();
     const llvm::BitVector &ForbiddenRegs =
diff --git a/tools/llvm-exegesis/lib/SnippetGenerator.h b/tools/llvm-exegesis/lib/SnippetGenerator.h
index 24afe95fda0..c9a19cd0eeb 100644
--- a/tools/llvm-exegesis/lib/SnippetGenerator.h
+++ b/tools/llvm-exegesis/lib/SnippetGenerator.h
@@ -46,7 +46,7 @@ public:
 
   // Calls generateCodeTemplate and expands it into one or more BenchmarkCode.
   llvm::Expected<std::vector<BenchmarkCode>>
-  generateConfigurations(unsigned Opcode) const;
+  generateConfigurations(const Instruction &Instr) const;
 
   // Given a snippet, computes which registers the setup code needs to define.
   std::vector<RegisterValue> computeRegisterInitialValues(
@@ -66,7 +66,7 @@ protected:
 private:
   // API to be implemented by subclasses.
   virtual llvm::Expected<CodeTemplate>
-  generateCodeTemplate(unsigned Opcode) const = 0;
+  generateCodeTemplate(const Instruction &Instr) const = 0;
 };
 
 // A global Random Number Generator to randomize configurations.
diff --git a/tools/llvm-exegesis/lib/Uops.cpp b/tools/llvm-exegesis/lib/Uops.cpp
index fdb6a27ab59..1a701d169eb 100644
--- a/tools/llvm-exegesis/lib/Uops.cpp
+++ b/tools/llvm-exegesis/lib/Uops.cpp
@@ -125,13 +125,11 @@ void UopsSnippetGenerator::instantiateMemoryOperands(
 }
 
 llvm::Expected<CodeTemplate>
-UopsSnippetGenerator::generateCodeTemplate(unsigned Opcode) const {
-  const auto &ET = State.getExegesisTarget();
+UopsSnippetGenerator::generateCodeTemplate(const Instruction &Instr) const {
   CodeTemplate CT;
-
   const llvm::BitVector *ScratchSpaceAliasedRegs = nullptr;
-  const Instruction Instr(State, Opcode);
   if (Instr.hasMemoryOperands()) {
+    const auto &ET = State.getExegesisTarget();
     CT.ScratchSpacePointerInReg =
         ET.getScratchMemoryRegister(State.getTargetMachine().getTargetTriple());
     if (CT.ScratchSpacePointerInReg == 0)
diff --git a/tools/llvm-exegesis/lib/Uops.h b/tools/llvm-exegesis/lib/Uops.h
index 33d0d8b1596..1cfa8242078 100644
--- a/tools/llvm-exegesis/lib/Uops.h
+++ b/tools/llvm-exegesis/lib/Uops.h
@@ -26,7 +26,7 @@ public:
   ~UopsSnippetGenerator() override;
 
   llvm::Expected<CodeTemplate>
-  generateCodeTemplate(unsigned Opcode) const override;
+  generateCodeTemplate(const Instruction &Instr) const override;
 
   static constexpr const size_t kMinNumDifferentAddresses = 6;
 
diff --git a/tools/llvm-exegesis/lib/X86/Target.cpp b/tools/llvm-exegesis/lib/X86/Target.cpp
index 8c03f1ac826..440996ad555 100644
--- a/tools/llvm-exegesis/lib/X86/Target.cpp
+++ b/tools/llvm-exegesis/lib/X86/Target.cpp
@@ -26,10 +26,9 @@ template <typename Impl> class X86SnippetGenerator : public Impl {
   using Impl::Impl;
 
   llvm::Expected<CodeTemplate>
-  generateCodeTemplate(unsigned Opcode) const override {
+  generateCodeTemplate(const Instruction &Instr) const override {
     // Test whether we can generate a snippet for this instruction.
-    const auto &InstrInfo = this->State.getInstrInfo();
-    const auto OpcodeName = InstrInfo.getName(Opcode);
+    const auto OpcodeName = Instr.Name;
     if (OpcodeName.startswith("POPF") || OpcodeName.startswith("PUSHF") ||
         OpcodeName.startswith("ADJCALLSTACK")) {
       return llvm::make_error<BenchmarkFailure>(
@@ -38,8 +37,7 @@ template <typename Impl> class X86SnippetGenerator : public Impl {
 
     // Handle X87.
     const unsigned FPInstClass =
-        InstrInfo.get(Opcode).TSFlags & llvm::X86II::FPTypeMask;
-    const Instruction Instr(this->State, Opcode);
+        Instr.Description->TSFlags & llvm::X86II::FPTypeMask;
     switch (FPInstClass) {
     case llvm::X86II::NotFP:
       break;
@@ -67,7 +65,7 @@ template <typename Impl> class X86SnippetGenerator : public Impl {
     }
 
     // Fallback to generic implementation.
-    return Impl::Base::generateCodeTemplate(Opcode);
+    return Impl::Base::generateCodeTemplate(Instr);
   }
 };
 
diff --git a/tools/llvm-exegesis/llvm-exegesis.cpp b/tools/llvm-exegesis/llvm-exegesis.cpp
index 8fed1375c6f..b4891f1f1db 100644
--- a/tools/llvm-exegesis/llvm-exegesis.cpp
+++ b/tools/llvm-exegesis/llvm-exegesis.cpp
@@ -124,12 +124,8 @@ static unsigned getOpcodeOrDie(const llvm::MCInstrInfo &MCInstrInfo) {
 // Generates code snippets for opcode `Opcode`.
 static llvm::Expected<std::vector<BenchmarkCode>>
 generateSnippets(const LLVMState &State, unsigned Opcode) {
-  const std::unique_ptr<SnippetGenerator> Generator =
-      State.getExegesisTarget().createSnippetGenerator(BenchmarkMode, State);
-  if (!Generator)
-    llvm::report_fatal_error("cannot create snippet generator");
-
-  const llvm::MCInstrDesc &InstrDesc = State.getInstrInfo().get(Opcode);
+  const Instruction Instr(State, Opcode);
+  const llvm::MCInstrDesc &InstrDesc = *Instr.Description;
   // Ignore instructions that we cannot run.
   if (InstrDesc.isPseudo())
     return llvm::make_error<BenchmarkFailure>("Unsupported opcode: isPseudo");
@@ -140,7 +136,11 @@ generateSnippets(const LLVMState &State, unsigned Opcode) {
     return llvm::make_error<BenchmarkFailure>(
         "Unsupported opcode: isCall/isReturn");
 
-  return Generator->generateConfigurations(Opcode);
+  const std::unique_ptr<SnippetGenerator> Generator =
+      State.getExegesisTarget().createSnippetGenerator(BenchmarkMode, State);
+  if (!Generator)
+    llvm::report_fatal_error("cannot create snippet generator");
+  return Generator->generateConfigurations(Instr);
 }
 
 namespace {
diff --git a/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp b/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp
index f2539aaea18..4df489df06f 100644
--- a/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp
+++ b/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp
@@ -59,7 +59,8 @@ protected:
 
   CodeTemplate checkAndGetCodeTemplate(unsigned Opcode) {
     randomGenerator().seed(0); // Initialize seed.
-    auto CodeTemplateOrError = Generator.generateCodeTemplate(Opcode);
+    const Instruction Instr(State, Opcode);
+    auto CodeTemplateOrError = Generator.generateCodeTemplate(Instr);
     EXPECT_FALSE(CodeTemplateOrError.takeError()); // Valid configuration.
     return std::move(CodeTemplateOrError.get());
   }
@@ -238,7 +239,8 @@ TEST_F(UopsSnippetGeneratorTest, MemoryUse) {
 TEST_F(UopsSnippetGeneratorTest, MemoryUse_Movsb) {
   // MOVSB writes to scratch memory register.
   const unsigned Opcode = llvm::X86::MOVSB;
-  auto Error = Generator.generateCodeTemplate(Opcode).takeError();
+  const Instruction Instr(State, Opcode);
+  auto Error = Generator.generateCodeTemplate(Instr).takeError();
   EXPECT_TRUE((bool)Error);
   llvm::consumeError(std::move(Error));
 }
@@ -253,7 +255,7 @@ public:
 
 private:
   llvm::Expected<CodeTemplate>
-  generateCodeTemplate(unsigned Opcode) const override {
+  generateCodeTemplate(const Instruction &Instr) const override {
     return llvm::make_error<llvm::StringError>("not implemented",
                                                llvm::inconvertibleErrorCode());
   }
-- 
GitLab


From d93bcaaa5bc5c40187a582a78c39d240db9cebbc Mon Sep 17 00:00:00 2001
From: Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>
Date: Wed, 10 Oct 2018 16:08:02 +0000
Subject: [PATCH 0006/1116] [llvm-mca] Minor refactoring in preparation for a
 patch that will fully fix PR36671. NFCI

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344149 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../include/HardwareUnits/RegisterFile.h         | 11 ++++++-----
 .../llvm-mca/lib/HardwareUnits/RegisterFile.cpp  | 16 +++++++++-------
 2 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/tools/llvm-mca/include/HardwareUnits/RegisterFile.h b/tools/llvm-mca/include/HardwareUnits/RegisterFile.h
index 1026079c377..6a45c707de0 100644
--- a/tools/llvm-mca/include/HardwareUnits/RegisterFile.h
+++ b/tools/llvm-mca/include/HardwareUnits/RegisterFile.h
@@ -68,9 +68,11 @@ class RegisterFile : public HardwareUnit {
     bool AllowZeroMoveEliminationOnly;
 
     RegisterMappingTracker(unsigned NumPhysRegisters,
-                           unsigned MaxMoveEliminated = 0U)
+                           unsigned MaxMoveEliminated = 0U,
+                           bool AllowZeroMoveElimOnly = false)
         : NumPhysRegs(NumPhysRegisters), NumUsedPhysRegs(0),
-          MaxMoveEliminatedPerCycle(MaxMoveEliminated), NumMoveEliminated(0U) {}
+          MaxMoveEliminatedPerCycle(MaxMoveEliminated), NumMoveEliminated(0U),
+          AllowZeroMoveEliminationOnly(AllowZeroMoveElimOnly) {}
   };
 
   // A vector of register file descriptors.  This set always contains at least
@@ -151,9 +153,8 @@ class RegisterFile : public HardwareUnit {
   // Here FPRegisterFile contains all the registers defined by register class
   // VR128RegClass and VR256RegClass. FPRegisterFile implements 60
   // registers which can be used for register renaming purpose.
-  void
-  addRegisterFile(llvm::ArrayRef<llvm::MCRegisterCostEntry> RegisterClasses,
-                  unsigned NumPhysRegs);
+  void addRegisterFile(const llvm::MCRegisterFileDesc &RF,
+                       llvm::ArrayRef<llvm::MCRegisterCostEntry> Entries);
 
   // Consumes physical registers in each register file specified by the
   // `IndexPlusCostPairTy`. This method is called from `addRegisterMapping()`.
diff --git a/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp b/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp
index 51a24786139..01131253b5b 100644
--- a/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp
+++ b/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp
@@ -37,7 +37,7 @@ void RegisterFile::initialize(const MCSchedModel &SM, unsigned NumRegs) {
   // declared by the target. The number of physical registers in the default
   // register file is set equal to `NumRegs`. A value of zero for `NumRegs`
   // means: this register file has an unbounded number of physical registers.
-  addRegisterFile({} /* all registers */, NumRegs);
+  RegisterFiles.emplace_back(NumRegs);
   if (!SM.hasExtraProcessorInfo())
     return;
 
@@ -48,15 +48,17 @@ void RegisterFile::initialize(const MCSchedModel &SM, unsigned NumRegs) {
   for (unsigned I = 0, E = Info.NumRegisterFiles; I < E; ++I) {
     const MCRegisterFileDesc &RF = Info.RegisterFiles[I];
     // Skip invalid register files with zero physical registers.
-    unsigned Length = RF.NumRegisterCostEntries;
+    // TODO: verify this constraint in SubtargetEmitter, and convert this
+    // statement into an assert.
     if (!RF.NumPhysRegs)
       continue;
+
     // The cost of a register definition is equivalent to the number of
     // physical registers that are allocated at register renaming stage.
+    unsigned Length = RF.NumRegisterCostEntries;
     const MCRegisterCostEntry *FirstElt =
         &Info.RegisterCostTable[RF.RegisterCostEntryIdx];
-    addRegisterFile(ArrayRef<MCRegisterCostEntry>(FirstElt, Length),
-                    RF.NumPhysRegs);
+    addRegisterFile(RF, ArrayRef<MCRegisterCostEntry>(FirstElt, Length));
   }
 }
 
@@ -65,15 +67,15 @@ void RegisterFile::cycleStart() {
     RMT.NumMoveEliminated = 0;
 }
 
-void RegisterFile::addRegisterFile(ArrayRef<MCRegisterCostEntry> Entries,
-                                   unsigned NumPhysRegs) {
+void RegisterFile::addRegisterFile(const MCRegisterFileDesc &RF,
+                                   ArrayRef<MCRegisterCostEntry> Entries) {
   // A default register file is always allocated at index #0. That register file
   // is mainly used to count the total number of mappings created by all
   // register files at runtime. Users can limit the number of available physical
   // registers in register file #0 through the command line flag
   // `-register-file-size`.
   unsigned RegisterFileIndex = RegisterFiles.size();
-  RegisterFiles.emplace_back(NumPhysRegs);
+  RegisterFiles.emplace_back(RF.NumPhysRegs);
 
   // Special case where there is no register class identifier in the set.
   // An empty set of register classes means: this register file contains all
-- 
GitLab


From 0768811666e1fa7b814858ed657b2b2e0055a8f7 Mon Sep 17 00:00:00 2001
From: Guillaume Chatelet <gchatelet@google.com>
Date: Wed, 10 Oct 2018 16:16:43 +0000
Subject: [PATCH 0007/1116] [llvm-exegesis] Fix always true assert

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344151 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-exegesis/lib/MCInstrDescView.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/llvm-exegesis/lib/MCInstrDescView.h b/tools/llvm-exegesis/lib/MCInstrDescView.h
index 265476ae125..6910538a31f 100644
--- a/tools/llvm-exegesis/lib/MCInstrDescView.h
+++ b/tools/llvm-exegesis/lib/MCInstrDescView.h
@@ -81,7 +81,7 @@ struct Operand {
   const llvm::MCOperandInfo &getExplicitOperandInfo() const;
 
   // Please use the accessors above and not the following fields.
-  unsigned Index = 0;
+  int Index = -1;
   bool IsDef = false;
   const RegisterAliasingTracker *Tracker = nullptr; // Set for Register Op.
   const llvm::MCOperandInfo *Info = nullptr;        // Set for Explicit Op.
-- 
GitLab


From 7b3bebb197858be97876e0eb8b98b2f0be71387b Mon Sep 17 00:00:00 2001
From: Scott Linder <scott@scottlinder.com>
Date: Wed, 10 Oct 2018 16:35:47 +0000
Subject: [PATCH 0008/1116] Relax trivial cast requirements in
 CallPromotionUtils

Differential Revision: https://reviews.llvm.org/D52792


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344153 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Utils/CallPromotionUtils.cpp   | 14 +++---
 .../Util/call-promotion-utils-ptrcast.ll      | 50 +++++++++++++++++++
 2 files changed, 58 insertions(+), 6 deletions(-)
 create mode 100644 test/Transforms/Util/call-promotion-utils-ptrcast.ll

diff --git a/lib/Transforms/Utils/CallPromotionUtils.cpp b/lib/Transforms/Utils/CallPromotionUtils.cpp
index 6d18d061461..261ab87c3e7 100644
--- a/lib/Transforms/Utils/CallPromotionUtils.cpp
+++ b/lib/Transforms/Utils/CallPromotionUtils.cpp
@@ -177,8 +177,8 @@ static void createRetBitCast(CallSite CS, Type *RetTy, CastInst **RetBitCast) {
     InsertBefore = &*std::next(CS.getInstruction()->getIterator());
 
   // Bitcast the return value to the correct type.
-  auto *Cast = CastInst::Create(Instruction::BitCast, CS.getInstruction(),
-                                RetTy, "", InsertBefore);
+  auto *Cast = CastInst::CreateBitOrPointerCast(CS.getInstruction(), RetTy, "",
+                                                InsertBefore);
   if (RetBitCast)
     *RetBitCast = Cast;
 
@@ -321,12 +321,14 @@ bool llvm::isLegalToPromote(CallSite CS, Function *Callee,
                             const char **FailureReason) {
   assert(!CS.getCalledFunction() && "Only indirect call sites can be promoted");
 
+  auto &DL = Callee->getParent()->getDataLayout();
+
   // Check the return type. The callee's return value type must be bitcast
   // compatible with the call site's type.
   Type *CallRetTy = CS.getInstruction()->getType();
   Type *FuncRetTy = Callee->getReturnType();
   if (CallRetTy != FuncRetTy)
-    if (!CastInst::isBitCastable(FuncRetTy, CallRetTy)) {
+    if (!CastInst::isBitOrNoopPointerCastable(FuncRetTy, CallRetTy, DL)) {
       if (FailureReason)
         *FailureReason = "Return type mismatch";
       return false;
@@ -351,7 +353,7 @@ bool llvm::isLegalToPromote(CallSite CS, Function *Callee,
     Type *ActualTy = CS.getArgument(I)->getType();
     if (FormalTy == ActualTy)
       continue;
-    if (!CastInst::isBitCastable(ActualTy, FormalTy)) {
+    if (!CastInst::isBitOrNoopPointerCastable(ActualTy, FormalTy, DL)) {
       if (FailureReason)
         *FailureReason = "Argument type mismatch";
       return false;
@@ -396,8 +398,8 @@ Instruction *llvm::promoteCall(CallSite CS, Function *Callee,
     Type *FormalTy = CalleeType->getParamType(ArgNo);
     Type *ActualTy = Arg->getType();
     if (FormalTy != ActualTy) {
-      auto *Cast = CastInst::Create(Instruction::BitCast, Arg, FormalTy, "",
-                                    CS.getInstruction());
+      auto *Cast = CastInst::CreateBitOrPointerCast(Arg, FormalTy, "",
+                                                    CS.getInstruction());
       CS.setArgument(ArgNo, Cast);
     }
   }
diff --git a/test/Transforms/Util/call-promotion-utils-ptrcast.ll b/test/Transforms/Util/call-promotion-utils-ptrcast.ll
new file mode 100644
index 00000000000..351ec292f18
--- /dev/null
+++ b/test/Transforms/Util/call-promotion-utils-ptrcast.ll
@@ -0,0 +1,50 @@
+; RUN: opt -S -pgo-icall-prom -icp-total-percent-threshold=0 -icp-max-prom=4 < %s 2>&1 | FileCheck %s
+
+; Test that CallPromotionUtils will promote calls which require pointer casts.
+
+@foo = common global i64 (i64)* null, align 8
+
+; Check ptrcast arguments.
+define i64 @func1(i8* %a) {
+  ret i64 undef
+}
+
+; Check ptrcast return.
+define i8* @func2(i64 %a) {
+  ret i8* undef
+}
+
+; Check ptrcast arguments and return.
+define i8* @func3(i8 *%a) {
+  ret i8* undef
+}
+
+; Check mixed ptrcast and bitcast.
+define i8* @func4(double %f) {
+  ret i8* undef
+}
+
+define i64 @bar() {
+  %tmp = load i64 (i64)*, i64 (i64)** @foo, align 8
+
+; CHECK: [[ARG:%[0-9]+]] = bitcast i64 1 to double
+; CHECK-NEXT: [[RET:%[0-9]+]] = call i8* @func4(double [[ARG]])
+; CHECK-NEXT: ptrtoint i8* [[RET]] to i64
+
+; CHECK: [[RET:%[0-9]+]] = call i8* @func2(i64 1)
+; CHECK-NEXT: ptrtoint i8* [[RET]] to i64
+
+; CHECK: [[ARG:%[0-9]+]] = inttoptr i64 1 to i8*
+; CHECK-NEXT: [[RET:%[0-9]+]] = call i8* @func3(i8* [[ARG]])
+; CHECK-NEXT: ptrtoint i8* [[RET]] to i64
+
+; CHECK: [[ARG:%[0-9]+]] = inttoptr i64 1 to i8*
+; CHECK-NEXT: call i64 @func1(i8* [[ARG]])
+; CHECK-NOT: ptrtoint
+; CHECK-NOT: bitcast
+
+  %call = call i64 %tmp(i64 1), !prof !1
+  ret i64 %call
+}
+
+!1 = !{!"VP", i32 0, i64 1600, i64 7651369219802541373, i64 1030, i64 -4377547752858689819, i64 410, i64 -6929281286627296573, i64 150, i64 -2545542355363006406, i64 10}
-- 
GitLab


From a50609a0fa137e401a255fcbe050e1eeaf7a77a6 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Wed, 10 Oct 2018 17:37:32 +0000
Subject: [PATCH 0009/1116] Change the timestamp of llvmcache-foo file to meet
 the thinLTO prune policy

The case will randomly fail if we test it with command "
while llvm-lit test/tools/gold/X86/cache.ll ; do true; done". It is because the llvmcache-foo file is younger than llvmcache-349F039B8EB076D412007D82778442BED3148C4E and llvmcache-A8107945C65C2B2BBEE8E61AA604C311D60D58D6. But due to timestamp precision reason their timestamp is the same. Given the same timestamp, the file prune policy is to remove bigger size file first, so mostly foo file is removed for its bigger size. And the files size is under threshold after deleting foo file. That's what test case expect.

However sometimes, the precision is enough to measure that timestamp of llvmcache-349F039B8EB076D412007D82778442BED3148C4E and llvmcache-A8107945C65C2B2BBEE8E61AA604C311D60D58D6 are smaller than foo, so llvmcache-349F039B8EB076D412007D82778442BED3148C4E and llvmcache-A8107945C65C2B2BBEE8E61AA604C311D60D58D6 are deleted first. Since the files size is still above the file size threshold after deleting the 2 files, the foo file is also deleted. And then the test case fails, because it expect only one file should be deleted instead of 3.

The fix is to change the timestamp of llvmcache-foo file to meet the thinLTO prune policy.

Patch by Luo Yuanke.

Differential Revision: https://reviews.llvm.org/D52452

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344158 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/tools/gold/X86/cache.ll | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/test/tools/gold/X86/cache.ll b/test/tools/gold/X86/cache.ll
index 51ffee282b1..4446aa6d887 100644
--- a/test/tools/gold/X86/cache.ll
+++ b/test/tools/gold/X86/cache.ll
@@ -53,6 +53,9 @@
 ; RUN: ls %t.cache | count 5
 
 
+; Increase the age of llvmcache-foo
+; RUN: touch -r %t.cache/llvmcache-foo -d '-2 minutes' %t.cache/llvmcache-foo
+
 ; This should remove it.
 ; RUN: %gold -m elf_x86_64 -plugin %llvmshlibdir/LLVMgold%shlibext \
 ; RUN:     --plugin-opt=thinlto \
-- 
GitLab


From 3b607cb1fc8c262c2e8496c44814325a8cd0729a Mon Sep 17 00:00:00 2001
From: Renato Golin <renato.golin@linaro.org>
Date: Wed, 10 Oct 2018 17:55:21 +0000
Subject: [PATCH 0010/1116] [VPlan] Fix CondBit quoting in dumpBasicBlock

Quotes were being printed for VPInstructions but not the rest.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344161 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/VPlan.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lib/Transforms/Vectorize/VPlan.cpp b/lib/Transforms/Vectorize/VPlan.cpp
index 09d20fbdefe..39cb4e9ec68 100644
--- a/lib/Transforms/Vectorize/VPlan.cpp
+++ b/lib/Transforms/Vectorize/VPlan.cpp
@@ -543,8 +543,10 @@ void VPlanPrinter::dumpBasicBlock(const VPBasicBlock *BasicBlock) {
     if (const VPInstruction *CBI = dyn_cast<VPInstruction>(CBV)) {
       CBI->printAsOperand(OS);
       OS << " (" << DOT::EscapeString(CBI->getParent()->getName()) << ")\\l\"";
-    } else
+    } else {
       CBV->printAsOperand(OS);
+      OS << "\"";
+    }
   }
 
   bumpIndent(-2);
-- 
GitLab


From 1cc98e6672b6319fdb00b70dd4474aabdadbe193 Mon Sep 17 00:00:00 2001
From: Francis Visoiu Mistrih <francisvm@yahoo.com>
Date: Wed, 10 Oct 2018 17:58:09 +0000
Subject: [PATCH 0011/1116] [OptRemarks] Add library for parsing optimization
 remarks

Add a library that parses optimization remarks (currently YAML, so based
on the YAMLParser).

The goal is to be able to provide tools a remark parser that is not
completely dependent on YAML, in case we decide to change the format
later.

It exposes a C API which takes a handler that is called with the remark
structure.

It adds a libLLVMOptRemark.a static library, and it's used in-tree by
the llvm-opt-report tool (from which the parser has been mostly moved
out).

Differential Revision: https://reviews.llvm.org/D52776

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344162 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm-c/OptRemarks.h                   | 197 ++++++++
 lib/CMakeLists.txt                            |   1 +
 lib/LLVMBuild.txt                             |   1 +
 lib/OptRemarks/CMakeLists.txt                 |   3 +
 lib/OptRemarks/LLVMBuild.txt                  |  22 +
 lib/OptRemarks/OptRemarksParser.cpp           | 368 +++++++++++++++
 tools/llvm-opt-report/CMakeLists.txt          |   2 +-
 tools/llvm-opt-report/OptReport.cpp           | 144 ++----
 unittests/CMakeLists.txt                      |   1 +
 unittests/OptRemarks/CMakeLists.txt           |   8 +
 .../OptRemarks/OptRemarksParsingTest.cpp      | 433 ++++++++++++++++++
 11 files changed, 1073 insertions(+), 107 deletions(-)
 create mode 100644 include/llvm-c/OptRemarks.h
 create mode 100644 lib/OptRemarks/CMakeLists.txt
 create mode 100644 lib/OptRemarks/LLVMBuild.txt
 create mode 100644 lib/OptRemarks/OptRemarksParser.cpp
 create mode 100644 unittests/OptRemarks/CMakeLists.txt
 create mode 100644 unittests/OptRemarks/OptRemarksParsingTest.cpp

diff --git a/include/llvm-c/OptRemarks.h b/include/llvm-c/OptRemarks.h
new file mode 100644
index 00000000000..f3449cc1b8c
--- /dev/null
+++ b/include/llvm-c/OptRemarks.h
@@ -0,0 +1,197 @@
+/*===-- llvm-c/OptRemarks.h - OptRemarks Public C Interface -------*- C -*-===*\
+|*                                                                            *|
+|*                     The LLVM Compiler Infrastructure                       *|
+|*                                                                            *|
+|* This file is distributed under the University of Illinois Open Source      *|
+|* License. See LICENSE.TXT for details.                                      *|
+|*                                                                            *|
+|*===----------------------------------------------------------------------===*|
+|*                                                                            *|
+|* This header provides a public interface to an opt-remark library.          *|
+|* LLVM provides an implementation of this interface.                         *|
+|*                                                                            *|
+\*===----------------------------------------------------------------------===*/
+
+#ifndef LLVM_C_OPT_REMARKS_H
+#define LLVM_C_OPT_REMARKS_H
+
+#include "llvm-c/Core.h"
+#include "llvm-c/Types.h"
+#ifdef __cplusplus
+#include <cstddef>
+extern "C" {
+#else
+#include <stddef.h>
+#endif /* !defined(__cplusplus) */
+
+/**
+ * @defgroup LLVMCOPTREMARKS OptRemarks
+ * @ingroup LLVMC
+ *
+ * @{
+ */
+
+#define OPT_REMARKS_API_VERSION 0
+
+/**
+ * String containing a buffer and a length. The buffer is not guaranteed to be
+ * zero-terminated.
+ *
+ * \since OPT_REMARKS_API_VERSION=0
+ */
+typedef struct {
+  const char *Str;
+  uint32_t Len;
+} LLVMOptRemarkStringRef;
+
+/**
+ * DebugLoc containing File, Line and Column.
+ *
+ * \since OPT_REMARKS_API_VERSION=0
+ */
+typedef struct {
+  // File:
+  LLVMOptRemarkStringRef SourceFile;
+  // Line:
+  uint32_t SourceLineNumber;
+  // Column:
+  uint32_t SourceColumnNumber;
+} LLVMOptRemarkDebugLoc;
+
+/**
+ * Element of the "Args" list. The key might give more information about what
+ * are the semantics of the value, e.g. "Callee" will tell you that the value
+ * is a symbol that names a function.
+ *
+ * \since OPT_REMARKS_API_VERSION=0
+ */
+typedef struct {
+  // e.g. "Callee"
+  LLVMOptRemarkStringRef Key;
+  // e.g. "malloc"
+  LLVMOptRemarkStringRef Value;
+
+  // "DebugLoc": Optional
+  LLVMOptRemarkDebugLoc DebugLoc;
+} LLVMOptRemarkArg;
+
+/**
+ * One remark entry.
+ *
+ * \since OPT_REMARKS_API_VERSION=0
+ */
+typedef struct {
+  // e.g. !Missed, !Passed
+  LLVMOptRemarkStringRef RemarkType;
+  // "Pass": Required
+  LLVMOptRemarkStringRef PassName;
+  // "Name": Required
+  LLVMOptRemarkStringRef RemarkName;
+  // "Function": Required
+  LLVMOptRemarkStringRef FunctionName;
+
+  // "DebugLoc": Optional
+  LLVMOptRemarkDebugLoc DebugLoc;
+  // "Hotness": Optional
+  uint32_t Hotness;
+  // "Args": Optional. It is an array of `num_args` elements.
+  uint32_t NumArgs;
+  LLVMOptRemarkArg *Args;
+} LLVMOptRemarkEntry;
+
+typedef struct LLVMOptRemarkOpaqueParser *LLVMOptRemarkParserRef;
+
+/**
+ * Creates a remark parser that can be used to read and parse the buffer located
+ * in \p Buf of size \p Size.
+ *
+ * \p Buf cannot be NULL.
+ *
+ * This function should be paired with LLVMOptRemarkParserDispose() to avoid
+ * leaking resources.
+ *
+ * \since OPT_REMARKS_API_VERSION=0
+ */
+extern LLVMOptRemarkParserRef LLVMOptRemarkParserCreate(const void *Buf,
+                                                        uint64_t Size);
+
+/**
+ * Returns the next remark in the file.
+ *
+ * The value pointed to by the return value is invalidated by the next call to
+ * LLVMOptRemarkParserGetNext().
+ *
+ * If the parser reaches the end of the buffer, the return value will be NULL.
+ *
+ * In the case of an error, the return value will be NULL, and:
+ *
+ * 1) LLVMOptRemarkParserHasError() will return `1`.
+ *
+ * 2) LLVMOptRemarkParserGetErrorMessage() will return a descriptive error
+ *    message.
+ *
+ * An error may occur if:
+ *
+ * 1) An argument is invalid.
+ *
+ * 2) There is a YAML parsing error. This type of error aborts parsing
+ *    immediately and returns `1`. It can occur on malformed YAML.
+ *
+ * 3) Remark parsing error. If this type of error occurs, the parser won't call
+ *    the handler and will continue to the next one. It can occur on malformed
+ *    remarks, like missing or extra fields in the file.
+ *
+ * Here is a quick example of the usage:
+ *
+ * ```
+ *  LLVMOptRemarkParserRef Parser = LLVMOptRemarkParserCreate(Buf, Size);
+ *  LLVMOptRemarkEntry *Remark = NULL;
+ *  while ((Remark == LLVMOptRemarkParserGetNext(Parser))) {
+ *    // use Remark
+ *  }
+ *  bool HasError = LLVMOptRemarkParserHasError(Parser);
+ *  LLVMOptRemarkParserDispose(Parser);
+ * ```
+ *
+ * \since OPT_REMARKS_API_VERSION=0
+ */
+extern LLVMOptRemarkEntry *
+LLVMOptRemarkParserGetNext(LLVMOptRemarkParserRef Parser);
+
+/**
+ * Returns `1` if the parser encountered an error while parsing the buffer.
+ *
+ * \since OPT_REMARKS_API_VERSION=0
+ */
+extern LLVMBool LLVMOptRemarkParserHasError(LLVMOptRemarkParserRef Parser);
+
+/**
+ * Returns a null-terminated string containing an error message.
+ *
+ * In case of no error, the result is `NULL`.
+ *
+ * The memory of the string is bound to the lifetime of \p Parser. If
+ * LLVMOptRemarkParserDispose() is called, the memory of the string will be
+ * released.
+ *
+ * \since OPT_REMARKS_API_VERSION=0
+ */
+extern const char *
+LLVMOptRemarkParserGetErrorMessage(LLVMOptRemarkParserRef Parser);
+
+/**
+ * Releases all the resources used by \p Parser.
+ *
+ * \since OPT_REMARKS_API_VERSION=0
+ */
+extern void LLVMOptRemarkParserDispose(LLVMOptRemarkParserRef Parser);
+
+/**
+ * @} // endgoup LLVMCOPTREMARKS
+ */
+
+#ifdef __cplusplus
+}
+#endif /* !defined(__cplusplus) */
+
+#endif /* LLVM_C_OPT_REMARKS_H */
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index ecf8b93d253..1f54c611bad 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -15,6 +15,7 @@ add_subdirectory(MC)
 add_subdirectory(Object)
 add_subdirectory(ObjectYAML)
 add_subdirectory(Option)
+add_subdirectory(OptRemarks)
 add_subdirectory(DebugInfo)
 add_subdirectory(ExecutionEngine)
 add_subdirectory(Target)
diff --git a/lib/LLVMBuild.txt b/lib/LLVMBuild.txt
index a6cd15699fb..0eb4bba2676 100644
--- a/lib/LLVMBuild.txt
+++ b/lib/LLVMBuild.txt
@@ -35,6 +35,7 @@ subdirectories =
  BinaryFormat
  ObjectYAML
  Option
+ OptRemarks
  Passes
  ProfileData
  Support
diff --git a/lib/OptRemarks/CMakeLists.txt b/lib/OptRemarks/CMakeLists.txt
new file mode 100644
index 00000000000..8fefe1d986b
--- /dev/null
+++ b/lib/OptRemarks/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_llvm_library(LLVMOptRemarks
+  OptRemarksParser.cpp
+)
diff --git a/lib/OptRemarks/LLVMBuild.txt b/lib/OptRemarks/LLVMBuild.txt
new file mode 100644
index 00000000000..4c1032296dc
--- /dev/null
+++ b/lib/OptRemarks/LLVMBuild.txt
@@ -0,0 +1,22 @@
+;===- ./lib/OptRemarks/LLVMBuild.txt ---------------------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = OptRemarks
+parent = Libraries
+required_libraries = Support
diff --git a/lib/OptRemarks/OptRemarksParser.cpp b/lib/OptRemarks/OptRemarksParser.cpp
new file mode 100644
index 00000000000..4b8b038c832
--- /dev/null
+++ b/lib/OptRemarks/OptRemarksParser.cpp
@@ -0,0 +1,368 @@
+//===- OptRemarksParser.cpp -----------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides utility methods used by clients that want to use the
+// parser for optimization remarks in LLVM.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-c/OptRemarks.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/YAMLTraits.h"
+
+using namespace llvm;
+
+namespace {
+struct RemarkParser {
+  /// Source manager for better error messages.
+  SourceMgr SM;
+  /// Stream for yaml parsing.
+  yaml::Stream Stream;
+  /// Storage for the error stream.
+  std::string ErrorString;
+  /// The error stream.
+  raw_string_ostream ErrorStream;
+  /// Iterator in the YAML stream.
+  yaml::document_iterator DI;
+  /// The parsed remark (if any).
+  Optional<LLVMOptRemarkEntry> LastRemark;
+  /// Temporary parsing buffer for the arguments.
+  SmallVector<LLVMOptRemarkArg, 8> TmpArgs;
+  /// The state used by the parser to parse a remark entry. Invalidated with
+  /// every call to `parseYAMLElement`.
+  struct ParseState {
+    /// Temporary parsing buffer for the arguments.
+    SmallVectorImpl<LLVMOptRemarkArg> *Args;
+    StringRef Type;
+    StringRef Pass;
+    StringRef Name;
+    StringRef Function;
+    /// Optional.
+    Optional<StringRef> File;
+    Optional<unsigned> Line;
+    Optional<unsigned> Column;
+    Optional<unsigned> Hotness;
+
+    ParseState(SmallVectorImpl<LLVMOptRemarkArg> &Args) : Args(&Args) {}
+    /// Use Args only as a **temporary** buffer.
+    ~ParseState() { Args->clear(); }
+  };
+
+  ParseState State;
+
+  /// Set to `true` if we had any errors during parsing.
+  bool HadAnyErrors = false;
+
+  RemarkParser(StringRef Buf)
+      : SM(), Stream(Buf, SM), ErrorString(), ErrorStream(ErrorString),
+        DI(Stream.begin()), LastRemark(), TmpArgs(), State(TmpArgs) {
+    SM.setDiagHandler(RemarkParser::HandleDiagnostic, this);
+  }
+
+  /// Parse a YAML element.
+  Error parseYAMLElement(yaml::Document &Remark);
+
+private:
+  /// Parse one key to a string.
+  /// otherwise.
+  Error parseKey(StringRef &Result, yaml::KeyValueNode &Node);
+  /// Parse one value to a string.
+  Error parseValue(StringRef &Result, yaml::KeyValueNode &Node);
+  /// Parse one value to an unsigned.
+  Error parseValue(Optional<unsigned> &Result, yaml::KeyValueNode &Node);
+  /// Parse a debug location.
+  Error parseDebugLoc(Optional<StringRef> &File, Optional<unsigned> &Line,
+                      Optional<unsigned> &Column, yaml::KeyValueNode &Node);
+  /// Parse an argument.
+  Error parseArg(SmallVectorImpl<LLVMOptRemarkArg> &TmpArgs, yaml::Node &Node);
+
+  /// Handle a diagnostic from the YAML stream. Records the error in the
+  /// RemarkParser class.
+  static void HandleDiagnostic(const SMDiagnostic &Diag, void *Ctx) {
+    assert(Ctx && "Expected non-null Ctx in diagnostic handler.");
+    auto *Parser = static_cast<RemarkParser *>(Ctx);
+    Diag.print(/*ProgName=*/nullptr, Parser->ErrorStream, /*ShowColors*/ false,
+               /*ShowKindLabels*/ true);
+  }
+};
+
+class ParseError : public ErrorInfo<ParseError> {
+public:
+  static char ID;
+
+  ParseError(StringRef Message, yaml::Node &Node)
+      : Message(Message), Node(Node) {}
+
+  void log(raw_ostream &OS) const override { OS << Message; }
+  std::error_code convertToErrorCode() const override {
+    return inconvertibleErrorCode();
+  }
+
+  StringRef getMessage() const { return Message; }
+  yaml::Node &getNode() const { return Node; }
+
+private:
+  StringRef Message; // No need to hold a full copy of the buffer.
+  yaml::Node &Node;
+};
+
+char ParseError::ID = 0;
+
+static LLVMOptRemarkStringRef toOptRemarkStr(StringRef Str) {
+  return {Str.data(), static_cast<uint32_t>(Str.size())};
+}
+
+Error RemarkParser::parseKey(StringRef &Result, yaml::KeyValueNode &Node) {
+  auto *Key = dyn_cast<yaml::ScalarNode>(Node.getKey());
+  if (!Key)
+    return make_error<ParseError>("key is not a string.", Node);
+
+  Result = Key->getRawValue();
+  return Error::success();
+}
+
+Error RemarkParser::parseValue(StringRef &Result, yaml::KeyValueNode &Node) {
+  auto *Value = dyn_cast<yaml::ScalarNode>(Node.getValue());
+  if (!Value)
+    return make_error<ParseError>("expected a value of scalar type.", Node);
+  Result = Value->getRawValue();
+
+  if (Result.front() == '\'')
+    Result = Result.drop_front();
+
+  if (Result.back() == '\'')
+    Result = Result.drop_back();
+
+  return Error::success();
+}
+
+Error RemarkParser::parseValue(Optional<unsigned> &Result,
+                               yaml::KeyValueNode &Node) {
+  SmallVector<char, 4> Tmp;
+  auto *Value = dyn_cast<yaml::ScalarNode>(Node.getValue());
+  if (!Value)
+    return make_error<ParseError>("expected a value of scalar type.", Node);
+  unsigned UnsignedValue = 0;
+  if (Value->getValue(Tmp).getAsInteger(10, UnsignedValue))
+    return make_error<ParseError>("expected a value of integer type.", *Value);
+  Result = UnsignedValue;
+  return Error::success();
+}
+
+Error RemarkParser::parseDebugLoc(Optional<StringRef> &File,
+                                  Optional<unsigned> &Line,
+                                  Optional<unsigned> &Column,
+                                  yaml::KeyValueNode &Node) {
+  auto *DebugLoc = dyn_cast<yaml::MappingNode>(Node.getValue());
+  if (!DebugLoc)
+    return make_error<ParseError>("expected a value of mapping type.", Node);
+
+  for (yaml::KeyValueNode &DLNode : *DebugLoc) {
+    StringRef KeyName;
+    if (Error E = parseKey(KeyName, DLNode))
+      return E;
+    if (KeyName == "File") {
+      File = StringRef(); // Set the optional to contain a default constructed
+                          // value, to be passed to the parsing function.
+      if (Error E = parseValue(*File, DLNode))
+        return E;
+    } else if (KeyName == "Column") {
+      if (Error E = parseValue(Column, DLNode))
+        return E;
+    } else if (KeyName == "Line") {
+      if (Error E = parseValue(Line, DLNode))
+        return E;
+    } else {
+      return make_error<ParseError>("unknown entry in DebugLoc map.", DLNode);
+    }
+  }
+
+  // If any of the debug loc fields is missing, return an error.
+  if (!File || !Line || !Column)
+    return make_error<ParseError>("DebugLoc node incomplete.", Node);
+
+  return Error::success();
+}
+
+Error RemarkParser::parseArg(SmallVectorImpl<LLVMOptRemarkArg> &Args,
+                             yaml::Node &Node) {
+  auto *ArgMap = dyn_cast<yaml::MappingNode>(&Node);
+  if (!ArgMap)
+    return make_error<ParseError>("expected a value of mapping type.", Node);
+
+  StringRef ValueStr;
+  StringRef KeyStr;
+  Optional<StringRef> File;
+  Optional<unsigned> Line;
+  Optional<unsigned> Column;
+
+  for (yaml::KeyValueNode &ArgEntry : *ArgMap) {
+    StringRef KeyName;
+    if (Error E = parseKey(KeyName, ArgEntry))
+      return E;
+
+    // Try to parse debug locs.
+    if (KeyName == "DebugLoc") {
+      // Can't have multiple DebugLoc entries per argument.
+      if (File || Line || Column)
+        return make_error<ParseError>(
+            "only one DebugLoc entry is allowed per argument.", ArgEntry);
+
+      if (Error E = parseDebugLoc(File, Line, Column, ArgEntry))
+        return E;
+      continue;
+    }
+
+    // If we already have a string, error out.
+    if (!ValueStr.empty())
+      return make_error<ParseError>(
+          "only one string entry is allowed per argument.", ArgEntry);
+
+    // Try to parse a string.
+    if (Error E = parseValue(ValueStr, ArgEntry))
+      return E;
+
+    // Keep the key from the string.
+    KeyStr = KeyName;
+  }
+
+  if (KeyStr.empty())
+    return make_error<ParseError>("argument key is missing.", *ArgMap);
+  if (ValueStr.empty())
+    return make_error<ParseError>("argument value is missing.", *ArgMap);
+
+  Args.push_back(LLVMOptRemarkArg{
+      toOptRemarkStr(KeyStr), toOptRemarkStr(ValueStr),
+      LLVMOptRemarkDebugLoc{toOptRemarkStr(File.getValueOr(StringRef())),
+                            Line.getValueOr(0), Column.getValueOr(0)}});
+
+  return Error::success();
+}
+
+Error RemarkParser::parseYAMLElement(yaml::Document &Remark) {
+  // Parsing a new remark, clear the previous one.
+  LastRemark = None;
+  State = ParseState(TmpArgs);
+
+  auto *Root = dyn_cast<yaml::MappingNode>(Remark.getRoot());
+  if (!Root)
+    return make_error<ParseError>("document root is not of mapping type.",
+                                  *Remark.getRoot());
+
+  State.Type = Root->getRawTag();
+
+  for (yaml::KeyValueNode &RemarkField : *Root) {
+    StringRef KeyName;
+    if (Error E = parseKey(KeyName, RemarkField))
+      return E;
+
+    if (KeyName == "Pass") {
+      if (Error E = parseValue(State.Pass, RemarkField))
+        return E;
+    } else if (KeyName == "Name") {
+      if (Error E = parseValue(State.Name, RemarkField))
+        return E;
+    } else if (KeyName == "Function") {
+      if (Error E = parseValue(State.Function, RemarkField))
+        return E;
+    } else if (KeyName == "Hotness") {
+      if (Error E = parseValue(State.Hotness, RemarkField))
+        return E;
+    } else if (KeyName == "DebugLoc") {
+      if (Error E =
+              parseDebugLoc(State.File, State.Line, State.Column, RemarkField))
+        return E;
+    } else if (KeyName == "Args") {
+      auto *Args = dyn_cast<yaml::SequenceNode>(RemarkField.getValue());
+      if (!Args)
+        return make_error<ParseError>("wrong value type for key.", RemarkField);
+
+      for (yaml::Node &Arg : *Args)
+        if (Error E = parseArg(*State.Args, Arg))
+          return E;
+    } else {
+      return make_error<ParseError>("unknown key.", RemarkField);
+    }
+  }
+
+  // If the YAML parsing failed, don't even continue parsing. We might
+  // encounter malformed YAML.
+  if (Stream.failed())
+    return make_error<ParseError>("YAML parsing failed.", *Remark.getRoot());
+
+  // Check if any of the mandatory fields are missing.
+  if (State.Type.empty() || State.Pass.empty() || State.Name.empty() ||
+      State.Function.empty())
+    return make_error<ParseError>("Type, Pass, Name or Function missing.",
+                                  *Remark.getRoot());
+
+  LastRemark = LLVMOptRemarkEntry{
+      toOptRemarkStr(State.Type),
+      toOptRemarkStr(State.Pass),
+      toOptRemarkStr(State.Name),
+      toOptRemarkStr(State.Function),
+      LLVMOptRemarkDebugLoc{toOptRemarkStr(State.File.getValueOr(StringRef())),
+                            State.Line.getValueOr(0),
+                            State.Column.getValueOr(0)},
+      State.Hotness.getValueOr(0),
+      static_cast<uint32_t>(State.Args->size()),
+      State.Args->data()};
+
+  return Error::success();
+}
+} // namespace
+
+// Create wrappers for C Binding types (see CBindingWrapping.h).
+DEFINE_SIMPLE_CONVERSION_FUNCTIONS(RemarkParser, LLVMOptRemarkParserRef);
+
+extern "C" LLVMOptRemarkParserRef LLVMOptRemarkParserCreate(const void *Buf,
+                                                            uint64_t Size) {
+  return wrap(
+      new RemarkParser(StringRef(static_cast<const char *>(Buf), Size)));
+}
+
+extern "C" LLVMOptRemarkEntry *
+LLVMOptRemarkParserGetNext(LLVMOptRemarkParserRef Parser) {
+  RemarkParser &TheParser = *unwrap(Parser);
+  // Check for EOF.
+  if (TheParser.HadAnyErrors || TheParser.DI == TheParser.Stream.end())
+    return nullptr;
+
+  // Try to parse an entry.
+  if (Error E = TheParser.parseYAMLElement(*TheParser.DI)) {
+    handleAllErrors(std::move(E), [&](const ParseError &PE) {
+      TheParser.Stream.printError(&PE.getNode(),
+                                  Twine(PE.getMessage()) + Twine('\n'));
+      TheParser.HadAnyErrors = true;
+    });
+    return nullptr;
+  }
+
+  // Move on.
+  ++TheParser.DI;
+
+  // Return the just-parsed remark.
+  if (Optional<LLVMOptRemarkEntry> &Entry = TheParser.LastRemark)
+    return &*Entry;
+  return nullptr;
+}
+
+extern "C" LLVMBool LLVMOptRemarkParserHasError(LLVMOptRemarkParserRef Parser) {
+  return unwrap(Parser)->HadAnyErrors;
+}
+
+extern "C" const char *
+LLVMOptRemarkParserGetErrorMessage(LLVMOptRemarkParserRef Parser) {
+  return unwrap(Parser)->ErrorStream.str().c_str();
+}
+
+extern "C" void LLVMOptRemarkParserDispose(LLVMOptRemarkParserRef Parser) {
+  delete unwrap(Parser);
+}
diff --git a/tools/llvm-opt-report/CMakeLists.txt b/tools/llvm-opt-report/CMakeLists.txt
index 777537a54c0..3aabc03ab3f 100644
--- a/tools/llvm-opt-report/CMakeLists.txt
+++ b/tools/llvm-opt-report/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(LLVM_LINK_COMPONENTS Core Demangle Object Support)
+set(LLVM_LINK_COMPONENTS Core Demangle Object OptRemarks Support)
 
 add_llvm_tool(llvm-opt-report
   OptReport.cpp
diff --git a/tools/llvm-opt-report/OptReport.cpp b/tools/llvm-opt-report/OptReport.cpp
index aa7966132c2..071f779a9e6 100644
--- a/tools/llvm-opt-report/OptReport.cpp
+++ b/tools/llvm-opt-report/OptReport.cpp
@@ -28,6 +28,7 @@
 #include "llvm/Support/WithColor.h"
 #include "llvm/Support/YAMLTraits.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm-c/OptRemarks.h"
 #include <cstdlib>
 #include <map>
 #include <set>
@@ -142,104 +143,44 @@ typedef std::map<std::string, std::map<int, std::map<std::string, std::map<int,
           OptReportLocationInfo>>>> LocationInfoTy;
 } // anonymous namespace
 
-static void collectLocationInfo(yaml::Stream &Stream,
-                                LocationInfoTy &LocationInfo) {
-  SmallVector<char, 8> Tmp;
-
-  // Note: We're using the YAML parser here directly, instead of using the
-  // YAMLTraits implementation, because the YAMLTraits implementation does not
-  // support a way to handle only a subset of the input keys (it will error out
-  // if there is an input key that you don't map to your class), and
-  // furthermore, it does not provide a way to handle the Args sequence of
-  // key/value pairs, where the order must be captured and the 'String' key
-  // might be repeated.
-  for (auto &Doc : Stream) {
-    auto *Root = dyn_cast<yaml::MappingNode>(Doc.getRoot());
-    if (!Root)
-      continue;
+static bool readLocationInfo(LocationInfoTy &LocationInfo) {
+  ErrorOr<std::unique_ptr<MemoryBuffer>> Buf =
+      MemoryBuffer::getFile(InputFileName.c_str());
+  if (std::error_code EC = Buf.getError()) {
+    WithColor::error() << "Can't open file " << InputFileName << ": "
+                       << EC.message() << "\n";
+    return false;
+  }
 
-    bool Transformed = Root->getRawTag() == "!Passed";
-    std::string Pass, File, Function;
-    int Line = 0, Column = 1;
+  StringRef Buffer = (*Buf)->getBuffer();
+  LLVMOptRemarkParserRef Parser =
+      LLVMOptRemarkParserCreate(Buffer.data(), Buffer.size());
+
+  LLVMOptRemarkEntry *Remark = nullptr;
+  while ((Remark = LLVMOptRemarkParserGetNext(Parser))) {
+    bool Transformed =
+        StringRef(Remark->RemarkType.Str, Remark->RemarkType.Len) == "!Passed";
+    StringRef Pass(Remark->PassName.Str, Remark->PassName.Len);
+    StringRef File(Remark->DebugLoc.SourceFile.Str,
+                   Remark->DebugLoc.SourceFile.Len);
+    StringRef Function(Remark->FunctionName.Str, Remark->FunctionName.Len);
+    uint32_t Line = Remark->DebugLoc.SourceLineNumber;
+    uint32_t Column = Remark->DebugLoc.SourceColumnNumber;
+    ArrayRef<LLVMOptRemarkArg> Args(Remark->Args, Remark->NumArgs);
 
     int VectorizationFactor = 1;
     int InterleaveCount = 1;
     int UnrollCount = 1;
 
-    for (auto &RootChild : *Root) {
-      auto *Key = dyn_cast<yaml::ScalarNode>(RootChild.getKey());
-      if (!Key)
-        continue;
-      StringRef KeyName = Key->getValue(Tmp);
-      if (KeyName == "Pass") {
-        auto *Value = dyn_cast<yaml::ScalarNode>(RootChild.getValue());
-        if (!Value)
-          continue;
-        Pass = Value->getValue(Tmp);
-      } else if (KeyName == "Function") {
-        auto *Value = dyn_cast<yaml::ScalarNode>(RootChild.getValue());
-        if (!Value)
-          continue;
-        Function = Value->getValue(Tmp);
-      } else if (KeyName == "DebugLoc") {
-        auto *DebugLoc = dyn_cast<yaml::MappingNode>(RootChild.getValue());
-        if (!DebugLoc)
-          continue;
-
-        for (auto &DLChild : *DebugLoc) {
-          auto *DLKey = dyn_cast<yaml::ScalarNode>(DLChild.getKey());
-          if (!DLKey)
-            continue;
-          StringRef DLKeyName = DLKey->getValue(Tmp);
-          if (DLKeyName == "File") {
-            auto *Value = dyn_cast<yaml::ScalarNode>(DLChild.getValue());
-            if (!Value)
-              continue;
-            File = Value->getValue(Tmp);
-          } else if (DLKeyName == "Line") {
-            auto *Value = dyn_cast<yaml::ScalarNode>(DLChild.getValue());
-            if (!Value)
-              continue;
-            Value->getValue(Tmp).getAsInteger(10, Line);
-          } else if (DLKeyName == "Column") {
-            auto *Value = dyn_cast<yaml::ScalarNode>(DLChild.getValue());
-            if (!Value)
-              continue;
-            Value->getValue(Tmp).getAsInteger(10, Column);
-          }
-        }
-      } else if (KeyName == "Args") {
-        auto *Args = dyn_cast<yaml::SequenceNode>(RootChild.getValue());
-        if (!Args)
-          continue;
-        for (auto &ArgChild : *Args) {
-          auto *ArgMap = dyn_cast<yaml::MappingNode>(&ArgChild);
-          if (!ArgMap)
-            continue;
-          for (auto &ArgKV : *ArgMap) {
-            auto *ArgKey = dyn_cast<yaml::ScalarNode>(ArgKV.getKey());
-            if (!ArgKey)
-              continue;
-            StringRef ArgKeyName = ArgKey->getValue(Tmp);
-            if (ArgKeyName == "VectorizationFactor") {
-              auto *Value = dyn_cast<yaml::ScalarNode>(ArgKV.getValue());
-              if (!Value)
-                continue;
-              Value->getValue(Tmp).getAsInteger(10, VectorizationFactor);
-            } else if (ArgKeyName == "InterleaveCount") {
-              auto *Value = dyn_cast<yaml::ScalarNode>(ArgKV.getValue());
-              if (!Value)
-                continue;
-              Value->getValue(Tmp).getAsInteger(10, InterleaveCount);
-            } else if (ArgKeyName == "UnrollCount") {
-              auto *Value = dyn_cast<yaml::ScalarNode>(ArgKV.getValue());
-              if (!Value)
-                continue;
-              Value->getValue(Tmp).getAsInteger(10, UnrollCount);
-            }
-          }
-        }
-      }
+    for (const LLVMOptRemarkArg &Arg : Args) {
+      StringRef ArgKeyName(Arg.Key.Str, Arg.Key.Len);
+      StringRef ArgValue(Arg.Value.Str, Arg.Value.Len);
+      if (ArgKeyName == "VectorizationFactor")
+        ArgValue.getAsInteger(10, VectorizationFactor);
+      else if (ArgKeyName == "InterleaveCount")
+        ArgValue.getAsInteger(10, InterleaveCount);
+      else if (ArgKeyName == "UnrollCount")
+        ArgValue.getAsInteger(10, UnrollCount);
     }
 
     if (Line < 1 || File.empty())
@@ -268,22 +209,13 @@ static void collectLocationInfo(yaml::Stream &Stream,
       UpdateLLII(LI.Vectorized);
     }
   }
-}
-
-static bool readLocationInfo(LocationInfoTy &LocationInfo) {
-  ErrorOr<std::unique_ptr<MemoryBuffer>> Buf =
-      MemoryBuffer::getFileOrSTDIN(InputFileName);
-  if (std::error_code EC = Buf.getError()) {
-    WithColor::error() << "Can't open file " << InputFileName << ": "
-                       << EC.message() << "\n";
-    return false;
-  }
 
-  SourceMgr SM;
-  yaml::Stream Stream(Buf.get()->getBuffer(), SM);
-  collectLocationInfo(Stream, LocationInfo);
+  bool HasError = LLVMOptRemarkParserHasError(Parser);
+  if (HasError)
+    WithColor::error() << LLVMOptRemarkParserGetErrorMessage(Parser) << "\n";
 
-  return true;
+  LLVMOptRemarkParserDispose(Parser);
+  return !HasError;
 }
 
 static bool writeReport(LocationInfoTy &LocationInfo) {
diff --git a/unittests/CMakeLists.txt b/unittests/CMakeLists.txt
index bc41ab66a23..5dba2de4a88 100644
--- a/unittests/CMakeLists.txt
+++ b/unittests/CMakeLists.txt
@@ -26,6 +26,7 @@ add_subdirectory(MI)
 add_subdirectory(Object)
 add_subdirectory(ObjectYAML)
 add_subdirectory(Option)
+add_subdirectory(OptRemarks)
 add_subdirectory(Passes)
 add_subdirectory(ProfileData)
 add_subdirectory(Support)
diff --git a/unittests/OptRemarks/CMakeLists.txt b/unittests/OptRemarks/CMakeLists.txt
new file mode 100644
index 00000000000..94c74867cc4
--- /dev/null
+++ b/unittests/OptRemarks/CMakeLists.txt
@@ -0,0 +1,8 @@
+set(LLVM_LINK_COMPONENTS
+  OptRemarks
+  Support
+  )
+
+add_llvm_unittest(OptRemarksTests
+  OptRemarksParsingTest.cpp
+  )
diff --git a/unittests/OptRemarks/OptRemarksParsingTest.cpp b/unittests/OptRemarks/OptRemarksParsingTest.cpp
new file mode 100644
index 00000000000..a28820ffb7f
--- /dev/null
+++ b/unittests/OptRemarks/OptRemarksParsingTest.cpp
@@ -0,0 +1,433 @@
+//===- unittest/Support/OptRemarksParsingTest.cpp - OptTable tests --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-c/OptRemarks.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+template <size_t N> bool tryParse(const char (&Buf)[N]) {
+  LLVMOptRemarkParserRef Parser = LLVMOptRemarkParserCreate(Buf, N - 1);
+  LLVMOptRemarkEntry *Remark = nullptr;
+  while (LLVMOptRemarkEntry *NewRemark = LLVMOptRemarkParserGetNext(Parser)) {
+    EXPECT_TRUE(Remark == nullptr); // Only one remark per test.
+    Remark = NewRemark;
+  }
+  EXPECT_TRUE(Remark != nullptr); // We need *exactly* one remark per test.
+  bool HasError = LLVMOptRemarkParserHasError(Parser);
+  LLVMOptRemarkParserDispose(Parser);
+  return !HasError;
+}
+
+template <size_t N>
+bool parseExpectError(const char (&Buf)[N], const char *Error) {
+  LLVMOptRemarkParserRef Parser = LLVMOptRemarkParserCreate(Buf, N - 1);
+  LLVMOptRemarkEntry *Remark = nullptr;
+  while (LLVMOptRemarkEntry *NewRemark = LLVMOptRemarkParserGetNext(Parser)) {
+    EXPECT_FALSE(NewRemark);
+  }
+  EXPECT_TRUE(Remark == nullptr); // We are parsing only one malformed remark.
+  EXPECT_TRUE(LLVMOptRemarkParserHasError(Parser));
+  bool MatchesError =
+      StringRef(LLVMOptRemarkParserGetErrorMessage(Parser)).contains(Error);
+  LLVMOptRemarkParserDispose(Parser);
+
+  return MatchesError;
+}
+
+TEST(OptRemarks, OptRemarksParsingEmpty) {
+  StringRef Buf = R"YAML(
+)YAML";
+  LLVMOptRemarkParserRef Parser =
+      LLVMOptRemarkParserCreate(Buf.data(), Buf.size());
+  LLVMOptRemarkEntry *NewRemark = LLVMOptRemarkParserGetNext(Parser);
+  EXPECT_TRUE(NewRemark == nullptr); // No remark expected.
+  EXPECT_TRUE(LLVMOptRemarkParserHasError(Parser));
+  EXPECT_TRUE(StringRef(LLVMOptRemarkParserGetErrorMessage(Parser))
+                  .contains("document root is not of mapping type."));
+  LLVMOptRemarkParserDispose(Parser);
+}
+
+TEST(OptRemarks, OptRemarksParsingGood) {
+  EXPECT_TRUE(tryParse(R"YAML(
+--- !Missed
+Pass: inline
+Name: NoDefinition
+DebugLoc: { File: file.c, Line: 3, Column: 12 }
+Function: foo
+Args:
+  - Callee: bar
+  - String: ' will not be inlined into '
+  - Caller: foo
+    DebugLoc: { File: file.c, Line: 2, Column: 0 }
+  - String: ' because its definition is unavailable'
+)YAML"));
+
+  // No debug loc should also pass.
+  EXPECT_TRUE(tryParse(R"YAML(
+--- !Missed
+Pass: inline
+Name: NoDefinition
+Function: foo
+Args:
+  - Callee: bar
+  - String: ' will not be inlined into '
+  - Caller: foo
+    DebugLoc: { File: file.c, Line: 2, Column: 0 }
+  - String: ' because its definition is unavailable'
+)YAML"));
+
+  // No args is also ok.
+  EXPECT_TRUE(tryParse(R"YAML(
+--- !Missed
+Pass: inline
+Name: NoDefinition
+DebugLoc: { File: file.c, Line: 3, Column: 12 }
+Function: foo
+)YAML"));
+
+  // Different order.
+  EXPECT_TRUE(tryParse(R"YAML(
+--- !Missed
+DebugLoc: { Line: 3, Column: 12, File: file.c }
+Function: foo
+Name: NoDefinition
+Args:
+  - Callee: bar
+  - String: ' will not be inlined into '
+  - Caller: foo
+    DebugLoc: { File: file.c, Line: 2, Column: 0 }
+  - String: ' because its definition is unavailable'
+Pass: inline
+)YAML"));
+}
+
+// Mandatory common part of a remark.
+#define COMMON_REMARK "\nPass: inline\nName: NoDefinition\nFunction: foo\n"
+// Test all the types.
+TEST(OptRemarks, OptRemarksParsingTypes) {
+  // Type: Passed
+  EXPECT_TRUE(tryParse("--- !Passed" COMMON_REMARK));
+  // Type: Missed
+  EXPECT_TRUE(tryParse("--- !Missed" COMMON_REMARK));
+  // Type: Analysis
+  EXPECT_TRUE(tryParse("--- !Analysis" COMMON_REMARK));
+  // Type: AnalysisFPCompute
+  EXPECT_TRUE(tryParse("--- !AnalysisFPCompute" COMMON_REMARK));
+  // Type: AnalysisAliasing
+  EXPECT_TRUE(tryParse("--- !AnalysisAliasing" COMMON_REMARK));
+  // Type: Failure
+  EXPECT_TRUE(tryParse("--- !Failure" COMMON_REMARK));
+}
+#undef COMMON_REMARK
+
+TEST(OptRemarks, OptRemarksParsingMissingFields) {
+  // No type.
+  EXPECT_TRUE(parseExpectError(R"YAML(
+---
+Pass: inline
+Name: NoDefinition
+Function: foo
+)YAML",
+                               "error: Type, Pass, Name or Function missing."));
+  // No pass.
+  EXPECT_TRUE(parseExpectError(R"YAML(
+--- !Missed
+Name: NoDefinition
+Function: foo
+)YAML",
+                               "error: Type, Pass, Name or Function missing."));
+  // No name.
+  EXPECT_TRUE(parseExpectError(R"YAML(
+--- !Missed
+Pass: inline
+Function: foo
+)YAML",
+                               "error: Type, Pass, Name or Function missing."));
+  // No function.
+  EXPECT_TRUE(parseExpectError(R"YAML(
+--- !Missed
+Pass: inline
+Name: NoDefinition
+)YAML",
+                               "error: Type, Pass, Name or Function missing."));
+  // Debug loc but no file.
+  EXPECT_TRUE(parseExpectError(R"YAML(
+--- !Missed
+Pass: inline
+Name: NoDefinition
+Function: foo
+DebugLoc: { Line: 3, Column: 12 }
+)YAML",
+                               "DebugLoc node incomplete."));
+  // Debug loc but no line.
+  EXPECT_TRUE(parseExpectError(R"YAML(
+--- !Missed
+Pass: inline
+Name: NoDefinition
+Function: foo
+DebugLoc: { File: file.c, Column: 12 }
+)YAML",
+                               "DebugLoc node incomplete."));
+  // Debug loc but no column.
+  EXPECT_TRUE(parseExpectError(R"YAML(
+--- !Missed
+Pass: inline
+Name: NoDefinition
+Function: foo
+DebugLoc: { File: file.c, Line: 3 }
+)YAML",
+                               "DebugLoc node incomplete."));
+}
+
+TEST(OptRemarks, OptRemarksParsingWrongTypes) {
+  // Wrong debug loc type.
+  EXPECT_TRUE(parseExpectError(R"YAML(
+--- !Missed
+Pass: inline
+Name: NoDefinition
+Function: foo
+DebugLoc: foo
+)YAML",
+                               "expected a value of mapping type."));
+  // Wrong line type.
+  EXPECT_TRUE(parseExpectError(R"YAML(
+--- !Missed
+Pass: inline
+Name: NoDefinition
+Function: foo
+DebugLoc: { File: file.c, Line: b, Column: 12 }
+)YAML",
+                               "expected a value of integer type."));
+  // Wrong column type.
+  EXPECT_TRUE(parseExpectError(R"YAML(
+--- !Missed
+Pass: inline
+Name: NoDefinition
+Function: foo
+DebugLoc: { File: file.c, Line: 3, Column: c }
+)YAML",
+                               "expected a value of integer type."));
+  // Wrong args type.
+  EXPECT_TRUE(parseExpectError(R"YAML(
+--- !Missed
+Pass: inline
+Name: NoDefinition
+Function: foo
+Args: foo
+)YAML",
+                               "wrong value type for key."));
+  // Wrong key type.
+  EXPECT_TRUE(parseExpectError(R"YAML(
+--- !Missed
+{ A: a }: inline
+Name: NoDefinition
+Function: foo
+)YAML",
+                               "key is not a string."));
+  // Debug loc with unknown entry.
+  EXPECT_TRUE(parseExpectError(R"YAML(
+--- !Missed
+Pass: inline
+Name: NoDefinition
+Function: foo
+DebugLoc: { File: file.c, Column: 12, Unknown: 12 }
+)YAML",
+                               "unknown entry in DebugLoc map."));
+  // Unknown entry.
+  EXPECT_TRUE(parseExpectError(R"YAML(
+--- !Missed
+Unknown: inline
+)YAML",
+                               "unknown key."));
+  // Not a scalar.
+  EXPECT_TRUE(parseExpectError(R"YAML(
+--- !Missed
+Pass: { File: a, Line: 1, Column: 2 }
+Name: NoDefinition
+Function: foo
+)YAML",
+                               "expected a value of scalar type."));
+  // Not a string file in debug loc.
+  EXPECT_TRUE(parseExpectError(R"YAML(
+--- !Missed
+Pass: inline
+Name: NoDefinition
+Function: foo
+DebugLoc: { File: { a: b }, Column: 12, Line: 12 }
+)YAML",
+                               "expected a value of scalar type."));
+  // Not a integer column in debug loc.
+  EXPECT_TRUE(parseExpectError(R"YAML(
+--- !Missed
+Pass: inline
+Name: NoDefinition
+Function: foo
+DebugLoc: { File: file.c, Column: { a: b }, Line: 12 }
+)YAML",
+                               "expected a value of scalar type."));
+  // Not a integer line in debug loc.
+  EXPECT_TRUE(parseExpectError(R"YAML(
+--- !Missed
+Pass: inline
+Name: NoDefinition
+Function: foo
+DebugLoc: { File: file.c, Column: 12, Line: { a: b } }
+)YAML",
+                               "expected a value of scalar type."));
+  // Not a mapping type value for args.
+  EXPECT_TRUE(parseExpectError(R"YAML(
+--- !Missed
+Pass: inline
+Name: NoDefinition
+Function: foo
+DebugLoc: { File: file.c, Column: 12, Line: { a: b } }
+)YAML",
+                               "expected a value of scalar type."));
+}
+
+TEST(OptRemarks, OptRemarksParsingWrongArgs) {
+  // Multiple debug locs per arg.
+  EXPECT_TRUE(
+      parseExpectError(R"YAML(
+--- !Missed
+Pass: inline
+Name: NoDefinition
+Function: foo
+Args:
+  - Str: string
+    DebugLoc: { File: a, Line: 1, Column: 2 }
+    DebugLoc: { File: a, Line: 1, Column: 2 }
+)YAML",
+                       "only one DebugLoc entry is allowed per argument."));
+  // Multiple strings per arg.
+  EXPECT_TRUE(
+      parseExpectError(R"YAML(
+--- !Missed
+Pass: inline
+Name: NoDefinition
+Function: foo
+Args:
+  - Str: string
+    Str2: string
+    DebugLoc: { File: a, Line: 1, Column: 2 }
+)YAML",
+                       "only one string entry is allowed per argument."));
+  // No arg value.
+  EXPECT_TRUE(parseExpectError(R"YAML(
+--- !Missed
+Pass: inline
+Name: NoDefinition
+Function: foo
+Args:
+  - Callee: ''
+  - DebugLoc: { File: a, Line: 1, Column: 2 }
+)YAML",
+                               "argument value is missing."));
+  // No arg value.
+  EXPECT_TRUE(parseExpectError(R"YAML(
+--- !Missed
+Pass: inline
+Name: NoDefinition
+Function: foo
+Args:
+  - DebugLoc: { File: a, Line: 1, Column: 2 }
+)YAML",
+                               "argument key is missing."));
+
+}
+
+TEST(OptRemarks, OptRemarksGoodStruct) {
+  StringRef Buf = R"YAML(
+--- !Missed
+Pass: inline
+Name: NoDefinition
+DebugLoc: { File: file.c, Line: 3, Column: 12 }
+Function: foo
+Args:
+  - Callee: bar
+  - String: ' will not be inlined into '
+  - Caller: foo
+    DebugLoc: { File: file.c, Line: 2, Column: 0 }
+  - String: ' because its definition is unavailable'
+)YAML";
+
+  LLVMOptRemarkParserRef Parser =
+      LLVMOptRemarkParserCreate(Buf.data(), Buf.size());
+  LLVMOptRemarkEntry *Remark = LLVMOptRemarkParserGetNext(Parser);
+  EXPECT_FALSE(Remark == nullptr);
+  EXPECT_EQ(StringRef(Remark->RemarkType.Str, 7), "!Missed");
+  EXPECT_EQ(Remark->RemarkType.Len, 7U);
+  EXPECT_EQ(StringRef(Remark->PassName.Str, 6), "inline");
+  EXPECT_EQ(Remark->PassName.Len, 6U);
+  EXPECT_EQ(StringRef(Remark->RemarkName.Str, 12), "NoDefinition");
+  EXPECT_EQ(Remark->RemarkName.Len, 12U);
+  EXPECT_EQ(StringRef(Remark->FunctionName.Str, 3), "foo");
+  EXPECT_EQ(Remark->FunctionName.Len, 3U);
+  EXPECT_EQ(StringRef(Remark->DebugLoc.SourceFile.Str, 6), "file.c");
+  EXPECT_EQ(Remark->DebugLoc.SourceFile.Len, 6U);
+  EXPECT_EQ(Remark->DebugLoc.SourceLineNumber, 3U);
+  EXPECT_EQ(Remark->DebugLoc.SourceColumnNumber, 12U);
+  EXPECT_EQ(Remark->Hotness, 0U);
+  EXPECT_EQ(Remark->NumArgs, 4U);
+  // Arg 0
+  {
+    LLVMOptRemarkArg &Arg = Remark->Args[0];
+    EXPECT_EQ(StringRef(Arg.Key.Str, 6), "Callee");
+    EXPECT_EQ(Arg.Key.Len, 6U);
+    EXPECT_EQ(StringRef(Arg.Value.Str, 3), "bar");
+    EXPECT_EQ(Arg.Value.Len, 3U);
+    EXPECT_EQ(StringRef(Arg.DebugLoc.SourceFile.Str, 0), "");
+    EXPECT_EQ(Arg.DebugLoc.SourceFile.Len, 0U);
+    EXPECT_EQ(Arg.DebugLoc.SourceLineNumber, 0U);
+    EXPECT_EQ(Arg.DebugLoc.SourceColumnNumber, 0U);
+  }
+  // Arg 1
+  {
+    LLVMOptRemarkArg &Arg = Remark->Args[1];
+    EXPECT_EQ(StringRef(Arg.Key.Str, 6), "String");
+    EXPECT_EQ(Arg.Key.Len, 6U);
+    EXPECT_EQ(StringRef(Arg.Value.Str, 26), " will not be inlined into ");
+    EXPECT_EQ(Arg.Value.Len, 26U);
+    EXPECT_EQ(StringRef(Arg.DebugLoc.SourceFile.Str, 0), "");
+    EXPECT_EQ(Arg.DebugLoc.SourceFile.Len, 0U);
+    EXPECT_EQ(Arg.DebugLoc.SourceLineNumber, 0U);
+    EXPECT_EQ(Arg.DebugLoc.SourceColumnNumber, 0U);
+  }
+  // Arg 2
+  {
+    LLVMOptRemarkArg &Arg = Remark->Args[2];
+    EXPECT_EQ(StringRef(Arg.Key.Str, 6), "Caller");
+    EXPECT_EQ(Arg.Key.Len, 6U);
+    EXPECT_EQ(StringRef(Arg.Value.Str, 3), "foo");
+    EXPECT_EQ(Arg.Value.Len, 3U);
+    EXPECT_EQ(StringRef(Arg.DebugLoc.SourceFile.Str, 6), "file.c");
+    EXPECT_EQ(Arg.DebugLoc.SourceFile.Len, 6U);
+    EXPECT_EQ(Arg.DebugLoc.SourceLineNumber, 2U);
+    EXPECT_EQ(Arg.DebugLoc.SourceColumnNumber, 0U);
+  }
+  // Arg 3
+  {
+    LLVMOptRemarkArg &Arg = Remark->Args[3];
+    EXPECT_EQ(StringRef(Arg.Key.Str, 6), "String");
+    EXPECT_EQ(Arg.Key.Len, 6U);
+    EXPECT_EQ(StringRef(Arg.Value.Str, 38),
+              " because its definition is unavailable");
+    EXPECT_EQ(Arg.Value.Len, 38U);
+    EXPECT_EQ(StringRef(Arg.DebugLoc.SourceFile.Str, 0), "");
+    EXPECT_EQ(Arg.DebugLoc.SourceFile.Len, 0U);
+    EXPECT_EQ(Arg.DebugLoc.SourceLineNumber, 0U);
+    EXPECT_EQ(Arg.DebugLoc.SourceColumnNumber, 0U);
+  }
+
+  EXPECT_EQ(LLVMOptRemarkParserGetNext(Parser), nullptr);
+
+  EXPECT_FALSE(LLVMOptRemarkParserHasError(Parser));
+  LLVMOptRemarkParserDispose(Parser);
+}
-- 
GitLab


From f6b8b02db767ad21db1b66d25278c63a279554ed Mon Sep 17 00:00:00 2001
From: Volkan Keles <vkeles@apple.com>
Date: Wed, 10 Oct 2018 18:01:48 +0000
Subject: [PATCH 0012/1116] [GlobalISel] Fix the artifact combiner to fold
 G_IMPLICIT_DEF properly

Summary:
GlobalISel generates incorrect code because the legalizer artifact
combiner assumes `G_[SZ]EXT (G_IMPLICIT_DEF)` is equivalent to
`G_IMPLICIT_DEF `.

Replace `G_[SZ]EXT (G_IMPLICIT_DEF)` with 0 because the top bits
will be 0 for G_ZEXT and 0/1 for the G_SEXT.

Reviewers: aditya_nandakumar, dsanders, aemerson, javed.absar

Reviewed By: aditya_nandakumar

Subscribers: rovka, kristof.beyls, llvm-commits

Differential Revision: https://reviews.llvm.org/D52996

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344163 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../GlobalISel/LegalizationArtifactCombiner.h | 24 +++++++++----
 .../AArch64/GlobalISel/legalize-ext.mir       | 20 +++++++++--
 test/CodeGen/X86/GlobalISel/legalize-ext.mir  | 24 ++++++-------
 .../CodeGen/X86/GlobalISel/legalize-undef.mir | 36 +++++++++----------
 4 files changed, 65 insertions(+), 39 deletions(-)

diff --git a/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h b/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
index 873587651ef..256f1ccbee7 100644
--- a/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
+++ b/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
@@ -109,7 +109,7 @@ public:
     return tryFoldImplicitDef(MI, DeadInsts);
   }
 
-  /// Try to fold sb = EXTEND (G_IMPLICIT_DEF sa) -> sb = G_IMPLICIT_DEF
+  /// Try to fold G_[ASZ]EXT (G_IMPLICIT_DEF).
   bool tryFoldImplicitDef(MachineInstr &MI,
                           SmallVectorImpl<MachineInstr *> &DeadInsts) {
     unsigned Opcode = MI.getOpcode();
@@ -119,13 +119,25 @@ public:
 
     if (MachineInstr *DefMI = getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF,
                                            MI.getOperand(1).getReg(), MRI)) {
+      Builder.setInstr(MI);
       unsigned DstReg = MI.getOperand(0).getReg();
       LLT DstTy = MRI.getType(DstReg);
-      if (isInstUnsupported({TargetOpcode::G_IMPLICIT_DEF, {DstTy}}))
-        return false;
-      LLVM_DEBUG(dbgs() << ".. Combine EXT(IMPLICIT_DEF) " << MI;);
-      Builder.setInstr(MI);
-      Builder.buildInstr(TargetOpcode::G_IMPLICIT_DEF, DstReg);
+
+      if (Opcode == TargetOpcode::G_ANYEXT) {
+        // G_ANYEXT (G_IMPLICIT_DEF) -> G_IMPLICIT_DEF
+        if (isInstUnsupported({TargetOpcode::G_IMPLICIT_DEF, {DstTy}}))
+          return false;
+        LLVM_DEBUG(dbgs() << ".. Combine G_ANYEXT(G_IMPLICIT_DEF): " << MI;);
+        Builder.buildInstr(TargetOpcode::G_IMPLICIT_DEF, DstReg);
+      } else {
+        // G_[SZ]EXT (G_IMPLICIT_DEF) -> G_CONSTANT 0 because the top
+        // bits will be 0 for G_ZEXT and 0/1 for the G_SEXT.
+        if (isInstUnsupported({TargetOpcode::G_CONSTANT, {DstTy}}))
+          return false;
+        LLVM_DEBUG(dbgs() << ".. Combine G_[SZ]EXT(G_IMPLICIT_DEF): " << MI;);
+        Builder.buildConstant(DstReg, 0);
+      }
+
       markInstAndDefDead(MI, *DefMI, DeadInsts);
       return true;
     }
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-ext.mir b/test/CodeGen/AArch64/GlobalISel/legalize-ext.mir
index cf4f687408f..c4bcbb683c1 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-ext.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-ext.mir
@@ -82,9 +82,9 @@ body: |
     ; CHECK: $w0 = COPY [[ASHR2]](s32)
     ; CHECK: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
     ; CHECK: [[TRUNC10:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
-    ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[TRUNC3]]4(s32)
-    ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[TRUNC3]]1, [[TRUNC3]]2
-    ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[TRUNC3]]3(s32)
+    ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C6]](s32)
+    ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[TRUNC10]], [[COPY5]]
+    ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[AND3]](s32)
     ; CHECK: $w0 = COPY [[COPY6]](s32)
     ; CHECK: [[TRUNC11:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
     ; CHECK: $w0 = COPY [[TRUNC11]](s32)
@@ -92,6 +92,12 @@ body: |
     ; CHECK: $w0 = COPY [[TRUNC12]](s32)
     ; CHECK: [[FPEXT:%[0-9]+]]:_(s64) = G_FPEXT [[TRUNC12]](s32)
     ; CHECK: $x0 = COPY [[FPEXT]](s64)
+    ; CHECK: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK: $w0 = COPY [[C7]](s32)
+    ; CHECK: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK: $w0 = COPY [[C8]](s32)
+    ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+    ; CHECK: $w0 = COPY [[DEF]](s32)
     %0(s64) = COPY $x0
 
     %1(s1) = G_TRUNC %0
@@ -140,4 +146,12 @@ body: |
     $w0 = COPY %17
     %18(s64) = G_FPEXT %17
     $x0 = COPY %18
+
+    %24:_(s16) = G_IMPLICIT_DEF
+    %25:_(s32) = G_ZEXT %24(s16)
+    $w0 = COPY %25(s32)
+    %26:_(s32) = G_SEXT %24(s16)
+    $w0 = COPY %26(s32)
+    %27:_(s32) = G_ANYEXT %24(s16)
+    $w0 = COPY %27(s32)
 ...
diff --git a/test/CodeGen/X86/GlobalISel/legalize-ext.mir b/test/CodeGen/X86/GlobalISel/legalize-ext.mir
index cf9b8039096..71f1facfb81 100644
--- a/test/CodeGen/X86/GlobalISel/legalize-ext.mir
+++ b/test/CodeGen/X86/GlobalISel/legalize-ext.mir
@@ -288,12 +288,12 @@ body:             |
     liveins: $edi
 
     ; X32-LABEL: name: test_sext_i1toi8
-    ; X32: [[DEF:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
-    ; X32: $al = COPY [[DEF]](s8)
+    ; X32: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 0
+    ; X32: $al = COPY [[C]](s8)
     ; X32: RET 0, implicit $al
     ; X64-LABEL: name: test_sext_i1toi8
-    ; X64: [[DEF:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
-    ; X64: $al = COPY [[DEF]](s8)
+    ; X64: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 0
+    ; X64: $al = COPY [[C]](s8)
     ; X64: RET 0, implicit $al
     %0(s1) = G_IMPLICIT_DEF
     %1(s8) = G_SEXT %0(s1)
@@ -314,12 +314,12 @@ body:             |
     liveins: $edi
 
     ; X32-LABEL: name: test_sext_i1toi16
-    ; X32: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
-    ; X32: $ax = COPY [[DEF]](s16)
+    ; X32: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 0
+    ; X32: $ax = COPY [[C]](s16)
     ; X32: RET 0, implicit $ax
     ; X64-LABEL: name: test_sext_i1toi16
-    ; X64: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
-    ; X64: $ax = COPY [[DEF]](s16)
+    ; X64: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 0
+    ; X64: $ax = COPY [[C]](s16)
     ; X64: RET 0, implicit $ax
     %0(s1) = G_IMPLICIT_DEF
     %1(s16) = G_SEXT %0(s1)
@@ -341,12 +341,12 @@ body:             |
     liveins: $edi
 
     ; X32-LABEL: name: test_sext_i1
-    ; X32: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-    ; X32: $eax = COPY [[DEF]](s32)
+    ; X32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; X32: $eax = COPY [[C]](s32)
     ; X32: RET 0, implicit $eax
     ; X64-LABEL: name: test_sext_i1
-    ; X64: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-    ; X64: $eax = COPY [[DEF]](s32)
+    ; X64: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; X64: $eax = COPY [[C]](s32)
     ; X64: RET 0, implicit $eax
     %0(s1) = G_IMPLICIT_DEF
     %2(s32) = G_SEXT %0(s1)
diff --git a/test/CodeGen/X86/GlobalISel/legalize-undef.mir b/test/CodeGen/X86/GlobalISel/legalize-undef.mir
index 997064b366d..4a865e4e582 100644
--- a/test/CodeGen/X86/GlobalISel/legalize-undef.mir
+++ b/test/CodeGen/X86/GlobalISel/legalize-undef.mir
@@ -11,32 +11,32 @@ body: |
     liveins:
     ; X64-LABEL: name: test_implicit_def
     ; X64: [[DEF:%[0-9]+]]:_(p0) = G_IMPLICIT_DEF
+    ; X64: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 0
+    ; X64: G_STORE [[C]](s8), [[DEF]](p0) :: (store 1)
     ; X64: [[DEF1:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
     ; X64: G_STORE [[DEF1]](s8), [[DEF]](p0) :: (store 1)
-    ; X64: [[DEF2:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
-    ; X64: G_STORE [[DEF2]](s8), [[DEF]](p0) :: (store 1)
-    ; X64: [[DEF3:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
-    ; X64: G_STORE [[DEF3]](s16), [[DEF]](p0) :: (store 2)
-    ; X64: [[DEF4:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-    ; X64: G_STORE [[DEF4]](s32), [[DEF]](p0) :: (store 4)
-    ; X64: [[DEF5:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF
-    ; X64: G_STORE [[DEF5]](s64), [[DEF]](p0) :: (store 8)
+    ; X64: [[DEF2:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+    ; X64: G_STORE [[DEF2]](s16), [[DEF]](p0) :: (store 2)
+    ; X64: [[DEF3:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+    ; X64: G_STORE [[DEF3]](s32), [[DEF]](p0) :: (store 4)
+    ; X64: [[DEF4:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF
+    ; X64: G_STORE [[DEF4]](s64), [[DEF]](p0) :: (store 8)
     ; X32-LABEL: name: test_implicit_def
     ; X32: [[DEF:%[0-9]+]]:_(p0) = G_IMPLICIT_DEF
+    ; X32: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 0
+    ; X32: G_STORE [[C]](s8), [[DEF]](p0) :: (store 1)
     ; X32: [[DEF1:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
     ; X32: G_STORE [[DEF1]](s8), [[DEF]](p0) :: (store 1)
-    ; X32: [[DEF2:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
-    ; X32: G_STORE [[DEF2]](s8), [[DEF]](p0) :: (store 1)
-    ; X32: [[DEF3:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
-    ; X32: G_STORE [[DEF3]](s16), [[DEF]](p0) :: (store 2)
+    ; X32: [[DEF2:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+    ; X32: G_STORE [[DEF2]](s16), [[DEF]](p0) :: (store 2)
+    ; X32: [[DEF3:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+    ; X32: G_STORE [[DEF3]](s32), [[DEF]](p0) :: (store 4)
     ; X32: [[DEF4:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-    ; X32: G_STORE [[DEF4]](s32), [[DEF]](p0) :: (store 4)
     ; X32: [[DEF5:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-    ; X32: [[DEF6:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-    ; X32: G_STORE [[DEF5]](s32), [[DEF]](p0) :: (store 4, align 8)
-    ; X32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; X32: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[DEF]], [[C]](s32)
-    ; X32: G_STORE [[DEF6]](s32), [[GEP]](p0) :: (store 4)
+    ; X32: G_STORE [[DEF4]](s32), [[DEF]](p0) :: (store 4, align 8)
+    ; X32: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; X32: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[DEF]], [[C1]](s32)
+    ; X32: G_STORE [[DEF5]](s32), [[GEP]](p0) :: (store 4)
     %5:_(p0) = G_IMPLICIT_DEF
     %0:_(s1) = G_IMPLICIT_DEF
     G_STORE %0, %5 ::(store 1)
-- 
GitLab


From b501cdb9f5588ae98681dd5d8cc6ccc22ad40cb4 Mon Sep 17 00:00:00 2001
From: Francis Visoiu Mistrih <francisvm@yahoo.com>
Date: Wed, 10 Oct 2018 18:07:44 +0000
Subject: [PATCH 0013/1116] Revert "[OptRemarks] Add library for parsing
 optimization remarks"

This reverts commit 1cc98e6672b6319fdb00b70dd4474aabdadbe193.

Seems to break bots: http://lab.llvm.org:8011/builders/clang-x86_64-linux-abi-test/builds/33398/steps/build-unified-tree/logs/stdio

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344164 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm-c/OptRemarks.h                   | 197 --------
 lib/CMakeLists.txt                            |   1 -
 lib/LLVMBuild.txt                             |   1 -
 lib/OptRemarks/CMakeLists.txt                 |   3 -
 lib/OptRemarks/LLVMBuild.txt                  |  22 -
 lib/OptRemarks/OptRemarksParser.cpp           | 368 ---------------
 tools/llvm-opt-report/CMakeLists.txt          |   2 +-
 tools/llvm-opt-report/OptReport.cpp           | 144 ++++--
 unittests/CMakeLists.txt                      |   1 -
 unittests/OptRemarks/CMakeLists.txt           |   8 -
 .../OptRemarks/OptRemarksParsingTest.cpp      | 433 ------------------
 11 files changed, 107 insertions(+), 1073 deletions(-)
 delete mode 100644 include/llvm-c/OptRemarks.h
 delete mode 100644 lib/OptRemarks/CMakeLists.txt
 delete mode 100644 lib/OptRemarks/LLVMBuild.txt
 delete mode 100644 lib/OptRemarks/OptRemarksParser.cpp
 delete mode 100644 unittests/OptRemarks/CMakeLists.txt
 delete mode 100644 unittests/OptRemarks/OptRemarksParsingTest.cpp

diff --git a/include/llvm-c/OptRemarks.h b/include/llvm-c/OptRemarks.h
deleted file mode 100644
index f3449cc1b8c..00000000000
--- a/include/llvm-c/OptRemarks.h
+++ /dev/null
@@ -1,197 +0,0 @@
-/*===-- llvm-c/OptRemarks.h - OptRemarks Public C Interface -------*- C -*-===*\
-|*                                                                            *|
-|*                     The LLVM Compiler Infrastructure                       *|
-|*                                                                            *|
-|* This file is distributed under the University of Illinois Open Source      *|
-|* License. See LICENSE.TXT for details.                                      *|
-|*                                                                            *|
-|*===----------------------------------------------------------------------===*|
-|*                                                                            *|
-|* This header provides a public interface to an opt-remark library.          *|
-|* LLVM provides an implementation of this interface.                         *|
-|*                                                                            *|
-\*===----------------------------------------------------------------------===*/
-
-#ifndef LLVM_C_OPT_REMARKS_H
-#define LLVM_C_OPT_REMARKS_H
-
-#include "llvm-c/Core.h"
-#include "llvm-c/Types.h"
-#ifdef __cplusplus
-#include <cstddef>
-extern "C" {
-#else
-#include <stddef.h>
-#endif /* !defined(__cplusplus) */
-
-/**
- * @defgroup LLVMCOPTREMARKS OptRemarks
- * @ingroup LLVMC
- *
- * @{
- */
-
-#define OPT_REMARKS_API_VERSION 0
-
-/**
- * String containing a buffer and a length. The buffer is not guaranteed to be
- * zero-terminated.
- *
- * \since OPT_REMARKS_API_VERSION=0
- */
-typedef struct {
-  const char *Str;
-  uint32_t Len;
-} LLVMOptRemarkStringRef;
-
-/**
- * DebugLoc containing File, Line and Column.
- *
- * \since OPT_REMARKS_API_VERSION=0
- */
-typedef struct {
-  // File:
-  LLVMOptRemarkStringRef SourceFile;
-  // Line:
-  uint32_t SourceLineNumber;
-  // Column:
-  uint32_t SourceColumnNumber;
-} LLVMOptRemarkDebugLoc;
-
-/**
- * Element of the "Args" list. The key might give more information about what
- * are the semantics of the value, e.g. "Callee" will tell you that the value
- * is a symbol that names a function.
- *
- * \since OPT_REMARKS_API_VERSION=0
- */
-typedef struct {
-  // e.g. "Callee"
-  LLVMOptRemarkStringRef Key;
-  // e.g. "malloc"
-  LLVMOptRemarkStringRef Value;
-
-  // "DebugLoc": Optional
-  LLVMOptRemarkDebugLoc DebugLoc;
-} LLVMOptRemarkArg;
-
-/**
- * One remark entry.
- *
- * \since OPT_REMARKS_API_VERSION=0
- */
-typedef struct {
-  // e.g. !Missed, !Passed
-  LLVMOptRemarkStringRef RemarkType;
-  // "Pass": Required
-  LLVMOptRemarkStringRef PassName;
-  // "Name": Required
-  LLVMOptRemarkStringRef RemarkName;
-  // "Function": Required
-  LLVMOptRemarkStringRef FunctionName;
-
-  // "DebugLoc": Optional
-  LLVMOptRemarkDebugLoc DebugLoc;
-  // "Hotness": Optional
-  uint32_t Hotness;
-  // "Args": Optional. It is an array of `num_args` elements.
-  uint32_t NumArgs;
-  LLVMOptRemarkArg *Args;
-} LLVMOptRemarkEntry;
-
-typedef struct LLVMOptRemarkOpaqueParser *LLVMOptRemarkParserRef;
-
-/**
- * Creates a remark parser that can be used to read and parse the buffer located
- * in \p Buf of size \p Size.
- *
- * \p Buf cannot be NULL.
- *
- * This function should be paired with LLVMOptRemarkParserDispose() to avoid
- * leaking resources.
- *
- * \since OPT_REMARKS_API_VERSION=0
- */
-extern LLVMOptRemarkParserRef LLVMOptRemarkParserCreate(const void *Buf,
-                                                        uint64_t Size);
-
-/**
- * Returns the next remark in the file.
- *
- * The value pointed to by the return value is invalidated by the next call to
- * LLVMOptRemarkParserGetNext().
- *
- * If the parser reaches the end of the buffer, the return value will be NULL.
- *
- * In the case of an error, the return value will be NULL, and:
- *
- * 1) LLVMOptRemarkParserHasError() will return `1`.
- *
- * 2) LLVMOptRemarkParserGetErrorMessage() will return a descriptive error
- *    message.
- *
- * An error may occur if:
- *
- * 1) An argument is invalid.
- *
- * 2) There is a YAML parsing error. This type of error aborts parsing
- *    immediately and returns `1`. It can occur on malformed YAML.
- *
- * 3) Remark parsing error. If this type of error occurs, the parser won't call
- *    the handler and will continue to the next one. It can occur on malformed
- *    remarks, like missing or extra fields in the file.
- *
- * Here is a quick example of the usage:
- *
- * ```
- *  LLVMOptRemarkParserRef Parser = LLVMOptRemarkParserCreate(Buf, Size);
- *  LLVMOptRemarkEntry *Remark = NULL;
- *  while ((Remark == LLVMOptRemarkParserGetNext(Parser))) {
- *    // use Remark
- *  }
- *  bool HasError = LLVMOptRemarkParserHasError(Parser);
- *  LLVMOptRemarkParserDispose(Parser);
- * ```
- *
- * \since OPT_REMARKS_API_VERSION=0
- */
-extern LLVMOptRemarkEntry *
-LLVMOptRemarkParserGetNext(LLVMOptRemarkParserRef Parser);
-
-/**
- * Returns `1` if the parser encountered an error while parsing the buffer.
- *
- * \since OPT_REMARKS_API_VERSION=0
- */
-extern LLVMBool LLVMOptRemarkParserHasError(LLVMOptRemarkParserRef Parser);
-
-/**
- * Returns a null-terminated string containing an error message.
- *
- * In case of no error, the result is `NULL`.
- *
- * The memory of the string is bound to the lifetime of \p Parser. If
- * LLVMOptRemarkParserDispose() is called, the memory of the string will be
- * released.
- *
- * \since OPT_REMARKS_API_VERSION=0
- */
-extern const char *
-LLVMOptRemarkParserGetErrorMessage(LLVMOptRemarkParserRef Parser);
-
-/**
- * Releases all the resources used by \p Parser.
- *
- * \since OPT_REMARKS_API_VERSION=0
- */
-extern void LLVMOptRemarkParserDispose(LLVMOptRemarkParserRef Parser);
-
-/**
- * @} // endgoup LLVMCOPTREMARKS
- */
-
-#ifdef __cplusplus
-}
-#endif /* !defined(__cplusplus) */
-
-#endif /* LLVM_C_OPT_REMARKS_H */
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 1f54c611bad..ecf8b93d253 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -15,7 +15,6 @@ add_subdirectory(MC)
 add_subdirectory(Object)
 add_subdirectory(ObjectYAML)
 add_subdirectory(Option)
-add_subdirectory(OptRemarks)
 add_subdirectory(DebugInfo)
 add_subdirectory(ExecutionEngine)
 add_subdirectory(Target)
diff --git a/lib/LLVMBuild.txt b/lib/LLVMBuild.txt
index 0eb4bba2676..a6cd15699fb 100644
--- a/lib/LLVMBuild.txt
+++ b/lib/LLVMBuild.txt
@@ -35,7 +35,6 @@ subdirectories =
  BinaryFormat
  ObjectYAML
  Option
- OptRemarks
  Passes
  ProfileData
  Support
diff --git a/lib/OptRemarks/CMakeLists.txt b/lib/OptRemarks/CMakeLists.txt
deleted file mode 100644
index 8fefe1d986b..00000000000
--- a/lib/OptRemarks/CMakeLists.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-add_llvm_library(LLVMOptRemarks
-  OptRemarksParser.cpp
-)
diff --git a/lib/OptRemarks/LLVMBuild.txt b/lib/OptRemarks/LLVMBuild.txt
deleted file mode 100644
index 4c1032296dc..00000000000
--- a/lib/OptRemarks/LLVMBuild.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-;===- ./lib/OptRemarks/LLVMBuild.txt ---------------------------*- Conf -*--===;
-;
-;                     The LLVM Compiler Infrastructure
-;
-; This file is distributed under the University of Illinois Open Source
-; License. See LICENSE.TXT for details.
-;
-;===------------------------------------------------------------------------===;
-;
-; This is an LLVMBuild description file for the components in this subdirectory.
-;
-; For more information on the LLVMBuild system, please see:
-;
-;   http://llvm.org/docs/LLVMBuild.html
-;
-;===------------------------------------------------------------------------===;
-
-[component_0]
-type = Library
-name = OptRemarks
-parent = Libraries
-required_libraries = Support
diff --git a/lib/OptRemarks/OptRemarksParser.cpp b/lib/OptRemarks/OptRemarksParser.cpp
deleted file mode 100644
index 4b8b038c832..00000000000
--- a/lib/OptRemarks/OptRemarksParser.cpp
+++ /dev/null
@@ -1,368 +0,0 @@
-//===- OptRemarksParser.cpp -----------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file provides utility methods used by clients that want to use the
-// parser for optimization remarks in LLVM.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm-c/OptRemarks.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/YAMLTraits.h"
-
-using namespace llvm;
-
-namespace {
-struct RemarkParser {
-  /// Source manager for better error messages.
-  SourceMgr SM;
-  /// Stream for yaml parsing.
-  yaml::Stream Stream;
-  /// Storage for the error stream.
-  std::string ErrorString;
-  /// The error stream.
-  raw_string_ostream ErrorStream;
-  /// Iterator in the YAML stream.
-  yaml::document_iterator DI;
-  /// The parsed remark (if any).
-  Optional<LLVMOptRemarkEntry> LastRemark;
-  /// Temporary parsing buffer for the arguments.
-  SmallVector<LLVMOptRemarkArg, 8> TmpArgs;
-  /// The state used by the parser to parse a remark entry. Invalidated with
-  /// every call to `parseYAMLElement`.
-  struct ParseState {
-    /// Temporary parsing buffer for the arguments.
-    SmallVectorImpl<LLVMOptRemarkArg> *Args;
-    StringRef Type;
-    StringRef Pass;
-    StringRef Name;
-    StringRef Function;
-    /// Optional.
-    Optional<StringRef> File;
-    Optional<unsigned> Line;
-    Optional<unsigned> Column;
-    Optional<unsigned> Hotness;
-
-    ParseState(SmallVectorImpl<LLVMOptRemarkArg> &Args) : Args(&Args) {}
-    /// Use Args only as a **temporary** buffer.
-    ~ParseState() { Args->clear(); }
-  };
-
-  ParseState State;
-
-  /// Set to `true` if we had any errors during parsing.
-  bool HadAnyErrors = false;
-
-  RemarkParser(StringRef Buf)
-      : SM(), Stream(Buf, SM), ErrorString(), ErrorStream(ErrorString),
-        DI(Stream.begin()), LastRemark(), TmpArgs(), State(TmpArgs) {
-    SM.setDiagHandler(RemarkParser::HandleDiagnostic, this);
-  }
-
-  /// Parse a YAML element.
-  Error parseYAMLElement(yaml::Document &Remark);
-
-private:
-  /// Parse one key to a string.
-  /// otherwise.
-  Error parseKey(StringRef &Result, yaml::KeyValueNode &Node);
-  /// Parse one value to a string.
-  Error parseValue(StringRef &Result, yaml::KeyValueNode &Node);
-  /// Parse one value to an unsigned.
-  Error parseValue(Optional<unsigned> &Result, yaml::KeyValueNode &Node);
-  /// Parse a debug location.
-  Error parseDebugLoc(Optional<StringRef> &File, Optional<unsigned> &Line,
-                      Optional<unsigned> &Column, yaml::KeyValueNode &Node);
-  /// Parse an argument.
-  Error parseArg(SmallVectorImpl<LLVMOptRemarkArg> &TmpArgs, yaml::Node &Node);
-
-  /// Handle a diagnostic from the YAML stream. Records the error in the
-  /// RemarkParser class.
-  static void HandleDiagnostic(const SMDiagnostic &Diag, void *Ctx) {
-    assert(Ctx && "Expected non-null Ctx in diagnostic handler.");
-    auto *Parser = static_cast<RemarkParser *>(Ctx);
-    Diag.print(/*ProgName=*/nullptr, Parser->ErrorStream, /*ShowColors*/ false,
-               /*ShowKindLabels*/ true);
-  }
-};
-
-class ParseError : public ErrorInfo<ParseError> {
-public:
-  static char ID;
-
-  ParseError(StringRef Message, yaml::Node &Node)
-      : Message(Message), Node(Node) {}
-
-  void log(raw_ostream &OS) const override { OS << Message; }
-  std::error_code convertToErrorCode() const override {
-    return inconvertibleErrorCode();
-  }
-
-  StringRef getMessage() const { return Message; }
-  yaml::Node &getNode() const { return Node; }
-
-private:
-  StringRef Message; // No need to hold a full copy of the buffer.
-  yaml::Node &Node;
-};
-
-char ParseError::ID = 0;
-
-static LLVMOptRemarkStringRef toOptRemarkStr(StringRef Str) {
-  return {Str.data(), static_cast<uint32_t>(Str.size())};
-}
-
-Error RemarkParser::parseKey(StringRef &Result, yaml::KeyValueNode &Node) {
-  auto *Key = dyn_cast<yaml::ScalarNode>(Node.getKey());
-  if (!Key)
-    return make_error<ParseError>("key is not a string.", Node);
-
-  Result = Key->getRawValue();
-  return Error::success();
-}
-
-Error RemarkParser::parseValue(StringRef &Result, yaml::KeyValueNode &Node) {
-  auto *Value = dyn_cast<yaml::ScalarNode>(Node.getValue());
-  if (!Value)
-    return make_error<ParseError>("expected a value of scalar type.", Node);
-  Result = Value->getRawValue();
-
-  if (Result.front() == '\'')
-    Result = Result.drop_front();
-
-  if (Result.back() == '\'')
-    Result = Result.drop_back();
-
-  return Error::success();
-}
-
-Error RemarkParser::parseValue(Optional<unsigned> &Result,
-                               yaml::KeyValueNode &Node) {
-  SmallVector<char, 4> Tmp;
-  auto *Value = dyn_cast<yaml::ScalarNode>(Node.getValue());
-  if (!Value)
-    return make_error<ParseError>("expected a value of scalar type.", Node);
-  unsigned UnsignedValue = 0;
-  if (Value->getValue(Tmp).getAsInteger(10, UnsignedValue))
-    return make_error<ParseError>("expected a value of integer type.", *Value);
-  Result = UnsignedValue;
-  return Error::success();
-}
-
-Error RemarkParser::parseDebugLoc(Optional<StringRef> &File,
-                                  Optional<unsigned> &Line,
-                                  Optional<unsigned> &Column,
-                                  yaml::KeyValueNode &Node) {
-  auto *DebugLoc = dyn_cast<yaml::MappingNode>(Node.getValue());
-  if (!DebugLoc)
-    return make_error<ParseError>("expected a value of mapping type.", Node);
-
-  for (yaml::KeyValueNode &DLNode : *DebugLoc) {
-    StringRef KeyName;
-    if (Error E = parseKey(KeyName, DLNode))
-      return E;
-    if (KeyName == "File") {
-      File = StringRef(); // Set the optional to contain a default constructed
-                          // value, to be passed to the parsing function.
-      if (Error E = parseValue(*File, DLNode))
-        return E;
-    } else if (KeyName == "Column") {
-      if (Error E = parseValue(Column, DLNode))
-        return E;
-    } else if (KeyName == "Line") {
-      if (Error E = parseValue(Line, DLNode))
-        return E;
-    } else {
-      return make_error<ParseError>("unknown entry in DebugLoc map.", DLNode);
-    }
-  }
-
-  // If any of the debug loc fields is missing, return an error.
-  if (!File || !Line || !Column)
-    return make_error<ParseError>("DebugLoc node incomplete.", Node);
-
-  return Error::success();
-}
-
-Error RemarkParser::parseArg(SmallVectorImpl<LLVMOptRemarkArg> &Args,
-                             yaml::Node &Node) {
-  auto *ArgMap = dyn_cast<yaml::MappingNode>(&Node);
-  if (!ArgMap)
-    return make_error<ParseError>("expected a value of mapping type.", Node);
-
-  StringRef ValueStr;
-  StringRef KeyStr;
-  Optional<StringRef> File;
-  Optional<unsigned> Line;
-  Optional<unsigned> Column;
-
-  for (yaml::KeyValueNode &ArgEntry : *ArgMap) {
-    StringRef KeyName;
-    if (Error E = parseKey(KeyName, ArgEntry))
-      return E;
-
-    // Try to parse debug locs.
-    if (KeyName == "DebugLoc") {
-      // Can't have multiple DebugLoc entries per argument.
-      if (File || Line || Column)
-        return make_error<ParseError>(
-            "only one DebugLoc entry is allowed per argument.", ArgEntry);
-
-      if (Error E = parseDebugLoc(File, Line, Column, ArgEntry))
-        return E;
-      continue;
-    }
-
-    // If we already have a string, error out.
-    if (!ValueStr.empty())
-      return make_error<ParseError>(
-          "only one string entry is allowed per argument.", ArgEntry);
-
-    // Try to parse a string.
-    if (Error E = parseValue(ValueStr, ArgEntry))
-      return E;
-
-    // Keep the key from the string.
-    KeyStr = KeyName;
-  }
-
-  if (KeyStr.empty())
-    return make_error<ParseError>("argument key is missing.", *ArgMap);
-  if (ValueStr.empty())
-    return make_error<ParseError>("argument value is missing.", *ArgMap);
-
-  Args.push_back(LLVMOptRemarkArg{
-      toOptRemarkStr(KeyStr), toOptRemarkStr(ValueStr),
-      LLVMOptRemarkDebugLoc{toOptRemarkStr(File.getValueOr(StringRef())),
-                            Line.getValueOr(0), Column.getValueOr(0)}});
-
-  return Error::success();
-}
-
-Error RemarkParser::parseYAMLElement(yaml::Document &Remark) {
-  // Parsing a new remark, clear the previous one.
-  LastRemark = None;
-  State = ParseState(TmpArgs);
-
-  auto *Root = dyn_cast<yaml::MappingNode>(Remark.getRoot());
-  if (!Root)
-    return make_error<ParseError>("document root is not of mapping type.",
-                                  *Remark.getRoot());
-
-  State.Type = Root->getRawTag();
-
-  for (yaml::KeyValueNode &RemarkField : *Root) {
-    StringRef KeyName;
-    if (Error E = parseKey(KeyName, RemarkField))
-      return E;
-
-    if (KeyName == "Pass") {
-      if (Error E = parseValue(State.Pass, RemarkField))
-        return E;
-    } else if (KeyName == "Name") {
-      if (Error E = parseValue(State.Name, RemarkField))
-        return E;
-    } else if (KeyName == "Function") {
-      if (Error E = parseValue(State.Function, RemarkField))
-        return E;
-    } else if (KeyName == "Hotness") {
-      if (Error E = parseValue(State.Hotness, RemarkField))
-        return E;
-    } else if (KeyName == "DebugLoc") {
-      if (Error E =
-              parseDebugLoc(State.File, State.Line, State.Column, RemarkField))
-        return E;
-    } else if (KeyName == "Args") {
-      auto *Args = dyn_cast<yaml::SequenceNode>(RemarkField.getValue());
-      if (!Args)
-        return make_error<ParseError>("wrong value type for key.", RemarkField);
-
-      for (yaml::Node &Arg : *Args)
-        if (Error E = parseArg(*State.Args, Arg))
-          return E;
-    } else {
-      return make_error<ParseError>("unknown key.", RemarkField);
-    }
-  }
-
-  // If the YAML parsing failed, don't even continue parsing. We might
-  // encounter malformed YAML.
-  if (Stream.failed())
-    return make_error<ParseError>("YAML parsing failed.", *Remark.getRoot());
-
-  // Check if any of the mandatory fields are missing.
-  if (State.Type.empty() || State.Pass.empty() || State.Name.empty() ||
-      State.Function.empty())
-    return make_error<ParseError>("Type, Pass, Name or Function missing.",
-                                  *Remark.getRoot());
-
-  LastRemark = LLVMOptRemarkEntry{
-      toOptRemarkStr(State.Type),
-      toOptRemarkStr(State.Pass),
-      toOptRemarkStr(State.Name),
-      toOptRemarkStr(State.Function),
-      LLVMOptRemarkDebugLoc{toOptRemarkStr(State.File.getValueOr(StringRef())),
-                            State.Line.getValueOr(0),
-                            State.Column.getValueOr(0)},
-      State.Hotness.getValueOr(0),
-      static_cast<uint32_t>(State.Args->size()),
-      State.Args->data()};
-
-  return Error::success();
-}
-} // namespace
-
-// Create wrappers for C Binding types (see CBindingWrapping.h).
-DEFINE_SIMPLE_CONVERSION_FUNCTIONS(RemarkParser, LLVMOptRemarkParserRef);
-
-extern "C" LLVMOptRemarkParserRef LLVMOptRemarkParserCreate(const void *Buf,
-                                                            uint64_t Size) {
-  return wrap(
-      new RemarkParser(StringRef(static_cast<const char *>(Buf), Size)));
-}
-
-extern "C" LLVMOptRemarkEntry *
-LLVMOptRemarkParserGetNext(LLVMOptRemarkParserRef Parser) {
-  RemarkParser &TheParser = *unwrap(Parser);
-  // Check for EOF.
-  if (TheParser.HadAnyErrors || TheParser.DI == TheParser.Stream.end())
-    return nullptr;
-
-  // Try to parse an entry.
-  if (Error E = TheParser.parseYAMLElement(*TheParser.DI)) {
-    handleAllErrors(std::move(E), [&](const ParseError &PE) {
-      TheParser.Stream.printError(&PE.getNode(),
-                                  Twine(PE.getMessage()) + Twine('\n'));
-      TheParser.HadAnyErrors = true;
-    });
-    return nullptr;
-  }
-
-  // Move on.
-  ++TheParser.DI;
-
-  // Return the just-parsed remark.
-  if (Optional<LLVMOptRemarkEntry> &Entry = TheParser.LastRemark)
-    return &*Entry;
-  return nullptr;
-}
-
-extern "C" LLVMBool LLVMOptRemarkParserHasError(LLVMOptRemarkParserRef Parser) {
-  return unwrap(Parser)->HadAnyErrors;
-}
-
-extern "C" const char *
-LLVMOptRemarkParserGetErrorMessage(LLVMOptRemarkParserRef Parser) {
-  return unwrap(Parser)->ErrorStream.str().c_str();
-}
-
-extern "C" void LLVMOptRemarkParserDispose(LLVMOptRemarkParserRef Parser) {
-  delete unwrap(Parser);
-}
diff --git a/tools/llvm-opt-report/CMakeLists.txt b/tools/llvm-opt-report/CMakeLists.txt
index 3aabc03ab3f..777537a54c0 100644
--- a/tools/llvm-opt-report/CMakeLists.txt
+++ b/tools/llvm-opt-report/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(LLVM_LINK_COMPONENTS Core Demangle Object OptRemarks Support)
+set(LLVM_LINK_COMPONENTS Core Demangle Object Support)
 
 add_llvm_tool(llvm-opt-report
   OptReport.cpp
diff --git a/tools/llvm-opt-report/OptReport.cpp b/tools/llvm-opt-report/OptReport.cpp
index 071f779a9e6..aa7966132c2 100644
--- a/tools/llvm-opt-report/OptReport.cpp
+++ b/tools/llvm-opt-report/OptReport.cpp
@@ -28,7 +28,6 @@
 #include "llvm/Support/WithColor.h"
 #include "llvm/Support/YAMLTraits.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm-c/OptRemarks.h"
 #include <cstdlib>
 #include <map>
 #include <set>
@@ -143,44 +142,104 @@ typedef std::map<std::string, std::map<int, std::map<std::string, std::map<int,
           OptReportLocationInfo>>>> LocationInfoTy;
 } // anonymous namespace
 
-static bool readLocationInfo(LocationInfoTy &LocationInfo) {
-  ErrorOr<std::unique_ptr<MemoryBuffer>> Buf =
-      MemoryBuffer::getFile(InputFileName.c_str());
-  if (std::error_code EC = Buf.getError()) {
-    WithColor::error() << "Can't open file " << InputFileName << ": "
-                       << EC.message() << "\n";
-    return false;
-  }
+static void collectLocationInfo(yaml::Stream &Stream,
+                                LocationInfoTy &LocationInfo) {
+  SmallVector<char, 8> Tmp;
+
+  // Note: We're using the YAML parser here directly, instead of using the
+  // YAMLTraits implementation, because the YAMLTraits implementation does not
+  // support a way to handle only a subset of the input keys (it will error out
+  // if there is an input key that you don't map to your class), and
+  // furthermore, it does not provide a way to handle the Args sequence of
+  // key/value pairs, where the order must be captured and the 'String' key
+  // might be repeated.
+  for (auto &Doc : Stream) {
+    auto *Root = dyn_cast<yaml::MappingNode>(Doc.getRoot());
+    if (!Root)
+      continue;
 
-  StringRef Buffer = (*Buf)->getBuffer();
-  LLVMOptRemarkParserRef Parser =
-      LLVMOptRemarkParserCreate(Buffer.data(), Buffer.size());
-
-  LLVMOptRemarkEntry *Remark = nullptr;
-  while ((Remark = LLVMOptRemarkParserGetNext(Parser))) {
-    bool Transformed =
-        StringRef(Remark->RemarkType.Str, Remark->RemarkType.Len) == "!Passed";
-    StringRef Pass(Remark->PassName.Str, Remark->PassName.Len);
-    StringRef File(Remark->DebugLoc.SourceFile.Str,
-                   Remark->DebugLoc.SourceFile.Len);
-    StringRef Function(Remark->FunctionName.Str, Remark->FunctionName.Len);
-    uint32_t Line = Remark->DebugLoc.SourceLineNumber;
-    uint32_t Column = Remark->DebugLoc.SourceColumnNumber;
-    ArrayRef<LLVMOptRemarkArg> Args(Remark->Args, Remark->NumArgs);
+    bool Transformed = Root->getRawTag() == "!Passed";
+    std::string Pass, File, Function;
+    int Line = 0, Column = 1;
 
     int VectorizationFactor = 1;
     int InterleaveCount = 1;
     int UnrollCount = 1;
 
-    for (const LLVMOptRemarkArg &Arg : Args) {
-      StringRef ArgKeyName(Arg.Key.Str, Arg.Key.Len);
-      StringRef ArgValue(Arg.Value.Str, Arg.Value.Len);
-      if (ArgKeyName == "VectorizationFactor")
-        ArgValue.getAsInteger(10, VectorizationFactor);
-      else if (ArgKeyName == "InterleaveCount")
-        ArgValue.getAsInteger(10, InterleaveCount);
-      else if (ArgKeyName == "UnrollCount")
-        ArgValue.getAsInteger(10, UnrollCount);
+    for (auto &RootChild : *Root) {
+      auto *Key = dyn_cast<yaml::ScalarNode>(RootChild.getKey());
+      if (!Key)
+        continue;
+      StringRef KeyName = Key->getValue(Tmp);
+      if (KeyName == "Pass") {
+        auto *Value = dyn_cast<yaml::ScalarNode>(RootChild.getValue());
+        if (!Value)
+          continue;
+        Pass = Value->getValue(Tmp);
+      } else if (KeyName == "Function") {
+        auto *Value = dyn_cast<yaml::ScalarNode>(RootChild.getValue());
+        if (!Value)
+          continue;
+        Function = Value->getValue(Tmp);
+      } else if (KeyName == "DebugLoc") {
+        auto *DebugLoc = dyn_cast<yaml::MappingNode>(RootChild.getValue());
+        if (!DebugLoc)
+          continue;
+
+        for (auto &DLChild : *DebugLoc) {
+          auto *DLKey = dyn_cast<yaml::ScalarNode>(DLChild.getKey());
+          if (!DLKey)
+            continue;
+          StringRef DLKeyName = DLKey->getValue(Tmp);
+          if (DLKeyName == "File") {
+            auto *Value = dyn_cast<yaml::ScalarNode>(DLChild.getValue());
+            if (!Value)
+              continue;
+            File = Value->getValue(Tmp);
+          } else if (DLKeyName == "Line") {
+            auto *Value = dyn_cast<yaml::ScalarNode>(DLChild.getValue());
+            if (!Value)
+              continue;
+            Value->getValue(Tmp).getAsInteger(10, Line);
+          } else if (DLKeyName == "Column") {
+            auto *Value = dyn_cast<yaml::ScalarNode>(DLChild.getValue());
+            if (!Value)
+              continue;
+            Value->getValue(Tmp).getAsInteger(10, Column);
+          }
+        }
+      } else if (KeyName == "Args") {
+        auto *Args = dyn_cast<yaml::SequenceNode>(RootChild.getValue());
+        if (!Args)
+          continue;
+        for (auto &ArgChild : *Args) {
+          auto *ArgMap = dyn_cast<yaml::MappingNode>(&ArgChild);
+          if (!ArgMap)
+            continue;
+          for (auto &ArgKV : *ArgMap) {
+            auto *ArgKey = dyn_cast<yaml::ScalarNode>(ArgKV.getKey());
+            if (!ArgKey)
+              continue;
+            StringRef ArgKeyName = ArgKey->getValue(Tmp);
+            if (ArgKeyName == "VectorizationFactor") {
+              auto *Value = dyn_cast<yaml::ScalarNode>(ArgKV.getValue());
+              if (!Value)
+                continue;
+              Value->getValue(Tmp).getAsInteger(10, VectorizationFactor);
+            } else if (ArgKeyName == "InterleaveCount") {
+              auto *Value = dyn_cast<yaml::ScalarNode>(ArgKV.getValue());
+              if (!Value)
+                continue;
+              Value->getValue(Tmp).getAsInteger(10, InterleaveCount);
+            } else if (ArgKeyName == "UnrollCount") {
+              auto *Value = dyn_cast<yaml::ScalarNode>(ArgKV.getValue());
+              if (!Value)
+                continue;
+              Value->getValue(Tmp).getAsInteger(10, UnrollCount);
+            }
+          }
+        }
+      }
     }
 
     if (Line < 1 || File.empty())
@@ -209,13 +268,22 @@ static bool readLocationInfo(LocationInfoTy &LocationInfo) {
       UpdateLLII(LI.Vectorized);
     }
   }
+}
+
+static bool readLocationInfo(LocationInfoTy &LocationInfo) {
+  ErrorOr<std::unique_ptr<MemoryBuffer>> Buf =
+      MemoryBuffer::getFileOrSTDIN(InputFileName);
+  if (std::error_code EC = Buf.getError()) {
+    WithColor::error() << "Can't open file " << InputFileName << ": "
+                       << EC.message() << "\n";
+    return false;
+  }
 
-  bool HasError = LLVMOptRemarkParserHasError(Parser);
-  if (HasError)
-    WithColor::error() << LLVMOptRemarkParserGetErrorMessage(Parser) << "\n";
+  SourceMgr SM;
+  yaml::Stream Stream(Buf.get()->getBuffer(), SM);
+  collectLocationInfo(Stream, LocationInfo);
 
-  LLVMOptRemarkParserDispose(Parser);
-  return !HasError;
+  return true;
 }
 
 static bool writeReport(LocationInfoTy &LocationInfo) {
diff --git a/unittests/CMakeLists.txt b/unittests/CMakeLists.txt
index 5dba2de4a88..bc41ab66a23 100644
--- a/unittests/CMakeLists.txt
+++ b/unittests/CMakeLists.txt
@@ -26,7 +26,6 @@ add_subdirectory(MI)
 add_subdirectory(Object)
 add_subdirectory(ObjectYAML)
 add_subdirectory(Option)
-add_subdirectory(OptRemarks)
 add_subdirectory(Passes)
 add_subdirectory(ProfileData)
 add_subdirectory(Support)
diff --git a/unittests/OptRemarks/CMakeLists.txt b/unittests/OptRemarks/CMakeLists.txt
deleted file mode 100644
index 94c74867cc4..00000000000
--- a/unittests/OptRemarks/CMakeLists.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-set(LLVM_LINK_COMPONENTS
-  OptRemarks
-  Support
-  )
-
-add_llvm_unittest(OptRemarksTests
-  OptRemarksParsingTest.cpp
-  )
diff --git a/unittests/OptRemarks/OptRemarksParsingTest.cpp b/unittests/OptRemarks/OptRemarksParsingTest.cpp
deleted file mode 100644
index a28820ffb7f..00000000000
--- a/unittests/OptRemarks/OptRemarksParsingTest.cpp
+++ /dev/null
@@ -1,433 +0,0 @@
-//===- unittest/Support/OptRemarksParsingTest.cpp - OptTable tests --------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm-c/OptRemarks.h"
-#include "gtest/gtest.h"
-
-using namespace llvm;
-
-template <size_t N> bool tryParse(const char (&Buf)[N]) {
-  LLVMOptRemarkParserRef Parser = LLVMOptRemarkParserCreate(Buf, N - 1);
-  LLVMOptRemarkEntry *Remark = nullptr;
-  while (LLVMOptRemarkEntry *NewRemark = LLVMOptRemarkParserGetNext(Parser)) {
-    EXPECT_TRUE(Remark == nullptr); // Only one remark per test.
-    Remark = NewRemark;
-  }
-  EXPECT_TRUE(Remark != nullptr); // We need *exactly* one remark per test.
-  bool HasError = LLVMOptRemarkParserHasError(Parser);
-  LLVMOptRemarkParserDispose(Parser);
-  return !HasError;
-}
-
-template <size_t N>
-bool parseExpectError(const char (&Buf)[N], const char *Error) {
-  LLVMOptRemarkParserRef Parser = LLVMOptRemarkParserCreate(Buf, N - 1);
-  LLVMOptRemarkEntry *Remark = nullptr;
-  while (LLVMOptRemarkEntry *NewRemark = LLVMOptRemarkParserGetNext(Parser)) {
-    EXPECT_FALSE(NewRemark);
-  }
-  EXPECT_TRUE(Remark == nullptr); // We are parsing only one malformed remark.
-  EXPECT_TRUE(LLVMOptRemarkParserHasError(Parser));
-  bool MatchesError =
-      StringRef(LLVMOptRemarkParserGetErrorMessage(Parser)).contains(Error);
-  LLVMOptRemarkParserDispose(Parser);
-
-  return MatchesError;
-}
-
-TEST(OptRemarks, OptRemarksParsingEmpty) {
-  StringRef Buf = R"YAML(
-)YAML";
-  LLVMOptRemarkParserRef Parser =
-      LLVMOptRemarkParserCreate(Buf.data(), Buf.size());
-  LLVMOptRemarkEntry *NewRemark = LLVMOptRemarkParserGetNext(Parser);
-  EXPECT_TRUE(NewRemark == nullptr); // No remark expected.
-  EXPECT_TRUE(LLVMOptRemarkParserHasError(Parser));
-  EXPECT_TRUE(StringRef(LLVMOptRemarkParserGetErrorMessage(Parser))
-                  .contains("document root is not of mapping type."));
-  LLVMOptRemarkParserDispose(Parser);
-}
-
-TEST(OptRemarks, OptRemarksParsingGood) {
-  EXPECT_TRUE(tryParse(R"YAML(
---- !Missed
-Pass: inline
-Name: NoDefinition
-DebugLoc: { File: file.c, Line: 3, Column: 12 }
-Function: foo
-Args:
-  - Callee: bar
-  - String: ' will not be inlined into '
-  - Caller: foo
-    DebugLoc: { File: file.c, Line: 2, Column: 0 }
-  - String: ' because its definition is unavailable'
-)YAML"));
-
-  // No debug loc should also pass.
-  EXPECT_TRUE(tryParse(R"YAML(
---- !Missed
-Pass: inline
-Name: NoDefinition
-Function: foo
-Args:
-  - Callee: bar
-  - String: ' will not be inlined into '
-  - Caller: foo
-    DebugLoc: { File: file.c, Line: 2, Column: 0 }
-  - String: ' because its definition is unavailable'
-)YAML"));
-
-  // No args is also ok.
-  EXPECT_TRUE(tryParse(R"YAML(
---- !Missed
-Pass: inline
-Name: NoDefinition
-DebugLoc: { File: file.c, Line: 3, Column: 12 }
-Function: foo
-)YAML"));
-
-  // Different order.
-  EXPECT_TRUE(tryParse(R"YAML(
---- !Missed
-DebugLoc: { Line: 3, Column: 12, File: file.c }
-Function: foo
-Name: NoDefinition
-Args:
-  - Callee: bar
-  - String: ' will not be inlined into '
-  - Caller: foo
-    DebugLoc: { File: file.c, Line: 2, Column: 0 }
-  - String: ' because its definition is unavailable'
-Pass: inline
-)YAML"));
-}
-
-// Mandatory common part of a remark.
-#define COMMON_REMARK "\nPass: inline\nName: NoDefinition\nFunction: foo\n"
-// Test all the types.
-TEST(OptRemarks, OptRemarksParsingTypes) {
-  // Type: Passed
-  EXPECT_TRUE(tryParse("--- !Passed" COMMON_REMARK));
-  // Type: Missed
-  EXPECT_TRUE(tryParse("--- !Missed" COMMON_REMARK));
-  // Type: Analysis
-  EXPECT_TRUE(tryParse("--- !Analysis" COMMON_REMARK));
-  // Type: AnalysisFPCompute
-  EXPECT_TRUE(tryParse("--- !AnalysisFPCompute" COMMON_REMARK));
-  // Type: AnalysisAliasing
-  EXPECT_TRUE(tryParse("--- !AnalysisAliasing" COMMON_REMARK));
-  // Type: Failure
-  EXPECT_TRUE(tryParse("--- !Failure" COMMON_REMARK));
-}
-#undef COMMON_REMARK
-
-TEST(OptRemarks, OptRemarksParsingMissingFields) {
-  // No type.
-  EXPECT_TRUE(parseExpectError(R"YAML(
----
-Pass: inline
-Name: NoDefinition
-Function: foo
-)YAML",
-                               "error: Type, Pass, Name or Function missing."));
-  // No pass.
-  EXPECT_TRUE(parseExpectError(R"YAML(
---- !Missed
-Name: NoDefinition
-Function: foo
-)YAML",
-                               "error: Type, Pass, Name or Function missing."));
-  // No name.
-  EXPECT_TRUE(parseExpectError(R"YAML(
---- !Missed
-Pass: inline
-Function: foo
-)YAML",
-                               "error: Type, Pass, Name or Function missing."));
-  // No function.
-  EXPECT_TRUE(parseExpectError(R"YAML(
---- !Missed
-Pass: inline
-Name: NoDefinition
-)YAML",
-                               "error: Type, Pass, Name or Function missing."));
-  // Debug loc but no file.
-  EXPECT_TRUE(parseExpectError(R"YAML(
---- !Missed
-Pass: inline
-Name: NoDefinition
-Function: foo
-DebugLoc: { Line: 3, Column: 12 }
-)YAML",
-                               "DebugLoc node incomplete."));
-  // Debug loc but no line.
-  EXPECT_TRUE(parseExpectError(R"YAML(
---- !Missed
-Pass: inline
-Name: NoDefinition
-Function: foo
-DebugLoc: { File: file.c, Column: 12 }
-)YAML",
-                               "DebugLoc node incomplete."));
-  // Debug loc but no column.
-  EXPECT_TRUE(parseExpectError(R"YAML(
---- !Missed
-Pass: inline
-Name: NoDefinition
-Function: foo
-DebugLoc: { File: file.c, Line: 3 }
-)YAML",
-                               "DebugLoc node incomplete."));
-}
-
-TEST(OptRemarks, OptRemarksParsingWrongTypes) {
-  // Wrong debug loc type.
-  EXPECT_TRUE(parseExpectError(R"YAML(
---- !Missed
-Pass: inline
-Name: NoDefinition
-Function: foo
-DebugLoc: foo
-)YAML",
-                               "expected a value of mapping type."));
-  // Wrong line type.
-  EXPECT_TRUE(parseExpectError(R"YAML(
---- !Missed
-Pass: inline
-Name: NoDefinition
-Function: foo
-DebugLoc: { File: file.c, Line: b, Column: 12 }
-)YAML",
-                               "expected a value of integer type."));
-  // Wrong column type.
-  EXPECT_TRUE(parseExpectError(R"YAML(
---- !Missed
-Pass: inline
-Name: NoDefinition
-Function: foo
-DebugLoc: { File: file.c, Line: 3, Column: c }
-)YAML",
-                               "expected a value of integer type."));
-  // Wrong args type.
-  EXPECT_TRUE(parseExpectError(R"YAML(
---- !Missed
-Pass: inline
-Name: NoDefinition
-Function: foo
-Args: foo
-)YAML",
-                               "wrong value type for key."));
-  // Wrong key type.
-  EXPECT_TRUE(parseExpectError(R"YAML(
---- !Missed
-{ A: a }: inline
-Name: NoDefinition
-Function: foo
-)YAML",
-                               "key is not a string."));
-  // Debug loc with unknown entry.
-  EXPECT_TRUE(parseExpectError(R"YAML(
---- !Missed
-Pass: inline
-Name: NoDefinition
-Function: foo
-DebugLoc: { File: file.c, Column: 12, Unknown: 12 }
-)YAML",
-                               "unknown entry in DebugLoc map."));
-  // Unknown entry.
-  EXPECT_TRUE(parseExpectError(R"YAML(
---- !Missed
-Unknown: inline
-)YAML",
-                               "unknown key."));
-  // Not a scalar.
-  EXPECT_TRUE(parseExpectError(R"YAML(
---- !Missed
-Pass: { File: a, Line: 1, Column: 2 }
-Name: NoDefinition
-Function: foo
-)YAML",
-                               "expected a value of scalar type."));
-  // Not a string file in debug loc.
-  EXPECT_TRUE(parseExpectError(R"YAML(
---- !Missed
-Pass: inline
-Name: NoDefinition
-Function: foo
-DebugLoc: { File: { a: b }, Column: 12, Line: 12 }
-)YAML",
-                               "expected a value of scalar type."));
-  // Not a integer column in debug loc.
-  EXPECT_TRUE(parseExpectError(R"YAML(
---- !Missed
-Pass: inline
-Name: NoDefinition
-Function: foo
-DebugLoc: { File: file.c, Column: { a: b }, Line: 12 }
-)YAML",
-                               "expected a value of scalar type."));
-  // Not a integer line in debug loc.
-  EXPECT_TRUE(parseExpectError(R"YAML(
---- !Missed
-Pass: inline
-Name: NoDefinition
-Function: foo
-DebugLoc: { File: file.c, Column: 12, Line: { a: b } }
-)YAML",
-                               "expected a value of scalar type."));
-  // Not a mapping type value for args.
-  EXPECT_TRUE(parseExpectError(R"YAML(
---- !Missed
-Pass: inline
-Name: NoDefinition
-Function: foo
-DebugLoc: { File: file.c, Column: 12, Line: { a: b } }
-)YAML",
-                               "expected a value of scalar type."));
-}
-
-TEST(OptRemarks, OptRemarksParsingWrongArgs) {
-  // Multiple debug locs per arg.
-  EXPECT_TRUE(
-      parseExpectError(R"YAML(
---- !Missed
-Pass: inline
-Name: NoDefinition
-Function: foo
-Args:
-  - Str: string
-    DebugLoc: { File: a, Line: 1, Column: 2 }
-    DebugLoc: { File: a, Line: 1, Column: 2 }
-)YAML",
-                       "only one DebugLoc entry is allowed per argument."));
-  // Multiple strings per arg.
-  EXPECT_TRUE(
-      parseExpectError(R"YAML(
---- !Missed
-Pass: inline
-Name: NoDefinition
-Function: foo
-Args:
-  - Str: string
-    Str2: string
-    DebugLoc: { File: a, Line: 1, Column: 2 }
-)YAML",
-                       "only one string entry is allowed per argument."));
-  // No arg value.
-  EXPECT_TRUE(parseExpectError(R"YAML(
---- !Missed
-Pass: inline
-Name: NoDefinition
-Function: foo
-Args:
-  - Callee: ''
-  - DebugLoc: { File: a, Line: 1, Column: 2 }
-)YAML",
-                               "argument value is missing."));
-  // No arg value.
-  EXPECT_TRUE(parseExpectError(R"YAML(
---- !Missed
-Pass: inline
-Name: NoDefinition
-Function: foo
-Args:
-  - DebugLoc: { File: a, Line: 1, Column: 2 }
-)YAML",
-                               "argument key is missing."));
-
-}
-
-TEST(OptRemarks, OptRemarksGoodStruct) {
-  StringRef Buf = R"YAML(
---- !Missed
-Pass: inline
-Name: NoDefinition
-DebugLoc: { File: file.c, Line: 3, Column: 12 }
-Function: foo
-Args:
-  - Callee: bar
-  - String: ' will not be inlined into '
-  - Caller: foo
-    DebugLoc: { File: file.c, Line: 2, Column: 0 }
-  - String: ' because its definition is unavailable'
-)YAML";
-
-  LLVMOptRemarkParserRef Parser =
-      LLVMOptRemarkParserCreate(Buf.data(), Buf.size());
-  LLVMOptRemarkEntry *Remark = LLVMOptRemarkParserGetNext(Parser);
-  EXPECT_FALSE(Remark == nullptr);
-  EXPECT_EQ(StringRef(Remark->RemarkType.Str, 7), "!Missed");
-  EXPECT_EQ(Remark->RemarkType.Len, 7U);
-  EXPECT_EQ(StringRef(Remark->PassName.Str, 6), "inline");
-  EXPECT_EQ(Remark->PassName.Len, 6U);
-  EXPECT_EQ(StringRef(Remark->RemarkName.Str, 12), "NoDefinition");
-  EXPECT_EQ(Remark->RemarkName.Len, 12U);
-  EXPECT_EQ(StringRef(Remark->FunctionName.Str, 3), "foo");
-  EXPECT_EQ(Remark->FunctionName.Len, 3U);
-  EXPECT_EQ(StringRef(Remark->DebugLoc.SourceFile.Str, 6), "file.c");
-  EXPECT_EQ(Remark->DebugLoc.SourceFile.Len, 6U);
-  EXPECT_EQ(Remark->DebugLoc.SourceLineNumber, 3U);
-  EXPECT_EQ(Remark->DebugLoc.SourceColumnNumber, 12U);
-  EXPECT_EQ(Remark->Hotness, 0U);
-  EXPECT_EQ(Remark->NumArgs, 4U);
-  // Arg 0
-  {
-    LLVMOptRemarkArg &Arg = Remark->Args[0];
-    EXPECT_EQ(StringRef(Arg.Key.Str, 6), "Callee");
-    EXPECT_EQ(Arg.Key.Len, 6U);
-    EXPECT_EQ(StringRef(Arg.Value.Str, 3), "bar");
-    EXPECT_EQ(Arg.Value.Len, 3U);
-    EXPECT_EQ(StringRef(Arg.DebugLoc.SourceFile.Str, 0), "");
-    EXPECT_EQ(Arg.DebugLoc.SourceFile.Len, 0U);
-    EXPECT_EQ(Arg.DebugLoc.SourceLineNumber, 0U);
-    EXPECT_EQ(Arg.DebugLoc.SourceColumnNumber, 0U);
-  }
-  // Arg 1
-  {
-    LLVMOptRemarkArg &Arg = Remark->Args[1];
-    EXPECT_EQ(StringRef(Arg.Key.Str, 6), "String");
-    EXPECT_EQ(Arg.Key.Len, 6U);
-    EXPECT_EQ(StringRef(Arg.Value.Str, 26), " will not be inlined into ");
-    EXPECT_EQ(Arg.Value.Len, 26U);
-    EXPECT_EQ(StringRef(Arg.DebugLoc.SourceFile.Str, 0), "");
-    EXPECT_EQ(Arg.DebugLoc.SourceFile.Len, 0U);
-    EXPECT_EQ(Arg.DebugLoc.SourceLineNumber, 0U);
-    EXPECT_EQ(Arg.DebugLoc.SourceColumnNumber, 0U);
-  }
-  // Arg 2
-  {
-    LLVMOptRemarkArg &Arg = Remark->Args[2];
-    EXPECT_EQ(StringRef(Arg.Key.Str, 6), "Caller");
-    EXPECT_EQ(Arg.Key.Len, 6U);
-    EXPECT_EQ(StringRef(Arg.Value.Str, 3), "foo");
-    EXPECT_EQ(Arg.Value.Len, 3U);
-    EXPECT_EQ(StringRef(Arg.DebugLoc.SourceFile.Str, 6), "file.c");
-    EXPECT_EQ(Arg.DebugLoc.SourceFile.Len, 6U);
-    EXPECT_EQ(Arg.DebugLoc.SourceLineNumber, 2U);
-    EXPECT_EQ(Arg.DebugLoc.SourceColumnNumber, 0U);
-  }
-  // Arg 3
-  {
-    LLVMOptRemarkArg &Arg = Remark->Args[3];
-    EXPECT_EQ(StringRef(Arg.Key.Str, 6), "String");
-    EXPECT_EQ(Arg.Key.Len, 6U);
-    EXPECT_EQ(StringRef(Arg.Value.Str, 38),
-              " because its definition is unavailable");
-    EXPECT_EQ(Arg.Value.Len, 38U);
-    EXPECT_EQ(StringRef(Arg.DebugLoc.SourceFile.Str, 0), "");
-    EXPECT_EQ(Arg.DebugLoc.SourceFile.Len, 0U);
-    EXPECT_EQ(Arg.DebugLoc.SourceLineNumber, 0U);
-    EXPECT_EQ(Arg.DebugLoc.SourceColumnNumber, 0U);
-  }
-
-  EXPECT_EQ(LLVMOptRemarkParserGetNext(Parser), nullptr);
-
-  EXPECT_FALSE(LLVMOptRemarkParserHasError(Parser));
-  LLVMOptRemarkParserDispose(Parser);
-}
-- 
GitLab


From 2a9ea3459b2a29c629a23d1d9b1287a20e7f7c1a Mon Sep 17 00:00:00 2001
From: Scott Linder <scott@scottlinder.com>
Date: Wed, 10 Oct 2018 18:14:02 +0000
Subject: [PATCH 0014/1116] [Support] Remove redundant qualifiers in YAMLTraits
 (NFC)

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344166 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Support/YAMLTraits.h | 12 -------
 lib/Support/YAMLTraits.cpp        | 56 +++++++++++++++----------------
 2 files changed, 28 insertions(+), 40 deletions(-)

diff --git a/include/llvm/Support/YAMLTraits.h b/include/llvm/Support/YAMLTraits.h
index 6836aa2aa06..5d029ad5ce9 100644
--- a/include/llvm/Support/YAMLTraits.h
+++ b/include/llvm/Support/YAMLTraits.h
@@ -250,7 +250,6 @@ struct has_ScalarEnumerationTraits
   template <typename U>
   static double test(...);
 
-public:
   static bool const value =
     (sizeof(test<ScalarEnumerationTraits<T>>(nullptr)) == 1);
 };
@@ -267,7 +266,6 @@ struct has_ScalarBitSetTraits
   template <typename U>
   static double test(...);
 
-public:
   static bool const value = (sizeof(test<ScalarBitSetTraits<T>>(nullptr)) == 1);
 };
 
@@ -287,7 +285,6 @@ struct has_ScalarTraits
   template <typename U>
   static double test(...);
 
-public:
   static bool const value =
       (sizeof(test<ScalarTraits<T>>(nullptr, nullptr, nullptr)) == 1);
 };
@@ -306,7 +303,6 @@ struct has_BlockScalarTraits
   template <typename U>
   static double test(...);
 
-public:
   static bool const value =
       (sizeof(test<BlockScalarTraits<T>>(nullptr, nullptr)) == 1);
 };
@@ -321,7 +317,6 @@ template <class T, class Context> struct has_MappingTraits {
   template <typename U>
   static double test(...);
 
-public:
   static bool const value =
       (sizeof(test<MappingContextTraits<T, Context>>(nullptr)) == 1);
 };
@@ -335,7 +330,6 @@ template <class T> struct has_MappingTraits<T, EmptyContext> {
 
   template <typename U> static double test(...);
 
-public:
   static bool const value = (sizeof(test<MappingTraits<T>>(nullptr)) == 1);
 };
 
@@ -349,7 +343,6 @@ template <class T, class Context> struct has_MappingValidateTraits {
   template <typename U>
   static double test(...);
 
-public:
   static bool const value =
       (sizeof(test<MappingContextTraits<T, Context>>(nullptr)) == 1);
 };
@@ -363,7 +356,6 @@ template <class T> struct has_MappingValidateTraits<T, EmptyContext> {
 
   template <typename U> static double test(...);
 
-public:
   static bool const value = (sizeof(test<MappingTraits<T>>(nullptr)) == 1);
 };
 
@@ -379,7 +371,6 @@ struct has_SequenceMethodTraits
   template <typename U>
   static double test(...);
 
-public:
   static bool const value =  (sizeof(test<SequenceTraits<T>>(nullptr)) == 1);
 };
 
@@ -395,7 +386,6 @@ struct has_CustomMappingTraits
   template <typename U>
   static double test(...);
 
-public:
   static bool const value =
       (sizeof(test<CustomMappingTraits<T>>(nullptr)) == 1);
 };
@@ -425,7 +415,6 @@ struct has_FlowTraits<T, true>
   template<typename C>
   static char (&f(...))[2];
 
-public:
   static bool const value = sizeof(f<Derived>(nullptr)) == 2;
 };
 
@@ -446,7 +435,6 @@ struct has_DocumentListTraits
   template <typename U>
   static double test(...);
 
-public:
   static bool const value = (sizeof(test<DocumentListTraits<T>>(nullptr))==1);
 };
 
diff --git a/lib/Support/YAMLTraits.cpp b/lib/Support/YAMLTraits.cpp
index d6345efd00c..f8492c96bab 100644
--- a/lib/Support/YAMLTraits.cpp
+++ b/lib/Support/YAMLTraits.cpp
@@ -98,7 +98,7 @@ bool Input::setCurrentDocument() {
       ++DocIterator;
       return setCurrentDocument();
     }
-    TopNode = this->createHNodes(N);
+    TopNode = createHNodes(N);
     CurrentNode = TopNode.get();
     return true;
   }
@@ -343,7 +343,7 @@ void Input::blockScalarString(StringRef &S) { scalarString(S, QuotingType::None)
 
 void Input::setError(HNode *hnode, const Twine &message) {
   assert(hnode && "HNode must not be NULL");
-  this->setError(hnode->_node, message);
+  setError(hnode->_node, message);
 }
 
 void Input::setError(Node *node, const Twine &message) {
@@ -366,7 +366,7 @@ std::unique_ptr<Input::HNode> Input::createHNodes(Node *N) {
   } else if (SequenceNode *SQ = dyn_cast<SequenceNode>(N)) {
     auto SQHNode = llvm::make_unique<SequenceHNode>(N);
     for (Node &SN : *SQ) {
-      auto Entry = this->createHNodes(&SN);
+      auto Entry = createHNodes(&SN);
       if (EC)
         break;
       SQHNode->Entries.push_back(std::move(Entry));
@@ -391,7 +391,7 @@ std::unique_ptr<Input::HNode> Input::createHNodes(Node *N) {
         // Copy string to permanent storage
         KeyStr = StringStorage.str().copy(StringAllocator);
       }
-      auto ValueHNode = this->createHNodes(Value);
+      auto ValueHNode = createHNodes(Value);
       if (EC)
         break;
       mapHNode->Mapping[KeyStr] = std::move(ValueHNode);
@@ -406,7 +406,7 @@ std::unique_ptr<Input::HNode> Input::createHNodes(Node *N) {
 }
 
 void Input::setError(const Twine &Message) {
-  this->setError(CurrentNode, Message);
+  setError(CurrentNode, Message);
 }
 
 bool Input::canElideEmptySequence() {
@@ -440,11 +440,11 @@ bool Output::mapTag(StringRef Tag, bool Use) {
         StateStack.size() > 1 && (StateStack[StateStack.size() - 2] == inSeq ||
           StateStack[StateStack.size() - 2] == inFlowSeq);
     if (SequenceElement && StateStack.back() == inMapFirstKey) {
-      this->newLineCheck();
+      newLineCheck();
     } else {
-      this->output(" ");
+      output(" ");
     }
-    this->output(Tag);
+    output(Tag);
     if (SequenceElement) {
       // If we're writing the tag during the first element of a map, the tag
       // takes the place of the first element in the sequence.
@@ -476,8 +476,8 @@ bool Output::preflightKey(const char *Key, bool Required, bool SameAsDefault,
     if (State == inFlowMapFirstKey || State == inFlowMapOtherKey) {
       flowKey(Key);
     } else {
-      this->newLineCheck();
-      this->paddedKey(Key);
+      newLineCheck();
+      paddedKey(Key);
     }
     return true;
   }
@@ -496,23 +496,23 @@ void Output::postflightKey(void *) {
 
 void Output::beginFlowMapping() {
   StateStack.push_back(inFlowMapFirstKey);
-  this->newLineCheck();
+  newLineCheck();
   ColumnAtMapFlowStart = Column;
   output("{ ");
 }
 
 void Output::endFlowMapping() {
   StateStack.pop_back();
-  this->outputUpToEndOfLine(" }");
+  outputUpToEndOfLine(" }");
 }
 
 void Output::beginDocuments() {
-  this->outputUpToEndOfLine("---");
+  outputUpToEndOfLine("---");
 }
 
 bool Output::preflightDocument(unsigned index) {
   if (index > 0)
-    this->outputUpToEndOfLine("\n---");
+    outputUpToEndOfLine("\n---");
   return true;
 }
 
@@ -542,7 +542,7 @@ void Output::postflightElement(void *) {
 
 unsigned Output::beginFlowSequence() {
   StateStack.push_back(inFlowSeq);
-  this->newLineCheck();
+  newLineCheck();
   ColumnAtFlowStart = Column;
   output("[ ");
   NeedFlowSequenceComma = false;
@@ -551,7 +551,7 @@ unsigned Output::beginFlowSequence() {
 
 void Output::endFlowSequence() {
   StateStack.pop_back();
-  this->outputUpToEndOfLine(" ]");
+  outputUpToEndOfLine(" ]");
 }
 
 bool Output::preflightFlowElement(unsigned, void *&) {
@@ -577,8 +577,8 @@ void Output::beginEnumScalar() {
 
 bool Output::matchEnumScalar(const char *Str, bool Match) {
   if (Match && !EnumerationMatchFound) {
-    this->newLineCheck();
-    this->outputUpToEndOfLine(Str);
+    newLineCheck();
+    outputUpToEndOfLine(Str);
     EnumerationMatchFound = true;
   }
   return false;
@@ -597,7 +597,7 @@ void Output::endEnumScalar() {
 }
 
 bool Output::beginBitSetScalar(bool &DoClear) {
-  this->newLineCheck();
+  newLineCheck();
   output("[ ");
   NeedBitValueComma = false;
   DoClear = false;
@@ -608,27 +608,27 @@ bool Output::bitSetMatch(const char *Str, bool Matches) {
   if (Matches) {
     if (NeedBitValueComma)
       output(", ");
-    this->output(Str);
+    output(Str);
     NeedBitValueComma = true;
   }
   return false;
 }
 
 void Output::endBitSetScalar() {
-  this->outputUpToEndOfLine(" ]");
+  outputUpToEndOfLine(" ]");
 }
 
 void Output::scalarString(StringRef &S, QuotingType MustQuote) {
-  this->newLineCheck();
+  newLineCheck();
   if (S.empty()) {
     // Print '' for the empty string because leaving the field empty is not
     // allowed.
-    this->outputUpToEndOfLine("''");
+    outputUpToEndOfLine("''");
     return;
   }
   if (MustQuote == QuotingType::None) {
     // Only quote if we must.
-    this->outputUpToEndOfLine(S);
+    outputUpToEndOfLine(S);
     return;
   }
 
@@ -645,7 +645,7 @@ void Output::scalarString(StringRef &S, QuotingType MustQuote) {
   // escapes. This is handled in yaml::escape.
   if (MustQuote == QuotingType::Double) {
     output(yaml::escape(Base, /* EscapePrintable= */ false));
-    this->outputUpToEndOfLine(Quote);
+    outputUpToEndOfLine(Quote);
     return;
   }
 
@@ -659,7 +659,7 @@ void Output::scalarString(StringRef &S, QuotingType MustQuote) {
     ++j;
   }
   output(StringRef(&Base[i], j - i));
-  this->outputUpToEndOfLine(Quote); // Ending quote.
+  outputUpToEndOfLine(Quote); // Ending quote.
 }
 
 void Output::blockScalarString(StringRef &S) {
@@ -702,7 +702,7 @@ void Output::output(StringRef s) {
 }
 
 void Output::outputUpToEndOfLine(StringRef s) {
-  this->output(s);
+  output(s);
   if (StateStack.empty() || (StateStack.back() != inFlowSeq &&
                              StateStack.back() != inFlowMapFirstKey &&
                              StateStack.back() != inFlowMapOtherKey))
@@ -723,7 +723,7 @@ void Output::newLineCheck() {
     return;
   NeedsNewLine = false;
 
-  this->outputNewLine();
+  outputNewLine();
 
   assert(StateStack.size() > 0);
   unsigned Indent = StateStack.size() - 1;
-- 
GitLab


From 105b05e085580cdd3b9ef95db08e3661b8532232 Mon Sep 17 00:00:00 2001
From: Francis Visoiu Mistrih <francisvm@yahoo.com>
Date: Wed, 10 Oct 2018 18:43:42 +0000
Subject: [PATCH 0015/1116] Reland: [OptRemarks] Add library for parsing
 optimization remarks

Add a library that parses optimization remarks (currently YAML, so based
on the YAMLParser).

The goal is to be able to provide tools a remark parser that is not
completely dependent on YAML, in case we decide to change the format
later.

It exposes a C API which takes a handler that is called with the remark
structure.

It adds a libLLVMOptRemark.a static library, and it's used in-tree by
the llvm-opt-report tool (from which the parser has been mostly moved
out).

Differential Revision: https://reviews.llvm.org/D52776

Fixed the tests by removing the usage of C++11 strings, which seems not
to be supported by gcc 4.8.4 if they're used as a macro argument.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344171 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm-c/OptRemarks.h                   | 197 ++++++++
 lib/CMakeLists.txt                            |   1 +
 lib/LLVMBuild.txt                             |   1 +
 lib/OptRemarks/CMakeLists.txt                 |   3 +
 lib/OptRemarks/LLVMBuild.txt                  |  22 +
 lib/OptRemarks/OptRemarksParser.cpp           | 368 +++++++++++++++
 tools/llvm-opt-report/CMakeLists.txt          |   2 +-
 tools/llvm-opt-report/OptReport.cpp           | 144 ++----
 unittests/CMakeLists.txt                      |   1 +
 unittests/OptRemarks/CMakeLists.txt           |   8 +
 .../OptRemarks/OptRemarksParsingTest.cpp      | 433 ++++++++++++++++++
 11 files changed, 1073 insertions(+), 107 deletions(-)
 create mode 100644 include/llvm-c/OptRemarks.h
 create mode 100644 lib/OptRemarks/CMakeLists.txt
 create mode 100644 lib/OptRemarks/LLVMBuild.txt
 create mode 100644 lib/OptRemarks/OptRemarksParser.cpp
 create mode 100644 unittests/OptRemarks/CMakeLists.txt
 create mode 100644 unittests/OptRemarks/OptRemarksParsingTest.cpp

diff --git a/include/llvm-c/OptRemarks.h b/include/llvm-c/OptRemarks.h
new file mode 100644
index 00000000000..f3449cc1b8c
--- /dev/null
+++ b/include/llvm-c/OptRemarks.h
@@ -0,0 +1,197 @@
+/*===-- llvm-c/OptRemarks.h - OptRemarks Public C Interface -------*- C -*-===*\
+|*                                                                            *|
+|*                     The LLVM Compiler Infrastructure                       *|
+|*                                                                            *|
+|* This file is distributed under the University of Illinois Open Source      *|
+|* License. See LICENSE.TXT for details.                                      *|
+|*                                                                            *|
+|*===----------------------------------------------------------------------===*|
+|*                                                                            *|
+|* This header provides a public interface to an opt-remark library.          *|
+|* LLVM provides an implementation of this interface.                         *|
+|*                                                                            *|
+\*===----------------------------------------------------------------------===*/
+
+#ifndef LLVM_C_OPT_REMARKS_H
+#define LLVM_C_OPT_REMARKS_H
+
+#include "llvm-c/Core.h"
+#include "llvm-c/Types.h"
+#ifdef __cplusplus
+#include <cstddef>
+extern "C" {
+#else
+#include <stddef.h>
+#endif /* !defined(__cplusplus) */
+
+/**
+ * @defgroup LLVMCOPTREMARKS OptRemarks
+ * @ingroup LLVMC
+ *
+ * @{
+ */
+
+#define OPT_REMARKS_API_VERSION 0
+
+/**
+ * String containing a buffer and a length. The buffer is not guaranteed to be
+ * zero-terminated.
+ *
+ * \since OPT_REMARKS_API_VERSION=0
+ */
+typedef struct {
+  const char *Str;
+  uint32_t Len;
+} LLVMOptRemarkStringRef;
+
+/**
+ * DebugLoc containing File, Line and Column.
+ *
+ * \since OPT_REMARKS_API_VERSION=0
+ */
+typedef struct {
+  // File:
+  LLVMOptRemarkStringRef SourceFile;
+  // Line:
+  uint32_t SourceLineNumber;
+  // Column:
+  uint32_t SourceColumnNumber;
+} LLVMOptRemarkDebugLoc;
+
+/**
+ * Element of the "Args" list. The key might give more information about what
+ * are the semantics of the value, e.g. "Callee" will tell you that the value
+ * is a symbol that names a function.
+ *
+ * \since OPT_REMARKS_API_VERSION=0
+ */
+typedef struct {
+  // e.g. "Callee"
+  LLVMOptRemarkStringRef Key;
+  // e.g. "malloc"
+  LLVMOptRemarkStringRef Value;
+
+  // "DebugLoc": Optional
+  LLVMOptRemarkDebugLoc DebugLoc;
+} LLVMOptRemarkArg;
+
+/**
+ * One remark entry.
+ *
+ * \since OPT_REMARKS_API_VERSION=0
+ */
+typedef struct {
+  // e.g. !Missed, !Passed
+  LLVMOptRemarkStringRef RemarkType;
+  // "Pass": Required
+  LLVMOptRemarkStringRef PassName;
+  // "Name": Required
+  LLVMOptRemarkStringRef RemarkName;
+  // "Function": Required
+  LLVMOptRemarkStringRef FunctionName;
+
+  // "DebugLoc": Optional
+  LLVMOptRemarkDebugLoc DebugLoc;
+  // "Hotness": Optional
+  uint32_t Hotness;
+  // "Args": Optional. It is an array of `num_args` elements.
+  uint32_t NumArgs;
+  LLVMOptRemarkArg *Args;
+} LLVMOptRemarkEntry;
+
+typedef struct LLVMOptRemarkOpaqueParser *LLVMOptRemarkParserRef;
+
+/**
+ * Creates a remark parser that can be used to read and parse the buffer located
+ * in \p Buf of size \p Size.
+ *
+ * \p Buf cannot be NULL.
+ *
+ * This function should be paired with LLVMOptRemarkParserDispose() to avoid
+ * leaking resources.
+ *
+ * \since OPT_REMARKS_API_VERSION=0
+ */
+extern LLVMOptRemarkParserRef LLVMOptRemarkParserCreate(const void *Buf,
+                                                        uint64_t Size);
+
+/**
+ * Returns the next remark in the file.
+ *
+ * The value pointed to by the return value is invalidated by the next call to
+ * LLVMOptRemarkParserGetNext().
+ *
+ * If the parser reaches the end of the buffer, the return value will be NULL.
+ *
+ * In the case of an error, the return value will be NULL, and:
+ *
+ * 1) LLVMOptRemarkParserHasError() will return `1`.
+ *
+ * 2) LLVMOptRemarkParserGetErrorMessage() will return a descriptive error
+ *    message.
+ *
+ * An error may occur if:
+ *
+ * 1) An argument is invalid.
+ *
+ * 2) There is a YAML parsing error. This type of error aborts parsing
+ *    immediately and returns `1`. It can occur on malformed YAML.
+ *
+ * 3) Remark parsing error. If this type of error occurs, the parser won't call
+ *    the handler and will continue to the next one. It can occur on malformed
+ *    remarks, like missing or extra fields in the file.
+ *
+ * Here is a quick example of the usage:
+ *
+ * ```
+ *  LLVMOptRemarkParserRef Parser = LLVMOptRemarkParserCreate(Buf, Size);
+ *  LLVMOptRemarkEntry *Remark = NULL;
+ *  while ((Remark == LLVMOptRemarkParserGetNext(Parser))) {
+ *    // use Remark
+ *  }
+ *  bool HasError = LLVMOptRemarkParserHasError(Parser);
+ *  LLVMOptRemarkParserDispose(Parser);
+ * ```
+ *
+ * \since OPT_REMARKS_API_VERSION=0
+ */
+extern LLVMOptRemarkEntry *
+LLVMOptRemarkParserGetNext(LLVMOptRemarkParserRef Parser);
+
+/**
+ * Returns `1` if the parser encountered an error while parsing the buffer.
+ *
+ * \since OPT_REMARKS_API_VERSION=0
+ */
+extern LLVMBool LLVMOptRemarkParserHasError(LLVMOptRemarkParserRef Parser);
+
+/**
+ * Returns a null-terminated string containing an error message.
+ *
+ * In case of no error, the result is `NULL`.
+ *
+ * The memory of the string is bound to the lifetime of \p Parser. If
+ * LLVMOptRemarkParserDispose() is called, the memory of the string will be
+ * released.
+ *
+ * \since OPT_REMARKS_API_VERSION=0
+ */
+extern const char *
+LLVMOptRemarkParserGetErrorMessage(LLVMOptRemarkParserRef Parser);
+
+/**
+ * Releases all the resources used by \p Parser.
+ *
+ * \since OPT_REMARKS_API_VERSION=0
+ */
+extern void LLVMOptRemarkParserDispose(LLVMOptRemarkParserRef Parser);
+
+/**
+ * @} // endgoup LLVMCOPTREMARKS
+ */
+
+#ifdef __cplusplus
+}
+#endif /* !defined(__cplusplus) */
+
+#endif /* LLVM_C_OPT_REMARKS_H */
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index ecf8b93d253..1f54c611bad 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -15,6 +15,7 @@ add_subdirectory(MC)
 add_subdirectory(Object)
 add_subdirectory(ObjectYAML)
 add_subdirectory(Option)
+add_subdirectory(OptRemarks)
 add_subdirectory(DebugInfo)
 add_subdirectory(ExecutionEngine)
 add_subdirectory(Target)
diff --git a/lib/LLVMBuild.txt b/lib/LLVMBuild.txt
index a6cd15699fb..0eb4bba2676 100644
--- a/lib/LLVMBuild.txt
+++ b/lib/LLVMBuild.txt
@@ -35,6 +35,7 @@ subdirectories =
  BinaryFormat
  ObjectYAML
  Option
+ OptRemarks
  Passes
  ProfileData
  Support
diff --git a/lib/OptRemarks/CMakeLists.txt b/lib/OptRemarks/CMakeLists.txt
new file mode 100644
index 00000000000..8fefe1d986b
--- /dev/null
+++ b/lib/OptRemarks/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_llvm_library(LLVMOptRemarks
+  OptRemarksParser.cpp
+)
diff --git a/lib/OptRemarks/LLVMBuild.txt b/lib/OptRemarks/LLVMBuild.txt
new file mode 100644
index 00000000000..4c1032296dc
--- /dev/null
+++ b/lib/OptRemarks/LLVMBuild.txt
@@ -0,0 +1,22 @@
+;===- ./lib/OptRemarks/LLVMBuild.txt ---------------------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = OptRemarks
+parent = Libraries
+required_libraries = Support
diff --git a/lib/OptRemarks/OptRemarksParser.cpp b/lib/OptRemarks/OptRemarksParser.cpp
new file mode 100644
index 00000000000..0478d2bfbfa
--- /dev/null
+++ b/lib/OptRemarks/OptRemarksParser.cpp
@@ -0,0 +1,368 @@
+//===- OptRemarksParser.cpp -----------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides utility methods used by clients that want to use the
+// parser for optimization remarks in LLVM.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-c/OptRemarks.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/YAMLTraits.h"
+
+using namespace llvm;
+
+namespace {
+struct RemarkParser {
+  /// Source manager for better error messages.
+  SourceMgr SM;
+  /// Stream for yaml parsing.
+  yaml::Stream Stream;
+  /// Storage for the error stream.
+  std::string ErrorString;
+  /// The error stream.
+  raw_string_ostream ErrorStream;
+  /// Iterator in the YAML stream.
+  yaml::document_iterator DI;
+  /// The parsed remark (if any).
+  Optional<LLVMOptRemarkEntry> LastRemark;
+  /// Temporary parsing buffer for the arguments.
+  SmallVector<LLVMOptRemarkArg, 8> TmpArgs;
+  /// The state used by the parser to parse a remark entry. Invalidated with
+  /// every call to `parseYAMLElement`.
+  struct ParseState {
+    /// Temporary parsing buffer for the arguments.
+    SmallVectorImpl<LLVMOptRemarkArg> *Args;
+    StringRef Type;
+    StringRef Pass;
+    StringRef Name;
+    StringRef Function;
+    /// Optional.
+    Optional<StringRef> File;
+    Optional<unsigned> Line;
+    Optional<unsigned> Column;
+    Optional<unsigned> Hotness;
+
+    ParseState(SmallVectorImpl<LLVMOptRemarkArg> &Args) : Args(&Args) {}
+    /// Use Args only as a **temporary** buffer.
+    ~ParseState() { Args->clear(); }
+  };
+
+  ParseState State;
+
+  /// Set to `true` if we had any errors during parsing.
+  bool HadAnyErrors = false;
+
+  RemarkParser(StringRef Buf)
+      : SM(), Stream(Buf, SM), ErrorString(), ErrorStream(ErrorString),
+        DI(Stream.begin()), LastRemark(), TmpArgs(), State(TmpArgs) {
+    SM.setDiagHandler(RemarkParser::HandleDiagnostic, this);
+  }
+
+  /// Parse a YAML element.
+  Error parseYAMLElement(yaml::Document &Remark);
+
+private:
+  /// Parse one key to a string.
+  /// otherwise.
+  Error parseKey(StringRef &Result, yaml::KeyValueNode &Node);
+  /// Parse one value to a string.
+  Error parseValue(StringRef &Result, yaml::KeyValueNode &Node);
+  /// Parse one value to an unsigned.
+  Error parseValue(Optional<unsigned> &Result, yaml::KeyValueNode &Node);
+  /// Parse a debug location.
+  Error parseDebugLoc(Optional<StringRef> &File, Optional<unsigned> &Line,
+                      Optional<unsigned> &Column, yaml::KeyValueNode &Node);
+  /// Parse an argument.
+  Error parseArg(SmallVectorImpl<LLVMOptRemarkArg> &TmpArgs, yaml::Node &Node);
+
+  /// Handle a diagnostic from the YAML stream. Records the error in the
+  /// RemarkParser class.
+  static void HandleDiagnostic(const SMDiagnostic &Diag, void *Ctx) {
+    assert(Ctx && "Expected non-null Ctx in diagnostic handler.");
+    auto *Parser = static_cast<RemarkParser *>(Ctx);
+    Diag.print(/*ProgName=*/nullptr, Parser->ErrorStream, /*ShowColors*/ false,
+               /*ShowKindLabels*/ true);
+  }
+};
+
+class ParseError : public ErrorInfo<ParseError> {
+public:
+  static char ID;
+
+  ParseError(StringRef Message, yaml::Node &Node)
+      : Message(Message), Node(Node) {}
+
+  void log(raw_ostream &OS) const override { OS << Message; }
+  std::error_code convertToErrorCode() const override {
+    return inconvertibleErrorCode();
+  }
+
+  StringRef getMessage() const { return Message; }
+  yaml::Node &getNode() const { return Node; }
+
+private:
+  StringRef Message; // No need to hold a full copy of the buffer.
+  yaml::Node &Node;
+};
+
+char ParseError::ID = 0;
+
+static LLVMOptRemarkStringRef toOptRemarkStr(StringRef Str) {
+  return {Str.data(), static_cast<uint32_t>(Str.size())};
+}
+
+Error RemarkParser::parseKey(StringRef &Result, yaml::KeyValueNode &Node) {
+  auto *Key = dyn_cast<yaml::ScalarNode>(Node.getKey());
+  if (!Key)
+    return make_error<ParseError>("key is not a string.", Node);
+
+  Result = Key->getRawValue();
+  return Error::success();
+}
+
+Error RemarkParser::parseValue(StringRef &Result, yaml::KeyValueNode &Node) {
+  auto *Value = dyn_cast<yaml::ScalarNode>(Node.getValue());
+  if (!Value)
+    return make_error<ParseError>("expected a value of scalar type.", Node);
+  Result = Value->getRawValue();
+
+  if (Result.front() == '\'')
+    Result = Result.drop_front();
+
+  if (Result.back() == '\'')
+    Result = Result.drop_back();
+
+  return Error::success();
+}
+
+Error RemarkParser::parseValue(Optional<unsigned> &Result,
+                               yaml::KeyValueNode &Node) {
+  SmallVector<char, 4> Tmp;
+  auto *Value = dyn_cast<yaml::ScalarNode>(Node.getValue());
+  if (!Value)
+    return make_error<ParseError>("expected a value of scalar type.", Node);
+  unsigned UnsignedValue = 0;
+  if (Value->getValue(Tmp).getAsInteger(10, UnsignedValue))
+    return make_error<ParseError>("expected a value of integer type.", *Value);
+  Result = UnsignedValue;
+  return Error::success();
+}
+
+Error RemarkParser::parseDebugLoc(Optional<StringRef> &File,
+                                  Optional<unsigned> &Line,
+                                  Optional<unsigned> &Column,
+                                  yaml::KeyValueNode &Node) {
+  auto *DebugLoc = dyn_cast<yaml::MappingNode>(Node.getValue());
+  if (!DebugLoc)
+    return make_error<ParseError>("expected a value of mapping type.", Node);
+
+  for (yaml::KeyValueNode &DLNode : *DebugLoc) {
+    StringRef KeyName;
+    if (Error E = parseKey(KeyName, DLNode))
+      return E;
+    if (KeyName == "File") {
+      File = StringRef(); // Set the optional to contain a default constructed
+                          // value, to be passed to the parsing function.
+      if (Error E = parseValue(*File, DLNode))
+        return E;
+    } else if (KeyName == "Column") {
+      if (Error E = parseValue(Column, DLNode))
+        return E;
+    } else if (KeyName == "Line") {
+      if (Error E = parseValue(Line, DLNode))
+        return E;
+    } else {
+      return make_error<ParseError>("unknown entry in DebugLoc map.", DLNode);
+    }
+  }
+
+  // If any of the debug loc fields is missing, return an error.
+  if (!File || !Line || !Column)
+    return make_error<ParseError>("DebugLoc node incomplete.", Node);
+
+  return Error::success();
+}
+
+Error RemarkParser::parseArg(SmallVectorImpl<LLVMOptRemarkArg> &Args,
+                             yaml::Node &Node) {
+  auto *ArgMap = dyn_cast<yaml::MappingNode>(&Node);
+  if (!ArgMap)
+    return make_error<ParseError>("expected a value of mapping type.", Node);
+
+  StringRef ValueStr;
+  StringRef KeyStr;
+  Optional<StringRef> File;
+  Optional<unsigned> Line;
+  Optional<unsigned> Column;
+
+  for (yaml::KeyValueNode &ArgEntry : *ArgMap) {
+    StringRef KeyName;
+    if (Error E = parseKey(KeyName, ArgEntry))
+      return E;
+
+    // Try to parse debug locs.
+    if (KeyName == "DebugLoc") {
+      // Can't have multiple DebugLoc entries per argument.
+      if (File || Line || Column)
+        return make_error<ParseError>(
+            "only one DebugLoc entry is allowed per argument.", ArgEntry);
+
+      if (Error E = parseDebugLoc(File, Line, Column, ArgEntry))
+        return E;
+      continue;
+    }
+
+    // If we already have a string, error out.
+    if (!ValueStr.empty())
+      return make_error<ParseError>(
+          "only one string entry is allowed per argument.", ArgEntry);
+
+    // Try to parse a string.
+    if (Error E = parseValue(ValueStr, ArgEntry))
+      return E;
+
+    // Keep the key from the string.
+    KeyStr = KeyName;
+  }
+
+  if (KeyStr.empty())
+    return make_error<ParseError>("argument key is missing.", *ArgMap);
+  if (ValueStr.empty())
+    return make_error<ParseError>("argument value is missing.", *ArgMap);
+
+  Args.push_back(LLVMOptRemarkArg{
+      toOptRemarkStr(KeyStr), toOptRemarkStr(ValueStr),
+      LLVMOptRemarkDebugLoc{toOptRemarkStr(File.getValueOr(StringRef())),
+                            Line.getValueOr(0), Column.getValueOr(0)}});
+
+  return Error::success();
+}
+
+Error RemarkParser::parseYAMLElement(yaml::Document &Remark) {
+  // Parsing a new remark, clear the previous one.
+  LastRemark = None;
+  State = ParseState(TmpArgs);
+
+  auto *Root = dyn_cast<yaml::MappingNode>(Remark.getRoot());
+  if (!Root)
+    return make_error<ParseError>("document root is not of mapping type.",
+                                  *Remark.getRoot());
+
+  State.Type = Root->getRawTag();
+
+  for (yaml::KeyValueNode &RemarkField : *Root) {
+    StringRef KeyName;
+    if (Error E = parseKey(KeyName, RemarkField))
+      return E;
+
+    if (KeyName == "Pass") {
+      if (Error E = parseValue(State.Pass, RemarkField))
+        return E;
+    } else if (KeyName == "Name") {
+      if (Error E = parseValue(State.Name, RemarkField))
+        return E;
+    } else if (KeyName == "Function") {
+      if (Error E = parseValue(State.Function, RemarkField))
+        return E;
+    } else if (KeyName == "Hotness") {
+      if (Error E = parseValue(State.Hotness, RemarkField))
+        return E;
+    } else if (KeyName == "DebugLoc") {
+      if (Error E =
+              parseDebugLoc(State.File, State.Line, State.Column, RemarkField))
+        return E;
+    } else if (KeyName == "Args") {
+      auto *Args = dyn_cast<yaml::SequenceNode>(RemarkField.getValue());
+      if (!Args)
+        return make_error<ParseError>("wrong value type for key.", RemarkField);
+
+      for (yaml::Node &Arg : *Args)
+        if (Error E = parseArg(*State.Args, Arg))
+          return E;
+    } else {
+      return make_error<ParseError>("unknown key.", RemarkField);
+    }
+  }
+
+  // If the YAML parsing failed, don't even continue parsing. We might
+  // encounter malformed YAML.
+  if (Stream.failed())
+    return make_error<ParseError>("YAML parsing failed.", *Remark.getRoot());
+
+  // Check if any of the mandatory fields are missing.
+  if (State.Type.empty() || State.Pass.empty() || State.Name.empty() ||
+      State.Function.empty())
+    return make_error<ParseError>("Type, Pass, Name or Function missing.",
+                                  *Remark.getRoot());
+
+  LastRemark = LLVMOptRemarkEntry{
+      toOptRemarkStr(State.Type),
+      toOptRemarkStr(State.Pass),
+      toOptRemarkStr(State.Name),
+      toOptRemarkStr(State.Function),
+      LLVMOptRemarkDebugLoc{toOptRemarkStr(State.File.getValueOr(StringRef())),
+                            State.Line.getValueOr(0),
+                            State.Column.getValueOr(0)},
+      State.Hotness.getValueOr(0),
+      static_cast<uint32_t>(State.Args->size()),
+      State.Args->data()};
+
+  return Error::success();
+}
+} // namespace
+
+// Create wrappers for C Binding types (see CBindingWrapping.h).
+DEFINE_SIMPLE_CONVERSION_FUNCTIONS(RemarkParser, LLVMOptRemarkParserRef)
+
+extern "C" LLVMOptRemarkParserRef LLVMOptRemarkParserCreate(const void *Buf,
+                                                            uint64_t Size) {
+  return wrap(
+      new RemarkParser(StringRef(static_cast<const char *>(Buf), Size)));
+}
+
+extern "C" LLVMOptRemarkEntry *
+LLVMOptRemarkParserGetNext(LLVMOptRemarkParserRef Parser) {
+  RemarkParser &TheParser = *unwrap(Parser);
+  // Check for EOF.
+  if (TheParser.HadAnyErrors || TheParser.DI == TheParser.Stream.end())
+    return nullptr;
+
+  // Try to parse an entry.
+  if (Error E = TheParser.parseYAMLElement(*TheParser.DI)) {
+    handleAllErrors(std::move(E), [&](const ParseError &PE) {
+      TheParser.Stream.printError(&PE.getNode(),
+                                  Twine(PE.getMessage()) + Twine('\n'));
+      TheParser.HadAnyErrors = true;
+    });
+    return nullptr;
+  }
+
+  // Move on.
+  ++TheParser.DI;
+
+  // Return the just-parsed remark.
+  if (Optional<LLVMOptRemarkEntry> &Entry = TheParser.LastRemark)
+    return &*Entry;
+  return nullptr;
+}
+
+extern "C" LLVMBool LLVMOptRemarkParserHasError(LLVMOptRemarkParserRef Parser) {
+  return unwrap(Parser)->HadAnyErrors;
+}
+
+extern "C" const char *
+LLVMOptRemarkParserGetErrorMessage(LLVMOptRemarkParserRef Parser) {
+  return unwrap(Parser)->ErrorStream.str().c_str();
+}
+
+extern "C" void LLVMOptRemarkParserDispose(LLVMOptRemarkParserRef Parser) {
+  delete unwrap(Parser);
+}
diff --git a/tools/llvm-opt-report/CMakeLists.txt b/tools/llvm-opt-report/CMakeLists.txt
index 777537a54c0..3aabc03ab3f 100644
--- a/tools/llvm-opt-report/CMakeLists.txt
+++ b/tools/llvm-opt-report/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(LLVM_LINK_COMPONENTS Core Demangle Object Support)
+set(LLVM_LINK_COMPONENTS Core Demangle Object OptRemarks Support)
 
 add_llvm_tool(llvm-opt-report
   OptReport.cpp
diff --git a/tools/llvm-opt-report/OptReport.cpp b/tools/llvm-opt-report/OptReport.cpp
index aa7966132c2..071f779a9e6 100644
--- a/tools/llvm-opt-report/OptReport.cpp
+++ b/tools/llvm-opt-report/OptReport.cpp
@@ -28,6 +28,7 @@
 #include "llvm/Support/WithColor.h"
 #include "llvm/Support/YAMLTraits.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm-c/OptRemarks.h"
 #include <cstdlib>
 #include <map>
 #include <set>
@@ -142,104 +143,44 @@ typedef std::map<std::string, std::map<int, std::map<std::string, std::map<int,
           OptReportLocationInfo>>>> LocationInfoTy;
 } // anonymous namespace
 
-static void collectLocationInfo(yaml::Stream &Stream,
-                                LocationInfoTy &LocationInfo) {
-  SmallVector<char, 8> Tmp;
-
-  // Note: We're using the YAML parser here directly, instead of using the
-  // YAMLTraits implementation, because the YAMLTraits implementation does not
-  // support a way to handle only a subset of the input keys (it will error out
-  // if there is an input key that you don't map to your class), and
-  // furthermore, it does not provide a way to handle the Args sequence of
-  // key/value pairs, where the order must be captured and the 'String' key
-  // might be repeated.
-  for (auto &Doc : Stream) {
-    auto *Root = dyn_cast<yaml::MappingNode>(Doc.getRoot());
-    if (!Root)
-      continue;
+static bool readLocationInfo(LocationInfoTy &LocationInfo) {
+  ErrorOr<std::unique_ptr<MemoryBuffer>> Buf =
+      MemoryBuffer::getFile(InputFileName.c_str());
+  if (std::error_code EC = Buf.getError()) {
+    WithColor::error() << "Can't open file " << InputFileName << ": "
+                       << EC.message() << "\n";
+    return false;
+  }
 
-    bool Transformed = Root->getRawTag() == "!Passed";
-    std::string Pass, File, Function;
-    int Line = 0, Column = 1;
+  StringRef Buffer = (*Buf)->getBuffer();
+  LLVMOptRemarkParserRef Parser =
+      LLVMOptRemarkParserCreate(Buffer.data(), Buffer.size());
+
+  LLVMOptRemarkEntry *Remark = nullptr;
+  while ((Remark = LLVMOptRemarkParserGetNext(Parser))) {
+    bool Transformed =
+        StringRef(Remark->RemarkType.Str, Remark->RemarkType.Len) == "!Passed";
+    StringRef Pass(Remark->PassName.Str, Remark->PassName.Len);
+    StringRef File(Remark->DebugLoc.SourceFile.Str,
+                   Remark->DebugLoc.SourceFile.Len);
+    StringRef Function(Remark->FunctionName.Str, Remark->FunctionName.Len);
+    uint32_t Line = Remark->DebugLoc.SourceLineNumber;
+    uint32_t Column = Remark->DebugLoc.SourceColumnNumber;
+    ArrayRef<LLVMOptRemarkArg> Args(Remark->Args, Remark->NumArgs);
 
     int VectorizationFactor = 1;
     int InterleaveCount = 1;
     int UnrollCount = 1;
 
-    for (auto &RootChild : *Root) {
-      auto *Key = dyn_cast<yaml::ScalarNode>(RootChild.getKey());
-      if (!Key)
-        continue;
-      StringRef KeyName = Key->getValue(Tmp);
-      if (KeyName == "Pass") {
-        auto *Value = dyn_cast<yaml::ScalarNode>(RootChild.getValue());
-        if (!Value)
-          continue;
-        Pass = Value->getValue(Tmp);
-      } else if (KeyName == "Function") {
-        auto *Value = dyn_cast<yaml::ScalarNode>(RootChild.getValue());
-        if (!Value)
-          continue;
-        Function = Value->getValue(Tmp);
-      } else if (KeyName == "DebugLoc") {
-        auto *DebugLoc = dyn_cast<yaml::MappingNode>(RootChild.getValue());
-        if (!DebugLoc)
-          continue;
-
-        for (auto &DLChild : *DebugLoc) {
-          auto *DLKey = dyn_cast<yaml::ScalarNode>(DLChild.getKey());
-          if (!DLKey)
-            continue;
-          StringRef DLKeyName = DLKey->getValue(Tmp);
-          if (DLKeyName == "File") {
-            auto *Value = dyn_cast<yaml::ScalarNode>(DLChild.getValue());
-            if (!Value)
-              continue;
-            File = Value->getValue(Tmp);
-          } else if (DLKeyName == "Line") {
-            auto *Value = dyn_cast<yaml::ScalarNode>(DLChild.getValue());
-            if (!Value)
-              continue;
-            Value->getValue(Tmp).getAsInteger(10, Line);
-          } else if (DLKeyName == "Column") {
-            auto *Value = dyn_cast<yaml::ScalarNode>(DLChild.getValue());
-            if (!Value)
-              continue;
-            Value->getValue(Tmp).getAsInteger(10, Column);
-          }
-        }
-      } else if (KeyName == "Args") {
-        auto *Args = dyn_cast<yaml::SequenceNode>(RootChild.getValue());
-        if (!Args)
-          continue;
-        for (auto &ArgChild : *Args) {
-          auto *ArgMap = dyn_cast<yaml::MappingNode>(&ArgChild);
-          if (!ArgMap)
-            continue;
-          for (auto &ArgKV : *ArgMap) {
-            auto *ArgKey = dyn_cast<yaml::ScalarNode>(ArgKV.getKey());
-            if (!ArgKey)
-              continue;
-            StringRef ArgKeyName = ArgKey->getValue(Tmp);
-            if (ArgKeyName == "VectorizationFactor") {
-              auto *Value = dyn_cast<yaml::ScalarNode>(ArgKV.getValue());
-              if (!Value)
-                continue;
-              Value->getValue(Tmp).getAsInteger(10, VectorizationFactor);
-            } else if (ArgKeyName == "InterleaveCount") {
-              auto *Value = dyn_cast<yaml::ScalarNode>(ArgKV.getValue());
-              if (!Value)
-                continue;
-              Value->getValue(Tmp).getAsInteger(10, InterleaveCount);
-            } else if (ArgKeyName == "UnrollCount") {
-              auto *Value = dyn_cast<yaml::ScalarNode>(ArgKV.getValue());
-              if (!Value)
-                continue;
-              Value->getValue(Tmp).getAsInteger(10, UnrollCount);
-            }
-          }
-        }
-      }
+    for (const LLVMOptRemarkArg &Arg : Args) {
+      StringRef ArgKeyName(Arg.Key.Str, Arg.Key.Len);
+      StringRef ArgValue(Arg.Value.Str, Arg.Value.Len);
+      if (ArgKeyName == "VectorizationFactor")
+        ArgValue.getAsInteger(10, VectorizationFactor);
+      else if (ArgKeyName == "InterleaveCount")
+        ArgValue.getAsInteger(10, InterleaveCount);
+      else if (ArgKeyName == "UnrollCount")
+        ArgValue.getAsInteger(10, UnrollCount);
     }
 
     if (Line < 1 || File.empty())
@@ -268,22 +209,13 @@ static void collectLocationInfo(yaml::Stream &Stream,
       UpdateLLII(LI.Vectorized);
     }
   }
-}
-
-static bool readLocationInfo(LocationInfoTy &LocationInfo) {
-  ErrorOr<std::unique_ptr<MemoryBuffer>> Buf =
-      MemoryBuffer::getFileOrSTDIN(InputFileName);
-  if (std::error_code EC = Buf.getError()) {
-    WithColor::error() << "Can't open file " << InputFileName << ": "
-                       << EC.message() << "\n";
-    return false;
-  }
 
-  SourceMgr SM;
-  yaml::Stream Stream(Buf.get()->getBuffer(), SM);
-  collectLocationInfo(Stream, LocationInfo);
+  bool HasError = LLVMOptRemarkParserHasError(Parser);
+  if (HasError)
+    WithColor::error() << LLVMOptRemarkParserGetErrorMessage(Parser) << "\n";
 
-  return true;
+  LLVMOptRemarkParserDispose(Parser);
+  return !HasError;
 }
 
 static bool writeReport(LocationInfoTy &LocationInfo) {
diff --git a/unittests/CMakeLists.txt b/unittests/CMakeLists.txt
index bc41ab66a23..5dba2de4a88 100644
--- a/unittests/CMakeLists.txt
+++ b/unittests/CMakeLists.txt
@@ -26,6 +26,7 @@ add_subdirectory(MI)
 add_subdirectory(Object)
 add_subdirectory(ObjectYAML)
 add_subdirectory(Option)
+add_subdirectory(OptRemarks)
 add_subdirectory(Passes)
 add_subdirectory(ProfileData)
 add_subdirectory(Support)
diff --git a/unittests/OptRemarks/CMakeLists.txt b/unittests/OptRemarks/CMakeLists.txt
new file mode 100644
index 00000000000..94c74867cc4
--- /dev/null
+++ b/unittests/OptRemarks/CMakeLists.txt
@@ -0,0 +1,8 @@
+set(LLVM_LINK_COMPONENTS
+  OptRemarks
+  Support
+  )
+
+add_llvm_unittest(OptRemarksTests
+  OptRemarksParsingTest.cpp
+  )
diff --git a/unittests/OptRemarks/OptRemarksParsingTest.cpp b/unittests/OptRemarks/OptRemarksParsingTest.cpp
new file mode 100644
index 00000000000..a3b28f038b5
--- /dev/null
+++ b/unittests/OptRemarks/OptRemarksParsingTest.cpp
@@ -0,0 +1,433 @@
+//===- unittest/Support/OptRemarksParsingTest.cpp - OptTable tests --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-c/OptRemarks.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+template <size_t N> bool tryParse(const char (&Buf)[N]) {
+  LLVMOptRemarkParserRef Parser = LLVMOptRemarkParserCreate(Buf, N - 1);
+  LLVMOptRemarkEntry *Remark = nullptr;
+  while (LLVMOptRemarkEntry *NewRemark = LLVMOptRemarkParserGetNext(Parser)) {
+    EXPECT_TRUE(Remark == nullptr); // Only one remark per test.
+    Remark = NewRemark;
+  }
+  EXPECT_TRUE(Remark != nullptr); // We need *exactly* one remark per test.
+  bool HasError = LLVMOptRemarkParserHasError(Parser);
+  LLVMOptRemarkParserDispose(Parser);
+  return !HasError;
+}
+
+template <size_t N>
+bool parseExpectError(const char (&Buf)[N], const char *Error) {
+  LLVMOptRemarkParserRef Parser = LLVMOptRemarkParserCreate(Buf, N - 1);
+  LLVMOptRemarkEntry *Remark = nullptr;
+  while (LLVMOptRemarkEntry *NewRemark = LLVMOptRemarkParserGetNext(Parser)) {
+    EXPECT_FALSE(NewRemark);
+  }
+  EXPECT_TRUE(Remark == nullptr); // We are parsing only one malformed remark.
+  EXPECT_TRUE(LLVMOptRemarkParserHasError(Parser));
+  bool MatchesError =
+      StringRef(LLVMOptRemarkParserGetErrorMessage(Parser)).contains(Error);
+  LLVMOptRemarkParserDispose(Parser);
+
+  return MatchesError;
+}
+
+TEST(OptRemarks, OptRemarksParsingEmpty) {
+  StringRef Buf = "\n"
+                  "\n";
+  LLVMOptRemarkParserRef Parser =
+      LLVMOptRemarkParserCreate(Buf.data(), Buf.size());
+  LLVMOptRemarkEntry *NewRemark = LLVMOptRemarkParserGetNext(Parser);
+  EXPECT_TRUE(NewRemark == nullptr); // No remark expected.
+  EXPECT_TRUE(LLVMOptRemarkParserHasError(Parser));
+  EXPECT_TRUE(StringRef(LLVMOptRemarkParserGetErrorMessage(Parser))
+                  .contains("document root is not of mapping type."));
+  LLVMOptRemarkParserDispose(Parser);
+}
+
+TEST(OptRemarks, OptRemarksParsingGood) {
+  EXPECT_TRUE(tryParse("\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"DebugLoc: { File: file.c, Line: 3, Column: 12 }\n"
+"Function: foo\n"
+"Args:\n"
+"  - Callee: bar\n"
+"  - String: ' will not be inlined into '\n"
+"  - Caller: foo\n"
+"    DebugLoc: { File: file.c, Line: 2, Column: 0 }\n"
+"  - String: ' because its definition is unavailable'\n"
+""));
+
+  // No debug loc should also pass.
+  EXPECT_TRUE(tryParse("\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"Function: foo\n"
+"Args:\n"
+"  - Callee: bar\n"
+"  - String: ' will not be inlined into '\n"
+"  - Caller: foo\n"
+"    DebugLoc: { File: file.c, Line: 2, Column: 0 }\n"
+"  - String: ' because its definition is unavailable'\n"
+""));
+
+  // No args is also ok.
+  EXPECT_TRUE(tryParse("\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"DebugLoc: { File: file.c, Line: 3, Column: 12 }\n"
+"Function: foo\n"
+""));
+
+  // Different order.
+  EXPECT_TRUE(tryParse("\n"
+"--- !Missed\n"
+"DebugLoc: { Line: 3, Column: 12, File: file.c }\n"
+"Function: foo\n"
+"Name: NoDefinition\n"
+"Args:\n"
+"  - Callee: bar\n"
+"  - String: ' will not be inlined into '\n"
+"  - Caller: foo\n"
+"    DebugLoc: { File: file.c, Line: 2, Column: 0 }\n"
+"  - String: ' because its definition is unavailable'\n"
+"Pass: inline\n"
+""));
+}
+
+// Mandatory common part of a remark.
+#define COMMON_REMARK "\nPass: inline\nName: NoDefinition\nFunction: foo\n\n"
+// Test all the types.
+TEST(OptRemarks, OptRemarksParsingTypes) {
+  // Type: Passed
+  EXPECT_TRUE(tryParse("--- !Passed" COMMON_REMARK));
+  // Type: Missed
+  EXPECT_TRUE(tryParse("--- !Missed" COMMON_REMARK));
+  // Type: Analysis
+  EXPECT_TRUE(tryParse("--- !Analysis" COMMON_REMARK));
+  // Type: AnalysisFPCompute
+  EXPECT_TRUE(tryParse("--- !AnalysisFPCompute" COMMON_REMARK));
+  // Type: AnalysisAliasing
+  EXPECT_TRUE(tryParse("--- !AnalysisAliasing" COMMON_REMARK));
+  // Type: Failure
+  EXPECT_TRUE(tryParse("--- !Failure" COMMON_REMARK));
+}
+#undef COMMON_REMARK
+
+TEST(OptRemarks, OptRemarksParsingMissingFields) {
+  // No type.
+  EXPECT_TRUE(parseExpectError("\n"
+"---\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"Function: foo\n"
+"",
+                               "error: Type, Pass, Name or Function missing."));
+  // No pass.
+  EXPECT_TRUE(parseExpectError("\n"
+"--- !Missed\n"
+"Name: NoDefinition\n"
+"Function: foo\n"
+"",
+                               "error: Type, Pass, Name or Function missing."));
+  // No name.
+  EXPECT_TRUE(parseExpectError("\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Function: foo\n"
+"",
+                               "error: Type, Pass, Name or Function missing."));
+  // No function.
+  EXPECT_TRUE(parseExpectError("\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"",
+                               "error: Type, Pass, Name or Function missing."));
+  // Debug loc but no file.
+  EXPECT_TRUE(parseExpectError("\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"Function: foo\n"
+"DebugLoc: { Line: 3, Column: 12 }\n"
+"",
+                               "DebugLoc node incomplete."));
+  // Debug loc but no line.
+  EXPECT_TRUE(parseExpectError("\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"Function: foo\n"
+"DebugLoc: { File: file.c, Column: 12 }\n"
+"",
+                               "DebugLoc node incomplete."));
+  // Debug loc but no column.
+  EXPECT_TRUE(parseExpectError("\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"Function: foo\n"
+"DebugLoc: { File: file.c, Line: 3 }\n"
+"",
+                               "DebugLoc node incomplete."));
+}
+
+TEST(OptRemarks, OptRemarksParsingWrongTypes) {
+  // Wrong debug loc type.
+  EXPECT_TRUE(parseExpectError("\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"Function: foo\n"
+"DebugLoc: foo\n"
+"",
+                               "expected a value of mapping type."));
+  // Wrong line type.
+  EXPECT_TRUE(parseExpectError("\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"Function: foo\n"
+"DebugLoc: { File: file.c, Line: b, Column: 12 }\n"
+"",
+                               "expected a value of integer type."));
+  // Wrong column type.
+  EXPECT_TRUE(parseExpectError("\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"Function: foo\n"
+"DebugLoc: { File: file.c, Line: 3, Column: c }\n"
+"",
+                               "expected a value of integer type."));
+  // Wrong args type.
+  EXPECT_TRUE(parseExpectError("\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"Function: foo\n"
+"Args: foo\n"
+"",
+                               "wrong value type for key."));
+  // Wrong key type.
+  EXPECT_TRUE(parseExpectError("\n"
+"--- !Missed\n"
+"{ A: a }: inline\n"
+"Name: NoDefinition\n"
+"Function: foo\n"
+"",
+                               "key is not a string."));
+  // Debug loc with unknown entry.
+  EXPECT_TRUE(parseExpectError("\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"Function: foo\n"
+"DebugLoc: { File: file.c, Column: 12, Unknown: 12 }\n"
+"",
+                               "unknown entry in DebugLoc map."));
+  // Unknown entry.
+  EXPECT_TRUE(parseExpectError("\n"
+"--- !Missed\n"
+"Unknown: inline\n"
+"",
+                               "unknown key."));
+  // Not a scalar.
+  EXPECT_TRUE(parseExpectError("\n"
+"--- !Missed\n"
+"Pass: { File: a, Line: 1, Column: 2 }\n"
+"Name: NoDefinition\n"
+"Function: foo\n"
+"",
+                               "expected a value of scalar type."));
+  // Not a string file in debug loc.
+  EXPECT_TRUE(parseExpectError("\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"Function: foo\n"
+"DebugLoc: { File: { a: b }, Column: 12, Line: 12 }\n"
+"",
+                               "expected a value of scalar type."));
+  // Not a integer column in debug loc.
+  EXPECT_TRUE(parseExpectError("\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"Function: foo\n"
+"DebugLoc: { File: file.c, Column: { a: b }, Line: 12 }\n"
+"",
+                               "expected a value of scalar type."));
+  // Not a integer line in debug loc.
+  EXPECT_TRUE(parseExpectError("\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"Function: foo\n"
+"DebugLoc: { File: file.c, Column: 12, Line: { a: b } }\n"
+"",
+                               "expected a value of scalar type."));
+  // Not a mapping type value for args.
+  EXPECT_TRUE(parseExpectError("\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"Function: foo\n"
+"DebugLoc: { File: file.c, Column: 12, Line: { a: b } }\n"
+"",
+                               "expected a value of scalar type."));
+}
+
+TEST(OptRemarks, OptRemarksParsingWrongArgs) {
+  // Multiple debug locs per arg.
+  EXPECT_TRUE(
+      parseExpectError("\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"Function: foo\n"
+"Args:\n"
+"  - Str: string\n"
+"    DebugLoc: { File: a, Line: 1, Column: 2 }\n"
+"    DebugLoc: { File: a, Line: 1, Column: 2 }\n"
+"",
+                       "only one DebugLoc entry is allowed per argument."));
+  // Multiple strings per arg.
+  EXPECT_TRUE(
+      parseExpectError("\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"Function: foo\n"
+"Args:\n"
+"  - Str: string\n"
+"    Str2: string\n"
+"    DebugLoc: { File: a, Line: 1, Column: 2 }\n"
+"",
+                       "only one string entry is allowed per argument."));
+  // No arg value.
+  EXPECT_TRUE(parseExpectError("\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"Function: foo\n"
+"Args:\n"
+"  - Callee: ''\n"
+"  - DebugLoc: { File: a, Line: 1, Column: 2 }\n"
+"",
+                               "argument value is missing."));
+  // No arg value.
+  EXPECT_TRUE(parseExpectError("\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"Function: foo\n"
+"Args:\n"
+"  - DebugLoc: { File: a, Line: 1, Column: 2 }\n"
+"",
+                               "argument key is missing."));
+
+}
+
+TEST(OptRemarks, OptRemarksGoodStruct) {
+  StringRef Buf = "\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"DebugLoc: { File: file.c, Line: 3, Column: 12 }\n"
+"Function: foo\n"
+"Args:\n"
+"  - Callee: bar\n"
+"  - String: ' will not be inlined into '\n"
+"  - Caller: foo\n"
+"    DebugLoc: { File: file.c, Line: 2, Column: 0 }\n"
+"  - String: ' because its definition is unavailable'\n"
+"\n";
+
+  LLVMOptRemarkParserRef Parser =
+      LLVMOptRemarkParserCreate(Buf.data(), Buf.size());
+  LLVMOptRemarkEntry *Remark = LLVMOptRemarkParserGetNext(Parser);
+  EXPECT_FALSE(Remark == nullptr);
+  EXPECT_EQ(StringRef(Remark->RemarkType.Str, 7), "!Missed");
+  EXPECT_EQ(Remark->RemarkType.Len, 7U);
+  EXPECT_EQ(StringRef(Remark->PassName.Str, 6), "inline");
+  EXPECT_EQ(Remark->PassName.Len, 6U);
+  EXPECT_EQ(StringRef(Remark->RemarkName.Str, 12), "NoDefinition");
+  EXPECT_EQ(Remark->RemarkName.Len, 12U);
+  EXPECT_EQ(StringRef(Remark->FunctionName.Str, 3), "foo");
+  EXPECT_EQ(Remark->FunctionName.Len, 3U);
+  EXPECT_EQ(StringRef(Remark->DebugLoc.SourceFile.Str, 6), "file.c");
+  EXPECT_EQ(Remark->DebugLoc.SourceFile.Len, 6U);
+  EXPECT_EQ(Remark->DebugLoc.SourceLineNumber, 3U);
+  EXPECT_EQ(Remark->DebugLoc.SourceColumnNumber, 12U);
+  EXPECT_EQ(Remark->Hotness, 0U);
+  EXPECT_EQ(Remark->NumArgs, 4U);
+  // Arg 0
+  {
+    LLVMOptRemarkArg &Arg = Remark->Args[0];
+    EXPECT_EQ(StringRef(Arg.Key.Str, 6), "Callee");
+    EXPECT_EQ(Arg.Key.Len, 6U);
+    EXPECT_EQ(StringRef(Arg.Value.Str, 3), "bar");
+    EXPECT_EQ(Arg.Value.Len, 3U);
+    EXPECT_EQ(StringRef(Arg.DebugLoc.SourceFile.Str, 0), "");
+    EXPECT_EQ(Arg.DebugLoc.SourceFile.Len, 0U);
+    EXPECT_EQ(Arg.DebugLoc.SourceLineNumber, 0U);
+    EXPECT_EQ(Arg.DebugLoc.SourceColumnNumber, 0U);
+  }
+  // Arg 1
+  {
+    LLVMOptRemarkArg &Arg = Remark->Args[1];
+    EXPECT_EQ(StringRef(Arg.Key.Str, 6), "String");
+    EXPECT_EQ(Arg.Key.Len, 6U);
+    EXPECT_EQ(StringRef(Arg.Value.Str, 26), " will not be inlined into ");
+    EXPECT_EQ(Arg.Value.Len, 26U);
+    EXPECT_EQ(StringRef(Arg.DebugLoc.SourceFile.Str, 0), "");
+    EXPECT_EQ(Arg.DebugLoc.SourceFile.Len, 0U);
+    EXPECT_EQ(Arg.DebugLoc.SourceLineNumber, 0U);
+    EXPECT_EQ(Arg.DebugLoc.SourceColumnNumber, 0U);
+  }
+  // Arg 2
+  {
+    LLVMOptRemarkArg &Arg = Remark->Args[2];
+    EXPECT_EQ(StringRef(Arg.Key.Str, 6), "Caller");
+    EXPECT_EQ(Arg.Key.Len, 6U);
+    EXPECT_EQ(StringRef(Arg.Value.Str, 3), "foo");
+    EXPECT_EQ(Arg.Value.Len, 3U);
+    EXPECT_EQ(StringRef(Arg.DebugLoc.SourceFile.Str, 6), "file.c");
+    EXPECT_EQ(Arg.DebugLoc.SourceFile.Len, 6U);
+    EXPECT_EQ(Arg.DebugLoc.SourceLineNumber, 2U);
+    EXPECT_EQ(Arg.DebugLoc.SourceColumnNumber, 0U);
+  }
+  // Arg 3
+  {
+    LLVMOptRemarkArg &Arg = Remark->Args[3];
+    EXPECT_EQ(StringRef(Arg.Key.Str, 6), "String");
+    EXPECT_EQ(Arg.Key.Len, 6U);
+    EXPECT_EQ(StringRef(Arg.Value.Str, 38),
+              " because its definition is unavailable");
+    EXPECT_EQ(Arg.Value.Len, 38U);
+    EXPECT_EQ(StringRef(Arg.DebugLoc.SourceFile.Str, 0), "");
+    EXPECT_EQ(Arg.DebugLoc.SourceFile.Len, 0U);
+    EXPECT_EQ(Arg.DebugLoc.SourceLineNumber, 0U);
+    EXPECT_EQ(Arg.DebugLoc.SourceColumnNumber, 0U);
+  }
+
+  EXPECT_EQ(LLVMOptRemarkParserGetNext(Parser), nullptr);
+
+  EXPECT_FALSE(LLVMOptRemarkParserHasError(Parser));
+  LLVMOptRemarkParserDispose(Parser);
+}
-- 
GitLab


From ca0c32a3b8c810d7e315661a48b070c5ac2fa150 Mon Sep 17 00:00:00 2001
From: Renato Golin <renato.golin@linaro.org>
Date: Wed, 10 Oct 2018 18:49:49 +0000
Subject: [PATCH 0016/1116] [LV] Add a new reduction pattern match

Adding a new reduction pattern match for vectorizing code similar to TSVC s3111:

for (int i = 0; i < N; i++)
  if (a[i] > b)
    sum += a[i];

This patch adds support for fadd, fsub and fmull, as well as multiple
branches and different (but compatible) instructions (ex. add+sub) in
different branches.

I have forwarded to trunk, added fsub and fmul functionality and
additional tests, but the credit goes to Takahiro, who did most of the
actual work.

Differential Revision: https://reviews.llvm.org/D49168

Patch by Takahiro Miyoshi <takahiro.miyoshi@linaro.org>.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344172 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/IVDescriptors.h         |   7 +-
 lib/Analysis/IVDescriptors.cpp                |  71 +-
 test/Transforms/LoopVectorize/if-reduction.ll | 666 ++++++++++++++++++
 3 files changed, 737 insertions(+), 7 deletions(-)
 create mode 100644 test/Transforms/LoopVectorize/if-reduction.ll

diff --git a/include/llvm/Analysis/IVDescriptors.h b/include/llvm/Analysis/IVDescriptors.h
index d1d7e5ef022..64b4ae23cc5 100644
--- a/include/llvm/Analysis/IVDescriptors.h
+++ b/include/llvm/Analysis/IVDescriptors.h
@@ -140,7 +140,8 @@ public:
 
   /// Returns true if instruction I has multiple uses in Insts
   static bool hasMultipleUsesOf(Instruction *I,
-                                SmallPtrSetImpl<Instruction *> &Insts);
+                                SmallPtrSetImpl<Instruction *> &Insts,
+                                unsigned MaxNumUses);
 
   /// Returns true if all uses of the instruction I is within the Set.
   static bool areAllUsesIn(Instruction *I, SmallPtrSetImpl<Instruction *> &Set);
@@ -150,6 +151,10 @@ public:
   /// or max(X, Y).
   static InstDesc isMinMaxSelectCmpPattern(Instruction *I, InstDesc &Prev);
 
+  /// Returns a struct describing if the instruction is a
+  /// Select(FCmp(X, Y), (Z = X op PHINode), PHINode) instruction pattern.
+  static InstDesc isConditionalRdxPattern(RecurrenceKind Kind, Instruction *I);
+
   /// Returns identity corresponding to the RecurrenceKind.
   static Constant *getRecurrenceIdentity(RecurrenceKind K, Type *Tp);
 
diff --git a/lib/Analysis/IVDescriptors.cpp b/lib/Analysis/IVDescriptors.cpp
index 854a95573e9..47bddf68f49 100644
--- a/lib/Analysis/IVDescriptors.cpp
+++ b/lib/Analysis/IVDescriptors.cpp
@@ -299,9 +299,17 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind,
         return false;
     }
 
+    bool IsASelect = isa<SelectInst>(Cur);
+
+    // A conditional reduction operation must only have 2 or less uses in
+    // VisitedInsts.
+    if (IsASelect && (Kind == RK_FloatAdd || Kind == RK_FloatMult) &&
+        hasMultipleUsesOf(Cur, VisitedInsts, 2))
+      return false;
+
     // A reduction operation must only have one use of the reduction value.
-    if (!IsAPhi && Kind != RK_IntegerMinMax && Kind != RK_FloatMinMax &&
-        hasMultipleUsesOf(Cur, VisitedInsts))
+    if (!IsAPhi && !IsASelect && Kind != RK_IntegerMinMax &&
+        Kind != RK_FloatMinMax && hasMultipleUsesOf(Cur, VisitedInsts, 1))
       return false;
 
     // All inputs to a PHI node must be a reduction value.
@@ -362,7 +370,8 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind,
       } else if (!isa<PHINode>(UI) &&
                  ((!isa<FCmpInst>(UI) && !isa<ICmpInst>(UI) &&
                    !isa<SelectInst>(UI)) ||
-                  !isMinMaxSelectCmpPattern(UI, IgnoredVal).isRecurrence()))
+                  (!isConditionalRdxPattern(Kind, UI).isRecurrence() &&
+                   !isMinMaxSelectCmpPattern(UI, IgnoredVal).isRecurrence())))
         return false;
 
       // Remember that we completed the cycle.
@@ -491,6 +500,52 @@ RecurrenceDescriptor::isMinMaxSelectCmpPattern(Instruction *I, InstDesc &Prev) {
   return InstDesc(false, I);
 }
 
+/// Returns true if the select instruction has users in the compare-and-add
+/// reduction pattern below. The select instruction argument is the last one
+/// in the sequence.
+///
+/// %sum.1 = phi ...
+/// ...
+/// %cmp = fcmp pred %0, %CFP
+/// %add = fadd %0, %sum.1
+/// %sum.2 = select %cmp, %add, %sum.1
+RecurrenceDescriptor::InstDesc
+RecurrenceDescriptor::isConditionalRdxPattern(
+    RecurrenceKind Kind, Instruction *I) {
+  SelectInst *SI = dyn_cast<SelectInst>(I);
+  if (!SI)
+    return InstDesc(false, I);
+
+  CmpInst *CI = dyn_cast<CmpInst>(SI->getCondition());
+  // Only handle single use cases for now.
+  if (!CI || !CI->hasOneUse())
+    return InstDesc(false, I);
+
+  Value *TrueVal = SI->getTrueValue();
+  Value *FalseVal = SI->getFalseValue();
+  // Handle only when either of operands of select instruction is a PHI
+  // node for now.
+  if ((isa<PHINode>(*TrueVal) && isa<PHINode>(*FalseVal)) ||
+      (!isa<PHINode>(*TrueVal) && !isa<PHINode>(*FalseVal)))
+    return InstDesc(false, I);
+
+  Instruction *I1 =
+      isa<PHINode>(*TrueVal) ? dyn_cast<Instruction>(FalseVal)
+                             : dyn_cast<Instruction>(TrueVal);
+  if (!I1 || !I1->isBinaryOp())
+    return InstDesc(false, I);
+
+  Value *Op1, *Op2;
+  if (m_FAdd(m_Value(Op1), m_Value(Op2)).match(I1) ||
+      m_FSub(m_Value(Op1), m_Value(Op2)).match(I1))
+    return InstDesc(Kind == RK_FloatAdd, SI);
+
+  if (m_FMul(m_Value(Op1), m_Value(Op2)).match(I1))
+    return InstDesc(Kind == RK_FloatMult, SI);
+
+  return InstDesc(false, I);
+}
+
 RecurrenceDescriptor::InstDesc
 RecurrenceDescriptor::isRecurrenceInstr(Instruction *I, RecurrenceKind Kind,
                                         InstDesc &Prev, bool HasFunNoNaNAttr) {
@@ -520,9 +575,12 @@ RecurrenceDescriptor::isRecurrenceInstr(Instruction *I, RecurrenceKind Kind,
   case Instruction::FSub:
   case Instruction::FAdd:
     return InstDesc(Kind == RK_FloatAdd, I, UAI);
+  case Instruction::Select:
+    if (Kind == RK_FloatAdd || Kind == RK_FloatMult)
+      return isConditionalRdxPattern(Kind, I);
+    LLVM_FALLTHROUGH;
   case Instruction::FCmp:
   case Instruction::ICmp:
-  case Instruction::Select:
     if (Kind != RK_IntegerMinMax &&
         (!HasFunNoNaNAttr || Kind != RK_FloatMinMax))
       return InstDesc(false, I);
@@ -531,13 +589,14 @@ RecurrenceDescriptor::isRecurrenceInstr(Instruction *I, RecurrenceKind Kind,
 }
 
 bool RecurrenceDescriptor::hasMultipleUsesOf(
-    Instruction *I, SmallPtrSetImpl<Instruction *> &Insts) {
+    Instruction *I, SmallPtrSetImpl<Instruction *> &Insts,
+    unsigned MaxNumUses) {
   unsigned NumUses = 0;
   for (User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E;
        ++Use) {
     if (Insts.count(dyn_cast<Instruction>(*Use)))
       ++NumUses;
-    if (NumUses > 1)
+    if (NumUses > MaxNumUses)
       return true;
   }
 
diff --git a/test/Transforms/LoopVectorize/if-reduction.ll b/test/Transforms/LoopVectorize/if-reduction.ll
new file mode 100644
index 00000000000..dd9a6118337
--- /dev/null
+++ b/test/Transforms/LoopVectorize/if-reduction.ll
@@ -0,0 +1,666 @@
+; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+; Float pattern:
+;   Check vectorization of reduction code which has an fadd instruction after
+;   an fcmp instruction which compares an array element and 0.
+;
+; float fcmp_0_fadd_select1(float * restrict x, const int N) {
+;   float sum = 0.
+;   for (int i = 0; i < N; ++i)
+;     if (x[i] > (float)0.)
+;       sum += x[i];
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_0_fadd_select1(
+; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x float> %[[V0:.*]], zeroinitializer
+; CHECK: %[[V3:.*]] = fadd fast <4 x float> %[[V0]], %[[V2:.*]]
+; CHECK: select <4 x i1> %[[V1]], <4 x float> %[[V3]], <4 x float> %[[V2]]
+define float @fcmp_0_fadd_select1(float* noalias %x, i32 %N) nounwind readonly {
+entry:
+  %cmp.1 = icmp sgt i32 %N, 0
+  br i1 %cmp.1, label %for.header, label %for.end
+
+for.header:                                       ; preds = %entry
+  %zext = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %header, %for.body
+  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+  %sum.1 = phi float [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp.2 = fcmp fast ogt float %0, 0.000000e+00
+  %add = fadd fast float %0, %sum.1
+  %sum.2 = select i1 %cmp.2, float %add, float %sum.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %zext
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %1 = phi float [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+  ret float %1
+}
+
+; Double pattern:
+;   Check vectorization of reduction code which has an fadd instruction after
+;   an fcmp instruction which compares an array element and 0.
+;
+; double fcmp_0_fadd_select2(double * restrict x, const int N) {
+;   double sum = 0.
+;   for (int i = 0; i < N; ++i)
+;     if (x[i] > 0.)
+;       sum += x[i];
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_0_fadd_select2(
+; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x double> %[[V0:.*]], zeroinitializer
+; CHECK: %[[V3:.*]] = fadd fast <4 x double> %[[V0]], %[[V2:.*]]
+; CHECK: select <4 x i1> %[[V1]], <4 x double> %[[V3]], <4 x double> %[[V2]]
+define double @fcmp_0_fadd_select2(double* noalias %x, i32 %N) nounwind readonly {
+entry:
+  %cmp.1 = icmp sgt i32 %N, 0
+  br i1 %cmp.1, label %for.header, label %for.end
+
+for.header:                                       ; preds = %entry
+  %zext = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %header, %for.body
+  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+  %sum.1 = phi double [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 4
+  %cmp.2 = fcmp fast ogt double %0, 0.000000e+00
+  %add = fadd fast double %0, %sum.1
+  %sum.2 = select i1 %cmp.2, double %add, double %sum.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %zext
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %1 = phi double [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+  ret double %1
+}
+
+; Float pattern:
+;   Check vectorization of reduction code which has an fadd instruction after
+;   an fcmp instruction which compares an array element and a floating-point
+;   value.
+;
+; float fcmp_val_fadd_select1(float * restrict x, float y, const int N) {
+;   float sum = 0.
+;   for (int i = 0; i < N; ++i)
+;     if (x[i] > y)
+;       sum += x[i];
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_val_fadd_select1(
+; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x float> %[[V0:.*]], %broadcast.splat2
+; CHECK: %[[V3:.*]] = fadd fast <4 x float> %[[V0]], %[[V2:.*]]
+; CHECK: select <4 x i1> %[[V1]], <4 x float> %[[V3]], <4 x float> %[[V2]]
+define float @fcmp_val_fadd_select1(float* noalias %x, float %y, i32 %N) nounwind readonly {
+entry:
+  %cmp.1 = icmp sgt i32 %N, 0
+  br i1 %cmp.1, label %for.header, label %for.end
+
+for.header:                                       ; preds = %entry
+  %zext = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %header, %for.body
+  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+  %sum.1 = phi float [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp.2 = fcmp fast ogt float %0, %y
+  %add = fadd fast float %0, %sum.1
+  %sum.2 = select i1 %cmp.2, float %add, float %sum.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %zext
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %1 = phi float [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+  ret float %1
+}
+
+; Double pattern:
+;   Check vectorization of reduction code which has an fadd instruction after
+;   an fcmp instruction which compares an array element and a floating-point
+;   value.
+;
+; double fcmp_val_fadd_select2(double * restrict x, double y, const int N) {
+;   double sum = 0.
+;   for (int i = 0; i < N; ++i)
+;     if (x[i] > y)
+;       sum += x[i];
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_val_fadd_select2(
+; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x double> %[[V0:.*]], %broadcast.splat2
+; CHECK: %[[V3:.*]] = fadd fast <4 x double> %[[V0]], %[[V2:.*]]
+; CHECK: select <4 x i1> %[[V1]], <4 x double> %[[V3]], <4 x double> %[[V2]]
+define double @fcmp_val_fadd_select2(double* noalias %x, double %y, i32 %N) nounwind readonly {
+entry:
+  %cmp.1 = icmp sgt i32 %N, 0
+  br i1 %cmp.1, label %for.header, label %for.end
+
+for.header:                                       ; preds = %entry
+  %zext = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %header, %for.body
+  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+  %sum.1 = phi double [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 4
+  %cmp.2 = fcmp fast ogt double %0, %y
+  %add = fadd fast double %0, %sum.1
+  %sum.2 = select i1 %cmp.2, double %add, double %sum.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %zext
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %1 = phi double [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+  ret double %1
+}
+
+; Float pattern:
+;   Check vectorization of reduction code which has an fadd instruction after
+;   an fcmp instruction which compares an array element and another array
+;   element.
+;
+; float fcmp_array_elm_fadd_select1(float * restrict x, float * restrict y,
+;                                   const int N) {
+;   float sum = 0.
+;   for (int i = 0; i < N; ++i)
+;     if (x[i] > y[i])
+;       sum += x[i];
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_array_elm_fadd_select1(
+; CHECK: %[[V2:.*]] = fcmp fast ogt <4 x float> %[[V0:.*]], %[[V1:.*]]
+; CHECK: %[[V4:.*]] = fadd fast <4 x float> %[[V0]], %[[V3:.*]]
+; CHECK: select <4 x i1> %[[V2]], <4 x float> %[[V4]], <4 x float> %[[V3]]
+define float @fcmp_array_elm_fadd_select1(float* noalias %x, float* noalias %y, i32 %N) nounwind readonly {
+entry:
+  %cmp.1 = icmp sgt i32 %N, 0
+  br i1 %cmp.1, label %for.header, label %for.end
+
+for.header:                                       ; preds = %entry
+  %zext = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.header
+  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+  %sum.1 = phi float [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+  %arrayidx.1 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  %0 = load float, float* %arrayidx.1, align 4
+  %arrayidx.2 = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %1 = load float, float* %arrayidx.2, align 4
+  %cmp.2 = fcmp fast ogt float %0, %1
+  %add = fadd fast float %0, %sum.1
+  %sum.2 = select i1 %cmp.2, float %add, float %sum.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %zext
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %2 = phi float [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+  ret float %2
+}
+
+; Double pattern:
+;   Check vectorization of reduction code which has an fadd instruction after
+;   an fcmp instruction which compares an array element and another array
+;   element.
+;
+; double fcmp_array_elm_fadd_select2(double * restrict x, double * restrict y,
+;                                    const int N) {
+;   double sum = 0.
+;   for (int i = 0; i < N; ++i)
+;     if (x[i] > y[i])
+;       sum += x[i];
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_array_elm_fadd_select2(
+; CHECK: %[[V2:.*]] = fcmp fast ogt <4 x double> %[[V0:.*]], %[[V1:.*]]
+; CHECK: %[[V4:.*]] = fadd fast <4 x double> %[[V0]], %[[V3:.*]]
+; CHECK: select <4 x i1> %[[V2]], <4 x double> %[[V4]], <4 x double> %[[V3]]
+define double @fcmp_array_elm_fadd_select2(double* noalias %x, double* noalias %y, i32 %N) nounwind readonly {
+entry:
+  %cmp.1 = icmp sgt i32 %N, 0
+  br i1 %cmp.1, label %for.header, label %for.end
+
+for.header:                                       ; preds = %entry
+  %zext = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.header
+  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+  %sum.1 = phi double [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+  %arrayidx.1 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  %0 = load double, double* %arrayidx.1, align 4
+  %arrayidx.2 = getelementptr inbounds double, double* %y, i64 %indvars.iv
+  %1 = load double, double* %arrayidx.2, align 4
+  %cmp.2 = fcmp fast ogt double %0, %1
+  %add = fadd fast double %0, %sum.1
+  %sum.2 = select i1 %cmp.2, double %add, double %sum.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %zext
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %2 = phi double [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+  ret double %2
+}
+
+; Float pattern:
+;   Check vectorization of reduction code which has an fsub instruction after
+;   an fcmp instruction which compares an array element and 0.
+;
+; float fcmp_0_fsub_select1(float * restrict x, const int N) {
+;   float sum = 0.
+;   for (int i = 0; i < N; ++i)
+;     if (x[i] > (float)0.)
+;       sum -= x[i];
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_0_fsub_select1(
+; CHECK: %[[V1:.*]] = fcmp ogt <4 x float> %[[V0:.*]], zeroinitializer
+; CHECK: %[[V3:.*]] = fsub <4 x float> %[[V2:.*]], %[[V0]]
+; CHECK: select <4 x i1> %[[V1]], <4 x float> %[[V3]], <4 x float> %[[V2]]
+define float @fcmp_0_fsub_select1(float* noalias %x, i32 %N) nounwind readonly {
+entry:
+  %cmp.1 = icmp sgt i32 %N, 0
+  br i1 %cmp.1, label %for.header, label %for.end
+
+for.header:                                       ; preds = %entry
+  %zext = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.header
+  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+  %sum.1 = phi float [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp.2 = fcmp ogt float %0, 0.000000e+00
+  %sub = fsub float %sum.1, %0
+  %sum.2 = select i1 %cmp.2, float %sub, float %sum.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %zext
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %1 = phi float [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+  ret float %1
+}
+
+; Double pattern:
+;   Check vectorization of reduction code which has an fsub instruction after
+;   an fcmp instruction which compares an array element and 0.
+;
+; double fcmp_0_fsub_select2(double * restrict x, const int N) {
+;   double sum = 0.
+;   for (int i = 0; i < N; ++i)
+;     if (x[i] > 0.)
+;       sum -= x[i];
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_0_fsub_select2(
+; CHECK: %[[V1:.*]] = fcmp ogt <4 x double> %[[V0:.*]], zeroinitializer
+; CHECK: %[[V3:.*]] = fsub <4 x double> %[[V2:.*]], %[[V0]]
+; CHECK: select <4 x i1> %[[V1]], <4 x double> %[[V3]], <4 x double> %[[V2]]
+define double @fcmp_0_fsub_select2(double* noalias %x, i32 %N) nounwind readonly {
+entry:
+  %cmp.1 = icmp sgt i32 %N, 0
+  br i1 %cmp.1, label %for.header, label %for.end
+
+for.header:                                       ; preds = %entry
+  %zext = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.header
+  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+  %sum.1 = phi double [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 4
+  %cmp.2 = fcmp ogt double %0, 0.000000e+00
+  %sub = fsub double %sum.1, %0
+  %sum.2 = select i1 %cmp.2, double %sub, double %sum.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %zext
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %1 = phi double [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+  ret double %1
+}
+
+; Float pattern:
+;   Check vectorization of reduction code which has an fmul instruction after
+;   an fcmp instruction which compares an array element and 0.
+;
+; float fcmp_0_fmult_select1(float * restrict x, const int N) {
+;   float sum = 0.
+;   for (int i = 0; i < N; ++i)
+;     if (x[i] > (float)0.)
+;       sum *= x[i];
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_0_fmult_select1(
+; CHECK: %[[V1:.*]] = fcmp ogt <4 x float> %[[V0:.*]], zeroinitializer
+; CHECK: %[[V3:.*]] = fmul <4 x float> %[[V2:.*]], %[[V0]]
+; CHECK: select <4 x i1> %[[V1]], <4 x float> %[[V3]], <4 x float> %[[V2]]
+define float @fcmp_0_fmult_select1(float* noalias %x, i32 %N) nounwind readonly {
+entry:
+  %cmp.1 = icmp sgt i32 %N, 0
+  br i1 %cmp.1, label %for.header, label %for.end
+
+for.header:                                       ; preds = %entry
+  %zext = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.header
+  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+  %sum.1 = phi float [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp.2 = fcmp ogt float %0, 0.000000e+00
+  %mult = fmul float %sum.1, %0
+  %sum.2 = select i1 %cmp.2, float %mult, float %sum.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %zext
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %1 = phi float [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+  ret float %1
+}
+
+; Double pattern:
+;   Check vectorization of reduction code which has an fmul instruction after
+;   an fcmp instruction which compares an array element and 0.
+;
+; double fcmp_0_fmult_select2(double * restrict x, const int N) {
+;   double sum = 0.
+;   for (int i = 0; i < N; ++i)
+;     if (x[i] > 0.)
+;       sum *= x[i];
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_0_fmult_select2(
+; CHECK: %[[V1:.*]] = fcmp ogt <4 x double> %[[V0:.*]], zeroinitializer
+; CHECK: %[[V3:.*]] = fmul <4 x double> %[[V2:.*]], %[[V0]]
+; CHECK: select <4 x i1> %[[V1]], <4 x double> %[[V3]], <4 x double> %[[V2]]
+define double @fcmp_0_fmult_select2(double* noalias %x, i32 %N) nounwind readonly {
+entry:
+  %cmp.1 = icmp sgt i32 %N, 0
+  br i1 %cmp.1, label %for.header, label %for.end
+
+for.header:                                       ; preds = %entry
+  %zext = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.header
+  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+  %sum.1 = phi double [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 4
+  %cmp.2 = fcmp ogt double %0, 0.000000e+00
+  %mult = fmul double %sum.1, %0
+  %sum.2 = select i1 %cmp.2, double %mult, double %sum.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %zext
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %1 = phi double [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+  ret double %1
+}
+
+; Float multi pattern
+;   Check vectorisation of reduction code with a pair of selects to different
+;   fadd patterns.
+;
+; float fcmp_multi(float *a, int n) {
+;   float sum=0.0;
+;   for (int i=0;i<n;i++) {
+;     if (a[i]>1.0)
+;       sum+=a[i];
+;     else if (a[i]<3.0)
+;       sum+=2*a[i];
+;     else
+;       sum+=3*a[i];
+;   }
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_multi(
+; CHECK: %[[C1:.*]] = fcmp ogt <4 x float> %[[V0:.*]], <float 1.000000e+00,
+; CHECK: %[[C2:.*]] = fcmp olt <4 x float> %[[V0]], <float 3.000000e+00,
+; CHECK-DAG: %[[M1:.*]] = fmul fast <4 x float> %[[V0]], <float 3.000000e+00,
+; CHECK-DAG: %[[M2:.*]] = fmul fast <4 x float> %[[V0]], <float 2.000000e+00,
+; CHECK: %[[C11:.*]] = xor <4 x i1> %[[C1]], <i1 true,
+; CHECK-DAG: %[[C12:.*]] = and <4 x i1> %[[C2]], %[[C11]]
+; CHECK-DAG: %[[C21:.*]] = xor <4 x i1> %[[C2]], <i1 true,
+; CHECK: %[[C22:.*]] = and <4 x i1> %[[C21]], %[[C11]]
+; CHECK: %[[S1:.*]] = select <4 x i1> %[[C22]], <4 x float> %[[M1]], <4 x float> %[[M2]]
+; CHECK: %[[S2:.*]] = select <4 x i1> %[[C1]], <4 x float> %[[V0]], <4 x float> %[[S1]]
+; CHECK: fadd fast <4 x float> %[[S2]],
+define float @fcmp_multi(float* nocapture readonly %a, i32 %n) nounwind readonly {
+entry:
+  %cmp10 = icmp sgt i32 %n, 0
+  br i1 %cmp10, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc, %for.body.preheader
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %sum.011 = phi float [ 0.000000e+00, %for.body.preheader ], [ %sum.1, %for.inc ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp1 = fcmp ogt float %0, 1.000000e+00
+  br i1 %cmp1, label %for.inc, label %if.else
+
+if.else:                                          ; preds = %for.body
+  %cmp8 = fcmp olt float %0, 3.000000e+00
+  br i1 %cmp8, label %if.then10, label %if.else14
+
+if.then10:                                        ; preds = %if.else
+  %mul = fmul fast float %0, 2.000000e+00
+  br label %for.inc
+
+if.else14:                                        ; preds = %if.else
+  %mul17 = fmul fast float %0, 3.000000e+00
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.else14, %if.then10
+  %.pn = phi float [ %mul, %if.then10 ], [ %mul17, %if.else14 ], [ %0, %for.body ]
+  %sum.1 = fadd fast float %.pn, %sum.011
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc, %entry
+  %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %sum.1, %for.inc ]
+  ret float %sum.0.lcssa
+}
+
+; Float fadd + fsub patterns
+;   Check vectorisation of reduction code with a pair of selects to different
+;   instructions { fadd, fsub } but equivalent (change in constant).
+;
+; float fcmp_multi(float *a, int n) {
+;   float sum=0.0;
+;   for (int i=0;i<n;i++) {
+;     if (a[i]>1.0)
+;       sum+=a[i];
+;     else if (a[i]<3.0)
+;       sum-=a[i];
+;   }
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_fadd_fsub(
+; CHECK: %[[C1:.*]] = fcmp ogt <4 x float> %[[V0:.*]], <float 1.000000e+00,
+; CHECK: %[[C2:.*]] = fcmp olt <4 x float> %[[V0]], <float 3.000000e+00,
+; CHECK-DAG: %[[SUB:.*]] = fsub fast <4 x float>
+; CHECK-DAG: %[[ADD:.*]] = fadd fast <4 x float>
+; CHECK: %[[C11:.*]] = xor <4 x i1> %[[C1]], <i1 true,
+; CHECK-DAG: %[[C12:.*]] = and <4 x i1> %[[C2]], %[[C11]]
+; CHECK-DAG: %[[C21:.*]] = xor <4 x i1> %[[C2]], <i1 true,
+; CHECK: %[[C22:.*]] = and <4 x i1> %[[C21]], %[[C11]]
+; CHECK: %[[S1:.*]] = select <4 x i1> %[[C12]], <4 x float> %[[SUB]], <4 x float> %[[ADD]]
+; CHECK: %[[S2:.*]] = select <4 x i1> %[[C22]], {{.*}} <4 x float> %[[S1]]
+define float @fcmp_fadd_fsub(float* nocapture readonly %a, i32 %n) nounwind readonly {
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc, %for.body.preheader
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %sum.010 = phi float [ 0.000000e+00, %for.body.preheader ], [ %sum.1, %for.inc ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp1 = fcmp ogt float %0, 1.000000e+00
+  br i1 %cmp1, label %if.then, label %if.else
+
+if.then:                                          ; preds = %for.body
+  %add = fadd fast float %0, %sum.010
+  br label %for.inc
+
+if.else:                                          ; preds = %for.body
+  %cmp8 = fcmp olt float %0, 3.000000e+00
+  br i1 %cmp8, label %if.then10, label %for.inc
+
+if.then10:                                        ; preds = %if.else
+  %sub = fsub fast float %sum.010, %0
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.then, %if.then10, %if.else
+  %sum.1 = phi float [ %add, %if.then ], [ %sub, %if.then10 ], [ %sum.010, %if.else ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc, %entry
+  %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %sum.1, %for.inc ]
+  ret float %sum.0.lcssa
+}
+
+; Float fadd + fmul patterns
+;   Check lack of vectorisation of reduction code with a pair of non-compatible
+;   instructions { fadd, fmul }.
+;
+; float fcmp_multi(float *a, int n) {
+;   float sum=0.0;
+;   for (int i=0;i<n;i++) {
+;     if (a[i]>1.0)
+;       sum+=a[i];
+;     else if (a[i]<3.0)
+;       sum*=a[i];
+;   }
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_fadd_fmul(
+; CHECK-NOT: <4 x float>
+define float @fcmp_fadd_fmul(float* nocapture readonly %a, i32 %n) nounwind readonly {
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc, %for.body.preheader
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %sum.010 = phi float [ 0.000000e+00, %for.body.preheader ], [ %sum.1, %for.inc ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp1 = fcmp ogt float %0, 1.000000e+00
+  br i1 %cmp1, label %if.then, label %if.else
+
+if.then:                                          ; preds = %for.body
+  %add = fadd fast float %0, %sum.010
+  br label %for.inc
+
+if.else:                                          ; preds = %for.body
+  %cmp8 = fcmp olt float %0, 3.000000e+00
+  br i1 %cmp8, label %if.then10, label %for.inc
+
+if.then10:                                        ; preds = %if.else
+  %mul = fmul fast float %0, %sum.010
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.then, %if.then10, %if.else
+  %sum.1 = phi float [ %add, %if.then ], [ %mul, %if.then10 ], [ %sum.010, %if.else ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc, %entry
+  %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %sum.1, %for.inc ]
+  ret float %sum.0.lcssa
+}
+
+; Float fadd + store patterns
+;   Check lack of vectorisation of reduction code with a store back, given it
+;   has loop dependency on a[i].
+;
+; float fcmp_store_back(float a[], int LEN) {
+;     float sum = 0.0;
+;     for (int i = 0; i < LEN; i++) {
+;       sum += a[i];
+;       a[i] = sum;
+;     }
+;     return sum;
+; }
+
+; CHECK-LABEL: @fcmp_store_back(
+; CHECK-NOT: <4 x float>
+define float @fcmp_store_back(float* nocapture %a, i32 %LEN) nounwind readonly {
+entry:
+  %cmp7 = icmp sgt i32 %LEN, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %LEN to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %sum.08 = phi float [ 0.000000e+00, %for.body.preheader ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %add = fadd fast float %0, %sum.08
+  store float %add, float* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  ret float %sum.0.lcssa
+}
-- 
GitLab


From c8b6096ed08df62066b9abfc0e56c734d6530be8 Mon Sep 17 00:00:00 2001
From: Thomas Lively <tlively@google.com>
Date: Wed, 10 Oct 2018 19:09:16 +0000
Subject: [PATCH 0017/1116] [WebAssembly][NFC] Use vnot patfrag to simplify
 v128.not

Reviewers: aheejin, dschuff

Subscribers: sbc100, jgravelle-google, sunfish, llvm-commits

Differential Revision: https://reviews.llvm.org/D53097

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344175 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../WebAssembly/WebAssemblyInstrSIMD.td       | 21 +++++++------------
 1 file changed, 7 insertions(+), 14 deletions(-)

diff --git a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 28262fbcaf6..491ee56b794 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -508,23 +508,16 @@ defm XOR : SIMDBitwise<xor, "xor", 62>;
 } // isCommutable = 1
 
 // Bitwise logic: v128.not
-multiclass SIMDNot<ValueType vec_t, PatFrag splat_pat, ValueType lane_t> {
-  defm NOT_#vec_t : SIMD_I<(outs V128:$dst), (ins V128:$vec),
-                           (outs), (ins),
-                           [(set
-                             (vec_t V128:$dst),
-                             (vec_t (xor
-                               (vec_t V128:$vec),
-                               (vec_t (splat_pat (lane_t -1)))
-                             ))
-                           )],
+multiclass SIMDNot<ValueType vec_t> {
+  defm NOT_#vec_t : SIMD_I<(outs V128:$dst), (ins V128:$vec), (outs), (ins),
+                           [(set (vec_t V128:$dst), (vec_t (vnot V128:$vec)))],
                            "v128.not\t$dst, $vec", "v128.not", 63>;
 }
 
-defm "" : SIMDNot<v16i8, splat16, i32>;
-defm "" : SIMDNot<v8i16, splat8, i32>;
-defm "" : SIMDNot<v4i32, splat4, i32>;
-defm "" : SIMDNot<v2i64, splat2, i64>;
+defm "" : SIMDNot<v16i8>;
+defm "" : SIMDNot<v8i16>;
+defm "" : SIMDNot<v4i32>;
+defm "" : SIMDNot<v2i64>;
 
 // Bitwise select: v128.bitselect
 def wasm_bitselect_t : SDTypeProfile<1, 3,
-- 
GitLab


From 4942b853a9cf260fc8159868fc1d0d636a6bead9 Mon Sep 17 00:00:00 2001
From: David Bolvansky <david.bolvansky@gmail.com>
Date: Wed, 10 Oct 2018 20:10:37 +0000
Subject: [PATCH 0018/1116] [DwarfVerifier] Fixed -Wimplicit-fallthrough
 warning

Reviewers: JDevlieghere, RKSimon

Reviewed By: JDevlieghere

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D52963

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344176 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/DebugInfo/DWARF/DWARFVerifier.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/lib/DebugInfo/DWARF/DWARFVerifier.cpp
index c433fe470cb..e78e13bf4af 100644
--- a/lib/DebugInfo/DWARF/DWARFVerifier.cpp
+++ b/lib/DebugInfo/DWARF/DWARFVerifier.cpp
@@ -508,14 +508,15 @@ unsigned DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die,
                   "incompatible tag " +
                   TagString(RefTag));
     }
+    break;
   }
   case DW_AT_type: {
     DWARFDie TypeDie = Die.getAttributeValueAsReferencedDie(DW_AT_type);
     if (TypeDie && !isType(TypeDie.getTag())) {
       ReportError("DIE has " + AttributeString(Attr) +
                   " with incompatible tag " + TagString(TypeDie.getTag()));
-      break;
     }
+    break;
   }
   default:
     break;
-- 
GitLab


From 9f5daa2df05a60699215bed0da0a569531d4e26d Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Wed, 10 Oct 2018 20:39:39 +0000
Subject: [PATCH 0019/1116] revert r344082: [InstCombine] reverse 'trunc X to
 <N x i1>' canonicalization

This commit accidentally included the diffs from D53057.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344178 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstCombine/InstCombineCasts.cpp          |  31 +-
 .../InstCombine/InstCombineCompares.cpp       |   7 -
 .../InstCombine/InstCombineVectorOps.cpp      |  30 --
 test/Transforms/InstCombine/apint-shift.ll    |   4 +-
 .../Transforms/InstCombine/apint-shl-trunc.ll |   5 +-
 test/Transforms/InstCombine/icmp.ll           |  20 +-
 test/Transforms/InstCombine/vec_shuffle.ll    |   5 +-
 test/Transforms/InstCombine/vector-casts.ll   |  19 +-
 .../LoopVectorize/X86/masked_load_store.ll    | 336 +++++++++---------
 9 files changed, 212 insertions(+), 245 deletions(-)

diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 74f1e695ff6..fd59c3a7c0c 100644
--- a/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -706,35 +706,12 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
   if (SimplifyDemandedInstructionBits(CI))
     return &CI;
 
+  // Canonicalize trunc x to i1 -> (icmp ne (and x, 1), 0), likewise for vector.
   if (DestTy->getScalarSizeInBits() == 1) {
+    Constant *One = ConstantInt::get(SrcTy, 1);
+    Src = Builder.CreateAnd(Src, One);
     Value *Zero = Constant::getNullValue(Src->getType());
-    if (DestTy->isIntegerTy()) {
-      // Canonicalize trunc x to i1 -> icmp ne (and x, 1), 0 (scalar only).
-      // TODO: We canonicalize to more instructions here because we are probably
-      // lacking equivalent analysis for trunc relative to icmp. There may also
-      // be codegen concerns. If those trunc limitations were removed, we could
-      // remove this transform.
-      Value *And = Builder.CreateAnd(Src, ConstantInt::get(SrcTy, 1));
-      return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
-    }
-
-    // For vectors, we do not canonicalize all truncs to icmp, so optimize
-    // patterns that would be covered within visitICmpInst.
-    Value *X;
-    const APInt *C;
-    if (match(Src, m_OneUse(m_LShr(m_Value(X), m_APInt(C))))) {
-      // trunc (lshr X, C) to i1 --> icmp ne (and X, C'), 0
-      APInt MaskC = APInt(SrcTy->getScalarSizeInBits(), 1).shl(*C);
-      Value *And = Builder.CreateAnd(X, ConstantInt::get(SrcTy, MaskC));
-      return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
-    }
-    if (match(Src, m_OneUse(m_c_Or(m_LShr(m_Value(X), m_APInt(C)),
-                                   m_Deferred(X))))) {
-      // trunc (or (lshr X, C), X) to i1 --> icmp ne (and X, C'), 0
-      APInt MaskC = APInt(SrcTy->getScalarSizeInBits(), 1).shl(*C) | 1;
-      Value *And = Builder.CreateAnd(X, ConstantInt::get(SrcTy, MaskC));
-      return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
-    }
+    return new ICmpInst(ICmpInst::ICMP_NE, Src, Zero);
   }
 
   // FIXME: Maybe combine the next two transforms to handle the no cast case
diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index bf8bc8818f7..07bd98b30ab 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -1609,13 +1609,6 @@ Instruction *InstCombiner::foldICmpAndShift(ICmpInst &Cmp, BinaryOperator *And,
 Instruction *InstCombiner::foldICmpAndConstConst(ICmpInst &Cmp,
                                                  BinaryOperator *And,
                                                  const APInt &C1) {
-  // For vectors: icmp ne (and X, 1), 0 --> trunc X to N x i1
-  // TODO: We canonicalize to the longer form for scalars because we have
-  // better analysis/folds for icmp, and codegen may be better with icmp.
-  if (Cmp.getPredicate() == CmpInst::ICMP_NE && Cmp.getType()->isVectorTy() &&
-      C1.isNullValue() && match(And->getOperand(1), m_One()))
-    return new TruncInst(And->getOperand(0), Cmp.getType());
-
   const APInt *C2;
   if (!match(And->getOperand(1), m_APInt(C2)))
     return nullptr;
diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index 7258127f319..61a3e31f960 100644
--- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -1477,33 +1477,6 @@ static Instruction *narrowVectorSelect(ShuffleVectorInst &Shuf,
   return SelectInst::Create(NarrowCond, NarrowX, NarrowY);
 }
 
-/// Try to combine 2 shuffles into 1 shuffle by concatenating a shuffle mask.
-static Instruction *foldIdentityExtractShuffle(ShuffleVectorInst &Shuf) {
-  Value *Op0 = Shuf.getOperand(0), *Op1 = Shuf.getOperand(1);
-  if (!Shuf.isIdentityWithExtract() || !isa<UndefValue>(Op1))
-    return nullptr;
-
-  Value *X, *Y;
-  Constant *Mask;
-  if (!match(Op0, m_ShuffleVector(m_Value(X), m_Value(Y), m_Constant(Mask))))
-    return nullptr;
-
-  // We are extracting a subvector from a shuffle. Remove excess elements from
-  // the 1st shuffle mask to eliminate the extract.
-  //   shuf (shuf X, Y, <C0, C1, C2, C3>), undef, <0, undef, 2> -->
-  //   shuf X, Y, <C0, undef, C2>
-  unsigned NumElts = Shuf.getType()->getVectorNumElements();
-  SmallVector<Constant *, 16> NewMask(NumElts);
-  for (unsigned i = 0; i != NumElts; ++i) {
-    // If the extracting shuffle has an undef mask element, it transfers to the
-    // new shuffle mask. Otherwise, copy the original mask element.
-    Constant *ExtractMaskElt = Shuf.getMask()->getAggregateElement(i);
-    Constant *MaskElt = Mask->getAggregateElement(i);
-    NewMask[i] = isa<UndefValue>(ExtractMaskElt) ? ExtractMaskElt : MaskElt;
-  }
-  return new ShuffleVectorInst(X, Y, ConstantVector::get(NewMask));
-}
-
 Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
   Value *LHS = SVI.getOperand(0);
   Value *RHS = SVI.getOperand(1);
@@ -1526,9 +1499,6 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
     return &SVI;
   }
 
-  if (Instruction *I = foldIdentityExtractShuffle(SVI))
-    return I;
-
   SmallVector<int, 16> Mask = SVI.getShuffleMask();
   Type *Int32Ty = Type::getInt32Ty(SVI.getContext());
   unsigned LHSWidth = LHS->getType()->getVectorNumElements();
diff --git a/test/Transforms/InstCombine/apint-shift.ll b/test/Transforms/InstCombine/apint-shift.ll
index 3266fa6e443..fc564665a60 100644
--- a/test/Transforms/InstCombine/apint-shift.ll
+++ b/test/Transforms/InstCombine/apint-shift.ll
@@ -319,8 +319,8 @@ define i1 @test16(i84 %X) {
 
 define <2 x i1> @test16vec(<2 x i84> %X) {
 ; CHECK-LABEL: @test16vec(
-; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i84> [[X:%.*]], <i84 16, i84 16>
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ne <2 x i84> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[AND:%.*]] = and <2 x i84> %X, <i84 16, i84 16>
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne <2 x i84> [[AND]], zeroinitializer
 ; CHECK-NEXT:    ret <2 x i1> [[CMP]]
 ;
   %shr = ashr <2 x i84> %X, <i84 4, i84 4>
diff --git a/test/Transforms/InstCombine/apint-shl-trunc.ll b/test/Transforms/InstCombine/apint-shl-trunc.ll
index 2241c88cb6b..c7d7d369592 100644
--- a/test/Transforms/InstCombine/apint-shl-trunc.ll
+++ b/test/Transforms/InstCombine/apint-shl-trunc.ll
@@ -27,8 +27,9 @@ define i1 @test1(i799 %X, i799 %A) {
 
 define <2 x i1> @test0vec(<2 x i39> %X, <2 x i39> %A) {
 ; CHECK-LABEL: @test0vec(
-; CHECK-NEXT:    [[B:%.*]] = lshr <2 x i39> [[X:%.*]], [[A:%.*]]
-; CHECK-NEXT:    [[D:%.*]] = trunc <2 x i39> [[B]] to <2 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <2 x i39> <i39 1, i39 1>, [[A:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = and <2 x i39> [[TMP1]], [[X:%.*]]
+; CHECK-NEXT:    [[D:%.*]] = icmp ne <2 x i39> [[TMP2]], zeroinitializer
 ; CHECK-NEXT:    ret <2 x i1> [[D]]
 ;
   %B = lshr <2 x i39> %X, %A
diff --git a/test/Transforms/InstCombine/icmp.ll b/test/Transforms/InstCombine/icmp.ll
index 1f05bb67e96..1f97009911b 100644
--- a/test/Transforms/InstCombine/icmp.ll
+++ b/test/Transforms/InstCombine/icmp.ll
@@ -2427,9 +2427,10 @@ define i1 @icmp_and_or_lshr(i32 %x, i32 %y) {
 
 define <2 x i1> @icmp_and_or_lshr_vec(<2 x i32> %x, <2 x i32> %y) {
 ; CHECK-LABEL: @icmp_and_or_lshr_vec(
-; CHECK-NEXT:    [[SHF:%.*]] = lshr <2 x i32> [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[OR:%.*]] = or <2 x i32> [[SHF]], [[X]]
-; CHECK-NEXT:    [[RET:%.*]] = trunc <2 x i32> [[OR]] to <2 x i1>
+; CHECK-NEXT:    [[SHF1:%.*]] = shl nuw <2 x i32> <i32 1, i32 1>, [[Y:%.*]]
+; CHECK-NEXT:    [[OR2:%.*]] = or <2 x i32> [[SHF1]], <i32 1, i32 1>
+; CHECK-NEXT:    [[AND3:%.*]] = and <2 x i32> [[OR2]], [[X:%.*]]
+; CHECK-NEXT:    [[RET:%.*]] = icmp ne <2 x i32> [[AND3]], zeroinitializer
 ; CHECK-NEXT:    ret <2 x i1> [[RET]]
 ;
   %shf = lshr <2 x i32> %x, %y
@@ -2444,7 +2445,8 @@ define <2 x i1> @icmp_and_or_lshr_vec_commute(<2 x i32> %xp, <2 x i32> %y) {
 ; CHECK-NEXT:    [[X:%.*]] = srem <2 x i32> [[XP:%.*]], <i32 42, i32 42>
 ; CHECK-NEXT:    [[SHF:%.*]] = lshr <2 x i32> [[X]], [[Y:%.*]]
 ; CHECK-NEXT:    [[OR:%.*]] = or <2 x i32> [[X]], [[SHF]]
-; CHECK-NEXT:    [[RET:%.*]] = trunc <2 x i32> [[OR]] to <2 x i1>
+; CHECK-NEXT:    [[AND:%.*]] = and <2 x i32> [[OR]], <i32 1, i32 1>
+; CHECK-NEXT:    [[RET:%.*]] = icmp ne <2 x i32> [[AND]], zeroinitializer
 ; CHECK-NEXT:    ret <2 x i1> [[RET]]
 ;
   %x = srem <2 x i32> %xp, <i32 42, i32 -42> ; prevent complexity-based canonicalization
@@ -2470,8 +2472,8 @@ define i1 @icmp_and_or_lshr_cst(i32 %x) {
 
 define <2 x i1> @icmp_and_or_lshr_cst_vec(<2 x i32> %x) {
 ; CHECK-LABEL: @icmp_and_or_lshr_cst_vec(
-; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i32> [[X:%.*]], <i32 3, i32 3>
-; CHECK-NEXT:    [[RET:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[AND1:%.*]] = and <2 x i32> [[X:%.*]], <i32 3, i32 3>
+; CHECK-NEXT:    [[RET:%.*]] = icmp ne <2 x i32> [[AND1]], zeroinitializer
 ; CHECK-NEXT:    ret <2 x i1> [[RET]]
 ;
   %shf = lshr <2 x i32> %x, <i32 1, i32 1>
@@ -2484,8 +2486,10 @@ define <2 x i1> @icmp_and_or_lshr_cst_vec(<2 x i32> %x) {
 define <2 x i1> @icmp_and_or_lshr_cst_vec_commute(<2 x i32> %xp) {
 ; CHECK-LABEL: @icmp_and_or_lshr_cst_vec_commute(
 ; CHECK-NEXT:    [[X:%.*]] = srem <2 x i32> [[XP:%.*]], <i32 42, i32 42>
-; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i32> [[X]], <i32 3, i32 3>
-; CHECK-NEXT:    [[RET:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[SHF:%.*]] = lshr <2 x i32> [[X]], <i32 1, i32 1>
+; CHECK-NEXT:    [[OR:%.*]] = or <2 x i32> [[X]], [[SHF]]
+; CHECK-NEXT:    [[AND:%.*]] = and <2 x i32> [[OR]], <i32 1, i32 1>
+; CHECK-NEXT:    [[RET:%.*]] = icmp ne <2 x i32> [[AND]], zeroinitializer
 ; CHECK-NEXT:    ret <2 x i1> [[RET]]
 ;
   %x = srem <2 x i32> %xp, <i32 42, i32 -42> ; prevent complexity-based canonicalization
diff --git a/test/Transforms/InstCombine/vec_shuffle.ll b/test/Transforms/InstCombine/vec_shuffle.ll
index 8adb211b0a9..e9c3539ef6b 100644
--- a/test/Transforms/InstCombine/vec_shuffle.ll
+++ b/test/Transforms/InstCombine/vec_shuffle.ll
@@ -174,7 +174,8 @@ define <8 x i8> @test12a(<8 x i8> %t6, <8 x i8> %t2) {
 
 define <2 x i8> @extract_subvector_of_shuffle(<2 x i8> %x, <2 x i8> %y) {
 ; CHECK-LABEL: @extract_subvector_of_shuffle(
-; CHECK-NEXT:    [[EXTRACT_SUBV:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> [[Y:%.*]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> [[Y:%.*]], <3 x i32> <i32 0, i32 2, i32 undef>
+; CHECK-NEXT:    [[EXTRACT_SUBV:%.*]] = shufflevector <3 x i8> [[SHUF]], <3 x i8> undef, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    ret <2 x i8> [[EXTRACT_SUBV]]
 ;
   %shuf = shufflevector <2 x i8> %x, <2 x i8> %y, <3 x i32> <i32 0, i32 2, i32 0>
@@ -193,7 +194,7 @@ define <4 x i8> @extract_subvector_of_shuffle_extra_use(<2 x i8> %x, <2 x i8> %y
 ; CHECK-LABEL: @extract_subvector_of_shuffle_extra_use(
 ; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> [[Y:%.*]], <5 x i32> <i32 undef, i32 2, i32 0, i32 1, i32 0>
 ; CHECK-NEXT:    call void @use_v5i8(<5 x i8> [[SHUF]])
-; CHECK-NEXT:    [[EXTRACT_SUBV:%.*]] = shufflevector <2 x i8> [[X]], <2 x i8> [[Y]], <4 x i32> <i32 undef, i32 2, i32 0, i32 undef>
+; CHECK-NEXT:    [[EXTRACT_SUBV:%.*]] = shufflevector <5 x i8> [[SHUF]], <5 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
 ; CHECK-NEXT:    ret <4 x i8> [[EXTRACT_SUBV]]
 ;
   %shuf = shufflevector <2 x i8> %x, <2 x i8> %y, <5 x i32> <i32 undef, i32 2, i32 0, i32 1, i32 0>
diff --git a/test/Transforms/InstCombine/vector-casts.ll b/test/Transforms/InstCombine/vector-casts.ll
index e0d6083a969..6e0d66b8ed4 100644
--- a/test/Transforms/InstCombine/vector-casts.ll
+++ b/test/Transforms/InstCombine/vector-casts.ll
@@ -1,22 +1,26 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -instcombine -S | FileCheck %s
 
-; Can't get smaller than this.
+; This turns into a&1 != 0
+; TODO: The bar for canonicalizing to something bigger than the minimal length IR is very high. 
+; This pattern does not appear to meet that standard.
 
 define <2 x i1> @trunc(<2 x i64> %a) {
 ; CHECK-LABEL: @trunc(
-; CHECK-NEXT:    [[T:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i64> [[A:%.*]], <i64 1, i64 1>
+; CHECK-NEXT:    [[T:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer
 ; CHECK-NEXT:    ret <2 x i1> [[T]]
 ;
   %t = trunc <2 x i64> %a to <2 x i1>
   ret <2 x i1> %t
 }
 
-; This is trunc.
+; TODO: This could be just 1 instruction (trunc). 
 
 define <2 x i1> @and_cmp_is_trunc(<2 x i64> %a) {
 ; CHECK-LABEL: @and_cmp_is_trunc(
-; CHECK-NEXT:    [[R:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i1>
+; CHECK-NEXT:    [[T:%.*]] = and <2 x i64> [[A:%.*]], <i64 1, i64 1>
+; CHECK-NEXT:    [[R:%.*]] = icmp ne <2 x i64> [[T]], zeroinitializer
 ; CHECK-NEXT:    ret <2 x i1> [[R]]
 ;
   %t = and <2 x i64> %a, <i64 1, i64 1>
@@ -24,11 +28,12 @@ define <2 x i1> @and_cmp_is_trunc(<2 x i64> %a) {
   ret <2 x i1> %r
 }
 
-; This is trunc.
+; TODO: This could be just 1 instruction (trunc). 
 
 define <2 x i1> @and_cmp_is_trunc_even_with_undef_elt(<2 x i64> %a) {
 ; CHECK-LABEL: @and_cmp_is_trunc_even_with_undef_elt(
-; CHECK-NEXT:    [[R:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i1>
+; CHECK-NEXT:    [[T:%.*]] = and <2 x i64> [[A:%.*]], <i64 undef, i64 1>
+; CHECK-NEXT:    [[R:%.*]] = icmp ne <2 x i64> [[T]], zeroinitializer
 ; CHECK-NEXT:    ret <2 x i1> [[R]]
 ;
   %t = and <2 x i64> %a, <i64 undef, i64 1>
@@ -36,7 +41,7 @@ define <2 x i1> @and_cmp_is_trunc_even_with_undef_elt(<2 x i64> %a) {
   ret <2 x i1> %r
 }
 
-; TODO: This could be just 1 instruction (trunc), but our undef matching is incomplete.
+; TODO: This could be just 1 instruction (trunc). 
 
 define <2 x i1> @and_cmp_is_trunc_even_with_undef_elts(<2 x i64> %a) {
 ; CHECK-LABEL: @and_cmp_is_trunc_even_with_undef_elts(
diff --git a/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/test/Transforms/LoopVectorize/X86/masked_load_store.ll
index 71038feec7b..8e948639ba1 100644
--- a/test/Transforms/LoopVectorize/X86/masked_load_store.ll
+++ b/test/Transforms/LoopVectorize/X86/masked_load_store.ll
@@ -2901,45 +2901,49 @@ define void @foo7(double* noalias %out, double** noalias %in, i8* noalias %trigg
 ; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 12
 ; AVX-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>*
 ; AVX-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1
-; AVX-NEXT:    [[TMP8:%.*]] = trunc <4 x i8> [[WIDE_LOAD]] to <4 x i1>
-; AVX-NEXT:    [[TMP9:%.*]] = trunc <4 x i8> [[WIDE_LOAD10]] to <4 x i1>
-; AVX-NEXT:    [[TMP10:%.*]] = trunc <4 x i8> [[WIDE_LOAD11]] to <4 x i1>
-; AVX-NEXT:    [[TMP11:%.*]] = trunc <4 x i8> [[WIDE_LOAD12]] to <4 x i1>
-; AVX-NEXT:    [[TMP12:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[INDEX]]
-; AVX-NEXT:    [[TMP13:%.*]] = bitcast double** [[TMP12]] to <4 x double*>*
-; AVX-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP13]], i32 8, <4 x i1> [[TMP8]], <4 x double*> undef)
-; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 4
-; AVX-NEXT:    [[TMP15:%.*]] = bitcast double** [[TMP14]] to <4 x double*>*
-; AVX-NEXT:    [[WIDE_MASKED_LOAD13:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP15]], i32 8, <4 x i1> [[TMP9]], <4 x double*> undef)
-; AVX-NEXT:    [[TMP16:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 8
+; AVX-NEXT:    [[TMP8:%.*]] = and <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
+; AVX-NEXT:    [[TMP9:%.*]] = and <4 x i8> [[WIDE_LOAD10]], <i8 1, i8 1, i8 1, i8 1>
+; AVX-NEXT:    [[TMP10:%.*]] = and <4 x i8> [[WIDE_LOAD11]], <i8 1, i8 1, i8 1, i8 1>
+; AVX-NEXT:    [[TMP11:%.*]] = and <4 x i8> [[WIDE_LOAD12]], <i8 1, i8 1, i8 1, i8 1>
+; AVX-NEXT:    [[TMP12:%.*]] = icmp ne <4 x i8> [[TMP8]], zeroinitializer
+; AVX-NEXT:    [[TMP13:%.*]] = icmp ne <4 x i8> [[TMP9]], zeroinitializer
+; AVX-NEXT:    [[TMP14:%.*]] = icmp ne <4 x i8> [[TMP10]], zeroinitializer
+; AVX-NEXT:    [[TMP15:%.*]] = icmp ne <4 x i8> [[TMP11]], zeroinitializer
+; AVX-NEXT:    [[TMP16:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[INDEX]]
 ; AVX-NEXT:    [[TMP17:%.*]] = bitcast double** [[TMP16]] to <4 x double*>*
-; AVX-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP17]], i32 8, <4 x i1> [[TMP10]], <4 x double*> undef)
-; AVX-NEXT:    [[TMP18:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 12
+; AVX-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP17]], i32 8, <4 x i1> [[TMP12]], <4 x double*> undef)
+; AVX-NEXT:    [[TMP18:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 4
 ; AVX-NEXT:    [[TMP19:%.*]] = bitcast double** [[TMP18]] to <4 x double*>*
-; AVX-NEXT:    [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP19]], i32 8, <4 x i1> [[TMP11]], <4 x double*> undef)
-; AVX-NEXT:    [[TMP20:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer
-; AVX-NEXT:    [[TMP21:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD13]], zeroinitializer
-; AVX-NEXT:    [[TMP22:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD14]], zeroinitializer
-; AVX-NEXT:    [[TMP23:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD15]], zeroinitializer
-; AVX-NEXT:    [[TMP24:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]]
-; AVX-NEXT:    [[TMP25:%.*]] = and <4 x i1> [[TMP20]], [[TMP8]]
-; AVX-NEXT:    [[TMP26:%.*]] = and <4 x i1> [[TMP21]], [[TMP9]]
-; AVX-NEXT:    [[TMP27:%.*]] = and <4 x i1> [[TMP22]], [[TMP10]]
-; AVX-NEXT:    [[TMP28:%.*]] = and <4 x i1> [[TMP23]], [[TMP11]]
-; AVX-NEXT:    [[TMP29:%.*]] = bitcast double* [[TMP24]] to <4 x double>*
-; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP29]], i32 8, <4 x i1> [[TMP25]])
-; AVX-NEXT:    [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 4
-; AVX-NEXT:    [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>*
-; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP26]])
-; AVX-NEXT:    [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 8
-; AVX-NEXT:    [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>*
-; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP27]])
-; AVX-NEXT:    [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 12
+; AVX-NEXT:    [[WIDE_MASKED_LOAD13:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP19]], i32 8, <4 x i1> [[TMP13]], <4 x double*> undef)
+; AVX-NEXT:    [[TMP20:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 8
+; AVX-NEXT:    [[TMP21:%.*]] = bitcast double** [[TMP20]] to <4 x double*>*
+; AVX-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP21]], i32 8, <4 x i1> [[TMP14]], <4 x double*> undef)
+; AVX-NEXT:    [[TMP22:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 12
+; AVX-NEXT:    [[TMP23:%.*]] = bitcast double** [[TMP22]] to <4 x double*>*
+; AVX-NEXT:    [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP23]], i32 8, <4 x i1> [[TMP15]], <4 x double*> undef)
+; AVX-NEXT:    [[TMP24:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer
+; AVX-NEXT:    [[TMP25:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD13]], zeroinitializer
+; AVX-NEXT:    [[TMP26:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD14]], zeroinitializer
+; AVX-NEXT:    [[TMP27:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD15]], zeroinitializer
+; AVX-NEXT:    [[TMP28:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]]
+; AVX-NEXT:    [[TMP29:%.*]] = and <4 x i1> [[TMP24]], [[TMP12]]
+; AVX-NEXT:    [[TMP30:%.*]] = and <4 x i1> [[TMP25]], [[TMP13]]
+; AVX-NEXT:    [[TMP31:%.*]] = and <4 x i1> [[TMP26]], [[TMP14]]
+; AVX-NEXT:    [[TMP32:%.*]] = and <4 x i1> [[TMP27]], [[TMP15]]
+; AVX-NEXT:    [[TMP33:%.*]] = bitcast double* [[TMP28]] to <4 x double>*
+; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP29]])
+; AVX-NEXT:    [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 4
 ; AVX-NEXT:    [[TMP35:%.*]] = bitcast double* [[TMP34]] to <4 x double>*
-; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP28]])
+; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP30]])
+; AVX-NEXT:    [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 8
+; AVX-NEXT:    [[TMP37:%.*]] = bitcast double* [[TMP36]] to <4 x double>*
+; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP37]], i32 8, <4 x i1> [[TMP31]])
+; AVX-NEXT:    [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 12
+; AVX-NEXT:    [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>*
+; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP39]], i32 8, <4 x i1> [[TMP32]])
 ; AVX-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
-; AVX-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX-NEXT:    br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !51
+; AVX-NEXT:    [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; AVX-NEXT:    br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !51
 ; AVX:       middle.block:
 ; AVX-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]]
 ; AVX-NEXT:    br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]]
@@ -2949,14 +2953,14 @@ define void @foo7(double* noalias %out, double** noalias %in, i8* noalias %trigg
 ; AVX:       for.body:
 ; AVX-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ]
 ; AVX-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX-NEXT:    [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; AVX-NEXT:    [[TMP38:%.*]] = and i8 [[TMP37]], 1
-; AVX-NEXT:    [[TOBOOL:%.*]] = icmp eq i8 [[TMP38]], 0
+; AVX-NEXT:    [[TMP41:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; AVX-NEXT:    [[TMP42:%.*]] = and i8 [[TMP41]], 1
+; AVX-NEXT:    [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0
 ; AVX-NEXT:    br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
 ; AVX:       land.lhs.true:
 ; AVX-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[INDVARS_IV]]
-; AVX-NEXT:    [[TMP39:%.*]] = load double*, double** [[ARRAYIDX2]], align 8
-; AVX-NEXT:    [[CMP3:%.*]] = icmp eq double* [[TMP39]], null
+; AVX-NEXT:    [[TMP43:%.*]] = load double*, double** [[ARRAYIDX2]], align 8
+; AVX-NEXT:    [[CMP3:%.*]] = icmp eq double* [[TMP43]], null
 ; AVX-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
 ; AVX:       if.then:
 ; AVX-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
@@ -2994,45 +2998,49 @@ define void @foo7(double* noalias %out, double** noalias %in, i8* noalias %trigg
 ; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 24
 ; AVX512-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <8 x i8>*
 ; AVX512-NEXT:    [[WIDE_LOAD12:%.*]] = load <8 x i8>, <8 x i8>* [[TMP7]], align 1
-; AVX512-NEXT:    [[TMP8:%.*]] = trunc <8 x i8> [[WIDE_LOAD]] to <8 x i1>
-; AVX512-NEXT:    [[TMP9:%.*]] = trunc <8 x i8> [[WIDE_LOAD10]] to <8 x i1>
-; AVX512-NEXT:    [[TMP10:%.*]] = trunc <8 x i8> [[WIDE_LOAD11]] to <8 x i1>
-; AVX512-NEXT:    [[TMP11:%.*]] = trunc <8 x i8> [[WIDE_LOAD12]] to <8 x i1>
-; AVX512-NEXT:    [[TMP12:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[INDEX]]
-; AVX512-NEXT:    [[TMP13:%.*]] = bitcast double** [[TMP12]] to <8 x double*>*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP13]], i32 8, <8 x i1> [[TMP8]], <8 x double*> undef)
-; AVX512-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 8
-; AVX512-NEXT:    [[TMP15:%.*]] = bitcast double** [[TMP14]] to <8 x double*>*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD13:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP15]], i32 8, <8 x i1> [[TMP9]], <8 x double*> undef)
-; AVX512-NEXT:    [[TMP16:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 16
+; AVX512-NEXT:    [[TMP8:%.*]] = and <8 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; AVX512-NEXT:    [[TMP9:%.*]] = and <8 x i8> [[WIDE_LOAD10]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; AVX512-NEXT:    [[TMP10:%.*]] = and <8 x i8> [[WIDE_LOAD11]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; AVX512-NEXT:    [[TMP11:%.*]] = and <8 x i8> [[WIDE_LOAD12]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; AVX512-NEXT:    [[TMP12:%.*]] = icmp ne <8 x i8> [[TMP8]], zeroinitializer
+; AVX512-NEXT:    [[TMP13:%.*]] = icmp ne <8 x i8> [[TMP9]], zeroinitializer
+; AVX512-NEXT:    [[TMP14:%.*]] = icmp ne <8 x i8> [[TMP10]], zeroinitializer
+; AVX512-NEXT:    [[TMP15:%.*]] = icmp ne <8 x i8> [[TMP11]], zeroinitializer
+; AVX512-NEXT:    [[TMP16:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[INDEX]]
 ; AVX512-NEXT:    [[TMP17:%.*]] = bitcast double** [[TMP16]] to <8 x double*>*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP17]], i32 8, <8 x i1> [[TMP10]], <8 x double*> undef)
-; AVX512-NEXT:    [[TMP18:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 24
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP17]], i32 8, <8 x i1> [[TMP12]], <8 x double*> undef)
+; AVX512-NEXT:    [[TMP18:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 8
 ; AVX512-NEXT:    [[TMP19:%.*]] = bitcast double** [[TMP18]] to <8 x double*>*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP19]], i32 8, <8 x i1> [[TMP11]], <8 x double*> undef)
-; AVX512-NEXT:    [[TMP20:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer
-; AVX512-NEXT:    [[TMP21:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD13]], zeroinitializer
-; AVX512-NEXT:    [[TMP22:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD14]], zeroinitializer
-; AVX512-NEXT:    [[TMP23:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD15]], zeroinitializer
-; AVX512-NEXT:    [[TMP24:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]]
-; AVX512-NEXT:    [[TMP25:%.*]] = and <8 x i1> [[TMP20]], [[TMP8]]
-; AVX512-NEXT:    [[TMP26:%.*]] = and <8 x i1> [[TMP21]], [[TMP9]]
-; AVX512-NEXT:    [[TMP27:%.*]] = and <8 x i1> [[TMP22]], [[TMP10]]
-; AVX512-NEXT:    [[TMP28:%.*]] = and <8 x i1> [[TMP23]], [[TMP11]]
-; AVX512-NEXT:    [[TMP29:%.*]] = bitcast double* [[TMP24]] to <8 x double>*
-; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP29]], i32 8, <8 x i1> [[TMP25]])
-; AVX512-NEXT:    [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 8
-; AVX512-NEXT:    [[TMP31:%.*]] = bitcast double* [[TMP30]] to <8 x double>*
-; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP31]], i32 8, <8 x i1> [[TMP26]])
-; AVX512-NEXT:    [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 16
-; AVX512-NEXT:    [[TMP33:%.*]] = bitcast double* [[TMP32]] to <8 x double>*
-; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP27]])
-; AVX512-NEXT:    [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 24
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD13:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP19]], i32 8, <8 x i1> [[TMP13]], <8 x double*> undef)
+; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 16
+; AVX512-NEXT:    [[TMP21:%.*]] = bitcast double** [[TMP20]] to <8 x double*>*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP21]], i32 8, <8 x i1> [[TMP14]], <8 x double*> undef)
+; AVX512-NEXT:    [[TMP22:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 24
+; AVX512-NEXT:    [[TMP23:%.*]] = bitcast double** [[TMP22]] to <8 x double*>*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP23]], i32 8, <8 x i1> [[TMP15]], <8 x double*> undef)
+; AVX512-NEXT:    [[TMP24:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer
+; AVX512-NEXT:    [[TMP25:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD13]], zeroinitializer
+; AVX512-NEXT:    [[TMP26:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD14]], zeroinitializer
+; AVX512-NEXT:    [[TMP27:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD15]], zeroinitializer
+; AVX512-NEXT:    [[TMP28:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]]
+; AVX512-NEXT:    [[TMP29:%.*]] = and <8 x i1> [[TMP24]], [[TMP12]]
+; AVX512-NEXT:    [[TMP30:%.*]] = and <8 x i1> [[TMP25]], [[TMP13]]
+; AVX512-NEXT:    [[TMP31:%.*]] = and <8 x i1> [[TMP26]], [[TMP14]]
+; AVX512-NEXT:    [[TMP32:%.*]] = and <8 x i1> [[TMP27]], [[TMP15]]
+; AVX512-NEXT:    [[TMP33:%.*]] = bitcast double* [[TMP28]] to <8 x double>*
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP29]])
+; AVX512-NEXT:    [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 8
 ; AVX512-NEXT:    [[TMP35:%.*]] = bitcast double* [[TMP34]] to <8 x double>*
-; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP28]])
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP30]])
+; AVX512-NEXT:    [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 16
+; AVX512-NEXT:    [[TMP37:%.*]] = bitcast double* [[TMP36]] to <8 x double>*
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP37]], i32 8, <8 x i1> [[TMP31]])
+; AVX512-NEXT:    [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 24
+; AVX512-NEXT:    [[TMP39:%.*]] = bitcast double* [[TMP38]] to <8 x double>*
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP39]], i32 8, <8 x i1> [[TMP32]])
 ; AVX512-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 32
-; AVX512-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX512-NEXT:    br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !63
+; AVX512-NEXT:    [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; AVX512-NEXT:    br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !63
 ; AVX512:       middle.block:
 ; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]]
 ; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]]
@@ -3042,14 +3050,14 @@ define void @foo7(double* noalias %out, double** noalias %in, i8* noalias %trigg
 ; AVX512:       for.body:
 ; AVX512-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ]
 ; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX512-NEXT:    [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; AVX512-NEXT:    [[TMP38:%.*]] = and i8 [[TMP37]], 1
-; AVX512-NEXT:    [[TOBOOL:%.*]] = icmp eq i8 [[TMP38]], 0
+; AVX512-NEXT:    [[TMP41:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; AVX512-NEXT:    [[TMP42:%.*]] = and i8 [[TMP41]], 1
+; AVX512-NEXT:    [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0
 ; AVX512-NEXT:    br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
 ; AVX512:       land.lhs.true:
 ; AVX512-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[INDVARS_IV]]
-; AVX512-NEXT:    [[TMP39:%.*]] = load double*, double** [[ARRAYIDX2]], align 8
-; AVX512-NEXT:    [[CMP3:%.*]] = icmp eq double* [[TMP39]], null
+; AVX512-NEXT:    [[TMP43:%.*]] = load double*, double** [[ARRAYIDX2]], align 8
+; AVX512-NEXT:    [[CMP3:%.*]] = icmp eq double* [[TMP43]], null
 ; AVX512-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
 ; AVX512:       if.then:
 ; AVX512-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
@@ -3154,45 +3162,49 @@ define void @foo8(double* noalias %out, i32 ()** noalias %in, i8* noalias %trigg
 ; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 12
 ; AVX-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>*
 ; AVX-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1
-; AVX-NEXT:    [[TMP8:%.*]] = trunc <4 x i8> [[WIDE_LOAD]] to <4 x i1>
-; AVX-NEXT:    [[TMP9:%.*]] = trunc <4 x i8> [[WIDE_LOAD10]] to <4 x i1>
-; AVX-NEXT:    [[TMP10:%.*]] = trunc <4 x i8> [[WIDE_LOAD11]] to <4 x i1>
-; AVX-NEXT:    [[TMP11:%.*]] = trunc <4 x i8> [[WIDE_LOAD12]] to <4 x i1>
-; AVX-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[INDEX]]
-; AVX-NEXT:    [[TMP13:%.*]] = bitcast i32 ()** [[TMP12]] to <4 x i32 ()*>*
-; AVX-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP13]], i32 8, <4 x i1> [[TMP8]], <4 x i32 ()*> undef)
-; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 4
-; AVX-NEXT:    [[TMP15:%.*]] = bitcast i32 ()** [[TMP14]] to <4 x i32 ()*>*
-; AVX-NEXT:    [[WIDE_MASKED_LOAD13:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP15]], i32 8, <4 x i1> [[TMP9]], <4 x i32 ()*> undef)
-; AVX-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 8
+; AVX-NEXT:    [[TMP8:%.*]] = and <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
+; AVX-NEXT:    [[TMP9:%.*]] = and <4 x i8> [[WIDE_LOAD10]], <i8 1, i8 1, i8 1, i8 1>
+; AVX-NEXT:    [[TMP10:%.*]] = and <4 x i8> [[WIDE_LOAD11]], <i8 1, i8 1, i8 1, i8 1>
+; AVX-NEXT:    [[TMP11:%.*]] = and <4 x i8> [[WIDE_LOAD12]], <i8 1, i8 1, i8 1, i8 1>
+; AVX-NEXT:    [[TMP12:%.*]] = icmp ne <4 x i8> [[TMP8]], zeroinitializer
+; AVX-NEXT:    [[TMP13:%.*]] = icmp ne <4 x i8> [[TMP9]], zeroinitializer
+; AVX-NEXT:    [[TMP14:%.*]] = icmp ne <4 x i8> [[TMP10]], zeroinitializer
+; AVX-NEXT:    [[TMP15:%.*]] = icmp ne <4 x i8> [[TMP11]], zeroinitializer
+; AVX-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[INDEX]]
 ; AVX-NEXT:    [[TMP17:%.*]] = bitcast i32 ()** [[TMP16]] to <4 x i32 ()*>*
-; AVX-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP17]], i32 8, <4 x i1> [[TMP10]], <4 x i32 ()*> undef)
-; AVX-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 12
+; AVX-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP17]], i32 8, <4 x i1> [[TMP12]], <4 x i32 ()*> undef)
+; AVX-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 4
 ; AVX-NEXT:    [[TMP19:%.*]] = bitcast i32 ()** [[TMP18]] to <4 x i32 ()*>*
-; AVX-NEXT:    [[WIDE_MASKED_LOAD15:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP19]], i32 8, <4 x i1> [[TMP11]], <4 x i32 ()*> undef)
-; AVX-NEXT:    [[TMP20:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer
-; AVX-NEXT:    [[TMP21:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD13]], zeroinitializer
-; AVX-NEXT:    [[TMP22:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD14]], zeroinitializer
-; AVX-NEXT:    [[TMP23:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD15]], zeroinitializer
-; AVX-NEXT:    [[TMP24:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]]
-; AVX-NEXT:    [[TMP25:%.*]] = and <4 x i1> [[TMP20]], [[TMP8]]
-; AVX-NEXT:    [[TMP26:%.*]] = and <4 x i1> [[TMP21]], [[TMP9]]
-; AVX-NEXT:    [[TMP27:%.*]] = and <4 x i1> [[TMP22]], [[TMP10]]
-; AVX-NEXT:    [[TMP28:%.*]] = and <4 x i1> [[TMP23]], [[TMP11]]
-; AVX-NEXT:    [[TMP29:%.*]] = bitcast double* [[TMP24]] to <4 x double>*
-; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP29]], i32 8, <4 x i1> [[TMP25]])
-; AVX-NEXT:    [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 4
-; AVX-NEXT:    [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>*
-; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP26]])
-; AVX-NEXT:    [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 8
-; AVX-NEXT:    [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>*
-; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP27]])
-; AVX-NEXT:    [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 12
+; AVX-NEXT:    [[WIDE_MASKED_LOAD13:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP19]], i32 8, <4 x i1> [[TMP13]], <4 x i32 ()*> undef)
+; AVX-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 8
+; AVX-NEXT:    [[TMP21:%.*]] = bitcast i32 ()** [[TMP20]] to <4 x i32 ()*>*
+; AVX-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP21]], i32 8, <4 x i1> [[TMP14]], <4 x i32 ()*> undef)
+; AVX-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 12
+; AVX-NEXT:    [[TMP23:%.*]] = bitcast i32 ()** [[TMP22]] to <4 x i32 ()*>*
+; AVX-NEXT:    [[WIDE_MASKED_LOAD15:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP23]], i32 8, <4 x i1> [[TMP15]], <4 x i32 ()*> undef)
+; AVX-NEXT:    [[TMP24:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer
+; AVX-NEXT:    [[TMP25:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD13]], zeroinitializer
+; AVX-NEXT:    [[TMP26:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD14]], zeroinitializer
+; AVX-NEXT:    [[TMP27:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD15]], zeroinitializer
+; AVX-NEXT:    [[TMP28:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]]
+; AVX-NEXT:    [[TMP29:%.*]] = and <4 x i1> [[TMP24]], [[TMP12]]
+; AVX-NEXT:    [[TMP30:%.*]] = and <4 x i1> [[TMP25]], [[TMP13]]
+; AVX-NEXT:    [[TMP31:%.*]] = and <4 x i1> [[TMP26]], [[TMP14]]
+; AVX-NEXT:    [[TMP32:%.*]] = and <4 x i1> [[TMP27]], [[TMP15]]
+; AVX-NEXT:    [[TMP33:%.*]] = bitcast double* [[TMP28]] to <4 x double>*
+; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP29]])
+; AVX-NEXT:    [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 4
 ; AVX-NEXT:    [[TMP35:%.*]] = bitcast double* [[TMP34]] to <4 x double>*
-; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP28]])
+; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP30]])
+; AVX-NEXT:    [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 8
+; AVX-NEXT:    [[TMP37:%.*]] = bitcast double* [[TMP36]] to <4 x double>*
+; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP37]], i32 8, <4 x i1> [[TMP31]])
+; AVX-NEXT:    [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 12
+; AVX-NEXT:    [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>*
+; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP39]], i32 8, <4 x i1> [[TMP32]])
 ; AVX-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
-; AVX-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX-NEXT:    br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !54
+; AVX-NEXT:    [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; AVX-NEXT:    br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !54
 ; AVX:       middle.block:
 ; AVX-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]]
 ; AVX-NEXT:    br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]]
@@ -3202,14 +3214,14 @@ define void @foo8(double* noalias %out, i32 ()** noalias %in, i8* noalias %trigg
 ; AVX:       for.body:
 ; AVX-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ]
 ; AVX-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX-NEXT:    [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; AVX-NEXT:    [[TMP38:%.*]] = and i8 [[TMP37]], 1
-; AVX-NEXT:    [[TOBOOL:%.*]] = icmp eq i8 [[TMP38]], 0
+; AVX-NEXT:    [[TMP41:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; AVX-NEXT:    [[TMP42:%.*]] = and i8 [[TMP41]], 1
+; AVX-NEXT:    [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0
 ; AVX-NEXT:    br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
 ; AVX:       land.lhs.true:
 ; AVX-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[INDVARS_IV]]
-; AVX-NEXT:    [[TMP39:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8
-; AVX-NEXT:    [[CMP3:%.*]] = icmp eq i32 ()* [[TMP39]], null
+; AVX-NEXT:    [[TMP43:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8
+; AVX-NEXT:    [[CMP3:%.*]] = icmp eq i32 ()* [[TMP43]], null
 ; AVX-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
 ; AVX:       if.then:
 ; AVX-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
@@ -3247,45 +3259,49 @@ define void @foo8(double* noalias %out, i32 ()** noalias %in, i8* noalias %trigg
 ; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 24
 ; AVX512-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <8 x i8>*
 ; AVX512-NEXT:    [[WIDE_LOAD12:%.*]] = load <8 x i8>, <8 x i8>* [[TMP7]], align 1
-; AVX512-NEXT:    [[TMP8:%.*]] = trunc <8 x i8> [[WIDE_LOAD]] to <8 x i1>
-; AVX512-NEXT:    [[TMP9:%.*]] = trunc <8 x i8> [[WIDE_LOAD10]] to <8 x i1>
-; AVX512-NEXT:    [[TMP10:%.*]] = trunc <8 x i8> [[WIDE_LOAD11]] to <8 x i1>
-; AVX512-NEXT:    [[TMP11:%.*]] = trunc <8 x i8> [[WIDE_LOAD12]] to <8 x i1>
-; AVX512-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[INDEX]]
-; AVX512-NEXT:    [[TMP13:%.*]] = bitcast i32 ()** [[TMP12]] to <8 x i32 ()*>*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP13]], i32 8, <8 x i1> [[TMP8]], <8 x i32 ()*> undef)
-; AVX512-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 8
-; AVX512-NEXT:    [[TMP15:%.*]] = bitcast i32 ()** [[TMP14]] to <8 x i32 ()*>*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD13:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP15]], i32 8, <8 x i1> [[TMP9]], <8 x i32 ()*> undef)
-; AVX512-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 16
+; AVX512-NEXT:    [[TMP8:%.*]] = and <8 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; AVX512-NEXT:    [[TMP9:%.*]] = and <8 x i8> [[WIDE_LOAD10]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; AVX512-NEXT:    [[TMP10:%.*]] = and <8 x i8> [[WIDE_LOAD11]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; AVX512-NEXT:    [[TMP11:%.*]] = and <8 x i8> [[WIDE_LOAD12]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; AVX512-NEXT:    [[TMP12:%.*]] = icmp ne <8 x i8> [[TMP8]], zeroinitializer
+; AVX512-NEXT:    [[TMP13:%.*]] = icmp ne <8 x i8> [[TMP9]], zeroinitializer
+; AVX512-NEXT:    [[TMP14:%.*]] = icmp ne <8 x i8> [[TMP10]], zeroinitializer
+; AVX512-NEXT:    [[TMP15:%.*]] = icmp ne <8 x i8> [[TMP11]], zeroinitializer
+; AVX512-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[INDEX]]
 ; AVX512-NEXT:    [[TMP17:%.*]] = bitcast i32 ()** [[TMP16]] to <8 x i32 ()*>*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP17]], i32 8, <8 x i1> [[TMP10]], <8 x i32 ()*> undef)
-; AVX512-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 24
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP17]], i32 8, <8 x i1> [[TMP12]], <8 x i32 ()*> undef)
+; AVX512-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 8
 ; AVX512-NEXT:    [[TMP19:%.*]] = bitcast i32 ()** [[TMP18]] to <8 x i32 ()*>*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP19]], i32 8, <8 x i1> [[TMP11]], <8 x i32 ()*> undef)
-; AVX512-NEXT:    [[TMP20:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer
-; AVX512-NEXT:    [[TMP21:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD13]], zeroinitializer
-; AVX512-NEXT:    [[TMP22:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD14]], zeroinitializer
-; AVX512-NEXT:    [[TMP23:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD15]], zeroinitializer
-; AVX512-NEXT:    [[TMP24:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]]
-; AVX512-NEXT:    [[TMP25:%.*]] = and <8 x i1> [[TMP20]], [[TMP8]]
-; AVX512-NEXT:    [[TMP26:%.*]] = and <8 x i1> [[TMP21]], [[TMP9]]
-; AVX512-NEXT:    [[TMP27:%.*]] = and <8 x i1> [[TMP22]], [[TMP10]]
-; AVX512-NEXT:    [[TMP28:%.*]] = and <8 x i1> [[TMP23]], [[TMP11]]
-; AVX512-NEXT:    [[TMP29:%.*]] = bitcast double* [[TMP24]] to <8 x double>*
-; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP29]], i32 8, <8 x i1> [[TMP25]])
-; AVX512-NEXT:    [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 8
-; AVX512-NEXT:    [[TMP31:%.*]] = bitcast double* [[TMP30]] to <8 x double>*
-; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP31]], i32 8, <8 x i1> [[TMP26]])
-; AVX512-NEXT:    [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 16
-; AVX512-NEXT:    [[TMP33:%.*]] = bitcast double* [[TMP32]] to <8 x double>*
-; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP27]])
-; AVX512-NEXT:    [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 24
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD13:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP19]], i32 8, <8 x i1> [[TMP13]], <8 x i32 ()*> undef)
+; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 16
+; AVX512-NEXT:    [[TMP21:%.*]] = bitcast i32 ()** [[TMP20]] to <8 x i32 ()*>*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP21]], i32 8, <8 x i1> [[TMP14]], <8 x i32 ()*> undef)
+; AVX512-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 24
+; AVX512-NEXT:    [[TMP23:%.*]] = bitcast i32 ()** [[TMP22]] to <8 x i32 ()*>*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP23]], i32 8, <8 x i1> [[TMP15]], <8 x i32 ()*> undef)
+; AVX512-NEXT:    [[TMP24:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer
+; AVX512-NEXT:    [[TMP25:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD13]], zeroinitializer
+; AVX512-NEXT:    [[TMP26:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD14]], zeroinitializer
+; AVX512-NEXT:    [[TMP27:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD15]], zeroinitializer
+; AVX512-NEXT:    [[TMP28:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]]
+; AVX512-NEXT:    [[TMP29:%.*]] = and <8 x i1> [[TMP24]], [[TMP12]]
+; AVX512-NEXT:    [[TMP30:%.*]] = and <8 x i1> [[TMP25]], [[TMP13]]
+; AVX512-NEXT:    [[TMP31:%.*]] = and <8 x i1> [[TMP26]], [[TMP14]]
+; AVX512-NEXT:    [[TMP32:%.*]] = and <8 x i1> [[TMP27]], [[TMP15]]
+; AVX512-NEXT:    [[TMP33:%.*]] = bitcast double* [[TMP28]] to <8 x double>*
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP29]])
+; AVX512-NEXT:    [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 8
 ; AVX512-NEXT:    [[TMP35:%.*]] = bitcast double* [[TMP34]] to <8 x double>*
-; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP28]])
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP30]])
+; AVX512-NEXT:    [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 16
+; AVX512-NEXT:    [[TMP37:%.*]] = bitcast double* [[TMP36]] to <8 x double>*
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP37]], i32 8, <8 x i1> [[TMP31]])
+; AVX512-NEXT:    [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 24
+; AVX512-NEXT:    [[TMP39:%.*]] = bitcast double* [[TMP38]] to <8 x double>*
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP39]], i32 8, <8 x i1> [[TMP32]])
 ; AVX512-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 32
-; AVX512-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX512-NEXT:    br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !66
+; AVX512-NEXT:    [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; AVX512-NEXT:    br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !66
 ; AVX512:       middle.block:
 ; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]]
 ; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]]
@@ -3295,14 +3311,14 @@ define void @foo8(double* noalias %out, i32 ()** noalias %in, i8* noalias %trigg
 ; AVX512:       for.body:
 ; AVX512-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ]
 ; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX512-NEXT:    [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; AVX512-NEXT:    [[TMP38:%.*]] = and i8 [[TMP37]], 1
-; AVX512-NEXT:    [[TOBOOL:%.*]] = icmp eq i8 [[TMP38]], 0
+; AVX512-NEXT:    [[TMP41:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; AVX512-NEXT:    [[TMP42:%.*]] = and i8 [[TMP41]], 1
+; AVX512-NEXT:    [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0
 ; AVX512-NEXT:    br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
 ; AVX512:       land.lhs.true:
 ; AVX512-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[INDVARS_IV]]
-; AVX512-NEXT:    [[TMP39:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8
-; AVX512-NEXT:    [[CMP3:%.*]] = icmp eq i32 ()* [[TMP39]], null
+; AVX512-NEXT:    [[TMP43:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8
+; AVX512-NEXT:    [[CMP3:%.*]] = icmp eq i32 ()* [[TMP43]], null
 ; AVX512-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
 ; AVX512:       if.then:
 ; AVX512-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
-- 
GitLab


From 6029ddd2298b5a52ba2cc29e3f4c38f6c7cfab20 Mon Sep 17 00:00:00 2001
From: Roman Lebedev <lebedev.ri@gmail.com>
Date: Wed, 10 Oct 2018 20:40:12 +0000
Subject: [PATCH 0020/1116] [X86] Move X86DAGToDAGISel::matchBEXTRFromAnd()
 into X86ISelLowering

Summary:
As discussed in [[ https://bugs.llvm.org/show_bug.cgi?id=38938 | PR38938 ]],
we fail to emit `BEXTR` if the mask is shifted.
We can't deal with that in `X86DAGToDAGISel` `before the address mode for the inc is selected`,
and we can't really do it in the normal DAGCombine, because we don't have generic `ISD::BitFieldExtract` node,
and if we simply turn the shifted mask into a normal mask + shift-left, it will be folded back.
So it would seem X86ISelLowering is the place to handle this.

This patch only moves the matchBEXTRFromAnd()
from X86DAGToDAGISel to X86ISelLowering.
It does not add support for the 'shifted mask' pattern.

Reviewers: RKSimon, craig.topper, spatel

Reviewed By: RKSimon

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D52426

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344179 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelDAGToDAG.cpp | 66 ------------------------------
 lib/Target/X86/X86ISelLowering.cpp | 66 ++++++++++++++++++++++++++++++
 test/CodeGen/X86/tbm_patterns.ll   |  6 +--
 3 files changed, 68 insertions(+), 70 deletions(-)

diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index be079659da4..25a8567a9c1 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -457,7 +457,6 @@ namespace {
     }
 
     bool foldLoadStoreIntoMemOperand(SDNode *Node);
-    bool matchBEXTRFromAnd(SDNode *Node);
     bool shrinkAndImmediate(SDNode *N);
     bool isMaskZeroExtended(SDNode *N) const;
     bool tryShiftAmountMod(SDNode *N);
@@ -2582,69 +2581,6 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
   return true;
 }
 
-// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
-bool X86DAGToDAGISel::matchBEXTRFromAnd(SDNode *Node) {
-  MVT NVT = Node->getSimpleValueType(0);
-  SDLoc dl(Node);
-
-  SDValue N0 = Node->getOperand(0);
-  SDValue N1 = Node->getOperand(1);
-
-  // If we have TBM we can use an immediate for the control. If we have BMI
-  // we should only do this if the BEXTR instruction is implemented well.
-  // Otherwise moving the control into a register makes this more costly.
-  // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
-  // hoisting the move immediate would make it worthwhile with a less optimal
-  // BEXTR?
-  if (!Subtarget->hasTBM() &&
-      !(Subtarget->hasBMI() && Subtarget->hasFastBEXTR()))
-    return false;
-
-  // Must have a shift right.
-  if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
-    return false;
-
-  // Shift can't have additional users.
-  if (!N0->hasOneUse())
-    return false;
-
-  // Only supported for 32 and 64 bits.
-  if (NVT != MVT::i32 && NVT != MVT::i64)
-    return false;
-
-  // Shift amount and RHS of and must be constant.
-  ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(N1);
-  ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
-  if (!MaskCst || !ShiftCst)
-    return false;
-
-  // And RHS must be a mask.
-  uint64_t Mask = MaskCst->getZExtValue();
-  if (!isMask_64(Mask))
-    return false;
-
-  uint64_t Shift = ShiftCst->getZExtValue();
-  uint64_t MaskSize = countPopulation(Mask);
-
-  // Don't interfere with something that can be handled by extracting AH.
-  // TODO: If we are able to fold a load, BEXTR might still be better than AH.
-  if (Shift == 8 && MaskSize == 8)
-    return false;
-
-  // Make sure we are only using bits that were in the original value, not
-  // shifted in.
-  if (Shift + MaskSize > NVT.getSizeInBits())
-    return false;
-
-  // Create a BEXTR node and run it through selection.
-  SDValue C = CurDAG->getConstant(Shift | (MaskSize << 8), dl, NVT);
-  SDValue New = CurDAG->getNode(X86ISD::BEXTR, dl, NVT,
-                                N0->getOperand(0), C);
-  ReplaceNode(Node, New.getNode());
-  SelectCode(New.getNode());
-  return true;
-}
-
 // Emit a PCMISTR(I/M) instruction.
 MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
                                              bool MayFoldLoad, const SDLoc &dl,
@@ -2952,8 +2888,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
     break;
 
   case ISD::AND:
-    if (matchBEXTRFromAnd(Node))
-      return;
     if (AndImmShrink && shrinkAndImmediate(Node))
       return;
 
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 67f98d8ee72..ab9a14a65a1 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -35278,6 +35278,69 @@ static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
   return SDValue();
 }
 
+static bool hasBEXTR(const X86Subtarget &Subtarget, EVT VT) {
+  // If we have TBM we can use an immediate for the control. If we have BMI
+  // we should only do this if the BEXTR instruction is implemented well.
+  // Otherwise moving the control into a register makes this more costly.
+  // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
+  // hoisting the move immediate would make it worthwhile with a less optimal
+  // BEXTR?
+  if (!Subtarget.hasTBM() && !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
+    return false;
+  return (VT == MVT::i32 || (VT == MVT::i64 && Subtarget.is64Bit()));
+}
+
+// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
+static SDValue combineAndIntoBEXTR(SDNode *Node, SelectionDAG &DAG,
+                                   const X86Subtarget &Subtarget) {
+  EVT NVT = Node->getValueType(0);
+  SDLoc dl(Node);
+
+  SDValue N0 = Node->getOperand(0);
+  SDValue N1 = Node->getOperand(1);
+
+  // Check if subtarget has BEXTR instruction for the node's type
+  if (!hasBEXTR(Subtarget, NVT))
+    return SDValue();
+
+  // Must have a shift right.
+  if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
+    return SDValue();
+
+  // Shift can't have additional users.
+  if (!N0->hasOneUse())
+    return SDValue();
+
+  // Shift amount and RHS of and must be constant.
+  ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(N1);
+  ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
+  if (!MaskCst || !ShiftCst)
+    return SDValue();
+
+  // And RHS must be a mask.
+  uint64_t Mask = MaskCst->getZExtValue();
+  if (!isMask_64(Mask))
+    return SDValue();
+
+  uint64_t Shift = ShiftCst->getZExtValue();
+  uint64_t MaskSize = countPopulation(Mask);
+
+  // Don't interfere with something that can be handled by extracting AH.
+  // TODO: If we are able to fold a load, BEXTR might still be better than AH.
+  if (Shift == 8 && MaskSize == 8)
+    return SDValue();
+
+  // Make sure we are only using bits that were in the original value, not
+  // shifted in.
+  if (Shift + MaskSize > NVT.getSizeInBits())
+    return SDValue();
+
+  // Create a BEXTR node.
+  SDValue C = DAG.getConstant(Shift | (MaskSize << 8), dl, NVT);
+  SDValue New = DAG.getNode(X86ISD::BEXTR, dl, NVT, N0->getOperand(0), C);
+  return New;
+}
+
 // Look for (and (ctpop X), 1) which is the IR form of __builtin_parity.
 // Turn it into series of XORs and a setnp.
 static SDValue combineParity(SDNode *N, SelectionDAG &DAG,
@@ -35379,6 +35442,9 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
+  if (SDValue R = combineAndIntoBEXTR(N, DAG, Subtarget))
+    return R;
+
   if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
     return R;
 
diff --git a/test/CodeGen/X86/tbm_patterns.ll b/test/CodeGen/X86/tbm_patterns.ll
index 6865cc5a0ef..2b335ea4268 100644
--- a/test/CodeGen/X86/tbm_patterns.ll
+++ b/test/CodeGen/X86/tbm_patterns.ll
@@ -53,8 +53,7 @@ define i32 @test_x86_tbm_bextri_u32_z2(i32 %a, i32 %b, i32 %c) nounwind {
 ; CHECK-LABEL: test_x86_tbm_bextri_u32_z2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %esi, %eax
-; CHECK-NEXT:    shrl $4, %edi
-; CHECK-NEXT:    testl $4095, %edi # imm = 0xFFF
+; CHECK-NEXT:    bextrl $3076, %edi, %ecx # imm = 0xC04
 ; CHECK-NEXT:    cmovnel %edx, %eax
 ; CHECK-NEXT:    retq
   %t0 = lshr i32 %a, 4
@@ -114,8 +113,7 @@ define i64 @test_x86_tbm_bextri_u64_z2(i64 %a, i64 %b, i64 %c) nounwind {
 ; CHECK-LABEL: test_x86_tbm_bextri_u64_z2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq %rsi, %rax
-; CHECK-NEXT:    shrl $4, %edi
-; CHECK-NEXT:    testl $4095, %edi # imm = 0xFFF
+; CHECK-NEXT:    bextrl $3076, %edi, %ecx # imm = 0xC04
 ; CHECK-NEXT:    cmovneq %rdx, %rax
 ; CHECK-NEXT:    retq
   %t0 = lshr i64 %a, 4
-- 
GitLab


From 2870bb0615585488fd166f9a625b655452be91e4 Mon Sep 17 00:00:00 2001
From: Thomas Lively <tlively@google.com>
Date: Wed, 10 Oct 2018 20:40:54 +0000
Subject: [PATCH 0021/1116] [WebAssembly][NFC] Remove repetition of Defs =
 [ARGUMENTS]

Summary:
By moving that line into the `I` multiclass.

Reviewers: aheejin

Subscribers: dschuff, sbc100, jgravelle-google, sunfish, llvm-commits

Differential Revision: https://reviews.llvm.org/D53093

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344180 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../WebAssembly/WebAssemblyInstrAtomics.td    | 16 --------------
 .../WebAssembly/WebAssemblyInstrCall.td       |  4 ----
 .../WebAssembly/WebAssemblyInstrControl.td    |  8 -------
 .../WebAssembly/WebAssemblyInstrConv.td       |  8 -------
 .../WebAssembly/WebAssemblyInstrExceptRef.td  |  4 ----
 .../WebAssembly/WebAssemblyInstrFloat.td      | 12 ----------
 .../WebAssembly/WebAssemblyInstrFormats.td    |  1 +
 .../WebAssembly/WebAssemblyInstrInfo.td       |  7 ++----
 .../WebAssembly/WebAssemblyInstrInteger.td    |  9 --------
 .../WebAssembly/WebAssemblyInstrMemory.td     | 22 -------------------
 .../WebAssembly/WebAssemblyInstrSIMD.td       |  2 --
 11 files changed, 3 insertions(+), 90 deletions(-)

diff --git a/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td b/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
index 9eff2cfde0a..f9d092e4b8a 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
@@ -24,10 +24,8 @@ multiclass ATOMIC_I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
             Requires<[HasAtomics]>;
 }
 
-let Defs = [ARGUMENTS] in {
 defm ATOMIC_LOAD_I32 : WebAssemblyLoad<I32, "i32.atomic.load", 0xfe10>;
 defm ATOMIC_LOAD_I64 : WebAssemblyLoad<I64, "i64.atomic.load", 0xfe11>;
-} // Defs = [ARGUMENTS]
 
 // Select loads with no constant offset.
 let Predicates = [HasAtomics] in {
@@ -62,13 +60,11 @@ def : LoadPatExternSymOffOnly<i64, atomic_load_64, ATOMIC_LOAD_I64>;
 
 // Extending loads. Note that there are only zero-extending atomic loads, no
 // sign-extending loads.
-let Defs = [ARGUMENTS] in {
 defm ATOMIC_LOAD8_U_I32 : WebAssemblyLoad<I32, "i32.atomic.load8_u", 0xfe12>;
 defm ATOMIC_LOAD16_U_I32 : WebAssemblyLoad<I32, "i32.atomic.load16_u", 0xfe13>;
 defm ATOMIC_LOAD8_U_I64 : WebAssemblyLoad<I64, "i64.atomic.load8_u", 0xfe14>;
 defm ATOMIC_LOAD16_U_I64 : WebAssemblyLoad<I64, "i64.atomic.load16_u", 0xfe15>;
 defm ATOMIC_LOAD32_U_I64 : WebAssemblyLoad<I64, "i64.atomic.load32_u", 0xfe16>;
-} // Defs = [ARGUMENTS]
 
 // Fragments for extending loads. These are different from regular loads because
 // the SDNodes are derived from AtomicSDNode rather than LoadSDNode and
@@ -200,10 +196,8 @@ def : LoadPatExternSymOffOnly<i64, sext_aload_16_64, ATOMIC_LOAD16_U_I64>;
 // Atomic stores
 //===----------------------------------------------------------------------===//
 
-let Defs = [ARGUMENTS] in {
 defm ATOMIC_STORE_I32 : WebAssemblyStore<I32, "i32.atomic.store", 0xfe17>;
 defm ATOMIC_STORE_I64 : WebAssemblyStore<I64, "i64.atomic.store", 0xfe18>;
-} // Defs = [ARGUMENTS]
 
 // We need an 'atomic' version of store patterns because store and atomic_store
 // nodes have different operand orders:
@@ -263,13 +257,11 @@ def : AStorePatExternSymOffOnly<i64, atomic_store_64, ATOMIC_STORE_I64>;
 } // Predicates = [HasAtomics]
 
 // Truncating stores.
-let Defs = [ARGUMENTS] in {
 defm ATOMIC_STORE8_I32 : WebAssemblyStore<I32, "i32.atomic.store8", 0xfe19>;
 defm ATOMIC_STORE16_I32 : WebAssemblyStore<I32, "i32.atomic.store16", 0xfe1a>;
 defm ATOMIC_STORE8_I64 : WebAssemblyStore<I64, "i64.atomic.store8", 0xfe1b>;
 defm ATOMIC_STORE16_I64 : WebAssemblyStore<I64, "i64.atomic.store16", 0xfe1c>;
 defm ATOMIC_STORE32_I64 : WebAssemblyStore<I64, "i64.atomic.store32", 0xfe1d>;
-} // Defs = [ARGUMENTS]
 
 // Fragments for truncating stores.
 
@@ -341,8 +333,6 @@ def : AStorePatExternSymOffOnly<i64, trunc_astore_32_64, ATOMIC_STORE32_I64>;
 // Atomic binary read-modify-writes
 //===----------------------------------------------------------------------===//
 
-let Defs = [ARGUMENTS] in {
-
 multiclass WebAssemblyBinRMW<WebAssemblyRegClass rc, string Name, int Opcode> {
   defm "" : I<(outs rc:$dst),
               (ins P2Align:$p2align, offset32_op:$off, I32:$addr, rc:$val),
@@ -430,7 +420,6 @@ defm ATOMIC_RMW16_U_XCHG_I64 :
   WebAssemblyBinRMW<I64, "i64.atomic.rmw16_u.xchg", 0xfe46>;
 defm ATOMIC_RMW32_U_XCHG_I64 :
   WebAssemblyBinRMW<I64, "i64.atomic.rmw32_u.xchg", 0xfe47>;
-}
 
 // Select binary RMWs with no constant offset.
 class BinRMWPatNoOffset<ValueType ty, PatFrag kind, NI inst> :
@@ -674,8 +663,6 @@ defm : BinRMWTruncExtPattern<
 // Consider adding a pass after instruction selection that optimizes this case
 // if it is frequent.
 
-let Defs = [ARGUMENTS] in {
-
 multiclass WebAssemblyTerRMW<WebAssemblyRegClass rc, string Name, int Opcode> {
   defm "" : I<(outs rc:$dst),
               (ins P2Align:$p2align, offset32_op:$off, I32:$addr, rc:$exp,
@@ -699,7 +686,6 @@ defm ATOMIC_RMW16_U_CMPXCHG_I64 :
   WebAssemblyTerRMW<I64, "i64.atomic.rmw16_u.cmpxchg", 0xfe4d>;
 defm ATOMIC_RMW32_U_CMPXCHG_I64 :
   WebAssemblyTerRMW<I64, "i64.atomic.rmw32_u.cmpxchg", 0xfe4e>;
-}
 
 // Select ternary RMWs with no constant offset.
 class TerRMWPatNoOffset<ValueType ty, PatFrag kind, NI inst> :
@@ -912,7 +898,6 @@ defm : TerRMWTruncExtPattern<
 // Atomic wait / notify
 //===----------------------------------------------------------------------===//
 
-let Defs = [ARGUMENTS] in {
 let hasSideEffects = 1 in {
 defm ATOMIC_NOTIFY :
   I<(outs I32:$dst),
@@ -935,7 +920,6 @@ defm ATOMIC_WAIT_I64 :
     "i64.atomic.wait \t${off}, ${p2align}", 0xfe02>;
 } // mayLoad = 1
 } // hasSideEffects = 1
-} // Defs = [ARGUMENTS]
 
 let Predicates = [HasAtomics] in {
 // Select notifys with no constant offset.
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrCall.td b/lib/Target/WebAssembly/WebAssemblyInstrCall.td
index 3c9caa3f0de..07839b79011 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrCall.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrCall.td
@@ -15,8 +15,6 @@
 // TODO: addr64: These currently assume the callee address is 32-bit.
 // FIXME: add $type to first call_indirect asmstr (and maybe $flags)
 
-let Defs = [ARGUMENTS] in {
-
 // Call sequence markers. These have an immediate which represents the amount of
 // stack space to allocate or free, which is used for varargs lowering.
 let Uses = [SP32, SP64], Defs = [SP32, SP64], isCodeGenOnly = 1 in {
@@ -118,8 +116,6 @@ let Uses = [SP32, SP64], isCall = 1 in {
                               0x11>;
 } // Uses = [SP32,SP64], isCall = 1
 
-} // Defs = [ARGUMENTS]
-
 // Patterns for matching a direct call to a global address.
 def : Pat<(i32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
           (CALL_I32 tglobaladdr:$callee)>;
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/lib/Target/WebAssembly/WebAssemblyInstrControl.td
index e27d81937dd..ed9879ae454 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrControl.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrControl.td
@@ -12,8 +12,6 @@
 ///
 //===----------------------------------------------------------------------===//
 
-let Defs = [ARGUMENTS] in {
-
 let isBranch = 1, isTerminator = 1, hasCtrlDep = 1 in {
 // The condition operand is a boolean value which WebAssembly represents as i32.
 defm BR_IF : I<(outs), (ins bb_op:$dst, I32:$cond),
@@ -30,15 +28,11 @@ defm BR   : NRI<(outs), (ins bb_op:$dst),
 } // isBarrier = 1
 } // isBranch = 1, isTerminator = 1, hasCtrlDep = 1
 
-} // Defs = [ARGUMENTS]
-
 def : Pat<(brcond (i32 (setne I32:$cond, 0)), bb:$dst),
           (BR_IF bb_op:$dst, I32:$cond)>;
 def : Pat<(brcond (i32 (seteq I32:$cond, 0)), bb:$dst),
           (BR_UNLESS bb_op:$dst, I32:$cond)>;
 
-let Defs = [ARGUMENTS] in {
-
 // TODO: SelectionDAG's lowering insists on using a pointer as the index for
 // jump tables, so in practice we don't ever use BR_TABLE_I64 in wasm32 mode
 // currently.
@@ -194,5 +188,3 @@ let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1,
                    [(catchret bb:$dst, bb:$from)], "", 0>;
 }
 }
-
-} // Defs = [ARGUMENTS]
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrConv.td b/lib/Target/WebAssembly/WebAssemblyInstrConv.td
index c89c1b54981..e9ba52799ee 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrConv.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrConv.td
@@ -13,8 +13,6 @@
 ///
 //===----------------------------------------------------------------------===//
 
-let Defs = [ARGUMENTS] in {
-
 defm I32_WRAP_I64 : I<(outs I32:$dst), (ins I64:$src), (outs), (ins),
                       [(set I32:$dst, (trunc I64:$src))],
                       "i32.wrap/i64\t$dst, $src", "i32.wrap/i64", 0xa7>;
@@ -51,15 +49,11 @@ defm I64_EXTEND32_S_I64 : I<(outs I64:$dst), (ins I64:$src), (outs), (ins),
                             0xc4>;
 } // Predicates = [HasSignExt]
 
-} // defs = [ARGUMENTS]
-
 // Expand a "don't care" extend into zero-extend (chosen over sign-extend
 // somewhat arbitrarily, although it favors popular hardware architectures
 // and is conceptually a simpler operation).
 def : Pat<(i64 (anyext I32:$src)), (I64_EXTEND_U_I32 I32:$src)>;
 
-let Defs = [ARGUMENTS] in {
-
 // Conversion from floating point to integer instructions which don't trap on
 // overflow or invalid.
 defm I32_TRUNC_S_SAT_F32 : I<(outs I32:$dst), (ins F32:$src), (outs), (ins),
@@ -218,5 +212,3 @@ defm F64_REINTERPRET_I64 : I<(outs F64:$dst), (ins I64:$src), (outs), (ins),
                              [(set F64:$dst, (bitconvert I64:$src))],
                              "f64.reinterpret/i64\t$dst, $src",
                              "f64.reinterpret/i64", 0xbf>;
-
-} // Defs = [ARGUMENTS]
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td b/lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td
index 41b39f69e51..a251d60b89e 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td
@@ -12,8 +12,6 @@
 ///
 //===----------------------------------------------------------------------===//
 
-let Defs = [ARGUMENTS] in {
-
 defm SELECT_EXCEPT_REF : I<(outs EXCEPT_REF:$dst),
                            (ins EXCEPT_REF:$lhs, EXCEPT_REF:$rhs, I32:$cond),
                            (outs), (ins),
@@ -23,8 +21,6 @@ defm SELECT_EXCEPT_REF : I<(outs EXCEPT_REF:$dst),
                            "except_ref.select\t$dst, $lhs, $rhs, $cond",
                            "except_ref.select", 0x1b>;
 
-} // Defs = [ARGUMENTS]
-
 def : Pat<(select (i32 (setne I32:$cond, 0)), EXCEPT_REF:$lhs, EXCEPT_REF:$rhs),
           (SELECT_EXCEPT_REF EXCEPT_REF:$lhs, EXCEPT_REF:$rhs, I32:$cond)>;
 def : Pat<(select (i32 (seteq I32:$cond, 0)), EXCEPT_REF:$lhs, EXCEPT_REF:$rhs),
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrFloat.td b/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
index 70e27df27e6..364c485f409 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
@@ -45,8 +45,6 @@ multiclass ComparisonFP<CondCode cond, string name, bits<32> f32Inst, bits<32> f
                 !strconcat("f64.", name), f64Inst>;
 }
 
-let Defs = [ARGUMENTS] in {
-
 let isCommutable = 1 in
 defm ADD : BinaryFP<fadd, "add ", 0x92, 0xa0>;
 defm SUB : BinaryFP<fsub, "sub ", 0x93, 0xa1>;
@@ -69,8 +67,6 @@ defm FLOOR : UnaryFP<ffloor, "floor", 0x8e, 0x9c>;
 defm TRUNC : UnaryFP<ftrunc, "trunc", 0x8f, 0x9d>;
 defm NEAREST : UnaryFP<fnearbyint, "nearest", 0x90, 0x9e>;
 
-} // Defs = [ARGUMENTS]
-
 // DAGCombine oddly folds casts into the rhs of copysign. Unfold them.
 def : Pat<(fcopysign F64:$lhs, F32:$rhs),
           (COPYSIGN_F64 F64:$lhs, (F64_PROMOTE_F32 F32:$rhs))>;
@@ -81,8 +77,6 @@ def : Pat<(fcopysign F32:$lhs, F64:$rhs),
 def : Pat<(frint f32:$src), (NEAREST_F32 f32:$src)>;
 def : Pat<(frint f64:$src), (NEAREST_F64 f64:$src)>;
 
-let Defs = [ARGUMENTS] in {
-
 let isCommutable = 1 in {
 defm EQ : ComparisonFP<SETOEQ, "eq  ", 0x5b, 0x61>;
 defm NE : ComparisonFP<SETUNE, "ne  ", 0x5c, 0x62>;
@@ -92,8 +86,6 @@ defm LE : ComparisonFP<SETOLE, "le  ", 0x5f, 0x65>;
 defm GT : ComparisonFP<SETOGT, "gt  ", 0x5e, 0x64>;
 defm GE : ComparisonFP<SETOGE, "ge  ", 0x60, 0x66>;
 
-} // Defs = [ARGUMENTS]
-
 // Don't care floating-point comparisons, supported via other comparisons.
 def : Pat<(seteq f32:$lhs, f32:$rhs), (EQ_F32 f32:$lhs, f32:$rhs)>;
 def : Pat<(setne f32:$lhs, f32:$rhs), (NE_F32 f32:$lhs, f32:$rhs)>;
@@ -108,8 +100,6 @@ def : Pat<(setle f64:$lhs, f64:$rhs), (LE_F64 f64:$lhs, f64:$rhs)>;
 def : Pat<(setgt f64:$lhs, f64:$rhs), (GT_F64 f64:$lhs, f64:$rhs)>;
 def : Pat<(setge f64:$lhs, f64:$rhs), (GE_F64 f64:$lhs, f64:$rhs)>;
 
-let Defs = [ARGUMENTS] in {
-
 defm SELECT_F32 : I<(outs F32:$dst), (ins F32:$lhs, F32:$rhs, I32:$cond),
                     (outs), (ins),
                     [(set F32:$dst, (select I32:$cond, F32:$lhs, F32:$rhs))],
@@ -119,8 +109,6 @@ defm SELECT_F64 : I<(outs F64:$dst), (ins F64:$lhs, F64:$rhs, I32:$cond),
                     [(set F64:$dst, (select I32:$cond, F64:$lhs, F64:$rhs))],
                     "f64.select\t$dst, $lhs, $rhs, $cond", "f64.select", 0x1b>;
 
-} // Defs = [ARGUMENTS]
-
 // ISD::SELECT requires its operand to conform to getBooleanContents, but
 // WebAssembly's select interprets any non-zero value as true, so we can fold
 // a setne with 0 into a select.
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrFormats.td b/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
index 683fb3d981f..2d23acfc825 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
@@ -30,6 +30,7 @@ class NI<dag oops, dag iops, list<dag> pattern, bit stack, string asmstr = "",
   dag OutOperandList = oops;
   dag InOperandList  = iops;
   let Pattern        = pattern;
+  let Defs           = [ARGUMENTS];
 }
 
 // Generates both register and stack based versions of one actual instruction.
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
index a2ea14cc28b..9e1409cf90e 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -164,7 +164,8 @@ include "WebAssemblyInstrFormats.td"
 //===----------------------------------------------------------------------===//
 
 multiclass ARGUMENT<WebAssemblyRegClass vt> {
-  let hasSideEffects = 1, Uses = [ARGUMENTS], isCodeGenOnly = 1 in
+  let hasSideEffects = 1, isCodeGenOnly = 1,
+      Defs = []<Register>, Uses = [ARGUMENTS] in
   defm ARGUMENT_#vt : I<(outs vt:$res), (ins i32imm:$argno),
                         (outs), (ins i32imm:$argno),
                         [(set vt:$res, (WebAssemblyargument timm:$argno))]>;
@@ -175,8 +176,6 @@ defm "": ARGUMENT<F32>;
 defm "": ARGUMENT<F64>;
 defm "": ARGUMENT<EXCEPT_REF>;
 
-let Defs = [ARGUMENTS] in {
-
 // get_local and set_local are not generated by instruction selection; they
 // are implied by virtual register uses and defs.
 multiclass LOCAL<WebAssemblyRegClass vt> {
@@ -266,8 +265,6 @@ defm CONST_F64 : I<(outs F64:$res), (ins f64imm_op:$imm),
                    "f64.const\t$res, $imm", "f64.const\t$imm", 0x44>;
 } // isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1
 
-} // Defs = [ARGUMENTS]
-
 def : Pat<(i32 (WebAssemblywrapper tglobaladdr:$addr)),
           (CONST_I32 tglobaladdr:$addr)>;
 def : Pat<(i32 (WebAssemblywrapper texternalsym:$addr)),
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInteger.td b/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
index 44c93de54aa..bd41f46214a 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
@@ -45,9 +45,6 @@ multiclass ComparisonInt<CondCode cond, string name, bits<32> i32Inst, bits<32>
                 !strconcat("i64.", name), i64Inst>;
 }
 
-
-let Defs = [ARGUMENTS] in {
-
 // The spaces after the names are for aesthetic purposes only, to make
 // operands line up vertically after tab expansion.
 let isCommutable = 1 in
@@ -97,16 +94,12 @@ defm EQZ_I64 : I<(outs I32:$dst), (ins I64:$src), (outs), (ins),
                  [(set I32:$dst, (setcc I64:$src, 0, SETEQ))],
                  "i64.eqz \t$dst, $src", "i64.eqz", 0x50>;
 
-} // Defs = [ARGUMENTS]
-
 // Optimize away an explicit mask on a rotate count.
 def : Pat<(rotl I32:$lhs, (and I32:$rhs, 31)), (ROTL_I32 I32:$lhs, I32:$rhs)>;
 def : Pat<(rotr I32:$lhs, (and I32:$rhs, 31)), (ROTR_I32 I32:$lhs, I32:$rhs)>;
 def : Pat<(rotl I64:$lhs, (and I64:$rhs, 63)), (ROTL_I64 I64:$lhs, I64:$rhs)>;
 def : Pat<(rotr I64:$lhs, (and I64:$rhs, 63)), (ROTR_I64 I64:$lhs, I64:$rhs)>;
 
-let Defs = [ARGUMENTS] in {
-
 defm SELECT_I32 : I<(outs I32:$dst), (ins I32:$lhs, I32:$rhs, I32:$cond),
                     (outs), (ins),
                     [(set I32:$dst, (select I32:$cond, I32:$lhs, I32:$rhs))],
@@ -116,8 +109,6 @@ defm SELECT_I64 : I<(outs I64:$dst), (ins I64:$lhs, I64:$rhs, I32:$cond),
                     [(set I64:$dst, (select I32:$cond, I64:$lhs, I64:$rhs))],
                     "i64.select\t$dst, $lhs, $rhs, $cond", "i64.select", 0x1b>;
 
-} // Defs = [ARGUMENTS]
-
 // ISD::SELECT requires its operand to conform to getBooleanContents, but
 // WebAssembly's select interprets any non-zero value as true, so we can fold
 // a setne with 0 into a select.
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrMemory.td b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
index 76ef1461d22..ccc331d1bf0 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
@@ -53,8 +53,6 @@ def regPlusGA : PatFrag<(ops node:$addr, node:$off),
 // We don't need a regPlusES because external symbols never have constant
 // offsets folded into them, so we can just use add.
 
-let Defs = [ARGUMENTS] in {
-
 // Defines atomic and non-atomic loads, regular and extending.
 multiclass WebAssemblyLoad<WebAssemblyRegClass rc, string Name, int Opcode> {
   let mayLoad = 1 in
@@ -73,8 +71,6 @@ defm LOAD_I64 : WebAssemblyLoad<I64, "i64.load", 0x29>;
 defm LOAD_F32 : WebAssemblyLoad<F32, "f32.load", 0x2a>;
 defm LOAD_F64 : WebAssemblyLoad<F64, "f64.load", 0x2b>;
 
-} // Defs = [ARGUMENTS]
-
 // Select loads with no constant offset.
 class LoadPatNoOffset<ValueType ty, PatFrag kind, NI inst> :
   Pat<(ty (kind I32:$addr)), (inst 0, 0, I32:$addr)>;
@@ -144,8 +140,6 @@ def : LoadPatExternSymOffOnly<i64, load, LOAD_I64>;
 def : LoadPatExternSymOffOnly<f32, load, LOAD_F32>;
 def : LoadPatExternSymOffOnly<f64, load, LOAD_F64>;
 
-let Defs = [ARGUMENTS] in {
-
 // Extending load.
 defm LOAD8_S_I32 : WebAssemblyLoad<I32, "i32.load8_s", 0x2c>;
 defm LOAD8_U_I32 : WebAssemblyLoad<I32, "i32.load8_u", 0x2d>;
@@ -158,8 +152,6 @@ defm LOAD16_U_I64 : WebAssemblyLoad<I64, "i64.load16_u", 0x33>;
 defm LOAD32_S_I64 : WebAssemblyLoad<I64, "i64.load32_s", 0x34>;
 defm LOAD32_U_I64 : WebAssemblyLoad<I64, "i64.load32_u", 0x35>;
 
-} // Defs = [ARGUMENTS]
-
 // Select extending loads with no constant offset.
 def : LoadPatNoOffset<i32, sextloadi8, LOAD8_S_I32>;
 def : LoadPatNoOffset<i32, zextloadi8, LOAD8_U_I32>;
@@ -303,9 +295,6 @@ def : LoadPatExternSymOffOnly<i64, extloadi8, LOAD8_U_I64>;
 def : LoadPatExternSymOffOnly<i64, extloadi16, LOAD16_U_I64>;
 def : LoadPatExternSymOffOnly<i64, extloadi32, LOAD32_U_I64>;
 
-
-let Defs = [ARGUMENTS] in {
-
 // Defines atomic and non-atomic stores, regular and truncating
 multiclass WebAssemblyStore<WebAssemblyRegClass rc, string Name, int Opcode> {
   let mayStore = 1 in
@@ -323,8 +312,6 @@ defm STORE_I64  : WebAssemblyStore<I64, "i64.store", 0x37>;
 defm STORE_F32  : WebAssemblyStore<F32, "f32.store", 0x38>;
 defm STORE_F64  : WebAssemblyStore<F64, "f64.store", 0x39>;
 
-} // Defs = [ARGUMENTS]
-
 // Select stores with no constant offset.
 class StorePatNoOffset<ValueType ty, PatFrag node, NI inst> :
   Pat<(node ty:$val, I32:$addr), (inst 0, 0, I32:$addr, ty:$val)>;
@@ -389,9 +376,6 @@ def : StorePatExternSymOffOnly<i64, store, STORE_I64>;
 def : StorePatExternSymOffOnly<f32, store, STORE_F32>;
 def : StorePatExternSymOffOnly<f64, store, STORE_F64>;
 
-
-let Defs = [ARGUMENTS] in {
-
 // Truncating store.
 defm STORE8_I32 : WebAssemblyStore<I32, "i32.store8", 0x3a>;
 defm STORE16_I32 : WebAssemblyStore<I32, "i32.store16", 0x3b>;
@@ -399,8 +383,6 @@ defm STORE8_I64 : WebAssemblyStore<I64, "i64.store8", 0x3c>;
 defm STORE16_I64 : WebAssemblyStore<I64, "i64.store16", 0x3d>;
 defm STORE32_I64 : WebAssemblyStore<I64, "i64.store32", 0x3e>;
 
-} // Defs = [ARGUMENTS]
-
 // Select truncating stores with no constant offset.
 def : StorePatNoOffset<i32, truncstorei8, STORE8_I32>;
 def : StorePatNoOffset<i32, truncstorei16, STORE16_I32>;
@@ -448,8 +430,6 @@ def : StorePatExternSymOffOnly<i64, truncstorei8, STORE8_I64>;
 def : StorePatExternSymOffOnly<i64, truncstorei16, STORE16_I64>;
 def : StorePatExternSymOffOnly<i64, truncstorei32, STORE32_I64>;
 
-let Defs = [ARGUMENTS] in {
-
 // Current memory size.
 defm MEMORY_SIZE_I32 : I<(outs I32:$dst), (ins i32imm:$flags),
                          (outs), (ins i32imm:$flags),
@@ -493,8 +473,6 @@ defm GROW_MEMORY_I32 : I<(outs I32:$dst), (ins i32imm:$flags, I32:$delta),
                          0x40>,
                        Requires<[HasAddr32]>;
 
-} // Defs = [ARGUMENTS]
-
 def : Pat<(int_wasm_current_memory),
           (CURRENT_MEMORY_I32 0)>;
 def : Pat<(int_wasm_grow_memory I32:$delta),
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 491ee56b794..90bdc17890b 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -55,7 +55,6 @@ multiclass ConstVec<ValueType vec_t, dag ops, dag pat, string args> {
                                   "v128.const\t"#args, 0>;
 }
 
-let Defs = [ARGUMENTS] in {
 defm "" : ConstVec<v16i8,
                    (ins vec_i8imm_op:$i0, vec_i8imm_op:$i1,
                         vec_i8imm_op:$i2, vec_i8imm_op:$i3,
@@ -100,7 +99,6 @@ defm "" : ConstVec<v2f64,
                   (ins f64imm_op:$i0, f64imm_op:$i1),
                   (build_vector (f64 fpimm:$i0), (f64 fpimm:$i1)),
                   "$i0, $i1">;
-} // Defs = [ARGUMENTS]
 
 // Create vector with identical lanes: splat
 def splat2 : PatFrag<(ops node:$x), (build_vector node:$x, node:$x)>;
-- 
GitLab


From 66a2c5ecaaec52cf521193dbc139fdeba8987720 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Wed, 10 Oct 2018 20:47:46 +0000
Subject: [PATCH 0022/1116] [InstCombine] reverse 'trunc X to <N x i1>'
 canonicalization; 2nd try

Re-trying r344082 because it unintentionally included extra diffs.

Original commit message:
icmp ne (and X, 1), 0 --> trunc X to N x i1

Ideally, we'd do the same for scalars, but there will likely be
regressions unless we add more trunc folds as we're doing here
for vectors.

The motivating vector case is from PR37549:
https://bugs.llvm.org/show_bug.cgi?id=37549

define <4 x float> @bitwise_select(<4 x float> %x, <4 x float> %y, <4 x float> %z, <4 x float> %w) {

  %c = fcmp ole <4 x float> %x, %y
  %s = sext <4 x i1> %c to <4 x i32>
  %s1 = shufflevector <4 x i32> %s, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
  %s2 = shufflevector <4 x i32> %s, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 3, i32 3>
  %cond = or <4 x i32> %s1, %s2
  %condtr = trunc <4 x i32> %cond to <4 x i1>
  %r = select <4 x i1> %condtr, <4 x float> %z, <4 x float> %w
  ret <4 x float> %r

}

Here's a sampling of the vector codegen for that case using
mask+icmp (current behavior) vs. trunc (with this patch):

AVX before:

vcmpleps        %xmm1, %xmm0, %xmm0
vpermilps       $80, %xmm0, %xmm1 ## xmm1 = xmm0[0,0,1,1]
vpermilps       $250, %xmm0, %xmm0 ## xmm0 = xmm0[2,2,3,3]
vorps   %xmm0, %xmm1, %xmm0
vandps  LCPI0_0(%rip), %xmm0, %xmm0
vxorps  %xmm1, %xmm1, %xmm1
vpcmpeqd        %xmm1, %xmm0, %xmm0
vblendvps       %xmm0, %xmm3, %xmm2, %xmm0

AVX after:

vcmpleps        %xmm1, %xmm0, %xmm0
vpermilps       $80, %xmm0, %xmm1 ## xmm1 = xmm0[0,0,1,1]
vpermilps       $250, %xmm0, %xmm0 ## xmm0 = xmm0[2,2,3,3]
vorps   %xmm0, %xmm1, %xmm0
vblendvps       %xmm0, %xmm2, %xmm3, %xmm0

AVX512f before:

vcmpleps        %xmm1, %xmm0, %xmm0
vpermilps       $80, %xmm0, %xmm1 ## xmm1 = xmm0[0,0,1,1]
vpermilps       $250, %xmm0, %xmm0 ## xmm0 = xmm0[2,2,3,3]
vorps   %xmm0, %xmm1, %xmm0
vpbroadcastd    LCPI0_0(%rip), %xmm1 ## xmm1 = [1,1,1,1]
vptestnmd       %zmm1, %zmm0, %k1
vblendmps       %zmm3, %zmm2, %zmm0 {%k1}

AVX512f after:

vcmpleps        %xmm1, %xmm0, %xmm0
vpermilps       $80, %xmm0, %xmm1 ## xmm1 = xmm0[0,0,1,1]
vpermilps       $250, %xmm0, %xmm0 ## xmm0 = xmm0[2,2,3,3]
vorps   %xmm0, %xmm1, %xmm0
vpslld  $31, %xmm0, %xmm0
vptestmd        %zmm0, %zmm0, %k1
vblendmps       %zmm2, %zmm3, %zmm0 {%k1}

AArch64 before:

fcmge   v0.4s, v1.4s, v0.4s
zip1    v1.4s, v0.4s, v0.4s
zip2    v0.4s, v0.4s, v0.4s
orr     v0.16b, v1.16b, v0.16b
movi    v1.4s, #1
and     v0.16b, v0.16b, v1.16b
cmeq    v0.4s, v0.4s, #0
bsl     v0.16b, v3.16b, v2.16b

AArch64 after:

fcmge   v0.4s, v1.4s, v0.4s
zip1    v1.4s, v0.4s, v0.4s
zip2    v0.4s, v0.4s, v0.4s
orr     v0.16b, v1.16b, v0.16b
bsl     v0.16b, v2.16b, v3.16b

PowerPC-le before:

xvcmpgesp 34, 35, 34
vspltisw 0, 1
vmrglw 3, 2, 2
vmrghw 2, 2, 2
xxlor 0, 35, 34
xxlxor 35, 35, 35
xxland 34, 0, 32
vcmpequw 2, 2, 3
xxsel 34, 36, 37, 34

PowerPC-le after:

xvcmpgesp 34, 35, 34
vmrglw 3, 2, 2
vmrghw 2, 2, 2
xxlor 0, 35, 34
xxsel 34, 37, 36, 0

Differential Revision: https://reviews.llvm.org/D52747


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344181 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstCombine/InstCombineCasts.cpp          |  31 +-
 .../InstCombine/InstCombineCompares.cpp       |   7 +
 test/Transforms/InstCombine/apint-shift.ll    |   4 +-
 .../Transforms/InstCombine/apint-shl-trunc.ll |   5 +-
 test/Transforms/InstCombine/icmp.ll           |  20 +-
 test/Transforms/InstCombine/vector-casts.ll   |  19 +-
 .../LoopVectorize/X86/masked_load_store.ll    | 336 +++++++++---------
 7 files changed, 213 insertions(+), 209 deletions(-)

diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp
index fd59c3a7c0c..74f1e695ff6 100644
--- a/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -706,12 +706,35 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
   if (SimplifyDemandedInstructionBits(CI))
     return &CI;
 
-  // Canonicalize trunc x to i1 -> (icmp ne (and x, 1), 0), likewise for vector.
   if (DestTy->getScalarSizeInBits() == 1) {
-    Constant *One = ConstantInt::get(SrcTy, 1);
-    Src = Builder.CreateAnd(Src, One);
     Value *Zero = Constant::getNullValue(Src->getType());
-    return new ICmpInst(ICmpInst::ICMP_NE, Src, Zero);
+    if (DestTy->isIntegerTy()) {
+      // Canonicalize trunc x to i1 -> icmp ne (and x, 1), 0 (scalar only).
+      // TODO: We canonicalize to more instructions here because we are probably
+      // lacking equivalent analysis for trunc relative to icmp. There may also
+      // be codegen concerns. If those trunc limitations were removed, we could
+      // remove this transform.
+      Value *And = Builder.CreateAnd(Src, ConstantInt::get(SrcTy, 1));
+      return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
+    }
+
+    // For vectors, we do not canonicalize all truncs to icmp, so optimize
+    // patterns that would be covered within visitICmpInst.
+    Value *X;
+    const APInt *C;
+    if (match(Src, m_OneUse(m_LShr(m_Value(X), m_APInt(C))))) {
+      // trunc (lshr X, C) to i1 --> icmp ne (and X, C'), 0
+      APInt MaskC = APInt(SrcTy->getScalarSizeInBits(), 1).shl(*C);
+      Value *And = Builder.CreateAnd(X, ConstantInt::get(SrcTy, MaskC));
+      return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
+    }
+    if (match(Src, m_OneUse(m_c_Or(m_LShr(m_Value(X), m_APInt(C)),
+                                   m_Deferred(X))))) {
+      // trunc (or (lshr X, C), X) to i1 --> icmp ne (and X, C'), 0
+      APInt MaskC = APInt(SrcTy->getScalarSizeInBits(), 1).shl(*C) | 1;
+      Value *And = Builder.CreateAnd(X, ConstantInt::get(SrcTy, MaskC));
+      return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
+    }
   }
 
   // FIXME: Maybe combine the next two transforms to handle the no cast case
diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 07bd98b30ab..bf8bc8818f7 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -1609,6 +1609,13 @@ Instruction *InstCombiner::foldICmpAndShift(ICmpInst &Cmp, BinaryOperator *And,
 Instruction *InstCombiner::foldICmpAndConstConst(ICmpInst &Cmp,
                                                  BinaryOperator *And,
                                                  const APInt &C1) {
+  // For vectors: icmp ne (and X, 1), 0 --> trunc X to N x i1
+  // TODO: We canonicalize to the longer form for scalars because we have
+  // better analysis/folds for icmp, and codegen may be better with icmp.
+  if (Cmp.getPredicate() == CmpInst::ICMP_NE && Cmp.getType()->isVectorTy() &&
+      C1.isNullValue() && match(And->getOperand(1), m_One()))
+    return new TruncInst(And->getOperand(0), Cmp.getType());
+
   const APInt *C2;
   if (!match(And->getOperand(1), m_APInt(C2)))
     return nullptr;
diff --git a/test/Transforms/InstCombine/apint-shift.ll b/test/Transforms/InstCombine/apint-shift.ll
index fc564665a60..3266fa6e443 100644
--- a/test/Transforms/InstCombine/apint-shift.ll
+++ b/test/Transforms/InstCombine/apint-shift.ll
@@ -319,8 +319,8 @@ define i1 @test16(i84 %X) {
 
 define <2 x i1> @test16vec(<2 x i84> %X) {
 ; CHECK-LABEL: @test16vec(
-; CHECK-NEXT:    [[AND:%.*]] = and <2 x i84> %X, <i84 16, i84 16>
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ne <2 x i84> [[AND]], zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i84> [[X:%.*]], <i84 16, i84 16>
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne <2 x i84> [[TMP1]], zeroinitializer
 ; CHECK-NEXT:    ret <2 x i1> [[CMP]]
 ;
   %shr = ashr <2 x i84> %X, <i84 4, i84 4>
diff --git a/test/Transforms/InstCombine/apint-shl-trunc.ll b/test/Transforms/InstCombine/apint-shl-trunc.ll
index c7d7d369592..2241c88cb6b 100644
--- a/test/Transforms/InstCombine/apint-shl-trunc.ll
+++ b/test/Transforms/InstCombine/apint-shl-trunc.ll
@@ -27,9 +27,8 @@ define i1 @test1(i799 %X, i799 %A) {
 
 define <2 x i1> @test0vec(<2 x i39> %X, <2 x i39> %A) {
 ; CHECK-LABEL: @test0vec(
-; CHECK-NEXT:    [[TMP1:%.*]] = shl <2 x i39> <i39 1, i39 1>, [[A:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = and <2 x i39> [[TMP1]], [[X:%.*]]
-; CHECK-NEXT:    [[D:%.*]] = icmp ne <2 x i39> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[B:%.*]] = lshr <2 x i39> [[X:%.*]], [[A:%.*]]
+; CHECK-NEXT:    [[D:%.*]] = trunc <2 x i39> [[B]] to <2 x i1>
 ; CHECK-NEXT:    ret <2 x i1> [[D]]
 ;
   %B = lshr <2 x i39> %X, %A
diff --git a/test/Transforms/InstCombine/icmp.ll b/test/Transforms/InstCombine/icmp.ll
index 1f97009911b..1f05bb67e96 100644
--- a/test/Transforms/InstCombine/icmp.ll
+++ b/test/Transforms/InstCombine/icmp.ll
@@ -2427,10 +2427,9 @@ define i1 @icmp_and_or_lshr(i32 %x, i32 %y) {
 
 define <2 x i1> @icmp_and_or_lshr_vec(<2 x i32> %x, <2 x i32> %y) {
 ; CHECK-LABEL: @icmp_and_or_lshr_vec(
-; CHECK-NEXT:    [[SHF1:%.*]] = shl nuw <2 x i32> <i32 1, i32 1>, [[Y:%.*]]
-; CHECK-NEXT:    [[OR2:%.*]] = or <2 x i32> [[SHF1]], <i32 1, i32 1>
-; CHECK-NEXT:    [[AND3:%.*]] = and <2 x i32> [[OR2]], [[X:%.*]]
-; CHECK-NEXT:    [[RET:%.*]] = icmp ne <2 x i32> [[AND3]], zeroinitializer
+; CHECK-NEXT:    [[SHF:%.*]] = lshr <2 x i32> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[OR:%.*]] = or <2 x i32> [[SHF]], [[X]]
+; CHECK-NEXT:    [[RET:%.*]] = trunc <2 x i32> [[OR]] to <2 x i1>
 ; CHECK-NEXT:    ret <2 x i1> [[RET]]
 ;
   %shf = lshr <2 x i32> %x, %y
@@ -2445,8 +2444,7 @@ define <2 x i1> @icmp_and_or_lshr_vec_commute(<2 x i32> %xp, <2 x i32> %y) {
 ; CHECK-NEXT:    [[X:%.*]] = srem <2 x i32> [[XP:%.*]], <i32 42, i32 42>
 ; CHECK-NEXT:    [[SHF:%.*]] = lshr <2 x i32> [[X]], [[Y:%.*]]
 ; CHECK-NEXT:    [[OR:%.*]] = or <2 x i32> [[X]], [[SHF]]
-; CHECK-NEXT:    [[AND:%.*]] = and <2 x i32> [[OR]], <i32 1, i32 1>
-; CHECK-NEXT:    [[RET:%.*]] = icmp ne <2 x i32> [[AND]], zeroinitializer
+; CHECK-NEXT:    [[RET:%.*]] = trunc <2 x i32> [[OR]] to <2 x i1>
 ; CHECK-NEXT:    ret <2 x i1> [[RET]]
 ;
   %x = srem <2 x i32> %xp, <i32 42, i32 -42> ; prevent complexity-based canonicalization
@@ -2472,8 +2470,8 @@ define i1 @icmp_and_or_lshr_cst(i32 %x) {
 
 define <2 x i1> @icmp_and_or_lshr_cst_vec(<2 x i32> %x) {
 ; CHECK-LABEL: @icmp_and_or_lshr_cst_vec(
-; CHECK-NEXT:    [[AND1:%.*]] = and <2 x i32> [[X:%.*]], <i32 3, i32 3>
-; CHECK-NEXT:    [[RET:%.*]] = icmp ne <2 x i32> [[AND1]], zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i32> [[X:%.*]], <i32 3, i32 3>
+; CHECK-NEXT:    [[RET:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer
 ; CHECK-NEXT:    ret <2 x i1> [[RET]]
 ;
   %shf = lshr <2 x i32> %x, <i32 1, i32 1>
@@ -2486,10 +2484,8 @@ define <2 x i1> @icmp_and_or_lshr_cst_vec(<2 x i32> %x) {
 define <2 x i1> @icmp_and_or_lshr_cst_vec_commute(<2 x i32> %xp) {
 ; CHECK-LABEL: @icmp_and_or_lshr_cst_vec_commute(
 ; CHECK-NEXT:    [[X:%.*]] = srem <2 x i32> [[XP:%.*]], <i32 42, i32 42>
-; CHECK-NEXT:    [[SHF:%.*]] = lshr <2 x i32> [[X]], <i32 1, i32 1>
-; CHECK-NEXT:    [[OR:%.*]] = or <2 x i32> [[X]], [[SHF]]
-; CHECK-NEXT:    [[AND:%.*]] = and <2 x i32> [[OR]], <i32 1, i32 1>
-; CHECK-NEXT:    [[RET:%.*]] = icmp ne <2 x i32> [[AND]], zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i32> [[X]], <i32 3, i32 3>
+; CHECK-NEXT:    [[RET:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer
 ; CHECK-NEXT:    ret <2 x i1> [[RET]]
 ;
   %x = srem <2 x i32> %xp, <i32 42, i32 -42> ; prevent complexity-based canonicalization
diff --git a/test/Transforms/InstCombine/vector-casts.ll b/test/Transforms/InstCombine/vector-casts.ll
index 6e0d66b8ed4..e0d6083a969 100644
--- a/test/Transforms/InstCombine/vector-casts.ll
+++ b/test/Transforms/InstCombine/vector-casts.ll
@@ -1,26 +1,22 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -instcombine -S | FileCheck %s
 
-; This turns into a&1 != 0
-; TODO: The bar for canonicalizing to something bigger than the minimal length IR is very high. 
-; This pattern does not appear to meet that standard.
+; Can't get smaller than this.
 
 define <2 x i1> @trunc(<2 x i64> %a) {
 ; CHECK-LABEL: @trunc(
-; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i64> [[A:%.*]], <i64 1, i64 1>
-; CHECK-NEXT:    [[T:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[T:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i1>
 ; CHECK-NEXT:    ret <2 x i1> [[T]]
 ;
   %t = trunc <2 x i64> %a to <2 x i1>
   ret <2 x i1> %t
 }
 
-; TODO: This could be just 1 instruction (trunc). 
+; This is trunc.
 
 define <2 x i1> @and_cmp_is_trunc(<2 x i64> %a) {
 ; CHECK-LABEL: @and_cmp_is_trunc(
-; CHECK-NEXT:    [[T:%.*]] = and <2 x i64> [[A:%.*]], <i64 1, i64 1>
-; CHECK-NEXT:    [[R:%.*]] = icmp ne <2 x i64> [[T]], zeroinitializer
+; CHECK-NEXT:    [[R:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i1>
 ; CHECK-NEXT:    ret <2 x i1> [[R]]
 ;
   %t = and <2 x i64> %a, <i64 1, i64 1>
@@ -28,12 +24,11 @@ define <2 x i1> @and_cmp_is_trunc(<2 x i64> %a) {
   ret <2 x i1> %r
 }
 
-; TODO: This could be just 1 instruction (trunc). 
+; This is trunc.
 
 define <2 x i1> @and_cmp_is_trunc_even_with_undef_elt(<2 x i64> %a) {
 ; CHECK-LABEL: @and_cmp_is_trunc_even_with_undef_elt(
-; CHECK-NEXT:    [[T:%.*]] = and <2 x i64> [[A:%.*]], <i64 undef, i64 1>
-; CHECK-NEXT:    [[R:%.*]] = icmp ne <2 x i64> [[T]], zeroinitializer
+; CHECK-NEXT:    [[R:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i1>
 ; CHECK-NEXT:    ret <2 x i1> [[R]]
 ;
   %t = and <2 x i64> %a, <i64 undef, i64 1>
@@ -41,7 +36,7 @@ define <2 x i1> @and_cmp_is_trunc_even_with_undef_elt(<2 x i64> %a) {
   ret <2 x i1> %r
 }
 
-; TODO: This could be just 1 instruction (trunc). 
+; TODO: This could be just 1 instruction (trunc), but our undef matching is incomplete.
 
 define <2 x i1> @and_cmp_is_trunc_even_with_undef_elts(<2 x i64> %a) {
 ; CHECK-LABEL: @and_cmp_is_trunc_even_with_undef_elts(
diff --git a/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/test/Transforms/LoopVectorize/X86/masked_load_store.ll
index 8e948639ba1..71038feec7b 100644
--- a/test/Transforms/LoopVectorize/X86/masked_load_store.ll
+++ b/test/Transforms/LoopVectorize/X86/masked_load_store.ll
@@ -2901,49 +2901,45 @@ define void @foo7(double* noalias %out, double** noalias %in, i8* noalias %trigg
 ; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 12
 ; AVX-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>*
 ; AVX-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1
-; AVX-NEXT:    [[TMP8:%.*]] = and <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
-; AVX-NEXT:    [[TMP9:%.*]] = and <4 x i8> [[WIDE_LOAD10]], <i8 1, i8 1, i8 1, i8 1>
-; AVX-NEXT:    [[TMP10:%.*]] = and <4 x i8> [[WIDE_LOAD11]], <i8 1, i8 1, i8 1, i8 1>
-; AVX-NEXT:    [[TMP11:%.*]] = and <4 x i8> [[WIDE_LOAD12]], <i8 1, i8 1, i8 1, i8 1>
-; AVX-NEXT:    [[TMP12:%.*]] = icmp ne <4 x i8> [[TMP8]], zeroinitializer
-; AVX-NEXT:    [[TMP13:%.*]] = icmp ne <4 x i8> [[TMP9]], zeroinitializer
-; AVX-NEXT:    [[TMP14:%.*]] = icmp ne <4 x i8> [[TMP10]], zeroinitializer
-; AVX-NEXT:    [[TMP15:%.*]] = icmp ne <4 x i8> [[TMP11]], zeroinitializer
-; AVX-NEXT:    [[TMP16:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[INDEX]]
+; AVX-NEXT:    [[TMP8:%.*]] = trunc <4 x i8> [[WIDE_LOAD]] to <4 x i1>
+; AVX-NEXT:    [[TMP9:%.*]] = trunc <4 x i8> [[WIDE_LOAD10]] to <4 x i1>
+; AVX-NEXT:    [[TMP10:%.*]] = trunc <4 x i8> [[WIDE_LOAD11]] to <4 x i1>
+; AVX-NEXT:    [[TMP11:%.*]] = trunc <4 x i8> [[WIDE_LOAD12]] to <4 x i1>
+; AVX-NEXT:    [[TMP12:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[INDEX]]
+; AVX-NEXT:    [[TMP13:%.*]] = bitcast double** [[TMP12]] to <4 x double*>*
+; AVX-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP13]], i32 8, <4 x i1> [[TMP8]], <4 x double*> undef)
+; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 4
+; AVX-NEXT:    [[TMP15:%.*]] = bitcast double** [[TMP14]] to <4 x double*>*
+; AVX-NEXT:    [[WIDE_MASKED_LOAD13:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP15]], i32 8, <4 x i1> [[TMP9]], <4 x double*> undef)
+; AVX-NEXT:    [[TMP16:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 8
 ; AVX-NEXT:    [[TMP17:%.*]] = bitcast double** [[TMP16]] to <4 x double*>*
-; AVX-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP17]], i32 8, <4 x i1> [[TMP12]], <4 x double*> undef)
-; AVX-NEXT:    [[TMP18:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 4
+; AVX-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP17]], i32 8, <4 x i1> [[TMP10]], <4 x double*> undef)
+; AVX-NEXT:    [[TMP18:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 12
 ; AVX-NEXT:    [[TMP19:%.*]] = bitcast double** [[TMP18]] to <4 x double*>*
-; AVX-NEXT:    [[WIDE_MASKED_LOAD13:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP19]], i32 8, <4 x i1> [[TMP13]], <4 x double*> undef)
-; AVX-NEXT:    [[TMP20:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 8
-; AVX-NEXT:    [[TMP21:%.*]] = bitcast double** [[TMP20]] to <4 x double*>*
-; AVX-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP21]], i32 8, <4 x i1> [[TMP14]], <4 x double*> undef)
-; AVX-NEXT:    [[TMP22:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 12
-; AVX-NEXT:    [[TMP23:%.*]] = bitcast double** [[TMP22]] to <4 x double*>*
-; AVX-NEXT:    [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP23]], i32 8, <4 x i1> [[TMP15]], <4 x double*> undef)
-; AVX-NEXT:    [[TMP24:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer
-; AVX-NEXT:    [[TMP25:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD13]], zeroinitializer
-; AVX-NEXT:    [[TMP26:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD14]], zeroinitializer
-; AVX-NEXT:    [[TMP27:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD15]], zeroinitializer
-; AVX-NEXT:    [[TMP28:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]]
-; AVX-NEXT:    [[TMP29:%.*]] = and <4 x i1> [[TMP24]], [[TMP12]]
-; AVX-NEXT:    [[TMP30:%.*]] = and <4 x i1> [[TMP25]], [[TMP13]]
-; AVX-NEXT:    [[TMP31:%.*]] = and <4 x i1> [[TMP26]], [[TMP14]]
-; AVX-NEXT:    [[TMP32:%.*]] = and <4 x i1> [[TMP27]], [[TMP15]]
-; AVX-NEXT:    [[TMP33:%.*]] = bitcast double* [[TMP28]] to <4 x double>*
-; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP29]])
-; AVX-NEXT:    [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 4
+; AVX-NEXT:    [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP19]], i32 8, <4 x i1> [[TMP11]], <4 x double*> undef)
+; AVX-NEXT:    [[TMP20:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer
+; AVX-NEXT:    [[TMP21:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD13]], zeroinitializer
+; AVX-NEXT:    [[TMP22:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD14]], zeroinitializer
+; AVX-NEXT:    [[TMP23:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD15]], zeroinitializer
+; AVX-NEXT:    [[TMP24:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]]
+; AVX-NEXT:    [[TMP25:%.*]] = and <4 x i1> [[TMP20]], [[TMP8]]
+; AVX-NEXT:    [[TMP26:%.*]] = and <4 x i1> [[TMP21]], [[TMP9]]
+; AVX-NEXT:    [[TMP27:%.*]] = and <4 x i1> [[TMP22]], [[TMP10]]
+; AVX-NEXT:    [[TMP28:%.*]] = and <4 x i1> [[TMP23]], [[TMP11]]
+; AVX-NEXT:    [[TMP29:%.*]] = bitcast double* [[TMP24]] to <4 x double>*
+; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP29]], i32 8, <4 x i1> [[TMP25]])
+; AVX-NEXT:    [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 4
+; AVX-NEXT:    [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>*
+; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP26]])
+; AVX-NEXT:    [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 8
+; AVX-NEXT:    [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>*
+; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP27]])
+; AVX-NEXT:    [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 12
 ; AVX-NEXT:    [[TMP35:%.*]] = bitcast double* [[TMP34]] to <4 x double>*
-; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP30]])
-; AVX-NEXT:    [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 8
-; AVX-NEXT:    [[TMP37:%.*]] = bitcast double* [[TMP36]] to <4 x double>*
-; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP37]], i32 8, <4 x i1> [[TMP31]])
-; AVX-NEXT:    [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 12
-; AVX-NEXT:    [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>*
-; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP39]], i32 8, <4 x i1> [[TMP32]])
+; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP28]])
 ; AVX-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
-; AVX-NEXT:    [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX-NEXT:    br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !51
+; AVX-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; AVX-NEXT:    br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !51
 ; AVX:       middle.block:
 ; AVX-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]]
 ; AVX-NEXT:    br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]]
@@ -2953,14 +2949,14 @@ define void @foo7(double* noalias %out, double** noalias %in, i8* noalias %trigg
 ; AVX:       for.body:
 ; AVX-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ]
 ; AVX-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX-NEXT:    [[TMP41:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; AVX-NEXT:    [[TMP42:%.*]] = and i8 [[TMP41]], 1
-; AVX-NEXT:    [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0
+; AVX-NEXT:    [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; AVX-NEXT:    [[TMP38:%.*]] = and i8 [[TMP37]], 1
+; AVX-NEXT:    [[TOBOOL:%.*]] = icmp eq i8 [[TMP38]], 0
 ; AVX-NEXT:    br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
 ; AVX:       land.lhs.true:
 ; AVX-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[INDVARS_IV]]
-; AVX-NEXT:    [[TMP43:%.*]] = load double*, double** [[ARRAYIDX2]], align 8
-; AVX-NEXT:    [[CMP3:%.*]] = icmp eq double* [[TMP43]], null
+; AVX-NEXT:    [[TMP39:%.*]] = load double*, double** [[ARRAYIDX2]], align 8
+; AVX-NEXT:    [[CMP3:%.*]] = icmp eq double* [[TMP39]], null
 ; AVX-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
 ; AVX:       if.then:
 ; AVX-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
@@ -2998,49 +2994,45 @@ define void @foo7(double* noalias %out, double** noalias %in, i8* noalias %trigg
 ; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 24
 ; AVX512-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <8 x i8>*
 ; AVX512-NEXT:    [[WIDE_LOAD12:%.*]] = load <8 x i8>, <8 x i8>* [[TMP7]], align 1
-; AVX512-NEXT:    [[TMP8:%.*]] = and <8 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-; AVX512-NEXT:    [[TMP9:%.*]] = and <8 x i8> [[WIDE_LOAD10]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-; AVX512-NEXT:    [[TMP10:%.*]] = and <8 x i8> [[WIDE_LOAD11]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-; AVX512-NEXT:    [[TMP11:%.*]] = and <8 x i8> [[WIDE_LOAD12]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-; AVX512-NEXT:    [[TMP12:%.*]] = icmp ne <8 x i8> [[TMP8]], zeroinitializer
-; AVX512-NEXT:    [[TMP13:%.*]] = icmp ne <8 x i8> [[TMP9]], zeroinitializer
-; AVX512-NEXT:    [[TMP14:%.*]] = icmp ne <8 x i8> [[TMP10]], zeroinitializer
-; AVX512-NEXT:    [[TMP15:%.*]] = icmp ne <8 x i8> [[TMP11]], zeroinitializer
-; AVX512-NEXT:    [[TMP16:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[INDEX]]
+; AVX512-NEXT:    [[TMP8:%.*]] = trunc <8 x i8> [[WIDE_LOAD]] to <8 x i1>
+; AVX512-NEXT:    [[TMP9:%.*]] = trunc <8 x i8> [[WIDE_LOAD10]] to <8 x i1>
+; AVX512-NEXT:    [[TMP10:%.*]] = trunc <8 x i8> [[WIDE_LOAD11]] to <8 x i1>
+; AVX512-NEXT:    [[TMP11:%.*]] = trunc <8 x i8> [[WIDE_LOAD12]] to <8 x i1>
+; AVX512-NEXT:    [[TMP12:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[INDEX]]
+; AVX512-NEXT:    [[TMP13:%.*]] = bitcast double** [[TMP12]] to <8 x double*>*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP13]], i32 8, <8 x i1> [[TMP8]], <8 x double*> undef)
+; AVX512-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 8
+; AVX512-NEXT:    [[TMP15:%.*]] = bitcast double** [[TMP14]] to <8 x double*>*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD13:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP15]], i32 8, <8 x i1> [[TMP9]], <8 x double*> undef)
+; AVX512-NEXT:    [[TMP16:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 16
 ; AVX512-NEXT:    [[TMP17:%.*]] = bitcast double** [[TMP16]] to <8 x double*>*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP17]], i32 8, <8 x i1> [[TMP12]], <8 x double*> undef)
-; AVX512-NEXT:    [[TMP18:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 8
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP17]], i32 8, <8 x i1> [[TMP10]], <8 x double*> undef)
+; AVX512-NEXT:    [[TMP18:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 24
 ; AVX512-NEXT:    [[TMP19:%.*]] = bitcast double** [[TMP18]] to <8 x double*>*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD13:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP19]], i32 8, <8 x i1> [[TMP13]], <8 x double*> undef)
-; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 16
-; AVX512-NEXT:    [[TMP21:%.*]] = bitcast double** [[TMP20]] to <8 x double*>*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP21]], i32 8, <8 x i1> [[TMP14]], <8 x double*> undef)
-; AVX512-NEXT:    [[TMP22:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 24
-; AVX512-NEXT:    [[TMP23:%.*]] = bitcast double** [[TMP22]] to <8 x double*>*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP23]], i32 8, <8 x i1> [[TMP15]], <8 x double*> undef)
-; AVX512-NEXT:    [[TMP24:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer
-; AVX512-NEXT:    [[TMP25:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD13]], zeroinitializer
-; AVX512-NEXT:    [[TMP26:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD14]], zeroinitializer
-; AVX512-NEXT:    [[TMP27:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD15]], zeroinitializer
-; AVX512-NEXT:    [[TMP28:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]]
-; AVX512-NEXT:    [[TMP29:%.*]] = and <8 x i1> [[TMP24]], [[TMP12]]
-; AVX512-NEXT:    [[TMP30:%.*]] = and <8 x i1> [[TMP25]], [[TMP13]]
-; AVX512-NEXT:    [[TMP31:%.*]] = and <8 x i1> [[TMP26]], [[TMP14]]
-; AVX512-NEXT:    [[TMP32:%.*]] = and <8 x i1> [[TMP27]], [[TMP15]]
-; AVX512-NEXT:    [[TMP33:%.*]] = bitcast double* [[TMP28]] to <8 x double>*
-; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP29]])
-; AVX512-NEXT:    [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 8
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP19]], i32 8, <8 x i1> [[TMP11]], <8 x double*> undef)
+; AVX512-NEXT:    [[TMP20:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer
+; AVX512-NEXT:    [[TMP21:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD13]], zeroinitializer
+; AVX512-NEXT:    [[TMP22:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD14]], zeroinitializer
+; AVX512-NEXT:    [[TMP23:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD15]], zeroinitializer
+; AVX512-NEXT:    [[TMP24:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]]
+; AVX512-NEXT:    [[TMP25:%.*]] = and <8 x i1> [[TMP20]], [[TMP8]]
+; AVX512-NEXT:    [[TMP26:%.*]] = and <8 x i1> [[TMP21]], [[TMP9]]
+; AVX512-NEXT:    [[TMP27:%.*]] = and <8 x i1> [[TMP22]], [[TMP10]]
+; AVX512-NEXT:    [[TMP28:%.*]] = and <8 x i1> [[TMP23]], [[TMP11]]
+; AVX512-NEXT:    [[TMP29:%.*]] = bitcast double* [[TMP24]] to <8 x double>*
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP29]], i32 8, <8 x i1> [[TMP25]])
+; AVX512-NEXT:    [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 8
+; AVX512-NEXT:    [[TMP31:%.*]] = bitcast double* [[TMP30]] to <8 x double>*
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP31]], i32 8, <8 x i1> [[TMP26]])
+; AVX512-NEXT:    [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 16
+; AVX512-NEXT:    [[TMP33:%.*]] = bitcast double* [[TMP32]] to <8 x double>*
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP27]])
+; AVX512-NEXT:    [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 24
 ; AVX512-NEXT:    [[TMP35:%.*]] = bitcast double* [[TMP34]] to <8 x double>*
-; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP30]])
-; AVX512-NEXT:    [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 16
-; AVX512-NEXT:    [[TMP37:%.*]] = bitcast double* [[TMP36]] to <8 x double>*
-; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP37]], i32 8, <8 x i1> [[TMP31]])
-; AVX512-NEXT:    [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 24
-; AVX512-NEXT:    [[TMP39:%.*]] = bitcast double* [[TMP38]] to <8 x double>*
-; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP39]], i32 8, <8 x i1> [[TMP32]])
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP28]])
 ; AVX512-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 32
-; AVX512-NEXT:    [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX512-NEXT:    br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !63
+; AVX512-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; AVX512-NEXT:    br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !63
 ; AVX512:       middle.block:
 ; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]]
 ; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]]
@@ -3050,14 +3042,14 @@ define void @foo7(double* noalias %out, double** noalias %in, i8* noalias %trigg
 ; AVX512:       for.body:
 ; AVX512-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ]
 ; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX512-NEXT:    [[TMP41:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; AVX512-NEXT:    [[TMP42:%.*]] = and i8 [[TMP41]], 1
-; AVX512-NEXT:    [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0
+; AVX512-NEXT:    [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; AVX512-NEXT:    [[TMP38:%.*]] = and i8 [[TMP37]], 1
+; AVX512-NEXT:    [[TOBOOL:%.*]] = icmp eq i8 [[TMP38]], 0
 ; AVX512-NEXT:    br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
 ; AVX512:       land.lhs.true:
 ; AVX512-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[INDVARS_IV]]
-; AVX512-NEXT:    [[TMP43:%.*]] = load double*, double** [[ARRAYIDX2]], align 8
-; AVX512-NEXT:    [[CMP3:%.*]] = icmp eq double* [[TMP43]], null
+; AVX512-NEXT:    [[TMP39:%.*]] = load double*, double** [[ARRAYIDX2]], align 8
+; AVX512-NEXT:    [[CMP3:%.*]] = icmp eq double* [[TMP39]], null
 ; AVX512-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
 ; AVX512:       if.then:
 ; AVX512-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
@@ -3162,49 +3154,45 @@ define void @foo8(double* noalias %out, i32 ()** noalias %in, i8* noalias %trigg
 ; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 12
 ; AVX-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>*
 ; AVX-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1
-; AVX-NEXT:    [[TMP8:%.*]] = and <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
-; AVX-NEXT:    [[TMP9:%.*]] = and <4 x i8> [[WIDE_LOAD10]], <i8 1, i8 1, i8 1, i8 1>
-; AVX-NEXT:    [[TMP10:%.*]] = and <4 x i8> [[WIDE_LOAD11]], <i8 1, i8 1, i8 1, i8 1>
-; AVX-NEXT:    [[TMP11:%.*]] = and <4 x i8> [[WIDE_LOAD12]], <i8 1, i8 1, i8 1, i8 1>
-; AVX-NEXT:    [[TMP12:%.*]] = icmp ne <4 x i8> [[TMP8]], zeroinitializer
-; AVX-NEXT:    [[TMP13:%.*]] = icmp ne <4 x i8> [[TMP9]], zeroinitializer
-; AVX-NEXT:    [[TMP14:%.*]] = icmp ne <4 x i8> [[TMP10]], zeroinitializer
-; AVX-NEXT:    [[TMP15:%.*]] = icmp ne <4 x i8> [[TMP11]], zeroinitializer
-; AVX-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[INDEX]]
+; AVX-NEXT:    [[TMP8:%.*]] = trunc <4 x i8> [[WIDE_LOAD]] to <4 x i1>
+; AVX-NEXT:    [[TMP9:%.*]] = trunc <4 x i8> [[WIDE_LOAD10]] to <4 x i1>
+; AVX-NEXT:    [[TMP10:%.*]] = trunc <4 x i8> [[WIDE_LOAD11]] to <4 x i1>
+; AVX-NEXT:    [[TMP11:%.*]] = trunc <4 x i8> [[WIDE_LOAD12]] to <4 x i1>
+; AVX-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[INDEX]]
+; AVX-NEXT:    [[TMP13:%.*]] = bitcast i32 ()** [[TMP12]] to <4 x i32 ()*>*
+; AVX-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP13]], i32 8, <4 x i1> [[TMP8]], <4 x i32 ()*> undef)
+; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 4
+; AVX-NEXT:    [[TMP15:%.*]] = bitcast i32 ()** [[TMP14]] to <4 x i32 ()*>*
+; AVX-NEXT:    [[WIDE_MASKED_LOAD13:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP15]], i32 8, <4 x i1> [[TMP9]], <4 x i32 ()*> undef)
+; AVX-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 8
 ; AVX-NEXT:    [[TMP17:%.*]] = bitcast i32 ()** [[TMP16]] to <4 x i32 ()*>*
-; AVX-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP17]], i32 8, <4 x i1> [[TMP12]], <4 x i32 ()*> undef)
-; AVX-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 4
+; AVX-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP17]], i32 8, <4 x i1> [[TMP10]], <4 x i32 ()*> undef)
+; AVX-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 12
 ; AVX-NEXT:    [[TMP19:%.*]] = bitcast i32 ()** [[TMP18]] to <4 x i32 ()*>*
-; AVX-NEXT:    [[WIDE_MASKED_LOAD13:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP19]], i32 8, <4 x i1> [[TMP13]], <4 x i32 ()*> undef)
-; AVX-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 8
-; AVX-NEXT:    [[TMP21:%.*]] = bitcast i32 ()** [[TMP20]] to <4 x i32 ()*>*
-; AVX-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP21]], i32 8, <4 x i1> [[TMP14]], <4 x i32 ()*> undef)
-; AVX-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 12
-; AVX-NEXT:    [[TMP23:%.*]] = bitcast i32 ()** [[TMP22]] to <4 x i32 ()*>*
-; AVX-NEXT:    [[WIDE_MASKED_LOAD15:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP23]], i32 8, <4 x i1> [[TMP15]], <4 x i32 ()*> undef)
-; AVX-NEXT:    [[TMP24:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer
-; AVX-NEXT:    [[TMP25:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD13]], zeroinitializer
-; AVX-NEXT:    [[TMP26:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD14]], zeroinitializer
-; AVX-NEXT:    [[TMP27:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD15]], zeroinitializer
-; AVX-NEXT:    [[TMP28:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]]
-; AVX-NEXT:    [[TMP29:%.*]] = and <4 x i1> [[TMP24]], [[TMP12]]
-; AVX-NEXT:    [[TMP30:%.*]] = and <4 x i1> [[TMP25]], [[TMP13]]
-; AVX-NEXT:    [[TMP31:%.*]] = and <4 x i1> [[TMP26]], [[TMP14]]
-; AVX-NEXT:    [[TMP32:%.*]] = and <4 x i1> [[TMP27]], [[TMP15]]
-; AVX-NEXT:    [[TMP33:%.*]] = bitcast double* [[TMP28]] to <4 x double>*
-; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP29]])
-; AVX-NEXT:    [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 4
+; AVX-NEXT:    [[WIDE_MASKED_LOAD15:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP19]], i32 8, <4 x i1> [[TMP11]], <4 x i32 ()*> undef)
+; AVX-NEXT:    [[TMP20:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer
+; AVX-NEXT:    [[TMP21:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD13]], zeroinitializer
+; AVX-NEXT:    [[TMP22:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD14]], zeroinitializer
+; AVX-NEXT:    [[TMP23:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD15]], zeroinitializer
+; AVX-NEXT:    [[TMP24:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]]
+; AVX-NEXT:    [[TMP25:%.*]] = and <4 x i1> [[TMP20]], [[TMP8]]
+; AVX-NEXT:    [[TMP26:%.*]] = and <4 x i1> [[TMP21]], [[TMP9]]
+; AVX-NEXT:    [[TMP27:%.*]] = and <4 x i1> [[TMP22]], [[TMP10]]
+; AVX-NEXT:    [[TMP28:%.*]] = and <4 x i1> [[TMP23]], [[TMP11]]
+; AVX-NEXT:    [[TMP29:%.*]] = bitcast double* [[TMP24]] to <4 x double>*
+; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP29]], i32 8, <4 x i1> [[TMP25]])
+; AVX-NEXT:    [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 4
+; AVX-NEXT:    [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>*
+; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP26]])
+; AVX-NEXT:    [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 8
+; AVX-NEXT:    [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>*
+; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP27]])
+; AVX-NEXT:    [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 12
 ; AVX-NEXT:    [[TMP35:%.*]] = bitcast double* [[TMP34]] to <4 x double>*
-; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP30]])
-; AVX-NEXT:    [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 8
-; AVX-NEXT:    [[TMP37:%.*]] = bitcast double* [[TMP36]] to <4 x double>*
-; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP37]], i32 8, <4 x i1> [[TMP31]])
-; AVX-NEXT:    [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 12
-; AVX-NEXT:    [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>*
-; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP39]], i32 8, <4 x i1> [[TMP32]])
+; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP28]])
 ; AVX-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
-; AVX-NEXT:    [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX-NEXT:    br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !54
+; AVX-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; AVX-NEXT:    br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !54
 ; AVX:       middle.block:
 ; AVX-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]]
 ; AVX-NEXT:    br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]]
@@ -3214,14 +3202,14 @@ define void @foo8(double* noalias %out, i32 ()** noalias %in, i8* noalias %trigg
 ; AVX:       for.body:
 ; AVX-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ]
 ; AVX-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX-NEXT:    [[TMP41:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; AVX-NEXT:    [[TMP42:%.*]] = and i8 [[TMP41]], 1
-; AVX-NEXT:    [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0
+; AVX-NEXT:    [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; AVX-NEXT:    [[TMP38:%.*]] = and i8 [[TMP37]], 1
+; AVX-NEXT:    [[TOBOOL:%.*]] = icmp eq i8 [[TMP38]], 0
 ; AVX-NEXT:    br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
 ; AVX:       land.lhs.true:
 ; AVX-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[INDVARS_IV]]
-; AVX-NEXT:    [[TMP43:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8
-; AVX-NEXT:    [[CMP3:%.*]] = icmp eq i32 ()* [[TMP43]], null
+; AVX-NEXT:    [[TMP39:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8
+; AVX-NEXT:    [[CMP3:%.*]] = icmp eq i32 ()* [[TMP39]], null
 ; AVX-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
 ; AVX:       if.then:
 ; AVX-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
@@ -3259,49 +3247,45 @@ define void @foo8(double* noalias %out, i32 ()** noalias %in, i8* noalias %trigg
 ; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 24
 ; AVX512-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <8 x i8>*
 ; AVX512-NEXT:    [[WIDE_LOAD12:%.*]] = load <8 x i8>, <8 x i8>* [[TMP7]], align 1
-; AVX512-NEXT:    [[TMP8:%.*]] = and <8 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-; AVX512-NEXT:    [[TMP9:%.*]] = and <8 x i8> [[WIDE_LOAD10]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-; AVX512-NEXT:    [[TMP10:%.*]] = and <8 x i8> [[WIDE_LOAD11]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-; AVX512-NEXT:    [[TMP11:%.*]] = and <8 x i8> [[WIDE_LOAD12]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-; AVX512-NEXT:    [[TMP12:%.*]] = icmp ne <8 x i8> [[TMP8]], zeroinitializer
-; AVX512-NEXT:    [[TMP13:%.*]] = icmp ne <8 x i8> [[TMP9]], zeroinitializer
-; AVX512-NEXT:    [[TMP14:%.*]] = icmp ne <8 x i8> [[TMP10]], zeroinitializer
-; AVX512-NEXT:    [[TMP15:%.*]] = icmp ne <8 x i8> [[TMP11]], zeroinitializer
-; AVX512-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[INDEX]]
+; AVX512-NEXT:    [[TMP8:%.*]] = trunc <8 x i8> [[WIDE_LOAD]] to <8 x i1>
+; AVX512-NEXT:    [[TMP9:%.*]] = trunc <8 x i8> [[WIDE_LOAD10]] to <8 x i1>
+; AVX512-NEXT:    [[TMP10:%.*]] = trunc <8 x i8> [[WIDE_LOAD11]] to <8 x i1>
+; AVX512-NEXT:    [[TMP11:%.*]] = trunc <8 x i8> [[WIDE_LOAD12]] to <8 x i1>
+; AVX512-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[INDEX]]
+; AVX512-NEXT:    [[TMP13:%.*]] = bitcast i32 ()** [[TMP12]] to <8 x i32 ()*>*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP13]], i32 8, <8 x i1> [[TMP8]], <8 x i32 ()*> undef)
+; AVX512-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 8
+; AVX512-NEXT:    [[TMP15:%.*]] = bitcast i32 ()** [[TMP14]] to <8 x i32 ()*>*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD13:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP15]], i32 8, <8 x i1> [[TMP9]], <8 x i32 ()*> undef)
+; AVX512-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 16
 ; AVX512-NEXT:    [[TMP17:%.*]] = bitcast i32 ()** [[TMP16]] to <8 x i32 ()*>*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP17]], i32 8, <8 x i1> [[TMP12]], <8 x i32 ()*> undef)
-; AVX512-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 8
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP17]], i32 8, <8 x i1> [[TMP10]], <8 x i32 ()*> undef)
+; AVX512-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 24
 ; AVX512-NEXT:    [[TMP19:%.*]] = bitcast i32 ()** [[TMP18]] to <8 x i32 ()*>*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD13:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP19]], i32 8, <8 x i1> [[TMP13]], <8 x i32 ()*> undef)
-; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 16
-; AVX512-NEXT:    [[TMP21:%.*]] = bitcast i32 ()** [[TMP20]] to <8 x i32 ()*>*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP21]], i32 8, <8 x i1> [[TMP14]], <8 x i32 ()*> undef)
-; AVX512-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 24
-; AVX512-NEXT:    [[TMP23:%.*]] = bitcast i32 ()** [[TMP22]] to <8 x i32 ()*>*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP23]], i32 8, <8 x i1> [[TMP15]], <8 x i32 ()*> undef)
-; AVX512-NEXT:    [[TMP24:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer
-; AVX512-NEXT:    [[TMP25:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD13]], zeroinitializer
-; AVX512-NEXT:    [[TMP26:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD14]], zeroinitializer
-; AVX512-NEXT:    [[TMP27:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD15]], zeroinitializer
-; AVX512-NEXT:    [[TMP28:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]]
-; AVX512-NEXT:    [[TMP29:%.*]] = and <8 x i1> [[TMP24]], [[TMP12]]
-; AVX512-NEXT:    [[TMP30:%.*]] = and <8 x i1> [[TMP25]], [[TMP13]]
-; AVX512-NEXT:    [[TMP31:%.*]] = and <8 x i1> [[TMP26]], [[TMP14]]
-; AVX512-NEXT:    [[TMP32:%.*]] = and <8 x i1> [[TMP27]], [[TMP15]]
-; AVX512-NEXT:    [[TMP33:%.*]] = bitcast double* [[TMP28]] to <8 x double>*
-; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP29]])
-; AVX512-NEXT:    [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 8
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP19]], i32 8, <8 x i1> [[TMP11]], <8 x i32 ()*> undef)
+; AVX512-NEXT:    [[TMP20:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer
+; AVX512-NEXT:    [[TMP21:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD13]], zeroinitializer
+; AVX512-NEXT:    [[TMP22:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD14]], zeroinitializer
+; AVX512-NEXT:    [[TMP23:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD15]], zeroinitializer
+; AVX512-NEXT:    [[TMP24:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]]
+; AVX512-NEXT:    [[TMP25:%.*]] = and <8 x i1> [[TMP20]], [[TMP8]]
+; AVX512-NEXT:    [[TMP26:%.*]] = and <8 x i1> [[TMP21]], [[TMP9]]
+; AVX512-NEXT:    [[TMP27:%.*]] = and <8 x i1> [[TMP22]], [[TMP10]]
+; AVX512-NEXT:    [[TMP28:%.*]] = and <8 x i1> [[TMP23]], [[TMP11]]
+; AVX512-NEXT:    [[TMP29:%.*]] = bitcast double* [[TMP24]] to <8 x double>*
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP29]], i32 8, <8 x i1> [[TMP25]])
+; AVX512-NEXT:    [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 8
+; AVX512-NEXT:    [[TMP31:%.*]] = bitcast double* [[TMP30]] to <8 x double>*
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP31]], i32 8, <8 x i1> [[TMP26]])
+; AVX512-NEXT:    [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 16
+; AVX512-NEXT:    [[TMP33:%.*]] = bitcast double* [[TMP32]] to <8 x double>*
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP27]])
+; AVX512-NEXT:    [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 24
 ; AVX512-NEXT:    [[TMP35:%.*]] = bitcast double* [[TMP34]] to <8 x double>*
-; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP30]])
-; AVX512-NEXT:    [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 16
-; AVX512-NEXT:    [[TMP37:%.*]] = bitcast double* [[TMP36]] to <8 x double>*
-; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP37]], i32 8, <8 x i1> [[TMP31]])
-; AVX512-NEXT:    [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 24
-; AVX512-NEXT:    [[TMP39:%.*]] = bitcast double* [[TMP38]] to <8 x double>*
-; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP39]], i32 8, <8 x i1> [[TMP32]])
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP28]])
 ; AVX512-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 32
-; AVX512-NEXT:    [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX512-NEXT:    br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !66
+; AVX512-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; AVX512-NEXT:    br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !66
 ; AVX512:       middle.block:
 ; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]]
 ; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]]
@@ -3311,14 +3295,14 @@ define void @foo8(double* noalias %out, i32 ()** noalias %in, i8* noalias %trigg
 ; AVX512:       for.body:
 ; AVX512-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ]
 ; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX512-NEXT:    [[TMP41:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; AVX512-NEXT:    [[TMP42:%.*]] = and i8 [[TMP41]], 1
-; AVX512-NEXT:    [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0
+; AVX512-NEXT:    [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; AVX512-NEXT:    [[TMP38:%.*]] = and i8 [[TMP37]], 1
+; AVX512-NEXT:    [[TOBOOL:%.*]] = icmp eq i8 [[TMP38]], 0
 ; AVX512-NEXT:    br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
 ; AVX512:       land.lhs.true:
 ; AVX512-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[INDVARS_IV]]
-; AVX512-NEXT:    [[TMP43:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8
-; AVX512-NEXT:    [[CMP3:%.*]] = icmp eq i32 ()* [[TMP43]], null
+; AVX512-NEXT:    [[TMP39:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8
+; AVX512-NEXT:    [[CMP3:%.*]] = icmp eq i32 ()* [[TMP39]], null
 ; AVX512-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
 ; AVX512:       if.then:
 ; AVX512-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
-- 
GitLab


From 4c5954f7cf9763d09091f37fd909f70e21f96571 Mon Sep 17 00:00:00 2001
From: Roman Lebedev <lebedev.ri@gmail.com>
Date: Wed, 10 Oct 2018 20:50:52 +0000
Subject: [PATCH 0023/1116] [NFC][X86][AArch64] extract-bits.ll: add tests with
 constants+storing results.

As noted in https://reviews.llvm.org/D53080#inline-467678,
this *may* get pessimized by that diff.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344182 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/AArch64/extract-bits.ll |  90 ++++++++
 test/CodeGen/X86/extract-bits.ll     | 329 +++++++++++++++++++++++++++
 2 files changed, 419 insertions(+)

diff --git a/test/CodeGen/AArch64/extract-bits.ll b/test/CodeGen/AArch64/extract-bits.ll
index a60883b958e..21bebc67969 100644
--- a/test/CodeGen/AArch64/extract-bits.ll
+++ b/test/CodeGen/AArch64/extract-bits.ll
@@ -838,3 +838,93 @@ define i64 @c4_i64_bad(i64 %arg) {
   %tmp1 = and i64 %tmp0, 16382
   ret i64 %tmp1
 }
+
+; ---------------------------------------------------------------------------- ;
+; Constant, storing the result afterwards.
+; ---------------------------------------------------------------------------- ;
+
+; i32
+
+; The most canonical variant
+define void @c5_i32(i32 %arg, i32* %ptr) {
+; CHECK-LABEL: c5_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ubfx w8, w0, #19, #10
+; CHECK-NEXT:    str w8, [x1]
+; CHECK-NEXT:    ret
+  %tmp0 = lshr i32 %arg, 19
+  %tmp1 = and i32 %tmp0, 1023
+  store i32 %tmp1, i32* %ptr
+  ret void
+}
+
+; Should be still fine, but the mask is shifted
+define void @c6_i32(i32 %arg, i32* %ptr) {
+; CHECK-LABEL: c6_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ubfx w8, w0, #19, #12
+; CHECK-NEXT:    str w8, [x1]
+; CHECK-NEXT:    ret
+  %tmp0 = lshr i32 %arg, 19
+  %tmp1 = and i32 %tmp0, 4095
+  store i32 %tmp1, i32* %ptr
+  ret void
+}
+
+; Should be still fine, but the result is shifted left afterwards
+define void @c7_i32(i32 %arg, i32* %ptr) {
+; CHECK-LABEL: c7_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ubfx w8, w0, #19, #10
+; CHECK-NEXT:    lsl w8, w8, #2
+; CHECK-NEXT:    str w8, [x1]
+; CHECK-NEXT:    ret
+  %tmp0 = lshr i32 %arg, 19
+  %tmp1 = and i32 %tmp0, 1023
+  %tmp2 = shl i32 %tmp1, 2
+  store i32 %tmp2, i32* %ptr
+  ret void
+}
+
+; i64
+
+; The most canonical variant
+define void @c5_i64(i64 %arg, i64* %ptr) {
+; CHECK-LABEL: c5_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ubfx x8, x0, #51, #10
+; CHECK-NEXT:    str x8, [x1]
+; CHECK-NEXT:    ret
+  %tmp0 = lshr i64 %arg, 51
+  %tmp1 = and i64 %tmp0, 1023
+  store i64 %tmp1, i64* %ptr
+  ret void
+}
+
+; Should be still fine, but the mask is shifted
+define void @c6_i64(i64 %arg, i64* %ptr) {
+; CHECK-LABEL: c6_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ubfx x8, x0, #51, #12
+; CHECK-NEXT:    str x8, [x1]
+; CHECK-NEXT:    ret
+  %tmp0 = lshr i64 %arg, 51
+  %tmp1 = and i64 %tmp0, 4095
+  store i64 %tmp1, i64* %ptr
+  ret void
+}
+
+; Should be still fine, but the result is shifted left afterwards
+define void @c7_i64(i64 %arg, i64* %ptr) {
+; CHECK-LABEL: c7_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ubfx x8, x0, #51, #10
+; CHECK-NEXT:    lsl x8, x8, #2
+; CHECK-NEXT:    str x8, [x1]
+; CHECK-NEXT:    ret
+  %tmp0 = lshr i64 %arg, 51
+  %tmp1 = and i64 %tmp0, 1023
+  %tmp2 = shl i64 %tmp1, 2
+  store i64 %tmp2, i64* %ptr
+  ret void
+}
diff --git a/test/CodeGen/X86/extract-bits.ll b/test/CodeGen/X86/extract-bits.ll
index 6ee5b4a39a5..98c9ab271cb 100644
--- a/test/CodeGen/X86/extract-bits.ll
+++ b/test/CodeGen/X86/extract-bits.ll
@@ -5880,3 +5880,332 @@ define i64 @c4_i64_bad(i64 %arg) {
   %tmp1 = and i64 %tmp0, 16382
   ret i64 %tmp1
 }
+
+; ---------------------------------------------------------------------------- ;
+; Constant, storing the result afterwards.
+; ---------------------------------------------------------------------------- ;
+
+; i32
+
+; The most canonical variant
+define void @c5_i32(i32 %arg, i32* %ptr) {
+; X86-NOBMI-LABEL: c5_i32:
+; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    shrl $19, %ecx
+; X86-NOBMI-NEXT:    andl $1023, %ecx # imm = 0x3FF
+; X86-NOBMI-NEXT:    movl %ecx, (%eax)
+; X86-NOBMI-NEXT:    retl
+;
+; X86-BMI1NOTBM-LABEL: c5_i32:
+; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    movl $2579, %ecx # imm = 0xA13
+; X86-BMI1NOTBM-NEXT:    bextrl %ecx, {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    movl %ecx, (%eax)
+; X86-BMI1NOTBM-NEXT:    retl
+;
+; X86-BMI1TBM-LABEL: c5_i32:
+; X86-BMI1TBM:       # %bb.0:
+; X86-BMI1TBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1TBM-NEXT:    bextrl $2579, {{[0-9]+}}(%esp), %ecx # imm = 0xA13
+; X86-BMI1TBM-NEXT:    movl %ecx, (%eax)
+; X86-BMI1TBM-NEXT:    retl
+;
+; X86-BMI1NOTBMBMI2-LABEL: c5_i32:
+; X86-BMI1NOTBMBMI2:       # %bb.0:
+; X86-BMI1NOTBMBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBMBMI2-NEXT:    movl $2579, %ecx # imm = 0xA13
+; X86-BMI1NOTBMBMI2-NEXT:    bextrl %ecx, {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBMBMI2-NEXT:    movl %ecx, (%eax)
+; X86-BMI1NOTBMBMI2-NEXT:    retl
+;
+; X64-NOBMI-LABEL: c5_i32:
+; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    shrl $19, %edi
+; X64-NOBMI-NEXT:    andl $1023, %edi # imm = 0x3FF
+; X64-NOBMI-NEXT:    movl %edi, (%rsi)
+; X64-NOBMI-NEXT:    retq
+;
+; X64-BMI1NOTBM-LABEL: c5_i32:
+; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    movl $2579, %eax # imm = 0xA13
+; X64-BMI1NOTBM-NEXT:    bextrl %eax, %edi, %eax
+; X64-BMI1NOTBM-NEXT:    movl %eax, (%rsi)
+; X64-BMI1NOTBM-NEXT:    retq
+;
+; X64-BMI1TBM-LABEL: c5_i32:
+; X64-BMI1TBM:       # %bb.0:
+; X64-BMI1TBM-NEXT:    bextrl $2579, %edi, %eax # imm = 0xA13
+; X64-BMI1TBM-NEXT:    movl %eax, (%rsi)
+; X64-BMI1TBM-NEXT:    retq
+;
+; X64-BMI1NOTBMBMI2-LABEL: c5_i32:
+; X64-BMI1NOTBMBMI2:       # %bb.0:
+; X64-BMI1NOTBMBMI2-NEXT:    movl $2579, %eax # imm = 0xA13
+; X64-BMI1NOTBMBMI2-NEXT:    bextrl %eax, %edi, %eax
+; X64-BMI1NOTBMBMI2-NEXT:    movl %eax, (%rsi)
+; X64-BMI1NOTBMBMI2-NEXT:    retq
+  %tmp0 = lshr i32 %arg, 19
+  %tmp1 = and i32 %tmp0, 1023
+  store i32 %tmp1, i32* %ptr
+  ret void
+}
+
+; Should be still fine, but the mask is shifted
+define void @c6_i32(i32 %arg, i32* %ptr) {
+; X86-NOBMI-LABEL: c6_i32:
+; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    shrl $19, %ecx
+; X86-NOBMI-NEXT:    andl $4095, %ecx # imm = 0xFFF
+; X86-NOBMI-NEXT:    movl %ecx, (%eax)
+; X86-NOBMI-NEXT:    retl
+;
+; X86-BMI1NOTBM-LABEL: c6_i32:
+; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    movl $3091, %ecx # imm = 0xC13
+; X86-BMI1NOTBM-NEXT:    bextrl %ecx, {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    movl %ecx, (%eax)
+; X86-BMI1NOTBM-NEXT:    retl
+;
+; X86-BMI1TBM-LABEL: c6_i32:
+; X86-BMI1TBM:       # %bb.0:
+; X86-BMI1TBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1TBM-NEXT:    bextrl $3091, {{[0-9]+}}(%esp), %ecx # imm = 0xC13
+; X86-BMI1TBM-NEXT:    movl %ecx, (%eax)
+; X86-BMI1TBM-NEXT:    retl
+;
+; X86-BMI1NOTBMBMI2-LABEL: c6_i32:
+; X86-BMI1NOTBMBMI2:       # %bb.0:
+; X86-BMI1NOTBMBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBMBMI2-NEXT:    movl $3091, %ecx # imm = 0xC13
+; X86-BMI1NOTBMBMI2-NEXT:    bextrl %ecx, {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBMBMI2-NEXT:    movl %ecx, (%eax)
+; X86-BMI1NOTBMBMI2-NEXT:    retl
+;
+; X64-NOBMI-LABEL: c6_i32:
+; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    shrl $19, %edi
+; X64-NOBMI-NEXT:    andl $4095, %edi # imm = 0xFFF
+; X64-NOBMI-NEXT:    movl %edi, (%rsi)
+; X64-NOBMI-NEXT:    retq
+;
+; X64-BMI1NOTBM-LABEL: c6_i32:
+; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    movl $3091, %eax # imm = 0xC13
+; X64-BMI1NOTBM-NEXT:    bextrl %eax, %edi, %eax
+; X64-BMI1NOTBM-NEXT:    movl %eax, (%rsi)
+; X64-BMI1NOTBM-NEXT:    retq
+;
+; X64-BMI1TBM-LABEL: c6_i32:
+; X64-BMI1TBM:       # %bb.0:
+; X64-BMI1TBM-NEXT:    bextrl $3091, %edi, %eax # imm = 0xC13
+; X64-BMI1TBM-NEXT:    movl %eax, (%rsi)
+; X64-BMI1TBM-NEXT:    retq
+;
+; X64-BMI1NOTBMBMI2-LABEL: c6_i32:
+; X64-BMI1NOTBMBMI2:       # %bb.0:
+; X64-BMI1NOTBMBMI2-NEXT:    movl $3091, %eax # imm = 0xC13
+; X64-BMI1NOTBMBMI2-NEXT:    bextrl %eax, %edi, %eax
+; X64-BMI1NOTBMBMI2-NEXT:    movl %eax, (%rsi)
+; X64-BMI1NOTBMBMI2-NEXT:    retq
+  %tmp0 = lshr i32 %arg, 19
+  %tmp1 = and i32 %tmp0, 4095
+  store i32 %tmp1, i32* %ptr
+  ret void
+}
+
+; Should be still fine, but the result is shifted left afterwards
+define void @c7_i32(i32 %arg, i32* %ptr) {
+; X86-LABEL: c7_i32:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    shrl $17, %ecx
+; X86-NEXT:    andl $4092, %ecx # imm = 0xFFC
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    retl
+;
+; X64-LABEL: c7_i32:
+; X64:       # %bb.0:
+; X64-NEXT:    shrl $17, %edi
+; X64-NEXT:    andl $4092, %edi # imm = 0xFFC
+; X64-NEXT:    movl %edi, (%rsi)
+; X64-NEXT:    retq
+  %tmp0 = lshr i32 %arg, 19
+  %tmp1 = and i32 %tmp0, 1023
+  %tmp2 = shl i32 %tmp1, 2
+  store i32 %tmp2, i32* %ptr
+  ret void
+}
+
+; i64
+
+; The most canonical variant
+define void @c5_i64(i64 %arg, i64* %ptr) {
+; X86-NOBMI-LABEL: c5_i64:
+; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    shrl $19, %ecx
+; X86-NOBMI-NEXT:    andl $1023, %ecx # imm = 0x3FF
+; X86-NOBMI-NEXT:    movl %ecx, (%eax)
+; X86-NOBMI-NEXT:    movl $0, 4(%eax)
+; X86-NOBMI-NEXT:    retl
+;
+; X86-BMI1NOTBM-LABEL: c5_i64:
+; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    movl $2579, %ecx # imm = 0xA13
+; X86-BMI1NOTBM-NEXT:    bextrl %ecx, {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    movl %ecx, (%eax)
+; X86-BMI1NOTBM-NEXT:    movl $0, 4(%eax)
+; X86-BMI1NOTBM-NEXT:    retl
+;
+; X86-BMI1TBM-LABEL: c5_i64:
+; X86-BMI1TBM:       # %bb.0:
+; X86-BMI1TBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1TBM-NEXT:    bextrl $2579, {{[0-9]+}}(%esp), %ecx # imm = 0xA13
+; X86-BMI1TBM-NEXT:    movl %ecx, (%eax)
+; X86-BMI1TBM-NEXT:    movl $0, 4(%eax)
+; X86-BMI1TBM-NEXT:    retl
+;
+; X86-BMI1NOTBMBMI2-LABEL: c5_i64:
+; X86-BMI1NOTBMBMI2:       # %bb.0:
+; X86-BMI1NOTBMBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBMBMI2-NEXT:    movl $2579, %ecx # imm = 0xA13
+; X86-BMI1NOTBMBMI2-NEXT:    bextrl %ecx, {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBMBMI2-NEXT:    movl %ecx, (%eax)
+; X86-BMI1NOTBMBMI2-NEXT:    movl $0, 4(%eax)
+; X86-BMI1NOTBMBMI2-NEXT:    retl
+;
+; X64-NOBMI-LABEL: c5_i64:
+; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    shrq $51, %rdi
+; X64-NOBMI-NEXT:    andl $1023, %edi # imm = 0x3FF
+; X64-NOBMI-NEXT:    movq %rdi, (%rsi)
+; X64-NOBMI-NEXT:    retq
+;
+; X64-BMI1NOTBM-LABEL: c5_i64:
+; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    movl $2611, %eax # imm = 0xA33
+; X64-BMI1NOTBM-NEXT:    bextrq %rax, %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    movq %rax, (%rsi)
+; X64-BMI1NOTBM-NEXT:    retq
+;
+; X64-BMI1TBM-LABEL: c5_i64:
+; X64-BMI1TBM:       # %bb.0:
+; X64-BMI1TBM-NEXT:    bextrq $2611, %rdi, %rax # imm = 0xA33
+; X64-BMI1TBM-NEXT:    movq %rax, (%rsi)
+; X64-BMI1TBM-NEXT:    retq
+;
+; X64-BMI1NOTBMBMI2-LABEL: c5_i64:
+; X64-BMI1NOTBMBMI2:       # %bb.0:
+; X64-BMI1NOTBMBMI2-NEXT:    movl $2611, %eax # imm = 0xA33
+; X64-BMI1NOTBMBMI2-NEXT:    bextrq %rax, %rdi, %rax
+; X64-BMI1NOTBMBMI2-NEXT:    movq %rax, (%rsi)
+; X64-BMI1NOTBMBMI2-NEXT:    retq
+  %tmp0 = lshr i64 %arg, 51
+  %tmp1 = and i64 %tmp0, 1023
+  store i64 %tmp1, i64* %ptr
+  ret void
+}
+
+; Should be still fine, but the mask is shifted
+define void @c6_i64(i64 %arg, i64* %ptr) {
+; X86-NOBMI-LABEL: c6_i64:
+; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    shrl $19, %ecx
+; X86-NOBMI-NEXT:    andl $4095, %ecx # imm = 0xFFF
+; X86-NOBMI-NEXT:    movl %ecx, (%eax)
+; X86-NOBMI-NEXT:    movl $0, 4(%eax)
+; X86-NOBMI-NEXT:    retl
+;
+; X86-BMI1NOTBM-LABEL: c6_i64:
+; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    movl $3091, %ecx # imm = 0xC13
+; X86-BMI1NOTBM-NEXT:    bextrl %ecx, {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    movl %ecx, (%eax)
+; X86-BMI1NOTBM-NEXT:    movl $0, 4(%eax)
+; X86-BMI1NOTBM-NEXT:    retl
+;
+; X86-BMI1TBM-LABEL: c6_i64:
+; X86-BMI1TBM:       # %bb.0:
+; X86-BMI1TBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1TBM-NEXT:    bextrl $3091, {{[0-9]+}}(%esp), %ecx # imm = 0xC13
+; X86-BMI1TBM-NEXT:    movl %ecx, (%eax)
+; X86-BMI1TBM-NEXT:    movl $0, 4(%eax)
+; X86-BMI1TBM-NEXT:    retl
+;
+; X86-BMI1NOTBMBMI2-LABEL: c6_i64:
+; X86-BMI1NOTBMBMI2:       # %bb.0:
+; X86-BMI1NOTBMBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBMBMI2-NEXT:    movl $3091, %ecx # imm = 0xC13
+; X86-BMI1NOTBMBMI2-NEXT:    bextrl %ecx, {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBMBMI2-NEXT:    movl %ecx, (%eax)
+; X86-BMI1NOTBMBMI2-NEXT:    movl $0, 4(%eax)
+; X86-BMI1NOTBMBMI2-NEXT:    retl
+;
+; X64-NOBMI-LABEL: c6_i64:
+; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    shrq $51, %rdi
+; X64-NOBMI-NEXT:    andl $4095, %edi # imm = 0xFFF
+; X64-NOBMI-NEXT:    movq %rdi, (%rsi)
+; X64-NOBMI-NEXT:    retq
+;
+; X64-BMI1NOTBM-LABEL: c6_i64:
+; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    movl $3123, %eax # imm = 0xC33
+; X64-BMI1NOTBM-NEXT:    bextrq %rax, %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    movq %rax, (%rsi)
+; X64-BMI1NOTBM-NEXT:    retq
+;
+; X64-BMI1TBM-LABEL: c6_i64:
+; X64-BMI1TBM:       # %bb.0:
+; X64-BMI1TBM-NEXT:    bextrq $3123, %rdi, %rax # imm = 0xC33
+; X64-BMI1TBM-NEXT:    movq %rax, (%rsi)
+; X64-BMI1TBM-NEXT:    retq
+;
+; X64-BMI1NOTBMBMI2-LABEL: c6_i64:
+; X64-BMI1NOTBMBMI2:       # %bb.0:
+; X64-BMI1NOTBMBMI2-NEXT:    movl $3123, %eax # imm = 0xC33
+; X64-BMI1NOTBMBMI2-NEXT:    bextrq %rax, %rdi, %rax
+; X64-BMI1NOTBMBMI2-NEXT:    movq %rax, (%rsi)
+; X64-BMI1NOTBMBMI2-NEXT:    retq
+  %tmp0 = lshr i64 %arg, 51
+  %tmp1 = and i64 %tmp0, 4095
+  store i64 %tmp1, i64* %ptr
+  ret void
+}
+
+; Should be still fine, but the result is shifted left afterwards
+define void @c7_i64(i64 %arg, i64* %ptr) {
+; X86-LABEL: c7_i64:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    shrl $17, %ecx
+; X86-NEXT:    andl $4092, %ecx # imm = 0xFFC
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl $0, 4(%eax)
+; X86-NEXT:    retl
+;
+; X64-LABEL: c7_i64:
+; X64:       # %bb.0:
+; X64-NEXT:    shrq $49, %rdi
+; X64-NEXT:    andl $4092, %edi # imm = 0xFFC
+; X64-NEXT:    movq %rdi, (%rsi)
+; X64-NEXT:    retq
+  %tmp0 = lshr i64 %arg, 51
+  %tmp1 = and i64 %tmp0, 1023
+  %tmp2 = shl i64 %tmp1, 2
+  store i64 %tmp2, i64* %ptr
+  ret void
+}
-- 
GitLab


From 742beb6d7db253a3fbbdcd398f0fa1dfd9fc82e6 Mon Sep 17 00:00:00 2001
From: James Y Knight <jyknight@google.com>
Date: Wed, 10 Oct 2018 21:07:02 +0000
Subject: [PATCH 0024/1116] llvm-ar: Darwin archive format fixes.

* Support writing the DARWIN64 symbol table format.

* In darwin archives, emit a symbol table whenever requested, even
  when there are no members, as the apple linker will abort if given
  an archive without a symbol table.

Added tests for same, and also simplified and moved the GNU 64-bit
symbol table test into archive-symtab.test.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344183 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Object/ArchiveWriter.cpp         | 61 ++++++++++++++++------------
 test/Object/archive-GNU64-write.test | 40 ------------------
 test/Object/archive-format.test      |  2 +-
 test/Object/archive-symtab.test      | 28 +++++++++++++
 4 files changed, 64 insertions(+), 67 deletions(-)
 delete mode 100644 test/Object/archive-GNU64-write.test

diff --git a/lib/Object/ArchiveWriter.cpp b/lib/Object/ArchiveWriter.cpp
index c6c0befb90f..767205390e0 100644
--- a/lib/Object/ArchiveWriter.cpp
+++ b/lib/Object/ArchiveWriter.cpp
@@ -121,6 +121,11 @@ static void printWithSpacePadding(raw_ostream &OS, T Data, unsigned Size) {
   OS.indent(Size - SizeSoFar);
 }
 
+static bool isDarwin(object::Archive::Kind Kind) {
+  return Kind == object::Archive::K_DARWIN ||
+         Kind == object::Archive::K_DARWIN64;
+}
+
 static bool isBSDLike(object::Archive::Kind Kind) {
   switch (Kind) {
   case object::Archive::K_GNU:
@@ -128,8 +133,8 @@ static bool isBSDLike(object::Archive::Kind Kind) {
     return false;
   case object::Archive::K_BSD:
   case object::Archive::K_DARWIN:
-    return true;
   case object::Archive::K_DARWIN64:
+    return true;
   case object::Archive::K_COFF:
     break;
   }
@@ -314,7 +319,9 @@ static void printNBits(raw_ostream &Out, object::Archive::Kind Kind,
 static void writeSymbolTable(raw_ostream &Out, object::Archive::Kind Kind,
                              bool Deterministic, ArrayRef<MemberData> Members,
                              StringRef StringTable) {
-  if (StringTable.empty())
+  // We don't write a symbol table on an archive with no members -- except on
+  // Darwin, where the linker will abort unless the archive has a symbol table.
+  if (StringTable.empty() && !isDarwin(Kind))
     return;
 
   unsigned NumSyms = 0;
@@ -322,15 +329,15 @@ static void writeSymbolTable(raw_ostream &Out, object::Archive::Kind Kind,
     NumSyms += M.Symbols.size();
 
   unsigned Size = 0;
-  Size += is64BitKind(Kind) ? 8 : 4; // Number of entries
+  unsigned OffsetSize = is64BitKind(Kind) ? sizeof(uint64_t) : sizeof(uint32_t);
+
+  Size += OffsetSize; // Number of entries
   if (isBSDLike(Kind))
-    Size += NumSyms * 8; // Table
-  else if (is64BitKind(Kind))
-    Size += NumSyms * 8; // Table
+    Size += NumSyms * OffsetSize * 2; // Table
   else
-    Size += NumSyms * 4; // Table
+    Size += NumSyms * OffsetSize; // Table
   if (isBSDLike(Kind))
-    Size += 4; // byte count
+    Size += OffsetSize; // byte count
   Size += StringTable.size();
   // ld64 expects the members to be 8-byte aligned for 64-bit content and at
   // least 4-byte aligned for 32-bit content.  Opt for the larger encoding
@@ -340,25 +347,26 @@ static void writeSymbolTable(raw_ostream &Out, object::Archive::Kind Kind,
   unsigned Pad = OffsetToAlignment(Size, Alignment);
   Size += Pad;
 
-  if (isBSDLike(Kind))
-    printBSDMemberHeader(Out, Out.tell(), "__.SYMDEF", now(Deterministic), 0, 0,
-                         0, Size);
-  else if (is64BitKind(Kind))
-    printGNUSmallMemberHeader(Out, "/SYM64", now(Deterministic), 0, 0, 0, Size);
-  else
-    printGNUSmallMemberHeader(Out, "", now(Deterministic), 0, 0, 0, Size);
+  if (isBSDLike(Kind)) {
+    const char *Name = is64BitKind(Kind) ? "__.SYMDEF_64" : "__.SYMDEF";
+    printBSDMemberHeader(Out, Out.tell(), Name, now(Deterministic), 0, 0, 0,
+                         Size);
+  } else {
+    const char *Name = is64BitKind(Kind) ? "/SYM64" : "";
+    printGNUSmallMemberHeader(Out, Name, now(Deterministic), 0, 0, 0, Size);
+  }
 
   uint64_t Pos = Out.tell() + Size;
 
   if (isBSDLike(Kind))
-    print<uint32_t>(Out, Kind, NumSyms * 8);
+    printNBits(Out, Kind, NumSyms * 2 * OffsetSize);
   else
     printNBits(Out, Kind, NumSyms);
 
   for (const MemberData &M : Members) {
     for (unsigned StringOffset : M.Symbols) {
       if (isBSDLike(Kind))
-        print<uint32_t>(Out, Kind, StringOffset);
+        printNBits(Out, Kind, StringOffset);
       printNBits(Out, Kind, Pos); // member offset
     }
     Pos += M.Header.size() + M.Data.size() + M.Padding.size();
@@ -366,7 +374,7 @@ static void writeSymbolTable(raw_ostream &Out, object::Archive::Kind Kind,
 
   if (isBSDLike(Kind))
     // byte count of the string table
-    print<uint32_t>(Out, Kind, StringTable.size());
+    printNBits(Out, Kind, StringTable.size());
   Out << StringTable;
 
   while (Pad--)
@@ -466,9 +474,7 @@ computeMemberData(raw_ostream &StringTable, raw_ostream &SymNames,
   // See also the functions that handle the lookup:
   // in lldb: ObjectContainerBSDArchive::Archive::FindObject()
   // in llvm/tools/dsymutil: BinaryHolder::GetArchiveMemberBuffers().
-  bool UniqueTimestamps =
-      Deterministic && (Kind == object::Archive::K_DARWIN ||
-                        Kind == object::Archive::K_DARWIN64);
+  bool UniqueTimestamps = Deterministic && isDarwin(Kind);
   std::map<StringRef, unsigned> FilenameCount;
   if (UniqueTimestamps) {
     for (const NewArchiveMember &M : NewMembers)
@@ -488,9 +494,8 @@ computeMemberData(raw_ostream &StringTable, raw_ostream &SymNames,
     // least 4-byte aligned for 32-bit content.  Opt for the larger encoding
     // uniformly.  This matches the behaviour with cctools and ensures that ld64
     // is happy with archives that we generate.
-    unsigned MemberPadding = Kind == object::Archive::K_DARWIN
-                                 ? OffsetToAlignment(Data.size(), 8)
-                                 : 0;
+    unsigned MemberPadding =
+        isDarwin(Kind) ? OffsetToAlignment(Data.size(), 8) : 0;
     unsigned TailPadding = OffsetToAlignment(Data.size() + MemberPadding, 2);
     StringRef Padding = StringRef(PaddingData, MemberPadding + TailPadding);
 
@@ -569,8 +574,12 @@ Error llvm::writeArchive(StringRef ArcName,
     // If LastOffset isn't going to fit in a 32-bit varible we need to switch
     // to 64-bit. Note that the file can be larger than 4GB as long as the last
     // member starts before the 4GB offset.
-    if (LastOffset >= (1ULL << Sym64Threshold))
-      Kind = object::Archive::K_GNU64;
+    if (LastOffset >= (1ULL << Sym64Threshold)) {
+      if (Kind == object::Archive::K_DARWIN)
+        Kind = object::Archive::K_DARWIN64;
+      else
+        Kind = object::Archive::K_GNU64;
+    }
   }
 
   Expected<sys::fs::TempFile> Temp =
diff --git a/test/Object/archive-GNU64-write.test b/test/Object/archive-GNU64-write.test
deleted file mode 100644
index 0bfb7c80d05..00000000000
--- a/test/Object/archive-GNU64-write.test
+++ /dev/null
@@ -1,40 +0,0 @@
-# REQUIRES: llvm-64-bits
-# REQUIRES: system-linux
-# REQUIRES: shell
-
-# RUN: yaml2obj %s > %t
-# RUN: dd if=%t of=%t bs=1 count=0 seek=1M
-# RUN: rm -f %t.lib
-# RUN: cp %t %t2
-# RUN: SYM64_THRESHOLD=19 llvm-ar cr %t.lib %t %t2 %p/Inputs/trivial-object-test.elf-x86-64
-# RUN: llvm-nm --print-armap %t.lib | FileCheck %s
-# RUN: grep SYM64 %t.lib
-
-# Delete temp files. They are too large.
-# RUN: rm -f %t %t2 %t.lib
-
-!ELF
-FileHeader:
-  Class:           ELFCLASS64
-  Data:            ELFDATA2LSB
-  Type:            ET_EXEC
-  Machine:         EM_X86_64
-Sections:
-  - Name:            .data
-    Type:            SHT_PROGBITS
-    Flags:           [ SHF_ALLOC ]
-    AddressAlign:    0x0000000000000001
-    Content:         "00"
-    Size:            32
-
-# CHECK:      Archive map
-# CHECK-NEXT: main in trivial-object-test.elf-x86-64
-
-# CHECK:    archive-GNU64-write.test.tmp:
-
-# CHECK:    archive-GNU64-write.test.tmp2:
-
-# CHECK:    trivial-object-test.elf-x86-64:
-# CHECK-NEXT:                     U SomeOtherFunction
-# CHECK-NEXT:    0000000000000000 T main
-# CHECK-NEXT:                     U puts
diff --git a/test/Object/archive-format.test b/test/Object/archive-format.test
index 219fc7f894a..b1ae411161b 100644
--- a/test/Object/archive-format.test
+++ b/test/Object/archive-format.test
@@ -38,7 +38,7 @@ BSD-SAME: #1/16           0           0     0     644     20        `
 BSD-NEXT: 0123456789abcdefzed.
 
 RUN: rm -f %t.a
-RUN: llvm-ar --format=darwin rc %t.a 0123456789abcde 0123456789abcdef
+RUN: llvm-ar --format=darwin rcS %t.a 0123456789abcde 0123456789abcdef
 RUN: cat %t.a | FileCheck -strict-whitespace --check-prefix=DARWIN %s
 
 DARWIN:      !<arch>
diff --git a/test/Object/archive-symtab.test b/test/Object/archive-symtab.test
index 297970725bd..96f48139ddd 100644
--- a/test/Object/archive-symtab.test
+++ b/test/Object/archive-symtab.test
@@ -2,6 +2,11 @@ RUN: rm -f %t.a
 RUN: llvm-ar rcsU %t.a %p/Inputs/trivial-object-test.elf-x86-64 %p/Inputs/trivial-object-test2.elf-x86-64
 RUN: llvm-nm -M %t.a | FileCheck %s
 
+RUN: rm -f %t.a
+RUN: env SYM64_THRESHOLD=1 llvm-ar rcsU %t.a %p/Inputs/trivial-object-test.elf-x86-64 %p/Inputs/trivial-object-test2.elf-x86-64
+RUN: llvm-nm -M %t.a | FileCheck %s
+RUXX: grep SYM64 %t.a
+
 CHECK: Archive map
 CHECK-NEXT: main in trivial-object-test.elf-x86-64
 CHECK-NEXT: foo in trivial-object-test2.elf-x86-64
@@ -82,6 +87,11 @@ RUN: rm -f %t.a
 RUN: llvm-ar --format=bsd rcsU %t.a %p/Inputs/trivial-object-test.macho-x86-64 %p/Inputs/trivial-object-test2.macho-x86-64
 RUN: llvm-nm -M %t.a | FileCheck --check-prefix=MACHO %s
 
+RUN: rm -f %t.a
+RUN: env SYM64_THRESHOLD=1 llvm-ar --format=darwin rcsU %t.a %p/Inputs/trivial-object-test.macho-x86-64 %p/Inputs/trivial-object-test2.macho-x86-64
+RUN: llvm-nm -M %t.a | FileCheck --check-prefix=MACHO %s
+RUN: grep '__\.SYMDEF_64' %t.a
+
 MACHO: Archive map
 MACHO-NEXT: _main in trivial-object-test.macho-x86-64
 MACHO-NEXT: _foo in trivial-object-test2.macho-x86-64
@@ -138,3 +148,21 @@ RUN: llvm-ar --format=gnu rcsD %t.a %p/Inputs/trivial-object-test.macho-x86-64
 RUN: FileCheck --check-prefix=GNU-SYMTAB-ALIGN %s < %t.a
 GNU-SYMTAB-ALIGN: !<arch>
 GNU-SYMTAB-ALIGN-NEXT: /               0           0     0     0       14        `
+
+
+** Test the behavior of an empty archive:
+
+No symbol table emitted for GNU archives
+RUN: rm -f %t.a
+RUN: llvm-ar rcs --format=gnu %t.a
+RUN: not grep -q '/               ' %t.a
+
+No symbol table for BSD archives
+RUN: rm -f %t.a
+RUN: llvm-ar rcs --format=bsd %t.a
+RUN: not grep -q '__\.SYMDEF' %t.a
+
+And we do emit a symbol table for DARWIN archives
+RUN: rm -f %t.a
+RUN: llvm-ar rcs --format=darwin %t.a
+RUN: grep -q '__\.SYMDEF' %t.a
-- 
GitLab


From 3b1e430b900c604ea3c9e3dd1ceee07db3338a5e Mon Sep 17 00:00:00 2001
From: Richard Smith <richard-llvm@metafoo.co.uk>
Date: Wed, 10 Oct 2018 21:09:37 +0000
Subject: [PATCH 0025/1116] Support for remapping profile data when symbols
 change, for instrumentation-based profiling.

Reviewers: davidxl, tejohnson, dlj, erik.pilkington

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D51247

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344184 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/ProfileData/InstrProfReader.h |  30 +++-
 lib/ProfileData/InstrProfReader.cpp        | 164 +++++++++++++++++++--
 unittests/ProfileData/InstrProfTest.cpp    |  44 +++++-
 3 files changed, 223 insertions(+), 15 deletions(-)

diff --git a/include/llvm/ProfileData/InstrProfReader.h b/include/llvm/ProfileData/InstrProfReader.h
index efc22dcd0d9..08d78227611 100644
--- a/include/llvm/ProfileData/InstrProfReader.h
+++ b/include/llvm/ProfileData/InstrProfReader.h
@@ -348,6 +348,9 @@ struct InstrProfReaderIndexBase {
 using OnDiskHashTableImplV3 =
     OnDiskIterableChainedHashTable<InstrProfLookupTrait>;
 
+template <typename HashTableImpl>
+class InstrProfReaderItaniumRemapper;
+
 template <typename HashTableImpl>
 class InstrProfReaderIndex : public InstrProfReaderIndexBase {
 private:
@@ -355,6 +358,8 @@ private:
   typename HashTableImpl::data_iterator RecordIterator;
   uint64_t FormatVersion;
 
+  friend class InstrProfReaderItaniumRemapper<HashTableImpl>;
+
 public:
   InstrProfReaderIndex(const unsigned char *Buckets,
                        const unsigned char *const Payload,
@@ -386,13 +391,26 @@ public:
   }
 };
 
+/// Name matcher supporting fuzzy matching of symbol names to names in profiles.
+class InstrProfReaderRemapper {
+public:
+  virtual ~InstrProfReaderRemapper() {}
+  virtual Error populateRemappings() { return Error::success(); }
+  virtual Error getRecords(StringRef FuncName,
+                           ArrayRef<NamedInstrProfRecord> &Data) = 0;
+};
+
 /// Reader for the indexed binary instrprof format.
 class IndexedInstrProfReader : public InstrProfReader {
 private:
   /// The profile data file contents.
   std::unique_ptr<MemoryBuffer> DataBuffer;
+  /// The profile remapping file contents.
+  std::unique_ptr<MemoryBuffer> RemappingBuffer;
   /// The index into the profile data.
   std::unique_ptr<InstrProfReaderIndexBase> Index;
+  /// The profile remapping file contents.
+  std::unique_ptr<InstrProfReaderRemapper> Remapper;
   /// Profile summary data.
   std::unique_ptr<ProfileSummary> Summary;
   // Index to the current record in the record array.
@@ -404,8 +422,11 @@ private:
                                    const unsigned char *Cur);
 
 public:
-  IndexedInstrProfReader(std::unique_ptr<MemoryBuffer> DataBuffer)
-      : DataBuffer(std::move(DataBuffer)), RecordIndex(0) {}
+  IndexedInstrProfReader(
+      std::unique_ptr<MemoryBuffer> DataBuffer,
+      std::unique_ptr<MemoryBuffer> RemappingBuffer = nullptr)
+      : DataBuffer(std::move(DataBuffer)),
+        RemappingBuffer(std::move(RemappingBuffer)), RecordIndex(0) {}
   IndexedInstrProfReader(const IndexedInstrProfReader &) = delete;
   IndexedInstrProfReader &operator=(const IndexedInstrProfReader &) = delete;
 
@@ -434,10 +455,11 @@ public:
 
   /// Factory method to create an indexed reader.
   static Expected<std::unique_ptr<IndexedInstrProfReader>>
-  create(const Twine &Path);
+  create(const Twine &Path, const Twine &RemappingPath = "");
 
   static Expected<std::unique_ptr<IndexedInstrProfReader>>
-  create(std::unique_ptr<MemoryBuffer> Buffer);
+  create(std::unique_ptr<MemoryBuffer> Buffer,
+         std::unique_ptr<MemoryBuffer> RemappingBuffer = nullptr);
 
   // Used for testing purpose only.
   void setValueProfDataEndianness(support::endianness Endianness) {
diff --git a/lib/ProfileData/InstrProfReader.cpp b/lib/ProfileData/InstrProfReader.cpp
index 3b704158a5c..eaf0eb04bfb 100644
--- a/lib/ProfileData/InstrProfReader.cpp
+++ b/lib/ProfileData/InstrProfReader.cpp
@@ -14,6 +14,7 @@
 
 #include "llvm/ProfileData/InstrProfReader.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/IR/ProfileSummary.h"
@@ -23,6 +24,7 @@
 #include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorOr.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SymbolRemappingReader.h"
 #include "llvm/Support/SwapByteOrder.h"
 #include <algorithm>
 #include <cctype>
@@ -88,16 +90,29 @@ InstrProfReader::create(std::unique_ptr<MemoryBuffer> Buffer) {
 }
 
 Expected<std::unique_ptr<IndexedInstrProfReader>>
-IndexedInstrProfReader::create(const Twine &Path) {
+IndexedInstrProfReader::create(const Twine &Path, const Twine &RemappingPath) {
   // Set up the buffer to read.
   auto BufferOrError = setupMemoryBuffer(Path);
   if (Error E = BufferOrError.takeError())
     return std::move(E);
-  return IndexedInstrProfReader::create(std::move(BufferOrError.get()));
+
+  // Set up the remapping buffer if requested.
+  std::unique_ptr<MemoryBuffer> RemappingBuffer;
+  std::string RemappingPathStr = RemappingPath.str();
+  if (!RemappingPathStr.empty()) {
+    auto RemappingBufferOrError = setupMemoryBuffer(RemappingPathStr);
+    if (Error E = RemappingBufferOrError.takeError())
+      return std::move(E);
+    RemappingBuffer = std::move(RemappingBufferOrError.get());
+  }
+
+  return IndexedInstrProfReader::create(std::move(BufferOrError.get()),
+                                        std::move(RemappingBuffer));
 }
 
 Expected<std::unique_ptr<IndexedInstrProfReader>>
-IndexedInstrProfReader::create(std::unique_ptr<MemoryBuffer> Buffer) {
+IndexedInstrProfReader::create(std::unique_ptr<MemoryBuffer> Buffer,
+                               std::unique_ptr<MemoryBuffer> RemappingBuffer) {
   // Sanity check the buffer.
   if (uint64_t(Buffer->getBufferSize()) > std::numeric_limits<unsigned>::max())
     return make_error<InstrProfError>(instrprof_error::too_large);
@@ -105,7 +120,8 @@ IndexedInstrProfReader::create(std::unique_ptr<MemoryBuffer> Buffer) {
   // Create the reader.
   if (!IndexedInstrProfReader::hasFormat(*Buffer))
     return make_error<InstrProfError>(instrprof_error::bad_magic);
-  auto Result = llvm::make_unique<IndexedInstrProfReader>(std::move(Buffer));
+  auto Result = llvm::make_unique<IndexedInstrProfReader>(
+      std::move(Buffer), std::move(RemappingBuffer));
 
   // Initialize the reader and return the result.
   if (Error E = initializeReader(*Result))
@@ -587,6 +603,124 @@ InstrProfReaderIndex<HashTableImpl>::InstrProfReaderIndex(
   RecordIterator = HashTable->data_begin();
 }
 
+namespace {
+/// A remapper that does not apply any remappings.
+class InstrProfReaderNullRemapper : public InstrProfReaderRemapper {
+  InstrProfReaderIndexBase &Underlying;
+
+public:
+  InstrProfReaderNullRemapper(InstrProfReaderIndexBase &Underlying)
+      : Underlying(Underlying) {}
+
+  Error getRecords(StringRef FuncName,
+                   ArrayRef<NamedInstrProfRecord> &Data) override {
+    return Underlying.getRecords(FuncName, Data);
+  }
+};
+}
+
+/// A remapper that applies remappings based on a symbol remapping file.
+template <typename HashTableImpl>
+class llvm::InstrProfReaderItaniumRemapper
+    : public InstrProfReaderRemapper {
+public:
+  InstrProfReaderItaniumRemapper(
+      std::unique_ptr<MemoryBuffer> RemapBuffer,
+      InstrProfReaderIndex<HashTableImpl> &Underlying)
+      : RemapBuffer(std::move(RemapBuffer)), Underlying(Underlying) {
+  }
+
+  /// Extract the original function name from a PGO function name.
+  static StringRef extractName(StringRef Name) {
+    // We can have multiple :-separated pieces; there can be pieces both
+    // before and after the mangled name. Find the first part that starts
+    // with '_Z'; we'll assume that's the mangled name we want.
+    std::pair<StringRef, StringRef> Parts = {StringRef(), Name};
+    while (true) {
+      Parts = Parts.second.split(':');
+      if (Parts.first.startswith("_Z"))
+        return Parts.first;
+      if (Parts.second.empty())
+        return Name;
+    }
+  }
+
+  /// Given a mangled name extracted from a PGO function name, and a new
+  /// form for that mangled name, reconstitute the name.
+  static void reconstituteName(StringRef OrigName, StringRef ExtractedName,
+                               StringRef Replacement,
+                               SmallVectorImpl<char> &Out) {
+    Out.reserve(OrigName.size() + Replacement.size() - ExtractedName.size());
+    Out.insert(Out.end(), OrigName.begin(), ExtractedName.begin());
+    Out.insert(Out.end(), Replacement.begin(), Replacement.end());
+    Out.insert(Out.end(), ExtractedName.end(), OrigName.end());
+  }
+
+  Error populateRemappings() override {
+    if (Error E = Remappings.read(*RemapBuffer))
+      return E;
+    for (StringRef Name : Underlying.HashTable->keys()) {
+      StringRef RealName = extractName(Name);
+      if (auto Key = Remappings.insert(RealName)) {
+        // FIXME: We could theoretically map the same equivalence class to
+        // multiple names in the profile data. If that happens, we should
+        // return NamedInstrProfRecords from all of them.
+        MappedNames.insert({Key, RealName});
+      }
+    }
+    return Error::success();
+  }
+
+  Error getRecords(StringRef FuncName,
+                   ArrayRef<NamedInstrProfRecord> &Data) override {
+    StringRef RealName = extractName(FuncName);
+    if (auto Key = Remappings.lookup(RealName)) {
+      StringRef Remapped = MappedNames.lookup(Key);
+      if (!Remapped.empty()) {
+        if (RealName.begin() == FuncName.begin() &&
+            RealName.end() == FuncName.end())
+          FuncName = Remapped;
+        else {
+          // Try rebuilding the name from the given remapping.
+          SmallString<256> Reconstituted;
+          reconstituteName(FuncName, RealName, Remapped, Reconstituted);
+          Error E = Underlying.getRecords(Reconstituted, Data);
+          if (!E)
+            return E;
+
+          // If we failed because the name doesn't exist, fall back to asking
+          // about the original name.
+          if (Error Unhandled = handleErrors(
+                  std::move(E), [](std::unique_ptr<InstrProfError> Err) {
+                    return Err->get() == instrprof_error::unknown_function
+                               ? Error::success()
+                               : Error(std::move(Err));
+                  }))
+            return Unhandled;
+        }
+      }
+    }
+    return Underlying.getRecords(FuncName, Data);
+  }
+
+private:
+  /// The memory buffer containing the remapping configuration. Remappings
+  /// holds pointers into this buffer.
+  std::unique_ptr<MemoryBuffer> RemapBuffer;
+
+  /// The mangling remapper.
+  SymbolRemappingReader Remappings;
+
+  /// Mapping from mangled name keys to the name used for the key in the
+  /// profile data.
+  /// FIXME: Can we store a location within the on-disk hash table instead of
+  /// redoing lookup?
+  DenseMap<SymbolRemappingReader::Key, StringRef> MappedNames;
+
+  /// The real profile data reader.
+  InstrProfReaderIndex<HashTableImpl> &Underlying;
+};
+
 bool IndexedInstrProfReader::hasFormat(const MemoryBuffer &DataBuffer) {
   using namespace support;
 
@@ -683,10 +817,22 @@ Error IndexedInstrProfReader::readHeader() {
   uint64_t HashOffset = endian::byte_swap<uint64_t, little>(Header->HashOffset);
 
   // The rest of the file is an on disk hash table.
-  InstrProfReaderIndexBase *IndexPtr = nullptr;
-  IndexPtr = new InstrProfReaderIndex<OnDiskHashTableImplV3>(
-      Start + HashOffset, Cur, Start, HashType, FormatVersion);
-  Index.reset(IndexPtr);
+  auto IndexPtr =
+      llvm::make_unique<InstrProfReaderIndex<OnDiskHashTableImplV3>>(
+          Start + HashOffset, Cur, Start, HashType, FormatVersion);
+
+  // Load the remapping table now if requested.
+  if (RemappingBuffer) {
+    Remapper = llvm::make_unique<
+        InstrProfReaderItaniumRemapper<OnDiskHashTableImplV3>>(
+        std::move(RemappingBuffer), *IndexPtr);
+    if (Error E = Remapper->populateRemappings())
+      return E;
+  } else {
+    Remapper = llvm::make_unique<InstrProfReaderNullRemapper>(*IndexPtr);
+  }
+  Index = std::move(IndexPtr);
+
   return success();
 }
 
@@ -707,7 +853,7 @@ Expected<InstrProfRecord>
 IndexedInstrProfReader::getInstrProfRecord(StringRef FuncName,
                                            uint64_t FuncHash) {
   ArrayRef<NamedInstrProfRecord> Data;
-  Error Err = Index->getRecords(FuncName, Data);
+  Error Err = Remapper->getRecords(FuncName, Data);
   if (Err)
     return std::move(Err);
   // Found it. Look for counters with the right hash.
diff --git a/unittests/ProfileData/InstrProfTest.cpp b/unittests/ProfileData/InstrProfTest.cpp
index 0c99f7fde65..2d915d44598 100644
--- a/unittests/ProfileData/InstrProfTest.cpp
+++ b/unittests/ProfileData/InstrProfTest.cpp
@@ -42,8 +42,10 @@ struct InstrProfTest : ::testing::Test {
 
   void SetUp() { Writer.setOutputSparse(false); }
 
-  void readProfile(std::unique_ptr<MemoryBuffer> Profile) {
-    auto ReaderOrErr = IndexedInstrProfReader::create(std::move(Profile));
+  void readProfile(std::unique_ptr<MemoryBuffer> Profile,
+                   std::unique_ptr<MemoryBuffer> Remapping = nullptr) {
+    auto ReaderOrErr = IndexedInstrProfReader::create(std::move(Profile),
+                                                      std::move(Remapping));
     EXPECT_THAT_ERROR(ReaderOrErr.takeError(), Succeeded());
     Reader = std::move(ReaderOrErr.get());
   }
@@ -990,6 +992,44 @@ TEST_P(MaybeSparseInstrProfTest, instr_prof_symtab_compression_test) {
   }
 }
 
+TEST_P(MaybeSparseInstrProfTest, remapping_test) {
+  Writer.addRecord({"_Z3fooi", 0x1234, {1, 2, 3, 4}}, Err);
+  Writer.addRecord({"file:_Z3barf", 0x567, {5, 6, 7}}, Err);
+  auto Profile = Writer.writeBuffer();
+  readProfile(std::move(Profile), llvm::MemoryBuffer::getMemBuffer(R"(
+    type i l
+    name 3bar 4quux
+  )"));
+
+  std::vector<uint64_t> Counts;
+  for (StringRef FooName : {"_Z3fooi", "_Z3fool"}) {
+    EXPECT_THAT_ERROR(Reader->getFunctionCounts(FooName, 0x1234, Counts),
+                      Succeeded());
+    ASSERT_EQ(4u, Counts.size());
+    EXPECT_EQ(1u, Counts[0]);
+    EXPECT_EQ(2u, Counts[1]);
+    EXPECT_EQ(3u, Counts[2]);
+    EXPECT_EQ(4u, Counts[3]);
+  }
+
+  for (StringRef BarName : {"file:_Z3barf", "file:_Z4quuxf"}) {
+    EXPECT_THAT_ERROR(Reader->getFunctionCounts(BarName, 0x567, Counts),
+                      Succeeded());
+    ASSERT_EQ(3u, Counts.size());
+    EXPECT_EQ(5u, Counts[0]);
+    EXPECT_EQ(6u, Counts[1]);
+    EXPECT_EQ(7u, Counts[2]);
+  }
+
+  for (StringRef BadName : {"_Z3foof", "_Z4quuxi", "_Z3barl", "", "_ZZZ",
+                            "_Z3barf", "otherfile:_Z4quuxf"}) {
+    EXPECT_THAT_ERROR(Reader->getFunctionCounts(BadName, 0x1234, Counts),
+                      Failed());
+    EXPECT_THAT_ERROR(Reader->getFunctionCounts(BadName, 0x567, Counts),
+                      Failed());
+  }
+}
+
 TEST_F(SparseInstrProfTest, preserve_no_records) {
   Writer.addRecord({"foo", 0x1234, {0}}, Err);
   Writer.addRecord({"bar", 0x4321, {0, 0}}, Err);
-- 
GitLab


From 3cf846acd82f47357fa277a98871d16608d724e3 Mon Sep 17 00:00:00 2001
From: Armando Montanez <amontanez@google.com>
Date: Wed, 10 Oct 2018 21:16:57 +0000
Subject: [PATCH 0026/1116] Test commit: fix typo in comment

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344185 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-objcopy/llvm-objcopy.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/llvm-objcopy/llvm-objcopy.cpp b/tools/llvm-objcopy/llvm-objcopy.cpp
index 3e494f92b67..41c6ef3f3dc 100644
--- a/tools/llvm-objcopy/llvm-objcopy.cpp
+++ b/tools/llvm-objcopy/llvm-objcopy.cpp
@@ -523,7 +523,7 @@ static void handleArgs(const CopyConfig &Config, Object &Obj,
 
     // The purpose of this loop is to mark symbols referenced by sections
     // (like GroupSection or RelocationSection). This way, we know which
-    // symbols are still 'needed' and wich are not.
+    // symbols are still 'needed' and which are not.
     if (Config.StripUnneeded) {
       for (auto &Section : Obj.sections())
         Section.markSymbols();
-- 
GitLab


From ea46abe2cc75b90acc4d34c28921c41b1e547598 Mon Sep 17 00:00:00 2001
From: George Burgess IV <george.burgess.iv@gmail.com>
Date: Wed, 10 Oct 2018 21:28:44 +0000
Subject: [PATCH 0027/1116] Replace most users of UnknownSize with
 LocationSize::unknown(); NFC

Moving away from UnknownSize is part of the effort to migrate us to
LocationSizes (e.g. the cleanup promised in D44748).

This doesn't entirely remove all of the uses of UnknownSize; some uses
require tweaks to assume that UnknownSize isn't just some kind of int.
This patch is intended to just be a trivial replacement for all places
where LocationSize::unknown() will Just Work.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344186 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/AliasAnalysis.h         |  3 +-
 .../llvm/Analysis/MemoryDependenceAnalysis.h  |  2 +-
 include/llvm/Analysis/MemoryLocation.h        |  2 +-
 lib/Analysis/AliasSetTracker.cpp              |  2 +-
 lib/Analysis/BasicAliasAnalysis.cpp           | 39 +++++++++----------
 lib/Analysis/CFLAndersAliasAnalysis.cpp       |  6 +--
 lib/Analysis/DependenceAnalysis.cpp           |  4 +-
 lib/Analysis/LoopAccessAnalysis.cpp           |  4 +-
 lib/Analysis/MemoryLocation.cpp               | 10 +++--
 lib/CodeGen/ImplicitNullChecks.cpp            | 10 ++---
 lib/CodeGen/MachinePipeliner.cpp              |  4 +-
 lib/Target/ARM/ARMParallelDSP.cpp             |  2 +-
 .../Hexagon/HexagonLoopIdiomRecognition.cpp   |  2 +-
 lib/Transforms/IPO/FunctionAttrs.cpp          |  2 +-
 lib/Transforms/Scalar/LICM.cpp                |  2 +-
 unittests/Analysis/AliasAnalysisTest.cpp      |  4 +-
 16 files changed, 48 insertions(+), 50 deletions(-)

diff --git a/include/llvm/Analysis/AliasAnalysis.h b/include/llvm/Analysis/AliasAnalysis.h
index be3496bbd95..88a70f4fe59 100644
--- a/include/llvm/Analysis/AliasAnalysis.h
+++ b/include/llvm/Analysis/AliasAnalysis.h
@@ -335,8 +335,7 @@ public:
 
   /// A convenience wrapper around the primary \c alias interface.
   AliasResult alias(const Value *V1, const Value *V2) {
-    return alias(V1, MemoryLocation::UnknownSize, V2,
-                 MemoryLocation::UnknownSize);
+    return alias(V1, LocationSize::unknown(), V2, LocationSize::unknown());
   }
 
   /// A trivial helper function to check to see if the specified pointers are
diff --git a/include/llvm/Analysis/MemoryDependenceAnalysis.h b/include/llvm/Analysis/MemoryDependenceAnalysis.h
index 1c40cffc7f6..52340b0cb51 100644
--- a/include/llvm/Analysis/MemoryDependenceAnalysis.h
+++ b/include/llvm/Analysis/MemoryDependenceAnalysis.h
@@ -304,7 +304,7 @@ private:
     /// The maximum size of the dereferences of the pointer.
     ///
     /// May be UnknownSize if the sizes are unknown.
-    LocationSize Size = MemoryLocation::UnknownSize;
+    LocationSize Size = LocationSize::unknown();
     /// The AA tags associated with dereferences of the pointer.
     ///
     /// The members may be null if there are no tags or conflicting tags.
diff --git a/include/llvm/Analysis/MemoryLocation.h b/include/llvm/Analysis/MemoryLocation.h
index 509efa2ca1d..cf839c5a1eb 100644
--- a/include/llvm/Analysis/MemoryLocation.h
+++ b/include/llvm/Analysis/MemoryLocation.h
@@ -239,7 +239,7 @@ public:
   }
 
   explicit MemoryLocation(const Value *Ptr = nullptr,
-                          LocationSize Size = UnknownSize,
+                          LocationSize Size = LocationSize::unknown(),
                           const AAMDNodes &AATags = AAMDNodes())
       : Ptr(Ptr), Size(Size), AATags(AATags) {}
 
diff --git a/lib/Analysis/AliasSetTracker.cpp b/lib/Analysis/AliasSetTracker.cpp
index 0d0277e9c34..66544c51446 100644
--- a/lib/Analysis/AliasSetTracker.cpp
+++ b/lib/Analysis/AliasSetTracker.cpp
@@ -649,7 +649,7 @@ void AliasSet::print(raw_ostream &OS) const {
     for (iterator I = begin(), E = end(); I != E; ++I) {
       if (I != begin()) OS << ", ";
       I.getPointer()->printAsOperand(OS << "(");
-      if (I.getSize() == MemoryLocation::UnknownSize)
+      if (I.getSize() == LocationSize::unknown())
         OS << ", unknown)";
       else 
         OS << ", " << I.getSize() << ")";
diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp
index 2f513004fe8..b7aa395ab84 100644
--- a/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/lib/Analysis/BasicAliasAnalysis.cpp
@@ -1019,8 +1019,8 @@ static AliasResult aliasSameBasePointerGEPs(const GEPOperator *GEP1,
 
   // If we don't know the size of the accesses through both GEPs, we can't
   // determine whether the struct fields accessed can't alias.
-  if (MaybeV1Size == MemoryLocation::UnknownSize ||
-      MaybeV2Size == MemoryLocation::UnknownSize)
+  if (MaybeV1Size == LocationSize::unknown() ||
+      MaybeV2Size == LocationSize::unknown())
     return MayAlias;
 
   const uint64_t V1Size = MaybeV1Size.getValue();
@@ -1184,8 +1184,7 @@ bool BasicAAResult::isGEPBaseAtNegativeOffset(const GEPOperator *GEPOp,
       const DecomposedGEP &DecompGEP, const DecomposedGEP &DecompObject,
       LocationSize MaybeObjectAccessSize) {
   // If the object access size is unknown, or the GEP isn't inbounds, bail.
-  if (MaybeObjectAccessSize == MemoryLocation::UnknownSize ||
-      !GEPOp->isInBounds())
+  if (MaybeObjectAccessSize == LocationSize::unknown() || !GEPOp->isInBounds())
     return false;
 
   const uint64_t ObjectAccessSize = MaybeObjectAccessSize.getValue();
@@ -1254,8 +1253,8 @@ BasicAAResult::aliasGEP(const GEPOperator *GEP1, LocationSize V1Size,
       return NoAlias;
     // Do the base pointers alias?
     AliasResult BaseAlias =
-        aliasCheck(UnderlyingV1, MemoryLocation::UnknownSize, AAMDNodes(),
-                   UnderlyingV2, MemoryLocation::UnknownSize, AAMDNodes());
+        aliasCheck(UnderlyingV1, LocationSize::unknown(), AAMDNodes(),
+                   UnderlyingV2, LocationSize::unknown(), AAMDNodes());
 
     // Check for geps of non-aliasing underlying pointers where the offsets are
     // identical.
@@ -1314,13 +1313,12 @@ BasicAAResult::aliasGEP(const GEPOperator *GEP1, LocationSize V1Size,
     // pointer, we know they cannot alias.
 
     // If both accesses are unknown size, we can't do anything useful here.
-    if (V1Size == MemoryLocation::UnknownSize &&
-        V2Size == MemoryLocation::UnknownSize)
+    if (V1Size == LocationSize::unknown() && V2Size == LocationSize::unknown())
       return MayAlias;
 
-    AliasResult R = aliasCheck(UnderlyingV1, MemoryLocation::UnknownSize,
-                               AAMDNodes(), V2, MemoryLocation::UnknownSize,
-                               V2AAInfo, nullptr, UnderlyingV2);
+    AliasResult R =
+        aliasCheck(UnderlyingV1, LocationSize::unknown(), AAMDNodes(), V2,
+                   LocationSize::unknown(), V2AAInfo, nullptr, UnderlyingV2);
     if (R != MustAlias) {
       // If V2 may alias GEP base pointer, conservatively returns MayAlias.
       // If V2 is known not to alias GEP base pointer, then the two values
@@ -1351,7 +1349,7 @@ BasicAAResult::aliasGEP(const GEPOperator *GEP1, LocationSize V1Size,
   // greater, we know they do not overlap.
   if (GEP1BaseOffset != 0 && DecompGEP1.VarIndices.empty()) {
     if (GEP1BaseOffset >= 0) {
-      if (V2Size != MemoryLocation::UnknownSize) {
+      if (V2Size != LocationSize::unknown()) {
         if ((uint64_t)GEP1BaseOffset < V2Size.getValue())
           return PartialAlias;
         return NoAlias;
@@ -1365,8 +1363,8 @@ BasicAAResult::aliasGEP(const GEPOperator *GEP1, LocationSize V1Size,
       // GEP1             V2
       // We need to know that V2Size is not unknown, otherwise we might have
       // stripped a gep with negative index ('gep <ptr>, -1, ...).
-      if (V1Size != MemoryLocation::UnknownSize &&
-          V2Size != MemoryLocation::UnknownSize) {
+      if (V1Size != LocationSize::unknown() &&
+          V2Size != LocationSize::unknown()) {
         if (-(uint64_t)GEP1BaseOffset < V1Size.getValue())
           return PartialAlias;
         return NoAlias;
@@ -1416,9 +1414,8 @@ BasicAAResult::aliasGEP(const GEPOperator *GEP1, LocationSize V1Size,
     // mod Modulo. Check whether that difference guarantees that the
     // two locations do not alias.
     uint64_t ModOffset = (uint64_t)GEP1BaseOffset & (Modulo - 1);
-    if (V1Size != MemoryLocation::UnknownSize &&
-        V2Size != MemoryLocation::UnknownSize &&
-        ModOffset >= V2Size.getValue() &&
+    if (V1Size != LocationSize::unknown() &&
+        V2Size != LocationSize::unknown() && ModOffset >= V2Size.getValue() &&
         V1Size.getValue() <= Modulo - ModOffset)
       return NoAlias;
 
@@ -1426,7 +1423,7 @@ BasicAAResult::aliasGEP(const GEPOperator *GEP1, LocationSize V1Size,
     // If GEP1BasePtr > V2 (GEP1BaseOffset > 0) then we know the pointers
     // don't alias if V2Size can fit in the gap between V2 and GEP1BasePtr.
     if (AllPositive && GEP1BaseOffset > 0 &&
-        V2Size != MemoryLocation::UnknownSize &&
+        V2Size != LocationSize::unknown() &&
         V2Size.getValue() <= (uint64_t)GEP1BaseOffset)
       return NoAlias;
 
@@ -1607,7 +1604,7 @@ AliasResult BasicAAResult::aliasPHI(const PHINode *PN, LocationSize PNSize,
   // unknown to represent all the possible values the GEP could advance the
   // pointer to.
   if (isRecursive)
-    PNSize = MemoryLocation::UnknownSize;
+    PNSize = LocationSize::unknown();
 
   AliasResult Alias =
       aliasCheck(V2, V2Size, V2AAInfo, V1Srcs[0],
@@ -1864,8 +1861,8 @@ bool BasicAAResult::constantOffsetHeuristic(
     const SmallVectorImpl<VariableGEPIndex> &VarIndices,
     LocationSize MaybeV1Size, LocationSize MaybeV2Size, int64_t BaseOffset,
     AssumptionCache *AC, DominatorTree *DT) {
-  if (VarIndices.size() != 2 || MaybeV1Size == MemoryLocation::UnknownSize ||
-      MaybeV2Size == MemoryLocation::UnknownSize)
+  if (VarIndices.size() != 2 || MaybeV1Size == LocationSize::unknown() ||
+      MaybeV2Size == LocationSize::unknown())
     return false;
 
   const uint64_t V1Size = MaybeV1Size.getValue();
diff --git a/lib/Analysis/CFLAndersAliasAnalysis.cpp b/lib/Analysis/CFLAndersAliasAnalysis.cpp
index b43b48eeef7..1c61dd369a0 100644
--- a/lib/Analysis/CFLAndersAliasAnalysis.cpp
+++ b/lib/Analysis/CFLAndersAliasAnalysis.cpp
@@ -556,9 +556,9 @@ bool CFLAndersAAResult::FunctionInfo::mayAlias(
                                       OffsetValue{RHS, 0}, Comparator);
 
     if (RangePair.first != RangePair.second) {
-      // Be conservative about UnknownSize
-      if (MaybeLHSSize == MemoryLocation::UnknownSize ||
-          MaybeRHSSize == MemoryLocation::UnknownSize)
+      // Be conservative about unknown sizes
+      if (MaybeLHSSize == LocationSize::unknown() ||
+          MaybeRHSSize == LocationSize::unknown())
         return true;
 
       const uint64_t LHSSize = MaybeLHSSize.getValue();
diff --git a/lib/Analysis/DependenceAnalysis.cpp b/lib/Analysis/DependenceAnalysis.cpp
index 79c2728d562..b544ae5f535 100644
--- a/lib/Analysis/DependenceAnalysis.cpp
+++ b/lib/Analysis/DependenceAnalysis.cpp
@@ -633,8 +633,8 @@ static AliasResult underlyingObjectsAlias(AliasAnalysis *AA,
                                           const MemoryLocation &LocB) {
   // Check the original locations (minus size) for noalias, which can happen for
   // tbaa, incompatible underlying object locations, etc.
-  MemoryLocation LocAS(LocA.Ptr, MemoryLocation::UnknownSize, LocA.AATags);
-  MemoryLocation LocBS(LocB.Ptr, MemoryLocation::UnknownSize, LocB.AATags);
+  MemoryLocation LocAS(LocA.Ptr, LocationSize::unknown(), LocA.AATags);
+  MemoryLocation LocBS(LocB.Ptr, LocationSize::unknown(), LocB.AATags);
   if (AA->alias(LocAS, LocBS) == NoAlias)
     return NoAlias;
 
diff --git a/lib/Analysis/LoopAccessAnalysis.cpp b/lib/Analysis/LoopAccessAnalysis.cpp
index 8312a0d1cff..b43e290956d 100644
--- a/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/lib/Analysis/LoopAccessAnalysis.cpp
@@ -509,7 +509,7 @@ public:
   /// Register a load  and whether it is only read from.
   void addLoad(MemoryLocation &Loc, bool IsReadOnly) {
     Value *Ptr = const_cast<Value*>(Loc.Ptr);
-    AST.add(Ptr, MemoryLocation::UnknownSize, Loc.AATags);
+    AST.add(Ptr, LocationSize::unknown(), Loc.AATags);
     Accesses.insert(MemAccessInfo(Ptr, false));
     if (IsReadOnly)
       ReadOnlyPtr.insert(Ptr);
@@ -518,7 +518,7 @@ public:
   /// Register a store.
   void addStore(MemoryLocation &Loc) {
     Value *Ptr = const_cast<Value*>(Loc.Ptr);
-    AST.add(Ptr, MemoryLocation::UnknownSize, Loc.AATags);
+    AST.add(Ptr, LocationSize::unknown(), Loc.AATags);
     Accesses.insert(MemAccessInfo(Ptr, true));
   }
 
diff --git a/lib/Analysis/MemoryLocation.cpp b/lib/Analysis/MemoryLocation.cpp
index 3cd4b4475ef..c0605f6ad37 100644
--- a/lib/Analysis/MemoryLocation.cpp
+++ b/lib/Analysis/MemoryLocation.cpp
@@ -55,7 +55,8 @@ MemoryLocation MemoryLocation::get(const VAArgInst *VI) {
   AAMDNodes AATags;
   VI->getAAMetadata(AATags);
 
-  return MemoryLocation(VI->getPointerOperand(), UnknownSize, AATags);
+  return MemoryLocation(VI->getPointerOperand(), LocationSize::unknown(),
+                        AATags);
 }
 
 MemoryLocation MemoryLocation::get(const AtomicCmpXchgInst *CXI) {
@@ -87,7 +88,7 @@ MemoryLocation MemoryLocation::getForSource(const AtomicMemTransferInst *MTI) {
 }
 
 MemoryLocation MemoryLocation::getForSource(const AnyMemTransferInst *MTI) {
-  uint64_t Size = UnknownSize;
+  uint64_t Size = MemoryLocation::UnknownSize;
   if (ConstantInt *C = dyn_cast<ConstantInt>(MTI->getLength()))
     Size = C->getValue().getZExtValue();
 
@@ -108,7 +109,7 @@ MemoryLocation MemoryLocation::getForDest(const AtomicMemIntrinsic *MI) {
 }
 
 MemoryLocation MemoryLocation::getForDest(const AnyMemIntrinsic *MI) {
-  uint64_t Size = UnknownSize;
+  uint64_t Size = MemoryLocation::UnknownSize;
   if (ConstantInt *C = dyn_cast<ConstantInt>(MI->getLength()))
     Size = C->getValue().getZExtValue();
 
@@ -189,5 +190,6 @@ MemoryLocation MemoryLocation::getForArgument(ImmutableCallSite CS,
   }
   // FIXME: Handle memset_pattern4 and memset_pattern8 also.
 
-  return MemoryLocation(CS.getArgument(ArgIdx), UnknownSize, AATags);
+  return MemoryLocation(CS.getArgument(ArgIdx), LocationSize::unknown(),
+                        AATags);
 }
diff --git a/lib/CodeGen/ImplicitNullChecks.cpp b/lib/CodeGen/ImplicitNullChecks.cpp
index 034692de92d..deb49a1ea48 100644
--- a/lib/CodeGen/ImplicitNullChecks.cpp
+++ b/lib/CodeGen/ImplicitNullChecks.cpp
@@ -344,11 +344,11 @@ ImplicitNullChecks::areMemoryOpsAliased(MachineInstr &MI,
           return AR_MayAlias;
         continue;
       }
-      llvm::AliasResult AAResult = AA->alias(
-          MemoryLocation(MMO1->getValue(), MemoryLocation::UnknownSize,
-                         MMO1->getAAInfo()),
-          MemoryLocation(MMO2->getValue(), MemoryLocation::UnknownSize,
-                         MMO2->getAAInfo()));
+      llvm::AliasResult AAResult =
+          AA->alias(MemoryLocation(MMO1->getValue(), LocationSize::unknown(),
+                                   MMO1->getAAInfo()),
+                    MemoryLocation(MMO2->getValue(), LocationSize::unknown(),
+                                   MMO2->getAAInfo()));
       if (AAResult != NoAlias)
         return AR_MayAlias;
     }
diff --git a/lib/CodeGen/MachinePipeliner.cpp b/lib/CodeGen/MachinePipeliner.cpp
index 5f6f0cf96a5..3d8510f7c0c 100644
--- a/lib/CodeGen/MachinePipeliner.cpp
+++ b/lib/CodeGen/MachinePipeliner.cpp
@@ -1136,9 +1136,9 @@ void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
             continue;
           }
           AliasResult AAResult = AA->alias(
-              MemoryLocation(MMO1->getValue(), MemoryLocation::UnknownSize,
+              MemoryLocation(MMO1->getValue(), LocationSize::unknown(),
                              MMO1->getAAInfo()),
-              MemoryLocation(MMO2->getValue(), MemoryLocation::UnknownSize,
+              MemoryLocation(MMO2->getValue(), LocationSize::unknown(),
                              MMO2->getAAInfo()));
 
           if (AAResult != NoAlias) {
diff --git a/lib/Target/ARM/ARMParallelDSP.cpp b/lib/Target/ARM/ARMParallelDSP.cpp
index 050a76413cf..3ab9298c110 100644
--- a/lib/Target/ARM/ARMParallelDSP.cpp
+++ b/lib/Target/ARM/ARMParallelDSP.cpp
@@ -71,7 +71,7 @@ namespace {
     virtual ~OpChain() = default;
 
     void SetMemoryLocations() {
-      const auto Size = MemoryLocation::UnknownSize;
+      const auto Size = LocationSize::unknown();
       for (auto *V : AllValues) {
         if (auto *I = dyn_cast<Instruction>(V)) {
           if (I->mayWriteToMemory())
diff --git a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
index f9ed0390923..f38992bef69 100644
--- a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
+++ b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
@@ -1970,7 +1970,7 @@ mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
   // Get the location that may be stored across the loop.  Since the access
   // is strided positively through memory, we say that the modified location
   // starts at the pointer and has infinite size.
-  LocationSize AccessSize = MemoryLocation::UnknownSize;
+  LocationSize AccessSize = LocationSize::unknown();
 
   // If the loop iterates a fixed number of times, we can refine the access
   // size to be exactly the size of the memset, which is (BECount+1)*StoreSize
diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp
index 72c850fca99..f01c6a4e99b 100644
--- a/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -165,7 +165,7 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody,
 
         AAMDNodes AAInfo;
         I->getAAMetadata(AAInfo);
-        MemoryLocation Loc(Arg, MemoryLocation::UnknownSize, AAInfo);
+        MemoryLocation Loc(Arg, LocationSize::unknown(), AAInfo);
 
         // Skip accesses to local or constant memory as they don't impact the
         // externally visible mod/ref behavior.
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index bb918cf717d..601d49fc03f 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -693,7 +693,7 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
         for (Value *Op : CI->arg_operands())
           if (Op->getType()->isPointerTy() &&
               pointerInvalidatedByLoop(
-                  MemoryLocation(Op, MemoryLocation::UnknownSize, AAMDNodes()),
+                  MemoryLocation(Op, LocationSize::unknown(), AAMDNodes()),
                   CurAST, CurLoop, AA))
             return false;
         return true;
diff --git a/unittests/Analysis/AliasAnalysisTest.cpp b/unittests/Analysis/AliasAnalysisTest.cpp
index 0f0d44f6c78..42a4210feba 100644
--- a/unittests/Analysis/AliasAnalysisTest.cpp
+++ b/unittests/Analysis/AliasAnalysisTest.cpp
@@ -55,8 +55,8 @@ struct AATestPass : FunctionPass {
 
     for (Value *P1 : Pointers)
       for (Value *P2 : Pointers)
-        (void)AA.alias(P1, MemoryLocation::UnknownSize, P2,
-                       MemoryLocation::UnknownSize);
+        (void)AA.alias(P1, LocationSize::unknown(), P2,
+                       LocationSize::unknown());
 
     return false;
   }
-- 
GitLab


From 04af5ff3eb6ec26c853c7aabe3f282f6a293ef9f Mon Sep 17 00:00:00 2001
From: Richard Smith <richard-llvm@metafoo.co.uk>
Date: Wed, 10 Oct 2018 21:31:01 +0000
Subject: [PATCH 0028/1116] Support for remapping profile data when symbols
 change, for sample-based profiling.

Reviewers: davidxl, tejohnson, dlj, erik.pilkington

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D51248

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344187 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/ProfileData/SampleProfReader.h | 52 ++++++++++++++++--
 lib/ProfileData/SampleProfReader.cpp        | 55 +++++++++++++++++++
 unittests/ProfileData/SampleProfTest.cpp    | 59 ++++++++++++++-------
 3 files changed, 144 insertions(+), 22 deletions(-)

diff --git a/include/llvm/ProfileData/SampleProfReader.h b/include/llvm/ProfileData/SampleProfReader.h
index c100e800464..3c477cc3471 100644
--- a/include/llvm/ProfileData/SampleProfReader.h
+++ b/include/llvm/ProfileData/SampleProfReader.h
@@ -222,6 +222,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorOr.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SymbolRemappingReader.h"
 #include <algorithm>
 #include <cstdint>
 #include <memory>
@@ -289,11 +290,16 @@ public:
     // The function name may have been updated by adding suffix. In sample
     // profile, the function names are all stripped, so we need to strip
     // the function name suffix before matching with profile.
-    StringRef Fname = F.getName().split('.').first;
+    return getSamplesFor(F.getName().split('.').first);
+  }
+
+  /// Return the samples collected for function \p F.
+  virtual FunctionSamples *getSamplesFor(StringRef Fname) {
     std::string FGUID;
     Fname = getRepInFormat(Fname, getFormat(), FGUID);
-    if (Profiles.count(Fname))
-      return &Profiles[Fname];
+    auto It = Profiles.find(Fname);
+    if (It != Profiles.end())
+      return &It->second;
     return nullptr;
   }
 
@@ -337,6 +343,12 @@ protected:
   /// Profile summary information.
   std::unique_ptr<ProfileSummary> Summary;
 
+  /// Take ownership of the summary of this reader.
+  static std::unique_ptr<ProfileSummary>
+  takeSummary(SampleProfileReader &Reader) {
+    return std::move(Reader.Summary);
+  }
+
   /// Compute summary for this profile.
   void computeSummary();
 
@@ -525,6 +537,40 @@ protected:
   static const uint32_t GCOVTagAFDOFunction = 0xac000000;
 };
 
+/// A profile data reader proxy that remaps the profile data from another
+/// sample profile data reader, by applying a provided set of equivalences
+/// between components of the symbol names in the profile.
+class SampleProfileReaderItaniumRemapper : public SampleProfileReader {
+public:
+  SampleProfileReaderItaniumRemapper(
+      std::unique_ptr<MemoryBuffer> B, LLVMContext &C,
+      std::unique_ptr<SampleProfileReader> Underlying)
+      : SampleProfileReader(std::move(B), C, Underlying->getFormat()) {
+    Profiles = std::move(Underlying->getProfiles());
+    Summary = takeSummary(*Underlying);
+  }
+
+  /// Create a remapped sample profile from the given remapping file and
+  /// underlying samples.
+  static ErrorOr<std::unique_ptr<SampleProfileReader>>
+  create(const Twine &Filename, LLVMContext &C,
+         std::unique_ptr<SampleProfileReader> Underlying);
+
+  /// Read and validate the file header.
+  std::error_code readHeader() override { return sampleprof_error::success; }
+
+  /// Read remapping file and apply it to the sample profile.
+  std::error_code read() override;
+
+  /// Return the samples collected for function \p F.
+  FunctionSamples *getSamplesFor(StringRef FunctionName) override;
+  using SampleProfileReader::getSamplesFor;
+
+private:
+  SymbolRemappingReader Remappings;
+  DenseMap<SymbolRemappingReader::Key, FunctionSamples*> SampleMap;
+};
+
 } // end namespace sampleprof
 
 } // end namespace llvm
diff --git a/lib/ProfileData/SampleProfReader.cpp b/lib/ProfileData/SampleProfReader.cpp
index 2b4551b9849..a68d1e9d3ab 100644
--- a/lib/ProfileData/SampleProfReader.cpp
+++ b/lib/ProfileData/SampleProfReader.cpp
@@ -912,6 +912,40 @@ bool SampleProfileReaderGCC::hasFormat(const MemoryBuffer &Buffer) {
   return Magic == "adcg*704";
 }
 
+std::error_code SampleProfileReaderItaniumRemapper::read() {
+  // If the underlying data is in compact format, we can't remap it because
+  // we don't know what the original function names were.
+  if (getFormat() == SPF_Compact_Binary) {
+    Ctx.diagnose(DiagnosticInfoSampleProfile(
+        Buffer->getBufferIdentifier(),
+        "Profile data remapping cannot be applied to profile data "
+        "in compact format (original mangled names are not available).",
+        DS_Warning));
+    return sampleprof_error::success;
+  }
+
+  if (Error E = Remappings.read(*Buffer)) {
+    handleAllErrors(
+        std::move(E), [&](const SymbolRemappingParseError &ParseError) {
+          reportError(ParseError.getLineNum(), ParseError.getMessage());
+        });
+    return sampleprof_error::malformed;
+  }
+
+  for (auto &Sample : getProfiles())
+    if (auto Key = Remappings.insert(Sample.first()))
+      SampleMap.insert({Key, &Sample.second});
+
+  return sampleprof_error::success;
+}
+
+FunctionSamples *
+SampleProfileReaderItaniumRemapper::getSamplesFor(StringRef Fname) {
+  if (auto Key = Remappings.lookup(Fname))
+    return SampleMap.lookup(Key);
+  return SampleProfileReader::getSamplesFor(Fname);
+}
+
 /// Prepare a memory buffer for the contents of \p Filename.
 ///
 /// \returns an error code indicating the status of the buffer.
@@ -944,6 +978,27 @@ SampleProfileReader::create(const Twine &Filename, LLVMContext &C) {
   return create(BufferOrError.get(), C);
 }
 
+/// Create a sample profile remapper from the given input, to remap the
+/// function names in the given profile data.
+///
+/// \param Filename The file to open.
+///
+/// \param C The LLVM context to use to emit diagnostics.
+///
+/// \param Underlying The underlying profile data reader to remap.
+///
+/// \returns an error code indicating the status of the created reader.
+ErrorOr<std::unique_ptr<SampleProfileReader>>
+SampleProfileReaderItaniumRemapper::create(
+    const Twine &Filename, LLVMContext &C,
+    std::unique_ptr<SampleProfileReader> Underlying) {
+  auto BufferOrError = setupMemoryBuffer(Filename);
+  if (std::error_code EC = BufferOrError.getError())
+    return EC;
+  return llvm::make_unique<SampleProfileReaderItaniumRemapper>(
+      std::move(BufferOrError.get()), C, std::move(Underlying));
+}
+
 /// Create a sample profile reader based on the format of the input data.
 ///
 /// \param B The memory buffer to create the reader from (assumes ownership).
diff --git a/unittests/ProfileData/SampleProfTest.cpp b/unittests/ProfileData/SampleProfTest.cpp
index 73e8088b638..67e6e9fc95b 100644
--- a/unittests/ProfileData/SampleProfTest.cpp
+++ b/unittests/ProfileData/SampleProfTest.cpp
@@ -58,7 +58,7 @@ struct SampleProfTest : ::testing::Test {
     Reader->collectFuncsToUse(M);
   }
 
-  void testRoundTrip(SampleProfileFormat Format) {
+  void testRoundTrip(SampleProfileFormat Format, bool Remap) {
     SmallVector<char, 128> ProfilePath;
     ASSERT_TRUE(NoError(llvm::sys::fs::createTemporaryFile("profile", "", ProfilePath)));
     StringRef Profile(ProfilePath.data(), ProfilePath.size());
@@ -108,22 +108,35 @@ struct SampleProfTest : ::testing::Test {
     EC = Reader->read();
     ASSERT_TRUE(NoError(EC));
 
-    StringMap<FunctionSamples> &ReadProfiles = Reader->getProfiles();
-    ASSERT_EQ(2u, ReadProfiles.size());
-
-    std::string FooGUID;
-    StringRef FooRep = getRepInFormat(FooName, Format, FooGUID);
-    FunctionSamples &ReadFooSamples = ReadProfiles[FooRep];
-    ASSERT_EQ(7711u, ReadFooSamples.getTotalSamples());
-    ASSERT_EQ(610u, ReadFooSamples.getHeadSamples());
-
-    std::string BarGUID;
-    StringRef BarRep = getRepInFormat(BarName, Format, BarGUID);
-    FunctionSamples &ReadBarSamples = ReadProfiles[BarRep];
-    ASSERT_EQ(20301u, ReadBarSamples.getTotalSamples());
-    ASSERT_EQ(1437u, ReadBarSamples.getHeadSamples());
+    if (Remap) {
+      auto MemBuffer = llvm::MemoryBuffer::getMemBuffer(R"(
+        # Types 'int' and 'long' are equivalent
+        type i l
+        # Function names 'foo' and 'faux' are equivalent
+        name 3foo 4faux
+      )");
+      Reader.reset(new SampleProfileReaderItaniumRemapper(
+          std::move(MemBuffer), Context, std::move(Reader)));
+      FooName = "_Z4fauxi";
+      BarName = "_Z3barl";
+
+      EC = Reader->read();
+      ASSERT_TRUE(NoError(EC));
+    }
+
+    ASSERT_EQ(2u, Reader->getProfiles().size());
+
+    FunctionSamples *ReadFooSamples = Reader->getSamplesFor(FooName);
+    ASSERT_TRUE(ReadFooSamples != nullptr);
+    ASSERT_EQ(7711u, ReadFooSamples->getTotalSamples());
+    ASSERT_EQ(610u, ReadFooSamples->getHeadSamples());
+
+    FunctionSamples *ReadBarSamples = Reader->getSamplesFor(BarName);
+    ASSERT_TRUE(ReadBarSamples != nullptr);
+    ASSERT_EQ(20301u, ReadBarSamples->getTotalSamples());
+    ASSERT_EQ(1437u, ReadBarSamples->getHeadSamples());
     ErrorOr<SampleRecord::CallTargetMap> CTMap =
-        ReadBarSamples.findCallTargetMapAt(1, 0);
+        ReadBarSamples->findCallTargetMapAt(1, 0);
     ASSERT_FALSE(CTMap.getError());
 
     std::string MconstructGUID;
@@ -184,15 +197,23 @@ struct SampleProfTest : ::testing::Test {
 };
 
 TEST_F(SampleProfTest, roundtrip_text_profile) {
-  testRoundTrip(SampleProfileFormat::SPF_Text);
+  testRoundTrip(SampleProfileFormat::SPF_Text, false);
 }
 
 TEST_F(SampleProfTest, roundtrip_raw_binary_profile) {
-  testRoundTrip(SampleProfileFormat::SPF_Binary);
+  testRoundTrip(SampleProfileFormat::SPF_Binary, false);
 }
 
 TEST_F(SampleProfTest, roundtrip_compact_binary_profile) {
-  testRoundTrip(SampleProfileFormat::SPF_Compact_Binary);
+  testRoundTrip(SampleProfileFormat::SPF_Compact_Binary, false);
+}
+
+TEST_F(SampleProfTest, remap_text_profile) {
+  testRoundTrip(SampleProfileFormat::SPF_Text, true);
+}
+
+TEST_F(SampleProfTest, remap_raw_binary_profile) {
+  testRoundTrip(SampleProfileFormat::SPF_Binary, true);
 }
 
 TEST_F(SampleProfTest, sample_overflow_saturation) {
-- 
GitLab


From 8313c3b553f996945d5c73734d027fa01af69115 Mon Sep 17 00:00:00 2001
From: Chris Bieneman <chris.bieneman@me.com>
Date: Wed, 10 Oct 2018 21:36:12 +0000
Subject: [PATCH 0029/1116] [CMake] NFC. Updating documentation on options

The Ninja pool options are only supported with the Ninja generator and
should be called out as such.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344188 91177308-0d34-0410-b5e6-96231b3b80d8
---
 cmake/modules/HandleLLVMOptions.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmake/modules/HandleLLVMOptions.cmake b/cmake/modules/HandleLLVMOptions.cmake
index 2c9bd14ad05..0daaf7d95c0 100644
--- a/cmake/modules/HandleLLVMOptions.cmake
+++ b/cmake/modules/HandleLLVMOptions.cmake
@@ -23,7 +23,7 @@ string(TOUPPER "${LLVM_ENABLE_LTO}" uppercase_LLVM_ENABLE_LTO)
 # Ninja Job Pool support
 # The following only works with the Ninja generator in CMake >= 3.0.
 set(LLVM_PARALLEL_COMPILE_JOBS "" CACHE STRING
-  "Define the maximum number of concurrent compilation jobs.")
+  "Define the maximum number of concurrent compilation jobs (Ninja only).")
 if(LLVM_PARALLEL_COMPILE_JOBS)
   if(NOT CMAKE_MAKE_PROGRAM MATCHES "ninja")
     message(WARNING "Job pooling is only available with Ninja generators.")
@@ -34,7 +34,7 @@ if(LLVM_PARALLEL_COMPILE_JOBS)
 endif()
 
 set(LLVM_PARALLEL_LINK_JOBS "" CACHE STRING
-  "Define the maximum number of concurrent link jobs.")
+  "Define the maximum number of concurrent link jobs (Ninja only).")
 if(CMAKE_MAKE_PROGRAM MATCHES "ninja")
   if(NOT LLVM_PARALLEL_LINK_JOBS AND uppercase_LLVM_ENABLE_LTO STREQUAL "THIN")
     message(STATUS "ThinLTO provides its own parallel linking - limiting parallel link jobs to 2.")
-- 
GitLab


From 63c98b331978f03c4fa392b9a032fad24596dde8 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Wed, 10 Oct 2018 21:48:34 +0000
Subject: [PATCH 0030/1116] [X86] Prevent non-temporal loads from folding into
 instructions by blocking them in X86DAGToDAGISel::IsProfitableToFold rather
 than with a predicate.

Remove tryFoldVecLoad since tryFoldLoad would call IsProfitableToFold and pick up the new check.

This saves about 5K out of ~600K on the generated isel table.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344189 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelDAGToDAG.cpp      | 32 ++++-----------
 lib/Target/X86/X86InstrFragmentsSIMD.td | 53 +++++++++++--------------
 2 files changed, 31 insertions(+), 54 deletions(-)

diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 25a8567a9c1..5eb4dbb1d98 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -239,12 +239,6 @@ namespace {
       return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
     }
 
-    // Try to fold a vector load. This makes sure the load isn't non-temporal.
-    bool tryFoldVecLoad(SDNode *Root, SDNode *P, SDValue N,
-                        SDValue &Base, SDValue &Scale,
-                        SDValue &Index, SDValue &Disp,
-                        SDValue &Segment);
-
     /// Implement addressing mode selection for inline asm expressions.
     bool SelectInlineAsmMemoryOperand(const SDValue &Op,
                                       unsigned ConstraintID,
@@ -516,6 +510,10 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
   if (N.getOpcode() != ISD::LOAD)
     return true;
 
+  // Don't fold non-temporal loads if we have an instruction for them.
+  if (useNonTemporalLoad(cast<LoadSDNode>(N)))
+    return false;
+
   // If N is a load, do additional profitability checks.
   if (U == Root) {
     switch (U->getOpcode()) {
@@ -2053,20 +2051,6 @@ bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
                     N.getOperand(1), Base, Scale, Index, Disp, Segment);
 }
 
-bool X86DAGToDAGISel::tryFoldVecLoad(SDNode *Root, SDNode *P, SDValue N,
-                                     SDValue &Base, SDValue &Scale,
-                                     SDValue &Index, SDValue &Disp,
-                                     SDValue &Segment) {
-  if (!ISD::isNON_EXTLoad(N.getNode()) ||
-      useNonTemporalLoad(cast<LoadSDNode>(N)) ||
-      !IsProfitableToFold(N, P, Root) ||
-      !IsLegalToFold(N, P, Root, OptLevel))
-    return false;
-
-  return selectAddr(N.getNode(),
-                    N.getOperand(1), Base, Scale, Index, Disp, Segment);
-}
-
 /// Return an SDNode that returns the value of the global base register.
 /// Output instructions required to initialize the global base register,
 /// if necessary.
@@ -2595,8 +2579,8 @@ MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
   // alignment on this load.
   SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
   if (MayFoldLoad && N1->getOpcode() == ISD::BITCAST && N1->hasOneUse() &&
-      tryFoldVecLoad(Node, N1.getNode(), N1.getOperand(0), Tmp0, Tmp1, Tmp2,
-                     Tmp3, Tmp4)) {
+      tryFoldLoad(Node, N1.getNode(), N1.getOperand(0), Tmp0, Tmp1, Tmp2,
+                  Tmp3, Tmp4)) {
     SDValue Load = N1.getOperand(0);
     SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
                       Load.getOperand(0) };
@@ -2632,8 +2616,8 @@ MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
   // alignment on this load.
   SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
   if (MayFoldLoad && N2->getOpcode() == ISD::BITCAST && N2->hasOneUse() &&
-      tryFoldVecLoad(Node, N2.getNode(), N2.getOperand(0), Tmp0, Tmp1, Tmp2,
-                     Tmp3, Tmp4)) {
+      tryFoldLoad(Node, N2.getNode(), N2.getOperand(0), Tmp0, Tmp1, Tmp2,
+                  Tmp3, Tmp4)) {
     SDValue Load = N2.getOperand(0);
     SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
                       Load.getOperand(0), InFlag };
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 3aa825ee84e..f750fe3ee0c 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -647,28 +647,22 @@ def sdmem : Operand<v2f64> {
 // SSE pattern fragments
 //===----------------------------------------------------------------------===//
 
-// Vector load wrappers to prevent folding of non-temporal aligned loads on
-// supporting targets.
-def vecload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
-  return !useNonTemporalLoad(cast<LoadSDNode>(N));
-}]>;
-
 // 128-bit load pattern fragments
 // NOTE: all 128-bit integer vector loads are promoted to v2i64
-def loadv4f32    : PatFrag<(ops node:$ptr), (v4f32 (vecload node:$ptr))>;
-def loadv2f64    : PatFrag<(ops node:$ptr), (v2f64 (vecload node:$ptr))>;
-def loadv2i64    : PatFrag<(ops node:$ptr), (v2i64 (vecload node:$ptr))>;
+def loadv4f32    : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>;
+def loadv2f64    : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>;
+def loadv2i64    : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>;
 
 // 256-bit load pattern fragments
 // NOTE: all 256-bit integer vector loads are promoted to v4i64
-def loadv8f32    : PatFrag<(ops node:$ptr), (v8f32 (vecload node:$ptr))>;
-def loadv4f64    : PatFrag<(ops node:$ptr), (v4f64 (vecload node:$ptr))>;
-def loadv4i64    : PatFrag<(ops node:$ptr), (v4i64 (vecload node:$ptr))>;
+def loadv8f32    : PatFrag<(ops node:$ptr), (v8f32 (load node:$ptr))>;
+def loadv4f64    : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>;
+def loadv4i64    : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>;
 
 // 512-bit load pattern fragments
-def loadv16f32   : PatFrag<(ops node:$ptr), (v16f32 (vecload node:$ptr))>;
-def loadv8f64    : PatFrag<(ops node:$ptr), (v8f64 (vecload node:$ptr))>;
-def loadv8i64    : PatFrag<(ops node:$ptr), (v8i64 (vecload node:$ptr))>;
+def loadv16f32   : PatFrag<(ops node:$ptr), (v16f32 (load node:$ptr))>;
+def loadv8f64    : PatFrag<(ops node:$ptr), (v8f64 (load node:$ptr))>;
+def loadv8i64    : PatFrag<(ops node:$ptr), (v8i64 (load node:$ptr))>;
 
 // 128-/256-/512-bit extload pattern fragments
 def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>;
@@ -682,46 +676,45 @@ def alignedstore : PatFrag<(ops node:$val, node:$ptr),
   return St->getAlignment() >= St->getMemoryVT().getStoreSize();
 }]>;
 
-// Like 'load', but always requires 128-bit vector alignment.
-def alignedvecload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+// Like 'load', but always requires vector size alignment.
+def alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
   auto *Ld = cast<LoadSDNode>(N);
-  return Ld->getAlignment() >= Ld->getMemoryVT().getStoreSize() &&
-         !useNonTemporalLoad(cast<LoadSDNode>(N));
+  return Ld->getAlignment() >= Ld->getMemoryVT().getStoreSize();
 }]>;
 
 // 128-bit aligned load pattern fragments
 // NOTE: all 128-bit integer vector loads are promoted to v2i64
 def alignedloadv4f32 : PatFrag<(ops node:$ptr),
-                               (v4f32 (alignedvecload node:$ptr))>;
+                               (v4f32 (alignedload node:$ptr))>;
 def alignedloadv2f64 : PatFrag<(ops node:$ptr),
-                               (v2f64 (alignedvecload node:$ptr))>;
+                               (v2f64 (alignedload node:$ptr))>;
 def alignedloadv2i64 : PatFrag<(ops node:$ptr),
-                               (v2i64 (alignedvecload node:$ptr))>;
+                               (v2i64 (alignedload node:$ptr))>;
 
 // 256-bit aligned load pattern fragments
 // NOTE: all 256-bit integer vector loads are promoted to v4i64
 def alignedloadv8f32 : PatFrag<(ops node:$ptr),
-                               (v8f32 (alignedvecload node:$ptr))>;
+                               (v8f32 (alignedload node:$ptr))>;
 def alignedloadv4f64 : PatFrag<(ops node:$ptr),
-                               (v4f64 (alignedvecload node:$ptr))>;
+                               (v4f64 (alignedload node:$ptr))>;
 def alignedloadv4i64 : PatFrag<(ops node:$ptr),
-                               (v4i64 (alignedvecload node:$ptr))>;
+                               (v4i64 (alignedload node:$ptr))>;
 
 // 512-bit aligned load pattern fragments
 def alignedloadv16f32 : PatFrag<(ops node:$ptr),
-                                (v16f32 (alignedvecload node:$ptr))>;
+                                (v16f32 (alignedload node:$ptr))>;
 def alignedloadv8f64  : PatFrag<(ops node:$ptr),
-                                (v8f64  (alignedvecload node:$ptr))>;
+                                (v8f64  (alignedload node:$ptr))>;
 def alignedloadv8i64  : PatFrag<(ops node:$ptr),
-                                (v8i64  (alignedvecload node:$ptr))>;
+                                (v8i64  (alignedload node:$ptr))>;
 
-// Like 'vecload', but uses special alignment checks suitable for use in
+// Like 'load', but uses special alignment checks suitable for use in
 // memory operands in most SSE instructions, which are required to
 // be naturally aligned on some targets but not on others.  If the subtarget
 // allows unaligned accesses, match any load, though this may require
 // setting a feature bit in the processor (on startup, for example).
 // Opteron 10h and later implement such a feature.
-def memop : PatFrag<(ops node:$ptr), (vecload node:$ptr), [{
+def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{
   auto *Ld = cast<LoadSDNode>(N);
   return Subtarget->hasSSEUnalignedMem() ||
          Ld->getAlignment() >= Ld->getMemoryVT().getStoreSize();
-- 
GitLab


From d784be6ea2204c4936a9cf97d3b260d8fc0b39ba Mon Sep 17 00:00:00 2001
From: Nick Desaulniers <ndesaulniers@google.com>
Date: Wed, 10 Oct 2018 22:52:32 +0000
Subject: [PATCH 0031/1116] [MC][ELF] compute entity size for explicit sections

Summary:
Global variables might declare themselves to be in explicit sections.
Calculate the entity size always to prevent assembler warnings
"entity size for SHF_MERGE not specified" when sections are to be
marked merge-able.

Fixes PR31828.

Reviewers: rnk, echristo

Reviewed By: rnk

Subscribers: llvm-commits, pirama, srhines

Differential Revision: https://reviews.llvm.org/D53056

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344197 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/TargetLoweringObjectFileImpl.cpp  | 50 +++++++++----------
 .../CodeGen/Generic/section_mergeable_size.ll |  3 ++
 2 files changed, 28 insertions(+), 25 deletions(-)
 create mode 100644 test/CodeGen/Generic/section_mergeable_size.ll

diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index f6882c40531..b046cd81d6c 100644
--- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -506,6 +506,30 @@ static const MCSymbolELF *getAssociatedSymbol(const GlobalObject *GO,
   return OtherGO ? dyn_cast<MCSymbolELF>(TM.getSymbol(OtherGO)) : nullptr;
 }
 
+static unsigned getEntrySizeForKind(SectionKind Kind) {
+  if (Kind.isMergeable1ByteCString())
+    return 1;
+  else if (Kind.isMergeable2ByteCString())
+    return 2;
+  else if (Kind.isMergeable4ByteCString())
+    return 4;
+  else if (Kind.isMergeableConst4())
+    return 4;
+  else if (Kind.isMergeableConst8())
+    return 8;
+  else if (Kind.isMergeableConst16())
+    return 16;
+  else if (Kind.isMergeableConst32())
+    return 32;
+  else {
+    // We shouldn't have mergeable C strings or mergeable constants that we
+    // didn't handle above.
+    assert(!Kind.isMergeableCString() && "unknown string width");
+    assert(!Kind.isMergeableConst() && "unknown data width");
+    return 0;
+  }
+}
+
 MCSection *TargetLoweringObjectFileELF::getExplicitSectionGlobal(
     const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
   StringRef SectionName = GO->getSection();
@@ -550,7 +574,7 @@ MCSection *TargetLoweringObjectFileELF::getExplicitSectionGlobal(
 
   MCSectionELF *Section = getContext().getELFSection(
       SectionName, getELFSectionType(SectionName, Kind), Flags,
-      /*EntrySize=*/0, Group, UniqueID, AssociatedSymbol);
+      getEntrySizeForKind(Kind), Group, UniqueID, AssociatedSymbol);
   // Make sure that we did not get some other section with incompatible sh_link.
   // This should not be possible due to UniqueID code above.
   assert(Section->getAssociatedSymbol() == AssociatedSymbol &&
@@ -577,30 +601,6 @@ static StringRef getSectionPrefixForGlobal(SectionKind Kind) {
   return ".data.rel.ro";
 }
 
-static unsigned getEntrySizeForKind(SectionKind Kind) {
-  if (Kind.isMergeable1ByteCString())
-    return 1;
-  else if (Kind.isMergeable2ByteCString())
-    return 2;
-  else if (Kind.isMergeable4ByteCString())
-    return 4;
-  else if (Kind.isMergeableConst4())
-    return 4;
-  else if (Kind.isMergeableConst8())
-    return 8;
-  else if (Kind.isMergeableConst16())
-    return 16;
-  else if (Kind.isMergeableConst32())
-    return 32;
-  else {
-    // We shouldn't have mergeable C strings or mergeable constants that we
-    // didn't handle above.
-    assert(!Kind.isMergeableCString() && "unknown string width");
-    assert(!Kind.isMergeableConst() && "unknown data width");
-    return 0;
-  }
-}
-
 static MCSectionELF *selectELFSectionForGlobal(
     MCContext &Ctx, const GlobalObject *GO, SectionKind Kind, Mangler &Mang,
     const TargetMachine &TM, bool EmitUniqueSection, unsigned Flags,
diff --git a/test/CodeGen/Generic/section_mergeable_size.ll b/test/CodeGen/Generic/section_mergeable_size.ll
new file mode 100644
index 00000000000..fbab7fe849f
--- /dev/null
+++ b/test/CodeGen/Generic/section_mergeable_size.ll
@@ -0,0 +1,3 @@
+; RUN: llc < %s | FileCheck %s
+@a = internal unnamed_addr constant [1 x [1 x i32]] zeroinitializer, section ".init.rodata", align 4
+; CHECK: .init.rodata,"aM",@progbits,4
-- 
GitLab


From 7f7ab9f57f5c84a64b8c37a2273b99682aab2811 Mon Sep 17 00:00:00 2001
From: Warren Ristow <warren.ristow@sony.com>
Date: Wed, 10 Oct 2018 22:54:31 +0000
Subject: [PATCH 0032/1116] [LTO] Account for overriding lib calls via the
 alias attribute

Given a library call that is represented as an llvm intrinsic call, but
later transformed to an actual call, if an overriding definition of that
library routine is provided indirectly via an alias, prevent LTO from
eliminating the definition.

This is a fix for PR38547.

Differential Revision: https://reviews.llvm.org/D52836


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344198 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/LTO/UpdateCompilerUsed.cpp               | 16 +++--
 test/LTO/X86/libcall-overridden-via-alias.ll | 69 ++++++++++++++++++++
 2 files changed, 80 insertions(+), 5 deletions(-)
 create mode 100755 test/LTO/X86/libcall-overridden-via-alias.ll

diff --git a/lib/LTO/UpdateCompilerUsed.cpp b/lib/LTO/UpdateCompilerUsed.cpp
index c982a5b0e5a..00482dee6e1 100644
--- a/lib/LTO/UpdateCompilerUsed.cpp
+++ b/lib/LTO/UpdateCompilerUsed.cpp
@@ -95,12 +95,18 @@ private:
     if (GV.hasPrivateLinkage())
       return;
 
-    // Conservatively append user-supplied runtime library functions to
-    // llvm.compiler.used.  These could be internalized and deleted by
-    // optimizations like -globalopt, causing problems when later optimizations
-    // add new library calls (e.g., llvm.memset => memset and printf => puts).
+    // Conservatively append user-supplied runtime library functions (supplied
+    // either directly, or via a function alias) to llvm.compiler.used.  These
+    // could be internalized and deleted by optimizations like -globalopt,
+    // causing problems when later optimizations add new library calls (e.g.,
+    // llvm.memset => memset and printf => puts).
     // Leave it to the linker to remove any dead code (e.g. with -dead_strip).
-    if (isa<Function>(GV) && Libcalls.count(GV.getName())) {
+    GlobalValue *FuncAliasee = nullptr;
+    if (isa<GlobalAlias>(GV)) {
+      auto *A = cast<GlobalAlias>(&GV);
+      FuncAliasee = dyn_cast<Function>(A->getAliasee());
+    }
+    if ((isa<Function>(GV) || FuncAliasee) && Libcalls.count(GV.getName())) {
       LLVMUsed.push_back(&GV);
       return;
     }
diff --git a/test/LTO/X86/libcall-overridden-via-alias.ll b/test/LTO/X86/libcall-overridden-via-alias.ll
new file mode 100755
index 00000000000..cac125b2843
--- /dev/null
+++ b/test/LTO/X86/libcall-overridden-via-alias.ll
@@ -0,0 +1,69 @@
+; Given a library call that is represented as an llvm intrinsic call, but
+; later transformed to an actual call, if an overriding definition of that
+; library routine is provided indirectly via an alias, verify that LTO
+; does not eliminate the definition.  This is a test for PR38547.
+;
+; RUN: llvm-as -o %t1 %s
+; RUN: llvm-lto -exported-symbol=main -save-merged-module -filetype=asm -o %t2 %t1
+; RUN: llvm-dis -o - %t2.merged.bc | FileCheck --check-prefix=CHECK_IR %s
+;
+; Check that the call is represented as an llvm intrinsic in the IR after LTO:
+; CHECK_IR-LABEL: main
+; CHECK_IR: call float @llvm.log.f32
+;
+; Check that the IR contains the overriding definition of the library routine
+; in the IR after LTO:
+; CHECK_IR: define internal float @logf(float [[X:%.*]])
+; CHECK_IR-NEXT:   [[TMP:%.*]] = fadd float [[X]], [[X]]
+; CHECK_IR-NEXT:   ret float [[TMP]]
+;
+; Check that the assembly code from LTO contains the call to the expected
+; library routine, and that the overriding definition of the library routine
+; is present:
+; RUN: FileCheck --check-prefix=CHECK_ASM %s < %t2
+; CHECK_ASM-LABEL: main:
+; CHECK_ASM: callq logf
+; CHECK_ASM-LABEL: logf:
+; CHECK_ASM-NEXT: add
+; CHECK_ASM-NEXT: ret
+
+; Produced from the following source-code:
+;
+;extern float logf(float);
+;// 'src' and 'dst' are 'volatile' to prohibit optimization.
+;volatile float src = 3.14f;
+;volatile float dst;
+;
+;int main() {
+;  dst = logf(src);
+;  return 0;
+;}
+;
+;extern float fname(float x);
+;float fname(float x) {
+;  return x + x;
+;}
+;
+;float logf(float x) __attribute__((alias("fname")));
+;
+target triple = "x86_64-unknown-linux-gnu"
+
+@src = global float 0x40091EB860000000, align 4
+@dst = common global float 0.000000e+00, align 4
+
+@logf = alias float (float), float (float)* @fname
+
+define i32 @main() local_unnamed_addr {
+entry:
+  %0 = load volatile float, float* @src, align 4
+  %1 = tail call float @llvm.log.f32(float %0)
+  store volatile float %1, float* @dst, align 4
+  ret i32 0
+}
+
+declare float @llvm.log.f32(float)
+
+define float @fname(float %x) {
+  %add = fadd float %x, %x
+  ret float %add
+}
-- 
GitLab


From 63ec2563a97241c4f743bf5202cb9aae88af3b37 Mon Sep 17 00:00:00 2001
From: Richard Smith <richard-llvm@metafoo.co.uk>
Date: Wed, 10 Oct 2018 23:13:47 +0000
Subject: [PATCH 0033/1116] Add a flag to remap manglings when reading profile
 data information.

This can be used to preserve profiling information across codebase
changes that have widespread impact on mangled names, but across which
most profiling data should still be usable. For example, when switching
from libstdc++ to libc++, or from the old libstdc++ ABI to the new ABI,
or even from a 32-bit to a 64-bit build.

The user can provide a remapping file specifying parts of mangled names
that should be treated as equivalent (eg, std::__1 should be treated as
equivalent to std::__cxx11), and profile data will be treated as
applying to a particular function if its name is equivalent to the name
of a function in the profile data under the provided equivalences. See
the documentation change for a description of how this is configured.

Remapping is supported for both sample-based profiling and instruction
profiling. We do not support remapping indirect branch target
information, but all other profile data should be remapped
appropriately.

Support is only added for the new pass manager. If someone wants to also
add support for this for the old pass manager, doing so should be
straightforward.

This is the LLVM side of Clang r344199.

Reviewers: davidxl, tejohnson, dlj, erik.pilkington

Subscribers: mehdi_amini, steven_wu, dexonsmith, llvm-commits

Differential Revision: https://reviews.llvm.org/D51249

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344200 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/LTO/Config.h                     |  3 +
 include/llvm/Passes/PassBuilder.h             | 13 ++--
 include/llvm/Transforms/IPO/SampleProfile.h   |  7 ++-
 .../Instrumentation/PGOInstrumentation.h      |  4 +-
 lib/LTO/LTO.cpp                               |  9 ++-
 lib/LTO/LTOBackend.cpp                        |  3 +-
 lib/Passes/PassBuilder.cpp                    |  9 ++-
 lib/Transforms/IPO/SampleProfile.cpp          | 48 +++++++++++----
 .../Instrumentation/PGOInstrumentation.cpp    | 23 +++++--
 test/Transforms/PGOProfile/Inputs/remap.map   |  8 +++
 .../PGOProfile/Inputs/remap.proftext          |  8 +++
 test/Transforms/PGOProfile/remap.ll           | 28 +++++++++
 .../Transforms/SampleProfile/Inputs/remap.map |  8 +++
 .../SampleProfile/Inputs/remap.prof           | 10 ++++
 test/Transforms/SampleProfile/remap.ll        | 60 +++++++++++++++++++
 tools/opt/NewPMDriver.cpp                     | 12 ++--
 16 files changed, 221 insertions(+), 32 deletions(-)
 create mode 100644 test/Transforms/PGOProfile/Inputs/remap.map
 create mode 100644 test/Transforms/PGOProfile/Inputs/remap.proftext
 create mode 100644 test/Transforms/PGOProfile/remap.ll
 create mode 100644 test/Transforms/SampleProfile/Inputs/remap.map
 create mode 100644 test/Transforms/SampleProfile/Inputs/remap.prof
 create mode 100644 test/Transforms/SampleProfile/remap.ll

diff --git a/include/llvm/LTO/Config.h b/include/llvm/LTO/Config.h
index 57bba5e3484..c0ad32f485c 100644
--- a/include/llvm/LTO/Config.h
+++ b/include/llvm/LTO/Config.h
@@ -73,6 +73,9 @@ struct Config {
   /// Sample PGO profile path.
   std::string SampleProfile;
 
+  /// Name remapping file for profile data.
+  std::string ProfileRemapping;
+
   /// The directory to store .dwo files.
   std::string DwoDir;
 
diff --git a/include/llvm/Passes/PassBuilder.h b/include/llvm/Passes/PassBuilder.h
index 02d3dc324bc..91314430a96 100644
--- a/include/llvm/Passes/PassBuilder.h
+++ b/include/llvm/Passes/PassBuilder.h
@@ -32,10 +32,13 @@ class ModuleSummaryIndex;
 /// A struct capturing PGO tunables.
 struct PGOOptions {
   PGOOptions(std::string ProfileGenFile = "", std::string ProfileUseFile = "",
-             std::string SampleProfileFile = "", bool RunProfileGen = false,
-             bool SamplePGOSupport = false)
+             std::string SampleProfileFile = "",
+             std::string ProfileRemappingFile = "",
+             bool RunProfileGen = false, bool SamplePGOSupport = false)
       : ProfileGenFile(ProfileGenFile), ProfileUseFile(ProfileUseFile),
-        SampleProfileFile(SampleProfileFile), RunProfileGen(RunProfileGen),
+        SampleProfileFile(SampleProfileFile),
+        ProfileRemappingFile(ProfileRemappingFile),
+        RunProfileGen(RunProfileGen),
         SamplePGOSupport(SamplePGOSupport || !SampleProfileFile.empty()) {
     assert((RunProfileGen ||
             !SampleProfileFile.empty() ||
@@ -45,6 +48,7 @@ struct PGOOptions {
   std::string ProfileGenFile;
   std::string ProfileUseFile;
   std::string SampleProfileFile;
+  std::string ProfileRemappingFile;
   bool RunProfileGen;
   bool SamplePGOSupport;
 };
@@ -587,7 +591,8 @@ private:
   void addPGOInstrPasses(ModulePassManager &MPM, bool DebugLogging,
                          OptimizationLevel Level, bool RunProfileGen,
                          std::string ProfileGenFile,
-                         std::string ProfileUseFile);
+                         std::string ProfileUseFile,
+                         std::string ProfileRemappingFile);
 
   void invokePeepholeEPCallbacks(FunctionPassManager &, OptimizationLevel);
 
diff --git a/include/llvm/Transforms/IPO/SampleProfile.h b/include/llvm/Transforms/IPO/SampleProfile.h
index cd5a0563898..af4a933ec1f 100644
--- a/include/llvm/Transforms/IPO/SampleProfile.h
+++ b/include/llvm/Transforms/IPO/SampleProfile.h
@@ -25,13 +25,16 @@ class Module;
 /// The sample profiler data loader pass.
 class SampleProfileLoaderPass : public PassInfoMixin<SampleProfileLoaderPass> {
 public:
-  SampleProfileLoaderPass(std::string File = "", bool IsThinLTOPreLink = false)
-      : ProfileFileName(File), IsThinLTOPreLink(IsThinLTOPreLink) {}
+  SampleProfileLoaderPass(std::string File = "", std::string RemappingFile = "",
+                          bool IsThinLTOPreLink = false)
+      : ProfileFileName(File), ProfileRemappingFileName(RemappingFile),
+        IsThinLTOPreLink(IsThinLTOPreLink) {}
 
   PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
 
 private:
   std::string ProfileFileName;
+  std::string ProfileRemappingFileName;
   bool IsThinLTOPreLink;
 };
 
diff --git a/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h b/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h
index c0b37c470b7..fdc5df68a66 100644
--- a/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h
+++ b/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h
@@ -36,12 +36,14 @@ public:
 /// The profile annotation (profile-instr-use) pass for IR based PGO.
 class PGOInstrumentationUse : public PassInfoMixin<PGOInstrumentationUse> {
 public:
-  PGOInstrumentationUse(std::string Filename = "");
+  PGOInstrumentationUse(std::string Filename = "",
+                        std::string RemappingFilename = "");
 
   PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
 
 private:
   std::string ProfileFileName;
+  std::string ProfileRemappingFileName;
 };
 
 /// The indirect function call promotion pass.
diff --git a/lib/LTO/LTO.cpp b/lib/LTO/LTO.cpp
index 50d0075a608..6942cb28af2 100644
--- a/lib/LTO/LTO.cpp
+++ b/lib/LTO/LTO.cpp
@@ -263,8 +263,15 @@ static void computeCacheKey(
 
   if (!Conf.SampleProfile.empty()) {
     auto FileOrErr = MemoryBuffer::getFile(Conf.SampleProfile);
-    if (FileOrErr)
+    if (FileOrErr) {
       Hasher.update(FileOrErr.get()->getBuffer());
+
+      if (!Conf.ProfileRemapping.empty()) {
+        FileOrErr = MemoryBuffer::getFile(Conf.ProfileRemapping);
+        if (FileOrErr)
+          Hasher.update(FileOrErr.get()->getBuffer());
+      }
+    }
   }
 
   Key = toHex(Hasher.result());
diff --git a/lib/LTO/LTOBackend.cpp b/lib/LTO/LTOBackend.cpp
index be33ab84933..20fc40de4b9 100644
--- a/lib/LTO/LTOBackend.cpp
+++ b/lib/LTO/LTOBackend.cpp
@@ -155,7 +155,8 @@ static void runNewPMPasses(Config &Conf, Module &Mod, TargetMachine *TM,
                            const ModuleSummaryIndex *ImportSummary) {
   Optional<PGOOptions> PGOOpt;
   if (!Conf.SampleProfile.empty())
-    PGOOpt = PGOOptions("", "", Conf.SampleProfile, false, true);
+    PGOOpt = PGOOptions("", "", Conf.SampleProfile, Conf.ProfileRemapping,
+                        false, true);
 
   PassBuilder PB(TM, PGOOpt);
   AAManager AA;
diff --git a/lib/Passes/PassBuilder.cpp b/lib/Passes/PassBuilder.cpp
index a880befc0d5..94afb5409e1 100644
--- a/lib/Passes/PassBuilder.cpp
+++ b/lib/Passes/PassBuilder.cpp
@@ -505,7 +505,8 @@ void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM, bool DebugLogging,
                                     PassBuilder::OptimizationLevel Level,
                                     bool RunProfileGen,
                                     std::string ProfileGenFile,
-                                    std::string ProfileUseFile) {
+                                    std::string ProfileUseFile,
+                                    std::string ProfileRemappingFile) {
   // Generally running simplification passes and the inliner with an high
   // threshold results in smaller executables, but there may be cases where
   // the size grows, so let's be conservative here and skip this simplification
@@ -559,7 +560,7 @@ void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM, bool DebugLogging,
   }
 
   if (!ProfileUseFile.empty())
-    MPM.addPass(PGOInstrumentationUse(ProfileUseFile));
+    MPM.addPass(PGOInstrumentationUse(ProfileUseFile, ProfileRemappingFile));
 }
 
 static InlineParams
@@ -605,6 +606,7 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
     // Annotate sample profile right after early FPM to ensure freshness of
     // the debug info.
     MPM.addPass(SampleProfileLoaderPass(PGOOpt->SampleProfileFile,
+                                        PGOOpt->ProfileRemappingFile,
                                         Phase == ThinLTOPhase::PreLink));
     // Do not invoke ICP in the ThinLTOPrelink phase as it makes it hard
     // for the profile annotation to be accurate in the ThinLTO backend.
@@ -657,7 +659,8 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
   if (PGOOpt && Phase != ThinLTOPhase::PostLink &&
       (!PGOOpt->ProfileGenFile.empty() || !PGOOpt->ProfileUseFile.empty())) {
     addPGOInstrPasses(MPM, DebugLogging, Level, PGOOpt->RunProfileGen,
-                      PGOOpt->ProfileGenFile, PGOOpt->ProfileUseFile);
+                      PGOOpt->ProfileGenFile, PGOOpt->ProfileUseFile,
+                      PGOOpt->ProfileRemappingFile);
     MPM.addPass(PGOIndirectCallPromotion(false, false));
   }
 
diff --git a/lib/Transforms/IPO/SampleProfile.cpp b/lib/Transforms/IPO/SampleProfile.cpp
index 182202fda05..4a69a0c2806 100644
--- a/lib/Transforms/IPO/SampleProfile.cpp
+++ b/lib/Transforms/IPO/SampleProfile.cpp
@@ -96,6 +96,13 @@ static cl::opt<std::string> SampleProfileFile(
     "sample-profile-file", cl::init(""), cl::value_desc("filename"),
     cl::desc("Profile file loaded by -sample-profile"), cl::Hidden);
 
+// The named file contains a set of transformations that may have been applied
+// to the symbol names between the program from which the sample data was
+// collected and the current program's symbols.
+static cl::opt<std::string> SampleProfileRemappingFile(
+    "sample-profile-remapping-file", cl::init(""), cl::value_desc("filename"),
+    cl::desc("Profile remapping file loaded by -sample-profile"), cl::Hidden);
+
 static cl::opt<unsigned> SampleProfileMaxPropagateIterations(
     "sample-profile-max-propagate-iterations", cl::init(100),
     cl::desc("Maximum number of iterations to go through when propagating "
@@ -183,12 +190,12 @@ private:
 class SampleProfileLoader {
 public:
   SampleProfileLoader(
-      StringRef Name, bool IsThinLTOPreLink,
+      StringRef Name, StringRef RemapName, bool IsThinLTOPreLink,
       std::function<AssumptionCache &(Function &)> GetAssumptionCache,
       std::function<TargetTransformInfo &(Function &)> GetTargetTransformInfo)
       : GetAC(std::move(GetAssumptionCache)),
         GetTTI(std::move(GetTargetTransformInfo)), Filename(Name),
-        IsThinLTOPreLink(IsThinLTOPreLink) {}
+        RemappingFilename(RemapName), IsThinLTOPreLink(IsThinLTOPreLink) {}
 
   bool doInitialization(Module &M);
   bool runOnModule(Module &M, ModuleAnalysisManager *AM,
@@ -282,6 +289,9 @@ protected:
   /// Name of the profile file to load.
   std::string Filename;
 
+  /// Name of the profile remapping file to load.
+  std::string RemappingFilename;
+
   /// Flag indicating whether the profile input loaded successfully.
   bool ProfileIsValid = false;
 
@@ -311,13 +321,14 @@ public:
 
   SampleProfileLoaderLegacyPass(StringRef Name = SampleProfileFile,
                                 bool IsThinLTOPreLink = false)
-      : ModulePass(ID), SampleLoader(Name, IsThinLTOPreLink,
-                                     [&](Function &F) -> AssumptionCache & {
-                                       return ACT->getAssumptionCache(F);
-                                     },
-                                     [&](Function &F) -> TargetTransformInfo & {
-                                       return TTIWP->getTTI(F);
-                                     }) {
+      : ModulePass(ID),
+        SampleLoader(Name, SampleProfileRemappingFile, IsThinLTOPreLink,
+                     [&](Function &F) -> AssumptionCache & {
+                       return ACT->getAssumptionCache(F);
+                     },
+                     [&](Function &F) -> TargetTransformInfo & {
+                       return TTIWP->getTTI(F);
+                     }) {
     initializeSampleProfileLoaderLegacyPassPass(
         *PassRegistry::getPassRegistry());
   }
@@ -1515,11 +1526,26 @@ bool SampleProfileLoader::doInitialization(Module &M) {
   Reader = std::move(ReaderOrErr.get());
   Reader->collectFuncsToUse(M);
   ProfileIsValid = (Reader->read() == sampleprof_error::success);
+
+  if (!RemappingFilename.empty()) {
+    // Apply profile remappings to the loaded profile data if requested.
+    // For now, we only support remapping symbols encoded using the Itanium
+    // C++ ABI's name mangling scheme.
+    ReaderOrErr = SampleProfileReaderItaniumRemapper::create(
+        RemappingFilename, Ctx, std::move(Reader));
+    if (std::error_code EC = ReaderOrErr.getError()) {
+      std::string Msg = "Could not open profile remapping file: " + EC.message();
+      Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));
+      return false;
+    }
+    Reader = std::move(ReaderOrErr.get());
+    ProfileIsValid = (Reader->read() == sampleprof_error::success);
+  }
   return true;
 }
 
 ModulePass *llvm::createSampleProfileLoaderPass() {
-  return new SampleProfileLoaderLegacyPass(SampleProfileFile);
+  return new SampleProfileLoaderLegacyPass();
 }
 
 ModulePass *llvm::createSampleProfileLoaderPass(StringRef Name) {
@@ -1612,6 +1638,8 @@ PreservedAnalyses SampleProfileLoaderPass::run(Module &M,
 
   SampleProfileLoader SampleLoader(
       ProfileFileName.empty() ? SampleProfileFile : ProfileFileName,
+      ProfileRemappingFileName.empty() ? SampleProfileRemappingFile
+                                       : ProfileRemappingFileName,
       IsThinLTOPreLink, GetAssumptionCache, GetTTI);
 
   SampleLoader.doInitialization(M);
diff --git a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index 307b7eaa219..ac851f660d9 100644
--- a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -141,6 +141,11 @@ static cl::opt<std::string>
                        cl::value_desc("filename"),
                        cl::desc("Specify the path of profile data file. This is"
                                 "mainly for test purpose."));
+static cl::opt<std::string> PGOTestProfileRemappingFile(
+    "pgo-test-profile-remapping-file", cl::init(""), cl::Hidden,
+    cl::value_desc("filename"),
+    cl::desc("Specify the path of profile remapping file. This is mainly for "
+             "test purpose."));
 
 // Command line option to disable value profiling. The default is false:
 // i.e. value profiling is enabled by default. This is for debug purpose.
@@ -1429,13 +1434,14 @@ PreservedAnalyses PGOInstrumentationGen::run(Module &M,
 }
 
 static bool annotateAllFunctions(
-    Module &M, StringRef ProfileFileName,
+    Module &M, StringRef ProfileFileName, StringRef ProfileRemappingFileName,
     function_ref<BranchProbabilityInfo *(Function &)> LookupBPI,
     function_ref<BlockFrequencyInfo *(Function &)> LookupBFI) {
   LLVM_DEBUG(dbgs() << "Read in profile counters: ");
   auto &Ctx = M.getContext();
   // Read the counter array from file.
-  auto ReaderOrErr = IndexedInstrProfReader::create(ProfileFileName);
+  auto ReaderOrErr =
+      IndexedInstrProfReader::create(ProfileFileName, ProfileRemappingFileName);
   if (Error E = ReaderOrErr.takeError()) {
     handleAllErrors(std::move(E), [&](const ErrorInfoBase &EI) {
       Ctx.diagnose(
@@ -1529,10 +1535,14 @@ static bool annotateAllFunctions(
   return true;
 }
 
-PGOInstrumentationUse::PGOInstrumentationUse(std::string Filename)
-    : ProfileFileName(std::move(Filename)) {
+PGOInstrumentationUse::PGOInstrumentationUse(std::string Filename,
+                                             std::string RemappingFilename)
+    : ProfileFileName(std::move(Filename)),
+      ProfileRemappingFileName(std::move(RemappingFilename)) {
   if (!PGOTestProfileFile.empty())
     ProfileFileName = PGOTestProfileFile;
+  if (!PGOTestProfileRemappingFile.empty())
+    ProfileRemappingFileName = PGOTestProfileRemappingFile;
 }
 
 PreservedAnalyses PGOInstrumentationUse::run(Module &M,
@@ -1547,7 +1557,8 @@ PreservedAnalyses PGOInstrumentationUse::run(Module &M,
     return &FAM.getResult<BlockFrequencyAnalysis>(F);
   };
 
-  if (!annotateAllFunctions(M, ProfileFileName, LookupBPI, LookupBFI))
+  if (!annotateAllFunctions(M, ProfileFileName, ProfileRemappingFileName,
+                            LookupBPI, LookupBFI))
     return PreservedAnalyses::all();
 
   return PreservedAnalyses::none();
@@ -1564,7 +1575,7 @@ bool PGOInstrumentationUseLegacyPass::runOnModule(Module &M) {
     return &this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
   };
 
-  return annotateAllFunctions(M, ProfileFileName, LookupBPI, LookupBFI);
+  return annotateAllFunctions(M, ProfileFileName, "", LookupBPI, LookupBFI);
 }
 
 static std::string getSimpleNodeName(const BasicBlock *Node) {
diff --git a/test/Transforms/PGOProfile/Inputs/remap.map b/test/Transforms/PGOProfile/Inputs/remap.map
new file mode 100644
index 00000000000..df3d82d38bd
--- /dev/null
+++ b/test/Transforms/PGOProfile/Inputs/remap.map
@@ -0,0 +1,8 @@
+# foo:: and foo::detail:: are equivalent
+name 3foo N3foo6detailE
+
+# foo::qux and foo::quux are equivalent
+type N3foo3quxE N3foo4quuxE
+
+# N::X and M::X are equivalent
+name N1N1XE N1M1XE
diff --git a/test/Transforms/PGOProfile/Inputs/remap.proftext b/test/Transforms/PGOProfile/Inputs/remap.proftext
new file mode 100644
index 00000000000..40054d78f5a
--- /dev/null
+++ b/test/Transforms/PGOProfile/Inputs/remap.proftext
@@ -0,0 +1,8 @@
+# :ir is the flag to indicate this is IR level profile.
+:ir
+_ZN3foo3barERKN1N1XINS_4quuxEEE
+25571299074
+2
+3
+2
+
diff --git a/test/Transforms/PGOProfile/remap.ll b/test/Transforms/PGOProfile/remap.ll
new file mode 100644
index 00000000000..2fdca9e33d1
--- /dev/null
+++ b/test/Transforms/PGOProfile/remap.ll
@@ -0,0 +1,28 @@
+; RUN: llvm-profdata merge %S/Inputs/remap.proftext -o %t.profdata
+; RUN: opt < %s -passes=pgo-instr-use -pgo-test-profile-file=%t.profdata -pgo-test-profile-remapping-file=%S/Inputs/remap.map -S | FileCheck %s --check-prefix=USE
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @_ZN3foo3barERKN1M1XINS_6detail3quxEEE(i32 %i) {
+; USE-LABEL: @_ZN3foo3barERKN1M1XINS_6detail3quxEEE
+; USE-SAME: !prof ![[FUNC_ENTRY_COUNT:[0-9]+]]
+entry:
+  %cmp = icmp sgt i32 %i, 0
+  br i1 %cmp, label %if.then, label %if.end
+; USE: br i1 %cmp, label %if.then, label %if.end
+; USE-SAME: !prof ![[BW_ENTRY:[0-9]+]]
+
+if.then:
+  %add = add nsw i32 %i, 2
+  br label %if.end
+
+if.end:
+  %retv = phi i32 [ %add, %if.then ], [ %i, %entry ]
+  ret i32 %retv
+}
+
+; USE-DAG: {{![0-9]+}} = !{i32 1, !"ProfileSummary", {{![0-9]+}}}
+; USE-DAG: {{![0-9]+}} = !{!"DetailedSummary", {{![0-9]+}}}
+; USE-DAG: ![[FUNC_ENTRY_COUNT]] = !{!"function_entry_count", i64 3}
+; USE-DAG: ![[BW_ENTRY]] = !{!"branch_weights", i32 2, i32 1}
diff --git a/test/Transforms/SampleProfile/Inputs/remap.map b/test/Transforms/SampleProfile/Inputs/remap.map
new file mode 100644
index 00000000000..df3d82d38bd
--- /dev/null
+++ b/test/Transforms/SampleProfile/Inputs/remap.map
@@ -0,0 +1,8 @@
+# foo:: and foo::detail:: are equivalent
+name 3foo N3foo6detailE
+
+# foo::qux and foo::quux are equivalent
+type N3foo3quxE N3foo4quuxE
+
+# N::X and M::X are equivalent
+name N1N1XE N1M1XE
diff --git a/test/Transforms/SampleProfile/Inputs/remap.prof b/test/Transforms/SampleProfile/Inputs/remap.prof
new file mode 100644
index 00000000000..8244a51a165
--- /dev/null
+++ b/test/Transforms/SampleProfile/Inputs/remap.prof
@@ -0,0 +1,10 @@
+_ZN3foo3barERKN1N1XINS_4quuxEEE:15680:2500
+ 1: 2500
+ 4: 1000
+ 5: 1000
+ 6: 800
+ 7: 500
+ 9: 10226
+ 10: 2243
+ 16: 0
+ 18: 0
diff --git a/test/Transforms/SampleProfile/remap.ll b/test/Transforms/SampleProfile/remap.ll
new file mode 100644
index 00000000000..206962a3bef
--- /dev/null
+++ b/test/Transforms/SampleProfile/remap.ll
@@ -0,0 +1,60 @@
+; RUN: opt %s -passes=sample-profile -sample-profile-file=%S/Inputs/remap.prof -sample-profile-remapping-file=%S/Inputs/remap.map | opt -analyze -branch-prob | FileCheck %s
+
+; Reduced from branch.ll
+
+declare i1 @foo()
+
+define void @_ZN3foo3barERKN1M1XINS_6detail3quxEEE() !dbg !2 {
+; CHECK: Printing analysis 'Branch Probability Analysis' for function '_ZN3foo3barERKN1M1XINS_6detail3quxEEE':
+
+entry:
+  %cmp = call i1 @foo(), !dbg !6
+  br i1 %cmp, label %if.then, label %if.end
+; CHECK:  edge entry -> if.then probability is 0x4ccf6b16 / 0x80000000 = 60.01%
+; CHECK:  edge entry -> if.end probability is 0x333094ea / 0x80000000 = 39.99%
+
+if.then:
+  br label %return
+
+if.end:
+  %cmp1 = call i1 @foo(), !dbg !7
+  br i1 %cmp1, label %if.then.2, label %if.else
+; CHECK: edge if.end -> if.then.2 probability is 0x6652c748 / 0x80000000 = 79.94%
+; CHECK: edge if.end -> if.else probability is 0x19ad38b8 / 0x80000000 = 20.06%
+
+if.then.2:
+  call i1 @foo(), !dbg !8
+  br label %for.cond
+
+for.cond:
+  %cmp5 = call i1 @foo()
+  br i1 %cmp5, label %for.body, label %for.end, !prof !9
+; CHECK: edge for.cond -> for.body probability is 0x73333333 / 0x80000000 = 90.00%
+; CHECK: edge for.cond -> for.end probability is 0x0ccccccd / 0x80000000 = 10.00%
+
+for.body:
+  br label %for.cond
+
+for.end:
+  br label %return
+
+if.else:
+  br label %return
+
+return:
+  ret void
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!4, !5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "foo++", isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug, enums: !{}, retainedTypes: !{})
+!1 = !DIFile(filename: "test.cc", directory: "/foo/bar")
+!2 = distinct !DISubprogram(name: "_ZN3foo3barERKN1M1XINS_6detail3quxEEE", scope: !1, file: !1, line: 4, type: !3, isLocal: false, isDefinition: true, scopeLine: 4, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !{})
+!3 = !DISubroutineType(types: !{})
+!4 = !{i32 2, !"Dwarf Version", i32 4}
+!5 = !{i32 2, !"Debug Info Version", i32 3}
+!6 = !DILocation(line: 5, column: 8, scope: !2)
+!7 = !DILocation(line: 8, column: 6, scope: !2)
+!8 = !DILocation(line: 10, column: 11, scope: !2)
+!9 = !{!"branch_weights", i32 90, i32 10}
diff --git a/tools/opt/NewPMDriver.cpp b/tools/opt/NewPMDriver.cpp
index 55ca23cd6f3..e63547a79d0 100644
--- a/tools/opt/NewPMDriver.cpp
+++ b/tools/opt/NewPMDriver.cpp
@@ -108,6 +108,10 @@ static cl::opt<PGOKind> PGOKindFlag(
                           "Use sampled profile to guide PGO.")));
 static cl::opt<std::string> ProfileFile(
     "profile-file", cl::desc("Path to the profile."), cl::Hidden);
+static cl::opt<std::string>
+    ProfileRemappingFile("profile-remapping-file",
+                         cl::desc("Path to the profile remapping file."),
+                         cl::Hidden);
 static cl::opt<bool> DebugInfoForProfiling(
     "new-pm-debug-info-for-profiling", cl::init(false), cl::Hidden,
     cl::desc("Emit special debug info to enable PGO profile generation."));
@@ -200,17 +204,17 @@ bool llvm::runPassPipeline(StringRef Arg0, Module &M, TargetMachine *TM,
   Optional<PGOOptions> P;
   switch (PGOKindFlag) {
     case InstrGen:
-      P = PGOOptions(ProfileFile, "", "", true);
+      P = PGOOptions(ProfileFile, "", "", "", true);
       break;
     case InstrUse:
-      P = PGOOptions("", ProfileFile, "", false);
+      P = PGOOptions("", ProfileFile, "", ProfileRemappingFile, false);
       break;
     case SampleUse:
-      P = PGOOptions("", "", ProfileFile, false);
+      P = PGOOptions("", "", ProfileFile, ProfileRemappingFile, false);
       break;
     case NoPGO:
       if (DebugInfoForProfiling)
-        P = PGOOptions("", "", "", false, true);
+        P = PGOOptions("", "", "", "", false, true);
       else
         P = None;
   }
-- 
GitLab


From 9528e40193fc42c0bb185ae2442a063b035e752c Mon Sep 17 00:00:00 2001
From: Saleem Abdulrasool <compnerd@compnerd.org>
Date: Wed, 10 Oct 2018 23:53:12 +0000
Subject: [PATCH 0034/1116] llvm-c: Add C APIs to access DebugLoc info

Add thin shims to C interface to provide access to DebugLoc info for
Instructions, GlobalVariables and Functions.  Patch by Josh Berdine!

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344202 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm-c/Core.h | 38 ++++++++++++++++++++++
 lib/IR/Core.cpp       | 73 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 111 insertions(+)

diff --git a/include/llvm-c/Core.h b/include/llvm-c/Core.h
index f7f22387b53..2e8c29c23bf 100644
--- a/include/llvm-c/Core.h
+++ b/include/llvm-c/Core.h
@@ -929,6 +929,44 @@ void LLVMGetNamedMetadataOperands(LLVMModuleRef M, const char *Name,
 void LLVMAddNamedMetadataOperand(LLVMModuleRef M, const char *Name,
                                  LLVMValueRef Val);
 
+/**
+ * Return the directory of the debug location for this value, which must be
+ * an llvm::Instruction, llvm::GlobalVariable, or llvm::Function.
+ *
+ * @see llvm::Instruction::getDebugLoc()
+ * @see llvm::GlobalVariable::getDebugInfo()
+ * @see llvm::Function::getSubprogram()
+ */
+const char *LLVMGetDebugLocDirectory(LLVMValueRef Val, unsigned *Length);
+
+/**
+ * Return the filename of the debug location for this value, which must be
+ * an llvm::Instruction, llvm::GlobalVariable, or llvm::Function.
+ *
+ * @see llvm::Instruction::getDebugLoc()
+ * @see llvm::GlobalVariable::getDebugInfo()
+ * @see llvm::Function::getSubprogram()
+ */
+const char *LLVMGetDebugLocFilename(LLVMValueRef Val, unsigned *Length);
+
+/**
+ * Return the line number of the debug location for this value, which must be
+ * an llvm::Instruction, llvm::GlobalVariable, or llvm::Function.
+ *
+ * @see llvm::Instruction::getDebugLoc()
+ * @see llvm::GlobalVariable::getDebugInfo()
+ * @see llvm::Function::getSubprogram()
+ */
+unsigned LLVMGetDebugLocLine(LLVMValueRef Val);
+
+/**
+ * Return the column number of the debug location for this value, which must be
+ * an llvm::Instruction.
+ *
+ * @see llvm::Instruction::getDebugLoc()
+ */
+unsigned LLVMGetDebugLocColumn(LLVMValueRef Val);
+
 /**
  * Add a function to a module under a specified name.
  *
diff --git a/lib/IR/Core.cpp b/lib/IR/Core.cpp
index 410a426a4a2..639b6b4489a 100644
--- a/lib/IR/Core.cpp
+++ b/lib/IR/Core.cpp
@@ -17,6 +17,7 @@
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/DiagnosticPrinter.h"
@@ -1189,6 +1190,78 @@ void LLVMAddNamedMetadataOperand(LLVMModuleRef M, const char *Name,
   N->addOperand(extractMDNode(unwrap<MetadataAsValue>(Val)));
 }
 
+const char *LLVMGetDebugLocDirectory(LLVMValueRef Val, unsigned *Length) {
+  if (!Length) return nullptr;
+  StringRef S;
+  if (const auto *I = unwrap<Instruction>(Val)) {
+    S = I->getDebugLoc()->getDirectory();
+  } else if (const auto *GV = unwrap<GlobalVariable>(Val)) {
+    SmallVector<DIGlobalVariableExpression *, 1> GVEs;
+    GV->getDebugInfo(GVEs);
+    if (GVEs.size())
+      if (const DIGlobalVariable *DGV = GVEs[0]->getVariable())
+        S = DGV->getDirectory();
+  } else if (const auto *F = unwrap<Function>(Val)) {
+    if (const DISubprogram *DSP = F->getSubprogram())
+      S = DSP->getDirectory();
+  } else {
+    assert(0 && "Expected Instruction, GlobalVariable or Function");
+    return nullptr;
+  }
+  *Length = S.size();
+  return S.data();
+}
+
+const char *LLVMGetDebugLocFilename(LLVMValueRef Val, unsigned *Length) {
+  if (!Length) return nullptr;
+  StringRef S;
+  if (const auto *I = unwrap<Instruction>(Val)) {
+    S = I->getDebugLoc()->getFilename();
+  } else if (const auto *GV = unwrap<GlobalVariable>(Val)) {
+    SmallVector<DIGlobalVariableExpression *, 1> GVEs;
+    GV->getDebugInfo(GVEs);
+    if (GVEs.size())
+      if (const DIGlobalVariable *DGV = GVEs[0]->getVariable())
+        S = DGV->getFilename();
+  } else if (const auto *F = unwrap<Function>(Val)) {
+    if (const DISubprogram *DSP = F->getSubprogram())
+      S = DSP->getFilename();
+  } else {
+    assert(0 && "Expected Instruction, GlobalVariable or Function");
+    return nullptr;
+  }
+  *Length = S.size();
+  return S.data();
+}
+
+unsigned LLVMGetDebugLocLine(LLVMValueRef Val) {
+  unsigned L = 0;
+  if (const auto *I = unwrap<Instruction>(Val)) {
+    L = I->getDebugLoc()->getLine();
+  } else if (const auto *GV = unwrap<GlobalVariable>(Val)) {
+    SmallVector<DIGlobalVariableExpression *, 1> GVEs;
+    GV->getDebugInfo(GVEs);
+    if (GVEs.size())
+      if (const DIGlobalVariable *DGV = GVEs[0]->getVariable())
+        L = DGV->getLine();
+  } else if (const auto *F = unwrap<Function>(Val)) {
+    if (const DISubprogram *DSP = F->getSubprogram())
+      L = DSP->getLine();
+  } else {
+    assert(0 && "Expected Instruction, GlobalVariable or Function");
+    return -1;
+  }
+  return L;
+}
+
+unsigned LLVMGetDebugLocColumn(LLVMValueRef Val) {
+  unsigned C = 0;
+  if (const auto *I = unwrap<Instruction>(Val))
+    if (const auto &L = I->getDebugLoc())
+      C = L->getColumn();
+  return C;
+}
+
 /*--.. Operations on scalar constants ......................................--*/
 
 LLVMValueRef LLVMConstInt(LLVMTypeRef IntTy, unsigned long long N,
-- 
GitLab


From ca6f7dc5ec23fe28db50bbbad42112a9c9acce17 Mon Sep 17 00:00:00 2001
From: Thomas Lively <tlively@google.com>
Date: Thu, 11 Oct 2018 00:01:25 +0000
Subject: [PATCH 0035/1116] [WebAssembly] Saturating float to int intrinsics

Summary:
Although the saturating float to int instructions are already
emitted from normal IR, the fpto{s,u}i instructions produce poison
values if the argument cannot fit in the result type. These intrinsics
are therefore necessary to get guaranteed defined saturating behavior.

Reviewers: aheejin, dschuff

Subscribers: sbc100, jgravelle-google, sunfish, llvm-commits

Differential Revision: https://reviews.llvm.org/D53004

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344204 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/IR/IntrinsicsWebAssembly.td      | 11 +++
 .../WebAssembly/WebAssemblyInstrConv.td       | 18 ++++
 .../WebAssembly/WebAssemblyInstrSIMD.td       | 10 +++
 test/CodeGen/WebAssembly/conv.ll              | 88 +++++++++++++++++++
 test/CodeGen/WebAssembly/simd-intrinsics.ll   | 48 ++++++++++
 5 files changed, 175 insertions(+)

diff --git a/include/llvm/IR/IntrinsicsWebAssembly.td b/include/llvm/IR/IntrinsicsWebAssembly.td
index 54408d317d2..adf7cb0ba0e 100644
--- a/include/llvm/IR/IntrinsicsWebAssembly.td
+++ b/include/llvm/IR/IntrinsicsWebAssembly.td
@@ -36,6 +36,17 @@ def int_wasm_mem_grow : Intrinsic<[llvm_anyint_ty],
 def int_wasm_current_memory : Intrinsic<[llvm_anyint_ty], [], [IntrReadMem]>;
 def int_wasm_grow_memory : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], []>;
 
+//===----------------------------------------------------------------------===//
+// Saturating float-to-int conversions
+//===----------------------------------------------------------------------===//
+
+def int_wasm_trunc_saturate_signed : Intrinsic<[llvm_anyint_ty],
+                                               [llvm_anyfloat_ty],
+                                               [IntrNoMem, IntrSpeculatable]>;
+def int_wasm_trunc_saturate_unsigned : Intrinsic<[llvm_anyint_ty],
+                                                 [llvm_anyfloat_ty],
+                                                 [IntrNoMem, IntrSpeculatable]>;
+
 //===----------------------------------------------------------------------===//
 // Exception handling intrinsics
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrConv.td b/lib/Target/WebAssembly/WebAssemblyInstrConv.td
index e9ba52799ee..0d772c743a7 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrConv.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrConv.td
@@ -97,6 +97,24 @@ defm I64_TRUNC_U_SAT_F64 : I<(outs I64:$dst), (ins F64:$src), (outs), (ins),
                              "i64.trunc_u:sat/f64", 0xfc07>,
                              Requires<[HasNontrappingFPToInt]>;
 
+// Lower llvm.wasm.trunc.saturate.* to saturating instructions
+def : Pat<(int_wasm_trunc_saturate_signed F32:$src),
+          (I32_TRUNC_S_SAT_F32 F32:$src)>;
+def : Pat<(int_wasm_trunc_saturate_unsigned F32:$src),
+          (I32_TRUNC_U_SAT_F32 F32:$src)>;
+def : Pat<(int_wasm_trunc_saturate_signed F64:$src),
+          (I32_TRUNC_S_SAT_F64 F64:$src)>;
+def : Pat<(int_wasm_trunc_saturate_unsigned F64:$src),
+          (I32_TRUNC_U_SAT_F64 F64:$src)>;
+def : Pat<(int_wasm_trunc_saturate_signed F32:$src),
+          (I64_TRUNC_S_SAT_F32 F32:$src)>;
+def : Pat<(int_wasm_trunc_saturate_unsigned F32:$src),
+          (I64_TRUNC_U_SAT_F32 F32:$src)>;
+def : Pat<(int_wasm_trunc_saturate_signed F64:$src),
+          (I64_TRUNC_S_SAT_F64 F64:$src)>;
+def : Pat<(int_wasm_trunc_saturate_unsigned F64:$src),
+          (I64_TRUNC_U_SAT_F64 F64:$src)>;
+
 // Conversion from floating point to integer pseudo-instructions which don't
 // trap on overflow or invalid.
 let usesCustomInserter = 1, isCodeGenOnly = 1 in {
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 90bdc17890b..4fffd979cd6 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -782,6 +782,16 @@ defm "" : SIMDConvert<v4i32, v4f32, fp_to_uint, "i32x4.trunc_sat_u/f32x4", 148>;
 defm "" : SIMDConvert<v2i64, v2f64, fp_to_sint, "i64x2.trunc_sat_s/f64x2", 149>;
 defm "" : SIMDConvert<v2i64, v2f64, fp_to_uint, "i64x2.trunc_sat_u/f64x2", 150>;
 
+// Lower llvm.wasm.trunc.saturate.* to saturating instructions
+def : Pat<(v4i32 (int_wasm_trunc_saturate_signed (v4f32 V128:$src))),
+          (fp_to_sint_v4i32_v4f32 (v4f32 V128:$src))>;
+def : Pat<(v4i32 (int_wasm_trunc_saturate_unsigned (v4f32 V128:$src))),
+          (fp_to_uint_v4i32_v4f32 (v4f32 V128:$src))>;
+def : Pat<(v2i64 (int_wasm_trunc_saturate_signed (v2f64 V128:$src))),
+          (fp_to_sint_v2i64_v2f64 (v2f64 V128:$src))>;
+def : Pat<(v2i64 (int_wasm_trunc_saturate_unsigned (v2f64 V128:$src))),
+          (fp_to_uint_v2i64_v2f64 (v2f64 V128:$src))>;
+
 // Bitcasts are nops
 // Matching bitcast t1 to t1 causes strange errors, so avoid repeating types
 foreach t1 = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in
diff --git a/test/CodeGen/WebAssembly/conv.ll b/test/CodeGen/WebAssembly/conv.ll
index bd3ae29e28e..ea1ef9737c0 100644
--- a/test/CodeGen/WebAssembly/conv.ll
+++ b/test/CodeGen/WebAssembly/conv.ll
@@ -45,6 +45,17 @@ define i32 @i32_trunc_s_f32(float %x) {
   ret i32 %a
 }
 
+; CHECK-LABEL: i32_trunc_sat_s_f32:
+; CHECK-NEXT: .param f32{{$}}
+; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: i32.trunc_s:sat/f32 $push[[NUM:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+declare i32 @llvm.wasm.trunc.saturate.signed.i32.f32(float)
+define i32 @i32_trunc_sat_s_f32(float %x) {
+  %a = call i32 @llvm.wasm.trunc.saturate.signed.i32.f32(float %x)
+  ret i32 %a
+}
+
 ; CHECK-LABEL: i32_trunc_u_f32:
 ; CHECK-NEXT: .param f32{{$}}
 ; CHECK-NEXT: .result i32{{$}}
@@ -55,6 +66,17 @@ define i32 @i32_trunc_u_f32(float %x) {
   ret i32 %a
 }
 
+; CHECK-LABEL: i32_trunc_sat_u_f32:
+; CHECK-NEXT: .param f32{{$}}
+; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: i32.trunc_u:sat/f32 $push[[NUM:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+declare i32 @llvm.wasm.trunc.saturate.unsigned.i32.f32(float)
+define i32 @i32_trunc_sat_u_f32(float %x) {
+  %a = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f32(float %x)
+  ret i32 %a
+}
+
 ; CHECK-LABEL: i32_trunc_s_f64:
 ; CHECK-NEXT: .param f64{{$}}
 ; CHECK-NEXT: .result i32{{$}}
@@ -65,6 +87,17 @@ define i32 @i32_trunc_s_f64(double %x) {
   ret i32 %a
 }
 
+; CHECK-LABEL: i32_trunc_sat_s_f64:
+; CHECK-NEXT: .param f64{{$}}
+; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: i32.trunc_s:sat/f64 $push[[NUM:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+declare i32 @llvm.wasm.trunc.saturate.signed.i32.f64(double)
+define i32 @i32_trunc_sat_s_f64(double %x) {
+  %a = call i32 @llvm.wasm.trunc.saturate.signed.i32.f64(double %x)
+  ret i32 %a
+}
+
 ; CHECK-LABEL: i32_trunc_u_f64:
 ; CHECK-NEXT: .param f64{{$}}
 ; CHECK-NEXT: .result i32{{$}}
@@ -75,6 +108,17 @@ define i32 @i32_trunc_u_f64(double %x) {
   ret i32 %a
 }
 
+; CHECK-LABEL: i32_trunc_sat_u_f64:
+; CHECK-NEXT: .param f64{{$}}
+; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: i32.trunc_u:sat/f64 $push[[NUM:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+declare i32 @llvm.wasm.trunc.saturate.unsigned.i32.f64(double)
+define i32 @i32_trunc_sat_u_f64(double %x) {
+  %a = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f64(double %x)
+  ret i32 %a
+}
+
 ; CHECK-LABEL: i64_trunc_s_f32:
 ; CHECK-NEXT: .param f32{{$}}
 ; CHECK-NEXT: .result i64{{$}}
@@ -85,6 +129,17 @@ define i64 @i64_trunc_s_f32(float %x) {
   ret i64 %a
 }
 
+; CHECK-LABEL: i64_trunc_sat_s_f32:
+; CHECK-NEXT: .param f32{{$}}
+; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: i64.trunc_s:sat/f32 $push[[NUM:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+declare i64 @llvm.wasm.trunc.saturate.signed.i64.f32(float)
+define i64 @i64_trunc_sat_s_f32(float %x) {
+  %a = call i64 @llvm.wasm.trunc.saturate.signed.i64.f32(float %x)
+  ret i64 %a
+}
+
 ; CHECK-LABEL: i64_trunc_u_f32:
 ; CHECK-NEXT: .param f32{{$}}
 ; CHECK-NEXT: .result i64{{$}}
@@ -95,6 +150,17 @@ define i64 @i64_trunc_u_f32(float %x) {
   ret i64 %a
 }
 
+; CHECK-LABEL: i64_trunc_sat_u_f32:
+; CHECK-NEXT: .param f32{{$}}
+; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: i64.trunc_u:sat/f32 $push[[NUM:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+declare i64 @llvm.wasm.trunc.saturate.unsigned.i64.f32(float)
+define i64 @i64_trunc_sat_u_f32(float %x) {
+  %a = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f32(float %x)
+  ret i64 %a
+}
+
 ; CHECK-LABEL: i64_trunc_s_f64:
 ; CHECK-NEXT: .param f64{{$}}
 ; CHECK-NEXT: .result i64{{$}}
@@ -105,6 +171,17 @@ define i64 @i64_trunc_s_f64(double %x) {
   ret i64 %a
 }
 
+; CHECK-LABEL: i64_trunc_sat_s_f64:
+; CHECK-NEXT: .param f64{{$}}
+; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: i64.trunc_s:sat/f64 $push[[NUM:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+declare i64 @llvm.wasm.trunc.saturate.signed.i64.f64(double)
+define i64 @i64_trunc_sat_s_f64(double %x) {
+  %a = call i64 @llvm.wasm.trunc.saturate.signed.i64.f64(double %x)
+  ret i64 %a
+}
+
 ; CHECK-LABEL: i64_trunc_u_f64:
 ; CHECK-NEXT: .param f64{{$}}
 ; CHECK-NEXT: .result i64{{$}}
@@ -115,6 +192,17 @@ define i64 @i64_trunc_u_f64(double %x) {
   ret i64 %a
 }
 
+; CHECK-LABEL: i64_trunc_sat_u_f64:
+; CHECK-NEXT: .param f64{{$}}
+; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: i64.trunc_u:sat/f64 $push[[NUM:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+declare i64 @llvm.wasm.trunc.saturate.unsigned.i64.f64(double)
+define i64 @i64_trunc_sat_u_f64(double %x) {
+  %a = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f64(double %x)
+  ret i64 %a
+}
+
 ; CHECK-LABEL: f32_convert_s_i32:
 ; CHECK-NEXT: .param i32{{$}}
 ; CHECK-NEXT: .result f32{{$}}
diff --git a/test/CodeGen/WebAssembly/simd-intrinsics.ll b/test/CodeGen/WebAssembly/simd-intrinsics.ll
index f9f4eb0cf9e..ab32929ceb8 100644
--- a/test/CodeGen/WebAssembly/simd-intrinsics.ll
+++ b/test/CodeGen/WebAssembly/simd-intrinsics.ll
@@ -226,6 +226,30 @@ define <4 x i32> @bitselect_v4i32(<4 x i32> %c, <4 x i32> %v1, <4 x i32> %v2) {
   ret <4 x i32> %a
 }
 
+; CHECK-LABEL: trunc_sat_s_v4i32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32x4.trunc_sat_s/f32x4 $push[[R:[0-9]+]]=, $0
+; SIMD128-NEXT: return $pop[[R]]
+declare <4 x i32> @llvm.wasm.trunc.saturate.signed.v4i32.v4f32(<4 x float>)
+define <4 x i32> @trunc_sat_s_v4i32(<4 x float> %x) {
+  %a = call <4 x i32> @llvm.wasm.trunc.saturate.signed.v4i32.v4f32(<4 x float> %x)
+  ret <4 x i32> %a
+}
+
+; CHECK-LABEL: trunc_sat_u_v4i32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32x4.trunc_sat_u/f32x4 $push[[R:[0-9]+]]=, $0
+; SIMD128-NEXT: return $pop[[R]]
+declare <4 x i32> @llvm.wasm.trunc.saturate.unsigned.v4i32.v4f32(<4 x float>)
+define <4 x i32> @trunc_sat_u_v4i32(<4 x float> %x) {
+  %a = call <4 x i32> @llvm.wasm.trunc.saturate.unsigned.v4i32.v4f32(<4 x float> %x)
+  ret <4 x i32> %a
+}
+
 ; ==============================================================================
 ; 2 x i64
 ; ==============================================================================
@@ -264,6 +288,30 @@ define <2 x i64> @bitselect_v2i64(<2 x i64> %c, <2 x i64> %v1, <2 x i64> %v2) {
   ret <2 x i64> %a
 }
 
+; CHECK-LABEL: trunc_sat_s_v2i64:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i64x2.trunc_sat_s/f64x2 $push[[R:[0-9]+]]=, $0
+; SIMD128-NEXT: return $pop[[R]]
+declare <2 x i64> @llvm.wasm.trunc.saturate.signed.v2i64.v2f64(<2 x double>)
+define <2 x i64> @trunc_sat_s_v2i64(<2 x double> %x) {
+  %a = call <2 x i64> @llvm.wasm.trunc.saturate.signed.v2i64.v2f64(<2 x double> %x)
+  ret <2 x i64> %a
+}
+
+; CHECK-LABEL: trunc_sat_u_v2i64:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i64x2.trunc_sat_u/f64x2 $push[[R:[0-9]+]]=, $0
+; SIMD128-NEXT: return $pop[[R]]
+declare <2 x i64> @llvm.wasm.trunc.saturate.unsigned.v2i64.v2f64(<2 x double>)
+define <2 x i64> @trunc_sat_u_v2i64(<2 x double> %x) {
+  %a = call <2 x i64> @llvm.wasm.trunc.saturate.unsigned.v2i64.v2f64(<2 x double> %x)
+  ret <2 x i64> %a
+}
+
 ; ==============================================================================
 ; 4 x f32
 ; ==============================================================================
-- 
GitLab


From d36df14f65d85d69bce145f509bd0b28ee92dce4 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Thu, 11 Oct 2018 00:08:59 +0000
Subject: [PATCH 0036/1116] [MC][ELF] Fix section_mergeable_size.ll

Some targets use %progbits instead of @progbits.

Updating that check with a {{[@%]}}progbits regex to make those bots happy.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344206 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/Generic/section_mergeable_size.ll | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/CodeGen/Generic/section_mergeable_size.ll b/test/CodeGen/Generic/section_mergeable_size.ll
index fbab7fe849f..0a7ddd110c4 100644
--- a/test/CodeGen/Generic/section_mergeable_size.ll
+++ b/test/CodeGen/Generic/section_mergeable_size.ll
@@ -1,3 +1,3 @@
 ; RUN: llc < %s | FileCheck %s
 @a = internal unnamed_addr constant [1 x [1 x i32]] zeroinitializer, section ".init.rodata", align 4
-; CHECK: .init.rodata,"aM",@progbits,4
+; CHECK: .init.rodata,"aM",{{[@%]}}progbits,4
-- 
GitLab


From 119d9f6b0f78730ac836eb96b931434a537dec42 Mon Sep 17 00:00:00 2001
From: Thomas Lively <tlively@google.com>
Date: Thu, 11 Oct 2018 00:49:24 +0000
Subject: [PATCH 0037/1116] [WebAssembly][NFC] Use intrinsic dag nodes directly

Summary: Instead of custom lowering to WebAssemblyISD nodes first.

Reviewers: aheejin, dschuff

Subscribers: sbc100, jgravelle-google, sunfish, llvm-commits

Differential Revision: https://reviews.llvm.org/D53119

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344211 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/WebAssembly/WebAssemblyISD.def     |  7 ----
 .../WebAssembly/WebAssemblyISelLowering.cpp   | 38 ------------------
 .../WebAssembly/WebAssemblyInstrSIMD.td       | 39 +++++++------------
 3 files changed, 14 insertions(+), 70 deletions(-)

diff --git a/lib/Target/WebAssembly/WebAssemblyISD.def b/lib/Target/WebAssembly/WebAssemblyISD.def
index f326d37944f..3c44d04598c 100644
--- a/lib/Target/WebAssembly/WebAssemblyISD.def
+++ b/lib/Target/WebAssembly/WebAssemblyISD.def
@@ -22,12 +22,5 @@ HANDLE_NODETYPE(Wrapper)
 HANDLE_NODETYPE(BR_IF)
 HANDLE_NODETYPE(BR_TABLE)
 HANDLE_NODETYPE(SHUFFLE)
-HANDLE_NODETYPE(ANYTRUE)
-HANDLE_NODETYPE(ALLTRUE)
-HANDLE_NODETYPE(BITSELECT)
-HANDLE_NODETYPE(ADD_SAT_S)
-HANDLE_NODETYPE(ADD_SAT_U)
-HANDLE_NODETYPE(SUB_SAT_S)
-HANDLE_NODETYPE(SUB_SAT_U)
 
 // add memory opcodes starting at ISD::FIRST_TARGET_MEMORY_OPCODE here...
diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 4ecbf6d7487..30c2e843408 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -966,44 +966,6 @@ WebAssemblyTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   default:
     return {}; // Don't custom lower most intrinsics.
 
-  case Intrinsic::wasm_add_saturate_signed:
-  case Intrinsic::wasm_add_saturate_unsigned:
-  case Intrinsic::wasm_sub_saturate_signed:
-  case Intrinsic::wasm_sub_saturate_unsigned: {
-    unsigned OpCode;
-    switch (IntNo) {
-    case Intrinsic::wasm_add_saturate_signed:
-      OpCode = WebAssemblyISD::ADD_SAT_S;
-      break;
-    case Intrinsic::wasm_add_saturate_unsigned:
-      OpCode = WebAssemblyISD::ADD_SAT_U;
-      break;
-    case Intrinsic::wasm_sub_saturate_signed:
-      OpCode = WebAssemblyISD::SUB_SAT_S;
-      break;
-    case Intrinsic::wasm_sub_saturate_unsigned:
-      OpCode = WebAssemblyISD::SUB_SAT_U;
-      break;
-    default:
-      llvm_unreachable("unexpected intrinsic id");
-      break;
-    }
-    return DAG.getNode(OpCode, DL, Op.getValueType(), Op.getOperand(1),
-                       Op.getOperand(2));
-  }
-
-  case Intrinsic::wasm_bitselect:
-    return DAG.getNode(WebAssemblyISD::BITSELECT, DL, Op.getValueType(),
-                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
-
-  case Intrinsic::wasm_anytrue:
-  case Intrinsic::wasm_alltrue: {
-    unsigned OpCode = IntNo == Intrinsic::wasm_anytrue
-                          ? WebAssemblyISD::ANYTRUE
-                          : WebAssemblyISD::ALLTRUE;
-    return DAG.getNode(OpCode, DL, Op.getValueType(), Op.getOperand(1));
-  }
-
   case Intrinsic::wasm_lsda:
     // TODO For now, just return 0 not to crash
     return DAG.getConstant(0, DL, Op.getValueType());
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 4fffd979cd6..419aa0b437f 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -382,7 +382,9 @@ multiclass SIMDBinary<ValueType vec_t, string vec, SDNode node, string name,
                       bits<32> simdop> {
   defm _#vec_t : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
                         (outs), (ins),
-                        [(set (vec_t V128:$dst), (node V128:$lhs, V128:$rhs))],
+                        [(set (vec_t V128:$dst),
+                          (node (vec_t V128:$lhs), (vec_t V128:$rhs))
+                        )],
                         vec#"."#name#"\t$dst, $lhs, $rhs", vec#"."#name,
                         simdop>;
 }
@@ -434,23 +436,19 @@ multiclass SIMDBinarySat<SDNode node, string name, bits<32> baseInst> {
   defm "" : SIMDBinary<v8i16, "i16x8", node, name, !add(baseInst, 2)>;
 }
 
-def wasm_saturate_t : SDTypeProfile<1, 2,
-  [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>]
->;
-def wasm_add_sat_s : SDNode<"WebAssemblyISD::ADD_SAT_S", wasm_saturate_t>;
-def wasm_add_sat_u : SDNode<"WebAssemblyISD::ADD_SAT_U", wasm_saturate_t>;
-def wasm_sub_sat_s : SDNode<"WebAssemblyISD::SUB_SAT_S", wasm_saturate_t>;
-def wasm_sub_sat_u : SDNode<"WebAssemblyISD::SUB_SAT_U", wasm_saturate_t>;
-
 // Saturating integer addition: add_saturate_s / add_saturate_u
 let isCommutable = 1 in {
-defm ADD_SAT_S : SIMDBinarySat<wasm_add_sat_s, "add_saturate_s", 40>;
-defm ADD_SAT_U : SIMDBinarySat<wasm_add_sat_u, "add_saturate_u", 41>;
+defm ADD_SAT_S :
+  SIMDBinarySat<int_wasm_add_saturate_signed, "add_saturate_s", 40>;
+defm ADD_SAT_U :
+  SIMDBinarySat<int_wasm_add_saturate_unsigned, "add_saturate_u", 41>;
 } // isCommutable = 1
 
 // Saturating integer subtraction: sub_saturate_s / sub_saturate_u
-defm SUB_SAT_S : SIMDBinarySat<wasm_sub_sat_s, "sub_saturate_s", 44>;
-defm SUB_SAT_U : SIMDBinarySat<wasm_sub_sat_u, "sub_saturate_u", 45>;
+defm SUB_SAT_S :
+  SIMDBinarySat<int_wasm_sub_saturate_signed, "sub_saturate_s", 44>;
+defm SUB_SAT_U :
+  SIMDBinarySat<int_wasm_sub_saturate_unsigned, "sub_saturate_u", 45>;
 
 //===----------------------------------------------------------------------===//
 // Bit shifts
@@ -518,16 +516,11 @@ defm "" : SIMDNot<v4i32>;
 defm "" : SIMDNot<v2i64>;
 
 // Bitwise select: v128.bitselect
-def wasm_bitselect_t : SDTypeProfile<1, 3,
-  [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]
->;
-def wasm_bitselect : SDNode<"WebAssemblyISD::BITSELECT", wasm_bitselect_t>;
-
 multiclass Bitselect<ValueType vec_t> {
   defm BITSELECT_#vec_t :
     SIMD_I<(outs V128:$dst), (ins V128:$v1, V128:$v2, V128:$c), (outs), (ins),
            [(set (vec_t V128:$dst),
-             (vec_t (wasm_bitselect
+             (vec_t (int_wasm_bitselect
                (vec_t V128:$c), (vec_t V128:$v1), (vec_t V128:$v2)
              ))
            )],
@@ -562,15 +555,11 @@ multiclass SIMDReduce<string name, SDNode op, bits<32> baseInst> {
   defm "" : SIMDReduceVec<v2i64, "i64x2", name, op, !add(baseInst, 3)>;
 }
 
-def wasm_reduce_t : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVec<1>]>;
-
 // Any lane true: any_true
-def wasm_anytrue : SDNode<"WebAssemblyISD::ANYTRUE", wasm_reduce_t>;
-defm ANYTRUE : SIMDReduce<"any_true", wasm_anytrue, 65>;
+defm ANYTRUE : SIMDReduce<"any_true", int_wasm_anytrue, 65>;
 
 // All lanes true: all_true
-def wasm_alltrue : SDNode<"WebAssemblyISD::ALLTRUE", wasm_reduce_t>;
-defm ALLTRUE : SIMDReduce<"all_true", wasm_alltrue, 69>;
+defm ALLTRUE : SIMDReduce<"all_true", int_wasm_alltrue, 69>;
 
 //===----------------------------------------------------------------------===//
 // Comparisons
-- 
GitLab


From 4599ef42e718015834c38475f2ed8865306d1f56 Mon Sep 17 00:00:00 2001
From: Zachary Turner <zturner@google.com>
Date: Thu, 11 Oct 2018 03:42:17 +0000
Subject: [PATCH 0038/1116] Use fully qualified namespace name.

llvm::detail is not the only namespace named detail.  So if
someone has done a `using namespace llvm::support`, for example,
this will fail with an ambiguous namespace name.  Granted
people generally shouldn't be using large namespaces like that,
but it's common at local function scopes.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344216 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/ADT/DenseMap.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/include/llvm/ADT/DenseMap.h b/include/llvm/ADT/DenseMap.h
index 380f1db0d04..8fe0f48adf2 100644
--- a/include/llvm/ADT/DenseMap.h
+++ b/include/llvm/ADT/DenseMap.h
@@ -46,9 +46,10 @@ struct DenseMapPair : public std::pair<KeyT, ValueT> {
 
 } // end namespace detail
 
-template <
-    typename KeyT, typename ValueT, typename KeyInfoT = DenseMapInfo<KeyT>,
-    typename Bucket = detail::DenseMapPair<KeyT, ValueT>, bool IsConst = false>
+template <typename KeyT, typename ValueT,
+          typename KeyInfoT = DenseMapInfo<KeyT>,
+          typename Bucket = llvm::detail::DenseMapPair<KeyT, ValueT>,
+          bool IsConst = false>
 class DenseMapIterator;
 
 template <typename DerivedT, typename KeyT, typename ValueT, typename KeyInfoT,
@@ -641,7 +642,7 @@ public:
 
 template <typename KeyT, typename ValueT,
           typename KeyInfoT = DenseMapInfo<KeyT>,
-          typename BucketT = detail::DenseMapPair<KeyT, ValueT>>
+          typename BucketT = llvm::detail::DenseMapPair<KeyT, ValueT>>
 class DenseMap : public DenseMapBase<DenseMap<KeyT, ValueT, KeyInfoT, BucketT>,
                                      KeyT, ValueT, KeyInfoT, BucketT> {
   friend class DenseMapBase<DenseMap, KeyT, ValueT, KeyInfoT, BucketT>;
@@ -798,7 +799,7 @@ private:
 
 template <typename KeyT, typename ValueT, unsigned InlineBuckets = 4,
           typename KeyInfoT = DenseMapInfo<KeyT>,
-          typename BucketT = detail::DenseMapPair<KeyT, ValueT>>
+          typename BucketT = llvm::detail::DenseMapPair<KeyT, ValueT>>
 class SmallDenseMap
     : public DenseMapBase<
           SmallDenseMap<KeyT, ValueT, InlineBuckets, KeyInfoT, BucketT>, KeyT,
-- 
GitLab


From 0801e41a5193a5c6d9794886524bbeb83f7f4150 Mon Sep 17 00:00:00 2001
From: Chris Bieneman <chris.bieneman@me.com>
Date: Thu, 11 Oct 2018 04:00:51 +0000
Subject: [PATCH 0039/1116] [Coverage] Apply filtered paths to summary

Summary:
The script to generate code coverage reports supports passing filter paths to llvm-cov when generating the HTML reports, but doesn't pass those paths to the summary generation as well. This results in a summary report that doesn't match the HTML report.

This patch addresses the problem by also passing the filter paths to the summary report generation.

Reviewers: vsk

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D53110

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344217 91177308-0d34-0410-b5e6-96231b3b80d8
---
 utils/prepare-code-coverage-artifact.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/utils/prepare-code-coverage-artifact.py b/utils/prepare-code-coverage-artifact.py
index 883cdd78049..5c4af242d0d 100644
--- a/utils/prepare-code-coverage-artifact.py
+++ b/utils/prepare-code-coverage-artifact.py
@@ -51,7 +51,8 @@ def prepare_html_report(host_llvm_cov, profile, report_dir, binaries,
     subprocess.check_call(invocation)
     with open(os.path.join(report_dir, 'summary.txt'), 'wb') as Summary:
         subprocess.check_call([host_llvm_cov, 'report'] + objects +
-                               ['-instr-profile', profile], stdout=Summary)
+                               ['-instr-profile', profile] + restricted_dirs,
+                               stdout=Summary)
     print('Done!')
 
 def prepare_html_reports(host_llvm_cov, profdata_path, report_dir, binaries,
-- 
GitLab


From 6fb010f388bb2cb2f00fe039123092308ac4865d Mon Sep 17 00:00:00 2001
From: Chris Bieneman <chris.bieneman@me.com>
Date: Thu, 11 Oct 2018 04:02:53 +0000
Subject: [PATCH 0040/1116] [CMake] Unconditionally add .h and .td files to
 target sources

Previously adding header and table gen files was conditional on using an IDE. Since these files have the `HEADER_FILE_ONLY` attribute applied they are ignored as sources by all non-IDE generators, so there is really no reason not to include them.

Additionally having the CMake always include these files allows the CMake-server to include them in the sources list for targets, which is valuable to anyone using CMake-server integrated tools.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344218 91177308-0d34-0410-b5e6-96231b3b80d8
---
 cmake/modules/LLVMProcessSources.cmake | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/cmake/modules/LLVMProcessSources.cmake b/cmake/modules/LLVMProcessSources.cmake
index f65f31d797c..7cbd2863500 100644
--- a/cmake/modules/LLVMProcessSources.cmake
+++ b/cmake/modules/LLVMProcessSources.cmake
@@ -52,16 +52,15 @@ function(llvm_process_sources OUT_VAR)
   cmake_parse_arguments(ARG "" "" "ADDITIONAL_HEADERS;ADDITIONAL_HEADER_DIRS" ${ARGN})
   set(sources ${ARG_UNPARSED_ARGUMENTS})
   llvm_check_source_file_list( ${sources} )
-  if( LLVM_ENABLE_IDE )
-    # This adds .td and .h files to the Visual Studio solution:
-    add_td_sources(sources)
-    find_all_header_files(hdrs "${ARG_ADDITIONAL_HEADER_DIRS}")
-    if (hdrs)
-      set_source_files_properties(${hdrs} PROPERTIES HEADER_FILE_ONLY ON)
-    endif()
-    set_source_files_properties(${ARG_ADDITIONAL_HEADERS} PROPERTIES HEADER_FILE_ONLY ON)
-    list(APPEND sources ${ARG_ADDITIONAL_HEADERS} ${hdrs})
+  
+  # This adds .td and .h files to the Visual Studio solution:
+  add_td_sources(sources)
+  find_all_header_files(hdrs "${ARG_ADDITIONAL_HEADER_DIRS}")
+  if (hdrs)
+    set_source_files_properties(${hdrs} PROPERTIES HEADER_FILE_ONLY ON)
   endif()
+  set_source_files_properties(${ARG_ADDITIONAL_HEADERS} PROPERTIES HEADER_FILE_ONLY ON)
+  list(APPEND sources ${ARG_ADDITIONAL_HEADERS} ${hdrs})
 
   set( ${OUT_VAR} ${sources} PARENT_SCOPE )
 endfunction(llvm_process_sources)
-- 
GitLab


From b1493403a4118390aeeb929e3d4ac7489dd0d167 Mon Sep 17 00:00:00 2001
From: Chris Bieneman <chris.bieneman@me.com>
Date: Thu, 11 Oct 2018 04:06:14 +0000
Subject: [PATCH 0041/1116] [CMake] Temporarily remove the LLVM_ENABLE_IDE
 option

All uses of this option have been removed, and the intent is to change the purpose and default value of this option. To prevent it from having impacts on users, this patch temporarily removes the option and purges it from CMake caches. In a few days, once this has propagated to contributors I will re-introduce the option with the new default value.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344219 91177308-0d34-0410-b5e6-96231b3b80d8
---
 cmake/modules/HandleLLVMOptions.cmake | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/cmake/modules/HandleLLVMOptions.cmake b/cmake/modules/HandleLLVMOptions.cmake
index 0daaf7d95c0..85aebf6ed71 100644
--- a/cmake/modules/HandleLLVMOptions.cmake
+++ b/cmake/modules/HandleLLVMOptions.cmake
@@ -868,12 +868,16 @@ else()
   set(LLVM_ENABLE_PLUGINS ON)
 endif()
 
-set(LLVM_ENABLE_IDE_default OFF)
-if (XCODE OR MSVC_IDE OR CMAKE_EXTRA_GENERATOR)
-  set(LLVM_ENABLE_IDE_default ON)
-endif()
-option(LLVM_ENABLE_IDE "Generate targets and process sources for use with an IDE"
-    ${LLVM_ENABLE_IDE_default})
+# Remove LLVM_ENABLE_IDE from the CMake cache. This is a temporary change to
+# allow CMake caches to be cleaned up so that we can change the default for this
+# option and how it is used.
+unset(LLVM_ENABLE_IDE CACHE)
+#set(LLVM_ENABLE_IDE_default OFF)
+#if (XCODE OR MSVC_IDE OR CMAKE_EXTRA_GENERATOR)
+#  set(LLVM_ENABLE_IDE_default ON)
+#endif()
+#option(LLVM_ENABLE_IDE "Generate targets and process sources for use with an IDE"
+#    ${LLVM_ENABLE_IDE_default})
 
 function(get_compile_definitions)
   get_directory_property(top_dir_definitions DIRECTORY ${CMAKE_SOURCE_DIR} COMPILE_DEFINITIONS)
-- 
GitLab


From d342bc787ac2c06b23bf013303587e7f086fdc18 Mon Sep 17 00:00:00 2001
From: Martin Storsjo <martin@martin.st>
Date: Thu, 11 Oct 2018 06:53:38 +0000
Subject: [PATCH 0042/1116] [llvm-nm] Include the text "@FILE" in the output of
 --help

libtool requires this text to be present, in order to conclude that
the tool supports response files. Also add an explicit test of using
response files with llvm-nm.

Differential Revision: https://reviews.llvm.org/D53064

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344222 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/tools/llvm-nm/X86/response-file.test     | 5 +++++
 test/tools/llvm-nm/libtool-response-file.test | 4 ++++
 tools/llvm-nm/llvm-nm.cpp                     | 2 ++
 3 files changed, 11 insertions(+)
 create mode 100644 test/tools/llvm-nm/X86/response-file.test
 create mode 100644 test/tools/llvm-nm/libtool-response-file.test

diff --git a/test/tools/llvm-nm/X86/response-file.test b/test/tools/llvm-nm/X86/response-file.test
new file mode 100644
index 00000000000..5c53960056c
--- /dev/null
+++ b/test/tools/llvm-nm/X86/response-file.test
@@ -0,0 +1,5 @@
+# RUN: echo "-P %p/Inputs/hello.obj.elf-x86_64" > %t-response
+# RUN: llvm-nm @%t-response | FileCheck %s
+
+CHECK: main T 0 0
+CHECK: puts U 0 0
diff --git a/test/tools/llvm-nm/libtool-response-file.test b/test/tools/llvm-nm/libtool-response-file.test
new file mode 100644
index 00000000000..5d4af74e316
--- /dev/null
+++ b/test/tools/llvm-nm/libtool-response-file.test
@@ -0,0 +1,4 @@
+RUN: llvm-nm --help | FileCheck %s
+Check that the output of llvm-nm --help contains the literal text @FILE; this
+indicates to libtool that llvm-nm does support response files.
+CHECK: @FILE
diff --git a/tools/llvm-nm/llvm-nm.cpp b/tools/llvm-nm/llvm-nm.cpp
index 7e1fd86d0b0..22fdd4ca85e 100644
--- a/tools/llvm-nm/llvm-nm.cpp
+++ b/tools/llvm-nm/llvm-nm.cpp
@@ -183,6 +183,8 @@ cl::opt<bool> DyldInfoOnly("dyldinfo-only",
 cl::opt<bool> NoLLVMBitcode("no-llvm-bc",
                             cl::desc("Disable LLVM bitcode reader"));
 
+cl::extrahelp HelpResponse("\nPass @FILE as argument to read options from FILE.\n");
+
 bool PrintAddress = true;
 
 bool MultipleFiles = false;
-- 
GitLab


From d7e48738baf8d9bfe70eea8383e0d230b1626ca6 Mon Sep 17 00:00:00 2001
From: Max Kazantsev <max.kazantsev@azul.com>
Date: Thu, 11 Oct 2018 07:22:26 +0000
Subject: [PATCH 0043/1116] [IndVars] Drop "exact" flag from lshr and udiv when
 substituting their args

There is a transform that may replace `lshr (x+1), 1` with `lshr x, 1` in case
if it can prove that the result will be the same. However the initial instruction
might have an `exact` flag set, and it now should be dropped unless we prove
that it may hold. Incorrectly set `exact` attribute may then produce poison.

Differential Revision: https://reviews.llvm.org/D53061
Reviewed By: sanjoy


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344223 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Utils/SimplifyIndVar.cpp      |  9 ++
 test/Transforms/IndVarSimplify/drop-exact.ll | 99 ++++++++++++++++++++
 2 files changed, 108 insertions(+)
 create mode 100644 test/Transforms/IndVarSimplify/drop-exact.ll

diff --git a/lib/Transforms/Utils/SimplifyIndVar.cpp b/lib/Transforms/Utils/SimplifyIndVar.cpp
index 51fda1c620b..7faf291e73d 100644
--- a/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -108,6 +108,7 @@ Value *SimplifyIndvar::foldIVUser(Instruction *UseInst, Instruction *IVOperand)
   Value *IVSrc = nullptr;
   const unsigned OperIdx = 0;
   const SCEV *FoldedExpr = nullptr;
+  bool MustDropExactFlag = false;
   switch (UseInst->getOpcode()) {
   default:
     return nullptr;
@@ -140,6 +141,11 @@ Value *SimplifyIndvar::foldIVUser(Instruction *UseInst, Instruction *IVOperand)
                            APInt::getOneBitSet(BitWidth, D->getZExtValue()));
     }
     FoldedExpr = SE->getUDivExpr(SE->getSCEV(IVSrc), SE->getSCEV(D));
+    // We might have 'exact' flag set at this point which will no longer be
+    // correct after we make the replacement.
+    if (UseInst->isExact() &&
+        SE->getSCEV(IVSrc) != SE->getMulExpr(FoldedExpr, SE->getSCEV(D)))
+      MustDropExactFlag = true;
   }
   // We have something that might fold it's operand. Compare SCEVs.
   if (!SE->isSCEVable(UseInst->getType()))
@@ -155,6 +161,9 @@ Value *SimplifyIndvar::foldIVUser(Instruction *UseInst, Instruction *IVOperand)
   UseInst->setOperand(OperIdx, IVSrc);
   assert(SE->getSCEV(UseInst) == FoldedExpr && "bad SCEV with folded oper");
 
+  if (MustDropExactFlag)
+    UseInst->dropPoisonGeneratingFlags();
+
   ++NumElimOperand;
   Changed = true;
   if (IVOperand->use_empty())
diff --git a/test/Transforms/IndVarSimplify/drop-exact.ll b/test/Transforms/IndVarSimplify/drop-exact.ll
new file mode 100644
index 00000000000..ab5b2b5a859
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/drop-exact.ll
@@ -0,0 +1,99 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -indvars -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+
+; We make a transform by getting rid of add nsw i32 %tmp17, -1; make sure that
+; we drop "exact" flag on lshr as we do it.
+define void @drop_exact(i32* %p, i64* %p1) {
+; CHECK-LABEL: @drop_exact(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    br label [[BB12:%.*]]
+; CHECK:       bb7:
+; CHECK-NEXT:    ret void
+; CHECK:       bb12:
+; CHECK-NEXT:    [[TMP13:%.*]] = phi i32 [ -47436, [[BB:%.*]] ], [ [[TMP15:%.*]], [[BB12]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = phi i32 [ 0, [[BB]] ], [ [[TMP42:%.*]], [[BB12]] ]
+; CHECK-NEXT:    [[TMP15]] = add nsw i32 [[TMP13]], -1
+; CHECK-NEXT:    [[TMP16:%.*]] = shl i32 [[TMP15]], 1
+; CHECK-NEXT:    [[TMP17:%.*]] = sub nsw i32 42831, [[TMP16]]
+; CHECK-NEXT:    [[TMP19:%.*]] = lshr i32 [[TMP17]], 1
+; CHECK-NEXT:    [[TMP20:%.*]] = urem i32 [[TMP19]], 250
+; CHECK-NEXT:    [[TMP22:%.*]] = lshr i32 [[TMP17]], 1
+; CHECK-NEXT:    store i32 [[TMP22]], i32* [[P:%.*]], align 4
+; CHECK-NEXT:    [[TMP26:%.*]] = zext i32 [[TMP20]] to i64
+; CHECK-NEXT:    store i64 [[TMP26]], i64* [[P1:%.*]], align 4
+; CHECK-NEXT:    [[TMP42]] = add nuw nsw i32 [[TMP14]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[TMP42]], 719
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[BB7:%.*]], label [[BB12]]
+;
+bb:
+  br label %bb12
+
+bb7:                                              ; preds = %bb12
+  ret void
+
+bb12:                                             ; preds = %bb12, %bb
+  %tmp13 = phi i32 [ -47436, %bb ], [ %tmp15, %bb12 ]
+  %tmp14 = phi i32 [ 0, %bb ], [ %tmp42, %bb12 ]
+  %tmp15 = add i32 %tmp13, -1
+  %tmp16 = shl i32 %tmp15, 1
+  %tmp17 = sub i32 42831, %tmp16
+  %tmp19 = lshr i32 %tmp17, 1
+  %tmp20 = urem i32 %tmp19, 250
+  %tmp21 = add nsw i32 %tmp17, -1
+  %tmp22 = lshr exact i32 %tmp21, 1
+  store i32 %tmp22, i32* %p, align 4
+  %tmp26 = zext i32 %tmp20 to i64
+  store i64 %tmp26, i64* %p1, align 4
+  %tmp42 = add nuw nsw i32 %tmp14, 1
+  %tmp43 = icmp ugt i32 %tmp14, 717
+  br i1 %tmp43, label %bb7, label %bb12
+}
+
+; Throw away add nsw i32 %tmp17, 0, do not drop exact flag.
+define void @dont_drop_exact(i32* %p, i64* %p1) {
+; CHECK-LABEL: @dont_drop_exact(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    br label [[BB12:%.*]]
+; CHECK:       bb7:
+; CHECK-NEXT:    ret void
+; CHECK:       bb12:
+; CHECK-NEXT:    [[TMP13:%.*]] = phi i32 [ -47436, [[BB:%.*]] ], [ [[TMP15:%.*]], [[BB12]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = phi i32 [ 0, [[BB]] ], [ [[TMP42:%.*]], [[BB12]] ]
+; CHECK-NEXT:    [[TMP15]] = add nsw i32 [[TMP13]], -1
+; CHECK-NEXT:    [[TMP16:%.*]] = shl i32 [[TMP15]], 1
+; CHECK-NEXT:    [[TMP17:%.*]] = sub nsw i32 42831, [[TMP16]]
+; CHECK-NEXT:    [[TMP19:%.*]] = lshr i32 [[TMP17]], 1
+; CHECK-NEXT:    [[TMP20:%.*]] = urem i32 [[TMP19]], 250
+; CHECK-NEXT:    [[TMP22:%.*]] = lshr exact i32 [[TMP17]], 1
+; CHECK-NEXT:    store i32 [[TMP22]], i32* [[P:%.*]], align 4
+; CHECK-NEXT:    [[TMP26:%.*]] = zext i32 [[TMP20]] to i64
+; CHECK-NEXT:    store i64 [[TMP26]], i64* [[P1:%.*]], align 4
+; CHECK-NEXT:    [[TMP42]] = add nuw nsw i32 [[TMP14]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[TMP42]], 719
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[BB7:%.*]], label [[BB12]]
+;
+bb:
+  br label %bb12
+
+bb7:                                              ; preds = %bb12
+  ret void
+
+bb12:                                             ; preds = %bb12, %bb
+  %tmp13 = phi i32 [ -47436, %bb ], [ %tmp15, %bb12 ]
+  %tmp14 = phi i32 [ 0, %bb ], [ %tmp42, %bb12 ]
+  %tmp15 = add i32 %tmp13, -1
+  %tmp16 = shl i32 %tmp15, 1
+  %tmp17 = sub i32 42831, %tmp16
+  %tmp19 = lshr i32 %tmp17, 1
+  %tmp20 = urem i32 %tmp19, 250
+  %tmp21 = add nsw i32 %tmp17, 0
+  %tmp22 = lshr exact i32 %tmp21, 1
+  store i32 %tmp22, i32* %p, align 4
+  %tmp26 = zext i32 %tmp20 to i64
+  store i64 %tmp26, i64* %p1, align 4
+  %tmp42 = add nuw nsw i32 %tmp14, 1
+  %tmp43 = icmp ugt i32 %tmp14, 717
+  br i1 %tmp43, label %bb7, label %bb12
+}
-- 
GitLab


From b4d0c491d053d679b29dd78bc1ac00d4df00fbd3 Mon Sep 17 00:00:00 2001
From: Roman Lebedev <lebedev.ri@gmail.com>
Date: Thu, 11 Oct 2018 07:51:13 +0000
Subject: [PATCH 0044/1116] [X86][BMI1]: X86DAGToDAGISel: select BEXTR from x &
 ~(-1 << nbits) pattern

Summary:
As discussed in D48491, we can't really do this in the TableGen,
since we need to produce *two* instructions. This only implements
one single pattern. The other 3 patterns will be in follow-ups.

I'm not sure yet if we want to also fuse shift into here
(i.e `(x >> start) & ...`)

Reviewers: RKSimon, craig.topper, spatel

Reviewed By: craig.topper

Differential Revision: https://reviews.llvm.org/D52304

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344224 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelDAGToDAG.cpp  |  83 +++++++++++++++++
 test/CodeGen/X86/extract-bits.ll    | 139 +++++++++-------------------
 test/CodeGen/X86/extract-lowbits.ll | 103 +++++++--------------
 3 files changed, 164 insertions(+), 161 deletions(-)

diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 5eb4dbb1d98..c043c7c54cc 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -451,6 +451,7 @@ namespace {
     }
 
     bool foldLoadStoreIntoMemOperand(SDNode *Node);
+    bool matchBEXTR(SDNode *Node);
     bool shrinkAndImmediate(SDNode *N);
     bool isMaskZeroExtended(SDNode *N) const;
     bool tryShiftAmountMod(SDNode *N);
@@ -2565,6 +2566,86 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
   return true;
 }
 
+// See if this is an  X & Mask  that we can match to BEXTR.
+// Where Mask is one of the following patterns:
+//   a) x &  (1 << nbits) - 1
+//   b) x & ~(-1 << nbits)
+//   c) x &  (-1 >> (32 - y))
+//   d) x << (32 - y) >> (32 - y)
+bool X86DAGToDAGISel::matchBEXTR(SDNode *Node) {
+  // BEXTR is BMI instruction. However, if we have BMI2, we prefer BZHI.
+  if (!Subtarget->hasBMI() || Subtarget->hasBMI2())
+    return false;
+
+  MVT NVT = Node->getSimpleValueType(0);
+
+  // Only supported for 32 and 64 bits.
+  if (NVT != MVT::i32 && NVT != MVT::i64)
+    return false;
+
+  SDValue NBits;
+
+  // b) x & ~(-1 << nbits)
+  auto matchPatternB = [&NBits](SDValue Mask) -> bool {
+    // Match `~()`. Must only have one use!
+    if (!isBitwiseNot(Mask) || !Mask->hasOneUse())
+      return false;
+    // Match `-1 << nbits`. Must only have one use!
+    SDValue M0 = Mask->getOperand(0);
+    if (M0->getOpcode() != ISD::SHL || !M0->hasOneUse())
+      return false;
+    if (!isAllOnesConstant(M0->getOperand(0)))
+      return false;
+    NBits = M0->getOperand(1);
+    return true;
+  };
+
+  auto matchLowBitMask = [&matchPatternB](SDValue Mask) -> bool {
+    // FIXME: patterns a, c, d.
+    return matchPatternB(Mask);
+  };
+
+  SDValue X = Node->getOperand(0);
+  SDValue Mask = Node->getOperand(1);
+
+  if (matchLowBitMask(Mask)) {
+    // Great.
+  } else {
+    std::swap(X, Mask);
+    if (!matchLowBitMask(Mask))
+      return false;
+  }
+
+  SDLoc DL(Node);
+
+  // Insert 8-bit NBits into lowest 8 bits of NVT-sized (32 or 64-bit) register.
+  // All the other bits are undefined, we do not care about them.
+  SDValue ImplDef =
+      SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, NVT), 0);
+  insertDAGNode(*CurDAG, NBits, ImplDef);
+  SDValue OrigNBits = NBits;
+  NBits = CurDAG->getTargetInsertSubreg(X86::sub_8bit, DL, NVT, ImplDef, NBits);
+  insertDAGNode(*CurDAG, OrigNBits, NBits);
+
+  // The 'control' of BEXTR has the pattern of:
+  // [15...8 bit][ 7...0 bit] location
+  // [ bit count][     shift] name
+  // I.e. 0b000000011'00000001 means  (x >> 0b1) & 0b11
+
+  // Shift NBits left by 8 bits, thus producing 'control'.
+  SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
+  SDValue Control = CurDAG->getNode(ISD::SHL, DL, NVT, NBits, C8);
+  insertDAGNode(*CurDAG, OrigNBits, Control);
+  // NOTE: could also try to extract  start  from  (x >> start)
+
+  // And finally, form the BEXTR itself.
+  SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, NVT, X, Control);
+  ReplaceNode(Node, Extract.getNode());
+  SelectCode(Extract.getNode());
+
+  return true;
+}
+
 // Emit a PCMISTR(I/M) instruction.
 MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
                                              bool MayFoldLoad, const SDLoc &dl,
@@ -2872,6 +2953,8 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
     break;
 
   case ISD::AND:
+    if (matchBEXTR(Node))
+      return;
     if (AndImmShrink && shrinkAndImmediate(Node))
       return;
 
diff --git a/test/CodeGen/X86/extract-bits.ll b/test/CodeGen/X86/extract-bits.ll
index 98c9ab271cb..b16aeb3d350 100644
--- a/test/CodeGen/X86/extract-bits.ll
+++ b/test/CodeGen/X86/extract-bits.ll
@@ -1507,16 +1507,12 @@ define i32 @bextr32_b0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_b0:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    pushl %esi
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
-; X86-BMI1NOTBM-NEXT:    movl %eax, %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %esi
-; X86-BMI1NOTBM-NEXT:    andnl %edx, %esi, %eax
-; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_b0:
@@ -1544,10 +1540,8 @@ define i32 @bextr32_b0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
-; X64-BMI1NOTBM-NEXT:    movl $-1, %eax
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    andnl %edi, %eax, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_b0:
@@ -1580,16 +1574,12 @@ define i32 @bextr32_b1_indexzext(i32 %val, i8 zeroext %numskipbits, i8 zeroext %
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_b1_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    pushl %esi
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
-; X86-BMI1NOTBM-NEXT:    movl %eax, %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %esi
-; X86-BMI1NOTBM-NEXT:    andnl %edx, %esi, %eax
-; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_b1_indexzext:
@@ -1617,10 +1607,8 @@ define i32 @bextr32_b1_indexzext(i32 %val, i8 zeroext %numskipbits, i8 zeroext %
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
-; X64-BMI1NOTBM-NEXT:    movl $-1, %eax
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    andnl %edi, %eax, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_b1_indexzext:
@@ -1656,17 +1644,13 @@ define i32 @bextr32_b2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_b2_load:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    pushl %esi
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1NOTBM-NEXT:    movl (%edx), %edx
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
-; X86-BMI1NOTBM-NEXT:    movl %eax, %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %esi
-; X86-BMI1NOTBM-NEXT:    andnl %edx, %esi, %eax
-; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_b2_load:
@@ -1697,10 +1681,8 @@ define i32 @bextr32_b2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind
 ; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X64-BMI1NOTBM-NEXT:    movl $-1, %esi
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %esi
-; X64-BMI1NOTBM-NEXT:    andnl %eax, %esi, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %eax, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_b2_load:
@@ -1735,17 +1717,13 @@ define i32 @bextr32_b3_load_indexzext(i32* %w, i8 zeroext %numskipbits, i8 zeroe
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_b3_load_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    pushl %esi
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1NOTBM-NEXT:    movl (%edx), %edx
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
-; X86-BMI1NOTBM-NEXT:    movl %eax, %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %esi
-; X86-BMI1NOTBM-NEXT:    andnl %edx, %esi, %eax
-; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_b3_load_indexzext:
@@ -1776,10 +1754,8 @@ define i32 @bextr32_b3_load_indexzext(i32* %w, i8 zeroext %numskipbits, i8 zeroe
 ; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X64-BMI1NOTBM-NEXT:    movl $-1, %esi
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %esi
-; X64-BMI1NOTBM-NEXT:    andnl %eax, %esi, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %eax, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_b3_load_indexzext:
@@ -1815,16 +1791,12 @@ define i32 @bextr32_b4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits)
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_b4_commutative:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    pushl %esi
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
-; X86-BMI1NOTBM-NEXT:    movl %eax, %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %esi
-; X86-BMI1NOTBM-NEXT:    andnl %edx, %esi, %eax
-; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_b4_commutative:
@@ -1852,10 +1824,8 @@ define i32 @bextr32_b4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits)
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
-; X64-BMI1NOTBM-NEXT:    movl $-1, %eax
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    andnl %edi, %eax, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_b4_commutative:
@@ -1896,24 +1866,19 @@ define i32 @bextr32_b5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_b5_skipextrauses:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    pushl %edi
 ; X86-BMI1NOTBM-NEXT:    pushl %esi
-; X86-BMI1NOTBM-NEXT:    pushl %eax
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %dl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl %eax, %ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
-; X86-BMI1NOTBM-NEXT:    movl $-1, %edi
-; X86-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %edi
-; X86-BMI1NOTBM-NEXT:    andnl %esi, %edi, %esi
-; X86-BMI1NOTBM-NEXT:    movl %eax, (%esp)
+; X86-BMI1NOTBM-NEXT:    subl $8, %esp
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %esi
+; X86-BMI1NOTBM-NEXT:    movl %ecx, (%esp)
 ; X86-BMI1NOTBM-NEXT:    calll use32
 ; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
-; X86-BMI1NOTBM-NEXT:    addl $4, %esp
+; X86-BMI1NOTBM-NEXT:    addl $8, %esp
 ; X86-BMI1NOTBM-NEXT:    popl %esi
-; X86-BMI1NOTBM-NEXT:    popl %edi
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_b5_skipextrauses:
@@ -1952,10 +1917,8 @@ define i32 @bextr32_b5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X64-BMI1NOTBM-NEXT:    pushq %rbx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
-; X64-BMI1NOTBM-NEXT:    movl $-1, %eax
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    andnl %edi, %eax, %ebx
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %ebx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %edi
 ; X64-BMI1NOTBM-NEXT:    callq use32
 ; X64-BMI1NOTBM-NEXT:    movl %ebx, %eax
@@ -2106,10 +2069,8 @@ define i64 @bextr64_b0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
-; X64-BMI1NOTBM-NEXT:    movq $-1, %rax
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    andnq %rdi, %rax, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_b0:
@@ -2245,13 +2206,12 @@ define i64 @bextr64_b1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %
 ;
 ; X64-BMI1NOTBM-LABEL: bextr64_b1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    # kill: def $edx killed $edx def $rdx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
-; X64-BMI1NOTBM-NEXT:    movq $-1, %rax
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    andnq %rdi, %rax, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_b1_indexzext:
@@ -2399,10 +2359,8 @@ define i64 @bextr64_b2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    movq $-1, %rsi
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rsi
-; X64-BMI1NOTBM-NEXT:    andnq %rax, %rsi, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rax, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_b2_load:
@@ -2543,14 +2501,13 @@ define i64 @bextr64_b3_load_indexzext(i64* %w, i8 zeroext %numskipbits, i8 zeroe
 ;
 ; X64-BMI1NOTBM-LABEL: bextr64_b3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    # kill: def $edx killed $edx def $rdx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
 ; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    movq $-1, %rsi
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rsi
-; X64-BMI1NOTBM-NEXT:    andnq %rax, %rsi, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rax, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_b3_load_indexzext:
@@ -2694,10 +2651,8 @@ define i64 @bextr64_b4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
-; X64-BMI1NOTBM-NEXT:    movq $-1, %rax
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    andnq %rdi, %rax, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_b4_commutative:
@@ -2876,12 +2831,10 @@ define i64 @bextr64_b5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X64-BMI1NOTBM-LABEL: bextr64_b5_skipextrauses:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    pushq %rbx
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
+; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
-; X64-BMI1NOTBM-NEXT:    movq $-1, %rax
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    andnq %rdi, %rax, %rbx
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rbx
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rdi
 ; X64-BMI1NOTBM-NEXT:    callq use64
 ; X64-BMI1NOTBM-NEXT:    movq %rbx, %rax
diff --git a/test/CodeGen/X86/extract-lowbits.ll b/test/CodeGen/X86/extract-lowbits.ll
index 4af130cd825..43df34000d4 100644
--- a/test/CodeGen/X86/extract-lowbits.ll
+++ b/test/CodeGen/X86/extract-lowbits.ll
@@ -794,10 +794,9 @@ define i32 @bzhi32_b0(i32 %val, i32 %numlowbits) nounwind {
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi32_b0:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl $-1, %eax
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    andnl {{[0-9]+}}(%esp), %eax, %eax
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_b0:
@@ -818,11 +817,8 @@ define i32 @bzhi32_b0(i32 %val, i32 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_b0:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl $-1, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    andnl %edi, %eax, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %esi
+; X64-BMI1NOTBM-NEXT:    bextrl %esi, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_b0:
@@ -847,10 +843,9 @@ define i32 @bzhi32_b1_indexzext(i32 %val, i8 zeroext %numlowbits) nounwind {
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi32_b1_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl $-1, %eax
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    andnl {{[0-9]+}}(%esp), %eax, %eax
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_b1_indexzext:
@@ -871,11 +866,8 @@ define i32 @bzhi32_b1_indexzext(i32 %val, i8 zeroext %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_b1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl $-1, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    andnl %edi, %eax, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %esi
+; X64-BMI1NOTBM-NEXT:    bextrl %esi, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_b1_indexzext:
@@ -904,9 +896,8 @@ define i32 @bzhi32_b2_load(i32* %w, i32 %numlowbits) nounwind {
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl $-1, %edx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %edx
-; X86-BMI1NOTBM-NEXT:    andnl (%eax), %edx, %eax
+; X86-BMI1NOTBM-NEXT:    shll $8, %ecx
+; X86-BMI1NOTBM-NEXT:    bextrl %ecx, (%eax), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_b2_load:
@@ -928,11 +919,8 @@ define i32 @bzhi32_b2_load(i32* %w, i32 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_b2_load:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl $-1, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    andnl (%rdi), %eax, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %esi
+; X64-BMI1NOTBM-NEXT:    bextrl %esi, (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_b2_load:
@@ -961,9 +949,8 @@ define i32 @bzhi32_b3_load_indexzext(i32* %w, i8 zeroext %numlowbits) nounwind {
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl $-1, %edx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %edx
-; X86-BMI1NOTBM-NEXT:    andnl (%eax), %edx, %eax
+; X86-BMI1NOTBM-NEXT:    shll $8, %ecx
+; X86-BMI1NOTBM-NEXT:    bextrl %ecx, (%eax), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_b3_load_indexzext:
@@ -985,11 +972,8 @@ define i32 @bzhi32_b3_load_indexzext(i32* %w, i8 zeroext %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_b3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl $-1, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    andnl (%rdi), %eax, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %esi
+; X64-BMI1NOTBM-NEXT:    bextrl %esi, (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_b3_load_indexzext:
@@ -1016,10 +1000,9 @@ define i32 @bzhi32_b4_commutative(i32 %val, i32 %numlowbits) nounwind {
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi32_b4_commutative:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl $-1, %eax
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    andnl {{[0-9]+}}(%esp), %eax, %eax
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_b4_commutative:
@@ -1040,11 +1023,8 @@ define i32 @bzhi32_b4_commutative(i32 %val, i32 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_b4_commutative:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl $-1, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    andnl %edi, %eax, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %esi
+; X64-BMI1NOTBM-NEXT:    bextrl %esi, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_b4_commutative:
@@ -1128,11 +1108,8 @@ define i64 @bzhi64_b0(i64 %val, i64 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_b0:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    movq $-1, %rax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    andnq %rdi, %rax, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
+; X64-BMI1NOTBM-NEXT:    bextrq %rsi, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_b0:
@@ -1214,11 +1191,9 @@ define i64 @bzhi64_b1_indexzext(i64 %val, i8 zeroext %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_b1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movq $-1, %rax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    andnq %rdi, %rax, %rax
+; X64-BMI1NOTBM-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
+; X64-BMI1NOTBM-NEXT:    bextrq %rsi, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_b1_indexzext:
@@ -1307,11 +1282,8 @@ define i64 @bzhi64_b2_load(i64* %w, i64 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_b2_load:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    movq $-1, %rax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    andnq (%rdi), %rax, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
+; X64-BMI1NOTBM-NEXT:    bextrq %rsi, (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_b2_load:
@@ -1399,11 +1371,9 @@ define i64 @bzhi64_b3_load_indexzext(i64* %w, i8 zeroext %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_b3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movq $-1, %rax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    andnq (%rdi), %rax, %rax
+; X64-BMI1NOTBM-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
+; X64-BMI1NOTBM-NEXT:    bextrq %rsi, (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_b3_load_indexzext:
@@ -1488,11 +1458,8 @@ define i64 @bzhi64_b4_commutative(i64 %val, i64 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_b4_commutative:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    movq $-1, %rax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    andnq %rdi, %rax, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
+; X64-BMI1NOTBM-NEXT:    bextrq %rsi, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_b4_commutative:
-- 
GitLab


From fb7e6913cb74b9709a840df8d6397a7eedd63606 Mon Sep 17 00:00:00 2001
From: Max Kazantsev <max.kazantsev@azul.com>
Date: Thu, 11 Oct 2018 08:46:39 +0000
Subject: [PATCH 0045/1116] [NFC] Factor out getOrCreateAddRecExpr method

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344227 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/ScalarEvolution.h |  4 +++
 lib/Analysis/ScalarEvolution.cpp        | 42 ++++++++++++++-----------
 2 files changed, 28 insertions(+), 18 deletions(-)

diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h
index 89918e3c205..8f4200b07e5 100644
--- a/include/llvm/Analysis/ScalarEvolution.h
+++ b/include/llvm/Analysis/ScalarEvolution.h
@@ -1833,6 +1833,10 @@ private:
   const SCEV *getOrCreateMulExpr(SmallVectorImpl<const SCEV *> &Ops,
                                  SCEV::NoWrapFlags Flags);
 
+  // Get addrec expr already created or create a new one.
+  const SCEV *getOrCreateAddRecExpr(SmallVectorImpl<const SCEV *> &Ops,
+                                    const Loop *L, SCEV::NoWrapFlags Flags);
+
   /// Return x if \p Val is f(x) where f is a 1-1 function.
   const SCEV *stripInjectiveFunctions(const SCEV *Val) const;
 
diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index d99d4767366..193020ed92f 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -2758,6 +2758,29 @@ ScalarEvolution::getOrCreateAddExpr(SmallVectorImpl<const SCEV *> &Ops,
   return S;
 }
 
+const SCEV *
+ScalarEvolution::getOrCreateAddRecExpr(SmallVectorImpl<const SCEV *> &Ops,
+                                       const Loop *L, SCEV::NoWrapFlags Flags) {
+  FoldingSetNodeID ID;
+  ID.AddInteger(scAddRecExpr);
+  for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+    ID.AddPointer(Ops[i]);
+  ID.AddPointer(L);
+  void *IP = nullptr;
+  SCEVAddRecExpr *S =
+      static_cast<SCEVAddRecExpr *>(UniqueSCEVs.FindNodeOrInsertPos(ID, IP));
+  if (!S) {
+    const SCEV **O = SCEVAllocator.Allocate<const SCEV *>(Ops.size());
+    std::uninitialized_copy(Ops.begin(), Ops.end(), O);
+    S = new (SCEVAllocator)
+        SCEVAddRecExpr(ID.Intern(SCEVAllocator), O, Ops.size(), L);
+    UniqueSCEVs.InsertNode(S, IP);
+    addToLoopUseLists(S);
+  }
+  S->setNoWrapFlags(Flags);
+  return S;
+}
+
 const SCEV *
 ScalarEvolution::getOrCreateMulExpr(SmallVectorImpl<const SCEV *> &Ops,
                                     SCEV::NoWrapFlags Flags) {
@@ -3408,24 +3431,7 @@ ScalarEvolution::getAddRecExpr(SmallVectorImpl<const SCEV *> &Operands,
 
   // Okay, it looks like we really DO need an addrec expr.  Check to see if we
   // already have one, otherwise create a new one.
-  FoldingSetNodeID ID;
-  ID.AddInteger(scAddRecExpr);
-  for (unsigned i = 0, e = Operands.size(); i != e; ++i)
-    ID.AddPointer(Operands[i]);
-  ID.AddPointer(L);
-  void *IP = nullptr;
-  SCEVAddRecExpr *S =
-    static_cast<SCEVAddRecExpr *>(UniqueSCEVs.FindNodeOrInsertPos(ID, IP));
-  if (!S) {
-    const SCEV **O = SCEVAllocator.Allocate<const SCEV *>(Operands.size());
-    std::uninitialized_copy(Operands.begin(), Operands.end(), O);
-    S = new (SCEVAllocator) SCEVAddRecExpr(ID.Intern(SCEVAllocator),
-                                           O, Operands.size(), L);
-    UniqueSCEVs.InsertNode(S, IP);
-    addToLoopUseLists(S);
-  }
-  S->setNoWrapFlags(Flags);
-  return S;
+  return getOrCreateAddRecExpr(Operands, L, Flags);
 }
 
 const SCEV *
-- 
GitLab


From 6298cb983b2b06783bc0f60b88f104cbe932b824 Mon Sep 17 00:00:00 2001
From: Calixte Denizet <cdenizet@mozilla.com>
Date: Thu, 11 Oct 2018 08:53:43 +0000
Subject: [PATCH 0046/1116] [gcov] Display the hit counter for the line of a
 function definition

Summary:
Right now there is no hit counter on the line of function.
So the idea is add the line of the function to all the lines covered by the entry block.
Tests in compiler-rt/profile will be fixed in another patch: https://reviews.llvm.org/D49854

Reviewers: marco-c, davidxl

Reviewed By: marco-c

Subscribers: sylvestre.ledru, llvm-commits

Differential Revision: https://reviews.llvm.org/D49853

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344228 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Instrumentation/GCOVProfiling.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index 625b354cc38..a060dd53513 100644
--- a/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -570,6 +570,12 @@ void GCOVProfiler::emitProfileNotes() {
                                                 Options.ExitBlockBeforeBody));
       GCOVFunction &Func = *Funcs.back();
 
+      // Add the function line number to the lines of the entry block
+      // to have a counter for the function definition.
+      Func.getBlock(&EntryBlock)
+          .getFile(SP->getFilename())
+          .addLine(SP->getLine());
+
       for (auto &BB : F) {
         GCOVBlock &Block = Func.getBlock(&BB);
         TerminatorInst *TI = BB.getTerminator();
-- 
GitLab


From 522fc4db7cd5cd3e490fb5d0a95491ed3d99b4f1 Mon Sep 17 00:00:00 2001
From: Florian Hahn <florian.hahn@arm.com>
Date: Thu, 11 Oct 2018 09:27:24 +0000
Subject: [PATCH 0047/1116] [LV] Ignore more debug info.

We can avoid doing some unnecessary work by skipping debug instructions
in a few loops. It also helps to ensure debug instructions do not
prevent vectorization, although I do not have any concrete test cases
for that.

Reviewers: rengolin, hsaito, dcaballe, aprantl, vsk

Reviewed By: rengolin, dcaballe

Differential Revision: https://reviews.llvm.org/D53091


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344232 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index c7c4568377b..cad1711b17f 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4655,7 +4655,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
   // For each block.
   for (BasicBlock *BB : TheLoop->blocks()) {
     // For each instruction in the loop.
-    for (Instruction &I : *BB) {
+    for (Instruction &I : BB->instructionsWithoutDebug()) {
       Type *T = I.getType();
 
       // Skip ignored values.
@@ -4893,7 +4893,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
 
   unsigned Index = 0;
   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
-    for (Instruction &I : *BB) {
+    for (Instruction &I : BB->instructionsWithoutDebug()) {
       IdxToInstr[Index++] = &I;
 
       // Save the end location of each USE.
-- 
GitLab


From e13d09452159012b82271b7e655a02e16da76887 Mon Sep 17 00:00:00 2001
From: Florian Hahn <florian.hahn@arm.com>
Date: Thu, 11 Oct 2018 09:46:25 +0000
Subject: [PATCH 0048/1116] [LV] Use SmallVector instead of DenseMap in
 calculateRegisterUsage (NFC).

We assign indices sequentially for seen instructions, so we can just use
a vector and push back the seen instructions. No need for using a
DenseMap.

Reviewers: hsaito, rengolin, nadav, dcaballe

Reviewed By: rengolin

Differential Revision: https://reviews.llvm.org/D53089


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344233 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index cad1711b17f..7ebe8d102b7 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4882,7 +4882,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
   using IntervalMap = DenseMap<Instruction *, unsigned>;
 
   // Maps instruction to its index.
-  DenseMap<unsigned, Instruction *> IdxToInstr;
+  SmallVector<Instruction *, 64> IdxToInstr;
   // Marks the end of each interval.
   IntervalMap EndPoint;
   // Saves the list of instruction indices that are used in the loop.
@@ -4891,10 +4891,9 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
   // defined outside the loop, such as arguments and constants.
   SmallPtrSet<Value *, 8> LoopInvariants;
 
-  unsigned Index = 0;
   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
     for (Instruction &I : BB->instructionsWithoutDebug()) {
-      IdxToInstr[Index++] = &I;
+      IdxToInstr.push_back(&I);
 
       // Save the end location of each USE.
       for (Value *U : I.operands()) {
@@ -4911,7 +4910,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
         }
 
         // Overwrite previous end points.
-        EndPoint[Instr] = Index;
+        EndPoint[Instr] = IdxToInstr.size();
         Ends.insert(Instr);
       }
     }
@@ -4948,7 +4947,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
     return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
   };
 
-  for (unsigned int i = 0; i < Index; ++i) {
+  for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
     Instruction *I = IdxToInstr[i];
 
     // Remove all of the instructions that end at this location.
-- 
GitLab


From 7a175729942d0d92d50149b3963133716d743a61 Mon Sep 17 00:00:00 2001
From: Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>
Date: Thu, 11 Oct 2018 10:39:03 +0000
Subject: [PATCH 0049/1116] [tblgen][CodeGenSchedule] Add a check for invalid
 RegisterFile definitions with zero physical registers.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344235 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp | 10 ++++------
 utils/TableGen/CodeGenSchedule.cpp                |  5 +++++
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp b/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp
index 01131253b5b..4cfe1a50f53 100644
--- a/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp
+++ b/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp
@@ -45,13 +45,11 @@ void RegisterFile::initialize(const MCSchedModel &SM, unsigned NumRegs) {
   // object. The size of every register file, as well as the mapping between
   // register files and register classes is specified via tablegen.
   const MCExtraProcessorInfo &Info = SM.getExtraProcessorInfo();
-  for (unsigned I = 0, E = Info.NumRegisterFiles; I < E; ++I) {
+
+  // Skip invalid register file at index 0.
+  for (unsigned I = 1, E = Info.NumRegisterFiles; I < E; ++I) {
     const MCRegisterFileDesc &RF = Info.RegisterFiles[I];
-    // Skip invalid register files with zero physical registers.
-    // TODO: verify this constraint in SubtargetEmitter, and convert this
-    // statement into an assert.
-    if (!RF.NumPhysRegs)
-      continue;
+    assert(RF.NumPhysRegs && "Invalid PRF with zero physical registers!");
 
     // The cost of a register definition is equivalent to the number of
     // physical registers that are allocated at register renaming stage.
diff --git a/utils/TableGen/CodeGenSchedule.cpp b/utils/TableGen/CodeGenSchedule.cpp
index 881f1a813f2..f8d7d9ad3d3 100644
--- a/utils/TableGen/CodeGenSchedule.cpp
+++ b/utils/TableGen/CodeGenSchedule.cpp
@@ -1763,6 +1763,11 @@ void CodeGenSchedModels::collectRegisterFiles() {
     // Now set the number of physical registers as well as the cost of registers
     // in each register class.
     CGRF.NumPhysRegs = RF->getValueAsInt("NumPhysRegs");
+    if (!CGRF.NumPhysRegs) {
+      PrintFatalError(RF->getLoc(),
+                      "Invalid RegisterFile with zero physical registers");
+    }
+
     RecVec RegisterClasses = RF->getValueAsListOfDefs("RegClasses");
     std::vector<int64_t> RegisterCosts = RF->getValueAsListOfInts("RegCosts");
     for (unsigned I = 0, E = RegisterClasses.size(); I < E; ++I) {
-- 
GitLab


From 3572c80cdbe16b19fa1c09846a50e8b7728b5ca1 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Thu, 11 Oct 2018 10:46:12 +0000
Subject: [PATCH 0050/1116] [InstCombine] Add tests for demand bits of min/max.
 NFC.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344236 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstCombine/minmax-demandbits.ll          | 256 ++++++++++++++++++
 1 file changed, 256 insertions(+)
 create mode 100644 test/Transforms/InstCombine/minmax-demandbits.ll

diff --git a/test/Transforms/InstCombine/minmax-demandbits.ll b/test/Transforms/InstCombine/minmax-demandbits.ll
new file mode 100644
index 00000000000..8977c19856f
--- /dev/null
+++ b/test/Transforms/InstCombine/minmax-demandbits.ll
@@ -0,0 +1,256 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+
+define i32 @and_umax_less(i32 %A) {
+; CHECK-LABEL: @and_umax_less(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[A:%.*]], 31
+; CHECK-NEXT:    [[L1:%.*]] = select i1 [[TMP1]], i32 [[A]], i32 31
+; CHECK-NEXT:    [[X:%.*]] = and i32 [[L1]], -32
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %l0 = icmp ugt i32 31, %A
+  %l1 = select i1 %l0, i32 31, i32 %A
+  %x = and i32 %l1, -32
+  ret i32 %x
+}
+
+define i32 @and_umax_muchless(i32 %A) {
+; CHECK-LABEL: @and_umax_muchless(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[A:%.*]], 12
+; CHECK-NEXT:    [[L1:%.*]] = select i1 [[TMP1]], i32 [[A]], i32 12
+; CHECK-NEXT:    [[X:%.*]] = and i32 [[L1]], -32
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %l0 = icmp ugt i32 12, %A
+  %l1 = select i1 %l0, i32 12, i32 %A
+  %x = and i32 %l1, -32
+  ret i32 %x
+}
+
+define i32 @and_umax_more(i32 %A) {
+; CHECK-LABEL: @and_umax_more(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[A:%.*]], 32
+; CHECK-NEXT:    [[L1:%.*]] = select i1 [[TMP1]], i32 [[A]], i32 32
+; CHECK-NEXT:    [[X:%.*]] = and i32 [[L1]], -32
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %l0 = icmp ugt i32 32, %A
+  %l1 = select i1 %l0, i32 32, i32 %A
+  %x = and i32 %l1, -32
+  ret i32 %x
+}
+
+define i32 @shr_umax(i32 %A) {
+; CHECK-LABEL: @shr_umax(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[A:%.*]], 15
+; CHECK-NEXT:    [[L1:%.*]] = select i1 [[TMP1]], i32 [[A]], i32 15
+; CHECK-NEXT:    [[X:%.*]] = lshr i32 [[L1]], 4
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %l0 = icmp ugt i32 15, %A
+  %l1 = select i1 %l0, i32 15, i32 %A
+  %x = lshr i32 %l1, 4
+  ret i32 %x
+}
+
+; Various constants for C2 & umax(A, C1)
+
+define i8 @t_0_1(i8 %A) {
+; CHECK-LABEL: @t_0_1(
+; CHECK-NEXT:    [[X:%.*]] = and i8 [[A:%.*]], 1
+; CHECK-NEXT:    ret i8 [[X]]
+;
+  %l2 = icmp ugt i8 %A, 0
+  %l1 = select i1 %l2, i8 %A, i8 0
+  %x = and i8 %l1, 1
+  ret i8 %x
+}
+
+define i8 @t_0_10(i8 %A) {
+; CHECK-LABEL: @t_0_10(
+; CHECK-NEXT:    [[X:%.*]] = and i8 [[A:%.*]], 10
+; CHECK-NEXT:    ret i8 [[X]]
+;
+  %l2 = icmp ugt i8 %A, 0
+  %l1 = select i1 %l2, i8 %A, i8 0
+  %x = and i8 %l1, 10
+  ret i8 %x
+}
+
+define i8 @t_1_10(i8 %A) {
+; CHECK-LABEL: @t_1_10(
+; CHECK-NEXT:    [[L2:%.*]] = icmp ugt i8 [[A:%.*]], 1
+; CHECK-NEXT:    [[L1:%.*]] = select i1 [[L2]], i8 [[A]], i8 1
+; CHECK-NEXT:    [[X:%.*]] = and i8 [[L1]], 10
+; CHECK-NEXT:    ret i8 [[X]]
+;
+  %l2 = icmp ugt i8 %A, 1
+  %l1 = select i1 %l2, i8 %A, i8 1
+  %x = and i8 %l1, 10
+  ret i8 %x
+}
+
+define i8 @t_2_4(i8 %A) {
+; CHECK-LABEL: @t_2_4(
+; CHECK-NEXT:    [[L2:%.*]] = icmp ugt i8 [[A:%.*]], 2
+; CHECK-NEXT:    [[L1:%.*]] = select i1 [[L2]], i8 [[A]], i8 2
+; CHECK-NEXT:    [[X:%.*]] = and i8 [[L1]], 4
+; CHECK-NEXT:    ret i8 [[X]]
+;
+  %l2 = icmp ugt i8 %A, 2
+  %l1 = select i1 %l2, i8 %A, i8 2
+  %x = and i8 %l1, 4
+  ret i8 %x
+}
+
+define i8 @t_2_192(i8 %A) {
+; CHECK-LABEL: @t_2_192(
+; CHECK-NEXT:    [[L2:%.*]] = icmp ugt i8 [[A:%.*]], 2
+; CHECK-NEXT:    [[L1:%.*]] = select i1 [[L2]], i8 [[A]], i8 2
+; CHECK-NEXT:    [[X:%.*]] = and i8 [[L1]], -64
+; CHECK-NEXT:    ret i8 [[X]]
+;
+  %l2 = icmp ugt i8 %A, 2
+  %l1 = select i1 %l2, i8 %A, i8 2
+  %x = and i8 %l1, -64
+  ret i8 %x
+}
+
+define i8 @t_2_63_or(i8 %A) {
+; CHECK-LABEL: @t_2_63_or(
+; CHECK-NEXT:    [[L2:%.*]] = icmp ugt i8 [[A:%.*]], 2
+; CHECK-NEXT:    [[L1:%.*]] = select i1 [[L2]], i8 [[A]], i8 2
+; CHECK-NEXT:    [[X:%.*]] = or i8 [[L1]], 63
+; CHECK-NEXT:    ret i8 [[X]]
+;
+  %l2 = icmp ugt i8 %A, 2
+  %l1 = select i1 %l2, i8 %A, i8 2
+  %x = or i8 %l1, 63
+  ret i8 %x
+}
+
+define i8 @f_1_1(i8 %A) {
+; CHECK-LABEL: @f_1_1(
+; CHECK-NEXT:    [[L2:%.*]] = icmp ugt i8 [[A:%.*]], 1
+; CHECK-NEXT:    [[L1:%.*]] = select i1 [[L2]], i8 [[A]], i8 1
+; CHECK-NEXT:    [[X:%.*]] = and i8 [[L1]], 1
+; CHECK-NEXT:    ret i8 [[X]]
+;
+  %l2 = icmp ugt i8 %A, 1
+  %l1 = select i1 %l2, i8 %A, i8 1
+  %x = and i8 %l1, 1
+  ret i8 %x
+}
+
+define i8 @f_32_32(i8 %A) {
+; CHECK-LABEL: @f_32_32(
+; CHECK-NEXT:    [[L2:%.*]] = icmp ugt i8 [[A:%.*]], 32
+; CHECK-NEXT:    [[L1:%.*]] = select i1 [[L2]], i8 [[A]], i8 32
+; CHECK-NEXT:    [[X:%.*]] = and i8 [[L1]], -32
+; CHECK-NEXT:    ret i8 [[X]]
+;
+  %l2 = icmp ugt i8 %A, 32
+  %l1 = select i1 %l2, i8 %A, i8 32
+  %x = and i8 %l1, -32
+  ret i8 %x
+}
+
+define i8 @f_191_192(i8 %A) {
+; CHECK-LABEL: @f_191_192(
+; CHECK-NEXT:    [[L2:%.*]] = icmp ugt i8 [[A:%.*]], -65
+; CHECK-NEXT:    [[L1:%.*]] = select i1 [[L2]], i8 [[A]], i8 -65
+; CHECK-NEXT:    [[X:%.*]] = and i8 [[L1]], -64
+; CHECK-NEXT:    ret i8 [[X]]
+;
+  %l2 = icmp ugt i8 %A, 191
+  %l1 = select i1 %l2, i8 %A, i8 191
+  %x = and i8 %l1, 192
+  ret i8 %x
+}
+
+define i8 @f_10_1(i8 %A) {
+; CHECK-LABEL: @f_10_1(
+; CHECK-NEXT:    [[L2:%.*]] = icmp ugt i8 [[A:%.*]], 10
+; CHECK-NEXT:    [[L1:%.*]] = select i1 [[L2]], i8 [[A]], i8 10
+; CHECK-NEXT:    [[X:%.*]] = and i8 [[L1]], 1
+; CHECK-NEXT:    ret i8 [[X]]
+;
+  %l2 = icmp ugt i8 %A, 10
+  %l1 = select i1 %l2, i8 %A, i8 10
+  %x = and i8 %l1, 1
+  ret i8 %x
+}
+
+define i32 @and_umin(i32 %A) {
+; CHECK-LABEL: @and_umin(
+; CHECK-NEXT:    ret i32 0
+;
+  %l0 = icmp ult i32 15, %A
+  %l1 = select i1 %l0, i32 15, i32 %A
+  %x = and i32 %l1, -32
+  ret i32 %x
+}
+
+define i32 @or_umin(i32 %A) {
+; CHECK-LABEL: @or_umin(
+; CHECK-NEXT:    ret i32 31
+;
+  %l0 = icmp ult i32 15, %A
+  %l1 = select i1 %l0, i32 15, i32 %A
+  %x = or i32 %l1, 31
+  ret i32 %x
+}
+
+define i8 @or_min_31_30(i8 %A) {
+; CHECK-LABEL: @or_min_31_30(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i8 [[A:%.*]], -30
+; CHECK-NEXT:    [[MIN:%.*]] = select i1 [[CMP]], i8 [[A]], i8 -30
+; CHECK-NEXT:    [[R:%.*]] = or i8 [[MIN]], 31
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %cmp = icmp ult i8 %A, -30
+  %min = select i1 %cmp, i8 %A, i8 -30
+  %r = or i8 %min, 31
+  ret i8 %r
+}
+
+define i8 @and_min_7_7(i8 %A) {
+; CHECK-LABEL: @and_min_7_7(
+; CHECK-NEXT:    [[L2:%.*]] = icmp ult i8 [[A:%.*]], -7
+; CHECK-NEXT:    [[MIN:%.*]] = select i1 [[L2]], i8 [[A]], i8 -7
+; CHECK-NEXT:    [[R:%.*]] = and i8 [[MIN]], -8
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %l2 = icmp ult i8 %A, -7
+  %min = select i1 %l2, i8 %A, i8 -7
+  %r = and i8 %min, -8
+  ret i8 %r
+}
+
+define i8 @and_min_7_8(i8 %A) {
+; CHECK-LABEL: @and_min_7_8(
+; CHECK-NEXT:    [[L2:%.*]] = icmp ult i8 [[A:%.*]], -8
+; CHECK-NEXT:    [[MIN:%.*]] = select i1 [[L2]], i8 [[A]], i8 -8
+; CHECK-NEXT:    [[R:%.*]] = and i8 [[MIN]], -8
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %l2 = icmp ult i8 %A, -8
+  %min = select i1 %l2, i8 %A, i8 -8
+  %r = and i8 %min, -8
+  ret i8 %r
+}
+
+define i8 @and_min_7_9(i8 %A) {
+; CHECK-LABEL: @and_min_7_9(
+; CHECK-NEXT:    [[L2:%.*]] = icmp ult i8 [[A:%.*]], -9
+; CHECK-NEXT:    [[MIN:%.*]] = select i1 [[L2]], i8 [[A]], i8 -9
+; CHECK-NEXT:    [[R:%.*]] = and i8 [[MIN]], -8
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %l2 = icmp ult i8 %A, -9
+  %min = select i1 %l2, i8 %A, i8 -9
+  %r = and i8 %min, -8
+  ret i8 %r
+}
+
-- 
GitLab


From c3c05ee92fb49821124862c964f91a7e3a27f0b2 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Thu, 11 Oct 2018 11:04:09 +0000
Subject: [PATCH 0051/1116] [InstCombine] Demand bits of UMax

Use the demanded bits of umax(A,C) to prove we can just use A so long as the
lowest non-zero bit of DemandMask is higher than the highest non-zero bit of C

Differential Revision: https://reviews.llvm.org/D53033


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344237 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstCombineSimplifyDemanded.cpp           | 20 ++++++++++---
 .../InstCombine/minmax-demandbits.ll          | 28 +++++--------------
 2 files changed, 23 insertions(+), 25 deletions(-)

diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 936daa828a5..18a2b2fdbfe 100644
--- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -314,11 +314,22 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
     Known.One  = std::move(IKnownOne);
     break;
   }
-  case Instruction::Select:
-    // If this is a select as part of a min/max pattern, don't simplify any
-    // further in case we break the structure.
+  case Instruction::Select: {
     Value *LHS, *RHS;
-    if (matchSelectPattern(I, LHS, RHS).Flavor != SPF_UNKNOWN)
+    SelectPatternFlavor SPF = matchSelectPattern(I, LHS, RHS).Flavor;
+    if (SPF == SPF_UMAX) {
+      // UMax(A, C) == A if ...
+      // The lowest non-zero bit of DemandMask is higher than the highest
+      // non-zero bit of C.
+      const APInt *C;
+      unsigned CTZ = DemandedMask.countTrailingZeros();
+      if (match(RHS, m_APInt(C)) && CTZ >= C->getActiveBits())
+        return LHS;
+    }
+
+    // If this is a select as part of any other min/max pattern, don't simplify
+    // any further in case we break the structure.
+    if (SPF != SPF_UNKNOWN)
       return nullptr;
 
     if (SimplifyDemandedBits(I, 2, DemandedMask, RHSKnown, Depth + 1) ||
@@ -336,6 +347,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
     Known.One = RHSKnown.One & LHSKnown.One;
     Known.Zero = RHSKnown.Zero & LHSKnown.Zero;
     break;
+  }
   case Instruction::ZExt:
   case Instruction::Trunc: {
     unsigned SrcBitWidth = I->getOperand(0)->getType()->getScalarSizeInBits();
diff --git a/test/Transforms/InstCombine/minmax-demandbits.ll b/test/Transforms/InstCombine/minmax-demandbits.ll
index 8977c19856f..f838560f965 100644
--- a/test/Transforms/InstCombine/minmax-demandbits.ll
+++ b/test/Transforms/InstCombine/minmax-demandbits.ll
@@ -4,9 +4,7 @@
 
 define i32 @and_umax_less(i32 %A) {
 ; CHECK-LABEL: @and_umax_less(
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[A:%.*]], 31
-; CHECK-NEXT:    [[L1:%.*]] = select i1 [[TMP1]], i32 [[A]], i32 31
-; CHECK-NEXT:    [[X:%.*]] = and i32 [[L1]], -32
+; CHECK-NEXT:    [[X:%.*]] = and i32 [[A:%.*]], -32
 ; CHECK-NEXT:    ret i32 [[X]]
 ;
   %l0 = icmp ugt i32 31, %A
@@ -17,9 +15,7 @@ define i32 @and_umax_less(i32 %A) {
 
 define i32 @and_umax_muchless(i32 %A) {
 ; CHECK-LABEL: @and_umax_muchless(
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[A:%.*]], 12
-; CHECK-NEXT:    [[L1:%.*]] = select i1 [[TMP1]], i32 [[A]], i32 12
-; CHECK-NEXT:    [[X:%.*]] = and i32 [[L1]], -32
+; CHECK-NEXT:    [[X:%.*]] = and i32 [[A:%.*]], -32
 ; CHECK-NEXT:    ret i32 [[X]]
 ;
   %l0 = icmp ugt i32 12, %A
@@ -43,9 +39,7 @@ define i32 @and_umax_more(i32 %A) {
 
 define i32 @shr_umax(i32 %A) {
 ; CHECK-LABEL: @shr_umax(
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[A:%.*]], 15
-; CHECK-NEXT:    [[L1:%.*]] = select i1 [[TMP1]], i32 [[A]], i32 15
-; CHECK-NEXT:    [[X:%.*]] = lshr i32 [[L1]], 4
+; CHECK-NEXT:    [[X:%.*]] = lshr i32 [[A:%.*]], 4
 ; CHECK-NEXT:    ret i32 [[X]]
 ;
   %l0 = icmp ugt i32 15, %A
@@ -80,9 +74,7 @@ define i8 @t_0_10(i8 %A) {
 
 define i8 @t_1_10(i8 %A) {
 ; CHECK-LABEL: @t_1_10(
-; CHECK-NEXT:    [[L2:%.*]] = icmp ugt i8 [[A:%.*]], 1
-; CHECK-NEXT:    [[L1:%.*]] = select i1 [[L2]], i8 [[A]], i8 1
-; CHECK-NEXT:    [[X:%.*]] = and i8 [[L1]], 10
+; CHECK-NEXT:    [[X:%.*]] = and i8 [[A:%.*]], 10
 ; CHECK-NEXT:    ret i8 [[X]]
 ;
   %l2 = icmp ugt i8 %A, 1
@@ -93,9 +85,7 @@ define i8 @t_1_10(i8 %A) {
 
 define i8 @t_2_4(i8 %A) {
 ; CHECK-LABEL: @t_2_4(
-; CHECK-NEXT:    [[L2:%.*]] = icmp ugt i8 [[A:%.*]], 2
-; CHECK-NEXT:    [[L1:%.*]] = select i1 [[L2]], i8 [[A]], i8 2
-; CHECK-NEXT:    [[X:%.*]] = and i8 [[L1]], 4
+; CHECK-NEXT:    [[X:%.*]] = and i8 [[A:%.*]], 4
 ; CHECK-NEXT:    ret i8 [[X]]
 ;
   %l2 = icmp ugt i8 %A, 2
@@ -106,9 +96,7 @@ define i8 @t_2_4(i8 %A) {
 
 define i8 @t_2_192(i8 %A) {
 ; CHECK-LABEL: @t_2_192(
-; CHECK-NEXT:    [[L2:%.*]] = icmp ugt i8 [[A:%.*]], 2
-; CHECK-NEXT:    [[L1:%.*]] = select i1 [[L2]], i8 [[A]], i8 2
-; CHECK-NEXT:    [[X:%.*]] = and i8 [[L1]], -64
+; CHECK-NEXT:    [[X:%.*]] = and i8 [[A:%.*]], -64
 ; CHECK-NEXT:    ret i8 [[X]]
 ;
   %l2 = icmp ugt i8 %A, 2
@@ -119,9 +107,7 @@ define i8 @t_2_192(i8 %A) {
 
 define i8 @t_2_63_or(i8 %A) {
 ; CHECK-LABEL: @t_2_63_or(
-; CHECK-NEXT:    [[L2:%.*]] = icmp ugt i8 [[A:%.*]], 2
-; CHECK-NEXT:    [[L1:%.*]] = select i1 [[L2]], i8 [[A]], i8 2
-; CHECK-NEXT:    [[X:%.*]] = or i8 [[L1]], 63
+; CHECK-NEXT:    [[X:%.*]] = or i8 [[A:%.*]], 63
 ; CHECK-NEXT:    ret i8 [[X]]
 ;
   %l2 = icmp ugt i8 %A, 2
-- 
GitLab


From 3e9802e2f980fe66a85921585d0fdcfb40d96184 Mon Sep 17 00:00:00 2001
From: Alex Bradbury <asb@lowrisc.org>
Date: Thu, 11 Oct 2018 11:11:58 +0000
Subject: [PATCH 0052/1116] [RISCV] Re-generate test/CodeGen/RISCV/vararg.ll
 after r344142

The improved load-store forwarding committed in r344142 broke this test.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344238 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/RISCV/vararg.ll | 66 +++++++++++++++++-------------------
 1 file changed, 32 insertions(+), 34 deletions(-)

diff --git a/test/CodeGen/RISCV/vararg.ll b/test/CodeGen/RISCV/vararg.ll
index ac08f346fbb..77f8f300956 100644
--- a/test/CodeGen/RISCV/vararg.ll
+++ b/test/CodeGen/RISCV/vararg.ll
@@ -17,16 +17,16 @@ define i32 @va1(i8* %fmt, ...) nounwind {
 ; RV32I-FPELIM-LABEL: va1:
 ; RV32I-FPELIM:       # %bb.0:
 ; RV32I-FPELIM-NEXT:    addi sp, sp, -48
-; RV32I-FPELIM-NEXT:    sw a1, 20(sp)
+; RV32I-FPELIM-NEXT:    mv a0, a1
 ; RV32I-FPELIM-NEXT:    sw a7, 44(sp)
 ; RV32I-FPELIM-NEXT:    sw a6, 40(sp)
 ; RV32I-FPELIM-NEXT:    sw a5, 36(sp)
 ; RV32I-FPELIM-NEXT:    sw a4, 32(sp)
 ; RV32I-FPELIM-NEXT:    sw a3, 28(sp)
 ; RV32I-FPELIM-NEXT:    sw a2, 24(sp)
-; RV32I-FPELIM-NEXT:    addi a0, sp, 24
-; RV32I-FPELIM-NEXT:    sw a0, 12(sp)
-; RV32I-FPELIM-NEXT:    lw a0, 20(sp)
+; RV32I-FPELIM-NEXT:    addi a1, sp, 24
+; RV32I-FPELIM-NEXT:    sw a1, 12(sp)
+; RV32I-FPELIM-NEXT:    sw a0, 20(sp)
 ; RV32I-FPELIM-NEXT:    addi sp, sp, 48
 ; RV32I-FPELIM-NEXT:    ret
 ;
@@ -36,16 +36,16 @@ define i32 @va1(i8* %fmt, ...) nounwind {
 ; RV32I-WITHFP-NEXT:    sw ra, 12(sp)
 ; RV32I-WITHFP-NEXT:    sw s0, 8(sp)
 ; RV32I-WITHFP-NEXT:    addi s0, sp, 16
-; RV32I-WITHFP-NEXT:    sw a1, 4(s0)
+; RV32I-WITHFP-NEXT:    mv a0, a1
 ; RV32I-WITHFP-NEXT:    sw a7, 28(s0)
 ; RV32I-WITHFP-NEXT:    sw a6, 24(s0)
 ; RV32I-WITHFP-NEXT:    sw a5, 20(s0)
 ; RV32I-WITHFP-NEXT:    sw a4, 16(s0)
 ; RV32I-WITHFP-NEXT:    sw a3, 12(s0)
 ; RV32I-WITHFP-NEXT:    sw a2, 8(s0)
-; RV32I-WITHFP-NEXT:    addi a0, s0, 8
-; RV32I-WITHFP-NEXT:    sw a0, -12(s0)
-; RV32I-WITHFP-NEXT:    lw a0, 4(s0)
+; RV32I-WITHFP-NEXT:    addi a1, s0, 8
+; RV32I-WITHFP-NEXT:    sw a1, -12(s0)
+; RV32I-WITHFP-NEXT:    sw a0, 4(s0)
 ; RV32I-WITHFP-NEXT:    lw s0, 8(sp)
 ; RV32I-WITHFP-NEXT:    lw ra, 12(sp)
 ; RV32I-WITHFP-NEXT:    addi sp, sp, 48
@@ -66,16 +66,16 @@ define i32 @va1_va_arg(i8* %fmt, ...) nounwind {
 ; RV32I-FPELIM-LABEL: va1_va_arg:
 ; RV32I-FPELIM:       # %bb.0:
 ; RV32I-FPELIM-NEXT:    addi sp, sp, -48
-; RV32I-FPELIM-NEXT:    sw a1, 20(sp)
+; RV32I-FPELIM-NEXT:    mv a0, a1
 ; RV32I-FPELIM-NEXT:    sw a7, 44(sp)
 ; RV32I-FPELIM-NEXT:    sw a6, 40(sp)
 ; RV32I-FPELIM-NEXT:    sw a5, 36(sp)
 ; RV32I-FPELIM-NEXT:    sw a4, 32(sp)
 ; RV32I-FPELIM-NEXT:    sw a3, 28(sp)
 ; RV32I-FPELIM-NEXT:    sw a2, 24(sp)
-; RV32I-FPELIM-NEXT:    addi a0, sp, 24
-; RV32I-FPELIM-NEXT:    sw a0, 12(sp)
-; RV32I-FPELIM-NEXT:    lw a0, 20(sp)
+; RV32I-FPELIM-NEXT:    addi a1, sp, 24
+; RV32I-FPELIM-NEXT:    sw a1, 12(sp)
+; RV32I-FPELIM-NEXT:    sw a0, 20(sp)
 ; RV32I-FPELIM-NEXT:    addi sp, sp, 48
 ; RV32I-FPELIM-NEXT:    ret
 ;
@@ -85,16 +85,16 @@ define i32 @va1_va_arg(i8* %fmt, ...) nounwind {
 ; RV32I-WITHFP-NEXT:    sw ra, 12(sp)
 ; RV32I-WITHFP-NEXT:    sw s0, 8(sp)
 ; RV32I-WITHFP-NEXT:    addi s0, sp, 16
-; RV32I-WITHFP-NEXT:    sw a1, 4(s0)
+; RV32I-WITHFP-NEXT:    mv a0, a1
 ; RV32I-WITHFP-NEXT:    sw a7, 28(s0)
 ; RV32I-WITHFP-NEXT:    sw a6, 24(s0)
 ; RV32I-WITHFP-NEXT:    sw a5, 20(s0)
 ; RV32I-WITHFP-NEXT:    sw a4, 16(s0)
 ; RV32I-WITHFP-NEXT:    sw a3, 12(s0)
 ; RV32I-WITHFP-NEXT:    sw a2, 8(s0)
-; RV32I-WITHFP-NEXT:    addi a0, s0, 8
-; RV32I-WITHFP-NEXT:    sw a0, -12(s0)
-; RV32I-WITHFP-NEXT:    lw a0, 4(s0)
+; RV32I-WITHFP-NEXT:    addi a1, s0, 8
+; RV32I-WITHFP-NEXT:    sw a1, -12(s0)
+; RV32I-WITHFP-NEXT:    sw a0, 4(s0)
 ; RV32I-WITHFP-NEXT:    lw s0, 8(sp)
 ; RV32I-WITHFP-NEXT:    lw ra, 12(sp)
 ; RV32I-WITHFP-NEXT:    addi sp, sp, 48
@@ -117,7 +117,7 @@ define i32 @va1_va_arg_alloca(i8* %fmt, ...) nounwind {
 ; RV32I-FPELIM-NEXT:    sw s0, 8(sp)
 ; RV32I-FPELIM-NEXT:    sw s1, 4(sp)
 ; RV32I-FPELIM-NEXT:    addi s0, sp, 16
-; RV32I-FPELIM-NEXT:    sw a1, 4(s0)
+; RV32I-FPELIM-NEXT:    mv s1, a1
 ; RV32I-FPELIM-NEXT:    sw a7, 28(s0)
 ; RV32I-FPELIM-NEXT:    sw a6, 24(s0)
 ; RV32I-FPELIM-NEXT:    sw a5, 20(s0)
@@ -126,8 +126,8 @@ define i32 @va1_va_arg_alloca(i8* %fmt, ...) nounwind {
 ; RV32I-FPELIM-NEXT:    sw a2, 8(s0)
 ; RV32I-FPELIM-NEXT:    addi a0, s0, 8
 ; RV32I-FPELIM-NEXT:    sw a0, -16(s0)
-; RV32I-FPELIM-NEXT:    lw s1, 4(s0)
-; RV32I-FPELIM-NEXT:    addi a0, s1, 15
+; RV32I-FPELIM-NEXT:    sw a1, 4(s0)
+; RV32I-FPELIM-NEXT:    addi a0, a1, 15
 ; RV32I-FPELIM-NEXT:    andi a0, a0, -16
 ; RV32I-FPELIM-NEXT:    sub a0, sp, a0
 ; RV32I-FPELIM-NEXT:    mv sp, a0
@@ -147,7 +147,7 @@ define i32 @va1_va_arg_alloca(i8* %fmt, ...) nounwind {
 ; RV32I-WITHFP-NEXT:    sw s0, 8(sp)
 ; RV32I-WITHFP-NEXT:    sw s1, 4(sp)
 ; RV32I-WITHFP-NEXT:    addi s0, sp, 16
-; RV32I-WITHFP-NEXT:    sw a1, 4(s0)
+; RV32I-WITHFP-NEXT:    mv s1, a1
 ; RV32I-WITHFP-NEXT:    sw a7, 28(s0)
 ; RV32I-WITHFP-NEXT:    sw a6, 24(s0)
 ; RV32I-WITHFP-NEXT:    sw a5, 20(s0)
@@ -156,8 +156,8 @@ define i32 @va1_va_arg_alloca(i8* %fmt, ...) nounwind {
 ; RV32I-WITHFP-NEXT:    sw a2, 8(s0)
 ; RV32I-WITHFP-NEXT:    addi a0, s0, 8
 ; RV32I-WITHFP-NEXT:    sw a0, -16(s0)
-; RV32I-WITHFP-NEXT:    lw s1, 4(s0)
-; RV32I-WITHFP-NEXT:    addi a0, s1, 15
+; RV32I-WITHFP-NEXT:    sw a1, 4(s0)
+; RV32I-WITHFP-NEXT:    addi a0, a1, 15
 ; RV32I-WITHFP-NEXT:    andi a0, a0, -16
 ; RV32I-WITHFP-NEXT:    sub a0, sp, a0
 ; RV32I-WITHFP-NEXT:    mv sp, a0
@@ -535,17 +535,17 @@ define i32 @va4_va_copy(i32 %argno, ...) nounwind {
 ; RV32I-FPELIM-NEXT:    addi sp, sp, -48
 ; RV32I-FPELIM-NEXT:    sw ra, 12(sp)
 ; RV32I-FPELIM-NEXT:    sw s1, 8(sp)
-; RV32I-FPELIM-NEXT:    sw a1, 20(sp)
+; RV32I-FPELIM-NEXT:    mv s1, a1
 ; RV32I-FPELIM-NEXT:    sw a7, 44(sp)
 ; RV32I-FPELIM-NEXT:    sw a6, 40(sp)
 ; RV32I-FPELIM-NEXT:    sw a5, 36(sp)
 ; RV32I-FPELIM-NEXT:    sw a4, 32(sp)
 ; RV32I-FPELIM-NEXT:    sw a3, 28(sp)
 ; RV32I-FPELIM-NEXT:    sw a2, 24(sp)
+; RV32I-FPELIM-NEXT:    sw a1, 20(sp)
 ; RV32I-FPELIM-NEXT:    addi a0, sp, 24
 ; RV32I-FPELIM-NEXT:    sw a0, 4(sp)
 ; RV32I-FPELIM-NEXT:    sw a0, 0(sp)
-; RV32I-FPELIM-NEXT:    lw s1, 20(sp)
 ; RV32I-FPELIM-NEXT:    call notdead
 ; RV32I-FPELIM-NEXT:    lw a0, 4(sp)
 ; RV32I-FPELIM-NEXT:    addi a0, a0, 3
@@ -578,17 +578,17 @@ define i32 @va4_va_copy(i32 %argno, ...) nounwind {
 ; RV32I-WITHFP-NEXT:    sw s0, 24(sp)
 ; RV32I-WITHFP-NEXT:    sw s1, 20(sp)
 ; RV32I-WITHFP-NEXT:    addi s0, sp, 32
-; RV32I-WITHFP-NEXT:    sw a1, 4(s0)
+; RV32I-WITHFP-NEXT:    mv s1, a1
 ; RV32I-WITHFP-NEXT:    sw a7, 28(s0)
 ; RV32I-WITHFP-NEXT:    sw a6, 24(s0)
 ; RV32I-WITHFP-NEXT:    sw a5, 20(s0)
 ; RV32I-WITHFP-NEXT:    sw a4, 16(s0)
 ; RV32I-WITHFP-NEXT:    sw a3, 12(s0)
 ; RV32I-WITHFP-NEXT:    sw a2, 8(s0)
+; RV32I-WITHFP-NEXT:    sw a1, 4(s0)
 ; RV32I-WITHFP-NEXT:    addi a0, s0, 8
 ; RV32I-WITHFP-NEXT:    sw a0, -16(s0)
 ; RV32I-WITHFP-NEXT:    sw a0, -20(s0)
-; RV32I-WITHFP-NEXT:    lw s1, 4(s0)
 ; RV32I-WITHFP-NEXT:    call notdead
 ; RV32I-WITHFP-NEXT:    lw a0, -16(s0)
 ; RV32I-WITHFP-NEXT:    addi a0, a0, 3
@@ -777,7 +777,6 @@ define i32 @va6_no_fixed_args(...) nounwind {
 ; RV32I-FPELIM-LABEL: va6_no_fixed_args:
 ; RV32I-FPELIM:       # %bb.0:
 ; RV32I-FPELIM-NEXT:    addi sp, sp, -48
-; RV32I-FPELIM-NEXT:    sw a0, 16(sp)
 ; RV32I-FPELIM-NEXT:    sw a7, 44(sp)
 ; RV32I-FPELIM-NEXT:    sw a6, 40(sp)
 ; RV32I-FPELIM-NEXT:    sw a5, 36(sp)
@@ -785,9 +784,9 @@ define i32 @va6_no_fixed_args(...) nounwind {
 ; RV32I-FPELIM-NEXT:    sw a3, 28(sp)
 ; RV32I-FPELIM-NEXT:    sw a2, 24(sp)
 ; RV32I-FPELIM-NEXT:    sw a1, 20(sp)
-; RV32I-FPELIM-NEXT:    addi a0, sp, 20
-; RV32I-FPELIM-NEXT:    sw a0, 12(sp)
-; RV32I-FPELIM-NEXT:    lw a0, 16(sp)
+; RV32I-FPELIM-NEXT:    addi a1, sp, 20
+; RV32I-FPELIM-NEXT:    sw a1, 12(sp)
+; RV32I-FPELIM-NEXT:    sw a0, 16(sp)
 ; RV32I-FPELIM-NEXT:    addi sp, sp, 48
 ; RV32I-FPELIM-NEXT:    ret
 ;
@@ -797,7 +796,6 @@ define i32 @va6_no_fixed_args(...) nounwind {
 ; RV32I-WITHFP-NEXT:    sw ra, 12(sp)
 ; RV32I-WITHFP-NEXT:    sw s0, 8(sp)
 ; RV32I-WITHFP-NEXT:    addi s0, sp, 16
-; RV32I-WITHFP-NEXT:    sw a0, 0(s0)
 ; RV32I-WITHFP-NEXT:    sw a7, 28(s0)
 ; RV32I-WITHFP-NEXT:    sw a6, 24(s0)
 ; RV32I-WITHFP-NEXT:    sw a5, 20(s0)
@@ -805,9 +803,9 @@ define i32 @va6_no_fixed_args(...) nounwind {
 ; RV32I-WITHFP-NEXT:    sw a3, 12(s0)
 ; RV32I-WITHFP-NEXT:    sw a2, 8(s0)
 ; RV32I-WITHFP-NEXT:    sw a1, 4(s0)
-; RV32I-WITHFP-NEXT:    addi a0, s0, 4
-; RV32I-WITHFP-NEXT:    sw a0, -12(s0)
-; RV32I-WITHFP-NEXT:    lw a0, 0(s0)
+; RV32I-WITHFP-NEXT:    addi a1, s0, 4
+; RV32I-WITHFP-NEXT:    sw a1, -12(s0)
+; RV32I-WITHFP-NEXT:    sw a0, 0(s0)
 ; RV32I-WITHFP-NEXT:    lw s0, 8(sp)
 ; RV32I-WITHFP-NEXT:    lw ra, 12(sp)
 ; RV32I-WITHFP-NEXT:    addi sp, sp, 48
-- 
GitLab


From b4f227a2a58661618237bc78de4b4c7deb3b48c5 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Thu, 11 Oct 2018 11:28:27 +0000
Subject: [PATCH 0053/1116] [InstCombine] Demand bits of UMin

This is the umin alternative to the umax code from rL344237. We use
DeMorgans law on the umax case to bring us to the same thing on umin,
but using countLeadingOnes, not countLeadingZeros.

Differential Revision: https://reviews.llvm.org/D53036


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344239 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstCombine/InstCombineSimplifyDemanded.cpp      | 10 ++++++++++
 test/Transforms/InstCombine/minmax-demandbits.ll     | 12 +++---------
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 18a2b2fdbfe..45cacc73d63 100644
--- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -325,6 +325,16 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
       unsigned CTZ = DemandedMask.countTrailingZeros();
       if (match(RHS, m_APInt(C)) && CTZ >= C->getActiveBits())
         return LHS;
+    } else if (SPF == SPF_UMIN) {
+      // UMin(A, C) == A if ...
+      // The lowest non-zero bit of DemandMask is higher than the highest
+      // non-one bit of C.
+      // This comes from using DeMorgans on the above umax example.
+      const APInt *C;
+      unsigned CTZ = DemandedMask.countTrailingZeros();
+      if (match(RHS, m_APInt(C)) &&
+          CTZ >= C->getBitWidth() - C->countLeadingOnes())
+        return LHS;
     }
 
     // If this is a select as part of any other min/max pattern, don't simplify
diff --git a/test/Transforms/InstCombine/minmax-demandbits.ll b/test/Transforms/InstCombine/minmax-demandbits.ll
index f838560f965..29a569663d2 100644
--- a/test/Transforms/InstCombine/minmax-demandbits.ll
+++ b/test/Transforms/InstCombine/minmax-demandbits.ll
@@ -190,9 +190,7 @@ define i32 @or_umin(i32 %A) {
 
 define i8 @or_min_31_30(i8 %A) {
 ; CHECK-LABEL: @or_min_31_30(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i8 [[A:%.*]], -30
-; CHECK-NEXT:    [[MIN:%.*]] = select i1 [[CMP]], i8 [[A]], i8 -30
-; CHECK-NEXT:    [[R:%.*]] = or i8 [[MIN]], 31
+; CHECK-NEXT:    [[R:%.*]] = or i8 [[A:%.*]], 31
 ; CHECK-NEXT:    ret i8 [[R]]
 ;
   %cmp = icmp ult i8 %A, -30
@@ -203,9 +201,7 @@ define i8 @or_min_31_30(i8 %A) {
 
 define i8 @and_min_7_7(i8 %A) {
 ; CHECK-LABEL: @and_min_7_7(
-; CHECK-NEXT:    [[L2:%.*]] = icmp ult i8 [[A:%.*]], -7
-; CHECK-NEXT:    [[MIN:%.*]] = select i1 [[L2]], i8 [[A]], i8 -7
-; CHECK-NEXT:    [[R:%.*]] = and i8 [[MIN]], -8
+; CHECK-NEXT:    [[R:%.*]] = and i8 [[A:%.*]], -8
 ; CHECK-NEXT:    ret i8 [[R]]
 ;
   %l2 = icmp ult i8 %A, -7
@@ -216,9 +212,7 @@ define i8 @and_min_7_7(i8 %A) {
 
 define i8 @and_min_7_8(i8 %A) {
 ; CHECK-LABEL: @and_min_7_8(
-; CHECK-NEXT:    [[L2:%.*]] = icmp ult i8 [[A:%.*]], -8
-; CHECK-NEXT:    [[MIN:%.*]] = select i1 [[L2]], i8 [[A]], i8 -8
-; CHECK-NEXT:    [[R:%.*]] = and i8 [[MIN]], -8
+; CHECK-NEXT:    [[R:%.*]] = and i8 [[A:%.*]], -8
 ; CHECK-NEXT:    ret i8 [[R]]
 ;
   %l2 = icmp ult i8 %A, -8
-- 
GitLab


From 14c745c241fbf61c6794fc7249e1384159df397e Mon Sep 17 00:00:00 2001
From: Dylan McKay <me@dylanmckay.io>
Date: Thu, 11 Oct 2018 12:49:50 +0000
Subject: [PATCH 0054/1116] Generalize an IR verifier check to work with
 non-zero program address spaces

This commit modifies an existing IR verifier check that
assumes all functions will be located in the default address
space 0.

Rather than using the default paramater value getPointerTo(AddrSpace=0),
explicitly specify the program memory address space from the data layout.

This only affects targets that specify a nonzero address space
in their data layouts. The only in-tree target that does this
is AVR.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344243 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/IR/Verifier.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp
index 4b954c710e3..8304ec6e8f4 100644
--- a/lib/IR/Verifier.cpp
+++ b/lib/IR/Verifier.cpp
@@ -632,7 +632,8 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) {
     if (ArrayType *ATy = dyn_cast<ArrayType>(GV.getValueType())) {
       StructType *STy = dyn_cast<StructType>(ATy->getElementType());
       PointerType *FuncPtrTy =
-          FunctionType::get(Type::getVoidTy(Context), false)->getPointerTo();
+          FunctionType::get(Type::getVoidTy(Context), false)->
+          getPointerTo(DL.getProgramAddressSpace());
       // FIXME: Reject the 2-field form in LLVM 4.0.
       Assert(STy &&
                  (STy->getNumElements() == 2 || STy->getNumElements() == 3) &&
-- 
GitLab


From 1f4ef788a3bc636c5726f2f8ab95f11bdc0a9d58 Mon Sep 17 00:00:00 2001
From: "Diogo N. Sampaio" <diogo.sampaio@arm.com>
Date: Thu, 11 Oct 2018 14:10:32 +0000
Subject: [PATCH 0055/1116] [AARCH64][FIX] Emit data symbol for constant pool
 data

The ARM64 elf emitter would omit printing data
symbol for zero filled constant data. This patch
overrides the emitFill method as to enforce that
the symbol is correctly printed.

Differential revision: https://reviews.llvm.org/D53132


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344248 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../AArch64/MCTargetDesc/AArch64ELFStreamer.cpp   |  5 +++++
 test/MC/AArch64/CheckDataSymbol.s                 | 15 +++++++++++++++
 2 files changed, 20 insertions(+)
 create mode 100644 test/MC/AArch64/CheckDataSymbol.s

diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index c0ef8b67028..a09ac6b94c1 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -154,6 +154,11 @@ public:
     MCELFStreamer::EmitValueImpl(Value, Size, Loc);
   }
 
+  void emitFill(const MCExpr &NumBytes, uint64_t FillValue,
+                                  SMLoc Loc) override {
+    EmitDataMappingSymbol();
+    MCObjectStreamer::emitFill(NumBytes, FillValue, Loc);
+  }
 private:
   enum ElfMappingSymbol {
     EMS_None,
diff --git a/test/MC/AArch64/CheckDataSymbol.s b/test/MC/AArch64/CheckDataSymbol.s
new file mode 100644
index 00000000000..ea3ed7b2873
--- /dev/null
+++ b/test/MC/AArch64/CheckDataSymbol.s
@@ -0,0 +1,15 @@
+# RUN: llvm-mc -filetype=obj -assemble \
+# RUN: -triple=aarch64- %s -o - \
+# RUN: | llvm-readobj -s -t - | FileCheck %s
+# CHECK:     Name: $d.1 ({{[1-9][0-9]+}})
+# CHECK-NEXT:     Value: 0x4
+# CHECK-NEXT:     Size: 0
+# CHECK-NEXT:     Binding: Local (0x0)
+# CHECK-NEXT:     Type: None (0x0)
+# CHECK-NEXT:     Other: 0
+# CHECK-NEXT:     Section: .text (0x2)
+# CHECK-NEXT:   }
+
+.text
+nop
+.zero 4
-- 
GitLab


From 4b25d67d1bc4a69db42057bf363dd8d4a9315d9f Mon Sep 17 00:00:00 2001
From: Amara Emerson <aemerson@apple.com>
Date: Thu, 11 Oct 2018 14:51:11 +0000
Subject: [PATCH 0056/1116] [InstCombine] Fix SimplifyLibCalls erasing an
 instruction while IC still had references to it.

InstCombine keeps a worklist and assumes that optimizations don't
eraseFromParent() the instruction, which SimplifyLibCalls violates. This change
adds a new callback to SimplifyLibCalls to let clients specify their own hander
for erasing actions.

Differential Revision: https://reviews.llvm.org/D52729

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344251 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../llvm/Transforms/Utils/SimplifyLibCalls.h  | 23 +++++++++++++----
 .../InstCombine/InstCombineCalls.cpp          |  6 ++++-
 lib/Transforms/Utils/SimplifyLibCalls.cpp     | 24 ++++++++++--------
 .../InstCombine/simplify-libcalls-erased.ll   | 25 +++++++++++++++++++
 4 files changed, 62 insertions(+), 16 deletions(-)
 create mode 100644 test/Transforms/InstCombine/simplify-libcalls-erased.ll

diff --git a/include/llvm/Transforms/Utils/SimplifyLibCalls.h b/include/llvm/Transforms/Utils/SimplifyLibCalls.h
index 2b344f44107..025bcd44e31 100644
--- a/include/llvm/Transforms/Utils/SimplifyLibCalls.h
+++ b/include/llvm/Transforms/Utils/SimplifyLibCalls.h
@@ -77,21 +77,34 @@ private:
   OptimizationRemarkEmitter &ORE;
   bool UnsafeFPShrink;
   function_ref<void(Instruction *, Value *)> Replacer;
+  function_ref<void(Instruction *)> Eraser;
 
   /// Internal wrapper for RAUW that is the default implementation.
   ///
   /// Other users may provide an alternate function with this signature instead
   /// of this one.
-  static void replaceAllUsesWithDefault(Instruction *I, Value *With);
+  static void replaceAllUsesWithDefault(Instruction *I, Value *With) {
+    I->replaceAllUsesWith(With);
+  }
+
+  /// Internal wrapper for eraseFromParent that is the default implementation.
+  static void eraseFromParentDefault(Instruction *I) { I->eraseFromParent(); }
 
   /// Replace an instruction's uses with a value using our replacer.
   void replaceAllUsesWith(Instruction *I, Value *With);
 
+  /// Erase an instruction from its parent with our eraser.
+  void eraseFromParent(Instruction *I);
+
+  Value *foldMallocMemset(CallInst *Memset, IRBuilder<> &B);
+
 public:
-  LibCallSimplifier(const DataLayout &DL, const TargetLibraryInfo *TLI,
-                    OptimizationRemarkEmitter &ORE,
-                    function_ref<void(Instruction *, Value *)> Replacer =
-                        &replaceAllUsesWithDefault);
+  LibCallSimplifier(
+      const DataLayout &DL, const TargetLibraryInfo *TLI,
+      OptimizationRemarkEmitter &ORE,
+      function_ref<void(Instruction *, Value *)> Replacer =
+          &replaceAllUsesWithDefault,
+      function_ref<void(Instruction *)> Eraser = &eraseFromParentDefault);
 
   /// optimizeCall - Take the given call instruction and return a more
   /// optimal value to replace the instruction with or 0 if a more
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 4e404933a22..714c6176884 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3960,7 +3960,11 @@ Instruction *InstCombiner::tryOptimizeCall(CallInst *CI) {
   auto InstCombineRAUW = [this](Instruction *From, Value *With) {
     replaceInstUsesWith(*From, With);
   };
-  LibCallSimplifier Simplifier(DL, &TLI, ORE, InstCombineRAUW);
+  auto InstCombineErase = [this](Instruction *I) {
+    eraseInstFromFunction(*I);
+  };
+  LibCallSimplifier Simplifier(DL, &TLI, ORE, InstCombineRAUW,
+                               InstCombineErase);
   if (Value *With = Simplifier.optimizeCall(CI)) {
     ++NumSimplified;
     return CI->use_empty() ? CI : replaceInstUsesWith(*CI, With);
diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 3789181a898..41a495a0484 100644
--- a/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -923,8 +923,7 @@ Value *LibCallSimplifier::optimizeMemMove(CallInst *CI, IRBuilder<> &B) {
 }
 
 /// Fold memset[_chk](malloc(n), 0, n) --> calloc(1, n).
-static Value *foldMallocMemset(CallInst *Memset, IRBuilder<> &B,
-                               const TargetLibraryInfo &TLI) {
+Value *LibCallSimplifier::foldMallocMemset(CallInst *Memset, IRBuilder<> &B) {
   // This has to be a memset of zeros (bzero).
   auto *FillValue = dyn_cast<ConstantInt>(Memset->getArgOperand(1));
   if (!FillValue || FillValue->getZExtValue() != 0)
@@ -944,7 +943,7 @@ static Value *foldMallocMemset(CallInst *Memset, IRBuilder<> &B,
     return nullptr;
 
   LibFunc Func;
-  if (!TLI.getLibFunc(*InnerCallee, Func) || !TLI.has(Func) ||
+  if (!TLI->getLibFunc(*InnerCallee, Func) || !TLI->has(Func) ||
       Func != LibFunc_malloc)
     return nullptr;
 
@@ -959,18 +958,18 @@ static Value *foldMallocMemset(CallInst *Memset, IRBuilder<> &B,
   IntegerType *SizeType = DL.getIntPtrType(B.GetInsertBlock()->getContext());
   Value *Calloc = emitCalloc(ConstantInt::get(SizeType, 1),
                              Malloc->getArgOperand(0), Malloc->getAttributes(),
-                             B, TLI);
+                             B, *TLI);
   if (!Calloc)
     return nullptr;
 
   Malloc->replaceAllUsesWith(Calloc);
-  Malloc->eraseFromParent();
+  eraseFromParent(Malloc);
 
   return Calloc;
 }
 
 Value *LibCallSimplifier::optimizeMemSet(CallInst *CI, IRBuilder<> &B) {
-  if (auto *Calloc = foldMallocMemset(CI, B, *TLI))
+  if (auto *Calloc = foldMallocMemset(CI, B))
     return Calloc;
 
   // memset(p, v, n) -> llvm.memset(align 1 p, v, n)
@@ -1246,7 +1245,7 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilder<> &B) {
       // effects (e.g., errno).  When the only consumer for the original
       // exp{,2}() is pow(), then it has to be explicitly erased.
       BaseFn->replaceAllUsesWith(ExpFn);
-      BaseFn->eraseFromParent();
+      eraseFromParent(BaseFn);
 
       return ExpFn;
     }
@@ -2591,7 +2590,7 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
       if (Value *V = optimizeStringMemoryLibCall(SimplifiedCI, TmpBuilder)) {
         // If we were able to further simplify, remove the now redundant call.
         SimplifiedCI->replaceAllUsesWith(V);
-        SimplifiedCI->eraseFromParent();
+        eraseFromParent(SimplifiedCI);
         return V;
       }
     }
@@ -2670,15 +2669,20 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
 LibCallSimplifier::LibCallSimplifier(
     const DataLayout &DL, const TargetLibraryInfo *TLI,
     OptimizationRemarkEmitter &ORE,
-    function_ref<void(Instruction *, Value *)> Replacer)
+    function_ref<void(Instruction *, Value *)> Replacer,
+    function_ref<void(Instruction *)> Eraser)
     : FortifiedSimplifier(TLI), DL(DL), TLI(TLI), ORE(ORE),
-      UnsafeFPShrink(false), Replacer(Replacer) {}
+      UnsafeFPShrink(false), Replacer(Replacer), Eraser(Eraser) {}
 
 void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) {
   // Indirect through the replacer used in this instance.
   Replacer(I, With);
 }
 
+void LibCallSimplifier::eraseFromParent(Instruction *I) {
+  Eraser(I);
+}
+
 // TODO:
 //   Additional cases that we need to add to this file:
 //
diff --git a/test/Transforms/InstCombine/simplify-libcalls-erased.ll b/test/Transforms/InstCombine/simplify-libcalls-erased.ll
new file mode 100644
index 00000000000..19cfcf8eba9
--- /dev/null
+++ b/test/Transforms/InstCombine/simplify-libcalls-erased.ll
@@ -0,0 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S < %s -instcombine | FileCheck %s
+
+target triple = "x86_64"
+
+define double @pow_exp(double %x, double %y) {
+; CHECK-LABEL: @pow_exp(
+; CHECK-NEXT:    [[MUL:%.*]] = fmul fast double [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[EXP:%.*]] = call fast double @llvm.exp.f64(double [[MUL]])
+; CHECK-NEXT:    ret double [[EXP]]
+;
+  %A = alloca i1
+  %call = call fast double @exp(double %x) #1
+  %pow = call fast double @llvm.pow.f64(double %call, double %y)
+  %C1 = fcmp ule double %call, %pow
+  store i1 %C1, i1* %A
+  ret double %pow
+}
+
+declare double @exp(double)
+
+declare double @llvm.pow.f64(double, double) #0
+
+attributes #0 = { nounwind readnone speculatable }
+attributes #1 = { nounwind readnone }
-- 
GitLab


From 09ab8e9f3ad978820f216f13888e93366478ce07 Mon Sep 17 00:00:00 2001
From: Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>
Date: Thu, 11 Oct 2018 14:54:54 +0000
Subject: [PATCH 0057/1116] [llvm-mca][BtVer2] Add tests for optimizable GPR
 register moves. NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344253 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../X86/BtVer2/reg-move-elimination-4.s       | 108 ++++++++++++++++++
 .../X86/BtVer2/reg-move-elimination-5.s       | 108 ++++++++++++++++++
 2 files changed, 216 insertions(+)
 create mode 100644 test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-4.s
 create mode 100644 test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-5.s

diff --git a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-4.s b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-4.s
new file mode 100644
index 00000000000..72ca7693c5f
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-4.s
@@ -0,0 +1,108 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=3 -timeline -register-file-stats < %s | FileCheck %s
+
+xor %eax, %eax
+mov %eax, %ebx
+mov %ebx, %ecx
+mov %ecx, %edx
+mov %edx, %eax
+
+# CHECK:      Iterations:        3
+# CHECK-NEXT: Instructions:      15
+# CHECK-NEXT: Total Cycles:      12
+# CHECK-NEXT: Total uOps:        15
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    1.25
+# CHECK-NEXT: IPC:               1.25
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      0     0.50                        xorl	%eax, %eax
+# CHECK-NEXT:  1      1     0.50                        movl	%eax, %ebx
+# CHECK-NEXT:  1      1     0.50                        movl	%ebx, %ecx
+# CHECK-NEXT:  1      1     0.50                        movl	%ecx, %edx
+# CHECK-NEXT:  1      1     0.50                        movl	%edx, %eax
+
+# CHECK:      Register File statistics:
+# CHECK-NEXT: Total number of mappings created:    12
+# CHECK-NEXT: Max number of mappings used:         7
+
+# CHECK:      *  Register File #1 -- JFpuPRF:
+# CHECK-NEXT:    Number of physical registers:     72
+# CHECK-NEXT:    Total number of mappings created: 0
+# CHECK-NEXT:    Max number of mappings used:      0
+
+# CHECK:      *  Register File #2 -- JIntegerPRF:
+# CHECK-NEXT:    Number of physical registers:     64
+# CHECK-NEXT:    Total number of mappings created: 12
+# CHECK-NEXT:    Max number of mappings used:      7
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - JALU0
+# CHECK-NEXT: [1]   - JALU1
+# CHECK-NEXT: [2]   - JDiv
+# CHECK-NEXT: [3]   - JFPA
+# CHECK-NEXT: [4]   - JFPM
+# CHECK-NEXT: [5]   - JFPU0
+# CHECK-NEXT: [6]   - JFPU1
+# CHECK-NEXT: [7]   - JLAGU
+# CHECK-NEXT: [8]   - JMul
+# CHECK-NEXT: [9]   - JSAGU
+# CHECK-NEXT: [10]  - JSTC
+# CHECK-NEXT: [11]  - JVALU0
+# CHECK-NEXT: [12]  - JVALU1
+# CHECK-NEXT: [13]  - JVIMUL
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]
+# CHECK-NEXT: 2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorl	%eax, %eax
+# CHECK-NEXT: 0.33   0.67    -      -      -      -      -      -      -      -      -      -      -      -     movl	%eax, %ebx
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     movl	%ebx, %ecx
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     movl	%ecx, %edx
+# CHECK-NEXT: 0.67   0.33    -      -      -      -      -      -      -      -      -      -      -      -     movl	%edx, %eax
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     01
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DR   .    ..   xorl	%eax, %eax
+# CHECK-NEXT: [0,1]     DeER .    ..   movl	%eax, %ebx
+# CHECK-NEXT: [0,2]     .DeER.    ..   movl	%ebx, %ecx
+# CHECK-NEXT: [0,3]     .D=eER    ..   movl	%ecx, %edx
+# CHECK-NEXT: [0,4]     . D=eER   ..   movl	%edx, %eax
+# CHECK-NEXT: [1,0]     . D---R   ..   xorl	%eax, %eax
+# CHECK-NEXT: [1,1]     .  DeE-R  ..   movl	%eax, %ebx
+# CHECK-NEXT: [1,2]     .  D=eER  ..   movl	%ebx, %ecx
+# CHECK-NEXT: [1,3]     .   D=eER ..   movl	%ecx, %edx
+# CHECK-NEXT: [1,4]     .   D==eER..   movl	%edx, %eax
+# CHECK-NEXT: [2,0]     .    D---R..   xorl	%eax, %eax
+# CHECK-NEXT: [2,1]     .    DeE--R.   movl	%eax, %ebx
+# CHECK-NEXT: [2,2]     .    .DeE-R.   movl	%ebx, %ecx
+# CHECK-NEXT: [2,3]     .    .D=eE-R   movl	%ecx, %edx
+# CHECK-NEXT: [2,4]     .    . D=eER   movl	%edx, %eax
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     0.0    0.0    2.0       xorl	%eax, %eax
+# CHECK-NEXT: 1.     3     1.0    1.0    1.0       movl	%eax, %ebx
+# CHECK-NEXT: 2.     3     1.3    0.0    0.3       movl	%ebx, %ecx
+# CHECK-NEXT: 3.     3     2.0    0.0    0.3       movl	%ecx, %edx
+# CHECK-NEXT: 4.     3     2.3    0.0    0.0       movl	%edx, %eax
diff --git a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-5.s b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-5.s
new file mode 100644
index 00000000000..7d6b75f7c3f
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-5.s
@@ -0,0 +1,108 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=3 -timeline -register-file-stats < %s | FileCheck %s
+
+xor %rax, %rax
+mov %rax, %rbx
+mov %rbx, %rcx
+mov %rcx, %rdx
+mov %rdx, %rax
+
+# CHECK:      Iterations:        3
+# CHECK-NEXT: Instructions:      15
+# CHECK-NEXT: Total Cycles:      12
+# CHECK-NEXT: Total uOps:        15
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    1.25
+# CHECK-NEXT: IPC:               1.25
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      0     0.50                        xorq	%rax, %rax
+# CHECK-NEXT:  1      1     0.50                        movq	%rax, %rbx
+# CHECK-NEXT:  1      1     0.50                        movq	%rbx, %rcx
+# CHECK-NEXT:  1      1     0.50                        movq	%rcx, %rdx
+# CHECK-NEXT:  1      1     0.50                        movq	%rdx, %rax
+
+# CHECK:      Register File statistics:
+# CHECK-NEXT: Total number of mappings created:    12
+# CHECK-NEXT: Max number of mappings used:         7
+
+# CHECK:      *  Register File #1 -- JFpuPRF:
+# CHECK-NEXT:    Number of physical registers:     72
+# CHECK-NEXT:    Total number of mappings created: 0
+# CHECK-NEXT:    Max number of mappings used:      0
+
+# CHECK:      *  Register File #2 -- JIntegerPRF:
+# CHECK-NEXT:    Number of physical registers:     64
+# CHECK-NEXT:    Total number of mappings created: 12
+# CHECK-NEXT:    Max number of mappings used:      7
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - JALU0
+# CHECK-NEXT: [1]   - JALU1
+# CHECK-NEXT: [2]   - JDiv
+# CHECK-NEXT: [3]   - JFPA
+# CHECK-NEXT: [4]   - JFPM
+# CHECK-NEXT: [5]   - JFPU0
+# CHECK-NEXT: [6]   - JFPU1
+# CHECK-NEXT: [7]   - JLAGU
+# CHECK-NEXT: [8]   - JMul
+# CHECK-NEXT: [9]   - JSAGU
+# CHECK-NEXT: [10]  - JSTC
+# CHECK-NEXT: [11]  - JVALU0
+# CHECK-NEXT: [12]  - JVALU1
+# CHECK-NEXT: [13]  - JVIMUL
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]
+# CHECK-NEXT: 2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorq	%rax, %rax
+# CHECK-NEXT: 0.33   0.67    -      -      -      -      -      -      -      -      -      -      -      -     movq	%rax, %rbx
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     movq	%rbx, %rcx
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     movq	%rcx, %rdx
+# CHECK-NEXT: 0.67   0.33    -      -      -      -      -      -      -      -      -      -      -      -     movq	%rdx, %rax
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     01
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DR   .    ..   xorq	%rax, %rax
+# CHECK-NEXT: [0,1]     DeER .    ..   movq	%rax, %rbx
+# CHECK-NEXT: [0,2]     .DeER.    ..   movq	%rbx, %rcx
+# CHECK-NEXT: [0,3]     .D=eER    ..   movq	%rcx, %rdx
+# CHECK-NEXT: [0,4]     . D=eER   ..   movq	%rdx, %rax
+# CHECK-NEXT: [1,0]     . D---R   ..   xorq	%rax, %rax
+# CHECK-NEXT: [1,1]     .  DeE-R  ..   movq	%rax, %rbx
+# CHECK-NEXT: [1,2]     .  D=eER  ..   movq	%rbx, %rcx
+# CHECK-NEXT: [1,3]     .   D=eER ..   movq	%rcx, %rdx
+# CHECK-NEXT: [1,4]     .   D==eER..   movq	%rdx, %rax
+# CHECK-NEXT: [2,0]     .    D---R..   xorq	%rax, %rax
+# CHECK-NEXT: [2,1]     .    DeE--R.   movq	%rax, %rbx
+# CHECK-NEXT: [2,2]     .    .DeE-R.   movq	%rbx, %rcx
+# CHECK-NEXT: [2,3]     .    .D=eE-R   movq	%rcx, %rdx
+# CHECK-NEXT: [2,4]     .    . D=eER   movq	%rdx, %rax
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     0.0    0.0    2.0       xorq	%rax, %rax
+# CHECK-NEXT: 1.     3     1.0    1.0    1.0       movq	%rax, %rbx
+# CHECK-NEXT: 2.     3     1.3    0.0    0.3       movq	%rbx, %rcx
+# CHECK-NEXT: 3.     3     2.0    0.0    0.3       movq	%rcx, %rdx
+# CHECK-NEXT: 4.     3     2.3    0.0    0.0       movq	%rdx, %rax
-- 
GitLab


From 7b42e952770909c390a606159577782927263286 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Thu, 11 Oct 2018 16:07:25 +0000
Subject: [PATCH 0058/1116] [DAGCombiner] move comment closer to the
 corresponding code; NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344255 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index eca5d8369eb..4a80c1d358d 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15451,14 +15451,13 @@ SDValue DAGCombiner::ReplaceExtractVectorEltOfLoadWithNarrowedLoad(
 }
 
 SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
-  // (vextract (scalar_to_vector val, 0) -> val
   SDValue InVec = N->getOperand(0);
   EVT VT = InVec.getValueType();
   EVT NVT = N->getValueType(0);
-
   if (InVec.isUndef())
     return DAG.getUNDEF(NVT);
 
+  // (vextract (scalar_to_vector val, 0) -> val
   if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {
     // Check if the result type doesn't match the inserted element type. A
     // SCALAR_TO_VECTOR may truncate the inserted element and the
-- 
GitLab


From ea0a193dbf8790b8e3a3a5e24a1da61e6d154ef9 Mon Sep 17 00:00:00 2001
From: Jordan Rupprecht <rupprecht@google.com>
Date: Thu, 11 Oct 2018 17:55:11 +0000
Subject: [PATCH 0059/1116] [llvm-nm] Fix crash when running with --print-armap
 on corrupt archives.

error() in llvm-nm intentionally does not return so that the callee can move on to future files/slices. When printing the archive map, this is not currently handled (the caller assumes that error() returns), so processing continues despite there being an error.

Also, change one return to a break, so that symbols can be printed even if the archive map is corrupt.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344268 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-nm/llvm-nm.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tools/llvm-nm/llvm-nm.cpp b/tools/llvm-nm/llvm-nm.cpp
index 22fdd4ca85e..7e257d8ce89 100644
--- a/tools/llvm-nm/llvm-nm.cpp
+++ b/tools/llvm-nm/llvm-nm.cpp
@@ -1755,12 +1755,14 @@ static void dumpSymbolNamesFromFile(std::string &Filename) {
         outs() << "Archive map\n";
         for (; I != E; ++I) {
           Expected<Archive::Child> C = I->getMember();
-          if (!C)
+          if (!C) {
             error(C.takeError(), Filename);
+            break;
+          }
           Expected<StringRef> FileNameOrErr = C->getName();
           if (!FileNameOrErr) {
             error(FileNameOrErr.takeError(), Filename);
-            return;
+            break;
           }
           StringRef SymName = I->getName();
           outs() << SymName << " in " << FileNameOrErr.get() << "\n";
-- 
GitLab


From 488a8b20ffdf168bbfbc987aef57baa2de903630 Mon Sep 17 00:00:00 2001
From: Zachary Turner <zturner@google.com>
Date: Thu, 11 Oct 2018 18:01:55 +0000
Subject: [PATCH 0060/1116] Better support for POSIX paths in PDBs.

While it doesn't make a *ton* of sense for POSIX paths to be
in PDBs, it's possible to occur in real scenarios involving
cross compilation.

The tools need to be able to handle this, because certain types
of debugging scenarios are possible without a running process
and so don't necessarily require you to be on a Windows system.
These include post-mortem debugging and binary forensics (e.g.
using a debugger to disassemble functions and examine symbols
without running the process).

There's changes in clang, LLD, and lldb in this patch.  After
this the cross-platform disassembly and source-list tests pass
on Linux.

Furthermore, the behavior of LLD can now be summarized by a much
simpler rule than before: Unless you specify /pdbsourcepath and
/pdbaltpath, the PDB ends up with paths that are valid within
the context of the machine that the link is performed on.

Differential Revision: https://reviews.llvm.org/D53149

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344269 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/AsmPrinter/CodeViewDebug.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
index 4d45a103c5a..8232f076a93 100644
--- a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
@@ -73,6 +73,7 @@
 #include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/Path.h"
 #include "llvm/Support/SMLoc.h"
 #include "llvm/Support/ScopedPrinter.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
@@ -134,7 +135,9 @@ StringRef CodeViewDebug::getFullFilepath(const DIFile *File) {
 
   // If this is a Unix-style path, just use it as is. Don't try to canonicalize
   // it textually because one of the path components could be a symlink.
-  if (!Dir.empty() && Dir[0] == '/') {
+  if (Dir.startswith("/") || Filename.startswith("/")) {
+    if (llvm::sys::path::is_absolute(Filename, llvm::sys::path::Style::posix))
+      return Filename;
     Filepath = Dir;
     if (Dir.back() != '/')
       Filepath += '/';
-- 
GitLab


From a6f9ade27a7adb8ea7b8585f79dd3bd237e5501d Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Thu, 11 Oct 2018 18:06:07 +0000
Subject: [PATCH 0061/1116] [X86] Restore X86ISelDAGToDAG::matchBEXTRFromAnd.
 Teach address matching to create a BEXTR pattern from a (shl (and X, mask >>
 C1) if C1 can be folded into addressing mode.

This is an alternative to D53080 since I think using a BEXTR for a shifted mask is definitely an improvement when the shl can be absorbed into addressing mode. The other cases I'm less sure about.

We already have several tricks for handling an and of a shift in address matching. This adds a new case for BEXTR.

I've moved the BEXTR matching code back to X86ISelDAGToDAG to allow it to match. I suppose alternatively we could directly emit a X86ISD::BEXTR node that isel could pattern match. But I'm trying to view BEXTR matching as an isel concern so DAG combine can see 'and' and 'shift' operations that are well understood. We did lose a couple cases from tbm_patterns.ll, but I think there are ways to recover that.

I've also put back the manual load folding code in matchBEXTRFromAnd that I removed a few months ago in r324939. This gives us some more freedom to make decisions based on the ability to fold a load. I haven't done anything with that yet.

Differential Revision: https://reviews.llvm.org/D53126

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344270 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelDAGToDAG.cpp | 152 +++++++++++++++++++++++++++++
 lib/Target/X86/X86ISelLowering.cpp |  66 -------------
 lib/Target/X86/X86InstrCompiler.td |  14 ---
 test/CodeGen/X86/extract-bits.ll   |  78 ++++++++++++---
 test/CodeGen/X86/tbm_patterns.ll   |   6 +-
 5 files changed, 218 insertions(+), 98 deletions(-)

diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index c043c7c54cc..f8ec4a2bcfc 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -451,6 +451,7 @@ namespace {
     }
 
     bool foldLoadStoreIntoMemOperand(SDNode *Node);
+    bool matchBEXTRFromAndImm(SDNode *Node);
     bool matchBEXTR(SDNode *Node);
     bool shrinkAndImmediate(SDNode *N);
     bool isMaskZeroExtended(SDNode *N) const;
@@ -1340,6 +1341,64 @@ static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
   return false;
 }
 
+// Transform "(X >> SHIFT) & (MASK << C1)" to
+// "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
+// matched to a BEXTR later. Returns false if the simplification is performed.
+static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
+                                   uint64_t Mask,
+                                   SDValue Shift, SDValue X,
+                                   X86ISelAddressMode &AM,
+                                   const X86Subtarget &Subtarget) {
+  if (Shift.getOpcode() != ISD::SRL ||
+      !isa<ConstantSDNode>(Shift.getOperand(1)) ||
+      !Shift.hasOneUse() || !N.hasOneUse())
+    return true;
+
+  // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
+  if (!Subtarget.hasTBM() &&
+      !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
+    return true;
+
+  // We need to ensure that mask is a continuous run of bits.
+  if (!isShiftedMask_64(Mask)) return true;
+
+  unsigned ShiftAmt = Shift.getConstantOperandVal(1);
+
+  // The amount of shift we're trying to fit into the addressing mode is taken
+  // from the trailing zeros of the mask.
+  unsigned AMShiftAmt = countTrailingZeros(Mask);
+
+  // There is nothing we can do here unless the mask is removing some bits.
+  // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
+  if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true;
+
+  MVT VT = N.getSimpleValueType();
+  SDLoc DL(N);
+  SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
+  SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
+  SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, VT);
+  SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, NewSRL, NewMask);
+  SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
+  SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewAnd, NewSHLAmt);
+
+  // Insert the new nodes into the topological ordering. We must do this in
+  // a valid topological ordering as nothing is going to go back and re-sort
+  // these nodes. We continually insert before 'N' in sequence as this is
+  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
+  // hierarchy left to express.
+  insertDAGNode(DAG, N, NewSRLAmt);
+  insertDAGNode(DAG, N, NewSRL);
+  insertDAGNode(DAG, N, NewMask);
+  insertDAGNode(DAG, N, NewAnd);
+  insertDAGNode(DAG, N, NewSHLAmt);
+  insertDAGNode(DAG, N, NewSHL);
+  DAG.ReplaceAllUsesWith(N, NewSHL);
+
+  AM.Scale = 1 << AMShiftAmt;
+  AM.IndexReg = NewAnd;
+  return false;
+}
+
 bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
                                               unsigned Depth) {
   SDLoc dl(N);
@@ -1620,6 +1679,11 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
     // a scale on the outside of the mask.
     if (!foldMaskedShiftToScaledMask(*CurDAG, N, Mask, Shift, X, AM))
       return false;
+
+    // Try to fold the mask and shift into BEXTR and scale.
+    if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))
+      return false;
+
     break;
   }
   }
@@ -2646,6 +2710,92 @@ bool X86DAGToDAGISel::matchBEXTR(SDNode *Node) {
   return true;
 }
 
+// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
+bool X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
+  MVT NVT = Node->getSimpleValueType(0);
+  SDLoc dl(Node);
+
+  SDValue N0 = Node->getOperand(0);
+  SDValue N1 = Node->getOperand(1);
+
+  // If we have TBM we can use an immediate for the control. If we have BMI
+  // we should only do this if the BEXTR instruction is implemented well.
+  // Otherwise moving the control into a register makes this more costly.
+  // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
+  // hoisting the move immediate would make it worthwhile with a less optimal
+  // BEXTR?
+  if (!Subtarget->hasTBM() &&
+      !(Subtarget->hasBMI() && Subtarget->hasFastBEXTR()))
+    return false;
+
+  // Must have a shift right.
+  if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
+    return false;
+
+  // Shift can't have additional users.
+  if (!N0->hasOneUse())
+    return false;
+
+  // Only supported for 32 and 64 bits.
+  if (NVT != MVT::i32 && NVT != MVT::i64)
+    return false;
+
+  // Shift amount and RHS of and must be constant.
+  ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(N1);
+  ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
+  if (!MaskCst || !ShiftCst)
+    return false;
+
+  // And RHS must be a mask.
+  uint64_t Mask = MaskCst->getZExtValue();
+  if (!isMask_64(Mask))
+    return false;
+
+  uint64_t Shift = ShiftCst->getZExtValue();
+  uint64_t MaskSize = countPopulation(Mask);
+
+  // Don't interfere with something that can be handled by extracting AH.
+  // TODO: If we are able to fold a load, BEXTR might still be better than AH.
+  if (Shift == 8 && MaskSize == 8)
+    return false;
+
+  // Make sure we are only using bits that were in the original value, not
+  // shifted in.
+  if (Shift + MaskSize > NVT.getSizeInBits())
+    return false;
+
+  SDValue New = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
+  unsigned ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
+  unsigned MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
+
+  // BMI requires the immediate to placed in a register.
+  if (!Subtarget->hasTBM()) {
+    ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr;
+    MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm;
+    unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
+    New = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, New), 0);
+  }
+
+  MachineSDNode *NewNode;
+  SDValue Input = N0->getOperand(0);
+  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+  if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
+    SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, New, Input.getOperand(0) };
+    SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
+    NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+    // Update the chain.
+    ReplaceUses(Input.getValue(1), SDValue(NewNode, 1));
+    // Record the mem-refs
+    CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
+  } else {
+    NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, Input, New);
+  }
+
+  ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
+  CurDAG->RemoveDeadNode(Node);
+  return true;
+}
+
 // Emit a PCMISTR(I/M) instruction.
 MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
                                              bool MayFoldLoad, const SDLoc &dl,
@@ -2953,6 +3103,8 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
     break;
 
   case ISD::AND:
+    if (matchBEXTRFromAndImm(Node))
+      return;
     if (matchBEXTR(Node))
       return;
     if (AndImmShrink && shrinkAndImmediate(Node))
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index ab9a14a65a1..67f98d8ee72 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -35278,69 +35278,6 @@ static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
   return SDValue();
 }
 
-static bool hasBEXTR(const X86Subtarget &Subtarget, EVT VT) {
-  // If we have TBM we can use an immediate for the control. If we have BMI
-  // we should only do this if the BEXTR instruction is implemented well.
-  // Otherwise moving the control into a register makes this more costly.
-  // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
-  // hoisting the move immediate would make it worthwhile with a less optimal
-  // BEXTR?
-  if (!Subtarget.hasTBM() && !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
-    return false;
-  return (VT == MVT::i32 || (VT == MVT::i64 && Subtarget.is64Bit()));
-}
-
-// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
-static SDValue combineAndIntoBEXTR(SDNode *Node, SelectionDAG &DAG,
-                                   const X86Subtarget &Subtarget) {
-  EVT NVT = Node->getValueType(0);
-  SDLoc dl(Node);
-
-  SDValue N0 = Node->getOperand(0);
-  SDValue N1 = Node->getOperand(1);
-
-  // Check if subtarget has BEXTR instruction for the node's type
-  if (!hasBEXTR(Subtarget, NVT))
-    return SDValue();
-
-  // Must have a shift right.
-  if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
-    return SDValue();
-
-  // Shift can't have additional users.
-  if (!N0->hasOneUse())
-    return SDValue();
-
-  // Shift amount and RHS of and must be constant.
-  ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(N1);
-  ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
-  if (!MaskCst || !ShiftCst)
-    return SDValue();
-
-  // And RHS must be a mask.
-  uint64_t Mask = MaskCst->getZExtValue();
-  if (!isMask_64(Mask))
-    return SDValue();
-
-  uint64_t Shift = ShiftCst->getZExtValue();
-  uint64_t MaskSize = countPopulation(Mask);
-
-  // Don't interfere with something that can be handled by extracting AH.
-  // TODO: If we are able to fold a load, BEXTR might still be better than AH.
-  if (Shift == 8 && MaskSize == 8)
-    return SDValue();
-
-  // Make sure we are only using bits that were in the original value, not
-  // shifted in.
-  if (Shift + MaskSize > NVT.getSizeInBits())
-    return SDValue();
-
-  // Create a BEXTR node.
-  SDValue C = DAG.getConstant(Shift | (MaskSize << 8), dl, NVT);
-  SDValue New = DAG.getNode(X86ISD::BEXTR, dl, NVT, N0->getOperand(0), C);
-  return New;
-}
-
 // Look for (and (ctpop X), 1) which is the IR form of __builtin_parity.
 // Turn it into series of XORs and a setnp.
 static SDValue combineParity(SDNode *N, SelectionDAG &DAG,
@@ -35442,9 +35379,6 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
-  if (SDValue R = combineAndIntoBEXTR(N, DAG, Subtarget))
-    return R;
-
   if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
     return R;
 
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index de45b4697ac..051832bf4bc 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -2135,17 +2135,3 @@ def : Pat<(cttz_zero_undef (loadi64 addr:$src)), (BSF64rm addr:$src)>;
 let Predicates = [HasMOVBE] in {
  def : Pat<(bswap GR16:$src), (ROL16ri GR16:$src, (i8 8))>;
 }
-
-// These patterns are selected by some custom code in X86ISelDAGToDAG.cpp that
-// custom combines and+srl into BEXTR. We use these patterns to avoid a bunch
-// of manual code for folding loads.
-let Predicates = [HasBMI, NoTBM] in {
-  def : Pat<(X86bextr GR32:$src1, (i32 imm:$src2)),
-            (BEXTR32rr GR32:$src1, (MOV32ri imm:$src2))>;
-  def : Pat<(X86bextr (loadi32 addr:$src1), (i32 imm:$src2)),
-            (BEXTR32rm addr:$src1, (MOV32ri imm:$src2))>;
-  def : Pat<(X86bextr GR64:$src1, mov64imm32:$src2),
-            (BEXTR64rr GR64:$src1, (MOV32ri64 mov64imm32:$src2))>;
-  def : Pat<(X86bextr (loadi64 addr:$src1), mov64imm32:$src2),
-            (BEXTR64rm addr:$src1, (MOV32ri64 mov64imm32:$src2))>;
-} // HasBMI, NoTBM
diff --git a/test/CodeGen/X86/extract-bits.ll b/test/CodeGen/X86/extract-bits.ll
index b16aeb3d350..06f316b14d0 100644
--- a/test/CodeGen/X86/extract-bits.ll
+++ b/test/CodeGen/X86/extract-bits.ll
@@ -5568,23 +5568,69 @@ define i64 @bextr64_d5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 
 ; https://bugs.llvm.org/show_bug.cgi?id=38938
 define void @pr38938(i32* %a0, i64* %a1) {
-; X86-LABEL: pr38938:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    shrl $19, %ecx
-; X86-NEXT:    andl $4092, %ecx # imm = 0xFFC
-; X86-NEXT:    incl (%eax,%ecx)
-; X86-NEXT:    retl
+; X86-NOBMI-LABEL: pr38938:
+; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl (%ecx), %ecx
+; X86-NOBMI-NEXT:    shrl $19, %ecx
+; X86-NOBMI-NEXT:    andl $4092, %ecx # imm = 0xFFC
+; X86-NOBMI-NEXT:    incl (%eax,%ecx)
+; X86-NOBMI-NEXT:    retl
 ;
-; X64-LABEL: pr38938:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rsi), %rax
-; X64-NEXT:    shrq $19, %rax
-; X64-NEXT:    andl $4092, %eax # imm = 0xFFC
-; X64-NEXT:    incl (%rdi,%rax)
-; X64-NEXT:    retq
+; X86-BMI1NOTBM-LABEL: pr38938:
+; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    movl $2581, %edx # imm = 0xA15
+; X86-BMI1NOTBM-NEXT:    bextrl %edx, (%ecx), %ecx
+; X86-BMI1NOTBM-NEXT:    incl (%eax,%ecx,4)
+; X86-BMI1NOTBM-NEXT:    retl
+;
+; X86-BMI1TBM-LABEL: pr38938:
+; X86-BMI1TBM:       # %bb.0:
+; X86-BMI1TBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1TBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1TBM-NEXT:    bextrl $2581, (%ecx), %ecx # imm = 0xA15
+; X86-BMI1TBM-NEXT:    incl (%eax,%ecx,4)
+; X86-BMI1TBM-NEXT:    retl
+;
+; X86-BMI1NOTBMBMI2-LABEL: pr38938:
+; X86-BMI1NOTBMBMI2:       # %bb.0:
+; X86-BMI1NOTBMBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBMBMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBMBMI2-NEXT:    movl $2581, %edx # imm = 0xA15
+; X86-BMI1NOTBMBMI2-NEXT:    bextrl %edx, (%ecx), %ecx
+; X86-BMI1NOTBMBMI2-NEXT:    incl (%eax,%ecx,4)
+; X86-BMI1NOTBMBMI2-NEXT:    retl
+;
+; X64-NOBMI-LABEL: pr38938:
+; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    movq (%rsi), %rax
+; X64-NOBMI-NEXT:    shrq $19, %rax
+; X64-NOBMI-NEXT:    andl $4092, %eax # imm = 0xFFC
+; X64-NOBMI-NEXT:    incl (%rdi,%rax)
+; X64-NOBMI-NEXT:    retq
+;
+; X64-BMI1NOTBM-LABEL: pr38938:
+; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    movl $2581, %eax # imm = 0xA15
+; X64-BMI1NOTBM-NEXT:    bextrq %rax, (%rsi), %rax
+; X64-BMI1NOTBM-NEXT:    incl (%rdi,%rax,4)
+; X64-BMI1NOTBM-NEXT:    retq
+;
+; X64-BMI1TBM-LABEL: pr38938:
+; X64-BMI1TBM:       # %bb.0:
+; X64-BMI1TBM-NEXT:    bextrq $2581, (%rsi), %rax # imm = 0xA15
+; X64-BMI1TBM-NEXT:    incl (%rdi,%rax,4)
+; X64-BMI1TBM-NEXT:    retq
+;
+; X64-BMI1NOTBMBMI2-LABEL: pr38938:
+; X64-BMI1NOTBMBMI2:       # %bb.0:
+; X64-BMI1NOTBMBMI2-NEXT:    movl $2581, %eax # imm = 0xA15
+; X64-BMI1NOTBMBMI2-NEXT:    bextrq %rax, (%rsi), %rax
+; X64-BMI1NOTBMBMI2-NEXT:    incl (%rdi,%rax,4)
+; X64-BMI1NOTBMBMI2-NEXT:    retq
   %tmp = load i64, i64* %a1, align 8
   %tmp1 = lshr i64 %tmp, 21
   %tmp2 = and i64 %tmp1, 1023
diff --git a/test/CodeGen/X86/tbm_patterns.ll b/test/CodeGen/X86/tbm_patterns.ll
index 2b335ea4268..6865cc5a0ef 100644
--- a/test/CodeGen/X86/tbm_patterns.ll
+++ b/test/CodeGen/X86/tbm_patterns.ll
@@ -53,7 +53,8 @@ define i32 @test_x86_tbm_bextri_u32_z2(i32 %a, i32 %b, i32 %c) nounwind {
 ; CHECK-LABEL: test_x86_tbm_bextri_u32_z2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %esi, %eax
-; CHECK-NEXT:    bextrl $3076, %edi, %ecx # imm = 0xC04
+; CHECK-NEXT:    shrl $4, %edi
+; CHECK-NEXT:    testl $4095, %edi # imm = 0xFFF
 ; CHECK-NEXT:    cmovnel %edx, %eax
 ; CHECK-NEXT:    retq
   %t0 = lshr i32 %a, 4
@@ -113,7 +114,8 @@ define i64 @test_x86_tbm_bextri_u64_z2(i64 %a, i64 %b, i64 %c) nounwind {
 ; CHECK-LABEL: test_x86_tbm_bextri_u64_z2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq %rsi, %rax
-; CHECK-NEXT:    bextrl $3076, %edi, %ecx # imm = 0xC04
+; CHECK-NEXT:    shrl $4, %edi
+; CHECK-NEXT:    testl $4095, %edi # imm = 0xFFF
 ; CHECK-NEXT:    cmovneq %rdx, %rax
 ; CHECK-NEXT:    retq
   %t0 = lshr i64 %a, 4
-- 
GitLab


From 29b3e08ecf43747ba6723ff373ef23aace083bf8 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <kparzysz@codeaurora.org>
Date: Thu, 11 Oct 2018 18:26:02 +0000
Subject: [PATCH 0062/1116] [Hexagon] Eliminate potential sources of
 non-determinism in HCE

Also, avoid comparing GUIDs when ordering global addresses, because
source file location can cause different GUID to be calculated. As a
result, a pair of symbols can compare "less" in one directory, but
"greater" in another.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344271 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/Hexagon/HexagonConstExtenders.cpp | 42 +++++++++++++++-----
 1 file changed, 33 insertions(+), 9 deletions(-)

diff --git a/lib/Target/Hexagon/HexagonConstExtenders.cpp b/lib/Target/Hexagon/HexagonConstExtenders.cpp
index 6b48384c737..d096445f144 100644
--- a/lib/Target/Hexagon/HexagonConstExtenders.cpp
+++ b/lib/Target/Hexagon/HexagonConstExtenders.cpp
@@ -376,7 +376,7 @@ namespace {
     using IndexList = SetVector<unsigned>;
     using ExtenderInit = std::pair<ExtValue, ExtExpr>;
     using AssignmentMap = std::map<ExtenderInit, IndexList>;
-    using LocDefMap = std::map<Loc, IndexList>;
+    using LocDefList = std::vector<std::pair<Loc, IndexList>>;
 
     const HexagonInstrInfo *HII = nullptr;
     const HexagonRegisterInfo *HRI = nullptr;
@@ -399,7 +399,7 @@ namespace {
     void assignInits(const ExtRoot &ER, unsigned Begin, unsigned End,
                      AssignmentMap &IMap);
     void calculatePlacement(const ExtenderInit &ExtI, const IndexList &Refs,
-                            LocDefMap &Defs);
+                            LocDefList &Defs);
     Register insertInitializer(Loc DefL, const ExtenderInit &ExtI);
     bool replaceInstrExact(const ExtDesc &ED, Register ExtR);
     bool replaceInstrExpr(const ExtDesc &ED, const ExtenderInit &ExtI,
@@ -731,7 +731,12 @@ bool HCE::ExtRoot::operator< (const HCE::ExtRoot &ER) const {
     case MachineOperand::MO_ExternalSymbol:
       return StringRef(V.SymbolName) < StringRef(ER.V.SymbolName);
     case MachineOperand::MO_GlobalAddress:
-      return V.GV->getGUID() < ER.V.GV->getGUID();
+      // Do not use GUIDs, since they depend on the source path. Moving the
+      // source file to a different directory could cause different GUID
+      // values for a pair of given symbols. These symbols could then compare
+      // "less" in one directory, but "greater" in another.
+      assert(!V.GV->getName().empty() && !ER.V.GV->getName().empty());
+      return V.GV->getName() < ER.V.GV->getName();
     case MachineOperand::MO_BlockAddress: {
       const BasicBlock *ThisB = V.BA->getBasicBlock();
       const BasicBlock *OtherB = ER.V.BA->getBasicBlock();
@@ -1236,9 +1241,13 @@ void HCE::collectInstr(MachineInstr &MI) {
 
 void HCE::collect(MachineFunction &MF) {
   Extenders.clear();
-  for (MachineBasicBlock &MBB : MF)
+  for (MachineBasicBlock &MBB : MF) {
+    // Skip unreachable blocks.
+    if (MBB.getNumber() == -1)
+      continue;
     for (MachineInstr &MI : MBB)
       collectInstr(MI);
+  }
 }
 
 void HCE::assignInits(const ExtRoot &ER, unsigned Begin, unsigned End,
@@ -1463,7 +1472,7 @@ void HCE::assignInits(const ExtRoot &ER, unsigned Begin, unsigned End,
 }
 
 void HCE::calculatePlacement(const ExtenderInit &ExtI, const IndexList &Refs,
-      LocDefMap &Defs) {
+      LocDefList &Defs) {
   if (Refs.empty())
     return;
 
@@ -1510,7 +1519,7 @@ void HCE::calculatePlacement(const ExtenderInit &ExtI, const IndexList &Refs,
     It = DomB->getFirstTerminator();
   }
   Loc DefLoc(DomB, It);
-  Defs.emplace(DefLoc, Refs);
+  Defs.emplace_back(DefLoc, Refs);
 }
 
 HCE::Register HCE::insertInitializer(Loc DefL, const ExtenderInit &ExtI) {
@@ -1880,7 +1889,7 @@ bool HCE::replaceInstr(unsigned Idx, Register ExtR, const ExtenderInit &ExtI) {
 }
 
 bool HCE::replaceExtenders(const AssignmentMap &IMap) {
-  LocDefMap Defs;
+  LocDefList Defs;
   bool Changed = false;
 
   for (const std::pair<ExtenderInit,IndexList> &P : IMap) {
@@ -1947,8 +1956,23 @@ bool HCE::runOnMachineFunction(MachineFunction &MF) {
   AssignmentMap IMap;
 
   collect(MF);
-  llvm::sort(Extenders, [](const ExtDesc &A, const ExtDesc &B) {
-    return ExtValue(A) < ExtValue(B);
+  llvm::sort(Extenders, [this](const ExtDesc &A, const ExtDesc &B) {
+    ExtValue VA(A), VB(B);
+    if (VA != VB)
+      return VA < VB;
+    const MachineInstr *MA = A.UseMI;
+    const MachineInstr *MB = B.UseMI;
+    if (MA == MB) {
+      // If it's the same instruction, compare operand numbers.
+      return A.OpNum < B.OpNum;
+    }
+
+    const MachineBasicBlock *BA = MA->getParent();
+    const MachineBasicBlock *BB = MB->getParent();
+    assert(BA->getNumber() != -1 && BB->getNumber() != -1);
+    if (BA != BB)
+      return BA->getNumber() < BB->getNumber();
+    return MDT->dominates(MA, MB);
   });
 
   bool Changed = false;
-- 
GitLab


From f5647cf249aea2806e8cd6cfe06f783a54b46825 Mon Sep 17 00:00:00 2001
From: Nirav Dave <niravd@google.com>
Date: Thu, 11 Oct 2018 18:28:59 +0000
Subject: [PATCH 0063/1116] [DAG] Fix Big Endian in Load-Store forwarding

Summary:
Correct offset calculation in load-store forwarding for big-endian
targets.

Reviewers: rnk, RKSimon, waltl

Subscribers: sdardis, nemanjai, hiraditya, jrtc27, atanasyan, jsji, llvm-commits

Differential Revision: https://reviews.llvm.org/D53147

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344272 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp      |  5 ++
 test/CodeGen/Mips/cconv/vector.ll             | 70 +++++++++++++------
 .../PowerPC/big-endian-store-forward.ll       | 16 +++++
 3 files changed, 68 insertions(+), 23 deletions(-)
 create mode 100644 test/CodeGen/PowerPC/big-endian-store-forward.ll

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 4a80c1d358d..16834dc1a26 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -12847,6 +12847,11 @@ SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) {
   if (!STCoversLD)
     return SDValue();
 
+  // Normalize for Endianness.
+  if (DAG.getDataLayout().isBigEndian())
+    Offset =
+        (STMemType.getSizeInBits() - LDMemType.getSizeInBits()) / 8 - Offset;
+
   // Memory as copy space (potentially masked).
   if (Offset == 0 && LDType == STType && STMemType == LDMemType) {
     // Simple case: Direct non-truncating forwarding
diff --git a/test/CodeGen/Mips/cconv/vector.ll b/test/CodeGen/Mips/cconv/vector.ll
index 8cec16683ca..29ffe23f712 100644
--- a/test/CodeGen/Mips/cconv/vector.ll
+++ b/test/CodeGen/Mips/cconv/vector.ll
@@ -2045,29 +2045,29 @@ define <2 x i32> @i32_2(<2 x i32> %a, <2 x i32> %b) {
 ; MIPS32R5EB-NEXT:    jr $ra
 ; MIPS32R5EB-NEXT:    nop
 ;
-; MIPS64R5-LABEL: i32_2:
-; MIPS64R5:       # %bb.0:
-; MIPS64R5-NEXT:    daddiu $sp, $sp, -32
-; MIPS64R5-NEXT:    .cfi_def_cfa_offset 32
-; MIPS64R5-NEXT:    sd $5, 16($sp)
-; MIPS64R5-NEXT:    sd $4, 24($sp)
-; MIPS64R5-NEXT:    ldi.b $w0, 0
-; MIPS64R5-NEXT:    lw $1, 20($sp)
-; MIPS64R5-NEXT:    move.v $w1, $w0
-; MIPS64R5-NEXT:    insert.d $w1[0], $5
-; MIPS64R5-NEXT:    insert.d $w1[1], $1
-; MIPS64R5-NEXT:    insert.d $w0[0], $4
-; MIPS64R5-NEXT:    lw $1, 28($sp)
-; MIPS64R5-NEXT:    insert.d $w0[1], $1
-; MIPS64R5-NEXT:    addv.d $w0, $w0, $w1
-; MIPS64R5-NEXT:    copy_s.d $1, $w0[0]
-; MIPS64R5-NEXT:    copy_s.d $2, $w0[1]
-; MIPS64R5-NEXT:    sw $2, 12($sp)
-; MIPS64R5-NEXT:    sw $1, 8($sp)
-; MIPS64R5-NEXT:    ld $2, 8($sp)
-; MIPS64R5-NEXT:    daddiu $sp, $sp, 32
-; MIPS64R5-NEXT:    jr $ra
-; MIPS64R5-NEXT:    nop
+; MIPS64R5EB-LABEL: i32_2:
+; MIPS64R5EB:       # %bb.0:
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, -32
+; MIPS64R5EB-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64R5EB-NEXT:    sd $5, 16($sp)
+; MIPS64R5EB-NEXT:    sd $4, 24($sp)
+; MIPS64R5EB-NEXT:    ldi.b $w0, 0
+; MIPS64R5EB-NEXT:    lw $1, 16($sp)
+; MIPS64R5EB-NEXT:    move.v $w1, $w0
+; MIPS64R5EB-NEXT:    insert.d $w1[0], $1
+; MIPS64R5EB-NEXT:    insert.d $w1[1], $5
+; MIPS64R5EB-NEXT:    lw $1, 24($sp)
+; MIPS64R5EB-NEXT:    insert.d $w0[0], $1
+; MIPS64R5EB-NEXT:    insert.d $w0[1], $4
+; MIPS64R5EB-NEXT:    addv.d $w0, $w0, $w1
+; MIPS64R5EB-NEXT:    copy_s.d $1, $w0[0]
+; MIPS64R5EB-NEXT:    copy_s.d $2, $w0[1]
+; MIPS64R5EB-NEXT:    sw $2, 12($sp)
+; MIPS64R5EB-NEXT:    sw $1, 8($sp)
+; MIPS64R5EB-NEXT:    ld $2, 8($sp)
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, 32
+; MIPS64R5EB-NEXT:    jr $ra
+; MIPS64R5EB-NEXT:    nop
 ;
 ; MIPS32R5EL-LABEL: i32_2:
 ; MIPS32R5EL:       # %bb.0:
@@ -2093,6 +2093,30 @@ define <2 x i32> @i32_2(<2 x i32> %a, <2 x i32> %b) {
 ; MIPS32R5EL-NEXT:    addiu $sp, $sp, 48
 ; MIPS32R5EL-NEXT:    jr $ra
 ; MIPS32R5EL-NEXT:    nop
+;
+; MIPS64R5EL-LABEL: i32_2:
+; MIPS64R5EL:       # %bb.0:
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, -32
+; MIPS64R5EL-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64R5EL-NEXT:    sd $5, 16($sp)
+; MIPS64R5EL-NEXT:    sd $4, 24($sp)
+; MIPS64R5EL-NEXT:    ldi.b $w0, 0
+; MIPS64R5EL-NEXT:    lw $1, 20($sp)
+; MIPS64R5EL-NEXT:    move.v $w1, $w0
+; MIPS64R5EL-NEXT:    insert.d $w1[0], $5
+; MIPS64R5EL-NEXT:    insert.d $w1[1], $1
+; MIPS64R5EL-NEXT:    insert.d $w0[0], $4
+; MIPS64R5EL-NEXT:    lw $1, 28($sp)
+; MIPS64R5EL-NEXT:    insert.d $w0[1], $1
+; MIPS64R5EL-NEXT:    addv.d $w0, $w0, $w1
+; MIPS64R5EL-NEXT:    copy_s.d $1, $w0[0]
+; MIPS64R5EL-NEXT:    copy_s.d $2, $w0[1]
+; MIPS64R5EL-NEXT:    sw $2, 12($sp)
+; MIPS64R5EL-NEXT:    sw $1, 8($sp)
+; MIPS64R5EL-NEXT:    ld $2, 8($sp)
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, 32
+; MIPS64R5EL-NEXT:    jr $ra
+; MIPS64R5EL-NEXT:    nop
   %1 = add <2 x i32> %a, %b
   ret <2 x i32> %1
 }
diff --git a/test/CodeGen/PowerPC/big-endian-store-forward.ll b/test/CodeGen/PowerPC/big-endian-store-forward.ll
new file mode 100644
index 00000000000..1125a229005
--- /dev/null
+++ b/test/CodeGen/PowerPC/big-endian-store-forward.ll
@@ -0,0 +1,16 @@
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s
+
+; The load is to the high byte of the 2-byte store
+@g = global i8 -75
+
+define void @f(i16 %v) {
+; CHECK-LABEL: f
+; CHECK: sth 3, -2(1)
+; CHECK: lbz 3, -2(1)
+  %p32 = alloca i16
+  store i16 %v, i16* %p32
+  %p16 = bitcast i16* %p32 to i8*
+  %tmp = load i8, i8* %p16
+  store i8 %tmp, i8* @g
+  ret void
+}
-- 
GitLab


From 2f2ce25a6b4dc4d37c6f45866ed7310adb6233dc Mon Sep 17 00:00:00 2001
From: Leonard Chan <leonardchan@google.com>
Date: Thu, 11 Oct 2018 18:31:51 +0000
Subject: [PATCH 0064/1116] [PassManager/Sanitizer] Port of AddresSanitizer
 pass from legacy to new PassManager

This patch ports the legacy pass manager to the new one to take advantage of
the benefits of the new PM. This involved moving a lot of the declarations for
`AddressSantizer` to a header so that it can be publicly used via
PassRegistry.def which I believe contains all the passes managed by the new PM.

This patch essentially decouples the instrumentation from the legacy PM such
hat it can be used by both legacy and new PM infrastructure.

Differential Revision: https://reviews.llvm.org/D52739

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344274 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/InitializePasses.h               |   4 +-
 .../Instrumentation/AddressSanitizerPass.h    |  41 ++++
 lib/Passes/PassBuilder.cpp                    |   3 +-
 lib/Passes/PassRegistry.def                   |   2 +
 .../Instrumentation/AddressSanitizer.cpp      | 178 +++++++++++-------
 .../Instrumentation/Instrumentation.cpp       |   4 +-
 .../Instrumentation/AddressSanitizer/basic.ll |   2 +
 7 files changed, 166 insertions(+), 68 deletions(-)
 create mode 100644 include/llvm/Transforms/Instrumentation/AddressSanitizerPass.h

diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h
index 1a9c6f82bfd..42bfc55b1aa 100644
--- a/include/llvm/InitializePasses.h
+++ b/include/llvm/InitializePasses.h
@@ -65,8 +65,8 @@ void initializeAAEvalLegacyPassPass(PassRegistry&);
 void initializeAAResultsWrapperPassPass(PassRegistry&);
 void initializeADCELegacyPassPass(PassRegistry&);
 void initializeAddDiscriminatorsLegacyPassPass(PassRegistry&);
-void initializeAddressSanitizerModulePass(PassRegistry&);
-void initializeAddressSanitizerPass(PassRegistry&);
+void initializeAddressSanitizerModuleLegacyPassPass(PassRegistry &);
+void initializeAddressSanitizerLegacyPassPass(PassRegistry &);
 void initializeAggressiveInstCombinerLegacyPassPass(PassRegistry&);
 void initializeAliasSetPrinterPass(PassRegistry&);
 void initializeAlignmentFromAssumptionsPass(PassRegistry&);
diff --git a/include/llvm/Transforms/Instrumentation/AddressSanitizerPass.h b/include/llvm/Transforms/Instrumentation/AddressSanitizerPass.h
new file mode 100644
index 00000000000..021e1bd4c24
--- /dev/null
+++ b/include/llvm/Transforms/Instrumentation/AddressSanitizerPass.h
@@ -0,0 +1,41 @@
+//===--------- Definition of the AddressSanitizer class ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the AddressSanitizer class which is a port of the legacy
+// AddressSanitizer pass to use the new PassManager infrastructure.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_TRANSFORMS_INSTRUMENTATION_ADDRESSSANITIZERPASS_H
+#define LLVM_TRANSFORMS_INSTRUMENTATION_ADDRESSSANITIZERPASS_H
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+/// Public interface to the address sanitizer pass for instrumenting code to
+/// check for various memory bugs.
+class AddressSanitizerPass : public PassInfoMixin<AddressSanitizerPass> {
+public:
+  explicit AddressSanitizerPass(bool CompileKernel = false,
+                                bool Recover = false,
+                                bool UseAfterScope = false);
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+
+private:
+  bool CompileKernel;
+  bool Recover;
+  bool UseAfterScope;
+};
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Passes/PassBuilder.cpp b/lib/Passes/PassBuilder.cpp
index 94afb5409e1..09758dc5651 100644
--- a/lib/Passes/PassBuilder.cpp
+++ b/lib/Passes/PassBuilder.cpp
@@ -61,7 +61,6 @@
 #include "llvm/Support/Regex.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h"
-#include "llvm/Transforms/Instrumentation/CGProfile.h"
 #include "llvm/Transforms/IPO/AlwaysInliner.h"
 #include "llvm/Transforms/IPO/ArgumentPromotion.h"
 #include "llvm/Transforms/IPO/CalledValuePropagation.h"
@@ -87,7 +86,9 @@
 #include "llvm/Transforms/IPO/SyntheticCountsPropagation.h"
 #include "llvm/Transforms/IPO/WholeProgramDevirt.h"
 #include "llvm/Transforms/InstCombine/InstCombine.h"
+#include "llvm/Transforms/Instrumentation/AddressSanitizerPass.h"
 #include "llvm/Transforms/Instrumentation/BoundsChecking.h"
+#include "llvm/Transforms/Instrumentation/CGProfile.h"
 #include "llvm/Transforms/Instrumentation/ControlHeightReduction.h"
 #include "llvm/Transforms/Instrumentation/GCOVProfiler.h"
 #include "llvm/Transforms/Instrumentation/InstrProfiling.h"
diff --git a/lib/Passes/PassRegistry.def b/lib/Passes/PassRegistry.def
index 8de4541a772..ad03942fb9a 100644
--- a/lib/Passes/PassRegistry.def
+++ b/lib/Passes/PassRegistry.def
@@ -40,6 +40,7 @@ MODULE_ALIAS_ANALYSIS("globals-aa", GlobalsAA())
 #define MODULE_PASS(NAME, CREATE_PASS)
 #endif
 MODULE_PASS("always-inline", AlwaysInlinerPass())
+MODULE_PASS("asan", AddressSanitizerPass(false, false, true))
 MODULE_PASS("called-value-propagation", CalledValuePropagationPass())
 MODULE_PASS("cg-profile", CGProfilePass())
 MODULE_PASS("constmerge", ConstantMergePass())
@@ -147,6 +148,7 @@ FUNCTION_PASS("adce", ADCEPass())
 FUNCTION_PASS("add-discriminators", AddDiscriminatorsPass())
 FUNCTION_PASS("aggressive-instcombine", AggressiveInstCombinePass())
 FUNCTION_PASS("alignment-from-assumptions", AlignmentFromAssumptionsPass())
+FUNCTION_PASS("asan", AddressSanitizerPass(false, false, false))
 FUNCTION_PASS("bdce", BDCEPass())
 FUNCTION_PASS("bounds-checking", BoundsCheckingPass())
 FUNCTION_PASS("break-crit-edges", BreakCriticalEdgesPass())
diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 15eba9089cb..b819565e7ba 100644
--- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -25,7 +25,6 @@
 #include "llvm/ADT/Twine.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/BinaryFormat/MachO.h"
 #include "llvm/IR/Argument.h"
@@ -70,8 +69,10 @@
 #include "llvm/Support/ScopedPrinter.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Instrumentation/AddressSanitizerPass.h"
 #include "llvm/Transforms/Utils/ASanStackFrameLayout.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
 #include "llvm/Transforms/Utils/PromoteMemToReg.h"
 #include <algorithm>
@@ -597,26 +598,22 @@ static size_t RedzoneSizeForScale(int MappingScale) {
 namespace {
 
 /// AddressSanitizer: instrument the code in module to find memory bugs.
-struct AddressSanitizer : public FunctionPass {
-  // Pass identification, replacement for typeid
-  static char ID;
-
-  explicit AddressSanitizer(bool CompileKernel = false, bool Recover = false,
+struct AddressSanitizer {
+  explicit AddressSanitizer(Module &M, DominatorTree *DT,
+                            bool CompileKernel = false, bool Recover = false,
                             bool UseAfterScope = false)
-      : FunctionPass(ID), UseAfterScope(UseAfterScope || ClUseAfterScope) {
+      : UseAfterScope(UseAfterScope || ClUseAfterScope), DT(DT) {
     this->Recover = ClRecover.getNumOccurrences() > 0 ? ClRecover : Recover;
     this->CompileKernel = ClEnableKasan.getNumOccurrences() > 0 ?
         ClEnableKasan : CompileKernel;
-    initializeAddressSanitizerPass(*PassRegistry::getPassRegistry());
-  }
-
-  StringRef getPassName() const override {
-    return "AddressSanitizerFunctionPass";
-  }
 
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<DominatorTreeWrapperPass>();
-    AU.addRequired<TargetLibraryInfoWrapperPass>();
+    // Initialize the private fields. No one has accessed them before.
+    GlobalsMD.init(M);
+    C = &(M.getContext());
+    LongSize = M.getDataLayout().getPointerSizeInBits();
+    IntptrTy = Type::getIntNTy(*C, LongSize);
+    TargetTriple = Triple(M.getTargetTriple());
+    Mapping = getShadowMapping(TargetTriple, LongSize, CompileKernel);
   }
 
   uint64_t getAllocaSizeInBytes(const AllocaInst &AI) const {
@@ -661,12 +658,12 @@ struct AddressSanitizer : public FunctionPass {
                                  Value *SizeArgument, uint32_t Exp);
   void instrumentMemIntrinsic(MemIntrinsic *MI);
   Value *memToShadow(Value *Shadow, IRBuilder<> &IRB);
-  bool runOnFunction(Function &F) override;
   bool maybeInsertAsanInitAtFunctionEntry(Function &F);
   void maybeInsertDynamicShadowAtFunctionEntry(Function &F);
   void markEscapedLocalAllocas(Function &F);
-  bool doInitialization(Module &M) override;
-  bool doFinalization(Module &M) override;
+
+  /// Return true if the function changed.
+  bool instrument(Function &F, const TargetLibraryInfo *TLI);
 
   DominatorTree &getDominatorTree() const { return *DT; }
 
@@ -724,16 +721,12 @@ private:
   DenseMap<const AllocaInst *, bool> ProcessedAllocas;
 };
 
-class AddressSanitizerModule : public ModulePass {
+class AddressSanitizerModule {
 public:
-  // Pass identification, replacement for typeid
-  static char ID;
-
   explicit AddressSanitizerModule(bool CompileKernel = false,
                                   bool Recover = false,
                                   bool UseGlobalsGC = true)
-      : ModulePass(ID),
-        UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC),
+      : UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC),
         // Not a typo: ClWithComdat is almost completely pointless without
         // ClUseGlobalsGC (because then it only works on modules without
         // globals, which are rare); it is a prerequisite for ClUseGlobalsGC;
@@ -742,14 +735,12 @@ public:
         // ClWithComdat and ClUseGlobalsGC unless the frontend says it's ok to
         // do globals-gc.
         UseCtorComdat(UseGlobalsGC && ClWithComdat) {
-          this->Recover = ClRecover.getNumOccurrences() > 0 ?
-              ClRecover : Recover;
-          this->CompileKernel = ClEnableKasan.getNumOccurrences() > 0 ?
-              ClEnableKasan : CompileKernel;
-	}
+    this->Recover = ClRecover.getNumOccurrences() > 0 ? ClRecover : Recover;
+    this->CompileKernel =
+        ClEnableKasan.getNumOccurrences() > 0 ? ClEnableKasan : CompileKernel;
+  }
 
-  bool runOnModule(Module &M) override;
-  StringRef getPassName() const override { return "AddressSanitizerModule"; }
+  bool instrument(Module &M);
 
 private:
   void initializeCallbacks(Module &M);
@@ -1057,18 +1048,100 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
                      Instruction *ThenTerm, Value *ValueIfFalse);
 };
 
+class AddressSanitizerLegacyPass : public FunctionPass {
+public:
+  static char ID;
+
+  explicit AddressSanitizerLegacyPass(bool CompileKernel = false,
+                                      bool Recover = false,
+                                      bool UseAfterScope = false)
+      : FunctionPass(ID), CompileKernel(CompileKernel), Recover(Recover),
+        UseAfterScope(UseAfterScope) {}
+
+  StringRef getPassName() const override {
+    return "AddressSanitizerFunctionPass";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+  }
+
+  bool runOnFunction(Function &F) override {
+    DominatorTree *DTree =
+        &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+    const TargetLibraryInfo *TLI =
+        &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+    AddressSanitizer Sanitizer(*F.getParent(), DTree, CompileKernel, Recover,
+                               UseAfterScope);
+    return Sanitizer.instrument(F, TLI);
+  }
+
+private:
+  bool CompileKernel;
+  bool Recover;
+  bool UseAfterScope;
+};
+
+class AddressSanitizerModuleLegacyPass : public ModulePass {
+public:
+  static char ID;
+
+  explicit AddressSanitizerModuleLegacyPass(bool CompileKernel = false,
+                                            bool Recover = false,
+                                            bool UseAfterScope = true)
+      : ModulePass(ID), CompileKernel(CompileKernel), Recover(Recover),
+        UseAfterScope(UseAfterScope) {}
+
+  StringRef getPassName() const override { return "AddressSanitizerModule"; }
+
+  bool runOnModule(Module &M) override {
+    AddressSanitizerModule Sanitizer(CompileKernel, Recover, UseAfterScope);
+    return Sanitizer.instrument(M);
+  }
+
+private:
+  bool CompileKernel;
+  bool Recover;
+  bool UseAfterScope;
+};
+
 } // end anonymous namespace
 
-char AddressSanitizer::ID = 0;
+AddressSanitizerPass::AddressSanitizerPass(bool CompileKernel, bool Recover,
+                                           bool UseAfterScope)
+    : CompileKernel(CompileKernel), Recover(Recover),
+      UseAfterScope(UseAfterScope) {}
+
+PreservedAnalyses AddressSanitizerPass::run(Function &F,
+                                            AnalysisManager<Function> &AM) {
+  DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+  const TargetLibraryInfo *TLI = &AM.getResult<TargetLibraryAnalysis>(F);
+  AddressSanitizer Sanitizer(*F.getParent(), DT, CompileKernel, Recover,
+                             UseAfterScope);
+  if (Sanitizer.instrument(F, TLI))
+    return PreservedAnalyses::none();
+  return PreservedAnalyses::all();
+}
+
+PreservedAnalyses AddressSanitizerPass::run(Module &M,
+                                            AnalysisManager<Module> &AM) {
+  AddressSanitizerModule Sanitizer(CompileKernel, Recover, UseAfterScope);
+  if (Sanitizer.instrument(M))
+    return PreservedAnalyses::none();
+  return PreservedAnalyses::all();
+}
+
+char AddressSanitizerLegacyPass::ID = 0;
 
 INITIALIZE_PASS_BEGIN(
-    AddressSanitizer, "asan",
+    AddressSanitizerLegacyPass, "asan",
     "AddressSanitizer: detects use-after-free and out-of-bounds bugs.", false,
     false)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_END(
-    AddressSanitizer, "asan",
+    AddressSanitizerLegacyPass, "asan",
     "AddressSanitizer: detects use-after-free and out-of-bounds bugs.", false,
     false)
 
@@ -1076,13 +1149,13 @@ FunctionPass *llvm::createAddressSanitizerFunctionPass(bool CompileKernel,
                                                        bool Recover,
                                                        bool UseAfterScope) {
   assert(!CompileKernel || Recover);
-  return new AddressSanitizer(CompileKernel, Recover, UseAfterScope);
+  return new AddressSanitizerLegacyPass(CompileKernel, Recover, UseAfterScope);
 }
 
-char AddressSanitizerModule::ID = 0;
+char AddressSanitizerModuleLegacyPass::ID = 0;
 
 INITIALIZE_PASS(
-    AddressSanitizerModule, "asan-module",
+    AddressSanitizerModuleLegacyPass, "asan-module",
     "AddressSanitizer: detects use-after-free and out-of-bounds bugs."
     "ModulePass",
     false, false)
@@ -1091,7 +1164,8 @@ ModulePass *llvm::createAddressSanitizerModulePass(bool CompileKernel,
                                                    bool Recover,
                                                    bool UseGlobalsGC) {
   assert(!CompileKernel || Recover);
-  return new AddressSanitizerModule(CompileKernel, Recover, UseGlobalsGC);
+  return new AddressSanitizerModuleLegacyPass(CompileKernel, Recover,
+                                              UseGlobalsGC);
 }
 
 static size_t TypeSizeToSizeIndex(uint32_t TypeSize) {
@@ -2268,7 +2342,7 @@ int AddressSanitizerModule::GetAsanVersion(const Module &M) const {
   return Version;
 }
 
-bool AddressSanitizerModule::runOnModule(Module &M) {
+bool AddressSanitizerModule::instrument(Module &M) {
   C = &(M.getContext());
   int LongSize = M.getDataLayout().getPointerSizeInBits();
   IntptrTy = Type::getIntNTy(*C, LongSize);
@@ -2387,25 +2461,6 @@ void AddressSanitizer::initializeCallbacks(Module &M) {
                                            ArrayType::get(IRB.getInt8Ty(), 0));
 }
 
-// virtual
-bool AddressSanitizer::doInitialization(Module &M) {
-  // Initialize the private fields. No one has accessed them before.
-  GlobalsMD.init(M);
-
-  C = &(M.getContext());
-  LongSize = M.getDataLayout().getPointerSizeInBits();
-  IntptrTy = Type::getIntNTy(*C, LongSize);
-  TargetTriple = Triple(M.getTargetTriple());
-
-  Mapping = getShadowMapping(TargetTriple, LongSize, CompileKernel);
-  return true;
-}
-
-bool AddressSanitizer::doFinalization(Module &M) {
-  GlobalsMD.reset();
-  return false;
-}
-
 bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) {
   // For each NSObject descendant having a +load method, this method is invoked
   // by the ObjC runtime before any of the static constructors is called.
@@ -2479,7 +2534,7 @@ void AddressSanitizer::markEscapedLocalAllocas(Function &F) {
   }
 }
 
-bool AddressSanitizer::runOnFunction(Function &F) {
+bool AddressSanitizer::instrument(Function &F, const TargetLibraryInfo *TLI) {
   if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage) return false;
   if (!ClDebugFunc.empty() && ClDebugFunc == F.getName()) return false;
   if (F.getName().startswith("__asan_")) return false;
@@ -2498,7 +2553,6 @@ bool AddressSanitizer::runOnFunction(Function &F) {
   LLVM_DEBUG(dbgs() << "ASAN instrumenting:\n" << F << "\n");
 
   initializeCallbacks(*F.getParent());
-  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
 
   FunctionStateRAII CleanupObj(this);
 
@@ -2519,8 +2573,6 @@ bool AddressSanitizer::runOnFunction(Function &F) {
   bool IsWrite;
   unsigned Alignment;
   uint64_t TypeSize;
-  const TargetLibraryInfo *TLI =
-      &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
 
   // Fill the set of memory operations to instrument.
   for (auto &BB : F) {
diff --git a/lib/Transforms/Instrumentation/Instrumentation.cpp b/lib/Transforms/Instrumentation/Instrumentation.cpp
index d52b1b92817..ea819c1856b 100644
--- a/lib/Transforms/Instrumentation/Instrumentation.cpp
+++ b/lib/Transforms/Instrumentation/Instrumentation.cpp
@@ -56,8 +56,8 @@ BasicBlock::iterator llvm::PrepareToSplitEntryBlock(BasicBlock &BB,
 /// initializeInstrumentation - Initialize all passes in the TransformUtils
 /// library.
 void llvm::initializeInstrumentation(PassRegistry &Registry) {
-  initializeAddressSanitizerPass(Registry);
-  initializeAddressSanitizerModulePass(Registry);
+  initializeAddressSanitizerLegacyPassPass(Registry);
+  initializeAddressSanitizerModuleLegacyPassPass(Registry);
   initializeBoundsCheckingLegacyPassPass(Registry);
   initializeControlHeightReductionLegacyPassPass(Registry);
   initializeGCOVProfilerLegacyPassPass(Registry);
diff --git a/test/Instrumentation/AddressSanitizer/basic.ll b/test/Instrumentation/AddressSanitizer/basic.ll
index 099965348eb..be80a89392c 100644
--- a/test/Instrumentation/AddressSanitizer/basic.ll
+++ b/test/Instrumentation/AddressSanitizer/basic.ll
@@ -1,7 +1,9 @@
 ; Test basic address sanitizer instrumentation.
 ;
 ; RUN: opt < %s -asan -asan-module -S | FileCheck --check-prefixes=CHECK,CHECK-S3 %s
+; RUN: opt < %s -passes='function(asan),module(asan)' -S | FileCheck --check-prefixes=CHECK,CHECK-S3 %s
 ; RUN: opt < %s -asan -asan-module -asan-mapping-scale=5 -S | FileCheck --check-prefixes=CHECK,CHECK-S5 %s
+; RUN: opt < %s -passes='function(asan),module(asan)' -asan-mapping-scale=5 -S | FileCheck --check-prefixes=CHECK,CHECK-S5 %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-unknown-linux-gnu"
-- 
GitLab


From 4db84ee724af1ef6d709901e826fae6ee67c696a Mon Sep 17 00:00:00 2001
From: Artem Dergachev <artem.dergachev@gmail.com>
Date: Thu, 11 Oct 2018 18:43:08 +0000
Subject: [PATCH 0065/1116] Revert r344197 "[MC][ELF] compute entity size for
 explicit sections"

Revert r344206 "[MC][ELF] Fix section_mergeable_size.ll"

They were causing failures on too many important buildbots for too long.
Please revert eagerly if your fix takes more than a couple of hours to land!


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344278 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/TargetLoweringObjectFileImpl.cpp  | 50 +++++++++----------
 .../CodeGen/Generic/section_mergeable_size.ll |  3 --
 2 files changed, 25 insertions(+), 28 deletions(-)
 delete mode 100644 test/CodeGen/Generic/section_mergeable_size.ll

diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index b046cd81d6c..f6882c40531 100644
--- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -506,30 +506,6 @@ static const MCSymbolELF *getAssociatedSymbol(const GlobalObject *GO,
   return OtherGO ? dyn_cast<MCSymbolELF>(TM.getSymbol(OtherGO)) : nullptr;
 }
 
-static unsigned getEntrySizeForKind(SectionKind Kind) {
-  if (Kind.isMergeable1ByteCString())
-    return 1;
-  else if (Kind.isMergeable2ByteCString())
-    return 2;
-  else if (Kind.isMergeable4ByteCString())
-    return 4;
-  else if (Kind.isMergeableConst4())
-    return 4;
-  else if (Kind.isMergeableConst8())
-    return 8;
-  else if (Kind.isMergeableConst16())
-    return 16;
-  else if (Kind.isMergeableConst32())
-    return 32;
-  else {
-    // We shouldn't have mergeable C strings or mergeable constants that we
-    // didn't handle above.
-    assert(!Kind.isMergeableCString() && "unknown string width");
-    assert(!Kind.isMergeableConst() && "unknown data width");
-    return 0;
-  }
-}
-
 MCSection *TargetLoweringObjectFileELF::getExplicitSectionGlobal(
     const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
   StringRef SectionName = GO->getSection();
@@ -574,7 +550,7 @@ MCSection *TargetLoweringObjectFileELF::getExplicitSectionGlobal(
 
   MCSectionELF *Section = getContext().getELFSection(
       SectionName, getELFSectionType(SectionName, Kind), Flags,
-      getEntrySizeForKind(Kind), Group, UniqueID, AssociatedSymbol);
+      /*EntrySize=*/0, Group, UniqueID, AssociatedSymbol);
   // Make sure that we did not get some other section with incompatible sh_link.
   // This should not be possible due to UniqueID code above.
   assert(Section->getAssociatedSymbol() == AssociatedSymbol &&
@@ -601,6 +577,30 @@ static StringRef getSectionPrefixForGlobal(SectionKind Kind) {
   return ".data.rel.ro";
 }
 
+static unsigned getEntrySizeForKind(SectionKind Kind) {
+  if (Kind.isMergeable1ByteCString())
+    return 1;
+  else if (Kind.isMergeable2ByteCString())
+    return 2;
+  else if (Kind.isMergeable4ByteCString())
+    return 4;
+  else if (Kind.isMergeableConst4())
+    return 4;
+  else if (Kind.isMergeableConst8())
+    return 8;
+  else if (Kind.isMergeableConst16())
+    return 16;
+  else if (Kind.isMergeableConst32())
+    return 32;
+  else {
+    // We shouldn't have mergeable C strings or mergeable constants that we
+    // didn't handle above.
+    assert(!Kind.isMergeableCString() && "unknown string width");
+    assert(!Kind.isMergeableConst() && "unknown data width");
+    return 0;
+  }
+}
+
 static MCSectionELF *selectELFSectionForGlobal(
     MCContext &Ctx, const GlobalObject *GO, SectionKind Kind, Mangler &Mang,
     const TargetMachine &TM, bool EmitUniqueSection, unsigned Flags,
diff --git a/test/CodeGen/Generic/section_mergeable_size.ll b/test/CodeGen/Generic/section_mergeable_size.ll
deleted file mode 100644
index 0a7ddd110c4..00000000000
--- a/test/CodeGen/Generic/section_mergeable_size.ll
+++ /dev/null
@@ -1,3 +0,0 @@
-; RUN: llc < %s | FileCheck %s
-@a = internal unnamed_addr constant [1 x [1 x i32]] zeroinitializer, section ".init.rodata", align 4
-; CHECK: .init.rodata,"aM",{{[@%]}}progbits,4
-- 
GitLab


From 75105c59a05de66c1c3237d21be676f17932c0ca Mon Sep 17 00:00:00 2001
From: Zachary Turner <zturner@google.com>
Date: Thu, 11 Oct 2018 18:45:44 +0000
Subject: [PATCH 0066/1116] Revert SymbolFileNativePDB plugin.

This was originally causing some test failures on non-Windows
platforms, which required fixes in the compiler and linker.  After
those fixes, however, other tests started failing.  Reverting
temporarily until I can address everything.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344279 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/AsmPrinter/CodeViewDebug.cpp | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
index 8232f076a93..4d45a103c5a 100644
--- a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
@@ -73,7 +73,6 @@
 #include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/Path.h"
 #include "llvm/Support/SMLoc.h"
 #include "llvm/Support/ScopedPrinter.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
@@ -135,9 +134,7 @@ StringRef CodeViewDebug::getFullFilepath(const DIFile *File) {
 
   // If this is a Unix-style path, just use it as is. Don't try to canonicalize
   // it textually because one of the path components could be a symlink.
-  if (Dir.startswith("/") || Filename.startswith("/")) {
-    if (llvm::sys::path::is_absolute(Filename, llvm::sys::path::Style::posix))
-      return Filename;
+  if (!Dir.empty() && Dir[0] == '/') {
     Filepath = Dir;
     if (Dir.back() != '/')
       Filepath += '/';
-- 
GitLab


From 07125b4a5bda6c5c6c7debfadf48ac23350f1d7b Mon Sep 17 00:00:00 2001
From: Thomas Lively <tlively@google.com>
Date: Thu, 11 Oct 2018 18:45:48 +0000
Subject: [PATCH 0067/1116] [WebAssembly] Revert rL344180, which was breaking
 expensive checks

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344280 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../WebAssembly/WebAssemblyInstrAtomics.td    | 16 ++++++++++++++
 .../WebAssembly/WebAssemblyInstrCall.td       |  4 ++++
 .../WebAssembly/WebAssemblyInstrControl.td    |  8 +++++++
 .../WebAssembly/WebAssemblyInstrConv.td       | 12 ++++++++++
 .../WebAssembly/WebAssemblyInstrExceptRef.td  |  4 ++++
 .../WebAssembly/WebAssemblyInstrFloat.td      | 12 ++++++++++
 .../WebAssembly/WebAssemblyInstrFormats.td    |  1 -
 .../WebAssembly/WebAssemblyInstrInfo.td       |  7 ++++--
 .../WebAssembly/WebAssemblyInstrInteger.td    |  9 ++++++++
 .../WebAssembly/WebAssemblyInstrMemory.td     | 22 +++++++++++++++++++
 .../WebAssembly/WebAssemblyInstrSIMD.td       |  2 ++
 11 files changed, 94 insertions(+), 3 deletions(-)

diff --git a/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td b/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
index f9d092e4b8a..9eff2cfde0a 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
@@ -24,8 +24,10 @@ multiclass ATOMIC_I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
             Requires<[HasAtomics]>;
 }
 
+let Defs = [ARGUMENTS] in {
 defm ATOMIC_LOAD_I32 : WebAssemblyLoad<I32, "i32.atomic.load", 0xfe10>;
 defm ATOMIC_LOAD_I64 : WebAssemblyLoad<I64, "i64.atomic.load", 0xfe11>;
+} // Defs = [ARGUMENTS]
 
 // Select loads with no constant offset.
 let Predicates = [HasAtomics] in {
@@ -60,11 +62,13 @@ def : LoadPatExternSymOffOnly<i64, atomic_load_64, ATOMIC_LOAD_I64>;
 
 // Extending loads. Note that there are only zero-extending atomic loads, no
 // sign-extending loads.
+let Defs = [ARGUMENTS] in {
 defm ATOMIC_LOAD8_U_I32 : WebAssemblyLoad<I32, "i32.atomic.load8_u", 0xfe12>;
 defm ATOMIC_LOAD16_U_I32 : WebAssemblyLoad<I32, "i32.atomic.load16_u", 0xfe13>;
 defm ATOMIC_LOAD8_U_I64 : WebAssemblyLoad<I64, "i64.atomic.load8_u", 0xfe14>;
 defm ATOMIC_LOAD16_U_I64 : WebAssemblyLoad<I64, "i64.atomic.load16_u", 0xfe15>;
 defm ATOMIC_LOAD32_U_I64 : WebAssemblyLoad<I64, "i64.atomic.load32_u", 0xfe16>;
+} // Defs = [ARGUMENTS]
 
 // Fragments for extending loads. These are different from regular loads because
 // the SDNodes are derived from AtomicSDNode rather than LoadSDNode and
@@ -196,8 +200,10 @@ def : LoadPatExternSymOffOnly<i64, sext_aload_16_64, ATOMIC_LOAD16_U_I64>;
 // Atomic stores
 //===----------------------------------------------------------------------===//
 
+let Defs = [ARGUMENTS] in {
 defm ATOMIC_STORE_I32 : WebAssemblyStore<I32, "i32.atomic.store", 0xfe17>;
 defm ATOMIC_STORE_I64 : WebAssemblyStore<I64, "i64.atomic.store", 0xfe18>;
+} // Defs = [ARGUMENTS]
 
 // We need an 'atomic' version of store patterns because store and atomic_store
 // nodes have different operand orders:
@@ -257,11 +263,13 @@ def : AStorePatExternSymOffOnly<i64, atomic_store_64, ATOMIC_STORE_I64>;
 } // Predicates = [HasAtomics]
 
 // Truncating stores.
+let Defs = [ARGUMENTS] in {
 defm ATOMIC_STORE8_I32 : WebAssemblyStore<I32, "i32.atomic.store8", 0xfe19>;
 defm ATOMIC_STORE16_I32 : WebAssemblyStore<I32, "i32.atomic.store16", 0xfe1a>;
 defm ATOMIC_STORE8_I64 : WebAssemblyStore<I64, "i64.atomic.store8", 0xfe1b>;
 defm ATOMIC_STORE16_I64 : WebAssemblyStore<I64, "i64.atomic.store16", 0xfe1c>;
 defm ATOMIC_STORE32_I64 : WebAssemblyStore<I64, "i64.atomic.store32", 0xfe1d>;
+} // Defs = [ARGUMENTS]
 
 // Fragments for truncating stores.
 
@@ -333,6 +341,8 @@ def : AStorePatExternSymOffOnly<i64, trunc_astore_32_64, ATOMIC_STORE32_I64>;
 // Atomic binary read-modify-writes
 //===----------------------------------------------------------------------===//
 
+let Defs = [ARGUMENTS] in {
+
 multiclass WebAssemblyBinRMW<WebAssemblyRegClass rc, string Name, int Opcode> {
   defm "" : I<(outs rc:$dst),
               (ins P2Align:$p2align, offset32_op:$off, I32:$addr, rc:$val),
@@ -420,6 +430,7 @@ defm ATOMIC_RMW16_U_XCHG_I64 :
   WebAssemblyBinRMW<I64, "i64.atomic.rmw16_u.xchg", 0xfe46>;
 defm ATOMIC_RMW32_U_XCHG_I64 :
   WebAssemblyBinRMW<I64, "i64.atomic.rmw32_u.xchg", 0xfe47>;
+}
 
 // Select binary RMWs with no constant offset.
 class BinRMWPatNoOffset<ValueType ty, PatFrag kind, NI inst> :
@@ -663,6 +674,8 @@ defm : BinRMWTruncExtPattern<
 // Consider adding a pass after instruction selection that optimizes this case
 // if it is frequent.
 
+let Defs = [ARGUMENTS] in {
+
 multiclass WebAssemblyTerRMW<WebAssemblyRegClass rc, string Name, int Opcode> {
   defm "" : I<(outs rc:$dst),
               (ins P2Align:$p2align, offset32_op:$off, I32:$addr, rc:$exp,
@@ -686,6 +699,7 @@ defm ATOMIC_RMW16_U_CMPXCHG_I64 :
   WebAssemblyTerRMW<I64, "i64.atomic.rmw16_u.cmpxchg", 0xfe4d>;
 defm ATOMIC_RMW32_U_CMPXCHG_I64 :
   WebAssemblyTerRMW<I64, "i64.atomic.rmw32_u.cmpxchg", 0xfe4e>;
+}
 
 // Select ternary RMWs with no constant offset.
 class TerRMWPatNoOffset<ValueType ty, PatFrag kind, NI inst> :
@@ -898,6 +912,7 @@ defm : TerRMWTruncExtPattern<
 // Atomic wait / notify
 //===----------------------------------------------------------------------===//
 
+let Defs = [ARGUMENTS] in {
 let hasSideEffects = 1 in {
 defm ATOMIC_NOTIFY :
   I<(outs I32:$dst),
@@ -920,6 +935,7 @@ defm ATOMIC_WAIT_I64 :
     "i64.atomic.wait \t${off}, ${p2align}", 0xfe02>;
 } // mayLoad = 1
 } // hasSideEffects = 1
+} // Defs = [ARGUMENTS]
 
 let Predicates = [HasAtomics] in {
 // Select notifys with no constant offset.
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrCall.td b/lib/Target/WebAssembly/WebAssemblyInstrCall.td
index 07839b79011..3c9caa3f0de 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrCall.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrCall.td
@@ -15,6 +15,8 @@
 // TODO: addr64: These currently assume the callee address is 32-bit.
 // FIXME: add $type to first call_indirect asmstr (and maybe $flags)
 
+let Defs = [ARGUMENTS] in {
+
 // Call sequence markers. These have an immediate which represents the amount of
 // stack space to allocate or free, which is used for varargs lowering.
 let Uses = [SP32, SP64], Defs = [SP32, SP64], isCodeGenOnly = 1 in {
@@ -116,6 +118,8 @@ let Uses = [SP32, SP64], isCall = 1 in {
                               0x11>;
 } // Uses = [SP32,SP64], isCall = 1
 
+} // Defs = [ARGUMENTS]
+
 // Patterns for matching a direct call to a global address.
 def : Pat<(i32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
           (CALL_I32 tglobaladdr:$callee)>;
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/lib/Target/WebAssembly/WebAssemblyInstrControl.td
index ed9879ae454..e27d81937dd 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrControl.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrControl.td
@@ -12,6 +12,8 @@
 ///
 //===----------------------------------------------------------------------===//
 
+let Defs = [ARGUMENTS] in {
+
 let isBranch = 1, isTerminator = 1, hasCtrlDep = 1 in {
 // The condition operand is a boolean value which WebAssembly represents as i32.
 defm BR_IF : I<(outs), (ins bb_op:$dst, I32:$cond),
@@ -28,11 +30,15 @@ defm BR   : NRI<(outs), (ins bb_op:$dst),
 } // isBarrier = 1
 } // isBranch = 1, isTerminator = 1, hasCtrlDep = 1
 
+} // Defs = [ARGUMENTS]
+
 def : Pat<(brcond (i32 (setne I32:$cond, 0)), bb:$dst),
           (BR_IF bb_op:$dst, I32:$cond)>;
 def : Pat<(brcond (i32 (seteq I32:$cond, 0)), bb:$dst),
           (BR_UNLESS bb_op:$dst, I32:$cond)>;
 
+let Defs = [ARGUMENTS] in {
+
 // TODO: SelectionDAG's lowering insists on using a pointer as the index for
 // jump tables, so in practice we don't ever use BR_TABLE_I64 in wasm32 mode
 // currently.
@@ -188,3 +194,5 @@ let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1,
                    [(catchret bb:$dst, bb:$from)], "", 0>;
 }
 }
+
+} // Defs = [ARGUMENTS]
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrConv.td b/lib/Target/WebAssembly/WebAssemblyInstrConv.td
index 0d772c743a7..6dca96f3ddd 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrConv.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrConv.td
@@ -13,6 +13,8 @@
 ///
 //===----------------------------------------------------------------------===//
 
+let Defs = [ARGUMENTS] in {
+
 defm I32_WRAP_I64 : I<(outs I32:$dst), (ins I64:$src), (outs), (ins),
                       [(set I32:$dst, (trunc I64:$src))],
                       "i32.wrap/i64\t$dst, $src", "i32.wrap/i64", 0xa7>;
@@ -49,11 +51,15 @@ defm I64_EXTEND32_S_I64 : I<(outs I64:$dst), (ins I64:$src), (outs), (ins),
                             0xc4>;
 } // Predicates = [HasSignExt]
 
+} // defs = [ARGUMENTS]
+
 // Expand a "don't care" extend into zero-extend (chosen over sign-extend
 // somewhat arbitrarily, although it favors popular hardware architectures
 // and is conceptually a simpler operation).
 def : Pat<(i64 (anyext I32:$src)), (I64_EXTEND_U_I32 I32:$src)>;
 
+let Defs = [ARGUMENTS] in {
+
 // Conversion from floating point to integer instructions which don't trap on
 // overflow or invalid.
 defm I32_TRUNC_S_SAT_F32 : I<(outs I32:$dst), (ins F32:$src), (outs), (ins),
@@ -97,6 +103,8 @@ defm I64_TRUNC_U_SAT_F64 : I<(outs I64:$dst), (ins F64:$src), (outs), (ins),
                              "i64.trunc_u:sat/f64", 0xfc07>,
                              Requires<[HasNontrappingFPToInt]>;
 
+} // Defs = [Arguments]
+
 // Lower llvm.wasm.trunc.saturate.* to saturating instructions
 def : Pat<(int_wasm_trunc_saturate_signed F32:$src),
           (I32_TRUNC_S_SAT_F32 F32:$src)>;
@@ -115,6 +123,8 @@ def : Pat<(int_wasm_trunc_saturate_signed F64:$src),
 def : Pat<(int_wasm_trunc_saturate_unsigned F64:$src),
           (I64_TRUNC_U_SAT_F64 F64:$src)>;
 
+let Defs = [ARGUMENTS] in {
+
 // Conversion from floating point to integer pseudo-instructions which don't
 // trap on overflow or invalid.
 let usesCustomInserter = 1, isCodeGenOnly = 1 in {
@@ -230,3 +240,5 @@ defm F64_REINTERPRET_I64 : I<(outs F64:$dst), (ins I64:$src), (outs), (ins),
                              [(set F64:$dst, (bitconvert I64:$src))],
                              "f64.reinterpret/i64\t$dst, $src",
                              "f64.reinterpret/i64", 0xbf>;
+
+} // Defs = [ARGUMENTS]
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td b/lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td
index a251d60b89e..41b39f69e51 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td
@@ -12,6 +12,8 @@
 ///
 //===----------------------------------------------------------------------===//
 
+let Defs = [ARGUMENTS] in {
+
 defm SELECT_EXCEPT_REF : I<(outs EXCEPT_REF:$dst),
                            (ins EXCEPT_REF:$lhs, EXCEPT_REF:$rhs, I32:$cond),
                            (outs), (ins),
@@ -21,6 +23,8 @@ defm SELECT_EXCEPT_REF : I<(outs EXCEPT_REF:$dst),
                            "except_ref.select\t$dst, $lhs, $rhs, $cond",
                            "except_ref.select", 0x1b>;
 
+} // Defs = [ARGUMENTS]
+
 def : Pat<(select (i32 (setne I32:$cond, 0)), EXCEPT_REF:$lhs, EXCEPT_REF:$rhs),
           (SELECT_EXCEPT_REF EXCEPT_REF:$lhs, EXCEPT_REF:$rhs, I32:$cond)>;
 def : Pat<(select (i32 (seteq I32:$cond, 0)), EXCEPT_REF:$lhs, EXCEPT_REF:$rhs),
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrFloat.td b/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
index 364c485f409..70e27df27e6 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
@@ -45,6 +45,8 @@ multiclass ComparisonFP<CondCode cond, string name, bits<32> f32Inst, bits<32> f
                 !strconcat("f64.", name), f64Inst>;
 }
 
+let Defs = [ARGUMENTS] in {
+
 let isCommutable = 1 in
 defm ADD : BinaryFP<fadd, "add ", 0x92, 0xa0>;
 defm SUB : BinaryFP<fsub, "sub ", 0x93, 0xa1>;
@@ -67,6 +69,8 @@ defm FLOOR : UnaryFP<ffloor, "floor", 0x8e, 0x9c>;
 defm TRUNC : UnaryFP<ftrunc, "trunc", 0x8f, 0x9d>;
 defm NEAREST : UnaryFP<fnearbyint, "nearest", 0x90, 0x9e>;
 
+} // Defs = [ARGUMENTS]
+
 // DAGCombine oddly folds casts into the rhs of copysign. Unfold them.
 def : Pat<(fcopysign F64:$lhs, F32:$rhs),
           (COPYSIGN_F64 F64:$lhs, (F64_PROMOTE_F32 F32:$rhs))>;
@@ -77,6 +81,8 @@ def : Pat<(fcopysign F32:$lhs, F64:$rhs),
 def : Pat<(frint f32:$src), (NEAREST_F32 f32:$src)>;
 def : Pat<(frint f64:$src), (NEAREST_F64 f64:$src)>;
 
+let Defs = [ARGUMENTS] in {
+
 let isCommutable = 1 in {
 defm EQ : ComparisonFP<SETOEQ, "eq  ", 0x5b, 0x61>;
 defm NE : ComparisonFP<SETUNE, "ne  ", 0x5c, 0x62>;
@@ -86,6 +92,8 @@ defm LE : ComparisonFP<SETOLE, "le  ", 0x5f, 0x65>;
 defm GT : ComparisonFP<SETOGT, "gt  ", 0x5e, 0x64>;
 defm GE : ComparisonFP<SETOGE, "ge  ", 0x60, 0x66>;
 
+} // Defs = [ARGUMENTS]
+
 // Don't care floating-point comparisons, supported via other comparisons.
 def : Pat<(seteq f32:$lhs, f32:$rhs), (EQ_F32 f32:$lhs, f32:$rhs)>;
 def : Pat<(setne f32:$lhs, f32:$rhs), (NE_F32 f32:$lhs, f32:$rhs)>;
@@ -100,6 +108,8 @@ def : Pat<(setle f64:$lhs, f64:$rhs), (LE_F64 f64:$lhs, f64:$rhs)>;
 def : Pat<(setgt f64:$lhs, f64:$rhs), (GT_F64 f64:$lhs, f64:$rhs)>;
 def : Pat<(setge f64:$lhs, f64:$rhs), (GE_F64 f64:$lhs, f64:$rhs)>;
 
+let Defs = [ARGUMENTS] in {
+
 defm SELECT_F32 : I<(outs F32:$dst), (ins F32:$lhs, F32:$rhs, I32:$cond),
                     (outs), (ins),
                     [(set F32:$dst, (select I32:$cond, F32:$lhs, F32:$rhs))],
@@ -109,6 +119,8 @@ defm SELECT_F64 : I<(outs F64:$dst), (ins F64:$lhs, F64:$rhs, I32:$cond),
                     [(set F64:$dst, (select I32:$cond, F64:$lhs, F64:$rhs))],
                     "f64.select\t$dst, $lhs, $rhs, $cond", "f64.select", 0x1b>;
 
+} // Defs = [ARGUMENTS]
+
 // ISD::SELECT requires its operand to conform to getBooleanContents, but
 // WebAssembly's select interprets any non-zero value as true, so we can fold
 // a setne with 0 into a select.
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrFormats.td b/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
index 2d23acfc825..683fb3d981f 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
@@ -30,7 +30,6 @@ class NI<dag oops, dag iops, list<dag> pattern, bit stack, string asmstr = "",
   dag OutOperandList = oops;
   dag InOperandList  = iops;
   let Pattern        = pattern;
-  let Defs           = [ARGUMENTS];
 }
 
 // Generates both register and stack based versions of one actual instruction.
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
index 9e1409cf90e..a2ea14cc28b 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -164,8 +164,7 @@ include "WebAssemblyInstrFormats.td"
 //===----------------------------------------------------------------------===//
 
 multiclass ARGUMENT<WebAssemblyRegClass vt> {
-  let hasSideEffects = 1, isCodeGenOnly = 1,
-      Defs = []<Register>, Uses = [ARGUMENTS] in
+  let hasSideEffects = 1, Uses = [ARGUMENTS], isCodeGenOnly = 1 in
   defm ARGUMENT_#vt : I<(outs vt:$res), (ins i32imm:$argno),
                         (outs), (ins i32imm:$argno),
                         [(set vt:$res, (WebAssemblyargument timm:$argno))]>;
@@ -176,6 +175,8 @@ defm "": ARGUMENT<F32>;
 defm "": ARGUMENT<F64>;
 defm "": ARGUMENT<EXCEPT_REF>;
 
+let Defs = [ARGUMENTS] in {
+
 // get_local and set_local are not generated by instruction selection; they
 // are implied by virtual register uses and defs.
 multiclass LOCAL<WebAssemblyRegClass vt> {
@@ -265,6 +266,8 @@ defm CONST_F64 : I<(outs F64:$res), (ins f64imm_op:$imm),
                    "f64.const\t$res, $imm", "f64.const\t$imm", 0x44>;
 } // isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1
 
+} // Defs = [ARGUMENTS]
+
 def : Pat<(i32 (WebAssemblywrapper tglobaladdr:$addr)),
           (CONST_I32 tglobaladdr:$addr)>;
 def : Pat<(i32 (WebAssemblywrapper texternalsym:$addr)),
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInteger.td b/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
index bd41f46214a..44c93de54aa 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
@@ -45,6 +45,9 @@ multiclass ComparisonInt<CondCode cond, string name, bits<32> i32Inst, bits<32>
                 !strconcat("i64.", name), i64Inst>;
 }
 
+
+let Defs = [ARGUMENTS] in {
+
 // The spaces after the names are for aesthetic purposes only, to make
 // operands line up vertically after tab expansion.
 let isCommutable = 1 in
@@ -94,12 +97,16 @@ defm EQZ_I64 : I<(outs I32:$dst), (ins I64:$src), (outs), (ins),
                  [(set I32:$dst, (setcc I64:$src, 0, SETEQ))],
                  "i64.eqz \t$dst, $src", "i64.eqz", 0x50>;
 
+} // Defs = [ARGUMENTS]
+
 // Optimize away an explicit mask on a rotate count.
 def : Pat<(rotl I32:$lhs, (and I32:$rhs, 31)), (ROTL_I32 I32:$lhs, I32:$rhs)>;
 def : Pat<(rotr I32:$lhs, (and I32:$rhs, 31)), (ROTR_I32 I32:$lhs, I32:$rhs)>;
 def : Pat<(rotl I64:$lhs, (and I64:$rhs, 63)), (ROTL_I64 I64:$lhs, I64:$rhs)>;
 def : Pat<(rotr I64:$lhs, (and I64:$rhs, 63)), (ROTR_I64 I64:$lhs, I64:$rhs)>;
 
+let Defs = [ARGUMENTS] in {
+
 defm SELECT_I32 : I<(outs I32:$dst), (ins I32:$lhs, I32:$rhs, I32:$cond),
                     (outs), (ins),
                     [(set I32:$dst, (select I32:$cond, I32:$lhs, I32:$rhs))],
@@ -109,6 +116,8 @@ defm SELECT_I64 : I<(outs I64:$dst), (ins I64:$lhs, I64:$rhs, I32:$cond),
                     [(set I64:$dst, (select I32:$cond, I64:$lhs, I64:$rhs))],
                     "i64.select\t$dst, $lhs, $rhs, $cond", "i64.select", 0x1b>;
 
+} // Defs = [ARGUMENTS]
+
 // ISD::SELECT requires its operand to conform to getBooleanContents, but
 // WebAssembly's select interprets any non-zero value as true, so we can fold
 // a setne with 0 into a select.
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrMemory.td b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
index ccc331d1bf0..76ef1461d22 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
@@ -53,6 +53,8 @@ def regPlusGA : PatFrag<(ops node:$addr, node:$off),
 // We don't need a regPlusES because external symbols never have constant
 // offsets folded into them, so we can just use add.
 
+let Defs = [ARGUMENTS] in {
+
 // Defines atomic and non-atomic loads, regular and extending.
 multiclass WebAssemblyLoad<WebAssemblyRegClass rc, string Name, int Opcode> {
   let mayLoad = 1 in
@@ -71,6 +73,8 @@ defm LOAD_I64 : WebAssemblyLoad<I64, "i64.load", 0x29>;
 defm LOAD_F32 : WebAssemblyLoad<F32, "f32.load", 0x2a>;
 defm LOAD_F64 : WebAssemblyLoad<F64, "f64.load", 0x2b>;
 
+} // Defs = [ARGUMENTS]
+
 // Select loads with no constant offset.
 class LoadPatNoOffset<ValueType ty, PatFrag kind, NI inst> :
   Pat<(ty (kind I32:$addr)), (inst 0, 0, I32:$addr)>;
@@ -140,6 +144,8 @@ def : LoadPatExternSymOffOnly<i64, load, LOAD_I64>;
 def : LoadPatExternSymOffOnly<f32, load, LOAD_F32>;
 def : LoadPatExternSymOffOnly<f64, load, LOAD_F64>;
 
+let Defs = [ARGUMENTS] in {
+
 // Extending load.
 defm LOAD8_S_I32 : WebAssemblyLoad<I32, "i32.load8_s", 0x2c>;
 defm LOAD8_U_I32 : WebAssemblyLoad<I32, "i32.load8_u", 0x2d>;
@@ -152,6 +158,8 @@ defm LOAD16_U_I64 : WebAssemblyLoad<I64, "i64.load16_u", 0x33>;
 defm LOAD32_S_I64 : WebAssemblyLoad<I64, "i64.load32_s", 0x34>;
 defm LOAD32_U_I64 : WebAssemblyLoad<I64, "i64.load32_u", 0x35>;
 
+} // Defs = [ARGUMENTS]
+
 // Select extending loads with no constant offset.
 def : LoadPatNoOffset<i32, sextloadi8, LOAD8_S_I32>;
 def : LoadPatNoOffset<i32, zextloadi8, LOAD8_U_I32>;
@@ -295,6 +303,9 @@ def : LoadPatExternSymOffOnly<i64, extloadi8, LOAD8_U_I64>;
 def : LoadPatExternSymOffOnly<i64, extloadi16, LOAD16_U_I64>;
 def : LoadPatExternSymOffOnly<i64, extloadi32, LOAD32_U_I64>;
 
+
+let Defs = [ARGUMENTS] in {
+
 // Defines atomic and non-atomic stores, regular and truncating
 multiclass WebAssemblyStore<WebAssemblyRegClass rc, string Name, int Opcode> {
   let mayStore = 1 in
@@ -312,6 +323,8 @@ defm STORE_I64  : WebAssemblyStore<I64, "i64.store", 0x37>;
 defm STORE_F32  : WebAssemblyStore<F32, "f32.store", 0x38>;
 defm STORE_F64  : WebAssemblyStore<F64, "f64.store", 0x39>;
 
+} // Defs = [ARGUMENTS]
+
 // Select stores with no constant offset.
 class StorePatNoOffset<ValueType ty, PatFrag node, NI inst> :
   Pat<(node ty:$val, I32:$addr), (inst 0, 0, I32:$addr, ty:$val)>;
@@ -376,6 +389,9 @@ def : StorePatExternSymOffOnly<i64, store, STORE_I64>;
 def : StorePatExternSymOffOnly<f32, store, STORE_F32>;
 def : StorePatExternSymOffOnly<f64, store, STORE_F64>;
 
+
+let Defs = [ARGUMENTS] in {
+
 // Truncating store.
 defm STORE8_I32 : WebAssemblyStore<I32, "i32.store8", 0x3a>;
 defm STORE16_I32 : WebAssemblyStore<I32, "i32.store16", 0x3b>;
@@ -383,6 +399,8 @@ defm STORE8_I64 : WebAssemblyStore<I64, "i64.store8", 0x3c>;
 defm STORE16_I64 : WebAssemblyStore<I64, "i64.store16", 0x3d>;
 defm STORE32_I64 : WebAssemblyStore<I64, "i64.store32", 0x3e>;
 
+} // Defs = [ARGUMENTS]
+
 // Select truncating stores with no constant offset.
 def : StorePatNoOffset<i32, truncstorei8, STORE8_I32>;
 def : StorePatNoOffset<i32, truncstorei16, STORE16_I32>;
@@ -430,6 +448,8 @@ def : StorePatExternSymOffOnly<i64, truncstorei8, STORE8_I64>;
 def : StorePatExternSymOffOnly<i64, truncstorei16, STORE16_I64>;
 def : StorePatExternSymOffOnly<i64, truncstorei32, STORE32_I64>;
 
+let Defs = [ARGUMENTS] in {
+
 // Current memory size.
 defm MEMORY_SIZE_I32 : I<(outs I32:$dst), (ins i32imm:$flags),
                          (outs), (ins i32imm:$flags),
@@ -473,6 +493,8 @@ defm GROW_MEMORY_I32 : I<(outs I32:$dst), (ins i32imm:$flags, I32:$delta),
                          0x40>,
                        Requires<[HasAddr32]>;
 
+} // Defs = [ARGUMENTS]
+
 def : Pat<(int_wasm_current_memory),
           (CURRENT_MEMORY_I32 0)>;
 def : Pat<(int_wasm_grow_memory I32:$delta),
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 419aa0b437f..57024616f3f 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -55,6 +55,7 @@ multiclass ConstVec<ValueType vec_t, dag ops, dag pat, string args> {
                                   "v128.const\t"#args, 0>;
 }
 
+let Defs = [ARGUMENTS] in {
 defm "" : ConstVec<v16i8,
                    (ins vec_i8imm_op:$i0, vec_i8imm_op:$i1,
                         vec_i8imm_op:$i2, vec_i8imm_op:$i3,
@@ -99,6 +100,7 @@ defm "" : ConstVec<v2f64,
                   (ins f64imm_op:$i0, f64imm_op:$i1),
                   (build_vector (f64 fpimm:$i0), (f64 fpimm:$i1)),
                   "$i0, $i1">;
+} // Defs = [ARGUMENTS]
 
 // Create vector with identical lanes: splat
 def splat2 : PatFrag<(ops node:$x), (build_vector node:$x, node:$x)>;
-- 
GitLab


From 19a8ca2849de4671efe24908c3719eb1e294ee06 Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni <sgundapa@codeaurora.org>
Date: Thu, 11 Oct 2018 19:42:46 +0000
Subject: [PATCH 0068/1116] [Pipeliner] Fix the Schedule DAG topoligical order.

This patch updates the DAG change to reflect in the topological ordering
of the nodes.

Differential Revision: https://reviews.llvm.org/D53105


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344282 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/MachinePipeliner.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/CodeGen/MachinePipeliner.cpp b/lib/CodeGen/MachinePipeliner.cpp
index 3d8510f7c0c..1109be15077 100644
--- a/lib/CodeGen/MachinePipeliner.cpp
+++ b/lib/CodeGen/MachinePipeliner.cpp
@@ -1295,6 +1295,7 @@ void SwingSchedulerDAG::changeDependences() {
     // Add a dependence between the new instruction and the instruction
     // that defines the new base.
     SDep Dep(&I, SDep::Anti, NewBase);
+    Topo.AddPred(LastSU, &I);
     LastSU->addPred(Dep);
 
     // Remember the base and offset information so that we can update the
-- 
GitLab


From ba50914be1ad0c99e0dae4bcab52b3b167137cde Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni <sgundapa@codeaurora.org>
Date: Thu, 11 Oct 2018 19:45:07 +0000
Subject: [PATCH 0069/1116] [Pipeliner] Use the Index from Topo instead of
 relying on NodeNum. (NFC)

In future, if we may add any new DAG mutations other than artificial dependencies,
the NodeNum may not be valid. Instead the index from topological schedule DAG can be
used as long as we update it with the DAG change.

Differential Revision: https://reviews.llvm.org/D53104


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344283 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/MachinePipeliner.cpp | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/lib/CodeGen/MachinePipeliner.cpp b/lib/CodeGen/MachinePipeliner.cpp
index 1109be15077..02344225391 100644
--- a/lib/CodeGen/MachinePipeliner.cpp
+++ b/lib/CodeGen/MachinePipeliner.cpp
@@ -278,12 +278,21 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
     BitVector Blocked;
     SmallVector<SmallPtrSet<SUnit *, 4>, 10> B;
     SmallVector<SmallVector<int, 4>, 16> AdjK;
+    // Node to Index from ScheduleDAGTopologicalSort
+    std::vector<int> *Node2Idx;
     unsigned NumPaths;
     static unsigned MaxPaths;
 
   public:
-    Circuits(std::vector<SUnit> &SUs)
-        : SUnits(SUs), Blocked(SUs.size()), B(SUs.size()), AdjK(SUs.size()) {}
+    Circuits(std::vector<SUnit> &SUs, ScheduleDAGTopologicalSort &Topo)
+        : SUnits(SUs), Blocked(SUs.size()), B(SUs.size()), AdjK(SUs.size()) {
+      Node2Idx = new std::vector<int>(SUs.size());
+      unsigned Idx = 0;
+      for (const auto &NodeNum : Topo)
+        Node2Idx->at(NodeNum) = Idx++;
+    }
+
+    ~Circuits() { delete Node2Idx; }
 
     /// Reset the data structures used in the circuit algorithm.
     void reset() {
@@ -1562,7 +1571,8 @@ bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
       ++NumPaths;
       break;
     } else if (!Blocked.test(W)) {
-      if (circuit(W, S, NodeSets, W < V ? true : HasBackedge))
+      if (circuit(W, S, NodeSets,
+                  Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge))
         F = true;
     }
   }
@@ -1602,7 +1612,7 @@ void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
   // but we do this to find the circuits, and then change them back.
   swapAntiDependences(SUnits);
 
-  Circuits Cir(SUnits);
+  Circuits Cir(SUnits, Topo);
   // Create the adjacency structure.
   Cir.createAdjacencyStructure(this);
   for (int i = 0, e = SUnits.size(); i != e; ++i) {
-- 
GitLab


From 2141d146188bfbfce6917ff11f606de819b57d52 Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni <sgundapa@codeaurora.org>
Date: Thu, 11 Oct 2018 19:48:15 +0000
Subject: [PATCH 0070/1116] [Hexagon] Restrict compound instructions with
 constant value.

Having a constant value operand in the compound instruction
is not always profitable. This patch improves coremark by ~4% on
Hexagon.

Differential Revision: https://reviews.llvm.org/D53152


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344284 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/Hexagon/HexagonPatterns.td     | 37 +++++++++++-----
 test/CodeGen/Hexagon/constant_compound.ll | 52 +++++++++++++++++++++++
 2 files changed, 79 insertions(+), 10 deletions(-)
 create mode 100644 test/CodeGen/Hexagon/constant_compound.ll

diff --git a/lib/Target/Hexagon/HexagonPatterns.td b/lib/Target/Hexagon/HexagonPatterns.td
index 2f5033a20af..f671238ec12 100644
--- a/lib/Target/Hexagon/HexagonPatterns.td
+++ b/lib/Target/Hexagon/HexagonPatterns.td
@@ -257,6 +257,23 @@ class pf2<SDNode Op> : PatFrag<(ops node:$a, node:$b), (Op node:$a, node:$b)>;
 class Not2<PatFrag P>
   : PatFrag<(ops node:$A, node:$B), (P node:$A, (not node:$B))>;
 
+// If there is a constant operand that feeds the and/or instruction,
+// do not generate the compound instructions.
+// It is not always profitable, as some times we end up with a transfer.
+// Check the below example.
+// ra = #65820; rb = lsr(rb, #8); rc ^= and (rb, ra)
+// Instead this is preferable.
+// ra = and (#65820, lsr(ra, #8)); rb = xor(rb, ra)
+class Su_ni1<PatFrag Op>
+  : PatFrag<Op.Operands, !head(Op.Fragments), [{
+            if (hasOneUse(N)){
+              // Check if Op1 is an immediate operand.
+              SDValue Op1 = N->getOperand(1);
+              return !dyn_cast<ConstantSDNode>(Op1);
+            }
+            return false;}],
+            Op.OperandTransform>;
+
 class Su<PatFrag Op>
   : PatFrag<Op.Operands, !head(Op.Fragments), [{ return hasOneUse(N); }],
             Op.OperandTransform>;
@@ -1336,16 +1353,16 @@ def: Pat<(mul I32:$Rs, n8_0ImmPred:$n8),
 def: Pat<(add Sext64:$Rs, I64:$Rt),
          (A2_addsp (LoReg Sext64:$Rs), I64:$Rt)>;
 
-def: AccRRR_pat<M4_and_and,   And, Su<And>,       I32,  I32,  I32>;
-def: AccRRR_pat<M4_and_or,    And, Su<Or>,        I32,  I32,  I32>;
-def: AccRRR_pat<M4_and_xor,   And, Su<Xor>,       I32,  I32,  I32>;
-def: AccRRR_pat<M4_or_and,    Or,  Su<And>,       I32,  I32,  I32>;
-def: AccRRR_pat<M4_or_or,     Or,  Su<Or>,        I32,  I32,  I32>;
-def: AccRRR_pat<M4_or_xor,    Or,  Su<Xor>,       I32,  I32,  I32>;
-def: AccRRR_pat<M4_xor_and,   Xor, Su<And>,       I32,  I32,  I32>;
-def: AccRRR_pat<M4_xor_or,    Xor, Su<Or>,        I32,  I32,  I32>;
-def: AccRRR_pat<M2_xor_xacc,  Xor, Su<Xor>,       I32,  I32,  I32>;
-def: AccRRR_pat<M4_xor_xacc,  Xor, Su<Xor>,       I64,  I64,  I64>;
+def: AccRRR_pat<M4_and_and,   And, Su_ni1<And>,  I32,  I32,  I32>;
+def: AccRRR_pat<M4_and_or,    And, Su_ni1<Or>,   I32,  I32,  I32>;
+def: AccRRR_pat<M4_and_xor,   And, Su<Xor>,      I32,  I32,  I32>;
+def: AccRRR_pat<M4_or_and,    Or,  Su_ni1<And>,  I32,  I32,  I32>;
+def: AccRRR_pat<M4_or_or,     Or,  Su_ni1<Or>,   I32,  I32,  I32>;
+def: AccRRR_pat<M4_or_xor,    Or,  Su<Xor>,      I32,  I32,  I32>;
+def: AccRRR_pat<M4_xor_and,   Xor, Su_ni1<And>,  I32,  I32,  I32>;
+def: AccRRR_pat<M4_xor_or,    Xor, Su_ni1<Or>,   I32,  I32,  I32>;
+def: AccRRR_pat<M2_xor_xacc,  Xor, Su<Xor>,      I32,  I32,  I32>;
+def: AccRRR_pat<M4_xor_xacc,  Xor, Su<Xor>,      I64,  I64,  I64>;
 
 // For dags like (or (and (not _), _), (shl _, _)) where the "or" with
 // one argument matches the patterns below, and with the other argument
diff --git a/test/CodeGen/Hexagon/constant_compound.ll b/test/CodeGen/Hexagon/constant_compound.ll
new file mode 100644
index 00000000000..4ca2dc5d4ed
--- /dev/null
+++ b/test/CodeGen/Hexagon/constant_compound.ll
@@ -0,0 +1,52 @@
+; RUN: llc -march=hexagon < %s 2>&1 | FileCheck %s
+
+; Generating a compound instruction with a constant is not profitable.
+; The constant needs to be kept in a register before it is fed to compound
+; instruction.
+; Before, we are generating
+; ra = #65820;
+; rb = lsr(rb, #8);
+; rc ^= and (rb, ra)
+; Now, we are generating
+; ra = and (#65820, lsr(ra, #8));
+; rb = xor(rb, ra)
+
+; CHECK: and(##65280,lsr(r
+; CHECK-NOT : ^= and
+
+define dso_local zeroext i16 @test_compound(i16 zeroext %varA, i16 zeroext %varB) local_unnamed_addr #0 {
+entry:
+  %tmp = zext i16 %varB to i32
+  %tmp1 = and i16 %varA, 255
+  %tmp2 = zext i16 %tmp1 to i32
+  %.masked.i = and i32 %tmp, 255
+  %tmp3 = xor i32 %.masked.i, %tmp2
+  %tmp4 = tail call i64 @llvm.hexagon.M4.pmpyw(i32 %tmp3, i32 255) #2
+  %tmp5 = trunc i64 %tmp4 to i32
+  %tmp6 = and i32 %tmp5, 255
+  %tmp7 = tail call i64 @llvm.hexagon.M4.pmpyw(i32 %tmp6, i32 81922) #2
+  %tmp8 = trunc i64 %tmp7 to i32
+  %tmp9 = xor i32 %tmp8, %tmp
+  %tmp10 = lshr i32 %tmp9, 8
+  %tmp11 = lshr i16 %varA, 8
+  %conv2 = zext i16 %tmp11 to i32
+  %tmp12 = and i32 %tmp10, 65280
+  %.masked.i7 = and i32 %tmp10, 255
+  %tmp13 = xor i32 %.masked.i7, %conv2
+  %tmp14 = tail call i64 @llvm.hexagon.M4.pmpyw(i32 %tmp13, i32 255) #2
+  %tmp15 = trunc i64 %tmp14 to i32
+  %tmp16 = and i32 %tmp15, 255
+  %tmp17 = tail call i64 @llvm.hexagon.M4.pmpyw(i32 %tmp16, i32 81922) #2
+  %tmp18 = trunc i64 %tmp17 to i32
+  %tmp19 = xor i32 %tmp12, %tmp18
+  %tmp20 = lshr i32 %tmp19, 8
+  %tmp21 = trunc i32 %tmp20 to i16
+  ret i16 %tmp21
+}
+
+; Function Attrs: nounwind readnone
+declare i64 @llvm.hexagon.M4.pmpyw(i32, i32) #1
+
+attributes #0 = { nounwind readnone "target-cpu"="hexagonv65" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind }
-- 
GitLab


From 43adb6744a1ec2d32dd6d86d570f87a54d51b64d Mon Sep 17 00:00:00 2001
From: Warren Ristow <warren.ristow@sony.com>
Date: Thu, 11 Oct 2018 20:19:25 +0000
Subject: [PATCH 0071/1116] Update test of r344198 to work with release builds.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344286 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/LTO/X86/libcall-overridden-via-alias.ll | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/LTO/X86/libcall-overridden-via-alias.ll b/test/LTO/X86/libcall-overridden-via-alias.ll
index cac125b2843..04e1512f5b8 100755
--- a/test/LTO/X86/libcall-overridden-via-alias.ll
+++ b/test/LTO/X86/libcall-overridden-via-alias.ll
@@ -13,8 +13,8 @@
 ;
 ; Check that the IR contains the overriding definition of the library routine
 ; in the IR after LTO:
-; CHECK_IR: define internal float @logf(float [[X:%.*]])
-; CHECK_IR-NEXT:   [[TMP:%.*]] = fadd float [[X]], [[X]]
+; CHECK_IR: define internal float @logf(float
+; CHECK_IR-NEXT:   [[TMP:%.*]] = fadd float
 ; CHECK_IR-NEXT:   ret float [[TMP]]
 ;
 ; Check that the assembly code from LTO contains the call to the expected
-- 
GitLab


From b536aafd96d3e218c12d5525d0d145c051e42ae2 Mon Sep 17 00:00:00 2001
From: Thomas Lively <tlively@google.com>
Date: Thu, 11 Oct 2018 20:21:22 +0000
Subject: [PATCH 0072/1116] [WebAssembly][NFC] Remove repetition of Defs =
 [ARGUMENTS] (fixed)

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344287 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../WebAssembly/WebAssemblyInstrAtomics.td    | 16 --------------
 .../WebAssembly/WebAssemblyInstrCall.td       |  4 ----
 .../WebAssembly/WebAssemblyInstrControl.td    |  8 -------
 .../WebAssembly/WebAssemblyInstrConv.td       | 12 ----------
 .../WebAssembly/WebAssemblyInstrExceptRef.td  |  4 ----
 .../WebAssembly/WebAssemblyInstrFloat.td      | 12 ----------
 .../WebAssembly/WebAssemblyInstrFormats.td    |  1 +
 .../WebAssembly/WebAssemblyInstrInfo.td       |  7 ++----
 .../WebAssembly/WebAssemblyInstrInteger.td    |  9 --------
 .../WebAssembly/WebAssemblyInstrMemory.td     | 22 -------------------
 .../WebAssembly/WebAssemblyInstrSIMD.td       |  5 ++---
 11 files changed, 5 insertions(+), 95 deletions(-)

diff --git a/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td b/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
index 9eff2cfde0a..f9d092e4b8a 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
@@ -24,10 +24,8 @@ multiclass ATOMIC_I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
             Requires<[HasAtomics]>;
 }
 
-let Defs = [ARGUMENTS] in {
 defm ATOMIC_LOAD_I32 : WebAssemblyLoad<I32, "i32.atomic.load", 0xfe10>;
 defm ATOMIC_LOAD_I64 : WebAssemblyLoad<I64, "i64.atomic.load", 0xfe11>;
-} // Defs = [ARGUMENTS]
 
 // Select loads with no constant offset.
 let Predicates = [HasAtomics] in {
@@ -62,13 +60,11 @@ def : LoadPatExternSymOffOnly<i64, atomic_load_64, ATOMIC_LOAD_I64>;
 
 // Extending loads. Note that there are only zero-extending atomic loads, no
 // sign-extending loads.
-let Defs = [ARGUMENTS] in {
 defm ATOMIC_LOAD8_U_I32 : WebAssemblyLoad<I32, "i32.atomic.load8_u", 0xfe12>;
 defm ATOMIC_LOAD16_U_I32 : WebAssemblyLoad<I32, "i32.atomic.load16_u", 0xfe13>;
 defm ATOMIC_LOAD8_U_I64 : WebAssemblyLoad<I64, "i64.atomic.load8_u", 0xfe14>;
 defm ATOMIC_LOAD16_U_I64 : WebAssemblyLoad<I64, "i64.atomic.load16_u", 0xfe15>;
 defm ATOMIC_LOAD32_U_I64 : WebAssemblyLoad<I64, "i64.atomic.load32_u", 0xfe16>;
-} // Defs = [ARGUMENTS]
 
 // Fragments for extending loads. These are different from regular loads because
 // the SDNodes are derived from AtomicSDNode rather than LoadSDNode and
@@ -200,10 +196,8 @@ def : LoadPatExternSymOffOnly<i64, sext_aload_16_64, ATOMIC_LOAD16_U_I64>;
 // Atomic stores
 //===----------------------------------------------------------------------===//
 
-let Defs = [ARGUMENTS] in {
 defm ATOMIC_STORE_I32 : WebAssemblyStore<I32, "i32.atomic.store", 0xfe17>;
 defm ATOMIC_STORE_I64 : WebAssemblyStore<I64, "i64.atomic.store", 0xfe18>;
-} // Defs = [ARGUMENTS]
 
 // We need an 'atomic' version of store patterns because store and atomic_store
 // nodes have different operand orders:
@@ -263,13 +257,11 @@ def : AStorePatExternSymOffOnly<i64, atomic_store_64, ATOMIC_STORE_I64>;
 } // Predicates = [HasAtomics]
 
 // Truncating stores.
-let Defs = [ARGUMENTS] in {
 defm ATOMIC_STORE8_I32 : WebAssemblyStore<I32, "i32.atomic.store8", 0xfe19>;
 defm ATOMIC_STORE16_I32 : WebAssemblyStore<I32, "i32.atomic.store16", 0xfe1a>;
 defm ATOMIC_STORE8_I64 : WebAssemblyStore<I64, "i64.atomic.store8", 0xfe1b>;
 defm ATOMIC_STORE16_I64 : WebAssemblyStore<I64, "i64.atomic.store16", 0xfe1c>;
 defm ATOMIC_STORE32_I64 : WebAssemblyStore<I64, "i64.atomic.store32", 0xfe1d>;
-} // Defs = [ARGUMENTS]
 
 // Fragments for truncating stores.
 
@@ -341,8 +333,6 @@ def : AStorePatExternSymOffOnly<i64, trunc_astore_32_64, ATOMIC_STORE32_I64>;
 // Atomic binary read-modify-writes
 //===----------------------------------------------------------------------===//
 
-let Defs = [ARGUMENTS] in {
-
 multiclass WebAssemblyBinRMW<WebAssemblyRegClass rc, string Name, int Opcode> {
   defm "" : I<(outs rc:$dst),
               (ins P2Align:$p2align, offset32_op:$off, I32:$addr, rc:$val),
@@ -430,7 +420,6 @@ defm ATOMIC_RMW16_U_XCHG_I64 :
   WebAssemblyBinRMW<I64, "i64.atomic.rmw16_u.xchg", 0xfe46>;
 defm ATOMIC_RMW32_U_XCHG_I64 :
   WebAssemblyBinRMW<I64, "i64.atomic.rmw32_u.xchg", 0xfe47>;
-}
 
 // Select binary RMWs with no constant offset.
 class BinRMWPatNoOffset<ValueType ty, PatFrag kind, NI inst> :
@@ -674,8 +663,6 @@ defm : BinRMWTruncExtPattern<
 // Consider adding a pass after instruction selection that optimizes this case
 // if it is frequent.
 
-let Defs = [ARGUMENTS] in {
-
 multiclass WebAssemblyTerRMW<WebAssemblyRegClass rc, string Name, int Opcode> {
   defm "" : I<(outs rc:$dst),
               (ins P2Align:$p2align, offset32_op:$off, I32:$addr, rc:$exp,
@@ -699,7 +686,6 @@ defm ATOMIC_RMW16_U_CMPXCHG_I64 :
   WebAssemblyTerRMW<I64, "i64.atomic.rmw16_u.cmpxchg", 0xfe4d>;
 defm ATOMIC_RMW32_U_CMPXCHG_I64 :
   WebAssemblyTerRMW<I64, "i64.atomic.rmw32_u.cmpxchg", 0xfe4e>;
-}
 
 // Select ternary RMWs with no constant offset.
 class TerRMWPatNoOffset<ValueType ty, PatFrag kind, NI inst> :
@@ -912,7 +898,6 @@ defm : TerRMWTruncExtPattern<
 // Atomic wait / notify
 //===----------------------------------------------------------------------===//
 
-let Defs = [ARGUMENTS] in {
 let hasSideEffects = 1 in {
 defm ATOMIC_NOTIFY :
   I<(outs I32:$dst),
@@ -935,7 +920,6 @@ defm ATOMIC_WAIT_I64 :
     "i64.atomic.wait \t${off}, ${p2align}", 0xfe02>;
 } // mayLoad = 1
 } // hasSideEffects = 1
-} // Defs = [ARGUMENTS]
 
 let Predicates = [HasAtomics] in {
 // Select notifys with no constant offset.
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrCall.td b/lib/Target/WebAssembly/WebAssemblyInstrCall.td
index 3c9caa3f0de..07839b79011 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrCall.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrCall.td
@@ -15,8 +15,6 @@
 // TODO: addr64: These currently assume the callee address is 32-bit.
 // FIXME: add $type to first call_indirect asmstr (and maybe $flags)
 
-let Defs = [ARGUMENTS] in {
-
 // Call sequence markers. These have an immediate which represents the amount of
 // stack space to allocate or free, which is used for varargs lowering.
 let Uses = [SP32, SP64], Defs = [SP32, SP64], isCodeGenOnly = 1 in {
@@ -118,8 +116,6 @@ let Uses = [SP32, SP64], isCall = 1 in {
                               0x11>;
 } // Uses = [SP32,SP64], isCall = 1
 
-} // Defs = [ARGUMENTS]
-
 // Patterns for matching a direct call to a global address.
 def : Pat<(i32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
           (CALL_I32 tglobaladdr:$callee)>;
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/lib/Target/WebAssembly/WebAssemblyInstrControl.td
index e27d81937dd..ed9879ae454 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrControl.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrControl.td
@@ -12,8 +12,6 @@
 ///
 //===----------------------------------------------------------------------===//
 
-let Defs = [ARGUMENTS] in {
-
 let isBranch = 1, isTerminator = 1, hasCtrlDep = 1 in {
 // The condition operand is a boolean value which WebAssembly represents as i32.
 defm BR_IF : I<(outs), (ins bb_op:$dst, I32:$cond),
@@ -30,15 +28,11 @@ defm BR   : NRI<(outs), (ins bb_op:$dst),
 } // isBarrier = 1
 } // isBranch = 1, isTerminator = 1, hasCtrlDep = 1
 
-} // Defs = [ARGUMENTS]
-
 def : Pat<(brcond (i32 (setne I32:$cond, 0)), bb:$dst),
           (BR_IF bb_op:$dst, I32:$cond)>;
 def : Pat<(brcond (i32 (seteq I32:$cond, 0)), bb:$dst),
           (BR_UNLESS bb_op:$dst, I32:$cond)>;
 
-let Defs = [ARGUMENTS] in {
-
 // TODO: SelectionDAG's lowering insists on using a pointer as the index for
 // jump tables, so in practice we don't ever use BR_TABLE_I64 in wasm32 mode
 // currently.
@@ -194,5 +188,3 @@ let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1,
                    [(catchret bb:$dst, bb:$from)], "", 0>;
 }
 }
-
-} // Defs = [ARGUMENTS]
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrConv.td b/lib/Target/WebAssembly/WebAssemblyInstrConv.td
index 6dca96f3ddd..0d772c743a7 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrConv.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrConv.td
@@ -13,8 +13,6 @@
 ///
 //===----------------------------------------------------------------------===//
 
-let Defs = [ARGUMENTS] in {
-
 defm I32_WRAP_I64 : I<(outs I32:$dst), (ins I64:$src), (outs), (ins),
                       [(set I32:$dst, (trunc I64:$src))],
                       "i32.wrap/i64\t$dst, $src", "i32.wrap/i64", 0xa7>;
@@ -51,15 +49,11 @@ defm I64_EXTEND32_S_I64 : I<(outs I64:$dst), (ins I64:$src), (outs), (ins),
                             0xc4>;
 } // Predicates = [HasSignExt]
 
-} // defs = [ARGUMENTS]
-
 // Expand a "don't care" extend into zero-extend (chosen over sign-extend
 // somewhat arbitrarily, although it favors popular hardware architectures
 // and is conceptually a simpler operation).
 def : Pat<(i64 (anyext I32:$src)), (I64_EXTEND_U_I32 I32:$src)>;
 
-let Defs = [ARGUMENTS] in {
-
 // Conversion from floating point to integer instructions which don't trap on
 // overflow or invalid.
 defm I32_TRUNC_S_SAT_F32 : I<(outs I32:$dst), (ins F32:$src), (outs), (ins),
@@ -103,8 +97,6 @@ defm I64_TRUNC_U_SAT_F64 : I<(outs I64:$dst), (ins F64:$src), (outs), (ins),
                              "i64.trunc_u:sat/f64", 0xfc07>,
                              Requires<[HasNontrappingFPToInt]>;
 
-} // Defs = [Arguments]
-
 // Lower llvm.wasm.trunc.saturate.* to saturating instructions
 def : Pat<(int_wasm_trunc_saturate_signed F32:$src),
           (I32_TRUNC_S_SAT_F32 F32:$src)>;
@@ -123,8 +115,6 @@ def : Pat<(int_wasm_trunc_saturate_signed F64:$src),
 def : Pat<(int_wasm_trunc_saturate_unsigned F64:$src),
           (I64_TRUNC_U_SAT_F64 F64:$src)>;
 
-let Defs = [ARGUMENTS] in {
-
 // Conversion from floating point to integer pseudo-instructions which don't
 // trap on overflow or invalid.
 let usesCustomInserter = 1, isCodeGenOnly = 1 in {
@@ -240,5 +230,3 @@ defm F64_REINTERPRET_I64 : I<(outs F64:$dst), (ins I64:$src), (outs), (ins),
                              [(set F64:$dst, (bitconvert I64:$src))],
                              "f64.reinterpret/i64\t$dst, $src",
                              "f64.reinterpret/i64", 0xbf>;
-
-} // Defs = [ARGUMENTS]
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td b/lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td
index 41b39f69e51..a251d60b89e 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td
@@ -12,8 +12,6 @@
 ///
 //===----------------------------------------------------------------------===//
 
-let Defs = [ARGUMENTS] in {
-
 defm SELECT_EXCEPT_REF : I<(outs EXCEPT_REF:$dst),
                            (ins EXCEPT_REF:$lhs, EXCEPT_REF:$rhs, I32:$cond),
                            (outs), (ins),
@@ -23,8 +21,6 @@ defm SELECT_EXCEPT_REF : I<(outs EXCEPT_REF:$dst),
                            "except_ref.select\t$dst, $lhs, $rhs, $cond",
                            "except_ref.select", 0x1b>;
 
-} // Defs = [ARGUMENTS]
-
 def : Pat<(select (i32 (setne I32:$cond, 0)), EXCEPT_REF:$lhs, EXCEPT_REF:$rhs),
           (SELECT_EXCEPT_REF EXCEPT_REF:$lhs, EXCEPT_REF:$rhs, I32:$cond)>;
 def : Pat<(select (i32 (seteq I32:$cond, 0)), EXCEPT_REF:$lhs, EXCEPT_REF:$rhs),
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrFloat.td b/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
index 70e27df27e6..364c485f409 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
@@ -45,8 +45,6 @@ multiclass ComparisonFP<CondCode cond, string name, bits<32> f32Inst, bits<32> f
                 !strconcat("f64.", name), f64Inst>;
 }
 
-let Defs = [ARGUMENTS] in {
-
 let isCommutable = 1 in
 defm ADD : BinaryFP<fadd, "add ", 0x92, 0xa0>;
 defm SUB : BinaryFP<fsub, "sub ", 0x93, 0xa1>;
@@ -69,8 +67,6 @@ defm FLOOR : UnaryFP<ffloor, "floor", 0x8e, 0x9c>;
 defm TRUNC : UnaryFP<ftrunc, "trunc", 0x8f, 0x9d>;
 defm NEAREST : UnaryFP<fnearbyint, "nearest", 0x90, 0x9e>;
 
-} // Defs = [ARGUMENTS]
-
 // DAGCombine oddly folds casts into the rhs of copysign. Unfold them.
 def : Pat<(fcopysign F64:$lhs, F32:$rhs),
           (COPYSIGN_F64 F64:$lhs, (F64_PROMOTE_F32 F32:$rhs))>;
@@ -81,8 +77,6 @@ def : Pat<(fcopysign F32:$lhs, F64:$rhs),
 def : Pat<(frint f32:$src), (NEAREST_F32 f32:$src)>;
 def : Pat<(frint f64:$src), (NEAREST_F64 f64:$src)>;
 
-let Defs = [ARGUMENTS] in {
-
 let isCommutable = 1 in {
 defm EQ : ComparisonFP<SETOEQ, "eq  ", 0x5b, 0x61>;
 defm NE : ComparisonFP<SETUNE, "ne  ", 0x5c, 0x62>;
@@ -92,8 +86,6 @@ defm LE : ComparisonFP<SETOLE, "le  ", 0x5f, 0x65>;
 defm GT : ComparisonFP<SETOGT, "gt  ", 0x5e, 0x64>;
 defm GE : ComparisonFP<SETOGE, "ge  ", 0x60, 0x66>;
 
-} // Defs = [ARGUMENTS]
-
 // Don't care floating-point comparisons, supported via other comparisons.
 def : Pat<(seteq f32:$lhs, f32:$rhs), (EQ_F32 f32:$lhs, f32:$rhs)>;
 def : Pat<(setne f32:$lhs, f32:$rhs), (NE_F32 f32:$lhs, f32:$rhs)>;
@@ -108,8 +100,6 @@ def : Pat<(setle f64:$lhs, f64:$rhs), (LE_F64 f64:$lhs, f64:$rhs)>;
 def : Pat<(setgt f64:$lhs, f64:$rhs), (GT_F64 f64:$lhs, f64:$rhs)>;
 def : Pat<(setge f64:$lhs, f64:$rhs), (GE_F64 f64:$lhs, f64:$rhs)>;
 
-let Defs = [ARGUMENTS] in {
-
 defm SELECT_F32 : I<(outs F32:$dst), (ins F32:$lhs, F32:$rhs, I32:$cond),
                     (outs), (ins),
                     [(set F32:$dst, (select I32:$cond, F32:$lhs, F32:$rhs))],
@@ -119,8 +109,6 @@ defm SELECT_F64 : I<(outs F64:$dst), (ins F64:$lhs, F64:$rhs, I32:$cond),
                     [(set F64:$dst, (select I32:$cond, F64:$lhs, F64:$rhs))],
                     "f64.select\t$dst, $lhs, $rhs, $cond", "f64.select", 0x1b>;
 
-} // Defs = [ARGUMENTS]
-
 // ISD::SELECT requires its operand to conform to getBooleanContents, but
 // WebAssembly's select interprets any non-zero value as true, so we can fold
 // a setne with 0 into a select.
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrFormats.td b/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
index 683fb3d981f..2d23acfc825 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
@@ -30,6 +30,7 @@ class NI<dag oops, dag iops, list<dag> pattern, bit stack, string asmstr = "",
   dag OutOperandList = oops;
   dag InOperandList  = iops;
   let Pattern        = pattern;
+  let Defs           = [ARGUMENTS];
 }
 
 // Generates both register and stack based versions of one actual instruction.
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
index a2ea14cc28b..9e1409cf90e 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -164,7 +164,8 @@ include "WebAssemblyInstrFormats.td"
 //===----------------------------------------------------------------------===//
 
 multiclass ARGUMENT<WebAssemblyRegClass vt> {
-  let hasSideEffects = 1, Uses = [ARGUMENTS], isCodeGenOnly = 1 in
+  let hasSideEffects = 1, isCodeGenOnly = 1,
+      Defs = []<Register>, Uses = [ARGUMENTS] in
   defm ARGUMENT_#vt : I<(outs vt:$res), (ins i32imm:$argno),
                         (outs), (ins i32imm:$argno),
                         [(set vt:$res, (WebAssemblyargument timm:$argno))]>;
@@ -175,8 +176,6 @@ defm "": ARGUMENT<F32>;
 defm "": ARGUMENT<F64>;
 defm "": ARGUMENT<EXCEPT_REF>;
 
-let Defs = [ARGUMENTS] in {
-
 // get_local and set_local are not generated by instruction selection; they
 // are implied by virtual register uses and defs.
 multiclass LOCAL<WebAssemblyRegClass vt> {
@@ -266,8 +265,6 @@ defm CONST_F64 : I<(outs F64:$res), (ins f64imm_op:$imm),
                    "f64.const\t$res, $imm", "f64.const\t$imm", 0x44>;
 } // isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1
 
-} // Defs = [ARGUMENTS]
-
 def : Pat<(i32 (WebAssemblywrapper tglobaladdr:$addr)),
           (CONST_I32 tglobaladdr:$addr)>;
 def : Pat<(i32 (WebAssemblywrapper texternalsym:$addr)),
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInteger.td b/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
index 44c93de54aa..bd41f46214a 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
@@ -45,9 +45,6 @@ multiclass ComparisonInt<CondCode cond, string name, bits<32> i32Inst, bits<32>
                 !strconcat("i64.", name), i64Inst>;
 }
 
-
-let Defs = [ARGUMENTS] in {
-
 // The spaces after the names are for aesthetic purposes only, to make
 // operands line up vertically after tab expansion.
 let isCommutable = 1 in
@@ -97,16 +94,12 @@ defm EQZ_I64 : I<(outs I32:$dst), (ins I64:$src), (outs), (ins),
                  [(set I32:$dst, (setcc I64:$src, 0, SETEQ))],
                  "i64.eqz \t$dst, $src", "i64.eqz", 0x50>;
 
-} // Defs = [ARGUMENTS]
-
 // Optimize away an explicit mask on a rotate count.
 def : Pat<(rotl I32:$lhs, (and I32:$rhs, 31)), (ROTL_I32 I32:$lhs, I32:$rhs)>;
 def : Pat<(rotr I32:$lhs, (and I32:$rhs, 31)), (ROTR_I32 I32:$lhs, I32:$rhs)>;
 def : Pat<(rotl I64:$lhs, (and I64:$rhs, 63)), (ROTL_I64 I64:$lhs, I64:$rhs)>;
 def : Pat<(rotr I64:$lhs, (and I64:$rhs, 63)), (ROTR_I64 I64:$lhs, I64:$rhs)>;
 
-let Defs = [ARGUMENTS] in {
-
 defm SELECT_I32 : I<(outs I32:$dst), (ins I32:$lhs, I32:$rhs, I32:$cond),
                     (outs), (ins),
                     [(set I32:$dst, (select I32:$cond, I32:$lhs, I32:$rhs))],
@@ -116,8 +109,6 @@ defm SELECT_I64 : I<(outs I64:$dst), (ins I64:$lhs, I64:$rhs, I32:$cond),
                     [(set I64:$dst, (select I32:$cond, I64:$lhs, I64:$rhs))],
                     "i64.select\t$dst, $lhs, $rhs, $cond", "i64.select", 0x1b>;
 
-} // Defs = [ARGUMENTS]
-
 // ISD::SELECT requires its operand to conform to getBooleanContents, but
 // WebAssembly's select interprets any non-zero value as true, so we can fold
 // a setne with 0 into a select.
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrMemory.td b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
index 76ef1461d22..ccc331d1bf0 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
@@ -53,8 +53,6 @@ def regPlusGA : PatFrag<(ops node:$addr, node:$off),
 // We don't need a regPlusES because external symbols never have constant
 // offsets folded into them, so we can just use add.
 
-let Defs = [ARGUMENTS] in {
-
 // Defines atomic and non-atomic loads, regular and extending.
 multiclass WebAssemblyLoad<WebAssemblyRegClass rc, string Name, int Opcode> {
   let mayLoad = 1 in
@@ -73,8 +71,6 @@ defm LOAD_I64 : WebAssemblyLoad<I64, "i64.load", 0x29>;
 defm LOAD_F32 : WebAssemblyLoad<F32, "f32.load", 0x2a>;
 defm LOAD_F64 : WebAssemblyLoad<F64, "f64.load", 0x2b>;
 
-} // Defs = [ARGUMENTS]
-
 // Select loads with no constant offset.
 class LoadPatNoOffset<ValueType ty, PatFrag kind, NI inst> :
   Pat<(ty (kind I32:$addr)), (inst 0, 0, I32:$addr)>;
@@ -144,8 +140,6 @@ def : LoadPatExternSymOffOnly<i64, load, LOAD_I64>;
 def : LoadPatExternSymOffOnly<f32, load, LOAD_F32>;
 def : LoadPatExternSymOffOnly<f64, load, LOAD_F64>;
 
-let Defs = [ARGUMENTS] in {
-
 // Extending load.
 defm LOAD8_S_I32 : WebAssemblyLoad<I32, "i32.load8_s", 0x2c>;
 defm LOAD8_U_I32 : WebAssemblyLoad<I32, "i32.load8_u", 0x2d>;
@@ -158,8 +152,6 @@ defm LOAD16_U_I64 : WebAssemblyLoad<I64, "i64.load16_u", 0x33>;
 defm LOAD32_S_I64 : WebAssemblyLoad<I64, "i64.load32_s", 0x34>;
 defm LOAD32_U_I64 : WebAssemblyLoad<I64, "i64.load32_u", 0x35>;
 
-} // Defs = [ARGUMENTS]
-
 // Select extending loads with no constant offset.
 def : LoadPatNoOffset<i32, sextloadi8, LOAD8_S_I32>;
 def : LoadPatNoOffset<i32, zextloadi8, LOAD8_U_I32>;
@@ -303,9 +295,6 @@ def : LoadPatExternSymOffOnly<i64, extloadi8, LOAD8_U_I64>;
 def : LoadPatExternSymOffOnly<i64, extloadi16, LOAD16_U_I64>;
 def : LoadPatExternSymOffOnly<i64, extloadi32, LOAD32_U_I64>;
 
-
-let Defs = [ARGUMENTS] in {
-
 // Defines atomic and non-atomic stores, regular and truncating
 multiclass WebAssemblyStore<WebAssemblyRegClass rc, string Name, int Opcode> {
   let mayStore = 1 in
@@ -323,8 +312,6 @@ defm STORE_I64  : WebAssemblyStore<I64, "i64.store", 0x37>;
 defm STORE_F32  : WebAssemblyStore<F32, "f32.store", 0x38>;
 defm STORE_F64  : WebAssemblyStore<F64, "f64.store", 0x39>;
 
-} // Defs = [ARGUMENTS]
-
 // Select stores with no constant offset.
 class StorePatNoOffset<ValueType ty, PatFrag node, NI inst> :
   Pat<(node ty:$val, I32:$addr), (inst 0, 0, I32:$addr, ty:$val)>;
@@ -389,9 +376,6 @@ def : StorePatExternSymOffOnly<i64, store, STORE_I64>;
 def : StorePatExternSymOffOnly<f32, store, STORE_F32>;
 def : StorePatExternSymOffOnly<f64, store, STORE_F64>;
 
-
-let Defs = [ARGUMENTS] in {
-
 // Truncating store.
 defm STORE8_I32 : WebAssemblyStore<I32, "i32.store8", 0x3a>;
 defm STORE16_I32 : WebAssemblyStore<I32, "i32.store16", 0x3b>;
@@ -399,8 +383,6 @@ defm STORE8_I64 : WebAssemblyStore<I64, "i64.store8", 0x3c>;
 defm STORE16_I64 : WebAssemblyStore<I64, "i64.store16", 0x3d>;
 defm STORE32_I64 : WebAssemblyStore<I64, "i64.store32", 0x3e>;
 
-} // Defs = [ARGUMENTS]
-
 // Select truncating stores with no constant offset.
 def : StorePatNoOffset<i32, truncstorei8, STORE8_I32>;
 def : StorePatNoOffset<i32, truncstorei16, STORE16_I32>;
@@ -448,8 +430,6 @@ def : StorePatExternSymOffOnly<i64, truncstorei8, STORE8_I64>;
 def : StorePatExternSymOffOnly<i64, truncstorei16, STORE16_I64>;
 def : StorePatExternSymOffOnly<i64, truncstorei32, STORE32_I64>;
 
-let Defs = [ARGUMENTS] in {
-
 // Current memory size.
 defm MEMORY_SIZE_I32 : I<(outs I32:$dst), (ins i32imm:$flags),
                          (outs), (ins i32imm:$flags),
@@ -493,8 +473,6 @@ defm GROW_MEMORY_I32 : I<(outs I32:$dst), (ins i32imm:$flags, I32:$delta),
                          0x40>,
                        Requires<[HasAddr32]>;
 
-} // Defs = [ARGUMENTS]
-
 def : Pat<(int_wasm_current_memory),
           (CURRENT_MEMORY_I32 0)>;
 def : Pat<(int_wasm_grow_memory I32:$delta),
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 57024616f3f..b575a039ae0 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -22,7 +22,8 @@ multiclass SIMD_I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
 }
 
 multiclass SIMD_ARGUMENT<ValueType vt> {
-  let hasSideEffects = 1, Uses = [ARGUMENTS], isCodeGenOnly = 1 in
+  let hasSideEffects = 1, isCodeGenOnly = 1,
+      Defs = []<Register>, Uses = [ARGUMENTS] in
   defm ARGUMENT_#vt : SIMD_I<(outs V128:$res), (ins i32imm:$argno),
                              (outs), (ins i32imm:$argno),
                              [(set (vt V128:$res),
@@ -55,7 +56,6 @@ multiclass ConstVec<ValueType vec_t, dag ops, dag pat, string args> {
                                   "v128.const\t"#args, 0>;
 }
 
-let Defs = [ARGUMENTS] in {
 defm "" : ConstVec<v16i8,
                    (ins vec_i8imm_op:$i0, vec_i8imm_op:$i1,
                         vec_i8imm_op:$i2, vec_i8imm_op:$i3,
@@ -100,7 +100,6 @@ defm "" : ConstVec<v2f64,
                   (ins f64imm_op:$i0, f64imm_op:$i1),
                   (build_vector (f64 fpimm:$i0), (f64 fpimm:$i1)),
                   "$i0, $i1">;
-} // Defs = [ARGUMENTS]
 
 // Create vector with identical lanes: splat
 def splat2 : PatFrag<(ops node:$x), (build_vector node:$x, node:$x)>;
-- 
GitLab


From efad4789d86a5214fb35a5f07938ed9d54c8b3e1 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Thu, 11 Oct 2018 20:36:06 +0000
Subject: [PATCH 0073/1116] [X86] Type legalize v2f32 loads by using an f64
 load and a scalar_to_vector.

On 64-bit targets the generic legalize will use an i64 load and a scalar_to_vector for us. But on 32-bit targets i64 isn't legal and the generic legalizer will end up emitting two 32-bit loads. We have DAG combines that try to put those two loads back together with pretty good success.

This patch instead uses f64 to avoid the splitting entirely. I've made it do the same for 64-bit mode for consistency and to keep the load in the fp domain.

There are a few things in here that look like regressions in 32-bit mode, but I believe they bring us closer to the 64-bit mode codegen. And that the 64-bit mode code could be better. I think those issues should be looked at separately.

Differential Revision: https://reviews.llvm.org/D52528

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344291 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp            | 24 +++++++++++++++
 test/CodeGen/X86/bitcast-int-to-vector.ll     |  6 ++--
 test/CodeGen/X86/fold-load-vec.ll             |  4 +--
 .../X86/merge-consecutive-loads-256.ll        | 30 ++++++++++---------
 test/CodeGen/X86/sse-intrinsics-fast-isel.ll  | 24 +++++----------
 test/CodeGen/X86/vec_extract-avx.ll           |  4 ++-
 test/CodeGen/X86/vector-shuffle-128-v4.ll     |  4 +--
 test/CodeGen/X86/widen_load-1.ll              |  4 +--
 8 files changed, 61 insertions(+), 39 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 67f98d8ee72..d118e38ae72 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -902,6 +902,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     for (MVT VT : MVT::fp_vector_valuetypes())
       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
 
+    // We want to legalize this to an f64 load rather than an i64 load on
+    // 64-bit targets and two 32-bit loads on a 32-bit target.
+    setOperationAction(ISD::LOAD,               MVT::v2f32, Custom);
+
     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
     setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
@@ -26420,6 +26424,26 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     }
     break;
   }
+  case ISD::LOAD: {
+    // Use an f64 load and a scalar_to_vector for v2f32 loads. This avoids
+    // scalarizing in 32-bit mode. In 64-bit mode this avoids a int->fp cast
+    // since type legalization will try to use an i64 load.
+    EVT VT = N->getValueType(0);
+    assert(VT == MVT::v2f32 && "Unexpected VT");
+    if (!ISD::isNON_EXTLoad(N))
+      return;
+    auto *Ld = cast<LoadSDNode>(N);
+    SDValue Res = DAG.getLoad(MVT::f64, dl, Ld->getChain(), Ld->getBasePtr(),
+                              Ld->getPointerInfo(),
+                              Ld->getAlignment(),
+                              Ld->getMemOperand()->getFlags());
+    SDValue Chain = Res.getValue(1);
+    Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Res);
+    Res = DAG.getBitcast(MVT::v4f32, Res);
+    Results.push_back(Res);
+    Results.push_back(Chain);
+    return;
+  }
   }
 }
 
diff --git a/test/CodeGen/X86/bitcast-int-to-vector.ll b/test/CodeGen/X86/bitcast-int-to-vector.ll
index 1a04fef9e01..e319255e8f0 100644
--- a/test/CodeGen/X86/bitcast-int-to-vector.ll
+++ b/test/CodeGen/X86/bitcast-int-to-vector.ll
@@ -17,8 +17,10 @@ define i1 @foo(i64 %a) {
 ;
 ; X86-SSE-LABEL: foo:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    ucomiss {{[0-9]+}}(%esp), %xmm0
+; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT:    movaps %xmm0, %xmm1
+; X86-SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
+; X86-SSE-NEXT:    ucomiss %xmm1, %xmm0
 ; X86-SSE-NEXT:    setp %al
 ; X86-SSE-NEXT:    retl
 ;
diff --git a/test/CodeGen/X86/fold-load-vec.ll b/test/CodeGen/X86/fold-load-vec.ll
index 5523846dd19..115f2bf7a5b 100644
--- a/test/CodeGen/X86/fold-load-vec.ll
+++ b/test/CodeGen/X86/fold-load-vec.ll
@@ -16,8 +16,8 @@ define void @sample_test(<4 x float>* %source, <2 x float>* %dest) nounwind {
 ; CHECK-NEXT:    movlps %xmm0, (%rsp)
 ; CHECK-NEXT:    movlps %xmm0, (%rsi)
 ; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-NEXT:    callq ext
 ; CHECK-NEXT:    addq $24, %rsp
 ; CHECK-NEXT:    retq
diff --git a/test/CodeGen/X86/merge-consecutive-loads-256.ll b/test/CodeGen/X86/merge-consecutive-loads-256.ll
index f421d41f886..2feb9742c60 100644
--- a/test/CodeGen/X86/merge-consecutive-loads-256.ll
+++ b/test/CodeGen/X86/merge-consecutive-loads-256.ll
@@ -237,33 +237,35 @@ define <4 x i64> @merge_4i64_i64_23zz(i64* %ptr) nounwind uwtable noinline ssp {
 define <8 x float> @merge_8f32_2f32_23z5(<2 x float>* %ptr) nounwind uwtable noinline ssp {
 ; AVX1-LABEL: merge_8f32_2f32_23z5:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT:    vmovups 16(%rdi), %xmm1
-; AVX1-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vmovups 16(%rdi), %xmm0
+; AVX1-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: merge_8f32_2f32_23z5:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT:    vmovdqu 16(%rdi), %xmm1
-; AVX2-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vmovupd 16(%rdi), %xmm0
+; AVX2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: merge_8f32_2f32_23z5:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT:    vmovdqu 16(%rdi), %xmm1
-; AVX512F-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
-; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT:    vmovupd 16(%rdi), %xmm0
+; AVX512F-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX512F-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; X32-AVX-LABEL: merge_8f32_2f32_23z5:
 ; X32-AVX:       # %bb.0:
 ; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; X32-AVX-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5],mem[6,7]
+; X32-AVX-NEXT:    vmovups 16(%eax), %xmm0
+; X32-AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; X32-AVX-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; X32-AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; X32-AVX-NEXT:    retl
   %ptr0 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 2
   %ptr1 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 3
diff --git a/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
index 90e31eb5fb3..47649a54e80 100644
--- a/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
@@ -1329,19 +1329,15 @@ define <4 x float> @test_mm_loadh_pi(<4 x float> %a0, x86_mmx* %a1) {
 ; X86-AVX1-LABEL: test_mm_loadh_pi:
 ; X86-AVX1:       # %bb.0:
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX1-NEXT:    vmovsd (%eax), %xmm1 # encoding: [0xc5,0xfb,0x10,0x08]
-; X86-AVX1-NEXT:    # xmm1 = mem[0],zero
-; X86-AVX1-NEXT:    vmovlhps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x16,0xc1]
-; X86-AVX1-NEXT:    # xmm0 = xmm0[0],xmm1[0]
+; X86-AVX1-NEXT:    vmovhpd (%eax), %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x16,0x00]
+; X86-AVX1-NEXT:    # xmm0 = xmm0[0],mem[0]
 ; X86-AVX1-NEXT:    retl # encoding: [0xc3]
 ;
 ; X86-AVX512-LABEL: test_mm_loadh_pi:
 ; X86-AVX512:       # %bb.0:
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX512-NEXT:    vmovsd (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x08]
-; X86-AVX512-NEXT:    # xmm1 = mem[0],zero
-; X86-AVX512-NEXT:    vmovlhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0xc1]
-; X86-AVX512-NEXT:    # xmm0 = xmm0[0],xmm1[0]
+; X86-AVX512-NEXT:    vmovhpd (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x16,0x00]
+; X86-AVX512-NEXT:    # xmm0 = xmm0[0],mem[0]
 ; X86-AVX512-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-SSE-LABEL: test_mm_loadh_pi:
@@ -1396,19 +1392,15 @@ define <4 x float> @test_mm_loadl_pi(<4 x float> %a0, x86_mmx* %a1) {
 ; X86-AVX1-LABEL: test_mm_loadl_pi:
 ; X86-AVX1:       # %bb.0:
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX1-NEXT:    vmovsd (%eax), %xmm1 # encoding: [0xc5,0xfb,0x10,0x08]
-; X86-AVX1-NEXT:    # xmm1 = mem[0],zero
-; X86-AVX1-NEXT:    vblendps $3, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x03]
-; X86-AVX1-NEXT:    # xmm0 = xmm1[0,1],xmm0[2,3]
+; X86-AVX1-NEXT:    vmovlpd (%eax), %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x12,0x00]
+; X86-AVX1-NEXT:    # xmm0 = mem[0],xmm0[1]
 ; X86-AVX1-NEXT:    retl # encoding: [0xc3]
 ;
 ; X86-AVX512-LABEL: test_mm_loadl_pi:
 ; X86-AVX512:       # %bb.0:
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX512-NEXT:    vmovsd (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x08]
-; X86-AVX512-NEXT:    # xmm1 = mem[0],zero
-; X86-AVX512-NEXT:    vblendps $3, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x03]
-; X86-AVX512-NEXT:    # xmm0 = xmm1[0,1],xmm0[2,3]
+; X86-AVX512-NEXT:    vmovlpd (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x12,0x00]
+; X86-AVX512-NEXT:    # xmm0 = mem[0],xmm0[1]
 ; X86-AVX512-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-SSE-LABEL: test_mm_loadl_pi:
diff --git a/test/CodeGen/X86/vec_extract-avx.ll b/test/CodeGen/X86/vec_extract-avx.ll
index 9a12d69b46b..a15424a763e 100644
--- a/test/CodeGen/X86/vec_extract-avx.ll
+++ b/test/CodeGen/X86/vec_extract-avx.ll
@@ -171,7 +171,9 @@ define void @legal_vzmovl_2f32_8f32(<2 x float>* %in, <8 x float>* %out) {
 ; X32:       # %bb.0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X32-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; X32-NEXT:    vmovaps %ymm0, (%eax)
 ; X32-NEXT:    vzeroupper
 ; X32-NEXT:    retl
diff --git a/test/CodeGen/X86/vector-shuffle-128-v4.ll b/test/CodeGen/X86/vector-shuffle-128-v4.ll
index e35f664e121..0e4d5dcd386 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -1998,8 +1998,8 @@ define <4 x i32> @mask_v4i32_0127(<4 x i32> %a, <4 x i32> %b) {
 define <4 x float> @broadcast_v4f32_0101_from_v2f32(<2 x float>* %x) {
 ; SSE2-LABEL: broadcast_v4f32_0101_from_v2f32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: broadcast_v4f32_0101_from_v2f32:
diff --git a/test/CodeGen/X86/widen_load-1.ll b/test/CodeGen/X86/widen_load-1.ll
index 2e4acb57ee4..8cbf8c4e346 100644
--- a/test/CodeGen/X86/widen_load-1.ll
+++ b/test/CodeGen/X86/widen_load-1.ll
@@ -5,11 +5,11 @@
 
 ; This load should be before the call, not after.
 
-; SSE: movaps    compl+128(%rip), %xmm0
+; SSE: movsd     compl+128(%rip), %xmm0
 ; SSE: movaps  %xmm0, (%rsp)
 ; SSE: callq   killcommon
 
-; AVX: vmovaps    compl+128(%rip), %xmm0
+; AVX: vmovsd     compl+128(%rip), %xmm0
 ; AVX: vmovaps  %xmm0, (%rsp)
 ; AVX: callq   killcommon
 
-- 
GitLab


From 7034ff81096837cc8e955a692205dc3042735393 Mon Sep 17 00:00:00 2001
From: Aaron Smith <aaron.smith@microsoft.com>
Date: Thu, 11 Oct 2018 21:37:18 +0000
Subject: [PATCH 0074/1116] [llvm-pdbutil] Pretty print PDBSymbolUsingNamespace
 symbols

Reviewers: rnk, zturner, llvm-commits

Differential Revision: https://reviews.llvm.org/D52799

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344298 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/tools/llvm-pdbdump/Inputs/UsingNamespaceTest.cpp | 11 +++++++++++
 test/tools/llvm-pdbdump/Inputs/UsingNamespaceTest.pdb |  0
 test/tools/llvm-pdbdump/usingnamespace.test           |  6 ++++++
 tools/llvm-pdbutil/PrettyCompilandDumper.cpp          | 11 +++++++++++
 tools/llvm-pdbutil/PrettyCompilandDumper.h            |  1 +
 5 files changed, 29 insertions(+)
 create mode 100644 test/tools/llvm-pdbdump/Inputs/UsingNamespaceTest.cpp
 create mode 100644 test/tools/llvm-pdbdump/Inputs/UsingNamespaceTest.pdb
 create mode 100644 test/tools/llvm-pdbdump/usingnamespace.test

diff --git a/test/tools/llvm-pdbdump/Inputs/UsingNamespaceTest.cpp b/test/tools/llvm-pdbdump/Inputs/UsingNamespaceTest.cpp
new file mode 100644
index 00000000000..403ada17713
--- /dev/null
+++ b/test/tools/llvm-pdbdump/Inputs/UsingNamespaceTest.cpp
@@ -0,0 +1,11 @@
+// Compile with "cl /c /Zi /GR- UsingNamespaceTest.cpp"
+// Link with "link UsingNamespaceTest.obj /debug /nodefaultlib /entry:main"
+
+namespace NS {
+  int foo() { return 1; }
+}
+
+using namespace NS;
+int main(int argc, char **argv) {
+  return foo();
+}
diff --git a/test/tools/llvm-pdbdump/Inputs/UsingNamespaceTest.pdb b/test/tools/llvm-pdbdump/Inputs/UsingNamespaceTest.pdb
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/test/tools/llvm-pdbdump/usingnamespace.test b/test/tools/llvm-pdbdump/usingnamespace.test
new file mode 100644
index 00000000000..954ec114010
--- /dev/null
+++ b/test/tools/llvm-pdbdump/usingnamespace.test
@@ -0,0 +1,6 @@
+; RUN: llvm-pdbutil pretty -module-syms %p/Inputs/UsingNamespaceTest.pdb > %t
+; RUN: FileCheck -input-file=%t %s
+
+; CHECK: ---SYMBOLS---
+; CHECK-NEXT: {{.*}}UsingNamespaceTest.obj
+; CHECK-DAG: using namespace NS
diff --git a/tools/llvm-pdbutil/PrettyCompilandDumper.cpp b/tools/llvm-pdbutil/PrettyCompilandDumper.cpp
index 0d99c9b1245..94a0b2d5e78 100644
--- a/tools/llvm-pdbutil/PrettyCompilandDumper.cpp
+++ b/tools/llvm-pdbutil/PrettyCompilandDumper.cpp
@@ -28,6 +28,7 @@
 #include "llvm/DebugInfo/PDB/PDBSymbolThunk.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolUnknown.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolUsingNamespace.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/raw_ostream.h"
@@ -216,3 +217,13 @@ void CompilandDumper::dump(const PDBSymbolUnknown &Symbol) {
   Printer.NewLine();
   Printer << "unknown (" << Symbol.getSymTag() << ")";
 }
+
+void CompilandDumper::dump(const PDBSymbolUsingNamespace &Symbol) {
+  if (Printer.IsSymbolExcluded(Symbol.getName()))
+    return;
+
+  Printer.NewLine();
+  Printer << "using namespace ";
+  std::string Name = Symbol.getName();
+  WithColor(Printer, PDB_ColorItem::Identifier).get() << Name;
+}
diff --git a/tools/llvm-pdbutil/PrettyCompilandDumper.h b/tools/llvm-pdbutil/PrettyCompilandDumper.h
index cae196e9d13..1a840e49607 100644
--- a/tools/llvm-pdbutil/PrettyCompilandDumper.h
+++ b/tools/llvm-pdbutil/PrettyCompilandDumper.h
@@ -34,6 +34,7 @@ public:
   void dump(const PDBSymbolThunk &Symbol) override;
   void dump(const PDBSymbolTypeTypedef &Symbol) override;
   void dump(const PDBSymbolUnknown &Symbol) override;
+  void dump(const PDBSymbolUsingNamespace &Symbol) override;
 
 private:
   LinePrinter &Printer;
-- 
GitLab


From b4c8a95abbbc8ccdf4079426ddc22b8ba991ea2b Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Thu, 11 Oct 2018 21:44:38 +0000
Subject: [PATCH 0075/1116] [x86] regenerate CHECKs; NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344301 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/extract-insert.ll | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/test/CodeGen/X86/extract-insert.ll b/test/CodeGen/X86/extract-insert.ll
index 823390e86d1..a6cac874c41 100644
--- a/test/CodeGen/X86/extract-insert.ll
+++ b/test/CodeGen/X86/extract-insert.ll
@@ -3,13 +3,9 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=X64
 
 define i32 @extractelt_undef_insertelt(i32 %x, i32 %y) {
-; X86-LABEL: extractelt_undef_insertelt:
-; X86:       # %bb.0:
-; X86-NEXT:    retl
-;
-; X64-LABEL: extractelt_undef_insertelt:
-; X64:       # %bb.0:
-; X64-NEXT:    retq
+; CHECK-LABEL: extractelt_undef_insertelt:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ret{{[l|q]}}
   %b = insertelement <4 x i32> zeroinitializer, i32 %x, i64 3
   %c = icmp uge i32 %y, %y
   %d = extractelement <4 x i32> %b, i1 %c
-- 
GitLab


From 9b3effed99daf83effa68df8706fb12a15e9a65d Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Thu, 11 Oct 2018 22:04:36 +0000
Subject: [PATCH 0076/1116] [x86] add tests for extract_element; NFC

The transform for this pattern has an unnecessary one-use limitation.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344303 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/extract-insert.ll | 44 ++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/test/CodeGen/X86/extract-insert.ll b/test/CodeGen/X86/extract-insert.ll
index a6cac874c41..de8ee704b88 100644
--- a/test/CodeGen/X86/extract-insert.ll
+++ b/test/CodeGen/X86/extract-insert.ll
@@ -12,3 +12,47 @@ define i32 @extractelt_undef_insertelt(i32 %x, i32 %y) {
   ret i32 %d
 }
 
+define i8 @extractelt_bitcast(i32 %x) nounwind {
+; X86-LABEL: extractelt_bitcast:
+; X86:       # %bb.0:
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: extractelt_bitcast:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %bc = bitcast i32 %x to <4 x i8>
+  %ext = extractelement <4 x i8> %bc, i32 0
+  ret i8 %ext
+}
+
+define i8 @extractelt_bitcast_extra_use(i32 %x, <4 x i8>* %p) nounwind {
+; X86-LABEL: extractelt_bitcast_extra_use:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movd %eax, %xmm0
+; X86-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %eax, (%ecx)
+; X86-NEXT:    movd %xmm0, %eax
+; X86-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NEXT:    popl %ecx
+; X86-NEXT:    retl
+;
+; X64-LABEL: extractelt_bitcast_extra_use:
+; X64:       # %bb.0:
+; X64-NEXT:    movd %edi, %xmm0
+; X64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-NEXT:    movl %edi, (%rsi)
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %bc = bitcast i32 %x to <4 x i8>
+  store <4 x i8> %bc, <4 x i8>* %p
+  %ext = extractelement <4 x i8> %bc, i32 0
+  ret i8 %ext
+}
+
-- 
GitLab


From d7ed5b6ae7ec07fd6ead0b495a2c73e7d34728e0 Mon Sep 17 00:00:00 2001
From: Wei Mi <wmi@google.com>
Date: Thu, 11 Oct 2018 22:14:27 +0000
Subject: [PATCH 0077/1116] [SampleFDO][NFC] Remove debugging log left over in
 the code.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344304 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/ProfileData/SampleProf.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/include/llvm/ProfileData/SampleProf.h b/include/llvm/ProfileData/SampleProf.h
index e632a1c955b..927dfd24687 100644
--- a/include/llvm/ProfileData/SampleProf.h
+++ b/include/llvm/ProfileData/SampleProf.h
@@ -488,8 +488,6 @@ public:
   // If the format is SPF_Compact_Binary, the name is already a GUID and we
   // don't want to return the GUID of GUID.
   static uint64_t getGUID(StringRef Name) {
-    if (Format == SPF_Compact_Binary)
-      errs() << Name << '\n';
     return (Format == SPF_Compact_Binary) ? std::stoull(Name.data())
                                           : Function::getGUID(Name);
   }
-- 
GitLab


From 744c960d1283eb9afea0ece9cd8653b382592171 Mon Sep 17 00:00:00 2001
From: Aaron Smith <aaron.smith@microsoft.com>
Date: Thu, 11 Oct 2018 22:25:55 +0000
Subject: [PATCH 0078/1116] [llvm-pdbutil] Add missing pdb for test

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344306 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../llvm-pdbdump/Inputs/UsingNamespaceTest.pdb | Bin 0 -> 102400 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)

diff --git a/test/tools/llvm-pdbdump/Inputs/UsingNamespaceTest.pdb b/test/tools/llvm-pdbdump/Inputs/UsingNamespaceTest.pdb
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..ce5211e3fc8dca5ced3ac638943aac0aff735046 100644
GIT binary patch
literal 102400
zcmeaxOfJeV&QB{*aMpL$)>iNhc2h9dGce%gl5z=VU|?VnU|?WkU|^7BU|?tf@fjEx
z#G&*kH5vk=Aut*OqaiRF0;3@?8UmvsFd71*Aut*OqaiRF0;3@?G(zCOpZ}v^Gz3ON
zU^E0qLtr!nMnhmU1V%$(Gz3ONU^E0qLtr!nMnhnjhkyk$!!Y;ZsP{%gU^E0qLtr!n
zMnhmU1V%$(Gz3ONU^E0qLtr!nMnhmU1h`KbGFpI^DhNQ<`h%FDP+?(UVEF%^8N>w-
z5P%pUaTv|Uz$gS_^D%G;lz<o@EWp6P#lR#`z{bG9!@$VE$G|8A+B*P}XA)pwK&WM6
zU>0Ct&|_d0ViE<}k(XGKS(eK1`O`-bpOGOsKbHZ*W@5-q%u7s9O=0--v50|Pl$n8p
z31l9t00TpOd|7gQVo6C+W>RTMYO$47YFTPtNqljBX;E^jBTU@Ns-&_YH9nO=&)zZI
z1j2T}s?q_?KnDkgk5X_q78j(ZrZ7O<SDaau%J8v@fnAgd>K=9y+>>8Wl9`)X1$PsI
zPrSRb7+6Ia7#cvq3krECM)LrU5CMfmD6t{%`O`;GoN@{<Fz~Q2GH@|43NbP;Ft7<Q
zFi0_Q3V>n{B*z6xKTM1a+#s3(n;as22{Et<f%NeRFff2<UI7M%ItFGT0a1v#kT_!a
z2;y@HFfbs(hv6f?C=&yp00RS7=QDiz$ScYWb{hi&$P@-JA7lm#+!TnI0RsaA2SZ9~
za(+=_Nh-rfP<XO42r=<P-46;-9wJj1sp07aPgAK*`T04iiFpjrbf4;(SCX2ZS_I)U
zF{HZX<R_NE@&z+Ps!M)pQcfzvr;n2u*hN{OX-^Pa+RI4IDM&3s$sT2f@rgMlpm4#T
zIk2d6z%UL=X6Yg>O=1{jWrf8?aG+ulC(0#|G%JjjW`&T_EJ#iS6xU3Q45A>Kfq}se
zng&7YK%o?$%J5N8l$k+HfPn#vOBg_D4Wt&9)<FJeW)NZ$hnfdULoAqSh%WyB!~`o_
zQyudvLyHsBQ=w&7s&l?`PGWH}LX3$a)!E+{l=O<y5|dLQ640`kjRBk{kt87HG6zGd
zZ)$E*Y7s;=IQ<HM#7Z*qQ;@_Iko5beA|yfOr~`@|!Z46r0|U&R$kssYW?)3fAxwnI
zF{irc<R>NOAX^6Y3j+&O&Ly!V5y_1V9H~el0yRef;qQRNqQu<P5){8O7^FgMK$8Ty
zoxvg1)5XU(KczGW8ki6>nHm@xQXTX1^D1-mOOe8#rGbHgks&ptvLH1jH4Wl0kU1c6
zCWh3|yv+POh$zGykT?rNs%u_pE<_mWKae;FL#j(^T4r8qNMbrf5}`+cAvL(9s5H3*
zA_0ywkRAnwRG<9h#2kn)LcIe+YCutbL26M6Gyx&m)4-7G3aV|<^)N7|hNR{e<Rq3r
zy^f@ZfiV?1P~i46Fr|9t<fNv<TmbVs152u7ab;d|Mp1qqJiy@Q{jaxY0HtkD7axct
zK6Wv%i^@UEL`fpc#8ObsKq?Nwd{|=v%5ebGa79FwKy_r7g{YQ+i$PQ&qU;CN+0tm`
zUnx?(43d)pl~YWN46-1afq}saTK-81Fff3vj8A0%m49+DK0F8@btuR@28Oc4oYGVV
zkbgmCpFGqo;>$j;O$d|q>>Wc<g^2R67qm`gWJoMXEGo%O%_{+ycT5b$nR)3s@U|s0
zLt0U4Dx`j8VaQ7@Eh$ROfwnbS8Itl#G8jIVF|doWLer5VwscgIQIwjPl9`tdcea%k
zTnv_YkOUmyY`6w2)#t|~;?fMJVc<l9DU8hyP`Xe?OBYH=^&&`41r&!&j0~zE8Y#^n
z<S^TEYS6Zvx&Q+MsC@%!yMfGwyDC1F0n~QW5MW?{i@`$!lAb{7Vd)7J7Ay=xOqx(P
z;7Ctci~=PfOkr$cfz<y<ElVu|rx!+s#GIV`a&UhGR1PsQB<7SSRu+TWV95OsZEWcx
zF+IH~H9fH;F)4>Y{{yQ^Y;FX#ONnUbV>J()EU?O8u?gDF*F{SkI!I{)B&P?84<<$i
zeGrY5HV|@{X~O`THVl!{2FP4k8w->!j06}MusRjmrU$8qwX;BB0!kakP&eR68`zBl
zB@e7J*uw-=77)?rEJ{sI%`8hTf~ROay-p0ZSZw~7M_gF|=>}n#2#$Fia;UbE+yCcc
zU=;&}yI(NFM?MBdG0-^yi~<Y{Y5DmKLLfc^0|Scy14C|NW*#W-2r;mW2?&7NISdRX
zEFk*RN068VR7`~dA*KKo1NG4u86fJkpkhlI5$a5!V)9G~F?*<38<KgxP_YM4y=G9i
zyI94P78I8hr6%Ua<m8m)#+0NMm&BCh=jRlI`ML!uNhzhd1u>p^1*Ij$3}sM#&Q>u2
zMfvGPiMa}HnK`M&3K|s_W|}dGqZGn2i%S!86oN}iQ!?{oj0_CTW1RDIb4&9wODbc+
zoMS@3R`>>oImZ~9=oy;o8JU?InZ+b!=EZpA7neksnZ#6>nZzXL=%rSqGJI5m+N&26
z3aZTf5_40F3lfu4LsE-N^pXn-K;i2IRYRhmV#<;YO$_u3Qj!=xeUyWmp_{Cr8<nY`
z>mH=5pz9l=pc@jPpc?@m3lU>w0QK}iy?jvb9n^CN^~OJa1f_*hJQ@O{Aut*OqaiRF
z0;3@?8UmvsFd71*Aut*OqaiSOLtu)YA)`0f#%R!-?!TL}mMi9;blCh(?8v3ryg@Km
zgXBOM#s|@N0*%pyK*FPVGz3ONU^E0qLtr!nMnhmU1V%$(Gz3ONU^E0qLtr!nMnhl_
zh5%@tzaImmAjoN?&K+Z!jm}TX0<9yn5oAnYV7R~_%)rl(nZW?35mV$EL7EEwkah5x
zdFcwy`MCv|IjKbqLJTT`Xsg&6#28crF&52&)}S+}2qu=~Ft9PG2o{&5fN8&A(7plz
zuzysbBNS{P&oMACu!7bE2rw`xfM^B=zhEmX&~gO^IR?loUZ^r;$_Zo;69XHA76Zi0
zIZ!h}cOdXGFff2NL4ejdK-MZimOFsV(?B)_!3LWr#2_dLQV-h4;Ln&?l%5PSM^F&7
z4+FGM0xVVrx_5z%fsMgO5VR)!69dHEP`5L{`~-522-JL528ITZI*{2Q43Y!66C@2<
zXAfFq0b+A9Ff@SN&c(pc0CGP!14F|X1_lNR1_p)%(DEI|849Yp9iLpb-bmXN{yEgb
z4zyODfx(1<fx!o4z$hLKfzc2c4S~@R7!85Z5Eu=C(GVC7fzc2c4S~@R7;+&1>i-Kt
z`~O-BKACyhsYMD}pgy#XAUgx74-fMaAG9xz+K2b?^z#O(kYltJWJoSgVIXl3fdRC_
zon8y-b29U?!At5v>)lCoD7LltAY+nqQy3KVQ&N*k(-rjd@>5dN5=(PRax#+?^i%Un
ziYl$ZJ0KW@7<2?dBM_i52uR<Y5iZ6B?b|alFfvFnm<cj6@PO}KjDU<)B<Pi-R)EF|
zq!`QvVPc>#XJi1&sO08>_S>5YGJ*6NFfiO;VPJ4zU~teYN=Yn91j(5Tx-de-K>C;%
zU@{C0at!8zJy1E28kn3)RaI3=QaS_3P7WrB-V02iF$uj2h^Y{BLFR$%0@=yLpqGk}
zGZSP6xyONl!2)W&UI9cFWF|}uq@S4qCId3lOppbnF90&;;K0D(q*s)hlb;MyIf_R^
zU^E0qLtr!nMnhmU1V%$(Gz3ONU^E0qLtr!n$PEEde-t#QY{S670GfXWo$&{vLG#NX
zIncZ@XpDed^GEfKhQMeDjE2By2#kinXb6mkz-S1JhQMeDjE2By2#kinXb22^2yhI%
z6GmMy8UmvsFd71*Aut*OqaiRF0;3@?8UmvsFd71*Aut*Oqai@!5CE?rXJB9ett$te
z;Sbr+j%uj_149CA#W-l4Hv?z^J7|Fb_yz!kTF^Rh*h+5D`fku7bOz83butJwAQo61
zbWJyS{2_n=w6Gks7#$=H$Dp;}aLy=WGz3ONU^E0qLtr!nMnhmU1V%$(Gz3ONU^E0q
zLtr!nMniz?5CE<Bm*Uj_2)as<hlzo~6mm8mgD;dO!eG$$0ldlxD!{9IRB|wc0MB4>
z#HbraLtr!nMnhmU1V%$(Gz3ONU^E0qLtr!nMnhmU1jr5nIR-vK&=ftWkIu+opO&BR
z;1}%R5b5X<5d|_-2r3Vn@@Iq`Z_gmcpb9z`pWzC4i~xR&K7$ZcT!WE;0j3sY20je3
z7oR+)9C*b5rtql9fQ0~TYycFO2B0$wL4!#GpfvFR|A0Aa)Z)<)7!85Z5Eu=C(GVC7
zfzc2c4S~@R7!85Z5Eu;sln~Hif}FDs8Up~G_XoN&4o0J|4NwI4_d)$S2u3l8Ob)2e
zNSfBs{y%BqMSInw{eRkr@u&@>Aut*OqaiRF0;3@?8UmvsFd71*Aut*Oqai?P2!Q(d
z<(!6$png9)1AiX}LuLji0|SE$BLl-41_p)<(AfY;mVmbWr+~)+Qrcj`Aibb|ygK+E
zy8zG?@(g`FM?lIL7#R2%R)7>RFfcTL^}%#OXe!*om!FhH>aBy=F7pE!19217xeWYZ
z|1c!L_z+nhhBctU5(o!{1gS&5Ls55^-bauaND(%(KytXk4Q2)CE(Xwf2_Ur~b3hm*
z2fC{-o&j_(pMe9q+!96x1`!-`pga6PBPb3G1`Z&zI*db~gJ@O;Rt8?ML5vKbGyxI=
zoe9AS7Gr{ofzE$`h%v*(KxaKb#8}{BAUi<!27=6inFpeo7(fPsZaM_XFfgQ8#grBl
zmlUNY=EmgYl;y^hq!yRNl;r2<6odJ?1u01>rMU$$nRx}JCB-qN#hH2Od5O8H#RZAU
zsUXFA$pr-r4FBH$dk-?%17s`}5*6rbM^HF}j02eq!XWoUFgMtr5H?7~C>{-g(GVC7
zfzc2c4S~@R7!85Z5Eu=C(GVC7fx#aFuzu5WkW)$Rj1?56rlh82=B1`6Bx`FcB$kvE
zWhRxDq!ue^B<JTAWagw6DP-oAq!#5R=4eu_uLru05Y+Vrk264r6hPM$f_yNFM?+vV
z1V%$(Gz3ONU^E0qLtr!nMnhmU1V%$(Gz3ONU^E1<fYf3K2Zqm|K5{WI3782mFz~Q2
zGVn1l3W4T6O$8XhNBv7Na0-Ar*dTdx(B5t)CI$--&A`CG#lR>8ni~bF#Z<@0U<Fl&
zDaXJ7zHfkmfkBFaO$cPZr2qp1h_;5(HUbO`aSY5t0-_8IAoEKq3sU1#89wrhGBel;
zFfgQ+rRJ5y7o{erW|pNEIl@LC7<fgQ!Ri?pKsp#cg7~2G3s@L(5-amdOX5>f3sUn^
zQu9g}KISp7in1{XG1)=g2?`<(hWPlh<aoqzhLsgAtMu$0LvhGCpcw)Q=O_kdAtq5~
z21bUIoE(NvA3<y;hSZ8wD4Us~IJKxOGa1HbVJOWjE=WzzOv_A7fvRC*aE)+<vY8oz
zU4z0rok9FG26j<41`ehY(A>WO1Ew!>^HWN5QsDt*Wd#bK_*4cxd&h7S2-^Wo4HjQ~
zjALLGWngFkB@s|?K{1+lt*p>Y0@)IZCWggyXj*iFrbR9WMu7tGAP*#sx`4zP7#W-x
z7==J<DjWqE7+jGs$nT)I28n^#AR2^0Y>+qzgZLl}OOqhGoI&x=#LwUcqLI=y$b3w7
zj0_%7b(nIP>DnEdo;{(omjDAp5+fvCgUpA7JS1Iv3otOC1rj(tgVZ6XXOR0j7(fdl
zAn6;#7hot$Eh^5;&tv!mPV*o!35L?r%oK<kP#7pM<RruRps-M3NY2kIDar?>MNmEg
z>Cs@wNX-E)%E-)12bB#VagbgEByo_g)QS?Y+dz6OkmNuq8>9y$?f@1~%gjkdxWNM~
z1~UQTR*+kK7&0<bQd09^c7nuw8H!SiONug+OHxx{;vjPa7?MkiOY(D(#Uemz^Gl18
zQ{%w_14^gD3_?sk(6|C6)C7XLg7|a_GH(=*hQMeDjE2By2#kinXb6mkz-S1JhQLq?
z0q#?VjG#5<puV`D00RSyC<6m8!~g%FN%sH$LA`MX2GII$5C*O3hAlQ{V_*~l`IC=<
zLjcq_0I@-4fW{FD*ccdi7!czKAbBPM&`toDS|$c&0R{#=XkVCtAuq8cvn&<VmIw8R
z85xrEbD{lPCWhR^yu@^H-?xZ?U6dKxzh%YNzlHR~i@{w0{JmVPDzWrNKT5&fSX_{r
zn!*5aUvXwtD#OPr26j;<sC(E+a8G_gNoH<l6=*C2-1A289f)>U7IA%Wtmc7xI9O${
zgaEWZ&I#&wvoONPd)Nd(<2I0SA&?vwsE@_O$iNMv!DBk;a)^Gh5OlnU2ihOz6<}bf
zgO2ln%!T!bLE}Cg0t^g@@B#IanHc!sV?St74ej@W%s}odgUn!on*!;Fg8G&m3@NF}
z`9+B(sSF=M;mOV*#KaGEKPWx(VD=TUh8n5i=>$(xsZRO%IjM<x4B&AfMut?+ypq)P
z)FKF<i6PZ3CqJ<SmM@qYQeE;(lX5^~Lz5WTMOmO}PY_$$19jn3i%<r!$_(QZb4oy=
zi$8N<QRjeR9G0ZgMO>Q1Fv`jbi;dtw#Uf6WOCV!K!f0t$2r12i<U~Mm&BVwc3ZfYp
z80?^F5Tp(iN}#buK~ZJ~F#!e!EG_|$eSp-$(i+JB%nU+I;!yKIX@~_g4bjE_pO_%y
zj0_B^j(L@##fj;u(6TGlIo~-avA7r^#>9~7?C%RodPQl8$*B;D&ycZ6Hb|O8l7NVF
zFr@mX<|d^UK~#g&uK-A_BqKisNlXDrzi%o+5>$>lpvWN%1IaZoz}$&!4a9B+MuZ%~
zM5r8ds(Vg;QeqCWWl+B`ut4Qp5=#=1+{nO@iWDMHa|FP75$x}P#G=I9)DjfGG8m*n
zY(SF)xt+lw)ziht7d+Ssu?Au$Qv*Xos$*V$US)26DN^{eG=R?2Ne!ti04=A3=m41m
z5@%va4b98U&x43U%mInBFr>QXmF7Z(q5cDjb1<a3fS2TjB&I_o5qbm|QiDs1N|Q?<
z65u!k=}}-v^~q08%z+3a)H^Vw1{CENq!yJx6A+R;4GgKSpy79PJq(PgA*s0qIf*5x
z=?G*G17j+3pup{CU`qAO$w^Izxd7&O29{LE;>x__jH3KJc!0s(@W0+3oVGn(d?1ea
z*u}suDhDkSC5bE(OF=mUsW=4lVMF3jjsuv6D<Y}{sw2BBM70cD45A7VWk0ALCXH79
zl_J&4AUPROImN`tAPb@y7#OUe<)4%Q0|VH~_*4c^`6mbC!-D`)hl0#wU?@w>DNSVn
z`4?37$wSQ|zU%|rgfJO2qk<|#lz+XTbt)r6VnJe2NiJw|2b7MO7(g2abKq@DW`?w)
z)Kp0Q%EFMBT3S++2panasbytI$}h=a_*llkF3Ji`M~c|eQAtKoYGO)eUOL>_R#tE^
zSmHqvaDcPn8n9HKACrhnGnj^f6A7j;HakG+LK!VxC?VC0AUPFK95OL7sDfyuG=q@C
zY|E)Z+j8mx3=E+54XEt~G8gWu_*4ddQ6>fr0R{%R7(6r}=?SDBmYzUi!NMTKqzQEc
zj`W1ZC{O~z6vh@7Nd1r0veY7I{g{}OlV1+*Z-CMS6GLK7d17TTs11hP|Io&kE)vtz
zi&E1QOA?a^&9Y%tiOr3mb}12aZCK3%Ckw1HSZso}^L5eEh7MBN0Lkfr;)98iK_5gT
zr457}X4(Lq2@dK>8X~0)kh!on7ARd92{15Vbt<$?4^j_nXMw^5ls1f^ZorW?up0+T
z9$00thY6@GAfnHSF$s-zZWYpdz)*|DX7Ic;5%G@0L~zXGkVCa?ik=~(H`m5!&>a51
zo3oZH=AU%f{7&r1rP;heN(>AP`aYR?*`9f6`3(A?IkjR2{o<lz{fyMal+>c6oc!c$
z@Qghhc!n0#BLu|-0|V%K050$>Dgy&Z4kQXXRS-nq2{dLL?f;MV{}~uY`~Rc;f6#8B
z(f&WS(FIUNg+-jG*6^o~qy2xR!3JobakT$G+W#*=8xuwCYFb%=`LHe=lr!4@hmHrJ
z_18xG|L_1A?f;MV|E;XB_A9aW1hJ|_?U{jBW`pOYi0B7mH4oei#43ZurqTXC+5%?C
zdhOxf|3{wx7Z6}z5MW|p;76PP2OSg#!l3zo@cC-so;@q*C_o0kV1|!;42)u+xqn6h
z28Oi!d<G#9pMima1+<nqF%!I=SBQaK3}ilNoS}pTG~e;*BS=gFd`}z$g9>;A0V1XV
z6$8!DGBQBKw4h>3!FLNn#7v-K@=OTx?4e?y!wF$}eW79xpnA<1AZ{n|m^g+qs6J<_
zn1G`E^rFOE1-Hx`&}j!17G|0;zM08I`NjEZB?@7g#ifZk3c)3%DVg~(Mh1rFG0yq9
zxutoTC6zH@&M_h2<L!Kd!<=IbP4o;+^^DBSjm%<_GV@|Q@{3C%%uHe`%uHgEbM!zP
z13oH2?bX9}Y$53MyU(9KIziQt=%<*nWJ41JJ<w@)pFYY#&CpF&(2dGe&~*>eRnYYf
zQP2$uP|%HF0Ih`(V`Tu%^n+&kLA`g-%spr}{?kX$$j&Gp4S~@R7!85Z5Eu=C(GVC7
zfzc2c4S~@R7!85Z5E#56(4xZ3@Uo7HL9m0F!J?Fb!Mlr@;r%~mhME3M3`+%>80N|`
zGB8v!GBkZ>W>7X_VEEb0%y8O~nc=n;6T>7`28IWVnHi+LnHdyynHbE@GBDf}V`Nyx
zz`!8Bh=IZLBQwK&2WE!cZUzR2xl9Zd(##BR8JQTW_A@YOyk=yu@@8h((89#9_$32F
zzAiJvPkROi?d6ON0j>~pD`zt^>`7*32w%p?@Z}OC!wVZmhJUXZ7#0RFGf3}YX3*+l
zW~g1qz!0~QfuX~Zfx+PjBg6F~Muz9C%nUnrGcdeb#>7x`n~|Z>fsuhZor%Hu6f?u_
z(+ms(o0%DIOEEG0b75joG-YBqe~^)37ZWqXKW+ww7hQ}D|86lc+&jd`u-KQ8A$$@u
zLr4}A!;*&#41dcQ7_?3?F??xYgr8k7cw=_dWuqZ58UmvsFd71*Aut*OqaiRF0;3@?
z8UmvsFd71*A%F;hV;Rg0`?8oAqWYK_q-7Wwlw}wgxMdg^1Z5Z)<YgEb)MOYKB4ijC
zPEKNEIPSr~AaRzNK{T3?p>IC}gXn$4UJrzcqpZ;o7!85Z5Eu=C(GVC7fzc2c4S~@R
z7!85Z5Eu=C(GVC7fx#96pd%DC7#JAJ7#J9AL>U;Im>C#6peMVZW?*2@VParN0AZ-Z
z7cej|Y+zzw0G$J9!o<J;+7AHQ7XUg85VY$bWF8*_1A{OFXdeLs12+TcJVDTTfS|hq
z7#P?YAa~P)P6ZH!nmJ01hQMeDjE2By2#kinXb6mkz-S1JhQMeDjE2By2#kinXb8|Y
z1i+j4X=}x(>7yYq8UmvsFd71*Aut*OqaiRF0;3@?8UmvsFd71*Aut*OqhK@yMnhmU
m1V%$(Gz3ONU^E0qLtr!nMnhmU1V%$(Gz3ONU^E1190C9z*kWG*

literal 0
HcmV?d00001

-- 
GitLab


From bd934ffaabd8b706cf6d7bc52bd67bab81f7e5d5 Mon Sep 17 00:00:00 2001
From: Alexander Shaposhnikov <shal1t712@gmail.com>
Date: Thu, 11 Oct 2018 22:33:50 +0000
Subject: [PATCH 0079/1116] [llvm-objcopy] Factor out CopyConfig

In this diff we move out CopyConfig from llvm-oobjcopy.cpp into a separate header CopyConfig.h
to enable us (in the future) reuse this class in the other implementations of objcopy (for coff, mach-o).
Additionally this enables us to unload the complexity from llvm-objcopy.cpp a little bit.

Test plan: make check-all

Differential revision: https://reviews.llvm.org/D53006


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344307 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-objcopy/CMakeLists.txt   |   1 +
 tools/llvm-objcopy/CopyConfig.cpp   | 424 +++++++++++++++++++++++++
 tools/llvm-objcopy/CopyConfig.h     | 113 +++++++
 tools/llvm-objcopy/Object.h         |  10 +-
 tools/llvm-objcopy/llvm-objcopy.cpp | 464 +---------------------------
 5 files changed, 541 insertions(+), 471 deletions(-)
 create mode 100644 tools/llvm-objcopy/CopyConfig.cpp
 create mode 100644 tools/llvm-objcopy/CopyConfig.h

diff --git a/tools/llvm-objcopy/CMakeLists.txt b/tools/llvm-objcopy/CMakeLists.txt
index b0cd66be5b3..8d963e56758 100644
--- a/tools/llvm-objcopy/CMakeLists.txt
+++ b/tools/llvm-objcopy/CMakeLists.txt
@@ -14,6 +14,7 @@ tablegen(LLVM StripOpts.inc -gen-opt-parser-defs)
 add_public_tablegen_target(StripOptsTableGen)
 
 add_llvm_tool(llvm-objcopy
+  CopyConfig.cpp
   llvm-objcopy.cpp
   Object.cpp
   DEPENDS
diff --git a/tools/llvm-objcopy/CopyConfig.cpp b/tools/llvm-objcopy/CopyConfig.cpp
new file mode 100644
index 00000000000..d814df10525
--- /dev/null
+++ b/tools/llvm-objcopy/CopyConfig.cpp
@@ -0,0 +1,424 @@
+//===- CopyConfig.cpp -----------------------------------------------------===//
+//
+//                      The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CopyConfig.h"
+#include "llvm-objcopy.h"
+
+#include "llvm/ADT/BitmaskEnum.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Object/ELFTypes.h"
+#include "llvm/Option/Arg.h"
+#include "llvm/Option/ArgList.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compression.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include <memory>
+#include <string>
+
+namespace llvm {
+namespace objcopy {
+
+namespace {
+enum ObjcopyID {
+  OBJCOPY_INVALID = 0, // This is not an option ID.
+#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM,  \
+               HELPTEXT, METAVAR, VALUES)                                      \
+  OBJCOPY_##ID,
+#include "ObjcopyOpts.inc"
+#undef OPTION
+};
+
+#define PREFIX(NAME, VALUE) const char *const OBJCOPY_##NAME[] = VALUE;
+#include "ObjcopyOpts.inc"
+#undef PREFIX
+
+static const opt::OptTable::Info ObjcopyInfoTable[] = {
+#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM,  \
+               HELPTEXT, METAVAR, VALUES)                                      \
+  {OBJCOPY_##PREFIX,                                                           \
+   NAME,                                                                       \
+   HELPTEXT,                                                                   \
+   METAVAR,                                                                    \
+   OBJCOPY_##ID,                                                               \
+   opt::Option::KIND##Class,                                                   \
+   PARAM,                                                                      \
+   FLAGS,                                                                      \
+   OBJCOPY_##GROUP,                                                            \
+   OBJCOPY_##ALIAS,                                                            \
+   ALIASARGS,                                                                  \
+   VALUES},
+#include "ObjcopyOpts.inc"
+#undef OPTION
+};
+
+class ObjcopyOptTable : public opt::OptTable {
+public:
+  ObjcopyOptTable() : OptTable(ObjcopyInfoTable, true) {}
+};
+
+enum StripID {
+  STRIP_INVALID = 0, // This is not an option ID.
+#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM,  \
+               HELPTEXT, METAVAR, VALUES)                                      \
+  STRIP_##ID,
+#include "StripOpts.inc"
+#undef OPTION
+};
+
+#define PREFIX(NAME, VALUE) const char *const STRIP_##NAME[] = VALUE;
+#include "StripOpts.inc"
+#undef PREFIX
+
+static const opt::OptTable::Info StripInfoTable[] = {
+#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM,  \
+               HELPTEXT, METAVAR, VALUES)                                      \
+  {STRIP_##PREFIX, NAME,       HELPTEXT,                                       \
+   METAVAR,        STRIP_##ID, opt::Option::KIND##Class,                       \
+   PARAM,          FLAGS,      STRIP_##GROUP,                                  \
+   STRIP_##ALIAS,  ALIASARGS,  VALUES},
+#include "StripOpts.inc"
+#undef OPTION
+};
+
+class StripOptTable : public opt::OptTable {
+public:
+  StripOptTable() : OptTable(StripInfoTable, true) {}
+};
+
+enum SectionFlag {
+  SecNone = 0,
+  SecAlloc = 1 << 0,
+  SecLoad = 1 << 1,
+  SecNoload = 1 << 2,
+  SecReadonly = 1 << 3,
+  SecDebug = 1 << 4,
+  SecCode = 1 << 5,
+  SecData = 1 << 6,
+  SecRom = 1 << 7,
+  SecMerge = 1 << 8,
+  SecStrings = 1 << 9,
+  SecContents = 1 << 10,
+  SecShare = 1 << 11,
+  LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue = */ SecShare)
+};
+
+} // namespace
+
+static SectionFlag parseSectionRenameFlag(StringRef SectionName) {
+  return llvm::StringSwitch<SectionFlag>(SectionName)
+      .Case("alloc", SectionFlag::SecAlloc)
+      .Case("load", SectionFlag::SecLoad)
+      .Case("noload", SectionFlag::SecNoload)
+      .Case("readonly", SectionFlag::SecReadonly)
+      .Case("debug", SectionFlag::SecDebug)
+      .Case("code", SectionFlag::SecCode)
+      .Case("data", SectionFlag::SecData)
+      .Case("rom", SectionFlag::SecRom)
+      .Case("merge", SectionFlag::SecMerge)
+      .Case("strings", SectionFlag::SecStrings)
+      .Case("contents", SectionFlag::SecContents)
+      .Case("share", SectionFlag::SecShare)
+      .Default(SectionFlag::SecNone);
+}
+
+static SectionRename parseRenameSectionValue(StringRef FlagValue) {
+  if (!FlagValue.contains('='))
+    error("Bad format for --rename-section: missing '='");
+
+  // Initial split: ".foo" = ".bar,f1,f2,..."
+  auto Old2New = FlagValue.split('=');
+  SectionRename SR;
+  SR.OriginalName = Old2New.first;
+
+  // Flags split: ".bar" "f1" "f2" ...
+  SmallVector<StringRef, 6> NameAndFlags;
+  Old2New.second.split(NameAndFlags, ',');
+  SR.NewName = NameAndFlags[0];
+
+  if (NameAndFlags.size() > 1) {
+    SectionFlag Flags = SectionFlag::SecNone;
+    for (size_t I = 1, Size = NameAndFlags.size(); I < Size; ++I) {
+      SectionFlag Flag = parseSectionRenameFlag(NameAndFlags[I]);
+      if (Flag == SectionFlag::SecNone)
+        error("Unrecognized section flag '" + NameAndFlags[I] +
+              "'. Flags supported for GNU compatibility: alloc, load, noload, "
+              "readonly, debug, code, data, rom, share, contents, merge, "
+              "strings.");
+      Flags |= Flag;
+    }
+
+    SR.NewFlags = 0;
+    if (Flags & SectionFlag::SecAlloc)
+      *SR.NewFlags |= ELF::SHF_ALLOC;
+    if (!(Flags & SectionFlag::SecReadonly))
+      *SR.NewFlags |= ELF::SHF_WRITE;
+    if (Flags & SectionFlag::SecCode)
+      *SR.NewFlags |= ELF::SHF_EXECINSTR;
+    if (Flags & SectionFlag::SecMerge)
+      *SR.NewFlags |= ELF::SHF_MERGE;
+    if (Flags & SectionFlag::SecStrings)
+      *SR.NewFlags |= ELF::SHF_STRINGS;
+  }
+
+  return SR;
+}
+
+static const StringMap<MachineInfo> ArchMap{
+    // Name, {EMachine, 64bit, LittleEndian}
+    {"aarch64", {ELF::EM_AARCH64, true, true}},
+    {"arm", {ELF::EM_ARM, false, true}},
+    {"i386", {ELF::EM_386, false, true}},
+    {"i386:x86-64", {ELF::EM_X86_64, true, true}},
+    {"powerpc:common64", {ELF::EM_PPC64, true, true}},
+    {"sparc", {ELF::EM_SPARC, false, true}},
+    {"x86-64", {ELF::EM_X86_64, true, true}},
+};
+
+static const MachineInfo &getMachineInfo(StringRef Arch) {
+  auto Iter = ArchMap.find(Arch);
+  if (Iter == std::end(ArchMap))
+    error("Invalid architecture: '" + Arch + "'");
+  return Iter->getValue();
+}
+
+static void addGlobalSymbolsFromFile(std::vector<std::string> &Symbols,
+                                     StringRef Filename) {
+  SmallVector<StringRef, 16> Lines;
+  auto BufOrErr = MemoryBuffer::getFile(Filename);
+  if (!BufOrErr)
+    reportError(Filename, BufOrErr.getError());
+
+  BufOrErr.get()->getBuffer().split(Lines, '\n');
+  for (StringRef Line : Lines) {
+    // Ignore everything after '#', trim whitespace, and only add the symbol if
+    // it's not empty.
+    auto TrimmedLine = Line.split('#').first.trim();
+    if (!TrimmedLine.empty())
+      Symbols.push_back(TrimmedLine.str());
+  }
+}
+
+// ParseObjcopyOptions returns the config and sets the input arguments. If a
+// help flag is set then ParseObjcopyOptions will print the help messege and
+// exit.
+DriverConfig parseObjcopyOptions(ArrayRef<const char *> ArgsArr) {
+  ObjcopyOptTable T;
+  unsigned MissingArgumentIndex, MissingArgumentCount;
+  llvm::opt::InputArgList InputArgs =
+      T.ParseArgs(ArgsArr, MissingArgumentIndex, MissingArgumentCount);
+
+  if (InputArgs.size() == 0) {
+    T.PrintHelp(errs(), "llvm-objcopy input [output]", "objcopy tool");
+    exit(1);
+  }
+
+  if (InputArgs.hasArg(OBJCOPY_help)) {
+    T.PrintHelp(outs(), "llvm-objcopy input [output]", "objcopy tool");
+    exit(0);
+  }
+
+  if (InputArgs.hasArg(OBJCOPY_version)) {
+    cl::PrintVersionMessage();
+    exit(0);
+  }
+
+  SmallVector<const char *, 2> Positional;
+
+  for (auto Arg : InputArgs.filtered(OBJCOPY_UNKNOWN))
+    error("unknown argument '" + Arg->getAsString(InputArgs) + "'");
+
+  for (auto Arg : InputArgs.filtered(OBJCOPY_INPUT))
+    Positional.push_back(Arg->getValue());
+
+  if (Positional.empty())
+    error("No input file specified");
+
+  if (Positional.size() > 2)
+    error("Too many positional arguments");
+
+  CopyConfig Config;
+  Config.InputFilename = Positional[0];
+  Config.OutputFilename = Positional[Positional.size() == 1 ? 0 : 1];
+  Config.InputFormat = InputArgs.getLastArgValue(OBJCOPY_input_target);
+  Config.OutputFormat = InputArgs.getLastArgValue(OBJCOPY_output_target);
+  if (Config.InputFormat == "binary") {
+    auto BinaryArch = InputArgs.getLastArgValue(OBJCOPY_binary_architecture);
+    if (BinaryArch.empty())
+      error("Specified binary input without specifiying an architecture");
+    Config.BinaryArch = getMachineInfo(BinaryArch);
+  }
+
+  if (auto Arg = InputArgs.getLastArg(OBJCOPY_compress_debug_sections,
+                                      OBJCOPY_compress_debug_sections_eq)) {
+    Config.CompressionType = DebugCompressionType::Z;
+
+    if (Arg->getOption().getID() == OBJCOPY_compress_debug_sections_eq) {
+      Config.CompressionType =
+          StringSwitch<DebugCompressionType>(
+              InputArgs.getLastArgValue(OBJCOPY_compress_debug_sections_eq))
+              .Case("zlib-gnu", DebugCompressionType::GNU)
+              .Case("zlib", DebugCompressionType::Z)
+              .Default(DebugCompressionType::None);
+      if (Config.CompressionType == DebugCompressionType::None)
+        error("Invalid or unsupported --compress-debug-sections format: " +
+              InputArgs.getLastArgValue(OBJCOPY_compress_debug_sections_eq));
+      if (!zlib::isAvailable())
+        error("LLVM was not compiled with LLVM_ENABLE_ZLIB: can not compress.");
+    }
+  }
+
+  Config.SplitDWO = InputArgs.getLastArgValue(OBJCOPY_split_dwo);
+  Config.AddGnuDebugLink = InputArgs.getLastArgValue(OBJCOPY_add_gnu_debuglink);
+  Config.SymbolsPrefix = InputArgs.getLastArgValue(OBJCOPY_prefix_symbols);
+
+  for (auto Arg : InputArgs.filtered(OBJCOPY_redefine_symbol)) {
+    if (!StringRef(Arg->getValue()).contains('='))
+      error("Bad format for --redefine-sym");
+    auto Old2New = StringRef(Arg->getValue()).split('=');
+    if (!Config.SymbolsToRename.insert(Old2New).second)
+      error("Multiple redefinition of symbol " + Old2New.first);
+  }
+
+  for (auto Arg : InputArgs.filtered(OBJCOPY_rename_section)) {
+    SectionRename SR = parseRenameSectionValue(StringRef(Arg->getValue()));
+    if (!Config.SectionsToRename.try_emplace(SR.OriginalName, SR).second)
+      error("Multiple renames of section " + SR.OriginalName);
+  }
+
+  for (auto Arg : InputArgs.filtered(OBJCOPY_remove_section))
+    Config.ToRemove.push_back(Arg->getValue());
+  for (auto Arg : InputArgs.filtered(OBJCOPY_keep))
+    Config.Keep.push_back(Arg->getValue());
+  for (auto Arg : InputArgs.filtered(OBJCOPY_only_keep))
+    Config.OnlyKeep.push_back(Arg->getValue());
+  for (auto Arg : InputArgs.filtered(OBJCOPY_add_section))
+    Config.AddSection.push_back(Arg->getValue());
+  for (auto Arg : InputArgs.filtered(OBJCOPY_dump_section))
+    Config.DumpSection.push_back(Arg->getValue());
+  Config.StripAll = InputArgs.hasArg(OBJCOPY_strip_all);
+  Config.StripAllGNU = InputArgs.hasArg(OBJCOPY_strip_all_gnu);
+  Config.StripDebug = InputArgs.hasArg(OBJCOPY_strip_debug);
+  Config.StripDWO = InputArgs.hasArg(OBJCOPY_strip_dwo);
+  Config.StripSections = InputArgs.hasArg(OBJCOPY_strip_sections);
+  Config.StripNonAlloc = InputArgs.hasArg(OBJCOPY_strip_non_alloc);
+  Config.StripUnneeded = InputArgs.hasArg(OBJCOPY_strip_unneeded);
+  Config.ExtractDWO = InputArgs.hasArg(OBJCOPY_extract_dwo);
+  Config.LocalizeHidden = InputArgs.hasArg(OBJCOPY_localize_hidden);
+  Config.Weaken = InputArgs.hasArg(OBJCOPY_weaken);
+  Config.DiscardAll = InputArgs.hasArg(OBJCOPY_discard_all);
+  Config.OnlyKeepDebug = InputArgs.hasArg(OBJCOPY_only_keep_debug);
+  Config.KeepFileSymbols = InputArgs.hasArg(OBJCOPY_keep_file_symbols);
+  Config.DecompressDebugSections =
+      InputArgs.hasArg(OBJCOPY_decompress_debug_sections);
+  for (auto Arg : InputArgs.filtered(OBJCOPY_localize_symbol))
+    Config.SymbolsToLocalize.push_back(Arg->getValue());
+  for (auto Arg : InputArgs.filtered(OBJCOPY_keep_global_symbol))
+    Config.SymbolsToKeepGlobal.push_back(Arg->getValue());
+  for (auto Arg : InputArgs.filtered(OBJCOPY_keep_global_symbols))
+    addGlobalSymbolsFromFile(Config.SymbolsToKeepGlobal, Arg->getValue());
+  for (auto Arg : InputArgs.filtered(OBJCOPY_globalize_symbol))
+    Config.SymbolsToGlobalize.push_back(Arg->getValue());
+  for (auto Arg : InputArgs.filtered(OBJCOPY_weaken_symbol))
+    Config.SymbolsToWeaken.push_back(Arg->getValue());
+  for (auto Arg : InputArgs.filtered(OBJCOPY_strip_symbol))
+    Config.SymbolsToRemove.push_back(Arg->getValue());
+  for (auto Arg : InputArgs.filtered(OBJCOPY_keep_symbol))
+    Config.SymbolsToKeep.push_back(Arg->getValue());
+
+  Config.PreserveDates = InputArgs.hasArg(OBJCOPY_preserve_dates);
+
+  DriverConfig DC;
+  DC.CopyConfigs.push_back(std::move(Config));
+  if (Config.DecompressDebugSections &&
+      Config.CompressionType != DebugCompressionType::None) {
+    error("Cannot specify --compress-debug-sections at the same time as "
+          "--decompress-debug-sections at the same time");
+  }
+
+  if (Config.DecompressDebugSections && !zlib::isAvailable())
+    error("LLVM was not compiled with LLVM_ENABLE_ZLIB: cannot decompress.");
+
+  return DC;
+}
+
+// ParseStripOptions returns the config and sets the input arguments. If a
+// help flag is set then ParseStripOptions will print the help messege and
+// exit.
+DriverConfig parseStripOptions(ArrayRef<const char *> ArgsArr) {
+  StripOptTable T;
+  unsigned MissingArgumentIndex, MissingArgumentCount;
+  llvm::opt::InputArgList InputArgs =
+      T.ParseArgs(ArgsArr, MissingArgumentIndex, MissingArgumentCount);
+
+  if (InputArgs.size() == 0) {
+    T.PrintHelp(errs(), "llvm-strip [options] file...", "strip tool");
+    exit(1);
+  }
+
+  if (InputArgs.hasArg(STRIP_help)) {
+    T.PrintHelp(outs(), "llvm-strip [options] file...", "strip tool");
+    exit(0);
+  }
+
+  if (InputArgs.hasArg(STRIP_version)) {
+    cl::PrintVersionMessage();
+    exit(0);
+  }
+
+  SmallVector<const char *, 2> Positional;
+  for (auto Arg : InputArgs.filtered(STRIP_UNKNOWN))
+    error("unknown argument '" + Arg->getAsString(InputArgs) + "'");
+  for (auto Arg : InputArgs.filtered(STRIP_INPUT))
+    Positional.push_back(Arg->getValue());
+
+  if (Positional.empty())
+    error("No input file specified");
+
+  if (Positional.size() > 1 && InputArgs.hasArg(STRIP_output))
+    error("Multiple input files cannot be used in combination with -o");
+
+  CopyConfig Config;
+  Config.StripDebug = InputArgs.hasArg(STRIP_strip_debug);
+
+  Config.DiscardAll = InputArgs.hasArg(STRIP_discard_all);
+  Config.StripUnneeded = InputArgs.hasArg(STRIP_strip_unneeded);
+  Config.StripAll = InputArgs.hasArg(STRIP_strip_all);
+
+  if (!Config.StripDebug && !Config.StripUnneeded && !Config.DiscardAll)
+    Config.StripAll = true;
+
+  for (auto Arg : InputArgs.filtered(STRIP_remove_section))
+    Config.ToRemove.push_back(Arg->getValue());
+
+  for (auto Arg : InputArgs.filtered(STRIP_keep_symbol))
+    Config.SymbolsToKeep.push_back(Arg->getValue());
+
+  Config.PreserveDates = InputArgs.hasArg(STRIP_preserve_dates);
+
+  DriverConfig DC;
+  if (Positional.size() == 1) {
+    Config.InputFilename = Positional[0];
+    Config.OutputFilename =
+        InputArgs.getLastArgValue(STRIP_output, Positional[0]);
+    DC.CopyConfigs.push_back(std::move(Config));
+  } else {
+    for (const char *Filename : Positional) {
+      Config.InputFilename = Filename;
+      Config.OutputFilename = Filename;
+      DC.CopyConfigs.push_back(Config);
+    }
+  }
+
+  return DC;
+}
+
+} // namespace objcopy
+} // namespace llvm
diff --git a/tools/llvm-objcopy/CopyConfig.h b/tools/llvm-objcopy/CopyConfig.h
new file mode 100644
index 00000000000..203432a11a6
--- /dev/null
+++ b/tools/llvm-objcopy/CopyConfig.h
@@ -0,0 +1,113 @@
+//===- CopyConfig.h -------------------------------------------------------===//
+//
+//                      The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_OBJCOPY_COPY_CONFIG_H
+#define LLVM_TOOLS_LLVM_OBJCOPY_COPY_CONFIG_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+// Necessary for llvm::DebugCompressionType::None
+#include "llvm/Target/TargetOptions.h"
+#include <string>
+#include <vector>
+
+namespace llvm {
+namespace objcopy {
+
+// This type keeps track of the machine info for various architectures. This
+// lets us map architecture names to ELF types and the e_machine value of the
+// ELF file.
+struct MachineInfo {
+  uint16_t EMachine;
+  bool Is64Bit;
+  bool IsLittleEndian;
+};
+
+struct SectionRename {
+  StringRef OriginalName;
+  StringRef NewName;
+  Optional<uint64_t> NewFlags;
+};
+
+// Configuration for copying/stripping a single file.
+struct CopyConfig {
+  // Main input/output options
+  StringRef InputFilename;
+  StringRef InputFormat;
+  StringRef OutputFilename;
+  StringRef OutputFormat;
+
+  // Only applicable for --input-format=Binary
+  MachineInfo BinaryArch;
+
+  // Advanced options
+  StringRef AddGnuDebugLink;
+  StringRef SplitDWO;
+  StringRef SymbolsPrefix;
+
+  // Repeated options
+  std::vector<StringRef> AddSection;
+  std::vector<StringRef> DumpSection;
+  std::vector<StringRef> Keep;
+  std::vector<StringRef> OnlyKeep;
+  std::vector<StringRef> SymbolsToGlobalize;
+  std::vector<StringRef> SymbolsToKeep;
+  std::vector<StringRef> SymbolsToLocalize;
+  std::vector<StringRef> SymbolsToRemove;
+  std::vector<StringRef> SymbolsToWeaken;
+  std::vector<StringRef> ToRemove;
+  std::vector<std::string> SymbolsToKeepGlobal;
+
+  // Map options
+  StringMap<SectionRename> SectionsToRename;
+  StringMap<StringRef> SymbolsToRename;
+
+  // Boolean options
+  bool DiscardAll = false;
+  bool ExtractDWO = false;
+  bool KeepFileSymbols = false;
+  bool LocalizeHidden = false;
+  bool OnlyKeepDebug = false;
+  bool PreserveDates = false;
+  bool StripAll = false;
+  bool StripAllGNU = false;
+  bool StripDWO = false;
+  bool StripDebug = false;
+  bool StripNonAlloc = false;
+  bool StripSections = false;
+  bool StripUnneeded = false;
+  bool Weaken = false;
+  bool DecompressDebugSections = false;
+  DebugCompressionType CompressionType = DebugCompressionType::None;
+};
+
+// Configuration for the overall invocation of this tool. When invoked as
+// objcopy, will always contain exactly one CopyConfig. When invoked as strip,
+// will contain one or more CopyConfigs.
+struct DriverConfig {
+  SmallVector<CopyConfig, 1> CopyConfigs;
+};
+
+// ParseObjcopyOptions returns the config and sets the input arguments. If a
+// help flag is set then ParseObjcopyOptions will print the help messege and
+// exit.
+DriverConfig parseObjcopyOptions(ArrayRef<const char *> ArgsArr);
+
+// ParseStripOptions returns the config and sets the input arguments. If a
+// help flag is set then ParseStripOptions will print the help messege and
+// exit.
+DriverConfig parseStripOptions(ArrayRef<const char *> ArgsArr);
+
+} // namespace objcopy
+} // namespace llvm
+
+#endif
diff --git a/tools/llvm-objcopy/Object.h b/tools/llvm-objcopy/Object.h
index 5fb03a5501e..46c8f1ca4bf 100644
--- a/tools/llvm-objcopy/Object.h
+++ b/tools/llvm-objcopy/Object.h
@@ -10,6 +10,7 @@
 #ifndef LLVM_TOOLS_OBJCOPY_OBJECT_H
 #define LLVM_TOOLS_OBJCOPY_OBJECT_H
 
+#include "CopyConfig.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
@@ -67,15 +68,6 @@ public:
 
 enum ElfType { ELFT_ELF32LE, ELFT_ELF64LE, ELFT_ELF32BE, ELFT_ELF64BE };
 
-// This type keeps track of the machine info for various architectures. This
-// lets us map architecture names to ELF types and the e_machine value of the
-// ELF file.
-struct MachineInfo {
-  uint16_t EMachine;
-  bool Is64Bit;
-  bool IsLittleEndian;
-};
-
 class SectionVisitor {
 public:
   virtual ~SectionVisitor();
diff --git a/tools/llvm-objcopy/llvm-objcopy.cpp b/tools/llvm-objcopy/llvm-objcopy.cpp
index 41c6ef3f3dc..c9b170d1d61 100644
--- a/tools/llvm-objcopy/llvm-objcopy.cpp
+++ b/tools/llvm-objcopy/llvm-objcopy.cpp
@@ -8,8 +8,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm-objcopy.h"
-
+#include "CopyConfig.h"
 #include "Object.h"
+
 #include "llvm/ADT/BitmaskEnum.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/STLExtras.h"
@@ -56,160 +57,8 @@ using namespace llvm::objcopy;
 using namespace object;
 using namespace ELF;
 
-namespace {
-
-enum ObjcopyID {
-  OBJCOPY_INVALID = 0, // This is not an option ID.
-#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM,  \
-               HELPTEXT, METAVAR, VALUES)                                      \
-  OBJCOPY_##ID,
-#include "ObjcopyOpts.inc"
-#undef OPTION
-};
-
-#define PREFIX(NAME, VALUE) const char *const OBJCOPY_##NAME[] = VALUE;
-#include "ObjcopyOpts.inc"
-#undef PREFIX
-
-static const opt::OptTable::Info ObjcopyInfoTable[] = {
-#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM,  \
-               HELPTEXT, METAVAR, VALUES)                                      \
-  {OBJCOPY_##PREFIX,                                                           \
-   NAME,                                                                       \
-   HELPTEXT,                                                                   \
-   METAVAR,                                                                    \
-   OBJCOPY_##ID,                                                               \
-   opt::Option::KIND##Class,                                                   \
-   PARAM,                                                                      \
-   FLAGS,                                                                      \
-   OBJCOPY_##GROUP,                                                            \
-   OBJCOPY_##ALIAS,                                                            \
-   ALIASARGS,                                                                  \
-   VALUES},
-#include "ObjcopyOpts.inc"
-#undef OPTION
-};
-
-class ObjcopyOptTable : public opt::OptTable {
-public:
-  ObjcopyOptTable() : OptTable(ObjcopyInfoTable, true) {}
-};
-
-enum StripID {
-  STRIP_INVALID = 0, // This is not an option ID.
-#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM,  \
-               HELPTEXT, METAVAR, VALUES)                                      \
-  STRIP_##ID,
-#include "StripOpts.inc"
-#undef OPTION
-};
-
-#define PREFIX(NAME, VALUE) const char *const STRIP_##NAME[] = VALUE;
-#include "StripOpts.inc"
-#undef PREFIX
-
-static const opt::OptTable::Info StripInfoTable[] = {
-#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM,  \
-               HELPTEXT, METAVAR, VALUES)                                      \
-  {STRIP_##PREFIX, NAME,       HELPTEXT,                                       \
-   METAVAR,        STRIP_##ID, opt::Option::KIND##Class,                       \
-   PARAM,          FLAGS,      STRIP_##GROUP,                                  \
-   STRIP_##ALIAS,  ALIASARGS,  VALUES},
-#include "StripOpts.inc"
-#undef OPTION
-};
-
-class StripOptTable : public opt::OptTable {
-public:
-  StripOptTable() : OptTable(StripInfoTable, true) {}
-};
-
-struct SectionRename {
-  StringRef OriginalName;
-  StringRef NewName;
-  Optional<uint64_t> NewFlags;
-};
-
-// Configuration for copying/stripping a single file.
-struct CopyConfig {
-  // Main input/output options
-  StringRef InputFilename;
-  StringRef InputFormat;
-  StringRef OutputFilename;
-  StringRef OutputFormat;
-
-  // Only applicable for --input-format=Binary
-  MachineInfo BinaryArch;
-
-  // Advanced options
-  StringRef AddGnuDebugLink;
-  StringRef SplitDWO;
-  StringRef SymbolsPrefix;
-
-  // Repeated options
-  std::vector<StringRef> AddSection;
-  std::vector<StringRef> DumpSection;
-  std::vector<StringRef> Keep;
-  std::vector<StringRef> OnlyKeep;
-  std::vector<StringRef> SymbolsToGlobalize;
-  std::vector<StringRef> SymbolsToKeep;
-  std::vector<StringRef> SymbolsToLocalize;
-  std::vector<StringRef> SymbolsToRemove;
-  std::vector<StringRef> SymbolsToWeaken;
-  std::vector<StringRef> ToRemove;
-  std::vector<std::string> SymbolsToKeepGlobal;
-
-  // Map options
-  StringMap<SectionRename> SectionsToRename;
-  StringMap<StringRef> SymbolsToRename;
-
-  // Boolean options
-  bool DiscardAll = false;
-  bool ExtractDWO = false;
-  bool KeepFileSymbols = false;
-  bool LocalizeHidden = false;
-  bool OnlyKeepDebug = false;
-  bool PreserveDates = false;
-  bool StripAll = false;
-  bool StripAllGNU = false;
-  bool StripDWO = false;
-  bool StripDebug = false;
-  bool StripNonAlloc = false;
-  bool StripSections = false;
-  bool StripUnneeded = false;
-  bool Weaken = false;
-  bool DecompressDebugSections = false;
-  DebugCompressionType CompressionType = DebugCompressionType::None;
-};
-
-// Configuration for the overall invocation of this tool. When invoked as
-// objcopy, will always contain exactly one CopyConfig. When invoked as strip,
-// will contain one or more CopyConfigs.
-struct DriverConfig {
-  SmallVector<CopyConfig, 1> CopyConfigs;
-};
-
 using SectionPred = std::function<bool(const SectionBase &Sec)>;
 
-enum SectionFlag {
-  SecNone = 0,
-  SecAlloc = 1 << 0,
-  SecLoad = 1 << 1,
-  SecNoload = 1 << 2,
-  SecReadonly = 1 << 3,
-  SecDebug = 1 << 4,
-  SecCode = 1 << 5,
-  SecData = 1 << 6,
-  SecRom = 1 << 7,
-  SecMerge = 1 << 8,
-  SecStrings = 1 << 9,
-  SecContents = 1 << 10,
-  SecShare = 1 << 11,
-  LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue = */ SecShare)
-};
-
-} // namespace
-
 namespace llvm {
 namespace objcopy {
 
@@ -242,65 +91,6 @@ LLVM_ATTRIBUTE_NORETURN void reportError(StringRef File, Error E) {
 } // end namespace objcopy
 } // end namespace llvm
 
-static SectionFlag parseSectionRenameFlag(StringRef SectionName) {
-  return llvm::StringSwitch<SectionFlag>(SectionName)
-      .Case("alloc", SectionFlag::SecAlloc)
-      .Case("load", SectionFlag::SecLoad)
-      .Case("noload", SectionFlag::SecNoload)
-      .Case("readonly", SectionFlag::SecReadonly)
-      .Case("debug", SectionFlag::SecDebug)
-      .Case("code", SectionFlag::SecCode)
-      .Case("data", SectionFlag::SecData)
-      .Case("rom", SectionFlag::SecRom)
-      .Case("merge", SectionFlag::SecMerge)
-      .Case("strings", SectionFlag::SecStrings)
-      .Case("contents", SectionFlag::SecContents)
-      .Case("share", SectionFlag::SecShare)
-      .Default(SectionFlag::SecNone);
-}
-
-static SectionRename parseRenameSectionValue(StringRef FlagValue) {
-  if (!FlagValue.contains('='))
-    error("Bad format for --rename-section: missing '='");
-
-  // Initial split: ".foo" = ".bar,f1,f2,..."
-  auto Old2New = FlagValue.split('=');
-  SectionRename SR;
-  SR.OriginalName = Old2New.first;
-
-  // Flags split: ".bar" "f1" "f2" ...
-  SmallVector<StringRef, 6> NameAndFlags;
-  Old2New.second.split(NameAndFlags, ',');
-  SR.NewName = NameAndFlags[0];
-
-  if (NameAndFlags.size() > 1) {
-    SectionFlag Flags = SectionFlag::SecNone;
-    for (size_t I = 1, Size = NameAndFlags.size(); I < Size; ++I) {
-      SectionFlag Flag = parseSectionRenameFlag(NameAndFlags[I]);
-      if (Flag == SectionFlag::SecNone)
-        error("Unrecognized section flag '" + NameAndFlags[I] +
-              "'. Flags supported for GNU compatibility: alloc, load, noload, "
-              "readonly, debug, code, data, rom, share, contents, merge, "
-              "strings.");
-      Flags |= Flag;
-    }
-
-    SR.NewFlags = 0;
-    if (Flags & SectionFlag::SecAlloc)
-      *SR.NewFlags |= ELF::SHF_ALLOC;
-    if (!(Flags & SectionFlag::SecReadonly))
-      *SR.NewFlags |= ELF::SHF_WRITE;
-    if (Flags & SectionFlag::SecCode)
-      *SR.NewFlags |= ELF::SHF_EXECINSTR;
-    if (Flags & SectionFlag::SecMerge)
-      *SR.NewFlags |= ELF::SHF_MERGE;
-    if (Flags & SectionFlag::SecStrings)
-      *SR.NewFlags |= ELF::SHF_STRINGS;
-  }
-
-  return SR;
-}
-
 static bool isDebugSection(const SectionBase &Sec) {
   return StringRef(Sec.Name).startswith(".debug") ||
          StringRef(Sec.Name).startswith(".zdebug") || Sec.Name == ".gdb_index";
@@ -319,24 +109,6 @@ static bool onlyKeepDWOPred(const Object &Obj, const SectionBase &Sec) {
   return !isDWOSection(Sec);
 }
 
-static const StringMap<MachineInfo> ArchMap{
-    // Name, {EMachine, 64bit, LittleEndian}
-    {"aarch64", {EM_AARCH64, true, true}},
-    {"arm", {EM_ARM, false, true}},
-    {"i386", {EM_386, false, true}},
-    {"i386:x86-64", {EM_X86_64, true, true}},
-    {"powerpc:common64", {EM_PPC64, true, true}},
-    {"sparc", {EM_SPARC, false, true}},
-    {"x86-64", {EM_X86_64, true, true}},
-};
-
-static const MachineInfo &getMachineInfo(StringRef Arch) {
-  auto Iter = ArchMap.find(Arch);
-  if (Iter == std::end(ArchMap))
-    error("Invalid architecture: '" + Arch + "'");
-  return Iter->getValue();
-}
-
 static ElfType getOutputElfType(const Binary &Bin) {
   // Infer output ELF type from the input ELF object
   if (isa<ELFObjectFile<ELF32LE>>(Bin))
@@ -870,238 +642,6 @@ static void executeElfObjcopy(const CopyConfig &Config) {
   }
 }
 
-static void addGlobalSymbolsFromFile(std::vector<std::string> &Symbols,
-                                     StringRef Filename) {
-  SmallVector<StringRef, 16> Lines;
-  auto BufOrErr = MemoryBuffer::getFile(Filename);
-  if (!BufOrErr)
-    reportError(Filename, BufOrErr.getError());
-
-  BufOrErr.get()->getBuffer().split(Lines, '\n');
-  for (StringRef Line : Lines) {
-    // Ignore everything after '#', trim whitespace, and only add the symbol if
-    // it's not empty.
-    auto TrimmedLine = Line.split('#').first.trim();
-    if (!TrimmedLine.empty())
-      Symbols.push_back(TrimmedLine.str());
-  }
-}
-
-// ParseObjcopyOptions returns the config and sets the input arguments. If a
-// help flag is set then ParseObjcopyOptions will print the help messege and
-// exit.
-static DriverConfig parseObjcopyOptions(ArrayRef<const char *> ArgsArr) {
-  ObjcopyOptTable T;
-  unsigned MissingArgumentIndex, MissingArgumentCount;
-  llvm::opt::InputArgList InputArgs =
-      T.ParseArgs(ArgsArr, MissingArgumentIndex, MissingArgumentCount);
-
-  if (InputArgs.size() == 0) {
-    T.PrintHelp(errs(), "llvm-objcopy input [output]", "objcopy tool");
-    exit(1);
-  }
-
-  if (InputArgs.hasArg(OBJCOPY_help)) {
-    T.PrintHelp(outs(), "llvm-objcopy input [output]", "objcopy tool");
-    exit(0);
-  }
-
-  if (InputArgs.hasArg(OBJCOPY_version)) {
-    cl::PrintVersionMessage();
-    exit(0);
-  }
-
-  SmallVector<const char *, 2> Positional;
-
-  for (auto Arg : InputArgs.filtered(OBJCOPY_UNKNOWN))
-    error("unknown argument '" + Arg->getAsString(InputArgs) + "'");
-
-  for (auto Arg : InputArgs.filtered(OBJCOPY_INPUT))
-    Positional.push_back(Arg->getValue());
-
-  if (Positional.empty())
-    error("No input file specified");
-
-  if (Positional.size() > 2)
-    error("Too many positional arguments");
-
-  CopyConfig Config;
-  Config.InputFilename = Positional[0];
-  Config.OutputFilename = Positional[Positional.size() == 1 ? 0 : 1];
-  Config.InputFormat = InputArgs.getLastArgValue(OBJCOPY_input_target);
-  Config.OutputFormat = InputArgs.getLastArgValue(OBJCOPY_output_target);
-  if (Config.InputFormat == "binary") {
-    auto BinaryArch = InputArgs.getLastArgValue(OBJCOPY_binary_architecture);
-    if (BinaryArch.empty())
-      error("Specified binary input without specifiying an architecture");
-    Config.BinaryArch = getMachineInfo(BinaryArch);
-  }
-
-  if (auto Arg = InputArgs.getLastArg(OBJCOPY_compress_debug_sections,
-                                      OBJCOPY_compress_debug_sections_eq)) {
-    Config.CompressionType = DebugCompressionType::Z;
-
-    if (Arg->getOption().getID() == OBJCOPY_compress_debug_sections_eq) {
-      Config.CompressionType =
-          StringSwitch<DebugCompressionType>(
-              InputArgs.getLastArgValue(OBJCOPY_compress_debug_sections_eq))
-              .Case("zlib-gnu", DebugCompressionType::GNU)
-              .Case("zlib", DebugCompressionType::Z)
-              .Default(DebugCompressionType::None);
-      if (Config.CompressionType == DebugCompressionType::None)
-        error("Invalid or unsupported --compress-debug-sections format: " +
-              InputArgs.getLastArgValue(OBJCOPY_compress_debug_sections_eq));
-      if (!zlib::isAvailable())
-        error("LLVM was not compiled with LLVM_ENABLE_ZLIB: can not compress.");
-    }
-  }
-
-  Config.SplitDWO = InputArgs.getLastArgValue(OBJCOPY_split_dwo);
-  Config.AddGnuDebugLink = InputArgs.getLastArgValue(OBJCOPY_add_gnu_debuglink);
-  Config.SymbolsPrefix = InputArgs.getLastArgValue(OBJCOPY_prefix_symbols);
-
-  for (auto Arg : InputArgs.filtered(OBJCOPY_redefine_symbol)) {
-    if (!StringRef(Arg->getValue()).contains('='))
-      error("Bad format for --redefine-sym");
-    auto Old2New = StringRef(Arg->getValue()).split('=');
-    if (!Config.SymbolsToRename.insert(Old2New).second)
-      error("Multiple redefinition of symbol " + Old2New.first);
-  }
-
-  for (auto Arg : InputArgs.filtered(OBJCOPY_rename_section)) {
-    SectionRename SR = parseRenameSectionValue(StringRef(Arg->getValue()));
-    if (!Config.SectionsToRename.try_emplace(SR.OriginalName, SR).second)
-      error("Multiple renames of section " + SR.OriginalName);
-  }
-
-  for (auto Arg : InputArgs.filtered(OBJCOPY_remove_section))
-    Config.ToRemove.push_back(Arg->getValue());
-  for (auto Arg : InputArgs.filtered(OBJCOPY_keep))
-    Config.Keep.push_back(Arg->getValue());
-  for (auto Arg : InputArgs.filtered(OBJCOPY_only_keep))
-    Config.OnlyKeep.push_back(Arg->getValue());
-  for (auto Arg : InputArgs.filtered(OBJCOPY_add_section))
-    Config.AddSection.push_back(Arg->getValue());
-  for (auto Arg : InputArgs.filtered(OBJCOPY_dump_section))
-    Config.DumpSection.push_back(Arg->getValue());
-  Config.StripAll = InputArgs.hasArg(OBJCOPY_strip_all);
-  Config.StripAllGNU = InputArgs.hasArg(OBJCOPY_strip_all_gnu);
-  Config.StripDebug = InputArgs.hasArg(OBJCOPY_strip_debug);
-  Config.StripDWO = InputArgs.hasArg(OBJCOPY_strip_dwo);
-  Config.StripSections = InputArgs.hasArg(OBJCOPY_strip_sections);
-  Config.StripNonAlloc = InputArgs.hasArg(OBJCOPY_strip_non_alloc);
-  Config.StripUnneeded = InputArgs.hasArg(OBJCOPY_strip_unneeded);
-  Config.ExtractDWO = InputArgs.hasArg(OBJCOPY_extract_dwo);
-  Config.LocalizeHidden = InputArgs.hasArg(OBJCOPY_localize_hidden);
-  Config.Weaken = InputArgs.hasArg(OBJCOPY_weaken);
-  Config.DiscardAll = InputArgs.hasArg(OBJCOPY_discard_all);
-  Config.OnlyKeepDebug = InputArgs.hasArg(OBJCOPY_only_keep_debug);
-  Config.KeepFileSymbols = InputArgs.hasArg(OBJCOPY_keep_file_symbols);
-  Config.DecompressDebugSections =
-      InputArgs.hasArg(OBJCOPY_decompress_debug_sections);
-  for (auto Arg : InputArgs.filtered(OBJCOPY_localize_symbol))
-    Config.SymbolsToLocalize.push_back(Arg->getValue());
-  for (auto Arg : InputArgs.filtered(OBJCOPY_keep_global_symbol))
-    Config.SymbolsToKeepGlobal.push_back(Arg->getValue());
-  for (auto Arg : InputArgs.filtered(OBJCOPY_keep_global_symbols))
-    addGlobalSymbolsFromFile(Config.SymbolsToKeepGlobal, Arg->getValue());
-  for (auto Arg : InputArgs.filtered(OBJCOPY_globalize_symbol))
-    Config.SymbolsToGlobalize.push_back(Arg->getValue());
-  for (auto Arg : InputArgs.filtered(OBJCOPY_weaken_symbol))
-    Config.SymbolsToWeaken.push_back(Arg->getValue());
-  for (auto Arg : InputArgs.filtered(OBJCOPY_strip_symbol))
-    Config.SymbolsToRemove.push_back(Arg->getValue());
-  for (auto Arg : InputArgs.filtered(OBJCOPY_keep_symbol))
-    Config.SymbolsToKeep.push_back(Arg->getValue());
-
-  Config.PreserveDates = InputArgs.hasArg(OBJCOPY_preserve_dates);
-
-  DriverConfig DC;
-  DC.CopyConfigs.push_back(std::move(Config));
-  if (Config.DecompressDebugSections &&
-      Config.CompressionType != DebugCompressionType::None) {
-    error("Cannot specify --compress-debug-sections at the same time as "
-          "--decompress-debug-sections at the same time");
-  }
-
-  if (Config.DecompressDebugSections && !zlib::isAvailable())
-    error("LLVM was not compiled with LLVM_ENABLE_ZLIB: cannot decompress.");
-
-  return DC;
-}
-
-// ParseStripOptions returns the config and sets the input arguments. If a
-// help flag is set then ParseStripOptions will print the help messege and
-// exit.
-static DriverConfig parseStripOptions(ArrayRef<const char *> ArgsArr) {
-  StripOptTable T;
-  unsigned MissingArgumentIndex, MissingArgumentCount;
-  llvm::opt::InputArgList InputArgs =
-      T.ParseArgs(ArgsArr, MissingArgumentIndex, MissingArgumentCount);
-
-  static const char Usage[] = "llvm-strip [options] file...";
-  if (InputArgs.size() == 0) {
-    T.PrintHelp(errs(), Usage, "strip tool");
-    exit(1);
-  }
-
-  if (InputArgs.hasArg(STRIP_help)) {
-    T.PrintHelp(outs(), Usage, "strip tool");
-    exit(0);
-  }
-
-  if (InputArgs.hasArg(STRIP_version)) {
-    cl::PrintVersionMessage();
-    exit(0);
-  }
-
-  SmallVector<const char *, 2> Positional;
-  for (auto Arg : InputArgs.filtered(STRIP_UNKNOWN))
-    error("unknown argument '" + Arg->getAsString(InputArgs) + "'");
-  for (auto Arg : InputArgs.filtered(STRIP_INPUT))
-    Positional.push_back(Arg->getValue());
-
-  if (Positional.empty())
-    error("No input file specified");
-
-  if (Positional.size() > 1 && InputArgs.hasArg(STRIP_output))
-    error("Multiple input files cannot be used in combination with -o");
-
-  CopyConfig Config;
-  Config.StripDebug = InputArgs.hasArg(STRIP_strip_debug);
-
-  Config.DiscardAll = InputArgs.hasArg(STRIP_discard_all);
-  Config.StripUnneeded = InputArgs.hasArg(STRIP_strip_unneeded);
-  Config.StripAll = InputArgs.hasArg(STRIP_strip_all);
-
-  if (!Config.StripDebug && !Config.StripUnneeded && !Config.DiscardAll)
-    Config.StripAll = true;
-
-  for (auto Arg : InputArgs.filtered(STRIP_remove_section))
-    Config.ToRemove.push_back(Arg->getValue());
-
-  for (auto Arg : InputArgs.filtered(STRIP_keep_symbol))
-    Config.SymbolsToKeep.push_back(Arg->getValue());
-
-  Config.PreserveDates = InputArgs.hasArg(STRIP_preserve_dates);
-
-  DriverConfig DC;
-  if (Positional.size() == 1) {
-    Config.InputFilename = Positional[0];
-    Config.OutputFilename =
-        InputArgs.getLastArgValue(STRIP_output, Positional[0]);
-    DC.CopyConfigs.push_back(std::move(Config));
-  } else {
-    for (const char *Filename : Positional) {
-      Config.InputFilename = Filename;
-      Config.OutputFilename = Filename;
-      DC.CopyConfigs.push_back(Config);
-    }
-  }
-
-  return DC;
-}
-
 int main(int argc, char **argv) {
   InitLLVM X(argc, argv);
   ToolName = argv[0];
-- 
GitLab


From 0739d3ad5470acdf5fdc9cfa036194e9af438fb3 Mon Sep 17 00:00:00 2001
From: Richard Trieu <rtrieu@google.com>
Date: Thu, 11 Oct 2018 22:42:41 +0000
Subject: [PATCH 0080/1116] Inline variable into assert to avoid unused
 variable warning.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344308 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index d118e38ae72..c6ab4fb70f6 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -26428,8 +26428,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     // Use an f64 load and a scalar_to_vector for v2f32 loads. This avoids
     // scalarizing in 32-bit mode. In 64-bit mode this avoids a int->fp cast
     // since type legalization will try to use an i64 load.
-    EVT VT = N->getValueType(0);
-    assert(VT == MVT::v2f32 && "Unexpected VT");
+    assert(N->getValueType(0) == MVT::v2f32 && "Unexpected VT");
     if (!ISD::isNON_EXTLoad(N))
       return;
     auto *Ld = cast<LoadSDNode>(N);
-- 
GitLab


From 81c9e86b06fc9ce882a0ea7f1a8df0d4b49f5cb2 Mon Sep 17 00:00:00 2001
From: Ana Pazos <apazos@codeaurora.org>
Date: Thu, 11 Oct 2018 22:49:13 +0000
Subject: [PATCH 0081/1116] [RISCV] Fix disassembling of fence instruction with
 invalid field

Summary:
Instruction with 0 in fence field being disassembled as fence , iorw.
Printing "unknown" to match GAS behavior.

This bug was uncovered by a LLVM MC Disassembler Protocol Buffer Fuzzer
for the RISC-V assembly language.

Reviewers: asb

Subscribers: rbar, johnrusso, simoncook, sabuasal, niosHD, kito-cheng, shiva0217, zzheng, edward-jones, mgrang, rogfer01, MartinMosbeck, brucehoult, the_o, rkruppe, jfb, PkmX, jocewei, asb

Differential Revision: https://reviews.llvm.org/D51828

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344309 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/RISCV/InstPrinter/RISCVInstPrinter.cpp  | 4 ++++
 test/MC/Disassembler/RISCV/unknown-fence-field.txt | 9 +++++++++
 test/MC/RISCV/rv32i-invalid.s                      | 1 +
 3 files changed, 14 insertions(+)
 create mode 100644 test/MC/Disassembler/RISCV/unknown-fence-field.txt

diff --git a/lib/Target/RISCV/InstPrinter/RISCVInstPrinter.cpp b/lib/Target/RISCV/InstPrinter/RISCVInstPrinter.cpp
index aa21cf0e6b4..979c8f4e2fa 100644
--- a/lib/Target/RISCV/InstPrinter/RISCVInstPrinter.cpp
+++ b/lib/Target/RISCV/InstPrinter/RISCVInstPrinter.cpp
@@ -93,6 +93,8 @@ void RISCVInstPrinter::printFenceArg(const MCInst *MI, unsigned OpNo,
                                      const MCSubtargetInfo &STI,
                                      raw_ostream &O) {
   unsigned FenceArg = MI->getOperand(OpNo).getImm();
+  assert (((FenceArg >> 4) == 0) && "Invalid immediate in printFenceArg");
+
   if ((FenceArg & RISCVFenceField::I) != 0)
     O << 'i';
   if ((FenceArg & RISCVFenceField::O) != 0)
@@ -101,6 +103,8 @@ void RISCVInstPrinter::printFenceArg(const MCInst *MI, unsigned OpNo,
     O << 'r';
   if ((FenceArg & RISCVFenceField::W) != 0)
     O << 'w';
+  if (FenceArg == 0)
+    O << "unknown";
 }
 
 void RISCVInstPrinter::printFRMArg(const MCInst *MI, unsigned OpNo,
diff --git a/test/MC/Disassembler/RISCV/unknown-fence-field.txt b/test/MC/Disassembler/RISCV/unknown-fence-field.txt
new file mode 100644
index 00000000000..5b20994dcb6
--- /dev/null
+++ b/test/MC/Disassembler/RISCV/unknown-fence-field.txt
@@ -0,0 +1,9 @@
+# RUN: llvm-mc -disassemble -triple=riscv32 < %s 2>&1 | FileCheck %s
+# RUN: llvm-mc -disassemble -triple=riscv64 < %s 2>&1 | FileCheck %s
+#
+# Test generated by a LLVM MC Disassembler Protocol Buffer Fuzzer
+# for the RISC-V assembly language.
+
+# This decodes as fence , iorw with invalid fence field as 0.
+[0x0f 0x00 0xf0 0x00]
+# CHECK: fence unknown, iorw
diff --git a/test/MC/RISCV/rv32i-invalid.s b/test/MC/RISCV/rv32i-invalid.s
index 92b9b4ad34f..f856bf1f934 100644
--- a/test/MC/RISCV/rv32i-invalid.s
+++ b/test/MC/RISCV/rv32i-invalid.s
@@ -6,6 +6,7 @@ fence iorw, iore # CHECK: :[[@LINE]]:13: error: operand must be formed of letter
 fence wr, wr # CHECK: :[[@LINE]]:7: error: operand must be formed of letters selected in-order from 'iorw'
 fence rw, rr # CHECK: :[[@LINE]]:11: error: operand must be formed of letters selected in-order from 'iorw'
 fence 1, rw # CHECK: :[[@LINE]]:7: error: operand must be formed of letters selected in-order from 'iorw'
+fence unknown, unknown # CHECK: :[[@LINE]]:7: error: operand must be formed of letters selected in-order from 'iorw'
 
 ## uimm5
 slli a0, a0, 32 # CHECK: :[[@LINE]]:14: error: immediate must be an integer in the range [0, 31]
-- 
GitLab


From 1a0ffaa45417bbc9e684e65ff3fcda884435860f Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Thu, 11 Oct 2018 22:49:54 +0000
Subject: [PATCH 0082/1116] AMDGPU/GlobalISel: Implement select for G_INSERT

Reviewers: arsenm

Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, kristof.beyls, dstuttard, tpr, t-tye, llvm-commits

Differential Revision: https://reviews.llvm.org/D53116

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344310 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../AMDGPU/AMDGPUInstructionSelector.cpp      | 30 ++++++++++++
 lib/Target/AMDGPU/AMDGPUInstructionSelector.h |  1 +
 .../AMDGPU/GlobalISel/inst-select-insert.mir  | 49 +++++++++++++++++++
 3 files changed, 80 insertions(+)
 create mode 100644 test/CodeGen/AMDGPU/GlobalISel/inst-select-insert.mir

diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 8eb49d49b2e..55ceb8f666f 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -178,6 +178,34 @@ bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
   return true;
 }
 
+bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
+  MachineBasicBlock *BB = I.getParent();
+  MachineFunction *MF = BB->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  unsigned SubReg = TRI.getSubRegFromChannel(I.getOperand(3).getImm() / 32);
+  DebugLoc DL = I.getDebugLoc();
+  MachineInstr *Ins = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG))
+                               .addDef(I.getOperand(0).getReg())
+                               .addReg(I.getOperand(1).getReg())
+                               .addReg(I.getOperand(2).getReg())
+                               .addImm(SubReg);
+
+  for (const MachineOperand &MO : Ins->operands()) {
+    if (!MO.isReg())
+      continue;
+    if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
+      continue;
+
+    const TargetRegisterClass *RC =
+            TRI.getConstrainedRegClassForOperand(MO, MRI);
+    if (!RC)
+      continue;
+    RBI.constrainGenericRegister(MO.getReg(), *RC, MRI);
+  }
+  I.eraseFromParent();
+  return true;
+}
+
 bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I,
                                           CodeGenCoverage &CoverageInfo) const {
   unsigned IntrinsicID =  I.getOperand(1).getIntrinsicID();
@@ -640,6 +668,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I,
     return selectG_GEP(I);
   case TargetOpcode::G_IMPLICIT_DEF:
     return selectG_IMPLICIT_DEF(I);
+  case TargetOpcode::G_INSERT:
+    return selectG_INSERT(I);
   case TargetOpcode::G_INTRINSIC:
     return selectG_INTRINSIC(I, CoverageInfo);
   case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 449431adc56..f3a835a32a8 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -67,6 +67,7 @@ private:
   bool selectG_ADD(MachineInstr &I) const;
   bool selectG_GEP(MachineInstr &I) const;
   bool selectG_IMPLICIT_DEF(MachineInstr &I) const;
+  bool selectG_INSERT(MachineInstr &I) const;
   bool selectG_INTRINSIC(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;
   bool selectG_INTRINSIC_W_SIDE_EFFECTS(MachineInstr &I,
                                         CodeGenCoverage &CoverageInfo) const;
diff --git a/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert.mir b/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert.mir
new file mode 100644
index 00000000000..93e35ead4d4
--- /dev/null
+++ b/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert.mir
@@ -0,0 +1,49 @@
+# RUN: llc -march=amdgcn -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s
+---
+
+name:            insert512
+legalized:       true
+regBankSelected: true
+
+# CHECK-LABEL: insert512
+# CHECK: [[BASE:%[0-9]+]]:sreg_512 = IMPLICIT_DEF
+# CHECK: [[VAL:%[0-9]+]]:sreg_32_xm0 = IMPLICIT_DEF
+# CHECK: [[BASE0:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE]], [[VAL]], %subreg.sub0
+# CHECK: [[BASE1:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE0]], [[VAL]], %subreg.sub1
+# CHECK: [[BASE2:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE1]], [[VAL]], %subreg.sub2
+# CHECK: [[BASE3:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE2]], [[VAL]], %subreg.sub3
+# CHECK: [[BASE4:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE3]], [[VAL]], %subreg.sub4
+# CHECK: [[BASE5:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE4]], [[VAL]], %subreg.sub5
+# CHECK: [[BASE6:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE5]], [[VAL]], %subreg.sub6
+# CHECK: [[BASE7:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE6]], [[VAL]], %subreg.sub7
+# CHECK: [[BASE8:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE7]], [[VAL]], %subreg.sub8
+# CHECK: [[BASE9:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE8]], [[VAL]], %subreg.sub9
+# CHECK: [[BASE10:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE9]], [[VAL]], %subreg.sub10
+# CHECK: [[BASE11:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE10]], [[VAL]], %subreg.sub11
+# CHECK: [[BASE12:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE11]], [[VAL]], %subreg.sub12
+# CHECK: [[BASE13:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE12]], [[VAL]], %subreg.sub13
+# CHECK: [[BASE14:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE13]], [[VAL]], %subreg.sub14
+# CHECK: [[BASE15:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE14]], [[VAL]], %subreg.sub15
+
+body: |
+  bb.0:
+    %0:sgpr(s512) = G_IMPLICIT_DEF
+    %1:sgpr(s32) = G_IMPLICIT_DEF
+    %2:sgpr(s512) = G_INSERT %0:sgpr, %1:sgpr(s32), 0
+    %3:sgpr(s512) = G_INSERT %2:sgpr, %1:sgpr(s32), 32
+    %4:sgpr(s512) = G_INSERT %3:sgpr, %1:sgpr(s32), 64
+    %5:sgpr(s512) = G_INSERT %4:sgpr, %1:sgpr(s32), 96
+    %6:sgpr(s512) = G_INSERT %5:sgpr, %1:sgpr(s32), 128
+    %7:sgpr(s512) = G_INSERT %6:sgpr, %1:sgpr(s32), 160
+    %8:sgpr(s512) = G_INSERT %7:sgpr, %1:sgpr(s32), 192
+    %9:sgpr(s512) = G_INSERT %8:sgpr, %1:sgpr(s32), 224
+    %10:sgpr(s512) = G_INSERT %9:sgpr, %1:sgpr(s32), 256
+    %11:sgpr(s512) = G_INSERT %10:sgpr, %1:sgpr(s32), 288
+    %12:sgpr(s512) = G_INSERT %11:sgpr, %1:sgpr(s32), 320
+    %13:sgpr(s512) = G_INSERT %12:sgpr, %1:sgpr(s32), 352
+    %14:sgpr(s512) = G_INSERT %13:sgpr, %1:sgpr(s32), 384
+    %15:sgpr(s512) = G_INSERT %14:sgpr, %1:sgpr(s32), 416
+    %16:sgpr(s512) = G_INSERT %15:sgpr, %1:sgpr(s32), 448
+    %17:sgpr(s512) = G_INSERT %16:sgpr, %1:sgpr(s32), 480
+    $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %17:sgpr(s512)
+    SI_RETURN_TO_EPILOG $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
-- 
GitLab


From 8e011ae1984fe276b409af3d8ff612f17f786c8e Mon Sep 17 00:00:00 2001
From: Kostya Serebryany <kcc@google.com>
Date: Thu, 11 Oct 2018 23:03:27 +0000
Subject: [PATCH 0083/1116] merge two near-identical functions
 createPrivateGlobalForString into one

Summary:
We have two copies of createPrivateGlobalForString (in asan and in esan).
This change merges them into one. NFC

Reviewers: vitalybuka

Reviewed By: vitalybuka

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D53178

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344314 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Transforms/Instrumentation.h     |  5 +++++
 .../Instrumentation/AddressSanitizer.cpp      | 22 ++++---------------
 .../Instrumentation/EfficiencySanitizer.cpp   | 15 -------------
 .../Instrumentation/Instrumentation.cpp       | 17 ++++++++++++++
 4 files changed, 26 insertions(+), 33 deletions(-)

diff --git a/include/llvm/Transforms/Instrumentation.h b/include/llvm/Transforms/Instrumentation.h
index d6d9529ba9a..2157fcab726 100644
--- a/include/llvm/Transforms/Instrumentation.h
+++ b/include/llvm/Transforms/Instrumentation.h
@@ -36,6 +36,11 @@ class OptimizationRemarkEmitter;
 BasicBlock::iterator PrepareToSplitEntryBlock(BasicBlock &BB,
                                               BasicBlock::iterator IP);
 
+// Create a constant for Str so that we can pass it to the run-time lib.
+GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str,
+                                             bool AllowMerging,
+                                             const char *NamePrefix = "");
+
 // Insert GCOV profiling instrumentation
 struct GCOVOptions {
   static GCOVOptions getDefault();
diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index b819565e7ba..b832417154e 100644
--- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -1174,25 +1174,11 @@ static size_t TypeSizeToSizeIndex(uint32_t TypeSize) {
   return Res;
 }
 
-// Create a constant for Str so that we can pass it to the run-time lib.
-static GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str,
-                                                    bool AllowMerging) {
-  Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str);
-  // We use private linkage for module-local strings. If they can be merged
-  // with another one, we set the unnamed_addr attribute.
-  GlobalVariable *GV =
-      new GlobalVariable(M, StrConst->getType(), true,
-                         GlobalValue::PrivateLinkage, StrConst, kAsanGenPrefix);
-  if (AllowMerging) GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
-  GV->setAlignment(1);  // Strings may not be merged w/o setting align 1.
-  return GV;
-}
-
 /// Create a global describing a source location.
 static GlobalVariable *createPrivateGlobalForSourceLoc(Module &M,
                                                        LocationMetadata MD) {
   Constant *LocData[] = {
-      createPrivateGlobalForString(M, MD.Filename, true),
+      createPrivateGlobalForString(M, MD.Filename, true, kAsanGenPrefix),
       ConstantInt::get(Type::getInt32Ty(M.getContext()), MD.LineNo),
       ConstantInt::get(Type::getInt32Ty(M.getContext()), MD.ColumnNo),
   };
@@ -2179,7 +2165,7 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool
   // We shouldn't merge same module names, as this string serves as unique
   // module ID in runtime.
   GlobalVariable *ModuleName = createPrivateGlobalForString(
-      M, M.getModuleIdentifier(), /*AllowMerging*/ false);
+      M, M.getModuleIdentifier(), /*AllowMerging*/ false, kAsanGenPrefix);
 
   for (size_t i = 0; i < n; i++) {
     static const uint64_t kMaxGlobalRedzone = 1 << 18;
@@ -2191,7 +2177,7 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool
     // if it's available, otherwise just write the name of global variable).
     GlobalVariable *Name = createPrivateGlobalForString(
         M, MD.Name.empty() ? NameForGlobal : MD.Name,
-        /*AllowMerging*/ true);
+        /*AllowMerging*/ true, kAsanGenPrefix);
 
     Type *Ty = G->getValueType();
     uint64_t SizeInBytes = DL.getTypeAllocSize(Ty);
@@ -3072,7 +3058,7 @@ void FunctionStackPoisoner::processStaticAllocas() {
       IntptrPtrTy);
   GlobalVariable *StackDescriptionGlobal =
       createPrivateGlobalForString(*F.getParent(), DescriptionString,
-                                   /*AllowMerging*/ true);
+                                   /*AllowMerging*/ true, kAsanGenPrefix);
   Value *Description = IRB.CreatePointerCast(StackDescriptionGlobal, IntptrTy);
   IRB.CreateStore(Description, BasePlus1);
   // Write the PC to redzone[2].
diff --git a/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp b/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
index 33f220a893d..0ab915de60d 100644
--- a/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
+++ b/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
@@ -144,21 +144,6 @@ OverrideOptionsFromCL(EfficiencySanitizerOptions Options) {
   return Options;
 }
 
-// Create a constant for Str so that we can pass it to the run-time lib.
-static GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str,
-                                                    bool AllowMerging) {
-  Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str);
-  // We use private linkage for module-local strings. If they can be merged
-  // with another one, we set the unnamed_addr attribute.
-  GlobalVariable *GV =
-    new GlobalVariable(M, StrConst->getType(), true,
-                       GlobalValue::PrivateLinkage, StrConst, "");
-  if (AllowMerging)
-    GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
-  GV->setAlignment(1);  // Strings may not be merged w/o setting align 1.
-  return GV;
-}
-
 /// EfficiencySanitizer: instrument each module to find performance issues.
 class EfficiencySanitizer : public ModulePass {
 public:
diff --git a/lib/Transforms/Instrumentation/Instrumentation.cpp b/lib/Transforms/Instrumentation/Instrumentation.cpp
index ea819c1856b..1c739c09e39 100644
--- a/lib/Transforms/Instrumentation/Instrumentation.cpp
+++ b/lib/Transforms/Instrumentation/Instrumentation.cpp
@@ -15,6 +15,7 @@
 #include "llvm/Transforms/Instrumentation.h"
 #include "llvm-c/Initialization.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/PassRegistry.h"
 
@@ -53,6 +54,22 @@ BasicBlock::iterator llvm::PrepareToSplitEntryBlock(BasicBlock &BB,
   return IP;
 }
 
+// Create a constant for Str so that we can pass it to the run-time lib.
+GlobalVariable *llvm::createPrivateGlobalForString(Module &M, StringRef Str,
+                                                   bool AllowMerging,
+                                                   const char *NamePrefix) {
+  Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str);
+  // We use private linkage for module-local strings. If they can be merged
+  // with another one, we set the unnamed_addr attribute.
+  GlobalVariable *GV =
+      new GlobalVariable(M, StrConst->getType(), true,
+                         GlobalValue::PrivateLinkage, StrConst, NamePrefix);
+  if (AllowMerging)
+    GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+  GV->setAlignment(1);  // Strings may not be merged w/o setting align 1.
+  return GV;
+}
+
 /// initializeInstrumentation - Initialize all passes in the TransformUtils
 /// library.
 void llvm::initializeInstrumentation(PassRegistry &Registry) {
-- 
GitLab


From f7c87d986fb77519ba7cd394bf30da7d0e28241b Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze@braunis.de>
Date: Thu, 11 Oct 2018 23:14:35 +0000
Subject: [PATCH 0084/1116] X86/TargetTransformInfo: Report div/rem constant
 immediate costs as TCC_Free

DIV/REM by constants should always be expanded into mul/shift/etc.
patterns. Unfortunately the ConstantHoisting pass runs too early at a
point where the pattern isn't expanded yet. However after
ConstantHoisting hoisted some immediate the result may not expand
anymore. Also the hoisting typically doesn't make sense because it
operates on immediates that will change completely during the expansion.

Report DIV/REM as TCC_Free so ConstantHoisting will not touch them.

Differential Revision: https://reviews.llvm.org/D53174

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344315 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86TargetTransformInfo.cpp     |  6 ++-
 .../ConstantHoisting/X86/bad-cases.ll         | 47 +++++++++++++++++++
 2 files changed, 52 insertions(+), 1 deletion(-)
 create mode 100644 test/Transforms/ConstantHoisting/X86/bad-cases.ll

diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 4c14715b758..d3a75123935 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2342,11 +2342,15 @@ int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
       return TTI::TCC_Free;
     ImmIdx = 1;
     break;
-  case Instruction::Mul:
   case Instruction::UDiv:
   case Instruction::SDiv:
   case Instruction::URem:
   case Instruction::SRem:
+    // Division by constant is typically expanded later into a different
+    // instruction sequence. This completely changes the constants.
+    // Report them as "free" to stop ConstantHoist from marking them as opaque.
+    return TTI::TCC_Free;
+  case Instruction::Mul:
   case Instruction::Or:
   case Instruction::Xor:
     ImmIdx = 1;
diff --git a/test/Transforms/ConstantHoisting/X86/bad-cases.ll b/test/Transforms/ConstantHoisting/X86/bad-cases.ll
new file mode 100644
index 00000000000..00890942096
--- /dev/null
+++ b/test/Transforms/ConstantHoisting/X86/bad-cases.ll
@@ -0,0 +1,47 @@
+; RUN: opt -consthoist -S < %s | FileCheck %s
+target triple = "x86_64--"
+
+; We don't want to convert constant divides because the benefit from converting
+; them to a mul in the backend is larget than constant materialization savings.
+define void @signed_const_division(i64 %in1, i64 %in2, i64* %addr) {
+; CHECK-LABEL: @signed_const_division
+; CHECK: %res1 = sdiv i64 %l1, 4294967296
+; CHECK: %res2 = srem i64 %l2, 4294967296
+entry:
+  br label %loop
+
+loop:
+  %l1 = phi i64 [%res1, %loop], [%in1, %entry]
+  %l2 = phi i64 [%res2, %loop], [%in2, %entry]
+  %res1 = sdiv i64 %l1, 4294967296
+  store volatile i64 %res1, i64* %addr
+  %res2 = srem i64 %l2, 4294967296
+  store volatile i64 %res2, i64* %addr
+  %again = icmp eq i64 %res1, %res2
+  br i1 %again, label %loop, label %end
+
+end:
+  ret void
+}
+
+define void @unsigned_const_division(i64 %in1, i64 %in2, i64* %addr) {
+; CHECK-LABEL: @unsigned_const_division
+; CHECK: %res1 = udiv i64 %l1, 4294967296
+; CHECK: %res2 = urem i64 %l2, 4294967296
+
+entry:
+  br label %loop
+
+loop:
+  %l1 = phi i64 [%res1, %loop], [%in1, %entry]
+  %l2 = phi i64 [%res2, %loop], [%in2, %entry]
+  %res1 = udiv i64 %l1, 4294967296
+  store volatile i64 %res1, i64* %addr
+  %res2 = urem i64 %l2, 4294967296
+  store volatile i64 %res2, i64* %addr
+  %again = icmp eq i64 %res1, %res2
+  br i1 %again, label %loop, label %end
+
+end:
+  ret void
+}
-- 
GitLab


From 639949cb1501998d06b97f782079ed5fe65e597a Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Thu, 11 Oct 2018 23:36:46 +0000
Subject: [PATCH 0085/1116] Revert "AMDGPU/GlobalISel: Implement select for
 G_INSERT"

This reverts commit r344310.

The test case was failing on some bots.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344317 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../AMDGPU/AMDGPUInstructionSelector.cpp      | 30 ------------
 lib/Target/AMDGPU/AMDGPUInstructionSelector.h |  1 -
 .../AMDGPU/GlobalISel/inst-select-insert.mir  | 49 -------------------
 3 files changed, 80 deletions(-)
 delete mode 100644 test/CodeGen/AMDGPU/GlobalISel/inst-select-insert.mir

diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 55ceb8f666f..8eb49d49b2e 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -178,34 +178,6 @@ bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
   return true;
 }
 
-bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
-  MachineBasicBlock *BB = I.getParent();
-  MachineFunction *MF = BB->getParent();
-  MachineRegisterInfo &MRI = MF->getRegInfo();
-  unsigned SubReg = TRI.getSubRegFromChannel(I.getOperand(3).getImm() / 32);
-  DebugLoc DL = I.getDebugLoc();
-  MachineInstr *Ins = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG))
-                               .addDef(I.getOperand(0).getReg())
-                               .addReg(I.getOperand(1).getReg())
-                               .addReg(I.getOperand(2).getReg())
-                               .addImm(SubReg);
-
-  for (const MachineOperand &MO : Ins->operands()) {
-    if (!MO.isReg())
-      continue;
-    if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
-      continue;
-
-    const TargetRegisterClass *RC =
-            TRI.getConstrainedRegClassForOperand(MO, MRI);
-    if (!RC)
-      continue;
-    RBI.constrainGenericRegister(MO.getReg(), *RC, MRI);
-  }
-  I.eraseFromParent();
-  return true;
-}
-
 bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I,
                                           CodeGenCoverage &CoverageInfo) const {
   unsigned IntrinsicID =  I.getOperand(1).getIntrinsicID();
@@ -668,8 +640,6 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I,
     return selectG_GEP(I);
   case TargetOpcode::G_IMPLICIT_DEF:
     return selectG_IMPLICIT_DEF(I);
-  case TargetOpcode::G_INSERT:
-    return selectG_INSERT(I);
   case TargetOpcode::G_INTRINSIC:
     return selectG_INTRINSIC(I, CoverageInfo);
   case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index f3a835a32a8..449431adc56 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -67,7 +67,6 @@ private:
   bool selectG_ADD(MachineInstr &I) const;
   bool selectG_GEP(MachineInstr &I) const;
   bool selectG_IMPLICIT_DEF(MachineInstr &I) const;
-  bool selectG_INSERT(MachineInstr &I) const;
   bool selectG_INTRINSIC(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;
   bool selectG_INTRINSIC_W_SIDE_EFFECTS(MachineInstr &I,
                                         CodeGenCoverage &CoverageInfo) const;
diff --git a/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert.mir b/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert.mir
deleted file mode 100644
index 93e35ead4d4..00000000000
--- a/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert.mir
+++ /dev/null
@@ -1,49 +0,0 @@
-# RUN: llc -march=amdgcn -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s
----
-
-name:            insert512
-legalized:       true
-regBankSelected: true
-
-# CHECK-LABEL: insert512
-# CHECK: [[BASE:%[0-9]+]]:sreg_512 = IMPLICIT_DEF
-# CHECK: [[VAL:%[0-9]+]]:sreg_32_xm0 = IMPLICIT_DEF
-# CHECK: [[BASE0:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE]], [[VAL]], %subreg.sub0
-# CHECK: [[BASE1:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE0]], [[VAL]], %subreg.sub1
-# CHECK: [[BASE2:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE1]], [[VAL]], %subreg.sub2
-# CHECK: [[BASE3:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE2]], [[VAL]], %subreg.sub3
-# CHECK: [[BASE4:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE3]], [[VAL]], %subreg.sub4
-# CHECK: [[BASE5:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE4]], [[VAL]], %subreg.sub5
-# CHECK: [[BASE6:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE5]], [[VAL]], %subreg.sub6
-# CHECK: [[BASE7:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE6]], [[VAL]], %subreg.sub7
-# CHECK: [[BASE8:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE7]], [[VAL]], %subreg.sub8
-# CHECK: [[BASE9:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE8]], [[VAL]], %subreg.sub9
-# CHECK: [[BASE10:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE9]], [[VAL]], %subreg.sub10
-# CHECK: [[BASE11:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE10]], [[VAL]], %subreg.sub11
-# CHECK: [[BASE12:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE11]], [[VAL]], %subreg.sub12
-# CHECK: [[BASE13:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE12]], [[VAL]], %subreg.sub13
-# CHECK: [[BASE14:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE13]], [[VAL]], %subreg.sub14
-# CHECK: [[BASE15:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE14]], [[VAL]], %subreg.sub15
-
-body: |
-  bb.0:
-    %0:sgpr(s512) = G_IMPLICIT_DEF
-    %1:sgpr(s32) = G_IMPLICIT_DEF
-    %2:sgpr(s512) = G_INSERT %0:sgpr, %1:sgpr(s32), 0
-    %3:sgpr(s512) = G_INSERT %2:sgpr, %1:sgpr(s32), 32
-    %4:sgpr(s512) = G_INSERT %3:sgpr, %1:sgpr(s32), 64
-    %5:sgpr(s512) = G_INSERT %4:sgpr, %1:sgpr(s32), 96
-    %6:sgpr(s512) = G_INSERT %5:sgpr, %1:sgpr(s32), 128
-    %7:sgpr(s512) = G_INSERT %6:sgpr, %1:sgpr(s32), 160
-    %8:sgpr(s512) = G_INSERT %7:sgpr, %1:sgpr(s32), 192
-    %9:sgpr(s512) = G_INSERT %8:sgpr, %1:sgpr(s32), 224
-    %10:sgpr(s512) = G_INSERT %9:sgpr, %1:sgpr(s32), 256
-    %11:sgpr(s512) = G_INSERT %10:sgpr, %1:sgpr(s32), 288
-    %12:sgpr(s512) = G_INSERT %11:sgpr, %1:sgpr(s32), 320
-    %13:sgpr(s512) = G_INSERT %12:sgpr, %1:sgpr(s32), 352
-    %14:sgpr(s512) = G_INSERT %13:sgpr, %1:sgpr(s32), 384
-    %15:sgpr(s512) = G_INSERT %14:sgpr, %1:sgpr(s32), 416
-    %16:sgpr(s512) = G_INSERT %15:sgpr, %1:sgpr(s32), 448
-    %17:sgpr(s512) = G_INSERT %16:sgpr, %1:sgpr(s32), 480
-    $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %17:sgpr(s512)
-    SI_RETURN_TO_EPILOG $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
-- 
GitLab


From 0af72938856257f7c54320be1bf19873a3cc90b8 Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze@braunis.de>
Date: Thu, 11 Oct 2018 23:37:58 +0000
Subject: [PATCH 0086/1116] Revert "DwarfDebug: Pick next location in case of
 missing location at block begin"

It originally triggered a stepping problem in the debugger, which could
be fixed by adjusting CodeGen/LexicalScopes.cpp however it seems we prefer
the previous behavior anyway.

See the discussion for details: http://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20181008/593833.html

This reverts commit r343880.
This reverts commit r343874.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344318 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/AsmPrinter/DwarfDebug.cpp         | 112 +++++++-----------
 lib/CodeGen/AsmPrinter/DwarfDebug.h           |   3 -
 test/DebugInfo/AArch64/line-header.ll         |   2 +-
 .../single-constant-use-preserves-dbgloc.ll   |   3 +-
 test/DebugInfo/Mips/delay-slot.ll             |   2 +-
 test/DebugInfo/NVPTX/debug-info.ll            |   2 +-
 test/DebugInfo/X86/dwarf-no-source-loc.ll     |  11 +-
 test/DebugInfo/X86/dwarf-no-source-loc.mir    |  74 ------------
 8 files changed, 52 insertions(+), 157 deletions(-)
 delete mode 100644 test/DebugInfo/X86/dwarf-no-source-loc.mir

diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index ab3559d63cc..94e12658cfe 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -1371,49 +1371,6 @@ void DwarfDebug::collectEntityInfo(DwarfCompileUnit &TheCU,
   }
 }
 
-static const DebugLoc &
-findNextDebugLoc(MachineBasicBlock::const_iterator MBBI,
-                 MachineBasicBlock::const_iterator MBBE) {
-  static DebugLoc NoLocation;
-  for ( ; MBBI != MBBE; ++MBBI) {
-    if (MBBI->isDebugInstr())
-      continue;
-    const DebugLoc &DL = MBBI->getDebugLoc();
-    if (DL)
-      return DL;
-  }
-  return NoLocation;
-}
-
-void DwarfDebug::emitDebugLoc(const DebugLoc &DL) {
-  unsigned LastAsmLine =
-      Asm->OutStreamer->getContext().getCurrentDwarfLoc().getLine();
-
-  // We have an explicit location, different from the previous location.
-  // Don't repeat a line-0 record, but otherwise emit the new location.
-  // (The new location might be an explicit line 0, which we do emit.)
-  unsigned Line = DL.getLine();
-  if (PrevInstLoc && Line == 0 && LastAsmLine == 0)
-    return;
-  unsigned Flags = 0;
-  if (DL == PrologEndLoc) {
-    Flags |= DWARF2_FLAG_PROLOGUE_END | DWARF2_FLAG_IS_STMT;
-    PrologEndLoc = DebugLoc();
-  }
-  // If the line changed, we call that a new statement; unless we went to
-  // line 0 and came back, in which case it is not a new statement.
-  unsigned OldLine = PrevInstLoc ? PrevInstLoc.getLine() : LastAsmLine;
-  if (Line && Line != OldLine)
-    Flags |= DWARF2_FLAG_IS_STMT;
-
-  const MDNode *Scope = DL.getScope();
-  recordSourceLine(Line, DL.getCol(), Scope, Flags);
-
-  // If we're not at line 0, remember this location.
-  if (Line)
-    PrevInstLoc = DL;
-}
-
 // Process beginning of an instruction.
 void DwarfDebug::beginInstruction(const MachineInstr *MI) {
   DebugHandlerBase::beginInstruction(MI);
@@ -1458,41 +1415,54 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) {
     // If we have already emitted a line-0 record, don't repeat it.
     if (LastAsmLine == 0)
       return;
-    // By default we emit nothing to avoid line table bloat. However at the
-    // beginning of a basic block or after a label it is undesirable to let
-    // the previous location unchanged. In these cases do a forward search for
-    // the next valid debug location.
-    if (UnknownLocations == Default) {
-      const MachineBasicBlock &MBB = *MI->getParent();
-      if (!PrevLabel && PrevInstBB == &MBB)
-        return;
-
-      const DebugLoc &NextDL = findNextDebugLoc(MI->getIterator(), MBB.end());
-      if (NextDL) {
-        emitDebugLoc(NextDL);
-        return;
-      }
-    }
-
-    // We should emit a line-0 record.
     // If user said Don't Do That, don't do that.
     if (UnknownLocations == Disable)
       return;
-    // Emit a line-0 record now.
-    // Preserve the file and column numbers, if we can, to save space in
-    // the encoded line table.
-    // Do not update PrevInstLoc, it remembers the last non-0 line.
-    const MDNode *Scope = nullptr;
-    unsigned Column = 0;
-    if (PrevInstLoc) {
-      Scope = PrevInstLoc.getScope();
-      Column = PrevInstLoc.getCol();
+    // See if we have a reason to emit a line-0 record now.
+    // Reasons to emit a line-0 record include:
+    // - User asked for it (UnknownLocations).
+    // - Instruction has a label, so it's referenced from somewhere else,
+    //   possibly debug information; we want it to have a source location.
+    // - Instruction is at the top of a block; we don't want to inherit the
+    //   location from the physically previous (maybe unrelated) block.
+    if (UnknownLocations == Enable || PrevLabel ||
+        (PrevInstBB && PrevInstBB != MI->getParent())) {
+      // Preserve the file and column numbers, if we can, to save space in
+      // the encoded line table.
+      // Do not update PrevInstLoc, it remembers the last non-0 line.
+      const MDNode *Scope = nullptr;
+      unsigned Column = 0;
+      if (PrevInstLoc) {
+        Scope = PrevInstLoc.getScope();
+        Column = PrevInstLoc.getCol();
+      }
+      recordSourceLine(/*Line=*/0, Column, Scope, /*Flags=*/0);
     }
-    recordSourceLine(/*Line=*/0, Column, Scope, /*Flags=*/0);
     return;
   }
 
-  emitDebugLoc(DL);
+  // We have an explicit location, different from the previous location.
+  // Don't repeat a line-0 record, but otherwise emit the new location.
+  // (The new location might be an explicit line 0, which we do emit.)
+  if (PrevInstLoc && DL.getLine() == 0 && LastAsmLine == 0)
+    return;
+  unsigned Flags = 0;
+  if (DL == PrologEndLoc) {
+    Flags |= DWARF2_FLAG_PROLOGUE_END | DWARF2_FLAG_IS_STMT;
+    PrologEndLoc = DebugLoc();
+  }
+  // If the line changed, we call that a new statement; unless we went to
+  // line 0 and came back, in which case it is not a new statement.
+  unsigned OldLine = PrevInstLoc ? PrevInstLoc.getLine() : LastAsmLine;
+  if (DL.getLine() && DL.getLine() != OldLine)
+    Flags |= DWARF2_FLAG_IS_STMT;
+
+  const MDNode *Scope = DL.getScope();
+  recordSourceLine(DL.getLine(), DL.getCol(), Scope, Flags);
+
+  // If we're not at line 0, remember this location.
+  if (DL.getLine())
+    PrevInstLoc = DL;
 }
 
 static DebugLoc findPrologueEndLoc(const MachineFunction *MF) {
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h
index e115eb771fb..fecf8056765 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -723,9 +723,6 @@ public:
   bool tuneForLLDB() const { return DebuggerTuning == DebuggerKind::LLDB; }
   bool tuneForSCE() const { return DebuggerTuning == DebuggerKind::SCE; }
   /// @}
-
-private:
-  void emitDebugLoc(const DebugLoc &DL);
 };
 
 } // end namespace llvm
diff --git a/test/DebugInfo/AArch64/line-header.ll b/test/DebugInfo/AArch64/line-header.ll
index 2ac94728b86..1d9156debf1 100644
--- a/test/DebugInfo/AArch64/line-header.ll
+++ b/test/DebugInfo/AArch64/line-header.ll
@@ -3,4 +3,4 @@
 
 ; check line table length is correctly calculated for both big and little endian
 CHECK-LABEL: .debug_line contents:
-CHECK: total_length: 0x0000003c
+CHECK: total_length: 0x0000003f
diff --git a/test/DebugInfo/ARM/single-constant-use-preserves-dbgloc.ll b/test/DebugInfo/ARM/single-constant-use-preserves-dbgloc.ll
index fa1dbb531d3..af76c889353 100644
--- a/test/DebugInfo/ARM/single-constant-use-preserves-dbgloc.ll
+++ b/test/DebugInfo/ARM/single-constant-use-preserves-dbgloc.ll
@@ -31,10 +31,11 @@ if.then:                                          ; preds = %entry
 
 if.end:                                           ; preds = %entry
 ; Materialize the constant.
-; CHECK:      .loc    1 7 5
+; CHECK:      .loc    1 0
 ; CHECK-NEXT: mvn     r0, #0
 
 ; The backend performs the store to %retval first, for some reason.
+; CHECK-NEXT: .loc    1 7 5
 ; CHECK-NEXT: str     r0, [sp, #4]
   store i32 -1, i32* %x, align 4, !dbg !19
 
diff --git a/test/DebugInfo/Mips/delay-slot.ll b/test/DebugInfo/Mips/delay-slot.ll
index f8959a2c52b..8f444bce30f 100644
--- a/test/DebugInfo/Mips/delay-slot.ll
+++ b/test/DebugInfo/Mips/delay-slot.ll
@@ -16,7 +16,7 @@
 ; CHECK: 0x0000000000000004      2      0      1   0             0  is_stmt prologue_end
 ; CHECK: 0x0000000000000024      3      0      1   0             0  is_stmt
 ; CHECK: 0x0000000000000034      4      0      1   0             0  is_stmt
-; CHECK: 0x0000000000000044      5      0      1   0             0  is_stmt
+; CHECK: 0x0000000000000048      5      0      1   0             0  is_stmt
 ; CHECK: 0x0000000000000058      5      0      1   0             0  is_stmt end_sequence
 
 
diff --git a/test/DebugInfo/NVPTX/debug-info.ll b/test/DebugInfo/NVPTX/debug-info.ll
index f80a8426286..d5dee4055f0 100644
--- a/test/DebugInfo/NVPTX/debug-info.ll
+++ b/test/DebugInfo/NVPTX/debug-info.ll
@@ -36,7 +36,6 @@
 ; CHECK: setp.ge.s32     %p{{.+}}, %r{{.+}}, %r{{.+}};
 ; CHECK: .loc [[DEBUG_INFO_CU]] 7 7
 ; CHECK: @%p{{.+}} bra   [[BB:.+]];
-; CHECK: .loc [[DEBUG_INFO_CU]] 8 13
 ; CHECK: ld.param.f32    %f{{.+}}, [{{.+}}];
 ; CHECK: ld.param.u64    %rd{{.+}}, [{{.+}}];
 ; CHECK: cvta.to.global.u64      %rd{{.+}}, %rd{{.+}};
@@ -44,6 +43,7 @@
 ; CHECK: cvta.to.global.u64      %rd{{.+}}, %rd{{.+}};
 ; CHECK: mul.wide.u32    %rd{{.+}}, %r{{.+}}, 4;
 ; CHECK: add.s64         %rd{{.+}}, %rd{{.+}}, %rd{{.+}};
+; CHECK: .loc [[DEBUG_INFO_CU]] 8 13
 ; CHECK: ld.global.f32   %f{{.+}}, [%rd{{.+}}];
 ; CHECK: add.s64         %rd{{.+}}, %rd{{.+}}, %rd{{.+}};
 ; CHECK: .loc [[DEBUG_INFO_CU]] 8 19
diff --git a/test/DebugInfo/X86/dwarf-no-source-loc.ll b/test/DebugInfo/X86/dwarf-no-source-loc.ll
index 19695ab126b..60d50a391a1 100644
--- a/test/DebugInfo/X86/dwarf-no-source-loc.ll
+++ b/test/DebugInfo/X86/dwarf-no-source-loc.ll
@@ -40,14 +40,15 @@ if.end:                                           ; preds = %if.then, %entry
   ret void, !dbg !14
 }
 
-; CHECK:      .loc 1 7 7 prologue_end
+; CHECK:      .loc 1 7 7
 ; CHECK-NOT:  .loc
-; CHECK:      # %bb.1
-; CHECK-NEXT: .file 2 "/tests{{[/\]+}}include.h"
-; CHECK-NEXT: .loc 2 20 5
+; CHECK:      .loc 1 0 7 is_stmt 0
 ; CHECK-NOT:  .loc
+; CHECK:      .loc 2 20 5 is_stmt 1
 ; CHECK:      .LBB0_2:
-; CHECK:      .loc 1 10 3
+; CHECK-NEXT: .loc 2 0 5 is_stmt 0
+; CHECK-NOT:  .loc
+; CHECK:      .loc 1 10 3 is_stmt 1
 ;
 ; DISABLE-NOT: .loc 1 0
 
diff --git a/test/DebugInfo/X86/dwarf-no-source-loc.mir b/test/DebugInfo/X86/dwarf-no-source-loc.mir
deleted file mode 100644
index f6ad6ee6d4c..00000000000
--- a/test/DebugInfo/X86/dwarf-no-source-loc.mir
+++ /dev/null
@@ -1,74 +0,0 @@
-# RUN: llc -o - %s -start-before=patchable-function -use-unknown-locations=Default | FileCheck %s --check-prefixes=CHECK,DEFAULT
-# RUN: llc -o - %s -start-before=patchable-function -use-unknown-locations=Enable | FileCheck %s --check-prefixes=CHECK,ENABLE
-# RUN: llc -o - %s -start-before=patchable-function -use-unknown-locations=Disable | FileCheck %s --check-prefixes=CHECK,DISABLE
---- |
-  target triple = "x86_64--"
-  
-  !0 = !DIFile(filename: "dwarf-no-source-loc.mir", directory: "/")
-  !1 = distinct !DICompileUnit(file: !0, language: DW_LANG_C, emissionKind: LineTablesOnly)
-  !2 = distinct !DISubprogram(name: "func", unit: !1)
-  !3 = !DILocation(line: 17, scope: !2)
-  !4 = !DILocation(line: 42, scope: !2)
-
-  !llvm.dbg.cu = !{!1}
-  !llvm.module.flags = !{!10, !11}
-  !10 = !{i32 2, !"Dwarf Version", i32 4}
-  !11 = !{i32 2, !"Debug Info Version", i32 3}
-  
-  define void @func() !dbg !2 {
-    unreachable
-  }
-...
----
-name: func
-body: |
-  bb.0:
-    NOOP
-    NOOP
-    $eax = MOV32ri 1, debug-location !3
-    ; CHECK-LABEL: bb.0
-    ; CHECK: nop
-    ; CHECK: nop
-    ; CHECK: .loc 1 17 0 prologue_end
-    ; CHECK: movl $1, %eax
-
-  bb.1:
-    NOOP
-    $ebx = MOV32ri 2, debug-location !4
-    ; CHECK-LABEL: bb.1
-    ; DEFAULT: .loc 1 42 0
-    ; ENABLE: .loc 1 0
-    ; DISABLE-NOT: .loc 1 0
-    ; CHECK: nop
-    ; ENABLE: .loc 1 42 0
-    ; CHECK: movl $2, %ebx
-
-  bb.2:
-    NOOP
-    ; CHECK-LABEL: bb.2
-    ; DEFAULT: .loc 1 0 0 is_stmt 0
-    ; ENABLE: .loc 1 0 0 is_stmt 0
-    ; DISABLE-NOT: .loc 1 0
-    ; CHECK: nop
-
-  bb.3:
-    NOOP
-    $ecx = MOV32ri 3, debug-location !3
-    ; CHECK-LABEL: bb.3
-    ; CHECK: nop
-    ; DEFAULT: .loc 1 17 0 is_stmt 1
-    ; ENABLE: .loc 1 17 0 is_stmt 1
-    ; DISABLE-NOT: .loc 1 0
-    ; CHECK: movl $3, %ecx
-
-  bb.4:
-    NOOP
-    $edx = MOV32ri 4, debug-location !4
-    ; CHECK: bb.4
-    ; DEFAULT: .loc 1 42 0
-    ; ENABLE: .loc 1 0 0 is_stmt 0
-    ; DISABLE-NOT: .loc 1 0
-    ; CHECK: nop
-    ; ENABLE: .loc 1 42 0 is_stmt 1
-    ; CHECK: movl $4, %edx
-...
-- 
GitLab


From bd755d4e272af07dbd5a65ab2d1acf4a3a42e510 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Thu, 11 Oct 2018 23:56:56 +0000
Subject: [PATCH 0087/1116] [DAGCombiner] rearrange extract_element+bitcast
 fold; NFC

I want to add another pattern here that includes scalar_to_vector,
so this makes that patch smaller. I was hoping to remove the
hasOneUse() check because it shouldn't be necessary for common
codegen, but an AMDGPU test has a comment suggesting that the
extra check makes things better on one of those targets.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344320 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 14 ++++++++------
 test/CodeGen/X86/extract-insert.ll       |  4 ++++
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 16834dc1a26..7ec5fac390b 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15499,13 +15499,15 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
     // converts.
   }
 
-  // extract_vector_elt (v2i32 (bitcast i64:x)), EltTrunc -> i32 (trunc i64:x)
-  bool isLE = DAG.getDataLayout().isLittleEndian();
-  unsigned EltTrunc = isLE ? 0 : VT.getVectorNumElements() - 1;
-  if (ConstEltNo && InVec.getOpcode() == ISD::BITCAST && InVec.hasOneUse() &&
-      ConstEltNo->getZExtValue() == EltTrunc && VT.isInteger()) {
+  if (ConstEltNo && InVec.getOpcode() == ISD::BITCAST) {
+    // The vector index of the LSBs of the source depend on the endian-ness.
+    bool IsLE = DAG.getDataLayout().isLittleEndian();
+
+    // extract_elt (v2i32 (bitcast i64:x)), BCTruncElt -> i32 (trunc i64:x)
+    unsigned BCTruncElt = IsLE ? 0 : VT.getVectorNumElements() - 1;
     SDValue BCSrc = InVec.getOperand(0);
-    if (BCSrc.getValueType().isScalarInteger())
+    if (InVec.hasOneUse() && ConstEltNo->getZExtValue() == BCTruncElt &&
+        VT.isInteger() && BCSrc.getValueType().isScalarInteger())
       return DAG.getNode(ISD::TRUNCATE, SDLoc(N), NVT, BCSrc);
   }
 
diff --git a/test/CodeGen/X86/extract-insert.ll b/test/CodeGen/X86/extract-insert.ll
index de8ee704b88..b3fb50de718 100644
--- a/test/CodeGen/X86/extract-insert.ll
+++ b/test/CodeGen/X86/extract-insert.ll
@@ -28,6 +28,10 @@ define i8 @extractelt_bitcast(i32 %x) nounwind {
   ret i8 %ext
 }
 
+; TODO: This should have folded to avoid vector ops, but the transform
+; is guarded by 'hasOneUse'. That limitation apparently makes some AMDGPU 
+; codegen better.
+
 define i8 @extractelt_bitcast_extra_use(i32 %x, <4 x i8>* %p) nounwind {
 ; X86-LABEL: extractelt_bitcast_extra_use:
 ; X86:       # %bb.0:
-- 
GitLab


From 3b7de9d1bb445d9f7652afe058e33a1ff3053036 Mon Sep 17 00:00:00 2001
From: Jordan Rupprecht <rupprecht@google.com>
Date: Fri, 12 Oct 2018 00:36:01 +0000
Subject: [PATCH 0088/1116] [llvm-objcopy] Add -F|--target compatibility

Summary:
This change adds support for the GNU --target flag, which sets both --input-target and --output-target.

GNU objcopy doesn't do any checking for whether both --target and --{input,output}-target are used, and so it allows both, e.g. "--target A --output-target B" is equivalent to "--input-target A --output-target B" since the later command line flag would override earlier ones. This may be error prone, so I chose to implement it as an error if both are used. I'm not sure if anyone is actually using both.

Reviewers: jakehehrlich, jhenderson, alexshap

Reviewed By: jakehehrlich, alexshap

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D53029

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344321 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../llvm-objcopy/input-output-target.test     | 22 +++++++++++++++++++
 tools/llvm-objcopy/CopyConfig.cpp             | 14 ++++++++++--
 tools/llvm-objcopy/ObjcopyOpts.td             |  4 ++++
 3 files changed, 38 insertions(+), 2 deletions(-)
 create mode 100644 test/tools/llvm-objcopy/input-output-target.test

diff --git a/test/tools/llvm-objcopy/input-output-target.test b/test/tools/llvm-objcopy/input-output-target.test
new file mode 100644
index 00000000000..e81770a239a
--- /dev/null
+++ b/test/tools/llvm-objcopy/input-output-target.test
@@ -0,0 +1,22 @@
+# RUN: echo abcd > %t.txt
+
+# Preserve input to verify it is not modified
+# RUN: cp %t.txt %t-copy.txt
+
+# -F <target> is equivalent to -I <target> -O <target>
+# RUN: llvm-objcopy -F binary -B i386:x86-64 %t.txt %t.2.txt
+# RUN: cmp %t-copy.txt %t.2.txt
+
+# --target <target> is equivalent to --input-target <target> --output-target <target>
+# RUN: llvm-objcopy --target binary -B i386:x86-64 %t.txt %t.3.txt
+# RUN: cmp %t-copy.txt %t.3.txt
+
+# TODO: check --target and --input-target/--output-target are incompatible
+# RUN: not llvm-objcopy --target binary --input-target binary -B i386:x86-64 \
+# RUN:     %t.txt %t.4.txt 2>&1 \
+# RUN:     | FileCheck %s --check-prefix=BAD-FLAG
+# RUN: not llvm-objcopy --target binary --output-target binary -B i386:x86-64 \
+# RUN:     %t.txt %t.4.txt 2>&1 \
+# RUN:     | FileCheck %s --check-prefix=BAD-FLAG
+
+# BAD-FLAG: --target cannot be used with --input-target or --output-target.
diff --git a/tools/llvm-objcopy/CopyConfig.cpp b/tools/llvm-objcopy/CopyConfig.cpp
index d814df10525..2c3551ba026 100644
--- a/tools/llvm-objcopy/CopyConfig.cpp
+++ b/tools/llvm-objcopy/CopyConfig.cpp
@@ -247,8 +247,18 @@ DriverConfig parseObjcopyOptions(ArrayRef<const char *> ArgsArr) {
   CopyConfig Config;
   Config.InputFilename = Positional[0];
   Config.OutputFilename = Positional[Positional.size() == 1 ? 0 : 1];
-  Config.InputFormat = InputArgs.getLastArgValue(OBJCOPY_input_target);
-  Config.OutputFormat = InputArgs.getLastArgValue(OBJCOPY_output_target);
+  if (InputArgs.hasArg(OBJCOPY_target) &&
+      (InputArgs.hasArg(OBJCOPY_input_target) ||
+       InputArgs.hasArg(OBJCOPY_output_target)))
+    error("--target cannot be used with --input-target or --output-target");
+
+  if (InputArgs.hasArg(OBJCOPY_target)) {
+    Config.InputFormat = InputArgs.getLastArgValue(OBJCOPY_target);
+    Config.OutputFormat = InputArgs.getLastArgValue(OBJCOPY_target);
+  } else {
+    Config.InputFormat = InputArgs.getLastArgValue(OBJCOPY_input_target);
+    Config.OutputFormat = InputArgs.getLastArgValue(OBJCOPY_output_target);
+  }
   if (Config.InputFormat == "binary") {
     auto BinaryArch = InputArgs.getLastArgValue(OBJCOPY_binary_architecture);
     if (BinaryArch.empty())
diff --git a/tools/llvm-objcopy/ObjcopyOpts.td b/tools/llvm-objcopy/ObjcopyOpts.td
index 18b270b7758..f6c8a959e8b 100644
--- a/tools/llvm-objcopy/ObjcopyOpts.td
+++ b/tools/llvm-objcopy/ObjcopyOpts.td
@@ -10,6 +10,10 @@ defm binary_architecture : Eq<"binary-architecture">,
                            HelpText<"Used when transforming an architecture-less format (such as binary) to another format">;
 def B : JoinedOrSeparate<["-"], "B">,
         Alias<binary_architecture>;
+defm target : Eq<"target">,
+              HelpText<"Format of the input and output file">,
+              Values<"binary">;
+def F : JoinedOrSeparate<[ "-" ], "F">, Alias<target>;
 defm input_target : Eq<"input-target">,
                     HelpText<"Format of the input file">,
                     Values<"binary">;
-- 
GitLab


From 441f8c5b1f48b72ed2b92ccb370b7e6fb73def30 Mon Sep 17 00:00:00 2001
From: Eugene Leviant <eleviant@accesssoftek.com>
Date: Fri, 12 Oct 2018 07:24:02 +0000
Subject: [PATCH 0089/1116] [ThinLTO] Don't import GV which contains
 blockaddress

Differential revision: https://reviews.llvm.org/D53139


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344325 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Analysis/ModuleSummaryAnalysis.cpp        | 19 ++++++++++++++++---
 lib/Transforms/IPO/FunctionImport.cpp         |  3 +--
 .../X86/Inputs/globals-import-blockaddr.ll    | 12 ++++++++++++
 test/ThinLTO/X86/globals-import-blockaddr.ll  | 18 ++++++++++++++++++
 4 files changed, 47 insertions(+), 5 deletions(-)
 create mode 100644 test/ThinLTO/X86/Inputs/globals-import-blockaddr.ll
 create mode 100644 test/ThinLTO/X86/globals-import-blockaddr.ll

diff --git a/lib/Analysis/ModuleSummaryAnalysis.cpp b/lib/Analysis/ModuleSummaryAnalysis.cpp
index bca40043fd9..3eb150becfa 100644
--- a/lib/Analysis/ModuleSummaryAnalysis.cpp
+++ b/lib/Analysis/ModuleSummaryAnalysis.cpp
@@ -74,9 +74,17 @@ cl::opt<FunctionSummary::ForceSummaryHotnessType, true> FSEC(
 // Walk through the operands of a given User via worklist iteration and populate
 // the set of GlobalValue references encountered. Invoked either on an
 // Instruction or a GlobalVariable (which walks its initializer).
-static void findRefEdges(ModuleSummaryIndex &Index, const User *CurUser,
+// Return true if any of the operands contains blockaddress. This is important
+// to know when computing summary for global var, because if global variable
+// references basic block address we can't import it separately from function
+// containing that basic block. For simplicity we currently don't import such
+// global vars at all. When importing function we aren't interested if any 
+// instruction in it takes an address of any basic block, because instruction
+// can only take an address of basic block located in the same function.
+static bool findRefEdges(ModuleSummaryIndex &Index, const User *CurUser,
                          SetVector<ValueInfo> &RefEdges,
                          SmallPtrSet<const User *, 8> &Visited) {
+  bool HasBlockAddress = false;
   SmallVector<const User *, 32> Worklist;
   Worklist.push_back(CurUser);
 
@@ -92,8 +100,10 @@ static void findRefEdges(ModuleSummaryIndex &Index, const User *CurUser,
       const User *Operand = dyn_cast<User>(OI);
       if (!Operand)
         continue;
-      if (isa<BlockAddress>(Operand))
+      if (isa<BlockAddress>(Operand)) {
+        HasBlockAddress = true;
         continue;
+      }
       if (auto *GV = dyn_cast<GlobalValue>(Operand)) {
         // We have a reference to a global value. This should be added to
         // the reference set unless it is a callee. Callees are handled
@@ -105,6 +115,7 @@ static void findRefEdges(ModuleSummaryIndex &Index, const User *CurUser,
       Worklist.push_back(Operand);
     }
   }
+  return HasBlockAddress;
 }
 
 static CalleeInfo::HotnessType getHotness(uint64_t ProfileCount,
@@ -369,7 +380,7 @@ computeVariableSummary(ModuleSummaryIndex &Index, const GlobalVariable &V,
                        DenseSet<GlobalValue::GUID> &CantBePromoted) {
   SetVector<ValueInfo> RefEdges;
   SmallPtrSet<const User *, 8> Visited;
-  findRefEdges(Index, &V, RefEdges, Visited);
+  bool HasBlockAddress = findRefEdges(Index, &V, RefEdges, Visited);
   bool NonRenamableLocal = isNonRenamableLocal(V);
   GlobalValueSummary::GVFlags Flags(V.getLinkage(), NonRenamableLocal,
                                     /* Live = */ false, V.isDSOLocal());
@@ -377,6 +388,8 @@ computeVariableSummary(ModuleSummaryIndex &Index, const GlobalVariable &V,
       llvm::make_unique<GlobalVarSummary>(Flags, RefEdges.takeVector());
   if (NonRenamableLocal)
     CantBePromoted.insert(V.getGUID());
+  if (HasBlockAddress)
+    GVarSummary->setNotEligibleToImport();
   Index.addGlobalValueSummary(V, std::move(GVarSummary));
 }
 
diff --git a/lib/Transforms/IPO/FunctionImport.cpp b/lib/Transforms/IPO/FunctionImport.cpp
index 8f8c85e1b18..366ac2b95f4 100644
--- a/lib/Transforms/IPO/FunctionImport.cpp
+++ b/lib/Transforms/IPO/FunctionImport.cpp
@@ -278,8 +278,7 @@ static void computeImportForReferencedGlobals(
 
     for (auto &RefSummary : VI.getSummaryList())
       if (RefSummary->getSummaryKind() == GlobalValueSummary::GlobalVarKind &&
-          // Don't try to import regular LTO summaries added to dummy module.
-          !RefSummary->modulePath().empty() &&
+          !RefSummary->notEligibleToImport() &&
           !GlobalValue::isInterposableLinkage(RefSummary->linkage()) &&
           RefSummary->refs().empty()) {
         ImportList[RefSummary->modulePath()].insert(VI.getGUID());
diff --git a/test/ThinLTO/X86/Inputs/globals-import-blockaddr.ll b/test/ThinLTO/X86/Inputs/globals-import-blockaddr.ll
new file mode 100644
index 00000000000..fe1fa70ee83
--- /dev/null
+++ b/test/ThinLTO/X86/Inputs/globals-import-blockaddr.ll
@@ -0,0 +1,12 @@
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@label_addr = internal constant [1 x i8*] [i8* blockaddress(@foo, %lb)], align 8
+
+; Function Attrs: noinline norecurse nounwind optnone uwtable
+define dso_local [1 x i8*]* @foo() {
+  br label %lb
+
+lb:
+  ret [1 x i8*]* @label_addr
+}
diff --git a/test/ThinLTO/X86/globals-import-blockaddr.ll b/test/ThinLTO/X86/globals-import-blockaddr.ll
new file mode 100644
index 00000000000..d4ed674030a
--- /dev/null
+++ b/test/ThinLTO/X86/globals-import-blockaddr.ll
@@ -0,0 +1,18 @@
+; RUN: opt -module-summary %s -o %t1.bc
+; RUN: opt -module-summary %p/Inputs/globals-import-blockaddr.ll -o %t2.bc
+; RUN: llvm-lto2 run -save-temps %t1.bc -r=%t1.bc,foo,l -r=%t1.bc,main,pl %t2.bc -r=%t2.bc,foo,pl -o %t3
+; RUN: llvm-dis %t3.1.3.import.bc -o - | FileCheck %s
+
+; Verify that we haven't imported GV containing blockaddress
+; CHECK: @label_addr.llvm.0 = external hidden constant
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare dso_local [1 x i8*]* @foo();
+
+define dso_local i32 @main() {
+  %p = call [1 x i8*]* @foo()
+  %v = ptrtoint [1 x i8*]* %p to i32
+  ret i32 %v
+}
-- 
GitLab


From edac9f00e9e101be1520a77ebd51e4f274b020b3 Mon Sep 17 00:00:00 2001
From: Stefan Maksimovic <stefan.maksimovic@mips.com>
Date: Fri, 12 Oct 2018 08:18:38 +0000
Subject: [PATCH 0090/1116] [mips] Mark fmaxl as a long double emulation
 routine

Failure was discovered upon running
projects/compiler-rt/test/builtins/Unit/divtc3_test.c
in a stage2 compiler build.

When compiling projects/compiler-rt/lib/builtins/divtc3.c,
a call to fmaxl within the divtc3 implementation had its
return values read from registers $2 and $3 instead of $f0 and $f2.
Include fmaxl in the list of long double emulation routines
to have its return value correctly interpreted as f128.

Almost exact issue here: https://reviews.llvm.org/D17760

Differential Revision: https://reviews.llvm.org/D52649


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344326 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/Mips/MipsCCState.cpp       |  8 ++++----
 test/CodeGen/Mips/cconv/fmaxl_call.ll | 25 +++++++++++++++++++++++++
 2 files changed, 29 insertions(+), 4 deletions(-)
 create mode 100644 test/CodeGen/Mips/cconv/fmaxl_call.ll

diff --git a/lib/Target/Mips/MipsCCState.cpp b/lib/Target/Mips/MipsCCState.cpp
index 81a1cced93b..90cb3f437bd 100644
--- a/lib/Target/Mips/MipsCCState.cpp
+++ b/lib/Target/Mips/MipsCCState.cpp
@@ -24,10 +24,10 @@ static bool isF128SoftLibCall(const char *CallSym) {
       "__lttf2",       "__multf3",     "__netf2",       "__powitf2",
       "__subtf3",      "__trunctfdf2", "__trunctfsf2",  "__unordtf2",
       "ceill",         "copysignl",    "cosl",          "exp2l",
-      "expl",          "floorl",       "fmal",          "fmodl",
-      "log10l",        "log2l",        "logl",          "nearbyintl",
-      "powl",          "rintl",        "roundl",        "sinl",
-      "sqrtl",         "truncl"};
+      "expl",          "floorl",       "fmal",          "fmaxl",
+      "fmodl",         "log10l",       "log2l",         "logl",
+      "nearbyintl",    "powl",         "rintl",         "roundl",
+      "sinl",          "sqrtl",        "truncl"};
 
   // Check that LibCalls is sorted alphabetically.
   auto Comp = [](const char *S1, const char *S2) { return strcmp(S1, S2) < 0; };
diff --git a/test/CodeGen/Mips/cconv/fmaxl_call.ll b/test/CodeGen/Mips/cconv/fmaxl_call.ll
new file mode 100644
index 00000000000..0e3078edae4
--- /dev/null
+++ b/test/CodeGen/Mips/cconv/fmaxl_call.ll
@@ -0,0 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=mips64el-unknown-linux-gnu -mcpu=mips64 < %s | FileCheck %s
+
+define fp128 @call_fmaxl(fp128 %a, fp128 %b) {
+; CHECK-LABEL: call_fmaxl:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    daddiu $sp, $sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    sd $ra, 8($sp) # 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_offset 31, -8
+; CHECK-NEXT:    jal fmaxl
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    mov.d $f12, $f0
+; CHECK-NEXT:    jal f
+; CHECK-NEXT:    mov.d $f13, $f2
+; CHECK-NEXT:    ld $ra, 8($sp) # 8-byte Folded Reload
+; CHECK-NEXT:    jr $ra
+; CHECK-NEXT:    daddiu $sp, $sp, 16
+    %1 = call fp128 @llvm.maxnum.f128(fp128 %a, fp128 %b)
+    %2 = call fp128 @f(fp128 %1)
+    ret fp128 %2
+}
+
+declare fp128 @llvm.maxnum.f128(fp128, fp128)
+declare fp128 @f(fp128)
-- 
GitLab


From 66c3f51a52733500b8f08dfc2beabe845c57d467 Mon Sep 17 00:00:00 2001
From: Tim Northover <tnorthover@apple.com>
Date: Fri, 12 Oct 2018 09:01:59 +0000
Subject: [PATCH 0091/1116] SCCP: avoid caching DenseMap entry that might be
 invalidated.

Later calls to getValueState might insert entries into the ValueState map and
cause reallocation, invalidating a reference.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344327 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/SCCP.cpp                |  8 ++--
 test/Transforms/SCCP/latticeval-invalidate.ll | 41 +++++++++++++++++++
 2 files changed, 46 insertions(+), 3 deletions(-)
 create mode 100644 test/Transforms/SCCP/latticeval-invalidate.ll

diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp
index d024e03b80a..7196bc82edc 100644
--- a/lib/Transforms/Scalar/SCCP.cpp
+++ b/lib/Transforms/Scalar/SCCP.cpp
@@ -1017,8 +1017,9 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) {
 
 // Handle ICmpInst instruction.
 void SCCPSolver::visitCmpInst(CmpInst &I) {
-  LatticeVal &IV = ValueState[&I];
-  if (IV.isOverdefined()) return;
+  // Do not cache this lookup, getValueState calls later in the function might
+  // invalidate the reference.
+  if (ValueState[&I].isOverdefined()) return;
 
   Value *Op1 = I.getOperand(0);
   Value *Op2 = I.getOperand(1);
@@ -1046,7 +1047,8 @@ void SCCPSolver::visitCmpInst(CmpInst &I) {
   }
 
   // If operands are still unknown, wait for it to resolve.
-  if (!V1State.isOverdefined() && !V2State.isOverdefined() && !IV.isConstant())
+  if (!V1State.isOverdefined() && !V2State.isOverdefined() &&
+      !ValueState[&I].isConstant())
     return;
 
   markOverdefined(&I);
diff --git a/test/Transforms/SCCP/latticeval-invalidate.ll b/test/Transforms/SCCP/latticeval-invalidate.ll
new file mode 100644
index 00000000000..19ea425312f
--- /dev/null
+++ b/test/Transforms/SCCP/latticeval-invalidate.ll
@@ -0,0 +1,41 @@
+; RUN: opt -S -sccp %s
+
+@A = external constant i32
+
+define void @test1() {
+BB4:
+  %A20 = alloca i1
+  %A15 = alloca i64
+  %A7 = alloca i64
+  %A3 = alloca i32**
+  %P = getelementptr i32, i32* @A, i32 0
+  %B = ptrtoint i32* %P to i64
+  %B8 = shl i64 %B, 9223372036854775807
+  %G10 = getelementptr i32*, i32** undef, i64 %B
+  %B10 = urem i64 %B, %B8
+  %B12 = shl i64 %B, %B
+  %BB = and i64 %B, %B8
+  %B1 = xor i64 %B, %B
+  %B23 = lshr i64 %B8, undef
+  %C5 = icmp uge i64 %B, %B10
+  %C17 = fcmp ord double 4.940660e-324, 0x7FEFFFFFFFFFFFFF
+  %C2 = icmp uge i1 %C17, false
+  %G = getelementptr i32, i32* %P, i1 %C17
+  %X = select i1 false, i712 0, i712 1
+  %C4 = icmp ule i1 true, false
+  %B3 = xor i1 %C17, %C2
+  %C33 = icmp slt i1 false, %C5
+  %B15 = sub i64 %B8, %B23
+  %C18 = icmp slt i64 undef, %BB
+  %G29 = getelementptr i32**, i32*** undef, i64 %B15
+  %C35 = icmp eq i1 %C17, undef
+  %C31 = icmp ult i1 %C35, %C5
+  %C29 = icmp sle i1 true, %C5
+  %C16 = icmp ne i16 -1, -32768
+  %A24 = alloca i1
+  %A21 = alloca i1
+  %A25 = alloca i32**
+  %C7 = icmp ule i1 %C4, %B3
+  %C14 = icmp slt i64 %B8, 0
+  ret void
+}
-- 
GitLab


From 39e3cf3d167b7b607e7490409984809a36649f38 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 12 Oct 2018 10:20:16 +0000
Subject: [PATCH 0092/1116] [X86] Ignore float/double non-temporal loads
 (PR39256)

Scalar non-temporal loads were asserting instead of just being ignored.

Reduced from https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=10895

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344331 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelDAGToDAG.cpp    |  3 +++
 test/CodeGen/X86/nontemporal-loads.ll | 32 +++++++++++++++++++++++++++
 2 files changed, 35 insertions(+)

diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index f8ec4a2bcfc..ede1c0bd7df 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -441,6 +441,9 @@ namespace {
 
       switch (StoreSize) {
       default: llvm_unreachable("Unsupported store size");
+      case 4:
+      case 8:
+        return false;
       case 16:
         return Subtarget->hasSSE41();
       case 32:
diff --git a/test/CodeGen/X86/nontemporal-loads.ll b/test/CodeGen/X86/nontemporal-loads.ll
index 37ff7115ac9..56428979568 100644
--- a/test/CodeGen/X86/nontemporal-loads.ll
+++ b/test/CodeGen/X86/nontemporal-loads.ll
@@ -1911,4 +1911,36 @@ define <16 x i32> @test_masked_v16i32(i8 * %addr, <16 x i32> %old, <16 x i32> %m
   ret <16 x i32>%res
 }
 
+; Reduced from https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=10895
+define i32 @PR39256(float* %ptr) {
+; SSE-LABEL: PR39256:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    ucomiss {{.*}}(%rip), %xmm0
+; SSE-NEXT:    setb (%rax)
+; SSE-NEXT:    movl $-2147483648, %eax # imm = 0x80000000
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: PR39256:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    vucomiss {{.*}}(%rip), %xmm0
+; AVX-NEXT:    setb (%rax)
+; AVX-NEXT:    movl $-2147483648, %eax # imm = 0x80000000
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: PR39256:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT:    vucomiss {{.*}}(%rip), %xmm0
+; AVX512-NEXT:    setb (%rax)
+; AVX512-NEXT:    movl $-2147483648, %eax # imm = 0x80000000
+; AVX512-NEXT:    retq
+entry:
+  %l = load float, float* %ptr, !nontemporal !1
+  %C = fcmp ult float %l, 0x36A0000000000000
+  store i1 %C, i1* undef
+  ret i32 -2147483648
+}
+
 !1 = !{i32 1}
-- 
GitLab


From 0b50ad3f83305d5904d9f0ffa94424134a7b6d82 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 12 Oct 2018 10:26:59 +0000
Subject: [PATCH 0093/1116] [X86][AVX] Add examples of shuffles that can be
 reduced to a cross-lane shuffle followed by a in-lane permute

Suitable for lowering by D53148

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344332 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/vector-shuffle-256-v16.ll | 26 ++++++++++++++++++
 test/CodeGen/X86/vector-shuffle-256-v32.ll | 31 ++++++++++++++++++++++
 2 files changed, 57 insertions(+)

diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll
index de587beadc1..90970f15fea 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -4052,6 +4052,32 @@ define <16 x i16> @shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_2
   ret <16 x i16> %shuffle
 }
 
+define <16 x i16> @shuffle_v16i16_00_02_04_06_01_03_05_07_31_30_29_28_27_26_25_24(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_02_04_06_01_03_05_07_31_30_29_28_27_26_25_24:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_02_04_06_01_03_05_07_31_30_29_28_27_26_25_24:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15]
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512VL-LABEL: shuffle_v16i16_00_02_04_06_01_03_05_07_31_30_29_28_27_26_25_24:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,1,3,5,7,31,30,29,28,27,26,25,24]
+; AVX512VL-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
+; AVX512VL-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24>
+  ret <16 x i16> %shuffle
+}
+
 define <16 x i16> @shuffle_v16i16_uu_06_uu_uu_17_18_19_uu_uu_14_uu_uu_25_26_27_uu(<16 x i16> %a, <16 x i16> %b) {
 ; AVX1-LABEL: shuffle_v16i16_uu_06_uu_uu_17_18_19_uu_uu_14_uu_uu_25_26_27_uu:
 ; AVX1:       # %bb.0:
diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll
index 3e36b4a3b6a..5e9f30a727d 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -2495,6 +2495,37 @@ define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_
   ret <32 x i8> %shuffle
 }
 
+define <32 x i8> @shuffle_v32i8_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00_32_34_36_38_40_42_44_46_33_35_37_39_41_43_45_47(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00_32_34_36_38_40_42_44_46_33_35_37_39_41_43_45_47:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00_32_34_36_38_40_42_44_46_33_35_37_39_41_43_45_47:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15]
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512VLBW-LABEL: shuffle_v32i8_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00_32_34_36_38_40_42_44_46_33_35_37_39_41_43_45_47:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; AVX512VLBW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15]
+; AVX512VLBW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00_32_34_36_38_40_42_44_46_33_35_37_39_41_43_45_47:
+; AVX512VLVBMI:       # %bb.0:
+; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,32,34,36,38,40,42,44,46,33,35,37,39,41,43,45,47]
+; AVX512VLVBMI-NEXT:    vpermt2b %ymm1, %ymm2, %ymm0
+; AVX512VLVBMI-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47>
+  ret <32 x i8> %shuffle
+}
+
 define <32 x i8> @shuffle_v32i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_32_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_48(<32 x i8> %a) {
 ; AVX1-LABEL: shuffle_v32i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_32_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_48:
 ; AVX1:       # %bb.0:
-- 
GitLab


From f39b0d9784b8e8acf20d7046101eb2e229b20a8a Mon Sep 17 00:00:00 2001
From: Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>
Date: Fri, 12 Oct 2018 11:23:04 +0000
Subject: [PATCH 0094/1116] [tblgen][llvm-mca] Add the ability to describe move
 elimination candidates via tablegen.

This patch adds the ability to identify instructions that are "move elimination
candidates". It also allows scheduling models to describe processor register
files that allow move elimination.

A move elimination candidate is an instruction that can be eliminated at
register renaming stage.
Each subtarget can specify which instructions are move elimination candidates
with the help of tablegen class "IsOptimizableRegisterMove" (see
llvm/Target/TargetInstrPredicate.td).

For example, on X86, BtVer2 allows both GPR and MMX/SSE moves to be eliminated.
The definition of 'IsOptimizableRegisterMove' for BtVer2 looks like this:

```
def : IsOptimizableRegisterMove<[
  InstructionEquivalenceClass<[
    // GPR variants.
    MOV32rr, MOV64rr,

    // MMX variants.
    MMX_MOVQ64rr,

    // SSE variants.
    MOVAPSrr, MOVUPSrr,
    MOVAPDrr, MOVUPDrr,
    MOVDQArr, MOVDQUrr,

    // AVX variants.
    VMOVAPSrr, VMOVUPSrr,
    VMOVAPDrr, VMOVUPDrr,
    VMOVDQArr, VMOVDQUrr
  ], CheckNot<CheckSameRegOperand<0, 1>> >
]>;
```

Definitions of IsOptimizableRegisterMove from processor models of a same
Target are processed by the SubtargetEmitter to auto-generate a target-specific
override for each of the following predicate methods:

```
bool TargetSubtargetInfo::isOptimizableRegisterMove(const MachineInstr *MI)
const;
bool MCInstrAnalysis::isOptimizableRegisterMove(const MCInst &MI, unsigned
CPUID) const;
```

By default, those methods return false (i.e. conservatively assume that there
are no move elimination candidates).

Tablegen class RegisterFile has been extended with the following information:
 - The set of register classes that allow move elimination.
 - Maxium number of moves that can be eliminated every cycle.
 - Whether move elimination is restricted to moves from registers that are
   known to be zero.

This patch is structured in three part:

A first part (which is mostly boilerplate) adds the new
'isOptimizableRegisterMove' target hooks, and extends existing register file
descriptors in MC by introducing new fields to describe properties related to
move elimination.

A second part, uses the new tablegen constructs to describe move elimination in
the BtVer2 scheduling model.

A third part, teaches llm-mca how to query the new 'isOptimizableRegisterMove'
hook to mark instructions that are candidates for move elimination. It also
teaches class RegisterFile how to describe constraints on move elimination at
PRF granularity.

llvm-mca tests for btver2 show differences before/after this patch.

Differential Revision: https://reviews.llvm.org/D53134


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344334 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/TargetSubtargetInfo.h    |  13 +++
 include/llvm/MC/MCInstrAnalysis.h             |  11 ++
 include/llvm/MC/MCSchedule.h                  |   7 ++
 include/llvm/Target/TargetInstrPredicate.td   |   8 +-
 include/llvm/Target/TargetSchedule.td         |  30 ++++-
 lib/Target/X86/X86ScheduleBtVer2.td           |  34 +++++-
 .../X86/BtVer2/reg-move-elimination-1.s       |  24 ++--
 .../X86/BtVer2/reg-move-elimination-2.s       | 104 +++++++++---------
 .../X86/BtVer2/reg-move-elimination-3.s       |  86 +++++++--------
 .../X86/BtVer2/reg-move-elimination-4.s       |  67 ++++++-----
 .../X86/BtVer2/reg-move-elimination-5.s       |  67 ++++++-----
 .../lib/HardwareUnits/RegisterFile.cpp        |  14 ++-
 tools/llvm-mca/lib/InstrBuilder.cpp           |   2 +
 utils/TableGen/CodeGenSchedule.cpp            |  14 ++-
 utils/TableGen/CodeGenSchedule.h              |  15 ++-
 utils/TableGen/SubtargetEmitter.cpp           |  14 ++-
 16 files changed, 315 insertions(+), 195 deletions(-)

diff --git a/include/llvm/CodeGen/TargetSubtargetInfo.h b/include/llvm/CodeGen/TargetSubtargetInfo.h
index e28673de225..968e4c4b810 100644
--- a/include/llvm/CodeGen/TargetSubtargetInfo.h
+++ b/include/llvm/CodeGen/TargetSubtargetInfo.h
@@ -169,6 +169,19 @@ public:
     return isZeroIdiom(MI, Mask);
   }
 
+  /// Returns true if MI is a candidate for move elimination.
+  ///
+  /// A candidate for move elimination may be optimized out at register renaming
+  /// stage. Subtargets can specify the set of optimizable moves by
+  /// instantiating tablegen class `IsOptimizableRegisterMove` (see
+  /// llvm/Target/TargetInstrPredicate.td).
+  ///
+  /// SubtargetEmitter is responsible for processing all the definitions of class
+  /// IsOptimizableRegisterMove, and auto-generate an override for this method.
+  virtual bool isOptimizableRegisterMove(const MachineInstr *MI) const {
+    return false;
+  }
+
   /// True if the subtarget should run MachineScheduler after aggressive
   /// coalescing.
   ///
diff --git a/include/llvm/MC/MCInstrAnalysis.h b/include/llvm/MC/MCInstrAnalysis.h
index 950a1afeef5..200f10f7d64 100644
--- a/include/llvm/MC/MCInstrAnalysis.h
+++ b/include/llvm/MC/MCInstrAnalysis.h
@@ -136,6 +136,17 @@ public:
     return isZeroIdiom(MI, Mask, CPUID);
   }
 
+  /// Returns true if MI is a candidate for move elimination.
+  ///
+  /// Different subtargets may apply different constraints to optimizable
+  /// register moves. For example, on most X86 subtargets, a candidate for move
+  /// elimination cannot specify the same register for both source and
+  /// destination.
+  virtual bool isOptimizableRegisterMove(const MCInst &MI,
+                                         unsigned CPUID) const {
+    return false;
+  }
+
   /// Given a branch instruction try to get the address the branch
   /// targets. Return true on success, and the address in Target.
   virtual bool
diff --git a/include/llvm/MC/MCSchedule.h b/include/llvm/MC/MCSchedule.h
index 9f53a468903..8990c2e3c0d 100644
--- a/include/llvm/MC/MCSchedule.h
+++ b/include/llvm/MC/MCSchedule.h
@@ -142,6 +142,7 @@ struct MCSchedClassDesc {
 struct MCRegisterCostEntry {
   unsigned RegisterClassID;
   unsigned Cost;
+  bool AllowMoveElimination;
 };
 
 /// A register file descriptor.
@@ -159,6 +160,12 @@ struct MCRegisterFileDesc {
   uint16_t NumRegisterCostEntries;
   // Index of the first cost entry in MCExtraProcessorInfo::RegisterCostTable.
   uint16_t RegisterCostEntryIdx;
+  // A value of zero means: there is no limit in the number of moves that can be
+  // eliminated every cycle.
+  uint16_t MaxMovesEliminatedPerCycle;
+  // Ture if this register file only knows how to optimize register moves from
+  // known zero registers.
+  bool AllowZeroMoveEliminationOnly;
 };
 
 /// Provide extra details about the machine processor.
diff --git a/include/llvm/Target/TargetInstrPredicate.td b/include/llvm/Target/TargetInstrPredicate.td
index c4b14eba776..f70af259603 100644
--- a/include/llvm/Target/TargetInstrPredicate.td
+++ b/include/llvm/Target/TargetInstrPredicate.td
@@ -313,7 +313,7 @@ class STIPredicate<STIPredicateDecl declaration,
 }
 
 // Convenience classes and definitions used by processor scheduling models to
-// describe dependency breaking instructions.
+// describe dependency breaking instructions and move elimination candidates.
 let UpdatesOpcodeMask = 1 in {
 
 def IsZeroIdiomDecl : STIPredicateDecl<"isZeroIdiom">;
@@ -323,8 +323,14 @@ def IsDepBreakingDecl : STIPredicateDecl<"isDependencyBreaking">;
 
 } // UpdatesOpcodeMask
 
+def IsOptimizableRegisterMoveDecl
+    : STIPredicateDecl<"isOptimizableRegisterMove">;
+
 class IsZeroIdiomFunction<list<DepBreakingClass> classes>
     : STIPredicate<IsZeroIdiomDecl, classes>;
 
 class IsDepBreakingFunction<list<DepBreakingClass> classes>
     : STIPredicate<IsDepBreakingDecl, classes>;
+
+class IsOptimizableRegisterMove<list<InstructionEquivalenceClass> classes>
+    : STIPredicate<IsOptimizableRegisterMoveDecl, classes>;
diff --git a/include/llvm/Target/TargetSchedule.td b/include/llvm/Target/TargetSchedule.td
index 7d7ce2dabe0..77b1927f932 100644
--- a/include/llvm/Target/TargetSchedule.td
+++ b/include/llvm/Target/TargetSchedule.td
@@ -460,6 +460,10 @@ class SchedAlias<SchedReadWrite match, SchedReadWrite alias> {
 //  - The number of physical registers which can be used for register renaming
 //    purpose.
 //  - The cost of a register rename.
+//  - The set of registers that allow move elimination.
+//  - The maximum number of moves that can be eliminated every cycle.
+//  - Whether move elimination is limited to register moves whose input
+//    is known to be zero.
 //
 // The cost of a rename is the number of physical registers allocated by the
 // register alias table to map the new definition. By default, register can be
@@ -506,11 +510,35 @@ class SchedAlias<SchedReadWrite match, SchedReadWrite alias> {
 // partial write is combined with the previous super-register definition.  We
 // should add support for these cases, and correctly model merge problems with
 // partial register accesses.
+//
+// Field MaxMovesEliminatedPerCycle specifies how many moves can be eliminated
+// every cycle. A default value of zero for that field means: there is no limit
+// to the number of moves that can be eliminated by this register file.
+//
+// An instruction MI is a candidate for move elimination if a call to
+// method TargetSubtargetInfo::isOptimizableRegisterMove(MI) returns true (see
+// llvm/CodeGen/TargetSubtargetInfo.h, and llvm/MC/MCInstrAnalysis.h).
+//
+// Subtargets can instantiate tablegen class IsOptimizableRegisterMove (see
+// llvm/Target/TargetInstrPredicate.td) to customize the set of move elimination
+// candidates. By default, no instruction is a valid move elimination candidate.
+//
+// A register move MI is eliminated only if:
+//  - MI is a move elimination candidate.
+//  - The destination register is from a register class that allows move
+//    elimination (see field `AllowMoveElimination` below).
+//  - Constraints on the move kind, and the maximum number of moves that can be
+//    eliminated per cycle are all met.
+
 class RegisterFile<int numPhysRegs, list<RegisterClass> Classes = [],
-                   list<int> Costs = []> {
+                   list<int> Costs = [], list<bit> AllowMoveElim = [],
+                   int MaxMoveElimPerCy = 0, bit AllowZeroMoveElimOnly = 0> {
   list<RegisterClass> RegClasses = Classes;
   list<int> RegCosts = Costs;
+  list<bit> AllowMoveElimination = AllowMoveElim;
   int NumPhysRegs = numPhysRegs;
+  int MaxMovesEliminatedPerCycle = MaxMoveElimPerCy;
+  bit AllowZeroMoveEliminationOnly = AllowZeroMoveElimOnly;
   SchedMachineModel SchedModel = ?;
 }
 
diff --git a/lib/Target/X86/X86ScheduleBtVer2.td b/lib/Target/X86/X86ScheduleBtVer2.td
index 2c1a4b6c7f5..33a6b01546d 100644
--- a/lib/Target/X86/X86ScheduleBtVer2.td
+++ b/lib/Target/X86/X86ScheduleBtVer2.td
@@ -48,12 +48,22 @@ def JFPU1 : ProcResource<1>; // Vector/FPU Pipe1: VALU1/STC/FPM
 // part of it.
 // Reference: Section 21.10 "AMD Bobcat and Jaguar pipeline: Partial register
 // access" - Agner Fog's "microarchitecture.pdf".
-def JIntegerPRF : RegisterFile<64, [GR64, CCR]>;
+def JIntegerPRF : RegisterFile<64, [GR64, CCR], [1, 1], [1, 0],
+                               0,  // Max moves that can be eliminated per cycle.
+                               1>; // Restrict move elimination to zero regs.
 
 // The Jaguar FP Retire Queue renames SIMD and FP uOps onto a pool of 72 SSE
 // registers. Operations on 256-bit data types are cracked into two COPs.
 // Reference: www.realworldtech.com/jaguar/4/
-def JFpuPRF: RegisterFile<72, [VR64, VR128, VR256], [1, 1, 2]>;
+
+// The PRF in the floating point unit can eliminate a move from a MMX or SSE
+// register that is know to be zero (i.e. it has been zeroed using a zero-idiom
+// dependency breaking instruction, or via VZEROALL).
+// Reference: Section 21.8 "AMD Bobcat and Jaguar pipeline: Dependency-breaking
+// instructions" - Agner Fog's "microarchitecture.pdf"
+def JFpuPRF: RegisterFile<72, [VR64, VR128, VR256], [1, 1, 2], [1, 1, 0],
+                          0,  // Max moves that can be eliminated per cycle.
+                          1>; // Restrict move elimination to zero regs.
 
 // The retire control unit (RCU) can track up to 64 macro-ops in-flight. It can
 // retire up to two macro-ops per cycle.
@@ -805,4 +815,24 @@ def : IsDepBreakingFunction<[
   ], ZeroIdiomPredicate>
 ]>;
 
+def : IsOptimizableRegisterMove<[
+  InstructionEquivalenceClass<[
+    // GPR variants.
+    MOV32rr, MOV64rr,
+
+    // MMX variants.
+    MMX_MOVQ64rr,
+
+    // SSE variants.
+    MOVAPSrr, MOVUPSrr,
+    MOVAPDrr, MOVUPDrr,
+    MOVDQArr, MOVDQUrr,
+
+    // AVX variants.
+    VMOVAPSrr, VMOVUPSrr,
+    VMOVAPDrr, VMOVUPDrr,
+    VMOVDQArr, VMOVDQUrr
+  ], TruePred >
+]>;
+
 } // SchedModel
diff --git a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-1.s b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-1.s
index d2588bef30e..3b38173ebca 100644
--- a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-1.s
+++ b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-1.s
@@ -32,13 +32,13 @@ vaddps %xmm1, %xmm1, %xmm2
 # CHECK-NEXT:  1      3     1.00                        vaddps	%xmm1, %xmm1, %xmm2
 
 # CHECK:      Register File statistics:
-# CHECK-NEXT: Total number of mappings created:    6
-# CHECK-NEXT: Max number of mappings used:         5
+# CHECK-NEXT: Total number of mappings created:    3
+# CHECK-NEXT: Max number of mappings used:         3
 
 # CHECK:      *  Register File #1 -- JFpuPRF:
 # CHECK-NEXT:    Number of physical registers:     72
-# CHECK-NEXT:    Total number of mappings created: 6
-# CHECK-NEXT:    Max number of mappings used:      5
+# CHECK-NEXT:    Total number of mappings created: 3
+# CHECK-NEXT:    Max number of mappings used:      3
 
 # CHECK:      *  Register File #2 -- JIntegerPRF:
 # CHECK-NEXT:    Number of physical registers:     64
@@ -63,25 +63,25 @@ vaddps %xmm1, %xmm1, %xmm2
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]
-# CHECK-NEXT:  -      -      -     1.00   1.00   1.00   1.00    -      -      -      -      -      -      -
+# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]   Instructions:
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     vxorps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT:  -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     vmovaps	%xmm0, %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     vmovaps	%xmm0, %xmm1
 # CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     vaddps	%xmm1, %xmm1, %xmm2
 
 # CHECK:      Timeline view:
 # CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DR   .   .   vxorps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT: [0,1]     DeER .   .   vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: [0,1]     DR   .   .   vmovaps	%xmm0, %xmm1
 # CHECK-NEXT: [0,2]     .DeeeER  .   vaddps	%xmm1, %xmm1, %xmm2
 # CHECK-NEXT: [1,0]     .D----R  .   vxorps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT: [1,1]     . DeE--R .   vmovaps	%xmm0, %xmm1
-# CHECK-NEXT: [1,2]     . D=eeeER.   vaddps	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT: [1,1]     . D----R .   vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: [1,2]     . DeeeER .   vaddps	%xmm1, %xmm1, %xmm2
 # CHECK-NEXT: [2,0]     .  D----R.   vxorps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT: [2,1]     .  DeE---R   vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: [2,1]     .  D----R.   vmovaps	%xmm0, %xmm1
 # CHECK-NEXT: [2,2]     .   DeeeER   vaddps	%xmm1, %xmm1, %xmm2
 
 # CHECK:      Average Wait times (based on the timeline view):
@@ -92,5 +92,5 @@ vaddps %xmm1, %xmm1, %xmm2
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     3     0.0    0.0    2.7       vxorps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT: 1.     3     1.0    1.0    1.7       vmovaps	%xmm0, %xmm1
-# CHECK-NEXT: 2.     3     1.3    0.0    0.0       vaddps	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT: 1.     3     0.0    0.0    2.7       vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: 2.     3     1.0    1.0    0.0       vaddps	%xmm1, %xmm1, %xmm2
diff --git a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-2.s b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-2.s
index 33cd3972194..096fe6c5a8f 100644
--- a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-2.s
+++ b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-2.s
@@ -14,12 +14,12 @@ movdqu %xmm5, %xmm0
 
 # CHECK:      Iterations:        3
 # CHECK-NEXT: Instructions:      27
-# CHECK-NEXT: Total Cycles:      19
+# CHECK-NEXT: Total Cycles:      15
 # CHECK-NEXT: Total uOps:        27
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    1.42
-# CHECK-NEXT: IPC:               1.42
+# CHECK-NEXT: uOps Per Cycle:    1.80
+# CHECK-NEXT: IPC:               1.80
 # CHECK-NEXT: Block RThroughput: 4.5
 
 # CHECK:      Instruction Info:
@@ -42,13 +42,13 @@ movdqu %xmm5, %xmm0
 # CHECK-NEXT:  1      1     0.50                        movdqu	%xmm5, %xmm0
 
 # CHECK:      Register File statistics:
-# CHECK-NEXT: Total number of mappings created:    21
-# CHECK-NEXT: Max number of mappings used:         8
+# CHECK-NEXT: Total number of mappings created:    0
+# CHECK-NEXT: Max number of mappings used:         0
 
 # CHECK:      *  Register File #1 -- JFpuPRF:
 # CHECK-NEXT:    Number of physical registers:     72
-# CHECK-NEXT:    Total number of mappings created: 21
-# CHECK-NEXT:    Max number of mappings used:      8
+# CHECK-NEXT:    Total number of mappings created: 0
+# CHECK-NEXT:    Max number of mappings used:      0
 
 # CHECK:      *  Register File #2 -- JIntegerPRF:
 # CHECK-NEXT:    Number of physical registers:     64
@@ -73,51 +73,51 @@ movdqu %xmm5, %xmm0
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]
-# CHECK-NEXT:  -      -      -     2.00   2.00   3.33   3.67    -      -      -      -     1.33   1.67    -
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]   Instructions:
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     pxor	%mm0, %mm0
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -     1.00    -     movq	%mm0, %mm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     movq	%mm0, %mm1
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorps	%xmm0, %xmm0
-# CHECK-NEXT:  -      -      -      -     1.00   0.33   0.67    -      -      -      -      -      -      -     movaps	%xmm0, %xmm1
-# CHECK-NEXT:  -      -      -     1.00    -     0.33   0.67    -      -      -      -      -      -      -     movups	%xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -     1.00   0.67   0.33    -      -      -      -      -      -      -     movapd	%xmm2, %xmm3
-# CHECK-NEXT:  -      -      -     1.00    -     0.33   0.67    -      -      -      -      -      -      -     movupd	%xmm3, %xmm4
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -     1.00    -      -     movdqa	%xmm4, %xmm5
-# CHECK-NEXT:  -      -      -      -      -     0.67   0.33    -      -      -      -     0.33   0.67    -     movdqu	%xmm5, %xmm0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     movaps	%xmm0, %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     movups	%xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     movapd	%xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     movupd	%xmm3, %xmm4
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     movdqa	%xmm4, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     movdqu	%xmm5, %xmm0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012345678
+# CHECK-NEXT:                     01234
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DR   .    .    .  .   pxor	%mm0, %mm0
-# CHECK-NEXT: [0,1]     DeER .    .    .  .   movq	%mm0, %mm1
-# CHECK-NEXT: [0,2]     .D-R .    .    .  .   xorps	%xmm0, %xmm0
-# CHECK-NEXT: [0,3]     .DeER.    .    .  .   movaps	%xmm0, %xmm1
-# CHECK-NEXT: [0,4]     . DeER    .    .  .   movups	%xmm1, %xmm2
-# CHECK-NEXT: [0,5]     . D=eER   .    .  .   movapd	%xmm2, %xmm3
-# CHECK-NEXT: [0,6]     .  D=eER  .    .  .   movupd	%xmm3, %xmm4
-# CHECK-NEXT: [0,7]     .  D==eER .    .  .   movdqa	%xmm4, %xmm5
-# CHECK-NEXT: [0,8]     .   D==eER.    .  .   movdqu	%xmm5, %xmm0
-# CHECK-NEXT: [1,0]     .   D----R.    .  .   pxor	%mm0, %mm0
-# CHECK-NEXT: [1,1]     .    DeE--R    .  .   movq	%mm0, %mm1
-# CHECK-NEXT: [1,2]     .    D----R    .  .   xorps	%xmm0, %xmm0
-# CHECK-NEXT: [1,3]     .    .DeE--R   .  .   movaps	%xmm0, %xmm1
-# CHECK-NEXT: [1,4]     .    .D=eE-R   .  .   movups	%xmm1, %xmm2
-# CHECK-NEXT: [1,5]     .    . D=eE-R  .  .   movapd	%xmm2, %xmm3
-# CHECK-NEXT: [1,6]     .    . D==eER  .  .   movupd	%xmm3, %xmm4
-# CHECK-NEXT: [1,7]     .    .  D==eER .  .   movdqa	%xmm4, %xmm5
-# CHECK-NEXT: [1,8]     .    .  D===eER.  .   movdqu	%xmm5, %xmm0
-# CHECK-NEXT: [2,0]     .    .   D----R.  .   pxor	%mm0, %mm0
-# CHECK-NEXT: [2,1]     .    .   DeE---R  .   movq	%mm0, %mm1
-# CHECK-NEXT: [2,2]     .    .    D----R  .   xorps	%xmm0, %xmm0
-# CHECK-NEXT: [2,3]     .    .    DeE---R .   movaps	%xmm0, %xmm1
-# CHECK-NEXT: [2,4]     .    .    .DeE--R .   movups	%xmm1, %xmm2
-# CHECK-NEXT: [2,5]     .    .    .D=eE--R.   movapd	%xmm2, %xmm3
-# CHECK-NEXT: [2,6]     .    .    . D=eE-R.   movupd	%xmm3, %xmm4
-# CHECK-NEXT: [2,7]     .    .    . D==eE-R   movdqa	%xmm4, %xmm5
-# CHECK-NEXT: [2,8]     .    .    .  D==eER   movdqu	%xmm5, %xmm0
+# CHECK:      [0,0]     DR   .    .   .   pxor	%mm0, %mm0
+# CHECK-NEXT: [0,1]     DR   .    .   .   movq	%mm0, %mm1
+# CHECK-NEXT: [0,2]     .DR  .    .   .   xorps	%xmm0, %xmm0
+# CHECK-NEXT: [0,3]     .DR  .    .   .   movaps	%xmm0, %xmm1
+# CHECK-NEXT: [0,4]     . DR .    .   .   movups	%xmm1, %xmm2
+# CHECK-NEXT: [0,5]     . DR .    .   .   movapd	%xmm2, %xmm3
+# CHECK-NEXT: [0,6]     .  DR.    .   .   movupd	%xmm3, %xmm4
+# CHECK-NEXT: [0,7]     .  DR.    .   .   movdqa	%xmm4, %xmm5
+# CHECK-NEXT: [0,8]     .   DR    .   .   movdqu	%xmm5, %xmm0
+# CHECK-NEXT: [1,0]     .   DR    .   .   pxor	%mm0, %mm0
+# CHECK-NEXT: [1,1]     .    DR   .   .   movq	%mm0, %mm1
+# CHECK-NEXT: [1,2]     .    DR   .   .   xorps	%xmm0, %xmm0
+# CHECK-NEXT: [1,3]     .    .DR  .   .   movaps	%xmm0, %xmm1
+# CHECK-NEXT: [1,4]     .    .DR  .   .   movups	%xmm1, %xmm2
+# CHECK-NEXT: [1,5]     .    . DR .   .   movapd	%xmm2, %xmm3
+# CHECK-NEXT: [1,6]     .    . DR .   .   movupd	%xmm3, %xmm4
+# CHECK-NEXT: [1,7]     .    .  DR.   .   movdqa	%xmm4, %xmm5
+# CHECK-NEXT: [1,8]     .    .  DR.   .   movdqu	%xmm5, %xmm0
+# CHECK-NEXT: [2,0]     .    .   DR   .   pxor	%mm0, %mm0
+# CHECK-NEXT: [2,1]     .    .   DR   .   movq	%mm0, %mm1
+# CHECK-NEXT: [2,2]     .    .    DR  .   xorps	%xmm0, %xmm0
+# CHECK-NEXT: [2,3]     .    .    DR  .   movaps	%xmm0, %xmm1
+# CHECK-NEXT: [2,4]     .    .    .DR .   movups	%xmm1, %xmm2
+# CHECK-NEXT: [2,5]     .    .    .DR .   movapd	%xmm2, %xmm3
+# CHECK-NEXT: [2,6]     .    .    . DR.   movupd	%xmm3, %xmm4
+# CHECK-NEXT: [2,7]     .    .    . DR.   movdqa	%xmm4, %xmm5
+# CHECK-NEXT: [2,8]     .    .    .  DR   movdqu	%xmm5, %xmm0
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -126,12 +126,12 @@ movdqu %xmm5, %xmm0
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     3     0.0    0.0    2.7       pxor	%mm0, %mm0
-# CHECK-NEXT: 1.     3     1.0    1.0    1.7       movq	%mm0, %mm1
-# CHECK-NEXT: 2.     3     0.0    0.0    3.0       xorps	%xmm0, %xmm0
-# CHECK-NEXT: 3.     3     1.0    1.0    1.7       movaps	%xmm0, %xmm1
-# CHECK-NEXT: 4.     3     1.3    0.0    1.0       movups	%xmm1, %xmm2
-# CHECK-NEXT: 5.     3     2.0    0.0    1.0       movapd	%xmm2, %xmm3
-# CHECK-NEXT: 6.     3     2.3    0.0    0.3       movupd	%xmm3, %xmm4
-# CHECK-NEXT: 7.     3     3.0    0.0    0.3       movdqa	%xmm4, %xmm5
-# CHECK-NEXT: 8.     3     3.3    0.0    0.0       movdqu	%xmm5, %xmm0
+# CHECK-NEXT: 0.     3     0.0    0.0    0.0       pxor	%mm0, %mm0
+# CHECK-NEXT: 1.     3     0.0    0.0    0.0       movq	%mm0, %mm1
+# CHECK-NEXT: 2.     3     0.0    0.0    0.0       xorps	%xmm0, %xmm0
+# CHECK-NEXT: 3.     3     0.0    0.0    0.0       movaps	%xmm0, %xmm1
+# CHECK-NEXT: 4.     3     0.0    0.0    0.0       movups	%xmm1, %xmm2
+# CHECK-NEXT: 5.     3     0.0    0.0    0.0       movapd	%xmm2, %xmm3
+# CHECK-NEXT: 6.     3     0.0    0.0    0.0       movupd	%xmm3, %xmm4
+# CHECK-NEXT: 7.     3     0.0    0.0    0.0       movdqa	%xmm4, %xmm5
+# CHECK-NEXT: 8.     3     0.0    0.0    0.0       movdqu	%xmm5, %xmm0
diff --git a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-3.s b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-3.s
index e3e0abc75e7..3d64bfd0bfd 100644
--- a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-3.s
+++ b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-3.s
@@ -11,12 +11,12 @@ vmovdqu %xmm5, %xmm0
 
 # CHECK:      Iterations:        3
 # CHECK-NEXT: Instructions:      21
-# CHECK-NEXT: Total Cycles:      16
+# CHECK-NEXT: Total Cycles:      12
 # CHECK-NEXT: Total uOps:        21
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    1.31
-# CHECK-NEXT: IPC:               1.31
+# CHECK-NEXT: uOps Per Cycle:    1.75
+# CHECK-NEXT: IPC:               1.75
 # CHECK-NEXT: Block RThroughput: 3.5
 
 # CHECK:      Instruction Info:
@@ -37,13 +37,13 @@ vmovdqu %xmm5, %xmm0
 # CHECK-NEXT:  1      1     0.50                        vmovdqu	%xmm5, %xmm0
 
 # CHECK:      Register File statistics:
-# CHECK-NEXT: Total number of mappings created:    18
-# CHECK-NEXT: Max number of mappings used:         9
+# CHECK-NEXT: Total number of mappings created:    0
+# CHECK-NEXT: Max number of mappings used:         0
 
 # CHECK:      *  Register File #1 -- JFpuPRF:
 # CHECK-NEXT:    Number of physical registers:     72
-# CHECK-NEXT:    Total number of mappings created: 18
-# CHECK-NEXT:    Max number of mappings used:      9
+# CHECK-NEXT:    Total number of mappings created: 0
+# CHECK-NEXT:    Max number of mappings used:      0
 
 # CHECK:      *  Register File #2 -- JIntegerPRF:
 # CHECK-NEXT:    Number of physical registers:     64
@@ -68,43 +68,43 @@ vmovdqu %xmm5, %xmm0
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]
-# CHECK-NEXT:  -      -      -     2.00   2.00   3.00   3.00    -      -      -      -     1.00   1.00    -
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]   Instructions:
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     vxorps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT:  -      -      -      -     1.00   0.33   0.67    -      -      -      -      -      -      -     vmovaps	%xmm0, %xmm1
-# CHECK-NEXT:  -      -      -     1.00    -     0.67   0.33    -      -      -      -      -      -      -     vmovups	%xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     vmovapd	%xmm2, %xmm3
-# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     vmovupd	%xmm3, %xmm4
-# CHECK-NEXT:  -      -      -      -      -     0.33   0.67    -      -      -      -      -     1.00    -     vmovdqa	%xmm4, %xmm5
-# CHECK-NEXT:  -      -      -      -      -     0.67   0.33    -      -      -      -     1.00    -      -     vmovdqu	%xmm5, %xmm0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     vmovaps	%xmm0, %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     vmovups	%xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     vmovapd	%xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     vmovupd	%xmm3, %xmm4
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     vmovdqa	%xmm4, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     vmovdqu	%xmm5, %xmm0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012345
+# CHECK-NEXT:                     01
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DR   .    .    .   vxorps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT: [0,1]     DeER .    .    .   vmovaps	%xmm0, %xmm1
-# CHECK-NEXT: [0,2]     .DeER.    .    .   vmovups	%xmm1, %xmm2
-# CHECK-NEXT: [0,3]     .D=eER    .    .   vmovapd	%xmm2, %xmm3
-# CHECK-NEXT: [0,4]     . D=eER   .    .   vmovupd	%xmm3, %xmm4
-# CHECK-NEXT: [0,5]     . D==eER  .    .   vmovdqa	%xmm4, %xmm5
-# CHECK-NEXT: [0,6]     .  D==eER .    .   vmovdqu	%xmm5, %xmm0
-# CHECK-NEXT: [1,0]     .  D----R .    .   vxorps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT: [1,1]     .   DeE--R.    .   vmovaps	%xmm0, %xmm1
-# CHECK-NEXT: [1,2]     .   D=eE-R.    .   vmovups	%xmm1, %xmm2
-# CHECK-NEXT: [1,3]     .    D=eE-R    .   vmovapd	%xmm2, %xmm3
-# CHECK-NEXT: [1,4]     .    D==eER    .   vmovupd	%xmm3, %xmm4
-# CHECK-NEXT: [1,5]     .    .D==eER   .   vmovdqa	%xmm4, %xmm5
-# CHECK-NEXT: [1,6]     .    .D===eER  .   vmovdqu	%xmm5, %xmm0
-# CHECK-NEXT: [2,0]     .    . D----R  .   vxorps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT: [2,1]     .    . DeE---R .   vmovaps	%xmm0, %xmm1
-# CHECK-NEXT: [2,2]     .    .  DeE--R .   vmovups	%xmm1, %xmm2
-# CHECK-NEXT: [2,3]     .    .  D=eE--R.   vmovapd	%xmm2, %xmm3
-# CHECK-NEXT: [2,4]     .    .   D=eE-R.   vmovupd	%xmm3, %xmm4
-# CHECK-NEXT: [2,5]     .    .   D==eE-R   vmovdqa	%xmm4, %xmm5
-# CHECK-NEXT: [2,6]     .    .    D==eER   vmovdqu	%xmm5, %xmm0
+# CHECK:      [0,0]     DR   .    ..   vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [0,1]     DR   .    ..   vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: [0,2]     .DR  .    ..   vmovups	%xmm1, %xmm2
+# CHECK-NEXT: [0,3]     .DR  .    ..   vmovapd	%xmm2, %xmm3
+# CHECK-NEXT: [0,4]     . DR .    ..   vmovupd	%xmm3, %xmm4
+# CHECK-NEXT: [0,5]     . DR .    ..   vmovdqa	%xmm4, %xmm5
+# CHECK-NEXT: [0,6]     .  DR.    ..   vmovdqu	%xmm5, %xmm0
+# CHECK-NEXT: [1,0]     .  DR.    ..   vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [1,1]     .   DR    ..   vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: [1,2]     .   DR    ..   vmovups	%xmm1, %xmm2
+# CHECK-NEXT: [1,3]     .    DR   ..   vmovapd	%xmm2, %xmm3
+# CHECK-NEXT: [1,4]     .    DR   ..   vmovupd	%xmm3, %xmm4
+# CHECK-NEXT: [1,5]     .    .DR  ..   vmovdqa	%xmm4, %xmm5
+# CHECK-NEXT: [1,6]     .    .DR  ..   vmovdqu	%xmm5, %xmm0
+# CHECK-NEXT: [2,0]     .    . DR ..   vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [2,1]     .    . DR ..   vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: [2,2]     .    .  DR..   vmovups	%xmm1, %xmm2
+# CHECK-NEXT: [2,3]     .    .  DR..   vmovapd	%xmm2, %xmm3
+# CHECK-NEXT: [2,4]     .    .   DR.   vmovupd	%xmm3, %xmm4
+# CHECK-NEXT: [2,5]     .    .   DR.   vmovdqa	%xmm4, %xmm5
+# CHECK-NEXT: [2,6]     .    .    DR   vmovdqu	%xmm5, %xmm0
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -113,10 +113,10 @@ vmovdqu %xmm5, %xmm0
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     3     0.0    0.0    2.7       vxorps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT: 1.     3     1.0    1.0    1.7       vmovaps	%xmm0, %xmm1
-# CHECK-NEXT: 2.     3     1.3    0.0    1.0       vmovups	%xmm1, %xmm2
-# CHECK-NEXT: 3.     3     2.0    0.0    1.0       vmovapd	%xmm2, %xmm3
-# CHECK-NEXT: 4.     3     2.3    0.0    0.3       vmovupd	%xmm3, %xmm4
-# CHECK-NEXT: 5.     3     3.0    0.0    0.3       vmovdqa	%xmm4, %xmm5
-# CHECK-NEXT: 6.     3     3.3    0.0    0.0       vmovdqu	%xmm5, %xmm0
+# CHECK-NEXT: 0.     3     0.0    0.0    0.0       vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: 1.     3     0.0    0.0    0.0       vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: 2.     3     0.0    0.0    0.0       vmovups	%xmm1, %xmm2
+# CHECK-NEXT: 3.     3     0.0    0.0    0.0       vmovapd	%xmm2, %xmm3
+# CHECK-NEXT: 4.     3     0.0    0.0    0.0       vmovupd	%xmm3, %xmm4
+# CHECK-NEXT: 5.     3     0.0    0.0    0.0       vmovdqa	%xmm4, %xmm5
+# CHECK-NEXT: 6.     3     0.0    0.0    0.0       vmovdqu	%xmm5, %xmm0
diff --git a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-4.s b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-4.s
index 72ca7693c5f..223b4c2c239 100644
--- a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-4.s
+++ b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-4.s
@@ -9,12 +9,12 @@ mov %edx, %eax
 
 # CHECK:      Iterations:        3
 # CHECK-NEXT: Instructions:      15
-# CHECK-NEXT: Total Cycles:      12
+# CHECK-NEXT: Total Cycles:      9
 # CHECK-NEXT: Total uOps:        15
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    1.25
-# CHECK-NEXT: IPC:               1.25
+# CHECK-NEXT: uOps Per Cycle:    1.67
+# CHECK-NEXT: IPC:               1.67
 # CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Instruction Info:
@@ -33,8 +33,8 @@ mov %edx, %eax
 # CHECK-NEXT:  1      1     0.50                        movl	%edx, %eax
 
 # CHECK:      Register File statistics:
-# CHECK-NEXT: Total number of mappings created:    12
-# CHECK-NEXT: Max number of mappings used:         7
+# CHECK-NEXT: Total number of mappings created:    0
+# CHECK-NEXT: Max number of mappings used:         0
 
 # CHECK:      *  Register File #1 -- JFpuPRF:
 # CHECK-NEXT:    Number of physical registers:     72
@@ -43,8 +43,8 @@ mov %edx, %eax
 
 # CHECK:      *  Register File #2 -- JIntegerPRF:
 # CHECK-NEXT:    Number of physical registers:     64
-# CHECK-NEXT:    Total number of mappings created: 12
-# CHECK-NEXT:    Max number of mappings used:      7
+# CHECK-NEXT:    Total number of mappings created: 0
+# CHECK-NEXT:    Max number of mappings used:      0
 
 # CHECK:      Resources:
 # CHECK-NEXT: [0]   - JALU0
@@ -64,35 +64,34 @@ mov %edx, %eax
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]
-# CHECK-NEXT: 2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]   Instructions:
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorl	%eax, %eax
-# CHECK-NEXT: 0.33   0.67    -      -      -      -      -      -      -      -      -      -      -      -     movl	%eax, %ebx
-# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     movl	%ebx, %ecx
-# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     movl	%ecx, %edx
-# CHECK-NEXT: 0.67   0.33    -      -      -      -      -      -      -      -      -      -      -      -     movl	%edx, %eax
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     movl	%eax, %ebx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     movl	%ebx, %ecx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     movl	%ecx, %edx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     movl	%edx, %eax
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     01
-# CHECK-NEXT: Index     0123456789
+# CHECK-NEXT: Index     012345678
 
-# CHECK:      [0,0]     DR   .    ..   xorl	%eax, %eax
-# CHECK-NEXT: [0,1]     DeER .    ..   movl	%eax, %ebx
-# CHECK-NEXT: [0,2]     .DeER.    ..   movl	%ebx, %ecx
-# CHECK-NEXT: [0,3]     .D=eER    ..   movl	%ecx, %edx
-# CHECK-NEXT: [0,4]     . D=eER   ..   movl	%edx, %eax
-# CHECK-NEXT: [1,0]     . D---R   ..   xorl	%eax, %eax
-# CHECK-NEXT: [1,1]     .  DeE-R  ..   movl	%eax, %ebx
-# CHECK-NEXT: [1,2]     .  D=eER  ..   movl	%ebx, %ecx
-# CHECK-NEXT: [1,3]     .   D=eER ..   movl	%ecx, %edx
-# CHECK-NEXT: [1,4]     .   D==eER..   movl	%edx, %eax
-# CHECK-NEXT: [2,0]     .    D---R..   xorl	%eax, %eax
-# CHECK-NEXT: [2,1]     .    DeE--R.   movl	%eax, %ebx
-# CHECK-NEXT: [2,2]     .    .DeE-R.   movl	%ebx, %ecx
-# CHECK-NEXT: [2,3]     .    .D=eE-R   movl	%ecx, %edx
-# CHECK-NEXT: [2,4]     .    . D=eER   movl	%edx, %eax
+# CHECK:      [0,0]     DR   .  .   xorl	%eax, %eax
+# CHECK-NEXT: [0,1]     DR   .  .   movl	%eax, %ebx
+# CHECK-NEXT: [0,2]     .DR  .  .   movl	%ebx, %ecx
+# CHECK-NEXT: [0,3]     .DR  .  .   movl	%ecx, %edx
+# CHECK-NEXT: [0,4]     . DR .  .   movl	%edx, %eax
+# CHECK-NEXT: [1,0]     . DR .  .   xorl	%eax, %eax
+# CHECK-NEXT: [1,1]     .  DR.  .   movl	%eax, %ebx
+# CHECK-NEXT: [1,2]     .  DR.  .   movl	%ebx, %ecx
+# CHECK-NEXT: [1,3]     .   DR  .   movl	%ecx, %edx
+# CHECK-NEXT: [1,4]     .   DR  .   movl	%edx, %eax
+# CHECK-NEXT: [2,0]     .    DR .   xorl	%eax, %eax
+# CHECK-NEXT: [2,1]     .    DR .   movl	%eax, %ebx
+# CHECK-NEXT: [2,2]     .    .DR.   movl	%ebx, %ecx
+# CHECK-NEXT: [2,3]     .    .DR.   movl	%ecx, %edx
+# CHECK-NEXT: [2,4]     .    . DR   movl	%edx, %eax
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -101,8 +100,8 @@ mov %edx, %eax
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     3     0.0    0.0    2.0       xorl	%eax, %eax
-# CHECK-NEXT: 1.     3     1.0    1.0    1.0       movl	%eax, %ebx
-# CHECK-NEXT: 2.     3     1.3    0.0    0.3       movl	%ebx, %ecx
-# CHECK-NEXT: 3.     3     2.0    0.0    0.3       movl	%ecx, %edx
-# CHECK-NEXT: 4.     3     2.3    0.0    0.0       movl	%edx, %eax
+# CHECK-NEXT: 0.     3     0.0    0.0    0.0       xorl	%eax, %eax
+# CHECK-NEXT: 1.     3     0.0    0.0    0.0       movl	%eax, %ebx
+# CHECK-NEXT: 2.     3     0.0    0.0    0.0       movl	%ebx, %ecx
+# CHECK-NEXT: 3.     3     0.0    0.0    0.0       movl	%ecx, %edx
+# CHECK-NEXT: 4.     3     0.0    0.0    0.0       movl	%edx, %eax
diff --git a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-5.s b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-5.s
index 7d6b75f7c3f..ab873c7c43f 100644
--- a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-5.s
+++ b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-5.s
@@ -9,12 +9,12 @@ mov %rdx, %rax
 
 # CHECK:      Iterations:        3
 # CHECK-NEXT: Instructions:      15
-# CHECK-NEXT: Total Cycles:      12
+# CHECK-NEXT: Total Cycles:      9
 # CHECK-NEXT: Total uOps:        15
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    1.25
-# CHECK-NEXT: IPC:               1.25
+# CHECK-NEXT: uOps Per Cycle:    1.67
+# CHECK-NEXT: IPC:               1.67
 # CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Instruction Info:
@@ -33,8 +33,8 @@ mov %rdx, %rax
 # CHECK-NEXT:  1      1     0.50                        movq	%rdx, %rax
 
 # CHECK:      Register File statistics:
-# CHECK-NEXT: Total number of mappings created:    12
-# CHECK-NEXT: Max number of mappings used:         7
+# CHECK-NEXT: Total number of mappings created:    0
+# CHECK-NEXT: Max number of mappings used:         0
 
 # CHECK:      *  Register File #1 -- JFpuPRF:
 # CHECK-NEXT:    Number of physical registers:     72
@@ -43,8 +43,8 @@ mov %rdx, %rax
 
 # CHECK:      *  Register File #2 -- JIntegerPRF:
 # CHECK-NEXT:    Number of physical registers:     64
-# CHECK-NEXT:    Total number of mappings created: 12
-# CHECK-NEXT:    Max number of mappings used:      7
+# CHECK-NEXT:    Total number of mappings created: 0
+# CHECK-NEXT:    Max number of mappings used:      0
 
 # CHECK:      Resources:
 # CHECK-NEXT: [0]   - JALU0
@@ -64,35 +64,34 @@ mov %rdx, %rax
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]
-# CHECK-NEXT: 2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]   Instructions:
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorq	%rax, %rax
-# CHECK-NEXT: 0.33   0.67    -      -      -      -      -      -      -      -      -      -      -      -     movq	%rax, %rbx
-# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     movq	%rbx, %rcx
-# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     movq	%rcx, %rdx
-# CHECK-NEXT: 0.67   0.33    -      -      -      -      -      -      -      -      -      -      -      -     movq	%rdx, %rax
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     movq	%rax, %rbx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     movq	%rbx, %rcx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     movq	%rcx, %rdx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     movq	%rdx, %rax
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     01
-# CHECK-NEXT: Index     0123456789
+# CHECK-NEXT: Index     012345678
 
-# CHECK:      [0,0]     DR   .    ..   xorq	%rax, %rax
-# CHECK-NEXT: [0,1]     DeER .    ..   movq	%rax, %rbx
-# CHECK-NEXT: [0,2]     .DeER.    ..   movq	%rbx, %rcx
-# CHECK-NEXT: [0,3]     .D=eER    ..   movq	%rcx, %rdx
-# CHECK-NEXT: [0,4]     . D=eER   ..   movq	%rdx, %rax
-# CHECK-NEXT: [1,0]     . D---R   ..   xorq	%rax, %rax
-# CHECK-NEXT: [1,1]     .  DeE-R  ..   movq	%rax, %rbx
-# CHECK-NEXT: [1,2]     .  D=eER  ..   movq	%rbx, %rcx
-# CHECK-NEXT: [1,3]     .   D=eER ..   movq	%rcx, %rdx
-# CHECK-NEXT: [1,4]     .   D==eER..   movq	%rdx, %rax
-# CHECK-NEXT: [2,0]     .    D---R..   xorq	%rax, %rax
-# CHECK-NEXT: [2,1]     .    DeE--R.   movq	%rax, %rbx
-# CHECK-NEXT: [2,2]     .    .DeE-R.   movq	%rbx, %rcx
-# CHECK-NEXT: [2,3]     .    .D=eE-R   movq	%rcx, %rdx
-# CHECK-NEXT: [2,4]     .    . D=eER   movq	%rdx, %rax
+# CHECK:      [0,0]     DR   .  .   xorq	%rax, %rax
+# CHECK-NEXT: [0,1]     DR   .  .   movq	%rax, %rbx
+# CHECK-NEXT: [0,2]     .DR  .  .   movq	%rbx, %rcx
+# CHECK-NEXT: [0,3]     .DR  .  .   movq	%rcx, %rdx
+# CHECK-NEXT: [0,4]     . DR .  .   movq	%rdx, %rax
+# CHECK-NEXT: [1,0]     . DR .  .   xorq	%rax, %rax
+# CHECK-NEXT: [1,1]     .  DR.  .   movq	%rax, %rbx
+# CHECK-NEXT: [1,2]     .  DR.  .   movq	%rbx, %rcx
+# CHECK-NEXT: [1,3]     .   DR  .   movq	%rcx, %rdx
+# CHECK-NEXT: [1,4]     .   DR  .   movq	%rdx, %rax
+# CHECK-NEXT: [2,0]     .    DR .   xorq	%rax, %rax
+# CHECK-NEXT: [2,1]     .    DR .   movq	%rax, %rbx
+# CHECK-NEXT: [2,2]     .    .DR.   movq	%rbx, %rcx
+# CHECK-NEXT: [2,3]     .    .DR.   movq	%rcx, %rdx
+# CHECK-NEXT: [2,4]     .    . DR   movq	%rdx, %rax
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -101,8 +100,8 @@ mov %rdx, %rax
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     3     0.0    0.0    2.0       xorq	%rax, %rax
-# CHECK-NEXT: 1.     3     1.0    1.0    1.0       movq	%rax, %rbx
-# CHECK-NEXT: 2.     3     1.3    0.0    0.3       movq	%rbx, %rcx
-# CHECK-NEXT: 3.     3     2.0    0.0    0.3       movq	%rcx, %rdx
-# CHECK-NEXT: 4.     3     2.3    0.0    0.0       movq	%rdx, %rax
+# CHECK-NEXT: 0.     3     0.0    0.0    0.0       xorq	%rax, %rax
+# CHECK-NEXT: 1.     3     0.0    0.0    0.0       movq	%rax, %rbx
+# CHECK-NEXT: 2.     3     0.0    0.0    0.0       movq	%rbx, %rcx
+# CHECK-NEXT: 3.     3     0.0    0.0    0.0       movq	%rcx, %rdx
+# CHECK-NEXT: 4.     3     0.0    0.0    0.0       movq	%rdx, %rax
diff --git a/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp b/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp
index 4cfe1a50f53..481e2e18fa9 100644
--- a/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp
+++ b/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp
@@ -73,7 +73,8 @@ void RegisterFile::addRegisterFile(const MCRegisterFileDesc &RF,
   // registers in register file #0 through the command line flag
   // `-register-file-size`.
   unsigned RegisterFileIndex = RegisterFiles.size();
-  RegisterFiles.emplace_back(RF.NumPhysRegs);
+  RegisterFiles.emplace_back(RF.NumPhysRegs, RF.MaxMovesEliminatedPerCycle,
+                             RF.AllowZeroMoveEliminationOnly);
 
   // Special case where there is no register class identifier in the set.
   // An empty set of register classes means: this register file contains all
@@ -99,6 +100,7 @@ void RegisterFile::addRegisterFile(const MCRegisterFileDesc &RF,
       }
       IPC = std::make_pair(RegisterFileIndex, RCE.Cost);
       Entry.RenameAs = Reg;
+      Entry.AllowMoveElimination = RCE.AllowMoveElimination;
 
       // Assume the same cost for each sub-register.
       for (MCSubRegIterator I(Reg, &MRI); I.isValid(); ++I) {
@@ -273,10 +275,6 @@ bool RegisterFile::tryEliminateMove(WriteState &WS, const ReadState &RS) {
   const RegisterMapping &RMFrom = RegisterMappings[RS.getRegisterID()];
   const RegisterMapping &RMTo = RegisterMappings[WS.getRegisterID()];
 
-  // Early exit if the PRF doesn't support move elimination for this register.
-  if (!RMTo.second.AllowMoveElimination)
-    return false;
-
   // From and To must be owned by the same PRF.
   const RegisterRenamingInfo &RRIFrom = RMFrom.second;
   const RegisterRenamingInfo &RRITo = RMTo.second;
@@ -298,9 +296,13 @@ bool RegisterFile::tryEliminateMove(WriteState &WS, const ReadState &RS) {
   // For now, we assume that there is a strong correlation between registers
   // that allow move elimination, and how those same registers are renamed in
   // hardware.
-  if (RRITo.RenameAs && RRITo.RenameAs != WS.getRegisterID())
+  if (RRITo.RenameAs && RRITo.RenameAs != WS.getRegisterID()) {
+    // Early exit if the PRF doesn't support move elimination for this register.
+    if (!RegisterMappings[RRITo.RenameAs].second.AllowMoveElimination)
+      return false;
     if (!WS.clearsSuperRegisters())
       return false;
+  }
 
   RegisterMappingTracker &RMT = RegisterFiles[RegisterFileIndex];
   if (RMT.MaxMoveEliminatedPerCycle &&
diff --git a/tools/llvm-mca/lib/InstrBuilder.cpp b/tools/llvm-mca/lib/InstrBuilder.cpp
index 0a26f40b940..1cb020a9f6d 100644
--- a/tools/llvm-mca/lib/InstrBuilder.cpp
+++ b/tools/llvm-mca/lib/InstrBuilder.cpp
@@ -463,6 +463,8 @@ InstrBuilder::createInstruction(const MCInst &MCI) {
   bool IsZeroIdiom = MCIA.isZeroIdiom(MCI, Mask, ProcID);
   bool IsDepBreaking =
       IsZeroIdiom || MCIA.isDependencyBreaking(MCI, Mask, ProcID);
+  if (MCIA.isOptimizableRegisterMove(MCI, ProcID))
+    NewIS->setOptimizableMove();
 
   // Initialize Reads first.
   for (const ReadDescriptor &RD : D.Reads) {
diff --git a/utils/TableGen/CodeGenSchedule.cpp b/utils/TableGen/CodeGenSchedule.cpp
index f8d7d9ad3d3..e94ed760fc4 100644
--- a/utils/TableGen/CodeGenSchedule.cpp
+++ b/utils/TableGen/CodeGenSchedule.cpp
@@ -1759,6 +1759,10 @@ void CodeGenSchedModels::collectRegisterFiles() {
     CodeGenProcModel &PM = getProcModel(RF->getValueAsDef("SchedModel"));
     PM.RegisterFiles.emplace_back(CodeGenRegisterFile(RF->getName(),RF));
     CodeGenRegisterFile &CGRF = PM.RegisterFiles.back();
+    CGRF.MaxMovesEliminatedPerCycle =
+        RF->getValueAsInt("MaxMovesEliminatedPerCycle");
+    CGRF.AllowZeroMoveEliminationOnly =
+        RF->getValueAsBit("AllowZeroMoveEliminationOnly");
 
     // Now set the number of physical registers as well as the cost of registers
     // in each register class.
@@ -1770,9 +1774,17 @@ void CodeGenSchedModels::collectRegisterFiles() {
 
     RecVec RegisterClasses = RF->getValueAsListOfDefs("RegClasses");
     std::vector<int64_t> RegisterCosts = RF->getValueAsListOfInts("RegCosts");
+    ListInit *MoveElimInfo = RF->getValueAsListInit("AllowMoveElimination");
     for (unsigned I = 0, E = RegisterClasses.size(); I < E; ++I) {
       int Cost = RegisterCosts.size() > I ? RegisterCosts[I] : 1;
-      CGRF.Costs.emplace_back(RegisterClasses[I], Cost);
+
+      bool AllowMoveElim = false;
+      if (MoveElimInfo->size() > I) {
+        BitInit *Val = cast<BitInit>(MoveElimInfo->getElement(I));
+        AllowMoveElim = Val->getValue();
+      }
+
+      CGRF.Costs.emplace_back(RegisterClasses[I], Cost, AllowMoveElim);
     }
   }
 }
diff --git a/utils/TableGen/CodeGenSchedule.h b/utils/TableGen/CodeGenSchedule.h
index c2af28bbaa0..39443bb35e9 100644
--- a/utils/TableGen/CodeGenSchedule.h
+++ b/utils/TableGen/CodeGenSchedule.h
@@ -167,8 +167,9 @@ struct CodeGenSchedClass {
 struct CodeGenRegisterCost {
   Record *RCDef;
   unsigned Cost;
-  CodeGenRegisterCost(Record *RC, unsigned RegisterCost)
-      : RCDef(RC), Cost(RegisterCost) {}
+  bool AllowMoveElimination;
+  CodeGenRegisterCost(Record *RC, unsigned RegisterCost, bool AllowMoveElim = false)
+      : RCDef(RC), Cost(RegisterCost), AllowMoveElimination(AllowMoveElim) {}
   CodeGenRegisterCost(const CodeGenRegisterCost &) = default;
   CodeGenRegisterCost &operator=(const CodeGenRegisterCost &) = delete;
 };
@@ -181,12 +182,18 @@ struct CodeGenRegisterCost {
 struct CodeGenRegisterFile {
   std::string Name;
   Record *RegisterFileDef;
+  unsigned MaxMovesEliminatedPerCycle;
+  bool AllowZeroMoveEliminationOnly;
 
   unsigned NumPhysRegs;
   std::vector<CodeGenRegisterCost> Costs;
 
-  CodeGenRegisterFile(StringRef name, Record *def)
-      : Name(name), RegisterFileDef(def), NumPhysRegs(0) {}
+  CodeGenRegisterFile(StringRef name, Record *def, unsigned MaxMoveElimPerCy = 0,
+                      bool AllowZeroMoveElimOnly = false)
+      : Name(name), RegisterFileDef(def),
+        MaxMovesEliminatedPerCycle(MaxMoveElimPerCy),
+        AllowZeroMoveEliminationOnly(AllowZeroMoveElimOnly),
+        NumPhysRegs(0) {}
 
   bool hasDefaultCosts() const { return Costs.empty(); }
 };
diff --git a/utils/TableGen/SubtargetEmitter.cpp b/utils/TableGen/SubtargetEmitter.cpp
index ef0428eeed0..d1ea968590f 100644
--- a/utils/TableGen/SubtargetEmitter.cpp
+++ b/utils/TableGen/SubtargetEmitter.cpp
@@ -653,7 +653,7 @@ SubtargetEmitter::EmitRegisterFileTables(const CodeGenProcModel &ProcModel,
     return 0;
 
   // Print the RegisterCost table first.
-  OS << "\n// {RegisterClassID, Register Cost}\n";
+  OS << "\n// {RegisterClassID, Register Cost, AllowMoveElimination }\n";
   OS << "static const llvm::MCRegisterCostEntry " << ProcModel.ModelName
      << "RegisterCosts"
      << "[] = {\n";
@@ -668,24 +668,28 @@ SubtargetEmitter::EmitRegisterFileTables(const CodeGenProcModel &ProcModel,
       Record *Rec = RC.RCDef;
       if (Rec->getValue("Namespace"))
         OS << Rec->getValueAsString("Namespace") << "::";
-      OS << Rec->getName() << "RegClassID, " << RC.Cost << "},\n";
+      OS << Rec->getName() << "RegClassID, " << RC.Cost << ", "
+         << RC.AllowMoveElimination << "},\n";
     }
   }
   OS << "};\n";
 
   // Now generate a table with register file info.
-  OS << "\n // {Name, #PhysRegs, #CostEntries, IndexToCostTbl}\n";
+  OS << "\n // {Name, #PhysRegs, #CostEntries, IndexToCostTbl, "
+     << "MaxMovesEliminatedPerCycle, AllowZeroMoveEliminationOnly }\n";
   OS << "static const llvm::MCRegisterFileDesc " << ProcModel.ModelName
      << "RegisterFiles"
      << "[] = {\n"
-     << "  { \"InvalidRegisterFile\", 0, 0, 0 },\n";
+     << "  { \"InvalidRegisterFile\", 0, 0, 0, 0, 0 },\n";
   unsigned CostTblIndex = 0;
 
   for (const CodeGenRegisterFile &RD : ProcModel.RegisterFiles) {
     OS << "  { ";
     OS << '"' << RD.Name << '"' << ", " << RD.NumPhysRegs << ", ";
     unsigned NumCostEntries = RD.Costs.size();
-    OS << NumCostEntries << ", " << CostTblIndex << "},\n";
+    OS << NumCostEntries << ", " << CostTblIndex << ", "
+       << RD.MaxMovesEliminatedPerCycle << ", "
+       << RD.AllowZeroMoveEliminationOnly << "},\n";
     CostTblIndex += NumCostEntries;
   }
   OS << "};\n";
-- 
GitLab


From d584a99dbb2a9af55ddaa41265a512c8c634b7cf Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 12 Oct 2018 12:10:34 +0000
Subject: [PATCH 0095/1116] [X86][SSE] Add extract_subvector(PSHUFB) ->
 PSHUFB(extract_subvector()) combine

Fixes PR32160 by reducing the size of PSHUFB if we only use one of the lanes.

This approach can probably be generalized to handle any target shuffle (and any subvector index) but we have no test coverage at the moment.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344336 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 12 ++++++++++++
 test/CodeGen/X86/vector-trunc.ll   |  6 ++----
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index c6ab4fb70f6..15bd238833d 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -40306,6 +40306,18 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
                                                  : ISD::SIGN_EXTEND_VECTOR_INREG;
       return DAG.getNode(ExtOp, SDLoc(N), OpVT, InVec.getOperand(0));
     }
+    if (InOpcode == ISD::BITCAST) {
+      // TODO - do this for target shuffles in general.
+      SDValue InVecBC = peekThroughOneUseBitcasts(InVec);
+      if (InVecBC.getOpcode() == X86ISD::PSHUFB && OpVT.is128BitVector()) {
+        SDLoc DL(N);
+        SDValue SubPSHUFB =
+            DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
+                        extract128BitVector(InVecBC.getOperand(0), 0, DAG, DL),
+                        extract128BitVector(InVecBC.getOperand(1), 0, DAG, DL));
+        return DAG.getBitcast(OpVT, SubPSHUFB);
+      }
+    }
   }
 
   return SDValue();
diff --git a/test/CodeGen/X86/vector-trunc.ll b/test/CodeGen/X86/vector-trunc.ll
index 0d00f8af5a8..db3692f318f 100644
--- a/test/CodeGen/X86/vector-trunc.ll
+++ b/test/CodeGen/X86/vector-trunc.ll
@@ -1922,16 +1922,14 @@ define <8 x i16> @PR32160(<8 x i32> %x) {
 ;
 ; AVX2-SLOW-LABEL: PR32160:
 ; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
 ; AVX2-SLOW-NEXT:    vpbroadcastd %xmm0, %xmm0
 ; AVX2-SLOW-NEXT:    vzeroupper
 ; AVX2-SLOW-NEXT:    retq
 ;
 ; AVX2-FAST-LABEL: PR32160:
 ; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5]
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9]
 ; AVX2-FAST-NEXT:    vzeroupper
 ; AVX2-FAST-NEXT:    retq
 ;
-- 
GitLab


From 9a6d7be910df45c53626f8f4272c69ad971abc87 Mon Sep 17 00:00:00 2001
From: Sam Parker <sam.parker@arm.com>
Date: Fri, 12 Oct 2018 12:26:37 +0000
Subject: [PATCH 0096/1116] Fix documentation of MachineInstr::getNumOperands

The documentation stated "Access to explicit operands of the
instruction." This is misleading, as it also lists implicit operands.

Patch by Philip Ginsbach.

Differential Revision: https://reviews.llvm.org/D35481


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344338 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/MachineInstr.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/llvm/CodeGen/MachineInstr.h b/include/llvm/CodeGen/MachineInstr.h
index 7c4e771ce72..ea1a2a536fc 100644
--- a/include/llvm/CodeGen/MachineInstr.h
+++ b/include/llvm/CodeGen/MachineInstr.h
@@ -408,7 +408,7 @@ public:
   /// Returns the opcode of this MachineInstr.
   unsigned getOpcode() const { return MCID->Opcode; }
 
-  /// Access to explicit operands of the instruction.
+  /// Retuns the total number of operands.
   unsigned getNumOperands() const { return NumOperands; }
 
   const MachineOperand& getOperand(unsigned i) const {
-- 
GitLab


From 638941f488dd9d1b1b9d49913240fdc2beec56a0 Mon Sep 17 00:00:00 2001
From: Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>
Date: Fri, 12 Oct 2018 12:38:27 +0000
Subject: [PATCH 0097/1116] [llvm-mca] Remove method
 RegisterFileStatistics::initializeRegisterFileInfo(). NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344339 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-mca/Views/RegisterFileStatistics.cpp | 3 ++-
 tools/llvm-mca/Views/RegisterFileStatistics.h   | 7 +------
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/tools/llvm-mca/Views/RegisterFileStatistics.cpp b/tools/llvm-mca/Views/RegisterFileStatistics.cpp
index 7dbc76a51e1..cd540e9dc60 100644
--- a/tools/llvm-mca/Views/RegisterFileStatistics.cpp
+++ b/tools/llvm-mca/Views/RegisterFileStatistics.cpp
@@ -19,7 +19,8 @@ using namespace llvm;
 
 namespace mca {
 
-void RegisterFileStatistics::initializeRegisterFileInfo() {
+RegisterFileStatistics::RegisterFileStatistics(const llvm::MCSubtargetInfo &sti)
+    : STI(sti) {
   const MCSchedModel &SM = STI.getSchedModel();
   RegisterFileUsage Empty = {0, 0, 0};
   if (!SM.hasExtraProcessorInfo()) {
diff --git a/tools/llvm-mca/Views/RegisterFileStatistics.h b/tools/llvm-mca/Views/RegisterFileStatistics.h
index 3dcac4d4f75..1e89d66dc50 100644
--- a/tools/llvm-mca/Views/RegisterFileStatistics.h
+++ b/tools/llvm-mca/Views/RegisterFileStatistics.h
@@ -51,15 +51,10 @@ class RegisterFileStatistics : public View {
   // There is one entry for each register file implemented by the processor.
   llvm::SmallVector<RegisterFileUsage, 4> RegisterFiles;
 
-  void initializeRegisterFileInfo();
-
 public:
-  RegisterFileStatistics(const llvm::MCSubtargetInfo &sti) : STI(sti) {
-    initializeRegisterFileInfo();
-  }
+  RegisterFileStatistics(const llvm::MCSubtargetInfo &sti);
 
   void onEvent(const HWInstructionEvent &Event) override;
-
   void printView(llvm::raw_ostream &OS) const override;
 };
 } // namespace mca
-- 
GitLab


From cadc63b548f93cf19407baa6d1760eb6c668c183 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 12 Oct 2018 13:24:51 +0000
Subject: [PATCH 0098/1116] [X86][AVX] Regenerate tzcnt tests

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344341 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/vector-tzcnt-256.ll | 138 +++++----------------------
 1 file changed, 24 insertions(+), 114 deletions(-)

diff --git a/test/CodeGen/X86/vector-tzcnt-256.ll b/test/CodeGen/X86/vector-tzcnt-256.ll
index 775a7a359ab..b1173fa4b88 100644
--- a/test/CodeGen/X86/vector-tzcnt-256.ll
+++ b/test/CodeGen/X86/vector-tzcnt-256.ll
@@ -1370,145 +1370,55 @@ define <4 x i64> @foldv4i64u() nounwind {
 }
 
 define <8 x i32> @foldv8i32() nounwind {
-; AVX-LABEL: foldv8i32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
-; AVX-NEXT:    retq
-;
-; BITALG_NOVLX-LABEL: foldv8i32:
-; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
-; BITALG_NOVLX-NEXT:    retq
-;
-; BITALG-LABEL: foldv8i32:
-; BITALG:       # %bb.0:
-; BITALG-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
-; BITALG-NEXT:    retq
-;
-; X32-AVX-LABEL: foldv8i32:
-; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
-; X32-AVX-NEXT:    retl
+; ALL-LABEL: foldv8i32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
+; ALL-NEXT:    ret{{[l|q]}}
   %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 0)
   ret <8 x i32> %out
 }
 
 define <8 x i32> @foldv8i32u() nounwind {
-; AVX-LABEL: foldv8i32u:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
-; AVX-NEXT:    retq
-;
-; BITALG_NOVLX-LABEL: foldv8i32u:
-; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
-; BITALG_NOVLX-NEXT:    retq
-;
-; BITALG-LABEL: foldv8i32u:
-; BITALG:       # %bb.0:
-; BITALG-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
-; BITALG-NEXT:    retq
-;
-; X32-AVX-LABEL: foldv8i32u:
-; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
-; X32-AVX-NEXT:    retl
+; ALL-LABEL: foldv8i32u:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
+; ALL-NEXT:    ret{{[l|q]}}
   %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 -1)
   ret <8 x i32> %out
 }
 
 define <16 x i16> @foldv16i16() nounwind {
-; AVX-LABEL: foldv16i16:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
-; AVX-NEXT:    retq
-;
-; BITALG_NOVLX-LABEL: foldv16i16:
-; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
-; BITALG_NOVLX-NEXT:    retq
-;
-; BITALG-LABEL: foldv16i16:
-; BITALG:       # %bb.0:
-; BITALG-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
-; BITALG-NEXT:    retq
-;
-; X32-AVX-LABEL: foldv16i16:
-; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
-; X32-AVX-NEXT:    retl
+; ALL-LABEL: foldv16i16:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
+; ALL-NEXT:    ret{{[l|q]}}
   %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 0)
   ret <16 x i16> %out
 }
 
 define <16 x i16> @foldv16i16u() nounwind {
-; AVX-LABEL: foldv16i16u:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
-; AVX-NEXT:    retq
-;
-; BITALG_NOVLX-LABEL: foldv16i16u:
-; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
-; BITALG_NOVLX-NEXT:    retq
-;
-; BITALG-LABEL: foldv16i16u:
-; BITALG:       # %bb.0:
-; BITALG-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
-; BITALG-NEXT:    retq
-;
-; X32-AVX-LABEL: foldv16i16u:
-; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
-; X32-AVX-NEXT:    retl
+; ALL-LABEL: foldv16i16u:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
+; ALL-NEXT:    ret{{[l|q]}}
   %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 -1)
   ret <16 x i16> %out
 }
 
 define <32 x i8> @foldv32i8() nounwind {
-; AVX-LABEL: foldv32i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
-; AVX-NEXT:    retq
-;
-; BITALG_NOVLX-LABEL: foldv32i8:
-; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
-; BITALG_NOVLX-NEXT:    retq
-;
-; BITALG-LABEL: foldv32i8:
-; BITALG:       # %bb.0:
-; BITALG-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
-; BITALG-NEXT:    retq
-;
-; X32-AVX-LABEL: foldv32i8:
-; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
-; X32-AVX-NEXT:    retl
+; ALL-LABEL: foldv32i8:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
+; ALL-NEXT:    ret{{[l|q]}}
   %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 0)
   ret <32 x i8> %out
 }
 
 define <32 x i8> @foldv32i8u() nounwind {
-; AVX-LABEL: foldv32i8u:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
-; AVX-NEXT:    retq
-;
-; BITALG_NOVLX-LABEL: foldv32i8u:
-; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
-; BITALG_NOVLX-NEXT:    retq
-;
-; BITALG-LABEL: foldv32i8u:
-; BITALG:       # %bb.0:
-; BITALG-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
-; BITALG-NEXT:    retq
-;
-; X32-AVX-LABEL: foldv32i8u:
-; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
-; X32-AVX-NEXT:    retl
+; ALL-LABEL: foldv32i8u:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
+; ALL-NEXT:    ret{{[l|q]}}
   %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 -1)
   ret <32 x i8> %out
 }
-- 
GitLab


From 29956bffea8f12634f457488cfe5071fed92d120 Mon Sep 17 00:00:00 2001
From: Max Moroz <mmoroz@chromium.org>
Date: Fri, 12 Oct 2018 13:59:31 +0000
Subject: [PATCH 0099/1116] [SanitizerCoverage] Make Inline8bit and TracePC
 counters dead stripping resistant.

Summary:
Otherwise, at least on Mac, the linker eliminates unused symbols which
causes libFuzzer to error out due to a mismatch of the sizes of coverage tables.

Issue in Chromium: https://bugs.chromium.org/p/chromium/issues/detail?id=892167

Reviewers: morehouse, kcc, george.karpenkov

Reviewed By: morehouse

Subscribers: kubamracek, llvm-commits

Differential Revision: https://reviews.llvm.org/D53113

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344345 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Instrumentation/SanitizerCoverage.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
index 2a055920c3e..bf461c61ede 100644
--- a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
+++ b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -594,6 +594,7 @@ GlobalVariable *SanitizerCoverageModule::CreateFunctionLocalArrayInSection(
   Array->setSection(getSectionName(Section));
   Array->setAlignment(Ty->isPointerTy() ? DL->getPointerSize()
                                         : Ty->getPrimitiveSizeInBits() / 8);
+  GlobalsToAppendToUsed.push_back(Array);
   GlobalsToAppendToCompilerUsed.push_back(Array);
   MDNode *MD = MDNode::get(F.getContext(), ValueAsMetadata::get(&F));
   Array->addMetadata(LLVMContext::MD_associated, *MD);
@@ -631,14 +632,14 @@ SanitizerCoverageModule::CreatePCArray(Function &F,
 
 void SanitizerCoverageModule::CreateFunctionLocalArrays(
     Function &F, ArrayRef<BasicBlock *> AllBlocks) {
-  if (Options.TracePCGuard) {
+  if (Options.TracePCGuard)
     FunctionGuardArray = CreateFunctionLocalArrayInSection(
         AllBlocks.size(), F, Int32Ty, SanCovGuardsSectionName);
-    GlobalsToAppendToUsed.push_back(FunctionGuardArray);
-  }
+
   if (Options.Inline8bitCounters)
     Function8bitCounterArray = CreateFunctionLocalArrayInSection(
         AllBlocks.size(), F, Int8Ty, SanCovCountersSectionName);
+
   if (Options.PCTable)
     FunctionPCsArray = CreatePCArray(F, AllBlocks);
 }
-- 
GitLab


From 706d3da44f2997d8fada9a5cd3ef89cf91883ccf Mon Sep 17 00:00:00 2001
From: Hiroshi Inoue <inouehrs@jp.ibm.com>
Date: Fri, 12 Oct 2018 14:02:20 +0000
Subject: [PATCH 0100/1116] [PowerPC] avoid masking already-zero bits in
 BitPermutationSelector

The current BitPermutationSelector generates a code to build a value by tracking two types of bits: ConstZero and Variable.
ConstZero means a bit we need to mask off and Variable is a bit we copy from an input value.

This patch add third type of bits VariableKnownToBeZero caused by AssertZext node or zero-extending load node.
VariableKnownToBeZero means a bit comes from an input value, but it is known to be already zero. So we do not need to mask them.
VariableKnownToBeZero enhances flexibility to group bits, since we can avoid redundant masking for these bits.

This patch also renames "HasZero" to "NeedMask" since now we may skip masking even when we have zeros (of type VariableKnownToBeZero).

Differential Revision: https://reviews.llvm.org/D48025


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344347 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/PowerPC/PPCISelDAGToDAG.cpp     | 119 ++++++++++++++++++---
 test/CodeGen/PowerPC/addi-offset-fold.ll   |   5 +-
 test/CodeGen/PowerPC/bitfieldinsert.ll     |  35 +++++-
 test/CodeGen/PowerPC/ppc64le-aggregates.ll |  10 +-
 test/CodeGen/PowerPC/rlwimi-dyn-and.ll     |   2 +-
 5 files changed, 143 insertions(+), 28 deletions(-)

diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index af17bb5f165..5ec7b102884 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -1083,9 +1083,14 @@ class BitPermutationSelector {
     // lowest-order bit.
     unsigned Idx;
 
+    // ConstZero means a bit we need to mask off.
+    // Variable is a bit comes from an input variable.
+    // VariableKnownToBeZero is also a bit comes from an input variable,
+    // but it is known to be already zero. So we do not need to mask them.
     enum Kind {
       ConstZero,
-      Variable
+      Variable,
+      VariableKnownToBeZero
     } K;
 
     ValueBit(SDValue V, unsigned I, Kind K = Variable)
@@ -1094,11 +1099,11 @@ class BitPermutationSelector {
       : V(SDValue(nullptr, 0)), Idx(UINT32_MAX), K(K) {}
 
     bool isZero() const {
-      return K == ConstZero;
+      return K == ConstZero || K == VariableKnownToBeZero;
     }
 
     bool hasValue() const {
-      return K == Variable;
+      return K == Variable || K == VariableKnownToBeZero;
     }
 
     SDValue getValue() const {
@@ -1248,8 +1253,14 @@ class BitPermutationSelector {
         for (unsigned i = 0; i < NumBits; ++i)
           if (((Mask >> i) & 1) == 1)
             Bits[i] = (*LHSBits)[i];
-          else
-            Bits[i] = ValueBit(ValueBit::ConstZero);
+          else {
+            // AND instruction masks this bit. If the input is already zero,
+            // we have nothing to do here. Otherwise, make the bit ConstZero.
+            if ((*LHSBits)[i].isZero())
+              Bits[i] = (*LHSBits)[i];
+            else
+              Bits[i] = ValueBit(ValueBit::ConstZero);
+          }
 
         return std::make_pair(Interesting, &Bits);
       }
@@ -1259,8 +1270,26 @@ class BitPermutationSelector {
       const auto &RHSBits = *getValueBits(V.getOperand(1), NumBits).second;
 
       bool AllDisjoint = true;
-      for (unsigned i = 0; i < NumBits; ++i)
-        if (LHSBits[i].isZero())
+      SDValue LastVal = SDValue();
+      unsigned LastIdx = 0;
+      for (unsigned i = 0; i < NumBits; ++i) {
+        if (LHSBits[i].isZero() && RHSBits[i].isZero()) {
+          // If both inputs are known to be zero and one is ConstZero and
+          // another is VariableKnownToBeZero, we can select whichever
+          // we like. To minimize the number of bit groups, we select
+          // VariableKnownToBeZero if this bit is the next bit of the same
+          // input variable from the previous bit. Otherwise, we select
+          // ConstZero.
+          if (LHSBits[i].hasValue() && LHSBits[i].getValue() == LastVal &&
+              LHSBits[i].getValueBitIndex() == LastIdx + 1)
+            Bits[i] = LHSBits[i];
+          else if (RHSBits[i].hasValue() && RHSBits[i].getValue() == LastVal &&
+                   RHSBits[i].getValueBitIndex() == LastIdx + 1)
+            Bits[i] = RHSBits[i];
+          else
+            Bits[i] = ValueBit(ValueBit::ConstZero);
+        }
+        else if (LHSBits[i].isZero())
           Bits[i] = RHSBits[i];
         else if (RHSBits[i].isZero())
           Bits[i] = LHSBits[i];
@@ -1268,6 +1297,16 @@ class BitPermutationSelector {
           AllDisjoint = false;
           break;
         }
+        // We remember the value and bit index of this bit.
+        if (Bits[i].hasValue()) {
+          LastVal = Bits[i].getValue();
+          LastIdx = Bits[i].getValueBitIndex();
+        }
+        else {
+          if (LastVal) LastVal = SDValue();
+          LastIdx = 0;
+        }
+      }
 
       if (!AllDisjoint)
         break;
@@ -1293,6 +1332,44 @@ class BitPermutationSelector {
 
       return std::make_pair(Interesting, &Bits);
     }
+    case ISD::AssertZext: {
+      // For AssertZext, we look through the operand and
+      // mark the bits known to be zero.
+      const SmallVector<ValueBit, 64> *LHSBits;
+      std::tie(Interesting, LHSBits) = getValueBits(V.getOperand(0),
+                                                    NumBits);
+
+      EVT FromType = cast<VTSDNode>(V.getOperand(1))->getVT();
+      const unsigned NumValidBits = FromType.getSizeInBits();
+      for (unsigned i = 0; i < NumValidBits; ++i)
+        Bits[i] = (*LHSBits)[i];
+
+      // These bits are known to be zero.
+      for (unsigned i = NumValidBits; i < NumBits; ++i)
+        Bits[i] = ValueBit((*LHSBits)[i].getValue(),
+                           (*LHSBits)[i].getValueBitIndex(),
+                           ValueBit::VariableKnownToBeZero);
+
+      return std::make_pair(Interesting, &Bits);
+    }
+    case ISD::LOAD:
+      LoadSDNode *LD = cast<LoadSDNode>(V);
+      if (ISD::isZEXTLoad(V.getNode()) && V.getResNo() == 0) {
+        EVT VT = LD->getMemoryVT();
+        const unsigned NumValidBits = VT.getSizeInBits();
+
+        for (unsigned i = 0; i < NumValidBits; ++i)
+          Bits[i] = ValueBit(V, i);
+
+        // These bits are known to be zero.
+        for (unsigned i = NumValidBits; i < NumBits; ++i)
+          Bits[i] = ValueBit(V, i, ValueBit::VariableKnownToBeZero);
+
+        // Zero-extending load itself cannot be optimized. So, it is not
+        // interesting by itself though it gives useful information.
+        return std::make_pair(Interesting = false, &Bits);
+      }
+      break;
     }
 
     for (unsigned i = 0; i < NumBits; ++i)
@@ -1304,7 +1381,7 @@ class BitPermutationSelector {
   // For each value (except the constant ones), compute the left-rotate amount
   // to get it from its original to final position.
   void computeRotationAmounts() {
-    HasZeros = false;
+    NeedMask = false;
     RLAmt.resize(Bits.size());
     for (unsigned i = 0; i < Bits.size(); ++i)
       if (Bits[i].hasValue()) {
@@ -1314,7 +1391,7 @@ class BitPermutationSelector {
         else
           RLAmt[i] = Bits.size() - (VBI - i);
       } else if (Bits[i].isZero()) {
-        HasZeros = true;
+        NeedMask = true;
         RLAmt[i] = UINT32_MAX;
       } else {
         llvm_unreachable("Unknown value bit type");
@@ -1330,6 +1407,7 @@ class BitPermutationSelector {
     unsigned LastRLAmt = RLAmt[0];
     SDValue LastValue = Bits[0].hasValue() ? Bits[0].getValue() : SDValue();
     unsigned LastGroupStartIdx = 0;
+    bool IsGroupOfZeros = !Bits[LastGroupStartIdx].hasValue();
     for (unsigned i = 1; i < Bits.size(); ++i) {
       unsigned ThisRLAmt = RLAmt[i];
       SDValue ThisValue = Bits[i].hasValue() ? Bits[i].getValue() : SDValue();
@@ -1342,10 +1420,20 @@ class BitPermutationSelector {
           LastGroupStartIdx = 0;
       }
 
+      // If this bit is known to be zero and the current group is a bit group
+      // of zeros, we do not need to terminate the current bit group even the
+      // Value or RLAmt does not match here. Instead, we terminate this group
+      // when the first non-zero bit appears later.
+      if (IsGroupOfZeros && Bits[i].isZero())
+        continue;
+
       // If this bit has the same underlying value and the same rotate factor as
       // the last one, then they're part of the same group.
       if (ThisRLAmt == LastRLAmt && ThisValue == LastValue)
-        continue;
+        // We cannot continue the current group if this bits is not known to
+        // be zero in a bit group of zeros.
+        if (!(IsGroupOfZeros && ThisValue && !Bits[i].isZero()))
+          continue;
 
       if (LastValue.getNode())
         BitGroups.push_back(BitGroup(LastValue, LastRLAmt, LastGroupStartIdx,
@@ -1353,6 +1441,7 @@ class BitPermutationSelector {
       LastRLAmt = ThisRLAmt;
       LastValue = ThisValue;
       LastGroupStartIdx = i;
+      IsGroupOfZeros = !Bits[LastGroupStartIdx].hasValue();
     }
     if (LastValue.getNode())
       BitGroups.push_back(BitGroup(LastValue, LastRLAmt, LastGroupStartIdx,
@@ -1698,7 +1787,7 @@ class BitPermutationSelector {
     // If we've not yet selected a 'starting' instruction, and we have no zeros
     // to fill in, select the (Value, RLAmt) with the highest priority (largest
     // number of groups), and start with this rotated value.
-    if ((!HasZeros || LateMask) && !Res) {
+    if ((!NeedMask || LateMask) && !Res) {
       ValueRotInfo &VRI = ValueRotsVec[0];
       if (VRI.RLAmt) {
         if (InstCnt) *InstCnt += 1;
@@ -2077,7 +2166,7 @@ class BitPermutationSelector {
     // If we've not yet selected a 'starting' instruction, and we have no zeros
     // to fill in, select the (Value, RLAmt) with the highest priority (largest
     // number of groups), and start with this rotated value.
-    if ((!HasZeros || LateMask) && !Res) {
+    if ((!NeedMask || LateMask) && !Res) {
       // If we have both Repl32 groups and non-Repl32 groups, the non-Repl32
       // groups will come first, and so the VRI representing the largest number
       // of groups might not be first (it might be the first Repl32 groups).
@@ -2230,7 +2319,7 @@ class BitPermutationSelector {
 
   SmallVector<ValueBit, 64> Bits;
 
-  bool HasZeros;
+  bool NeedMask;
   SmallVector<unsigned, 64> RLAmt;
 
   SmallVector<BitGroup, 16> BitGroups;
@@ -2259,10 +2348,10 @@ public:
                          " selection for:    ");
     LLVM_DEBUG(N->dump(CurDAG));
 
-    // Fill it RLAmt and set HasZeros.
+    // Fill it RLAmt and set NeedMask.
     computeRotationAmounts();
 
-    if (!HasZeros)
+    if (!NeedMask)
       return Select(N, false);
 
     // We currently have two techniques for handling results with zeros: early
diff --git a/test/CodeGen/PowerPC/addi-offset-fold.ll b/test/CodeGen/PowerPC/addi-offset-fold.ll
index 7af99203694..db2fb0eee7c 100644
--- a/test/CodeGen/PowerPC/addi-offset-fold.ll
+++ b/test/CodeGen/PowerPC/addi-offset-fold.ll
@@ -27,10 +27,9 @@ entry:
 ; FIXME: We don't need to do these stores at all.
 ; CHECK-DAG: std 3, -24(1)
 ; CHECK-DAG: stb 4, -16(1)
-; CHECK-DAG: sldi [[REG3:[0-9]+]], 4, 32
 ; CHECK-DAG: lwz [[REG2:[0-9]+]], -20(1)
-; CHECK-DAG: or [[REG4:[0-9]+]], [[REG2]], [[REG3]]
-; CHECK: rldicl 3, [[REG4]], 33, 57
+; CHECK-DAG: rlwinm 3, [[REG2]], 1, 31, 31
+; CHECK: rlwimi 3, 4, 1, 25, 30
 ; CHECK: blr
 }
 
diff --git a/test/CodeGen/PowerPC/bitfieldinsert.ll b/test/CodeGen/PowerPC/bitfieldinsert.ll
index e654c7d8a0c..76a648b6f13 100644
--- a/test/CodeGen/PowerPC/bitfieldinsert.ll
+++ b/test/CodeGen/PowerPC/bitfieldinsert.ll
@@ -1,6 +1,35 @@
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s
 
+; equivalent C code
+;   struct s64 {
+;   	int a:5;
+;   	int b:16;
+;   	long c:42;
+;   };
+;   void bitfieldinsert64(struct s *p, unsigned short v) {
+;   	p->b = v;
+;   }
+
+%struct.s64 = type { i64 }
+
+define void @bitfieldinsert64(%struct.s64* nocapture %p, i16 zeroext %v) {
+; CHECK-LABEL: @bitfieldinsert64
+; CHECK: ld [[REG1:[0-9]+]], 0(3)
+; CHECK-NEXT: rlwimi [[REG1]], 4, 5, 11, 26
+; CHECK-NEXT: std [[REG1]], 0(3)
+; CHECK-NEXT: blr
+entry:
+  %0 = getelementptr inbounds %struct.s64, %struct.s64* %p, i64 0, i32 0
+  %1 = zext i16 %v to i64
+  %bf.load = load i64, i64* %0, align 8
+  %bf.shl = shl nuw nsw i64 %1, 5
+  %bf.clear = and i64 %bf.load, -2097121
+  %bf.set = or i64 %bf.clear, %bf.shl
+  store i64 %bf.set, i64* %0, align 8
+  ret void
+}
+
 ; bitfieldinsert32: Test for rlwimi
 ; equivalent C code
 ;   struct s32 {
@@ -17,9 +46,9 @@
 define void @bitfieldinsert32(%struct.s32* nocapture %p, i32 zeroext %v) {
 ; CHECK-LABEL: @bitfieldinsert32
 ; CHECK: lwz [[REG1:[0-9]+]], 0(3)
-; CHECK: rlwimi [[REG1]], 4, 8, 8, 23
-; CHECK: stw [[REG1]], 0(3)
-; CHECK: blr
+; CHECK-NEXT: rlwimi [[REG1]], 4, 8, 8, 23
+; CHECK-NEXT: stw [[REG1]], 0(3)
+; CHECK-NEXT: blr
 entry:
   %0 = getelementptr inbounds %struct.s32, %struct.s32* %p, i64 0, i32 0
   %bf.load = load i32, i32* %0, align 4
diff --git a/test/CodeGen/PowerPC/ppc64le-aggregates.ll b/test/CodeGen/PowerPC/ppc64le-aggregates.ll
index 91119786b1f..a35250526c7 100644
--- a/test/CodeGen/PowerPC/ppc64le-aggregates.ll
+++ b/test/CodeGen/PowerPC/ppc64le-aggregates.ll
@@ -236,14 +236,12 @@ entry:
 ; CHECK-DAG: stfs 6, [[OFF1:[0-9]+]](1)
 ; CHECK-DAG: stfs 7, [[OFF2:[0-9]+]](1)
 ; CHECK-DAG: stfs 8, [[OFF3:[0-9]+]](1)
-; CHECK-DAG: lwz [[REG0:[0-9]+]], [[OFF0]](1)
+; CHECK-DAG: lwz 9, [[OFF0]](1)
 ; CHECK-DAG: lwz [[REG1:[0-9]+]], [[OFF1]](1)
-; CHECK-DAG: lwz [[REG2:[0-9]+]], [[OFF2]](1)
+; CHECK-DAG: lwz 10, [[OFF2]](1)
 ; CHECK-DAG: lwz [[REG3:[0-9]+]], [[OFF3]](1)
-; CHECK-DAG: sldi [[REG1]], [[REG1]], 32
-; CHECK-DAG: sldi [[REG3]], [[REG3]], 32
-; CHECK-DAG: or 9, [[REG0]], [[REG1]]
-; CHECK-DAG: or 10, [[REG2]], [[REG3]]
+; CHECK-DAG: rldimi 9, [[REG1]], 32, 0
+; CHECK-DAG: rldimi 10, [[REG3]], 32, 0
 ; CHECK: bl test1
 
 declare void @test1([8 x float], [8 x float])
diff --git a/test/CodeGen/PowerPC/rlwimi-dyn-and.ll b/test/CodeGen/PowerPC/rlwimi-dyn-and.ll
index 0d7501afc27..6e2802f6ff9 100644
--- a/test/CodeGen/PowerPC/rlwimi-dyn-and.ll
+++ b/test/CodeGen/PowerPC/rlwimi-dyn-and.ll
@@ -39,7 +39,7 @@ next:
   ret i32 %conv174
 
 ; CHECK-LABEL: @test2
-; CHECK: slwi 3, {{[0-9]+}}, 7
+; CHECK: rlwinm 3, {{[0-9]+}}, 7, 17, 24
 ; CHECK: rlwimi 3, {{[0-9]+}}, 15, 16, 16
 ; CHECK: blr
 }
-- 
GitLab


From c4e53cf2f6736bb5ff869810f26b30daacb5f78c Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 12 Oct 2018 14:18:47 +0000
Subject: [PATCH 0101/1116] [X86][SSE] LowerVectorCTPOP - pull out repeated
 byte sum stage.

Pull out repeated byte sum stage for popcount of vector elements > 8bits.

This allows us to simplify the LUT/BITMATH popcnt code to always assume vXi8 vectors, and also improves avx512bitalg codegen which only has access to vpopcntb/vpopcntw.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344348 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp    |  81 ++++------
 test/CodeGen/X86/vec_ctbits.ll        |  84 +++++-----
 test/CodeGen/X86/vector-lzcnt-128.ll  | 144 ++++++++---------
 test/CodeGen/X86/vector-popcnt-128.ll | 112 ++++++-------
 test/CodeGen/X86/vector-popcnt-256.ll |  38 +----
 test/CodeGen/X86/vector-popcnt-512.ll |  18 +--
 test/CodeGen/X86/vector-tzcnt-128.ll  | 220 ++++++++++----------------
 test/CodeGen/X86/vector-tzcnt-256.ll  |  72 +--------
 test/CodeGen/X86/vector-tzcnt-512.ll  |  36 +----
 9 files changed, 292 insertions(+), 513 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 15bd238833d..d2971d0f861 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -25023,7 +25023,8 @@ static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
                                         SelectionDAG &DAG) {
   MVT VT = Op.getSimpleValueType();
   MVT EltVT = VT.getVectorElementType();
-  unsigned VecSize = VT.getSizeInBits();
+  int NumElts = VT.getVectorNumElements();
+  assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
 
   // Implement a lookup table in register by using an algorithm based on:
   // http://wm.ite.pl/articles/sse-popcount.html
@@ -25035,56 +25036,37 @@ static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
   // masked out higher ones) for each byte. PSHUFB is used separately with both
   // to index the in-register table. Next, both are added and the result is a
   // i8 vector where each element contains the pop count for input byte.
-  //
-  // To obtain the pop count for elements != i8, we follow up with the same
-  // approach and use additional tricks as described below.
-  //
   const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
                        /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
                        /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
                        /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
 
-  int NumByteElts = VecSize / 8;
-  MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
-  SDValue In = DAG.getBitcast(ByteVecVT, Op);
   SmallVector<SDValue, 64> LUTVec;
-  for (int i = 0; i < NumByteElts; ++i)
+  for (int i = 0; i < NumElts; ++i)
     LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
-  SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
-  SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);
+  SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
+  SDValue M0F = DAG.getConstant(0x0F, DL, VT);
 
   // High nibbles
-  SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
-  SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
+  SDValue FourV = DAG.getConstant(4, DL, VT);
+  SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
 
   // Low nibbles
-  SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
+  SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
 
   // The input vector is used as the shuffle mask that index elements into the
   // LUT. After counting low and high nibbles, add the vector to obtain the
   // final pop count per i8 element.
-  SDValue HighPopCnt =
-      DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
-  SDValue LowPopCnt =
-      DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
-  SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
-
-  if (EltVT == MVT::i8)
-    return PopCnt;
-
-  return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
+  SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
+  SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
+  return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
 }
 
 static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
   MVT VT = Op.getSimpleValueType();
-  assert(VT.is128BitVector() &&
-         "Only 128-bit vector bitmath lowering supported.");
-
-  int VecSize = VT.getSizeInBits();
-  MVT EltVT = VT.getVectorElementType();
-  int Len = EltVT.getSizeInBits();
+  assert(VT == MVT::v16i8 && "Only v16i8 vector CTPOP lowering supported.");
 
   // This is the vectorized version of the "best" algorithm from
   // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
@@ -25108,36 +25090,27 @@ static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
   // x86, so set the SRL type to have elements at least i16 wide. This is
   // correct because all of our SRLs are followed immediately by a mask anyways
   // that handles any bits that sneak into the high bits of the byte elements.
-  MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);
-
+  MVT SrlVT = MVT::v8i16;
   SDValue V = Op;
 
   // v = v - ((v >> 1) & 0x55555555...)
   SDValue Srl =
       DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
-  SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
+  SDValue And = GetMask(Srl, APInt(8, 0x55));
   V = DAG.getNode(ISD::SUB, DL, VT, V, And);
 
   // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
-  SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
+  SDValue AndLHS = GetMask(V, APInt(8, 0x33));
   Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
-  SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
+  SDValue AndRHS = GetMask(Srl, APInt(8, 0x33));
   V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
 
   // v = (v + (v >> 4)) & 0x0F0F0F0F...
   Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
   SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
-  V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
+  V = GetMask(Add, APInt(8, 0x0F));
 
-  // At this point, V contains the byte-wise population count, and we are
-  // merely doing a horizontal sum if necessary to get the wider element
-  // counts.
-  if (EltVT == MVT::i8)
-    return V;
-
-  return LowerHorizontalByteSum(
-      DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
-      DAG);
+  return V;
 }
 
 // Please ensure that any codegen change from LowerVectorCTPOP is reflected in
@@ -25163,12 +25136,6 @@ static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
     }
   }
 
-  if (!Subtarget.hasSSSE3()) {
-    // We can't use the fast LUT approach, so fall back on vectorized bitmath.
-    assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
-    return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
-  }
-
   // Decompose 256-bit ops into smaller 128-bit ops.
   if (VT.is256BitVector() && !Subtarget.hasInt256())
     return Lower256IntUnary(Op, DAG);
@@ -25177,6 +25144,18 @@ static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
   if (VT.is512BitVector() && !Subtarget.hasBWI())
     return Lower512IntUnary(Op, DAG);
 
+  // For element types greater than i8, do vXi8 pop counts and a bytesum.
+  if (VT.getScalarType() != MVT::i8) {
+    MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
+    SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
+    SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
+    return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
+  }
+
+  // We can't use the fast LUT approach, so fall back on vectorized bitmath.
+  if (!Subtarget.hasSSSE3())
+    return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
+
   return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
 }
 
diff --git a/test/CodeGen/X86/vec_ctbits.ll b/test/CodeGen/X86/vec_ctbits.ll
index 781c61b5789..978a40cbb26 100644
--- a/test/CodeGen/X86/vec_ctbits.ll
+++ b/test/CodeGen/X86/vec_ctbits.ll
@@ -15,18 +15,18 @@ define <2 x i64> @footz(<2 x i64> %a) nounwind {
 ; CHECK-NEXT:    pcmpeqd %xmm3, %xmm3
 ; CHECK-NEXT:    paddq %xmm2, %xmm3
 ; CHECK-NEXT:    movdqa %xmm3, %xmm0
-; CHECK-NEXT:    psrlq $1, %xmm0
+; CHECK-NEXT:    psrlw $1, %xmm0
 ; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    psubq %xmm0, %xmm3
-; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; CHECK-NEXT:    psubb %xmm0, %xmm3
+; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; CHECK-NEXT:    movdqa %xmm3, %xmm2
 ; CHECK-NEXT:    pand %xmm0, %xmm2
-; CHECK-NEXT:    psrlq $2, %xmm3
+; CHECK-NEXT:    psrlw $2, %xmm3
 ; CHECK-NEXT:    pand %xmm0, %xmm3
-; CHECK-NEXT:    paddq %xmm2, %xmm3
+; CHECK-NEXT:    paddb %xmm2, %xmm3
 ; CHECK-NEXT:    movdqa %xmm3, %xmm0
-; CHECK-NEXT:    psrlq $4, %xmm0
-; CHECK-NEXT:    paddq %xmm3, %xmm0
+; CHECK-NEXT:    psrlw $4, %xmm0
+; CHECK-NEXT:    paddb %xmm3, %xmm0
 ; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
 ; CHECK-NEXT:    psadbw %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -58,18 +58,18 @@ define <2 x i64> @foolz(<2 x i64> %a) nounwind {
 ; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
 ; CHECK-NEXT:    pxor %xmm0, %xmm1
 ; CHECK-NEXT:    movdqa %xmm1, %xmm0
-; CHECK-NEXT:    psrlq $1, %xmm0
+; CHECK-NEXT:    psrlw $1, %xmm0
 ; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    psubq %xmm0, %xmm1
-; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; CHECK-NEXT:    psubb %xmm0, %xmm1
+; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; CHECK-NEXT:    movdqa %xmm1, %xmm2
 ; CHECK-NEXT:    pand %xmm0, %xmm2
-; CHECK-NEXT:    psrlq $2, %xmm1
+; CHECK-NEXT:    psrlw $2, %xmm1
 ; CHECK-NEXT:    pand %xmm0, %xmm1
-; CHECK-NEXT:    paddq %xmm2, %xmm1
+; CHECK-NEXT:    paddb %xmm2, %xmm1
 ; CHECK-NEXT:    movdqa %xmm1, %xmm2
-; CHECK-NEXT:    psrlq $4, %xmm2
-; CHECK-NEXT:    paddq %xmm1, %xmm2
+; CHECK-NEXT:    psrlw $4, %xmm2
+; CHECK-NEXT:    paddb %xmm1, %xmm2
 ; CHECK-NEXT:    pand {{.*}}(%rip), %xmm2
 ; CHECK-NEXT:    pxor %xmm0, %xmm0
 ; CHECK-NEXT:    psadbw %xmm2, %xmm0
@@ -83,18 +83,18 @@ define <2 x i64> @foopop(<2 x i64> %a) nounwind {
 ; CHECK-LABEL: foopop:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-NEXT:    psrlq $1, %xmm1
+; CHECK-NEXT:    psrlw $1, %xmm1
 ; CHECK-NEXT:    pand {{.*}}(%rip), %xmm1
-; CHECK-NEXT:    psubq %xmm1, %xmm0
-; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323]
+; CHECK-NEXT:    psubb %xmm1, %xmm0
+; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; CHECK-NEXT:    movdqa %xmm0, %xmm2
 ; CHECK-NEXT:    pand %xmm1, %xmm2
-; CHECK-NEXT:    psrlq $2, %xmm0
+; CHECK-NEXT:    psrlw $2, %xmm0
 ; CHECK-NEXT:    pand %xmm1, %xmm0
-; CHECK-NEXT:    paddq %xmm2, %xmm0
+; CHECK-NEXT:    paddb %xmm2, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-NEXT:    psrlq $4, %xmm1
-; CHECK-NEXT:    paddq %xmm0, %xmm1
+; CHECK-NEXT:    psrlw $4, %xmm1
+; CHECK-NEXT:    paddb %xmm0, %xmm1
 ; CHECK-NEXT:    pand {{.*}}(%rip), %xmm1
 ; CHECK-NEXT:    pxor %xmm0, %xmm0
 ; CHECK-NEXT:    psadbw %xmm0, %xmm1
@@ -119,18 +119,18 @@ define <2 x i32> @promtz(<2 x i32> %a) nounwind {
 ; CHECK-NEXT:    pcmpeqd %xmm3, %xmm3
 ; CHECK-NEXT:    paddq %xmm2, %xmm3
 ; CHECK-NEXT:    movdqa %xmm3, %xmm0
-; CHECK-NEXT:    psrlq $1, %xmm0
+; CHECK-NEXT:    psrlw $1, %xmm0
 ; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    psubq %xmm0, %xmm3
-; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; CHECK-NEXT:    psubb %xmm0, %xmm3
+; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; CHECK-NEXT:    movdqa %xmm3, %xmm2
 ; CHECK-NEXT:    pand %xmm0, %xmm2
-; CHECK-NEXT:    psrlq $2, %xmm3
+; CHECK-NEXT:    psrlw $2, %xmm3
 ; CHECK-NEXT:    pand %xmm0, %xmm3
-; CHECK-NEXT:    paddq %xmm2, %xmm3
+; CHECK-NEXT:    paddb %xmm2, %xmm3
 ; CHECK-NEXT:    movdqa %xmm3, %xmm0
-; CHECK-NEXT:    psrlq $4, %xmm0
-; CHECK-NEXT:    paddq %xmm3, %xmm0
+; CHECK-NEXT:    psrlw $4, %xmm0
+; CHECK-NEXT:    paddb %xmm3, %xmm0
 ; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
 ; CHECK-NEXT:    psadbw %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -164,18 +164,18 @@ define <2 x i32> @promlz(<2 x i32> %a) nounwind {
 ; CHECK-NEXT:    pcmpeqd %xmm2, %xmm2
 ; CHECK-NEXT:    pxor %xmm0, %xmm2
 ; CHECK-NEXT:    movdqa %xmm2, %xmm0
-; CHECK-NEXT:    psrlq $1, %xmm0
+; CHECK-NEXT:    psrlw $1, %xmm0
 ; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    psubq %xmm0, %xmm2
-; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; CHECK-NEXT:    psubb %xmm0, %xmm2
+; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; CHECK-NEXT:    movdqa %xmm2, %xmm3
 ; CHECK-NEXT:    pand %xmm0, %xmm3
-; CHECK-NEXT:    psrlq $2, %xmm2
+; CHECK-NEXT:    psrlw $2, %xmm2
 ; CHECK-NEXT:    pand %xmm0, %xmm2
-; CHECK-NEXT:    paddq %xmm3, %xmm2
+; CHECK-NEXT:    paddb %xmm3, %xmm2
 ; CHECK-NEXT:    movdqa %xmm2, %xmm0
-; CHECK-NEXT:    psrlq $4, %xmm0
-; CHECK-NEXT:    paddq %xmm2, %xmm0
+; CHECK-NEXT:    psrlw $4, %xmm0
+; CHECK-NEXT:    paddb %xmm2, %xmm0
 ; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
 ; CHECK-NEXT:    psadbw %xmm1, %xmm0
 ; CHECK-NEXT:    psubq {{.*}}(%rip), %xmm0
@@ -191,18 +191,18 @@ define <2 x i32> @prompop(<2 x i32> %a) nounwind {
 ; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
 ; CHECK-NEXT:    pxor %xmm2, %xmm2
 ; CHECK-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-NEXT:    psrlq $1, %xmm1
+; CHECK-NEXT:    psrlw $1, %xmm1
 ; CHECK-NEXT:    pand {{.*}}(%rip), %xmm1
-; CHECK-NEXT:    psubq %xmm1, %xmm0
-; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323]
+; CHECK-NEXT:    psubb %xmm1, %xmm0
+; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; CHECK-NEXT:    movdqa %xmm0, %xmm3
 ; CHECK-NEXT:    pand %xmm1, %xmm3
-; CHECK-NEXT:    psrlq $2, %xmm0
+; CHECK-NEXT:    psrlw $2, %xmm0
 ; CHECK-NEXT:    pand %xmm1, %xmm0
-; CHECK-NEXT:    paddq %xmm3, %xmm0
+; CHECK-NEXT:    paddb %xmm3, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-NEXT:    psrlq $4, %xmm1
-; CHECK-NEXT:    paddq %xmm0, %xmm1
+; CHECK-NEXT:    psrlw $4, %xmm1
+; CHECK-NEXT:    paddb %xmm0, %xmm1
 ; CHECK-NEXT:    pand {{.*}}(%rip), %xmm1
 ; CHECK-NEXT:    psadbw %xmm2, %xmm1
 ; CHECK-NEXT:    movdqa %xmm1, %xmm0
diff --git a/test/CodeGen/X86/vector-lzcnt-128.ll b/test/CodeGen/X86/vector-lzcnt-128.ll
index dc945c84b19..34ea33d576c 100644
--- a/test/CodeGen/X86/vector-lzcnt-128.ll
+++ b/test/CodeGen/X86/vector-lzcnt-128.ll
@@ -37,18 +37,18 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
 ; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrlq $1, %xmm0
+; SSE2-NEXT:    psrlw $1, %xmm0
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    psubq %xmm0, %xmm1
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; SSE2-NEXT:    psubb %xmm0, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
 ; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    psrlq $2, %xmm1
+; SSE2-NEXT:    psrlw $2, %xmm1
 ; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    paddq %xmm2, %xmm1
+; SSE2-NEXT:    paddb %xmm2, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    psrlq $4, %xmm2
-; SSE2-NEXT:    paddq %xmm1, %xmm2
+; SSE2-NEXT:    psrlw $4, %xmm2
+; SSE2-NEXT:    paddb %xmm1, %xmm2
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
 ; SSE2-NEXT:    psadbw %xmm2, %xmm0
@@ -77,18 +77,18 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
 ; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm1
 ; SSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSE3-NEXT:    psrlq $1, %xmm0
+; SSE3-NEXT:    psrlw $1, %xmm0
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT:    psubq %xmm0, %xmm1
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; SSE3-NEXT:    psubb %xmm0, %xmm1
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE3-NEXT:    movdqa %xmm1, %xmm2
 ; SSE3-NEXT:    pand %xmm0, %xmm2
-; SSE3-NEXT:    psrlq $2, %xmm1
+; SSE3-NEXT:    psrlw $2, %xmm1
 ; SSE3-NEXT:    pand %xmm0, %xmm1
-; SSE3-NEXT:    paddq %xmm2, %xmm1
+; SSE3-NEXT:    paddb %xmm2, %xmm1
 ; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    psrlq $4, %xmm2
-; SSE3-NEXT:    paddq %xmm1, %xmm2
+; SSE3-NEXT:    psrlw $4, %xmm2
+; SSE3-NEXT:    paddb %xmm1, %xmm2
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm2
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
 ; SSE3-NEXT:    psadbw %xmm2, %xmm0
@@ -303,18 +303,18 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
 ; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrlq $1, %xmm0
+; SSE2-NEXT:    psrlw $1, %xmm0
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    psubq %xmm0, %xmm1
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; SSE2-NEXT:    psubb %xmm0, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
 ; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    psrlq $2, %xmm1
+; SSE2-NEXT:    psrlw $2, %xmm1
 ; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    paddq %xmm2, %xmm1
+; SSE2-NEXT:    paddb %xmm2, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    psrlq $4, %xmm2
-; SSE2-NEXT:    paddq %xmm1, %xmm2
+; SSE2-NEXT:    psrlw $4, %xmm2
+; SSE2-NEXT:    paddb %xmm1, %xmm2
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
 ; SSE2-NEXT:    psadbw %xmm2, %xmm0
@@ -343,18 +343,18 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
 ; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm1
 ; SSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSE3-NEXT:    psrlq $1, %xmm0
+; SSE3-NEXT:    psrlw $1, %xmm0
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT:    psubq %xmm0, %xmm1
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; SSE3-NEXT:    psubb %xmm0, %xmm1
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE3-NEXT:    movdqa %xmm1, %xmm2
 ; SSE3-NEXT:    pand %xmm0, %xmm2
-; SSE3-NEXT:    psrlq $2, %xmm1
+; SSE3-NEXT:    psrlw $2, %xmm1
 ; SSE3-NEXT:    pand %xmm0, %xmm1
-; SSE3-NEXT:    paddq %xmm2, %xmm1
+; SSE3-NEXT:    paddb %xmm2, %xmm1
 ; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    psrlq $4, %xmm2
-; SSE3-NEXT:    paddq %xmm1, %xmm2
+; SSE3-NEXT:    psrlw $4, %xmm2
+; SSE3-NEXT:    paddb %xmm1, %xmm2
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm2
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
 ; SSE3-NEXT:    psadbw %xmm2, %xmm0
@@ -566,18 +566,18 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
 ; SSE2-NEXT:    pxor %xmm1, %xmm2
 ; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    psrld $1, %xmm0
+; SSE2-NEXT:    psrlw $1, %xmm0
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    psubd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
+; SSE2-NEXT:    psubb %xmm0, %xmm2
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NEXT:    movdqa %xmm2, %xmm1
 ; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    psrld $2, %xmm2
+; SSE2-NEXT:    psrlw $2, %xmm2
 ; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    paddd %xmm1, %xmm2
+; SSE2-NEXT:    paddb %xmm1, %xmm2
 ; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    psrld $4, %xmm0
-; SSE2-NEXT:    paddd %xmm2, %xmm0
+; SSE2-NEXT:    psrlw $4, %xmm0
+; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    pxor %xmm1, %xmm1
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
@@ -608,18 +608,18 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ; SSE3-NEXT:    pcmpeqd %xmm2, %xmm2
 ; SSE3-NEXT:    pxor %xmm1, %xmm2
 ; SSE3-NEXT:    movdqa %xmm2, %xmm0
-; SSE3-NEXT:    psrld $1, %xmm0
+; SSE3-NEXT:    psrlw $1, %xmm0
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT:    psubd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
+; SSE3-NEXT:    psubb %xmm0, %xmm2
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE3-NEXT:    movdqa %xmm2, %xmm1
 ; SSE3-NEXT:    pand %xmm0, %xmm1
-; SSE3-NEXT:    psrld $2, %xmm2
+; SSE3-NEXT:    psrlw $2, %xmm2
 ; SSE3-NEXT:    pand %xmm0, %xmm2
-; SSE3-NEXT:    paddd %xmm1, %xmm2
+; SSE3-NEXT:    paddb %xmm1, %xmm2
 ; SSE3-NEXT:    movdqa %xmm2, %xmm0
-; SSE3-NEXT:    psrld $4, %xmm0
-; SSE3-NEXT:    paddd %xmm2, %xmm0
+; SSE3-NEXT:    psrlw $4, %xmm0
+; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE3-NEXT:    pxor %xmm1, %xmm1
 ; SSE3-NEXT:    movdqa %xmm0, %xmm2
@@ -808,18 +808,18 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
 ; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
 ; SSE2-NEXT:    pxor %xmm1, %xmm2
 ; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    psrld $1, %xmm0
+; SSE2-NEXT:    psrlw $1, %xmm0
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    psubd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
+; SSE2-NEXT:    psubb %xmm0, %xmm2
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NEXT:    movdqa %xmm2, %xmm1
 ; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    psrld $2, %xmm2
+; SSE2-NEXT:    psrlw $2, %xmm2
 ; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    paddd %xmm1, %xmm2
+; SSE2-NEXT:    paddb %xmm1, %xmm2
 ; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    psrld $4, %xmm0
-; SSE2-NEXT:    paddd %xmm2, %xmm0
+; SSE2-NEXT:    psrlw $4, %xmm0
+; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    pxor %xmm1, %xmm1
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
@@ -850,18 +850,18 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
 ; SSE3-NEXT:    pcmpeqd %xmm2, %xmm2
 ; SSE3-NEXT:    pxor %xmm1, %xmm2
 ; SSE3-NEXT:    movdqa %xmm2, %xmm0
-; SSE3-NEXT:    psrld $1, %xmm0
+; SSE3-NEXT:    psrlw $1, %xmm0
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT:    psubd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
+; SSE3-NEXT:    psubb %xmm0, %xmm2
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE3-NEXT:    movdqa %xmm2, %xmm1
 ; SSE3-NEXT:    pand %xmm0, %xmm1
-; SSE3-NEXT:    psrld $2, %xmm2
+; SSE3-NEXT:    psrlw $2, %xmm2
 ; SSE3-NEXT:    pand %xmm0, %xmm2
-; SSE3-NEXT:    paddd %xmm1, %xmm2
+; SSE3-NEXT:    paddb %xmm1, %xmm2
 ; SSE3-NEXT:    movdqa %xmm2, %xmm0
-; SSE3-NEXT:    psrld $4, %xmm0
-; SSE3-NEXT:    paddd %xmm2, %xmm0
+; SSE3-NEXT:    psrlw $4, %xmm0
+; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE3-NEXT:    pxor %xmm1, %xmm1
 ; SSE3-NEXT:    movdqa %xmm0, %xmm2
@@ -1049,16 +1049,16 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    psrlw $1, %xmm0
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    psubw %xmm0, %xmm1
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE2-NEXT:    psubb %xmm0, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
 ; SSE2-NEXT:    pand %xmm0, %xmm2
 ; SSE2-NEXT:    psrlw $2, %xmm1
 ; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    paddw %xmm2, %xmm1
+; SSE2-NEXT:    paddb %xmm2, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
 ; SSE2-NEXT:    psrlw $4, %xmm2
-; SSE2-NEXT:    paddw %xmm1, %xmm2
+; SSE2-NEXT:    paddb %xmm1, %xmm2
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
 ; SSE2-NEXT:    movdqa %xmm2, %xmm0
 ; SSE2-NEXT:    psllw $8, %xmm0
@@ -1085,16 +1085,16 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
 ; SSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSE3-NEXT:    psrlw $1, %xmm0
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT:    psubw %xmm0, %xmm1
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE3-NEXT:    psubb %xmm0, %xmm1
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE3-NEXT:    movdqa %xmm1, %xmm2
 ; SSE3-NEXT:    pand %xmm0, %xmm2
 ; SSE3-NEXT:    psrlw $2, %xmm1
 ; SSE3-NEXT:    pand %xmm0, %xmm1
-; SSE3-NEXT:    paddw %xmm2, %xmm1
+; SSE3-NEXT:    paddb %xmm2, %xmm1
 ; SSE3-NEXT:    movdqa %xmm1, %xmm2
 ; SSE3-NEXT:    psrlw $4, %xmm2
-; SSE3-NEXT:    paddw %xmm1, %xmm2
+; SSE3-NEXT:    paddb %xmm1, %xmm2
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm2
 ; SSE3-NEXT:    movdqa %xmm2, %xmm0
 ; SSE3-NEXT:    psllw $8, %xmm0
@@ -1255,16 +1255,16 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    psrlw $1, %xmm0
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    psubw %xmm0, %xmm1
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE2-NEXT:    psubb %xmm0, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
 ; SSE2-NEXT:    pand %xmm0, %xmm2
 ; SSE2-NEXT:    psrlw $2, %xmm1
 ; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    paddw %xmm2, %xmm1
+; SSE2-NEXT:    paddb %xmm2, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
 ; SSE2-NEXT:    psrlw $4, %xmm2
-; SSE2-NEXT:    paddw %xmm1, %xmm2
+; SSE2-NEXT:    paddb %xmm1, %xmm2
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
 ; SSE2-NEXT:    movdqa %xmm2, %xmm0
 ; SSE2-NEXT:    psllw $8, %xmm0
@@ -1291,16 +1291,16 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
 ; SSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSE3-NEXT:    psrlw $1, %xmm0
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT:    psubw %xmm0, %xmm1
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE3-NEXT:    psubb %xmm0, %xmm1
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE3-NEXT:    movdqa %xmm1, %xmm2
 ; SSE3-NEXT:    pand %xmm0, %xmm2
 ; SSE3-NEXT:    psrlw $2, %xmm1
 ; SSE3-NEXT:    pand %xmm0, %xmm1
-; SSE3-NEXT:    paddw %xmm2, %xmm1
+; SSE3-NEXT:    paddb %xmm2, %xmm1
 ; SSE3-NEXT:    movdqa %xmm1, %xmm2
 ; SSE3-NEXT:    psrlw $4, %xmm2
-; SSE3-NEXT:    paddw %xmm1, %xmm2
+; SSE3-NEXT:    paddb %xmm1, %xmm2
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm2
 ; SSE3-NEXT:    movdqa %xmm2, %xmm0
 ; SSE3-NEXT:    psllw $8, %xmm0
diff --git a/test/CodeGen/X86/vector-popcnt-128.ll b/test/CodeGen/X86/vector-popcnt-128.ll
index df42ebf2728..16539f1b2d4 100644
--- a/test/CodeGen/X86/vector-popcnt-128.ll
+++ b/test/CodeGen/X86/vector-popcnt-128.ll
@@ -14,18 +14,18 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
 ; SSE2-LABEL: testv2i64:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrlq $1, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm1
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT:    psubq %xmm1, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323]
+; SSE2-NEXT:    psubb %xmm1, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; SSE2-NEXT:    pand %xmm1, %xmm2
-; SSE2-NEXT:    psrlq $2, %xmm0
+; SSE2-NEXT:    psrlw $2, %xmm0
 ; SSE2-NEXT:    pand %xmm1, %xmm0
-; SSE2-NEXT:    paddq %xmm2, %xmm0
+; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrlq $4, %xmm1
-; SSE2-NEXT:    paddq %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $4, %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
 ; SSE2-NEXT:    psadbw %xmm0, %xmm1
@@ -35,18 +35,18 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
 ; SSE3-LABEL: testv2i64:
 ; SSE3:       # %bb.0:
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSE3-NEXT:    psrlq $1, %xmm1
+; SSE3-NEXT:    psrlw $1, %xmm1
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
-; SSE3-NEXT:    psubq %xmm1, %xmm0
-; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323]
+; SSE3-NEXT:    psubb %xmm1, %xmm0
+; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE3-NEXT:    movdqa %xmm0, %xmm2
 ; SSE3-NEXT:    pand %xmm1, %xmm2
-; SSE3-NEXT:    psrlq $2, %xmm0
+; SSE3-NEXT:    psrlw $2, %xmm0
 ; SSE3-NEXT:    pand %xmm1, %xmm0
-; SSE3-NEXT:    paddq %xmm2, %xmm0
+; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSE3-NEXT:    psrlq $4, %xmm1
-; SSE3-NEXT:    paddq %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $4, %xmm1
+; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
 ; SSE3-NEXT:    psadbw %xmm0, %xmm1
@@ -128,28 +128,16 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
 ;
 ; BITALG_NOVLX-LABEL: testv2i64:
 ; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; BITALG_NOVLX-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; BITALG_NOVLX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: testv2i64:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; BITALG-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; BITALG-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; BITALG-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; BITALG-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; BITALG-NEXT:    vpopcntb %xmm0, %xmm0
 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
 ; BITALG-NEXT:    retq
@@ -161,18 +149,18 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ; SSE2-LABEL: testv4i32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrld $1, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm1
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT:    psubd %xmm1, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [858993459,858993459,858993459,858993459]
+; SSE2-NEXT:    psubb %xmm1, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; SSE2-NEXT:    pand %xmm1, %xmm2
-; SSE2-NEXT:    psrld $2, %xmm0
+; SSE2-NEXT:    psrlw $2, %xmm0
 ; SSE2-NEXT:    pand %xmm1, %xmm0
-; SSE2-NEXT:    paddd %xmm2, %xmm0
+; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrld $4, %xmm1
-; SSE2-NEXT:    paddd %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $4, %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
@@ -187,18 +175,18 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ; SSE3-LABEL: testv4i32:
 ; SSE3:       # %bb.0:
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSE3-NEXT:    psrld $1, %xmm1
+; SSE3-NEXT:    psrlw $1, %xmm1
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
-; SSE3-NEXT:    psubd %xmm1, %xmm0
-; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [858993459,858993459,858993459,858993459]
+; SSE3-NEXT:    psubb %xmm1, %xmm0
+; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE3-NEXT:    movdqa %xmm0, %xmm2
 ; SSE3-NEXT:    pand %xmm1, %xmm2
-; SSE3-NEXT:    psrld $2, %xmm0
+; SSE3-NEXT:    psrlw $2, %xmm0
 ; SSE3-NEXT:    pand %xmm1, %xmm0
-; SSE3-NEXT:    paddd %xmm2, %xmm0
+; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSE3-NEXT:    psrld $4, %xmm1
-; SSE3-NEXT:    paddd %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $4, %xmm1
+; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
 ; SSE3-NEXT:    movdqa %xmm1, %xmm2
@@ -303,32 +291,20 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ;
 ; BITALG_NOVLX-LABEL: testv4i32:
 ; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; BITALG_NOVLX-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; BITALG_NOVLX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
 ; BITALG_NOVLX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: testv4i32:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; BITALG-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; BITALG-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; BITALG-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; BITALG-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; BITALG-NEXT:    vpopcntb %xmm0, %xmm0
 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; BITALG-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
@@ -346,16 +322,16 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $1, %xmm1
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT:    psubw %xmm1, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE2-NEXT:    psubb %xmm1, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; SSE2-NEXT:    pand %xmm1, %xmm2
 ; SSE2-NEXT:    psrlw $2, %xmm0
 ; SSE2-NEXT:    pand %xmm1, %xmm0
-; SSE2-NEXT:    paddw %xmm2, %xmm0
+; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddw %xmm0, %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    psllw $8, %xmm0
@@ -368,16 +344,16 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $1, %xmm1
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
-; SSE3-NEXT:    psubw %xmm1, %xmm0
-; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE3-NEXT:    psubb %xmm1, %xmm0
+; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE3-NEXT:    movdqa %xmm0, %xmm2
 ; SSE3-NEXT:    pand %xmm1, %xmm2
 ; SSE3-NEXT:    psrlw $2, %xmm0
 ; SSE3-NEXT:    pand %xmm1, %xmm0
-; SSE3-NEXT:    paddw %xmm2, %xmm0
+; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddw %xmm0, %xmm1
+; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
 ; SSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSE3-NEXT:    psllw $8, %xmm0
diff --git a/test/CodeGen/X86/vector-popcnt-256.ll b/test/CodeGen/X86/vector-popcnt-256.ll
index b2cc2f1ebed..570f59673d1 100644
--- a/test/CodeGen/X86/vector-popcnt-256.ll
+++ b/test/CodeGen/X86/vector-popcnt-256.ll
@@ -58,28 +58,15 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
 ;
 ; BITALG_NOVLX-LABEL: testv4i64:
 ; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; BITALG_NOVLX-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; BITALG_NOVLX-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: testv4i64:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; BITALG-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; BITALG-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; BITALG-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; BITALG-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; BITALG-NEXT:    vpopcntb %ymm0, %ymm0
 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
 ; BITALG-NEXT:    retq
@@ -151,14 +138,8 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ;
 ; BITALG_NOVLX-LABEL: testv8i32:
 ; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; BITALG_NOVLX-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; BITALG_NOVLX-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
 ; BITALG_NOVLX-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
@@ -169,14 +150,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ;
 ; BITALG-LABEL: testv8i32:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; BITALG-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; BITALG-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; BITALG-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; BITALG-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; BITALG-NEXT:    vpopcntb %ymm0, %ymm0
 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
 ; BITALG-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
diff --git a/test/CodeGen/X86/vector-popcnt-512.ll b/test/CodeGen/X86/vector-popcnt-512.ll
index df5edc13c3e..eae9e6c79bd 100644
--- a/test/CodeGen/X86/vector-popcnt-512.ll
+++ b/test/CodeGen/X86/vector-popcnt-512.ll
@@ -50,14 +50,7 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
 ;
 ; BITALG-LABEL: testv8i64:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; BITALG-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
-; BITALG-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; BITALG-NEXT:    vpandq %zmm1, %zmm0, %zmm0
-; BITALG-NEXT:    vpshufb %zmm0, %zmm3, %zmm0
-; BITALG-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
+; BITALG-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT:    retq
@@ -122,14 +115,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
 ;
 ; BITALG-LABEL: testv16i32:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; BITALG-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
-; BITALG-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; BITALG-NEXT:    vpandq %zmm1, %zmm0, %zmm0
-; BITALG-NEXT:    vpshufb %zmm0, %zmm3, %zmm0
-; BITALG-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
+; BITALG-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT:    vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
 ; BITALG-NEXT:    vpsadbw %zmm1, %zmm2, %zmm2
diff --git a/test/CodeGen/X86/vector-tzcnt-128.ll b/test/CodeGen/X86/vector-tzcnt-128.ll
index d19c10d68bc..a532794f89d 100644
--- a/test/CodeGen/X86/vector-tzcnt-128.ll
+++ b/test/CodeGen/X86/vector-tzcnt-128.ll
@@ -25,18 +25,18 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
 ; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
 ; SSE2-NEXT:    paddq %xmm2, %xmm3
 ; SSE2-NEXT:    movdqa %xmm3, %xmm0
-; SSE2-NEXT:    psrlq $1, %xmm0
+; SSE2-NEXT:    psrlw $1, %xmm0
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    psubq %xmm0, %xmm3
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; SSE2-NEXT:    psubb %xmm0, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NEXT:    movdqa %xmm3, %xmm2
 ; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    psrlq $2, %xmm3
+; SSE2-NEXT:    psrlw $2, %xmm3
 ; SSE2-NEXT:    pand %xmm0, %xmm3
-; SSE2-NEXT:    paddq %xmm2, %xmm3
+; SSE2-NEXT:    paddb %xmm2, %xmm3
 ; SSE2-NEXT:    movdqa %xmm3, %xmm0
-; SSE2-NEXT:    psrlq $4, %xmm0
-; SSE2-NEXT:    paddq %xmm3, %xmm0
+; SSE2-NEXT:    psrlw $4, %xmm0
+; SSE2-NEXT:    paddb %xmm3, %xmm0
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    psadbw %xmm1, %xmm0
 ; SSE2-NEXT:    retq
@@ -50,18 +50,18 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
 ; SSE3-NEXT:    pcmpeqd %xmm3, %xmm3
 ; SSE3-NEXT:    paddq %xmm2, %xmm3
 ; SSE3-NEXT:    movdqa %xmm3, %xmm0
-; SSE3-NEXT:    psrlq $1, %xmm0
+; SSE3-NEXT:    psrlw $1, %xmm0
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT:    psubq %xmm0, %xmm3
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; SSE3-NEXT:    psubb %xmm0, %xmm3
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE3-NEXT:    movdqa %xmm3, %xmm2
 ; SSE3-NEXT:    pand %xmm0, %xmm2
-; SSE3-NEXT:    psrlq $2, %xmm3
+; SSE3-NEXT:    psrlw $2, %xmm3
 ; SSE3-NEXT:    pand %xmm0, %xmm3
-; SSE3-NEXT:    paddq %xmm2, %xmm3
+; SSE3-NEXT:    paddb %xmm2, %xmm3
 ; SSE3-NEXT:    movdqa %xmm3, %xmm0
-; SSE3-NEXT:    psrlq $4, %xmm0
-; SSE3-NEXT:    paddq %xmm3, %xmm0
+; SSE3-NEXT:    psrlw $4, %xmm0
+; SSE3-NEXT:    paddb %xmm3, %xmm0
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE3-NEXT:    psadbw %xmm1, %xmm0
 ; SSE3-NEXT:    retq
@@ -155,15 +155,9 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
 ; BITALG_NOVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 ; BITALG_NOVLX-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; BITALG_NOVLX-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; BITALG_NOVLX-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: testv2i64:
@@ -173,14 +167,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
 ; BITALG-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; BITALG-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 ; BITALG-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; BITALG-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; BITALG-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; BITALG-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; BITALG-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
+; BITALG-NEXT:    vpopcntb %xmm0, %xmm0
 ; BITALG-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
 ; BITALG-NEXT:    retq
 ;
@@ -217,18 +204,18 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
 ; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
 ; SSE2-NEXT:    paddq %xmm2, %xmm3
 ; SSE2-NEXT:    movdqa %xmm3, %xmm0
-; SSE2-NEXT:    psrlq $1, %xmm0
+; SSE2-NEXT:    psrlw $1, %xmm0
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    psubq %xmm0, %xmm3
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; SSE2-NEXT:    psubb %xmm0, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NEXT:    movdqa %xmm3, %xmm2
 ; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    psrlq $2, %xmm3
+; SSE2-NEXT:    psrlw $2, %xmm3
 ; SSE2-NEXT:    pand %xmm0, %xmm3
-; SSE2-NEXT:    paddq %xmm2, %xmm3
+; SSE2-NEXT:    paddb %xmm2, %xmm3
 ; SSE2-NEXT:    movdqa %xmm3, %xmm0
-; SSE2-NEXT:    psrlq $4, %xmm0
-; SSE2-NEXT:    paddq %xmm3, %xmm0
+; SSE2-NEXT:    psrlw $4, %xmm0
+; SSE2-NEXT:    paddb %xmm3, %xmm0
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    psadbw %xmm1, %xmm0
 ; SSE2-NEXT:    retq
@@ -242,18 +229,18 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
 ; SSE3-NEXT:    pcmpeqd %xmm3, %xmm3
 ; SSE3-NEXT:    paddq %xmm2, %xmm3
 ; SSE3-NEXT:    movdqa %xmm3, %xmm0
-; SSE3-NEXT:    psrlq $1, %xmm0
+; SSE3-NEXT:    psrlw $1, %xmm0
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT:    psubq %xmm0, %xmm3
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; SSE3-NEXT:    psubb %xmm0, %xmm3
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE3-NEXT:    movdqa %xmm3, %xmm2
 ; SSE3-NEXT:    pand %xmm0, %xmm2
-; SSE3-NEXT:    psrlq $2, %xmm3
+; SSE3-NEXT:    psrlw $2, %xmm3
 ; SSE3-NEXT:    pand %xmm0, %xmm3
-; SSE3-NEXT:    paddq %xmm2, %xmm3
+; SSE3-NEXT:    paddb %xmm2, %xmm3
 ; SSE3-NEXT:    movdqa %xmm3, %xmm0
-; SSE3-NEXT:    psrlq $4, %xmm0
-; SSE3-NEXT:    paddq %xmm3, %xmm0
+; SSE3-NEXT:    psrlw $4, %xmm0
+; SSE3-NEXT:    paddb %xmm3, %xmm0
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE3-NEXT:    psadbw %xmm1, %xmm0
 ; SSE3-NEXT:    retq
@@ -386,15 +373,9 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
 ; BITALG_NOVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 ; BITALG_NOVLX-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; BITALG_NOVLX-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; BITALG_NOVLX-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: testv2i64u:
@@ -404,14 +385,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
 ; BITALG-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; BITALG-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 ; BITALG-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; BITALG-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; BITALG-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; BITALG-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; BITALG-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
+; BITALG-NEXT:    vpopcntb %xmm0, %xmm0
 ; BITALG-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
 ; BITALG-NEXT:    retq
 ;
@@ -448,18 +422,18 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
 ; SSE2-NEXT:    paddd %xmm2, %xmm3
 ; SSE2-NEXT:    movdqa %xmm3, %xmm0
-; SSE2-NEXT:    psrld $1, %xmm0
+; SSE2-NEXT:    psrlw $1, %xmm0
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    psubd %xmm0, %xmm3
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
+; SSE2-NEXT:    psubb %xmm0, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NEXT:    movdqa %xmm3, %xmm2
 ; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    psrld $2, %xmm3
+; SSE2-NEXT:    psrlw $2, %xmm3
 ; SSE2-NEXT:    pand %xmm0, %xmm3
-; SSE2-NEXT:    paddd %xmm2, %xmm3
+; SSE2-NEXT:    paddb %xmm2, %xmm3
 ; SSE2-NEXT:    movdqa %xmm3, %xmm0
-; SSE2-NEXT:    psrld $4, %xmm0
-; SSE2-NEXT:    paddd %xmm3, %xmm0
+; SSE2-NEXT:    psrlw $4, %xmm0
+; SSE2-NEXT:    paddb %xmm3, %xmm0
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
@@ -478,18 +452,18 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ; SSE3-NEXT:    pcmpeqd %xmm3, %xmm3
 ; SSE3-NEXT:    paddd %xmm2, %xmm3
 ; SSE3-NEXT:    movdqa %xmm3, %xmm0
-; SSE3-NEXT:    psrld $1, %xmm0
+; SSE3-NEXT:    psrlw $1, %xmm0
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT:    psubd %xmm0, %xmm3
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
+; SSE3-NEXT:    psubb %xmm0, %xmm3
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE3-NEXT:    movdqa %xmm3, %xmm2
 ; SSE3-NEXT:    pand %xmm0, %xmm2
-; SSE3-NEXT:    psrld $2, %xmm3
+; SSE3-NEXT:    psrlw $2, %xmm3
 ; SSE3-NEXT:    pand %xmm0, %xmm3
-; SSE3-NEXT:    paddd %xmm2, %xmm3
+; SSE3-NEXT:    paddb %xmm2, %xmm3
 ; SSE3-NEXT:    movdqa %xmm3, %xmm0
-; SSE3-NEXT:    psrld $4, %xmm0
-; SSE3-NEXT:    paddd %xmm3, %xmm0
+; SSE3-NEXT:    psrlw $4, %xmm0
+; SSE3-NEXT:    paddb %xmm3, %xmm0
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm2
 ; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
@@ -667,19 +641,13 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ; BITALG_NOVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 ; BITALG_NOVLX-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; BITALG_NOVLX-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; BITALG_NOVLX-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
 ; BITALG_NOVLX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: testv4i32:
@@ -689,14 +657,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ; BITALG-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; BITALG-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 ; BITALG-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; BITALG-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; BITALG-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; BITALG-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; BITALG-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
+; BITALG-NEXT:    vpopcntb %xmm0, %xmm0
 ; BITALG-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; BITALG-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
 ; BITALG-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -742,18 +703,18 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
 ; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
 ; SSE2-NEXT:    paddd %xmm2, %xmm3
 ; SSE2-NEXT:    movdqa %xmm3, %xmm0
-; SSE2-NEXT:    psrld $1, %xmm0
+; SSE2-NEXT:    psrlw $1, %xmm0
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    psubd %xmm0, %xmm3
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
+; SSE2-NEXT:    psubb %xmm0, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NEXT:    movdqa %xmm3, %xmm2
 ; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    psrld $2, %xmm3
+; SSE2-NEXT:    psrlw $2, %xmm3
 ; SSE2-NEXT:    pand %xmm0, %xmm3
-; SSE2-NEXT:    paddd %xmm2, %xmm3
+; SSE2-NEXT:    paddb %xmm2, %xmm3
 ; SSE2-NEXT:    movdqa %xmm3, %xmm0
-; SSE2-NEXT:    psrld $4, %xmm0
-; SSE2-NEXT:    paddd %xmm3, %xmm0
+; SSE2-NEXT:    psrlw $4, %xmm0
+; SSE2-NEXT:    paddb %xmm3, %xmm0
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
@@ -772,18 +733,18 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
 ; SSE3-NEXT:    pcmpeqd %xmm3, %xmm3
 ; SSE3-NEXT:    paddd %xmm2, %xmm3
 ; SSE3-NEXT:    movdqa %xmm3, %xmm0
-; SSE3-NEXT:    psrld $1, %xmm0
+; SSE3-NEXT:    psrlw $1, %xmm0
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT:    psubd %xmm0, %xmm3
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
+; SSE3-NEXT:    psubb %xmm0, %xmm3
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE3-NEXT:    movdqa %xmm3, %xmm2
 ; SSE3-NEXT:    pand %xmm0, %xmm2
-; SSE3-NEXT:    psrld $2, %xmm3
+; SSE3-NEXT:    psrlw $2, %xmm3
 ; SSE3-NEXT:    pand %xmm0, %xmm3
-; SSE3-NEXT:    paddd %xmm2, %xmm3
+; SSE3-NEXT:    paddb %xmm2, %xmm3
 ; SSE3-NEXT:    movdqa %xmm3, %xmm0
-; SSE3-NEXT:    psrld $4, %xmm0
-; SSE3-NEXT:    paddd %xmm3, %xmm0
+; SSE3-NEXT:    psrlw $4, %xmm0
+; SSE3-NEXT:    paddb %xmm3, %xmm0
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm2
 ; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
@@ -938,19 +899,13 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
 ; BITALG_NOVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 ; BITALG_NOVLX-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; BITALG_NOVLX-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; BITALG_NOVLX-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
 ; BITALG_NOVLX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: testv4i32u:
@@ -960,14 +915,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
 ; BITALG-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; BITALG-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 ; BITALG-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; BITALG-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; BITALG-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; BITALG-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; BITALG-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
+; BITALG-NEXT:    vpopcntb %xmm0, %xmm0
 ; BITALG-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; BITALG-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
 ; BITALG-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -1014,16 +962,16 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $1, %xmm1
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT:    psubw %xmm1, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE2-NEXT:    psubb %xmm1, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; SSE2-NEXT:    pand %xmm1, %xmm2
 ; SSE2-NEXT:    psrlw $2, %xmm0
 ; SSE2-NEXT:    pand %xmm1, %xmm0
-; SSE2-NEXT:    paddw %xmm2, %xmm0
+; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddw %xmm0, %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    psllw $8, %xmm0
@@ -1041,16 +989,16 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $1, %xmm1
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
-; SSE3-NEXT:    psubw %xmm1, %xmm0
-; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE3-NEXT:    psubb %xmm1, %xmm0
+; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE3-NEXT:    movdqa %xmm0, %xmm2
 ; SSE3-NEXT:    pand %xmm1, %xmm2
 ; SSE3-NEXT:    psrlw $2, %xmm0
 ; SSE3-NEXT:    pand %xmm1, %xmm0
-; SSE3-NEXT:    paddw %xmm2, %xmm0
+; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddw %xmm0, %xmm1
+; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
 ; SSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSE3-NEXT:    psllw $8, %xmm0
@@ -1210,16 +1158,16 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $1, %xmm1
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT:    psubw %xmm1, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE2-NEXT:    psubb %xmm1, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; SSE2-NEXT:    pand %xmm1, %xmm2
 ; SSE2-NEXT:    psrlw $2, %xmm0
 ; SSE2-NEXT:    pand %xmm1, %xmm0
-; SSE2-NEXT:    paddw %xmm2, %xmm0
+; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddw %xmm0, %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    psllw $8, %xmm0
@@ -1237,16 +1185,16 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $1, %xmm1
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
-; SSE3-NEXT:    psubw %xmm1, %xmm0
-; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE3-NEXT:    psubb %xmm1, %xmm0
+; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE3-NEXT:    movdqa %xmm0, %xmm2
 ; SSE3-NEXT:    pand %xmm1, %xmm2
 ; SSE3-NEXT:    psrlw $2, %xmm0
 ; SSE3-NEXT:    pand %xmm1, %xmm0
-; SSE3-NEXT:    paddw %xmm2, %xmm0
+; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddw %xmm0, %xmm1
+; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
 ; SSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSE3-NEXT:    psllw $8, %xmm0
diff --git a/test/CodeGen/X86/vector-tzcnt-256.ll b/test/CodeGen/X86/vector-tzcnt-256.ll
index b1173fa4b88..cae0a2d605a 100644
--- a/test/CodeGen/X86/vector-tzcnt-256.ll
+++ b/test/CodeGen/X86/vector-tzcnt-256.ll
@@ -124,14 +124,7 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
 ; BITALG_NOVLX-NEXT:    vpand %ymm2, %ymm0, %ymm0
 ; BITALG_NOVLX-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
 ; BITALG_NOVLX-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
-; BITALG_NOVLX-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; BITALG_NOVLX-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
 ; BITALG_NOVLX-NEXT:    retq
 ;
@@ -142,14 +135,7 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
 ; BITALG-NEXT:    vpand %ymm2, %ymm0, %ymm0
 ; BITALG-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
 ; BITALG-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
-; BITALG-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; BITALG-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; BITALG-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; BITALG-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
+; BITALG-NEXT:    vpopcntb %ymm0, %ymm0
 ; BITALG-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
 ; BITALG-NEXT:    retq
 ;
@@ -270,14 +256,7 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
 ; BITALG_NOVLX-NEXT:    vpand %ymm2, %ymm0, %ymm0
 ; BITALG_NOVLX-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
 ; BITALG_NOVLX-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
-; BITALG_NOVLX-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; BITALG_NOVLX-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
 ; BITALG_NOVLX-NEXT:    retq
 ;
@@ -288,14 +267,7 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
 ; BITALG-NEXT:    vpand %ymm2, %ymm0, %ymm0
 ; BITALG-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
 ; BITALG-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
-; BITALG-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; BITALG-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; BITALG-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; BITALG-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
+; BITALG-NEXT:    vpopcntb %ymm0, %ymm0
 ; BITALG-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
 ; BITALG-NEXT:    retq
 ;
@@ -452,14 +424,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ; BITALG_NOVLX-NEXT:    vpand %ymm2, %ymm0, %ymm0
 ; BITALG_NOVLX-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
 ; BITALG_NOVLX-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
-; BITALG_NOVLX-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; BITALG_NOVLX-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
 ; BITALG_NOVLX-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
 ; BITALG_NOVLX-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
@@ -474,14 +439,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ; BITALG-NEXT:    vpand %ymm2, %ymm0, %ymm0
 ; BITALG-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
 ; BITALG-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
-; BITALG-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; BITALG-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; BITALG-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; BITALG-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
+; BITALG-NEXT:    vpopcntb %ymm0, %ymm0
 ; BITALG-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
 ; BITALG-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
 ; BITALG-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
@@ -623,14 +581,7 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
 ; BITALG_NOVLX-NEXT:    vpand %ymm2, %ymm0, %ymm0
 ; BITALG_NOVLX-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
 ; BITALG_NOVLX-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
-; BITALG_NOVLX-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; BITALG_NOVLX-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
 ; BITALG_NOVLX-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
 ; BITALG_NOVLX-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
@@ -645,14 +596,7 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
 ; BITALG-NEXT:    vpand %ymm2, %ymm0, %ymm0
 ; BITALG-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
 ; BITALG-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
-; BITALG-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; BITALG-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; BITALG-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; BITALG-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
+; BITALG-NEXT:    vpopcntb %ymm0, %ymm0
 ; BITALG-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
 ; BITALG-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
 ; BITALG-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
diff --git a/test/CodeGen/X86/vector-tzcnt-512.ll b/test/CodeGen/X86/vector-tzcnt-512.ll
index 37c86f7f81a..4a9fd82593a 100644
--- a/test/CodeGen/X86/vector-tzcnt-512.ll
+++ b/test/CodeGen/X86/vector-tzcnt-512.ll
@@ -87,14 +87,7 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
 ; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm0
 ; BITALG-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
 ; BITALG-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
-; BITALG-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm3
-; BITALG-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT:    vpshufb %zmm3, %zmm4, %zmm3
-; BITALG-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm0
-; BITALG-NEXT:    vpshufb %zmm0, %zmm4, %zmm0
-; BITALG-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
+; BITALG-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT:    retq
   %out = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %in, i1 0)
@@ -157,14 +150,7 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind {
 ; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm0
 ; BITALG-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
 ; BITALG-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
-; BITALG-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm3
-; BITALG-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT:    vpshufb %zmm3, %zmm4, %zmm3
-; BITALG-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm0
-; BITALG-NEXT:    vpshufb %zmm0, %zmm4, %zmm0
-; BITALG-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
+; BITALG-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT:    retq
   %out = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %in, i1 -1)
@@ -269,14 +255,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
 ; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm0
 ; BITALG-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
 ; BITALG-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
-; BITALG-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm3
-; BITALG-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT:    vpshufb %zmm3, %zmm4, %zmm3
-; BITALG-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm0
-; BITALG-NEXT:    vpshufb %zmm0, %zmm4, %zmm0
-; BITALG-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
+; BITALG-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG-NEXT:    vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
 ; BITALG-NEXT:    vpsadbw %zmm1, %zmm2, %zmm2
 ; BITALG-NEXT:    vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
@@ -347,14 +326,7 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
 ; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm0
 ; BITALG-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
 ; BITALG-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
-; BITALG-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm3
-; BITALG-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT:    vpshufb %zmm3, %zmm4, %zmm3
-; BITALG-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm0
-; BITALG-NEXT:    vpshufb %zmm0, %zmm4, %zmm0
-; BITALG-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
+; BITALG-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG-NEXT:    vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
 ; BITALG-NEXT:    vpsadbw %zmm1, %zmm2, %zmm2
 ; BITALG-NEXT:    vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
-- 
GitLab


From 08bc40e744f3f44228a9ce57778b5acff1f7fa77 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 12 Oct 2018 14:45:57 +0000
Subject: [PATCH 0102/1116] [SelectionDAG] Move VectorLegalizer::ExpandCTLZ
 codegen into SelectionDAGLegalize

Generalize SelectionDAGLegalize's CTLZ expansion to handle vectors - lets VectorLegalizer::ExpandCTLZ to just pass the expansion on instead of repeating the same codegen.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344349 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/LegalizeDAG.cpp      |  2 +-
 .../SelectionDAG/LegalizeVectorOps.cpp        | 27 ++-------
 test/CodeGen/X86/vec_ctbits.ll                | 58 +++++++++----------
 3 files changed, 34 insertions(+), 53 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 27875c11909..a6c0610f963 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2761,7 +2761,7 @@ SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op,
     return DAG.getNode(ISD::CTLZ, dl, Op.getValueType(), Op);
   case ISD::CTLZ: {
     EVT VT = Op.getValueType();
-    unsigned Len = VT.getSizeInBits();
+    unsigned Len = VT.getScalarSizeInBits();
 
     if (TLI.isOperationLegalOrCustom(ISD::CTLZ_ZERO_UNDEF, VT)) {
       EVT SetCCVT = getSetCCResultType(VT);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 3f38ed8a03c..852415647b1 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -1082,32 +1082,13 @@ SDValue VectorLegalizer::ExpandCTLZ(SDValue Op) {
     return DAG.getNode(ISD::CTLZ, DL, Op.getValueType(), Op.getOperand(0));
   }
 
-  // If CTPOP is available we can lower with a CTPOP based method:
-  // u16 ctlz(u16 x) {
-  //   x |= (x >> 1);
-  //   x |= (x >> 2);
-  //   x |= (x >> 4);
-  //   x |= (x >> 8);
-  //   return ctpop(~x);
-  // }
-  // Ref: "Hacker's Delight" by Henry Warren
+  // If we have the appropriate vector bit operations, it is better to use them
+  // than unrolling and expanding each component.
   if (isPowerOf2_32(NumBitsPerElt) &&
       TLI.isOperationLegalOrCustom(ISD::CTPOP, VT) &&
       TLI.isOperationLegalOrCustom(ISD::SRL, VT) &&
-      TLI.isOperationLegalOrCustomOrPromote(ISD::OR, VT) &&
-      TLI.isOperationLegalOrCustomOrPromote(ISD::XOR, VT)) {
-    SDLoc DL(Op);
-    SDValue Res = Op.getOperand(0);
-    EVT ShiftTy = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
-
-    for (unsigned i = 1; i != NumBitsPerElt; i *= 2)
-      Res = DAG.getNode(
-          ISD::OR, DL, VT, Res,
-          DAG.getNode(ISD::SRL, DL, VT, Res, DAG.getConstant(i, DL, ShiftTy)));
-
-    Res = DAG.getNOT(DL, Res, VT);
-    return DAG.getNode(ISD::CTPOP, DL, VT, Res);
-  }
+      TLI.isOperationLegalOrCustomOrPromote(ISD::OR, VT))
+    return Op;
 
   // Otherwise go ahead and unroll.
   return DAG.UnrollVectorOp(Op.getNode());
diff --git a/test/CodeGen/X86/vec_ctbits.ll b/test/CodeGen/X86/vec_ctbits.ll
index 978a40cbb26..40e101756ef 100644
--- a/test/CodeGen/X86/vec_ctbits.ll
+++ b/test/CodeGen/X86/vec_ctbits.ll
@@ -142,42 +142,42 @@ define <2 x i32> @promlz(<2 x i32> %a) nounwind {
 ; CHECK-LABEL: promlz:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    pxor %xmm1, %xmm1
-; CHECK-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-NEXT:    psrlq $1, %xmm2
-; CHECK-NEXT:    por %xmm0, %xmm2
-; CHECK-NEXT:    movdqa %xmm2, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-NEXT:    psrlq $1, %xmm1
+; CHECK-NEXT:    por %xmm0, %xmm1
+; CHECK-NEXT:    movdqa %xmm1, %xmm0
 ; CHECK-NEXT:    psrlq $2, %xmm0
-; CHECK-NEXT:    por %xmm2, %xmm0
-; CHECK-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-NEXT:    psrlq $4, %xmm2
-; CHECK-NEXT:    por %xmm0, %xmm2
-; CHECK-NEXT:    movdqa %xmm2, %xmm0
+; CHECK-NEXT:    por %xmm1, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-NEXT:    psrlq $4, %xmm1
+; CHECK-NEXT:    por %xmm0, %xmm1
+; CHECK-NEXT:    movdqa %xmm1, %xmm0
 ; CHECK-NEXT:    psrlq $8, %xmm0
-; CHECK-NEXT:    por %xmm2, %xmm0
-; CHECK-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-NEXT:    psrlq $16, %xmm2
-; CHECK-NEXT:    por %xmm0, %xmm2
-; CHECK-NEXT:    movdqa %xmm2, %xmm0
+; CHECK-NEXT:    por %xmm1, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-NEXT:    psrlq $16, %xmm1
+; CHECK-NEXT:    por %xmm0, %xmm1
+; CHECK-NEXT:    movdqa %xmm1, %xmm0
 ; CHECK-NEXT:    psrlq $32, %xmm0
-; CHECK-NEXT:    por %xmm2, %xmm0
-; CHECK-NEXT:    pcmpeqd %xmm2, %xmm2
-; CHECK-NEXT:    pxor %xmm0, %xmm2
-; CHECK-NEXT:    movdqa %xmm2, %xmm0
+; CHECK-NEXT:    por %xmm1, %xmm0
+; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
+; CHECK-NEXT:    pxor %xmm0, %xmm1
+; CHECK-NEXT:    movdqa %xmm1, %xmm0
 ; CHECK-NEXT:    psrlw $1, %xmm0
 ; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    psubb %xmm0, %xmm2
+; CHECK-NEXT:    psubb %xmm0, %xmm1
 ; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; CHECK-NEXT:    movdqa %xmm2, %xmm3
-; CHECK-NEXT:    pand %xmm0, %xmm3
-; CHECK-NEXT:    psrlw $2, %xmm2
+; CHECK-NEXT:    movdqa %xmm1, %xmm2
 ; CHECK-NEXT:    pand %xmm0, %xmm2
-; CHECK-NEXT:    paddb %xmm3, %xmm2
-; CHECK-NEXT:    movdqa %xmm2, %xmm0
-; CHECK-NEXT:    psrlw $4, %xmm0
-; CHECK-NEXT:    paddb %xmm2, %xmm0
-; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    psadbw %xmm1, %xmm0
+; CHECK-NEXT:    psrlw $2, %xmm1
+; CHECK-NEXT:    pand %xmm0, %xmm1
+; CHECK-NEXT:    paddb %xmm2, %xmm1
+; CHECK-NEXT:    movdqa %xmm1, %xmm2
+; CHECK-NEXT:    psrlw $4, %xmm2
+; CHECK-NEXT:    paddb %xmm1, %xmm2
+; CHECK-NEXT:    pand {{.*}}(%rip), %xmm2
+; CHECK-NEXT:    pxor %xmm0, %xmm0
+; CHECK-NEXT:    psadbw %xmm2, %xmm0
 ; CHECK-NEXT:    psubq {{.*}}(%rip), %xmm0
 ; CHECK-NEXT:    retq
   %c = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false)
-- 
GitLab


From 1cea19bc9824f1c266aacca66425f84ffd743dfb Mon Sep 17 00:00:00 2001
From: Eric Liu <ioeric@google.com>
Date: Fri, 12 Oct 2018 15:01:11 +0000
Subject: [PATCH 0103/1116] Fix unused variable warning after r344348

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344350 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index d2971d0f861..872d90ad004 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -25024,6 +25024,7 @@ static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
   MVT VT = Op.getSimpleValueType();
   MVT EltVT = VT.getVectorElementType();
   int NumElts = VT.getVectorNumElements();
+  (void)EltVT;
   assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
 
   // Implement a lookup table in register by using an algorithm based on:
-- 
GitLab


From 05638b189e1bb92871b498484b3b30666ef6e82f Mon Sep 17 00:00:00 2001
From: Guillaume Chatelet <gchatelet@google.com>
Date: Fri, 12 Oct 2018 15:12:22 +0000
Subject: [PATCH 0104/1116] [llvm-exegesis][NFC] Simplify code at the cost of
 small code duplication

Reviewers: courbet

Subscribers: tschuett, llvm-commits

Differential Revision: https://reviews.llvm.org/D53198

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344351 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-exegesis/lib/X86/Target.cpp | 118 +++++++++++++------------
 1 file changed, 60 insertions(+), 58 deletions(-)

diff --git a/tools/llvm-exegesis/lib/X86/Target.cpp b/tools/llvm-exegesis/lib/X86/Target.cpp
index 440996ad555..0e9a6de95ce 100644
--- a/tools/llvm-exegesis/lib/X86/Target.cpp
+++ b/tools/llvm-exegesis/lib/X86/Target.cpp
@@ -21,81 +21,83 @@ namespace exegesis {
 
 namespace {
 
-// Common code for X86 Uops and Latency runners.
-template <typename Impl> class X86SnippetGenerator : public Impl {
-  using Impl::Impl;
+static llvm::Error IsInvalidOpcode(const Instruction &Instr) {
+  const auto OpcodeName = Instr.Name;
+  if (OpcodeName.startswith("POPF") || OpcodeName.startswith("PUSHF") ||
+      OpcodeName.startswith("ADJCALLSTACK"))
+    return llvm::make_error<BenchmarkFailure>(
+        "Unsupported opcode: Push/Pop/AdjCallStack");
+  return llvm::Error::success();
+}
+
+static unsigned GetX86FPFlags(const Instruction &Instr) {
+  return Instr.Description->TSFlags & llvm::X86II::FPTypeMask;
+}
+
+class X86LatencySnippetGenerator : public LatencySnippetGenerator {
+public:
+  using LatencySnippetGenerator::LatencySnippetGenerator;
 
   llvm::Expected<CodeTemplate>
   generateCodeTemplate(const Instruction &Instr) const override {
-    // Test whether we can generate a snippet for this instruction.
-    const auto OpcodeName = Instr.Name;
-    if (OpcodeName.startswith("POPF") || OpcodeName.startswith("PUSHF") ||
-        OpcodeName.startswith("ADJCALLSTACK")) {
-      return llvm::make_error<BenchmarkFailure>(
-          "Unsupported opcode: Push/Pop/AdjCallStack");
-    }
+    if (auto E = IsInvalidOpcode(Instr))
+      return std::move(E);
 
-    // Handle X87.
-    const unsigned FPInstClass =
-        Instr.Description->TSFlags & llvm::X86II::FPTypeMask;
-    switch (FPInstClass) {
+    switch (GetX86FPFlags(Instr)) {
     case llvm::X86II::NotFP:
-      break;
+      return LatencySnippetGenerator::generateCodeTemplate(Instr);
     case llvm::X86II::ZeroArgFP:
-      return llvm::make_error<BenchmarkFailure>("Unsupported x87 ZeroArgFP");
     case llvm::X86II::OneArgFP:
-      return llvm::make_error<BenchmarkFailure>("Unsupported x87 OneArgFP");
+    case llvm::X86II::SpecialFP:
+    case llvm::X86II::CompareFP:
+    case llvm::X86II::CondMovFP:
+      return llvm::make_error<BenchmarkFailure>("Unsupported x87 Instruction");
     case llvm::X86II::OneArgFPRW:
-    case llvm::X86II::TwoArgFP: {
+    case llvm::X86II::TwoArgFP:
       // These are instructions like
       //   - `ST(0) = fsqrt(ST(0))` (OneArgFPRW)
       //   - `ST(0) = ST(0) + ST(i)` (TwoArgFP)
       // They are intrinsically serial and do not modify the state of the stack.
-      // We generate the same code for latency and uops.
-      return this->generateSelfAliasingCodeTemplate(Instr);
-    }
-    case llvm::X86II::CompareFP:
-      return Impl::handleCompareFP(Instr);
-    case llvm::X86II::CondMovFP:
-      return Impl::handleCondMovFP(Instr);
-    case llvm::X86II::SpecialFP:
-      return llvm::make_error<BenchmarkFailure>("Unsupported x87 SpecialFP");
+      return generateSelfAliasingCodeTemplate(Instr);
     default:
       llvm_unreachable("Unknown FP Type!");
     }
-
-    // Fallback to generic implementation.
-    return Impl::Base::generateCodeTemplate(Instr);
   }
 };
 
-class X86LatencyImpl : public LatencySnippetGenerator {
-protected:
-  using Base = LatencySnippetGenerator;
-  using Base::Base;
-  llvm::Expected<CodeTemplate> handleCompareFP(const Instruction &Instr) const {
-    return llvm::make_error<SnippetGeneratorFailure>(
-        "Unsupported x87 CompareFP");
-  }
-  llvm::Expected<CodeTemplate> handleCondMovFP(const Instruction &Instr) const {
-    return llvm::make_error<SnippetGeneratorFailure>(
-        "Unsupported x87 CondMovFP");
-  }
-};
+class X86UopsSnippetGenerator : public UopsSnippetGenerator {
+public:
+  using UopsSnippetGenerator::UopsSnippetGenerator;
 
-class X86UopsImpl : public UopsSnippetGenerator {
-protected:
-  using Base = UopsSnippetGenerator;
-  using Base::Base;
-  // We can compute uops for any FP instruction that does not grow or shrink the
-  // stack (either do not touch the stack or push as much as they pop).
-  llvm::Expected<CodeTemplate> handleCompareFP(const Instruction &Instr) const {
-    return generateUnconstrainedCodeTemplate(
-        Instr, "instruction does not grow/shrink the FP stack");
-  }
-  llvm::Expected<CodeTemplate> handleCondMovFP(const Instruction &Instr) const {
-    return generateUnconstrainedCodeTemplate(
-        Instr, "instruction does not grow/shrink the FP stack");
+  llvm::Expected<CodeTemplate>
+  generateCodeTemplate(const Instruction &Instr) const override {
+    if (auto E = IsInvalidOpcode(Instr))
+      return std::move(E);
+
+    switch (GetX86FPFlags(Instr)) {
+    case llvm::X86II::NotFP:
+      return UopsSnippetGenerator::generateCodeTemplate(Instr);
+    case llvm::X86II::ZeroArgFP:
+    case llvm::X86II::OneArgFP:
+    case llvm::X86II::SpecialFP:
+      return llvm::make_error<BenchmarkFailure>("Unsupported x87 Instruction");
+    case llvm::X86II::OneArgFPRW:
+    case llvm::X86II::TwoArgFP:
+      // These are instructions like
+      //   - `ST(0) = fsqrt(ST(0))` (OneArgFPRW)
+      //   - `ST(0) = ST(0) + ST(i)` (TwoArgFP)
+      // They are intrinsically serial and do not modify the state of the stack.
+      // We generate the same code for latency and uops.
+      return generateSelfAliasingCodeTemplate(Instr);
+    case llvm::X86II::CompareFP:
+    case llvm::X86II::CondMovFP:
+      // We can compute uops for any FP instruction that does not grow or shrink
+      // the stack (either do not touch the stack or push as much as they pop).
+      return generateUnconstrainedCodeTemplate(
+          Instr, "instruction does not grow/shrink the FP stack");
+    default:
+      llvm_unreachable("Unknown FP Type!");
+    }
   }
 };
 
@@ -330,12 +332,12 @@ class ExegesisX86Target : public ExegesisTarget {
 
   std::unique_ptr<SnippetGenerator>
   createLatencySnippetGenerator(const LLVMState &State) const override {
-    return llvm::make_unique<X86SnippetGenerator<X86LatencyImpl>>(State);
+    return llvm::make_unique<X86LatencySnippetGenerator>(State);
   }
 
   std::unique_ptr<SnippetGenerator>
   createUopsSnippetGenerator(const LLVMState &State) const override {
-    return llvm::make_unique<X86SnippetGenerator<X86UopsImpl>>(State);
+    return llvm::make_unique<X86UopsSnippetGenerator>(State);
   }
 
   bool matchesArch(llvm::Triple::ArchType Arch) const override {
-- 
GitLab


From a6b5202ae09893e943f38b5675f7292c99828b2a Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Fri, 12 Oct 2018 15:22:14 +0000
Subject: [PATCH 0105/1116] [AArch64][x86] add tests for trunc disguised as
 vector ops (PR39016); NFC

These correspond to the IR transform from:
D52439


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344353 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/AArch64/extract-insert.ll | 118 +++++++++++++++++++++++++
 test/CodeGen/X86/extract-insert.ll     |  55 +++++++++++-
 2 files changed, 172 insertions(+), 1 deletion(-)
 create mode 100644 test/CodeGen/AArch64/extract-insert.ll

diff --git a/test/CodeGen/AArch64/extract-insert.ll b/test/CodeGen/AArch64/extract-insert.ll
new file mode 100644
index 00000000000..91f6518edd8
--- /dev/null
+++ b/test/CodeGen/AArch64/extract-insert.ll
@@ -0,0 +1,118 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64_be-- < %s | FileCheck %s --check-prefix=BE
+; RUN: llc -mtriple=aarch64--    < %s | FileCheck %s --check-prefix=LE
+
+define i32 @trunc_i64_to_i32_le(i64 %x) {
+; BE-LABEL: trunc_i64_to_i32_le:
+; BE:       // %bb.0:
+; BE-NEXT:    fmov d0, x0
+; BE-NEXT:    rev64 v0.4s, v0.4s
+; BE-NEXT:    fmov w0, s0
+; BE-NEXT:    ret
+;
+; LE-LABEL: trunc_i64_to_i32_le:
+; LE:       // %bb.0:
+; LE-NEXT:    fmov d0, x0
+; LE-NEXT:    fmov w0, s0
+; LE-NEXT:    ret
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 0
+  %bc = bitcast <2 x i64> %ins to <4 x i32>
+  %ext = extractelement <4 x i32> %bc, i32 0
+  ret i32 %ext
+}
+
+define i32 @trunc_i64_to_i32_be(i64 %x) {
+; BE-LABEL: trunc_i64_to_i32_be:
+; BE:       // %bb.0:
+; BE-NEXT:    fmov d0, x0
+; BE-NEXT:    rev64 v0.4s, v0.4s
+; BE-NEXT:    mov w0, v0.s[1]
+; BE-NEXT:    ret
+;
+; LE-LABEL: trunc_i64_to_i32_be:
+; LE:       // %bb.0:
+; LE-NEXT:    fmov d0, x0
+; LE-NEXT:    mov w0, v0.s[1]
+; LE-NEXT:    ret
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 0
+  %bc = bitcast <2 x i64> %ins to <4 x i32>
+  %ext = extractelement <4 x i32> %bc, i32 1
+  ret i32 %ext
+}
+
+define i16 @trunc_i64_to_i16_le(i64 %x) {
+; BE-LABEL: trunc_i64_to_i16_le:
+; BE:       // %bb.0:
+; BE-NEXT:    fmov d0, x0
+; BE-NEXT:    rev64 v0.8h, v0.8h
+; BE-NEXT:    umov w0, v0.h[0]
+; BE-NEXT:    ret
+;
+; LE-LABEL: trunc_i64_to_i16_le:
+; LE:       // %bb.0:
+; LE-NEXT:    fmov d0, x0
+; LE-NEXT:    umov w0, v0.h[0]
+; LE-NEXT:    ret
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 0
+  %bc = bitcast <2 x i64> %ins to <8 x i16>
+  %ext = extractelement <8 x i16> %bc, i32 0
+  ret i16 %ext
+}
+
+define i16 @trunc_i64_to_i16_be(i64 %x) {
+; BE-LABEL: trunc_i64_to_i16_be:
+; BE:       // %bb.0:
+; BE-NEXT:    fmov d0, x0
+; BE-NEXT:    rev64 v0.8h, v0.8h
+; BE-NEXT:    umov w0, v0.h[3]
+; BE-NEXT:    ret
+;
+; LE-LABEL: trunc_i64_to_i16_be:
+; LE:       // %bb.0:
+; LE-NEXT:    fmov d0, x0
+; LE-NEXT:    umov w0, v0.h[3]
+; LE-NEXT:    ret
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 0
+  %bc = bitcast <2 x i64> %ins to <8 x i16>
+  %ext = extractelement <8 x i16> %bc, i32 3
+  ret i16 %ext
+}
+
+define i8 @trunc_i32_to_i8_le(i32 %x) {
+; BE-LABEL: trunc_i32_to_i8_le:
+; BE:       // %bb.0:
+; BE-NEXT:    fmov s0, w0
+; BE-NEXT:    rev32 v0.16b, v0.16b
+; BE-NEXT:    umov w0, v0.b[0]
+; BE-NEXT:    ret
+;
+; LE-LABEL: trunc_i32_to_i8_le:
+; LE:       // %bb.0:
+; LE-NEXT:    fmov s0, w0
+; LE-NEXT:    umov w0, v0.b[0]
+; LE-NEXT:    ret
+  %ins = insertelement <4 x i32> undef, i32 %x, i32 0
+  %bc = bitcast <4 x i32> %ins to <16 x i8>
+  %ext = extractelement <16 x i8> %bc, i32 0
+  ret i8 %ext
+}
+
+define i8 @trunc_i32_to_i8_be(i32 %x) {
+; BE-LABEL: trunc_i32_to_i8_be:
+; BE:       // %bb.0:
+; BE-NEXT:    fmov s0, w0
+; BE-NEXT:    rev32 v0.16b, v0.16b
+; BE-NEXT:    umov w0, v0.b[3]
+; BE-NEXT:    ret
+;
+; LE-LABEL: trunc_i32_to_i8_be:
+; LE:       // %bb.0:
+; LE-NEXT:    fmov s0, w0
+; LE-NEXT:    umov w0, v0.b[3]
+; LE-NEXT:    ret
+  %ins = insertelement <4 x i32> undef, i32 %x, i32 0
+  %bc = bitcast <4 x i32> %ins to <16 x i8>
+  %ext = extractelement <16 x i8> %bc, i32 3
+  ret i8 %ext
+}
+
diff --git a/test/CodeGen/X86/extract-insert.ll b/test/CodeGen/X86/extract-insert.ll
index b3fb50de718..2393e32ebf6 100644
--- a/test/CodeGen/X86/extract-insert.ll
+++ b/test/CodeGen/X86/extract-insert.ll
@@ -29,7 +29,7 @@ define i8 @extractelt_bitcast(i32 %x) nounwind {
 }
 
 ; TODO: This should have folded to avoid vector ops, but the transform
-; is guarded by 'hasOneUse'. That limitation apparently makes some AMDGPU 
+; is guarded by 'hasOneUse'. That limitation apparently makes some AMDGPU
 ; codegen better.
 
 define i8 @extractelt_bitcast_extra_use(i32 %x, <4 x i8>* %p) nounwind {
@@ -60,3 +60,56 @@ define i8 @extractelt_bitcast_extra_use(i32 %x, <4 x i8>* %p) nounwind {
   ret i8 %ext
 }
 
+define i32 @trunc_i64_to_i32_le(i64 %x) {
+; X86-LABEL: trunc_i64_to_i32_le:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: trunc_i64_to_i32_le:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %xmm0
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    retq
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 0
+  %bc = bitcast <2 x i64> %ins to <4 x i32>
+  %ext = extractelement <4 x i32> %bc, i32 0
+  ret i32 %ext
+}
+
+define i16 @trunc_i64_to_i16_le(i64 %x) {
+; X86-LABEL: trunc_i64_to_i16_le:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: trunc_i64_to_i16_le:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %xmm0
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 0
+  %bc = bitcast <2 x i64> %ins to <8 x i16>
+  %ext = extractelement <8 x i16> %bc, i32 0
+  ret i16 %ext
+}
+
+define i8 @trunc_i32_to_i8_le(i32 %x) {
+; X86-LABEL: trunc_i32_to_i8_le:
+; X86:       # %bb.0:
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: trunc_i32_to_i8_le:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %ins = insertelement <4 x i32> undef, i32 %x, i32 0
+  %bc = bitcast <4 x i32> %ins to <16 x i8>
+  %ext = extractelement <16 x i8> %bc, i32 0
+  ret i8 %ext
+}
+
-- 
GitLab


From 150e4ae6c16145b2c7da7f09fe253db8dbe5eb44 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 12 Oct 2018 15:48:47 +0000
Subject: [PATCH 0106/1116] Pull out repeated value types. NFCI.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344354 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 852415647b1..8cc37b5f233 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -1079,7 +1079,7 @@ SDValue VectorLegalizer::ExpandCTLZ(SDValue Op) {
   if (Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF &&
       TLI.isOperationLegalOrCustom(ISD::CTLZ, VT)) {
     SDLoc DL(Op);
-    return DAG.getNode(ISD::CTLZ, DL, Op.getValueType(), Op.getOperand(0));
+    return DAG.getNode(ISD::CTLZ, DL, VT, Op.getOperand(0));
   }
 
   // If we have the appropriate vector bit operations, it is better to use them
@@ -1095,10 +1095,12 @@ SDValue VectorLegalizer::ExpandCTLZ(SDValue Op) {
 }
 
 SDValue VectorLegalizer::ExpandCTTZ_ZERO_UNDEF(SDValue Op) {
+  EVT VT = Op.getValueType();
+
   // If the non-ZERO_UNDEF version is supported we can use that instead.
-  if (TLI.isOperationLegalOrCustom(ISD::CTTZ, Op.getValueType())) {
+  if (TLI.isOperationLegalOrCustom(ISD::CTTZ, VT)) {
     SDLoc DL(Op);
-    return DAG.getNode(ISD::CTTZ, DL, Op.getValueType(), Op.getOperand(0));
+    return DAG.getNode(ISD::CTTZ, DL, VT, Op.getOperand(0));
   }
 
   // Otherwise go ahead and unroll.
-- 
GitLab


From 7dac907c9c291ea9e3fb1ae1ca0e8223894485f4 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 12 Oct 2018 15:49:19 +0000
Subject: [PATCH 0107/1116] Pull out repeated value types. NFCI.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344355 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index a6c0610f963..56025110f0a 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2708,10 +2708,11 @@ SDValue SelectionDAGLegalize::ExpandBSWAP(SDValue Op, const SDLoc &dl) {
 /// Expand the specified bitcount instruction into operations.
 SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op,
                                              const SDLoc &dl) {
+  EVT VT = Op.getValueType();
+
   switch (Opc) {
   default: llvm_unreachable("Cannot expand this yet!");
   case ISD::CTPOP: {
-    EVT VT = Op.getValueType();
     EVT ShVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
     unsigned Len = VT.getSizeInBits();
 
@@ -2758,9 +2759,8 @@ SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op,
   }
   case ISD::CTLZ_ZERO_UNDEF:
     // This trivially expands to CTLZ.
-    return DAG.getNode(ISD::CTLZ, dl, Op.getValueType(), Op);
+    return DAG.getNode(ISD::CTLZ, dl, VT, Op);
   case ISD::CTLZ: {
-    EVT VT = Op.getValueType();
     unsigned Len = VT.getScalarSizeInBits();
 
     if (TLI.isOperationLegalOrCustom(ISD::CTLZ_ZERO_UNDEF, VT)) {
@@ -2792,9 +2792,8 @@ SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op,
   }
   case ISD::CTTZ_ZERO_UNDEF:
     // This trivially expands to CTTZ.
-    return DAG.getNode(ISD::CTTZ, dl, Op.getValueType(), Op);
+    return DAG.getNode(ISD::CTTZ, dl, VT, Op);
   case ISD::CTTZ: {
-    EVT VT = Op.getValueType();
     unsigned Len = VT.getSizeInBits();
 
     if (TLI.isOperationLegalOrCustom(ISD::CTTZ_ZERO_UNDEF, VT)) {
@@ -2818,7 +2817,7 @@ SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op,
     if (!TLI.isOperationLegalOrCustom(ISD::CTPOP, VT) &&
         TLI.isOperationLegalOrCustom(ISD::CTLZ, VT))
       return DAG.getNode(ISD::SUB, dl, VT,
-                         DAG.getConstant(VT.getSizeInBits(), dl, VT),
+                         DAG.getConstant(Len, dl, VT),
                          DAG.getNode(ISD::CTLZ, dl, VT, Tmp3));
     return DAG.getNode(ISD::CTPOP, dl, VT, Tmp3);
   }
-- 
GitLab


From 185de913d577af58e9ec8264b549a0bec097808f Mon Sep 17 00:00:00 2001
From: Zachary Turner <zturner@google.com>
Date: Fri, 12 Oct 2018 16:24:09 +0000
Subject: [PATCH 0108/1116] Make YAML quote forward slashes.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344357 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Support/YAMLTraits.h                         | 7 ++++++-
 test/CodeGen/AArch64/arm64-spill-remarks.ll               | 8 ++++----
 test/ObjectYAML/MachO/DWARF-BigEndian.yaml                | 4 ++--
 test/ObjectYAML/MachO/DWARF-LittleEndian.yaml             | 4 ++--
 test/ObjectYAML/MachO/DWARF-debug_str.yaml                | 2 +-
 test/ObjectYAML/MachO/dylib_dylinker_command.yaml         | 4 ++--
 test/Other/size-remarks.ll                                | 4 ++--
 test/Transforms/GVN/opt-remarks.ll                        | 6 +++---
 .../Transforms/Inline/optimization-remarks-passed-yaml.ll | 6 +++---
 test/Transforms/Inline/optimization-remarks-yaml.ll       | 8 ++++----
 unittests/Support/YAMLIOTest.cpp                          | 4 +++-
 11 files changed, 32 insertions(+), 25 deletions(-)

diff --git a/include/llvm/Support/YAMLTraits.h b/include/llvm/Support/YAMLTraits.h
index 5d029ad5ce9..6219755e83a 100644
--- a/include/llvm/Support/YAMLTraits.h
+++ b/include/llvm/Support/YAMLTraits.h
@@ -578,7 +578,6 @@ inline QuotingType needsQuotes(StringRef S) {
     // Safe scalar characters.
     case '_':
     case '-':
-    case '/':
     case '^':
     case '.':
     case ',':
@@ -595,6 +594,12 @@ inline QuotingType needsQuotes(StringRef S) {
     // DEL (0x7F) are excluded from the allowed character range.
     case 0x7F:
       return QuotingType::Double;
+    // Forward slash is allowed to be unquoted, but we quote it anyway.  We have
+    // many tests that use FileCheck against YAML output, and this output often
+    // contains paths.  If we quote backslashes but not forward slashes then
+    // paths will come out either quoted or unquoted depending on which platform
+    // the test is run on, making FileCheck comparisons difficult.
+    case '/':
     default: {
       // C0 control block (0x0 - 0x1F) is excluded from the allowed character
       // range.
diff --git a/test/CodeGen/AArch64/arm64-spill-remarks.ll b/test/CodeGen/AArch64/arm64-spill-remarks.ll
index 53a16ed748b..2d187a74445 100644
--- a/test/CodeGen/AArch64/arm64-spill-remarks.ll
+++ b/test/CodeGen/AArch64/arm64-spill-remarks.ll
@@ -38,7 +38,7 @@
 ; YAML: --- !Missed
 ; YAML: Pass:            regalloc
 ; YAML: Name:            LoopSpillReload
-; YAML: DebugLoc:        { File: /tmp/kk.c, Line: 3, Column: 20 }
+; YAML: DebugLoc:        { File: '/tmp/kk.c', Line: 3, Column: 20 }
 ; YAML: Function:        fpr128
 ; YAML: Hotness:         300
 ; YAML: Args:
@@ -51,7 +51,7 @@
 ; YAML: --- !Missed
 ; YAML: Pass:            regalloc
 ; YAML: Name:            LoopSpillReload
-; YAML: DebugLoc:        { File: /tmp/kk.c, Line: 2, Column: 20 }
+; YAML: DebugLoc:        { File: '/tmp/kk.c', Line: 2, Column: 20 }
 ; YAML: Function:        fpr128
 ; YAML: Hotness:         30000
 ; YAML: Args:
@@ -64,7 +64,7 @@
 ; YAML: --- !Missed
 ; YAML: Pass:            regalloc
 ; YAML: Name:            LoopSpillReload
-; YAML: DebugLoc:        { File: /tmp/kk.c, Line: 1, Column: 20 }
+; YAML: DebugLoc:        { File: '/tmp/kk.c', Line: 1, Column: 20 }
 ; YAML: Function:        fpr128
 ; YAML: Hotness:         300
 ; YAML: Args:
@@ -79,7 +79,7 @@
 ; THRESHOLD_YAML: --- !Missed
 ; THRESHOLD_YAML: Pass:            regalloc
 ; THRESHOLD_YAML: Name:            LoopSpillReload
-; THRESHOLD_YAML: DebugLoc:        { File: /tmp/kk.c, Line: 2, Column: 20 }
+; THRESHOLD_YAML: DebugLoc:        { File: '/tmp/kk.c', Line: 2, Column: 20 }
 ; THRESHOLD_YAML: Function:        fpr128
 ; THRESHOLD_YAML: Hotness:         30000
 ; THRESHOLD_YAML: Args:
diff --git a/test/ObjectYAML/MachO/DWARF-BigEndian.yaml b/test/ObjectYAML/MachO/DWARF-BigEndian.yaml
index adc95b95010..c6a45cd36ea 100644
--- a/test/ObjectYAML/MachO/DWARF-BigEndian.yaml
+++ b/test/ObjectYAML/MachO/DWARF-BigEndian.yaml
@@ -376,8 +376,8 @@ DWARF:
 #CHECK: DWARF:           
 #CHECK:   debug_str:       
 #CHECK:     - 'clang version 4.0.0 (trunk 290181) (llvm/trunk 290209)'
-#CHECK:     - ../compiler-rt/lib/builtins/absvdi2.c
-#CHECK:     - /Users/cbieneman/dev/open-source/llvm-build-rel
+#CHECK:     - '../compiler-rt/lib/builtins/absvdi2.c'
+#CHECK:     - '/Users/cbieneman/dev/open-source/llvm-build-rel'
 #CHECK:     - int
 #CHECK:     - di_int
 #CHECK:     - long long int
diff --git a/test/ObjectYAML/MachO/DWARF-LittleEndian.yaml b/test/ObjectYAML/MachO/DWARF-LittleEndian.yaml
index 1d6da66a073..1e136e67be1 100644
--- a/test/ObjectYAML/MachO/DWARF-LittleEndian.yaml
+++ b/test/ObjectYAML/MachO/DWARF-LittleEndian.yaml
@@ -365,8 +365,8 @@ DWARF:
 #CHECK: DWARF:           
 #CHECK:   debug_str:       
 #CHECK:     - 'clang version 4.0.0 (trunk 290181) (llvm/trunk 290209)'
-#CHECK:     - ../compiler-rt/lib/builtins/absvdi2.c
-#CHECK:     - /Users/cbieneman/dev/open-source/llvm-build-rel
+#CHECK:     - '../compiler-rt/lib/builtins/absvdi2.c'
+#CHECK:     - '/Users/cbieneman/dev/open-source/llvm-build-rel'
 #CHECK:     - int
 #CHECK:     - di_int
 #CHECK:     - long long int
diff --git a/test/ObjectYAML/MachO/DWARF-debug_str.yaml b/test/ObjectYAML/MachO/DWARF-debug_str.yaml
index 417a755642b..84c5e22d255 100644
--- a/test/ObjectYAML/MachO/DWARF-debug_str.yaml
+++ b/test/ObjectYAML/MachO/DWARF-debug_str.yaml
@@ -257,7 +257,7 @@ DWARF:
 #CHECK:     - ''
 #CHECK:     - 'clang version 4.0.0 (trunk 288677) (llvm/trunk 288676)'
 #CHECK:     - hello_world.c
-#CHECK:     - /Users/cbieneman/dev/open-source/llvm-build-rel
+#CHECK:     - '/Users/cbieneman/dev/open-source/llvm-build-rel'
 #CHECK:     - main
 #CHECK:     - argc
 #CHECK:     - argv
diff --git a/test/ObjectYAML/MachO/dylib_dylinker_command.yaml b/test/ObjectYAML/MachO/dylib_dylinker_command.yaml
index 9184e3c5143..5fc6afa536e 100644
--- a/test/ObjectYAML/MachO/dylib_dylinker_command.yaml
+++ b/test/ObjectYAML/MachO/dylib_dylinker_command.yaml
@@ -40,7 +40,7 @@ LoadCommands:
 #CHECK:   - cmd:             LC_LOAD_DYLINKER
 #CHECK:     cmdsize:         32
 #CHECK:     name:            12
-#CHECK:     PayloadString:   /usr/lib/dyld
+#CHECK:     PayloadString:   '/usr/lib/dyld'
 #CHECK:     ZeroPadBytes:    7
 #CHECK:   - cmd:             LC_LOAD_DYLIB
 #CHECK:     cmdsize:         48
@@ -58,5 +58,5 @@ LoadCommands:
 #CHECK:       timestamp:       2
 #CHECK:       current_version: 80349697
 #CHECK:       compatibility_version: 65536
-#CHECK:     PayloadString:   /usr/lib/libSystem.B.dylib
+#CHECK:     PayloadString:   '/usr/lib/libSystem.B.dylib'
 #CHECK:     ZeroPadBytes:    6
diff --git a/test/Other/size-remarks.ll b/test/Other/size-remarks.ll
index 34cb1202bb9..1e96dd02207 100644
--- a/test/Other/size-remarks.ll
+++ b/test/Other/size-remarks.ll
@@ -32,7 +32,7 @@
 ; CGSCC-NEXT: Name:            IRSizeChange
 ; CGSCC-NEXT: Function:
 ; CGSCC-NEXT: Args:
-; CGSCC-NEXT:  - Pass:            Function Integration/Inlining
+; CGSCC-NEXT:  - Pass:            'Function Integration/Inlining'
 ; CGSCC-NEXT:  - String:          ': IR instruction count changed from '
 ; CGSCC-NEXT:  - IRInstrsBefore:  '[[ORIG]]'
 ; CGSCC-NEXT:  - String:          ' to '
@@ -44,7 +44,7 @@
 ; CGSCC-NEXT: Name:            FunctionIRSizeChange
 ; CGSCC-NEXT: Function:
 ; CGSCC-NEXT: Args:
-; CGSCC-NEXT:   - Pass:            Function Integration/Inlining
+; CGSCC-NEXT:   - Pass:            'Function Integration/Inlining'
 ; CGSCC-NEXT:   - String:          ': Function: '
 ; CGSCC-NEXT:   - Function:        bar
 ; CGSCC-NEXT:   - String:          ': IR instruction count changed from '
diff --git a/test/Transforms/GVN/opt-remarks.ll b/test/Transforms/GVN/opt-remarks.ll
index 6919528bb83..120ff36f204 100644
--- a/test/Transforms/GVN/opt-remarks.ll
+++ b/test/Transforms/GVN/opt-remarks.ll
@@ -49,7 +49,7 @@
 ; YAML-NEXT: --- !Missed
 ; YAML-NEXT: Pass:            gvn
 ; YAML-NEXT: Name:            LoadClobbered
-; YAML-NEXT: DebugLoc:        { File: /tmp/s.c, Line: 3, Column: 3 }
+; YAML-NEXT: DebugLoc:        { File: '/tmp/s.c', Line: 3, Column: 3 }
 ; YAML-NEXT: Function:        may_alias
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - String:          'load of type '
@@ -57,10 +57,10 @@
 ; YAML-NEXT:   - String:          ' not eliminated'
 ; YAML-NEXT:   - String:          ' in favor of '
 ; YAML-NEXT:   - OtherAccess:     load
-; YAML-NEXT:     DebugLoc:        { File: /tmp/s.c, Line: 1, Column: 13 }
+; YAML-NEXT:     DebugLoc:        { File: '/tmp/s.c', Line: 1, Column: 13 }
 ; YAML-NEXT:   - String:          ' because it is clobbered by '
 ; YAML-NEXT:   - ClobberedBy:     store
-; YAML-NEXT:     DebugLoc:        { File: /tmp/s.c, Line: 2, Column: 10 }
+; YAML-NEXT:     DebugLoc:        { File: '/tmp/s.c', Line: 2, Column: 10 }
 ; YAML-NEXT: ...
 
 define i32 @arg(i32* %p, i32 %i) {
diff --git a/test/Transforms/Inline/optimization-remarks-passed-yaml.ll b/test/Transforms/Inline/optimization-remarks-passed-yaml.ll
index 0ac76354a2b..8692abfaf19 100644
--- a/test/Transforms/Inline/optimization-remarks-passed-yaml.ll
+++ b/test/Transforms/Inline/optimization-remarks-passed-yaml.ll
@@ -22,15 +22,15 @@
 ; YAML:      --- !Passed
 ; YAML-NEXT: Pass:            inline
 ; YAML-NEXT: Name:            Inlined
-; YAML-NEXT: DebugLoc:        { File: /tmp/s.c, Line: 4, Column: 10 }
+; YAML-NEXT: DebugLoc:        { File: '/tmp/s.c', Line: 4, Column: 10 }
 ; YAML-NEXT: Function:        bar
 ; YAML-NEXT: Hotness:         30
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - Callee: foo
-; YAML-NEXT:     DebugLoc:        { File: /tmp/s.c, Line: 1, Column: 0 }
+; YAML-NEXT:     DebugLoc:        { File: '/tmp/s.c', Line: 1, Column: 0 }
 ; YAML-NEXT:   - String: ' inlined into '
 ; YAML-NEXT:   - Caller: bar
-; YAML-NEXT:     DebugLoc:        { File: /tmp/s.c, Line: 3, Column: 0 }
+; YAML-NEXT:     DebugLoc:        { File: '/tmp/s.c', Line: 3, Column: 0 }
 ; YAML-NEXT:   - String: ' with '
 ; YAML-NEXT:   - String: '(cost='
 ; YAML-NEXT:   - Cost: '{{[0-9\-]+}}'
diff --git a/test/Transforms/Inline/optimization-remarks-yaml.ll b/test/Transforms/Inline/optimization-remarks-yaml.ll
index cb366dbbdd3..10a93f5cd79 100644
--- a/test/Transforms/Inline/optimization-remarks-yaml.ll
+++ b/test/Transforms/Inline/optimization-remarks-yaml.ll
@@ -52,27 +52,27 @@
 ; YAML:      --- !Missed
 ; YAML-NEXT: Pass:            inline
 ; YAML-NEXT: Name:            NoDefinition
-; YAML-NEXT: DebugLoc:        { File: /tmp/s.c, Line: 5, Column: 10 }
+; YAML-NEXT: DebugLoc:        { File: '/tmp/s.c', Line: 5, Column: 10 }
 ; YAML-NEXT: Function:        baz
 ; YAML-NEXT: Hotness:         30
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - Callee: foo
 ; YAML-NEXT:   - String: ' will not be inlined into '
 ; YAML-NEXT:   - Caller: baz
-; YAML-NEXT:     DebugLoc:        { File: /tmp/s.c, Line: 4, Column: 0 }
+; YAML-NEXT:     DebugLoc:        { File: '/tmp/s.c', Line: 4, Column: 0 }
 ; YAML-NEXT:   - String: ' because its definition is unavailable'
 ; YAML-NEXT: ...
 ; YAML-NEXT: --- !Missed
 ; YAML-NEXT: Pass:            inline
 ; YAML-NEXT: Name:            NoDefinition
-; YAML-NEXT: DebugLoc:        { File: /tmp/s.c, Line: 5, Column: 18 }
+; YAML-NEXT: DebugLoc:        { File: '/tmp/s.c', Line: 5, Column: 18 }
 ; YAML-NEXT: Function:        baz
 ; YAML-NEXT: Hotness:         30
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - Callee: bar
 ; YAML-NEXT:   - String: ' will not be inlined into '
 ; YAML-NEXT:   - Caller: baz
-; YAML-NEXT:     DebugLoc:        { File: /tmp/s.c, Line: 4, Column: 0 }
+; YAML-NEXT:     DebugLoc:        { File: '/tmp/s.c', Line: 4, Column: 0 }
 ; YAML-NEXT:   - String: ' because its definition is unavailable'
 ; YAML-NEXT: ...
 
diff --git a/unittests/Support/YAMLIOTest.cpp b/unittests/Support/YAMLIOTest.cpp
index 4530482ec80..94e9874147f 100644
--- a/unittests/Support/YAMLIOTest.cpp
+++ b/unittests/Support/YAMLIOTest.cpp
@@ -2543,7 +2543,9 @@ TEST(YAMLIO, TestEscaped) {
   // Single quote
   TestEscaped("@abc@", "'@abc@'");
   // No quote
-  TestEscaped("abc/", "abc/");
+  TestEscaped("abc", "abc");
+  // Forward slash quoted
+  TestEscaped("abc/", "'abc/'");
   // Double quote non-printable
   TestEscaped("\01@abc@", "\"\\x01@abc@\"");
   // Double quote inside single quote
-- 
GitLab


From 7b3c18864147ef2d8995cbecd44bf6f7659406af Mon Sep 17 00:00:00 2001
From: Zachary Turner <zturner@google.com>
Date: Fri, 12 Oct 2018 16:31:08 +0000
Subject: [PATCH 0109/1116] Revert "Make YAML quote forward slashes."

This reverts commit b86c16ad8c97dadc1f529da72a5bb74e9eaed344.

This is being reverted because I forgot to write a useful
commit message, so I'm going to resubmit it with an actual
commit message.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344358 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Support/YAMLTraits.h                         | 7 +------
 test/CodeGen/AArch64/arm64-spill-remarks.ll               | 8 ++++----
 test/ObjectYAML/MachO/DWARF-BigEndian.yaml                | 4 ++--
 test/ObjectYAML/MachO/DWARF-LittleEndian.yaml             | 4 ++--
 test/ObjectYAML/MachO/DWARF-debug_str.yaml                | 2 +-
 test/ObjectYAML/MachO/dylib_dylinker_command.yaml         | 4 ++--
 test/Other/size-remarks.ll                                | 4 ++--
 test/Transforms/GVN/opt-remarks.ll                        | 6 +++---
 .../Transforms/Inline/optimization-remarks-passed-yaml.ll | 6 +++---
 test/Transforms/Inline/optimization-remarks-yaml.ll       | 8 ++++----
 unittests/Support/YAMLIOTest.cpp                          | 4 +---
 11 files changed, 25 insertions(+), 32 deletions(-)

diff --git a/include/llvm/Support/YAMLTraits.h b/include/llvm/Support/YAMLTraits.h
index 6219755e83a..5d029ad5ce9 100644
--- a/include/llvm/Support/YAMLTraits.h
+++ b/include/llvm/Support/YAMLTraits.h
@@ -578,6 +578,7 @@ inline QuotingType needsQuotes(StringRef S) {
     // Safe scalar characters.
     case '_':
     case '-':
+    case '/':
     case '^':
     case '.':
     case ',':
@@ -594,12 +595,6 @@ inline QuotingType needsQuotes(StringRef S) {
     // DEL (0x7F) are excluded from the allowed character range.
     case 0x7F:
       return QuotingType::Double;
-    // Forward slash is allowed to be unquoted, but we quote it anyway.  We have
-    // many tests that use FileCheck against YAML output, and this output often
-    // contains paths.  If we quote backslashes but not forward slashes then
-    // paths will come out either quoted or unquoted depending on which platform
-    // the test is run on, making FileCheck comparisons difficult.
-    case '/':
     default: {
       // C0 control block (0x0 - 0x1F) is excluded from the allowed character
       // range.
diff --git a/test/CodeGen/AArch64/arm64-spill-remarks.ll b/test/CodeGen/AArch64/arm64-spill-remarks.ll
index 2d187a74445..53a16ed748b 100644
--- a/test/CodeGen/AArch64/arm64-spill-remarks.ll
+++ b/test/CodeGen/AArch64/arm64-spill-remarks.ll
@@ -38,7 +38,7 @@
 ; YAML: --- !Missed
 ; YAML: Pass:            regalloc
 ; YAML: Name:            LoopSpillReload
-; YAML: DebugLoc:        { File: '/tmp/kk.c', Line: 3, Column: 20 }
+; YAML: DebugLoc:        { File: /tmp/kk.c, Line: 3, Column: 20 }
 ; YAML: Function:        fpr128
 ; YAML: Hotness:         300
 ; YAML: Args:
@@ -51,7 +51,7 @@
 ; YAML: --- !Missed
 ; YAML: Pass:            regalloc
 ; YAML: Name:            LoopSpillReload
-; YAML: DebugLoc:        { File: '/tmp/kk.c', Line: 2, Column: 20 }
+; YAML: DebugLoc:        { File: /tmp/kk.c, Line: 2, Column: 20 }
 ; YAML: Function:        fpr128
 ; YAML: Hotness:         30000
 ; YAML: Args:
@@ -64,7 +64,7 @@
 ; YAML: --- !Missed
 ; YAML: Pass:            regalloc
 ; YAML: Name:            LoopSpillReload
-; YAML: DebugLoc:        { File: '/tmp/kk.c', Line: 1, Column: 20 }
+; YAML: DebugLoc:        { File: /tmp/kk.c, Line: 1, Column: 20 }
 ; YAML: Function:        fpr128
 ; YAML: Hotness:         300
 ; YAML: Args:
@@ -79,7 +79,7 @@
 ; THRESHOLD_YAML: --- !Missed
 ; THRESHOLD_YAML: Pass:            regalloc
 ; THRESHOLD_YAML: Name:            LoopSpillReload
-; THRESHOLD_YAML: DebugLoc:        { File: '/tmp/kk.c', Line: 2, Column: 20 }
+; THRESHOLD_YAML: DebugLoc:        { File: /tmp/kk.c, Line: 2, Column: 20 }
 ; THRESHOLD_YAML: Function:        fpr128
 ; THRESHOLD_YAML: Hotness:         30000
 ; THRESHOLD_YAML: Args:
diff --git a/test/ObjectYAML/MachO/DWARF-BigEndian.yaml b/test/ObjectYAML/MachO/DWARF-BigEndian.yaml
index c6a45cd36ea..adc95b95010 100644
--- a/test/ObjectYAML/MachO/DWARF-BigEndian.yaml
+++ b/test/ObjectYAML/MachO/DWARF-BigEndian.yaml
@@ -376,8 +376,8 @@ DWARF:
 #CHECK: DWARF:           
 #CHECK:   debug_str:       
 #CHECK:     - 'clang version 4.0.0 (trunk 290181) (llvm/trunk 290209)'
-#CHECK:     - '../compiler-rt/lib/builtins/absvdi2.c'
-#CHECK:     - '/Users/cbieneman/dev/open-source/llvm-build-rel'
+#CHECK:     - ../compiler-rt/lib/builtins/absvdi2.c
+#CHECK:     - /Users/cbieneman/dev/open-source/llvm-build-rel
 #CHECK:     - int
 #CHECK:     - di_int
 #CHECK:     - long long int
diff --git a/test/ObjectYAML/MachO/DWARF-LittleEndian.yaml b/test/ObjectYAML/MachO/DWARF-LittleEndian.yaml
index 1e136e67be1..1d6da66a073 100644
--- a/test/ObjectYAML/MachO/DWARF-LittleEndian.yaml
+++ b/test/ObjectYAML/MachO/DWARF-LittleEndian.yaml
@@ -365,8 +365,8 @@ DWARF:
 #CHECK: DWARF:           
 #CHECK:   debug_str:       
 #CHECK:     - 'clang version 4.0.0 (trunk 290181) (llvm/trunk 290209)'
-#CHECK:     - '../compiler-rt/lib/builtins/absvdi2.c'
-#CHECK:     - '/Users/cbieneman/dev/open-source/llvm-build-rel'
+#CHECK:     - ../compiler-rt/lib/builtins/absvdi2.c
+#CHECK:     - /Users/cbieneman/dev/open-source/llvm-build-rel
 #CHECK:     - int
 #CHECK:     - di_int
 #CHECK:     - long long int
diff --git a/test/ObjectYAML/MachO/DWARF-debug_str.yaml b/test/ObjectYAML/MachO/DWARF-debug_str.yaml
index 84c5e22d255..417a755642b 100644
--- a/test/ObjectYAML/MachO/DWARF-debug_str.yaml
+++ b/test/ObjectYAML/MachO/DWARF-debug_str.yaml
@@ -257,7 +257,7 @@ DWARF:
 #CHECK:     - ''
 #CHECK:     - 'clang version 4.0.0 (trunk 288677) (llvm/trunk 288676)'
 #CHECK:     - hello_world.c
-#CHECK:     - '/Users/cbieneman/dev/open-source/llvm-build-rel'
+#CHECK:     - /Users/cbieneman/dev/open-source/llvm-build-rel
 #CHECK:     - main
 #CHECK:     - argc
 #CHECK:     - argv
diff --git a/test/ObjectYAML/MachO/dylib_dylinker_command.yaml b/test/ObjectYAML/MachO/dylib_dylinker_command.yaml
index 5fc6afa536e..9184e3c5143 100644
--- a/test/ObjectYAML/MachO/dylib_dylinker_command.yaml
+++ b/test/ObjectYAML/MachO/dylib_dylinker_command.yaml
@@ -40,7 +40,7 @@ LoadCommands:
 #CHECK:   - cmd:             LC_LOAD_DYLINKER
 #CHECK:     cmdsize:         32
 #CHECK:     name:            12
-#CHECK:     PayloadString:   '/usr/lib/dyld'
+#CHECK:     PayloadString:   /usr/lib/dyld
 #CHECK:     ZeroPadBytes:    7
 #CHECK:   - cmd:             LC_LOAD_DYLIB
 #CHECK:     cmdsize:         48
@@ -58,5 +58,5 @@ LoadCommands:
 #CHECK:       timestamp:       2
 #CHECK:       current_version: 80349697
 #CHECK:       compatibility_version: 65536
-#CHECK:     PayloadString:   '/usr/lib/libSystem.B.dylib'
+#CHECK:     PayloadString:   /usr/lib/libSystem.B.dylib
 #CHECK:     ZeroPadBytes:    6
diff --git a/test/Other/size-remarks.ll b/test/Other/size-remarks.ll
index 1e96dd02207..34cb1202bb9 100644
--- a/test/Other/size-remarks.ll
+++ b/test/Other/size-remarks.ll
@@ -32,7 +32,7 @@
 ; CGSCC-NEXT: Name:            IRSizeChange
 ; CGSCC-NEXT: Function:
 ; CGSCC-NEXT: Args:
-; CGSCC-NEXT:  - Pass:            'Function Integration/Inlining'
+; CGSCC-NEXT:  - Pass:            Function Integration/Inlining
 ; CGSCC-NEXT:  - String:          ': IR instruction count changed from '
 ; CGSCC-NEXT:  - IRInstrsBefore:  '[[ORIG]]'
 ; CGSCC-NEXT:  - String:          ' to '
@@ -44,7 +44,7 @@
 ; CGSCC-NEXT: Name:            FunctionIRSizeChange
 ; CGSCC-NEXT: Function:
 ; CGSCC-NEXT: Args:
-; CGSCC-NEXT:   - Pass:            'Function Integration/Inlining'
+; CGSCC-NEXT:   - Pass:            Function Integration/Inlining
 ; CGSCC-NEXT:   - String:          ': Function: '
 ; CGSCC-NEXT:   - Function:        bar
 ; CGSCC-NEXT:   - String:          ': IR instruction count changed from '
diff --git a/test/Transforms/GVN/opt-remarks.ll b/test/Transforms/GVN/opt-remarks.ll
index 120ff36f204..6919528bb83 100644
--- a/test/Transforms/GVN/opt-remarks.ll
+++ b/test/Transforms/GVN/opt-remarks.ll
@@ -49,7 +49,7 @@
 ; YAML-NEXT: --- !Missed
 ; YAML-NEXT: Pass:            gvn
 ; YAML-NEXT: Name:            LoadClobbered
-; YAML-NEXT: DebugLoc:        { File: '/tmp/s.c', Line: 3, Column: 3 }
+; YAML-NEXT: DebugLoc:        { File: /tmp/s.c, Line: 3, Column: 3 }
 ; YAML-NEXT: Function:        may_alias
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - String:          'load of type '
@@ -57,10 +57,10 @@
 ; YAML-NEXT:   - String:          ' not eliminated'
 ; YAML-NEXT:   - String:          ' in favor of '
 ; YAML-NEXT:   - OtherAccess:     load
-; YAML-NEXT:     DebugLoc:        { File: '/tmp/s.c', Line: 1, Column: 13 }
+; YAML-NEXT:     DebugLoc:        { File: /tmp/s.c, Line: 1, Column: 13 }
 ; YAML-NEXT:   - String:          ' because it is clobbered by '
 ; YAML-NEXT:   - ClobberedBy:     store
-; YAML-NEXT:     DebugLoc:        { File: '/tmp/s.c', Line: 2, Column: 10 }
+; YAML-NEXT:     DebugLoc:        { File: /tmp/s.c, Line: 2, Column: 10 }
 ; YAML-NEXT: ...
 
 define i32 @arg(i32* %p, i32 %i) {
diff --git a/test/Transforms/Inline/optimization-remarks-passed-yaml.ll b/test/Transforms/Inline/optimization-remarks-passed-yaml.ll
index 8692abfaf19..0ac76354a2b 100644
--- a/test/Transforms/Inline/optimization-remarks-passed-yaml.ll
+++ b/test/Transforms/Inline/optimization-remarks-passed-yaml.ll
@@ -22,15 +22,15 @@
 ; YAML:      --- !Passed
 ; YAML-NEXT: Pass:            inline
 ; YAML-NEXT: Name:            Inlined
-; YAML-NEXT: DebugLoc:        { File: '/tmp/s.c', Line: 4, Column: 10 }
+; YAML-NEXT: DebugLoc:        { File: /tmp/s.c, Line: 4, Column: 10 }
 ; YAML-NEXT: Function:        bar
 ; YAML-NEXT: Hotness:         30
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - Callee: foo
-; YAML-NEXT:     DebugLoc:        { File: '/tmp/s.c', Line: 1, Column: 0 }
+; YAML-NEXT:     DebugLoc:        { File: /tmp/s.c, Line: 1, Column: 0 }
 ; YAML-NEXT:   - String: ' inlined into '
 ; YAML-NEXT:   - Caller: bar
-; YAML-NEXT:     DebugLoc:        { File: '/tmp/s.c', Line: 3, Column: 0 }
+; YAML-NEXT:     DebugLoc:        { File: /tmp/s.c, Line: 3, Column: 0 }
 ; YAML-NEXT:   - String: ' with '
 ; YAML-NEXT:   - String: '(cost='
 ; YAML-NEXT:   - Cost: '{{[0-9\-]+}}'
diff --git a/test/Transforms/Inline/optimization-remarks-yaml.ll b/test/Transforms/Inline/optimization-remarks-yaml.ll
index 10a93f5cd79..cb366dbbdd3 100644
--- a/test/Transforms/Inline/optimization-remarks-yaml.ll
+++ b/test/Transforms/Inline/optimization-remarks-yaml.ll
@@ -52,27 +52,27 @@
 ; YAML:      --- !Missed
 ; YAML-NEXT: Pass:            inline
 ; YAML-NEXT: Name:            NoDefinition
-; YAML-NEXT: DebugLoc:        { File: '/tmp/s.c', Line: 5, Column: 10 }
+; YAML-NEXT: DebugLoc:        { File: /tmp/s.c, Line: 5, Column: 10 }
 ; YAML-NEXT: Function:        baz
 ; YAML-NEXT: Hotness:         30
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - Callee: foo
 ; YAML-NEXT:   - String: ' will not be inlined into '
 ; YAML-NEXT:   - Caller: baz
-; YAML-NEXT:     DebugLoc:        { File: '/tmp/s.c', Line: 4, Column: 0 }
+; YAML-NEXT:     DebugLoc:        { File: /tmp/s.c, Line: 4, Column: 0 }
 ; YAML-NEXT:   - String: ' because its definition is unavailable'
 ; YAML-NEXT: ...
 ; YAML-NEXT: --- !Missed
 ; YAML-NEXT: Pass:            inline
 ; YAML-NEXT: Name:            NoDefinition
-; YAML-NEXT: DebugLoc:        { File: '/tmp/s.c', Line: 5, Column: 18 }
+; YAML-NEXT: DebugLoc:        { File: /tmp/s.c, Line: 5, Column: 18 }
 ; YAML-NEXT: Function:        baz
 ; YAML-NEXT: Hotness:         30
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - Callee: bar
 ; YAML-NEXT:   - String: ' will not be inlined into '
 ; YAML-NEXT:   - Caller: baz
-; YAML-NEXT:     DebugLoc:        { File: '/tmp/s.c', Line: 4, Column: 0 }
+; YAML-NEXT:     DebugLoc:        { File: /tmp/s.c, Line: 4, Column: 0 }
 ; YAML-NEXT:   - String: ' because its definition is unavailable'
 ; YAML-NEXT: ...
 
diff --git a/unittests/Support/YAMLIOTest.cpp b/unittests/Support/YAMLIOTest.cpp
index 94e9874147f..4530482ec80 100644
--- a/unittests/Support/YAMLIOTest.cpp
+++ b/unittests/Support/YAMLIOTest.cpp
@@ -2543,9 +2543,7 @@ TEST(YAMLIO, TestEscaped) {
   // Single quote
   TestEscaped("@abc@", "'@abc@'");
   // No quote
-  TestEscaped("abc", "abc");
-  // Forward slash quoted
-  TestEscaped("abc/", "'abc/'");
+  TestEscaped("abc/", "abc/");
   // Double quote non-printable
   TestEscaped("\01@abc@", "\"\\x01@abc@\"");
   // Double quote inside single quote
-- 
GitLab


From d5e155bacd7b191e759082a68424c0f2cc60f29f Mon Sep 17 00:00:00 2001
From: Zachary Turner <zturner@google.com>
Date: Fri, 12 Oct 2018 16:31:20 +0000
Subject: [PATCH 0110/1116] Make YAML quote forward slashes.

If you have the string /usr/bin, prior to this patch it would not
be quoted by our YAML serializer.  But a string like C:\src would
be, due to the presence of a backslash.  This makes the quoting
rules of basically every single file path different depending on
the path syntax (posix vs. Windows).

While technically not required by the YAML specification to quote
forward slashes, when the behavior of paths is inconsistent it
makes it difficult to portably write FileCheck lines that will
work with either kind of path.

Differential Revision: https://reviews.llvm.org/D53169

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344359 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Support/YAMLTraits.h                         | 7 ++++++-
 test/CodeGen/AArch64/arm64-spill-remarks.ll               | 8 ++++----
 test/ObjectYAML/MachO/DWARF-BigEndian.yaml                | 4 ++--
 test/ObjectYAML/MachO/DWARF-LittleEndian.yaml             | 4 ++--
 test/ObjectYAML/MachO/DWARF-debug_str.yaml                | 2 +-
 test/ObjectYAML/MachO/dylib_dylinker_command.yaml         | 4 ++--
 test/Other/size-remarks.ll                                | 4 ++--
 test/Transforms/GVN/opt-remarks.ll                        | 6 +++---
 .../Transforms/Inline/optimization-remarks-passed-yaml.ll | 6 +++---
 test/Transforms/Inline/optimization-remarks-yaml.ll       | 8 ++++----
 unittests/Support/YAMLIOTest.cpp                          | 4 +++-
 11 files changed, 32 insertions(+), 25 deletions(-)

diff --git a/include/llvm/Support/YAMLTraits.h b/include/llvm/Support/YAMLTraits.h
index 5d029ad5ce9..6219755e83a 100644
--- a/include/llvm/Support/YAMLTraits.h
+++ b/include/llvm/Support/YAMLTraits.h
@@ -578,7 +578,6 @@ inline QuotingType needsQuotes(StringRef S) {
     // Safe scalar characters.
     case '_':
     case '-':
-    case '/':
     case '^':
     case '.':
     case ',':
@@ -595,6 +594,12 @@ inline QuotingType needsQuotes(StringRef S) {
     // DEL (0x7F) are excluded from the allowed character range.
     case 0x7F:
       return QuotingType::Double;
+    // Forward slash is allowed to be unquoted, but we quote it anyway.  We have
+    // many tests that use FileCheck against YAML output, and this output often
+    // contains paths.  If we quote backslashes but not forward slashes then
+    // paths will come out either quoted or unquoted depending on which platform
+    // the test is run on, making FileCheck comparisons difficult.
+    case '/':
     default: {
       // C0 control block (0x0 - 0x1F) is excluded from the allowed character
       // range.
diff --git a/test/CodeGen/AArch64/arm64-spill-remarks.ll b/test/CodeGen/AArch64/arm64-spill-remarks.ll
index 53a16ed748b..2d187a74445 100644
--- a/test/CodeGen/AArch64/arm64-spill-remarks.ll
+++ b/test/CodeGen/AArch64/arm64-spill-remarks.ll
@@ -38,7 +38,7 @@
 ; YAML: --- !Missed
 ; YAML: Pass:            regalloc
 ; YAML: Name:            LoopSpillReload
-; YAML: DebugLoc:        { File: /tmp/kk.c, Line: 3, Column: 20 }
+; YAML: DebugLoc:        { File: '/tmp/kk.c', Line: 3, Column: 20 }
 ; YAML: Function:        fpr128
 ; YAML: Hotness:         300
 ; YAML: Args:
@@ -51,7 +51,7 @@
 ; YAML: --- !Missed
 ; YAML: Pass:            regalloc
 ; YAML: Name:            LoopSpillReload
-; YAML: DebugLoc:        { File: /tmp/kk.c, Line: 2, Column: 20 }
+; YAML: DebugLoc:        { File: '/tmp/kk.c', Line: 2, Column: 20 }
 ; YAML: Function:        fpr128
 ; YAML: Hotness:         30000
 ; YAML: Args:
@@ -64,7 +64,7 @@
 ; YAML: --- !Missed
 ; YAML: Pass:            regalloc
 ; YAML: Name:            LoopSpillReload
-; YAML: DebugLoc:        { File: /tmp/kk.c, Line: 1, Column: 20 }
+; YAML: DebugLoc:        { File: '/tmp/kk.c', Line: 1, Column: 20 }
 ; YAML: Function:        fpr128
 ; YAML: Hotness:         300
 ; YAML: Args:
@@ -79,7 +79,7 @@
 ; THRESHOLD_YAML: --- !Missed
 ; THRESHOLD_YAML: Pass:            regalloc
 ; THRESHOLD_YAML: Name:            LoopSpillReload
-; THRESHOLD_YAML: DebugLoc:        { File: /tmp/kk.c, Line: 2, Column: 20 }
+; THRESHOLD_YAML: DebugLoc:        { File: '/tmp/kk.c', Line: 2, Column: 20 }
 ; THRESHOLD_YAML: Function:        fpr128
 ; THRESHOLD_YAML: Hotness:         30000
 ; THRESHOLD_YAML: Args:
diff --git a/test/ObjectYAML/MachO/DWARF-BigEndian.yaml b/test/ObjectYAML/MachO/DWARF-BigEndian.yaml
index adc95b95010..c6a45cd36ea 100644
--- a/test/ObjectYAML/MachO/DWARF-BigEndian.yaml
+++ b/test/ObjectYAML/MachO/DWARF-BigEndian.yaml
@@ -376,8 +376,8 @@ DWARF:
 #CHECK: DWARF:           
 #CHECK:   debug_str:       
 #CHECK:     - 'clang version 4.0.0 (trunk 290181) (llvm/trunk 290209)'
-#CHECK:     - ../compiler-rt/lib/builtins/absvdi2.c
-#CHECK:     - /Users/cbieneman/dev/open-source/llvm-build-rel
+#CHECK:     - '../compiler-rt/lib/builtins/absvdi2.c'
+#CHECK:     - '/Users/cbieneman/dev/open-source/llvm-build-rel'
 #CHECK:     - int
 #CHECK:     - di_int
 #CHECK:     - long long int
diff --git a/test/ObjectYAML/MachO/DWARF-LittleEndian.yaml b/test/ObjectYAML/MachO/DWARF-LittleEndian.yaml
index 1d6da66a073..1e136e67be1 100644
--- a/test/ObjectYAML/MachO/DWARF-LittleEndian.yaml
+++ b/test/ObjectYAML/MachO/DWARF-LittleEndian.yaml
@@ -365,8 +365,8 @@ DWARF:
 #CHECK: DWARF:           
 #CHECK:   debug_str:       
 #CHECK:     - 'clang version 4.0.0 (trunk 290181) (llvm/trunk 290209)'
-#CHECK:     - ../compiler-rt/lib/builtins/absvdi2.c
-#CHECK:     - /Users/cbieneman/dev/open-source/llvm-build-rel
+#CHECK:     - '../compiler-rt/lib/builtins/absvdi2.c'
+#CHECK:     - '/Users/cbieneman/dev/open-source/llvm-build-rel'
 #CHECK:     - int
 #CHECK:     - di_int
 #CHECK:     - long long int
diff --git a/test/ObjectYAML/MachO/DWARF-debug_str.yaml b/test/ObjectYAML/MachO/DWARF-debug_str.yaml
index 417a755642b..84c5e22d255 100644
--- a/test/ObjectYAML/MachO/DWARF-debug_str.yaml
+++ b/test/ObjectYAML/MachO/DWARF-debug_str.yaml
@@ -257,7 +257,7 @@ DWARF:
 #CHECK:     - ''
 #CHECK:     - 'clang version 4.0.0 (trunk 288677) (llvm/trunk 288676)'
 #CHECK:     - hello_world.c
-#CHECK:     - /Users/cbieneman/dev/open-source/llvm-build-rel
+#CHECK:     - '/Users/cbieneman/dev/open-source/llvm-build-rel'
 #CHECK:     - main
 #CHECK:     - argc
 #CHECK:     - argv
diff --git a/test/ObjectYAML/MachO/dylib_dylinker_command.yaml b/test/ObjectYAML/MachO/dylib_dylinker_command.yaml
index 9184e3c5143..5fc6afa536e 100644
--- a/test/ObjectYAML/MachO/dylib_dylinker_command.yaml
+++ b/test/ObjectYAML/MachO/dylib_dylinker_command.yaml
@@ -40,7 +40,7 @@ LoadCommands:
 #CHECK:   - cmd:             LC_LOAD_DYLINKER
 #CHECK:     cmdsize:         32
 #CHECK:     name:            12
-#CHECK:     PayloadString:   /usr/lib/dyld
+#CHECK:     PayloadString:   '/usr/lib/dyld'
 #CHECK:     ZeroPadBytes:    7
 #CHECK:   - cmd:             LC_LOAD_DYLIB
 #CHECK:     cmdsize:         48
@@ -58,5 +58,5 @@ LoadCommands:
 #CHECK:       timestamp:       2
 #CHECK:       current_version: 80349697
 #CHECK:       compatibility_version: 65536
-#CHECK:     PayloadString:   /usr/lib/libSystem.B.dylib
+#CHECK:     PayloadString:   '/usr/lib/libSystem.B.dylib'
 #CHECK:     ZeroPadBytes:    6
diff --git a/test/Other/size-remarks.ll b/test/Other/size-remarks.ll
index 34cb1202bb9..1e96dd02207 100644
--- a/test/Other/size-remarks.ll
+++ b/test/Other/size-remarks.ll
@@ -32,7 +32,7 @@
 ; CGSCC-NEXT: Name:            IRSizeChange
 ; CGSCC-NEXT: Function:
 ; CGSCC-NEXT: Args:
-; CGSCC-NEXT:  - Pass:            Function Integration/Inlining
+; CGSCC-NEXT:  - Pass:            'Function Integration/Inlining'
 ; CGSCC-NEXT:  - String:          ': IR instruction count changed from '
 ; CGSCC-NEXT:  - IRInstrsBefore:  '[[ORIG]]'
 ; CGSCC-NEXT:  - String:          ' to '
@@ -44,7 +44,7 @@
 ; CGSCC-NEXT: Name:            FunctionIRSizeChange
 ; CGSCC-NEXT: Function:
 ; CGSCC-NEXT: Args:
-; CGSCC-NEXT:   - Pass:            Function Integration/Inlining
+; CGSCC-NEXT:   - Pass:            'Function Integration/Inlining'
 ; CGSCC-NEXT:   - String:          ': Function: '
 ; CGSCC-NEXT:   - Function:        bar
 ; CGSCC-NEXT:   - String:          ': IR instruction count changed from '
diff --git a/test/Transforms/GVN/opt-remarks.ll b/test/Transforms/GVN/opt-remarks.ll
index 6919528bb83..120ff36f204 100644
--- a/test/Transforms/GVN/opt-remarks.ll
+++ b/test/Transforms/GVN/opt-remarks.ll
@@ -49,7 +49,7 @@
 ; YAML-NEXT: --- !Missed
 ; YAML-NEXT: Pass:            gvn
 ; YAML-NEXT: Name:            LoadClobbered
-; YAML-NEXT: DebugLoc:        { File: /tmp/s.c, Line: 3, Column: 3 }
+; YAML-NEXT: DebugLoc:        { File: '/tmp/s.c', Line: 3, Column: 3 }
 ; YAML-NEXT: Function:        may_alias
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - String:          'load of type '
@@ -57,10 +57,10 @@
 ; YAML-NEXT:   - String:          ' not eliminated'
 ; YAML-NEXT:   - String:          ' in favor of '
 ; YAML-NEXT:   - OtherAccess:     load
-; YAML-NEXT:     DebugLoc:        { File: /tmp/s.c, Line: 1, Column: 13 }
+; YAML-NEXT:     DebugLoc:        { File: '/tmp/s.c', Line: 1, Column: 13 }
 ; YAML-NEXT:   - String:          ' because it is clobbered by '
 ; YAML-NEXT:   - ClobberedBy:     store
-; YAML-NEXT:     DebugLoc:        { File: /tmp/s.c, Line: 2, Column: 10 }
+; YAML-NEXT:     DebugLoc:        { File: '/tmp/s.c', Line: 2, Column: 10 }
 ; YAML-NEXT: ...
 
 define i32 @arg(i32* %p, i32 %i) {
diff --git a/test/Transforms/Inline/optimization-remarks-passed-yaml.ll b/test/Transforms/Inline/optimization-remarks-passed-yaml.ll
index 0ac76354a2b..8692abfaf19 100644
--- a/test/Transforms/Inline/optimization-remarks-passed-yaml.ll
+++ b/test/Transforms/Inline/optimization-remarks-passed-yaml.ll
@@ -22,15 +22,15 @@
 ; YAML:      --- !Passed
 ; YAML-NEXT: Pass:            inline
 ; YAML-NEXT: Name:            Inlined
-; YAML-NEXT: DebugLoc:        { File: /tmp/s.c, Line: 4, Column: 10 }
+; YAML-NEXT: DebugLoc:        { File: '/tmp/s.c', Line: 4, Column: 10 }
 ; YAML-NEXT: Function:        bar
 ; YAML-NEXT: Hotness:         30
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - Callee: foo
-; YAML-NEXT:     DebugLoc:        { File: /tmp/s.c, Line: 1, Column: 0 }
+; YAML-NEXT:     DebugLoc:        { File: '/tmp/s.c', Line: 1, Column: 0 }
 ; YAML-NEXT:   - String: ' inlined into '
 ; YAML-NEXT:   - Caller: bar
-; YAML-NEXT:     DebugLoc:        { File: /tmp/s.c, Line: 3, Column: 0 }
+; YAML-NEXT:     DebugLoc:        { File: '/tmp/s.c', Line: 3, Column: 0 }
 ; YAML-NEXT:   - String: ' with '
 ; YAML-NEXT:   - String: '(cost='
 ; YAML-NEXT:   - Cost: '{{[0-9\-]+}}'
diff --git a/test/Transforms/Inline/optimization-remarks-yaml.ll b/test/Transforms/Inline/optimization-remarks-yaml.ll
index cb366dbbdd3..10a93f5cd79 100644
--- a/test/Transforms/Inline/optimization-remarks-yaml.ll
+++ b/test/Transforms/Inline/optimization-remarks-yaml.ll
@@ -52,27 +52,27 @@
 ; YAML:      --- !Missed
 ; YAML-NEXT: Pass:            inline
 ; YAML-NEXT: Name:            NoDefinition
-; YAML-NEXT: DebugLoc:        { File: /tmp/s.c, Line: 5, Column: 10 }
+; YAML-NEXT: DebugLoc:        { File: '/tmp/s.c', Line: 5, Column: 10 }
 ; YAML-NEXT: Function:        baz
 ; YAML-NEXT: Hotness:         30
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - Callee: foo
 ; YAML-NEXT:   - String: ' will not be inlined into '
 ; YAML-NEXT:   - Caller: baz
-; YAML-NEXT:     DebugLoc:        { File: /tmp/s.c, Line: 4, Column: 0 }
+; YAML-NEXT:     DebugLoc:        { File: '/tmp/s.c', Line: 4, Column: 0 }
 ; YAML-NEXT:   - String: ' because its definition is unavailable'
 ; YAML-NEXT: ...
 ; YAML-NEXT: --- !Missed
 ; YAML-NEXT: Pass:            inline
 ; YAML-NEXT: Name:            NoDefinition
-; YAML-NEXT: DebugLoc:        { File: /tmp/s.c, Line: 5, Column: 18 }
+; YAML-NEXT: DebugLoc:        { File: '/tmp/s.c', Line: 5, Column: 18 }
 ; YAML-NEXT: Function:        baz
 ; YAML-NEXT: Hotness:         30
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - Callee: bar
 ; YAML-NEXT:   - String: ' will not be inlined into '
 ; YAML-NEXT:   - Caller: baz
-; YAML-NEXT:     DebugLoc:        { File: /tmp/s.c, Line: 4, Column: 0 }
+; YAML-NEXT:     DebugLoc:        { File: '/tmp/s.c', Line: 4, Column: 0 }
 ; YAML-NEXT:   - String: ' because its definition is unavailable'
 ; YAML-NEXT: ...
 
diff --git a/unittests/Support/YAMLIOTest.cpp b/unittests/Support/YAMLIOTest.cpp
index 4530482ec80..94e9874147f 100644
--- a/unittests/Support/YAMLIOTest.cpp
+++ b/unittests/Support/YAMLIOTest.cpp
@@ -2543,7 +2543,9 @@ TEST(YAMLIO, TestEscaped) {
   // Single quote
   TestEscaped("@abc@", "'@abc@'");
   // No quote
-  TestEscaped("abc/", "abc/");
+  TestEscaped("abc", "abc");
+  // Forward slash quoted
+  TestEscaped("abc/", "'abc/'");
   // Double quote non-printable
   TestEscaped("\01@abc@", "\"\\x01@abc@\"");
   // Double quote inside single quote
-- 
GitLab


From b609135fc92b83d634e9ae99e6a1b84ceb2f9c3c Mon Sep 17 00:00:00 2001
From: Nick Desaulniers <ndesaulniers@google.com>
Date: Fri, 12 Oct 2018 16:35:44 +0000
Subject: [PATCH 0111/1116] [MC][ELF] fix newly added test

Summary:
Reland of
- r344197 "[MC][ELF] compute entity size for explicit sections"
- r344206 "[MC][ELF] Fix section_mergeable_size.ll"
after being reverted in r344278 due to build breakages from not
specifying a target triple.

Move test from test/CodeGen/Generic/ to test/MC/ELF/.
Add explicit target triple so we don't try to run
this test on non ELF targets.

Reported: https://reviews.llvm.org/D53056#1261707

Reviewers: fhahn, rnk, espindola, NoQ

Reviewed By: fhahn, rnk

Subscribers: NoQ, MaskRay, rengolin, emaste, arichardson, llvm-commits, pirama, srhines

Differential Revision: https://reviews.llvm.org/D53146

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344360 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/TargetLoweringObjectFileImpl.cpp | 50 ++++++++++----------
 test/CodeGen/X86/section_mergeable_size.ll   |  3 ++
 2 files changed, 28 insertions(+), 25 deletions(-)
 create mode 100644 test/CodeGen/X86/section_mergeable_size.ll

diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index f6882c40531..b046cd81d6c 100644
--- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -506,6 +506,30 @@ static const MCSymbolELF *getAssociatedSymbol(const GlobalObject *GO,
   return OtherGO ? dyn_cast<MCSymbolELF>(TM.getSymbol(OtherGO)) : nullptr;
 }
 
+static unsigned getEntrySizeForKind(SectionKind Kind) {
+  if (Kind.isMergeable1ByteCString())
+    return 1;
+  else if (Kind.isMergeable2ByteCString())
+    return 2;
+  else if (Kind.isMergeable4ByteCString())
+    return 4;
+  else if (Kind.isMergeableConst4())
+    return 4;
+  else if (Kind.isMergeableConst8())
+    return 8;
+  else if (Kind.isMergeableConst16())
+    return 16;
+  else if (Kind.isMergeableConst32())
+    return 32;
+  else {
+    // We shouldn't have mergeable C strings or mergeable constants that we
+    // didn't handle above.
+    assert(!Kind.isMergeableCString() && "unknown string width");
+    assert(!Kind.isMergeableConst() && "unknown data width");
+    return 0;
+  }
+}
+
 MCSection *TargetLoweringObjectFileELF::getExplicitSectionGlobal(
     const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
   StringRef SectionName = GO->getSection();
@@ -550,7 +574,7 @@ MCSection *TargetLoweringObjectFileELF::getExplicitSectionGlobal(
 
   MCSectionELF *Section = getContext().getELFSection(
       SectionName, getELFSectionType(SectionName, Kind), Flags,
-      /*EntrySize=*/0, Group, UniqueID, AssociatedSymbol);
+      getEntrySizeForKind(Kind), Group, UniqueID, AssociatedSymbol);
   // Make sure that we did not get some other section with incompatible sh_link.
   // This should not be possible due to UniqueID code above.
   assert(Section->getAssociatedSymbol() == AssociatedSymbol &&
@@ -577,30 +601,6 @@ static StringRef getSectionPrefixForGlobal(SectionKind Kind) {
   return ".data.rel.ro";
 }
 
-static unsigned getEntrySizeForKind(SectionKind Kind) {
-  if (Kind.isMergeable1ByteCString())
-    return 1;
-  else if (Kind.isMergeable2ByteCString())
-    return 2;
-  else if (Kind.isMergeable4ByteCString())
-    return 4;
-  else if (Kind.isMergeableConst4())
-    return 4;
-  else if (Kind.isMergeableConst8())
-    return 8;
-  else if (Kind.isMergeableConst16())
-    return 16;
-  else if (Kind.isMergeableConst32())
-    return 32;
-  else {
-    // We shouldn't have mergeable C strings or mergeable constants that we
-    // didn't handle above.
-    assert(!Kind.isMergeableCString() && "unknown string width");
-    assert(!Kind.isMergeableConst() && "unknown data width");
-    return 0;
-  }
-}
-
 static MCSectionELF *selectELFSectionForGlobal(
     MCContext &Ctx, const GlobalObject *GO, SectionKind Kind, Mangler &Mang,
     const TargetMachine &TM, bool EmitUniqueSection, unsigned Flags,
diff --git a/test/CodeGen/X86/section_mergeable_size.ll b/test/CodeGen/X86/section_mergeable_size.ll
new file mode 100644
index 00000000000..73b70c47f03
--- /dev/null
+++ b/test/CodeGen/X86/section_mergeable_size.ll
@@ -0,0 +1,3 @@
+; RUN: llc -mtriple x86_64-linux-gnu < %s | FileCheck %s
+@a = internal unnamed_addr constant [1 x [1 x i32]] zeroinitializer, section ".init.rodata", align 4
+; CHECK: .init.rodata,"aM",{{[@%]}}progbits,4
-- 
GitLab


From 8f99faa030cc8542c434dc6fd982f38ba09655a3 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Fri, 12 Oct 2018 16:41:02 +0000
Subject: [PATCH 0112/1116] [x86] add and use fast horizontal vector math
 subtarget feature

This is the planned follow-up to D52997. Here we are reducing horizontal vector math codegen
by default. AMD Jaguar (btver2) should have no difference with this patch because it has
fast-hops. (If we want to set that bit for other CPUs, let me know.)

The code changes are small, but there are many test diffs. For files that are specifically
testing for hops, I added RUNs to distinguish fast/slow, so we can see the consequences
side-by-side. For files that are primarily concerned with codegen other than hops, I just
updated the CHECK lines to reflect the new default codegen.

To recap the recent horizontal op story:

1. Before rL343727, we were producing hops for all subtargets for a variety of patterns.
   Hops were likely not optimal for all targets though.
2. The IR improvement in r343727 exposed a hole in the backend hop pattern matching, so
   we reduced hop codegen for all subtargets. That was bad for Jaguar (PR39195).
3. We restored the hop codegen for all targets with rL344141. Good for Jaguar, but
   probably bad for other CPUs.
4. This patch allows us to distinguish when we want to produce hops, so everyone can be
   happy. I'm not sure if we have the best predicate here, but the intent is to undo the
   extra hop-iness that was enabled by r344141.

Differential Revision: https://reviews.llvm.org/D53095


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344361 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86.td                         |  12 +-
 lib/Target/X86/X86ISelLowering.cpp            |  22 +-
 lib/Target/X86/X86Subtarget.h                 |   4 +
 test/CodeGen/X86/avx2-phaddsub.ll             |  36 +-
 .../X86/avx512-intrinsics-fast-isel.ll        |  21 +-
 test/CodeGen/X86/haddsub-shuf.ll              | 891 ++++++++++++++----
 test/CodeGen/X86/haddsub-undef.ll             | 410 ++++++--
 test/CodeGen/X86/haddsub.ll                   | 385 ++++++--
 test/CodeGen/X86/madd.ll                      |  69 +-
 test/CodeGen/X86/phaddsub.ll                  | 616 ++++++++----
 test/CodeGen/X86/required-vector-width.ll     |   6 +-
 test/CodeGen/X86/sad.ll                       | 238 ++---
 test/CodeGen/X86/vector-reduce-add.ll         | 356 +++----
 test/CodeGen/X86/vector-reduce-fadd-fast.ll   | 493 +++++-----
 test/CodeGen/X86/vector-shuffle-combining.ll  |  39 +-
 15 files changed, 2378 insertions(+), 1220 deletions(-)

diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 5d627f34c55..d1263a1fb45 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -404,6 +404,15 @@ def FeatureFastBEXTR : SubtargetFeature<"fast-bextr", "HasFastBEXTR", "true",
           "Indicates that the BEXTR instruction is implemented as a single uop "
           "with good throughput.">;
 
+// Combine vector math operations with shuffles into horizontal math
+// instructions if a CPU implements horizontal operations (introduced with
+// SSE3) with better latency/throughput than the alternative sequence.
+def FeatureFastHorizontalOps
+    : SubtargetFeature<
+        "fast-hops", "HasFastHorizontalOps", "true",
+        "Prefer horizontal vector math instructions (haddp, phsub, etc.) over "
+        "normal vector instructions with shuffles", [FeatureSSE3]>;
+
 // Merge branches using three-way conditional code.
 def FeatureMergeToThreeWayBranch : SubtargetFeature<"merge-to-threeway-branch",
                                         "ThreewayBranchProfitable", "true",
@@ -998,7 +1007,8 @@ def : ProcessorModel<"btver2", BtVer2Model, [
   FeatureLAHFSAHF,
   FeatureFast15ByteNOP,
   FeatureFastBEXTR,
-  FeatureFastPartialYMMorZMMWrite
+  FeatureFastPartialYMMorZMMWrite,
+  FeatureFastHorizontalOps
 ]>;
 
 // Bulldozer
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 872d90ad004..97731dff9b2 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -37031,9 +37031,6 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
       // The  low half of the 128-bit result must choose from A.
       // The high half of the 128-bit result must choose from B,
       // unless B is undef. In that case, we are always choosing from A.
-      // TODO: Using a horizontal op on a single input is likely worse for
-      // performance on many CPUs, so this should be limited here or reversed
-      // in a later pass.
       unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
       unsigned Src = B.getNode() ? i >= NumEltsPer64BitChunk : 0;
 
@@ -37051,6 +37048,16 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
   return true;
 }
 
+/// Horizontal vector math instructions may be slower than normal math with
+/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
+/// implementation, and likely shuffle complexity of the alternate sequence.
+static bool shouldCombineToHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
+                                        const X86Subtarget &Subtarget) {
+  bool IsOptimizingSize = DAG.getMachineFunction().getFunction().optForSize();
+  bool HasFastHOps = Subtarget.hasFastHorizontalOps();
+  return !IsSingleSource || IsOptimizingSize || HasFastHOps;
+}
+
 /// Do target-specific dag combines on floating-point adds/subs.
 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget) {
@@ -37063,7 +37070,8 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
   // Try to synthesize horizontal add/sub from adds/subs of shuffles.
   if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
        (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
-      isHorizontalBinOp(LHS, RHS, IsFadd)) {
+      isHorizontalBinOp(LHS, RHS, IsFadd) &&
+      shouldCombineToHorizontalOp(LHS == RHS, DAG, Subtarget)) {
     auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
     return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
   }
@@ -39787,7 +39795,8 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
   // Try to synthesize horizontal adds from adds of shuffles.
   if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
        VT == MVT::v8i32) &&
-      Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, true)) {
+      Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, true) &&
+      shouldCombineToHorizontalOp(Op0 == Op1, DAG, Subtarget)) {
     auto HADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                           ArrayRef<SDValue> Ops) {
       return DAG.getNode(X86ISD::HADD, DL, Ops[0].getValueType(), Ops);
@@ -39918,7 +39927,8 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
   EVT VT = N->getValueType(0);
   if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
        VT == MVT::v8i32) &&
-      Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, false)) {
+      Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, false) &&
+      shouldCombineToHorizontalOp(Op0 == Op1, DAG, Subtarget)) {
     auto HSUBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                           ArrayRef<SDValue> Ops) {
       return DAG.getNode(X86ISD::HSUB, DL, Ops[0].getValueType(), Ops);
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index ddee9a692e1..0df3058c374 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -388,6 +388,9 @@ protected:
   /// Processor has a single uop BEXTR implementation.
   bool HasFastBEXTR = false;
 
+  /// Try harder to combine to horizontal vector ops if they are fast.
+  bool HasFastHorizontalOps = false;
+
   /// Use a retpoline thunk rather than indirect calls to block speculative
   /// execution.
   bool UseRetpolineIndirectCalls = false;
@@ -636,6 +639,7 @@ public:
   bool hasFastLZCNT() const { return HasFastLZCNT; }
   bool hasFastSHLDRotate() const { return HasFastSHLDRotate; }
   bool hasFastBEXTR() const { return HasFastBEXTR; }
+  bool hasFastHorizontalOps() const { return HasFastHorizontalOps; }
   bool hasMacroFusion() const { return HasMacroFusion; }
   bool hasERMSB() const { return HasERMSB; }
   bool hasSlowDivide32() const { return HasSlowDivide32; }
diff --git a/test/CodeGen/X86/avx2-phaddsub.ll b/test/CodeGen/X86/avx2-phaddsub.ll
index 67ea37575ab..99cdb100e3f 100644
--- a/test/CodeGen/X86/avx2-phaddsub.ll
+++ b/test/CodeGen/X86/avx2-phaddsub.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686--   -mattr=+avx2           | FileCheck %s --check-prefixes=X32,X32-SLOW
+; RUN: llc < %s -mtriple=i686--   -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=X32,X32-FAST
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2           | FileCheck %s --check-prefixes=X64,X64-SLOW
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=X64,X64-FAST
 
 define <16 x i16> @phaddw1(<16 x i16> %x, <16 x i16> %y) {
 ; X32-LABEL: phaddw1:
@@ -67,15 +69,29 @@ define <8 x i32> @phaddd2(<8 x i32> %x, <8 x i32> %y) {
 }
 
 define <8 x i32> @phaddd3(<8 x i32> %x) {
-; X32-LABEL: phaddd3:
-; X32:       # %bb.0:
-; X32-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
-; X32-NEXT:    retl
+; X32-SLOW-LABEL: phaddd3:
+; X32-SLOW:       # %bb.0:
+; X32-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
+; X32-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
+; X32-SLOW-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
+; X32-SLOW-NEXT:    retl
 ;
-; X64-LABEL: phaddd3:
-; X64:       # %bb.0:
-; X64-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
-; X64-NEXT:    retq
+; X32-FAST-LABEL: phaddd3:
+; X32-FAST:       # %bb.0:
+; X32-FAST-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; X32-FAST-NEXT:    retl
+;
+; X64-SLOW-LABEL: phaddd3:
+; X64-SLOW:       # %bb.0:
+; X64-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
+; X64-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
+; X64-SLOW-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
+; X64-SLOW-NEXT:    retq
+;
+; X64-FAST-LABEL: phaddd3:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; X64-FAST-NEXT:    retq
   %a = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
   %b = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
   %r = add <8 x i32> %a, %b
diff --git a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
index 20c509732c8..fa37d2148f2 100644
--- a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -6860,7 +6860,8 @@ define i32 @test_mm512_reduce_add_epi32(<8 x i64> %__W) {
 ; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT:    vmovd %xmm0, %eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    ret{{[l|q]}}
@@ -6989,7 +6990,8 @@ define i32 @test_mm512_mask_reduce_add_epi32(i16 zeroext %__M, <8 x i64> %__W) {
 ; X86-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X86-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; X86-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; X86-NEXT:    vmovd %xmm0, %eax
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -7004,7 +7006,8 @@ define i32 @test_mm512_mask_reduce_add_epi32(i16 zeroext %__M, <8 x i64> %__W) {
 ; X64-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X64-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; X64-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; X64-NEXT:    vmovd %xmm0, %eax
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
@@ -7210,7 +7213,8 @@ define double @test_mm512_reduce_add_pd(<8 x double> %__W) {
 ; X86-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X86-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
-; X86-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
+; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; X86-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vmovlpd %xmm0, (%esp)
 ; X86-NEXT:    fldl (%esp)
 ; X86-NEXT:    movl %ebp, %esp
@@ -7225,7 +7229,8 @@ define double @test_mm512_reduce_add_pd(<8 x double> %__W) {
 ; X64-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
-; X64-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
+; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; X64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
 entry:
@@ -7405,7 +7410,8 @@ define double @test_mm512_mask_reduce_add_pd(i8 zeroext %__M, <8 x double> %__W)
 ; X86-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X86-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
-; X86-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
+; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; X86-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vmovlpd %xmm0, (%esp)
 ; X86-NEXT:    fldl (%esp)
 ; X86-NEXT:    movl %ebp, %esp
@@ -7422,7 +7428,8 @@ define double @test_mm512_mask_reduce_add_pd(i8 zeroext %__M, <8 x double> %__W)
 ; X64-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
-; X64-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
+; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; X64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
 entry:
diff --git a/test/CodeGen/X86/haddsub-shuf.ll b/test/CodeGen/X86/haddsub-shuf.ll
index ac5d5a70e30..0ece3fe1414 100644
--- a/test/CodeGen/X86/haddsub-shuf.ll
+++ b/test/CodeGen/X86/haddsub-shuf.ll
@@ -1,21 +1,54 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSSE3
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx  | FileCheck %s --check-prefixes=AVX,AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3           | FileCheck %s --check-prefixes=SSSE3,SSSE3_SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3,SSSE3_FAST
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx             | FileCheck %s --check-prefixes=AVX,AVX1,AVX1_SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops   | FileCheck %s --check-prefixes=AVX,AVX1,AVX1_FAST
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2            | FileCheck %s --check-prefixes=AVX,AVX2,AVX2_SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops  | FileCheck %s --check-prefixes=AVX,AVX2,AVX2_FAST
 
 ; The next 8 tests check for matching the horizontal op and eliminating the shuffle.
 ; PR34111 - https://bugs.llvm.org/show_bug.cgi?id=34111
 
 define <4 x float> @hadd_v4f32(<4 x float> %a) {
-; SSSE3-LABEL: hadd_v4f32:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    haddps %xmm0, %xmm0
-; SSSE3-NEXT:    retq
-;
-; AVX-LABEL: hadd_v4f32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSSE3_SLOW-LABEL: hadd_v4f32:
+; SSSE3_SLOW:       # %bb.0:
+; SSSE3_SLOW-NEXT:    movaps %xmm0, %xmm1
+; SSSE3_SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
+; SSSE3_SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3_SLOW-NEXT:    addps %xmm1, %xmm0
+; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3_SLOW-NEXT:    retq
+;
+; SSSE3_FAST-LABEL: hadd_v4f32:
+; SSSE3_FAST:       # %bb.0:
+; SSSE3_FAST-NEXT:    haddps %xmm0, %xmm0
+; SSSE3_FAST-NEXT:    retq
+;
+; AVX1_SLOW-LABEL: hadd_v4f32:
+; AVX1_SLOW:       # %bb.0:
+; AVX1_SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX1_SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX1_SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1_SLOW-NEXT:    retq
+;
+; AVX1_FAST-LABEL: hadd_v4f32:
+; AVX1_FAST:       # %bb.0:
+; AVX1_FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX1_FAST-NEXT:    retq
+;
+; AVX2_SLOW-LABEL: hadd_v4f32:
+; AVX2_SLOW:       # %bb.0:
+; AVX2_SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX2_SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX2_SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX2_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX2_SLOW-NEXT:    retq
+;
+; AVX2_FAST-LABEL: hadd_v4f32:
+; AVX2_FAST:       # %bb.0:
+; AVX2_FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX2_FAST-NEXT:    retq
   %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 2>
   %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 1, i32 3>
   %hop = fadd <2 x float> %a02, %a13
@@ -54,16 +87,51 @@ define <8 x float> @hadd_v8f32a(<8 x float> %a) {
 }
 
 define <8 x float> @hadd_v8f32b(<8 x float> %a) {
-; SSSE3-LABEL: hadd_v8f32b:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    haddps %xmm0, %xmm0
-; SSSE3-NEXT:    haddps %xmm1, %xmm1
-; SSSE3-NEXT:    retq
-;
-; AVX-LABEL: hadd_v8f32b:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
-; AVX-NEXT:    retq
+; SSSE3_SLOW-LABEL: hadd_v8f32b:
+; SSSE3_SLOW:       # %bb.0:
+; SSSE3_SLOW-NEXT:    movaps %xmm0, %xmm2
+; SSSE3_SLOW-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[2,3]
+; SSSE3_SLOW-NEXT:    movaps %xmm1, %xmm3
+; SSSE3_SLOW-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[2,3]
+; SSSE3_SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3_SLOW-NEXT:    addps %xmm2, %xmm0
+; SSSE3_SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSSE3_SLOW-NEXT:    addps %xmm3, %xmm1
+; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm1 = xmm1[0,0]
+; SSSE3_SLOW-NEXT:    retq
+;
+; SSSE3_FAST-LABEL: hadd_v8f32b:
+; SSSE3_FAST:       # %bb.0:
+; SSSE3_FAST-NEXT:    haddps %xmm0, %xmm0
+; SSSE3_FAST-NEXT:    haddps %xmm1, %xmm1
+; SSSE3_FAST-NEXT:    retq
+;
+; AVX1_SLOW-LABEL: hadd_v8f32b:
+; AVX1_SLOW:       # %bb.0:
+; AVX1_SLOW-NEXT:    vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
+; AVX1_SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
+; AVX1_SLOW-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1_SLOW-NEXT:    retq
+;
+; AVX1_FAST-LABEL: hadd_v8f32b:
+; AVX1_FAST:       # %bb.0:
+; AVX1_FAST-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
+; AVX1_FAST-NEXT:    retq
+;
+; AVX2_SLOW-LABEL: hadd_v8f32b:
+; AVX2_SLOW:       # %bb.0:
+; AVX2_SLOW-NEXT:    vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2_SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
+; AVX2_SLOW-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; AVX2_SLOW-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX2_SLOW-NEXT:    retq
+;
+; AVX2_FAST-LABEL: hadd_v8f32b:
+; AVX2_FAST:       # %bb.0:
+; AVX2_FAST-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
+; AVX2_FAST-NEXT:    retq
   %a0 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
   %a1 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
   %hop = fadd <8 x float> %a0, %a1
@@ -72,15 +140,45 @@ define <8 x float> @hadd_v8f32b(<8 x float> %a) {
 }
 
 define <4 x float> @hsub_v4f32(<4 x float> %a) {
-; SSSE3-LABEL: hsub_v4f32:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    hsubps %xmm0, %xmm0
-; SSSE3-NEXT:    retq
-;
-; AVX-LABEL: hsub_v4f32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSSE3_SLOW-LABEL: hsub_v4f32:
+; SSSE3_SLOW:       # %bb.0:
+; SSSE3_SLOW-NEXT:    movaps %xmm0, %xmm1
+; SSSE3_SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
+; SSSE3_SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3_SLOW-NEXT:    subps %xmm0, %xmm1
+; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
+; SSSE3_SLOW-NEXT:    retq
+;
+; SSSE3_FAST-LABEL: hsub_v4f32:
+; SSSE3_FAST:       # %bb.0:
+; SSSE3_FAST-NEXT:    hsubps %xmm0, %xmm0
+; SSSE3_FAST-NEXT:    retq
+;
+; AVX1_SLOW-LABEL: hsub_v4f32:
+; AVX1_SLOW:       # %bb.0:
+; AVX1_SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX1_SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX1_SLOW-NEXT:    vsubps %xmm0, %xmm1, %xmm0
+; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1_SLOW-NEXT:    retq
+;
+; AVX1_FAST-LABEL: hsub_v4f32:
+; AVX1_FAST:       # %bb.0:
+; AVX1_FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
+; AVX1_FAST-NEXT:    retq
+;
+; AVX2_SLOW-LABEL: hsub_v4f32:
+; AVX2_SLOW:       # %bb.0:
+; AVX2_SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX2_SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX2_SLOW-NEXT:    vsubps %xmm0, %xmm1, %xmm0
+; AVX2_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX2_SLOW-NEXT:    retq
+;
+; AVX2_FAST-LABEL: hsub_v4f32:
+; AVX2_FAST:       # %bb.0:
+; AVX2_FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
+; AVX2_FAST-NEXT:    retq
   %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 2>
   %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 1, i32 3>
   %hop = fsub <2 x float> %a02, %a13
@@ -119,16 +217,51 @@ define <8 x float> @hsub_v8f32a(<8 x float> %a) {
 }
 
 define <8 x float> @hsub_v8f32b(<8 x float> %a) {
-; SSSE3-LABEL: hsub_v8f32b:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    hsubps %xmm0, %xmm0
-; SSSE3-NEXT:    hsubps %xmm1, %xmm1
-; SSSE3-NEXT:    retq
-;
-; AVX-LABEL: hsub_v8f32b:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhsubps %ymm0, %ymm0, %ymm0
-; AVX-NEXT:    retq
+; SSSE3_SLOW-LABEL: hsub_v8f32b:
+; SSSE3_SLOW:       # %bb.0:
+; SSSE3_SLOW-NEXT:    movaps %xmm0, %xmm2
+; SSSE3_SLOW-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[2,3]
+; SSSE3_SLOW-NEXT:    movaps %xmm1, %xmm3
+; SSSE3_SLOW-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[2,3]
+; SSSE3_SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3_SLOW-NEXT:    subps %xmm0, %xmm2
+; SSSE3_SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSSE3_SLOW-NEXT:    subps %xmm1, %xmm3
+; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm2[0,0]
+; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm1 = xmm3[0,0]
+; SSSE3_SLOW-NEXT:    retq
+;
+; SSSE3_FAST-LABEL: hsub_v8f32b:
+; SSSE3_FAST:       # %bb.0:
+; SSSE3_FAST-NEXT:    hsubps %xmm0, %xmm0
+; SSSE3_FAST-NEXT:    hsubps %xmm1, %xmm1
+; SSSE3_FAST-NEXT:    retq
+;
+; AVX1_SLOW-LABEL: hsub_v8f32b:
+; AVX1_SLOW:       # %bb.0:
+; AVX1_SLOW-NEXT:    vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
+; AVX1_SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
+; AVX1_SLOW-NEXT:    vsubps %ymm0, %ymm1, %ymm0
+; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1_SLOW-NEXT:    retq
+;
+; AVX1_FAST-LABEL: hsub_v8f32b:
+; AVX1_FAST:       # %bb.0:
+; AVX1_FAST-NEXT:    vhsubps %ymm0, %ymm0, %ymm0
+; AVX1_FAST-NEXT:    retq
+;
+; AVX2_SLOW-LABEL: hsub_v8f32b:
+; AVX2_SLOW:       # %bb.0:
+; AVX2_SLOW-NEXT:    vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2_SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
+; AVX2_SLOW-NEXT:    vsubps %ymm0, %ymm1, %ymm0
+; AVX2_SLOW-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX2_SLOW-NEXT:    retq
+;
+; AVX2_FAST-LABEL: hsub_v8f32b:
+; AVX2_FAST:       # %bb.0:
+; AVX2_FAST-NEXT:    vhsubps %ymm0, %ymm0, %ymm0
+; AVX2_FAST-NEXT:    retq
   %a0 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
   %a1 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
   %hop = fsub <8 x float> %a0, %a1
@@ -137,15 +270,42 @@ define <8 x float> @hsub_v8f32b(<8 x float> %a) {
 }
 
 define <2 x double> @hadd_v2f64(<2 x double> %a) {
-; SSSE3-LABEL: hadd_v2f64:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    haddpd %xmm0, %xmm0
-; SSSE3-NEXT:    retq
-;
-; AVX-LABEL: hadd_v2f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSSE3_SLOW-LABEL: hadd_v2f64:
+; SSSE3_SLOW:       # %bb.0:
+; SSSE3_SLOW-NEXT:    movapd %xmm0, %xmm1
+; SSSE3_SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSSE3_SLOW-NEXT:    addpd %xmm0, %xmm1
+; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
+; SSSE3_SLOW-NEXT:    retq
+;
+; SSSE3_FAST-LABEL: hadd_v2f64:
+; SSSE3_FAST:       # %bb.0:
+; SSSE3_FAST-NEXT:    haddpd %xmm0, %xmm0
+; SSSE3_FAST-NEXT:    retq
+;
+; AVX1_SLOW-LABEL: hadd_v2f64:
+; AVX1_SLOW:       # %bb.0:
+; AVX1_SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX1_SLOW-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1_SLOW-NEXT:    retq
+;
+; AVX1_FAST-LABEL: hadd_v2f64:
+; AVX1_FAST:       # %bb.0:
+; AVX1_FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
+; AVX1_FAST-NEXT:    retq
+;
+; AVX2_SLOW-LABEL: hadd_v2f64:
+; AVX2_SLOW:       # %bb.0:
+; AVX2_SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX2_SLOW-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; AVX2_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX2_SLOW-NEXT:    retq
+;
+; AVX2_FAST-LABEL: hadd_v2f64:
+; AVX2_FAST:       # %bb.0:
+; AVX2_FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
+; AVX2_FAST-NEXT:    retq
   %a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
   %a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
   %hop = fadd <2 x double> %a0, %a1
@@ -154,16 +314,47 @@ define <2 x double> @hadd_v2f64(<2 x double> %a) {
 }
 
 define <4 x double> @hadd_v4f64(<4 x double> %a) {
-; SSSE3-LABEL: hadd_v4f64:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    haddpd %xmm0, %xmm0
-; SSSE3-NEXT:    haddpd %xmm1, %xmm1
-; SSSE3-NEXT:    retq
-;
-; AVX-LABEL: hadd_v4f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
-; AVX-NEXT:    retq
+; SSSE3_SLOW-LABEL: hadd_v4f64:
+; SSSE3_SLOW:       # %bb.0:
+; SSSE3_SLOW-NEXT:    movapd %xmm0, %xmm2
+; SSSE3_SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
+; SSSE3_SLOW-NEXT:    movapd %xmm1, %xmm3
+; SSSE3_SLOW-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
+; SSSE3_SLOW-NEXT:    addpd %xmm1, %xmm3
+; SSSE3_SLOW-NEXT:    addpd %xmm0, %xmm2
+; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm2[0,0]
+; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm1 = xmm3[0,0]
+; SSSE3_SLOW-NEXT:    retq
+;
+; SSSE3_FAST-LABEL: hadd_v4f64:
+; SSSE3_FAST:       # %bb.0:
+; SSSE3_FAST-NEXT:    haddpd %xmm0, %xmm0
+; SSSE3_FAST-NEXT:    haddpd %xmm1, %xmm1
+; SSSE3_FAST-NEXT:    retq
+;
+; AVX1_SLOW-LABEL: hadd_v4f64:
+; AVX1_SLOW:       # %bb.0:
+; AVX1_SLOW-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
+; AVX1_SLOW-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
+; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1_SLOW-NEXT:    retq
+;
+; AVX1_FAST-LABEL: hadd_v4f64:
+; AVX1_FAST:       # %bb.0:
+; AVX1_FAST-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
+; AVX1_FAST-NEXT:    retq
+;
+; AVX2_SLOW-LABEL: hadd_v4f64:
+; AVX2_SLOW:       # %bb.0:
+; AVX2_SLOW-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
+; AVX2_SLOW-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
+; AVX2_SLOW-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX2_SLOW-NEXT:    retq
+;
+; AVX2_FAST-LABEL: hadd_v4f64:
+; AVX2_FAST:       # %bb.0:
+; AVX2_FAST-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
+; AVX2_FAST-NEXT:    retq
   %a0 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
   %a1 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
   %hop = fadd <4 x double> %a0, %a1
@@ -172,15 +363,42 @@ define <4 x double> @hadd_v4f64(<4 x double> %a) {
 }
 
 define <2 x double> @hsub_v2f64(<2 x double> %a) {
-; SSSE3-LABEL: hsub_v2f64:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    hsubpd %xmm0, %xmm0
-; SSSE3-NEXT:    retq
-;
-; AVX-LABEL: hsub_v2f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSSE3_SLOW-LABEL: hsub_v2f64:
+; SSSE3_SLOW:       # %bb.0:
+; SSSE3_SLOW-NEXT:    movapd %xmm0, %xmm1
+; SSSE3_SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSSE3_SLOW-NEXT:    subpd %xmm1, %xmm0
+; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3_SLOW-NEXT:    retq
+;
+; SSSE3_FAST-LABEL: hsub_v2f64:
+; SSSE3_FAST:       # %bb.0:
+; SSSE3_FAST-NEXT:    hsubpd %xmm0, %xmm0
+; SSSE3_FAST-NEXT:    retq
+;
+; AVX1_SLOW-LABEL: hsub_v2f64:
+; AVX1_SLOW:       # %bb.0:
+; AVX1_SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX1_SLOW-NEXT:    vsubpd %xmm1, %xmm0, %xmm0
+; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1_SLOW-NEXT:    retq
+;
+; AVX1_FAST-LABEL: hsub_v2f64:
+; AVX1_FAST:       # %bb.0:
+; AVX1_FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
+; AVX1_FAST-NEXT:    retq
+;
+; AVX2_SLOW-LABEL: hsub_v2f64:
+; AVX2_SLOW:       # %bb.0:
+; AVX2_SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX2_SLOW-NEXT:    vsubpd %xmm1, %xmm0, %xmm0
+; AVX2_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX2_SLOW-NEXT:    retq
+;
+; AVX2_FAST-LABEL: hsub_v2f64:
+; AVX2_FAST:       # %bb.0:
+; AVX2_FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
+; AVX2_FAST-NEXT:    retq
   %a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
   %a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
   %hop = fsub <2 x double> %a0, %a1
@@ -189,16 +407,47 @@ define <2 x double> @hsub_v2f64(<2 x double> %a) {
 }
 
 define <4 x double> @hsub_v4f64(<4 x double> %a) {
-; SSSE3-LABEL: hsub_v4f64:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    hsubpd %xmm0, %xmm0
-; SSSE3-NEXT:    hsubpd %xmm1, %xmm1
-; SSSE3-NEXT:    retq
-;
-; AVX-LABEL: hsub_v4f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhsubpd %ymm0, %ymm0, %ymm0
-; AVX-NEXT:    retq
+; SSSE3_SLOW-LABEL: hsub_v4f64:
+; SSSE3_SLOW:       # %bb.0:
+; SSSE3_SLOW-NEXT:    movapd %xmm0, %xmm2
+; SSSE3_SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
+; SSSE3_SLOW-NEXT:    movapd %xmm1, %xmm3
+; SSSE3_SLOW-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
+; SSSE3_SLOW-NEXT:    subpd %xmm3, %xmm1
+; SSSE3_SLOW-NEXT:    subpd %xmm2, %xmm0
+; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm1 = xmm1[0,0]
+; SSSE3_SLOW-NEXT:    retq
+;
+; SSSE3_FAST-LABEL: hsub_v4f64:
+; SSSE3_FAST:       # %bb.0:
+; SSSE3_FAST-NEXT:    hsubpd %xmm0, %xmm0
+; SSSE3_FAST-NEXT:    hsubpd %xmm1, %xmm1
+; SSSE3_FAST-NEXT:    retq
+;
+; AVX1_SLOW-LABEL: hsub_v4f64:
+; AVX1_SLOW:       # %bb.0:
+; AVX1_SLOW-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
+; AVX1_SLOW-NEXT:    vsubpd %ymm1, %ymm0, %ymm0
+; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1_SLOW-NEXT:    retq
+;
+; AVX1_FAST-LABEL: hsub_v4f64:
+; AVX1_FAST:       # %bb.0:
+; AVX1_FAST-NEXT:    vhsubpd %ymm0, %ymm0, %ymm0
+; AVX1_FAST-NEXT:    retq
+;
+; AVX2_SLOW-LABEL: hsub_v4f64:
+; AVX2_SLOW:       # %bb.0:
+; AVX2_SLOW-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
+; AVX2_SLOW-NEXT:    vsubpd %ymm1, %ymm0, %ymm0
+; AVX2_SLOW-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX2_SLOW-NEXT:    retq
+;
+; AVX2_FAST-LABEL: hsub_v4f64:
+; AVX2_FAST:       # %bb.0:
+; AVX2_FAST-NEXT:    vhsubpd %ymm0, %ymm0, %ymm0
+; AVX2_FAST-NEXT:    retq
   %a0 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
   %a1 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
   %hop = fsub <4 x double> %a0, %a1
@@ -207,15 +456,44 @@ define <4 x double> @hsub_v4f64(<4 x double> %a) {
 }
 
 define <4 x i32> @hadd_v4i32(<4 x i32> %a) {
-; SSSE3-LABEL: hadd_v4i32:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    phaddd %xmm0, %xmm0
-; SSSE3-NEXT:    retq
-;
-; AVX-LABEL: hadd_v4i32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSSE3_SLOW-LABEL: hadd_v4i32:
+; SSSE3_SLOW:       # %bb.0:
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3_SLOW-NEXT:    paddd %xmm1, %xmm0
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSSE3_SLOW-NEXT:    retq
+;
+; SSSE3_FAST-LABEL: hadd_v4i32:
+; SSSE3_FAST:       # %bb.0:
+; SSSE3_FAST-NEXT:    phaddd %xmm0, %xmm0
+; SSSE3_FAST-NEXT:    retq
+;
+; AVX1_SLOW-LABEL: hadd_v4i32:
+; AVX1_SLOW:       # %bb.0:
+; AVX1_SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX1_SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX1_SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX1_SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1_SLOW-NEXT:    retq
+;
+; AVX1_FAST-LABEL: hadd_v4i32:
+; AVX1_FAST:       # %bb.0:
+; AVX1_FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1_FAST-NEXT:    retq
+;
+; AVX2_SLOW-LABEL: hadd_v4i32:
+; AVX2_SLOW:       # %bb.0:
+; AVX2_SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX2_SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX2_SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX2_SLOW-NEXT:    vpbroadcastq %xmm0, %xmm0
+; AVX2_SLOW-NEXT:    retq
+;
+; AVX2_FAST-LABEL: hadd_v4i32:
+; AVX2_FAST:       # %bb.0:
+; AVX2_FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX2_FAST-NEXT:    retq
   %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
   %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
   %hop = add <4 x i32> %a02, %a13
@@ -254,25 +532,57 @@ define <8 x i32> @hadd_v8i32a(<8 x i32> %a) {
 }
 
 define <8 x i32> @hadd_v8i32b(<8 x i32> %a) {
-; SSSE3-LABEL: hadd_v8i32b:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    phaddd %xmm0, %xmm0
-; SSSE3-NEXT:    phaddd %xmm1, %xmm1
-; SSSE3-NEXT:    retq
-;
-; AVX1-LABEL: hadd_v8i32b:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: hadd_v8i32b:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
-; AVX2-NEXT:    retq
+; SSSE3_SLOW-LABEL: hadd_v8i32b:
+; SSSE3_SLOW:       # %bb.0:
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3_SLOW-NEXT:    paddd %xmm2, %xmm0
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSSE3_SLOW-NEXT:    paddd %xmm3, %xmm1
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; SSSE3_SLOW-NEXT:    retq
+;
+; SSSE3_FAST-LABEL: hadd_v8i32b:
+; SSSE3_FAST:       # %bb.0:
+; SSSE3_FAST-NEXT:    phaddd %xmm0, %xmm0
+; SSSE3_FAST-NEXT:    phaddd %xmm1, %xmm1
+; SSSE3_FAST-NEXT:    retq
+;
+; AVX1_SLOW-LABEL: hadd_v8i32b:
+; AVX1_SLOW:       # %bb.0:
+; AVX1_SLOW-NEXT:    vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
+; AVX1_SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
+; AVX1_SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1_SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1_SLOW-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
+; AVX1_SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX1_SLOW-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1_SLOW-NEXT:    retq
+;
+; AVX1_FAST-LABEL: hadd_v8i32b:
+; AVX1_FAST:       # %bb.0:
+; AVX1_FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm1
+; AVX1_FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1_FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1_FAST-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1_FAST-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1_FAST-NEXT:    retq
+;
+; AVX2_SLOW-LABEL: hadd_v8i32b:
+; AVX2_SLOW:       # %bb.0:
+; AVX2_SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2_SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
+; AVX2_SLOW-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
+; AVX2_SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; AVX2_SLOW-NEXT:    retq
+;
+; AVX2_FAST-LABEL: hadd_v8i32b:
+; AVX2_FAST:       # %bb.0:
+; AVX2_FAST-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX2_FAST-NEXT:    retq
   %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
   %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
   %hop = add <8 x i32> %a0, %a1
@@ -281,15 +591,44 @@ define <8 x i32> @hadd_v8i32b(<8 x i32> %a) {
 }
 
 define <4 x i32> @hsub_v4i32(<4 x i32> %a) {
-; SSSE3-LABEL: hsub_v4i32:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    phsubd %xmm0, %xmm0
-; SSSE3-NEXT:    retq
-;
-; AVX-LABEL: hsub_v4i32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSSE3_SLOW-LABEL: hsub_v4i32:
+; SSSE3_SLOW:       # %bb.0:
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3_SLOW-NEXT:    psubd %xmm0, %xmm1
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
+; SSSE3_SLOW-NEXT:    retq
+;
+; SSSE3_FAST-LABEL: hsub_v4i32:
+; SSSE3_FAST:       # %bb.0:
+; SSSE3_FAST-NEXT:    phsubd %xmm0, %xmm0
+; SSSE3_FAST-NEXT:    retq
+;
+; AVX1_SLOW-LABEL: hsub_v4i32:
+; AVX1_SLOW:       # %bb.0:
+; AVX1_SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX1_SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX1_SLOW-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
+; AVX1_SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1_SLOW-NEXT:    retq
+;
+; AVX1_FAST-LABEL: hsub_v4i32:
+; AVX1_FAST:       # %bb.0:
+; AVX1_FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
+; AVX1_FAST-NEXT:    retq
+;
+; AVX2_SLOW-LABEL: hsub_v4i32:
+; AVX2_SLOW:       # %bb.0:
+; AVX2_SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX2_SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX2_SLOW-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
+; AVX2_SLOW-NEXT:    vpbroadcastq %xmm0, %xmm0
+; AVX2_SLOW-NEXT:    retq
+;
+; AVX2_FAST-LABEL: hsub_v4i32:
+; AVX2_FAST:       # %bb.0:
+; AVX2_FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
+; AVX2_FAST-NEXT:    retq
   %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
   %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
   %hop = sub <4 x i32> %a02, %a13
@@ -328,25 +667,57 @@ define <8 x i32> @hsub_v8i32a(<8 x i32> %a) {
 }
 
 define <8 x i32> @hsub_v8i32b(<8 x i32> %a) {
-; SSSE3-LABEL: hsub_v8i32b:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    phsubd %xmm0, %xmm0
-; SSSE3-NEXT:    phsubd %xmm1, %xmm1
-; SSSE3-NEXT:    retq
-;
-; AVX1-LABEL: hsub_v8i32b:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vphsubd %xmm0, %xmm0, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: hsub_v8i32b:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vphsubd %ymm0, %ymm0, %ymm0
-; AVX2-NEXT:    retq
+; SSSE3_SLOW-LABEL: hsub_v8i32b:
+; SSSE3_SLOW:       # %bb.0:
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3_SLOW-NEXT:    psubd %xmm0, %xmm2
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
+; SSSE3_SLOW-NEXT:    psubd %xmm0, %xmm3
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,1,0,1]
+; SSSE3_SLOW-NEXT:    retq
+;
+; SSSE3_FAST-LABEL: hsub_v8i32b:
+; SSSE3_FAST:       # %bb.0:
+; SSSE3_FAST-NEXT:    phsubd %xmm0, %xmm0
+; SSSE3_FAST-NEXT:    phsubd %xmm1, %xmm1
+; SSSE3_FAST-NEXT:    retq
+;
+; AVX1_SLOW-LABEL: hsub_v8i32b:
+; AVX1_SLOW:       # %bb.0:
+; AVX1_SLOW-NEXT:    vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
+; AVX1_SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
+; AVX1_SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1_SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1_SLOW-NEXT:    vpsubd %xmm2, %xmm3, %xmm2
+; AVX1_SLOW-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
+; AVX1_SLOW-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1_SLOW-NEXT:    retq
+;
+; AVX1_FAST-LABEL: hsub_v8i32b:
+; AVX1_FAST:       # %bb.0:
+; AVX1_FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm1
+; AVX1_FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1_FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
+; AVX1_FAST-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1_FAST-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1_FAST-NEXT:    retq
+;
+; AVX2_SLOW-LABEL: hsub_v8i32b:
+; AVX2_SLOW:       # %bb.0:
+; AVX2_SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2_SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
+; AVX2_SLOW-NEXT:    vpsubd %ymm0, %ymm1, %ymm0
+; AVX2_SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; AVX2_SLOW-NEXT:    retq
+;
+; AVX2_FAST-LABEL: hsub_v8i32b:
+; AVX2_FAST:       # %bb.0:
+; AVX2_FAST-NEXT:    vphsubd %ymm0, %ymm0, %ymm0
+; AVX2_FAST-NEXT:    retq
   %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
   %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
   %hop = sub <8 x i32> %a0, %a1
@@ -355,15 +726,45 @@ define <8 x i32> @hsub_v8i32b(<8 x i32> %a) {
 }
 
 define <8 x i16> @hadd_v8i16(<8 x i16> %a) {
-; SSSE3-LABEL: hadd_v8i16:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    phaddw %xmm0, %xmm0
-; SSSE3-NEXT:    retq
-;
-; AVX-LABEL: hadd_v8i16:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSSE3_SLOW-LABEL: hadd_v8i16:
+; SSSE3_SLOW:       # %bb.0:
+; SSSE3_SLOW-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3_SLOW-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3_SLOW-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; SSSE3_SLOW-NEXT:    paddw %xmm1, %xmm0
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSSE3_SLOW-NEXT:    retq
+;
+; SSSE3_FAST-LABEL: hadd_v8i16:
+; SSSE3_FAST:       # %bb.0:
+; SSSE3_FAST-NEXT:    phaddw %xmm0, %xmm0
+; SSSE3_FAST-NEXT:    retq
+;
+; AVX1_SLOW-LABEL: hadd_v8i16:
+; AVX1_SLOW:       # %bb.0:
+; AVX1_SLOW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1_SLOW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; AVX1_SLOW-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
+; AVX1_SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1_SLOW-NEXT:    retq
+;
+; AVX1_FAST-LABEL: hadd_v8i16:
+; AVX1_FAST:       # %bb.0:
+; AVX1_FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
+; AVX1_FAST-NEXT:    retq
+;
+; AVX2_SLOW-LABEL: hadd_v8i16:
+; AVX2_SLOW:       # %bb.0:
+; AVX2_SLOW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX2_SLOW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; AVX2_SLOW-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
+; AVX2_SLOW-NEXT:    vpbroadcastq %xmm0, %xmm0
+; AVX2_SLOW-NEXT:    retq
+;
+; AVX2_FAST-LABEL: hadd_v8i16:
+; AVX2_FAST:       # %bb.0:
+; AVX2_FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
+; AVX2_FAST-NEXT:    retq
   %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
   %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
   %hop = add <8 x i16> %a0246, %a1357
@@ -402,25 +803,64 @@ define <16 x i16> @hadd_v16i16a(<16 x i16> %a) {
 }
 
 define <16 x i16> @hadd_v16i16b(<16 x i16> %a) {
-; SSSE3-LABEL: hadd_v16i16b:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    phaddw %xmm0, %xmm0
-; SSSE3-NEXT:    phaddw %xmm1, %xmm1
-; SSSE3-NEXT:    retq
-;
-; AVX1-LABEL: hadd_v16i16b:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vphaddw %xmm0, %xmm0, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: hadd_v16i16b:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vphaddw %ymm0, %ymm0, %ymm0
-; AVX2-NEXT:    retq
+; SSSE3_SLOW-LABEL: hadd_v16i16b:
+; SSSE3_SLOW:       # %bb.0:
+; SSSE3_SLOW-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3_SLOW-NEXT:    movdqa %xmm0, %xmm3
+; SSSE3_SLOW-NEXT:    pshufb %xmm2, %xmm3
+; SSSE3_SLOW-NEXT:    movdqa %xmm1, %xmm4
+; SSSE3_SLOW-NEXT:    pshufb %xmm2, %xmm4
+; SSSE3_SLOW-NEXT:    movdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; SSSE3_SLOW-NEXT:    pshufb %xmm2, %xmm0
+; SSSE3_SLOW-NEXT:    paddw %xmm3, %xmm0
+; SSSE3_SLOW-NEXT:    pshufb %xmm2, %xmm1
+; SSSE3_SLOW-NEXT:    paddw %xmm4, %xmm1
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; SSSE3_SLOW-NEXT:    retq
+;
+; SSSE3_FAST-LABEL: hadd_v16i16b:
+; SSSE3_FAST:       # %bb.0:
+; SSSE3_FAST-NEXT:    phaddw %xmm0, %xmm0
+; SSSE3_FAST-NEXT:    phaddw %xmm1, %xmm1
+; SSSE3_FAST-NEXT:    retq
+;
+; AVX1_SLOW-LABEL: hadd_v16i16b:
+; AVX1_SLOW:       # %bb.0:
+; AVX1_SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1_SLOW-NEXT:    vpshufb %xmm1, %xmm0, %xmm2
+; AVX1_SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1_SLOW-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
+; AVX1_SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; AVX1_SLOW-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
+; AVX1_SLOW-NEXT:    vpaddw %xmm0, %xmm2, %xmm0
+; AVX1_SLOW-NEXT:    vpshufb %xmm4, %xmm3, %xmm2
+; AVX1_SLOW-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
+; AVX1_SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1_SLOW-NEXT:    retq
+;
+; AVX1_FAST-LABEL: hadd_v16i16b:
+; AVX1_FAST:       # %bb.0:
+; AVX1_FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm1
+; AVX1_FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1_FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
+; AVX1_FAST-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1_FAST-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1_FAST-NEXT:    retq
+;
+; AVX2_SLOW-LABEL: hadd_v16i16b:
+; AVX2_SLOW:       # %bb.0:
+; AVX2_SLOW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2_SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31]
+; AVX2_SLOW-NEXT:    vpaddw %ymm0, %ymm1, %ymm0
+; AVX2_SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; AVX2_SLOW-NEXT:    retq
+;
+; AVX2_FAST-LABEL: hadd_v16i16b:
+; AVX2_FAST:       # %bb.0:
+; AVX2_FAST-NEXT:    vphaddw %ymm0, %ymm0, %ymm0
+; AVX2_FAST-NEXT:    retq
   %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef>
   %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 9, i32 11, i32 13, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
   %hop = add <16 x i16> %a0, %a1
@@ -429,15 +869,45 @@ define <16 x i16> @hadd_v16i16b(<16 x i16> %a) {
 }
 
 define <8 x i16> @hsub_v8i16(<8 x i16> %a) {
-; SSSE3-LABEL: hsub_v8i16:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    phsubw %xmm0, %xmm0
-; SSSE3-NEXT:    retq
-;
-; AVX-LABEL: hsub_v8i16:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vphsubw %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSSE3_SLOW-LABEL: hsub_v8i16:
+; SSSE3_SLOW:       # %bb.0:
+; SSSE3_SLOW-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3_SLOW-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3_SLOW-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; SSSE3_SLOW-NEXT:    psubw %xmm0, %xmm1
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
+; SSSE3_SLOW-NEXT:    retq
+;
+; SSSE3_FAST-LABEL: hsub_v8i16:
+; SSSE3_FAST:       # %bb.0:
+; SSSE3_FAST-NEXT:    phsubw %xmm0, %xmm0
+; SSSE3_FAST-NEXT:    retq
+;
+; AVX1_SLOW-LABEL: hsub_v8i16:
+; AVX1_SLOW:       # %bb.0:
+; AVX1_SLOW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1_SLOW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; AVX1_SLOW-NEXT:    vpsubw %xmm0, %xmm1, %xmm0
+; AVX1_SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1_SLOW-NEXT:    retq
+;
+; AVX1_FAST-LABEL: hsub_v8i16:
+; AVX1_FAST:       # %bb.0:
+; AVX1_FAST-NEXT:    vphsubw %xmm0, %xmm0, %xmm0
+; AVX1_FAST-NEXT:    retq
+;
+; AVX2_SLOW-LABEL: hsub_v8i16:
+; AVX2_SLOW:       # %bb.0:
+; AVX2_SLOW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX2_SLOW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; AVX2_SLOW-NEXT:    vpsubw %xmm0, %xmm1, %xmm0
+; AVX2_SLOW-NEXT:    vpbroadcastq %xmm0, %xmm0
+; AVX2_SLOW-NEXT:    retq
+;
+; AVX2_FAST-LABEL: hsub_v8i16:
+; AVX2_FAST:       # %bb.0:
+; AVX2_FAST-NEXT:    vphsubw %xmm0, %xmm0, %xmm0
+; AVX2_FAST-NEXT:    retq
   %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
   %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
   %hop = sub <8 x i16> %a0246, %a1357
@@ -476,25 +946,64 @@ define <16 x i16> @hsub_v16i16a(<16 x i16> %a) {
 }
 
 define <16 x i16> @hsub_v16i16b(<16 x i16> %a) {
-; SSSE3-LABEL: hsub_v16i16b:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    phsubw %xmm0, %xmm0
-; SSSE3-NEXT:    phsubw %xmm1, %xmm1
-; SSSE3-NEXT:    retq
-;
-; AVX1-LABEL: hsub_v16i16b:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vphsubw %xmm0, %xmm0, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vphsubw %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: hsub_v16i16b:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vphsubw %ymm0, %ymm0, %ymm0
-; AVX2-NEXT:    retq
+; SSSE3_SLOW-LABEL: hsub_v16i16b:
+; SSSE3_SLOW:       # %bb.0:
+; SSSE3_SLOW-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3_SLOW-NEXT:    movdqa %xmm0, %xmm3
+; SSSE3_SLOW-NEXT:    pshufb %xmm2, %xmm3
+; SSSE3_SLOW-NEXT:    movdqa %xmm1, %xmm4
+; SSSE3_SLOW-NEXT:    pshufb %xmm2, %xmm4
+; SSSE3_SLOW-NEXT:    movdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; SSSE3_SLOW-NEXT:    pshufb %xmm2, %xmm0
+; SSSE3_SLOW-NEXT:    psubw %xmm0, %xmm3
+; SSSE3_SLOW-NEXT:    pshufb %xmm2, %xmm1
+; SSSE3_SLOW-NEXT:    psubw %xmm1, %xmm4
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,1,0,1]
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[0,1,0,1]
+; SSSE3_SLOW-NEXT:    retq
+;
+; SSSE3_FAST-LABEL: hsub_v16i16b:
+; SSSE3_FAST:       # %bb.0:
+; SSSE3_FAST-NEXT:    phsubw %xmm0, %xmm0
+; SSSE3_FAST-NEXT:    phsubw %xmm1, %xmm1
+; SSSE3_FAST-NEXT:    retq
+;
+; AVX1_SLOW-LABEL: hsub_v16i16b:
+; AVX1_SLOW:       # %bb.0:
+; AVX1_SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1_SLOW-NEXT:    vpshufb %xmm1, %xmm0, %xmm2
+; AVX1_SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1_SLOW-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
+; AVX1_SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; AVX1_SLOW-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
+; AVX1_SLOW-NEXT:    vpsubw %xmm0, %xmm2, %xmm0
+; AVX1_SLOW-NEXT:    vpshufb %xmm4, %xmm3, %xmm2
+; AVX1_SLOW-NEXT:    vpsubw %xmm2, %xmm1, %xmm1
+; AVX1_SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1_SLOW-NEXT:    retq
+;
+; AVX1_FAST-LABEL: hsub_v16i16b:
+; AVX1_FAST:       # %bb.0:
+; AVX1_FAST-NEXT:    vphsubw %xmm0, %xmm0, %xmm1
+; AVX1_FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1_FAST-NEXT:    vphsubw %xmm0, %xmm0, %xmm0
+; AVX1_FAST-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1_FAST-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1_FAST-NEXT:    retq
+;
+; AVX2_SLOW-LABEL: hsub_v16i16b:
+; AVX2_SLOW:       # %bb.0:
+; AVX2_SLOW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2_SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31]
+; AVX2_SLOW-NEXT:    vpsubw %ymm0, %ymm1, %ymm0
+; AVX2_SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; AVX2_SLOW-NEXT:    retq
+;
+; AVX2_FAST-LABEL: hsub_v16i16b:
+; AVX2_FAST:       # %bb.0:
+; AVX2_FAST-NEXT:    vphsubw %ymm0, %ymm0, %ymm0
+; AVX2_FAST-NEXT:    retq
   %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef>
   %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 9, i32 11, i32 13, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
   %hop = sub <16 x i16> %a0, %a1
diff --git a/test/CodeGen/X86/haddsub-undef.ll b/test/CodeGen/X86/haddsub-undef.ll
index d7c0936a474..e0590a76615 100644
--- a/test/CodeGen/X86/haddsub-undef.ll
+++ b/test/CodeGen/X86/haddsub-undef.ll
@@ -1,7 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3           | FileCheck %s --check-prefixes=SSE,SSE-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSE-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx             | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops   | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2            | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops  | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
 
 ; Verify that we correctly fold horizontal binop even in the presence of UNDEFs.
 
@@ -339,8 +342,6 @@ define <8 x i32> @test14_undef(<8 x i32> %a, <8 x i32> %b) {
   ret <8 x i32> %vecinit5
 }
 
-; On AVX2, the following sequence can be folded into a single horizontal add.
-; If the Subtarget doesn't support AVX2, then we avoid emitting two packed
 ; integer horizontal adds instead of two scalar adds followed by vector inserts.
 define <8 x i32> @test15_undef(<8 x i32> %a, <8 x i32> %b) {
 ; SSE-LABEL: test15_undef:
@@ -451,15 +452,38 @@ define <8 x i32> @test17_undef(<8 x i32> %a, <8 x i32> %b) {
 }
 
 define <2 x double> @add_pd_003(<2 x double> %x) {
-; SSE-LABEL: add_pd_003:
-; SSE:       # %bb.0:
-; SSE-NEXT:    haddpd %xmm0, %xmm0
-; SSE-NEXT:    retq
+; SSE-SLOW-LABEL: add_pd_003:
+; SSE-SLOW:       # %bb.0:
+; SSE-SLOW-NEXT:    movddup {{.*#+}} xmm1 = xmm0[0,0]
+; SSE-SLOW-NEXT:    addpd %xmm1, %xmm0
+; SSE-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: add_pd_003:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSE-FAST-LABEL: add_pd_003:
+; SSE-FAST:       # %bb.0:
+; SSE-FAST-NEXT:    haddpd %xmm0, %xmm0
+; SSE-FAST-NEXT:    retq
+;
+; AVX1-SLOW-LABEL: add_pd_003:
+; AVX1-SLOW:       # %bb.0:
+; AVX1-SLOW-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
+; AVX1-SLOW-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT:    retq
+;
+; AVX1-FAST-LABEL: add_pd_003:
+; AVX1-FAST:       # %bb.0:
+; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: add_pd_003:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
+; AVX2-SLOW-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: add_pd_003:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    retq
   %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 undef, i32 0>
   %add = fadd <2 x double> %l, %x
   ret <2 x double> %add
@@ -468,31 +492,84 @@ define <2 x double> @add_pd_003(<2 x double> %x) {
 ; Change shuffle mask - no undefs.
 
 define <2 x double> @add_pd_003_2(<2 x double> %x) {
-; SSE-LABEL: add_pd_003_2:
-; SSE:       # %bb.0:
-; SSE-NEXT:    haddpd %xmm0, %xmm0
-; SSE-NEXT:    retq
+; SSE-SLOW-LABEL: add_pd_003_2:
+; SSE-SLOW:       # %bb.0:
+; SSE-SLOW-NEXT:    movapd %xmm0, %xmm1
+; SSE-SLOW-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
+; SSE-SLOW-NEXT:    addpd %xmm0, %xmm1
+; SSE-SLOW-NEXT:    movapd %xmm1, %xmm0
+; SSE-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: add_pd_003_2:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSE-FAST-LABEL: add_pd_003_2:
+; SSE-FAST:       # %bb.0:
+; SSE-FAST-NEXT:    haddpd %xmm0, %xmm0
+; SSE-FAST-NEXT:    retq
+;
+; AVX1-SLOW-LABEL: add_pd_003_2:
+; AVX1-SLOW:       # %bb.0:
+; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX1-SLOW-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT:    retq
+;
+; AVX1-FAST-LABEL: add_pd_003_2:
+; AVX1-FAST:       # %bb.0:
+; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: add_pd_003_2:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX2-SLOW-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: add_pd_003_2:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    retq
   %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 0>
   %add = fadd <2 x double> %l, %x
   ret <2 x double> %add
 }
 
 define <2 x double> @add_pd_010(<2 x double> %x) {
-; SSE-LABEL: add_pd_010:
-; SSE:       # %bb.0:
-; SSE-NEXT:    haddpd %xmm0, %xmm0
-; SSE-NEXT:    retq
+; SSE-SLOW-LABEL: add_pd_010:
+; SSE-SLOW:       # %bb.0:
+; SSE-SLOW-NEXT:    movddup {{.*#+}} xmm1 = xmm0[0,0]
+; SSE-SLOW-NEXT:    addpd %xmm0, %xmm1
+; SSE-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
+; SSE-SLOW-NEXT:    movapd %xmm1, %xmm0
+; SSE-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: add_pd_010:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX-NEXT:    retq
+; SSE-FAST-LABEL: add_pd_010:
+; SSE-FAST:       # %bb.0:
+; SSE-FAST-NEXT:    haddpd %xmm0, %xmm0
+; SSE-FAST-NEXT:    retq
+;
+; AVX1-SLOW-LABEL: add_pd_010:
+; AVX1-SLOW:       # %bb.0:
+; AVX1-SLOW-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
+; AVX1-SLOW-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-SLOW-NEXT:    retq
+;
+; AVX1-FAST-LABEL: add_pd_010:
+; AVX1-FAST:       # %bb.0:
+; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-FAST-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: add_pd_010:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
+; AVX2-SLOW-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: add_pd_010:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-FAST-NEXT:    retq
   %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 undef, i32 0>
   %add = fadd <2 x double> %l, %x
   %shuffle2 = shufflevector <2 x double> %add, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
@@ -500,15 +577,42 @@ define <2 x double> @add_pd_010(<2 x double> %x) {
 }
 
 define <4 x float> @add_ps_007(<4 x float> %x) {
-; SSE-LABEL: add_ps_007:
-; SSE:       # %bb.0:
-; SSE-NEXT:    haddps %xmm0, %xmm0
-; SSE-NEXT:    retq
+; SSE-SLOW-LABEL: add_ps_007:
+; SSE-SLOW:       # %bb.0:
+; SSE-SLOW-NEXT:    movaps %xmm0, %xmm1
+; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
+; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSE-SLOW-NEXT:    addps %xmm1, %xmm0
+; SSE-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: add_ps_007:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSE-FAST-LABEL: add_ps_007:
+; SSE-FAST:       # %bb.0:
+; SSE-FAST-NEXT:    haddps %xmm0, %xmm0
+; SSE-FAST-NEXT:    retq
+;
+; AVX1-SLOW-LABEL: add_ps_007:
+; AVX1-SLOW:       # %bb.0:
+; AVX1-SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,2]
+; AVX1-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX1-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT:    retq
+;
+; AVX1-FAST-LABEL: add_ps_007:
+; AVX1-FAST:       # %bb.0:
+; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: add_ps_007:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,2]
+; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX2-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: add_ps_007:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    retq
   %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
   %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
   %add = fadd <4 x float> %l, %r
@@ -516,17 +620,48 @@ define <4 x float> @add_ps_007(<4 x float> %x) {
 }
 
 define <4 x float> @add_ps_030(<4 x float> %x) {
-; SSE-LABEL: add_ps_030:
-; SSE:       # %bb.0:
-; SSE-NEXT:    haddps %xmm0, %xmm0
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; SSE-NEXT:    retq
+; SSE-SLOW-LABEL: add_ps_030:
+; SSE-SLOW:       # %bb.0:
+; SSE-SLOW-NEXT:    movaps %xmm0, %xmm1
+; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
+; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSE-SLOW-NEXT:    addps %xmm1, %xmm0
+; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,2,3]
+; SSE-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: add_ps_030:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; AVX-NEXT:    retq
+; SSE-FAST-LABEL: add_ps_030:
+; SSE-FAST:       # %bb.0:
+; SSE-FAST-NEXT:    haddps %xmm0, %xmm0
+; SSE-FAST-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,2,3]
+; SSE-FAST-NEXT:    retq
+;
+; AVX1-SLOW-LABEL: add_ps_030:
+; AVX1-SLOW:       # %bb.0:
+; AVX1-SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,2]
+; AVX1-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX1-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,2,3]
+; AVX1-SLOW-NEXT:    retq
+;
+; AVX1-FAST-LABEL: add_ps_030:
+; AVX1-FAST:       # %bb.0:
+; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,2,3]
+; AVX1-FAST-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: add_ps_030:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,2]
+; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX2-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,2,3]
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: add_ps_030:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,2,3]
+; AVX2-FAST-NEXT:    retq
   %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
   %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
   %add = fadd <4 x float> %l, %r
@@ -535,15 +670,41 @@ define <4 x float> @add_ps_030(<4 x float> %x) {
 }
 
 define <4 x float> @add_ps_007_2(<4 x float> %x) {
-; SSE-LABEL: add_ps_007_2:
-; SSE:       # %bb.0:
-; SSE-NEXT:    haddps %xmm0, %xmm0
-; SSE-NEXT:    retq
+; SSE-SLOW-LABEL: add_ps_007_2:
+; SSE-SLOW:       # %bb.0:
+; SSE-SLOW-NEXT:    movddup {{.*#+}} xmm1 = xmm0[0,0]
+; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSE-SLOW-NEXT:    addps %xmm1, %xmm0
+; SSE-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: add_ps_007_2:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSE-FAST-LABEL: add_ps_007_2:
+; SSE-FAST:       # %bb.0:
+; SSE-FAST-NEXT:    haddps %xmm0, %xmm0
+; SSE-FAST-NEXT:    retq
+;
+; AVX1-SLOW-LABEL: add_ps_007_2:
+; AVX1-SLOW:       # %bb.0:
+; AVX1-SLOW-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
+; AVX1-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX1-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT:    retq
+;
+; AVX1-FAST-LABEL: add_ps_007_2:
+; AVX1-FAST:       # %bb.0:
+; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: add_ps_007_2:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
+; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX2-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: add_ps_007_2:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    retq
   %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
   %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
   %add = fadd <4 x float> %l, %r
@@ -551,32 +712,83 @@ define <4 x float> @add_ps_007_2(<4 x float> %x) {
 }
 
 define <4 x float> @add_ps_008(<4 x float> %x) {
-; SSE-LABEL: add_ps_008:
-; SSE:       # %bb.0:
-; SSE-NEXT:    haddps %xmm0, %xmm0
-; SSE-NEXT:    retq
+; SSE-SLOW-LABEL: add_ps_008:
+; SSE-SLOW:       # %bb.0:
+; SSE-SLOW-NEXT:    movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE-SLOW-NEXT:    addps %xmm1, %xmm0
+; SSE-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: add_ps_008:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSE-FAST-LABEL: add_ps_008:
+; SSE-FAST:       # %bb.0:
+; SSE-FAST-NEXT:    haddps %xmm0, %xmm0
+; SSE-FAST-NEXT:    retq
+;
+; AVX1-SLOW-LABEL: add_ps_008:
+; AVX1-SLOW:       # %bb.0:
+; AVX1-SLOW-NEXT:    vmovsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; AVX1-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT:    retq
+;
+; AVX1-FAST-LABEL: add_ps_008:
+; AVX1-FAST:       # %bb.0:
+; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: add_ps_008:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vmovsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; AVX2-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: add_ps_008:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    retq
   %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
   %add = fadd <4 x float> %l, %x
   ret <4 x float> %add
 }
 
 define <4 x float> @add_ps_017(<4 x float> %x) {
-; SSE-LABEL: add_ps_017:
-; SSE:       # %bb.0:
-; SSE-NEXT:    haddps %xmm0, %xmm0
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; SSE-NEXT:    retq
+; SSE-SLOW-LABEL: add_ps_017:
+; SSE-SLOW:       # %bb.0:
+; SSE-SLOW-NEXT:    movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE-SLOW-NEXT:    addps %xmm0, %xmm1
+; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; SSE-SLOW-NEXT:    movaps %xmm1, %xmm0
+; SSE-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: add_ps_017:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX-NEXT:    retq
+; SSE-FAST-LABEL: add_ps_017:
+; SSE-FAST:       # %bb.0:
+; SSE-FAST-NEXT:    haddps %xmm0, %xmm0
+; SSE-FAST-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE-FAST-NEXT:    retq
+;
+; AVX1-SLOW-LABEL: add_ps_017:
+; AVX1-SLOW:       # %bb.0:
+; AVX1-SLOW-NEXT:    vmovsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; AVX1-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-SLOW-NEXT:    retq
+;
+; AVX1-FAST-LABEL: add_ps_017:
+; AVX1-FAST:       # %bb.0:
+; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-FAST-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: add_ps_017:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vmovsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; AVX2-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: add_ps_017:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX2-FAST-NEXT:    retq
   %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
   %add = fadd <4 x float> %l, %x
   %shuffle2 = shufflevector <4 x float> %add, <4 x float> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
@@ -584,17 +796,47 @@ define <4 x float> @add_ps_017(<4 x float> %x) {
 }
 
 define <4 x float> @add_ps_018(<4 x float> %x) {
-; SSE-LABEL: add_ps_018:
-; SSE:       # %bb.0:
-; SSE-NEXT:    haddps %xmm0, %xmm0
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT:    retq
+; SSE-SLOW-LABEL: add_ps_018:
+; SSE-SLOW:       # %bb.0:
+; SSE-SLOW-NEXT:    movddup {{.*#+}} xmm1 = xmm0[0,0]
+; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSE-SLOW-NEXT:    addps %xmm1, %xmm0
+; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: add_ps_018:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX-NEXT:    retq
+; SSE-FAST-LABEL: add_ps_018:
+; SSE-FAST:       # %bb.0:
+; SSE-FAST-NEXT:    haddps %xmm0, %xmm0
+; SSE-FAST-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-FAST-NEXT:    retq
+;
+; AVX1-SLOW-LABEL: add_ps_018:
+; AVX1-SLOW:       # %bb.0:
+; AVX1-SLOW-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
+; AVX1-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX1-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-SLOW-NEXT:    retq
+;
+; AVX1-FAST-LABEL: add_ps_018:
+; AVX1-FAST:       # %bb.0:
+; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-FAST-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: add_ps_018:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
+; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX2-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: add_ps_018:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-FAST-NEXT:    retq
   %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
   %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
   %add = fadd <4 x float> %l, %r
diff --git a/test/CodeGen/X86/haddsub.ll b/test/CodeGen/X86/haddsub.ll
index 030de9c7f14..6221d4e43bc 100644
--- a/test/CodeGen/X86/haddsub.ll
+++ b/test/CodeGen/X86/haddsub.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3 | FileCheck %s --check-prefix=SSE3
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3           | FileCheck %s --check-prefixes=SSE3,SSE3-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3,fast-hops | FileCheck %s --check-prefixes=SSE3,SSE3-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx            | FileCheck %s --check-prefixes=AVX,AVX-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops  | FileCheck %s --check-prefixes=AVX,AVX-FAST
 
 define <2 x double> @haddpd1(<2 x double> %x, <2 x double> %y) {
 ; SSE3-LABEL: haddpd1:
@@ -35,15 +37,29 @@ define <2 x double> @haddpd2(<2 x double> %x, <2 x double> %y) {
 }
 
 define <2 x double> @haddpd3(<2 x double> %x) {
-; SSE3-LABEL: haddpd3:
-; SSE3:       # %bb.0:
-; SSE3-NEXT:    haddpd %xmm0, %xmm0
-; SSE3-NEXT:    retq
+; SSE3-SLOW-LABEL: haddpd3:
+; SSE3-SLOW:       # %bb.0:
+; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
+; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE3-SLOW-NEXT:    addpd %xmm0, %xmm1
+; SSE3-SLOW-NEXT:    movapd %xmm1, %xmm0
+; SSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: haddpd3:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSE3-FAST-LABEL: haddpd3:
+; SSE3-FAST:       # %bb.0:
+; SSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
+; SSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: haddpd3:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-SLOW-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: haddpd3:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
   %b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
   %r = fadd <2 x double> %a, %b
@@ -83,15 +99,30 @@ define <4 x float> @haddps2(<4 x float> %x, <4 x float> %y) {
 }
 
 define <4 x float> @haddps3(<4 x float> %x) {
-; SSE3-LABEL: haddps3:
-; SSE3:       # %bb.0:
-; SSE3-NEXT:    haddps %xmm0, %xmm0
-; SSE3-NEXT:    retq
+; SSE3-SLOW-LABEL: haddps3:
+; SSE3-SLOW:       # %bb.0:
+; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
+; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
+; SSE3-SLOW-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE3-SLOW-NEXT:    addps %xmm1, %xmm0
+; SSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: haddps3:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSE3-FAST-LABEL: haddps3:
+; SSE3-FAST:       # %bb.0:
+; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
+; SSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: haddps3:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: haddps3:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
   %r = fadd <4 x float> %a, %b
@@ -99,15 +130,30 @@ define <4 x float> @haddps3(<4 x float> %x) {
 }
 
 define <4 x float> @haddps4(<4 x float> %x) {
-; SSE3-LABEL: haddps4:
-; SSE3:       # %bb.0:
-; SSE3-NEXT:    haddps %xmm0, %xmm0
-; SSE3-NEXT:    retq
+; SSE3-SLOW-LABEL: haddps4:
+; SSE3-SLOW:       # %bb.0:
+; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
+; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
+; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE3-SLOW-NEXT:    addps %xmm1, %xmm0
+; SSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: haddps4:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSE3-FAST-LABEL: haddps4:
+; SSE3-FAST:       # %bb.0:
+; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
+; SSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: haddps4:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: haddps4:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
   %r = fadd <4 x float> %a, %b
@@ -115,15 +161,30 @@ define <4 x float> @haddps4(<4 x float> %x) {
 }
 
 define <4 x float> @haddps5(<4 x float> %x) {
-; SSE3-LABEL: haddps5:
-; SSE3:       # %bb.0:
-; SSE3-NEXT:    haddps %xmm0, %xmm0
-; SSE3-NEXT:    retq
+; SSE3-SLOW-LABEL: haddps5:
+; SSE3-SLOW:       # %bb.0:
+; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
+; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,3]
+; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,2,2,3]
+; SSE3-SLOW-NEXT:    addps %xmm1, %xmm0
+; SSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: haddps5:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSE3-FAST-LABEL: haddps5:
+; SSE3-FAST:       # %bb.0:
+; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
+; SSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: haddps5:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,3,2,3]
+; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,2,2,3]
+; AVX-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: haddps5:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 undef>
   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 undef, i32 undef>
   %r = fadd <4 x float> %a, %b
@@ -131,15 +192,27 @@ define <4 x float> @haddps5(<4 x float> %x) {
 }
 
 define <4 x float> @haddps6(<4 x float> %x) {
-; SSE3-LABEL: haddps6:
-; SSE3:       # %bb.0:
-; SSE3-NEXT:    haddps %xmm0, %xmm0
-; SSE3-NEXT:    retq
+; SSE3-SLOW-LABEL: haddps6:
+; SSE3-SLOW:       # %bb.0:
+; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE3-SLOW-NEXT:    addps %xmm1, %xmm0
+; SSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: haddps6:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSE3-FAST-LABEL: haddps6:
+; SSE3-FAST:       # %bb.0:
+; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
+; SSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: haddps6:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: haddps6:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
   %r = fadd <4 x float> %a, %b
@@ -147,15 +220,30 @@ define <4 x float> @haddps6(<4 x float> %x) {
 }
 
 define <4 x float> @haddps7(<4 x float> %x) {
-; SSE3-LABEL: haddps7:
-; SSE3:       # %bb.0:
-; SSE3-NEXT:    haddps %xmm0, %xmm0
-; SSE3-NEXT:    retq
+; SSE3-SLOW-LABEL: haddps7:
+; SSE3-SLOW:       # %bb.0:
+; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
+; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE3-SLOW-NEXT:    addps %xmm1, %xmm0
+; SSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: haddps7:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSE3-FAST-LABEL: haddps7:
+; SSE3-FAST:       # %bb.0:
+; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
+; SSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: haddps7:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: haddps7:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
   %r = fadd <4 x float> %a, %b
@@ -179,15 +267,28 @@ define <2 x double> @hsubpd1(<2 x double> %x, <2 x double> %y) {
 }
 
 define <2 x double> @hsubpd2(<2 x double> %x) {
-; SSE3-LABEL: hsubpd2:
-; SSE3:       # %bb.0:
-; SSE3-NEXT:    hsubpd %xmm0, %xmm0
-; SSE3-NEXT:    retq
+; SSE3-SLOW-LABEL: hsubpd2:
+; SSE3-SLOW:       # %bb.0:
+; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
+; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE3-SLOW-NEXT:    subpd %xmm1, %xmm0
+; SSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: hsubpd2:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSE3-FAST-LABEL: hsubpd2:
+; SSE3-FAST:       # %bb.0:
+; SSE3-FAST-NEXT:    hsubpd %xmm0, %xmm0
+; SSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: hsubpd2:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-SLOW-NEXT:    vsubpd %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: hsubpd2:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
   %b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
   %r = fsub <2 x double> %a, %b
@@ -211,15 +312,31 @@ define <4 x float> @hsubps1(<4 x float> %x, <4 x float> %y) {
 }
 
 define <4 x float> @hsubps2(<4 x float> %x) {
-; SSE3-LABEL: hsubps2:
-; SSE3:       # %bb.0:
-; SSE3-NEXT:    hsubps %xmm0, %xmm0
-; SSE3-NEXT:    retq
+; SSE3-SLOW-LABEL: hsubps2:
+; SSE3-SLOW:       # %bb.0:
+; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
+; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
+; SSE3-SLOW-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE3-SLOW-NEXT:    subps %xmm0, %xmm1
+; SSE3-SLOW-NEXT:    movaps %xmm1, %xmm0
+; SSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: hsubps2:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSE3-FAST-LABEL: hsubps2:
+; SSE3-FAST:       # %bb.0:
+; SSE3-FAST-NEXT:    hsubps %xmm0, %xmm0
+; SSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: hsubps2:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX-SLOW-NEXT:    vsubps %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: hsubps2:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
   %r = fsub <4 x float> %a, %b
@@ -227,15 +344,31 @@ define <4 x float> @hsubps2(<4 x float> %x) {
 }
 
 define <4 x float> @hsubps3(<4 x float> %x) {
-; SSE3-LABEL: hsubps3:
-; SSE3:       # %bb.0:
-; SSE3-NEXT:    hsubps %xmm0, %xmm0
-; SSE3-NEXT:    retq
+; SSE3-SLOW-LABEL: hsubps3:
+; SSE3-SLOW:       # %bb.0:
+; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
+; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
+; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE3-SLOW-NEXT:    subps %xmm0, %xmm1
+; SSE3-SLOW-NEXT:    movaps %xmm1, %xmm0
+; SSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: hsubps3:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSE3-FAST-LABEL: hsubps3:
+; SSE3-FAST:       # %bb.0:
+; SSE3-FAST-NEXT:    hsubps %xmm0, %xmm0
+; SSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: hsubps3:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX-SLOW-NEXT:    vsubps %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: hsubps3:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
   %r = fsub <4 x float> %a, %b
@@ -243,15 +376,27 @@ define <4 x float> @hsubps3(<4 x float> %x) {
 }
 
 define <4 x float> @hsubps4(<4 x float> %x) {
-; SSE3-LABEL: hsubps4:
-; SSE3:       # %bb.0:
-; SSE3-NEXT:    hsubps %xmm0, %xmm0
-; SSE3-NEXT:    retq
+; SSE3-SLOW-LABEL: hsubps4:
+; SSE3-SLOW:       # %bb.0:
+; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE3-SLOW-NEXT:    subps %xmm1, %xmm0
+; SSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: hsubps4:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSE3-FAST-LABEL: hsubps4:
+; SSE3-FAST:       # %bb.0:
+; SSE3-FAST-NEXT:    hsubps %xmm0, %xmm0
+; SSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: hsubps4:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-SLOW-NEXT:    vsubps %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: hsubps4:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
   %r = fsub <4 x float> %a, %b
@@ -293,16 +438,35 @@ define <8 x float> @vhaddps2(<8 x float> %x, <8 x float> %y) {
 }
 
 define <8 x float> @vhaddps3(<8 x float> %x) {
-; SSE3-LABEL: vhaddps3:
-; SSE3:       # %bb.0:
-; SSE3-NEXT:    haddps %xmm0, %xmm0
-; SSE3-NEXT:    haddps %xmm1, %xmm1
-; SSE3-NEXT:    retq
+; SSE3-SLOW-LABEL: vhaddps3:
+; SSE3-SLOW:       # %bb.0:
+; SSE3-SLOW-NEXT:    movaps %xmm1, %xmm2
+; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
+; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm3
+; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[2,3]
+; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE3-SLOW-NEXT:    addps %xmm2, %xmm1
+; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE3-SLOW-NEXT:    addps %xmm3, %xmm0
+; SSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: vhaddps3:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
-; AVX-NEXT:    retq
+; SSE3-FAST-LABEL: vhaddps3:
+; SSE3-FAST:       # %bb.0:
+; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
+; SSE3-FAST-NEXT:    haddps %xmm1, %xmm1
+; SSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: vhaddps3:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
+; AVX-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
+; AVX-SLOW-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: vhaddps3:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
+; AVX-FAST-NEXT:    retq
   %a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
   %b = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
   %r = fadd <8 x float> %a, %b
@@ -327,16 +491,37 @@ define <8 x float> @vhsubps1(<8 x float> %x, <8 x float> %y) {
 }
 
 define <8 x float> @vhsubps3(<8 x float> %x) {
-; SSE3-LABEL: vhsubps3:
-; SSE3:       # %bb.0:
-; SSE3-NEXT:    hsubps %xmm0, %xmm0
-; SSE3-NEXT:    hsubps %xmm1, %xmm1
-; SSE3-NEXT:    retq
+; SSE3-SLOW-LABEL: vhsubps3:
+; SSE3-SLOW:       # %bb.0:
+; SSE3-SLOW-NEXT:    movaps %xmm1, %xmm2
+; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
+; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm3
+; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[2,3]
+; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE3-SLOW-NEXT:    subps %xmm1, %xmm2
+; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE3-SLOW-NEXT:    subps %xmm0, %xmm3
+; SSE3-SLOW-NEXT:    movaps %xmm3, %xmm0
+; SSE3-SLOW-NEXT:    movaps %xmm2, %xmm1
+; SSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: vhsubps3:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhsubps %ymm0, %ymm0, %ymm0
-; AVX-NEXT:    retq
+; SSE3-FAST-LABEL: vhsubps3:
+; SSE3-FAST:       # %bb.0:
+; SSE3-FAST-NEXT:    hsubps %xmm0, %xmm0
+; SSE3-FAST-NEXT:    hsubps %xmm1, %xmm1
+; SSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: vhsubps3:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
+; AVX-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
+; AVX-SLOW-NEXT:    vsubps %ymm0, %ymm1, %ymm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: vhsubps3:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vhsubps %ymm0, %ymm0, %ymm0
+; AVX-FAST-NEXT:    retq
   %a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
   %b = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
   %r = fsub <8 x float> %a, %b
diff --git a/test/CodeGen/X86/madd.ll b/test/CodeGen/X86/madd.ll
index 30320a750e0..c36faecbf85 100644
--- a/test/CodeGen/X86/madd.ll
+++ b/test/CodeGen/X86/madd.ll
@@ -50,7 +50,8 @@ define i32 @_Z10test_shortPsS_i_128(i16* nocapture readonly, i16* nocapture read
 ; AVX-NEXT:  # %bb.2: # %middle.block
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    retq
 entry:
@@ -129,7 +130,8 @@ define i32 @_Z10test_shortPsS_i_256(i16* nocapture readonly, i16* nocapture read
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -153,7 +155,8 @@ define i32 @_Z10test_shortPsS_i_256(i16* nocapture readonly, i16* nocapture read
 ; AVX256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX256-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX256-NEXT:    vmovd %xmm0, %eax
 ; AVX256-NEXT:    vzeroupper
 ; AVX256-NEXT:    retq
@@ -252,7 +255,8 @@ define i32 @_Z10test_shortPsS_i_512(i16* nocapture readonly, i16* nocapture read
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -278,7 +282,8 @@ define i32 @_Z10test_shortPsS_i_512(i16* nocapture readonly, i16* nocapture read
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -437,7 +442,8 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -469,7 +475,8 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -620,7 +627,8 @@ define i32 @_Z9test_charPcS_i_128(i8* nocapture readonly, i8* nocapture readonly
 ; AVX-NEXT:  # %bb.2: # %middle.block
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    retq
 entry:
@@ -704,7 +712,8 @@ define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -729,7 +738,8 @@ define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly
 ; AVX256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX256-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX256-NEXT:    vmovd %xmm0, %eax
 ; AVX256-NEXT:    vzeroupper
 ; AVX256-NEXT:    retq
@@ -836,7 +846,8 @@ define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -863,7 +874,8 @@ define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -1039,7 +1051,8 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -1073,7 +1086,8 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -1222,7 +1236,8 @@ define i32 @test_unsigned_short_128(i16* nocapture readonly, i16* nocapture read
 ; AVX-NEXT:  # %bb.2: # %middle.block
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    retq
 entry:
@@ -1313,7 +1328,8 @@ define i32 @test_unsigned_short_256(i16* nocapture readonly, i16* nocapture read
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -1338,7 +1354,8 @@ define i32 @test_unsigned_short_256(i16* nocapture readonly, i16* nocapture read
 ; AVX256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX256-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX256-NEXT:    vmovd %xmm0, %eax
 ; AVX256-NEXT:    vzeroupper
 ; AVX256-NEXT:    retq
@@ -1460,7 +1477,8 @@ define i32 @test_unsigned_short_512(i16* nocapture readonly, i16* nocapture read
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -1491,7 +1509,8 @@ define i32 @test_unsigned_short_512(i16* nocapture readonly, i16* nocapture read
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -1699,7 +1718,8 @@ define i32 @test_unsigned_short_1024(i16* nocapture readonly, i16* nocapture rea
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm8, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -1742,7 +1762,8 @@ define i32 @test_unsigned_short_1024(i16* nocapture readonly, i16* nocapture rea
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -2692,7 +2713,8 @@ define i32 @madd_double_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>*
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    retq
 ;
@@ -2707,7 +2729,8 @@ define i32 @madd_double_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>*
 ; AVX256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX256-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX256-NEXT:    vmovd %xmm0, %eax
 ; AVX256-NEXT:    vzeroupper
 ; AVX256-NEXT:    retq
diff --git a/test/CodeGen/X86/phaddsub.ll b/test/CodeGen/X86/phaddsub.ll
index 7b3f8db76c4..b4ff08cd254 100644
--- a/test/CodeGen/X86/phaddsub.ll
+++ b/test/CodeGen/X86/phaddsub.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSSE3
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3           | FileCheck %s --check-prefixes=SSSE3,SSSE3-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3,SSSE3-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx             | FileCheck %s --check-prefixes=AVX,AVX-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops   | FileCheck %s --check-prefixes=AVX,AVX-FAST
 
 define <8 x i16> @phaddw1(<8 x i16> %x, <8 x i16> %y) {
 ; SSSE3-LABEL: phaddw1:
@@ -67,15 +69,29 @@ define <4 x i32> @phaddd2(<4 x i32> %x, <4 x i32> %y) {
 }
 
 define <4 x i32> @phaddd3(<4 x i32> %x) {
-; SSSE3-LABEL: phaddd3:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    phaddd %xmm0, %xmm0
-; SSSE3-NEXT:    retq
+; SSSE3-SLOW-LABEL: phaddd3:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: phaddd3:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSSE3-FAST-LABEL: phaddd3:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: phaddd3:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: phaddd3:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
   %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
   %r = add <4 x i32> %a, %b
@@ -83,15 +99,29 @@ define <4 x i32> @phaddd3(<4 x i32> %x) {
 }
 
 define <4 x i32> @phaddd4(<4 x i32> %x) {
-; SSSE3-LABEL: phaddd4:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    phaddd %xmm0, %xmm0
-; SSSE3-NEXT:    retq
+; SSSE3-SLOW-LABEL: phaddd4:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: phaddd4:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSSE3-FAST-LABEL: phaddd4:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: phaddd4:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: phaddd4:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
   %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
   %r = add <4 x i32> %a, %b
@@ -99,15 +129,29 @@ define <4 x i32> @phaddd4(<4 x i32> %x) {
 }
 
 define <4 x i32> @phaddd5(<4 x i32> %x) {
-; SSSE3-LABEL: phaddd5:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    phaddd %xmm0, %xmm0
-; SSSE3-NEXT:    retq
+; SSSE3-SLOW-LABEL: phaddd5:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,3,2,3]
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,2,2,3]
+; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: phaddd5:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSSE3-FAST-LABEL: phaddd5:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: phaddd5:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,3,2,3]
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3]
+; AVX-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: phaddd5:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 undef>
   %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 undef, i32 undef>
   %r = add <4 x i32> %a, %b
@@ -115,15 +159,27 @@ define <4 x i32> @phaddd5(<4 x i32> %x) {
 }
 
 define <4 x i32> @phaddd6(<4 x i32> %x) {
-; SSSE3-LABEL: phaddd6:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    phaddd %xmm0, %xmm0
-; SSSE3-NEXT:    retq
+; SSSE3-SLOW-LABEL: phaddd6:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: phaddd6:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSSE3-FAST-LABEL: phaddd6:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: phaddd6:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: phaddd6:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
   %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
   %r = add <4 x i32> %a, %b
@@ -131,15 +187,29 @@ define <4 x i32> @phaddd6(<4 x i32> %x) {
 }
 
 define <4 x i32> @phaddd7(<4 x i32> %x) {
-; SSSE3-LABEL: phaddd7:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    phaddd %xmm0, %xmm0
-; SSSE3-NEXT:    retq
+; SSSE3-SLOW-LABEL: phaddd7:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: phaddd7:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSSE3-FAST-LABEL: phaddd7:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: phaddd7:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: phaddd7:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
   %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
   %r = add <4 x i32> %a, %b
@@ -179,15 +249,30 @@ define <4 x i32> @phsubd1(<4 x i32> %x, <4 x i32> %y) {
 }
 
 define <4 x i32> @phsubd2(<4 x i32> %x) {
-; SSSE3-LABEL: phsubd2:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    phsubd %xmm0, %xmm0
-; SSSE3-NEXT:    retq
+; SSSE3-SLOW-LABEL: phsubd2:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSSE3-SLOW-NEXT:    psubd %xmm0, %xmm1
+; SSSE3-SLOW-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: phsubd2:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSSE3-FAST-LABEL: phsubd2:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    phsubd %xmm0, %xmm0
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: phsubd2:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-SLOW-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: phsubd2:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
   %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
   %r = sub <4 x i32> %a, %b
@@ -195,15 +280,30 @@ define <4 x i32> @phsubd2(<4 x i32> %x) {
 }
 
 define <4 x i32> @phsubd3(<4 x i32> %x) {
-; SSSE3-LABEL: phsubd3:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    phsubd %xmm0, %xmm0
-; SSSE3-NEXT:    retq
+; SSSE3-SLOW-LABEL: phsubd3:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3-SLOW-NEXT:    psubd %xmm0, %xmm1
+; SSSE3-SLOW-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: phsubd3:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSSE3-FAST-LABEL: phsubd3:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    phsubd %xmm0, %xmm0
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: phsubd3:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX-SLOW-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: phsubd3:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
   %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
   %r = sub <4 x i32> %a, %b
@@ -211,15 +311,27 @@ define <4 x i32> @phsubd3(<4 x i32> %x) {
 }
 
 define <4 x i32> @phsubd4(<4 x i32> %x) {
-; SSSE3-LABEL: phsubd4:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    phsubd %xmm0, %xmm0
-; SSSE3-NEXT:    retq
+; SSSE3-SLOW-LABEL: phsubd4:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSSE3-SLOW-NEXT:    psubd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: phsubd4:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSSE3-FAST-LABEL: phsubd4:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    phsubd %xmm0, %xmm0
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: phsubd4:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-SLOW-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: phsubd4:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
   %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
   %r = sub <4 x i32> %a, %b
@@ -284,15 +396,29 @@ define <4 x i32> @phsubd1_reverse(<4 x i32> %x, <4 x i32> %y) {
 }
 
 define <4 x i32> @phaddd_single_source1(<4 x i32> %x) {
-; SSSE3-LABEL: phaddd_single_source1:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    phaddd %xmm0, %xmm0
-; SSSE3-NEXT:    retq
+; SSSE3-SLOW-LABEL: phaddd_single_source1:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,0,2]
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: phaddd_single_source1:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSSE3-FAST-LABEL: phaddd_single_source1:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: phaddd_single_source1:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,2]
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: phaddd_single_source1:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
   %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
   %add = add <4 x i32> %l, %r
@@ -300,17 +426,33 @@ define <4 x i32> @phaddd_single_source1(<4 x i32> %x) {
 }
 
 define <4 x i32> @phaddd_single_source2(<4 x i32> %x) {
-; SSSE3-LABEL: phaddd_single_source2:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    phaddd %xmm0, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; SSSE3-NEXT:    retq
+; SSSE3-SLOW-LABEL: phaddd_single_source2:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,0,2]
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
+; SSSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: phaddd_single_source2:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; AVX-NEXT:    retq
+; SSSE3-FAST-LABEL: phaddd_single_source2:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: phaddd_single_source2:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,2]
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: phaddd_single_source2:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
+; AVX-FAST-NEXT:    retq
   %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
   %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
   %add = add <4 x i32> %l, %r
@@ -319,15 +461,29 @@ define <4 x i32> @phaddd_single_source2(<4 x i32> %x) {
 }
 
 define <4 x i32> @phaddd_single_source3(<4 x i32> %x) {
-; SSSE3-LABEL: phaddd_single_source3:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    phaddd %xmm0, %xmm0
-; SSSE3-NEXT:    retq
+; SSSE3-SLOW-LABEL: phaddd_single_source3:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: phaddd_single_source3:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSSE3-FAST-LABEL: phaddd_single_source3:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: phaddd_single_source3:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; AVX-SLOW-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: phaddd_single_source3:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
   %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
   %add = add <4 x i32> %l, %r
@@ -335,32 +491,58 @@ define <4 x i32> @phaddd_single_source3(<4 x i32> %x) {
 }
 
 define <4 x i32> @phaddd_single_source4(<4 x i32> %x) {
-; SSSE3-LABEL: phaddd_single_source4:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    phaddd %xmm0, %xmm0
-; SSSE3-NEXT:    retq
+; SSSE3-SLOW-LABEL: phaddd_single_source4:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,2,2]
+; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: phaddd_single_source4:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSSE3-FAST-LABEL: phaddd_single_source4:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: phaddd_single_source4:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,2,2]
+; AVX-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: phaddd_single_source4:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
   %add = add <4 x i32> %l, %x
   ret <4 x i32> %add
 }
 
 define <4 x i32> @phaddd_single_source5(<4 x i32> %x) {
-; SSSE3-LABEL: phaddd_single_source5:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    phaddd %xmm0, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; SSSE3-NEXT:    retq
+; SSSE3-SLOW-LABEL: phaddd_single_source5:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,2,2]
+; SSSE3-SLOW-NEXT:    paddd %xmm0, %xmm1
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
+; SSSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: phaddd_single_source5:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX-NEXT:    retq
+; SSSE3-FAST-LABEL: phaddd_single_source5:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: phaddd_single_source5:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,2,2]
+; AVX-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: phaddd_single_source5:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX-FAST-NEXT:    retq
   %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
   %add = add <4 x i32> %l, %x
   %shuffle2 = shufflevector <4 x i32> %add, <4 x i32> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
@@ -368,17 +550,33 @@ define <4 x i32> @phaddd_single_source5(<4 x i32> %x) {
 }
 
 define <4 x i32> @phaddd_single_source6(<4 x i32> %x) {
-; SSSE3-LABEL: phaddd_single_source6:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    phaddd %xmm0, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; SSSE3-NEXT:    retq
+; SSSE3-SLOW-LABEL: phaddd_single_source6:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: phaddd_single_source6:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; AVX-NEXT:    retq
+; SSSE3-FAST-LABEL: phaddd_single_source6:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: phaddd_single_source6:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; AVX-SLOW-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: phaddd_single_source6:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX-FAST-NEXT:    retq
   %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
   %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
   %add = add <4 x i32> %l, %r
@@ -387,15 +585,30 @@ define <4 x i32> @phaddd_single_source6(<4 x i32> %x) {
 }
 
 define <8 x i16> @phaddw_single_source1(<8 x i16> %x) {
-; SSSE3-LABEL: phaddw_single_source1:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    phaddw %xmm0, %xmm0
-; SSSE3-NEXT:    retq
+; SSSE3-SLOW-LABEL: phaddw_single_source1:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-SLOW-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13]
+; SSSE3-SLOW-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15]
+; SSSE3-SLOW-NEXT:    paddw %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: phaddw_single_source1:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSSE3-FAST-LABEL: phaddw_single_source1:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: phaddw_single_source1:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13]
+; AVX-SLOW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15]
+; AVX-SLOW-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: phaddw_single_source1:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 4, i32 6>
   %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7>
   %add = add <8 x i16> %l, %r
@@ -403,19 +616,41 @@ define <8 x i16> @phaddw_single_source1(<8 x i16> %x) {
 }
 
 define <8 x i16> @phaddw_single_source2(<8 x i16> %x) {
-; SSSE3-LABEL: phaddw_single_source2:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    phaddw %xmm0, %xmm0
-; SSSE3-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; SSSE3-NEXT:    retq
+; SSSE3-SLOW-LABEL: phaddw_single_source2:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
+; SSSE3-SLOW-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; SSSE3-SLOW-NEXT:    paddw %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; SSSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: phaddw_single_source2:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; AVX-NEXT:    retq
+; SSSE3-FAST-LABEL: phaddw_single_source2:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
+; SSSE3-FAST-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
+; SSSE3-FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: phaddw_single_source2:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
+; AVX-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; AVX-SLOW-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: phaddw_single_source2:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
+; AVX-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; AVX-FAST-NEXT:    retq
   %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 4, i32 6>
   %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7>
   %add = add <8 x i16> %l, %r
@@ -424,15 +659,33 @@ define <8 x i16> @phaddw_single_source2(<8 x i16> %x) {
 }
 
 define <8 x i16> @phaddw_single_source3(<8 x i16> %x) {
-; SSSE3-LABEL: phaddw_single_source3:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    phaddw %xmm0, %xmm0
-; SSSE3-NEXT:    retq
+; SSSE3-SLOW-LABEL: phaddw_single_source3:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
+; SSSE3-SLOW-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; SSSE3-SLOW-NEXT:    paddw %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: phaddw_single_source3:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSSE3-FAST-LABEL: phaddw_single_source3:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: phaddw_single_source3:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
+; AVX-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; AVX-SLOW-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: phaddw_single_source3:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 undef, i32 undef>
   %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 undef, i32 undef>
   %add = add <8 x i16> %l, %r
@@ -440,32 +693,63 @@ define <8 x i16> @phaddw_single_source3(<8 x i16> %x) {
 }
 
 define <8 x i16> @phaddw_single_source4(<8 x i16> %x) {
-; SSSE3-LABEL: phaddw_single_source4:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    phaddw %xmm0, %xmm0
-; SSSE3-NEXT:    retq
+; SSSE3-SLOW-LABEL: phaddw_single_source4:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-SLOW-NEXT:    pslld $16, %xmm1
+; SSSE3-SLOW-NEXT:    paddw %xmm0, %xmm1
+; SSSE3-SLOW-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: phaddw_single_source4:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSSE3-FAST-LABEL: phaddw_single_source4:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: phaddw_single_source4:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpslld $16, %xmm0, %xmm1
+; AVX-SLOW-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: phaddw_single_source4:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 6>
   %add = add <8 x i16> %l, %x
   ret <8 x i16> %add
 }
 
 define <8 x i16> @phaddw_single_source6(<8 x i16> %x) {
-; SSSE3-LABEL: phaddw_single_source6:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    phaddw %xmm0, %xmm0
-; SSSE3-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT:    retq
+; SSSE3-SLOW-LABEL: phaddw_single_source6:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; SSSE3-SLOW-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
+; SSSE3-SLOW-NEXT:    paddw %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: phaddw_single_source6:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX-NEXT:    retq
+; SSSE3-FAST-LABEL: phaddw_single_source6:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
+; SSSE3-FAST-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: phaddw_single_source6:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; AVX-SLOW-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX-SLOW-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: phaddw_single_source6:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX-FAST-NEXT:    retq
   %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 undef, i32 undef, i32 undef>
   %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef>
   %add = add <8 x i16> %l, %r
diff --git a/test/CodeGen/X86/required-vector-width.ll b/test/CodeGen/X86/required-vector-width.ll
index 368c8acd4f8..6693e3c67a5 100644
--- a/test/CodeGen/X86/required-vector-width.ll
+++ b/test/CodeGen/X86/required-vector-width.ll
@@ -190,7 +190,8 @@ define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly
 ; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    vmovd %xmm0, %eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -319,7 +320,8 @@ define i32 @sad_16i8_256() "required-vector-width"="256" {
 ; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    vmovd %xmm0, %eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
diff --git a/test/CodeGen/X86/sad.ll b/test/CodeGen/X86/sad.ll
index 314b0c74f9f..d7d1511d19d 100644
--- a/test/CodeGen/X86/sad.ll
+++ b/test/CodeGen/X86/sad.ll
@@ -56,7 +56,8 @@ define i32 @sad_16i8() nounwind {
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -80,7 +81,8 @@ define i32 @sad_16i8() nounwind {
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -152,16 +154,16 @@ define i32 @sad_32i8() nounwind {
 ; SSE2-NEXT:    pxor %xmm12, %xmm12
 ; SSE2-NEXT:    movq $-1024, %rax # imm = 0xFC00
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm6, %xmm6
 ; SSE2-NEXT:    pxor %xmm13, %xmm13
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm15, %xmm15
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm14, %xmm14
 ; SSE2-NEXT:    .p2align 4, 0x90
 ; SSE2-NEXT:  .LBB1_1: # %vector.body
@@ -219,17 +221,17 @@ define i32 @sad_32i8() nounwind {
 ; SSE2-NEXT:    psrad $31, %xmm6
 ; SSE2-NEXT:    paddd %xmm6, %xmm7
 ; SSE2-NEXT:    pxor %xmm6, %xmm7
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
 ; SSE2-NEXT:    paddd %xmm7, %xmm6
-; SSE2-NEXT:    movdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    movdqa %xmm4, %xmm6
 ; SSE2-NEXT:    psrad $31, %xmm6
 ; SSE2-NEXT:    paddd %xmm6, %xmm4
 ; SSE2-NEXT:    pxor %xmm6, %xmm4
 ; SSE2-NEXT:    movdqa %xmm10, %xmm6
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
 ; SSE2-NEXT:    paddd %xmm4, %xmm7
-; SSE2-NEXT:    movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    movdqa %xmm1, %xmm4
 ; SSE2-NEXT:    psrad $31, %xmm4
 ; SSE2-NEXT:    paddd %xmm4, %xmm1
@@ -244,9 +246,9 @@ define i32 @sad_32i8() nounwind {
 ; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    paddd %xmm1, %xmm5
 ; SSE2-NEXT:    pxor %xmm1, %xmm5
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE2-NEXT:    paddd %xmm5, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    paddd %xmm1, %xmm0
@@ -256,9 +258,9 @@ define i32 @sad_32i8() nounwind {
 ; SSE2-NEXT:    psrad $31, %xmm0
 ; SSE2-NEXT:    paddd %xmm0, %xmm2
 ; SSE2-NEXT:    pxor %xmm0, %xmm2
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE2-NEXT:    paddd %xmm2, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    movdqa %xmm8, %xmm0
 ; SSE2-NEXT:    psrad $31, %xmm0
 ; SSE2-NEXT:    paddd %xmm0, %xmm8
@@ -267,13 +269,13 @@ define i32 @sad_32i8() nounwind {
 ; SSE2-NEXT:    addq $4, %rax
 ; SSE2-NEXT:    jne .LBB1_1
 ; SSE2-NEXT:  # %bb.2: # %middle.block
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE2-NEXT:    paddd %xmm15, %xmm0
 ; SSE2-NEXT:    paddd %xmm14, %xmm13
 ; SSE2-NEXT:    paddd %xmm0, %xmm13
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
-; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Folded Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE2-NEXT:    paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
 ; SSE2-NEXT:    paddd %xmm13, %xmm6
 ; SSE2-NEXT:    paddd %xmm0, %xmm6
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[2,3,0,1]
@@ -317,7 +319,8 @@ define i32 @sad_32i8() nounwind {
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -343,7 +346,8 @@ define i32 @sad_32i8() nounwind {
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -420,42 +424,42 @@ define i32 @sad_avx64i8() nounwind {
 ; SSE2-NEXT:    pxor %xmm14, %xmm14
 ; SSE2-NEXT:    movq $-1024, %rax # imm = 0xFC00
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    .p2align 4, 0x90
 ; SSE2-NEXT:  .LBB2_1: # %vector.body
 ; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
 ; SSE2-NEXT:    movaps a+1040(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    movdqa a+1024(%rax), %xmm12
 ; SSE2-NEXT:    movdqa a+1056(%rax), %xmm15
 ; SSE2-NEXT:    movdqa a+1072(%rax), %xmm4
@@ -516,7 +520,7 @@ define i32 @sad_avx64i8() nounwind {
 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
 ; SSE2-NEXT:    psubd %xmm3, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
 ; SSE2-NEXT:    psubd %xmm0, %xmm15
 ; SSE2-NEXT:    movdqa %xmm7, %xmm0
@@ -524,8 +528,8 @@ define i32 @sad_avx64i8() nounwind {
 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
 ; SSE2-NEXT:    psubd %xmm3, %xmm9
-; SSE2-NEXT:    movdqa %xmm9, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE2-NEXT:    movdqa %xmm2, %xmm9
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3],xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7]
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
@@ -534,7 +538,7 @@ define i32 @sad_avx64i8() nounwind {
 ; SSE2-NEXT:    movdqa %xmm7, %xmm0
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
 ; SSE2-NEXT:    psubd %xmm0, %xmm13
-; SSE2-NEXT:    movdqa %xmm13, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    movdqa %xmm9, %xmm0
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7]
@@ -563,16 +567,16 @@ define i32 @sad_avx64i8() nounwind {
 ; SSE2-NEXT:    psrad $31, %xmm3
 ; SSE2-NEXT:    paddd %xmm3, %xmm1
 ; SSE2-NEXT:    pxor %xmm3, %xmm1
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; SSE2-NEXT:    paddd %xmm1, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    movdqa %xmm6, %xmm1
 ; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    paddd %xmm1, %xmm6
 ; SSE2-NEXT:    pxor %xmm1, %xmm6
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE2-NEXT:    paddd %xmm6, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    movdqa %xmm5, %xmm1
 ; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    paddd %xmm1, %xmm5
@@ -584,118 +588,118 @@ define i32 @sad_avx64i8() nounwind {
 ; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    paddd %xmm1, %xmm4
 ; SSE2-NEXT:    pxor %xmm1, %xmm4
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE2-NEXT:    paddd %xmm4, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    movdqa %xmm8, %xmm1
 ; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    paddd %xmm1, %xmm8
 ; SSE2-NEXT:    pxor %xmm1, %xmm8
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE2-NEXT:    paddd %xmm8, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    movdqa %xmm11, %xmm1
 ; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    paddd %xmm1, %xmm11
 ; SSE2-NEXT:    pxor %xmm1, %xmm11
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE2-NEXT:    paddd %xmm11, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE2-NEXT:    movdqa %xmm2, %xmm1
 ; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    paddd %xmm1, %xmm2
 ; SSE2-NEXT:    pxor %xmm1, %xmm2
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE2-NEXT:    paddd %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    movdqa %xmm15, %xmm1
 ; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    paddd %xmm1, %xmm15
 ; SSE2-NEXT:    pxor %xmm1, %xmm15
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE2-NEXT:    paddd %xmm15, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE2-NEXT:    movdqa %xmm2, %xmm1
 ; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    paddd %xmm1, %xmm2
 ; SSE2-NEXT:    pxor %xmm1, %xmm2
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE2-NEXT:    paddd %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    movdqa %xmm10, %xmm1
 ; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    paddd %xmm1, %xmm10
 ; SSE2-NEXT:    pxor %xmm1, %xmm10
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE2-NEXT:    paddd %xmm10, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE2-NEXT:    movdqa %xmm2, %xmm1
 ; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    paddd %xmm1, %xmm2
 ; SSE2-NEXT:    pxor %xmm1, %xmm2
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE2-NEXT:    paddd %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    movdqa %xmm12, %xmm1
 ; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    paddd %xmm1, %xmm12
 ; SSE2-NEXT:    pxor %xmm1, %xmm12
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE2-NEXT:    paddd %xmm12, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    paddd %xmm1, %xmm0
 ; SSE2-NEXT:    pxor %xmm1, %xmm0
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    movdqa %xmm9, %xmm0
 ; SSE2-NEXT:    psrad $31, %xmm0
 ; SSE2-NEXT:    paddd %xmm0, %xmm9
 ; SSE2-NEXT:    pxor %xmm0, %xmm9
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE2-NEXT:    paddd %xmm9, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    movdqa %xmm7, %xmm0
 ; SSE2-NEXT:    psrad $31, %xmm0
 ; SSE2-NEXT:    paddd %xmm0, %xmm7
 ; SSE2-NEXT:    pxor %xmm0, %xmm7
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE2-NEXT:    paddd %xmm7, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    movdqa %xmm13, %xmm1
 ; SSE2-NEXT:    movdqa %xmm13, %xmm0
 ; SSE2-NEXT:    psrad $31, %xmm0
 ; SSE2-NEXT:    paddd %xmm0, %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm1
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE2-NEXT:    paddd %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    addq $4, %rax
 ; SSE2-NEXT:    jne .LBB2_1
 ; SSE2-NEXT:  # %bb.2: # %middle.block
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
-; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE2-NEXT:    paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
 ; SSE2-NEXT:    paddd %xmm1, %xmm3
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
-; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Folded Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; SSE2-NEXT:    paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
 ; SSE2-NEXT:    paddd %xmm1, %xmm4
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; SSE2-NEXT:    paddd {{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE2-NEXT:    paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE2-NEXT:    paddd (%rsp), %xmm1 # 16-byte Folded Reload
 ; SSE2-NEXT:    paddd %xmm4, %xmm1
 ; SSE2-NEXT:    paddd %xmm2, %xmm1
@@ -737,30 +741,30 @@ define i32 @sad_avx64i8() nounwind {
 ; AVX1-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; AVX1-NEXT:    vpsubd %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; AVX1-NEXT:    vpsubd %xmm0, %xmm3, %xmm0
-; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; AVX1-NEXT:    vpsubd %xmm0, %xmm4, %xmm0
-; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; AVX1-NEXT:    vpsubd %xmm0, %xmm5, %xmm0
-; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; AVX1-NEXT:    vpsubd %xmm0, %xmm6, %xmm0
-; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; AVX1-NEXT:    vpsubd %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; AVX1-NEXT:    vpsubd %xmm5, %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; AVX1-NEXT:    vpsubd %xmm6, %xmm5, %xmm4
@@ -803,27 +807,27 @@ define i32 @sad_avx64i8() nounwind {
 ; AVX1-NEXT:    vpabsd %xmm4, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm13, %xmm2
 ; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpabsd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT:    vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; AVX1-NEXT:    vpaddd %xmm13, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm13
-; AVX1-NEXT:    vpabsd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT:    vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm1
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpabsd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
+; AVX1-NEXT:    vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; AVX1-NEXT:    vpaddd %xmm8, %xmm1, %xmm1
-; AVX1-NEXT:    vpabsd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload
+; AVX1-NEXT:    vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm8
 ; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm0
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    vpabsd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
+; AVX1-NEXT:    vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; AVX1-NEXT:    vpaddd %xmm9, %xmm1, %xmm1
-; AVX1-NEXT:    vpabsd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload
+; AVX1-NEXT:    vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm9
 ; AVX1-NEXT:    vextractf128 $1, %ymm10, %xmm0
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    vpabsd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
+; AVX1-NEXT:    vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; AVX1-NEXT:    vpaddd %xmm10, %xmm1, %xmm1
-; AVX1-NEXT:    vpabsd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload
+; AVX1-NEXT:    vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm10
 ; AVX1-NEXT:    vextractf128 $1, %ymm12, %xmm0
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
@@ -858,7 +862,8 @@ define i32 @sad_avx64i8() nounwind {
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm14, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    addq $24, %rsp
 ; AVX1-NEXT:    vzeroupper
@@ -886,10 +891,10 @@ define i32 @sad_avx64i8() nounwind {
 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT:    vmovdqu %ymm15, -{{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
 ; AVX2-NEXT:    vpsubd %ymm15, %ymm8, %ymm8
-; AVX2-NEXT:    vmovdqu %ymm8, -{{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
 ; AVX2-NEXT:    vpsubd %ymm15, %ymm9, %ymm9
 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
@@ -903,9 +908,9 @@ define i32 @sad_avx64i8() nounwind {
 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
 ; AVX2-NEXT:    vpsubd %ymm15, %ymm14, %ymm14
 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT:    vmovdqu -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Reload
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
 ; AVX2-NEXT:    vpsubd %ymm15, %ymm8, %ymm15
-; AVX2-NEXT:    vpabsd -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Folded Reload
+; AVX2-NEXT:    vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
 ; AVX2-NEXT:    vpaddd %ymm7, %ymm8, %ymm7
 ; AVX2-NEXT:    vpabsd %ymm9, %ymm8
 ; AVX2-NEXT:    vpaddd %ymm5, %ymm8, %ymm5
@@ -935,7 +940,8 @@ define i32 @sad_avx64i8() nounwind {
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -1430,7 +1436,8 @@ define i32 @sad_unroll_nonzero_initial(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    retq
 ;
@@ -1448,7 +1455,8 @@ define i32 @sad_unroll_nonzero_initial(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -1533,7 +1541,8 @@ define i32 @sad_double_reduction(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x i8>* %
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    retq
 ;
@@ -1548,7 +1557,8 @@ define i32 @sad_double_reduction(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x i8>* %
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
diff --git a/test/CodeGen/X86/vector-reduce-add.ll b/test/CodeGen/X86/vector-reduce-add.ll
index 7a5e5f34ad3..21c10c97f49 100644
--- a/test/CodeGen/X86/vector-reduce-add.ll
+++ b/test/CodeGen/X86/vector-reduce-add.ll
@@ -195,28 +195,21 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ;
 
 define i32 @test_v4i32(<4 x i32> %a0) {
-; SSE2-LABEL: test_v4i32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT:    paddd %xmm1, %xmm0
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v4i32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT:    paddd %xmm0, %xmm1
-; SSE41-NEXT:    phaddd %xmm1, %xmm1
-; SSE41-NEXT:    movd %xmm1, %eax
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    paddd %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT:    paddd %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v4i32:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    retq
 ;
@@ -224,7 +217,8 @@ define i32 @test_v4i32(<4 x i32> %a0) {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    retq
   %1 = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> %a0)
@@ -232,24 +226,15 @@ define i32 @test_v4i32(<4 x i32> %a0) {
 }
 
 define i32 @test_v8i32(<8 x i32> %a0) {
-; SSE2-LABEL: test_v8i32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    paddd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT:    paddd %xmm1, %xmm0
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v8i32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    paddd %xmm1, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT:    paddd %xmm0, %xmm1
-; SSE41-NEXT:    phaddd %xmm1, %xmm1
-; SSE41-NEXT:    movd %xmm1, %eax
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_v8i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    paddd %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    paddd %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT:    paddd %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v8i32:
 ; AVX1:       # %bb.0:
@@ -257,7 +242,8 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -268,7 +254,8 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -279,7 +266,8 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -288,28 +276,17 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 }
 
 define i32 @test_v16i32(<16 x i32> %a0) {
-; SSE2-LABEL: test_v16i32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    paddd %xmm3, %xmm1
-; SSE2-NEXT:    paddd %xmm2, %xmm1
-; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT:    paddd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v16i32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    paddd %xmm3, %xmm1
-; SSE41-NEXT:    paddd %xmm2, %xmm1
-; SSE41-NEXT:    paddd %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE41-NEXT:    paddd %xmm1, %xmm0
-; SSE41-NEXT:    phaddd %xmm0, %xmm0
-; SSE41-NEXT:    movd %xmm0, %eax
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_v16i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    paddd %xmm3, %xmm1
+; SSE-NEXT:    paddd %xmm2, %xmm1
+; SSE-NEXT:    paddd %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT:    paddd %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT:    paddd %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v16i32:
 ; AVX1:       # %bb.0:
@@ -320,7 +297,8 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -332,7 +310,8 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -355,36 +334,21 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 }
 
 define i32 @test_v32i32(<32 x i32> %a0) {
-; SSE2-LABEL: test_v32i32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    paddd %xmm6, %xmm2
-; SSE2-NEXT:    paddd %xmm7, %xmm3
-; SSE2-NEXT:    paddd %xmm5, %xmm3
-; SSE2-NEXT:    paddd %xmm1, %xmm3
-; SSE2-NEXT:    paddd %xmm4, %xmm2
-; SSE2-NEXT:    paddd %xmm3, %xmm2
-; SSE2-NEXT:    paddd %xmm0, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-NEXT:    paddd %xmm2, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v32i32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    paddd %xmm6, %xmm2
-; SSE41-NEXT:    paddd %xmm7, %xmm3
-; SSE41-NEXT:    paddd %xmm5, %xmm3
-; SSE41-NEXT:    paddd %xmm1, %xmm3
-; SSE41-NEXT:    paddd %xmm4, %xmm2
-; SSE41-NEXT:    paddd %xmm3, %xmm2
-; SSE41-NEXT:    paddd %xmm0, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE41-NEXT:    paddd %xmm2, %xmm0
-; SSE41-NEXT:    phaddd %xmm0, %xmm0
-; SSE41-NEXT:    movd %xmm0, %eax
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_v32i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    paddd %xmm6, %xmm2
+; SSE-NEXT:    paddd %xmm7, %xmm3
+; SSE-NEXT:    paddd %xmm5, %xmm3
+; SSE-NEXT:    paddd %xmm1, %xmm3
+; SSE-NEXT:    paddd %xmm4, %xmm2
+; SSE-NEXT:    paddd %xmm3, %xmm2
+; SSE-NEXT:    paddd %xmm0, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT:    paddd %xmm2, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT:    paddd %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v32i32:
 ; AVX1:       # %bb.0:
@@ -401,7 +365,8 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -415,7 +380,8 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -443,29 +409,18 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ;
 
 define i16 @test_v8i16(<8 x i16> %a0) {
-; SSE2-LABEL: test_v8i16:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT:    paddw %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT:    paddw %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrld $16, %xmm1
-; SSE2-NEXT:    paddw %xmm0, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
-; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v8i16:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT:    paddw %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE41-NEXT:    paddw %xmm1, %xmm0
-; SSE41-NEXT:    phaddw %xmm0, %xmm0
-; SSE41-NEXT:    movd %xmm0, %eax
-; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    paddw %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT:    paddw %xmm1, %xmm0
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrld $16, %xmm1
+; SSE-NEXT:    paddw %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v8i16:
 ; AVX:       # %bb.0:
@@ -473,7 +428,8 @@ define i16 @test_v8i16(<8 x i16> %a0) {
 ; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX-NEXT:    retq
@@ -484,7 +440,8 @@ define i16 @test_v8i16(<8 x i16> %a0) {
 ; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    retq
@@ -493,31 +450,19 @@ define i16 @test_v8i16(<8 x i16> %a0) {
 }
 
 define i16 @test_v16i16(<16 x i16> %a0) {
-; SSE2-LABEL: test_v16i16:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    paddw %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT:    paddw %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT:    paddw %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrld $16, %xmm1
-; SSE2-NEXT:    paddw %xmm0, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
-; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v16i16:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    paddw %xmm1, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT:    paddw %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE41-NEXT:    paddw %xmm1, %xmm0
-; SSE41-NEXT:    phaddw %xmm0, %xmm0
-; SSE41-NEXT:    movd %xmm0, %eax
-; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_v16i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    paddw %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    paddw %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT:    paddw %xmm1, %xmm0
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrld $16, %xmm1
+; SSE-NEXT:    paddw %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v16i16:
 ; AVX1:       # %bb.0:
@@ -527,7 +472,8 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -541,7 +487,8 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vphaddw %ymm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -555,7 +502,8 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX512-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vphaddw %ymm0, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -565,35 +513,21 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 }
 
 define i16 @test_v32i16(<32 x i16> %a0) {
-; SSE2-LABEL: test_v32i16:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    paddw %xmm3, %xmm1
-; SSE2-NEXT:    paddw %xmm2, %xmm1
-; SSE2-NEXT:    paddw %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT:    paddw %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT:    paddw %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrld $16, %xmm0
-; SSE2-NEXT:    paddw %xmm1, %xmm0
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v32i16:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    paddw %xmm3, %xmm1
-; SSE41-NEXT:    paddw %xmm2, %xmm1
-; SSE41-NEXT:    paddw %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE41-NEXT:    paddw %xmm1, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT:    paddw %xmm0, %xmm1
-; SSE41-NEXT:    phaddw %xmm1, %xmm1
-; SSE41-NEXT:    movd %xmm1, %eax
-; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_v32i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    paddw %xmm3, %xmm1
+; SSE-NEXT:    paddw %xmm2, %xmm1
+; SSE-NEXT:    paddw %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT:    paddw %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT:    paddw %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    psrld $16, %xmm0
+; SSE-NEXT:    paddw %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v32i16:
 ; AVX1:       # %bb.0:
@@ -606,7 +540,8 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -621,7 +556,8 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vphaddw %ymm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -648,43 +584,25 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 }
 
 define i16 @test_v64i16(<64 x i16> %a0) {
-; SSE2-LABEL: test_v64i16:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    paddw %xmm6, %xmm2
-; SSE2-NEXT:    paddw %xmm7, %xmm3
-; SSE2-NEXT:    paddw %xmm5, %xmm3
-; SSE2-NEXT:    paddw %xmm1, %xmm3
-; SSE2-NEXT:    paddw %xmm4, %xmm2
-; SSE2-NEXT:    paddw %xmm3, %xmm2
-; SSE2-NEXT:    paddw %xmm0, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-NEXT:    paddw %xmm2, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT:    paddw %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrld $16, %xmm0
-; SSE2-NEXT:    paddw %xmm1, %xmm0
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v64i16:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    paddw %xmm6, %xmm2
-; SSE41-NEXT:    paddw %xmm7, %xmm3
-; SSE41-NEXT:    paddw %xmm5, %xmm3
-; SSE41-NEXT:    paddw %xmm1, %xmm3
-; SSE41-NEXT:    paddw %xmm4, %xmm2
-; SSE41-NEXT:    paddw %xmm3, %xmm2
-; SSE41-NEXT:    paddw %xmm0, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE41-NEXT:    paddw %xmm2, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT:    paddw %xmm0, %xmm1
-; SSE41-NEXT:    phaddw %xmm1, %xmm1
-; SSE41-NEXT:    movd %xmm1, %eax
-; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_v64i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    paddw %xmm6, %xmm2
+; SSE-NEXT:    paddw %xmm7, %xmm3
+; SSE-NEXT:    paddw %xmm5, %xmm3
+; SSE-NEXT:    paddw %xmm1, %xmm3
+; SSE-NEXT:    paddw %xmm4, %xmm2
+; SSE-NEXT:    paddw %xmm3, %xmm2
+; SSE-NEXT:    paddw %xmm0, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT:    paddw %xmm2, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT:    paddw %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    psrld $16, %xmm0
+; SSE-NEXT:    paddw %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v64i16:
 ; AVX1:       # %bb.0:
@@ -703,7 +621,8 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -720,7 +639,8 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vphaddw %ymm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/vector-reduce-fadd-fast.ll b/test/CodeGen/X86/vector-reduce-fadd-fast.ll
index b17734b83e7..281c4f28d99 100644
--- a/test/CodeGen/X86/vector-reduce-fadd-fast.ll
+++ b/test/CodeGen/X86/vector-reduce-fadd-fast.ll
@@ -20,18 +20,20 @@ define float @test_v2f32(float %a0, <2 x float> %a1) {
 ;
 ; SSE41-LABEL: test_v2f32:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movaps %xmm1, %xmm0
-; SSE41-NEXT:    haddps %xmm1, %xmm0
+; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    addps %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: test_v2f32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddps %xmm1, %xmm1, %xmm0
+; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v2f32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vhaddps %xmm1, %xmm1, %xmm0
+; AVX512-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; AVX512-NEXT:    vaddps %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    retq
   %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float %a0, <2 x float> %a1)
   ret float %1
@@ -50,24 +52,27 @@ define float @test_v4f32(float %a0, <4 x float> %a1) {
 ;
 ; SSE41-LABEL: test_v4f32:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movaps %xmm1, %xmm0
-; SSE41-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE41-NEXT:    addps %xmm1, %xmm0
-; SSE41-NEXT:    haddps %xmm0, %xmm0
+; SSE41-NEXT:    movaps %xmm1, %xmm2
+; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE41-NEXT:    addps %xmm1, %xmm2
+; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE41-NEXT:    addps %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: test_v4f32:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
 ; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v4f32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
 ; AVX512-NEXT:    vaddps %xmm0, %xmm1, %xmm0
-; AVX512-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
   %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float %a0, <4 x float> %a1)
   ret float %1
@@ -88,10 +93,11 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {
 ; SSE41-LABEL: test_v8f32:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    addps %xmm2, %xmm1
-; SSE41-NEXT:    movaps %xmm1, %xmm0
-; SSE41-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE41-NEXT:    addps %xmm1, %xmm0
-; SSE41-NEXT:    haddps %xmm0, %xmm0
+; SSE41-NEXT:    movaps %xmm1, %xmm2
+; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE41-NEXT:    addps %xmm1, %xmm2
+; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE41-NEXT:    addps %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: test_v8f32:
@@ -100,7 +106,8 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {
 ; AVX-NEXT:    vaddps %ymm0, %ymm1, %ymm0
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
+; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
@@ -111,7 +118,8 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {
 ; AVX512-NEXT:    vaddps %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vaddps %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
+; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -138,10 +146,11 @@ define float @test_v16f32(float %a0, <16 x float> %a1) {
 ; SSE41-NEXT:    addps %xmm4, %xmm2
 ; SSE41-NEXT:    addps %xmm3, %xmm1
 ; SSE41-NEXT:    addps %xmm2, %xmm1
-; SSE41-NEXT:    movaps %xmm1, %xmm0
-; SSE41-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE41-NEXT:    addps %xmm1, %xmm0
-; SSE41-NEXT:    haddps %xmm0, %xmm0
+; SSE41-NEXT:    movaps %xmm1, %xmm2
+; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE41-NEXT:    addps %xmm1, %xmm2
+; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE41-NEXT:    addps %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: test_v16f32:
@@ -151,7 +160,8 @@ define float @test_v16f32(float %a0, <16 x float> %a1) {
 ; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
+; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
@@ -188,17 +198,20 @@ define float @test_v2f32_zero(<2 x float> %a0) {
 ;
 ; SSE41-LABEL: test_v2f32_zero:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    haddps %xmm0, %xmm0
+; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE41-NEXT:    addps %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: test_v2f32_zero:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v2f32_zero:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
   %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float 0.0, <2 x float> %a0)
   ret float %1
@@ -220,7 +233,8 @@ define float @test_v4f32_zero(<4 x float> %a0) {
 ; SSE41-NEXT:    movaps %xmm0, %xmm1
 ; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 ; SSE41-NEXT:    addps %xmm0, %xmm1
-; SSE41-NEXT:    haddps %xmm1, %xmm1
+; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    addps %xmm0, %xmm1
 ; SSE41-NEXT:    movaps %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -228,14 +242,16 @@ define float @test_v4f32_zero(<4 x float> %a0) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v4f32_zero:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
   %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float 0.0, <4 x float> %a0)
   ret float %1
@@ -259,7 +275,8 @@ define float @test_v8f32_zero(<8 x float> %a0) {
 ; SSE41-NEXT:    movaps %xmm0, %xmm1
 ; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 ; SSE41-NEXT:    addps %xmm0, %xmm1
-; SSE41-NEXT:    haddps %xmm1, %xmm1
+; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    addps %xmm0, %xmm1
 ; SSE41-NEXT:    movaps %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -269,7 +286,8 @@ define float @test_v8f32_zero(<8 x float> %a0) {
 ; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
+; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
@@ -280,7 +298,8 @@ define float @test_v8f32_zero(<8 x float> %a0) {
 ; AVX512-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vaddps %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
+; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -310,7 +329,8 @@ define float @test_v16f32_zero(<16 x float> %a0) {
 ; SSE41-NEXT:    movaps %xmm0, %xmm1
 ; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 ; SSE41-NEXT:    addps %xmm0, %xmm1
-; SSE41-NEXT:    haddps %xmm1, %xmm1
+; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    addps %xmm0, %xmm1
 ; SSE41-NEXT:    movaps %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -321,7 +341,8 @@ define float @test_v16f32_zero(<16 x float> %a0) {
 ; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
+; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
@@ -358,17 +379,20 @@ define float @test_v2f32_undef(<2 x float> %a0) {
 ;
 ; SSE41-LABEL: test_v2f32_undef:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    haddps %xmm0, %xmm0
+; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE41-NEXT:    addps %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: test_v2f32_undef:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v2f32_undef:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
   %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float undef, <2 x float> %a0)
   ret float %1
@@ -390,7 +414,8 @@ define float @test_v4f32_undef(<4 x float> %a0) {
 ; SSE41-NEXT:    movaps %xmm0, %xmm1
 ; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 ; SSE41-NEXT:    addps %xmm0, %xmm1
-; SSE41-NEXT:    haddps %xmm1, %xmm1
+; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    addps %xmm0, %xmm1
 ; SSE41-NEXT:    movaps %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -398,14 +423,16 @@ define float @test_v4f32_undef(<4 x float> %a0) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v4f32_undef:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
   %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float undef, <4 x float> %a0)
   ret float %1
@@ -429,7 +456,8 @@ define float @test_v8f32_undef(<8 x float> %a0) {
 ; SSE41-NEXT:    movaps %xmm0, %xmm1
 ; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 ; SSE41-NEXT:    addps %xmm0, %xmm1
-; SSE41-NEXT:    haddps %xmm1, %xmm1
+; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    addps %xmm0, %xmm1
 ; SSE41-NEXT:    movaps %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -439,7 +467,8 @@ define float @test_v8f32_undef(<8 x float> %a0) {
 ; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
+; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
@@ -450,7 +479,8 @@ define float @test_v8f32_undef(<8 x float> %a0) {
 ; AVX512-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vaddps %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
+; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -480,7 +510,8 @@ define float @test_v16f32_undef(<16 x float> %a0) {
 ; SSE41-NEXT:    movaps %xmm0, %xmm1
 ; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 ; SSE41-NEXT:    addps %xmm0, %xmm1
-; SSE41-NEXT:    haddps %xmm1, %xmm1
+; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    addps %xmm0, %xmm1
 ; SSE41-NEXT:    movaps %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -491,7 +522,8 @@ define float @test_v16f32_undef(<16 x float> %a0) {
 ; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
+; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
@@ -518,53 +550,43 @@ define float @test_v16f32_undef(<16 x float> %a0) {
 ;
 
 define double @test_v2f64(double %a0, <2 x double> %a1) {
-; SSE2-LABEL: test_v2f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movapd %xmm1, %xmm0
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE2-NEXT:    addpd %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v2f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movapd %xmm1, %xmm0
-; SSE41-NEXT:    haddpd %xmm1, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_v2f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT:    addpd %xmm1, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v2f64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddpd %xmm1, %xmm1, %xmm0
+; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
+; AVX-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v2f64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vhaddpd %xmm1, %xmm1, %xmm0
+; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
+; AVX512-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    retq
   %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double %a0, <2 x double> %a1)
   ret double %1
 }
 
 define double @test_v4f64(double %a0, <4 x double> %a1) {
-; SSE2-LABEL: test_v4f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    addpd %xmm2, %xmm1
-; SSE2-NEXT:    movapd %xmm1, %xmm0
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE2-NEXT:    addpd %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v4f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movapd %xmm1, %xmm0
-; SSE41-NEXT:    addpd %xmm2, %xmm0
-; SSE41-NEXT:    haddpd %xmm0, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_v4f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    addpd %xmm2, %xmm1
+; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT:    addpd %xmm1, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v4f64:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm0
 ; AVX-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
-; AVX-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
+; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
@@ -573,7 +595,8 @@ define double @test_v4f64(double %a0, <4 x double> %a1) {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm0
 ; AVX512-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
-; AVX512-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -582,31 +605,23 @@ define double @test_v4f64(double %a0, <4 x double> %a1) {
 }
 
 define double @test_v8f64(double %a0, <8 x double> %a1) {
-; SSE2-LABEL: test_v8f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    addpd %xmm4, %xmm2
-; SSE2-NEXT:    addpd %xmm3, %xmm1
-; SSE2-NEXT:    addpd %xmm2, %xmm1
-; SSE2-NEXT:    movapd %xmm1, %xmm0
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE2-NEXT:    addpd %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v8f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movapd %xmm1, %xmm0
-; SSE41-NEXT:    addpd %xmm4, %xmm2
-; SSE41-NEXT:    addpd %xmm3, %xmm0
-; SSE41-NEXT:    addpd %xmm2, %xmm0
-; SSE41-NEXT:    haddpd %xmm0, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_v8f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    addpd %xmm4, %xmm2
+; SSE-NEXT:    addpd %xmm3, %xmm1
+; SSE-NEXT:    addpd %xmm2, %xmm1
+; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT:    addpd %xmm1, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v8f64:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vaddpd %ymm2, %ymm1, %ymm0
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
+; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
@@ -627,32 +642,19 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {
 }
 
 define double @test_v16f64(double %a0, <16 x double> %a1) {
-; SSE2-LABEL: test_v16f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    addpd %xmm6, %xmm2
-; SSE2-NEXT:    addpd %xmm7, %xmm3
-; SSE2-NEXT:    addpd %xmm5, %xmm1
-; SSE2-NEXT:    addpd %xmm3, %xmm1
-; SSE2-NEXT:    addpd {{[0-9]+}}(%rsp), %xmm4
-; SSE2-NEXT:    addpd %xmm2, %xmm4
-; SSE2-NEXT:    addpd %xmm1, %xmm4
-; SSE2-NEXT:    movapd %xmm4, %xmm0
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1]
-; SSE2-NEXT:    addpd %xmm4, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v16f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movapd %xmm4, %xmm0
-; SSE41-NEXT:    addpd %xmm6, %xmm2
-; SSE41-NEXT:    addpd %xmm7, %xmm3
-; SSE41-NEXT:    addpd %xmm5, %xmm1
-; SSE41-NEXT:    addpd %xmm3, %xmm1
-; SSE41-NEXT:    addpd {{[0-9]+}}(%rsp), %xmm0
-; SSE41-NEXT:    addpd %xmm2, %xmm0
-; SSE41-NEXT:    addpd %xmm1, %xmm0
-; SSE41-NEXT:    haddpd %xmm0, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_v16f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    addpd %xmm6, %xmm2
+; SSE-NEXT:    addpd %xmm7, %xmm3
+; SSE-NEXT:    addpd %xmm5, %xmm1
+; SSE-NEXT:    addpd %xmm3, %xmm1
+; SSE-NEXT:    addpd {{[0-9]+}}(%rsp), %xmm4
+; SSE-NEXT:    addpd %xmm2, %xmm4
+; SSE-NEXT:    addpd %xmm1, %xmm4
+; SSE-NEXT:    movapd %xmm4, %xmm0
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1]
+; SSE-NEXT:    addpd %xmm4, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v16f64:
 ; AVX:       # %bb.0:
@@ -661,7 +663,8 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {
 ; AVX-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
+; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
@@ -687,53 +690,45 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {
 ;
 
 define double @test_v2f64_zero(<2 x double> %a0) {
-; SSE2-LABEL: test_v2f64_zero:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movapd %xmm0, %xmm1
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT:    addpd %xmm0, %xmm1
-; SSE2-NEXT:    movapd %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v2f64_zero:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    haddpd %xmm0, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_v2f64_zero:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movapd %xmm0, %xmm1
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT:    addpd %xmm0, %xmm1
+; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v2f64_zero:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v2f64_zero:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
   %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double 0.0, <2 x double> %a0)
   ret double %1
 }
 
 define double @test_v4f64_zero(<4 x double> %a0) {
-; SSE2-LABEL: test_v4f64_zero:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    addpd %xmm1, %xmm0
-; SSE2-NEXT:    movapd %xmm0, %xmm1
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT:    addpd %xmm0, %xmm1
-; SSE2-NEXT:    movapd %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v4f64_zero:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    addpd %xmm1, %xmm0
-; SSE41-NEXT:    haddpd %xmm0, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_v4f64_zero:
+; SSE:       # %bb.0:
+; SSE-NEXT:    addpd %xmm1, %xmm0
+; SSE-NEXT:    movapd %xmm0, %xmm1
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT:    addpd %xmm0, %xmm1
+; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v4f64_zero:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
+; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
@@ -742,7 +737,8 @@ define double @test_v4f64_zero(<4 x double> %a0) {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -751,31 +747,24 @@ define double @test_v4f64_zero(<4 x double> %a0) {
 }
 
 define double @test_v8f64_zero(<8 x double> %a0) {
-; SSE2-LABEL: test_v8f64_zero:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    addpd %xmm3, %xmm1
-; SSE2-NEXT:    addpd %xmm2, %xmm0
-; SSE2-NEXT:    addpd %xmm1, %xmm0
-; SSE2-NEXT:    movapd %xmm0, %xmm1
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT:    addpd %xmm0, %xmm1
-; SSE2-NEXT:    movapd %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v8f64_zero:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    addpd %xmm3, %xmm1
-; SSE41-NEXT:    addpd %xmm2, %xmm0
-; SSE41-NEXT:    addpd %xmm1, %xmm0
-; SSE41-NEXT:    haddpd %xmm0, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_v8f64_zero:
+; SSE:       # %bb.0:
+; SSE-NEXT:    addpd %xmm3, %xmm1
+; SSE-NEXT:    addpd %xmm2, %xmm0
+; SSE-NEXT:    addpd %xmm1, %xmm0
+; SSE-NEXT:    movapd %xmm0, %xmm1
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT:    addpd %xmm0, %xmm1
+; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v8f64_zero:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
+; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
@@ -796,32 +785,19 @@ define double @test_v8f64_zero(<8 x double> %a0) {
 }
 
 define double @test_v16f64_zero(<16 x double> %a0) {
-; SSE2-LABEL: test_v16f64_zero:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    addpd %xmm6, %xmm2
-; SSE2-NEXT:    addpd %xmm4, %xmm0
-; SSE2-NEXT:    addpd %xmm2, %xmm0
-; SSE2-NEXT:    addpd %xmm7, %xmm3
-; SSE2-NEXT:    addpd %xmm5, %xmm1
-; SSE2-NEXT:    addpd %xmm3, %xmm1
-; SSE2-NEXT:    addpd %xmm0, %xmm1
-; SSE2-NEXT:    movapd %xmm1, %xmm0
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE2-NEXT:    addpd %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v16f64_zero:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    addpd %xmm6, %xmm2
-; SSE41-NEXT:    addpd %xmm4, %xmm0
-; SSE41-NEXT:    addpd %xmm2, %xmm0
-; SSE41-NEXT:    addpd %xmm7, %xmm3
-; SSE41-NEXT:    addpd %xmm5, %xmm1
-; SSE41-NEXT:    addpd %xmm3, %xmm1
-; SSE41-NEXT:    addpd %xmm0, %xmm1
-; SSE41-NEXT:    haddpd %xmm1, %xmm1
-; SSE41-NEXT:    movapd %xmm1, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_v16f64_zero:
+; SSE:       # %bb.0:
+; SSE-NEXT:    addpd %xmm6, %xmm2
+; SSE-NEXT:    addpd %xmm4, %xmm0
+; SSE-NEXT:    addpd %xmm2, %xmm0
+; SSE-NEXT:    addpd %xmm7, %xmm3
+; SSE-NEXT:    addpd %xmm5, %xmm1
+; SSE-NEXT:    addpd %xmm3, %xmm1
+; SSE-NEXT:    addpd %xmm0, %xmm1
+; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT:    addpd %xmm1, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v16f64_zero:
 ; AVX:       # %bb.0:
@@ -830,7 +806,8 @@ define double @test_v16f64_zero(<16 x double> %a0) {
 ; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
+; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
@@ -856,53 +833,45 @@ define double @test_v16f64_zero(<16 x double> %a0) {
 ;
 
 define double @test_v2f64_undef(<2 x double> %a0) {
-; SSE2-LABEL: test_v2f64_undef:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movapd %xmm0, %xmm1
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT:    addpd %xmm0, %xmm1
-; SSE2-NEXT:    movapd %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v2f64_undef:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    haddpd %xmm0, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_v2f64_undef:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movapd %xmm0, %xmm1
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT:    addpd %xmm0, %xmm1
+; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v2f64_undef:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v2f64_undef:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
   %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double undef, <2 x double> %a0)
   ret double %1
 }
 
 define double @test_v4f64_undef(<4 x double> %a0) {
-; SSE2-LABEL: test_v4f64_undef:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    addpd %xmm1, %xmm0
-; SSE2-NEXT:    movapd %xmm0, %xmm1
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT:    addpd %xmm0, %xmm1
-; SSE2-NEXT:    movapd %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v4f64_undef:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    addpd %xmm1, %xmm0
-; SSE41-NEXT:    haddpd %xmm0, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_v4f64_undef:
+; SSE:       # %bb.0:
+; SSE-NEXT:    addpd %xmm1, %xmm0
+; SSE-NEXT:    movapd %xmm0, %xmm1
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT:    addpd %xmm0, %xmm1
+; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v4f64_undef:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
+; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
@@ -911,7 +880,8 @@ define double @test_v4f64_undef(<4 x double> %a0) {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -920,31 +890,24 @@ define double @test_v4f64_undef(<4 x double> %a0) {
 }
 
 define double @test_v8f64_undef(<8 x double> %a0) {
-; SSE2-LABEL: test_v8f64_undef:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    addpd %xmm3, %xmm1
-; SSE2-NEXT:    addpd %xmm2, %xmm0
-; SSE2-NEXT:    addpd %xmm1, %xmm0
-; SSE2-NEXT:    movapd %xmm0, %xmm1
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT:    addpd %xmm0, %xmm1
-; SSE2-NEXT:    movapd %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v8f64_undef:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    addpd %xmm3, %xmm1
-; SSE41-NEXT:    addpd %xmm2, %xmm0
-; SSE41-NEXT:    addpd %xmm1, %xmm0
-; SSE41-NEXT:    haddpd %xmm0, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_v8f64_undef:
+; SSE:       # %bb.0:
+; SSE-NEXT:    addpd %xmm3, %xmm1
+; SSE-NEXT:    addpd %xmm2, %xmm0
+; SSE-NEXT:    addpd %xmm1, %xmm0
+; SSE-NEXT:    movapd %xmm0, %xmm1
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT:    addpd %xmm0, %xmm1
+; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v8f64_undef:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
+; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
@@ -965,32 +928,19 @@ define double @test_v8f64_undef(<8 x double> %a0) {
 }
 
 define double @test_v16f64_undef(<16 x double> %a0) {
-; SSE2-LABEL: test_v16f64_undef:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    addpd %xmm6, %xmm2
-; SSE2-NEXT:    addpd %xmm4, %xmm0
-; SSE2-NEXT:    addpd %xmm2, %xmm0
-; SSE2-NEXT:    addpd %xmm7, %xmm3
-; SSE2-NEXT:    addpd %xmm5, %xmm1
-; SSE2-NEXT:    addpd %xmm3, %xmm1
-; SSE2-NEXT:    addpd %xmm0, %xmm1
-; SSE2-NEXT:    movapd %xmm1, %xmm0
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE2-NEXT:    addpd %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v16f64_undef:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    addpd %xmm6, %xmm2
-; SSE41-NEXT:    addpd %xmm4, %xmm0
-; SSE41-NEXT:    addpd %xmm2, %xmm0
-; SSE41-NEXT:    addpd %xmm7, %xmm3
-; SSE41-NEXT:    addpd %xmm5, %xmm1
-; SSE41-NEXT:    addpd %xmm3, %xmm1
-; SSE41-NEXT:    addpd %xmm0, %xmm1
-; SSE41-NEXT:    haddpd %xmm1, %xmm1
-; SSE41-NEXT:    movapd %xmm1, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_v16f64_undef:
+; SSE:       # %bb.0:
+; SSE-NEXT:    addpd %xmm6, %xmm2
+; SSE-NEXT:    addpd %xmm4, %xmm0
+; SSE-NEXT:    addpd %xmm2, %xmm0
+; SSE-NEXT:    addpd %xmm7, %xmm3
+; SSE-NEXT:    addpd %xmm5, %xmm1
+; SSE-NEXT:    addpd %xmm3, %xmm1
+; SSE-NEXT:    addpd %xmm0, %xmm1
+; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT:    addpd %xmm1, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v16f64_undef:
 ; AVX:       # %bb.0:
@@ -999,7 +949,8 @@ define double @test_v16f64_undef(<16 x double> %a0) {
 ; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
+; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
diff --git a/test/CodeGen/X86/vector-shuffle-combining.ll b/test/CodeGen/X86/vector-shuffle-combining.ll
index 5c0a223d496..2eb9362947e 100644
--- a/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -2700,36 +2700,21 @@ define <4 x i32> @combine_constant_insertion_v4i32(i32 %f) {
 }
 
 define <4 x float> @PR22377(<4 x float> %a, <4 x float> %b) {
-; SSE2-LABEL: PR22377:
-; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    movaps %xmm0, %xmm1
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[1,3]
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
-; SSE2-NEXT:    addps %xmm0, %xmm1
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT:    retq
-;
-; SSSE3-LABEL: PR22377:
-; SSSE3:       # %bb.0: # %entry
-; SSSE3-NEXT:    movaps %xmm0, %xmm1
-; SSSE3-NEXT:    haddps %xmm0, %xmm1
-; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: PR22377:
-; SSE41:       # %bb.0: # %entry
-; SSE41-NEXT:    movaps %xmm0, %xmm1
-; SSE41-NEXT:    haddps %xmm0, %xmm1
-; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
-; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: PR22377:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    movaps %xmm0, %xmm1
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[1,3]
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
+; SSE-NEXT:    addps %xmm0, %xmm1
+; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: PR22377:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm1
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
-; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,3,1,3]
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2]
+; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm1
+; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX-NEXT:    retq
 entry:
   %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 1, i32 3>
-- 
GitLab


From f0303e4307941c869627facd190e9bab1977baf0 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Fri, 12 Oct 2018 17:01:46 +0000
Subject: [PATCH 0113/1116] [BPF] Add BTF generation for BPF target

BTF is the debug format for BPF, a kernel virtual machine
and widely used for tracing, networking and security, etc ([1]).

Currently only instruction streams are passed to kernel,
the kernel verifier verifies them before execution. In order to
provide better visibility of bpf programs to user space
tools, some debug information, e.g., function names and
debug line information are desirable for kernel so tools
can get such information with better annotation
for jited instructions for performance or other reasons.

The dwarf is too complicated in kernel and for BPF.
Hence, BTF is designed to be the debug format for BPF ([2]).
Right now, pahole supports BTF for types, which
are generated based on dwarf sections in the ELF file.

In order to annotate performance metrics for jited bpf insns,
it is necessary to pass debug line info to the kernel.
Furthermore, we want to pass the actual code to the
kernel because of the following reasons:

. bpf program typically is small so storage overhead
  should be small.
. in bpf land, it is totally possible that
  an application loads the bpf program into the
  kernel and then that application quits, so
  holding debug info by the user space application
  is not practical.
. having source codes directly kept by kernel
  would ease deployment since the original source
  code does not need ship on every hosts and
  kernel-devel package does not need to be
  deployed even if kernel headers are used.

The only reliable time to get the source code is
during compilation time. This will result in both more
accurate information and easier deployment as
stated in the above.

Another consideration is for JIT. The project like bcc
use MCJIT to compile a C program into bpf insns and
load them to the kernel ([3]). The generated BTF sections
will be readily available for such cases as well.

This patch implemented generation of BTF info in llvm
compiler. The BTF related sections will be generated
when both -target bpf and -g are specified. Two sections
are generated:
  .BTF contains all the type and string information, and
  .BTF.ext contains the func_info and line_info.

The separation is related to how two sections are used
differently in bpf loader, e.g., linux libbpf ([4]).
The .BTF section can be loaded into the kernel directly
while .BTF.ext needs loader manipulation before loading
to the kernel. The format of the each section is roughly
defined in llvm:include/llvm/MC/MCBTFContext.h and
from the implementation in llvm:lib/MC/MCBTFContext.cpp.
A later example also shows the contents in each section.

The type and func_info are gathered during CodeGen/AsmPrinter
by traversing dwarf debug_info. The line_info is
gathered in MCObjectStreamer before writing to
the object file. After all the information is gathered,
the two sections are emitted in MCObjectStreamer::finishImpl.

With cmake CMAKE_BUILD_TYPE=Debug, the compiler can
dump out all the tables except insn offset, which
will be resolved later as relocation records.
The debug type "btf" is used for BTFContext dump.

Dwarf tests the debug info generation with
llvm-dwarfdump to decode the binary sections and
check whether the result is expected. Currently
we do not have such a tool yet. We will implement
btf dump functionality in bpftool ([5]) as the bpftool is
considered the recommended tool for bpf introspection.
The implementation for type and func_info is tested
with linux kernel test cases. The line_info is visually
checked with dump from linux kernel libbpf ([4]) and
checked with readelf dumping section raw data.

Note that the .BTF and .BTF.ext information will not
be emitted to assembly code and there is no assembler
support for BTF either.

In the below, with a clang/llvm built with CMAKE_BUILD_TYPE=Debug,
Each table contents are shown for a simple C program.

  -bash-4.2$ cat -n test.c
     1  struct A {
     2    int a;
     3    char b;
     4  };
     5
     6  int test(struct A *t) {
     7    return t->a;
     8  }
  -bash-4.2$ clang -O2 -target bpf -g -mllvm -debug-only=btf -c test.c
  Type Table:
  [1] FUNC name_off=1 info=0x0c000001 size/type=2
        param_type=3
  [2] INT name_off=12 info=0x01000000 size/type=4
        desc=0x01000020
  [3] PTR name_off=0 info=0x02000000 size/type=4
  [4] STRUCT name_off=16 info=0x04000002 size/type=8
        name_off=18 type=2 bit_offset=0
        name_off=20 type=5 bit_offset=32
  [5] INT name_off=22 info=0x01000000 size/type=1
        desc=0x02000008

  String Table:
  0 :
  1 : test
  6 : .text
  12 : int
  16 : A
  18 : a
  20 : b
  22 : char
  27 : test.c
  34 : int test(struct A *t) {
  58 :   return t->a;

  FuncInfo Table:
  sec_name_off=6
        insn_offset=<Omitted> type_id=1

  LineInfo Table:
  sec_name_off=6
        insn_offset=<Omitted> file_name_off=27 line_off=34 line_num=6 column_num=0
        insn_offset=<Omitted> file_name_off=27 line_off=58 line_num=7 column_num=3
  -bash-4.2$ readelf -S test.o
  ......
    [12] .BTF              PROGBITS         0000000000000000  0000028d
       00000000000000c1  0000000000000000           0     0     1
    [13] .BTF.ext          PROGBITS         0000000000000000  0000034e
       0000000000000050  0000000000000000           0     0     1
    [14] .rel.BTF.ext      REL              0000000000000000  00000648
       0000000000000030  0000000000000010          16    13     8
  ......
  -bash-4.2$

The latest linux kernel ([6]) can already support .BTF with type information.
The [7] has the reference implementation in linux kernel side
to support .BTF.ext func_info. The .BTF.ext line_info support is not
implemented yet. If you have difficulty accessing [6], you can
manually do the following to access the code:

  git clone https://github.com/yonghong-song/bpf-next-linux.git
  cd bpf-next-linux
  git checkout btf

The change will push to linux kernel soon once this patch is landed.

References:
[1]. https://www.kernel.org/doc/Documentation/networking/filter.txt
[2]. https://lwn.net/Articles/750695/
[3]. https://github.com/iovisor/bcc
[4]. https://github.com/torvalds/linux/tree/master/tools/lib/bpf
[5]. https://github.com/torvalds/linux/tree/master/tools/bpf/bpftool
[6]. https://github.com/torvalds/linux
[7]. https://github.com/yonghong-song/bpf-next-linux/tree/btf

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>

Differential Revision: https://reviews.llvm.org/D52950

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344366 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/MC/MCBTFContext.h        | 364 +++++++++++++++++++
 include/llvm/MC/MCContext.h           |   7 +
 include/llvm/MC/MCObjectFileInfo.h    |   8 +
 include/llvm/MC/MCObjectStreamer.h    |   1 +
 lib/CodeGen/AsmPrinter/CMakeLists.txt |   1 +
 lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp  | 501 ++++++++++++++++++++++++++
 lib/CodeGen/AsmPrinter/Dwarf2BTF.h    | 134 +++++++
 lib/CodeGen/AsmPrinter/DwarfDebug.cpp |  10 +
 lib/CodeGen/AsmPrinter/DwarfDebug.h   |   3 +
 lib/CodeGen/AsmPrinter/DwarfFile.cpp  |  10 +
 lib/CodeGen/AsmPrinter/DwarfFile.h    |   3 +
 lib/MC/CMakeLists.txt                 |   2 +
 lib/MC/MCBTFContext.cpp               | 235 ++++++++++++
 lib/MC/MCContext.cpp                  |  11 +-
 lib/MC/MCDwarf2BTF.cpp                |  99 +++++
 lib/MC/MCDwarf2BTF.h                  |  29 ++
 lib/MC/MCObjectFileInfo.cpp           |   3 +
 lib/MC/MCObjectStreamer.cpp           |  34 ++
 18 files changed, 1454 insertions(+), 1 deletion(-)
 create mode 100644 include/llvm/MC/MCBTFContext.h
 create mode 100644 lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp
 create mode 100644 lib/CodeGen/AsmPrinter/Dwarf2BTF.h
 create mode 100644 lib/MC/MCBTFContext.cpp
 create mode 100644 lib/MC/MCDwarf2BTF.cpp
 create mode 100644 lib/MC/MCDwarf2BTF.h

diff --git a/include/llvm/MC/MCBTFContext.h b/include/llvm/MC/MCBTFContext.h
new file mode 100644
index 00000000000..f180a69340b
--- /dev/null
+++ b/include/llvm/MC/MCBTFContext.h
@@ -0,0 +1,364 @@
+//===- MCBTFContext.h ---------------------------------------- *- C++ --*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+// This header file contains two parts. The first part is the BTF ELF
+// specification in C format, and the second part is the various
+// C++ classes to manipulate the data structure in order to generate
+// the BTF related ELF sections.
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_MC_MCBTFCONTEXT_H
+#define LLVM_MC_MCBTFCONTEXT_H
+
+#include <linux/types.h>
+
+#define BTF_MAGIC	0xeB9F
+#define BTF_VERSION	1
+
+struct btf_header {
+	__u16	magic;
+	__u8	version;
+	__u8	flags;
+	__u32	hdr_len;
+
+	/* All offsets are in bytes relative to the end of this header */
+	__u32	type_off;	/* offset of type section	*/
+	__u32	type_len;	/* length of type section	*/
+	__u32	str_off;	/* offset of string section	*/
+	__u32	str_len;	/* length of string section	*/
+};
+
+/* Max # of type identifier */
+#define BTF_MAX_TYPE	0x0000ffff
+/* Max offset into the string section */
+#define BTF_MAX_NAME_OFFSET	0x0000ffff
+/* Max # of struct/union/enum members or func args */
+#define BTF_MAX_VLEN	0xffff
+
+struct btf_type {
+	__u32 name_off;
+	/* "info" bits arrangement
+	 * bits  0-15: vlen (e.g. # of struct's members)
+	 * bits 16-23: unused
+	 * bits 24-27: kind (e.g. int, ptr, array...etc)
+	 * bits 28-31: unused
+	 */
+	__u32 info;
+	/* "size" is used by INT, ENUM, STRUCT and UNION.
+	 * "size" tells the size of the type it is describing.
+	 *
+	 * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT,
+	 * FUNC and FUNC_PROTO.
+	 * "type" is a type_id referring to another type.
+	 */
+	union {
+		__u32 size;
+		__u32 type;
+	};
+};
+
+#define BTF_INFO_KIND(info)	(((info) >> 24) & 0x0f)
+#define BTF_INFO_VLEN(info)	((info) & 0xffff)
+
+#define BTF_KIND_UNKN		0	/* Unknown	*/
+#define BTF_KIND_INT		1	/* Integer	*/
+#define BTF_KIND_PTR		2	/* Pointer	*/
+#define BTF_KIND_ARRAY		3	/* Array	*/
+#define BTF_KIND_STRUCT		4	/* Struct	*/
+#define BTF_KIND_UNION		5	/* Union	*/
+#define BTF_KIND_ENUM		6	/* Enumeration	*/
+#define BTF_KIND_FWD		7	/* Forward	*/
+#define BTF_KIND_TYPEDEF	8	/* Typedef	*/
+#define BTF_KIND_VOLATILE	9	/* Volatile	*/
+#define BTF_KIND_CONST		10	/* Const	*/
+#define BTF_KIND_RESTRICT	11	/* Restrict	*/
+#define BTF_KIND_FUNC		12	/* Function	*/
+#define BTF_KIND_FUNC_PROTO	13	/* Function Prototype	*/
+#define BTF_KIND_MAX		13
+#define NR_BTF_KINDS		14
+
+/* For some specific BTF_KIND, "struct btf_type" is immediately
+ * followed by extra data.
+ */
+
+/* BTF_KIND_INT is followed by a u32 and the following
+ * is the 32 bits arrangement:
+ */
+#define BTF_INT_ENCODING(VAL)	(((VAL) & 0x0f000000) >> 24)
+#define BTF_INT_OFFSET(VAL)	(((VAL  & 0x00ff0000)) >> 16)
+#define BTF_INT_BITS(VAL)	((VAL)  & 0x000000ff)
+
+/* Attributes stored in the BTF_INT_ENCODING */
+#define BTF_INT_SIGNED	(1 << 0)
+#define BTF_INT_CHAR	(1 << 1)
+#define BTF_INT_BOOL	(1 << 2)
+
+/* BTF_KIND_ENUM is followed by multiple "struct btf_enum".
+ * The exact number of btf_enum is stored in the vlen (of the
+ * info in "struct btf_type").
+ */
+struct btf_enum {
+	__u32	name_off;
+	__s32	val;
+};
+
+/* BTF_KIND_ARRAY is followed by one "struct btf_array" */
+struct btf_array {
+	__u32	type;
+	__u32	index_type;
+	__u32	nelems;
+};
+
+/* BTF_KIND_STRUCT and BTF_KIND_UNION are followed
+ * by multiple "struct btf_member".  The exact number
+ * of btf_member is stored in the vlen (of the info in
+ * "struct btf_type").
+ */
+struct btf_member {
+	__u32	name_off;
+	__u32	type;
+	__u32	offset;	/* offset in bits */
+};
+
+/* .BTF.ext section contains func_info and line_info.
+ */
+struct btf_ext_header {
+	__u16	magic;
+	__u8	version;
+	__u8	flags;
+	__u32	hdr_len;
+
+	__u32	func_info_off;
+	__u32	func_info_len;
+	__u32	line_info_off;
+	__u32	line_info_len;
+};
+
+struct bpf_func_info {
+	__u32	insn_offset;
+	__u32	type_id;
+};
+
+struct btf_sec_func_info {
+	__u32	sec_name_off;
+	__u32	num_func_info;
+};
+
+struct bpf_line_info {
+	__u32	insn_offset;
+	__u32	file_name_off;
+	__u32	line_off;
+	__u32	line_col; /* line num: line_col >> 10, col num: line_col & 0x3ff */
+};
+
+struct btf_sec_line_info {
+	__u32	sec_name_off;
+	__u32	num_line_info;
+};
+
+namespace llvm {
+
+const char *const btf_kind_str[NR_BTF_KINDS] = {
+	[BTF_KIND_UNKN]		= "UNKNOWN",
+	[BTF_KIND_INT]		= "INT",
+	[BTF_KIND_PTR]		= "PTR",
+	[BTF_KIND_ARRAY]	= "ARRAY",
+	[BTF_KIND_STRUCT]	= "STRUCT",
+	[BTF_KIND_UNION]	= "UNION",
+	[BTF_KIND_ENUM]		= "ENUM",
+	[BTF_KIND_FWD]		= "FWD",
+	[BTF_KIND_TYPEDEF]	= "TYPEDEF",
+	[BTF_KIND_VOLATILE]	= "VOLATILE",
+	[BTF_KIND_CONST]	= "CONST",
+	[BTF_KIND_RESTRICT]	= "RESTRICT",
+	[BTF_KIND_FUNC]		= "FUNC",
+	[BTF_KIND_FUNC_PROTO]	= "FUNC_PROTO",
+};
+
+#include "llvm/ADT/SmallVector.h"
+#include <map>
+
+class MCBTFContext;
+class MCObjectStreamer;
+
+// This is base class of all BTF KIND. It is also used directly
+// by the reference kinds:
+//   BTF_KIND_CONST,  BTF_KIND_PTR,  BTF_KIND_VOLATILE,
+//   BTF_KIND_TYPEDEF, BTF_KIND_RESTRICT, and BTF_KIND_FWD
+class BTFTypeEntry {
+protected:
+  size_t Id;  /* type index in the BTF list, started from 1 */
+  struct btf_type BTFType;
+
+public:
+  BTFTypeEntry(size_t id, struct btf_type &type) :
+    Id(id), BTFType(type) {}
+  unsigned char getKind() { return BTF_INFO_KIND(BTFType.info); }
+  void setId(size_t Id) { this->Id = Id; }
+  size_t getId() { return Id; }
+  void setNameOff(unsigned NameOff) { BTFType.name_off = NameOff; }
+
+  unsigned getTypeIndex() { return BTFType.type; }
+  unsigned getNameOff() { return BTFType.name_off; }
+  virtual size_t getSize() { return sizeof(struct btf_type); }
+  virtual void print(raw_ostream &s, MCBTFContext& BTFContext);
+  virtual void emitData(MCObjectStreamer *MCOS);
+};
+
+// BTF_KIND_INT
+class BTFTypeEntryInt : public BTFTypeEntry {
+  unsigned IntVal;  // encoding, offset, bits
+
+public:
+  BTFTypeEntryInt(size_t id, struct btf_type &type, unsigned intval) :
+    BTFTypeEntry(id, type), IntVal(intval) {}
+  size_t getSize() { return BTFTypeEntry::getSize() + sizeof(unsigned); }
+  void print(raw_ostream &s, MCBTFContext& BTFContext);
+  void emitData(MCObjectStreamer *MCOS);
+};
+
+// BTF_KIND_ENUM
+class BTFTypeEntryEnum : public BTFTypeEntry {
+  std::vector<struct btf_enum> EnumValues;
+
+public:
+  BTFTypeEntryEnum(size_t id, struct btf_type &type,
+                   std::vector<struct btf_enum> &values) :
+    BTFTypeEntry(id, type), EnumValues(values) {}
+  size_t getSize() {
+    return BTFTypeEntry::getSize() +
+      BTF_INFO_VLEN(BTFType.info) * sizeof(struct btf_enum);
+  }
+  void print(raw_ostream &s, MCBTFContext& BTFContext);
+  void emitData(MCObjectStreamer *MCOS);
+};
+
+// BTF_KIND_ARRAY
+class BTFTypeEntryArray : public BTFTypeEntry {
+  struct btf_array ArrayInfo;
+
+public:
+  BTFTypeEntryArray(size_t id, struct btf_type &type,
+                    struct btf_array &arrayinfo) :
+    BTFTypeEntry(id, type), ArrayInfo(arrayinfo) {}
+  size_t getSize() {
+    return BTFTypeEntry::getSize() +  sizeof(struct btf_array);
+  }
+  void print(raw_ostream &s, MCBTFContext& BTFContext);
+  void emitData(MCObjectStreamer *MCOS);
+};
+
+// BTF_KIND_STRUCT and BTF_KIND_UNION
+class BTFTypeEntryStruct : public BTFTypeEntry {
+  std::vector<struct btf_member> Members;
+
+public:
+  BTFTypeEntryStruct(size_t id, struct btf_type &type,
+                     std::vector<struct btf_member> &members) :
+    BTFTypeEntry(id, type), Members(members) {}
+  size_t getSize() {
+    return BTFTypeEntry::getSize() +
+      BTF_INFO_VLEN(BTFType.info) * sizeof(struct btf_member);
+  }
+  void print(raw_ostream &s, MCBTFContext& BTFContext);
+  void emitData(MCObjectStreamer *MCOS);
+};
+
+// BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO
+class BTFTypeEntryFunc : public BTFTypeEntry {
+  std::vector<unsigned> Parameters;
+
+public:
+  BTFTypeEntryFunc(size_t id, struct btf_type &type,
+                   std::vector<unsigned> &params) :
+    BTFTypeEntry(id, type), Parameters(params) {}
+  size_t getSize() {
+    return BTFTypeEntry::getSize() +
+      BTF_INFO_VLEN(BTFType.info) * sizeof(unsigned);
+  }
+  void print(raw_ostream &s, MCBTFContext& BTFContext);
+  void emitData(MCObjectStreamer *MCOS);
+};
+
+class BTFStringTable {
+  size_t Size;  // total size in bytes
+  std::map<size_t, unsigned> OffsetToIdMap;
+  std::vector<std::string> Table;
+
+ public:
+  BTFStringTable() : Size(0) {}
+  size_t getSize() { return Size; }
+  std::vector<std::string> &getTable() { return Table; }
+  size_t addString(std::string S) {
+    // check whether the string already exists
+    for (auto &OffsetM : OffsetToIdMap) {
+      if (Table[OffsetM.second] == S)
+        return OffsetM.first;
+    }
+    // not find, add to the string table
+    size_t Offset = Size;
+    OffsetToIdMap[Offset] = Table.size();
+    Table.push_back(S);
+    Size += S.size() + 1;
+    return Offset;
+  }
+  std::string &getStringAtOffset(size_t Offset) {
+    return Table[OffsetToIdMap[Offset]];
+  }
+  void showTable(raw_ostream &OS) {
+    for (auto OffsetM : OffsetToIdMap)
+      OS << OffsetM.first << " : " << Table[OffsetM.second]
+         << "\n";
+  }
+};
+
+struct BTFFuncInfo  {
+    const MCSymbol *Label;
+    unsigned int TypeId;
+};
+
+struct BTFLineInfo  {
+    MCSymbol *Label;
+    unsigned int FileNameOff;
+    unsigned int LineOff;
+    unsigned int LineNum;
+    unsigned int ColumnNum;
+};
+
+class MCBTFContext {
+  std::vector<std::unique_ptr<BTFTypeEntry>> TypeEntries;
+  BTFStringTable StringTable;
+  std::map<unsigned, std::vector<BTFFuncInfo>> FuncInfoTable;
+  std::map<unsigned, std::vector<BTFLineInfo>> LineInfoTable;
+
+  friend class BTFTypeEntry;
+  friend class BTFTypeEntryInt;
+  friend class BTFTypeEntryEnum;
+  friend class BTFTypeEntryArray;
+  friend class BTFTypeEntryStruct;
+  friend class BTFTypeEntryFunc;
+
+public:
+  void dump(raw_ostream& OS);
+  void emitAll(MCObjectStreamer *MCOS);
+  void emitCommonHeader(MCObjectStreamer *MCOS);
+  void emitBTFSection(MCObjectStreamer *MCOS);
+  void emitBTFExtSection(MCObjectStreamer *MCOS);
+
+  size_t addString(std::string S) {
+    return StringTable.addString(S);
+  }
+  void addTypeEntry(std::unique_ptr<BTFTypeEntry> Entry);
+  void addFuncInfo(unsigned SecNameOff, BTFFuncInfo Info) {
+    FuncInfoTable[SecNameOff].push_back(Info);
+  }
+  void addLineInfo(unsigned SecNameOff, BTFLineInfo Info) {
+    LineInfoTable[SecNameOff].push_back(Info);
+  }
+};
+
+}
+#endif
diff --git a/include/llvm/MC/MCContext.h b/include/llvm/MC/MCContext.h
index 3b8ac8b79e2..d5c49408c68 100644
--- a/include/llvm/MC/MCContext.h
+++ b/include/llvm/MC/MCContext.h
@@ -56,6 +56,7 @@ namespace llvm {
   class MCSymbolWasm;
   class SMLoc;
   class SourceMgr;
+  class MCBTFContext;
 
   /// Context object for machine code objects.  This class owns all of the
   /// sections that it creates.
@@ -278,6 +279,9 @@ namespace llvm {
     /// Map of currently defined macros.
     StringMap<MCAsmMacro> MacroMap;
 
+    /// for BTF debug information
+    std::unique_ptr<MCBTFContext> BTFCtx;
+
   public:
     explicit MCContext(const MCAsmInfo *MAI, const MCRegisterInfo *MRI,
                        const MCObjectFileInfo *MOFI,
@@ -286,6 +290,9 @@ namespace llvm {
     MCContext &operator=(const MCContext &) = delete;
     ~MCContext();
 
+    void setBTFContext(std::unique_ptr<MCBTFContext> Ctx);
+    std::unique_ptr<MCBTFContext> &getBTFContext() { return BTFCtx; }
+
     const SourceMgr *getSourceManager() const { return SrcMgr; }
 
     void setInlineSourceManager(SourceMgr *SM) { InlineSrcMgr = SM; }
diff --git a/include/llvm/MC/MCObjectFileInfo.h b/include/llvm/MC/MCObjectFileInfo.h
index 8cf9e1cc55a..1dda7b0712f 100644
--- a/include/llvm/MC/MCObjectFileInfo.h
+++ b/include/llvm/MC/MCObjectFileInfo.h
@@ -207,6 +207,10 @@ protected:
   MCSection *SXDataSection;
   MCSection *GFIDsSection;
 
+  // BTF specific sections.
+  MCSection *BTFSection;
+  MCSection *BTFExtSection;
+
 public:
   void InitMCObjectFileInfo(const Triple &TT, bool PIC, MCContext &ctx,
                             bool LargeCodeModel = false);
@@ -372,6 +376,10 @@ public:
     return EHFrameSection;
   }
 
+  // BTF specific sections.
+  MCSection *getBTFSection() const { return BTFSection; }
+  MCSection *getBTFExtSection() const { return BTFExtSection; }
+
   enum Environment { IsMachO, IsELF, IsCOFF, IsWasm };
   Environment getObjectFileType() const { return Env; }
 
diff --git a/include/llvm/MC/MCObjectStreamer.h b/include/llvm/MC/MCObjectStreamer.h
index c9e577b7e29..9d15086ac63 100644
--- a/include/llvm/MC/MCObjectStreamer.h
+++ b/include/llvm/MC/MCObjectStreamer.h
@@ -138,6 +138,7 @@ public:
                                 unsigned PointerSize);
   void EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel,
                                  const MCSymbol *Label);
+  void EmitBTFAdvanceLineAddr(const MCSymbol *Label, unsigned Size);
   void EmitCVLocDirective(unsigned FunctionId, unsigned FileNo, unsigned Line,
                           unsigned Column, bool PrologueEnd, bool IsStmt,
                           StringRef FileName, SMLoc Loc) override;
diff --git a/lib/CodeGen/AsmPrinter/CMakeLists.txt b/lib/CodeGen/AsmPrinter/CMakeLists.txt
index 6cba4a0d4b8..14c895a9c82 100644
--- a/lib/CodeGen/AsmPrinter/CMakeLists.txt
+++ b/lib/CodeGen/AsmPrinter/CMakeLists.txt
@@ -17,6 +17,7 @@ add_llvm_library(LLVMAsmPrinter
   DwarfFile.cpp
   DwarfStringPool.cpp
   DwarfUnit.cpp
+  Dwarf2BTF.cpp
   EHStreamer.cpp
   ErlangGCPrinter.cpp
   OcamlGCPrinter.cpp
diff --git a/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp b/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp
new file mode 100644
index 00000000000..20eab4d1fb8
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp
@@ -0,0 +1,501 @@
+//===- Dwarf2BTF.cpp ------------------------------------------ *- C++ --*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DwarfUnit.h"
+#include "Dwarf2BTF.h"
+#include "llvm/MC/MCBTFContext.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+
+namespace llvm {
+
+unsigned char Die2BTFEntry::getDieKind(const DIE & Die) {
+  auto Tag = Die.getTag();
+
+  switch (Tag) {
+    case dwarf::DW_TAG_base_type:
+      if (getBaseTypeEncoding(Die) == BTF_INVALID_ENCODING)
+        return BTF_KIND_UNKN;
+      return BTF_KIND_INT;
+    case dwarf::DW_TAG_const_type:
+      return BTF_KIND_CONST;
+    case dwarf::DW_TAG_pointer_type:
+      return BTF_KIND_PTR;
+    case dwarf::DW_TAG_restrict_type:
+      return BTF_KIND_RESTRICT;
+    case dwarf::DW_TAG_volatile_type:
+      return BTF_KIND_VOLATILE;
+    case dwarf::DW_TAG_typedef:
+      return BTF_KIND_TYPEDEF;
+    case dwarf::DW_TAG_structure_type:
+    case dwarf::DW_TAG_class_type:
+      if (Die.findAttribute(dwarf::DW_AT_declaration).getType()
+          != DIEValue::isNone)
+        return BTF_KIND_FWD;
+      else
+        return BTF_KIND_STRUCT;
+    case dwarf::DW_TAG_union_type:
+      if (Die.findAttribute(dwarf::DW_AT_declaration).getType()
+          != DIEValue::isNone)
+        return BTF_KIND_FWD;
+      else
+        return BTF_KIND_UNION;
+    case dwarf::DW_TAG_enumeration_type:
+      return BTF_KIND_ENUM;
+    case dwarf::DW_TAG_array_type:
+      return BTF_KIND_ARRAY;
+    case dwarf::DW_TAG_subprogram:
+      return BTF_KIND_FUNC;
+    case dwarf::DW_TAG_subroutine_type:
+      return BTF_KIND_FUNC_PROTO;
+    default:
+      break;
+  }
+
+  return BTF_KIND_UNKN;
+}
+
+std::unique_ptr<Die2BTFEntry> Die2BTFEntry::dieToBTFTypeEntry(const DIE &Die) {
+  unsigned char Kind = getDieKind(Die);
+
+  switch (Kind) {
+    case BTF_KIND_INT:
+      return make_unique<Die2BTFEntryInt>(Die);
+    case BTF_KIND_PTR:
+    case BTF_KIND_TYPEDEF:
+    case BTF_KIND_VOLATILE:
+    case BTF_KIND_CONST:
+    case BTF_KIND_RESTRICT:
+    case BTF_KIND_FWD:
+      return make_unique<Die2BTFEntry>(Die);
+    case BTF_KIND_ARRAY:
+      return make_unique<Die2BTFEntryArray>(Die);
+    case BTF_KIND_STRUCT:
+    case BTF_KIND_UNION:
+      return make_unique<Die2BTFEntryStruct>(Die);
+    case BTF_KIND_ENUM:
+      return make_unique<Die2BTFEntryEnum>(Die);
+    case BTF_KIND_FUNC:
+    case BTF_KIND_FUNC_PROTO:
+      return make_unique<Die2BTFEntryFunc>(Die);
+    default:
+      break;
+  }
+  return nullptr;
+}
+
+bool Die2BTFEntry::shouldSkipDie(const DIE &Die) {
+  auto Tag = Die.getTag();
+
+  switch (Tag) {
+    case dwarf::DW_TAG_const_type:
+    case dwarf::DW_TAG_pointer_type:
+    case dwarf::DW_TAG_restrict_type:
+    case dwarf::DW_TAG_typedef:
+    case dwarf::DW_TAG_volatile_type:
+    {
+      auto TypeV = Die.findAttribute(dwarf::DW_AT_type);
+      if (TypeV.getType() == DIEValue::isNone)
+        return false;
+      auto &TypeDie = TypeV.getDIEEntry().getEntry();
+      return Die2BTFEntry::shouldSkipDie(TypeDie);
+    }
+    default:
+      return getDieKind(Die) == BTF_KIND_UNKN;
+  }
+  return true;
+}
+unsigned char Die2BTFEntry::getBaseTypeEncoding(const DIE &Die) {
+  auto V = Die.findAttribute(dwarf::DW_AT_encoding);
+
+  if (V.getType() != DIEValue::isInteger)
+    return BTF_INVALID_ENCODING;
+
+  switch (V.getDIEInteger().getValue()) {
+    case dwarf::DW_ATE_boolean:
+      return BTF_INT_BOOL;
+    case dwarf::DW_ATE_signed:
+      return BTF_INT_SIGNED;
+    case dwarf::DW_ATE_signed_char:
+      return BTF_INT_CHAR;
+    case dwarf::DW_ATE_unsigned:
+      return 0;
+    case dwarf::DW_ATE_unsigned_char:
+      return BTF_INT_CHAR;
+    case dwarf::DW_ATE_imaginary_float:
+    case dwarf::DW_ATE_packed_decimal:
+    case dwarf::DW_ATE_numeric_string:
+    case dwarf::DW_ATE_edited:
+    case dwarf::DW_ATE_signed_fixed:
+    case dwarf::DW_ATE_address:
+    case dwarf::DW_ATE_complex_float:
+    case dwarf::DW_ATE_float:
+    default:
+      break;
+  }
+  return BTF_INVALID_ENCODING;
+}
+
+Die2BTFEntry::Die2BTFEntry(const DIE &Die) : Die(Die) {
+  unsigned char Kind = getDieKind(Die);
+
+  switch (Kind) {
+    case BTF_KIND_CONST:
+    case BTF_KIND_FWD:
+    case BTF_KIND_PTR:
+    case BTF_KIND_RESTRICT:
+    case BTF_KIND_TYPEDEF:
+    case BTF_KIND_VOLATILE:
+      break;
+    default:
+      assert("Invalid Die passed into BTFTypeEntry()");
+      break;
+  }
+
+  BTFType.info = (Kind & 0xf) << 24;
+}
+
+void Die2BTFEntry::completeData(class Dwarf2BTF &Dwarf2BTF) {
+    auto TypeV = Die.findAttribute(dwarf::DW_AT_type);
+    if (TypeV.getType() == DIEValue::isNone) {
+      BTFType.type = 0;
+    } else {
+      auto &TypeDie = TypeV.getDIEEntry().getEntry();
+      auto Type = Dwarf2BTF.getTypeIndex(TypeDie);
+      BTFType.type = Type;
+    }
+
+    unsigned char Kind = getDieKind(Die);
+    if (Kind != BTF_KIND_FWD) {
+      BTFType.name_off = 0;
+    } else {
+      auto NameV = Die.findAttribute(dwarf::DW_AT_name);
+      auto Str = NameV.getDIEString().getString();
+      BTFType.name_off = Dwarf2BTF.addBTFString(Str);
+    }
+
+    auto typeEntry = make_unique<BTFTypeEntry>(Id, BTFType);
+    Dwarf2BTF.addBTFTypeEntry(std::move(typeEntry));
+}
+
+Die2BTFEntryInt::Die2BTFEntryInt(const DIE &Die) : Die2BTFEntry(Die) {
+  unsigned char Kind = getDieKind(Die);
+
+  switch (Kind) {
+    case BTF_KIND_INT:
+      break;
+    default:
+      assert("Invalid Die passed into BTFTypeEntryInt()");
+      break;
+  }
+
+  // handle BTF_INT_ENCODING in IntVal
+  auto Encoding = Die2BTFEntry::getBaseTypeEncoding(Die);
+  assert((Encoding != BTF_INVALID_ENCODING) &&
+         "Invalid Die passed to BTFTypeEntryInt()");
+  __u32 IntVal = (Encoding & 0xf) << 24;
+
+  // handle BTF_INT_OFFSET in IntVal
+  auto V = Die.findAttribute(dwarf::DW_AT_bit_offset);
+  if (V.getType() == DIEValue::isInteger)
+    IntVal |= (V.getDIEInteger().getValue() & 0xff) << 16;
+
+  // get btf_type.size
+  V = Die.findAttribute(dwarf::DW_AT_byte_size);
+  __u32 Size = V.getDIEInteger().getValue() & 0xffffffff;
+
+// handle BTF_INT_BITS in IntVal
+  V = Die.findAttribute(dwarf::DW_AT_bit_size);
+  if (V.getType() == DIEValue::isInteger)
+    IntVal |= V.getDIEInteger().getValue() & 0xff;
+  else
+    IntVal |= (Size << 3) & 0xff;
+
+  BTFType.info = BTF_KIND_INT << 24;
+  BTFType.size = Size;
+  this->IntVal = IntVal;
+}
+
+void Die2BTFEntryInt::completeData(class Dwarf2BTF &Dwarf2BTF) {
+    auto NameV = Die.findAttribute(dwarf::DW_AT_name);
+    auto TypeV = Die.findAttribute(dwarf::DW_AT_type);
+    auto Str = NameV.getDIEString().getString();
+
+    BTFType.name_off = Dwarf2BTF.addBTFString(Str);
+
+    auto typeEntry = make_unique<BTFTypeEntryInt>(Id, BTFType, IntVal);
+    Dwarf2BTF.addBTFTypeEntry(std::move(typeEntry));
+}
+
+Die2BTFEntryEnum::Die2BTFEntryEnum(const DIE &Die) : Die2BTFEntry(Die) {
+  // get btf_type.size
+  auto V = Die.findAttribute(dwarf::DW_AT_byte_size);
+  __u32 Size = V.getDIEInteger().getValue() & 0xffffffff;
+
+  int Vlen = 0;
+  for (auto &ChildDie : Die.children())
+    if (ChildDie.getTag() == dwarf::DW_TAG_enumerator)
+      Vlen++;
+
+  BTFType.info = (BTF_KIND_ENUM << 24) | (Vlen & BTF_MAX_VLEN);
+  BTFType.type = Size;
+}
+
+void Die2BTFEntryEnum::completeData(class Dwarf2BTF &Dwarf2BTF) {
+  auto TypeV = Die.findAttribute(dwarf::DW_AT_type);
+  auto NameV = Die.findAttribute(dwarf::DW_AT_name);
+
+  if (NameV.getType() != DIEValue::isNone) {
+    auto Str = NameV.getDIEString().getString();
+    BTFType.name_off = Dwarf2BTF.addBTFString(Str);
+  } else
+    BTFType.name_off = 0;
+
+  for (auto &ChildDie : Die.children()) {
+    struct btf_enum BTFEnum;
+    auto ChildNameV = ChildDie.findAttribute(dwarf::DW_AT_name);
+    auto Str = ChildNameV.getDIEString().getString();
+
+    BTFEnum.name_off = Dwarf2BTF.addBTFString(Str);
+    auto ChildValueV = ChildDie.findAttribute(dwarf::DW_AT_const_value);
+    BTFEnum.val = (__s32)(ChildValueV.getDIEInteger().getValue());
+
+    EnumValues.push_back(BTFEnum);
+  }
+
+  auto typeEntry = make_unique<BTFTypeEntryEnum>(Id, BTFType, EnumValues);
+  Dwarf2BTF.addBTFTypeEntry(std::move(typeEntry));
+}
+
+Die2BTFEntryArray::Die2BTFEntryArray(const DIE &Die) :
+    Die2BTFEntry(Die) {
+  BTFType.info = (BTF_KIND_ARRAY << 24);
+  BTFType.size = 0;
+}
+
+void Die2BTFEntryArray::completeData(class Dwarf2BTF &Dwarf2BTF) {
+  auto NameV = Die.findAttribute(dwarf::DW_AT_name);
+
+  std::string Str;
+  if (NameV.getType() != DIEValue::isNone)
+    Str = NameV.getDIEString().getString();
+  BTFType.name_off = Dwarf2BTF.addBTFString(Str);
+
+  auto &ArrayTypeDie = Die.findAttribute(dwarf::DW_AT_type).getDIEEntry().getEntry();
+  ArrayInfo.type = Dwarf2BTF.getTypeIndex(ArrayTypeDie);
+
+  // The number of elements should count all subranges
+  unsigned Nelems = 1;
+  bool IsFirstSubrange = true;
+  for (auto &ChildDie : Die.children()) {
+    if (ChildDie.getTag() == dwarf::DW_TAG_subrange_type) {
+      if (IsFirstSubrange) {
+        auto TypeV = ChildDie.findAttribute(dwarf::DW_AT_type);
+        auto &TypeDie = TypeV.getDIEEntry().getEntry();
+        ArrayInfo.index_type = Dwarf2BTF.getTypeIndex(TypeDie);
+        IsFirstSubrange = false;
+      }
+      auto CountV = ChildDie.findAttribute(dwarf::DW_AT_count);
+      if (CountV.getType() == DIEValue::isNone) {
+        // array like a[] which essentially a pointer
+        Nelems = 0;
+        break;
+      }
+      Nelems *= (__u32)(CountV.getDIEInteger().getValue());
+    }
+  }
+  ArrayInfo.nelems = Nelems;
+
+  auto TypeEntry = make_unique<BTFTypeEntryArray>(Id, BTFType, ArrayInfo);
+  Dwarf2BTF.addBTFTypeEntry(std::move(TypeEntry));
+}
+
+Die2BTFEntryStruct::Die2BTFEntryStruct(const DIE &Die) : Die2BTFEntry(Die) {
+  // get btf_type.size
+  auto V = Die.findAttribute(dwarf::DW_AT_byte_size);
+  __u32 Size = V.getDIEInteger().getValue() & 0xffffffff;
+  auto Kind = Die2BTFEntry::getDieKind(Die);
+
+  int Vlen = 0;
+  for (auto &ChildDie : Die.children())
+    if (ChildDie.getTag() == dwarf::DW_TAG_member)
+      Vlen++;
+
+  BTFType.size = Size;
+  BTFType.info = (Kind << 24) | (Vlen & BTF_MAX_VLEN);
+}
+
+void Die2BTFEntryStruct::completeData(class Dwarf2BTF &Dwarf2BTF) {
+  auto NameV = Die.findAttribute(dwarf::DW_AT_name);
+
+  if (NameV.getType() != DIEValue::isNone) {
+    auto Str = NameV.getDIEString().getString();
+    BTFType.name_off = Dwarf2BTF.addBTFString(Str);
+  } else
+    BTFType.name_off = 0;
+
+
+  for (auto &ChildDie : Die.children()) {
+    if (ChildDie.getTag() != dwarf::DW_TAG_member)
+      continue;
+
+    struct btf_member BTFMember;
+    auto ChildNameV = ChildDie.findAttribute(dwarf::DW_AT_name);
+
+    if (ChildNameV.getType() != DIEValue::isNone) {
+      auto Str = ChildNameV.getDIEString().getString();
+      BTFMember.name_off = Dwarf2BTF.addBTFString(Str);
+    } else
+      BTFMember.name_off = 0;
+
+    auto TypeV = ChildDie.findAttribute(dwarf::DW_AT_type);
+    auto &TypeDie = TypeV.getDIEEntry().getEntry();
+    BTFMember.type = Dwarf2BTF.getTypeIndex(TypeDie);
+
+    auto MemLocV = ChildDie.findAttribute(dwarf::DW_AT_data_member_location);
+    unsigned MemLoc = MemLocV.getDIEInteger().getValue() * 8;
+
+    auto ByteSizeV = ChildDie.findAttribute(dwarf::DW_AT_byte_size);
+    if (ByteSizeV.getType() != DIEValue::isNone) {
+      unsigned ByteSize = ByteSizeV.getDIEInteger().getValue();
+      auto BitOffsetV = ChildDie.findAttribute(dwarf::DW_AT_bit_offset);
+      unsigned BitOffset = BitOffsetV.getDIEInteger().getValue();
+      auto BitSizeV = ChildDie.findAttribute(dwarf::DW_AT_bit_size);
+      unsigned BitSize = BitSizeV.getDIEInteger().getValue();
+      if (Dwarf2BTF.isLittleEndian())
+        MemLoc += ByteSize * 8 - BitSize - BitOffset;
+      else
+        MemLoc += BitOffset;
+    }
+    BTFMember.offset = MemLoc;
+
+    Members.push_back(BTFMember);
+  }
+
+  auto typeEntry = make_unique<BTFTypeEntryStruct>(Id, BTFType, Members);
+  Dwarf2BTF.addBTFTypeEntry(std::move(typeEntry));
+}
+
+Die2BTFEntryFunc::Die2BTFEntryFunc(const DIE &Die) : Die2BTFEntry(Die) {
+  auto Kind = Die2BTFEntry::getDieKind(Die);
+
+  int Vlen = 0;
+  for (auto &ChildDie : Die.children())
+    if (ChildDie.getTag() == dwarf::DW_TAG_formal_parameter)
+      Vlen++;
+
+  BTFType.size = 0;
+  BTFType.info = (Kind << 24) | (Vlen & BTF_MAX_VLEN);
+}
+
+void Die2BTFEntryFunc::completeData(class Dwarf2BTF &Dwarf2BTF) {
+  auto NameV = Die.findAttribute(dwarf::DW_AT_name);
+  if (NameV.getType() == DIEValue::isNone) {
+    BTFType.name_off = 0;
+  } else {
+    auto Str = NameV.getDIEString().getString();
+    BTFType.name_off = Dwarf2BTF.addBTFString(Str);
+  }
+
+  auto RetTypeV = Die.findAttribute(dwarf::DW_AT_type);
+  if (RetTypeV.getType() != DIEValue::isNone) {
+    auto &TypeDie = RetTypeV.getDIEEntry().getEntry();
+    BTFType.type = Dwarf2BTF.getTypeIndex(TypeDie);
+  } else {
+    BTFType.type = 0;
+  }
+
+  for (auto &ChildDie : Die.children()) {
+    if (ChildDie.getTag() == dwarf::DW_TAG_formal_parameter) {
+      auto TypeV = ChildDie.findAttribute(dwarf::DW_AT_abstract_origin);
+      if (TypeV.getType() != DIEValue::isNone) {
+        auto &AbsOriginDie = TypeV.getDIEEntry().getEntry();
+        assert(AbsOriginDie.getTag() == dwarf::DW_TAG_formal_parameter);
+        TypeV = AbsOriginDie.findAttribute(dwarf::DW_AT_type);
+      } else {
+        TypeV = ChildDie.findAttribute(dwarf::DW_AT_type);
+      }
+      auto &TypeDie = TypeV.getDIEEntry().getEntry();
+      Parameters.push_back(Dwarf2BTF.getTypeIndex(TypeDie));
+    } else if (ChildDie.getTag() == dwarf::DW_TAG_unspecified_parameters) {
+      Parameters.push_back(0);
+    }
+  }
+
+  auto typeEntry = make_unique<BTFTypeEntryFunc>(Id, BTFType, Parameters);
+  Dwarf2BTF.addBTFTypeEntry(std::move(typeEntry));
+
+  if (BTF_INFO_KIND(BTFType.info) == BTF_KIND_FUNC) {
+    auto LowPCV = Die.findAttribute(dwarf::DW_AT_low_pc);
+    if (LowPCV.getType() != DIEValue::isNone) {
+      const MCSymbol *Label = LowPCV.getDIELabel().getValue();
+      BTFFuncInfo FuncInfo;
+      unsigned SecNameOff;
+
+      FuncInfo.Label = Label;
+      FuncInfo.TypeId = Id;
+      if (Label->isInSection()) {
+        MCSection &Section = Label->getSection();
+        MCSectionELF *SectionELF = dyn_cast<MCSectionELF>(&Section);
+        assert(SectionELF);
+        SecNameOff = Dwarf2BTF.addBTFString(SectionELF->getSectionName().str());
+      } else {
+        SecNameOff = Dwarf2BTF.addBTFString(".text");
+      }
+      Dwarf2BTF.addBTFFuncInfo(SecNameOff, FuncInfo);
+    }
+  }
+}
+
+Dwarf2BTF::Dwarf2BTF(MCContext &Context, bool IsLittleEndian)
+  : OuterCtx(Context), IsLE(IsLittleEndian) {
+  BTFContext = make_unique<MCBTFContext>();
+}
+
+void Dwarf2BTF::addTypeEntry(const DIE &Die) {
+  for (auto &ChildDie : Die.children())
+    addTypeEntry(ChildDie);
+  if (Die2BTFEntry::shouldSkipDie(Die))
+    return;
+  auto Kind = Die2BTFEntry::getDieKind(Die);
+  if (Kind != BTF_KIND_UNKN) {
+    auto TypeEntry = Die2BTFEntry::dieToBTFTypeEntry(Die);
+    if (TypeEntry != nullptr) {
+      TypeEntry->setId(TypeEntries.size() + 1);
+      DieToIdMap[const_cast<DIE*>(&Die)] = TypeEntry->getId();
+      TypeEntries.push_back(std::move(TypeEntry));
+    }
+  }
+}
+
+void Dwarf2BTF::addBTFTypeEntry(std::unique_ptr<BTFTypeEntry> Entry) {
+  BTFContext->addTypeEntry(std::move(Entry));
+}
+
+void Dwarf2BTF::completeData() {
+  BTFContext->addString("\0");
+
+  for (auto &TypeEntry : TypeEntries)
+    TypeEntry->completeData(*this);
+}
+
+void Dwarf2BTF::addDwarfCU(DwarfUnit *TheU) {
+  DIE &CuDie = TheU->getUnitDie();
+
+  assert((CuDie.getTag() == dwarf::DW_TAG_compile_unit) &&
+         "Not a compile unit");
+  addTypeEntry(CuDie);
+}
+
+void Dwarf2BTF::finish() {
+  completeData();
+  OuterCtx.setBTFContext(std::move(BTFContext));
+}
+
+}
diff --git a/lib/CodeGen/AsmPrinter/Dwarf2BTF.h b/lib/CodeGen/AsmPrinter/Dwarf2BTF.h
new file mode 100644
index 00000000000..3df4dd802a7
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/Dwarf2BTF.h
@@ -0,0 +1,134 @@
+//===- Dwarf2BTF.h -------------------------------------------- *- C++ --*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARF2BTF_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_DWARF2BTF_H
+
+#include "DwarfUnit.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DIE.h"
+#include "llvm/MC/MCBTFContext.h"
+#include <map>
+
+namespace llvm {
+
+class Dwarf2BTF;
+class MCBTFContext;
+
+#define BTF_INVALID_ENCODING 0xff
+
+class Die2BTFEntry {
+protected:
+  const DIE &Die;
+  size_t Id;  /* type index in the BTF list, started from 1 */
+  struct btf_type BTFType;
+
+public:
+  // Return desired BTF_KIND for the Die, return BTF_KIND_UNKN for
+  // invalid/unsupported Die
+  static unsigned char getDieKind(const DIE &Die);
+
+  // Return proper BTF_INT_ENCODING of a basetype.
+  // Return BTF_INVALID_ENCODING for unsupported (float, etc.)
+  static unsigned char getBaseTypeEncoding(const DIE &Die);
+
+  // Return whether this Die should be skipped.
+  // We currently skip unsupported data type (e.g. float)
+  // and references to unsupported types
+  static bool shouldSkipDie(const DIE &Die);
+
+  static std::unique_ptr<Die2BTFEntry> dieToBTFTypeEntry(const DIE &Die);
+
+  Die2BTFEntry(const DIE &Die);
+  void setId(size_t Id) { this->Id = Id; }
+  size_t getId() { return Id; }
+  virtual void completeData(class Dwarf2BTF &Dwarf2BTF);
+};
+
+// BTF_KIND_INT
+class Die2BTFEntryInt : public Die2BTFEntry {
+  __u32 IntVal;  // encoding, offset, bits
+
+public:
+  Die2BTFEntryInt(const DIE &Die);
+  void completeData(class Dwarf2BTF &Dwarf2BTF);
+};
+
+// BTF_KIND_ENUM
+class Die2BTFEntryEnum : public Die2BTFEntry {
+  std::vector<struct btf_enum> EnumValues;
+
+public:
+  Die2BTFEntryEnum(const DIE &Die);
+  void completeData(class Dwarf2BTF &Dwarf2BTF);
+};
+
+// BTF_KIND_ARRAY
+class Die2BTFEntryArray : public Die2BTFEntry {
+  struct btf_array ArrayInfo;
+
+public:
+  Die2BTFEntryArray(const DIE &Die);
+  void completeData(class Dwarf2BTF &Dwarf2BTF);
+};
+
+// BTF_KIND_STRUCT and BTF_KIND_UNION
+class Die2BTFEntryStruct : public Die2BTFEntry {
+  std::vector<struct btf_member> Members;
+
+public:
+  Die2BTFEntryStruct(const DIE &Die);
+  void completeData(class Dwarf2BTF &Dwarf2BTF);
+};
+
+// BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO
+class Die2BTFEntryFunc : public Die2BTFEntry {
+  std::vector<__u32> Parameters;
+
+public:
+  Die2BTFEntryFunc(const DIE &Die);
+  void completeData(class Dwarf2BTF &Dwarf2BTF);
+};
+
+class Dwarf2BTF {
+  std::vector<std::unique_ptr<Die2BTFEntry>> TypeEntries;
+  std::map<DIE*, size_t> DieToIdMap;
+  std::unique_ptr<MCBTFContext> BTFContext;
+  MCContext &OuterCtx;
+  bool IsLE;
+
+public:
+  Dwarf2BTF(MCContext &Context, bool IsLittleEndian);
+  bool isLittleEndian() { return IsLE; }
+  void addDwarfCU(DwarfUnit *TheU);
+  void finish();
+  __u32 getTypeIndex(DIE &Die) {
+    DIE *DiePtr = const_cast<DIE*>(&Die);
+    assert((DieToIdMap.find(DiePtr) != DieToIdMap.end()) &&
+           "Die not added to in the BTFContext");
+    return DieToIdMap[DiePtr];
+  }
+  size_t addBTFString(std::string S) {
+    return BTFContext->addString(S);
+  }
+  void addBTFTypeEntry(std::unique_ptr<BTFTypeEntry> Entry);
+  void addBTFFuncInfo(unsigned SecNameOff, BTFFuncInfo FuncInfo) {
+    BTFContext->addFuncInfo(SecNameOff, FuncInfo);
+  }
+
+private:
+  void addTypeEntry(const DIE &Die);
+  bool alreadyAdded(DIE &Die) {
+    return DieToIdMap.find(const_cast<DIE*>(&Die)) != DieToIdMap.end();
+  }
+  void completeData();
+};
+
+}
+#endif
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 94e12658cfe..184ec4dabe9 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -971,6 +971,10 @@ void DwarfDebug::endModule() {
   // Emit the pubnames and pubtypes sections if requested.
   emitDebugPubSections();
 
+  const Triple &TT = Asm->TM.getTargetTriple();
+  if (TT.getArch() == Triple::bpfel || TT.getArch() == Triple::bpfeb)
+    emitBTFSection(TT.getArch() == Triple::bpfel);
+
   // clean up.
   // FIXME: AbstractVariables.clear();
 }
@@ -2455,6 +2459,12 @@ MCDwarfDwoLineTable *DwarfDebug::getDwoLineTable(const DwarfCompileUnit &CU) {
   return &SplitTypeUnitFileTable;
 }
 
+void DwarfDebug::emitBTFSection(bool IsLittleEndian) {
+  DwarfFile &Holder = useSplitDwarf() ? SkeletonHolder : InfoHolder;
+
+  Holder.emitBTFSection(IsLittleEndian);
+}
+
 uint64_t DwarfDebug::makeTypeSignature(StringRef Identifier) {
   MD5 Hash;
   Hash.update(Identifier);
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h
index fecf8056765..1350317db02 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -567,6 +567,9 @@ class DwarfDebug : public DebugHandlerBase {
   /// Emit the reference to the section.
   void emitSectionReference(const DwarfCompileUnit &CU);
 
+  // Emit the BTF sections
+  void emitBTFSection(bool IsLittleEndian);
+
 protected:
   /// Gather pre-function debug information.
   void beginFunctionImpl(const MachineFunction *MF) override;
diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.cpp b/lib/CodeGen/AsmPrinter/DwarfFile.cpp
index 0ab9ea87c23..7ac16b34c4c 100644
--- a/lib/CodeGen/AsmPrinter/DwarfFile.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfFile.cpp
@@ -7,6 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "Dwarf2BTF.h"
 #include "DwarfFile.h"
 #include "DwarfCompileUnit.h"
 #include "DwarfDebug.h"
@@ -15,6 +16,8 @@
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/CodeGen/DIE.h"
 #include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/MC/MCBTFContext.h"
+#include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCStreamer.h"
 #include <algorithm>
 #include <cstdint>
@@ -88,6 +91,13 @@ void DwarfFile::emitStrings(MCSection *StrSection, MCSection *OffsetSection,
   StrPool.emit(*Asm, StrSection, OffsetSection, UseRelativeOffsets);
 }
 
+void DwarfFile::emitBTFSection(bool IsLittleEndian) {
+  Dwarf2BTF Dwarf2BTF(Asm->OutContext, IsLittleEndian);
+  for (auto &TheU : CUs)
+    Dwarf2BTF.addDwarfCU(TheU.get());
+  Dwarf2BTF.finish();
+}
+
 bool DwarfFile::addScopeVariable(LexicalScope *LS, DbgVariable *Var) {
   auto &ScopeVars = ScopeVariables[LS];
   const DILocalVariable *DV = Var->getVariable();
diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.h b/lib/CodeGen/AsmPrinter/DwarfFile.h
index c315f44a8d8..9aafe2613f6 100644
--- a/lib/CodeGen/AsmPrinter/DwarfFile.h
+++ b/lib/CodeGen/AsmPrinter/DwarfFile.h
@@ -114,6 +114,9 @@ public:
   void emitStrings(MCSection *StrSection, MCSection *OffsetSection = nullptr,
                    bool UseRelativeOffsets = false);
 
+  // Emit all data for the BTF section
+  void emitBTFSection(bool IsLittleEndian);
+
   /// Returns the string pool.
   DwarfStringPool &getStringPool() { return StrPool; }
 
diff --git a/lib/MC/CMakeLists.txt b/lib/MC/CMakeLists.txt
index ba36d99e8f7..85bf1616fd6 100644
--- a/lib/MC/CMakeLists.txt
+++ b/lib/MC/CMakeLists.txt
@@ -10,11 +10,13 @@ add_llvm_library(LLVMMC
   MCAsmMacro.cpp
   MCAsmStreamer.cpp
   MCAssembler.cpp
+  MCBTFContext.cpp
   MCCodeEmitter.cpp
   MCCodePadder.cpp
   MCCodeView.cpp
   MCContext.cpp
   MCDwarf.cpp
+  MCDwarf2BTF.cpp
   MCELFObjectTargetWriter.cpp
   MCELFStreamer.cpp
   MCExpr.cpp
diff --git a/lib/MC/MCBTFContext.cpp b/lib/MC/MCBTFContext.cpp
new file mode 100644
index 00000000000..cb846ee5e51
--- /dev/null
+++ b/lib/MC/MCBTFContext.cpp
@@ -0,0 +1,235 @@
+//===- lib/MC/MCBTFContext.cpp - Machine Code BTF Context -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCBTFContext.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstdlib>
+#include <tuple>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "btf"
+
+void MCBTFContext::addTypeEntry(std::unique_ptr<BTFTypeEntry> Entry) {
+   TypeEntries.push_back(std::move(Entry));
+}
+
+void MCBTFContext::dump(raw_ostream &OS) {
+  OS << "Type Table:\n";
+  for (size_t i = 0; i < TypeEntries.size(); i++) {
+    auto TypeEntry = TypeEntries[i].get();
+    TypeEntry->print(OS, *this);
+  }
+
+  OS << "\nString Table:\n";
+  StringTable.showTable(OS);
+
+  OS << "\nFuncInfo Table:\n";
+  for (auto &FuncSec : FuncInfoTable) {
+    OS << "sec_name_off=" << FuncSec.first << "\n";
+    for (auto &FuncInfo : FuncSec.second) {
+      OS << "\tinsn_offset=<Omitted> type_id="
+         << FuncInfo.TypeId << "\n";
+    }
+  }
+
+  OS << "\nLineInfo Table:\n";
+  for (auto &LineSec : LineInfoTable) {
+    OS << "sec_name_off=" << LineSec.first << "\n";
+    for (auto &LineInfo : LineSec.second) {
+      OS << "\tinsn_offset=<Omitted> file_name_off="
+         << LineInfo.FileNameOff
+         << " line_off=" << LineInfo.LineOff
+         << " line_num=" << LineInfo.LineNum
+         << " column_num=" << LineInfo.ColumnNum
+         << "\n";
+    }
+  }
+}
+
+void MCBTFContext::emitCommonHeader(MCObjectStreamer *MCOS) {
+  MCOS->EmitIntValue(BTF_MAGIC, 2);
+  MCOS->EmitIntValue(BTF_VERSION, 1);
+  MCOS->EmitIntValue(0, 1);
+}
+
+void MCBTFContext::emitBTFSection(MCObjectStreamer *MCOS) {
+  MCContext &context = MCOS->getContext();
+  MCOS->SwitchSection(context.getObjectFileInfo()->getBTFSection());
+
+  // emit header
+  emitCommonHeader(MCOS);
+  MCOS->EmitIntValue(sizeof(struct btf_header), 4);
+
+  uint32_t type_len = 0, str_len;
+  for (auto &TypeEntry : TypeEntries)
+    type_len += TypeEntry->getSize();
+  str_len = StringTable.getSize();
+
+  MCOS->EmitIntValue(0, 4);
+  MCOS->EmitIntValue(type_len, 4);
+  MCOS->EmitIntValue(type_len, 4);
+  MCOS->EmitIntValue(str_len, 4);
+
+  // emit type table
+  for (auto &TypeEntry: TypeEntries)
+    TypeEntry->emitData(MCOS);
+
+  // emit string table
+  for (auto &S : StringTable.getTable()) {
+    for (auto C : S)
+      MCOS->EmitIntValue(C, 1);
+    MCOS->EmitIntValue('\0', 1);
+  }
+}
+
+void MCBTFContext::emitBTFExtSection(MCObjectStreamer *MCOS) {
+  MCContext &context = MCOS->getContext();
+  MCOS->SwitchSection(context.getObjectFileInfo()->getBTFExtSection());
+
+  // emit header
+  emitCommonHeader(MCOS);
+  MCOS->EmitIntValue(sizeof(struct btf_ext_header), 4);
+
+  uint32_t func_len = 0, line_len = 0;
+  for (auto &FuncSec : FuncInfoTable) {
+    func_len += sizeof(struct btf_sec_func_info);
+    func_len += FuncSec.second.size() * sizeof(struct bpf_func_info);
+  }
+  for (auto &LineSec : LineInfoTable) {
+    line_len += sizeof(struct btf_sec_line_info);
+    line_len += LineSec.second.size() * sizeof(struct bpf_line_info);
+  }
+
+  MCOS->EmitIntValue(0, 4);
+  MCOS->EmitIntValue(func_len, 4);
+  MCOS->EmitIntValue(func_len, 4);
+  MCOS->EmitIntValue(line_len, 4);
+
+  // emit func_info table
+  for (const auto &FuncSec : FuncInfoTable) {
+    MCOS->EmitIntValue(FuncSec.first, 4);
+    MCOS->EmitIntValue(FuncSec.second.size(), 4);
+    for (const auto &FuncInfo : FuncSec.second) {
+      MCOS->EmitBTFAdvanceLineAddr(FuncInfo.Label, 4);
+      MCOS->EmitIntValue(FuncInfo.TypeId, 4);
+    }
+  }
+
+  // emit line_info table
+  for (const auto &LineSec : LineInfoTable) {
+    MCOS->EmitIntValue(LineSec.first, 4);
+    MCOS->EmitIntValue(LineSec.second.size(), 4);
+    for (const auto &LineInfo : LineSec.second) {
+      MCOS->EmitBTFAdvanceLineAddr(LineInfo.Label, 4);
+      MCOS->EmitIntValue(LineInfo.FileNameOff, 4);
+      MCOS->EmitIntValue(LineInfo.LineOff, 4);
+      MCOS->EmitIntValue(LineInfo.LineNum << 10 | LineInfo.ColumnNum, 4);
+    }
+  }
+}
+
+void MCBTFContext::emitAll(MCObjectStreamer *MCOS) {
+  LLVM_DEBUG(dump(dbgs()));
+  emitBTFSection(MCOS);
+  emitBTFExtSection(MCOS);
+}
+
+void BTFTypeEntry::print(raw_ostream &OS, MCBTFContext& MCBTFContext) {
+  OS << "[" << Id << "] "
+     << btf_kind_str[BTF_INFO_KIND(BTFType.info)]
+     << " name_off=" << BTFType.name_off
+     << " info=" << format("0x%08lx", BTFType.info)
+     << " size/type=" << BTFType.size << "\n";
+}
+
+void BTFTypeEntry::emitData(MCObjectStreamer *MCOS) {
+  MCOS->EmitIntValue(BTFType.name_off, 4);
+  MCOS->EmitIntValue(BTFType.info, 4);
+  MCOS->EmitIntValue(BTFType.size, 4);
+}
+
+void BTFTypeEntryInt::print(raw_ostream &OS, MCBTFContext& MCBTFContext) {
+  BTFTypeEntry::print(OS, MCBTFContext);
+  OS << "\tdesc=" << format("0x%08lx", IntVal) << "\n";
+}
+
+void BTFTypeEntryInt::emitData(MCObjectStreamer *MCOS) {
+  BTFTypeEntry::emitData(MCOS);
+  MCOS->EmitIntValue(IntVal, 4);
+}
+
+void BTFTypeEntryEnum::print(raw_ostream &OS, MCBTFContext& MCBTFContext) {
+ BTFTypeEntry::print(OS, MCBTFContext);
+  for (size_t i = 0; i < BTF_INFO_VLEN(BTFType.info); i++) {
+    auto &EnumValue = EnumValues[i];
+    OS << "\tname_off=" << EnumValue.name_off
+       << " value=" << EnumValue.val << "\n";
+  }
+}
+
+void BTFTypeEntryEnum::emitData(MCObjectStreamer *MCOS) {
+  BTFTypeEntry::emitData(MCOS);
+  for (auto &EnumValue : EnumValues) {
+    MCOS->EmitIntValue(EnumValue.name_off, 4);
+    MCOS->EmitIntValue(EnumValue.val, 4);
+  }
+}
+
+void BTFTypeEntryArray::print(raw_ostream &OS, MCBTFContext& MCBTFContext) {
+  BTFTypeEntry::print(OS, MCBTFContext);
+  OS << "\telem_type=" << format("0x%08lx", ArrayInfo.type)
+     << " index_type=" << format("0x%08lx", ArrayInfo.index_type)
+     << " num_element=" << ArrayInfo.nelems << "\n";
+}
+
+void BTFTypeEntryArray::emitData(MCObjectStreamer *MCOS) {
+  BTFTypeEntry::emitData(MCOS);
+  MCOS->EmitIntValue(ArrayInfo.type, 4);
+  MCOS->EmitIntValue(ArrayInfo.index_type, 4);
+  MCOS->EmitIntValue(ArrayInfo.nelems, 4);
+}
+
+void BTFTypeEntryStruct::print(raw_ostream &OS, MCBTFContext& MCBTFContext) {
+  BTFTypeEntry::print(OS, MCBTFContext);
+   for (size_t i = 0; i < BTF_INFO_VLEN(BTFType.info); i++) {
+    auto &Member = Members[i];
+    OS << "\tname_off=" << Member.name_off
+       << " type=" << Member.type
+       << " bit_offset=" << Member.offset << "\n";
+  }
+}
+
+void BTFTypeEntryStruct::emitData(MCObjectStreamer *MCOS) {
+  BTFTypeEntry::emitData(MCOS);
+  for (auto &Member : Members) {
+    MCOS->EmitIntValue(Member.name_off, 4);
+    MCOS->EmitIntValue(Member.type, 4);
+    MCOS->EmitIntValue(Member.offset, 4);
+  }
+}
+
+void BTFTypeEntryFunc::print(raw_ostream &OS, MCBTFContext& MCBTFContext) {
+  BTFTypeEntry::print(OS, MCBTFContext);
+   for (size_t i = 0; i < BTF_INFO_VLEN(BTFType.info); i++) {
+    auto Parameter = Parameters[i];
+    OS << "\tparam_type=" << Parameter << "\n";
+  }
+}
+
+void BTFTypeEntryFunc::emitData(MCObjectStreamer *MCOS) {
+  BTFTypeEntry::emitData(MCOS);
+  for (auto &Parameter: Parameters)
+    MCOS->EmitIntValue(Parameter, 4);
+}
diff --git a/lib/MC/MCContext.cpp b/lib/MC/MCContext.cpp
index fab517075c5..18250a474b7 100644
--- a/lib/MC/MCContext.cpp
+++ b/lib/MC/MCContext.cpp
@@ -17,6 +17,7 @@
 #include "llvm/BinaryFormat/COFF.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCBTFContext.h"
 #include "llvm/MC/MCCodeView.h"
 #include "llvm/MC/MCDwarf.h"
 #include "llvm/MC/MCExpr.h"
@@ -60,7 +61,7 @@ MCContext::MCContext(const MCAsmInfo *mai, const MCRegisterInfo *mri,
     : SrcMgr(mgr), InlineSrcMgr(nullptr), MAI(mai), MRI(mri), MOFI(mofi),
       Symbols(Allocator), UsedNames(Allocator),
       CurrentDwarfLoc(0, 0, 0, DWARF2_FLAG_IS_STMT, 0, 0),
-      AutoReset(DoAutoReset) {
+      AutoReset(DoAutoReset), BTFCtx(nullptr) {
   SecureLogFile = AsSecureLogFileName;
 
   if (SrcMgr && SrcMgr->getNumBuffers())
@@ -114,6 +115,14 @@ void MCContext::reset() {
   GenDwarfFileNumber = 0;
 
   HadError = false;
+  BTFCtx.reset();
+}
+
+//===----------------------------------------------------------------------===//
+// BTFCtx Manipulation
+//===----------------------------------------------------------------------===//
+void MCContext::setBTFContext(std::unique_ptr<MCBTFContext> Ctx) {
+  BTFCtx = std::move(Ctx);
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/MC/MCDwarf2BTF.cpp b/lib/MC/MCDwarf2BTF.cpp
new file mode 100644
index 00000000000..08a70e6f318
--- /dev/null
+++ b/lib/MC/MCDwarf2BTF.cpp
@@ -0,0 +1,99 @@
+//===- MCDwarf2BTF.cpp ---------------------------------------- *- C++ --*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCDwarf2BTF.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCBTFContext.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/EndianStream.h"
+#include <fstream>
+
+using namespace llvm;
+
+void MCDwarf2BTF::addFiles(MCObjectStreamer *MCOS, std::string &FileName,
+  std::vector<FileContent> &Files) {
+  std::vector<std::string> Content;
+
+  std::ifstream Inputfile(FileName);
+  std::string Line;
+  Content.push_back(Line); // line 0 for empty string
+  while (std::getline(Inputfile, Line))
+    Content.push_back(Line);
+
+  Files.push_back(FileContent(FileName, Content));
+}
+
+void MCDwarf2BTF::addLines(MCObjectStreamer *MCOS, StringRef &SectionName,
+  std::vector<FileContent> &Files,
+  const MCLineSection::MCDwarfLineEntryCollection &LineEntries) {
+  MCContext &Context = MCOS->getContext();
+  auto &BTFCxt = Context.getBTFContext();
+
+  unsigned SecNameOff = BTFCxt->addString(SectionName.str());
+  for (const MCDwarfLineEntry &LineEntry : LineEntries) {
+    BTFLineInfo LineInfo;
+    unsigned FileNum = LineEntry.getFileNum();
+    unsigned Line = LineEntry.getLine();
+
+    LineInfo.Label = LineEntry.getLabel();
+    if (FileNum < Files.size()) {
+      LineInfo.FileNameOff = BTFCxt->addString(Files[FileNum].first);
+      if (Line < Files[FileNum].second.size())
+        LineInfo.LineOff = BTFCxt->addString(Files[FileNum].second[Line]);
+      else
+        LineInfo.LineOff = 0;
+    } else {
+      LineInfo.FileNameOff = 0;
+      LineInfo.LineOff = 0;
+    }
+    LineInfo.LineNum = Line;
+    LineInfo.ColumnNum = LineEntry.getColumn();
+    BTFCxt->addLineInfo(SecNameOff, LineInfo);
+  }
+}
+
+void MCDwarf2BTF::addDwarfLineInfo(MCObjectStreamer *MCOS) {
+  MCContext &Context = MCOS->getContext();
+
+  auto &LineTables = Context.getMCDwarfLineTables();
+  if (LineTables.empty())
+    return;
+
+  for (const auto &CUIDTablePair : LineTables) {
+    std::vector<std::string> Dirs;
+    std::vector<FileContent> Files;
+
+    for (auto &Dir : CUIDTablePair.second.getMCDwarfDirs())
+      Dirs.push_back(Dir);
+    for (auto &File : CUIDTablePair.second.getMCDwarfFiles()) {
+      std::string FileName;
+      if (File.DirIndex == 0)
+        FileName = File.Name;
+      else
+        FileName = Dirs[File.DirIndex - 1] + "/" + File.Name;
+      MCDwarf2BTF::addFiles(MCOS, FileName, Files);
+    }
+    for (const auto &LineSec: CUIDTablePair.second.getMCLineSections().getMCLineEntries()) {
+      MCSection *Section = LineSec.first;
+      const MCLineSection::MCDwarfLineEntryCollection &LineEntries = LineSec.second;
+
+      StringRef SectionName;
+      if (MCSectionELF *SectionELF = dyn_cast<MCSectionELF>(Section))
+        SectionName = SectionELF->getSectionName();
+      else
+        return;
+      MCDwarf2BTF::addLines(MCOS, SectionName, Files, LineEntries);
+    }
+  }
+}
diff --git a/lib/MC/MCDwarf2BTF.h b/lib/MC/MCDwarf2BTF.h
new file mode 100644
index 00000000000..22d1b7741a5
--- /dev/null
+++ b/lib/MC/MCDwarf2BTF.h
@@ -0,0 +1,29 @@
+//===- MCDwarf2BTF.h ------------------------------------------ *- C++ --*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIB_MC_MCDWARF2BTF_H
+#define LLVM_LIB_MC_MCDWARF2BTF_H
+
+#include "llvm/MC/MCDwarf.h"
+
+namespace llvm {
+
+using FileContent = std::pair<std::string, std::vector<std::string>>;
+
+class MCDwarf2BTF {
+public:
+  static void addFiles(MCObjectStreamer *MCOS, std::string &FileName,
+    std::vector<FileContent> &Files);
+  static void addLines(MCObjectStreamer *MCOS, StringRef &SectionName,
+    std::vector<FileContent> &Files,
+    const MCLineSection::MCDwarfLineEntryCollection &LineEntries);
+  static void addDwarfLineInfo(MCObjectStreamer *MCOS);
+};
+
+}
+#endif
diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp
index edfccfcb9ed..bddcf459ac0 100644
--- a/lib/MC/MCObjectFileInfo.cpp
+++ b/lib/MC/MCObjectFileInfo.cpp
@@ -468,6 +468,9 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(const Triple &T, bool Large) {
       Ctx->getELFSection(".eh_frame", EHSectionType, EHSectionFlags);
 
   StackSizesSection = Ctx->getELFSection(".stack_sizes", ELF::SHT_PROGBITS, 0);
+
+  BTFSection = Ctx->getELFSection(".BTF", ELF::SHT_PROGBITS, 0);
+  BTFExtSection = Ctx->getELFSection(".BTF.ext", ELF::SHT_PROGBITS, 0);
 }
 
 void MCObjectFileInfo::initCOFFMCObjectFileInfo(const Triple &T) {
diff --git a/lib/MC/MCObjectStreamer.cpp b/lib/MC/MCObjectStreamer.cpp
index 8c88db009bd..4f74f4101c8 100644
--- a/lib/MC/MCObjectStreamer.cpp
+++ b/lib/MC/MCObjectStreamer.cpp
@@ -14,6 +14,7 @@
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCCodeView.h"
 #include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCBTFContext.h"
 #include "llvm/MC/MCDwarf.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCObjectWriter.h"
@@ -21,6 +22,7 @@
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/SourceMgr.h"
+#include "MCDwarf2BTF.h"
 using namespace llvm;
 
 MCObjectStreamer::MCObjectStreamer(MCContext &Context,
@@ -439,6 +441,31 @@ void MCObjectStreamer::EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel,
   insert(new MCDwarfCallFrameFragment(*AddrDelta));
 }
 
+void MCObjectStreamer::EmitBTFAdvanceLineAddr(const MCSymbol *Label,
+                                              unsigned Size) {
+  const MCExpr *Value = MCSymbolRefExpr::create(Label, getContext());
+  MCDataFragment *DF = getOrCreateDataFragment();
+
+  // Avoid fixups when possible.
+  int64_t AbsValue;
+  SMLoc Loc;
+
+  if (Value->evaluateAsAbsolute(AbsValue, getAssemblerPtr())) {
+    if (!isUIntN(8 * Size, AbsValue) && !isIntN(8 * Size, AbsValue)) {
+      getContext().reportError(
+          Loc, "value evaluated as " + Twine(AbsValue) + " is out of range.");
+      return;
+    }
+    EmitIntValue(AbsValue, Size);
+    return;
+  }
+
+  DF->getFixups().push_back(
+      MCFixup::create(DF->getContents().size(), Value,
+                      MCFixup::getKindForSize(Size, false), Loc));
+  DF->getContents().resize(DF->getContents().size() + Size, 0);
+}
+
 void MCObjectStreamer::EmitCVLocDirective(unsigned FunctionId, unsigned FileNo,
                                           unsigned Line, unsigned Column,
                                           bool PrologueEnd, bool IsStmt,
@@ -688,6 +715,13 @@ void MCObjectStreamer::FinishImpl() {
   // Dump out the dwarf file & directory tables and line tables.
   MCDwarfLineTable::Emit(this, getAssembler().getDWARFLinetableParams());
 
+  auto &BTFCtx = getContext().getBTFContext();
+  if (BTFCtx) {
+    MCDwarf2BTF::addDwarfLineInfo(this);
+    BTFCtx->emitAll(this);
+    BTFCtx.reset();
+  }
+
   flushPendingLabels();
   getAssembler().Finish();
 }
-- 
GitLab


From 2649515d3f3ba7deba9958fea4ef4b65132884a5 Mon Sep 17 00:00:00 2001
From: Nick Desaulniers <ndesaulniers@google.com>
Date: Fri, 12 Oct 2018 17:22:07 +0000
Subject: [PATCH 0114/1116] [Support] exit with custom return code for SIGPIPE

Summary:
We tell the user to file a bug report on LLVM right now, and
SIGPIPE isn't LLVM's fault so our error message is wrong.

Allows frontends to detect SIGPIPE from writing to closed readers.
This can be seen commonly from piping into head, tee, or split.

Fixes PR25349, rdar://problem/14285346, b/77310947

Reviewers: jfb

Reviewed By: jfb

Subscribers: majnemer, kristina, llvm-commits, thakis, srhines

Differential Revision: https://reviews.llvm.org/D53000

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344372 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Support/Unix/Signals.inc | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/lib/Support/Unix/Signals.inc b/lib/Support/Unix/Signals.inc
index de26695d64e..ad88d5e9690 100644
--- a/lib/Support/Unix/Signals.inc
+++ b/lib/Support/Unix/Signals.inc
@@ -47,6 +47,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 #include <string>
+#include <sysexits.h>
 #ifdef HAVE_BACKTRACE
 # include BACKTRACE_HEADER         // For backtrace().
 #endif
@@ -334,6 +335,10 @@ static RETSIGTYPE SignalHandler(int Sig) {
       if (auto OldInterruptFunction = InterruptFunction.exchange(nullptr))
         return OldInterruptFunction();
 
+      // Send a special return code that drivers can check for, from sysexits.h.
+      if (Sig == SIGPIPE)
+        exit(EX_IOERR);
+
       raise(Sig);   // Execute the default handler.
       return;
    }
-- 
GitLab


From 1172319f2e0dc803f720874c96ae9a85fd265c93 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Fri, 12 Oct 2018 17:23:25 +0000
Subject: [PATCH 0115/1116] [BPF] Some fixes after rL344366

* Move #include outside of namespaces
* Add missing #include
* Add out-of-line virtual destructor to BTFTypeEntry

designated initializers should also be fixed

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344376 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/MC/MCBTFContext.h       | 32 +++++++++++++++-------------
 lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp |  2 ++
 lib/CodeGen/AsmPrinter/Dwarf2BTF.h   |  1 +
 lib/MC/MCBTFContext.cpp              |  4 ++--
 4 files changed, 22 insertions(+), 17 deletions(-)

diff --git a/include/llvm/MC/MCBTFContext.h b/include/llvm/MC/MCBTFContext.h
index f180a69340b..fd9edbcf7a8 100644
--- a/include/llvm/MC/MCBTFContext.h
+++ b/include/llvm/MC/MCBTFContext.h
@@ -13,22 +13,26 @@
 #ifndef LLVM_MC_MCBTFCONTEXT_H
 #define LLVM_MC_MCBTFCONTEXT_H
 
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/raw_ostream.h"
 #include <linux/types.h>
+#include <map>
+#include <vector>
 
-#define BTF_MAGIC	0xeB9F
-#define BTF_VERSION	1
+#define BTF_MAGIC 0xeB9F
+#define BTF_VERSION 1
 
 struct btf_header {
-	__u16	magic;
-	__u8	version;
-	__u8	flags;
-	__u32	hdr_len;
-
-	/* All offsets are in bytes relative to the end of this header */
-	__u32	type_off;	/* offset of type section	*/
-	__u32	type_len;	/* length of type section	*/
-	__u32	str_off;	/* offset of string section	*/
-	__u32	str_len;	/* length of string section	*/
+  __u16 magic;
+  __u8 version;
+  __u8 flags;
+  __u32 hdr_len;
+
+  /* All offsets are in bytes relative to the end of this header */
+  __u32 type_off; /* offset of type section	*/
+  __u32 type_len; /* length of type section	*/
+  __u32 str_off;  /* offset of string section	*/
+  __u32 str_len;  /* length of string section	*/
 };
 
 /* Max # of type identifier */
@@ -178,9 +182,6 @@ const char *const btf_kind_str[NR_BTF_KINDS] = {
 	[BTF_KIND_FUNC_PROTO]	= "FUNC_PROTO",
 };
 
-#include "llvm/ADT/SmallVector.h"
-#include <map>
-
 class MCBTFContext;
 class MCObjectStreamer;
 
@@ -196,6 +197,7 @@ protected:
 public:
   BTFTypeEntry(size_t id, struct btf_type &type) :
     Id(id), BTFType(type) {}
+  virtual ~BTFTypeEntry();
   unsigned char getKind() { return BTF_INFO_KIND(BTFType.info); }
   void setId(size_t Id) { this->Id = Id; }
   size_t getId() { return Id; }
diff --git a/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp b/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp
index 20eab4d1fb8..8b16e389963 100644
--- a/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp
+++ b/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp
@@ -15,6 +15,8 @@
 
 namespace llvm {
 
+Die2BTFEntry::~Die2BTFEntry() {}
+
 unsigned char Die2BTFEntry::getDieKind(const DIE & Die) {
   auto Tag = Die.getTag();
 
diff --git a/lib/CodeGen/AsmPrinter/Dwarf2BTF.h b/lib/CodeGen/AsmPrinter/Dwarf2BTF.h
index 3df4dd802a7..125441d37b3 100644
--- a/lib/CodeGen/AsmPrinter/Dwarf2BTF.h
+++ b/lib/CodeGen/AsmPrinter/Dwarf2BTF.h
@@ -30,6 +30,7 @@ protected:
   struct btf_type BTFType;
 
 public:
+  virtual ~Die2BTFEntry();
   // Return desired BTF_KIND for the Die, return BTF_KIND_UNKN for
   // invalid/unsupported Die
   static unsigned char getDieKind(const DIE &Die);
diff --git a/lib/MC/MCBTFContext.cpp b/lib/MC/MCBTFContext.cpp
index cb846ee5e51..d1c30dd0b88 100644
--- a/lib/MC/MCBTFContext.cpp
+++ b/lib/MC/MCBTFContext.cpp
@@ -11,8 +11,6 @@
 #include "llvm/MC/MCBTFContext.h"
 #include "llvm/MC/MCObjectFileInfo.h"
 #include "llvm/MC/MCObjectStreamer.h"
-#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/raw_ostream.h"
 #include <cstdlib>
 #include <tuple>
 #include <utility>
@@ -21,6 +19,8 @@ using namespace llvm;
 
 #define DEBUG_TYPE "btf"
 
+BTFTypeEntry::~BTFTypeEntry() {}
+
 void MCBTFContext::addTypeEntry(std::unique_ptr<BTFTypeEntry> Entry) {
    TypeEntries.push_back(std::move(Entry));
 }
-- 
GitLab


From 91defc144e119483c2d4196a875bfa957ec81303 Mon Sep 17 00:00:00 2001
From: Zachary Turner <zturner@google.com>
Date: Fri, 12 Oct 2018 17:26:19 +0000
Subject: [PATCH 0116/1116] Better support for POSIX paths in PDBs.

This a resubmission of a patch which was previously reverted
due to breaking several lld tests.  The issues causing those
failures have been fixed, so the patch is now resubmitted.

---Original Commit Message---

While it doesn't make a *ton* of sense for POSIX paths to be
in PDBs, it's possible to occur in real scenarios involving
cross compilation.

The tools need to be able to handle this, because certain types
of debugging scenarios are possible without a running process
and so don't necessarily require you to be on a Windows system.
These include post-mortem debugging and binary forensics (e.g.
using a debugger to disassemble functions and examine symbols
without running the process).

There's changes in clang, LLD, and lldb in this patch.  After
this the cross-platform disassembly and source-list tests pass
on Linux.

Furthermore, the behavior of LLD can now be summarized by a much
simpler rule than before: Unless you specify /pdbsourcepath and
/pdbaltpath, the PDB ends up with paths that are valid within
the context of the machine that the link is performed on.

Differential Revision: https://reviews.llvm.org/D53149

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344377 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/AsmPrinter/CodeViewDebug.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
index 4d45a103c5a..8232f076a93 100644
--- a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
@@ -73,6 +73,7 @@
 #include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/Path.h"
 #include "llvm/Support/SMLoc.h"
 #include "llvm/Support/ScopedPrinter.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
@@ -134,7 +135,9 @@ StringRef CodeViewDebug::getFullFilepath(const DIFile *File) {
 
   // If this is a Unix-style path, just use it as is. Don't try to canonicalize
   // it textually because one of the path components could be a symlink.
-  if (!Dir.empty() && Dir[0] == '/') {
+  if (Dir.startswith("/") || Filename.startswith("/")) {
+    if (llvm::sys::path::is_absolute(Filename, llvm::sys::path::Style::posix))
+      return Filename;
     Filepath = Dir;
     if (Dir.back() != '/')
       Filepath += '/';
-- 
GitLab


From a73f1cf1de29de3f3cd66589e9dc6139cf17d89e Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Fri, 12 Oct 2018 17:41:12 +0000
Subject: [PATCH 0117/1116] [BPF] Don't include linux/types.h and fix style

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344381 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/MC/MCBTFContext.h       | 255 ++++++++++++------------
 lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp | 280 +++++++++++++--------------
 lib/CodeGen/AsmPrinter/Dwarf2BTF.h   |  16 +-
 lib/CodeGen/AsmPrinter/DwarfFile.h   |   4 +-
 lib/MC/MCBTFContext.cpp              |  48 ++---
 lib/MC/MCDwarf2BTF.cpp               |  17 +-
 lib/MC/MCDwarf2BTF.h                 |  11 +-
 7 files changed, 309 insertions(+), 322 deletions(-)

diff --git a/include/llvm/MC/MCBTFContext.h b/include/llvm/MC/MCBTFContext.h
index fd9edbcf7a8..43103273acb 100644
--- a/include/llvm/MC/MCBTFContext.h
+++ b/include/llvm/MC/MCBTFContext.h
@@ -15,10 +15,16 @@
 
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/raw_ostream.h"
-#include <linux/types.h>
 #include <map>
 #include <vector>
 
+typedef __signed__ char __s8;
+typedef unsigned char __u8;
+typedef __signed__ short __s16;
+typedef unsigned short __u16;
+typedef __signed__ int __s32;
+typedef unsigned int __u32;
+
 #define BTF_MAGIC 0xeB9F
 #define BTF_VERSION 1
 
@@ -36,53 +42,53 @@ struct btf_header {
 };
 
 /* Max # of type identifier */
-#define BTF_MAX_TYPE	0x0000ffff
+#define BTF_MAX_TYPE 0x0000ffff
 /* Max offset into the string section */
-#define BTF_MAX_NAME_OFFSET	0x0000ffff
+#define BTF_MAX_NAME_OFFSET 0x0000ffff
 /* Max # of struct/union/enum members or func args */
-#define BTF_MAX_VLEN	0xffff
+#define BTF_MAX_VLEN 0xffff
 
 struct btf_type {
-	__u32 name_off;
-	/* "info" bits arrangement
-	 * bits  0-15: vlen (e.g. # of struct's members)
-	 * bits 16-23: unused
-	 * bits 24-27: kind (e.g. int, ptr, array...etc)
-	 * bits 28-31: unused
-	 */
-	__u32 info;
-	/* "size" is used by INT, ENUM, STRUCT and UNION.
-	 * "size" tells the size of the type it is describing.
-	 *
-	 * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT,
-	 * FUNC and FUNC_PROTO.
-	 * "type" is a type_id referring to another type.
-	 */
-	union {
-		__u32 size;
-		__u32 type;
-	};
+  __u32 name_off;
+  /* "info" bits arrangement
+   * bits  0-15: vlen (e.g. # of struct's members)
+   * bits 16-23: unused
+   * bits 24-27: kind (e.g. int, ptr, array...etc)
+   * bits 28-31: unused
+   */
+  __u32 info;
+  /* "size" is used by INT, ENUM, STRUCT and UNION.
+   * "size" tells the size of the type it is describing.
+   *
+   * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT,
+   * FUNC and FUNC_PROTO.
+   * "type" is a type_id referring to another type.
+   */
+  union {
+    __u32 size;
+    __u32 type;
+  };
 };
 
-#define BTF_INFO_KIND(info)	(((info) >> 24) & 0x0f)
-#define BTF_INFO_VLEN(info)	((info) & 0xffff)
-
-#define BTF_KIND_UNKN		0	/* Unknown	*/
-#define BTF_KIND_INT		1	/* Integer	*/
-#define BTF_KIND_PTR		2	/* Pointer	*/
-#define BTF_KIND_ARRAY		3	/* Array	*/
-#define BTF_KIND_STRUCT		4	/* Struct	*/
-#define BTF_KIND_UNION		5	/* Union	*/
-#define BTF_KIND_ENUM		6	/* Enumeration	*/
-#define BTF_KIND_FWD		7	/* Forward	*/
-#define BTF_KIND_TYPEDEF	8	/* Typedef	*/
-#define BTF_KIND_VOLATILE	9	/* Volatile	*/
-#define BTF_KIND_CONST		10	/* Const	*/
-#define BTF_KIND_RESTRICT	11	/* Restrict	*/
-#define BTF_KIND_FUNC		12	/* Function	*/
-#define BTF_KIND_FUNC_PROTO	13	/* Function Prototype	*/
-#define BTF_KIND_MAX		13
-#define NR_BTF_KINDS		14
+#define BTF_INFO_KIND(info) (((info) >> 24) & 0x0f)
+#define BTF_INFO_VLEN(info) ((info)&0xffff)
+
+#define BTF_KIND_UNKN 0        /* Unknown	*/
+#define BTF_KIND_INT 1         /* Integer	*/
+#define BTF_KIND_PTR 2         /* Pointer	*/
+#define BTF_KIND_ARRAY 3       /* Array	*/
+#define BTF_KIND_STRUCT 4      /* Struct	*/
+#define BTF_KIND_UNION 5       /* Union	*/
+#define BTF_KIND_ENUM 6        /* Enumeration	*/
+#define BTF_KIND_FWD 7         /* Forward	*/
+#define BTF_KIND_TYPEDEF 8     /* Typedef	*/
+#define BTF_KIND_VOLATILE 9    /* Volatile	*/
+#define BTF_KIND_CONST 10      /* Const	*/
+#define BTF_KIND_RESTRICT 11   /* Restrict	*/
+#define BTF_KIND_FUNC 12       /* Function	*/
+#define BTF_KIND_FUNC_PROTO 13 /* Function Prototype	*/
+#define BTF_KIND_MAX 13
+#define NR_BTF_KINDS 14
 
 /* For some specific BTF_KIND, "struct btf_type" is immediately
  * followed by extra data.
@@ -91,29 +97,29 @@ struct btf_type {
 /* BTF_KIND_INT is followed by a u32 and the following
  * is the 32 bits arrangement:
  */
-#define BTF_INT_ENCODING(VAL)	(((VAL) & 0x0f000000) >> 24)
-#define BTF_INT_OFFSET(VAL)	(((VAL  & 0x00ff0000)) >> 16)
-#define BTF_INT_BITS(VAL)	((VAL)  & 0x000000ff)
+#define BTF_INT_ENCODING(VAL) (((VAL)&0x0f000000) >> 24)
+#define BTF_INT_OFFSET(VAL) (((VAL & 0x00ff0000)) >> 16)
+#define BTF_INT_BITS(VAL) ((VAL)&0x000000ff)
 
 /* Attributes stored in the BTF_INT_ENCODING */
-#define BTF_INT_SIGNED	(1 << 0)
-#define BTF_INT_CHAR	(1 << 1)
-#define BTF_INT_BOOL	(1 << 2)
+#define BTF_INT_SIGNED (1 << 0)
+#define BTF_INT_CHAR (1 << 1)
+#define BTF_INT_BOOL (1 << 2)
 
 /* BTF_KIND_ENUM is followed by multiple "struct btf_enum".
  * The exact number of btf_enum is stored in the vlen (of the
  * info in "struct btf_type").
  */
 struct btf_enum {
-	__u32	name_off;
-	__s32	val;
+  __u32 name_off;
+  __s32 val;
 };
 
 /* BTF_KIND_ARRAY is followed by one "struct btf_array" */
 struct btf_array {
-	__u32	type;
-	__u32	index_type;
-	__u32	nelems;
+  __u32 type;
+  __u32 index_type;
+  __u32 nelems;
 };
 
 /* BTF_KIND_STRUCT and BTF_KIND_UNION are followed
@@ -122,64 +128,57 @@ struct btf_array {
  * "struct btf_type").
  */
 struct btf_member {
-	__u32	name_off;
-	__u32	type;
-	__u32	offset;	/* offset in bits */
+  __u32 name_off;
+  __u32 type;
+  __u32 offset; /* offset in bits */
 };
 
 /* .BTF.ext section contains func_info and line_info.
  */
 struct btf_ext_header {
-	__u16	magic;
-	__u8	version;
-	__u8	flags;
-	__u32	hdr_len;
-
-	__u32	func_info_off;
-	__u32	func_info_len;
-	__u32	line_info_off;
-	__u32	line_info_len;
+  __u16 magic;
+  __u8 version;
+  __u8 flags;
+  __u32 hdr_len;
+
+  __u32 func_info_off;
+  __u32 func_info_len;
+  __u32 line_info_off;
+  __u32 line_info_len;
 };
 
 struct bpf_func_info {
-	__u32	insn_offset;
-	__u32	type_id;
+  __u32 insn_offset;
+  __u32 type_id;
 };
 
 struct btf_sec_func_info {
-	__u32	sec_name_off;
-	__u32	num_func_info;
+  __u32 sec_name_off;
+  __u32 num_func_info;
 };
 
 struct bpf_line_info {
-	__u32	insn_offset;
-	__u32	file_name_off;
-	__u32	line_off;
-	__u32	line_col; /* line num: line_col >> 10, col num: line_col & 0x3ff */
+  __u32 insn_offset;
+  __u32 file_name_off;
+  __u32 line_off;
+  __u32 line_col; /* line num: line_col >> 10, col num: line_col & 0x3ff */
 };
 
 struct btf_sec_line_info {
-	__u32	sec_name_off;
-	__u32	num_line_info;
+  __u32 sec_name_off;
+  __u32 num_line_info;
 };
 
 namespace llvm {
 
 const char *const btf_kind_str[NR_BTF_KINDS] = {
-	[BTF_KIND_UNKN]		= "UNKNOWN",
-	[BTF_KIND_INT]		= "INT",
-	[BTF_KIND_PTR]		= "PTR",
-	[BTF_KIND_ARRAY]	= "ARRAY",
-	[BTF_KIND_STRUCT]	= "STRUCT",
-	[BTF_KIND_UNION]	= "UNION",
-	[BTF_KIND_ENUM]		= "ENUM",
-	[BTF_KIND_FWD]		= "FWD",
-	[BTF_KIND_TYPEDEF]	= "TYPEDEF",
-	[BTF_KIND_VOLATILE]	= "VOLATILE",
-	[BTF_KIND_CONST]	= "CONST",
-	[BTF_KIND_RESTRICT]	= "RESTRICT",
-	[BTF_KIND_FUNC]		= "FUNC",
-	[BTF_KIND_FUNC_PROTO]	= "FUNC_PROTO",
+    [BTF_KIND_UNKN] = "UNKNOWN",    [BTF_KIND_INT] = "INT",
+    [BTF_KIND_PTR] = "PTR",         [BTF_KIND_ARRAY] = "ARRAY",
+    [BTF_KIND_STRUCT] = "STRUCT",   [BTF_KIND_UNION] = "UNION",
+    [BTF_KIND_ENUM] = "ENUM",       [BTF_KIND_FWD] = "FWD",
+    [BTF_KIND_TYPEDEF] = "TYPEDEF", [BTF_KIND_VOLATILE] = "VOLATILE",
+    [BTF_KIND_CONST] = "CONST",     [BTF_KIND_RESTRICT] = "RESTRICT",
+    [BTF_KIND_FUNC] = "FUNC",       [BTF_KIND_FUNC_PROTO] = "FUNC_PROTO",
 };
 
 class MCBTFContext;
@@ -191,12 +190,11 @@ class MCObjectStreamer;
 //   BTF_KIND_TYPEDEF, BTF_KIND_RESTRICT, and BTF_KIND_FWD
 class BTFTypeEntry {
 protected:
-  size_t Id;  /* type index in the BTF list, started from 1 */
+  size_t Id; /* type index in the BTF list, started from 1 */
   struct btf_type BTFType;
 
 public:
-  BTFTypeEntry(size_t id, struct btf_type &type) :
-    Id(id), BTFType(type) {}
+  BTFTypeEntry(size_t id, struct btf_type &type) : Id(id), BTFType(type) {}
   virtual ~BTFTypeEntry();
   unsigned char getKind() { return BTF_INFO_KIND(BTFType.info); }
   void setId(size_t Id) { this->Id = Id; }
@@ -206,19 +204,19 @@ public:
   unsigned getTypeIndex() { return BTFType.type; }
   unsigned getNameOff() { return BTFType.name_off; }
   virtual size_t getSize() { return sizeof(struct btf_type); }
-  virtual void print(raw_ostream &s, MCBTFContext& BTFContext);
+  virtual void print(raw_ostream &s, MCBTFContext &BTFContext);
   virtual void emitData(MCObjectStreamer *MCOS);
 };
 
 // BTF_KIND_INT
 class BTFTypeEntryInt : public BTFTypeEntry {
-  unsigned IntVal;  // encoding, offset, bits
+  unsigned IntVal; // encoding, offset, bits
 
 public:
-  BTFTypeEntryInt(size_t id, struct btf_type &type, unsigned intval) :
-    BTFTypeEntry(id, type), IntVal(intval) {}
+  BTFTypeEntryInt(size_t id, struct btf_type &type, unsigned intval)
+      : BTFTypeEntry(id, type), IntVal(intval) {}
   size_t getSize() { return BTFTypeEntry::getSize() + sizeof(unsigned); }
-  void print(raw_ostream &s, MCBTFContext& BTFContext);
+  void print(raw_ostream &s, MCBTFContext &BTFContext);
   void emitData(MCObjectStreamer *MCOS);
 };
 
@@ -228,13 +226,13 @@ class BTFTypeEntryEnum : public BTFTypeEntry {
 
 public:
   BTFTypeEntryEnum(size_t id, struct btf_type &type,
-                   std::vector<struct btf_enum> &values) :
-    BTFTypeEntry(id, type), EnumValues(values) {}
+                   std::vector<struct btf_enum> &values)
+      : BTFTypeEntry(id, type), EnumValues(values) {}
   size_t getSize() {
     return BTFTypeEntry::getSize() +
-      BTF_INFO_VLEN(BTFType.info) * sizeof(struct btf_enum);
+           BTF_INFO_VLEN(BTFType.info) * sizeof(struct btf_enum);
   }
-  void print(raw_ostream &s, MCBTFContext& BTFContext);
+  void print(raw_ostream &s, MCBTFContext &BTFContext);
   void emitData(MCObjectStreamer *MCOS);
 };
 
@@ -244,12 +242,12 @@ class BTFTypeEntryArray : public BTFTypeEntry {
 
 public:
   BTFTypeEntryArray(size_t id, struct btf_type &type,
-                    struct btf_array &arrayinfo) :
-    BTFTypeEntry(id, type), ArrayInfo(arrayinfo) {}
+                    struct btf_array &arrayinfo)
+      : BTFTypeEntry(id, type), ArrayInfo(arrayinfo) {}
   size_t getSize() {
-    return BTFTypeEntry::getSize() +  sizeof(struct btf_array);
+    return BTFTypeEntry::getSize() + sizeof(struct btf_array);
   }
-  void print(raw_ostream &s, MCBTFContext& BTFContext);
+  void print(raw_ostream &s, MCBTFContext &BTFContext);
   void emitData(MCObjectStreamer *MCOS);
 };
 
@@ -259,13 +257,13 @@ class BTFTypeEntryStruct : public BTFTypeEntry {
 
 public:
   BTFTypeEntryStruct(size_t id, struct btf_type &type,
-                     std::vector<struct btf_member> &members) :
-    BTFTypeEntry(id, type), Members(members) {}
+                     std::vector<struct btf_member> &members)
+      : BTFTypeEntry(id, type), Members(members) {}
   size_t getSize() {
     return BTFTypeEntry::getSize() +
-      BTF_INFO_VLEN(BTFType.info) * sizeof(struct btf_member);
+           BTF_INFO_VLEN(BTFType.info) * sizeof(struct btf_member);
   }
-  void print(raw_ostream &s, MCBTFContext& BTFContext);
+  void print(raw_ostream &s, MCBTFContext &BTFContext);
   void emitData(MCObjectStreamer *MCOS);
 };
 
@@ -275,22 +273,22 @@ class BTFTypeEntryFunc : public BTFTypeEntry {
 
 public:
   BTFTypeEntryFunc(size_t id, struct btf_type &type,
-                   std::vector<unsigned> &params) :
-    BTFTypeEntry(id, type), Parameters(params) {}
+                   std::vector<unsigned> &params)
+      : BTFTypeEntry(id, type), Parameters(params) {}
   size_t getSize() {
     return BTFTypeEntry::getSize() +
-      BTF_INFO_VLEN(BTFType.info) * sizeof(unsigned);
+           BTF_INFO_VLEN(BTFType.info) * sizeof(unsigned);
   }
-  void print(raw_ostream &s, MCBTFContext& BTFContext);
+  void print(raw_ostream &s, MCBTFContext &BTFContext);
   void emitData(MCObjectStreamer *MCOS);
 };
 
 class BTFStringTable {
-  size_t Size;  // total size in bytes
+  size_t Size; // total size in bytes
   std::map<size_t, unsigned> OffsetToIdMap;
   std::vector<std::string> Table;
 
- public:
+public:
   BTFStringTable() : Size(0) {}
   size_t getSize() { return Size; }
   std::vector<std::string> &getTable() { return Table; }
@@ -312,22 +310,21 @@ class BTFStringTable {
   }
   void showTable(raw_ostream &OS) {
     for (auto OffsetM : OffsetToIdMap)
-      OS << OffsetM.first << " : " << Table[OffsetM.second]
-         << "\n";
+      OS << OffsetM.first << " : " << Table[OffsetM.second] << "\n";
   }
 };
 
-struct BTFFuncInfo  {
-    const MCSymbol *Label;
-    unsigned int TypeId;
+struct BTFFuncInfo {
+  const MCSymbol *Label;
+  unsigned int TypeId;
 };
 
-struct BTFLineInfo  {
-    MCSymbol *Label;
-    unsigned int FileNameOff;
-    unsigned int LineOff;
-    unsigned int LineNum;
-    unsigned int ColumnNum;
+struct BTFLineInfo {
+  MCSymbol *Label;
+  unsigned int FileNameOff;
+  unsigned int LineOff;
+  unsigned int LineNum;
+  unsigned int ColumnNum;
 };
 
 class MCBTFContext {
@@ -344,15 +341,13 @@ class MCBTFContext {
   friend class BTFTypeEntryFunc;
 
 public:
-  void dump(raw_ostream& OS);
+  void dump(raw_ostream &OS);
   void emitAll(MCObjectStreamer *MCOS);
   void emitCommonHeader(MCObjectStreamer *MCOS);
   void emitBTFSection(MCObjectStreamer *MCOS);
   void emitBTFExtSection(MCObjectStreamer *MCOS);
 
-  size_t addString(std::string S) {
-    return StringTable.addString(S);
-  }
+  size_t addString(std::string S) { return StringTable.addString(S); }
   void addTypeEntry(std::unique_ptr<BTFTypeEntry> Entry);
   void addFuncInfo(unsigned SecNameOff, BTFFuncInfo Info) {
     FuncInfoTable[SecNameOff].push_back(Info);
diff --git a/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp b/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp
index 8b16e389963..44484c2ae05 100644
--- a/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp
+++ b/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp
@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "DwarfUnit.h"
 #include "Dwarf2BTF.h"
+#include "DwarfUnit.h"
 #include "llvm/MC/MCBTFContext.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCSectionELF.h"
@@ -17,47 +17,47 @@ namespace llvm {
 
 Die2BTFEntry::~Die2BTFEntry() {}
 
-unsigned char Die2BTFEntry::getDieKind(const DIE & Die) {
+unsigned char Die2BTFEntry::getDieKind(const DIE &Die) {
   auto Tag = Die.getTag();
 
   switch (Tag) {
-    case dwarf::DW_TAG_base_type:
-      if (getBaseTypeEncoding(Die) == BTF_INVALID_ENCODING)
-        return BTF_KIND_UNKN;
-      return BTF_KIND_INT;
-    case dwarf::DW_TAG_const_type:
-      return BTF_KIND_CONST;
-    case dwarf::DW_TAG_pointer_type:
-      return BTF_KIND_PTR;
-    case dwarf::DW_TAG_restrict_type:
-      return BTF_KIND_RESTRICT;
-    case dwarf::DW_TAG_volatile_type:
-      return BTF_KIND_VOLATILE;
-    case dwarf::DW_TAG_typedef:
-      return BTF_KIND_TYPEDEF;
-    case dwarf::DW_TAG_structure_type:
-    case dwarf::DW_TAG_class_type:
-      if (Die.findAttribute(dwarf::DW_AT_declaration).getType()
-          != DIEValue::isNone)
-        return BTF_KIND_FWD;
-      else
-        return BTF_KIND_STRUCT;
-    case dwarf::DW_TAG_union_type:
-      if (Die.findAttribute(dwarf::DW_AT_declaration).getType()
-          != DIEValue::isNone)
-        return BTF_KIND_FWD;
-      else
-        return BTF_KIND_UNION;
-    case dwarf::DW_TAG_enumeration_type:
-      return BTF_KIND_ENUM;
-    case dwarf::DW_TAG_array_type:
-      return BTF_KIND_ARRAY;
-    case dwarf::DW_TAG_subprogram:
-      return BTF_KIND_FUNC;
-    case dwarf::DW_TAG_subroutine_type:
-      return BTF_KIND_FUNC_PROTO;
-    default:
-      break;
+  case dwarf::DW_TAG_base_type:
+    if (getBaseTypeEncoding(Die) == BTF_INVALID_ENCODING)
+      return BTF_KIND_UNKN;
+    return BTF_KIND_INT;
+  case dwarf::DW_TAG_const_type:
+    return BTF_KIND_CONST;
+  case dwarf::DW_TAG_pointer_type:
+    return BTF_KIND_PTR;
+  case dwarf::DW_TAG_restrict_type:
+    return BTF_KIND_RESTRICT;
+  case dwarf::DW_TAG_volatile_type:
+    return BTF_KIND_VOLATILE;
+  case dwarf::DW_TAG_typedef:
+    return BTF_KIND_TYPEDEF;
+  case dwarf::DW_TAG_structure_type:
+  case dwarf::DW_TAG_class_type:
+    if (Die.findAttribute(dwarf::DW_AT_declaration).getType() !=
+        DIEValue::isNone)
+      return BTF_KIND_FWD;
+    else
+      return BTF_KIND_STRUCT;
+  case dwarf::DW_TAG_union_type:
+    if (Die.findAttribute(dwarf::DW_AT_declaration).getType() !=
+        DIEValue::isNone)
+      return BTF_KIND_FWD;
+    else
+      return BTF_KIND_UNION;
+  case dwarf::DW_TAG_enumeration_type:
+    return BTF_KIND_ENUM;
+  case dwarf::DW_TAG_array_type:
+    return BTF_KIND_ARRAY;
+  case dwarf::DW_TAG_subprogram:
+    return BTF_KIND_FUNC;
+  case dwarf::DW_TAG_subroutine_type:
+    return BTF_KIND_FUNC_PROTO;
+  default:
+    break;
   }
 
   return BTF_KIND_UNKN;
@@ -67,27 +67,27 @@ std::unique_ptr<Die2BTFEntry> Die2BTFEntry::dieToBTFTypeEntry(const DIE &Die) {
   unsigned char Kind = getDieKind(Die);
 
   switch (Kind) {
-    case BTF_KIND_INT:
-      return make_unique<Die2BTFEntryInt>(Die);
-    case BTF_KIND_PTR:
-    case BTF_KIND_TYPEDEF:
-    case BTF_KIND_VOLATILE:
-    case BTF_KIND_CONST:
-    case BTF_KIND_RESTRICT:
-    case BTF_KIND_FWD:
-      return make_unique<Die2BTFEntry>(Die);
-    case BTF_KIND_ARRAY:
-      return make_unique<Die2BTFEntryArray>(Die);
-    case BTF_KIND_STRUCT:
-    case BTF_KIND_UNION:
-      return make_unique<Die2BTFEntryStruct>(Die);
-    case BTF_KIND_ENUM:
-      return make_unique<Die2BTFEntryEnum>(Die);
-    case BTF_KIND_FUNC:
-    case BTF_KIND_FUNC_PROTO:
-      return make_unique<Die2BTFEntryFunc>(Die);
-    default:
-      break;
+  case BTF_KIND_INT:
+    return make_unique<Die2BTFEntryInt>(Die);
+  case BTF_KIND_PTR:
+  case BTF_KIND_TYPEDEF:
+  case BTF_KIND_VOLATILE:
+  case BTF_KIND_CONST:
+  case BTF_KIND_RESTRICT:
+  case BTF_KIND_FWD:
+    return make_unique<Die2BTFEntry>(Die);
+  case BTF_KIND_ARRAY:
+    return make_unique<Die2BTFEntryArray>(Die);
+  case BTF_KIND_STRUCT:
+  case BTF_KIND_UNION:
+    return make_unique<Die2BTFEntryStruct>(Die);
+  case BTF_KIND_ENUM:
+    return make_unique<Die2BTFEntryEnum>(Die);
+  case BTF_KIND_FUNC:
+  case BTF_KIND_FUNC_PROTO:
+    return make_unique<Die2BTFEntryFunc>(Die);
+  default:
+    break;
   }
   return nullptr;
 }
@@ -96,20 +96,19 @@ bool Die2BTFEntry::shouldSkipDie(const DIE &Die) {
   auto Tag = Die.getTag();
 
   switch (Tag) {
-    case dwarf::DW_TAG_const_type:
-    case dwarf::DW_TAG_pointer_type:
-    case dwarf::DW_TAG_restrict_type:
-    case dwarf::DW_TAG_typedef:
-    case dwarf::DW_TAG_volatile_type:
-    {
-      auto TypeV = Die.findAttribute(dwarf::DW_AT_type);
-      if (TypeV.getType() == DIEValue::isNone)
-        return false;
-      auto &TypeDie = TypeV.getDIEEntry().getEntry();
-      return Die2BTFEntry::shouldSkipDie(TypeDie);
-    }
-    default:
-      return getDieKind(Die) == BTF_KIND_UNKN;
+  case dwarf::DW_TAG_const_type:
+  case dwarf::DW_TAG_pointer_type:
+  case dwarf::DW_TAG_restrict_type:
+  case dwarf::DW_TAG_typedef:
+  case dwarf::DW_TAG_volatile_type: {
+    auto TypeV = Die.findAttribute(dwarf::DW_AT_type);
+    if (TypeV.getType() == DIEValue::isNone)
+      return false;
+    auto &TypeDie = TypeV.getDIEEntry().getEntry();
+    return Die2BTFEntry::shouldSkipDie(TypeDie);
+  }
+  default:
+    return getDieKind(Die) == BTF_KIND_UNKN;
   }
   return true;
 }
@@ -120,26 +119,26 @@ unsigned char Die2BTFEntry::getBaseTypeEncoding(const DIE &Die) {
     return BTF_INVALID_ENCODING;
 
   switch (V.getDIEInteger().getValue()) {
-    case dwarf::DW_ATE_boolean:
-      return BTF_INT_BOOL;
-    case dwarf::DW_ATE_signed:
-      return BTF_INT_SIGNED;
-    case dwarf::DW_ATE_signed_char:
-      return BTF_INT_CHAR;
-    case dwarf::DW_ATE_unsigned:
-      return 0;
-    case dwarf::DW_ATE_unsigned_char:
-      return BTF_INT_CHAR;
-    case dwarf::DW_ATE_imaginary_float:
-    case dwarf::DW_ATE_packed_decimal:
-    case dwarf::DW_ATE_numeric_string:
-    case dwarf::DW_ATE_edited:
-    case dwarf::DW_ATE_signed_fixed:
-    case dwarf::DW_ATE_address:
-    case dwarf::DW_ATE_complex_float:
-    case dwarf::DW_ATE_float:
-    default:
-      break;
+  case dwarf::DW_ATE_boolean:
+    return BTF_INT_BOOL;
+  case dwarf::DW_ATE_signed:
+    return BTF_INT_SIGNED;
+  case dwarf::DW_ATE_signed_char:
+    return BTF_INT_CHAR;
+  case dwarf::DW_ATE_unsigned:
+    return 0;
+  case dwarf::DW_ATE_unsigned_char:
+    return BTF_INT_CHAR;
+  case dwarf::DW_ATE_imaginary_float:
+  case dwarf::DW_ATE_packed_decimal:
+  case dwarf::DW_ATE_numeric_string:
+  case dwarf::DW_ATE_edited:
+  case dwarf::DW_ATE_signed_fixed:
+  case dwarf::DW_ATE_address:
+  case dwarf::DW_ATE_complex_float:
+  case dwarf::DW_ATE_float:
+  default:
+    break;
   }
   return BTF_INVALID_ENCODING;
 }
@@ -148,53 +147,53 @@ Die2BTFEntry::Die2BTFEntry(const DIE &Die) : Die(Die) {
   unsigned char Kind = getDieKind(Die);
 
   switch (Kind) {
-    case BTF_KIND_CONST:
-    case BTF_KIND_FWD:
-    case BTF_KIND_PTR:
-    case BTF_KIND_RESTRICT:
-    case BTF_KIND_TYPEDEF:
-    case BTF_KIND_VOLATILE:
-      break;
-    default:
-      assert("Invalid Die passed into BTFTypeEntry()");
-      break;
+  case BTF_KIND_CONST:
+  case BTF_KIND_FWD:
+  case BTF_KIND_PTR:
+  case BTF_KIND_RESTRICT:
+  case BTF_KIND_TYPEDEF:
+  case BTF_KIND_VOLATILE:
+    break;
+  default:
+    assert("Invalid Die passed into BTFTypeEntry()");
+    break;
   }
 
   BTFType.info = (Kind & 0xf) << 24;
 }
 
 void Die2BTFEntry::completeData(class Dwarf2BTF &Dwarf2BTF) {
-    auto TypeV = Die.findAttribute(dwarf::DW_AT_type);
-    if (TypeV.getType() == DIEValue::isNone) {
-      BTFType.type = 0;
-    } else {
-      auto &TypeDie = TypeV.getDIEEntry().getEntry();
-      auto Type = Dwarf2BTF.getTypeIndex(TypeDie);
-      BTFType.type = Type;
-    }
+  auto TypeV = Die.findAttribute(dwarf::DW_AT_type);
+  if (TypeV.getType() == DIEValue::isNone) {
+    BTFType.type = 0;
+  } else {
+    auto &TypeDie = TypeV.getDIEEntry().getEntry();
+    auto Type = Dwarf2BTF.getTypeIndex(TypeDie);
+    BTFType.type = Type;
+  }
 
-    unsigned char Kind = getDieKind(Die);
-    if (Kind != BTF_KIND_FWD) {
-      BTFType.name_off = 0;
-    } else {
-      auto NameV = Die.findAttribute(dwarf::DW_AT_name);
-      auto Str = NameV.getDIEString().getString();
-      BTFType.name_off = Dwarf2BTF.addBTFString(Str);
-    }
+  unsigned char Kind = getDieKind(Die);
+  if (Kind != BTF_KIND_FWD) {
+    BTFType.name_off = 0;
+  } else {
+    auto NameV = Die.findAttribute(dwarf::DW_AT_name);
+    auto Str = NameV.getDIEString().getString();
+    BTFType.name_off = Dwarf2BTF.addBTFString(Str);
+  }
 
-    auto typeEntry = make_unique<BTFTypeEntry>(Id, BTFType);
-    Dwarf2BTF.addBTFTypeEntry(std::move(typeEntry));
+  auto typeEntry = make_unique<BTFTypeEntry>(Id, BTFType);
+  Dwarf2BTF.addBTFTypeEntry(std::move(typeEntry));
 }
 
 Die2BTFEntryInt::Die2BTFEntryInt(const DIE &Die) : Die2BTFEntry(Die) {
   unsigned char Kind = getDieKind(Die);
 
   switch (Kind) {
-    case BTF_KIND_INT:
-      break;
-    default:
-      assert("Invalid Die passed into BTFTypeEntryInt()");
-      break;
+  case BTF_KIND_INT:
+    break;
+  default:
+    assert("Invalid Die passed into BTFTypeEntryInt()");
+    break;
   }
 
   // handle BTF_INT_ENCODING in IntVal
@@ -212,7 +211,7 @@ Die2BTFEntryInt::Die2BTFEntryInt(const DIE &Die) : Die2BTFEntry(Die) {
   V = Die.findAttribute(dwarf::DW_AT_byte_size);
   __u32 Size = V.getDIEInteger().getValue() & 0xffffffff;
 
-// handle BTF_INT_BITS in IntVal
+  // handle BTF_INT_BITS in IntVal
   V = Die.findAttribute(dwarf::DW_AT_bit_size);
   if (V.getType() == DIEValue::isInteger)
     IntVal |= V.getDIEInteger().getValue() & 0xff;
@@ -225,14 +224,14 @@ Die2BTFEntryInt::Die2BTFEntryInt(const DIE &Die) : Die2BTFEntry(Die) {
 }
 
 void Die2BTFEntryInt::completeData(class Dwarf2BTF &Dwarf2BTF) {
-    auto NameV = Die.findAttribute(dwarf::DW_AT_name);
-    auto TypeV = Die.findAttribute(dwarf::DW_AT_type);
-    auto Str = NameV.getDIEString().getString();
+  auto NameV = Die.findAttribute(dwarf::DW_AT_name);
+  auto TypeV = Die.findAttribute(dwarf::DW_AT_type);
+  auto Str = NameV.getDIEString().getString();
 
-    BTFType.name_off = Dwarf2BTF.addBTFString(Str);
+  BTFType.name_off = Dwarf2BTF.addBTFString(Str);
 
-    auto typeEntry = make_unique<BTFTypeEntryInt>(Id, BTFType, IntVal);
-    Dwarf2BTF.addBTFTypeEntry(std::move(typeEntry));
+  auto typeEntry = make_unique<BTFTypeEntryInt>(Id, BTFType, IntVal);
+  Dwarf2BTF.addBTFTypeEntry(std::move(typeEntry));
 }
 
 Die2BTFEntryEnum::Die2BTFEntryEnum(const DIE &Die) : Die2BTFEntry(Die) {
@@ -275,8 +274,7 @@ void Die2BTFEntryEnum::completeData(class Dwarf2BTF &Dwarf2BTF) {
   Dwarf2BTF.addBTFTypeEntry(std::move(typeEntry));
 }
 
-Die2BTFEntryArray::Die2BTFEntryArray(const DIE &Die) :
-    Die2BTFEntry(Die) {
+Die2BTFEntryArray::Die2BTFEntryArray(const DIE &Die) : Die2BTFEntry(Die) {
   BTFType.info = (BTF_KIND_ARRAY << 24);
   BTFType.size = 0;
 }
@@ -289,7 +287,8 @@ void Die2BTFEntryArray::completeData(class Dwarf2BTF &Dwarf2BTF) {
     Str = NameV.getDIEString().getString();
   BTFType.name_off = Dwarf2BTF.addBTFString(Str);
 
-  auto &ArrayTypeDie = Die.findAttribute(dwarf::DW_AT_type).getDIEEntry().getEntry();
+  auto &ArrayTypeDie =
+      Die.findAttribute(dwarf::DW_AT_type).getDIEEntry().getEntry();
   ArrayInfo.type = Dwarf2BTF.getTypeIndex(ArrayTypeDie);
 
   // The number of elements should count all subranges
@@ -342,7 +341,6 @@ void Die2BTFEntryStruct::completeData(class Dwarf2BTF &Dwarf2BTF) {
   } else
     BTFType.name_off = 0;
 
-
   for (auto &ChildDie : Die.children()) {
     if (ChildDie.getTag() != dwarf::DW_TAG_member)
       continue;
@@ -456,7 +454,7 @@ void Die2BTFEntryFunc::completeData(class Dwarf2BTF &Dwarf2BTF) {
 }
 
 Dwarf2BTF::Dwarf2BTF(MCContext &Context, bool IsLittleEndian)
-  : OuterCtx(Context), IsLE(IsLittleEndian) {
+    : OuterCtx(Context), IsLE(IsLittleEndian) {
   BTFContext = make_unique<MCBTFContext>();
 }
 
@@ -470,7 +468,7 @@ void Dwarf2BTF::addTypeEntry(const DIE &Die) {
     auto TypeEntry = Die2BTFEntry::dieToBTFTypeEntry(Die);
     if (TypeEntry != nullptr) {
       TypeEntry->setId(TypeEntries.size() + 1);
-      DieToIdMap[const_cast<DIE*>(&Die)] = TypeEntry->getId();
+      DieToIdMap[const_cast<DIE *>(&Die)] = TypeEntry->getId();
       TypeEntries.push_back(std::move(TypeEntry));
     }
   }
@@ -500,4 +498,4 @@ void Dwarf2BTF::finish() {
   OuterCtx.setBTFContext(std::move(BTFContext));
 }
 
-}
+} // namespace llvm
diff --git a/lib/CodeGen/AsmPrinter/Dwarf2BTF.h b/lib/CodeGen/AsmPrinter/Dwarf2BTF.h
index 125441d37b3..a472d68ed7e 100644
--- a/lib/CodeGen/AsmPrinter/Dwarf2BTF.h
+++ b/lib/CodeGen/AsmPrinter/Dwarf2BTF.h
@@ -26,7 +26,7 @@ class MCBTFContext;
 class Die2BTFEntry {
 protected:
   const DIE &Die;
-  size_t Id;  /* type index in the BTF list, started from 1 */
+  size_t Id; /* type index in the BTF list, started from 1 */
   struct btf_type BTFType;
 
 public:
@@ -54,7 +54,7 @@ public:
 
 // BTF_KIND_INT
 class Die2BTFEntryInt : public Die2BTFEntry {
-  __u32 IntVal;  // encoding, offset, bits
+  __u32 IntVal; // encoding, offset, bits
 
 public:
   Die2BTFEntryInt(const DIE &Die);
@@ -99,7 +99,7 @@ public:
 
 class Dwarf2BTF {
   std::vector<std::unique_ptr<Die2BTFEntry>> TypeEntries;
-  std::map<DIE*, size_t> DieToIdMap;
+  std::map<DIE *, size_t> DieToIdMap;
   std::unique_ptr<MCBTFContext> BTFContext;
   MCContext &OuterCtx;
   bool IsLE;
@@ -110,14 +110,12 @@ public:
   void addDwarfCU(DwarfUnit *TheU);
   void finish();
   __u32 getTypeIndex(DIE &Die) {
-    DIE *DiePtr = const_cast<DIE*>(&Die);
+    DIE *DiePtr = const_cast<DIE *>(&Die);
     assert((DieToIdMap.find(DiePtr) != DieToIdMap.end()) &&
            "Die not added to in the BTFContext");
     return DieToIdMap[DiePtr];
   }
-  size_t addBTFString(std::string S) {
-    return BTFContext->addString(S);
-  }
+  size_t addBTFString(std::string S) { return BTFContext->addString(S); }
   void addBTFTypeEntry(std::unique_ptr<BTFTypeEntry> Entry);
   void addBTFFuncInfo(unsigned SecNameOff, BTFFuncInfo FuncInfo) {
     BTFContext->addFuncInfo(SecNameOff, FuncInfo);
@@ -126,10 +124,10 @@ public:
 private:
   void addTypeEntry(const DIE &Die);
   bool alreadyAdded(DIE &Die) {
-    return DieToIdMap.find(const_cast<DIE*>(&Die)) != DieToIdMap.end();
+    return DieToIdMap.find(const_cast<DIE *>(&Die)) != DieToIdMap.end();
   }
   void completeData();
 };
 
-}
+} // namespace llvm
 #endif
diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.h b/lib/CodeGen/AsmPrinter/DwarfFile.h
index 9aafe2613f6..114f98f725d 100644
--- a/lib/CodeGen/AsmPrinter/DwarfFile.h
+++ b/lib/CodeGen/AsmPrinter/DwarfFile.h
@@ -137,9 +137,7 @@ public:
     return ScopeVariables;
   }
 
-  DenseMap<LexicalScope *, LabelList> &getScopeLabels() {
-    return ScopeLabels;
-  }
+  DenseMap<LexicalScope *, LabelList> &getScopeLabels() { return ScopeLabels; }
 
   DenseMap<const MDNode *, DIE *> &getAbstractSPDies() {
     return AbstractSPDies;
diff --git a/lib/MC/MCBTFContext.cpp b/lib/MC/MCBTFContext.cpp
index d1c30dd0b88..cb121c41552 100644
--- a/lib/MC/MCBTFContext.cpp
+++ b/lib/MC/MCBTFContext.cpp
@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCBTFContext.h"
+#include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCObjectFileInfo.h"
 #include "llvm/MC/MCObjectStreamer.h"
 #include <cstdlib>
@@ -22,7 +22,7 @@ using namespace llvm;
 BTFTypeEntry::~BTFTypeEntry() {}
 
 void MCBTFContext::addTypeEntry(std::unique_ptr<BTFTypeEntry> Entry) {
-   TypeEntries.push_back(std::move(Entry));
+  TypeEntries.push_back(std::move(Entry));
 }
 
 void MCBTFContext::dump(raw_ostream &OS) {
@@ -39,8 +39,7 @@ void MCBTFContext::dump(raw_ostream &OS) {
   for (auto &FuncSec : FuncInfoTable) {
     OS << "sec_name_off=" << FuncSec.first << "\n";
     for (auto &FuncInfo : FuncSec.second) {
-      OS << "\tinsn_offset=<Omitted> type_id="
-         << FuncInfo.TypeId << "\n";
+      OS << "\tinsn_offset=<Omitted> type_id=" << FuncInfo.TypeId << "\n";
     }
   }
 
@@ -48,12 +47,9 @@ void MCBTFContext::dump(raw_ostream &OS) {
   for (auto &LineSec : LineInfoTable) {
     OS << "sec_name_off=" << LineSec.first << "\n";
     for (auto &LineInfo : LineSec.second) {
-      OS << "\tinsn_offset=<Omitted> file_name_off="
-         << LineInfo.FileNameOff
-         << " line_off=" << LineInfo.LineOff
-         << " line_num=" << LineInfo.LineNum
-         << " column_num=" << LineInfo.ColumnNum
-         << "\n";
+      OS << "\tinsn_offset=<Omitted> file_name_off=" << LineInfo.FileNameOff
+         << " line_off=" << LineInfo.LineOff << " line_num=" << LineInfo.LineNum
+         << " column_num=" << LineInfo.ColumnNum << "\n";
     }
   }
 }
@@ -83,7 +79,7 @@ void MCBTFContext::emitBTFSection(MCObjectStreamer *MCOS) {
   MCOS->EmitIntValue(str_len, 4);
 
   // emit type table
-  for (auto &TypeEntry: TypeEntries)
+  for (auto &TypeEntry : TypeEntries)
     TypeEntry->emitData(MCOS);
 
   // emit string table
@@ -146,9 +142,8 @@ void MCBTFContext::emitAll(MCObjectStreamer *MCOS) {
   emitBTFExtSection(MCOS);
 }
 
-void BTFTypeEntry::print(raw_ostream &OS, MCBTFContext& MCBTFContext) {
-  OS << "[" << Id << "] "
-     << btf_kind_str[BTF_INFO_KIND(BTFType.info)]
+void BTFTypeEntry::print(raw_ostream &OS, MCBTFContext &MCBTFContext) {
+  OS << "[" << Id << "] " << btf_kind_str[BTF_INFO_KIND(BTFType.info)]
      << " name_off=" << BTFType.name_off
      << " info=" << format("0x%08lx", BTFType.info)
      << " size/type=" << BTFType.size << "\n";
@@ -160,7 +155,7 @@ void BTFTypeEntry::emitData(MCObjectStreamer *MCOS) {
   MCOS->EmitIntValue(BTFType.size, 4);
 }
 
-void BTFTypeEntryInt::print(raw_ostream &OS, MCBTFContext& MCBTFContext) {
+void BTFTypeEntryInt::print(raw_ostream &OS, MCBTFContext &MCBTFContext) {
   BTFTypeEntry::print(OS, MCBTFContext);
   OS << "\tdesc=" << format("0x%08lx", IntVal) << "\n";
 }
@@ -170,12 +165,12 @@ void BTFTypeEntryInt::emitData(MCObjectStreamer *MCOS) {
   MCOS->EmitIntValue(IntVal, 4);
 }
 
-void BTFTypeEntryEnum::print(raw_ostream &OS, MCBTFContext& MCBTFContext) {
- BTFTypeEntry::print(OS, MCBTFContext);
+void BTFTypeEntryEnum::print(raw_ostream &OS, MCBTFContext &MCBTFContext) {
+  BTFTypeEntry::print(OS, MCBTFContext);
   for (size_t i = 0; i < BTF_INFO_VLEN(BTFType.info); i++) {
     auto &EnumValue = EnumValues[i];
-    OS << "\tname_off=" << EnumValue.name_off
-       << " value=" << EnumValue.val << "\n";
+    OS << "\tname_off=" << EnumValue.name_off << " value=" << EnumValue.val
+       << "\n";
   }
 }
 
@@ -187,7 +182,7 @@ void BTFTypeEntryEnum::emitData(MCObjectStreamer *MCOS) {
   }
 }
 
-void BTFTypeEntryArray::print(raw_ostream &OS, MCBTFContext& MCBTFContext) {
+void BTFTypeEntryArray::print(raw_ostream &OS, MCBTFContext &MCBTFContext) {
   BTFTypeEntry::print(OS, MCBTFContext);
   OS << "\telem_type=" << format("0x%08lx", ArrayInfo.type)
      << " index_type=" << format("0x%08lx", ArrayInfo.index_type)
@@ -201,12 +196,11 @@ void BTFTypeEntryArray::emitData(MCObjectStreamer *MCOS) {
   MCOS->EmitIntValue(ArrayInfo.nelems, 4);
 }
 
-void BTFTypeEntryStruct::print(raw_ostream &OS, MCBTFContext& MCBTFContext) {
+void BTFTypeEntryStruct::print(raw_ostream &OS, MCBTFContext &MCBTFContext) {
   BTFTypeEntry::print(OS, MCBTFContext);
-   for (size_t i = 0; i < BTF_INFO_VLEN(BTFType.info); i++) {
+  for (size_t i = 0; i < BTF_INFO_VLEN(BTFType.info); i++) {
     auto &Member = Members[i];
-    OS << "\tname_off=" << Member.name_off
-       << " type=" << Member.type
+    OS << "\tname_off=" << Member.name_off << " type=" << Member.type
        << " bit_offset=" << Member.offset << "\n";
   }
 }
@@ -220,9 +214,9 @@ void BTFTypeEntryStruct::emitData(MCObjectStreamer *MCOS) {
   }
 }
 
-void BTFTypeEntryFunc::print(raw_ostream &OS, MCBTFContext& MCBTFContext) {
+void BTFTypeEntryFunc::print(raw_ostream &OS, MCBTFContext &MCBTFContext) {
   BTFTypeEntry::print(OS, MCBTFContext);
-   for (size_t i = 0; i < BTF_INFO_VLEN(BTFType.info); i++) {
+  for (size_t i = 0; i < BTF_INFO_VLEN(BTFType.info); i++) {
     auto Parameter = Parameters[i];
     OS << "\tparam_type=" << Parameter << "\n";
   }
@@ -230,6 +224,6 @@ void BTFTypeEntryFunc::print(raw_ostream &OS, MCBTFContext& MCBTFContext) {
 
 void BTFTypeEntryFunc::emitData(MCObjectStreamer *MCOS) {
   BTFTypeEntry::emitData(MCOS);
-  for (auto &Parameter: Parameters)
+  for (auto &Parameter : Parameters)
     MCOS->EmitIntValue(Parameter, 4);
 }
diff --git a/lib/MC/MCDwarf2BTF.cpp b/lib/MC/MCDwarf2BTF.cpp
index 08a70e6f318..9809a2153ec 100644
--- a/lib/MC/MCDwarf2BTF.cpp
+++ b/lib/MC/MCDwarf2BTF.cpp
@@ -10,11 +10,11 @@
 #include "MCDwarf2BTF.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCBTFContext.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCObjectStreamer.h"
 #include "llvm/MC/MCSection.h"
 #include "llvm/MC/MCSectionELF.h"
-#include "llvm/MC/MCBTFContext.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/EndianStream.h"
 #include <fstream>
@@ -22,7 +22,7 @@
 using namespace llvm;
 
 void MCDwarf2BTF::addFiles(MCObjectStreamer *MCOS, std::string &FileName,
-  std::vector<FileContent> &Files) {
+                           std::vector<FileContent> &Files) {
   std::vector<std::string> Content;
 
   std::ifstream Inputfile(FileName);
@@ -34,9 +34,10 @@ void MCDwarf2BTF::addFiles(MCObjectStreamer *MCOS, std::string &FileName,
   Files.push_back(FileContent(FileName, Content));
 }
 
-void MCDwarf2BTF::addLines(MCObjectStreamer *MCOS, StringRef &SectionName,
-  std::vector<FileContent> &Files,
-  const MCLineSection::MCDwarfLineEntryCollection &LineEntries) {
+void MCDwarf2BTF::addLines(
+    MCObjectStreamer *MCOS, StringRef &SectionName,
+    std::vector<FileContent> &Files,
+    const MCLineSection::MCDwarfLineEntryCollection &LineEntries) {
   MCContext &Context = MCOS->getContext();
   auto &BTFCxt = Context.getBTFContext();
 
@@ -84,9 +85,11 @@ void MCDwarf2BTF::addDwarfLineInfo(MCObjectStreamer *MCOS) {
         FileName = Dirs[File.DirIndex - 1] + "/" + File.Name;
       MCDwarf2BTF::addFiles(MCOS, FileName, Files);
     }
-    for (const auto &LineSec: CUIDTablePair.second.getMCLineSections().getMCLineEntries()) {
+    for (const auto &LineSec :
+         CUIDTablePair.second.getMCLineSections().getMCLineEntries()) {
       MCSection *Section = LineSec.first;
-      const MCLineSection::MCDwarfLineEntryCollection &LineEntries = LineSec.second;
+      const MCLineSection::MCDwarfLineEntryCollection &LineEntries =
+          LineSec.second;
 
       StringRef SectionName;
       if (MCSectionELF *SectionELF = dyn_cast<MCSectionELF>(Section))
diff --git a/lib/MC/MCDwarf2BTF.h b/lib/MC/MCDwarf2BTF.h
index 22d1b7741a5..69983374a09 100644
--- a/lib/MC/MCDwarf2BTF.h
+++ b/lib/MC/MCDwarf2BTF.h
@@ -18,12 +18,13 @@ using FileContent = std::pair<std::string, std::vector<std::string>>;
 class MCDwarf2BTF {
 public:
   static void addFiles(MCObjectStreamer *MCOS, std::string &FileName,
-    std::vector<FileContent> &Files);
-  static void addLines(MCObjectStreamer *MCOS, StringRef &SectionName,
-    std::vector<FileContent> &Files,
-    const MCLineSection::MCDwarfLineEntryCollection &LineEntries);
+                       std::vector<FileContent> &Files);
+  static void
+  addLines(MCObjectStreamer *MCOS, StringRef &SectionName,
+           std::vector<FileContent> &Files,
+           const MCLineSection::MCDwarfLineEntryCollection &LineEntries);
   static void addDwarfLineInfo(MCObjectStreamer *MCOS);
 };
 
-}
+} // namespace llvm
 #endif
-- 
GitLab


From 9a80e3fe5a8cbb8e16744aded8c3015b3873153a Mon Sep 17 00:00:00 2001
From: Eric Liu <ioeric@google.com>
Date: Fri, 12 Oct 2018 17:55:21 +0000
Subject: [PATCH 0118/1116] Disambiguate: s/make_unique/llvm::make_unique/. NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344385 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp b/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp
index 44484c2ae05..5afd2c902ca 100644
--- a/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp
+++ b/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp
@@ -68,24 +68,24 @@ std::unique_ptr<Die2BTFEntry> Die2BTFEntry::dieToBTFTypeEntry(const DIE &Die) {
 
   switch (Kind) {
   case BTF_KIND_INT:
-    return make_unique<Die2BTFEntryInt>(Die);
+    return llvm::make_unique<Die2BTFEntryInt>(Die);
   case BTF_KIND_PTR:
   case BTF_KIND_TYPEDEF:
   case BTF_KIND_VOLATILE:
   case BTF_KIND_CONST:
   case BTF_KIND_RESTRICT:
   case BTF_KIND_FWD:
-    return make_unique<Die2BTFEntry>(Die);
+    return llvm::make_unique<Die2BTFEntry>(Die);
   case BTF_KIND_ARRAY:
-    return make_unique<Die2BTFEntryArray>(Die);
+    return llvm::make_unique<Die2BTFEntryArray>(Die);
   case BTF_KIND_STRUCT:
   case BTF_KIND_UNION:
-    return make_unique<Die2BTFEntryStruct>(Die);
+    return llvm::make_unique<Die2BTFEntryStruct>(Die);
   case BTF_KIND_ENUM:
-    return make_unique<Die2BTFEntryEnum>(Die);
+    return llvm::make_unique<Die2BTFEntryEnum>(Die);
   case BTF_KIND_FUNC:
   case BTF_KIND_FUNC_PROTO:
-    return make_unique<Die2BTFEntryFunc>(Die);
+    return llvm::make_unique<Die2BTFEntryFunc>(Die);
   default:
     break;
   }
@@ -181,7 +181,7 @@ void Die2BTFEntry::completeData(class Dwarf2BTF &Dwarf2BTF) {
     BTFType.name_off = Dwarf2BTF.addBTFString(Str);
   }
 
-  auto typeEntry = make_unique<BTFTypeEntry>(Id, BTFType);
+  auto typeEntry = llvm::make_unique<BTFTypeEntry>(Id, BTFType);
   Dwarf2BTF.addBTFTypeEntry(std::move(typeEntry));
 }
 
@@ -230,7 +230,7 @@ void Die2BTFEntryInt::completeData(class Dwarf2BTF &Dwarf2BTF) {
 
   BTFType.name_off = Dwarf2BTF.addBTFString(Str);
 
-  auto typeEntry = make_unique<BTFTypeEntryInt>(Id, BTFType, IntVal);
+  auto typeEntry = llvm::make_unique<BTFTypeEntryInt>(Id, BTFType, IntVal);
   Dwarf2BTF.addBTFTypeEntry(std::move(typeEntry));
 }
 
@@ -270,7 +270,7 @@ void Die2BTFEntryEnum::completeData(class Dwarf2BTF &Dwarf2BTF) {
     EnumValues.push_back(BTFEnum);
   }
 
-  auto typeEntry = make_unique<BTFTypeEntryEnum>(Id, BTFType, EnumValues);
+  auto typeEntry = llvm::make_unique<BTFTypeEntryEnum>(Id, BTFType, EnumValues);
   Dwarf2BTF.addBTFTypeEntry(std::move(typeEntry));
 }
 
@@ -313,7 +313,7 @@ void Die2BTFEntryArray::completeData(class Dwarf2BTF &Dwarf2BTF) {
   }
   ArrayInfo.nelems = Nelems;
 
-  auto TypeEntry = make_unique<BTFTypeEntryArray>(Id, BTFType, ArrayInfo);
+  auto TypeEntry = llvm::make_unique<BTFTypeEntryArray>(Id, BTFType, ArrayInfo);
   Dwarf2BTF.addBTFTypeEntry(std::move(TypeEntry));
 }
 
@@ -378,7 +378,7 @@ void Die2BTFEntryStruct::completeData(class Dwarf2BTF &Dwarf2BTF) {
     Members.push_back(BTFMember);
   }
 
-  auto typeEntry = make_unique<BTFTypeEntryStruct>(Id, BTFType, Members);
+  auto typeEntry = llvm::make_unique<BTFTypeEntryStruct>(Id, BTFType, Members);
   Dwarf2BTF.addBTFTypeEntry(std::move(typeEntry));
 }
 
@@ -428,7 +428,7 @@ void Die2BTFEntryFunc::completeData(class Dwarf2BTF &Dwarf2BTF) {
     }
   }
 
-  auto typeEntry = make_unique<BTFTypeEntryFunc>(Id, BTFType, Parameters);
+  auto typeEntry = llvm::make_unique<BTFTypeEntryFunc>(Id, BTFType, Parameters);
   Dwarf2BTF.addBTFTypeEntry(std::move(typeEntry));
 
   if (BTF_INFO_KIND(BTFType.info) == BTF_KIND_FUNC) {
@@ -455,7 +455,7 @@ void Die2BTFEntryFunc::completeData(class Dwarf2BTF &Dwarf2BTF) {
 
 Dwarf2BTF::Dwarf2BTF(MCContext &Context, bool IsLittleEndian)
     : OuterCtx(Context), IsLE(IsLittleEndian) {
-  BTFContext = make_unique<MCBTFContext>();
+  BTFContext = llvm::make_unique<MCBTFContext>();
 }
 
 void Dwarf2BTF::addTypeEntry(const DIE &Die) {
-- 
GitLab


From ee500a522114e465a9738a5969705969cfe5243e Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Fri, 12 Oct 2018 17:57:07 +0000
Subject: [PATCH 0119/1116] [BPF] Use cstdint {,u}int*_t instead of
 linux/types.h __u32 __u16 ...

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344387 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/MC/MCBTFContext.h       | 85 +++++++++++++---------------
 lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp | 12 ++--
 lib/CodeGen/AsmPrinter/Dwarf2BTF.h   |  6 +-
 3 files changed, 48 insertions(+), 55 deletions(-)

diff --git a/include/llvm/MC/MCBTFContext.h b/include/llvm/MC/MCBTFContext.h
index 43103273acb..5b4bafd7aee 100644
--- a/include/llvm/MC/MCBTFContext.h
+++ b/include/llvm/MC/MCBTFContext.h
@@ -18,27 +18,20 @@
 #include <map>
 #include <vector>
 
-typedef __signed__ char __s8;
-typedef unsigned char __u8;
-typedef __signed__ short __s16;
-typedef unsigned short __u16;
-typedef __signed__ int __s32;
-typedef unsigned int __u32;
-
 #define BTF_MAGIC 0xeB9F
 #define BTF_VERSION 1
 
 struct btf_header {
-  __u16 magic;
-  __u8 version;
-  __u8 flags;
-  __u32 hdr_len;
+  uint16_t magic;
+  uint8_t version;
+  uint8_t flags;
+  uint32_t hdr_len;
 
   /* All offsets are in bytes relative to the end of this header */
-  __u32 type_off; /* offset of type section	*/
-  __u32 type_len; /* length of type section	*/
-  __u32 str_off;  /* offset of string section	*/
-  __u32 str_len;  /* length of string section	*/
+  uint32_t type_off; // offset of type section
+  uint32_t type_len; // length of type section
+  uint32_t str_off;  // offset of string section
+  uint32_t str_len;  // length of string section
 };
 
 /* Max # of type identifier */
@@ -49,14 +42,14 @@ struct btf_header {
 #define BTF_MAX_VLEN 0xffff
 
 struct btf_type {
-  __u32 name_off;
+  uint32_t name_off;
   /* "info" bits arrangement
    * bits  0-15: vlen (e.g. # of struct's members)
    * bits 16-23: unused
    * bits 24-27: kind (e.g. int, ptr, array...etc)
    * bits 28-31: unused
    */
-  __u32 info;
+  uint32_t info;
   /* "size" is used by INT, ENUM, STRUCT and UNION.
    * "size" tells the size of the type it is describing.
    *
@@ -65,8 +58,8 @@ struct btf_type {
    * "type" is a type_id referring to another type.
    */
   union {
-    __u32 size;
-    __u32 type;
+    uint32_t size;
+    uint32_t type;
   };
 };
 
@@ -111,15 +104,15 @@ struct btf_type {
  * info in "struct btf_type").
  */
 struct btf_enum {
-  __u32 name_off;
-  __s32 val;
+  uint32_t name_off;
+  int32_t val;
 };
 
 /* BTF_KIND_ARRAY is followed by one "struct btf_array" */
 struct btf_array {
-  __u32 type;
-  __u32 index_type;
-  __u32 nelems;
+  uint32_t type;
+  uint32_t index_type;
+  uint32_t nelems;
 };
 
 /* BTF_KIND_STRUCT and BTF_KIND_UNION are followed
@@ -128,45 +121,45 @@ struct btf_array {
  * "struct btf_type").
  */
 struct btf_member {
-  __u32 name_off;
-  __u32 type;
-  __u32 offset; /* offset in bits */
+  uint32_t name_off;
+  uint32_t type;
+  uint32_t offset; /* offset in bits */
 };
 
 /* .BTF.ext section contains func_info and line_info.
  */
 struct btf_ext_header {
-  __u16 magic;
-  __u8 version;
-  __u8 flags;
-  __u32 hdr_len;
-
-  __u32 func_info_off;
-  __u32 func_info_len;
-  __u32 line_info_off;
-  __u32 line_info_len;
+  uint16_t magic;
+  uint8_t version;
+  uint8_t flags;
+  uint32_t hdr_len;
+
+  uint32_t func_info_off;
+  uint32_t func_info_len;
+  uint32_t line_info_off;
+  uint32_t line_info_len;
 };
 
 struct bpf_func_info {
-  __u32 insn_offset;
-  __u32 type_id;
+  uint32_t insn_offset;
+  uint32_t type_id;
 };
 
 struct btf_sec_func_info {
-  __u32 sec_name_off;
-  __u32 num_func_info;
+  uint32_t sec_name_off;
+  uint32_t num_func_info;
 };
 
 struct bpf_line_info {
-  __u32 insn_offset;
-  __u32 file_name_off;
-  __u32 line_off;
-  __u32 line_col; /* line num: line_col >> 10, col num: line_col & 0x3ff */
+  uint32_t insn_offset;
+  uint32_t file_name_off;
+  uint32_t line_off;
+  uint32_t line_col; /* line num: line_col >> 10, col num: line_col & 0x3ff */
 };
 
 struct btf_sec_line_info {
-  __u32 sec_name_off;
-  __u32 num_line_info;
+  uint32_t sec_name_off;
+  uint32_t num_line_info;
 };
 
 namespace llvm {
diff --git a/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp b/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp
index 5afd2c902ca..20cc61df9b6 100644
--- a/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp
+++ b/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp
@@ -200,7 +200,7 @@ Die2BTFEntryInt::Die2BTFEntryInt(const DIE &Die) : Die2BTFEntry(Die) {
   auto Encoding = Die2BTFEntry::getBaseTypeEncoding(Die);
   assert((Encoding != BTF_INVALID_ENCODING) &&
          "Invalid Die passed to BTFTypeEntryInt()");
-  __u32 IntVal = (Encoding & 0xf) << 24;
+  uint32_t IntVal = (Encoding & 0xf) << 24;
 
   // handle BTF_INT_OFFSET in IntVal
   auto V = Die.findAttribute(dwarf::DW_AT_bit_offset);
@@ -209,7 +209,7 @@ Die2BTFEntryInt::Die2BTFEntryInt(const DIE &Die) : Die2BTFEntry(Die) {
 
   // get btf_type.size
   V = Die.findAttribute(dwarf::DW_AT_byte_size);
-  __u32 Size = V.getDIEInteger().getValue() & 0xffffffff;
+  uint32_t Size = V.getDIEInteger().getValue() & 0xffffffff;
 
   // handle BTF_INT_BITS in IntVal
   V = Die.findAttribute(dwarf::DW_AT_bit_size);
@@ -237,7 +237,7 @@ void Die2BTFEntryInt::completeData(class Dwarf2BTF &Dwarf2BTF) {
 Die2BTFEntryEnum::Die2BTFEntryEnum(const DIE &Die) : Die2BTFEntry(Die) {
   // get btf_type.size
   auto V = Die.findAttribute(dwarf::DW_AT_byte_size);
-  __u32 Size = V.getDIEInteger().getValue() & 0xffffffff;
+  uint32_t Size = V.getDIEInteger().getValue() & 0xffffffff;
 
   int Vlen = 0;
   for (auto &ChildDie : Die.children())
@@ -265,7 +265,7 @@ void Die2BTFEntryEnum::completeData(class Dwarf2BTF &Dwarf2BTF) {
 
     BTFEnum.name_off = Dwarf2BTF.addBTFString(Str);
     auto ChildValueV = ChildDie.findAttribute(dwarf::DW_AT_const_value);
-    BTFEnum.val = (__s32)(ChildValueV.getDIEInteger().getValue());
+    BTFEnum.val = (int32_t)(ChildValueV.getDIEInteger().getValue());
 
     EnumValues.push_back(BTFEnum);
   }
@@ -308,7 +308,7 @@ void Die2BTFEntryArray::completeData(class Dwarf2BTF &Dwarf2BTF) {
         Nelems = 0;
         break;
       }
-      Nelems *= (__u32)(CountV.getDIEInteger().getValue());
+      Nelems *= (uint32_t)(CountV.getDIEInteger().getValue());
     }
   }
   ArrayInfo.nelems = Nelems;
@@ -320,7 +320,7 @@ void Die2BTFEntryArray::completeData(class Dwarf2BTF &Dwarf2BTF) {
 Die2BTFEntryStruct::Die2BTFEntryStruct(const DIE &Die) : Die2BTFEntry(Die) {
   // get btf_type.size
   auto V = Die.findAttribute(dwarf::DW_AT_byte_size);
-  __u32 Size = V.getDIEInteger().getValue() & 0xffffffff;
+  uint32_t Size = V.getDIEInteger().getValue() & 0xffffffff;
   auto Kind = Die2BTFEntry::getDieKind(Die);
 
   int Vlen = 0;
diff --git a/lib/CodeGen/AsmPrinter/Dwarf2BTF.h b/lib/CodeGen/AsmPrinter/Dwarf2BTF.h
index a472d68ed7e..ae13847214c 100644
--- a/lib/CodeGen/AsmPrinter/Dwarf2BTF.h
+++ b/lib/CodeGen/AsmPrinter/Dwarf2BTF.h
@@ -54,7 +54,7 @@ public:
 
 // BTF_KIND_INT
 class Die2BTFEntryInt : public Die2BTFEntry {
-  __u32 IntVal; // encoding, offset, bits
+  uint32_t IntVal; // encoding, offset, bits
 
 public:
   Die2BTFEntryInt(const DIE &Die);
@@ -90,7 +90,7 @@ public:
 
 // BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO
 class Die2BTFEntryFunc : public Die2BTFEntry {
-  std::vector<__u32> Parameters;
+  std::vector<uint32_t> Parameters;
 
 public:
   Die2BTFEntryFunc(const DIE &Die);
@@ -109,7 +109,7 @@ public:
   bool isLittleEndian() { return IsLE; }
   void addDwarfCU(DwarfUnit *TheU);
   void finish();
-  __u32 getTypeIndex(DIE &Die) {
+  uint32_t getTypeIndex(DIE &Die) {
     DIE *DiePtr = const_cast<DIE *>(&Die);
     assert((DieToIdMap.find(DiePtr) != DieToIdMap.end()) &&
            "Die not added to in the BTFContext");
-- 
GitLab


From f5782f7024e1f46381bd455f8eefb147669766e1 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 12 Oct 2018 18:10:04 +0000
Subject: [PATCH 0120/1116] Fix MCBTF string array initialization so its MSVC
 friendly. NFCI.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344390 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/MC/MCBTFContext.h | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/include/llvm/MC/MCBTFContext.h b/include/llvm/MC/MCBTFContext.h
index 5b4bafd7aee..5ef35f12609 100644
--- a/include/llvm/MC/MCBTFContext.h
+++ b/include/llvm/MC/MCBTFContext.h
@@ -165,13 +165,20 @@ struct btf_sec_line_info {
 namespace llvm {
 
 const char *const btf_kind_str[NR_BTF_KINDS] = {
-    [BTF_KIND_UNKN] = "UNKNOWN",    [BTF_KIND_INT] = "INT",
-    [BTF_KIND_PTR] = "PTR",         [BTF_KIND_ARRAY] = "ARRAY",
-    [BTF_KIND_STRUCT] = "STRUCT",   [BTF_KIND_UNION] = "UNION",
-    [BTF_KIND_ENUM] = "ENUM",       [BTF_KIND_FWD] = "FWD",
-    [BTF_KIND_TYPEDEF] = "TYPEDEF", [BTF_KIND_VOLATILE] = "VOLATILE",
-    [BTF_KIND_CONST] = "CONST",     [BTF_KIND_RESTRICT] = "RESTRICT",
-    [BTF_KIND_FUNC] = "FUNC",       [BTF_KIND_FUNC_PROTO] = "FUNC_PROTO",
+    "UNKNOWN",    /* BTF_KIND_UNKN */
+    "INT",        /* BTF_KIND_INT */
+    "PTR",        /* BTF_KIND_PTR */
+    "ARRAY",      /* BTF_KIND_ARRAY */
+    "STRUCT",     /* BTF_KIND_STRUCT */
+    "UNION",      /* BTF_KIND_UNION */
+    "ENUM",       /* BTF_KIND_ENUM */
+    "FWD",        /* BTF_KIND_FWD */
+    "TYPEDEF",    /* BTF_KIND_TYPEDEF */
+    "VOLATILE",   /* BTF_KIND_VOLATILE */
+    "CONST",      /* BTF_KIND_CONST */
+    "RESTRICT",   /* BTF_KIND_CONST */
+    "FUNC",       /* BTF_KIND_FUNC */
+    "FUNC_PROTO", /* BTF_KIND_FUNC_PROTO */
 };
 
 class MCBTFContext;
-- 
GitLab


From 40c1d29a9d14e55fbe5db9fbb3f433ef01af4e5f Mon Sep 17 00:00:00 2001
From: Jonathan Metzman <metzman@chromium.org>
Date: Fri, 12 Oct 2018 18:11:47 +0000
Subject: [PATCH 0121/1116] [SanitizerCoverage] Prevent /OPT:REF from stripping
 constructors

Summary:
Linking with the /OPT:REF linker flag when building COFF files causes
the linker to strip SanitizerCoverage's constructors. Prevent this by
giving the constructors WeakODR linkage and by passing the linker a
directive to include sancov.module_ctor.

Include a test in compiler-rt to verify libFuzzer can be linked using
/OPT:REF

Reviewers: morehouse, rnk

Reviewed By: morehouse, rnk

Subscribers: rnk, morehouse, hiraditya

Differential Revision: https://reviews.llvm.org/D52119

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344391 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../Instrumentation/SanitizerCoverage.cpp     | 21 +++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
index bf461c61ede..0bed4139518 100644
--- a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
+++ b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -29,6 +29,7 @@
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Mangler.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
 #include "llvm/Support/CommandLine.h"
@@ -298,6 +299,26 @@ Function *SanitizerCoverageModule::CreateInitCallsForSections(
   } else {
     appendToGlobalCtors(M, CtorFunc, SanCtorAndDtorPriority);
   }
+
+  if (TargetTriple.getObjectFormat() == Triple::COFF) {
+    // In COFF files, if the contructors are set as COMDAT (they are because
+    // COFF supports COMDAT) and the linker flag /OPT:REF (strip unreferenced
+    // functions and data) is used, the constructors get stripped. To prevent
+    // this, give the constructors weak ODR linkage and tell the linker to
+    // always include the sancov constructor. This way the linker can
+    // deduplicate the constructors but always leave one copy.
+    CtorFunc->setLinkage(GlobalValue::WeakODRLinkage);
+    SmallString<20> PartialIncDirective("/include:");
+    // Get constructor's mangled name in order to support i386.
+    SmallString<40> MangledName;
+    Mangler().getNameWithPrefix(MangledName, CtorFunc, true);
+    Twine IncDirective = PartialIncDirective + MangledName;
+    Metadata *Args[1] = {MDString::get(*C, IncDirective.str())};
+    MDNode *MetadataNode = MDNode::get(*C, Args);
+    NamedMDNode *NamedMetadata =
+        M.getOrInsertNamedMetadata("llvm.linker.options");
+    NamedMetadata->addOperand(MetadataNode);
+  }
   return CtorFunc;
 }
 
-- 
GitLab


From 787355713025c285219b7e239d9ce184249646d1 Mon Sep 17 00:00:00 2001
From: Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>
Date: Fri, 12 Oct 2018 18:18:53 +0000
Subject: [PATCH 0122/1116] [llvm-mca] Correctly set aliases for register
 writes introduced by optimized register moves.

This fixes a problem introduced by r344334. A write from a non-zero move
eliminated at register renaming stage was not correctly handled by the PRF. This
would have led to an assertion failure if the processor model declares a PRF
that enables non-zero move elimination.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344392 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../include/HardwareUnits/RegisterFile.h      |  8 ++-
 .../lib/HardwareUnits/RegisterFile.cpp        | 65 +++++++++++++++----
 tools/llvm-mca/lib/Stages/DispatchStage.cpp   | 15 +++--
 3 files changed, 72 insertions(+), 16 deletions(-)

diff --git a/tools/llvm-mca/include/HardwareUnits/RegisterFile.h b/tools/llvm-mca/include/HardwareUnits/RegisterFile.h
index 6a45c707de0..4b8b623bfe6 100644
--- a/tools/llvm-mca/include/HardwareUnits/RegisterFile.h
+++ b/tools/llvm-mca/include/HardwareUnits/RegisterFile.h
@@ -109,12 +109,18 @@ class RegisterFile : public HardwareUnit {
   //
   // Field `AllowMoveElimination` is set for registers that are used as
   // destination by optimizable register moves.
+  //
+  // Field `AliasRegID` is set by writes from register moves that have been
+  // eliminated at register renaming stage. A move eliminated at register
+  // renaming stage is effectively bypassed, and its write aliases the source
+  // register definition.
   struct RegisterRenamingInfo {
     IndexPlusCostPairTy IndexPlusCost;
     llvm::MCPhysReg RenameAs;
+    llvm::MCPhysReg AliasRegID;
     bool AllowMoveElimination;
     RegisterRenamingInfo()
-        : IndexPlusCost(std::make_pair(0U, 1U)), RenameAs(0U),
+        : IndexPlusCost(std::make_pair(0U, 1U)), RenameAs(0U), AliasRegID(0U),
           AllowMoveElimination(false) {}
   };
 
diff --git a/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp b/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp
index 481e2e18fa9..4a2a00523ae 100644
--- a/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp
+++ b/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp
@@ -171,7 +171,8 @@ void RegisterFile::addRegisterWrite(WriteRef Write,
   // implicitly clears the upper portion of the underlying register.
   // If a write clears its super-registers, then it is renamed as `RenameAs`.
   bool IsWriteZero = WS.isWriteZero();
-  bool ShouldAllocatePhysRegs = !IsWriteZero;
+  bool IsEliminated = WS.isEliminated();
+  bool ShouldAllocatePhysRegs = !IsWriteZero && !IsEliminated;
   const RegisterRenamingInfo &RRI = RegisterMappings[RegID].second;
 
   if (RRI.RenameAs && RRI.RenameAs != RegID) {
@@ -187,6 +188,7 @@ void RegisterFile::addRegisterWrite(WriteRef Write,
       if (OtherWrite.getWriteState() &&
           (OtherWrite.getSourceIndex() != Write.getSourceIndex())) {
         // This partial write has a false dependency on RenameAs.
+        assert(!IsEliminated && "Unexpected partial update!");
         WS.setDependentWrite(OtherWrite.getWriteState());
       }
     }
@@ -205,22 +207,33 @@ void RegisterFile::addRegisterWrite(WriteRef Write,
       ZeroRegisters.clearBit(*I);
   }
 
-  // Update the mapping for register RegID including its sub-registers.
-  RegisterMappings[RegID].first = Write;
-  for (MCSubRegIterator I(RegID, &MRI); I.isValid(); ++I)
-    RegisterMappings[*I].first = Write;
+  // If this is move has been eliminated, then the call to tryEliminateMove
+  // should have already updated all the register mappings.
+  if (!IsEliminated) {
+    // Update the mapping for register RegID including its sub-registers.
+    RegisterMappings[RegID].first = Write;
+    RegisterMappings[RegID].second.AliasRegID = 0U;
+    for (MCSubRegIterator I(RegID, &MRI); I.isValid(); ++I) {
+      RegisterMappings[*I].first = Write;
+      RegisterMappings[*I].second.AliasRegID = 0U;
+    }
 
-  // No physical registers are allocated for instructions that are optimized in
-  // hardware. For example, zero-latency data-dependency breaking instructions
-  // don't consume physical registers.
-  if (ShouldAllocatePhysRegs)
-    allocatePhysRegs(RegisterMappings[RegID].second, UsedPhysRegs);
+    // No physical registers are allocated for instructions that are optimized in
+    // hardware. For example, zero-latency data-dependency breaking instructions
+    // don't consume physical registers.
+    if (ShouldAllocatePhysRegs)
+      allocatePhysRegs(RegisterMappings[RegID].second, UsedPhysRegs);
+  }
 
   if (!WS.clearsSuperRegisters())
     return;
 
   for (MCSuperRegIterator I(RegID, &MRI); I.isValid(); ++I) {
-    RegisterMappings[*I].first = Write;
+    if (!IsEliminated) {
+      RegisterMappings[*I].first = Write;
+      RegisterMappings[*I].second.AliasRegID = 0U;
+    }
+
     if (IsWriteZero)
       ZeroRegisters.setBit(*I);
     else
@@ -230,6 +243,11 @@ void RegisterFile::addRegisterWrite(WriteRef Write,
 
 void RegisterFile::removeRegisterWrite(
     const WriteState &WS, MutableArrayRef<unsigned> FreedPhysRegs) {
+  // Early exit if this write was eliminated. A write eliminated at register
+  // renaming stage generates an alias, and it is not added to the PRF.
+  if (WS.isEliminated())
+    return;
+
   unsigned RegID = WS.getRegisterID();
 
   assert(RegID != 0 && "Invalidating an already invalid register?");
@@ -313,10 +331,29 @@ bool RegisterFile::tryEliminateMove(WriteState &WS, const ReadState &RS) {
   if (RMT.AllowZeroMoveEliminationOnly && !IsZeroMove)
     return false;
 
+  MCPhysReg FromReg = RS.getRegisterID();
+  MCPhysReg ToReg = WS.getRegisterID();
+
+  // Construct an alias.
+  MCPhysReg AliasReg = FromReg;
+  if (RRIFrom.RenameAs)
+    AliasReg = RRIFrom.RenameAs;
+
+  const RegisterRenamingInfo &RMAlias = RegisterMappings[AliasReg].second;
+  if (RMAlias.AliasRegID)
+    AliasReg = RMAlias.AliasRegID;
+
+  if (AliasReg != ToReg) {
+    RegisterMappings[ToReg].second.AliasRegID = AliasReg;
+    for (MCSubRegIterator I(ToReg, &MRI); I.isValid(); ++I)
+      RegisterMappings[*I].second.AliasRegID = AliasReg;
+  }
+
   RMT.NumMoveEliminated++;
   if (IsZeroMove)
     WS.setWriteZero();
   WS.setEliminated();
+
   return true;
 }
 
@@ -325,6 +362,12 @@ void RegisterFile::collectWrites(SmallVectorImpl<WriteRef> &Writes,
   assert(RegID && RegID < RegisterMappings.size());
   LLVM_DEBUG(dbgs() << "RegisterFile: collecting writes for register "
                     << MRI.getName(RegID) << '\n');
+
+  // Check if this is an alias.
+  const RegisterRenamingInfo &RRI = RegisterMappings[RegID].second;
+  if (RRI.AliasRegID)
+    RegID = RRI.AliasRegID;
+
   const WriteRef &WR = RegisterMappings[RegID].first;
   if (WR.isValid())
     Writes.push_back(WR);
diff --git a/tools/llvm-mca/lib/Stages/DispatchStage.cpp b/tools/llvm-mca/lib/Stages/DispatchStage.cpp
index c33b86027da..a6be2474554 100644
--- a/tools/llvm-mca/lib/Stages/DispatchStage.cpp
+++ b/tools/llvm-mca/lib/Stages/DispatchStage.cpp
@@ -101,10 +101,11 @@ Error DispatchStage::dispatch(InstRef IR) {
   }
 
   // Check if this is an optimizable reg-reg move.
+  bool IsEliminated = false;
   if (IS.isOptimizableMove()) {
     assert(IS.getDefs().size() == 1 && "Expected a single input!");
     assert(IS.getUses().size() == 1 && "Expected a single output!");
-    PRF.tryEliminateMove(*IS.getDefs()[0], *IS.getUses()[0]);
+    IsEliminated = PRF.tryEliminateMove(*IS.getDefs()[0], *IS.getUses()[0]);
   }
 
   // A dependency-breaking instruction doesn't have to wait on the register
@@ -113,9 +114,15 @@ Error DispatchStage::dispatch(InstRef IR) {
   // instruction. A dependency-breaking instruction is a zero-latency
   // instruction that doesn't consume hardware resources.
   // An example of dependency-breaking instruction on X86 is a zero-idiom XOR.
-  for (std::unique_ptr<ReadState> &RS : IS.getUses())
-    if (!RS->isIndependentFromDef())
-      updateRAWDependencies(*RS, STI);
+  //
+  // We also don't update data dependencies for instructions that have been
+  // eliminated at register renaming stage.
+  if (!IsEliminated) {
+    for (std::unique_ptr<ReadState> &RS : IS.getUses()) {
+      if (!RS->isIndependentFromDef())
+        updateRAWDependencies(*RS, STI);
+    }
+  }
 
   // By default, a dependency-breaking zero-idiom is expected to be optimized
   // at register renaming stage. That means, no physical register is allocated
-- 
GitLab


From a5213c4729d78e374823bb5b15a279a659aa389d Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Fri, 12 Oct 2018 18:19:06 +0000
Subject: [PATCH 0123/1116] [codeview] Emit S_BUILDINFO and LF_BUILDINFO with
 cwd and source file

Summary: We can fill in the command line and compiler path later if we want.

Reviewers: zturner

Subscribers: hiraditya, llvm-commits

Differential Revision: https://reviews.llvm.org/D53179

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344393 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/DebugInfo/CodeView/TypeRecord.h | 12 ++++-
 lib/CodeGen/AsmPrinter/CodeViewDebug.cpp     | 48 ++++++++++++++++++++
 lib/CodeGen/AsmPrinter/CodeViewDebug.h       |  2 +
 test/DebugInfo/COFF/build-info.ll            | 39 ++++++++++++++++
 4 files changed, 100 insertions(+), 1 deletion(-)
 create mode 100644 test/DebugInfo/COFF/build-info.ll

diff --git a/include/llvm/DebugInfo/CodeView/TypeRecord.h b/include/llvm/DebugInfo/CodeView/TypeRecord.h
index ee6f53854e7..af4e8f40575 100644
--- a/include/llvm/DebugInfo/CodeView/TypeRecord.h
+++ b/include/llvm/DebugInfo/CodeView/TypeRecord.h
@@ -655,7 +655,17 @@ public:
 
   ArrayRef<TypeIndex> getArgs() const { return ArgIndices; }
 
-  SmallVector<TypeIndex, 4> ArgIndices;
+  /// Indices of known build info arguments.
+  enum BuildInfoArg {
+    CurrentDirectory, //< Absolute CWD path
+    BuildTool,        //< Absolute compiler path
+    SourceFile,       //< Path to main source file, relative or absolute
+    TypeServerPDB,    //< Absoulte path of type server PDB (/Fd)
+    CommandLine,      //< Full canonical command line (maybe -cc1)
+    MaxArgs
+  };
+
+  SmallVector<TypeIndex, MaxArgs> ArgIndices;
 };
 
 // LF_VFTABLE
diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
index 8232f076a93..3b503b683a0 100644
--- a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
@@ -561,6 +561,11 @@ void CodeViewDebug::endModule() {
   OS.AddComment("String table");
   OS.EmitCVStringTableDirective();
 
+  // Emit S_BUILDINFO, which points to LF_BUILDINFO. Put this in its own symbol
+  // subsection in the generic .debug$S section at the end. There is no
+  // particular reason for this ordering other than to match MSVC.
+  emitBuildInfo();
+
   // Emit type information and hashes last, so that any types we translate while
   // emitting function info are included.
   emitTypeInformation();
@@ -772,6 +777,49 @@ void CodeViewDebug::emitCompilerInformation() {
   OS.EmitLabel(CompilerEnd);
 }
 
+static TypeIndex getStringIdTypeIdx(GlobalTypeTableBuilder &TypeTable,
+                                    StringRef S) {
+  StringIdRecord SIR(TypeIndex(0x0), S);
+  return TypeTable.writeLeafType(SIR);
+}
+
+void CodeViewDebug::emitBuildInfo() {
+  // First, make LF_BUILDINFO. It's a sequence of strings with various bits of
+  // build info. The known prefix is:
+  // - Absolute path of current directory
+  // - Compiler path
+  // - Main source file path, relative to CWD or absolute
+  // - Type server PDB file
+  // - Canonical compiler command line
+  // If frontend and backend compilation are separated (think llc or LTO), it's
+  // not clear if the compiler path should refer to the executable for the
+  // frontend or the backend. Leave it blank for now.
+  TypeIndex BuildInfoArgs[BuildInfoRecord::MaxArgs] = {};
+  NamedMDNode *CUs = MMI->getModule()->getNamedMetadata("llvm.dbg.cu");
+  const MDNode *Node = *CUs->operands().begin(); // FIXME: Multiple CUs.
+  const auto *CU = cast<DICompileUnit>(Node);
+  const DIFile *MainSourceFile = CU->getFile();
+  BuildInfoArgs[BuildInfoRecord::CurrentDirectory] =
+      getStringIdTypeIdx(TypeTable, MainSourceFile->getDirectory());
+  BuildInfoArgs[BuildInfoRecord::SourceFile] =
+      getStringIdTypeIdx(TypeTable, MainSourceFile->getFilename());
+  // FIXME: Path to compiler and command line. PDB is intentionally blank unless
+  // we implement /Zi type servers.
+  BuildInfoRecord BIR(BuildInfoArgs);
+  TypeIndex BuildInfoIndex = TypeTable.writeLeafType(BIR);
+
+  // Make a new .debug$S subsection for the S_BUILDINFO record, which points
+  // from the module symbols into the type stream.
+  MCSymbol *BuildInfoEnd = beginCVSubsection(DebugSubsectionKind::Symbols);
+  OS.AddComment("Record length");
+  OS.EmitIntValue(6, 2);
+  OS.AddComment("Record kind: S_BUILDINFO");
+  OS.EmitIntValue(unsigned(SymbolKind::S_BUILDINFO), 2);
+  OS.AddComment("LF_BUILDINFO index");
+  OS.EmitIntValue(BuildInfoIndex.getIndex(), 4);
+  endCVSubsection(BuildInfoEnd);
+}
+
 void CodeViewDebug::emitInlineeLinesSubsection() {
   if (InlinedSubprograms.empty())
     return;
diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.h b/lib/CodeGen/AsmPrinter/CodeViewDebug.h
index b97092a642e..b6fbdc1373f 100644
--- a/lib/CodeGen/AsmPrinter/CodeViewDebug.h
+++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.h
@@ -272,6 +272,8 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase {
 
   void emitCompilerInformation();
 
+  void emitBuildInfo();
+
   void emitInlineeLinesSubsection();
 
   void emitDebugInfoForThunk(const Function *GV,
diff --git a/test/DebugInfo/COFF/build-info.ll b/test/DebugInfo/COFF/build-info.ll
new file mode 100644
index 00000000000..94f006c3b09
--- /dev/null
+++ b/test/DebugInfo/COFF/build-info.ll
@@ -0,0 +1,39 @@
+; RUN: llc -filetype=obj -mtriple i686-pc-windows-msvc %s -o %t.o
+; RUN: llvm-pdbutil dump %t.o -symbols -types | FileCheck %s
+
+; CHECK: [[INFO_IDX:0x[^ ]*]] | LF_BUILDINFO
+; CHECK-NEXT:          0x{{.*}}: `D:\src\scopes\clang`
+; CHECK-NEXT:          <no type>: ``
+; CHECK-NEXT:          0x{{.*}}: `D:\src\scopes\foo.cpp`
+; CHECK-NEXT:          <no type>: ``
+; CHECK-NEXT:          <no type>: ``
+
+; CHECK: {{.*}} | S_BUILDINFO [size = 8] BuildId = `[[INFO_IDX]]`
+
+; ModuleID = 'D:\src\scopes\foo.cpp'
+source_filename = "D:\5Csrc\5Cscopes\5Cfoo.cpp"
+target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
+target triple = "i686-pc-windows-msvc19.0.23918"
+
+define i32 @"?foo@@YAHXZ"() !dbg !10 {
+entry:
+  ret i32 42, !dbg !14
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!7, !8}
+!llvm.ident = !{!9}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 4.0.0 ", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+; One .debug$S section should contain an S_COMPILE3 record that identifies the
+; source language and the version of the compiler based on the DICompileUnit.
+!1 = !DIFile(filename: "D:\5Csrc\5Cscopes\5Cfoo.cpp", directory: "D:\5Csrc\5Cscopes\5Cclang")
+!2 = !{}
+!7 = !{i32 2, !"CodeView", i32 1}
+!8 = !{i32 2, !"Debug Info Version", i32 3}
+!9 = !{!"clang version 4.0.0 "}
+!10 = distinct !DISubprogram(name: "foo", linkageName: "\01?foo@@YAHXZ", scope: !1, file: !1, line: 1, type: !11, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
+!11 = !DISubroutineType(types: !12)
+!12 = !{!13}
+!13 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!14 = !DILocation(line: 2, scope: !10)
-- 
GitLab


From 6316db4486f9763e51db5ea07c22275ad84936d4 Mon Sep 17 00:00:00 2001
From: Rui Ueyama <ruiu@google.com>
Date: Fri, 12 Oct 2018 18:29:30 +0000
Subject: [PATCH 0124/1116] Replace assert() with llvm_unreachable because it's
 obviously a typo.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344395 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp b/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp
index 20cc61df9b6..b3e6fce97b6 100644
--- a/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp
+++ b/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp
@@ -155,7 +155,7 @@ Die2BTFEntry::Die2BTFEntry(const DIE &Die) : Die(Die) {
   case BTF_KIND_VOLATILE:
     break;
   default:
-    assert("Invalid Die passed into BTFTypeEntry()");
+    llvm_unreachable("Invalid Die passed into BTFTypeEntry()");
     break;
   }
 
-- 
GitLab


From c27563a142810a55593bb298c5b642085a971485 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 12 Oct 2018 19:03:54 +0000
Subject: [PATCH 0125/1116] Regenerate test. NFCI.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344399 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/x86-interleaved-access.ll | 32 +++++++++++-----------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/test/CodeGen/X86/x86-interleaved-access.ll b/test/CodeGen/X86/x86-interleaved-access.ll
index bf087e12833..e4624eaf363 100644
--- a/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/test/CodeGen/X86/x86-interleaved-access.ll
@@ -1591,7 +1591,7 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(<192 x i8>* %ptr){
 ; AVX1-NEXT:    vorps %ymm12, %ymm14, %ymm12
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm14
 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm4 = xmm15[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT:    vmovdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vandnps %ymm14, %ymm13, %ymm14
 ; AVX1-NEXT:    vandps %ymm13, %ymm7, %ymm7
 ; AVX1-NEXT:    vorps %ymm14, %ymm7, %ymm13
@@ -1616,7 +1616,7 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(<192 x i8>* %ptr){
 ; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpaddb %xmm0, %xmm4, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm12, %xmm0
-; AVX1-NEXT:    vpaddb -{{[0-9]+}}(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT:    vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX1-NEXT:    vpaddb %xmm0, %xmm3, %xmm0
 ; AVX1-NEXT:    vpaddb %xmm11, %xmm12, %xmm3
 ; AVX1-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
@@ -1732,22 +1732,22 @@ define void @interleaved_store_vf64_i8_stride4(<64 x i8> %a, <64 x i8> %b, <64 x
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm14
 ; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15]
-; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
-; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15]
-; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
 ; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
 ; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15]
-; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
-; AVX1-NEXT:    vmovdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm3
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
 ; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm4
@@ -1756,7 +1756,7 @@ define void @interleaved_store_vf64_i8_stride4(<64 x i8> %a, <64 x i8> %b, <64 x
 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3]
 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm11, %ymm1
-; AVX1-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX1-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3]
 ; AVX1-NEXT:    vmovdqa %xmm8, %xmm2
 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3]
@@ -1765,16 +1765,16 @@ define void @interleaved_store_vf64_i8_stride4(<64 x i8> %a, <64 x i8> %b, <64 x
 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7]
 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7]
 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
-; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
-; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm9, %ymm14
-; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
-; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
-; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm11, %ymm9, %ymm9
 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm11 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
@@ -1788,7 +1788,7 @@ define void @interleaved_store_vf64_i8_stride4(<64 x i8> %a, <64 x i8> %b, <64 x
 ; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm9, %ymm6
 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm15, %ymm10, %ymm2
-; AVX1-NEXT:    vmovups -{{[0-9]+}}(%rsp), %ymm3 # 32-byte Reload
+; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
 ; AVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm3, %ymm0
 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm3[2,3],ymm2[2,3]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm11, %ymm7, %ymm3
-- 
GitLab


From c6422e18ae6edcce196d7667d6970cd175a8d560 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 12 Oct 2018 19:30:43 +0000
Subject: [PATCH 0126/1116] Fix Wdocumentation warning. NFCI.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344402 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/DebugInfo/CodeView/TypeRecord.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/llvm/DebugInfo/CodeView/TypeRecord.h b/include/llvm/DebugInfo/CodeView/TypeRecord.h
index af4e8f40575..9a06a6a3344 100644
--- a/include/llvm/DebugInfo/CodeView/TypeRecord.h
+++ b/include/llvm/DebugInfo/CodeView/TypeRecord.h
@@ -657,11 +657,11 @@ public:
 
   /// Indices of known build info arguments.
   enum BuildInfoArg {
-    CurrentDirectory, //< Absolute CWD path
-    BuildTool,        //< Absolute compiler path
-    SourceFile,       //< Path to main source file, relative or absolute
-    TypeServerPDB,    //< Absoulte path of type server PDB (/Fd)
-    CommandLine,      //< Full canonical command line (maybe -cc1)
+    CurrentDirectory, ///< Absolute CWD path
+    BuildTool,        ///< Absolute compiler path
+    SourceFile,       ///< Path to main source file, relative or absolute
+    TypeServerPDB,    ///< Absolute path of type server PDB (/Fd)
+    CommandLine,      ///< Full canonical command line (maybe -cc1)
     MaxArgs
   };
 
-- 
GitLab


From 6147a037f6cd132b4659c2ae0553a728f376c33c Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Fri, 12 Oct 2018 19:37:47 +0000
Subject: [PATCH 0127/1116] [LegalizeVectorTypes] When unrolling in
 WidenVecRes_Convert, make sure we use the original vector element count. Not
 min of the widened result type and the possibly widened input type.

If the input type is widened as well, but we still were forced to unroll, we shouldn't be considering the widened input element count. We should only create as many scalar operations as the original type called for.

This will be important for an upcoming patch.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344403 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../SelectionDAG/LegalizeVectorTypes.cpp       | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 1b07358561a..6bee966a327 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -2809,11 +2809,8 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
     if (WidenNumElts % InVTNumElts == 0) {
       // Widen the input and call convert on the widened input vector.
       unsigned NumConcat = WidenNumElts/InVTNumElts;
-      SmallVector<SDValue, 16> Ops(NumConcat);
+      SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
       Ops[0] = InOp;
-      SDValue UndefVal = DAG.getUNDEF(InVT);
-      for (unsigned i = 1; i != NumConcat; ++i)
-        Ops[i] = UndefVal;
       SDValue InVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, InWidenVT, Ops);
       if (N->getNumOperands() == 1)
         return DAG.getNode(Opcode, DL, WidenVT, InVec);
@@ -2832,11 +2829,12 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
   }
 
   // Otherwise unroll into some nasty scalar code and rebuild the vector.
-  SmallVector<SDValue, 16> Ops(WidenNumElts);
   EVT EltVT = WidenVT.getVectorElementType();
-  unsigned MinElts = std::min(InVTNumElts, WidenNumElts);
-  unsigned i;
-  for (i=0; i < MinElts; ++i) {
+  SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
+  // Use the original element count so we don't do more scalar opts than
+  // necessary.
+  unsigned MinElts = N->getValueType(0).getVectorNumElements();
+  for (unsigned i=0; i < MinElts; ++i) {
     SDValue Val = DAG.getNode(
         ISD::EXTRACT_VECTOR_ELT, DL, InEltVT, InOp,
         DAG.getConstant(i, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
@@ -2846,10 +2844,6 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
       Ops[i] = DAG.getNode(Opcode, DL, EltVT, Val, N->getOperand(1), Flags);
   }
 
-  SDValue UndefVal = DAG.getUNDEF(EltVT);
-  for (; i < WidenNumElts; ++i)
-    Ops[i] = UndefVal;
-
   return DAG.getBuildVector(WidenVT, DL, Ops);
 }
 
-- 
GitLab


From c646992975f1223770858f996a2fd91660dc1ca0 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Fri, 12 Oct 2018 19:37:49 +0000
Subject: [PATCH 0128/1116] [LegalizeVectorTypes] When widening the operands to
 a concat_vectors, see if we can use the widened operand 0 if the width
 matches and the other operands are undef.

This saves a conversion to extracts and build_vector. We already do this when both the result and the input need to be widened to the same type.

This changed the sse-intrinsics-fast-isel test because we don't lower (insert_vector_elt (scalar_to_vector X), Y, 1) well. We turn it into (vector_shuffle (scalar_to_vector X), (scalar_to_vector Y), <0, 4, 2, 3>) losing track of the fact that the upper elts could be undef.

We should probably find a way to prevent the scalarization of the <2 x f32> load on these tests.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344404 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../SelectionDAG/LegalizeVectorTypes.cpp      | 21 +++++++++++++-----
 test/CodeGen/X86/sse-intrinsics-fast-isel.ll  | 22 +++++++++----------
 2 files changed, 27 insertions(+), 16 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 6bee966a327..310f5ef5dc7 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -3794,20 +3794,31 @@ SDValue DAGTypeLegalizer::WidenVecOp_BITCAST(SDNode *N) {
 }
 
 SDValue DAGTypeLegalizer::WidenVecOp_CONCAT_VECTORS(SDNode *N) {
-  // If the input vector is not legal, it is likely that we will not find a
-  // legal vector of the same size. Replace the concatenate vector with a
-  // nasty build vector.
   EVT VT = N->getValueType(0);
   EVT EltVT = VT.getVectorElementType();
+  EVT InVT = N->getOperand(0).getValueType();
   SDLoc dl(N);
+
+  // If the widen width for this operand is the same as the width of the concat
+  // and all but the first operand is undef, just use the widened operand.
+  unsigned NumOperands = N->getNumOperands();
+  if (VT == TLI.getTypeToTransformTo(*DAG.getContext(), InVT)) {
+    unsigned i;
+    for (i = 1; i < NumOperands; ++i)
+      if (!N->getOperand(i).isUndef())
+        break;
+
+    if (i == NumOperands)
+      return GetWidenedVector(N->getOperand(0));
+  }
+
+  // Otherwise, fall back to a nasty build vector.
   unsigned NumElts = VT.getVectorNumElements();
   SmallVector<SDValue, 16> Ops(NumElts);
 
-  EVT InVT = N->getOperand(0).getValueType();
   unsigned NumInElts = InVT.getVectorNumElements();
 
   unsigned Idx = 0;
-  unsigned NumOperands = N->getNumOperands();
   for (unsigned i=0; i < NumOperands; ++i) {
     SDValue InOp = N->getOperand(i);
     assert(getTypeAction(InOp.getValueType()) ==
diff --git a/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
index 47649a54e80..1ccd586c453 100644
--- a/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
@@ -1320,10 +1320,10 @@ define <4 x float> @test_mm_loadh_pi(<4 x float> %a0, x86_mmx* %a1) {
 ; X86-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    movss 4(%eax), %xmm2 # encoding: [0xf3,0x0f,0x10,0x50,0x04]
 ; X86-SSE-NEXT:    # xmm2 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    unpcklps %xmm2, %xmm1 # encoding: [0x0f,0x14,0xca]
-; X86-SSE-NEXT:    # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X86-SSE-NEXT:    movlhps %xmm1, %xmm0 # encoding: [0x0f,0x16,0xc1]
-; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0]
+; X86-SSE-NEXT:    shufps $0, %xmm1, %xmm2 # encoding: [0x0f,0xc6,0xd1,0x00]
+; X86-SSE-NEXT:    # xmm2 = xmm2[0,0],xmm1[0,0]
+; X86-SSE-NEXT:    shufps $36, %xmm2, %xmm0 # encoding: [0x0f,0xc6,0xc2,0x24]
+; X86-SSE-NEXT:    # xmm0 = xmm0[0,1],xmm2[2,0]
 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
 ;
 ; X86-AVX1-LABEL: test_mm_loadh_pi:
@@ -1378,14 +1378,14 @@ define <4 x float> @test_mm_loadl_pi(<4 x float> %a0, x86_mmx* %a1) {
 ; X86-SSE-LABEL: test_mm_loadl_pi:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-SSE-NEXT:    movss (%eax), %xmm1 # encoding: [0xf3,0x0f,0x10,0x08]
-; X86-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    movss 4(%eax), %xmm2 # encoding: [0xf3,0x0f,0x10,0x50,0x04]
+; X86-SSE-NEXT:    movss (%eax), %xmm2 # encoding: [0xf3,0x0f,0x10,0x10]
 ; X86-SSE-NEXT:    # xmm2 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    unpcklps %xmm2, %xmm1 # encoding: [0x0f,0x14,0xca]
-; X86-SSE-NEXT:    # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X86-SSE-NEXT:    shufps $228, %xmm0, %xmm1 # encoding: [0x0f,0xc6,0xc8,0xe4]
-; X86-SSE-NEXT:    # xmm1 = xmm1[0,1],xmm0[2,3]
+; X86-SSE-NEXT:    movss 4(%eax), %xmm1 # encoding: [0xf3,0x0f,0x10,0x48,0x04]
+; X86-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    shufps $0, %xmm2, %xmm1 # encoding: [0x0f,0xc6,0xca,0x00]
+; X86-SSE-NEXT:    # xmm1 = xmm1[0,0],xmm2[0,0]
+; X86-SSE-NEXT:    shufps $226, %xmm0, %xmm1 # encoding: [0x0f,0xc6,0xc8,0xe2]
+; X86-SSE-NEXT:    # xmm1 = xmm1[2,0],xmm0[2,3]
 ; X86-SSE-NEXT:    movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
 ;
-- 
GitLab


From c206978e8af6632a2f32f2b02e544ca122be480f Mon Sep 17 00:00:00 2001
From: Eli Friedman <efriedma@codeaurora.org>
Date: Fri, 12 Oct 2018 19:41:05 +0000
Subject: [PATCH 0129/1116] Revert BTF commit series.

The initial patch was not reviewed, and does not have any tests;
it should not have been merged.

This reverts 344395, 344390, 344387, 344385, 344381, 344376,
and 344366.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344405 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/MC/MCBTFContext.h        | 361 -------------------
 include/llvm/MC/MCContext.h           |   7 -
 include/llvm/MC/MCObjectFileInfo.h    |   8 -
 include/llvm/MC/MCObjectStreamer.h    |   1 -
 lib/CodeGen/AsmPrinter/CMakeLists.txt |   1 -
 lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp  | 501 --------------------------
 lib/CodeGen/AsmPrinter/Dwarf2BTF.h    | 133 -------
 lib/CodeGen/AsmPrinter/DwarfDebug.cpp |  10 -
 lib/CodeGen/AsmPrinter/DwarfDebug.h   |   3 -
 lib/CodeGen/AsmPrinter/DwarfFile.cpp  |  10 -
 lib/CodeGen/AsmPrinter/DwarfFile.h    |   7 +-
 lib/MC/CMakeLists.txt                 |   2 -
 lib/MC/MCBTFContext.cpp               | 229 ------------
 lib/MC/MCContext.cpp                  |  11 +-
 lib/MC/MCDwarf2BTF.cpp                | 102 ------
 lib/MC/MCDwarf2BTF.h                  |  30 --
 lib/MC/MCObjectFileInfo.cpp           |   3 -
 lib/MC/MCObjectStreamer.cpp           |  34 --
 18 files changed, 4 insertions(+), 1449 deletions(-)
 delete mode 100644 include/llvm/MC/MCBTFContext.h
 delete mode 100644 lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp
 delete mode 100644 lib/CodeGen/AsmPrinter/Dwarf2BTF.h
 delete mode 100644 lib/MC/MCBTFContext.cpp
 delete mode 100644 lib/MC/MCDwarf2BTF.cpp
 delete mode 100644 lib/MC/MCDwarf2BTF.h

diff --git a/include/llvm/MC/MCBTFContext.h b/include/llvm/MC/MCBTFContext.h
deleted file mode 100644
index 5ef35f12609..00000000000
--- a/include/llvm/MC/MCBTFContext.h
+++ /dev/null
@@ -1,361 +0,0 @@
-//===- MCBTFContext.h ---------------------------------------- *- C++ --*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-// This header file contains two parts. The first part is the BTF ELF
-// specification in C format, and the second part is the various
-// C++ classes to manipulate the data structure in order to generate
-// the BTF related ELF sections.
-//===----------------------------------------------------------------------===//
-#ifndef LLVM_MC_MCBTFCONTEXT_H
-#define LLVM_MC_MCBTFCONTEXT_H
-
-#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/raw_ostream.h"
-#include <map>
-#include <vector>
-
-#define BTF_MAGIC 0xeB9F
-#define BTF_VERSION 1
-
-struct btf_header {
-  uint16_t magic;
-  uint8_t version;
-  uint8_t flags;
-  uint32_t hdr_len;
-
-  /* All offsets are in bytes relative to the end of this header */
-  uint32_t type_off; // offset of type section
-  uint32_t type_len; // length of type section
-  uint32_t str_off;  // offset of string section
-  uint32_t str_len;  // length of string section
-};
-
-/* Max # of type identifier */
-#define BTF_MAX_TYPE 0x0000ffff
-/* Max offset into the string section */
-#define BTF_MAX_NAME_OFFSET 0x0000ffff
-/* Max # of struct/union/enum members or func args */
-#define BTF_MAX_VLEN 0xffff
-
-struct btf_type {
-  uint32_t name_off;
-  /* "info" bits arrangement
-   * bits  0-15: vlen (e.g. # of struct's members)
-   * bits 16-23: unused
-   * bits 24-27: kind (e.g. int, ptr, array...etc)
-   * bits 28-31: unused
-   */
-  uint32_t info;
-  /* "size" is used by INT, ENUM, STRUCT and UNION.
-   * "size" tells the size of the type it is describing.
-   *
-   * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT,
-   * FUNC and FUNC_PROTO.
-   * "type" is a type_id referring to another type.
-   */
-  union {
-    uint32_t size;
-    uint32_t type;
-  };
-};
-
-#define BTF_INFO_KIND(info) (((info) >> 24) & 0x0f)
-#define BTF_INFO_VLEN(info) ((info)&0xffff)
-
-#define BTF_KIND_UNKN 0        /* Unknown	*/
-#define BTF_KIND_INT 1         /* Integer	*/
-#define BTF_KIND_PTR 2         /* Pointer	*/
-#define BTF_KIND_ARRAY 3       /* Array	*/
-#define BTF_KIND_STRUCT 4      /* Struct	*/
-#define BTF_KIND_UNION 5       /* Union	*/
-#define BTF_KIND_ENUM 6        /* Enumeration	*/
-#define BTF_KIND_FWD 7         /* Forward	*/
-#define BTF_KIND_TYPEDEF 8     /* Typedef	*/
-#define BTF_KIND_VOLATILE 9    /* Volatile	*/
-#define BTF_KIND_CONST 10      /* Const	*/
-#define BTF_KIND_RESTRICT 11   /* Restrict	*/
-#define BTF_KIND_FUNC 12       /* Function	*/
-#define BTF_KIND_FUNC_PROTO 13 /* Function Prototype	*/
-#define BTF_KIND_MAX 13
-#define NR_BTF_KINDS 14
-
-/* For some specific BTF_KIND, "struct btf_type" is immediately
- * followed by extra data.
- */
-
-/* BTF_KIND_INT is followed by a u32 and the following
- * is the 32 bits arrangement:
- */
-#define BTF_INT_ENCODING(VAL) (((VAL)&0x0f000000) >> 24)
-#define BTF_INT_OFFSET(VAL) (((VAL & 0x00ff0000)) >> 16)
-#define BTF_INT_BITS(VAL) ((VAL)&0x000000ff)
-
-/* Attributes stored in the BTF_INT_ENCODING */
-#define BTF_INT_SIGNED (1 << 0)
-#define BTF_INT_CHAR (1 << 1)
-#define BTF_INT_BOOL (1 << 2)
-
-/* BTF_KIND_ENUM is followed by multiple "struct btf_enum".
- * The exact number of btf_enum is stored in the vlen (of the
- * info in "struct btf_type").
- */
-struct btf_enum {
-  uint32_t name_off;
-  int32_t val;
-};
-
-/* BTF_KIND_ARRAY is followed by one "struct btf_array" */
-struct btf_array {
-  uint32_t type;
-  uint32_t index_type;
-  uint32_t nelems;
-};
-
-/* BTF_KIND_STRUCT and BTF_KIND_UNION are followed
- * by multiple "struct btf_member".  The exact number
- * of btf_member is stored in the vlen (of the info in
- * "struct btf_type").
- */
-struct btf_member {
-  uint32_t name_off;
-  uint32_t type;
-  uint32_t offset; /* offset in bits */
-};
-
-/* .BTF.ext section contains func_info and line_info.
- */
-struct btf_ext_header {
-  uint16_t magic;
-  uint8_t version;
-  uint8_t flags;
-  uint32_t hdr_len;
-
-  uint32_t func_info_off;
-  uint32_t func_info_len;
-  uint32_t line_info_off;
-  uint32_t line_info_len;
-};
-
-struct bpf_func_info {
-  uint32_t insn_offset;
-  uint32_t type_id;
-};
-
-struct btf_sec_func_info {
-  uint32_t sec_name_off;
-  uint32_t num_func_info;
-};
-
-struct bpf_line_info {
-  uint32_t insn_offset;
-  uint32_t file_name_off;
-  uint32_t line_off;
-  uint32_t line_col; /* line num: line_col >> 10, col num: line_col & 0x3ff */
-};
-
-struct btf_sec_line_info {
-  uint32_t sec_name_off;
-  uint32_t num_line_info;
-};
-
-namespace llvm {
-
-const char *const btf_kind_str[NR_BTF_KINDS] = {
-    "UNKNOWN",    /* BTF_KIND_UNKN */
-    "INT",        /* BTF_KIND_INT */
-    "PTR",        /* BTF_KIND_PTR */
-    "ARRAY",      /* BTF_KIND_ARRAY */
-    "STRUCT",     /* BTF_KIND_STRUCT */
-    "UNION",      /* BTF_KIND_UNION */
-    "ENUM",       /* BTF_KIND_ENUM */
-    "FWD",        /* BTF_KIND_FWD */
-    "TYPEDEF",    /* BTF_KIND_TYPEDEF */
-    "VOLATILE",   /* BTF_KIND_VOLATILE */
-    "CONST",      /* BTF_KIND_CONST */
-    "RESTRICT",   /* BTF_KIND_CONST */
-    "FUNC",       /* BTF_KIND_FUNC */
-    "FUNC_PROTO", /* BTF_KIND_FUNC_PROTO */
-};
-
-class MCBTFContext;
-class MCObjectStreamer;
-
-// This is base class of all BTF KIND. It is also used directly
-// by the reference kinds:
-//   BTF_KIND_CONST,  BTF_KIND_PTR,  BTF_KIND_VOLATILE,
-//   BTF_KIND_TYPEDEF, BTF_KIND_RESTRICT, and BTF_KIND_FWD
-class BTFTypeEntry {
-protected:
-  size_t Id; /* type index in the BTF list, started from 1 */
-  struct btf_type BTFType;
-
-public:
-  BTFTypeEntry(size_t id, struct btf_type &type) : Id(id), BTFType(type) {}
-  virtual ~BTFTypeEntry();
-  unsigned char getKind() { return BTF_INFO_KIND(BTFType.info); }
-  void setId(size_t Id) { this->Id = Id; }
-  size_t getId() { return Id; }
-  void setNameOff(unsigned NameOff) { BTFType.name_off = NameOff; }
-
-  unsigned getTypeIndex() { return BTFType.type; }
-  unsigned getNameOff() { return BTFType.name_off; }
-  virtual size_t getSize() { return sizeof(struct btf_type); }
-  virtual void print(raw_ostream &s, MCBTFContext &BTFContext);
-  virtual void emitData(MCObjectStreamer *MCOS);
-};
-
-// BTF_KIND_INT
-class BTFTypeEntryInt : public BTFTypeEntry {
-  unsigned IntVal; // encoding, offset, bits
-
-public:
-  BTFTypeEntryInt(size_t id, struct btf_type &type, unsigned intval)
-      : BTFTypeEntry(id, type), IntVal(intval) {}
-  size_t getSize() { return BTFTypeEntry::getSize() + sizeof(unsigned); }
-  void print(raw_ostream &s, MCBTFContext &BTFContext);
-  void emitData(MCObjectStreamer *MCOS);
-};
-
-// BTF_KIND_ENUM
-class BTFTypeEntryEnum : public BTFTypeEntry {
-  std::vector<struct btf_enum> EnumValues;
-
-public:
-  BTFTypeEntryEnum(size_t id, struct btf_type &type,
-                   std::vector<struct btf_enum> &values)
-      : BTFTypeEntry(id, type), EnumValues(values) {}
-  size_t getSize() {
-    return BTFTypeEntry::getSize() +
-           BTF_INFO_VLEN(BTFType.info) * sizeof(struct btf_enum);
-  }
-  void print(raw_ostream &s, MCBTFContext &BTFContext);
-  void emitData(MCObjectStreamer *MCOS);
-};
-
-// BTF_KIND_ARRAY
-class BTFTypeEntryArray : public BTFTypeEntry {
-  struct btf_array ArrayInfo;
-
-public:
-  BTFTypeEntryArray(size_t id, struct btf_type &type,
-                    struct btf_array &arrayinfo)
-      : BTFTypeEntry(id, type), ArrayInfo(arrayinfo) {}
-  size_t getSize() {
-    return BTFTypeEntry::getSize() + sizeof(struct btf_array);
-  }
-  void print(raw_ostream &s, MCBTFContext &BTFContext);
-  void emitData(MCObjectStreamer *MCOS);
-};
-
-// BTF_KIND_STRUCT and BTF_KIND_UNION
-class BTFTypeEntryStruct : public BTFTypeEntry {
-  std::vector<struct btf_member> Members;
-
-public:
-  BTFTypeEntryStruct(size_t id, struct btf_type &type,
-                     std::vector<struct btf_member> &members)
-      : BTFTypeEntry(id, type), Members(members) {}
-  size_t getSize() {
-    return BTFTypeEntry::getSize() +
-           BTF_INFO_VLEN(BTFType.info) * sizeof(struct btf_member);
-  }
-  void print(raw_ostream &s, MCBTFContext &BTFContext);
-  void emitData(MCObjectStreamer *MCOS);
-};
-
-// BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO
-class BTFTypeEntryFunc : public BTFTypeEntry {
-  std::vector<unsigned> Parameters;
-
-public:
-  BTFTypeEntryFunc(size_t id, struct btf_type &type,
-                   std::vector<unsigned> &params)
-      : BTFTypeEntry(id, type), Parameters(params) {}
-  size_t getSize() {
-    return BTFTypeEntry::getSize() +
-           BTF_INFO_VLEN(BTFType.info) * sizeof(unsigned);
-  }
-  void print(raw_ostream &s, MCBTFContext &BTFContext);
-  void emitData(MCObjectStreamer *MCOS);
-};
-
-class BTFStringTable {
-  size_t Size; // total size in bytes
-  std::map<size_t, unsigned> OffsetToIdMap;
-  std::vector<std::string> Table;
-
-public:
-  BTFStringTable() : Size(0) {}
-  size_t getSize() { return Size; }
-  std::vector<std::string> &getTable() { return Table; }
-  size_t addString(std::string S) {
-    // check whether the string already exists
-    for (auto &OffsetM : OffsetToIdMap) {
-      if (Table[OffsetM.second] == S)
-        return OffsetM.first;
-    }
-    // not find, add to the string table
-    size_t Offset = Size;
-    OffsetToIdMap[Offset] = Table.size();
-    Table.push_back(S);
-    Size += S.size() + 1;
-    return Offset;
-  }
-  std::string &getStringAtOffset(size_t Offset) {
-    return Table[OffsetToIdMap[Offset]];
-  }
-  void showTable(raw_ostream &OS) {
-    for (auto OffsetM : OffsetToIdMap)
-      OS << OffsetM.first << " : " << Table[OffsetM.second] << "\n";
-  }
-};
-
-struct BTFFuncInfo {
-  const MCSymbol *Label;
-  unsigned int TypeId;
-};
-
-struct BTFLineInfo {
-  MCSymbol *Label;
-  unsigned int FileNameOff;
-  unsigned int LineOff;
-  unsigned int LineNum;
-  unsigned int ColumnNum;
-};
-
-class MCBTFContext {
-  std::vector<std::unique_ptr<BTFTypeEntry>> TypeEntries;
-  BTFStringTable StringTable;
-  std::map<unsigned, std::vector<BTFFuncInfo>> FuncInfoTable;
-  std::map<unsigned, std::vector<BTFLineInfo>> LineInfoTable;
-
-  friend class BTFTypeEntry;
-  friend class BTFTypeEntryInt;
-  friend class BTFTypeEntryEnum;
-  friend class BTFTypeEntryArray;
-  friend class BTFTypeEntryStruct;
-  friend class BTFTypeEntryFunc;
-
-public:
-  void dump(raw_ostream &OS);
-  void emitAll(MCObjectStreamer *MCOS);
-  void emitCommonHeader(MCObjectStreamer *MCOS);
-  void emitBTFSection(MCObjectStreamer *MCOS);
-  void emitBTFExtSection(MCObjectStreamer *MCOS);
-
-  size_t addString(std::string S) { return StringTable.addString(S); }
-  void addTypeEntry(std::unique_ptr<BTFTypeEntry> Entry);
-  void addFuncInfo(unsigned SecNameOff, BTFFuncInfo Info) {
-    FuncInfoTable[SecNameOff].push_back(Info);
-  }
-  void addLineInfo(unsigned SecNameOff, BTFLineInfo Info) {
-    LineInfoTable[SecNameOff].push_back(Info);
-  }
-};
-
-}
-#endif
diff --git a/include/llvm/MC/MCContext.h b/include/llvm/MC/MCContext.h
index d5c49408c68..3b8ac8b79e2 100644
--- a/include/llvm/MC/MCContext.h
+++ b/include/llvm/MC/MCContext.h
@@ -56,7 +56,6 @@ namespace llvm {
   class MCSymbolWasm;
   class SMLoc;
   class SourceMgr;
-  class MCBTFContext;
 
   /// Context object for machine code objects.  This class owns all of the
   /// sections that it creates.
@@ -279,9 +278,6 @@ namespace llvm {
     /// Map of currently defined macros.
     StringMap<MCAsmMacro> MacroMap;
 
-    /// for BTF debug information
-    std::unique_ptr<MCBTFContext> BTFCtx;
-
   public:
     explicit MCContext(const MCAsmInfo *MAI, const MCRegisterInfo *MRI,
                        const MCObjectFileInfo *MOFI,
@@ -290,9 +286,6 @@ namespace llvm {
     MCContext &operator=(const MCContext &) = delete;
     ~MCContext();
 
-    void setBTFContext(std::unique_ptr<MCBTFContext> Ctx);
-    std::unique_ptr<MCBTFContext> &getBTFContext() { return BTFCtx; }
-
     const SourceMgr *getSourceManager() const { return SrcMgr; }
 
     void setInlineSourceManager(SourceMgr *SM) { InlineSrcMgr = SM; }
diff --git a/include/llvm/MC/MCObjectFileInfo.h b/include/llvm/MC/MCObjectFileInfo.h
index 1dda7b0712f..8cf9e1cc55a 100644
--- a/include/llvm/MC/MCObjectFileInfo.h
+++ b/include/llvm/MC/MCObjectFileInfo.h
@@ -207,10 +207,6 @@ protected:
   MCSection *SXDataSection;
   MCSection *GFIDsSection;
 
-  // BTF specific sections.
-  MCSection *BTFSection;
-  MCSection *BTFExtSection;
-
 public:
   void InitMCObjectFileInfo(const Triple &TT, bool PIC, MCContext &ctx,
                             bool LargeCodeModel = false);
@@ -376,10 +372,6 @@ public:
     return EHFrameSection;
   }
 
-  // BTF specific sections.
-  MCSection *getBTFSection() const { return BTFSection; }
-  MCSection *getBTFExtSection() const { return BTFExtSection; }
-
   enum Environment { IsMachO, IsELF, IsCOFF, IsWasm };
   Environment getObjectFileType() const { return Env; }
 
diff --git a/include/llvm/MC/MCObjectStreamer.h b/include/llvm/MC/MCObjectStreamer.h
index 9d15086ac63..c9e577b7e29 100644
--- a/include/llvm/MC/MCObjectStreamer.h
+++ b/include/llvm/MC/MCObjectStreamer.h
@@ -138,7 +138,6 @@ public:
                                 unsigned PointerSize);
   void EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel,
                                  const MCSymbol *Label);
-  void EmitBTFAdvanceLineAddr(const MCSymbol *Label, unsigned Size);
   void EmitCVLocDirective(unsigned FunctionId, unsigned FileNo, unsigned Line,
                           unsigned Column, bool PrologueEnd, bool IsStmt,
                           StringRef FileName, SMLoc Loc) override;
diff --git a/lib/CodeGen/AsmPrinter/CMakeLists.txt b/lib/CodeGen/AsmPrinter/CMakeLists.txt
index 14c895a9c82..6cba4a0d4b8 100644
--- a/lib/CodeGen/AsmPrinter/CMakeLists.txt
+++ b/lib/CodeGen/AsmPrinter/CMakeLists.txt
@@ -17,7 +17,6 @@ add_llvm_library(LLVMAsmPrinter
   DwarfFile.cpp
   DwarfStringPool.cpp
   DwarfUnit.cpp
-  Dwarf2BTF.cpp
   EHStreamer.cpp
   ErlangGCPrinter.cpp
   OcamlGCPrinter.cpp
diff --git a/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp b/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp
deleted file mode 100644
index b3e6fce97b6..00000000000
--- a/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp
+++ /dev/null
@@ -1,501 +0,0 @@
-//===- Dwarf2BTF.cpp ------------------------------------------ *- C++ --*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "Dwarf2BTF.h"
-#include "DwarfUnit.h"
-#include "llvm/MC/MCBTFContext.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCSectionELF.h"
-
-namespace llvm {
-
-Die2BTFEntry::~Die2BTFEntry() {}
-
-unsigned char Die2BTFEntry::getDieKind(const DIE &Die) {
-  auto Tag = Die.getTag();
-
-  switch (Tag) {
-  case dwarf::DW_TAG_base_type:
-    if (getBaseTypeEncoding(Die) == BTF_INVALID_ENCODING)
-      return BTF_KIND_UNKN;
-    return BTF_KIND_INT;
-  case dwarf::DW_TAG_const_type:
-    return BTF_KIND_CONST;
-  case dwarf::DW_TAG_pointer_type:
-    return BTF_KIND_PTR;
-  case dwarf::DW_TAG_restrict_type:
-    return BTF_KIND_RESTRICT;
-  case dwarf::DW_TAG_volatile_type:
-    return BTF_KIND_VOLATILE;
-  case dwarf::DW_TAG_typedef:
-    return BTF_KIND_TYPEDEF;
-  case dwarf::DW_TAG_structure_type:
-  case dwarf::DW_TAG_class_type:
-    if (Die.findAttribute(dwarf::DW_AT_declaration).getType() !=
-        DIEValue::isNone)
-      return BTF_KIND_FWD;
-    else
-      return BTF_KIND_STRUCT;
-  case dwarf::DW_TAG_union_type:
-    if (Die.findAttribute(dwarf::DW_AT_declaration).getType() !=
-        DIEValue::isNone)
-      return BTF_KIND_FWD;
-    else
-      return BTF_KIND_UNION;
-  case dwarf::DW_TAG_enumeration_type:
-    return BTF_KIND_ENUM;
-  case dwarf::DW_TAG_array_type:
-    return BTF_KIND_ARRAY;
-  case dwarf::DW_TAG_subprogram:
-    return BTF_KIND_FUNC;
-  case dwarf::DW_TAG_subroutine_type:
-    return BTF_KIND_FUNC_PROTO;
-  default:
-    break;
-  }
-
-  return BTF_KIND_UNKN;
-}
-
-std::unique_ptr<Die2BTFEntry> Die2BTFEntry::dieToBTFTypeEntry(const DIE &Die) {
-  unsigned char Kind = getDieKind(Die);
-
-  switch (Kind) {
-  case BTF_KIND_INT:
-    return llvm::make_unique<Die2BTFEntryInt>(Die);
-  case BTF_KIND_PTR:
-  case BTF_KIND_TYPEDEF:
-  case BTF_KIND_VOLATILE:
-  case BTF_KIND_CONST:
-  case BTF_KIND_RESTRICT:
-  case BTF_KIND_FWD:
-    return llvm::make_unique<Die2BTFEntry>(Die);
-  case BTF_KIND_ARRAY:
-    return llvm::make_unique<Die2BTFEntryArray>(Die);
-  case BTF_KIND_STRUCT:
-  case BTF_KIND_UNION:
-    return llvm::make_unique<Die2BTFEntryStruct>(Die);
-  case BTF_KIND_ENUM:
-    return llvm::make_unique<Die2BTFEntryEnum>(Die);
-  case BTF_KIND_FUNC:
-  case BTF_KIND_FUNC_PROTO:
-    return llvm::make_unique<Die2BTFEntryFunc>(Die);
-  default:
-    break;
-  }
-  return nullptr;
-}
-
-bool Die2BTFEntry::shouldSkipDie(const DIE &Die) {
-  auto Tag = Die.getTag();
-
-  switch (Tag) {
-  case dwarf::DW_TAG_const_type:
-  case dwarf::DW_TAG_pointer_type:
-  case dwarf::DW_TAG_restrict_type:
-  case dwarf::DW_TAG_typedef:
-  case dwarf::DW_TAG_volatile_type: {
-    auto TypeV = Die.findAttribute(dwarf::DW_AT_type);
-    if (TypeV.getType() == DIEValue::isNone)
-      return false;
-    auto &TypeDie = TypeV.getDIEEntry().getEntry();
-    return Die2BTFEntry::shouldSkipDie(TypeDie);
-  }
-  default:
-    return getDieKind(Die) == BTF_KIND_UNKN;
-  }
-  return true;
-}
-unsigned char Die2BTFEntry::getBaseTypeEncoding(const DIE &Die) {
-  auto V = Die.findAttribute(dwarf::DW_AT_encoding);
-
-  if (V.getType() != DIEValue::isInteger)
-    return BTF_INVALID_ENCODING;
-
-  switch (V.getDIEInteger().getValue()) {
-  case dwarf::DW_ATE_boolean:
-    return BTF_INT_BOOL;
-  case dwarf::DW_ATE_signed:
-    return BTF_INT_SIGNED;
-  case dwarf::DW_ATE_signed_char:
-    return BTF_INT_CHAR;
-  case dwarf::DW_ATE_unsigned:
-    return 0;
-  case dwarf::DW_ATE_unsigned_char:
-    return BTF_INT_CHAR;
-  case dwarf::DW_ATE_imaginary_float:
-  case dwarf::DW_ATE_packed_decimal:
-  case dwarf::DW_ATE_numeric_string:
-  case dwarf::DW_ATE_edited:
-  case dwarf::DW_ATE_signed_fixed:
-  case dwarf::DW_ATE_address:
-  case dwarf::DW_ATE_complex_float:
-  case dwarf::DW_ATE_float:
-  default:
-    break;
-  }
-  return BTF_INVALID_ENCODING;
-}
-
-Die2BTFEntry::Die2BTFEntry(const DIE &Die) : Die(Die) {
-  unsigned char Kind = getDieKind(Die);
-
-  switch (Kind) {
-  case BTF_KIND_CONST:
-  case BTF_KIND_FWD:
-  case BTF_KIND_PTR:
-  case BTF_KIND_RESTRICT:
-  case BTF_KIND_TYPEDEF:
-  case BTF_KIND_VOLATILE:
-    break;
-  default:
-    llvm_unreachable("Invalid Die passed into BTFTypeEntry()");
-    break;
-  }
-
-  BTFType.info = (Kind & 0xf) << 24;
-}
-
-void Die2BTFEntry::completeData(class Dwarf2BTF &Dwarf2BTF) {
-  auto TypeV = Die.findAttribute(dwarf::DW_AT_type);
-  if (TypeV.getType() == DIEValue::isNone) {
-    BTFType.type = 0;
-  } else {
-    auto &TypeDie = TypeV.getDIEEntry().getEntry();
-    auto Type = Dwarf2BTF.getTypeIndex(TypeDie);
-    BTFType.type = Type;
-  }
-
-  unsigned char Kind = getDieKind(Die);
-  if (Kind != BTF_KIND_FWD) {
-    BTFType.name_off = 0;
-  } else {
-    auto NameV = Die.findAttribute(dwarf::DW_AT_name);
-    auto Str = NameV.getDIEString().getString();
-    BTFType.name_off = Dwarf2BTF.addBTFString(Str);
-  }
-
-  auto typeEntry = llvm::make_unique<BTFTypeEntry>(Id, BTFType);
-  Dwarf2BTF.addBTFTypeEntry(std::move(typeEntry));
-}
-
-Die2BTFEntryInt::Die2BTFEntryInt(const DIE &Die) : Die2BTFEntry(Die) {
-  unsigned char Kind = getDieKind(Die);
-
-  switch (Kind) {
-  case BTF_KIND_INT:
-    break;
-  default:
-    assert("Invalid Die passed into BTFTypeEntryInt()");
-    break;
-  }
-
-  // handle BTF_INT_ENCODING in IntVal
-  auto Encoding = Die2BTFEntry::getBaseTypeEncoding(Die);
-  assert((Encoding != BTF_INVALID_ENCODING) &&
-         "Invalid Die passed to BTFTypeEntryInt()");
-  uint32_t IntVal = (Encoding & 0xf) << 24;
-
-  // handle BTF_INT_OFFSET in IntVal
-  auto V = Die.findAttribute(dwarf::DW_AT_bit_offset);
-  if (V.getType() == DIEValue::isInteger)
-    IntVal |= (V.getDIEInteger().getValue() & 0xff) << 16;
-
-  // get btf_type.size
-  V = Die.findAttribute(dwarf::DW_AT_byte_size);
-  uint32_t Size = V.getDIEInteger().getValue() & 0xffffffff;
-
-  // handle BTF_INT_BITS in IntVal
-  V = Die.findAttribute(dwarf::DW_AT_bit_size);
-  if (V.getType() == DIEValue::isInteger)
-    IntVal |= V.getDIEInteger().getValue() & 0xff;
-  else
-    IntVal |= (Size << 3) & 0xff;
-
-  BTFType.info = BTF_KIND_INT << 24;
-  BTFType.size = Size;
-  this->IntVal = IntVal;
-}
-
-void Die2BTFEntryInt::completeData(class Dwarf2BTF &Dwarf2BTF) {
-  auto NameV = Die.findAttribute(dwarf::DW_AT_name);
-  auto TypeV = Die.findAttribute(dwarf::DW_AT_type);
-  auto Str = NameV.getDIEString().getString();
-
-  BTFType.name_off = Dwarf2BTF.addBTFString(Str);
-
-  auto typeEntry = llvm::make_unique<BTFTypeEntryInt>(Id, BTFType, IntVal);
-  Dwarf2BTF.addBTFTypeEntry(std::move(typeEntry));
-}
-
-Die2BTFEntryEnum::Die2BTFEntryEnum(const DIE &Die) : Die2BTFEntry(Die) {
-  // get btf_type.size
-  auto V = Die.findAttribute(dwarf::DW_AT_byte_size);
-  uint32_t Size = V.getDIEInteger().getValue() & 0xffffffff;
-
-  int Vlen = 0;
-  for (auto &ChildDie : Die.children())
-    if (ChildDie.getTag() == dwarf::DW_TAG_enumerator)
-      Vlen++;
-
-  BTFType.info = (BTF_KIND_ENUM << 24) | (Vlen & BTF_MAX_VLEN);
-  BTFType.type = Size;
-}
-
-void Die2BTFEntryEnum::completeData(class Dwarf2BTF &Dwarf2BTF) {
-  auto TypeV = Die.findAttribute(dwarf::DW_AT_type);
-  auto NameV = Die.findAttribute(dwarf::DW_AT_name);
-
-  if (NameV.getType() != DIEValue::isNone) {
-    auto Str = NameV.getDIEString().getString();
-    BTFType.name_off = Dwarf2BTF.addBTFString(Str);
-  } else
-    BTFType.name_off = 0;
-
-  for (auto &ChildDie : Die.children()) {
-    struct btf_enum BTFEnum;
-    auto ChildNameV = ChildDie.findAttribute(dwarf::DW_AT_name);
-    auto Str = ChildNameV.getDIEString().getString();
-
-    BTFEnum.name_off = Dwarf2BTF.addBTFString(Str);
-    auto ChildValueV = ChildDie.findAttribute(dwarf::DW_AT_const_value);
-    BTFEnum.val = (int32_t)(ChildValueV.getDIEInteger().getValue());
-
-    EnumValues.push_back(BTFEnum);
-  }
-
-  auto typeEntry = llvm::make_unique<BTFTypeEntryEnum>(Id, BTFType, EnumValues);
-  Dwarf2BTF.addBTFTypeEntry(std::move(typeEntry));
-}
-
-Die2BTFEntryArray::Die2BTFEntryArray(const DIE &Die) : Die2BTFEntry(Die) {
-  BTFType.info = (BTF_KIND_ARRAY << 24);
-  BTFType.size = 0;
-}
-
-void Die2BTFEntryArray::completeData(class Dwarf2BTF &Dwarf2BTF) {
-  auto NameV = Die.findAttribute(dwarf::DW_AT_name);
-
-  std::string Str;
-  if (NameV.getType() != DIEValue::isNone)
-    Str = NameV.getDIEString().getString();
-  BTFType.name_off = Dwarf2BTF.addBTFString(Str);
-
-  auto &ArrayTypeDie =
-      Die.findAttribute(dwarf::DW_AT_type).getDIEEntry().getEntry();
-  ArrayInfo.type = Dwarf2BTF.getTypeIndex(ArrayTypeDie);
-
-  // The number of elements should count all subranges
-  unsigned Nelems = 1;
-  bool IsFirstSubrange = true;
-  for (auto &ChildDie : Die.children()) {
-    if (ChildDie.getTag() == dwarf::DW_TAG_subrange_type) {
-      if (IsFirstSubrange) {
-        auto TypeV = ChildDie.findAttribute(dwarf::DW_AT_type);
-        auto &TypeDie = TypeV.getDIEEntry().getEntry();
-        ArrayInfo.index_type = Dwarf2BTF.getTypeIndex(TypeDie);
-        IsFirstSubrange = false;
-      }
-      auto CountV = ChildDie.findAttribute(dwarf::DW_AT_count);
-      if (CountV.getType() == DIEValue::isNone) {
-        // array like a[] which essentially a pointer
-        Nelems = 0;
-        break;
-      }
-      Nelems *= (uint32_t)(CountV.getDIEInteger().getValue());
-    }
-  }
-  ArrayInfo.nelems = Nelems;
-
-  auto TypeEntry = llvm::make_unique<BTFTypeEntryArray>(Id, BTFType, ArrayInfo);
-  Dwarf2BTF.addBTFTypeEntry(std::move(TypeEntry));
-}
-
-Die2BTFEntryStruct::Die2BTFEntryStruct(const DIE &Die) : Die2BTFEntry(Die) {
-  // get btf_type.size
-  auto V = Die.findAttribute(dwarf::DW_AT_byte_size);
-  uint32_t Size = V.getDIEInteger().getValue() & 0xffffffff;
-  auto Kind = Die2BTFEntry::getDieKind(Die);
-
-  int Vlen = 0;
-  for (auto &ChildDie : Die.children())
-    if (ChildDie.getTag() == dwarf::DW_TAG_member)
-      Vlen++;
-
-  BTFType.size = Size;
-  BTFType.info = (Kind << 24) | (Vlen & BTF_MAX_VLEN);
-}
-
-void Die2BTFEntryStruct::completeData(class Dwarf2BTF &Dwarf2BTF) {
-  auto NameV = Die.findAttribute(dwarf::DW_AT_name);
-
-  if (NameV.getType() != DIEValue::isNone) {
-    auto Str = NameV.getDIEString().getString();
-    BTFType.name_off = Dwarf2BTF.addBTFString(Str);
-  } else
-    BTFType.name_off = 0;
-
-  for (auto &ChildDie : Die.children()) {
-    if (ChildDie.getTag() != dwarf::DW_TAG_member)
-      continue;
-
-    struct btf_member BTFMember;
-    auto ChildNameV = ChildDie.findAttribute(dwarf::DW_AT_name);
-
-    if (ChildNameV.getType() != DIEValue::isNone) {
-      auto Str = ChildNameV.getDIEString().getString();
-      BTFMember.name_off = Dwarf2BTF.addBTFString(Str);
-    } else
-      BTFMember.name_off = 0;
-
-    auto TypeV = ChildDie.findAttribute(dwarf::DW_AT_type);
-    auto &TypeDie = TypeV.getDIEEntry().getEntry();
-    BTFMember.type = Dwarf2BTF.getTypeIndex(TypeDie);
-
-    auto MemLocV = ChildDie.findAttribute(dwarf::DW_AT_data_member_location);
-    unsigned MemLoc = MemLocV.getDIEInteger().getValue() * 8;
-
-    auto ByteSizeV = ChildDie.findAttribute(dwarf::DW_AT_byte_size);
-    if (ByteSizeV.getType() != DIEValue::isNone) {
-      unsigned ByteSize = ByteSizeV.getDIEInteger().getValue();
-      auto BitOffsetV = ChildDie.findAttribute(dwarf::DW_AT_bit_offset);
-      unsigned BitOffset = BitOffsetV.getDIEInteger().getValue();
-      auto BitSizeV = ChildDie.findAttribute(dwarf::DW_AT_bit_size);
-      unsigned BitSize = BitSizeV.getDIEInteger().getValue();
-      if (Dwarf2BTF.isLittleEndian())
-        MemLoc += ByteSize * 8 - BitSize - BitOffset;
-      else
-        MemLoc += BitOffset;
-    }
-    BTFMember.offset = MemLoc;
-
-    Members.push_back(BTFMember);
-  }
-
-  auto typeEntry = llvm::make_unique<BTFTypeEntryStruct>(Id, BTFType, Members);
-  Dwarf2BTF.addBTFTypeEntry(std::move(typeEntry));
-}
-
-Die2BTFEntryFunc::Die2BTFEntryFunc(const DIE &Die) : Die2BTFEntry(Die) {
-  auto Kind = Die2BTFEntry::getDieKind(Die);
-
-  int Vlen = 0;
-  for (auto &ChildDie : Die.children())
-    if (ChildDie.getTag() == dwarf::DW_TAG_formal_parameter)
-      Vlen++;
-
-  BTFType.size = 0;
-  BTFType.info = (Kind << 24) | (Vlen & BTF_MAX_VLEN);
-}
-
-void Die2BTFEntryFunc::completeData(class Dwarf2BTF &Dwarf2BTF) {
-  auto NameV = Die.findAttribute(dwarf::DW_AT_name);
-  if (NameV.getType() == DIEValue::isNone) {
-    BTFType.name_off = 0;
-  } else {
-    auto Str = NameV.getDIEString().getString();
-    BTFType.name_off = Dwarf2BTF.addBTFString(Str);
-  }
-
-  auto RetTypeV = Die.findAttribute(dwarf::DW_AT_type);
-  if (RetTypeV.getType() != DIEValue::isNone) {
-    auto &TypeDie = RetTypeV.getDIEEntry().getEntry();
-    BTFType.type = Dwarf2BTF.getTypeIndex(TypeDie);
-  } else {
-    BTFType.type = 0;
-  }
-
-  for (auto &ChildDie : Die.children()) {
-    if (ChildDie.getTag() == dwarf::DW_TAG_formal_parameter) {
-      auto TypeV = ChildDie.findAttribute(dwarf::DW_AT_abstract_origin);
-      if (TypeV.getType() != DIEValue::isNone) {
-        auto &AbsOriginDie = TypeV.getDIEEntry().getEntry();
-        assert(AbsOriginDie.getTag() == dwarf::DW_TAG_formal_parameter);
-        TypeV = AbsOriginDie.findAttribute(dwarf::DW_AT_type);
-      } else {
-        TypeV = ChildDie.findAttribute(dwarf::DW_AT_type);
-      }
-      auto &TypeDie = TypeV.getDIEEntry().getEntry();
-      Parameters.push_back(Dwarf2BTF.getTypeIndex(TypeDie));
-    } else if (ChildDie.getTag() == dwarf::DW_TAG_unspecified_parameters) {
-      Parameters.push_back(0);
-    }
-  }
-
-  auto typeEntry = llvm::make_unique<BTFTypeEntryFunc>(Id, BTFType, Parameters);
-  Dwarf2BTF.addBTFTypeEntry(std::move(typeEntry));
-
-  if (BTF_INFO_KIND(BTFType.info) == BTF_KIND_FUNC) {
-    auto LowPCV = Die.findAttribute(dwarf::DW_AT_low_pc);
-    if (LowPCV.getType() != DIEValue::isNone) {
-      const MCSymbol *Label = LowPCV.getDIELabel().getValue();
-      BTFFuncInfo FuncInfo;
-      unsigned SecNameOff;
-
-      FuncInfo.Label = Label;
-      FuncInfo.TypeId = Id;
-      if (Label->isInSection()) {
-        MCSection &Section = Label->getSection();
-        MCSectionELF *SectionELF = dyn_cast<MCSectionELF>(&Section);
-        assert(SectionELF);
-        SecNameOff = Dwarf2BTF.addBTFString(SectionELF->getSectionName().str());
-      } else {
-        SecNameOff = Dwarf2BTF.addBTFString(".text");
-      }
-      Dwarf2BTF.addBTFFuncInfo(SecNameOff, FuncInfo);
-    }
-  }
-}
-
-Dwarf2BTF::Dwarf2BTF(MCContext &Context, bool IsLittleEndian)
-    : OuterCtx(Context), IsLE(IsLittleEndian) {
-  BTFContext = llvm::make_unique<MCBTFContext>();
-}
-
-void Dwarf2BTF::addTypeEntry(const DIE &Die) {
-  for (auto &ChildDie : Die.children())
-    addTypeEntry(ChildDie);
-  if (Die2BTFEntry::shouldSkipDie(Die))
-    return;
-  auto Kind = Die2BTFEntry::getDieKind(Die);
-  if (Kind != BTF_KIND_UNKN) {
-    auto TypeEntry = Die2BTFEntry::dieToBTFTypeEntry(Die);
-    if (TypeEntry != nullptr) {
-      TypeEntry->setId(TypeEntries.size() + 1);
-      DieToIdMap[const_cast<DIE *>(&Die)] = TypeEntry->getId();
-      TypeEntries.push_back(std::move(TypeEntry));
-    }
-  }
-}
-
-void Dwarf2BTF::addBTFTypeEntry(std::unique_ptr<BTFTypeEntry> Entry) {
-  BTFContext->addTypeEntry(std::move(Entry));
-}
-
-void Dwarf2BTF::completeData() {
-  BTFContext->addString("\0");
-
-  for (auto &TypeEntry : TypeEntries)
-    TypeEntry->completeData(*this);
-}
-
-void Dwarf2BTF::addDwarfCU(DwarfUnit *TheU) {
-  DIE &CuDie = TheU->getUnitDie();
-
-  assert((CuDie.getTag() == dwarf::DW_TAG_compile_unit) &&
-         "Not a compile unit");
-  addTypeEntry(CuDie);
-}
-
-void Dwarf2BTF::finish() {
-  completeData();
-  OuterCtx.setBTFContext(std::move(BTFContext));
-}
-
-} // namespace llvm
diff --git a/lib/CodeGen/AsmPrinter/Dwarf2BTF.h b/lib/CodeGen/AsmPrinter/Dwarf2BTF.h
deleted file mode 100644
index ae13847214c..00000000000
--- a/lib/CodeGen/AsmPrinter/Dwarf2BTF.h
+++ /dev/null
@@ -1,133 +0,0 @@
-//===- Dwarf2BTF.h -------------------------------------------- *- C++ --*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARF2BTF_H
-#define LLVM_LIB_CODEGEN_ASMPRINTER_DWARF2BTF_H
-
-#include "DwarfUnit.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/CodeGen/AsmPrinter.h"
-#include "llvm/CodeGen/DIE.h"
-#include "llvm/MC/MCBTFContext.h"
-#include <map>
-
-namespace llvm {
-
-class Dwarf2BTF;
-class MCBTFContext;
-
-#define BTF_INVALID_ENCODING 0xff
-
-class Die2BTFEntry {
-protected:
-  const DIE &Die;
-  size_t Id; /* type index in the BTF list, started from 1 */
-  struct btf_type BTFType;
-
-public:
-  virtual ~Die2BTFEntry();
-  // Return desired BTF_KIND for the Die, return BTF_KIND_UNKN for
-  // invalid/unsupported Die
-  static unsigned char getDieKind(const DIE &Die);
-
-  // Return proper BTF_INT_ENCODING of a basetype.
-  // Return BTF_INVALID_ENCODING for unsupported (float, etc.)
-  static unsigned char getBaseTypeEncoding(const DIE &Die);
-
-  // Return whether this Die should be skipped.
-  // We currently skip unsupported data type (e.g. float)
-  // and references to unsupported types
-  static bool shouldSkipDie(const DIE &Die);
-
-  static std::unique_ptr<Die2BTFEntry> dieToBTFTypeEntry(const DIE &Die);
-
-  Die2BTFEntry(const DIE &Die);
-  void setId(size_t Id) { this->Id = Id; }
-  size_t getId() { return Id; }
-  virtual void completeData(class Dwarf2BTF &Dwarf2BTF);
-};
-
-// BTF_KIND_INT
-class Die2BTFEntryInt : public Die2BTFEntry {
-  uint32_t IntVal; // encoding, offset, bits
-
-public:
-  Die2BTFEntryInt(const DIE &Die);
-  void completeData(class Dwarf2BTF &Dwarf2BTF);
-};
-
-// BTF_KIND_ENUM
-class Die2BTFEntryEnum : public Die2BTFEntry {
-  std::vector<struct btf_enum> EnumValues;
-
-public:
-  Die2BTFEntryEnum(const DIE &Die);
-  void completeData(class Dwarf2BTF &Dwarf2BTF);
-};
-
-// BTF_KIND_ARRAY
-class Die2BTFEntryArray : public Die2BTFEntry {
-  struct btf_array ArrayInfo;
-
-public:
-  Die2BTFEntryArray(const DIE &Die);
-  void completeData(class Dwarf2BTF &Dwarf2BTF);
-};
-
-// BTF_KIND_STRUCT and BTF_KIND_UNION
-class Die2BTFEntryStruct : public Die2BTFEntry {
-  std::vector<struct btf_member> Members;
-
-public:
-  Die2BTFEntryStruct(const DIE &Die);
-  void completeData(class Dwarf2BTF &Dwarf2BTF);
-};
-
-// BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO
-class Die2BTFEntryFunc : public Die2BTFEntry {
-  std::vector<uint32_t> Parameters;
-
-public:
-  Die2BTFEntryFunc(const DIE &Die);
-  void completeData(class Dwarf2BTF &Dwarf2BTF);
-};
-
-class Dwarf2BTF {
-  std::vector<std::unique_ptr<Die2BTFEntry>> TypeEntries;
-  std::map<DIE *, size_t> DieToIdMap;
-  std::unique_ptr<MCBTFContext> BTFContext;
-  MCContext &OuterCtx;
-  bool IsLE;
-
-public:
-  Dwarf2BTF(MCContext &Context, bool IsLittleEndian);
-  bool isLittleEndian() { return IsLE; }
-  void addDwarfCU(DwarfUnit *TheU);
-  void finish();
-  uint32_t getTypeIndex(DIE &Die) {
-    DIE *DiePtr = const_cast<DIE *>(&Die);
-    assert((DieToIdMap.find(DiePtr) != DieToIdMap.end()) &&
-           "Die not added to in the BTFContext");
-    return DieToIdMap[DiePtr];
-  }
-  size_t addBTFString(std::string S) { return BTFContext->addString(S); }
-  void addBTFTypeEntry(std::unique_ptr<BTFTypeEntry> Entry);
-  void addBTFFuncInfo(unsigned SecNameOff, BTFFuncInfo FuncInfo) {
-    BTFContext->addFuncInfo(SecNameOff, FuncInfo);
-  }
-
-private:
-  void addTypeEntry(const DIE &Die);
-  bool alreadyAdded(DIE &Die) {
-    return DieToIdMap.find(const_cast<DIE *>(&Die)) != DieToIdMap.end();
-  }
-  void completeData();
-};
-
-} // namespace llvm
-#endif
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 184ec4dabe9..94e12658cfe 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -971,10 +971,6 @@ void DwarfDebug::endModule() {
   // Emit the pubnames and pubtypes sections if requested.
   emitDebugPubSections();
 
-  const Triple &TT = Asm->TM.getTargetTriple();
-  if (TT.getArch() == Triple::bpfel || TT.getArch() == Triple::bpfeb)
-    emitBTFSection(TT.getArch() == Triple::bpfel);
-
   // clean up.
   // FIXME: AbstractVariables.clear();
 }
@@ -2459,12 +2455,6 @@ MCDwarfDwoLineTable *DwarfDebug::getDwoLineTable(const DwarfCompileUnit &CU) {
   return &SplitTypeUnitFileTable;
 }
 
-void DwarfDebug::emitBTFSection(bool IsLittleEndian) {
-  DwarfFile &Holder = useSplitDwarf() ? SkeletonHolder : InfoHolder;
-
-  Holder.emitBTFSection(IsLittleEndian);
-}
-
 uint64_t DwarfDebug::makeTypeSignature(StringRef Identifier) {
   MD5 Hash;
   Hash.update(Identifier);
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h
index 1350317db02..fecf8056765 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -567,9 +567,6 @@ class DwarfDebug : public DebugHandlerBase {
   /// Emit the reference to the section.
   void emitSectionReference(const DwarfCompileUnit &CU);
 
-  // Emit the BTF sections
-  void emitBTFSection(bool IsLittleEndian);
-
 protected:
   /// Gather pre-function debug information.
   void beginFunctionImpl(const MachineFunction *MF) override;
diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.cpp b/lib/CodeGen/AsmPrinter/DwarfFile.cpp
index 7ac16b34c4c..0ab9ea87c23 100644
--- a/lib/CodeGen/AsmPrinter/DwarfFile.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfFile.cpp
@@ -7,7 +7,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "Dwarf2BTF.h"
 #include "DwarfFile.h"
 #include "DwarfCompileUnit.h"
 #include "DwarfDebug.h"
@@ -16,8 +15,6 @@
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/CodeGen/DIE.h"
 #include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/MC/MCBTFContext.h"
-#include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCStreamer.h"
 #include <algorithm>
 #include <cstdint>
@@ -91,13 +88,6 @@ void DwarfFile::emitStrings(MCSection *StrSection, MCSection *OffsetSection,
   StrPool.emit(*Asm, StrSection, OffsetSection, UseRelativeOffsets);
 }
 
-void DwarfFile::emitBTFSection(bool IsLittleEndian) {
-  Dwarf2BTF Dwarf2BTF(Asm->OutContext, IsLittleEndian);
-  for (auto &TheU : CUs)
-    Dwarf2BTF.addDwarfCU(TheU.get());
-  Dwarf2BTF.finish();
-}
-
 bool DwarfFile::addScopeVariable(LexicalScope *LS, DbgVariable *Var) {
   auto &ScopeVars = ScopeVariables[LS];
   const DILocalVariable *DV = Var->getVariable();
diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.h b/lib/CodeGen/AsmPrinter/DwarfFile.h
index 114f98f725d..c315f44a8d8 100644
--- a/lib/CodeGen/AsmPrinter/DwarfFile.h
+++ b/lib/CodeGen/AsmPrinter/DwarfFile.h
@@ -114,9 +114,6 @@ public:
   void emitStrings(MCSection *StrSection, MCSection *OffsetSection = nullptr,
                    bool UseRelativeOffsets = false);
 
-  // Emit all data for the BTF section
-  void emitBTFSection(bool IsLittleEndian);
-
   /// Returns the string pool.
   DwarfStringPool &getStringPool() { return StrPool; }
 
@@ -137,7 +134,9 @@ public:
     return ScopeVariables;
   }
 
-  DenseMap<LexicalScope *, LabelList> &getScopeLabels() { return ScopeLabels; }
+  DenseMap<LexicalScope *, LabelList> &getScopeLabels() {
+    return ScopeLabels;
+  }
 
   DenseMap<const MDNode *, DIE *> &getAbstractSPDies() {
     return AbstractSPDies;
diff --git a/lib/MC/CMakeLists.txt b/lib/MC/CMakeLists.txt
index 85bf1616fd6..ba36d99e8f7 100644
--- a/lib/MC/CMakeLists.txt
+++ b/lib/MC/CMakeLists.txt
@@ -10,13 +10,11 @@ add_llvm_library(LLVMMC
   MCAsmMacro.cpp
   MCAsmStreamer.cpp
   MCAssembler.cpp
-  MCBTFContext.cpp
   MCCodeEmitter.cpp
   MCCodePadder.cpp
   MCCodeView.cpp
   MCContext.cpp
   MCDwarf.cpp
-  MCDwarf2BTF.cpp
   MCELFObjectTargetWriter.cpp
   MCELFStreamer.cpp
   MCExpr.cpp
diff --git a/lib/MC/MCBTFContext.cpp b/lib/MC/MCBTFContext.cpp
deleted file mode 100644
index cb121c41552..00000000000
--- a/lib/MC/MCBTFContext.cpp
+++ /dev/null
@@ -1,229 +0,0 @@
-//===- lib/MC/MCBTFContext.cpp - Machine Code BTF Context -----------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/MC/MCBTFContext.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCObjectFileInfo.h"
-#include "llvm/MC/MCObjectStreamer.h"
-#include <cstdlib>
-#include <tuple>
-#include <utility>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "btf"
-
-BTFTypeEntry::~BTFTypeEntry() {}
-
-void MCBTFContext::addTypeEntry(std::unique_ptr<BTFTypeEntry> Entry) {
-  TypeEntries.push_back(std::move(Entry));
-}
-
-void MCBTFContext::dump(raw_ostream &OS) {
-  OS << "Type Table:\n";
-  for (size_t i = 0; i < TypeEntries.size(); i++) {
-    auto TypeEntry = TypeEntries[i].get();
-    TypeEntry->print(OS, *this);
-  }
-
-  OS << "\nString Table:\n";
-  StringTable.showTable(OS);
-
-  OS << "\nFuncInfo Table:\n";
-  for (auto &FuncSec : FuncInfoTable) {
-    OS << "sec_name_off=" << FuncSec.first << "\n";
-    for (auto &FuncInfo : FuncSec.second) {
-      OS << "\tinsn_offset=<Omitted> type_id=" << FuncInfo.TypeId << "\n";
-    }
-  }
-
-  OS << "\nLineInfo Table:\n";
-  for (auto &LineSec : LineInfoTable) {
-    OS << "sec_name_off=" << LineSec.first << "\n";
-    for (auto &LineInfo : LineSec.second) {
-      OS << "\tinsn_offset=<Omitted> file_name_off=" << LineInfo.FileNameOff
-         << " line_off=" << LineInfo.LineOff << " line_num=" << LineInfo.LineNum
-         << " column_num=" << LineInfo.ColumnNum << "\n";
-    }
-  }
-}
-
-void MCBTFContext::emitCommonHeader(MCObjectStreamer *MCOS) {
-  MCOS->EmitIntValue(BTF_MAGIC, 2);
-  MCOS->EmitIntValue(BTF_VERSION, 1);
-  MCOS->EmitIntValue(0, 1);
-}
-
-void MCBTFContext::emitBTFSection(MCObjectStreamer *MCOS) {
-  MCContext &context = MCOS->getContext();
-  MCOS->SwitchSection(context.getObjectFileInfo()->getBTFSection());
-
-  // emit header
-  emitCommonHeader(MCOS);
-  MCOS->EmitIntValue(sizeof(struct btf_header), 4);
-
-  uint32_t type_len = 0, str_len;
-  for (auto &TypeEntry : TypeEntries)
-    type_len += TypeEntry->getSize();
-  str_len = StringTable.getSize();
-
-  MCOS->EmitIntValue(0, 4);
-  MCOS->EmitIntValue(type_len, 4);
-  MCOS->EmitIntValue(type_len, 4);
-  MCOS->EmitIntValue(str_len, 4);
-
-  // emit type table
-  for (auto &TypeEntry : TypeEntries)
-    TypeEntry->emitData(MCOS);
-
-  // emit string table
-  for (auto &S : StringTable.getTable()) {
-    for (auto C : S)
-      MCOS->EmitIntValue(C, 1);
-    MCOS->EmitIntValue('\0', 1);
-  }
-}
-
-void MCBTFContext::emitBTFExtSection(MCObjectStreamer *MCOS) {
-  MCContext &context = MCOS->getContext();
-  MCOS->SwitchSection(context.getObjectFileInfo()->getBTFExtSection());
-
-  // emit header
-  emitCommonHeader(MCOS);
-  MCOS->EmitIntValue(sizeof(struct btf_ext_header), 4);
-
-  uint32_t func_len = 0, line_len = 0;
-  for (auto &FuncSec : FuncInfoTable) {
-    func_len += sizeof(struct btf_sec_func_info);
-    func_len += FuncSec.second.size() * sizeof(struct bpf_func_info);
-  }
-  for (auto &LineSec : LineInfoTable) {
-    line_len += sizeof(struct btf_sec_line_info);
-    line_len += LineSec.second.size() * sizeof(struct bpf_line_info);
-  }
-
-  MCOS->EmitIntValue(0, 4);
-  MCOS->EmitIntValue(func_len, 4);
-  MCOS->EmitIntValue(func_len, 4);
-  MCOS->EmitIntValue(line_len, 4);
-
-  // emit func_info table
-  for (const auto &FuncSec : FuncInfoTable) {
-    MCOS->EmitIntValue(FuncSec.first, 4);
-    MCOS->EmitIntValue(FuncSec.second.size(), 4);
-    for (const auto &FuncInfo : FuncSec.second) {
-      MCOS->EmitBTFAdvanceLineAddr(FuncInfo.Label, 4);
-      MCOS->EmitIntValue(FuncInfo.TypeId, 4);
-    }
-  }
-
-  // emit line_info table
-  for (const auto &LineSec : LineInfoTable) {
-    MCOS->EmitIntValue(LineSec.first, 4);
-    MCOS->EmitIntValue(LineSec.second.size(), 4);
-    for (const auto &LineInfo : LineSec.second) {
-      MCOS->EmitBTFAdvanceLineAddr(LineInfo.Label, 4);
-      MCOS->EmitIntValue(LineInfo.FileNameOff, 4);
-      MCOS->EmitIntValue(LineInfo.LineOff, 4);
-      MCOS->EmitIntValue(LineInfo.LineNum << 10 | LineInfo.ColumnNum, 4);
-    }
-  }
-}
-
-void MCBTFContext::emitAll(MCObjectStreamer *MCOS) {
-  LLVM_DEBUG(dump(dbgs()));
-  emitBTFSection(MCOS);
-  emitBTFExtSection(MCOS);
-}
-
-void BTFTypeEntry::print(raw_ostream &OS, MCBTFContext &MCBTFContext) {
-  OS << "[" << Id << "] " << btf_kind_str[BTF_INFO_KIND(BTFType.info)]
-     << " name_off=" << BTFType.name_off
-     << " info=" << format("0x%08lx", BTFType.info)
-     << " size/type=" << BTFType.size << "\n";
-}
-
-void BTFTypeEntry::emitData(MCObjectStreamer *MCOS) {
-  MCOS->EmitIntValue(BTFType.name_off, 4);
-  MCOS->EmitIntValue(BTFType.info, 4);
-  MCOS->EmitIntValue(BTFType.size, 4);
-}
-
-void BTFTypeEntryInt::print(raw_ostream &OS, MCBTFContext &MCBTFContext) {
-  BTFTypeEntry::print(OS, MCBTFContext);
-  OS << "\tdesc=" << format("0x%08lx", IntVal) << "\n";
-}
-
-void BTFTypeEntryInt::emitData(MCObjectStreamer *MCOS) {
-  BTFTypeEntry::emitData(MCOS);
-  MCOS->EmitIntValue(IntVal, 4);
-}
-
-void BTFTypeEntryEnum::print(raw_ostream &OS, MCBTFContext &MCBTFContext) {
-  BTFTypeEntry::print(OS, MCBTFContext);
-  for (size_t i = 0; i < BTF_INFO_VLEN(BTFType.info); i++) {
-    auto &EnumValue = EnumValues[i];
-    OS << "\tname_off=" << EnumValue.name_off << " value=" << EnumValue.val
-       << "\n";
-  }
-}
-
-void BTFTypeEntryEnum::emitData(MCObjectStreamer *MCOS) {
-  BTFTypeEntry::emitData(MCOS);
-  for (auto &EnumValue : EnumValues) {
-    MCOS->EmitIntValue(EnumValue.name_off, 4);
-    MCOS->EmitIntValue(EnumValue.val, 4);
-  }
-}
-
-void BTFTypeEntryArray::print(raw_ostream &OS, MCBTFContext &MCBTFContext) {
-  BTFTypeEntry::print(OS, MCBTFContext);
-  OS << "\telem_type=" << format("0x%08lx", ArrayInfo.type)
-     << " index_type=" << format("0x%08lx", ArrayInfo.index_type)
-     << " num_element=" << ArrayInfo.nelems << "\n";
-}
-
-void BTFTypeEntryArray::emitData(MCObjectStreamer *MCOS) {
-  BTFTypeEntry::emitData(MCOS);
-  MCOS->EmitIntValue(ArrayInfo.type, 4);
-  MCOS->EmitIntValue(ArrayInfo.index_type, 4);
-  MCOS->EmitIntValue(ArrayInfo.nelems, 4);
-}
-
-void BTFTypeEntryStruct::print(raw_ostream &OS, MCBTFContext &MCBTFContext) {
-  BTFTypeEntry::print(OS, MCBTFContext);
-  for (size_t i = 0; i < BTF_INFO_VLEN(BTFType.info); i++) {
-    auto &Member = Members[i];
-    OS << "\tname_off=" << Member.name_off << " type=" << Member.type
-       << " bit_offset=" << Member.offset << "\n";
-  }
-}
-
-void BTFTypeEntryStruct::emitData(MCObjectStreamer *MCOS) {
-  BTFTypeEntry::emitData(MCOS);
-  for (auto &Member : Members) {
-    MCOS->EmitIntValue(Member.name_off, 4);
-    MCOS->EmitIntValue(Member.type, 4);
-    MCOS->EmitIntValue(Member.offset, 4);
-  }
-}
-
-void BTFTypeEntryFunc::print(raw_ostream &OS, MCBTFContext &MCBTFContext) {
-  BTFTypeEntry::print(OS, MCBTFContext);
-  for (size_t i = 0; i < BTF_INFO_VLEN(BTFType.info); i++) {
-    auto Parameter = Parameters[i];
-    OS << "\tparam_type=" << Parameter << "\n";
-  }
-}
-
-void BTFTypeEntryFunc::emitData(MCObjectStreamer *MCOS) {
-  BTFTypeEntry::emitData(MCOS);
-  for (auto &Parameter : Parameters)
-    MCOS->EmitIntValue(Parameter, 4);
-}
diff --git a/lib/MC/MCContext.cpp b/lib/MC/MCContext.cpp
index 18250a474b7..fab517075c5 100644
--- a/lib/MC/MCContext.cpp
+++ b/lib/MC/MCContext.cpp
@@ -17,7 +17,6 @@
 #include "llvm/BinaryFormat/COFF.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCBTFContext.h"
 #include "llvm/MC/MCCodeView.h"
 #include "llvm/MC/MCDwarf.h"
 #include "llvm/MC/MCExpr.h"
@@ -61,7 +60,7 @@ MCContext::MCContext(const MCAsmInfo *mai, const MCRegisterInfo *mri,
     : SrcMgr(mgr), InlineSrcMgr(nullptr), MAI(mai), MRI(mri), MOFI(mofi),
       Symbols(Allocator), UsedNames(Allocator),
       CurrentDwarfLoc(0, 0, 0, DWARF2_FLAG_IS_STMT, 0, 0),
-      AutoReset(DoAutoReset), BTFCtx(nullptr) {
+      AutoReset(DoAutoReset) {
   SecureLogFile = AsSecureLogFileName;
 
   if (SrcMgr && SrcMgr->getNumBuffers())
@@ -115,14 +114,6 @@ void MCContext::reset() {
   GenDwarfFileNumber = 0;
 
   HadError = false;
-  BTFCtx.reset();
-}
-
-//===----------------------------------------------------------------------===//
-// BTFCtx Manipulation
-//===----------------------------------------------------------------------===//
-void MCContext::setBTFContext(std::unique_ptr<MCBTFContext> Ctx) {
-  BTFCtx = std::move(Ctx);
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/MC/MCDwarf2BTF.cpp b/lib/MC/MCDwarf2BTF.cpp
deleted file mode 100644
index 9809a2153ec..00000000000
--- a/lib/MC/MCDwarf2BTF.cpp
+++ /dev/null
@@ -1,102 +0,0 @@
-//===- MCDwarf2BTF.cpp ---------------------------------------- *- C++ --*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "MCDwarf2BTF.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCBTFContext.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCObjectStreamer.h"
-#include "llvm/MC/MCSection.h"
-#include "llvm/MC/MCSectionELF.h"
-#include "llvm/Support/Endian.h"
-#include "llvm/Support/EndianStream.h"
-#include <fstream>
-
-using namespace llvm;
-
-void MCDwarf2BTF::addFiles(MCObjectStreamer *MCOS, std::string &FileName,
-                           std::vector<FileContent> &Files) {
-  std::vector<std::string> Content;
-
-  std::ifstream Inputfile(FileName);
-  std::string Line;
-  Content.push_back(Line); // line 0 for empty string
-  while (std::getline(Inputfile, Line))
-    Content.push_back(Line);
-
-  Files.push_back(FileContent(FileName, Content));
-}
-
-void MCDwarf2BTF::addLines(
-    MCObjectStreamer *MCOS, StringRef &SectionName,
-    std::vector<FileContent> &Files,
-    const MCLineSection::MCDwarfLineEntryCollection &LineEntries) {
-  MCContext &Context = MCOS->getContext();
-  auto &BTFCxt = Context.getBTFContext();
-
-  unsigned SecNameOff = BTFCxt->addString(SectionName.str());
-  for (const MCDwarfLineEntry &LineEntry : LineEntries) {
-    BTFLineInfo LineInfo;
-    unsigned FileNum = LineEntry.getFileNum();
-    unsigned Line = LineEntry.getLine();
-
-    LineInfo.Label = LineEntry.getLabel();
-    if (FileNum < Files.size()) {
-      LineInfo.FileNameOff = BTFCxt->addString(Files[FileNum].first);
-      if (Line < Files[FileNum].second.size())
-        LineInfo.LineOff = BTFCxt->addString(Files[FileNum].second[Line]);
-      else
-        LineInfo.LineOff = 0;
-    } else {
-      LineInfo.FileNameOff = 0;
-      LineInfo.LineOff = 0;
-    }
-    LineInfo.LineNum = Line;
-    LineInfo.ColumnNum = LineEntry.getColumn();
-    BTFCxt->addLineInfo(SecNameOff, LineInfo);
-  }
-}
-
-void MCDwarf2BTF::addDwarfLineInfo(MCObjectStreamer *MCOS) {
-  MCContext &Context = MCOS->getContext();
-
-  auto &LineTables = Context.getMCDwarfLineTables();
-  if (LineTables.empty())
-    return;
-
-  for (const auto &CUIDTablePair : LineTables) {
-    std::vector<std::string> Dirs;
-    std::vector<FileContent> Files;
-
-    for (auto &Dir : CUIDTablePair.second.getMCDwarfDirs())
-      Dirs.push_back(Dir);
-    for (auto &File : CUIDTablePair.second.getMCDwarfFiles()) {
-      std::string FileName;
-      if (File.DirIndex == 0)
-        FileName = File.Name;
-      else
-        FileName = Dirs[File.DirIndex - 1] + "/" + File.Name;
-      MCDwarf2BTF::addFiles(MCOS, FileName, Files);
-    }
-    for (const auto &LineSec :
-         CUIDTablePair.second.getMCLineSections().getMCLineEntries()) {
-      MCSection *Section = LineSec.first;
-      const MCLineSection::MCDwarfLineEntryCollection &LineEntries =
-          LineSec.second;
-
-      StringRef SectionName;
-      if (MCSectionELF *SectionELF = dyn_cast<MCSectionELF>(Section))
-        SectionName = SectionELF->getSectionName();
-      else
-        return;
-      MCDwarf2BTF::addLines(MCOS, SectionName, Files, LineEntries);
-    }
-  }
-}
diff --git a/lib/MC/MCDwarf2BTF.h b/lib/MC/MCDwarf2BTF.h
deleted file mode 100644
index 69983374a09..00000000000
--- a/lib/MC/MCDwarf2BTF.h
+++ /dev/null
@@ -1,30 +0,0 @@
-//===- MCDwarf2BTF.h ------------------------------------------ *- C++ --*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_MC_MCDWARF2BTF_H
-#define LLVM_LIB_MC_MCDWARF2BTF_H
-
-#include "llvm/MC/MCDwarf.h"
-
-namespace llvm {
-
-using FileContent = std::pair<std::string, std::vector<std::string>>;
-
-class MCDwarf2BTF {
-public:
-  static void addFiles(MCObjectStreamer *MCOS, std::string &FileName,
-                       std::vector<FileContent> &Files);
-  static void
-  addLines(MCObjectStreamer *MCOS, StringRef &SectionName,
-           std::vector<FileContent> &Files,
-           const MCLineSection::MCDwarfLineEntryCollection &LineEntries);
-  static void addDwarfLineInfo(MCObjectStreamer *MCOS);
-};
-
-} // namespace llvm
-#endif
diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp
index bddcf459ac0..edfccfcb9ed 100644
--- a/lib/MC/MCObjectFileInfo.cpp
+++ b/lib/MC/MCObjectFileInfo.cpp
@@ -468,9 +468,6 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(const Triple &T, bool Large) {
       Ctx->getELFSection(".eh_frame", EHSectionType, EHSectionFlags);
 
   StackSizesSection = Ctx->getELFSection(".stack_sizes", ELF::SHT_PROGBITS, 0);
-
-  BTFSection = Ctx->getELFSection(".BTF", ELF::SHT_PROGBITS, 0);
-  BTFExtSection = Ctx->getELFSection(".BTF.ext", ELF::SHT_PROGBITS, 0);
 }
 
 void MCObjectFileInfo::initCOFFMCObjectFileInfo(const Triple &T) {
diff --git a/lib/MC/MCObjectStreamer.cpp b/lib/MC/MCObjectStreamer.cpp
index 4f74f4101c8..8c88db009bd 100644
--- a/lib/MC/MCObjectStreamer.cpp
+++ b/lib/MC/MCObjectStreamer.cpp
@@ -14,7 +14,6 @@
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCCodeView.h"
 #include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCBTFContext.h"
 #include "llvm/MC/MCDwarf.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCObjectWriter.h"
@@ -22,7 +21,6 @@
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/SourceMgr.h"
-#include "MCDwarf2BTF.h"
 using namespace llvm;
 
 MCObjectStreamer::MCObjectStreamer(MCContext &Context,
@@ -441,31 +439,6 @@ void MCObjectStreamer::EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel,
   insert(new MCDwarfCallFrameFragment(*AddrDelta));
 }
 
-void MCObjectStreamer::EmitBTFAdvanceLineAddr(const MCSymbol *Label,
-                                              unsigned Size) {
-  const MCExpr *Value = MCSymbolRefExpr::create(Label, getContext());
-  MCDataFragment *DF = getOrCreateDataFragment();
-
-  // Avoid fixups when possible.
-  int64_t AbsValue;
-  SMLoc Loc;
-
-  if (Value->evaluateAsAbsolute(AbsValue, getAssemblerPtr())) {
-    if (!isUIntN(8 * Size, AbsValue) && !isIntN(8 * Size, AbsValue)) {
-      getContext().reportError(
-          Loc, "value evaluated as " + Twine(AbsValue) + " is out of range.");
-      return;
-    }
-    EmitIntValue(AbsValue, Size);
-    return;
-  }
-
-  DF->getFixups().push_back(
-      MCFixup::create(DF->getContents().size(), Value,
-                      MCFixup::getKindForSize(Size, false), Loc));
-  DF->getContents().resize(DF->getContents().size() + Size, 0);
-}
-
 void MCObjectStreamer::EmitCVLocDirective(unsigned FunctionId, unsigned FileNo,
                                           unsigned Line, unsigned Column,
                                           bool PrologueEnd, bool IsStmt,
@@ -715,13 +688,6 @@ void MCObjectStreamer::FinishImpl() {
   // Dump out the dwarf file & directory tables and line tables.
   MCDwarfLineTable::Emit(this, getAssembler().getDWARFLinetableParams());
 
-  auto &BTFCtx = getContext().getBTFContext();
-  if (BTFCtx) {
-    MCDwarf2BTF::addDwarfLineInfo(this);
-    BTFCtx->emitAll(this);
-    BTFCtx.reset();
-  }
-
   flushPendingLabels();
   getAssembler().Finish();
 }
-- 
GitLab


From 5650e8ff9073049ea2ef27f881e301ebafb94c49 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Fri, 12 Oct 2018 21:59:55 +0000
Subject: [PATCH 0130/1116] [LegalizeVectorTypes] When widening the result of a
 bitcast from a scalar type, use a scalar_to_vector to turn the scalar into a
 vector intead of a build vector full of mostly undefs.

This is more consistent with what we usually do and matches some code X86 custom emits in some cases that I think I can cleanup.

The MIPS test change just looks to be an instruction ordering change.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344422 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../SelectionDAG/LegalizeVectorTypes.cpp      | 26 +++++++++----------
 test/CodeGen/Mips/cconv/vector.ll             | 24 ++++++++---------
 2 files changed, 24 insertions(+), 26 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 310f5ef5dc7..f4cad796863 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -3022,22 +3022,20 @@ SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) {
     }
 
     if (TLI.isTypeLegal(NewInVT)) {
-      // Because the result and the input are different vector types, widening
-      // the result could create a legal type but widening the input might make
-      // it an illegal type that might lead to repeatedly splitting the input
-      // and then widening it. To avoid this, we widen the input only if
-      // it results in a legal type.
-      SmallVector<SDValue, 16> Ops(NewNumElts);
-      SDValue UndefVal = DAG.getUNDEF(InVT);
-      Ops[0] = InOp;
-      for (unsigned i = 1; i < NewNumElts; ++i)
-        Ops[i] = UndefVal;
-
       SDValue NewVec;
-      if (InVT.isVector())
+      if (InVT.isVector()) {
+        // Because the result and the input are different vector types, widening
+        // the result could create a legal type but widening the input might make
+        // it an illegal type that might lead to repeatedly splitting the input
+        // and then widening it. To avoid this, we widen the input only if
+        // it results in a legal type.
+        SmallVector<SDValue, 16> Ops(NewNumElts, DAG.getUNDEF(InVT));
+        Ops[0] = InOp;
+
         NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewInVT, Ops);
-      else
-        NewVec = DAG.getBuildVector(NewInVT, dl, Ops);
+      } else {
+        NewVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewInVT, InOp);
+      }
       return DAG.getNode(ISD::BITCAST, dl, WidenVT, NewVec);
     }
   }
diff --git a/test/CodeGen/Mips/cconv/vector.ll b/test/CodeGen/Mips/cconv/vector.ll
index 29ffe23f712..d6e260786d1 100644
--- a/test/CodeGen/Mips/cconv/vector.ll
+++ b/test/CodeGen/Mips/cconv/vector.ll
@@ -2420,10 +2420,10 @@ define void @float_2(<2 x float> %a, <2 x float> %b) {
 ; MIPS64R5EB-NEXT:    lui $1, %hi(%neg(%gp_rel(float_2)))
 ; MIPS64R5EB-NEXT:    daddu $1, $1, $25
 ; MIPS64R5EB-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(float_2)))
-; MIPS64R5EB-NEXT:    sd $5, 0($sp)
-; MIPS64R5EB-NEXT:    sd $4, 16($sp)
-; MIPS64R5EB-NEXT:    ld.w $w0, 0($sp)
-; MIPS64R5EB-NEXT:    ld.w $w1, 16($sp)
+; MIPS64R5EB-NEXT:    sd $5, 16($sp)
+; MIPS64R5EB-NEXT:    sd $4, 0($sp)
+; MIPS64R5EB-NEXT:    ld.w $w0, 16($sp)
+; MIPS64R5EB-NEXT:    ld.w $w1, 0($sp)
 ; MIPS64R5EB-NEXT:    fadd.w $w0, $w1, $w0
 ; MIPS64R5EB-NEXT:    shf.w $w0, $w0, 177
 ; MIPS64R5EB-NEXT:    copy_s.d $2, $w0[0]
@@ -2463,10 +2463,10 @@ define void @float_2(<2 x float> %a, <2 x float> %b) {
 ; MIPS64R5EL-NEXT:    lui $1, %hi(%neg(%gp_rel(float_2)))
 ; MIPS64R5EL-NEXT:    daddu $1, $1, $25
 ; MIPS64R5EL-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(float_2)))
-; MIPS64R5EL-NEXT:    sd $5, 0($sp)
-; MIPS64R5EL-NEXT:    sd $4, 16($sp)
-; MIPS64R5EL-NEXT:    ld.w $w0, 0($sp)
-; MIPS64R5EL-NEXT:    ld.w $w1, 16($sp)
+; MIPS64R5EL-NEXT:    sd $5, 16($sp)
+; MIPS64R5EL-NEXT:    sd $4, 0($sp)
+; MIPS64R5EL-NEXT:    ld.w $w0, 16($sp)
+; MIPS64R5EL-NEXT:    ld.w $w1, 0($sp)
 ; MIPS64R5EL-NEXT:    fadd.w $w0, $w1, $w0
 ; MIPS64R5EL-NEXT:    copy_s.d $2, $w0[0]
 ; MIPS64R5EL-NEXT:    ld $1, %got_disp(float_res_v2f32)($1)
@@ -6211,14 +6211,14 @@ define float @mixed_i8(<2 x float> %a, i8 %b, <2 x float> %c) {
 ; MIPS64R5-NEXT:    andi $1, $1, 255
 ; MIPS64R5-NEXT:    sw $1, 36($sp)
 ; MIPS64R5-NEXT:    sw $1, 32($sp)
-; MIPS64R5-NEXT:    sd $4, 16($sp)
+; MIPS64R5-NEXT:    sd $4, 0($sp)
 ; MIPS64R5-NEXT:    ld.w $w0, 32($sp)
 ; MIPS64R5-NEXT:    ffint_s.w $w0, $w0
-; MIPS64R5-NEXT:    ld.w $w1, 16($sp)
-; MIPS64R5-NEXT:    fadd.w $w0, $w0, $w1
-; MIPS64R5-NEXT:    sd $6, 0($sp)
 ; MIPS64R5-NEXT:    ld.w $w1, 0($sp)
 ; MIPS64R5-NEXT:    fadd.w $w0, $w0, $w1
+; MIPS64R5-NEXT:    sd $6, 16($sp)
+; MIPS64R5-NEXT:    ld.w $w1, 16($sp)
+; MIPS64R5-NEXT:    fadd.w $w0, $w0, $w1
 ; MIPS64R5-NEXT:    splati.w $w1, $w0[1]
 ; MIPS64R5-NEXT:    add.s $f0, $f0, $f1
 ; MIPS64R5-NEXT:    daddiu $sp, $sp, 48
-- 
GitLab


From 18cda8141231bc1afa46d2ea805c319edf4589e6 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Fri, 12 Oct 2018 21:59:58 +0000
Subject: [PATCH 0131/1116] [X86] Skip (v2i32/v4i16/v8i8 (bitcast (f64)))
 handling in ReplaceNodeResults if the dest type can be widened by generic
 legalization. NFCI

The algorithm we would do previously was identical to generic legalization. If we ever switch to legalizing integer vectors via widening we'll be able to kill off the code since it now only runs for promotion.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344423 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 97731dff9b2..220e2e2fdc0 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -26297,7 +26297,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     }
 
     if (SrcVT != MVT::f64 ||
-        (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
+        (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8) ||
+        getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector)
       return;
 
     unsigned NumElts = DstVT.getVectorNumElements();
@@ -26307,13 +26308,6 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
                                    MVT::v2f64, N->getOperand(0));
     SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
 
-    if (getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector) {
-      // If we are legalizing vectors by widening, we already have the desired
-      // legal vector type, just return it.
-      Results.push_back(ToVecInt);
-      return;
-    }
-
     SmallVector<SDValue, 8> Elts;
     for (unsigned i = 0, e = NumElts; i != e; ++i)
       Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
-- 
GitLab


From e9abd40f5c6ad7f17936b45a8421b4b4995488aa Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Fri, 12 Oct 2018 22:00:00 +0000
Subject: [PATCH 0132/1116] [X86] Simplify the end of custom type legalization
 for (v2i32/v4i16/v8i8 (bitcast (f64))) by just emitting an EXTRACT_SUBVECTOR
 instead of a BUILD_VECTOR.

Generic legalization should be able to finish legalizing the EXTRACT_SUBVECTOR probably by turning it into a BUILD_VECTOR. But we should emit the simplest sequence.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344424 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 220e2e2fdc0..ffb5acf3386 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -26307,13 +26307,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                    MVT::v2f64, N->getOperand(0));
     SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
-
-    SmallVector<SDValue, 8> Elts;
-    for (unsigned i = 0, e = NumElts; i != e; ++i)
-      Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
-                                   ToVecInt, DAG.getIntPtrConstant(i, dl)));
-
-    Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
+    SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT,
+                                  ToVecInt, DAG.getIntPtrConstant(0, dl));
+    Results.push_back(Extract);
     return;
   }
   case ISD::MGATHER: {
-- 
GitLab


From bb098ae625465cc0b6341b22b9f1160dc9a7a110 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Fri, 12 Oct 2018 22:00:04 +0000
Subject: [PATCH 0133/1116] [X86] Improve type legalization of
 (v2i32/v4i16/v8i16 (bitcast (v2f32))) to avoid a stack stack temporary.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344425 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp      | 20 +++++++++++++-------
 test/CodeGen/X86/2012-01-18-vbitcast.ll |  9 +--------
 2 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index ffb5acf3386..86141965393 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -26296,7 +26296,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
       return;
     }
 
-    if (SrcVT != MVT::f64 ||
+    if ((SrcVT != MVT::f64 && SrcVT != MVT::v2f32) ||
         (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8) ||
         getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector)
       return;
@@ -26304,12 +26304,18 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     unsigned NumElts = DstVT.getVectorNumElements();
     EVT SVT = DstVT.getVectorElementType();
     EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
-    SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
-                                   MVT::v2f64, N->getOperand(0));
-    SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
-    SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT,
-                                  ToVecInt, DAG.getIntPtrConstant(0, dl));
-    Results.push_back(Extract);
+    SDValue Res;
+    if (SrcVT == MVT::f64)
+      Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+                             MVT::v2f64, N->getOperand(0));
+    else
+      Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, N->getOperand(0),
+                        DAG.getUNDEF(MVT::v2f32));
+
+    Res = DAG.getBitcast(WiderVT, Res);
+    Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, Res,
+                      DAG.getIntPtrConstant(0, dl));
+    Results.push_back(Res);
     return;
   }
   case ISD::MGATHER: {
diff --git a/test/CodeGen/X86/2012-01-18-vbitcast.ll b/test/CodeGen/X86/2012-01-18-vbitcast.ll
index 61c25021bba..ab57b61770d 100644
--- a/test/CodeGen/X86/2012-01-18-vbitcast.ll
+++ b/test/CodeGen/X86/2012-01-18-vbitcast.ll
@@ -4,17 +4,10 @@
 define <2 x i32> @vcast(<2 x float> %a, <2 x float> %b) {
 ; CHECK-LABEL: vcast:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    subq $16, %rsp
-; CHECK-NEXT:    .seh_stackalloc 16
-; CHECK-NEXT:    .seh_endprologue
-; CHECK-NEXT:    pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
+; CHECK-NEXT:    pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    psubq %xmm1, %xmm0
-; CHECK-NEXT:    addq $16, %rsp
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:    .seh_handlerdata
-; CHECK-NEXT:    .text
-; CHECK-NEXT:    .seh_endproc
   %af = bitcast <2 x float> %a to <2 x i32>
   %bf = bitcast <2 x float> %b to <2 x i32>
   %x = sub <2 x i32> %af, %bf
-- 
GitLab


From cb064c84c5432b8193815b2ad04939c81f821c18 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Fri, 12 Oct 2018 22:55:17 +0000
Subject: [PATCH 0134/1116] [LegalizeVectorTypes] Use TLI.getVectorIdxTy
 instead of DAG.getIntPtrConstant.

There's no guarantee that vector indices should use pointer types. So use the correct query method.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344428 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index f4cad796863..a08a41ccaf2 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -3749,8 +3749,9 @@ SDValue DAGTypeLegalizer::WidenVecOp_Convert(SDNode *N) {
                                 InVT.getVectorNumElements());
   if (TLI.isTypeLegal(WideVT)) {
     SDValue Res = DAG.getNode(Opcode, dl, WideVT, InOp);
-    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
-                       DAG.getIntPtrConstant(0, dl));
+    return DAG.getNode(
+        ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
+        DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
   }
 
   EVT InEltVT = InVT.getVectorElementType();
-- 
GitLab


From 8aea7592db278dee40dee36e9a4fdcf58a544223 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Fri, 12 Oct 2018 22:57:57 +0000
Subject: [PATCH 0135/1116] [llvm-readobj] Fix an error message about
 .llvm.call-graph-profile

.note.llvm.cgprofile was an obvious typo in rL333823

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344430 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-readobj/ELFDumper.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/llvm-readobj/ELFDumper.cpp b/tools/llvm-readobj/ELFDumper.cpp
index 6f71d2d8b6b..5e7eae1b272 100644
--- a/tools/llvm-readobj/ELFDumper.cpp
+++ b/tools/llvm-readobj/ELFDumper.cpp
@@ -1423,7 +1423,7 @@ ELFDumper<ELFT>::ELFDumper(const ELFFile<ELFT> *Obj, ScopedPrinter &Writer)
       break;
     case ELF::SHT_LLVM_CALL_GRAPH_PROFILE:
       if (DotCGProfileSec != nullptr)
-        reportError("Multiple .note.llvm.cgprofile");
+        reportError("Multiple .llvm.call-graph-profile");
       DotCGProfileSec = &Sec;
       break;
     case ELF::SHT_LLVM_ADDRSIG:
-- 
GitLab


From 56ebea371fc81717d6269a8c35e0e9a553195f18 Mon Sep 17 00:00:00 2001
From: Alex Bradbury <asb@lowrisc.org>
Date: Fri, 12 Oct 2018 23:18:52 +0000
Subject: [PATCH 0136/1116] [RISCV] Eliminate unnecessary masking of promoted
 shift amounts

SelectionDAGBuilder::visitShift will always zero-extend a shift amount when it
is promoted to the ShiftAmountTy. This results in zero-extension (masking)
which is unnecessary for RISC-V as the shift operations only read the lower 5
or 6 bits (RV32 or RV64).

I initially proposed adding a getExtendForShiftAmount hook so the shift amount
can be any-extended (D52975). @efriedma explained this was unsafe, so I have
instead eliminate the unnecessary and operations at instruction selection time
in a manner similar to X86InstrCompiler.td.

Differential Revision: https://reviews.llvm.org/D53224


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344432 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/RISCV/RISCVInstrInfo.td       | 23 +++++++-
 test/CodeGen/RISCV/alu16.ll              |  9 ---
 test/CodeGen/RISCV/alu8.ll               |  5 --
 test/CodeGen/RISCV/shift-masked-shamt.ll | 70 ++++++++++++++++++++++++
 4 files changed, 90 insertions(+), 17 deletions(-)
 create mode 100644 test/CodeGen/RISCV/shift-masked-shamt.ll

diff --git a/lib/Target/RISCV/RISCVInstrInfo.td b/lib/Target/RISCV/RISCVInstrInfo.td
index 5ca1cbd165d..50012569a74 100644
--- a/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/lib/Target/RISCV/RISCVInstrInfo.td
@@ -205,6 +205,12 @@ def ixlenimm : Operand<XLenVT> {
 // Standalone (codegen-only) immleaf patterns.
 def simm32     : ImmLeaf<XLenVT, [{return isInt<32>(Imm);}]>;
 def simm32hi20 : ImmLeaf<XLenVT, [{return isShiftedInt<20, 12>(Imm);}]>;
+// A mask value that won't affect significant shift bits.
+def immshiftxlen : ImmLeaf<XLenVT, [{
+  if (Subtarget->is64Bit())
+    return countTrailingOnes<uint64_t>(Imm) >= 6;
+  return countTrailingOnes<uint64_t>(Imm) >= 5;
+}]>;
 
 // Addressing modes.
 // Necessary because a frameindex can't be matched directly in a pattern.
@@ -646,13 +652,24 @@ def : PatGprGpr<and, AND>;
 def : PatGprSimm12<and, ANDI>;
 def : PatGprGpr<xor, XOR>;
 def : PatGprSimm12<xor, XORI>;
-def : PatGprGpr<shl, SLL>;
 def : PatGprUimmLog2XLen<shl, SLLI>;
-def : PatGprGpr<srl, SRL>;
 def : PatGprUimmLog2XLen<srl, SRLI>;
-def : PatGprGpr<sra, SRA>;
 def : PatGprUimmLog2XLen<sra, SRAI>;
 
+// Match both a plain shift and one where the shift amount is masked (this is
+// typically introduced when the legalizer promotes the shift amount and
+// zero-extends it). For RISC-V, the mask is unnecessary as shifts in the base
+// ISA only read the least significant 5 bits (RV32I) or 6 bits (RV64I).
+multiclass VarShiftXLenPat<PatFrag ShiftOp, RVInst Inst> {
+  def : Pat<(ShiftOp GPR:$rs1, GPR:$rs2), (Inst GPR:$rs1, GPR:$rs2)>;
+  def : Pat<(ShiftOp GPR:$rs1, (and GPR:$rs2, immshiftxlen)),
+            (Inst GPR:$rs1, GPR:$rs2)>;
+}
+
+defm : VarShiftXLenPat<shl, SLL>;
+defm : VarShiftXLenPat<srl, SRL>;
+defm : VarShiftXLenPat<sra, SRA>;
+
 /// FrameIndex calculations
 
 def : Pat<(add (i32 AddrFI:$Rs), simm12:$imm12),
diff --git a/test/CodeGen/RISCV/alu16.ll b/test/CodeGen/RISCV/alu16.ll
index 20b79a987f6..79e74ffc8a5 100644
--- a/test/CodeGen/RISCV/alu16.ll
+++ b/test/CodeGen/RISCV/alu16.ll
@@ -6,8 +6,6 @@
 ; that legalisation of these non-native types doesn't introduce unnecessary
 ; inefficiencies.
 
-; TODO: it's unnecessary to mask (zero-extend) the shift amount.
-
 define i16 @addi(i16 %a) nounwind {
 ; RV32I-LABEL: addi:
 ; RV32I:       # %bb.0:
@@ -122,9 +120,6 @@ define i16 @sub(i16 %a, i16 %b) nounwind {
 define i16 @sll(i16 %a, i16 %b) nounwind {
 ; RV32I-LABEL: sll:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lui a2, 16
-; RV32I-NEXT:    addi a2, a2, -1
-; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    sll a0, a0, a1
 ; RV32I-NEXT:    ret
   %1 = shl i16 %a, %b
@@ -173,7 +168,6 @@ define i16 @srl(i16 %a, i16 %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a2, 16
 ; RV32I-NEXT:    addi a2, a2, -1
-; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    srl a0, a0, a1
 ; RV32I-NEXT:    ret
@@ -184,9 +178,6 @@ define i16 @srl(i16 %a, i16 %b) nounwind {
 define i16 @sra(i16 %a, i16 %b) nounwind {
 ; RV32I-LABEL: sra:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lui a2, 16
-; RV32I-NEXT:    addi a2, a2, -1
-; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    slli a0, a0, 16
 ; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    sra a0, a0, a1
diff --git a/test/CodeGen/RISCV/alu8.ll b/test/CodeGen/RISCV/alu8.ll
index f7d0e8beef3..ad97e620319 100644
--- a/test/CodeGen/RISCV/alu8.ll
+++ b/test/CodeGen/RISCV/alu8.ll
@@ -6,8 +6,6 @@
 ; that legalisation of these non-native types doesn't introduce unnecessary
 ; inefficiencies.
 
-; TODO: it's unnecessary to mask (zero-extend) the shift amount.
-
 define i8 @addi(i8 %a) nounwind {
 ; RV32I-LABEL: addi:
 ; RV32I:       # %bb.0:
@@ -118,7 +116,6 @@ define i8 @sub(i8 %a, i8 %b) nounwind {
 define i8 @sll(i8 %a, i8 %b) nounwind {
 ; RV32I-LABEL: sll:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    andi a1, a1, 255
 ; RV32I-NEXT:    sll a0, a0, a1
 ; RV32I-NEXT:    ret
   %1 = shl i8 %a, %b
@@ -163,7 +160,6 @@ define i8 @xor(i8 %a, i8 %b) nounwind {
 define i8 @srl(i8 %a, i8 %b) nounwind {
 ; RV32I-LABEL: srl:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    andi a1, a1, 255
 ; RV32I-NEXT:    andi a0, a0, 255
 ; RV32I-NEXT:    srl a0, a0, a1
 ; RV32I-NEXT:    ret
@@ -174,7 +170,6 @@ define i8 @srl(i8 %a, i8 %b) nounwind {
 define i8 @sra(i8 %a, i8 %b) nounwind {
 ; RV32I-LABEL: sra:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    andi a1, a1, 255
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    sra a0, a0, a1
diff --git a/test/CodeGen/RISCV/shift-masked-shamt.ll b/test/CodeGen/RISCV/shift-masked-shamt.ll
new file mode 100644
index 00000000000..5c77aa2d77f
--- /dev/null
+++ b/test/CodeGen/RISCV/shift-masked-shamt.ll
@@ -0,0 +1,70 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s -check-prefix=RV32I
+
+; This test checks that unnecessary masking of shift amount operands is
+; eliminated during instruction selection. The test needs to ensure that the
+; masking is not removed if it may affect the shift amount.
+
+define i32 @sll_redundant_mask(i32 %a, i32 %b) nounwind {
+; RV32I-LABEL: sll_redundant_mask:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    sll a0, a0, a1
+; RV32I-NEXT:    ret
+  %1 = and i32 %b, 31
+  %2 = shl i32 %a, %1
+  ret i32 %2
+}
+
+define i32 @sll_non_redundant_mask(i32 %a, i32 %b) nounwind {
+; RV32I-LABEL: sll_non_redundant_mask:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    andi a1, a1, 15
+; RV32I-NEXT:    sll a0, a0, a1
+; RV32I-NEXT:    ret
+  %1 = and i32 %b, 15
+  %2 = shl i32 %a, %1
+  ret i32 %2
+}
+
+define i32 @srl_redundant_mask(i32 %a, i32 %b) nounwind {
+; RV32I-LABEL: srl_redundant_mask:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    srl a0, a0, a1
+; RV32I-NEXT:    ret
+  %1 = and i32 %b, 4095
+  %2 = lshr i32 %a, %1
+  ret i32 %2
+}
+
+define i32 @srl_non_redundant_mask(i32 %a, i32 %b) nounwind {
+; RV32I-LABEL: srl_non_redundant_mask:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    andi a1, a1, 7
+; RV32I-NEXT:    srl a0, a0, a1
+; RV32I-NEXT:    ret
+  %1 = and i32 %b, 7
+  %2 = lshr i32 %a, %1
+  ret i32 %2
+}
+
+define i32 @sra_redundant_mask(i32 %a, i32 %b) nounwind {
+; RV32I-LABEL: sra_redundant_mask:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    sra a0, a0, a1
+; RV32I-NEXT:    ret
+  %1 = and i32 %b, 65535
+  %2 = ashr i32 %a, %1
+  ret i32 %2
+}
+
+define i32 @sra_non_redundant_mask(i32 %a, i32 %b) nounwind {
+; RV32I-LABEL: sra_non_redundant_mask:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    andi a1, a1, 32
+; RV32I-NEXT:    sra a0, a0, a1
+; RV32I-NEXT:    ret
+  %1 = and i32 %b, 32
+  %2 = ashr i32 %a, %1
+  ret i32 %2
+}
-- 
GitLab


From d069d45aa888c5b8a44521d4cade32767c09b35e Mon Sep 17 00:00:00 2001
From: Kostya Serebryany <kcc@google.com>
Date: Fri, 12 Oct 2018 23:21:48 +0000
Subject: [PATCH 0137/1116] move GetOrCreateFunctionComdat to
 Instrumentation.cpp/Instrumentation.h

Summary:
GetOrCreateFunctionComdat is currently used in SanitizerCoverage,
where it's defined. I'm planing to use it in HWASAN as well,
so moving it into a common location.
NFC

Reviewers: morehouse

Reviewed By: morehouse

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D53218

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344433 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Transforms/Instrumentation.h     |  6 +++++
 .../Instrumentation/Instrumentation.cpp       | 15 +++++++++++++
 .../Instrumentation/SanitizerCoverage.cpp     | 22 ++++---------------
 3 files changed, 25 insertions(+), 18 deletions(-)

diff --git a/include/llvm/Transforms/Instrumentation.h b/include/llvm/Transforms/Instrumentation.h
index 2157fcab726..0011a5b3c51 100644
--- a/include/llvm/Transforms/Instrumentation.h
+++ b/include/llvm/Transforms/Instrumentation.h
@@ -27,6 +27,7 @@ namespace llvm {
 class FunctionPass;
 class ModulePass;
 class OptimizationRemarkEmitter;
+class Comdat;
 
 /// Instrumentation passes often insert conditional checks into entry blocks.
 /// Call this function before splitting the entry block to move instructions
@@ -41,6 +42,11 @@ GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str,
                                              bool AllowMerging,
                                              const char *NamePrefix = "");
 
+// Returns F.getComdat() if it exists.
+// Otherwise creates a new comdat, sets F's comdat, and returns it.
+// Returns nullptr on failure.
+Comdat *GetOrCreateFunctionComdat(Function &F, const std::string &ModuleId);
+
 // Insert GCOV profiling instrumentation
 struct GCOVOptions {
   static GCOVOptions getDefault();
diff --git a/lib/Transforms/Instrumentation/Instrumentation.cpp b/lib/Transforms/Instrumentation/Instrumentation.cpp
index 1c739c09e39..55b449ffca1 100644
--- a/lib/Transforms/Instrumentation/Instrumentation.cpp
+++ b/lib/Transforms/Instrumentation/Instrumentation.cpp
@@ -70,6 +70,21 @@ GlobalVariable *llvm::createPrivateGlobalForString(Module &M, StringRef Str,
   return GV;
 }
 
+Comdat *llvm::GetOrCreateFunctionComdat(Function &F,
+                                        const std::string &ModuleId) {
+  if (auto Comdat = F.getComdat()) return Comdat;
+  assert(F.hasName());
+  Module *M = F.getParent();
+  std::string Name = F.getName();
+  if (F.hasLocalLinkage()) {
+    if (ModuleId.empty())
+      return nullptr;
+    Name += ModuleId;
+  }
+  F.setComdat(M->getOrInsertComdat(Name));
+  return F.getComdat();
+}
+
 /// initializeInstrumentation - Initialize all passes in the TransformUtils
 /// library.
 void llvm::initializeInstrumentation(PassRegistry &Registry) {
diff --git a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
index 0bed4139518..b3450728f04 100644
--- a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
+++ b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -220,8 +220,6 @@ private:
                    MDNode::get(*C, None));
   }
 
-  Comdat *GetOrCreateFunctionComdat(Function &F);
-
   std::string getSectionName(const std::string &Section) const;
   std::string getSectionStart(const std::string &Section) const;
   std::string getSectionEnd(const std::string &Section) const;
@@ -590,28 +588,16 @@ bool SanitizerCoverageModule::runOnFunction(Function &F) {
   return true;
 }
 
-Comdat *SanitizerCoverageModule::GetOrCreateFunctionComdat(Function &F) {
-  if (auto Comdat = F.getComdat()) return Comdat;
-  if (!TargetTriple.isOSBinFormatELF()) return nullptr;
-  assert(F.hasName());
-  std::string Name = F.getName();
-  if (F.hasLocalLinkage()) {
-    if (CurModuleUniqueId.empty()) return nullptr;
-    Name += CurModuleUniqueId;
-  }
-  auto Comdat = CurModule->getOrInsertComdat(Name);
-  F.setComdat(Comdat);
-  return Comdat;
-}
-
 GlobalVariable *SanitizerCoverageModule::CreateFunctionLocalArrayInSection(
     size_t NumElements, Function &F, Type *Ty, const char *Section) {
   ArrayType *ArrayTy = ArrayType::get(Ty, NumElements);
   auto Array = new GlobalVariable(
       *CurModule, ArrayTy, false, GlobalVariable::PrivateLinkage,
       Constant::getNullValue(ArrayTy), "__sancov_gen_");
-  if (auto Comdat = GetOrCreateFunctionComdat(F))
-    Array->setComdat(Comdat);
+
+  if (TargetTriple.isOSBinFormatELF())
+    if (auto Comdat = GetOrCreateFunctionComdat(F, CurModuleUniqueId))
+      Array->setComdat(Comdat);
   Array->setSection(getSectionName(Section));
   Array->setAlignment(Ty->isPointerTy() ? DL->getPointerSize()
                                         : Ty->getPrimitiveSizeInBits() / 8);
-- 
GitLab


From 93c7b61d509c53ed8dc790934eb9ca6bca64e57a Mon Sep 17 00:00:00 2001
From: Thomas Lively <tlively@google.com>
Date: Sat, 13 Oct 2018 07:09:10 +0000
Subject: [PATCH 0138/1116] [WebAssembly][NFC] Unify ARGUMENT classes

Reviewers: aheejin, dschuff

Subscribers: sbc100, jgravelle-google, sunfish, llvm-commits

Differential Revision: https://reviews.llvm.org/D53172

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344436 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../WebAssembly/WebAssemblyAsmPrinter.cpp     | 16 +++++++-------
 .../WebAssembly/WebAssemblyFastISel.cpp       | 10 ++++-----
 .../WebAssembly/WebAssemblyInstrInfo.td       | 18 ++++++++--------
 .../WebAssembly/WebAssemblyInstrSIMD.td       | 21 ++++++-------------
 .../WebAssembly/WebAssemblyUtilities.cpp      | 16 +++++++-------
 5 files changed, 36 insertions(+), 45 deletions(-)

diff --git a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
index 2ea3760b923..b8ac85943eb 100644
--- a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -176,14 +176,14 @@ void WebAssemblyAsmPrinter::EmitInstruction(const MachineInstr *MI) {
   LLVM_DEBUG(dbgs() << "EmitInstruction: " << *MI << '\n');
 
   switch (MI->getOpcode()) {
-  case WebAssembly::ARGUMENT_I32:
-  case WebAssembly::ARGUMENT_I32_S:
-  case WebAssembly::ARGUMENT_I64:
-  case WebAssembly::ARGUMENT_I64_S:
-  case WebAssembly::ARGUMENT_F32:
-  case WebAssembly::ARGUMENT_F32_S:
-  case WebAssembly::ARGUMENT_F64:
-  case WebAssembly::ARGUMENT_F64_S:
+  case WebAssembly::ARGUMENT_i32:
+  case WebAssembly::ARGUMENT_i32_S:
+  case WebAssembly::ARGUMENT_i64:
+  case WebAssembly::ARGUMENT_i64_S:
+  case WebAssembly::ARGUMENT_f32:
+  case WebAssembly::ARGUMENT_f32_S:
+  case WebAssembly::ARGUMENT_f64:
+  case WebAssembly::ARGUMENT_f64_S:
   case WebAssembly::ARGUMENT_v16i8:
   case WebAssembly::ARGUMENT_v16i8_S:
   case WebAssembly::ARGUMENT_v8i16:
diff --git a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
index 8dc535445d6..0be4f228347 100644
--- a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
@@ -646,19 +646,19 @@ bool WebAssemblyFastISel::fastLowerArguments() {
     case MVT::i8:
     case MVT::i16:
     case MVT::i32:
-      Opc = WebAssembly::ARGUMENT_I32;
+      Opc = WebAssembly::ARGUMENT_i32;
       RC = &WebAssembly::I32RegClass;
       break;
     case MVT::i64:
-      Opc = WebAssembly::ARGUMENT_I64;
+      Opc = WebAssembly::ARGUMENT_i64;
       RC = &WebAssembly::I64RegClass;
       break;
     case MVT::f32:
-      Opc = WebAssembly::ARGUMENT_F32;
+      Opc = WebAssembly::ARGUMENT_f32;
       RC = &WebAssembly::F32RegClass;
       break;
     case MVT::f64:
-      Opc = WebAssembly::ARGUMENT_F64;
+      Opc = WebAssembly::ARGUMENT_f64;
       RC = &WebAssembly::F64RegClass;
       break;
     case MVT::v16i8:
@@ -686,7 +686,7 @@ bool WebAssemblyFastISel::fastLowerArguments() {
       RC = &WebAssembly::V128RegClass;
       break;
     case MVT::ExceptRef:
-      Opc = WebAssembly::ARGUMENT_EXCEPT_REF;
+      Opc = WebAssembly::ARGUMENT_ExceptRef;
       RC = &WebAssembly::EXCEPT_REFRegClass;
       break;
     default:
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
index 9e1409cf90e..8d98510c67d 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -163,18 +163,18 @@ include "WebAssemblyInstrFormats.td"
 // Additional instructions.
 //===----------------------------------------------------------------------===//
 
-multiclass ARGUMENT<WebAssemblyRegClass vt> {
+multiclass ARGUMENT<WebAssemblyRegClass reg, ValueType vt> {
   let hasSideEffects = 1, isCodeGenOnly = 1,
       Defs = []<Register>, Uses = [ARGUMENTS] in
-  defm ARGUMENT_#vt : I<(outs vt:$res), (ins i32imm:$argno),
-                        (outs), (ins i32imm:$argno),
-                        [(set vt:$res, (WebAssemblyargument timm:$argno))]>;
+  defm ARGUMENT_#vt :
+    I<(outs reg:$res), (ins i32imm:$argno), (outs), (ins i32imm:$argno),
+      [(set (vt reg:$res), (WebAssemblyargument timm:$argno))]>;
 }
-defm "": ARGUMENT<I32>;
-defm "": ARGUMENT<I64>;
-defm "": ARGUMENT<F32>;
-defm "": ARGUMENT<F64>;
-defm "": ARGUMENT<EXCEPT_REF>;
+defm "": ARGUMENT<I32, i32>;
+defm "": ARGUMENT<I64, i64>;
+defm "": ARGUMENT<F32, f32>;
+defm "": ARGUMENT<F64, f64>;
+defm "": ARGUMENT<EXCEPT_REF, ExceptRef>;
 
 // get_local and set_local are not generated by instruction selection; they
 // are implied by virtual register uses and defs.
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index b575a039ae0..1eb38588c81 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -21,21 +21,12 @@ multiclass SIMD_I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
             Requires<[HasSIMD128]>;
 }
 
-multiclass SIMD_ARGUMENT<ValueType vt> {
-  let hasSideEffects = 1, isCodeGenOnly = 1,
-      Defs = []<Register>, Uses = [ARGUMENTS] in
-  defm ARGUMENT_#vt : SIMD_I<(outs V128:$res), (ins i32imm:$argno),
-                             (outs), (ins i32imm:$argno),
-                             [(set (vt V128:$res),
-                                  (WebAssemblyargument timm:$argno))]>;
-}
-
-defm "": SIMD_ARGUMENT<v16i8>;
-defm "": SIMD_ARGUMENT<v8i16>;
-defm "": SIMD_ARGUMENT<v4i32>;
-defm "": SIMD_ARGUMENT<v2i64>;
-defm "": SIMD_ARGUMENT<v4f32>;
-defm "": SIMD_ARGUMENT<v2f64>;
+defm "" : ARGUMENT<V128, v16i8>;
+defm "" : ARGUMENT<V128, v8i16>;
+defm "" : ARGUMENT<V128, v4i32>;
+defm "" : ARGUMENT<V128, v2i64>;
+defm "" : ARGUMENT<V128, v4f32>;
+defm "" : ARGUMENT<V128, v2f64>;
 
 // Constrained immediate argument types
 foreach SIZE = [8, 16] in
diff --git a/lib/Target/WebAssembly/WebAssemblyUtilities.cpp b/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
index a25ec7cf4c2..ada6fb9a96d 100644
--- a/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
@@ -27,14 +27,14 @@ const char *const WebAssembly::PersonalityWrapperFn =
 
 bool WebAssembly::isArgument(const MachineInstr &MI) {
   switch (MI.getOpcode()) {
-  case WebAssembly::ARGUMENT_I32:
-  case WebAssembly::ARGUMENT_I32_S:
-  case WebAssembly::ARGUMENT_I64:
-  case WebAssembly::ARGUMENT_I64_S:
-  case WebAssembly::ARGUMENT_F32:
-  case WebAssembly::ARGUMENT_F32_S:
-  case WebAssembly::ARGUMENT_F64:
-  case WebAssembly::ARGUMENT_F64_S:
+  case WebAssembly::ARGUMENT_i32:
+  case WebAssembly::ARGUMENT_i32_S:
+  case WebAssembly::ARGUMENT_i64:
+  case WebAssembly::ARGUMENT_i64_S:
+  case WebAssembly::ARGUMENT_f32:
+  case WebAssembly::ARGUMENT_f32_S:
+  case WebAssembly::ARGUMENT_f64:
+  case WebAssembly::ARGUMENT_f64_S:
   case WebAssembly::ARGUMENT_v16i8:
   case WebAssembly::ARGUMENT_v16i8_S:
   case WebAssembly::ARGUMENT_v8i16:
-- 
GitLab


From 6e3463c0eb418dd2265a827d807814ab6ac53554 Mon Sep 17 00:00:00 2001
From: Thomas Lively <tlively@google.com>
Date: Sat, 13 Oct 2018 07:21:44 +0000
Subject: [PATCH 0139/1116] [Intrinsic] Add llvm.minimum and llvm.maximum
 instrinsic functions

Summary:
These new intrinsics have the semantics of the `minimum` and `maximum`
operations specified by the latest draft of IEEE 754-2018. Unlike
llvm.minnum and llvm.maxnum, these new intrinsics propagate NaNs and
always treat -0.0 as less than 0.0. `minimum` and `maximum` lower
directly to the existing `fminnan` and `fmaxnan` ISel DAG nodes. It is
safe to reuse these DAG nodes because before this patch were only
emitted in situations where there were known to be no NaN arguments or
where NaN propagation was correct and there were known to be no zero
arguments. I know of only four backends that lower fminnan and
fmaxnan: WebAssembly, ARM, AArch64, and SystemZ, and each of these
lowers fminnan and fmaxnan to instructions that are compatible with
the IEEE 754-2018 semantics.

Reviewers: aheejin, dschuff, sunfish, javed.absar

Subscribers: kristof.beyls, dexonsmith, kristina, llvm-commits

Differential Revision: https://reviews.llvm.org/D52764

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344437 91177308-0d34-0410-b5e6-96231b3b80d8
---
 docs/LangRef.rst                              | 76 +++++++++++++++++++
 include/llvm/ADT/APFloat.h                    | 26 +++++++
 include/llvm/CodeGen/ISDOpcodes.h             |  5 +-
 include/llvm/IR/IRBuilder.h                   | 10 +++
 include/llvm/IR/Intrinsics.td                 |  8 ++
 .../SelectionDAG/SelectionDAGBuilder.cpp      | 12 +++
 unittests/ADT/APFloatTest.cpp                 | 30 ++++++++
 unittests/IR/IRBuilderTest.cpp                |  8 ++
 8 files changed, 173 insertions(+), 2 deletions(-)

diff --git a/docs/LangRef.rst b/docs/LangRef.rst
index 9fcfd29a6e8..e977657d1cb 100644
--- a/docs/LangRef.rst
+++ b/docs/LangRef.rst
@@ -11560,6 +11560,82 @@ NaN, the intrinsic lowering is responsible for quieting the inputs to
 correctly return the non-NaN input (e.g. by using the equivalent of
 ``llvm.canonicalize``).
 
+'``llvm.minimum.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.minimum`` on any
+floating-point or vector of floating-point type. Not all targets support
+all types however.
+
+::
+
+      declare float     @llvm.minimum.f32(float %Val0, float %Val1)
+      declare double    @llvm.minimum.f64(double %Val0, double %Val1)
+      declare x86_fp80  @llvm.minimum.f80(x86_fp80 %Val0, x86_fp80 %Val1)
+      declare fp128     @llvm.minimum.f128(fp128 %Val0, fp128 %Val1)
+      declare ppc_fp128 @llvm.minimum.ppcf128(ppc_fp128 %Val0, ppc_fp128 %Val1)
+
+Overview:
+"""""""""
+
+The '``llvm.minimum.*``' intrinsics return the minimum of the two
+arguments, propagating NaNs and treating -0.0 as less than +0.0.
+
+
+Arguments:
+""""""""""
+
+The arguments and return value are floating-point numbers of the same
+type.
+
+Semantics:
+""""""""""
+If either operand is a NaN, returns NaN. Otherwise returns the lesser
+of the two arguments. -0.0 is considered to be less than +0.0 for this
+intrinsic. Note that these are the semantics specified in the draft of
+IEEE 754-2018.
+
+'``llvm.maximum.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.maximum`` on any
+floating-point or vector of floating-point type. Not all targets support
+all types however.
+
+::
+
+      declare float     @llvm.maximum.f32(float %Val0, float %Val1)
+      declare double    @llvm.maximum.f64(double %Val0, double %Val1)
+      declare x86_fp80  @llvm.maximum.f80(x86_fp80 %Val0, x86_fp80 %Val1)
+      declare fp128     @llvm.maximum.f128(fp128 %Val0, fp128 %Val1)
+      declare ppc_fp128 @llvm.maximum.ppcf128(ppc_fp128 %Val0, ppc_fp128 %Val1)
+
+Overview:
+"""""""""
+
+The '``llvm.maximum.*``' intrinsics return the maximum of the two
+arguments, propagating NaNs and treating -0.0 as less than +0.0.
+
+
+Arguments:
+""""""""""
+
+The arguments and return value are floating-point numbers of the same
+type.
+
+Semantics:
+""""""""""
+If either operand is a NaN, returns NaN. Otherwise returns the greater
+of the two arguments. -0.0 is considered to be less than +0.0 for this
+intrinsic. Note that these are the semantics specified in the draft of
+IEEE 754-2018.
+
 '``llvm.copysign.*``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/include/llvm/ADT/APFloat.h b/include/llvm/ADT/APFloat.h
index 5c59af4c04b..52ed183c78a 100644
--- a/include/llvm/ADT/APFloat.h
+++ b/include/llvm/ADT/APFloat.h
@@ -1243,6 +1243,32 @@ inline APFloat maxnum(const APFloat &A, const APFloat &B) {
   return (A.compare(B) == APFloat::cmpLessThan) ? B : A;
 }
 
+/// Implements IEEE 754-2018 minimum semantics. Returns the smaller of 2
+/// arguments, propagating NaNs and treating -0 as less than +0.
+LLVM_READONLY
+inline APFloat minimum(const APFloat &A, const APFloat &B) {
+  if (A.isNaN())
+    return A;
+  if (B.isNaN())
+    return B;
+  if (A.isZero() && B.isZero() && (A.isNegative() != B.isNegative()))
+    return A.isNegative() ? A : B;
+  return (B.compare(A) == APFloat::cmpLessThan) ? B : A;
+}
+
+/// Implements IEEE 754-2018 maximum semantics. Returns the larger of 2
+/// arguments, propagating NaNs and treating -0 as less than +0.
+LLVM_READONLY
+inline APFloat maximum(const APFloat &A, const APFloat &B) {
+  if (A.isNaN())
+    return A;
+  if (B.isNaN())
+    return B;
+  if (A.isZero() && B.isZero() && (A.isNegative() != B.isNegative()))
+    return A.isNegative() ? B : A;
+  return (A.compare(B) == APFloat::cmpLessThan) ? B : A;
+}
+
 } // namespace llvm
 
 #undef APFLOAT_DISPATCH_ON_SEMANTICS
diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h
index ec9c46140d7..d9a513fe247 100644
--- a/include/llvm/CodeGen/ISDOpcodes.h
+++ b/include/llvm/CodeGen/ISDOpcodes.h
@@ -560,8 +560,9 @@ namespace ISD {
     ///
     /// The return value of (FMINNUM 0.0, -0.0) could be either 0.0 or -0.0.
     FMINNUM, FMAXNUM,
-    /// FMINNAN/FMAXNAN - Behave identically to FMINNUM/FMAXNUM, except that
-    /// when a single input is NaN, NaN is returned.
+    /// FMINNAN/FMAXNAN - NaN-propagating minimum/maximum that also treat -0.0
+    /// as less than 0.0. While FMINNUM/FMAXNUM follow IEEE 754-2008 semantics,
+    /// FMINNAN/FMAXNAN follow IEEE 754-2018 draft semantics.
     FMINNAN, FMAXNAN,
 
     /// FSINCOS - Compute both fsin and fcos as a single operation.
diff --git a/include/llvm/IR/IRBuilder.h b/include/llvm/IR/IRBuilder.h
index 0af53c5b3f4..e89c44380d0 100644
--- a/include/llvm/IR/IRBuilder.h
+++ b/include/llvm/IR/IRBuilder.h
@@ -705,6 +705,16 @@ public:
     return CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS, nullptr, Name);
   }
 
+  /// Create call to the minimum intrinsic.
+  CallInst *CreateMinimum(Value *LHS, Value *RHS, const Twine &Name = "") {
+    return CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS, nullptr, Name);
+  }
+
+  /// Create call to the maximum intrinsic.
+  CallInst *CreateMaximum(Value *LHS, Value *RHS, const Twine &Name = "") {
+    return CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS, nullptr, Name);
+  }
+
 private:
   /// Create a call to a masked intrinsic with given Id.
   CallInst *CreateMaskedIntrinsic(Intrinsic::ID Id, ArrayRef<Value *> Ops,
diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td
index b405e86ef40..410e35f9acb 100644
--- a/include/llvm/IR/Intrinsics.td
+++ b/include/llvm/IR/Intrinsics.td
@@ -453,6 +453,14 @@ def int_maxnum : Intrinsic<[llvm_anyfloat_ty],
   [LLVMMatchType<0>, LLVMMatchType<0>],
   [IntrNoMem, IntrSpeculatable, Commutative]
 >;
+def int_minimum : Intrinsic<[llvm_anyfloat_ty],
+  [LLVMMatchType<0>, LLVMMatchType<0>],
+  [IntrNoMem, IntrSpeculatable, Commutative]
+>;
+def int_maximum : Intrinsic<[llvm_anyfloat_ty],
+  [LLVMMatchType<0>, LLVMMatchType<0>],
+  [IntrNoMem, IntrSpeculatable, Commutative]
+>;
 
 // NOTE: these are internal interfaces.
 def int_setjmp     : Intrinsic<[llvm_i32_ty],  [llvm_ptr_ty]>;
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 868160c77a3..f7866665bcb 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5584,6 +5584,18 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
                              getValue(I.getArgOperand(1))));
     return nullptr;
   }
+  case Intrinsic::minimum:
+    setValue(&I, DAG.getNode(ISD::FMINNAN, sdl,
+                             getValue(I.getArgOperand(0)).getValueType(),
+                             getValue(I.getArgOperand(0)),
+                             getValue(I.getArgOperand(1))));
+    return nullptr;
+  case Intrinsic::maximum:
+    setValue(&I, DAG.getNode(ISD::FMAXNAN, sdl,
+                             getValue(I.getArgOperand(0)).getValueType(),
+                             getValue(I.getArgOperand(0)),
+                             getValue(I.getArgOperand(1))));
+    return nullptr;
   case Intrinsic::copysign:
     setValue(&I, DAG.getNode(ISD::FCOPYSIGN, sdl,
                              getValue(I.getArgOperand(0)).getValueType(),
diff --git a/unittests/ADT/APFloatTest.cpp b/unittests/ADT/APFloatTest.cpp
index 1212b45fb57..b739e857849 100644
--- a/unittests/ADT/APFloatTest.cpp
+++ b/unittests/ADT/APFloatTest.cpp
@@ -555,6 +555,36 @@ TEST(APFloatTest, MaxNum) {
   EXPECT_EQ(1.0, maxnum(nan, f1).convertToDouble());
 }
 
+TEST(APFloatTest, Minimum) {
+  APFloat f1(1.0);
+  APFloat f2(2.0);
+  APFloat zp(0.0);
+  APFloat zn(-0.0);
+  APFloat nan = APFloat::getNaN(APFloat::IEEEdouble());
+
+  EXPECT_EQ(1.0, minimum(f1, f2).convertToDouble());
+  EXPECT_EQ(1.0, minimum(f2, f1).convertToDouble());
+  EXPECT_EQ(-0.0, minimum(zp, zn).convertToDouble());
+  EXPECT_EQ(-0.0, minimum(zn, zp).convertToDouble());
+  EXPECT_TRUE(std::isnan(minimum(f1, nan).convertToDouble()));
+  EXPECT_TRUE(std::isnan(minimum(nan, f1).convertToDouble()));
+}
+
+TEST(APFloatTest, Maximum) {
+  APFloat f1(1.0);
+  APFloat f2(2.0);
+  APFloat zp(0.0);
+  APFloat zn(-0.0);
+  APFloat nan = APFloat::getNaN(APFloat::IEEEdouble());
+
+  EXPECT_EQ(2.0, maximum(f1, f2).convertToDouble());
+  EXPECT_EQ(2.0, maximum(f2, f1).convertToDouble());
+  EXPECT_EQ(0.0, maximum(zp, zn).convertToDouble());
+  EXPECT_EQ(0.0, maximum(zn, zp).convertToDouble());
+  EXPECT_TRUE(std::isnan(maximum(f1, nan).convertToDouble()));
+  EXPECT_TRUE(std::isnan(maximum(nan, f1).convertToDouble()));
+}
+
 TEST(APFloatTest, Denormal) {
   APFloat::roundingMode rdmd = APFloat::rmNearestTiesToEven;
 
diff --git a/unittests/IR/IRBuilderTest.cpp b/unittests/IR/IRBuilderTest.cpp
index 42c0393d382..713c0a14f66 100644
--- a/unittests/IR/IRBuilderTest.cpp
+++ b/unittests/IR/IRBuilderTest.cpp
@@ -68,6 +68,14 @@ TEST_F(IRBuilderTest, Intrinsics) {
   II = cast<IntrinsicInst>(Call);
   EXPECT_EQ(II->getIntrinsicID(), Intrinsic::maxnum);
 
+  Call = Builder.CreateMinimum(V, V);
+  II = cast<IntrinsicInst>(Call);
+  EXPECT_EQ(II->getIntrinsicID(), Intrinsic::minimum);
+
+  Call = Builder.CreateMaximum(V, V);
+  II = cast<IntrinsicInst>(Call);
+  EXPECT_EQ(II->getIntrinsicID(), Intrinsic::maximum);
+
   Call = Builder.CreateIntrinsic(Intrinsic::readcyclecounter, {}, {});
   II = cast<IntrinsicInst>(Call);
   EXPECT_EQ(II->getIntrinsicID(), Intrinsic::readcyclecounter);
-- 
GitLab


From 3baba1cf36ad30ab99ca557ab8f208c3be88aaaf Mon Sep 17 00:00:00 2001
From: Thomas Lively <tlively@google.com>
Date: Sat, 13 Oct 2018 07:26:10 +0000
Subject: [PATCH 0140/1116] [WebAssembly] SIMD min and max

Summary: Depends on D52324 and D52764.

Reviewers: aheejin, dschuff

Subscribers: sbc100, jgravelle-google, sunfish, llvm-commits

Differential Revision: https://reviews.llvm.org/D52325

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344438 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../WebAssembly/WebAssemblyInstrSIMD.td       |  14 +-
 test/CodeGen/WebAssembly/f32.ll               |  24 ++-
 test/CodeGen/WebAssembly/f64.ll               |  24 ++-
 test/CodeGen/WebAssembly/simd-arith.ll        | 168 ++++++++++++++++++
 test/MC/WebAssembly/simd-encodings.s          |  12 ++
 5 files changed, 223 insertions(+), 19 deletions(-)

diff --git a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 1eb38588c81..af5c03599cd 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -699,21 +699,21 @@ defm "" : SIMDAbs<v2f64, "f64x2", 128>;
 // Floating-point min and max
 //===----------------------------------------------------------------------===//
 
+multiclass SIMDBinaryFP<SDNode node, string name, bits<32> baseInst> {
+  defm "" : SIMDBinary<v4f32, "f32x4", node, name, baseInst>;
+  defm "" : SIMDBinary<v2f64, "f64x2", node, name, !add(baseInst, 1)>;
+}
+
 // NaN-propagating minimum: min
-// TODO
+defm MIN : SIMDBinaryFP<fminnan, "min", 129>;
 
 // NaN-propagating maximum: max
-// TODO
+defm MAX : SIMDBinaryFP<fmaxnan, "max", 131>;
 
 //===----------------------------------------------------------------------===//
 // Floating-point arithmetic
 //===----------------------------------------------------------------------===//
 
-multiclass SIMDBinaryFP<SDNode node, string name, bits<32> baseInst> {
-  defm "" : SIMDBinary<v4f32, "f32x4", node, name, baseInst>;
-  defm "" : SIMDBinary<v2f64, "f64x2", node, name, !add(baseInst, 1)>;
-}
-
 // Addition: add
 let isCommutable = 1 in
 defm ADD : SIMDBinaryFP<fadd, "add", 133>;
diff --git a/test/CodeGen/WebAssembly/f32.ll b/test/CodeGen/WebAssembly/f32.ll
index 9314b2e6e5f..27520d035c9 100644
--- a/test/CodeGen/WebAssembly/f32.ll
+++ b/test/CodeGen/WebAssembly/f32.ll
@@ -123,12 +123,6 @@ define float @nearest32_via_rint(float %x) {
   ret float %a
 }
 
-; Min and max tests. LLVM currently only forms fminnan and fmaxnan nodes in
-; cases where there's a single fcmp with a select and it can prove that one
-; of the arms is never NaN, so we only test that case. In the future if LLVM
-; learns to form fminnan/fmaxnan in more cases, we can write more general
-; tests.
-
 ; CHECK-LABEL: fmin32:
 ; CHECK: f32.min $push1=, $pop{{[0-9]+}}, $pop[[LR]]{{$}}
 ; CHECK-NEXT: return $pop1{{$}}
@@ -147,6 +141,24 @@ define float @fmax32(float %x) {
   ret float %b
 }
 
+; CHECK-LABEL: fmin32_intrinsic:
+; CHECK: f32.min $push0=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
+; CHECK-NEXT: return $pop0{{$}}
+declare float @llvm.minimum.f32(float, float)
+define float @fmin32_intrinsic(float %x, float %y) {
+  %a = call float @llvm.minimum.f32(float %x, float %y)
+  ret float %a
+}
+
+; CHECK-LABEL: fmax32_intrinsic:
+; CHECK: f32.max $push0=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
+; CHECK-NEXT: return $pop0{{$}}
+declare float @llvm.maximum.f32(float, float)
+define float @fmax32_intrinsic(float %x, float %y) {
+  %a = call float @llvm.maximum.f32(float %x, float %y)
+  ret float %a
+}
+
 ; CHECK-LABEL: fma32:
 ; CHECK: {{^}} f32.call $push[[LR:[0-9]+]]=, fmaf@FUNCTION, $pop{{[0-9]+}}, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
 ; CHECK-NEXT: return $pop[[LR]]{{$}}
diff --git a/test/CodeGen/WebAssembly/f64.ll b/test/CodeGen/WebAssembly/f64.ll
index 5635e326561..d02767fa3a1 100644
--- a/test/CodeGen/WebAssembly/f64.ll
+++ b/test/CodeGen/WebAssembly/f64.ll
@@ -123,12 +123,6 @@ define double @nearest64_via_rint(double %x) {
   ret double %a
 }
 
-; Min and max tests. LLVM currently only forms fminnan and fmaxnan nodes in
-; cases where there's a single fcmp with a select and it can prove that one
-; of the arms is never NaN, so we only test that case. In the future if LLVM
-; learns to form fminnan/fmaxnan in more cases, we can write more general
-; tests.
-
 ; CHECK-LABEL: fmin64:
 ; CHECK: f64.min $push1=, $pop{{[0-9]+}}, $pop[[LR]]{{$}}
 ; CHECK-NEXT: return $pop1{{$}}
@@ -147,6 +141,24 @@ define double @fmax64(double %x) {
   ret double %b
 }
 
+; CHECK-LABEL: fmin64_intrinsic:
+; CHECK: f64.min $push0=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
+; CHECK-NEXT: return $pop0{{$}}
+declare double @llvm.minimum.f64(double, double)
+define double @fmin64_intrinsic(double %x, double %y) {
+  %a = call double @llvm.minimum.f64(double %x, double %y)
+  ret double %a
+}
+
+; CHECK-LABEL: fmax64_intrinsic:
+; CHECK: f64.max $push0=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
+; CHECK-NEXT: return $pop0{{$}}
+declare double @llvm.maximum.f64(double, double)
+define double @fmax64_intrinsic(double %x, double %y) {
+  %a = call double @llvm.maximum.f64(double %x, double %y)
+  ret double %a
+}
+
 ; CHECK-LABEL: fma64:
 ; CHECK: {{^}} f64.call $push[[LR:[0-9]+]]=, fma@FUNCTION, $pop{{[0-9]+}}, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
 ; CHECK-NEXT: return $pop[[LR]]{{$}}
diff --git a/test/CodeGen/WebAssembly/simd-arith.ll b/test/CodeGen/WebAssembly/simd-arith.ll
index f3e70156d8b..973f78b30dc 100644
--- a/test/CodeGen/WebAssembly/simd-arith.ll
+++ b/test/CodeGen/WebAssembly/simd-arith.ll
@@ -765,6 +765,90 @@ define <4 x float> @abs_v4f32(<4 x float> %x) {
   ret <4 x float> %a
 }
 
+; CHECK-LABEL: min_unordered_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32.const $push[[L0:[0-9]+]]=, 0x1.4p2
+; SIMD128-NEXT: f32x4.splat $push[[L1:[0-9]+]]=, $pop[[L0]]
+; SIMD128-NEXT: f32x4.min $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x float> @min_unordered_v4f32(<4 x float> %x) {
+  %cmps = fcmp ule <4 x float> %x, <float 5., float 5., float 5., float 5.>
+  %a = select <4 x i1> %cmps, <4 x float> %x,
+    <4 x float> <float 5., float 5., float 5., float 5.>
+  ret <4 x float> %a
+}
+
+; CHECK-LABEL: max_unordered_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32.const $push[[L0:[0-9]+]]=, 0x1.4p2
+; SIMD128-NEXT: f32x4.splat $push[[L1:[0-9]+]]=, $pop[[L0]]
+; SIMD128-NEXT: f32x4.max $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x float> @max_unordered_v4f32(<4 x float> %x) {
+  %cmps = fcmp uge <4 x float> %x, <float 5., float 5., float 5., float 5.>
+  %a = select <4 x i1> %cmps, <4 x float> %x,
+    <4 x float> <float 5., float 5., float 5., float 5.>
+  ret <4 x float> %a
+}
+
+; CHECK-LABEL: min_ordered_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32.const $push[[L0:[0-9]+]]=, 0x1.4p2
+; SIMD128-NEXT: f32x4.splat $push[[L1:[0-9]+]]=, $pop[[L0]]
+; SIMD128-NEXT: f32x4.min $push[[R:[0-9]+]]=, $pop[[L1]], $0{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x float> @min_ordered_v4f32(<4 x float> %x) {
+  %cmps = fcmp ole <4 x float> <float 5., float 5., float 5., float 5.>, %x
+  %a = select <4 x i1> %cmps,
+    <4 x float> <float 5., float 5., float 5., float 5.>, <4 x float> %x
+  ret <4 x float> %a
+}
+
+; CHECK-LABEL: max_ordered_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32.const $push[[L0:[0-9]+]]=, 0x1.4p2
+; SIMD128-NEXT: f32x4.splat $push[[L1:[0-9]+]]=, $pop[[L0]]
+; SIMD128-NEXT: f32x4.max $push[[R:[0-9]+]]=, $pop[[L1]], $0{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x float> @max_ordered_v4f32(<4 x float> %x) {
+  %cmps = fcmp oge <4 x float> <float 5., float 5., float 5., float 5.>, %x
+  %a = select <4 x i1> %cmps,
+    <4 x float> <float 5., float 5., float 5., float 5.>, <4 x float> %x
+  ret <4 x float> %a
+}
+
+; CHECK-LABEL: min_intrinsic_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32x4.min $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+declare <4 x float> @llvm.minimum.v4f32(<4 x float>, <4 x float>)
+define <4 x float> @min_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
+  %a = call <4 x float> @llvm.minimum.v4f32(<4 x float> %x, <4 x float> %y)
+  ret <4 x float> %a
+}
+
+; CHECK-LABEL: max_intrinsic_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32x4.max $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+declare <4 x float> @llvm.maximum.v4f32(<4 x float>, <4 x float>)
+define <4 x float> @max_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
+  %a = call <4 x float> @llvm.maximum.v4f32(<4 x float> %x, <4 x float> %y)
+  ret <4 x float> %a
+}
+
 ; CHECK-LABEL: add_v4f32:
 ; NO-SIMD128-NOT: f32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -848,6 +932,90 @@ define <2 x double> @abs_v2f64(<2 x double> %x) {
   ret <2 x double> %a
 }
 
+; CHECK-LABEL: min_unordered_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64.const $push[[L0:[0-9]+]]=, 0x1.4p2
+; SIMD128-NEXT: f64x2.splat $push[[L1:[0-9]+]]=, $pop[[L0]]
+; SIMD128-NEXT: f64x2.min $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x double> @min_unordered_v2f64(<2 x double> %x) {
+  %cmps = fcmp ule <2 x double> %x, <double 5., double 5.>
+  %a = select <2 x i1> %cmps, <2 x double> %x,
+    <2 x double> <double 5., double 5.>
+  ret <2 x double> %a
+}
+
+; CHECK-LABEL: max_unordered_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64.const $push[[L0:[0-9]+]]=, 0x1.4p2
+; SIMD128-NEXT: f64x2.splat $push[[L1:[0-9]+]]=, $pop[[L0]]
+; SIMD128-NEXT: f64x2.max $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x double> @max_unordered_v2f64(<2 x double> %x) {
+  %cmps = fcmp uge <2 x double> %x, <double 5., double 5.>
+  %a = select <2 x i1> %cmps, <2 x double> %x,
+    <2 x double> <double 5., double 5.>
+  ret <2 x double> %a
+}
+
+; CHECK-LABEL: min_ordered_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64.const $push[[L0:[0-9]+]]=, 0x1.4p2
+; SIMD128-NEXT: f64x2.splat $push[[L1:[0-9]+]]=, $pop[[L0]]
+; SIMD128-NEXT: f64x2.min $push[[R:[0-9]+]]=, $pop[[L1]], $0{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x double> @min_ordered_v2f64(<2 x double> %x) {
+  %cmps = fcmp ole <2 x double> <double 5., double 5.>, %x
+  %a = select <2 x i1> %cmps, <2 x double> <double 5., double 5.>,
+    <2 x double> %x
+  ret <2 x double> %a
+}
+
+; CHECK-LABEL: max_ordered_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64.const $push[[L0:[0-9]+]]=, 0x1.4p2
+; SIMD128-NEXT: f64x2.splat $push[[L1:[0-9]+]]=, $pop[[L0]]
+; SIMD128-NEXT: f64x2.max $push[[R:[0-9]+]]=, $pop[[L1]], $0{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x double> @max_ordered_v2f64(<2 x double> %x) {
+  %cmps = fcmp oge <2 x double> <double 5., double 5.>, %x
+  %a = select <2 x i1> %cmps, <2 x double> <double 5., double 5.>,
+    <2 x double> %x
+  ret <2 x double> %a
+}
+
+; CHECK-LABEL: min_intrinsic_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64x2.min $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+declare <2 x double> @llvm.minimum.v2f64(<2 x double>, <2 x double>)
+define <2 x double> @min_intrinsic_v2f64(<2 x double> %x, <2 x double> %y) {
+  %a = call <2 x double> @llvm.minimum.v2f64(<2 x double> %x, <2 x double> %y)
+  ret <2 x double> %a
+}
+
+; CHECK-LABEL: max_intrinsic_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64x2.max $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+declare <2 x double> @llvm.maximum.v2f64(<2 x double>, <2 x double>)
+define <2 x double> @max_intrinsic_v2f64(<2 x double> %x, <2 x double> %y) {
+  %a = call <2 x double> @llvm.maximum.v2f64(<2 x double> %x, <2 x double> %y)
+  ret <2 x double> %a
+}
+
 ; CHECK-LABEL: add_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f62x2
diff --git a/test/MC/WebAssembly/simd-encodings.s b/test/MC/WebAssembly/simd-encodings.s
index 02d07674c16..8cd4bc9cd34 100644
--- a/test/MC/WebAssembly/simd-encodings.s
+++ b/test/MC/WebAssembly/simd-encodings.s
@@ -382,6 +382,18 @@
     # CHECK: f64x2.abs # encoding: [0xfd,0x80]
     f64x2.abs
 
+    # CHECK: f32x4.min # encoding: [0xfd,0x81]
+    f32x4.min
+
+    # CHECK: f64x2.min # encoding: [0xfd,0x82]
+    f64x2.min
+
+    # CHECK: f32x4.max # encoding: [0xfd,0x83]
+    f32x4.max
+
+    # CHECK: f64x2.max # encoding: [0xfd,0x84]
+    f64x2.max
+
     # CHECK: f32x4.add # encoding: [0xfd,0x85]
     f32x4.add
 
-- 
GitLab


From 952b7309b14ccede0e01ccd43f85d8c28aa99280 Mon Sep 17 00:00:00 2001
From: "Arnaud A. de Grandmaison" <arnaud.degrandmaison@arm.com>
Date: Sat, 13 Oct 2018 07:43:56 +0000
Subject: [PATCH 0141/1116] [AArch64] Swap comparison operands if that enables
 some folding.

Summary:
AArch64 can fold some shift+extend operations on the RHS operand of
comparisons, so swap the operands if that makes sense.

This provides a fix for https://bugs.llvm.org/show_bug.cgi?id=38751

Reviewers: efriedma, t.p.northover, javed.absar

Subscribers: mcrosier, kristof.beyls, llvm-commits

Differential Revision: https://reviews.llvm.org/D53067

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344439 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64ISelLowering.cpp    |  86 ++-
 test/CodeGen/AArch64/and-mask-removal.ll      |  12 +-
 .../lack-of-signed-truncation-check.ll        |  45 +-
 test/CodeGen/AArch64/sat-add.ll               |  16 +-
 .../AArch64/signed-truncation-check.ll        |  45 +-
 test/CodeGen/AArch64/swap-compare-operands.ll | 632 ++++++++++++++++++
 6 files changed, 752 insertions(+), 84 deletions(-)
 create mode 100644 test/CodeGen/AArch64/swap-compare-operands.ll

diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8cf9d55a950..90633807cdf 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1460,6 +1460,21 @@ static bool isLegalArithImmed(uint64_t C) {
   return IsLegal;
 }
 
+// Can a (CMP op1, (sub 0, op2) be turned into a CMN instruction on
+// the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
+// can be set differently by this operation. It comes down to whether
+// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
+// everything is fine. If not then the optimization is wrong. Thus general
+// comparisons are only valid if op2 != 0.
+//
+// So, finally, the only LLVM-native comparisons that don't mention C and V
+// are SETEQ and SETNE. They're the only ones we can safely use CMN for in
+// the absence of information about op2.
+static bool isCMN(SDValue Op, ISD::CondCode CC) {
+  return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
+         (CC == ISD::SETEQ || CC == ISD::SETNE);
+}
+
 static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                               const SDLoc &dl, SelectionDAG &DAG) {
   EVT VT = LHS.getValueType();
@@ -1482,18 +1497,8 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
   // register to WZR/XZR if it ends up being unused.
   unsigned Opcode = AArch64ISD::SUBS;
 
-  if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
-      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
-    // We'd like to combine a (CMP op1, (sub 0, op2) into a CMN instruction on
-    // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags
-    // can be set differently by this operation. It comes down to whether
-    // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
-    // everything is fine. If not then the optimization is wrong. Thus general
-    // comparisons are only valid if op2 != 0.
-
-    // So, finally, the only LLVM-native comparisons that don't mention C and V
-    // are SETEQ and SETNE. They're the only ones we can safely use CMN for in
-    // the absence of information about op2.
+  if (isCMN(RHS, CC)) {
+    // Can we combine a (CMP op1, (sub 0, op2) into a CMN instruction ?
     Opcode = AArch64ISD::ADDS;
     RHS = RHS.getOperand(1);
   } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) &&
@@ -1765,6 +1770,42 @@ static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val,
 
 /// @}
 
+/// Returns how profitable it is to fold a comparison's operand's shift and/or
+/// extension operations.
+static unsigned getCmpOperandFoldingProfit(SDValue Op) {
+  auto isSupportedExtend = [&](SDValue V) {
+    if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
+      return true;
+
+    if (V.getOpcode() == ISD::AND)
+      if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
+        uint64_t Mask = MaskCst->getZExtValue();
+        return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
+      }
+
+    return false;
+  };
+
+  if (!Op.hasOneUse())
+    return 0;
+
+  if (isSupportedExtend(Op))
+    return 1;
+
+  unsigned Opc = Op.getOpcode();
+  if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
+    if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+      uint64_t Shift = ShiftCst->getZExtValue();
+      if (isSupportedExtend(Op.getOperand(0)))
+        return (Shift <= 4) ? 2 : 1;
+      EVT VT = Op.getValueType();
+      if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
+        return 1;
+    }
+
+  return 0;
+}
+
 static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                              SDValue &AArch64cc, SelectionDAG &DAG,
                              const SDLoc &dl) {
@@ -1822,6 +1863,27 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
       }
     }
   }
+
+  // Comparisons are canonicalized so that the RHS operand is simpler than the
+  // LHS one, the extreme case being when RHS is an immediate. However, AArch64
+  // can fold some shift+extend operations on the RHS operand, so swap the
+  // operands if that can be done.
+  //
+  // For example:
+  //    lsl     w13, w11, #1
+  //    cmp     w13, w12
+  // can be turned into:
+  //    cmp     w12, w11, lsl #1
+  if (!isa<ConstantSDNode>(RHS) ||
+      !isLegalArithImmed(cast<ConstantSDNode>(RHS)->getZExtValue())) {
+    SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
+
+    if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) {
+      std::swap(LHS, RHS);
+      CC = ISD::getSetCCSwappedOperands(CC);
+    }
+  }
+
   SDValue Cmp;
   AArch64CC::CondCode AArch64CC;
   if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
diff --git a/test/CodeGen/AArch64/and-mask-removal.ll b/test/CodeGen/AArch64/and-mask-removal.ll
index c02bc881cd3..4424b0e4112 100644
--- a/test/CodeGen/AArch64/and-mask-removal.ll
+++ b/test/CodeGen/AArch64/and-mask-removal.ll
@@ -179,7 +179,9 @@ ret_false:
 ret_true:
   ret i1 true
 ; CHECK-LABEL: test16_2
-; CHECK: and
+; CHECK: mov	[[CST:w[0-9]+]], #16882
+; CHECK: add	[[ADD:w[0-9]+]], w0, [[CST]]
+; CHECK: cmp	{{.*}}, [[ADD]], uxth
 ; CHECK: ret
 }
 
@@ -207,7 +209,9 @@ ret_false:
 ret_true:
   ret i1 true
 ; CHECK-LABEL: test16_4
-; CHECK: and
+; CHECK: mov	[[CST:w[0-9]+]], #29985
+; CHECK: add	[[ADD:w[0-9]+]], w0, [[CST]]
+; CHECK: cmp	{{.*}}, [[ADD]], uxth
 ; CHECK: ret
 }
 
@@ -249,7 +253,9 @@ ret_false:
 ret_true:
   ret i1 true
 ; CHECK-LABEL: test16_7
-; CHECK: and
+; CHECK: mov	[[CST:w[0-9]+]], #9272
+; CHECK: add	[[ADD:w[0-9]+]], w0, [[CST]]
+; CHECK: cmp	{{.*}}, [[ADD]], uxth
 ; CHECK: ret
 }
 
diff --git a/test/CodeGen/AArch64/lack-of-signed-truncation-check.ll b/test/CodeGen/AArch64/lack-of-signed-truncation-check.ll
index d8ae73293d9..f4680354d7e 100644
--- a/test/CodeGen/AArch64/lack-of-signed-truncation-check.ll
+++ b/test/CodeGen/AArch64/lack-of-signed-truncation-check.ll
@@ -35,8 +35,7 @@ define i1 @shifts_necmp_i16_i8(i16 %x) nounwind {
 define i1 @shifts_necmp_i32_i16(i32 %x) nounwind {
 ; CHECK-LABEL: shifts_necmp_i32_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxth w8, w0
-; CHECK-NEXT:    cmp w8, w0
+; CHECK-NEXT:    cmp w0, w0, sxth
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %tmp0 = shl i32 %x, 16 ; 32-16
@@ -48,8 +47,7 @@ define i1 @shifts_necmp_i32_i16(i32 %x) nounwind {
 define i1 @shifts_necmp_i32_i8(i32 %x) nounwind {
 ; CHECK-LABEL: shifts_necmp_i32_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxtb w8, w0
-; CHECK-NEXT:    cmp w8, w0
+; CHECK-NEXT:    cmp w0, w0, sxtb
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %tmp0 = shl i32 %x, 24 ; 32-8
@@ -61,8 +59,7 @@ define i1 @shifts_necmp_i32_i8(i32 %x) nounwind {
 define i1 @shifts_necmp_i64_i32(i64 %x) nounwind {
 ; CHECK-LABEL: shifts_necmp_i64_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxtw x8, w0
-; CHECK-NEXT:    cmp x8, x0
+; CHECK-NEXT:    cmp x0, w0, sxtw
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %tmp0 = shl i64 %x, 32 ; 64-32
@@ -74,8 +71,7 @@ define i1 @shifts_necmp_i64_i32(i64 %x) nounwind {
 define i1 @shifts_necmp_i64_i16(i64 %x) nounwind {
 ; CHECK-LABEL: shifts_necmp_i64_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxth x8, w0
-; CHECK-NEXT:    cmp x8, x0
+; CHECK-NEXT:    cmp x0, w0, sxth
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %tmp0 = shl i64 %x, 48 ; 64-16
@@ -87,8 +83,7 @@ define i1 @shifts_necmp_i64_i16(i64 %x) nounwind {
 define i1 @shifts_necmp_i64_i8(i64 %x) nounwind {
 ; CHECK-LABEL: shifts_necmp_i64_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxtb x8, w0
-; CHECK-NEXT:    cmp x8, x0
+; CHECK-NEXT:    cmp x0, w0, sxtb
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %tmp0 = shl i64 %x, 56 ; 64-8
@@ -117,8 +112,7 @@ define i1 @add_ultcmp_i16_i8(i16 %x) nounwind {
 define i1 @add_ultcmp_i32_i16(i32 %x) nounwind {
 ; CHECK-LABEL: add_ultcmp_i32_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxth w8, w0
-; CHECK-NEXT:    cmp w8, w0
+; CHECK-NEXT:    cmp w0, w0, sxth
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %tmp0 = add i32 %x, -32768 ; ~0U << (16-1)
@@ -129,8 +123,7 @@ define i1 @add_ultcmp_i32_i16(i32 %x) nounwind {
 define i1 @add_ultcmp_i32_i8(i32 %x) nounwind {
 ; CHECK-LABEL: add_ultcmp_i32_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxtb w8, w0
-; CHECK-NEXT:    cmp w8, w0
+; CHECK-NEXT:    cmp w0, w0, sxtb
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %tmp0 = add i32 %x, -128 ; ~0U << (8-1)
@@ -141,8 +134,7 @@ define i1 @add_ultcmp_i32_i8(i32 %x) nounwind {
 define i1 @add_ultcmp_i64_i32(i64 %x) nounwind {
 ; CHECK-LABEL: add_ultcmp_i64_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxtw x8, w0
-; CHECK-NEXT:    cmp x8, x0
+; CHECK-NEXT:    cmp x0, w0, sxtw
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %tmp0 = add i64 %x, -2147483648 ; ~0U << (32-1)
@@ -153,8 +145,7 @@ define i1 @add_ultcmp_i64_i32(i64 %x) nounwind {
 define i1 @add_ultcmp_i64_i16(i64 %x) nounwind {
 ; CHECK-LABEL: add_ultcmp_i64_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxth x8, w0
-; CHECK-NEXT:    cmp x8, x0
+; CHECK-NEXT:    cmp x0, w0, sxth
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %tmp0 = add i64 %x, -32768 ; ~0U << (16-1)
@@ -165,8 +156,7 @@ define i1 @add_ultcmp_i64_i16(i64 %x) nounwind {
 define i1 @add_ultcmp_i64_i8(i64 %x) nounwind {
 ; CHECK-LABEL: add_ultcmp_i64_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxtb x8, w0
-; CHECK-NEXT:    cmp x8, x0
+; CHECK-NEXT:    cmp x0, w0, sxtb
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %tmp0 = add i64 %x, -128 ; ~0U << (8-1)
@@ -208,8 +198,7 @@ define i1 @add_ugecmp_i16_i8(i16 %x) nounwind {
 define i1 @add_ugecmp_i32_i16(i32 %x) nounwind {
 ; CHECK-LABEL: add_ugecmp_i32_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxth w8, w0
-; CHECK-NEXT:    cmp w8, w0
+; CHECK-NEXT:    cmp w0, w0, sxth
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %tmp0 = add i32 %x, 32768 ; 1U << (16-1)
@@ -220,8 +209,7 @@ define i1 @add_ugecmp_i32_i16(i32 %x) nounwind {
 define i1 @add_ugecmp_i32_i8(i32 %x) nounwind {
 ; CHECK-LABEL: add_ugecmp_i32_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxtb w8, w0
-; CHECK-NEXT:    cmp w8, w0
+; CHECK-NEXT:    cmp w0, w0, sxtb
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %tmp0 = add i32 %x, 128 ; 1U << (8-1)
@@ -232,8 +220,7 @@ define i1 @add_ugecmp_i32_i8(i32 %x) nounwind {
 define i1 @add_ugecmp_i64_i32(i64 %x) nounwind {
 ; CHECK-LABEL: add_ugecmp_i64_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxtw x8, w0
-; CHECK-NEXT:    cmp x8, x0
+; CHECK-NEXT:    cmp x0, w0, sxtw
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %tmp0 = add i64 %x, 2147483648 ; 1U << (32-1)
@@ -244,8 +231,7 @@ define i1 @add_ugecmp_i64_i32(i64 %x) nounwind {
 define i1 @add_ugecmp_i64_i16(i64 %x) nounwind {
 ; CHECK-LABEL: add_ugecmp_i64_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxth x8, w0
-; CHECK-NEXT:    cmp x8, x0
+; CHECK-NEXT:    cmp x0, w0, sxth
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %tmp0 = add i64 %x, 32768 ; 1U << (16-1)
@@ -256,8 +242,7 @@ define i1 @add_ugecmp_i64_i16(i64 %x) nounwind {
 define i1 @add_ugecmp_i64_i8(i64 %x) nounwind {
 ; CHECK-LABEL: add_ugecmp_i64_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxtb x8, w0
-; CHECK-NEXT:    cmp x8, x0
+; CHECK-NEXT:    cmp x0, w0, sxtb
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %tmp0 = add i64 %x, 128 ; 1U << (8-1)
diff --git a/test/CodeGen/AArch64/sat-add.ll b/test/CodeGen/AArch64/sat-add.ll
index d9082859988..4d865a2b14b 100644
--- a/test/CodeGen/AArch64/sat-add.ll
+++ b/test/CodeGen/AArch64/sat-add.ll
@@ -52,11 +52,10 @@ define i8 @unsigned_sat_constant_i8_using_cmp_notval(i8 %x) {
 define i16 @unsigned_sat_constant_i16_using_min(i16 %x) {
 ; CHECK-LABEL: unsigned_sat_constant_i16_using_min:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0xffff
-; CHECK-NEXT:    mov w9, #65493
-; CHECK-NEXT:    cmp w8, w9
+; CHECK-NEXT:    mov w8, #65493
+; CHECK-NEXT:    cmp w8, w0, uxth
 ; CHECK-NEXT:    mov w8, #-43
-; CHECK-NEXT:    csel w8, w0, w8, lo
+; CHECK-NEXT:    csel w8, w0, w8, hi
 ; CHECK-NEXT:    add w0, w8, #42 // =42
 ; CHECK-NEXT:    ret
   %c = icmp ult i16 %x, -43
@@ -82,11 +81,10 @@ define i16 @unsigned_sat_constant_i16_using_cmp_sum(i16 %x) {
 define i16 @unsigned_sat_constant_i16_using_cmp_notval(i16 %x) {
 ; CHECK-LABEL: unsigned_sat_constant_i16_using_cmp_notval:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0xffff
-; CHECK-NEXT:    mov w10, #65493
-; CHECK-NEXT:    add w9, w0, #42 // =42
-; CHECK-NEXT:    cmp w8, w10
-; CHECK-NEXT:    csinv w0, w9, wzr, ls
+; CHECK-NEXT:    mov w9, #65493
+; CHECK-NEXT:    add w8, w0, #42 // =42
+; CHECK-NEXT:    cmp w9, w0, uxth
+; CHECK-NEXT:    csinv w0, w8, wzr, hs
 ; CHECK-NEXT:    ret
   %a = add i16 %x, 42
   %c = icmp ugt i16 %x, -43
diff --git a/test/CodeGen/AArch64/signed-truncation-check.ll b/test/CodeGen/AArch64/signed-truncation-check.ll
index f475dbc2f74..edd61b10d00 100644
--- a/test/CodeGen/AArch64/signed-truncation-check.ll
+++ b/test/CodeGen/AArch64/signed-truncation-check.ll
@@ -35,8 +35,7 @@ define i1 @shifts_eqcmp_i16_i8(i16 %x) nounwind {
 define i1 @shifts_eqcmp_i32_i16(i32 %x) nounwind {
 ; CHECK-LABEL: shifts_eqcmp_i32_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxth w8, w0
-; CHECK-NEXT:    cmp w8, w0
+; CHECK-NEXT:    cmp w0, w0, sxth
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %tmp0 = shl i32 %x, 16 ; 32-16
@@ -48,8 +47,7 @@ define i1 @shifts_eqcmp_i32_i16(i32 %x) nounwind {
 define i1 @shifts_eqcmp_i32_i8(i32 %x) nounwind {
 ; CHECK-LABEL: shifts_eqcmp_i32_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxtb w8, w0
-; CHECK-NEXT:    cmp w8, w0
+; CHECK-NEXT:    cmp w0, w0, sxtb
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %tmp0 = shl i32 %x, 24 ; 32-8
@@ -61,8 +59,7 @@ define i1 @shifts_eqcmp_i32_i8(i32 %x) nounwind {
 define i1 @shifts_eqcmp_i64_i32(i64 %x) nounwind {
 ; CHECK-LABEL: shifts_eqcmp_i64_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxtw x8, w0
-; CHECK-NEXT:    cmp x8, x0
+; CHECK-NEXT:    cmp x0, w0, sxtw
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %tmp0 = shl i64 %x, 32 ; 64-32
@@ -74,8 +71,7 @@ define i1 @shifts_eqcmp_i64_i32(i64 %x) nounwind {
 define i1 @shifts_eqcmp_i64_i16(i64 %x) nounwind {
 ; CHECK-LABEL: shifts_eqcmp_i64_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxth x8, w0
-; CHECK-NEXT:    cmp x8, x0
+; CHECK-NEXT:    cmp x0, w0, sxth
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %tmp0 = shl i64 %x, 48 ; 64-16
@@ -87,8 +83,7 @@ define i1 @shifts_eqcmp_i64_i16(i64 %x) nounwind {
 define i1 @shifts_eqcmp_i64_i8(i64 %x) nounwind {
 ; CHECK-LABEL: shifts_eqcmp_i64_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxtb x8, w0
-; CHECK-NEXT:    cmp x8, x0
+; CHECK-NEXT:    cmp x0, w0, sxtb
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %tmp0 = shl i64 %x, 56 ; 64-8
@@ -117,8 +112,7 @@ define i1 @add_ugecmp_i16_i8(i16 %x) nounwind {
 define i1 @add_ugecmp_i32_i16(i32 %x) nounwind {
 ; CHECK-LABEL: add_ugecmp_i32_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxth w8, w0
-; CHECK-NEXT:    cmp w8, w0
+; CHECK-NEXT:    cmp w0, w0, sxth
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %tmp0 = add i32 %x, -32768 ; ~0U << (16-1)
@@ -129,8 +123,7 @@ define i1 @add_ugecmp_i32_i16(i32 %x) nounwind {
 define i1 @add_ugecmp_i32_i8(i32 %x) nounwind {
 ; CHECK-LABEL: add_ugecmp_i32_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxtb w8, w0
-; CHECK-NEXT:    cmp w8, w0
+; CHECK-NEXT:    cmp w0, w0, sxtb
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %tmp0 = add i32 %x, -128 ; ~0U << (8-1)
@@ -141,8 +134,7 @@ define i1 @add_ugecmp_i32_i8(i32 %x) nounwind {
 define i1 @add_ugecmp_i64_i32(i64 %x) nounwind {
 ; CHECK-LABEL: add_ugecmp_i64_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxtw x8, w0
-; CHECK-NEXT:    cmp x8, x0
+; CHECK-NEXT:    cmp x0, w0, sxtw
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %tmp0 = add i64 %x, -2147483648 ; ~0U << (32-1)
@@ -153,8 +145,7 @@ define i1 @add_ugecmp_i64_i32(i64 %x) nounwind {
 define i1 @add_ugecmp_i64_i16(i64 %x) nounwind {
 ; CHECK-LABEL: add_ugecmp_i64_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxth x8, w0
-; CHECK-NEXT:    cmp x8, x0
+; CHECK-NEXT:    cmp x0, w0, sxth
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %tmp0 = add i64 %x, -32768 ; ~0U << (16-1)
@@ -165,8 +156,7 @@ define i1 @add_ugecmp_i64_i16(i64 %x) nounwind {
 define i1 @add_ugecmp_i64_i8(i64 %x) nounwind {
 ; CHECK-LABEL: add_ugecmp_i64_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxtb x8, w0
-; CHECK-NEXT:    cmp x8, x0
+; CHECK-NEXT:    cmp x0, w0, sxtb
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %tmp0 = add i64 %x, -128 ; ~0U << (8-1)
@@ -208,8 +198,7 @@ define i1 @add_ultcmp_i16_i8(i16 %x) nounwind {
 define i1 @add_ultcmp_i32_i16(i32 %x) nounwind {
 ; CHECK-LABEL: add_ultcmp_i32_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxth w8, w0
-; CHECK-NEXT:    cmp w8, w0
+; CHECK-NEXT:    cmp w0, w0, sxth
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %tmp0 = add i32 %x, 32768 ; 1U << (16-1)
@@ -220,8 +209,7 @@ define i1 @add_ultcmp_i32_i16(i32 %x) nounwind {
 define i1 @add_ultcmp_i32_i8(i32 %x) nounwind {
 ; CHECK-LABEL: add_ultcmp_i32_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxtb w8, w0
-; CHECK-NEXT:    cmp w8, w0
+; CHECK-NEXT:    cmp w0, w0, sxtb
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %tmp0 = add i32 %x, 128 ; 1U << (8-1)
@@ -232,8 +220,7 @@ define i1 @add_ultcmp_i32_i8(i32 %x) nounwind {
 define i1 @add_ultcmp_i64_i32(i64 %x) nounwind {
 ; CHECK-LABEL: add_ultcmp_i64_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxtw x8, w0
-; CHECK-NEXT:    cmp x8, x0
+; CHECK-NEXT:    cmp x0, w0, sxtw
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %tmp0 = add i64 %x, 2147483648 ; 1U << (32-1)
@@ -244,8 +231,7 @@ define i1 @add_ultcmp_i64_i32(i64 %x) nounwind {
 define i1 @add_ultcmp_i64_i16(i64 %x) nounwind {
 ; CHECK-LABEL: add_ultcmp_i64_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxth x8, w0
-; CHECK-NEXT:    cmp x8, x0
+; CHECK-NEXT:    cmp x0, w0, sxth
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %tmp0 = add i64 %x, 32768 ; 1U << (16-1)
@@ -256,8 +242,7 @@ define i1 @add_ultcmp_i64_i16(i64 %x) nounwind {
 define i1 @add_ultcmp_i64_i8(i64 %x) nounwind {
 ; CHECK-LABEL: add_ultcmp_i64_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxtb x8, w0
-; CHECK-NEXT:    cmp x8, x0
+; CHECK-NEXT:    cmp x0, w0, sxtb
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %tmp0 = add i64 %x, 128 ; 1U << (8-1)
diff --git a/test/CodeGen/AArch64/swap-compare-operands.ll b/test/CodeGen/AArch64/swap-compare-operands.ll
new file mode 100644
index 00000000000..7c19b911166
--- /dev/null
+++ b/test/CodeGen/AArch64/swap-compare-operands.ll
@@ -0,0 +1,632 @@
+; RUN: llc < %s -mtriple=arm64 | FileCheck %s
+
+define i1 @testSwapCmpWithLSL64_1(i64 %a, i64 %b) {
+; CHECK-LABEL testSwapCmpWithLSL64_1:
+; CHECK:      cmp     x1, x0, lsl #1
+; CHECK-NEXT: cset    w0, gt
+entry:
+  %shl = shl i64 %a, 1
+  %cmp = icmp slt i64 %shl, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithLSL64_63(i64 %a, i64 %b) {
+; CHECK-LABEL testSwapCmpWithLSL64_63:
+; CHECK:      cmp     x1, x0, lsl #63
+; CHECK-NEXT: cset    w0, gt
+entry:
+  %shl = shl i64 %a, 63
+  %cmp = icmp slt i64 %shl, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithLSL32_1(i32 %a, i32 %b) {
+; CHECK-LABEL testSwapCmpWithLSL32_1:
+; CHECK:      cmp     w1, w0, lsl #1
+; CHECK-NEXT: cset    w0, gt
+entry:
+  %shl = shl i32 %a, 1
+  %cmp = icmp slt i32 %shl, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithLSL32_31(i32 %a, i32 %b) {
+; CHECK-LABEL testSwapCmpWithLSL32_31:
+; CHECK:      cmp     w1, w0, lsl #31
+; CHECK-NEXT: cset    w0, gt
+entry:
+  %shl = shl i32 %a, 31
+  %cmp = icmp slt i32 %shl, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithLSR64_1(i64 %a, i64 %b) {
+; CHECK-LABEL testSwapCmpWithLSR64_1:
+; CHECK:      cmp     x1, x0, lsr #1
+; CHECK-NEXT: cset    w0, gt
+entry:
+  %lshr = lshr i64 %a, 1
+  %cmp = icmp slt i64 %lshr, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithLSR64_63(i64 %a, i64 %b) {
+; CHECK-LABEL testSwapCmpWithLSR64_63:
+; CHECK:      cmp     x1, x0, lsr #63
+; CHECK-NEXT: cset    w0, gt
+entry:
+  %lshr = lshr i64 %a, 63
+  %cmp = icmp slt i64 %lshr, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithLSR32_1(i32 %a, i32 %b) {
+; CHECK-LABEL testSwapCmpWithLSR32_1:
+; CHECK:      cmp     w1, w0, lsr #1
+; CHECK-NEXT: cset    w0, gt
+entry:
+  %lshr = lshr i32 %a, 1
+  %cmp = icmp slt i32 %lshr, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithLSR32_31(i32 %a, i32 %b) {
+; CHECK-LABEL testSwapCmpWithLSR32_31:
+; CHECK:      cmp     w1, w0, lsr #31
+; CHECK-NEXT: cset    w0, gt
+entry:
+  %lshr = lshr i32 %a, 31
+  %cmp = icmp slt i32 %lshr, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithASR64_1(i64 %a, i64 %b) {
+; CHECK-LABEL testSwapCmpWithASR64_1:
+; CHECK:      cmp     x1, x0, asr #1
+; CHECK-NEXT: cset    w0, gt
+entry:
+  %ashr = ashr i64 %a, 1
+  %cmp = icmp slt i64 %ashr, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithASR64_63(i64 %a, i64 %b) {
+; CHECK-LABEL testSwapCmpWithASR64_63:
+; CHECK:      cmp     x1, x0, asr #63
+; CHECK-NEXT: cset    w0, gt
+entry:
+  %ashr = ashr i64 %a, 63
+  %cmp = icmp slt i64 %ashr, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithASR32_1(i32 %a, i32 %b) {
+; CHECK-LABEL testSwapCmpWithASR32_1:
+; CHECK:      cmp     w1, w0, asr #1
+; CHECK-NEXT: cset    w0, gt
+entry:
+  %ashr = ashr i32 %a, 1
+  %cmp = icmp slt i32 %ashr, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithASR32_31(i32 %a, i32 %b) {
+; CHECK-LABEL testSwapCmpWithASR32_31:
+; CHECK:      cmp     w1, w0, asr #31
+; CHECK-NEXT: cset    w0, gt
+entry:
+  %ashr = ashr i32 %a, 31
+  %cmp = icmp slt i32 %ashr, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithShiftedZeroExtend32_64(i32 %a, i64 %b) {
+; CHECK-LABEL testSwapCmpWithShiftedZeroExtend32_64
+; CHECK:      cmp    x1, w0, uxtw #2
+; CHECK-NEXT: cset   w0, lo
+entry:
+  %a64 = zext i32 %a to i64
+  %shl.0 = shl i64 %a64, 2
+  %cmp = icmp ugt i64 %shl.0, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithShiftedZeroExtend16_64(i16 %a, i64 %b) {
+; CHECK-LABEL testSwapCmpWithShiftedZeroExtend16_64
+; CHECK:      cmp    x1, w0, uxth #2
+; CHECK-NEXT: cset   w0, lo
+entry:
+  %a64 = zext i16 %a to i64
+  %shl.0 = shl i64 %a64, 2
+  %cmp = icmp ugt i64 %shl.0, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithShiftedZeroExtend8_64(i8 %a, i64 %b) {
+; CHECK-LABEL testSwapCmpWithShiftedZeroExtend8_64
+; CHECK:      cmp    x1, w0, uxtb #4
+; CHECK-NEXT: cset    w0, lo
+entry:
+  %a64 = zext i8 %a to i64
+  %shl.2 = shl i64 %a64, 4
+  %cmp = icmp ugt i64 %shl.2, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithShiftedZeroExtend16_32(i16 %a, i32 %b) {
+; CHECK-LABEL testSwapCmpWithShiftedZeroExtend8_64
+; CHECK:      cmp    w1, w0, uxth #3
+; CHECK-NEXT: cset    w0, lo
+entry:
+  %a32 = zext i16 %a to i32
+  %shl = shl i32 %a32, 3
+  %cmp = icmp ugt i32 %shl, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithShiftedZeroExtend8_32(i8 %a, i32 %b) {
+; CHECK-LABEL testSwapCmpWithShiftedZeroExtend8_64
+; CHECK:      cmp    w1, w0, uxtb #4
+; CHECK-NEXT: cset    w0, lo
+entry:
+  %a32 = zext i8 %a to i32
+  %shl = shl i32 %a32, 4
+  %cmp = icmp ugt i32 %shl, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithTooLargeShiftedZeroExtend8_32(i8 %a, i32 %b) {
+; CHECK-LABEL testSwapCmpWithTooLargeShiftedZeroExtend8_64
+; CHECK:      and    [[REG:w[0-9]+]], w0, #0xff
+; CHECK:      cmp    w1, [[REG]], lsl #5
+; CHECK-NEXT: cset   w0, lo
+entry:
+  %a32 = zext i8 %a to i32
+  %shl = shl i32 %a32, 5
+  %cmp = icmp ugt i32 %shl, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithZeroExtend8_32(i8 %a, i32 %b) {
+; CHECK-LABEL testSwapCmpWithZeroExtend8_64
+; CHECK:      cmp    w1, w0, uxtb
+; CHECK-NEXT: cset   w0, lo
+entry:
+  %a32 = zext i8 %a to i32
+  %cmp = icmp ugt i32 %a32, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithShiftedSignExtend32_64(i32 %a, i64 %b) {
+; CHECK-LABEL testSwapCmpWithShiftedZeroExtend32_64
+; CHECK:      cmp    x1, w0, sxtw #2
+; CHECK-NEXT: cset   w0, lo
+entry:
+  %a64 = sext i32 %a to i64
+  %shl.0 = shl i64 %a64, 2
+  %cmp = icmp ugt i64 %shl.0, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithShiftedSignExtend16_64(i16 %a, i64 %b) {
+; CHECK-LABEL testSwapCmpWithShiftedZeroExtend16_64
+; CHECK:      cmp    x1, w0, sxth #2
+; CHECK-NEXT: cset   w0, lo
+entry:
+  %a64 = sext i16 %a to i64
+  %shl.0 = shl i64 %a64, 2
+  %cmp = icmp ugt i64 %shl.0, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithShiftedSignExtend8_64(i8 %a, i64 %b) {
+; CHECK-LABEL testSwapCmpWithShiftedSignExtend8_64
+; CHECK:      cmp    x1, w0, sxtb #4
+; CHECK-NEXT: cset    w0, lo
+entry:
+  %a64 = sext i8 %a to i64
+  %shl.2 = shl i64 %a64, 4
+  %cmp = icmp ugt i64 %shl.2, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithShiftedSignExtend16_32(i16 %a, i32 %b) {
+; CHECK-LABEL testSwapCmpWithShiftedSignExtend8_64
+; CHECK:      cmp    w1, w0, sxth #3
+; CHECK-NEXT: cset    w0, lo
+entry:
+  %a32 = sext i16 %a to i32
+  %shl = shl i32 %a32, 3
+  %cmp = icmp ugt i32 %shl, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithShiftedSignExtend8_32(i8 %a, i32 %b) {
+; CHECK-LABEL testSwapCmpWithShiftedSignExtend8_64
+; CHECK:      cmp    w1, w0, sxtb #4
+; CHECK-NEXT: cset   w0, lo
+entry:
+  %a32 = sext i8 %a to i32
+  %shl = shl i32 %a32, 4
+  %cmp = icmp ugt i32 %shl, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithTooLargeShiftedSignExtend8_32(i8 %a, i32 %b) {
+; CHECK-LABEL testSwapCmpWithTooLargeShiftedSignExtend8_64
+; CHECK:      sxtb   [[REG:w[0-9]+]], w0
+; CHECK-NEXT: cmp    w1, [[REG]], lsl #5
+; CHECK-NEXT: cset   w0, lo
+entry:
+  %a32 = sext i8 %a to i32
+  %shl = shl i32 %a32, 5
+  %cmp = icmp ugt i32 %shl, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithSignExtend8_32(i8 %a, i32 %b) {
+; CHECK-LABEL testSwapCmpWithSignExtend8_64
+; CHECK:      cmp    w1, w0, sxtb
+; CHECK-NEXT: cset   w0, lo
+entry:
+  %a32 = sext i8 %a to i32
+  %cmp = icmp ugt i32 %a32, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmnWithLSL64_1(i64 %a, i64 %b) {
+; CHECK-LABEL testSwapCmnWithLSL64_1:
+; CHECK:      cmn    x1, x0, lsl #1
+; CHECK-NEXT: cset   w0, ne
+entry:
+  %shl = shl i64 %a, 1
+  %na = sub i64 0, %shl
+  %cmp = icmp ne i64 %na, %b
+  ret i1 %cmp
+}
+
+; Note: testing with a 62 bits shift as 63 has another optimization kicking in.
+define i1 @testSwapCmnWithLSL64_62(i64 %a, i64 %b) {
+; CHECK-LABEL testSwapCmnWithLSL64_62:
+; CHECK:      cmn    x1, x0, lsl #62
+; CHECK-NEXT: cset   w0, ne
+entry:
+  %shl = shl i64 %a, 62
+  %na = sub i64 0, %shl
+  %cmp = icmp ne i64 %na, %b
+  ret i1 %cmp
+}
+
+; Note: the 63 bits shift triggers a different optimization path, which leads
+; to a similar result in terms of performances. We try to catch here any change
+; so that this test can be adapted should the optimization be done with the
+; operand swap.
+define i1 @testSwapCmnWithLSL64_63(i64 %a, i64 %b) {
+; CHECK-LABEL testSwapCmnWithLSL64_63:
+; CHECK:      cmp    x1, x0, lsl #63
+; CHECK-NEXT: cset   w0, ne
+entry:
+  %shl = shl i64 %a, 63
+  %na = sub i64 0, %shl
+  %cmp = icmp ne i64 %na, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmnWithLSL32_1(i32 %a, i32 %b) {
+; CHECK-LABEL testSwapCmnWithLSL32_1:
+; CHECK:      cmn    w1, w0, lsl #1
+; CHECK-NEXT: cset   w0, ne
+entry:
+  %shl = shl i32 %a, 1
+  %na = sub i32 0, %shl
+  %cmp = icmp ne i32 %na, %b
+  ret i1 %cmp
+}
+
+; Note: testing with a 30 bits shift as 30 has another optimization kicking in.
+define i1 @testSwapCmnWithLSL32_30(i32 %a, i32 %b) {
+; CHECK-LABEL testSwapCmnWithLSL32_30:
+; CHECK:      cmn    w1, w0, lsl #30
+; CHECK-NEXT: cset   w0, ne
+entry:
+  %shl = shl i32 %a, 30
+  %na = sub i32 0, %shl
+  %cmp = icmp ne i32 %na, %b
+  ret i1 %cmp
+}
+
+; Note: the 31 bits shift triggers a different optimization path, which leads
+; to a similar result in terms of performances. We try to catch here any change
+; so that this test can be adapted should the optimization be done with the
+; operand swap.
+define i1 @testSwapCmnWithLSL32_31(i32 %a, i32 %b) {
+; CHECK-LABEL testSwapCmnWithLSL32_31:
+; CHECK:      cmp    w1, w0, lsl #31
+; CHECK-NEXT: cset   w0, ne
+entry:
+  %shl = shl i32 %a, 31
+  %na = sub i32 0, %shl
+  %cmp = icmp ne i32 %na, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmnWithLSR64_1(i64 %a, i64 %b) {
+; CHECK-LABEL testSwapCmnWithLSR64_1:
+; CHECK:      cmn    x1, x0, lsr #1
+; CHECK-NEXT: cset   w0, ne
+entry:
+  %lshr = lshr i64 %a, 1
+  %na = sub i64 0, %lshr
+  %cmp = icmp ne i64 %na, %b
+  ret i1 %cmp
+}
+
+; Note: testing with a 62 bits shift as 63 has another optimization kicking in.
+define i1 @testSwapCmnWithLSR64_62(i64 %a, i64 %b) {
+; CHECK-LABEL testSwapCmnWithLSR64_62:
+; CHECK:      cmn    x1, x0, lsr #62
+; CHECK-NEXT: cset   w0, ne
+entry:
+  %lshr = lshr i64 %a, 62
+  %na = sub i64 0, %lshr
+  %cmp = icmp ne i64 %na, %b
+  ret i1 %cmp
+}
+
+; Note: the 63 bits shift triggers a different optimization path, which leads
+; to a similar result in terms of performances. We try to catch here any change
+; so that this test can be adapted should the optimization be done with the
+; operand swap.
+define i1 @testSwapCmnWithLSR64_63(i64 %a, i64 %b) {
+; CHECK-LABEL testSwapCmnWithLSR64_63:
+; CHECK:      cmp    x1, x0, asr #63
+; CHECK-NEXT: cset   w0, ne
+entry:
+  %lshr = lshr i64 %a, 63
+  %na = sub i64 0, %lshr
+  %cmp = icmp ne i64 %na, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmnWithLSR32_1(i32 %a, i32 %b) {
+; CHECK-LABEL testSwapCmnWithLSR32_1:
+; CHECK:      cmn    w1, w0, lsr #1
+; CHECK-NEXT: cset   w0, ne
+entry:
+  %lshr = lshr i32 %a, 1
+  %na = sub i32 0, %lshr
+  %cmp = icmp ne i32 %na, %b
+  ret i1 %cmp
+}
+
+; Note: testing with a 30 bits shift as 31 has another optimization kicking in.
+define i1 @testSwapCmnWithLSR32_30(i32 %a, i32 %b) {
+; CHECK-LABEL testSwapCmnWithLSR32_30:
+; CHECK:      cmn    w1, w0, lsr #30
+; CHECK-NEXT: cset   w0, ne
+entry:
+  %lshr = lshr i32 %a, 30
+  %na = sub i32 0, %lshr
+  %cmp = icmp ne i32 %na, %b
+  ret i1 %cmp
+}
+
+; Note: the 31 bits shift triggers a different optimization path, which leads
+; to a similar result in terms of performances. We try to catch here any change
+; so that this test can be adapted should the optimization be done with the
+; operand swap.
+define i1 @testSwapCmnWithLSR32_31(i32 %a, i32 %b) {
+; CHECK-LABEL testSwapCmnWithLSR32_31:
+; CHECK:      cmp    w1, w0, asr #31
+; CHECK-NEXT: cset   w0, ne
+entry:
+  %lshr = lshr i32 %a, 31
+  %na = sub i32 0, %lshr
+  %cmp = icmp ne i32 %na, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmnWithASR64_1(i64 %a, i64 %b) {
+; CHECK-LABEL testSwapCmnWithASR64_1:
+; CHECK:      cmn    x1, x0, asr #3
+; CHECK-NEXT: cset   w0, ne
+entry:
+  %lshr = ashr i64 %a, 3
+  %na = sub i64 0, %lshr
+  %cmp = icmp ne i64 %na, %b
+  ret i1 %cmp
+}
+
+; Note: testing with a 62 bits shift as 63 has another optimization kicking in.
+define i1 @testSwapCmnWithASR64_62(i64 %a, i64 %b) {
+; CHECK-LABEL testSwapCmnWithASR64_62:
+; CHECK:      cmn    x1, x0, asr #62
+; CHECK-NEXT: cset   w0, ne
+entry:
+  %lshr = ashr i64 %a, 62
+  %na = sub i64 0, %lshr
+  %cmp = icmp ne i64 %na, %b
+  ret i1 %cmp
+}
+
+; Note: the 63 bits shift triggers a different optimization path, which leads
+; to a similar result in terms of performances. We try to catch here any change
+; so that this test can be adapted should the optimization be done with the
+; operand swap.
+define i1 @testSwapCmnWithASR64_63(i64 %a, i64 %b) {
+; CHECK-LABEL testSwapCmnWithASR64_63:
+; CHECK:      cmp    x1, x0, lsr #63
+; CHECK-NEXT: cset   w0, ne
+entry:
+  %lshr = ashr i64 %a, 63
+  %na = sub i64 0, %lshr
+  %cmp = icmp ne i64 %na, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmnWithASR32_1(i32 %a, i32 %b) {
+; CHECK-LABEL testSwapCmnWithASR32_1:
+; CHECK:      cmn    w1, w0, asr #1
+; CHECK-NEXT: cset   w0, eq
+entry:
+  %lshr = ashr i32 %a, 1
+  %na = sub i32 0, %lshr
+  %cmp = icmp eq i32 %na, %b
+  ret i1 %cmp
+}
+
+; Note: testing with a 30 bits shift as 31 has another optimization kicking in.
+define i1 @testSwapCmnWithASR32_30(i32 %a, i32 %b) {
+; CHECK-LABEL testSwapCmnWithASR32_30:
+; CHECK:      cmn    w1, w0, asr #30
+; CHECK-NEXT: cset   w0, ne
+entry:
+  %lshr = ashr i32 %a, 30
+  %na = sub i32 0, %lshr
+  %cmp = icmp ne i32 %na, %b
+  ret i1 %cmp
+}
+
+; Note: the 31 bits shift triggers a different optimization path, which leads
+; to a similar result in terms of performances. We try to catch here any change
+; so that this test can be adapted should the optimization be done with the
+; operand swap.
+define i1 @testSwapCmnWithASR32_31(i32 %a, i32 %b) {
+; CHECK-LABEL testSwapCmnWithASR32_31:
+; CHECK:      cmp    w1, w0, lsr #31
+; CHECK-NEXT: cset   w0, ne
+entry:
+  %lshr = ashr i32 %a, 31
+  %na = sub i32 0, %lshr
+  %cmp = icmp ne i32 %na, %b
+  ret i1 %cmp
+}
+
+define i64 @testSwapCmpToCmnWithZeroExtend(i32 %a32, i16 %a16, i8 %a8, i64 %b64, i32 %b32) {
+; CHECK-LABEL testSwapCmpToCmnWithZeroExtend:
+t0:
+  %conv0 = zext i32 %a32 to i64
+  %shl0 = shl i64 %conv0, 1
+  %na0 = sub i64 0, %shl0
+  %cmp0 = icmp ne i64 %na0, %b64
+; CHECK: cmn    x3, w0, uxtw #1
+  br i1 %cmp0, label %t1, label %end
+
+t1:
+  %conv1 = zext i16 %a16 to i64
+  %shl1 = shl i64 %conv1, 4
+  %na1 = sub i64 0, %shl1
+  %cmp1 = icmp ne i64 %na1, %b64
+; CHECK: cmn    x3, w1, uxth #4
+  br i1 %cmp1, label %t2, label %end
+
+t2:
+  %conv2 = zext i8 %a8 to i64
+  %shl2 = shl i64 %conv2, 3
+  %na2 = sub i64 0, %shl2
+  %cmp2 = icmp ne i64 %na2, %b64
+; CHECK: cmn    x3, w2, uxtb #3
+  br i1 %cmp2, label %t3, label %end
+
+t3:
+  %conv3 = zext i16 %a16 to i32
+  %shl3 = shl i32 %conv3, 2
+  %na3 = sub i32 0, %shl3
+  %cmp3 = icmp ne i32 %na3, %b32
+; CHECK: cmn    w4, w1, uxth #2
+  br i1 %cmp3, label %t4, label %end
+
+t4:
+  %conv4 = zext i8 %a8 to i32
+  %shl4 = shl i32 %conv4, 1
+  %na4 = sub i32 0, %shl4
+  %cmp4 = icmp ne i32 %na4, %b32
+; CHECK: cmn    w4, w2, uxtb #1
+  br i1 %cmp4, label %t5, label %end
+
+t5:
+  %conv5 = zext i8 %a8 to i32
+  %shl5 = shl i32 %conv5, 5
+  %na5 = sub i32 0, %shl5
+  %cmp5 = icmp ne i32 %na5, %b32
+; CHECK: and    [[REG:w[0-9]+]], w2, #0xff
+; CHECK: cmn    w4, [[REG]], lsl #5
+  br i1 %cmp5, label %t6, label %end
+
+t6:
+  %conv6 = zext i8 %a8 to i32
+  %na6 = sub i32 0, %conv6
+  %cmp6 = icmp ne i32 %na6, %b32
+; CHECK: cmn    w4, w2, uxtb
+  br i1 %cmp6, label %t7, label %end
+
+t7:
+  ret i64 0
+
+end:
+  ret i64 1
+}
+define i64 @testSwapCmpToCmnWithSignExtend(i32 %a32, i16 %a16, i8 %a8, i64 %b64, i32 %b32) {
+; CHECK-LABEL testSwapCmpToCmnWithSignExtend:
+t0:
+  %conv0 = sext i32 %a32 to i64
+  %shl0 = shl i64 %conv0, 1
+  %na0 = sub i64 0, %shl0
+  %cmp0 = icmp ne i64 %na0, %b64
+; CHECK: cmn     x3, w0, sxtw #1
+  br i1 %cmp0, label %t1, label %end
+
+t1:
+  %conv1 = sext i16 %a16 to i64
+  %shl1 = shl i64 %conv1, 4
+  %na1 = sub i64 0, %shl1
+  %cmp1 = icmp ne i64 %na1, %b64
+; CHECK: cmn     x3, w1, sxth #4
+  br i1 %cmp1, label %t2, label %end
+
+t2:
+  %conv2 = sext i8 %a8 to i64
+  %shl2 = shl i64 %conv2, 3
+  %na2 = sub i64 0, %shl2
+  %cmp2 = icmp ne i64 %na2, %b64
+; CHECK: cmn     x3, w2, sxtb #3
+  br i1 %cmp2, label %t3, label %end
+
+t3:
+  %conv3 = sext i16 %a16 to i32
+  %shl3 = shl i32 %conv3, 2
+  %na3 = sub i32 0, %shl3
+  %cmp3 = icmp ne i32 %na3, %b32
+; CHECK: cmn     w4, w1, sxth #2
+  br i1 %cmp3, label %t4, label %end
+
+t4:
+  %conv4 = sext i8 %a8 to i32
+  %shl4 = shl i32 %conv4, 1
+  %na4 = sub i32 0, %shl4
+  %cmp4 = icmp ne i32 %na4, %b32
+; CHECK: cmn     w4, w2, sxtb #1
+  br i1 %cmp4, label %t5, label %end
+
+t5:
+  %conv5 = sext i8 %a8 to i32
+  %shl5 = shl i32 %conv5, 5
+  %na5 = sub i32 0, %shl5
+  %cmp5 = icmp ne i32 %na5, %b32
+; CHECK: sxtb    [[REG:w[0-9]+]], w2
+; CHECK: cmn     w4, [[REG]], lsl #5
+  br i1 %cmp5, label %t6, label %end
+
+t6:
+  %conv6 = sext i8 %a8 to i32
+  %na6 = sub i32 0, %conv6
+  %cmp6 = icmp ne i32 %na6, %b32
+; CHECK: cmn     w4, w2, sxtb
+  br i1 %cmp6, label %t7, label %end
+
+t7:
+  ret i64 0
+
+end:
+  ret i64 1
+}
-- 
GitLab


From 0f13604417ac2a666797fa40530a3164ba917a7d Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sat, 13 Oct 2018 11:38:10 +0000
Subject: [PATCH 0142/1116] [X86][AVX] Add
 lowerVectorShuffleAsLanePermuteAndPermute for v4f64 shuffles (PR39161)

Add shuffle lowering for the case where we can shuffle the lanes into place followed by an in-lane permute.

This is mainly for cases where we can have non-repeating permutes in each lane, but for now I've just enabled it for v4f64 unary shuffles to fix PR39161 - there is no test coverage for other shuffles that might benefit yet.

We now have several cross-lane shuffle lowering methods that all do something similar - I've looked at merging some of these (notably by making the repeated mask mechanism in lowerVectorShuffleByMerging128BitLanes optional), but there is a lot of assertions/assumptions in the way that makes this tricky - I ended up going for adding yet another relatively simple method instead.

Differential Revision: https://reviews.llvm.org/D53148

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344446 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp        | 60 +++++++++++++++++++++++
 test/CodeGen/X86/vector-shuffle-256-v4.ll | 16 +++---
 2 files changed, 66 insertions(+), 10 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 86141965393..9020eebe203 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -13430,6 +13430,60 @@ static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
 }
 
+/// Lower a vector shuffle crossing multiple 128-bit lanes as
+/// a lane permutation followed by a per-lane permutation.
+///
+/// This is mainly for cases where we can have non-repeating permutes
+/// in each lane.
+///
+/// TODO: This is very similar to lowerVectorShuffleByMerging128BitLanes,
+/// we should investigate merging them.
+static SDValue lowerVectorShuffleAsLanePermuteAndPermute(
+    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+    SelectionDAG &DAG, const X86Subtarget &Subtarget) {
+  int NumElts = VT.getVectorNumElements();
+  int NumLanes = VT.getSizeInBits() / 128;
+  int NumEltsPerLane = NumElts / NumLanes;
+
+  SmallVector<int, 4> SrcLaneMask(NumLanes, SM_SentinelUndef);
+  SmallVector<int, 16> LaneMask(NumElts, SM_SentinelUndef);
+  SmallVector<int, 16> PermMask(NumElts, SM_SentinelUndef);
+
+  for (int i = 0; i != NumElts; ++i) {
+    int M = Mask[i];
+    if (M < 0)
+      continue;
+
+    // Ensure that each lane comes from a single source lane.
+    int SrcLane = M / NumEltsPerLane;
+    int DstLane = i / NumEltsPerLane;
+    if (!isUndefOrEqual(SrcLaneMask[DstLane], SrcLane))
+      return SDValue();
+    SrcLaneMask[DstLane] = SrcLane;
+
+    LaneMask[i] = (SrcLane * NumEltsPerLane) + (i % NumEltsPerLane);
+    PermMask[i] = (DstLane * NumEltsPerLane) + (M % NumEltsPerLane);
+  }
+
+  // If we're only shuffling a single lowest lane and the rest are identity
+  // then don't bother.
+  // TODO - isShuffleMaskInputInPlace could be extended to something like this.
+  int NumIdentityLanes = 0;
+  bool OnlyShuffleLowestLane = true;
+  for (int i = 0; i != NumLanes; ++i) {
+    if (isSequentialOrUndefInRange(PermMask, i * NumEltsPerLane, NumEltsPerLane,
+                                   i * NumEltsPerLane))
+      NumIdentityLanes++;
+    else if (SrcLaneMask[i] != 0 && SrcLaneMask[i] != NumLanes)
+      OnlyShuffleLowestLane = false;
+  }
+  if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
+    return SDValue();
+
+  SDValue LanePermute = DAG.getVectorShuffle(VT, DL, V1, V2, LaneMask);
+  return DAG.getVectorShuffle(VT, DL, LanePermute, DAG.getUNDEF(VT), PermMask);
+}
+
 /// Lower a vector shuffle crossing multiple 128-bit lanes as
 /// a permutation and blend of those lanes.
 ///
@@ -14166,6 +14220,11 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
       return V;
 
+    // Try to permute the lanes and then use a per-lane permute.
+    if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute(
+            DL, MVT::v4f64, V1, V2, Mask, DAG, Subtarget))
+      return V;
+
     // Otherwise, fall back.
     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
                                                    DAG, Subtarget);
@@ -14200,6 +14259,7 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
       return Result;
+
   // If we have VLX support, we can use VEXPAND.
   if (Subtarget.hasVLX())
     if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll
index ed281c31d46..b3750b74ad3 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -91,9 +91,8 @@ define <4 x double> @shuffle_v4f64_0300(<4 x double> %a, <4 x double> %b) {
 define <4 x double> @shuffle_v4f64_1000(<4 x double> %a, <4 x double> %b) {
 ; AVX1-LABEL: shuffle_v4f64_1000:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v4f64_1000:
@@ -174,10 +173,8 @@ define <4 x double> @shuffle_v4f64_2222_bc(<4 x i64> %a, <4 x i64> %b) {
 define <4 x double> @shuffle_v4f64_2233(<4 x double> %a, <4 x double> %b) {
 ; AVX1-LABEL: shuffle_v4f64_2233:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
-; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,3]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v4f64_2233:
@@ -766,9 +763,8 @@ define <4 x i64> @shuffle_v4i64_0300(<4 x i64> %a, <4 x i64> %b) {
 define <4 x i64> @shuffle_v4i64_1000(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: shuffle_v4i64_1000:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v4i64_1000:
-- 
GitLab


From e76d7099025f36d5c09a44404756b1f41f55af96 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sat, 13 Oct 2018 12:12:06 +0000
Subject: [PATCH 0143/1116] [X86][SSE] Change CTTZ vector lowering to cttz(x) =
 ctpop(~x & (x - 1))

This patch changes the vector CTTZ lowering from:

cttz(x) = ctpop((x & -x) - 1)

to:

cttz(x) = ctpop(~x & (x - 1))

Not only does this make better use of the PANDN instruction, but it also matches the LegalizeDAG method which should allow us to remove the x86 specific code at some point in the future (we need to fix some issues with the bitcasted logic ops and CTPOP lowering first).

Differential Revision: https://reviews.llvm.org/D53214

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344447 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp   |   20 +-
 test/CodeGen/X86/vec_ctbits.ll       |   82 +-
 test/CodeGen/X86/vector-tzcnt-128.ll | 1593 ++++++++++++--------------
 test/CodeGen/X86/vector-tzcnt-256.ll |  924 +++++++--------
 test/CodeGen/X86/vector-tzcnt-512.ll |  526 ++++-----
 5 files changed, 1413 insertions(+), 1732 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 9020eebe203..5fb3ece19f2 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -22968,7 +22968,8 @@ static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
   return Op;
 }
 
-static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
+                         SelectionDAG &DAG) {
   MVT VT = Op.getSimpleValueType();
   unsigned NumBits = VT.getScalarSizeInBits();
   SDLoc dl(Op);
@@ -22977,21 +22978,24 @@ static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
     SDValue N0 = Op.getOperand(0);
     SDValue Zero = DAG.getConstant(0, dl, VT);
 
-    // lsb(x) = (x & -x)
-    SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
-                              DAG.getNode(ISD::SUB, dl, VT, Zero, N0));
+    // Decompose 256-bit ops into smaller 128-bit ops.
+    if (VT.is256BitVector() && !Subtarget.hasInt256())
+      return Lower256IntUnary(Op, DAG);
 
-    // cttz_undef(x) = (width - 1) - ctlz(lsb)
+    // cttz_undef(x) = (width - 1) - ctlz(x & -x)
     if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
       SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
+      SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
+                                DAG.getNode(ISD::SUB, dl, VT, Zero, N0));
       return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
                          DAG.getNode(ISD::CTLZ, dl, VT, LSB));
     }
 
-    // cttz(x) = ctpop(lsb - 1)
+    // cttz(x) = ctpop(~x & (x - 1))
     SDValue One = DAG.getConstant(1, dl, VT);
     return DAG.getNode(ISD::CTPOP, dl, VT,
-                       DAG.getNode(ISD::SUB, dl, VT, LSB, One));
+                       DAG.getNode(ISD::AND, dl, VT, DAG.getNOT(dl, N0, VT),
+                                   DAG.getNode(ISD::SUB, dl, VT, N0, One)));
   }
 
   assert(Op.getOpcode() == ISD::CTTZ &&
@@ -25918,7 +25922,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::CTLZ:
   case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ(Op, Subtarget, DAG);
   case ISD::CTTZ:
-  case ISD::CTTZ_ZERO_UNDEF:    return LowerCTTZ(Op, DAG);
+  case ISD::CTTZ_ZERO_UNDEF:    return LowerCTTZ(Op, Subtarget, DAG);
   case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
   case ISD::MULHS:
   case ISD::MULHU:              return LowerMULH(Op, Subtarget, DAG);
diff --git a/test/CodeGen/X86/vec_ctbits.ll b/test/CodeGen/X86/vec_ctbits.ll
index 40e101756ef..002bcebdf71 100644
--- a/test/CodeGen/X86/vec_ctbits.ll
+++ b/test/CodeGen/X86/vec_ctbits.ll
@@ -8,27 +8,26 @@ declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>)
 define <2 x i64> @footz(<2 x i64> %a) nounwind {
 ; CHECK-LABEL: footz:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pxor %xmm1, %xmm1
-; CHECK-NEXT:    pxor %xmm2, %xmm2
-; CHECK-NEXT:    psubq %xmm0, %xmm2
-; CHECK-NEXT:    pand %xmm0, %xmm2
-; CHECK-NEXT:    pcmpeqd %xmm3, %xmm3
-; CHECK-NEXT:    paddq %xmm2, %xmm3
-; CHECK-NEXT:    movdqa %xmm3, %xmm0
-; CHECK-NEXT:    psrlw $1, %xmm0
-; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    psubb %xmm0, %xmm3
-; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; CHECK-NEXT:    movdqa %xmm3, %xmm2
-; CHECK-NEXT:    pand %xmm0, %xmm2
-; CHECK-NEXT:    psrlw $2, %xmm3
-; CHECK-NEXT:    pand %xmm0, %xmm3
-; CHECK-NEXT:    paddb %xmm2, %xmm3
-; CHECK-NEXT:    movdqa %xmm3, %xmm0
-; CHECK-NEXT:    psrlw $4, %xmm0
-; CHECK-NEXT:    paddb %xmm3, %xmm0
-; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    psadbw %xmm1, %xmm0
+; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
+; CHECK-NEXT:    paddq %xmm0, %xmm1
+; CHECK-NEXT:    pandn %xmm1, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-NEXT:    psrlw $1, %xmm1
+; CHECK-NEXT:    pand {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    psubb %xmm1, %xmm0
+; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; CHECK-NEXT:    movdqa %xmm0, %xmm2
+; CHECK-NEXT:    pand %xmm1, %xmm2
+; CHECK-NEXT:    psrlw $2, %xmm0
+; CHECK-NEXT:    pand %xmm1, %xmm0
+; CHECK-NEXT:    paddb %xmm2, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-NEXT:    psrlw $4, %xmm1
+; CHECK-NEXT:    paddb %xmm0, %xmm1
+; CHECK-NEXT:    pand {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    pxor %xmm0, %xmm0
+; CHECK-NEXT:    psadbw %xmm0, %xmm1
+; CHECK-NEXT:    movdqa %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %c = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 true)
   ret <2 x i64> %c
@@ -112,27 +111,26 @@ define <2 x i32> @promtz(<2 x i32> %a) nounwind {
 ; CHECK-LABEL: promtz:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    por {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    pxor %xmm1, %xmm1
-; CHECK-NEXT:    pxor %xmm2, %xmm2
-; CHECK-NEXT:    psubq %xmm0, %xmm2
-; CHECK-NEXT:    pand %xmm0, %xmm2
-; CHECK-NEXT:    pcmpeqd %xmm3, %xmm3
-; CHECK-NEXT:    paddq %xmm2, %xmm3
-; CHECK-NEXT:    movdqa %xmm3, %xmm0
-; CHECK-NEXT:    psrlw $1, %xmm0
-; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    psubb %xmm0, %xmm3
-; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; CHECK-NEXT:    movdqa %xmm3, %xmm2
-; CHECK-NEXT:    pand %xmm0, %xmm2
-; CHECK-NEXT:    psrlw $2, %xmm3
-; CHECK-NEXT:    pand %xmm0, %xmm3
-; CHECK-NEXT:    paddb %xmm2, %xmm3
-; CHECK-NEXT:    movdqa %xmm3, %xmm0
-; CHECK-NEXT:    psrlw $4, %xmm0
-; CHECK-NEXT:    paddb %xmm3, %xmm0
-; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    psadbw %xmm1, %xmm0
+; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
+; CHECK-NEXT:    paddq %xmm0, %xmm1
+; CHECK-NEXT:    pandn %xmm1, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-NEXT:    psrlw $1, %xmm1
+; CHECK-NEXT:    pand {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    psubb %xmm1, %xmm0
+; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; CHECK-NEXT:    movdqa %xmm0, %xmm2
+; CHECK-NEXT:    pand %xmm1, %xmm2
+; CHECK-NEXT:    psrlw $2, %xmm0
+; CHECK-NEXT:    pand %xmm1, %xmm0
+; CHECK-NEXT:    paddb %xmm2, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-NEXT:    psrlw $4, %xmm1
+; CHECK-NEXT:    paddb %xmm0, %xmm1
+; CHECK-NEXT:    pand {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    pxor %xmm0, %xmm0
+; CHECK-NEXT:    psadbw %xmm0, %xmm1
+; CHECK-NEXT:    movdqa %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %c = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 false)
   ret <2 x i32> %c
diff --git a/test/CodeGen/X86/vector-tzcnt-128.ll b/test/CodeGen/X86/vector-tzcnt-128.ll
index a532794f89d..1430ca72f68 100644
--- a/test/CodeGen/X86/vector-tzcnt-128.ll
+++ b/test/CodeGen/X86/vector-tzcnt-128.ll
@@ -18,121 +18,112 @@
 define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
 ; SSE2-LABEL: testv2i64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    psubq %xmm0, %xmm2
-; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSE2-NEXT:    paddq %xmm2, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, %xmm0
-; SSE2-NEXT:    psrlw $1, %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    psubb %xmm0, %xmm3
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE2-NEXT:    movdqa %xmm3, %xmm2
-; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    psrlw $2, %xmm3
-; SSE2-NEXT:    pand %xmm0, %xmm3
-; SSE2-NEXT:    paddb %xmm2, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, %xmm0
-; SSE2-NEXT:    psrlw $4, %xmm0
-; SSE2-NEXT:    paddb %xmm3, %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT:    paddq %xmm0, %xmm1
+; SSE2-NEXT:    pandn %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm1
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    psubb %xmm1, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    psrlw $2, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $4, %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm1
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: testv2i64:
 ; SSE3:       # %bb.0:
-; SSE3-NEXT:    pxor %xmm1, %xmm1
-; SSE3-NEXT:    pxor %xmm2, %xmm2
-; SSE3-NEXT:    psubq %xmm0, %xmm2
-; SSE3-NEXT:    pand %xmm0, %xmm2
-; SSE3-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSE3-NEXT:    paddq %xmm2, %xmm3
-; SSE3-NEXT:    movdqa %xmm3, %xmm0
-; SSE3-NEXT:    psrlw $1, %xmm0
-; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT:    psubb %xmm0, %xmm3
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE3-NEXT:    movdqa %xmm3, %xmm2
-; SSE3-NEXT:    pand %xmm0, %xmm2
-; SSE3-NEXT:    psrlw $2, %xmm3
-; SSE3-NEXT:    pand %xmm0, %xmm3
-; SSE3-NEXT:    paddb %xmm2, %xmm3
-; SSE3-NEXT:    movdqa %xmm3, %xmm0
-; SSE3-NEXT:    psrlw $4, %xmm0
-; SSE3-NEXT:    paddb %xmm3, %xmm0
-; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE3-NEXT:    paddq %xmm0, %xmm1
+; SSE3-NEXT:    pandn %xmm1, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $1, %xmm1
+; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE3-NEXT:    psubb %xmm1, %xmm0
+; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    pand %xmm1, %xmm2
+; SSE3-NEXT:    psrlw $2, %xmm0
+; SSE3-NEXT:    pand %xmm1, %xmm0
+; SSE3-NEXT:    paddb %xmm2, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $4, %xmm1
+; SSE3-NEXT:    paddb %xmm0, %xmm1
+; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE3-NEXT:    pxor %xmm0, %xmm0
+; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: testv2i64:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pxor %xmm1, %xmm1
-; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    psubq %xmm0, %xmm2
-; SSSE3-NEXT:    pand %xmm0, %xmm2
-; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSSE3-NEXT:    paddq %xmm2, %xmm3
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSSE3-NEXT:    paddq %xmm0, %xmm1
+; SSSE3-NEXT:    pandn %xmm1, %xmm0
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    pand %xmm1, %xmm2
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; SSSE3-NEXT:    movdqa %xmm3, %xmm4
-; SSSE3-NEXT:    pand %xmm2, %xmm4
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm5
-; SSSE3-NEXT:    pshufb %xmm4, %xmm5
-; SSSE3-NEXT:    psrlw $4, %xmm3
-; SSSE3-NEXT:    pand %xmm2, %xmm3
-; SSSE3-NEXT:    pshufb %xmm3, %xmm0
-; SSSE3-NEXT:    paddb %xmm5, %xmm0
-; SSSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSSE3-NEXT:    pshufb %xmm2, %xmm4
+; SSSE3-NEXT:    psrlw $4, %xmm0
+; SSSE3-NEXT:    pand %xmm1, %xmm0
+; SSSE3-NEXT:    pshufb %xmm0, %xmm3
+; SSSE3-NEXT:    paddb %xmm4, %xmm3
+; SSSE3-NEXT:    pxor %xmm0, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: testv2i64:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    psubq %xmm0, %xmm2
-; SSE41-NEXT:    pand %xmm0, %xmm2
-; SSE41-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSE41-NEXT:    paddq %xmm2, %xmm3
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    paddq %xmm0, %xmm1
+; SSE41-NEXT:    pandn %xmm1, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pand %xmm1, %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; SSE41-NEXT:    movdqa %xmm3, %xmm4
-; SSE41-NEXT:    pand %xmm2, %xmm4
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; SSE41-NEXT:    movdqa %xmm0, %xmm5
-; SSE41-NEXT:    pshufb %xmm4, %xmm5
-; SSE41-NEXT:    psrlw $4, %xmm3
-; SSE41-NEXT:    pand %xmm2, %xmm3
-; SSE41-NEXT:    pshufb %xmm3, %xmm0
-; SSE41-NEXT:    paddb %xmm5, %xmm0
-; SSE41-NEXT:    psadbw %xmm1, %xmm0
+; SSE41-NEXT:    pshufb %xmm2, %xmm4
+; SSE41-NEXT:    psrlw $4, %xmm0
+; SSE41-NEXT:    pand %xmm1, %xmm0
+; SSE41-NEXT:    pshufb %xmm0, %xmm3
+; SSE41-NEXT:    paddb %xmm4, %xmm3
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: testv2i64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpsubq %xmm0, %xmm1, %xmm2
-; AVX-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
+; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; AVX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
 ; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; AVX-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv2i64:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpsubq %xmm0, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; AVX512VPOPCNTDQ-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
@@ -140,55 +131,50 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQVL-LABEL: testv2i64:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpsubq %xmm0, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; AVX512VPOPCNTDQVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntq %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    retq
 ;
 ; BITALG_NOVLX-LABEL: testv2i64:
 ; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpsubq %xmm0, %xmm1, %xmm2
-; BITALG_NOVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; BITALG_NOVLX-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; BITALG_NOVLX-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; BITALG_NOVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
+; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: testv2i64:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubq %xmm0, %xmm1, %xmm2
-; BITALG-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; BITALG-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; BITALG-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; BITALG-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; BITALG-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; BITALG-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; BITALG-NEXT:    vpopcntb %xmm0, %xmm0
+; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
 ; BITALG-NEXT:    retq
 ;
 ; X32-SSE-LABEL: testv2i64:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    pxor %xmm1, %xmm1
-; X32-SSE-NEXT:    pxor %xmm2, %xmm2
-; X32-SSE-NEXT:    psubq %xmm0, %xmm2
-; X32-SSE-NEXT:    pand %xmm0, %xmm2
-; X32-SSE-NEXT:    psubq {{\.LCPI.*}}, %xmm2
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-SSE-NEXT:    movdqa %xmm2, %xmm4
-; X32-SSE-NEXT:    pand %xmm3, %xmm4
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm5
-; X32-SSE-NEXT:    pshufb %xmm4, %xmm5
-; X32-SSE-NEXT:    psrlw $4, %xmm2
-; X32-SSE-NEXT:    pand %xmm3, %xmm2
-; X32-SSE-NEXT:    pshufb %xmm2, %xmm0
-; X32-SSE-NEXT:    paddb %xmm5, %xmm0
-; X32-SSE-NEXT:    psadbw %xmm1, %xmm0
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psubq {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    pandn %xmm1, %xmm0
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    pand %xmm1, %xmm2
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
+; X32-SSE-NEXT:    pshufb %xmm2, %xmm4
+; X32-SSE-NEXT:    psrlw $4, %xmm0
+; X32-SSE-NEXT:    pand %xmm1, %xmm0
+; X32-SSE-NEXT:    pshufb %xmm0, %xmm3
+; X32-SSE-NEXT:    paddb %xmm4, %xmm3
+; X32-SSE-NEXT:    pxor %xmm0, %xmm0
+; X32-SSE-NEXT:    psadbw %xmm3, %xmm0
 ; X32-SSE-NEXT:    retl
   %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %in, i1 0)
   ret <2 x i64> %out
@@ -197,129 +183,121 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
 define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
 ; SSE2-LABEL: testv2i64u:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    psubq %xmm0, %xmm2
-; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSE2-NEXT:    paddq %xmm2, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, %xmm0
-; SSE2-NEXT:    psrlw $1, %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    psubb %xmm0, %xmm3
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE2-NEXT:    movdqa %xmm3, %xmm2
-; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    psrlw $2, %xmm3
-; SSE2-NEXT:    pand %xmm0, %xmm3
-; SSE2-NEXT:    paddb %xmm2, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, %xmm0
-; SSE2-NEXT:    psrlw $4, %xmm0
-; SSE2-NEXT:    paddb %xmm3, %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT:    paddq %xmm0, %xmm1
+; SSE2-NEXT:    pandn %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm1
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    psubb %xmm1, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    psrlw $2, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $4, %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm1
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: testv2i64u:
 ; SSE3:       # %bb.0:
-; SSE3-NEXT:    pxor %xmm1, %xmm1
-; SSE3-NEXT:    pxor %xmm2, %xmm2
-; SSE3-NEXT:    psubq %xmm0, %xmm2
-; SSE3-NEXT:    pand %xmm0, %xmm2
-; SSE3-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSE3-NEXT:    paddq %xmm2, %xmm3
-; SSE3-NEXT:    movdqa %xmm3, %xmm0
-; SSE3-NEXT:    psrlw $1, %xmm0
-; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT:    psubb %xmm0, %xmm3
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE3-NEXT:    movdqa %xmm3, %xmm2
-; SSE3-NEXT:    pand %xmm0, %xmm2
-; SSE3-NEXT:    psrlw $2, %xmm3
-; SSE3-NEXT:    pand %xmm0, %xmm3
-; SSE3-NEXT:    paddb %xmm2, %xmm3
-; SSE3-NEXT:    movdqa %xmm3, %xmm0
-; SSE3-NEXT:    psrlw $4, %xmm0
-; SSE3-NEXT:    paddb %xmm3, %xmm0
-; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE3-NEXT:    paddq %xmm0, %xmm1
+; SSE3-NEXT:    pandn %xmm1, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $1, %xmm1
+; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE3-NEXT:    psubb %xmm1, %xmm0
+; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    pand %xmm1, %xmm2
+; SSE3-NEXT:    psrlw $2, %xmm0
+; SSE3-NEXT:    pand %xmm1, %xmm0
+; SSE3-NEXT:    paddb %xmm2, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $4, %xmm1
+; SSE3-NEXT:    paddb %xmm0, %xmm1
+; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE3-NEXT:    pxor %xmm0, %xmm0
+; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: testv2i64u:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pxor %xmm1, %xmm1
-; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    psubq %xmm0, %xmm2
-; SSSE3-NEXT:    pand %xmm0, %xmm2
-; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSSE3-NEXT:    paddq %xmm2, %xmm3
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSSE3-NEXT:    paddq %xmm0, %xmm1
+; SSSE3-NEXT:    pandn %xmm1, %xmm0
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    pand %xmm1, %xmm2
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; SSSE3-NEXT:    movdqa %xmm3, %xmm4
-; SSSE3-NEXT:    pand %xmm2, %xmm4
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm5
-; SSSE3-NEXT:    pshufb %xmm4, %xmm5
-; SSSE3-NEXT:    psrlw $4, %xmm3
-; SSSE3-NEXT:    pand %xmm2, %xmm3
-; SSSE3-NEXT:    pshufb %xmm3, %xmm0
-; SSSE3-NEXT:    paddb %xmm5, %xmm0
-; SSSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSSE3-NEXT:    pshufb %xmm2, %xmm4
+; SSSE3-NEXT:    psrlw $4, %xmm0
+; SSSE3-NEXT:    pand %xmm1, %xmm0
+; SSSE3-NEXT:    pshufb %xmm0, %xmm3
+; SSSE3-NEXT:    paddb %xmm4, %xmm3
+; SSSE3-NEXT:    pxor %xmm0, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: testv2i64u:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    psubq %xmm0, %xmm2
-; SSE41-NEXT:    pand %xmm0, %xmm2
-; SSE41-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSE41-NEXT:    paddq %xmm2, %xmm3
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    paddq %xmm0, %xmm1
+; SSE41-NEXT:    pandn %xmm1, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pand %xmm1, %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; SSE41-NEXT:    movdqa %xmm3, %xmm4
-; SSE41-NEXT:    pand %xmm2, %xmm4
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; SSE41-NEXT:    movdqa %xmm0, %xmm5
-; SSE41-NEXT:    pshufb %xmm4, %xmm5
-; SSE41-NEXT:    psrlw $4, %xmm3
-; SSE41-NEXT:    pand %xmm2, %xmm3
-; SSE41-NEXT:    pshufb %xmm3, %xmm0
-; SSE41-NEXT:    paddb %xmm5, %xmm0
-; SSE41-NEXT:    psadbw %xmm1, %xmm0
+; SSE41-NEXT:    pshufb %xmm2, %xmm4
+; SSE41-NEXT:    psrlw $4, %xmm0
+; SSE41-NEXT:    pand %xmm1, %xmm0
+; SSE41-NEXT:    pshufb %xmm0, %xmm3
+; SSE41-NEXT:    paddb %xmm4, %xmm3
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: testv2i64u:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpsubq %xmm0, %xmm1, %xmm2
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: testv2i64u:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsubq %xmm0, %xmm1, %xmm2
-; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX2-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
+; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
 ; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; AVX2-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -346,11 +324,9 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv2i64u:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpsubq %xmm0, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; AVX512VPOPCNTDQ-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
@@ -358,55 +334,50 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQVL-LABEL: testv2i64u:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpsubq %xmm0, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; AVX512VPOPCNTDQVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntq %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    retq
 ;
 ; BITALG_NOVLX-LABEL: testv2i64u:
 ; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpsubq %xmm0, %xmm1, %xmm2
-; BITALG_NOVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; BITALG_NOVLX-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; BITALG_NOVLX-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; BITALG_NOVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
+; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: testv2i64u:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubq %xmm0, %xmm1, %xmm2
-; BITALG-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; BITALG-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; BITALG-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; BITALG-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; BITALG-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; BITALG-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; BITALG-NEXT:    vpopcntb %xmm0, %xmm0
+; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
 ; BITALG-NEXT:    retq
 ;
 ; X32-SSE-LABEL: testv2i64u:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    pxor %xmm1, %xmm1
-; X32-SSE-NEXT:    pxor %xmm2, %xmm2
-; X32-SSE-NEXT:    psubq %xmm0, %xmm2
-; X32-SSE-NEXT:    pand %xmm0, %xmm2
-; X32-SSE-NEXT:    psubq {{\.LCPI.*}}, %xmm2
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-SSE-NEXT:    movdqa %xmm2, %xmm4
-; X32-SSE-NEXT:    pand %xmm3, %xmm4
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm5
-; X32-SSE-NEXT:    pshufb %xmm4, %xmm5
-; X32-SSE-NEXT:    psrlw $4, %xmm2
-; X32-SSE-NEXT:    pand %xmm3, %xmm2
-; X32-SSE-NEXT:    pshufb %xmm2, %xmm0
-; X32-SSE-NEXT:    paddb %xmm5, %xmm0
-; X32-SSE-NEXT:    psadbw %xmm1, %xmm0
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psubq {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    pandn %xmm1, %xmm0
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    pand %xmm1, %xmm2
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
+; X32-SSE-NEXT:    pshufb %xmm2, %xmm4
+; X32-SSE-NEXT:    psrlw $4, %xmm0
+; X32-SSE-NEXT:    pand %xmm1, %xmm0
+; X32-SSE-NEXT:    pshufb %xmm0, %xmm3
+; X32-SSE-NEXT:    paddb %xmm4, %xmm3
+; X32-SSE-NEXT:    pxor %xmm0, %xmm0
+; X32-SSE-NEXT:    psadbw %xmm3, %xmm0
 ; X32-SSE-NEXT:    retl
   %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %in, i1 -1)
   ret <2 x i64> %out
@@ -415,130 +386,124 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
 define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ; SSE2-LABEL: testv4i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    psubd %xmm0, %xmm2
-; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSE2-NEXT:    paddd %xmm2, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, %xmm0
-; SSE2-NEXT:    psrlw $1, %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    psubb %xmm0, %xmm3
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE2-NEXT:    movdqa %xmm3, %xmm2
-; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    psrlw $2, %xmm3
-; SSE2-NEXT:    pand %xmm0, %xmm3
-; SSE2-NEXT:    paddb %xmm2, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, %xmm0
-; SSE2-NEXT:    psrlw $4, %xmm0
-; SSE2-NEXT:    paddb %xmm3, %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT:    paddd %xmm0, %xmm1
+; SSE2-NEXT:    pandn %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm1
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    psubb %xmm1, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-NEXT:    psadbw %xmm1, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT:    psadbw %xmm1, %xmm0
-; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    psrlw $2, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $4, %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm1
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT:    psadbw %xmm0, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    packuswb %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: testv4i32:
 ; SSE3:       # %bb.0:
-; SSE3-NEXT:    pxor %xmm1, %xmm1
-; SSE3-NEXT:    pxor %xmm2, %xmm2
-; SSE3-NEXT:    psubd %xmm0, %xmm2
-; SSE3-NEXT:    pand %xmm0, %xmm2
-; SSE3-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSE3-NEXT:    paddd %xmm2, %xmm3
-; SSE3-NEXT:    movdqa %xmm3, %xmm0
-; SSE3-NEXT:    psrlw $1, %xmm0
-; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT:    psubb %xmm0, %xmm3
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE3-NEXT:    movdqa %xmm3, %xmm2
-; SSE3-NEXT:    pand %xmm0, %xmm2
-; SSE3-NEXT:    psrlw $2, %xmm3
-; SSE3-NEXT:    pand %xmm0, %xmm3
-; SSE3-NEXT:    paddb %xmm2, %xmm3
-; SSE3-NEXT:    movdqa %xmm3, %xmm0
-; SSE3-NEXT:    psrlw $4, %xmm0
-; SSE3-NEXT:    paddb %xmm3, %xmm0
-; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE3-NEXT:    paddd %xmm0, %xmm1
+; SSE3-NEXT:    pandn %xmm1, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $1, %xmm1
+; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE3-NEXT:    psubb %xmm1, %xmm0
+; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE3-NEXT:    movdqa %xmm0, %xmm2
-; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE3-NEXT:    psadbw %xmm1, %xmm2
-; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE3-NEXT:    psadbw %xmm1, %xmm0
-; SSE3-NEXT:    packuswb %xmm2, %xmm0
+; SSE3-NEXT:    pand %xmm1, %xmm2
+; SSE3-NEXT:    psrlw $2, %xmm0
+; SSE3-NEXT:    pand %xmm1, %xmm0
+; SSE3-NEXT:    paddb %xmm2, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $4, %xmm1
+; SSE3-NEXT:    paddb %xmm0, %xmm1
+; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE3-NEXT:    pxor %xmm0, %xmm0
+; SSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE3-NEXT:    psadbw %xmm0, %xmm2
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    packuswb %xmm2, %xmm1
+; SSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: testv4i32:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pxor %xmm1, %xmm1
-; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    psubd %xmm0, %xmm2
-; SSSE3-NEXT:    pand %xmm0, %xmm2
-; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSSE3-NEXT:    paddd %xmm2, %xmm3
+; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSSE3-NEXT:    paddd %xmm0, %xmm1
+; SSSE3-NEXT:    pandn %xmm1, %xmm0
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSSE3-NEXT:    movdqa %xmm3, %xmm4
-; SSSE3-NEXT:    pand %xmm2, %xmm4
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm5
-; SSSE3-NEXT:    pshufb %xmm4, %xmm5
-; SSSE3-NEXT:    psrlw $4, %xmm3
+; SSSE3-NEXT:    movdqa %xmm0, %xmm3
 ; SSSE3-NEXT:    pand %xmm2, %xmm3
-; SSSE3-NEXT:    pshufb %xmm3, %xmm0
-; SSSE3-NEXT:    paddb %xmm5, %xmm0
-; SSSE3-NEXT:    movdqa %xmm0, %xmm2
-; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSSE3-NEXT:    psadbw %xmm1, %xmm2
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT:    psadbw %xmm1, %xmm0
-; SSSE3-NEXT:    packuswb %xmm2, %xmm0
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm4
+; SSSE3-NEXT:    pshufb %xmm3, %xmm4
+; SSSE3-NEXT:    psrlw $4, %xmm0
+; SSSE3-NEXT:    pand %xmm2, %xmm0
+; SSSE3-NEXT:    pshufb %xmm0, %xmm1
+; SSSE3-NEXT:    paddb %xmm4, %xmm1
+; SSSE3-NEXT:    pxor %xmm0, %xmm0
+; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT:    psadbw %xmm0, %xmm2
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSSE3-NEXT:    packuswb %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: testv4i32:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    psubd %xmm0, %xmm2
-; SSE41-NEXT:    pand %xmm0, %xmm2
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm0
-; SSE41-NEXT:    paddd %xmm2, %xmm0
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    pand %xmm2, %xmm3
-; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; SSE41-NEXT:    movdqa %xmm4, %xmm5
-; SSE41-NEXT:    pshufb %xmm3, %xmm5
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    paddd %xmm0, %xmm1
+; SSE41-NEXT:    pandn %xmm1, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pand %xmm1, %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; SSE41-NEXT:    movdqa %xmm3, %xmm4
+; SSE41-NEXT:    pshufb %xmm2, %xmm4
 ; SSE41-NEXT:    psrlw $4, %xmm0
-; SSE41-NEXT:    pand %xmm2, %xmm0
-; SSE41-NEXT:    pshufb %xmm0, %xmm4
-; SSE41-NEXT:    paddb %xmm5, %xmm4
-; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero
-; SSE41-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
-; SSE41-NEXT:    psadbw %xmm1, %xmm4
+; SSE41-NEXT:    pand %xmm1, %xmm0
+; SSE41-NEXT:    pshufb %xmm0, %xmm3
+; SSE41-NEXT:    paddb %xmm4, %xmm3
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero
+; SSE41-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE41-NEXT:    psadbw %xmm1, %xmm3
 ; SSE41-NEXT:    psadbw %xmm1, %xmm0
-; SSE41-NEXT:    packuswb %xmm4, %xmm0
+; SSE41-NEXT:    packuswb %xmm3, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: testv4i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm2
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX1-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
@@ -548,19 +513,18 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ;
 ; AVX2-LABEL: testv4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsubd %xmm0, %xmm1, %xmm2
-; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX2-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
+; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
 ; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; AVX2-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
@@ -570,19 +534,18 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ;
 ; AVX512CDVL-LABEL: testv4i32:
 ; AVX512CDVL:       # %bb.0:
-; AVX512CDVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CDVL-NEXT:    vpsubd %xmm0, %xmm1, %xmm2
-; AVX512CDVL-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX512CDVL-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512CDVL-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
-; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CDVL-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512CDVL-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
+; AVX512CDVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512CDVL-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX512CDVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDVL-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
 ; AVX512CDVL-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512CDVL-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX512CDVL-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; AVX512CDVL-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
+; AVX512CDVL-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX512CDVL-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX512CDVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512CDVL-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512CDVL-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
 ; AVX512CDVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -592,19 +555,18 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ;
 ; AVX512CD-LABEL: testv4i32:
 ; AVX512CD:       # %bb.0:
-; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CD-NEXT:    vpsubd %xmm0, %xmm1, %xmm2
-; AVX512CD-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX512CD-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512CD-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CD-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512CD-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
+; AVX512CD-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512CD-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX512CD-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
 ; AVX512CD-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512CD-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX512CD-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; AVX512CD-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
+; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX512CD-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512CD-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512CD-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
 ; AVX512CD-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
@@ -614,11 +576,9 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv4i32:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX512VPOPCNTDQ-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
@@ -626,22 +586,19 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQVL-LABEL: testv4i32:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX512VPOPCNTDQVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    retq
 ;
 ; BITALG_NOVLX-LABEL: testv4i32:
 ; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpsubd %xmm0, %xmm1, %xmm2
-; BITALG_NOVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; BITALG_NOVLX-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; BITALG_NOVLX-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; BITALG_NOVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
+; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
 ; BITALG_NOVLX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
@@ -652,12 +609,11 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ;
 ; BITALG-LABEL: testv4i32:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubd %xmm0, %xmm1, %xmm2
-; BITALG-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; BITALG-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; BITALG-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; BITALG-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; BITALG-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; BITALG-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; BITALG-NEXT:    vpopcntb %xmm0, %xmm0
+; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; BITALG-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
 ; BITALG-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -667,27 +623,25 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ;
 ; X32-SSE-LABEL: testv4i32:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    pxor %xmm1, %xmm1
-; X32-SSE-NEXT:    pxor %xmm2, %xmm2
-; X32-SSE-NEXT:    psubd %xmm0, %xmm2
-; X32-SSE-NEXT:    pand %xmm0, %xmm2
-; X32-SSE-NEXT:    pcmpeqd %xmm0, %xmm0
-; X32-SSE-NEXT:    paddd %xmm2, %xmm0
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
-; X32-SSE-NEXT:    pand %xmm2, %xmm3
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; X32-SSE-NEXT:    movdqa %xmm4, %xmm5
-; X32-SSE-NEXT:    pshufb %xmm3, %xmm5
+; X32-SSE-NEXT:    pcmpeqd %xmm1, %xmm1
+; X32-SSE-NEXT:    paddd %xmm0, %xmm1
+; X32-SSE-NEXT:    pandn %xmm1, %xmm0
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    pand %xmm1, %xmm2
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
+; X32-SSE-NEXT:    pshufb %xmm2, %xmm4
 ; X32-SSE-NEXT:    psrlw $4, %xmm0
-; X32-SSE-NEXT:    pand %xmm2, %xmm0
-; X32-SSE-NEXT:    pshufb %xmm0, %xmm4
-; X32-SSE-NEXT:    paddb %xmm5, %xmm4
-; X32-SSE-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero
-; X32-SSE-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
-; X32-SSE-NEXT:    psadbw %xmm1, %xmm4
+; X32-SSE-NEXT:    pand %xmm1, %xmm0
+; X32-SSE-NEXT:    pshufb %xmm0, %xmm3
+; X32-SSE-NEXT:    paddb %xmm4, %xmm3
+; X32-SSE-NEXT:    pxor %xmm1, %xmm1
+; X32-SSE-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero
+; X32-SSE-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; X32-SSE-NEXT:    psadbw %xmm1, %xmm3
 ; X32-SSE-NEXT:    psadbw %xmm1, %xmm0
-; X32-SSE-NEXT:    packuswb %xmm4, %xmm0
+; X32-SSE-NEXT:    packuswb %xmm3, %xmm0
 ; X32-SSE-NEXT:    retl
   %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 0)
   ret <4 x i32> %out
@@ -696,130 +650,124 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
 ; SSE2-LABEL: testv4i32u:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    psubd %xmm0, %xmm2
-; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSE2-NEXT:    paddd %xmm2, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, %xmm0
-; SSE2-NEXT:    psrlw $1, %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    psubb %xmm0, %xmm3
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE2-NEXT:    movdqa %xmm3, %xmm2
-; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    psrlw $2, %xmm3
-; SSE2-NEXT:    pand %xmm0, %xmm3
-; SSE2-NEXT:    paddb %xmm2, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, %xmm0
-; SSE2-NEXT:    psrlw $4, %xmm0
-; SSE2-NEXT:    paddb %xmm3, %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT:    paddd %xmm0, %xmm1
+; SSE2-NEXT:    pandn %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm1
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    psubb %xmm1, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-NEXT:    psadbw %xmm1, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT:    psadbw %xmm1, %xmm0
-; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    psrlw $2, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $4, %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm1
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT:    psadbw %xmm0, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    packuswb %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: testv4i32u:
 ; SSE3:       # %bb.0:
-; SSE3-NEXT:    pxor %xmm1, %xmm1
-; SSE3-NEXT:    pxor %xmm2, %xmm2
-; SSE3-NEXT:    psubd %xmm0, %xmm2
-; SSE3-NEXT:    pand %xmm0, %xmm2
-; SSE3-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSE3-NEXT:    paddd %xmm2, %xmm3
-; SSE3-NEXT:    movdqa %xmm3, %xmm0
-; SSE3-NEXT:    psrlw $1, %xmm0
-; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT:    psubb %xmm0, %xmm3
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE3-NEXT:    movdqa %xmm3, %xmm2
-; SSE3-NEXT:    pand %xmm0, %xmm2
-; SSE3-NEXT:    psrlw $2, %xmm3
-; SSE3-NEXT:    pand %xmm0, %xmm3
-; SSE3-NEXT:    paddb %xmm2, %xmm3
-; SSE3-NEXT:    movdqa %xmm3, %xmm0
-; SSE3-NEXT:    psrlw $4, %xmm0
-; SSE3-NEXT:    paddb %xmm3, %xmm0
-; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE3-NEXT:    paddd %xmm0, %xmm1
+; SSE3-NEXT:    pandn %xmm1, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $1, %xmm1
+; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE3-NEXT:    psubb %xmm1, %xmm0
+; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE3-NEXT:    movdqa %xmm0, %xmm2
-; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE3-NEXT:    psadbw %xmm1, %xmm2
-; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE3-NEXT:    psadbw %xmm1, %xmm0
-; SSE3-NEXT:    packuswb %xmm2, %xmm0
+; SSE3-NEXT:    pand %xmm1, %xmm2
+; SSE3-NEXT:    psrlw $2, %xmm0
+; SSE3-NEXT:    pand %xmm1, %xmm0
+; SSE3-NEXT:    paddb %xmm2, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $4, %xmm1
+; SSE3-NEXT:    paddb %xmm0, %xmm1
+; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE3-NEXT:    pxor %xmm0, %xmm0
+; SSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE3-NEXT:    psadbw %xmm0, %xmm2
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    packuswb %xmm2, %xmm1
+; SSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: testv4i32u:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pxor %xmm1, %xmm1
-; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    psubd %xmm0, %xmm2
-; SSSE3-NEXT:    pand %xmm0, %xmm2
-; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSSE3-NEXT:    paddd %xmm2, %xmm3
+; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSSE3-NEXT:    paddd %xmm0, %xmm1
+; SSSE3-NEXT:    pandn %xmm1, %xmm0
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSSE3-NEXT:    movdqa %xmm3, %xmm4
-; SSSE3-NEXT:    pand %xmm2, %xmm4
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm5
-; SSSE3-NEXT:    pshufb %xmm4, %xmm5
-; SSSE3-NEXT:    psrlw $4, %xmm3
+; SSSE3-NEXT:    movdqa %xmm0, %xmm3
 ; SSSE3-NEXT:    pand %xmm2, %xmm3
-; SSSE3-NEXT:    pshufb %xmm3, %xmm0
-; SSSE3-NEXT:    paddb %xmm5, %xmm0
-; SSSE3-NEXT:    movdqa %xmm0, %xmm2
-; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSSE3-NEXT:    psadbw %xmm1, %xmm2
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT:    psadbw %xmm1, %xmm0
-; SSSE3-NEXT:    packuswb %xmm2, %xmm0
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm4
+; SSSE3-NEXT:    pshufb %xmm3, %xmm4
+; SSSE3-NEXT:    psrlw $4, %xmm0
+; SSSE3-NEXT:    pand %xmm2, %xmm0
+; SSSE3-NEXT:    pshufb %xmm0, %xmm1
+; SSSE3-NEXT:    paddb %xmm4, %xmm1
+; SSSE3-NEXT:    pxor %xmm0, %xmm0
+; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT:    psadbw %xmm0, %xmm2
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSSE3-NEXT:    packuswb %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: testv4i32u:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    psubd %xmm0, %xmm2
-; SSE41-NEXT:    pand %xmm0, %xmm2
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm0
-; SSE41-NEXT:    paddd %xmm2, %xmm0
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    pand %xmm2, %xmm3
-; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; SSE41-NEXT:    movdqa %xmm4, %xmm5
-; SSE41-NEXT:    pshufb %xmm3, %xmm5
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    paddd %xmm0, %xmm1
+; SSE41-NEXT:    pandn %xmm1, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pand %xmm1, %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; SSE41-NEXT:    movdqa %xmm3, %xmm4
+; SSE41-NEXT:    pshufb %xmm2, %xmm4
 ; SSE41-NEXT:    psrlw $4, %xmm0
-; SSE41-NEXT:    pand %xmm2, %xmm0
-; SSE41-NEXT:    pshufb %xmm0, %xmm4
-; SSE41-NEXT:    paddb %xmm5, %xmm4
-; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero
-; SSE41-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
-; SSE41-NEXT:    psadbw %xmm1, %xmm4
+; SSE41-NEXT:    pand %xmm1, %xmm0
+; SSE41-NEXT:    pshufb %xmm0, %xmm3
+; SSE41-NEXT:    paddb %xmm4, %xmm3
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero
+; SSE41-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE41-NEXT:    psadbw %xmm1, %xmm3
 ; SSE41-NEXT:    psadbw %xmm1, %xmm0
-; SSE41-NEXT:    packuswb %xmm4, %xmm0
+; SSE41-NEXT:    packuswb %xmm3, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: testv4i32u:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm2
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX1-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
@@ -829,19 +777,18 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
 ;
 ; AVX2-LABEL: testv4i32u:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsubd %xmm0, %xmm1, %xmm2
-; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX2-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
+; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
 ; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; AVX2-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
@@ -872,11 +819,9 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv4i32u:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX512VPOPCNTDQ-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
@@ -884,22 +829,19 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQVL-LABEL: testv4i32u:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX512VPOPCNTDQVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    retq
 ;
 ; BITALG_NOVLX-LABEL: testv4i32u:
 ; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpsubd %xmm0, %xmm1, %xmm2
-; BITALG_NOVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; BITALG_NOVLX-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; BITALG_NOVLX-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; BITALG_NOVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
+; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
 ; BITALG_NOVLX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
@@ -910,12 +852,11 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
 ;
 ; BITALG-LABEL: testv4i32u:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubd %xmm0, %xmm1, %xmm2
-; BITALG-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; BITALG-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; BITALG-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; BITALG-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; BITALG-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; BITALG-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; BITALG-NEXT:    vpopcntb %xmm0, %xmm0
+; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; BITALG-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
 ; BITALG-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -925,27 +866,25 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
 ;
 ; X32-SSE-LABEL: testv4i32u:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    pxor %xmm1, %xmm1
-; X32-SSE-NEXT:    pxor %xmm2, %xmm2
-; X32-SSE-NEXT:    psubd %xmm0, %xmm2
-; X32-SSE-NEXT:    pand %xmm0, %xmm2
-; X32-SSE-NEXT:    pcmpeqd %xmm0, %xmm0
-; X32-SSE-NEXT:    paddd %xmm2, %xmm0
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
-; X32-SSE-NEXT:    pand %xmm2, %xmm3
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; X32-SSE-NEXT:    movdqa %xmm4, %xmm5
-; X32-SSE-NEXT:    pshufb %xmm3, %xmm5
+; X32-SSE-NEXT:    pcmpeqd %xmm1, %xmm1
+; X32-SSE-NEXT:    paddd %xmm0, %xmm1
+; X32-SSE-NEXT:    pandn %xmm1, %xmm0
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    pand %xmm1, %xmm2
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
+; X32-SSE-NEXT:    pshufb %xmm2, %xmm4
 ; X32-SSE-NEXT:    psrlw $4, %xmm0
-; X32-SSE-NEXT:    pand %xmm2, %xmm0
-; X32-SSE-NEXT:    pshufb %xmm0, %xmm4
-; X32-SSE-NEXT:    paddb %xmm5, %xmm4
-; X32-SSE-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero
-; X32-SSE-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
-; X32-SSE-NEXT:    psadbw %xmm1, %xmm4
+; X32-SSE-NEXT:    pand %xmm1, %xmm0
+; X32-SSE-NEXT:    pshufb %xmm0, %xmm3
+; X32-SSE-NEXT:    paddb %xmm4, %xmm3
+; X32-SSE-NEXT:    pxor %xmm1, %xmm1
+; X32-SSE-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero
+; X32-SSE-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; X32-SSE-NEXT:    psadbw %xmm1, %xmm3
 ; X32-SSE-NEXT:    psadbw %xmm1, %xmm0
-; X32-SSE-NEXT:    packuswb %xmm4, %xmm0
+; X32-SSE-NEXT:    packuswb %xmm3, %xmm0
 ; X32-SSE-NEXT:    retl
   %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 -1)
   ret <4 x i32> %out
@@ -954,11 +893,9 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
 define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
 ; SSE2-LABEL: testv8i16:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    psubw %xmm0, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
-; SSE2-NEXT:    paddw %xmm1, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT:    paddw %xmm0, %xmm1
+; SSE2-NEXT:    pandn %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $1, %xmm1
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
@@ -981,11 +918,9 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
 ;
 ; SSE3-LABEL: testv8i16:
 ; SSE3:       # %bb.0:
-; SSE3-NEXT:    pxor %xmm1, %xmm1
-; SSE3-NEXT:    psubw %xmm0, %xmm1
-; SSE3-NEXT:    pand %xmm0, %xmm1
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm0
-; SSE3-NEXT:    paddw %xmm1, %xmm0
+; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE3-NEXT:    paddw %xmm0, %xmm1
+; SSE3-NEXT:    pandn %xmm1, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $1, %xmm1
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
@@ -1008,11 +943,9 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
 ;
 ; SSSE3-LABEL: testv8i16:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pxor %xmm1, %xmm1
-; SSSE3-NEXT:    psubw %xmm0, %xmm1
-; SSSE3-NEXT:    pand %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm0
-; SSSE3-NEXT:    paddw %xmm1, %xmm0
+; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSSE3-NEXT:    paddw %xmm0, %xmm1
+; SSSE3-NEXT:    pandn %xmm1, %xmm0
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm2
 ; SSSE3-NEXT:    pand %xmm1, %xmm2
@@ -1031,11 +964,9 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
 ;
 ; SSE41-LABEL: testv8i16:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    psubw %xmm0, %xmm1
-; SSE41-NEXT:    pand %xmm0, %xmm1
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm0
-; SSE41-NEXT:    paddw %xmm1, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    paddw %xmm0, %xmm1
+; SSE41-NEXT:    pandn %xmm1, %xmm0
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
 ; SSE41-NEXT:    pand %xmm1, %xmm2
@@ -1054,11 +985,9 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
 ;
 ; AVX-LABEL: testv8i16:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
+; AVX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -1074,11 +1003,9 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv8i16:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
+; AVX512VPOPCNTDQ-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
@@ -1088,11 +1015,9 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQVL-LABEL: testv8i16:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
+; AVX512VPOPCNTDQVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdw %ymm0, %xmm0
@@ -1101,11 +1026,9 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
 ;
 ; BITALG_NOVLX-LABEL: testv8i16:
 ; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
+; BITALG_NOVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
@@ -1113,21 +1036,17 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
 ;
 ; BITALG-LABEL: testv8i16:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
-; BITALG-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; BITALG-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; BITALG-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
+; BITALG-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; BITALG-NEXT:    vpopcntw %xmm0, %xmm0
 ; BITALG-NEXT:    retq
 ;
 ; X32-SSE-LABEL: testv8i16:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    pxor %xmm1, %xmm1
-; X32-SSE-NEXT:    psubw %xmm0, %xmm1
-; X32-SSE-NEXT:    pand %xmm0, %xmm1
-; X32-SSE-NEXT:    pcmpeqd %xmm0, %xmm0
-; X32-SSE-NEXT:    paddw %xmm1, %xmm0
+; X32-SSE-NEXT:    pcmpeqd %xmm1, %xmm1
+; X32-SSE-NEXT:    paddw %xmm0, %xmm1
+; X32-SSE-NEXT:    pandn %xmm1, %xmm0
 ; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
 ; X32-SSE-NEXT:    pand %xmm1, %xmm2
@@ -1150,11 +1069,9 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
 define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
 ; SSE2-LABEL: testv8i16u:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    psubw %xmm0, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
-; SSE2-NEXT:    paddw %xmm1, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT:    paddw %xmm0, %xmm1
+; SSE2-NEXT:    pandn %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $1, %xmm1
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
@@ -1177,11 +1094,9 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
 ;
 ; SSE3-LABEL: testv8i16u:
 ; SSE3:       # %bb.0:
-; SSE3-NEXT:    pxor %xmm1, %xmm1
-; SSE3-NEXT:    psubw %xmm0, %xmm1
-; SSE3-NEXT:    pand %xmm0, %xmm1
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm0
-; SSE3-NEXT:    paddw %xmm1, %xmm0
+; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE3-NEXT:    paddw %xmm0, %xmm1
+; SSE3-NEXT:    pandn %xmm1, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $1, %xmm1
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
@@ -1204,11 +1119,9 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
 ;
 ; SSSE3-LABEL: testv8i16u:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pxor %xmm1, %xmm1
-; SSSE3-NEXT:    psubw %xmm0, %xmm1
-; SSSE3-NEXT:    pand %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm0
-; SSSE3-NEXT:    paddw %xmm1, %xmm0
+; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSSE3-NEXT:    paddw %xmm0, %xmm1
+; SSSE3-NEXT:    pandn %xmm1, %xmm0
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm2
 ; SSSE3-NEXT:    pand %xmm1, %xmm2
@@ -1227,11 +1140,9 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
 ;
 ; SSE41-LABEL: testv8i16u:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    psubw %xmm0, %xmm1
-; SSE41-NEXT:    pand %xmm0, %xmm1
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm0
-; SSE41-NEXT:    paddw %xmm1, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    paddw %xmm0, %xmm1
+; SSE41-NEXT:    pandn %xmm1, %xmm0
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
 ; SSE41-NEXT:    pand %xmm1, %xmm2
@@ -1250,11 +1161,9 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
 ;
 ; AVX-LABEL: testv8i16u:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
+; AVX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -1270,11 +1179,9 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv8i16u:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
+; AVX512VPOPCNTDQ-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
@@ -1284,11 +1191,9 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQVL-LABEL: testv8i16u:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
+; AVX512VPOPCNTDQVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdw %ymm0, %xmm0
@@ -1297,11 +1202,9 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
 ;
 ; BITALG_NOVLX-LABEL: testv8i16u:
 ; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
+; BITALG_NOVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
@@ -1309,21 +1212,17 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
 ;
 ; BITALG-LABEL: testv8i16u:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
-; BITALG-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; BITALG-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; BITALG-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
+; BITALG-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; BITALG-NEXT:    vpopcntw %xmm0, %xmm0
 ; BITALG-NEXT:    retq
 ;
 ; X32-SSE-LABEL: testv8i16u:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    pxor %xmm1, %xmm1
-; X32-SSE-NEXT:    psubw %xmm0, %xmm1
-; X32-SSE-NEXT:    pand %xmm0, %xmm1
-; X32-SSE-NEXT:    pcmpeqd %xmm0, %xmm0
-; X32-SSE-NEXT:    paddw %xmm1, %xmm0
+; X32-SSE-NEXT:    pcmpeqd %xmm1, %xmm1
+; X32-SSE-NEXT:    paddw %xmm0, %xmm1
+; X32-SSE-NEXT:    pandn %xmm1, %xmm0
 ; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
 ; X32-SSE-NEXT:    pand %xmm1, %xmm2
@@ -1346,95 +1245,89 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
 define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
 ; SSE2-LABEL: testv16i8:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    psubb %xmm0, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT:    paddb %xmm1, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    psrlw $1, %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    psubb %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    psrlw $2, %xmm2
-; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    paddb %xmm1, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    psrlw $4, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm1
+; SSE2-NEXT:    pandn %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm1
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    psubb %xmm1, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    psrlw $2, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $4, %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm1
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: testv16i8:
 ; SSE3:       # %bb.0:
-; SSE3-NEXT:    pxor %xmm1, %xmm1
-; SSE3-NEXT:    psubb %xmm0, %xmm1
-; SSE3-NEXT:    pand %xmm0, %xmm1
-; SSE3-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE3-NEXT:    paddb %xmm1, %xmm2
-; SSE3-NEXT:    movdqa %xmm2, %xmm0
-; SSE3-NEXT:    psrlw $1, %xmm0
-; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT:    psubb %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE3-NEXT:    movdqa %xmm2, %xmm1
-; SSE3-NEXT:    pand %xmm0, %xmm1
-; SSE3-NEXT:    psrlw $2, %xmm2
-; SSE3-NEXT:    pand %xmm0, %xmm2
-; SSE3-NEXT:    paddb %xmm1, %xmm2
-; SSE3-NEXT:    movdqa %xmm2, %xmm0
-; SSE3-NEXT:    psrlw $4, %xmm0
+; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE3-NEXT:    paddb %xmm0, %xmm1
+; SSE3-NEXT:    pandn %xmm1, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $1, %xmm1
+; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE3-NEXT:    psubb %xmm1, %xmm0
+; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    pand %xmm1, %xmm2
+; SSE3-NEXT:    psrlw $2, %xmm0
+; SSE3-NEXT:    pand %xmm1, %xmm0
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
-; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $4, %xmm1
+; SSE3-NEXT:    paddb %xmm0, %xmm1
+; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: testv16i8:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pxor %xmm1, %xmm1
-; SSSE3-NEXT:    psubb %xmm0, %xmm1
-; SSSE3-NEXT:    pand %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSSE3-NEXT:    paddb %xmm1, %xmm2
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSSE3-NEXT:    movdqa %xmm2, %xmm3
-; SSSE3-NEXT:    pand %xmm1, %xmm3
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm4
+; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSSE3-NEXT:    paddb %xmm0, %xmm1
+; SSSE3-NEXT:    pandn %xmm1, %xmm0
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm3
+; SSSE3-NEXT:    pand %xmm2, %xmm3
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm4
 ; SSSE3-NEXT:    pshufb %xmm3, %xmm4
-; SSSE3-NEXT:    psrlw $4, %xmm2
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufb %xmm2, %xmm0
-; SSSE3-NEXT:    paddb %xmm4, %xmm0
+; SSSE3-NEXT:    psrlw $4, %xmm0
+; SSSE3-NEXT:    pand %xmm2, %xmm0
+; SSSE3-NEXT:    pshufb %xmm0, %xmm1
+; SSSE3-NEXT:    paddb %xmm4, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: testv16i8:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    psubb %xmm0, %xmm1
-; SSE41-NEXT:    pand %xmm0, %xmm1
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE41-NEXT:    paddb %xmm1, %xmm2
-; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSE41-NEXT:    movdqa %xmm2, %xmm3
-; SSE41-NEXT:    pand %xmm1, %xmm3
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    paddb %xmm0, %xmm1
+; SSE41-NEXT:    pandn %xmm1, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    pand %xmm2, %xmm3
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; SSE41-NEXT:    movdqa %xmm1, %xmm4
 ; SSE41-NEXT:    pshufb %xmm3, %xmm4
-; SSE41-NEXT:    psrlw $4, %xmm2
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufb %xmm2, %xmm0
-; SSE41-NEXT:    paddb %xmm4, %xmm0
+; SSE41-NEXT:    psrlw $4, %xmm0
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    pshufb %xmm0, %xmm1
+; SSE41-NEXT:    paddb %xmm4, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: testv16i8:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
+; AVX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -1447,11 +1340,9 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv16i8:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
+; AVX512VPOPCNTDQ-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdb %zmm0, %xmm0
@@ -1460,11 +1351,9 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQVL-LABEL: testv16i8:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
+; AVX512VPOPCNTDQVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdb %zmm0, %xmm0
@@ -1473,11 +1362,9 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
 ;
 ; BITALG_NOVLX-LABEL: testv16i8:
 ; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
+; BITALG_NOVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
@@ -1485,31 +1372,28 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
 ;
 ; BITALG-LABEL: testv16i8:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
-; BITALG-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; BITALG-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; BITALG-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
+; BITALG-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; BITALG-NEXT:    vpopcntb %xmm0, %xmm0
 ; BITALG-NEXT:    retq
 ;
 ; X32-SSE-LABEL: testv16i8:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    pxor %xmm1, %xmm1
-; X32-SSE-NEXT:    psubb %xmm0, %xmm1
-; X32-SSE-NEXT:    pand %xmm0, %xmm1
-; X32-SSE-NEXT:    pcmpeqd %xmm2, %xmm2
-; X32-SSE-NEXT:    paddb %xmm1, %xmm2
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
-; X32-SSE-NEXT:    pand %xmm1, %xmm3
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
+; X32-SSE-NEXT:    pcmpeqd %xmm1, %xmm1
+; X32-SSE-NEXT:    paddb %xmm0, %xmm1
+; X32-SSE-NEXT:    pandn %xmm1, %xmm0
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    pand %xmm2, %xmm3
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
 ; X32-SSE-NEXT:    pshufb %xmm3, %xmm4
-; X32-SSE-NEXT:    psrlw $4, %xmm2
-; X32-SSE-NEXT:    pand %xmm1, %xmm2
-; X32-SSE-NEXT:    pshufb %xmm2, %xmm0
-; X32-SSE-NEXT:    paddb %xmm4, %xmm0
+; X32-SSE-NEXT:    psrlw $4, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    pshufb %xmm0, %xmm1
+; X32-SSE-NEXT:    paddb %xmm4, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
 ; X32-SSE-NEXT:    retl
   %out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %in, i1 0)
   ret <16 x i8> %out
@@ -1518,95 +1402,89 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
 define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
 ; SSE2-LABEL: testv16i8u:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    psubb %xmm0, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT:    paddb %xmm1, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    psrlw $1, %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    psubb %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    psrlw $2, %xmm2
-; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    paddb %xmm1, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    psrlw $4, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm1
+; SSE2-NEXT:    pandn %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm1
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    psubb %xmm1, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    psrlw $2, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $4, %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm1
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: testv16i8u:
 ; SSE3:       # %bb.0:
-; SSE3-NEXT:    pxor %xmm1, %xmm1
-; SSE3-NEXT:    psubb %xmm0, %xmm1
-; SSE3-NEXT:    pand %xmm0, %xmm1
-; SSE3-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE3-NEXT:    paddb %xmm1, %xmm2
-; SSE3-NEXT:    movdqa %xmm2, %xmm0
-; SSE3-NEXT:    psrlw $1, %xmm0
-; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT:    psubb %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE3-NEXT:    movdqa %xmm2, %xmm1
-; SSE3-NEXT:    pand %xmm0, %xmm1
-; SSE3-NEXT:    psrlw $2, %xmm2
-; SSE3-NEXT:    pand %xmm0, %xmm2
-; SSE3-NEXT:    paddb %xmm1, %xmm2
-; SSE3-NEXT:    movdqa %xmm2, %xmm0
-; SSE3-NEXT:    psrlw $4, %xmm0
+; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE3-NEXT:    paddb %xmm0, %xmm1
+; SSE3-NEXT:    pandn %xmm1, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $1, %xmm1
+; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE3-NEXT:    psubb %xmm1, %xmm0
+; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    pand %xmm1, %xmm2
+; SSE3-NEXT:    psrlw $2, %xmm0
+; SSE3-NEXT:    pand %xmm1, %xmm0
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
-; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $4, %xmm1
+; SSE3-NEXT:    paddb %xmm0, %xmm1
+; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: testv16i8u:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pxor %xmm1, %xmm1
-; SSSE3-NEXT:    psubb %xmm0, %xmm1
-; SSSE3-NEXT:    pand %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSSE3-NEXT:    paddb %xmm1, %xmm2
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSSE3-NEXT:    movdqa %xmm2, %xmm3
-; SSSE3-NEXT:    pand %xmm1, %xmm3
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm4
+; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSSE3-NEXT:    paddb %xmm0, %xmm1
+; SSSE3-NEXT:    pandn %xmm1, %xmm0
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm3
+; SSSE3-NEXT:    pand %xmm2, %xmm3
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm4
 ; SSSE3-NEXT:    pshufb %xmm3, %xmm4
-; SSSE3-NEXT:    psrlw $4, %xmm2
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufb %xmm2, %xmm0
-; SSSE3-NEXT:    paddb %xmm4, %xmm0
+; SSSE3-NEXT:    psrlw $4, %xmm0
+; SSSE3-NEXT:    pand %xmm2, %xmm0
+; SSSE3-NEXT:    pshufb %xmm0, %xmm1
+; SSSE3-NEXT:    paddb %xmm4, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: testv16i8u:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    psubb %xmm0, %xmm1
-; SSE41-NEXT:    pand %xmm0, %xmm1
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE41-NEXT:    paddb %xmm1, %xmm2
-; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSE41-NEXT:    movdqa %xmm2, %xmm3
-; SSE41-NEXT:    pand %xmm1, %xmm3
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    paddb %xmm0, %xmm1
+; SSE41-NEXT:    pandn %xmm1, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    pand %xmm2, %xmm3
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; SSE41-NEXT:    movdqa %xmm1, %xmm4
 ; SSE41-NEXT:    pshufb %xmm3, %xmm4
-; SSE41-NEXT:    psrlw $4, %xmm2
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufb %xmm2, %xmm0
-; SSE41-NEXT:    paddb %xmm4, %xmm0
+; SSE41-NEXT:    psrlw $4, %xmm0
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    pshufb %xmm0, %xmm1
+; SSE41-NEXT:    paddb %xmm4, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: testv16i8u:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
+; AVX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -1619,11 +1497,9 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv16i8u:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
+; AVX512VPOPCNTDQ-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdb %zmm0, %xmm0
@@ -1632,11 +1508,9 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQVL-LABEL: testv16i8u:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
+; AVX512VPOPCNTDQVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdb %zmm0, %xmm0
@@ -1645,11 +1519,9 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
 ;
 ; BITALG_NOVLX-LABEL: testv16i8u:
 ; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
+; BITALG_NOVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
@@ -1657,31 +1529,28 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
 ;
 ; BITALG-LABEL: testv16i8u:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
-; BITALG-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; BITALG-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; BITALG-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
+; BITALG-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; BITALG-NEXT:    vpopcntb %xmm0, %xmm0
 ; BITALG-NEXT:    retq
 ;
 ; X32-SSE-LABEL: testv16i8u:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    pxor %xmm1, %xmm1
-; X32-SSE-NEXT:    psubb %xmm0, %xmm1
-; X32-SSE-NEXT:    pand %xmm0, %xmm1
-; X32-SSE-NEXT:    pcmpeqd %xmm2, %xmm2
-; X32-SSE-NEXT:    paddb %xmm1, %xmm2
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
-; X32-SSE-NEXT:    pand %xmm1, %xmm3
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
+; X32-SSE-NEXT:    pcmpeqd %xmm1, %xmm1
+; X32-SSE-NEXT:    paddb %xmm0, %xmm1
+; X32-SSE-NEXT:    pandn %xmm1, %xmm0
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    pand %xmm2, %xmm3
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
 ; X32-SSE-NEXT:    pshufb %xmm3, %xmm4
-; X32-SSE-NEXT:    psrlw $4, %xmm2
-; X32-SSE-NEXT:    pand %xmm1, %xmm2
-; X32-SSE-NEXT:    pshufb %xmm2, %xmm0
-; X32-SSE-NEXT:    paddb %xmm4, %xmm0
+; X32-SSE-NEXT:    psrlw $4, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    pshufb %xmm0, %xmm1
+; X32-SSE-NEXT:    paddb %xmm4, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
 ; X32-SSE-NEXT:    retl
   %out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %in, i1 -1)
   ret <16 x i8> %out
diff --git a/test/CodeGen/X86/vector-tzcnt-256.ll b/test/CodeGen/X86/vector-tzcnt-256.ll
index cae0a2d605a..46c34fb0d44 100644
--- a/test/CodeGen/X86/vector-tzcnt-256.ll
+++ b/test/CodeGen/X86/vector-tzcnt-256.ll
@@ -15,144 +15,132 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
 ; AVX1-LABEL: testv4i64:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm3
-; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm5
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddq %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vpandn %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
-; AVX1-NEXT:    vpaddb %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vpsadbw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpsubq %xmm0, %xmm2, %xmm5
-; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
-; AVX1-NEXT:    vpaddq %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm3
-; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vpandn %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
+; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
-; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpsadbw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
+; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsadbw %xmm4, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: testv4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
-; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
+; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpandn %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; AVX2-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512CDVL-LABEL: testv4i64:
 ; AVX512CDVL:       # %bb.0:
-; AVX512CDVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CDVL-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
-; AVX512CDVL-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX512CDVL-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512CDVL-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
-; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CDVL-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512CDVL-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
+; AVX512CDVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512CDVL-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
+; AVX512CDVL-NEXT:    vpandn %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512CDVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512CDVL-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX512CDVL-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; AVX512CDVL-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
+; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; AVX512CDVL-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; AVX512CDVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512CDVL-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
 ; AVX512CDVL-NEXT:    retq
 ;
 ; AVX512CD-LABEL: testv4i64:
 ; AVX512CD:       # %bb.0:
-; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CD-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
-; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512CD-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512CD-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
+; AVX512CD-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512CD-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
+; AVX512CD-NEXT:    vpandn %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; AVX512CD-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; AVX512CD-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512CD-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
 ; AVX512CD-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv4i64:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpsubq %ymm0, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQVL-LABEL: testv4i64:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpsubq %ymm0, %ymm1, %ymm1
-; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512VPOPCNTDQVL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
+; AVX512VPOPCNTDQVL-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntq %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    retq
 ;
 ; BITALG_NOVLX-LABEL: testv4i64:
 ; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
-; BITALG_NOVLX-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; BITALG_NOVLX-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; BITALG_NOVLX-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
+; BITALG_NOVLX-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
+; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: testv4i64:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
-; BITALG-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; BITALG-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; BITALG-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; BITALG-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; BITALG-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
+; BITALG-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; BITALG-NEXT:    vpopcntb %ymm0, %ymm0
+; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
 ; BITALG-NEXT:    retq
 ;
 ; X32-AVX-LABEL: testv4i64:
 ; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; X32-AVX-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
-; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpsubq {{\.LCPI.*}}, %ymm0, %ymm0
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; X32-AVX-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
+; X32-AVX-NEXT:    vpsubq {{\.LCPI.*}}, %ymm0, %ymm1
+; X32-AVX-NEXT:    vpandn %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; X32-AVX-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
+; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; X32-AVX-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; X32-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X32-AVX-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
 ; X32-AVX-NEXT:    retl
   %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %in, i1 0)
@@ -163,48 +151,45 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
 ; AVX1-LABEL: testv4i64u:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm3
-; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm5
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddq %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vpandn %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
-; AVX1-NEXT:    vpaddb %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vpsadbw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpsubq %xmm0, %xmm2, %xmm5
-; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
-; AVX1-NEXT:    vpaddq %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm3
-; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vpandn %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
+; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
-; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpsadbw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
+; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsadbw %xmm4, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: testv4i64u:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
-; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
+; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpandn %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; AVX2-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
@@ -230,61 +215,54 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv4i64u:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpsubq %ymm0, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQVL-LABEL: testv4i64u:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpsubq %ymm0, %ymm1, %ymm1
-; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512VPOPCNTDQVL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
+; AVX512VPOPCNTDQVL-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntq %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    retq
 ;
 ; BITALG_NOVLX-LABEL: testv4i64u:
 ; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
-; BITALG_NOVLX-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; BITALG_NOVLX-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; BITALG_NOVLX-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
+; BITALG_NOVLX-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
+; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: testv4i64u:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
-; BITALG-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; BITALG-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; BITALG-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; BITALG-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; BITALG-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
+; BITALG-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; BITALG-NEXT:    vpopcntb %ymm0, %ymm0
+; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
 ; BITALG-NEXT:    retq
 ;
 ; X32-AVX-LABEL: testv4i64u:
 ; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; X32-AVX-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
-; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpsubq {{\.LCPI.*}}, %ymm0, %ymm0
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; X32-AVX-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
+; X32-AVX-NEXT:    vpsubq {{\.LCPI.*}}, %ymm0, %ymm1
+; X32-AVX-NEXT:    vpandn %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; X32-AVX-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
+; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; X32-AVX-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; X32-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X32-AVX-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
 ; X32-AVX-NEXT:    retl
   %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %in, i1 -1)
@@ -295,56 +273,53 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ; AVX1-LABEL: testv8i32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm3
-; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm5
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vpandn %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
-; AVX1-NEXT:    vpaddb %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX1-NEXT:    vpsadbw %xmm2, %xmm5, %xmm5
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; AVX1-NEXT:    vpsadbw %xmm4, %xmm6, %xmm6
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX1-NEXT:    vpsadbw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpackuswb %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vpsubd %xmm0, %xmm2, %xmm5
-; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
-; AVX1-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm3
-; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
+; AVX1-NEXT:    vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm6, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vpandn %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
+; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
-; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX1-NEXT:    vpsadbw %xmm2, %xmm3, %xmm3
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
+; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; AVX1-NEXT:    vpsadbw %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT:    vpsadbw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpsadbw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: testv8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
-; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
+; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpandn %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; AVX2-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
 ; AVX2-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
 ; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
@@ -354,19 +329,18 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ;
 ; AVX512CDVL-LABEL: testv8i32:
 ; AVX512CDVL:       # %bb.0:
-; AVX512CDVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CDVL-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
-; AVX512CDVL-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX512CDVL-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512CDVL-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
-; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CDVL-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512CDVL-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
+; AVX512CDVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512CDVL-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; AVX512CDVL-NEXT:    vpandn %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512CDVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512CDVL-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX512CDVL-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; AVX512CDVL-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
+; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; AVX512CDVL-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; AVX512CDVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512CDVL-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
 ; AVX512CDVL-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
 ; AVX512CDVL-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
@@ -376,19 +350,18 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ;
 ; AVX512CD-LABEL: testv8i32:
 ; AVX512CD:       # %bb.0:
-; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CD-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
-; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512CD-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512CD-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
+; AVX512CD-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512CD-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; AVX512CD-NEXT:    vpandn %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; AVX512CD-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; AVX512CD-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512CD-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
 ; AVX512CD-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
 ; AVX512CD-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
@@ -398,33 +371,28 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv8i32:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpsubd %ymm0, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQVL-LABEL: testv8i32:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpsubd %ymm0, %ymm1, %ymm1
-; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512VPOPCNTDQVL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; AVX512VPOPCNTDQVL-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    retq
 ;
 ; BITALG_NOVLX-LABEL: testv8i32:
 ; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
-; BITALG_NOVLX-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; BITALG_NOVLX-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; BITALG_NOVLX-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; BITALG_NOVLX-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
+; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
 ; BITALG_NOVLX-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
 ; BITALG_NOVLX-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
@@ -434,12 +402,11 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ;
 ; BITALG-LABEL: testv8i32:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
-; BITALG-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; BITALG-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; BITALG-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
+; BITALG-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; BITALG-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; BITALG-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; BITALG-NEXT:    vpopcntb %ymm0, %ymm0
+; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
 ; BITALG-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
 ; BITALG-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
@@ -449,19 +416,18 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ;
 ; X32-AVX-LABEL: testv8i32:
 ; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; X32-AVX-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
-; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; X32-AVX-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; X32-AVX-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
+; X32-AVX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; X32-AVX-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; X32-AVX-NEXT:    vpandn %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; X32-AVX-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
+; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; X32-AVX-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; X32-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X32-AVX-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
 ; X32-AVX-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
 ; X32-AVX-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
@@ -476,56 +442,53 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
 ; AVX1-LABEL: testv8i32u:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm3
-; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm5
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vpandn %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
-; AVX1-NEXT:    vpaddb %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX1-NEXT:    vpsadbw %xmm2, %xmm5, %xmm5
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; AVX1-NEXT:    vpsadbw %xmm4, %xmm6, %xmm6
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX1-NEXT:    vpsadbw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpackuswb %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vpsubd %xmm0, %xmm2, %xmm5
-; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
-; AVX1-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm3
-; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
+; AVX1-NEXT:    vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm6, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vpandn %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
+; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
-; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX1-NEXT:    vpsadbw %xmm2, %xmm3, %xmm3
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
+; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; AVX1-NEXT:    vpsadbw %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT:    vpsadbw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpsadbw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: testv8i32u:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
-; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
+; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpandn %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; AVX2-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
 ; AVX2-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
 ; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
@@ -555,33 +518,28 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv8i32u:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpsubd %ymm0, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQVL-LABEL: testv8i32u:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpsubd %ymm0, %ymm1, %ymm1
-; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512VPOPCNTDQVL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; AVX512VPOPCNTDQVL-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    retq
 ;
 ; BITALG_NOVLX-LABEL: testv8i32u:
 ; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
-; BITALG_NOVLX-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; BITALG_NOVLX-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; BITALG_NOVLX-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; BITALG_NOVLX-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
+; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
 ; BITALG_NOVLX-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
 ; BITALG_NOVLX-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
@@ -591,12 +549,11 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
 ;
 ; BITALG-LABEL: testv8i32u:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
-; BITALG-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; BITALG-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; BITALG-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
+; BITALG-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; BITALG-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; BITALG-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; BITALG-NEXT:    vpopcntb %ymm0, %ymm0
+; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
 ; BITALG-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
 ; BITALG-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
@@ -606,19 +563,18 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
 ;
 ; X32-AVX-LABEL: testv8i32u:
 ; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; X32-AVX-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
-; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; X32-AVX-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; X32-AVX-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
+; X32-AVX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; X32-AVX-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; X32-AVX-NEXT:    vpandn %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; X32-AVX-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
+; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; X32-AVX-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; X32-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X32-AVX-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
 ; X32-AVX-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
 ; X32-AVX-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
@@ -632,31 +588,28 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
 define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
 ; AVX1-LABEL: testv16i16:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpsubw %xmm0, %xmm1, %xmm2
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm2
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpandn %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
-; AVX1-NEXT:    vpaddb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT:    vpsllw $8, %xmm2, %xmm5
-; AVX1-NEXT:    vpaddb %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vpaddb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpsllw $8, %xmm2, %xmm4
+; AVX1-NEXT:    vpaddb %xmm2, %xmm4, %xmm2
 ; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpaddw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm1
-; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
 ; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm1
 ; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
@@ -666,11 +619,9 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
 ;
 ; AVX2-LABEL: testv16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -686,11 +637,9 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
 ;
 ; AVX512CDVL-LABEL: testv16i16:
 ; AVX512CDVL:       # %bb.0:
-; AVX512CDVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CDVL-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
-; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512CDVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512CDVL-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
+; AVX512CDVL-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm2
 ; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -706,11 +655,9 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
 ;
 ; AVX512CD-LABEL: testv16i16:
 ; AVX512CD:       # %bb.0:
-; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CD-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512CD-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
+; AVX512CD-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm2
 ; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -726,11 +673,9 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv16i16:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
@@ -738,11 +683,9 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQVL-LABEL: testv16i16:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
-; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512VPOPCNTDQVL-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
+; AVX512VPOPCNTDQVL-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdw %zmm0, %ymm0
@@ -750,32 +693,26 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
 ;
 ; BITALG_NOVLX-LABEL: testv16i16:
 ; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
-; BITALG_NOVLX-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; BITALG_NOVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; BITALG_NOVLX-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
+; BITALG_NOVLX-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: testv16i16:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
-; BITALG-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; BITALG-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; BITALG-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; BITALG-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
+; BITALG-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; BITALG-NEXT:    vpopcntw %ymm0, %ymm0
 ; BITALG-NEXT:    retq
 ;
 ; X32-AVX-LABEL: testv16i16:
 ; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; X32-AVX-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
-; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; X32-AVX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; X32-AVX-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
+; X32-AVX-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
 ; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -795,31 +732,28 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
 define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
 ; AVX1-LABEL: testv16i16u:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpsubw %xmm0, %xmm1, %xmm2
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm2
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpandn %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
-; AVX1-NEXT:    vpaddb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT:    vpsllw $8, %xmm2, %xmm5
-; AVX1-NEXT:    vpaddb %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vpaddb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpsllw $8, %xmm2, %xmm4
+; AVX1-NEXT:    vpaddb %xmm2, %xmm4, %xmm2
 ; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpaddw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm1
-; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
 ; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm1
 ; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
@@ -829,11 +763,9 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
 ;
 ; AVX2-LABEL: testv16i16u:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -849,11 +781,9 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
 ;
 ; AVX512CDVL-LABEL: testv16i16u:
 ; AVX512CDVL:       # %bb.0:
-; AVX512CDVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CDVL-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
-; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512CDVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512CDVL-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
+; AVX512CDVL-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm2
 ; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -869,11 +799,9 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
 ;
 ; AVX512CD-LABEL: testv16i16u:
 ; AVX512CD:       # %bb.0:
-; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CD-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512CD-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
+; AVX512CD-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm2
 ; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -889,11 +817,9 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv16i16u:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
@@ -901,11 +827,9 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQVL-LABEL: testv16i16u:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
-; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512VPOPCNTDQVL-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
+; AVX512VPOPCNTDQVL-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdw %zmm0, %ymm0
@@ -913,32 +837,26 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
 ;
 ; BITALG_NOVLX-LABEL: testv16i16u:
 ; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
-; BITALG_NOVLX-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; BITALG_NOVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; BITALG_NOVLX-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
+; BITALG_NOVLX-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: testv16i16u:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
-; BITALG-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; BITALG-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; BITALG-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; BITALG-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
+; BITALG-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; BITALG-NEXT:    vpopcntw %ymm0, %ymm0
 ; BITALG-NEXT:    retq
 ;
 ; X32-AVX-LABEL: testv16i16u:
 ; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; X32-AVX-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
-; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; X32-AVX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; X32-AVX-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
+; X32-AVX-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
 ; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -959,38 +877,33 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
 ; AVX1-LABEL: testv32i8:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm3
-; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm5
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddb %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vpandn %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
-; AVX1-NEXT:    vpaddb %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vpsubb %xmm0, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm2
-; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vpandn %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
+; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
 ; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: testv32i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -1003,11 +916,9 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
 ;
 ; AVX512CDVL-LABEL: testv32i8:
 ; AVX512CDVL:       # %bb.0:
-; AVX512CDVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CDVL-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
-; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512CDVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512CDVL-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
+; AVX512CDVL-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm2
 ; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -1020,11 +931,9 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
 ;
 ; AVX512CD-LABEL: testv32i8:
 ; AVX512CD:       # %bb.0:
-; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CD-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512CD-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
+; AVX512CD-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm2
 ; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -1037,11 +946,9 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv32i8:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
 ; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -1054,11 +961,9 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQVL-LABEL: testv32i8:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
-; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512VPOPCNTDQVL-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
+; AVX512VPOPCNTDQVL-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm2
 ; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -1071,32 +976,26 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
 ;
 ; BITALG_NOVLX-LABEL: testv32i8:
 ; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
-; BITALG_NOVLX-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; BITALG_NOVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; BITALG_NOVLX-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
+; BITALG_NOVLX-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: testv32i8:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
-; BITALG-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; BITALG-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; BITALG-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; BITALG-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
+; BITALG-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; BITALG-NEXT:    vpopcntb %ymm0, %ymm0
 ; BITALG-NEXT:    retq
 ;
 ; X32-AVX-LABEL: testv32i8:
 ; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; X32-AVX-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
-; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; X32-AVX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; X32-AVX-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
+; X32-AVX-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
 ; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -1114,38 +1013,33 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
 ; AVX1-LABEL: testv32i8u:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm3
-; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm5
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddb %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vpandn %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
-; AVX1-NEXT:    vpaddb %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vpsubb %xmm0, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm2
-; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vpandn %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
+; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
 ; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: testv32i8u:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -1158,11 +1052,9 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
 ;
 ; AVX512CDVL-LABEL: testv32i8u:
 ; AVX512CDVL:       # %bb.0:
-; AVX512CDVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CDVL-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
-; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512CDVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512CDVL-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
+; AVX512CDVL-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm2
 ; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -1175,11 +1067,9 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
 ;
 ; AVX512CD-LABEL: testv32i8u:
 ; AVX512CD:       # %bb.0:
-; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CD-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512CD-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
+; AVX512CD-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm2
 ; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -1192,11 +1082,9 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv32i8u:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
 ; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -1209,11 +1097,9 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQVL-LABEL: testv32i8u:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
-; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512VPOPCNTDQVL-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
+; AVX512VPOPCNTDQVL-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm2
 ; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -1226,32 +1112,26 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
 ;
 ; BITALG_NOVLX-LABEL: testv32i8u:
 ; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
-; BITALG_NOVLX-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; BITALG_NOVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; BITALG_NOVLX-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
+; BITALG_NOVLX-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: testv32i8u:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
-; BITALG-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; BITALG-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; BITALG-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; BITALG-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
+; BITALG-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; BITALG-NEXT:    vpopcntb %ymm0, %ymm0
 ; BITALG-NEXT:    retq
 ;
 ; X32-AVX-LABEL: testv32i8u:
 ; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; X32-AVX-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
-; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; X32-AVX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; X32-AVX-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
+; X32-AVX-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
 ; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
diff --git a/test/CodeGen/X86/vector-tzcnt-512.ll b/test/CodeGen/X86/vector-tzcnt-512.ll
index 4a9fd82593a..300d7b4ac6c 100644
--- a/test/CodeGen/X86/vector-tzcnt-512.ll
+++ b/test/CodeGen/X86/vector-tzcnt-512.ll
@@ -8,11 +8,9 @@
 define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
 ; AVX512CD-LABEL: testv8i64:
 ; AVX512CD:       # %bb.0:
-; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CD-NEXT:    vpsubq %zmm0, %zmm1, %zmm1
-; AVX512CD-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512CD-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
-; AVX512CD-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; AVX512CD-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512CD-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
 ; AVX512CD-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
 ; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512CD-NEXT:    vpand %ymm2, %ymm1, %ymm3
@@ -36,58 +34,53 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
 ;
 ; AVX512CDBW-LABEL: testv8i64:
 ; AVX512CDBW:       # %bb.0:
-; AVX512CDBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CDBW-NEXT:    vpsubq %zmm0, %zmm1, %zmm2
-; AVX512CDBW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
-; AVX512CDBW-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
-; AVX512CDBW-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
-; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CDBW-NEXT:    vpandq %zmm2, %zmm0, %zmm3
-; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512CDBW-NEXT:    vpshufb %zmm3, %zmm4, %zmm3
+; AVX512CDBW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512CDBW-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512CDBW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
+; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDBW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512CDBW-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512CDBW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
-; AVX512CDBW-NEXT:    vpshufb %zmm0, %zmm4, %zmm0
-; AVX512CDBW-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
+; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT:    vpshufb %zmm0, %zmm3, %zmm0
+; AVX512CDBW-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
+; AVX512CDBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512CDBW-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512CDBW-NEXT:    retq
 ;
 ; AVX512BW-LABEL: testv8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpsubq %zmm0, %zmm1, %zmm2
-; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
-; AVX512BW-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512BW-NEXT:    vpshufb %zmm3, %zmm4, %zmm3
+; AVX512BW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpshufb %zmm0, %zmm4, %zmm0
-; AVX512BW-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpshufb %zmm0, %zmm3, %zmm0
+; AVX512BW-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv8i64:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpsubq %zmm0, %zmm1, %zmm1
-; AVX512VPOPCNTDQ-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512VPOPCNTDQ-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    retq
 ;
 ; BITALG-LABEL: testv8i64:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubq %zmm0, %zmm1, %zmm2
-; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm0
-; BITALG-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
-; BITALG-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; BITALG-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; BITALG-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; BITALG-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT:    vpopcntb %zmm0, %zmm0
+; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT:    retq
   %out = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %in, i1 0)
@@ -117,40 +110,36 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind {
 ;
 ; AVX512BW-LABEL: testv8i64u:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpsubq %zmm0, %zmm1, %zmm2
-; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
-; AVX512BW-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512BW-NEXT:    vpshufb %zmm3, %zmm4, %zmm3
+; AVX512BW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpshufb %zmm0, %zmm4, %zmm0
-; AVX512BW-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpshufb %zmm0, %zmm3, %zmm0
+; AVX512BW-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv8i64u:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpsubq %zmm0, %zmm1, %zmm1
-; AVX512VPOPCNTDQ-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512VPOPCNTDQ-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    retq
 ;
 ; BITALG-LABEL: testv8i64u:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubq %zmm0, %zmm1, %zmm2
-; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm0
-; BITALG-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
-; BITALG-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; BITALG-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; BITALG-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; BITALG-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT:    vpopcntb %zmm0, %zmm0
+; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT:    retq
   %out = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %in, i1 -1)
@@ -160,11 +149,9 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind {
 define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
 ; AVX512CD-LABEL: testv16i32:
 ; AVX512CD:       # %bb.0:
-; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CD-NEXT:    vpsubd %zmm0, %zmm1, %zmm1
-; AVX512CD-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512CD-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
-; AVX512CD-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512CD-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
+; AVX512CD-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
 ; AVX512CD-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
 ; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512CD-NEXT:    vpand %ymm2, %ymm1, %ymm3
@@ -196,19 +183,18 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
 ;
 ; AVX512CDBW-LABEL: testv16i32:
 ; AVX512CDBW:       # %bb.0:
-; AVX512CDBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CDBW-NEXT:    vpsubd %zmm0, %zmm1, %zmm2
-; AVX512CDBW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
-; AVX512CDBW-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
-; AVX512CDBW-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
-; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CDBW-NEXT:    vpandq %zmm2, %zmm0, %zmm3
-; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512CDBW-NEXT:    vpshufb %zmm3, %zmm4, %zmm3
+; AVX512CDBW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512CDBW-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
+; AVX512CDBW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
+; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDBW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512CDBW-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512CDBW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
-; AVX512CDBW-NEXT:    vpshufb %zmm0, %zmm4, %zmm0
-; AVX512CDBW-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
+; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT:    vpshufb %zmm0, %zmm3, %zmm0
+; AVX512CDBW-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
+; AVX512CDBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512CDBW-NEXT:    vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
 ; AVX512CDBW-NEXT:    vpsadbw %zmm1, %zmm2, %zmm2
 ; AVX512CDBW-NEXT:    vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
@@ -218,19 +204,18 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
 ;
 ; AVX512BW-LABEL: testv16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpsubd %zmm0, %zmm1, %zmm2
-; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
-; AVX512BW-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512BW-NEXT:    vpshufb %zmm3, %zmm4, %zmm3
+; AVX512BW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpshufb %zmm0, %zmm4, %zmm0
-; AVX512BW-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpshufb %zmm0, %zmm3, %zmm0
+; AVX512BW-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
 ; AVX512BW-NEXT:    vpsadbw %zmm1, %zmm2, %zmm2
 ; AVX512BW-NEXT:    vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
@@ -240,22 +225,19 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv16i32:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpsubd %zmm0, %zmm1, %zmm1
-; AVX512VPOPCNTDQ-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
+; AVX512VPOPCNTDQ-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    retq
 ;
 ; BITALG-LABEL: testv16i32:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubd %zmm0, %zmm1, %zmm2
-; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm0
-; BITALG-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
-; BITALG-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
+; BITALG-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; BITALG-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
+; BITALG-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT:    vpopcntb %zmm0, %zmm0
+; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT:    vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
 ; BITALG-NEXT:    vpsadbw %zmm1, %zmm2, %zmm2
 ; BITALG-NEXT:    vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
@@ -289,19 +271,18 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
 ;
 ; AVX512BW-LABEL: testv16i32u:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpsubd %zmm0, %zmm1, %zmm2
-; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
-; AVX512BW-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512BW-NEXT:    vpshufb %zmm3, %zmm4, %zmm3
+; AVX512BW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpshufb %zmm0, %zmm4, %zmm0
-; AVX512BW-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpshufb %zmm0, %zmm3, %zmm0
+; AVX512BW-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
 ; AVX512BW-NEXT:    vpsadbw %zmm1, %zmm2, %zmm2
 ; AVX512BW-NEXT:    vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
@@ -311,22 +292,19 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv16i32u:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpsubd %zmm0, %zmm1, %zmm1
-; AVX512VPOPCNTDQ-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
+; AVX512VPOPCNTDQ-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    retq
 ;
 ; BITALG-LABEL: testv16i32u:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubd %zmm0, %zmm1, %zmm2
-; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm0
-; BITALG-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
-; BITALG-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
+; BITALG-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; BITALG-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
+; BITALG-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT:    vpopcntb %zmm0, %zmm0
+; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT:    vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
 ; BITALG-NEXT:    vpsadbw %zmm1, %zmm2, %zmm2
 ; BITALG-NEXT:    vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
@@ -340,30 +318,27 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
 define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
 ; AVX512CD-LABEL: testv32i16:
 ; AVX512CD:       # %bb.0:
-; AVX512CD-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512CD-NEXT:    vpsubw %ymm0, %ymm2, %ymm3
-; AVX512CD-NEXT:    vpand %ymm3, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
-; AVX512CD-NEXT:    vpaddw %ymm3, %ymm0, %ymm0
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CD-NEXT:    vpand %ymm4, %ymm0, %ymm5
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512CD-NEXT:    vpshufb %ymm5, %ymm6, %ymm5
+; AVX512CD-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512CD-NEXT:    vpaddw %ymm2, %ymm0, %ymm3
+; AVX512CD-NEXT:    vpandn %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT:    vpand %ymm3, %ymm0, %ymm4
+; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
 ; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpand %ymm4, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
-; AVX512CD-NEXT:    vpaddb %ymm5, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpsllw $8, %ymm0, %ymm5
-; AVX512CD-NEXT:    vpaddb %ymm0, %ymm5, %ymm0
+; AVX512CD-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
+; AVX512CD-NEXT:    vpaddb %ymm4, %ymm0, %ymm0
+; AVX512CD-NEXT:    vpsllw $8, %ymm0, %ymm4
+; AVX512CD-NEXT:    vpaddb %ymm0, %ymm4, %ymm0
 ; AVX512CD-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpsubw %ymm1, %ymm2, %ymm2
-; AVX512CD-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpaddw %ymm3, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpand %ymm4, %ymm1, %ymm2
-; AVX512CD-NEXT:    vpshufb %ymm2, %ymm6, %ymm2
+; AVX512CD-NEXT:    vpaddw %ymm2, %ymm1, %ymm2
+; AVX512CD-NEXT:    vpandn %ymm2, %ymm1, %ymm1
+; AVX512CD-NEXT:    vpand %ymm3, %ymm1, %ymm2
+; AVX512CD-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
 ; AVX512CD-NEXT:    vpsrlw $4, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpand %ymm4, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
+; AVX512CD-NEXT:    vpand %ymm3, %ymm1, %ymm1
+; AVX512CD-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
 ; AVX512CD-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
 ; AVX512CD-NEXT:    vpsllw $8, %ymm1, %ymm2
 ; AVX512CD-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
@@ -372,11 +347,9 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
 ;
 ; AVX512CDBW-LABEL: testv32i16:
 ; AVX512CDBW:       # %bb.0:
-; AVX512CDBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CDBW-NEXT:    vpsubw %zmm0, %zmm1, %zmm1
-; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512CDBW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
-; AVX512CDBW-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT:    vpaddw %zmm1, %zmm0, %zmm1
+; AVX512CDBW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
 ; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
 ; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -392,11 +365,9 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
 ;
 ; AVX512BW-LABEL: testv32i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpsubw %zmm0, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddw %zmm1, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -412,17 +383,14 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv32i16:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512VPOPCNTDQ-NEXT:    vpsubw %ymm0, %ymm2, %ymm3
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm3, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
-; AVX512VPOPCNTDQ-NEXT:    vpaddw %ymm3, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpaddw %ymm2, %ymm0, %ymm3
+; AVX512VPOPCNTDQ-NEXT:    vpandn %ymm3, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vpsubw %ymm1, %ymm2, %ymm2
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddw %ymm3, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpaddw %ymm2, %ymm1, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpandn %ymm2, %ymm1, %ymm1
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm1, %zmm1
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm1, %ymm1
@@ -430,11 +398,9 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
 ;
 ; BITALG-LABEL: testv32i16:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubw %zmm0, %zmm1, %zmm1
-; BITALG-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
-; BITALG-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; BITALG-NEXT:    vpaddw %zmm1, %zmm0, %zmm1
+; BITALG-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT:    vpopcntw %zmm0, %zmm0
 ; BITALG-NEXT:    retq
   %out = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %in, i1 0)
@@ -444,30 +410,27 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
 define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
 ; AVX512CD-LABEL: testv32i16u:
 ; AVX512CD:       # %bb.0:
-; AVX512CD-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512CD-NEXT:    vpsubw %ymm0, %ymm2, %ymm3
-; AVX512CD-NEXT:    vpand %ymm3, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
-; AVX512CD-NEXT:    vpaddw %ymm3, %ymm0, %ymm0
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CD-NEXT:    vpand %ymm4, %ymm0, %ymm5
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512CD-NEXT:    vpshufb %ymm5, %ymm6, %ymm5
+; AVX512CD-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512CD-NEXT:    vpaddw %ymm2, %ymm0, %ymm3
+; AVX512CD-NEXT:    vpandn %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT:    vpand %ymm3, %ymm0, %ymm4
+; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
 ; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpand %ymm4, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
-; AVX512CD-NEXT:    vpaddb %ymm5, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpsllw $8, %ymm0, %ymm5
-; AVX512CD-NEXT:    vpaddb %ymm0, %ymm5, %ymm0
+; AVX512CD-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
+; AVX512CD-NEXT:    vpaddb %ymm4, %ymm0, %ymm0
+; AVX512CD-NEXT:    vpsllw $8, %ymm0, %ymm4
+; AVX512CD-NEXT:    vpaddb %ymm0, %ymm4, %ymm0
 ; AVX512CD-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpsubw %ymm1, %ymm2, %ymm2
-; AVX512CD-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpaddw %ymm3, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpand %ymm4, %ymm1, %ymm2
-; AVX512CD-NEXT:    vpshufb %ymm2, %ymm6, %ymm2
+; AVX512CD-NEXT:    vpaddw %ymm2, %ymm1, %ymm2
+; AVX512CD-NEXT:    vpandn %ymm2, %ymm1, %ymm1
+; AVX512CD-NEXT:    vpand %ymm3, %ymm1, %ymm2
+; AVX512CD-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
 ; AVX512CD-NEXT:    vpsrlw $4, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpand %ymm4, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
+; AVX512CD-NEXT:    vpand %ymm3, %ymm1, %ymm1
+; AVX512CD-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
 ; AVX512CD-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
 ; AVX512CD-NEXT:    vpsllw $8, %ymm1, %ymm2
 ; AVX512CD-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
@@ -476,11 +439,9 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
 ;
 ; AVX512CDBW-LABEL: testv32i16u:
 ; AVX512CDBW:       # %bb.0:
-; AVX512CDBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CDBW-NEXT:    vpsubw %zmm0, %zmm1, %zmm1
-; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512CDBW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
-; AVX512CDBW-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT:    vpaddw %zmm1, %zmm0, %zmm1
+; AVX512CDBW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
 ; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
 ; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -496,11 +457,9 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
 ;
 ; AVX512BW-LABEL: testv32i16u:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpsubw %zmm0, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddw %zmm1, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -516,17 +475,14 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv32i16u:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512VPOPCNTDQ-NEXT:    vpsubw %ymm0, %ymm2, %ymm3
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm3, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
-; AVX512VPOPCNTDQ-NEXT:    vpaddw %ymm3, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpaddw %ymm2, %ymm0, %ymm3
+; AVX512VPOPCNTDQ-NEXT:    vpandn %ymm3, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vpsubw %ymm1, %ymm2, %ymm2
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddw %ymm3, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpaddw %ymm2, %ymm1, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpandn %ymm2, %ymm1, %ymm1
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm1, %zmm1
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm1, %ymm1
@@ -534,11 +490,9 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
 ;
 ; BITALG-LABEL: testv32i16u:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubw %zmm0, %zmm1, %zmm1
-; BITALG-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
-; BITALG-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; BITALG-NEXT:    vpaddw %zmm1, %zmm0, %zmm1
+; BITALG-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT:    vpopcntw %zmm0, %zmm0
 ; BITALG-NEXT:    retq
   %out = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %in, i1 -1)
@@ -548,37 +502,32 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
 define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
 ; AVX512CD-LABEL: testv64i8:
 ; AVX512CD:       # %bb.0:
-; AVX512CD-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512CD-NEXT:    vpsubb %ymm0, %ymm2, %ymm3
-; AVX512CD-NEXT:    vpand %ymm3, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
-; AVX512CD-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CD-NEXT:    vpand %ymm4, %ymm0, %ymm5
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512CD-NEXT:    vpshufb %ymm5, %ymm6, %ymm5
+; AVX512CD-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512CD-NEXT:    vpaddb %ymm2, %ymm0, %ymm3
+; AVX512CD-NEXT:    vpandn %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT:    vpand %ymm3, %ymm0, %ymm4
+; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
 ; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpand %ymm4, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
-; AVX512CD-NEXT:    vpaddb %ymm5, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpsubb %ymm1, %ymm2, %ymm2
-; AVX512CD-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpand %ymm4, %ymm1, %ymm2
-; AVX512CD-NEXT:    vpshufb %ymm2, %ymm6, %ymm2
+; AVX512CD-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
+; AVX512CD-NEXT:    vpaddb %ymm4, %ymm0, %ymm0
+; AVX512CD-NEXT:    vpaddb %ymm2, %ymm1, %ymm2
+; AVX512CD-NEXT:    vpandn %ymm2, %ymm1, %ymm1
+; AVX512CD-NEXT:    vpand %ymm3, %ymm1, %ymm2
+; AVX512CD-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
 ; AVX512CD-NEXT:    vpsrlw $4, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpand %ymm4, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
+; AVX512CD-NEXT:    vpand %ymm3, %ymm1, %ymm1
+; AVX512CD-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
 ; AVX512CD-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
 ; AVX512CD-NEXT:    retq
 ;
 ; AVX512CDBW-LABEL: testv64i8:
 ; AVX512CDBW:       # %bb.0:
-; AVX512CDBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CDBW-NEXT:    vpsubb %zmm0, %zmm1, %zmm1
-; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512CDBW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
-; AVX512CDBW-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT:    vpaddb %zmm1, %zmm0, %zmm1
+; AVX512CDBW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
 ; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
 ; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -591,11 +540,9 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
 ;
 ; AVX512BW-LABEL: testv64i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpsubb %zmm0, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -608,37 +555,32 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv64i8:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512VPOPCNTDQ-NEXT:    vpsubb %ymm0, %ymm2, %ymm3
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm3, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
-; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm4, %ymm0, %ymm5
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm5, %ymm6, %ymm5
+; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm2, %ymm0, %ymm3
+; AVX512VPOPCNTDQ-NEXT:    vpandn %ymm3, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm3, %ymm0, %ymm4
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
 ; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm5, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vpsubb %ymm1, %ymm2, %ymm2
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm4, %ymm1, %ymm2
-; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm2, %ymm6, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm4, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm2, %ymm1, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpandn %ymm2, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm3, %ymm1, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
 ; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm4, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm3, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
 ; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
 ; AVX512VPOPCNTDQ-NEXT:    retq
 ;
 ; BITALG-LABEL: testv64i8:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubb %zmm0, %zmm1, %zmm1
-; BITALG-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
-; BITALG-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; BITALG-NEXT:    vpaddb %zmm1, %zmm0, %zmm1
+; BITALG-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG-NEXT:    retq
   %out = call <64 x i8> @llvm.cttz.v64i8(<64 x i8> %in, i1 0)
@@ -648,37 +590,32 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
 define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
 ; AVX512CD-LABEL: testv64i8u:
 ; AVX512CD:       # %bb.0:
-; AVX512CD-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512CD-NEXT:    vpsubb %ymm0, %ymm2, %ymm3
-; AVX512CD-NEXT:    vpand %ymm3, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
-; AVX512CD-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CD-NEXT:    vpand %ymm4, %ymm0, %ymm5
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512CD-NEXT:    vpshufb %ymm5, %ymm6, %ymm5
+; AVX512CD-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512CD-NEXT:    vpaddb %ymm2, %ymm0, %ymm3
+; AVX512CD-NEXT:    vpandn %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT:    vpand %ymm3, %ymm0, %ymm4
+; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
 ; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpand %ymm4, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
-; AVX512CD-NEXT:    vpaddb %ymm5, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpsubb %ymm1, %ymm2, %ymm2
-; AVX512CD-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpand %ymm4, %ymm1, %ymm2
-; AVX512CD-NEXT:    vpshufb %ymm2, %ymm6, %ymm2
+; AVX512CD-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
+; AVX512CD-NEXT:    vpaddb %ymm4, %ymm0, %ymm0
+; AVX512CD-NEXT:    vpaddb %ymm2, %ymm1, %ymm2
+; AVX512CD-NEXT:    vpandn %ymm2, %ymm1, %ymm1
+; AVX512CD-NEXT:    vpand %ymm3, %ymm1, %ymm2
+; AVX512CD-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
 ; AVX512CD-NEXT:    vpsrlw $4, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpand %ymm4, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
+; AVX512CD-NEXT:    vpand %ymm3, %ymm1, %ymm1
+; AVX512CD-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
 ; AVX512CD-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
 ; AVX512CD-NEXT:    retq
 ;
 ; AVX512CDBW-LABEL: testv64i8u:
 ; AVX512CDBW:       # %bb.0:
-; AVX512CDBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CDBW-NEXT:    vpsubb %zmm0, %zmm1, %zmm1
-; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512CDBW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
-; AVX512CDBW-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT:    vpaddb %zmm1, %zmm0, %zmm1
+; AVX512CDBW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
 ; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
 ; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -691,11 +628,9 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
 ;
 ; AVX512BW-LABEL: testv64i8u:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpsubb %zmm0, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -708,37 +643,32 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv64i8u:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512VPOPCNTDQ-NEXT:    vpsubb %ymm0, %ymm2, %ymm3
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm3, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
-; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm4, %ymm0, %ymm5
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm5, %ymm6, %ymm5
+; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm2, %ymm0, %ymm3
+; AVX512VPOPCNTDQ-NEXT:    vpandn %ymm3, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm3, %ymm0, %ymm4
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
 ; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm5, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vpsubb %ymm1, %ymm2, %ymm2
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm4, %ymm1, %ymm2
-; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm2, %ymm6, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm4, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm2, %ymm1, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpandn %ymm2, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm3, %ymm1, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
 ; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm4, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm3, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
 ; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
 ; AVX512VPOPCNTDQ-NEXT:    retq
 ;
 ; BITALG-LABEL: testv64i8u:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubb %zmm0, %zmm1, %zmm1
-; BITALG-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
-; BITALG-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; BITALG-NEXT:    vpaddb %zmm1, %zmm0, %zmm1
+; BITALG-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG-NEXT:    retq
   %out = call <64 x i8> @llvm.cttz.v64i8(<64 x i8> %in, i1 -1)
-- 
GitLab


From 3b18a97f3ac2eb58715703462cef16f8e0e8d4e1 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sat, 13 Oct 2018 13:05:19 +0000
Subject: [PATCH 0144/1116] [X86][SSE] Improve CTTZ lowering when CTLZ is legal

If we have better CTLZ support than CTPOP, then use cttz(x) = width - ctlz(~x & (x - 1)) - and remove the CTTZ_ZERO_UNDEF handling as it no longer gives better codegen.

Similar to rL344447, this is also closer to LegalizeDAG's approach

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344448 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp   |  24 ++---
 test/CodeGen/X86/vector-tzcnt-128.ll | 137 +++++++++++++++------------
 test/CodeGen/X86/vector-tzcnt-256.ll |  92 ++++++------------
 test/CodeGen/X86/vector-tzcnt-512.ll | 114 ++++++----------------
 4 files changed, 146 insertions(+), 221 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 5fb3ece19f2..5f1e9ef1b03 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -22982,20 +22982,22 @@ static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
     if (VT.is256BitVector() && !Subtarget.hasInt256())
       return Lower256IntUnary(Op, DAG);
 
-    // cttz_undef(x) = (width - 1) - ctlz(x & -x)
-    if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
-      SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
-      SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
-                                DAG.getNode(ISD::SUB, dl, VT, Zero, N0));
-      return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
-                         DAG.getNode(ISD::CTLZ, dl, VT, LSB));
+    // Tmp = ~x & (x - 1)
+    SDValue One = DAG.getConstant(1, dl, VT);
+    SDValue Tmp = DAG.getNode(ISD::AND, dl, VT, DAG.getNOT(dl, N0, VT),
+                              DAG.getNode(ISD::SUB, dl, VT, N0, One));
+
+    // cttz(x) = width - ctlz(~x & (x - 1))
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    if (TLI.isOperationLegal(ISD::CTLZ, VT) &&
+        !TLI.isOperationLegal(ISD::CTPOP, VT)) {
+      SDValue Width = DAG.getConstant(NumBits, dl, VT);
+      return DAG.getNode(ISD::SUB, dl, VT, Width,
+                         DAG.getNode(ISD::CTLZ, dl, VT, Tmp));
     }
 
     // cttz(x) = ctpop(~x & (x - 1))
-    SDValue One = DAG.getConstant(1, dl, VT);
-    return DAG.getNode(ISD::CTPOP, dl, VT,
-                       DAG.getNode(ISD::AND, dl, VT, DAG.getNOT(dl, N0, VT),
-                                   DAG.getNode(ISD::SUB, dl, VT, N0, One)));
+    return DAG.getNode(ISD::CTPOP, dl, VT, Tmp);
   }
 
   assert(Op.getOpcode() == ISD::CTTZ &&
diff --git a/test/CodeGen/X86/vector-tzcnt-128.ll b/test/CodeGen/X86/vector-tzcnt-128.ll
index 1430ca72f68..0d392bb5117 100644
--- a/test/CodeGen/X86/vector-tzcnt-128.ll
+++ b/test/CodeGen/X86/vector-tzcnt-128.ll
@@ -102,22 +102,60 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
 ; SSE41-NEXT:    psadbw %xmm3, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: testv2i64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
-; AVX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: testv2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: testv2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512CDVL-LABEL: testv2i64:
+; AVX512CDVL:       # %bb.0:
+; AVX512CDVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512CDVL-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; AVX512CDVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT:    vplzcntq %xmm0, %xmm0
+; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [64,64]
+; AVX512CDVL-NEXT:    vpsubq %xmm0, %xmm1, %xmm0
+; AVX512CDVL-NEXT:    retq
+;
+; AVX512CD-LABEL: testv2i64:
+; AVX512CD:       # %bb.0:
+; AVX512CD-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512CD-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; AVX512CD-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT:    vplzcntq %zmm0, %zmm0
+; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm1 = [64,64]
+; AVX512CD-NEXT:    vpsubq %xmm0, %xmm1, %xmm0
+; AVX512CD-NEXT:    vzeroupper
+; AVX512CD-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv2i64:
 ; AVX512VPOPCNTDQ:       # %bb.0:
@@ -303,21 +341,21 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
 ;
 ; AVX512CDVL-LABEL: testv2i64u:
 ; AVX512CDVL:       # %bb.0:
-; AVX512CDVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CDVL-NEXT:    vpsubq %xmm0, %xmm1, %xmm1
-; AVX512CDVL-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512CDVL-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; AVX512CDVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512CDVL-NEXT:    vplzcntq %xmm0, %xmm0
-; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [63,63]
+; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [64,64]
 ; AVX512CDVL-NEXT:    vpsubq %xmm0, %xmm1, %xmm0
 ; AVX512CDVL-NEXT:    retq
 ;
 ; AVX512CD-LABEL: testv2i64u:
 ; AVX512CD:       # %bb.0:
-; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CD-NEXT:    vpsubq %xmm0, %xmm1, %xmm1
-; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512CD-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; AVX512CD-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512CD-NEXT:    vplzcntq %zmm0, %zmm0
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm1 = [63,63]
+; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm1 = [64,64]
 ; AVX512CD-NEXT:    vpsubq %xmm0, %xmm1, %xmm0
 ; AVX512CD-NEXT:    vzeroupper
 ; AVX512CD-NEXT:    retq
@@ -537,20 +575,9 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ; AVX512CDVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; AVX512CDVL-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
 ; AVX512CDVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
-; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CDVL-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512CDVL-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX512CDVL-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512CDVL-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512CDVL-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX512CDVL-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
-; AVX512CDVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CDVL-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512CDVL-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
-; AVX512CDVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512CDVL-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX512CDVL-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX512CDVL-NEXT:    vplzcntd %xmm0, %xmm0
+; AVX512CDVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [32,32,32,32]
+; AVX512CDVL-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
 ; AVX512CDVL-NEXT:    retq
 ;
 ; AVX512CD-LABEL: testv4i32:
@@ -558,20 +585,10 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ; AVX512CD-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; AVX512CD-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
 ; AVX512CD-NEXT:    vpandn %xmm1, %xmm0, %xmm0
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512CD-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX512CD-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512CD-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX512CD-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
-; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CD-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512CD-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
-; AVX512CD-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX512CD-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX512CD-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX512CD-NEXT:    vplzcntd %zmm0, %zmm0
+; AVX512CD-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [32,32,32,32]
+; AVX512CD-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
+; AVX512CD-NEXT:    vzeroupper
 ; AVX512CD-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv4i32:
@@ -798,21 +815,21 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
 ;
 ; AVX512CDVL-LABEL: testv4i32u:
 ; AVX512CDVL:       # %bb.0:
-; AVX512CDVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CDVL-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
-; AVX512CDVL-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512CDVL-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX512CDVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512CDVL-NEXT:    vplzcntd %xmm0, %xmm0
-; AVX512CDVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [31,31,31,31]
+; AVX512CDVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [32,32,32,32]
 ; AVX512CDVL-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
 ; AVX512CDVL-NEXT:    retq
 ;
 ; AVX512CD-LABEL: testv4i32u:
 ; AVX512CD:       # %bb.0:
-; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CD-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
-; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512CD-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX512CD-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512CD-NEXT:    vplzcntd %zmm0, %zmm0
-; AVX512CD-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [31,31,31,31]
+; AVX512CD-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [32,32,32,32]
 ; AVX512CD-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
 ; AVX512CD-NEXT:    vzeroupper
 ; AVX512CD-NEXT:    retq
diff --git a/test/CodeGen/X86/vector-tzcnt-256.ll b/test/CodeGen/X86/vector-tzcnt-256.ll
index 46c34fb0d44..59911e5805b 100644
--- a/test/CodeGen/X86/vector-tzcnt-256.ll
+++ b/test/CodeGen/X86/vector-tzcnt-256.ll
@@ -62,16 +62,9 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
 ; AVX512CDVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; AVX512CDVL-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
 ; AVX512CDVL-NEXT:    vpandn %ymm1, %ymm0, %ymm0
-; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512CDVL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX512CDVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX512CDVL-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; AVX512CDVL-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
-; AVX512CDVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CDVL-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT:    vplzcntq %ymm0, %ymm0
+; AVX512CDVL-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [64,64,64,64]
+; AVX512CDVL-NEXT:    vpsubq %ymm0, %ymm1, %ymm0
 ; AVX512CDVL-NEXT:    retq
 ;
 ; AVX512CD-LABEL: testv4i64:
@@ -79,16 +72,9 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
 ; AVX512CD-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; AVX512CD-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
 ; AVX512CD-NEXT:    vpandn %ymm1, %ymm0, %ymm0
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512CD-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; AVX512CD-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CD-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT:    vplzcntq %zmm0, %zmm0
+; AVX512CD-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [64,64,64,64]
+; AVX512CD-NEXT:    vpsubq %ymm0, %ymm1, %ymm0
 ; AVX512CD-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv4i64:
@@ -195,21 +181,21 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
 ;
 ; AVX512CDVL-LABEL: testv4i64u:
 ; AVX512CDVL:       # %bb.0:
-; AVX512CDVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CDVL-NEXT:    vpsubq %ymm0, %ymm1, %ymm1
-; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512CDVL-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
+; AVX512CDVL-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512CDVL-NEXT:    vplzcntq %ymm0, %ymm0
-; AVX512CDVL-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [63,63,63,63]
+; AVX512CDVL-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [64,64,64,64]
 ; AVX512CDVL-NEXT:    vpsubq %ymm0, %ymm1, %ymm0
 ; AVX512CDVL-NEXT:    retq
 ;
 ; AVX512CD-LABEL: testv4i64u:
 ; AVX512CD:       # %bb.0:
-; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CD-NEXT:    vpsubq %ymm0, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512CD-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
+; AVX512CD-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512CD-NEXT:    vplzcntq %zmm0, %zmm0
-; AVX512CD-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [63,63,63,63]
+; AVX512CD-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [64,64,64,64]
 ; AVX512CD-NEXT:    vpsubq %ymm0, %ymm1, %ymm0
 ; AVX512CD-NEXT:    retq
 ;
@@ -332,20 +318,9 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ; AVX512CDVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; AVX512CDVL-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
 ; AVX512CDVL-NEXT:    vpandn %ymm1, %ymm0, %ymm0
-; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512CDVL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX512CDVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX512CDVL-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; AVX512CDVL-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
-; AVX512CDVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CDVL-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
-; AVX512CDVL-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
-; AVX512CDVL-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
-; AVX512CDVL-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
-; AVX512CDVL-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512CDVL-NEXT:    vplzcntd %ymm0, %ymm0
+; AVX512CDVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [32,32,32,32,32,32,32,32]
+; AVX512CDVL-NEXT:    vpsubd %ymm0, %ymm1, %ymm0
 ; AVX512CDVL-NEXT:    retq
 ;
 ; AVX512CD-LABEL: testv8i32:
@@ -353,20 +328,9 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ; AVX512CD-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; AVX512CD-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
 ; AVX512CD-NEXT:    vpandn %ymm1, %ymm0, %ymm0
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512CD-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; AVX512CD-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CD-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
-; AVX512CD-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
-; AVX512CD-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
-; AVX512CD-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT:    vplzcntd %zmm0, %zmm0
+; AVX512CD-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [32,32,32,32,32,32,32,32]
+; AVX512CD-NEXT:    vpsubd %ymm0, %ymm1, %ymm0
 ; AVX512CD-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv8i32:
@@ -498,21 +462,21 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
 ;
 ; AVX512CDVL-LABEL: testv8i32u:
 ; AVX512CDVL:       # %bb.0:
-; AVX512CDVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CDVL-NEXT:    vpsubd %ymm0, %ymm1, %ymm1
-; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512CDVL-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; AVX512CDVL-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512CDVL-NEXT:    vplzcntd %ymm0, %ymm0
-; AVX512CDVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [31,31,31,31,31,31,31,31]
+; AVX512CDVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [32,32,32,32,32,32,32,32]
 ; AVX512CDVL-NEXT:    vpsubd %ymm0, %ymm1, %ymm0
 ; AVX512CDVL-NEXT:    retq
 ;
 ; AVX512CD-LABEL: testv8i32u:
 ; AVX512CD:       # %bb.0:
-; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CD-NEXT:    vpsubd %ymm0, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512CD-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; AVX512CD-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512CD-NEXT:    vplzcntd %zmm0, %zmm0
-; AVX512CD-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [31,31,31,31,31,31,31,31]
+; AVX512CD-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [32,32,32,32,32,32,32,32]
 ; AVX512CD-NEXT:    vpsubd %ymm0, %ymm1, %ymm0
 ; AVX512CD-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/vector-tzcnt-512.ll b/test/CodeGen/X86/vector-tzcnt-512.ll
index 300d7b4ac6c..1de03463e19 100644
--- a/test/CodeGen/X86/vector-tzcnt-512.ll
+++ b/test/CodeGen/X86/vector-tzcnt-512.ll
@@ -11,25 +11,9 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
 ; AVX512CD-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
 ; AVX512CD-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
 ; AVX512CD-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
-; AVX512CD-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CD-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512CD-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
-; AVX512CD-NEXT:    vpsrlw $4, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
-; AVX512CD-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX512CD-NEXT:    vpsadbw %ymm3, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm5
-; AVX512CD-NEXT:    vpshufb %ymm5, %ymm4, %ymm5
-; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; AVX512CD-NEXT:    vpaddb %ymm5, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpsadbw %ymm3, %ymm0, %ymm0
-; AVX512CD-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512CD-NEXT:    vplzcntq %zmm0, %zmm0
+; AVX512CD-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64]
+; AVX512CD-NEXT:    vpsubq %zmm0, %zmm1, %zmm0
 ; AVX512CD-NEXT:    retq
 ;
 ; AVX512CDBW-LABEL: testv8i64:
@@ -37,16 +21,9 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
 ; AVX512CDBW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
 ; AVX512CDBW-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
 ; AVX512CDBW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
-; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512CDBW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
-; AVX512CDBW-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
-; AVX512CDBW-NEXT:    vpshufb %zmm0, %zmm3, %zmm0
-; AVX512CDBW-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
-; AVX512CDBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CDBW-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT:    vplzcntq %zmm0, %zmm0
+; AVX512CDBW-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64]
+; AVX512CDBW-NEXT:    vpsubq %zmm0, %zmm1, %zmm0
 ; AVX512CDBW-NEXT:    retq
 ;
 ; AVX512BW-LABEL: testv8i64:
@@ -90,21 +67,21 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
 define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind {
 ; AVX512CD-LABEL: testv8i64u:
 ; AVX512CD:       # %bb.0:
-; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CD-NEXT:    vpsubq %zmm0, %zmm1, %zmm1
-; AVX512CD-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512CD-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512CD-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512CD-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
 ; AVX512CD-NEXT:    vplzcntq %zmm0, %zmm0
-; AVX512CD-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [63,63,63,63,63,63,63,63]
+; AVX512CD-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64]
 ; AVX512CD-NEXT:    vpsubq %zmm0, %zmm1, %zmm0
 ; AVX512CD-NEXT:    retq
 ;
 ; AVX512CDBW-LABEL: testv8i64u:
 ; AVX512CDBW:       # %bb.0:
-; AVX512CDBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CDBW-NEXT:    vpsubq %zmm0, %zmm1, %zmm1
-; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512CDBW-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512CDBW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
 ; AVX512CDBW-NEXT:    vplzcntq %zmm0, %zmm0
-; AVX512CDBW-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [63,63,63,63,63,63,63,63]
+; AVX512CDBW-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64]
 ; AVX512CDBW-NEXT:    vpsubq %zmm0, %zmm1, %zmm0
 ; AVX512CDBW-NEXT:    retq
 ;
@@ -152,33 +129,9 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
 ; AVX512CD-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
 ; AVX512CD-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
 ; AVX512CD-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
-; AVX512CD-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CD-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512CD-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
-; AVX512CD-NEXT:    vpsrlw $4, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
-; AVX512CD-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX512CD-NEXT:    vpunpckhdq {{.*#+}} ymm5 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7]
-; AVX512CD-NEXT:    vpsadbw %ymm3, %ymm5, %ymm5
-; AVX512CD-NEXT:    vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5]
-; AVX512CD-NEXT:    vpsadbw %ymm3, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpackuswb %ymm5, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm5
-; AVX512CD-NEXT:    vpshufb %ymm5, %ymm4, %ymm5
-; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; AVX512CD-NEXT:    vpaddb %ymm5, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7]
-; AVX512CD-NEXT:    vpsadbw %ymm3, %ymm2, %ymm2
-; AVX512CD-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[4],ymm3[4],ymm0[5],ymm3[5]
-; AVX512CD-NEXT:    vpsadbw %ymm3, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
-; AVX512CD-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512CD-NEXT:    vplzcntd %zmm0, %zmm0
+; AVX512CD-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512CD-NEXT:    vpsubd %zmm0, %zmm1, %zmm0
 ; AVX512CD-NEXT:    retq
 ;
 ; AVX512CDBW-LABEL: testv16i32:
@@ -186,20 +139,9 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
 ; AVX512CDBW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
 ; AVX512CDBW-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
 ; AVX512CDBW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
-; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512CDBW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
-; AVX512CDBW-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
-; AVX512CDBW-NEXT:    vpshufb %zmm0, %zmm3, %zmm0
-; AVX512CDBW-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
-; AVX512CDBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CDBW-NEXT:    vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
-; AVX512CDBW-NEXT:    vpsadbw %zmm1, %zmm2, %zmm2
-; AVX512CDBW-NEXT:    vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
-; AVX512CDBW-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
-; AVX512CDBW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
+; AVX512CDBW-NEXT:    vplzcntd %zmm0, %zmm0
+; AVX512CDBW-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512CDBW-NEXT:    vpsubd %zmm0, %zmm1, %zmm0
 ; AVX512CDBW-NEXT:    retq
 ;
 ; AVX512BW-LABEL: testv16i32:
@@ -251,21 +193,21 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
 define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
 ; AVX512CD-LABEL: testv16i32u:
 ; AVX512CD:       # %bb.0:
-; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CD-NEXT:    vpsubd %zmm0, %zmm1, %zmm1
-; AVX512CD-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512CD-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512CD-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
+; AVX512CD-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
 ; AVX512CD-NEXT:    vplzcntd %zmm0, %zmm0
-; AVX512CD-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512CD-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
 ; AVX512CD-NEXT:    vpsubd %zmm0, %zmm1, %zmm0
 ; AVX512CD-NEXT:    retq
 ;
 ; AVX512CDBW-LABEL: testv16i32u:
 ; AVX512CDBW:       # %bb.0:
-; AVX512CDBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CDBW-NEXT:    vpsubd %zmm0, %zmm1, %zmm1
-; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512CDBW-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
+; AVX512CDBW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
 ; AVX512CDBW-NEXT:    vplzcntd %zmm0, %zmm0
-; AVX512CDBW-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512CDBW-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
 ; AVX512CDBW-NEXT:    vpsubd %zmm0, %zmm1, %zmm0
 ; AVX512CDBW-NEXT:    retq
 ;
-- 
GitLab


From 1a9bbe2528fdcbac90886b04897d51f10a1ecb84 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sat, 13 Oct 2018 13:30:10 +0000
Subject: [PATCH 0145/1116] Remove unused variable. NFCI.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344449 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 5f1e9ef1b03..bb75f6e0f17 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -22976,7 +22976,6 @@ static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
 
   if (VT.isVector()) {
     SDValue N0 = Op.getOperand(0);
-    SDValue Zero = DAG.getConstant(0, dl, VT);
 
     // Decompose 256-bit ops into smaller 128-bit ops.
     if (VT.is256BitVector() && !Subtarget.hasInt256())
-- 
GitLab


From 7ecda486e97cf17a7d97bbf9e6bfa7e768d51879 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sat, 13 Oct 2018 13:33:32 +0000
Subject: [PATCH 0146/1116] Pull out repeated getOperand(). NFCI.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344450 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index bb75f6e0f17..835e272f52b 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -22972,11 +22972,10 @@ static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
                          SelectionDAG &DAG) {
   MVT VT = Op.getSimpleValueType();
   unsigned NumBits = VT.getScalarSizeInBits();
+  SDValue N0 = Op.getOperand(0);
   SDLoc dl(Op);
 
   if (VT.isVector()) {
-    SDValue N0 = Op.getOperand(0);
-
     // Decompose 256-bit ops into smaller 128-bit ops.
     if (VT.is256BitVector() && !Subtarget.hasInt256())
       return Lower256IntUnary(Op, DAG);
@@ -23004,7 +23003,7 @@ static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
 
   // Issue a bsf (scan bits forward) which also sets EFLAGS.
   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
-  Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));
+  Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
 
   // If src is zero (i.e. bsf sets ZF), returns NumBits.
   SDValue Ops[] = {
-- 
GitLab


From 8965b5dc749a3b320c5e902cd639837f7a734015 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sat, 13 Oct 2018 14:28:40 +0000
Subject: [PATCH 0147/1116] [X86] Pull out target constant splat helper
 function. NFCI.

The code in LowerScalarImmediateShift is just a more powerful version of ISD::isConstantSplatVector.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344451 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 44 ++++++++++++++++++------------
 1 file changed, 27 insertions(+), 17 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 835e272f52b..d6699c6e678 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -5830,6 +5830,30 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
   return false;
 }
 
+static bool isConstantSplat(SDValue Op, APInt &SplatVal) {
+  APInt UndefElts;
+  SmallVector<APInt, 16> EltBits;
+  if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
+                                    UndefElts, EltBits, true, false)) {
+    int SplatIndex = -1;
+    for (int i = 0, e = EltBits.size(); i != e; ++i) {
+      if (UndefElts[i])
+        continue;
+      if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
+        SplatIndex = -1;
+        break;
+      }
+      SplatIndex = i;
+    }
+    if (0 <= SplatIndex) {
+      SplatVal = EltBits[SplatIndex];
+      return true;
+    }
+  }
+
+  return false;
+}
+
 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
                                         unsigned MaskEltSizeInBits,
                                         SmallVectorImpl<uint64_t> &RawMask) {
@@ -23600,7 +23624,6 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
   SDLoc dl(Op);
   SDValue R = Op.getOperand(0);
   SDValue Amt = Op.getOperand(1);
-  unsigned EltSizeInBits = VT.getScalarSizeInBits();
   unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
 
   auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
@@ -23644,24 +23667,11 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
   };
 
   // Optimize shl/srl/sra with constant shift amount.
-  APInt UndefElts;
-  SmallVector<APInt, 8> EltBits;
-  if (!getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits,
-                                     true, false))
-    return SDValue();
-
-  int SplatIndex = -1;
-  for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
-    if (UndefElts[i])
-      continue;
-    if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex])
-      return SDValue();
-    SplatIndex = i;
-  }
-  if (SplatIndex < 0)
+  APInt APIntShiftAmt;
+  if (!isConstantSplat(Amt, APIntShiftAmt))
     return SDValue();
+  uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
 
-  uint64_t ShiftAmt = EltBits[SplatIndex].getZExtValue();
   if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
     return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
 
-- 
GitLab


From 097be3b516e9c3d213769b842b55800b0159339b Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sat, 13 Oct 2018 14:45:44 +0000
Subject: [PATCH 0148/1116] [X86][SSE] combineIncDecVector - use
 isConstantSplat

Use isConstantSplat instead of ISD::isConstantSplatVector to let us us peek through to illegal types (in this case for i686 targets to recognise i64 constants)

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344452 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp   | 4 +---
 test/CodeGen/X86/vector-tzcnt-128.ll | 8 ++++----
 test/CodeGen/X86/vector-tzcnt-256.ll | 6 ++++--
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index d6699c6e678..18c5f60f2f2 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -39603,10 +39603,8 @@ static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
   if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
     return SDValue();
 
-  SDNode *N1 = N->getOperand(1).getNode();
   APInt SplatVal;
-  if (!ISD::isConstantSplatVector(N1, SplatVal) ||
-      !SplatVal.isOneValue())
+  if (!isConstantSplat(N->getOperand(1), SplatVal) || !SplatVal.isOneValue())
     return SDValue();
 
   SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));
diff --git a/test/CodeGen/X86/vector-tzcnt-128.ll b/test/CodeGen/X86/vector-tzcnt-128.ll
index 0d392bb5117..21142ff3970 100644
--- a/test/CodeGen/X86/vector-tzcnt-128.ll
+++ b/test/CodeGen/X86/vector-tzcnt-128.ll
@@ -198,8 +198,8 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
 ;
 ; X32-SSE-LABEL: testv2i64:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
-; X32-SSE-NEXT:    psubq {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    pcmpeqd %xmm1, %xmm1
+; X32-SSE-NEXT:    paddq %xmm0, %xmm1
 ; X32-SSE-NEXT:    pandn %xmm1, %xmm0
 ; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
@@ -401,8 +401,8 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
 ;
 ; X32-SSE-LABEL: testv2i64u:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
-; X32-SSE-NEXT:    psubq {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    pcmpeqd %xmm1, %xmm1
+; X32-SSE-NEXT:    paddq %xmm0, %xmm1
 ; X32-SSE-NEXT:    pandn %xmm1, %xmm0
 ; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
diff --git a/test/CodeGen/X86/vector-tzcnt-256.ll b/test/CodeGen/X86/vector-tzcnt-256.ll
index 59911e5805b..c7087037e01 100644
--- a/test/CodeGen/X86/vector-tzcnt-256.ll
+++ b/test/CodeGen/X86/vector-tzcnt-256.ll
@@ -116,7 +116,8 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
 ;
 ; X32-AVX-LABEL: testv4i64:
 ; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vpsubq {{\.LCPI.*}}, %ymm0, %ymm1
+; X32-AVX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; X32-AVX-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
 ; X32-AVX-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
@@ -238,7 +239,8 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
 ;
 ; X32-AVX-LABEL: testv4i64u:
 ; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vpsubq {{\.LCPI.*}}, %ymm0, %ymm1
+; X32-AVX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; X32-AVX-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
 ; X32-AVX-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
-- 
GitLab


From e3800c79ea4638c860b7fe2d6bfd120793cd3c48 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sat, 13 Oct 2018 15:16:55 +0000
Subject: [PATCH 0149/1116] [X86][SSE] Begin removing vector CTTZ custom
 lowering and use LegalizeDAG instead.

Adds CTTZ vector legalization support and begins the removal of the X86/SSE custom lowering.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344453 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/LegalizeDAG.cpp       |  2 +-
 lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 18 +++++++++++++++---
 lib/Target/X86/X86ISelLowering.cpp             | 17 ++++++++---------
 3 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 56025110f0a..884d7174440 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2794,7 +2794,7 @@ SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op,
     // This trivially expands to CTTZ.
     return DAG.getNode(ISD::CTTZ, dl, VT, Op);
   case ISD::CTTZ: {
-    unsigned Len = VT.getSizeInBits();
+    unsigned Len = VT.getScalarSizeInBits();
 
     if (TLI.isOperationLegalOrCustom(ISD::CTTZ_ZERO_UNDEF, VT)) {
       EVT SetCCVT = getSetCCResultType(VT);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 8cc37b5f233..58d86e8e52e 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -129,7 +129,7 @@ class VectorLegalizer {
   SDValue ExpandFSUB(SDValue Op);
   SDValue ExpandBITREVERSE(SDValue Op);
   SDValue ExpandCTLZ(SDValue Op);
-  SDValue ExpandCTTZ_ZERO_UNDEF(SDValue Op);
+  SDValue ExpandCTTZ(SDValue Op);
   SDValue ExpandStrictFPOp(SDValue Op);
 
   /// Implements vector promotion.
@@ -717,8 +717,9 @@ SDValue VectorLegalizer::Expand(SDValue Op) {
   case ISD::CTLZ:
   case ISD::CTLZ_ZERO_UNDEF:
     return ExpandCTLZ(Op);
+  case ISD::CTTZ:
   case ISD::CTTZ_ZERO_UNDEF:
-    return ExpandCTTZ_ZERO_UNDEF(Op);
+    return ExpandCTTZ(Op);
   case ISD::STRICT_FADD:
   case ISD::STRICT_FSUB:
   case ISD::STRICT_FMUL:
@@ -1094,8 +1095,9 @@ SDValue VectorLegalizer::ExpandCTLZ(SDValue Op) {
   return DAG.UnrollVectorOp(Op.getNode());
 }
 
-SDValue VectorLegalizer::ExpandCTTZ_ZERO_UNDEF(SDValue Op) {
+SDValue VectorLegalizer::ExpandCTTZ(SDValue Op) {
   EVT VT = Op.getValueType();
+  unsigned NumBitsPerElt = VT.getScalarSizeInBits();
 
   // If the non-ZERO_UNDEF version is supported we can use that instead.
   if (TLI.isOperationLegalOrCustom(ISD::CTTZ, VT)) {
@@ -1103,6 +1105,16 @@ SDValue VectorLegalizer::ExpandCTTZ_ZERO_UNDEF(SDValue Op) {
     return DAG.getNode(ISD::CTTZ, DL, VT, Op.getOperand(0));
   }
 
+  // If we have the appropriate vector bit operations, it is better to use them
+  // than unrolling and expanding each component.
+  if (isPowerOf2_32(NumBitsPerElt) &&
+      (TLI.isOperationLegalOrCustom(ISD::CTPOP, VT) ||
+       TLI.isOperationLegalOrCustom(ISD::CTLZ, VT)) &&
+      TLI.isOperationLegalOrCustom(ISD::SUB, VT) &&
+      TLI.isOperationLegalOrCustomOrPromote(ISD::AND, VT) &&
+      TLI.isOperationLegalOrCustomOrPromote(ISD::XOR, VT))
+    return Op;
+
   // Otherwise go ahead and unroll.
   return DAG.UnrollVectorOp(Op.getNode());
 }
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 18c5f60f2f2..1411cf18902 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -23004,22 +23004,21 @@ static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
     if (VT.is256BitVector() && !Subtarget.hasInt256())
       return Lower256IntUnary(Op, DAG);
 
-    // Tmp = ~x & (x - 1)
-    SDValue One = DAG.getConstant(1, dl, VT);
-    SDValue Tmp = DAG.getNode(ISD::AND, dl, VT, DAG.getNOT(dl, N0, VT),
-                              DAG.getNode(ISD::SUB, dl, VT, N0, One));
-
     // cttz(x) = width - ctlz(~x & (x - 1))
     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
     if (TLI.isOperationLegal(ISD::CTLZ, VT) &&
         !TLI.isOperationLegal(ISD::CTPOP, VT)) {
+      SDValue One = DAG.getConstant(1, dl, VT);
       SDValue Width = DAG.getConstant(NumBits, dl, VT);
-      return DAG.getNode(ISD::SUB, dl, VT, Width,
-                         DAG.getNode(ISD::CTLZ, dl, VT, Tmp));
+      return DAG.getNode(
+          ISD::SUB, dl, VT, Width,
+          DAG.getNode(ISD::CTLZ, dl, VT,
+                      DAG.getNode(ISD::AND, dl, VT, DAG.getNOT(dl, N0, VT),
+                                  DAG.getNode(ISD::SUB, dl, VT, N0, One))));
     }
 
-    // cttz(x) = ctpop(~x & (x - 1))
-    return DAG.getNode(ISD::CTPOP, dl, VT, Tmp);
+    // Else leave it to the legalizer.
+    return SDValue();
   }
 
   assert(Op.getOpcode() == ISD::CTTZ &&
-- 
GitLab


From c323e923ae334e978f63714e73517d6d1a51abe9 Mon Sep 17 00:00:00 2001
From: David Bolvansky <david.bolvansky@gmail.com>
Date: Sat, 13 Oct 2018 15:21:55 +0000
Subject: [PATCH 0150/1116] [InstCombine] Fixed crash with aliased functions

Summary: Fixes PR39177

Reviewers: spatel, jbuening

Reviewed By: jbuening

Subscribers: jbuening, llvm-commits

Differential Revision: https://reviews.llvm.org/D53129

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344454 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Transforms/Utils/BuildLibCalls.h |  2 +-
 lib/Transforms/IPO/InferFunctionAttrs.cpp     |  2 +-
 lib/Transforms/Scalar/LoopIdiomRecognize.cpp  |  2 +-
 lib/Transforms/Utils/BuildLibCalls.cpp        | 46 +++++++------
 lib/Transforms/Utils/SimplifyLibCalls.cpp     |  2 +-
 test/Transforms/InstCombine/pr39177.ll        | 66 +++++++++++++++++++
 6 files changed, 95 insertions(+), 25 deletions(-)
 create mode 100644 test/Transforms/InstCombine/pr39177.ll

diff --git a/include/llvm/Transforms/Utils/BuildLibCalls.h b/include/llvm/Transforms/Utils/BuildLibCalls.h
index bdcdf6f361f..ab7d22c024c 100644
--- a/include/llvm/Transforms/Utils/BuildLibCalls.h
+++ b/include/llvm/Transforms/Utils/BuildLibCalls.h
@@ -28,7 +28,7 @@ namespace llvm {
   /// If the library function is unavailable, this doesn't modify it.
   ///
   /// Returns true if any attributes were set and false otherwise.
-  bool inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI);
+  bool inferLibFuncAttributes(Function *Func, const TargetLibraryInfo &TLI);
 
   /// Check whether the overloaded unary floating point function
   /// corresponding to \a Ty is available.
diff --git a/lib/Transforms/IPO/InferFunctionAttrs.cpp b/lib/Transforms/IPO/InferFunctionAttrs.cpp
index 470f97b8ba6..c53a9b5e819 100644
--- a/lib/Transforms/IPO/InferFunctionAttrs.cpp
+++ b/lib/Transforms/IPO/InferFunctionAttrs.cpp
@@ -27,7 +27,7 @@ static bool inferAllPrototypeAttributes(Module &M,
     // We only infer things using the prototype and the name; we don't need
     // definitions.
     if (F.isDeclaration() && !F.hasFnAttribute((Attribute::OptimizeNone)))
-      Changed |= inferLibFuncAttributes(F, TLI);
+      Changed |= inferLibFuncAttributes(&F, TLI);
 
   return Changed;
 }
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 68abf9719a9..9a45551f64b 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -931,7 +931,7 @@ bool LoopIdiomRecognize::processLoopStridedStore(
     Value *MSP =
         M->getOrInsertFunction("memset_pattern16", Builder.getVoidTy(),
                                Int8PtrTy, Int8PtrTy, IntPtr);
-    inferLibFuncAttributes(*M->getFunction("memset_pattern16"), *TLI);
+    inferLibFuncAttributes(M->getFunction("memset_pattern16"), *TLI);
 
     // Otherwise we should form a memset_pattern16.  PatternValue is known to be
     // an constant array of 16-bytes.  Plop the value into a mergable global.
diff --git a/lib/Transforms/Utils/BuildLibCalls.cpp b/lib/Transforms/Utils/BuildLibCalls.cpp
index 6eb39e5b959..234449b2bf8 100644
--- a/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -121,7 +121,11 @@ static bool setNonLazyBind(Function &F) {
   return true;
 }
 
-bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
+bool llvm::inferLibFuncAttributes(Function *Func,
+                                  const TargetLibraryInfo &TLI) {
+  if (!Func)
+    return false;
+  Function &F = *Func;
   LibFunc TheLibFunc;
   if (!(TLI.getLibFunc(F, TheLibFunc) && TLI.has(TheLibFunc)))
     return false;
@@ -773,7 +777,7 @@ Value *llvm::emitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout &DL,
   LLVMContext &Context = B.GetInsertBlock()->getContext();
   Constant *StrLen = M->getOrInsertFunction("strlen", DL.getIntPtrType(Context),
                                             B.getInt8PtrTy());
-  inferLibFuncAttributes(*M->getFunction("strlen"), *TLI);
+  inferLibFuncAttributes(M->getFunction("strlen"), *TLI);
   CallInst *CI = B.CreateCall(StrLen, castToCStr(Ptr, B), "strlen");
   if (const Function *F = dyn_cast<Function>(StrLen->stripPointerCasts()))
     CI->setCallingConv(F->getCallingConv());
@@ -791,7 +795,7 @@ Value *llvm::emitStrChr(Value *Ptr, char C, IRBuilder<> &B,
   Type *I32Ty = B.getInt32Ty();
   Constant *StrChr =
       M->getOrInsertFunction("strchr", I8Ptr, I8Ptr, I32Ty);
-  inferLibFuncAttributes(*M->getFunction("strchr"), *TLI);
+  inferLibFuncAttributes(M->getFunction("strchr"), *TLI);
   CallInst *CI = B.CreateCall(
       StrChr, {castToCStr(Ptr, B), ConstantInt::get(I32Ty, C)}, "strchr");
   if (const Function *F = dyn_cast<Function>(StrChr->stripPointerCasts()))
@@ -809,7 +813,7 @@ Value *llvm::emitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B,
   Value *StrNCmp = M->getOrInsertFunction("strncmp", B.getInt32Ty(),
                                           B.getInt8PtrTy(), B.getInt8PtrTy(),
                                           DL.getIntPtrType(Context));
-  inferLibFuncAttributes(*M->getFunction("strncmp"), *TLI);
+  inferLibFuncAttributes(M->getFunction("strncmp"), *TLI);
   CallInst *CI = B.CreateCall(
       StrNCmp, {castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, "strncmp");
 
@@ -827,7 +831,7 @@ Value *llvm::emitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B,
   Module *M = B.GetInsertBlock()->getModule();
   Type *I8Ptr = B.getInt8PtrTy();
   Value *StrCpy = M->getOrInsertFunction(Name, I8Ptr, I8Ptr, I8Ptr);
-  inferLibFuncAttributes(*M->getFunction(Name), *TLI);
+  inferLibFuncAttributes(M->getFunction(Name), *TLI);
   CallInst *CI =
       B.CreateCall(StrCpy, {castToCStr(Dst, B), castToCStr(Src, B)}, Name);
   if (const Function *F = dyn_cast<Function>(StrCpy->stripPointerCasts()))
@@ -844,7 +848,7 @@ Value *llvm::emitStrNCpy(Value *Dst, Value *Src, Value *Len, IRBuilder<> &B,
   Type *I8Ptr = B.getInt8PtrTy();
   Value *StrNCpy = M->getOrInsertFunction(Name, I8Ptr, I8Ptr, I8Ptr,
                                           Len->getType());
-  inferLibFuncAttributes(*M->getFunction(Name), *TLI);
+  inferLibFuncAttributes(M->getFunction(Name), *TLI);
   CallInst *CI = B.CreateCall(
       StrNCpy, {castToCStr(Dst, B), castToCStr(Src, B), Len}, "strncpy");
   if (const Function *F = dyn_cast<Function>(StrNCpy->stripPointerCasts()))
@@ -885,7 +889,7 @@ Value *llvm::emitMemChr(Value *Ptr, Value *Val, Value *Len, IRBuilder<> &B,
   Value *MemChr = M->getOrInsertFunction("memchr", B.getInt8PtrTy(),
                                          B.getInt8PtrTy(), B.getInt32Ty(),
                                          DL.getIntPtrType(Context));
-  inferLibFuncAttributes(*M->getFunction("memchr"), *TLI);
+  inferLibFuncAttributes(M->getFunction("memchr"), *TLI);
   CallInst *CI = B.CreateCall(MemChr, {castToCStr(Ptr, B), Val, Len}, "memchr");
 
   if (const Function *F = dyn_cast<Function>(MemChr->stripPointerCasts()))
@@ -904,7 +908,7 @@ Value *llvm::emitMemCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B,
   Value *MemCmp = M->getOrInsertFunction("memcmp", B.getInt32Ty(),
                                          B.getInt8PtrTy(), B.getInt8PtrTy(),
                                          DL.getIntPtrType(Context));
-  inferLibFuncAttributes(*M->getFunction("memcmp"), *TLI);
+  inferLibFuncAttributes(M->getFunction("memcmp"), *TLI);
   CallInst *CI = B.CreateCall(
       MemCmp, {castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, "memcmp");
 
@@ -974,7 +978,7 @@ Value *llvm::emitPutChar(Value *Char, IRBuilder<> &B,
 
   Module *M = B.GetInsertBlock()->getModule();
   Value *PutChar = M->getOrInsertFunction("putchar", B.getInt32Ty(), B.getInt32Ty());
-  inferLibFuncAttributes(*M->getFunction("putchar"), *TLI);
+  inferLibFuncAttributes(M->getFunction("putchar"), *TLI);
   CallInst *CI = B.CreateCall(PutChar,
                               B.CreateIntCast(Char,
                               B.getInt32Ty(),
@@ -995,7 +999,7 @@ Value *llvm::emitPutS(Value *Str, IRBuilder<> &B,
   Module *M = B.GetInsertBlock()->getModule();
   Value *PutS =
       M->getOrInsertFunction("puts", B.getInt32Ty(), B.getInt8PtrTy());
-  inferLibFuncAttributes(*M->getFunction("puts"), *TLI);
+  inferLibFuncAttributes(M->getFunction("puts"), *TLI);
   CallInst *CI = B.CreateCall(PutS, castToCStr(Str, B), "puts");
   if (const Function *F = dyn_cast<Function>(PutS->stripPointerCasts()))
     CI->setCallingConv(F->getCallingConv());
@@ -1011,7 +1015,7 @@ Value *llvm::emitFPutC(Value *Char, Value *File, IRBuilder<> &B,
   Constant *F = M->getOrInsertFunction("fputc", B.getInt32Ty(), B.getInt32Ty(),
                                        File->getType());
   if (File->getType()->isPointerTy())
-    inferLibFuncAttributes(*M->getFunction("fputc"), *TLI);
+    inferLibFuncAttributes(M->getFunction("fputc"), *TLI);
   Char = B.CreateIntCast(Char, B.getInt32Ty(), /*isSigned*/true,
                          "chari");
   CallInst *CI = B.CreateCall(F, {Char, File}, "fputc");
@@ -1030,7 +1034,7 @@ Value *llvm::emitFPutCUnlocked(Value *Char, Value *File, IRBuilder<> &B,
   Constant *F = M->getOrInsertFunction("fputc_unlocked", B.getInt32Ty(),
                                        B.getInt32Ty(), File->getType());
   if (File->getType()->isPointerTy())
-    inferLibFuncAttributes(*M->getFunction("fputc_unlocked"), *TLI);
+    inferLibFuncAttributes(M->getFunction("fputc_unlocked"), *TLI);
   Char = B.CreateIntCast(Char, B.getInt32Ty(), /*isSigned*/ true, "chari");
   CallInst *CI = B.CreateCall(F, {Char, File}, "fputc_unlocked");
 
@@ -1049,7 +1053,7 @@ Value *llvm::emitFPutS(Value *Str, Value *File, IRBuilder<> &B,
   Constant *F = M->getOrInsertFunction(
       FPutsName, B.getInt32Ty(), B.getInt8PtrTy(), File->getType());
   if (File->getType()->isPointerTy())
-    inferLibFuncAttributes(*M->getFunction(FPutsName), *TLI);
+    inferLibFuncAttributes(M->getFunction(FPutsName), *TLI);
   CallInst *CI = B.CreateCall(F, {castToCStr(Str, B), File}, "fputs");
 
   if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
@@ -1067,7 +1071,7 @@ Value *llvm::emitFPutSUnlocked(Value *Str, Value *File, IRBuilder<> &B,
   Constant *F = M->getOrInsertFunction(FPutsUnlockedName, B.getInt32Ty(),
                                        B.getInt8PtrTy(), File->getType());
   if (File->getType()->isPointerTy())
-    inferLibFuncAttributes(*M->getFunction(FPutsUnlockedName), *TLI);
+    inferLibFuncAttributes(M->getFunction(FPutsUnlockedName), *TLI);
   CallInst *CI = B.CreateCall(F, {castToCStr(Str, B), File}, "fputs_unlocked");
 
   if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
@@ -1088,7 +1092,7 @@ Value *llvm::emitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilder<> &B,
       DL.getIntPtrType(Context), DL.getIntPtrType(Context), File->getType());
 
   if (File->getType()->isPointerTy())
-    inferLibFuncAttributes(*M->getFunction(FWriteName), *TLI);
+    inferLibFuncAttributes(M->getFunction(FWriteName), *TLI);
   CallInst *CI =
       B.CreateCall(F, {castToCStr(Ptr, B), Size,
                        ConstantInt::get(DL.getIntPtrType(Context), 1), File});
@@ -1107,7 +1111,7 @@ Value *llvm::emitMalloc(Value *Num, IRBuilder<> &B, const DataLayout &DL,
   LLVMContext &Context = B.GetInsertBlock()->getContext();
   Value *Malloc = M->getOrInsertFunction("malloc", B.getInt8PtrTy(),
                                          DL.getIntPtrType(Context));
-  inferLibFuncAttributes(*M->getFunction("malloc"), *TLI);
+  inferLibFuncAttributes(M->getFunction("malloc"), *TLI);
   CallInst *CI = B.CreateCall(Malloc, Num, "malloc");
 
   if (const Function *F = dyn_cast<Function>(Malloc->stripPointerCasts()))
@@ -1126,7 +1130,7 @@ Value *llvm::emitCalloc(Value *Num, Value *Size, const AttributeList &Attrs,
   IntegerType *PtrType = DL.getIntPtrType((B.GetInsertBlock()->getContext()));
   Value *Calloc = M->getOrInsertFunction("calloc", Attrs, B.getInt8PtrTy(),
                                          PtrType, PtrType);
-  inferLibFuncAttributes(*M->getFunction("calloc"), TLI);
+  inferLibFuncAttributes(M->getFunction("calloc"), TLI);
   CallInst *CI = B.CreateCall(Calloc, {Num, Size}, "calloc");
 
   if (const auto *F = dyn_cast<Function>(Calloc->stripPointerCasts()))
@@ -1149,7 +1153,7 @@ Value *llvm::emitFWriteUnlocked(Value *Ptr, Value *Size, Value *N, Value *File,
       DL.getIntPtrType(Context), DL.getIntPtrType(Context), File->getType());
 
   if (File->getType()->isPointerTy())
-    inferLibFuncAttributes(*M->getFunction(FWriteUnlockedName), *TLI);
+    inferLibFuncAttributes(M->getFunction(FWriteUnlockedName), *TLI);
   CallInst *CI = B.CreateCall(F, {castToCStr(Ptr, B), Size, N, File});
 
   if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
@@ -1166,7 +1170,7 @@ Value *llvm::emitFGetCUnlocked(Value *File, IRBuilder<> &B,
   Constant *F =
       M->getOrInsertFunction("fgetc_unlocked", B.getInt32Ty(), File->getType());
   if (File->getType()->isPointerTy())
-    inferLibFuncAttributes(*M->getFunction("fgetc_unlocked"), *TLI);
+    inferLibFuncAttributes(M->getFunction("fgetc_unlocked"), *TLI);
   CallInst *CI = B.CreateCall(F, File, "fgetc_unlocked");
 
   if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
@@ -1183,7 +1187,7 @@ Value *llvm::emitFGetSUnlocked(Value *Str, Value *Size, Value *File,
   Constant *F =
       M->getOrInsertFunction("fgets_unlocked", B.getInt8PtrTy(),
                              B.getInt8PtrTy(), B.getInt32Ty(), File->getType());
-  inferLibFuncAttributes(*M->getFunction("fgets_unlocked"), *TLI);
+  inferLibFuncAttributes(M->getFunction("fgets_unlocked"), *TLI);
   CallInst *CI =
       B.CreateCall(F, {castToCStr(Str, B), Size, File}, "fgets_unlocked");
 
@@ -1206,7 +1210,7 @@ Value *llvm::emitFReadUnlocked(Value *Ptr, Value *Size, Value *N, Value *File,
       DL.getIntPtrType(Context), DL.getIntPtrType(Context), File->getType());
 
   if (File->getType()->isPointerTy())
-    inferLibFuncAttributes(*M->getFunction(FReadUnlockedName), *TLI);
+    inferLibFuncAttributes(M->getFunction(FReadUnlockedName), *TLI);
   CallInst *CI = B.CreateCall(F, {castToCStr(Ptr, B), Size, N, File});
 
   if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 41a495a0484..6f24dc10e1e 100644
--- a/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -145,7 +145,7 @@ static bool isLocallyOpenedFile(Value *File, CallInst *CI, IRBuilder<> &B,
       Func != LibFunc_fopen)
     return false;
 
-  inferLibFuncAttributes(*CI->getCalledFunction(), *TLI);
+  inferLibFuncAttributes(CI->getCalledFunction(), *TLI);
   if (PointerMayBeCaptured(File, true, true))
     return false;
 
diff --git a/test/Transforms/InstCombine/pr39177.ll b/test/Transforms/InstCombine/pr39177.ll
new file mode 100644
index 00000000000..750e17a01f1
--- /dev/null
+++ b/test/Transforms/InstCombine/pr39177.ll
@@ -0,0 +1,66 @@
+; RUN: opt < %s -instcombine -S
+
+%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] }
+%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 }
+
+@stderr = external global %struct._IO_FILE*, align 8
+@.str = private constant [8 x i8] c"crash!\0A\00", align 1
+
+@fwrite = alias i64 (i8*, i64, i64, %struct._IO_FILE*), i64 (i8*, i64, i64, %struct._IO_FILE*)* @__fwrite_alias
+
+define i64 @__fwrite_alias(i8* %ptr, i64 %size, i64 %n, %struct._IO_FILE* %s) {
+entry:
+  %ptr.addr = alloca i8*, align 8
+  %size.addr = alloca i64, align 8
+  %n.addr = alloca i64, align 8
+  %s.addr = alloca %struct._IO_FILE*, align 8
+  store i8* %ptr, i8** %ptr.addr, align 8
+  store i64 %size, i64* %size.addr, align 8
+  store i64 %n, i64* %n.addr, align 8
+  store %struct._IO_FILE* %s, %struct._IO_FILE** %s.addr, align 8
+  ret i64 0
+}
+
+define void @foo() {
+entry:
+  %retval = alloca i32, align 4
+  store i32 0, i32* %retval, align 4
+  %0 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
+  %call = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %0, i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i32 0, i32 0))
+  ret void
+}
+
+declare i32 @fprintf(%struct._IO_FILE*, i8*, ...)
+; RUN: opt < %s -instcombine -S
+
+%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] }
+%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 }
+
+@stderr = external global %struct._IO_FILE*, align 8
+@.str = private constant [8 x i8] c"crash!\0A\00", align 1
+
+@fwrite = alias i64 (i8*, i64, i64, %struct._IO_FILE*), i64 (i8*, i64, i64, %struct._IO_FILE*)* @__fwrite_alias
+
+define i64 @__fwrite_alias(i8* %ptr, i64 %size, i64 %n, %struct._IO_FILE* %s) {
+entry:
+  %ptr.addr = alloca i8*, align 8
+  %size.addr = alloca i64, align 8
+  %n.addr = alloca i64, align 8
+  %s.addr = alloca %struct._IO_FILE*, align 8
+  store i8* %ptr, i8** %ptr.addr, align 8
+  store i64 %size, i64* %size.addr, align 8
+  store i64 %n, i64* %n.addr, align 8
+  store %struct._IO_FILE* %s, %struct._IO_FILE** %s.addr, align 8
+  ret i64 0
+}
+
+define void @foo() {
+entry:
+  %retval = alloca i32, align 4
+  store i32 0, i32* %retval, align 4
+  %0 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
+  %call = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %0, i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i32 0, i32 0))
+  ret void
+}
+
+declare i32 @fprintf(%struct._IO_FILE*, i8*, ...)
-- 
GitLab


From e9226f019e3c62982bb6f81acf2e24701b0748b7 Mon Sep 17 00:00:00 2001
From: David Bolvansky <david.bolvansky@gmail.com>
Date: Sat, 13 Oct 2018 15:26:13 +0000
Subject: [PATCH 0151/1116] [NFC] Fixed duplicated test file

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344455 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Transforms/InstCombine/pr39177.ll | 33 --------------------------
 1 file changed, 33 deletions(-)

diff --git a/test/Transforms/InstCombine/pr39177.ll b/test/Transforms/InstCombine/pr39177.ll
index 750e17a01f1..a047a079f58 100644
--- a/test/Transforms/InstCombine/pr39177.ll
+++ b/test/Transforms/InstCombine/pr39177.ll
@@ -31,36 +31,3 @@ entry:
 }
 
 declare i32 @fprintf(%struct._IO_FILE*, i8*, ...)
-; RUN: opt < %s -instcombine -S
-
-%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] }
-%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 }
-
-@stderr = external global %struct._IO_FILE*, align 8
-@.str = private constant [8 x i8] c"crash!\0A\00", align 1
-
-@fwrite = alias i64 (i8*, i64, i64, %struct._IO_FILE*), i64 (i8*, i64, i64, %struct._IO_FILE*)* @__fwrite_alias
-
-define i64 @__fwrite_alias(i8* %ptr, i64 %size, i64 %n, %struct._IO_FILE* %s) {
-entry:
-  %ptr.addr = alloca i8*, align 8
-  %size.addr = alloca i64, align 8
-  %n.addr = alloca i64, align 8
-  %s.addr = alloca %struct._IO_FILE*, align 8
-  store i8* %ptr, i8** %ptr.addr, align 8
-  store i64 %size, i64* %size.addr, align 8
-  store i64 %n, i64* %n.addr, align 8
-  store %struct._IO_FILE* %s, %struct._IO_FILE** %s.addr, align 8
-  ret i64 0
-}
-
-define void @foo() {
-entry:
-  %retval = alloca i32, align 4
-  store i32 0, i32* %retval, align 4
-  %0 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
-  %call = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %0, i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i32 0, i32 0))
-  ret void
-}
-
-declare i32 @fprintf(%struct._IO_FILE*, i8*, ...)
-- 
GitLab


From 7875f53fbec75964572823e79a58eee5df2e514d Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Sat, 13 Oct 2018 16:02:47 +0000
Subject: [PATCH 0152/1116] [InstCombine] add tests for operand complexity
 canonicalization; NFC

The tests with undef vector elements demonstrate a hole in
the current pattern matching.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344456 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstCombine/operand-complexity.ll         | 136 ++++++++++++++++++
 1 file changed, 136 insertions(+)
 create mode 100644 test/Transforms/InstCombine/operand-complexity.ll

diff --git a/test/Transforms/InstCombine/operand-complexity.ll b/test/Transforms/InstCombine/operand-complexity.ll
new file mode 100644
index 00000000000..747b0c836a5
--- /dev/null
+++ b/test/Transforms/InstCombine/operand-complexity.ll
@@ -0,0 +1,136 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; 'Negate' is considered less complex than a normal binop, so the mul should have the binop as the first operand.
+
+define i8 @neg(i8 %x) {
+; CHECK-LABEL: @neg(
+; CHECK-NEXT:    [[BO:%.*]] = udiv i8 [[X:%.*]], 42
+; CHECK-NEXT:    [[NEGX:%.*]] = sub i8 0, [[X]]
+; CHECK-NEXT:    [[R:%.*]] = mul i8 [[BO]], [[NEGX]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %bo = udiv i8 %x, 42
+  %negx = sub i8 0, %x
+  %r = mul i8 %negx, %bo
+  ret i8 %r
+}
+
+define <2 x i8> @neg_vec(<2 x i8> %x) {
+; CHECK-LABEL: @neg_vec(
+; CHECK-NEXT:    [[BO:%.*]] = udiv <2 x i8> [[X:%.*]], <i8 42, i8 -42>
+; CHECK-NEXT:    [[NEGX:%.*]] = sub <2 x i8> zeroinitializer, [[X]]
+; CHECK-NEXT:    [[R:%.*]] = mul <2 x i8> [[BO]], [[NEGX]]
+; CHECK-NEXT:    ret <2 x i8> [[R]]
+;
+  %bo = udiv <2 x i8> %x, <i8 42, i8 -42>
+  %negx = sub <2 x i8> <i8 0, i8 0>, %x
+  %r = mul <2 x i8> %negx, %bo
+  ret <2 x i8> %r
+}
+
+define <2 x i8> @neg_vec_undef(<2 x i8> %x) {
+; CHECK-LABEL: @neg_vec_undef(
+; CHECK-NEXT:    [[BO:%.*]] = udiv <2 x i8> [[X:%.*]], <i8 42, i8 -42>
+; CHECK-NEXT:    [[NEGX:%.*]] = sub <2 x i8> <i8 0, i8 undef>, [[X]]
+; CHECK-NEXT:    [[R:%.*]] = mul <2 x i8> [[NEGX]], [[BO]]
+; CHECK-NEXT:    ret <2 x i8> [[R]]
+;
+  %bo = udiv <2 x i8> %x, <i8 42, i8 -42>
+  %negx = sub <2 x i8> <i8 0, i8 undef>, %x
+  %r = mul <2 x i8> %negx, %bo
+  ret <2 x i8> %r
+}
+
+; 'Not' is considered less complex than a normal binop, so the mul should have the binop as the first operand.
+
+define i8 @not(i8 %x) {
+; CHECK-LABEL: @not(
+; CHECK-NEXT:    [[BO:%.*]] = udiv i8 [[X:%.*]], 42
+; CHECK-NEXT:    [[NOTX:%.*]] = xor i8 [[X]], -1
+; CHECK-NEXT:    [[R:%.*]] = mul i8 [[BO]], [[NOTX]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %bo = udiv i8 %x, 42
+  %notx = xor i8 -1, %x
+  %r = mul i8 %notx, %bo
+  ret i8 %r
+}
+
+define <2 x i8> @not_vec(<2 x i8> %x) {
+; CHECK-LABEL: @not_vec(
+; CHECK-NEXT:    [[BO:%.*]] = udiv <2 x i8> [[X:%.*]], <i8 42, i8 -42>
+; CHECK-NEXT:    [[NOTX:%.*]] = xor <2 x i8> [[X]], <i8 -1, i8 -1>
+; CHECK-NEXT:    [[R:%.*]] = mul <2 x i8> [[BO]], [[NOTX]]
+; CHECK-NEXT:    ret <2 x i8> [[R]]
+;
+  %bo = udiv <2 x i8> %x, <i8 42, i8 -42>
+  %notx = xor <2 x i8> <i8 -1, i8 -1>, %x
+  %r = mul <2 x i8> %notx, %bo
+  ret <2 x i8> %r
+}
+
+define <2 x i8> @not_vec_undef(<2 x i8> %x) {
+; CHECK-LABEL: @not_vec_undef(
+; CHECK-NEXT:    [[BO:%.*]] = udiv <2 x i8> [[X:%.*]], <i8 42, i8 -42>
+; CHECK-NEXT:    [[NOTX:%.*]] = xor <2 x i8> [[X]], <i8 -1, i8 undef>
+; CHECK-NEXT:    [[R:%.*]] = mul <2 x i8> [[NOTX]], [[BO]]
+; CHECK-NEXT:    ret <2 x i8> [[R]]
+;
+  %bo = udiv <2 x i8> %x, <i8 42, i8 -42>
+  %notx = xor <2 x i8> <i8 -1, i8 undef>, %x
+  %r = mul <2 x i8> %notx, %bo
+  ret <2 x i8> %r
+}
+
+; 'Fneg' is considered less complex than a normal binop, so the fmul should have the binop as the first operand.
+; Extra uses are required to ensure that the fneg is not canonicalized after the fmul.
+
+declare void @use(float)
+declare void @use_vec(<2 x float>)
+
+define float @fneg(float %x) {
+; CHECK-LABEL: @fneg(
+; CHECK-NEXT:    [[BO:%.*]] = fdiv float [[X:%.*]], 4.200000e+01
+; CHECK-NEXT:    [[FNEGX:%.*]] = fsub float -0.000000e+00, [[X]]
+; CHECK-NEXT:    [[R:%.*]] = fmul float [[BO]], [[FNEGX]]
+; CHECK-NEXT:    call void @use(float [[FNEGX]])
+; CHECK-NEXT:    ret float [[R]]
+;
+  %bo = fdiv float %x, 42.0
+  %fnegx = fsub float -0.0, %x
+  %r = fmul float %fnegx, %bo
+  call void @use(float %fnegx)
+  ret float %r
+}
+
+define <2 x float> @fneg_vec(<2 x float> %x) {
+; CHECK-LABEL: @fneg_vec(
+; CHECK-NEXT:    [[BO:%.*]] = fdiv <2 x float> [[X:%.*]], <float 4.200000e+01, float -4.200000e+01>
+; CHECK-NEXT:    [[FNEGX:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, [[X]]
+; CHECK-NEXT:    [[R:%.*]] = fmul <2 x float> [[BO]], [[FNEGX]]
+; CHECK-NEXT:    call void @use_vec(<2 x float> [[FNEGX]])
+; CHECK-NEXT:    ret <2 x float> [[R]]
+;
+  %bo = fdiv <2 x float> %x, <float 42.0, float -42.0>
+  %fnegx = fsub <2 x float> <float -0.0, float -0.0>, %x
+  %r = fmul <2 x float> %fnegx, %bo
+  call void @use_vec(<2 x float> %fnegx)
+  ret <2 x float> %r
+}
+
+define <2 x float> @fneg_vec_undef(<2 x float> %x) {
+; CHECK-LABEL: @fneg_vec_undef(
+; CHECK-NEXT:    [[BO:%.*]] = fdiv <2 x float> [[X:%.*]], <float 4.200000e+01, float -4.200000e+01>
+; CHECK-NEXT:    [[FNEGX:%.*]] = fsub <2 x float> <float -0.000000e+00, float undef>, [[X]]
+; CHECK-NEXT:    [[R:%.*]] = fmul <2 x float> [[FNEGX]], [[BO]]
+; CHECK-NEXT:    call void @use_vec(<2 x float> [[FNEGX]])
+; CHECK-NEXT:    ret <2 x float> [[R]]
+;
+  %bo = fdiv <2 x float> %x, <float 42.0, float -42.0>
+  %fnegx = fsub <2 x float> <float -0.0, float undef>, %x
+  %r = fmul <2 x float> %fnegx, %bo
+  call void @use_vec(<2 x float> %fnegx)
+  ret <2 x float> %r
+}
+
-- 
GitLab


From 1000de2443cae3c5e9cb2f3c39bf8e846c6005b8 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sat, 13 Oct 2018 16:11:15 +0000
Subject: [PATCH 0153/1116] [X86][SSE] Remove most of vector CTTZ custom
 lowering and use LegalizeDAG instead.

There is one remnant - AVX1 custom splitting of 256-bit vectors - which is due to a regression where the X86ISD::ANDNP is still performed as a YMM.

I've also tightened the CTLZ or CTPOP lowering in SelectionDAGLegalize::ExpandBitCount to require a legal CTLZ - it doesn't affect existing users and fixes an issue with AVX512 codegen.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344457 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/LegalizeDAG.cpp |  4 +--
 lib/Target/X86/X86ISelLowering.cpp       | 35 +++++-------------------
 2 files changed, 9 insertions(+), 30 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 884d7174440..3564a767a09 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2814,8 +2814,8 @@ SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op,
                                DAG.getNode(ISD::SUB, dl, VT, Op,
                                            DAG.getConstant(1, dl, VT)));
     // If ISD::CTLZ is legal and CTPOP isn't, then do that instead.
-    if (!TLI.isOperationLegalOrCustom(ISD::CTPOP, VT) &&
-        TLI.isOperationLegalOrCustom(ISD::CTLZ, VT))
+    if (!TLI.isOperationLegal(ISD::CTPOP, VT) &&
+        TLI.isOperationLegal(ISD::CTLZ, VT))
       return DAG.getNode(ISD::SUB, dl, VT,
                          DAG.getConstant(Len, dl, VT),
                          DAG.getNode(ISD::CTLZ, dl, VT, Tmp3));
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 1411cf18902..1abe642a830 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -826,7 +826,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
       setOperationAction(ISD::SETCC,              VT, Custom);
       setOperationAction(ISD::CTPOP,              VT, Custom);
-      setOperationAction(ISD::CTTZ,               VT, Custom);
 
       // The condition codes aren't legal in SSE/AVX and under AVX512 we use
       // setcc all the way to isel and prefer SETGT in some isel patterns.
@@ -1083,9 +1082,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
       setOperationAction(ISD::SETCC,           VT, Custom);
       setOperationAction(ISD::CTPOP,           VT, Custom);
-      setOperationAction(ISD::CTTZ,            VT, Custom);
       setOperationAction(ISD::CTLZ,            VT, Custom);
 
+      // TODO - remove this once 256-bit X86ISD::ANDNP correctly split.
+      setOperationAction(ISD::CTTZ,  VT, HasInt256 ? Expand : Custom);
+
       // The condition codes aren't legal in SSE/AVX and under AVX512 we use
       // setcc all the way to isel and prefer SETGT in some isel patterns.
       setCondCodeAction(ISD::SETLT, VT, Custom);
@@ -1371,7 +1372,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::SHL,              VT, Custom);
       setOperationAction(ISD::SRA,              VT, Custom);
       setOperationAction(ISD::CTPOP,            VT, Custom);
-      setOperationAction(ISD::CTTZ,             VT, Custom);
       setOperationAction(ISD::ROTL,             VT, Custom);
       setOperationAction(ISD::ROTR,             VT, Custom);
       setOperationAction(ISD::SETCC,            VT, Custom);
@@ -1402,7 +1402,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
       for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
         setOperationAction(ISD::CTLZ,            VT, Legal);
-        setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
       }
     } // Subtarget.hasCDI()
 
@@ -1491,7 +1490,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     if (Subtarget.hasCDI()) {
       for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
         setOperationAction(ISD::CTLZ,            VT, Legal);
-        setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
       }
     } // Subtarget.hasCDI()
 
@@ -1586,7 +1584,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::MLOAD,        VT, Legal);
       setOperationAction(ISD::MSTORE,       VT, Legal);
       setOperationAction(ISD::CTPOP,        VT, Custom);
-      setOperationAction(ISD::CTTZ,         VT, Custom);
       setOperationAction(ISD::CTLZ,         VT, Custom);
       setOperationAction(ISD::SMAX,         VT, Legal);
       setOperationAction(ISD::UMAX,         VT, Legal);
@@ -22999,29 +22996,11 @@ static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
   SDValue N0 = Op.getOperand(0);
   SDLoc dl(Op);
 
-  if (VT.isVector()) {
-    // Decompose 256-bit ops into smaller 128-bit ops.
-    if (VT.is256BitVector() && !Subtarget.hasInt256())
-      return Lower256IntUnary(Op, DAG);
-
-    // cttz(x) = width - ctlz(~x & (x - 1))
-    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-    if (TLI.isOperationLegal(ISD::CTLZ, VT) &&
-        !TLI.isOperationLegal(ISD::CTPOP, VT)) {
-      SDValue One = DAG.getConstant(1, dl, VT);
-      SDValue Width = DAG.getConstant(NumBits, dl, VT);
-      return DAG.getNode(
-          ISD::SUB, dl, VT, Width,
-          DAG.getNode(ISD::CTLZ, dl, VT,
-                      DAG.getNode(ISD::AND, dl, VT, DAG.getNOT(dl, N0, VT),
-                                  DAG.getNode(ISD::SUB, dl, VT, N0, One))));
-    }
-
-    // Else leave it to the legalizer.
-    return SDValue();
-  }
+  // Decompose 256-bit ops into smaller 128-bit ops.
+  if (VT.is256BitVector() && !Subtarget.hasInt256())
+    return Lower256IntUnary(Op, DAG);
 
-  assert(Op.getOpcode() == ISD::CTTZ &&
+  assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
          "Only scalar CTTZ requires custom lowering");
 
   // Issue a bsf (scan bits forward) which also sets EFLAGS.
-- 
GitLab


From 21706932d747dc7d4908e03577b3823fb27683d9 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Sat, 13 Oct 2018 16:15:37 +0000
Subject: [PATCH 0154/1116] [InstCombine] fix complexity canonicalization with
 fake unary vector ops

This is a preliminary step to avoid regressions when we add
an actual 'fneg' instruction to IR. See D52934 and D53205.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344458 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/InstCombine/InstCombineInternal.h  | 4 ++--
 test/Transforms/InstCombine/operand-complexity.ll | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h
index 46c598d4bfb..3a18744e434 100644
--- a/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -82,8 +82,8 @@ class User;
 ///   5 -> Other instructions
 static inline unsigned getComplexity(Value *V) {
   if (isa<Instruction>(V)) {
-    if (isa<CastInst>(V) || BinaryOperator::isNeg(V) ||
-        BinaryOperator::isFNeg(V) || BinaryOperator::isNot(V))
+    if (isa<CastInst>(V) || match(V, m_Neg(m_Value())) ||
+        match(V, m_Not(m_Value())) || match(V, m_FNeg(m_Value())))
       return 4;
     return 5;
   }
diff --git a/test/Transforms/InstCombine/operand-complexity.ll b/test/Transforms/InstCombine/operand-complexity.ll
index 747b0c836a5..20abe7b48f9 100644
--- a/test/Transforms/InstCombine/operand-complexity.ll
+++ b/test/Transforms/InstCombine/operand-complexity.ll
@@ -33,7 +33,7 @@ define <2 x i8> @neg_vec_undef(<2 x i8> %x) {
 ; CHECK-LABEL: @neg_vec_undef(
 ; CHECK-NEXT:    [[BO:%.*]] = udiv <2 x i8> [[X:%.*]], <i8 42, i8 -42>
 ; CHECK-NEXT:    [[NEGX:%.*]] = sub <2 x i8> <i8 0, i8 undef>, [[X]]
-; CHECK-NEXT:    [[R:%.*]] = mul <2 x i8> [[NEGX]], [[BO]]
+; CHECK-NEXT:    [[R:%.*]] = mul <2 x i8> [[BO]], [[NEGX]]
 ; CHECK-NEXT:    ret <2 x i8> [[R]]
 ;
   %bo = udiv <2 x i8> %x, <i8 42, i8 -42>
@@ -74,7 +74,7 @@ define <2 x i8> @not_vec_undef(<2 x i8> %x) {
 ; CHECK-LABEL: @not_vec_undef(
 ; CHECK-NEXT:    [[BO:%.*]] = udiv <2 x i8> [[X:%.*]], <i8 42, i8 -42>
 ; CHECK-NEXT:    [[NOTX:%.*]] = xor <2 x i8> [[X]], <i8 -1, i8 undef>
-; CHECK-NEXT:    [[R:%.*]] = mul <2 x i8> [[NOTX]], [[BO]]
+; CHECK-NEXT:    [[R:%.*]] = mul <2 x i8> [[BO]], [[NOTX]]
 ; CHECK-NEXT:    ret <2 x i8> [[R]]
 ;
   %bo = udiv <2 x i8> %x, <i8 42, i8 -42>
@@ -123,7 +123,7 @@ define <2 x float> @fneg_vec_undef(<2 x float> %x) {
 ; CHECK-LABEL: @fneg_vec_undef(
 ; CHECK-NEXT:    [[BO:%.*]] = fdiv <2 x float> [[X:%.*]], <float 4.200000e+01, float -4.200000e+01>
 ; CHECK-NEXT:    [[FNEGX:%.*]] = fsub <2 x float> <float -0.000000e+00, float undef>, [[X]]
-; CHECK-NEXT:    [[R:%.*]] = fmul <2 x float> [[FNEGX]], [[BO]]
+; CHECK-NEXT:    [[R:%.*]] = fmul <2 x float> [[BO]], [[FNEGX]]
 ; CHECK-NEXT:    call void @use_vec(<2 x float> [[FNEGX]])
 ; CHECK-NEXT:    ret <2 x float> [[R]]
 ;
-- 
GitLab


From bb14c3e538cceb4a3e9dd8f81e1cc46191030eef Mon Sep 17 00:00:00 2001
From: Thomas Lively <tlively@google.com>
Date: Sat, 13 Oct 2018 16:58:03 +0000
Subject: [PATCH 0155/1116] [WebAssembly][NFC] Fix signed/unsigned comparison
 warning

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344459 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/WebAssembly/WebAssemblyInstrSIMD.td | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index af5c03599cd..b0fd6cab229 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -30,7 +30,9 @@ defm "" : ARGUMENT<V128, v2f64>;
 
 // Constrained immediate argument types
 foreach SIZE = [8, 16] in
-def ImmI#SIZE : ImmLeaf<i32, "return (Imm & ((1UL << "#SIZE#") - 1)) == Imm;">;
+def ImmI#SIZE : ImmLeaf<i32,
+  "return ((uint64_t)Imm & ((1UL << "#SIZE#") - 1)) == (uint64_t)Imm;"
+>;
 foreach SIZE = [2, 4, 8, 16, 32] in
 def LaneIdx#SIZE : ImmLeaf<i32, "return 0 <= Imm && Imm < "#SIZE#";">;
 
-- 
GitLab


From ebbe7135795e31b45ac408aa2174ca9314ce7bfd Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Sat, 13 Oct 2018 17:47:20 +0000
Subject: [PATCH 0156/1116] [LegalizeTypes] Prevent an assertion from
 PromoteIntRes_BSWAP and PromoteIntRes_BITREVERSE if the shift amount is too
 large for the VT returned by getShiftAmountTy

Summary:
getShiftAmountTy for X86 returns MVT::i8. If a BSWAP or BITREVERSE is created that requires promotion and the difference between the original VT and the promoted VT is more than 255 then we won't able to create the constant.

This patch adds a check to replace the result from getShiftAmountTy to MVT::i32 if the difference won't fit. This should get legalized later when the shift is ultimately expanded since its clearly an illegal type that we're only promoting to make it a power of 2 bit width. Alternatively we could base the decision completely on the largest shift amount the promoted VT could use.

Vectors should be immune here because getShiftAmountTy always returns the incoming VT for vectors. Only the scalar shift amount can be changed by the targets.

Reviewers: eli.friedman, RKSimon, spatel

Reviewed By: RKSimon

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D53232

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344460 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../SelectionDAG/LegalizeIntegerTypes.cpp     |  28 +-
 test/CodeGen/X86/bitreverse.ll                | 618 ++++++++++++++++++
 test/CodeGen/X86/bswap.ll                     | 150 +++++
 3 files changed, 788 insertions(+), 8 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index e11a18fd0c4..064e9e5875b 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -311,6 +311,19 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BITCAST(SDNode *N) {
                      CreateStackStoreLoad(InOp, OutVT));
 }
 
+// Helper for BSWAP/BITREVERSE promotion to ensure we can fit the shift amount
+// in the VT returned by getShiftAmountTy and to return a safe VT if we can't.
+static EVT getShiftAmountTyForConstant(unsigned Val, EVT VT,
+                                       const TargetLowering &TLI,
+                                       SelectionDAG &DAG) {
+  EVT ShiftVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
+  // If the value won't fit in the prefered type, just use something safe. It
+  // will be legalized when the shift is expanded.
+  if ((Log2_32(Val) + 1) > ShiftVT.getScalarSizeInBits())
+    ShiftVT = MVT::i32;
+  return ShiftVT;
+}
+
 SDValue DAGTypeLegalizer::PromoteIntRes_BSWAP(SDNode *N) {
   SDValue Op = GetPromotedInteger(N->getOperand(0));
   EVT OVT = N->getValueType(0);
@@ -318,10 +331,9 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BSWAP(SDNode *N) {
   SDLoc dl(N);
 
   unsigned DiffBits = NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits();
-  return DAG.getNode(
-      ISD::SRL, dl, NVT, DAG.getNode(ISD::BSWAP, dl, NVT, Op),
-      DAG.getConstant(DiffBits, dl,
-                      TLI.getShiftAmountTy(NVT, DAG.getDataLayout())));
+  EVT ShiftVT = getShiftAmountTyForConstant(DiffBits, NVT, TLI, DAG);
+  return DAG.getNode(ISD::SRL, dl, NVT, DAG.getNode(ISD::BSWAP, dl, NVT, Op),
+                     DAG.getConstant(DiffBits, dl, ShiftVT));
 }
 
 SDValue DAGTypeLegalizer::PromoteIntRes_BITREVERSE(SDNode *N) {
@@ -331,10 +343,10 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BITREVERSE(SDNode *N) {
   SDLoc dl(N);
 
   unsigned DiffBits = NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits();
-  return DAG.getNode(
-      ISD::SRL, dl, NVT, DAG.getNode(ISD::BITREVERSE, dl, NVT, Op),
-      DAG.getConstant(DiffBits, dl,
-                      TLI.getShiftAmountTy(NVT, DAG.getDataLayout())));
+  EVT ShiftVT = getShiftAmountTyForConstant(DiffBits, NVT, TLI, DAG);
+  return DAG.getNode(ISD::SRL, dl, NVT,
+                     DAG.getNode(ISD::BITREVERSE, dl, NVT, Op),
+                     DAG.getConstant(DiffBits, dl, ShiftVT));
 }
 
 SDValue DAGTypeLegalizer::PromoteIntRes_BUILD_PAIR(SDNode *N) {
diff --git a/test/CodeGen/X86/bitreverse.ll b/test/CodeGen/X86/bitreverse.ll
index 2e35fde6c55..aeac9e88dd0 100644
--- a/test/CodeGen/X86/bitreverse.ll
+++ b/test/CodeGen/X86/bitreverse.ll
@@ -523,3 +523,621 @@ define <2 x i16> @undef_v2i16() {
   %b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> undef)
   ret <2 x i16> %b
 }
+
+; Make sure we don't assert during type legalization promoting a large
+; bitreverse due to the need for a large shift that won't fit in the i8 returned
+; from getShiftAmountTy.
+define i528 @large_promotion(i528 %A) nounwind {
+; X86-LABEL: large_promotion:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $56, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    bswapl %ebx
+; X86-NEXT:    movl %ebx, %ebp
+; X86-NEXT:    andl $252645135, %ebp # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %ebp
+; X86-NEXT:    andl $-252645136, %ebx # imm = 0xF0F0F0F0
+; X86-NEXT:    shrl $4, %ebx
+; X86-NEXT:    orl %ebp, %ebx
+; X86-NEXT:    movl %ebx, %ebp
+; X86-NEXT:    andl $858993459, %ebp # imm = 0x33333333
+; X86-NEXT:    andl $-858993460, %ebx # imm = 0xCCCCCCCC
+; X86-NEXT:    shrl $2, %ebx
+; X86-NEXT:    leal (%ebx,%ebp,4), %ebx
+; X86-NEXT:    movl %ebx, %ebp
+; X86-NEXT:    andl $1431633920, %ebp # imm = 0x55550000
+; X86-NEXT:    andl $-1431699456, %ebx # imm = 0xAAAA0000
+; X86-NEXT:    shrl %ebx
+; X86-NEXT:    leal (%ebx,%ebp,2), %ebx
+; X86-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X86-NEXT:    bswapl %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    andl $252645135, %ebx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %ebx
+; X86-NEXT:    andl $-252645136, %edi # imm = 0xF0F0F0F0
+; X86-NEXT:    shrl $4, %edi
+; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    andl $858993459, %ebx # imm = 0x33333333
+; X86-NEXT:    andl $-858993460, %edi # imm = 0xCCCCCCCC
+; X86-NEXT:    shrl $2, %edi
+; X86-NEXT:    leal (%edi,%ebx,4), %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    andl $1431655765, %ebx # imm = 0x55555555
+; X86-NEXT:    andl $-1431655766, %edi # imm = 0xAAAAAAAA
+; X86-NEXT:    shrl %edi
+; X86-NEXT:    leal (%edi,%ebx,2), %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    bswapl %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %edi
+; X86-NEXT:    andl $-252645136, %esi # imm = 0xF0F0F0F0
+; X86-NEXT:    shrl $4, %esi
+; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    andl $858993459, %edi # imm = 0x33333333
+; X86-NEXT:    andl $-858993460, %esi # imm = 0xCCCCCCCC
+; X86-NEXT:    shrl $2, %esi
+; X86-NEXT:    leal (%esi,%edi,4), %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    andl $1431655765, %edi # imm = 0x55555555
+; X86-NEXT:    andl $-1431655766, %esi # imm = 0xAAAAAAAA
+; X86-NEXT:    shrl %esi
+; X86-NEXT:    leal (%esi,%edi,2), %ebx
+; X86-NEXT:    bswapl %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %esi
+; X86-NEXT:    andl $-252645136, %edx # imm = 0xF0F0F0F0
+; X86-NEXT:    shrl $4, %edx
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $858993459, %esi # imm = 0x33333333
+; X86-NEXT:    andl $-858993460, %edx # imm = 0xCCCCCCCC
+; X86-NEXT:    shrl $2, %edx
+; X86-NEXT:    leal (%edx,%esi,4), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $1431655765, %esi # imm = 0x55555555
+; X86-NEXT:    andl $-1431655766, %edx # imm = 0xAAAAAAAA
+; X86-NEXT:    shrl %edx
+; X86-NEXT:    leal (%edx,%esi,2), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %edx
+; X86-NEXT:    andl $-252645136, %ecx # imm = 0xF0F0F0F0
+; X86-NEXT:    shrl $4, %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
+; X86-NEXT:    andl $-858993460, %ecx # imm = 0xCCCCCCCC
+; X86-NEXT:    shrl $2, %ecx
+; X86-NEXT:    leal (%ecx,%edx,4), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
+; X86-NEXT:    andl $-1431655766, %ecx # imm = 0xAAAAAAAA
+; X86-NEXT:    shrl %ecx
+; X86-NEXT:    leal (%ecx,%edx,2), %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %ecx
+; X86-NEXT:    andl $-252645136, %eax # imm = 0xF0F0F0F0
+; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
+; X86-NEXT:    andl $-858993460, %eax # imm = 0xCCCCCCCC
+; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    leal (%eax,%ecx,4), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
+; X86-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
+; X86-NEXT:    shrl %eax
+; X86-NEXT:    leal (%eax,%ecx,2), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %ecx
+; X86-NEXT:    andl $-252645136, %eax # imm = 0xF0F0F0F0
+; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
+; X86-NEXT:    andl $-858993460, %eax # imm = 0xCCCCCCCC
+; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    leal (%eax,%ecx,4), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
+; X86-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
+; X86-NEXT:    shrl %eax
+; X86-NEXT:    leal (%eax,%ecx,2), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %ecx
+; X86-NEXT:    andl $-252645136, %eax # imm = 0xF0F0F0F0
+; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
+; X86-NEXT:    andl $-858993460, %eax # imm = 0xCCCCCCCC
+; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    leal (%eax,%ecx,4), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
+; X86-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
+; X86-NEXT:    shrl %eax
+; X86-NEXT:    leal (%eax,%ecx,2), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %ecx
+; X86-NEXT:    andl $-252645136, %eax # imm = 0xF0F0F0F0
+; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
+; X86-NEXT:    andl $-858993460, %eax # imm = 0xCCCCCCCC
+; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    leal (%eax,%ecx,4), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
+; X86-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
+; X86-NEXT:    shrl %eax
+; X86-NEXT:    leal (%eax,%ecx,2), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %ecx
+; X86-NEXT:    andl $-252645136, %eax # imm = 0xF0F0F0F0
+; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
+; X86-NEXT:    andl $-858993460, %eax # imm = 0xCCCCCCCC
+; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    leal (%eax,%ecx,4), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
+; X86-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
+; X86-NEXT:    shrl %eax
+; X86-NEXT:    leal (%eax,%ecx,2), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %ecx
+; X86-NEXT:    andl $-252645136, %eax # imm = 0xF0F0F0F0
+; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
+; X86-NEXT:    andl $-858993460, %eax # imm = 0xCCCCCCCC
+; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    leal (%eax,%ecx,4), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
+; X86-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
+; X86-NEXT:    shrl %eax
+; X86-NEXT:    leal (%eax,%ecx,2), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %ecx
+; X86-NEXT:    andl $-252645136, %eax # imm = 0xF0F0F0F0
+; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
+; X86-NEXT:    andl $-858993460, %eax # imm = 0xCCCCCCCC
+; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    leal (%eax,%ecx,4), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
+; X86-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
+; X86-NEXT:    shrl %eax
+; X86-NEXT:    leal (%eax,%ecx,2), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %ecx
+; X86-NEXT:    andl $-252645136, %eax # imm = 0xF0F0F0F0
+; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
+; X86-NEXT:    andl $-858993460, %eax # imm = 0xCCCCCCCC
+; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    leal (%eax,%ecx,4), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
+; X86-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
+; X86-NEXT:    shrl %eax
+; X86-NEXT:    leal (%eax,%ecx,2), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %ecx
+; X86-NEXT:    andl $-252645136, %eax # imm = 0xF0F0F0F0
+; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
+; X86-NEXT:    andl $-858993460, %eax # imm = 0xCCCCCCCC
+; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    leal (%eax,%ecx,4), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
+; X86-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
+; X86-NEXT:    shrl %eax
+; X86-NEXT:    leal (%eax,%ecx,2), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %ecx
+; X86-NEXT:    andl $-252645136, %eax # imm = 0xF0F0F0F0
+; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
+; X86-NEXT:    andl $-858993460, %eax # imm = 0xCCCCCCCC
+; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    leal (%eax,%ecx,4), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
+; X86-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
+; X86-NEXT:    shrl %eax
+; X86-NEXT:    leal (%eax,%ecx,2), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %ecx
+; X86-NEXT:    andl $-252645136, %eax # imm = 0xF0F0F0F0
+; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
+; X86-NEXT:    andl $-858993460, %eax # imm = 0xCCCCCCCC
+; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    leal (%eax,%ecx,4), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
+; X86-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
+; X86-NEXT:    shrl %eax
+; X86-NEXT:    leal (%eax,%ecx,2), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %ecx
+; X86-NEXT:    andl $-252645136, %eax # imm = 0xF0F0F0F0
+; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
+; X86-NEXT:    andl $-858993460, %eax # imm = 0xCCCCCCCC
+; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    leal (%eax,%ecx,4), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
+; X86-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
+; X86-NEXT:    shrl %eax
+; X86-NEXT:    leal (%eax,%ecx,2), %edx
+; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shrdl $16, %eax, %esi
+; X86-NEXT:    shrdl $16, %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shrdl $16, %ecx, %ebx
+; X86-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shrdl $16, %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shrdl $16, %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shrdl $16, %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shrdl $16, %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shrdl $16, %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shrdl $16, %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shrdl $16, %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shrdl $16, %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    shrdl $16, %ebp, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shrdl $16, %ebx, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shrdl $16, %eax, %ebx
+; X86-NEXT:    shrdl $16, %edi, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    shrdl $16, %edx, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %edi, 60(%eax)
+; X86-NEXT:    movl %ecx, 56(%eax)
+; X86-NEXT:    movl %ebx, 52(%eax)
+; X86-NEXT:    movl %ebp, 48(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 44(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 40(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 36(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 32(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 28(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 24(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 20(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 16(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    movl %esi, (%eax)
+; X86-NEXT:    shrl $16, %edx
+; X86-NEXT:    movw %dx, 64(%eax)
+; X86-NEXT:    addl $56, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl $4
+;
+; X64-LABEL: large_promotion:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rbp
+; X64-NEXT:    pushq %r15
+; X64-NEXT:    pushq %r14
+; X64-NEXT:    pushq %r13
+; X64-NEXT:    pushq %r12
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    movq %rdi, %r12
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
+; X64-NEXT:    bswapq %rbx
+; X64-NEXT:    movabsq $1085102592571150095, %r13 # imm = 0xF0F0F0F0F0F0F0F
+; X64-NEXT:    movq %rbx, %r10
+; X64-NEXT:    andq %r13, %r10
+; X64-NEXT:    shlq $4, %r10
+; X64-NEXT:    movabsq $-1085102592571150096, %rax # imm = 0xF0F0F0F0F0F0F0F0
+; X64-NEXT:    andq %rax, %rbx
+; X64-NEXT:    shrq $4, %rbx
+; X64-NEXT:    orq %r10, %rbx
+; X64-NEXT:    movabsq $3689348814741910323, %r11 # imm = 0x3333333333333333
+; X64-NEXT:    movq %rbx, %r10
+; X64-NEXT:    andq %r11, %r10
+; X64-NEXT:    movabsq $-3689348814741910324, %r14 # imm = 0xCCCCCCCCCCCCCCCC
+; X64-NEXT:    andq %r14, %rbx
+; X64-NEXT:    shrq $2, %rbx
+; X64-NEXT:    leaq (%rbx,%r10,4), %r10
+; X64-NEXT:    movabsq $6148820866244280320, %rbx # imm = 0x5555000000000000
+; X64-NEXT:    andq %r10, %rbx
+; X64-NEXT:    movabsq $-6149102341220990976, %rdi # imm = 0xAAAA000000000000
+; X64-NEXT:    andq %r10, %rdi
+; X64-NEXT:    shrq %rdi
+; X64-NEXT:    leaq (%rdi,%rbx,2), %rdi
+; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    bswapq %rbp
+; X64-NEXT:    movq %rbp, %rdi
+; X64-NEXT:    andq %r13, %rdi
+; X64-NEXT:    shlq $4, %rdi
+; X64-NEXT:    andq %rax, %rbp
+; X64-NEXT:    shrq $4, %rbp
+; X64-NEXT:    orq %rdi, %rbp
+; X64-NEXT:    movq %rbp, %rdi
+; X64-NEXT:    andq %r11, %rdi
+; X64-NEXT:    andq %r14, %rbp
+; X64-NEXT:    shrq $2, %rbp
+; X64-NEXT:    leaq (%rbp,%rdi,4), %rbp
+; X64-NEXT:    movabsq $6148914691236517205, %rbx # imm = 0x5555555555555555
+; X64-NEXT:    movq %rbp, %r10
+; X64-NEXT:    andq %rbx, %r10
+; X64-NEXT:    movabsq $-6148914691236517206, %rdi # imm = 0xAAAAAAAAAAAAAAAA
+; X64-NEXT:    andq %rdi, %rbp
+; X64-NEXT:    shrq %rbp
+; X64-NEXT:    leaq (%rbp,%r10,2), %rbp
+; X64-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
+; X64-NEXT:    bswapq %rbp
+; X64-NEXT:    movq %rbp, %r10
+; X64-NEXT:    andq %r13, %r10
+; X64-NEXT:    shlq $4, %r10
+; X64-NEXT:    andq %rax, %rbp
+; X64-NEXT:    movq %rax, %r15
+; X64-NEXT:    shrq $4, %rbp
+; X64-NEXT:    orq %r10, %rbp
+; X64-NEXT:    movq %rbp, %r10
+; X64-NEXT:    andq %r11, %r10
+; X64-NEXT:    andq %r14, %rbp
+; X64-NEXT:    shrq $2, %rbp
+; X64-NEXT:    leaq (%rbp,%r10,4), %rbp
+; X64-NEXT:    movq %rbp, %r10
+; X64-NEXT:    andq %rbx, %r10
+; X64-NEXT:    andq %rdi, %rbp
+; X64-NEXT:    shrq %rbp
+; X64-NEXT:    leaq (%rbp,%r10,2), %rbp
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; X64-NEXT:    bswapq %r10
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    andq %r13, %rax
+; X64-NEXT:    shlq $4, %rax
+; X64-NEXT:    movq %r15, %rdi
+; X64-NEXT:    andq %r15, %r10
+; X64-NEXT:    shrq $4, %r10
+; X64-NEXT:    orq %rax, %r10
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    andq %r11, %rax
+; X64-NEXT:    andq %r14, %r10
+; X64-NEXT:    shrq $2, %r10
+; X64-NEXT:    leaq (%r10,%rax,4), %rax
+; X64-NEXT:    movq %rax, %r10
+; X64-NEXT:    andq %rbx, %r10
+; X64-NEXT:    movabsq $-6148914691236517206, %r15 # imm = 0xAAAAAAAAAAAAAAAA
+; X64-NEXT:    andq %r15, %rax
+; X64-NEXT:    shrq %rax
+; X64-NEXT:    leaq (%rax,%r10,2), %r10
+; X64-NEXT:    bswapq %r9
+; X64-NEXT:    movq %r9, %rax
+; X64-NEXT:    andq %r13, %rax
+; X64-NEXT:    shlq $4, %rax
+; X64-NEXT:    andq %rdi, %r9
+; X64-NEXT:    shrq $4, %r9
+; X64-NEXT:    orq %rax, %r9
+; X64-NEXT:    movq %r9, %rax
+; X64-NEXT:    andq %r11, %rax
+; X64-NEXT:    andq %r14, %r9
+; X64-NEXT:    shrq $2, %r9
+; X64-NEXT:    leaq (%r9,%rax,4), %rax
+; X64-NEXT:    movq %rax, %r9
+; X64-NEXT:    andq %rbx, %r9
+; X64-NEXT:    andq %r15, %rax
+; X64-NEXT:    shrq %rax
+; X64-NEXT:    leaq (%rax,%r9,2), %r9
+; X64-NEXT:    bswapq %r8
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    andq %r13, %rax
+; X64-NEXT:    shlq $4, %rax
+; X64-NEXT:    andq %rdi, %r8
+; X64-NEXT:    shrq $4, %r8
+; X64-NEXT:    orq %rax, %r8
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    andq %r11, %rax
+; X64-NEXT:    andq %r14, %r8
+; X64-NEXT:    shrq $2, %r8
+; X64-NEXT:    leaq (%r8,%rax,4), %rax
+; X64-NEXT:    movq %rax, %r8
+; X64-NEXT:    andq %rbx, %r8
+; X64-NEXT:    andq %r15, %rax
+; X64-NEXT:    shrq %rax
+; X64-NEXT:    leaq (%rax,%r8,2), %r8
+; X64-NEXT:    bswapq %rcx
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    andq %r13, %rax
+; X64-NEXT:    shlq $4, %rax
+; X64-NEXT:    andq %rdi, %rcx
+; X64-NEXT:    shrq $4, %rcx
+; X64-NEXT:    orq %rax, %rcx
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    andq %r11, %rax
+; X64-NEXT:    andq %r14, %rcx
+; X64-NEXT:    shrq $2, %rcx
+; X64-NEXT:    leaq (%rcx,%rax,4), %rax
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    andq %rbx, %rcx
+; X64-NEXT:    andq %r15, %rax
+; X64-NEXT:    shrq %rax
+; X64-NEXT:    leaq (%rax,%rcx,2), %rcx
+; X64-NEXT:    bswapq %rdx
+; X64-NEXT:    movq %rdx, %rax
+; X64-NEXT:    andq %r13, %rax
+; X64-NEXT:    shlq $4, %rax
+; X64-NEXT:    andq %rdi, %rdx
+; X64-NEXT:    shrq $4, %rdx
+; X64-NEXT:    orq %rax, %rdx
+; X64-NEXT:    movq %rdx, %rax
+; X64-NEXT:    andq %r11, %rax
+; X64-NEXT:    andq %r14, %rdx
+; X64-NEXT:    shrq $2, %rdx
+; X64-NEXT:    leaq (%rdx,%rax,4), %rax
+; X64-NEXT:    movq %rax, %rdx
+; X64-NEXT:    andq %rbx, %rdx
+; X64-NEXT:    andq %r15, %rax
+; X64-NEXT:    shrq %rax
+; X64-NEXT:    leaq (%rax,%rdx,2), %rax
+; X64-NEXT:    bswapq %rsi
+; X64-NEXT:    andq %rsi, %r13
+; X64-NEXT:    andq %rdi, %rsi
+; X64-NEXT:    shlq $4, %r13
+; X64-NEXT:    shrq $4, %rsi
+; X64-NEXT:    orq %r13, %rsi
+; X64-NEXT:    andq %rsi, %r11
+; X64-NEXT:    andq %r14, %rsi
+; X64-NEXT:    shrq $2, %rsi
+; X64-NEXT:    leaq (%rsi,%r11,4), %rdx
+; X64-NEXT:    andq %rdx, %rbx
+; X64-NEXT:    andq %r15, %rdx
+; X64-NEXT:    shrq %rdx
+; X64-NEXT:    leaq (%rdx,%rbx,2), %rdx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    shrdq $48, %rdi, %rsi
+; X64-NEXT:    shrdq $48, %rbp, %rdi
+; X64-NEXT:    shrdq $48, %r10, %rbp
+; X64-NEXT:    shrdq $48, %r9, %r10
+; X64-NEXT:    shrdq $48, %r8, %r9
+; X64-NEXT:    shrdq $48, %rcx, %r8
+; X64-NEXT:    shrdq $48, %rax, %rcx
+; X64-NEXT:    shrdq $48, %rdx, %rax
+; X64-NEXT:    movq %rax, 56(%r12)
+; X64-NEXT:    movq %rcx, 48(%r12)
+; X64-NEXT:    movq %r8, 40(%r12)
+; X64-NEXT:    movq %r9, 32(%r12)
+; X64-NEXT:    movq %r10, 24(%r12)
+; X64-NEXT:    movq %rbp, 16(%r12)
+; X64-NEXT:    movq %rdi, 8(%r12)
+; X64-NEXT:    movq %rsi, (%r12)
+; X64-NEXT:    shrq $48, %rdx
+; X64-NEXT:    movw %dx, 64(%r12)
+; X64-NEXT:    movq %r12, %rax
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    popq %r12
+; X64-NEXT:    popq %r13
+; X64-NEXT:    popq %r14
+; X64-NEXT:    popq %r15
+; X64-NEXT:    popq %rbp
+; X64-NEXT:    retq
+  %Z = call i528 @llvm.bitreverse.i528(i528 %A)
+  ret i528 %Z
+}
+declare i528 @llvm.bitreverse.i528(i528)
diff --git a/test/CodeGen/X86/bswap.ll b/test/CodeGen/X86/bswap.ll
index 756dd7fa6f6..4753fc27cc0 100644
--- a/test/CodeGen/X86/bswap.ll
+++ b/test/CodeGen/X86/bswap.ll
@@ -206,3 +206,153 @@ define i64 @finally_useful_bswap() {
   ret i64 %swapped
 }
 
+; Make sure we don't assert during type legalization promoting a large
+; bswap due to the need for a large shift that won't fit in the i8 returned
+; from getShiftAmountTy.
+define i528 @large_promotion(i528 %A) nounwind {
+; CHECK-LABEL: large_promotion:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    pushl %ebx
+; CHECK-NEXT:    pushl %edi
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:    subl $44, %esp
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    bswapl %eax
+; CHECK-NEXT:    bswapl %ecx
+; CHECK-NEXT:    shrdl $16, %ecx, %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    bswapl %edx
+; CHECK-NEXT:    shrdl $16, %edx, %ecx
+; CHECK-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    bswapl %esi
+; CHECK-NEXT:    shrdl $16, %esi, %edx
+; CHECK-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    bswapl %edi
+; CHECK-NEXT:    shrdl $16, %edi, %esi
+; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    bswapl %ebx
+; CHECK-NEXT:    shrdl $16, %ebx, %edi
+; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    bswapl %ebp
+; CHECK-NEXT:    shrdl $16, %ebp, %ebx
+; CHECK-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    bswapl %ecx
+; CHECK-NEXT:    shrdl $16, %ecx, %ebp
+; CHECK-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    bswapl %eax
+; CHECK-NEXT:    shrdl $16, %eax, %ecx
+; CHECK-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    bswapl %ecx
+; CHECK-NEXT:    shrdl $16, %ecx, %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    bswapl %eax
+; CHECK-NEXT:    shrdl $16, %eax, %ecx
+; CHECK-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; CHECK-NEXT:    bswapl %ebp
+; CHECK-NEXT:    shrdl $16, %ebp, %eax
+; CHECK-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; CHECK-NEXT:    bswapl %ebx
+; CHECK-NEXT:    shrdl $16, %ebx, %ebp
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT:    bswapl %esi
+; CHECK-NEXT:    shrdl $16, %esi, %ebx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    bswapl %edx
+; CHECK-NEXT:    shrdl $16, %edx, %esi
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    bswapl %ecx
+; CHECK-NEXT:    shrdl $16, %ecx, %edx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; CHECK-NEXT:    bswapl %edi
+; CHECK-NEXT:    shrdl $16, %edi, %ecx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl %ecx, 60(%eax)
+; CHECK-NEXT:    movl %edx, 56(%eax)
+; CHECK-NEXT:    movl %esi, 52(%eax)
+; CHECK-NEXT:    movl %ebx, 48(%eax)
+; CHECK-NEXT:    movl %ebp, 44(%eax)
+; CHECK-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; CHECK-NEXT:    movl %ecx, 40(%eax)
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-NEXT:    movl %ecx, 36(%eax)
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-NEXT:    movl %ecx, 32(%eax)
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-NEXT:    movl %ecx, 28(%eax)
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-NEXT:    movl %ecx, 24(%eax)
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-NEXT:    movl %ecx, 20(%eax)
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-NEXT:    movl %ecx, 16(%eax)
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-NEXT:    movl %ecx, 12(%eax)
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-NEXT:    movl %ecx, 8(%eax)
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-NEXT:    movl %ecx, 4(%eax)
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-NEXT:    movl %ecx, (%eax)
+; CHECK-NEXT:    shrl $16, %edi
+; CHECK-NEXT:    movw %di, 64(%eax)
+; CHECK-NEXT:    addl $44, %esp
+; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    popl %edi
+; CHECK-NEXT:    popl %ebx
+; CHECK-NEXT:    popl %ebp
+; CHECK-NEXT:    retl $4
+;
+; CHECK64-LABEL: large_promotion:
+; CHECK64:       # %bb.0:
+; CHECK64-NEXT:    pushq %rbx
+; CHECK64-NEXT:    movq %rdi, %rax
+; CHECK64-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
+; CHECK64-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; CHECK64-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; CHECK64-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; CHECK64-NEXT:    bswapq %r10
+; CHECK64-NEXT:    bswapq %rdi
+; CHECK64-NEXT:    shrdq $48, %rdi, %r10
+; CHECK64-NEXT:    bswapq %r11
+; CHECK64-NEXT:    shrdq $48, %r11, %rdi
+; CHECK64-NEXT:    bswapq %rbx
+; CHECK64-NEXT:    shrdq $48, %rbx, %r11
+; CHECK64-NEXT:    bswapq %r9
+; CHECK64-NEXT:    shrdq $48, %r9, %rbx
+; CHECK64-NEXT:    bswapq %r8
+; CHECK64-NEXT:    shrdq $48, %r8, %r9
+; CHECK64-NEXT:    bswapq %rcx
+; CHECK64-NEXT:    shrdq $48, %rcx, %r8
+; CHECK64-NEXT:    bswapq %rdx
+; CHECK64-NEXT:    shrdq $48, %rdx, %rcx
+; CHECK64-NEXT:    bswapq %rsi
+; CHECK64-NEXT:    shrdq $48, %rsi, %rdx
+; CHECK64-NEXT:    shrq $48, %rsi
+; CHECK64-NEXT:    movq %rdx, 56(%rax)
+; CHECK64-NEXT:    movq %rcx, 48(%rax)
+; CHECK64-NEXT:    movq %r8, 40(%rax)
+; CHECK64-NEXT:    movq %r9, 32(%rax)
+; CHECK64-NEXT:    movq %rbx, 24(%rax)
+; CHECK64-NEXT:    movq %r11, 16(%rax)
+; CHECK64-NEXT:    movq %rdi, 8(%rax)
+; CHECK64-NEXT:    movq %r10, (%rax)
+; CHECK64-NEXT:    movw %si, 64(%rax)
+; CHECK64-NEXT:    popq %rbx
+; CHECK64-NEXT:    retq
+  %Z = call i528 @llvm.bswap.i528(i528 %A)
+  ret i528 %Z
+}
+declare i528 @llvm.bswap.i528(i528)
-- 
GitLab


From 80d4302554550430bb1a9ea4331fe49a97df4f57 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sat, 13 Oct 2018 18:40:48 +0000
Subject: [PATCH 0157/1116] Pull out repeated variables from
 SelectionDAGLegalize::ExpandBitCount.

The CTPOP case has been changed from VT.getSizeInBits to VT.getScalarSizeInBits - but this fits in with future work for vector support (PR32655) and doesn't affect any current (scalar) uses.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344461 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 3564a767a09..bb2c76a6a41 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2709,13 +2709,12 @@ SDValue SelectionDAGLegalize::ExpandBSWAP(SDValue Op, const SDLoc &dl) {
 SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op,
                                              const SDLoc &dl) {
   EVT VT = Op.getValueType();
+  EVT ShVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
+  unsigned Len = VT.getScalarSizeInBits();
 
   switch (Opc) {
   default: llvm_unreachable("Cannot expand this yet!");
   case ISD::CTPOP: {
-    EVT ShVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
-    unsigned Len = VT.getSizeInBits();
-
     assert(VT.isInteger() && Len <= 128 && Len % 8 == 0 &&
            "CTPOP not implemented for this type.");
 
@@ -2761,8 +2760,6 @@ SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op,
     // This trivially expands to CTLZ.
     return DAG.getNode(ISD::CTLZ, dl, VT, Op);
   case ISD::CTLZ: {
-    unsigned Len = VT.getScalarSizeInBits();
-
     if (TLI.isOperationLegalOrCustom(ISD::CTLZ_ZERO_UNDEF, VT)) {
       EVT SetCCVT = getSetCCResultType(VT);
       SDValue CTLZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, dl, VT, Op);
@@ -2781,7 +2778,6 @@ SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op,
     // return popcount(~x);
     //
     // Ref: "Hacker's Delight" by Henry Warren
-    EVT ShVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
     for (unsigned i = 0; (1U << i) <= (Len / 2); ++i) {
       SDValue Tmp3 = DAG.getConstant(1ULL << i, dl, ShVT);
       Op = DAG.getNode(ISD::OR, dl, VT, Op,
@@ -2794,8 +2790,6 @@ SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op,
     // This trivially expands to CTTZ.
     return DAG.getNode(ISD::CTTZ, dl, VT, Op);
   case ISD::CTTZ: {
-    unsigned Len = VT.getScalarSizeInBits();
-
     if (TLI.isOperationLegalOrCustom(ISD::CTTZ_ZERO_UNDEF, VT)) {
       EVT SetCCVT = getSetCCResultType(VT);
       SDValue CTTZ = DAG.getNode(ISD::CTTZ_ZERO_UNDEF, dl, VT, Op);
-- 
GitLab


From 128986073212af3e3bf947d29247cdbf04d7e8e8 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sat, 13 Oct 2018 21:32:49 +0000
Subject: [PATCH 0158/1116] [ARM] Regenerate popcnt tests

Improve codegen view as part of PR32655

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344465 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/ARM/popcnt.ll | 311 ++++++++++++++++++++++++++++++-------
 1 file changed, 257 insertions(+), 54 deletions(-)

diff --git a/test/CodeGen/ARM/popcnt.ll b/test/CodeGen/ARM/popcnt.ll
index fd61811f49c..224d5dcb3a6 100644
--- a/test/CodeGen/ARM/popcnt.ll
+++ b/test/CodeGen/ARM/popcnt.ll
@@ -1,17 +1,27 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
 ; Implement ctpop with vcnt
 
 define <8 x i8> @vcnt8(<8 x i8>* %A) nounwind {
-;CHECK-LABEL: vcnt8:
-;CHECK: vcnt.8 {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK-LABEL: vcnt8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vcnt.8 d16, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <8 x i8>, <8 x i8>* %A
 	%tmp2 = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %tmp1)
 	ret <8 x i8> %tmp2
 }
 
 define <16 x i8> @vcntQ8(<16 x i8>* %A) nounwind {
-;CHECK-LABEL: vcntQ8:
-;CHECK: vcnt.8 {{q[0-9]+}}, {{q[0-9]+}}
+; CHECK-LABEL: vcntQ8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vcnt.8 q8, q8
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <16 x i8>, <16 x i8>* %A
 	%tmp2 = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %tmp1)
 	ret <16 x i8> %tmp2
@@ -19,11 +29,16 @@ define <16 x i8> @vcntQ8(<16 x i8>* %A) nounwind {
 
 define <4 x i16> @vcnt16(<4 x i16>* %A) nounwind {
 ; CHECK-LABEL: vcnt16:
-; CHECK: vcnt.8 {{d[0-9]+}}, {{d[0-9]+}}
-; CHECK: vrev16.8 {{d[0-9]+}}, {{d[0-9]+}}
-; CHECK: vadd.i8 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-; CHECK: vuzp.8 {{d[0-9]+}}, {{d[0-9]+}}
-; CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vcnt.8 d16, d16
+; CHECK-NEXT:    vrev16.8 d17, d16
+; CHECK-NEXT:    vadd.i8 d16, d16, d17
+; CHECK-NEXT:    vorr d17, d16, d16
+; CHECK-NEXT:    vuzp.8 d16, d17
+; CHECK-NEXT:    vmovl.u8 q8, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <4 x i16>, <4 x i16>* %A
 	%tmp2 = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %tmp1)
 	ret <4 x i16> %tmp2
@@ -31,11 +46,17 @@ define <4 x i16> @vcnt16(<4 x i16>* %A) nounwind {
 
 define <8 x i16> @vcntQ16(<8 x i16>* %A) nounwind {
 ; CHECK-LABEL: vcntQ16:
-; CHECK: vcnt.8 {{q[0-9]+}}, {{q[0-9]+}}
-; CHECK: vrev16.8 {{q[0-9]+}}, {{q[0-9]+}}
-; CHECK: vadd.i8 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
-; CHECK: vuzp.8 {{q[0-9]+}}, {{q[0-9]+}}
-; CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vcnt.8 q8, q8
+; CHECK-NEXT:    vrev16.8 q9, q8
+; CHECK-NEXT:    vadd.i8 q8, q8, q9
+; CHECK-NEXT:    vorr q9, q8, q8
+; CHECK-NEXT:    vuzp.8 q8, q9
+; CHECK-NEXT:    vmovl.u8 q8, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <8 x i16>, <8 x i16>* %A
 	%tmp2 = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %tmp1)
 	ret <8 x i16> %tmp2
@@ -43,14 +64,21 @@ define <8 x i16> @vcntQ16(<8 x i16>* %A) nounwind {
 
 define <2 x i32> @vcnt32(<2 x i32>* %A) nounwind {
 ; CHECK-LABEL: vcnt32:
-; CHECK: vcnt.8 {{d[0-9]+}}, {{d[0-9]+}}
-; CHECK: vrev16.8 {{d[0-9]+}}, {{d[0-9]+}}
-; CHECK: vadd.i8 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-; CHECK: vuzp.8 {{d[0-9]+}}, {{d[0-9]+}}
-; CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
-; CHECK: vrev32.16 {{d[0-9]+}}, {{d[0-9]+}}
-; CHECK: vuzp.16 {{d[0-9]+}}, {{d[0-9]+}}
-; CHECK: vmovl.u16 {{q[0-9]+}}, {{d[0-9]+}}
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vcnt.8 d16, d16
+; CHECK-NEXT:    vrev16.8 d17, d16
+; CHECK-NEXT:    vadd.i8 d16, d16, d17
+; CHECK-NEXT:    vorr d17, d16, d16
+; CHECK-NEXT:    vuzp.8 d16, d17
+; CHECK-NEXT:    vmovl.u8 q8, d16
+; CHECK-NEXT:    vrev32.16 d18, d16
+; CHECK-NEXT:    vadd.i16 d16, d16, d18
+; CHECK-NEXT:    vorr d17, d16, d16
+; CHECK-NEXT:    vuzp.16 d16, d17
+; CHECK-NEXT:    vmovl.u16 q8, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <2 x i32>, <2 x i32>* %A
 	%tmp2 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %tmp1)
 	ret <2 x i32> %tmp2
@@ -58,14 +86,22 @@ define <2 x i32> @vcnt32(<2 x i32>* %A) nounwind {
 
 define <4 x i32> @vcntQ32(<4 x i32>* %A) nounwind {
 ; CHECK-LABEL: vcntQ32:
-; CHECK: vcnt.8 {{q[0-9]+}}, {{q[0-9]+}}
-; CHECK: vrev16.8 {{q[0-9]+}}, {{q[0-9]+}}
-; CHECK: vadd.i8 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
-; CHECK: vuzp.8 {{q[0-9]+}}, {{q[0-9]+}}
-; CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
-; CHECK: vrev32.16 {{q[0-9]+}}, {{q[0-9]+}}
-; CHECK: vuzp.16 {{q[0-9]+}}, {{q[0-9]+}}
-; CHECK: vmovl.u16 {{q[0-9]+}}, {{d[0-9]+}}
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vcnt.8 q8, q8
+; CHECK-NEXT:    vrev16.8 q9, q8
+; CHECK-NEXT:    vadd.i8 q8, q8, q9
+; CHECK-NEXT:    vorr q9, q8, q8
+; CHECK-NEXT:    vuzp.8 q8, q9
+; CHECK-NEXT:    vmovl.u8 q9, d16
+; CHECK-NEXT:    vrev32.16 q9, q9
+; CHECK-NEXT:    vaddw.u8 q8, q9, d16
+; CHECK-NEXT:    vorr q9, q8, q8
+; CHECK-NEXT:    vuzp.16 q8, q9
+; CHECK-NEXT:    vmovl.u16 q8, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <4 x i32>, <4 x i32>* %A
 	%tmp2 = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %tmp1)
 	ret <4 x i32> %tmp2
@@ -73,6 +109,51 @@ define <4 x i32> @vcntQ32(<4 x i32>* %A) nounwind {
 
 define <1 x i64> @vcnt64(<1 x i64>* %A) nounwind {
 ; CHECK-LABEL: vcnt64:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    ldr r2, .LCPI6_0
+; CHECK-NEXT:    vmov.32 r0, d16[0]
+; CHECK-NEXT:    ldr r3, .LCPI6_3
+; CHECK-NEXT:    vmov.32 r1, d16[1]
+; CHECK-NEXT:    ldr lr, .LCPI6_2
+; CHECK-NEXT:    ldr r12, .LCPI6_1
+; CHECK-NEXT:    vldr s1, .LCPI6_4
+; CHECK-NEXT:    and r4, r2, r0, lsr #1
+; CHECK-NEXT:    sub r0, r0, r4
+; CHECK-NEXT:    and r2, r2, r1, lsr #1
+; CHECK-NEXT:    sub r1, r1, r2
+; CHECK-NEXT:    and r4, r0, r3
+; CHECK-NEXT:    and r0, r3, r0, lsr #2
+; CHECK-NEXT:    and r2, r1, r3
+; CHECK-NEXT:    add r0, r4, r0
+; CHECK-NEXT:    and r1, r3, r1, lsr #2
+; CHECK-NEXT:    add r1, r2, r1
+; CHECK-NEXT:    add r0, r0, r0, lsr #4
+; CHECK-NEXT:    and r0, r0, lr
+; CHECK-NEXT:    add r1, r1, r1, lsr #4
+; CHECK-NEXT:    mul r2, r0, r12
+; CHECK-NEXT:    and r0, r1, lr
+; CHECK-NEXT:    mul r1, r0, r12
+; CHECK-NEXT:    lsr r0, r2, #24
+; CHECK-NEXT:    add r0, r0, r1, lsr #24
+; CHECK-NEXT:    vmov s0, r0
+; CHECK-NEXT:    vmov r0, r1, d0
+; CHECK-NEXT:    pop {r4, lr}
+; CHECK-NEXT:    mov pc, lr
+; CHECK-NEXT:    .p2align 2
+; CHECK-NEXT:  @ %bb.1:
+; CHECK-NEXT:  .LCPI6_0:
+; CHECK-NEXT:    .long 1431655765 @ 0x55555555
+; CHECK-NEXT:  .LCPI6_1:
+; CHECK-NEXT:    .long 16843009 @ 0x1010101
+; CHECK-NEXT:  .LCPI6_2:
+; CHECK-NEXT:    .long 252645135 @ 0xf0f0f0f
+; CHECK-NEXT:  .LCPI6_3:
+; CHECK-NEXT:    .long 858993459 @ 0x33333333
+; CHECK-NEXT:  .LCPI6_4:
+; CHECK-NEXT:    .long 0 @ float 0
 	%tmp1 = load <1 x i64>, <1 x i64>* %A
 	%tmp2 = call <1 x i64> @llvm.ctpop.v1i64(<1 x i64> %tmp1)
 	ret <1 x i64> %tmp2
@@ -80,6 +161,74 @@ define <1 x i64> @vcnt64(<1 x i64>* %A) nounwind {
 
 define <2 x i64> @vcntQ64(<2 x i64>* %A) nounwind {
 ; CHECK-LABEL: vcntQ64:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vmov.32 r1, d17[1]
+; CHECK-NEXT:    ldr lr, .LCPI7_0
+; CHECK-NEXT:    vmov.32 r2, d17[0]
+; CHECK-NEXT:    ldr r0, .LCPI7_2
+; CHECK-NEXT:    vmov.32 r3, d16[0]
+; CHECK-NEXT:    ldr r12, .LCPI7_1
+; CHECK-NEXT:    ldr r5, .LCPI7_3
+; CHECK-NEXT:    vldr s3, .LCPI7_4
+; CHECK-NEXT:    and r4, lr, r1, lsr #1
+; CHECK-NEXT:    sub r1, r1, r4
+; CHECK-NEXT:    and r4, r1, r0
+; CHECK-NEXT:    and r1, r0, r1, lsr #2
+; CHECK-NEXT:    add r1, r4, r1
+; CHECK-NEXT:    and r4, lr, r2, lsr #1
+; CHECK-NEXT:    sub r2, r2, r4
+; CHECK-NEXT:    and r4, r2, r0
+; CHECK-NEXT:    add r1, r1, r1, lsr #4
+; CHECK-NEXT:    and r2, r0, r2, lsr #2
+; CHECK-NEXT:    and r6, r1, r12
+; CHECK-NEXT:    add r2, r4, r2
+; CHECK-NEXT:    and r4, lr, r3, lsr #1
+; CHECK-NEXT:    sub r3, r3, r4
+; CHECK-NEXT:    and r4, r3, r0
+; CHECK-NEXT:    add r2, r2, r2, lsr #4
+; CHECK-NEXT:    and r3, r0, r3, lsr #2
+; CHECK-NEXT:    and r2, r2, r12
+; CHECK-NEXT:    add r3, r4, r3
+; CHECK-NEXT:    add r3, r3, r3, lsr #4
+; CHECK-NEXT:    and r3, r3, r12
+; CHECK-NEXT:    mul r4, r3, r5
+; CHECK-NEXT:    vmov.32 r3, d16[1]
+; CHECK-NEXT:    and r1, lr, r3, lsr #1
+; CHECK-NEXT:    sub r1, r3, r1
+; CHECK-NEXT:    and r3, r1, r0
+; CHECK-NEXT:    and r0, r0, r1, lsr #2
+; CHECK-NEXT:    mul r1, r2, r5
+; CHECK-NEXT:    add r0, r3, r0
+; CHECK-NEXT:    mul r2, r6, r5
+; CHECK-NEXT:    add r0, r0, r0, lsr #4
+; CHECK-NEXT:    and r0, r0, r12
+; CHECK-NEXT:    mul r3, r0, r5
+; CHECK-NEXT:    lsr r0, r1, #24
+; CHECK-NEXT:    lsr r1, r4, #24
+; CHECK-NEXT:    add r0, r0, r2, lsr #24
+; CHECK-NEXT:    vmov s2, r0
+; CHECK-NEXT:    add r0, r1, r3, lsr #24
+; CHECK-NEXT:    vmov s0, r0
+; CHECK-NEXT:    vmov.f32 s1, s3
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    vmov r0, r1, d0
+; CHECK-NEXT:    pop {r4, r5, r6, lr}
+; CHECK-NEXT:    mov pc, lr
+; CHECK-NEXT:    .p2align 2
+; CHECK-NEXT:  @ %bb.1:
+; CHECK-NEXT:  .LCPI7_0:
+; CHECK-NEXT:    .long 1431655765 @ 0x55555555
+; CHECK-NEXT:  .LCPI7_1:
+; CHECK-NEXT:    .long 252645135 @ 0xf0f0f0f
+; CHECK-NEXT:  .LCPI7_2:
+; CHECK-NEXT:    .long 858993459 @ 0x33333333
+; CHECK-NEXT:  .LCPI7_3:
+; CHECK-NEXT:    .long 16843009 @ 0x1010101
+; CHECK-NEXT:  .LCPI7_4:
+; CHECK-NEXT:    .long 0 @ float 0
 	%tmp1 = load <2 x i64>, <2 x i64>* %A
 	%tmp2 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %tmp1)
 	ret <2 x i64> %tmp2
@@ -95,48 +244,75 @@ declare <1 x i64> @llvm.ctpop.v1i64(<1 x i64>) nounwind readnone
 declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) nounwind readnone
 
 define <8 x i8> @vclz8(<8 x i8>* %A) nounwind {
-;CHECK-LABEL: vclz8:
-;CHECK: vclz.i8 {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK-LABEL: vclz8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vclz.i8 d16, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <8 x i8>, <8 x i8>* %A
 	%tmp2 = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %tmp1, i1 0)
 	ret <8 x i8> %tmp2
 }
 
 define <4 x i16> @vclz16(<4 x i16>* %A) nounwind {
-;CHECK-LABEL: vclz16:
-;CHECK: vclz.i16 {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK-LABEL: vclz16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vclz.i16 d16, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <4 x i16>, <4 x i16>* %A
 	%tmp2 = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %tmp1, i1 0)
 	ret <4 x i16> %tmp2
 }
 
 define <2 x i32> @vclz32(<2 x i32>* %A) nounwind {
-;CHECK-LABEL: vclz32:
-;CHECK: vclz.i32 {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK-LABEL: vclz32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vclz.i32 d16, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <2 x i32>, <2 x i32>* %A
 	%tmp2 = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %tmp1, i1 0)
 	ret <2 x i32> %tmp2
 }
 
 define <16 x i8> @vclzQ8(<16 x i8>* %A) nounwind {
-;CHECK-LABEL: vclzQ8:
-;CHECK: vclz.i8 {{q[0-9]+}}, {{q[0-9]+}}
+; CHECK-LABEL: vclzQ8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vclz.i8 q8, q8
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <16 x i8>, <16 x i8>* %A
 	%tmp2 = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %tmp1, i1 0)
 	ret <16 x i8> %tmp2
 }
 
 define <8 x i16> @vclzQ16(<8 x i16>* %A) nounwind {
-;CHECK-LABEL: vclzQ16:
-;CHECK: vclz.i16 {{q[0-9]+}}, {{q[0-9]+}}
+; CHECK-LABEL: vclzQ16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vclz.i16 q8, q8
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <8 x i16>, <8 x i16>* %A
 	%tmp2 = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %tmp1, i1 0)
 	ret <8 x i16> %tmp2
 }
 
 define <4 x i32> @vclzQ32(<4 x i32>* %A) nounwind {
-;CHECK-LABEL: vclzQ32:
-;CHECK: vclz.i32 {{q[0-9]+}}, {{q[0-9]+}}
+; CHECK-LABEL: vclzQ32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vclz.i32 q8, q8
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <4 x i32>, <4 x i32>* %A
 	%tmp2 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %tmp1, i1 0)
 	ret <4 x i32> %tmp2
@@ -151,48 +327,75 @@ declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1) nounwind readnone
 declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone
 
 define <8 x i8> @vclss8(<8 x i8>* %A) nounwind {
-;CHECK-LABEL: vclss8:
-;CHECK: vcls.s8
+; CHECK-LABEL: vclss8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vcls.s8 d16, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <8 x i8>, <8 x i8>* %A
 	%tmp2 = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %tmp1)
 	ret <8 x i8> %tmp2
 }
 
 define <4 x i16> @vclss16(<4 x i16>* %A) nounwind {
-;CHECK-LABEL: vclss16:
-;CHECK: vcls.s16
+; CHECK-LABEL: vclss16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vcls.s16 d16, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <4 x i16>, <4 x i16>* %A
 	%tmp2 = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %tmp1)
 	ret <4 x i16> %tmp2
 }
 
 define <2 x i32> @vclss32(<2 x i32>* %A) nounwind {
-;CHECK-LABEL: vclss32:
-;CHECK: vcls.s32
+; CHECK-LABEL: vclss32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vcls.s32 d16, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <2 x i32>, <2 x i32>* %A
 	%tmp2 = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %tmp1)
 	ret <2 x i32> %tmp2
 }
 
 define <16 x i8> @vclsQs8(<16 x i8>* %A) nounwind {
-;CHECK-LABEL: vclsQs8:
-;CHECK: vcls.s8
+; CHECK-LABEL: vclsQs8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vcls.s8 q8, q8
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <16 x i8>, <16 x i8>* %A
 	%tmp2 = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %tmp1)
 	ret <16 x i8> %tmp2
 }
 
 define <8 x i16> @vclsQs16(<8 x i16>* %A) nounwind {
-;CHECK-LABEL: vclsQs16:
-;CHECK: vcls.s16
+; CHECK-LABEL: vclsQs16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vcls.s16 q8, q8
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <8 x i16>, <8 x i16>* %A
 	%tmp2 = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %tmp1)
 	ret <8 x i16> %tmp2
 }
 
 define <4 x i32> @vclsQs32(<4 x i32>* %A) nounwind {
-;CHECK-LABEL: vclsQs32:
-;CHECK: vcls.s32
+; CHECK-LABEL: vclsQs32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vcls.s32 q8, q8
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <4 x i32>, <4 x i32>* %A
 	%tmp2 = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %tmp1)
 	ret <4 x i32> %tmp2
-- 
GitLab


From cc018b73f8c92f38cf715249d13015b141ebe458 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sat, 13 Oct 2018 21:50:15 +0000
Subject: [PATCH 0159/1116] [AARCH64] Regenerate popcnt tests

Improve codegen view as part of PR32655

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344466 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/AArch64/arm64-vpopcnt.ll | 157 +++++++++++++++++++++++---
 1 file changed, 141 insertions(+), 16 deletions(-)

diff --git a/test/CodeGen/AArch64/arm64-vpopcnt.ll b/test/CodeGen/AArch64/arm64-vpopcnt.ll
index 4fb73ca4805..0c223ced9ac 100644
--- a/test/CodeGen/AArch64/arm64-vpopcnt.ll
+++ b/test/CodeGen/AArch64/arm64-vpopcnt.ll
@@ -1,65 +1,190 @@
-; RUN: llc < %s -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-apple- -mcpu=cyclone | FileCheck %s
 
 ; The non-byte ones used to fail with "Cannot select"
 
-; CHECK-LABEL: ctpopv8i8
-; CHECK: cnt.8b
 define <8 x i8> @ctpopv8i8(<8 x i8> %x) nounwind readnone {
+; CHECK-LABEL: ctpopv8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cnt v0.8b, v0.8b
+; CHECK-NEXT:    ret
   %cnt = tail call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %x)
   ret <8 x i8> %cnt
 }
 
 declare <8 x i8> @llvm.ctpop.v8i8(<8 x i8>) nounwind readnone
 
-; CHECK-LABEL: ctpopv4i16
-; CHECK: cnt.8b
 define <4 x i16> @ctpopv4i16(<4 x i16> %x) nounwind readnone {
+; CHECK-LABEL: ctpopv4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    umov w8, v0.h[0]
+; CHECK-NEXT:    fmov d1, x8
+; CHECK-NEXT:    cnt v1.8b, v1.8b
+; CHECK-NEXT:    uaddlv h1, v1.8b
+; CHECK-NEXT:    umov w8, v0.h[1]
+; CHECK-NEXT:    fmov d2, x8
+; CHECK-NEXT:    cnt v2.8b, v2.8b
+; CHECK-NEXT:    uaddlv h2, v2.8b
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    mov v1.h[1], w8
+; CHECK-NEXT:    umov w8, v0.h[2]
+; CHECK-NEXT:    fmov d2, x8
+; CHECK-NEXT:    cnt v2.8b, v2.8b
+; CHECK-NEXT:    uaddlv h2, v2.8b
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    mov v1.h[2], w8
+; CHECK-NEXT:    umov w8, v0.h[3]
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    cnt v0.8b, v0.8b
+; CHECK-NEXT:    uaddlv h0, v0.8b
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    mov v1.h[3], w8
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %cnt = tail call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %x)
   ret <4 x i16> %cnt
 }
 
 declare <4 x i16> @llvm.ctpop.v4i16(<4 x i16>) nounwind readnone
 
-; CHECK-LABEL: ctpopv2i32
-; CHECK: cnt.8b
 define <2 x i32> @ctpopv2i32(<2 x i32> %x) nounwind readnone {
+; CHECK-LABEL: ctpopv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov w8, v0.s[1]
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    fmov d0, x0
+; CHECK-NEXT:    cnt v0.8b, v0.8b
+; CHECK-NEXT:    uaddlv h0, v0.8b
+; CHECK-NEXT:    fmov d1, x8
+; CHECK-NEXT:    cnt v1.8b, v1.8b
+; CHECK-NEXT:    uaddlv h1, v1.8b
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    mov v0.s[1], w8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
   %cnt = tail call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %x)
   ret <2 x i32> %cnt
 }
 
 declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>) nounwind readnone
 
-
-; CHECK-LABEL: ctpopv16i8
-; CHECK: cnt.16b
 define <16 x i8> @ctpopv16i8(<16 x i8> %x) nounwind readnone {
+; CHECK-LABEL: ctpopv16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cnt v0.16b, v0.16b
+; CHECK-NEXT:    ret
   %cnt = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %x)
   ret <16 x i8> %cnt
 }
 
 declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>) nounwind readnone
 
-; CHECK-LABEL: ctpopv8i16
-; CHECK: cnt.8b
 define <8 x i16> @ctpopv8i16(<8 x i16> %x) nounwind readnone {
+; CHECK-LABEL: ctpopv8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    umov w8, v0.h[1]
+; CHECK-NEXT:    fmov d1, x8
+; CHECK-NEXT:    cnt v1.8b, v1.8b
+; CHECK-NEXT:    uaddlv h1, v1.8b
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    umov w9, v0.h[0]
+; CHECK-NEXT:    fmov d1, x9
+; CHECK-NEXT:    cnt v1.8b, v1.8b
+; CHECK-NEXT:    uaddlv h1, v1.8b
+; CHECK-NEXT:    mov v1.h[1], w8
+; CHECK-NEXT:    umov w8, v0.h[2]
+; CHECK-NEXT:    fmov d2, x8
+; CHECK-NEXT:    cnt v2.8b, v2.8b
+; CHECK-NEXT:    uaddlv h2, v2.8b
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    mov v1.h[2], w8
+; CHECK-NEXT:    umov w8, v0.h[3]
+; CHECK-NEXT:    fmov d2, x8
+; CHECK-NEXT:    cnt v2.8b, v2.8b
+; CHECK-NEXT:    uaddlv h2, v2.8b
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    mov v1.h[3], w8
+; CHECK-NEXT:    umov w8, v0.h[4]
+; CHECK-NEXT:    fmov d2, x8
+; CHECK-NEXT:    cnt v2.8b, v2.8b
+; CHECK-NEXT:    uaddlv h2, v2.8b
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    mov v1.h[4], w8
+; CHECK-NEXT:    umov w8, v0.h[5]
+; CHECK-NEXT:    fmov d2, x8
+; CHECK-NEXT:    cnt v2.8b, v2.8b
+; CHECK-NEXT:    uaddlv h2, v2.8b
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    mov v1.h[5], w8
+; CHECK-NEXT:    umov w8, v0.h[6]
+; CHECK-NEXT:    fmov d2, x8
+; CHECK-NEXT:    cnt v2.8b, v2.8b
+; CHECK-NEXT:    uaddlv h2, v2.8b
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    mov v1.h[6], w8
+; CHECK-NEXT:    umov w8, v0.h[7]
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    cnt v0.8b, v0.8b
+; CHECK-NEXT:    uaddlv h0, v0.8b
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    mov v1.h[7], w8
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %cnt = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %x)
   ret <8 x i16> %cnt
 }
 
 declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>) nounwind readnone
 
-; CHECK-LABEL: ctpopv4i32
-; CHECK: cnt.8b
 define <4 x i32> @ctpopv4i32(<4 x i32> %x) nounwind readnone {
+; CHECK-LABEL: ctpopv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, v0.s[1]
+; CHECK-NEXT:    mov w9, v0.s[2]
+; CHECK-NEXT:    mov w10, v0.s[3]
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    fmov d0, x0
+; CHECK-NEXT:    cnt v0.8b, v0.8b
+; CHECK-NEXT:    uaddlv h0, v0.8b
+; CHECK-NEXT:    fmov d1, x8
+; CHECK-NEXT:    cnt v1.8b, v1.8b
+; CHECK-NEXT:    uaddlv h1, v1.8b
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    mov v0.s[1], w8
+; CHECK-NEXT:    fmov d1, x9
+; CHECK-NEXT:    cnt v1.8b, v1.8b
+; CHECK-NEXT:    uaddlv h1, v1.8b
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    mov v0.s[2], w8
+; CHECK-NEXT:    fmov d1, x10
+; CHECK-NEXT:    cnt v1.8b, v1.8b
+; CHECK-NEXT:    uaddlv h1, v1.8b
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    mov v0.s[3], w8
+; CHECK-NEXT:    ret
   %cnt = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %x)
   ret <4 x i32> %cnt
 }
 
 declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) nounwind readnone
 
-; CHECK-LABEL: ctpopv2i64
-; CHECK: cnt.8b
 define <2 x i64> @ctpopv2i64(<2 x i64> %x) nounwind readnone {
+; CHECK-LABEL: ctpopv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cnt v1.8b, v0.8b
+; CHECK-NEXT:    uaddlv h1, v1.8b
+; CHECK-NEXT:    fmov w0, s1
+; CHECK-NEXT:    fmov d1, x0
+; CHECK-NEXT:    mov x8, v0.d[1]
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    cnt v0.8b, v0.8b
+; CHECK-NEXT:    uaddlv h0, v0.8b
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    mov v1.d[1], x8
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
   %cnt = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %x)
   ret <2 x i64> %cnt
 }
-- 
GitLab


From 54d4881c352796b18bfe7314662a294754e3a752 Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Sat, 13 Oct 2018 21:53:40 +0000
Subject: [PATCH 0160/1116] [ORC] During lookup, do not match against hidden
 symbols in other JITDylibs.

This adds two arguments to the main ExecutionSession::lookup method:
MatchNonExportedInJD, and MatchNonExported. These control whether and where
hidden symbols should be matched when searching a list of JITDylibs.

A similar effect could have been achieved by filtering search results, but
this would have involved materializing symbol definitions (since materialization
is triggered on lookup) only to throw the results away, among other issues.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344467 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/ExecutionEngine/Orc/Core.h       | 56 ++++++-----
 lib/ExecutionEngine/Orc/Core.cpp              | 94 +++++++++++--------
 lib/ExecutionEngine/Orc/ExecutionUtils.cpp    |  5 +-
 lib/ExecutionEngine/Orc/IndirectionUtils.cpp  |  3 +-
 lib/ExecutionEngine/Orc/LLJIT.cpp             |  2 +-
 lib/ExecutionEngine/Orc/LazyReexports.cpp     |  4 +-
 .../Orc/RTDyldObjectLinkingLayer.cpp          |  2 +-
 .../ExecutionEngine/Orc/CoreAPIsTest.cpp      | 60 +++++++-----
 8 files changed, 129 insertions(+), 97 deletions(-)

diff --git a/include/llvm/ExecutionEngine/Orc/Core.h b/include/llvm/ExecutionEngine/Orc/Core.h
index f3ea2aef620..24cdeeae42e 100644
--- a/include/llvm/ExecutionEngine/Orc/Core.h
+++ b/include/llvm/ExecutionEngine/Orc/Core.h
@@ -628,10 +628,12 @@ private:
                                 const SymbolNameSet &Names);
 
   void lodgeQuery(std::shared_ptr<AsynchronousSymbolQuery> &Q,
-                  SymbolNameSet &Unresolved, MaterializationUnitList &MUs);
+                  SymbolNameSet &Unresolved, JITDylib *MatchNonExportedInJD,
+                  bool MatchNonExported, MaterializationUnitList &MUs);
 
   void lodgeQueryImpl(std::shared_ptr<AsynchronousSymbolQuery> &Q,
-                      SymbolNameSet &Unresolved, MaterializationUnitList &MUs);
+                      SymbolNameSet &Unresolved, JITDylib *MatchNonExportedInJD,
+                      bool MatchNonExported, MaterializationUnitList &MUs);
 
   LookupImplActionFlags
   lookupImpl(std::shared_ptr<AsynchronousSymbolQuery> &Q,
@@ -766,9 +768,19 @@ public:
   /// dependenant symbols for this query (e.g. it is being made by a top level
   /// client to get an address to call) then the value NoDependenciesToRegister
   /// can be used.
+  ///
+  /// If the MatchNonExportedInJD pointer is non-null, then the lookup will find
+  /// non-exported symbols defined in the JITDylib pointed to by
+  /// MatchNonExportedInJD.
+  /// If MatchNonExported is true the lookup will find non-exported symbols in
+  /// any JITDylib (setting MatchNonExportedInJD is redundant in such cases).
+  /// If MatchNonExported is false and MatchNonExportedInJD is null,
+  /// non-exported symbols will never be found.
   void lookup(const JITDylibList &JDs, SymbolNameSet Symbols,
               SymbolsResolvedCallback OnResolve, SymbolsReadyCallback OnReady,
-              RegisterDependenciesFunction RegisterDependencies);
+              RegisterDependenciesFunction RegisterDependencies,
+              JITDylib *MatchNonExportedInJD = nullptr,
+              bool MatchNonExported = false);
 
   /// Blocking version of lookup above. Returns the resolved symbol map.
   /// If WaitUntilReady is true (the default), will not return until all
@@ -779,18 +791,22 @@ public:
   /// error will be reported via reportErrors.
   Expected<SymbolMap> lookup(const JITDylibList &JDs,
                              const SymbolNameSet &Symbols,
-                             RegisterDependenciesFunction RegisterDependencies,
-                             bool WaitUntilReady = true);
-
-  /// Convenience version of the blocking version of lookup above. Uses the main
-  /// JITDylib's search order as the lookup order, and registers no
-  /// dependencies.
-  Expected<SymbolMap> lookup(const SymbolNameSet &Symbols) {
-    return getMainJITDylib().withSearchOrderDo(
-        [&](const JITDylibList &SearchOrder) {
-          return lookup(SearchOrder, Symbols, NoDependenciesToRegister, true);
-        });
-  }
+                             RegisterDependenciesFunction RegisterDependencies =
+                                 NoDependenciesToRegister,
+                             bool WaitUntilReady = true,
+                             JITDylib *MatchNonExportedInJD = nullptr,
+                             bool MatchNonExported = false);
+
+  /// Convenience version of blocking lookup.
+  /// Performs a single-symbol lookup.
+  Expected<JITEvaluatedSymbol> lookup(const JITDylibList &JDs,
+                                      SymbolStringPtr Symbol,
+                                      bool MatchNonExported = false);
+
+  /// Convenience version of blocking lookup.
+  /// Performs a single-symbol lookup, auto-interning the given symbol name.
+  Expected<JITEvaluatedSymbol> lookup(const JITDylibList &JDs, StringRef Symbol,
+                                      bool MatchNonExported = false);
 
   /// Materialize the given unit.
   void dispatchMaterialization(JITDylib &JD,
@@ -873,16 +889,6 @@ Error JITDylib::define(std::unique_ptr<MaterializationUnitType> &MU) {
   });
 }
 
-/// Look up the given names in the given JITDylibs.
-/// JDs will be searched in order and no JITDylib pointer may be null.
-/// All symbols must be found within the given JITDylibs or an error
-/// will be returned.
-Expected<SymbolMap> lookup(const JITDylibList &JDs, SymbolNameSet Names);
-
-/// Look up a symbol by searching a list of JITDylibs.
-Expected<JITEvaluatedSymbol> lookup(const JITDylibList &JDs,
-                                    SymbolStringPtr Name);
-
 /// Mangles symbol names then uniques them in the context of an
 /// ExecutionSession.
 class MangleAndInterner {
diff --git a/lib/ExecutionEngine/Orc/Core.cpp b/lib/ExecutionEngine/Orc/Core.cpp
index 86a7ecaaf07..c9cfacef61b 100644
--- a/lib/ExecutionEngine/Orc/Core.cpp
+++ b/lib/ExecutionEngine/Orc/Core.cpp
@@ -646,7 +646,7 @@ void ReExportsMaterializationUnit::materialize(
     auto OnReady = [&ES](Error Err) { ES.reportError(std::move(Err)); };
 
     ES.lookup({&SrcJD}, QuerySymbols, std::move(OnResolve), std::move(OnReady),
-              std::move(RegisterDependencies));
+              std::move(RegisterDependencies), nullptr, true);
   }
 }
 
@@ -1151,16 +1151,18 @@ SymbolNameSet JITDylib::lookupFlagsImpl(SymbolFlagsMap &Flags,
 
 void JITDylib::lodgeQuery(std::shared_ptr<AsynchronousSymbolQuery> &Q,
                           SymbolNameSet &Unresolved,
+                          JITDylib *MatchNonExportedInJD, bool MatchNonExported,
                           MaterializationUnitList &MUs) {
   assert(Q && "Query can not be null");
 
-  lodgeQueryImpl(Q, Unresolved, MUs);
+  lodgeQueryImpl(Q, Unresolved, MatchNonExportedInJD, MatchNonExported, MUs);
   if (FallbackDefinitionGenerator && !Unresolved.empty()) {
     auto FallbackDefs = FallbackDefinitionGenerator(*this, Unresolved);
     if (!FallbackDefs.empty()) {
       for (auto &D : FallbackDefs)
         Unresolved.erase(D);
-      lodgeQueryImpl(Q, FallbackDefs, MUs);
+      lodgeQueryImpl(Q, FallbackDefs, MatchNonExportedInJD, MatchNonExported,
+                     MUs);
       assert(FallbackDefs.empty() &&
              "All fallback defs should have been found by lookupImpl");
     }
@@ -1169,6 +1171,7 @@ void JITDylib::lodgeQuery(std::shared_ptr<AsynchronousSymbolQuery> &Q,
 
 void JITDylib::lodgeQueryImpl(
     std::shared_ptr<AsynchronousSymbolQuery> &Q, SymbolNameSet &Unresolved,
+    JITDylib *MatchNonExportedInJD, bool MatchNonExported,
     std::vector<std::unique_ptr<MaterializationUnit>> &MUs) {
   for (auto I = Unresolved.begin(), E = Unresolved.end(); I != E;) {
     auto TmpI = I++;
@@ -1179,8 +1182,15 @@ void JITDylib::lodgeQueryImpl(
     if (SymI == Symbols.end())
       continue;
 
-    // If we found Name in JD, remove it frome the Unresolved set and add it
-    // to the added set.
+    // If this is a non-exported symbol, then check the values of
+    // MatchNonExportedInJD and MatchNonExported. Skip if we should not match
+    // against this symbol.
+    if (!SymI->second.getFlags().isExported())
+      if (!MatchNonExported && MatchNonExportedInJD != this)
+        continue;
+
+    // If we matched against Name in JD, remove it frome the Unresolved set and
+    // add it to the added set.
     Unresolved.erase(TmpI);
 
     // If the symbol has an address then resolve it.
@@ -1695,18 +1705,20 @@ Expected<SymbolMap> ExecutionSession::legacyLookup(
 #endif
 }
 
-void ExecutionSession::lookup(
-    const JITDylibList &JDs, SymbolNameSet Symbols,
-    SymbolsResolvedCallback OnResolve, SymbolsReadyCallback OnReady,
-    RegisterDependenciesFunction RegisterDependencies) {
+void ExecutionSession::lookup(const JITDylibList &JDs, SymbolNameSet Symbols,
+                              SymbolsResolvedCallback OnResolve,
+                              SymbolsReadyCallback OnReady,
+                              RegisterDependenciesFunction RegisterDependencies,
+                              JITDylib *MatchNonExportedInJD,
+                              bool MatchNonExported) {
 
   // lookup can be re-entered recursively if running on a single thread. Run any
-  // outstanding MUs in case this query depends on them, otherwise the main
-  // thread will starve waiting for a result from an MU that it failed to run.
+  // outstanding MUs in case this query depends on them, otherwise this lookup
+  // will starve waiting for a result from an MU that is stuck in the queue.
   runOutstandingMUs();
 
   auto Unresolved = std::move(Symbols);
-  std::map<JITDylib *, MaterializationUnitList> MUsMap;
+  std::map<JITDylib *, MaterializationUnitList> CollectedMUsMap;
   auto Q = std::make_shared<AsynchronousSymbolQuery>(
       Unresolved, std::move(OnResolve), std::move(OnReady));
   bool QueryIsFullyResolved = false;
@@ -1716,9 +1728,10 @@ void ExecutionSession::lookup(
   runSessionLocked([&]() {
     for (auto *JD : JDs) {
       assert(JD && "JITDylibList entries must not be null");
-      assert(!MUsMap.count(JD) &&
+      assert(!CollectedMUsMap.count(JD) &&
              "JITDylibList should not contain duplicate entries");
-      JD->lodgeQuery(Q, Unresolved, MUsMap[JD]);
+      JD->lodgeQuery(Q, Unresolved, MatchNonExportedInJD, MatchNonExported,
+                     CollectedMUsMap[JD]);
     }
 
     if (Unresolved.empty()) {
@@ -1741,7 +1754,7 @@ void ExecutionSession::lookup(
       Q->detach();
 
       // Replace the MUs.
-      for (auto &KV : MUsMap)
+      for (auto &KV : CollectedMUsMap)
         for (auto &MU : KV.second)
           KV.first->replace(std::move(MU));
     }
@@ -1761,7 +1774,7 @@ void ExecutionSession::lookup(
   {
     std::lock_guard<std::recursive_mutex> Lock(OutstandingMUsMutex);
 
-    for (auto &KV : MUsMap)
+    for (auto &KV : CollectedMUsMap)
       for (auto &MU : KV.second)
         OutstandingMUs.push_back(std::make_pair(KV.first, std::move(MU)));
   }
@@ -1772,7 +1785,8 @@ void ExecutionSession::lookup(
 Expected<SymbolMap>
 ExecutionSession::lookup(const JITDylibList &JDs, const SymbolNameSet &Symbols,
                          RegisterDependenciesFunction RegisterDependencies,
-                         bool WaitUntilReady) {
+                         bool WaitUntilReady, JITDylib *MatchNonExportedInJD,
+                         bool MatchNonExported) {
 #if LLVM_ENABLE_THREADS
   // In the threaded case we use promises to return the results.
   std::promise<SymbolMap> PromisedResult;
@@ -1839,7 +1853,8 @@ ExecutionSession::lookup(const JITDylibList &JDs, const SymbolNameSet &Symbols,
 #endif
 
   // Perform the asynchronous lookup.
-  lookup(JDs, Symbols, OnResolve, OnReady, RegisterDependencies);
+  lookup(JDs, Symbols, OnResolve, OnReady, RegisterDependencies,
+         MatchNonExportedInJD, MatchNonExported);
 
 #if LLVM_ENABLE_THREADS
   auto ResultFuture = PromisedResult.get_future();
@@ -1882,6 +1897,27 @@ ExecutionSession::lookup(const JITDylibList &JDs, const SymbolNameSet &Symbols,
 #endif
 }
 
+/// Look up a symbol by searching a list of JDs.
+Expected<JITEvaluatedSymbol> ExecutionSession::lookup(const JITDylibList &JDs,
+                                                      SymbolStringPtr Name,
+                                                      bool MatchNonExported) {
+  SymbolNameSet Names({Name});
+
+  if (auto ResultMap = lookup(JDs, std::move(Names), NoDependenciesToRegister,
+                              true, nullptr, MatchNonExported)) {
+    assert(ResultMap->size() == 1 && "Unexpected number of results");
+    assert(ResultMap->count(Name) && "Missing result for symbol");
+    return std::move(ResultMap->begin()->second);
+  } else
+    return ResultMap.takeError();
+}
+
+Expected<JITEvaluatedSymbol> ExecutionSession::lookup(const JITDylibList &JDs,
+                                                      StringRef Name,
+                                                      bool MatchNonExported) {
+  return lookup(JDs, intern(Name), MatchNonExported);
+}
+
 void ExecutionSession::dump(raw_ostream &OS) {
   runSessionLocked([this, &OS]() {
     for (auto &JD : JDs)
@@ -1910,28 +1946,6 @@ void ExecutionSession::runOutstandingMUs() {
   }
 }
 
-Expected<SymbolMap> lookup(const JITDylibList &JDs, SymbolNameSet Names) {
-
-  if (JDs.empty())
-    return SymbolMap();
-
-  auto &ES = (*JDs.begin())->getExecutionSession();
-
-  return ES.lookup(JDs, Names, NoDependenciesToRegister, true);
-}
-
-/// Look up a symbol by searching a list of JDs.
-Expected<JITEvaluatedSymbol> lookup(const JITDylibList &JDs,
-                                    SymbolStringPtr Name) {
-  SymbolNameSet Names({Name});
-  if (auto ResultMap = lookup(JDs, std::move(Names))) {
-    assert(ResultMap->size() == 1 && "Unexpected number of results");
-    assert(ResultMap->count(Name) && "Missing result for symbol");
-    return std::move(ResultMap->begin()->second);
-  } else
-    return ResultMap.takeError();
-}
-
 MangleAndInterner::MangleAndInterner(ExecutionSession &ES, const DataLayout &DL)
     : ES(ES), DL(DL) {}
 
diff --git a/lib/ExecutionEngine/Orc/ExecutionUtils.cpp b/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
index 47cb273ee12..6a180106240 100644
--- a/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
+++ b/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
@@ -128,7 +128,10 @@ Error CtorDtorRunner2::run() {
     }
   }
 
-  if (auto CtorDtorMap = lookup({&JD}, std::move(Names))) {
+  auto &ES = JD.getExecutionSession();
+  if (auto CtorDtorMap =
+          ES.lookup({&JD}, std::move(Names), NoDependenciesToRegister, true,
+                    nullptr, true)) {
     for (auto &KV : CtorDtorsByPriority) {
       for (auto &Name : KV.second) {
         assert(CtorDtorMap->count(Name) && "No entry for Name");
diff --git a/lib/ExecutionEngine/Orc/IndirectionUtils.cpp b/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
index d7fd57b6e53..6bc33c90cbc 100644
--- a/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
+++ b/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
@@ -99,9 +99,10 @@ JITTargetAddress JITCompileCallbackManager::executeCompileCallback(
       Name = I->second;
   }
 
-  if (auto Sym = lookup({&CallbacksJD}, Name))
+  if (auto Sym = ES.lookup({&CallbacksJD}, Name, true))
     return Sym->getAddress();
   else {
+    llvm::dbgs() << "Didn't find callback.\n";
     // If anything goes wrong materializing Sym then report it to the session
     // and return the ErrorHandlerAddress;
     ES.reportError(Sym.takeError());
diff --git a/lib/ExecutionEngine/Orc/LLJIT.cpp b/lib/ExecutionEngine/Orc/LLJIT.cpp
index 47baa45a8aa..39bb4c48067 100644
--- a/lib/ExecutionEngine/Orc/LLJIT.cpp
+++ b/lib/ExecutionEngine/Orc/LLJIT.cpp
@@ -78,7 +78,7 @@ Error LLJIT::addObjectFile(JITDylib &JD, std::unique_ptr<MemoryBuffer> Obj) {
 
 Expected<JITEvaluatedSymbol> LLJIT::lookupLinkerMangled(JITDylib &JD,
                                                         StringRef Name) {
-  return llvm::orc::lookup({&JD}, ES->intern(Name));
+  return ES->lookup({&JD}, ES->intern(Name));
 }
 
 LLJIT::LLJIT(std::unique_ptr<ExecutionSession> ES,
diff --git a/lib/ExecutionEngine/Orc/LazyReexports.cpp b/lib/ExecutionEngine/Orc/LazyReexports.cpp
index 0d8049178b5..1cce0c6cd2c 100644
--- a/lib/ExecutionEngine/Orc/LazyReexports.cpp
+++ b/lib/ExecutionEngine/Orc/LazyReexports.cpp
@@ -52,8 +52,8 @@ LazyCallThroughManager::callThroughToSymbol(JITTargetAddress TrampolineAddr) {
     SymbolName = I->second.second;
   }
 
-  auto LookupResult =
-      ES.lookup({SourceJD}, {SymbolName}, NoDependenciesToRegister);
+  auto LookupResult = ES.lookup({SourceJD}, {SymbolName},
+                                NoDependenciesToRegister, true, nullptr, true);
 
   if (!LookupResult) {
     ES.reportError(LookupResult.takeError());
diff --git a/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp b/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp
index a2c4a2f2081..e84295ca215 100644
--- a/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp
+++ b/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp
@@ -52,7 +52,7 @@ public:
 
     MR.getTargetJITDylib().withSearchOrderDo([&](const JITDylibList &JDs) {
       ES.lookup(JDs, InternedSymbols, OnResolvedWithUnwrap, OnReady,
-                RegisterDependencies);
+                RegisterDependencies, &MR.getTargetJITDylib());
     });
   }
 
diff --git a/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp b/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp
index cd742187ffb..c8fa6ef5297 100644
--- a/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp
+++ b/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp
@@ -220,6 +220,24 @@ TEST_F(CoreAPIsStandardTest, ChainedJITDylibLookup) {
   EXPECT_TRUE(OnReadyRun) << "OnReady was not run for empty query";
 }
 
+TEST_F(CoreAPIsStandardTest, LookupWithHiddenSymbols) {
+  auto BarHiddenFlags = BarSym.getFlags() & ~JITSymbolFlags::Exported;
+  auto BarHiddenSym = JITEvaluatedSymbol(BarSym.getAddress(), BarHiddenFlags);
+
+  cantFail(JD.define(absoluteSymbols({{Foo, FooSym}, {Bar, BarHiddenSym}})));
+
+  auto &JD2 = ES.createJITDylib("JD2");
+  cantFail(JD2.define(absoluteSymbols({{Bar, QuxSym}})));
+
+  auto Result = cantFail(ES.lookup({&JD, &JD2}, {Foo, Bar}));
+
+  EXPECT_EQ(Result.size(), 2U) << "Unexpected number of results";
+  EXPECT_EQ(Result.count(Foo), 1U) << "Missing result for \"Foo\"";
+  EXPECT_EQ(Result.count(Bar), 1U) << "Missing result for \"Bar\"";
+  EXPECT_EQ(Result[Bar].getAddress(), QuxSym.getAddress())
+      << "Wrong result for \"Bar\"";
+}
+
 TEST_F(CoreAPIsStandardTest, LookupFlagsTest) {
   // Test that lookupFlags works on a predefined symbol, and does not trigger
   // materialization of a lazy symbol. Make the lazy symbol weak to test that
@@ -257,7 +275,7 @@ TEST_F(CoreAPIsStandardTest, TestBasicAliases) {
                                     {Qux, {Bar, JITSymbolFlags::Weak}}})));
   cantFail(JD.define(absoluteSymbols({{Qux, QuxSym}})));
 
-  auto Result = lookup({&JD}, {Baz, Qux});
+  auto Result = ES.lookup({&JD}, {Baz, Qux});
   EXPECT_TRUE(!!Result) << "Unexpected lookup failure";
   EXPECT_EQ(Result->count(Baz), 1U) << "No result for \"baz\"";
   EXPECT_EQ(Result->count(Qux), 1U) << "No result for \"qux\"";
@@ -272,7 +290,7 @@ TEST_F(CoreAPIsStandardTest, TestChainedAliases) {
   cantFail(JD.define(symbolAliases(
       {{Baz, {Bar, BazSym.getFlags()}}, {Bar, {Foo, BarSym.getFlags()}}})));
 
-  auto Result = lookup({&JD}, {Bar, Baz});
+  auto Result = ES.lookup({&JD}, {Bar, Baz});
   EXPECT_TRUE(!!Result) << "Unexpected lookup failure";
   EXPECT_EQ(Result->count(Bar), 1U) << "No result for \"bar\"";
   EXPECT_EQ(Result->count(Baz), 1U) << "No result for \"baz\"";
@@ -291,7 +309,7 @@ TEST_F(CoreAPIsStandardTest, TestBasicReExports) {
 
   cantFail(JD2.define(reexports(JD, {{Bar, {Foo, BarSym.getFlags()}}})));
 
-  auto Result = cantFail(lookup({&JD2}, Bar));
+  auto Result = cantFail(ES.lookup({&JD2}, Bar));
   EXPECT_EQ(Result.getAddress(), FooSym.getAddress())
       << "Re-export Bar for symbol Foo should match FooSym's address";
 }
@@ -317,7 +335,7 @@ TEST_F(CoreAPIsStandardTest, TestThatReExportsDontUnnecessarilyMaterialize) {
   cantFail(JD2.define(reexports(
       JD, {{Baz, {Foo, BazSym.getFlags()}}, {Qux, {Bar, QuxSym.getFlags()}}})));
 
-  auto Result = cantFail(lookup({&JD2}, Baz));
+  auto Result = cantFail(ES.lookup({&JD2}, Baz));
   EXPECT_EQ(Result.getAddress(), FooSym.getAddress())
       << "Re-export Baz for symbol Foo should match FooSym's address";
 
@@ -340,7 +358,7 @@ TEST_F(CoreAPIsStandardTest, TestReexportsFallbackGenerator) {
   EXPECT_EQ(Flags.size(), 1U) << "Unexpected number of results";
   EXPECT_EQ(Flags[Foo], FooSym.getFlags()) << "Unexpected flags for Foo";
 
-  auto Result = cantFail(lookup({&JD}, Foo));
+  auto Result = cantFail(ES.lookup({&JD}, Foo));
 
   EXPECT_EQ(Result.getAddress(), FooSym.getAddress())
       << "Incorrect reexported symbol address";
@@ -650,13 +668,13 @@ TEST_F(CoreAPIsStandardTest, DefineMaterializingSymbol) {
       });
 
   cantFail(JD.define(MU));
-  cantFail(lookup({&JD}, Foo));
+  cantFail(ES.lookup({&JD}, Foo));
 
   // Assert that materialization is complete by now.
   ExpectNoMoreMaterialization = true;
 
   // Look up bar to verify that no further materialization happens.
-  auto BarResult = cantFail(lookup({&JD}, Bar));
+  auto BarResult = cantFail(ES.lookup({&JD}, Bar));
   EXPECT_EQ(BarResult.getAddress(), BarSym.getAddress())
       << "Expected Bar == BarSym";
 }
@@ -670,7 +688,7 @@ TEST_F(CoreAPIsStandardTest, FallbackDefinitionGeneratorTest) {
         return SymbolNameSet({Bar});
       });
 
-  auto Result = cantFail(lookup({&JD}, {Foo, Bar}));
+  auto Result = cantFail(ES.lookup({&JD}, {Foo, Bar}));
 
   EXPECT_EQ(Result.count(Bar), 1U) << "Expected to find fallback def for 'bar'";
   EXPECT_EQ(Result[Bar].getAddress(), BarSym.getAddress())
@@ -679,14 +697,14 @@ TEST_F(CoreAPIsStandardTest, FallbackDefinitionGeneratorTest) {
 
 TEST_F(CoreAPIsStandardTest, FailResolution) {
   auto MU = llvm::make_unique<SimpleMaterializationUnit>(
-      SymbolFlagsMap(
-          {{Foo, JITSymbolFlags::Weak}, {Bar, JITSymbolFlags::Weak}}),
+      SymbolFlagsMap({{Foo, JITSymbolFlags::Exported | JITSymbolFlags::Weak},
+                      {Bar, JITSymbolFlags::Exported | JITSymbolFlags::Weak}}),
       [&](MaterializationResponsibility R) { R.failMaterialization(); });
 
   cantFail(JD.define(MU));
 
   SymbolNameSet Names({Foo, Bar});
-  auto Result = lookup({&JD}, Names);
+  auto Result = ES.lookup({&JD}, Names);
 
   EXPECT_FALSE(!!Result) << "Expected failure";
   if (!Result) {
@@ -718,7 +736,7 @@ TEST_F(CoreAPIsStandardTest, TestLookupWithUnthreadedMaterialization) {
 
   cantFail(JD.define(MU));
 
-  auto FooLookupResult = cantFail(lookup({&JD}, Foo));
+  auto FooLookupResult = cantFail(ES.lookup({&JD}, Foo));
 
   EXPECT_EQ(FooLookupResult.getAddress(), FooSym.getAddress())
       << "lookup returned an incorrect address";
@@ -739,7 +757,7 @@ TEST_F(CoreAPIsStandardTest, TestLookupWithThreadedMaterialization) {
 
   cantFail(JD.define(absoluteSymbols({{Foo, FooSym}})));
 
-  auto FooLookupResult = cantFail(lookup({&JD}, Foo));
+  auto FooLookupResult = cantFail(ES.lookup({&JD}, Foo));
 
   EXPECT_EQ(FooLookupResult.getAddress(), FooSym.getAddress())
       << "lookup returned an incorrect address";
@@ -787,14 +805,14 @@ TEST_F(CoreAPIsStandardTest, TestGetRequestedSymbolsAndReplace) {
   EXPECT_FALSE(FooMaterialized) << "Foo should not be materialized yet";
   EXPECT_FALSE(BarMaterialized) << "Bar should not be materialized yet";
 
-  auto FooSymResult = cantFail(lookup({&JD}, Foo));
+  auto FooSymResult = cantFail(ES.lookup({&JD}, Foo));
   EXPECT_EQ(FooSymResult.getAddress(), FooSym.getAddress())
       << "Address mismatch for Foo";
 
   EXPECT_TRUE(FooMaterialized) << "Foo should be materialized now";
   EXPECT_FALSE(BarMaterialized) << "Bar still should not be materialized";
 
-  auto BarSymResult = cantFail(lookup({&JD}, Bar));
+  auto BarSymResult = cantFail(ES.lookup({&JD}, Bar));
   EXPECT_EQ(BarSymResult.getAddress(), BarSym.getAddress())
       << "Address mismatch for Bar";
   EXPECT_TRUE(BarMaterialized) << "Bar should be materialized now";
@@ -814,7 +832,7 @@ TEST_F(CoreAPIsStandardTest, TestMaterializationResponsibilityDelegation) {
 
   cantFail(JD.define(MU));
 
-  auto Result = lookup({&JD}, {Foo, Bar});
+  auto Result = ES.lookup({&JD}, {Foo, Bar});
 
   EXPECT_TRUE(!!Result) << "Result should be a success value";
   EXPECT_EQ(Result->count(Foo), 1U) << "\"Foo\" entry missing";
@@ -865,14 +883,4 @@ TEST_F(CoreAPIsStandardTest, TestMaterializeWeakSymbol) {
   FooResponsibility->emit();
 }
 
-TEST_F(CoreAPIsStandardTest, TestMainJITDylibAndDefaultLookupOrder) {
-  cantFail(ES.getMainJITDylib().define(absoluteSymbols({{Foo, FooSym}})));
-  auto Results = cantFail(ES.lookup({Foo}));
-
-  EXPECT_EQ(Results.size(), 1U) << "Incorrect number of results";
-  EXPECT_EQ(Results.count(Foo), 1U) << "Expected result for 'Foo'";
-  EXPECT_EQ(Results[Foo].getAddress(), FooSym.getAddress())
-      << "Expected result address to match Foo's address";
-}
-
 } // namespace
-- 
GitLab


From 3898e47d1e7f74ec12eee0cc8529b6abc9d27a71 Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <benny.kra@googlemail.com>
Date: Sat, 13 Oct 2018 22:18:22 +0000
Subject: [PATCH 0161/1116] Move some helpers from the global namespace into
 anonymous ones.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344468 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Demangle/MicrosoftDemangle.cpp                    | 11 ++++++-----
 lib/Target/Mips/MipsCallLowering.cpp                  |  8 ++++----
 lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp   |  2 +-
 lib/Target/X86/X86CondBrFolding.cpp                   |  2 ++
 .../Instrumentation/ControlHeightReduction.cpp        |  7 ++++---
 5 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/lib/Demangle/MicrosoftDemangle.cpp b/lib/Demangle/MicrosoftDemangle.cpp
index 9f60eb22cc4..59fb7c9ae9f 100644
--- a/lib/Demangle/MicrosoftDemangle.cpp
+++ b/lib/Demangle/MicrosoftDemangle.cpp
@@ -652,7 +652,7 @@ Demangler::demangleLiteralOperatorIdentifier(StringView &MangledName) {
   return N;
 }
 
-IntrinsicFunctionKind
+static IntrinsicFunctionKind
 translateIntrinsicFunctionCode(char CH, FunctionIdentifierCodeGroup Group) {
   // Not all ? identifiers are intrinsics *functions*.  This function only maps
   // operator codes for the special functions, all others are handled elsewhere,
@@ -1220,7 +1220,7 @@ static void outputEscapedChar(OutputStream &OS, unsigned C) {
   outputHex(OS, C);
 }
 
-unsigned countTrailingNullBytes(const uint8_t *StringBytes, int Length) {
+static unsigned countTrailingNullBytes(const uint8_t *StringBytes, int Length) {
   const uint8_t *End = StringBytes + Length - 1;
   unsigned Count = 0;
   while (Length > 0 && *End == 0) {
@@ -1231,7 +1231,8 @@ unsigned countTrailingNullBytes(const uint8_t *StringBytes, int Length) {
   return Count;
 }
 
-unsigned countEmbeddedNulls(const uint8_t *StringBytes, unsigned Length) {
+static unsigned countEmbeddedNulls(const uint8_t *StringBytes,
+                                   unsigned Length) {
   unsigned Result = 0;
   for (unsigned I = 0; I < Length; ++I) {
     if (*StringBytes++ == 0)
@@ -1240,8 +1241,8 @@ unsigned countEmbeddedNulls(const uint8_t *StringBytes, unsigned Length) {
   return Result;
 }
 
-unsigned guessCharByteSize(const uint8_t *StringBytes, unsigned NumChars,
-                           unsigned NumBytes) {
+static unsigned guessCharByteSize(const uint8_t *StringBytes, unsigned NumChars,
+                                  unsigned NumBytes) {
   assert(NumBytes > 0);
 
   // If the number of bytes is odd, this is guaranteed to be a char string.
diff --git a/lib/Target/Mips/MipsCallLowering.cpp b/lib/Target/Mips/MipsCallLowering.cpp
index 8babdbf902a..4d070f9f523 100644
--- a/lib/Target/Mips/MipsCallLowering.cpp
+++ b/lib/Target/Mips/MipsCallLowering.cpp
@@ -298,8 +298,8 @@ static bool isSupportedType(Type *T) {
   return false;
 }
 
-CCValAssign::LocInfo determineLocInfo(const MVT RegisterVT, const EVT VT,
-                                      const ISD::ArgFlagsTy &Flags) {
+static CCValAssign::LocInfo determineLocInfo(const MVT RegisterVT, const EVT VT,
+                                             const ISD::ArgFlagsTy &Flags) {
   // > does not mean loss of information as type RegisterVT can't hold type VT,
   // it means that type VT is split into multiple registers of type RegisterVT
   if (VT.getSizeInBits() >= RegisterVT.getSizeInBits())
@@ -312,8 +312,8 @@ CCValAssign::LocInfo determineLocInfo(const MVT RegisterVT, const EVT VT,
 }
 
 template <typename T>
-void setLocInfo(SmallVectorImpl<CCValAssign> &ArgLocs,
-                const SmallVectorImpl<T> &Arguments) {
+static void setLocInfo(SmallVectorImpl<CCValAssign> &ArgLocs,
+                       const SmallVectorImpl<T> &Arguments) {
   for (unsigned i = 0; i < ArgLocs.size(); ++i) {
     const CCValAssign &VA = ArgLocs[i];
     CCValAssign::LocInfo LocInfo = determineLocInfo(
diff --git a/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp b/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp
index 936b801a9a0..98953f09482 100644
--- a/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp
@@ -59,7 +59,7 @@ FunctionPass *llvm::createWebAssemblyLateEHPrepare() {
 // possible search paths should be the same.
 // Returns nullptr in case it does not find any EH pad in the search, or finds
 // multiple different EH pads.
-MachineBasicBlock *GetMatchingEHPad(MachineInstr *MI) {
+static MachineBasicBlock *GetMatchingEHPad(MachineInstr *MI) {
   MachineFunction *MF = MI->getParent()->getParent();
   SmallVector<MachineBasicBlock *, 2> WL;
   SmallPtrSet<MachineBasicBlock *, 2> Visited;
diff --git a/lib/Target/X86/X86CondBrFolding.cpp b/lib/Target/X86/X86CondBrFolding.cpp
index 8b9ef20d916..1d221930c2a 100644
--- a/lib/Target/X86/X86CondBrFolding.cpp
+++ b/lib/Target/X86/X86CondBrFolding.cpp
@@ -84,6 +84,7 @@ FunctionPass *llvm::createX86CondBrFolding() {
   return new X86CondBrFoldingPass();
 }
 
+namespace {
 // A class the stores the auxiliary information for each MBB.
 struct TargetMBBInfo {
   MachineBasicBlock *TBB;
@@ -129,6 +130,7 @@ private:
     return MBBInfos[MBB->getNumber()].get();
   }
 };
+} // namespace
 
 // Find a valid path that we can reuse the CondCode.
 // The resulted path (if return true) is stored in BranchPath.
diff --git a/lib/Transforms/Instrumentation/ControlHeightReduction.cpp b/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
index 2c0721f7366..8f4159d3d19 100644
--- a/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
+++ b/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
@@ -621,9 +621,10 @@ static BranchProbability getCHRBiasThreshold() {
 // CHRBiasThreshold, put Key into TrueSet and return true. If FalseProb >=
 // CHRBiasThreshold, put Key into FalseSet and return true. Otherwise, return
 // false.
-template<typename K, typename S, typename M>
-bool checkBias(K *Key, BranchProbability TrueProb, BranchProbability FalseProb,
-               S &TrueSet, S &FalseSet, M &BiasMap) {
+template <typename K, typename S, typename M>
+static bool checkBias(K *Key, BranchProbability TrueProb,
+                      BranchProbability FalseProb, S &TrueSet, S &FalseSet,
+                      M &BiasMap) {
   BranchProbability Threshold = getCHRBiasThreshold();
   if (TrueProb >= Threshold) {
     TrueSet.insert(Key);
-- 
GitLab


From 1ccfde68b896d5c3f03c7fff42113c8db425e92d Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Sun, 14 Oct 2018 03:36:27 +0000
Subject: [PATCH 0162/1116] [X86] Type legalize v2f32 stores by widening to
 v4f32, casting to v2f64, extracting f64 and storing.

Summary: This is similar to what D52528 did for loads. It should match what generic type legalization does in 64-bit mode where it uses a v2i64 cast and an i64 store.

Reviewers: RKSimon, spatel

Reviewed By: RKSimon

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D53173

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344470 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp           | 47 ++++++++++++++------
 test/CodeGen/X86/2011-10-19-widen_vselect.ll | 16 +++----
 test/CodeGen/X86/sse-schedule.ll             | 44 +++++++-----------
 test/CodeGen/X86/vec_fptrunc.ll              |  6 +--
 test/CodeGen/X86/widen_conv-3.ll             | 31 ++++---------
 5 files changed, 67 insertions(+), 77 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 1abe642a830..7d8fb392b07 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -902,8 +902,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
 
     // We want to legalize this to an f64 load rather than an i64 load on
-    // 64-bit targets and two 32-bit loads on a 32-bit target.
+    // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
+    // store.
     setOperationAction(ISD::LOAD,               MVT::v2f32, Custom);
+    setOperationAction(ISD::STORE,              MVT::v2f32, Custom);
 
     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
@@ -19943,18 +19945,36 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
   SDValue StoredVal = St->getValue();
 
   // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
-  assert(StoredVal.getValueType().isVector() &&
-         StoredVal.getValueType().getVectorElementType() == MVT::i1 &&
-         StoredVal.getValueType().getVectorNumElements() <= 8 &&
-         "Unexpected VT");
-  assert(!St->isTruncatingStore() && "Expected non-truncating store");
-  assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
-         "Expected AVX512F without AVX512DQI");
-
-  StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
-                          DAG.getUNDEF(MVT::v8i1), StoredVal,
+  if (StoredVal.getValueType().isVector() &&
+            StoredVal.getValueType().getVectorElementType() == MVT::i1) {
+    assert(StoredVal.getValueType().getVectorNumElements() <= 8 &&
+           "Unexpected VT");
+    assert(!St->isTruncatingStore() && "Expected non-truncating store");
+    assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
+           "Expected AVX512F without AVX512DQI");
+
+    StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
+                            DAG.getUNDEF(MVT::v8i1), StoredVal,
+                            DAG.getIntPtrConstant(0, dl));
+    StoredVal = DAG.getBitcast(MVT::i8, StoredVal);
+
+    return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
+                        St->getPointerInfo(), St->getAlignment(),
+                        St->getMemOperand()->getFlags());
+  }
+
+  if (St->isTruncatingStore())
+    return SDValue();
+
+  assert(StoredVal.getValueType() == MVT::v2f32 && "Unexpected VT");
+
+  // Widen the vector, cast to a v2x64 type, extract the single 64-bit
+  // element and store it.
+  StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, StoredVal,
+                          DAG.getUNDEF(MVT::v2f32));
+  StoredVal = DAG.getBitcast(MVT::v2f64, StoredVal);
+  StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, StoredVal,
                           DAG.getIntPtrConstant(0, dl));
-  StoredVal = DAG.getBitcast(MVT::i8, StoredVal);
 
   return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
                       St->getPointerInfo(), St->getAlignment(),
@@ -36912,7 +36932,8 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
     // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
     // pair instead.
     if (Subtarget.is64Bit() || F64IsLegal) {
-      MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
+      MVT LdVT = (Subtarget.is64Bit() &&
+                  (!VT.isFloatingPoint() || !F64IsLegal)) ? MVT::i64 : MVT::f64;
       SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
                                   Ld->getMemOperand());
 
diff --git a/test/CodeGen/X86/2011-10-19-widen_vselect.ll b/test/CodeGen/X86/2011-10-19-widen_vselect.ll
index a84a85e2ecd..d09abf5fbb1 100644
--- a/test/CodeGen/X86/2011-10-19-widen_vselect.ll
+++ b/test/CodeGen/X86/2011-10-19-widen_vselect.ll
@@ -8,8 +8,7 @@
 define void @simple_widen(<2 x float> %a, <2 x float> %b) {
 ; X32-LABEL: simple_widen:
 ; X32:       # %bb.0: # %entry
-; X32-NEXT:    extractps $1, %xmm1, (%eax)
-; X32-NEXT:    movss %xmm1, (%eax)
+; X32-NEXT:    movlps %xmm1, (%eax)
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: simple_widen:
@@ -28,8 +27,7 @@ define void @complex_inreg_work(<2 x float> %a, <2 x float> %b) {
 ; X32-NEXT:    movaps %xmm0, %xmm2
 ; X32-NEXT:    cmpordps %xmm0, %xmm0
 ; X32-NEXT:    blendvps %xmm0, %xmm2, %xmm1
-; X32-NEXT:    extractps $1, %xmm1, (%eax)
-; X32-NEXT:    movss %xmm1, (%eax)
+; X32-NEXT:    movlps %xmm1, (%eax)
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: complex_inreg_work:
@@ -50,8 +48,7 @@ define void @zero_test() {
 ; X32-LABEL: zero_test:
 ; X32:       # %bb.0: # %entry
 ; X32-NEXT:    xorps %xmm0, %xmm0
-; X32-NEXT:    extractps $1, %xmm0, (%eax)
-; X32-NEXT:    movss %xmm0, (%eax)
+; X32-NEXT:    movlps %xmm0, (%eax)
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: zero_test:
@@ -82,11 +79,8 @@ define void @full_test() {
 ; X32-NEXT:    cmpeqps %xmm2, %xmm1
 ; X32-NEXT:    movaps %xmm1, %xmm0
 ; X32-NEXT:    blendvps %xmm0, %xmm2, %xmm4
-; X32-NEXT:    movss %xmm4, {{[0-9]+}}(%esp)
-; X32-NEXT:    movshdup {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; X32-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
-; X32-NEXT:    movss %xmm4, {{[0-9]+}}(%esp)
-; X32-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movlps %xmm4, {{[0-9]+}}(%esp)
+; X32-NEXT:    movlps %xmm4, {{[0-9]+}}(%esp)
 ; X32-NEXT:    addl $60, %esp
 ; X32-NEXT:    .cfi_def_cfa_offset 4
 ; X32-NEXT:    retl
diff --git a/test/CodeGen/X86/sse-schedule.ll b/test/CodeGen/X86/sse-schedule.ll
index 662061d8c88..cd1fdfbc6aa 100644
--- a/test/CodeGen/X86/sse-schedule.ll
+++ b/test/CodeGen/X86/sse-schedule.ll
@@ -2712,8 +2712,7 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2)
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
 ; GENERIC-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
-; GENERIC-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:1.00]
-; GENERIC-NEXT:    movlps %xmm0, (%rdi) # sched: [1:1.00]
+; GENERIC-NEXT:    movhps %xmm0, (%rdi) # sched: [1:1.00]
 ; GENERIC-NEXT:    movaps %xmm1, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -2723,16 +2722,14 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2)
 ; ATOM-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [1:1.00]
 ; ATOM-NEXT:    addps %xmm1, %xmm2 # sched: [5:5.00]
 ; ATOM-NEXT:    movaps %xmm1, %xmm0 # sched: [1:0.50]
-; ATOM-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1] sched: [1:1.00]
-; ATOM-NEXT:    movlps %xmm2, (%rdi) # sched: [1:1.00]
+; ATOM-NEXT:    movhps %xmm2, (%rdi) # sched: [1:1.00]
 ; ATOM-NEXT:    retq # sched: [79:39.50]
 ;
 ; SLM-LABEL: test_movhps:
 ; SLM:       # %bb.0:
 ; SLM-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [4:1.00]
 ; SLM-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
-; SLM-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:1.00]
-; SLM-NEXT:    movlps %xmm0, (%rdi) # sched: [1:1.00]
+; SLM-NEXT:    movhps %xmm0, (%rdi) # sched: [1:1.00]
 ; SLM-NEXT:    movaps %xmm1, %xmm0 # sched: [1:0.50]
 ; SLM-NEXT:    retq # sched: [4:1.00]
 ;
@@ -2740,8 +2737,7 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2)
 ; SANDY-SSE:       # %bb.0:
 ; SANDY-SSE-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
 ; SANDY-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:1.00]
-; SANDY-SSE-NEXT:    movlps %xmm0, (%rdi) # sched: [1:1.00]
+; SANDY-SSE-NEXT:    movhps %xmm0, (%rdi) # sched: [1:1.00]
 ; SANDY-SSE-NEXT:    movaps %xmm1, %xmm0 # sched: [1:1.00]
 ; SANDY-SSE-NEXT:    retq # sched: [1:1.00]
 ;
@@ -2749,7 +2745,7 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2)
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
 ; SANDY-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vpextrq $1, %xmm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT:    vmovhpd %xmm0, (%rdi) # sched: [1:1.00]
 ; SANDY-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
@@ -2757,8 +2753,7 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2)
 ; HASWELL-SSE:       # %bb.0:
 ; HASWELL-SSE-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
 ; HASWELL-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:1.00]
-; HASWELL-SSE-NEXT:    movlps %xmm0, (%rdi) # sched: [1:1.00]
+; HASWELL-SSE-NEXT:    movhps %xmm0, (%rdi) # sched: [1:1.00]
 ; HASWELL-SSE-NEXT:    movaps %xmm1, %xmm0 # sched: [1:1.00]
 ; HASWELL-SSE-NEXT:    retq # sched: [7:1.00]
 ;
@@ -2766,7 +2761,7 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2)
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
 ; HASWELL-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vpextrq $1, %xmm0, (%rdi) # sched: [2:1.00]
+; HASWELL-NEXT:    vmovhpd %xmm0, (%rdi) # sched: [1:1.00]
 ; HASWELL-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
@@ -2774,8 +2769,7 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2)
 ; BROADWELL-SSE:       # %bb.0:
 ; BROADWELL-SSE-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
 ; BROADWELL-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
-; BROADWELL-SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:1.00]
-; BROADWELL-SSE-NEXT:    movlps %xmm0, (%rdi) # sched: [1:1.00]
+; BROADWELL-SSE-NEXT:    movhps %xmm0, (%rdi) # sched: [1:1.00]
 ; BROADWELL-SSE-NEXT:    movaps %xmm1, %xmm0 # sched: [1:1.00]
 ; BROADWELL-SSE-NEXT:    retq # sched: [7:1.00]
 ;
@@ -2783,7 +2777,7 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2)
 ; BROADWELL:       # %bb.0:
 ; BROADWELL-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
 ; BROADWELL-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BROADWELL-NEXT:    vpextrq $1, %xmm0, (%rdi) # sched: [2:1.00]
+; BROADWELL-NEXT:    vmovhpd %xmm0, (%rdi) # sched: [1:1.00]
 ; BROADWELL-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
@@ -2791,8 +2785,7 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2)
 ; SKYLAKE-SSE:       # %bb.0:
 ; SKYLAKE-SSE-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
 ; SKYLAKE-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [4:0.50]
-; SKYLAKE-SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:1.00]
-; SKYLAKE-SSE-NEXT:    movlps %xmm0, (%rdi) # sched: [1:1.00]
+; SKYLAKE-SSE-NEXT:    movhps %xmm0, (%rdi) # sched: [1:1.00]
 ; SKYLAKE-SSE-NEXT:    movaps %xmm1, %xmm0 # sched: [1:0.33]
 ; SKYLAKE-SSE-NEXT:    retq # sched: [7:1.00]
 ;
@@ -2800,7 +2793,7 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2)
 ; SKYLAKE:       # %bb.0:
 ; SKYLAKE-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
 ; SKYLAKE-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKYLAKE-NEXT:    vpextrq $1, %xmm0, (%rdi) # sched: [2:1.00]
+; SKYLAKE-NEXT:    vmovhpd %xmm0, (%rdi) # sched: [1:1.00]
 ; SKYLAKE-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
@@ -2808,8 +2801,7 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2)
 ; SKX-SSE:       # %bb.0:
 ; SKX-SSE-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
 ; SKX-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:1.00]
-; SKX-SSE-NEXT:    movlps %xmm0, (%rdi) # sched: [1:1.00]
+; SKX-SSE-NEXT:    movhps %xmm0, (%rdi) # sched: [1:1.00]
 ; SKX-SSE-NEXT:    movaps %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-SSE-NEXT:    retq # sched: [7:1.00]
 ;
@@ -2817,7 +2809,7 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2)
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
 ; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vpextrq $1, %xmm0, (%rdi) # sched: [2:1.00]
+; SKX-NEXT:    vmovhpd %xmm0, (%rdi) # sched: [1:1.00]
 ; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
@@ -2825,8 +2817,7 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2)
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
 ; BTVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
-; BTVER2-SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:0.50]
-; BTVER2-SSE-NEXT:    movlps %xmm0, (%rdi) # sched: [2:1.00]
+; BTVER2-SSE-NEXT:    movhps %xmm0, (%rdi) # sched: [2:1.00]
 ; BTVER2-SSE-NEXT:    movaps %xmm1, %xmm0 # sched: [1:0.50]
 ; BTVER2-SSE-NEXT:    retq # sched: [4:1.00]
 ;
@@ -2834,7 +2825,7 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2)
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
 ; BTVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BTVER2-NEXT:    vpextrq $1, %xmm0, (%rdi) # sched: [3:1.00]
+; BTVER2-NEXT:    vmovhpd %xmm0, (%rdi) # sched: [2:1.00]
 ; BTVER2-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
@@ -2842,8 +2833,7 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2)
 ; ZNVER1-SSE:       # %bb.0:
 ; ZNVER1-SSE-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [8:0.50]
 ; ZNVER1-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
-; ZNVER1-SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:0.50]
-; ZNVER1-SSE-NEXT:    movlps %xmm0, (%rdi) # sched: [1:0.50]
+; ZNVER1-SSE-NEXT:    movhps %xmm0, (%rdi) # sched: [1:0.50]
 ; ZNVER1-SSE-NEXT:    movaps %xmm1, %xmm0 # sched: [1:0.25]
 ; ZNVER1-SSE-NEXT:    retq # sched: [1:0.50]
 ;
@@ -2851,7 +2841,7 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2)
 ; ZNVER1:       # %bb.0:
 ; ZNVER1-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [8:0.50]
 ; ZNVER1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT:    vpextrq $1, %xmm0, (%rdi) # sched: [5:3.00]
+; ZNVER1-NEXT:    vmovhpd %xmm0, (%rdi) # sched: [1:0.50]
 ; ZNVER1-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.25]
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   %1 = bitcast x86_mmx* %a2 to <2 x float>*
diff --git a/test/CodeGen/X86/vec_fptrunc.ll b/test/CodeGen/X86/vec_fptrunc.ll
index 79abeb0c59f..bb6be6cd9e8 100644
--- a/test/CodeGen/X86/vec_fptrunc.ll
+++ b/test/CodeGen/X86/vec_fptrunc.ll
@@ -10,8 +10,7 @@ define void @fptrunc_frommem2(<2 x double>* %in, <2 x float>* %out) {
 ; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-SSE-NEXT:    cvtpd2ps (%ecx), %xmm0
-; X32-SSE-NEXT:    extractps $1, %xmm0, 4(%eax)
-; X32-SSE-NEXT:    movss %xmm0, (%eax)
+; X32-SSE-NEXT:    movlpd %xmm0, (%eax)
 ; X32-SSE-NEXT:    retl
 ;
 ; X32-AVX-LABEL: fptrunc_frommem2:
@@ -19,8 +18,7 @@ define void @fptrunc_frommem2(<2 x double>* %in, <2 x float>* %out) {
 ; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-AVX-NEXT:    vcvtpd2psx (%ecx), %xmm0
-; X32-AVX-NEXT:    vextractps $1, %xmm0, 4(%eax)
-; X32-AVX-NEXT:    vmovss %xmm0, (%eax)
+; X32-AVX-NEXT:    vmovlpd %xmm0, (%eax)
 ; X32-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: fptrunc_frommem2:
diff --git a/test/CodeGen/X86/widen_conv-3.ll b/test/CodeGen/X86/widen_conv-3.ll
index 1eb76b283c0..038c6cb33b6 100644
--- a/test/CodeGen/X86/widen_conv-3.ll
+++ b/test/CodeGen/X86/widen_conv-3.ll
@@ -7,28 +7,15 @@
 ; sign to float v2i16 to v2f32
 
 define void @convert_v2i16_to_v2f32(<2 x float>* %dst.addr, <2 x i16> %src) nounwind {
-; X86-SSE2-LABEL: convert_v2i16_to_v2f32:
-; X86-SSE2:       # %bb.0: # %entry
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    psllq $48, %xmm0
-; X86-SSE2-NEXT:    psrad $16, %xmm0
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X86-SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
-; X86-SSE2-NEXT:    movss %xmm0, (%eax)
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; X86-SSE2-NEXT:    movss %xmm0, 4(%eax)
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE42-LABEL: convert_v2i16_to_v2f32:
-; X86-SSE42:       # %bb.0: # %entry
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE42-NEXT:    psllq $48, %xmm0
-; X86-SSE42-NEXT:    psrad $16, %xmm0
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X86-SSE42-NEXT:    cvtdq2ps %xmm0, %xmm0
-; X86-SSE42-NEXT:    extractps $1, %xmm0, 4(%eax)
-; X86-SSE42-NEXT:    movss %xmm0, (%eax)
-; X86-SSE42-NEXT:    retl
+; X86-LABEL: convert_v2i16_to_v2f32:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    psllq $48, %xmm0
+; X86-NEXT:    psrad $16, %xmm0
+; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X86-NEXT:    cvtdq2ps %xmm0, %xmm0
+; X86-NEXT:    movlps %xmm0, (%eax)
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: convert_v2i16_to_v2f32:
 ; X64:       # %bb.0: # %entry
-- 
GitLab


From 6c09fbd91a88e899354b804cf1f9d0aa78d75cdf Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Sun, 14 Oct 2018 04:01:40 +0000
Subject: [PATCH 0163/1116] [X86] Fix bad indentation. NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344471 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 7d8fb392b07..441f26dd4c6 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -19946,7 +19946,7 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
 
   // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
   if (StoredVal.getValueType().isVector() &&
-            StoredVal.getValueType().getVectorElementType() == MVT::i1) {
+      StoredVal.getValueType().getVectorElementType() == MVT::i1) {
     assert(StoredVal.getValueType().getVectorNumElements() <= 8 &&
            "Unexpected VT");
     assert(!St->isTruncatingStore() && "Expected non-truncating store");
-- 
GitLab


From a3ff03e8e2bf2c8271d5641cff28d4da0c9bbbbf Mon Sep 17 00:00:00 2001
From: Dorit Nuzman <dorit.nuzman@intel.com>
Date: Sun, 14 Oct 2018 07:06:16 +0000
Subject: [PATCH 0164/1116] [IAI,LV] Add support for vectorizing predicated
 strided accesses using masked interleave-group

The vectorizer currently does not attempt to create interleave-groups that
contain predicated loads/stores; predicated strided accesses can currently be
vectorized only using masked gather/scatter or scalarization. This patch makes
predicated loads/stores candidates for forming interleave-groups during the
Loop-Vectorizer's analysis, and adds the proper support for masked-interleave-
groups to the Loop-Vectorizer's planning and transformation stages. The patch
also extends the TTI API to allow querying the cost of masked interleave groups
(which each target can control); Targets that support masked vector loads/
stores may choose to enable this feature and allow vectorizing predicated
strided loads/stores using masked wide loads/stores and shuffles.

Reviewers: Ayal, hsaito, dcaballe, fhahn, javed.absar

Reviewed By: Ayal

Differential Revision: https://reviews.llvm.org/D53011


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344472 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/TargetTransformInfo.h   |  19 +-
 .../llvm/Analysis/TargetTransformInfoImpl.h   |   6 +-
 include/llvm/Analysis/VectorUtils.h           |  21 +-
 include/llvm/CodeGen/BasicTTIImpl.h           |  38 ++-
 lib/Analysis/TargetTransformInfo.cpp          |  10 +-
 lib/Analysis/VectorUtils.cpp                  |  29 ++-
 .../AArch64/AArch64TargetTransformInfo.cpp    |   7 +-
 .../AArch64/AArch64TargetTransformInfo.h      |   2 +-
 lib/Target/ARM/ARMTargetTransformInfo.cpp     |   8 +-
 lib/Target/ARM/ARMTargetTransformInfo.h       |   2 +-
 .../Hexagon/HexagonTargetTransformInfo.cpp    |   6 +-
 .../Hexagon/HexagonTargetTransformInfo.h      |   2 +-
 lib/Target/PowerPC/PPCTargetTransformInfo.cpp |   7 +-
 lib/Target/PowerPC/PPCTargetTransformInfo.h   |   3 +-
 .../SystemZ/SystemZTargetTransformInfo.cpp    |   6 +-
 .../SystemZ/SystemZTargetTransformInfo.h      |   2 +-
 lib/Target/X86/X86TargetTransformInfo.cpp     |  23 +-
 lib/Target/X86/X86TargetTransformInfo.h       |   9 +-
 lib/Transforms/Vectorize/LoopVectorize.cpp    | 126 ++++++++--
 lib/Transforms/Vectorize/VPRecipeBuilder.h    |   3 +-
 lib/Transforms/Vectorize/VPlan.h              |   8 +-
 .../x86-interleaved-accesses-masked-group.ll  | 164 +++++++++++++
 .../interleaved-accesses-masked-group.ll      | 222 ++++++++++++++++++
 .../interleaved-accesses-pred-stores.ll       |   1 +
 24 files changed, 654 insertions(+), 70 deletions(-)
 create mode 100644 test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
 create mode 100644 test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll

diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h
index 18b5a5cf0e5..c2a9d1ec195 100644
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h
@@ -587,6 +587,10 @@ public:
   /// Enable matching of interleaved access groups.
   bool enableInterleavedAccessVectorization() const;
 
+  /// Enable matching of interleaved access groups that contain predicated 
+  /// accesses and are vectorized using masked vector loads/stores.
+  bool enableMaskedInterleavedAccessVectorization() const;
+
   /// Indicate that it is potentially unsafe to automatically vectorize
   /// floating-point operations because the semantics of vector and scalar
   /// floating-point semantics may differ. For example, ARM NEON v7 SIMD math
@@ -821,9 +825,11 @@ public:
   ///    load allows gaps)
   /// \p Alignment is the alignment of the memory operation
   /// \p AddressSpace is address space of the pointer.
+  /// \p IsMasked indicates if the memory access is predicated.
   int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
                                  ArrayRef<unsigned> Indices, unsigned Alignment,
-                                 unsigned AddressSpace) const;
+                                 unsigned AddressSpace, 
+                                 bool IsMasked = false) const;
 
   /// Calculate the cost of performing a vector reduction.
   ///
@@ -1072,6 +1078,7 @@ public:
   virtual const MemCmpExpansionOptions *enableMemCmpExpansion(
       bool IsZeroCmp) const = 0;
   virtual bool enableInterleavedAccessVectorization() = 0;
+  virtual bool enableMaskedInterleavedAccessVectorization() = 0;
   virtual bool isFPVectorizationPotentiallyUnsafe() = 0;
   virtual bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
                                               unsigned BitWidth,
@@ -1132,7 +1139,8 @@ public:
                                          unsigned Factor,
                                          ArrayRef<unsigned> Indices,
                                          unsigned Alignment,
-                                         unsigned AddressSpace) = 0;
+                                         unsigned AddressSpace,
+                                         bool IsMasked = false) = 0;
   virtual int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
                                          bool IsPairwiseForm) = 0;
   virtual int getMinMaxReductionCost(Type *Ty, Type *CondTy,
@@ -1346,6 +1354,9 @@ public:
   bool enableInterleavedAccessVectorization() override {
     return Impl.enableInterleavedAccessVectorization();
   }
+  bool enableMaskedInterleavedAccessVectorization() override {
+    return Impl.enableMaskedInterleavedAccessVectorization();
+  }
   bool isFPVectorizationPotentiallyUnsafe() override {
     return Impl.isFPVectorizationPotentiallyUnsafe();
   }
@@ -1471,9 +1482,9 @@ public:
   }
   int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
                                  ArrayRef<unsigned> Indices, unsigned Alignment,
-                                 unsigned AddressSpace) override {
+                                 unsigned AddressSpace, bool IsMasked) override {
     return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                           Alignment, AddressSpace);
+                                           Alignment, AddressSpace, IsMasked);
   }
   int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
                                  bool IsPairwiseForm) override {
diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h
index e39fe66c0a4..c64d4d36805 100644
--- a/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -313,6 +313,8 @@ public:
 
   bool enableInterleavedAccessVectorization() { return false; }
 
+  bool enableMaskedInterleavedAccessVectorization() { return false; }
+
   bool isFPVectorizationPotentiallyUnsafe() { return false; }
 
   bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
@@ -450,8 +452,8 @@ public:
   unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                       unsigned Factor,
                                       ArrayRef<unsigned> Indices,
-                                      unsigned Alignment,
-                                      unsigned AddressSpace) {
+                                      unsigned Alignment, unsigned AddressSpace,
+                                      bool IsMasked = false) {
     return 1;
   }
 
diff --git a/include/llvm/Analysis/VectorUtils.h b/include/llvm/Analysis/VectorUtils.h
index 622d932f74f..2ac49f67662 100644
--- a/include/llvm/Analysis/VectorUtils.h
+++ b/include/llvm/Analysis/VectorUtils.h
@@ -125,6 +125,21 @@ computeMinimumValueSizes(ArrayRef<BasicBlock*> Blocks,
 /// This function always sets a (possibly null) value for each K in Kinds.
 Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL);
 
+/// Create a mask with replicated elements.
+///
+/// This function creates a shuffle mask for replicating each of the \p VF 
+/// elements in a vector \p ReplicationFactor times. It can be used to
+/// transform a mask of \p VF elements into a mask of
+/// \p VF * \p ReplicationFactor elements used by a predicated
+/// interleaved-group of loads/stores whose Interleaved-factor ==
+/// \p ReplicationFactor.
+///
+/// For example, the mask for \p ReplicationFactor=3 and \p VF=4 is:
+///
+///   <0,0,0,1,1,1,2,2,2,3,3,3>
+Constant *createReplicatedMask(IRBuilder<> &Builder, unsigned ReplicationFactor,
+                               unsigned VF);
+
 /// Create an interleave shuffle mask.
 ///
 /// This function creates a shuffle mask for interleaving \p NumVecs vectors of
@@ -328,7 +343,7 @@ public:
   InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
                         DominatorTree *DT, LoopInfo *LI,
                         const LoopAccessInfo *LAI)
-    : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(LAI) {}
+      : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(LAI) {}
 
   ~InterleavedAccessInfo() {
     SmallPtrSet<InterleaveGroup *, 4> DelSet;
@@ -341,7 +356,9 @@ public:
 
   /// Analyze the interleaved accesses and collect them in interleave
   /// groups. Substitute symbolic strides using \p Strides.
-  void analyzeInterleaving();
+  /// Consider also predicated loads/stores in the analysis if
+  /// \p EnableMaskedInterleavedGroup is true.
+  void analyzeInterleaving(bool EnableMaskedInterleavedGroup);
 
   /// Check if \p Instr belongs to any interleave group.
   bool isInterleaved(Instruction *Instr) const {
diff --git a/include/llvm/CodeGen/BasicTTIImpl.h b/include/llvm/CodeGen/BasicTTIImpl.h
index b460cdc0ba1..e740fe57172 100644
--- a/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/include/llvm/CodeGen/BasicTTIImpl.h
@@ -783,8 +783,8 @@ public:
   unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                       unsigned Factor,
                                       ArrayRef<unsigned> Indices,
-                                      unsigned Alignment,
-                                      unsigned AddressSpace) {
+                                      unsigned Alignment, unsigned AddressSpace,
+                                      bool IsMasked = false) {
     VectorType *VT = dyn_cast<VectorType>(VecTy);
     assert(VT && "Expect a vector type for interleaved memory op");
 
@@ -795,8 +795,13 @@ public:
     VectorType *SubVT = VectorType::get(VT->getElementType(), NumSubElts);
 
     // Firstly, the cost of load/store operation.
-    unsigned Cost = static_cast<T *>(this)->getMemoryOpCost(
-        Opcode, VecTy, Alignment, AddressSpace);
+    unsigned Cost;
+    if (IsMasked)
+      Cost = static_cast<T *>(this)->getMaskedMemoryOpCost(
+          Opcode, VecTy, Alignment, AddressSpace);
+    else
+      Cost = static_cast<T *>(this)->getMemoryOpCost(Opcode, VecTy, Alignment,
+                                                     AddressSpace);
 
     // Legalize the vector type, and get the legalized and unlegalized type
     // sizes.
@@ -892,6 +897,31 @@ public:
                     ->getVectorInstrCost(Instruction::InsertElement, VT, i);
     }
 
+    if (!IsMasked)
+      return Cost;
+
+    Type *I8Type = Type::getInt8Ty(VT->getContext());
+    VectorType *MaskVT = VectorType::get(I8Type, NumElts);
+    SubVT = VectorType::get(I8Type, NumSubElts);
+
+    // The Mask shuffling cost is extract all the elements of the Mask
+    // and insert each of them Factor times into the wide vector:
+    //
+    // E.g. an interleaved group with factor 3:
+    //    %mask = icmp ult <8 x i32> %vec1, %vec2
+    //    %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
+    //        <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
+    // The cost is estimated as extract all mask elements from the <8xi1> mask
+    // vector and insert them factor times into the <24xi1> shuffled mask
+    // vector.
+    for (unsigned i = 0; i < NumSubElts; i++)
+      Cost += static_cast<T *>(this)->getVectorInstrCost(
+          Instruction::ExtractElement, SubVT, i);
+
+    for (unsigned i = 0; i < NumElts; i++)
+      Cost += static_cast<T *>(this)->getVectorInstrCost(
+          Instruction::InsertElement, MaskVT, i);
+
     return Cost;
   }
 
diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp
index 4ad48e351a4..867403d0ef1 100644
--- a/lib/Analysis/TargetTransformInfo.cpp
+++ b/lib/Analysis/TargetTransformInfo.cpp
@@ -268,6 +268,10 @@ bool TargetTransformInfo::enableInterleavedAccessVectorization() const {
   return TTIImpl->enableInterleavedAccessVectorization();
 }
 
+bool TargetTransformInfo::enableMaskedInterleavedAccessVectorization() const {
+  return TTIImpl->enableMaskedInterleavedAccessVectorization();
+}
+
 bool TargetTransformInfo::isFPVectorizationPotentiallyUnsafe() const {
   return TTIImpl->isFPVectorizationPotentiallyUnsafe();
 }
@@ -515,9 +519,9 @@ int TargetTransformInfo::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
 
 int TargetTransformInfo::getInterleavedMemoryOpCost(
     unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
-    unsigned Alignment, unsigned AddressSpace) const {
-  int Cost = TTIImpl->getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                                 Alignment, AddressSpace);
+    unsigned Alignment, unsigned AddressSpace, bool IsMasked) const {
+  int Cost = TTIImpl->getInterleavedMemoryOpCost(
+      Opcode, VecTy, Factor, Indices, Alignment, AddressSpace, IsMasked);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
diff --git a/lib/Analysis/VectorUtils.cpp b/lib/Analysis/VectorUtils.cpp
index 272c665ace1..e14449b8838 100644
--- a/lib/Analysis/VectorUtils.cpp
+++ b/lib/Analysis/VectorUtils.cpp
@@ -502,6 +502,16 @@ Instruction *llvm::propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
   return Inst;
 }
 
+Constant *llvm::createReplicatedMask(IRBuilder<> &Builder, 
+                                     unsigned ReplicationFactor, unsigned VF) {
+  SmallVector<Constant *, 16> MaskVec;
+  for (unsigned i = 0; i < VF; i++)
+    for (unsigned j = 0; j < ReplicationFactor; j++)
+      MaskVec.push_back(Builder.getInt32(i));
+
+  return ConstantVector::get(MaskVec);
+}
+
 Constant *llvm::createInterleaveMask(IRBuilder<> &Builder, unsigned VF,
                                      unsigned NumVecs) {
   SmallVector<Constant *, 16> Mask;
@@ -672,7 +682,8 @@ void InterleavedAccessInfo::collectConstStrideAccesses(
 // this group because it and (2) are dependent. However, (1) can be grouped
 // with other accesses that may precede it in program order. Note that a
 // bottom-up order does not imply that WAW dependences should not be checked.
-void InterleavedAccessInfo::analyzeInterleaving() {
+void InterleavedAccessInfo::analyzeInterleaving(
+                                 bool EnablePredicatedInterleavedMemAccesses) {
   LLVM_DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n");
   const ValueToValueMap &Strides = LAI->getSymbolicStrides();
 
@@ -712,9 +723,8 @@ void InterleavedAccessInfo::analyzeInterleaving() {
     // create a group for B, we continue with the bottom-up algorithm to ensure
     // we don't break any of B's dependences.
     InterleaveGroup *Group = nullptr;
-    // TODO: Ignore B if it is in a predicated block. This restriction can be 
-    // relaxed in the future once we handle masked interleaved groups.
-    if (isStrided(DesB.Stride) && !isPredicated(B->getParent())) {
+    if (isStrided(DesB.Stride) && 
+        (!isPredicated(B->getParent()) || EnablePredicatedInterleavedMemAccesses)) {
       Group = getInterleaveGroup(B);
       if (!Group) {
         LLVM_DEBUG(dbgs() << "LV: Creating an interleave group with:" << *B
@@ -808,11 +818,12 @@ void InterleavedAccessInfo::analyzeInterleaving() {
       if (DistanceToB % static_cast<int64_t>(DesB.Size))
         continue;
 
-      // Ignore A if either A or B is in a predicated block. Although we
-      // currently prevent group formation for predicated accesses, we may be
-      // able to relax this limitation in the future once we handle more
-      // complicated blocks.
-      if (isPredicated(A->getParent()) || isPredicated(B->getParent()))
+      // All members of a predicated interleave-group must have the same predicate,
+      // and currently must reside in the same BB.
+      BasicBlock *BlockA = A->getParent();  
+      BasicBlock *BlockB = B->getParent();  
+      if ((isPredicated(BlockA) || isPredicated(BlockB)) &&
+          (!EnablePredicatedInterleavedMemAccesses || BlockA != BlockB))
         continue;
 
       // The index of A is the index of B plus A's distance to B in multiples
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 96e751e8697..a16de89cf10 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -659,11 +659,12 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                                unsigned Factor,
                                                ArrayRef<unsigned> Indices,
                                                unsigned Alignment,
-                                               unsigned AddressSpace) {
+                                               unsigned AddressSpace,
+                                               bool IsMasked) {
   assert(Factor >= 2 && "Invalid interleave factor");
   assert(isa<VectorType>(VecTy) && "Expect a vector type");
 
-  if (Factor <= TLI->getMaxSupportedInterleaveFactor()) {
+  if (!IsMasked && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
     unsigned NumElts = VecTy->getVectorNumElements();
     auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
 
@@ -676,7 +677,7 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
   }
 
   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                           Alignment, AddressSpace);
+                                           Alignment, AddressSpace, IsMasked);
 }
 
 int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h
index c056a7d2428..b3893d32850 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -146,7 +146,7 @@ public:
 
   int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
                                  ArrayRef<unsigned> Indices, unsigned Alignment,
-                                 unsigned AddressSpace);
+                                 unsigned AddressSpace, bool IsMasked = false);
 
   bool
   shouldConsiderAddressTypePromotion(const Instruction &I,
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 1b0d162f726..bac3e6c2387 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -542,14 +542,16 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                            unsigned Factor,
                                            ArrayRef<unsigned> Indices,
                                            unsigned Alignment,
-                                           unsigned AddressSpace) {
+                                           unsigned AddressSpace
+                                           bool IsMasked) {
   assert(Factor >= 2 && "Invalid interleave factor");
   assert(isa<VectorType>(VecTy) && "Expect a vector type");
 
   // vldN/vstN doesn't support vector types of i64/f64 element.
   bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
 
-  if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits) {
+  if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
+      !IsMasked) {
     unsigned NumElts = VecTy->getVectorNumElements();
     auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
 
@@ -562,7 +564,7 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
   }
 
   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                           Alignment, AddressSpace);
+                                           Alignment, AddressSpace, IsMasked);
 }
 
 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.h b/lib/Target/ARM/ARMTargetTransformInfo.h
index 7d14bd7c256..84e3055c6bc 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -169,7 +169,7 @@ public:
 
   int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
                                  ArrayRef<unsigned> Indices, unsigned Alignment,
-                                 unsigned AddressSpace);
+                                 unsigned AddressSpace, bool IsMasked);
 
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP);
diff --git a/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
index 4d0e7dc52e8..79b269bccfe 100644
--- a/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -206,10 +206,10 @@ unsigned HexagonTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
 
 unsigned HexagonTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode,
       Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
-      unsigned Alignment, unsigned AddressSpace) {
-  if (Indices.size() != Factor)
+      unsigned Alignment, unsigned AddressSpace, bool IsMasked) {
+  if (Indices.size() != Factor || IsMasked)
     return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                             Alignment, AddressSpace);
+                                             Alignment, AddressSpace, IsMasked);
   return getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, nullptr);
 }
 
diff --git a/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/lib/Target/Hexagon/HexagonTargetTransformInfo.h
index 2c03cd268ff..901a91692e8 100644
--- a/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -123,7 +123,7 @@ public:
             bool VariableMask, unsigned Alignment);
   unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
             unsigned Factor, ArrayRef<unsigned> Indices, unsigned Alignment,
-            unsigned AddressSpace);
+            unsigned AddressSpace, bool IsMasked);
   unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
             const Instruction *I);
   unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index b0da9b5a6d7..2c81661cb17 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -473,7 +473,12 @@ int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                            unsigned Factor,
                                            ArrayRef<unsigned> Indices,
                                            unsigned Alignment,
-                                           unsigned AddressSpace) {
+                                           unsigned AddressSpace,
+                                           Bool IsMasked) {
+  if (IsMasked)
+    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                             Alignment, AddressSpace, IsMasked);
+
   assert(isa<VectorType>(VecTy) &&
          "Expect a vector type for interleaved memory op");
 
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.h b/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 2ee2b3eb808..252d46e7a2a 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -90,7 +90,8 @@ public:
                                  unsigned Factor,
                                  ArrayRef<unsigned> Indices,
                                  unsigned Alignment,
-                                 unsigned AddressSpace);
+                                 unsigned AddressSpace,
+                                 bool IsMasked = false);
 
   /// @}
 };
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 6f553d5bed3..1eaeb9699bf 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -909,7 +909,11 @@ int SystemZTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                                unsigned Factor,
                                                ArrayRef<unsigned> Indices,
                                                unsigned Alignment,
-                                               unsigned AddressSpace) {
+                                               unsigned AddressSpace,
+                                               bool IsMasked) {
+  if (IsMasked)
+    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                             Alignment, AddressSpace, IsMasked);
   assert(isa<VectorType>(VecTy) &&
          "Expect a vector type for interleaved memory op");
 
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index bfa942357c5..92b2b9bdcb8 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -92,7 +92,7 @@ public:
                                  unsigned Factor,
                                  ArrayRef<unsigned> Indices,
                                  unsigned Alignment,
-                                 unsigned AddressSpace);
+                                 unsigned AddressSpace, bool IsMasked = false);
   /// @}
 };
 
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index d3a75123935..82e4dfe25b7 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2723,7 +2723,12 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
                                                unsigned Factor,
                                                ArrayRef<unsigned> Indices,
                                                unsigned Alignment,
-                                               unsigned AddressSpace) {
+                                               unsigned AddressSpace,
+                                               bool IsMasked) {
+
+  if (IsMasked)
+    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                             Alignment, AddressSpace, IsMasked);
 
   // We currently Support only fully-interleaved groups, with no gaps.
   // TODO: Support also strided loads (interleaved-groups with gaps).
@@ -2832,7 +2837,12 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
                                                  unsigned Factor,
                                                  ArrayRef<unsigned> Indices,
                                                  unsigned Alignment,
-                                                 unsigned AddressSpace) {
+                                                 unsigned AddressSpace,
+                                                 bool IsMasked) {
+
+  if (IsMasked)
+    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                             Alignment, AddressSpace, IsMasked);
 
   // VecTy for interleave memop is <VF*Factor x Elt>.
   // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
@@ -2950,7 +2960,8 @@ int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                            unsigned Factor,
                                            ArrayRef<unsigned> Indices,
                                            unsigned Alignment,
-                                           unsigned AddressSpace) {
+                                           unsigned AddressSpace,
+                                           bool IsMasked) {
   auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) {
     Type *EltTy = VecTy->getVectorElementType();
     if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
@@ -2962,11 +2973,11 @@ int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
   };
   if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
     return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
-                                            Alignment, AddressSpace);
+                                            Alignment, AddressSpace, IsMasked);
   if (ST->hasAVX2())
     return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices,
-                                          Alignment, AddressSpace);
+                                          Alignment, AddressSpace, IsMasked);
 
   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                           Alignment, AddressSpace);
+                                           Alignment, AddressSpace, IsMasked);
 }
diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h
index 3df89903882..2bd778a4211 100644
--- a/lib/Target/X86/X86TargetTransformInfo.h
+++ b/lib/Target/X86/X86TargetTransformInfo.h
@@ -101,13 +101,16 @@ public:
 
   int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                  unsigned Factor, ArrayRef<unsigned> Indices,
-                                 unsigned Alignment, unsigned AddressSpace);
+                                 unsigned Alignment, unsigned AddressSpace,
+                                 bool IsMasked = false);
   int getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
                                  unsigned Factor, ArrayRef<unsigned> Indices,
-                                 unsigned Alignment, unsigned AddressSpace);
+                                 unsigned Alignment, unsigned AddressSpace,
+                                 bool IsMasked = false);
   int getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
                                  unsigned Factor, ArrayRef<unsigned> Indices,
-                                 unsigned Alignment, unsigned AddressSpace);
+                                 unsigned Alignment, unsigned AddressSpace,
+                                 bool IsMasked = false);
 
   int getIntImmCost(int64_t);
 
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 7ebe8d102b7..e93cfb34156 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -172,6 +172,10 @@ static cl::opt<bool> EnableInterleavedMemAccesses(
     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
 
+static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
+    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
+    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
+
 /// We don't interleave loops with a known constant trip count below this
 /// number.
 static const unsigned TinyTripCountInterleaveThreshold = 128;
@@ -408,8 +412,10 @@ public:
   /// Construct the vector value of a scalarized value \p V one lane at a time.
   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
 
-  /// Try to vectorize the interleaved access group that \p Instr belongs to.
-  void vectorizeInterleaveGroup(Instruction *Instr);
+  /// Try to vectorize the interleaved access group that \p Instr belongs to,
+  /// optionally masking the vector operations if \p BlockInMask is non-null.
+  void vectorizeInterleaveGroup(Instruction *Instr,
+                                VectorParts *BlockInMask = nullptr);
 
   /// Vectorize Load and Store instructions, optionally masking the vector
   /// operations if \p BlockInMask is non-null.
@@ -1112,6 +1118,11 @@ public:
   /// access that can be widened.
   bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
 
+  /// Returns true if \p I is a memory instruction in an interleaved-group
+  /// of memory accesses that can be vectorized with wide vector loads/stores
+  /// and shuffles.
+  bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
+
   /// Check if \p Instr belongs to any interleaved access group.
   bool isAccessInterleaved(Instruction *Instr) {
     return InterleaveInfo.isInterleaved(Instr);
@@ -1946,7 +1957,8 @@ Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
-void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
+void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
+                                                   VectorParts *BlockInMask) {
   const InterleaveGroup *Group = Cost->getInterleavedAccessGroup(Instr);
   assert(Group && "Fail to get an interleaved access group.");
 
@@ -1968,6 +1980,15 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
   SmallVector<Value *, 2> NewPtrs;
   unsigned Index = Group->getIndex(Instr);
 
+  VectorParts Mask;
+  bool IsMaskRequired = BlockInMask;
+  if (IsMaskRequired) {
+    Mask = *BlockInMask;
+    // TODO: extend the masked interleaved-group support to reversed access.
+    assert(!Group->isReverse() && "Reversed masked interleave-group "
+                                  "not supported."); 
+  }
+
   // If the group is reverse, adjust the index to refer to the last vector lane
   // instead of the first. We adjust the index from the first vector lane,
   // rather than directly getting the pointer for lane VF - 1, because the
@@ -2011,8 +2032,19 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
     // For each unroll part, create a wide load for the group.
     SmallVector<Value *, 2> NewLoads;
     for (unsigned Part = 0; Part < UF; Part++) {
-      auto *NewLoad = Builder.CreateAlignedLoad(
-          NewPtrs[Part], Group->getAlignment(), "wide.vec");
+      Instruction *NewLoad;
+      if (IsMaskRequired) {
+        auto *Undefs = UndefValue::get(Mask[Part]->getType());
+        auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
+        Value *ShuffledMask = Builder.CreateShuffleVector(
+            Mask[Part], Undefs, RepMask, "interleaved.mask");
+        NewLoad = Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(), 
+                                           ShuffledMask, UndefVec,
+                                           "wide.masked.vec");
+      }
+      else
+        NewLoad = Builder.CreateAlignedLoad(NewPtrs[Part], 
+          Group->getAlignment(), "wide.vec");
       Group->addMetadata(NewLoad);
       NewLoads.push_back(NewLoad);
     }
@@ -2079,8 +2111,18 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
     Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
                                               "interleaved.vec");
 
-    Instruction *NewStoreInstr =
-        Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment());
+    Instruction *NewStoreInstr;
+    if (IsMaskRequired) {
+      auto *Undefs = UndefValue::get(Mask[Part]->getType());
+      auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
+      Value *ShuffledMask = Builder.CreateShuffleVector(
+          Mask[Part], Undefs, RepMask, "interleaved.mask");
+      NewStoreInstr = Builder.CreateMaskedStore(
+          IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask);
+    }
+    else
+      NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part], 
+        Group->getAlignment());
 
     Group->addMetadata(NewStoreInstr);
   }
@@ -4253,6 +4295,32 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigne
   return false;
 }
 
+static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
+  if (!(EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0))
+    return TTI.enableMaskedInterleavedAccessVectorization();
+
+  // If an override option has been passed in for interleaved accesses, use it.
+  return EnableMaskedInterleavedMemAccesses;
+}
+
+bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
+                                                               unsigned VF) {
+  assert(isAccessInterleaved(I) && "Expecting interleaved access.");
+  assert(getWideningDecision(I, VF) == CM_Unknown &&
+         "Decision should not be set yet.");
+
+  if (!Legal->blockNeedsPredication(I->getParent()) ||
+      !Legal->isMaskRequired(I))
+    return true;
+
+  if (!useMaskedInterleavedAccesses(TTI))
+    return false;
+
+  auto *Ty = getMemInstValueType(I);
+  return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty) 
+                          : TTI.isLegalMaskedStore(Ty);
+}
+
 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
                                                                unsigned VF) {
   // Get and ensure we have a valid memory instruction.
@@ -5371,13 +5439,17 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
   }
 
   // Calculate the cost of the whole interleaved group.
-  unsigned Cost = TTI.getInterleavedMemoryOpCost(I->getOpcode(), WideVecTy,
-                                                 Group->getFactor(), Indices,
-                                                 Group->getAlignment(), AS);
-
-  if (Group->isReverse())
+  unsigned Cost = TTI.getInterleavedMemoryOpCost(
+      I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
+      Group->getAlignment(), AS, Legal->isMaskRequired(I));
+
+  if (Group->isReverse()) {
+    // TODO: Add support for reversed masked interleaved access.
+    assert(!Legal->isMaskRequired(I) && 
+           "Reverse masked interleaved access not supported.");
     Cost += Group->getNumMembers() *
             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
+  }
   return Cost;
 }
 
@@ -5479,7 +5551,8 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
           continue;
 
         NumAccesses = Group->getNumMembers();
-        InterleaveCost = getInterleaveGroupCost(&I, VF);
+        if (interleavedAccessCanBeWidened(&I, VF))
+          InterleaveCost = getInterleaveGroupCost(&I, VF);
       }
 
       unsigned GatherScatterCost =
@@ -6152,7 +6225,8 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
 }
 
 VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I,
-                                                           VFRange &Range) {
+                                                           VFRange &Range,
+                                                           VPlanPtr &Plan) {
   const InterleaveGroup *IG = CM.getInterleavedAccessGroup(I);
   if (!IG)
     return nullptr;
@@ -6174,7 +6248,11 @@ VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I,
   assert(I == IG->getInsertPos() &&
          "Generating a recipe for an adjunct member of an interleave group");
 
-  return new VPInterleaveRecipe(IG);
+  VPValue *Mask = nullptr;
+  if (Legal->isMaskRequired(I))
+    Mask = createBlockInMask(I->getParent(), Plan);
+
+  return new VPInterleaveRecipe(IG, Mask);
 }
 
 VPWidenMemoryInstructionRecipe *
@@ -6442,7 +6520,7 @@ bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
   VPRecipeBase *Recipe = nullptr;
   // Check if Instr should belong to an interleave memory recipe, or already
   // does. In the latter case Instr is irrelevant.
-  if ((Recipe = tryToInterleaveMemory(Instr, Range))) {
+  if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) {
     VPBB->appendRecipe(Recipe);
     return true;
   }
@@ -6669,6 +6747,10 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
   O << " +\n"
     << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
   IG->getInsertPos()->printAsOperand(O, false);
+  if (User) {
+    O << ", ";
+    User->getOperand(0)->printAsOperand(O);
+  }
   O << "\\l\"";
   for (unsigned i = 0; i < IG->getFactor(); ++i)
     if (Instruction *I = IG->getMember(i))
@@ -6731,7 +6813,15 @@ void VPBlendRecipe::execute(VPTransformState &State) {
 
 void VPInterleaveRecipe::execute(VPTransformState &State) {
   assert(!State.Instance && "Interleave group being replicated.");
-  State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());
+  if (!User)
+    return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());
+
+  // Last (and currently only) operand is a mask.
+  InnerLoopVectorizer::VectorParts MaskValues(State.UF);
+  VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
+  for (unsigned Part = 0; Part < State.UF; ++Part)
+    MaskValues[Part] = State.get(Mask, Part);
+  State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues);
 }
 
 void VPReplicateRecipe::execute(VPTransformState &State) {
@@ -7030,7 +7120,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
 
   // Analyze interleaved memory accesses.
   if (UseInterleaved) {
-    IAI.analyzeInterleaving();
+    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
   }
 
   // Use the cost model.
diff --git a/lib/Transforms/Vectorize/VPRecipeBuilder.h b/lib/Transforms/Vectorize/VPRecipeBuilder.h
index f43a8bb123b..15d38ac9c84 100644
--- a/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -69,7 +69,8 @@ public:
   /// \return value is <true, nullptr>, as it is handled by another recipe.
   /// \p Range.End may be decreased to ensure same decision from \p Range.Start
   /// to \p Range.End.
-  VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range);
+  VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range,
+                                            VPlanPtr &Plan);
 
   /// Check if \I is a memory instruction to be widened for \p Range.Start and
   /// potentially masked. Such instructions are handled by a recipe that takes
diff --git a/lib/Transforms/Vectorize/VPlan.h b/lib/Transforms/Vectorize/VPlan.h
index c3123b41600..81b1986c97d 100644
--- a/lib/Transforms/Vectorize/VPlan.h
+++ b/lib/Transforms/Vectorize/VPlan.h
@@ -769,10 +769,14 @@ public:
 class VPInterleaveRecipe : public VPRecipeBase {
 private:
   const InterleaveGroup *IG;
+  std::unique_ptr<VPUser> User;
 
 public:
-  VPInterleaveRecipe(const InterleaveGroup *IG)
-      : VPRecipeBase(VPInterleaveSC), IG(IG) {}
+  VPInterleaveRecipe(const InterleaveGroup *IG, VPValue *Mask)
+      : VPRecipeBase(VPInterleaveSC), IG(IG) {
+    if (Mask) // Create a VPInstruction to register as a user of the mask.
+      User.reset(new VPUser({Mask}));
+  }
   ~VPInterleaveRecipe() override = default;
 
   /// Method to support type inquiry through isa, cast, and dyn_cast.
diff --git a/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
new file mode 100644
index 00000000000..b1163d0a199
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
@@ -0,0 +1,164 @@
+; RUN: opt -mcpu=skx -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=DISABLED_MASKED_STRIDED 
+; RUN: opt -mcpu=skx -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses  -enable-masked-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=ENABLED_MASKED_STRIDED 
+
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386-unknown-linux-gnu"
+
+; When masked-interleaved-groups are disabled:
+; Check that the predicated load is not vectorized as an
+; interleaved-group but rather as a scalarized accesses.
+; (For SKX, Gather is not supported by the compiler for chars, therefore
+;  the only remaining alternative is to scalarize).
+; When  masked-interleave-group is enabled we expect to find the proper mask
+; shuffling code, feeding the wide masked load for an interleave-group (with
+; a single member).
+;
+; void masked_strided1(const unsigned char* restrict p,
+;                      unsigned char* restrict q,
+;                      unsigned char guard) {
+;   for(ix=0; ix < 1024; ++ix) {
+;     if (ix > guard) {
+;         char t = p[2*ix];
+;         q[ix] = t;
+;     }
+;   }
+; }
+
+;DISABLED_MASKED_STRIDED-LABEL: @masked_strided1(
+;DISABLED_MASKED_STRIDED: vector.body:
+;DISABLED_MASKED_STRIDED-NEXT:  %index = phi i32 
+;DISABLED_MASKED_STRIDED-NEXT:  %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+;DISABLED_MASKED_STRIDED-NOT:   %interleaved.mask =
+;DISABLED_MASKED_STRIDED-NOT:   call void @llvm.masked.load.
+;DISABLED_MASKED_STRIDED-NOT:   %{{.*}} = shufflevector <16 x i8> %[[WIDEVEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;DISABLED_MASKED_STRIDED:       %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
+;DISABLED_MASKED_STRIDED-NEXT:  %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+;DISABLED_MASKED_STRIDED-NEXT:  %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0
+;DISABLED_MASKED_STRIDED-NEXT:  br i1 %[[M]], label %pred.load.if, label %pred.load.continue
+;DISABLED_MASKED_STRIDED-NOT:   %interleaved.mask =
+;DISABLED_MASKED_STRIDED-NOT:   call void @llvm.masked.load.
+;DISABLED_MASKED_STRIDED-NOT:   %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+
+;ENABLED_MASKED_STRIDED-LABEL: @masked_strided1(
+;ENABLED_MASKED_STRIDED: vector.body:
+;ENABLED_MASKED_STRIDED-NEXT:  %index = phi i32 
+;ENABLED_MASKED_STRIDED-NEXT:  %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+;ENABLED_MASKED_STRIDED:       %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
+;ENABLED_MASKED_STRIDED:       %interleaved.mask = shufflevector <8 x i1> %[[VMASK]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+;ENABLED_MASKED_STRIDED-NEXT:  %[[WIDEMASKEDLOAD:.+]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask, <16 x i8> undef)
+;ENABLED_MASKED_STRIDED-NEXT:  %[[STRIDEDVEC:.+]] = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+
+define dso_local void @masked_strided1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr {
+entry:
+  %conv = zext i8 %guard to i32
+  br label %for.body
+
+for.body:
+  %ix.09 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp1 = icmp ugt i32 %ix.09, %conv
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %mul = shl nuw nsw i32 %ix.09, 1
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.09
+  store i8 %0, i8* %arrayidx3, align 1
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %ix.09, 1
+  %exitcond = icmp eq i32 %inc, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; Check also a scenario with full interleave-groups (no gaps) as well as both
+; load and store groups. We check that when masked-interleave-group is disabled
+; the predicated loads (and stores) are not vectorized as an
+; interleaved-group but rather as four separate scalarized accesses.
+; (For SKX, gather/scatter is not supported by the compiler for chars, therefore
+; the only remaining alternative is to scalarize).
+; When  masked-interleave-group is enabled we expect to find the proper mask
+; shuffling code, feeding the wide masked load/store for the two interleave-
+; groups.
+;
+; void masked_strided2(const unsigned char* restrict p,
+;                     unsigned char* restrict q,
+;                     unsigned char guard) {
+; for(ix=0; ix < 1024; ++ix) {
+;     if (ix > guard) {
+;         char left = p[2*ix];
+;         char right = p[2*ix + 1];
+;         char max = max(left, right);
+;         q[2*ix] = max;
+;         q[2*ix+1] = 0 - max;
+;     }
+; }
+;}
+
+;DISABLED_MASKED_STRIDED-LABEL: @masked_strided2(
+;DISABLED_MASKED_STRIDED: vector.body:
+;DISABLED_MASKED_STRIDED-NEXT:  %index = phi i32 
+;DISABLED_MASKED_STRIDED-NEXT:  %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+;DISABLED_MASKED_STRIDED-NOT:   %interleaved.mask =
+;DISABLED_MASKED_STRIDED-NOT:   call void @llvm.masked.load.
+;DISABLED_MASKED_STRIDED-NOT:   call void @llvm.masked.store.
+;DISABLED_MASKED_STRIDED-NOT:   %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;DISABLED_MASKED_STRIDED:        %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
+;DISABLED_MASKED_STRIDED-NEXT:  %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+;DISABLED_MASKED_STRIDED-NEXT:  %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0
+;DISABLED_MASKED_STRIDED-NEXT:  br i1 %[[M]], label %pred.load.if, label %pred.load.continue
+;DISABLED_MASKED_STRIDED-NOT:   %interleaved.mask =
+;DISABLED_MASKED_STRIDED-NOT:   call void @llvm.masked.load.
+;DISABLED_MASKED_STRIDED-NOT:   call void @llvm.masked.store.
+;DISABLED_MASKED_STRIDED-NOT:   %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+
+;ENABLED_MASKED_STRIDED-LABEL: @masked_strided2(
+;ENABLED_MASKED_STRIDED: vector.body:
+;ENABLED_MASKED_STRIDED-NEXT:  %index = phi i32
+;ENABLED_MASKED_STRIDED-NEXT:  %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+;ENABLED_MASKED_STRIDED:       %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
+;ENABLED_MASKED_STRIDED:       %interleaved.mask = shufflevector <8 x i1> %[[VMASK]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+;ENABLED_MASKED_STRIDED-NEXT:  %[[WIDEMASKEDLOAD:.+]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask, <16 x i8> undef)
+;ENABLED_MASKED_STRIDED-NEXT:  %{{.*}} = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;ENABLED_MASKED_STRIDED-NEXT:  %{{.*}} = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+;ENABLED_MASKED_STRIDED:       call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %{{.*}}, <16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask)
+
+; Function Attrs: norecurse nounwind
+define dso_local void @masked_strided2(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr  {
+entry:
+  %conv = zext i8 %guard to i32
+  br label %for.body
+
+for.body:
+  %ix.024 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp1 = icmp ugt i32 %ix.024, %conv
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %mul = shl nuw nsw i32 %ix.024, 1
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %add = or i32 %mul, 1
+  %arrayidx4 = getelementptr inbounds i8, i8* %p, i32 %add
+  %1 = load i8, i8* %arrayidx4, align 1
+  %cmp.i = icmp slt i8 %0, %1
+  %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0
+  %arrayidx6 = getelementptr inbounds i8, i8* %q, i32 %mul
+  store i8 %spec.select.i, i8* %arrayidx6, align 1
+  %sub = sub i8 0, %spec.select.i
+  %arrayidx11 = getelementptr inbounds i8, i8* %q, i32 %add
+  store i8 %sub, i8* %arrayidx11, align 1
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %ix.024, 1
+  %exitcond = icmp eq i32 %inc, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
diff --git a/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll b/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll
new file mode 100644
index 00000000000..9ed66a22dbf
--- /dev/null
+++ b/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll
@@ -0,0 +1,222 @@
+; REQUIRES: asserts
+; RUN: opt -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -debug-only=loop-vectorize,vectorutils -disable-output < %s 2>&1 | FileCheck %s -check-prefix=STRIDED_UNMASKED
+; RUN: opt -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses -debug-only=loop-vectorize,vectorutils -disable-output < %s 2>&1 | FileCheck %s -check-prefix=STRIDED_MASKED
+
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+
+; We test here that the loop-vectorizer forms an interleave-groups from 
+; predicated memory accesses only if they are both in the same (predicated)
+; block (first scenario below).
+; If the accesses are not in the same predicated block, an interleave-group
+; is not formed (scenarios 2,3 below).
+
+; Scenario 1: Check the case where it is legal to create masked interleave-
+; groups. Altogether two groups are created (one for loads and one for stores)
+; when masked-interleaved-acceses are enabled. When masked-interleaved-acceses
+; are disabled we do not create any interleave-group.
+;
+; void masked_strided1(const unsigned char* restrict p,
+;                     unsigned char* restrict q,
+;                     unsigned char guard) {
+; for(ix=0; ix < 1024; ++ix) {
+;     if (ix > guard) {
+;         char left = p[2*ix];
+;         char right = p[2*ix + 1];
+;         char max = max(left, right);
+;         q[2*ix] = max;
+;         q[2*ix+1] = 0 - max;
+;     }
+; }
+;}
+
+
+; STRIDED_UNMASKED: LV: Checking a loop in "masked_strided1" 
+; STRIDED_UNMASKED: LV: Analyzing interleaved accesses...
+; STRIDED_UNMASKED-NOT: LV: Creating an interleave group 
+
+; STRIDED_MASKED: LV: Checking a loop in "masked_strided1" 
+; STRIDED_MASKED: LV: Analyzing interleaved accesses...
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:  store i8 %{{.*}}, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Inserted:  store i8  %{{.*}}, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT:     into the interleave group with  store i8 %{{.*}}, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:   %{{.*}} = load i8, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Inserted:  %{{.*}} = load i8, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT:     into the interleave group with   %{{.*}} = load i8, i8* %{{.*}}, align 1
+
+; Scenario 2: Check the case where it is illegal to create a masked interleave-
+; group because the first access is predicated, and the second isn't.
+; We therefore create a separate interleave-group with gaps for each of the
+; stores (if masked-interleaved-accesses are enabled) and these are later
+; invalidated because interleave-groups of stores with gaps are not supported. 
+; If masked-interleaved-accesses is not enabled we create only one interleave
+; group of stores (for the non-predicated store) and it is later invalidated
+; due to gaps.
+;
+; void masked_strided2(const unsigned char* restrict p,
+;                     unsigned char* restrict q,
+;                     unsigned char guard1,
+;                     unsigned char guard2) {
+; for(ix=0; ix < 1024; ++ix) {
+;     if (ix > guard1) {
+;         q[2*ix] = 1;
+;     }
+;     q[2*ix+1] = 2;
+; }
+;}
+
+; STRIDED_UNMASKED: LV: Checking a loop in "masked_strided2" 
+; STRIDED_UNMASKED: LV: Analyzing interleaved accesses...
+; STRIDED_UNMASKED-NEXT: LV: Creating an interleave group with:  store i8 1, i8* %{{.*}}, align 1
+; STRIDED_UNMASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
+; STRIDED_UNMASKED-NOT: LV: Creating an interleave group 
+
+; STRIDED_MASKED: LV: Checking a loop in "masked_strided2" 
+; STRIDED_MASKED: LV: Analyzing interleaved accesses...
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:  store i8 2, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:  store i8 1, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
+; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
+
+
+; Scenario 3: Check the case where it is illegal to create a masked interleave-
+; group because the two accesses are in separate predicated blocks.
+; We therefore create a separate interleave-group with gaps for each of the accesses,
+; (which are later invalidated because interleave-groups of stores with gaps are 
+; not supported).
+; If masked-interleaved-accesses is not enabled we don't create any interleave
+; group because all accesses are predicated.
+;
+; void masked_strided3(const unsigned char* restrict p,
+;                     unsigned char* restrict q,
+;                     unsigned char guard1,
+;                     unsigned char guard2) {
+; for(ix=0; ix < 1024; ++ix) {
+;     if (ix > guard1) {
+;         q[2*ix] = 1;
+;     }
+;     if (ix > guard2) {
+;         q[2*ix+1] = 2;
+;     }
+; }
+;}
+
+
+; STRIDED_UNMASKED: LV: Checking a loop in "masked_strided3" 
+; STRIDED_UNMASKED: LV: Analyzing interleaved accesses...
+; STRIDED_UNMASKED-NOT: LV: Creating an interleave group 
+
+; STRIDED_MASKED: LV: Checking a loop in "masked_strided3" 
+; STRIDED_MASKED: LV: Analyzing interleaved accesses...
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:  store i8 2, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:  store i8 1, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
+; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
+
+
+; ModuleID = 'test.c'
+source_filename = "test.c"
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386-unknown-linux-gnu"
+
+define dso_local void @masked_strided1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr #0 {
+entry:
+  %conv = zext i8 %guard to i32
+  br label %for.body
+
+for.body:
+  %ix.024 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp1 = icmp ugt i32 %ix.024, %conv
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %mul = shl nuw nsw i32 %ix.024, 1
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %add = or i32 %mul, 1
+  %arrayidx4 = getelementptr inbounds i8, i8* %p, i32 %add
+  %1 = load i8, i8* %arrayidx4, align 1
+  %cmp.i = icmp slt i8 %0, %1
+  %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0
+  %arrayidx6 = getelementptr inbounds i8, i8* %q, i32 %mul
+  store i8 %spec.select.i, i8* %arrayidx6, align 1
+  %sub = sub i8 0, %spec.select.i
+  %arrayidx11 = getelementptr inbounds i8, i8* %q, i32 %add
+  store i8 %sub, i8* %arrayidx11, align 1
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %ix.024, 1
+  %exitcond = icmp eq i32 %inc, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+
+define dso_local void @masked_strided2(i8* noalias nocapture readnone %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr #0 {
+entry:
+  %conv = zext i8 %guard to i32
+  br label %for.body
+
+for.body:
+  %ix.012 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %mul = shl nuw nsw i32 %ix.012, 1
+  %arrayidx = getelementptr inbounds i8, i8* %q, i32 %mul
+  store i8 1, i8* %arrayidx, align 1
+  %cmp1 = icmp ugt i32 %ix.012, %conv
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %add = or i32 %mul, 1
+  %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %add
+  store i8 2, i8* %arrayidx3, align 1
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %ix.012, 1
+  %exitcond = icmp eq i32 %inc, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+
+define dso_local void @masked_strided3(i8* noalias nocapture readnone %p, i8* noalias nocapture %q, i8 zeroext %guard1, i8 zeroext %guard2) local_unnamed_addr #0 {
+entry:
+  %conv = zext i8 %guard1 to i32
+  %conv3 = zext i8 %guard2 to i32
+  br label %for.body
+
+for.body:
+  %ix.018 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %mul = shl nuw nsw i32 %ix.018, 1
+  %cmp1 = icmp ugt i32 %ix.018, %conv
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:
+  %arrayidx = getelementptr inbounds i8, i8* %q, i32 %mul
+  store i8 1, i8* %arrayidx, align 1
+  br label %if.end
+
+if.end:
+  %cmp4 = icmp ugt i32 %ix.018, %conv3
+  br i1 %cmp4, label %if.then6, label %for.inc
+
+if.then6:
+  %add = or i32 %mul, 1
+  %arrayidx7 = getelementptr inbounds i8, i8* %q, i32 %add
+  store i8 2, i8* %arrayidx7, align 1
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %ix.018, 1
+  %exitcond = icmp eq i32 %inc, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+attributes #0 = {  "target-features"="+fxsr,+mmx,+sse,+sse2,+x87"  }
diff --git a/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll b/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll
index 89c0ac10916..c647f586b18 100644
--- a/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll
+++ b/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=2 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s
+; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=2 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 %pair = type { i64, i64 }
-- 
GitLab


From 473da03560118802fa66f089afeda9d0b38b2ab4 Mon Sep 17 00:00:00 2001
From: Dorit Nuzman <dorit.nuzman@intel.com>
Date: Sun, 14 Oct 2018 07:21:20 +0000
Subject: [PATCH 0165/1116] revert  344472 due to failures.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344473 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/TargetTransformInfo.h   |  19 +-
 .../llvm/Analysis/TargetTransformInfoImpl.h   |   6 +-
 include/llvm/Analysis/VectorUtils.h           |  21 +-
 include/llvm/CodeGen/BasicTTIImpl.h           |  38 +--
 lib/Analysis/TargetTransformInfo.cpp          |  10 +-
 lib/Analysis/VectorUtils.cpp                  |  29 +--
 .../AArch64/AArch64TargetTransformInfo.cpp    |   7 +-
 .../AArch64/AArch64TargetTransformInfo.h      |   2 +-
 lib/Target/ARM/ARMTargetTransformInfo.cpp     |   8 +-
 lib/Target/ARM/ARMTargetTransformInfo.h       |   2 +-
 .../Hexagon/HexagonTargetTransformInfo.cpp    |   6 +-
 .../Hexagon/HexagonTargetTransformInfo.h      |   2 +-
 lib/Target/PowerPC/PPCTargetTransformInfo.cpp |   7 +-
 lib/Target/PowerPC/PPCTargetTransformInfo.h   |   3 +-
 .../SystemZ/SystemZTargetTransformInfo.cpp    |   6 +-
 .../SystemZ/SystemZTargetTransformInfo.h      |   2 +-
 lib/Target/X86/X86TargetTransformInfo.cpp     |  23 +-
 lib/Target/X86/X86TargetTransformInfo.h       |   9 +-
 lib/Transforms/Vectorize/LoopVectorize.cpp    | 126 ++--------
 lib/Transforms/Vectorize/VPRecipeBuilder.h    |   3 +-
 lib/Transforms/Vectorize/VPlan.h              |   8 +-
 .../x86-interleaved-accesses-masked-group.ll  | 164 -------------
 .../interleaved-accesses-masked-group.ll      | 222 ------------------
 .../interleaved-accesses-pred-stores.ll       |   1 -
 24 files changed, 70 insertions(+), 654 deletions(-)
 delete mode 100644 test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
 delete mode 100644 test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll

diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h
index c2a9d1ec195..18b5a5cf0e5 100644
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h
@@ -587,10 +587,6 @@ public:
   /// Enable matching of interleaved access groups.
   bool enableInterleavedAccessVectorization() const;
 
-  /// Enable matching of interleaved access groups that contain predicated 
-  /// accesses and are vectorized using masked vector loads/stores.
-  bool enableMaskedInterleavedAccessVectorization() const;
-
   /// Indicate that it is potentially unsafe to automatically vectorize
   /// floating-point operations because the semantics of vector and scalar
   /// floating-point semantics may differ. For example, ARM NEON v7 SIMD math
@@ -825,11 +821,9 @@ public:
   ///    load allows gaps)
   /// \p Alignment is the alignment of the memory operation
   /// \p AddressSpace is address space of the pointer.
-  /// \p IsMasked indicates if the memory access is predicated.
   int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
                                  ArrayRef<unsigned> Indices, unsigned Alignment,
-                                 unsigned AddressSpace, 
-                                 bool IsMasked = false) const;
+                                 unsigned AddressSpace) const;
 
   /// Calculate the cost of performing a vector reduction.
   ///
@@ -1078,7 +1072,6 @@ public:
   virtual const MemCmpExpansionOptions *enableMemCmpExpansion(
       bool IsZeroCmp) const = 0;
   virtual bool enableInterleavedAccessVectorization() = 0;
-  virtual bool enableMaskedInterleavedAccessVectorization() = 0;
   virtual bool isFPVectorizationPotentiallyUnsafe() = 0;
   virtual bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
                                               unsigned BitWidth,
@@ -1139,8 +1132,7 @@ public:
                                          unsigned Factor,
                                          ArrayRef<unsigned> Indices,
                                          unsigned Alignment,
-                                         unsigned AddressSpace,
-                                         bool IsMasked = false) = 0;
+                                         unsigned AddressSpace) = 0;
   virtual int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
                                          bool IsPairwiseForm) = 0;
   virtual int getMinMaxReductionCost(Type *Ty, Type *CondTy,
@@ -1354,9 +1346,6 @@ public:
   bool enableInterleavedAccessVectorization() override {
     return Impl.enableInterleavedAccessVectorization();
   }
-  bool enableMaskedInterleavedAccessVectorization() override {
-    return Impl.enableMaskedInterleavedAccessVectorization();
-  }
   bool isFPVectorizationPotentiallyUnsafe() override {
     return Impl.isFPVectorizationPotentiallyUnsafe();
   }
@@ -1482,9 +1471,9 @@ public:
   }
   int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
                                  ArrayRef<unsigned> Indices, unsigned Alignment,
-                                 unsigned AddressSpace, bool IsMasked) override {
+                                 unsigned AddressSpace) override {
     return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                           Alignment, AddressSpace, IsMasked);
+                                           Alignment, AddressSpace);
   }
   int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
                                  bool IsPairwiseForm) override {
diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h
index c64d4d36805..e39fe66c0a4 100644
--- a/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -313,8 +313,6 @@ public:
 
   bool enableInterleavedAccessVectorization() { return false; }
 
-  bool enableMaskedInterleavedAccessVectorization() { return false; }
-
   bool isFPVectorizationPotentiallyUnsafe() { return false; }
 
   bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
@@ -452,8 +450,8 @@ public:
   unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                       unsigned Factor,
                                       ArrayRef<unsigned> Indices,
-                                      unsigned Alignment, unsigned AddressSpace,
-                                      bool IsMasked = false) {
+                                      unsigned Alignment,
+                                      unsigned AddressSpace) {
     return 1;
   }
 
diff --git a/include/llvm/Analysis/VectorUtils.h b/include/llvm/Analysis/VectorUtils.h
index 2ac49f67662..622d932f74f 100644
--- a/include/llvm/Analysis/VectorUtils.h
+++ b/include/llvm/Analysis/VectorUtils.h
@@ -125,21 +125,6 @@ computeMinimumValueSizes(ArrayRef<BasicBlock*> Blocks,
 /// This function always sets a (possibly null) value for each K in Kinds.
 Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL);
 
-/// Create a mask with replicated elements.
-///
-/// This function creates a shuffle mask for replicating each of the \p VF 
-/// elements in a vector \p ReplicationFactor times. It can be used to
-/// transform a mask of \p VF elements into a mask of
-/// \p VF * \p ReplicationFactor elements used by a predicated
-/// interleaved-group of loads/stores whose Interleaved-factor ==
-/// \p ReplicationFactor.
-///
-/// For example, the mask for \p ReplicationFactor=3 and \p VF=4 is:
-///
-///   <0,0,0,1,1,1,2,2,2,3,3,3>
-Constant *createReplicatedMask(IRBuilder<> &Builder, unsigned ReplicationFactor,
-                               unsigned VF);
-
 /// Create an interleave shuffle mask.
 ///
 /// This function creates a shuffle mask for interleaving \p NumVecs vectors of
@@ -343,7 +328,7 @@ public:
   InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
                         DominatorTree *DT, LoopInfo *LI,
                         const LoopAccessInfo *LAI)
-      : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(LAI) {}
+    : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(LAI) {}
 
   ~InterleavedAccessInfo() {
     SmallPtrSet<InterleaveGroup *, 4> DelSet;
@@ -356,9 +341,7 @@ public:
 
   /// Analyze the interleaved accesses and collect them in interleave
   /// groups. Substitute symbolic strides using \p Strides.
-  /// Consider also predicated loads/stores in the analysis if
-  /// \p EnableMaskedInterleavedGroup is true.
-  void analyzeInterleaving(bool EnableMaskedInterleavedGroup);
+  void analyzeInterleaving();
 
   /// Check if \p Instr belongs to any interleave group.
   bool isInterleaved(Instruction *Instr) const {
diff --git a/include/llvm/CodeGen/BasicTTIImpl.h b/include/llvm/CodeGen/BasicTTIImpl.h
index e740fe57172..b460cdc0ba1 100644
--- a/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/include/llvm/CodeGen/BasicTTIImpl.h
@@ -783,8 +783,8 @@ public:
   unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                       unsigned Factor,
                                       ArrayRef<unsigned> Indices,
-                                      unsigned Alignment, unsigned AddressSpace,
-                                      bool IsMasked = false) {
+                                      unsigned Alignment,
+                                      unsigned AddressSpace) {
     VectorType *VT = dyn_cast<VectorType>(VecTy);
     assert(VT && "Expect a vector type for interleaved memory op");
 
@@ -795,13 +795,8 @@ public:
     VectorType *SubVT = VectorType::get(VT->getElementType(), NumSubElts);
 
     // Firstly, the cost of load/store operation.
-    unsigned Cost;
-    if (IsMasked)
-      Cost = static_cast<T *>(this)->getMaskedMemoryOpCost(
-          Opcode, VecTy, Alignment, AddressSpace);
-    else
-      Cost = static_cast<T *>(this)->getMemoryOpCost(Opcode, VecTy, Alignment,
-                                                     AddressSpace);
+    unsigned Cost = static_cast<T *>(this)->getMemoryOpCost(
+        Opcode, VecTy, Alignment, AddressSpace);
 
     // Legalize the vector type, and get the legalized and unlegalized type
     // sizes.
@@ -897,31 +892,6 @@ public:
                     ->getVectorInstrCost(Instruction::InsertElement, VT, i);
     }
 
-    if (!IsMasked)
-      return Cost;
-
-    Type *I8Type = Type::getInt8Ty(VT->getContext());
-    VectorType *MaskVT = VectorType::get(I8Type, NumElts);
-    SubVT = VectorType::get(I8Type, NumSubElts);
-
-    // The Mask shuffling cost is extract all the elements of the Mask
-    // and insert each of them Factor times into the wide vector:
-    //
-    // E.g. an interleaved group with factor 3:
-    //    %mask = icmp ult <8 x i32> %vec1, %vec2
-    //    %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
-    //        <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
-    // The cost is estimated as extract all mask elements from the <8xi1> mask
-    // vector and insert them factor times into the <24xi1> shuffled mask
-    // vector.
-    for (unsigned i = 0; i < NumSubElts; i++)
-      Cost += static_cast<T *>(this)->getVectorInstrCost(
-          Instruction::ExtractElement, SubVT, i);
-
-    for (unsigned i = 0; i < NumElts; i++)
-      Cost += static_cast<T *>(this)->getVectorInstrCost(
-          Instruction::InsertElement, MaskVT, i);
-
     return Cost;
   }
 
diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp
index 867403d0ef1..4ad48e351a4 100644
--- a/lib/Analysis/TargetTransformInfo.cpp
+++ b/lib/Analysis/TargetTransformInfo.cpp
@@ -268,10 +268,6 @@ bool TargetTransformInfo::enableInterleavedAccessVectorization() const {
   return TTIImpl->enableInterleavedAccessVectorization();
 }
 
-bool TargetTransformInfo::enableMaskedInterleavedAccessVectorization() const {
-  return TTIImpl->enableMaskedInterleavedAccessVectorization();
-}
-
 bool TargetTransformInfo::isFPVectorizationPotentiallyUnsafe() const {
   return TTIImpl->isFPVectorizationPotentiallyUnsafe();
 }
@@ -519,9 +515,9 @@ int TargetTransformInfo::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
 
 int TargetTransformInfo::getInterleavedMemoryOpCost(
     unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
-    unsigned Alignment, unsigned AddressSpace, bool IsMasked) const {
-  int Cost = TTIImpl->getInterleavedMemoryOpCost(
-      Opcode, VecTy, Factor, Indices, Alignment, AddressSpace, IsMasked);
+    unsigned Alignment, unsigned AddressSpace) const {
+  int Cost = TTIImpl->getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                                 Alignment, AddressSpace);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
diff --git a/lib/Analysis/VectorUtils.cpp b/lib/Analysis/VectorUtils.cpp
index e14449b8838..272c665ace1 100644
--- a/lib/Analysis/VectorUtils.cpp
+++ b/lib/Analysis/VectorUtils.cpp
@@ -502,16 +502,6 @@ Instruction *llvm::propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
   return Inst;
 }
 
-Constant *llvm::createReplicatedMask(IRBuilder<> &Builder, 
-                                     unsigned ReplicationFactor, unsigned VF) {
-  SmallVector<Constant *, 16> MaskVec;
-  for (unsigned i = 0; i < VF; i++)
-    for (unsigned j = 0; j < ReplicationFactor; j++)
-      MaskVec.push_back(Builder.getInt32(i));
-
-  return ConstantVector::get(MaskVec);
-}
-
 Constant *llvm::createInterleaveMask(IRBuilder<> &Builder, unsigned VF,
                                      unsigned NumVecs) {
   SmallVector<Constant *, 16> Mask;
@@ -682,8 +672,7 @@ void InterleavedAccessInfo::collectConstStrideAccesses(
 // this group because it and (2) are dependent. However, (1) can be grouped
 // with other accesses that may precede it in program order. Note that a
 // bottom-up order does not imply that WAW dependences should not be checked.
-void InterleavedAccessInfo::analyzeInterleaving(
-                                 bool EnablePredicatedInterleavedMemAccesses) {
+void InterleavedAccessInfo::analyzeInterleaving() {
   LLVM_DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n");
   const ValueToValueMap &Strides = LAI->getSymbolicStrides();
 
@@ -723,8 +712,9 @@ void InterleavedAccessInfo::analyzeInterleaving(
     // create a group for B, we continue with the bottom-up algorithm to ensure
     // we don't break any of B's dependences.
     InterleaveGroup *Group = nullptr;
-    if (isStrided(DesB.Stride) && 
-        (!isPredicated(B->getParent()) || EnablePredicatedInterleavedMemAccesses)) {
+    // TODO: Ignore B if it is in a predicated block. This restriction can be 
+    // relaxed in the future once we handle masked interleaved groups.
+    if (isStrided(DesB.Stride) && !isPredicated(B->getParent())) {
       Group = getInterleaveGroup(B);
       if (!Group) {
         LLVM_DEBUG(dbgs() << "LV: Creating an interleave group with:" << *B
@@ -818,12 +808,11 @@ void InterleavedAccessInfo::analyzeInterleaving(
       if (DistanceToB % static_cast<int64_t>(DesB.Size))
         continue;
 
-      // All members of a predicated interleave-group must have the same predicate,
-      // and currently must reside in the same BB.
-      BasicBlock *BlockA = A->getParent();  
-      BasicBlock *BlockB = B->getParent();  
-      if ((isPredicated(BlockA) || isPredicated(BlockB)) &&
-          (!EnablePredicatedInterleavedMemAccesses || BlockA != BlockB))
+      // Ignore A if either A or B is in a predicated block. Although we
+      // currently prevent group formation for predicated accesses, we may be
+      // able to relax this limitation in the future once we handle more
+      // complicated blocks.
+      if (isPredicated(A->getParent()) || isPredicated(B->getParent()))
         continue;
 
       // The index of A is the index of B plus A's distance to B in multiples
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index a16de89cf10..96e751e8697 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -659,12 +659,11 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                                unsigned Factor,
                                                ArrayRef<unsigned> Indices,
                                                unsigned Alignment,
-                                               unsigned AddressSpace,
-                                               bool IsMasked) {
+                                               unsigned AddressSpace) {
   assert(Factor >= 2 && "Invalid interleave factor");
   assert(isa<VectorType>(VecTy) && "Expect a vector type");
 
-  if (!IsMasked && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
+  if (Factor <= TLI->getMaxSupportedInterleaveFactor()) {
     unsigned NumElts = VecTy->getVectorNumElements();
     auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
 
@@ -677,7 +676,7 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
   }
 
   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                           Alignment, AddressSpace, IsMasked);
+                                           Alignment, AddressSpace);
 }
 
 int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h
index b3893d32850..c056a7d2428 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -146,7 +146,7 @@ public:
 
   int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
                                  ArrayRef<unsigned> Indices, unsigned Alignment,
-                                 unsigned AddressSpace, bool IsMasked = false);
+                                 unsigned AddressSpace);
 
   bool
   shouldConsiderAddressTypePromotion(const Instruction &I,
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp
index bac3e6c2387..1b0d162f726 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -542,16 +542,14 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                            unsigned Factor,
                                            ArrayRef<unsigned> Indices,
                                            unsigned Alignment,
-                                           unsigned AddressSpace
-                                           bool IsMasked) {
+                                           unsigned AddressSpace) {
   assert(Factor >= 2 && "Invalid interleave factor");
   assert(isa<VectorType>(VecTy) && "Expect a vector type");
 
   // vldN/vstN doesn't support vector types of i64/f64 element.
   bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
 
-  if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
-      !IsMasked) {
+  if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits) {
     unsigned NumElts = VecTy->getVectorNumElements();
     auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
 
@@ -564,7 +562,7 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
   }
 
   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                           Alignment, AddressSpace, IsMasked);
+                                           Alignment, AddressSpace);
 }
 
 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.h b/lib/Target/ARM/ARMTargetTransformInfo.h
index 84e3055c6bc..7d14bd7c256 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -169,7 +169,7 @@ public:
 
   int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
                                  ArrayRef<unsigned> Indices, unsigned Alignment,
-                                 unsigned AddressSpace, bool IsMasked);
+                                 unsigned AddressSpace);
 
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP);
diff --git a/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
index 79b269bccfe..4d0e7dc52e8 100644
--- a/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -206,10 +206,10 @@ unsigned HexagonTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
 
 unsigned HexagonTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode,
       Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
-      unsigned Alignment, unsigned AddressSpace, bool IsMasked) {
-  if (Indices.size() != Factor || IsMasked)
+      unsigned Alignment, unsigned AddressSpace) {
+  if (Indices.size() != Factor)
     return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                             Alignment, AddressSpace, IsMasked);
+                                             Alignment, AddressSpace);
   return getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, nullptr);
 }
 
diff --git a/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/lib/Target/Hexagon/HexagonTargetTransformInfo.h
index 901a91692e8..2c03cd268ff 100644
--- a/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -123,7 +123,7 @@ public:
             bool VariableMask, unsigned Alignment);
   unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
             unsigned Factor, ArrayRef<unsigned> Indices, unsigned Alignment,
-            unsigned AddressSpace, bool IsMasked);
+            unsigned AddressSpace);
   unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
             const Instruction *I);
   unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index 2c81661cb17..b0da9b5a6d7 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -473,12 +473,7 @@ int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                            unsigned Factor,
                                            ArrayRef<unsigned> Indices,
                                            unsigned Alignment,
-                                           unsigned AddressSpace,
-                                           Bool IsMasked) {
-  if (IsMasked)
-    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                             Alignment, AddressSpace, IsMasked);
-
+                                           unsigned AddressSpace) {
   assert(isa<VectorType>(VecTy) &&
          "Expect a vector type for interleaved memory op");
 
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.h b/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 252d46e7a2a..2ee2b3eb808 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -90,8 +90,7 @@ public:
                                  unsigned Factor,
                                  ArrayRef<unsigned> Indices,
                                  unsigned Alignment,
-                                 unsigned AddressSpace,
-                                 bool IsMasked = false);
+                                 unsigned AddressSpace);
 
   /// @}
 };
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 1eaeb9699bf..6f553d5bed3 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -909,11 +909,7 @@ int SystemZTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                                unsigned Factor,
                                                ArrayRef<unsigned> Indices,
                                                unsigned Alignment,
-                                               unsigned AddressSpace,
-                                               bool IsMasked) {
-  if (IsMasked)
-    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                             Alignment, AddressSpace, IsMasked);
+                                               unsigned AddressSpace) {
   assert(isa<VectorType>(VecTy) &&
          "Expect a vector type for interleaved memory op");
 
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index 92b2b9bdcb8..bfa942357c5 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -92,7 +92,7 @@ public:
                                  unsigned Factor,
                                  ArrayRef<unsigned> Indices,
                                  unsigned Alignment,
-                                 unsigned AddressSpace, bool IsMasked = false);
+                                 unsigned AddressSpace);
   /// @}
 };
 
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 82e4dfe25b7..d3a75123935 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2723,12 +2723,7 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
                                                unsigned Factor,
                                                ArrayRef<unsigned> Indices,
                                                unsigned Alignment,
-                                               unsigned AddressSpace,
-                                               bool IsMasked) {
-
-  if (IsMasked)
-    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                             Alignment, AddressSpace, IsMasked);
+                                               unsigned AddressSpace) {
 
   // We currently Support only fully-interleaved groups, with no gaps.
   // TODO: Support also strided loads (interleaved-groups with gaps).
@@ -2837,12 +2832,7 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
                                                  unsigned Factor,
                                                  ArrayRef<unsigned> Indices,
                                                  unsigned Alignment,
-                                                 unsigned AddressSpace,
-                                                 bool IsMasked) {
-
-  if (IsMasked)
-    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                             Alignment, AddressSpace, IsMasked);
+                                                 unsigned AddressSpace) {
 
   // VecTy for interleave memop is <VF*Factor x Elt>.
   // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
@@ -2960,8 +2950,7 @@ int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                            unsigned Factor,
                                            ArrayRef<unsigned> Indices,
                                            unsigned Alignment,
-                                           unsigned AddressSpace,
-                                           bool IsMasked) {
+                                           unsigned AddressSpace) {
   auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) {
     Type *EltTy = VecTy->getVectorElementType();
     if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
@@ -2973,11 +2962,11 @@ int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
   };
   if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
     return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
-                                            Alignment, AddressSpace, IsMasked);
+                                            Alignment, AddressSpace);
   if (ST->hasAVX2())
     return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices,
-                                          Alignment, AddressSpace, IsMasked);
+                                          Alignment, AddressSpace);
 
   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                           Alignment, AddressSpace, IsMasked);
+                                           Alignment, AddressSpace);
 }
diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h
index 2bd778a4211..3df89903882 100644
--- a/lib/Target/X86/X86TargetTransformInfo.h
+++ b/lib/Target/X86/X86TargetTransformInfo.h
@@ -101,16 +101,13 @@ public:
 
   int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                  unsigned Factor, ArrayRef<unsigned> Indices,
-                                 unsigned Alignment, unsigned AddressSpace,
-                                 bool IsMasked = false);
+                                 unsigned Alignment, unsigned AddressSpace);
   int getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
                                  unsigned Factor, ArrayRef<unsigned> Indices,
-                                 unsigned Alignment, unsigned AddressSpace,
-                                 bool IsMasked = false);
+                                 unsigned Alignment, unsigned AddressSpace);
   int getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
                                  unsigned Factor, ArrayRef<unsigned> Indices,
-                                 unsigned Alignment, unsigned AddressSpace,
-                                 bool IsMasked = false);
+                                 unsigned Alignment, unsigned AddressSpace);
 
   int getIntImmCost(int64_t);
 
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index e93cfb34156..7ebe8d102b7 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -172,10 +172,6 @@ static cl::opt<bool> EnableInterleavedMemAccesses(
     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
 
-static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
-    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
-    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
-
 /// We don't interleave loops with a known constant trip count below this
 /// number.
 static const unsigned TinyTripCountInterleaveThreshold = 128;
@@ -412,10 +408,8 @@ public:
   /// Construct the vector value of a scalarized value \p V one lane at a time.
   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
 
-  /// Try to vectorize the interleaved access group that \p Instr belongs to,
-  /// optionally masking the vector operations if \p BlockInMask is non-null.
-  void vectorizeInterleaveGroup(Instruction *Instr,
-                                VectorParts *BlockInMask = nullptr);
+  /// Try to vectorize the interleaved access group that \p Instr belongs to.
+  void vectorizeInterleaveGroup(Instruction *Instr);
 
   /// Vectorize Load and Store instructions, optionally masking the vector
   /// operations if \p BlockInMask is non-null.
@@ -1118,11 +1112,6 @@ public:
   /// access that can be widened.
   bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
 
-  /// Returns true if \p I is a memory instruction in an interleaved-group
-  /// of memory accesses that can be vectorized with wide vector loads/stores
-  /// and shuffles.
-  bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
-
   /// Check if \p Instr belongs to any interleaved access group.
   bool isAccessInterleaved(Instruction *Instr) {
     return InterleaveInfo.isInterleaved(Instr);
@@ -1957,8 +1946,7 @@ Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
-void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
-                                                   VectorParts *BlockInMask) {
+void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
   const InterleaveGroup *Group = Cost->getInterleavedAccessGroup(Instr);
   assert(Group && "Fail to get an interleaved access group.");
 
@@ -1980,15 +1968,6 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
   SmallVector<Value *, 2> NewPtrs;
   unsigned Index = Group->getIndex(Instr);
 
-  VectorParts Mask;
-  bool IsMaskRequired = BlockInMask;
-  if (IsMaskRequired) {
-    Mask = *BlockInMask;
-    // TODO: extend the masked interleaved-group support to reversed access.
-    assert(!Group->isReverse() && "Reversed masked interleave-group "
-                                  "not supported."); 
-  }
-
   // If the group is reverse, adjust the index to refer to the last vector lane
   // instead of the first. We adjust the index from the first vector lane,
   // rather than directly getting the pointer for lane VF - 1, because the
@@ -2032,19 +2011,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
     // For each unroll part, create a wide load for the group.
     SmallVector<Value *, 2> NewLoads;
     for (unsigned Part = 0; Part < UF; Part++) {
-      Instruction *NewLoad;
-      if (IsMaskRequired) {
-        auto *Undefs = UndefValue::get(Mask[Part]->getType());
-        auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
-        Value *ShuffledMask = Builder.CreateShuffleVector(
-            Mask[Part], Undefs, RepMask, "interleaved.mask");
-        NewLoad = Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(), 
-                                           ShuffledMask, UndefVec,
-                                           "wide.masked.vec");
-      }
-      else
-        NewLoad = Builder.CreateAlignedLoad(NewPtrs[Part], 
-          Group->getAlignment(), "wide.vec");
+      auto *NewLoad = Builder.CreateAlignedLoad(
+          NewPtrs[Part], Group->getAlignment(), "wide.vec");
       Group->addMetadata(NewLoad);
       NewLoads.push_back(NewLoad);
     }
@@ -2111,18 +2079,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
     Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
                                               "interleaved.vec");
 
-    Instruction *NewStoreInstr;
-    if (IsMaskRequired) {
-      auto *Undefs = UndefValue::get(Mask[Part]->getType());
-      auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
-      Value *ShuffledMask = Builder.CreateShuffleVector(
-          Mask[Part], Undefs, RepMask, "interleaved.mask");
-      NewStoreInstr = Builder.CreateMaskedStore(
-          IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask);
-    }
-    else
-      NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part], 
-        Group->getAlignment());
+    Instruction *NewStoreInstr =
+        Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment());
 
     Group->addMetadata(NewStoreInstr);
   }
@@ -4295,32 +4253,6 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigne
   return false;
 }
 
-static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
-  if (!(EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0))
-    return TTI.enableMaskedInterleavedAccessVectorization();
-
-  // If an override option has been passed in for interleaved accesses, use it.
-  return EnableMaskedInterleavedMemAccesses;
-}
-
-bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
-                                                               unsigned VF) {
-  assert(isAccessInterleaved(I) && "Expecting interleaved access.");
-  assert(getWideningDecision(I, VF) == CM_Unknown &&
-         "Decision should not be set yet.");
-
-  if (!Legal->blockNeedsPredication(I->getParent()) ||
-      !Legal->isMaskRequired(I))
-    return true;
-
-  if (!useMaskedInterleavedAccesses(TTI))
-    return false;
-
-  auto *Ty = getMemInstValueType(I);
-  return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty) 
-                          : TTI.isLegalMaskedStore(Ty);
-}
-
 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
                                                                unsigned VF) {
   // Get and ensure we have a valid memory instruction.
@@ -5439,17 +5371,13 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
   }
 
   // Calculate the cost of the whole interleaved group.
-  unsigned Cost = TTI.getInterleavedMemoryOpCost(
-      I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
-      Group->getAlignment(), AS, Legal->isMaskRequired(I));
-
-  if (Group->isReverse()) {
-    // TODO: Add support for reversed masked interleaved access.
-    assert(!Legal->isMaskRequired(I) && 
-           "Reverse masked interleaved access not supported.");
+  unsigned Cost = TTI.getInterleavedMemoryOpCost(I->getOpcode(), WideVecTy,
+                                                 Group->getFactor(), Indices,
+                                                 Group->getAlignment(), AS);
+
+  if (Group->isReverse())
     Cost += Group->getNumMembers() *
             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
-  }
   return Cost;
 }
 
@@ -5551,8 +5479,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
           continue;
 
         NumAccesses = Group->getNumMembers();
-        if (interleavedAccessCanBeWidened(&I, VF))
-          InterleaveCost = getInterleaveGroupCost(&I, VF);
+        InterleaveCost = getInterleaveGroupCost(&I, VF);
       }
 
       unsigned GatherScatterCost =
@@ -6225,8 +6152,7 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
 }
 
 VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I,
-                                                           VFRange &Range,
-                                                           VPlanPtr &Plan) {
+                                                           VFRange &Range) {
   const InterleaveGroup *IG = CM.getInterleavedAccessGroup(I);
   if (!IG)
     return nullptr;
@@ -6248,11 +6174,7 @@ VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I,
   assert(I == IG->getInsertPos() &&
          "Generating a recipe for an adjunct member of an interleave group");
 
-  VPValue *Mask = nullptr;
-  if (Legal->isMaskRequired(I))
-    Mask = createBlockInMask(I->getParent(), Plan);
-
-  return new VPInterleaveRecipe(IG, Mask);
+  return new VPInterleaveRecipe(IG);
 }
 
 VPWidenMemoryInstructionRecipe *
@@ -6520,7 +6442,7 @@ bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
   VPRecipeBase *Recipe = nullptr;
   // Check if Instr should belong to an interleave memory recipe, or already
   // does. In the latter case Instr is irrelevant.
-  if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) {
+  if ((Recipe = tryToInterleaveMemory(Instr, Range))) {
     VPBB->appendRecipe(Recipe);
     return true;
   }
@@ -6747,10 +6669,6 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
   O << " +\n"
     << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
   IG->getInsertPos()->printAsOperand(O, false);
-  if (User) {
-    O << ", ";
-    User->getOperand(0)->printAsOperand(O);
-  }
   O << "\\l\"";
   for (unsigned i = 0; i < IG->getFactor(); ++i)
     if (Instruction *I = IG->getMember(i))
@@ -6813,15 +6731,7 @@ void VPBlendRecipe::execute(VPTransformState &State) {
 
 void VPInterleaveRecipe::execute(VPTransformState &State) {
   assert(!State.Instance && "Interleave group being replicated.");
-  if (!User)
-    return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());
-
-  // Last (and currently only) operand is a mask.
-  InnerLoopVectorizer::VectorParts MaskValues(State.UF);
-  VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
-  for (unsigned Part = 0; Part < State.UF; ++Part)
-    MaskValues[Part] = State.get(Mask, Part);
-  State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues);
+  State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());
 }
 
 void VPReplicateRecipe::execute(VPTransformState &State) {
@@ -7120,7 +7030,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
 
   // Analyze interleaved memory accesses.
   if (UseInterleaved) {
-    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
+    IAI.analyzeInterleaving();
   }
 
   // Use the cost model.
diff --git a/lib/Transforms/Vectorize/VPRecipeBuilder.h b/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 15d38ac9c84..f43a8bb123b 100644
--- a/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -69,8 +69,7 @@ public:
   /// \return value is <true, nullptr>, as it is handled by another recipe.
   /// \p Range.End may be decreased to ensure same decision from \p Range.Start
   /// to \p Range.End.
-  VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range,
-                                            VPlanPtr &Plan);
+  VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range);
 
   /// Check if \I is a memory instruction to be widened for \p Range.Start and
   /// potentially masked. Such instructions are handled by a recipe that takes
diff --git a/lib/Transforms/Vectorize/VPlan.h b/lib/Transforms/Vectorize/VPlan.h
index 81b1986c97d..c3123b41600 100644
--- a/lib/Transforms/Vectorize/VPlan.h
+++ b/lib/Transforms/Vectorize/VPlan.h
@@ -769,14 +769,10 @@ public:
 class VPInterleaveRecipe : public VPRecipeBase {
 private:
   const InterleaveGroup *IG;
-  std::unique_ptr<VPUser> User;
 
 public:
-  VPInterleaveRecipe(const InterleaveGroup *IG, VPValue *Mask)
-      : VPRecipeBase(VPInterleaveSC), IG(IG) {
-    if (Mask) // Create a VPInstruction to register as a user of the mask.
-      User.reset(new VPUser({Mask}));
-  }
+  VPInterleaveRecipe(const InterleaveGroup *IG)
+      : VPRecipeBase(VPInterleaveSC), IG(IG) {}
   ~VPInterleaveRecipe() override = default;
 
   /// Method to support type inquiry through isa, cast, and dyn_cast.
diff --git a/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
deleted file mode 100644
index b1163d0a199..00000000000
--- a/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
+++ /dev/null
@@ -1,164 +0,0 @@
-; RUN: opt -mcpu=skx -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=DISABLED_MASKED_STRIDED 
-; RUN: opt -mcpu=skx -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses  -enable-masked-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=ENABLED_MASKED_STRIDED 
-
-target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
-target triple = "i386-unknown-linux-gnu"
-
-; When masked-interleaved-groups are disabled:
-; Check that the predicated load is not vectorized as an
-; interleaved-group but rather as a scalarized accesses.
-; (For SKX, Gather is not supported by the compiler for chars, therefore
-;  the only remaining alternative is to scalarize).
-; When  masked-interleave-group is enabled we expect to find the proper mask
-; shuffling code, feeding the wide masked load for an interleave-group (with
-; a single member).
-;
-; void masked_strided1(const unsigned char* restrict p,
-;                      unsigned char* restrict q,
-;                      unsigned char guard) {
-;   for(ix=0; ix < 1024; ++ix) {
-;     if (ix > guard) {
-;         char t = p[2*ix];
-;         q[ix] = t;
-;     }
-;   }
-; }
-
-;DISABLED_MASKED_STRIDED-LABEL: @masked_strided1(
-;DISABLED_MASKED_STRIDED: vector.body:
-;DISABLED_MASKED_STRIDED-NEXT:  %index = phi i32 
-;DISABLED_MASKED_STRIDED-NEXT:  %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-;DISABLED_MASKED_STRIDED-NOT:   %interleaved.mask =
-;DISABLED_MASKED_STRIDED-NOT:   call void @llvm.masked.load.
-;DISABLED_MASKED_STRIDED-NOT:   %{{.*}} = shufflevector <16 x i8> %[[WIDEVEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-;DISABLED_MASKED_STRIDED:       %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
-;DISABLED_MASKED_STRIDED-NEXT:  %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-;DISABLED_MASKED_STRIDED-NEXT:  %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0
-;DISABLED_MASKED_STRIDED-NEXT:  br i1 %[[M]], label %pred.load.if, label %pred.load.continue
-;DISABLED_MASKED_STRIDED-NOT:   %interleaved.mask =
-;DISABLED_MASKED_STRIDED-NOT:   call void @llvm.masked.load.
-;DISABLED_MASKED_STRIDED-NOT:   %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-
-;ENABLED_MASKED_STRIDED-LABEL: @masked_strided1(
-;ENABLED_MASKED_STRIDED: vector.body:
-;ENABLED_MASKED_STRIDED-NEXT:  %index = phi i32 
-;ENABLED_MASKED_STRIDED-NEXT:  %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-;ENABLED_MASKED_STRIDED:       %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
-;ENABLED_MASKED_STRIDED:       %interleaved.mask = shufflevector <8 x i1> %[[VMASK]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
-;ENABLED_MASKED_STRIDED-NEXT:  %[[WIDEMASKEDLOAD:.+]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask, <16 x i8> undef)
-;ENABLED_MASKED_STRIDED-NEXT:  %[[STRIDEDVEC:.+]] = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-
-define dso_local void @masked_strided1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr {
-entry:
-  %conv = zext i8 %guard to i32
-  br label %for.body
-
-for.body:
-  %ix.09 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
-  %cmp1 = icmp ugt i32 %ix.09, %conv
-  br i1 %cmp1, label %if.then, label %for.inc
-
-if.then:
-  %mul = shl nuw nsw i32 %ix.09, 1
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
-  %0 = load i8, i8* %arrayidx, align 1
-  %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.09
-  store i8 %0, i8* %arrayidx3, align 1
-  br label %for.inc
-
-for.inc:
-  %inc = add nuw nsw i32 %ix.09, 1
-  %exitcond = icmp eq i32 %inc, 1024
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
-  ret void
-}
-
-; Check also a scenario with full interleave-groups (no gaps) as well as both
-; load and store groups. We check that when masked-interleave-group is disabled
-; the predicated loads (and stores) are not vectorized as an
-; interleaved-group but rather as four separate scalarized accesses.
-; (For SKX, gather/scatter is not supported by the compiler for chars, therefore
-; the only remaining alternative is to scalarize).
-; When  masked-interleave-group is enabled we expect to find the proper mask
-; shuffling code, feeding the wide masked load/store for the two interleave-
-; groups.
-;
-; void masked_strided2(const unsigned char* restrict p,
-;                     unsigned char* restrict q,
-;                     unsigned char guard) {
-; for(ix=0; ix < 1024; ++ix) {
-;     if (ix > guard) {
-;         char left = p[2*ix];
-;         char right = p[2*ix + 1];
-;         char max = max(left, right);
-;         q[2*ix] = max;
-;         q[2*ix+1] = 0 - max;
-;     }
-; }
-;}
-
-;DISABLED_MASKED_STRIDED-LABEL: @masked_strided2(
-;DISABLED_MASKED_STRIDED: vector.body:
-;DISABLED_MASKED_STRIDED-NEXT:  %index = phi i32 
-;DISABLED_MASKED_STRIDED-NEXT:  %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-;DISABLED_MASKED_STRIDED-NOT:   %interleaved.mask =
-;DISABLED_MASKED_STRIDED-NOT:   call void @llvm.masked.load.
-;DISABLED_MASKED_STRIDED-NOT:   call void @llvm.masked.store.
-;DISABLED_MASKED_STRIDED-NOT:   %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-;DISABLED_MASKED_STRIDED:        %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
-;DISABLED_MASKED_STRIDED-NEXT:  %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-;DISABLED_MASKED_STRIDED-NEXT:  %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0
-;DISABLED_MASKED_STRIDED-NEXT:  br i1 %[[M]], label %pred.load.if, label %pred.load.continue
-;DISABLED_MASKED_STRIDED-NOT:   %interleaved.mask =
-;DISABLED_MASKED_STRIDED-NOT:   call void @llvm.masked.load.
-;DISABLED_MASKED_STRIDED-NOT:   call void @llvm.masked.store.
-;DISABLED_MASKED_STRIDED-NOT:   %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-
-;ENABLED_MASKED_STRIDED-LABEL: @masked_strided2(
-;ENABLED_MASKED_STRIDED: vector.body:
-;ENABLED_MASKED_STRIDED-NEXT:  %index = phi i32
-;ENABLED_MASKED_STRIDED-NEXT:  %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-;ENABLED_MASKED_STRIDED:       %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
-;ENABLED_MASKED_STRIDED:       %interleaved.mask = shufflevector <8 x i1> %[[VMASK]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
-;ENABLED_MASKED_STRIDED-NEXT:  %[[WIDEMASKEDLOAD:.+]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask, <16 x i8> undef)
-;ENABLED_MASKED_STRIDED-NEXT:  %{{.*}} = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-;ENABLED_MASKED_STRIDED-NEXT:  %{{.*}} = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-;ENABLED_MASKED_STRIDED:       call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %{{.*}}, <16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask)
-
-; Function Attrs: norecurse nounwind
-define dso_local void @masked_strided2(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr  {
-entry:
-  %conv = zext i8 %guard to i32
-  br label %for.body
-
-for.body:
-  %ix.024 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
-  %cmp1 = icmp ugt i32 %ix.024, %conv
-  br i1 %cmp1, label %if.then, label %for.inc
-
-if.then:
-  %mul = shl nuw nsw i32 %ix.024, 1
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
-  %0 = load i8, i8* %arrayidx, align 1
-  %add = or i32 %mul, 1
-  %arrayidx4 = getelementptr inbounds i8, i8* %p, i32 %add
-  %1 = load i8, i8* %arrayidx4, align 1
-  %cmp.i = icmp slt i8 %0, %1
-  %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0
-  %arrayidx6 = getelementptr inbounds i8, i8* %q, i32 %mul
-  store i8 %spec.select.i, i8* %arrayidx6, align 1
-  %sub = sub i8 0, %spec.select.i
-  %arrayidx11 = getelementptr inbounds i8, i8* %q, i32 %add
-  store i8 %sub, i8* %arrayidx11, align 1
-  br label %for.inc
-
-for.inc:
-  %inc = add nuw nsw i32 %ix.024, 1
-  %exitcond = icmp eq i32 %inc, 1024
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
-  ret void
-}
diff --git a/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll b/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll
deleted file mode 100644
index 9ed66a22dbf..00000000000
--- a/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll
+++ /dev/null
@@ -1,222 +0,0 @@
-; REQUIRES: asserts
-; RUN: opt -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -debug-only=loop-vectorize,vectorutils -disable-output < %s 2>&1 | FileCheck %s -check-prefix=STRIDED_UNMASKED
-; RUN: opt -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses -debug-only=loop-vectorize,vectorutils -disable-output < %s 2>&1 | FileCheck %s -check-prefix=STRIDED_MASKED
-
-target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
-
-; We test here that the loop-vectorizer forms an interleave-groups from 
-; predicated memory accesses only if they are both in the same (predicated)
-; block (first scenario below).
-; If the accesses are not in the same predicated block, an interleave-group
-; is not formed (scenarios 2,3 below).
-
-; Scenario 1: Check the case where it is legal to create masked interleave-
-; groups. Altogether two groups are created (one for loads and one for stores)
-; when masked-interleaved-acceses are enabled. When masked-interleaved-acceses
-; are disabled we do not create any interleave-group.
-;
-; void masked_strided1(const unsigned char* restrict p,
-;                     unsigned char* restrict q,
-;                     unsigned char guard) {
-; for(ix=0; ix < 1024; ++ix) {
-;     if (ix > guard) {
-;         char left = p[2*ix];
-;         char right = p[2*ix + 1];
-;         char max = max(left, right);
-;         q[2*ix] = max;
-;         q[2*ix+1] = 0 - max;
-;     }
-; }
-;}
-
-
-; STRIDED_UNMASKED: LV: Checking a loop in "masked_strided1" 
-; STRIDED_UNMASKED: LV: Analyzing interleaved accesses...
-; STRIDED_UNMASKED-NOT: LV: Creating an interleave group 
-
-; STRIDED_MASKED: LV: Checking a loop in "masked_strided1" 
-; STRIDED_MASKED: LV: Analyzing interleaved accesses...
-; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:  store i8 %{{.*}}, i8* %{{.*}}, align 1
-; STRIDED_MASKED-NEXT: LV: Inserted:  store i8  %{{.*}}, i8* %{{.*}}, align 1
-; STRIDED_MASKED-NEXT:     into the interleave group with  store i8 %{{.*}}, i8* %{{.*}}, align 1
-; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:   %{{.*}} = load i8, i8* %{{.*}}, align 1
-; STRIDED_MASKED-NEXT: LV: Inserted:  %{{.*}} = load i8, i8* %{{.*}}, align 1
-; STRIDED_MASKED-NEXT:     into the interleave group with   %{{.*}} = load i8, i8* %{{.*}}, align 1
-
-; Scenario 2: Check the case where it is illegal to create a masked interleave-
-; group because the first access is predicated, and the second isn't.
-; We therefore create a separate interleave-group with gaps for each of the
-; stores (if masked-interleaved-accesses are enabled) and these are later
-; invalidated because interleave-groups of stores with gaps are not supported. 
-; If masked-interleaved-accesses is not enabled we create only one interleave
-; group of stores (for the non-predicated store) and it is later invalidated
-; due to gaps.
-;
-; void masked_strided2(const unsigned char* restrict p,
-;                     unsigned char* restrict q,
-;                     unsigned char guard1,
-;                     unsigned char guard2) {
-; for(ix=0; ix < 1024; ++ix) {
-;     if (ix > guard1) {
-;         q[2*ix] = 1;
-;     }
-;     q[2*ix+1] = 2;
-; }
-;}
-
-; STRIDED_UNMASKED: LV: Checking a loop in "masked_strided2" 
-; STRIDED_UNMASKED: LV: Analyzing interleaved accesses...
-; STRIDED_UNMASKED-NEXT: LV: Creating an interleave group with:  store i8 1, i8* %{{.*}}, align 1
-; STRIDED_UNMASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
-; STRIDED_UNMASKED-NOT: LV: Creating an interleave group 
-
-; STRIDED_MASKED: LV: Checking a loop in "masked_strided2" 
-; STRIDED_MASKED: LV: Analyzing interleaved accesses...
-; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:  store i8 2, i8* %{{.*}}, align 1
-; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:  store i8 1, i8* %{{.*}}, align 1
-; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
-; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
-
-
-; Scenario 3: Check the case where it is illegal to create a masked interleave-
-; group because the two accesses are in separate predicated blocks.
-; We therefore create a separate interleave-group with gaps for each of the accesses,
-; (which are later invalidated because interleave-groups of stores with gaps are 
-; not supported).
-; If masked-interleaved-accesses is not enabled we don't create any interleave
-; group because all accesses are predicated.
-;
-; void masked_strided3(const unsigned char* restrict p,
-;                     unsigned char* restrict q,
-;                     unsigned char guard1,
-;                     unsigned char guard2) {
-; for(ix=0; ix < 1024; ++ix) {
-;     if (ix > guard1) {
-;         q[2*ix] = 1;
-;     }
-;     if (ix > guard2) {
-;         q[2*ix+1] = 2;
-;     }
-; }
-;}
-
-
-; STRIDED_UNMASKED: LV: Checking a loop in "masked_strided3" 
-; STRIDED_UNMASKED: LV: Analyzing interleaved accesses...
-; STRIDED_UNMASKED-NOT: LV: Creating an interleave group 
-
-; STRIDED_MASKED: LV: Checking a loop in "masked_strided3" 
-; STRIDED_MASKED: LV: Analyzing interleaved accesses...
-; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:  store i8 2, i8* %{{.*}}, align 1
-; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:  store i8 1, i8* %{{.*}}, align 1
-; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
-; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
-
-
-; ModuleID = 'test.c'
-source_filename = "test.c"
-target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
-target triple = "i386-unknown-linux-gnu"
-
-define dso_local void @masked_strided1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr #0 {
-entry:
-  %conv = zext i8 %guard to i32
-  br label %for.body
-
-for.body:
-  %ix.024 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
-  %cmp1 = icmp ugt i32 %ix.024, %conv
-  br i1 %cmp1, label %if.then, label %for.inc
-
-if.then:
-  %mul = shl nuw nsw i32 %ix.024, 1
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
-  %0 = load i8, i8* %arrayidx, align 1
-  %add = or i32 %mul, 1
-  %arrayidx4 = getelementptr inbounds i8, i8* %p, i32 %add
-  %1 = load i8, i8* %arrayidx4, align 1
-  %cmp.i = icmp slt i8 %0, %1
-  %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0
-  %arrayidx6 = getelementptr inbounds i8, i8* %q, i32 %mul
-  store i8 %spec.select.i, i8* %arrayidx6, align 1
-  %sub = sub i8 0, %spec.select.i
-  %arrayidx11 = getelementptr inbounds i8, i8* %q, i32 %add
-  store i8 %sub, i8* %arrayidx11, align 1
-  br label %for.inc
-
-for.inc:
-  %inc = add nuw nsw i32 %ix.024, 1
-  %exitcond = icmp eq i32 %inc, 1024
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
-  ret void
-}
-
-
-define dso_local void @masked_strided2(i8* noalias nocapture readnone %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr #0 {
-entry:
-  %conv = zext i8 %guard to i32
-  br label %for.body
-
-for.body:
-  %ix.012 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
-  %mul = shl nuw nsw i32 %ix.012, 1
-  %arrayidx = getelementptr inbounds i8, i8* %q, i32 %mul
-  store i8 1, i8* %arrayidx, align 1
-  %cmp1 = icmp ugt i32 %ix.012, %conv
-  br i1 %cmp1, label %if.then, label %for.inc
-
-if.then:
-  %add = or i32 %mul, 1
-  %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %add
-  store i8 2, i8* %arrayidx3, align 1
-  br label %for.inc
-
-for.inc:
-  %inc = add nuw nsw i32 %ix.012, 1
-  %exitcond = icmp eq i32 %inc, 1024
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
-  ret void
-}
-
-
-define dso_local void @masked_strided3(i8* noalias nocapture readnone %p, i8* noalias nocapture %q, i8 zeroext %guard1, i8 zeroext %guard2) local_unnamed_addr #0 {
-entry:
-  %conv = zext i8 %guard1 to i32
-  %conv3 = zext i8 %guard2 to i32
-  br label %for.body
-
-for.body:
-  %ix.018 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
-  %mul = shl nuw nsw i32 %ix.018, 1
-  %cmp1 = icmp ugt i32 %ix.018, %conv
-  br i1 %cmp1, label %if.then, label %if.end
-
-if.then:
-  %arrayidx = getelementptr inbounds i8, i8* %q, i32 %mul
-  store i8 1, i8* %arrayidx, align 1
-  br label %if.end
-
-if.end:
-  %cmp4 = icmp ugt i32 %ix.018, %conv3
-  br i1 %cmp4, label %if.then6, label %for.inc
-
-if.then6:
-  %add = or i32 %mul, 1
-  %arrayidx7 = getelementptr inbounds i8, i8* %q, i32 %add
-  store i8 2, i8* %arrayidx7, align 1
-  br label %for.inc
-
-for.inc:
-  %inc = add nuw nsw i32 %ix.018, 1
-  %exitcond = icmp eq i32 %inc, 1024
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
-  ret void
-}
-
-attributes #0 = {  "target-features"="+fxsr,+mmx,+sse,+sse2,+x87"  }
diff --git a/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll b/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll
index c647f586b18..89c0ac10916 100644
--- a/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll
+++ b/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll
@@ -1,5 +1,4 @@
 ; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=2 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s
-; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=2 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 %pair = type { i64, i64 }
-- 
GitLab


From 7d7250490bddc598c7811fe7f30a9ffb8bb5acfe Mon Sep 17 00:00:00 2001
From: Dorit Nuzman <dorit.nuzman@intel.com>
Date: Sun, 14 Oct 2018 08:50:06 +0000
Subject: [PATCH 0166/1116] recommit 344472 after fixing build failure on ARM
 and PPC.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344475 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/TargetTransformInfo.h   |  19 +-
 .../llvm/Analysis/TargetTransformInfoImpl.h   |   6 +-
 include/llvm/Analysis/VectorUtils.h           |  21 +-
 include/llvm/CodeGen/BasicTTIImpl.h           |  38 ++-
 lib/Analysis/TargetTransformInfo.cpp          |  10 +-
 lib/Analysis/VectorUtils.cpp                  |  29 ++-
 .../AArch64/AArch64TargetTransformInfo.cpp    |   7 +-
 .../AArch64/AArch64TargetTransformInfo.h      |   2 +-
 lib/Target/ARM/ARMTargetTransformInfo.cpp     |   8 +-
 lib/Target/ARM/ARMTargetTransformInfo.h       |   2 +-
 .../Hexagon/HexagonTargetTransformInfo.cpp    |   6 +-
 .../Hexagon/HexagonTargetTransformInfo.h      |   2 +-
 lib/Target/PowerPC/PPCTargetTransformInfo.cpp |   7 +-
 lib/Target/PowerPC/PPCTargetTransformInfo.h   |   3 +-
 .../SystemZ/SystemZTargetTransformInfo.cpp    |   6 +-
 .../SystemZ/SystemZTargetTransformInfo.h      |   2 +-
 lib/Target/X86/X86TargetTransformInfo.cpp     |  23 +-
 lib/Target/X86/X86TargetTransformInfo.h       |   9 +-
 lib/Transforms/Vectorize/LoopVectorize.cpp    | 126 ++++++++--
 lib/Transforms/Vectorize/VPRecipeBuilder.h    |   3 +-
 lib/Transforms/Vectorize/VPlan.h              |   8 +-
 .../x86-interleaved-accesses-masked-group.ll  | 164 +++++++++++++
 .../interleaved-accesses-masked-group.ll      | 222 ++++++++++++++++++
 .../interleaved-accesses-pred-stores.ll       |   1 +
 24 files changed, 654 insertions(+), 70 deletions(-)
 create mode 100644 test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
 create mode 100644 test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll

diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h
index 18b5a5cf0e5..c2a9d1ec195 100644
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h
@@ -587,6 +587,10 @@ public:
   /// Enable matching of interleaved access groups.
   bool enableInterleavedAccessVectorization() const;
 
+  /// Enable matching of interleaved access groups that contain predicated 
+  /// accesses and are vectorized using masked vector loads/stores.
+  bool enableMaskedInterleavedAccessVectorization() const;
+
   /// Indicate that it is potentially unsafe to automatically vectorize
   /// floating-point operations because the semantics of vector and scalar
   /// floating-point semantics may differ. For example, ARM NEON v7 SIMD math
@@ -821,9 +825,11 @@ public:
   ///    load allows gaps)
   /// \p Alignment is the alignment of the memory operation
   /// \p AddressSpace is address space of the pointer.
+  /// \p IsMasked indicates if the memory access is predicated.
   int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
                                  ArrayRef<unsigned> Indices, unsigned Alignment,
-                                 unsigned AddressSpace) const;
+                                 unsigned AddressSpace, 
+                                 bool IsMasked = false) const;
 
   /// Calculate the cost of performing a vector reduction.
   ///
@@ -1072,6 +1078,7 @@ public:
   virtual const MemCmpExpansionOptions *enableMemCmpExpansion(
       bool IsZeroCmp) const = 0;
   virtual bool enableInterleavedAccessVectorization() = 0;
+  virtual bool enableMaskedInterleavedAccessVectorization() = 0;
   virtual bool isFPVectorizationPotentiallyUnsafe() = 0;
   virtual bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
                                               unsigned BitWidth,
@@ -1132,7 +1139,8 @@ public:
                                          unsigned Factor,
                                          ArrayRef<unsigned> Indices,
                                          unsigned Alignment,
-                                         unsigned AddressSpace) = 0;
+                                         unsigned AddressSpace,
+                                         bool IsMasked = false) = 0;
   virtual int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
                                          bool IsPairwiseForm) = 0;
   virtual int getMinMaxReductionCost(Type *Ty, Type *CondTy,
@@ -1346,6 +1354,9 @@ public:
   bool enableInterleavedAccessVectorization() override {
     return Impl.enableInterleavedAccessVectorization();
   }
+  bool enableMaskedInterleavedAccessVectorization() override {
+    return Impl.enableMaskedInterleavedAccessVectorization();
+  }
   bool isFPVectorizationPotentiallyUnsafe() override {
     return Impl.isFPVectorizationPotentiallyUnsafe();
   }
@@ -1471,9 +1482,9 @@ public:
   }
   int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
                                  ArrayRef<unsigned> Indices, unsigned Alignment,
-                                 unsigned AddressSpace) override {
+                                 unsigned AddressSpace, bool IsMasked) override {
     return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                           Alignment, AddressSpace);
+                                           Alignment, AddressSpace, IsMasked);
   }
   int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
                                  bool IsPairwiseForm) override {
diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h
index e39fe66c0a4..c64d4d36805 100644
--- a/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -313,6 +313,8 @@ public:
 
   bool enableInterleavedAccessVectorization() { return false; }
 
+  bool enableMaskedInterleavedAccessVectorization() { return false; }
+
   bool isFPVectorizationPotentiallyUnsafe() { return false; }
 
   bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
@@ -450,8 +452,8 @@ public:
   unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                       unsigned Factor,
                                       ArrayRef<unsigned> Indices,
-                                      unsigned Alignment,
-                                      unsigned AddressSpace) {
+                                      unsigned Alignment, unsigned AddressSpace,
+                                      bool IsMasked = false) {
     return 1;
   }
 
diff --git a/include/llvm/Analysis/VectorUtils.h b/include/llvm/Analysis/VectorUtils.h
index 622d932f74f..2ac49f67662 100644
--- a/include/llvm/Analysis/VectorUtils.h
+++ b/include/llvm/Analysis/VectorUtils.h
@@ -125,6 +125,21 @@ computeMinimumValueSizes(ArrayRef<BasicBlock*> Blocks,
 /// This function always sets a (possibly null) value for each K in Kinds.
 Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL);
 
+/// Create a mask with replicated elements.
+///
+/// This function creates a shuffle mask for replicating each of the \p VF 
+/// elements in a vector \p ReplicationFactor times. It can be used to
+/// transform a mask of \p VF elements into a mask of
+/// \p VF * \p ReplicationFactor elements used by a predicated
+/// interleaved-group of loads/stores whose Interleaved-factor ==
+/// \p ReplicationFactor.
+///
+/// For example, the mask for \p ReplicationFactor=3 and \p VF=4 is:
+///
+///   <0,0,0,1,1,1,2,2,2,3,3,3>
+Constant *createReplicatedMask(IRBuilder<> &Builder, unsigned ReplicationFactor,
+                               unsigned VF);
+
 /// Create an interleave shuffle mask.
 ///
 /// This function creates a shuffle mask for interleaving \p NumVecs vectors of
@@ -328,7 +343,7 @@ public:
   InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
                         DominatorTree *DT, LoopInfo *LI,
                         const LoopAccessInfo *LAI)
-    : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(LAI) {}
+      : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(LAI) {}
 
   ~InterleavedAccessInfo() {
     SmallPtrSet<InterleaveGroup *, 4> DelSet;
@@ -341,7 +356,9 @@ public:
 
   /// Analyze the interleaved accesses and collect them in interleave
   /// groups. Substitute symbolic strides using \p Strides.
-  void analyzeInterleaving();
+  /// Consider also predicated loads/stores in the analysis if
+  /// \p EnableMaskedInterleavedGroup is true.
+  void analyzeInterleaving(bool EnableMaskedInterleavedGroup);
 
   /// Check if \p Instr belongs to any interleave group.
   bool isInterleaved(Instruction *Instr) const {
diff --git a/include/llvm/CodeGen/BasicTTIImpl.h b/include/llvm/CodeGen/BasicTTIImpl.h
index b460cdc0ba1..e740fe57172 100644
--- a/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/include/llvm/CodeGen/BasicTTIImpl.h
@@ -783,8 +783,8 @@ public:
   unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                       unsigned Factor,
                                       ArrayRef<unsigned> Indices,
-                                      unsigned Alignment,
-                                      unsigned AddressSpace) {
+                                      unsigned Alignment, unsigned AddressSpace,
+                                      bool IsMasked = false) {
     VectorType *VT = dyn_cast<VectorType>(VecTy);
     assert(VT && "Expect a vector type for interleaved memory op");
 
@@ -795,8 +795,13 @@ public:
     VectorType *SubVT = VectorType::get(VT->getElementType(), NumSubElts);
 
     // Firstly, the cost of load/store operation.
-    unsigned Cost = static_cast<T *>(this)->getMemoryOpCost(
-        Opcode, VecTy, Alignment, AddressSpace);
+    unsigned Cost;
+    if (IsMasked)
+      Cost = static_cast<T *>(this)->getMaskedMemoryOpCost(
+          Opcode, VecTy, Alignment, AddressSpace);
+    else
+      Cost = static_cast<T *>(this)->getMemoryOpCost(Opcode, VecTy, Alignment,
+                                                     AddressSpace);
 
     // Legalize the vector type, and get the legalized and unlegalized type
     // sizes.
@@ -892,6 +897,31 @@ public:
                     ->getVectorInstrCost(Instruction::InsertElement, VT, i);
     }
 
+    if (!IsMasked)
+      return Cost;
+
+    Type *I8Type = Type::getInt8Ty(VT->getContext());
+    VectorType *MaskVT = VectorType::get(I8Type, NumElts);
+    SubVT = VectorType::get(I8Type, NumSubElts);
+
+    // The Mask shuffling cost is extract all the elements of the Mask
+    // and insert each of them Factor times into the wide vector:
+    //
+    // E.g. an interleaved group with factor 3:
+    //    %mask = icmp ult <8 x i32> %vec1, %vec2
+    //    %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
+    //        <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
+    // The cost is estimated as extract all mask elements from the <8xi1> mask
+    // vector and insert them factor times into the <24xi1> shuffled mask
+    // vector.
+    for (unsigned i = 0; i < NumSubElts; i++)
+      Cost += static_cast<T *>(this)->getVectorInstrCost(
+          Instruction::ExtractElement, SubVT, i);
+
+    for (unsigned i = 0; i < NumElts; i++)
+      Cost += static_cast<T *>(this)->getVectorInstrCost(
+          Instruction::InsertElement, MaskVT, i);
+
     return Cost;
   }
 
diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp
index 4ad48e351a4..867403d0ef1 100644
--- a/lib/Analysis/TargetTransformInfo.cpp
+++ b/lib/Analysis/TargetTransformInfo.cpp
@@ -268,6 +268,10 @@ bool TargetTransformInfo::enableInterleavedAccessVectorization() const {
   return TTIImpl->enableInterleavedAccessVectorization();
 }
 
+bool TargetTransformInfo::enableMaskedInterleavedAccessVectorization() const {
+  return TTIImpl->enableMaskedInterleavedAccessVectorization();
+}
+
 bool TargetTransformInfo::isFPVectorizationPotentiallyUnsafe() const {
   return TTIImpl->isFPVectorizationPotentiallyUnsafe();
 }
@@ -515,9 +519,9 @@ int TargetTransformInfo::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
 
 int TargetTransformInfo::getInterleavedMemoryOpCost(
     unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
-    unsigned Alignment, unsigned AddressSpace) const {
-  int Cost = TTIImpl->getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                                 Alignment, AddressSpace);
+    unsigned Alignment, unsigned AddressSpace, bool IsMasked) const {
+  int Cost = TTIImpl->getInterleavedMemoryOpCost(
+      Opcode, VecTy, Factor, Indices, Alignment, AddressSpace, IsMasked);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
diff --git a/lib/Analysis/VectorUtils.cpp b/lib/Analysis/VectorUtils.cpp
index 272c665ace1..e14449b8838 100644
--- a/lib/Analysis/VectorUtils.cpp
+++ b/lib/Analysis/VectorUtils.cpp
@@ -502,6 +502,16 @@ Instruction *llvm::propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
   return Inst;
 }
 
+Constant *llvm::createReplicatedMask(IRBuilder<> &Builder, 
+                                     unsigned ReplicationFactor, unsigned VF) {
+  SmallVector<Constant *, 16> MaskVec;
+  for (unsigned i = 0; i < VF; i++)
+    for (unsigned j = 0; j < ReplicationFactor; j++)
+      MaskVec.push_back(Builder.getInt32(i));
+
+  return ConstantVector::get(MaskVec);
+}
+
 Constant *llvm::createInterleaveMask(IRBuilder<> &Builder, unsigned VF,
                                      unsigned NumVecs) {
   SmallVector<Constant *, 16> Mask;
@@ -672,7 +682,8 @@ void InterleavedAccessInfo::collectConstStrideAccesses(
 // this group because it and (2) are dependent. However, (1) can be grouped
 // with other accesses that may precede it in program order. Note that a
 // bottom-up order does not imply that WAW dependences should not be checked.
-void InterleavedAccessInfo::analyzeInterleaving() {
+void InterleavedAccessInfo::analyzeInterleaving(
+                                 bool EnablePredicatedInterleavedMemAccesses) {
   LLVM_DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n");
   const ValueToValueMap &Strides = LAI->getSymbolicStrides();
 
@@ -712,9 +723,8 @@ void InterleavedAccessInfo::analyzeInterleaving() {
     // create a group for B, we continue with the bottom-up algorithm to ensure
     // we don't break any of B's dependences.
     InterleaveGroup *Group = nullptr;
-    // TODO: Ignore B if it is in a predicated block. This restriction can be 
-    // relaxed in the future once we handle masked interleaved groups.
-    if (isStrided(DesB.Stride) && !isPredicated(B->getParent())) {
+    if (isStrided(DesB.Stride) && 
+        (!isPredicated(B->getParent()) || EnablePredicatedInterleavedMemAccesses)) {
       Group = getInterleaveGroup(B);
       if (!Group) {
         LLVM_DEBUG(dbgs() << "LV: Creating an interleave group with:" << *B
@@ -808,11 +818,12 @@ void InterleavedAccessInfo::analyzeInterleaving() {
       if (DistanceToB % static_cast<int64_t>(DesB.Size))
         continue;
 
-      // Ignore A if either A or B is in a predicated block. Although we
-      // currently prevent group formation for predicated accesses, we may be
-      // able to relax this limitation in the future once we handle more
-      // complicated blocks.
-      if (isPredicated(A->getParent()) || isPredicated(B->getParent()))
+      // All members of a predicated interleave-group must have the same predicate,
+      // and currently must reside in the same BB.
+      BasicBlock *BlockA = A->getParent();  
+      BasicBlock *BlockB = B->getParent();  
+      if ((isPredicated(BlockA) || isPredicated(BlockB)) &&
+          (!EnablePredicatedInterleavedMemAccesses || BlockA != BlockB))
         continue;
 
       // The index of A is the index of B plus A's distance to B in multiples
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 96e751e8697..a16de89cf10 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -659,11 +659,12 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                                unsigned Factor,
                                                ArrayRef<unsigned> Indices,
                                                unsigned Alignment,
-                                               unsigned AddressSpace) {
+                                               unsigned AddressSpace,
+                                               bool IsMasked) {
   assert(Factor >= 2 && "Invalid interleave factor");
   assert(isa<VectorType>(VecTy) && "Expect a vector type");
 
-  if (Factor <= TLI->getMaxSupportedInterleaveFactor()) {
+  if (!IsMasked && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
     unsigned NumElts = VecTy->getVectorNumElements();
     auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
 
@@ -676,7 +677,7 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
   }
 
   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                           Alignment, AddressSpace);
+                                           Alignment, AddressSpace, IsMasked);
 }
 
 int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h
index c056a7d2428..b3893d32850 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -146,7 +146,7 @@ public:
 
   int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
                                  ArrayRef<unsigned> Indices, unsigned Alignment,
-                                 unsigned AddressSpace);
+                                 unsigned AddressSpace, bool IsMasked = false);
 
   bool
   shouldConsiderAddressTypePromotion(const Instruction &I,
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 1b0d162f726..90e0cd96682 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -542,14 +542,16 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                            unsigned Factor,
                                            ArrayRef<unsigned> Indices,
                                            unsigned Alignment,
-                                           unsigned AddressSpace) {
+                                           unsigned AddressSpace,
+                                           bool IsMasked) {
   assert(Factor >= 2 && "Invalid interleave factor");
   assert(isa<VectorType>(VecTy) && "Expect a vector type");
 
   // vldN/vstN doesn't support vector types of i64/f64 element.
   bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
 
-  if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits) {
+  if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
+      !IsMasked) {
     unsigned NumElts = VecTy->getVectorNumElements();
     auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
 
@@ -562,7 +564,7 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
   }
 
   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                           Alignment, AddressSpace);
+                                           Alignment, AddressSpace, IsMasked);
 }
 
 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.h b/lib/Target/ARM/ARMTargetTransformInfo.h
index 7d14bd7c256..84e3055c6bc 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -169,7 +169,7 @@ public:
 
   int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
                                  ArrayRef<unsigned> Indices, unsigned Alignment,
-                                 unsigned AddressSpace);
+                                 unsigned AddressSpace, bool IsMasked);
 
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP);
diff --git a/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
index 4d0e7dc52e8..79b269bccfe 100644
--- a/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -206,10 +206,10 @@ unsigned HexagonTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
 
 unsigned HexagonTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode,
       Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
-      unsigned Alignment, unsigned AddressSpace) {
-  if (Indices.size() != Factor)
+      unsigned Alignment, unsigned AddressSpace, bool IsMasked) {
+  if (Indices.size() != Factor || IsMasked)
     return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                             Alignment, AddressSpace);
+                                             Alignment, AddressSpace, IsMasked);
   return getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, nullptr);
 }
 
diff --git a/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/lib/Target/Hexagon/HexagonTargetTransformInfo.h
index 2c03cd268ff..901a91692e8 100644
--- a/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -123,7 +123,7 @@ public:
             bool VariableMask, unsigned Alignment);
   unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
             unsigned Factor, ArrayRef<unsigned> Indices, unsigned Alignment,
-            unsigned AddressSpace);
+            unsigned AddressSpace, bool IsMasked);
   unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
             const Instruction *I);
   unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index b0da9b5a6d7..f67bacc87ec 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -473,7 +473,12 @@ int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                            unsigned Factor,
                                            ArrayRef<unsigned> Indices,
                                            unsigned Alignment,
-                                           unsigned AddressSpace) {
+                                           unsigned AddressSpace,
+                                           bool IsMasked) {
+  if (IsMasked)
+    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                             Alignment, AddressSpace, IsMasked);
+
   assert(isa<VectorType>(VecTy) &&
          "Expect a vector type for interleaved memory op");
 
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.h b/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 2ee2b3eb808..252d46e7a2a 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -90,7 +90,8 @@ public:
                                  unsigned Factor,
                                  ArrayRef<unsigned> Indices,
                                  unsigned Alignment,
-                                 unsigned AddressSpace);
+                                 unsigned AddressSpace,
+                                 bool IsMasked = false);
 
   /// @}
 };
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 6f553d5bed3..1eaeb9699bf 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -909,7 +909,11 @@ int SystemZTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                                unsigned Factor,
                                                ArrayRef<unsigned> Indices,
                                                unsigned Alignment,
-                                               unsigned AddressSpace) {
+                                               unsigned AddressSpace,
+                                               bool IsMasked) {
+  if (IsMasked)
+    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                             Alignment, AddressSpace, IsMasked);
   assert(isa<VectorType>(VecTy) &&
          "Expect a vector type for interleaved memory op");
 
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index bfa942357c5..92b2b9bdcb8 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -92,7 +92,7 @@ public:
                                  unsigned Factor,
                                  ArrayRef<unsigned> Indices,
                                  unsigned Alignment,
-                                 unsigned AddressSpace);
+                                 unsigned AddressSpace, bool IsMasked = false);
   /// @}
 };
 
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index d3a75123935..82e4dfe25b7 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2723,7 +2723,12 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
                                                unsigned Factor,
                                                ArrayRef<unsigned> Indices,
                                                unsigned Alignment,
-                                               unsigned AddressSpace) {
+                                               unsigned AddressSpace,
+                                               bool IsMasked) {
+
+  if (IsMasked)
+    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                             Alignment, AddressSpace, IsMasked);
 
   // We currently Support only fully-interleaved groups, with no gaps.
   // TODO: Support also strided loads (interleaved-groups with gaps).
@@ -2832,7 +2837,12 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
                                                  unsigned Factor,
                                                  ArrayRef<unsigned> Indices,
                                                  unsigned Alignment,
-                                                 unsigned AddressSpace) {
+                                                 unsigned AddressSpace,
+                                                 bool IsMasked) {
+
+  if (IsMasked)
+    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                             Alignment, AddressSpace, IsMasked);
 
   // VecTy for interleave memop is <VF*Factor x Elt>.
   // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
@@ -2950,7 +2960,8 @@ int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                            unsigned Factor,
                                            ArrayRef<unsigned> Indices,
                                            unsigned Alignment,
-                                           unsigned AddressSpace) {
+                                           unsigned AddressSpace,
+                                           bool IsMasked) {
   auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) {
     Type *EltTy = VecTy->getVectorElementType();
     if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
@@ -2962,11 +2973,11 @@ int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
   };
   if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
     return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
-                                            Alignment, AddressSpace);
+                                            Alignment, AddressSpace, IsMasked);
   if (ST->hasAVX2())
     return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices,
-                                          Alignment, AddressSpace);
+                                          Alignment, AddressSpace, IsMasked);
 
   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                           Alignment, AddressSpace);
+                                           Alignment, AddressSpace, IsMasked);
 }
diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h
index 3df89903882..2bd778a4211 100644
--- a/lib/Target/X86/X86TargetTransformInfo.h
+++ b/lib/Target/X86/X86TargetTransformInfo.h
@@ -101,13 +101,16 @@ public:
 
   int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                  unsigned Factor, ArrayRef<unsigned> Indices,
-                                 unsigned Alignment, unsigned AddressSpace);
+                                 unsigned Alignment, unsigned AddressSpace,
+                                 bool IsMasked = false);
   int getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
                                  unsigned Factor, ArrayRef<unsigned> Indices,
-                                 unsigned Alignment, unsigned AddressSpace);
+                                 unsigned Alignment, unsigned AddressSpace,
+                                 bool IsMasked = false);
   int getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
                                  unsigned Factor, ArrayRef<unsigned> Indices,
-                                 unsigned Alignment, unsigned AddressSpace);
+                                 unsigned Alignment, unsigned AddressSpace,
+                                 bool IsMasked = false);
 
   int getIntImmCost(int64_t);
 
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 7ebe8d102b7..e93cfb34156 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -172,6 +172,10 @@ static cl::opt<bool> EnableInterleavedMemAccesses(
     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
 
+static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
+    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
+    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
+
 /// We don't interleave loops with a known constant trip count below this
 /// number.
 static const unsigned TinyTripCountInterleaveThreshold = 128;
@@ -408,8 +412,10 @@ public:
   /// Construct the vector value of a scalarized value \p V one lane at a time.
   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
 
-  /// Try to vectorize the interleaved access group that \p Instr belongs to.
-  void vectorizeInterleaveGroup(Instruction *Instr);
+  /// Try to vectorize the interleaved access group that \p Instr belongs to,
+  /// optionally masking the vector operations if \p BlockInMask is non-null.
+  void vectorizeInterleaveGroup(Instruction *Instr,
+                                VectorParts *BlockInMask = nullptr);
 
   /// Vectorize Load and Store instructions, optionally masking the vector
   /// operations if \p BlockInMask is non-null.
@@ -1112,6 +1118,11 @@ public:
   /// access that can be widened.
   bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
 
+  /// Returns true if \p I is a memory instruction in an interleaved-group
+  /// of memory accesses that can be vectorized with wide vector loads/stores
+  /// and shuffles.
+  bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
+
   /// Check if \p Instr belongs to any interleaved access group.
   bool isAccessInterleaved(Instruction *Instr) {
     return InterleaveInfo.isInterleaved(Instr);
@@ -1946,7 +1957,8 @@ Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
-void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
+void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
+                                                   VectorParts *BlockInMask) {
   const InterleaveGroup *Group = Cost->getInterleavedAccessGroup(Instr);
   assert(Group && "Fail to get an interleaved access group.");
 
@@ -1968,6 +1980,15 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
   SmallVector<Value *, 2> NewPtrs;
   unsigned Index = Group->getIndex(Instr);
 
+  VectorParts Mask;
+  bool IsMaskRequired = BlockInMask;
+  if (IsMaskRequired) {
+    Mask = *BlockInMask;
+    // TODO: extend the masked interleaved-group support to reversed access.
+    assert(!Group->isReverse() && "Reversed masked interleave-group "
+                                  "not supported."); 
+  }
+
   // If the group is reverse, adjust the index to refer to the last vector lane
   // instead of the first. We adjust the index from the first vector lane,
   // rather than directly getting the pointer for lane VF - 1, because the
@@ -2011,8 +2032,19 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
     // For each unroll part, create a wide load for the group.
     SmallVector<Value *, 2> NewLoads;
     for (unsigned Part = 0; Part < UF; Part++) {
-      auto *NewLoad = Builder.CreateAlignedLoad(
-          NewPtrs[Part], Group->getAlignment(), "wide.vec");
+      Instruction *NewLoad;
+      if (IsMaskRequired) {
+        auto *Undefs = UndefValue::get(Mask[Part]->getType());
+        auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
+        Value *ShuffledMask = Builder.CreateShuffleVector(
+            Mask[Part], Undefs, RepMask, "interleaved.mask");
+        NewLoad = Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(), 
+                                           ShuffledMask, UndefVec,
+                                           "wide.masked.vec");
+      }
+      else
+        NewLoad = Builder.CreateAlignedLoad(NewPtrs[Part], 
+          Group->getAlignment(), "wide.vec");
       Group->addMetadata(NewLoad);
       NewLoads.push_back(NewLoad);
     }
@@ -2079,8 +2111,18 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
     Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
                                               "interleaved.vec");
 
-    Instruction *NewStoreInstr =
-        Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment());
+    Instruction *NewStoreInstr;
+    if (IsMaskRequired) {
+      auto *Undefs = UndefValue::get(Mask[Part]->getType());
+      auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
+      Value *ShuffledMask = Builder.CreateShuffleVector(
+          Mask[Part], Undefs, RepMask, "interleaved.mask");
+      NewStoreInstr = Builder.CreateMaskedStore(
+          IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask);
+    }
+    else
+      NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part], 
+        Group->getAlignment());
 
     Group->addMetadata(NewStoreInstr);
   }
@@ -4253,6 +4295,32 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigne
   return false;
 }
 
+static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
+  if (!(EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0))
+    return TTI.enableMaskedInterleavedAccessVectorization();
+
+  // If an override option has been passed in for interleaved accesses, use it.
+  return EnableMaskedInterleavedMemAccesses;
+}
+
+bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
+                                                               unsigned VF) {
+  assert(isAccessInterleaved(I) && "Expecting interleaved access.");
+  assert(getWideningDecision(I, VF) == CM_Unknown &&
+         "Decision should not be set yet.");
+
+  if (!Legal->blockNeedsPredication(I->getParent()) ||
+      !Legal->isMaskRequired(I))
+    return true;
+
+  if (!useMaskedInterleavedAccesses(TTI))
+    return false;
+
+  auto *Ty = getMemInstValueType(I);
+  return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty) 
+                          : TTI.isLegalMaskedStore(Ty);
+}
+
 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
                                                                unsigned VF) {
   // Get and ensure we have a valid memory instruction.
@@ -5371,13 +5439,17 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
   }
 
   // Calculate the cost of the whole interleaved group.
-  unsigned Cost = TTI.getInterleavedMemoryOpCost(I->getOpcode(), WideVecTy,
-                                                 Group->getFactor(), Indices,
-                                                 Group->getAlignment(), AS);
-
-  if (Group->isReverse())
+  unsigned Cost = TTI.getInterleavedMemoryOpCost(
+      I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
+      Group->getAlignment(), AS, Legal->isMaskRequired(I));
+
+  if (Group->isReverse()) {
+    // TODO: Add support for reversed masked interleaved access.
+    assert(!Legal->isMaskRequired(I) && 
+           "Reverse masked interleaved access not supported.");
     Cost += Group->getNumMembers() *
             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
+  }
   return Cost;
 }
 
@@ -5479,7 +5551,8 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
           continue;
 
         NumAccesses = Group->getNumMembers();
-        InterleaveCost = getInterleaveGroupCost(&I, VF);
+        if (interleavedAccessCanBeWidened(&I, VF))
+          InterleaveCost = getInterleaveGroupCost(&I, VF);
       }
 
       unsigned GatherScatterCost =
@@ -6152,7 +6225,8 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
 }
 
 VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I,
-                                                           VFRange &Range) {
+                                                           VFRange &Range,
+                                                           VPlanPtr &Plan) {
   const InterleaveGroup *IG = CM.getInterleavedAccessGroup(I);
   if (!IG)
     return nullptr;
@@ -6174,7 +6248,11 @@ VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I,
   assert(I == IG->getInsertPos() &&
          "Generating a recipe for an adjunct member of an interleave group");
 
-  return new VPInterleaveRecipe(IG);
+  VPValue *Mask = nullptr;
+  if (Legal->isMaskRequired(I))
+    Mask = createBlockInMask(I->getParent(), Plan);
+
+  return new VPInterleaveRecipe(IG, Mask);
 }
 
 VPWidenMemoryInstructionRecipe *
@@ -6442,7 +6520,7 @@ bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
   VPRecipeBase *Recipe = nullptr;
   // Check if Instr should belong to an interleave memory recipe, or already
   // does. In the latter case Instr is irrelevant.
-  if ((Recipe = tryToInterleaveMemory(Instr, Range))) {
+  if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) {
     VPBB->appendRecipe(Recipe);
     return true;
   }
@@ -6669,6 +6747,10 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
   O << " +\n"
     << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
   IG->getInsertPos()->printAsOperand(O, false);
+  if (User) {
+    O << ", ";
+    User->getOperand(0)->printAsOperand(O);
+  }
   O << "\\l\"";
   for (unsigned i = 0; i < IG->getFactor(); ++i)
     if (Instruction *I = IG->getMember(i))
@@ -6731,7 +6813,15 @@ void VPBlendRecipe::execute(VPTransformState &State) {
 
 void VPInterleaveRecipe::execute(VPTransformState &State) {
   assert(!State.Instance && "Interleave group being replicated.");
-  State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());
+  if (!User)
+    return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());
+
+  // Last (and currently only) operand is a mask.
+  InnerLoopVectorizer::VectorParts MaskValues(State.UF);
+  VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
+  for (unsigned Part = 0; Part < State.UF; ++Part)
+    MaskValues[Part] = State.get(Mask, Part);
+  State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues);
 }
 
 void VPReplicateRecipe::execute(VPTransformState &State) {
@@ -7030,7 +7120,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
 
   // Analyze interleaved memory accesses.
   if (UseInterleaved) {
-    IAI.analyzeInterleaving();
+    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
   }
 
   // Use the cost model.
diff --git a/lib/Transforms/Vectorize/VPRecipeBuilder.h b/lib/Transforms/Vectorize/VPRecipeBuilder.h
index f43a8bb123b..15d38ac9c84 100644
--- a/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -69,7 +69,8 @@ public:
   /// \return value is <true, nullptr>, as it is handled by another recipe.
   /// \p Range.End may be decreased to ensure same decision from \p Range.Start
   /// to \p Range.End.
-  VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range);
+  VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range,
+                                            VPlanPtr &Plan);
 
   /// Check if \I is a memory instruction to be widened for \p Range.Start and
   /// potentially masked. Such instructions are handled by a recipe that takes
diff --git a/lib/Transforms/Vectorize/VPlan.h b/lib/Transforms/Vectorize/VPlan.h
index c3123b41600..81b1986c97d 100644
--- a/lib/Transforms/Vectorize/VPlan.h
+++ b/lib/Transforms/Vectorize/VPlan.h
@@ -769,10 +769,14 @@ public:
 class VPInterleaveRecipe : public VPRecipeBase {
 private:
   const InterleaveGroup *IG;
+  std::unique_ptr<VPUser> User;
 
 public:
-  VPInterleaveRecipe(const InterleaveGroup *IG)
-      : VPRecipeBase(VPInterleaveSC), IG(IG) {}
+  VPInterleaveRecipe(const InterleaveGroup *IG, VPValue *Mask)
+      : VPRecipeBase(VPInterleaveSC), IG(IG) {
+    if (Mask) // Create a VPInstruction to register as a user of the mask.
+      User.reset(new VPUser({Mask}));
+  }
   ~VPInterleaveRecipe() override = default;
 
   /// Method to support type inquiry through isa, cast, and dyn_cast.
diff --git a/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
new file mode 100644
index 00000000000..b1163d0a199
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
@@ -0,0 +1,164 @@
+; RUN: opt -mcpu=skx -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=DISABLED_MASKED_STRIDED 
+; RUN: opt -mcpu=skx -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses  -enable-masked-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=ENABLED_MASKED_STRIDED 
+
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386-unknown-linux-gnu"
+
+; When masked-interleaved-groups are disabled:
+; Check that the predicated load is not vectorized as an
+; interleaved-group but rather as a scalarized accesses.
+; (For SKX, Gather is not supported by the compiler for chars, therefore
+;  the only remaining alternative is to scalarize).
+; When  masked-interleave-group is enabled we expect to find the proper mask
+; shuffling code, feeding the wide masked load for an interleave-group (with
+; a single member).
+;
+; void masked_strided1(const unsigned char* restrict p,
+;                      unsigned char* restrict q,
+;                      unsigned char guard) {
+;   for(ix=0; ix < 1024; ++ix) {
+;     if (ix > guard) {
+;         char t = p[2*ix];
+;         q[ix] = t;
+;     }
+;   }
+; }
+
+;DISABLED_MASKED_STRIDED-LABEL: @masked_strided1(
+;DISABLED_MASKED_STRIDED: vector.body:
+;DISABLED_MASKED_STRIDED-NEXT:  %index = phi i32 
+;DISABLED_MASKED_STRIDED-NEXT:  %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+;DISABLED_MASKED_STRIDED-NOT:   %interleaved.mask =
+;DISABLED_MASKED_STRIDED-NOT:   call void @llvm.masked.load.
+;DISABLED_MASKED_STRIDED-NOT:   %{{.*}} = shufflevector <16 x i8> %[[WIDEVEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;DISABLED_MASKED_STRIDED:       %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
+;DISABLED_MASKED_STRIDED-NEXT:  %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+;DISABLED_MASKED_STRIDED-NEXT:  %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0
+;DISABLED_MASKED_STRIDED-NEXT:  br i1 %[[M]], label %pred.load.if, label %pred.load.continue
+;DISABLED_MASKED_STRIDED-NOT:   %interleaved.mask =
+;DISABLED_MASKED_STRIDED-NOT:   call void @llvm.masked.load.
+;DISABLED_MASKED_STRIDED-NOT:   %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+
+;ENABLED_MASKED_STRIDED-LABEL: @masked_strided1(
+;ENABLED_MASKED_STRIDED: vector.body:
+;ENABLED_MASKED_STRIDED-NEXT:  %index = phi i32 
+;ENABLED_MASKED_STRIDED-NEXT:  %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+;ENABLED_MASKED_STRIDED:       %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
+;ENABLED_MASKED_STRIDED:       %interleaved.mask = shufflevector <8 x i1> %[[VMASK]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+;ENABLED_MASKED_STRIDED-NEXT:  %[[WIDEMASKEDLOAD:.+]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask, <16 x i8> undef)
+;ENABLED_MASKED_STRIDED-NEXT:  %[[STRIDEDVEC:.+]] = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+
+define dso_local void @masked_strided1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr {
+entry:
+  %conv = zext i8 %guard to i32
+  br label %for.body
+
+for.body:
+  %ix.09 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp1 = icmp ugt i32 %ix.09, %conv
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %mul = shl nuw nsw i32 %ix.09, 1
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.09
+  store i8 %0, i8* %arrayidx3, align 1
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %ix.09, 1
+  %exitcond = icmp eq i32 %inc, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; Check also a scenario with full interleave-groups (no gaps) as well as both
+; load and store groups. We check that when masked-interleave-group is disabled
+; the predicated loads (and stores) are not vectorized as an
+; interleaved-group but rather as four separate scalarized accesses.
+; (For SKX, gather/scatter is not supported by the compiler for chars, therefore
+; the only remaining alternative is to scalarize).
+; When  masked-interleave-group is enabled we expect to find the proper mask
+; shuffling code, feeding the wide masked load/store for the two interleave-
+; groups.
+;
+; void masked_strided2(const unsigned char* restrict p,
+;                     unsigned char* restrict q,
+;                     unsigned char guard) {
+; for(ix=0; ix < 1024; ++ix) {
+;     if (ix > guard) {
+;         char left = p[2*ix];
+;         char right = p[2*ix + 1];
+;         char max = max(left, right);
+;         q[2*ix] = max;
+;         q[2*ix+1] = 0 - max;
+;     }
+; }
+;}
+
+;DISABLED_MASKED_STRIDED-LABEL: @masked_strided2(
+;DISABLED_MASKED_STRIDED: vector.body:
+;DISABLED_MASKED_STRIDED-NEXT:  %index = phi i32 
+;DISABLED_MASKED_STRIDED-NEXT:  %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+;DISABLED_MASKED_STRIDED-NOT:   %interleaved.mask =
+;DISABLED_MASKED_STRIDED-NOT:   call void @llvm.masked.load.
+;DISABLED_MASKED_STRIDED-NOT:   call void @llvm.masked.store.
+;DISABLED_MASKED_STRIDED-NOT:   %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;DISABLED_MASKED_STRIDED:        %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
+;DISABLED_MASKED_STRIDED-NEXT:  %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+;DISABLED_MASKED_STRIDED-NEXT:  %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0
+;DISABLED_MASKED_STRIDED-NEXT:  br i1 %[[M]], label %pred.load.if, label %pred.load.continue
+;DISABLED_MASKED_STRIDED-NOT:   %interleaved.mask =
+;DISABLED_MASKED_STRIDED-NOT:   call void @llvm.masked.load.
+;DISABLED_MASKED_STRIDED-NOT:   call void @llvm.masked.store.
+;DISABLED_MASKED_STRIDED-NOT:   %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+
+;ENABLED_MASKED_STRIDED-LABEL: @masked_strided2(
+;ENABLED_MASKED_STRIDED: vector.body:
+;ENABLED_MASKED_STRIDED-NEXT:  %index = phi i32
+;ENABLED_MASKED_STRIDED-NEXT:  %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+;ENABLED_MASKED_STRIDED:       %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
+;ENABLED_MASKED_STRIDED:       %interleaved.mask = shufflevector <8 x i1> %[[VMASK]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+;ENABLED_MASKED_STRIDED-NEXT:  %[[WIDEMASKEDLOAD:.+]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask, <16 x i8> undef)
+;ENABLED_MASKED_STRIDED-NEXT:  %{{.*}} = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;ENABLED_MASKED_STRIDED-NEXT:  %{{.*}} = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+;ENABLED_MASKED_STRIDED:       call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %{{.*}}, <16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask)
+
+; Function Attrs: norecurse nounwind
+define dso_local void @masked_strided2(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr  {
+entry:
+  %conv = zext i8 %guard to i32
+  br label %for.body
+
+for.body:
+  %ix.024 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp1 = icmp ugt i32 %ix.024, %conv
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %mul = shl nuw nsw i32 %ix.024, 1
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %add = or i32 %mul, 1
+  %arrayidx4 = getelementptr inbounds i8, i8* %p, i32 %add
+  %1 = load i8, i8* %arrayidx4, align 1
+  %cmp.i = icmp slt i8 %0, %1
+  %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0
+  %arrayidx6 = getelementptr inbounds i8, i8* %q, i32 %mul
+  store i8 %spec.select.i, i8* %arrayidx6, align 1
+  %sub = sub i8 0, %spec.select.i
+  %arrayidx11 = getelementptr inbounds i8, i8* %q, i32 %add
+  store i8 %sub, i8* %arrayidx11, align 1
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %ix.024, 1
+  %exitcond = icmp eq i32 %inc, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
diff --git a/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll b/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll
new file mode 100644
index 00000000000..9ed66a22dbf
--- /dev/null
+++ b/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll
@@ -0,0 +1,222 @@
+; REQUIRES: asserts
+; RUN: opt -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -debug-only=loop-vectorize,vectorutils -disable-output < %s 2>&1 | FileCheck %s -check-prefix=STRIDED_UNMASKED
+; RUN: opt -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses -debug-only=loop-vectorize,vectorutils -disable-output < %s 2>&1 | FileCheck %s -check-prefix=STRIDED_MASKED
+
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+
+; We test here that the loop-vectorizer forms an interleave-groups from 
+; predicated memory accesses only if they are both in the same (predicated)
+; block (first scenario below).
+; If the accesses are not in the same predicated block, an interleave-group
+; is not formed (scenarios 2,3 below).
+
+; Scenario 1: Check the case where it is legal to create masked interleave-
+; groups. Altogether two groups are created (one for loads and one for stores)
+; when masked-interleaved-acceses are enabled. When masked-interleaved-acceses
+; are disabled we do not create any interleave-group.
+;
+; void masked_strided1(const unsigned char* restrict p,
+;                     unsigned char* restrict q,
+;                     unsigned char guard) {
+; for(ix=0; ix < 1024; ++ix) {
+;     if (ix > guard) {
+;         char left = p[2*ix];
+;         char right = p[2*ix + 1];
+;         char max = max(left, right);
+;         q[2*ix] = max;
+;         q[2*ix+1] = 0 - max;
+;     }
+; }
+;}
+
+
+; STRIDED_UNMASKED: LV: Checking a loop in "masked_strided1" 
+; STRIDED_UNMASKED: LV: Analyzing interleaved accesses...
+; STRIDED_UNMASKED-NOT: LV: Creating an interleave group 
+
+; STRIDED_MASKED: LV: Checking a loop in "masked_strided1" 
+; STRIDED_MASKED: LV: Analyzing interleaved accesses...
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:  store i8 %{{.*}}, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Inserted:  store i8  %{{.*}}, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT:     into the interleave group with  store i8 %{{.*}}, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:   %{{.*}} = load i8, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Inserted:  %{{.*}} = load i8, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT:     into the interleave group with   %{{.*}} = load i8, i8* %{{.*}}, align 1
+
+; Scenario 2: Check the case where it is illegal to create a masked interleave-
+; group because the first access is predicated, and the second isn't.
+; We therefore create a separate interleave-group with gaps for each of the
+; stores (if masked-interleaved-accesses are enabled) and these are later
+; invalidated because interleave-groups of stores with gaps are not supported. 
+; If masked-interleaved-accesses is not enabled we create only one interleave
+; group of stores (for the non-predicated store) and it is later invalidated
+; due to gaps.
+;
+; void masked_strided2(const unsigned char* restrict p,
+;                     unsigned char* restrict q,
+;                     unsigned char guard1,
+;                     unsigned char guard2) {
+; for(ix=0; ix < 1024; ++ix) {
+;     if (ix > guard1) {
+;         q[2*ix] = 1;
+;     }
+;     q[2*ix+1] = 2;
+; }
+;}
+
+; STRIDED_UNMASKED: LV: Checking a loop in "masked_strided2" 
+; STRIDED_UNMASKED: LV: Analyzing interleaved accesses...
+; STRIDED_UNMASKED-NEXT: LV: Creating an interleave group with:  store i8 1, i8* %{{.*}}, align 1
+; STRIDED_UNMASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
+; STRIDED_UNMASKED-NOT: LV: Creating an interleave group 
+
+; STRIDED_MASKED: LV: Checking a loop in "masked_strided2" 
+; STRIDED_MASKED: LV: Analyzing interleaved accesses...
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:  store i8 2, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:  store i8 1, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
+; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
+
+
+; Scenario 3: Check the case where it is illegal to create a masked interleave-
+; group because the two accesses are in separate predicated blocks.
+; We therefore create a separate interleave-group with gaps for each of the accesses,
+; (which are later invalidated because interleave-groups of stores with gaps are 
+; not supported).
+; If masked-interleaved-accesses is not enabled we don't create any interleave
+; group because all accesses are predicated.
+;
+; void masked_strided3(const unsigned char* restrict p,
+;                     unsigned char* restrict q,
+;                     unsigned char guard1,
+;                     unsigned char guard2) {
+; for(ix=0; ix < 1024; ++ix) {
+;     if (ix > guard1) {
+;         q[2*ix] = 1;
+;     }
+;     if (ix > guard2) {
+;         q[2*ix+1] = 2;
+;     }
+; }
+;}
+
+
+; STRIDED_UNMASKED: LV: Checking a loop in "masked_strided3" 
+; STRIDED_UNMASKED: LV: Analyzing interleaved accesses...
+; STRIDED_UNMASKED-NOT: LV: Creating an interleave group 
+
+; STRIDED_MASKED: LV: Checking a loop in "masked_strided3" 
+; STRIDED_MASKED: LV: Analyzing interleaved accesses...
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:  store i8 2, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:  store i8 1, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
+; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
+
+
+; ModuleID = 'test.c'
+source_filename = "test.c"
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386-unknown-linux-gnu"
+
+define dso_local void @masked_strided1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr #0 {
+entry:
+  %conv = zext i8 %guard to i32
+  br label %for.body
+
+for.body:
+  %ix.024 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp1 = icmp ugt i32 %ix.024, %conv
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %mul = shl nuw nsw i32 %ix.024, 1
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %add = or i32 %mul, 1
+  %arrayidx4 = getelementptr inbounds i8, i8* %p, i32 %add
+  %1 = load i8, i8* %arrayidx4, align 1
+  %cmp.i = icmp slt i8 %0, %1
+  %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0
+  %arrayidx6 = getelementptr inbounds i8, i8* %q, i32 %mul
+  store i8 %spec.select.i, i8* %arrayidx6, align 1
+  %sub = sub i8 0, %spec.select.i
+  %arrayidx11 = getelementptr inbounds i8, i8* %q, i32 %add
+  store i8 %sub, i8* %arrayidx11, align 1
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %ix.024, 1
+  %exitcond = icmp eq i32 %inc, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+
+define dso_local void @masked_strided2(i8* noalias nocapture readnone %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr #0 {
+entry:
+  %conv = zext i8 %guard to i32
+  br label %for.body
+
+for.body:
+  %ix.012 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %mul = shl nuw nsw i32 %ix.012, 1
+  %arrayidx = getelementptr inbounds i8, i8* %q, i32 %mul
+  store i8 1, i8* %arrayidx, align 1
+  %cmp1 = icmp ugt i32 %ix.012, %conv
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %add = or i32 %mul, 1
+  %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %add
+  store i8 2, i8* %arrayidx3, align 1
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %ix.012, 1
+  %exitcond = icmp eq i32 %inc, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+
+define dso_local void @masked_strided3(i8* noalias nocapture readnone %p, i8* noalias nocapture %q, i8 zeroext %guard1, i8 zeroext %guard2) local_unnamed_addr #0 {
+entry:
+  %conv = zext i8 %guard1 to i32
+  %conv3 = zext i8 %guard2 to i32
+  br label %for.body
+
+for.body:
+  %ix.018 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %mul = shl nuw nsw i32 %ix.018, 1
+  %cmp1 = icmp ugt i32 %ix.018, %conv
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:
+  %arrayidx = getelementptr inbounds i8, i8* %q, i32 %mul
+  store i8 1, i8* %arrayidx, align 1
+  br label %if.end
+
+if.end:
+  %cmp4 = icmp ugt i32 %ix.018, %conv3
+  br i1 %cmp4, label %if.then6, label %for.inc
+
+if.then6:
+  %add = or i32 %mul, 1
+  %arrayidx7 = getelementptr inbounds i8, i8* %q, i32 %add
+  store i8 2, i8* %arrayidx7, align 1
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %ix.018, 1
+  %exitcond = icmp eq i32 %inc, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+attributes #0 = {  "target-features"="+fxsr,+mmx,+sse,+sse2,+x87"  }
diff --git a/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll b/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll
index 89c0ac10916..c647f586b18 100644
--- a/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll
+++ b/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=2 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s
+; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=2 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 %pair = type { i64, i64 }
-- 
GitLab


From 3394148166680c0a876eab4de4ba9092fef5cd3a Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Sun, 14 Oct 2018 15:25:06 +0000
Subject: [PATCH 0167/1116] [InstCombine] combine a shuffle and an extract
 subvector shuffle

This is part of the missing IR-level folding noted in D52912.
This should be ok as a canonicalization because the new shuffle mask can't
be any more complicated than the existing shuffle mask. If there's some
target where the shorter vector shuffle is not legal, it should just end up
expanding to something like the pair of shuffles that we're starting with here.

Differential Revision: https://reviews.llvm.org/D53037


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344476 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstCombine/InstCombineVectorOps.cpp      | 38 +++++++++++++++++++
 test/Transforms/InstCombine/vec_shuffle.ll    |  8 ++--
 2 files changed, 41 insertions(+), 5 deletions(-)

diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index 61a3e31f960..bdd8fe3eead 100644
--- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -1477,6 +1477,41 @@ static Instruction *narrowVectorSelect(ShuffleVectorInst &Shuf,
   return SelectInst::Create(NarrowCond, NarrowX, NarrowY);
 }
 
+/// Try to combine 2 shuffles into 1 shuffle by concatenating a shuffle mask.
+static Instruction *foldIdentityExtractShuffle(ShuffleVectorInst &Shuf) {
+  Value *Op0 = Shuf.getOperand(0), *Op1 = Shuf.getOperand(1);
+  if (!Shuf.isIdentityWithExtract() || !isa<UndefValue>(Op1))
+    return nullptr;
+
+  Value *X, *Y;
+  Constant *Mask;
+  if (!match(Op0, m_ShuffleVector(m_Value(X), m_Value(Y), m_Constant(Mask))))
+    return nullptr;
+
+  // We are extracting a subvector from a shuffle. Remove excess elements from
+  // the 1st shuffle mask to eliminate the extract.
+  //
+  // This transform is conservatively limited to identity extracts because we do
+  // not allow arbitrary shuffle mask creation as a target-independent transform
+  // (because we can't guarantee that will lower efficiently).
+  //
+  // If the extracting shuffle has an undef mask element, it transfers to the
+  // new shuffle mask. Otherwise, copy the original mask element. Example:
+  //   shuf (shuf X, Y, <C0, C1, C2, undef, C4>), undef, <0, undef, 2, 3> -->
+  //   shuf X, Y, <C0, undef, C2, undef>
+  unsigned NumElts = Shuf.getType()->getVectorNumElements();
+  SmallVector<Constant *, 16> NewMask(NumElts);
+  assert(NumElts < Mask->getType()->getVectorNumElements() &&
+         "Identity with extract must have less elements than its inputs");
+
+  for (unsigned i = 0; i != NumElts; ++i) {
+    Constant *ExtractMaskElt = Shuf.getMask()->getAggregateElement(i);
+    Constant *MaskElt = Mask->getAggregateElement(i);
+    NewMask[i] = isa<UndefValue>(ExtractMaskElt) ? ExtractMaskElt : MaskElt;
+  }
+  return new ShuffleVectorInst(X, Y, ConstantVector::get(NewMask));
+}
+
 Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
   Value *LHS = SVI.getOperand(0);
   Value *RHS = SVI.getOperand(1);
@@ -1499,6 +1534,9 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
     return &SVI;
   }
 
+  if (Instruction *I = foldIdentityExtractShuffle(SVI))
+    return I;
+
   SmallVector<int, 16> Mask = SVI.getShuffleMask();
   Type *Int32Ty = Type::getInt32Ty(SVI.getContext());
   unsigned LHSWidth = LHS->getType()->getVectorNumElements();
diff --git a/test/Transforms/InstCombine/vec_shuffle.ll b/test/Transforms/InstCombine/vec_shuffle.ll
index e9c3539ef6b..7692fe3e05c 100644
--- a/test/Transforms/InstCombine/vec_shuffle.ll
+++ b/test/Transforms/InstCombine/vec_shuffle.ll
@@ -170,12 +170,11 @@ define <8 x i8> @test12a(<8 x i8> %t6, <8 x i8> %t2) {
   ret <8 x i8> %t3
 }
 
-; TODO: The mask length of the 1st shuffle can be reduced to eliminate the 2nd shuffle.
+; The mask length of the 1st shuffle can be reduced to eliminate the 2nd shuffle.
 
 define <2 x i8> @extract_subvector_of_shuffle(<2 x i8> %x, <2 x i8> %y) {
 ; CHECK-LABEL: @extract_subvector_of_shuffle(
-; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> [[Y:%.*]], <3 x i32> <i32 0, i32 2, i32 undef>
-; CHECK-NEXT:    [[EXTRACT_SUBV:%.*]] = shufflevector <3 x i8> [[SHUF]], <3 x i8> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT_SUBV:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> [[Y:%.*]], <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    ret <2 x i8> [[EXTRACT_SUBV]]
 ;
   %shuf = shufflevector <2 x i8> %x, <2 x i8> %y, <3 x i32> <i32 0, i32 2, i32 0>
@@ -183,7 +182,6 @@ define <2 x i8> @extract_subvector_of_shuffle(<2 x i8> %x, <2 x i8> %y) {
   ret <2 x i8> %extract_subv
 }
 
-; TODO:
 ; Extra uses are ok.
 ; Undef elements in either mask are ok. Undefs from the 2nd shuffle mask should propagate to the new shuffle.
 ; The type of the inputs does not have to match the output type.
@@ -194,7 +192,7 @@ define <4 x i8> @extract_subvector_of_shuffle_extra_use(<2 x i8> %x, <2 x i8> %y
 ; CHECK-LABEL: @extract_subvector_of_shuffle_extra_use(
 ; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> [[Y:%.*]], <5 x i32> <i32 undef, i32 2, i32 0, i32 1, i32 0>
 ; CHECK-NEXT:    call void @use_v5i8(<5 x i8> [[SHUF]])
-; CHECK-NEXT:    [[EXTRACT_SUBV:%.*]] = shufflevector <5 x i8> [[SHUF]], <5 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
+; CHECK-NEXT:    [[EXTRACT_SUBV:%.*]] = shufflevector <2 x i8> [[X]], <2 x i8> [[Y]], <4 x i32> <i32 undef, i32 2, i32 0, i32 undef>
 ; CHECK-NEXT:    ret <4 x i8> [[EXTRACT_SUBV]]
 ;
   %shuf = shufflevector <2 x i8> %x, <2 x i8> %y, <5 x i32> <i32 undef, i32 2, i32 0, i32 1, i32 0>
-- 
GitLab


From ffc6fe6727622c4e63572c4d676658b26e21ceba Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sun, 14 Oct 2018 15:56:28 +0000
Subject: [PATCH 0168/1116] [LegalizeDAG] Don't bother with final MUL+SRL stage
 for byte CTPOP.

The final stage of CTPOP expansion (v = (v * 0x01010101...) >> (Len - 8)) is completely pointless for the byte (Len = 8) case as it reduces to (v = (v * 0x01...) >> 0), but annoyingly this doesn't always get optimized away.

Found while investigating generic vector CTPOP expansion (PR32655).

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344477 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index bb2c76a6a41..175df889ef2 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2750,9 +2750,10 @@ SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op,
                                              DAG.getConstant(4, dl, ShVT))),
                      Mask0F);
     // v = (v * 0x01010101...) >> (Len - 8)
-    Op = DAG.getNode(ISD::SRL, dl, VT,
-                     DAG.getNode(ISD::MUL, dl, VT, Op, Mask01),
-                     DAG.getConstant(Len - 8, dl, ShVT));
+    if (Len > 8)
+      Op = DAG.getNode(ISD::SRL, dl, VT,
+                       DAG.getNode(ISD::MUL, dl, VT, Op, Mask01),
+                       DAG.getConstant(Len - 8, dl, ShVT));
 
     return Op;
   }
-- 
GitLab


From c8309b5ac33fcd1c4ac977a1cfbba7f4e1cdf6e0 Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Sun, 14 Oct 2018 16:09:59 +0000
Subject: [PATCH 0169/1116] [ORC] Remove XXLayer::add methods that default to
 using the main JITDylib.

They're not currently used and may complicate upcoming changes to add's
signature and behavior.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344478 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/ExecutionEngine/Orc/Layer.h | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/include/llvm/ExecutionEngine/Orc/Layer.h b/include/llvm/ExecutionEngine/Orc/Layer.h
index 3bd23ae5416..be5d9653dd8 100644
--- a/include/llvm/ExecutionEngine/Orc/Layer.h
+++ b/include/llvm/ExecutionEngine/Orc/Layer.h
@@ -51,12 +51,6 @@ public:
   /// JITDylib.
   virtual Error add(JITDylib &JD, VModuleKey K, ThreadSafeModule TSM);
 
-  /// Adds a MaterializationUnit representing the given IR to the main
-  /// JITDylib.
-  Error add(VModuleKey K, ThreadSafeModule TSM) {
-    return add(ES.getMainJITDylib(), K, std::move(TSM));
-  }
-
   /// Emit should materialize the given IR.
   virtual void emit(MaterializationResponsibility R, VModuleKey K,
                     ThreadSafeModule TSM) = 0;
@@ -127,12 +121,6 @@ public:
   /// JITDylib.
   virtual Error add(JITDylib &JD, VModuleKey K, std::unique_ptr<MemoryBuffer> O);
 
-  /// Adds a MaterializationUnit representing the given object to the main
-  /// JITDylib.
-  Error add(VModuleKey K, std::unique_ptr<MemoryBuffer> O) {
-    return add(ES.getMainJITDylib(), K, std::move(O));
-  }
-
   /// Emit should materialize the given IR.
   virtual void emit(MaterializationResponsibility R, VModuleKey K,
                     std::unique_ptr<MemoryBuffer> O) = 0;
-- 
GitLab


From 7e9c8da5fd4730d5beb7b8d3625f8a4a5383a54b Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sun, 14 Oct 2018 16:49:04 +0000
Subject: [PATCH 0170/1116] [ARM] Regenerate cttz tests

Improve codegen view as part of PR32655

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344479 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/ARM/cttz_vector.ll | 419 +++++++++++++++++++++-----------
 1 file changed, 283 insertions(+), 136 deletions(-)

diff --git a/test/CodeGen/ARM/cttz_vector.ll b/test/CodeGen/ARM/cttz_vector.ll
index bed64498041..f27c1e4b417 100644
--- a/test/CodeGen/ARM/cttz_vector.ll
+++ b/test/CodeGen/ARM/cttz_vector.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple armv7-linux-gnueabihf -mattr=+neon | FileCheck %s
 
 ; This test checks the @llvm.cttz.* intrinsics for vectors.
@@ -23,7 +24,14 @@ declare <2 x i64> @llvm.cttz.v2i64(<2 x i64>, i1)
 ;------------------------------------------------------------------------------
 
 define void @test_v1i8(<1 x i8>* %p) {
-; CHECK-LABEL: test_v1i8
+; CHECK-LABEL: test_v1i8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    ldrb r1, [r0]
+; CHECK-NEXT:    orr r1, r1, #256
+; CHECK-NEXT:    rbit r1, r1
+; CHECK-NEXT:    clz r1, r1
+; CHECK-NEXT:    strb r1, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <1 x i8>, <1 x i8>* %p
   %tmp = call <1 x i8> @llvm.cttz.v1i8(<1 x i8> %a, i1 false)
   store <1 x i8> %tmp, <1 x i8>* %p
@@ -32,6 +40,21 @@ define void @test_v1i8(<1 x i8>* %p) {
 
 define void @test_v2i8(<2 x i8>* %p) {
 ; CHECK-LABEL: test_v2i8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.16 {d16[0]}, [r0:16]
+; CHECK-NEXT:    vmovl.u8 q8, d16
+; CHECK-NEXT:    vmovl.u16 q8, d16
+; CHECK-NEXT:    vorr.i32 d16, #0x100
+; CHECK-NEXT:    vneg.s32 d18, d16
+; CHECK-NEXT:    vand d16, d16, d18
+; CHECK-NEXT:    vmov.i32 d17, #0x1f
+; CHECK-NEXT:    vclz.i32 d16, d16
+; CHECK-NEXT:    vsub.i32 d16, d17, d16
+; CHECK-NEXT:    vmov.32 r1, d16[1]
+; CHECK-NEXT:    vmov.32 r2, d16[0]
+; CHECK-NEXT:    strb r1, [r0, #1]
+; CHECK-NEXT:    strb r2, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <2 x i8>, <2 x i8>* %p
   %tmp = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %a, i1 false)
   store <2 x i8> %tmp, <2 x i8>* %p
@@ -40,6 +63,19 @@ define void @test_v2i8(<2 x i8>* %p) {
 
 define void @test_v4i8(<4 x i8>* %p) {
 ; CHECK-LABEL: test_v4i8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.32 {d16[0]}, [r0:32]
+; CHECK-NEXT:    vmov.i16 d19, #0x1
+; CHECK-NEXT:    vmovl.u8 q8, d16
+; CHECK-NEXT:    vorr.i16 d16, #0x100
+; CHECK-NEXT:    vneg.s16 d18, d16
+; CHECK-NEXT:    vand d16, d16, d18
+; CHECK-NEXT:    vsub.i16 d16, d16, d19
+; CHECK-NEXT:    vcnt.8 d16, d16
+; CHECK-NEXT:    vpaddl.u8 d16, d16
+; CHECK-NEXT:    vuzp.8 d16, d17
+; CHECK-NEXT:    vst1.32 {d16[0]}, [r0:32]
+; CHECK-NEXT:    bx lr
   %a = load <4 x i8>, <4 x i8>* %p
   %tmp = call <4 x i8> @llvm.cttz.v4i8(<4 x i8> %a, i1 false)
   store <4 x i8> %tmp, <4 x i8>* %p
@@ -48,13 +84,15 @@ define void @test_v4i8(<4 x i8>* %p) {
 
 define void @test_v8i8(<8 x i8>* %p) {
 ; CHECK-LABEL: test_v8i8:
-; CHECK: vldr		[[D1:d[0-9]+]], [r0]
-; CHECK: vmov.i8	[[D2:d[0-9]+]], #0x1
-; CHECK: vneg.s8	[[D3:d[0-9]+]], [[D1]]
-; CHECK: vand		[[D1]], [[D1]], [[D3]]
-; CHECK: vsub.i8	[[D1]], [[D1]], [[D2]]
-; CHECK: vcnt.8		[[D1]], [[D1]]
-; CHECK: vstr		[[D1]], [r0]
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vmov.i8 d18, #0x1
+; CHECK-NEXT:    vneg.s8 d17, d16
+; CHECK-NEXT:    vand d16, d16, d17
+; CHECK-NEXT:    vsub.i8 d16, d16, d18
+; CHECK-NEXT:    vcnt.8 d16, d16
+; CHECK-NEXT:    vstr d16, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <8 x i8>, <8 x i8>* %p
   %tmp = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %a, i1 false)
   store <8 x i8> %tmp, <8 x i8>* %p
@@ -63,13 +101,15 @@ define void @test_v8i8(<8 x i8>* %p) {
 
 define void @test_v16i8(<16 x i8>* %p) {
 ; CHECK-LABEL: test_v16i8:
-; CHECK: vld1.64	{[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
-; CHECK: vmov.i8	[[Q2:q[0-9]+]], #0x1
-; CHECK: vneg.s8	[[Q3:q[0-9]+]], [[Q1:q[0-9]+]]
-; CHECK: vand		[[Q1]], [[Q1]], [[Q3]]
-; CHECK: vsub.i8	[[Q1]], [[Q1]], [[Q2]]
-; CHECK: vcnt.8		[[Q1]], [[Q1]]
-; CHECK: vst1.64	{[[D1]], [[D2]]}, [r0]
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vmov.i8 q10, #0x1
+; CHECK-NEXT:    vneg.s8 q9, q8
+; CHECK-NEXT:    vand q8, q8, q9
+; CHECK-NEXT:    vsub.i8 q8, q8, q10
+; CHECK-NEXT:    vcnt.8 q8, q8
+; CHECK-NEXT:    vst1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <16 x i8>, <16 x i8>* %p
   %tmp = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 false)
   store <16 x i8> %tmp, <16 x i8>* %p
@@ -78,6 +118,13 @@ define void @test_v16i8(<16 x i8>* %p) {
 
 define void @test_v1i16(<1 x i16>* %p) {
 ; CHECK-LABEL: test_v1i16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    ldrh r1, [r0]
+; CHECK-NEXT:    orr r1, r1, #65536
+; CHECK-NEXT:    rbit r1, r1
+; CHECK-NEXT:    clz r1, r1
+; CHECK-NEXT:    strh r1, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <1 x i16>, <1 x i16>* %p
   %tmp = call <1 x i16> @llvm.cttz.v1i16(<1 x i16> %a, i1 false)
   store <1 x i16> %tmp, <1 x i16>* %p
@@ -86,6 +133,18 @@ define void @test_v1i16(<1 x i16>* %p) {
 
 define void @test_v2i16(<2 x i16>* %p) {
 ; CHECK-LABEL: test_v2i16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.32 {d16[0]}, [r0:32]
+; CHECK-NEXT:    vmovl.u16 q8, d16
+; CHECK-NEXT:    vorr.i32 d16, #0x10000
+; CHECK-NEXT:    vneg.s32 d18, d16
+; CHECK-NEXT:    vand d16, d16, d18
+; CHECK-NEXT:    vmov.i32 d17, #0x1f
+; CHECK-NEXT:    vclz.i32 d16, d16
+; CHECK-NEXT:    vsub.i32 d16, d17, d16
+; CHECK-NEXT:    vuzp.16 d16, d17
+; CHECK-NEXT:    vst1.32 {d16[0]}, [r0:32]
+; CHECK-NEXT:    bx lr
   %a = load <2 x i16>, <2 x i16>* %p
   %tmp = call <2 x i16> @llvm.cttz.v2i16(<2 x i16> %a, i1 false)
   store <2 x i16> %tmp, <2 x i16>* %p
@@ -94,14 +153,16 @@ define void @test_v2i16(<2 x i16>* %p) {
 
 define void @test_v4i16(<4 x i16>* %p) {
 ; CHECK-LABEL: test_v4i16:
-; CHECK: vldr		[[D1:d[0-9]+]], [r0]
-; CHECK: vmov.i16	[[D2:d[0-9]+]], #0x1
-; CHECK: vneg.s16	[[D3:d[0-9]+]], [[D1]]
-; CHECK: vand		[[D1]], [[D1]], [[D3]]
-; CHECK: vsub.i16	[[D1]], [[D1]], [[D2]]
-; CHECK: vcnt.8		[[D1]], [[D1]]
-; CHECK: vpaddl.u8	[[D1]], [[D1]]
-; CHECK: vstr		[[D1]], [r0]
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vmov.i16 d18, #0x1
+; CHECK-NEXT:    vneg.s16 d17, d16
+; CHECK-NEXT:    vand d16, d16, d17
+; CHECK-NEXT:    vsub.i16 d16, d16, d18
+; CHECK-NEXT:    vcnt.8 d16, d16
+; CHECK-NEXT:    vpaddl.u8 d16, d16
+; CHECK-NEXT:    vstr d16, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <4 x i16>, <4 x i16>* %p
   %tmp = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %a, i1 false)
   store <4 x i16> %tmp, <4 x i16>* %p
@@ -110,14 +171,16 @@ define void @test_v4i16(<4 x i16>* %p) {
 
 define void @test_v8i16(<8 x i16>* %p) {
 ; CHECK-LABEL: test_v8i16:
-; CHECK: vld1.64	{[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
-; CHECK: vmov.i16	[[Q2:q[0-9]+]], #0x1
-; CHECK: vneg.s16	[[Q3:q[0-9]+]], [[Q1:q[0-9]+]]
-; CHECK: vand		[[Q1]], [[Q1]], [[Q3]]
-; CHECK: vsub.i16	[[Q1]], [[Q1]], [[Q2]]
-; CHECK: vcnt.8		[[Q1]], [[Q1]]
-; CHECK: vpaddl.u8	[[Q1]], [[Q1]]
-; CHECK: vst1.64	{[[D1]], [[D2]]}, [r0]
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vmov.i16 q10, #0x1
+; CHECK-NEXT:    vneg.s16 q9, q8
+; CHECK-NEXT:    vand q8, q8, q9
+; CHECK-NEXT:    vsub.i16 q8, q8, q10
+; CHECK-NEXT:    vcnt.8 q8, q8
+; CHECK-NEXT:    vpaddl.u8 q8, q8
+; CHECK-NEXT:    vst1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <8 x i16>, <8 x i16>* %p
   %tmp = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 false)
   store <8 x i16> %tmp, <8 x i16>* %p
@@ -126,6 +189,12 @@ define void @test_v8i16(<8 x i16>* %p) {
 
 define void @test_v1i32(<1 x i32>* %p) {
 ; CHECK-LABEL: test_v1i32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    ldr r1, [r0]
+; CHECK-NEXT:    rbit r1, r1
+; CHECK-NEXT:    clz r1, r1
+; CHECK-NEXT:    str r1, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <1 x i32>, <1 x i32>* %p
   %tmp = call <1 x i32> @llvm.cttz.v1i32(<1 x i32> %a, i1 false)
   store <1 x i32> %tmp, <1 x i32>* %p
@@ -134,15 +203,17 @@ define void @test_v1i32(<1 x i32>* %p) {
 
 define void @test_v2i32(<2 x i32>* %p) {
 ; CHECK-LABEL: test_v2i32:
-; CHECK: vldr		[[D1:d[0-9]+]], [r0]
-; CHECK: vmov.i32	[[D2:d[0-9]+]], #0x1
-; CHECK: vneg.s32	[[D3:d[0-9]+]], [[D1]]
-; CHECK: vand		[[D1]], [[D1]], [[D3]]
-; CHECK: vsub.i32	[[D1]], [[D1]], [[D2]]
-; CHECK: vcnt.8		[[D1]], [[D1]]
-; CHECK: vpaddl.u8	[[D1]], [[D1]]
-; CHECK: vpaddl.u16	[[D1]], [[D1]]
-; CHECK: vstr		[[D1]], [r0]
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vmov.i32 d18, #0x1
+; CHECK-NEXT:    vneg.s32 d17, d16
+; CHECK-NEXT:    vand d16, d16, d17
+; CHECK-NEXT:    vsub.i32 d16, d16, d18
+; CHECK-NEXT:    vcnt.8 d16, d16
+; CHECK-NEXT:    vpaddl.u8 d16, d16
+; CHECK-NEXT:    vpaddl.u16 d16, d16
+; CHECK-NEXT:    vstr d16, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <2 x i32>, <2 x i32>* %p
   %tmp = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 false)
   store <2 x i32> %tmp, <2 x i32>* %p
@@ -151,15 +222,17 @@ define void @test_v2i32(<2 x i32>* %p) {
 
 define void @test_v4i32(<4 x i32>* %p) {
 ; CHECK-LABEL: test_v4i32:
-; CHECK: vld1.64	{[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
-; CHECK: vmov.i32	[[Q2:q[0-9]+]], #0x1
-; CHECK: vneg.s32	[[Q3:q[0-9]+]], [[Q1:q[0-9]+]]
-; CHECK: vand		[[Q1]], [[Q1]], [[Q3]]
-; CHECK: vsub.i32	[[Q1]], [[Q1]], [[Q2]]
-; CHECK: vcnt.8		[[Q1]], [[Q1]]
-; CHECK: vpaddl.u8	[[Q1]], [[Q1]]
-; CHECK: vpaddl.u16	[[Q1]], [[Q1]]
-; CHECK: vst1.64	{[[D1]], [[D2]]}, [r0]
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vmov.i32 q10, #0x1
+; CHECK-NEXT:    vneg.s32 q9, q8
+; CHECK-NEXT:    vand q8, q8, q9
+; CHECK-NEXT:    vsub.i32 q8, q8, q10
+; CHECK-NEXT:    vcnt.8 q8, q8
+; CHECK-NEXT:    vpaddl.u8 q8, q8
+; CHECK-NEXT:    vpaddl.u16 q8, q8
+; CHECK-NEXT:    vst1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <4 x i32>, <4 x i32>* %p
   %tmp = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 false)
   store <4 x i32> %tmp, <4 x i32>* %p
@@ -168,17 +241,19 @@ define void @test_v4i32(<4 x i32>* %p) {
 
 define void @test_v1i64(<1 x i64>* %p) {
 ; CHECK-LABEL: test_v1i64:
-; CHECK: vmov.i32	[[D2:d[0-9]+]], #0x0
-; CHECK: vldr		[[D1:d[0-9]+]], [r0]
-; CHECK: vmov.i64	[[D3:d[0-9]+]], #0xffffffffffffffff
-; CHECK: vsub.i64	[[D2]], [[D2]], [[D1]]
-; CHECK: vand		[[D2]], [[D1]], [[D2]]
-; CHECK: vadd.i64	[[D2]], [[D2]], [[D3]]
-; CHECK: vcnt.8		[[D2]], [[D2]]
-; CHECK: vpaddl.u8	[[D2]], [[D2]]
-; CHECK: vpaddl.u16	[[D2]], [[D2]]
-; CHECK: vpaddl.u32	[[D2]], [[D2]]
-; CHECK: vstr		[[D2]], [r0]
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i32 d16, #0x0
+; CHECK-NEXT:    vldr d17, [r0]
+; CHECK-NEXT:    vmov.i64 d18, #0xffffffffffffffff
+; CHECK-NEXT:    vsub.i64 d16, d16, d17
+; CHECK-NEXT:    vand d16, d17, d16
+; CHECK-NEXT:    vadd.i64 d16, d16, d18
+; CHECK-NEXT:    vcnt.8 d16, d16
+; CHECK-NEXT:    vpaddl.u8 d16, d16
+; CHECK-NEXT:    vpaddl.u16 d16, d16
+; CHECK-NEXT:    vpaddl.u32 d16, d16
+; CHECK-NEXT:    vstr d16, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <1 x i64>, <1 x i64>* %p
   %tmp = call <1 x i64> @llvm.cttz.v1i64(<1 x i64> %a, i1 false)
   store <1 x i64> %tmp, <1 x i64>* %p
@@ -187,17 +262,19 @@ define void @test_v1i64(<1 x i64>* %p) {
 
 define void @test_v2i64(<2 x i64>* %p) {
 ; CHECK-LABEL: test_v2i64:
-; CHECK: vmov.i32	[[Q2:q[0-9]+]], #0x0
-; CHECK: vld1.64	{[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
-; CHECK: vmov.i64	[[Q3:q[0-9]+]], #0xffffffffffffffff
-; CHECK: vsub.i64	[[Q2]], [[Q2]], [[Q1:q[0-9]+]]
-; CHECK: vand		[[Q2]], [[Q1]], [[Q2]]
-; CHECK: vadd.i64	[[Q2]], [[Q2]], [[Q3]]
-; CHECK: vcnt.8		[[Q2]], [[Q2]]
-; CHECK: vpaddl.u8	[[Q2]], [[Q2]]
-; CHECK: vpaddl.u16	[[Q2]], [[Q2]]
-; CHECK: vpaddl.u32	[[Q2]], [[Q2]]
-; CHECK: vst1.64	{d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i32 q8, #0x0
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
+; CHECK-NEXT:    vmov.i64 q10, #0xffffffffffffffff
+; CHECK-NEXT:    vsub.i64 q8, q8, q9
+; CHECK-NEXT:    vand q8, q9, q8
+; CHECK-NEXT:    vadd.i64 q8, q8, q10
+; CHECK-NEXT:    vcnt.8 q8, q8
+; CHECK-NEXT:    vpaddl.u8 q8, q8
+; CHECK-NEXT:    vpaddl.u16 q8, q8
+; CHECK-NEXT:    vpaddl.u32 q8, q8
+; CHECK-NEXT:    vst1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <2 x i64>, <2 x i64>* %p
   %tmp = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 false)
   store <2 x i64> %tmp, <2 x i64>* %p
@@ -207,7 +284,13 @@ define void @test_v2i64(<2 x i64>* %p) {
 ;------------------------------------------------------------------------------
 
 define void @test_v1i8_zero_undef(<1 x i8>* %p) {
-; CHECK-LABEL: test_v1i8_zero_undef
+; CHECK-LABEL: test_v1i8_zero_undef:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    ldrb r1, [r0]
+; CHECK-NEXT:    rbit r1, r1
+; CHECK-NEXT:    clz r1, r1
+; CHECK-NEXT:    strb r1, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <1 x i8>, <1 x i8>* %p
   %tmp = call <1 x i8> @llvm.cttz.v1i8(<1 x i8> %a, i1 true)
   store <1 x i8> %tmp, <1 x i8>* %p
@@ -216,6 +299,20 @@ define void @test_v1i8_zero_undef(<1 x i8>* %p) {
 
 define void @test_v2i8_zero_undef(<2 x i8>* %p) {
 ; CHECK-LABEL: test_v2i8_zero_undef:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.16 {d16[0]}, [r0:16]
+; CHECK-NEXT:    vmovl.u8 q8, d16
+; CHECK-NEXT:    vmovl.u16 q8, d16
+; CHECK-NEXT:    vneg.s32 d18, d16
+; CHECK-NEXT:    vand d16, d16, d18
+; CHECK-NEXT:    vmov.i32 d17, #0x1f
+; CHECK-NEXT:    vclz.i32 d16, d16
+; CHECK-NEXT:    vsub.i32 d16, d17, d16
+; CHECK-NEXT:    vmov.32 r1, d16[1]
+; CHECK-NEXT:    vmov.32 r2, d16[0]
+; CHECK-NEXT:    strb r1, [r0, #1]
+; CHECK-NEXT:    strb r2, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <2 x i8>, <2 x i8>* %p
   %tmp = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %a, i1 true)
   store <2 x i8> %tmp, <2 x i8>* %p
@@ -224,6 +321,17 @@ define void @test_v2i8_zero_undef(<2 x i8>* %p) {
 
 define void @test_v4i8_zero_undef(<4 x i8>* %p) {
 ; CHECK-LABEL: test_v4i8_zero_undef:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.32 {d16[0]}, [r0:32]
+; CHECK-NEXT:    vmovl.u8 q8, d16
+; CHECK-NEXT:    vneg.s16 d18, d16
+; CHECK-NEXT:    vand d16, d16, d18
+; CHECK-NEXT:    vmov.i16 d17, #0xf
+; CHECK-NEXT:    vclz.i16 d16, d16
+; CHECK-NEXT:    vsub.i16 d16, d17, d16
+; CHECK-NEXT:    vuzp.8 d16, d17
+; CHECK-NEXT:    vst1.32 {d16[0]}, [r0:32]
+; CHECK-NEXT:    bx lr
   %a = load <4 x i8>, <4 x i8>* %p
   %tmp = call <4 x i8> @llvm.cttz.v4i8(<4 x i8> %a, i1 true)
   store <4 x i8> %tmp, <4 x i8>* %p
@@ -232,13 +340,15 @@ define void @test_v4i8_zero_undef(<4 x i8>* %p) {
 
 define void @test_v8i8_zero_undef(<8 x i8>* %p) {
 ; CHECK-LABEL: test_v8i8_zero_undef:
-; CHECK: vldr		[[D1:d[0-9]+]], [r0]
-; CHECK: vmov.i8	[[D2:d[0-9]+]], #0x1
-; CHECK: vneg.s8	[[D3:d[0-9]+]], [[D1]]
-; CHECK: vand		[[D1]], [[D1]], [[D3]]
-; CHECK: vsub.i8	[[D1]], [[D1]], [[D2]]
-; CHECK: vcnt.8		[[D1]], [[D1]]
-; CHECK: vstr		[[D1]], [r0]
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vmov.i8 d18, #0x1
+; CHECK-NEXT:    vneg.s8 d17, d16
+; CHECK-NEXT:    vand d16, d16, d17
+; CHECK-NEXT:    vsub.i8 d16, d16, d18
+; CHECK-NEXT:    vcnt.8 d16, d16
+; CHECK-NEXT:    vstr d16, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <8 x i8>, <8 x i8>* %p
   %tmp = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %a, i1 true)
   store <8 x i8> %tmp, <8 x i8>* %p
@@ -247,13 +357,15 @@ define void @test_v8i8_zero_undef(<8 x i8>* %p) {
 
 define void @test_v16i8_zero_undef(<16 x i8>* %p) {
 ; CHECK-LABEL: test_v16i8_zero_undef:
-; CHECK: vld1.64	{[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
-; CHECK: vmov.i8	[[Q2:q[0-9]+]], #0x1
-; CHECK: vneg.s8	[[Q3:q[0-9]+]], [[Q1:q[0-9]+]]
-; CHECK: vand		[[Q1]], [[Q1]], [[Q3]]
-; CHECK: vsub.i8	[[Q1]], [[Q1]], [[Q2]]
-; CHECK: vcnt.8		[[Q1]], [[Q1]]
-; CHECK: vst1.64	{[[D1]], [[D2]]}, [r0]
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vmov.i8 q10, #0x1
+; CHECK-NEXT:    vneg.s8 q9, q8
+; CHECK-NEXT:    vand q8, q8, q9
+; CHECK-NEXT:    vsub.i8 q8, q8, q10
+; CHECK-NEXT:    vcnt.8 q8, q8
+; CHECK-NEXT:    vst1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <16 x i8>, <16 x i8>* %p
   %tmp = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 true)
   store <16 x i8> %tmp, <16 x i8>* %p
@@ -262,6 +374,12 @@ define void @test_v16i8_zero_undef(<16 x i8>* %p) {
 
 define void @test_v1i16_zero_undef(<1 x i16>* %p) {
 ; CHECK-LABEL: test_v1i16_zero_undef:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    ldrh r1, [r0]
+; CHECK-NEXT:    rbit r1, r1
+; CHECK-NEXT:    clz r1, r1
+; CHECK-NEXT:    strh r1, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <1 x i16>, <1 x i16>* %p
   %tmp = call <1 x i16> @llvm.cttz.v1i16(<1 x i16> %a, i1 true)
   store <1 x i16> %tmp, <1 x i16>* %p
@@ -270,6 +388,17 @@ define void @test_v1i16_zero_undef(<1 x i16>* %p) {
 
 define void @test_v2i16_zero_undef(<2 x i16>* %p) {
 ; CHECK-LABEL: test_v2i16_zero_undef:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.32 {d16[0]}, [r0:32]
+; CHECK-NEXT:    vmovl.u16 q8, d16
+; CHECK-NEXT:    vneg.s32 d18, d16
+; CHECK-NEXT:    vand d16, d16, d18
+; CHECK-NEXT:    vmov.i32 d17, #0x1f
+; CHECK-NEXT:    vclz.i32 d16, d16
+; CHECK-NEXT:    vsub.i32 d16, d17, d16
+; CHECK-NEXT:    vuzp.16 d16, d17
+; CHECK-NEXT:    vst1.32 {d16[0]}, [r0:32]
+; CHECK-NEXT:    bx lr
   %a = load <2 x i16>, <2 x i16>* %p
   %tmp = call <2 x i16> @llvm.cttz.v2i16(<2 x i16> %a, i1 true)
   store <2 x i16> %tmp, <2 x i16>* %p
@@ -278,13 +407,15 @@ define void @test_v2i16_zero_undef(<2 x i16>* %p) {
 
 define void @test_v4i16_zero_undef(<4 x i16>* %p) {
 ; CHECK-LABEL: test_v4i16_zero_undef:
-; CHECK: vldr		[[D1:d[0-9]+]], [r0]
-; CHECK: vneg.s16	[[D2:d[0-9]+]], [[D1]]
-; CHECK: vand		[[D1]], [[D1]], [[D2]]
-; CHECK: vmov.i16	[[D3:d[0-9]+]], #0xf
-; CHECK: vclz.i16	[[D1]], [[D1]]
-; CHECK: vsub.i16	[[D1]], [[D3]], [[D1]]
-; CHECK: vstr		[[D1]], [r0]
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vneg.s16 d17, d16
+; CHECK-NEXT:    vand d16, d16, d17
+; CHECK-NEXT:    vmov.i16 d17, #0xf
+; CHECK-NEXT:    vclz.i16 d16, d16
+; CHECK-NEXT:    vsub.i16 d16, d17, d16
+; CHECK-NEXT:    vstr d16, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <4 x i16>, <4 x i16>* %p
   %tmp = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %a, i1 true)
   store <4 x i16> %tmp, <4 x i16>* %p
@@ -293,13 +424,15 @@ define void @test_v4i16_zero_undef(<4 x i16>* %p) {
 
 define void @test_v8i16_zero_undef(<8 x i16>* %p) {
 ; CHECK-LABEL: test_v8i16_zero_undef:
-; CHECK: vld1.64	{[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
-; CHECK: vneg.s16	[[Q2:q[0-9]+]], [[Q1:q[0-9]+]]
-; CHECK: vand		[[Q1]], [[Q1]], [[Q2]]
-; CHECK: vmov.i16	[[Q3:q[0-9]+]], #0xf
-; CHECK: vclz.i16	[[Q1]], [[Q1]]
-; CHECK: vsub.i16	[[Q1]], [[Q3]], [[Q1]]
-; CHECK: vst1.64	{[[D1]], [[D2]]}, [r0]
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vneg.s16 q9, q8
+; CHECK-NEXT:    vand q8, q8, q9
+; CHECK-NEXT:    vmov.i16 q9, #0xf
+; CHECK-NEXT:    vclz.i16 q8, q8
+; CHECK-NEXT:    vsub.i16 q8, q9, q8
+; CHECK-NEXT:    vst1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <8 x i16>, <8 x i16>* %p
   %tmp = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 true)
   store <8 x i16> %tmp, <8 x i16>* %p
@@ -308,6 +441,12 @@ define void @test_v8i16_zero_undef(<8 x i16>* %p) {
 
 define void @test_v1i32_zero_undef(<1 x i32>* %p) {
 ; CHECK-LABEL: test_v1i32_zero_undef:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    ldr r1, [r0]
+; CHECK-NEXT:    rbit r1, r1
+; CHECK-NEXT:    clz r1, r1
+; CHECK-NEXT:    str r1, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <1 x i32>, <1 x i32>* %p
   %tmp = call <1 x i32> @llvm.cttz.v1i32(<1 x i32> %a, i1 true)
   store <1 x i32> %tmp, <1 x i32>* %p
@@ -316,13 +455,15 @@ define void @test_v1i32_zero_undef(<1 x i32>* %p) {
 
 define void @test_v2i32_zero_undef(<2 x i32>* %p) {
 ; CHECK-LABEL: test_v2i32_zero_undef:
-; CHECK: vldr		[[D1:d[0-9]+]], [r0]
-; CHECK: vneg.s32	[[D2:d[0-9]+]], [[D1]]
-; CHECK: vand		[[D1]], [[D1]], [[D2]]
-; CHECK: vmov.i32	[[D3:d[0-9]+]], #0x1f
-; CHECK: vclz.i32	[[D1]], [[D1]]
-; CHECK: vsub.i32	[[D1]], [[D3]], [[D1]]
-; CHECK: vstr		[[D1]], [r0]
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vneg.s32 d17, d16
+; CHECK-NEXT:    vand d16, d16, d17
+; CHECK-NEXT:    vmov.i32 d17, #0x1f
+; CHECK-NEXT:    vclz.i32 d16, d16
+; CHECK-NEXT:    vsub.i32 d16, d17, d16
+; CHECK-NEXT:    vstr d16, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <2 x i32>, <2 x i32>* %p
   %tmp = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 true)
   store <2 x i32> %tmp, <2 x i32>* %p
@@ -331,13 +472,15 @@ define void @test_v2i32_zero_undef(<2 x i32>* %p) {
 
 define void @test_v4i32_zero_undef(<4 x i32>* %p) {
 ; CHECK-LABEL: test_v4i32_zero_undef:
-; CHECK: vld1.64	{[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
-; CHECK: vneg.s32	[[Q2:q[0-9]+]], [[Q1:q[0-9]+]]
-; CHECK: vand		[[Q1]], [[Q1]], [[Q2]]
-; CHECK: vmov.i32	[[Q3:q[0-9]+]], #0x1f
-; CHECK: vclz.i32	[[Q1]], [[Q1]]
-; CHECK: vsub.i32	[[Q1]], [[Q3]], [[Q1]]
-; CHECK: vst1.64	{[[D1]], [[D2]]}, [r0]
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vneg.s32 q9, q8
+; CHECK-NEXT:    vand q8, q8, q9
+; CHECK-NEXT:    vmov.i32 q9, #0x1f
+; CHECK-NEXT:    vclz.i32 q8, q8
+; CHECK-NEXT:    vsub.i32 q8, q9, q8
+; CHECK-NEXT:    vst1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <4 x i32>, <4 x i32>* %p
   %tmp = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 true)
   store <4 x i32> %tmp, <4 x i32>* %p
@@ -346,17 +489,19 @@ define void @test_v4i32_zero_undef(<4 x i32>* %p) {
 
 define void @test_v1i64_zero_undef(<1 x i64>* %p) {
 ; CHECK-LABEL: test_v1i64_zero_undef:
-; CHECK: vmov.i32	[[D2:d[0-9]+]], #0x0
-; CHECK: vldr		[[D1:d[0-9]+]], [r0]
-; CHECK: vmov.i64	[[D3:d[0-9]+]], #0xffffffffffffffff
-; CHECK: vsub.i64	[[D2]], [[D2]], [[D1]]
-; CHECK: vand		[[D2]], [[D1]], [[D2]]
-; CHECK: vadd.i64	[[D2]], [[D2]], [[D3]]
-; CHECK: vcnt.8		[[D2]], [[D2]]
-; CHECK: vpaddl.u8	[[D2]], [[D2]]
-; CHECK: vpaddl.u16	[[D2]], [[D2]]
-; CHECK: vpaddl.u32	[[D2]], [[D2]]
-; CHECK: vstr		[[D2]], [r0]
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i32 d16, #0x0
+; CHECK-NEXT:    vldr d17, [r0]
+; CHECK-NEXT:    vmov.i64 d18, #0xffffffffffffffff
+; CHECK-NEXT:    vsub.i64 d16, d16, d17
+; CHECK-NEXT:    vand d16, d17, d16
+; CHECK-NEXT:    vadd.i64 d16, d16, d18
+; CHECK-NEXT:    vcnt.8 d16, d16
+; CHECK-NEXT:    vpaddl.u8 d16, d16
+; CHECK-NEXT:    vpaddl.u16 d16, d16
+; CHECK-NEXT:    vpaddl.u32 d16, d16
+; CHECK-NEXT:    vstr d16, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <1 x i64>, <1 x i64>* %p
   %tmp = call <1 x i64> @llvm.cttz.v1i64(<1 x i64> %a, i1 true)
   store <1 x i64> %tmp, <1 x i64>* %p
@@ -365,17 +510,19 @@ define void @test_v1i64_zero_undef(<1 x i64>* %p) {
 
 define void @test_v2i64_zero_undef(<2 x i64>* %p) {
 ; CHECK-LABEL: test_v2i64_zero_undef:
-; CHECK: vmov.i32	[[Q2:q[0-9]+]], #0x0
-; CHECK: vld1.64	{[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
-; CHECK: vmov.i64	[[Q3:q[0-9]+]], #0xffffffffffffffff
-; CHECK: vsub.i64	[[Q2]], [[Q2]], [[Q1:q[0-9]+]]
-; CHECK: vand		[[Q2]], [[Q1]], [[Q2]]
-; CHECK: vadd.i64	[[Q2]], [[Q2]], [[Q3]]
-; CHECK: vcnt.8		[[Q2]], [[Q2]]
-; CHECK: vpaddl.u8	[[Q2]], [[Q2]]
-; CHECK: vpaddl.u16	[[Q2]], [[Q2]]
-; CHECK: vpaddl.u32	[[Q2]], [[Q2]]
-; CHECK: vst1.64	{d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i32 q8, #0x0
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
+; CHECK-NEXT:    vmov.i64 q10, #0xffffffffffffffff
+; CHECK-NEXT:    vsub.i64 q8, q8, q9
+; CHECK-NEXT:    vand q8, q9, q8
+; CHECK-NEXT:    vadd.i64 q8, q8, q10
+; CHECK-NEXT:    vcnt.8 q8, q8
+; CHECK-NEXT:    vpaddl.u8 q8, q8
+; CHECK-NEXT:    vpaddl.u16 q8, q8
+; CHECK-NEXT:    vpaddl.u32 q8, q8
+; CHECK-NEXT:    vst1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <2 x i64>, <2 x i64>* %p
   %tmp = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 true)
   store <2 x i64> %tmp, <2 x i64>* %p
-- 
GitLab


From d92ffe66987bc6686a36e13f6de086727525b8e0 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sun, 14 Oct 2018 17:34:20 +0000
Subject: [PATCH 0171/1116] [X86][AVX] Enable
 lowerVectorShuffleAsLanePermuteAndPermute v16i16/v32i8 shuffle lowering

Extends D53148 from v4f64 now that we have test coverage for v16i16/v32i8 shuffles.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344481 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp         | 10 ++++++++++
 test/CodeGen/X86/vector-shuffle-256-v16.ll |  6 ++----
 test/CodeGen/X86/vector-shuffle-256-v32.ll |  6 ++----
 3 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 441f26dd4c6..4eaf1cc921b 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -14692,6 +14692,11 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
     return Result;
 
+  // Try to permute the lanes and then use a per-lane permute.
+  if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute(
+          DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
+    return V;
+
   // Otherwise fall back on generic lowering.
   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
 }
@@ -14772,6 +14777,11 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
     return Result;
 
+  // Try to permute the lanes and then use a per-lane permute.
+  if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute(
+          DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
+    return V;
+
   // Otherwise fall back on generic lowering.
   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
 }
diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll
index 90970f15fea..2f0be026fd9 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -4063,10 +4063,8 @@ define <16 x i16> @shuffle_v16i16_00_02_04_06_01_03_05_07_31_30_29_28_27_26_25_2
 ;
 ; AVX2-LABEL: shuffle_v16i16_00_02_04_06_01_03_05_07_31_30_29_28_27_26_25_24:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15]
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,30,31,28,29,26,27,24,25,22,23,20,21,18,19,16,17]
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v16i16_00_02_04_06_01_03_05_07_31_30_29_28_27_26_25_24:
diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll
index 5e9f30a727d..9dfbb6af075 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -2505,16 +2505,14 @@ define <32 x i8> @shuffle_v32i8_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00_
 ;
 ; AVX2-LABEL: shuffle_v32i8_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00_32_34_36_38_40_42_44_46_33_35_37_39_41_43_45_47:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15]
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,16,18,20,22,24,26,28,30,17,19,21,23,25,27,29,31]
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: shuffle_v32i8_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00_32_34_36_38_40_42_44_46_33_35_37_39_41_43_45_47:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; AVX512VLBW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15]
 ; AVX512VLBW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,16,18,20,22,24,26,28,30,17,19,21,23,25,27,29,31]
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VLVBMI-LABEL: shuffle_v32i8_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00_32_34_36_38_40_42_44_46_33_35_37_39_41_43_45_47:
-- 
GitLab


From 631cfd79b3240f5df03548a0a3eab519709743e9 Mon Sep 17 00:00:00 2001
From: Ayal Zaks <ayal.zaks@intel.com>
Date: Sun, 14 Oct 2018 17:53:02 +0000
Subject: [PATCH 0172/1116] [LV] Fix comments reported when not vectorizing
 single iteration loops; NFC

Landing this as a separate part of https://reviews.llvm.org/D50480, being a
seemingly unrelated change ([LV] Vectorizing loops of arbitrary trip count
without remainder under opt for size).


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344483 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index e93cfb34156..2ba2f00b4a5 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4558,8 +4558,15 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
   // If we optimize the program for size, avoid creating the tail loop.
   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
 
+  if (TC == 1) {
+    ORE->emit(createMissedAnalysis("SingleIterationLoop")
+              << "loop trip count is one, irrelevant for vectorization");
+    LLVM_DEBUG(dbgs() << "LV: Aborting, single iteration (non) loop.\n");
+    return None;
+  }
+
   // If we don't know the precise trip count, don't try to vectorize.
-  if (TC < 2) {
+  if (TC == 0) {
     ORE->emit(
         createMissedAnalysis("UnknownLoopCountComplexCFG")
         << "unable to calculate the loop count due to complex control flow");
-- 
GitLab


From c76c02e1ed1a158628019f61aea45fcf87712d2c Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sun, 14 Oct 2018 20:14:33 +0000
Subject: [PATCH 0173/1116] [InstCombine] Add PR27343 test cases

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344484 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Transforms/InstCombine/pr27343.ll | 33 ++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 test/Transforms/InstCombine/pr27343.ll

diff --git a/test/Transforms/InstCombine/pr27343.ll b/test/Transforms/InstCombine/pr27343.ll
new file mode 100644
index 00000000000..5a9267b16af
--- /dev/null
+++ b/test/Transforms/InstCombine/pr27343.ll
@@ -0,0 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -instcombine | FileCheck %s
+
+define i32 @__isnan(float %x) alwaysinline nounwind optsize {
+; CHECK-LABEL: @__isnan(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTCAST:%.*]] = bitcast float [[X:%.*]] to i32
+; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[DOTCAST]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 [[SHL]], -16777216
+; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
+entry:
+  %x.addr = alloca float, align 4
+  store float %x, float* %x.addr, align 4
+  %0 = load float, float* %x.addr, align 4
+  %1 = bitcast float %0 to i32
+  %shl = shl i32 %1, 1
+  %cmp = icmp ugt i32 %shl, -16777216
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i1 @icmp_shl7(i32 %x) {
+; CHECK-LABEL: @icmp_shl7(
+; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[X:%.*]], 7
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[SHL]], 4608
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %shl = shl i32 %x, 7
+  %cmp = icmp slt i32 %shl, 4608
+  ret i1 %cmp
+}
-- 
GitLab


From be51e5f9632255eda60f8f9e96e777c99f9415a1 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Mon, 15 Oct 2018 01:51:50 +0000
Subject: [PATCH 0174/1116] [X86] Autogenerate complete checks. NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344485 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/splat-for-size.ll | 51 ++++++++++++++++++++++--------
 1 file changed, 38 insertions(+), 13 deletions(-)

diff --git a/test/CodeGen/X86/splat-for-size.ll b/test/CodeGen/X86/splat-for-size.ll
index 5a98a00338b..99ed8e8ccb6 100644
--- a/test/CodeGen/X86/splat-for-size.ll
+++ b/test/CodeGen/X86/splat-for-size.ll
@@ -19,7 +19,7 @@ define <2 x double> @splat_v2f64(<2 x double> %x) #0 {
 define <4 x double> @splat_v4f64(<4 x double> %x) #1 {
 ; CHECK-LABEL: splat_v4f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vbroadcastsd {{.*}}(%rip), %ymm1
+; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1,1,1,1]
 ; CHECK-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    retq
   %add = fadd <4 x double> %x, <double 1.0, double 1.0, double 1.0, double 1.0>
@@ -29,7 +29,7 @@ define <4 x double> @splat_v4f64(<4 x double> %x) #1 {
 define <4 x float> @splat_v4f32(<4 x float> %x) #0 {
 ; CHECK-LABEL: splat_v4f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vbroadcastss {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1,1,1,1]
 ; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %add = fadd <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
@@ -39,7 +39,7 @@ define <4 x float> @splat_v4f32(<4 x float> %x) #0 {
 define <8 x float> @splat_v8f32(<8 x float> %x) #1 {
 ; CHECK-LABEL: splat_v8f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vbroadcastss {{.*}}(%rip), %ymm1
+; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
 ; CHECK-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    retq
   %add = fadd <8 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
@@ -57,7 +57,7 @@ define <2 x i64> @splat_v2i64(<2 x i64> %x) #1 {
 ;
 ; AVX2-LABEL: splat_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastq {{.*}}(%rip), %xmm1
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [2,2]
 ; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    retq
   %add = add <2 x i64> %x, <i64 2, i64 2>
@@ -78,7 +78,7 @@ define <4 x i64> @splat_v4i64(<4 x i64> %x) #0 {
 ;
 ; AVX2-LABEL: splat_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm1
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [2,2,2,2]
 ; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
   %add = add <4 x i64> %x, <i64 2, i64 2, i64 2, i64 2>
@@ -89,13 +89,13 @@ define <4 x i64> @splat_v4i64(<4 x i64> %x) #0 {
 define <4 x i32> @splat_v4i32(<4 x i32> %x) #1 {
 ; AVX-LABEL: splat_v4i32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vbroadcastss {{.*}}(%rip), %xmm1
+; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [2,2,2,2]
 ; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX2-LABEL: splat_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2]
 ; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    retq
   %add = add <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
@@ -107,7 +107,7 @@ define <8 x i32> @splat_v8i32(<8 x i32> %x) #0 {
 ; AVX-LABEL: splat_v8i32:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
+; AVX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [2,2,2,2]
 ; AVX-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -115,7 +115,7 @@ define <8 x i32> @splat_v8i32(<8 x i32> %x) #0 {
 ;
 ; AVX2-LABEL: splat_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
   %add = add <8 x i32> %x, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
@@ -131,7 +131,7 @@ define <8 x i16> @splat_v8i16(<8 x i16> %x) #1 {
 ;
 ; AVX2-LABEL: splat_v8i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastw {{.*}}(%rip), %xmm1
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2]
 ; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    retq
   %add = add <8 x i16> %x, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
@@ -151,7 +151,7 @@ define <16 x i16> @splat_v16i16(<16 x i16> %x) #0 {
 ;
 ; AVX2-LABEL: splat_v16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastw {{.*}}(%rip), %ymm1
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
 ; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
   %add = add <16 x i16> %x, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
@@ -167,7 +167,7 @@ define <16 x i8> @splat_v16i8(<16 x i8> %x) #1 {
 ;
 ; AVX2-LABEL: splat_v16i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastb {{.*}}(%rip), %xmm1
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
 ; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    retq
   %add = add <16 x i8> %x, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
@@ -187,7 +187,7 @@ define <32 x i8> @splat_v32i8(<32 x i8> %x) #0 {
 ;
 ; AVX2-LABEL: splat_v32i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastb {{.*}}(%rip), %ymm1
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
 ; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
   %add = add <32 x i8> %x, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
@@ -201,6 +201,31 @@ define <32 x i8> @splat_v32i8(<32 x i8> %x) #0 {
 @A = common global <3 x i64> zeroinitializer, align 32
 
 define <8 x i64> @pr23259() #1 {
+; AVX-LABEL: pr23259:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    pushq $1
+; AVX-NEXT:    .cfi_adjust_cfa_offset 8
+; AVX-NEXT:    popq %rax
+; AVX-NEXT:    .cfi_adjust_cfa_offset -8
+; AVX-NEXT:    vmovq %rax, %xmm0
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm1
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1,1,1,1]
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: pr23259:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vmovdqa {{.*}}(%rip), %ymm0
+; AVX2-NEXT:    pushq $1
+; AVX2-NEXT:    .cfi_adjust_cfa_offset 8
+; AVX2-NEXT:    popq %rax
+; AVX2-NEXT:    .cfi_adjust_cfa_offset -8
+; AVX2-NEXT:    vmovq %rax, %xmm1
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,1,1]
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1,1,1,1]
+; AVX2-NEXT:    retq
 entry:
   %0 = load <4 x i64>, <4 x i64>* bitcast (<3 x i64>* @A to <4 x i64>*), align 32
   %1 = shufflevector <4 x i64> %0, <4 x i64> undef, <3 x i32> <i32 undef, i32 undef, i32 2>
-- 
GitLab


From 5854a1f28315df26b0aefea9566e05b3ddbf520f Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Mon, 15 Oct 2018 01:51:53 +0000
Subject: [PATCH 0175/1116] [X86] Add 128 MOVDDUP to the constant pool printing
 in X86AsmPrinter::EmitInstruction.

We use this instruction to broadcast a single 64-bit value to a v2i64/v2f64 vector.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344486 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86MCInstLower.cpp             |  6 ++
 test/CodeGen/X86/avg.ll                       |  6 +-
 .../X86/bitcast-int-to-vector-bool-sext.ll    |  3 +-
 .../X86/bitcast-int-to-vector-bool-zext.ll    |  3 +-
 .../CodeGen/X86/bitcast-int-to-vector-bool.ll |  3 +-
 .../X86/broadcast-elm-cross-splat-vec.ll      | 72 ++++++++++++-------
 test/CodeGen/X86/splat-for-size.ll            |  9 ++-
 test/CodeGen/X86/urem-seteq-vec-nonsplat.ll   |  3 +-
 8 files changed, 72 insertions(+), 33 deletions(-)

diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index acb2bc20858..76f0dd4837b 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -2133,6 +2133,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
       }
     }
     break;
+  case X86::MOVDDUPrm:
+  case X86::VMOVDDUPrm:
+  case X86::VMOVDDUPZ128rm:
   case X86::VBROADCASTSSrm:
   case X86::VBROADCASTSSYrm:
   case X86::VBROADCASTSSZ128m:
@@ -2169,6 +2172,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
       int NumElts;
       switch (MI->getOpcode()) {
       default: llvm_unreachable("Invalid opcode");
+      case X86::MOVDDUPrm:         NumElts = 2;  break;
+      case X86::VMOVDDUPrm:        NumElts = 2;  break;
+      case X86::VMOVDDUPZ128rm:    NumElts = 2;  break;
       case X86::VBROADCASTSSrm:    NumElts = 4;  break;
       case X86::VBROADCASTSSYrm:   NumElts = 8;  break;
       case X86::VBROADCASTSSZ128m: NumElts = 4;  break;
diff --git a/test/CodeGen/X86/avg.ll b/test/CodeGen/X86/avg.ll
index e8a03fe6a7b..84f1296d51c 100644
--- a/test/CodeGen/X86/avg.ll
+++ b/test/CodeGen/X86/avg.ll
@@ -1256,7 +1256,8 @@ define void @avg_v32i8_const(<32 x i8>* %a) nounwind {
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [7.9499288951273625E-275,7.9499288951273625E-275]
+; AVX1-NEXT:    # xmm2 = mem[0,0]
 ; AVX1-NEXT:    vpavgb %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpavgb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -1310,7 +1311,8 @@ define void @avg_v64i8_const(<64 x i8>* %a) nounwind {
 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [7.9499288951273625E-275,7.9499288951273625E-275]
+; AVX1-NEXT:    # xmm3 = mem[0,0]
 ; AVX1-NEXT:    vpavgb %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vpavgb %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
index 74c48e35bfe..c022d7908a1 100644
--- a/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
+++ b/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
@@ -158,7 +158,8 @@ define <16 x i8> @ext_i16_16i8(i16 %a0) {
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vmovd %edi, %xmm0
 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = [-1.7939930131212661E-307,-1.7939930131212661E-307]
+; AVX1-NEXT:    # xmm1 = mem[0,0]
 ; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    retq
diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
index 6cd52c4d25c..75b5b701113 100644
--- a/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
+++ b/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
@@ -200,7 +200,8 @@ define <16 x i8> @ext_i16_16i8(i16 %a0) {
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vmovd %edi, %xmm0
 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = [-1.7939930131212661E-307,-1.7939930131212661E-307]
+; AVX1-NEXT:    # xmm1 = mem[0,0]
 ; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpsrlw $7, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
index 1acc83485ce..3deac92d9ed 100644
--- a/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
+++ b/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
@@ -163,7 +163,8 @@ define <16 x i1> @bitcast_i16_16i1(i16 zeroext %a0) {
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vmovd %edi, %xmm0
 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = [-1.7939930131212661E-307,-1.7939930131212661E-307]
+; AVX1-NEXT:    # xmm1 = mem[0,0]
 ; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpsrlw $7, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll b/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
index 90f65597810..bb79efcbad4 100644
--- a/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
+++ b/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
@@ -87,21 +87,24 @@ define <16 x i8> @f16xi8_i32(<16 x i8> %a) {
 define <16 x i8> @f16xi8_i64(<16 x i8> %a) {
 ; AVX-LABEL: f16xi8_i64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = [7.9499288951273625E-275,7.9499288951273625E-275]
+; AVX-NEXT:    # xmm1 = mem[0,0]
 ; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retl
 ;
 ; ALL32-LABEL: f16xi8_i64:
 ; ALL32:       # %bb.0:
-; ALL32-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; ALL32-NEXT:    vmovddup {{.*#+}} xmm1 = [7.9499288951273625E-275,7.9499288951273625E-275]
+; ALL32-NEXT:    # xmm1 = mem[0,0]
 ; ALL32-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
 ; ALL32-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; ALL32-NEXT:    retl
 ;
 ; AVX-64-LABEL: f16xi8_i64:
 ; AVX-64:       # %bb.0:
-; AVX-64-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-64-NEXT:    vmovddup {{.*#+}} xmm1 = [7.9499288951273625E-275,7.9499288951273625E-275]
+; AVX-64-NEXT:    # xmm1 = mem[0,0]
 ; AVX-64-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
 ; AVX-64-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX-64-NEXT:    retq
@@ -202,7 +205,8 @@ define <32 x i8> @f32xi8_i64(<32 x i8> %a) {
 ; AVX-LABEL: f32xi8_i64:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0]
+; AVX-NEXT:    vmovddup {{.*#+}} xmm2 = [7.9499288951273625E-275,7.9499288951273625E-275]
+; AVX-NEXT:    # xmm2 = mem[0,0]
 ; AVX-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -219,7 +223,8 @@ define <32 x i8> @f32xi8_i64(<32 x i8> %a) {
 ; AVX-64-LABEL: f32xi8_i64:
 ; AVX-64:       # %bb.0:
 ; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX-64-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0]
+; AVX-64-NEXT:    vmovddup {{.*#+}} xmm2 = [7.9499288951273625E-275,7.9499288951273625E-275]
+; AVX-64-NEXT:    # xmm2 = mem[0,0]
 ; AVX-64-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
 ; AVX-64-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX-64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -424,7 +429,8 @@ define <64 x i8> @f64xi8_i64(<64 x i8> %a) {
 ; AVX-LABEL: f64xi8_i64:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX-NEXT:    vmovddup {{.*#+}} xmm3 = [7.9499288951273625E-275,7.9499288951273625E-275]
+; AVX-NEXT:    # xmm3 = mem[0,0]
 ; AVX-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
 ; AVX-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -456,7 +462,8 @@ define <64 x i8> @f64xi8_i64(<64 x i8> %a) {
 ; AVX-64-LABEL: f64xi8_i64:
 ; AVX-64:       # %bb.0:
 ; AVX-64-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX-64-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX-64-NEXT:    vmovddup {{.*#+}} xmm3 = [7.9499288951273625E-275,7.9499288951273625E-275]
+; AVX-64-NEXT:    # xmm3 = mem[0,0]
 ; AVX-64-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
 ; AVX-64-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -675,21 +682,24 @@ define <8 x i16> @f8xi16_i32(<8 x i16> %a) {
 define <8 x i16> @f8xi16_i64(<8 x i16> %a) {
 ; AVX-LABEL: f8xi16_i64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = [4.1720559249406128E-309,4.1720559249406128E-309]
+; AVX-NEXT:    # xmm1 = mem[0,0]
 ; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retl
 ;
 ; ALL32-LABEL: f8xi16_i64:
 ; ALL32:       # %bb.0:
-; ALL32-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; ALL32-NEXT:    vmovddup {{.*#+}} xmm1 = [4.1720559249406128E-309,4.1720559249406128E-309]
+; ALL32-NEXT:    # xmm1 = mem[0,0]
 ; ALL32-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; ALL32-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; ALL32-NEXT:    retl
 ;
 ; AVX-64-LABEL: f8xi16_i64:
 ; AVX-64:       # %bb.0:
-; AVX-64-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-64-NEXT:    vmovddup {{.*#+}} xmm1 = [4.1720559249406128E-309,4.1720559249406128E-309]
+; AVX-64-NEXT:    # xmm1 = mem[0,0]
 ; AVX-64-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX-64-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX-64-NEXT:    retq
@@ -750,7 +760,8 @@ define <16 x i16> @f16xi16_i64(<16 x i16> %a) {
 ; AVX-LABEL: f16xi16_i64:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0]
+; AVX-NEXT:    vmovddup {{.*#+}} xmm2 = [4.1720559249406128E-309,4.1720559249406128E-309]
+; AVX-NEXT:    # xmm2 = mem[0,0]
 ; AVX-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -767,7 +778,8 @@ define <16 x i16> @f16xi16_i64(<16 x i16> %a) {
 ; AVX-64-LABEL: f16xi16_i64:
 ; AVX-64:       # %bb.0:
 ; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX-64-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0]
+; AVX-64-NEXT:    vmovddup {{.*#+}} xmm2 = [4.1720559249406128E-309,4.1720559249406128E-309]
+; AVX-64-NEXT:    # xmm2 = mem[0,0]
 ; AVX-64-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
 ; AVX-64-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
 ; AVX-64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -902,7 +914,8 @@ define <32 x i16> @f32xi16_i64(<32 x i16> %a) {
 ; AVX-LABEL: f32xi16_i64:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX-NEXT:    vmovddup {{.*#+}} xmm3 = [4.1720559249406128E-309,4.1720559249406128E-309]
+; AVX-NEXT:    # xmm3 = mem[0,0]
 ; AVX-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
 ; AVX-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -934,7 +947,8 @@ define <32 x i16> @f32xi16_i64(<32 x i16> %a) {
 ; AVX-64-LABEL: f32xi16_i64:
 ; AVX-64:       # %bb.0:
 ; AVX-64-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX-64-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX-64-NEXT:    vmovddup {{.*#+}} xmm3 = [4.1720559249406128E-309,4.1720559249406128E-309]
+; AVX-64-NEXT:    # xmm3 = mem[0,0]
 ; AVX-64-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
 ; AVX-64-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -1120,21 +1134,24 @@ define <32 x i16> @f32xi16_i256(<32 x i16> %a) {
 define <4 x i32> @f4xi32_i64(<4 x i32> %a) {
 ; AVX-LABEL: f4xi32_i64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = [2.1219957909652723E-314,2.1219957909652723E-314]
+; AVX-NEXT:    # xmm1 = mem[0,0]
 ; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retl
 ;
 ; ALL32-LABEL: f4xi32_i64:
 ; ALL32:       # %bb.0:
-; ALL32-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; ALL32-NEXT:    vmovddup {{.*#+}} xmm1 = [2.1219957909652723E-314,2.1219957909652723E-314]
+; ALL32-NEXT:    # xmm1 = mem[0,0]
 ; ALL32-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; ALL32-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; ALL32-NEXT:    retl
 ;
 ; AVX-64-LABEL: f4xi32_i64:
 ; AVX-64:       # %bb.0:
-; AVX-64-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-64-NEXT:    vmovddup {{.*#+}} xmm1 = [2.1219957909652723E-314,2.1219957909652723E-314]
+; AVX-64-NEXT:    # xmm1 = mem[0,0]
 ; AVX-64-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX-64-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX-64-NEXT:    retq
@@ -1155,7 +1172,8 @@ define <8 x i32> @f8xi32_i64(<8 x i32> %a) {
 ; AVX-LABEL: f8xi32_i64:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0]
+; AVX-NEXT:    vmovddup {{.*#+}} xmm2 = [2.1219957909652723E-314,2.1219957909652723E-314]
+; AVX-NEXT:    # xmm2 = mem[0,0]
 ; AVX-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -1172,7 +1190,8 @@ define <8 x i32> @f8xi32_i64(<8 x i32> %a) {
 ; AVX-64-LABEL: f8xi32_i64:
 ; AVX-64:       # %bb.0:
 ; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX-64-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0]
+; AVX-64-NEXT:    vmovddup {{.*#+}} xmm2 = [2.1219957909652723E-314,2.1219957909652723E-314]
+; AVX-64-NEXT:    # xmm2 = mem[0,0]
 ; AVX-64-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
 ; AVX-64-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
 ; AVX-64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -1237,7 +1256,8 @@ define <16 x i32> @f16xi32_i64(<16 x i32> %a) {
 ; AVX-LABEL: f16xi32_i64:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX-NEXT:    vmovddup {{.*#+}} xmm3 = [2.1219957909652723E-314,2.1219957909652723E-314]
+; AVX-NEXT:    # xmm3 = mem[0,0]
 ; AVX-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
 ; AVX-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -1269,7 +1289,8 @@ define <16 x i32> @f16xi32_i64(<16 x i32> %a) {
 ; AVX-64-LABEL: f16xi32_i64:
 ; AVX-64:       # %bb.0:
 ; AVX-64-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX-64-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX-64-NEXT:    vmovddup {{.*#+}} xmm3 = [2.1219957909652723E-314,2.1219957909652723E-314]
+; AVX-64-NEXT:    # xmm3 = mem[0,0]
 ; AVX-64-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
 ; AVX-64-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -1573,21 +1594,24 @@ define <8 x i64> @f8xi64_i256(<8 x i64> %a) {
 define <4 x float> @f4xf32_f64(<4 x float> %a) {
 ; AVX-LABEL: f4xf32_f64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = [0.0078125018626451492,0.0078125018626451492]
+; AVX-NEXT:    # xmm1 = mem[0,0]
 ; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vdivps %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retl
 ;
 ; ALL32-LABEL: f4xf32_f64:
 ; ALL32:       # %bb.0:
-; ALL32-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; ALL32-NEXT:    vmovddup {{.*#+}} xmm1 = [0.0078125018626451492,0.0078125018626451492]
+; ALL32-NEXT:    # xmm1 = mem[0,0]
 ; ALL32-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; ALL32-NEXT:    vdivps %xmm0, %xmm1, %xmm0
 ; ALL32-NEXT:    retl
 ;
 ; AVX-64-LABEL: f4xf32_f64:
 ; AVX-64:       # %bb.0:
-; AVX-64-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-64-NEXT:    vmovddup {{.*#+}} xmm1 = [0.0078125018626451492,0.0078125018626451492]
+; AVX-64-NEXT:    # xmm1 = mem[0,0]
 ; AVX-64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX-64-NEXT:    vdivps %xmm0, %xmm1, %xmm0
 ; AVX-64-NEXT:    retq
diff --git a/test/CodeGen/X86/splat-for-size.ll b/test/CodeGen/X86/splat-for-size.ll
index 99ed8e8ccb6..7567dbcdad0 100644
--- a/test/CodeGen/X86/splat-for-size.ll
+++ b/test/CodeGen/X86/splat-for-size.ll
@@ -9,7 +9,8 @@
 define <2 x double> @splat_v2f64(<2 x double> %x) #0 {
 ; CHECK-LABEL: splat_v2f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; CHECK-NEXT:    vmovddup {{.*#+}} xmm1 = [1,1]
+; CHECK-NEXT:    # xmm1 = mem[0,0]
 ; CHECK-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %add = fadd <2 x double> %x, <double 1.0, double 1.0>
@@ -51,7 +52,8 @@ define <8 x float> @splat_v8f32(<8 x float> %x) #1 {
 define <2 x i64> @splat_v2i64(<2 x i64> %x) #1 {
 ; AVX-LABEL: splat_v2i64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = [2,2]
+; AVX-NEXT:    # xmm1 = mem[0,0]
 ; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
@@ -70,7 +72,8 @@ define <4 x i64> @splat_v4i64(<4 x i64> %x) #0 {
 ; AVX-LABEL: splat_v4i64:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0]
+; AVX-NEXT:    vmovddup {{.*#+}} xmm2 = [2,2]
+; AVX-NEXT:    # xmm2 = mem[0,0]
 ; AVX-NEXT:    vpaddq %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
diff --git a/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
index 27541c44b9d..82385386c88 100644
--- a/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
+++ b/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
@@ -629,7 +629,8 @@ define <4 x i32> @test_urem_both(<4 x i32> %X) nounwind readnone {
 ;
 ; CHECK-AVX1-LABEL: test_urem_both:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; CHECK-AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = [-9.255967385052751E+61,-9.255967385052751E+61]
+; CHECK-AVX1-NEXT:    # xmm1 = mem[0,0]
 ; CHECK-AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm1
 ; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-- 
GitLab


From a0b9470673cf59fa27e8415a962151380fcc4981 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Mon, 15 Oct 2018 01:51:58 +0000
Subject: [PATCH 0176/1116] [X86] Move promotion of vector and/or/xor from
 legalization to DAG combine

Summary:
I've noticed that the bitcasts we introduce for these make computeKnownBits and computeNumSignBits not work well in LegalizeVectorOps. LegalizeVectorOps legalizes bottom up while LegalizeDAG legalizes top down. The bottom up strategy for LegalizeVectorOps means operands are legalized before their uses. So we promote and/or/xor before we legalize the operands that use them making computeKnownBits/computeNumSignBits in places like LowerTruncate suboptimal. I looked at changing LegalizeVectorOps to be top down as well, but that was more disruptive and caused some regressions. I also looked at just moving promotion of binops to LegalizeDAG, but that had a few issues one around matching AND,ANDN,OR into VSELECT because I had to create ANDN as vXi64, but the other nodes hadn't legalized yet, I didn't look too hard at fixing that.

This patch seems to produce better results overall than my other attempts. We now form broadcasts of constants better in some cases. For at least some of them the AND was being introduced in LegalizeDAG, promoted to vXi64, and the BUILD_VECTOR was also legalized there. I think we got bad ordering of that. Now the promotion is out of the legalizer so we handle this better.

In the longer term I think we really should evaluate whether we should be doing this promotion at all. It's really there to reduce isel pattern count, but I'm wondering if we'd be better served just eating the pattern cost or doing C++ based isel for vector and/or/xor in X86ISelDAGToDAG. The masked and/or/xor will definitely be difficult in patterns if a bitcast gets between the vselect and the and/or/xor node. That becomes a lot of permutations to cover.

Reviewers: RKSimon, spatel

Reviewed By: RKSimon

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D53107

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344487 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp            |  51 ++--
 test/CodeGen/X86/avx-logic.ll                 |  12 +-
 test/CodeGen/X86/avx512-ext.ll                |   4 +-
 test/CodeGen/X86/avx512-insert-extract.ll     |   3 -
 test/CodeGen/X86/avx512-schedule.ll           |   8 +-
 test/CodeGen/X86/avx512vl-vec-masked-cmp.ll   |  88 -------
 test/CodeGen/X86/cast-vsel.ll                 |  15 +-
 test/CodeGen/X86/combine-sdiv.ll              |  13 +-
 test/CodeGen/X86/combine-srl.ll               |  65 +++--
 test/CodeGen/X86/gather-addresses.ll          |  32 +--
 test/CodeGen/X86/horizontal-reduce-umax.ll    | 110 ++++-----
 test/CodeGen/X86/horizontal-reduce-umin.ll    | 110 ++++-----
 test/CodeGen/X86/known-bits.ll                |   4 +-
 test/CodeGen/X86/nontemporal-loads.ll         |  36 +--
 test/CodeGen/X86/paddus.ll                    | 158 ++++++------
 test/CodeGen/X86/psubus.ll                    |   2 +-
 test/CodeGen/X86/sat-add.ll                   |  61 ++---
 test/CodeGen/X86/setcc-lowering.ll            |  12 +-
 test/CodeGen/X86/sse2-intrinsics-canonical.ll |  12 +-
 ...-masked-merge-vector-variablemask-const.ll |   8 +-
 test/CodeGen/X86/v8i1-masks.ll                |  10 +-
 test/CodeGen/X86/vector-blend.ll              |   2 +-
 test/CodeGen/X86/vector-reduce-umax.ll        | 102 ++++----
 test/CodeGen/X86/vector-reduce-umin.ll        | 102 ++++----
 test/CodeGen/X86/vector-shift-lshr-128.ll     |  12 +-
 test/CodeGen/X86/vector-shift-shl-128.ll      |   8 +-
 test/CodeGen/X86/vector-shuffle-256-v16.ll    |   4 +-
 test/CodeGen/X86/vector-trunc-math.ll         | 228 +++++++++---------
 test/CodeGen/X86/vector-trunc-packus.ll       |  30 +--
 test/CodeGen/X86/vector-trunc-ssat.ll         |   3 +-
 test/CodeGen/X86/vector-trunc-usat.ll         |   3 +-
 test/CodeGen/X86/vector-trunc.ll              |  21 +-
 test/CodeGen/X86/vshift-6.ll                  |   4 +-
 test/CodeGen/X86/x86-interleaved-access.ll    |   6 +-
 34 files changed, 592 insertions(+), 747 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 4eaf1cc921b..be6f9ed2188 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -871,9 +871,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 
     // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
-      setOperationPromotedToType(ISD::AND,    VT, MVT::v2i64);
-      setOperationPromotedToType(ISD::OR,     VT, MVT::v2i64);
-      setOperationPromotedToType(ISD::XOR,    VT, MVT::v2i64);
       setOperationPromotedToType(ISD::LOAD,   VT, MVT::v2i64);
     }
 
@@ -1183,9 +1180,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 
     // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
-      setOperationPromotedToType(ISD::AND,    VT, MVT::v4i64);
-      setOperationPromotedToType(ISD::OR,     VT, MVT::v4i64);
-      setOperationPromotedToType(ISD::XOR,    VT, MVT::v4i64);
       setOperationPromotedToType(ISD::LOAD,   VT, MVT::v4i64);
     }
 
@@ -1384,13 +1378,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setCondCodeAction(ISD::SETLE, VT, Custom);
     }
 
-    // Need to promote to 64-bit even though we have 32-bit masked instructions
-    // because the IR optimizers rearrange bitcasts around logic ops leaving
-    // too many variations to handle if we don't promote them.
-    setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64);
-    setOperationPromotedToType(ISD::OR,  MVT::v16i32, MVT::v8i64);
-    setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);
-
     if (Subtarget.hasDQI()) {
       setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
       setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
@@ -1593,10 +1580,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::UMIN,         VT, Legal);
       setOperationAction(ISD::SETCC,        VT, Custom);
 
-      setOperationPromotedToType(ISD::AND,  VT, MVT::v8i64);
-      setOperationPromotedToType(ISD::OR,   VT, MVT::v8i64);
-      setOperationPromotedToType(ISD::XOR,  VT, MVT::v8i64);
-
       // The condition codes aren't legal in SSE/AVX and under AVX512 we use
       // setcc all the way to isel and prefer SETGT in some isel patterns.
       setCondCodeAction(ISD::SETLT, VT, Custom);
@@ -35226,6 +35209,10 @@ static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
       !SplatVal.isMask())
     return SDValue();
 
+  // Don't prevent creation of ANDN.
+  if (isBitwiseNot(Op0))
+    return SDValue();
+
   if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
     return SDValue();
 
@@ -35426,6 +35413,27 @@ static SDValue combineParity(SDNode *N, SelectionDAG &DAG,
   return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0), Setnp);
 }
 
+// This promotes vectors and/or/xor to a vXi64 type. We used to do this during
+// op legalization, but DAG combine yields better results.
+// TODO: This is largely just to reduce the number of isel patterns. Maybe we
+// can just add all the patterns or do C++ based selection in X86ISelDAGToDAG?
+static SDValue promoteVecLogicOp(SDNode *N, SelectionDAG &DAG) {
+  MVT VT = N->getSimpleValueType(0);
+
+  if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
+    return SDValue();
+
+  // Already correct type.
+  if (VT.getVectorElementType() == MVT::i64)
+    return SDValue();
+
+  MVT NewVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
+  SDValue Op0 = DAG.getBitcast(NewVT, N->getOperand(0));
+  SDValue Op1 = DAG.getBitcast(NewVT, N->getOperand(1));
+  return DAG.getBitcast(VT, DAG.getNode(N->getOpcode(), SDLoc(N), NewVT,
+                                        Op0, Op1));
+}
+
 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
@@ -35460,6 +35468,9 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
+  if (SDValue V = promoteVecLogicOp(N, DAG))
+    return V;
+
   if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
     return R;
 
@@ -35782,6 +35793,9 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
+  if (SDValue V = promoteVecLogicOp(N, DAG))
+    return V;
+
   if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
     return R;
 
@@ -37810,6 +37824,9 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
+  if (SDValue V = promoteVecLogicOp(N, DAG))
+    return V;
+
   if (SDValue SetCC = foldXor1SetCC(N, DAG))
     return SetCC;
 
diff --git a/test/CodeGen/X86/avx-logic.ll b/test/CodeGen/X86/avx-logic.ll
index 0fe5cbacc84..f22c6257e45 100644
--- a/test/CodeGen/X86/avx-logic.ll
+++ b/test/CodeGen/X86/avx-logic.ll
@@ -314,7 +314,7 @@ define <8 x i32> @and_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1095216660735,1095216660735]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,255,255]
 ; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
 ; AVX1-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
@@ -342,7 +342,7 @@ define <8 x i32> @andn_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
 ; AVX1-NEXT:    vpaddd %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1095216660735,1095216660735]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,255,255]
 ; AVX1-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpandn %xmm1, %xmm3, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
@@ -450,7 +450,7 @@ define <8 x i32> @or_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z)
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1095216660735,1095216660735]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,255,255]
 ; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
 ; AVX1-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
@@ -479,7 +479,7 @@ define <8 x i32> @xor_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1095216660735,1095216660735]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,255,255]
 ; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
 ; AVX1-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
@@ -537,7 +537,7 @@ define <8 x i32> @or_disguised_i16_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [281470681808895,281470681808895]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
 ; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
 ; AVX1-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
@@ -566,7 +566,7 @@ define <8 x i32> @xor_disguised_i16_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [281470681808895,281470681808895]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
 ; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
 ; AVX1-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/avx512-ext.ll b/test/CodeGen/X86/avx512-ext.ll
index c23a474a97f..d56cf0fe09e 100644
--- a/test/CodeGen/X86/avx512-ext.ll
+++ b/test/CodeGen/X86/avx512-ext.ll
@@ -2157,7 +2157,7 @@ define <32 x i8> @zext_32xi1_to_32xi8(<32 x i16> %x, <32 x i16> %y) #0 {
 define <4 x i32> @zext_4xi1_to_4x32(<4 x i8> %x, <4 x i8> %y) #0 {
 ; ALL-LABEL: zext_4xi1_to_4x32:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; ALL-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
 ; ALL-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; ALL-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; ALL-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -2171,7 +2171,7 @@ define <4 x i32> @zext_4xi1_to_4x32(<4 x i8> %x, <4 x i8> %y) #0 {
 define <2 x i64> @zext_2xi1_to_2xi64(<2 x i8> %x, <2 x i8> %y) #0 {
 ; ALL-LABEL: zext_2xi1_to_2xi64:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; ALL-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [255,255]
 ; ALL-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; ALL-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; ALL-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll
index db3716c9530..e29d62b2605 100644
--- a/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/test/CodeGen/X86/avx512-insert-extract.ll
@@ -993,7 +993,6 @@ define zeroext i8 @test_extractelement_v32i1(<32 x i8> %a, <32 x i8> %b) {
 ; KNL-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
 ; KNL-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
-; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kshiftrw $2, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
@@ -1023,7 +1022,6 @@ define zeroext i8 @test_extractelement_v64i1(<64 x i8> %a, <64 x i8> %b) {
 ; KNL-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
-; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
@@ -1059,7 +1057,6 @@ define zeroext i8 @extractelement_v64i1_alt(<64 x i8> %a, <64 x i8> %b) {
 ; KNL-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
-; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
diff --git a/test/CodeGen/X86/avx512-schedule.ll b/test/CodeGen/X86/avx512-schedule.ll
index 71dabf70a18..e0237ff0d83 100755
--- a/test/CodeGen/X86/avx512-schedule.ll
+++ b/test/CodeGen/X86/avx512-schedule.ll
@@ -4711,7 +4711,7 @@ define <32 x i8> @zext_32xi1_to_32xi8(<32 x i16> %x, <32 x i16> %y) #0 {
 define <4 x i32> @zext_4xi1_to_4x32(<4 x i8> %x, <4 x i8> %y) #0 {
 ; GENERIC-LABEL: zext_4xi1_to_4x32:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] sched: [6:0.50]
+; GENERIC-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255] sched: [7:0.50]
 ; GENERIC-NEXT:    vpand %xmm2, %xmm1, %xmm1 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpand %xmm2, %xmm0, %xmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
@@ -4720,7 +4720,7 @@ define <4 x i32> @zext_4xi1_to_4x32(<4 x i8> %x, <4 x i8> %y) #0 {
 ;
 ; SKX-LABEL: zext_4xi1_to_4x32:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] sched: [6:0.50]
+; SKX-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255] sched: [6:0.50]
 ; SKX-NEXT:    vpand %xmm2, %xmm1, %xmm1 # sched: [1:0.33]
 ; SKX-NEXT:    vpand %xmm2, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
@@ -4734,7 +4734,7 @@ define <4 x i32> @zext_4xi1_to_4x32(<4 x i8> %x, <4 x i8> %y) #0 {
 define <2 x i64> @zext_2xi1_to_2xi64(<2 x i8> %x, <2 x i8> %y) #0 {
 ; GENERIC-LABEL: zext_2xi1_to_2xi64:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] sched: [6:0.50]
+; GENERIC-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [255,255] sched: [7:0.50]
 ; GENERIC-NEXT:    vpand %xmm2, %xmm1, %xmm1 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpand %xmm2, %xmm0, %xmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
@@ -4743,7 +4743,7 @@ define <2 x i64> @zext_2xi1_to_2xi64(<2 x i8> %x, <2 x i8> %y) #0 {
 ;
 ; SKX-LABEL: zext_2xi1_to_2xi64:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] sched: [6:0.50]
+; SKX-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [255,255] sched: [6:0.50]
 ; SKX-NEXT:    vpand %xmm2, %xmm1, %xmm1 # sched: [1:0.33]
 ; SKX-NEXT:    vpand %xmm2, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
diff --git a/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll b/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
index 8c3fe900336..79de4aec42b 100644
--- a/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
+++ b/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
@@ -9780,7 +9780,6 @@ define zeroext i32 @test_vpcmpsgeb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %_
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -9807,7 +9806,6 @@ define zeroext i32 @test_vpcmpsgeb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -9835,7 +9833,6 @@ define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask(i16 zeroext %__u, <2
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
@@ -9866,7 +9863,6 @@ define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
@@ -9897,7 +9893,6 @@ define zeroext i64 @test_vpcmpsgeb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %_
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    movzwl %ax, %eax
@@ -9925,7 +9920,6 @@ define zeroext i64 @test_vpcmpsgeb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    movzwl %ax, %eax
@@ -9954,7 +9948,6 @@ define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask(i16 zeroext %__u, <2
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
@@ -9985,7 +9978,6 @@ define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
@@ -10017,12 +10009,10 @@ define zeroext i64 @test_vpcmpsgeb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
 ; NoVLX-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %ecx
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    shll $16, %eax
@@ -10052,12 +10042,10 @@ define zeroext i64 @test_vpcmpsgeb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
 ; NoVLX-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %ecx
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    shll $16, %eax
@@ -10088,14 +10076,12 @@ define zeroext i64 @test_masked_vpcmpsgeb_v32i1_v64i1_mask(i32 zeroext %__u, <4
 ; NoVLX-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    shrl $16, %edi
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %ecx
 ; NoVLX-NEXT:    andl %edi, %ecx
@@ -10130,14 +10116,12 @@ define zeroext i64 @test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem(i32 zeroext %__u,
 ; NoVLX-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    shrl $16, %edi
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %ecx
 ; NoVLX-NEXT:    andl %edi, %ecx
@@ -10172,7 +10156,6 @@ define zeroext i16 @test_vpcmpsgew_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__
 ; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -10201,7 +10184,6 @@ define zeroext i16 @test_vpcmpsgew_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>
 ; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -10231,7 +10213,6 @@ define zeroext i16 @test_masked_vpcmpsgew_v8i1_v16i1_mask(i8 zeroext %__u, <2 x
 ; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kmovw %k0, %eax
@@ -10264,7 +10245,6 @@ define zeroext i16 @test_masked_vpcmpsgew_v8i1_v16i1_mask_mem(i8 zeroext %__u, <
 ; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kmovw %k0, %eax
@@ -10296,7 +10276,6 @@ define zeroext i32 @test_vpcmpsgew_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
 ; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -10323,7 +10302,6 @@ define zeroext i32 @test_vpcmpsgew_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
 ; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -10351,7 +10329,6 @@ define zeroext i32 @test_masked_vpcmpsgew_v8i1_v32i1_mask(i8 zeroext %__u, <2 x
 ; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kmovw %k0, %eax
@@ -10382,7 +10359,6 @@ define zeroext i32 @test_masked_vpcmpsgew_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
 ; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kmovw %k0, %eax
@@ -10413,7 +10389,6 @@ define zeroext i64 @test_vpcmpsgew_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
 ; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    movzwl %ax, %eax
@@ -10441,7 +10416,6 @@ define zeroext i64 @test_vpcmpsgew_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
 ; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    movzwl %ax, %eax
@@ -10470,7 +10444,6 @@ define zeroext i64 @test_masked_vpcmpsgew_v8i1_v64i1_mask(i8 zeroext %__u, <2 x
 ; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kmovw %k0, %eax
@@ -10502,7 +10475,6 @@ define zeroext i64 @test_masked_vpcmpsgew_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
 ; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kmovw %k0, %eax
@@ -10535,7 +10507,6 @@ define zeroext i32 @test_vpcmpsgew_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %_
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -10563,7 +10534,6 @@ define zeroext i32 @test_vpcmpsgew_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -10592,7 +10562,6 @@ define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask(i16 zeroext %__u, <4
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
@@ -10624,7 +10593,6 @@ define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask_mem(i16 zeroext %__u,
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
@@ -10656,7 +10624,6 @@ define zeroext i64 @test_vpcmpsgew_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    movzwl %ax, %eax
@@ -10685,7 +10652,6 @@ define zeroext i64 @test_vpcmpsgew_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    movzwl %ax, %eax
@@ -10715,7 +10681,6 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask(i16 zeroext %__u, <4
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
@@ -10747,7 +10712,6 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask_mem(i16 zeroext %__u,
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
@@ -10782,12 +10746,10 @@ define zeroext i64 @test_vpcmpsgew_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %ecx
 ; NoVLX-NEXT:    vpternlogq $15, %zmm2, %zmm2, %zmm2
 ; NoVLX-NEXT:    vpmovsxwd %ymm2, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    shll $16, %eax
@@ -10820,12 +10782,10 @@ define zeroext i64 @test_vpcmpsgew_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %ecx
 ; NoVLX-NEXT:    vpternlogq $15, %zmm1, %zmm1, %zmm1
 ; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    shll $16, %eax
@@ -10856,7 +10816,6 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask(i32 zeroext %__u, <8
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm2
 ; NoVLX-NEXT:    vpternlogq $15, %zmm2, %zmm2, %zmm2
 ; NoVLX-NEXT:    vpmovsxwd %ymm2, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
 ; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
@@ -10866,7 +10825,6 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask(i32 zeroext %__u, <8
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %ecx
 ; NoVLX-NEXT:    andl %edi, %ecx
@@ -10901,7 +10859,6 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask_mem(i32 zeroext %__u,
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpternlogq $15, %zmm1, %zmm1, %zmm1
 ; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
@@ -10911,7 +10868,6 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask_mem(i32 zeroext %__u,
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %ecx
 ; NoVLX-NEXT:    andl %edi, %ecx
@@ -14768,7 +14724,6 @@ define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %_
 ; NoVLX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -14795,7 +14750,6 @@ define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64
 ; NoVLX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -14824,7 +14778,6 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask(i16 zeroext %__u, <2
 ; NoVLX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
@@ -14855,7 +14808,6 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
 ; NoVLX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
@@ -14887,7 +14839,6 @@ define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %_
 ; NoVLX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    movzwl %ax, %eax
@@ -14915,7 +14866,6 @@ define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64
 ; NoVLX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    movzwl %ax, %eax
@@ -14945,7 +14895,6 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask(i16 zeroext %__u, <2
 ; NoVLX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
@@ -14976,7 +14925,6 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
 ; NoVLX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
@@ -15009,12 +14957,10 @@ define zeroext i64 @test_vpcmpultb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
 ; NoVLX-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %ecx
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    shll $16, %eax
@@ -15044,12 +14990,10 @@ define zeroext i64 @test_vpcmpultb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
 ; NoVLX-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %ecx
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    shll $16, %eax
@@ -15081,14 +15025,12 @@ define zeroext i64 @test_masked_vpcmpultb_v32i1_v64i1_mask(i32 zeroext %__u, <4
 ; NoVLX-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    shrl $16, %edi
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %ecx
 ; NoVLX-NEXT:    andl %edi, %ecx
@@ -15123,14 +15065,12 @@ define zeroext i64 @test_masked_vpcmpultb_v32i1_v64i1_mask_mem(i32 zeroext %__u,
 ; NoVLX-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    shrl $16, %edi
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %ecx
 ; NoVLX-NEXT:    andl %edi, %ecx
@@ -15166,7 +15106,6 @@ define zeroext i16 @test_vpcmpultw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__
 ; NoVLX-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -15195,7 +15134,6 @@ define zeroext i16 @test_vpcmpultw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>
 ; NoVLX-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -15226,7 +15164,6 @@ define zeroext i16 @test_masked_vpcmpultw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x
 ; NoVLX-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kmovw %k0, %eax
@@ -15259,7 +15196,6 @@ define zeroext i16 @test_masked_vpcmpultw_v8i1_v16i1_mask_mem(i8 zeroext %__u, <
 ; NoVLX-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kmovw %k0, %eax
@@ -15292,7 +15228,6 @@ define zeroext i32 @test_vpcmpultw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
 ; NoVLX-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -15319,7 +15254,6 @@ define zeroext i32 @test_vpcmpultw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
 ; NoVLX-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -15348,7 +15282,6 @@ define zeroext i32 @test_masked_vpcmpultw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x
 ; NoVLX-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kmovw %k0, %eax
@@ -15379,7 +15312,6 @@ define zeroext i32 @test_masked_vpcmpultw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
 ; NoVLX-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kmovw %k0, %eax
@@ -15411,7 +15343,6 @@ define zeroext i64 @test_vpcmpultw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
 ; NoVLX-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    movzwl %ax, %eax
@@ -15439,7 +15370,6 @@ define zeroext i64 @test_vpcmpultw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
 ; NoVLX-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    movzwl %ax, %eax
@@ -15469,7 +15399,6 @@ define zeroext i64 @test_masked_vpcmpultw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x
 ; NoVLX-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kmovw %k0, %eax
@@ -15501,7 +15430,6 @@ define zeroext i64 @test_masked_vpcmpultw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
 ; NoVLX-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kmovw %k0, %eax
@@ -15535,7 +15463,6 @@ define zeroext i32 @test_vpcmpultw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %_
 ; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -15563,7 +15490,6 @@ define zeroext i32 @test_vpcmpultw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64
 ; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -15593,7 +15519,6 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask(i16 zeroext %__u, <4
 ; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
@@ -15625,7 +15550,6 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask_mem(i16 zeroext %__u,
 ; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
@@ -15658,7 +15582,6 @@ define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
 ; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    movzwl %ax, %eax
@@ -15687,7 +15610,6 @@ define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
 ; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    movzwl %ax, %eax
@@ -15718,7 +15640,6 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask(i16 zeroext %__u, <4
 ; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
@@ -15750,7 +15671,6 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
 ; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
@@ -15785,14 +15705,12 @@ define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
 ; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %ecx
 ; NoVLX-NEXT:    vpmaxuw %ymm3, %ymm2, %ymm0
 ; NoVLX-NEXT:    vpcmpeqw %ymm0, %ymm2, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    shll $16, %eax
@@ -15823,14 +15741,12 @@ define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
 ; NoVLX-NEXT:    vpcmpeqw %ymm2, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %ecx
 ; NoVLX-NEXT:    vpmaxuw 32(%rdi), %ymm1, %ymm0
 ; NoVLX-NEXT:    vpcmpeqw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    shll $16, %eax
@@ -15862,7 +15778,6 @@ define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask(i32 zeroext %__u, <8
 ; NoVLX-NEXT:    vpcmpeqw %ymm2, %ymm0, %ymm2
 ; NoVLX-NEXT:    vpternlogq $15, %zmm2, %zmm2, %zmm2
 ; NoVLX-NEXT:    vpmovsxwd %ymm2, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
 ; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
@@ -15873,7 +15788,6 @@ define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask(i32 zeroext %__u, <8
 ; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %ecx
 ; NoVLX-NEXT:    andl %edi, %ecx
@@ -15908,7 +15822,6 @@ define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask_mem(i32 zeroext %__u,
 ; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm1
 ; NoVLX-NEXT:    vpternlogq $15, %zmm1, %zmm1, %zmm1
 ; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
@@ -15918,7 +15831,6 @@ define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask_mem(i32 zeroext %__u,
 ; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %ecx
 ; NoVLX-NEXT:    andl %edi, %ecx
diff --git a/test/CodeGen/X86/cast-vsel.ll b/test/CodeGen/X86/cast-vsel.ll
index ff41083835f..b1e4243d01a 100644
--- a/test/CodeGen/X86/cast-vsel.ll
+++ b/test/CodeGen/X86/cast-vsel.ll
@@ -357,17 +357,16 @@ define void @example25() nounwind {
 ; AVX2-LABEL: example25:
 ; AVX2:       # %bb.0: # %vector.ph
 ; AVX2-NEXT:    movq $-4096, %rax # imm = 0xF000
-; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1]
 ; AVX2-NEXT:    .p2align 4, 0x90
 ; AVX2-NEXT:  .LBB5_1: # %vector.body
 ; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
-; AVX2-NEXT:    vmovups da+4096(%rax), %ymm1
-; AVX2-NEXT:    vcmpltps db+4096(%rax), %ymm1, %ymm1
-; AVX2-NEXT:    vmovups dc+4096(%rax), %ymm2
-; AVX2-NEXT:    vcmpltps dd+4096(%rax), %ymm2, %ymm2
-; AVX2-NEXT:    vandps %ymm0, %ymm2, %ymm2
-; AVX2-NEXT:    vandps %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vmovups %ymm1, dj+4096(%rax)
+; AVX2-NEXT:    vmovups da+4096(%rax), %ymm0
+; AVX2-NEXT:    vcmpltps db+4096(%rax), %ymm0, %ymm0
+; AVX2-NEXT:    vmovups dc+4096(%rax), %ymm1
+; AVX2-NEXT:    vcmpltps dd+4096(%rax), %ymm1, %ymm1
+; AVX2-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $31, %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqu %ymm0, dj+4096(%rax)
 ; AVX2-NEXT:    addq $32, %rax
 ; AVX2-NEXT:    jne .LBB5_1
 ; AVX2-NEXT:  # %bb.2: # %for.end
diff --git a/test/CodeGen/X86/combine-sdiv.ll b/test/CodeGen/X86/combine-sdiv.ll
index 72d458c8513..26a3cd47645 100644
--- a/test/CodeGen/X86/combine-sdiv.ll
+++ b/test/CodeGen/X86/combine-sdiv.ll
@@ -726,7 +726,8 @@ define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) {
 ; AVX1-NEXT:    vpsraw $1, %xmm3, %xmm3
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4,5,6],xmm3[7]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX1-NEXT:    vandnps %ymm0, %ymm2, %ymm0
 ; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
@@ -777,7 +778,9 @@ define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) {
 ; XOP-NEXT:    vpaddw %xmm3, %xmm0, %xmm3
 ; XOP-NEXT:    vpshaw %xmm2, %xmm3, %xmm2
 ; XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; XOP-NEXT:    vpcmov {{.*}}(%rip), %ymm0, %ymm1, %ymm0
+; XOP-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
+; XOP-NEXT:    # ymm2 = mem[0,1,0,1]
+; XOP-NEXT:    vpcmov %ymm2, %ymm0, %ymm1, %ymm0
 ; XOP-NEXT:    retq
   %1 = sdiv <16 x i16> %x, <i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2>
   ret <16 x i16> %1
@@ -960,7 +963,8 @@ define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) {
 ; AVX1-NEXT:    vpsraw $1, %xmm5, %xmm5
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4,5,6],xmm5[7]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm2
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm5 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm5 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX1-NEXT:    vandps %ymm5, %ymm2, %ymm2
 ; AVX1-NEXT:    vandnps %ymm0, %ymm5, %ymm0
 ; AVX1-NEXT:    vorps %ymm0, %ymm2, %ymm0
@@ -1055,7 +1059,8 @@ define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) {
 ; XOP-NEXT:    vpaddw %xmm5, %xmm0, %xmm5
 ; XOP-NEXT:    vpshaw %xmm3, %xmm5, %xmm5
 ; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm2
-; XOP-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
+; XOP-NEXT:    vbroadcastf128 {{.*#+}} ymm5 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
+; XOP-NEXT:    # ymm5 = mem[0,1,0,1]
 ; XOP-NEXT:    vpcmov %ymm5, %ymm0, %ymm2, %ymm0
 ; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; XOP-NEXT:    vpsraw $15, %xmm2, %xmm6
diff --git a/test/CodeGen/X86/combine-srl.ll b/test/CodeGen/X86/combine-srl.ll
index 9bd0be073f6..e0692166171 100644
--- a/test/CodeGen/X86/combine-srl.ll
+++ b/test/CodeGen/X86/combine-srl.ll
@@ -357,55 +357,50 @@ define <4 x i32> @combine_vec_lshr_lzcnt_bit1(<4 x i32> %x) {
 ; SSE-LABEL: combine_vec_lshr_lzcnt_bit1:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    pand %xmm2, %xmm1
-; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSE-NEXT:    movdqa %xmm3, %xmm4
-; SSE-NEXT:    pshufb %xmm1, %xmm4
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
 ; SSE-NEXT:    psrlw $4, %xmm1
-; SSE-NEXT:    pand %xmm2, %xmm1
 ; SSE-NEXT:    pxor %xmm2, %xmm2
+; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE-NEXT:    movdqa %xmm3, %xmm4
 ; SSE-NEXT:    pshufb %xmm1, %xmm3
 ; SSE-NEXT:    pcmpeqb %xmm2, %xmm1
-; SSE-NEXT:    pand %xmm4, %xmm1
-; SSE-NEXT:    paddb %xmm3, %xmm1
-; SSE-NEXT:    movdqa %xmm0, %xmm3
-; SSE-NEXT:    pcmpeqb %xmm2, %xmm3
-; SSE-NEXT:    psrlw $8, %xmm3
-; SSE-NEXT:    pand %xmm1, %xmm3
+; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE-NEXT:    pand %xmm0, %xmm5
+; SSE-NEXT:    pshufb %xmm5, %xmm4
+; SSE-NEXT:    pand %xmm1, %xmm4
+; SSE-NEXT:    paddb %xmm4, %xmm3
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    pcmpeqb %xmm2, %xmm1
 ; SSE-NEXT:    psrlw $8, %xmm1
-; SSE-NEXT:    paddw %xmm3, %xmm1
+; SSE-NEXT:    pand %xmm3, %xmm1
+; SSE-NEXT:    psrlw $8, %xmm3
+; SSE-NEXT:    paddw %xmm1, %xmm3
 ; SSE-NEXT:    pcmpeqw %xmm2, %xmm0
 ; SSE-NEXT:    psrld $16, %xmm0
-; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    psrld $16, %xmm1
-; SSE-NEXT:    paddd %xmm0, %xmm1
-; SSE-NEXT:    psrld $5, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    pand %xmm3, %xmm0
+; SSE-NEXT:    psrld $16, %xmm3
+; SSE-NEXT:    paddd %xmm3, %xmm0
+; SSE-NEXT:    psrld $5, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_lshr_lzcnt_bit1:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm4
-; AVX-NEXT:    vpand %xmm1, %xmm4, %xmm1
-; AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX-NEXT:    vpcmpeqb %xmm4, %xmm1, %xmm5
-; AVX-NEXT:    vpand %xmm5, %xmm2, %xmm2
-; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
-; AVX-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
-; AVX-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm2
-; AVX-NEXT:    vpsrlw $8, %xmm2, %xmm2
-; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm2
+; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm1
+; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm3
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm4
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
+; AVX-NEXT:    vpand %xmm3, %xmm4, %xmm3
+; AVX-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
+; AVX-NEXT:    vpaddb %xmm1, %xmm3, %xmm1
+; AVX-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm3
+; AVX-NEXT:    vpsrlw $8, %xmm3, %xmm3
+; AVX-NEXT:    vpand %xmm3, %xmm1, %xmm3
 ; AVX-NEXT:    vpsrlw $8, %xmm1, %xmm1
-; AVX-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpcmpeqw %xmm4, %xmm0, %xmm0
+; AVX-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
+; AVX-NEXT:    vpcmpeqw %xmm2, %xmm0, %xmm0
 ; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
 ; AVX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    vpsrld $16, %xmm1, %xmm1
diff --git a/test/CodeGen/X86/gather-addresses.ll b/test/CodeGen/X86/gather-addresses.ll
index ca8fd2acfa3..6468523b3c4 100644
--- a/test/CodeGen/X86/gather-addresses.ll
+++ b/test/CodeGen/X86/gather-addresses.ll
@@ -149,11 +149,11 @@ define <4 x i64> @old(double* %p, <4 x i32>* %i, <4 x i32>* %h, i64 %f) nounwind
 ; LIN-SSE2-NEXT:    andl %ecx, %edx
 ; LIN-SSE2-NEXT:    andl %ecx, %esi
 ; LIN-SSE2-NEXT:    andl %ecx, %edi
-; LIN-SSE2-NEXT:    movd %eax, %xmm0
-; LIN-SSE2-NEXT:    movd %edx, %xmm1
+; LIN-SSE2-NEXT:    movq %rax, %xmm0
+; LIN-SSE2-NEXT:    movq %rdx, %xmm1
 ; LIN-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; LIN-SSE2-NEXT:    movd %edi, %xmm2
-; LIN-SSE2-NEXT:    movd %esi, %xmm1
+; LIN-SSE2-NEXT:    movq %rdi, %xmm2
+; LIN-SSE2-NEXT:    movq %rsi, %xmm1
 ; LIN-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; LIN-SSE2-NEXT:    retq
 ;
@@ -169,11 +169,11 @@ define <4 x i64> @old(double* %p, <4 x i32>* %i, <4 x i32>* %h, i64 %f) nounwind
 ; LIN-SSE4-NEXT:    andl %ecx, %edx
 ; LIN-SSE4-NEXT:    andl %ecx, %esi
 ; LIN-SSE4-NEXT:    andl %ecx, %edi
-; LIN-SSE4-NEXT:    movd %edx, %xmm1
-; LIN-SSE4-NEXT:    movd %eax, %xmm0
+; LIN-SSE4-NEXT:    movq %rdx, %xmm1
+; LIN-SSE4-NEXT:    movq %rax, %xmm0
 ; LIN-SSE4-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; LIN-SSE4-NEXT:    movd %edi, %xmm2
-; LIN-SSE4-NEXT:    movd %esi, %xmm1
+; LIN-SSE4-NEXT:    movq %rdi, %xmm2
+; LIN-SSE4-NEXT:    movq %rsi, %xmm1
 ; LIN-SSE4-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; LIN-SSE4-NEXT:    retq
 ;
@@ -192,11 +192,11 @@ define <4 x i64> @old(double* %p, <4 x i32>* %i, <4 x i32>* %h, i64 %f) nounwind
 ; WIN-SSE2-NEXT:    andl %r9d, %ecx
 ; WIN-SSE2-NEXT:    andl %r9d, %r8d
 ; WIN-SSE2-NEXT:    andl %r9d, %edx
-; WIN-SSE2-NEXT:    movd %eax, %xmm0
-; WIN-SSE2-NEXT:    movd %ecx, %xmm1
+; WIN-SSE2-NEXT:    movq %rax, %xmm0
+; WIN-SSE2-NEXT:    movq %rcx, %xmm1
 ; WIN-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; WIN-SSE2-NEXT:    movd %edx, %xmm2
-; WIN-SSE2-NEXT:    movd %r8d, %xmm1
+; WIN-SSE2-NEXT:    movq %rdx, %xmm2
+; WIN-SSE2-NEXT:    movq %r8, %xmm1
 ; WIN-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; WIN-SSE2-NEXT:    retq
 ;
@@ -212,11 +212,11 @@ define <4 x i64> @old(double* %p, <4 x i32>* %i, <4 x i32>* %h, i64 %f) nounwind
 ; WIN-SSE4-NEXT:    andl %r9d, %ecx
 ; WIN-SSE4-NEXT:    andl %r9d, %r8d
 ; WIN-SSE4-NEXT:    andl %r9d, %edx
-; WIN-SSE4-NEXT:    movd %ecx, %xmm1
-; WIN-SSE4-NEXT:    movd %eax, %xmm0
+; WIN-SSE4-NEXT:    movq %rcx, %xmm1
+; WIN-SSE4-NEXT:    movq %rax, %xmm0
 ; WIN-SSE4-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; WIN-SSE4-NEXT:    movd %edx, %xmm2
-; WIN-SSE4-NEXT:    movd %r8d, %xmm1
+; WIN-SSE4-NEXT:    movq %rdx, %xmm2
+; WIN-SSE4-NEXT:    movq %r8, %xmm1
 ; WIN-SSE4-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; WIN-SSE4-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/horizontal-reduce-umax.ll b/test/CodeGen/X86/horizontal-reduce-umax.ll
index a4888e1cd3f..88f6b01131a 100644
--- a/test/CodeGen/X86/horizontal-reduce-umax.ll
+++ b/test/CodeGen/X86/horizontal-reduce-umax.ll
@@ -230,15 +230,14 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) {
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X86-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
-; X86-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
+; X86-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X86-SSE2-NEXT:    psrld $16, %xmm1
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X86-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
@@ -273,15 +272,14 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) {
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm0
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X64-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
-; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
-; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm0
-; X64-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm0
+; X64-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
 ; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X64-SSE2-NEXT:    psrld $16, %xmm1
-; X64-SSE2-NEXT:    pxor %xmm2, %xmm0
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X64-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
@@ -832,20 +830,19 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) {
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
 ; X86-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
-; X86-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X86-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
-; X86-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
+; X86-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X86-SSE2-NEXT:    psrld $16, %xmm1
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X86-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
@@ -896,20 +893,19 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) {
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm0
 ; X64-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
-; X64-SSE2-NEXT:    pxor %xmm2, %xmm0
-; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE2-NEXT:    pxor %xmm2, %xmm0
-; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
-; X64-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
-; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X64-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm0
-; X64-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm0
+; X64-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
 ; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X64-SSE2-NEXT:    psrld $16, %xmm1
-; X64-SSE2-NEXT:    pxor %xmm2, %xmm0
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X64-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
@@ -1670,35 +1666,30 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v32i16:
 ; X86-SSE2:       ## %bb.0:
 ; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
-; X86-SSE2-NEXT:    pmaxsw %xmm2, %xmm0
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm3
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
 ; X86-SSE2-NEXT:    pmaxsw %xmm3, %xmm1
-; X86-SSE2-NEXT:    movdqa %xmm4, %xmm2
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
-; X86-SSE2-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE2-NEXT:    pmaxsw %xmm1, %xmm2
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
 ; X86-SSE2-NEXT:    pmaxsw %xmm2, %xmm0
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
+; X86-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
-; X86-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X86-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE2-NEXT:    psrld $16, %xmm0
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
-; X86-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
-; X86-SSE2-NEXT:    movd %xmm0, %eax
+; X86-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X86-SSE2-NEXT:    psrld $16, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X86-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X86-SSE2-NEXT:    movd %xmm1, %eax
 ; X86-SSE2-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; X86-SSE2-NEXT:    retl
 ;
@@ -1748,35 +1739,30 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) {
 ; X64-SSE2-LABEL: test_reduce_v32i16:
 ; X64-SSE2:       ## %bb.0:
 ; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; X64-SSE2-NEXT:    pxor %xmm4, %xmm2
-; X64-SSE2-NEXT:    pxor %xmm4, %xmm0
-; X64-SSE2-NEXT:    pmaxsw %xmm2, %xmm0
 ; X64-SSE2-NEXT:    pxor %xmm4, %xmm3
 ; X64-SSE2-NEXT:    pxor %xmm4, %xmm1
 ; X64-SSE2-NEXT:    pmaxsw %xmm3, %xmm1
-; X64-SSE2-NEXT:    movdqa %xmm4, %xmm2
-; X64-SSE2-NEXT:    pxor %xmm4, %xmm2
-; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
-; X64-SSE2-NEXT:    pxor %xmm0, %xmm2
-; X64-SSE2-NEXT:    pmaxsw %xmm1, %xmm2
-; X64-SSE2-NEXT:    pxor %xmm4, %xmm2
-; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
 ; X64-SSE2-NEXT:    pxor %xmm4, %xmm2
 ; X64-SSE2-NEXT:    pxor %xmm4, %xmm0
 ; X64-SSE2-NEXT:    pmaxsw %xmm2, %xmm0
-; X64-SSE2-NEXT:    pxor %xmm4, %xmm0
-; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-SSE2-NEXT:    pxor %xmm4, %xmm0
+; X64-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; X64-SSE2-NEXT:    pxor %xmm4, %xmm1
-; X64-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
 ; X64-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X64-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
 ; X64-SSE2-NEXT:    movdqa %xmm1, %xmm0
-; X64-SSE2-NEXT:    psrld $16, %xmm0
-; X64-SSE2-NEXT:    pxor %xmm4, %xmm1
 ; X64-SSE2-NEXT:    pxor %xmm4, %xmm0
-; X64-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X64-SSE2-NEXT:    pxor %xmm4, %xmm0
-; X64-SSE2-NEXT:    movd %xmm0, %eax
+; X64-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X64-SSE2-NEXT:    psrld $16, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X64-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X64-SSE2-NEXT:    movd %xmm1, %eax
 ; X64-SSE2-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; X64-SSE2-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/horizontal-reduce-umin.ll b/test/CodeGen/X86/horizontal-reduce-umin.ll
index 3ce01cfdf4d..482d0826037 100644
--- a/test/CodeGen/X86/horizontal-reduce-umin.ll
+++ b/test/CodeGen/X86/horizontal-reduce-umin.ll
@@ -232,15 +232,14 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) {
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X86-SSE2-NEXT:    pminsw %xmm0, %xmm1
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
-; X86-SSE2-NEXT:    pminsw %xmm1, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
+; X86-SSE2-NEXT:    pminsw %xmm1, %xmm0
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X86-SSE2-NEXT:    psrld $16, %xmm1
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X86-SSE2-NEXT:    pminsw %xmm0, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
@@ -269,15 +268,14 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) {
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm0
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X64-SSE2-NEXT:    pminsw %xmm0, %xmm1
-; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
-; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm0
-; X64-SSE2-NEXT:    pminsw %xmm1, %xmm0
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm0
+; X64-SSE2-NEXT:    pminsw %xmm1, %xmm0
 ; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X64-SSE2-NEXT:    psrld $16, %xmm1
-; X64-SSE2-NEXT:    pxor %xmm2, %xmm0
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X64-SSE2-NEXT:    pminsw %xmm0, %xmm1
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
@@ -772,20 +770,19 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) {
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
 ; X86-SSE2-NEXT:    pminsw %xmm1, %xmm0
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
-; X86-SSE2-NEXT:    pminsw %xmm0, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X86-SSE2-NEXT:    pminsw %xmm0, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
-; X86-SSE2-NEXT:    pminsw %xmm1, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
+; X86-SSE2-NEXT:    pminsw %xmm1, %xmm0
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X86-SSE2-NEXT:    psrld $16, %xmm1
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X86-SSE2-NEXT:    pminsw %xmm0, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
@@ -827,20 +824,19 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) {
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm0
 ; X64-SSE2-NEXT:    pminsw %xmm1, %xmm0
-; X64-SSE2-NEXT:    pxor %xmm2, %xmm0
-; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE2-NEXT:    pxor %xmm2, %xmm0
-; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
-; X64-SSE2-NEXT:    pminsw %xmm0, %xmm1
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
-; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X64-SSE2-NEXT:    pminsw %xmm0, %xmm1
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm0
-; X64-SSE2-NEXT:    pminsw %xmm1, %xmm0
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm0
+; X64-SSE2-NEXT:    pminsw %xmm1, %xmm0
 ; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X64-SSE2-NEXT:    psrld $16, %xmm1
-; X64-SSE2-NEXT:    pxor %xmm2, %xmm0
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X64-SSE2-NEXT:    pminsw %xmm0, %xmm1
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
@@ -1574,35 +1570,30 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v32i16:
 ; X86-SSE2:       ## %bb.0:
 ; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
-; X86-SSE2-NEXT:    pminsw %xmm2, %xmm0
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm3
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
 ; X86-SSE2-NEXT:    pminsw %xmm3, %xmm1
-; X86-SSE2-NEXT:    movdqa %xmm4, %xmm2
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
-; X86-SSE2-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE2-NEXT:    pminsw %xmm1, %xmm2
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
 ; X86-SSE2-NEXT:    pminsw %xmm2, %xmm0
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
+; X86-SSE2-NEXT:    pminsw %xmm1, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
-; X86-SSE2-NEXT:    pminsw %xmm0, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X86-SSE2-NEXT:    pminsw %xmm0, %xmm1
 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE2-NEXT:    psrld $16, %xmm0
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
-; X86-SSE2-NEXT:    pminsw %xmm1, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
-; X86-SSE2-NEXT:    movd %xmm0, %eax
+; X86-SSE2-NEXT:    pminsw %xmm1, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X86-SSE2-NEXT:    psrld $16, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X86-SSE2-NEXT:    pminsw %xmm0, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X86-SSE2-NEXT:    movd %xmm1, %eax
 ; X86-SSE2-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; X86-SSE2-NEXT:    retl
 ;
@@ -1643,35 +1634,30 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) {
 ; X64-SSE2-LABEL: test_reduce_v32i16:
 ; X64-SSE2:       ## %bb.0:
 ; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; X64-SSE2-NEXT:    pxor %xmm4, %xmm2
-; X64-SSE2-NEXT:    pxor %xmm4, %xmm0
-; X64-SSE2-NEXT:    pminsw %xmm2, %xmm0
 ; X64-SSE2-NEXT:    pxor %xmm4, %xmm3
 ; X64-SSE2-NEXT:    pxor %xmm4, %xmm1
 ; X64-SSE2-NEXT:    pminsw %xmm3, %xmm1
-; X64-SSE2-NEXT:    movdqa %xmm4, %xmm2
-; X64-SSE2-NEXT:    pxor %xmm4, %xmm2
-; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
-; X64-SSE2-NEXT:    pxor %xmm0, %xmm2
-; X64-SSE2-NEXT:    pminsw %xmm1, %xmm2
-; X64-SSE2-NEXT:    pxor %xmm4, %xmm2
-; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
 ; X64-SSE2-NEXT:    pxor %xmm4, %xmm2
 ; X64-SSE2-NEXT:    pxor %xmm4, %xmm0
 ; X64-SSE2-NEXT:    pminsw %xmm2, %xmm0
-; X64-SSE2-NEXT:    pxor %xmm4, %xmm0
-; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-SSE2-NEXT:    pxor %xmm4, %xmm0
+; X64-SSE2-NEXT:    pminsw %xmm1, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; X64-SSE2-NEXT:    pxor %xmm4, %xmm1
-; X64-SSE2-NEXT:    pminsw %xmm0, %xmm1
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
 ; X64-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X64-SSE2-NEXT:    pminsw %xmm0, %xmm1
 ; X64-SSE2-NEXT:    movdqa %xmm1, %xmm0
-; X64-SSE2-NEXT:    psrld $16, %xmm0
-; X64-SSE2-NEXT:    pxor %xmm4, %xmm1
 ; X64-SSE2-NEXT:    pxor %xmm4, %xmm0
-; X64-SSE2-NEXT:    pminsw %xmm1, %xmm0
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X64-SSE2-NEXT:    pxor %xmm4, %xmm0
-; X64-SSE2-NEXT:    movd %xmm0, %eax
+; X64-SSE2-NEXT:    pminsw %xmm1, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X64-SSE2-NEXT:    psrld $16, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X64-SSE2-NEXT:    pminsw %xmm0, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X64-SSE2-NEXT:    movd %xmm1, %eax
 ; X64-SSE2-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; X64-SSE2-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/known-bits.ll b/test/CodeGen/X86/known-bits.ll
index 5d574391c50..5066e4777cc 100644
--- a/test/CodeGen/X86/known-bits.ll
+++ b/test/CodeGen/X86/known-bits.ll
@@ -19,7 +19,7 @@ define void @knownbits_zext_in_reg(i8*) nounwind {
 ; X32-NEXT:    movzbl %al, %eax
 ; X32-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; X32-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm1
-; X32-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; X32-NEXT:    vbroadcastss {{.*#+}} xmm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
 ; X32-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; X32-NEXT:    movzbl %cl, %eax
 ; X32-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
@@ -69,7 +69,7 @@ define void @knownbits_zext_in_reg(i8*) nounwind {
 ; X64-NEXT:    movzbl %cl, %ecx
 ; X64-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; X64-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm1
-; X64-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; X64-NEXT:    vbroadcastss {{.*#+}} xmm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
 ; X64-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; X64-NEXT:    movzbl %al, %eax
 ; X64-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/nontemporal-loads.ll b/test/CodeGen/X86/nontemporal-loads.ll
index 56428979568..efc08ca1718 100644
--- a/test/CodeGen/X86/nontemporal-loads.ll
+++ b/test/CodeGen/X86/nontemporal-loads.ll
@@ -1800,35 +1800,23 @@ define <64 x i8> @test_unaligned_v64i8(<64 x i8>* %src) {
 define <16 x i32> @test_masked_v16i32(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
 ; SSE2-LABEL: test_masked_v16i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm10
-; SSE2-NEXT:    pxor %xmm12, %xmm12
-; SSE2-NEXT:    pcmpeqd %xmm12, %xmm7
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm7, %xmm8
-; SSE2-NEXT:    pxor %xmm0, %xmm8
-; SSE2-NEXT:    pcmpeqd %xmm12, %xmm6
-; SSE2-NEXT:    movdqa %xmm6, %xmm9
-; SSE2-NEXT:    pxor %xmm0, %xmm9
-; SSE2-NEXT:    pcmpeqd %xmm12, %xmm5
-; SSE2-NEXT:    movdqa %xmm5, %xmm11
-; SSE2-NEXT:    pxor %xmm0, %xmm11
-; SSE2-NEXT:    pcmpeqd %xmm12, %xmm4
-; SSE2-NEXT:    pxor %xmm4, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm8
+; SSE2-NEXT:    pcmpeqd %xmm8, %xmm7
+; SSE2-NEXT:    pcmpeqd %xmm8, %xmm6
+; SSE2-NEXT:    pcmpeqd %xmm8, %xmm5
+; SSE2-NEXT:    pcmpeqd %xmm8, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm0
 ; SSE2-NEXT:    pandn (%rdi), %xmm4
-; SSE2-NEXT:    pandn %xmm10, %xmm0
 ; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    pand %xmm5, %xmm1
 ; SSE2-NEXT:    pandn 16(%rdi), %xmm5
-; SSE2-NEXT:    pandn %xmm1, %xmm11
-; SSE2-NEXT:    por %xmm5, %xmm11
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    pand %xmm6, %xmm2
 ; SSE2-NEXT:    pandn 32(%rdi), %xmm6
-; SSE2-NEXT:    pandn %xmm2, %xmm9
-; SSE2-NEXT:    por %xmm6, %xmm9
+; SSE2-NEXT:    por %xmm6, %xmm2
+; SSE2-NEXT:    pand %xmm7, %xmm3
 ; SSE2-NEXT:    pandn 48(%rdi), %xmm7
-; SSE2-NEXT:    pandn %xmm3, %xmm8
-; SSE2-NEXT:    por %xmm7, %xmm8
-; SSE2-NEXT:    movdqa %xmm11, %xmm1
-; SSE2-NEXT:    movdqa %xmm9, %xmm2
-; SSE2-NEXT:    movdqa %xmm8, %xmm3
+; SSE2-NEXT:    por %xmm7, %xmm3
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: test_masked_v16i32:
diff --git a/test/CodeGen/X86/paddus.ll b/test/CodeGen/X86/paddus.ll
index 75b0597d389..63ef093fdd9 100644
--- a/test/CodeGen/X86/paddus.ll
+++ b/test/CodeGen/X86/paddus.ll
@@ -801,22 +801,20 @@ define <8 x i16> @test23(<8 x i16> %x) {
 ; SSE2-LABEL: test23:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT:    pxor %xmm2, %xmm0
 ; SSE2-NEXT:    pxor %xmm0, %xmm2
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    pcmpgtw %xmm2, %xmm1
-; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pcmpgtw %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: test23:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSSE3-NEXT:    pxor %xmm2, %xmm0
 ; SSSE3-NEXT:    pxor %xmm0, %xmm2
-; SSSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtw %xmm2, %xmm1
-; SSSE3-NEXT:    por %xmm0, %xmm1
+; SSSE3-NEXT:    movdqa %xmm2, %xmm1
+; SSSE3-NEXT:    pcmpgtw %xmm0, %xmm1
+; SSSE3-NEXT:    por %xmm2, %xmm1
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
@@ -1029,37 +1027,33 @@ define <16 x i16> @test28(<16 x i16> %x) {
 define <16 x i16> @test29(<16 x i16> %x) {
 ; SSE2-LABEL: test29:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT:    pxor %xmm4, %xmm0
-; SSE2-NEXT:    pxor %xmm4, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm3
-; SSE2-NEXT:    pxor %xmm4, %xmm3
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    pcmpgtw %xmm3, %xmm2
-; SSE2-NEXT:    pxor %xmm0, %xmm4
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    pcmpgtw %xmm4, %xmm3
-; SSE2-NEXT:    por %xmm0, %xmm3
-; SSE2-NEXT:    por %xmm1, %xmm2
-; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    pxor %xmm3, %xmm4
+; SSE2-NEXT:    pxor %xmm1, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    pcmpgtw %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm4, %xmm1
+; SSE2-NEXT:    pcmpgtw %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm4, %xmm1
+; SSE2-NEXT:    por %xmm3, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa %xmm2, %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: test29:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSSE3-NEXT:    pxor %xmm4, %xmm0
-; SSSE3-NEXT:    pxor %xmm4, %xmm1
-; SSSE3-NEXT:    movdqa %xmm1, %xmm3
-; SSSE3-NEXT:    pxor %xmm4, %xmm3
-; SSSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSSE3-NEXT:    pcmpgtw %xmm3, %xmm2
-; SSSE3-NEXT:    pxor %xmm0, %xmm4
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSSE3-NEXT:    pcmpgtw %xmm4, %xmm3
-; SSSE3-NEXT:    por %xmm0, %xmm3
-; SSSE3-NEXT:    por %xmm1, %xmm2
-; SSSE3-NEXT:    movdqa %xmm3, %xmm0
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm4
+; SSSE3-NEXT:    pxor %xmm3, %xmm4
+; SSSE3-NEXT:    pxor %xmm1, %xmm3
+; SSSE3-NEXT:    movdqa %xmm3, %xmm2
+; SSSE3-NEXT:    pcmpgtw %xmm1, %xmm2
+; SSSE3-NEXT:    movdqa %xmm4, %xmm1
+; SSSE3-NEXT:    pcmpgtw %xmm0, %xmm1
+; SSSE3-NEXT:    por %xmm4, %xmm1
+; SSSE3-NEXT:    por %xmm3, %xmm2
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSSE3-NEXT:    movdqa %xmm2, %xmm1
 ; SSSE3-NEXT:    retq
 ;
@@ -1343,66 +1337,58 @@ define <32 x i16> @test34(<32 x i16> %x) {
 define <32 x i16> @test35(<32 x i16> %x) {
 ; SSE2-LABEL: test35:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT:    pxor %xmm4, %xmm0
-; SSE2-NEXT:    pxor %xmm4, %xmm1
-; SSE2-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NEXT:    pxor %xmm4, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, %xmm5
-; SSE2-NEXT:    pxor %xmm4, %xmm5
-; SSE2-NEXT:    movdqa %xmm3, %xmm8
-; SSE2-NEXT:    pcmpgtw %xmm5, %xmm8
-; SSE2-NEXT:    movdqa %xmm2, %xmm6
-; SSE2-NEXT:    pxor %xmm4, %xmm6
-; SSE2-NEXT:    movdqa %xmm2, %xmm5
-; SSE2-NEXT:    pcmpgtw %xmm6, %xmm5
+; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE2-NEXT:    movdqa %xmm0, %xmm6
+; SSE2-NEXT:    pxor %xmm5, %xmm6
 ; SSE2-NEXT:    movdqa %xmm1, %xmm7
-; SSE2-NEXT:    pxor %xmm4, %xmm7
-; SSE2-NEXT:    movdqa %xmm1, %xmm6
-; SSE2-NEXT:    pcmpgtw %xmm7, %xmm6
-; SSE2-NEXT:    pxor %xmm0, %xmm4
-; SSE2-NEXT:    movdqa %xmm0, %xmm7
-; SSE2-NEXT:    pcmpgtw %xmm4, %xmm7
-; SSE2-NEXT:    por %xmm0, %xmm7
-; SSE2-NEXT:    por %xmm1, %xmm6
-; SSE2-NEXT:    por %xmm2, %xmm5
-; SSE2-NEXT:    por %xmm3, %xmm8
-; SSE2-NEXT:    movdqa %xmm7, %xmm0
-; SSE2-NEXT:    movdqa %xmm6, %xmm1
-; SSE2-NEXT:    movdqa %xmm5, %xmm2
+; SSE2-NEXT:    pxor %xmm5, %xmm7
+; SSE2-NEXT:    movdqa %xmm2, %xmm8
+; SSE2-NEXT:    pxor %xmm5, %xmm8
+; SSE2-NEXT:    pxor %xmm3, %xmm5
+; SSE2-NEXT:    movdqa %xmm5, %xmm4
+; SSE2-NEXT:    pcmpgtw %xmm3, %xmm4
 ; SSE2-NEXT:    movdqa %xmm8, %xmm3
+; SSE2-NEXT:    pcmpgtw %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm7, %xmm2
+; SSE2-NEXT:    pcmpgtw %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm6, %xmm1
+; SSE2-NEXT:    pcmpgtw %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm6, %xmm1
+; SSE2-NEXT:    por %xmm7, %xmm2
+; SSE2-NEXT:    por %xmm8, %xmm3
+; SSE2-NEXT:    por %xmm5, %xmm4
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    movdqa %xmm4, %xmm3
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: test35:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSSE3-NEXT:    pxor %xmm4, %xmm0
-; SSSE3-NEXT:    pxor %xmm4, %xmm1
-; SSSE3-NEXT:    pxor %xmm4, %xmm2
-; SSSE3-NEXT:    pxor %xmm4, %xmm3
-; SSSE3-NEXT:    movdqa %xmm3, %xmm5
-; SSSE3-NEXT:    pxor %xmm4, %xmm5
-; SSSE3-NEXT:    movdqa %xmm3, %xmm8
-; SSSE3-NEXT:    pcmpgtw %xmm5, %xmm8
-; SSSE3-NEXT:    movdqa %xmm2, %xmm6
-; SSSE3-NEXT:    pxor %xmm4, %xmm6
-; SSSE3-NEXT:    movdqa %xmm2, %xmm5
-; SSSE3-NEXT:    pcmpgtw %xmm6, %xmm5
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm6
+; SSSE3-NEXT:    pxor %xmm5, %xmm6
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm7
-; SSSE3-NEXT:    pxor %xmm4, %xmm7
-; SSSE3-NEXT:    movdqa %xmm1, %xmm6
-; SSSE3-NEXT:    pcmpgtw %xmm7, %xmm6
-; SSSE3-NEXT:    pxor %xmm0, %xmm4
-; SSSE3-NEXT:    movdqa %xmm0, %xmm7
-; SSSE3-NEXT:    pcmpgtw %xmm4, %xmm7
-; SSSE3-NEXT:    por %xmm0, %xmm7
-; SSSE3-NEXT:    por %xmm1, %xmm6
-; SSSE3-NEXT:    por %xmm2, %xmm5
-; SSSE3-NEXT:    por %xmm3, %xmm8
-; SSSE3-NEXT:    movdqa %xmm7, %xmm0
-; SSSE3-NEXT:    movdqa %xmm6, %xmm1
-; SSSE3-NEXT:    movdqa %xmm5, %xmm2
+; SSSE3-NEXT:    pxor %xmm5, %xmm7
+; SSSE3-NEXT:    movdqa %xmm2, %xmm8
+; SSSE3-NEXT:    pxor %xmm5, %xmm8
+; SSSE3-NEXT:    pxor %xmm3, %xmm5
+; SSSE3-NEXT:    movdqa %xmm5, %xmm4
+; SSSE3-NEXT:    pcmpgtw %xmm3, %xmm4
 ; SSSE3-NEXT:    movdqa %xmm8, %xmm3
+; SSSE3-NEXT:    pcmpgtw %xmm2, %xmm3
+; SSSE3-NEXT:    movdqa %xmm7, %xmm2
+; SSSE3-NEXT:    pcmpgtw %xmm1, %xmm2
+; SSSE3-NEXT:    movdqa %xmm6, %xmm1
+; SSSE3-NEXT:    pcmpgtw %xmm0, %xmm1
+; SSSE3-NEXT:    por %xmm6, %xmm1
+; SSSE3-NEXT:    por %xmm7, %xmm2
+; SSSE3-NEXT:    por %xmm8, %xmm3
+; SSSE3-NEXT:    por %xmm5, %xmm4
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    movdqa %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa %xmm3, %xmm2
+; SSSE3-NEXT:    movdqa %xmm4, %xmm3
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: test35:
diff --git a/test/CodeGen/X86/psubus.ll b/test/CodeGen/X86/psubus.ll
index e2089f6b0d2..a6bdfe9780c 100644
--- a/test/CodeGen/X86/psubus.ll
+++ b/test/CodeGen/X86/psubus.ll
@@ -792,7 +792,7 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind {
 ; AVX1-NEXT:    vpsubd %xmm9, %xmm1, %xmm1
 ; AVX1-NEXT:    vpsubd %xmm11, %xmm2, %xmm2
 ; AVX1-NEXT:    vpsubd %xmm0, %xmm6, %xmm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm5 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
 ; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
diff --git a/test/CodeGen/X86/sat-add.ll b/test/CodeGen/X86/sat-add.ll
index 3cb11b11ec3..ec160c94f5e 100644
--- a/test/CodeGen/X86/sat-add.ll
+++ b/test/CodeGen/X86/sat-add.ll
@@ -679,13 +679,12 @@ define <16 x i8> @unsigned_sat_variable_v16i8_using_cmp_notval(<16 x i8> %x, <16
 define <8 x i16> @unsigned_sat_variable_v8i16_using_min(<8 x i16> %x, <8 x i16> %y) {
 ; SSE2-LABEL: unsigned_sat_variable_v8i16_using_min:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT:    pxor %xmm3, %xmm0
-; SSE2-NEXT:    pxor %xmm3, %xmm2
-; SSE2-NEXT:    pxor %xmm1, %xmm2
-; SSE2-NEXT:    pminsw %xmm2, %xmm0
-; SSE2-NEXT:    pxor %xmm3, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; SSE2-NEXT:    pxor %xmm1, %xmm3
+; SSE2-NEXT:    pminsw %xmm3, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm0
 ; SSE2-NEXT:    paddw %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -717,15 +716,12 @@ define <8 x i16> @unsigned_sat_variable_v8i16_using_cmp_sum(<8 x i16> %x, <8 x i
 define <8 x i16> @unsigned_sat_variable_v8i16_using_cmp_notval(<8 x i16> %x, <8 x i16> %y) {
 ; SSE2-LABEL: unsigned_sat_variable_v8i16_using_cmp_notval:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    paddw %xmm1, %xmm3
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT:    pxor %xmm4, %xmm0
-; SSE2-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NEXT:    pxor %xmm1, %xmm2
-; SSE2-NEXT:    pcmpgtw %xmm2, %xmm0
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    paddw %xmm1, %xmm2
+; SSE2-NEXT:    pxor {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtw %xmm1, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: unsigned_sat_variable_v8i16_using_cmp_notval:
@@ -750,17 +746,15 @@ define <4 x i32> @unsigned_sat_variable_v4i32_using_min(<4 x i32> %x, <4 x i32>
 ; SSE2-LABEL: unsigned_sat_variable_v4i32_using_min:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
+; SSE2-NEXT:    pxor %xmm1, %xmm2
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NEXT:    pxor %xmm3, %xmm4
-; SSE2-NEXT:    pxor %xmm2, %xmm3
-; SSE2-NEXT:    pxor %xmm1, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
-; SSE2-NEXT:    pand %xmm3, %xmm0
-; SSE2-NEXT:    pxor %xmm2, %xmm3
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    pandn %xmm3, %xmm2
-; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    pxor %xmm0, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483647,2147483647,2147483647,2147483647]
+; SSE2-NEXT:    pxor %xmm1, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    pandn %xmm2, %xmm4
+; SSE2-NEXT:    por %xmm4, %xmm0
 ; SSE2-NEXT:    paddd %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -809,15 +803,12 @@ define <4 x i32> @unsigned_sat_variable_v4i32_using_cmp_sum(<4 x i32> %x, <4 x i
 define <4 x i32> @unsigned_sat_variable_v4i32_using_cmp_notval(<4 x i32> %x, <4 x i32> %y) {
 ; SSE2-LABEL: unsigned_sat_variable_v4i32_using_cmp_notval:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    paddd %xmm1, %xmm3
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    pxor %xmm4, %xmm0
-; SSE2-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NEXT:    pxor %xmm1, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm0
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    paddd %xmm1, %xmm2
+; SSE2-NEXT:    pxor {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: unsigned_sat_variable_v4i32_using_cmp_notval:
diff --git a/test/CodeGen/X86/setcc-lowering.ll b/test/CodeGen/X86/setcc-lowering.ll
index ce057b28cc9..100461d22c9 100644
--- a/test/CodeGen/X86/setcc-lowering.ll
+++ b/test/CodeGen/X86/setcc-lowering.ll
@@ -45,19 +45,17 @@ define void @pr26232(i64 %a, <16 x i1> %b) {
 ; AVX-LABEL: pr26232:
 ; AVX:       # %bb.0: # %allocas
 ; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; AVX-NEXT:    .p2align 4, 0x90
 ; AVX-NEXT:  .LBB1_1: # %for_loop599
 ; AVX-NEXT:    # =>This Inner Loop Header: Depth=1
 ; AVX-NEXT:    xorl %eax, %eax
 ; AVX-NEXT:    cmpq $65536, %rdi # imm = 0x10000
 ; AVX-NEXT:    setl %al
-; AVX-NEXT:    vmovd %eax, %xmm3
-; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
-; AVX-NEXT:    vpand %xmm0, %xmm3, %xmm3
-; AVX-NEXT:    vpsllw $7, %xmm3, %xmm3
-; AVX-NEXT:    vpand %xmm2, %xmm3, %xmm3
-; AVX-NEXT:    vpmovmskb %xmm3, %eax
+; AVX-NEXT:    vmovd %eax, %xmm2
+; AVX-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
+; AVX-NEXT:    vpand %xmm0, %xmm2, %xmm2
+; AVX-NEXT:    vpsllw $7, %xmm2, %xmm2
+; AVX-NEXT:    vpmovmskb %xmm2, %eax
 ; AVX-NEXT:    testw %ax, %ax
 ; AVX-NEXT:    jne .LBB1_1
 ; AVX-NEXT:  # %bb.2: # %for_exit600
diff --git a/test/CodeGen/X86/sse2-intrinsics-canonical.ll b/test/CodeGen/X86/sse2-intrinsics-canonical.ll
index 04cd7ec47a1..506fb9eb100 100644
--- a/test/CodeGen/X86/sse2-intrinsics-canonical.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-canonical.ll
@@ -198,9 +198,9 @@ define <8 x i8> @test_x86_sse2_psubus_b_64(<8 x i8> %a0, <8 x i8> %a1) {
 ;
 ; AVX2-LABEL: test_x86_sse2_psubus_b_64:
 ; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX2-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x15,A,A,A,A]
-; AVX2-NEXT:    ## fixup A - offset: 4, value: LCPI6_0, kind: FK_Data_4
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    ## encoding: [0xc4,0xe2,0x79,0x79,0x15,A,A,A,A]
+; AVX2-NEXT:    ## fixup A - offset: 5, value: LCPI6_0, kind: FK_Data_4
 ; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3 ## encoding: [0xc5,0xf1,0xdb,0xda]
 ; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdb,0xc2]
 ; AVX2-NEXT:    vpmaxuw %xmm3, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x3e,0xc3]
@@ -209,9 +209,9 @@ define <8 x i8> @test_x86_sse2_psubus_b_64(<8 x i8> %a0, <8 x i8> %a1) {
 ;
 ; SKX-LABEL: test_x86_sse2_psubus_b_64:
 ; SKX:       ## %bb.0:
-; SKX-NEXT:    vmovdqa LCPI6_0, %xmm2 ## EVEX TO VEX Compression xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SKX-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x15,A,A,A,A]
-; SKX-NEXT:    ## fixup A - offset: 4, value: LCPI6_0, kind: FK_Data_4
+; SKX-NEXT:    vpbroadcastw LCPI6_0, %xmm2 ## EVEX TO VEX Compression xmm2 = [255,255,255,255,255,255,255,255]
+; SKX-NEXT:    ## encoding: [0xc4,0xe2,0x79,0x79,0x15,A,A,A,A]
+; SKX-NEXT:    ## fixup A - offset: 5, value: LCPI6_0, kind: FK_Data_4
 ; SKX-NEXT:    vpand %xmm2, %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xdb,0xda]
 ; SKX-NEXT:    vpand %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0xc2]
 ; SKX-NEXT:    vpmaxuw %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3e,0xc3]
diff --git a/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll b/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll
index 7cb0d3ff58f..f109d69621c 100644
--- a/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll
+++ b/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll
@@ -132,9 +132,9 @@ define <4 x i32> @in_constant_varx_mone_invmask(<4 x i32> *%px, <4 x i32> *%py,
 ;
 ; CHECK-SSE2-LABEL: in_constant_varx_mone_invmask:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa (%rdi), %xmm0
+; CHECK-SSE2-NEXT:    movdqa (%rdx), %xmm0
 ; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    movdqa (%rdx), %xmm2
+; CHECK-SSE2-NEXT:    movdqa (%rdi), %xmm2
 ; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm2
 ; CHECK-SSE2-NEXT:    pandn %xmm2, %xmm0
 ; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm0
@@ -142,9 +142,9 @@ define <4 x i32> @in_constant_varx_mone_invmask(<4 x i32> *%px, <4 x i32> *%py,
 ;
 ; CHECK-XOP-LABEL: in_constant_varx_mone_invmask:
 ; CHECK-XOP:       # %bb.0:
-; CHECK-XOP-NEXT:    vmovdqa (%rdi), %xmm0
+; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm0
 ; CHECK-XOP-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; CHECK-XOP-NEXT:    vpxor (%rdx), %xmm1, %xmm2
+; CHECK-XOP-NEXT:    vpxor (%rdi), %xmm1, %xmm2
 ; CHECK-XOP-NEXT:    vpandn %xmm2, %xmm0, %xmm0
 ; CHECK-XOP-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; CHECK-XOP-NEXT:    retq
diff --git a/test/CodeGen/X86/v8i1-masks.ll b/test/CodeGen/X86/v8i1-masks.ll
index a799b0e6f12..7f9ae2e8518 100644
--- a/test/CodeGen/X86/v8i1-masks.ll
+++ b/test/CodeGen/X86/v8i1-masks.ll
@@ -44,10 +44,9 @@ define void @and_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwi
 ; X32-AVX2-NEXT:    vcmpltps %ymm0, %ymm1, %ymm1
 ; X32-AVX2-NEXT:    vmovups (%eax), %ymm2
 ; X32-AVX2-NEXT:    vcmpltps %ymm0, %ymm2, %ymm0
-; X32-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
-; X32-AVX2-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; X32-AVX2-NEXT:    vandps %ymm1, %ymm0, %ymm0
-; X32-AVX2-NEXT:    vmovaps %ymm0, (%eax)
+; X32-AVX2-NEXT:    vpsrld $31, %ymm0, %ymm0
+; X32-AVX2-NEXT:    vmovdqa %ymm0, (%eax)
 ; X32-AVX2-NEXT:    vzeroupper
 ; X32-AVX2-NEXT:    retl
 ;
@@ -58,10 +57,9 @@ define void @and_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwi
 ; X64-AVX2-NEXT:    vcmpltps %ymm0, %ymm1, %ymm1
 ; X64-AVX2-NEXT:    vmovups (%rdx), %ymm2
 ; X64-AVX2-NEXT:    vcmpltps %ymm0, %ymm2, %ymm0
-; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
-; X64-AVX2-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; X64-AVX2-NEXT:    vandps %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vmovaps %ymm0, (%rax)
+; X64-AVX2-NEXT:    vpsrld $31, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vmovdqa %ymm0, (%rax)
 ; X64-AVX2-NEXT:    vzeroupper
 ; X64-AVX2-NEXT:    retq
   %v0 = load <8 x float>, <8 x float>* %a, align 16
diff --git a/test/CodeGen/X86/vector-blend.ll b/test/CodeGen/X86/vector-blend.ll
index 934d1027e9b..5008a1e865d 100644
--- a/test/CodeGen/X86/vector-blend.ll
+++ b/test/CodeGen/X86/vector-blend.ll
@@ -629,7 +629,7 @@ define <32 x i8> @constant_pblendvb_avx2(<32 x i8> %xyzw, <32 x i8> %abcd) {
 ;
 ; AVX1-LABEL: constant_pblendvb_avx2:
 ; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
+; AVX1-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [-5.4861292804117373E+303,-5.4861292804117373E+303,-5.4861292804117373E+303,-5.4861292804117373E+303]
 ; AVX1-NEXT:    vandnps %ymm0, %ymm2, %ymm0
 ; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
diff --git a/test/CodeGen/X86/vector-reduce-umax.ll b/test/CodeGen/X86/vector-reduce-umax.ll
index 680a5c52e63..52b42ce9bcb 100644
--- a/test/CodeGen/X86/vector-reduce-umax.ll
+++ b/test/CodeGen/X86/vector-reduce-umax.ll
@@ -1141,15 +1141,14 @@ define i16 @test_v8i16(<8 x i16> %a0) {
 ; SSE2-NEXT:    pxor %xmm2, %xmm0
 ; SSE2-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-NEXT:    psrld $16, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm0
 ; SSE2-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
 ; SSE2-NEXT:    pxor %xmm2, %xmm1
@@ -1207,20 +1206,19 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; SSE2-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-NEXT:    pxor %xmm2, %xmm0
 ; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
-; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    pxor %xmm2, %xmm1
-; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    pxor %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
 ; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-NEXT:    psrld $16, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm0
 ; SSE2-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
 ; SSE2-NEXT:    pxor %xmm2, %xmm1
@@ -1296,35 +1294,30 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; SSE2-LABEL: test_v32i16:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NEXT:    pxor %xmm4, %xmm0
-; SSE2-NEXT:    pmaxsw %xmm2, %xmm0
 ; SSE2-NEXT:    pxor %xmm4, %xmm3
 ; SSE2-NEXT:    pxor %xmm4, %xmm1
 ; SSE2-NEXT:    pmaxsw %xmm3, %xmm1
-; SSE2-NEXT:    movdqa %xmm4, %xmm2
-; SSE2-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NEXT:    pxor %xmm2, %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm2
-; SSE2-NEXT:    pmaxsw %xmm1, %xmm2
-; SSE2-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
 ; SSE2-NEXT:    pxor %xmm4, %xmm2
 ; SSE2-NEXT:    pxor %xmm4, %xmm0
 ; SSE2-NEXT:    pmaxsw %xmm2, %xmm0
-; SSE2-NEXT:    pxor %xmm4, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT:    pxor %xmm4, %xmm0
+; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    pxor %xmm4, %xmm1
-; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
 ; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrld $16, %xmm0
-; SSE2-NEXT:    pxor %xmm4, %xmm1
 ; SSE2-NEXT:    pxor %xmm4, %xmm0
-; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; SSE2-NEXT:    pxor %xmm4, %xmm0
-; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
 ; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SSE2-NEXT:    retq
 ;
@@ -1406,47 +1399,38 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; SSE2-LABEL: test_v64i16:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT:    pxor %xmm8, %xmm5
-; SSE2-NEXT:    pxor %xmm8, %xmm1
-; SSE2-NEXT:    pmaxsw %xmm5, %xmm1
-; SSE2-NEXT:    pxor %xmm8, %xmm7
-; SSE2-NEXT:    pxor %xmm8, %xmm3
-; SSE2-NEXT:    pmaxsw %xmm7, %xmm3
-; SSE2-NEXT:    pxor %xmm8, %xmm4
-; SSE2-NEXT:    pxor %xmm8, %xmm0
-; SSE2-NEXT:    pmaxsw %xmm4, %xmm0
 ; SSE2-NEXT:    pxor %xmm8, %xmm6
 ; SSE2-NEXT:    pxor %xmm8, %xmm2
 ; SSE2-NEXT:    pmaxsw %xmm6, %xmm2
-; SSE2-NEXT:    movdqa %xmm8, %xmm4
 ; SSE2-NEXT:    pxor %xmm8, %xmm4
-; SSE2-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NEXT:    pxor %xmm4, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    pmaxsw %xmm4, %xmm0
 ; SSE2-NEXT:    pmaxsw %xmm2, %xmm0
-; SSE2-NEXT:    pxor %xmm4, %xmm3
-; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    pxor %xmm8, %xmm7
+; SSE2-NEXT:    pxor %xmm8, %xmm3
+; SSE2-NEXT:    pmaxsw %xmm7, %xmm3
+; SSE2-NEXT:    pxor %xmm8, %xmm5
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    pmaxsw %xmm5, %xmm1
 ; SSE2-NEXT:    pmaxsw %xmm3, %xmm1
-; SSE2-NEXT:    pxor %xmm4, %xmm1
-; SSE2-NEXT:    pxor %xmm4, %xmm0
-; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    pxor %xmm8, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    pxor %xmm8, %xmm1
-; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
-; SSE2-NEXT:    pxor %xmm8, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
 ; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    pxor %xmm8, %xmm0
-; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm0
 ; SSE2-NEXT:    pxor %xmm8, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
 ; SSE2-NEXT:    pxor %xmm8, %xmm0
-; SSE2-NEXT:    pxor %xmm8, %xmm1
-; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
-; SSE2-NEXT:    pxor %xmm8, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SSE2-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/vector-reduce-umin.ll b/test/CodeGen/X86/vector-reduce-umin.ll
index 52adee5ab26..32a1cdf0f17 100644
--- a/test/CodeGen/X86/vector-reduce-umin.ll
+++ b/test/CodeGen/X86/vector-reduce-umin.ll
@@ -1140,15 +1140,14 @@ define i16 @test_v8i16(<8 x i16> %a0) {
 ; SSE2-NEXT:    pxor %xmm2, %xmm0
 ; SSE2-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-NEXT:    pminsw %xmm0, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    pminsw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    pminsw %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-NEXT:    psrld $16, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm0
 ; SSE2-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-NEXT:    pminsw %xmm0, %xmm1
 ; SSE2-NEXT:    pxor %xmm2, %xmm1
@@ -1187,20 +1186,19 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; SSE2-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-NEXT:    pxor %xmm2, %xmm0
 ; SSE2-NEXT:    pminsw %xmm1, %xmm0
-; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    pxor %xmm2, %xmm1
-; SSE2-NEXT:    pminsw %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    pxor %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
 ; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    pminsw %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    pminsw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    pminsw %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-NEXT:    psrld $16, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm0
 ; SSE2-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-NEXT:    pminsw %xmm0, %xmm1
 ; SSE2-NEXT:    pxor %xmm2, %xmm1
@@ -1253,35 +1251,30 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; SSE2-LABEL: test_v32i16:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NEXT:    pxor %xmm4, %xmm0
-; SSE2-NEXT:    pminsw %xmm2, %xmm0
 ; SSE2-NEXT:    pxor %xmm4, %xmm3
 ; SSE2-NEXT:    pxor %xmm4, %xmm1
 ; SSE2-NEXT:    pminsw %xmm3, %xmm1
-; SSE2-NEXT:    movdqa %xmm4, %xmm2
-; SSE2-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NEXT:    pxor %xmm2, %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm2
-; SSE2-NEXT:    pminsw %xmm1, %xmm2
-; SSE2-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
 ; SSE2-NEXT:    pxor %xmm4, %xmm2
 ; SSE2-NEXT:    pxor %xmm4, %xmm0
 ; SSE2-NEXT:    pminsw %xmm2, %xmm0
-; SSE2-NEXT:    pxor %xmm4, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT:    pxor %xmm4, %xmm0
+; SSE2-NEXT:    pminsw %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    pxor %xmm4, %xmm1
-; SSE2-NEXT:    pminsw %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
 ; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    pminsw %xmm0, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrld $16, %xmm0
-; SSE2-NEXT:    pxor %xmm4, %xmm1
 ; SSE2-NEXT:    pxor %xmm4, %xmm0
-; SSE2-NEXT:    pminsw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; SSE2-NEXT:    pxor %xmm4, %xmm0
-; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    pminsw %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    pminsw %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
 ; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SSE2-NEXT:    retq
 ;
@@ -1338,47 +1331,38 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; SSE2-LABEL: test_v64i16:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT:    pxor %xmm8, %xmm5
-; SSE2-NEXT:    pxor %xmm8, %xmm1
-; SSE2-NEXT:    pminsw %xmm5, %xmm1
-; SSE2-NEXT:    pxor %xmm8, %xmm7
-; SSE2-NEXT:    pxor %xmm8, %xmm3
-; SSE2-NEXT:    pminsw %xmm7, %xmm3
-; SSE2-NEXT:    pxor %xmm8, %xmm4
-; SSE2-NEXT:    pxor %xmm8, %xmm0
-; SSE2-NEXT:    pminsw %xmm4, %xmm0
 ; SSE2-NEXT:    pxor %xmm8, %xmm6
 ; SSE2-NEXT:    pxor %xmm8, %xmm2
 ; SSE2-NEXT:    pminsw %xmm6, %xmm2
-; SSE2-NEXT:    movdqa %xmm8, %xmm4
 ; SSE2-NEXT:    pxor %xmm8, %xmm4
-; SSE2-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NEXT:    pxor %xmm4, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    pminsw %xmm4, %xmm0
 ; SSE2-NEXT:    pminsw %xmm2, %xmm0
-; SSE2-NEXT:    pxor %xmm4, %xmm3
-; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    pxor %xmm8, %xmm7
+; SSE2-NEXT:    pxor %xmm8, %xmm3
+; SSE2-NEXT:    pminsw %xmm7, %xmm3
+; SSE2-NEXT:    pxor %xmm8, %xmm5
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    pminsw %xmm5, %xmm1
 ; SSE2-NEXT:    pminsw %xmm3, %xmm1
-; SSE2-NEXT:    pxor %xmm4, %xmm1
-; SSE2-NEXT:    pxor %xmm4, %xmm0
-; SSE2-NEXT:    pminsw %xmm1, %xmm0
+; SSE2-NEXT:    pminsw %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    pxor %xmm8, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    pminsw %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    pxor %xmm8, %xmm1
-; SSE2-NEXT:    pminsw %xmm0, %xmm1
-; SSE2-NEXT:    pxor %xmm8, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
 ; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    pminsw %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    pxor %xmm8, %xmm0
-; SSE2-NEXT:    pminsw %xmm1, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm0
 ; SSE2-NEXT:    pxor %xmm8, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    pminsw %xmm1, %xmm0
 ; SSE2-NEXT:    pxor %xmm8, %xmm0
-; SSE2-NEXT:    pxor %xmm8, %xmm1
-; SSE2-NEXT:    pminsw %xmm0, %xmm1
-; SSE2-NEXT:    pxor %xmm8, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SSE2-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/vector-shift-lshr-128.ll b/test/CodeGen/X86/vector-shift-lshr-128.ll
index bd77311479b..7ce33dcfe24 100644
--- a/test/CodeGen/X86/vector-shift-lshr-128.ll
+++ b/test/CodeGen/X86/vector-shift-lshr-128.ll
@@ -373,8 +373,8 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; SSE2-NEXT:    movdqa %xmm3, %xmm4
 ; SSE2-NEXT:    pandn %xmm0, %xmm4
 ; SSE2-NEXT:    psrlw $4, %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    por %xmm4, %xmm0
 ; SSE2-NEXT:    paddb %xmm1, %xmm1
 ; SSE2-NEXT:    pxor %xmm3, %xmm3
@@ -382,16 +382,16 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; SSE2-NEXT:    movdqa %xmm3, %xmm4
 ; SSE2-NEXT:    pandn %xmm0, %xmm4
 ; SSE2-NEXT:    psrlw $2, %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    por %xmm4, %xmm0
 ; SSE2-NEXT:    paddb %xmm1, %xmm1
 ; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
 ; SSE2-NEXT:    movdqa %xmm2, %xmm1
 ; SSE2-NEXT:    pandn %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $1, %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -488,8 +488,8 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
 ; X32-SSE-NEXT:    pandn %xmm0, %xmm4
 ; X32-SSE-NEXT:    psrlw $4, %xmm0
-; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
 ; X32-SSE-NEXT:    pand %xmm3, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
 ; X32-SSE-NEXT:    por %xmm4, %xmm0
 ; X32-SSE-NEXT:    paddb %xmm1, %xmm1
 ; X32-SSE-NEXT:    pxor %xmm3, %xmm3
@@ -497,16 +497,16 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
 ; X32-SSE-NEXT:    pandn %xmm0, %xmm4
 ; X32-SSE-NEXT:    psrlw $2, %xmm0
-; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
 ; X32-SSE-NEXT:    pand %xmm3, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
 ; X32-SSE-NEXT:    por %xmm4, %xmm0
 ; X32-SSE-NEXT:    paddb %xmm1, %xmm1
 ; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm2
 ; X32-SSE-NEXT:    movdqa %xmm2, %xmm1
 ; X32-SSE-NEXT:    pandn %xmm0, %xmm1
 ; X32-SSE-NEXT:    psrlw $1, %xmm0
-; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
 ; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
 ; X32-SSE-NEXT:    por %xmm1, %xmm0
 ; X32-SSE-NEXT:    retl
   %shift = lshr <16 x i8> %a, %b
diff --git a/test/CodeGen/X86/vector-shift-shl-128.ll b/test/CodeGen/X86/vector-shift-shl-128.ll
index 67963b1f992..a26fccd44c8 100644
--- a/test/CodeGen/X86/vector-shift-shl-128.ll
+++ b/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -295,8 +295,8 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; SSE2-NEXT:    movdqa %xmm3, %xmm4
 ; SSE2-NEXT:    pandn %xmm0, %xmm4
 ; SSE2-NEXT:    psllw $4, %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    por %xmm4, %xmm0
 ; SSE2-NEXT:    paddb %xmm1, %xmm1
 ; SSE2-NEXT:    pxor %xmm3, %xmm3
@@ -304,8 +304,8 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; SSE2-NEXT:    movdqa %xmm3, %xmm4
 ; SSE2-NEXT:    pandn %xmm0, %xmm4
 ; SSE2-NEXT:    psllw $2, %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    por %xmm4, %xmm0
 ; SSE2-NEXT:    paddb %xmm1, %xmm1
 ; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
@@ -405,8 +405,8 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
 ; X32-SSE-NEXT:    pandn %xmm0, %xmm4
 ; X32-SSE-NEXT:    psllw $4, %xmm0
-; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
 ; X32-SSE-NEXT:    pand %xmm3, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
 ; X32-SSE-NEXT:    por %xmm4, %xmm0
 ; X32-SSE-NEXT:    paddb %xmm1, %xmm1
 ; X32-SSE-NEXT:    pxor %xmm3, %xmm3
@@ -414,8 +414,8 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
 ; X32-SSE-NEXT:    pandn %xmm0, %xmm4
 ; X32-SSE-NEXT:    psllw $2, %xmm0
-; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
 ; X32-SSE-NEXT:    pand %xmm3, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
 ; X32-SSE-NEXT:    por %xmm4, %xmm0
 ; X32-SSE-NEXT:    paddb %xmm1, %xmm1
 ; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm2
diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll
index 2f0be026fd9..2ade0c5c646 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -846,7 +846,7 @@ define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_0
 define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31(<16 x i16> %a, <16 x i16> %b) {
 ; AVX1-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0]
+; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm2 = [9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41]
 ; AVX1-NEXT:    vandnps %ymm1, %ymm2, %ymm1
 ; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
@@ -863,7 +863,7 @@ define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_3
 define <16 x i16> @shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15(<16 x i16> %a, <16 x i16> %b) {
 ; AVX1-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0]
+; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm2 = [9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41]
 ; AVX1-NEXT:    vandnps %ymm0, %ymm2, %ymm0
 ; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
diff --git a/test/CodeGen/X86/vector-trunc-math.ll b/test/CodeGen/X86/vector-trunc-math.ll
index d9f186e64f1..e552f5f4036 100644
--- a/test/CodeGen/X86/vector-trunc-math.ll
+++ b/test/CodeGen/X86/vector-trunc-math.ll
@@ -233,7 +233,8 @@ define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm7
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
 ; AVX1-NEXT:    vpaddq %xmm7, %xmm3, %xmm3
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm7 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm7 = mem[0,0]
 ; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
 ; AVX1-NEXT:    vpand %xmm7, %xmm6, %xmm6
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm6, %xmm3
@@ -347,7 +348,7 @@ define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
 ; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
@@ -680,22 +681,23 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX1-LABEL: trunc_add_const_v16i64_v16i8:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm5 = mem[0,0]
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
 ; AVX1-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vpand %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
@@ -781,13 +783,13 @@ define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
 ; AVX1-LABEL: trunc_add_const_v16i32_v16i8:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
+; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
@@ -1106,7 +1108,8 @@ define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm7
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
 ; AVX1-NEXT:    vpsubq %xmm7, %xmm3, %xmm3
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm7 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm7 = mem[0,0]
 ; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
 ; AVX1-NEXT:    vpand %xmm7, %xmm6, %xmm6
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm6, %xmm3
@@ -1220,7 +1223,7 @@ define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
 ; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
@@ -1575,7 +1578,8 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm3, %xmm7
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
 ; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm3, %xmm3
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm4 = mem[0,0]
 ; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vpand %xmm4, %xmm7, %xmm7
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm7, %xmm3
@@ -1687,7 +1691,7 @@ define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
 ; AVX1-NEXT:    vpsubd {{.*}}(%rip), %xmm1, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vpsubd {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
 ; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
 ; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
@@ -2275,7 +2279,8 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ; AVX1-NEXT:    vpsllq $32, %xmm6, %xmm6
 ; AVX1-NEXT:    vpmuludq %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vpaddq %xmm6, %xmm3, %xmm3
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm4 = mem[0,0]
 ; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
@@ -2451,7 +2456,7 @@ define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vpmulld %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
 ; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
@@ -2909,7 +2914,8 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX1-NEXT:    vpmuludq %xmm6, %xmm3, %xmm3
 ; AVX1-NEXT:    vpsllq $32, %xmm3, %xmm3
 ; AVX1-NEXT:    vpaddq %xmm3, %xmm7, %xmm3
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm6 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm6 = mem[0,0]
 ; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
 ; AVX1-NEXT:    vpand %xmm6, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
@@ -3049,7 +3055,7 @@ define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
 ; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
 ; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
 ; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
@@ -3351,27 +3357,28 @@ define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ;
 ; AVX1-LABEL: trunc_and_v16i64_v16i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
-; AVX1-NEXT:    vandps %ymm5, %ymm1, %ymm1
-; AVX1-NEXT:    vandps %ymm6, %ymm2, %ymm2
-; AVX1-NEXT:    vandps %ymm7, %ymm3, %ymm3
+; AVX1-NEXT:    vandpd %ymm4, %ymm0, %ymm0
+; AVX1-NEXT:    vandpd %ymm5, %ymm1, %ymm1
+; AVX1-NEXT:    vandpd %ymm6, %ymm2, %ymm2
+; AVX1-NEXT:    vandpd %ymm7, %ymm3, %ymm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm5 = mem[0,0]
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
 ; AVX1-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vandps %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vandps %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vandps %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
@@ -3468,7 +3475,7 @@ define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
 ; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
 ; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
@@ -3751,22 +3758,23 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX1-LABEL: trunc_and_const_v16i64_v16i8:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm5 = mem[0,0]
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
 ; AVX1-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vpand %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
@@ -3852,13 +3860,13 @@ define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
 ; AVX1-LABEL: trunc_and_const_v16i32_v16i8:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
+; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
@@ -4153,27 +4161,28 @@ define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ;
 ; AVX1-LABEL: trunc_xor_v16i64_v16i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vxorps %ymm4, %ymm0, %ymm0
-; AVX1-NEXT:    vxorps %ymm5, %ymm1, %ymm1
-; AVX1-NEXT:    vxorps %ymm6, %ymm2, %ymm2
-; AVX1-NEXT:    vxorps %ymm7, %ymm3, %ymm3
+; AVX1-NEXT:    vxorpd %ymm4, %ymm0, %ymm0
+; AVX1-NEXT:    vxorpd %ymm5, %ymm1, %ymm1
+; AVX1-NEXT:    vxorpd %ymm6, %ymm2, %ymm2
+; AVX1-NEXT:    vxorpd %ymm7, %ymm3, %ymm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm5 = mem[0,0]
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
 ; AVX1-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vandps %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vandps %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vandps %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
@@ -4270,7 +4279,7 @@ define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
 ; AVX1-NEXT:    vxorps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vxorps %ymm3, %ymm1, %ymm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
 ; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
@@ -4553,22 +4562,23 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX1-LABEL: trunc_xor_const_v16i64_v16i8:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm5 = mem[0,0]
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
 ; AVX1-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vpand %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
@@ -4654,13 +4664,13 @@ define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
 ; AVX1-LABEL: trunc_xor_const_v16i32_v16i8:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
+; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
@@ -4955,27 +4965,28 @@ define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind
 ;
 ; AVX1-LABEL: trunc_or_v16i64_v16i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vorps %ymm4, %ymm0, %ymm0
-; AVX1-NEXT:    vorps %ymm5, %ymm1, %ymm1
-; AVX1-NEXT:    vorps %ymm6, %ymm2, %ymm2
-; AVX1-NEXT:    vorps %ymm7, %ymm3, %ymm3
+; AVX1-NEXT:    vorpd %ymm4, %ymm0, %ymm0
+; AVX1-NEXT:    vorpd %ymm5, %ymm1, %ymm1
+; AVX1-NEXT:    vorpd %ymm6, %ymm2, %ymm2
+; AVX1-NEXT:    vorpd %ymm7, %ymm3, %ymm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm5 = mem[0,0]
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
 ; AVX1-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vandps %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vandps %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vandps %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
@@ -5072,7 +5083,7 @@ define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind
 ; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
 ; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
@@ -5355,22 +5366,23 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX1-LABEL: trunc_or_const_v16i64_v16i8:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm5 = mem[0,0]
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
 ; AVX1-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vpand %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
@@ -5456,13 +5468,13 @@ define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
 ; AVX1-LABEL: trunc_or_const_v16i32_v16i8:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
+; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
diff --git a/test/CodeGen/X86/vector-trunc-packus.ll b/test/CodeGen/X86/vector-trunc-packus.ll
index 91ede6cb062..61935dce8f8 100644
--- a/test/CodeGen/X86/vector-trunc-packus.ll
+++ b/test/CodeGen/X86/vector-trunc-packus.ll
@@ -2070,24 +2070,26 @@ define void @trunc_packus_v8i64_v8i8_store(<8 x i64> %a0, <8 x i8> *%p1) {
 ; AVX1-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm4, %xmm5
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm4, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
 ; AVX1-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
 ; AVX1-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm0, %xmm8
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm5, %xmm6
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm1, %xmm7
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm1, %xmm7, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm1, %xmm6
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm7, %xmm2
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm3 = mem[0,0]
+; AVX1-NEXT:    vpand %xmm3, %xmm7, %xmm7
+; AVX1-NEXT:    vpand %xmm7, %xmm2, %xmm2
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm1, %xmm6, %xmm1
 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm2
-; AVX1-NEXT:    vpand %xmm2, %xmm6, %xmm2
-; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm2
+; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand %xmm0, %xmm8, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/vector-trunc-ssat.ll b/test/CodeGen/X86/vector-trunc-ssat.ll
index 3e5dcc5c3c2..500d8ba1511 100644
--- a/test/CodeGen/X86/vector-trunc-ssat.ll
+++ b/test/CodeGen/X86/vector-trunc-ssat.ll
@@ -2001,7 +2001,8 @@ define void @trunc_ssat_v8i64_v8i8_store(<8 x i64> %a0, <8 x i8> *%p1) {
 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
 ; AVX1-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovapd {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm3 = mem[0,0]
 ; AVX1-NEXT:    vandpd %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vandpd %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
diff --git a/test/CodeGen/X86/vector-trunc-usat.ll b/test/CodeGen/X86/vector-trunc-usat.ll
index 1bde6c3a141..5b00ab58495 100644
--- a/test/CodeGen/X86/vector-trunc-usat.ll
+++ b/test/CodeGen/X86/vector-trunc-usat.ll
@@ -1417,7 +1417,8 @@ define void @trunc_usat_v8i64_v8i8_store(<8 x i64> %a0, <8 x i8> *%p1) {
 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
 ; AVX1-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovapd {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm3 = mem[0,0]
 ; AVX1-NEXT:    vandpd %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vandpd %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
diff --git a/test/CodeGen/X86/vector-trunc.ll b/test/CodeGen/X86/vector-trunc.ll
index db3692f318f..79cbb8cc924 100644
--- a/test/CodeGen/X86/vector-trunc.ll
+++ b/test/CodeGen/X86/vector-trunc.ll
@@ -286,13 +286,14 @@ define void @trunc8i64_8i8(<8 x i64> %a) {
 ; AVX1-LABEL: trunc8i64_8i8:
 ; AVX1:       # %bb.0: # %entry
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm3 = mem[0,0]
+; AVX1-NEXT:    vandpd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandpd %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vandpd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandpd %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
@@ -907,13 +908,13 @@ define void @trunc16i32_16i8(<16 x i32> %a) {
 ; AVX1-LABEL: trunc16i32_16i8:
 ; AVX1:       # %bb.0: # %entry
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
+; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
diff --git a/test/CodeGen/X86/vshift-6.ll b/test/CodeGen/X86/vshift-6.ll
index 5cfa38ab833..36e29abf8d7 100644
--- a/test/CodeGen/X86/vshift-6.ll
+++ b/test/CodeGen/X86/vshift-6.ll
@@ -50,8 +50,8 @@ define <16 x i8> @do_not_crash(i8*, i32*, i64*, i32, i64, i8) {
 ; X32-NEXT:    movdqa %xmm2, %xmm4
 ; X32-NEXT:    pandn %xmm0, %xmm4
 ; X32-NEXT:    psllw $2, %xmm0
-; X32-NEXT:    pand {{\.LCPI.*}}, %xmm0
 ; X32-NEXT:    pand %xmm2, %xmm0
+; X32-NEXT:    pand {{\.LCPI.*}}, %xmm0
 ; X32-NEXT:    por %xmm4, %xmm0
 ; X32-NEXT:    paddb %xmm1, %xmm1
 ; X32-NEXT:    pcmpgtb %xmm1, %xmm3
@@ -85,8 +85,8 @@ define <16 x i8> @do_not_crash(i8*, i32*, i64*, i32, i64, i8) {
 ; X64-NEXT:    movdqa %xmm2, %xmm4
 ; X64-NEXT:    pandn %xmm0, %xmm4
 ; X64-NEXT:    psllw $2, %xmm0
-; X64-NEXT:    pand {{.*}}(%rip), %xmm0
 ; X64-NEXT:    pand %xmm2, %xmm0
+; X64-NEXT:    pand {{.*}}(%rip), %xmm0
 ; X64-NEXT:    por %xmm4, %xmm0
 ; X64-NEXT:    paddb %xmm1, %xmm1
 ; X64-NEXT:    pcmpgtb %xmm1, %xmm3
diff --git a/test/CodeGen/X86/x86-interleaved-access.ll b/test/CodeGen/X86/x86-interleaved-access.ll
index e4624eaf363..41d69e544aa 100644
--- a/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/test/CodeGen/X86/x86-interleaved-access.ll
@@ -1029,7 +1029,8 @@ define <32 x i8> @interleaved_load_vf32_i8_stride3(<96 x i8>* %ptr){
 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm2
 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm9 = xmm7[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm4 = xmm6[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; AVX1-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX1-NEXT:    vandnps %ymm2, %ymm5, %ymm2
 ; AVX1-NEXT:    vandps %ymm5, %ymm8, %ymm5
 ; AVX1-NEXT:    vorps %ymm2, %ymm5, %ymm2
@@ -1585,7 +1586,8 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(<192 x i8>* %ptr){
 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm4 = xmm12[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm11 = xmm11[11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7,8,9,10]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm11, %ymm12
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; AVX1-NEXT:    # ymm13 = mem[0,1,0,1]
 ; AVX1-NEXT:    vandnps %ymm12, %ymm13, %ymm12
 ; AVX1-NEXT:    vandps %ymm13, %ymm14, %ymm14
 ; AVX1-NEXT:    vorps %ymm12, %ymm14, %ymm12
-- 
GitLab


From aa8c49dafa13b3565af79710ef7a4933180dd84b Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Mon, 15 Oct 2018 05:07:54 +0000
Subject: [PATCH 0177/1116] [ORC] Simplify naming for JITDylib definition
 generators.

Renames:
  JITDylib's setFallbackDefinitionGenerator method to setGenerator.
  DynamicLibraryFallbackGenerator class to DynamicLibrarySearchGenerator.
  ReexportsFallbackDefinitionGenerator to ReexportsGenerator.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344489 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/ExecutionEngine/Orc/Core.h       | 31 +++++++-----
 .../llvm/ExecutionEngine/Orc/ExecutionUtils.h | 32 ++++++------
 lib/ExecutionEngine/Orc/Core.cpp              | 49 +++++++++----------
 lib/ExecutionEngine/Orc/ExecutionUtils.cpp    | 20 +++++---
 tools/lli/lli.cpp                             |  4 +-
 .../ExecutionEngine/Orc/CoreAPIsTest.cpp      | 19 +++----
 6 files changed, 80 insertions(+), 75 deletions(-)

diff --git a/include/llvm/ExecutionEngine/Orc/Core.h b/include/llvm/ExecutionEngine/Orc/Core.h
index 24cdeeae42e..67b16894f6c 100644
--- a/include/llvm/ExecutionEngine/Orc/Core.h
+++ b/include/llvm/ExecutionEngine/Orc/Core.h
@@ -395,15 +395,22 @@ reexports(JITDylib &SourceJD, SymbolAliasMap Aliases) {
 Expected<SymbolAliasMap>
 buildSimpleReexportsAliasMap(JITDylib &SourceJD, const SymbolNameSet &Symbols);
 
-class ReexportsFallbackDefinitionGenerator {
+/// ReexportsGenerator can be used with JITDylib::setGenerator to automatically
+/// re-export a subset of the source JITDylib's symbols in the target.
+class ReexportsGenerator {
 public:
   using SymbolPredicate = std::function<bool(SymbolStringPtr)>;
-  ReexportsFallbackDefinitionGenerator(JITDylib &BackingJD,
-                                       SymbolPredicate Allow);
+
+  /// Create a reexports generator. If an Allow predicate is passed, only
+  /// symbols for which the predicate returns true will be reexported. If no
+  /// Allow predicate is passed, all symbols will be exported.
+  ReexportsGenerator(JITDylib &SourceJD,
+                     SymbolPredicate Allow = SymbolPredicate());
+
   SymbolNameSet operator()(JITDylib &JD, const SymbolNameSet &Names);
 
 private:
-  JITDylib &BackingJD;
+  JITDylib &SourceJD;
   SymbolPredicate Allow;
 };
 
@@ -478,7 +485,7 @@ class JITDylib {
   friend class ExecutionSession;
   friend class MaterializationResponsibility;
 public:
-  using FallbackDefinitionGeneratorFunction = std::function<SymbolNameSet(
+  using GeneratorFunction = std::function<SymbolNameSet(
       JITDylib &Parent, const SymbolNameSet &Names)>;
 
   using AsynchronousSymbolQuerySet =
@@ -495,12 +502,12 @@ public:
   /// Get a reference to the ExecutionSession for this JITDylib.
   ExecutionSession &getExecutionSession() const { return ES; }
 
-  /// Set a fallback defenition generator. If set, lookup and lookupFlags will
-  /// pass the unresolved symbols set to the fallback definition generator,
-  /// allowing it to add a new definition to the JITDylib.
-  void setFallbackDefinitionGenerator(
-      FallbackDefinitionGeneratorFunction FallbackDefinitionGenerator) {
-    this->FallbackDefinitionGenerator = std::move(FallbackDefinitionGenerator);
+  /// Set a definition generator. If set, whenever a symbol fails to resolve
+  /// within this JITDylib, lookup and lookupFlags will pass the unresolved
+  /// symbols set to the definition generator. The generator can optionally
+  /// add a definition for the unresolved symbols to the dylib.
+  void setGenerator(GeneratorFunction DefGenerator) {
+    this->DefGenerator = std::move(DefGenerator);
   }
 
   /// Set the search order to be used when fixing up definitions in JITDylib.
@@ -667,7 +674,7 @@ private:
   SymbolMap Symbols;
   UnmaterializedInfosMap UnmaterializedInfos;
   MaterializingInfosMap MaterializingInfos;
-  FallbackDefinitionGeneratorFunction FallbackDefinitionGenerator;
+  GeneratorFunction DefGenerator;
   JITDylibList SearchOrder;
 };
 
diff --git a/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h b/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h
index 52250662a95..662ed7b78e4 100644
--- a/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h
+++ b/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h
@@ -212,32 +212,30 @@ public:
 /// If an instance of this class is attached to a JITDylib as a fallback
 /// definition generator, then any symbol found in the given DynamicLibrary that
 /// passes the 'Allow' predicate will be added to the JITDylib.
-class DynamicLibraryFallbackGenerator {
+class DynamicLibrarySearchGenerator {
 public:
   using SymbolPredicate = std::function<bool(SymbolStringPtr)>;
 
-  static bool AllowAll(SymbolStringPtr Name) { return true; }
-
-  /// Create a DynamicLibraryFallbackGenerator that searches for symbols in the
+  /// Create a DynamicLibrarySearchGenerator that searches for symbols in the
   /// given sys::DynamicLibrary.
-  /// Only symbols that match the 'Allow' predicate will be searched for.
-  DynamicLibraryFallbackGenerator(sys::DynamicLibrary Dylib,
-                                  const DataLayout &DL,
-                                  SymbolPredicate Allow = AllowAll);
+  /// If the Allow predicate is given then only symbols matching the predicate
+  /// will be searched for in the DynamicLibrary. If the predicate is not given
+  /// then all symbols will be searched for.
+  DynamicLibrarySearchGenerator(sys::DynamicLibrary Dylib, const DataLayout &DL,
+                                SymbolPredicate Allow = SymbolPredicate());
 
   /// Permanently loads the library at the given path and, on success, returns
-  /// a DynamicLibraryFallbackGenerator that will search it for symbol
-  /// definitions matching the Allow predicate.
-  /// On failure returns the reason the library failed to load.
-  static Expected<DynamicLibraryFallbackGenerator>
+  /// a DynamicLibrarySearchGenerator that will search it for symbol definitions
+  /// in the library. On failure returns the reason the library failed to load.
+  static Expected<DynamicLibrarySearchGenerator>
   Load(const char *FileName, const DataLayout &DL,
-       SymbolPredicate Allow = AllowAll);
+       SymbolPredicate Allow = SymbolPredicate());
 
-  /// Creates a DynamicLibraryFallbackGenerator that searches for symbols in
+  /// Creates a DynamicLibrarySearchGenerator that searches for symbols in
   /// the current process.
-  static Expected<DynamicLibraryFallbackGenerator>
-  CreateForCurrentProcess(const DataLayout &DL,
-                          SymbolPredicate Allow = AllowAll) {
+  static Expected<DynamicLibrarySearchGenerator>
+  GetForCurrentProcess(const DataLayout &DL,
+                       SymbolPredicate Allow = SymbolPredicate()) {
     return Load(nullptr, DL, std::move(Allow));
   }
 
diff --git a/lib/ExecutionEngine/Orc/Core.cpp b/lib/ExecutionEngine/Orc/Core.cpp
index c9cfacef61b..3fa28a5af6f 100644
--- a/lib/ExecutionEngine/Orc/Core.cpp
+++ b/lib/ExecutionEngine/Orc/Core.cpp
@@ -686,26 +686,26 @@ buildSimpleReexportsAliasMap(JITDylib &SourceJD, const SymbolNameSet &Symbols) {
   return Result;
 }
 
-ReexportsFallbackDefinitionGenerator::ReexportsFallbackDefinitionGenerator(
-    JITDylib &BackingJD, SymbolPredicate Allow)
-    : BackingJD(BackingJD), Allow(std::move(Allow)) {}
+ReexportsGenerator::ReexportsGenerator(JITDylib &SourceJD,
+                                       SymbolPredicate Allow)
+    : SourceJD(SourceJD), Allow(std::move(Allow)) {}
 
-SymbolNameSet ReexportsFallbackDefinitionGenerator::
-operator()(JITDylib &JD, const SymbolNameSet &Names) {
+SymbolNameSet ReexportsGenerator::operator()(JITDylib &JD,
+                                             const SymbolNameSet &Names) {
   orc::SymbolNameSet Added;
   orc::SymbolAliasMap AliasMap;
 
-  auto Flags = BackingJD.lookupFlags(Names);
+  auto Flags = SourceJD.lookupFlags(Names);
 
   for (auto &KV : Flags) {
-    if (!Allow(KV.first))
+    if (Allow && !Allow(KV.first))
       continue;
     AliasMap[KV.first] = SymbolAliasMapEntry(KV.first, KV.second);
     Added.insert(KV.first);
   }
 
   if (!Added.empty())
-    cantFail(JD.define(reexports(BackingJD, AliasMap)));
+    cantFail(JD.define(reexports(SourceJD, AliasMap)));
 
   return Added;
 }
@@ -1117,10 +1117,10 @@ SymbolFlagsMap JITDylib::lookupFlags(const SymbolNameSet &Names) {
   return ES.runSessionLocked([&, this]() {
     SymbolFlagsMap Result;
     auto Unresolved = lookupFlagsImpl(Result, Names);
-    if (FallbackDefinitionGenerator && !Unresolved.empty()) {
-      auto FallbackDefs = FallbackDefinitionGenerator(*this, Unresolved);
-      if (!FallbackDefs.empty()) {
-        auto Unresolved2 = lookupFlagsImpl(Result, FallbackDefs);
+    if (DefGenerator && !Unresolved.empty()) {
+      auto NewDefs = DefGenerator(*this, Unresolved);
+      if (!NewDefs.empty()) {
+        auto Unresolved2 = lookupFlagsImpl(Result, NewDefs);
         (void)Unresolved2;
         assert(Unresolved2.empty() &&
                "All fallback defs should have been found by lookupFlagsImpl");
@@ -1156,14 +1156,13 @@ void JITDylib::lodgeQuery(std::shared_ptr<AsynchronousSymbolQuery> &Q,
   assert(Q && "Query can not be null");
 
   lodgeQueryImpl(Q, Unresolved, MatchNonExportedInJD, MatchNonExported, MUs);
-  if (FallbackDefinitionGenerator && !Unresolved.empty()) {
-    auto FallbackDefs = FallbackDefinitionGenerator(*this, Unresolved);
-    if (!FallbackDefs.empty()) {
-      for (auto &D : FallbackDefs)
+  if (DefGenerator && !Unresolved.empty()) {
+    auto NewDefs = DefGenerator(*this, Unresolved);
+    if (!NewDefs.empty()) {
+      for (auto &D : NewDefs)
         Unresolved.erase(D);
-      lodgeQueryImpl(Q, FallbackDefs, MatchNonExportedInJD, MatchNonExported,
-                     MUs);
-      assert(FallbackDefs.empty() &&
+      lodgeQueryImpl(Q, NewDefs, MatchNonExportedInJD, MatchNonExported, MUs);
+      assert(NewDefs.empty() &&
              "All fallback defs should have been found by lookupImpl");
     }
   }
@@ -1250,15 +1249,15 @@ SymbolNameSet JITDylib::legacyLookup(std::shared_ptr<AsynchronousSymbolQuery> Q,
   SymbolNameSet Unresolved = std::move(Names);
   ES.runSessionLocked([&, this]() {
     ActionFlags = lookupImpl(Q, MUs, Unresolved);
-    if (FallbackDefinitionGenerator && !Unresolved.empty()) {
+    if (DefGenerator && !Unresolved.empty()) {
       assert(ActionFlags == None &&
              "ActionFlags set but unresolved symbols remain?");
-      auto FallbackDefs = FallbackDefinitionGenerator(*this, Unresolved);
-      if (!FallbackDefs.empty()) {
-        for (auto &D : FallbackDefs)
+      auto NewDefs = DefGenerator(*this, Unresolved);
+      if (!NewDefs.empty()) {
+        for (auto &D : NewDefs)
           Unresolved.erase(D);
-        ActionFlags = lookupImpl(Q, MUs, FallbackDefs);
-        assert(FallbackDefs.empty() &&
+        ActionFlags = lookupImpl(Q, MUs, NewDefs);
+        assert(NewDefs.empty() &&
                "All fallback defs should have been found by lookupImpl");
       }
     }
diff --git a/lib/ExecutionEngine/Orc/ExecutionUtils.cpp b/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
index 6a180106240..667237373ca 100644
--- a/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
+++ b/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
@@ -178,21 +178,22 @@ Error LocalCXXRuntimeOverrides2::enable(JITDylib &JD,
   return JD.define(absoluteSymbols(std::move(RuntimeInterposes)));
 }
 
-DynamicLibraryFallbackGenerator::DynamicLibraryFallbackGenerator(
+DynamicLibrarySearchGenerator::DynamicLibrarySearchGenerator(
     sys::DynamicLibrary Dylib, const DataLayout &DL, SymbolPredicate Allow)
     : Dylib(std::move(Dylib)), Allow(std::move(Allow)),
       GlobalPrefix(DL.getGlobalPrefix()) {}
 
-Expected<DynamicLibraryFallbackGenerator> DynamicLibraryFallbackGenerator::Load(
-    const char *FileName, const DataLayout &DL, SymbolPredicate Allow) {
+Expected<DynamicLibrarySearchGenerator>
+DynamicLibrarySearchGenerator::Load(const char *FileName, const DataLayout &DL,
+                                    SymbolPredicate Allow) {
   std::string ErrMsg;
   auto Lib = sys::DynamicLibrary::getPermanentLibrary(FileName, &ErrMsg);
   if (!Lib.isValid())
     return make_error<StringError>(std::move(ErrMsg), inconvertibleErrorCode());
-  return DynamicLibraryFallbackGenerator(std::move(Lib), DL, std::move(Allow));
+  return DynamicLibrarySearchGenerator(std::move(Lib), DL, std::move(Allow));
 }
 
-SymbolNameSet DynamicLibraryFallbackGenerator::
+SymbolNameSet DynamicLibrarySearchGenerator::
 operator()(JITDylib &JD, const SymbolNameSet &Names) {
   orc::SymbolNameSet Added;
   orc::SymbolMap NewSymbols;
@@ -200,7 +201,10 @@ operator()(JITDylib &JD, const SymbolNameSet &Names) {
   bool HasGlobalPrefix = (GlobalPrefix != '\0');
 
   for (auto &Name : Names) {
-    if (!Allow(Name) || (*Name).empty())
+    if ((*Name).empty())
+      continue;
+
+    if (Allow && !Allow(Name))
       continue;
 
     if (HasGlobalPrefix && (*Name).front() != GlobalPrefix)
@@ -215,8 +219,8 @@ operator()(JITDylib &JD, const SymbolNameSet &Names) {
     }
   }
 
-  // Add any new symbols to JD. Since the fallback generator is only called for
-  // symbols that are not already defined, this will never trigger a duplicate
+  // Add any new symbols to JD. Since the generator is only called for symbols
+  // that are not already defined, this will never trigger a duplicate
   // definition error, so we can wrap this call in a 'cantFail'.
   if (!NewSymbols.empty())
     cantFail(JD.define(absoluteSymbols(std::move(NewSymbols))));
diff --git a/tools/lli/lli.cpp b/tools/lli/lli.cpp
index 4794fe532a5..d633fe6f800 100644
--- a/tools/lli/lli.cpp
+++ b/tools/lli/lli.cpp
@@ -793,8 +793,8 @@ int runOrcLazyJIT(const char *ProgName) {
     }
     return Dump(std::move(TSM), R);
   });
-  J->getMainJITDylib().setFallbackDefinitionGenerator(ExitOnErr(
-      orc::DynamicLibraryFallbackGenerator::CreateForCurrentProcess(DL)));
+  J->getMainJITDylib().setGenerator(
+      ExitOnErr(orc::DynamicLibrarySearchGenerator::GetForCurrentProcess(DL)));
 
   orc::MangleAndInterner Mangle(J->getExecutionSession(), DL);
   orc::LocalCXXRuntimeOverrides2 CXXRuntimeOverrides;
diff --git a/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp b/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp
index c8fa6ef5297..1ccc4755957 100644
--- a/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp
+++ b/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp
@@ -342,17 +342,15 @@ TEST_F(CoreAPIsStandardTest, TestThatReExportsDontUnnecessarilyMaterialize) {
   EXPECT_FALSE(BarMaterialized) << "Bar should not have been materialized";
 }
 
-TEST_F(CoreAPIsStandardTest, TestReexportsFallbackGenerator) {
-  // Test that a re-exports fallback generator can dynamically generate
-  // reexports.
+TEST_F(CoreAPIsStandardTest, TestReexportsGenerator) {
+  // Test that a re-exports generator can dynamically generate reexports.
 
   auto &JD2 = ES.createJITDylib("JD2");
   cantFail(JD2.define(absoluteSymbols({{Foo, FooSym}, {Bar, BarSym}})));
 
   auto Filter = [this](SymbolStringPtr Name) { return Name != Bar; };
 
-  JD.setFallbackDefinitionGenerator(
-      ReexportsFallbackDefinitionGenerator(JD2, Filter));
+  JD.setGenerator(ReexportsGenerator(JD2, Filter));
 
   auto Flags = JD.lookupFlags({Foo, Bar, Baz});
   EXPECT_EQ(Flags.size(), 1U) << "Unexpected number of results";
@@ -679,14 +677,13 @@ TEST_F(CoreAPIsStandardTest, DefineMaterializingSymbol) {
       << "Expected Bar == BarSym";
 }
 
-TEST_F(CoreAPIsStandardTest, FallbackDefinitionGeneratorTest) {
+TEST_F(CoreAPIsStandardTest, GeneratorTest) {
   cantFail(JD.define(absoluteSymbols({{Foo, FooSym}})));
 
-  JD.setFallbackDefinitionGenerator(
-      [&](JITDylib &JD2, const SymbolNameSet &Names) {
-        cantFail(JD2.define(absoluteSymbols({{Bar, BarSym}})));
-        return SymbolNameSet({Bar});
-      });
+  JD.setGenerator([&](JITDylib &JD2, const SymbolNameSet &Names) {
+    cantFail(JD2.define(absoluteSymbols({{Bar, BarSym}})));
+    return SymbolNameSet({Bar});
+  });
 
   auto Result = cantFail(ES.lookup({&JD}, {Foo, Bar}));
 
-- 
GitLab


From 15ca92098aa137c257b7dfea86c7b4daa8eaf1af Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Mon, 15 Oct 2018 05:31:24 +0000
Subject: [PATCH 0178/1116] [X86] Autogenerate checks. NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344490 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/fold-vex.ll | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/test/CodeGen/X86/fold-vex.ll b/test/CodeGen/X86/fold-vex.ll
index 006db6effdf..c7b376a053d 100644
--- a/test/CodeGen/X86/fold-vex.ll
+++ b/test/CodeGen/X86/fold-vex.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; Use CPU parameters to ensure that a CPU-specific attribute is not overriding the AVX definition.
 
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown                  -mattr=+avx | FileCheck %s
@@ -14,18 +15,20 @@
 ; unless specially configured on some CPUs such as AMD Family 10H.
 
 define <4 x i32> @test1(<4 x i32>* %p0, <4 x i32> %in1) nounwind {
+; CHECK-LABEL: test1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vandps (%rdi), %xmm0, %xmm0
+; CHECK-NEXT:    retq
+;
+; SSE-LABEL: test1:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movups (%rdi), %xmm1
+; SSE-NEXT:    andps %xmm1, %xmm0
+; SSE-NEXT:    retq
   %in0 = load <4 x i32>, <4 x i32>* %p0, align 2
   %a = and <4 x i32> %in0, %in1
   ret <4 x i32> %a
 
-; CHECK-LABEL: @test1
-; CHECK-NOT:   vmovups
-; CHECK:       vandps (%rdi), %xmm0, %xmm0
-; CHECK-NEXT:  ret
 
-; SSE-LABEL: @test1
-; SSE:       movups (%rdi), %xmm1
-; SSE-NEXT:  andps %xmm1, %xmm0
-; SSE-NEXT:  ret
 }
 
-- 
GitLab


From ffc5ec8c8122b984871c59516426cdd8ad18e7ea Mon Sep 17 00:00:00 2001
From: Bjorn Pettersson <bjorn.a.pettersson@ericsson.com>
Date: Mon, 15 Oct 2018 08:36:03 +0000
Subject: [PATCH 0179/1116] [TwoAddressInstructionPass] Replace subregister
 uses when processing tied operands

Summary:
TwoAddressInstruction pass typically rewrites
  %1:short = foo %0.sub_lo:long
as
  %1:short = COPY %0.sub_lo:long
  %1:short = foo %1:short
when having tied operands.

If there are extra un-tied operands that uses the same reg and
subreg, such as the second and third inputs to fie here:
  %1:short = fie %0.sub_lo:long, %0.sub_hi:long, %0.sub_lo:long
then there was a bug which replaced the register %0 also for
the un-tied operand, but without changing the subregister indices.
So we used to get:
  %1:short = COPY %0.sub_lo:long
  %1:short = fie %1, %1.sub_hi:short, %1.sub_lo:short
With this fix we instead get:
  %1:short = COPY %0.sub_lo:long
  %1:short = fie %1, %0.sub_hi:long, %1

Reviewers: arsenm, JesperAntonsson, kparzysz, MatzeB

Reviewed By: MatzeB

Subscribers: bjope, kparzysz, wdng, llvm-commits

Differential Revision: https://reviews.llvm.org/D36224

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344492 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/TwoAddressInstructionPass.cpp     | 21 ++++---
 .../CodeGen/Hexagon/two-addr-tied-subregs.mir | 56 +++++++++++++++++++
 2 files changed, 69 insertions(+), 8 deletions(-)
 create mode 100644 test/CodeGen/Hexagon/two-addr-tied-subregs.mir

diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp
index 99ccb0f9c9f..2e2fe72e539 100644
--- a/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1608,23 +1608,28 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI,
   }
 
   if (AllUsesCopied) {
+    bool ReplacedAllUntiedUses = true;
     if (!IsEarlyClobber) {
       // Replace other (un-tied) uses of regB with LastCopiedReg.
       for (MachineOperand &MO : MI->operands()) {
-        if (MO.isReg() && MO.getReg() == RegB &&
-            MO.isUse()) {
-          if (MO.isKill()) {
-            MO.setIsKill(false);
-            RemovedKillFlag = true;
+        if (MO.isReg() && MO.getReg() == RegB && MO.isUse()) {
+          if (MO.getSubReg() == SubRegB) {
+            if (MO.isKill()) {
+              MO.setIsKill(false);
+              RemovedKillFlag = true;
+            }
+            MO.setReg(LastCopiedReg);
+            MO.setSubReg(0);
+          } else {
+            ReplacedAllUntiedUses = false;
           }
-          MO.setReg(LastCopiedReg);
-          MO.setSubReg(MO.getSubReg());
         }
       }
     }
 
     // Update live variables for regB.
-    if (RemovedKillFlag && LV && LV->getVarInfo(RegB).removeKill(*MI)) {
+    if (RemovedKillFlag && ReplacedAllUntiedUses &&
+        LV && LV->getVarInfo(RegB).removeKill(*MI)) {
       MachineBasicBlock::iterator PrevMI = MI;
       --PrevMI;
       LV->addVirtualRegisterKilled(RegB, *PrevMI);
diff --git a/test/CodeGen/Hexagon/two-addr-tied-subregs.mir b/test/CodeGen/Hexagon/two-addr-tied-subregs.mir
new file mode 100644
index 00000000000..87e117c461b
--- /dev/null
+++ b/test/CodeGen/Hexagon/two-addr-tied-subregs.mir
@@ -0,0 +1,56 @@
+# RUN: llc -march hexagon -run-pass livevars -run-pass twoaddressinstruction  -verify-machineinstrs -o - %s | FileCheck %s
+
+
+###############################################################################
+
+---
+name:            test1
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $d0
+
+    %0:doubleregs = COPY killed $d0
+    %1:intregs = S2_lsr_i_r_acc %0.isub_lo, %0.isub_lo, 16
+
+...
+
+# Verify that both uses if %0.isub_lo are replaced here.
+# (we used to get %1:intregs = S2_lsr_i_r_acc %1, %1.isub_lo, 16)
+#
+# CHECK-LABEL: name:            test1
+# CHECK:  bb.0.entry:
+# CHECK:      %0:doubleregs = COPY killed $d0
+# CHECK-NEXT: %1:intregs = COPY killed %0.isub_lo
+# CHECK-NEXT: %1:intregs = S2_lsr_i_r_acc %1, %1, 16
+
+
+###############################################################################
+
+---
+name:            test2
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $d0
+
+    %0:doubleregs = COPY killed $d0
+    %1:intregs = S2_lsr_i_r_acc %0.isub_lo, %0.isub_hi, 16
+
+...
+
+# Verify that the use of %0.isub_hi isn't replaced here.
+# (we used to get %1:intregs = S2_lsr_i_r_acc %1, %1.isub_hi, 16)
+#
+# We also used to get an incorrect "killed" for %0 in the second COPY.
+# So we verify that we do not get machine verifier complaints here.
+# An improvement could be to get a "killed" attribute on the last
+# use of %0.isub_hi, but we do not need it for the IR to be valid.
+#
+# CHECK-LABEL: name:            test2
+# CHECK:  bb.0.entry:
+# CHECK:      %0:doubleregs = COPY killed $d0
+# CHECK-NEXT: %1:intregs = COPY %0.isub_lo
+# CHECK-NEXT: %1:intregs = S2_lsr_i_r_acc %1, %0.isub_hi, 16
+
+###############################################################################
-- 
GitLab


From 919972ec1f052ca0b29942c71745336298d694ef Mon Sep 17 00:00:00 2001
From: Guillaume Chatelet <gchatelet@google.com>
Date: Mon, 15 Oct 2018 09:09:19 +0000
Subject: [PATCH 0180/1116] [llvm-exegesis][NFC] Return many CodeTemplates
 instead of one.

Summary: This is part one of the change where I simply changed the signature of the functions. More work need to be done to actually produce more than one CodeTemplate per instruction.

Reviewers: courbet

Subscribers: tschuett, llvm-commits

Differential Revision: https://reviews.llvm.org/D53209

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344493 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-exegesis/lib/Latency.cpp           | 24 +++----
 tools/llvm-exegesis/lib/Latency.h             |  8 +--
 tools/llvm-exegesis/lib/SnippetGenerator.cpp  | 67 +++++++++++--------
 tools/llvm-exegesis/lib/SnippetGenerator.h    | 23 ++++---
 tools/llvm-exegesis/lib/Uops.cpp              | 12 ++--
 tools/llvm-exegesis/lib/Uops.h                |  4 +-
 tools/llvm-exegesis/lib/X86/Target.cpp        | 18 ++---
 .../X86/SnippetGeneratorTest.cpp              | 12 ++--
 8 files changed, 90 insertions(+), 78 deletions(-)

diff --git a/tools/llvm-exegesis/lib/Latency.cpp b/tools/llvm-exegesis/lib/Latency.cpp
index f6786b123ad..040b42b53e2 100644
--- a/tools/llvm-exegesis/lib/Latency.cpp
+++ b/tools/llvm-exegesis/lib/Latency.cpp
@@ -22,9 +22,9 @@ namespace exegesis {
 
 LatencySnippetGenerator::~LatencySnippetGenerator() = default;
 
-llvm::Expected<CodeTemplate>
-LatencySnippetGenerator::generateTwoInstructionPrototype(
-    const Instruction &Instr) const {
+llvm::Expected<std::vector<CodeTemplate>>
+generateTwoInstructionPrototypes(const LLVMState &State,
+                                 const Instruction &Instr) {
   std::vector<unsigned> Opcodes;
   Opcodes.resize(State.getInstrInfo().getNumOpcodes());
   std::iota(Opcodes.begin(), Opcodes.end(), 0U);
@@ -50,23 +50,23 @@ LatencySnippetGenerator::generateTwoInstructionPrototype(
                             State.getInstrInfo().getName(OtherOpcode));
     CT.Instructions.push_back(std::move(ThisIT));
     CT.Instructions.push_back(std::move(OtherIT));
-    return std::move(CT);
+    return getSingleton(CT);
   }
   return llvm::make_error<BenchmarkFailure>(
       "Infeasible : Didn't find any scheme to make the instruction serial");
 }
 
-llvm::Expected<CodeTemplate>
-LatencySnippetGenerator::generateCodeTemplate(const Instruction &Instr) const {
+llvm::Expected<std::vector<CodeTemplate>>
+LatencySnippetGenerator::generateCodeTemplates(const Instruction &Instr) const {
   if (Instr.hasMemoryOperands())
     return llvm::make_error<BenchmarkFailure>(
         "Infeasible : has memory operands");
-  if (auto CT = generateSelfAliasingCodeTemplate(Instr))
-    return CT;
-  else
-    llvm::consumeError(CT.takeError());
-  // No self aliasing, trying to create a dependency through another opcode.
-  return generateTwoInstructionPrototype(Instr);
+  return llvm::handleExpected( //
+      generateSelfAliasingCodeTemplates(Instr),
+      [this, &Instr]() {
+        return generateTwoInstructionPrototypes(State, Instr);
+      },
+      [](const BenchmarkFailure &) { /*Consume Error*/ });
 }
 
 const char *LatencyBenchmarkRunner::getCounterName() const {
diff --git a/tools/llvm-exegesis/lib/Latency.h b/tools/llvm-exegesis/lib/Latency.h
index 83c798f60f3..f78f12615c7 100644
--- a/tools/llvm-exegesis/lib/Latency.h
+++ b/tools/llvm-exegesis/lib/Latency.h
@@ -26,12 +26,8 @@ public:
   LatencySnippetGenerator(const LLVMState &State) : SnippetGenerator(State) {}
   ~LatencySnippetGenerator() override;
 
-  llvm::Expected<CodeTemplate>
-  generateCodeTemplate(const Instruction &Instr) const override;
-
-private:
-  llvm::Expected<CodeTemplate>
-  generateTwoInstructionPrototype(const Instruction &Instr) const;
+  llvm::Expected<std::vector<CodeTemplate>>
+  generateCodeTemplates(const Instruction &Instr) const override;
 };
 
 class LatencyBenchmarkRunner : public BenchmarkRunner {
diff --git a/tools/llvm-exegesis/lib/SnippetGenerator.cpp b/tools/llvm-exegesis/lib/SnippetGenerator.cpp
index f7a76d88ccf..9b577fd65a9 100644
--- a/tools/llvm-exegesis/lib/SnippetGenerator.cpp
+++ b/tools/llvm-exegesis/lib/SnippetGenerator.cpp
@@ -22,6 +22,12 @@
 
 namespace exegesis {
 
+std::vector<CodeTemplate> getSingleton(CodeTemplate &CT) {
+  std::vector<CodeTemplate> Result;
+  Result.push_back(std::move(CT));
+  return Result;
+}
+
 SnippetGeneratorFailure::SnippetGeneratorFailure(const llvm::Twine &S)
     : llvm::StringError(S, llvm::inconvertibleErrorCode()) {}
 
@@ -31,26 +37,28 @@ SnippetGenerator::~SnippetGenerator() = default;
 
 llvm::Expected<std::vector<BenchmarkCode>>
 SnippetGenerator::generateConfigurations(const Instruction &Instr) const {
-  if (auto E = generateCodeTemplate(Instr)) {
-    CodeTemplate &CT = E.get();
+  if (auto E = generateCodeTemplates(Instr)) {
     const auto &RATC = State.getRATC();
-    const llvm::BitVector &ForbiddenRegs =
-        CT.ScratchSpacePointerInReg
-            ? RATC.getRegister(CT.ScratchSpacePointerInReg).aliasedBits()
-            : RATC.emptyRegisters();
     std::vector<BenchmarkCode> Output;
-    // TODO: Generate as many BenchmarkCode as needed.
-    {
-      BenchmarkCode BC;
-      BC.Info = CT.Info;
-      for (InstructionTemplate &IT : CT.Instructions) {
-        randomizeUnsetVariables(ForbiddenRegs, IT);
-        BC.Instructions.push_back(IT.build());
+    for (CodeTemplate &CT : E.get()) {
+      const llvm::BitVector &ForbiddenRegs =
+          CT.ScratchSpacePointerInReg
+              ? RATC.getRegister(CT.ScratchSpacePointerInReg).aliasedBits()
+              : RATC.emptyRegisters();
+      // TODO: Generate as many BenchmarkCode as needed.
+      {
+        BenchmarkCode BC;
+        BC.Info = CT.Info;
+        for (InstructionTemplate &IT : CT.Instructions) {
+          randomizeUnsetVariables(ForbiddenRegs, IT);
+          BC.Instructions.push_back(IT.build());
+        }
+        if (CT.ScratchSpacePointerInReg)
+          BC.LiveIns.push_back(CT.ScratchSpacePointerInReg);
+        BC.RegisterInitialValues =
+            computeRegisterInitialValues(CT.Instructions);
+        Output.push_back(std::move(BC));
       }
-      if (CT.ScratchSpacePointerInReg)
-        BC.LiveIns.push_back(CT.ScratchSpacePointerInReg);
-      BC.RegisterInitialValues = computeRegisterInitialValues(CT.Instructions);
-      Output.push_back(std::move(BC));
     }
     return Output;
   } else
@@ -99,13 +107,14 @@ std::vector<RegisterValue> SnippetGenerator::computeRegisterInitialValues(
   return RIV;
 }
 
-llvm::Expected<CodeTemplate> SnippetGenerator::generateSelfAliasingCodeTemplate(
-    const Instruction &Instr) const {
+llvm::Expected<std::vector<CodeTemplate>>
+generateSelfAliasingCodeTemplates(const Instruction &Instr) {
   const AliasingConfigurations SelfAliasing(Instr, Instr);
-  if (SelfAliasing.empty()) {
+  if (SelfAliasing.empty())
     return llvm::make_error<SnippetGeneratorFailure>("empty self aliasing");
-  }
-  CodeTemplate CT;
+  std::vector<CodeTemplate> Result;
+  Result.emplace_back();
+  CodeTemplate &CT = Result.back();
   InstructionTemplate IT(Instr);
   if (SelfAliasing.hasImplicitAliasing()) {
     CT.Info = "implicit Self cycles, picking random values.";
@@ -116,16 +125,18 @@ llvm::Expected<CodeTemplate> SnippetGenerator::generateSelfAliasingCodeTemplate(
     setRandomAliasing(SelfAliasing, IT, IT);
   }
   CT.Instructions.push_back(std::move(IT));
-  return std::move(CT);
+  return Result;
 }
 
-llvm::Expected<CodeTemplate>
-SnippetGenerator::generateUnconstrainedCodeTemplate(const Instruction &Instr,
-                                                    llvm::StringRef Msg) const {
-  CodeTemplate CT;
+llvm::Expected<std::vector<CodeTemplate>>
+generateUnconstrainedCodeTemplates(const Instruction &Instr,
+                                   llvm::StringRef Msg) {
+  std::vector<CodeTemplate> Result;
+  Result.emplace_back();
+  CodeTemplate &CT = Result.back();
   CT.Info = llvm::formatv("{0}, repeating an unconstrained assignment", Msg);
   CT.Instructions.emplace_back(Instr);
-  return std::move(CT);
+  return Result;
 }
 
 std::mt19937 &randomGenerator() {
diff --git a/tools/llvm-exegesis/lib/SnippetGenerator.h b/tools/llvm-exegesis/lib/SnippetGenerator.h
index c9a19cd0eeb..e48cf0cfeb0 100644
--- a/tools/llvm-exegesis/lib/SnippetGenerator.h
+++ b/tools/llvm-exegesis/lib/SnippetGenerator.h
@@ -30,6 +30,17 @@
 
 namespace exegesis {
 
+std::vector<CodeTemplate> getSingleton(CodeTemplate &CT);
+
+// Generates code templates that has a self-dependency.
+llvm::Expected<std::vector<CodeTemplate>>
+generateSelfAliasingCodeTemplates(const Instruction &Instr);
+
+// Generates code templates without assignment constraints.
+llvm::Expected<std::vector<CodeTemplate>>
+generateUnconstrainedCodeTemplates(const Instruction &Instr,
+                                   llvm::StringRef Msg);
+
 // A class representing failures that happened during Benchmark, they are used
 // to report informations to the user.
 class SnippetGeneratorFailure : public llvm::StringError {
@@ -55,18 +66,10 @@ public:
 protected:
   const LLVMState &State;
 
-  // Generates a single code template that has a self-dependency.
-  llvm::Expected<CodeTemplate>
-  generateSelfAliasingCodeTemplate(const Instruction &Instr) const;
-  // Generates a single code template without assignment constraints.
-  llvm::Expected<CodeTemplate>
-  generateUnconstrainedCodeTemplate(const Instruction &Instr,
-                                    llvm::StringRef Msg) const;
-
 private:
   // API to be implemented by subclasses.
-  virtual llvm::Expected<CodeTemplate>
-  generateCodeTemplate(const Instruction &Instr) const = 0;
+  virtual llvm::Expected<std::vector<CodeTemplate>>
+  generateCodeTemplates(const Instruction &Instr) const = 0;
 };
 
 // A global Random Number Generator to randomize configurations.
diff --git a/tools/llvm-exegesis/lib/Uops.cpp b/tools/llvm-exegesis/lib/Uops.cpp
index 1a701d169eb..a3ada77ef8c 100644
--- a/tools/llvm-exegesis/lib/Uops.cpp
+++ b/tools/llvm-exegesis/lib/Uops.cpp
@@ -124,8 +124,8 @@ void UopsSnippetGenerator::instantiateMemoryOperands(
          "not enough scratch space");
 }
 
-llvm::Expected<CodeTemplate>
-UopsSnippetGenerator::generateCodeTemplate(const Instruction &Instr) const {
+llvm::Expected<std::vector<CodeTemplate>>
+UopsSnippetGenerator::generateCodeTemplates(const Instruction &Instr) const {
   CodeTemplate CT;
   const llvm::BitVector *ScratchSpaceAliasedRegs = nullptr;
   if (Instr.hasMemoryOperands()) {
@@ -153,13 +153,13 @@ UopsSnippetGenerator::generateCodeTemplate(const Instruction &Instr) const {
     CT.Info = "instruction is parallel, repeating a random one.";
     CT.Instructions.push_back(std::move(IT));
     instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
-    return std::move(CT);
+    return getSingleton(CT);
   }
   if (SelfAliasing.hasImplicitAliasing()) {
     CT.Info = "instruction is serial, repeating a random one.";
     CT.Instructions.push_back(std::move(IT));
     instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
-    return std::move(CT);
+    return getSingleton(CT);
   }
   const auto TiedVariables = getVariablesWithTiedOperands(Instr);
   if (!TiedVariables.empty()) {
@@ -181,7 +181,7 @@ UopsSnippetGenerator::generateCodeTemplate(const Instruction &Instr) const {
       CT.Instructions.push_back(std::move(TmpIT));
     }
     instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
-    return std::move(CT);
+    return getSingleton(CT);
   }
   const auto &ReservedRegisters = State.getRATC().reservedRegisters();
   // No tied variables, we pick random values for defs.
@@ -218,7 +218,7 @@ UopsSnippetGenerator::generateCodeTemplate(const Instruction &Instr) const {
       "instruction has no tied variables picking Uses different from defs";
   CT.Instructions.push_back(std::move(IT));
   instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
-  return std::move(CT);
+  return getSingleton(CT);
 }
 
 std::vector<BenchmarkMeasure>
diff --git a/tools/llvm-exegesis/lib/Uops.h b/tools/llvm-exegesis/lib/Uops.h
index 1cfa8242078..e6f6d4a09cb 100644
--- a/tools/llvm-exegesis/lib/Uops.h
+++ b/tools/llvm-exegesis/lib/Uops.h
@@ -25,8 +25,8 @@ public:
   UopsSnippetGenerator(const LLVMState &State) : SnippetGenerator(State) {}
   ~UopsSnippetGenerator() override;
 
-  llvm::Expected<CodeTemplate>
-  generateCodeTemplate(const Instruction &Instr) const override;
+  llvm::Expected<std::vector<CodeTemplate>>
+  generateCodeTemplates(const Instruction &Instr) const override;
 
   static constexpr const size_t kMinNumDifferentAddresses = 6;
 
diff --git a/tools/llvm-exegesis/lib/X86/Target.cpp b/tools/llvm-exegesis/lib/X86/Target.cpp
index 0e9a6de95ce..20bb65ebde5 100644
--- a/tools/llvm-exegesis/lib/X86/Target.cpp
+++ b/tools/llvm-exegesis/lib/X86/Target.cpp
@@ -38,14 +38,14 @@ class X86LatencySnippetGenerator : public LatencySnippetGenerator {
 public:
   using LatencySnippetGenerator::LatencySnippetGenerator;
 
-  llvm::Expected<CodeTemplate>
-  generateCodeTemplate(const Instruction &Instr) const override {
+  llvm::Expected<std::vector<CodeTemplate>>
+  generateCodeTemplates(const Instruction &Instr) const override {
     if (auto E = IsInvalidOpcode(Instr))
       return std::move(E);
 
     switch (GetX86FPFlags(Instr)) {
     case llvm::X86II::NotFP:
-      return LatencySnippetGenerator::generateCodeTemplate(Instr);
+      return LatencySnippetGenerator::generateCodeTemplates(Instr);
     case llvm::X86II::ZeroArgFP:
     case llvm::X86II::OneArgFP:
     case llvm::X86II::SpecialFP:
@@ -58,7 +58,7 @@ public:
       //   - `ST(0) = fsqrt(ST(0))` (OneArgFPRW)
       //   - `ST(0) = ST(0) + ST(i)` (TwoArgFP)
       // They are intrinsically serial and do not modify the state of the stack.
-      return generateSelfAliasingCodeTemplate(Instr);
+      return generateSelfAliasingCodeTemplates(Instr);
     default:
       llvm_unreachable("Unknown FP Type!");
     }
@@ -69,14 +69,14 @@ class X86UopsSnippetGenerator : public UopsSnippetGenerator {
 public:
   using UopsSnippetGenerator::UopsSnippetGenerator;
 
-  llvm::Expected<CodeTemplate>
-  generateCodeTemplate(const Instruction &Instr) const override {
+  llvm::Expected<std::vector<CodeTemplate>>
+  generateCodeTemplates(const Instruction &Instr) const override {
     if (auto E = IsInvalidOpcode(Instr))
       return std::move(E);
 
     switch (GetX86FPFlags(Instr)) {
     case llvm::X86II::NotFP:
-      return UopsSnippetGenerator::generateCodeTemplate(Instr);
+      return UopsSnippetGenerator::generateCodeTemplates(Instr);
     case llvm::X86II::ZeroArgFP:
     case llvm::X86II::OneArgFP:
     case llvm::X86II::SpecialFP:
@@ -88,12 +88,12 @@ public:
       //   - `ST(0) = ST(0) + ST(i)` (TwoArgFP)
       // They are intrinsically serial and do not modify the state of the stack.
       // We generate the same code for latency and uops.
-      return generateSelfAliasingCodeTemplate(Instr);
+      return generateSelfAliasingCodeTemplates(Instr);
     case llvm::X86II::CompareFP:
     case llvm::X86II::CondMovFP:
       // We can compute uops for any FP instruction that does not grow or shrink
       // the stack (either do not touch the stack or push as much as they pop).
-      return generateUnconstrainedCodeTemplate(
+      return generateUnconstrainedCodeTemplates(
           Instr, "instruction does not grow/shrink the FP stack");
     default:
       llvm_unreachable("Unknown FP Type!");
diff --git a/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp b/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp
index 4df489df06f..6cc24a02cfc 100644
--- a/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp
+++ b/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp
@@ -60,9 +60,11 @@ protected:
   CodeTemplate checkAndGetCodeTemplate(unsigned Opcode) {
     randomGenerator().seed(0); // Initialize seed.
     const Instruction Instr(State, Opcode);
-    auto CodeTemplateOrError = Generator.generateCodeTemplate(Instr);
+    auto CodeTemplateOrError = Generator.generateCodeTemplates(Instr);
     EXPECT_FALSE(CodeTemplateOrError.takeError()); // Valid configuration.
-    return std::move(CodeTemplateOrError.get());
+    auto &CodeTemplate = CodeTemplateOrError.get();
+    EXPECT_EQ(CodeTemplate.size(), 1U);
+    return std::move(CodeTemplate.front());
   }
 
   SnippetGeneratorT Generator;
@@ -240,7 +242,7 @@ TEST_F(UopsSnippetGeneratorTest, MemoryUse_Movsb) {
   // MOVSB writes to scratch memory register.
   const unsigned Opcode = llvm::X86::MOVSB;
   const Instruction Instr(State, Opcode);
-  auto Error = Generator.generateCodeTemplate(Instr).takeError();
+  auto Error = Generator.generateCodeTemplates(Instr).takeError();
   EXPECT_TRUE((bool)Error);
   llvm::consumeError(std::move(Error));
 }
@@ -254,8 +256,8 @@ public:
   }
 
 private:
-  llvm::Expected<CodeTemplate>
-  generateCodeTemplate(const Instruction &Instr) const override {
+  llvm::Expected<std::vector<CodeTemplate>>
+  generateCodeTemplates(const Instruction &Instr) const override {
     return llvm::make_error<llvm::StringError>("not implemented",
                                                llvm::inconvertibleErrorCode());
   }
-- 
GitLab


From dc5c9c28094836e848bb845d24a3b1e933342aaa Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Mon, 15 Oct 2018 09:17:09 +0000
Subject: [PATCH 0181/1116] [TI removal] Remove TerminatorInst as an input
 parameter from all public LLVM APIs. There weren't very many.

We still have the instruction visitor, and APIs with TerminatorInst as
a return type or an output parameter.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344494 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/CFG.h                     | 3 +--
 include/llvm/Transforms/Utils/BasicBlockUtils.h | 2 +-
 lib/Analysis/CFG.cpp                            | 3 ++-
 lib/Transforms/Utils/BreakCriticalEdges.cpp     | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/llvm/Analysis/CFG.h b/include/llvm/Analysis/CFG.h
index cccdd163741..caae0b6e2a8 100644
--- a/include/llvm/Analysis/CFG.h
+++ b/include/llvm/Analysis/CFG.h
@@ -25,7 +25,6 @@ class DominatorTree;
 class Function;
 class Instruction;
 class LoopInfo;
-class TerminatorInst;
 
 /// Analyze the specified function to find all of the loop backedges in the
 /// function and return them.  This is a relatively cheap (compared to
@@ -46,7 +45,7 @@ unsigned GetSuccessorNumber(const BasicBlock *BB, const BasicBlock *Succ);
 /// edges from a block with multiple successors to a block with multiple
 /// predecessors.
 ///
-bool isCriticalEdge(const TerminatorInst *TI, unsigned SuccNum,
+bool isCriticalEdge(const Instruction *TI, unsigned SuccNum,
                     bool AllowIdenticalEdges = false);
 
 /// Determine whether instruction 'To' is reachable from 'From',
diff --git a/include/llvm/Transforms/Utils/BasicBlockUtils.h b/include/llvm/Transforms/Utils/BasicBlockUtils.h
index dee1541f9d2..f0ba5c6c9c2 100644
--- a/include/llvm/Transforms/Utils/BasicBlockUtils.h
+++ b/include/llvm/Transforms/Utils/BasicBlockUtils.h
@@ -128,7 +128,7 @@ struct CriticalEdgeSplittingOptions {
 /// IndirectBrInst.  Splitting these edges will almost always create an invalid
 /// program because the address of the new block won't be the one that is jumped
 /// to.
-BasicBlock *SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum,
+BasicBlock *SplitCriticalEdge(Instruction *TI, unsigned SuccNum,
                               const CriticalEdgeSplittingOptions &Options =
                                   CriticalEdgeSplittingOptions());
 
diff --git a/lib/Analysis/CFG.cpp b/lib/Analysis/CFG.cpp
index a319be8092f..aaea5995429 100644
--- a/lib/Analysis/CFG.cpp
+++ b/lib/Analysis/CFG.cpp
@@ -85,8 +85,9 @@ unsigned llvm::GetSuccessorNumber(const BasicBlock *BB,
 /// isCriticalEdge - Return true if the specified edge is a critical edge.
 /// Critical edges are edges from a block with multiple successors to a block
 /// with multiple predecessors.
-bool llvm::isCriticalEdge(const TerminatorInst *TI, unsigned SuccNum,
+bool llvm::isCriticalEdge(const Instruction *TI, unsigned SuccNum,
                           bool AllowIdenticalEdges) {
+  assert(TI->isTerminator() && "Must be a terminator to have successors!");
   assert(SuccNum < TI->getNumSuccessors() && "Illegal edge specification!");
   if (TI->getNumSuccessors() == 1) return false;
 
diff --git a/lib/Transforms/Utils/BreakCriticalEdges.cpp b/lib/Transforms/Utils/BreakCriticalEdges.cpp
index 63b37e37943..c3d67087ae7 100644
--- a/lib/Transforms/Utils/BreakCriticalEdges.cpp
+++ b/lib/Transforms/Utils/BreakCriticalEdges.cpp
@@ -130,7 +130,7 @@ static void createPHIsForSplitLoopExit(ArrayRef<BasicBlock *> Preds,
 }
 
 BasicBlock *
-llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum,
+llvm::SplitCriticalEdge(Instruction *TI, unsigned SuccNum,
                         const CriticalEdgeSplittingOptions &Options) {
   if (!isCriticalEdge(TI, SuccNum, Options.MergeIdenticalEdges))
     return nullptr;
-- 
GitLab


From 7c0f083bcb9e916d3b65a7894db7dd602e2d8536 Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Mon, 15 Oct 2018 09:17:38 +0000
Subject: [PATCH 0182/1116] [TI removal] Remove a unnecessary use of
 `TerminatorInst` from an IR header. NFC.

Part of the removal of `TerminatorInst` from the type hierarchy.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344495 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/IR/CFG.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/llvm/IR/CFG.h b/include/llvm/IR/CFG.h
index fd384ef4949..4140c8a212e 100644
--- a/include/llvm/IR/CFG.h
+++ b/include/llvm/IR/CFG.h
@@ -73,7 +73,7 @@ public:
 
   inline reference operator*() const {
     assert(!It.atEnd() && "pred_iterator out of range!");
-    return cast<TerminatorInst>(*It)->getParent();
+    return cast<Instruction>(*It)->getParent();
   }
   inline pointer *operator->() const { return &operator*(); }
 
-- 
GitLab


From 9d078e56967c22e40709bd25deab73b981ee7f09 Mon Sep 17 00:00:00 2001
From: Guillaume Chatelet <gchatelet@google.com>
Date: Mon, 15 Oct 2018 09:21:21 +0000
Subject: [PATCH 0183/1116] [llvm-exegesis] Fix missing std::move.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344496 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-exegesis/lib/SnippetGenerator.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/llvm-exegesis/lib/SnippetGenerator.cpp b/tools/llvm-exegesis/lib/SnippetGenerator.cpp
index 9b577fd65a9..feee61d113c 100644
--- a/tools/llvm-exegesis/lib/SnippetGenerator.cpp
+++ b/tools/llvm-exegesis/lib/SnippetGenerator.cpp
@@ -125,7 +125,7 @@ generateSelfAliasingCodeTemplates(const Instruction &Instr) {
     setRandomAliasing(SelfAliasing, IT, IT);
   }
   CT.Instructions.push_back(std::move(IT));
-  return Result;
+  return std::move(Result);
 }
 
 llvm::Expected<std::vector<CodeTemplate>>
@@ -136,7 +136,7 @@ generateUnconstrainedCodeTemplates(const Instruction &Instr,
   CodeTemplate &CT = Result.back();
   CT.Info = llvm::formatv("{0}, repeating an unconstrained assignment", Msg);
   CT.Instructions.emplace_back(Instr);
-  return Result;
+  return std::move(Result);
 }
 
 std::mt19937 &randomGenerator() {
-- 
GitLab


From f2c212eed34f9b1f4fa601de8c02f31f6bfe3f48 Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Mon, 15 Oct 2018 09:33:40 +0000
Subject: [PATCH 0184/1116] [TI removal] Just use Instruction in the CFG
 printer code. NFC.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344497 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/CFGPrinter.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/llvm/Analysis/CFGPrinter.h b/include/llvm/Analysis/CFGPrinter.h
index a4b642b9ea3..5996dd90bcf 100644
--- a/include/llvm/Analysis/CFGPrinter.h
+++ b/include/llvm/Analysis/CFGPrinter.h
@@ -150,7 +150,7 @@ struct DOTGraphTraits<const Function*> : public DefaultDOTGraphTraits {
   /// Display the raw branch weights from PGO.
   std::string getEdgeAttributes(const BasicBlock *Node, succ_const_iterator I,
                                 const Function *F) {
-    const TerminatorInst *TI = Node->getTerminator();
+    const Instruction *TI = Node->getTerminator();
     if (TI->getNumSuccessors() == 1)
       return "";
 
-- 
GitLab


From ce1e09bcf53d60db89215c1800c1a6e2562d5b0b Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Mon, 15 Oct 2018 09:34:05 +0000
Subject: [PATCH 0185/1116] [TI removal] Remove `TerminatorInst` from
 BasicBlockUtils.h

This requires updating a number of .cpp files to adapt to the new API.
I've just systematically updated all uses of `TerminatorInst` within
these files te `Instruction` so thta I won't have to touch them again in
the future.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344498 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../llvm/Transforms/Utils/BasicBlockUtils.h   | 18 ++++++-------
 .../Instrumentation/AddressSanitizer.cpp      | 13 +++++-----
 .../Instrumentation/EfficiencySanitizer.cpp   |  2 +-
 .../Instrumentation/HWAddressSanitizer.cpp    |  2 +-
 lib/Transforms/Scalar/JumpThreading.cpp       | 16 ++++++------
 lib/Transforms/Utils/BasicBlockUtils.cpp      | 25 ++++++++++---------
 lib/Transforms/Utils/CallPromotionUtils.cpp   |  4 +--
 lib/Transforms/Utils/LibCallsShrinkWrap.cpp   |  2 +-
 lib/Transforms/Utils/LowerMemIntrinsics.cpp   |  2 +-
 9 files changed, 43 insertions(+), 41 deletions(-)

diff --git a/include/llvm/Transforms/Utils/BasicBlockUtils.h b/include/llvm/Transforms/Utils/BasicBlockUtils.h
index f0ba5c6c9c2..a0fc18825a5 100644
--- a/include/llvm/Transforms/Utils/BasicBlockUtils.h
+++ b/include/llvm/Transforms/Utils/BasicBlockUtils.h
@@ -148,7 +148,7 @@ inline bool SplitCriticalEdge(BasicBlock *Succ, pred_iterator PI,
                               const CriticalEdgeSplittingOptions &Options =
                                   CriticalEdgeSplittingOptions()) {
   bool MadeChange = false;
-  TerminatorInst *TI = (*PI)->getTerminator();
+  Instruction *TI = (*PI)->getTerminator();
   for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
     if (TI->getSuccessor(i) == Succ)
       MadeChange |= !!SplitCriticalEdge(TI, i, Options);
@@ -162,7 +162,7 @@ inline BasicBlock *
 SplitCriticalEdge(BasicBlock *Src, BasicBlock *Dst,
                   const CriticalEdgeSplittingOptions &Options =
                       CriticalEdgeSplittingOptions()) {
-  TerminatorInst *TI = Src->getTerminator();
+  Instruction *TI = Src->getTerminator();
   unsigned i = 0;
   while (true) {
     assert(i != TI->getNumSuccessors() && "Edge doesn't exist!");
@@ -257,11 +257,11 @@ ReturnInst *FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB,
 /// Returns the NewBasicBlock's terminator.
 ///
 /// Updates DT and LI if given.
-TerminatorInst *SplitBlockAndInsertIfThen(Value *Cond, Instruction *SplitBefore,
-                                          bool Unreachable,
-                                          MDNode *BranchWeights = nullptr,
-                                          DominatorTree *DT = nullptr,
-                                          LoopInfo *LI = nullptr);
+Instruction *SplitBlockAndInsertIfThen(Value *Cond, Instruction *SplitBefore,
+                                       bool Unreachable,
+                                       MDNode *BranchWeights = nullptr,
+                                       DominatorTree *DT = nullptr,
+                                       LoopInfo *LI = nullptr);
 
 /// SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen,
 /// but also creates the ElseBlock.
@@ -278,8 +278,8 @@ TerminatorInst *SplitBlockAndInsertIfThen(Value *Cond, Instruction *SplitBefore,
 ///   SplitBefore
 ///   Tail
 void SplitBlockAndInsertIfThenElse(Value *Cond, Instruction *SplitBefore,
-                                   TerminatorInst **ThenTerm,
-                                   TerminatorInst **ElseTerm,
+                                   Instruction **ThenTerm,
+                                   Instruction **ElseTerm,
                                    MDNode *BranchWeights = nullptr);
 
 /// Check whether BB is the merge point of a if-region.
diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index b832417154e..ad07b608934 100644
--- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -1443,7 +1443,7 @@ static void instrumentMaskedLoadOrStore(AddressSanitizer *Pass,
     } else {
       IRBuilder<> IRB(I);
       Value *MaskElem = IRB.CreateExtractElement(Mask, Idx);
-      TerminatorInst *ThenTerm = SplitBlockAndInsertIfThen(MaskElem, I, false);
+      Instruction *ThenTerm = SplitBlockAndInsertIfThen(MaskElem, I, false);
       InsertBefore = ThenTerm;
     }
 
@@ -1596,8 +1596,9 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
     Value *TagCheck =
         IRB.CreateICmpEQ(Tag, ConstantInt::get(IntptrTy, kMyriadDDRTag));
 
-    TerminatorInst *TagCheckTerm = SplitBlockAndInsertIfThen(
-        TagCheck, InsertBefore, false, MDBuilder(*C).createBranchWeights(1, 100000));
+    Instruction *TagCheckTerm =
+        SplitBlockAndInsertIfThen(TagCheck, InsertBefore, false,
+                                  MDBuilder(*C).createBranchWeights(1, 100000));
     assert(cast<BranchInst>(TagCheckTerm)->isUnconditional());
     IRB.SetInsertPoint(TagCheckTerm);
     InsertBefore = TagCheckTerm;
@@ -1613,12 +1614,12 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
 
   Value *Cmp = IRB.CreateICmpNE(ShadowValue, CmpVal);
   size_t Granularity = 1ULL << Mapping.Scale;
-  TerminatorInst *CrashTerm = nullptr;
+  Instruction *CrashTerm = nullptr;
 
   if (ClAlwaysSlowPath || (TypeSize < 8 * Granularity)) {
     // We use branch weights for the slow path check, to indicate that the slow
     // path is rarely taken. This seems to be the case for SPEC benchmarks.
-    TerminatorInst *CheckTerm = SplitBlockAndInsertIfThen(
+    Instruction *CheckTerm = SplitBlockAndInsertIfThen(
         Cmp, InsertBefore, false, MDBuilder(*C).createBranchWeights(1, 100000));
     assert(cast<BranchInst>(CheckTerm)->isUnconditional());
     BasicBlock *NextBB = CheckTerm->getSuccessor(0);
@@ -3116,7 +3117,7 @@ void FunctionStackPoisoner::processStaticAllocas() {
       //     <This is not a fake stack; unpoison the redzones>
       Value *Cmp =
           IRBRet.CreateICmpNE(FakeStack, Constant::getNullValue(IntptrTy));
-      TerminatorInst *ThenTerm, *ElseTerm;
+      Instruction *ThenTerm, *ElseTerm;
       SplitBlockAndInsertIfThenElse(Cmp, Ret, &ThenTerm, &ElseTerm);
 
       IRBuilder<> IRBPoison(ThenTerm);
diff --git a/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp b/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
index 0ab915de60d..db438e78ded 100644
--- a/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
+++ b/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
@@ -887,7 +887,7 @@ bool EfficiencySanitizer::instrumentFastpathWorkingSet(
   Value *OldValue = IRB.CreateLoad(IRB.CreateIntToPtr(ShadowPtr, ShadowPtrTy));
   // The AND and CMP will be turned into a TEST instruction by the compiler.
   Value *Cmp = IRB.CreateICmpNE(IRB.CreateAnd(OldValue, ValueMask), ValueMask);
-  TerminatorInst *CmpTerm = SplitBlockAndInsertIfThen(Cmp, I, false);
+  Instruction *CmpTerm = SplitBlockAndInsertIfThen(Cmp, I, false);
   // FIXME: do I need to call SetCurrentDebugLocation?
   IRB.SetInsertPoint(CmpTerm);
   // We use OR to set the shadow bits to avoid corrupting the middle 6 bits,
diff --git a/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index 510b1b058d0..63bd8ee35c6 100644
--- a/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -467,7 +467,7 @@ void HWAddressSanitizer::instrumentMemAccessInline(Value *PtrLong, bool IsWrite,
     TagMismatch = IRB.CreateAnd(TagMismatch, TagNotIgnored);
   }
 
-  TerminatorInst *CheckTerm =
+  Instruction *CheckTerm =
       SplitBlockAndInsertIfThen(TagMismatch, InsertBefore, !Recover,
                                 MDBuilder(*C).createBranchWeights(1, 100000));
 
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
index 1fc8f3988f9..849ff71e198 100644
--- a/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -947,7 +947,7 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
 /// Since we can pick an arbitrary destination, we pick the successor with the
 /// fewest predecessors.  This should reduce the in-degree of the others.
 static unsigned GetBestDestForJumpOnUndef(BasicBlock *BB) {
-  TerminatorInst *BBTerm = BB->getTerminator();
+  Instruction *BBTerm = BB->getTerminator();
   unsigned MinSucc = 0;
   BasicBlock *TestBB = BBTerm->getSuccessor(MinSucc);
   // Compute the successor with the minimum number of predecessors.
@@ -988,7 +988,7 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
   // because now the condition in this block can be threaded through
   // predecessors of our predecessor block.
   if (BasicBlock *SinglePred = BB->getSinglePredecessor()) {
-    const TerminatorInst *TI = SinglePred->getTerminator();
+    const Instruction *TI = SinglePred->getTerminator();
     if (!TI->isExceptionalTerminator() && TI->getNumSuccessors() == 1 &&
         SinglePred != BB && !hasAddressTakenAndUsed(BB)) {
       // If SinglePred was a loop header, BB becomes one.
@@ -1080,7 +1080,7 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
     std::vector<DominatorTree::UpdateType> Updates;
 
     // Fold the branch/switch.
-    TerminatorInst *BBTerm = BB->getTerminator();
+    Instruction *BBTerm = BB->getTerminator();
     Updates.reserve(BBTerm->getNumSuccessors());
     for (unsigned i = 0, e = BBTerm->getNumSuccessors(); i != e; ++i) {
       if (i == BestSucc) continue;
@@ -1549,7 +1549,7 @@ FindMostPopularDest(BasicBlock *BB,
   // successor list.
   if (!SamePopularity.empty()) {
     SamePopularity.push_back(MostPopularDest);
-    TerminatorInst *TI = BB->getTerminator();
+    Instruction *TI = BB->getTerminator();
     for (unsigned i = 0; ; ++i) {
       assert(i != TI->getNumSuccessors() && "Didn't find any successor!");
 
@@ -1669,7 +1669,7 @@ bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
       }
 
       // Finally update the terminator.
-      TerminatorInst *Term = BB->getTerminator();
+      Instruction *Term = BB->getTerminator();
       BranchInst::Create(OnlyDest, Term);
       Term->eraseFromParent();
       DTU->applyUpdates(Updates);
@@ -2006,7 +2006,7 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB,
   // Update the terminator of PredBB to jump to NewBB instead of BB.  This
   // eliminates predecessors from BB, which requires us to simplify any PHI
   // nodes in BB.
-  TerminatorInst *PredTerm = PredBB->getTerminator();
+  Instruction *PredTerm = PredBB->getTerminator();
   for (unsigned i = 0, e = PredTerm->getNumSuccessors(); i != e; ++i)
     if (PredTerm->getSuccessor(i) == BB) {
       BB->removePredecessor(PredBB, true);
@@ -2115,7 +2115,7 @@ BasicBlock *JumpThreadingPass::SplitBlockPreds(BasicBlock *BB,
 }
 
 bool JumpThreadingPass::doesBlockHaveProfileData(BasicBlock *BB) {
-  const TerminatorInst *TI = BB->getTerminator();
+  const Instruction *TI = BB->getTerminator();
   assert(TI->getNumSuccessors() > 1 && "not a split");
 
   MDNode *WeightsNode = TI->getMetadata(LLVMContext::MD_prof);
@@ -2538,7 +2538,7 @@ bool JumpThreadingPass::TryToUnfoldSelectInCurrBB(BasicBlock *BB) {
     if (!SI)
       continue;
     // Expand the select.
-    TerminatorInst *Term =
+    Instruction *Term =
         SplitBlockAndInsertIfThen(SI->getCondition(), SI, false);
     BasicBlock *SplitBB = SI->getParent();
     BasicBlock *NewBB = Term->getParent();
diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp
index 99914fcf81b..11a0114150f 100644
--- a/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -52,7 +52,7 @@ void llvm::DeleteDeadBlock(BasicBlock *BB, DomTreeUpdater *DTU) {
   assert((pred_begin(BB) == pred_end(BB) ||
          // Can delete self loop.
          BB->getSinglePredecessor() == BB) && "Block is not dead!");
-  TerminatorInst *BBTerm = BB->getTerminator();
+  Instruction *BBTerm = BB->getTerminator();
   std::vector<DominatorTree::UpdateType> Updates;
 
   // Loop through all of our successors and make sure they know that one
@@ -270,7 +270,7 @@ BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, DominatorTree *DT,
   unsigned SuccNum = GetSuccessorNumber(BB, Succ);
 
   // If this is a critical edge, let SplitCriticalEdge do it.
-  TerminatorInst *LatchTerm = BB->getTerminator();
+  Instruction *LatchTerm = BB->getTerminator();
   if (SplitCriticalEdge(
           LatchTerm, SuccNum,
           CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA()))
@@ -298,7 +298,7 @@ llvm::SplitAllCriticalEdges(Function &F,
                             const CriticalEdgeSplittingOptions &Options) {
   unsigned NumBroken = 0;
   for (BasicBlock &BB : F) {
-    TerminatorInst *TI = BB.getTerminator();
+    Instruction *TI = BB.getTerminator();
     if (TI->getNumSuccessors() > 1 && !isa<IndirectBrInst>(TI))
       for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
         if (SplitCriticalEdge(TI, i, Options))
@@ -705,16 +705,17 @@ ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB,
   return cast<ReturnInst>(NewRet);
 }
 
-TerminatorInst *
-llvm::SplitBlockAndInsertIfThen(Value *Cond, Instruction *SplitBefore,
-                                bool Unreachable, MDNode *BranchWeights,
-                                DominatorTree *DT, LoopInfo *LI) {
+Instruction *llvm::SplitBlockAndInsertIfThen(Value *Cond,
+                                             Instruction *SplitBefore,
+                                             bool Unreachable,
+                                             MDNode *BranchWeights,
+                                             DominatorTree *DT, LoopInfo *LI) {
   BasicBlock *Head = SplitBefore->getParent();
   BasicBlock *Tail = Head->splitBasicBlock(SplitBefore->getIterator());
-  TerminatorInst *HeadOldTerm = Head->getTerminator();
+  Instruction *HeadOldTerm = Head->getTerminator();
   LLVMContext &C = Head->getContext();
   BasicBlock *ThenBlock = BasicBlock::Create(C, "", Head->getParent(), Tail);
-  TerminatorInst *CheckTerm;
+  Instruction *CheckTerm;
   if (Unreachable)
     CheckTerm = new UnreachableInst(C, ThenBlock);
   else
@@ -749,12 +750,12 @@ llvm::SplitBlockAndInsertIfThen(Value *Cond, Instruction *SplitBefore,
 }
 
 void llvm::SplitBlockAndInsertIfThenElse(Value *Cond, Instruction *SplitBefore,
-                                         TerminatorInst **ThenTerm,
-                                         TerminatorInst **ElseTerm,
+                                         Instruction **ThenTerm,
+                                         Instruction **ElseTerm,
                                          MDNode *BranchWeights) {
   BasicBlock *Head = SplitBefore->getParent();
   BasicBlock *Tail = Head->splitBasicBlock(SplitBefore->getIterator());
-  TerminatorInst *HeadOldTerm = Head->getTerminator();
+  Instruction *HeadOldTerm = Head->getTerminator();
   LLVMContext &C = Head->getContext();
   BasicBlock *ThenBlock = BasicBlock::Create(C, "", Head->getParent(), Tail);
   BasicBlock *ElseBlock = BasicBlock::Create(C, "", Head->getParent(), Tail);
diff --git a/lib/Transforms/Utils/CallPromotionUtils.cpp b/lib/Transforms/Utils/CallPromotionUtils.cpp
index 261ab87c3e7..4db579156d9 100644
--- a/lib/Transforms/Utils/CallPromotionUtils.cpp
+++ b/lib/Transforms/Utils/CallPromotionUtils.cpp
@@ -270,8 +270,8 @@ static Instruction *versionCallSite(CallSite CS, Value *Callee,
   // Create an if-then-else structure. The original instruction is moved into
   // the "else" block, and a clone of the original instruction is placed in the
   // "then" block.
-  TerminatorInst *ThenTerm = nullptr;
-  TerminatorInst *ElseTerm = nullptr;
+  Instruction *ThenTerm = nullptr;
+  Instruction *ElseTerm = nullptr;
   SplitBlockAndInsertIfThenElse(Cond, CS.getInstruction(), &ThenTerm, &ElseTerm,
                                 BranchWeights);
   BasicBlock *ThenBlock = ThenTerm->getParent();
diff --git a/lib/Transforms/Utils/LibCallsShrinkWrap.cpp b/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
index 9832a6f24e1..e1592c86763 100644
--- a/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
+++ b/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
@@ -487,7 +487,7 @@ void LibCallsShrinkWrap::shrinkWrapCI(CallInst *CI, Value *Cond) {
   MDNode *BranchWeights =
       MDBuilder(CI->getContext()).createBranchWeights(1, 2000);
 
-  TerminatorInst *NewInst =
+  Instruction *NewInst =
       SplitBlockAndInsertIfThen(Cond, CI, false, BranchWeights, DT);
   BasicBlock *CallBB = NewInst->getParent();
   CallBB->setName("cdce.call");
diff --git a/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index 03006ef3a2d..661b4fa5bcb 100644
--- a/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -301,7 +301,7 @@ static void createMemMoveLoop(Instruction *InsertBefore,
   // the appropriate conditional branches when the loop is built.
   ICmpInst *PtrCompare = new ICmpInst(InsertBefore, ICmpInst::ICMP_ULT,
                                       SrcAddr, DstAddr, "compare_src_dst");
-  TerminatorInst *ThenTerm, *ElseTerm;
+  Instruction *ThenTerm, *ElseTerm;
   SplitBlockAndInsertIfThenElse(PtrCompare, InsertBefore, &ThenTerm,
                                 &ElseTerm);
 
-- 
GitLab


From ac346921b5695fdac3883c83262295399b7df0ca Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Mon, 15 Oct 2018 09:34:31 +0000
Subject: [PATCH 0186/1116] [TI removal] Remove a dead forward declaration of
 TerminatorInst. NFC.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344499 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Transforms/Scalar.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/llvm/Transforms/Scalar.h b/include/llvm/Transforms/Scalar.h
index 9491e1bbac9..fe4ff621c6f 100644
--- a/include/llvm/Transforms/Scalar.h
+++ b/include/llvm/Transforms/Scalar.h
@@ -26,7 +26,6 @@ class ModulePass;
 class Pass;
 class GetElementPtrInst;
 class PassInfo;
-class TerminatorInst;
 class TargetLowering;
 class TargetMachine;
 
-- 
GitLab


From fc6649b88c16738a266813e6ce32e022d16a5439 Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Mon, 15 Oct 2018 09:47:26 +0000
Subject: [PATCH 0187/1116] [TI removal] Remove `TerminatorInst` from
 SparsePropagation.h and related code.

This is simple as we just need to replace the type and move to the
concept of visiting a "terminator" rather than a specific instruction
subclass.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344500 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/SparsePropagation.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/include/llvm/Analysis/SparsePropagation.h b/include/llvm/Analysis/SparsePropagation.h
index 04e94f7cd52..02a2e64268b 100644
--- a/include/llvm/Analysis/SparsePropagation.h
+++ b/include/llvm/Analysis/SparsePropagation.h
@@ -189,12 +189,12 @@ private:
 
   /// getFeasibleSuccessors - Return a vector of booleans to indicate which
   /// successors are reachable from a given terminator instruction.
-  void getFeasibleSuccessors(TerminatorInst &TI, SmallVectorImpl<bool> &Succs,
+  void getFeasibleSuccessors(Instruction &TI, SmallVectorImpl<bool> &Succs,
                              bool AggressiveUndef);
 
   void visitInst(Instruction &I);
   void visitPHINode(PHINode &I);
-  void visitTerminatorInst(TerminatorInst &TI);
+  void visitTerminator(Instruction &TI);
 };
 
 //===----------------------------------------------------------------------===//
@@ -286,7 +286,7 @@ void SparseSolver<LatticeKey, LatticeVal, KeyInfo>::markEdgeExecutable(
 
 template <class LatticeKey, class LatticeVal, class KeyInfo>
 void SparseSolver<LatticeKey, LatticeVal, KeyInfo>::getFeasibleSuccessors(
-    TerminatorInst &TI, SmallVectorImpl<bool> &Succs, bool AggressiveUndef) {
+    Instruction &TI, SmallVectorImpl<bool> &Succs, bool AggressiveUndef) {
   Succs.resize(TI.getNumSuccessors());
   if (TI.getNumSuccessors() == 0)
     return;
@@ -374,7 +374,7 @@ template <class LatticeKey, class LatticeVal, class KeyInfo>
 bool SparseSolver<LatticeKey, LatticeVal, KeyInfo>::isEdgeFeasible(
     BasicBlock *From, BasicBlock *To, bool AggressiveUndef) {
   SmallVector<bool, 16> SuccFeasible;
-  TerminatorInst *TI = From->getTerminator();
+  Instruction *TI = From->getTerminator();
   getFeasibleSuccessors(*TI, SuccFeasible, AggressiveUndef);
 
   for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
@@ -385,8 +385,8 @@ bool SparseSolver<LatticeKey, LatticeVal, KeyInfo>::isEdgeFeasible(
 }
 
 template <class LatticeKey, class LatticeVal, class KeyInfo>
-void SparseSolver<LatticeKey, LatticeVal, KeyInfo>::visitTerminatorInst(
-    TerminatorInst &TI) {
+void SparseSolver<LatticeKey, LatticeVal, KeyInfo>::visitTerminator(
+    Instruction &TI) {
   SmallVector<bool, 16> SuccFeasible;
   getFeasibleSuccessors(TI, SuccFeasible, true);
 
@@ -465,8 +465,8 @@ void SparseSolver<LatticeKey, LatticeVal, KeyInfo>::visitInst(Instruction &I) {
     if (ChangedValue.second != LatticeFunc->getUntrackedVal())
       UpdateState(ChangedValue.first, ChangedValue.second);
 
-  if (TerminatorInst *TI = dyn_cast<TerminatorInst>(&I))
-    visitTerminatorInst(*TI);
+  if (I.isTerminator())
+    visitTerminator(I);
 }
 
 template <class LatticeKey, class LatticeVal, class KeyInfo>
-- 
GitLab


From aa517f562fbd28786bcdf69f9e8f365e791fad80 Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Mon, 15 Oct 2018 10:00:15 +0000
Subject: [PATCH 0188/1116] [TI removal] Remove `TerminatorInst` from GVN.h and
 GVN.cpp.

This is the last interesting usage in all of LLVM's headers. The
remaining usages in headers are the core typesystem bits (Core.h,
instruction types, and InstVisitor) and as the return of
`BasicBlock::getTerminator`. The latter is the big remaining API point
that I'll remove after mass updates to user code.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344501 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Transforms/Scalar/GVN.h | 2 +-
 lib/Transforms/Scalar/GVN.cpp        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/llvm/Transforms/Scalar/GVN.h b/include/llvm/Transforms/Scalar/GVN.h
index c01a1d77e96..784de7f9fe2 100644
--- a/include/llvm/Transforms/Scalar/GVN.h
+++ b/include/llvm/Transforms/Scalar/GVN.h
@@ -237,7 +237,7 @@ private:
   }
 
   // List of critical edges to be split between iterations.
-  SmallVector<std::pair<TerminatorInst *, unsigned>, 4> toSplit;
+  SmallVector<std::pair<Instruction *, unsigned>, 4> toSplit;
 
   // Helper functions of redundant load elimination
   bool processLoad(LoadInst *L);
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index d6c2824a299..c080c2a1813 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -2341,7 +2341,7 @@ bool GVN::splitCriticalEdges() {
   if (toSplit.empty())
     return false;
   do {
-    std::pair<TerminatorInst*, unsigned> Edge = toSplit.pop_back_val();
+    std::pair<Instruction *, unsigned> Edge = toSplit.pop_back_val();
     SplitCriticalEdge(Edge.first, Edge.second,
                       CriticalEdgeSplittingOptions(DT));
   } while (!toSplit.empty());
-- 
GitLab


From 2aaf7228e0e39c5afc21788b832430f0154f185b Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Mon, 15 Oct 2018 10:04:59 +0000
Subject: [PATCH 0189/1116] [TI removal] Make variables declared as
 `TerminatorInst` and initialized by `getTerminator()` calls instead be
 declared as `Instruction`.

This is the biggest remaining chunk of the usage of `getTerminator()`
that insists on the narrow type and so is an easy batch of updates.
Several files saw more extensive updates where this would cascade to
requiring API updates within the file to use `Instruction` instead of
`TerminatorInst`. All of these were trivial in nature (pervasively using
`Instruction` instead just worked).

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344502 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Analysis/BranchProbabilityInfo.cpp        | 10 +--
 lib/Analysis/CFG.cpp                          |  2 +-
 lib/Analysis/EHPersonalities.cpp              |  2 +-
 lib/Analysis/InlineCost.cpp                   |  2 +-
 lib/Analysis/LoopInfo.cpp                     |  4 +-
 lib/Analysis/MemorySSAUpdater.cpp             |  2 +-
 lib/Analysis/ScalarEvolution.cpp              |  2 +-
 lib/CodeGen/Analysis.cpp                      |  2 +-
 lib/CodeGen/SelectionDAG/FastISel.cpp         |  2 +-
 .../SelectionDAG/SelectionDAGBuilder.cpp      |  2 +-
 lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp |  2 +-
 lib/CodeGen/WinEHPrepare.cpp                  |  4 +-
 lib/IR/BasicBlock.cpp                         |  4 +-
 lib/IR/Dominators.cpp                         |  2 +-
 lib/IR/SafepointIRVerifier.cpp                |  2 +-
 lib/IR/Verifier.cpp                           |  2 +-
 lib/Target/AMDGPU/SIAnnotateControlFlow.cpp   |  2 +-
 lib/Target/NVPTX/NVPTXAllocaHoisting.cpp      |  2 +-
 lib/Target/PowerPC/PPCISelDAGToDAG.cpp        |  2 +-
 .../WebAssemblyLowerEmscriptenEHSjLj.cpp      |  2 +-
 lib/Target/X86/X86WinEHState.cpp              |  2 +-
 lib/Transforms/IPO/HotColdSplitting.cpp       |  2 +-
 lib/Transforms/IPO/LoopExtractor.cpp          |  4 +-
 lib/Transforms/IPO/PartialInlining.cpp        |  2 +-
 lib/Transforms/IPO/PruneEH.cpp                |  2 +-
 lib/Transforms/IPO/SampleProfile.cpp          |  2 +-
 .../InstCombine/InstCombineCalls.cpp          |  2 +-
 lib/Transforms/InstCombine/InstCombinePHI.cpp |  4 +-
 .../InstCombine/InstructionCombining.cpp      |  4 +-
 lib/Transforms/Instrumentation/CFGMST.h       |  2 +-
 .../Instrumentation/GCOVProfiling.cpp         |  6 +-
 .../Instrumentation/PGOInstrumentation.cpp    |  8 +-
 lib/Transforms/Scalar/ADCE.cpp                |  8 +-
 lib/Transforms/Scalar/CallSiteSplitting.cpp   |  2 +-
 .../Scalar/DeadStoreElimination.cpp           |  2 +-
 lib/Transforms/Scalar/LoopUnrollPass.cpp      |  2 +-
 lib/Transforms/Scalar/LoopUnswitch.cpp        | 21 +++--
 lib/Transforms/Scalar/PlaceSafepoints.cpp     |  6 +-
 .../Scalar/RewriteStatepointsForGC.cpp        | 12 +--
 lib/Transforms/Scalar/SCCP.cpp                |  2 +-
 lib/Transforms/Scalar/SROA.cpp                |  4 +-
 lib/Transforms/Scalar/SimpleLoopUnswitch.cpp  |  2 +-
 lib/Transforms/Scalar/StructurizeCFG.cpp      |  2 +-
 .../Scalar/TailRecursionElimination.cpp       |  2 +-
 lib/Transforms/Utils/BreakCriticalEdges.cpp   |  2 +-
 lib/Transforms/Utils/CloneFunction.cpp        |  4 +-
 lib/Transforms/Utils/CodeExtractor.cpp        |  8 +-
 lib/Transforms/Utils/EscapeEnumerator.cpp     |  2 +-
 lib/Transforms/Utils/FlattenCFG.cpp           |  6 +-
 lib/Transforms/Utils/FunctionComparator.cpp   |  6 +-
 lib/Transforms/Utils/InlineFunction.cpp       |  2 +-
 lib/Transforms/Utils/Local.cpp                |  6 +-
 lib/Transforms/Utils/LoopRotationUtils.cpp    |  2 +-
 lib/Transforms/Utils/LoopSimplify.cpp         |  4 +-
 lib/Transforms/Utils/LoopUnroll.cpp           |  2 +-
 lib/Transforms/Utils/LoopUnrollAndJam.cpp     |  2 +-
 lib/Transforms/Utils/SimplifyCFG.cpp          | 80 +++++++++----------
 lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp |  2 +-
 58 files changed, 143 insertions(+), 144 deletions(-)

diff --git a/lib/Analysis/BranchProbabilityInfo.cpp b/lib/Analysis/BranchProbabilityInfo.cpp
index 54a657073f0..7f544b27fe9 100644
--- a/lib/Analysis/BranchProbabilityInfo.cpp
+++ b/lib/Analysis/BranchProbabilityInfo.cpp
@@ -135,7 +135,7 @@ static const uint32_t IH_NONTAKEN_WEIGHT = 1;
 /// Add \p BB to PostDominatedByUnreachable set if applicable.
 void
 BranchProbabilityInfo::updatePostDominatedByUnreachable(const BasicBlock *BB) {
-  const TerminatorInst *TI = BB->getTerminator();
+  const Instruction *TI = BB->getTerminator();
   if (TI->getNumSuccessors() == 0) {
     if (isa<UnreachableInst>(TI) ||
         // If this block is terminated by a call to
@@ -167,7 +167,7 @@ BranchProbabilityInfo::updatePostDominatedByUnreachable(const BasicBlock *BB) {
 void
 BranchProbabilityInfo::updatePostDominatedByColdCall(const BasicBlock *BB) {
   assert(!PostDominatedByColdCall.count(BB));
-  const TerminatorInst *TI = BB->getTerminator();
+  const Instruction *TI = BB->getTerminator();
   if (TI->getNumSuccessors() == 0)
     return;
 
@@ -202,7 +202,7 @@ BranchProbabilityInfo::updatePostDominatedByColdCall(const BasicBlock *BB) {
 /// Predict that a successor which leads necessarily to an
 /// unreachable-terminated block as extremely unlikely.
 bool BranchProbabilityInfo::calcUnreachableHeuristics(const BasicBlock *BB) {
-  const TerminatorInst *TI = BB->getTerminator();
+  const Instruction *TI = BB->getTerminator();
   (void) TI;
   assert(TI->getNumSuccessors() > 1 && "expected more than one successor!");
   assert(!isa<InvokeInst>(TI) &&
@@ -246,7 +246,7 @@ bool BranchProbabilityInfo::calcUnreachableHeuristics(const BasicBlock *BB) {
 // heuristic. The probability of the edge coming to unreachable block is
 // set to min of metadata and unreachable heuristic.
 bool BranchProbabilityInfo::calcMetadataWeights(const BasicBlock *BB) {
-  const TerminatorInst *TI = BB->getTerminator();
+  const Instruction *TI = BB->getTerminator();
   assert(TI->getNumSuccessors() > 1 && "expected more than one successor!");
   if (!(isa<BranchInst>(TI) || isa<SwitchInst>(TI) || isa<IndirectBrInst>(TI)))
     return false;
@@ -348,7 +348,7 @@ bool BranchProbabilityInfo::calcMetadataWeights(const BasicBlock *BB) {
 /// Return true if we could compute the weights for cold edges.
 /// Return false, otherwise.
 bool BranchProbabilityInfo::calcColdCallHeuristics(const BasicBlock *BB) {
-  const TerminatorInst *TI = BB->getTerminator();
+  const Instruction *TI = BB->getTerminator();
   (void) TI;
   assert(TI->getNumSuccessors() > 1 && "expected more than one successor!");
   assert(!isa<InvokeInst>(TI) &&
diff --git a/lib/Analysis/CFG.cpp b/lib/Analysis/CFG.cpp
index aaea5995429..aa880a62b75 100644
--- a/lib/Analysis/CFG.cpp
+++ b/lib/Analysis/CFG.cpp
@@ -71,7 +71,7 @@ void llvm::FindFunctionBackedges(const Function &F,
 /// successor.
 unsigned llvm::GetSuccessorNumber(const BasicBlock *BB,
     const BasicBlock *Succ) {
-  const TerminatorInst *Term = BB->getTerminator();
+  const Instruction *Term = BB->getTerminator();
 #ifndef NDEBUG
   unsigned e = Term->getNumSuccessors();
 #endif
diff --git a/lib/Analysis/EHPersonalities.cpp b/lib/Analysis/EHPersonalities.cpp
index 2d35a3fa911..0df73aeebbd 100644
--- a/lib/Analysis/EHPersonalities.cpp
+++ b/lib/Analysis/EHPersonalities.cpp
@@ -120,7 +120,7 @@ DenseMap<BasicBlock *, ColorVector> llvm::colorEHFunclets(Function &F) {
                            << "\'.\n");
 
     BasicBlock *SuccColor = Color;
-    TerminatorInst *Terminator = Visiting->getTerminator();
+    Instruction *Terminator = Visiting->getTerminator();
     if (auto *CatchRet = dyn_cast<CatchReturnInst>(Terminator)) {
       Value *ParentPad = CatchRet->getCatchSwitchParentPad();
       if (isa<ConstantTokenNone>(ParentPad))
diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp
index fb032e0404c..1b5150a0d18 100644
--- a/lib/Analysis/InlineCost.cpp
+++ b/lib/Analysis/InlineCost.cpp
@@ -1831,7 +1831,7 @@ InlineResult CallAnalyzer::analyzeCall(CallSite CS) {
     if (!IR)
       return IR;
 
-    TerminatorInst *TI = BB->getTerminator();
+    Instruction *TI = BB->getTerminator();
 
     // Add in the live successors by first checking whether we have terminator
     // that may be simplified based on the values simplified by this call.
diff --git a/lib/Analysis/LoopInfo.cpp b/lib/Analysis/LoopInfo.cpp
index 99ff25a3fd3..4b174b66d1e 100644
--- a/lib/Analysis/LoopInfo.cpp
+++ b/lib/Analysis/LoopInfo.cpp
@@ -219,7 +219,7 @@ MDNode *Loop::getLoopID() const {
   SmallVector<BasicBlock *, 4> LatchesBlocks;
   getLoopLatches(LatchesBlocks);
   for (BasicBlock *BB : LatchesBlocks) {
-    TerminatorInst *TI = BB->getTerminator();
+    Instruction *TI = BB->getTerminator();
     MDNode *MD = TI->getMetadata(LLVMContext::MD_loop);
 
     if (!MD)
@@ -250,7 +250,7 @@ void Loop::setLoopID(MDNode *LoopID) const {
          "The loop should have no single latch at this point");
   BasicBlock *H = getHeader();
   for (BasicBlock *BB : this->blocks()) {
-    TerminatorInst *TI = BB->getTerminator();
+    Instruction *TI = BB->getTerminator();
     for (BasicBlock *Successor : successors(TI)) {
       if (Successor == H)
         TI->setMetadata(LLVMContext::MD_loop, LoopID);
diff --git a/lib/Analysis/MemorySSAUpdater.cpp b/lib/Analysis/MemorySSAUpdater.cpp
index 51a5733a3ef..880dc2f2785 100644
--- a/lib/Analysis/MemorySSAUpdater.cpp
+++ b/lib/Analysis/MemorySSAUpdater.cpp
@@ -1104,7 +1104,7 @@ void MemorySSAUpdater::removeBlocks(
     const SmallPtrSetImpl<BasicBlock *> &DeadBlocks) {
   // First delete all uses of BB in MemoryPhis.
   for (BasicBlock *BB : DeadBlocks) {
-    TerminatorInst *TI = BB->getTerminator();
+    Instruction *TI = BB->getTerminator();
     assert(TI && "Basic block expected to have a terminator instruction");
     for (BasicBlock *Succ : successors(TI))
       if (!DeadBlocks.count(Succ))
diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index 193020ed92f..4a30447f647 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -7078,7 +7078,7 @@ ScalarEvolution::computeExitLimit(const Loop *L, BasicBlock *ExitingBlock,
     return getCouldNotCompute();
 
   bool IsOnlyExit = (L->getExitingBlock() != nullptr);
-  TerminatorInst *Term = ExitingBlock->getTerminator();
+  Instruction *Term = ExitingBlock->getTerminator();
   if (BranchInst *BI = dyn_cast<BranchInst>(Term)) {
     assert(BI->isConditional() && "If unconditional, it can't be in loop!");
     bool ExitIfTrue = !L->contains(BI->getSuccessor(0));
diff --git a/lib/CodeGen/Analysis.cpp b/lib/CodeGen/Analysis.cpp
index b769e92590f..aae04a573af 100644
--- a/lib/CodeGen/Analysis.cpp
+++ b/lib/CodeGen/Analysis.cpp
@@ -471,7 +471,7 @@ static bool nextRealType(SmallVectorImpl<CompositeType *> &SubTypes,
 bool llvm::isInTailCallPosition(ImmutableCallSite CS, const TargetMachine &TM) {
   const Instruction *I = CS.getInstruction();
   const BasicBlock *ExitBB = I->getParent();
-  const TerminatorInst *Term = ExitBB->getTerminator();
+  const Instruction *Term = ExitBB->getTerminator();
   const ReturnInst *Ret = dyn_cast<ReturnInst>(Term);
 
   // The block must end in a return statement or unreachable.
diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp
index ad416017470..542cc10371e 100644
--- a/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -2223,7 +2223,7 @@ unsigned FastISel::fastEmitZExtFromI1(MVT VT, unsigned Op0, bool Op0IsKill) {
 /// might result in multiple MBB's for one BB.  As such, the start of the
 /// BB might correspond to a different MBB than the end.
 bool FastISel::handlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) {
-  const TerminatorInst *TI = LLVMBB->getTerminator();
+  const Instruction *TI = LLVMBB->getTerminator();
 
   SmallPtrSet<MachineBasicBlock *, 4> SuccsHandled;
   FuncInfo.OrigNumPHINodesToUpdate = FuncInfo.PHINodesToUpdate.size();
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index f7866665bcb..1a99ef734f1 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -9249,7 +9249,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
 /// the end.
 void
 SelectionDAGBuilder::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) {
-  const TerminatorInst *TI = LLVMBB->getTerminator();
+  const Instruction *TI = LLVMBB->getTerminator();
 
   SmallPtrSet<MachineBasicBlock *, 4> SuccsHandled;
 
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index feb57eeafe7..2b4a590f19f 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -451,7 +451,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
       if (!succ_empty(&BB))
         continue;
 
-      const TerminatorInst *Term = BB.getTerminator();
+      const Instruction *Term = BB.getTerminator();
       if (isa<UnreachableInst>(Term) || isa<ReturnInst>(Term))
         continue;
 
diff --git a/lib/CodeGen/WinEHPrepare.cpp b/lib/CodeGen/WinEHPrepare.cpp
index a3243235854..6a15240fa6e 100644
--- a/lib/CodeGen/WinEHPrepare.cpp
+++ b/lib/CodeGen/WinEHPrepare.cpp
@@ -218,7 +218,7 @@ static void calculateStateNumbersForInvokes(const Function *Fn,
 // to. If the unwind edge came from an invoke, return null.
 static const BasicBlock *getEHPadFromPredecessor(const BasicBlock *BB,
                                                  Value *ParentPad) {
-  const TerminatorInst *TI = BB->getTerminator();
+  const Instruction *TI = BB->getTerminator();
   if (isa<InvokeInst>(TI))
     return nullptr;
   if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(TI)) {
@@ -977,7 +977,7 @@ void WinEHPrepare::removeImplausibleInstructions(Function &F) {
         break;
       }
 
-      TerminatorInst *TI = BB->getTerminator();
+      Instruction *TI = BB->getTerminator();
       // CatchPadInst and CleanupPadInst can't transfer control to a ReturnInst.
       bool IsUnreachableRet = isa<ReturnInst>(TI) && FuncletPad;
       // The token consumed by a CatchReturnInst must match the funclet token.
diff --git a/lib/IR/BasicBlock.cpp b/lib/IR/BasicBlock.cpp
index d04af9261e3..03fb5ccaffc 100644
--- a/lib/IR/BasicBlock.cpp
+++ b/lib/IR/BasicBlock.cpp
@@ -437,7 +437,7 @@ BasicBlock *BasicBlock::splitBasicBlock(iterator I, const Twine &BBName) {
 }
 
 void BasicBlock::replaceSuccessorsPhiUsesWith(BasicBlock *New) {
-  TerminatorInst *TI = getTerminator();
+  Instruction *TI = getTerminator();
   if (!TI)
     // Cope with being called on a BasicBlock that doesn't have a terminator
     // yet. Clang's CodeGenFunction::EmitReturnBlock() likes to do this.
@@ -468,7 +468,7 @@ const LandingPadInst *BasicBlock::getLandingPadInst() const {
 }
 
 Optional<uint64_t> BasicBlock::getIrrLoopHeaderWeight() const {
-  const TerminatorInst *TI = getTerminator();
+  const Instruction *TI = getTerminator();
   if (MDNode *MDIrrLoopHeader =
       TI->getMetadata(LLVMContext::MD_irr_loop)) {
     MDString *MDName = cast<MDString>(MDIrrLoopHeader->getOperand(0));
diff --git a/lib/IR/Dominators.cpp b/lib/IR/Dominators.cpp
index c78f220439a..cf9f5759ba5 100644
--- a/lib/IR/Dominators.cpp
+++ b/lib/IR/Dominators.cpp
@@ -41,7 +41,7 @@ static constexpr bool ExpensiveChecksEnabled = false;
 #endif
 
 bool BasicBlockEdge::isSingleEdge() const {
-  const TerminatorInst *TI = Start->getTerminator();
+  const Instruction *TI = Start->getTerminator();
   unsigned NumEdgesToEnd = 0;
   for (unsigned int i = 0, n = TI->getNumSuccessors(); i < n; ++i) {
     if (TI->getSuccessor(i) == End)
diff --git a/lib/IR/SafepointIRVerifier.cpp b/lib/IR/SafepointIRVerifier.cpp
index 7af48f5301f..d2102138d79 100644
--- a/lib/IR/SafepointIRVerifier.cpp
+++ b/lib/IR/SafepointIRVerifier.cpp
@@ -134,7 +134,7 @@ public:
     // Top-down walk of the dominator tree
     ReversePostOrderTraversal<const Function *> RPOT(&F);
     for (const BasicBlock *BB : RPOT) {
-      const TerminatorInst *TI = BB->getTerminator();
+      const Instruction *TI = BB->getTerminator();
       assert(TI && "blocks must be well formed");
 
       // For conditional branches, we can perform simple conditional propagation on
diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp
index 8304ec6e8f4..d96555ca5f9 100644
--- a/lib/IR/Verifier.cpp
+++ b/lib/IR/Verifier.cpp
@@ -3450,7 +3450,7 @@ void Verifier::visitEHPadPredecessors(Instruction &I) {
   Instruction *ToPad = &I;
   Value *ToPadParent = getParentPad(ToPad);
   for (BasicBlock *PredBB : predecessors(BB)) {
-    TerminatorInst *TI = PredBB->getTerminator();
+    Instruction *TI = PredBB->getTerminator();
     Value *FromPad;
     if (auto *II = dyn_cast<InvokeInst>(TI)) {
       Assert(II->getUnwindDest() == BB && II->getNormalDest() != BB,
diff --git a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index c52313f84ef..8248dbe1b0f 100644
--- a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -288,7 +288,7 @@ Value *SIAnnotateControlFlow::handleLoopCondition(
         }
       }
 
-      TerminatorInst *Insert = From->getTerminator();
+      Instruction *Insert = From->getTerminator();
       Value *PhiArg = CallInst::Create(Break, Broken, "", Insert);
       NewPhi->setIncomingValue(i, PhiArg);
     }
diff --git a/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp b/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp
index bed52293197..bf922eb8a19 100644
--- a/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp
+++ b/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp
@@ -41,7 +41,7 @@ public:
 bool NVPTXAllocaHoisting::runOnFunction(Function &function) {
   bool functionModified = false;
   Function::iterator I = function.begin();
-  TerminatorInst *firstTerminatorInst = (I++)->getTerminator();
+  Instruction *firstTerminatorInst = (I++)->getTerminator();
 
   for (Function::iterator E = function.end(); I != E; ++I) {
     for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 5ec7b102884..8861de6f0d8 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -490,7 +490,7 @@ static unsigned getBranchHint(unsigned PCC, FunctionLoweringInfo *FuncInfo,
   if (!FuncInfo->BPI) return PPC::BR_NO_HINT;
 
   const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
-  const TerminatorInst *BBTerm = BB->getTerminator();
+  const Instruction *BBTerm = BB->getTerminator();
 
   if (BBTerm->getNumSuccessors() != 2) return PPC::BR_NO_HINT;
 
diff --git a/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
index b5a88129c6b..f0d24075801 100644
--- a/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
@@ -1030,7 +1030,7 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) {
 
   // Free setjmpTable buffer before each return instruction
   for (BasicBlock &BB : F) {
-    TerminatorInst *TI = BB.getTerminator();
+    Instruction *TI = BB.getTerminator();
     if (isa<ReturnInst>(TI))
       CallInst::CreateFree(SetjmpTable, TI);
   }
diff --git a/lib/Target/X86/X86WinEHState.cpp b/lib/Target/X86/X86WinEHState.cpp
index dde9c734f49..c11e7e365a1 100644
--- a/lib/Target/X86/X86WinEHState.cpp
+++ b/lib/Target/X86/X86WinEHState.cpp
@@ -369,7 +369,7 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) {
 
   // Insert an unlink before all returns.
   for (BasicBlock &BB : *F) {
-    TerminatorInst *T = BB.getTerminator();
+    Instruction *T = BB.getTerminator();
     if (!isa<ReturnInst>(T))
       continue;
     Builder.SetInsertPoint(T);
diff --git a/lib/Transforms/IPO/HotColdSplitting.cpp b/lib/Transforms/IPO/HotColdSplitting.cpp
index 810fdf418a2..9d2634f1bc9 100644
--- a/lib/Transforms/IPO/HotColdSplitting.cpp
+++ b/lib/Transforms/IPO/HotColdSplitting.cpp
@@ -104,7 +104,7 @@ static bool isSingleEntrySingleExit(BasicBlock *Entry, const BasicBlock *Exit,
 bool blockEndsInUnreachable(const BasicBlock &BB) {
   if (BB.empty())
     return true;
-  const TerminatorInst *I = BB.getTerminator();
+  const Instruction *I = BB.getTerminator();
   if (isa<ReturnInst>(I) || isa<IndirectBrInst>(I))
     return true;
   // Unreachable blocks do not have any successor.
diff --git a/lib/Transforms/IPO/LoopExtractor.cpp b/lib/Transforms/IPO/LoopExtractor.cpp
index 8c86f7cb806..733235d45a0 100644
--- a/lib/Transforms/IPO/LoopExtractor.cpp
+++ b/lib/Transforms/IPO/LoopExtractor.cpp
@@ -104,8 +104,8 @@ bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &LPM) {
   bool ShouldExtractLoop = false;
 
   // Extract the loop if the entry block doesn't branch to the loop header.
-  TerminatorInst *EntryTI =
-    L->getHeader()->getParent()->getEntryBlock().getTerminator();
+  Instruction *EntryTI =
+      L->getHeader()->getParent()->getEntryBlock().getTerminator();
   if (!isa<BranchInst>(EntryTI) ||
       !cast<BranchInst>(EntryTI)->isUnconditional() ||
       EntryTI->getSuccessor(0) != L->getHeader()) {
diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp
index 709222dbec0..11c4bbc437c 100644
--- a/lib/Transforms/IPO/PartialInlining.cpp
+++ b/lib/Transforms/IPO/PartialInlining.cpp
@@ -556,7 +556,7 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {
   };
 
   auto IsReturnBlock = [](BasicBlock *BB) {
-    TerminatorInst *TI = BB->getTerminator();
+    Instruction *TI = BB->getTerminator();
     return isa<ReturnInst>(TI);
   };
 
diff --git a/lib/Transforms/IPO/PruneEH.cpp b/lib/Transforms/IPO/PruneEH.cpp
index 2caee294221..ae586c01747 100644
--- a/lib/Transforms/IPO/PruneEH.cpp
+++ b/lib/Transforms/IPO/PruneEH.cpp
@@ -107,7 +107,7 @@ static bool runImpl(CallGraphSCC &SCC, CallGraph &CG) {
         continue;
 
       for (const BasicBlock &BB : *F) {
-        const TerminatorInst *TI = BB.getTerminator();
+        const Instruction *TI = BB.getTerminator();
         if (CheckUnwind && TI->mayThrow()) {
           SCCMightUnwind = true;
         } else if (CheckReturn && isa<ReturnInst>(TI)) {
diff --git a/lib/Transforms/IPO/SampleProfile.cpp b/lib/Transforms/IPO/SampleProfile.cpp
index 4a69a0c2806..a78e0d459c8 100644
--- a/lib/Transforms/IPO/SampleProfile.cpp
+++ b/lib/Transforms/IPO/SampleProfile.cpp
@@ -1297,7 +1297,7 @@ void SampleProfileLoader::propagateWeights(Function &F) {
         }
       }
     }
-    TerminatorInst *TI = BB->getTerminator();
+    Instruction *TI = BB->getTerminator();
     if (TI->getNumSuccessors() == 1)
       continue;
     if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI))
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 714c6176884..6d2ac2274de 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3732,7 +3732,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
     // Scan down this block to see if there is another stack restore in the
     // same block without an intervening call/alloca.
     BasicBlock::iterator BI(II);
-    TerminatorInst *TI = II->getParent()->getTerminator();
+    Instruction *TI = II->getParent()->getTerminator();
     bool CannotRemove = false;
     for (++BI; &*BI != TI; ++BI) {
       if (isa<AllocaInst>(BI)) {
diff --git a/lib/Transforms/InstCombine/InstCombinePHI.cpp b/lib/Transforms/InstCombine/InstCombinePHI.cpp
index 0289abe472e..94745094c15 100644
--- a/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ b/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -652,7 +652,7 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) {
 Instruction *InstCombiner::FoldPHIArgZextsIntoPHI(PHINode &Phi) {
   // We cannot create a new instruction after the PHI if the terminator is an
   // EHPad because there is no valid insertion point.
-  if (TerminatorInst *TI = Phi.getParent()->getTerminator())
+  if (Instruction *TI = Phi.getParent()->getTerminator())
     if (TI->isEHPad())
       return nullptr;
 
@@ -726,7 +726,7 @@ Instruction *InstCombiner::FoldPHIArgZextsIntoPHI(PHINode &Phi) {
 Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) {
   // We cannot create a new instruction after the PHI if the terminator is an
   // EHPad because there is no valid insertion point.
-  if (TerminatorInst *TI = PN.getParent()->getTerminator())
+  if (Instruction *TI = PN.getParent()->getTerminator())
     if (TI->isEHPad())
       return nullptr;
 
diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
index 00ffe9e2dc2..ae7d08149c6 100644
--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -2347,7 +2347,7 @@ tryToMoveFreeBeforeNullTest(CallInst &FI) {
     return nullptr;
 
   // Validate the rest of constraint #1 by matching on the pred branch.
-  TerminatorInst *TI = PredBB->getTerminator();
+  Instruction *TI = PredBB->getTerminator();
   BasicBlock *TrueBB, *FalseBB;
   ICmpInst::Predicate Pred;
   if (!match(TI, m_Br(m_ICmp(Pred, m_Specific(Op), m_Zero()), TrueBB, FalseBB)))
@@ -3285,7 +3285,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL,
 
     // Recursively visit successors.  If this is a branch or switch on a
     // constant, only visit the reachable successor.
-    TerminatorInst *TI = BB->getTerminator();
+    Instruction *TI = BB->getTerminator();
     if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
       if (BI->isConditional() && isa<ConstantInt>(BI->getCondition())) {
         bool CondVal = cast<ConstantInt>(BI->getCondition())->getZExtValue();
diff --git a/lib/Transforms/Instrumentation/CFGMST.h b/lib/Transforms/Instrumentation/CFGMST.h
index cc9b149d0b6..e178ef386e6 100644
--- a/lib/Transforms/Instrumentation/CFGMST.h
+++ b/lib/Transforms/Instrumentation/CFGMST.h
@@ -119,7 +119,7 @@ public:
     static const uint32_t CriticalEdgeMultiplier = 1000;
 
     for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
-      TerminatorInst *TI = BB->getTerminator();
+      Instruction *TI = BB->getTerminator();
       uint64_t BBWeight =
           (BFI != nullptr ? BFI->getBlockFreq(&*BB).getFrequency() : 2);
       uint64_t Weight = 2;
diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index a060dd53513..ee546a9a828 100644
--- a/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -578,7 +578,7 @@ void GCOVProfiler::emitProfileNotes() {
 
       for (auto &BB : F) {
         GCOVBlock &Block = Func.getBlock(&BB);
-        TerminatorInst *TI = BB.getTerminator();
+        Instruction *TI = BB.getTerminator();
         if (int successors = TI->getNumSuccessors()) {
           for (int i = 0; i != successors; ++i) {
             Block.addEdge(Func.getBlock(TI->getSuccessor(i)));
@@ -646,7 +646,7 @@ bool GCOVProfiler::emitProfileArcs() {
       DenseMap<std::pair<BasicBlock *, BasicBlock *>, unsigned> EdgeToCounter;
       unsigned Edges = 0;
       for (auto &BB : F) {
-        TerminatorInst *TI = BB.getTerminator();
+        Instruction *TI = BB.getTerminator();
         if (isa<ReturnInst>(TI)) {
           EdgeToCounter[{&BB, nullptr}] = Edges++;
         } else {
@@ -690,7 +690,7 @@ bool GCOVProfiler::emitProfileArcs() {
           Count = Builder.CreateAdd(Count, Builder.getInt64(1));
           Builder.CreateStore(Count, Phi);
 
-          TerminatorInst *TI = BB.getTerminator();
+          Instruction *TI = BB.getTerminator();
           if (isa<ReturnInst>(TI)) {
             auto It = EdgeToCounter.find({&BB, nullptr});
             assert(It != EdgeToCounter.end());
diff --git a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index ac851f660d9..4790c9e5cfe 100644
--- a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -586,7 +586,7 @@ void FuncPGOInstrumentation<Edge, BBInfo>::computeCFGHash() {
   std::vector<char> Indexes;
   JamCRC JC;
   for (auto &BB : F) {
-    const TerminatorInst *TI = BB.getTerminator();
+    const Instruction *TI = BB.getTerminator();
     for (unsigned I = 0, E = TI->getNumSuccessors(); I != E; ++I) {
       BasicBlock *Succ = TI->getSuccessor(I);
       auto BI = findBBInfo(Succ);
@@ -698,7 +698,7 @@ BasicBlock *FuncPGOInstrumentation<Edge, BBInfo>::getInstrBB(Edge *E) {
 
   // Instrument the SrcBB if it has a single successor,
   // otherwise, the DestBB if this is not a critical edge.
-  TerminatorInst *TI = SrcBB->getTerminator();
+  Instruction *TI = SrcBB->getTerminator();
   if (TI->getNumSuccessors() <= 1)
     return SrcBB;
   if (!E->IsCritical)
@@ -1167,7 +1167,7 @@ void PGOUseFunc::setBranchWeights() {
   // Generate MD_prof metadata for every branch instruction.
   LLVM_DEBUG(dbgs() << "\nSetting branch weights.\n");
   for (auto &BB : F) {
-    TerminatorInst *TI = BB.getTerminator();
+    Instruction *TI = BB.getTerminator();
     if (TI->getNumSuccessors() < 2)
       continue;
     if (!(isa<BranchInst>(TI) || isa<SwitchInst>(TI) ||
@@ -1213,7 +1213,7 @@ void PGOUseFunc::annotateIrrLoopHeaderWeights() {
     // to become an irreducible loop header after the indirectbr tail
     // duplication.
     if (BFI->isIrrLoopHeader(&BB) || isIndirectBrTarget(&BB)) {
-      TerminatorInst *TI = BB.getTerminator();
+      Instruction *TI = BB.getTerminator();
       const UseBBInfo &BBCountInfo = getBBInfo(&BB);
       setIrrLoopHeaderMetadata(M, TI, BBCountInfo.CountValue);
     }
diff --git a/lib/Transforms/Scalar/ADCE.cpp b/lib/Transforms/Scalar/ADCE.cpp
index 883d2e17350..b0602d96798 100644
--- a/lib/Transforms/Scalar/ADCE.cpp
+++ b/lib/Transforms/Scalar/ADCE.cpp
@@ -103,7 +103,7 @@ struct BlockInfoType {
   BasicBlock *BB = nullptr;
 
   /// Cache of BB->getTerminator().
-  TerminatorInst *Terminator = nullptr;
+  Instruction *Terminator = nullptr;
 
   /// Post-order numbering of reverse control flow graph.
   unsigned PostOrder;
@@ -206,7 +206,7 @@ bool AggressiveDeadCodeElimination::performDeadCodeElimination() {
   return removeDeadInstructions();
 }
 
-static bool isUnconditionalBranch(TerminatorInst *Term) {
+static bool isUnconditionalBranch(Instruction *Term) {
   auto *BR = dyn_cast<BranchInst>(Term);
   return BR && BR->isUnconditional();
 }
@@ -277,7 +277,7 @@ void AggressiveDeadCodeElimination::initialize() {
     // treat all edges to a block already seen as loop back edges
     // and mark the branch live it if there is a back edge.
     for (auto *BB: depth_first_ext(&F.getEntryBlock(), State)) {
-      TerminatorInst *Term = BB->getTerminator();
+      Instruction *Term = BB->getTerminator();
       if (isLive(Term))
         continue;
 
@@ -643,7 +643,7 @@ void AggressiveDeadCodeElimination::computeReversePostOrder() {
 
 void AggressiveDeadCodeElimination::makeUnconditional(BasicBlock *BB,
                                                       BasicBlock *Target) {
-  TerminatorInst *PredTerm = BB->getTerminator();
+  Instruction *PredTerm = BB->getTerminator();
   // Collect the live debug info scopes attached to this instruction.
   if (const DILocation *DL = PredTerm->getDebugLoc())
     collectLiveScopes(*DL);
diff --git a/lib/Transforms/Scalar/CallSiteSplitting.cpp b/lib/Transforms/Scalar/CallSiteSplitting.cpp
index 54385155cd2..e82682e08ab 100644
--- a/lib/Transforms/Scalar/CallSiteSplitting.cpp
+++ b/lib/Transforms/Scalar/CallSiteSplitting.cpp
@@ -248,7 +248,7 @@ static void copyMustTailReturn(BasicBlock *SplitBB, Instruction *CI,
   ReturnInst* RI = dyn_cast<ReturnInst>(&*II);
   assert(RI && "`musttail` call must be followed by `ret` instruction");
 
-  TerminatorInst *TI = SplitBB->getTerminator();
+  Instruction *TI = SplitBB->getTerminator();
   Value *V = NewCI;
   if (BCI)
     V = cloneInstForMustTail(BCI, TI, V);
diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 545b0060c13..69112f3cee2 100644
--- a/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -643,7 +643,7 @@ static void findUnconditionalPreds(SmallVectorImpl<BasicBlock *> &Blocks,
   for (pred_iterator I = pred_begin(BB), E = pred_end(BB); I != E; ++I) {
     BasicBlock *Pred = *I;
     if (Pred == BB) continue;
-    TerminatorInst *PredTI = Pred->getTerminator();
+    Instruction *PredTI = Pred->getTerminator();
     if (PredTI->getNumSuccessors() != 1)
       continue;
 
diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp
index d980cde49b6..34d2b2a8b27 100644
--- a/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -540,7 +540,7 @@ static Optional<EstimatedUnrollCost> analyzeLoopUnrollCost(
         }
       }
 
-      TerminatorInst *TI = BB->getTerminator();
+      Instruction *TI = BB->getTerminator();
 
       // Add in the live successors by first checking whether we have terminator
       // that may be simplified based on the values simplified by this call.
diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp
index f67bff7fe93..13e6bd13754 100644
--- a/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -246,11 +246,11 @@ namespace {
     bool TryTrivialLoopUnswitch(bool &Changed);
 
     bool UnswitchIfProfitable(Value *LoopCond, Constant *Val,
-                              TerminatorInst *TI = nullptr);
+                              Instruction *TI = nullptr);
     void UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
-                                  BasicBlock *ExitBlock, TerminatorInst *TI);
+                                  BasicBlock *ExitBlock, Instruction *TI);
     void UnswitchNontrivialCondition(Value *LIC, Constant *OnVal, Loop *L,
-                                     TerminatorInst *TI);
+                                     Instruction *TI);
 
     void RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
                                               Constant *Val, bool isEqual);
@@ -258,8 +258,7 @@ namespace {
     void EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
                                         BasicBlock *TrueDest,
                                         BasicBlock *FalseDest,
-                                        BranchInst *OldBranch,
-                                        TerminatorInst *TI);
+                                        BranchInst *OldBranch, Instruction *TI);
 
     void SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L);
 
@@ -713,7 +712,7 @@ bool LoopUnswitch::processCurrentLoop() {
   // loop.
   for (Loop::block_iterator I = currentLoop->block_begin(),
          E = currentLoop->block_end(); I != E; ++I) {
-    TerminatorInst *TI = (*I)->getTerminator();
+    Instruction *TI = (*I)->getTerminator();
 
     // Unswitching on a potentially uninitialized predicate is not
     // MSan-friendly. Limit this to the cases when the original predicate is
@@ -876,7 +875,7 @@ static BasicBlock *isTrivialLoopExitBlock(Loop *L, BasicBlock *BB) {
 /// simplify the loop.  If we decide that this is profitable,
 /// unswitch the loop, reprocess the pieces, then return true.
 bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val,
-                                        TerminatorInst *TI) {
+                                        Instruction *TI) {
   // Check to see if it would be profitable to unswitch current loop.
   if (!BranchesInfo.CostAllowsUnswitching()) {
     LLVM_DEBUG(dbgs() << "NOT unswitching loop %"
@@ -931,7 +930,7 @@ void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
                                                   BasicBlock *TrueDest,
                                                   BasicBlock *FalseDest,
                                                   BranchInst *OldBranch,
-                                                  TerminatorInst *TI) {
+                                                  Instruction *TI) {
   assert(OldBranch->isUnconditional() && "Preheader is not split correctly");
   assert(TrueDest != FalseDest && "Branch targets should be different");
   // Insert a conditional branch on LIC to the two preheaders.  The original
@@ -996,7 +995,7 @@ void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
 /// outside of the loop and updating loop info.
 void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
                                             BasicBlock *ExitBlock,
-                                            TerminatorInst *TI) {
+                                            Instruction *TI) {
   LLVM_DEBUG(dbgs() << "loop-unswitch: Trivial-Unswitch loop %"
                     << loopHeader->getName() << " [" << L->getBlocks().size()
                     << " blocks] in Function "
@@ -1054,7 +1053,7 @@ void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
 /// condition.
 bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
   BasicBlock *CurrentBB = currentLoop->getHeader();
-  TerminatorInst *CurrentTerm = CurrentBB->getTerminator();
+  Instruction *CurrentTerm = CurrentBB->getTerminator();
   LLVMContext &Context = CurrentBB->getContext();
 
   // If loop header has only one reachable successor (currently via an
@@ -1227,7 +1226,7 @@ void LoopUnswitch::SplitExitEdges(Loop *L,
 /// Split it into loop versions and test the condition outside of either loop.
 /// Return the loops created as Out1/Out2.
 void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
-                                               Loop *L, TerminatorInst *TI) {
+                                               Loop *L, Instruction *TI) {
   Function *F = loopHeader->getParent();
   LLVM_DEBUG(dbgs() << "loop-unswitch: Unswitching loop %"
                     << loopHeader->getName() << " [" << L->getBlocks().size()
diff --git a/lib/Transforms/Scalar/PlaceSafepoints.cpp b/lib/Transforms/Scalar/PlaceSafepoints.cpp
index 7f9aad24883..fd2eb85fd7b 100644
--- a/lib/Transforms/Scalar/PlaceSafepoints.cpp
+++ b/lib/Transforms/Scalar/PlaceSafepoints.cpp
@@ -105,7 +105,7 @@ struct PlaceBackedgeSafepointsImpl : public FunctionPass {
 
   /// The output of the pass - gives a list of each backedge (described by
   /// pointing at the branch) which need a poll inserted.
-  std::vector<TerminatorInst *> PollLocations;
+  std::vector<Instruction *> PollLocations;
 
   /// True unless we're running spp-no-calls in which case we need to disable
   /// the call-dependent placement opts.
@@ -348,7 +348,7 @@ bool PlaceBackedgeSafepointsImpl::runOnLoop(Loop *L) {
     // Safepoint insertion would involve creating a new basic block (as the
     // target of the current backedge) which does the safepoint (of all live
     // variables) and branches to the true header
-    TerminatorInst *Term = Pred->getTerminator();
+    Instruction *Term = Pred->getTerminator();
 
     LLVM_DEBUG(dbgs() << "[LSP] terminator instruction: " << *Term);
 
@@ -535,7 +535,7 @@ bool PlaceSafepoints::runOnFunction(Function &F) {
 
     // Insert a poll at each point the analysis pass identified
     // The poll location must be the terminator of a loop latch block.
-    for (TerminatorInst *Term : PollLocations) {
+    for (Instruction *Term : PollLocations) {
       // We are inserting a poll, the function is modified
       Modified = true;
 
diff --git a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index 5e23a8a3dcd..cf2ce03049a 100644
--- a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -1851,13 +1851,13 @@ static void relocationViaAlloca(
     StoreInst *Store = new StoreInst(Def, Alloca);
     if (Instruction *Inst = dyn_cast<Instruction>(Def)) {
       if (InvokeInst *Invoke = dyn_cast<InvokeInst>(Inst)) {
-        // InvokeInst is a TerminatorInst so the store need to be inserted
-        // into its normal destination block.
+        // InvokeInst is a terminator so the store need to be inserted into its
+        // normal destination block.
         BasicBlock *NormalDest = Invoke->getNormalDest();
         Store->insertBefore(NormalDest->getFirstNonPHI());
       } else {
         assert(!Inst->isTerminator() &&
-               "The only TerminatorInst that can produce a value is "
+               "The only terminator that can produce a value is "
                "InvokeInst which is handled above.");
         Store->insertAfter(Inst);
       }
@@ -2584,7 +2584,7 @@ bool RewriteStatepointsForGC::runOnFunction(Function &F, DominatorTree &DT,
   // increase the liveset of any statepoint we move over.  This is profitable
   // as long as all statepoints are in rare blocks.  If we had in-register
   // lowering for live values this would be a much safer transform.
-  auto getConditionInst = [](TerminatorInst *TI) -> Instruction* {
+  auto getConditionInst = [](Instruction *TI) -> Instruction * {
     if (auto *BI = dyn_cast<BranchInst>(TI))
       if (BI->isConditional())
         return dyn_cast<Instruction>(BI->getCondition());
@@ -2592,7 +2592,7 @@ bool RewriteStatepointsForGC::runOnFunction(Function &F, DominatorTree &DT,
     return nullptr;
   };
   for (BasicBlock &BB : F) {
-    TerminatorInst *TI = BB.getTerminator();
+    Instruction *TI = BB.getTerminator();
     if (auto *Cond = getConditionInst(TI))
       // TODO: Handle more than just ICmps here.  We should be able to move
       // most instructions without side effects or memory access.
@@ -2675,7 +2675,7 @@ static SetVector<Value *> computeKillSet(BasicBlock *BB) {
 /// Check that the items in 'Live' dominate 'TI'.  This is used as a basic
 /// sanity check for the liveness computation.
 static void checkBasicSSA(DominatorTree &DT, SetVector<Value *> &Live,
-                          TerminatorInst *TI, bool TermOkay = false) {
+                          Instruction *TI, bool TermOkay = false) {
   for (Value *V : Live) {
     if (auto *I = dyn_cast<Instruction>(V)) {
       // The terminator can be a member of the LiveOut set.  LLVM's definition
diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp
index 7196bc82edc..11e5549c332 100644
--- a/lib/Transforms/Scalar/SCCP.cpp
+++ b/lib/Transforms/Scalar/SCCP.cpp
@@ -1614,7 +1614,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
     // Check to see if we have a branch or switch on an undefined value.  If so
     // we force the branch to go one way or the other to make the successor
     // values live.  It doesn't really matter which way we force it.
-    TerminatorInst *TI = BB.getTerminator();
+    Instruction *TI = BB.getTerminator();
     if (auto *BI = dyn_cast<BranchInst>(TI)) {
       if (!BI->isConditional()) continue;
       if (!getValueState(BI->getCondition()).isUnknown())
diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp
index 6e991409bf0..0f43ee6bbd7 100644
--- a/lib/Transforms/Scalar/SROA.cpp
+++ b/lib/Transforms/Scalar/SROA.cpp
@@ -1211,7 +1211,7 @@ static bool isSafePHIToSpeculate(PHINode &PN) {
   // predecessor blocks. The only thing to watch out for is that we can't put
   // a possibly trapping load in the predecessor if it is a critical edge.
   for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
-    TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator();
+    Instruction *TI = PN.getIncomingBlock(Idx)->getTerminator();
     Value *InVal = PN.getIncomingValue(Idx);
 
     // If the value is produced by the terminator of the predecessor (an
@@ -1275,7 +1275,7 @@ static void speculatePHINodeLoads(PHINode &PN) {
       continue;
     }
 
-    TerminatorInst *TI = Pred->getTerminator();
+    Instruction *TI = Pred->getTerminator();
     IRBuilderTy PredBuilder(TI);
 
     LoadInst *Load = PredBuilder.CreateLoad(
diff --git a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 17035f469da..6c4773aa92e 100644
--- a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -783,7 +783,7 @@ static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT,
                      [](Instruction &I) { return I.mayHaveSideEffects(); }))
       return Changed;
 
-    TerminatorInst *CurrentTerm = CurrentBB->getTerminator();
+    Instruction *CurrentTerm = CurrentBB->getTerminator();
 
     if (auto *SI = dyn_cast<SwitchInst>(CurrentTerm)) {
       // Don't bother trying to unswitch past a switch with a constant
diff --git a/lib/Transforms/Scalar/StructurizeCFG.cpp b/lib/Transforms/Scalar/StructurizeCFG.cpp
index f58f79f8b14..2bfd9927411 100644
--- a/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -636,7 +636,7 @@ void StructurizeCFG::setPhiValues() {
 
 /// Remove phi values from all successors and then remove the terminator.
 void StructurizeCFG::killTerminator(BasicBlock *BB) {
-  TerminatorInst *Term = BB->getTerminator();
+  Instruction *Term = BB->getTerminator();
   if (!Term)
     return;
 
diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp
index 6a77a2d414f..0f6db21f73b 100644
--- a/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -702,7 +702,7 @@ static bool foldReturnAndProcessPred(
   SmallVector<BranchInst*, 8> UncondBranchPreds;
   for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
     BasicBlock *Pred = *PI;
-    TerminatorInst *PTI = Pred->getTerminator();
+    Instruction *PTI = Pred->getTerminator();
     if (BranchInst *BI = dyn_cast<BranchInst>(PTI))
       if (BI->isUnconditional())
         UncondBranchPreds.push_back(BI);
diff --git a/lib/Transforms/Utils/BreakCriticalEdges.cpp b/lib/Transforms/Utils/BreakCriticalEdges.cpp
index c3d67087ae7..fafc9aaba5c 100644
--- a/lib/Transforms/Utils/BreakCriticalEdges.cpp
+++ b/lib/Transforms/Utils/BreakCriticalEdges.cpp
@@ -318,7 +318,7 @@ findIBRPredecessor(BasicBlock *BB, SmallVectorImpl<BasicBlock *> &OtherPreds) {
   BasicBlock *IBB = nullptr;
   for (unsigned Pred = 0, E = PN->getNumIncomingValues(); Pred != E; ++Pred) {
     BasicBlock *PredBB = PN->getIncomingBlock(Pred);
-    TerminatorInst *PredTerm = PredBB->getTerminator();
+    Instruction *PredTerm = PredBB->getTerminator();
     switch (PredTerm->getOpcode()) {
     case Instruction::IndirectBr:
       if (IBB)
diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp
index a9257a8c670..000af808945 100644
--- a/lib/Transforms/Utils/CloneFunction.cpp
+++ b/lib/Transforms/Utils/CloneFunction.cpp
@@ -365,7 +365,7 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,
   }
 
   // Finally, clone over the terminator.
-  const TerminatorInst *OldTI = BB->getTerminator();
+  const Instruction *OldTI = BB->getTerminator();
   bool TerminatorDone = false;
   if (const BranchInst *BI = dyn_cast<BranchInst>(OldTI)) {
     if (BI->isConditional()) {
@@ -414,7 +414,7 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,
           CodeInfo->OperandBundleCallSites.push_back(NewInst);
 
     // Recursively clone any reachable successor blocks.
-    const TerminatorInst *TI = BB->getTerminator();
+    const Instruction *TI = BB->getTerminator();
     for (const BasicBlock *Succ : successors(TI))
       ToClone.push_back(Succ);
   }
diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp
index 7f26c53ecf3..0e9e3219033 100644
--- a/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/lib/Transforms/Utils/CodeExtractor.cpp
@@ -566,7 +566,7 @@ void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) {
     // changing them to branch to NewBB instead.
     for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
       if (Blocks.count(PN->getIncomingBlock(i))) {
-        TerminatorInst *TI = PN->getIncomingBlock(i)->getTerminator();
+        Instruction *TI = PN->getIncomingBlock(i)->getTerminator();
         TI->replaceUsesOfWith(OldPred, NewBB);
       }
 
@@ -778,7 +778,7 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
       Value *Idx[2];
       Idx[0] = Constant::getNullValue(Type::getInt32Ty(header->getContext()));
       Idx[1] = ConstantInt::get(Type::getInt32Ty(header->getContext()), i);
-      TerminatorInst *TI = newFunction->begin()->getTerminator();
+      Instruction *TI = newFunction->begin()->getTerminator();
       GetElementPtrInst *GEP = GetElementPtrInst::Create(
           StructTy, &*AI, Idx, "gep_" + inputs[i]->getName(), TI);
       RewriteVal = new LoadInst(GEP, "loadgep_" + inputs[i]->getName(), TI);
@@ -972,7 +972,7 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
 
   unsigned switchVal = 0;
   for (BasicBlock *Block : Blocks) {
-    TerminatorInst *TI = Block->getTerminator();
+    Instruction *TI = Block->getTerminator();
     for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
       if (!Blocks.count(TI->getSuccessor(i))) {
         BasicBlock *OldTarget = TI->getSuccessor(i);
@@ -1078,7 +1078,7 @@ void CodeExtractor::calculateNewCallTerminatorWeights(
   using BlockNode = BlockFrequencyInfoImplBase::BlockNode;
 
   // Update the branch weights for the exit block.
-  TerminatorInst *TI = CodeReplacer->getTerminator();
+  Instruction *TI = CodeReplacer->getTerminator();
   SmallVector<unsigned, 8> BranchWeights(TI->getNumSuccessors(), 0);
 
   // Block Frequency distribution with dummy node.
diff --git a/lib/Transforms/Utils/EscapeEnumerator.cpp b/lib/Transforms/Utils/EscapeEnumerator.cpp
index c9c96fbe5da..762a374c135 100644
--- a/lib/Transforms/Utils/EscapeEnumerator.cpp
+++ b/lib/Transforms/Utils/EscapeEnumerator.cpp
@@ -37,7 +37,7 @@ IRBuilder<> *EscapeEnumerator::Next() {
 
     // Branches and invokes do not escape, only unwind, resume, and return
     // do.
-    TerminatorInst *TI = CurBB->getTerminator();
+    Instruction *TI = CurBB->getTerminator();
     if (!isa<ReturnInst>(TI) && !isa<ResumeInst>(TI))
       continue;
 
diff --git a/lib/Transforms/Utils/FlattenCFG.cpp b/lib/Transforms/Utils/FlattenCFG.cpp
index 3c6c9c9a5df..d9778f4a1fb 100644
--- a/lib/Transforms/Utils/FlattenCFG.cpp
+++ b/lib/Transforms/Utils/FlattenCFG.cpp
@@ -232,7 +232,7 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) {
   if (!FirstCondBlock || !LastCondBlock || (FirstCondBlock == LastCondBlock))
     return false;
 
-  TerminatorInst *TBB = LastCondBlock->getTerminator();
+  Instruction *TBB = LastCondBlock->getTerminator();
   BasicBlock *PS1 = TBB->getSuccessor(0);
   BasicBlock *PS2 = TBB->getSuccessor(1);
   BranchInst *PBI1 = dyn_cast<BranchInst>(PS1->getTerminator());
@@ -325,7 +325,7 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) {
 bool FlattenCFGOpt::CompareIfRegionBlock(BasicBlock *Head1, BasicBlock *Head2,
                                          BasicBlock *Block1,
                                          BasicBlock *Block2) {
-  TerminatorInst *PTI2 = Head2->getTerminator();
+  Instruction *PTI2 = Head2->getTerminator();
   Instruction *PBI2 = &Head2->front();
 
   bool eq1 = (Block1 == Head1);
@@ -421,7 +421,7 @@ bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder) {
   if ((IfTrue2 != SecondEntryBlock) && (IfFalse2 != SecondEntryBlock))
     return false;
 
-  TerminatorInst *PTI2 = SecondEntryBlock->getTerminator();
+  Instruction *PTI2 = SecondEntryBlock->getTerminator();
   Instruction *PBI2 = &SecondEntryBlock->front();
 
   if (!CompareIfRegionBlock(FirstEntryBlock, SecondEntryBlock, IfTrue1,
diff --git a/lib/Transforms/Utils/FunctionComparator.cpp b/lib/Transforms/Utils/FunctionComparator.cpp
index 69203f9f248..ef991d715fd 100644
--- a/lib/Transforms/Utils/FunctionComparator.cpp
+++ b/lib/Transforms/Utils/FunctionComparator.cpp
@@ -867,8 +867,8 @@ int FunctionComparator::compare() {
     if (int Res = cmpBasicBlocks(BBL, BBR))
       return Res;
 
-    const TerminatorInst *TermL = BBL->getTerminator();
-    const TerminatorInst *TermR = BBR->getTerminator();
+    const Instruction *TermL = BBL->getTerminator();
+    const Instruction *TermR = BBR->getTerminator();
 
     assert(TermL->getNumSuccessors() == TermR->getNumSuccessors());
     for (unsigned i = 0, e = TermL->getNumSuccessors(); i != e; ++i) {
@@ -938,7 +938,7 @@ FunctionComparator::FunctionHash FunctionComparator::functionHash(Function &F) {
     for (auto &Inst : *BB) {
       H.add(Inst.getOpcode());
     }
-    const TerminatorInst *Term = BB->getTerminator();
+    const Instruction *Term = BB->getTerminator();
     for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) {
       if (!VisitedBBs.insert(Term->getSuccessor(i)).second)
         continue;
diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp
index f8226f529ee..bda2ee2d8a3 100644
--- a/lib/Transforms/Utils/InlineFunction.cpp
+++ b/lib/Transforms/Utils/InlineFunction.cpp
@@ -2247,7 +2247,7 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
   // Change the branch that used to go to AfterCallBB to branch to the first
   // basic block of the inlined function.
   //
-  TerminatorInst *Br = OrigBB->getTerminator();
+  Instruction *Br = OrigBB->getTerminator();
   assert(Br && Br->getOpcode() == Instruction::Br &&
          "splitBasicBlock broken!");
   Br->setOperand(0, &*FirstNewBlock);
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index 879145cea6b..04db1c8c4c7 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -105,7 +105,7 @@ STATISTIC(NumRemoved, "Number of unreachable basic blocks removed");
 bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
                                   const TargetLibraryInfo *TLI,
                                   DomTreeUpdater *DTU) {
-  TerminatorInst *T = BB->getTerminator();
+  Instruction *T = BB->getTerminator();
   IRBuilder<> Builder(T);
 
   // Branch - See if we are conditional jumping on constant
@@ -2101,7 +2101,7 @@ static bool markAliveBlocks(Function &F,
       }
     }
 
-    TerminatorInst *Terminator = BB->getTerminator();
+    Instruction *Terminator = BB->getTerminator();
     if (auto *II = dyn_cast<InvokeInst>(Terminator)) {
       // Turn invokes that call 'nounwind' functions into ordinary calls.
       Value *Callee = II->getCalledValue();
@@ -2176,7 +2176,7 @@ static bool markAliveBlocks(Function &F,
 }
 
 void llvm::removeUnwindEdge(BasicBlock *BB, DomTreeUpdater *DTU) {
-  TerminatorInst *TI = BB->getTerminator();
+  Instruction *TI = BB->getTerminator();
 
   if (auto *II = dyn_cast<InvokeInst>(TI)) {
     changeToCall(II, DTU);
diff --git a/lib/Transforms/Utils/LoopRotationUtils.cpp b/lib/Transforms/Utils/LoopRotationUtils.cpp
index a6320d8dbf4..73f67f3219d 100644
--- a/lib/Transforms/Utils/LoopRotationUtils.cpp
+++ b/lib/Transforms/Utils/LoopRotationUtils.cpp
@@ -299,7 +299,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
 
   // For the rest of the instructions, either hoist to the OrigPreheader if
   // possible or create a clone in the OldPreHeader if not.
-  TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator();
+  Instruction *LoopEntryBranch = OrigPreheader->getTerminator();
 
   // Record all debug intrinsics preceding LoopEntryBranch to avoid duplication.
   using DbgIntrinsicHash =
diff --git a/lib/Transforms/Utils/LoopSimplify.cpp b/lib/Transforms/Utils/LoopSimplify.cpp
index fc59cafa331..380f4fca54d 100644
--- a/lib/Transforms/Utils/LoopSimplify.cpp
+++ b/lib/Transforms/Utils/LoopSimplify.cpp
@@ -435,7 +435,7 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader,
   unsigned LoopMDKind = BEBlock->getContext().getMDKindID("llvm.loop");
   MDNode *LoopMD = nullptr;
   for (unsigned i = 0, e = BackedgeBlocks.size(); i != e; ++i) {
-    TerminatorInst *TI = BackedgeBlocks[i]->getTerminator();
+    Instruction *TI = BackedgeBlocks[i]->getTerminator();
     if (!LoopMD)
       LoopMD = TI->getMetadata(LoopMDKind);
     TI->setMetadata(LoopMDKind, nullptr);
@@ -488,7 +488,7 @@ ReprocessLoop:
                         << P->getName() << "\n");
 
       // Zap the dead pred's terminator and replace it with unreachable.
-      TerminatorInst *TI = P->getTerminator();
+      Instruction *TI = P->getTerminator();
       changeToUnreachable(TI, /*UseLLVMTrap=*/false, PreserveLCSSA);
       Changed = true;
     }
diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp
index a8ec75c0baf..877e0e4dcf9 100644
--- a/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/lib/Transforms/Utils/LoopUnroll.cpp
@@ -781,7 +781,7 @@ LoopUnrollResult llvm::UnrollLoop(
         // there is no such latch.
         NewIDom = Latches.back();
         for (BasicBlock *IterLatch : Latches) {
-          TerminatorInst *Term = IterLatch->getTerminator();
+          Instruction *Term = IterLatch->getTerminator();
           if (isa<BranchInst>(Term) && cast<BranchInst>(Term)->isConditional()) {
             NewIDom = IterLatch;
             break;
diff --git a/lib/Transforms/Utils/LoopUnrollAndJam.cpp b/lib/Transforms/Utils/LoopUnrollAndJam.cpp
index 1ce2f844489..c17a64f0187 100644
--- a/lib/Transforms/Utils/LoopUnrollAndJam.cpp
+++ b/lib/Transforms/Utils/LoopUnrollAndJam.cpp
@@ -72,7 +72,7 @@ static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
   for (BasicBlock *BB : ForeBlocks) {
     if (BB == SubLoopPreHeader)
       continue;
-    TerminatorInst *TI = BB->getTerminator();
+    Instruction *TI = BB->getTerminator();
     for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
       if (!ForeBlocks.count(TI->getSuccessor(i)))
         return false;
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index ebbcf800254..8dad6176c51 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -175,13 +175,13 @@ class SimplifyCFGOpt {
   const SimplifyCFGOptions &Options;
   bool Resimplify;
 
-  Value *isValueEqualityComparison(TerminatorInst *TI);
+  Value *isValueEqualityComparison(Instruction *TI);
   BasicBlock *GetValueEqualityComparisonCases(
-      TerminatorInst *TI, std::vector<ValueEqualityComparisonCase> &Cases);
-  bool SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI,
+      Instruction *TI, std::vector<ValueEqualityComparisonCase> &Cases);
+  bool SimplifyEqualityComparisonWithOnlyPredecessor(Instruction *TI,
                                                      BasicBlock *Pred,
                                                      IRBuilder<> &Builder);
-  bool FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
+  bool FoldValueComparisonIntoPredecessors(Instruction *TI,
                                            IRBuilder<> &Builder);
 
   bool SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder);
@@ -219,7 +219,7 @@ public:
 /// Return true if it is safe to merge these two
 /// terminator instructions together.
 static bool
-SafeToMergeTerminators(TerminatorInst *SI1, TerminatorInst *SI2,
+SafeToMergeTerminators(Instruction *SI1, Instruction *SI2,
                        SmallSetVector<BasicBlock *, 4> *FailBlocks = nullptr) {
   if (SI1 == SI2)
     return false; // Can't merge with self!
@@ -670,7 +670,7 @@ private:
 
 } // end anonymous namespace
 
-static void EraseTerminatorInstAndDCECond(TerminatorInst *TI) {
+static void EraseTerminatorAndDCECond(Instruction *TI) {
   Instruction *Cond = nullptr;
   if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
     Cond = dyn_cast<Instruction>(SI->getCondition());
@@ -688,7 +688,7 @@ static void EraseTerminatorInstAndDCECond(TerminatorInst *TI) {
 
 /// Return true if the specified terminator checks
 /// to see if a value is equal to constant integer value.
-Value *SimplifyCFGOpt::isValueEqualityComparison(TerminatorInst *TI) {
+Value *SimplifyCFGOpt::isValueEqualityComparison(Instruction *TI) {
   Value *CV = nullptr;
   if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
     // Do not permit merging of large switch instructions into their
@@ -716,7 +716,7 @@ Value *SimplifyCFGOpt::isValueEqualityComparison(TerminatorInst *TI) {
 /// Given a value comparison instruction,
 /// decode all of the 'cases' that it represents and return the 'default' block.
 BasicBlock *SimplifyCFGOpt::GetValueEqualityComparisonCases(
-    TerminatorInst *TI, std::vector<ValueEqualityComparisonCase> &Cases) {
+    Instruction *TI, std::vector<ValueEqualityComparisonCase> &Cases) {
   if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
     Cases.reserve(SI->getNumCases());
     for (auto Case : SI->cases())
@@ -806,7 +806,7 @@ static void setBranchWeights(Instruction *I, uint32_t TrueWeight,
 /// determines the outcome of this comparison. If so, simplify TI. This does a
 /// very limited form of jump threading.
 bool SimplifyCFGOpt::SimplifyEqualityComparisonWithOnlyPredecessor(
-    TerminatorInst *TI, BasicBlock *Pred, IRBuilder<> &Builder) {
+    Instruction *TI, BasicBlock *Pred, IRBuilder<> &Builder) {
   Value *PredVal = isValueEqualityComparison(Pred->getTerminator());
   if (!PredVal)
     return false; // Not a value comparison in predecessor.
@@ -854,7 +854,7 @@ bool SimplifyCFGOpt::SimplifyEqualityComparisonWithOnlyPredecessor(
                         << "Through successor TI: " << *TI << "Leaving: " << *NI
                         << "\n");
 
-      EraseTerminatorInstAndDCECond(TI);
+      EraseTerminatorAndDCECond(TI);
       return true;
     }
 
@@ -936,7 +936,7 @@ bool SimplifyCFGOpt::SimplifyEqualityComparisonWithOnlyPredecessor(
                     << "Through successor TI: " << *TI << "Leaving: " << *NI
                     << "\n");
 
-  EraseTerminatorInstAndDCECond(TI);
+  EraseTerminatorAndDCECond(TI);
   return true;
 }
 
@@ -971,10 +971,10 @@ static inline bool HasBranchWeights(const Instruction *I) {
   return false;
 }
 
-/// Get Weights of a given TerminatorInst, the default weight is at the front
+/// Get Weights of a given terminator, the default weight is at the front
 /// of the vector. If TI is a conditional eq, we need to swap the branch-weight
 /// metadata.
-static void GetBranchWeights(TerminatorInst *TI,
+static void GetBranchWeights(Instruction *TI,
                              SmallVectorImpl<uint64_t> &Weights) {
   MDNode *MD = TI->getMetadata(LLVMContext::MD_prof);
   assert(MD);
@@ -1008,7 +1008,7 @@ static void FitWeights(MutableArrayRef<uint64_t> Weights) {
 /// (either a switch or a branch on "X == c").
 /// See if any of the predecessors of the terminator block are value comparisons
 /// on the same value.  If so, and if safe to do so, fold them together.
-bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
+bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(Instruction *TI,
                                                          IRBuilder<> &Builder) {
   BasicBlock *BB = TI->getParent();
   Value *CV = isValueEqualityComparison(TI); // CondVal
@@ -1020,7 +1020,7 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
     BasicBlock *Pred = Preds.pop_back_val();
 
     // See if the predecessor is a comparison with the same value.
-    TerminatorInst *PTI = Pred->getTerminator();
+    Instruction *PTI = Pred->getTerminator();
     Value *PCV = isValueEqualityComparison(PTI); // PredCondVal
 
     if (PCV == CV && TI != PTI) {
@@ -1197,7 +1197,7 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
         setBranchWeights(NewSI, MDWeights);
       }
 
-      EraseTerminatorInstAndDCECond(PTI);
+      EraseTerminatorAndDCECond(PTI);
 
       // Okay, last check.  If BB is still a successor of PSI, then we must
       // have an infinite loop case.  If so, add an infinitely looping block
@@ -1413,7 +1413,7 @@ HoistTerminator:
   for (BasicBlock *Succ : successors(BB1))
     AddPredecessorToBlock(Succ, BIParent, BB1);
 
-  EraseTerminatorInstAndDCECond(BI);
+  EraseTerminatorAndDCECond(BI);
   return true;
 }
 
@@ -2247,7 +2247,7 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout &DL,
 
     // Loop over all of the edges from PredBB to BB, changing them to branch
     // to EdgeBB instead.
-    TerminatorInst *PredBBTI = PredBB->getTerminator();
+    Instruction *PredBBTI = PredBB->getTerminator();
     for (unsigned i = 0, e = PredBBTI->getNumSuccessors(); i != e; ++i)
       if (PredBBTI->getSuccessor(i) == BB) {
         BB->removePredecessor(PredBB);
@@ -2408,7 +2408,7 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
   // At this point, IfBlock1 and IfBlock2 are both empty, so our if statement
   // has been flattened.  Change DomBlock to jump directly to our new block to
   // avoid other simplifycfg's kicking in on the diamond.
-  TerminatorInst *OldTI = DomBlock->getTerminator();
+  Instruction *OldTI = DomBlock->getTerminator();
   Builder.SetInsertPoint(OldTI);
   Builder.CreateBr(BB);
   OldTI->eraseFromParent();
@@ -2442,7 +2442,7 @@ static bool SimplifyCondBranchToTwoReturns(BranchInst *BI,
     TrueSucc->removePredecessor(BI->getParent());
     FalseSucc->removePredecessor(BI->getParent());
     Builder.CreateRetVoid();
-    EraseTerminatorInstAndDCECond(BI);
+    EraseTerminatorAndDCECond(BI);
     return true;
   }
 
@@ -2498,7 +2498,7 @@ static bool SimplifyCondBranchToTwoReturns(BranchInst *BI,
                     << "\n  " << *BI << "NewRet = " << *RI << "TRUEBLOCK: "
                     << *TrueSucc << "FALSEBLOCK: " << *FalseSucc);
 
-  EraseTerminatorInstAndDCECond(BI);
+  EraseTerminatorAndDCECond(BI);
 
   return true;
 }
@@ -2822,7 +2822,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) {
       }
       // Change PBI from Conditional to Unconditional.
       BranchInst *New_PBI = BranchInst::Create(TrueDest, PBI);
-      EraseTerminatorInstAndDCECond(PBI);
+      EraseTerminatorAndDCECond(PBI);
       PBI = New_PBI;
     }
 
@@ -3417,7 +3417,7 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
 // Takes care of updating the successors and removing the old terminator.
 // Also makes sure not to introduce new successors by assuming that edges to
 // non-successor TrueBBs and FalseBBs aren't reachable.
-static bool SimplifyTerminatorOnSelect(TerminatorInst *OldTerm, Value *Cond,
+static bool SimplifyTerminatorOnSelect(Instruction *OldTerm, Value *Cond,
                                        BasicBlock *TrueBB, BasicBlock *FalseBB,
                                        uint32_t TrueWeight,
                                        uint32_t FalseWeight) {
@@ -3472,7 +3472,7 @@ static bool SimplifyTerminatorOnSelect(TerminatorInst *OldTerm, Value *Cond,
       Builder.CreateBr(FalseBB);
   }
 
-  EraseTerminatorInstAndDCECond(OldTerm);
+  EraseTerminatorAndDCECond(OldTerm);
   return true;
 }
 
@@ -3715,7 +3715,7 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder,
     BasicBlock *NewBB =
         BB->splitBasicBlock(BI->getIterator(), "switch.early.test");
     // Remove the uncond branch added to the old block.
-    TerminatorInst *OldTI = BB->getTerminator();
+    Instruction *OldTI = BB->getTerminator();
     Builder.SetInsertPoint(OldTI);
 
     if (TrueWhenEqual)
@@ -3759,7 +3759,7 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder,
   }
 
   // Erase the old branch instruction.
-  EraseTerminatorInstAndDCECond(BI);
+  EraseTerminatorAndDCECond(BI);
 
   LLVM_DEBUG(dbgs() << "  ** 'icmp' chain result is:\n" << *BB << '\n');
   return true;
@@ -4007,7 +4007,7 @@ static bool removeEmptyCleanup(CleanupReturnInst *RI) {
     if (UnwindDest == nullptr) {
       removeUnwindEdge(PredBB);
     } else {
-      TerminatorInst *TI = PredBB->getTerminator();
+      Instruction *TI = PredBB->getTerminator();
       TI->replaceUsesOfWith(BB, UnwindDest);
     }
   }
@@ -4076,7 +4076,7 @@ bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) {
   SmallVector<BranchInst *, 8> CondBranchPreds;
   for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
     BasicBlock *P = *PI;
-    TerminatorInst *PTI = P->getTerminator();
+    Instruction *PTI = P->getTerminator();
     if (BranchInst *BI = dyn_cast<BranchInst>(PTI)) {
       if (BI->isUnconditional())
         UncondBranchPreds.push_back(P);
@@ -4181,7 +4181,7 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {
 
   SmallVector<BasicBlock *, 8> Preds(pred_begin(BB), pred_end(BB));
   for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
-    TerminatorInst *TI = Preds[i]->getTerminator();
+    Instruction *TI = Preds[i]->getTerminator();
     IRBuilder<> Builder(TI);
     if (auto *BI = dyn_cast<BranchInst>(TI)) {
       if (BI->isUnconditional()) {
@@ -4193,10 +4193,10 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {
       } else {
         if (BI->getSuccessor(0) == BB) {
           Builder.CreateBr(BI->getSuccessor(1));
-          EraseTerminatorInstAndDCECond(BI);
+          EraseTerminatorAndDCECond(BI);
         } else if (BI->getSuccessor(1) == BB) {
           Builder.CreateBr(BI->getSuccessor(0));
-          EraseTerminatorInstAndDCECond(BI);
+          EraseTerminatorAndDCECond(BI);
           Changed = true;
         }
       }
@@ -4438,7 +4438,7 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, AssumptionCache *AC,
     SplitBlock(&*NewDefault, &NewDefault->front());
     auto *OldTI = NewDefault->getTerminator();
     new UnreachableInst(SI->getContext(), OldTI);
-    EraseTerminatorInstAndDCECond(OldTI);
+    EraseTerminatorAndDCECond(OldTI);
     return true;
   }
 
@@ -4649,12 +4649,12 @@ GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest,
   SmallDenseMap<Value *, Constant *> ConstantPool;
   ConstantPool.insert(std::make_pair(SI->getCondition(), CaseVal));
   for (Instruction &I :CaseDest->instructionsWithoutDebug()) {
-    if (TerminatorInst *T = dyn_cast<TerminatorInst>(&I)) {
+    if (I.isTerminator()) {
       // If the terminator is a simple branch, continue to the next block.
-      if (T->getNumSuccessors() != 1 || T->isExceptionalTerminator())
+      if (I.getNumSuccessors() != 1 || I.isExceptionalTerminator())
         return false;
       Pred = CaseDest;
-      CaseDest = T->getSuccessor(0);
+      CaseDest = I.getSuccessor(0);
     } else if (Constant *C = ConstantFold(&I, DL, ConstantPool)) {
       // Instruction is side-effect free and constant.
 
@@ -5663,14 +5663,14 @@ bool SimplifyCFGOpt::SimplifyIndirectBr(IndirectBrInst *IBI) {
   if (IBI->getNumDestinations() == 0) {
     // If the indirectbr has no successors, change it to unreachable.
     new UnreachableInst(IBI->getContext(), IBI);
-    EraseTerminatorInstAndDCECond(IBI);
+    EraseTerminatorAndDCECond(IBI);
     return true;
   }
 
   if (IBI->getNumDestinations() == 1) {
     // If the indirectbr has one successor, change it to a direct branch.
     BranchInst::Create(IBI->getDestination(0), IBI);
-    EraseTerminatorInstAndDCECond(IBI);
+    EraseTerminatorAndDCECond(IBI);
     return true;
   }
 
@@ -5892,7 +5892,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
     } else {
       // If Successor #1 has multiple preds, we may be able to conditionally
       // execute Successor #0 if it branches to Successor #1.
-      TerminatorInst *Succ0TI = BI->getSuccessor(0)->getTerminator();
+      Instruction *Succ0TI = BI->getSuccessor(0)->getTerminator();
       if (Succ0TI->getNumSuccessors() == 1 &&
           Succ0TI->getSuccessor(0) == BI->getSuccessor(1))
         if (SpeculativelyExecuteBB(BI, BI->getSuccessor(0), TTI))
@@ -5901,7 +5901,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
   } else if (BI->getSuccessor(1)->getSinglePredecessor()) {
     // If Successor #0 has multiple preds, we may be able to conditionally
     // execute Successor #1 if it branches to Successor #0.
-    TerminatorInst *Succ1TI = BI->getSuccessor(1)->getTerminator();
+    Instruction *Succ1TI = BI->getSuccessor(1)->getTerminator();
     if (Succ1TI->getNumSuccessors() == 1 &&
         Succ1TI->getSuccessor(0) == BI->getSuccessor(0))
       if (SpeculativelyExecuteBB(BI, BI->getSuccessor(1), TTI))
@@ -5991,7 +5991,7 @@ static bool removeUndefIntroducingPredecessor(BasicBlock *BB) {
   for (PHINode &PHI : BB->phis())
     for (unsigned i = 0, e = PHI.getNumIncomingValues(); i != e; ++i)
       if (passingValueIsAlwaysUndefined(PHI.getIncomingValue(i), &PHI)) {
-        TerminatorInst *T = PHI.getIncomingBlock(i)->getTerminator();
+        Instruction *T = PHI.getIncomingBlock(i)->getTerminator();
         IRBuilder<> Builder(T);
         if (BranchInst *BI = dyn_cast<BranchInst>(T)) {
           BB->removePredecessor(PHI.getIncomingBlock(i));
diff --git a/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
index b6307acb947..0f42694e193 100644
--- a/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
+++ b/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
@@ -268,7 +268,7 @@ VPRegionBlock *PlainCFGBuilder::buildPlainCFG() {
     // Set VPBB successors. We create empty VPBBs for successors if they don't
     // exist already. Recipes will be created when the successor is visited
     // during the RPO traversal.
-    TerminatorInst *TI = BB->getTerminator();
+    Instruction *TI = BB->getTerminator();
     assert(TI && "Terminator expected.");
     unsigned NumSuccs = TI->getNumSuccessors();
 
-- 
GitLab


From 2b7e80d846d5029709839555a1e1f7eabd17fb8f Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Mon, 15 Oct 2018 10:10:54 +0000
Subject: [PATCH 0190/1116] [TI removal] Rework `InstVisitor` to support
 visiting instructions that are terminators without relying on the specific
 `TerminatorInst` type.

This required cleaning up two users of `InstVisitor`s usage of
`TerminatorInst` as well.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344503 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/IR/InstVisitor.h  | 43 +++++++++++++++++++++++++---------
 lib/IR/Verifier.cpp            | 32 ++++++++++++-------------
 lib/Transforms/Scalar/SCCP.cpp | 16 ++++++-------
 3 files changed, 56 insertions(+), 35 deletions(-)

diff --git a/include/llvm/IR/InstVisitor.h b/include/llvm/IR/InstVisitor.h
index 55536f237d4..554417f984a 100644
--- a/include/llvm/IR/InstVisitor.h
+++ b/include/llvm/IR/InstVisitor.h
@@ -166,15 +166,6 @@ public:
   // Specific Instruction type classes... note that all of the casts are
   // necessary because we use the instruction classes as opaque types...
   //
-  RetTy visitReturnInst(ReturnInst &I)            { DELEGATE(TerminatorInst);}
-  RetTy visitBranchInst(BranchInst &I)            { DELEGATE(TerminatorInst);}
-  RetTy visitSwitchInst(SwitchInst &I)            { DELEGATE(TerminatorInst);}
-  RetTy visitIndirectBrInst(IndirectBrInst &I)    { DELEGATE(TerminatorInst);}
-  RetTy visitResumeInst(ResumeInst &I)            { DELEGATE(TerminatorInst);}
-  RetTy visitUnreachableInst(UnreachableInst &I)  { DELEGATE(TerminatorInst);}
-  RetTy visitCleanupReturnInst(CleanupReturnInst &I) { DELEGATE(TerminatorInst);}
-  RetTy visitCatchReturnInst(CatchReturnInst &I)  { DELEGATE(TerminatorInst); }
-  RetTy visitCatchSwitchInst(CatchSwitchInst &I)  { DELEGATE(TerminatorInst);}
   RetTy visitICmpInst(ICmpInst &I)                { DELEGATE(CmpInst);}
   RetTy visitFCmpInst(FCmpInst &I)                { DELEGATE(CmpInst);}
   RetTy visitAllocaInst(AllocaInst &I)            { DELEGATE(UnaryInstruction);}
@@ -236,6 +227,37 @@ public:
     return static_cast<SubClass*>(this)->visitCallSite(&I);
   }
 
+  // While terminators don't have a distinct type modeling them, we support
+  // intercepting them with dedicated a visitor callback.
+  RetTy visitReturnInst(ReturnInst &I) {
+    return static_cast<SubClass *>(this)->visitTerminator(I);
+  }
+  RetTy visitBranchInst(BranchInst &I) {
+    return static_cast<SubClass *>(this)->visitTerminator(I);
+  }
+  RetTy visitSwitchInst(SwitchInst &I) {
+    return static_cast<SubClass *>(this)->visitTerminator(I);
+  }
+  RetTy visitIndirectBrInst(IndirectBrInst &I) {
+    return static_cast<SubClass *>(this)->visitTerminator(I);
+  }
+  RetTy visitResumeInst(ResumeInst &I) {
+    return static_cast<SubClass *>(this)->visitTerminator(I);
+  }
+  RetTy visitUnreachableInst(UnreachableInst &I) {
+    return static_cast<SubClass *>(this)->visitTerminator(I);
+  }
+  RetTy visitCleanupReturnInst(CleanupReturnInst &I) {
+    return static_cast<SubClass *>(this)->visitTerminator(I);
+  }
+  RetTy visitCatchReturnInst(CatchReturnInst &I) {
+    return static_cast<SubClass *>(this)->visitTerminator(I);
+  }
+  RetTy visitCatchSwitchInst(CatchSwitchInst &I) {
+    return static_cast<SubClass *>(this)->visitTerminator(I);
+  }
+  RetTy visitTerminator(Instruction &I)    { DELEGATE(Instruction);}
+
   // Next level propagators: If the user does not overload a specific
   // instruction type, they can overload one of these to get the whole class
   // of instructions...
@@ -243,7 +265,6 @@ public:
   RetTy visitCastInst(CastInst &I)                { DELEGATE(UnaryInstruction);}
   RetTy visitBinaryOperator(BinaryOperator &I)    { DELEGATE(Instruction);}
   RetTy visitCmpInst(CmpInst &I)                  { DELEGATE(Instruction);}
-  RetTy visitTerminatorInst(TerminatorInst &I)    { DELEGATE(Instruction);}
   RetTy visitUnaryInstruction(UnaryInstruction &I){ DELEGATE(Instruction);}
 
   // Provide a special visitor for a 'callsite' that visits both calls and
@@ -256,7 +277,7 @@ public:
       DELEGATE(Instruction);
 
     assert(CS.isInvoke());
-    DELEGATE(TerminatorInst);
+    return static_cast<SubClass *>(this)->visitTerminator(I);
   }
 
   // If the user wants a 'default' case, they can choose to override this
diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp
index d96555ca5f9..6e0bb5ad358 100644
--- a/lib/IR/Verifier.cpp
+++ b/lib/IR/Verifier.cpp
@@ -287,7 +287,7 @@ class Verifier : public InstVisitor<Verifier>, VerifierSupport {
 
   // Maps catchswitches and cleanuppads that unwind to siblings to the
   // terminators that indicate the unwind, used to detect cycles therein.
-  MapVector<Instruction *, TerminatorInst *> SiblingFuncletInfo;
+  MapVector<Instruction *, Instruction *> SiblingFuncletInfo;
 
   /// Cache of constants visited in search of ConstantExprs.
   SmallPtrSet<const Constant *, 32> ConstantExprVisited;
@@ -457,7 +457,7 @@ private:
   void visitStoreInst(StoreInst &SI);
   void verifyDominatesUse(Instruction &I, unsigned i);
   void visitInstruction(Instruction &I);
-  void visitTerminatorInst(TerminatorInst &I);
+  void visitTerminator(Instruction &I);
   void visitBranchInst(BranchInst &BI);
   void visitReturnInst(ReturnInst &RI);
   void visitSwitchInst(SwitchInst &SI);
@@ -2009,7 +2009,7 @@ void Verifier::verifyFrameRecoverIndices() {
   }
 }
 
-static Instruction *getSuccPad(TerminatorInst *Terminator) {
+static Instruction *getSuccPad(Instruction *Terminator) {
   BasicBlock *UnwindDest;
   if (auto *II = dyn_cast<InvokeInst>(Terminator))
     UnwindDest = II->getUnwindDest();
@@ -2028,7 +2028,7 @@ void Verifier::verifySiblingFuncletUnwinds() {
     if (Visited.count(PredPad))
       continue;
     Active.insert(PredPad);
-    TerminatorInst *Terminator = Pair.second;
+    Instruction *Terminator = Pair.second;
     do {
       Instruction *SuccPad = getSuccPad(Terminator);
       if (Active.count(SuccPad)) {
@@ -2037,7 +2037,7 @@ void Verifier::verifySiblingFuncletUnwinds() {
         SmallVector<Instruction *, 8> CycleNodes;
         do {
           CycleNodes.push_back(CyclePad);
-          TerminatorInst *CycleTerminator = SiblingFuncletInfo[CyclePad];
+          Instruction *CycleTerminator = SiblingFuncletInfo[CyclePad];
           if (CycleTerminator != CyclePad)
             CycleNodes.push_back(CycleTerminator);
           CyclePad = getSuccPad(CycleTerminator);
@@ -2352,7 +2352,7 @@ void Verifier::visitBasicBlock(BasicBlock &BB) {
   }
 }
 
-void Verifier::visitTerminatorInst(TerminatorInst &I) {
+void Verifier::visitTerminator(Instruction &I) {
   // Ensure that terminators only exist at the end of the basic block.
   Assert(&I == I.getParent()->getTerminator(),
          "Terminator found in the middle of a basic block!", I.getParent());
@@ -2364,7 +2364,7 @@ void Verifier::visitBranchInst(BranchInst &BI) {
     Assert(BI.getCondition()->getType()->isIntegerTy(1),
            "Branch condition is not 'i1' type!", &BI, BI.getCondition());
   }
-  visitTerminatorInst(BI);
+  visitTerminator(BI);
 }
 
 void Verifier::visitReturnInst(ReturnInst &RI) {
@@ -2383,7 +2383,7 @@ void Verifier::visitReturnInst(ReturnInst &RI) {
 
   // Check to make sure that the return value has necessary properties for
   // terminators...
-  visitTerminatorInst(RI);
+  visitTerminator(RI);
 }
 
 void Verifier::visitSwitchInst(SwitchInst &SI) {
@@ -2398,7 +2398,7 @@ void Verifier::visitSwitchInst(SwitchInst &SI) {
            "Duplicate integer as switch case", &SI, Case.getCaseValue());
   }
 
-  visitTerminatorInst(SI);
+  visitTerminator(SI);
 }
 
 void Verifier::visitIndirectBrInst(IndirectBrInst &BI) {
@@ -2408,7 +2408,7 @@ void Verifier::visitIndirectBrInst(IndirectBrInst &BI) {
     Assert(BI.getDestination(i)->getType()->isLabelTy(),
            "Indirectbr destinations must all have pointer type!", &BI);
 
-  visitTerminatorInst(BI);
+  visitTerminator(BI);
 }
 
 void Verifier::visitSelectInst(SelectInst &SI) {
@@ -2987,7 +2987,7 @@ void Verifier::visitInvokeInst(InvokeInst &II) {
       "The unwind destination does not have an exception handling instruction!",
       &II);
 
-  visitTerminatorInst(II);
+  visitTerminator(II);
 }
 
 /// visitBinaryOperator - Check that both arguments to the binary operator are
@@ -3538,7 +3538,7 @@ void Verifier::visitResumeInst(ResumeInst &RI) {
            "inside a function.",
            &RI);
 
-  visitTerminatorInst(RI);
+  visitTerminator(RI);
 }
 
 void Verifier::visitCatchPadInst(CatchPadInst &CPI) {
@@ -3566,7 +3566,7 @@ void Verifier::visitCatchReturnInst(CatchReturnInst &CatchReturn) {
          "CatchReturnInst needs to be provided a CatchPad", &CatchReturn,
          CatchReturn.getOperand(0));
 
-  visitTerminatorInst(CatchReturn);
+  visitTerminator(CatchReturn);
 }
 
 void Verifier::visitCleanupPadInst(CleanupPadInst &CPI) {
@@ -3687,7 +3687,7 @@ void Verifier::visitFuncletPadInst(FuncletPadInst &FPI) {
           // Record cleanup sibling unwinds for verifySiblingFuncletUnwinds
           if (isa<CleanupPadInst>(&FPI) && !isa<ConstantTokenNone>(UnwindPad) &&
               getParentPad(UnwindPad) == getParentPad(&FPI))
-            SiblingFuncletInfo[&FPI] = cast<TerminatorInst>(U);
+            SiblingFuncletInfo[&FPI] = cast<Instruction>(U);
         }
       }
       // Make sure we visit all uses of FPI, but for nested pads stop as
@@ -3788,7 +3788,7 @@ void Verifier::visitCatchSwitchInst(CatchSwitchInst &CatchSwitch) {
   }
 
   visitEHPadPredecessors(CatchSwitch);
-  visitTerminatorInst(CatchSwitch);
+  visitTerminator(CatchSwitch);
 }
 
 void Verifier::visitCleanupReturnInst(CleanupReturnInst &CRI) {
@@ -3804,7 +3804,7 @@ void Verifier::visitCleanupReturnInst(CleanupReturnInst &CRI) {
            &CRI);
   }
 
-  visitTerminatorInst(CRI);
+  visitTerminator(CRI);
 }
 
 void Verifier::verifyDominatesUse(Instruction &I, unsigned i) {
diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp
index 11e5549c332..b7340f294fd 100644
--- a/lib/Transforms/Scalar/SCCP.cpp
+++ b/lib/Transforms/Scalar/SCCP.cpp
@@ -563,7 +563,7 @@ private:
 
   // getFeasibleSuccessors - Return a vector of booleans to indicate which
   // successors are reachable from a given terminator instruction.
-  void getFeasibleSuccessors(TerminatorInst &TI, SmallVectorImpl<bool> &Succs);
+  void getFeasibleSuccessors(Instruction &TI, SmallVectorImpl<bool> &Succs);
 
   // OperandChangedState - This method is invoked on all of the users of an
   // instruction that was just changed state somehow.  Based on this
@@ -604,7 +604,7 @@ private:
   // Terminators
 
   void visitReturnInst(ReturnInst &I);
-  void visitTerminatorInst(TerminatorInst &TI);
+  void visitTerminator(Instruction &TI);
 
   void visitCastInst(CastInst &I);
   void visitSelectInst(SelectInst &I);
@@ -615,7 +615,7 @@ private:
 
   void visitCatchSwitchInst(CatchSwitchInst &CPI) {
     markOverdefined(&CPI);
-    visitTerminatorInst(CPI);
+    visitTerminator(CPI);
   }
 
   // Instructions that cannot be folded away.
@@ -630,12 +630,12 @@ private:
 
   void visitInvokeInst    (InvokeInst &II) {
     visitCallSite(&II);
-    visitTerminatorInst(II);
+    visitTerminator(II);
   }
 
   void visitCallSite      (CallSite CS);
-  void visitResumeInst    (TerminatorInst &I) { /*returns void*/ }
-  void visitUnreachableInst(TerminatorInst &I) { /*returns void*/ }
+  void visitResumeInst    (ResumeInst &I) { /*returns void*/ }
+  void visitUnreachableInst(UnreachableInst &I) { /*returns void*/ }
   void visitFenceInst     (FenceInst &I) { /*returns void*/ }
 
   void visitInstruction(Instruction &I) {
@@ -650,7 +650,7 @@ private:
 
 // getFeasibleSuccessors - Return a vector of booleans to indicate which
 // successors are reachable from a given terminator instruction.
-void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI,
+void SCCPSolver::getFeasibleSuccessors(Instruction &TI,
                                        SmallVectorImpl<bool> &Succs) {
   Succs.resize(TI.getNumSuccessors());
   if (auto *BI = dyn_cast<BranchInst>(&TI)) {
@@ -837,7 +837,7 @@ void SCCPSolver::visitReturnInst(ReturnInst &I) {
   }
 }
 
-void SCCPSolver::visitTerminatorInst(TerminatorInst &TI) {
+void SCCPSolver::visitTerminator(Instruction &TI) {
   SmallVector<bool, 16> SuccFeasible;
   getFeasibleSuccessors(TI, SuccFeasible);
 
-- 
GitLab


From d8d8371469074d303b39e8047f74e382e5eeb8b7 Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Mon, 15 Oct 2018 10:42:50 +0000
Subject: [PATCH 0191/1116] [TI removal] Make `getTerminator()` return a
 generic `Instruction`.

This removes the primary remaining API producing `TerminatorInst` which
will reduce the rate at which code is introduced trying to use it and
generally make it much easier to remove the remaining APIs across the
codebase.

Also clean up some of the stragglers that the previous mechanical update
of variables missed.

Users of LLVM and out-of-tree code generally will need to update any
explicit variable types to handle this. Replacing `TerminatorInst` with
`Instruction` (or `auto`) almost always works. Most of these edits were
made in prior commits using the perl one-liner:
```
perl -i -ple 's/TerminatorInst(\b.* = .*getTerminator\(\))/Instruction\1/g'
```

This also my break some rare use cases where people overload for both
`Instruction` and `TerminatorInst`, but these should be easily fixed by
removing the `TerminatorInst` overload.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344504 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/IR/BasicBlock.h               | 21 ++++++++++-----------
 lib/IR/BasicBlock.cpp                      |  7 ++++---
 lib/Transforms/Coroutines/CoroFrame.cpp    |  2 +-
 lib/Transforms/Scalar/GVNHoist.cpp         |  2 +-
 lib/Transforms/Vectorize/SLPVectorizer.cpp | 18 +++++++++---------
 tools/bugpoint/CrashDebugger.cpp           |  2 +-
 tools/llvm-diff/DifferenceEngine.cpp       |  4 ++--
 unittests/IR/DominatorTreeTest.cpp         |  2 +-
 unittests/IR/IRBuilderTest.cpp             |  2 +-
 9 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/include/llvm/IR/BasicBlock.h b/include/llvm/IR/BasicBlock.h
index 1ee19975af7..7244bba1ca5 100644
--- a/include/llvm/IR/BasicBlock.h
+++ b/include/llvm/IR/BasicBlock.h
@@ -38,7 +38,6 @@ class LandingPadInst;
 class LLVMContext;
 class Module;
 class PHINode;
-class TerminatorInst;
 class ValueSymbolTable;
 
 /// LLVM Basic Block Representation
@@ -50,12 +49,12 @@ class ValueSymbolTable;
 /// represents a label to which a branch can jump.
 ///
 /// A well formed basic block is formed of a list of non-terminating
-/// instructions followed by a single TerminatorInst instruction.
-/// TerminatorInst's may not occur in the middle of basic blocks, and must
-/// terminate the blocks. The BasicBlock class allows malformed basic blocks to
-/// occur because it may be useful in the intermediate stage of constructing or
-/// modifying a program. However, the verifier will ensure that basic blocks
-/// are "well formed".
+/// instructions followed by a single terminator instruction. Terminator
+/// instructions may not occur in the middle of basic blocks, and must terminate
+/// the blocks. The BasicBlock class allows malformed basic blocks to occur
+/// because it may be useful in the intermediate stage of constructing or
+/// modifying a program. However, the verifier will ensure that basic blocks are
+/// "well formed".
 class BasicBlock final : public Value, // Basic blocks are data objects also
                          public ilist_node_with_parent<BasicBlock, Function> {
 public:
@@ -120,10 +119,10 @@ public:
 
   /// Returns the terminator instruction if the block is well formed or null
   /// if the block is not well formed.
-  const TerminatorInst *getTerminator() const LLVM_READONLY;
-  TerminatorInst *getTerminator() {
-    return const_cast<TerminatorInst *>(
-                        static_cast<const BasicBlock *>(this)->getTerminator());
+  const Instruction *getTerminator() const LLVM_READONLY;
+  Instruction *getTerminator() {
+    return const_cast<Instruction *>(
+        static_cast<const BasicBlock *>(this)->getTerminator());
   }
 
   /// Returns the call instruction calling \@llvm.experimental.deoptimize
diff --git a/lib/IR/BasicBlock.cpp b/lib/IR/BasicBlock.cpp
index 03fb5ccaffc..12ab2e2ace4 100644
--- a/lib/IR/BasicBlock.cpp
+++ b/lib/IR/BasicBlock.cpp
@@ -135,9 +135,10 @@ const Module *BasicBlock::getModule() const {
   return getParent()->getParent();
 }
 
-const TerminatorInst *BasicBlock::getTerminator() const {
-  if (InstList.empty()) return nullptr;
-  return dyn_cast<TerminatorInst>(&InstList.back());
+const Instruction *BasicBlock::getTerminator() const {
+  if (InstList.empty() || !InstList.back().isTerminator())
+    return nullptr;
+  return &InstList.back();
 }
 
 const CallInst *BasicBlock::getTerminatingMustTailCall() const {
diff --git a/lib/Transforms/Coroutines/CoroFrame.cpp b/lib/Transforms/Coroutines/CoroFrame.cpp
index 4357948d5ab..4cb0a52961c 100644
--- a/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -601,7 +601,7 @@ static Instruction *insertSpills(SpillInfo &Spills, coro::Shape &Shape) {
 }
 
 // Sets the unwind edge of an instruction to a particular successor.
-static void setUnwindEdgeTo(TerminatorInst *TI, BasicBlock *Succ) {
+static void setUnwindEdgeTo(Instruction *TI, BasicBlock *Succ) {
   if (auto *II = dyn_cast<InvokeInst>(TI))
     II->setUnwindDest(Succ);
   else if (auto *CS = dyn_cast<CatchSwitchInst>(TI))
diff --git a/lib/Transforms/Scalar/GVNHoist.cpp b/lib/Transforms/Scalar/GVNHoist.cpp
index 3043df9cca7..0797ce9adea 100644
--- a/lib/Transforms/Scalar/GVNHoist.cpp
+++ b/lib/Transforms/Scalar/GVNHoist.cpp
@@ -577,7 +577,7 @@ private:
   // Returns the edge via which an instruction in BB will get the values from.
 
   // Returns true when the values are flowing out to each edge.
-  bool valueAnticipable(CHIArgs C, TerminatorInst *TI) const {
+  bool valueAnticipable(CHIArgs C, Instruction *TI) const {
     if (TI->getNumSuccessors() > (unsigned)size(C))
       return false; // Not enough args in this CHI.
 
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 79b575b78cd..5fdbf219009 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1536,12 +1536,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       // Check for terminator values (e.g. invoke).
       for (unsigned j = 0; j < VL.size(); ++j)
         for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
-          TerminatorInst *Term = dyn_cast<TerminatorInst>(
-              cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i)));
-          if (Term) {
-            LLVM_DEBUG(
-                dbgs()
-                << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
+          Instruction *Term = dyn_cast<Instruction>(
+              cast<PHINode>(VL[j])->getIncomingValueForBlock(
+                  PH->getIncomingBlock(i)));
+          if (Term && Term->isTerminator()) {
+            LLVM_DEBUG(dbgs()
+                       << "SLP: Need to swizzle PHINodes (terminator use).\n");
             BS.cancelScheduling(VL, VL0);
             newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
             return;
@@ -3652,7 +3652,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
       if (PHINode *PH = dyn_cast<PHINode>(User)) {
         for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
           if (PH->getIncomingValue(i) == Scalar) {
-            TerminatorInst *IncomingTerminator =
+            Instruction *IncomingTerminator =
                 PH->getIncomingBlock(i)->getTerminator();
             if (isa<CatchSwitchInst>(IncomingTerminator)) {
               Builder.SetInsertPoint(VecI->getParent(),
@@ -3960,7 +3960,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
     ScheduleEnd = I->getNextNode();
     if (isOneOf(S, I) != I)
       CheckSheduleForI(I);
-    assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
+    assert(ScheduleEnd && "tried to vectorize a terminator?");
     LLVM_DEBUG(dbgs() << "SLP:  initialize schedule region to " << *I << "\n");
     return true;
   }
@@ -3996,7 +3996,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
         ScheduleEnd = I->getNextNode();
         if (isOneOf(S, I) != I)
           CheckSheduleForI(I);
-        assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
+        assert(ScheduleEnd && "tried to vectorize a terminator?");
         LLVM_DEBUG(dbgs() << "SLP:  extend schedule region end to " << *I
                           << "\n");
         return true;
diff --git a/tools/bugpoint/CrashDebugger.cpp b/tools/bugpoint/CrashDebugger.cpp
index e973bfef4dc..a50ff4c255b 100644
--- a/tools/bugpoint/CrashDebugger.cpp
+++ b/tools/bugpoint/CrashDebugger.cpp
@@ -409,7 +409,7 @@ bool ReduceCrashingBlocks::TestBlocks(std::vector<const BasicBlock *> &BBs) {
         for (BasicBlock *Succ : successors(&BB))
           Succ->removePredecessor(&BB);
 
-        TerminatorInst *BBTerm = BB.getTerminator();
+        Instruction *BBTerm = BB.getTerminator();
         if (BBTerm->isEHPad() || BBTerm->getType()->isTokenTy())
           continue;
         if (!BBTerm->getType()->isVoidTy())
diff --git a/tools/llvm-diff/DifferenceEngine.cpp b/tools/llvm-diff/DifferenceEngine.cpp
index b2673c1407f..acff8bb3e89 100644
--- a/tools/llvm-diff/DifferenceEngine.cpp
+++ b/tools/llvm-diff/DifferenceEngine.cpp
@@ -629,8 +629,8 @@ void FunctionDifferenceEngine::runBlockDiff(BasicBlock::iterator LStart,
   // If the terminators have different kinds, but one is an invoke and the
   // other is an unconditional branch immediately following a call, unify
   // the results and the destinations.
-  TerminatorInst *LTerm = LStart->getParent()->getTerminator();
-  TerminatorInst *RTerm = RStart->getParent()->getTerminator();
+  Instruction *LTerm = LStart->getParent()->getTerminator();
+  Instruction *RTerm = RStart->getParent()->getTerminator();
   if (isa<BranchInst>(LTerm) && isa<InvokeInst>(RTerm)) {
     if (cast<BranchInst>(LTerm)->isConditional()) return;
     BasicBlock::iterator I = LTerm->getIterator();
diff --git a/unittests/IR/DominatorTreeTest.cpp b/unittests/IR/DominatorTreeTest.cpp
index cf81623d0d1..7539bbc860b 100644
--- a/unittests/IR/DominatorTreeTest.cpp
+++ b/unittests/IR/DominatorTreeTest.cpp
@@ -301,7 +301,7 @@ TEST(DominatorTree, NonUniqueEdges) {
         BasicBlock *BB1 = &*FI++;
         BasicBlock *BB2 = &*FI++;
 
-        const TerminatorInst *TI = BB0->getTerminator();
+        const Instruction *TI = BB0->getTerminator();
         assert(TI->getNumSuccessors() == 3 && "Switch has three successors");
 
         BasicBlockEdge Edge_BB0_BB2(BB0, TI->getSuccessor(0));
diff --git a/unittests/IR/IRBuilderTest.cpp b/unittests/IR/IRBuilderTest.cpp
index 713c0a14f66..be29b41309a 100644
--- a/unittests/IR/IRBuilderTest.cpp
+++ b/unittests/IR/IRBuilderTest.cpp
@@ -160,7 +160,7 @@ TEST_F(IRBuilderTest, CreateCondBr) {
   BasicBlock *FBB = BasicBlock::Create(Ctx, "", F);
 
   BranchInst *BI = Builder.CreateCondBr(Builder.getTrue(), TBB, FBB);
-  TerminatorInst *TI = BB->getTerminator();
+  Instruction *TI = BB->getTerminator();
   EXPECT_EQ(BI, TI);
   EXPECT_EQ(2u, TI->getNumSuccessors());
   EXPECT_EQ(TBB, TI->getSuccessor(0));
-- 
GitLab


From 4b284c14ecdd7b86ec6d28370e7860012e546cae Mon Sep 17 00:00:00 2001
From: Fedor Sergeev <fedor.sergeev@azul.com>
Date: Mon, 15 Oct 2018 10:46:35 +0000
Subject: [PATCH 0192/1116] [NewPM] implement SCC printing for
 -print-before-all/-print-after-all

Removing deficiency of initial implementation of -print-before-all/-after-all
- it was effectively skipping IR printing for all the SCC passes.

Now LazyCallGraph:SCC gets its IR printed.

Reviewed By: skatkov
Differential Revision: https://reviews.llvm.org/D53270

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344505 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Passes/StandardInstrumentations.cpp | 32 ++++++++++++++--
 test/Other/scc-pass-printer.ll          | 49 +++++++++++++++++++++++++
 2 files changed, 77 insertions(+), 4 deletions(-)
 create mode 100644 test/Other/scc-pass-printer.ll

diff --git a/lib/Passes/StandardInstrumentations.cpp b/lib/Passes/StandardInstrumentations.cpp
index aa34584fa12..48d36e5a01e 100644
--- a/lib/Passes/StandardInstrumentations.cpp
+++ b/lib/Passes/StandardInstrumentations.cpp
@@ -37,10 +37,6 @@ namespace PrintIR {
 /// Generic IR-printing helper that unpacks a pointer to IRUnit wrapped into
 /// llvm::Any and does actual print job.
 void unwrapAndPrint(StringRef Banner, Any IR) {
-  if (any_isa<const CallGraphSCC *>(IR) ||
-      any_isa<const LazyCallGraph::SCC *>(IR))
-    return;
-
   SmallString<40> Extra{"\n"};
   const Module *M = nullptr;
   if (any_isa<const Module *>(IR)) {
@@ -55,6 +51,34 @@ void unwrapAndPrint(StringRef Banner, Any IR) {
     }
     M = F->getParent();
     Extra = formatv(" (function: {0})\n", F->getName());
+  } else if (any_isa<const LazyCallGraph::SCC *>(IR)) {
+    const LazyCallGraph::SCC *C = any_cast<const LazyCallGraph::SCC *>(IR);
+    assert(C);
+    if (!llvm::forcePrintModuleIR()) {
+      Extra = formatv(" (scc: {0})\n", C->getName());
+      bool BannerPrinted = false;
+      for (const LazyCallGraph::Node &N : *C) {
+        const Function &F = N.getFunction();
+        if (!F.isDeclaration() && isFunctionInPrintList(F.getName())) {
+          if (!BannerPrinted) {
+            dbgs() << Banner << Extra;
+            BannerPrinted = true;
+          }
+          F.print(dbgs());
+        }
+      }
+      return;
+    }
+    for (const LazyCallGraph::Node &N : *C) {
+      const Function &F = N.getFunction();
+      if (!F.isDeclaration() && isFunctionInPrintList(F.getName())) {
+        M = F.getParent();
+        break;
+      }
+    }
+    if (!M)
+      return;
+    Extra = formatv(" (for scc: {0})\n", C->getName());
   } else if (any_isa<const Loop *>(IR)) {
     const Loop *L = any_cast<const Loop *>(IR);
     const Function *F = L->getHeader()->getParent();
diff --git a/test/Other/scc-pass-printer.ll b/test/Other/scc-pass-printer.ll
new file mode 100644
index 00000000000..9d86bf03963
--- /dev/null
+++ b/test/Other/scc-pass-printer.ll
@@ -0,0 +1,49 @@
+; RUN: opt < %s 2>&1 -disable-output \
+; RUN: 	   -inline -print-after-all | FileCheck %s -check-prefix=INL
+; RUN: opt < %s 2>&1 -disable-output \
+; RUN: 	   -passes=inline -print-after-all | FileCheck %s -check-prefix=INL
+; RUN: opt < %s 2>&1 -disable-output \
+; RUN: 	   -inline -print-after-all -print-module-scope | FileCheck %s -check-prefix=INL-MOD
+; RUN: opt < %s 2>&1 -disable-output \
+; RUN: 	   -passes=inline -print-after-all -print-module-scope | FileCheck %s -check-prefix=INL-MOD
+
+; INL: IR Dump After {{Function Integration/Inlining|InlinerPass .*scc: .bar, foo}}
+; INL: define void @bar()
+; INL-NEXT:  call void @foo()
+; INL: define void @foo()
+; INL-NEXT:   call void @bar()
+; INL: IR Dump After {{Function Integration/Inlining|InlinerPass .*scc: .tester}}
+; INL: define void @tester()
+; INL-NEXT:  call void @foo()
+; INL: IR Dump After
+
+; INL-MOD: IR Dump After {{Function Integration/Inlining|InlinerPass .*scc: .bar, foo}}
+; INL-MOD: define void @tester()
+; INL-MOD-NEXT:  call void @foo()
+; INL-MOD: define void @foo()
+; INL-MOD-NEXT:   call void @bar()
+; INL-MOD: define void @bar()
+; INL-MOD-NEXT:  call void @foo()
+; INL-MOD: IR Dump After {{Function Integration/Inlining|InlinerPass .*scc: .tester}}
+; INL-MOD: define void @tester()
+; INL-MOD-NEXT:  call void @foo()
+; INL-MOD: define void @foo()
+; INL-MOD-NEXT:   call void @bar()
+; INL-MOD: define void @bar()
+; INL-MOD-NEXT:  call void @foo()
+; INL-MOD: IR Dump After
+
+define void @tester() noinline {
+  call void @foo()
+  ret void
+}
+
+define void @foo() noinline {
+  call void @bar()
+  ret void
+}
+
+define void @bar() noinline {
+  call void @foo()
+  ret void
+}
-- 
GitLab


From 69b3f302bf683ae0975608aa35fc9602f5026e39 Mon Sep 17 00:00:00 2001
From: Nicolai Haehnle <nhaehnle@gmail.com>
Date: Mon, 15 Oct 2018 11:37:04 +0000
Subject: [PATCH 0193/1116] AMDGPU: Test showing a scalar buffer load
 deficiency

Change-Id: I5b64a565f22a8482aa0712488d85e45163ac3d12

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344506 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/AMDGPU/smrd.ll | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/test/CodeGen/AMDGPU/smrd.ll b/test/CodeGen/AMDGPU/smrd.ll
index 612943b66c4..6596119f8b3 100644
--- a/test/CodeGen/AMDGPU/smrd.ll
+++ b/test/CodeGen/AMDGPU/smrd.ll
@@ -511,6 +511,29 @@ main_body:
   ret void
 }
 
+; GCN-LABEL: {{^}}smrd_uniform_loop:
+;
+; TODO: this should use an s_buffer_load
+;
+; GCN: buffer_load_dword
+define amdgpu_ps float @smrd_uniform_loop(<4 x i32> inreg %desc, i32 %bound) #0 {
+main_body:
+  br label %loop
+
+loop:
+  %counter = phi i32 [ 0, %main_body ], [ %counter.next, %loop ]
+  %sum = phi float [ 0.0, %main_body ], [ %sum.next, %loop ]
+  %offset = shl i32 %counter, 2
+  %v = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %offset)
+  %sum.next = fadd float %sum, %v
+  %counter.next = add i32 %counter, 1
+  %cc = icmp uge i32 %counter.next, %bound
+  br i1 %cc, label %exit, label %loop
+
+exit:
+  ret float %sum.next
+}
+
 
 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
 declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1
-- 
GitLab


From ee084ebe8c45915458aecd5dbb9ff131c63aee97 Mon Sep 17 00:00:00 2001
From: Aleksandar Beserminji <abeserminji@wavecomp.com>
Date: Mon, 15 Oct 2018 12:59:17 +0000
Subject: [PATCH 0194/1116] [mips][micromips] Fix overlaping FDEs error

When compiling static executable for micromips, CFI symbols
are incorrectly labeled as MICROMIPS, which cause
".eh_frame_hdr refers to overlapping FDEs." error.

This patch does not label CFI symbols as MICROMIPS, and FDEs do not
overlap anymore. This patch also exposes another bug, which is fixed
here: https://reviews.llvm.org/D52985

Differential Revision: https://reviews.llvm.org/D52987


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344511 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../Mips/MCTargetDesc/MipsELFStreamer.cpp     | 17 +++++++++
 .../Mips/MCTargetDesc/MipsELFStreamer.h       |  7 ++++
 test/DebugInfo/Mips/eh_frame.ll               | 38 +++++++++++++++++++
 3 files changed, 62 insertions(+)
 create mode 100644 test/DebugInfo/Mips/eh_frame.ll

diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
index 7b9a02503ce..21b01e85096 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
@@ -15,6 +15,7 @@
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDwarf.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCObjectWriter.h"
 #include "llvm/MC/MCSymbolELF.h"
@@ -53,6 +54,22 @@ void MipsELFStreamer::EmitInstruction(const MCInst &Inst,
   createPendingLabelRelocs();
 }
 
+void MipsELFStreamer::EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame) {
+  Frame.Begin = getContext().createTempSymbol();
+  MCELFStreamer::EmitLabel(Frame.Begin);
+}
+
+MCSymbol *MipsELFStreamer::EmitCFILabel() {
+  MCSymbol *Label = getContext().createTempSymbol("cfi", true);
+  MCELFStreamer::EmitLabel(Label);
+  return Label;
+}
+
+void MipsELFStreamer::EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) {
+  Frame.End = getContext().createTempSymbol();
+  MCELFStreamer::EmitLabel(Frame.End);
+}
+
 void MipsELFStreamer::createPendingLabelRelocs() {
   MipsTargetELFStreamer *ELFTargetStreamer =
       static_cast<MipsTargetELFStreamer *>(getTargetStreamer());
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
index d141f5d77c6..d140201494f 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
@@ -25,6 +25,7 @@ namespace llvm {
 class MCAsmBackend;
 class MCCodeEmitter;
 class MCContext;
+class MCDwarfFrameInfo;
 class MCSubtargetInfo;
 
 class MipsELFStreamer : public MCELFStreamer {
@@ -60,6 +61,12 @@ public:
   void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override;
   void EmitIntValue(uint64_t Value, unsigned Size) override;
 
+  // Overriding these functions allows us to avoid recording of these labels
+  // in EmitLabel and later marking them as microMIPS.
+  void EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame) override;
+  void EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) override;
+  MCSymbol *EmitCFILabel() override;
+
   /// Emits all the option records stored up until the point it's called.
   void EmitMipsOptionRecords();
 
diff --git a/test/DebugInfo/Mips/eh_frame.ll b/test/DebugInfo/Mips/eh_frame.ll
new file mode 100644
index 00000000000..4687443cb1c
--- /dev/null
+++ b/test/DebugInfo/Mips/eh_frame.ll
@@ -0,0 +1,38 @@
+; RUN: llc -mtriple mips-unknown-linux-gnu -mattr=+micromips -O3 -filetype=obj -o - %s | llvm-readelf -r | FileCheck %s
+
+; CHECK: .rel.eh_frame
+; CHECK: DW.ref.__gxx_personality_v0
+; CHECK-NEXT: .text
+; CHECK-NEXT: .gcc_except_table
+
+@_ZTIi = external constant i8*
+
+define dso_local i32 @main() local_unnamed_addr personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+entry:
+  %exception.i = tail call i8* @__cxa_allocate_exception(i32 4) nounwind
+  %0 = bitcast i8* %exception.i to i32*
+  store i32 5, i32* %0, align 16
+  invoke void @__cxa_throw(i8* %exception.i, i8* bitcast (i8** @_ZTIi to i8*), i8* null) noreturn
+          to label %.noexc unwind label %return
+
+.noexc:
+  unreachable
+
+return:
+  %1 = landingpad { i8*, i32 }
+          catch i8* null
+  %2 = extractvalue { i8*, i32 } %1, 0
+  %3 = tail call i8* @__cxa_begin_catch(i8* %2) nounwind
+  tail call void @__cxa_end_catch()
+  ret i32 0
+}
+
+declare i32 @__gxx_personality_v0(...)
+
+declare i8* @__cxa_begin_catch(i8*) local_unnamed_addr
+
+declare void @__cxa_end_catch() local_unnamed_addr
+
+declare i8* @__cxa_allocate_exception(i32) local_unnamed_addr
+
+declare void @__cxa_throw(i8*, i8*, i8*) local_unnamed_addr
-- 
GitLab


From 7f770f7d215d6109fb4a919776261b190c53735d Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 15 Oct 2018 13:20:41 +0000
Subject: [PATCH 0195/1116] [ARM][NEON] Improve vector popcnt lowering with
 PADDL (PR39281)

As I suggested on PR39281, this patch uses PADDL pairwise addition to widen from the vXi8 CTPOP result to the target vector type.

This is a blocker for moving more x86 code to generic vector CTPOP expansion (P32655 + D53258) - ARM's vXi64 CTPOP currently expands, which would generate a vXi64 MUL but ARM's custom lowering expands the general MUL case and vectors aren't well handled in LegalizeDAG - improving the CTPOP lowering was a lot easier than fixing the MUL lowering for this one case......

Differential Revision: https://reviews.llvm.org/D53257

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344512 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/ARM/ARMISelLowering.cpp | 156 +++++------------------------
 test/CodeGen/ARM/popcnt.ll         | 154 ++++------------------------
 2 files changed, 43 insertions(+), 267 deletions(-)

diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index bfff368a8fe..3527d049f50 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -669,8 +669,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::CTPOP,      MVT::v4i32, Custom);
     setOperationAction(ISD::CTPOP,      MVT::v4i16, Custom);
     setOperationAction(ISD::CTPOP,      MVT::v8i16, Custom);
-    setOperationAction(ISD::CTPOP,      MVT::v1i64, Expand);
-    setOperationAction(ISD::CTPOP,      MVT::v2i64, Expand);
+    setOperationAction(ISD::CTPOP,      MVT::v1i64, Custom);
+    setOperationAction(ISD::CTPOP,      MVT::v2i64, Custom);
 
     setOperationAction(ISD::CTLZ,       MVT::v1i64, Expand);
     setOperationAction(ISD::CTLZ,       MVT::v2i64, Expand);
@@ -5409,10 +5409,6 @@ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
 
     // Compute with: cttz(x) = ctpop(lsb - 1)
 
-    // Since we can only compute the number of bits in a byte with vcnt.8, we
-    // have to gather the result with pairwise addition (vpaddl) for i16, i32,
-    // and i64.
-
     // Compute LSB - 1.
     SDValue Bits;
     if (ElemTy == MVT::i64) {
@@ -5425,32 +5421,7 @@ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
                                 DAG.getTargetConstant(1, dl, ElemTy));
       Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
     }
-
-    // Count #bits with vcnt.8.
-    EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
-    SDValue BitsVT8 = DAG.getNode(ISD::BITCAST, dl, VT8Bit, Bits);
-    SDValue Cnt8 = DAG.getNode(ISD::CTPOP, dl, VT8Bit, BitsVT8);
-
-    // Gather the #bits with vpaddl (pairwise add.)
-    EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16;
-    SDValue Cnt16 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT16Bit,
-        DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32),
-        Cnt8);
-    if (ElemTy == MVT::i16)
-      return Cnt16;
-
-    EVT VT32Bit = VT.is64BitVector() ? MVT::v2i32 : MVT::v4i32;
-    SDValue Cnt32 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT32Bit,
-        DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32),
-        Cnt16);
-    if (ElemTy == MVT::i32)
-      return Cnt32;
-
-    assert(ElemTy == MVT::i64);
-    SDValue Cnt64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
-        DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32),
-        Cnt32);
-    return Cnt64;
+    return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
   }
 
   if (!ST->hasV6T2Ops())
@@ -5460,112 +5431,37 @@ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
   return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
 }
 
-/// getCTPOP16BitCounts - Returns a v8i8/v16i8 vector containing the bit-count
-/// for each 16-bit element from operand, repeated.  The basic idea is to
-/// leverage vcnt to get the 8-bit counts, gather and add the results.
-///
-/// Trace for v4i16:
-/// input    = [v0    v1    v2    v3   ] (vi 16-bit element)
-/// cast: N0 = [w0 w1 w2 w3 w4 w5 w6 w7] (v0 = [w0 w1], wi 8-bit element)
-/// vcnt: N1 = [b0 b1 b2 b3 b4 b5 b6 b7] (bi = bit-count of 8-bit element wi)
-/// vrev: N2 = [b1 b0 b3 b2 b5 b4 b7 b6]
-///            [b0 b1 b2 b3 b4 b5 b6 b7]
-///           +[b1 b0 b3 b2 b5 b4 b7 b6]
-/// N3=N1+N2 = [k0 k0 k1 k1 k2 k2 k3 k3] (k0 = b0+b1 = bit-count of 16-bit v0,
-/// vuzp:    = [k0 k1 k2 k3 k0 k1 k2 k3]  each ki is 8-bits)
-static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) {
-  EVT VT = N->getValueType(0);
-  SDLoc DL(N);
-
-  EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
-  SDValue N0 = DAG.getNode(ISD::BITCAST, DL, VT8Bit, N->getOperand(0));
-  SDValue N1 = DAG.getNode(ISD::CTPOP, DL, VT8Bit, N0);
-  SDValue N2 = DAG.getNode(ARMISD::VREV16, DL, VT8Bit, N1);
-  SDValue N3 = DAG.getNode(ISD::ADD, DL, VT8Bit, N1, N2);
-  return DAG.getNode(ARMISD::VUZP, DL, VT8Bit, N3, N3);
-}
-
-/// lowerCTPOP16BitElements - Returns a v4i16/v8i16 vector containing the
-/// bit-count for each 16-bit element from the operand.  We need slightly
-/// different sequencing for v4i16 and v8i16 to stay within NEON's available
-/// 64/128-bit registers.
-///
-/// Trace for v4i16:
-/// input           = [v0    v1    v2    v3    ] (vi 16-bit element)
-/// v8i8: BitCounts = [k0 k1 k2 k3 k0 k1 k2 k3 ] (ki is the bit-count of vi)
-/// v8i16:Extended  = [k0    k1    k2    k3    k0    k1    k2    k3    ]
-/// v4i16:Extracted = [k0    k1    k2    k3    ]
-static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) {
+static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
+                          const ARMSubtarget *ST) {
   EVT VT = N->getValueType(0);
   SDLoc DL(N);
 
-  SDValue BitCounts = getCTPOP16BitCounts(N, DAG);
-  if (VT.is64BitVector()) {
-    SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, BitCounts);
-    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Extended,
-                       DAG.getIntPtrConstant(0, DL));
-  } else {
-    SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8,
-                                    BitCounts, DAG.getIntPtrConstant(0, DL));
-    return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, Extracted);
-  }
-}
-
-/// lowerCTPOP32BitElements - Returns a v2i32/v4i32 vector containing the
-/// bit-count for each 32-bit element from the operand.  The idea here is
-/// to split the vector into 16-bit elements, leverage the 16-bit count
-/// routine, and then combine the results.
-///
-/// Trace for v2i32 (v4i32 similar with Extracted/Extended exchanged):
-/// input    = [v0    v1    ] (vi: 32-bit elements)
-/// Bitcast  = [w0 w1 w2 w3 ] (wi: 16-bit elements, v0 = [w0 w1])
-/// Counts16 = [k0 k1 k2 k3 ] (ki: 16-bit elements, bit-count of wi)
-/// vrev: N0 = [k1 k0 k3 k2 ]
-///            [k0 k1 k2 k3 ]
-///       N1 =+[k1 k0 k3 k2 ]
-///            [k0 k2 k1 k3 ]
-///       N2 =+[k1 k3 k0 k2 ]
-///            [k0    k2    k1    k3    ]
-/// Extended =+[k1    k3    k0    k2    ]
-///            [k0    k2    ]
-/// Extracted=+[k1    k3    ]
-///
-static SDValue lowerCTPOP32BitElements(SDNode *N, SelectionDAG &DAG) {
-  EVT VT = N->getValueType(0);
-  SDLoc DL(N);
+  assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
+  assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
+          VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
+         "Unexpected type for custom ctpop lowering");
 
-  EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16;
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
+  SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0));
+  Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res);
 
-  SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, VT16Bit, N->getOperand(0));
-  SDValue Counts16 = lowerCTPOP16BitElements(Bitcast.getNode(), DAG);
-  SDValue N0 = DAG.getNode(ARMISD::VREV32, DL, VT16Bit, Counts16);
-  SDValue N1 = DAG.getNode(ISD::ADD, DL, VT16Bit, Counts16, N0);
-  SDValue N2 = DAG.getNode(ARMISD::VUZP, DL, VT16Bit, N1, N1);
+  // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
+  unsigned EltSize = 8;
+  unsigned NumElts = VT.is64BitVector() ? 8 : 16;
+  while (EltSize != VT.getScalarSizeInBits()) {
+    SmallVector<SDValue, 8> Ops;
+    Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL,
+                                  TLI.getPointerTy(DAG.getDataLayout())));
+    Ops.push_back(Res);
 
-  if (VT.is64BitVector()) {
-    SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, N2);
-    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Extended,
-                       DAG.getIntPtrConstant(0, DL));
-  } else {
-    SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, N2,
-                                    DAG.getIntPtrConstant(0, DL));
-    return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, Extracted);
+    EltSize *= 2;
+    NumElts /= 2;
+    MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
+    Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops);
   }
-}
 
-static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
-                          const ARMSubtarget *ST) {
-  EVT VT = N->getValueType(0);
-
-  assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
-  assert((VT == MVT::v2i32 || VT == MVT::v4i32 ||
-          VT == MVT::v4i16 || VT == MVT::v8i16) &&
-         "Unexpected type for custom ctpop lowering");
-
-  if (VT.getVectorElementType() == MVT::i32)
-    return lowerCTPOP32BitElements(N, DAG);
-  else
-    return lowerCTPOP16BitElements(N, DAG);
+  return Res;
 }
 
 static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
diff --git a/test/CodeGen/ARM/popcnt.ll b/test/CodeGen/ARM/popcnt.ll
index 224d5dcb3a6..e3ce5cd1ff9 100644
--- a/test/CodeGen/ARM/popcnt.ll
+++ b/test/CodeGen/ARM/popcnt.ll
@@ -32,11 +32,7 @@ define <4 x i16> @vcnt16(<4 x i16>* %A) nounwind {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    vldr d16, [r0]
 ; CHECK-NEXT:    vcnt.8 d16, d16
-; CHECK-NEXT:    vrev16.8 d17, d16
-; CHECK-NEXT:    vadd.i8 d16, d16, d17
-; CHECK-NEXT:    vorr d17, d16, d16
-; CHECK-NEXT:    vuzp.8 d16, d17
-; CHECK-NEXT:    vmovl.u8 q8, d16
+; CHECK-NEXT:    vpaddl.u8 d16, d16
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <4 x i16>, <4 x i16>* %A
@@ -49,11 +45,7 @@ define <8 x i16> @vcntQ16(<8 x i16>* %A) nounwind {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
 ; CHECK-NEXT:    vcnt.8 q8, q8
-; CHECK-NEXT:    vrev16.8 q9, q8
-; CHECK-NEXT:    vadd.i8 q8, q8, q9
-; CHECK-NEXT:    vorr q9, q8, q8
-; CHECK-NEXT:    vuzp.8 q8, q9
-; CHECK-NEXT:    vmovl.u8 q8, d16
+; CHECK-NEXT:    vpaddl.u8 q8, q8
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    vmov r2, r3, d17
 ; CHECK-NEXT:    mov pc, lr
@@ -67,16 +59,8 @@ define <2 x i32> @vcnt32(<2 x i32>* %A) nounwind {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    vldr d16, [r0]
 ; CHECK-NEXT:    vcnt.8 d16, d16
-; CHECK-NEXT:    vrev16.8 d17, d16
-; CHECK-NEXT:    vadd.i8 d16, d16, d17
-; CHECK-NEXT:    vorr d17, d16, d16
-; CHECK-NEXT:    vuzp.8 d16, d17
-; CHECK-NEXT:    vmovl.u8 q8, d16
-; CHECK-NEXT:    vrev32.16 d18, d16
-; CHECK-NEXT:    vadd.i16 d16, d16, d18
-; CHECK-NEXT:    vorr d17, d16, d16
-; CHECK-NEXT:    vuzp.16 d16, d17
-; CHECK-NEXT:    vmovl.u16 q8, d16
+; CHECK-NEXT:    vpaddl.u8 d16, d16
+; CHECK-NEXT:    vpaddl.u16 d16, d16
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <2 x i32>, <2 x i32>* %A
@@ -89,16 +73,8 @@ define <4 x i32> @vcntQ32(<4 x i32>* %A) nounwind {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
 ; CHECK-NEXT:    vcnt.8 q8, q8
-; CHECK-NEXT:    vrev16.8 q9, q8
-; CHECK-NEXT:    vadd.i8 q8, q8, q9
-; CHECK-NEXT:    vorr q9, q8, q8
-; CHECK-NEXT:    vuzp.8 q8, q9
-; CHECK-NEXT:    vmovl.u8 q9, d16
-; CHECK-NEXT:    vrev32.16 q9, q9
-; CHECK-NEXT:    vaddw.u8 q8, q9, d16
-; CHECK-NEXT:    vorr q9, q8, q8
-; CHECK-NEXT:    vuzp.16 q8, q9
-; CHECK-NEXT:    vmovl.u16 q8, d16
+; CHECK-NEXT:    vpaddl.u8 q8, q8
+; CHECK-NEXT:    vpaddl.u16 q8, q8
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    vmov r2, r3, d17
 ; CHECK-NEXT:    mov pc, lr
@@ -110,50 +86,13 @@ define <4 x i32> @vcntQ32(<4 x i32>* %A) nounwind {
 define <1 x i64> @vcnt64(<1 x i64>* %A) nounwind {
 ; CHECK-LABEL: vcnt64:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    vldr d16, [r0]
-; CHECK-NEXT:    ldr r2, .LCPI6_0
-; CHECK-NEXT:    vmov.32 r0, d16[0]
-; CHECK-NEXT:    ldr r3, .LCPI6_3
-; CHECK-NEXT:    vmov.32 r1, d16[1]
-; CHECK-NEXT:    ldr lr, .LCPI6_2
-; CHECK-NEXT:    ldr r12, .LCPI6_1
-; CHECK-NEXT:    vldr s1, .LCPI6_4
-; CHECK-NEXT:    and r4, r2, r0, lsr #1
-; CHECK-NEXT:    sub r0, r0, r4
-; CHECK-NEXT:    and r2, r2, r1, lsr #1
-; CHECK-NEXT:    sub r1, r1, r2
-; CHECK-NEXT:    and r4, r0, r3
-; CHECK-NEXT:    and r0, r3, r0, lsr #2
-; CHECK-NEXT:    and r2, r1, r3
-; CHECK-NEXT:    add r0, r4, r0
-; CHECK-NEXT:    and r1, r3, r1, lsr #2
-; CHECK-NEXT:    add r1, r2, r1
-; CHECK-NEXT:    add r0, r0, r0, lsr #4
-; CHECK-NEXT:    and r0, r0, lr
-; CHECK-NEXT:    add r1, r1, r1, lsr #4
-; CHECK-NEXT:    mul r2, r0, r12
-; CHECK-NEXT:    and r0, r1, lr
-; CHECK-NEXT:    mul r1, r0, r12
-; CHECK-NEXT:    lsr r0, r2, #24
-; CHECK-NEXT:    add r0, r0, r1, lsr #24
-; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vmov r0, r1, d0
-; CHECK-NEXT:    pop {r4, lr}
+; CHECK-NEXT:    vcnt.8 d16, d16
+; CHECK-NEXT:    vpaddl.u8 d16, d16
+; CHECK-NEXT:    vpaddl.u16 d16, d16
+; CHECK-NEXT:    vpaddl.u32 d16, d16
+; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    mov pc, lr
-; CHECK-NEXT:    .p2align 2
-; CHECK-NEXT:  @ %bb.1:
-; CHECK-NEXT:  .LCPI6_0:
-; CHECK-NEXT:    .long 1431655765 @ 0x55555555
-; CHECK-NEXT:  .LCPI6_1:
-; CHECK-NEXT:    .long 16843009 @ 0x1010101
-; CHECK-NEXT:  .LCPI6_2:
-; CHECK-NEXT:    .long 252645135 @ 0xf0f0f0f
-; CHECK-NEXT:  .LCPI6_3:
-; CHECK-NEXT:    .long 858993459 @ 0x33333333
-; CHECK-NEXT:  .LCPI6_4:
-; CHECK-NEXT:    .long 0 @ float 0
 	%tmp1 = load <1 x i64>, <1 x i64>* %A
 	%tmp2 = call <1 x i64> @llvm.ctpop.v1i64(<1 x i64> %tmp1)
 	ret <1 x i64> %tmp2
@@ -162,73 +101,14 @@ define <1 x i64> @vcnt64(<1 x i64>* %A) nounwind {
 define <2 x i64> @vcntQ64(<2 x i64>* %A) nounwind {
 ; CHECK-LABEL: vcntQ64:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    .save {r4, r5, r6, lr}
-; CHECK-NEXT:    push {r4, r5, r6, lr}
 ; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
-; CHECK-NEXT:    vmov.32 r1, d17[1]
-; CHECK-NEXT:    ldr lr, .LCPI7_0
-; CHECK-NEXT:    vmov.32 r2, d17[0]
-; CHECK-NEXT:    ldr r0, .LCPI7_2
-; CHECK-NEXT:    vmov.32 r3, d16[0]
-; CHECK-NEXT:    ldr r12, .LCPI7_1
-; CHECK-NEXT:    ldr r5, .LCPI7_3
-; CHECK-NEXT:    vldr s3, .LCPI7_4
-; CHECK-NEXT:    and r4, lr, r1, lsr #1
-; CHECK-NEXT:    sub r1, r1, r4
-; CHECK-NEXT:    and r4, r1, r0
-; CHECK-NEXT:    and r1, r0, r1, lsr #2
-; CHECK-NEXT:    add r1, r4, r1
-; CHECK-NEXT:    and r4, lr, r2, lsr #1
-; CHECK-NEXT:    sub r2, r2, r4
-; CHECK-NEXT:    and r4, r2, r0
-; CHECK-NEXT:    add r1, r1, r1, lsr #4
-; CHECK-NEXT:    and r2, r0, r2, lsr #2
-; CHECK-NEXT:    and r6, r1, r12
-; CHECK-NEXT:    add r2, r4, r2
-; CHECK-NEXT:    and r4, lr, r3, lsr #1
-; CHECK-NEXT:    sub r3, r3, r4
-; CHECK-NEXT:    and r4, r3, r0
-; CHECK-NEXT:    add r2, r2, r2, lsr #4
-; CHECK-NEXT:    and r3, r0, r3, lsr #2
-; CHECK-NEXT:    and r2, r2, r12
-; CHECK-NEXT:    add r3, r4, r3
-; CHECK-NEXT:    add r3, r3, r3, lsr #4
-; CHECK-NEXT:    and r3, r3, r12
-; CHECK-NEXT:    mul r4, r3, r5
-; CHECK-NEXT:    vmov.32 r3, d16[1]
-; CHECK-NEXT:    and r1, lr, r3, lsr #1
-; CHECK-NEXT:    sub r1, r3, r1
-; CHECK-NEXT:    and r3, r1, r0
-; CHECK-NEXT:    and r0, r0, r1, lsr #2
-; CHECK-NEXT:    mul r1, r2, r5
-; CHECK-NEXT:    add r0, r3, r0
-; CHECK-NEXT:    mul r2, r6, r5
-; CHECK-NEXT:    add r0, r0, r0, lsr #4
-; CHECK-NEXT:    and r0, r0, r12
-; CHECK-NEXT:    mul r3, r0, r5
-; CHECK-NEXT:    lsr r0, r1, #24
-; CHECK-NEXT:    lsr r1, r4, #24
-; CHECK-NEXT:    add r0, r0, r2, lsr #24
-; CHECK-NEXT:    vmov s2, r0
-; CHECK-NEXT:    add r0, r1, r3, lsr #24
-; CHECK-NEXT:    vmov s0, r0
-; CHECK-NEXT:    vmov.f32 s1, s3
-; CHECK-NEXT:    vmov r2, r3, d1
-; CHECK-NEXT:    vmov r0, r1, d0
-; CHECK-NEXT:    pop {r4, r5, r6, lr}
+; CHECK-NEXT:    vcnt.8 q8, q8
+; CHECK-NEXT:    vpaddl.u8 q8, q8
+; CHECK-NEXT:    vpaddl.u16 q8, q8
+; CHECK-NEXT:    vpaddl.u32 q8, q8
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
 ; CHECK-NEXT:    mov pc, lr
-; CHECK-NEXT:    .p2align 2
-; CHECK-NEXT:  @ %bb.1:
-; CHECK-NEXT:  .LCPI7_0:
-; CHECK-NEXT:    .long 1431655765 @ 0x55555555
-; CHECK-NEXT:  .LCPI7_1:
-; CHECK-NEXT:    .long 252645135 @ 0xf0f0f0f
-; CHECK-NEXT:  .LCPI7_2:
-; CHECK-NEXT:    .long 858993459 @ 0x33333333
-; CHECK-NEXT:  .LCPI7_3:
-; CHECK-NEXT:    .long 16843009 @ 0x1010101
-; CHECK-NEXT:  .LCPI7_4:
-; CHECK-NEXT:    .long 0 @ float 0
 	%tmp1 = load <2 x i64>, <2 x i64>* %A
 	%tmp2 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %tmp1)
 	ret <2 x i64> %tmp2
-- 
GitLab


From 1f57e3857aaae24a7afec22084f6c2d421c29c3b Mon Sep 17 00:00:00 2001
From: Aleksandar Beserminji <abeserminji@wavecomp.com>
Date: Mon, 15 Oct 2018 14:36:48 +0000
Subject: [PATCH 0196/1116] [mips][micromips] Revert "Fix overlaping FDEs
 error"

This reverts r344511.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344515 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../Mips/MCTargetDesc/MipsELFStreamer.cpp     | 17 ---------
 .../Mips/MCTargetDesc/MipsELFStreamer.h       |  7 ----
 test/DebugInfo/Mips/eh_frame.ll               | 38 -------------------
 3 files changed, 62 deletions(-)
 delete mode 100644 test/DebugInfo/Mips/eh_frame.ll

diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
index 21b01e85096..7b9a02503ce 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
@@ -15,7 +15,6 @@
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCDwarf.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCObjectWriter.h"
 #include "llvm/MC/MCSymbolELF.h"
@@ -54,22 +53,6 @@ void MipsELFStreamer::EmitInstruction(const MCInst &Inst,
   createPendingLabelRelocs();
 }
 
-void MipsELFStreamer::EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame) {
-  Frame.Begin = getContext().createTempSymbol();
-  MCELFStreamer::EmitLabel(Frame.Begin);
-}
-
-MCSymbol *MipsELFStreamer::EmitCFILabel() {
-  MCSymbol *Label = getContext().createTempSymbol("cfi", true);
-  MCELFStreamer::EmitLabel(Label);
-  return Label;
-}
-
-void MipsELFStreamer::EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) {
-  Frame.End = getContext().createTempSymbol();
-  MCELFStreamer::EmitLabel(Frame.End);
-}
-
 void MipsELFStreamer::createPendingLabelRelocs() {
   MipsTargetELFStreamer *ELFTargetStreamer =
       static_cast<MipsTargetELFStreamer *>(getTargetStreamer());
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
index d140201494f..d141f5d77c6 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
@@ -25,7 +25,6 @@ namespace llvm {
 class MCAsmBackend;
 class MCCodeEmitter;
 class MCContext;
-class MCDwarfFrameInfo;
 class MCSubtargetInfo;
 
 class MipsELFStreamer : public MCELFStreamer {
@@ -61,12 +60,6 @@ public:
   void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override;
   void EmitIntValue(uint64_t Value, unsigned Size) override;
 
-  // Overriding these functions allows us to avoid recording of these labels
-  // in EmitLabel and later marking them as microMIPS.
-  void EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame) override;
-  void EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) override;
-  MCSymbol *EmitCFILabel() override;
-
   /// Emits all the option records stored up until the point it's called.
   void EmitMipsOptionRecords();
 
diff --git a/test/DebugInfo/Mips/eh_frame.ll b/test/DebugInfo/Mips/eh_frame.ll
deleted file mode 100644
index 4687443cb1c..00000000000
--- a/test/DebugInfo/Mips/eh_frame.ll
+++ /dev/null
@@ -1,38 +0,0 @@
-; RUN: llc -mtriple mips-unknown-linux-gnu -mattr=+micromips -O3 -filetype=obj -o - %s | llvm-readelf -r | FileCheck %s
-
-; CHECK: .rel.eh_frame
-; CHECK: DW.ref.__gxx_personality_v0
-; CHECK-NEXT: .text
-; CHECK-NEXT: .gcc_except_table
-
-@_ZTIi = external constant i8*
-
-define dso_local i32 @main() local_unnamed_addr personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
-entry:
-  %exception.i = tail call i8* @__cxa_allocate_exception(i32 4) nounwind
-  %0 = bitcast i8* %exception.i to i32*
-  store i32 5, i32* %0, align 16
-  invoke void @__cxa_throw(i8* %exception.i, i8* bitcast (i8** @_ZTIi to i8*), i8* null) noreturn
-          to label %.noexc unwind label %return
-
-.noexc:
-  unreachable
-
-return:
-  %1 = landingpad { i8*, i32 }
-          catch i8* null
-  %2 = extractvalue { i8*, i32 } %1, 0
-  %3 = tail call i8* @__cxa_begin_catch(i8* %2) nounwind
-  tail call void @__cxa_end_catch()
-  ret i32 0
-}
-
-declare i32 @__gxx_personality_v0(...)
-
-declare i8* @__cxa_begin_catch(i8*) local_unnamed_addr
-
-declare void @__cxa_end_catch() local_unnamed_addr
-
-declare i8* @__cxa_allocate_exception(i32) local_unnamed_addr
-
-declare void @__cxa_throw(i8*, i8*, i8*) local_unnamed_addr
-- 
GitLab


From c2c7e976de76a9e5eb5e56993c3b6289e314f8c9 Mon Sep 17 00:00:00 2001
From: Aleksandar Beserminji <abeserminji@wavecomp.com>
Date: Mon, 15 Oct 2018 14:39:12 +0000
Subject: [PATCH 0197/1116] [mips][micromips] Fix overlaping FDEs error

When compiling static executable for micromips, CFI symbols
are incorrectly labeled as MICROMIPS, which cause
".eh_frame_hdr refers to overlapping FDEs." error.

This patch does not label CFI symbols as MICROMIPS, and FDEs do not
overlap anymore. This patch also exposes another bug, which is fixed
here: https://reviews.llvm.org/D52985

Differential Revision: https://reviews.llvm.org/D52987


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344516 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../Mips/MCTargetDesc/MipsELFStreamer.cpp     | 17 +++++++++
 .../Mips/MCTargetDesc/MipsELFStreamer.h       |  7 ++++
 test/DebugInfo/Mips/eh_frame.ll               | 38 +++++++++++++++++++
 3 files changed, 62 insertions(+)
 create mode 100644 test/DebugInfo/Mips/eh_frame.ll

diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
index 7b9a02503ce..21b01e85096 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
@@ -15,6 +15,7 @@
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDwarf.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCObjectWriter.h"
 #include "llvm/MC/MCSymbolELF.h"
@@ -53,6 +54,22 @@ void MipsELFStreamer::EmitInstruction(const MCInst &Inst,
   createPendingLabelRelocs();
 }
 
+void MipsELFStreamer::EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame) {
+  Frame.Begin = getContext().createTempSymbol();
+  MCELFStreamer::EmitLabel(Frame.Begin);
+}
+
+MCSymbol *MipsELFStreamer::EmitCFILabel() {
+  MCSymbol *Label = getContext().createTempSymbol("cfi", true);
+  MCELFStreamer::EmitLabel(Label);
+  return Label;
+}
+
+void MipsELFStreamer::EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) {
+  Frame.End = getContext().createTempSymbol();
+  MCELFStreamer::EmitLabel(Frame.End);
+}
+
 void MipsELFStreamer::createPendingLabelRelocs() {
   MipsTargetELFStreamer *ELFTargetStreamer =
       static_cast<MipsTargetELFStreamer *>(getTargetStreamer());
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
index d141f5d77c6..56a0ff96c7b 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
@@ -26,6 +26,7 @@ class MCAsmBackend;
 class MCCodeEmitter;
 class MCContext;
 class MCSubtargetInfo;
+struct MCDwarfFrameInfo;
 
 class MipsELFStreamer : public MCELFStreamer {
   SmallVector<std::unique_ptr<MipsOptionRecord>, 8> MipsOptionRecords;
@@ -60,6 +61,12 @@ public:
   void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override;
   void EmitIntValue(uint64_t Value, unsigned Size) override;
 
+  // Overriding these functions allows us to avoid recording of these labels
+  // in EmitLabel and later marking them as microMIPS.
+  void EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame) override;
+  void EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) override;
+  MCSymbol *EmitCFILabel() override;
+
   /// Emits all the option records stored up until the point it's called.
   void EmitMipsOptionRecords();
 
diff --git a/test/DebugInfo/Mips/eh_frame.ll b/test/DebugInfo/Mips/eh_frame.ll
new file mode 100644
index 00000000000..4687443cb1c
--- /dev/null
+++ b/test/DebugInfo/Mips/eh_frame.ll
@@ -0,0 +1,38 @@
+; RUN: llc -mtriple mips-unknown-linux-gnu -mattr=+micromips -O3 -filetype=obj -o - %s | llvm-readelf -r | FileCheck %s
+
+; CHECK: .rel.eh_frame
+; CHECK: DW.ref.__gxx_personality_v0
+; CHECK-NEXT: .text
+; CHECK-NEXT: .gcc_except_table
+
+@_ZTIi = external constant i8*
+
+define dso_local i32 @main() local_unnamed_addr personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+entry:
+  %exception.i = tail call i8* @__cxa_allocate_exception(i32 4) nounwind
+  %0 = bitcast i8* %exception.i to i32*
+  store i32 5, i32* %0, align 16
+  invoke void @__cxa_throw(i8* %exception.i, i8* bitcast (i8** @_ZTIi to i8*), i8* null) noreturn
+          to label %.noexc unwind label %return
+
+.noexc:
+  unreachable
+
+return:
+  %1 = landingpad { i8*, i32 }
+          catch i8* null
+  %2 = extractvalue { i8*, i32 } %1, 0
+  %3 = tail call i8* @__cxa_begin_catch(i8* %2) nounwind
+  tail call void @__cxa_end_catch()
+  ret i32 0
+}
+
+declare i32 @__gxx_personality_v0(...)
+
+declare i8* @__cxa_begin_catch(i8*) local_unnamed_addr
+
+declare void @__cxa_end_catch() local_unnamed_addr
+
+declare i8* @__cxa_allocate_exception(i32) local_unnamed_addr
+
+declare void @__cxa_throw(i8*, i8*, i8*) local_unnamed_addr
-- 
GitLab


From bce0a9abfff5f789f591d301b0bb1344ed14921e Mon Sep 17 00:00:00 2001
From: Fedor Sergeev <fedor.sergeev@azul.com>
Date: Mon, 15 Oct 2018 15:00:18 +0000
Subject: [PATCH 0198/1116] [NewPM] teach -passes= to emit meaningful error
 messages

Summary:
All the PassBuilder::parse interfaces now return descriptive StringError
instead of a plain bool. It allows to make -passes/aa-pipeline parsing
errors context-specific and thus less confusing.

TODO: ideally we should also make suggestions for misspelled pass names,
but that requires some extensions to PassBuilder.

Reviewed By: philip.pfaffe, chandlerc
Differential Revision: https://reviews.llvm.org/D53246

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344519 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Passes/PassBuilder.h          |  51 +--
 lib/LTO/LTOBackend.cpp                     |  14 +-
 lib/Passes/PassBuilder.cpp                 | 366 ++++++++++++---------
 test/Other/pass-pipeline-parsing.ll        |  83 ++++-
 test/tools/llvm-lto2/X86/pipeline.ll       |   4 +-
 test/tools/llvm-opt-fuzzer/command-line.ll |   2 +-
 tools/llvm-opt-fuzzer/llvm-opt-fuzzer.cpp  |  11 +-
 tools/opt/NewPMDriver.cpp                  |  84 +++--
 unittests/IR/CMakeLists.txt                |   2 +
 unittests/IR/PassBuilderCallbacksTest.cpp  |  37 ++-
 unittests/Passes/CMakeLists.txt            |   1 +
 unittests/Passes/PluginsTest.cpp           |   5 +-
 12 files changed, 394 insertions(+), 266 deletions(-)

diff --git a/include/llvm/Passes/PassBuilder.h b/include/llvm/Passes/PassBuilder.h
index 91314430a96..22e5eb0caa0 100644
--- a/include/llvm/Passes/PassBuilder.h
+++ b/include/llvm/Passes/PassBuilder.h
@@ -19,6 +19,7 @@
 #include "llvm/ADT/Optional.h"
 #include "llvm/Analysis/CGSCCPassManager.h"
 #include "llvm/IR/PassManager.h"
+#include "llvm/Support/Error.h"
 #include "llvm/Transforms/Instrumentation.h"
 #include "llvm/Transforms/Scalar/LoopPassManager.h"
 #include <vector>
@@ -384,8 +385,9 @@ public:
   /// If the sequence of passes aren't all the exact same kind of pass, it will
   /// be an error. You cannot mix different levels implicitly, you must
   /// explicitly form a pass manager in which to nest passes.
-  bool parsePassPipeline(ModulePassManager &MPM, StringRef PipelineText,
-                         bool VerifyEachPass = true, bool DebugLogging = false);
+  Error parsePassPipeline(ModulePassManager &MPM, StringRef PipelineText,
+                          bool VerifyEachPass = true,
+                          bool DebugLogging = false);
 
   /// {{@ Parse a textual pass pipeline description into a specific PassManager
   ///
@@ -394,12 +396,15 @@ public:
   /// this is the valid pipeline text:
   ///
   ///   function(lpass)
-  bool parsePassPipeline(CGSCCPassManager &CGPM, StringRef PipelineText,
-                         bool VerifyEachPass = true, bool DebugLogging = false);
-  bool parsePassPipeline(FunctionPassManager &FPM, StringRef PipelineText,
-                         bool VerifyEachPass = true, bool DebugLogging = false);
-  bool parsePassPipeline(LoopPassManager &LPM, StringRef PipelineText,
-                         bool VerifyEachPass = true, bool DebugLogging = false);
+  Error parsePassPipeline(CGSCCPassManager &CGPM, StringRef PipelineText,
+                          bool VerifyEachPass = true,
+                          bool DebugLogging = false);
+  Error parsePassPipeline(FunctionPassManager &FPM, StringRef PipelineText,
+                          bool VerifyEachPass = true,
+                          bool DebugLogging = false);
+  Error parsePassPipeline(LoopPassManager &LPM, StringRef PipelineText,
+                          bool VerifyEachPass = true,
+                          bool DebugLogging = false);
   /// @}}
 
   /// Parse a textual alias analysis pipeline into the provided AA manager.
@@ -417,7 +422,7 @@ public:
   /// Returns false if the text cannot be parsed cleanly. The specific state of
   /// the \p AA manager is unspecified if such an error is encountered and this
   /// returns false.
-  bool parseAAPipeline(AAManager &AA, StringRef PipelineText);
+  Error parseAAPipeline(AAManager &AA, StringRef PipelineText);
 
   /// Register a callback for a default optimizer pipeline extension
   /// point
@@ -565,28 +570,28 @@ private:
   static Optional<std::vector<PipelineElement>>
   parsePipelineText(StringRef Text);
 
-  bool parseModulePass(ModulePassManager &MPM, const PipelineElement &E,
+  Error parseModulePass(ModulePassManager &MPM, const PipelineElement &E,
+                        bool VerifyEachPass, bool DebugLogging);
+  Error parseCGSCCPass(CGSCCPassManager &CGPM, const PipelineElement &E,
                        bool VerifyEachPass, bool DebugLogging);
-  bool parseCGSCCPass(CGSCCPassManager &CGPM, const PipelineElement &E,
+  Error parseFunctionPass(FunctionPassManager &FPM, const PipelineElement &E,
+                          bool VerifyEachPass, bool DebugLogging);
+  Error parseLoopPass(LoopPassManager &LPM, const PipelineElement &E,
                       bool VerifyEachPass, bool DebugLogging);
-  bool parseFunctionPass(FunctionPassManager &FPM, const PipelineElement &E,
-                     bool VerifyEachPass, bool DebugLogging);
-  bool parseLoopPass(LoopPassManager &LPM, const PipelineElement &E,
-                     bool VerifyEachPass, bool DebugLogging);
   bool parseAAPassName(AAManager &AA, StringRef Name);
 
-  bool parseLoopPassPipeline(LoopPassManager &LPM,
-                             ArrayRef<PipelineElement> Pipeline,
-                             bool VerifyEachPass, bool DebugLogging);
-  bool parseFunctionPassPipeline(FunctionPassManager &FPM,
-                                 ArrayRef<PipelineElement> Pipeline,
-                                 bool VerifyEachPass, bool DebugLogging);
-  bool parseCGSCCPassPipeline(CGSCCPassManager &CGPM,
+  Error parseLoopPassPipeline(LoopPassManager &LPM,
                               ArrayRef<PipelineElement> Pipeline,
                               bool VerifyEachPass, bool DebugLogging);
-  bool parseModulePassPipeline(ModulePassManager &MPM,
+  Error parseFunctionPassPipeline(FunctionPassManager &FPM,
+                                  ArrayRef<PipelineElement> Pipeline,
+                                  bool VerifyEachPass, bool DebugLogging);
+  Error parseCGSCCPassPipeline(CGSCCPassManager &CGPM,
                                ArrayRef<PipelineElement> Pipeline,
                                bool VerifyEachPass, bool DebugLogging);
+  Error parseModulePassPipeline(ModulePassManager &MPM,
+                                ArrayRef<PipelineElement> Pipeline,
+                                bool VerifyEachPass, bool DebugLogging);
 
   void addPGOInstrPasses(ModulePassManager &MPM, bool DebugLogging,
                          OptimizationLevel Level, bool RunProfileGen,
diff --git a/lib/LTO/LTOBackend.cpp b/lib/LTO/LTOBackend.cpp
index 20fc40de4b9..1f9d60a5bdf 100644
--- a/lib/LTO/LTOBackend.cpp
+++ b/lib/LTO/LTOBackend.cpp
@@ -162,7 +162,7 @@ static void runNewPMPasses(Config &Conf, Module &Mod, TargetMachine *TM,
   AAManager AA;
 
   // Parse a custom AA pipeline if asked to.
-  if (!PB.parseAAPipeline(AA, "default"))
+  if (auto Err = PB.parseAAPipeline(AA, "default"))
     report_fatal_error("Error parsing default AA pipeline");
 
   LoopAnalysisManager LAM(Conf.DebugPassManager);
@@ -221,9 +221,9 @@ static void runNewPMCustomPasses(Module &Mod, TargetMachine *TM,
 
   // Parse a custom AA pipeline if asked to.
   if (!AAPipelineDesc.empty())
-    if (!PB.parseAAPipeline(AA, AAPipelineDesc))
-      report_fatal_error("unable to parse AA pipeline description: " +
-                         AAPipelineDesc);
+    if (auto Err = PB.parseAAPipeline(AA, AAPipelineDesc))
+      report_fatal_error("unable to parse AA pipeline description '" +
+                         AAPipelineDesc + "': " + toString(std::move(Err)));
 
   LoopAnalysisManager LAM;
   FunctionAnalysisManager FAM;
@@ -246,9 +246,9 @@ static void runNewPMCustomPasses(Module &Mod, TargetMachine *TM,
   MPM.addPass(VerifierPass());
 
   // Now, add all the passes we've been requested to.
-  if (!PB.parsePassPipeline(MPM, PipelineDesc))
-    report_fatal_error("unable to parse pass pipeline description: " +
-                       PipelineDesc);
+  if (auto Err = PB.parsePassPipeline(MPM, PipelineDesc))
+    report_fatal_error("unable to parse pass pipeline description '" +
+                       PipelineDesc + "': " + toString(std::move(Err)));
 
   if (!DisableVerify)
     MPM.addPass(VerifierPass());
diff --git a/lib/Passes/PassBuilder.cpp b/lib/Passes/PassBuilder.cpp
index 09758dc5651..f6313d23e2d 100644
--- a/lib/Passes/PassBuilder.cpp
+++ b/lib/Passes/PassBuilder.cpp
@@ -58,6 +58,7 @@
 #include "llvm/IR/PassManager.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/Regex.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h"
@@ -1402,9 +1403,9 @@ PassBuilder::parsePipelineText(StringRef Text) {
   return {std::move(ResultPipeline)};
 }
 
-bool PassBuilder::parseModulePass(ModulePassManager &MPM,
-                                  const PipelineElement &E, bool VerifyEachPass,
-                                  bool DebugLogging) {
+Error PassBuilder::parseModulePass(ModulePassManager &MPM,
+                                   const PipelineElement &E,
+                                   bool VerifyEachPass, bool DebugLogging) {
   auto &Name = E.Name;
   auto &InnerPipeline = E.InnerPipeline;
 
@@ -1412,50 +1413,56 @@ bool PassBuilder::parseModulePass(ModulePassManager &MPM,
   if (!InnerPipeline.empty()) {
     if (Name == "module") {
       ModulePassManager NestedMPM(DebugLogging);
-      if (!parseModulePassPipeline(NestedMPM, InnerPipeline, VerifyEachPass,
-                                   DebugLogging))
-        return false;
+      if (auto Err = parseModulePassPipeline(NestedMPM, InnerPipeline,
+                                             VerifyEachPass, DebugLogging))
+        return Err;
       MPM.addPass(std::move(NestedMPM));
-      return true;
+      return Error::success();
     }
     if (Name == "cgscc") {
       CGSCCPassManager CGPM(DebugLogging);
-      if (!parseCGSCCPassPipeline(CGPM, InnerPipeline, VerifyEachPass,
-                                  DebugLogging))
-        return false;
+      if (auto Err = parseCGSCCPassPipeline(CGPM, InnerPipeline, VerifyEachPass,
+                                            DebugLogging))
+        return Err;
       MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM)));
-      return true;
+      return Error::success();
     }
     if (Name == "function") {
       FunctionPassManager FPM(DebugLogging);
-      if (!parseFunctionPassPipeline(FPM, InnerPipeline, VerifyEachPass,
-                                     DebugLogging))
-        return false;
+      if (auto Err = parseFunctionPassPipeline(FPM, InnerPipeline,
+                                               VerifyEachPass, DebugLogging))
+        return Err;
       MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
-      return true;
+      return Error::success();
     }
     if (auto Count = parseRepeatPassName(Name)) {
       ModulePassManager NestedMPM(DebugLogging);
-      if (!parseModulePassPipeline(NestedMPM, InnerPipeline, VerifyEachPass,
-                                   DebugLogging))
-        return false;
+      if (auto Err = parseModulePassPipeline(NestedMPM, InnerPipeline,
+                                             VerifyEachPass, DebugLogging))
+        return Err;
       MPM.addPass(createRepeatedPass(*Count, std::move(NestedMPM)));
-      return true;
+      return Error::success();
     }
 
     for (auto &C : ModulePipelineParsingCallbacks)
       if (C(Name, MPM, InnerPipeline))
-        return true;
+        return Error::success();
 
     // Normal passes can't have pipelines.
-    return false;
+    return make_error<StringError>(
+        formatv("invalid use of '{0}' pass as module pipeline", Name).str(),
+        inconvertibleErrorCode());
+    ;
   }
 
   // Manually handle aliases for pre-configured pipeline fragments.
   if (startsWithDefaultPipelineAliasPrefix(Name)) {
     SmallVector<StringRef, 3> Matches;
     if (!DefaultAliasRegex.match(Name, &Matches))
-      return false;
+      return make_error<StringError>(
+          formatv("unknown default pipeline alias '{0}'", Name).str(),
+          inconvertibleErrorCode());
+
     assert(Matches.size() == 3 && "Must capture two matched strings!");
 
     OptimizationLevel L = StringSwitch<OptimizationLevel>(Matches[2])
@@ -1467,7 +1474,7 @@ bool PassBuilder::parseModulePass(ModulePassManager &MPM,
                               .Case("Oz", Oz);
     if (L == O0)
       // At O0 we do nothing at all!
-      return true;
+      return Error::success();
 
     if (Matches[1] == "default") {
       MPM.addPass(buildPerModuleDefaultPipeline(L, DebugLogging));
@@ -1481,38 +1488,40 @@ bool PassBuilder::parseModulePass(ModulePassManager &MPM,
       assert(Matches[1] == "lto" && "Not one of the matched options!");
       MPM.addPass(buildLTODefaultPipeline(L, DebugLogging, nullptr));
     }
-    return true;
+    return Error::success();
   }
 
   // Finally expand the basic registered passes from the .inc file.
 #define MODULE_PASS(NAME, CREATE_PASS)                                         \
   if (Name == NAME) {                                                          \
     MPM.addPass(CREATE_PASS);                                                  \
-    return true;                                                               \
+    return Error::success();                                                   \
   }
 #define MODULE_ANALYSIS(NAME, CREATE_PASS)                                     \
   if (Name == "require<" NAME ">") {                                           \
     MPM.addPass(                                                               \
         RequireAnalysisPass<                                                   \
             std::remove_reference<decltype(CREATE_PASS)>::type, Module>());    \
-    return true;                                                               \
+    return Error::success();                                                   \
   }                                                                            \
   if (Name == "invalidate<" NAME ">") {                                        \
     MPM.addPass(InvalidateAnalysisPass<                                        \
                 std::remove_reference<decltype(CREATE_PASS)>::type>());        \
-    return true;                                                               \
+    return Error::success();                                                   \
   }
 #include "PassRegistry.def"
 
   for (auto &C : ModulePipelineParsingCallbacks)
     if (C(Name, MPM, InnerPipeline))
-      return true;
-  return false;
+      return Error::success();
+  return make_error<StringError>(
+      formatv("unknown module pass '{0}'", Name).str(),
+      inconvertibleErrorCode());
 }
 
-bool PassBuilder::parseCGSCCPass(CGSCCPassManager &CGPM,
-                                 const PipelineElement &E, bool VerifyEachPass,
-                                 bool DebugLogging) {
+Error PassBuilder::parseCGSCCPass(CGSCCPassManager &CGPM,
+                                  const PipelineElement &E, bool VerifyEachPass,
+                                  bool DebugLogging) {
   auto &Name = E.Name;
   auto &InnerPipeline = E.InnerPipeline;
 
@@ -1520,53 +1529,55 @@ bool PassBuilder::parseCGSCCPass(CGSCCPassManager &CGPM,
   if (!InnerPipeline.empty()) {
     if (Name == "cgscc") {
       CGSCCPassManager NestedCGPM(DebugLogging);
-      if (!parseCGSCCPassPipeline(NestedCGPM, InnerPipeline, VerifyEachPass,
-                                  DebugLogging))
-        return false;
+      if (auto Err = parseCGSCCPassPipeline(NestedCGPM, InnerPipeline,
+                                            VerifyEachPass, DebugLogging))
+        return Err;
       // Add the nested pass manager with the appropriate adaptor.
       CGPM.addPass(std::move(NestedCGPM));
-      return true;
+      return Error::success();
     }
     if (Name == "function") {
       FunctionPassManager FPM(DebugLogging);
-      if (!parseFunctionPassPipeline(FPM, InnerPipeline, VerifyEachPass,
-                                     DebugLogging))
-        return false;
+      if (auto Err = parseFunctionPassPipeline(FPM, InnerPipeline,
+                                               VerifyEachPass, DebugLogging))
+        return Err;
       // Add the nested pass manager with the appropriate adaptor.
       CGPM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
-      return true;
+      return Error::success();
     }
     if (auto Count = parseRepeatPassName(Name)) {
       CGSCCPassManager NestedCGPM(DebugLogging);
-      if (!parseCGSCCPassPipeline(NestedCGPM, InnerPipeline, VerifyEachPass,
-                                  DebugLogging))
-        return false;
+      if (auto Err = parseCGSCCPassPipeline(NestedCGPM, InnerPipeline,
+                                            VerifyEachPass, DebugLogging))
+        return Err;
       CGPM.addPass(createRepeatedPass(*Count, std::move(NestedCGPM)));
-      return true;
+      return Error::success();
     }
     if (auto MaxRepetitions = parseDevirtPassName(Name)) {
       CGSCCPassManager NestedCGPM(DebugLogging);
-      if (!parseCGSCCPassPipeline(NestedCGPM, InnerPipeline, VerifyEachPass,
-                                  DebugLogging))
-        return false;
+      if (auto Err = parseCGSCCPassPipeline(NestedCGPM, InnerPipeline,
+                                            VerifyEachPass, DebugLogging))
+        return Err;
       CGPM.addPass(
           createDevirtSCCRepeatedPass(std::move(NestedCGPM), *MaxRepetitions));
-      return true;
+      return Error::success();
     }
 
     for (auto &C : CGSCCPipelineParsingCallbacks)
       if (C(Name, CGPM, InnerPipeline))
-        return true;
+        return Error::success();
 
     // Normal passes can't have pipelines.
-    return false;
+    return make_error<StringError>(
+        formatv("invalid use of '{0}' pass as cgscc pipeline", Name).str(),
+        inconvertibleErrorCode());
   }
 
 // Now expand the basic registered passes from the .inc file.
 #define CGSCC_PASS(NAME, CREATE_PASS)                                          \
   if (Name == NAME) {                                                          \
     CGPM.addPass(CREATE_PASS);                                                 \
-    return true;                                                               \
+    return Error::success();                                                   \
   }
 #define CGSCC_ANALYSIS(NAME, CREATE_PASS)                                      \
   if (Name == "require<" NAME ">") {                                           \
@@ -1574,24 +1585,26 @@ bool PassBuilder::parseCGSCCPass(CGSCCPassManager &CGPM,
                  std::remove_reference<decltype(CREATE_PASS)>::type,           \
                  LazyCallGraph::SCC, CGSCCAnalysisManager, LazyCallGraph &,    \
                  CGSCCUpdateResult &>());                                      \
-    return true;                                                               \
+    return Error::success();                                                   \
   }                                                                            \
   if (Name == "invalidate<" NAME ">") {                                        \
     CGPM.addPass(InvalidateAnalysisPass<                                       \
                  std::remove_reference<decltype(CREATE_PASS)>::type>());       \
-    return true;                                                               \
+    return Error::success();                                                   \
   }
 #include "PassRegistry.def"
 
   for (auto &C : CGSCCPipelineParsingCallbacks)
     if (C(Name, CGPM, InnerPipeline))
-      return true;
-  return false;
+      return Error::success();
+  return make_error<StringError>(
+      formatv("unknown cgscc pass '{0}'", Name).str(),
+      inconvertibleErrorCode());
 }
 
-bool PassBuilder::parseFunctionPass(FunctionPassManager &FPM,
-                                    const PipelineElement &E,
-                                    bool VerifyEachPass, bool DebugLogging) {
+Error PassBuilder::parseFunctionPass(FunctionPassManager &FPM,
+                                     const PipelineElement &E,
+                                     bool VerifyEachPass, bool DebugLogging) {
   auto &Name = E.Name;
   auto &InnerPipeline = E.InnerPipeline;
 
@@ -1599,68 +1612,72 @@ bool PassBuilder::parseFunctionPass(FunctionPassManager &FPM,
   if (!InnerPipeline.empty()) {
     if (Name == "function") {
       FunctionPassManager NestedFPM(DebugLogging);
-      if (!parseFunctionPassPipeline(NestedFPM, InnerPipeline, VerifyEachPass,
-                                     DebugLogging))
-        return false;
+      if (auto Err = parseFunctionPassPipeline(NestedFPM, InnerPipeline,
+                                               VerifyEachPass, DebugLogging))
+        return Err;
       // Add the nested pass manager with the appropriate adaptor.
       FPM.addPass(std::move(NestedFPM));
-      return true;
+      return Error::success();
     }
     if (Name == "loop") {
       LoopPassManager LPM(DebugLogging);
-      if (!parseLoopPassPipeline(LPM, InnerPipeline, VerifyEachPass,
-                                 DebugLogging))
-        return false;
+      if (auto Err = parseLoopPassPipeline(LPM, InnerPipeline, VerifyEachPass,
+                                           DebugLogging))
+        return Err;
       // Add the nested pass manager with the appropriate adaptor.
       FPM.addPass(
           createFunctionToLoopPassAdaptor(std::move(LPM), DebugLogging));
-      return true;
+      return Error::success();
     }
     if (auto Count = parseRepeatPassName(Name)) {
       FunctionPassManager NestedFPM(DebugLogging);
-      if (!parseFunctionPassPipeline(NestedFPM, InnerPipeline, VerifyEachPass,
-                                     DebugLogging))
-        return false;
+      if (auto Err = parseFunctionPassPipeline(NestedFPM, InnerPipeline,
+                                               VerifyEachPass, DebugLogging))
+        return Err;
       FPM.addPass(createRepeatedPass(*Count, std::move(NestedFPM)));
-      return true;
+      return Error::success();
     }
 
     for (auto &C : FunctionPipelineParsingCallbacks)
       if (C(Name, FPM, InnerPipeline))
-        return true;
+        return Error::success();
 
     // Normal passes can't have pipelines.
-    return false;
+    return make_error<StringError>(
+        formatv("invalid use of '{0}' pass as function pipeline", Name).str(),
+        inconvertibleErrorCode());
   }
 
 // Now expand the basic registered passes from the .inc file.
 #define FUNCTION_PASS(NAME, CREATE_PASS)                                       \
   if (Name == NAME) {                                                          \
     FPM.addPass(CREATE_PASS);                                                  \
-    return true;                                                               \
+    return Error::success();                                                   \
   }
 #define FUNCTION_ANALYSIS(NAME, CREATE_PASS)                                   \
   if (Name == "require<" NAME ">") {                                           \
     FPM.addPass(                                                               \
         RequireAnalysisPass<                                                   \
             std::remove_reference<decltype(CREATE_PASS)>::type, Function>());  \
-    return true;                                                               \
+    return Error::success();                                                   \
   }                                                                            \
   if (Name == "invalidate<" NAME ">") {                                        \
     FPM.addPass(InvalidateAnalysisPass<                                        \
                 std::remove_reference<decltype(CREATE_PASS)>::type>());        \
-    return true;                                                               \
+    return Error::success();                                                   \
   }
 #include "PassRegistry.def"
 
   for (auto &C : FunctionPipelineParsingCallbacks)
     if (C(Name, FPM, InnerPipeline))
-      return true;
-  return false;
+      return Error::success();
+  return make_error<StringError>(
+      formatv("unknown function pass '{0}'", Name).str(),
+      inconvertibleErrorCode());
 }
 
-bool PassBuilder::parseLoopPass(LoopPassManager &LPM, const PipelineElement &E,
-                                bool VerifyEachPass, bool DebugLogging) {
+Error PassBuilder::parseLoopPass(LoopPassManager &LPM, const PipelineElement &E,
+                                 bool VerifyEachPass, bool DebugLogging) {
   StringRef Name = E.Name;
   auto &InnerPipeline = E.InnerPipeline;
 
@@ -1668,35 +1685,37 @@ bool PassBuilder::parseLoopPass(LoopPassManager &LPM, const PipelineElement &E,
   if (!InnerPipeline.empty()) {
     if (Name == "loop") {
       LoopPassManager NestedLPM(DebugLogging);
-      if (!parseLoopPassPipeline(NestedLPM, InnerPipeline, VerifyEachPass,
-                                 DebugLogging))
-        return false;
+      if (auto Err = parseLoopPassPipeline(NestedLPM, InnerPipeline,
+                                           VerifyEachPass, DebugLogging))
+        return Err;
       // Add the nested pass manager with the appropriate adaptor.
       LPM.addPass(std::move(NestedLPM));
-      return true;
+      return Error::success();
     }
     if (auto Count = parseRepeatPassName(Name)) {
       LoopPassManager NestedLPM(DebugLogging);
-      if (!parseLoopPassPipeline(NestedLPM, InnerPipeline, VerifyEachPass,
-                                 DebugLogging))
-        return false;
+      if (auto Err = parseLoopPassPipeline(NestedLPM, InnerPipeline,
+                                           VerifyEachPass, DebugLogging))
+        return Err;
       LPM.addPass(createRepeatedPass(*Count, std::move(NestedLPM)));
-      return true;
+      return Error::success();
     }
 
     for (auto &C : LoopPipelineParsingCallbacks)
       if (C(Name, LPM, InnerPipeline))
-        return true;
+        return Error::success();
 
     // Normal passes can't have pipelines.
-    return false;
+    return make_error<StringError>(
+        formatv("invalid use of '{0}' pass as loop pipeline", Name).str(),
+        inconvertibleErrorCode());
   }
 
 // Now expand the basic registered passes from the .inc file.
 #define LOOP_PASS(NAME, CREATE_PASS)                                           \
   if (Name == NAME) {                                                          \
     LPM.addPass(CREATE_PASS);                                                  \
-    return true;                                                               \
+    return Error::success();                                                   \
   }
 #define LOOP_ANALYSIS(NAME, CREATE_PASS)                                       \
   if (Name == "require<" NAME ">") {                                           \
@@ -1704,19 +1723,20 @@ bool PassBuilder::parseLoopPass(LoopPassManager &LPM, const PipelineElement &E,
                 std::remove_reference<decltype(CREATE_PASS)>::type, Loop,      \
                 LoopAnalysisManager, LoopStandardAnalysisResults &,            \
                 LPMUpdater &>());                                              \
-    return true;                                                               \
+    return Error::success();                                                   \
   }                                                                            \
   if (Name == "invalidate<" NAME ">") {                                        \
     LPM.addPass(InvalidateAnalysisPass<                                        \
                 std::remove_reference<decltype(CREATE_PASS)>::type>());        \
-    return true;                                                               \
+    return Error::success();                                                   \
   }
 #include "PassRegistry.def"
 
   for (auto &C : LoopPipelineParsingCallbacks)
     if (C(Name, LPM, InnerPipeline))
-      return true;
-  return false;
+      return Error::success();
+  return make_error<StringError>(formatv("unknown loop pass '{0}'", Name).str(),
+                                 inconvertibleErrorCode());
 }
 
 bool PassBuilder::parseAAPassName(AAManager &AA, StringRef Name) {
@@ -1740,41 +1760,42 @@ bool PassBuilder::parseAAPassName(AAManager &AA, StringRef Name) {
   return false;
 }
 
-bool PassBuilder::parseLoopPassPipeline(LoopPassManager &LPM,
-                                        ArrayRef<PipelineElement> Pipeline,
-                                        bool VerifyEachPass,
-                                        bool DebugLogging) {
+Error PassBuilder::parseLoopPassPipeline(LoopPassManager &LPM,
+                                         ArrayRef<PipelineElement> Pipeline,
+                                         bool VerifyEachPass,
+                                         bool DebugLogging) {
   for (const auto &Element : Pipeline) {
-    if (!parseLoopPass(LPM, Element, VerifyEachPass, DebugLogging))
-      return false;
+    if (auto Err = parseLoopPass(LPM, Element, VerifyEachPass, DebugLogging))
+      return Err;
     // FIXME: No verifier support for Loop passes!
   }
-  return true;
+  return Error::success();
 }
 
-bool PassBuilder::parseFunctionPassPipeline(FunctionPassManager &FPM,
-                                            ArrayRef<PipelineElement> Pipeline,
-                                            bool VerifyEachPass,
-                                            bool DebugLogging) {
+Error PassBuilder::parseFunctionPassPipeline(FunctionPassManager &FPM,
+                                             ArrayRef<PipelineElement> Pipeline,
+                                             bool VerifyEachPass,
+                                             bool DebugLogging) {
   for (const auto &Element : Pipeline) {
-    if (!parseFunctionPass(FPM, Element, VerifyEachPass, DebugLogging))
-      return false;
+    if (auto Err =
+            parseFunctionPass(FPM, Element, VerifyEachPass, DebugLogging))
+      return Err;
     if (VerifyEachPass)
       FPM.addPass(VerifierPass());
   }
-  return true;
+  return Error::success();
 }
 
-bool PassBuilder::parseCGSCCPassPipeline(CGSCCPassManager &CGPM,
-                                         ArrayRef<PipelineElement> Pipeline,
-                                         bool VerifyEachPass,
-                                         bool DebugLogging) {
+Error PassBuilder::parseCGSCCPassPipeline(CGSCCPassManager &CGPM,
+                                          ArrayRef<PipelineElement> Pipeline,
+                                          bool VerifyEachPass,
+                                          bool DebugLogging) {
   for (const auto &Element : Pipeline) {
-    if (!parseCGSCCPass(CGPM, Element, VerifyEachPass, DebugLogging))
-      return false;
+    if (auto Err = parseCGSCCPass(CGPM, Element, VerifyEachPass, DebugLogging))
+      return Err;
     // FIXME: No verifier support for CGSCC passes!
   }
-  return true;
+  return Error::success();
 }
 
 void PassBuilder::crossRegisterProxies(LoopAnalysisManager &LAM,
@@ -1790,28 +1811,30 @@ void PassBuilder::crossRegisterProxies(LoopAnalysisManager &LAM,
   LAM.registerPass([&] { return FunctionAnalysisManagerLoopProxy(FAM); });
 }
 
-bool PassBuilder::parseModulePassPipeline(ModulePassManager &MPM,
-                                          ArrayRef<PipelineElement> Pipeline,
-                                          bool VerifyEachPass,
-                                          bool DebugLogging) {
+Error PassBuilder::parseModulePassPipeline(ModulePassManager &MPM,
+                                           ArrayRef<PipelineElement> Pipeline,
+                                           bool VerifyEachPass,
+                                           bool DebugLogging) {
   for (const auto &Element : Pipeline) {
-    if (!parseModulePass(MPM, Element, VerifyEachPass, DebugLogging))
-      return false;
+    if (auto Err = parseModulePass(MPM, Element, VerifyEachPass, DebugLogging))
+      return Err;
     if (VerifyEachPass)
       MPM.addPass(VerifierPass());
   }
-  return true;
+  return Error::success();
 }
 
 // Primary pass pipeline description parsing routine for a \c ModulePassManager
 // FIXME: Should this routine accept a TargetMachine or require the caller to
 // pre-populate the analysis managers with target-specific stuff?
-bool PassBuilder::parsePassPipeline(ModulePassManager &MPM,
-                                    StringRef PipelineText, bool VerifyEachPass,
-                                    bool DebugLogging) {
+Error PassBuilder::parsePassPipeline(ModulePassManager &MPM,
+                                     StringRef PipelineText,
+                                     bool VerifyEachPass, bool DebugLogging) {
   auto Pipeline = parsePipelineText(PipelineText);
   if (!Pipeline || Pipeline->empty())
-    return false;
+    return make_error<StringError>(
+        formatv("invalid pipeline '{0}'", PipelineText).str(),
+        inconvertibleErrorCode());
 
   // If the first name isn't at the module layer, wrap the pipeline up
   // automatically.
@@ -1828,73 +1851,106 @@ bool PassBuilder::parsePassPipeline(ModulePassManager &MPM,
     } else {
       for (auto &C : TopLevelPipelineParsingCallbacks)
         if (C(MPM, *Pipeline, VerifyEachPass, DebugLogging))
-          return true;
-
-      // Unknown pass name!
-      return false;
+          return Error::success();
+
+      // Unknown pass or pipeline name!
+      auto &InnerPipeline = Pipeline->front().InnerPipeline;
+      return make_error<StringError>(
+          formatv("unknown {0} name '{1}'",
+                  (InnerPipeline.empty() ? "pass" : "pipeline"), FirstName)
+              .str(),
+          inconvertibleErrorCode());
     }
   }
 
-  return parseModulePassPipeline(MPM, *Pipeline, VerifyEachPass, DebugLogging);
+  if (auto Err =
+          parseModulePassPipeline(MPM, *Pipeline, VerifyEachPass, DebugLogging))
+    return Err;
+  return Error::success();
 }
 
 // Primary pass pipeline description parsing routine for a \c CGSCCPassManager
-bool PassBuilder::parsePassPipeline(CGSCCPassManager &CGPM,
-                                    StringRef PipelineText, bool VerifyEachPass,
-                                    bool DebugLogging) {
+Error PassBuilder::parsePassPipeline(CGSCCPassManager &CGPM,
+                                     StringRef PipelineText,
+                                     bool VerifyEachPass, bool DebugLogging) {
   auto Pipeline = parsePipelineText(PipelineText);
   if (!Pipeline || Pipeline->empty())
-    return false;
+    return make_error<StringError>(
+        formatv("invalid pipeline '{0}'", PipelineText).str(),
+        inconvertibleErrorCode());
 
   StringRef FirstName = Pipeline->front().Name;
   if (!isCGSCCPassName(FirstName, CGSCCPipelineParsingCallbacks))
-    return false;
-
-  return parseCGSCCPassPipeline(CGPM, *Pipeline, VerifyEachPass, DebugLogging);
+    return make_error<StringError>(
+        formatv("unknown cgscc pass '{0}' in pipeline '{1}'", FirstName,
+                PipelineText)
+            .str(),
+        inconvertibleErrorCode());
+
+  if (auto Err =
+          parseCGSCCPassPipeline(CGPM, *Pipeline, VerifyEachPass, DebugLogging))
+    return Err;
+  return Error::success();
 }
 
 // Primary pass pipeline description parsing routine for a \c
 // FunctionPassManager
-bool PassBuilder::parsePassPipeline(FunctionPassManager &FPM,
-                                    StringRef PipelineText, bool VerifyEachPass,
-                                    bool DebugLogging) {
+Error PassBuilder::parsePassPipeline(FunctionPassManager &FPM,
+                                     StringRef PipelineText,
+                                     bool VerifyEachPass, bool DebugLogging) {
   auto Pipeline = parsePipelineText(PipelineText);
   if (!Pipeline || Pipeline->empty())
-    return false;
+    return make_error<StringError>(
+        formatv("invalid pipeline '{0}'", PipelineText).str(),
+        inconvertibleErrorCode());
 
   StringRef FirstName = Pipeline->front().Name;
   if (!isFunctionPassName(FirstName, FunctionPipelineParsingCallbacks))
-    return false;
-
-  return parseFunctionPassPipeline(FPM, *Pipeline, VerifyEachPass,
-                                   DebugLogging);
+    return make_error<StringError>(
+        formatv("unknown function pass '{0}' in pipeline '{1}'", FirstName,
+                PipelineText)
+            .str(),
+        inconvertibleErrorCode());
+
+  if (auto Err = parseFunctionPassPipeline(FPM, *Pipeline, VerifyEachPass,
+                                           DebugLogging))
+    return Err;
+  return Error::success();
 }
 
 // Primary pass pipeline description parsing routine for a \c LoopPassManager
-bool PassBuilder::parsePassPipeline(LoopPassManager &CGPM,
-                                    StringRef PipelineText, bool VerifyEachPass,
-                                    bool DebugLogging) {
+Error PassBuilder::parsePassPipeline(LoopPassManager &CGPM,
+                                     StringRef PipelineText,
+                                     bool VerifyEachPass, bool DebugLogging) {
   auto Pipeline = parsePipelineText(PipelineText);
   if (!Pipeline || Pipeline->empty())
-    return false;
+    return make_error<StringError>(
+        formatv("invalid pipeline '{0}'", PipelineText).str(),
+        inconvertibleErrorCode());
 
-  return parseLoopPassPipeline(CGPM, *Pipeline, VerifyEachPass, DebugLogging);
+  if (auto Err =
+          parseLoopPassPipeline(CGPM, *Pipeline, VerifyEachPass, DebugLogging))
+    return Err;
+
+  return Error::success();
 }
 
-bool PassBuilder::parseAAPipeline(AAManager &AA, StringRef PipelineText) {
+Error PassBuilder::parseAAPipeline(AAManager &AA, StringRef PipelineText) {
   // If the pipeline just consists of the word 'default' just replace the AA
   // manager with our default one.
   if (PipelineText == "default") {
     AA = buildDefaultAAPipeline();
-    return true;
+    return Error::success();
   }
 
   while (!PipelineText.empty()) {
     StringRef Name;
     std::tie(Name, PipelineText) = PipelineText.split(',');
     if (!parseAAPassName(AA, Name))
-      return false;
+      return make_error<StringError>(
+          formatv("unknown alias analysis name '{0}'", Name).str(),
+          inconvertibleErrorCode());
   }
 
-  return true;
+  return Error::success();
 }
diff --git a/test/Other/pass-pipeline-parsing.ll b/test/Other/pass-pipeline-parsing.ll
index b303318c796..d13a977dbce 100644
--- a/test/Other/pass-pipeline-parsing.ll
+++ b/test/Other/pass-pipeline-parsing.ll
@@ -54,52 +54,52 @@
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='no-op-module)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-UNBALANCED1
-; CHECK-UNBALANCED1: unable to parse pass pipeline description
+; CHECK-UNBALANCED1: invalid pipeline 'no-op-module)'
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='module(no-op-module))' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-UNBALANCED2
-; CHECK-UNBALANCED2: unable to parse pass pipeline description
+; CHECK-UNBALANCED2: invalid pipeline 'module(no-op-module))'
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='module(no-op-module' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-UNBALANCED3
-; CHECK-UNBALANCED3: unable to parse pass pipeline description
+; CHECK-UNBALANCED3: invalid pipeline 'module(no-op-module'
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='no-op-function)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-UNBALANCED4
-; CHECK-UNBALANCED4: unable to parse pass pipeline description
+; CHECK-UNBALANCED4: invalid pipeline 'no-op-function)'
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='function(no-op-function))' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-UNBALANCED5
-; CHECK-UNBALANCED5: unable to parse pass pipeline description
+; CHECK-UNBALANCED5: invalid pipeline 'function(no-op-function))'
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='function(function(no-op-function)))' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-UNBALANCED6
-; CHECK-UNBALANCED6: unable to parse pass pipeline description
+; CHECK-UNBALANCED6: invalid pipeline 'function(function(no-op-function)))'
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='function(no-op-function' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-UNBALANCED7
-; CHECK-UNBALANCED7: unable to parse pass pipeline description
+; CHECK-UNBALANCED7: invalid pipeline 'function(no-op-function'
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='function(function(no-op-function)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-UNBALANCED8
-; CHECK-UNBALANCED8: unable to parse pass pipeline description
+; CHECK-UNBALANCED8: invalid pipeline 'function(function(no-op-function)'
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='no-op-module,)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-UNBALANCED9
-; CHECK-UNBALANCED9: unable to parse pass pipeline description
+; CHECK-UNBALANCED9: invalid pipeline 'no-op-module,)'
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='no-op-function,)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-UNBALANCED10
-; CHECK-UNBALANCED10: unable to parse pass pipeline description
+; CHECK-UNBALANCED10: invalid pipeline 'no-op-function,)'
 
 ; RUN: opt -disable-output -debug-pass-manager \
 ; RUN:     -passes=no-op-cgscc,no-op-cgscc %s 2>&1 \
@@ -176,37 +176,86 @@
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='function(no-op-function)function(no-op-function)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-MISSING-COMMA1
-; CHECK-MISSING-COMMA1: unable to parse pass pipeline description
+; CHECK-MISSING-COMMA1: invalid pipeline 'function(no-op-function)function(no-op-function)'
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='function()' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-EMPTY-INNER-PIPELINE
-; CHECK-EMPTY-INNER-PIPELINE: unable to parse pass pipeline description
+; CHECK-EMPTY-INNER-PIPELINE: unknown function pass ''
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='no-op-module(no-op-module,whatever)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-PIPELINE-ON-MODULE-PASS
-; CHECK-PIPELINE-ON-MODULE-PASS: unable to parse pass pipeline description
+; CHECK-PIPELINE-ON-MODULE-PASS: invalid use of 'no-op-module' pass as module pipeline
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='no-op-cgscc(no-op-cgscc,whatever)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-PIPELINE-ON-CGSCC-PASS
-; CHECK-PIPELINE-ON-CGSCC-PASS: unable to parse pass pipeline description
+; CHECK-PIPELINE-ON-CGSCC-PASS: invalid use of 'no-op-cgscc' pass as cgscc pipeline
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='no-op-function(no-op-function,whatever)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-PIPELINE-ON-FUNCTION-PASS
-; CHECK-PIPELINE-ON-FUNCTION-PASS: unable to parse pass pipeline description
+; CHECK-PIPELINE-ON-FUNCTION-PASS: invalid use of 'no-op-function' pass as function pipeline
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='no-op-loop(no-op-loop,whatever)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-PIPELINE-ON-LOOP-PASS
-; CHECK-PIPELINE-ON-LOOP-PASS: unable to parse pass pipeline description
+; CHECK-PIPELINE-ON-LOOP-PASS: invalid use of 'no-op-loop' pass as loop pipeline
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='no-op-function()' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-EMPTY-PIPELINE-ON-PASS
-; CHECK-EMPTY-PIPELINE-ON-PASS: unable to parse pass pipeline description
+; CHECK-EMPTY-PIPELINE-ON-PASS: invalid use of 'no-op-function' pass as function pipeline
+
+; RUN: not opt -passes='no-op-module,bad' \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-MODULE
+; CHECK-UNKNOWN-MODULE: opt: unknown module pass 'bad'
+
+; RUN: not opt -passes='no-op-loop,bad' \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-LOOP
+; CHECK-UNKNOWN-LOOP: opt: unknown loop pass 'bad'
+
+; RUN: not opt -passes='no-op-cgscc,bad' \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-CGSCC
+; CHECK-UNKNOWN-CGSCC: opt: unknown cgscc pass 'bad'
+
+; RUN: not opt -passes='no-op-function,bad' \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-FUNCTION
+; RUN: not opt -passes='function(bad,pipeline,text)' \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-FUNCTION
+; RUN: not opt -passes='module(no-op-module,function(bad,pipeline,text))' \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-FUNCTION
+; RUN: not opt -passes='no-op-module,function(bad,pipeline,text)' \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-FUNCTION
+; RUN: not opt -passes='module(cgscc(function(bad,pipeline,text)))' \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-FUNCTION
+; CHECK-UNKNOWN-FUNCTION: opt: unknown function pass 'bad'
+
+; RUN: not opt -aa-pipeline=bad -passes=no-op-function \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=AA-PIPELINE-ERR
+; AA-PIPELINE-ERR: unknown alias analysis name 'bad'
+; RUN: opt -passes-ep-peephole=bad -passes=no-op-function \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-PEEPHOLE-ERR
+; PASSES-EP-PEEPHOLE-ERR: Could not parse pipeline 'bad'. I'm going to ignore it.
+; RUN: opt -passes-ep-late-loop-optimizations=bad -passes=no-op-function \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-LATELOOPOPT-ERR
+; PASSES-EP-LATELOOPOPT-ERR: Could not parse pipeline 'bad'. I'm going to ignore it.
+; RUN: opt -passes-ep-loop-optimizer-end=bad -passes=no-op-function \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-LOOPOPTEND-ERR
+; PASSES-EP-LOOPOPTEND-ERR: Could not parse pipeline 'bad'. I'm going to ignore it.
+; RUN: opt -passes-ep-scalar-optimizer-late=bad -passes=no-op-function \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-SCALAROPTLATE-ERR
+; PASSES-EP-SCALAROPTLATE-ERR: Could not parse pipeline 'bad'. I'm going to ignore it.
+; RUN: opt -passes-ep-cgscc-optimizer-late=bad -passes=no-op-function \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-CGSCCOPTLATE-ERR
+; PASSES-EP-CGSCCOPTLATE-ERR: Could not parse pipeline 'bad'. I'm going to ignore it.
+; RUN: opt -passes-ep-vectorizer-start=bad -passes=no-op-function \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-VECTORIZERSTART-ERR
+; PASSES-EP-VECTORIZERSTART-ERR: Could not parse pipeline 'bad'. I'm going to ignore it.
+; RUN: opt -passes-ep-pipeline-start=bad -passes=no-op-function \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-PIPELINESTART-ERR
+; PASSES-EP-PIPELINESTART-ERR: Could not parse pipeline 'bad'. I'm going to ignore it.
 
 define void @f() {
 entry:
diff --git a/test/tools/llvm-lto2/X86/pipeline.ll b/test/tools/llvm-lto2/X86/pipeline.ll
index 29276d8d13a..9ab81ac70a7 100644
--- a/test/tools/llvm-lto2/X86/pipeline.ll
+++ b/test/tools/llvm-lto2/X86/pipeline.ll
@@ -32,11 +32,11 @@ define void @patatino() {
 ; RUN:  -r %t1.bc,patatino,px -opt-pipeline foogoo 2>&1 | \
 ; RUN:  FileCheck %s --check-prefix=ERR
 
-; ERR: LLVM ERROR: unable to parse pass pipeline description: foogoo
+; ERR: LLVM ERROR: unable to parse pass pipeline description 'foogoo': unknown pass name 'foogoo'
 
 ; RUN: not llvm-lto2 run %t1.bc -o %t.o \
 ; RUN:  -r %t1.bc,patatino,px -aa-pipeline patatino \
 ; RUN:  -opt-pipeline loweratomic 2>&1 | \
 ; RUN:  FileCheck %s --check-prefix=AAERR
 
-; AAERR: LLVM ERROR: unable to parse AA pipeline description: patatino
+; AAERR: LLVM ERROR: unable to parse AA pipeline description 'patatino': unknown alias analysis name 'patatino'
diff --git a/test/tools/llvm-opt-fuzzer/command-line.ll b/test/tools/llvm-opt-fuzzer/command-line.ll
index f747bba431b..8c3f6b60154 100644
--- a/test/tools/llvm-opt-fuzzer/command-line.ll
+++ b/test/tools/llvm-opt-fuzzer/command-line.ll
@@ -13,7 +13,7 @@
 
 ; Don't start with incorrect passes specified
 ; RUN: not llvm-opt-fuzzer %t -ignore_remaining_args=1 -mtriple x86_64 -passes no-pass 2>&1 | FileCheck -check-prefix=PIPELINE %s
-; PIPELINE: can't parse pass pipeline
+; PIPELINE: unknown pass name 'no-pass'
 
 ; Correct command line
 ; RUN: llvm-opt-fuzzer %t -ignore_remaining_args=1 -mtriple x86_64 -passes instcombine 2>&1 | FileCheck -check-prefix=CORRECT %s
diff --git a/tools/llvm-opt-fuzzer/llvm-opt-fuzzer.cpp b/tools/llvm-opt-fuzzer/llvm-opt-fuzzer.cpp
index 98d5428ddd1..57e75b1db9e 100644
--- a/tools/llvm-opt-fuzzer/llvm-opt-fuzzer.cpp
+++ b/tools/llvm-opt-fuzzer/llvm-opt-fuzzer.cpp
@@ -144,9 +144,10 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
   PB.registerLoopAnalyses(LAM);
   PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
 
-  bool Ok = PB.parsePassPipeline(MPM, PassPipeline, false, false);
-  assert(Ok && "Should have been checked during fuzzer initialization");
-  (void)Ok; // silence unused variable warning on release builds
+  auto Err = PB.parsePassPipeline(MPM, PassPipeline, false, false);
+  assert(!Err && "Should have been checked during fuzzer initialization");
+  // Only fail with assert above, otherwise ignore the parsing error.
+  consumeError(std::move(Err));
 
   // Run passes which we need to test
   //
@@ -235,8 +236,8 @@ extern "C" LLVM_ATTRIBUTE_USED int LLVMFuzzerInitialize(
 
   PassBuilder PB(TM.get());
   ModulePassManager MPM;
-  if (!PB.parsePassPipeline(MPM, PassPipeline, false, false)) {
-    errs() << *argv[0] << ": can't parse pass pipeline\n";
+  if (auto Err = PB.parsePassPipeline(MPM, PassPipeline, false, false)) {
+    errs() << *argv[0] << ": " << toString(std::move(Err)) << "\n";
     exit(1);
   }
 
diff --git a/tools/opt/NewPMDriver.cpp b/tools/opt/NewPMDriver.cpp
index e63547a79d0..11879d26a6c 100644
--- a/tools/opt/NewPMDriver.cpp
+++ b/tools/opt/NewPMDriver.cpp
@@ -124,12 +124,12 @@ bool tryParsePipelineText(PassBuilder &PB, StringRef PipelineText) {
 
   // Verify the pipeline is parseable:
   PassManagerT PM;
-  if (PB.parsePassPipeline(PM, PipelineText))
-    return true;
-
-  errs() << "Could not parse pipeline '" << PipelineText
-         << "'. I'm going to igore it.\n";
-  return false;
+  if (auto Err = PB.parsePassPipeline(PM, PipelineText)) {
+    errs() << "Could not parse pipeline '" << PipelineText
+           << "'. I'm going to ignore it.\n";
+    return false;
+  }
+  return true;
 }
 
 /// If one of the EPPipeline command line options was given, register callbacks
@@ -137,50 +137,61 @@ bool tryParsePipelineText(PassBuilder &PB, StringRef PipelineText) {
 static void registerEPCallbacks(PassBuilder &PB, bool VerifyEachPass,
                                 bool DebugLogging) {
   if (tryParsePipelineText<FunctionPassManager>(PB, PeepholeEPPipeline))
-    PB.registerPeepholeEPCallback([&PB, VerifyEachPass, DebugLogging](
-        FunctionPassManager &PM, PassBuilder::OptimizationLevel Level) {
-      PB.parsePassPipeline(PM, PeepholeEPPipeline, VerifyEachPass,
-                           DebugLogging);
-    });
+    PB.registerPeepholeEPCallback(
+        [&PB, VerifyEachPass, DebugLogging](
+            FunctionPassManager &PM, PassBuilder::OptimizationLevel Level) {
+          ExitOnError Err("Unable to parse PeepholeEP pipeline: ");
+          Err(PB.parsePassPipeline(PM, PeepholeEPPipeline, VerifyEachPass,
+                                   DebugLogging));
+        });
   if (tryParsePipelineText<LoopPassManager>(PB,
                                             LateLoopOptimizationsEPPipeline))
     PB.registerLateLoopOptimizationsEPCallback(
         [&PB, VerifyEachPass, DebugLogging](
             LoopPassManager &PM, PassBuilder::OptimizationLevel Level) {
-          PB.parsePassPipeline(PM, LateLoopOptimizationsEPPipeline,
-                               VerifyEachPass, DebugLogging);
+          ExitOnError Err("Unable to parse LateLoopOptimizationsEP pipeline: ");
+          Err(PB.parsePassPipeline(PM, LateLoopOptimizationsEPPipeline,
+                                   VerifyEachPass, DebugLogging));
         });
   if (tryParsePipelineText<LoopPassManager>(PB, LoopOptimizerEndEPPipeline))
-    PB.registerLoopOptimizerEndEPCallback([&PB, VerifyEachPass, DebugLogging](
-        LoopPassManager &PM, PassBuilder::OptimizationLevel Level) {
-      PB.parsePassPipeline(PM, LoopOptimizerEndEPPipeline, VerifyEachPass,
-                           DebugLogging);
-    });
+    PB.registerLoopOptimizerEndEPCallback(
+        [&PB, VerifyEachPass, DebugLogging](
+            LoopPassManager &PM, PassBuilder::OptimizationLevel Level) {
+          ExitOnError Err("Unable to parse LoopOptimizerEndEP pipeline: ");
+          Err(PB.parsePassPipeline(PM, LoopOptimizerEndEPPipeline,
+                                   VerifyEachPass, DebugLogging));
+        });
   if (tryParsePipelineText<FunctionPassManager>(PB,
                                                 ScalarOptimizerLateEPPipeline))
     PB.registerScalarOptimizerLateEPCallback(
         [&PB, VerifyEachPass, DebugLogging](
             FunctionPassManager &PM, PassBuilder::OptimizationLevel Level) {
-          PB.parsePassPipeline(PM, ScalarOptimizerLateEPPipeline,
-                               VerifyEachPass, DebugLogging);
+          ExitOnError Err("Unable to parse ScalarOptimizerLateEP pipeline: ");
+          Err(PB.parsePassPipeline(PM, ScalarOptimizerLateEPPipeline,
+                                   VerifyEachPass, DebugLogging));
         });
   if (tryParsePipelineText<CGSCCPassManager>(PB, CGSCCOptimizerLateEPPipeline))
-    PB.registerCGSCCOptimizerLateEPCallback([&PB, VerifyEachPass, DebugLogging](
-        CGSCCPassManager &PM, PassBuilder::OptimizationLevel Level) {
-      PB.parsePassPipeline(PM, CGSCCOptimizerLateEPPipeline, VerifyEachPass,
-                           DebugLogging);
-    });
+    PB.registerCGSCCOptimizerLateEPCallback(
+        [&PB, VerifyEachPass, DebugLogging](
+            CGSCCPassManager &PM, PassBuilder::OptimizationLevel Level) {
+          ExitOnError Err("Unable to parse CGSCCOptimizerLateEP pipeline: ");
+          Err(PB.parsePassPipeline(PM, CGSCCOptimizerLateEPPipeline,
+                                   VerifyEachPass, DebugLogging));
+        });
   if (tryParsePipelineText<FunctionPassManager>(PB, VectorizerStartEPPipeline))
-    PB.registerVectorizerStartEPCallback([&PB, VerifyEachPass, DebugLogging](
-        FunctionPassManager &PM, PassBuilder::OptimizationLevel Level) {
-      PB.parsePassPipeline(PM, VectorizerStartEPPipeline, VerifyEachPass,
-                           DebugLogging);
-    });
+    PB.registerVectorizerStartEPCallback(
+        [&PB, VerifyEachPass, DebugLogging](
+            FunctionPassManager &PM, PassBuilder::OptimizationLevel Level) {
+          ExitOnError Err("Unable to parse VectorizerStartEP pipeline: ");
+          Err(PB.parsePassPipeline(PM, VectorizerStartEPPipeline,
+                                   VerifyEachPass, DebugLogging));
+        });
   if (tryParsePipelineText<ModulePassManager>(PB, PipelineStartEPPipeline))
     PB.registerPipelineStartEPCallback(
         [&PB, VerifyEachPass, DebugLogging](ModulePassManager &PM) {
-          PB.parsePassPipeline(PM, PipelineStartEPPipeline, VerifyEachPass,
-                               DebugLogging);
+          ExitOnError Err("Unable to parse PipelineStartEP pipeline: ");
+          Err(PB.parsePassPipeline(PM, PipelineStartEPPipeline, VerifyEachPass,
+                                   DebugLogging));
         });
 }
 
@@ -258,8 +269,8 @@ bool llvm::runPassPipeline(StringRef Arg0, Module &M, TargetMachine *TM,
   // Specially handle the alias analysis manager so that we can register
   // a custom pipeline of AA passes with it.
   AAManager AA;
-  if (!PB.parseAAPipeline(AA, AAPipeline)) {
-    errs() << Arg0 << ": unable to parse AA pipeline description.\n";
+  if (auto Err = PB.parseAAPipeline(AA, AAPipeline)) {
+    errs() << Arg0 << ": " << toString(std::move(Err)) << "\n";
     return false;
   }
 
@@ -284,8 +295,9 @@ bool llvm::runPassPipeline(StringRef Arg0, Module &M, TargetMachine *TM,
   if (EnableDebugify)
     MPM.addPass(NewPMDebugifyPass());
 
-  if (!PB.parsePassPipeline(MPM, PassPipeline, VerifyEachPass, DebugPM)) {
-    errs() << Arg0 << ": unable to parse pass pipeline description.\n";
+  if (auto Err =
+          PB.parsePassPipeline(MPM, PassPipeline, VerifyEachPass, DebugPM)) {
+    errs() << Arg0 << ": " << toString(std::move(Err)) << "\n";
     return false;
   }
 
diff --git a/unittests/IR/CMakeLists.txt b/unittests/IR/CMakeLists.txt
index 211ab109131..7498983b260 100644
--- a/unittests/IR/CMakeLists.txt
+++ b/unittests/IR/CMakeLists.txt
@@ -40,3 +40,5 @@ add_llvm_unittest(IRTests
   VerifierTest.cpp
   WaymarkTest.cpp
   )
+
+target_link_libraries(IRTests PRIVATE LLVMTestingSupport)
diff --git a/unittests/IR/PassBuilderCallbacksTest.cpp b/unittests/IR/PassBuilderCallbacksTest.cpp
index 97bbb81a6b0..20c47b045e7 100644
--- a/unittests/IR/PassBuilderCallbacksTest.cpp
+++ b/unittests/IR/PassBuilderCallbacksTest.cpp
@@ -7,6 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/Testing/Support/Error.h"
 #include <functional>
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
@@ -460,7 +461,7 @@ TEST_F(ModuleCallbacksTest, Passes) {
       .WillOnce(Invoke(getAnalysisResult));
 
   StringRef PipelineText = "test-transform";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
 
   PM.run(*M, AM);
@@ -494,7 +495,7 @@ TEST_F(ModuleCallbacksTest, InstrumentedPasses) {
       .InSequence(PISequence);
 
   StringRef PipelineText = "test-transform";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
 
   PM.run(*M, AM);
@@ -525,7 +526,7 @@ TEST_F(ModuleCallbacksTest, InstrumentedSkippedPasses) {
       .Times(0);
 
   StringRef PipelineText = "test-transform";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
 
   PM.run(*M, AM);
@@ -537,7 +538,7 @@ TEST_F(FunctionCallbacksTest, Passes) {
       .WillOnce(Invoke(getAnalysisResult));
 
   StringRef PipelineText = "test-transform";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -571,7 +572,7 @@ TEST_F(FunctionCallbacksTest, InstrumentedPasses) {
       .InSequence(PISequence);
 
   StringRef PipelineText = "test-transform";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -604,7 +605,7 @@ TEST_F(FunctionCallbacksTest, InstrumentedSkippedPasses) {
       .Times(0);
 
   StringRef PipelineText = "test-transform";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -615,7 +616,7 @@ TEST_F(LoopCallbacksTest, Passes) {
       .WillOnce(WithArgs<0, 1, 2>(Invoke(getAnalysisResult)));
 
   StringRef PipelineText = "test-transform";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -650,7 +651,7 @@ TEST_F(LoopCallbacksTest, InstrumentedPasses) {
       .InSequence(PISequence);
 
   StringRef PipelineText = "test-transform";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -682,7 +683,7 @@ TEST_F(LoopCallbacksTest, InstrumentedSkippedPasses) {
       .Times(0);
 
   StringRef PipelineText = "test-transform";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -693,7 +694,7 @@ TEST_F(CGSCCCallbacksTest, Passes) {
       .WillOnce(WithArgs<0, 1, 2>(Invoke(getAnalysisResult)));
 
   StringRef PipelineText = "test-transform";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -727,7 +728,7 @@ TEST_F(CGSCCCallbacksTest, InstrumentedPasses) {
       .InSequence(PISequence);
 
   StringRef PipelineText = "test-transform";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -759,7 +760,7 @@ TEST_F(CGSCCCallbacksTest, InstrumentedSkippedPasses) {
       .Times(0);
 
   StringRef PipelineText = "test-transform";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -774,7 +775,7 @@ TEST_F(ModuleCallbacksTest, AnalysisUtilities) {
   EXPECT_CALL(AnalysisHandle, invalidate(HasName("<string>"), _, _));
 
   StringRef PipelineText = "require<test-analysis>,invalidate<test-analysis>";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -784,7 +785,7 @@ TEST_F(CGSCCCallbacksTest, PassUtilities) {
   EXPECT_CALL(AnalysisHandle, invalidate(HasName("(foo)"), _, _));
 
   StringRef PipelineText = "require<test-analysis>,invalidate<test-analysis>";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -794,7 +795,7 @@ TEST_F(FunctionCallbacksTest, AnalysisUtilities) {
   EXPECT_CALL(AnalysisHandle, invalidate(HasName("foo"), _, _));
 
   StringRef PipelineText = "require<test-analysis>,invalidate<test-analysis>";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -805,7 +806,7 @@ TEST_F(LoopCallbacksTest, PassUtilities) {
 
   StringRef PipelineText = "require<test-analysis>,invalidate<test-analysis>";
 
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -845,13 +846,13 @@ TEST_F(ModuleCallbacksTest, ParseTopLevelPipeline) {
 
   StringRef PipelineText =
       "another-pipeline(test-transform,invalidate<test-analysis>)";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 
   /// Test the negative case
   PipelineText = "another-pipeline(instcombine)";
-  ASSERT_FALSE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Failed())
       << "Pipeline was: " << PipelineText;
 }
 } // end anonymous namespace
diff --git a/unittests/Passes/CMakeLists.txt b/unittests/Passes/CMakeLists.txt
index d90df209d4e..415f3a71734 100644
--- a/unittests/Passes/CMakeLists.txt
+++ b/unittests/Passes/CMakeLists.txt
@@ -12,6 +12,7 @@ add_llvm_unittest(PluginsTests
   PluginsTest.cpp
   )
 export_executable_symbols(PluginsTests)
+target_link_libraries(PluginsTests PRIVATE LLVMTestingSupport)
 
 set(LLVM_LINK_COMPONENTS)
 add_llvm_loadable_module(TestPlugin
diff --git a/unittests/Passes/PluginsTest.cpp b/unittests/Passes/PluginsTest.cpp
index 726978714e8..abb7b57ee0c 100644
--- a/unittests/Passes/PluginsTest.cpp
+++ b/unittests/Passes/PluginsTest.cpp
@@ -15,6 +15,7 @@
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/Path.h"
+#include "llvm/Testing/Support/Error.h"
 #include "llvm/Transforms/Scalar/LoopPassManager.h"
 #include "gtest/gtest.h"
 
@@ -54,8 +55,8 @@ TEST(PluginsTests, LoadPlugin) {
 
   PassBuilder PB;
   ModulePassManager PM;
-  ASSERT_FALSE(PB.parsePassPipeline(PM, "plugin-pass"));
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, "plugin-pass"), Failed());
 
   Plugin->registerPassBuilderCallbacks(PB);
-  ASSERT_TRUE(PB.parsePassPipeline(PM, "plugin-pass"));
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, "plugin-pass"), Succeeded());
 }
-- 
GitLab


From cb8b3a2740adc2ea3b866bab8b727b0a93e96353 Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Mon, 15 Oct 2018 15:26:47 +0000
Subject: [PATCH 0199/1116] [ADT] Adds equality operators for DenseMap and
 DenseSet, and an initializer_list constructor for DenseMap (DenseSet already
 had an initializer_list constructor).

These changes make it easier to migrate existing code that uses std::map and
std::set (which support initializer_list construction and equality comparison)
to DenseMap and DenseSet.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344522 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/ADT/DenseMap.h    | 43 ++++++++++++++++++++++++++++++++++
 include/llvm/ADT/DenseSet.h    | 28 ++++++++++++++++++++++
 unittests/ADT/DenseMapTest.cpp | 20 ++++++++++++++++
 unittests/ADT/DenseSetTest.cpp |  9 +++++++
 4 files changed, 100 insertions(+)

diff --git a/include/llvm/ADT/DenseMap.h b/include/llvm/ADT/DenseMap.h
index 8fe0f48adf2..ac1e5c632d3 100644
--- a/include/llvm/ADT/DenseMap.h
+++ b/include/llvm/ADT/DenseMap.h
@@ -25,6 +25,7 @@
 #include <cassert>
 #include <cstddef>
 #include <cstring>
+#include <initializer_list>
 #include <iterator>
 #include <new>
 #include <type_traits>
@@ -38,6 +39,9 @@ namespace detail {
 // implementation without requiring two members.
 template <typename KeyT, typename ValueT>
 struct DenseMapPair : public std::pair<KeyT, ValueT> {
+
+  using std::pair<KeyT, ValueT>::pair;
+
   KeyT &getFirst() { return std::pair<KeyT, ValueT>::first; }
   const KeyT &getFirst() const { return std::pair<KeyT, ValueT>::first; }
   ValueT &getSecond() { return std::pair<KeyT, ValueT>::second; }
@@ -640,6 +644,40 @@ public:
   }
 };
 
+/// Equality comparison for DenseMap.
+///
+/// Iterates over elements of LHS confirming that each (key, value) pair in LHS
+/// is also in RHS, and that no additional pairs are in RHS.
+/// Equivalent to N calls to RHS.find and N value comparisons. Amortized
+/// complexity is linear, worst case is O(N^2) (if every hash collides).
+template <typename DerivedT, typename KeyT, typename ValueT, typename KeyInfoT,
+          typename BucketT>
+bool operator==(
+    const DenseMapBase<DerivedT, KeyT, ValueT, KeyInfoT, BucketT> &LHS,
+    const DenseMapBase<DerivedT, KeyT, ValueT, KeyInfoT, BucketT> &RHS) {
+  if (LHS.size() != RHS.size())
+    return false;
+
+  for (auto &KV : LHS) {
+    auto I = RHS.find(KV.first);
+    if (I == RHS.end() || I->second != KV.second)
+      return false;
+  }
+
+  return true;
+}
+
+/// Inequality comparison for DenseMap.
+///
+/// Equivalent to !(LHS == RHS). See operator== for performance notes.
+template <typename DerivedT, typename KeyT, typename ValueT, typename KeyInfoT,
+          typename BucketT>
+bool operator!=(
+    const DenseMapBase<DerivedT, KeyT, ValueT, KeyInfoT, BucketT> &LHS,
+    const DenseMapBase<DerivedT, KeyT, ValueT, KeyInfoT, BucketT> &RHS) {
+  return !(LHS == RHS);
+}
+
 template <typename KeyT, typename ValueT,
           typename KeyInfoT = DenseMapInfo<KeyT>,
           typename BucketT = llvm::detail::DenseMapPair<KeyT, ValueT>>
@@ -677,6 +715,11 @@ public:
     this->insert(I, E);
   }
 
+  DenseMap(std::initializer_list<typename BaseT::value_type> Vals) {
+    init(Vals.size());
+    this->insert(Vals.begin(), Vals.end());
+  }
+
   ~DenseMap() {
     this->destroyAll();
     operator delete(Buckets);
diff --git a/include/llvm/ADT/DenseSet.h b/include/llvm/ADT/DenseSet.h
index 52fe4adb5bd..404b2f74766 100644
--- a/include/llvm/ADT/DenseSet.h
+++ b/include/llvm/ADT/DenseSet.h
@@ -214,6 +214,34 @@ public:
   }
 };
 
+/// Equality comparison for DenseSet.
+///
+/// Iterates over elements of LHS confirming that each element is also a member
+/// of RHS, and that RHS contains no additional values.
+/// Equivalent to N calls to RHS.count. Amortized complexity is linear, worst
+/// case is O(N^2) (if every hash collides).
+template <typename ValueT, typename MapTy, typename ValueInfoT>
+bool operator==(const DenseSetImpl<ValueT, MapTy, ValueInfoT> &LHS,
+                const DenseSetImpl<ValueT, MapTy, ValueInfoT> &RHS) {
+  if (LHS.size() != RHS.size())
+    return false;
+
+  for (auto &E : LHS)
+    if (!RHS.count(E))
+      return false;
+
+  return true;
+}
+
+/// Inequality comparison for DenseSet.
+///
+/// Equivalent to !(LHS == RHS). See operator== for performance notes.
+template <typename ValueT, typename MapTy, typename ValueInfoT>
+bool operator!=(const DenseSetImpl<ValueT, MapTy, ValueInfoT> &LHS,
+                const DenseSetImpl<ValueT, MapTy, ValueInfoT> &RHS) {
+  return !(LHS == RHS);
+}
+
 } // end namespace detail
 
 /// Implements a dense probed hash-table based set.
diff --git a/unittests/ADT/DenseMapTest.cpp b/unittests/ADT/DenseMapTest.cpp
index 87f22f6f403..ee9c5dd3800 100644
--- a/unittests/ADT/DenseMapTest.cpp
+++ b/unittests/ADT/DenseMapTest.cpp
@@ -362,6 +362,26 @@ int CountCopyAndMove::Move = 0;
 
 } // anonymous namespace
 
+// Test initializer list construction.
+TEST(DenseMapCustomTest, InitializerList) {
+  DenseMap<int, int> M({{0, 0}, {0, 1}, {1, 2}});
+  EXPECT_EQ(2u, M.size());
+  EXPECT_EQ(1u, M.count(0));
+  EXPECT_EQ(0, M[0]);
+  EXPECT_EQ(1u, M.count(1));
+  EXPECT_EQ(2, M[1]);
+}
+
+// Test initializer list construction.
+TEST(DenseMapCustomTest, EqualityComparison) {
+  DenseMap<int, int> M1({{0, 0}, {1, 2}});
+  DenseMap<int, int> M2({{0, 0}, {1, 2}});
+  DenseMap<int, int> M3({{0, 0}, {1, 3}});
+
+  EXPECT_EQ(M1, M2);
+  EXPECT_NE(M1, M3);
+}
+
 // Test for the default minimum size of a DenseMap
 TEST(DenseMapCustomTest, DefaultMinReservedSizeTest) {
   // IF THIS VALUE CHANGE, please update InitialSizeTest, InitFromIterator, and
diff --git a/unittests/ADT/DenseSetTest.cpp b/unittests/ADT/DenseSetTest.cpp
index 0247f023dce..04f84e041fb 100644
--- a/unittests/ADT/DenseSetTest.cpp
+++ b/unittests/ADT/DenseSetTest.cpp
@@ -121,6 +121,15 @@ TYPED_TEST(DenseSetTest, FindAsTest) {
   EXPECT_TRUE(set.find_as("d") == set.end());
 }
 
+TYPED_TEST(DenseSetTest, EqualityComparisonTest) {
+  TypeParam set1({1, 2, 3, 4});
+  TypeParam set2({4, 3, 2, 1});
+  TypeParam set3({2, 3, 4, 5});
+
+  EXPECT_EQ(set1, set2);
+  EXPECT_NE(set1, set3);
+}
+
 // Simple class that counts how many moves and copy happens when growing a map
 struct CountCopyAndMove {
   static int Move;
-- 
GitLab


From 8559689cb23c3dd7d21dcdf113748f1cf5fefb85 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Mon, 15 Oct 2018 15:28:44 +0000
Subject: [PATCH 0200/1116] [x86] add tests for fma with undef elts; NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344523 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/fma_patterns.ll | 46 ++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/test/CodeGen/X86/fma_patterns.ll b/test/CodeGen/X86/fma_patterns.ll
index 2d01c570f99..d0d0dfed352 100644
--- a/test/CodeGen/X86/fma_patterns.ll
+++ b/test/CodeGen/X86/fma_patterns.ll
@@ -636,6 +636,29 @@ define <4 x float> @test_v4f32_mul_y_add_x_one(<4 x float> %x, <4 x float> %y) {
   ret <4 x float> %m
 }
 
+define <4 x float> @test_v4f32_mul_y_add_x_one_undefs(<4 x float> %x, <4 x float> %y) {
+; FMA-LABEL: test_v4f32_mul_y_add_x_one_undefs:
+; FMA:       # %bb.0:
+; FMA-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
+; FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA-NEXT:    retq
+;
+; FMA4-LABEL: test_v4f32_mul_y_add_x_one_undefs:
+; FMA4:       # %bb.0:
+; FMA4-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
+; FMA4-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA4-NEXT:    retq
+;
+; AVX512-LABEL: test_v4f32_mul_y_add_x_one_undefs:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vaddps {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    retq
+  %a = fadd <4 x float> %x, <float 1.0, float undef, float 1.0, float undef>
+  %m = fmul <4 x float> %y, %a
+  ret <4 x float> %m
+}
+
 define <4 x float> @test_v4f32_mul_add_x_negone_y(<4 x float> %x, <4 x float> %y) {
 ; FMA-INFS-LABEL: test_v4f32_mul_add_x_negone_y:
 ; FMA-INFS:       # %bb.0:
@@ -712,6 +735,29 @@ define <4 x float> @test_v4f32_mul_y_add_x_negone(<4 x float> %x, <4 x float> %y
   ret <4 x float> %m
 }
 
+define <4 x float> @test_v4f32_mul_y_add_x_negone_undefs(<4 x float> %x, <4 x float> %y) {
+; FMA-LABEL: test_v4f32_mul_y_add_x_negone_undefs:
+; FMA:       # %bb.0:
+; FMA-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
+; FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA-NEXT:    retq
+;
+; FMA4-LABEL: test_v4f32_mul_y_add_x_negone_undefs:
+; FMA4:       # %bb.0:
+; FMA4-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
+; FMA4-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA4-NEXT:    retq
+;
+; AVX512-LABEL: test_v4f32_mul_y_add_x_negone_undefs:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vaddps {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    retq
+  %a = fadd <4 x float> %x, <float undef, float -1.0, float undef, float -1.0>
+  %m = fmul <4 x float> %y, %a
+  ret <4 x float> %m
+}
+
 define <4 x float> @test_v4f32_mul_sub_one_x_y(<4 x float> %x, <4 x float> %y) {
 ; FMA-INFS-LABEL: test_v4f32_mul_sub_one_x_y:
 ; FMA-INFS:       # %bb.0:
-- 
GitLab


From dea3b338ac8a25004094b5adffe2b0a33f7142fb Mon Sep 17 00:00:00 2001
From: Fedor Sergeev <fedor.sergeev@azul.com>
Date: Mon, 15 Oct 2018 15:36:08 +0000
Subject: [PATCH 0201/1116] Revert "[NewPM] teach -passes= to emit meaningful
 error messages"

This reverts r344519 due to failures in pipeline-parsing test.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344524 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Passes/PassBuilder.h          |  51 ++-
 lib/LTO/LTOBackend.cpp                     |  14 +-
 lib/Passes/PassBuilder.cpp                 | 366 +++++++++------------
 test/Other/pass-pipeline-parsing.ll        |  83 +----
 test/tools/llvm-lto2/X86/pipeline.ll       |   4 +-
 test/tools/llvm-opt-fuzzer/command-line.ll |   2 +-
 tools/llvm-opt-fuzzer/llvm-opt-fuzzer.cpp  |  11 +-
 tools/opt/NewPMDriver.cpp                  |  84 ++---
 unittests/IR/CMakeLists.txt                |   2 -
 unittests/IR/PassBuilderCallbacksTest.cpp  |  37 +--
 unittests/Passes/CMakeLists.txt            |   1 -
 unittests/Passes/PluginsTest.cpp           |   5 +-
 12 files changed, 266 insertions(+), 394 deletions(-)

diff --git a/include/llvm/Passes/PassBuilder.h b/include/llvm/Passes/PassBuilder.h
index 22e5eb0caa0..91314430a96 100644
--- a/include/llvm/Passes/PassBuilder.h
+++ b/include/llvm/Passes/PassBuilder.h
@@ -19,7 +19,6 @@
 #include "llvm/ADT/Optional.h"
 #include "llvm/Analysis/CGSCCPassManager.h"
 #include "llvm/IR/PassManager.h"
-#include "llvm/Support/Error.h"
 #include "llvm/Transforms/Instrumentation.h"
 #include "llvm/Transforms/Scalar/LoopPassManager.h"
 #include <vector>
@@ -385,9 +384,8 @@ public:
   /// If the sequence of passes aren't all the exact same kind of pass, it will
   /// be an error. You cannot mix different levels implicitly, you must
   /// explicitly form a pass manager in which to nest passes.
-  Error parsePassPipeline(ModulePassManager &MPM, StringRef PipelineText,
-                          bool VerifyEachPass = true,
-                          bool DebugLogging = false);
+  bool parsePassPipeline(ModulePassManager &MPM, StringRef PipelineText,
+                         bool VerifyEachPass = true, bool DebugLogging = false);
 
   /// {{@ Parse a textual pass pipeline description into a specific PassManager
   ///
@@ -396,15 +394,12 @@ public:
   /// this is the valid pipeline text:
   ///
   ///   function(lpass)
-  Error parsePassPipeline(CGSCCPassManager &CGPM, StringRef PipelineText,
-                          bool VerifyEachPass = true,
-                          bool DebugLogging = false);
-  Error parsePassPipeline(FunctionPassManager &FPM, StringRef PipelineText,
-                          bool VerifyEachPass = true,
-                          bool DebugLogging = false);
-  Error parsePassPipeline(LoopPassManager &LPM, StringRef PipelineText,
-                          bool VerifyEachPass = true,
-                          bool DebugLogging = false);
+  bool parsePassPipeline(CGSCCPassManager &CGPM, StringRef PipelineText,
+                         bool VerifyEachPass = true, bool DebugLogging = false);
+  bool parsePassPipeline(FunctionPassManager &FPM, StringRef PipelineText,
+                         bool VerifyEachPass = true, bool DebugLogging = false);
+  bool parsePassPipeline(LoopPassManager &LPM, StringRef PipelineText,
+                         bool VerifyEachPass = true, bool DebugLogging = false);
   /// @}}
 
   /// Parse a textual alias analysis pipeline into the provided AA manager.
@@ -422,7 +417,7 @@ public:
   /// Returns false if the text cannot be parsed cleanly. The specific state of
   /// the \p AA manager is unspecified if such an error is encountered and this
   /// returns false.
-  Error parseAAPipeline(AAManager &AA, StringRef PipelineText);
+  bool parseAAPipeline(AAManager &AA, StringRef PipelineText);
 
   /// Register a callback for a default optimizer pipeline extension
   /// point
@@ -570,28 +565,28 @@ private:
   static Optional<std::vector<PipelineElement>>
   parsePipelineText(StringRef Text);
 
-  Error parseModulePass(ModulePassManager &MPM, const PipelineElement &E,
-                        bool VerifyEachPass, bool DebugLogging);
-  Error parseCGSCCPass(CGSCCPassManager &CGPM, const PipelineElement &E,
+  bool parseModulePass(ModulePassManager &MPM, const PipelineElement &E,
                        bool VerifyEachPass, bool DebugLogging);
-  Error parseFunctionPass(FunctionPassManager &FPM, const PipelineElement &E,
-                          bool VerifyEachPass, bool DebugLogging);
-  Error parseLoopPass(LoopPassManager &LPM, const PipelineElement &E,
+  bool parseCGSCCPass(CGSCCPassManager &CGPM, const PipelineElement &E,
                       bool VerifyEachPass, bool DebugLogging);
+  bool parseFunctionPass(FunctionPassManager &FPM, const PipelineElement &E,
+                     bool VerifyEachPass, bool DebugLogging);
+  bool parseLoopPass(LoopPassManager &LPM, const PipelineElement &E,
+                     bool VerifyEachPass, bool DebugLogging);
   bool parseAAPassName(AAManager &AA, StringRef Name);
 
-  Error parseLoopPassPipeline(LoopPassManager &LPM,
+  bool parseLoopPassPipeline(LoopPassManager &LPM,
+                             ArrayRef<PipelineElement> Pipeline,
+                             bool VerifyEachPass, bool DebugLogging);
+  bool parseFunctionPassPipeline(FunctionPassManager &FPM,
+                                 ArrayRef<PipelineElement> Pipeline,
+                                 bool VerifyEachPass, bool DebugLogging);
+  bool parseCGSCCPassPipeline(CGSCCPassManager &CGPM,
                               ArrayRef<PipelineElement> Pipeline,
                               bool VerifyEachPass, bool DebugLogging);
-  Error parseFunctionPassPipeline(FunctionPassManager &FPM,
-                                  ArrayRef<PipelineElement> Pipeline,
-                                  bool VerifyEachPass, bool DebugLogging);
-  Error parseCGSCCPassPipeline(CGSCCPassManager &CGPM,
+  bool parseModulePassPipeline(ModulePassManager &MPM,
                                ArrayRef<PipelineElement> Pipeline,
                                bool VerifyEachPass, bool DebugLogging);
-  Error parseModulePassPipeline(ModulePassManager &MPM,
-                                ArrayRef<PipelineElement> Pipeline,
-                                bool VerifyEachPass, bool DebugLogging);
 
   void addPGOInstrPasses(ModulePassManager &MPM, bool DebugLogging,
                          OptimizationLevel Level, bool RunProfileGen,
diff --git a/lib/LTO/LTOBackend.cpp b/lib/LTO/LTOBackend.cpp
index 1f9d60a5bdf..20fc40de4b9 100644
--- a/lib/LTO/LTOBackend.cpp
+++ b/lib/LTO/LTOBackend.cpp
@@ -162,7 +162,7 @@ static void runNewPMPasses(Config &Conf, Module &Mod, TargetMachine *TM,
   AAManager AA;
 
   // Parse a custom AA pipeline if asked to.
-  if (auto Err = PB.parseAAPipeline(AA, "default"))
+  if (!PB.parseAAPipeline(AA, "default"))
     report_fatal_error("Error parsing default AA pipeline");
 
   LoopAnalysisManager LAM(Conf.DebugPassManager);
@@ -221,9 +221,9 @@ static void runNewPMCustomPasses(Module &Mod, TargetMachine *TM,
 
   // Parse a custom AA pipeline if asked to.
   if (!AAPipelineDesc.empty())
-    if (auto Err = PB.parseAAPipeline(AA, AAPipelineDesc))
-      report_fatal_error("unable to parse AA pipeline description '" +
-                         AAPipelineDesc + "': " + toString(std::move(Err)));
+    if (!PB.parseAAPipeline(AA, AAPipelineDesc))
+      report_fatal_error("unable to parse AA pipeline description: " +
+                         AAPipelineDesc);
 
   LoopAnalysisManager LAM;
   FunctionAnalysisManager FAM;
@@ -246,9 +246,9 @@ static void runNewPMCustomPasses(Module &Mod, TargetMachine *TM,
   MPM.addPass(VerifierPass());
 
   // Now, add all the passes we've been requested to.
-  if (auto Err = PB.parsePassPipeline(MPM, PipelineDesc))
-    report_fatal_error("unable to parse pass pipeline description '" +
-                       PipelineDesc + "': " + toString(std::move(Err)));
+  if (!PB.parsePassPipeline(MPM, PipelineDesc))
+    report_fatal_error("unable to parse pass pipeline description: " +
+                       PipelineDesc);
 
   if (!DisableVerify)
     MPM.addPass(VerifierPass());
diff --git a/lib/Passes/PassBuilder.cpp b/lib/Passes/PassBuilder.cpp
index f6313d23e2d..09758dc5651 100644
--- a/lib/Passes/PassBuilder.cpp
+++ b/lib/Passes/PassBuilder.cpp
@@ -58,7 +58,6 @@
 #include "llvm/IR/PassManager.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/Regex.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h"
@@ -1403,9 +1402,9 @@ PassBuilder::parsePipelineText(StringRef Text) {
   return {std::move(ResultPipeline)};
 }
 
-Error PassBuilder::parseModulePass(ModulePassManager &MPM,
-                                   const PipelineElement &E,
-                                   bool VerifyEachPass, bool DebugLogging) {
+bool PassBuilder::parseModulePass(ModulePassManager &MPM,
+                                  const PipelineElement &E, bool VerifyEachPass,
+                                  bool DebugLogging) {
   auto &Name = E.Name;
   auto &InnerPipeline = E.InnerPipeline;
 
@@ -1413,56 +1412,50 @@ Error PassBuilder::parseModulePass(ModulePassManager &MPM,
   if (!InnerPipeline.empty()) {
     if (Name == "module") {
       ModulePassManager NestedMPM(DebugLogging);
-      if (auto Err = parseModulePassPipeline(NestedMPM, InnerPipeline,
-                                             VerifyEachPass, DebugLogging))
-        return Err;
+      if (!parseModulePassPipeline(NestedMPM, InnerPipeline, VerifyEachPass,
+                                   DebugLogging))
+        return false;
       MPM.addPass(std::move(NestedMPM));
-      return Error::success();
+      return true;
     }
     if (Name == "cgscc") {
       CGSCCPassManager CGPM(DebugLogging);
-      if (auto Err = parseCGSCCPassPipeline(CGPM, InnerPipeline, VerifyEachPass,
-                                            DebugLogging))
-        return Err;
+      if (!parseCGSCCPassPipeline(CGPM, InnerPipeline, VerifyEachPass,
+                                  DebugLogging))
+        return false;
       MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM)));
-      return Error::success();
+      return true;
     }
     if (Name == "function") {
       FunctionPassManager FPM(DebugLogging);
-      if (auto Err = parseFunctionPassPipeline(FPM, InnerPipeline,
-                                               VerifyEachPass, DebugLogging))
-        return Err;
+      if (!parseFunctionPassPipeline(FPM, InnerPipeline, VerifyEachPass,
+                                     DebugLogging))
+        return false;
       MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
-      return Error::success();
+      return true;
     }
     if (auto Count = parseRepeatPassName(Name)) {
       ModulePassManager NestedMPM(DebugLogging);
-      if (auto Err = parseModulePassPipeline(NestedMPM, InnerPipeline,
-                                             VerifyEachPass, DebugLogging))
-        return Err;
+      if (!parseModulePassPipeline(NestedMPM, InnerPipeline, VerifyEachPass,
+                                   DebugLogging))
+        return false;
       MPM.addPass(createRepeatedPass(*Count, std::move(NestedMPM)));
-      return Error::success();
+      return true;
     }
 
     for (auto &C : ModulePipelineParsingCallbacks)
       if (C(Name, MPM, InnerPipeline))
-        return Error::success();
+        return true;
 
     // Normal passes can't have pipelines.
-    return make_error<StringError>(
-        formatv("invalid use of '{0}' pass as module pipeline", Name).str(),
-        inconvertibleErrorCode());
-    ;
+    return false;
   }
 
   // Manually handle aliases for pre-configured pipeline fragments.
   if (startsWithDefaultPipelineAliasPrefix(Name)) {
     SmallVector<StringRef, 3> Matches;
     if (!DefaultAliasRegex.match(Name, &Matches))
-      return make_error<StringError>(
-          formatv("unknown default pipeline alias '{0}'", Name).str(),
-          inconvertibleErrorCode());
-
+      return false;
     assert(Matches.size() == 3 && "Must capture two matched strings!");
 
     OptimizationLevel L = StringSwitch<OptimizationLevel>(Matches[2])
@@ -1474,7 +1467,7 @@ Error PassBuilder::parseModulePass(ModulePassManager &MPM,
                               .Case("Oz", Oz);
     if (L == O0)
       // At O0 we do nothing at all!
-      return Error::success();
+      return true;
 
     if (Matches[1] == "default") {
       MPM.addPass(buildPerModuleDefaultPipeline(L, DebugLogging));
@@ -1488,40 +1481,38 @@ Error PassBuilder::parseModulePass(ModulePassManager &MPM,
       assert(Matches[1] == "lto" && "Not one of the matched options!");
       MPM.addPass(buildLTODefaultPipeline(L, DebugLogging, nullptr));
     }
-    return Error::success();
+    return true;
   }
 
   // Finally expand the basic registered passes from the .inc file.
 #define MODULE_PASS(NAME, CREATE_PASS)                                         \
   if (Name == NAME) {                                                          \
     MPM.addPass(CREATE_PASS);                                                  \
-    return Error::success();                                                   \
+    return true;                                                               \
   }
 #define MODULE_ANALYSIS(NAME, CREATE_PASS)                                     \
   if (Name == "require<" NAME ">") {                                           \
     MPM.addPass(                                                               \
         RequireAnalysisPass<                                                   \
             std::remove_reference<decltype(CREATE_PASS)>::type, Module>());    \
-    return Error::success();                                                   \
+    return true;                                                               \
   }                                                                            \
   if (Name == "invalidate<" NAME ">") {                                        \
     MPM.addPass(InvalidateAnalysisPass<                                        \
                 std::remove_reference<decltype(CREATE_PASS)>::type>());        \
-    return Error::success();                                                   \
+    return true;                                                               \
   }
 #include "PassRegistry.def"
 
   for (auto &C : ModulePipelineParsingCallbacks)
     if (C(Name, MPM, InnerPipeline))
-      return Error::success();
-  return make_error<StringError>(
-      formatv("unknown module pass '{0}'", Name).str(),
-      inconvertibleErrorCode());
+      return true;
+  return false;
 }
 
-Error PassBuilder::parseCGSCCPass(CGSCCPassManager &CGPM,
-                                  const PipelineElement &E, bool VerifyEachPass,
-                                  bool DebugLogging) {
+bool PassBuilder::parseCGSCCPass(CGSCCPassManager &CGPM,
+                                 const PipelineElement &E, bool VerifyEachPass,
+                                 bool DebugLogging) {
   auto &Name = E.Name;
   auto &InnerPipeline = E.InnerPipeline;
 
@@ -1529,55 +1520,53 @@ Error PassBuilder::parseCGSCCPass(CGSCCPassManager &CGPM,
   if (!InnerPipeline.empty()) {
     if (Name == "cgscc") {
       CGSCCPassManager NestedCGPM(DebugLogging);
-      if (auto Err = parseCGSCCPassPipeline(NestedCGPM, InnerPipeline,
-                                            VerifyEachPass, DebugLogging))
-        return Err;
+      if (!parseCGSCCPassPipeline(NestedCGPM, InnerPipeline, VerifyEachPass,
+                                  DebugLogging))
+        return false;
       // Add the nested pass manager with the appropriate adaptor.
       CGPM.addPass(std::move(NestedCGPM));
-      return Error::success();
+      return true;
     }
     if (Name == "function") {
       FunctionPassManager FPM(DebugLogging);
-      if (auto Err = parseFunctionPassPipeline(FPM, InnerPipeline,
-                                               VerifyEachPass, DebugLogging))
-        return Err;
+      if (!parseFunctionPassPipeline(FPM, InnerPipeline, VerifyEachPass,
+                                     DebugLogging))
+        return false;
       // Add the nested pass manager with the appropriate adaptor.
       CGPM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
-      return Error::success();
+      return true;
     }
     if (auto Count = parseRepeatPassName(Name)) {
       CGSCCPassManager NestedCGPM(DebugLogging);
-      if (auto Err = parseCGSCCPassPipeline(NestedCGPM, InnerPipeline,
-                                            VerifyEachPass, DebugLogging))
-        return Err;
+      if (!parseCGSCCPassPipeline(NestedCGPM, InnerPipeline, VerifyEachPass,
+                                  DebugLogging))
+        return false;
       CGPM.addPass(createRepeatedPass(*Count, std::move(NestedCGPM)));
-      return Error::success();
+      return true;
     }
     if (auto MaxRepetitions = parseDevirtPassName(Name)) {
       CGSCCPassManager NestedCGPM(DebugLogging);
-      if (auto Err = parseCGSCCPassPipeline(NestedCGPM, InnerPipeline,
-                                            VerifyEachPass, DebugLogging))
-        return Err;
+      if (!parseCGSCCPassPipeline(NestedCGPM, InnerPipeline, VerifyEachPass,
+                                  DebugLogging))
+        return false;
       CGPM.addPass(
           createDevirtSCCRepeatedPass(std::move(NestedCGPM), *MaxRepetitions));
-      return Error::success();
+      return true;
     }
 
     for (auto &C : CGSCCPipelineParsingCallbacks)
       if (C(Name, CGPM, InnerPipeline))
-        return Error::success();
+        return true;
 
     // Normal passes can't have pipelines.
-    return make_error<StringError>(
-        formatv("invalid use of '{0}' pass as cgscc pipeline", Name).str(),
-        inconvertibleErrorCode());
+    return false;
   }
 
 // Now expand the basic registered passes from the .inc file.
 #define CGSCC_PASS(NAME, CREATE_PASS)                                          \
   if (Name == NAME) {                                                          \
     CGPM.addPass(CREATE_PASS);                                                 \
-    return Error::success();                                                   \
+    return true;                                                               \
   }
 #define CGSCC_ANALYSIS(NAME, CREATE_PASS)                                      \
   if (Name == "require<" NAME ">") {                                           \
@@ -1585,26 +1574,24 @@ Error PassBuilder::parseCGSCCPass(CGSCCPassManager &CGPM,
                  std::remove_reference<decltype(CREATE_PASS)>::type,           \
                  LazyCallGraph::SCC, CGSCCAnalysisManager, LazyCallGraph &,    \
                  CGSCCUpdateResult &>());                                      \
-    return Error::success();                                                   \
+    return true;                                                               \
   }                                                                            \
   if (Name == "invalidate<" NAME ">") {                                        \
     CGPM.addPass(InvalidateAnalysisPass<                                       \
                  std::remove_reference<decltype(CREATE_PASS)>::type>());       \
-    return Error::success();                                                   \
+    return true;                                                               \
   }
 #include "PassRegistry.def"
 
   for (auto &C : CGSCCPipelineParsingCallbacks)
     if (C(Name, CGPM, InnerPipeline))
-      return Error::success();
-  return make_error<StringError>(
-      formatv("unknown cgscc pass '{0}'", Name).str(),
-      inconvertibleErrorCode());
+      return true;
+  return false;
 }
 
-Error PassBuilder::parseFunctionPass(FunctionPassManager &FPM,
-                                     const PipelineElement &E,
-                                     bool VerifyEachPass, bool DebugLogging) {
+bool PassBuilder::parseFunctionPass(FunctionPassManager &FPM,
+                                    const PipelineElement &E,
+                                    bool VerifyEachPass, bool DebugLogging) {
   auto &Name = E.Name;
   auto &InnerPipeline = E.InnerPipeline;
 
@@ -1612,72 +1599,68 @@ Error PassBuilder::parseFunctionPass(FunctionPassManager &FPM,
   if (!InnerPipeline.empty()) {
     if (Name == "function") {
       FunctionPassManager NestedFPM(DebugLogging);
-      if (auto Err = parseFunctionPassPipeline(NestedFPM, InnerPipeline,
-                                               VerifyEachPass, DebugLogging))
-        return Err;
+      if (!parseFunctionPassPipeline(NestedFPM, InnerPipeline, VerifyEachPass,
+                                     DebugLogging))
+        return false;
       // Add the nested pass manager with the appropriate adaptor.
       FPM.addPass(std::move(NestedFPM));
-      return Error::success();
+      return true;
     }
     if (Name == "loop") {
       LoopPassManager LPM(DebugLogging);
-      if (auto Err = parseLoopPassPipeline(LPM, InnerPipeline, VerifyEachPass,
-                                           DebugLogging))
-        return Err;
+      if (!parseLoopPassPipeline(LPM, InnerPipeline, VerifyEachPass,
+                                 DebugLogging))
+        return false;
       // Add the nested pass manager with the appropriate adaptor.
       FPM.addPass(
           createFunctionToLoopPassAdaptor(std::move(LPM), DebugLogging));
-      return Error::success();
+      return true;
     }
     if (auto Count = parseRepeatPassName(Name)) {
       FunctionPassManager NestedFPM(DebugLogging);
-      if (auto Err = parseFunctionPassPipeline(NestedFPM, InnerPipeline,
-                                               VerifyEachPass, DebugLogging))
-        return Err;
+      if (!parseFunctionPassPipeline(NestedFPM, InnerPipeline, VerifyEachPass,
+                                     DebugLogging))
+        return false;
       FPM.addPass(createRepeatedPass(*Count, std::move(NestedFPM)));
-      return Error::success();
+      return true;
     }
 
     for (auto &C : FunctionPipelineParsingCallbacks)
       if (C(Name, FPM, InnerPipeline))
-        return Error::success();
+        return true;
 
     // Normal passes can't have pipelines.
-    return make_error<StringError>(
-        formatv("invalid use of '{0}' pass as function pipeline", Name).str(),
-        inconvertibleErrorCode());
+    return false;
   }
 
 // Now expand the basic registered passes from the .inc file.
 #define FUNCTION_PASS(NAME, CREATE_PASS)                                       \
   if (Name == NAME) {                                                          \
     FPM.addPass(CREATE_PASS);                                                  \
-    return Error::success();                                                   \
+    return true;                                                               \
   }
 #define FUNCTION_ANALYSIS(NAME, CREATE_PASS)                                   \
   if (Name == "require<" NAME ">") {                                           \
     FPM.addPass(                                                               \
         RequireAnalysisPass<                                                   \
             std::remove_reference<decltype(CREATE_PASS)>::type, Function>());  \
-    return Error::success();                                                   \
+    return true;                                                               \
   }                                                                            \
   if (Name == "invalidate<" NAME ">") {                                        \
     FPM.addPass(InvalidateAnalysisPass<                                        \
                 std::remove_reference<decltype(CREATE_PASS)>::type>());        \
-    return Error::success();                                                   \
+    return true;                                                               \
   }
 #include "PassRegistry.def"
 
   for (auto &C : FunctionPipelineParsingCallbacks)
     if (C(Name, FPM, InnerPipeline))
-      return Error::success();
-  return make_error<StringError>(
-      formatv("unknown function pass '{0}'", Name).str(),
-      inconvertibleErrorCode());
+      return true;
+  return false;
 }
 
-Error PassBuilder::parseLoopPass(LoopPassManager &LPM, const PipelineElement &E,
-                                 bool VerifyEachPass, bool DebugLogging) {
+bool PassBuilder::parseLoopPass(LoopPassManager &LPM, const PipelineElement &E,
+                                bool VerifyEachPass, bool DebugLogging) {
   StringRef Name = E.Name;
   auto &InnerPipeline = E.InnerPipeline;
 
@@ -1685,37 +1668,35 @@ Error PassBuilder::parseLoopPass(LoopPassManager &LPM, const PipelineElement &E,
   if (!InnerPipeline.empty()) {
     if (Name == "loop") {
       LoopPassManager NestedLPM(DebugLogging);
-      if (auto Err = parseLoopPassPipeline(NestedLPM, InnerPipeline,
-                                           VerifyEachPass, DebugLogging))
-        return Err;
+      if (!parseLoopPassPipeline(NestedLPM, InnerPipeline, VerifyEachPass,
+                                 DebugLogging))
+        return false;
       // Add the nested pass manager with the appropriate adaptor.
       LPM.addPass(std::move(NestedLPM));
-      return Error::success();
+      return true;
     }
     if (auto Count = parseRepeatPassName(Name)) {
       LoopPassManager NestedLPM(DebugLogging);
-      if (auto Err = parseLoopPassPipeline(NestedLPM, InnerPipeline,
-                                           VerifyEachPass, DebugLogging))
-        return Err;
+      if (!parseLoopPassPipeline(NestedLPM, InnerPipeline, VerifyEachPass,
+                                 DebugLogging))
+        return false;
       LPM.addPass(createRepeatedPass(*Count, std::move(NestedLPM)));
-      return Error::success();
+      return true;
     }
 
     for (auto &C : LoopPipelineParsingCallbacks)
       if (C(Name, LPM, InnerPipeline))
-        return Error::success();
+        return true;
 
     // Normal passes can't have pipelines.
-    return make_error<StringError>(
-        formatv("invalid use of '{0}' pass as loop pipeline", Name).str(),
-        inconvertibleErrorCode());
+    return false;
   }
 
 // Now expand the basic registered passes from the .inc file.
 #define LOOP_PASS(NAME, CREATE_PASS)                                           \
   if (Name == NAME) {                                                          \
     LPM.addPass(CREATE_PASS);                                                  \
-    return Error::success();                                                   \
+    return true;                                                               \
   }
 #define LOOP_ANALYSIS(NAME, CREATE_PASS)                                       \
   if (Name == "require<" NAME ">") {                                           \
@@ -1723,20 +1704,19 @@ Error PassBuilder::parseLoopPass(LoopPassManager &LPM, const PipelineElement &E,
                 std::remove_reference<decltype(CREATE_PASS)>::type, Loop,      \
                 LoopAnalysisManager, LoopStandardAnalysisResults &,            \
                 LPMUpdater &>());                                              \
-    return Error::success();                                                   \
+    return true;                                                               \
   }                                                                            \
   if (Name == "invalidate<" NAME ">") {                                        \
     LPM.addPass(InvalidateAnalysisPass<                                        \
                 std::remove_reference<decltype(CREATE_PASS)>::type>());        \
-    return Error::success();                                                   \
+    return true;                                                               \
   }
 #include "PassRegistry.def"
 
   for (auto &C : LoopPipelineParsingCallbacks)
     if (C(Name, LPM, InnerPipeline))
-      return Error::success();
-  return make_error<StringError>(formatv("unknown loop pass '{0}'", Name).str(),
-                                 inconvertibleErrorCode());
+      return true;
+  return false;
 }
 
 bool PassBuilder::parseAAPassName(AAManager &AA, StringRef Name) {
@@ -1760,42 +1740,41 @@ bool PassBuilder::parseAAPassName(AAManager &AA, StringRef Name) {
   return false;
 }
 
-Error PassBuilder::parseLoopPassPipeline(LoopPassManager &LPM,
-                                         ArrayRef<PipelineElement> Pipeline,
-                                         bool VerifyEachPass,
-                                         bool DebugLogging) {
+bool PassBuilder::parseLoopPassPipeline(LoopPassManager &LPM,
+                                        ArrayRef<PipelineElement> Pipeline,
+                                        bool VerifyEachPass,
+                                        bool DebugLogging) {
   for (const auto &Element : Pipeline) {
-    if (auto Err = parseLoopPass(LPM, Element, VerifyEachPass, DebugLogging))
-      return Err;
+    if (!parseLoopPass(LPM, Element, VerifyEachPass, DebugLogging))
+      return false;
     // FIXME: No verifier support for Loop passes!
   }
-  return Error::success();
+  return true;
 }
 
-Error PassBuilder::parseFunctionPassPipeline(FunctionPassManager &FPM,
-                                             ArrayRef<PipelineElement> Pipeline,
-                                             bool VerifyEachPass,
-                                             bool DebugLogging) {
+bool PassBuilder::parseFunctionPassPipeline(FunctionPassManager &FPM,
+                                            ArrayRef<PipelineElement> Pipeline,
+                                            bool VerifyEachPass,
+                                            bool DebugLogging) {
   for (const auto &Element : Pipeline) {
-    if (auto Err =
-            parseFunctionPass(FPM, Element, VerifyEachPass, DebugLogging))
-      return Err;
+    if (!parseFunctionPass(FPM, Element, VerifyEachPass, DebugLogging))
+      return false;
     if (VerifyEachPass)
       FPM.addPass(VerifierPass());
   }
-  return Error::success();
+  return true;
 }
 
-Error PassBuilder::parseCGSCCPassPipeline(CGSCCPassManager &CGPM,
-                                          ArrayRef<PipelineElement> Pipeline,
-                                          bool VerifyEachPass,
-                                          bool DebugLogging) {
+bool PassBuilder::parseCGSCCPassPipeline(CGSCCPassManager &CGPM,
+                                         ArrayRef<PipelineElement> Pipeline,
+                                         bool VerifyEachPass,
+                                         bool DebugLogging) {
   for (const auto &Element : Pipeline) {
-    if (auto Err = parseCGSCCPass(CGPM, Element, VerifyEachPass, DebugLogging))
-      return Err;
+    if (!parseCGSCCPass(CGPM, Element, VerifyEachPass, DebugLogging))
+      return false;
     // FIXME: No verifier support for CGSCC passes!
   }
-  return Error::success();
+  return true;
 }
 
 void PassBuilder::crossRegisterProxies(LoopAnalysisManager &LAM,
@@ -1811,30 +1790,28 @@ void PassBuilder::crossRegisterProxies(LoopAnalysisManager &LAM,
   LAM.registerPass([&] { return FunctionAnalysisManagerLoopProxy(FAM); });
 }
 
-Error PassBuilder::parseModulePassPipeline(ModulePassManager &MPM,
-                                           ArrayRef<PipelineElement> Pipeline,
-                                           bool VerifyEachPass,
-                                           bool DebugLogging) {
+bool PassBuilder::parseModulePassPipeline(ModulePassManager &MPM,
+                                          ArrayRef<PipelineElement> Pipeline,
+                                          bool VerifyEachPass,
+                                          bool DebugLogging) {
   for (const auto &Element : Pipeline) {
-    if (auto Err = parseModulePass(MPM, Element, VerifyEachPass, DebugLogging))
-      return Err;
+    if (!parseModulePass(MPM, Element, VerifyEachPass, DebugLogging))
+      return false;
     if (VerifyEachPass)
       MPM.addPass(VerifierPass());
   }
-  return Error::success();
+  return true;
 }
 
 // Primary pass pipeline description parsing routine for a \c ModulePassManager
 // FIXME: Should this routine accept a TargetMachine or require the caller to
 // pre-populate the analysis managers with target-specific stuff?
-Error PassBuilder::parsePassPipeline(ModulePassManager &MPM,
-                                     StringRef PipelineText,
-                                     bool VerifyEachPass, bool DebugLogging) {
+bool PassBuilder::parsePassPipeline(ModulePassManager &MPM,
+                                    StringRef PipelineText, bool VerifyEachPass,
+                                    bool DebugLogging) {
   auto Pipeline = parsePipelineText(PipelineText);
   if (!Pipeline || Pipeline->empty())
-    return make_error<StringError>(
-        formatv("invalid pipeline '{0}'", PipelineText).str(),
-        inconvertibleErrorCode());
+    return false;
 
   // If the first name isn't at the module layer, wrap the pipeline up
   // automatically.
@@ -1851,106 +1828,73 @@ Error PassBuilder::parsePassPipeline(ModulePassManager &MPM,
     } else {
       for (auto &C : TopLevelPipelineParsingCallbacks)
         if (C(MPM, *Pipeline, VerifyEachPass, DebugLogging))
-          return Error::success();
-
-      // Unknown pass or pipeline name!
-      auto &InnerPipeline = Pipeline->front().InnerPipeline;
-      return make_error<StringError>(
-          formatv("unknown {0} name '{1}'",
-                  (InnerPipeline.empty() ? "pass" : "pipeline"), FirstName)
-              .str(),
-          inconvertibleErrorCode());
+          return true;
+
+      // Unknown pass name!
+      return false;
     }
   }
 
-  if (auto Err =
-          parseModulePassPipeline(MPM, *Pipeline, VerifyEachPass, DebugLogging))
-    return Err;
-  return Error::success();
+  return parseModulePassPipeline(MPM, *Pipeline, VerifyEachPass, DebugLogging);
 }
 
 // Primary pass pipeline description parsing routine for a \c CGSCCPassManager
-Error PassBuilder::parsePassPipeline(CGSCCPassManager &CGPM,
-                                     StringRef PipelineText,
-                                     bool VerifyEachPass, bool DebugLogging) {
+bool PassBuilder::parsePassPipeline(CGSCCPassManager &CGPM,
+                                    StringRef PipelineText, bool VerifyEachPass,
+                                    bool DebugLogging) {
   auto Pipeline = parsePipelineText(PipelineText);
   if (!Pipeline || Pipeline->empty())
-    return make_error<StringError>(
-        formatv("invalid pipeline '{0}'", PipelineText).str(),
-        inconvertibleErrorCode());
+    return false;
 
   StringRef FirstName = Pipeline->front().Name;
   if (!isCGSCCPassName(FirstName, CGSCCPipelineParsingCallbacks))
-    return make_error<StringError>(
-        formatv("unknown cgscc pass '{0}' in pipeline '{1}'", FirstName,
-                PipelineText)
-            .str(),
-        inconvertibleErrorCode());
-
-  if (auto Err =
-          parseCGSCCPassPipeline(CGPM, *Pipeline, VerifyEachPass, DebugLogging))
-    return Err;
-  return Error::success();
+    return false;
+
+  return parseCGSCCPassPipeline(CGPM, *Pipeline, VerifyEachPass, DebugLogging);
 }
 
 // Primary pass pipeline description parsing routine for a \c
 // FunctionPassManager
-Error PassBuilder::parsePassPipeline(FunctionPassManager &FPM,
-                                     StringRef PipelineText,
-                                     bool VerifyEachPass, bool DebugLogging) {
+bool PassBuilder::parsePassPipeline(FunctionPassManager &FPM,
+                                    StringRef PipelineText, bool VerifyEachPass,
+                                    bool DebugLogging) {
   auto Pipeline = parsePipelineText(PipelineText);
   if (!Pipeline || Pipeline->empty())
-    return make_error<StringError>(
-        formatv("invalid pipeline '{0}'", PipelineText).str(),
-        inconvertibleErrorCode());
+    return false;
 
   StringRef FirstName = Pipeline->front().Name;
   if (!isFunctionPassName(FirstName, FunctionPipelineParsingCallbacks))
-    return make_error<StringError>(
-        formatv("unknown function pass '{0}' in pipeline '{1}'", FirstName,
-                PipelineText)
-            .str(),
-        inconvertibleErrorCode());
-
-  if (auto Err = parseFunctionPassPipeline(FPM, *Pipeline, VerifyEachPass,
-                                           DebugLogging))
-    return Err;
-  return Error::success();
+    return false;
+
+  return parseFunctionPassPipeline(FPM, *Pipeline, VerifyEachPass,
+                                   DebugLogging);
 }
 
 // Primary pass pipeline description parsing routine for a \c LoopPassManager
-Error PassBuilder::parsePassPipeline(LoopPassManager &CGPM,
-                                     StringRef PipelineText,
-                                     bool VerifyEachPass, bool DebugLogging) {
+bool PassBuilder::parsePassPipeline(LoopPassManager &CGPM,
+                                    StringRef PipelineText, bool VerifyEachPass,
+                                    bool DebugLogging) {
   auto Pipeline = parsePipelineText(PipelineText);
   if (!Pipeline || Pipeline->empty())
-    return make_error<StringError>(
-        formatv("invalid pipeline '{0}'", PipelineText).str(),
-        inconvertibleErrorCode());
-
-  if (auto Err =
-          parseLoopPassPipeline(CGPM, *Pipeline, VerifyEachPass, DebugLogging))
-    return Err;
+    return false;
 
-  return Error::success();
+  return parseLoopPassPipeline(CGPM, *Pipeline, VerifyEachPass, DebugLogging);
 }
 
-Error PassBuilder::parseAAPipeline(AAManager &AA, StringRef PipelineText) {
+bool PassBuilder::parseAAPipeline(AAManager &AA, StringRef PipelineText) {
   // If the pipeline just consists of the word 'default' just replace the AA
   // manager with our default one.
   if (PipelineText == "default") {
     AA = buildDefaultAAPipeline();
-    return Error::success();
+    return true;
   }
 
   while (!PipelineText.empty()) {
     StringRef Name;
     std::tie(Name, PipelineText) = PipelineText.split(',');
     if (!parseAAPassName(AA, Name))
-      return make_error<StringError>(
-          formatv("unknown alias analysis name '{0}'", Name).str(),
-          inconvertibleErrorCode());
+      return false;
   }
 
-  return Error::success();
+  return true;
 }
diff --git a/test/Other/pass-pipeline-parsing.ll b/test/Other/pass-pipeline-parsing.ll
index d13a977dbce..b303318c796 100644
--- a/test/Other/pass-pipeline-parsing.ll
+++ b/test/Other/pass-pipeline-parsing.ll
@@ -54,52 +54,52 @@
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='no-op-module)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-UNBALANCED1
-; CHECK-UNBALANCED1: invalid pipeline 'no-op-module)'
+; CHECK-UNBALANCED1: unable to parse pass pipeline description
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='module(no-op-module))' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-UNBALANCED2
-; CHECK-UNBALANCED2: invalid pipeline 'module(no-op-module))'
+; CHECK-UNBALANCED2: unable to parse pass pipeline description
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='module(no-op-module' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-UNBALANCED3
-; CHECK-UNBALANCED3: invalid pipeline 'module(no-op-module'
+; CHECK-UNBALANCED3: unable to parse pass pipeline description
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='no-op-function)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-UNBALANCED4
-; CHECK-UNBALANCED4: invalid pipeline 'no-op-function)'
+; CHECK-UNBALANCED4: unable to parse pass pipeline description
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='function(no-op-function))' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-UNBALANCED5
-; CHECK-UNBALANCED5: invalid pipeline 'function(no-op-function))'
+; CHECK-UNBALANCED5: unable to parse pass pipeline description
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='function(function(no-op-function)))' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-UNBALANCED6
-; CHECK-UNBALANCED6: invalid pipeline 'function(function(no-op-function)))'
+; CHECK-UNBALANCED6: unable to parse pass pipeline description
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='function(no-op-function' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-UNBALANCED7
-; CHECK-UNBALANCED7: invalid pipeline 'function(no-op-function'
+; CHECK-UNBALANCED7: unable to parse pass pipeline description
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='function(function(no-op-function)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-UNBALANCED8
-; CHECK-UNBALANCED8: invalid pipeline 'function(function(no-op-function)'
+; CHECK-UNBALANCED8: unable to parse pass pipeline description
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='no-op-module,)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-UNBALANCED9
-; CHECK-UNBALANCED9: invalid pipeline 'no-op-module,)'
+; CHECK-UNBALANCED9: unable to parse pass pipeline description
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='no-op-function,)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-UNBALANCED10
-; CHECK-UNBALANCED10: invalid pipeline 'no-op-function,)'
+; CHECK-UNBALANCED10: unable to parse pass pipeline description
 
 ; RUN: opt -disable-output -debug-pass-manager \
 ; RUN:     -passes=no-op-cgscc,no-op-cgscc %s 2>&1 \
@@ -176,86 +176,37 @@
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='function(no-op-function)function(no-op-function)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-MISSING-COMMA1
-; CHECK-MISSING-COMMA1: invalid pipeline 'function(no-op-function)function(no-op-function)'
+; CHECK-MISSING-COMMA1: unable to parse pass pipeline description
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='function()' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-EMPTY-INNER-PIPELINE
-; CHECK-EMPTY-INNER-PIPELINE: unknown function pass ''
+; CHECK-EMPTY-INNER-PIPELINE: unable to parse pass pipeline description
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='no-op-module(no-op-module,whatever)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-PIPELINE-ON-MODULE-PASS
-; CHECK-PIPELINE-ON-MODULE-PASS: invalid use of 'no-op-module' pass as module pipeline
+; CHECK-PIPELINE-ON-MODULE-PASS: unable to parse pass pipeline description
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='no-op-cgscc(no-op-cgscc,whatever)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-PIPELINE-ON-CGSCC-PASS
-; CHECK-PIPELINE-ON-CGSCC-PASS: invalid use of 'no-op-cgscc' pass as cgscc pipeline
+; CHECK-PIPELINE-ON-CGSCC-PASS: unable to parse pass pipeline description
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='no-op-function(no-op-function,whatever)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-PIPELINE-ON-FUNCTION-PASS
-; CHECK-PIPELINE-ON-FUNCTION-PASS: invalid use of 'no-op-function' pass as function pipeline
+; CHECK-PIPELINE-ON-FUNCTION-PASS: unable to parse pass pipeline description
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='no-op-loop(no-op-loop,whatever)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-PIPELINE-ON-LOOP-PASS
-; CHECK-PIPELINE-ON-LOOP-PASS: invalid use of 'no-op-loop' pass as loop pipeline
+; CHECK-PIPELINE-ON-LOOP-PASS: unable to parse pass pipeline description
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='no-op-function()' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-EMPTY-PIPELINE-ON-PASS
-; CHECK-EMPTY-PIPELINE-ON-PASS: invalid use of 'no-op-function' pass as function pipeline
-
-; RUN: not opt -passes='no-op-module,bad' \
-; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-MODULE
-; CHECK-UNKNOWN-MODULE: opt: unknown module pass 'bad'
-
-; RUN: not opt -passes='no-op-loop,bad' \
-; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-LOOP
-; CHECK-UNKNOWN-LOOP: opt: unknown loop pass 'bad'
-
-; RUN: not opt -passes='no-op-cgscc,bad' \
-; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-CGSCC
-; CHECK-UNKNOWN-CGSCC: opt: unknown cgscc pass 'bad'
-
-; RUN: not opt -passes='no-op-function,bad' \
-; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-FUNCTION
-; RUN: not opt -passes='function(bad,pipeline,text)' \
-; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-FUNCTION
-; RUN: not opt -passes='module(no-op-module,function(bad,pipeline,text))' \
-; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-FUNCTION
-; RUN: not opt -passes='no-op-module,function(bad,pipeline,text)' \
-; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-FUNCTION
-; RUN: not opt -passes='module(cgscc(function(bad,pipeline,text)))' \
-; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-FUNCTION
-; CHECK-UNKNOWN-FUNCTION: opt: unknown function pass 'bad'
-
-; RUN: not opt -aa-pipeline=bad -passes=no-op-function \
-; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=AA-PIPELINE-ERR
-; AA-PIPELINE-ERR: unknown alias analysis name 'bad'
-; RUN: opt -passes-ep-peephole=bad -passes=no-op-function \
-; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-PEEPHOLE-ERR
-; PASSES-EP-PEEPHOLE-ERR: Could not parse pipeline 'bad'. I'm going to ignore it.
-; RUN: opt -passes-ep-late-loop-optimizations=bad -passes=no-op-function \
-; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-LATELOOPOPT-ERR
-; PASSES-EP-LATELOOPOPT-ERR: Could not parse pipeline 'bad'. I'm going to ignore it.
-; RUN: opt -passes-ep-loop-optimizer-end=bad -passes=no-op-function \
-; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-LOOPOPTEND-ERR
-; PASSES-EP-LOOPOPTEND-ERR: Could not parse pipeline 'bad'. I'm going to ignore it.
-; RUN: opt -passes-ep-scalar-optimizer-late=bad -passes=no-op-function \
-; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-SCALAROPTLATE-ERR
-; PASSES-EP-SCALAROPTLATE-ERR: Could not parse pipeline 'bad'. I'm going to ignore it.
-; RUN: opt -passes-ep-cgscc-optimizer-late=bad -passes=no-op-function \
-; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-CGSCCOPTLATE-ERR
-; PASSES-EP-CGSCCOPTLATE-ERR: Could not parse pipeline 'bad'. I'm going to ignore it.
-; RUN: opt -passes-ep-vectorizer-start=bad -passes=no-op-function \
-; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-VECTORIZERSTART-ERR
-; PASSES-EP-VECTORIZERSTART-ERR: Could not parse pipeline 'bad'. I'm going to ignore it.
-; RUN: opt -passes-ep-pipeline-start=bad -passes=no-op-function \
-; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-PIPELINESTART-ERR
-; PASSES-EP-PIPELINESTART-ERR: Could not parse pipeline 'bad'. I'm going to ignore it.
+; CHECK-EMPTY-PIPELINE-ON-PASS: unable to parse pass pipeline description
 
 define void @f() {
 entry:
diff --git a/test/tools/llvm-lto2/X86/pipeline.ll b/test/tools/llvm-lto2/X86/pipeline.ll
index 9ab81ac70a7..29276d8d13a 100644
--- a/test/tools/llvm-lto2/X86/pipeline.ll
+++ b/test/tools/llvm-lto2/X86/pipeline.ll
@@ -32,11 +32,11 @@ define void @patatino() {
 ; RUN:  -r %t1.bc,patatino,px -opt-pipeline foogoo 2>&1 | \
 ; RUN:  FileCheck %s --check-prefix=ERR
 
-; ERR: LLVM ERROR: unable to parse pass pipeline description 'foogoo': unknown pass name 'foogoo'
+; ERR: LLVM ERROR: unable to parse pass pipeline description: foogoo
 
 ; RUN: not llvm-lto2 run %t1.bc -o %t.o \
 ; RUN:  -r %t1.bc,patatino,px -aa-pipeline patatino \
 ; RUN:  -opt-pipeline loweratomic 2>&1 | \
 ; RUN:  FileCheck %s --check-prefix=AAERR
 
-; AAERR: LLVM ERROR: unable to parse AA pipeline description 'patatino': unknown alias analysis name 'patatino'
+; AAERR: LLVM ERROR: unable to parse AA pipeline description: patatino
diff --git a/test/tools/llvm-opt-fuzzer/command-line.ll b/test/tools/llvm-opt-fuzzer/command-line.ll
index 8c3f6b60154..f747bba431b 100644
--- a/test/tools/llvm-opt-fuzzer/command-line.ll
+++ b/test/tools/llvm-opt-fuzzer/command-line.ll
@@ -13,7 +13,7 @@
 
 ; Don't start with incorrect passes specified
 ; RUN: not llvm-opt-fuzzer %t -ignore_remaining_args=1 -mtriple x86_64 -passes no-pass 2>&1 | FileCheck -check-prefix=PIPELINE %s
-; PIPELINE: unknown pass name 'no-pass'
+; PIPELINE: can't parse pass pipeline
 
 ; Correct command line
 ; RUN: llvm-opt-fuzzer %t -ignore_remaining_args=1 -mtriple x86_64 -passes instcombine 2>&1 | FileCheck -check-prefix=CORRECT %s
diff --git a/tools/llvm-opt-fuzzer/llvm-opt-fuzzer.cpp b/tools/llvm-opt-fuzzer/llvm-opt-fuzzer.cpp
index 57e75b1db9e..98d5428ddd1 100644
--- a/tools/llvm-opt-fuzzer/llvm-opt-fuzzer.cpp
+++ b/tools/llvm-opt-fuzzer/llvm-opt-fuzzer.cpp
@@ -144,10 +144,9 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
   PB.registerLoopAnalyses(LAM);
   PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
 
-  auto Err = PB.parsePassPipeline(MPM, PassPipeline, false, false);
-  assert(!Err && "Should have been checked during fuzzer initialization");
-  // Only fail with assert above, otherwise ignore the parsing error.
-  consumeError(std::move(Err));
+  bool Ok = PB.parsePassPipeline(MPM, PassPipeline, false, false);
+  assert(Ok && "Should have been checked during fuzzer initialization");
+  (void)Ok; // silence unused variable warning on release builds
 
   // Run passes which we need to test
   //
@@ -236,8 +235,8 @@ extern "C" LLVM_ATTRIBUTE_USED int LLVMFuzzerInitialize(
 
   PassBuilder PB(TM.get());
   ModulePassManager MPM;
-  if (auto Err = PB.parsePassPipeline(MPM, PassPipeline, false, false)) {
-    errs() << *argv[0] << ": " << toString(std::move(Err)) << "\n";
+  if (!PB.parsePassPipeline(MPM, PassPipeline, false, false)) {
+    errs() << *argv[0] << ": can't parse pass pipeline\n";
     exit(1);
   }
 
diff --git a/tools/opt/NewPMDriver.cpp b/tools/opt/NewPMDriver.cpp
index 11879d26a6c..e63547a79d0 100644
--- a/tools/opt/NewPMDriver.cpp
+++ b/tools/opt/NewPMDriver.cpp
@@ -124,12 +124,12 @@ bool tryParsePipelineText(PassBuilder &PB, StringRef PipelineText) {
 
   // Verify the pipeline is parseable:
   PassManagerT PM;
-  if (auto Err = PB.parsePassPipeline(PM, PipelineText)) {
-    errs() << "Could not parse pipeline '" << PipelineText
-           << "'. I'm going to ignore it.\n";
-    return false;
-  }
-  return true;
+  if (PB.parsePassPipeline(PM, PipelineText))
+    return true;
+
+  errs() << "Could not parse pipeline '" << PipelineText
+         << "'. I'm going to igore it.\n";
+  return false;
 }
 
 /// If one of the EPPipeline command line options was given, register callbacks
@@ -137,61 +137,50 @@ bool tryParsePipelineText(PassBuilder &PB, StringRef PipelineText) {
 static void registerEPCallbacks(PassBuilder &PB, bool VerifyEachPass,
                                 bool DebugLogging) {
   if (tryParsePipelineText<FunctionPassManager>(PB, PeepholeEPPipeline))
-    PB.registerPeepholeEPCallback(
-        [&PB, VerifyEachPass, DebugLogging](
-            FunctionPassManager &PM, PassBuilder::OptimizationLevel Level) {
-          ExitOnError Err("Unable to parse PeepholeEP pipeline: ");
-          Err(PB.parsePassPipeline(PM, PeepholeEPPipeline, VerifyEachPass,
-                                   DebugLogging));
-        });
+    PB.registerPeepholeEPCallback([&PB, VerifyEachPass, DebugLogging](
+        FunctionPassManager &PM, PassBuilder::OptimizationLevel Level) {
+      PB.parsePassPipeline(PM, PeepholeEPPipeline, VerifyEachPass,
+                           DebugLogging);
+    });
   if (tryParsePipelineText<LoopPassManager>(PB,
                                             LateLoopOptimizationsEPPipeline))
     PB.registerLateLoopOptimizationsEPCallback(
         [&PB, VerifyEachPass, DebugLogging](
             LoopPassManager &PM, PassBuilder::OptimizationLevel Level) {
-          ExitOnError Err("Unable to parse LateLoopOptimizationsEP pipeline: ");
-          Err(PB.parsePassPipeline(PM, LateLoopOptimizationsEPPipeline,
-                                   VerifyEachPass, DebugLogging));
+          PB.parsePassPipeline(PM, LateLoopOptimizationsEPPipeline,
+                               VerifyEachPass, DebugLogging);
         });
   if (tryParsePipelineText<LoopPassManager>(PB, LoopOptimizerEndEPPipeline))
-    PB.registerLoopOptimizerEndEPCallback(
-        [&PB, VerifyEachPass, DebugLogging](
-            LoopPassManager &PM, PassBuilder::OptimizationLevel Level) {
-          ExitOnError Err("Unable to parse LoopOptimizerEndEP pipeline: ");
-          Err(PB.parsePassPipeline(PM, LoopOptimizerEndEPPipeline,
-                                   VerifyEachPass, DebugLogging));
-        });
+    PB.registerLoopOptimizerEndEPCallback([&PB, VerifyEachPass, DebugLogging](
+        LoopPassManager &PM, PassBuilder::OptimizationLevel Level) {
+      PB.parsePassPipeline(PM, LoopOptimizerEndEPPipeline, VerifyEachPass,
+                           DebugLogging);
+    });
   if (tryParsePipelineText<FunctionPassManager>(PB,
                                                 ScalarOptimizerLateEPPipeline))
     PB.registerScalarOptimizerLateEPCallback(
         [&PB, VerifyEachPass, DebugLogging](
             FunctionPassManager &PM, PassBuilder::OptimizationLevel Level) {
-          ExitOnError Err("Unable to parse ScalarOptimizerLateEP pipeline: ");
-          Err(PB.parsePassPipeline(PM, ScalarOptimizerLateEPPipeline,
-                                   VerifyEachPass, DebugLogging));
+          PB.parsePassPipeline(PM, ScalarOptimizerLateEPPipeline,
+                               VerifyEachPass, DebugLogging);
         });
   if (tryParsePipelineText<CGSCCPassManager>(PB, CGSCCOptimizerLateEPPipeline))
-    PB.registerCGSCCOptimizerLateEPCallback(
-        [&PB, VerifyEachPass, DebugLogging](
-            CGSCCPassManager &PM, PassBuilder::OptimizationLevel Level) {
-          ExitOnError Err("Unable to parse CGSCCOptimizerLateEP pipeline: ");
-          Err(PB.parsePassPipeline(PM, CGSCCOptimizerLateEPPipeline,
-                                   VerifyEachPass, DebugLogging));
-        });
+    PB.registerCGSCCOptimizerLateEPCallback([&PB, VerifyEachPass, DebugLogging](
+        CGSCCPassManager &PM, PassBuilder::OptimizationLevel Level) {
+      PB.parsePassPipeline(PM, CGSCCOptimizerLateEPPipeline, VerifyEachPass,
+                           DebugLogging);
+    });
   if (tryParsePipelineText<FunctionPassManager>(PB, VectorizerStartEPPipeline))
-    PB.registerVectorizerStartEPCallback(
-        [&PB, VerifyEachPass, DebugLogging](
-            FunctionPassManager &PM, PassBuilder::OptimizationLevel Level) {
-          ExitOnError Err("Unable to parse VectorizerStartEP pipeline: ");
-          Err(PB.parsePassPipeline(PM, VectorizerStartEPPipeline,
-                                   VerifyEachPass, DebugLogging));
-        });
+    PB.registerVectorizerStartEPCallback([&PB, VerifyEachPass, DebugLogging](
+        FunctionPassManager &PM, PassBuilder::OptimizationLevel Level) {
+      PB.parsePassPipeline(PM, VectorizerStartEPPipeline, VerifyEachPass,
+                           DebugLogging);
+    });
   if (tryParsePipelineText<ModulePassManager>(PB, PipelineStartEPPipeline))
     PB.registerPipelineStartEPCallback(
         [&PB, VerifyEachPass, DebugLogging](ModulePassManager &PM) {
-          ExitOnError Err("Unable to parse PipelineStartEP pipeline: ");
-          Err(PB.parsePassPipeline(PM, PipelineStartEPPipeline, VerifyEachPass,
-                                   DebugLogging));
+          PB.parsePassPipeline(PM, PipelineStartEPPipeline, VerifyEachPass,
+                               DebugLogging);
         });
 }
 
@@ -269,8 +258,8 @@ bool llvm::runPassPipeline(StringRef Arg0, Module &M, TargetMachine *TM,
   // Specially handle the alias analysis manager so that we can register
   // a custom pipeline of AA passes with it.
   AAManager AA;
-  if (auto Err = PB.parseAAPipeline(AA, AAPipeline)) {
-    errs() << Arg0 << ": " << toString(std::move(Err)) << "\n";
+  if (!PB.parseAAPipeline(AA, AAPipeline)) {
+    errs() << Arg0 << ": unable to parse AA pipeline description.\n";
     return false;
   }
 
@@ -295,9 +284,8 @@ bool llvm::runPassPipeline(StringRef Arg0, Module &M, TargetMachine *TM,
   if (EnableDebugify)
     MPM.addPass(NewPMDebugifyPass());
 
-  if (auto Err =
-          PB.parsePassPipeline(MPM, PassPipeline, VerifyEachPass, DebugPM)) {
-    errs() << Arg0 << ": " << toString(std::move(Err)) << "\n";
+  if (!PB.parsePassPipeline(MPM, PassPipeline, VerifyEachPass, DebugPM)) {
+    errs() << Arg0 << ": unable to parse pass pipeline description.\n";
     return false;
   }
 
diff --git a/unittests/IR/CMakeLists.txt b/unittests/IR/CMakeLists.txt
index 7498983b260..211ab109131 100644
--- a/unittests/IR/CMakeLists.txt
+++ b/unittests/IR/CMakeLists.txt
@@ -40,5 +40,3 @@ add_llvm_unittest(IRTests
   VerifierTest.cpp
   WaymarkTest.cpp
   )
-
-target_link_libraries(IRTests PRIVATE LLVMTestingSupport)
diff --git a/unittests/IR/PassBuilderCallbacksTest.cpp b/unittests/IR/PassBuilderCallbacksTest.cpp
index 20c47b045e7..97bbb81a6b0 100644
--- a/unittests/IR/PassBuilderCallbacksTest.cpp
+++ b/unittests/IR/PassBuilderCallbacksTest.cpp
@@ -7,7 +7,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/Testing/Support/Error.h"
 #include <functional>
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
@@ -461,7 +460,7 @@ TEST_F(ModuleCallbacksTest, Passes) {
       .WillOnce(Invoke(getAnalysisResult));
 
   StringRef PipelineText = "test-transform";
-  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
+  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
       << "Pipeline was: " << PipelineText;
 
   PM.run(*M, AM);
@@ -495,7 +494,7 @@ TEST_F(ModuleCallbacksTest, InstrumentedPasses) {
       .InSequence(PISequence);
 
   StringRef PipelineText = "test-transform";
-  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
+  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
       << "Pipeline was: " << PipelineText;
 
   PM.run(*M, AM);
@@ -526,7 +525,7 @@ TEST_F(ModuleCallbacksTest, InstrumentedSkippedPasses) {
       .Times(0);
 
   StringRef PipelineText = "test-transform";
-  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
+  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
       << "Pipeline was: " << PipelineText;
 
   PM.run(*M, AM);
@@ -538,7 +537,7 @@ TEST_F(FunctionCallbacksTest, Passes) {
       .WillOnce(Invoke(getAnalysisResult));
 
   StringRef PipelineText = "test-transform";
-  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
+  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -572,7 +571,7 @@ TEST_F(FunctionCallbacksTest, InstrumentedPasses) {
       .InSequence(PISequence);
 
   StringRef PipelineText = "test-transform";
-  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
+  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -605,7 +604,7 @@ TEST_F(FunctionCallbacksTest, InstrumentedSkippedPasses) {
       .Times(0);
 
   StringRef PipelineText = "test-transform";
-  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
+  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -616,7 +615,7 @@ TEST_F(LoopCallbacksTest, Passes) {
       .WillOnce(WithArgs<0, 1, 2>(Invoke(getAnalysisResult)));
 
   StringRef PipelineText = "test-transform";
-  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
+  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -651,7 +650,7 @@ TEST_F(LoopCallbacksTest, InstrumentedPasses) {
       .InSequence(PISequence);
 
   StringRef PipelineText = "test-transform";
-  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
+  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -683,7 +682,7 @@ TEST_F(LoopCallbacksTest, InstrumentedSkippedPasses) {
       .Times(0);
 
   StringRef PipelineText = "test-transform";
-  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
+  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -694,7 +693,7 @@ TEST_F(CGSCCCallbacksTest, Passes) {
       .WillOnce(WithArgs<0, 1, 2>(Invoke(getAnalysisResult)));
 
   StringRef PipelineText = "test-transform";
-  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
+  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -728,7 +727,7 @@ TEST_F(CGSCCCallbacksTest, InstrumentedPasses) {
       .InSequence(PISequence);
 
   StringRef PipelineText = "test-transform";
-  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
+  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -760,7 +759,7 @@ TEST_F(CGSCCCallbacksTest, InstrumentedSkippedPasses) {
       .Times(0);
 
   StringRef PipelineText = "test-transform";
-  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
+  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -775,7 +774,7 @@ TEST_F(ModuleCallbacksTest, AnalysisUtilities) {
   EXPECT_CALL(AnalysisHandle, invalidate(HasName("<string>"), _, _));
 
   StringRef PipelineText = "require<test-analysis>,invalidate<test-analysis>";
-  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
+  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -785,7 +784,7 @@ TEST_F(CGSCCCallbacksTest, PassUtilities) {
   EXPECT_CALL(AnalysisHandle, invalidate(HasName("(foo)"), _, _));
 
   StringRef PipelineText = "require<test-analysis>,invalidate<test-analysis>";
-  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
+  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -795,7 +794,7 @@ TEST_F(FunctionCallbacksTest, AnalysisUtilities) {
   EXPECT_CALL(AnalysisHandle, invalidate(HasName("foo"), _, _));
 
   StringRef PipelineText = "require<test-analysis>,invalidate<test-analysis>";
-  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
+  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -806,7 +805,7 @@ TEST_F(LoopCallbacksTest, PassUtilities) {
 
   StringRef PipelineText = "require<test-analysis>,invalidate<test-analysis>";
 
-  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
+  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -846,13 +845,13 @@ TEST_F(ModuleCallbacksTest, ParseTopLevelPipeline) {
 
   StringRef PipelineText =
       "another-pipeline(test-transform,invalidate<test-analysis>)";
-  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
+  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 
   /// Test the negative case
   PipelineText = "another-pipeline(instcombine)";
-  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Failed())
+  ASSERT_FALSE(PB.parsePassPipeline(PM, PipelineText, true))
       << "Pipeline was: " << PipelineText;
 }
 } // end anonymous namespace
diff --git a/unittests/Passes/CMakeLists.txt b/unittests/Passes/CMakeLists.txt
index 415f3a71734..d90df209d4e 100644
--- a/unittests/Passes/CMakeLists.txt
+++ b/unittests/Passes/CMakeLists.txt
@@ -12,7 +12,6 @@ add_llvm_unittest(PluginsTests
   PluginsTest.cpp
   )
 export_executable_symbols(PluginsTests)
-target_link_libraries(PluginsTests PRIVATE LLVMTestingSupport)
 
 set(LLVM_LINK_COMPONENTS)
 add_llvm_loadable_module(TestPlugin
diff --git a/unittests/Passes/PluginsTest.cpp b/unittests/Passes/PluginsTest.cpp
index abb7b57ee0c..726978714e8 100644
--- a/unittests/Passes/PluginsTest.cpp
+++ b/unittests/Passes/PluginsTest.cpp
@@ -15,7 +15,6 @@
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/Path.h"
-#include "llvm/Testing/Support/Error.h"
 #include "llvm/Transforms/Scalar/LoopPassManager.h"
 #include "gtest/gtest.h"
 
@@ -55,8 +54,8 @@ TEST(PluginsTests, LoadPlugin) {
 
   PassBuilder PB;
   ModulePassManager PM;
-  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, "plugin-pass"), Failed());
+  ASSERT_FALSE(PB.parsePassPipeline(PM, "plugin-pass"));
 
   Plugin->registerPassBuilderCallbacks(PB);
-  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, "plugin-pass"), Succeeded());
+  ASSERT_TRUE(PB.parsePassPipeline(PM, "plugin-pass"));
 }
-- 
GitLab


From fb06745cac34b3c3803b41de0e7958e9e9f85dc0 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Mon, 15 Oct 2018 15:38:38 +0000
Subject: [PATCH 0202/1116] [DAGCombiner] allow undef elts in vector fma
 matching

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344525 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 19 ++---
 test/CodeGen/X86/fma_patterns.ll         | 90 ++++++++++++++++--------
 2 files changed, 70 insertions(+), 39 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 7ec5fac390b..f2779a3475e 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -10794,17 +10794,18 @@ SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) {
   unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
   bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
 
-  // fold (fmul (fadd x, +1.0), y) -> (fma x, y, y)
-  // fold (fmul (fadd x, -1.0), y) -> (fma x, y, (fneg y))
+  // fold (fmul (fadd x0, +1.0), y) -> (fma x0, y, y)
+  // fold (fmul (fadd x0, -1.0), y) -> (fma x0, y, (fneg y))
   auto FuseFADD = [&](SDValue X, SDValue Y, const SDNodeFlags Flags) {
     if (X.getOpcode() == ISD::FADD && (Aggressive || X->hasOneUse())) {
-      auto XC1 = isConstOrConstSplatFP(X.getOperand(1));
-      if (XC1 && XC1->isExactlyValue(+1.0))
-        return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
-                           Y, Flags);
-      if (XC1 && XC1->isExactlyValue(-1.0))
-        return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
-                           DAG.getNode(ISD::FNEG, SL, VT, Y), Flags);
+      if (auto *C = isConstOrConstSplatFP(X.getOperand(1), true)) {
+        if (C->isExactlyValue(+1.0))
+          return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
+                             Y, Flags);
+        if (C->isExactlyValue(-1.0))
+          return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
+                             DAG.getNode(ISD::FNEG, SL, VT, Y), Flags);
+      }
     }
     return SDValue();
   };
diff --git a/test/CodeGen/X86/fma_patterns.ll b/test/CodeGen/X86/fma_patterns.ll
index d0d0dfed352..5395ae46d47 100644
--- a/test/CodeGen/X86/fma_patterns.ll
+++ b/test/CodeGen/X86/fma_patterns.ll
@@ -637,23 +637,38 @@ define <4 x float> @test_v4f32_mul_y_add_x_one(<4 x float> %x, <4 x float> %y) {
 }
 
 define <4 x float> @test_v4f32_mul_y_add_x_one_undefs(<4 x float> %x, <4 x float> %y) {
-; FMA-LABEL: test_v4f32_mul_y_add_x_one_undefs:
-; FMA:       # %bb.0:
-; FMA-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
-; FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; FMA-NEXT:    retq
+; FMA-INFS-LABEL: test_v4f32_mul_y_add_x_one_undefs:
+; FMA-INFS:       # %bb.0:
+; FMA-INFS-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
+; FMA-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA-INFS-NEXT:    retq
 ;
-; FMA4-LABEL: test_v4f32_mul_y_add_x_one_undefs:
-; FMA4:       # %bb.0:
-; FMA4-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
-; FMA4-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; FMA4-NEXT:    retq
+; FMA4-INFS-LABEL: test_v4f32_mul_y_add_x_one_undefs:
+; FMA4-INFS:       # %bb.0:
+; FMA4-INFS-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
+; FMA4-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA4-INFS-NEXT:    retq
 ;
-; AVX512-LABEL: test_v4f32_mul_y_add_x_one_undefs:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vaddps {{.*}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; AVX512-NEXT:    retq
+; AVX512-INFS-LABEL: test_v4f32_mul_y_add_x_one_undefs:
+; AVX512-INFS:       # %bb.0:
+; AVX512-INFS-NEXT:    vaddps {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_v4f32_mul_y_add_x_one_undefs:
+; FMA-NOINFS:       # %bb.0:
+; FMA-NOINFS-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_v4f32_mul_y_add_x_one_undefs:
+; FMA4-NOINFS:       # %bb.0:
+; FMA4-NOINFS-NEXT:    vfmaddps %xmm1, %xmm1, %xmm0, %xmm0
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_v4f32_mul_y_add_x_one_undefs:
+; AVX512-NOINFS:       # %bb.0:
+; AVX512-NOINFS-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1
+; AVX512-NOINFS-NEXT:    retq
   %a = fadd <4 x float> %x, <float 1.0, float undef, float 1.0, float undef>
   %m = fmul <4 x float> %y, %a
   ret <4 x float> %m
@@ -736,23 +751,38 @@ define <4 x float> @test_v4f32_mul_y_add_x_negone(<4 x float> %x, <4 x float> %y
 }
 
 define <4 x float> @test_v4f32_mul_y_add_x_negone_undefs(<4 x float> %x, <4 x float> %y) {
-; FMA-LABEL: test_v4f32_mul_y_add_x_negone_undefs:
-; FMA:       # %bb.0:
-; FMA-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
-; FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; FMA-NEXT:    retq
+; FMA-INFS-LABEL: test_v4f32_mul_y_add_x_negone_undefs:
+; FMA-INFS:       # %bb.0:
+; FMA-INFS-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
+; FMA-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA-INFS-NEXT:    retq
 ;
-; FMA4-LABEL: test_v4f32_mul_y_add_x_negone_undefs:
-; FMA4:       # %bb.0:
-; FMA4-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
-; FMA4-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; FMA4-NEXT:    retq
+; FMA4-INFS-LABEL: test_v4f32_mul_y_add_x_negone_undefs:
+; FMA4-INFS:       # %bb.0:
+; FMA4-INFS-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
+; FMA4-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA4-INFS-NEXT:    retq
 ;
-; AVX512-LABEL: test_v4f32_mul_y_add_x_negone_undefs:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vaddps {{.*}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; AVX512-NEXT:    retq
+; AVX512-INFS-LABEL: test_v4f32_mul_y_add_x_negone_undefs:
+; AVX512-INFS:       # %bb.0:
+; AVX512-INFS-NEXT:    vaddps {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_v4f32_mul_y_add_x_negone_undefs:
+; FMA-NOINFS:       # %bb.0:
+; FMA-NOINFS-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_v4f32_mul_y_add_x_negone_undefs:
+; FMA4-NOINFS:       # %bb.0:
+; FMA4-NOINFS-NEXT:    vfmsubps %xmm1, %xmm1, %xmm0, %xmm0
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_v4f32_mul_y_add_x_negone_undefs:
+; AVX512-NOINFS:       # %bb.0:
+; AVX512-NOINFS-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1
+; AVX512-NOINFS-NEXT:    retq
   %a = fadd <4 x float> %x, <float undef, float -1.0, float undef, float -1.0>
   %m = fmul <4 x float> %y, %a
   ret <4 x float> %m
-- 
GitLab


From 0ea7fc0dde41258d1352daa7f22e0438769b1a63 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Mon, 15 Oct 2018 15:47:37 +0000
Subject: [PATCH 0203/1116] [x86] add tests for fma with undef elts; NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344527 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/fma_patterns.ll | 98 ++++++++++++++++++++++++++++++++
 1 file changed, 98 insertions(+)

diff --git a/test/CodeGen/X86/fma_patterns.ll b/test/CodeGen/X86/fma_patterns.ll
index 5395ae46d47..9ab2b1281f7 100644
--- a/test/CodeGen/X86/fma_patterns.ll
+++ b/test/CodeGen/X86/fma_patterns.ll
@@ -870,6 +870,32 @@ define <4 x float> @test_v4f32_mul_y_sub_one_x(<4 x float> %x, <4 x float> %y) {
   ret <4 x float> %m
 }
 
+define <4 x float> @test_v4f32_mul_y_sub_one_x_undefs(<4 x float> %x, <4 x float> %y) {
+; FMA-LABEL: test_v4f32_mul_y_sub_one_x_undefs:
+; FMA:       # %bb.0:
+; FMA-NEXT:    vmovaps {{.*#+}} xmm2 = <1,u,1,1>
+; FMA-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA-NEXT:    retq
+;
+; FMA4-LABEL: test_v4f32_mul_y_sub_one_x_undefs:
+; FMA4:       # %bb.0:
+; FMA4-NEXT:    vmovaps {{.*#+}} xmm2 = <1,u,1,1>
+; FMA4-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; FMA4-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA4-NEXT:    retq
+;
+; AVX512-LABEL: test_v4f32_mul_y_sub_one_x_undefs:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1]
+; AVX512-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; AVX512-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    retq
+  %s = fsub <4 x float> <float 1.0, float undef, float 1.0, float 1.0>, %x
+  %m = fmul <4 x float> %y, %s
+  ret <4 x float> %m
+}
+
 define <4 x float> @test_v4f32_mul_sub_negone_x_y(<4 x float> %x, <4 x float> %y) {
 ; FMA-INFS-LABEL: test_v4f32_mul_sub_negone_x_y:
 ; FMA-INFS:       # %bb.0:
@@ -952,6 +978,32 @@ define <4 x float> @test_v4f32_mul_y_sub_negone_x(<4 x float> %x, <4 x float> %y
   ret <4 x float> %m
 }
 
+define <4 x float> @test_v4f32_mul_y_sub_negone_x_undefs(<4 x float> %x, <4 x float> %y) {
+; FMA-LABEL: test_v4f32_mul_y_sub_negone_x_undefs:
+; FMA:       # %bb.0:
+; FMA-NEXT:    vmovaps {{.*#+}} xmm2 = <-1,-1,u,-1>
+; FMA-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA-NEXT:    retq
+;
+; FMA4-LABEL: test_v4f32_mul_y_sub_negone_x_undefs:
+; FMA4:       # %bb.0:
+; FMA4-NEXT:    vmovaps {{.*#+}} xmm2 = <-1,-1,u,-1>
+; FMA4-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; FMA4-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA4-NEXT:    retq
+;
+; AVX512-LABEL: test_v4f32_mul_y_sub_negone_x_undefs:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-1,-1,-1,-1]
+; AVX512-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; AVX512-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    retq
+  %s = fsub <4 x float> <float -1.0, float -1.0, float undef, float -1.0>, %x
+  %m = fmul <4 x float> %y, %s
+  ret <4 x float> %m
+}
+
 define <4 x float> @test_v4f32_mul_sub_x_one_y(<4 x float> %x, <4 x float> %y) {
 ; FMA-INFS-LABEL: test_v4f32_mul_sub_x_one_y:
 ; FMA-INFS:       # %bb.0:
@@ -1028,6 +1080,29 @@ define <4 x float> @test_v4f32_mul_y_sub_x_one(<4 x float> %x, <4 x float> %y) {
   ret <4 x float> %m
 }
 
+define <4 x float> @test_v4f32_mul_y_sub_x_one_undefs(<4 x float> %x, <4 x float> %y) {
+; FMA-LABEL: test_v4f32_mul_y_sub_x_one_undefs:
+; FMA:       # %bb.0:
+; FMA-NEXT:    vsubps {{.*}}(%rip), %xmm0, %xmm0
+; FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA-NEXT:    retq
+;
+; FMA4-LABEL: test_v4f32_mul_y_sub_x_one_undefs:
+; FMA4:       # %bb.0:
+; FMA4-NEXT:    vsubps {{.*}}(%rip), %xmm0, %xmm0
+; FMA4-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA4-NEXT:    retq
+;
+; AVX512-LABEL: test_v4f32_mul_y_sub_x_one_undefs:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vsubps {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    retq
+  %s = fsub <4 x float> %x, <float 1.0, float 1.0, float 1.0, float undef>
+  %m = fmul <4 x float> %y, %s
+  ret <4 x float> %m
+}
+
 define <4 x float> @test_v4f32_mul_sub_x_negone_y(<4 x float> %x, <4 x float> %y) {
 ; FMA-INFS-LABEL: test_v4f32_mul_sub_x_negone_y:
 ; FMA-INFS:       # %bb.0:
@@ -1104,6 +1179,29 @@ define <4 x float> @test_v4f32_mul_y_sub_x_negone(<4 x float> %x, <4 x float> %y
   ret <4 x float> %m
 }
 
+define <4 x float> @test_v4f32_mul_y_sub_x_negone_undefs(<4 x float> %x, <4 x float> %y) {
+; FMA-LABEL: test_v4f32_mul_y_sub_x_negone_undefs:
+; FMA:       # %bb.0:
+; FMA-NEXT:    vsubps {{.*}}(%rip), %xmm0, %xmm0
+; FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA-NEXT:    retq
+;
+; FMA4-LABEL: test_v4f32_mul_y_sub_x_negone_undefs:
+; FMA4:       # %bb.0:
+; FMA4-NEXT:    vsubps {{.*}}(%rip), %xmm0, %xmm0
+; FMA4-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA4-NEXT:    retq
+;
+; AVX512-LABEL: test_v4f32_mul_y_sub_x_negone_undefs:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vsubps {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    retq
+  %s = fsub <4 x float> %x, <float undef, float -1.0, float -1.0, float -1.0>
+  %m = fmul <4 x float> %y, %s
+  ret <4 x float> %m
+}
+
 ;
 ; Interpolation Patterns: add(mul(x,t),mul(sub(1.0,t),y))
 ;
-- 
GitLab


From 907565571c1306f9914da6d7ecb98b781c658b1d Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Mon, 15 Oct 2018 15:56:39 +0000
Subject: [PATCH 0204/1116] [DAGCombiner] allow undef elts in vector fma
 matching

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344528 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  43 ++---
 test/CodeGen/X86/fma_patterns.ll         | 192 +++++++++++++++--------
 2 files changed, 148 insertions(+), 87 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index f2779a3475e..846830b3b28 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -10815,29 +10815,30 @@ SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) {
   if (SDValue FMA = FuseFADD(N1, N0, Flags))
     return FMA;
 
-  // fold (fmul (fsub +1.0, x), y) -> (fma (fneg x), y, y)
-  // fold (fmul (fsub -1.0, x), y) -> (fma (fneg x), y, (fneg y))
-  // fold (fmul (fsub x, +1.0), y) -> (fma x, y, (fneg y))
-  // fold (fmul (fsub x, -1.0), y) -> (fma x, y, y)
+  // fold (fmul (fsub +1.0, x1), y) -> (fma (fneg x1), y, y)
+  // fold (fmul (fsub -1.0, x1), y) -> (fma (fneg x1), y, (fneg y))
+  // fold (fmul (fsub x0, +1.0), y) -> (fma x0, y, (fneg y))
+  // fold (fmul (fsub x0, -1.0), y) -> (fma x0, y, y)
   auto FuseFSUB = [&](SDValue X, SDValue Y, const SDNodeFlags Flags) {
     if (X.getOpcode() == ISD::FSUB && (Aggressive || X->hasOneUse())) {
-      auto XC0 = isConstOrConstSplatFP(X.getOperand(0));
-      if (XC0 && XC0->isExactlyValue(+1.0))
-        return DAG.getNode(PreferredFusedOpcode, SL, VT,
-                           DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y,
-                           Y, Flags);
-      if (XC0 && XC0->isExactlyValue(-1.0))
-        return DAG.getNode(PreferredFusedOpcode, SL, VT,
-                           DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y,
-                           DAG.getNode(ISD::FNEG, SL, VT, Y), Flags);
-
-      auto XC1 = isConstOrConstSplatFP(X.getOperand(1));
-      if (XC1 && XC1->isExactlyValue(+1.0))
-        return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
-                           DAG.getNode(ISD::FNEG, SL, VT, Y), Flags);
-      if (XC1 && XC1->isExactlyValue(-1.0))
-        return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
-                           Y, Flags);
+      if (auto *C0 = isConstOrConstSplatFP(X.getOperand(0), true)) {
+        if (C0->isExactlyValue(+1.0))
+          return DAG.getNode(PreferredFusedOpcode, SL, VT,
+                             DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y,
+                             Y, Flags);
+        if (C0->isExactlyValue(-1.0))
+          return DAG.getNode(PreferredFusedOpcode, SL, VT,
+                             DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y,
+                             DAG.getNode(ISD::FNEG, SL, VT, Y), Flags);
+      }
+      if (auto *C1 = isConstOrConstSplatFP(X.getOperand(1), true)) {
+        if (C1->isExactlyValue(+1.0))
+          return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
+                             DAG.getNode(ISD::FNEG, SL, VT, Y), Flags);
+        if (C1->isExactlyValue(-1.0))
+          return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
+                             Y, Flags);
+      }
     }
     return SDValue();
   };
diff --git a/test/CodeGen/X86/fma_patterns.ll b/test/CodeGen/X86/fma_patterns.ll
index 9ab2b1281f7..038836bd524 100644
--- a/test/CodeGen/X86/fma_patterns.ll
+++ b/test/CodeGen/X86/fma_patterns.ll
@@ -871,26 +871,41 @@ define <4 x float> @test_v4f32_mul_y_sub_one_x(<4 x float> %x, <4 x float> %y) {
 }
 
 define <4 x float> @test_v4f32_mul_y_sub_one_x_undefs(<4 x float> %x, <4 x float> %y) {
-; FMA-LABEL: test_v4f32_mul_y_sub_one_x_undefs:
-; FMA:       # %bb.0:
-; FMA-NEXT:    vmovaps {{.*#+}} xmm2 = <1,u,1,1>
-; FMA-NEXT:    vsubps %xmm0, %xmm2, %xmm0
-; FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; FMA-NEXT:    retq
+; FMA-INFS-LABEL: test_v4f32_mul_y_sub_one_x_undefs:
+; FMA-INFS:       # %bb.0:
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = <1,u,1,1>
+; FMA-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; FMA-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA-INFS-NEXT:    retq
 ;
-; FMA4-LABEL: test_v4f32_mul_y_sub_one_x_undefs:
-; FMA4:       # %bb.0:
-; FMA4-NEXT:    vmovaps {{.*#+}} xmm2 = <1,u,1,1>
-; FMA4-NEXT:    vsubps %xmm0, %xmm2, %xmm0
-; FMA4-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; FMA4-NEXT:    retq
+; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_one_x_undefs:
+; FMA4-INFS:       # %bb.0:
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = <1,u,1,1>
+; FMA4-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; FMA4-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA4-INFS-NEXT:    retq
 ;
-; AVX512-LABEL: test_v4f32_mul_y_sub_one_x_undefs:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1]
-; AVX512-NEXT:    vsubps %xmm0, %xmm2, %xmm0
-; AVX512-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; AVX512-NEXT:    retq
+; AVX512-INFS-LABEL: test_v4f32_mul_y_sub_one_x_undefs:
+; AVX512-INFS:       # %bb.0:
+; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1]
+; AVX512-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; AVX512-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_v4f32_mul_y_sub_one_x_undefs:
+; FMA-NOINFS:       # %bb.0:
+; FMA-NOINFS-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_v4f32_mul_y_sub_one_x_undefs:
+; FMA4-NOINFS:       # %bb.0:
+; FMA4-NOINFS-NEXT:    vfnmaddps %xmm1, %xmm1, %xmm0, %xmm0
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_v4f32_mul_y_sub_one_x_undefs:
+; AVX512-NOINFS:       # %bb.0:
+; AVX512-NOINFS-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1
+; AVX512-NOINFS-NEXT:    retq
   %s = fsub <4 x float> <float 1.0, float undef, float 1.0, float 1.0>, %x
   %m = fmul <4 x float> %y, %s
   ret <4 x float> %m
@@ -979,26 +994,41 @@ define <4 x float> @test_v4f32_mul_y_sub_negone_x(<4 x float> %x, <4 x float> %y
 }
 
 define <4 x float> @test_v4f32_mul_y_sub_negone_x_undefs(<4 x float> %x, <4 x float> %y) {
-; FMA-LABEL: test_v4f32_mul_y_sub_negone_x_undefs:
-; FMA:       # %bb.0:
-; FMA-NEXT:    vmovaps {{.*#+}} xmm2 = <-1,-1,u,-1>
-; FMA-NEXT:    vsubps %xmm0, %xmm2, %xmm0
-; FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; FMA-NEXT:    retq
+; FMA-INFS-LABEL: test_v4f32_mul_y_sub_negone_x_undefs:
+; FMA-INFS:       # %bb.0:
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = <-1,-1,u,-1>
+; FMA-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; FMA-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA-INFS-NEXT:    retq
 ;
-; FMA4-LABEL: test_v4f32_mul_y_sub_negone_x_undefs:
-; FMA4:       # %bb.0:
-; FMA4-NEXT:    vmovaps {{.*#+}} xmm2 = <-1,-1,u,-1>
-; FMA4-NEXT:    vsubps %xmm0, %xmm2, %xmm0
-; FMA4-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; FMA4-NEXT:    retq
+; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_negone_x_undefs:
+; FMA4-INFS:       # %bb.0:
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = <-1,-1,u,-1>
+; FMA4-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; FMA4-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA4-INFS-NEXT:    retq
 ;
-; AVX512-LABEL: test_v4f32_mul_y_sub_negone_x_undefs:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-1,-1,-1,-1]
-; AVX512-NEXT:    vsubps %xmm0, %xmm2, %xmm0
-; AVX512-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; AVX512-NEXT:    retq
+; AVX512-INFS-LABEL: test_v4f32_mul_y_sub_negone_x_undefs:
+; AVX512-INFS:       # %bb.0:
+; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-1,-1,-1,-1]
+; AVX512-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; AVX512-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_v4f32_mul_y_sub_negone_x_undefs:
+; FMA-NOINFS:       # %bb.0:
+; FMA-NOINFS-NEXT:    vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm1
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_v4f32_mul_y_sub_negone_x_undefs:
+; FMA4-NOINFS:       # %bb.0:
+; FMA4-NOINFS-NEXT:    vfnmsubps %xmm1, %xmm1, %xmm0, %xmm0
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_v4f32_mul_y_sub_negone_x_undefs:
+; AVX512-NOINFS:       # %bb.0:
+; AVX512-NOINFS-NEXT:    vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm1
+; AVX512-NOINFS-NEXT:    retq
   %s = fsub <4 x float> <float -1.0, float -1.0, float undef, float -1.0>, %x
   %m = fmul <4 x float> %y, %s
   ret <4 x float> %m
@@ -1081,23 +1111,38 @@ define <4 x float> @test_v4f32_mul_y_sub_x_one(<4 x float> %x, <4 x float> %y) {
 }
 
 define <4 x float> @test_v4f32_mul_y_sub_x_one_undefs(<4 x float> %x, <4 x float> %y) {
-; FMA-LABEL: test_v4f32_mul_y_sub_x_one_undefs:
-; FMA:       # %bb.0:
-; FMA-NEXT:    vsubps {{.*}}(%rip), %xmm0, %xmm0
-; FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; FMA-NEXT:    retq
+; FMA-INFS-LABEL: test_v4f32_mul_y_sub_x_one_undefs:
+; FMA-INFS:       # %bb.0:
+; FMA-INFS-NEXT:    vsubps {{.*}}(%rip), %xmm0, %xmm0
+; FMA-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA-INFS-NEXT:    retq
 ;
-; FMA4-LABEL: test_v4f32_mul_y_sub_x_one_undefs:
-; FMA4:       # %bb.0:
-; FMA4-NEXT:    vsubps {{.*}}(%rip), %xmm0, %xmm0
-; FMA4-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; FMA4-NEXT:    retq
+; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_x_one_undefs:
+; FMA4-INFS:       # %bb.0:
+; FMA4-INFS-NEXT:    vsubps {{.*}}(%rip), %xmm0, %xmm0
+; FMA4-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA4-INFS-NEXT:    retq
 ;
-; AVX512-LABEL: test_v4f32_mul_y_sub_x_one_undefs:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vsubps {{.*}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; AVX512-NEXT:    retq
+; AVX512-INFS-LABEL: test_v4f32_mul_y_sub_x_one_undefs:
+; AVX512-INFS:       # %bb.0:
+; AVX512-INFS-NEXT:    vsubps {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_v4f32_mul_y_sub_x_one_undefs:
+; FMA-NOINFS:       # %bb.0:
+; FMA-NOINFS-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_v4f32_mul_y_sub_x_one_undefs:
+; FMA4-NOINFS:       # %bb.0:
+; FMA4-NOINFS-NEXT:    vfmsubps %xmm1, %xmm1, %xmm0, %xmm0
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_v4f32_mul_y_sub_x_one_undefs:
+; AVX512-NOINFS:       # %bb.0:
+; AVX512-NOINFS-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1
+; AVX512-NOINFS-NEXT:    retq
   %s = fsub <4 x float> %x, <float 1.0, float 1.0, float 1.0, float undef>
   %m = fmul <4 x float> %y, %s
   ret <4 x float> %m
@@ -1180,23 +1225,38 @@ define <4 x float> @test_v4f32_mul_y_sub_x_negone(<4 x float> %x, <4 x float> %y
 }
 
 define <4 x float> @test_v4f32_mul_y_sub_x_negone_undefs(<4 x float> %x, <4 x float> %y) {
-; FMA-LABEL: test_v4f32_mul_y_sub_x_negone_undefs:
-; FMA:       # %bb.0:
-; FMA-NEXT:    vsubps {{.*}}(%rip), %xmm0, %xmm0
-; FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; FMA-NEXT:    retq
+; FMA-INFS-LABEL: test_v4f32_mul_y_sub_x_negone_undefs:
+; FMA-INFS:       # %bb.0:
+; FMA-INFS-NEXT:    vsubps {{.*}}(%rip), %xmm0, %xmm0
+; FMA-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA-INFS-NEXT:    retq
 ;
-; FMA4-LABEL: test_v4f32_mul_y_sub_x_negone_undefs:
-; FMA4:       # %bb.0:
-; FMA4-NEXT:    vsubps {{.*}}(%rip), %xmm0, %xmm0
-; FMA4-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; FMA4-NEXT:    retq
+; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_x_negone_undefs:
+; FMA4-INFS:       # %bb.0:
+; FMA4-INFS-NEXT:    vsubps {{.*}}(%rip), %xmm0, %xmm0
+; FMA4-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA4-INFS-NEXT:    retq
 ;
-; AVX512-LABEL: test_v4f32_mul_y_sub_x_negone_undefs:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vsubps {{.*}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; AVX512-NEXT:    retq
+; AVX512-INFS-LABEL: test_v4f32_mul_y_sub_x_negone_undefs:
+; AVX512-INFS:       # %bb.0:
+; AVX512-INFS-NEXT:    vsubps {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_v4f32_mul_y_sub_x_negone_undefs:
+; FMA-NOINFS:       # %bb.0:
+; FMA-NOINFS-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_v4f32_mul_y_sub_x_negone_undefs:
+; FMA4-NOINFS:       # %bb.0:
+; FMA4-NOINFS-NEXT:    vfmaddps %xmm1, %xmm1, %xmm0, %xmm0
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_v4f32_mul_y_sub_x_negone_undefs:
+; AVX512-NOINFS:       # %bb.0:
+; AVX512-NOINFS-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1
+; AVX512-NOINFS-NEXT:    retq
   %s = fsub <4 x float> %x, <float undef, float -1.0, float -1.0, float -1.0>
   %m = fmul <4 x float> %y, %s
   ret <4 x float> %m
-- 
GitLab


From 772e632e25431f43d61d5f20e3cb1cbea69caaee Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Mon, 15 Oct 2018 16:44:00 +0000
Subject: [PATCH 0205/1116] [AArch64] add tests for fmul x, -2.0 with undef
 elts; NFC

Also, add tests with commuted operands. There was no coverage for that case.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344531 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/AArch64/fadd-combines.ll | 55 ++++++++++++++++++++++-----
 1 file changed, 45 insertions(+), 10 deletions(-)

diff --git a/test/CodeGen/AArch64/fadd-combines.ll b/test/CodeGen/AArch64/fadd-combines.ll
index be027a7b558..c2e4430029a 100644
--- a/test/CodeGen/AArch64/fadd-combines.ll
+++ b/test/CodeGen/AArch64/fadd-combines.ll
@@ -51,8 +51,8 @@ define double @test4(double %a, double %b, double %c) {
   ret double %add2
 }
 
-define <4 x float> @test5(<4 x float> %a, <4 x float> %b) {
-; CHECK-LABEL: test5:
+define <4 x float> @fmulnegtwo_vec(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: fmulnegtwo_vec:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fadd v1.4s, v1.4s, v1.4s
 ; CHECK-NEXT:    fsub v0.4s, v0.4s, v1.4s
@@ -62,6 +62,41 @@ define <4 x float> @test5(<4 x float> %a, <4 x float> %b) {
   ret <4 x float> %add
 }
 
+define <4 x float> @fmulnegtwo_vec_commute(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: fmulnegtwo_vec_commute:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fadd v1.4s, v1.4s, v1.4s
+; CHECK-NEXT:    fsub v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
+  %mul = fmul <4 x float> %b, <float -2.0, float -2.0, float -2.0, float -2.0>
+  %add = fadd <4 x float> %mul, %a
+  ret <4 x float> %add
+}
+
+define <4 x float> @fmulnegtwo_vec_undefs(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: fmulnegtwo_vec_undefs:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v2.4s, #192, lsl #24
+; CHECK-NEXT:    fmul v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    fadd v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
+  %mul = fmul <4 x float> %b, <float undef, float -2.0, float undef, float -2.0>
+  %add = fadd <4 x float> %a, %mul
+  ret <4 x float> %add
+}
+
+define <4 x float> @fmulnegtwo_vec_commute_undefs(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: fmulnegtwo_vec_commute_undefs:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v2.4s, #192, lsl #24
+; CHECK-NEXT:    fmul v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    fadd v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    ret
+  %mul = fmul <4 x float> %b, <float -2.0, float undef, float -2.0, float -2.0>
+  %add = fadd <4 x float> %mul, %a
+  ret <4 x float> %add
+}
+
 define <4 x float> @test6(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: test6:
 ; CHECK:       // %bb.0:
@@ -99,10 +134,10 @@ define double @test7(double %a, double %b) nounwind {
 define float @fadd_const_multiuse_fmf(float %x) {
 ; CHECK-LABEL: fadd_const_multiuse_fmf:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI7_0
-; CHECK-NEXT:    adrp x9, .LCPI7_1
-; CHECK-NEXT:    ldr s1, [x8, :lo12:.LCPI7_0]
-; CHECK-NEXT:    ldr s2, [x9, :lo12:.LCPI7_1]
+; CHECK-NEXT:    adrp x8, .LCPI10_0
+; CHECK-NEXT:    adrp x9, .LCPI10_1
+; CHECK-NEXT:    ldr s1, [x8, :lo12:.LCPI10_0]
+; CHECK-NEXT:    ldr s2, [x9, :lo12:.LCPI10_1]
 ; CHECK-NEXT:    fadd s1, s0, s1
 ; CHECK-NEXT:    fadd s0, s0, s2
 ; CHECK-NEXT:    fadd s0, s1, s0
@@ -120,10 +155,10 @@ define float @fadd_const_multiuse_fmf(float %x) {
 define float @fadd_const_multiuse_attr(float %x) #0 {
 ; CHECK-LABEL: fadd_const_multiuse_attr:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x9, .LCPI8_1
-; CHECK-NEXT:    adrp x8, .LCPI8_0
-; CHECK-NEXT:    ldr s1, [x9, :lo12:.LCPI8_1]
-; CHECK-NEXT:    ldr s2, [x8, :lo12:.LCPI8_0]
+; CHECK-NEXT:    adrp x9, .LCPI11_1
+; CHECK-NEXT:    adrp x8, .LCPI11_0
+; CHECK-NEXT:    ldr s1, [x9, :lo12:.LCPI11_1]
+; CHECK-NEXT:    ldr s2, [x8, :lo12:.LCPI11_0]
 ; CHECK-NEXT:    fadd s1, s0, s1
 ; CHECK-NEXT:    fadd s1, s2, s1
 ; CHECK-NEXT:    fadd s0, s0, s1
-- 
GitLab


From 9e0d834cc5697eeefc8b74b2fdf17895a20d4718 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Mon, 15 Oct 2018 16:47:01 +0000
Subject: [PATCH 0206/1116] [DAGCombiner] refactor folds for fadd (fmul X,
 -2.0), Y; NFCI

The transform doesn't work if the vector constant has undef elements.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344532 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 34 +++++++++++++-----------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 846830b3b28..ab871a25d07 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -10851,14 +10851,6 @@ SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) {
   return SDValue();
 }
 
-static bool isFMulNegTwo(SDValue &N) {
-  if (N.getOpcode() != ISD::FMUL)
-    return false;
-  if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N.getOperand(1)))
-    return CFP->isExactlyValue(-2.0);
-  return false;
-}
-
 SDValue DAGCombiner::visitFADD(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -10903,14 +10895,24 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
     return DAG.getNode(ISD::FSUB, DL, VT, N1,
                        GetNegatedExpression(N0, DAG, LegalOperations), Flags);
 
-  // fold (fadd A, (fmul B, -2.0)) -> (fsub A, (fadd B, B))
-  // fold (fadd (fmul B, -2.0), A) -> (fsub A, (fadd B, B))
-  if ((isFMulNegTwo(N0) && N0.hasOneUse()) ||
-      (isFMulNegTwo(N1) && N1.hasOneUse())) {
-    bool N1IsFMul = isFMulNegTwo(N1);
-    SDValue AddOp = N1IsFMul ? N1.getOperand(0) : N0.getOperand(0);
-    SDValue Add = DAG.getNode(ISD::FADD, DL, VT, AddOp, AddOp, Flags);
-    return DAG.getNode(ISD::FSUB, DL, VT, N1IsFMul ? N0 : N1, Add, Flags);
+  auto isFMulNegTwo = [](SDValue FMul) {
+    if (!FMul.hasOneUse() || FMul.getOpcode() != ISD::FMUL)
+      return false;
+    auto *C = isConstOrConstSplatFP(FMul.getOperand(1));
+    return C && C->isExactlyValue(-2.0);
+  };
+
+  // fadd (fmul B, -2.0), A --> fsub A, (fadd B, B)
+  if (isFMulNegTwo(N0)) {
+    SDValue B = N0.getOperand(0);
+    SDValue Add = DAG.getNode(ISD::FADD, DL, VT, B, B, Flags);
+    return DAG.getNode(ISD::FSUB, DL, VT, N1, Add, Flags);
+  }
+  // fadd A, (fmul B, -2.0) --> fsub A, (fadd B, B)
+  if (isFMulNegTwo(N1)) {
+    SDValue B = N1.getOperand(0);
+    SDValue Add = DAG.getNode(ISD::FADD, DL, VT, B, B, Flags);
+    return DAG.getNode(ISD::FSUB, DL, VT, N0, Add, Flags);
   }
 
   // No FP constant should be created after legalization as Instruction
-- 
GitLab


From 2ef4e14af3ffff12baf279eff50f4e4792c97c34 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Mon, 15 Oct 2018 16:54:07 +0000
Subject: [PATCH 0207/1116] [DAGCombiner] allow undef elts in vector fmul
 matching

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344534 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  2 +-
 test/CodeGen/AArch64/fadd-combines.ll    | 10 ++++------
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index ab871a25d07..11cc699ffe1 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -10898,7 +10898,7 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
   auto isFMulNegTwo = [](SDValue FMul) {
     if (!FMul.hasOneUse() || FMul.getOpcode() != ISD::FMUL)
       return false;
-    auto *C = isConstOrConstSplatFP(FMul.getOperand(1));
+    auto *C = isConstOrConstSplatFP(FMul.getOperand(1), true);
     return C && C->isExactlyValue(-2.0);
   };
 
diff --git a/test/CodeGen/AArch64/fadd-combines.ll b/test/CodeGen/AArch64/fadd-combines.ll
index c2e4430029a..7332101a481 100644
--- a/test/CodeGen/AArch64/fadd-combines.ll
+++ b/test/CodeGen/AArch64/fadd-combines.ll
@@ -76,9 +76,8 @@ define <4 x float> @fmulnegtwo_vec_commute(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @fmulnegtwo_vec_undefs(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: fmulnegtwo_vec_undefs:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v2.4s, #192, lsl #24
-; CHECK-NEXT:    fmul v1.4s, v1.4s, v2.4s
-; CHECK-NEXT:    fadd v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    fadd v1.4s, v1.4s, v1.4s
+; CHECK-NEXT:    fsub v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %mul = fmul <4 x float> %b, <float undef, float -2.0, float undef, float -2.0>
   %add = fadd <4 x float> %a, %mul
@@ -88,9 +87,8 @@ define <4 x float> @fmulnegtwo_vec_undefs(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @fmulnegtwo_vec_commute_undefs(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: fmulnegtwo_vec_commute_undefs:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v2.4s, #192, lsl #24
-; CHECK-NEXT:    fmul v1.4s, v1.4s, v2.4s
-; CHECK-NEXT:    fadd v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    fadd v1.4s, v1.4s, v1.4s
+; CHECK-NEXT:    fsub v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %mul = fmul <4 x float> %b, <float -2.0, float undef, float -2.0, float -2.0>
   %add = fadd <4 x float> %mul, %a
-- 
GitLab


From abfefc95baa455d8beadac40792d1384228736b1 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Mon, 15 Oct 2018 18:05:34 +0000
Subject: [PATCH 0208/1116] [SelectionDAG] allow FP binops in
 SimplifyDemandedVectorElts

This is intended to make the backend on par with functionality that was
added to the IR version of SimplifyDemandedVectorElts in:
rL343727
...and the original motivation is that we need to improve demanded-vector-elements
in several ways to avoid problems that would be exposed in D51553.

Differential Revision: https://reviews.llvm.org/D52912


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344541 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/TargetLowering.cpp   |  7 ++++-
 test/CodeGen/X86/avx512-hadd-hsub.ll          | 30 +++++++++----------
 .../X86/avx512-intrinsics-fast-isel.ll        | 16 +++++-----
 test/CodeGen/X86/vector-shuffle-combining.ll  |  4 +--
 4 files changed, 30 insertions(+), 27 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index d3a50788f79..150d22cffa7 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1712,7 +1712,12 @@ bool TargetLowering::SimplifyDemandedVectorElts(
     break;
   }
   case ISD::ADD:
-  case ISD::SUB: {
+  case ISD::SUB:
+  case ISD::FADD:
+  case ISD::FSUB:
+  case ISD::FMUL:
+  case ISD::FDIV:
+  case ISD::FREM: {
     APInt SrcUndef, SrcZero;
     if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, SrcUndef,
                                    SrcZero, TLO, Depth + 1))
diff --git a/test/CodeGen/X86/avx512-hadd-hsub.ll b/test/CodeGen/X86/avx512-hadd-hsub.ll
index 510553b56d4..aed182179cf 100644
--- a/test/CodeGen/X86/avx512-hadd-hsub.ll
+++ b/test/CodeGen/X86/avx512-hadd-hsub.ll
@@ -178,16 +178,16 @@ define <8 x double> @fhadd_16_4(<8 x double> %x225, <8 x double> %x227) {
 define <4 x double> @fadd_noundef_low(<8 x double> %x225, <8 x double> %x227) {
 ; KNL-LABEL: fadd_noundef_low:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; KNL-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; KNL-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; KNL-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
 ; KNL-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
 ; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: fadd_noundef_low:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; SKX-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
 ; SKX-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
 ; SKX-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; SKX-NEXT:    retq
@@ -252,17 +252,15 @@ define <8 x i32> @hadd_16_3_sv(<16 x i32> %x225, <16 x i32> %x227) {
 define double @fadd_noundef_eel(<8 x double> %x225, <8 x double> %x227) {
 ; KNL-LABEL: fadd_noundef_eel:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; KNL-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
-; KNL-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
+; KNL-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; KNL-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
 ; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: fadd_noundef_eel:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
-; SKX-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
+; SKX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; SKX-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
 ; SKX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
@@ -278,18 +276,18 @@ define double @fadd_noundef_eel(<8 x double> %x225, <8 x double> %x227) {
 define double @fsub_noundef_ee (<8 x double> %x225, <8 x double> %x227) {
 ; KNL-LABEL: fsub_noundef_ee:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; KNL-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
-; KNL-NEXT:    vsubpd %zmm0, %zmm2, %zmm0
+; KNL-NEXT:    vextractf32x4 $2, %zmm1, %xmm0
+; KNL-NEXT:    vbroadcastsd %xmm0, %zmm0
+; KNL-NEXT:    vsubpd %zmm1, %zmm0, %zmm0
 ; KNL-NEXT:    vextractf32x4 $2, %zmm0, %xmm0
 ; KNL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: fsub_noundef_ee:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
-; SKX-NEXT:    vsubpd %zmm0, %zmm2, %zmm0
+; SKX-NEXT:    vextractf32x4 $2, %zmm1, %xmm0
+; SKX-NEXT:    vbroadcastsd %xmm0, %zmm0
+; SKX-NEXT:    vsubpd %zmm1, %zmm0, %zmm0
 ; SKX-NEXT:    vextractf32x4 $2, %zmm0, %xmm0
 ; SKX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; SKX-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
index fa37d2148f2..aa89ee7c390 100644
--- a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -7304,7 +7304,7 @@ define float @test_mm512_reduce_add_ps(<16 x float> %__W) {
 ; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; X86-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; X86-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%esp)
 ; X86-NEXT:    flds (%esp)
@@ -7321,7 +7321,7 @@ define float @test_mm512_reduce_add_ps(<16 x float> %__W) {
 ; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; X64-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; X64-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
@@ -7354,7 +7354,7 @@ define float @test_mm512_reduce_mul_ps(<16 x float> %__W) {
 ; X86-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; X86-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; X86-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; X86-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; X86-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%esp)
 ; X86-NEXT:    flds (%esp)
@@ -7371,7 +7371,7 @@ define float @test_mm512_reduce_mul_ps(<16 x float> %__W) {
 ; X64-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; X64-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; X64-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; X64-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; X64-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
@@ -7516,7 +7516,7 @@ define float @test_mm512_mask_reduce_add_ps(i16 zeroext %__M, <16 x float> %__W)
 ; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; X86-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; X86-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%esp)
 ; X86-NEXT:    flds (%esp)
@@ -7535,7 +7535,7 @@ define float @test_mm512_mask_reduce_add_ps(i16 zeroext %__M, <16 x float> %__W)
 ; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; X64-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; X64-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
@@ -7573,7 +7573,7 @@ define float @test_mm512_mask_reduce_mul_ps(i16 zeroext %__M, <16 x float> %__W)
 ; X86-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; X86-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; X86-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; X86-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; X86-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%esp)
 ; X86-NEXT:    flds (%esp)
@@ -7593,7 +7593,7 @@ define float @test_mm512_mask_reduce_mul_ps(i16 zeroext %__M, <16 x float> %__W)
 ; X64-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; X64-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; X64-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; X64-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; X64-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
diff --git a/test/CodeGen/X86/vector-shuffle-combining.ll b/test/CodeGen/X86/vector-shuffle-combining.ll
index 2eb9362947e..01e36681400 100644
--- a/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -2703,7 +2703,7 @@ define <4 x float> @PR22377(<4 x float> %a, <4 x float> %b) {
 ; SSE-LABEL: PR22377:
 ; SSE:       # %bb.0: # %entry
 ; SSE-NEXT:    movaps %xmm0, %xmm1
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[1,3]
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[2,3]
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
 ; SSE-NEXT:    addps %xmm0, %xmm1
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -2711,7 +2711,7 @@ define <4 x float> @PR22377(<4 x float> %a, <4 x float> %b) {
 ;
 ; AVX-LABEL: PR22377:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,3,1,3]
+; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,3,2,3]
 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2]
 ; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm1
 ; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-- 
GitLab


From fcb831da6a919bd658d85ea26268dccb5cc9d086 Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Mon, 15 Oct 2018 18:34:36 +0000
Subject: [PATCH 0209/1116] [ADT] Fix a bug in DenseSet's initializer_list
 constructor.

Without this fix, DenseSet crashes with an assertion if constructed with an
initializer_list whose length is not a power of two.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344542 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/ADT/DenseSet.h    | 3 ++-
 unittests/ADT/DenseSetTest.cpp | 8 ++++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/include/llvm/ADT/DenseSet.h b/include/llvm/ADT/DenseSet.h
index 404b2f74766..e85a38587e4 100644
--- a/include/llvm/ADT/DenseSet.h
+++ b/include/llvm/ADT/DenseSet.h
@@ -16,6 +16,7 @@
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/type_traits.h"
 #include <algorithm>
 #include <cstddef>
@@ -67,7 +68,7 @@ public:
   explicit DenseSetImpl(unsigned InitialReserve = 0) : TheMap(InitialReserve) {}
 
   DenseSetImpl(std::initializer_list<ValueT> Elems)
-      : DenseSetImpl(Elems.size()) {
+      : DenseSetImpl(PowerOf2Ceil(Elems.size())) {
     insert(Elems.begin(), Elems.end());
   }
 
diff --git a/unittests/ADT/DenseSetTest.cpp b/unittests/ADT/DenseSetTest.cpp
index 04f84e041fb..7368e2ed0e0 100644
--- a/unittests/ADT/DenseSetTest.cpp
+++ b/unittests/ADT/DenseSetTest.cpp
@@ -80,6 +80,14 @@ TYPED_TEST(DenseSetTest, InitializerList) {
   EXPECT_EQ(0u, set.count(3));
 }
 
+TYPED_TEST(DenseSetTest, InitializerListWithNonPowerOfTwoLength) {
+  TypeParam set({1, 2, 3});
+  EXPECT_EQ(3u, set.size());
+  EXPECT_EQ(1u, set.count(1));
+  EXPECT_EQ(1u, set.count(2));
+  EXPECT_EQ(1u, set.count(3));
+}
+
 TYPED_TEST(DenseSetTest, ConstIteratorComparison) {
   TypeParam set({1});
   const TypeParam &cset = set;
-- 
GitLab


From 0a2a30e517cde9ce927b9215158875c17141e794 Mon Sep 17 00:00:00 2001
From: Vedant Kumar <vsk@apple.com>
Date: Mon, 15 Oct 2018 19:22:20 +0000
Subject: [PATCH 0210/1116] [CodeExtractor] Erase debug intrinsics in outlined
 thunks (fix PR22900)

Variable updates within the outlined function are invisible to
debuggers. This could be improved by defining a DISubprogram for the
new function. For the moment, simply erase the debug intrinsics instead.

This fixes verifier failures about function-local metadata being used in
the wrong function, seen while testing the hot/cold splitting pass.

rdar://45142482

Differential Revision: https://reviews.llvm.org/D53267

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344545 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Utils/CodeExtractor.cpp        | 13 +++++
 .../HotColdSplit/split-out-dbg-val-of-arg.ll  | 51 +++++++++++++++++++
 2 files changed, 64 insertions(+)
 create mode 100644 test/Transforms/HotColdSplit/split-out-dbg-val-of-arg.ll

diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp
index 0e9e3219033..7b45b1799c4 100644
--- a/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/lib/Transforms/Utils/CodeExtractor.cpp
@@ -1286,6 +1286,19 @@ Function *CodeExtractor::extractCodeRegion() {
         }
     }
 
+  // Erase debug info intrinsics. Variable updates within the new function are
+  // invisible to debuggers. This could be improved by defining a DISubprogram
+  // for the new function.
+  for (BasicBlock &BB : *newFunction) {
+    auto BlockIt = BB.begin();
+    while (BlockIt != BB.end()) {
+      Instruction *Inst = &*BlockIt;
+      ++BlockIt;
+      if (isa<DbgInfoIntrinsic>(Inst))
+        Inst->eraseFromParent();
+    }
+  }
+
   LLVM_DEBUG(if (verifyFunction(*newFunction))
                  report_fatal_error("verifyFunction failed!"));
   return newFunction;
diff --git a/test/Transforms/HotColdSplit/split-out-dbg-val-of-arg.ll b/test/Transforms/HotColdSplit/split-out-dbg-val-of-arg.ll
new file mode 100644
index 00000000000..4b81de7b35b
--- /dev/null
+++ b/test/Transforms/HotColdSplit/split-out-dbg-val-of-arg.ll
@@ -0,0 +1,51 @@
+; RUN: opt -hotcoldsplit -S < %s | FileCheck %s
+
+; CHECK-LABEL: define {{.*}}@foo_if.end
+; CHECK-NOT: llvm.dbg.value
+
+define void @foo(i32 %arg1) !dbg !6 {
+entry:
+  %var = add i32 0, 0, !dbg !11
+  br i1 undef, label %if.then, label %if.end, !dbg !12
+
+if.then:                                          ; preds = %entry
+  unreachable, !dbg !13
+
+if.end:                                           ; preds = %entry
+  call void @llvm.dbg.value(metadata i32 %arg1, metadata !9, metadata !DIExpression()), !dbg !11
+  br label %if.then12, !dbg !14
+
+if.then12:                                        ; preds = %if.end
+  br label %cleanup40, !dbg !15
+
+cleanup40:                                        ; preds = %if.then12
+  br label %return, !dbg !16
+
+return:                                           ; preds = %cleanup40
+  ret void, !dbg !17
+}
+
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+!llvm.dbg.cu = !{!0}
+!llvm.debugify = !{!3, !4}
+!llvm.module.flags = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "<stdin>", directory: "/")
+!2 = !{}
+!3 = !{i32 7}
+!4 = !{i32 1}
+!5 = !{i32 2, !"Debug Info Version", i32 3}
+!6 = distinct !DISubprogram(name: "foo", linkageName: "foo", scope: null, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, isOptimized: true, unit: !0, retainedNodes: !8)
+!7 = !DISubroutineType(types: !2)
+!8 = !{!9}
+!9 = !DILocalVariable(name: "1", scope: !6, file: !1, line: 1, type: !10)
+!10 = !DIBasicType(name: "ty32", size: 32, encoding: DW_ATE_unsigned)
+!11 = !DILocation(line: 1, column: 1, scope: !6)
+!12 = !DILocation(line: 2, column: 1, scope: !6)
+!13 = !DILocation(line: 3, column: 1, scope: !6)
+!14 = !DILocation(line: 4, column: 1, scope: !6)
+!15 = !DILocation(line: 5, column: 1, scope: !6)
+!16 = !DILocation(line: 6, column: 1, scope: !6)
+!17 = !DILocation(line: 7, column: 1, scope: !6)
-- 
GitLab


From ecabdb23f632398ccbe0fc5a9a521469fd70de23 Mon Sep 17 00:00:00 2001
From: Jordan Rupprecht <rupprecht@google.com>
Date: Mon, 15 Oct 2018 20:15:58 +0000
Subject: [PATCH 0211/1116] [llvm-objcopy] NFC: update TODO test comment

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344550 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/tools/llvm-objcopy/input-output-target.test | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/tools/llvm-objcopy/input-output-target.test b/test/tools/llvm-objcopy/input-output-target.test
index e81770a239a..7a7df9fd503 100644
--- a/test/tools/llvm-objcopy/input-output-target.test
+++ b/test/tools/llvm-objcopy/input-output-target.test
@@ -11,7 +11,7 @@
 # RUN: llvm-objcopy --target binary -B i386:x86-64 %t.txt %t.3.txt
 # RUN: cmp %t-copy.txt %t.3.txt
 
-# TODO: check --target and --input-target/--output-target are incompatible
+# --target is incompatibile with --input-target/--output-target
 # RUN: not llvm-objcopy --target binary --input-target binary -B i386:x86-64 \
 # RUN:     %t.txt %t.4.txt 2>&1 \
 # RUN:     | FileCheck %s --check-prefix=BAD-FLAG
-- 
GitLab


From bea8b730d34af6991a91a4fe563234c5ea6eeabc Mon Sep 17 00:00:00 2001
From: Konstantin Zhuravlyov <kzhuravl_dev@outlook.com>
Date: Mon, 15 Oct 2018 20:37:47 +0000
Subject: [PATCH 0212/1116] AMDGPU: Generate .amdgcn_target for object code v3

Differential Revision: https://reviews.llvm.org/D53221


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344552 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp        | 13 ++++-
 .../CodeGen/AMDGPU/directive-amdgcn-target.ll | 58 +++++++++++++++++++
 2 files changed, 68 insertions(+), 3 deletions(-)
 create mode 100644 test/CodeGen/AMDGPU/directive-amdgcn-target.ll

diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 7e6a406b1e3..7448dd71004 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -116,9 +116,16 @@ AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const {
 }
 
 void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
-  if (IsaInfo::hasCodeObjectV3(getSTI()) &&
-      TM.getTargetTriple().getOS() == Triple::AMDHSA)
-    return;
+  if (IsaInfo::hasCodeObjectV3(getSTI())) {
+    std::string ExpectedTarget;
+    raw_string_ostream ExpectedTargetOS(ExpectedTarget);
+    IsaInfo::streamIsaVersion(getSTI(), ExpectedTargetOS);
+
+    getTargetStreamer()->EmitDirectiveAMDGCNTarget(ExpectedTarget);
+
+    if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
+      return;
+  }
 
   if (TM.getTargetTriple().getOS() != Triple::AMDHSA &&
       TM.getTargetTriple().getOS() != Triple::AMDPAL)
diff --git a/test/CodeGen/AMDGPU/directive-amdgcn-target.ll b/test/CodeGen/AMDGPU/directive-amdgcn-target.ll
new file mode 100644
index 00000000000..757da908af9
--- /dev/null
+++ b/test/CodeGen/AMDGPU/directive-amdgcn-target.ll
@@ -0,0 +1,58 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX600 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX600 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx601 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX601 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hainan -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX601 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=oland -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX601 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=pitcairn -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX601 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=verde -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX601 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX700 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX700 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx701 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX701 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX701 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx702 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX702 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx703 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX703 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kabini -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX703 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=mullins -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX703 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx704 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX704 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX704 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx801 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX801 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=carrizo -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX801 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX802 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=iceland -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX802 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX802 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX803 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX803 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=polaris10 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX803 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=polaris11 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX803 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX810 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=stoney -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX810 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX900 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx902 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX902 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx904 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX904 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX906 %s
+
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+code-object-v3,+xnack < %s | FileCheck --check-prefixes=XNACK-GFX900 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx902 -mattr=+code-object-v3,-xnack < %s | FileCheck --check-prefixes=NO-XNACK-GFX902 %s
+
+; GFX600: .amdgcn_target "amdgcn-amd-amdhsa--gfx600"
+; GFX601: .amdgcn_target "amdgcn-amd-amdhsa--gfx601"
+; GFX700: .amdgcn_target "amdgcn-amd-amdhsa--gfx700"
+; GFX701: .amdgcn_target "amdgcn-amd-amdhsa--gfx701"
+; GFX702: .amdgcn_target "amdgcn-amd-amdhsa--gfx702"
+; GFX703: .amdgcn_target "amdgcn-amd-amdhsa--gfx703"
+; GFX704: .amdgcn_target "amdgcn-amd-amdhsa--gfx704"
+; GFX801: .amdgcn_target "amdgcn-amd-amdhsa--gfx801+xnack"
+; GFX802: .amdgcn_target "amdgcn-amd-amdhsa--gfx802"
+; GFX803: .amdgcn_target "amdgcn-amd-amdhsa--gfx803"
+; GFX810: .amdgcn_target "amdgcn-amd-amdhsa--gfx810+xnack"
+; GFX900: .amdgcn_target "amdgcn-amd-amdhsa--gfx900"
+; GFX902: .amdgcn_target "amdgcn-amd-amdhsa--gfx902+xnack"
+; GFX904: .amdgcn_target "amdgcn-amd-amdhsa--gfx904"
+; GFX906: .amdgcn_target "amdgcn-amd-amdhsa--gfx906"
+
+; XNACK-GFX900: .amdgcn_target "amdgcn-amd-amdhsa--gfx900+xnack"
+; NO-XNACK-GFX902: .amdgcn_target "amdgcn-amd-amdhsa--gfx902
+
+define amdgpu_kernel void @directive_amdgcn_target() {
+  ret void
+}
-- 
GitLab


From b34f2ee301ef507dacf1984181f87048cd81b9b6 Mon Sep 17 00:00:00 2001
From: Chris Bieneman <chris.bieneman@me.com>
Date: Mon, 15 Oct 2018 21:14:19 +0000
Subject: [PATCH 0213/1116] [CMake] Change the default value of LLVM_ENABLE_IDE

There really aren't any generator behaviors that we need to take `CMAKE_EXTRA_GENERATOR` into account for. Where we need to take different behaviors for IDEs is mostly in enabling or disabling certain build system features that are optional but trip up the IDE UIs. Like the generation of lots of utility targets.

By changing the LLVM_ENABLE_IDE default to only being on for multi-configuration generators, we allow gating where it will impact the UI presentation, while also supporting optionally disabling the generation if your tooling workflow encounters problems. Presently being able to manually disable extra target generation is useful for Visual Studio 2017's CMake integration where the IDE has trouble displaying and working with the large number of optional targets.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344553 91177308-0d34-0410-b5e6-96231b3b80d8
---
 cmake/modules/HandleLLVMOptions.cmake | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/cmake/modules/HandleLLVMOptions.cmake b/cmake/modules/HandleLLVMOptions.cmake
index 85aebf6ed71..27875781d22 100644
--- a/cmake/modules/HandleLLVMOptions.cmake
+++ b/cmake/modules/HandleLLVMOptions.cmake
@@ -868,16 +868,19 @@ else()
   set(LLVM_ENABLE_PLUGINS ON)
 endif()
 
-# Remove LLVM_ENABLE_IDE from the CMake cache. This is a temporary change to
-# allow CMake caches to be cleaned up so that we can change the default for this
-# option and how it is used.
-unset(LLVM_ENABLE_IDE CACHE)
-#set(LLVM_ENABLE_IDE_default OFF)
-#if (XCODE OR MSVC_IDE OR CMAKE_EXTRA_GENERATOR)
-#  set(LLVM_ENABLE_IDE_default ON)
-#endif()
-#option(LLVM_ENABLE_IDE "Generate targets and process sources for use with an IDE"
-#    ${LLVM_ENABLE_IDE_default})
+# By default we should enable LLVM_ENABLE_IDE only for multi-configuration
+# generators. This option disables optional build system features that make IDEs
+# less usable.
+set(LLVM_ENABLE_IDE_default OFF)
+if (CMAKE_CONFIGURATION_TYPES)
+  set(LLVM_ENABLE_IDE_default ON)
+endif()
+option(LLVM_ENABLE_IDE
+       "Disable optional build system features that cause problems for IDE generators"
+       ${LLVM_ENABLE_IDE_default})
+if (CMAKE_CONFIGURATION_TYPES AND NOT LLVM_ENABLE_IDE)
+  message(WARNING "Disabling LLVM_ENABLE_IDE on multi-configuration generators is not recommended.")
+endif()
 
 function(get_compile_definitions)
   get_directory_property(top_dir_definitions DIRECTORY ${CMAKE_SOURCE_DIR} COMPILE_DEFINITIONS)
-- 
GitLab


From 5e9d76b982a1307ce32d562aa26fd8256506721f Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 15 Oct 2018 21:15:58 +0000
Subject: [PATCH 0214/1116] [AARCH64] Improve vector popcnt lowering with ADDLP

AARCH64 equivalent to D53257 - uses widening pairwise adds on vXi8 CTPOP to support i16/i32/i64 vectors.

This is a blocker for generic vector CTPOP expansion (P32655) - this will remove the aarch64 diff from D53258.

Differential Revision: https://reviews.llvm.org/D53259

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344554 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64ISelLowering.cpp |  48 +++++--
 test/CodeGen/AArch64/arm64-vpopcnt.ll      | 140 ++++-----------------
 2 files changed, 62 insertions(+), 126 deletions(-)

diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 90633807cdf..fea1531540f 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -792,9 +792,9 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
   for (MVT InnerVT : MVT::all_valuetypes())
     setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
 
-  // CNT supports only B element sizes.
+  // CNT supports only B element sizes, then use UADDLP to widen.
   if (VT != MVT::v8i8 && VT != MVT::v16i8)
-    setOperationAction(ISD::CTPOP, VT, Expand);
+    setOperationAction(ISD::CTPOP, VT, Custom);
 
   setOperationAction(ISD::UDIV, VT, Expand);
   setOperationAction(ISD::SDIV, VT, Expand);
@@ -4539,18 +4539,42 @@ SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
   SDLoc DL(Op);
   EVT VT = Op.getValueType();
 
-  if (VT == MVT::i32)
-    Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
-  Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
+  if (VT == MVT::i32 || VT == MVT::i64) {
+    if (VT == MVT::i32)
+      Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
+    Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
 
-  SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
-  SDValue UaddLV = DAG.getNode(
-      ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
-      DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
+    SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
+    SDValue UaddLV = DAG.getNode(
+        ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
+        DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
 
-  if (VT == MVT::i64)
-    UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
-  return UaddLV;
+    if (VT == MVT::i64)
+      UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
+    return UaddLV;
+  }
+
+  assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
+          VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
+         "Unexpected type for custom ctpop lowering");
+
+  EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
+  Val = DAG.getBitcast(VT8Bit, Val);
+  Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
+
+  // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
+  unsigned EltSize = 8;
+  unsigned NumElts = VT.is64BitVector() ? 8 : 16;
+  while (EltSize != VT.getScalarSizeInBits()) {
+    EltSize *= 2;
+    NumElts /= 2;
+    MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
+    Val = DAG.getNode(
+        ISD::INTRINSIC_WO_CHAIN, DL, WidenVT,
+        DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Val);
+  }
+
+  return Val;
 }
 
 SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
diff --git a/test/CodeGen/AArch64/arm64-vpopcnt.ll b/test/CodeGen/AArch64/arm64-vpopcnt.ll
index 0c223ced9ac..6fe1176eaa8 100644
--- a/test/CodeGen/AArch64/arm64-vpopcnt.ll
+++ b/test/CodeGen/AArch64/arm64-vpopcnt.ll
@@ -17,30 +17,8 @@ declare <8 x i8> @llvm.ctpop.v8i8(<8 x i8>) nounwind readnone
 define <4 x i16> @ctpopv4i16(<4 x i16> %x) nounwind readnone {
 ; CHECK-LABEL: ctpopv4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    umov w8, v0.h[0]
-; CHECK-NEXT:    fmov d1, x8
-; CHECK-NEXT:    cnt v1.8b, v1.8b
-; CHECK-NEXT:    uaddlv h1, v1.8b
-; CHECK-NEXT:    umov w8, v0.h[1]
-; CHECK-NEXT:    fmov d2, x8
-; CHECK-NEXT:    cnt v2.8b, v2.8b
-; CHECK-NEXT:    uaddlv h2, v2.8b
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov v1.h[1], w8
-; CHECK-NEXT:    umov w8, v0.h[2]
-; CHECK-NEXT:    fmov d2, x8
-; CHECK-NEXT:    cnt v2.8b, v2.8b
-; CHECK-NEXT:    uaddlv h2, v2.8b
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov v1.h[2], w8
-; CHECK-NEXT:    umov w8, v0.h[3]
-; CHECK-NEXT:    fmov d0, x8
 ; CHECK-NEXT:    cnt v0.8b, v0.8b
-; CHECK-NEXT:    uaddlv h0, v0.8b
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    mov v1.h[3], w8
-; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    uaddlp v0.4h, v0.8b
 ; CHECK-NEXT:    ret
   %cnt = tail call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %x)
   ret <4 x i16> %cnt
@@ -51,18 +29,9 @@ declare <4 x i16> @llvm.ctpop.v4i16(<4 x i16>) nounwind readnone
 define <2 x i32> @ctpopv2i32(<2 x i32> %x) nounwind readnone {
 ; CHECK-LABEL: ctpopv2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    mov w8, v0.s[1]
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    fmov d0, x0
 ; CHECK-NEXT:    cnt v0.8b, v0.8b
-; CHECK-NEXT:    uaddlv h0, v0.8b
-; CHECK-NEXT:    fmov d1, x8
-; CHECK-NEXT:    cnt v1.8b, v1.8b
-; CHECK-NEXT:    uaddlv h1, v1.8b
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    mov v0.s[1], w8
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    uaddlp v0.4h, v0.8b
+; CHECK-NEXT:    uaddlp v0.2s, v0.4h
 ; CHECK-NEXT:    ret
   %cnt = tail call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %x)
   ret <2 x i32> %cnt
@@ -70,6 +39,20 @@ define <2 x i32> @ctpopv2i32(<2 x i32> %x) nounwind readnone {
 
 declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>) nounwind readnone
 
+define <1 x i64> @ctpopv1i64(<1 x i64> %x) nounwind readnone {
+; CHECK-LABEL: ctpopv1i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cnt v0.8b, v0.8b
+; CHECK-NEXT:    uaddlp v0.4h, v0.8b
+; CHECK-NEXT:    uaddlp v0.2s, v0.4h
+; CHECK-NEXT:    uaddlp v0.1d, v0.2s
+; CHECK-NEXT:    ret
+  %cnt = tail call <1 x i64> @llvm.ctpop.v1i64(<1 x i64> %x)
+  ret <1 x i64> %cnt
+}
+
+declare <1 x i64> @llvm.ctpop.v1i64(<1 x i64>) nounwind readnone
+
 define <16 x i8> @ctpopv16i8(<16 x i8> %x) nounwind readnone {
 ; CHECK-LABEL: ctpopv16i8:
 ; CHECK:       // %bb.0:
@@ -84,53 +67,8 @@ declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>) nounwind readnone
 define <8 x i16> @ctpopv8i16(<8 x i16> %x) nounwind readnone {
 ; CHECK-LABEL: ctpopv8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    umov w8, v0.h[1]
-; CHECK-NEXT:    fmov d1, x8
-; CHECK-NEXT:    cnt v1.8b, v1.8b
-; CHECK-NEXT:    uaddlv h1, v1.8b
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    umov w9, v0.h[0]
-; CHECK-NEXT:    fmov d1, x9
-; CHECK-NEXT:    cnt v1.8b, v1.8b
-; CHECK-NEXT:    uaddlv h1, v1.8b
-; CHECK-NEXT:    mov v1.h[1], w8
-; CHECK-NEXT:    umov w8, v0.h[2]
-; CHECK-NEXT:    fmov d2, x8
-; CHECK-NEXT:    cnt v2.8b, v2.8b
-; CHECK-NEXT:    uaddlv h2, v2.8b
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov v1.h[2], w8
-; CHECK-NEXT:    umov w8, v0.h[3]
-; CHECK-NEXT:    fmov d2, x8
-; CHECK-NEXT:    cnt v2.8b, v2.8b
-; CHECK-NEXT:    uaddlv h2, v2.8b
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov v1.h[3], w8
-; CHECK-NEXT:    umov w8, v0.h[4]
-; CHECK-NEXT:    fmov d2, x8
-; CHECK-NEXT:    cnt v2.8b, v2.8b
-; CHECK-NEXT:    uaddlv h2, v2.8b
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov v1.h[4], w8
-; CHECK-NEXT:    umov w8, v0.h[5]
-; CHECK-NEXT:    fmov d2, x8
-; CHECK-NEXT:    cnt v2.8b, v2.8b
-; CHECK-NEXT:    uaddlv h2, v2.8b
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov v1.h[5], w8
-; CHECK-NEXT:    umov w8, v0.h[6]
-; CHECK-NEXT:    fmov d2, x8
-; CHECK-NEXT:    cnt v2.8b, v2.8b
-; CHECK-NEXT:    uaddlv h2, v2.8b
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov v1.h[6], w8
-; CHECK-NEXT:    umov w8, v0.h[7]
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    cnt v0.8b, v0.8b
-; CHECK-NEXT:    uaddlv h0, v0.8b
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    mov v1.h[7], w8
-; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    cnt v0.16b, v0.16b
+; CHECK-NEXT:    uaddlp v0.8h, v0.16b
 ; CHECK-NEXT:    ret
   %cnt = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %x)
   ret <8 x i16> %cnt
@@ -141,28 +79,9 @@ declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>) nounwind readnone
 define <4 x i32> @ctpopv4i32(<4 x i32> %x) nounwind readnone {
 ; CHECK-LABEL: ctpopv4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, v0.s[1]
-; CHECK-NEXT:    mov w9, v0.s[2]
-; CHECK-NEXT:    mov w10, v0.s[3]
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    fmov d0, x0
-; CHECK-NEXT:    cnt v0.8b, v0.8b
-; CHECK-NEXT:    uaddlv h0, v0.8b
-; CHECK-NEXT:    fmov d1, x8
-; CHECK-NEXT:    cnt v1.8b, v1.8b
-; CHECK-NEXT:    uaddlv h1, v1.8b
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    mov v0.s[1], w8
-; CHECK-NEXT:    fmov d1, x9
-; CHECK-NEXT:    cnt v1.8b, v1.8b
-; CHECK-NEXT:    uaddlv h1, v1.8b
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    mov v0.s[2], w8
-; CHECK-NEXT:    fmov d1, x10
-; CHECK-NEXT:    cnt v1.8b, v1.8b
-; CHECK-NEXT:    uaddlv h1, v1.8b
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    mov v0.s[3], w8
+; CHECK-NEXT:    cnt v0.16b, v0.16b
+; CHECK-NEXT:    uaddlp v0.8h, v0.16b
+; CHECK-NEXT:    uaddlp v0.4s, v0.8h
 ; CHECK-NEXT:    ret
   %cnt = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %x)
   ret <4 x i32> %cnt
@@ -173,17 +92,10 @@ declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) nounwind readnone
 define <2 x i64> @ctpopv2i64(<2 x i64> %x) nounwind readnone {
 ; CHECK-LABEL: ctpopv2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cnt v1.8b, v0.8b
-; CHECK-NEXT:    uaddlv h1, v1.8b
-; CHECK-NEXT:    fmov w0, s1
-; CHECK-NEXT:    fmov d1, x0
-; CHECK-NEXT:    mov x8, v0.d[1]
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    cnt v0.8b, v0.8b
-; CHECK-NEXT:    uaddlv h0, v0.8b
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    mov v1.d[1], x8
-; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    cnt v0.16b, v0.16b
+; CHECK-NEXT:    uaddlp v0.8h, v0.16b
+; CHECK-NEXT:    uaddlp v0.4s, v0.8h
+; CHECK-NEXT:    uaddlp v0.2d, v0.4s
 ; CHECK-NEXT:    ret
   %cnt = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %x)
   ret <2 x i64> %cnt
-- 
GitLab


From 11b69c205d90be3ef1ebc5c676bdaa485a3c6475 Mon Sep 17 00:00:00 2001
From: Chris Bieneman <chris.bieneman@me.com>
Date: Mon, 15 Oct 2018 21:20:02 +0000
Subject: [PATCH 0215/1116] [CMake] Use LLVM_ENABLE_IDE instead of
 CMAKE_CONFIGURATION_TYPES

There are several places where we use CMAKE_CONFIGURATION_TYPES to determine if we are using an IDE generator and in turn decide not to generate some of the convenience targets (like all the install-* and check-llvm-* targets). This decision is made because IDEs don't always deal well with the thousands of targets LLVM can generate.

This approach does not work for Visual Studio 15's new CMake integration. Because VS15 uses a Ninja generator, it isn't a multi-configuration build, and generating all these extra targets mucks up the UI and adds little value.

With this change we still don't generate these targets by default for Visual Studio and Xcode generators, and LLVM_ENABLE_IDE becomes a switch that can be enabled on the VS15 CMake builds, to improve the IDE experience.

This is a re-land of r340435, with a few minor fix-ups. The issues causing the revert were addressed in r344218, r344219, and r344553.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344555 91177308-0d34-0410-b5e6-96231b3b80d8
---
 CMakeLists.txt                       |  6 +++---
 cmake/modules/AddLLVM.cmake          | 12 ++++++------
 cmake/modules/CMakeLists.txt         |  2 +-
 tools/xcode-toolchain/CMakeLists.txt |  2 +-
 4 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3e7a9dd8988..374bddbec2d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -974,7 +974,7 @@ if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY)
   add_custom_target(llvm-headers DEPENDS intrinsics_gen)
   set_target_properties(llvm-headers PROPERTIES FOLDER "Misc")
 
-  if (NOT CMAKE_CONFIGURATION_TYPES)
+  if (NOT LLVM_ENABLE_IDE)
     add_llvm_install_targets(install-llvm-headers
                              DEPENDS llvm-headers
                              COMPONENT llvm-headers)
@@ -984,7 +984,7 @@ if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY)
   add_custom_target(llvm-libraries)
   set_target_properties(llvm-libraries PROPERTIES FOLDER "Misc")
 
-  if (NOT CMAKE_CONFIGURATION_TYPES)
+  if (NOT LLVM_ENABLE_IDE)
     add_llvm_install_targets(install-llvm-libraries
                              DEPENDS llvm-libraries
                              COMPONENT llvm-libraries)
@@ -1005,7 +1005,7 @@ endif()
 # This must be at the end of the LLVM root CMakeLists file because it must run
 # after all targets are created.
 if(LLVM_DISTRIBUTION_COMPONENTS)
-  if(CMAKE_CONFIGURATION_TYPES)
+  if(LLVM_ENABLE_IDE)
     message(FATAL_ERROR "LLVM_DISTRIBUTION_COMPONENTS cannot be specified with multi-configuration generators (i.e. Xcode or Visual Studio)")
   endif()
 
diff --git a/cmake/modules/AddLLVM.cmake b/cmake/modules/AddLLVM.cmake
index 4dde95e30f3..410308d46d6 100644
--- a/cmake/modules/AddLLVM.cmake
+++ b/cmake/modules/AddLLVM.cmake
@@ -659,7 +659,7 @@ macro(add_llvm_library name)
               ${install_type} DESTINATION ${install_dir}
               COMPONENT ${name})
 
-      if (NOT CMAKE_CONFIGURATION_TYPES)
+      if (NOT LLVM_ENABLE_IDE)
         add_llvm_install_targets(install-${name}
                                  DEPENDS ${name}
                                  COMPONENT ${name})
@@ -890,7 +890,7 @@ macro(add_llvm_tool name)
               RUNTIME DESTINATION ${LLVM_TOOLS_INSTALL_DIR}
               COMPONENT ${name})
 
-      if (NOT CMAKE_CONFIGURATION_TYPES)
+      if (NOT LLVM_ENABLE_IDE)
         add_llvm_install_targets(install-${name}
                                  DEPENDS ${name}
                                  COMPONENT ${name})
@@ -928,7 +928,7 @@ macro(add_llvm_utility name)
     install (TARGETS ${name}
       RUNTIME DESTINATION ${LLVM_UTILS_INSTALL_DIR}
       COMPONENT ${name})
-    if (NOT CMAKE_CONFIGURATION_TYPES)
+    if (NOT LLVM_ENABLE_IDE)
       add_llvm_install_targets(install-${name}
                                DEPENDS ${name}
                                COMPONENT ${name})
@@ -1409,7 +1409,7 @@ function(add_lit_testsuite target comment)
 endfunction()
 
 function(add_lit_testsuites project directory)
-  if (NOT CMAKE_CONFIGURATION_TYPES)
+  if (NOT LLVM_ENABLE_IDE)
     cmake_parse_arguments(ARG "" "" "PARAMS;DEPENDS;ARGS" ${ARGN})
 
     # Search recursively for test directories by assuming anything not
@@ -1468,7 +1468,7 @@ function(llvm_install_library_symlink name dest type)
           CODE "install_symlink(${full_name} ${full_dest} ${output_dir})"
           COMPONENT ${component})
 
-  if (NOT CMAKE_CONFIGURATION_TYPES AND NOT ARG_ALWAYS_GENERATE)
+  if (NOT LLVM_ENABLE_IDE AND NOT ARG_ALWAYS_GENERATE)
     add_llvm_install_targets(install-${name}
                              DEPENDS ${name} ${dest} install-${dest}
                              COMPONENT ${name})
@@ -1501,7 +1501,7 @@ function(llvm_install_symlink name dest)
           CODE "install_symlink(${full_name} ${full_dest} ${LLVM_TOOLS_INSTALL_DIR})"
           COMPONENT ${component})
 
-  if (NOT CMAKE_CONFIGURATION_TYPES AND NOT ARG_ALWAYS_GENERATE)
+  if (NOT LLVM_ENABLE_IDE AND NOT ARG_ALWAYS_GENERATE)
     add_llvm_install_targets(install-${name}
                              DEPENDS ${name} ${dest} install-${dest}
                              COMPONENT ${name})
diff --git a/cmake/modules/CMakeLists.txt b/cmake/modules/CMakeLists.txt
index 6c316a2f04f..f5cc0006fa0 100644
--- a/cmake/modules/CMakeLists.txt
+++ b/cmake/modules/CMakeLists.txt
@@ -132,7 +132,7 @@ if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY)
     PATTERN LLVM-Config.cmake EXCLUDE
     PATTERN GetHostTriple.cmake EXCLUDE)
 
-  if (NOT CMAKE_CONFIGURATION_TYPES)
+  if (NOT LLVM_ENABLE_IDE)
     # Add a dummy target so this can be used with LLVM_DISTRIBUTION_COMPONENTS
     add_custom_target(cmake-exports)
     add_llvm_install_targets(install-cmake-exports
diff --git a/tools/xcode-toolchain/CMakeLists.txt b/tools/xcode-toolchain/CMakeLists.txt
index 0ae5e374fe9..6167f5f6bdd 100644
--- a/tools/xcode-toolchain/CMakeLists.txt
+++ b/tools/xcode-toolchain/CMakeLists.txt
@@ -100,7 +100,7 @@ add_llvm_install_targets(install-xcode-toolchain
                          PREFIX ${LLVMToolchainDir}/usr/)
 
 if(LLVM_DISTRIBUTION_COMPONENTS)
-  if(CMAKE_CONFIGURATION_TYPES)
+  if(LLVM_ENABLE_IDE)
     message(FATAL_ERROR "LLVM_DISTRIBUTION_COMPONENTS cannot be specified with multi-configuration generators (i.e. Xcode or Visual Studio)")
   endif()
 
-- 
GitLab


From e2d6b27abc89920c7b964c3b2d6544a2e51480d2 Mon Sep 17 00:00:00 2001
From: Sebastian Pop <sebpop@gmail.com>
Date: Mon, 15 Oct 2018 21:43:11 +0000
Subject: [PATCH 0216/1116] [hot-cold-split] fix static analysis of cold
 regions

Make the code of blockEndsInUnreachable to match the function
blockEndsInUnreachable in CodeGen/BranchFolding.cpp. I also have
added a note to make sure the code of this function will not be
modified unless the back-end version is also modified.

An early return before outlining has been added to avoid
outlining the full function body when the first block in the
function is marked cold.

The static analysis of cold code has been amended to avoid
marking the whole function as cold by back-propagation
because the back-propagation would mark blocks with return
statements as cold.

The patch adds debug statements to help discover these problems.

Differential Revision: https://reviews.llvm.org/D52904

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344558 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/IPO/HotColdSplitting.cpp      | 48 +++++++++++++++++---
 test/Transforms/HotColdSplit/split-cold-1.ll | 24 ++++++++--
 test/Transforms/HotColdSplit/split-cold-2.ll |  4 ++
 3 files changed, 66 insertions(+), 10 deletions(-)

diff --git a/lib/Transforms/IPO/HotColdSplitting.cpp b/lib/Transforms/IPO/HotColdSplitting.cpp
index 9d2634f1bc9..fcea40dffd7 100644
--- a/lib/Transforms/IPO/HotColdSplitting.cpp
+++ b/lib/Transforms/IPO/HotColdSplitting.cpp
@@ -101,14 +101,19 @@ static bool isSingleEntrySingleExit(BasicBlock *Entry, const BasicBlock *Exit,
   return true;
 }
 
+// Same as blockEndsInUnreachable in CodeGen/BranchFolding.cpp. Do not modify
+// this function unless you modify the MBB version as well.
+//
+/// A no successor, non-return block probably ends in unreachable and is cold.
+/// Also consider a block that ends in an indirect branch to be a return block,
+/// since many targets use plain indirect branches to return.
 bool blockEndsInUnreachable(const BasicBlock &BB) {
+  if (!succ_empty(&BB))
+    return false;
   if (BB.empty())
     return true;
   const Instruction *I = BB.getTerminator();
-  if (isa<ReturnInst>(I) || isa<IndirectBrInst>(I))
-    return true;
-  // Unreachable blocks do not have any successor.
-  return succ_empty(&BB);
+  return !(isa<ReturnInst>(I) || isa<IndirectBrInst>(I));
 }
 
 static bool exceptionHandlingFunctions(const CallInst *CI) {
@@ -123,8 +128,7 @@ static bool exceptionHandlingFunctions(const CallInst *CI) {
          FName == "__cxa_end_catch";
 }
 
-static
-bool unlikelyExecuted(const BasicBlock &BB) {
+static bool unlikelyExecuted(const BasicBlock &BB) {
   if (blockEndsInUnreachable(BB))
     return true;
   // Exception handling blocks are unlikely executed.
@@ -145,13 +149,32 @@ bool unlikelyExecuted(const BasicBlock &BB) {
   return false;
 }
 
+static bool returnsOrHasSideEffects(const BasicBlock &BB) {
+  const TerminatorInst *I = BB.getTerminator();
+  if (isa<ReturnInst>(I) || isa<IndirectBrInst>(I) || isa<InvokeInst>(I))
+    return true;
+
+  for (const Instruction &I : BB)
+    if (const CallInst *CI = dyn_cast<CallInst>(&I)) {
+      if (CI->hasFnAttr(Attribute::NoReturn))
+        return true;
+
+      if (isa<InlineAsm>(CI->getCalledValue()))
+        return true;
+    }
+
+  return false;
+}
+
 static DenseSetBB getHotBlocks(Function &F) {
 
   // Mark all cold basic blocks.
   DenseSetBB ColdBlocks;
   for (BasicBlock &BB : F)
-    if (unlikelyExecuted(BB))
+    if (unlikelyExecuted(BB)) {
+      LLVM_DEBUG(llvm::dbgs() << "\nForward propagation marks cold: " << BB);
       ColdBlocks.insert((const BasicBlock *)&BB);
+    }
 
   // Forward propagation: basic blocks are hot when they are reachable from the
   // beginning of the function through a path that does not contain cold blocks.
@@ -203,7 +226,12 @@ static DenseSetBB getHotBlocks(Function &F) {
     if (ColdBlocks.count(It))
       continue;
 
+    // Do not back-propagate to blocks that return or have side effects.
+    if (returnsOrHasSideEffects(*It))
+      continue;
+
     // Move the block from HotBlocks to ColdBlocks.
+    LLVM_DEBUG(llvm::dbgs() << "\nBack propagation marks cold: " << *It);
     HotBlocks.erase(It);
     ColdBlocks.insert(It);
 
@@ -353,6 +381,12 @@ const Function *HotColdSplitting::outlineColdBlocks(Function &F,
   // Walking the dominator tree allows us to find the largest
   // cold region.
   BasicBlock *Begin = DT->getRootNode()->getBlock();
+
+  // Early return if the beginning of the function has been marked cold,
+  // otherwise all the function gets outlined.
+  if (PSI->isColdBB(Begin, BFI) || !HotBlocks.count(Begin))
+    return nullptr;
+
   for (auto I = df_begin(Begin), E = df_end(Begin); I != E; ++I) {
     BasicBlock *BB = *I;
     if (PSI->isColdBB(BB, BFI) || !HotBlocks.count(BB)) {
diff --git a/test/Transforms/HotColdSplit/split-cold-1.ll b/test/Transforms/HotColdSplit/split-cold-1.ll
index 60ec234ab83..1a8138fe0d3 100644
--- a/test/Transforms/HotColdSplit/split-cold-1.ll
+++ b/test/Transforms/HotColdSplit/split-cold-1.ll
@@ -1,9 +1,11 @@
 ; RUN: opt -hotcoldsplit -S < %s | FileCheck %s
 ; RUN: opt -passes=hotcoldsplit -S < %s | FileCheck %s
 
-; Outlined function is called from a basic block named codeRepl
-; CHECK: codeRepl:
-; CHECK-NEXT: call void @foo
+; Check that the function is not split. Outlined function is called from a
+; basic block named codeRepl.
+
+; CHECK-LABEL: @foo
+; CHECK-NOT: codeRepl
 define void @foo() {
 entry:
   br i1 undef, label %if.then, label %if.end
@@ -23,3 +25,19 @@ cleanup40:                                        ; preds = %if.then12
 return:                                           ; preds = %cleanup40
   ret void
 }
+
+; Check that the function is not split. We used to outline the full function.
+
+; CHECK-LABEL: @fun
+; CHECK-NOT: codeRepl
+
+define void @fun() {
+entry:
+  br i1 undef, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  br label %if.end
+
+if.end:                                           ; preds = %entry
+  ret void
+}
diff --git a/test/Transforms/HotColdSplit/split-cold-2.ll b/test/Transforms/HotColdSplit/split-cold-2.ll
index 101bc11cba9..e243a47623a 100644
--- a/test/Transforms/HotColdSplit/split-cold-2.ll
+++ b/test/Transforms/HotColdSplit/split-cold-2.ll
@@ -4,6 +4,10 @@
 ; Make sure this compiles. This test used to fail with an invalid phi node: the
 ; two predecessors were outlined and the SSA representation was invalid.
 
+; CHECK-LABEL: @fun
+; CHECK: codeRepl:
+; CHECK-NEXT: call void @fun_if.else
+
 define void @fun() {
 entry:
   br i1 undef, label %if.then, label %if.else
-- 
GitLab


From a379b4f9b55f90f8263719f18aa5c2f408d321b6 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Mon, 15 Oct 2018 21:43:53 +0000
Subject: [PATCH 0217/1116] [InstCombine] add tests for bitwise logic -->
 select; NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344559 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Transforms/InstCombine/logical-select.ll | 36 +++++++++++++++++++
 test/Transforms/InstCombine/vec_sext.ll       |  8 ++---
 2 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/test/Transforms/InstCombine/logical-select.ll b/test/Transforms/InstCombine/logical-select.ll
index dd95cc02751..3ee0ba169b3 100644
--- a/test/Transforms/InstCombine/logical-select.ll
+++ b/test/Transforms/InstCombine/logical-select.ll
@@ -531,3 +531,39 @@ define <4 x i32> @vec_sel_xor_multi_use(<4 x i32> %a, <4 x i32> %b, <4 x i1> %c)
   ret <4 x i32> %add
 }
 
+; The 'ashr' guarantees that we have a bitmask, so this is select with truncated condition.
+
+define i32 @allSignBits(i32 %cond, i32 %tval, i32 %fval) {
+; CHECK-LABEL: @allSignBits(
+; CHECK-NEXT:    [[BITMASK:%.*]] = ashr i32 [[COND:%.*]], 31
+; CHECK-NEXT:    [[NOT_BITMASK:%.*]] = xor i32 [[BITMASK]], -1
+; CHECK-NEXT:    [[A1:%.*]] = and i32 [[BITMASK]], [[TVAL:%.*]]
+; CHECK-NEXT:    [[A2:%.*]] = and i32 [[NOT_BITMASK]], [[FVAL:%.*]]
+; CHECK-NEXT:    [[SEL:%.*]] = or i32 [[A1]], [[A2]]
+; CHECK-NEXT:    ret i32 [[SEL]]
+;
+  %bitmask = ashr i32 %cond, 31
+  %not_bitmask = xor i32 %bitmask, -1
+  %a1 = and i32 %tval, %bitmask
+  %a2 = and i32 %not_bitmask, %fval
+  %sel = or i32 %a1, %a2
+  ret i32 %sel
+}
+
+define <4 x i8> @allSignBits_vec(<4 x i8> %cond, <4 x i8> %tval, <4 x i8> %fval) {
+; CHECK-LABEL: @allSignBits_vec(
+; CHECK-NEXT:    [[BITMASK:%.*]] = ashr <4 x i8> [[COND:%.*]], <i8 7, i8 7, i8 7, i8 7>
+; CHECK-NEXT:    [[NOT_BITMASK:%.*]] = xor <4 x i8> [[BITMASK]], <i8 -1, i8 -1, i8 -1, i8 -1>
+; CHECK-NEXT:    [[A1:%.*]] = and <4 x i8> [[BITMASK]], [[TVAL:%.*]]
+; CHECK-NEXT:    [[A2:%.*]] = and <4 x i8> [[NOT_BITMASK]], [[FVAL:%.*]]
+; CHECK-NEXT:    [[SEL:%.*]] = or <4 x i8> [[A2]], [[A1]]
+; CHECK-NEXT:    ret <4 x i8> [[SEL]]
+;
+  %bitmask = ashr <4 x i8> %cond, <i8 7, i8 7, i8 7, i8 7>
+  %not_bitmask = xor <4 x i8> %bitmask, <i8 -1, i8 -1, i8 -1, i8 -1>
+  %a1 = and <4 x i8> %tval, %bitmask
+  %a2 = and <4 x i8> %fval, %not_bitmask
+  %sel = or <4 x i8> %a2, %a1
+  ret <4 x i8> %sel
+}
+
diff --git a/test/Transforms/InstCombine/vec_sext.ll b/test/Transforms/InstCombine/vec_sext.ll
index ea76115fc44..f244d49527b 100644
--- a/test/Transforms/InstCombine/vec_sext.ll
+++ b/test/Transforms/InstCombine/vec_sext.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -instcombine -S | FileCheck %s
 
-define <4 x i32> @psignd_3(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: @psignd_3(
+define <4 x i32> @vec_select(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @vec_select(
 ; CHECK-NEXT:    [[SUB:%.*]] = sub nsw <4 x i32> zeroinitializer, [[A:%.*]]
 ; CHECK-NEXT:    [[B_LOBIT1:%.*]] = ashr <4 x i32> [[B:%.*]], <i32 31, i32 31, i32 31, i32 31>
 ; CHECK-NEXT:    [[T1:%.*]] = xor <4 x i32> [[B_LOBIT1]], <i32 -1, i32 -1, i32 -1, i32 -1>
@@ -23,8 +23,8 @@ define <4 x i32> @psignd_3(<4 x i32> %a, <4 x i32> %b) {
   ret <4 x i32> %cond
 }
 
-define <4 x i32> @test1(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: @test1(
+define <4 x i32> @vec_select_alternate_sign_bit_test(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @vec_select_alternate_sign_bit_test(
 ; CHECK-NEXT:    [[SUB:%.*]] = sub nsw <4 x i32> zeroinitializer, [[A:%.*]]
 ; CHECK-NEXT:    [[B_LOBIT1:%.*]] = ashr <4 x i32> [[B:%.*]], <i32 31, i32 31, i32 31, i32 31>
 ; CHECK-NEXT:    [[B_LOBIT1_NOT:%.*]] = xor <4 x i32> [[B_LOBIT1]], <i32 -1, i32 -1, i32 -1, i32 -1>
-- 
GitLab


From dea373926eb58e3af50777a7c643c4d75a7c1a61 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Mon, 15 Oct 2018 21:51:22 +0000
Subject: [PATCH 0218/1116] [X86] Regenerate avx2-intrinsics-x86.ll to compress
 the 32 vs 64 bit mode checks.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344560 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/avx2-intrinsics-x86.ll | 1747 ++++++++---------------
 1 file changed, 559 insertions(+), 1188 deletions(-)

diff --git a/test/CodeGen/X86/avx2-intrinsics-x86.ll b/test/CodeGen/X86/avx2-intrinsics-x86.ll
index 7eaa7f1cf98..5b649df410b 100644
--- a/test/CodeGen/X86/avx2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-x86.ll
@@ -5,25 +5,15 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512VL --check-prefix=X64 --check-prefix=X64-AVX512VL
 
 define <16 x i16> @test_x86_avx2_packssdw(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_packssdw:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x6b,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_packssdw:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_packssdw:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x6b,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_packssdw:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_packssdw:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x6b,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_packssdw:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -64,25 +54,15 @@ define <16 x i16> @test_x86_avx2_packssdw_fold() {
 
 
 define <32 x i8> @test_x86_avx2_packsswb(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_packsswb:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x63,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_packsswb:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_packsswb:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x63,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_packsswb:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_packsswb:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x63,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_packsswb:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a0, <16 x i16> %a1) ; <<32 x i8>> [#uses=1]
   ret <32 x i8> %res
 }
@@ -123,25 +103,15 @@ define <32 x i8> @test_x86_avx2_packsswb_fold() {
 
 
 define <32 x i8> @test_x86_avx2_packuswb(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_packuswb:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x67,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_packuswb:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_packuswb:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x67,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_packuswb:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_packuswb:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x67,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_packuswb:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a0, <16 x i16> %a1) ; <<32 x i8>> [#uses=1]
   ret <32 x i8> %res
 }
@@ -182,25 +152,15 @@ define <32 x i8> @test_x86_avx2_packuswb_fold() {
 
 
 define <32 x i8> @test_x86_avx2_padds_b(<32 x i8> %a0, <32 x i8> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_padds_b:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xec,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_padds_b:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_padds_b:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xec,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_padds_b:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_padds_b:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xec,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_padds_b:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
   ret <32 x i8> %res
 }
@@ -208,25 +168,15 @@ declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) nounwind readnone
 
 
 define <16 x i16> @test_x86_avx2_padds_w(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_padds_w:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xed,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_padds_w:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_padds_w:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xed,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_padds_w:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_padds_w:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xed,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_padds_w:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -234,25 +184,15 @@ declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) nounwind readn
 
 
 define <8 x i32> @test_x86_avx2_pmadd_wd(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_pmadd_wd:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf5,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_pmadd_wd:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf5,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_pmadd_wd:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf5,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_pmadd_wd:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf5,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_pmadd_wd:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf5,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pmadd_wd:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf5,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a0, <16 x i16> %a1) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -260,25 +200,15 @@ declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readn
 
 
 define <16 x i16> @test_x86_avx2_pmaxs_w(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_pmaxs_w:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xee,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_pmaxs_w:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xee,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_pmaxs_w:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xee,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_pmaxs_w:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xee,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_pmaxs_w:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xee,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pmaxs_w:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xee,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -286,25 +216,15 @@ declare <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16>, <16 x i16>) nounwind readn
 
 
 define <32 x i8> @test_x86_avx2_pmaxu_b(<32 x i8> %a0, <32 x i8> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_pmaxu_b:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xde,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_pmaxu_b:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xde,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_pmaxu_b:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xde,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_pmaxu_b:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xde,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_pmaxu_b:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xde,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pmaxu_b:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xde,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
   ret <32 x i8> %res
 }
@@ -312,25 +232,15 @@ declare <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8>, <32 x i8>) nounwind readnone
 
 
 define <16 x i16> @test_x86_avx2_pmins_w(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_pmins_w:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpminsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xea,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_pmins_w:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpminsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xea,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_pmins_w:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpminsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xea,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_pmins_w:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpminsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xea,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_pmins_w:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xea,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pmins_w:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpminsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xea,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -338,25 +248,15 @@ declare <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16>, <16 x i16>) nounwind readn
 
 
 define <32 x i8> @test_x86_avx2_pminu_b(<32 x i8> %a0, <32 x i8> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_pminu_b:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpminub %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xda,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_pminu_b:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpminub %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xda,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_pminu_b:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpminub %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xda,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_pminu_b:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpminub %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xda,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_pminu_b:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xda,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pminu_b:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpminub %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xda,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
   ret <32 x i8> %res
 }
@@ -364,17 +264,11 @@ declare <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8>, <32 x i8>) nounwind readnone
 
 
 define i32 @test_x86_avx2_pmovmskb(<32 x i8> %a0) {
-; X86-LABEL: test_x86_avx2_pmovmskb:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpmovmskb %ymm0, %eax ## encoding: [0xc5,0xfd,0xd7,0xc0]
-; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; X86-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-LABEL: test_x86_avx2_pmovmskb:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpmovmskb %ymm0, %eax ## encoding: [0xc5,0xfd,0xd7,0xc0]
-; X64-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; X64-NEXT:    retq ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_pmovmskb:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpmovmskb %ymm0, %eax ## encoding: [0xc5,0xfd,0xd7,0xc0]
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %a0) ; <i32> [#uses=1]
   ret i32 %res
 }
@@ -382,25 +276,15 @@ declare i32 @llvm.x86.avx2.pmovmskb(<32 x i8>) nounwind readnone
 
 
 define <16 x i16> @test_x86_avx2_pmulh_w(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_pmulh_w:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe5,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_pmulh_w:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe5,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_pmulh_w:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe5,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_pmulh_w:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe5,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_pmulh_w:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe5,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pmulh_w:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe5,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -408,25 +292,15 @@ declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readn
 
 
 define <16 x i16> @test_x86_avx2_pmulhu_w(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_pmulhu_w:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe4,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_pmulhu_w:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe4,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_pmulhu_w:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe4,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_pmulhu_w:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe4,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_pmulhu_w:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe4,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pmulhu_w:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe4,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -434,25 +308,15 @@ declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind read
 
 
 define <4 x i64> @test_x86_avx2_psad_bw(<32 x i8> %a0, <32 x i8> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psad_bw:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf6,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psad_bw:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf6,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psad_bw:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf6,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psad_bw:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf6,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psad_bw:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf6,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psad_bw:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf6,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %a0, <32 x i8> %a1) ; <<4 x i64>> [#uses=1]
   ret <4 x i64> %res
 }
@@ -460,25 +324,15 @@ declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone
 
 
 define <8 x i32> @test_x86_avx2_psll_d(<8 x i32> %a0, <4 x i32> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psll_d:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpslld %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf2,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psll_d:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpslld %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf2,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psll_d:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpslld %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf2,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psll_d:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpslld %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf2,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psll_d:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpslld %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf2,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psll_d:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpslld %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf2,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %a0, <4 x i32> %a1) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -486,25 +340,15 @@ declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone
 
 
 define <4 x i64> @test_x86_avx2_psll_q(<4 x i64> %a0, <2 x i64> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psll_q:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsllq %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf3,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psll_q:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsllq %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf3,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psll_q:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsllq %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf3,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psll_q:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsllq %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf3,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psll_q:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsllq %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf3,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psll_q:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsllq %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf3,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1) ; <<4 x i64>> [#uses=1]
   ret <4 x i64> %res
 }
@@ -512,25 +356,15 @@ declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone
 
 
 define <16 x i16> @test_x86_avx2_psll_w(<16 x i16> %a0, <8 x i16> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psll_w:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsllw %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf1,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psll_w:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsllw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf1,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psll_w:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsllw %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf1,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psll_w:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsllw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf1,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psll_w:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsllw %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf1,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psll_w:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsllw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf1,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %a0, <8 x i16> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -538,25 +372,15 @@ declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnon
 
 
 define <8 x i32> @test_x86_avx2_pslli_d(<8 x i32> %a0) {
-; X86-AVX-LABEL: test_x86_avx2_pslli_d:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpslld $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x72,0xf0,0x07]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_pslli_d:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpslld $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x72,0xf0,0x07]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_pslli_d:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpslld $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x72,0xf0,0x07]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_pslli_d:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpslld $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x72,0xf0,0x07]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_pslli_d:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpslld $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x72,0xf0,0x07]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pslli_d:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpslld $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x72,0xf0,0x07]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %a0, i32 7) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -564,25 +388,15 @@ declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) nounwind readnone
 
 
 define <4 x i64> @test_x86_avx2_pslli_q(<4 x i64> %a0) {
-; X86-AVX-LABEL: test_x86_avx2_pslli_q:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsllq $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x73,0xf0,0x07]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_pslli_q:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsllq $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x73,0xf0,0x07]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_pslli_q:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsllq $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x73,0xf0,0x07]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_pslli_q:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsllq $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x73,0xf0,0x07]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_pslli_q:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsllq $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x73,0xf0,0x07]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pslli_q:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsllq $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x73,0xf0,0x07]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
   ret <4 x i64> %res
 }
@@ -590,25 +404,15 @@ declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) nounwind readnone
 
 
 define <16 x i16> @test_x86_avx2_pslli_w(<16 x i16> %a0) {
-; X86-AVX-LABEL: test_x86_avx2_pslli_w:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsllw $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x71,0xf0,0x07]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_pslli_w:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsllw $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x71,0xf0,0x07]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_pslli_w:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsllw $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x71,0xf0,0x07]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_pslli_w:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsllw $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x71,0xf0,0x07]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_pslli_w:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsllw $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x71,0xf0,0x07]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pslli_w:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsllw $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x71,0xf0,0x07]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %a0, i32 7) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -616,25 +420,15 @@ declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) nounwind readnone
 
 
 define <8 x i32> @test_x86_avx2_psra_d(<8 x i32> %a0, <4 x i32> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psra_d:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsrad %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe2,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psra_d:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsrad %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe2,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psra_d:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsrad %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe2,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psra_d:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsrad %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe2,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psra_d:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsrad %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe2,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psra_d:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsrad %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe2,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %a0, <4 x i32> %a1) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -642,25 +436,15 @@ declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone
 
 
 define <16 x i16> @test_x86_avx2_psra_w(<16 x i16> %a0, <8 x i16> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psra_w:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsraw %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe1,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psra_w:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsraw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe1,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psra_w:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsraw %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe1,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psra_w:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsraw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe1,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psra_w:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsraw %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe1,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psra_w:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsraw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe1,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %a0, <8 x i16> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -668,25 +452,15 @@ declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnon
 
 
 define <8 x i32> @test_x86_avx2_psrai_d(<8 x i32> %a0) {
-; X86-AVX-LABEL: test_x86_avx2_psrai_d:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsrad $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x72,0xe0,0x07]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psrai_d:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsrad $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x72,0xe0,0x07]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psrai_d:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsrad $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x72,0xe0,0x07]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psrai_d:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsrad $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x72,0xe0,0x07]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psrai_d:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsrad $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x72,0xe0,0x07]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psrai_d:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsrad $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x72,0xe0,0x07]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %a0, i32 7) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -694,25 +468,15 @@ declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32) nounwind readnone
 
 
 define <16 x i16> @test_x86_avx2_psrai_w(<16 x i16> %a0) {
-; X86-AVX-LABEL: test_x86_avx2_psrai_w:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsraw $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x71,0xe0,0x07]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psrai_w:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsraw $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x71,0xe0,0x07]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psrai_w:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsraw $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x71,0xe0,0x07]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psrai_w:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsraw $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x71,0xe0,0x07]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psrai_w:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsraw $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x71,0xe0,0x07]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psrai_w:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsraw $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x71,0xe0,0x07]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %a0, i32 7) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -720,25 +484,15 @@ declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32) nounwind readnone
 
 
 define <8 x i32> @test_x86_avx2_psrl_d(<8 x i32> %a0, <4 x i32> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psrl_d:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsrld %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd2,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psrl_d:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsrld %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd2,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psrl_d:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsrld %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd2,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psrl_d:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsrld %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd2,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psrl_d:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsrld %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd2,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psrl_d:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsrld %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd2,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %a0, <4 x i32> %a1) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -746,25 +500,15 @@ declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone
 
 
 define <4 x i64> @test_x86_avx2_psrl_q(<4 x i64> %a0, <2 x i64> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psrl_q:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd3,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psrl_q:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd3,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psrl_q:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd3,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psrl_q:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd3,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psrl_q:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd3,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psrl_q:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd3,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1) ; <<4 x i64>> [#uses=1]
   ret <4 x i64> %res
 }
@@ -772,25 +516,15 @@ declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone
 
 
 define <16 x i16> @test_x86_avx2_psrl_w(<16 x i16> %a0, <8 x i16> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psrl_w:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd1,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psrl_w:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psrl_w:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd1,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psrl_w:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psrl_w:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd1,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psrl_w:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %a0, <8 x i16> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -798,25 +532,15 @@ declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnon
 
 
 define <8 x i32> @test_x86_avx2_psrli_d(<8 x i32> %a0) {
-; X86-AVX-LABEL: test_x86_avx2_psrli_d:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsrld $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x72,0xd0,0x07]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psrli_d:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsrld $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x72,0xd0,0x07]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psrli_d:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsrld $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x72,0xd0,0x07]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psrli_d:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsrld $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x72,0xd0,0x07]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psrli_d:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsrld $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x72,0xd0,0x07]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psrli_d:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsrld $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x72,0xd0,0x07]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %a0, i32 7) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -824,25 +548,15 @@ declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) nounwind readnone
 
 
 define <4 x i64> @test_x86_avx2_psrli_q(<4 x i64> %a0) {
-; X86-AVX-LABEL: test_x86_avx2_psrli_q:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsrlq $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x73,0xd0,0x07]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psrli_q:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsrlq $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x73,0xd0,0x07]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psrli_q:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsrlq $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x73,0xd0,0x07]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psrli_q:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsrlq $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x73,0xd0,0x07]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psrli_q:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsrlq $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x73,0xd0,0x07]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psrli_q:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsrlq $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x73,0xd0,0x07]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
   ret <4 x i64> %res
 }
@@ -850,25 +564,15 @@ declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) nounwind readnone
 
 
 define <16 x i16> @test_x86_avx2_psrli_w(<16 x i16> %a0) {
-; X86-AVX-LABEL: test_x86_avx2_psrli_w:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsrlw $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x71,0xd0,0x07]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psrli_w:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsrlw $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x71,0xd0,0x07]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psrli_w:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsrlw $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x71,0xd0,0x07]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psrli_w:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsrlw $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x71,0xd0,0x07]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psrli_w:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsrlw $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x71,0xd0,0x07]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psrli_w:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsrlw $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x71,0xd0,0x07]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %a0, i32 7) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -876,25 +580,15 @@ declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) nounwind readnone
 
 
 define <32 x i8> @test_x86_avx2_psubs_b(<32 x i8> %a0, <32 x i8> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psubs_b:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe8,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psubs_b:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psubs_b:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe8,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psubs_b:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psubs_b:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe8,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psubs_b:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
   ret <32 x i8> %res
 }
@@ -902,40 +596,25 @@ declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind readnone
 
 
 define <16 x i16> @test_x86_avx2_psubs_w(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psubs_w:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe9,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psubs_w:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psubs_w:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe9,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psubs_w:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psubs_w:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe9,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psubs_w:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
 declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readnone
 
 define <8 x i32> @test_x86_avx2_phadd_d(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-LABEL: test_x86_avx2_phadd_d:
-; X86:       ## %bb.0:
-; X86-NEXT:    vphaddd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x02,0xc1]
-; X86-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-LABEL: test_x86_avx2_phadd_d:
-; X64:       ## %bb.0:
-; X64-NEXT:    vphaddd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x02,0xc1]
-; X64-NEXT:    retq ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_phadd_d:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vphaddd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x02,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -943,15 +622,10 @@ declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone
 
 
 define <16 x i16> @test_x86_avx2_phadd_sw(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-LABEL: test_x86_avx2_phadd_sw:
-; X86:       ## %bb.0:
-; X86-NEXT:    vphaddsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x03,0xc1]
-; X86-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-LABEL: test_x86_avx2_phadd_sw:
-; X64:       ## %bb.0:
-; X64-NEXT:    vphaddsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x03,0xc1]
-; X64-NEXT:    retq ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_phadd_sw:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vphaddsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x03,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -959,15 +633,10 @@ declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind read
 
 
 define <16 x i16> @test_x86_avx2_phadd_w(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-LABEL: test_x86_avx2_phadd_w:
-; X86:       ## %bb.0:
-; X86-NEXT:    vphaddw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x01,0xc1]
-; X86-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-LABEL: test_x86_avx2_phadd_w:
-; X64:       ## %bb.0:
-; X64-NEXT:    vphaddw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x01,0xc1]
-; X64-NEXT:    retq ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_phadd_w:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vphaddw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x01,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -975,15 +644,10 @@ declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readn
 
 
 define <8 x i32> @test_x86_avx2_phsub_d(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-LABEL: test_x86_avx2_phsub_d:
-; X86:       ## %bb.0:
-; X86-NEXT:    vphsubd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x06,0xc1]
-; X86-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-LABEL: test_x86_avx2_phsub_d:
-; X64:       ## %bb.0:
-; X64-NEXT:    vphsubd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x06,0xc1]
-; X64-NEXT:    retq ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_phsub_d:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vphsubd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x06,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -991,15 +655,10 @@ declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone
 
 
 define <16 x i16> @test_x86_avx2_phsub_sw(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-LABEL: test_x86_avx2_phsub_sw:
-; X86:       ## %bb.0:
-; X86-NEXT:    vphsubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x07,0xc1]
-; X86-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-LABEL: test_x86_avx2_phsub_sw:
-; X64:       ## %bb.0:
-; X64-NEXT:    vphsubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x07,0xc1]
-; X64-NEXT:    retq ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_phsub_sw:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vphsubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x07,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -1007,15 +666,10 @@ declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind read
 
 
 define <16 x i16> @test_x86_avx2_phsub_w(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-LABEL: test_x86_avx2_phsub_w:
-; X86:       ## %bb.0:
-; X86-NEXT:    vphsubw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x05,0xc1]
-; X86-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-LABEL: test_x86_avx2_phsub_w:
-; X64:       ## %bb.0:
-; X64-NEXT:    vphsubw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x05,0xc1]
-; X64-NEXT:    retq ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_phsub_w:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vphsubw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x05,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -1023,25 +677,15 @@ declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readn
 
 
 define <16 x i16> @test_x86_avx2_pmadd_ub_sw(<32 x i8> %a0, <32 x i8> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_pmadd_ub_sw:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x04,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_pmadd_ub_sw:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x04,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_pmadd_ub_sw:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x04,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_pmadd_ub_sw:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x04,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_pmadd_ub_sw:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x04,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pmadd_ub_sw:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x04,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -1080,25 +724,15 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw_load_op0(<32 x i8>* %ptr, <32 x i8>
 }
 
 define <16 x i16> @test_x86_avx2_pmul_hr_sw(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_pmul_hr_sw:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0b,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_pmul_hr_sw:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0b,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_pmul_hr_sw:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0b,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_pmul_hr_sw:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0b,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_pmul_hr_sw:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0b,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pmul_hr_sw:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0b,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -1106,25 +740,15 @@ declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind re
 
 
 define <32 x i8> @test_x86_avx2_pshuf_b(<32 x i8> %a0, <32 x i8> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_pshuf_b:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpshufb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x00,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_pshuf_b:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpshufb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x00,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_pshuf_b:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpshufb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x00,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_pshuf_b:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpshufb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x00,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_pshuf_b:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x00,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pshuf_b:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpshufb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x00,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> %a1) ; <<16 x i8>> [#uses=1]
   ret <32 x i8> %res
 }
@@ -1132,15 +756,10 @@ declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone
 
 
 define <32 x i8> @test_x86_avx2_psign_b(<32 x i8> %a0, <32 x i8> %a1) {
-; X86-LABEL: test_x86_avx2_psign_b:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpsignb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x08,0xc1]
-; X86-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-LABEL: test_x86_avx2_psign_b:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpsignb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x08,0xc1]
-; X64-NEXT:    retq ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_psign_b:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpsignb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x08,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
   ret <32 x i8> %res
 }
@@ -1148,15 +767,10 @@ declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone
 
 
 define <8 x i32> @test_x86_avx2_psign_d(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-LABEL: test_x86_avx2_psign_d:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpsignd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0a,0xc1]
-; X86-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-LABEL: test_x86_avx2_psign_d:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpsignd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0a,0xc1]
-; X64-NEXT:    retq ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_psign_d:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpsignd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0a,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %a0, <8 x i32> %a1) ; <<4 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -1164,15 +778,10 @@ declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone
 
 
 define <16 x i16> @test_x86_avx2_psign_w(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-LABEL: test_x86_avx2_psign_w:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpsignw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x09,0xc1]
-; X86-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-LABEL: test_x86_avx2_psign_w:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpsignw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x09,0xc1]
-; X64-NEXT:    retq ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_psign_w:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpsignw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x09,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -1180,15 +789,10 @@ declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readn
 
 
 define <16 x i16> @test_x86_avx2_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) {
-; X86-LABEL: test_x86_avx2_mpsadbw:
-; X86:       ## %bb.0:
-; X86-NEXT:    vmpsadbw $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x42,0xc1,0x07]
-; X86-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-LABEL: test_x86_avx2_mpsadbw:
-; X64:       ## %bb.0:
-; X64-NEXT:    vmpsadbw $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x42,0xc1,0x07]
-; X64-NEXT:    retq ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_mpsadbw:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vmpsadbw $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x42,0xc1,0x07]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i8 7) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -1196,25 +800,15 @@ declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind rea
 
 
 define <16 x i16> @test_x86_avx2_packusdw(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_packusdw:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x2b,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_packusdw:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_packusdw:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x2b,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_packusdw:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_packusdw:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x2b,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_packusdw:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -1255,15 +849,10 @@ define <16 x i16> @test_x86_avx2_packusdw_fold() {
 
 
 define <32 x i8> @test_x86_avx2_pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %a2) {
-; X86-LABEL: test_x86_avx2_pblendvb:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x4c,0xc1,0x20]
-; X86-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-LABEL: test_x86_avx2_pblendvb:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x4c,0xc1,0x20]
-; X64-NEXT:    retq ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_pblendvb:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x4c,0xc1,0x20]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %a2) ; <<32 x i8>> [#uses=1]
   ret <32 x i8> %res
 }
@@ -1271,17 +860,11 @@ declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounw
 
 
 define <16 x i16> @test_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-LABEL: test_x86_avx2_pblendw:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpblendw $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x0e,0xc1,0x07]
-; X86-NEXT:    ## ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; X86-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-LABEL: test_x86_avx2_pblendw:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpblendw $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x0e,0xc1,0x07]
-; X64-NEXT:    ## ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; X64-NEXT:    retq ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_pblendw:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpblendw $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x0e,0xc1,0x07]
+; CHECK-NEXT:    ## ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i8 7) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -1289,25 +872,15 @@ declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i8) nounwind r
 
 
 define <32 x i8> @test_x86_avx2_pmaxsb(<32 x i8> %a0, <32 x i8> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_pmaxsb:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3c,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_pmaxsb:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3c,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_pmaxsb:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3c,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_pmaxsb:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3c,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_pmaxsb:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3c,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pmaxsb:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3c,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
   ret <32 x i8> %res
 }
@@ -1315,25 +888,15 @@ declare <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8>, <32 x i8>) nounwind readnone
 
 
 define <8 x i32> @test_x86_avx2_pmaxsd(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_pmaxsd:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3d,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_pmaxsd:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3d,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_pmaxsd:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3d,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_pmaxsd:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3d,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_pmaxsd:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3d,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pmaxsd:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3d,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -1341,25 +904,15 @@ declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readnone
 
 
 define <8 x i32> @test_x86_avx2_pmaxud(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_pmaxud:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3f,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_pmaxud:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3f,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_pmaxud:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3f,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_pmaxud:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3f,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_pmaxud:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3f,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pmaxud:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3f,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -1367,25 +920,15 @@ declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readnone
 
 
 define <16 x i16> @test_x86_avx2_pmaxuw(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_pmaxuw:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3e,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_pmaxuw:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3e,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_pmaxuw:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3e,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_pmaxuw:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3e,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_pmaxuw:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3e,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pmaxuw:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3e,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -1393,25 +936,15 @@ declare <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16>, <16 x i16>) nounwind readn
 
 
 define <32 x i8> @test_x86_avx2_pminsb(<32 x i8> %a0, <32 x i8> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_pminsb:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x38,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_pminsb:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x38,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_pminsb:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x38,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_pminsb:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x38,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_pminsb:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x38,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pminsb:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x38,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
   ret <32 x i8> %res
 }
@@ -1419,25 +952,15 @@ declare <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8>, <32 x i8>) nounwind readnone
 
 
 define <8 x i32> @test_x86_avx2_pminsd(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_pminsd:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpminsd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x39,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_pminsd:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpminsd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x39,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_pminsd:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpminsd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x39,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_pminsd:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpminsd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x39,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_pminsd:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpminsd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x39,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pminsd:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpminsd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x39,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -1445,25 +968,15 @@ declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readnone
 
 
 define <8 x i32> @test_x86_avx2_pminud(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_pminud:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpminud %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3b,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_pminud:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpminud %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3b,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_pminud:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpminud %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3b,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_pminud:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpminud %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3b,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_pminud:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3b,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pminud:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpminud %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3b,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -1471,25 +984,15 @@ declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readnone
 
 
 define <16 x i16> @test_x86_avx2_pminuw(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_pminuw:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpminuw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3a,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_pminuw:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpminuw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3a,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_pminuw:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpminuw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3a,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_pminuw:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpminuw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3a,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_pminuw:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3a,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pminuw:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpminuw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3a,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -1497,17 +1000,11 @@ declare <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16>, <16 x i16>) nounwind readn
 
 
 define <4 x i32> @test_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) {
-; X86-LABEL: test_x86_avx2_pblendd_128:
-; X86:       ## %bb.0:
-; X86-NEXT:    vblendps $8, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x08]
-; X86-NEXT:    ## xmm0 = xmm1[0,1,2],xmm0[3]
-; X86-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-LABEL: test_x86_avx2_pblendd_128:
-; X64:       ## %bb.0:
-; X64-NEXT:    vblendps $8, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x08]
-; X64-NEXT:    ## xmm0 = xmm1[0,1,2],xmm0[3]
-; X64-NEXT:    retq ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_pblendd_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vblendps $8, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x08]
+; CHECK-NEXT:    ## xmm0 = xmm1[0,1,2],xmm0[3]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i8 7) ; <<4 x i32>> [#uses=1]
   ret <4 x i32> %res
 }
@@ -1515,17 +1012,11 @@ declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i8) nounwind
 
 
 define <8 x i32> @test_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-LABEL: test_x86_avx2_pblendd_256:
-; X86:       ## %bb.0:
-; X86-NEXT:    vblendps $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x0c,0xc1,0x07]
-; X86-NEXT:    ## ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; X86-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-LABEL: test_x86_avx2_pblendd_256:
-; X64:       ## %bb.0:
-; X64-NEXT:    vblendps $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x0c,0xc1,0x07]
-; X64-NEXT:    ## ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; X64-NEXT:    retq ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_pblendd_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vblendps $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x0c,0xc1,0x07]
+; CHECK-NEXT:    ## ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i8 7) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -1536,25 +1027,15 @@ declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i8) nounwind
 ; and its lowering. Indeed, the offsets are the first source in
 ; the instruction.
 define <8 x i32> @test_x86_avx2_permd(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_permd:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpermps %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x75,0x16,0xc0]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_permd:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpermps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x16,0xc0]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_permd:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpermps %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x75,0x16,0xc0]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_permd:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpermps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x16,0xc0]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_permd:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x75,0x16,0xc0]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_permd:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpermps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x16,0xc0]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -1565,25 +1046,15 @@ declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly
 ; and its lowering. Indeed, the offsets are the first source in
 ; the instruction.
 define <8 x float> @test_x86_avx2_permps(<8 x float> %a0, <8 x i32> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_permps:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpermps %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x75,0x16,0xc0]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_permps:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpermps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x16,0xc0]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_permps:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpermps %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x75,0x16,0xc0]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_permps:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpermps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x16,0xc0]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_permps:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x75,0x16,0xc0]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_permps:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpermps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x16,0xc0]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %a1) ; <<8 x float>> [#uses=1]
   ret <8 x float> %res
 }
@@ -1731,25 +1202,15 @@ declare void @llvm.x86.avx2.maskstore.d.256(i8*, <8 x i32>, <8 x i32>) nounwind
 
 
 define <4 x i32> @test_x86_avx2_psllv_d(<4 x i32> %a0, <4 x i32> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psllv_d:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x47,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psllv_d:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x47,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psllv_d:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x47,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psllv_d:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x47,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psllv_d:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x47,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psllv_d:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x47,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
   ret <4 x i32> %res
 }
@@ -1757,25 +1218,15 @@ declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone
 
 
 define <8 x i32> @test_x86_avx2_psllv_d_256(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psllv_d_256:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x47,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psllv_d_256:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x47,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psllv_d_256:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x47,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psllv_d_256:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x47,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psllv_d_256:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x47,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psllv_d_256:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x47,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -1783,25 +1234,15 @@ declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind read
 
 
 define <2 x i64> @test_x86_avx2_psllv_q(<2 x i64> %a0, <2 x i64> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psllv_q:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0xf9,0x47,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psllv_q:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0x47,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psllv_q:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0xf9,0x47,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psllv_q:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0x47,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psllv_q:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0xf9,0x47,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psllv_q:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0x47,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
   ret <2 x i64> %res
 }
@@ -1809,25 +1250,15 @@ declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone
 
 
 define <4 x i64> @test_x86_avx2_psllv_q_256(<4 x i64> %a0, <4 x i64> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psllv_q_256:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0xfd,0x47,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psllv_q_256:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0x47,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psllv_q_256:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0xfd,0x47,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psllv_q_256:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0x47,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psllv_q_256:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0xfd,0x47,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psllv_q_256:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0x47,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1) ; <<4 x i64>> [#uses=1]
   ret <4 x i64> %res
 }
@@ -1835,25 +1266,15 @@ declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind read
 
 
 define <4 x i32> @test_x86_avx2_psrlv_d(<4 x i32> %a0, <4 x i32> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psrlv_d:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x45,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psrlv_d:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x45,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psrlv_d:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x45,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psrlv_d:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x45,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psrlv_d:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x45,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psrlv_d:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x45,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
   ret <4 x i32> %res
 }
@@ -1861,25 +1282,15 @@ declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone
 
 
 define <8 x i32> @test_x86_avx2_psrlv_d_256(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psrlv_d_256:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x45,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psrlv_d_256:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x45,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psrlv_d_256:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x45,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psrlv_d_256:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x45,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psrlv_d_256:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x45,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psrlv_d_256:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x45,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -1887,25 +1298,15 @@ declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind read
 
 
 define <2 x i64> @test_x86_avx2_psrlv_q(<2 x i64> %a0, <2 x i64> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psrlv_q:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0xf9,0x45,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psrlv_q:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0x45,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psrlv_q:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0xf9,0x45,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psrlv_q:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0x45,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psrlv_q:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0xf9,0x45,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psrlv_q:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0x45,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
   ret <2 x i64> %res
 }
@@ -1913,25 +1314,15 @@ declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone
 
 
 define <4 x i64> @test_x86_avx2_psrlv_q_256(<4 x i64> %a0, <4 x i64> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psrlv_q_256:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0xfd,0x45,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psrlv_q_256:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0x45,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psrlv_q_256:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0xfd,0x45,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psrlv_q_256:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0x45,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psrlv_q_256:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0xfd,0x45,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psrlv_q_256:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0x45,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1) ; <<4 x i64>> [#uses=1]
   ret <4 x i64> %res
 }
@@ -1939,25 +1330,15 @@ declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind read
 
 
 define <4 x i32> @test_x86_avx2_psrav_d(<4 x i32> %a0, <4 x i32> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psrav_d:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsravd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psrav_d:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsravd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psrav_d:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsravd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psrav_d:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsravd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psrav_d:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsravd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psrav_d:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsravd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
   ret <4 x i32> %res
 }
@@ -2004,25 +1385,15 @@ define <4 x i32> @test_x86_avx2_psrav_d_const(<4 x i32> %a0, <4 x i32> %a1) {
 declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone
 
 define <8 x i32> @test_x86_avx2_psrav_d_256(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psrav_d_256:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsravd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psrav_d_256:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsravd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psrav_d_256:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsravd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psrav_d_256:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsravd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psrav_d_256:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsravd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psrav_d_256:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsravd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
-- 
GitLab


From 672e9ba7300c48646dcdc16a46be60c4e0acadef Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Mon, 15 Oct 2018 21:51:26 +0000
Subject: [PATCH 0219/1116] [X86] Disable the peephole pass on
 avx2-intrinsics-x86.ll and avx512bw-intrinsics.ll to ensure any load folding
 tests are testing isel not load folding tables.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344561 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/avx2-intrinsics-x86.ll | 8 ++++----
 test/CodeGen/X86/avx512bw-intrinsics.ll | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/test/CodeGen/X86/avx2-intrinsics-x86.ll b/test/CodeGen/X86/avx2-intrinsics-x86.ll
index 5b649df410b..10d40e556c8 100644
--- a/test/CodeGen/X86/avx2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-x86.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 --check-prefix=X86 --check-prefix=X86-AVX
-; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512VL --check-prefix=X86 --check-prefix=X86-AVX512VL
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 --check-prefix=X64 --check-prefix=X64-AVX
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512VL --check-prefix=X64 --check-prefix=X64-AVX512VL
+; RUN: llc < %s -disable-peephole -mtriple=i686-apple-darwin -mattr=avx2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 --check-prefix=X86 --check-prefix=X86-AVX
+; RUN: llc < %s -disable-peephole -mtriple=i686-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512VL --check-prefix=X86 --check-prefix=X86-AVX512VL
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 --check-prefix=X64 --check-prefix=X64-AVX
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512VL --check-prefix=X64 --check-prefix=X64-AVX512VL
 
 define <16 x i16> @test_x86_avx2_packssdw(<8 x i32> %a0, <8 x i32> %a1) {
 ; AVX2-LABEL: test_x86_avx2_packssdw:
diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll
index c17ba57d11a..4cd51bc1e91 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512bw --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64
+; RUN: llc < %s -disable-peephole -mtriple=i686-unknown-unknown -mattr=+avx512bw --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512bw --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64
 
 define i32 @test_int_x86_avx512_kadd_d(<32 x i16> %A, <32 x i16> %B) nounwind {
 ; CHECK-LABEL: test_int_x86_avx512_kadd_d:
-- 
GitLab


From f27f35e3318010160d579446899eec6ec12569bb Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Mon, 15 Oct 2018 21:51:29 +0000
Subject: [PATCH 0220/1116] [X86] Add test cases showing failure to fold load
 into vpsrlw when EVEX encoded instructions are used.

There's a bad bitcast being used in the isel patterns for the vXi16 shift instructions.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344562 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/avx2-intrinsics-x86.ll | 84 +++++++++++++++++--------
 test/CodeGen/X86/avx512bw-intrinsics.ll | 17 +++++
 test/CodeGen/X86/sse2-intrinsics-x86.ll | 41 ++++++++++++
 3 files changed, 115 insertions(+), 27 deletions(-)

diff --git a/test/CodeGen/X86/avx2-intrinsics-x86.ll b/test/CodeGen/X86/avx2-intrinsics-x86.ll
index 10d40e556c8..bba70b139e2 100644
--- a/test/CodeGen/X86/avx2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-x86.ll
@@ -531,6 +531,36 @@ define <16 x i16> @test_x86_avx2_psrl_w(<16 x i16> %a0, <8 x i16> %a1) {
 declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone
 
 
+define <16 x i16> @test_x86_avx2_psrl_w_load(<16 x i16> %a0, <8 x i16>* %p) {
+; X86-AVX-LABEL: test_x86_avx2_psrl_w_load:
+; X86-AVX:       ## %bb.0:
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-AVX-NEXT:    vpsrlw (%eax), %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd1,0x00]
+; X86-AVX-NEXT:    retl ## encoding: [0xc3]
+;
+; X86-AVX512VL-LABEL: test_x86_avx2_psrl_w_load:
+; X86-AVX512VL:       ## %bb.0:
+; X86-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-AVX512VL-NEXT:    vmovdqa (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x08]
+; X86-AVX512VL-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0xc1]
+; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-AVX-LABEL: test_x86_avx2_psrl_w_load:
+; X64-AVX:       ## %bb.0:
+; X64-AVX-NEXT:    vpsrlw (%rdi), %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd1,0x07]
+; X64-AVX-NEXT:    retq ## encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_avx2_psrl_w_load:
+; X64-AVX512VL:       ## %bb.0:
+; X64-AVX512VL-NEXT:    vmovdqa (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x0f]
+; X64-AVX512VL-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0xc1]
+; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+  %a1 = load <8 x i16>, <8 x i16>* %p
+  %res = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %a0, <8 x i16> %a1) ; <<16 x i16>> [#uses=1]
+  ret <16 x i16> %res
+}
+
+
 define <8 x i32> @test_x86_avx2_psrli_d(<8 x i32> %a0) {
 ; AVX2-LABEL: test_x86_avx2_psrli_d:
 ; AVX2:       ## %bb.0:
@@ -820,28 +850,28 @@ define <16 x i16> @test_x86_avx2_packusdw_fold() {
 ; X86-AVX:       ## %bb.0:
 ; X86-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0]
 ; X86-AVX-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI50_0, kind: FK_Data_4
+; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI51_0, kind: FK_Data_4
 ; X86-AVX-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX512VL-LABEL: test_x86_avx2_packusdw_fold:
 ; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vmovaps LCPI50_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0]
+; X86-AVX512VL-NEXT:    vmovaps LCPI51_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0]
 ; X86-AVX512VL-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI50_0, kind: FK_Data_4
+; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI51_0, kind: FK_Data_4
 ; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X64-AVX-LABEL: test_x86_avx2_packusdw_fold:
 ; X64-AVX:       ## %bb.0:
 ; X64-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0]
 ; X64-AVX-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI50_0-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI51_0-4, kind: reloc_riprel_4byte
 ; X64-AVX-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX512VL-LABEL: test_x86_avx2_packusdw_fold:
 ; X64-AVX512VL:       ## %bb.0:
 ; X64-AVX512VL-NEXT:    vmovaps {{.*}}(%rip), %ymm0 ## EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0]
 ; X64-AVX512VL-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI50_0-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI51_0-4, kind: reloc_riprel_4byte
 ; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> zeroinitializer, <8 x i32> <i32 255, i32 32767, i32 65535, i32 -1, i32 -32767, i32 -65535, i32 0, i32 -256>)
   ret <16 x i16> %res
@@ -1348,36 +1378,36 @@ define <4 x i32> @test_x86_avx2_psrav_d_const(<4 x i32> %a0, <4 x i32> %a1) {
 ; X86-AVX:       ## %bb.0:
 ; X86-AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23]
 ; X86-AVX-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
-; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI82_0, kind: FK_Data_4
-; X86-AVX-NEXT:    vpsravd LCPI82_1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
-; X86-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI82_1, kind: FK_Data_4
+; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI83_0, kind: FK_Data_4
+; X86-AVX-NEXT:    vpsravd LCPI83_1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
+; X86-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI83_1, kind: FK_Data_4
 ; X86-AVX-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX512VL-LABEL: test_x86_avx2_psrav_d_const:
 ; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vmovdqa LCPI82_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23]
+; X86-AVX512VL-NEXT:    vmovdqa LCPI83_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23]
 ; X86-AVX512VL-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
-; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI82_0, kind: FK_Data_4
-; X86-AVX512VL-NEXT:    vpsravd LCPI82_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
-; X86-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI82_1, kind: FK_Data_4
+; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI83_0, kind: FK_Data_4
+; X86-AVX512VL-NEXT:    vpsravd LCPI83_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
+; X86-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI83_1, kind: FK_Data_4
 ; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X64-AVX-LABEL: test_x86_avx2_psrav_d_const:
 ; X64-AVX:       ## %bb.0:
 ; X64-AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23]
 ; X64-AVX-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
-; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI82_0-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI83_0-4, kind: reloc_riprel_4byte
 ; X64-AVX-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
-; X64-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI82_1-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI83_1-4, kind: reloc_riprel_4byte
 ; X64-AVX-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX512VL-LABEL: test_x86_avx2_psrav_d_const:
 ; X64-AVX512VL:       ## %bb.0:
 ; X64-AVX512VL-NEXT:    vmovdqa {{.*}}(%rip), %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23]
 ; X64-AVX512VL-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
-; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI82_0-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI83_0-4, kind: reloc_riprel_4byte
 ; X64-AVX512VL-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
-; X64-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI82_1-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI83_1-4, kind: reloc_riprel_4byte
 ; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
   %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> <i32 2, i32 9, i32 -12, i32 23>, <4 x i32> <i32 1, i32 18, i32 35, i32 52>)
   ret <4 x i32> %res
@@ -1403,36 +1433,36 @@ define <8 x i32> @test_x86_avx2_psrav_d_256_const(<8 x i32> %a0, <8 x i32> %a1)
 ; X86-AVX:       ## %bb.0:
 ; X86-AVX-NEXT:    vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
 ; X86-AVX-NEXT:    ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
-; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI84_0, kind: FK_Data_4
-; X86-AVX-NEXT:    vpsravd LCPI84_1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
-; X86-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI84_1, kind: FK_Data_4
+; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI85_0, kind: FK_Data_4
+; X86-AVX-NEXT:    vpsravd LCPI85_1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
+; X86-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI85_1, kind: FK_Data_4
 ; X86-AVX-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX512VL-LABEL: test_x86_avx2_psrav_d_256_const:
 ; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vmovdqa LCPI84_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
+; X86-AVX512VL-NEXT:    vmovdqa LCPI85_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
 ; X86-AVX512VL-NEXT:    ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
-; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI84_0, kind: FK_Data_4
-; X86-AVX512VL-NEXT:    vpsravd LCPI84_1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
-; X86-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI84_1, kind: FK_Data_4
+; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI85_0, kind: FK_Data_4
+; X86-AVX512VL-NEXT:    vpsravd LCPI85_1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
+; X86-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI85_1, kind: FK_Data_4
 ; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X64-AVX-LABEL: test_x86_avx2_psrav_d_256_const:
 ; X64-AVX:       ## %bb.0:
 ; X64-AVX-NEXT:    vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
 ; X64-AVX-NEXT:    ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
-; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI84_0-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI85_0-4, kind: reloc_riprel_4byte
 ; X64-AVX-NEXT:    vpsravd {{.*}}(%rip), %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
-; X64-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI84_1-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI85_1-4, kind: reloc_riprel_4byte
 ; X64-AVX-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX512VL-LABEL: test_x86_avx2_psrav_d_256_const:
 ; X64-AVX512VL:       ## %bb.0:
 ; X64-AVX512VL-NEXT:    vmovdqa {{.*}}(%rip), %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
 ; X64-AVX512VL-NEXT:    ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
-; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI84_0-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI85_0-4, kind: reloc_riprel_4byte
 ; X64-AVX512VL-NEXT:    vpsravd {{.*}}(%rip), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
-; X64-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI84_1-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI85_1-4, kind: reloc_riprel_4byte
 ; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> <i32 2, i32 9, i32 -12, i32 23, i32 -26, i32 37, i32 -40, i32 51>, <8 x i32> <i32 1, i32 18, i32 35, i32 52, i32 69, i32 15, i32 32, i32 49>)
   ret <8 x i32> %res
diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll
index 4cd51bc1e91..650235d51b3 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -1948,6 +1948,23 @@ define <32 x i16> @test_x86_avx512_maskz_psrl_w_512(<32 x i16> %a0, <8 x i16> %a
 }
 declare <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16>, <8 x i16>) nounwind readnone
 
+define <32 x i16> @test_x86_avx512_psrl_w_512_load(<32 x i16> %a0, <8 x i16>* %p) {
+; X86-LABEL: test_x86_avx512_psrl_w_512_load:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovdqa (%eax), %xmm1 # encoding: [0xc5,0xf9,0x6f,0x08]
+; X86-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xd1,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_psrl_w_512_load:
+; X64:       # %bb.0:
+; X64-NEXT:    vmovdqa (%rdi), %xmm1 # encoding: [0xc5,0xf9,0x6f,0x0f]
+; X64-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xd1,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %a1 = load <8 x i16>, <8 x i16>* %p
+  %res = call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> %a0, <8 x i16> %a1) ; <<32 x i16>> [#uses=1]
+  ret <32 x i16> %res
+}
 
 define <32 x i16> @test_x86_avx512_psrli_w_512(<32 x i16> %a0) {
 ; CHECK-LABEL: test_x86_avx512_psrli_w_512:
diff --git a/test/CodeGen/X86/sse2-intrinsics-x86.ll b/test/CodeGen/X86/sse2-intrinsics-x86.ll
index 068b0421a0b..020c4985943 100644
--- a/test/CodeGen/X86/sse2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-x86.ll
@@ -1418,6 +1418,47 @@ define <8 x i16> @test_x86_sse2_psrl_w(<8 x i16> %a0, <8 x i16> %a1) {
 declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone
 
 
+define <8 x i16> @test_x86_sse2_psrl_w_load(<8 x i16> %a0, <8 x i16>* %p) {
+; X86-SSE-LABEL: test_x86_sse2_psrl_w_load:
+; X86-SSE:       ## %bb.0:
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-SSE-NEXT:    psrlw (%eax), %xmm0 ## encoding: [0x66,0x0f,0xd1,0x00]
+; X86-SSE-NEXT:    retl ## encoding: [0xc3]
+;
+; X86-AVX1-LABEL: test_x86_sse2_psrl_w_load:
+; X86-AVX1:       ## %bb.0:
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-AVX1-NEXT:    vpsrlw (%eax), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd1,0x00]
+; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
+;
+; X86-AVX512-LABEL: test_x86_sse2_psrl_w_load:
+; X86-AVX512:       ## %bb.0:
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-AVX512-NEXT:    vmovdqa (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x08]
+; X86-AVX512-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd1,0xc1]
+; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-SSE-LABEL: test_x86_sse2_psrl_w_load:
+; X64-SSE:       ## %bb.0:
+; X64-SSE-NEXT:    psrlw (%rdi), %xmm0 ## encoding: [0x66,0x0f,0xd1,0x07]
+; X64-SSE-NEXT:    retq ## encoding: [0xc3]
+;
+; X64-AVX1-LABEL: test_x86_sse2_psrl_w_load:
+; X64-AVX1:       ## %bb.0:
+; X64-AVX1-NEXT:    vpsrlw (%rdi), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd1,0x07]
+; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
+;
+; X64-AVX512-LABEL: test_x86_sse2_psrl_w_load:
+; X64-AVX512:       ## %bb.0:
+; X64-AVX512-NEXT:    vmovdqa (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x0f]
+; X64-AVX512-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd1,0xc1]
+; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
+  %a1 = load <8 x i16>, <8 x i16>* %p
+  %res = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+
+
 define <4 x i32> @test_x86_sse2_psrli_d(<4 x i32> %a0) {
 ; SSE-LABEL: test_x86_sse2_psrli_d:
 ; SSE:       ## %bb.0:
-- 
GitLab


From c6a0661256ae8e6d00a1ed6e87cd646818b3b1d4 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Mon, 15 Oct 2018 21:51:32 +0000
Subject: [PATCH 0221/1116] [X86] Fix a bad bitcast in the load form of vXi16
 uniform shift patterns for EVEX encoded instructions.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344563 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86InstrAVX512.td        | 19 ++++++++++---------
 test/CodeGen/X86/avx2-intrinsics-x86.ll |  6 ++----
 test/CodeGen/X86/avx512bw-intrinsics.ll |  6 ++----
 test/CodeGen/X86/sse2-intrinsics-x86.ll |  6 ++----
 4 files changed, 16 insertions(+), 21 deletions(-)

diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index b1cb1545ec4..158aba447ed 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -5826,7 +5826,7 @@ multiclass avx512_shift_rmbi<bits<8> opc, Format ImmFormM,
 
 multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                             X86FoldableSchedWrite sched, ValueType SrcVT,
-                            PatFrag bc_frag, X86VectorVTInfo _> {
+                            X86VectorVTInfo _> {
    // src2 is always 128-bit
   let ExeDomain = _.ExeDomain in {
   defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
@@ -5837,7 +5837,8 @@ multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode,
   defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
                    (ins _.RC:$src1, i128mem:$src2), OpcodeStr,
                        "$src2, $src1", "$src1, $src2",
-                   (_.VT (OpNode _.RC:$src1, (bc_frag (loadv2i64 addr:$src2))))>,
+                   (_.VT (OpNode _.RC:$src1,
+                                 (SrcVT (bitconvert (loadv2i64 addr:$src2)))))>,
                    AVX512BIBase,
                    EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
   }
@@ -5845,18 +5846,18 @@ multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode,
 
 multiclass avx512_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
                               X86SchedWriteWidths sched, ValueType SrcVT,
-                              PatFrag bc_frag, AVX512VLVectorVTInfo VTInfo,
+                              AVX512VLVectorVTInfo VTInfo,
                               Predicate prd> {
   let Predicates = [prd] in
   defm Z    : avx512_shift_rrm<opc, OpcodeStr, OpNode, sched.ZMM, SrcVT,
-                               bc_frag, VTInfo.info512>, EVEX_V512,
+                               VTInfo.info512>, EVEX_V512,
                                EVEX_CD8<VTInfo.info512.EltSize, CD8VQ> ;
   let Predicates = [prd, HasVLX] in {
   defm Z256 : avx512_shift_rrm<opc, OpcodeStr, OpNode, sched.YMM, SrcVT,
-                               bc_frag, VTInfo.info256>, EVEX_V256,
+                               VTInfo.info256>, EVEX_V256,
                                EVEX_CD8<VTInfo.info256.EltSize, CD8VH>;
   defm Z128 : avx512_shift_rrm<opc, OpcodeStr, OpNode, sched.XMM, SrcVT,
-                               bc_frag, VTInfo.info128>, EVEX_V128,
+                               VTInfo.info128>, EVEX_V128,
                                EVEX_CD8<VTInfo.info128.EltSize, CD8VF>;
   }
 }
@@ -5866,12 +5867,12 @@ multiclass avx512_shift_types<bits<8> opcd, bits<8> opcq, bits<8> opcw,
                               X86SchedWriteWidths sched,
                               bit NotEVEX2VEXConvertibleQ = 0> {
   defm D : avx512_shift_sizes<opcd, OpcodeStr#"d", OpNode, sched, v4i32,
-                              bc_v4i32, avx512vl_i32_info, HasAVX512>;
+                              avx512vl_i32_info, HasAVX512>;
   let notEVEX2VEXConvertible = NotEVEX2VEXConvertibleQ in
   defm Q : avx512_shift_sizes<opcq, OpcodeStr#"q", OpNode, sched, v2i64,
-                              bc_v2i64, avx512vl_i64_info, HasAVX512>, VEX_W;
+                              avx512vl_i64_info, HasAVX512>, VEX_W;
   defm W : avx512_shift_sizes<opcw, OpcodeStr#"w", OpNode, sched, v8i16,
-                              bc_v2i64, avx512vl_i16_info, HasBWI>;
+                              avx512vl_i16_info, HasBWI>;
 }
 
 multiclass avx512_shift_rmi_sizes<bits<8> opc, Format ImmFormR, Format ImmFormM,
diff --git a/test/CodeGen/X86/avx2-intrinsics-x86.ll b/test/CodeGen/X86/avx2-intrinsics-x86.ll
index bba70b139e2..101448e22ac 100644
--- a/test/CodeGen/X86/avx2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-x86.ll
@@ -541,8 +541,7 @@ define <16 x i16> @test_x86_avx2_psrl_w_load(<16 x i16> %a0, <8 x i16>* %p) {
 ; X86-AVX512VL-LABEL: test_x86_avx2_psrl_w_load:
 ; X86-AVX512VL:       ## %bb.0:
 ; X86-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX512VL-NEXT:    vmovdqa (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x08]
-; X86-AVX512VL-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0xc1]
+; X86-AVX512VL-NEXT:    vpsrlw (%eax), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0x00]
 ; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X64-AVX-LABEL: test_x86_avx2_psrl_w_load:
@@ -552,8 +551,7 @@ define <16 x i16> @test_x86_avx2_psrl_w_load(<16 x i16> %a0, <8 x i16>* %p) {
 ;
 ; X64-AVX512VL-LABEL: test_x86_avx2_psrl_w_load:
 ; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vmovdqa (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x0f]
-; X64-AVX512VL-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0xc1]
+; X64-AVX512VL-NEXT:    vpsrlw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0x07]
 ; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
   %a1 = load <8 x i16>, <8 x i16>* %p
   %res = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %a0, <8 x i16> %a1) ; <<16 x i16>> [#uses=1]
diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll
index 650235d51b3..cf52746c3a5 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -1952,14 +1952,12 @@ define <32 x i16> @test_x86_avx512_psrl_w_512_load(<32 x i16> %a0, <8 x i16>* %p
 ; X86-LABEL: test_x86_avx512_psrl_w_512_load:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    vmovdqa (%eax), %xmm1 # encoding: [0xc5,0xf9,0x6f,0x08]
-; X86-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xd1,0xc1]
+; X86-NEXT:    vpsrlw (%eax), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xd1,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_x86_avx512_psrl_w_512_load:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovdqa (%rdi), %xmm1 # encoding: [0xc5,0xf9,0x6f,0x0f]
-; X64-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xd1,0xc1]
+; X64-NEXT:    vpsrlw (%rdi), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xd1,0x07]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %a1 = load <8 x i16>, <8 x i16>* %p
   %res = call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> %a0, <8 x i16> %a1) ; <<32 x i16>> [#uses=1]
diff --git a/test/CodeGen/X86/sse2-intrinsics-x86.ll b/test/CodeGen/X86/sse2-intrinsics-x86.ll
index 020c4985943..8dedce5fc8b 100644
--- a/test/CodeGen/X86/sse2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-x86.ll
@@ -1434,8 +1434,7 @@ define <8 x i16> @test_x86_sse2_psrl_w_load(<8 x i16> %a0, <8 x i16>* %p) {
 ; X86-AVX512-LABEL: test_x86_sse2_psrl_w_load:
 ; X86-AVX512:       ## %bb.0:
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX512-NEXT:    vmovdqa (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x08]
-; X86-AVX512-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd1,0xc1]
+; X86-AVX512-NEXT:    vpsrlw (%eax), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd1,0x00]
 ; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X64-SSE-LABEL: test_x86_sse2_psrl_w_load:
@@ -1450,8 +1449,7 @@ define <8 x i16> @test_x86_sse2_psrl_w_load(<8 x i16> %a0, <8 x i16>* %p) {
 ;
 ; X64-AVX512-LABEL: test_x86_sse2_psrl_w_load:
 ; X64-AVX512:       ## %bb.0:
-; X64-AVX512-NEXT:    vmovdqa (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x0f]
-; X64-AVX512-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd1,0xc1]
+; X64-AVX512-NEXT:    vpsrlw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd1,0x07]
 ; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
   %a1 = load <8 x i16>, <8 x i16>* %p
   %res = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
-- 
GitLab


From 726b0ec4982cd872ba4577900f4d6adb2b1753b6 Mon Sep 17 00:00:00 2001
From: Erik Pilkington <erik.pilkington@gmail.com>
Date: Mon, 15 Oct 2018 22:03:53 +0000
Subject: [PATCH 0222/1116] NFC: Fix a -Wsign-conversion warning

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344564 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Demangle/ItaniumDemangle.cpp | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/lib/Demangle/ItaniumDemangle.cpp b/lib/Demangle/ItaniumDemangle.cpp
index 2165cbab7e7..8d132c7580f 100644
--- a/lib/Demangle/ItaniumDemangle.cpp
+++ b/lib/Demangle/ItaniumDemangle.cpp
@@ -112,14 +112,20 @@ struct DumpVisitor {
     printStr("}");
     --Depth;
   }
+
   // Overload used when T is exactly 'bool', not merely convertible to 'bool'.
-  template<typename T, T * = (bool*)nullptr>
-  void print(T B) {
-    printStr(B ? "true" : "false");
+  void print(bool B) { printStr(B ? "true" : "false"); }
+
+  template <class T>
+  typename std::enable_if<std::is_unsigned<T>::value>::type print(T N) {
+    fprintf(stderr, "%llu", (unsigned long long)N);
   }
-  void print(size_t N) {
-    fprintf(stderr, "%zu", N);
+
+  template <class T>
+  typename std::enable_if<std::is_signed<T>::value>::type print(T N) {
+    fprintf(stderr, "%lld", (long long)N);
   }
+
   void print(ReferenceKind RK) {
     switch (RK) {
     case ReferenceKind::LValue:
-- 
GitLab


From 52ff03cff907cf240f95cbcbbec5e0f3b43c76b0 Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Mon, 15 Oct 2018 22:27:02 +0000
Subject: [PATCH 0223/1116] [ORC] Switch to DenseMap/DenseSet for ORC symbol
 map/set types.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344565 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/ExecutionEngine/Orc/Core.h       | 19 +++----
 .../ExecutionEngine/Orc/SymbolStringPool.h    | 27 ++++++++++
 lib/ExecutionEngine/Orc/Core.cpp              | 53 +++++++++++--------
 lib/ExecutionEngine/Orc/ExecutionUtils.cpp    | 14 ++---
 4 files changed, 73 insertions(+), 40 deletions(-)

diff --git a/include/llvm/ExecutionEngine/Orc/Core.h b/include/llvm/ExecutionEngine/Orc/Core.h
index 67b16894f6c..86c5ebb6d27 100644
--- a/include/llvm/ExecutionEngine/Orc/Core.h
+++ b/include/llvm/ExecutionEngine/Orc/Core.h
@@ -20,10 +20,7 @@
 #include "llvm/IR/Module.h"
 #include "llvm/Support/Debug.h"
 
-#include <list>
-#include <map>
 #include <memory>
-#include <set>
 #include <vector>
 
 #define DEBUG_TYPE "orc"
@@ -44,18 +41,18 @@ using VModuleKey = uint64_t;
 
 /// A set of symbol names (represented by SymbolStringPtrs for
 //         efficiency).
-using SymbolNameSet = std::set<SymbolStringPtr>;
+using SymbolNameSet = DenseSet<SymbolStringPtr>;
 
 /// A map from symbol names (as SymbolStringPtrs) to JITSymbols
 ///        (address/flags pairs).
-using SymbolMap = std::map<SymbolStringPtr, JITEvaluatedSymbol>;
+using SymbolMap = DenseMap<SymbolStringPtr, JITEvaluatedSymbol>;
 
 /// A map from symbol names (as SymbolStringPtrs) to JITSymbolFlags.
-using SymbolFlagsMap = std::map<SymbolStringPtr, JITSymbolFlags>;
+using SymbolFlagsMap = DenseMap<SymbolStringPtr, JITSymbolFlags>;
 
 /// A base class for materialization failures that allows the failing
 ///        symbols to be obtained for logging.
-using SymbolDependenceMap = std::map<JITDylib *, SymbolNameSet>;
+using SymbolDependenceMap = DenseMap<JITDylib *, SymbolNameSet>;
 
 /// A list of JITDylib pointers.
 using JITDylibList = std::vector<JITDylib *>;
@@ -339,7 +336,7 @@ struct SymbolAliasMapEntry {
 };
 
 /// A map of Symbols to (Symbol, Flags) pairs.
-using SymbolAliasMap = std::map<SymbolStringPtr, SymbolAliasMapEntry>;
+using SymbolAliasMap = DenseMap<SymbolStringPtr, SymbolAliasMapEntry>;
 
 /// A materialization unit for symbol aliases. Allows existing symbols to be
 /// aliased with alternate flags.
@@ -489,7 +486,7 @@ public:
       JITDylib &Parent, const SymbolNameSet &Names)>;
 
   using AsynchronousSymbolQuerySet =
-      std::set<std::shared_ptr<AsynchronousSymbolQuery>>;
+    std::set<std::shared_ptr<AsynchronousSymbolQuery>>;
 
   JITDylib(const JITDylib &) = delete;
   JITDylib &operator=(const JITDylib &) = delete;
@@ -609,7 +606,7 @@ private:
   };
 
   using UnmaterializedInfosMap =
-      std::map<SymbolStringPtr, std::shared_ptr<UnmaterializedInfo>>;
+      DenseMap<SymbolStringPtr, std::shared_ptr<UnmaterializedInfo>>;
 
   struct MaterializingInfo {
     AsynchronousSymbolQueryList PendingQueries;
@@ -618,7 +615,7 @@ private:
     bool IsEmitted = false;
   };
 
-  using MaterializingInfosMap = std::map<SymbolStringPtr, MaterializingInfo>;
+  using MaterializingInfosMap = DenseMap<SymbolStringPtr, MaterializingInfo>;
 
   using LookupImplActionFlags = enum {
     None = 0,
diff --git a/include/llvm/ExecutionEngine/Orc/SymbolStringPool.h b/include/llvm/ExecutionEngine/Orc/SymbolStringPool.h
index 4c45cfd199d..717076e2560 100644
--- a/include/llvm/ExecutionEngine/Orc/SymbolStringPool.h
+++ b/include/llvm/ExecutionEngine/Orc/SymbolStringPool.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_EXECUTIONENGINE_ORC_SYMBOLSTRINGPOOL_H
 #define LLVM_EXECUTIONENGINE_ORC_SYMBOLSTRINGPOOL_H
 
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/StringMap.h"
 #include <atomic>
 #include <mutex>
@@ -49,10 +50,13 @@ private:
 /// Pointer to a pooled string representing a symbol name.
 class SymbolStringPtr {
   friend class SymbolStringPool;
+  friend struct DenseMapInfo<SymbolStringPtr>;
   friend bool operator==(const SymbolStringPtr &LHS,
                          const SymbolStringPtr &RHS);
   friend bool operator<(const SymbolStringPtr &LHS, const SymbolStringPtr &RHS);
 
+  static SymbolStringPool::PoolMapEntry Tombstone;
+
 public:
   SymbolStringPtr() = default;
   SymbolStringPtr(const SymbolStringPtr &Other)
@@ -142,6 +146,29 @@ inline bool SymbolStringPool::empty() const {
 }
 
 } // end namespace orc
+
+template <>
+struct DenseMapInfo<orc::SymbolStringPtr> {
+
+  static orc::SymbolStringPtr getEmptyKey() {
+    return orc::SymbolStringPtr();
+  }
+
+  static orc::SymbolStringPtr getTombstoneKey() {
+    return orc::SymbolStringPtr(&orc::SymbolStringPtr::Tombstone);
+  }
+
+  static unsigned getHashValue(orc::SymbolStringPtr V) {
+    uintptr_t IV = reinterpret_cast<uintptr_t>(V.S);
+    return unsigned(IV) ^ unsigned(IV >> 9);
+  }
+
+  static bool isEqual(const orc::SymbolStringPtr &LHS,
+                      const orc::SymbolStringPtr &RHS) {
+    return LHS.S == RHS.S;
+  }
+};
+
 } // end namespace llvm
 
 #endif // LLVM_EXECUTIONENGINE_ORC_SYMBOLSTRINGPOOL_H
diff --git a/lib/ExecutionEngine/Orc/Core.cpp b/lib/ExecutionEngine/Orc/Core.cpp
index 3fa28a5af6f..d477ca523d8 100644
--- a/lib/ExecutionEngine/Orc/Core.cpp
+++ b/lib/ExecutionEngine/Orc/Core.cpp
@@ -134,6 +134,8 @@ struct PrintSymbolMapElemsMatchingCLOpts {
 namespace llvm {
 namespace orc {
 
+  SymbolStringPool::PoolMapEntry SymbolStringPtr::Tombstone(0);
+
 char FailedToMaterialize::ID = 0;
 char SymbolsNotFound::ID = 0;
 char SymbolsCouldNotBeRemoved::ID = 0;
@@ -575,20 +577,22 @@ void ReExportsMaterializationUnit::materialize(
     SymbolNameSet QuerySymbols;
     SymbolAliasMap QueryAliases;
 
-    for (auto I = RequestedAliases.begin(), E = RequestedAliases.end();
-         I != E;) {
-      auto Tmp = I++;
-
+    // Collect as many aliases as we can without including a chain.
+    for (auto &KV : RequestedAliases) {
       // Chain detected. Skip this symbol for this round.
-      if (&SrcJD == &TgtJD && (QueryAliases.count(Tmp->second.Aliasee) ||
-                               RequestedAliases.count(Tmp->second.Aliasee)))
+      if (&SrcJD == &TgtJD && (QueryAliases.count(KV.second.Aliasee) ||
+                               RequestedAliases.count(KV.second.Aliasee)))
         continue;
 
-      ResponsibilitySymbols.insert(Tmp->first);
-      QuerySymbols.insert(Tmp->second.Aliasee);
-      QueryAliases[Tmp->first] = std::move(Tmp->second);
-      RequestedAliases.erase(Tmp);
+      ResponsibilitySymbols.insert(KV.first);
+      QuerySymbols.insert(KV.second.Aliasee);
+      QueryAliases[KV.first] = std::move(KV.second);
     }
+
+    // Remove the aliases collected this round from the RequestedAliases map.
+    for (auto &KV : QueryAliases)
+      RequestedAliases.erase(KV.first);
+
     assert(!QuerySymbols.empty() && "Alias cycle detected!");
 
     auto QueryInfo = std::make_shared<OnResolveInfo>(
@@ -1172,10 +1176,9 @@ void JITDylib::lodgeQueryImpl(
     std::shared_ptr<AsynchronousSymbolQuery> &Q, SymbolNameSet &Unresolved,
     JITDylib *MatchNonExportedInJD, bool MatchNonExported,
     std::vector<std::unique_ptr<MaterializationUnit>> &MUs) {
-  for (auto I = Unresolved.begin(), E = Unresolved.end(); I != E;) {
-    auto TmpI = I++;
-    auto Name = *TmpI;
 
+  std::vector<SymbolStringPtr> ToRemove;
+  for (auto Name : Unresolved) {
     // Search for the name in Symbols. Skip it if not found.
     auto SymI = Symbols.find(Name);
     if (SymI == Symbols.end())
@@ -1188,9 +1191,9 @@ void JITDylib::lodgeQueryImpl(
       if (!MatchNonExported && MatchNonExportedInJD != this)
         continue;
 
-    // If we matched against Name in JD, remove it frome the Unresolved set and
-    // add it to the added set.
-    Unresolved.erase(TmpI);
+    // If we matched against Name in JD, mark it to be removed from the Unresolved
+    // set.
+    ToRemove.push_back(Name);
 
     // If the symbol has an address then resolve it.
     if (SymI->second.getAddress() != 0)
@@ -1235,6 +1238,10 @@ void JITDylib::lodgeQueryImpl(
     MI.PendingQueries.push_back(Q);
     Q->addQueryDependence(*this, Name);
   }
+
+  // Remove any symbols that we found.
+  for (auto &Name : ToRemove)
+    Unresolved.erase(Name);
 }
 
 SymbolNameSet JITDylib::legacyLookup(std::shared_ptr<AsynchronousSymbolQuery> Q,
@@ -1294,19 +1301,17 @@ JITDylib::lookupImpl(std::shared_ptr<AsynchronousSymbolQuery> &Q,
                      std::vector<std::unique_ptr<MaterializationUnit>> &MUs,
                      SymbolNameSet &Unresolved) {
   LookupImplActionFlags ActionFlags = None;
+  std::vector<SymbolStringPtr> ToRemove;
 
-  for (auto I = Unresolved.begin(), E = Unresolved.end(); I != E;) {
-    auto TmpI = I++;
-    auto Name = *TmpI;
+  for (auto Name : Unresolved) {
 
     // Search for the name in Symbols. Skip it if not found.
     auto SymI = Symbols.find(Name);
     if (SymI == Symbols.end())
       continue;
 
-    // If we found Name, remove it frome the Unresolved set and add it
-    // to the dependencies set.
-    Unresolved.erase(TmpI);
+    // If we found Name, mark it to be removed from the Unresolved set.
+    ToRemove.push_back(Name);
 
     // If the symbol has an address then resolve it.
     if (SymI->second.getAddress() != 0) {
@@ -1357,6 +1362,10 @@ JITDylib::lookupImpl(std::shared_ptr<AsynchronousSymbolQuery> &Q,
     Q->addQueryDependence(*this, Name);
   }
 
+  // Remove any marked symbols from the Unresolved set.
+  for (auto &Name : ToRemove)
+    Unresolved.erase(Name);
+
   return ActionFlags;
 }
 
diff --git a/lib/ExecutionEngine/Orc/ExecutionUtils.cpp b/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
index 667237373ca..4c8f725df54 100644
--- a/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
+++ b/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
@@ -167,13 +167,13 @@ int LocalCXXRuntimeOverridesBase::CXAAtExitOverride(DestructorPtr Destructor,
 
 Error LocalCXXRuntimeOverrides2::enable(JITDylib &JD,
                                         MangleAndInterner &Mangle) {
-  SymbolMap RuntimeInterposes(
-      {{Mangle("__dso_handle"),
-        JITEvaluatedSymbol(toTargetAddress(&DSOHandleOverride),
-                           JITSymbolFlags::Exported)},
-       {Mangle("__cxa_atexit"),
-        JITEvaluatedSymbol(toTargetAddress(&CXAAtExitOverride),
-                           JITSymbolFlags::Exported)}});
+  SymbolMap RuntimeInterposes;
+  RuntimeInterposes[Mangle("__dso_handle")] =
+    JITEvaluatedSymbol(toTargetAddress(&DSOHandleOverride),
+                       JITSymbolFlags::Exported);
+  RuntimeInterposes[Mangle("__cxa_atexit")] =
+    JITEvaluatedSymbol(toTargetAddress(&CXAAtExitOverride),
+                       JITSymbolFlags::Exported);
 
   return JD.define(absoluteSymbols(std::move(RuntimeInterposes)));
 }
-- 
GitLab


From 6712561e1900fcf553d66bb73f90759319ef17f8 Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Mon, 15 Oct 2018 22:27:03 +0000
Subject: [PATCH 0224/1116] Change a TerminatorInst* to an Instruction* in
 HotColdSplitting.cpp.

r344558 added an assignment to a TerminatorInst* from
BasicBlock::getTerminatorInst(), but BasicBlock::getTerminatorInst() returns an
Instruction* rather than a TerminatorInst* since r344504 so this fails to
compile.

Changing the variable to an Instruction* should get the bots building again.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344566 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/IPO/HotColdSplitting.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/Transforms/IPO/HotColdSplitting.cpp b/lib/Transforms/IPO/HotColdSplitting.cpp
index fcea40dffd7..be4da249955 100644
--- a/lib/Transforms/IPO/HotColdSplitting.cpp
+++ b/lib/Transforms/IPO/HotColdSplitting.cpp
@@ -150,7 +150,7 @@ static bool unlikelyExecuted(const BasicBlock &BB) {
 }
 
 static bool returnsOrHasSideEffects(const BasicBlock &BB) {
-  const TerminatorInst *I = BB.getTerminator();
+  const Instruction *I = BB.getTerminator();
   if (isa<ReturnInst>(I) || isa<IndirectBrInst>(I) || isa<InvokeInst>(I))
     return true;
 
-- 
GitLab


From 4812114f295f744ad7b77d4006b8bcdeaa46c1f2 Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Mon, 15 Oct 2018 22:36:22 +0000
Subject: [PATCH 0225/1116] [ORC] Rename MultiThreadedSimpleCompiler to
 ConcurrentIRCompiler.

The new name is a better fit: This class does not actually spawn any new
threads for compilation, it is just safe to call from multiple threads
concurrently.

The "Simple" part of the name did not convey much either, so it was
dropped.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344567 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/ExecutionEngine/Orc/CompileUtils.h | 8 ++++----
 lib/ExecutionEngine/Orc/LLJIT.cpp               | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/llvm/ExecutionEngine/Orc/CompileUtils.h b/include/llvm/ExecutionEngine/Orc/CompileUtils.h
index 3d02f9d05e4..f34f88311ba 100644
--- a/include/llvm/ExecutionEngine/Orc/CompileUtils.h
+++ b/include/llvm/ExecutionEngine/Orc/CompileUtils.h
@@ -38,7 +38,7 @@ namespace orc {
 
 /// Simple compile functor: Takes a single IR module and returns an ObjectFile.
 /// This compiler supports a single compilation thread and LLVMContext only.
-/// For multithreaded compilation, use MultiThreadedSimpleCompiler below.
+/// For multithreaded compilation, use ConcurrentIRCompiler below.
 class SimpleCompiler {
 public:
   using CompileResult = std::unique_ptr<MemoryBuffer>;
@@ -105,10 +105,10 @@ private:
 ///
 /// This class creates a new TargetMachine and SimpleCompiler instance for each
 /// compile.
-class MultiThreadedSimpleCompiler {
+class ConcurrentIRCompiler {
 public:
-  MultiThreadedSimpleCompiler(JITTargetMachineBuilder JTMB,
-                              ObjectCache *ObjCache = nullptr)
+  ConcurrentIRCompiler(JITTargetMachineBuilder JTMB,
+                       ObjectCache *ObjCache = nullptr)
       : JTMB(std::move(JTMB)), ObjCache(ObjCache) {}
 
   void setObjectCache(ObjectCache *ObjCache) { this->ObjCache = ObjCache; }
diff --git a/lib/ExecutionEngine/Orc/LLJIT.cpp b/lib/ExecutionEngine/Orc/LLJIT.cpp
index 39bb4c48067..478ac2e2148 100644
--- a/lib/ExecutionEngine/Orc/LLJIT.cpp
+++ b/lib/ExecutionEngine/Orc/LLJIT.cpp
@@ -96,7 +96,7 @@ LLJIT::LLJIT(std::unique_ptr<ExecutionSession> ES, JITTargetMachineBuilder JTMB,
       ObjLinkingLayer(*this->ES,
                       [this](VModuleKey K) { return getMemoryManager(K); }),
       CompileLayer(*this->ES, ObjLinkingLayer,
-                   MultiThreadedSimpleCompiler(std::move(JTMB))),
+                   ConcurrentIRCompiler(std::move(JTMB))),
       CtorRunner(Main), DtorRunner(Main) {
   assert(NumCompileThreads != 0 &&
          "Multithreaded LLJIT instance can not be created with 0 threads");
-- 
GitLab


From 01e314f12f6e0a8cbc69995cdceb6e0480df189a Mon Sep 17 00:00:00 2001
From: Chris Bieneman <chris.bieneman@me.com>
Date: Mon, 15 Oct 2018 22:36:59 +0000
Subject: [PATCH 0226/1116] [CMake] Fix a missing LLVM_ENABLE_IDE from r344555

This is just one place I missed swapping CMAKE_CONFIGURATION_TYPES with LLVM_ENABLE_IDE.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344568 91177308-0d34-0410-b5e6-96231b3b80d8
---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 374bddbec2d..c189bd875b4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -995,7 +995,7 @@ if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY)
     list(REMOVE_DUPLICATES LLVM_LIBS)
     foreach(lib ${LLVM_LIBS})
       add_dependencies(llvm-libraries ${lib})
-      if (NOT CMAKE_CONFIGURATION_TYPES)
+      if (NOT LLVM_ENABLE_IDE)
         add_dependencies(install-llvm-libraries install-${lib})
       endif()
     endforeach()
-- 
GitLab


From 270bd836f891657ed003961b1ae43da950d531cb Mon Sep 17 00:00:00 2001
From: Nicolai Haehnle <nhaehnle@gmail.com>
Date: Mon, 15 Oct 2018 22:37:46 +0000
Subject: [PATCH 0227/1116] StructurizeCFG,AMDGPU: Test case of a redundant phi
 and codegen consequences

Change-Id: I9681f9e41ca30f82576f3d1f965c3a550a34b171

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344569 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/AMDGPU/smrd.ll                   | 34 ++++++++++++++
 .../StructurizeCFG/loop-continue-phi.ll       | 45 +++++++++++++++++++
 2 files changed, 79 insertions(+)
 create mode 100644 test/Transforms/StructurizeCFG/loop-continue-phi.ll

diff --git a/test/CodeGen/AMDGPU/smrd.ll b/test/CodeGen/AMDGPU/smrd.ll
index 6596119f8b3..b4220c25f00 100644
--- a/test/CodeGen/AMDGPU/smrd.ll
+++ b/test/CodeGen/AMDGPU/smrd.ll
@@ -535,6 +535,40 @@ exit:
 }
 
 
+; GCN-LABEL: {{^}}smrd_uniform_loop2:
+; (this test differs from smrd_uniform_loop by the more complex structure of phis,
+; which currently confuses the DivergenceAnalysis after structurization)
+;
+; TODO: this should use an s_buffer_load
+;
+; GCN: buffer_load_dword
+define amdgpu_ps float @smrd_uniform_loop2(<4 x i32> inreg %desc, i32 %bound, i32 %bound.a) #0 {
+main_body:
+  br label %loop
+
+loop:
+  %counter = phi i32 [ 0, %main_body ], [ %counter.next, %loop.a ], [ %counter.next, %loop.b ]
+  %sum = phi float [ 0.0, %main_body ], [ %sum.next, %loop.a ], [ %sum.next.b, %loop.b ]
+  %offset = shl i32 %counter, 2
+  %v = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %offset)
+  %sum.next = fadd float %sum, %v
+  %counter.next = add i32 %counter, 1
+  %cc = icmp uge i32 %counter.next, %bound
+  br i1 %cc, label %exit, label %loop.a
+
+loop.a:
+  %cc.a = icmp uge i32 %counter.next, %bound.a
+  br i1 %cc, label %loop, label %loop.b
+
+loop.b:
+  %sum.next.b = fadd float %sum.next, 1.0
+  br label %loop
+
+exit:
+  ret float %sum.next
+}
+
+
 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
 declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1
 declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2
diff --git a/test/Transforms/StructurizeCFG/loop-continue-phi.ll b/test/Transforms/StructurizeCFG/loop-continue-phi.ll
new file mode 100644
index 00000000000..7e1c0b9413f
--- /dev/null
+++ b/test/Transforms/StructurizeCFG/loop-continue-phi.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -o - -structurizecfg < %s | FileCheck %s
+
+;
+; TODO: eliminate redundant phis for the loop counter
+;
+define void @test1() {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       Flow:
+; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[CTR_NEXT:%.*]], [[LOOP_B:%.*]] ], [ [[CTR_NEXT]], [[LOOP_A:%.*]] ]
+; CHECK-NEXT:    br label [[FLOW1:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[CTR:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP1:%.*]], [[FLOW1]] ]
+; CHECK-NEXT:    [[CTR_NEXT]] = add i32 [[CTR]], 1
+; CHECK-NEXT:    br i1 undef, label [[LOOP_A]], label [[FLOW1]]
+; CHECK:       loop.a:
+; CHECK-NEXT:    br i1 undef, label [[LOOP_B]], label [[FLOW:%.*]]
+; CHECK:       loop.b:
+; CHECK-NEXT:    br label [[FLOW]]
+; CHECK:       Flow1:
+; CHECK-NEXT:    [[TMP1]] = phi i32 [ [[TMP0]], [[FLOW]] ], [ undef, [[LOOP]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i1 [ false, [[FLOW]] ], [ true, [[LOOP]] ]
+; CHECK-NEXT:    br i1 [[TMP2]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %ctr = phi i32 [ 0, %entry ], [ %ctr.next, %loop.a ], [ %ctr.next, %loop.b ]
+  %ctr.next = add i32 %ctr, 1
+  br i1 undef, label %exit, label %loop.a
+
+loop.a:
+  br i1 undef, label %loop, label %loop.b
+
+loop.b:
+  br label %loop
+
+exit:
+  ret void
+}
-- 
GitLab


From 582b11962408a54fc88125ba5c75f7470998fe51 Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Mon, 15 Oct 2018 22:56:10 +0000
Subject: [PATCH 0228/1116] [ORC] Rename ORC layers to make the "new" ORC
 layers the default.

This commit adds a 'Legacy' prefix to old ORC layers and utilities, and removes
the '2' suffix from the new ORC layers. If you wish to continue using the old
ORC layers you will need to add a 'Legacy' prefix to your classes. If you were
already using the new ORC layers you will need to drop the '2' suffix.

The legacy layers will remain in-tree until the new layers reach feature
parity with them. This will involve adding support for removing code from the
new layers, and ensuring that performance is comperable.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344572 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../BuildingAJIT/Chapter1/KaleidoscopeJIT.h   |   6 +-
 .../BuildingAJIT/Chapter2/KaleidoscopeJIT.h   |   8 +-
 .../BuildingAJIT/Chapter3/KaleidoscopeJIT.h   |  10 +-
 .../BuildingAJIT/Chapter4/KaleidoscopeJIT.h   |   8 +-
 .../BuildingAJIT/Chapter5/KaleidoscopeJIT.h   |   8 +-
 .../Kaleidoscope/include/KaleidoscopeJIT.h    |   4 +-
 .../Orc/CompileOnDemandLayer.h                |  24 +-
 .../llvm/ExecutionEngine/Orc/ExecutionUtils.h |  14 +-
 .../llvm/ExecutionEngine/Orc/IRCompileLayer.h |  12 +-
 .../ExecutionEngine/Orc/IRTransformLayer.h    |  12 +-
 include/llvm/ExecutionEngine/Orc/LLJIT.h      |  16 +-
 .../Orc/ObjectTransformLayer.h                |  12 +-
 .../Orc/RTDyldObjectLinkingLayer.h            |  26 +-
 .../Orc/CompileOnDemandLayer.cpp              |  30 +-
 lib/ExecutionEngine/Orc/ExecutionUtils.cpp    |   6 +-
 lib/ExecutionEngine/Orc/IRCompileLayer.cpp    |   6 +-
 lib/ExecutionEngine/Orc/IRTransformLayer.cpp  |   4 +-
 lib/ExecutionEngine/Orc/LLJIT.cpp             |   2 +-
 .../Orc/ObjectTransformLayer.cpp              |  10 +-
 lib/ExecutionEngine/Orc/OrcCBindingsStack.h   |  16 +-
 .../Orc/OrcMCJITReplacement.cpp               |   2 +-
 lib/ExecutionEngine/Orc/OrcMCJITReplacement.h |   4 +-
 .../Orc/RTDyldObjectLinkingLayer.cpp          |   8 +-
 tools/lli/lli.cpp                             |   6 +-
 unittests/ExecutionEngine/Orc/CMakeLists.txt  |   4 +-
 ...cpp => LegacyCompileOnDemandLayerTest.cpp} |   4 +-
 .../LegacyRTDyldObjectLinkingLayerTest.cpp    | 282 +++++++++++++++
 .../Orc/ObjectTransformLayerTest.cpp          |  24 +-
 .../Orc/RTDyldObjectLinkingLayer2Test.cpp     | 228 ------------
 .../Orc/RTDyldObjectLinkingLayerTest.cpp      | 332 ++++++++----------
 30 files changed, 564 insertions(+), 564 deletions(-)
 rename unittests/ExecutionEngine/Orc/{CompileOnDemandLayerTest.cpp => LegacyCompileOnDemandLayerTest.cpp} (95%)
 create mode 100644 unittests/ExecutionEngine/Orc/LegacyRTDyldObjectLinkingLayerTest.cpp
 delete mode 100644 unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayer2Test.cpp

diff --git a/examples/Kaleidoscope/BuildingAJIT/Chapter1/KaleidoscopeJIT.h b/examples/Kaleidoscope/BuildingAJIT/Chapter1/KaleidoscopeJIT.h
index 0b8bb381d08..8c1af40be15 100644
--- a/examples/Kaleidoscope/BuildingAJIT/Chapter1/KaleidoscopeJIT.h
+++ b/examples/Kaleidoscope/BuildingAJIT/Chapter1/KaleidoscopeJIT.h
@@ -42,8 +42,8 @@ private:
   std::shared_ptr<SymbolResolver> Resolver;
   std::unique_ptr<TargetMachine> TM;
   const DataLayout DL;
-  RTDyldObjectLinkingLayer ObjectLayer;
-  IRCompileLayer<decltype(ObjectLayer), SimpleCompiler> CompileLayer;
+  LegacyRTDyldObjectLinkingLayer ObjectLayer;
+  LegacyIRCompileLayer<decltype(ObjectLayer), SimpleCompiler> CompileLayer;
 
 public:
   KaleidoscopeJIT()
@@ -63,7 +63,7 @@ public:
         TM(EngineBuilder().selectTarget()), DL(TM->createDataLayout()),
         ObjectLayer(ES,
                     [this](VModuleKey) {
-                      return RTDyldObjectLinkingLayer::Resources{
+                      return LegacyRTDyldObjectLinkingLayer::Resources{
                           std::make_shared<SectionMemoryManager>(), Resolver};
                     }),
         CompileLayer(ObjectLayer, SimpleCompiler(*TM)) {
diff --git a/examples/Kaleidoscope/BuildingAJIT/Chapter2/KaleidoscopeJIT.h b/examples/Kaleidoscope/BuildingAJIT/Chapter2/KaleidoscopeJIT.h
index 9ea84d1a858..7c803b138c0 100644
--- a/examples/Kaleidoscope/BuildingAJIT/Chapter2/KaleidoscopeJIT.h
+++ b/examples/Kaleidoscope/BuildingAJIT/Chapter2/KaleidoscopeJIT.h
@@ -47,13 +47,13 @@ private:
   std::shared_ptr<SymbolResolver> Resolver;
   std::unique_ptr<TargetMachine> TM;
   const DataLayout DL;
-  RTDyldObjectLinkingLayer ObjectLayer;
-  IRCompileLayer<decltype(ObjectLayer), SimpleCompiler> CompileLayer;
+  LegacyRTDyldObjectLinkingLayer ObjectLayer;
+  LegacyIRCompileLayer<decltype(ObjectLayer), SimpleCompiler> CompileLayer;
 
   using OptimizeFunction =
       std::function<std::unique_ptr<Module>(std::unique_ptr<Module>)>;
 
-  IRTransformLayer<decltype(CompileLayer), OptimizeFunction> OptimizeLayer;
+  LegacyIRTransformLayer<decltype(CompileLayer), OptimizeFunction> OptimizeLayer;
 
 public:
   KaleidoscopeJIT()
@@ -73,7 +73,7 @@ public:
         TM(EngineBuilder().selectTarget()), DL(TM->createDataLayout()),
         ObjectLayer(ES,
                     [this](VModuleKey) {
-                      return RTDyldObjectLinkingLayer::Resources{
+                      return LegacyRTDyldObjectLinkingLayer::Resources{
                           std::make_shared<SectionMemoryManager>(), Resolver};
                     }),
         CompileLayer(ObjectLayer, SimpleCompiler(*TM)),
diff --git a/examples/Kaleidoscope/BuildingAJIT/Chapter3/KaleidoscopeJIT.h b/examples/Kaleidoscope/BuildingAJIT/Chapter3/KaleidoscopeJIT.h
index 80c39bd70f7..ce0111d2f6b 100644
--- a/examples/Kaleidoscope/BuildingAJIT/Chapter3/KaleidoscopeJIT.h
+++ b/examples/Kaleidoscope/BuildingAJIT/Chapter3/KaleidoscopeJIT.h
@@ -51,23 +51,23 @@ private:
   std::map<VModuleKey, std::shared_ptr<SymbolResolver>> Resolvers;
   std::unique_ptr<TargetMachine> TM;
   const DataLayout DL;
-  RTDyldObjectLinkingLayer ObjectLayer;
-  IRCompileLayer<decltype(ObjectLayer), SimpleCompiler> CompileLayer;
+  LegacyRTDyldObjectLinkingLayer ObjectLayer;
+  LegacyIRCompileLayer<decltype(ObjectLayer), SimpleCompiler> CompileLayer;
 
   using OptimizeFunction =
       std::function<std::unique_ptr<Module>(std::unique_ptr<Module>)>;
 
-  IRTransformLayer<decltype(CompileLayer), OptimizeFunction> OptimizeLayer;
+  LegacyIRTransformLayer<decltype(CompileLayer), OptimizeFunction> OptimizeLayer;
 
   std::unique_ptr<JITCompileCallbackManager> CompileCallbackManager;
-  CompileOnDemandLayer<decltype(OptimizeLayer)> CODLayer;
+  LegacyCompileOnDemandLayer<decltype(OptimizeLayer)> CODLayer;
 
 public:
   KaleidoscopeJIT()
       : TM(EngineBuilder().selectTarget()), DL(TM->createDataLayout()),
         ObjectLayer(ES,
                     [this](VModuleKey K) {
-                      return RTDyldObjectLinkingLayer::Resources{
+                      return LegacyRTDyldObjectLinkingLayer::Resources{
                           std::make_shared<SectionMemoryManager>(),
                           Resolvers[K]};
                     }),
diff --git a/examples/Kaleidoscope/BuildingAJIT/Chapter4/KaleidoscopeJIT.h b/examples/Kaleidoscope/BuildingAJIT/Chapter4/KaleidoscopeJIT.h
index 04ad86e34bf..ffca65fbcd4 100644
--- a/examples/Kaleidoscope/BuildingAJIT/Chapter4/KaleidoscopeJIT.h
+++ b/examples/Kaleidoscope/BuildingAJIT/Chapter4/KaleidoscopeJIT.h
@@ -77,13 +77,13 @@ private:
   std::shared_ptr<SymbolResolver> Resolver;
   std::unique_ptr<TargetMachine> TM;
   const DataLayout DL;
-  RTDyldObjectLinkingLayer ObjectLayer;
-  IRCompileLayer<decltype(ObjectLayer), SimpleCompiler> CompileLayer;
+  LegacyRTDyldObjectLinkingLayer ObjectLayer;
+  LegacyIRCompileLayer<decltype(ObjectLayer), SimpleCompiler> CompileLayer;
 
   using OptimizeFunction =
       std::function<std::unique_ptr<Module>(std::unique_ptr<Module>)>;
 
-  IRTransformLayer<decltype(CompileLayer), OptimizeFunction> OptimizeLayer;
+  LegacyIRTransformLayer<decltype(CompileLayer), OptimizeFunction> OptimizeLayer;
 
   std::unique_ptr<JITCompileCallbackManager> CompileCallbackMgr;
   std::unique_ptr<IndirectStubsManager> IndirectStubsMgr;
@@ -108,7 +108,7 @@ public:
         TM(EngineBuilder().selectTarget()), DL(TM->createDataLayout()),
         ObjectLayer(ES,
                     [this](VModuleKey K) {
-                      return RTDyldObjectLinkingLayer::Resources{
+                      return LegacyRTDyldObjectLinkingLayer::Resources{
                           std::make_shared<SectionMemoryManager>(), Resolver};
                     }),
         CompileLayer(ObjectLayer, SimpleCompiler(*TM)),
diff --git a/examples/Kaleidoscope/BuildingAJIT/Chapter5/KaleidoscopeJIT.h b/examples/Kaleidoscope/BuildingAJIT/Chapter5/KaleidoscopeJIT.h
index 010f5436377..f1ae5b02289 100644
--- a/examples/Kaleidoscope/BuildingAJIT/Chapter5/KaleidoscopeJIT.h
+++ b/examples/Kaleidoscope/BuildingAJIT/Chapter5/KaleidoscopeJIT.h
@@ -82,13 +82,13 @@ private:
   std::shared_ptr<SymbolResolver> Resolver;
   std::unique_ptr<TargetMachine> TM;
   const DataLayout DL;
-  RTDyldObjectLinkingLayer ObjectLayer;
-  IRCompileLayer<decltype(ObjectLayer), SimpleCompiler> CompileLayer;
+  LegacyRTDyldObjectLinkingLayer ObjectLayer;
+  LegacyIRCompileLayer<decltype(ObjectLayer), SimpleCompiler> CompileLayer;
 
   using OptimizeFunction =
       std::function<std::unique_ptr<Module>(std::unique_ptr<Module>)>;
 
-  IRTransformLayer<decltype(CompileLayer), OptimizeFunction> OptimizeLayer;
+  LegacyIRTransformLayer<decltype(CompileLayer), OptimizeFunction> OptimizeLayer;
 
   JITCompileCallbackManager *CompileCallbackMgr;
   std::unique_ptr<IndirectStubsManager> IndirectStubsMgr;
@@ -116,7 +116,7 @@ public:
         DL(TM->createDataLayout()),
         ObjectLayer(ES,
                     [this](VModuleKey K) {
-                      return RTDyldObjectLinkingLayer::Resources{
+                      return LegacyRTDyldObjectLinkingLayer::Resources{
                           cantFail(this->Remote.createRemoteMemoryManager()),
                           Resolver};
                     }),
diff --git a/examples/Kaleidoscope/include/KaleidoscopeJIT.h b/examples/Kaleidoscope/include/KaleidoscopeJIT.h
index 7239aea7ba1..972773a64f7 100644
--- a/examples/Kaleidoscope/include/KaleidoscopeJIT.h
+++ b/examples/Kaleidoscope/include/KaleidoscopeJIT.h
@@ -40,8 +40,8 @@ namespace orc {
 
 class KaleidoscopeJIT {
 public:
-  using ObjLayerT = RTDyldObjectLinkingLayer;
-  using CompileLayerT = IRCompileLayer<ObjLayerT, SimpleCompiler>;
+  using ObjLayerT = LegacyRTDyldObjectLinkingLayer;
+  using CompileLayerT = LegacyIRCompileLayer<ObjLayerT, SimpleCompiler>;
 
   KaleidoscopeJIT()
       : Resolver(createLegacyLookupResolver(
diff --git a/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h b/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h
index 2003f8e43b8..7721f74fe0c 100644
--- a/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h
+++ b/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h
@@ -62,7 +62,7 @@ namespace orc {
 
 class ExtractingIRMaterializationUnit;
 
-class CompileOnDemandLayer2 : public IRLayer {
+class CompileOnDemandLayer : public IRLayer {
   friend class PartitioningIRMaterializationUnit;
 
 public:
@@ -84,8 +84,8 @@ public:
   /// symbol in them is requested.
   static Optional<GlobalValueSet> compileWholeModule(GlobalValueSet Requested);
 
-  /// Construct a CompileOnDemandLayer2.
-  CompileOnDemandLayer2(ExecutionSession &ES, IRLayer &BaseLayer,
+  /// Construct a CompileOnDemandLayer.
+  CompileOnDemandLayer(ExecutionSession &ES, IRLayer &BaseLayer,
                         LazyCallThroughManager &LCTMgr,
                         IndirectStubsManagerBuilder BuildIndirectStubsManager);
 
@@ -142,7 +142,7 @@ private:
 template <typename BaseLayerT,
           typename CompileCallbackMgrT = JITCompileCallbackManager,
           typename IndirectStubsMgrT = IndirectStubsManager>
-class CompileOnDemandLayer {
+class LegacyCompileOnDemandLayer {
 private:
   template <typename MaterializerFtor>
   class LambdaMaterializer final : public ValueMaterializer {
@@ -266,13 +266,13 @@ public:
       std::function<void(VModuleKey K, std::shared_ptr<SymbolResolver> R)>;
 
   /// Construct a compile-on-demand layer instance.
-  CompileOnDemandLayer(ExecutionSession &ES, BaseLayerT &BaseLayer,
-                       SymbolResolverGetter GetSymbolResolver,
-                       SymbolResolverSetter SetSymbolResolver,
-                       PartitioningFtor Partition,
-                       CompileCallbackMgrT &CallbackMgr,
-                       IndirectStubsManagerBuilderT CreateIndirectStubsManager,
-                       bool CloneStubsIntoPartitions = true)
+  LegacyCompileOnDemandLayer(ExecutionSession &ES, BaseLayerT &BaseLayer,
+                             SymbolResolverGetter GetSymbolResolver,
+                             SymbolResolverSetter SetSymbolResolver,
+                             PartitioningFtor Partition,
+                             CompileCallbackMgrT &CallbackMgr,
+                             IndirectStubsManagerBuilderT CreateIndirectStubsManager,
+                             bool CloneStubsIntoPartitions = true)
       : ES(ES), BaseLayer(BaseLayer),
         GetSymbolResolver(std::move(GetSymbolResolver)),
         SetSymbolResolver(std::move(SetSymbolResolver)),
@@ -280,7 +280,7 @@ public:
         CreateIndirectStubsManager(std::move(CreateIndirectStubsManager)),
         CloneStubsIntoPartitions(CloneStubsIntoPartitions) {}
 
-  ~CompileOnDemandLayer() {
+  ~LegacyCompileOnDemandLayer() {
     // FIXME: Report error on log.
     while (!LogicalDylibs.empty())
       consumeError(removeModule(LogicalDylibs.begin()->first));
diff --git a/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h b/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h
index 662ed7b78e4..88559f822e5 100644
--- a/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h
+++ b/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h
@@ -94,11 +94,11 @@ iterator_range<CtorDtorIterator> getDestructors(const Module &M);
 /// Convenience class for recording constructor/destructor names for
 ///        later execution.
 template <typename JITLayerT>
-class CtorDtorRunner {
+class LegacyCtorDtorRunner {
 public:
   /// Construct a CtorDtorRunner for the given range using the given
   ///        name mangling function.
-  CtorDtorRunner(std::vector<std::string> CtorDtorNames, VModuleKey K)
+  LegacyCtorDtorRunner(std::vector<std::string> CtorDtorNames, VModuleKey K)
       : CtorDtorNames(std::move(CtorDtorNames)), K(K) {}
 
   /// Run the recorded constructors/destructors through the given JIT
@@ -129,9 +129,9 @@ private:
   orc::VModuleKey K;
 };
 
-class CtorDtorRunner2 {
+class CtorDtorRunner {
 public:
-  CtorDtorRunner2(JITDylib &JD) : JD(JD) {}
+  CtorDtorRunner(JITDylib &JD) : JD(JD) {}
   void add(iterator_range<CtorDtorIterator> CtorDtors);
   Error run();
 
@@ -177,11 +177,11 @@ protected:
                                void *DSOHandle);
 };
 
-class LocalCXXRuntimeOverrides : public LocalCXXRuntimeOverridesBase {
+class LegacyLocalCXXRuntimeOverrides : public LocalCXXRuntimeOverridesBase {
 public:
   /// Create a runtime-overrides class.
   template <typename MangleFtorT>
-  LocalCXXRuntimeOverrides(const MangleFtorT &Mangle) {
+  LegacyLocalCXXRuntimeOverrides(const MangleFtorT &Mangle) {
     addOverride(Mangle("__dso_handle"), toTargetAddress(&DSOHandleOverride));
     addOverride(Mangle("__cxa_atexit"), toTargetAddress(&CXAAtExitOverride));
   }
@@ -202,7 +202,7 @@ private:
   StringMap<JITTargetAddress> CXXRuntimeOverrides;
 };
 
-class LocalCXXRuntimeOverrides2 : public LocalCXXRuntimeOverridesBase {
+class LocalCXXRuntimeOverrides : public LocalCXXRuntimeOverridesBase {
 public:
   Error enable(JITDylib &JD, MangleAndInterner &Mangler);
 };
diff --git a/include/llvm/ExecutionEngine/Orc/IRCompileLayer.h b/include/llvm/ExecutionEngine/Orc/IRCompileLayer.h
index cb8df26bfdc..a62d8be2fa6 100644
--- a/include/llvm/ExecutionEngine/Orc/IRCompileLayer.h
+++ b/include/llvm/ExecutionEngine/Orc/IRCompileLayer.h
@@ -28,7 +28,7 @@ class Module;
 
 namespace orc {
 
-class IRCompileLayer2 : public IRLayer {
+class IRCompileLayer : public IRLayer {
 public:
   using CompileFunction =
       std::function<Expected<std::unique_ptr<MemoryBuffer>>(Module &)>;
@@ -36,8 +36,8 @@ public:
   using NotifyCompiledFunction =
       std::function<void(VModuleKey K, ThreadSafeModule TSM)>;
 
-  IRCompileLayer2(ExecutionSession &ES, ObjectLayer &BaseLayer,
-                  CompileFunction Compile);
+  IRCompileLayer(ExecutionSession &ES, ObjectLayer &BaseLayer,
+                 CompileFunction Compile);
 
   void setNotifyCompiled(NotifyCompiledFunction NotifyCompiled);
 
@@ -57,15 +57,15 @@ private:
 /// object file and adds this module file to the layer below, which must
 /// implement the object layer concept.
 template <typename BaseLayerT, typename CompileFtor>
-class IRCompileLayer {
+class LegacyIRCompileLayer {
 public:
   /// Callback type for notifications when modules are compiled.
   using NotifyCompiledCallback =
       std::function<void(VModuleKey K, std::unique_ptr<Module>)>;
 
-  /// Construct an IRCompileLayer with the given BaseLayer, which must
+  /// Construct an LegacyIRCompileLayer with the given BaseLayer, which must
   ///        implement the ObjectLayer concept.
-  IRCompileLayer(
+  LegacyIRCompileLayer(
       BaseLayerT &BaseLayer, CompileFtor Compile,
       NotifyCompiledCallback NotifyCompiled = NotifyCompiledCallback())
       : BaseLayer(BaseLayer), Compile(std::move(Compile)),
diff --git a/include/llvm/ExecutionEngine/Orc/IRTransformLayer.h b/include/llvm/ExecutionEngine/Orc/IRTransformLayer.h
index d5f91cef359..55a1ce4c930 100644
--- a/include/llvm/ExecutionEngine/Orc/IRTransformLayer.h
+++ b/include/llvm/ExecutionEngine/Orc/IRTransformLayer.h
@@ -23,13 +23,13 @@ namespace llvm {
 class Module;
 namespace orc {
 
-class IRTransformLayer2 : public IRLayer {
+class IRTransformLayer : public IRLayer {
 public:
   using TransformFunction = std::function<Expected<ThreadSafeModule>(
       ThreadSafeModule, const MaterializationResponsibility &R)>;
 
-  IRTransformLayer2(ExecutionSession &ES, IRLayer &BaseLayer,
-                    TransformFunction Transform = identityTransform);
+  IRTransformLayer(ExecutionSession &ES, IRLayer &BaseLayer,
+                   TransformFunction Transform = identityTransform);
 
   void setTransform(TransformFunction Transform) {
     this->Transform = std::move(Transform);
@@ -54,11 +54,11 @@ private:
 ///   This layer applies a user supplied transform to each module that is added,
 /// then adds the transformed module to the layer below.
 template <typename BaseLayerT, typename TransformFtor>
-class IRTransformLayer {
+class LegacyIRTransformLayer {
 public:
 
-  /// Construct an IRTransformLayer with the given BaseLayer
-  IRTransformLayer(BaseLayerT &BaseLayer,
+  /// Construct an LegacyIRTransformLayer with the given BaseLayer
+  LegacyIRTransformLayer(BaseLayerT &BaseLayer,
                    TransformFtor Transform = TransformFtor())
     : BaseLayer(BaseLayer), Transform(std::move(Transform)) {}
 
diff --git a/include/llvm/ExecutionEngine/Orc/LLJIT.h b/include/llvm/ExecutionEngine/Orc/LLJIT.h
index 400d4cbe7f0..05a566fedb6 100644
--- a/include/llvm/ExecutionEngine/Orc/LLJIT.h
+++ b/include/llvm/ExecutionEngine/Orc/LLJIT.h
@@ -99,7 +99,7 @@ public:
   Error runDestructors() { return DtorRunner.run(); }
 
   /// Returns a reference to the ObjLinkingLayer
-  RTDyldObjectLinkingLayer2 &getObjLinkingLayer() { return ObjLinkingLayer; }
+  RTDyldObjectLinkingLayer &getObjLinkingLayer() { return ObjLinkingLayer; }
 
 protected:
 
@@ -125,10 +125,10 @@ protected:
   DataLayout DL;
   std::unique_ptr<ThreadPool> CompileThreads;
 
-  RTDyldObjectLinkingLayer2 ObjLinkingLayer;
-  IRCompileLayer2 CompileLayer;
+  RTDyldObjectLinkingLayer ObjLinkingLayer;
+  IRCompileLayer CompileLayer;
 
-  CtorDtorRunner2 CtorRunner, DtorRunner;
+  CtorDtorRunner CtorRunner, DtorRunner;
 };
 
 /// An extended version of LLJIT that supports lazy function-at-a-time
@@ -145,13 +145,13 @@ public:
 
   /// Set an IR transform (e.g. pass manager pipeline) to run on each function
   /// when it is compiled.
-  void setLazyCompileTransform(IRTransformLayer2::TransformFunction Transform) {
+  void setLazyCompileTransform(IRTransformLayer::TransformFunction Transform) {
     TransformLayer.setTransform(std::move(Transform));
   }
 
   /// Sets the partition function.
   void
-  setPartitionFunction(CompileOnDemandLayer2::PartitionFunction Partition) {
+  setPartitionFunction(CompileOnDemandLayer::PartitionFunction Partition) {
     CODLayer.setPartitionFunction(std::move(Partition));
   }
 
@@ -180,8 +180,8 @@ private:
   std::unique_ptr<LazyCallThroughManager> LCTMgr;
   std::function<std::unique_ptr<IndirectStubsManager>()> ISMBuilder;
 
-  IRTransformLayer2 TransformLayer;
-  CompileOnDemandLayer2 CODLayer;
+  IRTransformLayer TransformLayer;
+  CompileOnDemandLayer CODLayer;
 };
 
 } // End namespace orc
diff --git a/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h b/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h
index c6b43a9c8ed..6cd688ad58a 100644
--- a/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h
+++ b/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h
@@ -23,14 +23,14 @@
 namespace llvm {
 namespace orc {
 
-class ObjectTransformLayer2 : public ObjectLayer {
+class ObjectTransformLayer : public ObjectLayer {
 public:
   using TransformFunction =
       std::function<Expected<std::unique_ptr<MemoryBuffer>>(
           std::unique_ptr<MemoryBuffer>)>;
 
-  ObjectTransformLayer2(ExecutionSession &ES, ObjectLayer &BaseLayer,
-                        TransformFunction Transform);
+  ObjectTransformLayer(ExecutionSession &ES, ObjectLayer &BaseLayer,
+                       TransformFunction Transform);
 
   void emit(MaterializationResponsibility R, VModuleKey K,
             std::unique_ptr<MemoryBuffer> O) override;
@@ -46,11 +46,11 @@ private:
 /// immediately applies the user supplied functor to each object, then adds
 /// the set of transformed objects to the layer below.
 template <typename BaseLayerT, typename TransformFtor>
-class ObjectTransformLayer {
+class LegacyObjectTransformLayer {
 public:
   /// Construct an ObjectTransformLayer with the given BaseLayer
-  ObjectTransformLayer(BaseLayerT &BaseLayer,
-                       TransformFtor Transform = TransformFtor())
+  LegacyObjectTransformLayer(BaseLayerT &BaseLayer,
+                             TransformFtor Transform = TransformFtor())
       : BaseLayer(BaseLayer), Transform(std::move(Transform)) {}
 
   /// Apply the transform functor to each object in the object set, then
diff --git a/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h b/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h
index 0c30520a21b..bbd782fdece 100644
--- a/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h
+++ b/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h
@@ -36,7 +36,7 @@
 namespace llvm {
 namespace orc {
 
-class RTDyldObjectLinkingLayer2 : public ObjectLayer {
+class RTDyldObjectLinkingLayer : public ObjectLayer {
 public:
   /// Functor for receiving object-loaded notifications.
   using NotifyLoadedFunction =
@@ -51,7 +51,7 @@ public:
 
   /// Construct an ObjectLinkingLayer with the given NotifyLoaded,
   ///        and NotifyEmitted functors.
-  RTDyldObjectLinkingLayer2(
+  RTDyldObjectLinkingLayer(
       ExecutionSession &ES, GetMemoryManagerFunction GetMemoryManager,
       NotifyLoadedFunction NotifyLoaded = NotifyLoadedFunction(),
       NotifyEmittedFunction NotifyEmitted = NotifyEmittedFunction());
@@ -66,7 +66,7 @@ public:
   /// the memory manager, rather than just the sections required for execution.
   ///
   /// This is kludgy, and may be removed in the future.
-  RTDyldObjectLinkingLayer2 &setProcessAllSections(bool ProcessAllSections) {
+  RTDyldObjectLinkingLayer &setProcessAllSections(bool ProcessAllSections) {
     this->ProcessAllSections = ProcessAllSections;
     return *this;
   }
@@ -79,13 +79,13 @@ public:
   ///
   /// FIXME: We should be able to remove this if/when COFF properly tracks
   /// exported symbols.
-  RTDyldObjectLinkingLayer2 &
+  RTDyldObjectLinkingLayer &
   setOverrideObjectFlagsWithResponsibilityFlags(bool OverrideObjectFlags) {
     this->OverrideObjectFlags = OverrideObjectFlags;
     return *this;
   }
 
-  /// If set, this RTDyldObjectLinkingLayer2 instance will claim responsibility
+  /// If set, this RTDyldObjectLinkingLayer instance will claim responsibility
   /// for any symbols provided by a given object file that were not already in
   /// the MaterializationResponsibility instance. Setting this flag allows
   /// higher-level program representations (e.g. LLVM IR) to be added based on
@@ -96,7 +96,7 @@ public:
   /// deterministically). If this option is set, clashes for the additional
   /// symbols may not be detected until late, and detection may depend on
   /// the flow of control through JIT'd code. Use with care.
-  RTDyldObjectLinkingLayer2 &
+  RTDyldObjectLinkingLayer &
   setAutoClaimResponsibilityForObjectSymbols(bool AutoClaimObjectSymbols) {
     this->AutoClaimObjectSymbols = AutoClaimObjectSymbols;
     return *this;
@@ -121,7 +121,7 @@ private:
   std::map<VModuleKey, std::shared_ptr<RuntimeDyld::MemoryManager>> MemMgrs;
 };
 
-class RTDyldObjectLinkingLayerBase {
+class LegacyRTDyldObjectLinkingLayerBase {
 public:
   using ObjectPtr = std::unique_ptr<MemoryBuffer>;
 
@@ -173,10 +173,10 @@ protected:
 /// object files to be loaded into memory, linked, and the addresses of their
 /// symbols queried. All objects added to this layer can see each other's
 /// symbols.
-class RTDyldObjectLinkingLayer : public RTDyldObjectLinkingLayerBase {
+class LegacyRTDyldObjectLinkingLayer : public LegacyRTDyldObjectLinkingLayerBase {
 public:
 
-  using RTDyldObjectLinkingLayerBase::ObjectPtr;
+  using LegacyRTDyldObjectLinkingLayerBase::ObjectPtr;
 
   /// Functor for receiving object-loaded notifications.
   using NotifyLoadedFtor =
@@ -197,7 +197,7 @@ private:
   template <typename MemoryManagerPtrT>
   class ConcreteLinkedObject : public LinkedObject {
   public:
-    ConcreteLinkedObject(RTDyldObjectLinkingLayer &Parent, VModuleKey K,
+    ConcreteLinkedObject(LegacyRTDyldObjectLinkingLayer &Parent, VModuleKey K,
                          OwnedObject Obj, MemoryManagerPtrT MemMgr,
                          std::shared_ptr<SymbolResolver> Resolver,
                          bool ProcessAllSections)
@@ -313,7 +313,7 @@ private:
     };
 
     VModuleKey K;
-    RTDyldObjectLinkingLayer &Parent;
+    LegacyRTDyldObjectLinkingLayer &Parent;
     MemoryManagerPtrT MemMgr;
     OwnedObject ObjForNotify;
     std::unique_ptr<PreFinalizeContents> PFC;
@@ -321,7 +321,7 @@ private:
 
   template <typename MemoryManagerPtrT>
   std::unique_ptr<ConcreteLinkedObject<MemoryManagerPtrT>>
-  createLinkedObject(RTDyldObjectLinkingLayer &Parent, VModuleKey K,
+  createLinkedObject(LegacyRTDyldObjectLinkingLayer &Parent, VModuleKey K,
                      OwnedObject Obj, MemoryManagerPtrT MemMgr,
                      std::shared_ptr<SymbolResolver> Resolver,
                      bool ProcessAllSections) {
@@ -341,7 +341,7 @@ public:
 
   /// Construct an ObjectLinkingLayer with the given NotifyLoaded,
   ///        and NotifyFinalized functors.
-  RTDyldObjectLinkingLayer(
+  LegacyRTDyldObjectLinkingLayer(
       ExecutionSession &ES, ResourcesGetter GetResources,
       NotifyLoadedFtor NotifyLoaded = NotifyLoadedFtor(),
       NotifyFinalizedFtor NotifyFinalized = NotifyFinalizedFtor(),
diff --git a/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp b/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp
index ae1c7e84259..f27a814f33f 100644
--- a/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp
+++ b/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp
@@ -68,13 +68,13 @@ namespace orc {
 class PartitioningIRMaterializationUnit : public IRMaterializationUnit {
 public:
   PartitioningIRMaterializationUnit(ExecutionSession &ES, ThreadSafeModule TSM,
-                                    CompileOnDemandLayer2 &Parent)
+                                    CompileOnDemandLayer &Parent)
       : IRMaterializationUnit(ES, std::move(TSM)), Parent(Parent) {}
 
   PartitioningIRMaterializationUnit(
       ThreadSafeModule TSM, SymbolFlagsMap SymbolFlags,
       SymbolNameToDefinitionMap SymbolToDefinition,
-      CompileOnDemandLayer2 &Parent)
+      CompileOnDemandLayer &Parent)
       : IRMaterializationUnit(std::move(TSM), std::move(SymbolFlags),
                               std::move(SymbolToDefinition)),
         Parent(Parent) {}
@@ -93,30 +93,30 @@ private:
   }
 
   mutable std::mutex SourceModuleMutex;
-  CompileOnDemandLayer2 &Parent;
+  CompileOnDemandLayer &Parent;
 };
 
-Optional<CompileOnDemandLayer2::GlobalValueSet>
-CompileOnDemandLayer2::compileRequested(GlobalValueSet Requested) {
+Optional<CompileOnDemandLayer::GlobalValueSet>
+CompileOnDemandLayer::compileRequested(GlobalValueSet Requested) {
   return std::move(Requested);
 }
 
-Optional<CompileOnDemandLayer2::GlobalValueSet>
-CompileOnDemandLayer2::compileWholeModule(GlobalValueSet Requested) {
+Optional<CompileOnDemandLayer::GlobalValueSet>
+CompileOnDemandLayer::compileWholeModule(GlobalValueSet Requested) {
   return None;
 }
 
-CompileOnDemandLayer2::CompileOnDemandLayer2(
+CompileOnDemandLayer::CompileOnDemandLayer(
     ExecutionSession &ES, IRLayer &BaseLayer, LazyCallThroughManager &LCTMgr,
     IndirectStubsManagerBuilder BuildIndirectStubsManager)
     : IRLayer(ES), BaseLayer(BaseLayer), LCTMgr(LCTMgr),
       BuildIndirectStubsManager(std::move(BuildIndirectStubsManager)) {}
 
-void CompileOnDemandLayer2::setPartitionFunction(PartitionFunction Partition) {
+void CompileOnDemandLayer::setPartitionFunction(PartitionFunction Partition) {
   this->Partition = std::move(Partition);
 }
 
-void CompileOnDemandLayer2::emit(MaterializationResponsibility R, VModuleKey K,
+void CompileOnDemandLayer::emit(MaterializationResponsibility R, VModuleKey K,
                                  ThreadSafeModule TSM) {
   assert(TSM.getModule() && "Null module");
 
@@ -160,8 +160,8 @@ void CompileOnDemandLayer2::emit(MaterializationResponsibility R, VModuleKey K,
                           std::move(Callables)));
 }
 
-CompileOnDemandLayer2::PerDylibResources &
-CompileOnDemandLayer2::getPerDylibResources(JITDylib &TargetD) {
+CompileOnDemandLayer::PerDylibResources &
+CompileOnDemandLayer::getPerDylibResources(JITDylib &TargetD) {
   auto I = DylibResources.find(&TargetD);
   if (I == DylibResources.end()) {
     auto &ImplD =
@@ -176,7 +176,7 @@ CompileOnDemandLayer2::getPerDylibResources(JITDylib &TargetD) {
   return I->second;
 }
 
-void CompileOnDemandLayer2::cleanUpModule(Module &M) {
+void CompileOnDemandLayer::cleanUpModule(Module &M) {
   for (auto &F : M.functions()) {
     if (F.isDeclaration())
       continue;
@@ -189,7 +189,7 @@ void CompileOnDemandLayer2::cleanUpModule(Module &M) {
   }
 }
 
-void CompileOnDemandLayer2::expandPartition(GlobalValueSet &Partition) {
+void CompileOnDemandLayer::expandPartition(GlobalValueSet &Partition) {
   // Expands the partition to ensure the following rules hold:
   // (1) If any alias is in the partition, its aliasee is also in the partition.
   // (2) If any aliasee is in the partition, its aliases are also in the
@@ -221,7 +221,7 @@ void CompileOnDemandLayer2::expandPartition(GlobalValueSet &Partition) {
     Partition.insert(GV);
 }
 
-void CompileOnDemandLayer2::emitPartition(
+void CompileOnDemandLayer::emitPartition(
     MaterializationResponsibility R, ThreadSafeModule TSM,
     IRMaterializationUnit::SymbolNameToDefinitionMap Defs) {
 
diff --git a/lib/ExecutionEngine/Orc/ExecutionUtils.cpp b/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
index 4c8f725df54..21a604f71ca 100644
--- a/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
+++ b/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
@@ -87,7 +87,7 @@ iterator_range<CtorDtorIterator> getDestructors(const Module &M) {
                     CtorDtorIterator(DtorsList, true));
 }
 
-void CtorDtorRunner2::add(iterator_range<CtorDtorIterator> CtorDtors) {
+void CtorDtorRunner::add(iterator_range<CtorDtorIterator> CtorDtors) {
   if (CtorDtors.begin() == CtorDtors.end())
     return;
 
@@ -115,7 +115,7 @@ void CtorDtorRunner2::add(iterator_range<CtorDtorIterator> CtorDtors) {
   }
 }
 
-Error CtorDtorRunner2::run() {
+Error CtorDtorRunner::run() {
   using CtorDtorTy = void (*)();
 
   SymbolNameSet Names;
@@ -165,7 +165,7 @@ int LocalCXXRuntimeOverridesBase::CXAAtExitOverride(DestructorPtr Destructor,
   return 0;
 }
 
-Error LocalCXXRuntimeOverrides2::enable(JITDylib &JD,
+Error LocalCXXRuntimeOverrides::enable(JITDylib &JD,
                                         MangleAndInterner &Mangle) {
   SymbolMap RuntimeInterposes;
   RuntimeInterposes[Mangle("__dso_handle")] =
diff --git a/lib/ExecutionEngine/Orc/IRCompileLayer.cpp b/lib/ExecutionEngine/Orc/IRCompileLayer.cpp
index 5dee1c80e0b..6d029e16ba9 100644
--- a/lib/ExecutionEngine/Orc/IRCompileLayer.cpp
+++ b/lib/ExecutionEngine/Orc/IRCompileLayer.cpp
@@ -12,16 +12,16 @@
 namespace llvm {
 namespace orc {
 
-IRCompileLayer2::IRCompileLayer2(ExecutionSession &ES, ObjectLayer &BaseLayer,
+IRCompileLayer::IRCompileLayer(ExecutionSession &ES, ObjectLayer &BaseLayer,
                                  CompileFunction Compile)
     : IRLayer(ES), BaseLayer(BaseLayer), Compile(std::move(Compile)) {}
 
-void IRCompileLayer2::setNotifyCompiled(NotifyCompiledFunction NotifyCompiled) {
+void IRCompileLayer::setNotifyCompiled(NotifyCompiledFunction NotifyCompiled) {
   std::lock_guard<std::mutex> Lock(IRLayerMutex);
   this->NotifyCompiled = std::move(NotifyCompiled);
 }
 
-void IRCompileLayer2::emit(MaterializationResponsibility R, VModuleKey K,
+void IRCompileLayer::emit(MaterializationResponsibility R, VModuleKey K,
                            ThreadSafeModule TSM) {
   assert(TSM.getModule() && "Module must not be null");
 
diff --git a/lib/ExecutionEngine/Orc/IRTransformLayer.cpp b/lib/ExecutionEngine/Orc/IRTransformLayer.cpp
index 7a79a382d8d..acba7916d40 100644
--- a/lib/ExecutionEngine/Orc/IRTransformLayer.cpp
+++ b/lib/ExecutionEngine/Orc/IRTransformLayer.cpp
@@ -13,12 +13,12 @@
 namespace llvm {
 namespace orc {
 
-IRTransformLayer2::IRTransformLayer2(ExecutionSession &ES,
+IRTransformLayer::IRTransformLayer(ExecutionSession &ES,
                                      IRLayer &BaseLayer,
                                      TransformFunction Transform)
     : IRLayer(ES), BaseLayer(BaseLayer), Transform(std::move(Transform)) {}
 
-void IRTransformLayer2::emit(MaterializationResponsibility R, VModuleKey K,
+void IRTransformLayer::emit(MaterializationResponsibility R, VModuleKey K,
                              ThreadSafeModule TSM) {
   assert(TSM.getModule() && "Module must not be null");
 
diff --git a/lib/ExecutionEngine/Orc/LLJIT.cpp b/lib/ExecutionEngine/Orc/LLJIT.cpp
index 478ac2e2148..e464da267ae 100644
--- a/lib/ExecutionEngine/Orc/LLJIT.cpp
+++ b/lib/ExecutionEngine/Orc/LLJIT.cpp
@@ -21,7 +21,7 @@ namespace {
       : llvm::orc::SimpleCompiler(*TM), TM(std::move(TM)) {}
   private:
     // FIXME: shared because std::functions (and thus
-    // IRCompileLayer2::CompileFunction) are not moveable.
+    // IRCompileLayer::CompileFunction) are not moveable.
     std::shared_ptr<llvm::TargetMachine> TM;
   };
 
diff --git a/lib/ExecutionEngine/Orc/ObjectTransformLayer.cpp b/lib/ExecutionEngine/Orc/ObjectTransformLayer.cpp
index 6980c8140fd..0be23f2e1a4 100644
--- a/lib/ExecutionEngine/Orc/ObjectTransformLayer.cpp
+++ b/lib/ExecutionEngine/Orc/ObjectTransformLayer.cpp
@@ -13,13 +13,13 @@
 namespace llvm {
 namespace orc {
 
-ObjectTransformLayer2::ObjectTransformLayer2(ExecutionSession &ES,
-                                             ObjectLayer &BaseLayer,
-                                             TransformFunction Transform)
+ObjectTransformLayer::ObjectTransformLayer(ExecutionSession &ES,
+                                            ObjectLayer &BaseLayer,
+                                            TransformFunction Transform)
     : ObjectLayer(ES), BaseLayer(BaseLayer), Transform(std::move(Transform)) {}
 
-void ObjectTransformLayer2::emit(MaterializationResponsibility R, VModuleKey K,
-                                 std::unique_ptr<MemoryBuffer> O) {
+void ObjectTransformLayer::emit(MaterializationResponsibility R, VModuleKey K,
+                                std::unique_ptr<MemoryBuffer> O) {
   assert(O && "Module must not be null");
 
   if (auto TransformedObj = Transform(std::move(O)))
diff --git a/lib/ExecutionEngine/Orc/OrcCBindingsStack.h b/lib/ExecutionEngine/Orc/OrcCBindingsStack.h
index 3fedba1caa6..deddfcb10e1 100644
--- a/lib/ExecutionEngine/Orc/OrcCBindingsStack.h
+++ b/lib/ExecutionEngine/Orc/OrcCBindingsStack.h
@@ -77,9 +77,9 @@ public:
   };
 
   template <>
-  class GenericLayerImpl<orc::RTDyldObjectLinkingLayer> : public GenericLayer {
+  class GenericLayerImpl<orc::LegacyRTDyldObjectLinkingLayer> : public GenericLayer {
   private:
-    using LayerT = orc::RTDyldObjectLinkingLayer;
+    using LayerT = orc::LegacyRTDyldObjectLinkingLayer;
   public:
     GenericLayerImpl(LayerT &Layer) : Layer(Layer) {}
 
@@ -107,10 +107,10 @@ class OrcCBindingsStack {
 public:
 
   using CompileCallbackMgr = orc::JITCompileCallbackManager;
-  using ObjLayerT = orc::RTDyldObjectLinkingLayer;
-  using CompileLayerT = orc::IRCompileLayer<ObjLayerT, orc::SimpleCompiler>;
+  using ObjLayerT = orc::LegacyRTDyldObjectLinkingLayer;
+  using CompileLayerT = orc::LegacyIRCompileLayer<ObjLayerT, orc::SimpleCompiler>;
   using CODLayerT =
-        orc::CompileOnDemandLayer<CompileLayerT, CompileCallbackMgr>;
+        orc::LegacyCompileOnDemandLayer<CompileLayerT, CompileCallbackMgr>;
 
   using CallbackManagerBuilder =
       std::function<std::unique_ptr<CompileCallbackMgr>()>;
@@ -312,7 +312,7 @@ public:
 
     // Run the static constructors, and save the static destructor runner for
     // execution when the JIT is torn down.
-    orc::CtorDtorRunner<OrcCBindingsStack> CtorRunner(std::move(CtorNames), K);
+    orc::LegacyCtorDtorRunner<OrcCBindingsStack> CtorRunner(std::move(CtorNames), K);
     if (auto Err = CtorRunner.runViaLayer(*this))
       return std::move(Err);
 
@@ -517,8 +517,8 @@ private:
 
   std::map<orc::VModuleKey, std::unique_ptr<detail::GenericLayer>> KeyLayers;
 
-  orc::LocalCXXRuntimeOverrides CXXRuntimeOverrides;
-  std::vector<orc::CtorDtorRunner<OrcCBindingsStack>> IRStaticDestructorRunners;
+  orc::LegacyLocalCXXRuntimeOverrides CXXRuntimeOverrides;
+  std::vector<orc::LegacyCtorDtorRunner<OrcCBindingsStack>> IRStaticDestructorRunners;
   std::string ErrMsg;
 
   ResolverMap Resolvers;
diff --git a/lib/ExecutionEngine/Orc/OrcMCJITReplacement.cpp b/lib/ExecutionEngine/Orc/OrcMCJITReplacement.cpp
index 4def579e709..617bc2fc64b 100644
--- a/lib/ExecutionEngine/Orc/OrcMCJITReplacement.cpp
+++ b/lib/ExecutionEngine/Orc/OrcMCJITReplacement.cpp
@@ -128,7 +128,7 @@ void OrcMCJITReplacement::runStaticConstructorsDestructors(bool isDtors) {
   auto &CtorDtorsMap = isDtors ? UnexecutedDestructors : UnexecutedConstructors;
 
   for (auto &KV : CtorDtorsMap)
-    cantFail(CtorDtorRunner<LazyEmitLayerT>(std::move(KV.second), KV.first)
+    cantFail(LegacyCtorDtorRunner<LazyEmitLayerT>(std::move(KV.second), KV.first)
                  .runViaLayer(LazyEmitLayer));
 
   CtorDtorsMap.clear();
diff --git a/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h b/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h
index 1195d39561d..36e7e83a8ba 100644
--- a/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h
+++ b/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h
@@ -461,8 +461,8 @@ private:
     return MangledName;
   }
 
-  using ObjectLayerT = RTDyldObjectLinkingLayer;
-  using CompileLayerT = IRCompileLayer<ObjectLayerT, orc::SimpleCompiler>;
+  using ObjectLayerT = LegacyRTDyldObjectLinkingLayer;
+  using CompileLayerT = LegacyIRCompileLayer<ObjectLayerT, orc::SimpleCompiler>;
   using LazyEmitLayerT = LazyEmittingLayer<CompileLayerT>;
 
   ExecutionSession ES;
diff --git a/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp b/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp
index e84295ca215..fa574140d48 100644
--- a/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp
+++ b/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp
@@ -76,14 +76,14 @@ private:
 namespace llvm {
 namespace orc {
 
-RTDyldObjectLinkingLayer2::RTDyldObjectLinkingLayer2(
+RTDyldObjectLinkingLayer::RTDyldObjectLinkingLayer(
     ExecutionSession &ES, GetMemoryManagerFunction GetMemoryManager,
     NotifyLoadedFunction NotifyLoaded, NotifyEmittedFunction NotifyEmitted)
     : ObjectLayer(ES), GetMemoryManager(GetMemoryManager),
       NotifyLoaded(std::move(NotifyLoaded)),
       NotifyEmitted(std::move(NotifyEmitted)) {}
 
-void RTDyldObjectLinkingLayer2::emit(MaterializationResponsibility R,
+void RTDyldObjectLinkingLayer::emit(MaterializationResponsibility R,
                                      VModuleKey K,
                                      std::unique_ptr<MemoryBuffer> O) {
   assert(O && "Object must not be null");
@@ -153,7 +153,7 @@ void RTDyldObjectLinkingLayer2::emit(MaterializationResponsibility R,
       });
 }
 
-Error RTDyldObjectLinkingLayer2::onObjLoad(
+Error RTDyldObjectLinkingLayer::onObjLoad(
     VModuleKey K, MaterializationResponsibility &R, object::ObjectFile &Obj,
     std::unique_ptr<RuntimeDyld::LoadedObjectInfo> LoadedObjInfo,
     std::map<StringRef, JITEvaluatedSymbol> Resolved,
@@ -196,7 +196,7 @@ Error RTDyldObjectLinkingLayer2::onObjLoad(
   return Error::success();
 }
 
-void RTDyldObjectLinkingLayer2::onObjEmit(VModuleKey K,
+void RTDyldObjectLinkingLayer::onObjEmit(VModuleKey K,
                                           MaterializationResponsibility &R,
                                           Error Err) {
   if (Err) {
diff --git a/tools/lli/lli.cpp b/tools/lli/lli.cpp
index d633fe6f800..f4585dc080d 100644
--- a/tools/lli/lli.cpp
+++ b/tools/lli/lli.cpp
@@ -696,7 +696,7 @@ int main(int argc, char **argv, char * const *envp) {
   return Result;
 }
 
-static orc::IRTransformLayer2::TransformFunction createDebugDumper() {
+static orc::IRTransformLayer::TransformFunction createDebugDumper() {
   switch (OrcDumpKind) {
   case DumpKind::NoDump:
     return [](orc::ThreadSafeModule TSM,
@@ -781,7 +781,7 @@ int runOrcLazyJIT(const char *ProgName) {
   auto J = ExitOnErr(orc::LLLazyJIT::Create(std::move(JTMB), DL, LazyJITCompileThreads));
 
   if (PerModuleLazy)
-    J->setPartitionFunction(orc::CompileOnDemandLayer2::compileWholeModule);
+    J->setPartitionFunction(orc::CompileOnDemandLayer::compileWholeModule);
 
   auto Dump = createDebugDumper();
 
@@ -797,7 +797,7 @@ int runOrcLazyJIT(const char *ProgName) {
       ExitOnErr(orc::DynamicLibrarySearchGenerator::GetForCurrentProcess(DL)));
 
   orc::MangleAndInterner Mangle(J->getExecutionSession(), DL);
-  orc::LocalCXXRuntimeOverrides2 CXXRuntimeOverrides;
+  orc::LocalCXXRuntimeOverrides CXXRuntimeOverrides;
   ExitOnErr(CXXRuntimeOverrides.enable(J->getMainJITDylib(), Mangle));
 
   // Add the main module.
diff --git a/unittests/ExecutionEngine/Orc/CMakeLists.txt b/unittests/ExecutionEngine/Orc/CMakeLists.txt
index 8b0d5fc2435..019437d4ad5 100644
--- a/unittests/ExecutionEngine/Orc/CMakeLists.txt
+++ b/unittests/ExecutionEngine/Orc/CMakeLists.txt
@@ -10,7 +10,6 @@ set(LLVM_LINK_COMPONENTS
   )
 
 add_llvm_unittest(OrcJITTests
-  CompileOnDemandLayerTest.cpp
   CoreAPIsTest.cpp
   IndirectionUtilsTest.cpp
   GlobalMappingLayerTest.cpp
@@ -18,6 +17,8 @@ add_llvm_unittest(OrcJITTests
   LazyCallThroughAndReexportsTest.cpp
   LazyEmittingLayerTest.cpp
   LegacyAPIInteropTest.cpp
+  LegacyCompileOnDemandLayerTest.cpp
+  LegacyRTDyldObjectLinkingLayerTest.cpp
   ObjectTransformLayerTest.cpp
   OrcCAPITest.cpp
   OrcTestCommon.cpp
@@ -25,7 +26,6 @@ add_llvm_unittest(OrcJITTests
   RemoteObjectLayerTest.cpp
   RPCUtilsTest.cpp
   RTDyldObjectLinkingLayerTest.cpp
-  RTDyldObjectLinkingLayer2Test.cpp
   SymbolStringPoolTest.cpp
   ThreadSafeModuleTest.cpp
   )
diff --git a/unittests/ExecutionEngine/Orc/CompileOnDemandLayerTest.cpp b/unittests/ExecutionEngine/Orc/LegacyCompileOnDemandLayerTest.cpp
similarity index 95%
rename from unittests/ExecutionEngine/Orc/CompileOnDemandLayerTest.cpp
rename to unittests/ExecutionEngine/Orc/LegacyCompileOnDemandLayerTest.cpp
index 9aa4437550b..38f7a654571 100644
--- a/unittests/ExecutionEngine/Orc/CompileOnDemandLayerTest.cpp
+++ b/unittests/ExecutionEngine/Orc/LegacyCompileOnDemandLayerTest.cpp
@@ -54,7 +54,7 @@ public:
   }
 };
 
-TEST(CompileOnDemandLayerTest, FindSymbol) {
+TEST(LegacyCompileOnDemandLayerTest, FindSymbol) {
   MockBaseLayer<int, std::shared_ptr<Module>> TestBaseLayer;
   TestBaseLayer.findSymbolImpl =
     [](const std::string &Name, bool) {
@@ -76,7 +76,7 @@ TEST(CompileOnDemandLayerTest, FindSymbol) {
     llvm_unreachable("Should never be called");
   };
 
-  llvm::orc::CompileOnDemandLayer<decltype(TestBaseLayer)> COD(
+  llvm::orc::LegacyCompileOnDemandLayer<decltype(TestBaseLayer)> COD(
       ES, TestBaseLayer, GetResolver, SetResolver,
       [](Function &F) { return std::set<Function *>{&F}; }, CallbackMgr,
       [] { return llvm::make_unique<DummyStubsManager>(); }, true);
diff --git a/unittests/ExecutionEngine/Orc/LegacyRTDyldObjectLinkingLayerTest.cpp b/unittests/ExecutionEngine/Orc/LegacyRTDyldObjectLinkingLayerTest.cpp
new file mode 100644
index 00000000000..8c9c958cc42
--- /dev/null
+++ b/unittests/ExecutionEngine/Orc/LegacyRTDyldObjectLinkingLayerTest.cpp
@@ -0,0 +1,282 @@
+//===- RTDyldObjectLinkingLayerTest.cpp - RTDyld linking layer unit tests -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
+#include "OrcTestCommon.h"
+#include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include "llvm/ExecutionEngine/Orc/CompileUtils.h"
+#include "llvm/ExecutionEngine/Orc/LambdaResolver.h"
+#include "llvm/ExecutionEngine/Orc/Legacy.h"
+#include "llvm/ExecutionEngine/Orc/NullResolver.h"
+#include "llvm/ExecutionEngine/SectionMemoryManager.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/LLVMContext.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace llvm::orc;
+
+namespace {
+
+class LegacyRTDyldObjectLinkingLayerExecutionTest : public testing::Test,
+                                              public OrcExecutionTest {
+
+};
+
+class SectionMemoryManagerWrapper : public SectionMemoryManager {
+public:
+  int FinalizationCount = 0;
+  int NeedsToReserveAllocationSpaceCount = 0;
+
+  bool needsToReserveAllocationSpace() override {
+    ++NeedsToReserveAllocationSpaceCount;
+    return SectionMemoryManager::needsToReserveAllocationSpace();
+  }
+
+  bool finalizeMemory(std::string *ErrMsg = nullptr) override {
+    ++FinalizationCount;
+    return SectionMemoryManager::finalizeMemory(ErrMsg);
+  }
+};
+
+TEST(LegacyRTDyldObjectLinkingLayerTest, TestSetProcessAllSections) {
+  class MemoryManagerWrapper : public SectionMemoryManager {
+  public:
+    MemoryManagerWrapper(bool &DebugSeen) : DebugSeen(DebugSeen) {}
+    uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment,
+                                 unsigned SectionID,
+                                 StringRef SectionName,
+                                 bool IsReadOnly) override {
+      if (SectionName == ".debug_str")
+        DebugSeen = true;
+      return SectionMemoryManager::allocateDataSection(Size, Alignment,
+                                                         SectionID,
+                                                         SectionName,
+                                                         IsReadOnly);
+    }
+  private:
+    bool &DebugSeen;
+  };
+
+  bool DebugSectionSeen = false;
+  auto MM = std::make_shared<MemoryManagerWrapper>(DebugSectionSeen);
+
+  ExecutionSession ES;
+
+  LegacyRTDyldObjectLinkingLayer ObjLayer(ES, [&MM](VModuleKey) {
+    return LegacyRTDyldObjectLinkingLayer::Resources{
+        MM, std::make_shared<NullResolver>()};
+  });
+
+  LLVMContext Context;
+  auto M = llvm::make_unique<Module>("", Context);
+  M->setTargetTriple("x86_64-unknown-linux-gnu");
+  Type *Int32Ty = IntegerType::get(Context, 32);
+  GlobalVariable *GV =
+    new GlobalVariable(*M, Int32Ty, false, GlobalValue::ExternalLinkage,
+                         ConstantInt::get(Int32Ty, 42), "foo");
+
+  GV->setSection(".debug_str");
+
+
+  // Initialize the native target in case this is the first unit test
+  // to try to build a TM.
+  OrcNativeTarget::initialize();
+  std::unique_ptr<TargetMachine> TM(
+    EngineBuilder().selectTarget(Triple(M->getTargetTriple()), "", "",
+                                 SmallVector<std::string, 1>()));
+  if (!TM)
+    return;
+
+  auto Obj = SimpleCompiler(*TM)(*M);
+
+  {
+    // Test with ProcessAllSections = false (the default).
+    auto K = ES.allocateVModule();
+    cantFail(ObjLayer.addObject(
+        K, MemoryBuffer::getMemBufferCopy(Obj->getBuffer())));
+    cantFail(ObjLayer.emitAndFinalize(K));
+    EXPECT_EQ(DebugSectionSeen, false)
+      << "Unexpected debug info section";
+    cantFail(ObjLayer.removeObject(K));
+  }
+
+  {
+    // Test with ProcessAllSections = true.
+    ObjLayer.setProcessAllSections(true);
+    auto K = ES.allocateVModule();
+    cantFail(ObjLayer.addObject(K, std::move(Obj)));
+    cantFail(ObjLayer.emitAndFinalize(K));
+    EXPECT_EQ(DebugSectionSeen, true)
+      << "Expected debug info section not seen";
+    cantFail(ObjLayer.removeObject(K));
+  }
+}
+
+TEST_F(LegacyRTDyldObjectLinkingLayerExecutionTest, NoDuplicateFinalization) {
+  if (!SupportsJIT)
+    return;
+
+  ExecutionSession ES;
+
+  auto MM = std::make_shared<SectionMemoryManagerWrapper>();
+
+  std::map<orc::VModuleKey, std::shared_ptr<orc::SymbolResolver>> Resolvers;
+
+  LegacyRTDyldObjectLinkingLayer ObjLayer(ES, [&](VModuleKey K) {
+    auto I = Resolvers.find(K);
+    assert(I != Resolvers.end() && "Missing resolver");
+    auto R = std::move(I->second);
+    Resolvers.erase(I);
+    return LegacyRTDyldObjectLinkingLayer::Resources{MM, std::move(R)};
+  });
+  SimpleCompiler Compile(*TM);
+
+  // Create a pair of modules that will trigger recursive finalization:
+  // Module 1:
+  //   int bar() { return 42; }
+  // Module 2:
+  //   int bar();
+  //   int foo() { return bar(); }
+  //
+  // Verify that the memory manager is only finalized once (for Module 2).
+  // Failure suggests that finalize is being called on the inner RTDyld
+  // instance (for Module 1) which is unsafe, as it will prevent relocation of
+  // Module 2.
+
+  ModuleBuilder MB1(Context, "", "dummy");
+  {
+    MB1.getModule()->setDataLayout(TM->createDataLayout());
+    Function *BarImpl = MB1.createFunctionDecl<int32_t(void)>("bar");
+    BasicBlock *BarEntry = BasicBlock::Create(Context, "entry", BarImpl);
+    IRBuilder<> Builder(BarEntry);
+    IntegerType *Int32Ty = IntegerType::get(Context, 32);
+    Value *FourtyTwo = ConstantInt::getSigned(Int32Ty, 42);
+    Builder.CreateRet(FourtyTwo);
+  }
+
+  auto Obj1 = Compile(*MB1.getModule());
+
+  ModuleBuilder MB2(Context, "", "dummy");
+  {
+    MB2.getModule()->setDataLayout(TM->createDataLayout());
+    Function *BarDecl = MB2.createFunctionDecl<int32_t(void)>("bar");
+    Function *FooImpl = MB2.createFunctionDecl<int32_t(void)>("foo");
+    BasicBlock *FooEntry = BasicBlock::Create(Context, "entry", FooImpl);
+    IRBuilder<> Builder(FooEntry);
+    Builder.CreateRet(Builder.CreateCall(BarDecl));
+  }
+  auto Obj2 = Compile(*MB2.getModule());
+
+  auto K1 = ES.allocateVModule();
+  Resolvers[K1] = std::make_shared<NullResolver>();
+  cantFail(ObjLayer.addObject(K1, std::move(Obj1)));
+
+  auto K2 = ES.allocateVModule();
+  auto LegacyLookup = [&](const std::string &Name) {
+    return ObjLayer.findSymbol(Name, true);
+  };
+
+  Resolvers[K2] = createSymbolResolver(
+      [&](const SymbolNameSet &Symbols) {
+        return cantFail(
+            getResponsibilitySetWithLegacyFn(Symbols, LegacyLookup));
+      },
+      [&](std::shared_ptr<AsynchronousSymbolQuery> Query,
+          const SymbolNameSet &Symbols) {
+        return lookupWithLegacyFn(ES, *Query, Symbols, LegacyLookup);
+      });
+
+  cantFail(ObjLayer.addObject(K2, std::move(Obj2)));
+  cantFail(ObjLayer.emitAndFinalize(K2));
+  cantFail(ObjLayer.removeObject(K2));
+
+  // Finalization of module 2 should trigger finalization of module 1.
+  // Verify that finalize on SMMW is only called once.
+  EXPECT_EQ(MM->FinalizationCount, 1)
+      << "Extra call to finalize";
+}
+
+TEST_F(LegacyRTDyldObjectLinkingLayerExecutionTest, NoPrematureAllocation) {
+  if (!SupportsJIT)
+    return;
+
+  ExecutionSession ES;
+
+  auto MM = std::make_shared<SectionMemoryManagerWrapper>();
+
+  LegacyRTDyldObjectLinkingLayer ObjLayer(ES, [&MM](VModuleKey K) {
+    return LegacyRTDyldObjectLinkingLayer::Resources{
+        MM, std::make_shared<NullResolver>()};
+  });
+  SimpleCompiler Compile(*TM);
+
+  // Create a pair of unrelated modules:
+  //
+  // Module 1:
+  //   int foo() { return 42; }
+  // Module 2:
+  //   int bar() { return 7; }
+  //
+  // Both modules will share a memory manager. We want to verify that the
+  // second object is not loaded before the first one is finalized. To do this
+  // in a portable way, we abuse the
+  // RuntimeDyld::MemoryManager::needsToReserveAllocationSpace hook, which is
+  // called once per object before any sections are allocated.
+
+  ModuleBuilder MB1(Context, "", "dummy");
+  {
+    MB1.getModule()->setDataLayout(TM->createDataLayout());
+    Function *BarImpl = MB1.createFunctionDecl<int32_t(void)>("foo");
+    BasicBlock *BarEntry = BasicBlock::Create(Context, "entry", BarImpl);
+    IRBuilder<> Builder(BarEntry);
+    IntegerType *Int32Ty = IntegerType::get(Context, 32);
+    Value *FourtyTwo = ConstantInt::getSigned(Int32Ty, 42);
+    Builder.CreateRet(FourtyTwo);
+  }
+
+  auto Obj1 = Compile(*MB1.getModule());
+
+  ModuleBuilder MB2(Context, "", "dummy");
+  {
+    MB2.getModule()->setDataLayout(TM->createDataLayout());
+    Function *BarImpl = MB2.createFunctionDecl<int32_t(void)>("bar");
+    BasicBlock *BarEntry = BasicBlock::Create(Context, "entry", BarImpl);
+    IRBuilder<> Builder(BarEntry);
+    IntegerType *Int32Ty = IntegerType::get(Context, 32);
+    Value *Seven = ConstantInt::getSigned(Int32Ty, 7);
+    Builder.CreateRet(Seven);
+  }
+  auto Obj2 = Compile(*MB2.getModule());
+
+  auto K = ES.allocateVModule();
+  cantFail(ObjLayer.addObject(K, std::move(Obj1)));
+  cantFail(ObjLayer.addObject(ES.allocateVModule(), std::move(Obj2)));
+  cantFail(ObjLayer.emitAndFinalize(K));
+  cantFail(ObjLayer.removeObject(K));
+
+  // Only one call to needsToReserveAllocationSpace should have been made.
+  EXPECT_EQ(MM->NeedsToReserveAllocationSpaceCount, 1)
+      << "More than one call to needsToReserveAllocationSpace "
+         "(multiple unrelated objects loaded prior to finalization)";
+}
+
+TEST_F(LegacyRTDyldObjectLinkingLayerExecutionTest, TestNotifyLoadedSignature) {
+  ExecutionSession ES;
+  LegacyRTDyldObjectLinkingLayer ObjLayer(
+      ES,
+      [](VModuleKey) {
+        return LegacyRTDyldObjectLinkingLayer::Resources{
+            nullptr, std::make_shared<NullResolver>()};
+      },
+      [](VModuleKey, const object::ObjectFile &obj,
+         const RuntimeDyld::LoadedObjectInfo &info) {});
+}
+
+} // end anonymous namespace
diff --git a/unittests/ExecutionEngine/Orc/ObjectTransformLayerTest.cpp b/unittests/ExecutionEngine/Orc/ObjectTransformLayerTest.cpp
index 6ad3c19ada9..1c530247a7c 100644
--- a/unittests/ExecutionEngine/Orc/ObjectTransformLayerTest.cpp
+++ b/unittests/ExecutionEngine/Orc/ObjectTransformLayerTest.cpp
@@ -175,19 +175,19 @@ private:
   }
 };
 
-// Test each operation on ObjectTransformLayer.
-TEST(ObjectTransformLayerTest, Main) {
+// Test each operation on LegacyObjectTransformLayer.
+TEST(LegacyObjectTransformLayerTest, Main) {
   MockBaseLayer M;
 
   ExecutionSession ES(std::make_shared<SymbolStringPool>());
 
   // Create one object transform layer using a transform (as a functor)
   // that allocates new objects, and deals in unique pointers.
-  ObjectTransformLayer<MockBaseLayer, AllocatingTransform> T1(M);
+  LegacyObjectTransformLayer<MockBaseLayer, AllocatingTransform> T1(M);
 
   // Create a second object transform layer using a transform (as a lambda)
   // that mutates objects in place, and deals in naked pointers
-  ObjectTransformLayer<MockBaseLayer,
+  LegacyObjectTransformLayer<MockBaseLayer,
                          std::function<std::shared_ptr<MockObjectFile>(
                            std::shared_ptr<MockObjectFile>)>>
     T2(M, [](std::shared_ptr<MockObjectFile> Obj) {
@@ -257,9 +257,9 @@ TEST(ObjectTransformLayerTest, Main) {
   if (!RunStaticChecks)
     return;
 
-  // Make sure that ObjectTransformLayer implements the object layer concept
+  // Make sure that LegacyObjectTransformLayer implements the object layer concept
   // correctly by sandwitching one between an ObjectLinkingLayer and an
-  // IRCompileLayer, verifying that it compiles if we have a call to the
+  // LegacyIRCompileLayer, verifying that it compiles if we have a call to the
   // IRComileLayer's addModule that should call the transform layer's
   // addObject, and also calling the other public transform layer methods
   // directly to make sure the methods they intend to forward to exist on
@@ -282,8 +282,8 @@ TEST(ObjectTransformLayerTest, Main) {
   };
 
   // Construct the jit layers.
-  RTDyldObjectLinkingLayer BaseLayer(ES, [](VModuleKey) {
-    return RTDyldObjectLinkingLayer::Resources{
+  LegacyRTDyldObjectLinkingLayer BaseLayer(ES, [](VModuleKey) {
+    return LegacyRTDyldObjectLinkingLayer::Resources{
         std::make_shared<llvm::SectionMemoryManager>(),
         std::make_shared<NullResolver>()};
   });
@@ -291,20 +291,20 @@ TEST(ObjectTransformLayerTest, Main) {
   auto IdentityTransform = [](std::unique_ptr<llvm::MemoryBuffer> Obj) {
     return Obj;
   };
-  ObjectTransformLayer<decltype(BaseLayer), decltype(IdentityTransform)>
+  LegacyObjectTransformLayer<decltype(BaseLayer), decltype(IdentityTransform)>
       TransformLayer(BaseLayer, IdentityTransform);
   auto NullCompiler = [](llvm::Module &) {
     return std::unique_ptr<llvm::MemoryBuffer>(nullptr);
   };
-  IRCompileLayer<decltype(TransformLayer), decltype(NullCompiler)>
+  LegacyIRCompileLayer<decltype(TransformLayer), decltype(NullCompiler)>
     CompileLayer(TransformLayer, NullCompiler);
 
-  // Make sure that the calls from IRCompileLayer to ObjectTransformLayer
+  // Make sure that the calls from LegacyIRCompileLayer to LegacyObjectTransformLayer
   // compile.
   cantFail(CompileLayer.addModule(ES.allocateVModule(),
                                   std::unique_ptr<llvm::Module>()));
 
-  // Make sure that the calls from ObjectTransformLayer to ObjectLinkingLayer
+  // Make sure that the calls from LegacyObjectTransformLayer to ObjectLinkingLayer
   // compile.
   VModuleKey DummyKey = ES.allocateVModule();
   cantFail(TransformLayer.emitAndFinalize(DummyKey));
diff --git a/unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayer2Test.cpp b/unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayer2Test.cpp
deleted file mode 100644
index 1dbd48b5972..00000000000
--- a/unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayer2Test.cpp
+++ /dev/null
@@ -1,228 +0,0 @@
-//===--- RTDyldObjectLinkingLayer2Test.cpp - RTDyld linking layer tests ---===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "OrcTestCommon.h"
-#include "llvm/ExecutionEngine/ExecutionEngine.h"
-#include "llvm/ExecutionEngine/Orc/CompileUtils.h"
-#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
-#include "llvm/ExecutionEngine/Orc/LambdaResolver.h"
-#include "llvm/ExecutionEngine/Orc/Legacy.h"
-#include "llvm/ExecutionEngine/Orc/NullResolver.h"
-#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
-#include "llvm/ExecutionEngine/SectionMemoryManager.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/LLVMContext.h"
-#include "gtest/gtest.h"
-
-using namespace llvm;
-using namespace llvm::orc;
-
-namespace {
-
-class RTDyldObjectLinkingLayer2ExecutionTest : public testing::Test,
-                                               public OrcExecutionTest {};
-
-// Adds an object with a debug section to RuntimeDyld and then returns whether
-// the debug section was passed to the memory manager.
-static bool testSetProcessAllSections(std::unique_ptr<MemoryBuffer> Obj,
-                                      bool ProcessAllSections) {
-  class MemoryManagerWrapper : public SectionMemoryManager {
-  public:
-    MemoryManagerWrapper(bool &DebugSeen) : DebugSeen(DebugSeen) {}
-    uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment,
-                                 unsigned SectionID, StringRef SectionName,
-                                 bool IsReadOnly) override {
-      if (SectionName == ".debug_str")
-        DebugSeen = true;
-      return SectionMemoryManager::allocateDataSection(
-          Size, Alignment, SectionID, SectionName, IsReadOnly);
-    }
-
-  private:
-    bool &DebugSeen;
-  };
-
-  bool DebugSectionSeen = false;
-
-  ExecutionSession ES;
-  auto &JD = ES.createJITDylib("main");
-  auto Foo = ES.intern("foo");
-
-  RTDyldObjectLinkingLayer2 ObjLayer(ES, [&DebugSectionSeen](VModuleKey) {
-    return llvm::make_unique<MemoryManagerWrapper>(DebugSectionSeen);
-  });
-
-  auto OnResolveDoNothing = [](Expected<SymbolMap> R) {
-    cantFail(std::move(R));
-  };
-
-  auto OnReadyDoNothing = [](Error Err) { cantFail(std::move(Err)); };
-
-  ObjLayer.setProcessAllSections(ProcessAllSections);
-  auto K = ES.allocateVModule();
-  cantFail(ObjLayer.add(JD, K, std::move(Obj)));
-  ES.lookup({&JD}, {Foo}, OnResolveDoNothing, OnReadyDoNothing,
-            NoDependenciesToRegister);
-  return DebugSectionSeen;
-}
-
-TEST(RTDyldObjectLinkingLayer2Test, TestSetProcessAllSections) {
-  LLVMContext Context;
-  auto M = llvm::make_unique<Module>("", Context);
-  M->setTargetTriple("x86_64-unknown-linux-gnu");
-  Type *Int32Ty = IntegerType::get(Context, 32);
-  GlobalVariable *GV =
-      new GlobalVariable(*M, Int32Ty, false, GlobalValue::ExternalLinkage,
-                         ConstantInt::get(Int32Ty, 42), "foo");
-
-  GV->setSection(".debug_str");
-
-  // Initialize the native target in case this is the first unit test
-  // to try to build a TM.
-  OrcNativeTarget::initialize();
-  std::unique_ptr<TargetMachine> TM(EngineBuilder().selectTarget(
-      Triple(M->getTargetTriple()), "", "", SmallVector<std::string, 1>()));
-  if (!TM)
-    return;
-
-  auto Obj = SimpleCompiler(*TM)(*M);
-
-  EXPECT_FALSE(testSetProcessAllSections(
-      MemoryBuffer::getMemBufferCopy(Obj->getBuffer()), false))
-      << "Debug section seen despite ProcessAllSections being false";
-  EXPECT_TRUE(testSetProcessAllSections(std::move(Obj), true))
-      << "Expected to see debug section when ProcessAllSections is true";
-}
-
-TEST(RTDyldObjectLinkingLayer2Test, TestOverrideObjectFlags) {
-
-  OrcNativeTarget::initialize();
-
-  std::unique_ptr<TargetMachine> TM(
-      EngineBuilder().selectTarget(Triple("x86_64-unknown-linux-gnu"), "", "",
-                                   SmallVector<std::string, 1>()));
-
-  if (!TM)
-    return;
-
-  // Our compiler is going to modify symbol visibility settings without telling
-  // ORC. This will test our ability to override the flags later.
-  class FunkySimpleCompiler : public SimpleCompiler {
-  public:
-    FunkySimpleCompiler(TargetMachine &TM) : SimpleCompiler(TM) {}
-
-    CompileResult operator()(Module &M) {
-      auto *Foo = M.getFunction("foo");
-      assert(Foo && "Expected function Foo not found");
-      Foo->setVisibility(GlobalValue::HiddenVisibility);
-      return SimpleCompiler::operator()(M);
-    }
-  };
-
-  // Create a module with two void() functions: foo and bar.
-  ThreadSafeContext TSCtx(llvm::make_unique<LLVMContext>());
-  ThreadSafeModule M;
-  {
-    ModuleBuilder MB(*TSCtx.getContext(), TM->getTargetTriple().str(), "dummy");
-    MB.getModule()->setDataLayout(TM->createDataLayout());
-
-    Function *FooImpl = MB.createFunctionDecl<void()>("foo");
-    BasicBlock *FooEntry =
-        BasicBlock::Create(*TSCtx.getContext(), "entry", FooImpl);
-    IRBuilder<> B1(FooEntry);
-    B1.CreateRetVoid();
-
-    Function *BarImpl = MB.createFunctionDecl<void()>("bar");
-    BasicBlock *BarEntry =
-        BasicBlock::Create(*TSCtx.getContext(), "entry", BarImpl);
-    IRBuilder<> B2(BarEntry);
-    B2.CreateRetVoid();
-
-    M = ThreadSafeModule(MB.takeModule(), std::move(TSCtx));
-  }
-
-  // Create a simple stack and set the override flags option.
-  ExecutionSession ES;
-  auto &JD = ES.createJITDylib("main");
-  auto Foo = ES.intern("foo");
-  RTDyldObjectLinkingLayer2 ObjLayer(
-      ES, [](VModuleKey) { return llvm::make_unique<SectionMemoryManager>(); });
-  IRCompileLayer2 CompileLayer(ES, ObjLayer, FunkySimpleCompiler(*TM));
-
-  ObjLayer.setOverrideObjectFlagsWithResponsibilityFlags(true);
-
-  cantFail(CompileLayer.add(JD, ES.allocateVModule(), std::move(M)));
-  ES.lookup({&JD}, {Foo}, [](Expected<SymbolMap> R) { cantFail(std::move(R)); },
-            [](Error Err) { cantFail(std::move(Err)); },
-            NoDependenciesToRegister);
-}
-
-TEST(RTDyldObjectLinkingLayer2Test, TestAutoClaimResponsibilityForSymbols) {
-
-  OrcNativeTarget::initialize();
-
-  std::unique_ptr<TargetMachine> TM(
-      EngineBuilder().selectTarget(Triple("x86_64-unknown-linux-gnu"), "", "",
-                                   SmallVector<std::string, 1>()));
-
-  if (!TM)
-    return;
-
-  // Our compiler is going to add a new symbol without telling ORC.
-  // This will test our ability to auto-claim responsibility later.
-  class FunkySimpleCompiler : public SimpleCompiler {
-  public:
-    FunkySimpleCompiler(TargetMachine &TM) : SimpleCompiler(TM) {}
-
-    CompileResult operator()(Module &M) {
-      Function *BarImpl =
-          Function::Create(TypeBuilder<void(), false>::get(M.getContext()),
-                           GlobalValue::ExternalLinkage, "bar", &M);
-      BasicBlock *BarEntry =
-          BasicBlock::Create(M.getContext(), "entry", BarImpl);
-      IRBuilder<> B(BarEntry);
-      B.CreateRetVoid();
-
-      return SimpleCompiler::operator()(M);
-    }
-  };
-
-  // Create a module with two void() functions: foo and bar.
-  ThreadSafeContext TSCtx(llvm::make_unique<LLVMContext>());
-  ThreadSafeModule M;
-  {
-    ModuleBuilder MB(*TSCtx.getContext(), TM->getTargetTriple().str(), "dummy");
-    MB.getModule()->setDataLayout(TM->createDataLayout());
-
-    Function *FooImpl = MB.createFunctionDecl<void()>("foo");
-    BasicBlock *FooEntry =
-        BasicBlock::Create(*TSCtx.getContext(), "entry", FooImpl);
-    IRBuilder<> B(FooEntry);
-    B.CreateRetVoid();
-
-    M = ThreadSafeModule(MB.takeModule(), std::move(TSCtx));
-  }
-
-  // Create a simple stack and set the override flags option.
-  ExecutionSession ES;
-  auto &JD = ES.createJITDylib("main");
-  auto Foo = ES.intern("foo");
-  RTDyldObjectLinkingLayer2 ObjLayer(
-      ES, [](VModuleKey) { return llvm::make_unique<SectionMemoryManager>(); });
-  IRCompileLayer2 CompileLayer(ES, ObjLayer, FunkySimpleCompiler(*TM));
-
-  ObjLayer.setAutoClaimResponsibilityForObjectSymbols(true);
-
-  cantFail(CompileLayer.add(JD, ES.allocateVModule(), std::move(M)));
-  ES.lookup({&JD}, {Foo}, [](Expected<SymbolMap> R) { cantFail(std::move(R)); },
-            [](Error Err) { cantFail(std::move(Err)); },
-            NoDependenciesToRegister);
-}
-
-} // end anonymous namespace
diff --git a/unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayerTest.cpp b/unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayerTest.cpp
index 62c6b7dfa31..75ccfc9ab0d 100644
--- a/unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayerTest.cpp
+++ b/unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayerTest.cpp
@@ -1,4 +1,4 @@
-//===- RTDyldObjectLinkingLayerTest.cpp - RTDyld linking layer unit tests -===//
+//===--- RTDyldObjectLinkingLayerTest.cpp - RTDyld linking layer tests ---===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,13 +7,14 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
 #include "OrcTestCommon.h"
 #include "llvm/ExecutionEngine/ExecutionEngine.h"
 #include "llvm/ExecutionEngine/Orc/CompileUtils.h"
+#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
 #include "llvm/ExecutionEngine/Orc/LambdaResolver.h"
 #include "llvm/ExecutionEngine/Orc/Legacy.h"
 #include "llvm/ExecutionEngine/Orc/NullResolver.h"
+#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
 #include "llvm/ExecutionEngine/SectionMemoryManager.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/LLVMContext.h"
@@ -25,258 +26,203 @@ using namespace llvm::orc;
 namespace {
 
 class RTDyldObjectLinkingLayerExecutionTest : public testing::Test,
-                                              public OrcExecutionTest {
-
-};
-
-class SectionMemoryManagerWrapper : public SectionMemoryManager {
-public:
-  int FinalizationCount = 0;
-  int NeedsToReserveAllocationSpaceCount = 0;
-
-  bool needsToReserveAllocationSpace() override {
-    ++NeedsToReserveAllocationSpaceCount;
-    return SectionMemoryManager::needsToReserveAllocationSpace();
-  }
-
-  bool finalizeMemory(std::string *ErrMsg = nullptr) override {
-    ++FinalizationCount;
-    return SectionMemoryManager::finalizeMemory(ErrMsg);
-  }
-};
+                                               public OrcExecutionTest {};
 
-TEST(RTDyldObjectLinkingLayerTest, TestSetProcessAllSections) {
+// Adds an object with a debug section to RuntimeDyld and then returns whether
+// the debug section was passed to the memory manager.
+static bool testSetProcessAllSections(std::unique_ptr<MemoryBuffer> Obj,
+                                      bool ProcessAllSections) {
   class MemoryManagerWrapper : public SectionMemoryManager {
   public:
     MemoryManagerWrapper(bool &DebugSeen) : DebugSeen(DebugSeen) {}
     uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment,
-                                 unsigned SectionID,
-                                 StringRef SectionName,
+                                 unsigned SectionID, StringRef SectionName,
                                  bool IsReadOnly) override {
       if (SectionName == ".debug_str")
         DebugSeen = true;
-      return SectionMemoryManager::allocateDataSection(Size, Alignment,
-                                                         SectionID,
-                                                         SectionName,
-                                                         IsReadOnly);
+      return SectionMemoryManager::allocateDataSection(
+          Size, Alignment, SectionID, SectionName, IsReadOnly);
     }
+
   private:
     bool &DebugSeen;
   };
 
   bool DebugSectionSeen = false;
-  auto MM = std::make_shared<MemoryManagerWrapper>(DebugSectionSeen);
 
   ExecutionSession ES;
+  auto &JD = ES.createJITDylib("main");
+  auto Foo = ES.intern("foo");
 
-  RTDyldObjectLinkingLayer ObjLayer(ES, [&MM](VModuleKey) {
-    return RTDyldObjectLinkingLayer::Resources{
-        MM, std::make_shared<NullResolver>()};
+  RTDyldObjectLinkingLayer ObjLayer(ES, [&DebugSectionSeen](VModuleKey) {
+    return llvm::make_unique<MemoryManagerWrapper>(DebugSectionSeen);
   });
 
+  auto OnResolveDoNothing = [](Expected<SymbolMap> R) {
+    cantFail(std::move(R));
+  };
+
+  auto OnReadyDoNothing = [](Error Err) { cantFail(std::move(Err)); };
+
+  ObjLayer.setProcessAllSections(ProcessAllSections);
+  auto K = ES.allocateVModule();
+  cantFail(ObjLayer.add(JD, K, std::move(Obj)));
+  ES.lookup({&JD}, {Foo}, OnResolveDoNothing, OnReadyDoNothing,
+            NoDependenciesToRegister);
+  return DebugSectionSeen;
+}
+
+TEST(RTDyldObjectLinkingLayerTest, TestSetProcessAllSections) {
   LLVMContext Context;
   auto M = llvm::make_unique<Module>("", Context);
   M->setTargetTriple("x86_64-unknown-linux-gnu");
   Type *Int32Ty = IntegerType::get(Context, 32);
   GlobalVariable *GV =
-    new GlobalVariable(*M, Int32Ty, false, GlobalValue::ExternalLinkage,
+      new GlobalVariable(*M, Int32Ty, false, GlobalValue::ExternalLinkage,
                          ConstantInt::get(Int32Ty, 42), "foo");
 
   GV->setSection(".debug_str");
 
-
   // Initialize the native target in case this is the first unit test
   // to try to build a TM.
   OrcNativeTarget::initialize();
-  std::unique_ptr<TargetMachine> TM(
-    EngineBuilder().selectTarget(Triple(M->getTargetTriple()), "", "",
-                                 SmallVector<std::string, 1>()));
+  std::unique_ptr<TargetMachine> TM(EngineBuilder().selectTarget(
+      Triple(M->getTargetTriple()), "", "", SmallVector<std::string, 1>()));
   if (!TM)
     return;
 
   auto Obj = SimpleCompiler(*TM)(*M);
 
-  {
-    // Test with ProcessAllSections = false (the default).
-    auto K = ES.allocateVModule();
-    cantFail(ObjLayer.addObject(
-        K, MemoryBuffer::getMemBufferCopy(Obj->getBuffer())));
-    cantFail(ObjLayer.emitAndFinalize(K));
-    EXPECT_EQ(DebugSectionSeen, false)
-      << "Unexpected debug info section";
-    cantFail(ObjLayer.removeObject(K));
-  }
-
-  {
-    // Test with ProcessAllSections = true.
-    ObjLayer.setProcessAllSections(true);
-    auto K = ES.allocateVModule();
-    cantFail(ObjLayer.addObject(K, std::move(Obj)));
-    cantFail(ObjLayer.emitAndFinalize(K));
-    EXPECT_EQ(DebugSectionSeen, true)
-      << "Expected debug info section not seen";
-    cantFail(ObjLayer.removeObject(K));
-  }
+  EXPECT_FALSE(testSetProcessAllSections(
+      MemoryBuffer::getMemBufferCopy(Obj->getBuffer()), false))
+      << "Debug section seen despite ProcessAllSections being false";
+  EXPECT_TRUE(testSetProcessAllSections(std::move(Obj), true))
+      << "Expected to see debug section when ProcessAllSections is true";
 }
 
-TEST_F(RTDyldObjectLinkingLayerExecutionTest, NoDuplicateFinalization) {
-  if (!SupportsJIT)
-    return;
+TEST(RTDyldObjectLinkingLayerTest, TestOverrideObjectFlags) {
 
-  ExecutionSession ES;
+  OrcNativeTarget::initialize();
 
-  auto MM = std::make_shared<SectionMemoryManagerWrapper>();
+  std::unique_ptr<TargetMachine> TM(
+      EngineBuilder().selectTarget(Triple("x86_64-unknown-linux-gnu"), "", "",
+                                   SmallVector<std::string, 1>()));
 
-  std::map<orc::VModuleKey, std::shared_ptr<orc::SymbolResolver>> Resolvers;
+  if (!TM)
+    return;
 
-  RTDyldObjectLinkingLayer ObjLayer(ES, [&](VModuleKey K) {
-    auto I = Resolvers.find(K);
-    assert(I != Resolvers.end() && "Missing resolver");
-    auto R = std::move(I->second);
-    Resolvers.erase(I);
-    return RTDyldObjectLinkingLayer::Resources{MM, std::move(R)};
-  });
-  SimpleCompiler Compile(*TM);
-
-  // Create a pair of modules that will trigger recursive finalization:
-  // Module 1:
-  //   int bar() { return 42; }
-  // Module 2:
-  //   int bar();
-  //   int foo() { return bar(); }
-  //
-  // Verify that the memory manager is only finalized once (for Module 2).
-  // Failure suggests that finalize is being called on the inner RTDyld
-  // instance (for Module 1) which is unsafe, as it will prevent relocation of
-  // Module 2.
-
-  ModuleBuilder MB1(Context, "", "dummy");
-  {
-    MB1.getModule()->setDataLayout(TM->createDataLayout());
-    Function *BarImpl = MB1.createFunctionDecl<int32_t(void)>("bar");
-    BasicBlock *BarEntry = BasicBlock::Create(Context, "entry", BarImpl);
-    IRBuilder<> Builder(BarEntry);
-    IntegerType *Int32Ty = IntegerType::get(Context, 32);
-    Value *FourtyTwo = ConstantInt::getSigned(Int32Ty, 42);
-    Builder.CreateRet(FourtyTwo);
-  }
+  // Our compiler is going to modify symbol visibility settings without telling
+  // ORC. This will test our ability to override the flags later.
+  class FunkySimpleCompiler : public SimpleCompiler {
+  public:
+    FunkySimpleCompiler(TargetMachine &TM) : SimpleCompiler(TM) {}
 
-  auto Obj1 = Compile(*MB1.getModule());
+    CompileResult operator()(Module &M) {
+      auto *Foo = M.getFunction("foo");
+      assert(Foo && "Expected function Foo not found");
+      Foo->setVisibility(GlobalValue::HiddenVisibility);
+      return SimpleCompiler::operator()(M);
+    }
+  };
 
-  ModuleBuilder MB2(Context, "", "dummy");
+  // Create a module with two void() functions: foo and bar.
+  ThreadSafeContext TSCtx(llvm::make_unique<LLVMContext>());
+  ThreadSafeModule M;
   {
-    MB2.getModule()->setDataLayout(TM->createDataLayout());
-    Function *BarDecl = MB2.createFunctionDecl<int32_t(void)>("bar");
-    Function *FooImpl = MB2.createFunctionDecl<int32_t(void)>("foo");
-    BasicBlock *FooEntry = BasicBlock::Create(Context, "entry", FooImpl);
-    IRBuilder<> Builder(FooEntry);
-    Builder.CreateRet(Builder.CreateCall(BarDecl));
+    ModuleBuilder MB(*TSCtx.getContext(), TM->getTargetTriple().str(), "dummy");
+    MB.getModule()->setDataLayout(TM->createDataLayout());
+
+    Function *FooImpl = MB.createFunctionDecl<void()>("foo");
+    BasicBlock *FooEntry =
+        BasicBlock::Create(*TSCtx.getContext(), "entry", FooImpl);
+    IRBuilder<> B1(FooEntry);
+    B1.CreateRetVoid();
+
+    Function *BarImpl = MB.createFunctionDecl<void()>("bar");
+    BasicBlock *BarEntry =
+        BasicBlock::Create(*TSCtx.getContext(), "entry", BarImpl);
+    IRBuilder<> B2(BarEntry);
+    B2.CreateRetVoid();
+
+    M = ThreadSafeModule(MB.takeModule(), std::move(TSCtx));
   }
-  auto Obj2 = Compile(*MB2.getModule());
 
-  auto K1 = ES.allocateVModule();
-  Resolvers[K1] = std::make_shared<NullResolver>();
-  cantFail(ObjLayer.addObject(K1, std::move(Obj1)));
+  // Create a simple stack and set the override flags option.
+  ExecutionSession ES;
+  auto &JD = ES.createJITDylib("main");
+  auto Foo = ES.intern("foo");
+  RTDyldObjectLinkingLayer ObjLayer(
+      ES, [](VModuleKey) { return llvm::make_unique<SectionMemoryManager>(); });
+  IRCompileLayer CompileLayer(ES, ObjLayer, FunkySimpleCompiler(*TM));
 
-  auto K2 = ES.allocateVModule();
-  auto LegacyLookup = [&](const std::string &Name) {
-    return ObjLayer.findSymbol(Name, true);
-  };
+  ObjLayer.setOverrideObjectFlagsWithResponsibilityFlags(true);
 
-  Resolvers[K2] = createSymbolResolver(
-      [&](const SymbolNameSet &Symbols) {
-        return cantFail(
-            getResponsibilitySetWithLegacyFn(Symbols, LegacyLookup));
-      },
-      [&](std::shared_ptr<AsynchronousSymbolQuery> Query,
-          const SymbolNameSet &Symbols) {
-        return lookupWithLegacyFn(ES, *Query, Symbols, LegacyLookup);
-      });
-
-  cantFail(ObjLayer.addObject(K2, std::move(Obj2)));
-  cantFail(ObjLayer.emitAndFinalize(K2));
-  cantFail(ObjLayer.removeObject(K2));
-
-  // Finalization of module 2 should trigger finalization of module 1.
-  // Verify that finalize on SMMW is only called once.
-  EXPECT_EQ(MM->FinalizationCount, 1)
-      << "Extra call to finalize";
+  cantFail(CompileLayer.add(JD, ES.allocateVModule(), std::move(M)));
+  ES.lookup({&JD}, {Foo}, [](Expected<SymbolMap> R) { cantFail(std::move(R)); },
+            [](Error Err) { cantFail(std::move(Err)); },
+            NoDependenciesToRegister);
 }
 
-TEST_F(RTDyldObjectLinkingLayerExecutionTest, NoPrematureAllocation) {
-  if (!SupportsJIT)
-    return;
+TEST(RTDyldObjectLinkingLayerTest, TestAutoClaimResponsibilityForSymbols) {
 
-  ExecutionSession ES;
+  OrcNativeTarget::initialize();
 
-  auto MM = std::make_shared<SectionMemoryManagerWrapper>();
+  std::unique_ptr<TargetMachine> TM(
+      EngineBuilder().selectTarget(Triple("x86_64-unknown-linux-gnu"), "", "",
+                                   SmallVector<std::string, 1>()));
 
-  RTDyldObjectLinkingLayer ObjLayer(ES, [&MM](VModuleKey K) {
-    return RTDyldObjectLinkingLayer::Resources{
-        MM, std::make_shared<NullResolver>()};
-  });
-  SimpleCompiler Compile(*TM);
-
-  // Create a pair of unrelated modules:
-  //
-  // Module 1:
-  //   int foo() { return 42; }
-  // Module 2:
-  //   int bar() { return 7; }
-  //
-  // Both modules will share a memory manager. We want to verify that the
-  // second object is not loaded before the first one is finalized. To do this
-  // in a portable way, we abuse the
-  // RuntimeDyld::MemoryManager::needsToReserveAllocationSpace hook, which is
-  // called once per object before any sections are allocated.
-
-  ModuleBuilder MB1(Context, "", "dummy");
-  {
-    MB1.getModule()->setDataLayout(TM->createDataLayout());
-    Function *BarImpl = MB1.createFunctionDecl<int32_t(void)>("foo");
-    BasicBlock *BarEntry = BasicBlock::Create(Context, "entry", BarImpl);
-    IRBuilder<> Builder(BarEntry);
-    IntegerType *Int32Ty = IntegerType::get(Context, 32);
-    Value *FourtyTwo = ConstantInt::getSigned(Int32Ty, 42);
-    Builder.CreateRet(FourtyTwo);
-  }
+  if (!TM)
+    return;
 
-  auto Obj1 = Compile(*MB1.getModule());
+  // Our compiler is going to add a new symbol without telling ORC.
+  // This will test our ability to auto-claim responsibility later.
+  class FunkySimpleCompiler : public SimpleCompiler {
+  public:
+    FunkySimpleCompiler(TargetMachine &TM) : SimpleCompiler(TM) {}
+
+    CompileResult operator()(Module &M) {
+      Function *BarImpl =
+          Function::Create(TypeBuilder<void(), false>::get(M.getContext()),
+                           GlobalValue::ExternalLinkage, "bar", &M);
+      BasicBlock *BarEntry =
+          BasicBlock::Create(M.getContext(), "entry", BarImpl);
+      IRBuilder<> B(BarEntry);
+      B.CreateRetVoid();
+
+      return SimpleCompiler::operator()(M);
+    }
+  };
 
-  ModuleBuilder MB2(Context, "", "dummy");
+  // Create a module with two void() functions: foo and bar.
+  ThreadSafeContext TSCtx(llvm::make_unique<LLVMContext>());
+  ThreadSafeModule M;
   {
-    MB2.getModule()->setDataLayout(TM->createDataLayout());
-    Function *BarImpl = MB2.createFunctionDecl<int32_t(void)>("bar");
-    BasicBlock *BarEntry = BasicBlock::Create(Context, "entry", BarImpl);
-    IRBuilder<> Builder(BarEntry);
-    IntegerType *Int32Ty = IntegerType::get(Context, 32);
-    Value *Seven = ConstantInt::getSigned(Int32Ty, 7);
-    Builder.CreateRet(Seven);
-  }
-  auto Obj2 = Compile(*MB2.getModule());
+    ModuleBuilder MB(*TSCtx.getContext(), TM->getTargetTriple().str(), "dummy");
+    MB.getModule()->setDataLayout(TM->createDataLayout());
 
-  auto K = ES.allocateVModule();
-  cantFail(ObjLayer.addObject(K, std::move(Obj1)));
-  cantFail(ObjLayer.addObject(ES.allocateVModule(), std::move(Obj2)));
-  cantFail(ObjLayer.emitAndFinalize(K));
-  cantFail(ObjLayer.removeObject(K));
-
-  // Only one call to needsToReserveAllocationSpace should have been made.
-  EXPECT_EQ(MM->NeedsToReserveAllocationSpaceCount, 1)
-      << "More than one call to needsToReserveAllocationSpace "
-         "(multiple unrelated objects loaded prior to finalization)";
-}
+    Function *FooImpl = MB.createFunctionDecl<void()>("foo");
+    BasicBlock *FooEntry =
+        BasicBlock::Create(*TSCtx.getContext(), "entry", FooImpl);
+    IRBuilder<> B(FooEntry);
+    B.CreateRetVoid();
 
-TEST_F(RTDyldObjectLinkingLayerExecutionTest, TestNotifyLoadedSignature) {
+    M = ThreadSafeModule(MB.takeModule(), std::move(TSCtx));
+  }
+
+  // Create a simple stack and set the override flags option.
   ExecutionSession ES;
+  auto &JD = ES.createJITDylib("main");
+  auto Foo = ES.intern("foo");
   RTDyldObjectLinkingLayer ObjLayer(
-      ES,
-      [](VModuleKey) {
-        return RTDyldObjectLinkingLayer::Resources{
-            nullptr, std::make_shared<NullResolver>()};
-      },
-      [](VModuleKey, const object::ObjectFile &obj,
-         const RuntimeDyld::LoadedObjectInfo &info) {});
+      ES, [](VModuleKey) { return llvm::make_unique<SectionMemoryManager>(); });
+  IRCompileLayer CompileLayer(ES, ObjLayer, FunkySimpleCompiler(*TM));
+
+  ObjLayer.setAutoClaimResponsibilityForObjectSymbols(true);
+
+  cantFail(CompileLayer.add(JD, ES.allocateVModule(), std::move(M)));
+  ES.lookup({&JD}, {Foo}, [](Expected<SymbolMap> R) { cantFail(std::move(R)); },
+            [](Error Err) { cantFail(std::move(Err)); },
+            NoDependenciesToRegister);
 }
 
 } // end anonymous namespace
-- 
GitLab


From 3926274437d3fa4050a03e06074535388e1d7e9f Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Mon, 15 Oct 2018 23:34:58 +0000
Subject: [PATCH 0229/1116] [X86] Remove some isel patterns that shouldn't be
 possible.

These included a bitcast of a load from v4f32 to v2f64, but DAG combine should have already changed the type of the load to remove the cast.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344573 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86InstrAVX512.td | 2 --
 lib/Target/X86/X86InstrSSE.td    | 4 ----
 2 files changed, 6 deletions(-)

diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 158aba447ed..f617de7dd7d 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -4421,8 +4421,6 @@ let Predicates = [HasAVX512] in {
             (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
   def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
             (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
-  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
-            (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
   def : Pat<(v2f64 (X86vzload addr:$src)),
             (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
 
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index b3c639f4f0c..8a836d8c173 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -265,8 +265,6 @@ let Predicates = [UseAVX] in {
             (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
   def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
             (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
-  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
-            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
   def : Pat<(v2f64 (X86vzload addr:$src)),
             (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
 
@@ -349,8 +347,6 @@ let Predicates = [UseSSE2] in {
             (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
   def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
             (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
-  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
-            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
   def : Pat<(v2f64 (X86vzload addr:$src)),
             (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
 }
-- 
GitLab


From 2fa550c88a10d1fd710493b0fd6885bfe4d75f88 Mon Sep 17 00:00:00 2001
From: Heejin Ahn <aheejin@gmail.com>
Date: Tue, 16 Oct 2018 00:09:12 +0000
Subject: [PATCH 0230/1116] [WebAssembly] LSDA info generation

Summary:
This adds support for LSDA (exception table) generation for wasm EH.
Wasm EH mostly follows the structure of Itanium-style exception tables,
with one exception: a call site table entry in wasm EH corresponds to
not a call site but a landing pad.

In wasm EH, the VM is responsible for stack unwinding. After an
exception occurs and the stack is unwound, the control flow is
transferred to wasm 'catch' instruction by the VM, after which the
personality function is called from the compiler-generated code. (Refer
to WasmEHPrepare pass for more information on this part.)

This patch:
- Changes wasm.landingpad.index intrinsic to take a token argument, to
make this 1:1 match with a catchpad instruction
- Stores landingpad index info and catch type info MachineFunction in
before instruction selection
- Lowers wasm.lsda intrinsic to an MCSymbol pointing to the start of an
exception table
- Adds WasmException class with overridden methods for table generation
- Adds support for LSDA section in Wasm object writer

Reviewers: dschuff, sbc100, rnk

Subscribers: mgorny, jgravelle-google, sunfish, llvm-commits

Differential Revision: https://reviews.llvm.org/D52748

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344575 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/MachineFunction.h        |  22 +-
 include/llvm/IR/IntrinsicsWebAssembly.td      |   3 +-
 lib/CodeGen/AsmPrinter/AsmPrinter.cpp         |   3 +-
 lib/CodeGen/AsmPrinter/CMakeLists.txt         |   1 +
 lib/CodeGen/AsmPrinter/EHStreamer.cpp         |  10 +-
 lib/CodeGen/AsmPrinter/EHStreamer.h           |  11 +-
 lib/CodeGen/AsmPrinter/WasmException.cpp      |  81 ++++++
 lib/CodeGen/AsmPrinter/WasmException.h        |  42 +++
 lib/CodeGen/MachineFunction.cpp               |  45 ++--
 .../SelectionDAG/SelectionDAGBuilder.cpp      |  10 +-
 lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp |  83 ++++--
 lib/CodeGen/TargetLoweringObjectFileImpl.cpp  |   4 +
 lib/CodeGen/WasmEHPrepare.cpp                 |   2 +-
 lib/MC/MCObjectFileInfo.cpp                   |   6 +
 lib/MC/WasmObjectWriter.cpp                   |   4 +-
 .../WebAssembly/WebAssemblyISelLowering.cpp   |  15 +-
 .../WebAssembly/WebAssemblyInstrInfo.td       |   2 +
 .../WebAssembly/WebAssemblyMCInstLower.cpp    |   7 +
 test/CodeGen/WebAssembly/eh-lsda.ll           | 239 ++++++++++++++++++
 test/CodeGen/WebAssembly/wasmehprepare.ll     |   6 +-
 20 files changed, 529 insertions(+), 67 deletions(-)
 create mode 100644 lib/CodeGen/AsmPrinter/WasmException.cpp
 create mode 100644 lib/CodeGen/AsmPrinter/WasmException.h
 create mode 100644 test/CodeGen/WebAssembly/eh-lsda.ll

diff --git a/include/llvm/CodeGen/MachineFunction.h b/include/llvm/CodeGen/MachineFunction.h
index 7471b314846..bc81e485a80 100644
--- a/include/llvm/CodeGen/MachineFunction.h
+++ b/include/llvm/CodeGen/MachineFunction.h
@@ -316,6 +316,9 @@ class MachineFunction {
   /// Map a landing pad's EH symbol to the call site indexes.
   DenseMap<MCSymbol*, SmallVector<unsigned, 4>> LPadToCallSiteMap;
 
+  /// Map a landing pad to its index.
+  DenseMap<const MachineBasicBlock *, unsigned> WasmLPadToIndexMap;
+
   /// Map of invoke call site index values to associated begin EH_LABEL.
   DenseMap<MCSymbol*, unsigned> CallSiteMap;
 
@@ -810,7 +813,8 @@ public:
   LandingPadInfo &getOrCreateLandingPadInfo(MachineBasicBlock *LandingPad);
 
   /// Remap landing pad labels and remove any deleted landing pads.
-  void tidyLandingPads(DenseMap<MCSymbol*, uintptr_t> *LPMap = nullptr);
+  void tidyLandingPads(DenseMap<MCSymbol *, uintptr_t> *LPMap = nullptr,
+                       bool TidyIfNoBeginLabels = true);
 
   /// Return a reference to the landing pad info for the current function.
   const std::vector<LandingPadInfo> &getLandingPads() const {
@@ -853,6 +857,22 @@ public:
   /// Map the landing pad's EH symbol to the call site indexes.
   void setCallSiteLandingPad(MCSymbol *Sym, ArrayRef<unsigned> Sites);
 
+  /// Map the landing pad to its index. Used for Wasm exception handling.
+  void setWasmLandingPadIndex(const MachineBasicBlock *LPad, unsigned Index) {
+    WasmLPadToIndexMap[LPad] = Index;
+  }
+
+  /// Returns true if the landing pad has an associate index in wasm EH.
+  bool hasWasmLandingPadIndex(const MachineBasicBlock *LPad) const {
+    return WasmLPadToIndexMap.count(LPad);
+  }
+
+  /// Get the index in wasm EH for a given landing pad.
+  unsigned getWasmLandingPadIndex(const MachineBasicBlock *LPad) const {
+    assert(hasWasmLandingPadIndex(LPad));
+    return WasmLPadToIndexMap.lookup(LPad);
+  }
+
   /// Get the call site indexes for a landing pad EH symbol.
   SmallVectorImpl<unsigned> &getCallSiteLandingPad(MCSymbol *Sym) {
     assert(hasCallSiteLandingPad(Sym) &&
diff --git a/include/llvm/IR/IntrinsicsWebAssembly.td b/include/llvm/IR/IntrinsicsWebAssembly.td
index adf7cb0ba0e..9aa2a4ebeca 100644
--- a/include/llvm/IR/IntrinsicsWebAssembly.td
+++ b/include/llvm/IR/IntrinsicsWebAssembly.td
@@ -71,7 +71,8 @@ def int_wasm_catch : Intrinsic<[llvm_ptr_ty], [llvm_i32_ty],
 // WebAssembly EH must maintain the landingpads in the order assigned to them
 // by WasmEHPrepare pass to generate landingpad table in EHStreamer. This is
 // used in order to give them the indices in WasmEHPrepare.
-def int_wasm_landingpad_index: Intrinsic<[], [llvm_i32_ty], [IntrNoMem]>;
+def int_wasm_landingpad_index: Intrinsic<[], [llvm_token_ty, llvm_i32_ty],
+                                         [IntrNoMem]>;
 
 // Returns LSDA address of the current function.
 def int_wasm_lsda : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>;
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 63c5b262edc..526f7ce3083 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -16,6 +16,7 @@
 #include "CodeViewDebug.h"
 #include "DwarfDebug.h"
 #include "DwarfException.h"
+#include "WasmException.h"
 #include "WinCFGuard.h"
 #include "WinException.h"
 #include "llvm/ADT/APFloat.h"
@@ -356,7 +357,7 @@ bool AsmPrinter::doInitialization(Module &M) {
     }
     break;
   case ExceptionHandling::Wasm:
-    // TODO to prevent warning
+    ES = new WasmException(this);
     break;
   }
   if (ES)
diff --git a/lib/CodeGen/AsmPrinter/CMakeLists.txt b/lib/CodeGen/AsmPrinter/CMakeLists.txt
index 6cba4a0d4b8..3fb088ab6f0 100644
--- a/lib/CodeGen/AsmPrinter/CMakeLists.txt
+++ b/lib/CodeGen/AsmPrinter/CMakeLists.txt
@@ -23,6 +23,7 @@ add_llvm_library(LLVMAsmPrinter
   WinCFGuard.cpp
   WinException.cpp
   CodeViewDebug.cpp
+  WasmException.cpp
 
   DEPENDS
   intrinsics_gen
diff --git a/lib/CodeGen/AsmPrinter/EHStreamer.cpp b/lib/CodeGen/AsmPrinter/EHStreamer.cpp
index be04b9a6e8c..7599121de2b 100644
--- a/lib/CodeGen/AsmPrinter/EHStreamer.cpp
+++ b/lib/CodeGen/AsmPrinter/EHStreamer.cpp
@@ -345,7 +345,9 @@ computeCallSiteTable(SmallVectorImpl<CallSiteEntry> &CallSites,
 ///     unwound and handling continues.
 ///  3. Type ID table contains references to all the C++ typeinfo for all
 ///     catches in the function.  This tables is reverse indexed base 1.
-void EHStreamer::emitExceptionTable() {
+///
+/// Returns the starting symbol of an exception table.
+MCSymbol *EHStreamer::emitExceptionTable() {
   const MachineFunction *MF = Asm->MF;
   const std::vector<const GlobalValue *> &TypeInfos = MF->getTypeInfos();
   const std::vector<unsigned> &FilterIds = MF->getFilterIds();
@@ -375,6 +377,7 @@ void EHStreamer::emitExceptionTable() {
   computeCallSiteTable(CallSites, LandingPads, FirstActions);
 
   bool IsSJLJ = Asm->MAI->getExceptionHandlingType() == ExceptionHandling::SjLj;
+  bool IsWasm = Asm->MAI->getExceptionHandlingType() == ExceptionHandling::Wasm;
   unsigned CallSiteEncoding =
       IsSJLJ ? dwarf::DW_EH_PE_udata4 : dwarf::DW_EH_PE_uleb128;
   bool HaveTTData = !TypeInfos.empty() || !FilterIds.empty();
@@ -457,8 +460,8 @@ void EHStreamer::emitExceptionTable() {
   Asm->EmitLabelDifferenceAsULEB128(CstEndLabel, CstBeginLabel);
   Asm->OutStreamer->EmitLabel(CstBeginLabel);
 
-  // SjLj Exception handling
-  if (IsSJLJ) {
+  // SjLj / Wasm Exception handling
+  if (IsSJLJ || IsWasm) {
     unsigned idx = 0;
     for (SmallVectorImpl<CallSiteEntry>::const_iterator
          I = CallSites.begin(), E = CallSites.end(); I != E; ++I, ++idx) {
@@ -604,6 +607,7 @@ void EHStreamer::emitExceptionTable() {
   }
 
   Asm->EmitAlignment(2);
+  return GCCETSym;
 }
 
 void EHStreamer::emitTypeInfos(unsigned TTypeEncoding, MCSymbol *TTBaseLabel) {
diff --git a/lib/CodeGen/AsmPrinter/EHStreamer.h b/lib/CodeGen/AsmPrinter/EHStreamer.h
index b89421a1e06..e3a6f8e9d58 100644
--- a/lib/CodeGen/AsmPrinter/EHStreamer.h
+++ b/lib/CodeGen/AsmPrinter/EHStreamer.h
@@ -85,9 +85,10 @@ protected:
   /// zero for the landing pad and the action.  Calls marked 'nounwind' have
   /// no entry and must not be contained in the try-range of any entry - they
   /// form gaps in the table.  Entries must be ordered by try-range address.
-  void computeCallSiteTable(SmallVectorImpl<CallSiteEntry> &CallSites,
-                            const SmallVectorImpl<const LandingPadInfo *> &LandingPads,
-                            const SmallVectorImpl<unsigned> &FirstActions);
+  virtual void computeCallSiteTable(
+      SmallVectorImpl<CallSiteEntry> &CallSites,
+      const SmallVectorImpl<const LandingPadInfo *> &LandingPads,
+      const SmallVectorImpl<unsigned> &FirstActions);
 
   /// Emit landing pads and actions.
   ///
@@ -108,7 +109,9 @@ protected:
   ///     found the frame is unwound and handling continues.
   ///  3. Type id table contains references to all the C++ typeinfo for all
   ///     catches in the function.  This tables is reversed indexed base 1.
-  void emitExceptionTable();
+  ///
+  /// Returns the starting symbol of an exception table.
+  MCSymbol *emitExceptionTable();
 
   virtual void emitTypeInfos(unsigned TTypeEncoding, MCSymbol *TTBaseLabel);
 
diff --git a/lib/CodeGen/AsmPrinter/WasmException.cpp b/lib/CodeGen/AsmPrinter/WasmException.cpp
new file mode 100644
index 00000000000..46745d08c9f
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/WasmException.cpp
@@ -0,0 +1,81 @@
+//===-- CodeGen/AsmPrinter/WasmException.cpp - Wasm Exception Impl --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing WebAssembly exception info into asm
+// files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "WasmException.h"
+#include "llvm/MC/MCStreamer.h"
+using namespace llvm;
+
+void WasmException::markFunctionEnd() {
+  // Get rid of any dead landing pads.
+  if (!Asm->MF->getLandingPads().empty()) {
+    auto *NonConstMF = const_cast<MachineFunction *>(Asm->MF);
+    // Wasm does not set BeginLabel and EndLabel information for landing pads,
+    // so we should set the second argument false.
+    NonConstMF->tidyLandingPads(nullptr, /* TidyIfNoBeginLabels */ false);
+  }
+}
+
+void WasmException::endFunction(const MachineFunction *MF) {
+  bool ShouldEmitExceptionTable = false;
+  for (const LandingPadInfo &Info : MF->getLandingPads()) {
+    if (MF->hasWasmLandingPadIndex(Info.LandingPadBlock)) {
+      ShouldEmitExceptionTable = true;
+      break;
+    }
+  }
+  if (!ShouldEmitExceptionTable)
+    return;
+  MCSymbol *LSDALabel = emitExceptionTable();
+  assert(LSDALabel && ".GCC_exception_table has not been emitted!");
+
+  // Wasm requires every data section symbol to have a .size set. So we emit an
+  // end marker and set the size as the difference between the start end the end
+  // marker.
+  MCSymbol *LSDAEndLabel = Asm->createTempSymbol("GCC_except_table_end");
+  Asm->OutStreamer->EmitLabel(LSDAEndLabel);
+  MCContext &OutContext = Asm->OutStreamer->getContext();
+  const MCExpr *SizeExp = MCBinaryExpr::createSub(
+      MCSymbolRefExpr::create(LSDAEndLabel, OutContext),
+      MCSymbolRefExpr::create(LSDALabel, OutContext), OutContext);
+  Asm->OutStreamer->emitELFSize(LSDALabel, SizeExp);
+}
+
+// Compute the call-site table for wasm EH. Even though we use the same function
+// name to share the common routines, a call site entry in the table corresponds
+// to not a call site for possibly-throwing functions but a landing pad. In wasm
+// EH the VM is responsible for stack unwinding. After an exception occurs and
+// the stack is unwound, the control flow is transferred to wasm 'catch'
+// instruction by the VM, after which the personality function is called from
+// the compiler-generated code. Refer to WasmEHPrepare pass for more
+// information.
+void WasmException::computeCallSiteTable(
+    SmallVectorImpl<CallSiteEntry> &CallSites,
+    const SmallVectorImpl<const LandingPadInfo *> &LandingPads,
+    const SmallVectorImpl<unsigned> &FirstActions) {
+  MachineFunction &MF = *Asm->MF;
+  for (unsigned I = 0, N = LandingPads.size(); I < N; ++I) {
+    const LandingPadInfo *Info = LandingPads[I];
+    MachineBasicBlock *LPad = Info->LandingPadBlock;
+    // We don't emit LSDA for single catch (...).
+    if (!MF.hasWasmLandingPadIndex(LPad))
+      continue;
+    // Wasm EH must maintain the EH pads in the order assigned to them by the
+    // WasmEHPrepare pass.
+    unsigned LPadIndex = MF.getWasmLandingPadIndex(LPad);
+    CallSiteEntry Site = {nullptr, nullptr, Info, FirstActions[I]};
+    if (CallSites.size() < LPadIndex + 1)
+      CallSites.resize(LPadIndex + 1);
+    CallSites[LPadIndex] = Site;
+  }
+}
diff --git a/lib/CodeGen/AsmPrinter/WasmException.h b/lib/CodeGen/AsmPrinter/WasmException.h
new file mode 100644
index 00000000000..09a9a25ce8d
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/WasmException.h
@@ -0,0 +1,42 @@
+//===-- WasmException.h - Wasm Exception Framework -------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing WebAssembly exception info into asm
+// files.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_WASMEXCEPTION_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_WASMEXCEPTION_H
+
+#include "EHStreamer.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+
+namespace llvm {
+
+class LLVM_LIBRARY_VISIBILITY WasmException : public EHStreamer {
+public:
+  WasmException(AsmPrinter *A) : EHStreamer(A) {}
+
+  void endModule() override {}
+  void beginFunction(const MachineFunction *MF) override {}
+  virtual void markFunctionEnd() override;
+  void endFunction(const MachineFunction *MF) override;
+
+protected:
+  // Compute the call site table for wasm EH.
+  void computeCallSiteTable(
+      SmallVectorImpl<CallSiteEntry> &CallSites,
+      const SmallVectorImpl<const LandingPadInfo *> &LandingPads,
+      const SmallVectorImpl<unsigned> &FirstActions) override;
+};
+
+} // End of namespace llvm
+
+#endif
diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp
index 431484f078b..9e4963c4bdb 100644
--- a/lib/CodeGen/MachineFunction.cpp
+++ b/lib/CodeGen/MachineFunction.cpp
@@ -661,8 +661,11 @@ MCSymbol *MachineFunction::addLandingPad(MachineBasicBlock *LandingPad) {
       }
     }
 
-  } else if (isa<CatchPadInst>(FirstI)) {
-    // TODO
+  } else if (const auto *CPI = dyn_cast<CatchPadInst>(FirstI)) {
+    for (unsigned I = CPI->getNumArgOperands(); I != 0; --I) {
+      Value *TypeInfo = CPI->getArgOperand(I - 1)->stripPointerCasts();
+      addCatchTypeInfo(LandingPad, dyn_cast<GlobalValue>(TypeInfo));
+    }
 
   } else {
     assert(isa<CleanupPadInst>(FirstI) && "Invalid landingpad!");
@@ -687,7 +690,8 @@ void MachineFunction::addFilterTypeInfo(MachineBasicBlock *LandingPad,
   LP.TypeIds.push_back(getFilterIDFor(IdsInFilter));
 }
 
-void MachineFunction::tidyLandingPads(DenseMap<MCSymbol*, uintptr_t> *LPMap) {
+void MachineFunction::tidyLandingPads(DenseMap<MCSymbol *, uintptr_t> *LPMap,
+                                      bool TidyIfNoBeginLabels) {
   for (unsigned i = 0; i != LandingPads.size(); ) {
     LandingPadInfo &LandingPad = LandingPads[i];
     if (LandingPad.LandingPadLabel &&
@@ -702,24 +706,25 @@ void MachineFunction::tidyLandingPads(DenseMap<MCSymbol*, uintptr_t> *LPMap) {
       continue;
     }
 
-    for (unsigned j = 0, e = LandingPads[i].BeginLabels.size(); j != e; ++j) {
-      MCSymbol *BeginLabel = LandingPad.BeginLabels[j];
-      MCSymbol *EndLabel = LandingPad.EndLabels[j];
-      if ((BeginLabel->isDefined() ||
-           (LPMap && (*LPMap)[BeginLabel] != 0)) &&
-          (EndLabel->isDefined() ||
-           (LPMap && (*LPMap)[EndLabel] != 0))) continue;
-
-      LandingPad.BeginLabels.erase(LandingPad.BeginLabels.begin() + j);
-      LandingPad.EndLabels.erase(LandingPad.EndLabels.begin() + j);
-      --j;
-      --e;
-    }
+    if (TidyIfNoBeginLabels) {
+      for (unsigned j = 0, e = LandingPads[i].BeginLabels.size(); j != e; ++j) {
+        MCSymbol *BeginLabel = LandingPad.BeginLabels[j];
+        MCSymbol *EndLabel = LandingPad.EndLabels[j];
+        if ((BeginLabel->isDefined() || (LPMap && (*LPMap)[BeginLabel] != 0)) &&
+            (EndLabel->isDefined() || (LPMap && (*LPMap)[EndLabel] != 0)))
+          continue;
+
+        LandingPad.BeginLabels.erase(LandingPad.BeginLabels.begin() + j);
+        LandingPad.EndLabels.erase(LandingPad.EndLabels.begin() + j);
+        --j;
+        --e;
+      }
 
-    // Remove landing pads with no try-ranges.
-    if (LandingPads[i].BeginLabels.empty()) {
-      LandingPads.erase(LandingPads.begin() + i);
-      continue;
+      // Remove landing pads with no try-ranges.
+      if (LandingPads[i].BeginLabels.empty()) {
+        LandingPads.erase(LandingPads.begin() + i);
+        continue;
+      }
     }
 
     // If there is no landing pad, ensure that the list of typeids is empty.
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 1a99ef734f1..3907f647142 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6282,12 +6282,12 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     return nullptr;
   }
 
-  case Intrinsic::wasm_landingpad_index: {
-    // TODO store landing pad index in a map, which will be used when generating
-    // LSDA information
+  case Intrinsic::wasm_landingpad_index:
+    // Information this intrinsic contained has been transferred to
+    // MachineFunction in SelectionDAGISel::PrepareEHLandingPad. We can safely
+    // delete it now.
     return nullptr;
   }
-  }
 }
 
 void SelectionDAGBuilder::visitConstrainedFPIntrinsic(
@@ -6444,7 +6444,7 @@ SelectionDAGBuilder::lowerInvokable(TargetLowering::CallLoweringInfo &CLI,
       WinEHFuncInfo *EHInfo = DAG.getMachineFunction().getWinEHFuncInfo();
       EHInfo->addIPToStateRange(cast<InvokeInst>(CLI.CS.getInstruction()),
                                 BeginLabel, EndLabel);
-    } else {
+    } else if (!isScopedEHPersonality(Pers)) {
       MF.addInvoke(FuncInfo.MBBMap[EHPadBB], BeginLabel, EndLabel);
     }
   }
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 2b4a590f19f..90bcaa653c3 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -27,6 +27,7 @@
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/BranchProbabilityInfo.h"
 #include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/EHPersonalities.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
@@ -1128,6 +1129,36 @@ static bool hasExceptionPointerOrCodeUser(const CatchPadInst *CPI) {
   return false;
 }
 
+// wasm.landingpad.index intrinsic is for associating a landing pad index number
+// with a catchpad instruction. Retrieve the landing pad index in the intrinsic
+// and store the mapping in the function.
+static void mapWasmLandingPadIndex(MachineBasicBlock *MBB,
+                                   const CatchPadInst *CPI) {
+  MachineFunction *MF = MBB->getParent();
+  // In case of single catch (...), we don't emit LSDA, so we don't need
+  // this information.
+  bool IsSingleCatchAllClause =
+      CPI->getNumArgOperands() == 1 &&
+      cast<Constant>(CPI->getArgOperand(0))->isNullValue();
+  if (!IsSingleCatchAllClause) {
+    // Create a mapping from landing pad label to landing pad index.
+    bool IntrFound = false;
+    for (const User *U : CPI->users()) {
+      if (const auto *Call = dyn_cast<IntrinsicInst>(U)) {
+        Intrinsic::ID IID = Call->getIntrinsicID();
+        if (IID == Intrinsic::wasm_landingpad_index) {
+          Value *IndexArg = Call->getArgOperand(1);
+          int Index = cast<ConstantInt>(IndexArg)->getZExtValue();
+          MF->setWasmLandingPadIndex(MBB, Index);
+          IntrFound = true;
+          break;
+        }
+      }
+    }
+    assert(IntrFound && "wasm.landingpad.index intrinsic not found!");
+  }
+}
+
 /// PrepareEHLandingPad - Emit an EH_LABEL, set up live-in registers, and
 /// do other setup for EH landing-pad blocks.
 bool SelectionDAGISel::PrepareEHLandingPad() {
@@ -1137,44 +1168,48 @@ bool SelectionDAGISel::PrepareEHLandingPad() {
   const TargetRegisterClass *PtrRC =
       TLI->getRegClassFor(TLI->getPointerTy(CurDAG->getDataLayout()));
 
+  auto Pers = classifyEHPersonality(PersonalityFn);
+
   // Catchpads have one live-in register, which typically holds the exception
   // pointer or code.
-  if (const auto *CPI = dyn_cast<CatchPadInst>(LLVMBB->getFirstNonPHI())) {
-    if (hasExceptionPointerOrCodeUser(CPI)) {
-      // Get or create the virtual register to hold the pointer or code.  Mark
-      // the live in physreg and copy into the vreg.
-      MCPhysReg EHPhysReg = TLI->getExceptionPointerRegister(PersonalityFn);
-      assert(EHPhysReg && "target lacks exception pointer register");
-      MBB->addLiveIn(EHPhysReg);
-      unsigned VReg = FuncInfo->getCatchPadExceptionPointerVReg(CPI, PtrRC);
-      BuildMI(*MBB, FuncInfo->InsertPt, SDB->getCurDebugLoc(),
-              TII->get(TargetOpcode::COPY), VReg)
-          .addReg(EHPhysReg, RegState::Kill);
+  if (isFuncletEHPersonality(Pers)) {
+    if (const auto *CPI = dyn_cast<CatchPadInst>(LLVMBB->getFirstNonPHI())) {
+      if (hasExceptionPointerOrCodeUser(CPI)) {
+        // Get or create the virtual register to hold the pointer or code.  Mark
+        // the live in physreg and copy into the vreg.
+        MCPhysReg EHPhysReg = TLI->getExceptionPointerRegister(PersonalityFn);
+        assert(EHPhysReg && "target lacks exception pointer register");
+        MBB->addLiveIn(EHPhysReg);
+        unsigned VReg = FuncInfo->getCatchPadExceptionPointerVReg(CPI, PtrRC);
+        BuildMI(*MBB, FuncInfo->InsertPt, SDB->getCurDebugLoc(),
+                TII->get(TargetOpcode::COPY), VReg)
+            .addReg(EHPhysReg, RegState::Kill);
+      }
     }
     return true;
   }
 
-  if (!LLVMBB->isLandingPad())
-    return true;
-
   // Add a label to mark the beginning of the landing pad.  Deletion of the
   // landing pad can thus be detected via the MachineModuleInfo.
   MCSymbol *Label = MF->addLandingPad(MBB);
 
-  // Assign the call site to the landing pad's begin label.
-  MF->setCallSiteLandingPad(Label, SDB->LPadToCallSiteMap[MBB]);
-
   const MCInstrDesc &II = TII->get(TargetOpcode::EH_LABEL);
   BuildMI(*MBB, FuncInfo->InsertPt, SDB->getCurDebugLoc(), II)
     .addSym(Label);
 
-  // Mark exception register as live in.
-  if (unsigned Reg = TLI->getExceptionPointerRegister(PersonalityFn))
-    FuncInfo->ExceptionPointerVirtReg = MBB->addLiveIn(Reg, PtrRC);
-
-  // Mark exception selector register as live in.
-  if (unsigned Reg = TLI->getExceptionSelectorRegister(PersonalityFn))
-    FuncInfo->ExceptionSelectorVirtReg = MBB->addLiveIn(Reg, PtrRC);
+  if (Pers == EHPersonality::Wasm_CXX) {
+    if (const auto *CPI = dyn_cast<CatchPadInst>(LLVMBB->getFirstNonPHI()))
+      mapWasmLandingPadIndex(MBB, CPI);
+  } else {
+    // Assign the call site to the landing pad's begin label.
+    MF->setCallSiteLandingPad(Label, SDB->LPadToCallSiteMap[MBB]);
+    // Mark exception register as live in.
+    if (unsigned Reg = TLI->getExceptionPointerRegister(PersonalityFn))
+      FuncInfo->ExceptionPointerVirtReg = MBB->addLiveIn(Reg, PtrRC);
+    // Mark exception selector register as live in.
+    if (unsigned Reg = TLI->getExceptionSelectorRegister(PersonalityFn))
+      FuncInfo->ExceptionSelectorVirtReg = MBB->addLiveIn(Reg, PtrRC);
+  }
 
   return true;
 }
diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index b046cd81d6c..341ab927861 100644
--- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -1748,6 +1748,10 @@ const MCExpr *TargetLoweringObjectFileWasm::lowerRelativeReference(
 void TargetLoweringObjectFileWasm::InitializeWasm() {
   StaticCtorSection =
       getContext().getWasmSection(".init_array", SectionKind::getData());
+
+  // We don't use PersonalityEncoding and LSDAEncoding because we don't emit
+  // .cfi directives. We use TTypeEncoding to encode typeinfo global variables.
+  TTypeEncoding = dwarf::DW_EH_PE_absptr;
 }
 
 MCSection *TargetLoweringObjectFileWasm::getStaticCtorSection(
diff --git a/lib/CodeGen/WasmEHPrepare.cpp b/lib/CodeGen/WasmEHPrepare.cpp
index 83d04da5dd0..6f02a05f561 100644
--- a/lib/CodeGen/WasmEHPrepare.cpp
+++ b/lib/CodeGen/WasmEHPrepare.cpp
@@ -300,7 +300,7 @@ void WasmEHPrepare::prepareEHPad(BasicBlock *BB, unsigned Index) {
   // This is to create a map of <landingpad EH label, landingpad index> in
   // SelectionDAGISel, which is to be used in EHStreamer to emit LSDA tables.
   // Pseudocode: wasm.landingpad.index(Index);
-  IRB.CreateCall(LPadIndexF, IRB.getInt32(Index));
+  IRB.CreateCall(LPadIndexF, {FPI, IRB.getInt32(Index)});
 
   // Pseudocode: __wasm_lpad_context.lpad_index = index;
   IRB.CreateStore(IRB.getInt32(Index), LPadIndexField);
diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp
index edfccfcb9ed..b1e03f8efee 100644
--- a/lib/MC/MCObjectFileInfo.cpp
+++ b/lib/MC/MCObjectFileInfo.cpp
@@ -743,6 +743,12 @@ void MCObjectFileInfo::initWasmMCObjectFileInfo(const Triple &T) {
   DwarfPubNamesSection = Ctx->getWasmSection(".debug_pubnames", SectionKind::getMetadata());
   DwarfPubTypesSection = Ctx->getWasmSection(".debug_pubtypes", SectionKind::getMetadata());
 
+  // Wasm use data section for LSDA.
+  // TODO Consider putting each function's exception table in a separate
+  // section, as in -function-sections, to facilitate lld's --gc-section.
+  LSDASection = Ctx->getWasmSection(".rodata.gcc_except_table",
+                                    SectionKind::getReadOnlyWithRel());
+
   // TODO: Define more sections.
 }
 
diff --git a/lib/MC/WasmObjectWriter.cpp b/lib/MC/WasmObjectWriter.cpp
index cbbe161ae82..f9318ad5801 100644
--- a/lib/MC/WasmObjectWriter.cpp
+++ b/lib/MC/WasmObjectWriter.cpp
@@ -635,10 +635,12 @@ static void addData(SmallVectorImpl<char> &DataBytes,
         llvm_unreachable("The fill should be an assembler constant");
       DataBytes.insert(DataBytes.end(), Fill->getValueSize() * NumValues,
                        Fill->getValue());
+    } else if (auto *LEB = dyn_cast<MCLEBFragment>(&Frag)) {
+      const SmallVectorImpl<char> &Contents = LEB->getContents();
+      DataBytes.insert(DataBytes.end(), Contents.begin(), Contents.end());
     } else {
       const auto &DataFrag = cast<MCDataFragment>(Frag);
       const SmallVectorImpl<char> &Contents = DataFrag.getContents();
-
       DataBytes.insert(DataBytes.end(), Contents.begin(), Contents.end());
     }
   }
diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 30c2e843408..080bfe771a4 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -21,6 +21,7 @@
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/IR/DiagnosticInfo.h"
@@ -966,9 +967,17 @@ WebAssemblyTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   default:
     return {}; // Don't custom lower most intrinsics.
 
-  case Intrinsic::wasm_lsda:
-    // TODO For now, just return 0 not to crash
-    return DAG.getConstant(0, DL, Op.getValueType());
+  case Intrinsic::wasm_lsda: {
+    MachineFunction &MF = DAG.getMachineFunction();
+    EVT VT = Op.getValueType();
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+    auto &Context = MF.getMMI().getContext();
+    MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
+                                            Twine(MF.getFunctionNumber()));
+    return DAG.getNode(WebAssemblyISD::Wrapper, DL, VT,
+                       DAG.getMCSymbol(S, PtrVT));
+  }
   }
 }
 
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
index 8d98510c67d..4acad5f5943 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -269,6 +269,8 @@ def : Pat<(i32 (WebAssemblywrapper tglobaladdr:$addr)),
           (CONST_I32 tglobaladdr:$addr)>;
 def : Pat<(i32 (WebAssemblywrapper texternalsym:$addr)),
           (CONST_I32 texternalsym:$addr)>;
+def : Pat<(i32 (WebAssemblywrapper mcsym:$sym)), (CONST_I32 mcsym:$sym)>;
+def : Pat<(i64 (WebAssemblywrapper mcsym:$sym)), (CONST_I64 mcsym:$sym)>;
 
 //===----------------------------------------------------------------------===//
 // Additional sets of instructions.
diff --git a/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
index e9a0cf51905..15b3da4c8b8 100644
--- a/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
@@ -226,6 +226,13 @@ void WebAssemblyMCInstLower::Lower(const MachineInstr *MI,
           (MO.getTargetFlags() & WebAssemblyII::MO_SYMBOL_FUNCTION) != 0,
           (MO.getTargetFlags() & WebAssemblyII::MO_SYMBOL_GLOBAL) != 0);
       break;
+    case MachineOperand::MO_MCSymbol:
+      // This is currently used only for LSDA symbols (GCC_except_table),
+      // because global addresses or other external symbols are handled above.
+      assert(MO.getTargetFlags() == 0 &&
+             "WebAssembly does not use target flags on MCSymbol");
+      MCOp = LowerSymbolOperand(MO.getMCSymbol(), /*Offset=*/0, false, false);
+      break;
     }
 
     OutMI.addOperand(MCOp);
diff --git a/test/CodeGen/WebAssembly/eh-lsda.ll b/test/CodeGen/WebAssembly/eh-lsda.ll
new file mode 100644
index 00000000000..fd550938c42
--- /dev/null
+++ b/test/CodeGen/WebAssembly/eh-lsda.ll
@@ -0,0 +1,239 @@
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-keep-registers -exception-model=wasm -mattr=+exception-handling | FileCheck -allow-deprecated-dag-overlap %s
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+@_ZTIi = external constant i8*
+@_ZTIf = external constant i8*
+@_ZTId = external constant i8*
+
+; Single catch (...) does not need an exception table.
+;
+; try {
+;   may_throw();
+; } catch (...) {
+; }
+; CHECK-LABEL: test0:
+; CHECK-NOT: GCC_except_table
+define void @test0() personality i8* bitcast (i32 (...)* @__gxx_wasm_personality_v0 to i8*) {
+entry:
+  invoke void @may_throw()
+          to label %try.cont unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %entry
+  %0 = catchswitch within none [label %catch.start] unwind to caller
+
+catch.start:                                      ; preds = %catch.dispatch
+  %1 = catchpad within %0 [i8* null]
+  %2 = call i8* @llvm.wasm.get.exception(token %1)
+  %3 = call i32 @llvm.wasm.get.ehselector(token %1)
+  %4 = call i8* @__cxa_begin_catch(i8* %2) [ "funclet"(token %1) ]
+  call void @__cxa_end_catch() [ "funclet"(token %1) ]
+  catchret from %1 to label %try.cont
+
+try.cont:                                         ; preds = %entry, %catch.start
+  ret void
+}
+
+; Exception table generation + shared action test.
+;
+; try {
+;   may_throw();
+; } catch (int) {
+; } catch (float) {
+; } catch (double) {
+; } catch (...) {
+; }
+;
+; try {
+;   may_throw();
+; } catch (double) {
+; } catch (...) {
+; }
+;
+; try {
+;   may_throw();
+; } catch (int) {
+; } catch (float) {
+; }
+;
+; There are three landing pads. The second landing pad should share action table
+; entries with the first landing pad because they end with the same sequence
+; (double -> ...). But the third landing table cannot share action table entries
+; with others, so it should create its own entries.
+; CHECK-LABEL: test1:
+; CHECK: .section  .rodata.gcc_except_table,"",@
+; CHECK-NEXT:   .p2align  2
+; CHECK-NEXT: GCC_except_table[[START:[0-9]+]]:
+; CHECK-NEXT: .Lexception0:
+; CHECK-NEXT:   .int8  255                     # @LPStart Encoding = omit
+; CHECK-NEXT:   .int8  0                       # @TType Encoding = absptr
+; CHECK-NEXT:   .uleb128 .Lttbase0-.Lttbaseref0
+; CHECK-NEXT: .Lttbaseref0:
+; CHECK-NEXT:   .int8  1                       # Call site Encoding = uleb128
+; CHECK-NEXT:   .uleb128 .Lcst_end0-.Lcst_begin0
+; CHECK-NEXT: .Lcst_begin0:
+; CHECK-NEXT:   .int8  0                       # >> Call Site 0 <<
+; CHECK-NEXT:                                  #   On exception at call site 0
+; CHECK-NEXT:   .int8  7                       #   Action: 4
+; CHECK-NEXT:   .int8  1                       # >> Call Site 1 <<
+; CHECK-NEXT:                                  #   On exception at call site 1
+; CHECK-NEXT:   .int8  3                       #   Action: 2
+; CHECK-NEXT:   .int8  2                       # >> Call Site 2 <<
+; CHECK-NEXT:                                  #   On exception at call site 2
+; CHECK-NEXT:   .int8  11                      #   Action: 6
+; CHECK-NEXT: .Lcst_end0:
+; CHECK-NEXT:   .int8  1                       # >> Action Record 1 <<
+; CHECK-NEXT:                                  #   Catch TypeInfo 1
+; CHECK-NEXT:   .int8  0                       #   No further actions
+; CHECK-NEXT:   .int8  2                       # >> Action Record 2 <<
+; CHECK-NEXT:                                  #   Catch TypeInfo 2
+; CHECK-NEXT:   .int8  125                     #   Continue to action 1
+; CHECK-NEXT:   .int8  3                       # >> Action Record 3 <<
+; CHECK-NEXT:                                  #   Catch TypeInfo 3
+; CHECK-NEXT:   .int8  125                     #   Continue to action 2
+; CHECK-NEXT:   .int8  4                       # >> Action Record 4 <<
+; CHECK-NEXT:                                  #   Catch TypeInfo 4
+; CHECK-NEXT:   .int8  125                     #   Continue to action 3
+; CHECK-NEXT:   .int8  3                       # >> Action Record 5 <<
+; CHECK-NEXT:                                  #   Catch TypeInfo 3
+; CHECK-NEXT:   .int8  0                       #   No further actions
+; CHECK-NEXT:   .int8  4                       # >> Action Record 6 <<
+; CHECK-NEXT:                                  #   Catch TypeInfo 4
+; CHECK-NEXT:   .int8  125                     #   Continue to action 5
+; CHECK-NEXT:   .p2align  2
+; CHECK-NEXT:                                  # >> Catch TypeInfos <<
+; CHECK-NEXT:   .int32  _ZTIi                  # TypeInfo 4
+; CHECK-NEXT:   .int32  _ZTIf                  # TypeInfo 3
+; CHECK-NEXT:   .int32  _ZTId                  # TypeInfo 2
+; CHECK-NEXT:   .int32  0                      # TypeInfo 1
+; CHECK-NEXT: .Lttbase0:
+; CHECK-NEXT:   .p2align  2
+; CHECK-NEXT: .LGCC_except_table_end[[END:[0-9]+]]:
+; CHECK-NEXT:   .size  GCC_except_table[[START]], .LGCC_except_table_end[[END]]-GCC_except_table[[START]]
+define void @test1() personality i8* bitcast (i32 (...)* @__gxx_wasm_personality_v0 to i8*) {
+entry:
+  invoke void @may_throw()
+          to label %try.cont unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %entry
+  %0 = catchswitch within none [label %catch.start] unwind to caller
+
+catch.start:                                      ; preds = %catch.dispatch
+  %1 = catchpad within %0 [i8* bitcast (i8** @_ZTIi to i8*), i8* bitcast (i8** @_ZTIf to i8*), i8* bitcast (i8** @_ZTId to i8*), i8* null]
+  %2 = call i8* @llvm.wasm.get.exception(token %1)
+  %3 = call i32 @llvm.wasm.get.ehselector(token %1)
+  %4 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*))
+  %matches = icmp eq i32 %3, %4
+  br i1 %matches, label %catch10, label %catch.fallthrough
+
+catch10:                                          ; preds = %catch.start
+  %5 = call i8* @__cxa_begin_catch(i8* %2) [ "funclet"(token %1) ]
+  %6 = bitcast i8* %5 to i32*
+  %7 = load i32, i32* %6, align 4
+  call void @__cxa_end_catch() [ "funclet"(token %1) ]
+  catchret from %1 to label %try.cont
+
+try.cont:                                         ; preds = %entry, %catch, %catch4, %catch7, %catch10
+  invoke void @may_throw()
+          to label %try.cont23 unwind label %catch.dispatch14
+
+catch.dispatch14:                                 ; preds = %try.cont
+  %8 = catchswitch within none [label %catch.start15] unwind to caller
+
+catch.start15:                                    ; preds = %catch.dispatch14
+  %9 = catchpad within %8 [i8* bitcast (i8** @_ZTId to i8*), i8* null]
+  %10 = call i8* @llvm.wasm.get.exception(token %9)
+  %11 = call i32 @llvm.wasm.get.ehselector(token %9)
+  %12 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTId to i8*))
+  %matches16 = icmp eq i32 %11, %12
+  %13 = call i8* @__cxa_begin_catch(i8* %10) [ "funclet"(token %9) ]
+  br i1 %matches16, label %catch20, label %catch17
+
+catch20:                                          ; preds = %catch.start15
+  %14 = bitcast i8* %13 to double*
+  %15 = load double, double* %14, align 8
+  call void @__cxa_end_catch() [ "funclet"(token %9) ]
+  catchret from %9 to label %try.cont23
+
+try.cont23:                                       ; preds = %try.cont, %catch17, %catch20
+  invoke void @may_throw()
+          to label %try.cont36 unwind label %catch.dispatch25
+
+catch.dispatch25:                                 ; preds = %try.cont23
+  %16 = catchswitch within none [label %catch.start26] unwind to caller
+
+catch.start26:                                    ; preds = %catch.dispatch25
+  %17 = catchpad within %16 [i8* bitcast (i8** @_ZTIi to i8*), i8* bitcast (i8** @_ZTIf to i8*)]
+  %18 = call i8* @llvm.wasm.get.exception(token %17)
+  %19 = call i32 @llvm.wasm.get.ehselector(token %17)
+  %20 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*))
+  %matches27 = icmp eq i32 %19, %20
+  br i1 %matches27, label %catch33, label %catch.fallthrough28
+
+catch33:                                          ; preds = %catch.start26
+  %21 = call i8* @__cxa_begin_catch(i8* %18) [ "funclet"(token %17) ]
+  %22 = bitcast i8* %21 to i32*
+  %23 = load i32, i32* %22, align 4
+  call void @__cxa_end_catch() [ "funclet"(token %17) ]
+  catchret from %17 to label %try.cont36
+
+catch.fallthrough28:                              ; preds = %catch.start26
+  %24 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIf to i8*))
+  %matches29 = icmp eq i32 %19, %24
+  br i1 %matches29, label %catch30, label %rethrow
+
+catch30:                                          ; preds = %catch.fallthrough28
+  %25 = call i8* @__cxa_begin_catch(i8* %18) [ "funclet"(token %17) ]
+  %26 = bitcast i8* %25 to float*
+  %27 = load float, float* %26, align 4
+  call void @__cxa_end_catch() [ "funclet"(token %17) ]
+  catchret from %17 to label %try.cont36
+
+rethrow:                                          ; preds = %catch.fallthrough28
+  call void @__cxa_rethrow() [ "funclet"(token %17) ]
+  unreachable
+
+try.cont36:                                       ; preds = %try.cont23, %catch30, %catch33
+  ret void
+
+catch17:                                          ; preds = %catch.start15
+  call void @__cxa_end_catch() [ "funclet"(token %9) ]
+  catchret from %9 to label %try.cont23
+
+catch.fallthrough:                                ; preds = %catch.start
+  %28 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIf to i8*))
+  %matches1 = icmp eq i32 %3, %28
+  br i1 %matches1, label %catch7, label %catch.fallthrough2
+
+catch7:                                           ; preds = %catch.fallthrough
+  %29 = call i8* @__cxa_begin_catch(i8* %2) [ "funclet"(token %1) ]
+  %30 = bitcast i8* %29 to float*
+  %31 = load float, float* %30, align 4
+  call void @__cxa_end_catch() [ "funclet"(token %1) ]
+  catchret from %1 to label %try.cont
+
+catch.fallthrough2:                               ; preds = %catch.fallthrough
+  %32 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTId to i8*))
+  %matches3 = icmp eq i32 %3, %32
+  %33 = call i8* @__cxa_begin_catch(i8* %2) [ "funclet"(token %1) ]
+  br i1 %matches3, label %catch4, label %catch
+
+catch4:                                           ; preds = %catch.fallthrough2
+  %34 = bitcast i8* %33 to double*
+  %35 = load double, double* %34, align 8
+  call void @__cxa_end_catch() [ "funclet"(token %1) ]
+  catchret from %1 to label %try.cont
+
+catch:                                            ; preds = %catch.fallthrough2
+  call void @__cxa_end_catch() [ "funclet"(token %1) ]
+  catchret from %1 to label %try.cont
+}
+
+declare void @may_throw()
+declare i32 @llvm.eh.typeid.for(i8*)
+declare i8* @llvm.wasm.get.exception(token)
+declare i32 @llvm.wasm.get.ehselector(token)
+declare void @__cxa_rethrow()
+declare i8* @__cxa_begin_catch(i8*)
+declare void @__cxa_end_catch()
+declare i32 @__gxx_wasm_personality_v0(...)
diff --git a/test/CodeGen/WebAssembly/wasmehprepare.ll b/test/CodeGen/WebAssembly/wasmehprepare.ll
index e6005e34057..67e198eb058 100644
--- a/test/CodeGen/WebAssembly/wasmehprepare.ll
+++ b/test/CodeGen/WebAssembly/wasmehprepare.ll
@@ -30,7 +30,7 @@ catch.start:                                      ; preds = %catch.dispatch
 ; CHECK: catch.start:
 ; CHECK-NEXT:   %[[CATCHPAD:.*]] = catchpad
 ; CHECK-NEXT:   %[[EXN:.*]] = call i8* @llvm.wasm.catch(i32 0)
-; CHECK-NEXT:   call void @llvm.wasm.landingpad.index(i32 0)
+; CHECK-NEXT:   call void @llvm.wasm.landingpad.index(token %[[CATCHPAD]], i32 0)
 ; CHECK-NEXT:   store i32 0, i32* getelementptr inbounds ({ i32, i8*, i32 }, { i32, i8*, i32 }* @__wasm_lpad_context, i32 0, i32 0)
 ; CHECK-NEXT:   %[[LSDA:.*]] = call i8* @llvm.wasm.lsda()
 ; CHECK-NEXT:   store i8* %[[LSDA]], i8** getelementptr inbounds ({ i32, i8*, i32 }, { i32, i8*, i32 }* @__wasm_lpad_context, i32 0, i32 1)
@@ -98,7 +98,7 @@ catch.start3:                                     ; preds = %catch.dispatch2
   %matches = icmp eq i32 %8, %9
   br i1 %matches, label %catch4, label %rethrow
 ; CHECK: catch.start3:
-; CHECK:   call void @llvm.wasm.landingpad.index(i32 0)
+; CHECK:   call void @llvm.wasm.landingpad.index(token %{{.+}}, i32 0)
 
 catch4:                                           ; preds = %catch.start3
   %10 = call i8* @__cxa_begin_catch(i8* %7) [ "funclet"(token %6) ]
@@ -311,7 +311,7 @@ declare void @__cxa_rethrow()
 declare void @__clang_call_terminate(i8*)
 
 ; CHECK-DAG: declare i8* @llvm.wasm.catch(i32)
-; CHECK-DAG: declare void @llvm.wasm.landingpad.index(i32)
+; CHECK-DAG: declare void @llvm.wasm.landingpad.index(token, i32)
 ; CHECK-DAG: declare i8* @llvm.wasm.lsda()
 ; CHECK-DAG: declare i32 @_Unwind_CallPersonality(i8*)
 
-- 
GitLab


From 341f13c81dcaa21a6beab540e71e0bc15c526e66 Mon Sep 17 00:00:00 2001
From: Sebastian Pop <sebpop@gmail.com>
Date: Tue, 16 Oct 2018 00:42:07 +0000
Subject: [PATCH 0231/1116] [hot-cold-split] fix failing testcases

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344577 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Transforms/HotColdSplit/split-cold-2.ll       |  2 +-
 .../HotColdSplit/split-out-dbg-val-of-arg.ll       | 14 +++++++++++---
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/test/Transforms/HotColdSplit/split-cold-2.ll b/test/Transforms/HotColdSplit/split-cold-2.ll
index e243a47623a..3e1a567113a 100644
--- a/test/Transforms/HotColdSplit/split-cold-2.ll
+++ b/test/Transforms/HotColdSplit/split-cold-2.ll
@@ -13,7 +13,7 @@ entry:
   br i1 undef, label %if.then, label %if.else
 
 if.then:
-  unreachable
+  ret void
 
 if.else:
   br label %if.then4
diff --git a/test/Transforms/HotColdSplit/split-out-dbg-val-of-arg.ll b/test/Transforms/HotColdSplit/split-out-dbg-val-of-arg.ll
index 4b81de7b35b..dcaff122442 100644
--- a/test/Transforms/HotColdSplit/split-out-dbg-val-of-arg.ll
+++ b/test/Transforms/HotColdSplit/split-out-dbg-val-of-arg.ll
@@ -9,7 +9,7 @@ entry:
   br i1 undef, label %if.then, label %if.end, !dbg !12
 
 if.then:                                          ; preds = %entry
-  unreachable, !dbg !13
+  ret void, !dbg !13
 
 if.end:                                           ; preds = %entry
   call void @llvm.dbg.value(metadata i32 %arg1, metadata !9, metadata !DIExpression()), !dbg !11
@@ -19,10 +19,16 @@ if.then12:                                        ; preds = %if.end
   br label %cleanup40, !dbg !15
 
 cleanup40:                                        ; preds = %if.then12
-  br label %return, !dbg !16
+  br i1 undef, label %if.then5, label %if.end1, !dbg !16
+
+if.then5:
+  br label %return, !dbg !17
+
+if.end1:
+  br label %return, !dbg !18
 
 return:                                           ; preds = %cleanup40
-  ret void, !dbg !17
+  unreachable, !dbg !19
 }
 
 declare void @llvm.dbg.value(metadata, metadata, metadata)
@@ -49,3 +55,5 @@ declare void @llvm.dbg.value(metadata, metadata, metadata)
 !15 = !DILocation(line: 5, column: 1, scope: !6)
 !16 = !DILocation(line: 6, column: 1, scope: !6)
 !17 = !DILocation(line: 7, column: 1, scope: !6)
+!18 = !DILocation(line: 8, column: 1, scope: !6)
+!19 = !DILocation(line: 9, column: 1, scope: !6)
-- 
GitLab


From f2cb5da6a45f63427c0d1e6a3f0deca57c44429e Mon Sep 17 00:00:00 2001
From: Max Kazantsev <max.kazantsev@azul.com>
Date: Tue, 16 Oct 2018 05:26:21 +0000
Subject: [PATCH 0232/1116] [SCEV] Limit AddRec "simplifications" to avoid
 combinatorial explosions

SCEV's transform that turns `{A1,+,A2,+,...,+,An}<L> * {B1,+,B2,+,...,+,Bn}<L>` into
a single AddRec of size `2n+1` with complex combinatorial coefficients can easily
trigger exponential growth of the SCEV (in case if nothing gets folded and simplified).
We tried to restrain this transform using the option `scalar-evolution-max-add-rec-size`,
but its default value seems to be insufficiently small: the test attached to this patch
with default value of this option `16` has a SCEV of >3M symbols (when printed out).

This patch reduces the simplification limit. It is not a cure to combinatorial
explosions, but at least it reduces this corner case to something more or less
reasonable.

Differential Revision: https://reviews.llvm.org/D53282
Reviewed By: sanjoy


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344584 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Analysis/ScalarEvolution.cpp              |  2 +-
 .../ScalarEvolution/binomial-explision.ll     | 47 +++++++++++++++++++
 2 files changed, 48 insertions(+), 1 deletion(-)
 create mode 100644 test/Analysis/ScalarEvolution/binomial-explision.ll

diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index 4a30447f647..60cd1cb4127 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -204,7 +204,7 @@ static cl::opt<unsigned>
 static cl::opt<unsigned>
     MaxAddRecSize("scalar-evolution-max-add-rec-size", cl::Hidden,
                   cl::desc("Max coefficients in AddRec during evolving"),
-                  cl::init(16));
+                  cl::init(8));
 
 //===----------------------------------------------------------------------===//
 //                           SCEV class definitions
diff --git a/test/Analysis/ScalarEvolution/binomial-explision.ll b/test/Analysis/ScalarEvolution/binomial-explision.ll
new file mode 100644
index 00000000000..82d0beda6b5
--- /dev/null
+++ b/test/Analysis/ScalarEvolution/binomial-explision.ll
@@ -0,0 +1,47 @@
+; RUN: opt -analyze -scalar-evolution < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+
+; Check that we don't have unreasonably huge SCEVs and in particular only a
+; reasonable amount of AddRecs in the notation of %tmp19. If we "simplify" SCEVs
+; too aggressively, we may end up with huge nested expressions.
+define void @test(i32 %x, i64 %y, i1 %cond) {
+
+; CHECK: %tmp19 = mul i32 %tmp17, %tmp18
+; CHECK: ((((
+; CHECK-NOT: (((((
+; CHECK: %tmp20 = add i32 %tmp19, %x
+
+bb:
+  br label %bb1
+
+bb1:                                              ; preds = %bb3, %bb
+  %tmp = phi i64 [ %y, %bb ], [ %tmp22, %bb3 ]
+  %tmp2 = phi i32 [ %x, %bb ], [ %tmp4, %bb3 ]
+  br label %bb5
+
+bb3:                                              ; preds = %bb5
+  %tmp4 = add i32 %tmp2, %x
+  br label %bb1
+
+bb5:                                              ; preds = %bb5, %bb1
+  %tmp6 = phi i32 [ %tmp23, %bb5 ], [ %tmp2, %bb1 ]
+  %tmp7 = sub i32 -119, %tmp6
+  %tmp8 = mul i32 %tmp7, %x
+  %tmp9 = sub i32 -120, %tmp6
+  %tmp10 = mul i32 %tmp8, %tmp9
+  %tmp11 = mul i32 %x, %tmp10
+  %tmp12 = sub i32 -121, %tmp6
+  %tmp13 = mul i32 %tmp10, %tmp12
+  %tmp14 = mul i32 %tmp11, %tmp13
+  %tmp15 = sub i32 -122, %tmp6
+  %tmp16 = mul i32 %tmp13, %tmp15
+  %tmp17 = mul i32 %tmp14, %tmp16
+  %tmp18 = mul i32 %tmp16, %x
+  %tmp19 = mul i32 %tmp17, %tmp18
+  %tmp20 = add i32 %tmp19, %x
+  %tmp21 = sext i32 %tmp20 to i64
+  %tmp22 = add i64 %y, %tmp21
+  %tmp23 = add i32 %tmp6, 7
+  br i1 %cond, label %bb5, label %bb3
+}
-- 
GitLab


From d043791034148b50b5479124613a622f2b17cb7e Mon Sep 17 00:00:00 2001
From: Alexander Shaposhnikov <shal1t712@gmail.com>
Date: Tue, 16 Oct 2018 05:40:18 +0000
Subject: [PATCH 0233/1116] [llvm-objcopy] Factor out Buffer

In this diff we move out the hierarchy of buffers from Object.h/Object.cpp
into separate files since it is not ELF-specific and will be reused later.
After this change Object.h/Object.cpp are almost exclusively ELF-specific.

Test plan: make check-all

Differential revision: https://reviews.llvm.org/D53298


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344585 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-objcopy/Buffer.cpp       | 51 ++++++++++++++++++++++
 tools/llvm-objcopy/Buffer.h         | 66 +++++++++++++++++++++++++++++
 tools/llvm-objcopy/CMakeLists.txt   |  1 +
 tools/llvm-objcopy/Object.cpp       | 31 --------------
 tools/llvm-objcopy/Object.h         | 44 +------------------
 tools/llvm-objcopy/llvm-objcopy.cpp |  1 +
 6 files changed, 120 insertions(+), 74 deletions(-)
 create mode 100644 tools/llvm-objcopy/Buffer.cpp
 create mode 100644 tools/llvm-objcopy/Buffer.h

diff --git a/tools/llvm-objcopy/Buffer.cpp b/tools/llvm-objcopy/Buffer.cpp
new file mode 100644
index 00000000000..8044b023aaa
--- /dev/null
+++ b/tools/llvm-objcopy/Buffer.cpp
@@ -0,0 +1,51 @@
+//===- Buffer.cpp ---------------------------------------------------------===//
+//
+//                      The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Buffer.h"
+#include "llvm-objcopy.h"
+#include "llvm/Support/FileOutputBuffer.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include <memory>
+
+namespace llvm {
+namespace objcopy {
+
+Buffer::~Buffer() {}
+
+void FileBuffer::allocate(size_t Size) {
+  Expected<std::unique_ptr<FileOutputBuffer>> BufferOrErr =
+      FileOutputBuffer::create(getName(), Size, FileOutputBuffer::F_executable);
+  handleAllErrors(BufferOrErr.takeError(), [this](const ErrorInfoBase &E) {
+    error("failed to open " + getName() + ": " + E.message());
+  });
+  Buf = std::move(*BufferOrErr);
+}
+
+Error FileBuffer::commit() { return Buf->commit(); }
+
+uint8_t *FileBuffer::getBufferStart() {
+  return reinterpret_cast<uint8_t *>(Buf->getBufferStart());
+}
+
+void MemBuffer::allocate(size_t Size) {
+  Buf = WritableMemoryBuffer::getNewMemBuffer(Size, getName());
+}
+
+Error MemBuffer::commit() { return Error::success(); }
+
+uint8_t *MemBuffer::getBufferStart() {
+  return reinterpret_cast<uint8_t *>(Buf->getBufferStart());
+}
+
+std::unique_ptr<WritableMemoryBuffer> MemBuffer::releaseMemoryBuffer() {
+  return std::move(Buf);
+}
+
+} // end namespace objcopy
+} // end namespace llvm
diff --git a/tools/llvm-objcopy/Buffer.h b/tools/llvm-objcopy/Buffer.h
new file mode 100644
index 00000000000..e5b9c5b2d22
--- /dev/null
+++ b/tools/llvm-objcopy/Buffer.h
@@ -0,0 +1,66 @@
+//===- Buffer.h -------------------------------------------------*- C++ -*-===//
+//
+//                      The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_OBJCOPY_BUFFER_H
+#define LLVM_TOOLS_OBJCOPY_BUFFER_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/FileOutputBuffer.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include <memory>
+
+namespace llvm {
+namespace objcopy {
+
+// The class Buffer abstracts out the common interface of FileOutputBuffer and
+// WritableMemoryBuffer so that the hierarchy of Writers depends on this
+// abstract interface and doesn't depend on a particular implementation.
+// TODO: refactor the buffer classes in LLVM to enable us to use them here
+// directly.
+class Buffer {
+  StringRef Name;
+
+public:
+  virtual ~Buffer();
+  virtual void allocate(size_t Size) = 0;
+  virtual uint8_t *getBufferStart() = 0;
+  virtual Error commit() = 0;
+
+  explicit Buffer(StringRef Name) : Name(Name) {}
+  StringRef getName() const { return Name; }
+};
+
+class FileBuffer : public Buffer {
+  std::unique_ptr<FileOutputBuffer> Buf;
+
+public:
+  void allocate(size_t Size) override;
+  uint8_t *getBufferStart() override;
+  Error commit() override;
+
+  explicit FileBuffer(StringRef FileName) : Buffer(FileName) {}
+};
+
+class MemBuffer : public Buffer {
+  std::unique_ptr<WritableMemoryBuffer> Buf;
+
+public:
+  void allocate(size_t Size) override;
+  uint8_t *getBufferStart() override;
+  Error commit() override;
+
+  explicit MemBuffer(StringRef Name) : Buffer(Name) {}
+
+  std::unique_ptr<WritableMemoryBuffer> releaseMemoryBuffer();
+};
+
+} // end namespace objcopy
+} // end namespace llvm
+
+#endif // LLVM_TOOLS_OBJCOPY_BUFFER_H
diff --git a/tools/llvm-objcopy/CMakeLists.txt b/tools/llvm-objcopy/CMakeLists.txt
index 8d963e56758..9ac7d0eb4c2 100644
--- a/tools/llvm-objcopy/CMakeLists.txt
+++ b/tools/llvm-objcopy/CMakeLists.txt
@@ -14,6 +14,7 @@ tablegen(LLVM StripOpts.inc -gen-opt-parser-defs)
 add_public_tablegen_target(StripOptsTableGen)
 
 add_llvm_tool(llvm-objcopy
+  Buffer.cpp
   CopyConfig.cpp
   llvm-objcopy.cpp
   Object.cpp
diff --git a/tools/llvm-objcopy/Object.cpp b/tools/llvm-objcopy/Object.cpp
index ddf811a769b..d677579ea23 100644
--- a/tools/llvm-objcopy/Object.cpp
+++ b/tools/llvm-objcopy/Object.cpp
@@ -33,37 +33,6 @@ using namespace llvm::objcopy;
 using namespace object;
 using namespace ELF;
 
-Buffer::~Buffer() {}
-
-void FileBuffer::allocate(size_t Size) {
-  Expected<std::unique_ptr<FileOutputBuffer>> BufferOrErr =
-      FileOutputBuffer::create(getName(), Size, FileOutputBuffer::F_executable);
-  handleAllErrors(BufferOrErr.takeError(), [this](const ErrorInfoBase &E) {
-    error("failed to open " + getName() + ": " + E.message());
-  });
-  Buf = std::move(*BufferOrErr);
-}
-
-Error FileBuffer::commit() { return Buf->commit(); }
-
-uint8_t *FileBuffer::getBufferStart() {
-  return reinterpret_cast<uint8_t *>(Buf->getBufferStart());
-}
-
-void MemBuffer::allocate(size_t Size) {
-  Buf = WritableMemoryBuffer::getNewMemBuffer(Size, getName());
-}
-
-Error MemBuffer::commit() { return Error::success(); }
-
-uint8_t *MemBuffer::getBufferStart() {
-  return reinterpret_cast<uint8_t *>(Buf->getBufferStart());
-}
-
-std::unique_ptr<WritableMemoryBuffer> MemBuffer::releaseMemoryBuffer() {
-  return std::move(Buf);
-}
-
 template <class ELFT> void ELFWriter<ELFT>::writePhdr(const Segment &Seg) {
   uint8_t *B = Buf.getBufferStart();
   B += Obj.ProgramHdrSegment.Offset + Seg.Index * sizeof(Elf_Phdr);
diff --git a/tools/llvm-objcopy/Object.h b/tools/llvm-objcopy/Object.h
index 46c8f1ca4bf..1019391fa89 100644
--- a/tools/llvm-objcopy/Object.h
+++ b/tools/llvm-objcopy/Object.h
@@ -10,6 +10,7 @@
 #ifndef LLVM_TOOLS_OBJCOPY_OBJECT_H
 #define LLVM_TOOLS_OBJCOPY_OBJECT_H
 
+#include "Buffer.h"
 #include "CopyConfig.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
@@ -30,7 +31,6 @@ namespace llvm {
 enum class DebugCompressionType;
 namespace objcopy {
 
-class Buffer;
 class SectionBase;
 class Section;
 class OwnedDataSection;
@@ -146,48 +146,6 @@ public:
   explicit BinarySectionWriter(Buffer &Buf) : SectionWriter(Buf) {}
 };
 
-// The class Buffer abstracts out the common interface of FileOutputBuffer and
-// WritableMemoryBuffer so that the hierarchy of Writers depends on this
-// abstract interface and doesn't depend on a particular implementation.
-// TODO: refactor the buffer classes in LLVM to enable us to use them here
-// directly.
-class Buffer {
-  StringRef Name;
-
-public:
-  virtual ~Buffer();
-  virtual void allocate(size_t Size) = 0;
-  virtual uint8_t *getBufferStart() = 0;
-  virtual Error commit() = 0;
-
-  explicit Buffer(StringRef Name) : Name(Name) {}
-  StringRef getName() const { return Name; }
-};
-
-class FileBuffer : public Buffer {
-  std::unique_ptr<FileOutputBuffer> Buf;
-
-public:
-  void allocate(size_t Size) override;
-  uint8_t *getBufferStart() override;
-  Error commit() override;
-
-  explicit FileBuffer(StringRef FileName) : Buffer(FileName) {}
-};
-
-class MemBuffer : public Buffer {
-  std::unique_ptr<WritableMemoryBuffer> Buf;
-
-public:
-  void allocate(size_t Size) override;
-  uint8_t *getBufferStart() override;
-  Error commit() override;
-
-  explicit MemBuffer(StringRef Name) : Buffer(Name) {}
-
-  std::unique_ptr<WritableMemoryBuffer> releaseMemoryBuffer();
-};
-
 class Writer {
 protected:
   Object &Obj;
diff --git a/tools/llvm-objcopy/llvm-objcopy.cpp b/tools/llvm-objcopy/llvm-objcopy.cpp
index c9b170d1d61..b7dbf6c66b3 100644
--- a/tools/llvm-objcopy/llvm-objcopy.cpp
+++ b/tools/llvm-objcopy/llvm-objcopy.cpp
@@ -8,6 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm-objcopy.h"
+#include "Buffer.h"
 #include "CopyConfig.h"
 #include "Object.h"
 
-- 
GitLab


From 600d43cad2724c404c52e8401ef934f1e1f90e41 Mon Sep 17 00:00:00 2001
From: Max Kazantsev <max.kazantsev@azul.com>
Date: Tue, 16 Oct 2018 06:34:53 +0000
Subject: [PATCH 0234/1116] [NFC] Turn isGuaranteedToExecute into a method

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344587 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/MustExecute.h    | 11 +++++------
 lib/Analysis/MustExecute.cpp           | 16 ++++++++--------
 lib/Transforms/Scalar/LICM.cpp         |  6 +++---
 lib/Transforms/Scalar/LoopUnswitch.cpp |  2 +-
 4 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/include/llvm/Analysis/MustExecute.h b/include/llvm/Analysis/MustExecute.h
index 40a02735d1b..82387476a6d 100644
--- a/include/llvm/Analysis/MustExecute.h
+++ b/include/llvm/Analysis/MustExecute.h
@@ -82,15 +82,14 @@ public:
   /// LoopSafetyInfo.  Some callers rely on this fact.
   void computeLoopSafetyInfo(Loop *);
 
+  /// Returns true if the instruction in a loop is guaranteed to execute at
+  /// least once (under the assumption that the loop is entered).
+  bool isGuaranteedToExecute(const Instruction &Inst, const DominatorTree *DT,
+                             const Loop *CurLoop) const;
+
   LoopSafetyInfo() = default;
 };
 
-/// Returns true if the instruction in a loop is guaranteed to execute at least
-/// once (under the assumption that the loop is entered).
-bool isGuaranteedToExecute(const Instruction &Inst, const DominatorTree *DT,
-                           const Loop *CurLoop,
-                           const LoopSafetyInfo *SafetyInfo);
-
 }
 
 #endif
diff --git a/lib/Analysis/MustExecute.cpp b/lib/Analysis/MustExecute.cpp
index 79ec8e400c0..7c1ce86d15b 100644
--- a/lib/Analysis/MustExecute.cpp
+++ b/lib/Analysis/MustExecute.cpp
@@ -176,9 +176,9 @@ bool LoopSafetyInfo::allLoopPathsLeadToBlock(const Loop *CurLoop,
 
 /// Returns true if the instruction in a loop is guaranteed to execute at least
 /// once.
-bool llvm::isGuaranteedToExecute(const Instruction &Inst,
-                                 const DominatorTree *DT, const Loop *CurLoop,
-                                 const LoopSafetyInfo *SafetyInfo) {
+bool LoopSafetyInfo::isGuaranteedToExecute(const Instruction &Inst,
+                                           const DominatorTree *DT,
+                                           const Loop *CurLoop) const {
   // We have to check to make sure that the instruction dominates all
   // of the exit blocks.  If it doesn't, then there is a path out of the loop
   // which does not execute this instruction, so we can't hoist it.
@@ -191,17 +191,17 @@ bool llvm::isGuaranteedToExecute(const Instruction &Inst,
     // Inst unless we can prove that Inst comes before the potential implicit
     // exit.  At the moment, we use a (cheap) hack for the common case where
     // the instruction of interest is the first one in the block.
-    return !SafetyInfo->headerMayThrow() ||
-      Inst.getParent()->getFirstNonPHIOrDbg() == &Inst;
+    return !headerMayThrow() ||
+           Inst.getParent()->getFirstNonPHIOrDbg() == &Inst;
 
   // Somewhere in this loop there is an instruction which may throw and make us
   // exit the loop.
-  if (SafetyInfo->anyBlockMayThrow())
+  if (anyBlockMayThrow())
     return false;
 
   // If there is a path from header to exit or latch that doesn't lead to our
   // instruction's block, return false.
-  if (!SafetyInfo->allLoopPathsLeadToBlock(CurLoop, Inst.getParent(), DT))
+  if (!allLoopPathsLeadToBlock(CurLoop, Inst.getParent(), DT))
     return false;
 
   return true;
@@ -242,7 +242,7 @@ static bool isMustExecuteIn(const Instruction &I, Loop *L, DominatorTree *DT) {
   // caller actually gets the full power at the moment.
   LoopSafetyInfo LSI;
   LSI.computeLoopSafetyInfo(L);
-  return isGuaranteedToExecute(I, DT, L, &LSI) ||
+  return LSI.isGuaranteedToExecute(I, DT, L) ||
     isGuaranteedToExecuteForEveryIteration(&I, L);
 }
 
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index 601d49fc03f..9bf75a4ffbf 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -1116,7 +1116,7 @@ static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
       // The check on hasMetadataOtherThanDebugLoc is to prevent us from burning
       // time in isGuaranteedToExecute if we don't actually have anything to
       // drop.  It is a compile time optimization, not required for correctness.
-      !isGuaranteedToExecute(I, DT, CurLoop, SafetyInfo))
+      !SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop))
     I.dropUnknownNonDebugMetadata();
 
   // Move the new node to the Preheader, before its terminator.
@@ -1150,7 +1150,7 @@ static bool isSafeToExecuteUnconditionally(Instruction &Inst,
     return true;
 
   bool GuaranteedToExecute =
-      isGuaranteedToExecute(Inst, DT, CurLoop, SafetyInfo);
+      SafetyInfo->isGuaranteedToExecute(Inst, DT, CurLoop);
 
   if (!GuaranteedToExecute) {
     auto *LI = dyn_cast<LoadInst>(&Inst);
@@ -1408,7 +1408,7 @@ bool llvm::promoteLoopAccessesToScalars(
 
         if (!DereferenceableInPH || !SafeToInsertStore ||
             (InstAlignment > Alignment)) {
-          if (isGuaranteedToExecute(*UI, DT, CurLoop, SafetyInfo)) {
+          if (SafetyInfo->isGuaranteedToExecute(*UI, DT, CurLoop)) {
             DereferenceableInPH = true;
             SafeToInsertStore = true;
             Alignment = std::max(Alignment, InstAlignment);
diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp
index 13e6bd13754..cd49f51283f 100644
--- a/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -721,7 +721,7 @@ bool LoopUnswitch::processCurrentLoop() {
     // This is a workaround for the discrepancy between LLVM IR and MSan
     // semantics. See PR28054 for more details.
     if (SanitizeMemory &&
-        !isGuaranteedToExecute(*TI, DT, currentLoop, &SafetyInfo))
+        !SafetyInfo.isGuaranteedToExecute(*TI, DT, currentLoop))
       continue;
 
     if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
-- 
GitLab


From 2173a4b23e50db6a3df8ddc7c17bf73cd39a1828 Mon Sep 17 00:00:00 2001
From: Max Kazantsev <max.kazantsev@azul.com>
Date: Tue, 16 Oct 2018 07:50:14 +0000
Subject: [PATCH 0235/1116] [NFC] Move block throw check inside
 allLoopPathsLeadToBlock

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344588 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/MustExecute.h |  7 +++++--
 lib/Analysis/MustExecute.cpp        | 16 ++++++++++------
 2 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/include/llvm/Analysis/MustExecute.h b/include/llvm/Analysis/MustExecute.h
index 82387476a6d..e643e4ec563 100644
--- a/include/llvm/Analysis/MustExecute.h
+++ b/include/llvm/Analysis/MustExecute.h
@@ -65,13 +65,16 @@ public:
   /// abnormally.
   bool headerMayThrow() const;
 
+  /// Returns true iff the block \p BB potentially may throw exception. It can
+  /// be false-positive in cases when we want to avoid complex analysis.
+  bool blockMayThrow(const BasicBlock *BB) const;
+
   /// Returns true iff any block of the loop for which this info is contains an
   /// instruction that may throw or otherwise exit abnormally.
   bool anyBlockMayThrow() const;
 
   /// Return true if we must reach the block \p BB under assumption that the
-  /// loop \p CurLoop is entered and no instruction throws or otherwise exits
-  /// abnormally.
+  /// loop \p CurLoop is entered.
   bool allLoopPathsLeadToBlock(const Loop *CurLoop, const BasicBlock *BB,
                                const DominatorTree *DT) const;
 
diff --git a/lib/Analysis/MustExecute.cpp b/lib/Analysis/MustExecute.cpp
index 7c1ce86d15b..7f0912de26b 100644
--- a/lib/Analysis/MustExecute.cpp
+++ b/lib/Analysis/MustExecute.cpp
@@ -26,6 +26,11 @@ bool LoopSafetyInfo::headerMayThrow() const {
   return HeaderMayThrow;
 }
 
+bool LoopSafetyInfo::blockMayThrow(const BasicBlock *BB) const {
+  (void)BB;
+  return anyBlockMayThrow();
+}
+
 bool LoopSafetyInfo::anyBlockMayThrow() const {
   return MayThrow;
 }
@@ -148,7 +153,10 @@ bool LoopSafetyInfo::allLoopPathsLeadToBlock(const Loop *CurLoop,
   // 3) Exit blocks which are not taken on 1st iteration.
   // Memoize blocks we've already checked.
   SmallPtrSet<const BasicBlock *, 4> CheckedSuccessors;
-  for (auto *Pred : Predecessors)
+  for (auto *Pred : Predecessors) {
+    // Predecessor block may throw, so it has a side exit.
+    if (blockMayThrow(Pred))
+      return false;
     for (auto *Succ : successors(Pred))
       if (CheckedSuccessors.insert(Succ).second &&
           Succ != BB && !Predecessors.count(Succ))
@@ -169,6 +177,7 @@ bool LoopSafetyInfo::allLoopPathsLeadToBlock(const Loop *CurLoop,
         if (CurLoop->contains(Succ) ||
             !CanProveNotTakenFirstIteration(Succ, DT, CurLoop))
           return false;
+  }
 
   // All predecessors can only lead us to BB.
   return true;
@@ -194,11 +203,6 @@ bool LoopSafetyInfo::isGuaranteedToExecute(const Instruction &Inst,
     return !headerMayThrow() ||
            Inst.getParent()->getFirstNonPHIOrDbg() == &Inst;
 
-  // Somewhere in this loop there is an instruction which may throw and make us
-  // exit the loop.
-  if (anyBlockMayThrow())
-    return false;
-
   // If there is a path from header to exit or latch that doesn't lead to our
   // instruction's block, return false.
   if (!allLoopPathsLeadToBlock(CurLoop, Inst.getParent(), DT))
-- 
GitLab


From 72b430c741dde011dbb9e5445f4cfffa807b3f2a Mon Sep 17 00:00:00 2001
From: David Stenberg <david.stenberg@ericsson.com>
Date: Tue, 16 Oct 2018 08:06:48 +0000
Subject: [PATCH 0236/1116] [DebugInfo][LCSSA] Rewrite pre-existing debug
 values outside loop

Summary:
Extend LCSSA so that debug values outside loops are rewritten to
use the PHI nodes that the pass creates.

This fixes PR39019. In that case, we ran LCSSA on a loop that
was later on vectorized, which left us with something like this:

  for.cond.cleanup:
    %add.lcssa = phi i32 [ %add, %for.body ], [ %34, %middle.block ]
    call void @llvm.dbg.value(metadata i32 %add,
    ret i32 %add.lcssa

  for.body:
    %add =
    [...]
    br i1 %exitcond, label %for.cond.cleanup, label %for.body

which later resulted in the debug.value becoming undef when
removing the scalar loop (and the location would have probably
been wrong for the vectorized case otherwise).

As we now may need to query the AvailableVals cache more than
once for a basic block, FindAvailableVals() in SSAUpdaterImpl is
changed so that it updates the cache for blocks that we do not
create a PHI node for, regardless of the block's number of
predecessors. The debug value in the attached IR reproducer
would not be properly rewritten without this.

Debug values residing in blocks where we have not inserted any
PHI nodes are currently left as-is by this patch. I'm not sure
what should be done with those uses.

Reviewers: mattd, aprantl, vsk, probinson

Reviewed By: mattd, aprantl

Subscribers: jmorse, gbedwell, JDevlieghere, llvm-commits

Differential Revision: https://reviews.llvm.org/D53130

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344589 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Transforms/Utils/SSAUpdater.h    |  4 ++
 .../llvm/Transforms/Utils/SSAUpdaterImpl.h    |  7 +-
 lib/Transforms/Utils/LCSSA.cpp                | 16 +++++
 lib/Transforms/Utils/SSAUpdater.cpp           |  5 ++
 .../LCSSA/rewrite-existing-dbg-values.ll      | 69 +++++++++++++++++++
 5 files changed, 97 insertions(+), 4 deletions(-)
 create mode 100644 test/Transforms/LCSSA/rewrite-existing-dbg-values.ll

diff --git a/include/llvm/Transforms/Utils/SSAUpdater.h b/include/llvm/Transforms/Utils/SSAUpdater.h
index 4a791166299..d02607acbbb 100644
--- a/include/llvm/Transforms/Utils/SSAUpdater.h
+++ b/include/llvm/Transforms/Utils/SSAUpdater.h
@@ -76,6 +76,10 @@ public:
   /// block.
   bool HasValueForBlock(BasicBlock *BB) const;
 
+  /// Return the value for the specified block if the SSAUpdater has one,
+  /// otherwise return nullptr.
+  Value *FindValueForBlock(BasicBlock *BB) const;
+
   /// Construct SSA form, materializing a value that is live at the end
   /// of the specified block.
   Value *GetValueAtEndOfBlock(BasicBlock *BB);
diff --git a/include/llvm/Transforms/Utils/SSAUpdaterImpl.h b/include/llvm/Transforms/Utils/SSAUpdaterImpl.h
index b7649ba8833..cab0f3e7157 100644
--- a/include/llvm/Transforms/Utils/SSAUpdaterImpl.h
+++ b/include/llvm/Transforms/Utils/SSAUpdaterImpl.h
@@ -357,10 +357,9 @@ public:
       BBInfo *Info = *I;
 
       if (Info->DefBB != Info) {
-        // Record the available value at join nodes to speed up subsequent
-        // uses of this SSAUpdater for the same value.
-        if (Info->NumPreds > 1)
-          (*AvailableVals)[Info->BB] = Info->DefBB->AvailableVal;
+        // Record the available value to speed up subsequent uses of this
+        // SSAUpdater for the same value.
+        (*AvailableVals)[Info->BB] = Info->DefBB->AvailableVal;
         continue;
       }
 
diff --git a/lib/Transforms/Utils/LCSSA.cpp b/lib/Transforms/Utils/LCSSA.cpp
index a1f8e7484bc..53d444b309d 100644
--- a/lib/Transforms/Utils/LCSSA.cpp
+++ b/lib/Transforms/Utils/LCSSA.cpp
@@ -41,6 +41,7 @@
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/PredIteratorCache.h"
 #include "llvm/Pass.h"
 #include "llvm/Transforms/Utils.h"
@@ -201,6 +202,21 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
       SSAUpdate.RewriteUse(*UseToRewrite);
     }
 
+    SmallVector<DbgValueInst *, 4> DbgValues;
+    llvm::findDbgValues(DbgValues, I);
+
+    // Update pre-existing debug value uses that reside outside the loop.
+    auto &Ctx = I->getContext();
+    for (auto DVI : DbgValues) {
+      BasicBlock *UserBB = DVI->getParent();
+      if (InstBB == UserBB || L->contains(UserBB))
+        continue;
+      // We currently only handle debug values residing in blocks where we have
+      // inserted a PHI instruction.
+      if (Value *V = SSAUpdate.FindValueForBlock(UserBB))
+        DVI->setOperand(0, MetadataAsValue::get(Ctx, ValueAsMetadata::get(V)));
+    }
+
     // SSAUpdater might have inserted phi-nodes inside other loops. We'll need
     // to post-process them to keep LCSSA form.
     for (PHINode *InsertedPN : InsertedPHIs) {
diff --git a/lib/Transforms/Utils/SSAUpdater.cpp b/lib/Transforms/Utils/SSAUpdater.cpp
index 4a1fd8d571a..9e5fb0e7172 100644
--- a/lib/Transforms/Utils/SSAUpdater.cpp
+++ b/lib/Transforms/Utils/SSAUpdater.cpp
@@ -64,6 +64,11 @@ bool SSAUpdater::HasValueForBlock(BasicBlock *BB) const {
   return getAvailableVals(AV).count(BB);
 }
 
+Value *SSAUpdater::FindValueForBlock(BasicBlock *BB) const {
+  AvailableValsTy::iterator AVI = getAvailableVals(AV).find(BB);
+  return (AVI != getAvailableVals(AV).end()) ? AVI->second : nullptr;
+}
+
 void SSAUpdater::AddAvailableValue(BasicBlock *BB, Value *V) {
   assert(ProtoType && "Need to initialize SSAUpdater");
   assert(ProtoType == V->getType() &&
diff --git a/test/Transforms/LCSSA/rewrite-existing-dbg-values.ll b/test/Transforms/LCSSA/rewrite-existing-dbg-values.ll
new file mode 100644
index 00000000000..563a75f407f
--- /dev/null
+++ b/test/Transforms/LCSSA/rewrite-existing-dbg-values.ll
@@ -0,0 +1,69 @@
+; RUN: opt -S -lcssa < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Reproducer for PR39019.
+;
+; Verify that the llvm.dbg.value in the %for.cond.cleanup2 block is rewritten
+; to use the PHI node for %add that is created by LCSSA.
+
+; CHECK-LABEL: for.cond.cleanup2:
+; CHECK-NEXT: [[PN:%[^ ]*]] = phi i32 [ %add.lcssa, %for.cond.cleanup1 ]
+; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 [[PN]], metadata [[VAR:![0-9]+]], metadata !DIExpression())
+; CHECK-NEXT: call void @bar(i32 [[PN]])
+
+; CHECK-LABEL: for.body:
+; CHECK: %add = add nsw i32 0, 2
+; CHECK: call void @llvm.dbg.value(metadata i32 %add, metadata [[VAR]], metadata !DIExpression())
+
+; CHECK: [[VAR]] = !DILocalVariable(name: "sum",
+
+; Function Attrs: nounwind
+define void @foo() #0 !dbg !6 {
+entry:
+  br label %for.cond.preheader, !dbg !12
+
+for.cond.preheader:                               ; preds = %for.cond.cleanup1, %entry
+  br label %for.body, !dbg !12
+
+for.cond.cleanup2:                                ; preds = %for.cond.cleanup1
+  call void @llvm.dbg.value(metadata i32 %add, metadata !9, metadata !DIExpression()), !dbg !12
+  tail call void @bar(i32 %add) #0, !dbg !12
+  ret void, !dbg !12
+
+for.cond.cleanup1:                                ; preds = %for.body
+  br i1 false, label %for.cond.preheader, label %for.cond.cleanup2, !dbg !12
+
+for.body:                                         ; preds = %for.body, %for.cond.preheader
+  %add = add nsw i32 0, 2, !dbg !12
+  call void @llvm.dbg.value(metadata i32 %add, metadata !9, metadata !DIExpression()), !dbg !12
+  br i1 false, label %for.body, label %for.cond.cleanup1, !dbg !12
+}
+
+; Function Attrs: nounwind
+declare void @bar(i32) #0
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.value(metadata, metadata, metadata) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone speculatable }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 8.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, globals: !2, nameTableKind: None)
+!1 = !DIFile(filename: "foo.c", directory: "/")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{!"clang version 8.0.0"}
+!6 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 10, type: !7, isLocal: false, isDefinition: true, scopeLine: 10, isOptimized: true, unit: !0, retainedNodes: !8)
+!7 = !DISubroutineType(types: !2)
+!8 = !{!9}
+!9 = !DILocalVariable(name: "sum", scope: !10, file: !1, line: 11, type: !11)
+!10 = !DILexicalBlockFile(scope: !6, file: !1, discriminator: 0)
+!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!12 = !DILocation(line: 0, scope: !10)
-- 
GitLab


From d871042d6011d26a98608ce19cd90a2dc6794301 Mon Sep 17 00:00:00 2001
From: Max Kazantsev <max.kazantsev@azul.com>
Date: Tue, 16 Oct 2018 08:07:14 +0000
Subject: [PATCH 0237/1116] [NFC] Encapsulate work with BlockColors in
 LoopSafetyInfo

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344590 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/MustExecute.h | 16 +++++++++++++---
 lib/Analysis/MustExecute.cpp        | 17 ++++++++++++++++-
 lib/Transforms/Scalar/LICM.cpp      | 15 ++++++---------
 3 files changed, 35 insertions(+), 13 deletions(-)

diff --git a/include/llvm/Analysis/MustExecute.h b/include/llvm/Analysis/MustExecute.h
index e643e4ec563..d78b38bdead 100644
--- a/include/llvm/Analysis/MustExecute.h
+++ b/include/llvm/Analysis/MustExecute.h
@@ -49,6 +49,9 @@ class LoopSafetyInfo {
                                // may throw.
   bool HeaderMayThrow = false; // Same as previous, but specific to loop header
 
+  // Used to update funclet bundle operands.
+  DenseMap<BasicBlock *, ColorVector> BlockColors;
+
   /// Collect all blocks from \p CurLoop which lie on all possible paths from
   /// the header of \p CurLoop (inclusive) to BB (exclusive) into the set
   /// \p Predecessors. If \p BB is the header, \p Predecessors will be empty.
@@ -56,9 +59,16 @@ class LoopSafetyInfo {
       const Loop *CurLoop, const BasicBlock *BB,
       SmallPtrSetImpl<const BasicBlock *> &Predecessors) const;
 
+protected:
+  /// Computes block colors.
+  void computeBlockColors(const Loop *CurLoop);
+
 public:
-  // Used to update funclet bundle operands.
-  DenseMap<BasicBlock *, ColorVector> BlockColors;
+  /// Returns block colors map that is used to update funclet operand bundles.
+  const DenseMap<BasicBlock *, ColorVector> &getBlockColors() const;
+
+  /// Copy colors of block \p Old into the block \p New.
+  void copyColors(BasicBlock *New, BasicBlock *Old);
 
   /// Returns true iff the header block of the loop for which this info is
   /// calculated contains an instruction that may throw or otherwise exit
@@ -83,7 +93,7 @@ public:
   /// as argument. Updates safety information in LoopSafetyInfo argument.
   /// Note: This is defined to clear and reinitialize an already initialized
   /// LoopSafetyInfo.  Some callers rely on this fact.
-  void computeLoopSafetyInfo(Loop *);
+  void computeLoopSafetyInfo(const Loop *CurLoop);
 
   /// Returns true if the instruction in a loop is guaranteed to execute at
   /// least once (under the assumption that the loop is entered).
diff --git a/lib/Analysis/MustExecute.cpp b/lib/Analysis/MustExecute.cpp
index 7f0912de26b..bce941be26c 100644
--- a/lib/Analysis/MustExecute.cpp
+++ b/lib/Analysis/MustExecute.cpp
@@ -22,6 +22,17 @@
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
+const DenseMap<BasicBlock *, ColorVector> &
+LoopSafetyInfo::getBlockColors() const {
+  return BlockColors;
+}
+
+void LoopSafetyInfo::copyColors(BasicBlock *New, BasicBlock *Old) {
+  ColorVector &ColorsForNewBlock = BlockColors[New];
+  ColorVector &ColorsForOldBlock = BlockColors[Old];
+  ColorsForNewBlock = ColorsForOldBlock;
+}
+
 bool LoopSafetyInfo::headerMayThrow() const {
   return HeaderMayThrow;
 }
@@ -35,7 +46,7 @@ bool LoopSafetyInfo::anyBlockMayThrow() const {
   return MayThrow;
 }
 
-void LoopSafetyInfo::computeLoopSafetyInfo(Loop *CurLoop) {
+void LoopSafetyInfo::computeLoopSafetyInfo(const Loop *CurLoop) {
   assert(CurLoop != nullptr && "CurLoop can't be null");
   BasicBlock *Header = CurLoop->getHeader();
   // Iterate over header and compute safety info.
@@ -51,6 +62,10 @@ void LoopSafetyInfo::computeLoopSafetyInfo(Loop *CurLoop) {
        (BB != BBE) && !MayThrow; ++BB)
     MayThrow |= !isGuaranteedToTransferExecutionToSuccessor(*BB);
 
+  computeBlockColors(CurLoop);
+}
+
+void LoopSafetyInfo::computeBlockColors(const Loop *CurLoop) {
   // Compute funclet colors if we might sink/hoist in a function with a funclet
   // personality routine.
   Function *Fn = CurLoop->getHeader()->getParent();
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index 9bf75a4ffbf..6c899289593 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -798,7 +798,7 @@ static bool isFreeInLoop(const Instruction &I, const Loop *CurLoop,
 static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop,
                                   const LoopSafetyInfo *SafetyInfo,
                                   TargetTransformInfo *TTI, bool &FreeInLoop) {
-  const auto &BlockColors = SafetyInfo->BlockColors;
+  const auto &BlockColors = SafetyInfo->getBlockColors();
   bool IsFree = isFreeInLoop(I, CurLoop, TTI);
   for (const User *U : I.users()) {
     const Instruction *UI = cast<Instruction>(U);
@@ -833,7 +833,7 @@ CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
                             const LoopSafetyInfo *SafetyInfo) {
   Instruction *New;
   if (auto *CI = dyn_cast<CallInst>(&I)) {
-    const auto &BlockColors = SafetyInfo->BlockColors;
+    const auto &BlockColors = SafetyInfo->getBlockColors();
 
     // Sinking call-sites need to be handled differently from other
     // instructions.  The cloned call-site needs a funclet bundle operand
@@ -913,7 +913,7 @@ static bool canSplitPredecessors(PHINode *PN, LoopSafetyInfo *SafetyInfo) {
   // it require updating BlockColors for all offspring blocks accordingly. By
   // skipping such corner case, we can make updating BlockColors after splitting
   // predecessor fairly simple.
-  if (!SafetyInfo->BlockColors.empty() && BB->getFirstNonPHI()->isEHPad())
+  if (!SafetyInfo->getBlockColors().empty() && BB->getFirstNonPHI()->isEHPad())
     return false;
   for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
     BasicBlock *BBPred = *PI;
@@ -967,7 +967,7 @@ static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT,
   // LE:
   //   %p = phi [%p1, %LE.split], [%p2, %LE.split2]
   //
-  auto &BlockColors = SafetyInfo->BlockColors;
+  const auto &BlockColors = SafetyInfo->getBlockColors();
   SmallSetVector<BasicBlock *, 8> PredBBs(pred_begin(ExitBB), pred_end(ExitBB));
   while (!PredBBs.empty()) {
     BasicBlock *PredBB = *PredBBs.begin();
@@ -979,14 +979,11 @@ static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT,
       // Since we do not allow splitting EH-block with BlockColors in
       // canSplitPredecessors(), we can simply assign predecessor's color to
       // the new block.
-      if (!BlockColors.empty()) {
+      if (!BlockColors.empty())
         // Grab a reference to the ColorVector to be inserted before getting the
         // reference to the vector we are copying because inserting the new
         // element in BlockColors might cause the map to be reallocated.
-        ColorVector &ColorsForNewBlock = BlockColors[NewPred];
-        ColorVector &ColorsForOldBlock = BlockColors[PredBB];
-        ColorsForNewBlock = ColorsForOldBlock;
-      }
+        SafetyInfo->copyColors(NewPred, PredBB);
     }
     PredBBs.remove(PredBB);
   }
-- 
GitLab


From e3a3e26e8f64091f0d640d63d560219d6e198e74 Mon Sep 17 00:00:00 2001
From: Aleksandar Beserminji <abeserminji@wavecomp.com>
Date: Tue, 16 Oct 2018 08:27:28 +0000
Subject: [PATCH 0238/1116] [mips][micromips] Fix how values in
 .gcc_except_table are calculated

When a landing pad is calculated in a program that is compiled
for micromips, it will point to an even address. Such an error will
cause a segmentation fault, as the instructions in micromips are
aligned on odd addresses. This patch sets the last bit of the offset
where a landing pad is, to 1, which will effectively be
an odd address and point to the instruction exactly.

Differential Revision: https://reviews.llvm.org/D52985


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344591 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/MC/MCAsmBackend.h                |  5 +++
 lib/MC/MCExpr.cpp                             |  5 +++
 .../Mips/MCTargetDesc/MipsAsmBackend.cpp      |  8 ++++
 lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h |  2 +
 .../Mips/micromips-gcc-except-table.ll        | 37 +++++++++++++++++++
 5 files changed, 57 insertions(+)
 create mode 100644 test/CodeGen/Mips/micromips-gcc-except-table.ll

diff --git a/include/llvm/MC/MCAsmBackend.h b/include/llvm/MC/MCAsmBackend.h
index 030d3c05aa5..07835c21fce 100644
--- a/include/llvm/MC/MCAsmBackend.h
+++ b/include/llvm/MC/MCAsmBackend.h
@@ -165,6 +165,11 @@ public:
     return 0;
   }
 
+  /// Check whether a given symbol has been flagged with MICROMIPS flag.
+  virtual bool isMicroMips(const MCSymbol *Sym) const {
+    return false;
+  }
+
   /// Handles all target related code padding when starting to write a new
   /// basic block to an object file.
   ///
diff --git a/lib/MC/MCExpr.cpp b/lib/MC/MCExpr.cpp
index a4458e64bd3..38f311be7c6 100644
--- a/lib/MC/MCExpr.cpp
+++ b/lib/MC/MCExpr.cpp
@@ -526,6 +526,11 @@ static void AttemptToFoldSymbolOffsetDifference(
     if (Asm->isThumbFunc(&SA))
       Addend |= 1;
 
+    // If symbol is labeled as micromips, we set low-bit to ensure
+    // correct offset in .gcc_except_table
+    if (Asm->getBackend().isMicroMips(&SA))
+      Addend |= 1;
+
     // Clear the symbol expr pointers to indicate we have folded these
     // operands.
     A = B = nullptr;
diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
index 4544be9f27f..63f9151da6b 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
@@ -569,6 +569,14 @@ bool MipsAsmBackend::shouldForceRelocation(const MCAssembler &Asm,
   }
 }
 
+bool MipsAsmBackend::isMicroMips(const MCSymbol *Sym) const {
+  if (const auto *ElfSym = dyn_cast<const MCSymbolELF>(Sym)) {
+    if (ElfSym->getOther() & ELF::STO_MIPS_MICROMIPS)
+      return true;
+  }
+  return false;
+}
+
 MCAsmBackend *llvm::createMipsAsmBackend(const Target &T,
                                          const MCSubtargetInfo &STI,
                                          const MCRegisterInfo &MRI,
diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
index 3d5e16fcf9b..30359132e92 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
@@ -25,6 +25,7 @@ class MCAssembler;
 struct MCFixupKindInfo;
 class MCObjectWriter;
 class MCRegisterInfo;
+class MCSymbolELF;
 class Target;
 
 class MipsAsmBackend : public MCAsmBackend {
@@ -90,6 +91,7 @@ public:
   bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
                              const MCValue &Target) override;
 
+  bool isMicroMips(const MCSymbol *Sym) const override;
 }; // class MipsAsmBackend
 
 } // namespace
diff --git a/test/CodeGen/Mips/micromips-gcc-except-table.ll b/test/CodeGen/Mips/micromips-gcc-except-table.ll
new file mode 100644
index 00000000000..38a76927e2a
--- /dev/null
+++ b/test/CodeGen/Mips/micromips-gcc-except-table.ll
@@ -0,0 +1,37 @@
+; RUN: llc -mtriple=mips-linux-gnu -mcpu=mips32r2 -mattr=+micromips -O3 -filetype=obj < %s | llvm-objdump -s -j .gcc_except_table - | FileCheck %s
+
+; CHECK: Contents of section .gcc_except_table:
+; CHECK-NEXT: 0000 ff9b1501 0c011100 00110e1f 011f1800
+; CHECK-NEXT: 0010 00010000 00000000
+
+@_ZTIi = external constant i8*
+
+define dso_local i32 @main() local_unnamed_addr norecurse personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+entry:
+  %exception.i = tail call i8* @__cxa_allocate_exception(i32 4) nounwind
+  %0 = bitcast i8* %exception.i to i32*
+  store i32 5, i32* %0, align 16
+  invoke void @__cxa_throw(i8* %exception.i, i8* bitcast (i8** @_ZTIi to i8*), i8* null) noreturn
+          to label %.noexc unwind label %return
+
+.noexc:
+  unreachable
+
+return:
+  %1 = landingpad { i8*, i32 }
+          catch i8* null
+  %2 = extractvalue { i8*, i32 } %1, 0
+  %3 = tail call i8* @__cxa_begin_catch(i8* %2) nounwind
+  tail call void @__cxa_end_catch()
+  ret i32 0
+}
+
+declare i32 @__gxx_personality_v0(...)
+
+declare i8* @__cxa_begin_catch(i8*) local_unnamed_addr
+
+declare void @__cxa_end_catch() local_unnamed_addr
+
+declare i8* @__cxa_allocate_exception(i32) local_unnamed_addr
+
+declare void @__cxa_throw(i8*, i8*, i8*) local_unnamed_addr
-- 
GitLab


From 288477a9492e7bdc4b3650b1fe2b8df092078f86 Mon Sep 17 00:00:00 2001
From: Max Kazantsev <max.kazantsev@azul.com>
Date: Tue, 16 Oct 2018 08:31:05 +0000
Subject: [PATCH 0239/1116] [NFC] Make LoopSafetyInfo abstract to allow
 alternative implementations

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344592 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/MustExecute.h          | 46 +++++++++++++++-----
 lib/Analysis/MustExecute.cpp                 | 16 +++----
 lib/Transforms/Scalar/LICM.cpp               |  2 +-
 lib/Transforms/Scalar/LoopIdiomRecognize.cpp |  2 +-
 lib/Transforms/Scalar/LoopUnswitch.cpp       |  2 +-
 lib/Transforms/Utils/LoopUnrollAndJam.cpp    |  2 +-
 6 files changed, 48 insertions(+), 22 deletions(-)

diff --git a/include/llvm/Analysis/MustExecute.h b/include/llvm/Analysis/MustExecute.h
index d78b38bdead..6a6a127b7c3 100644
--- a/include/llvm/Analysis/MustExecute.h
+++ b/include/llvm/Analysis/MustExecute.h
@@ -45,10 +45,6 @@ class Loop;
 /// loop were made and the info wasn't recomputed properly, the behavior of all
 /// methods except for computeLoopSafetyInfo is undefined.
 class LoopSafetyInfo {
-  bool MayThrow = false;       // The current loop contains an instruction which
-                               // may throw.
-  bool HeaderMayThrow = false; // Same as previous, but specific to loop header
-
   // Used to update funclet bundle operands.
   DenseMap<BasicBlock *, ColorVector> BlockColors;
 
@@ -73,15 +69,15 @@ public:
   /// Returns true iff the header block of the loop for which this info is
   /// calculated contains an instruction that may throw or otherwise exit
   /// abnormally.
-  bool headerMayThrow() const;
+  virtual bool headerMayThrow() const = 0;
 
   /// Returns true iff the block \p BB potentially may throw exception. It can
   /// be false-positive in cases when we want to avoid complex analysis.
-  bool blockMayThrow(const BasicBlock *BB) const;
+  virtual bool blockMayThrow(const BasicBlock *BB) const = 0;
 
   /// Returns true iff any block of the loop for which this info is contains an
   /// instruction that may throw or otherwise exit abnormally.
-  bool anyBlockMayThrow() const;
+  virtual bool anyBlockMayThrow() const = 0;
 
   /// Return true if we must reach the block \p BB under assumption that the
   /// loop \p CurLoop is entered.
@@ -93,14 +89,44 @@ public:
   /// as argument. Updates safety information in LoopSafetyInfo argument.
   /// Note: This is defined to clear and reinitialize an already initialized
   /// LoopSafetyInfo.  Some callers rely on this fact.
-  void computeLoopSafetyInfo(const Loop *CurLoop);
+  virtual void computeLoopSafetyInfo(const Loop *CurLoop) = 0;
 
   /// Returns true if the instruction in a loop is guaranteed to execute at
   /// least once (under the assumption that the loop is entered).
-  bool isGuaranteedToExecute(const Instruction &Inst, const DominatorTree *DT,
-                             const Loop *CurLoop) const;
+  virtual bool isGuaranteedToExecute(const Instruction &Inst,
+                                     const DominatorTree *DT,
+                                     const Loop *CurLoop) const = 0;
 
   LoopSafetyInfo() = default;
+
+  virtual ~LoopSafetyInfo() = default;
+};
+
+
+/// Simple and conservative implementation of LoopSafetyInfo that can give
+/// false-positive answers to its queries in order to avoid complicated
+/// analysis.
+class SimpleLoopSafetyInfo: public LoopSafetyInfo {
+  bool MayThrow = false;       // The current loop contains an instruction which
+                               // may throw.
+  bool HeaderMayThrow = false; // Same as previous, but specific to loop header
+
+public:
+  virtual bool headerMayThrow() const;
+
+  virtual bool blockMayThrow(const BasicBlock *BB) const;
+
+  virtual bool anyBlockMayThrow() const;
+
+  virtual void computeLoopSafetyInfo(const Loop *CurLoop);
+
+  virtual bool isGuaranteedToExecute(const Instruction &Inst,
+                                     const DominatorTree *DT,
+                                     const Loop *CurLoop) const;
+
+  SimpleLoopSafetyInfo() : LoopSafetyInfo() {};
+
+  virtual ~SimpleLoopSafetyInfo() {};
 };
 
 }
diff --git a/lib/Analysis/MustExecute.cpp b/lib/Analysis/MustExecute.cpp
index bce941be26c..618e2e3e30d 100644
--- a/lib/Analysis/MustExecute.cpp
+++ b/lib/Analysis/MustExecute.cpp
@@ -33,20 +33,20 @@ void LoopSafetyInfo::copyColors(BasicBlock *New, BasicBlock *Old) {
   ColorsForNewBlock = ColorsForOldBlock;
 }
 
-bool LoopSafetyInfo::headerMayThrow() const {
+bool SimpleLoopSafetyInfo::headerMayThrow() const {
   return HeaderMayThrow;
 }
 
-bool LoopSafetyInfo::blockMayThrow(const BasicBlock *BB) const {
+bool SimpleLoopSafetyInfo::blockMayThrow(const BasicBlock *BB) const {
   (void)BB;
   return anyBlockMayThrow();
 }
 
-bool LoopSafetyInfo::anyBlockMayThrow() const {
+bool SimpleLoopSafetyInfo::anyBlockMayThrow() const {
   return MayThrow;
 }
 
-void LoopSafetyInfo::computeLoopSafetyInfo(const Loop *CurLoop) {
+void SimpleLoopSafetyInfo::computeLoopSafetyInfo(const Loop *CurLoop) {
   assert(CurLoop != nullptr && "CurLoop can't be null");
   BasicBlock *Header = CurLoop->getHeader();
   // Iterate over header and compute safety info.
@@ -200,9 +200,9 @@ bool LoopSafetyInfo::allLoopPathsLeadToBlock(const Loop *CurLoop,
 
 /// Returns true if the instruction in a loop is guaranteed to execute at least
 /// once.
-bool LoopSafetyInfo::isGuaranteedToExecute(const Instruction &Inst,
-                                           const DominatorTree *DT,
-                                           const Loop *CurLoop) const {
+bool SimpleLoopSafetyInfo::isGuaranteedToExecute(const Instruction &Inst,
+                                                 const DominatorTree *DT,
+                                                 const Loop *CurLoop) const {
   // We have to check to make sure that the instruction dominates all
   // of the exit blocks.  If it doesn't, then there is a path out of the loop
   // which does not execute this instruction, so we can't hoist it.
@@ -259,7 +259,7 @@ static bool isMustExecuteIn(const Instruction &I, Loop *L, DominatorTree *DT) {
   // TODO: merge these two routines.  For the moment, we display the best
   // result obtained by *either* implementation.  This is a bit unfair since no
   // caller actually gets the full power at the moment.
-  LoopSafetyInfo LSI;
+  SimpleLoopSafetyInfo LSI;
   LSI.computeLoopSafetyInfo(L);
   return LSI.isGuaranteedToExecute(I, DT, L) ||
     isGuaranteedToExecuteForEveryIteration(&I, L);
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index 6c899289593..e72342b88b6 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -267,7 +267,7 @@ bool LoopInvariantCodeMotion::runOnLoop(
   BasicBlock *Preheader = L->getLoopPreheader();
 
   // Compute loop safety information.
-  LoopSafetyInfo SafetyInfo;
+  SimpleLoopSafetyInfo SafetyInfo;
   SafetyInfo.computeLoopSafetyInfo(L);
 
   // We want to visit all of the instructions in this loop... that are not parts
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 9a45551f64b..4b375956a12 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -320,7 +320,7 @@ bool LoopIdiomRecognize::runOnCountableLoop() {
 
   // The following transforms hoist stores/memsets into the loop pre-header.
   // Give up if the loop has instructions may throw.
-  LoopSafetyInfo SafetyInfo;
+  SimpleLoopSafetyInfo SafetyInfo;
   SafetyInfo.computeLoopSafetyInfo(CurLoop);
   if (SafetyInfo.anyBlockMayThrow())
     return MadeChange;
diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp
index cd49f51283f..4a089dfa7db 100644
--- a/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -189,7 +189,7 @@ namespace {
     BasicBlock *loopPreheader = nullptr;
 
     bool SanitizeMemory;
-    LoopSafetyInfo SafetyInfo;
+    SimpleLoopSafetyInfo SafetyInfo;
 
     // LoopBlocks contains all of the basic blocks of the loop, including the
     // preheader of the loop, the body of the loop, and the exit blocks of the
diff --git a/lib/Transforms/Utils/LoopUnrollAndJam.cpp b/lib/Transforms/Utils/LoopUnrollAndJam.cpp
index c17a64f0187..8949c603a84 100644
--- a/lib/Transforms/Utils/LoopUnrollAndJam.cpp
+++ b/lib/Transforms/Utils/LoopUnrollAndJam.cpp
@@ -761,7 +761,7 @@ bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
   }
 
   // Check the loop safety info for exceptions.
-  LoopSafetyInfo LSI;
+  SimpleLoopSafetyInfo LSI;
   LSI.computeLoopSafetyInfo(L);
   if (LSI.anyBlockMayThrow()) {
     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Something may throw\n");
-- 
GitLab


From 141415c0fe2b6e60203a7dc08b672b179cff371b Mon Sep 17 00:00:00 2001
From: Max Kazantsev <max.kazantsev@azul.com>
Date: Tue, 16 Oct 2018 09:11:25 +0000
Subject: [PATCH 0240/1116] [NFC] Remove obsolete method headerMayThrow

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344596 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/MustExecute.h |  7 -------
 lib/Analysis/MustExecute.cpp        | 15 ++-------------
 2 files changed, 2 insertions(+), 20 deletions(-)

diff --git a/include/llvm/Analysis/MustExecute.h b/include/llvm/Analysis/MustExecute.h
index 6a6a127b7c3..f136ff750de 100644
--- a/include/llvm/Analysis/MustExecute.h
+++ b/include/llvm/Analysis/MustExecute.h
@@ -66,11 +66,6 @@ public:
   /// Copy colors of block \p Old into the block \p New.
   void copyColors(BasicBlock *New, BasicBlock *Old);
 
-  /// Returns true iff the header block of the loop for which this info is
-  /// calculated contains an instruction that may throw or otherwise exit
-  /// abnormally.
-  virtual bool headerMayThrow() const = 0;
-
   /// Returns true iff the block \p BB potentially may throw exception. It can
   /// be false-positive in cases when we want to avoid complex analysis.
   virtual bool blockMayThrow(const BasicBlock *BB) const = 0;
@@ -112,8 +107,6 @@ class SimpleLoopSafetyInfo: public LoopSafetyInfo {
   bool HeaderMayThrow = false; // Same as previous, but specific to loop header
 
 public:
-  virtual bool headerMayThrow() const;
-
   virtual bool blockMayThrow(const BasicBlock *BB) const;
 
   virtual bool anyBlockMayThrow() const;
diff --git a/lib/Analysis/MustExecute.cpp b/lib/Analysis/MustExecute.cpp
index 618e2e3e30d..4e42f336dc7 100644
--- a/lib/Analysis/MustExecute.cpp
+++ b/lib/Analysis/MustExecute.cpp
@@ -33,10 +33,6 @@ void LoopSafetyInfo::copyColors(BasicBlock *New, BasicBlock *Old) {
   ColorsForNewBlock = ColorsForOldBlock;
 }
 
-bool SimpleLoopSafetyInfo::headerMayThrow() const {
-  return HeaderMayThrow;
-}
-
 bool SimpleLoopSafetyInfo::blockMayThrow(const BasicBlock *BB) const {
   (void)BB;
   return anyBlockMayThrow();
@@ -203,10 +199,6 @@ bool LoopSafetyInfo::allLoopPathsLeadToBlock(const Loop *CurLoop,
 bool SimpleLoopSafetyInfo::isGuaranteedToExecute(const Instruction &Inst,
                                                  const DominatorTree *DT,
                                                  const Loop *CurLoop) const {
-  // We have to check to make sure that the instruction dominates all
-  // of the exit blocks.  If it doesn't, then there is a path out of the loop
-  // which does not execute this instruction, so we can't hoist it.
-
   // If the instruction is in the header block for the loop (which is very
   // common), it is always guaranteed to dominate the exit blocks.  Since this
   // is a common case, and can save some work, check it now.
@@ -215,15 +207,12 @@ bool SimpleLoopSafetyInfo::isGuaranteedToExecute(const Instruction &Inst,
     // Inst unless we can prove that Inst comes before the potential implicit
     // exit.  At the moment, we use a (cheap) hack for the common case where
     // the instruction of interest is the first one in the block.
-    return !headerMayThrow() ||
+    return !HeaderMayThrow ||
            Inst.getParent()->getFirstNonPHIOrDbg() == &Inst;
 
   // If there is a path from header to exit or latch that doesn't lead to our
   // instruction's block, return false.
-  if (!allLoopPathsLeadToBlock(CurLoop, Inst.getParent(), DT))
-    return false;
-
-  return true;
+  return allLoopPathsLeadToBlock(CurLoop, Inst.getParent(), DT);
 }
 
 
-- 
GitLab


From 6ccf5849d202cf261cf33335a027f8967e98b0f0 Mon Sep 17 00:00:00 2001
From: Renato Golin <renato.golin@linaro.org>
Date: Tue, 16 Oct 2018 09:37:52 +0000
Subject: [PATCH 0241/1116] [VPlan] Script to extract VPlan digraphs from log

The vectoriser's debug log prints VPlan digraphs, but it's a bit
cumbersome to extract them and render them into PNG images. This script
does exactly that, being careful enough to extract all individual plans,
name them appropriately and save in either .dot or .png files.

Example usage:

$ opt -O3 -debug-only=loop-vectorize file.ll -S -o /dev/null 2> debug.log

$ $LLVM_SRC/utils/extract_vplan.py < debug.log
Exporting VF1UF1 to DOT: VPlanVF1UF1.dot
Exporting VF24UF1 to DOT: VPlanVF24UF1.dot

$ $LLVM_SRC/utils/extract_vplan.py --png < debug.log
Exporting VF1UF1 to PNG via dot: VPlanVF1UF1.png
Exporting VF24UF1 to PNG via dot: VPlanVF24UF1.png

$ xdot VPlanVF1UF1.dot

Differential Revision: https://reviews.llvm.org/D53142


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344599 91177308-0d34-0410-b5e6-96231b3b80d8
---
 utils/extract_vplan.py | 46 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)
 create mode 100755 utils/extract_vplan.py

diff --git a/utils/extract_vplan.py b/utils/extract_vplan.py
new file mode 100755
index 00000000000..ac0055d2e79
--- /dev/null
+++ b/utils/extract_vplan.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python
+
+# This script extracts the VPlan digraphs from the vectoriser debug messages
+# and saves them in individual dot files (one for each plan). Optionally, and
+# providing 'dot' is installed, it can also render the dot into a PNG file.
+
+import sys
+import re
+import argparse
+import shutil
+import subprocess
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--png', action='store_true')
+args = parser.parse_args()
+
+dot = shutil.which('dot')
+if args.png and not dot:
+    raise RuntimeError("Can't export to PNG without 'dot' in the system")
+
+pattern = re.compile(r"(digraph VPlan {.*?\n})",re.DOTALL)
+matches = re.findall(pattern, sys.stdin.read())
+
+for vplan in matches:
+    m = re.search("graph \[.+(VF=.+,UF.+), ", vplan)
+    if not m:
+        raise ValueError("Can't get the right VPlan name")
+    name = re.sub('[^a-zA-Z0-9]', '', m.group(1))
+
+    if args.png:
+        filename = 'VPlan' + name + '.png'
+        print("Exporting " + name + " to PNG via dot: " + filename)
+        p = subprocess.Popen([dot, '-Tpng', '-o', filename],
+                              encoding='utf-8',
+                              stdin=subprocess.PIPE,
+                              stdout=subprocess.PIPE,
+                              stderr=subprocess.PIPE)
+        out, err = p.communicate(input=vplan)
+        if err:
+            raise RuntimeError("Error running dot: " + err)
+
+    else:
+        filename = 'VPlan' + name + '.dot'
+        print("Exporting " + name + " to DOT: " + filename)
+        with open(filename, 'w') as out:
+            out.write(vplan)
-- 
GitLab


From 1f1ae517ddc8777691807df63e3e3752164a5fa2 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 16 Oct 2018 09:50:16 +0000
Subject: [PATCH 0242/1116] [X86] Fix Skylake ReadAfterLd for PADDrm etc.

Missed in rL343868 as due to their custom InstrRW.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344600 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86SchedSkylakeClient.td   |   6 +-
 lib/Target/X86/X86SchedSkylakeServer.td   |   6 +-
 test/tools/llvm-mca/X86/read-after-ld-2.s | 104 +++++++++++-----------
 3 files changed, 61 insertions(+), 55 deletions(-)

diff --git a/lib/Target/X86/X86SchedSkylakeClient.td b/lib/Target/X86/X86SchedSkylakeClient.td
index b5d842a52b5..d4a3eb07b98 100644
--- a/lib/Target/X86/X86SchedSkylakeClient.td
+++ b/lib/Target/X86/X86SchedSkylakeClient.td
@@ -1133,7 +1133,8 @@ def SKLWriteResGroup91 : SchedWriteRes<[SKLPort23,SKLPort015]> {
 def: InstRW<[SKLWriteResGroup91], (instrs VINSERTF128rm,
                                           VINSERTI128rm,
                                           VPBLENDDrmi)>;
-def: InstRW<[SKLWriteResGroup91], (instregex "(V?)PADD(B|D|Q|W)rm",
+def: InstRW<[SKLWriteResGroup91, ReadAfterVecXLd],
+                                  (instregex "(V?)PADD(B|D|Q|W)rm",
                                              "(V?)PSUB(B|D|Q|W)rm")>;
 
 def SKLWriteResGroup92 : SchedWriteRes<[SKLPort5,SKLPort23]> {
@@ -1230,7 +1231,8 @@ def SKLWriteResGroup110 : SchedWriteRes<[SKLPort23,SKLPort015]> {
   let ResourceCycles = [1,1];
 }
 def: InstRW<[SKLWriteResGroup110], (instrs VPBLENDDYrmi)>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VPADD(B|D|Q|W)Yrm",
+def: InstRW<[SKLWriteResGroup110, ReadAfterVecYLd],
+                                   (instregex "VPADD(B|D|Q|W)Yrm",
                                               "VPSUB(B|D|Q|W)Yrm")>;
 
 def SKLWriteResGroup112 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> {
diff --git a/lib/Target/X86/X86SchedSkylakeServer.td b/lib/Target/X86/X86SchedSkylakeServer.td
index d3fa912be11..cbcb6a6e58b 100644
--- a/lib/Target/X86/X86SchedSkylakeServer.td
+++ b/lib/Target/X86/X86SchedSkylakeServer.td
@@ -1339,7 +1339,8 @@ def SKXWriteResGroup95 : SchedWriteRes<[SKXPort23,SKXPort015]> {
 }
 def: InstRW<[SKXWriteResGroup95], (instrs VMOVNTDQAZ128rm,
                                           VPBLENDDrmi)>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VBLENDMPDZ128rm(b?)",
+def: InstRW<[SKXWriteResGroup95, ReadAfterVecXLd],
+                                  (instregex "VBLENDMPDZ128rm(b?)",
                                              "VBLENDMPSZ128rm(b?)",
                                              "VBROADCASTI32X2Z128m(b?)",
                                              "VBROADCASTSSZ128m(b?)",
@@ -1534,7 +1535,8 @@ def SKXWriteResGroup121 : SchedWriteRes<[SKXPort23,SKXPort015]> {
 }
 def: InstRW<[SKXWriteResGroup121], (instrs VMOVNTDQAZ256rm,
                                            VPBLENDDYrmi)>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBLENDMPD(Z|Z256)rm(b?)",
+def: InstRW<[SKXWriteResGroup121, ReadAfterVecYLd],
+                                   (instregex "VBLENDMPD(Z|Z256)rm(b?)",
                                               "VBLENDMPS(Z|Z256)rm(b?)",
                                               "VBROADCASTF32X2Z256m(b?)",
                                               "VBROADCASTF32X2Zm(b?)",
diff --git a/test/tools/llvm-mca/X86/read-after-ld-2.s b/test/tools/llvm-mca/X86/read-after-ld-2.s
index ee39b645d5a..7d549b39595 100644
--- a/test/tools/llvm-mca/X86/read-after-ld-2.s
+++ b/test/tools/llvm-mca/X86/read-after-ld-2.s
@@ -23,7 +23,7 @@ cmp     %edi, %edx
 # HASWELL-NEXT: Total Cycles:      143
 # HASWELL-NEXT: Total uOps:        500
 
-# SKYLAKE-NEXT: Total Cycles:      803
+# SKYLAKE-NEXT: Total Cycles:      110
 # SKYLAKE-NEXT: Total uOps:        500
 
 # ZNVER1-NEXT:  Total Cycles:      110
@@ -40,8 +40,8 @@ cmp     %edi, %edx
 # HASWELL-NEXT: Block RThroughput: 1.3
 
 # SKYLAKE:      Dispatch Width:    6
-# SKYLAKE-NEXT: uOps Per Cycle:    0.62
-# SKYLAKE-NEXT: IPC:               0.50
+# SKYLAKE-NEXT: uOps Per Cycle:    4.55
+# SKYLAKE-NEXT: IPC:               3.64
 # SKYLAKE-NEXT: Block RThroughput: 0.8
 
 # ZNVER1:       Dispatch Width:    4
@@ -57,8 +57,8 @@ cmp     %edi, %edx
 # HASWELL-NEXT:                     0123456789
 # HASWELL-NEXT: Index     0123456789          012
 
-# SKYLAKE-NEXT:                     0123456789          0123456789          0123456789          01234
-# SKYLAKE-NEXT: Index     0123456789          0123456789          0123456789          0123456789
+# SKYLAKE-NEXT:                     0123456789
+# SKYLAKE-NEXT: Index     0123456789
 
 # ZNVER1-NEXT:                      0123456789
 # ZNVER1-NEXT:  Index     0123456789
@@ -145,43 +145,46 @@ cmp     %edi, %edx
 # HASWELL-NEXT: [9,2]     .    .    . DeE-------R   addq	$32, %r8
 # HASWELL-NEXT: [9,3]     .    .    .  DeE------R   cmpl	%edi, %edx
 
-# SKYLAKE:      [0,0]     DeER .    .    .    .    .    .    .    .    .    .    .    .    .    .   .   addl	$1, %edx
-# SKYLAKE-NEXT: [0,1]     DeeeeeeeeER    .    .    .    .    .    .    .    .    .    .    .    .   .   vpaddd	(%r8), %ymm0, %ymm0
-# SKYLAKE-NEXT: [0,2]     DeE-------R    .    .    .    .    .    .    .    .    .    .    .    .   .   addq	$32, %r8
-# SKYLAKE-NEXT: [0,3]     D=eE------R    .    .    .    .    .    .    .    .    .    .    .    .   .   cmpl	%edi, %edx
-# SKYLAKE-NEXT: [1,0]     D=eE------R    .    .    .    .    .    .    .    .    .    .    .    .   .   addl	$1, %edx
-# SKYLAKE-NEXT: [1,1]     .D=======eeeeeeeeER .    .    .    .    .    .    .    .    .    .    .   .   vpaddd	(%r8), %ymm0, %ymm0
-# SKYLAKE-NEXT: [1,2]     .DeE--------------R .    .    .    .    .    .    .    .    .    .    .   .   addq	$32, %r8
-# SKYLAKE-NEXT: [1,3]     .D=eE-------------R .    .    .    .    .    .    .    .    .    .    .   .   cmpl	%edi, %edx
-# SKYLAKE-NEXT: [2,0]     .D=eE-------------R .    .    .    .    .    .    .    .    .    .    .   .   addl	$1, %edx
-# SKYLAKE-NEXT: [2,1]     . D==============eeeeeeeeER   .    .    .    .    .    .    .    .    .   .   vpaddd	(%r8), %ymm0, %ymm0
-# SKYLAKE-NEXT: [2,2]     . DeE---------------------R   .    .    .    .    .    .    .    .    .   .   addq	$32, %r8
-# SKYLAKE-NEXT: [2,3]     . D=eE--------------------R   .    .    .    .    .    .    .    .    .   .   cmpl	%edi, %edx
-# SKYLAKE-NEXT: [3,0]     . D=eE--------------------R   .    .    .    .    .    .    .    .    .   .   addl	$1, %edx
-# SKYLAKE-NEXT: [3,1]     .  D=====================eeeeeeeeER.    .    .    .    .    .    .    .   .   vpaddd	(%r8), %ymm0, %ymm0
-# SKYLAKE-NEXT: [3,2]     .  DeE----------------------------R.    .    .    .    .    .    .    .   .   addq	$32, %r8
-# SKYLAKE-NEXT: [3,3]     .  D=eE---------------------------R.    .    .    .    .    .    .    .   .   cmpl	%edi, %edx
-# SKYLAKE-NEXT: [4,0]     .  D=eE---------------------------R.    .    .    .    .    .    .    .   .   addl	$1, %edx
-# SKYLAKE-NEXT: [4,1]     .   D============================eeeeeeeeER  .    .    .    .    .    .   .   vpaddd	(%r8), %ymm0, %ymm0
-# SKYLAKE-NEXT: [4,2]     .   DeE-----------------------------------R  .    .    .    .    .    .   .   addq	$32, %r8
-# SKYLAKE-NEXT: [4,3]     .   D=eE----------------------------------R  .    .    .    .    .    .   .   cmpl	%edi, %edx
-# SKYLAKE-NEXT: [5,0]     .   D=eE----------------------------------R  .    .    .    .    .    .   .   addl	$1, %edx
-# SKYLAKE-NEXT: [5,1]     .    D===================================eeeeeeeeER    .    .    .    .   .   vpaddd	(%r8), %ymm0, %ymm0
-# SKYLAKE-NEXT: [5,2]     .    DeE------------------------------------------R    .    .    .    .   .   addq	$32, %r8
-# SKYLAKE-NEXT: [5,3]     .    D=eE-----------------------------------------R    .    .    .    .   .   cmpl	%edi, %edx
-# SKYLAKE-NEXT: [6,0]     .    D=eE-----------------------------------------R    .    .    .    .   .   addl	$1, %edx
-# SKYLAKE-NEXT: [6,1]     .    .D==========================================eeeeeeeeER .    .    .   .   vpaddd	(%r8), %ymm0, %ymm0
-# SKYLAKE-NEXT: [6,2]     .    .DeE-------------------------------------------------R .    .    .   .   addq	$32, %r8
-# SKYLAKE-NEXT: [6,3]     .    .D=eE------------------------------------------------R .    .    .   .   cmpl	%edi, %edx
-# SKYLAKE-NEXT: [7,0]     .    .D=eE------------------------------------------------R .    .    .   .   addl	$1, %edx
-# SKYLAKE-NEXT: [7,1]     .    . D=================================================eeeeeeeeER   .   .   vpaddd	(%r8), %ymm0, %ymm0
-# SKYLAKE-NEXT: [7,2]     .    . DeE--------------------------------------------------------R   .   .   addq	$32, %r8
-# SKYLAKE-NEXT: [7,3]     .    . D=eE-------------------------------------------------------R   .   .   cmpl	%edi, %edx
-# SKYLAKE-NEXT: [8,0]     .    . D=eE-------------------------------------------------------R   .   .   addl	$1, %edx
-# SKYLAKE-NEXT: [8,1]     .    .  D========================================================eeeeeeeeER   vpaddd	(%r8), %ymm0, %ymm0
-# SKYLAKE-NEXT: [8,2]     .    .  DeE---------------------------------------------------------------R   addq	$32, %r8
-# SKYLAKE-NEXT: [8,3]     .    .  D=eE--------------------------------------------------------------R   cmpl	%edi, %edx
-# SKYLAKE-NEXT: [9,0]     .    .  D=eE--------------------------------------------------------------R   addl	$1, %edx
+# SKYLAKE:      [0,0]     DeER .    .    .   .   addl	$1, %edx
+# SKYLAKE-NEXT: [0,1]     DeeeeeeeeER    .   .   vpaddd	(%r8), %ymm0, %ymm0
+# SKYLAKE-NEXT: [0,2]     DeE-------R    .   .   addq	$32, %r8
+# SKYLAKE-NEXT: [0,3]     D=eE------R    .   .   cmpl	%edi, %edx
+# SKYLAKE-NEXT: [1,0]     D=eE------R    .   .   addl	$1, %edx
+# SKYLAKE-NEXT: [1,1]     .DeeeeeeeeER   .   .   vpaddd	(%r8), %ymm0, %ymm0
+# SKYLAKE-NEXT: [1,2]     .DeE-------R   .   .   addq	$32, %r8
+# SKYLAKE-NEXT: [1,3]     .D=eE------R   .   .   cmpl	%edi, %edx
+# SKYLAKE-NEXT: [2,0]     .D=eE------R   .   .   addl	$1, %edx
+# SKYLAKE-NEXT: [2,1]     . DeeeeeeeeER  .   .   vpaddd	(%r8), %ymm0, %ymm0
+# SKYLAKE-NEXT: [2,2]     . DeE-------R  .   .   addq	$32, %r8
+# SKYLAKE-NEXT: [2,3]     . D=eE------R  .   .   cmpl	%edi, %edx
+# SKYLAKE-NEXT: [3,0]     . D=eE------R  .   .   addl	$1, %edx
+# SKYLAKE-NEXT: [3,1]     .  DeeeeeeeeER .   .   vpaddd	(%r8), %ymm0, %ymm0
+# SKYLAKE-NEXT: [3,2]     .  DeE-------R .   .   addq	$32, %r8
+# SKYLAKE-NEXT: [3,3]     .  D=eE------R .   .   cmpl	%edi, %edx
+# SKYLAKE-NEXT: [4,0]     .  D=eE------R .   .   addl	$1, %edx
+# SKYLAKE-NEXT: [4,1]     .   DeeeeeeeeER.   .   vpaddd	(%r8), %ymm0, %ymm0
+# SKYLAKE-NEXT: [4,2]     .   DeE-------R.   .   addq	$32, %r8
+# SKYLAKE-NEXT: [4,3]     .   D=eE------R.   .   cmpl	%edi, %edx
+# SKYLAKE-NEXT: [5,0]     .   D=eE------R.   .   addl	$1, %edx
+# SKYLAKE-NEXT: [5,1]     .    DeeeeeeeeER   .   vpaddd	(%r8), %ymm0, %ymm0
+# SKYLAKE-NEXT: [5,2]     .    DeE-------R   .   addq	$32, %r8
+# SKYLAKE-NEXT: [5,3]     .    D=eE------R   .   cmpl	%edi, %edx
+# SKYLAKE-NEXT: [6,0]     .    D=eE------R   .   addl	$1, %edx
+# SKYLAKE-NEXT: [6,1]     .    .DeeeeeeeeER  .   vpaddd	(%r8), %ymm0, %ymm0
+# SKYLAKE-NEXT: [6,2]     .    .DeE-------R  .   addq	$32, %r8
+# SKYLAKE-NEXT: [6,3]     .    .D=eE------R  .   cmpl	%edi, %edx
+# SKYLAKE-NEXT: [7,0]     .    .D=eE------R  .   addl	$1, %edx
+# SKYLAKE-NEXT: [7,1]     .    . DeeeeeeeeER .   vpaddd	(%r8), %ymm0, %ymm0
+# SKYLAKE-NEXT: [7,2]     .    . DeE-------R .   addq	$32, %r8
+# SKYLAKE-NEXT: [7,3]     .    . D=eE------R .   cmpl	%edi, %edx
+# SKYLAKE-NEXT: [8,0]     .    . D=eE------R .   addl	$1, %edx
+# SKYLAKE-NEXT: [8,1]     .    .  DeeeeeeeeER.   vpaddd	(%r8), %ymm0, %ymm0
+# SKYLAKE-NEXT: [8,2]     .    .  DeE-------R.   addq	$32, %r8
+# SKYLAKE-NEXT: [8,3]     .    .  D=eE------R.   cmpl	%edi, %edx
+# SKYLAKE-NEXT: [9,0]     .    .  D=eE------R.   addl	$1, %edx
+# SKYLAKE-NEXT: [9,1]     .    .   DeeeeeeeeER   vpaddd	(%r8), %ymm0, %ymm0
+# SKYLAKE-NEXT: [9,2]     .    .   DeE-------R   addq	$32, %r8
+# SKYLAKE-NEXT: [9,3]     .    .   D=eE------R   cmpl	%edi, %edx
 
 # ZNVER1:       [0,0]     DeER .    .    .   .   addl	$1, %edx
 # ZNVER1-NEXT:  [0,1]     DeeeeeeeeER    .   .   vpaddd	(%r8), %ymm0, %ymm0
@@ -233,21 +236,20 @@ cmp     %edi, %edx
 # ALL:                [0]    [1]    [2]    [3]
 
 # BDWELL-NEXT:  0.     10    1.0    0.4    4.5       addl	$1, %edx
-# BDWELL-NEXT:  1.     10    1.0    0.1    0.0       vpaddd	(%r8), %ymm0, %ymm0
+# HASWELL-NEXT: 0.     10    1.0    0.4    5.4       addl	$1, %edx
+# SKYLAKE-NEXT: 0.     10    1.9    0.1    5.4       addl	$1, %edx
+# ZNVER1-NEXT:  0.     10    1.0    0.1    5.4       addl	$1, %edx
+
+# ALL-NEXT:     1.     10    1.0    0.1    0.0       vpaddd	(%r8), %ymm0, %ymm0
+
 # BDWELL-NEXT:  2.     10    1.0    0.4    5.7       addq	$32, %r8
 # BDWELL-NEXT:  3.     10    1.0    0.0    5.3       cmpl	%edi, %edx
 
-# HASWELL-NEXT: 0.     10    1.0    0.4    5.4       addl	$1, %edx
-# HASWELL-NEXT: 1.     10    1.0    0.1    0.0       vpaddd	(%r8), %ymm0, %ymm0
 # HASWELL-NEXT: 2.     10    1.0    0.4    6.7       addq	$32, %r8
 # HASWELL-NEXT: 3.     10    1.0    0.0    6.3       cmpl	%edi, %edx
 
-# SKYLAKE-NEXT: 0.     10    1.9    0.1    30.6      addl	$1, %edx
-# SKYLAKE-NEXT: 1.     10    32.5   0.1    0.0       vpaddd	(%r8), %ymm0, %ymm0
-# SKYLAKE-NEXT: 2.     10    1.0    0.1    38.5      addq	$32, %r8
-# SKYLAKE-NEXT: 3.     10    2.0    0.0    37.5      cmpl	%edi, %edx
+# SKYLAKE-NEXT: 2.     10    1.0    0.1    7.0       addq	$32, %r8
+# SKYLAKE-NEXT: 3.     10    2.0    0.0    6.0       cmpl	%edi, %edx
 
-# ZNVER1-NEXT:  0.     10    1.0    0.1    5.4       addl	$1, %edx
-# ZNVER1-NEXT:  1.     10    1.0    0.1    0.0       vpaddd	(%r8), %ymm0, %ymm0
 # ZNVER1-NEXT:  2.     10    1.0    0.1    7.0       addq	$32, %r8
 # ZNVER1-NEXT:  3.     10    2.0    0.0    6.0       cmpl	%edi, %edx
-- 
GitLab


From b22a1a5cdb2fd318aa7e2273c23e34949efab70e Mon Sep 17 00:00:00 2001
From: Max Kazantsev <max.kazantsev@azul.com>
Date: Tue, 16 Oct 2018 09:58:09 +0000
Subject: [PATCH 0243/1116] [NFC] Introduce ICFLoopSafetyInfo

This is an alternative implementation of LoopSafetyInfo that uses the implicit
control flow tracking to give precise answers on queries "whether or not this
block contains throwing instructions". This rules out false-positive answers on
LoopSafetyInfo's queries.

This patch only introduces the new implementation. It is not currently used in
any pass. The enabling patches will go separately, through review.

The plan is to completely replace all uses of LoopSafetyInfo with
ICFLoopSafetyInfo in the future, but to avoid introducing functional problems,
we will do it pass by pass.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344601 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/MustExecute.h | 32 +++++++++++++++++++++++++++++
 lib/Analysis/MustExecute.cpp        | 31 ++++++++++++++++++++++++++++
 2 files changed, 63 insertions(+)

diff --git a/include/llvm/Analysis/MustExecute.h b/include/llvm/Analysis/MustExecute.h
index f136ff750de..62d9b056e88 100644
--- a/include/llvm/Analysis/MustExecute.h
+++ b/include/llvm/Analysis/MustExecute.h
@@ -19,6 +19,7 @@
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/InstructionPrecedenceTracking.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Dominators.h"
@@ -122,6 +123,37 @@ public:
   virtual ~SimpleLoopSafetyInfo() {};
 };
 
+/// This implementation of LoopSafetyInfo use ImplicitControlFlowTracking to
+/// give precise answers on "may throw" queries. This implementation uses cache
+/// that should be invalidated by calling the method dropCachedInfo whenever we
+/// modify a basic block's contents by adding or removing instructions.
+class ICFLoopSafetyInfo: public LoopSafetyInfo {
+  bool MayThrow = false;       // The current loop contains an instruction which
+                               // may throw.
+  // Contains information about implicit control flow in this loop's blocks.
+  mutable ImplicitControlFlowTracking ICF;
+
+public:
+  virtual bool blockMayThrow(const BasicBlock *BB) const;
+
+  virtual bool anyBlockMayThrow() const;
+
+  virtual void computeLoopSafetyInfo(const Loop *CurLoop);
+
+  virtual bool isGuaranteedToExecute(const Instruction &Inst,
+                                     const DominatorTree *DT,
+                                     const Loop *CurLoop) const;
+
+  /// Drops cached information regarding the implicit control flow in block
+  /// \p BB. It should be called for every block in which we add or remove any
+  /// instructions  to a block before we make queries to it.
+  void dropCachedInfo(const BasicBlock *BB);
+
+  ICFLoopSafetyInfo(DominatorTree *DT) : LoopSafetyInfo(), ICF(DT) {};
+
+  virtual ~ICFLoopSafetyInfo() {};
+};
+
 }
 
 #endif
diff --git a/lib/Analysis/MustExecute.cpp b/lib/Analysis/MustExecute.cpp
index 4e42f336dc7..64ee2a7e5b0 100644
--- a/lib/Analysis/MustExecute.cpp
+++ b/lib/Analysis/MustExecute.cpp
@@ -61,6 +61,31 @@ void SimpleLoopSafetyInfo::computeLoopSafetyInfo(const Loop *CurLoop) {
   computeBlockColors(CurLoop);
 }
 
+bool ICFLoopSafetyInfo::blockMayThrow(const BasicBlock *BB) const {
+  return ICF.hasICF(BB);
+}
+
+bool ICFLoopSafetyInfo::anyBlockMayThrow() const {
+  return MayThrow;
+}
+
+void ICFLoopSafetyInfo::computeLoopSafetyInfo(const Loop *CurLoop) {
+  assert(CurLoop != nullptr && "CurLoop can't be null");
+  ICF.clear();
+  MayThrow = false;
+  // Figure out the fact that at least one block may throw.
+  for (auto &BB : CurLoop->blocks())
+    if (ICF.hasICF(&*BB)) {
+      MayThrow = true;
+      break;
+    }
+  computeBlockColors(CurLoop);
+}
+
+void ICFLoopSafetyInfo::dropCachedInfo(const BasicBlock *BB) {
+  ICF.invalidateBlock(BB);
+}
+
 void LoopSafetyInfo::computeBlockColors(const Loop *CurLoop) {
   // Compute funclet colors if we might sink/hoist in a function with a funclet
   // personality routine.
@@ -215,6 +240,12 @@ bool SimpleLoopSafetyInfo::isGuaranteedToExecute(const Instruction &Inst,
   return allLoopPathsLeadToBlock(CurLoop, Inst.getParent(), DT);
 }
 
+bool ICFLoopSafetyInfo::isGuaranteedToExecute(const Instruction &Inst,
+                                              const DominatorTree *DT,
+                                              const Loop *CurLoop) const {
+  return !ICF.isDominatedByICFIFromSameBlock(&Inst) &&
+         allLoopPathsLeadToBlock(CurLoop, Inst.getParent(), DT);
+}
 
 namespace {
   struct MustExecutePrinter : public FunctionPass {
-- 
GitLab


From 09fdf061bbfb5187bfd678fd2c215217013798fe Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 16 Oct 2018 10:06:15 +0000
Subject: [PATCH 0244/1116] [LegalizeDAG] ExpandLegalINT_TO_FP - cleanup
 UINT_TO_FP i64 -> f64 expansion.

Use SrcVT/DestVT types, correct shift type and AND instead of ZERO_EXTEND_IN_REG.

Part of prep work for D52965

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344602 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 37 +++++++++++-------------
 1 file changed, 17 insertions(+), 20 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 175df889ef2..07a37a5092a 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2310,6 +2310,7 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned, SDValue Op0,
                                                    EVT DestVT,
                                                    const SDLoc &dl) {
   EVT SrcVT = Op0.getValueType();
+  EVT ShiftVT = TLI.getShiftAmountTy(SrcVT, DAG.getDataLayout());
 
   // TODO: Should any fast-math-flags be set for the created nodes?
   LLVM_DEBUG(dbgs() << "Legalizing INT_TO_FP\n");
@@ -2371,24 +2372,21 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned, SDValue Op0,
   // TODO: Generalize this for use with other types.
   if (SrcVT == MVT::i64 && DestVT == MVT::f64) {
     LLVM_DEBUG(dbgs() << "Converting unsigned i64 to f64\n");
-    SDValue TwoP52 =
-      DAG.getConstant(UINT64_C(0x4330000000000000), dl, MVT::i64);
-    SDValue TwoP84PlusTwoP52 =
-      DAG.getConstantFP(BitsToDouble(UINT64_C(0x4530000000100000)), dl,
-                        MVT::f64);
-    SDValue TwoP84 =
-      DAG.getConstant(UINT64_C(0x4530000000000000), dl, MVT::i64);
-
-    SDValue Lo = DAG.getZeroExtendInReg(Op0, dl, MVT::i32);
-    SDValue Hi = DAG.getNode(ISD::SRL, dl, MVT::i64, Op0,
-                             DAG.getConstant(32, dl, MVT::i64));
-    SDValue LoOr = DAG.getNode(ISD::OR, dl, MVT::i64, Lo, TwoP52);
-    SDValue HiOr = DAG.getNode(ISD::OR, dl, MVT::i64, Hi, TwoP84);
-    SDValue LoFlt = DAG.getNode(ISD::BITCAST, dl, MVT::f64, LoOr);
-    SDValue HiFlt = DAG.getNode(ISD::BITCAST, dl, MVT::f64, HiOr);
-    SDValue HiSub = DAG.getNode(ISD::FSUB, dl, MVT::f64, HiFlt,
-                                TwoP84PlusTwoP52);
-    return DAG.getNode(ISD::FADD, dl, MVT::f64, LoFlt, HiSub);
+    SDValue TwoP52 = DAG.getConstant(UINT64_C(0x4330000000000000), dl, SrcVT);
+    SDValue TwoP84PlusTwoP52 = DAG.getConstantFP(
+        BitsToDouble(UINT64_C(0x4530000000100000)), dl, DestVT);
+    SDValue TwoP84 = DAG.getConstant(UINT64_C(0x4530000000000000), dl, SrcVT);
+    SDValue LoMask = DAG.getConstant(UINT64_C(0x00000000FFFFFFFF), dl, SrcVT);
+    SDValue HiShift = DAG.getConstant(32, dl, ShiftVT);
+
+    SDValue Lo = DAG.getNode(ISD::AND, dl, SrcVT, Op0, LoMask);
+    SDValue Hi = DAG.getNode(ISD::SRL, dl, SrcVT, Op0, HiShift);
+    SDValue LoOr = DAG.getNode(ISD::OR, dl, SrcVT, Lo, TwoP52);
+    SDValue HiOr = DAG.getNode(ISD::OR, dl, SrcVT, Hi, TwoP84);
+    SDValue LoFlt = DAG.getNode(ISD::BITCAST, dl, DestVT, LoOr);
+    SDValue HiFlt = DAG.getNode(ISD::BITCAST, dl, DestVT, HiOr);
+    SDValue HiSub = DAG.getNode(ISD::FSUB, dl, DestVT, HiFlt, TwoP84PlusTwoP52);
+    return DAG.getNode(ISD::FADD, dl, DestVT, LoFlt, HiSub);
   }
 
   // TODO: Generalize this for use with other types.
@@ -2399,8 +2397,7 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned, SDValue Op0,
     if (!isSigned) {
       SDValue Fast = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Op0);
 
-      SDValue ShiftConst = DAG.getConstant(
-          1, dl, TLI.getShiftAmountTy(SrcVT, DAG.getDataLayout()));
+      SDValue ShiftConst = DAG.getConstant(1, dl, ShiftVT);
       SDValue Shr = DAG.getNode(ISD::SRL, dl, MVT::i64, Op0, ShiftConst);
       SDValue AndConst = DAG.getConstant(1, dl, MVT::i64);
       SDValue And = DAG.getNode(ISD::AND, dl, MVT::i64, Op0, AndConst);
-- 
GitLab


From dacda52aca98da2ae2f3a4160f7197abb97093a6 Mon Sep 17 00:00:00 2001
From: Ayal Zaks <ayal.zaks@intel.com>
Date: Tue, 16 Oct 2018 14:25:02 +0000
Subject: [PATCH 0245/1116] [LV] Add test checks when vectorizing loops under
 opt for size; NFC

Landing this as a separate part of https://reviews.llvm.org/D50480, recording
current behavior more accurately, to clarify subsequent diff ([LV] Vectorizing
loops of arbitrary trip count without remainder under opt for size).


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344606 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Transforms/LoopVectorize/X86/optsize.ll  |  57 +++++++++
 .../LoopVectorize/X86/small-size.ll           | 107 +++++++++++++++--
 .../X86/vect.omp.force.small-tc.ll            | 108 ++++++++++++++++--
 3 files changed, 253 insertions(+), 19 deletions(-)
 create mode 100644 test/Transforms/LoopVectorize/X86/optsize.ll

diff --git a/test/Transforms/LoopVectorize/X86/optsize.ll b/test/Transforms/LoopVectorize/X86/optsize.ll
new file mode 100644
index 00000000000..057c72044d9
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/optsize.ll
@@ -0,0 +1,57 @@
+; This test verifies that the loop vectorizer will NOT vectorize loops that
+; will produce a tail loop with the optimize for size or the minimize size
+; attributes. This is a target-dependent version of the test.
+; RUN: opt < %s -loop-vectorize -force-vector-width=64 -S -mtriple=x86_64-unknown-linux -mcpu=skx | FileCheck %s
+
+target datalayout = "E-m:e-p:32:32-i64:32-f64:32:64-a:0:32-n32-S128"
+
+@tab = common global [32 x i8] zeroinitializer, align 1
+
+define i32 @foo_optsize() #0 {
+; CHECK-LABEL: @foo_optsize(
+; CHECK-NOT: x i8>
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+  %0 = load i8, i8* %arrayidx, align 1
+  %cmp1 = icmp eq i8 %0, 0
+  %. = select i1 %cmp1, i8 2, i8 1
+  store i8 %., i8* %arrayidx, align 1
+  %inc = add nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %i.08, 202
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret i32 0
+}
+
+attributes #0 = { optsize }
+
+define i32 @foo_minsize() #1 {
+; CHECK-LABEL: @foo_minsize(
+; CHECK-NOT: x i8>
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+  %0 = load i8, i8* %arrayidx, align 1
+  %cmp1 = icmp eq i8 %0, 0
+  %. = select i1 %cmp1, i8 2, i8 1
+  store i8 %., i8* %arrayidx, align 1
+  %inc = add nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %i.08, 202
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret i32 0
+}
+
+attributes #1 = { minsize }
+
diff --git a/test/Transforms/LoopVectorize/X86/small-size.ll b/test/Transforms/LoopVectorize/X86/small-size.ll
index 89d69e232f5..8af7b2e7df9 100644
--- a/test/Transforms/LoopVectorize/X86/small-size.ll
+++ b/test/Transforms/LoopVectorize/X86/small-size.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -basicaa -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -loop-vectorize-with-block-frequency -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@@ -20,12 +21,33 @@ target triple = "x86_64-apple-macosx10.8.0"
 @dj = common global [1024 x i32] zeroinitializer, align 16
 
 ; We can optimize this test without a tail.
-;CHECK-LABEL: @example1(
-;CHECK: load <4 x i32>
-;CHECK: add nsw <4 x i32>
-;CHECK: store <4 x i32>
-;CHECK: ret void
 define void @example1() optsize {
+; CHECK-LABEL: @example1(
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 16
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 16
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[TMP10:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    br label [[TMP9:%.*]]
+; CHECK:         br i1 undef, label [[TMP10]], label [[TMP9]], !llvm.loop !2
+; CHECK:         ret void
+;
   br label %1
 
 ; <label>:1                                       ; preds = %1, %0
@@ -142,10 +164,31 @@ define void @example23(i16* nocapture %src, i32* nocapture %dst) optsize {
 
 
 ; We CAN vectorize this example because the pointers are marked as noalias.
-;CHECK-LABEL: @example23b(
-;CHECK: <4 x i32>
-;CHECK: ret void
 define void @example23b(i16* noalias nocapture %src, i32* noalias nocapture %dst) optsize {
+; CHECK-LABEL: @example23b(
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i16, i16* [[SRC:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[NEXT_GEP4:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[NEXT_GEP]] to <4 x i16>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 2
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i16> [[WIDE_LOAD]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw nsw <4 x i32> [[TMP2]], <i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[NEXT_GEP4]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[TMP7:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    br label [[TMP6:%.*]]
+; CHECK:         br i1 undef, label [[TMP7]], label [[TMP6]], !llvm.loop !5
+; CHECK:         ret void
+;
   br label %1
 
 ; <label>:1                                       ; preds = %1, %0
@@ -166,4 +209,52 @@ define void @example23b(i16* noalias nocapture %src, i32* noalias nocapture %dst
   ret void
 }
 
+; We CAN'T vectorize this example because it would entail a tail.
+define void @example23c(i16* noalias nocapture %src, i32* noalias nocapture %dst) optsize {
+; CHECK-LABEL: @example23c(
+; CHECK-NOT: <4 x
+; CHECK: ret void
+  br label %1
 
+; <label>:1                                       ; preds = %1, %0
+  %.04 = phi i16* [ %src, %0 ], [ %2, %1 ]
+  %.013 = phi i32* [ %dst, %0 ], [ %6, %1 ]
+  %i.02 = phi i64 [ 0, %0 ], [ %7, %1 ]
+  %2 = getelementptr inbounds i16, i16* %.04, i64 1
+  %3 = load i16, i16* %.04, align 2
+  %4 = zext i16 %3 to i32
+  %5 = shl nuw nsw i32 %4, 7
+  %6 = getelementptr inbounds i32, i32* %.013, i64 1
+  store i32 %5, i32* %.013, align 4
+  %7 = add nsw i64 %i.02, 1
+  %exitcond = icmp eq i64 %7, 257
+  br i1 %exitcond, label %8, label %1
+
+; <label>:8                                       ; preds = %1
+  ret void
+}
+
+; We CAN'T vectorize this example because it would entail a tail.
+define i64 @example23d(i16* noalias nocapture %src, i32* noalias nocapture %dst) optsize {
+;CHECK-LABEL: @example23d(
+; CHECK-NOT: <4 x
+; CHECK: ret i64
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %.04 = phi i16* [ %src, %0 ], [ %2, %1 ]
+  %.013 = phi i32* [ %dst, %0 ], [ %6, %1 ]
+  %i.02 = phi i64 [ 0, %0 ], [ %7, %1 ]
+  %2 = getelementptr inbounds i16, i16* %.04, i64 1
+  %3 = load i16, i16* %.04, align 2
+  %4 = zext i16 %3 to i32
+  %5 = shl nuw nsw i32 %4, 7
+  %6 = getelementptr inbounds i32, i32* %.013, i64 1
+  store i32 %5, i32* %.013, align 4
+  %7 = add nsw i64 %i.02, 1
+  %exitcond = icmp eq i64 %7, 257
+  br i1 %exitcond, label %8, label %1
+
+; <label>:8                                       ; preds = %1
+  ret i64 %7
+}
diff --git a/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll b/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
index 46fd022af66..2db08b0363a 100644
--- a/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
+++ b/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
@@ -1,16 +1,8 @@
-; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -debug-only=loop-vectorize -stats -S -vectorizer-min-trip-count=21 2>&1 | FileCheck %s
-; REQUIRES: asserts
-
-; CHECK: LV: Loop hints: force=enabled
-; CHECK: LV: Loop hints: force=?
-; CHECK: LV: Loop hints: force=?
-; No more loops in the module
-; CHECK-NOT: LV: Loop hints: force=
-; CHECK: 3 loop-vectorize               - Number of loops analyzed for vectorization
-; CHECK: 2 loop-vectorize               - Number of loops vectorized
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -loop-vectorize -mcpu=corei7-avx -S -vectorizer-min-trip-count=21 | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-apple-macosx10.8.0"
+target triple = "x86_64-unknown-linux"
 
 ;
 ; The source code for the test:
@@ -25,6 +17,51 @@ target triple = "x86_64-apple-macosx10.8.0"
 ; This loop will be vectorized, although the trip count is below the threshold, but vectorization is explicitly forced in metadata.
 ;
 define void @vectorized(float* noalias nocapture %A, float* noalias nocapture readonly %B) {
+; CHECK-LABEL: @vectorized(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
+; CHECK-NEXT:    store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 20, 16
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load float, float* [[ARRAYIDX]], align 4, !llvm.mem.parallel_loop_access !3
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load float, float* [[ARRAYIDX2]], align 4, !llvm.mem.parallel_loop_access !3
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.mem.parallel_loop_access !3
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !4
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.body
 
@@ -51,6 +88,10 @@ for.end:
 ; This loop will not be vectorized as the trip count is below the threshold.
 ;
 define void @not_vectorized(float* noalias nocapture %A, float* noalias nocapture readonly %B) {
+; CHECK-LABEL: @not_vectorized(
+; CHECK-NOT:   x float>
+; CHECK:       for.end:
+;
 entry:
   br label %for.body
 
@@ -77,6 +118,51 @@ for.end:
 ; scalar iterations are needed.
 ;
 define void @vectorized2(float* noalias nocapture %A, float* noalias nocapture readonly %B) {
+; CHECK-LABEL: @vectorized2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
+; CHECK-NEXT:    store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !7
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 16, 16
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load float, float* [[ARRAYIDX]], align 4, !llvm.mem.parallel_loop_access !6
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load float, float* [[ARRAYIDX2]], align 4, !llvm.mem.parallel_loop_access !6
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.mem.parallel_loop_access !6
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 16
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !8
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.body
 
-- 
GitLab


From 42fced4e53b5559bdce12749117a2896cda65d5a Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 16 Oct 2018 14:35:21 +0000
Subject: [PATCH 0246/1116] [InstCombine] try harder to form select from logic
 ops

This is part of solving PR37549:
https://bugs.llvm.org/show_bug.cgi?id=37549

The patterns shown here are a special case of something
that we already convert to select. Using ComputeNumSignBits()
catches that case (but not the more complicated motivating
patterns yet).

The backend has hooks/logic to convert back to logic ops
if that's better for the target.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344609 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstCombine/InstCombineAndOrXor.cpp       | 66 +++++++++++--------
 .../InstCombine/InstCombineInternal.h         |  3 +
 test/Transforms/InstCombine/logical-select.ll | 18 ++---
 test/Transforms/InstCombine/vec_sext.ll       | 18 ++---
 4 files changed, 52 insertions(+), 53 deletions(-)

diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index a6280ec95a9..5ffbf83508c 100644
--- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -1831,14 +1831,28 @@ static bool areInverseVectorBitmasks(Constant *C1, Constant *C2) {
 /// We have an expression of the form (A & C) | (B & D). If A is a scalar or
 /// vector composed of all-zeros or all-ones values and is the bitwise 'not' of
 /// B, it can be used as the condition operand of a select instruction.
-static Value *getSelectCondition(Value *A, Value *B,
-                                 InstCombiner::BuilderTy &Builder) {
-  // If these are scalars or vectors of i1, A can be used directly.
+Value *InstCombiner::getSelectCondition(Value *A, Value *B) {
+  // Step 1: We need 0 or all-1's bitmasks.
   Type *Ty = A->getType();
-  if (match(A, m_Not(m_Specific(B))) && Ty->isIntOrIntVectorTy(1))
-    return A;
+  if (ComputeNumSignBits(A) != Ty->getScalarSizeInBits())
+    return nullptr;
+
+  // Step 2: If B is the 'not' value of A, we have our answer.
+  if (match(A, m_Not(m_Specific(B)))) {
+    // If these are scalars or vectors of i1, A can be used directly.
+    if (Ty->isIntOrIntVectorTy(1))
+      return A;
+    return Builder.CreateTrunc(A, CmpInst::makeCmpResultType(Ty));
+  }
+
+  // If both operands are constants, see if the constants are inverse bitmasks.
+  Constant *AConst, *BConst;
+  if (match(A, m_Constant(AConst)) && match(B, m_Constant(BConst)))
+    if (AConst == ConstantExpr::getNot(BConst))
+      return Builder.CreateZExtOrTrunc(A, CmpInst::makeCmpResultType(Ty));
 
-  // If A and B are sign-extended, look through the sexts to find the booleans.
+  // Look for more complex patterns. The 'not' op may be hidden behind various
+  // casts. Look through sexts and bitcasts to find the booleans.
   Value *Cond;
   Value *NotB;
   if (match(A, m_SExt(m_Value(Cond))) &&
@@ -1854,36 +1868,30 @@ static Value *getSelectCondition(Value *A, Value *B,
   if (!Ty->isVectorTy())
     return nullptr;
 
-  // If both operands are constants, see if the constants are inverse bitmasks.
-  Constant *AC, *BC;
-  if (match(A, m_Constant(AC)) && match(B, m_Constant(BC)) &&
-      areInverseVectorBitmasks(AC, BC)) {
-    return Builder.CreateZExtOrTrunc(AC, CmpInst::makeCmpResultType(Ty));
-  }
-
   // If both operands are xor'd with constants using the same sexted boolean
   // operand, see if the constants are inverse bitmasks.
-  if (match(A, (m_Xor(m_SExt(m_Value(Cond)), m_Constant(AC)))) &&
-      match(B, (m_Xor(m_SExt(m_Specific(Cond)), m_Constant(BC)))) &&
+  // TODO: Use ConstantExpr::getNot()?
+  if (match(A, (m_Xor(m_SExt(m_Value(Cond)), m_Constant(AConst)))) &&
+      match(B, (m_Xor(m_SExt(m_Specific(Cond)), m_Constant(BConst)))) &&
       Cond->getType()->isIntOrIntVectorTy(1) &&
-      areInverseVectorBitmasks(AC, BC)) {
-    AC = ConstantExpr::getTrunc(AC, CmpInst::makeCmpResultType(Ty));
-    return Builder.CreateXor(Cond, AC);
+      areInverseVectorBitmasks(AConst, BConst)) {
+    AConst = ConstantExpr::getTrunc(AConst, CmpInst::makeCmpResultType(Ty));
+    return Builder.CreateXor(Cond, AConst);
   }
   return nullptr;
 }
 
 /// We have an expression of the form (A & C) | (B & D). Try to simplify this
 /// to "A' ? C : D", where A' is a boolean or vector of booleans.
-static Value *matchSelectFromAndOr(Value *A, Value *C, Value *B, Value *D,
-                                   InstCombiner::BuilderTy &Builder) {
+Value *InstCombiner::matchSelectFromAndOr(Value *A, Value *C, Value *B,
+                                         Value *D) {
   // The potential condition of the select may be bitcasted. In that case, look
   // through its bitcast and the corresponding bitcast of the 'not' condition.
   Type *OrigType = A->getType();
   A = peekThroughBitcast(A, true);
   B = peekThroughBitcast(B, true);
 
-  if (Value *Cond = getSelectCondition(A, B, Builder)) {
+  if (Value *Cond = getSelectCondition(A, B)) {
     // ((bc Cond) & C) | ((bc ~Cond) & D) --> bc (select Cond, (bc C), (bc D))
     // The bitcasts will either all exist or all not exist. The builder will
     // not create unnecessary casts if the types already match.
@@ -2234,21 +2242,21 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
     // 'or' that it is replacing.
     if (Op0->hasOneUse() || Op1->hasOneUse()) {
       // (Cond & C) | (~Cond & D) -> Cond ? C : D, and commuted variants.
-      if (Value *V = matchSelectFromAndOr(A, C, B, D, Builder))
+      if (Value *V = matchSelectFromAndOr(A, C, B, D))
         return replaceInstUsesWith(I, V);
-      if (Value *V = matchSelectFromAndOr(A, C, D, B, Builder))
+      if (Value *V = matchSelectFromAndOr(A, C, D, B))
         return replaceInstUsesWith(I, V);
-      if (Value *V = matchSelectFromAndOr(C, A, B, D, Builder))
+      if (Value *V = matchSelectFromAndOr(C, A, B, D))
         return replaceInstUsesWith(I, V);
-      if (Value *V = matchSelectFromAndOr(C, A, D, B, Builder))
+      if (Value *V = matchSelectFromAndOr(C, A, D, B))
         return replaceInstUsesWith(I, V);
-      if (Value *V = matchSelectFromAndOr(B, D, A, C, Builder))
+      if (Value *V = matchSelectFromAndOr(B, D, A, C))
         return replaceInstUsesWith(I, V);
-      if (Value *V = matchSelectFromAndOr(B, D, C, A, Builder))
+      if (Value *V = matchSelectFromAndOr(B, D, C, A))
         return replaceInstUsesWith(I, V);
-      if (Value *V = matchSelectFromAndOr(D, B, A, C, Builder))
+      if (Value *V = matchSelectFromAndOr(D, B, A, C))
         return replaceInstUsesWith(I, V);
-      if (Value *V = matchSelectFromAndOr(D, B, C, A, Builder))
+      if (Value *V = matchSelectFromAndOr(D, B, C, A))
         return replaceInstUsesWith(I, V);
     }
   }
diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h
index 3a18744e434..128365fc22e 100644
--- a/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -589,6 +589,9 @@ private:
 
   Value *foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS, ICmpInst *RHS,
                                        bool JoinedByAnd, Instruction &CxtI);
+  Value *matchSelectFromAndOr(Value *A, Value *B, Value *C, Value *D);
+  Value *getSelectCondition(Value *A, Value *B);
+
 public:
   /// Inserts an instruction \p New before instruction \p Old
   ///
diff --git a/test/Transforms/InstCombine/logical-select.ll b/test/Transforms/InstCombine/logical-select.ll
index 3ee0ba169b3..e817bb9c19d 100644
--- a/test/Transforms/InstCombine/logical-select.ll
+++ b/test/Transforms/InstCombine/logical-select.ll
@@ -535,12 +535,9 @@ define <4 x i32> @vec_sel_xor_multi_use(<4 x i32> %a, <4 x i32> %b, <4 x i1> %c)
 
 define i32 @allSignBits(i32 %cond, i32 %tval, i32 %fval) {
 ; CHECK-LABEL: @allSignBits(
-; CHECK-NEXT:    [[BITMASK:%.*]] = ashr i32 [[COND:%.*]], 31
-; CHECK-NEXT:    [[NOT_BITMASK:%.*]] = xor i32 [[BITMASK]], -1
-; CHECK-NEXT:    [[A1:%.*]] = and i32 [[BITMASK]], [[TVAL:%.*]]
-; CHECK-NEXT:    [[A2:%.*]] = and i32 [[NOT_BITMASK]], [[FVAL:%.*]]
-; CHECK-NEXT:    [[SEL:%.*]] = or i32 [[A1]], [[A2]]
-; CHECK-NEXT:    ret i32 [[SEL]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp slt i32 [[COND:%.*]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 [[TVAL:%.*]], i32 [[FVAL:%.*]]
+; CHECK-NEXT:    ret i32 [[TMP2]]
 ;
   %bitmask = ashr i32 %cond, 31
   %not_bitmask = xor i32 %bitmask, -1
@@ -552,12 +549,9 @@ define i32 @allSignBits(i32 %cond, i32 %tval, i32 %fval) {
 
 define <4 x i8> @allSignBits_vec(<4 x i8> %cond, <4 x i8> %tval, <4 x i8> %fval) {
 ; CHECK-LABEL: @allSignBits_vec(
-; CHECK-NEXT:    [[BITMASK:%.*]] = ashr <4 x i8> [[COND:%.*]], <i8 7, i8 7, i8 7, i8 7>
-; CHECK-NEXT:    [[NOT_BITMASK:%.*]] = xor <4 x i8> [[BITMASK]], <i8 -1, i8 -1, i8 -1, i8 -1>
-; CHECK-NEXT:    [[A1:%.*]] = and <4 x i8> [[BITMASK]], [[TVAL:%.*]]
-; CHECK-NEXT:    [[A2:%.*]] = and <4 x i8> [[NOT_BITMASK]], [[FVAL:%.*]]
-; CHECK-NEXT:    [[SEL:%.*]] = or <4 x i8> [[A2]], [[A1]]
-; CHECK-NEXT:    ret <4 x i8> [[SEL]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt <4 x i8> [[COND:%.*]], <i8 -1, i8 -1, i8 -1, i8 -1>
+; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i8> [[FVAL:%.*]], <4 x i8> [[TVAL:%.*]]
+; CHECK-NEXT:    ret <4 x i8> [[TMP2]]
 ;
   %bitmask = ashr <4 x i8> %cond, <i8 7, i8 7, i8 7, i8 7>
   %not_bitmask = xor <4 x i8> %bitmask, <i8 -1, i8 -1, i8 -1, i8 -1>
diff --git a/test/Transforms/InstCombine/vec_sext.ll b/test/Transforms/InstCombine/vec_sext.ll
index f244d49527b..39bd4087416 100644
--- a/test/Transforms/InstCombine/vec_sext.ll
+++ b/test/Transforms/InstCombine/vec_sext.ll
@@ -4,12 +4,9 @@
 define <4 x i32> @vec_select(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: @vec_select(
 ; CHECK-NEXT:    [[SUB:%.*]] = sub nsw <4 x i32> zeroinitializer, [[A:%.*]]
-; CHECK-NEXT:    [[B_LOBIT1:%.*]] = ashr <4 x i32> [[B:%.*]], <i32 31, i32 31, i32 31, i32 31>
-; CHECK-NEXT:    [[T1:%.*]] = xor <4 x i32> [[B_LOBIT1]], <i32 -1, i32 -1, i32 -1, i32 -1>
-; CHECK-NEXT:    [[T2:%.*]] = and <4 x i32> [[T1]], [[A]]
-; CHECK-NEXT:    [[T3:%.*]] = and <4 x i32> [[B_LOBIT1]], [[SUB]]
-; CHECK-NEXT:    [[COND:%.*]] = or <4 x i32> [[T2]], [[T3]]
-; CHECK-NEXT:    ret <4 x i32> [[COND]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt <4 x i32> [[B:%.*]], <i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[A]], <4 x i32> [[SUB]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 ;
   %cmp = icmp slt <4 x i32> %b, zeroinitializer
   %sext = sext <4 x i1> %cmp to <4 x i32>
@@ -26,12 +23,9 @@ define <4 x i32> @vec_select(<4 x i32> %a, <4 x i32> %b) {
 define <4 x i32> @vec_select_alternate_sign_bit_test(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: @vec_select_alternate_sign_bit_test(
 ; CHECK-NEXT:    [[SUB:%.*]] = sub nsw <4 x i32> zeroinitializer, [[A:%.*]]
-; CHECK-NEXT:    [[B_LOBIT1:%.*]] = ashr <4 x i32> [[B:%.*]], <i32 31, i32 31, i32 31, i32 31>
-; CHECK-NEXT:    [[B_LOBIT1_NOT:%.*]] = xor <4 x i32> [[B_LOBIT1]], <i32 -1, i32 -1, i32 -1, i32 -1>
-; CHECK-NEXT:    [[T2:%.*]] = and <4 x i32> [[B_LOBIT1]], [[A]]
-; CHECK-NEXT:    [[T3:%.*]] = and <4 x i32> [[B_LOBIT1_NOT]], [[SUB]]
-; CHECK-NEXT:    [[COND:%.*]] = or <4 x i32> [[T2]], [[T3]]
-; CHECK-NEXT:    ret <4 x i32> [[COND]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt <4 x i32> [[B:%.*]], <i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[SUB]], <4 x i32> [[A]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 ;
   %cmp = icmp sgt <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
   %sext = sext <4 x i1> %cmp to <4 x i32>
-- 
GitLab


From 0f096ec04e276fa55a493238b33b1ea10982f3d5 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 16 Oct 2018 14:44:50 +0000
Subject: [PATCH 0247/1116] [InstCombine] make sure type is integer before
 calling ComputeNumSignBits

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344610 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/InstCombine/InstCombineAndOrXor.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 5ffbf83508c..ecda713a6bc 100644
--- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -1834,7 +1834,8 @@ static bool areInverseVectorBitmasks(Constant *C1, Constant *C2) {
 Value *InstCombiner::getSelectCondition(Value *A, Value *B) {
   // Step 1: We need 0 or all-1's bitmasks.
   Type *Ty = A->getType();
-  if (ComputeNumSignBits(A) != Ty->getScalarSizeInBits())
+  if (Ty->isIntOrIntVectorTy() &&
+      ComputeNumSignBits(A) != Ty->getScalarSizeInBits())
     return nullptr;
 
   // Step 2: If B is the 'not' value of A, we have our answer.
-- 
GitLab


From 5973d705524e5432b05634a9aab38ecd2360ff13 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 16 Oct 2018 15:26:08 +0000
Subject: [PATCH 0248/1116] revert rL344609: [InstCombine] try harder to form
 select from logic ops

I noticed a missing check and added it at rL344610, but there actually
are codegen tests that will fail without that, so I'll edit those and
submit a fixed patch with more tests.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344612 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstCombine/InstCombineAndOrXor.cpp       | 67 ++++++++-----------
 .../InstCombine/InstCombineInternal.h         |  3 -
 test/Transforms/InstCombine/logical-select.ll | 18 +++--
 test/Transforms/InstCombine/vec_sext.ll       | 18 +++--
 4 files changed, 53 insertions(+), 53 deletions(-)

diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index ecda713a6bc..a6280ec95a9 100644
--- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -1831,29 +1831,14 @@ static bool areInverseVectorBitmasks(Constant *C1, Constant *C2) {
 /// We have an expression of the form (A & C) | (B & D). If A is a scalar or
 /// vector composed of all-zeros or all-ones values and is the bitwise 'not' of
 /// B, it can be used as the condition operand of a select instruction.
-Value *InstCombiner::getSelectCondition(Value *A, Value *B) {
-  // Step 1: We need 0 or all-1's bitmasks.
+static Value *getSelectCondition(Value *A, Value *B,
+                                 InstCombiner::BuilderTy &Builder) {
+  // If these are scalars or vectors of i1, A can be used directly.
   Type *Ty = A->getType();
-  if (Ty->isIntOrIntVectorTy() &&
-      ComputeNumSignBits(A) != Ty->getScalarSizeInBits())
-    return nullptr;
-
-  // Step 2: If B is the 'not' value of A, we have our answer.
-  if (match(A, m_Not(m_Specific(B)))) {
-    // If these are scalars or vectors of i1, A can be used directly.
-    if (Ty->isIntOrIntVectorTy(1))
-      return A;
-    return Builder.CreateTrunc(A, CmpInst::makeCmpResultType(Ty));
-  }
-
-  // If both operands are constants, see if the constants are inverse bitmasks.
-  Constant *AConst, *BConst;
-  if (match(A, m_Constant(AConst)) && match(B, m_Constant(BConst)))
-    if (AConst == ConstantExpr::getNot(BConst))
-      return Builder.CreateZExtOrTrunc(A, CmpInst::makeCmpResultType(Ty));
+  if (match(A, m_Not(m_Specific(B))) && Ty->isIntOrIntVectorTy(1))
+    return A;
 
-  // Look for more complex patterns. The 'not' op may be hidden behind various
-  // casts. Look through sexts and bitcasts to find the booleans.
+  // If A and B are sign-extended, look through the sexts to find the booleans.
   Value *Cond;
   Value *NotB;
   if (match(A, m_SExt(m_Value(Cond))) &&
@@ -1869,30 +1854,36 @@ Value *InstCombiner::getSelectCondition(Value *A, Value *B) {
   if (!Ty->isVectorTy())
     return nullptr;
 
+  // If both operands are constants, see if the constants are inverse bitmasks.
+  Constant *AC, *BC;
+  if (match(A, m_Constant(AC)) && match(B, m_Constant(BC)) &&
+      areInverseVectorBitmasks(AC, BC)) {
+    return Builder.CreateZExtOrTrunc(AC, CmpInst::makeCmpResultType(Ty));
+  }
+
   // If both operands are xor'd with constants using the same sexted boolean
   // operand, see if the constants are inverse bitmasks.
-  // TODO: Use ConstantExpr::getNot()?
-  if (match(A, (m_Xor(m_SExt(m_Value(Cond)), m_Constant(AConst)))) &&
-      match(B, (m_Xor(m_SExt(m_Specific(Cond)), m_Constant(BConst)))) &&
+  if (match(A, (m_Xor(m_SExt(m_Value(Cond)), m_Constant(AC)))) &&
+      match(B, (m_Xor(m_SExt(m_Specific(Cond)), m_Constant(BC)))) &&
       Cond->getType()->isIntOrIntVectorTy(1) &&
-      areInverseVectorBitmasks(AConst, BConst)) {
-    AConst = ConstantExpr::getTrunc(AConst, CmpInst::makeCmpResultType(Ty));
-    return Builder.CreateXor(Cond, AConst);
+      areInverseVectorBitmasks(AC, BC)) {
+    AC = ConstantExpr::getTrunc(AC, CmpInst::makeCmpResultType(Ty));
+    return Builder.CreateXor(Cond, AC);
   }
   return nullptr;
 }
 
 /// We have an expression of the form (A & C) | (B & D). Try to simplify this
 /// to "A' ? C : D", where A' is a boolean or vector of booleans.
-Value *InstCombiner::matchSelectFromAndOr(Value *A, Value *C, Value *B,
-                                         Value *D) {
+static Value *matchSelectFromAndOr(Value *A, Value *C, Value *B, Value *D,
+                                   InstCombiner::BuilderTy &Builder) {
   // The potential condition of the select may be bitcasted. In that case, look
   // through its bitcast and the corresponding bitcast of the 'not' condition.
   Type *OrigType = A->getType();
   A = peekThroughBitcast(A, true);
   B = peekThroughBitcast(B, true);
 
-  if (Value *Cond = getSelectCondition(A, B)) {
+  if (Value *Cond = getSelectCondition(A, B, Builder)) {
     // ((bc Cond) & C) | ((bc ~Cond) & D) --> bc (select Cond, (bc C), (bc D))
     // The bitcasts will either all exist or all not exist. The builder will
     // not create unnecessary casts if the types already match.
@@ -2243,21 +2234,21 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
     // 'or' that it is replacing.
     if (Op0->hasOneUse() || Op1->hasOneUse()) {
       // (Cond & C) | (~Cond & D) -> Cond ? C : D, and commuted variants.
-      if (Value *V = matchSelectFromAndOr(A, C, B, D))
+      if (Value *V = matchSelectFromAndOr(A, C, B, D, Builder))
         return replaceInstUsesWith(I, V);
-      if (Value *V = matchSelectFromAndOr(A, C, D, B))
+      if (Value *V = matchSelectFromAndOr(A, C, D, B, Builder))
         return replaceInstUsesWith(I, V);
-      if (Value *V = matchSelectFromAndOr(C, A, B, D))
+      if (Value *V = matchSelectFromAndOr(C, A, B, D, Builder))
         return replaceInstUsesWith(I, V);
-      if (Value *V = matchSelectFromAndOr(C, A, D, B))
+      if (Value *V = matchSelectFromAndOr(C, A, D, B, Builder))
         return replaceInstUsesWith(I, V);
-      if (Value *V = matchSelectFromAndOr(B, D, A, C))
+      if (Value *V = matchSelectFromAndOr(B, D, A, C, Builder))
         return replaceInstUsesWith(I, V);
-      if (Value *V = matchSelectFromAndOr(B, D, C, A))
+      if (Value *V = matchSelectFromAndOr(B, D, C, A, Builder))
         return replaceInstUsesWith(I, V);
-      if (Value *V = matchSelectFromAndOr(D, B, A, C))
+      if (Value *V = matchSelectFromAndOr(D, B, A, C, Builder))
         return replaceInstUsesWith(I, V);
-      if (Value *V = matchSelectFromAndOr(D, B, C, A))
+      if (Value *V = matchSelectFromAndOr(D, B, C, A, Builder))
         return replaceInstUsesWith(I, V);
     }
   }
diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h
index 128365fc22e..3a18744e434 100644
--- a/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -589,9 +589,6 @@ private:
 
   Value *foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS, ICmpInst *RHS,
                                        bool JoinedByAnd, Instruction &CxtI);
-  Value *matchSelectFromAndOr(Value *A, Value *B, Value *C, Value *D);
-  Value *getSelectCondition(Value *A, Value *B);
-
 public:
   /// Inserts an instruction \p New before instruction \p Old
   ///
diff --git a/test/Transforms/InstCombine/logical-select.ll b/test/Transforms/InstCombine/logical-select.ll
index e817bb9c19d..3ee0ba169b3 100644
--- a/test/Transforms/InstCombine/logical-select.ll
+++ b/test/Transforms/InstCombine/logical-select.ll
@@ -535,9 +535,12 @@ define <4 x i32> @vec_sel_xor_multi_use(<4 x i32> %a, <4 x i32> %b, <4 x i1> %c)
 
 define i32 @allSignBits(i32 %cond, i32 %tval, i32 %fval) {
 ; CHECK-LABEL: @allSignBits(
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp slt i32 [[COND:%.*]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 [[TVAL:%.*]], i32 [[FVAL:%.*]]
-; CHECK-NEXT:    ret i32 [[TMP2]]
+; CHECK-NEXT:    [[BITMASK:%.*]] = ashr i32 [[COND:%.*]], 31
+; CHECK-NEXT:    [[NOT_BITMASK:%.*]] = xor i32 [[BITMASK]], -1
+; CHECK-NEXT:    [[A1:%.*]] = and i32 [[BITMASK]], [[TVAL:%.*]]
+; CHECK-NEXT:    [[A2:%.*]] = and i32 [[NOT_BITMASK]], [[FVAL:%.*]]
+; CHECK-NEXT:    [[SEL:%.*]] = or i32 [[A1]], [[A2]]
+; CHECK-NEXT:    ret i32 [[SEL]]
 ;
   %bitmask = ashr i32 %cond, 31
   %not_bitmask = xor i32 %bitmask, -1
@@ -549,9 +552,12 @@ define i32 @allSignBits(i32 %cond, i32 %tval, i32 %fval) {
 
 define <4 x i8> @allSignBits_vec(<4 x i8> %cond, <4 x i8> %tval, <4 x i8> %fval) {
 ; CHECK-LABEL: @allSignBits_vec(
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt <4 x i8> [[COND:%.*]], <i8 -1, i8 -1, i8 -1, i8 -1>
-; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i8> [[FVAL:%.*]], <4 x i8> [[TVAL:%.*]]
-; CHECK-NEXT:    ret <4 x i8> [[TMP2]]
+; CHECK-NEXT:    [[BITMASK:%.*]] = ashr <4 x i8> [[COND:%.*]], <i8 7, i8 7, i8 7, i8 7>
+; CHECK-NEXT:    [[NOT_BITMASK:%.*]] = xor <4 x i8> [[BITMASK]], <i8 -1, i8 -1, i8 -1, i8 -1>
+; CHECK-NEXT:    [[A1:%.*]] = and <4 x i8> [[BITMASK]], [[TVAL:%.*]]
+; CHECK-NEXT:    [[A2:%.*]] = and <4 x i8> [[NOT_BITMASK]], [[FVAL:%.*]]
+; CHECK-NEXT:    [[SEL:%.*]] = or <4 x i8> [[A2]], [[A1]]
+; CHECK-NEXT:    ret <4 x i8> [[SEL]]
 ;
   %bitmask = ashr <4 x i8> %cond, <i8 7, i8 7, i8 7, i8 7>
   %not_bitmask = xor <4 x i8> %bitmask, <i8 -1, i8 -1, i8 -1, i8 -1>
diff --git a/test/Transforms/InstCombine/vec_sext.ll b/test/Transforms/InstCombine/vec_sext.ll
index 39bd4087416..f244d49527b 100644
--- a/test/Transforms/InstCombine/vec_sext.ll
+++ b/test/Transforms/InstCombine/vec_sext.ll
@@ -4,9 +4,12 @@
 define <4 x i32> @vec_select(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: @vec_select(
 ; CHECK-NEXT:    [[SUB:%.*]] = sub nsw <4 x i32> zeroinitializer, [[A:%.*]]
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt <4 x i32> [[B:%.*]], <i32 -1, i32 -1, i32 -1, i32 -1>
-; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[A]], <4 x i32> [[SUB]]
-; CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+; CHECK-NEXT:    [[B_LOBIT1:%.*]] = ashr <4 x i32> [[B:%.*]], <i32 31, i32 31, i32 31, i32 31>
+; CHECK-NEXT:    [[T1:%.*]] = xor <4 x i32> [[B_LOBIT1]], <i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK-NEXT:    [[T2:%.*]] = and <4 x i32> [[T1]], [[A]]
+; CHECK-NEXT:    [[T3:%.*]] = and <4 x i32> [[B_LOBIT1]], [[SUB]]
+; CHECK-NEXT:    [[COND:%.*]] = or <4 x i32> [[T2]], [[T3]]
+; CHECK-NEXT:    ret <4 x i32> [[COND]]
 ;
   %cmp = icmp slt <4 x i32> %b, zeroinitializer
   %sext = sext <4 x i1> %cmp to <4 x i32>
@@ -23,9 +26,12 @@ define <4 x i32> @vec_select(<4 x i32> %a, <4 x i32> %b) {
 define <4 x i32> @vec_select_alternate_sign_bit_test(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: @vec_select_alternate_sign_bit_test(
 ; CHECK-NEXT:    [[SUB:%.*]] = sub nsw <4 x i32> zeroinitializer, [[A:%.*]]
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt <4 x i32> [[B:%.*]], <i32 -1, i32 -1, i32 -1, i32 -1>
-; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[SUB]], <4 x i32> [[A]]
-; CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+; CHECK-NEXT:    [[B_LOBIT1:%.*]] = ashr <4 x i32> [[B:%.*]], <i32 31, i32 31, i32 31, i32 31>
+; CHECK-NEXT:    [[B_LOBIT1_NOT:%.*]] = xor <4 x i32> [[B_LOBIT1]], <i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK-NEXT:    [[T2:%.*]] = and <4 x i32> [[B_LOBIT1]], [[A]]
+; CHECK-NEXT:    [[T3:%.*]] = and <4 x i32> [[B_LOBIT1_NOT]], [[SUB]]
+; CHECK-NEXT:    [[COND:%.*]] = or <4 x i32> [[T2]], [[T3]]
+; CHECK-NEXT:    ret <4 x i32> [[COND]]
 ;
   %cmp = icmp sgt <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
   %sext = sext <4 x i1> %cmp to <4 x i32>
-- 
GitLab


From c2874102cb09d7199a3cd3a5b7d877b56648a5d7 Mon Sep 17 00:00:00 2001
From: Anna Thomas <anna@azul.com>
Date: Tue, 16 Oct 2018 15:46:26 +0000
Subject: [PATCH 0249/1116] [LV] Teach vectorizer about variant value store
 into uniform address

Summary:
Teach vectorizer about vectorizing variant value stores to uniform
address. Similar to rL343028, we do not allow vectorization if we have
multiple stores to the same uniform address.

Cost model already has the change for considering the extract
instruction cost for a variant value store. See added test cases for how
vectorization is done.
The patch also contains changes to the ORE messages.

Reviewers: Ayal, mkuper, anemet, hsaito

Subscribers: rkruppe, llvm-commits

Differential Revision: https://reviews.llvm.org/D52656

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344613 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/LoopAccessAnalysis.h    |  10 +-
 lib/Analysis/LoopAccessAnalysis.cpp           |  16 +-
 .../Vectorize/LoopVectorizationLegality.cpp   |   7 +-
 lib/Transforms/Vectorize/LoopVectorize.cpp    |   1 -
 .../memcheck-wrapping-pointers.ll             |   2 +-
 .../store-to-invariant-check1.ll              |  16 +-
 .../store-to-invariant-check2.ll              |   4 +-
 .../store-to-invariant-check3.ll              |   6 +-
 .../illegal-parallel-loop-uniform-write.ll    |  67 +++-
 .../X86/invariant-store-vectorization.ll      | 105 ++++++
 .../invariant-store-vectorization.ll          | 310 +++++++++++++++++-
 11 files changed, 501 insertions(+), 43 deletions(-)

diff --git a/include/llvm/Analysis/LoopAccessAnalysis.h b/include/llvm/Analysis/LoopAccessAnalysis.h
index 86b402b2394..c59c86c4994 100644
--- a/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -564,10 +564,10 @@ public:
   /// Print the information about the memory accesses in the loop.
   void print(raw_ostream &OS, unsigned Depth = 0) const;
 
-  /// If the loop has any store of a variant value to an invariant address, then
+  /// If the loop has multiple stores to an invariant address, then
   /// return true, else return false.
-  bool hasVariantStoreToLoopInvariantAddress() const {
-    return HasVariantStoreToLoopInvariantAddress;
+  bool hasMultipleStoresToLoopInvariantAddress() const {
+    return HasMultipleStoresToLoopInvariantAddress;
   }
 
   /// Used to add runtime SCEV checks. Simplifies SCEV expressions and converts
@@ -620,8 +620,8 @@ private:
   /// Cache the result of analyzeLoop.
   bool CanVecMem;
 
-  /// Indicator that there is a store of a variant value to a uniform address.
-  bool HasVariantStoreToLoopInvariantAddress;
+  /// Indicator that there are multiple stores to a uniform address.
+  bool HasMultipleStoresToLoopInvariantAddress;
 
   /// The diagnostics report generated for the analysis.  E.g. why we
   /// couldn't analyze the loop.
diff --git a/lib/Analysis/LoopAccessAnalysis.cpp b/lib/Analysis/LoopAccessAnalysis.cpp
index b43e290956d..4b8e8afdabb 100644
--- a/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1869,13 +1869,9 @@ void LoopAccessInfo::analyzeLoop(AliasAnalysis *AA, LoopInfo *LI,
   for (StoreInst *ST : Stores) {
     Value *Ptr = ST->getPointerOperand();
 
-    if (isUniform(Ptr)) {
-      // Consider multiple stores to the same uniform address as a store of a
-      // variant value.
-      bool MultipleStoresToUniformPtr = !UniformStores.insert(Ptr).second;
-      HasVariantStoreToLoopInvariantAddress |=
-          (!isUniform(ST->getValueOperand()) || MultipleStoresToUniformPtr);
-    }
+    if (isUniform(Ptr))
+      HasMultipleStoresToLoopInvariantAddress |=
+          !UniformStores.insert(Ptr).second;
 
     // If we did *not* see this pointer before, insert it to  the read-write
     // list. At this phase it is only a 'write' list.
@@ -2276,7 +2272,7 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
       PtrRtChecking(llvm::make_unique<RuntimePointerChecking>(SE)),
       DepChecker(llvm::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L),
       NumLoads(0), NumStores(0), MaxSafeDepDistBytes(-1), CanVecMem(false),
-      HasVariantStoreToLoopInvariantAddress(false) {
+      HasMultipleStoresToLoopInvariantAddress(false) {
   if (canAnalyzeLoop())
     analyzeLoop(AA, LI, TLI, DT);
 }
@@ -2308,8 +2304,8 @@ void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
   PtrRtChecking->print(OS, Depth);
   OS << "\n";
 
-  OS.indent(Depth) << "Variant Store to invariant address was "
-                   << (HasVariantStoreToLoopInvariantAddress ? "" : "not ")
+  OS.indent(Depth) << "Multiple stores to invariant address were "
+                   << (HasMultipleStoresToLoopInvariantAddress ? "" : "not ")
                    << "found in loop.\n";
 
   OS.indent(Depth) << "SCEV assumptions:\n";
diff --git a/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 7e11504c0e0..bde90a71b41 100644
--- a/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -817,11 +817,12 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
   if (!LAI->canVectorizeMemory())
     return false;
 
-  if (LAI->hasVariantStoreToLoopInvariantAddress()) {
+  if (LAI->hasMultipleStoresToLoopInvariantAddress()) {
     ORE->emit(createMissedAnalysis("CantVectorizeStoreToLoopInvariantAddress")
-              << "write of variant value to a loop invariant address could not "
+              << "multiple writes to a loop invariant address could not "
                  "be vectorized");
-    LLVM_DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
+    LLVM_DEBUG(
+        dbgs() << "LV: We don't allow multiple stores to a uniform address\n");
     return false;
   }
 
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 2ba2f00b4a5..5a11c5a54ae 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1189,7 +1189,6 @@ private:
   /// Load: scalar load + broadcast.
   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
   /// element)
-  /// TODO: Test the extra cost of the extract when loop variant value stored.
   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
 
   /// Returns whether the instruction is a load or store and will be a emitted
diff --git a/test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll b/test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll
index 10f9c767904..0d0fe65694c 100644
--- a/test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll
+++ b/test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll
@@ -39,7 +39,7 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 ; CHECK-NEXT:      Group
 ; CHECK-NEXT:        (Low: %b High: ((4 * (1 umax %x)) + %b))
 ; CHECK-NEXT:          Member: {%b,+,4}<%for.body>
-; CHECK:         Variant Store to invariant address was not found in loop.
+; CHECK:         Multiple stores to invariant address were not found in loop.
 ; CHECK-NEXT:    SCEV assumptions:
 ; CHECK-NEXT:    {1,+,1}<%for.body> Added Flags: <nusw>
 ; CHECK-NEXT:    {0,+,1}<%for.body> Added Flags: <nusw>
diff --git a/test/Analysis/LoopAccessAnalysis/store-to-invariant-check1.ll b/test/Analysis/LoopAccessAnalysis/store-to-invariant-check1.ll
index ad9b1295a6d..f24211d1e0d 100644
--- a/test/Analysis/LoopAccessAnalysis/store-to-invariant-check1.ll
+++ b/test/Analysis/LoopAccessAnalysis/store-to-invariant-check1.ll
@@ -1,26 +1,27 @@
 ; RUN: opt < %s -loop-accesses -analyze | FileCheck -check-prefix=OLDPM %s
 ; RUN: opt -passes='require<scalar-evolution>,require<aa>,loop(print-access-info)' -disable-output  < %s 2>&1 | FileCheck -check-prefix=NEWPM %s
 
-; Test to confirm LAA will find store to invariant address.
-; Inner loop has a store to invariant address.
+; Test to confirm LAA will find multiple stores to an invariant address in the
+; inner loop.
 ;
 ;  for(; i < itr; i++) {
 ;    for(; j < itr; j++) {
 ;      var1[i] = var2[j] + var1[i];
+;      var1[i]++;
 ;    }
 ;  }
 
 ; The LAA with the new PM is a loop pass so we go from inner to outer loops.
 
 ; OLDPM: for.cond1.preheader:
-; OLDPM:   Variant Store to invariant address was not found in loop.
+; OLDPM:   Multiple stores to invariant address were not found in loop.
 ; OLDPM: for.body3:
-; OLDPM:   Variant Store to invariant address was found in loop.
+; OLDPM:   Multiple stores to invariant address were found in loop.
 
 ; NEWPM: for.body3:
-; NEWPM:   Variant Store to invariant address was found in loop.
+; NEWPM:   Multiple stores to invariant address were found in loop.
 ; NEWPM: for.cond1.preheader:
-; NEWPM:   Variant Store to invariant address was not found in loop.
+; NEWPM:   Multiple stores to invariant address were not found in loop.
 
 define i32 @foo(i32* nocapture %var1, i32* nocapture readonly %var2, i32 %itr) #0 {
 entry:
@@ -45,6 +46,9 @@ for.body3:                                        ; preds = %for.body3, %for.bod
   %2 = load i32, i32* %arrayidx5, align 4
   %add = add nsw i32 %2, %1
   store i32 %add, i32* %arrayidx5, align 4
+  %3 = load i32, i32* %arrayidx5, align 4
+  %4 = add nsw i32 %3, 1
+  store i32 %4, i32* %arrayidx5, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
   %exitcond = icmp eq i32 %lftr.wideiv, %itr
diff --git a/test/Analysis/LoopAccessAnalysis/store-to-invariant-check2.ll b/test/Analysis/LoopAccessAnalysis/store-to-invariant-check2.ll
index e40c9e733cd..07bcdcc5c66 100644
--- a/test/Analysis/LoopAccessAnalysis/store-to-invariant-check2.ll
+++ b/test/Analysis/LoopAccessAnalysis/store-to-invariant-check2.ll
@@ -10,8 +10,8 @@
 ;    }
 ;  }
 
-; CHECK: Variant Store to invariant address was not found in loop.
-; CHECK-NOT: Variant Store to invariant address was found in loop.
+; CHECK: Multiple stores to invariant address were not found in loop.
+; CHECK-NOT: Multiple stores to invariant address were found in loop.
 
 
 define i32 @foo(i32* nocapture readonly %var1, i32* nocapture %var2, i32 %itr) #0 {
diff --git a/test/Analysis/LoopAccessAnalysis/store-to-invariant-check3.ll b/test/Analysis/LoopAccessAnalysis/store-to-invariant-check3.ll
index eaadcfecaa3..8d7452471f5 100644
--- a/test/Analysis/LoopAccessAnalysis/store-to-invariant-check3.ll
+++ b/test/Analysis/LoopAccessAnalysis/store-to-invariant-check3.ll
@@ -1,8 +1,8 @@
 ; RUN: opt < %s -loop-accesses -analyze | FileCheck %s
 ; RUN: opt -passes='require<scalar-evolution>,require<aa>,loop(print-access-info)' -disable-output  < %s 2>&1 | FileCheck %s
 
-; Test to confirm LAA will find store to invariant address.
-; Inner loop has a store to invariant address.
+; Inner loop has a store to invariant address, but LAA does not need to identify
+; the store to invariant address, since it is a single store.
 ;
 ;  for(; i < itr; i++) {
 ;    for(; j < itr; j++) {
@@ -10,7 +10,7 @@
 ;    }
 ;  }
 
-; CHECK: Variant Store to invariant address was found in loop.
+; CHECK: Multiple stores to invariant address were not found in loop.
 
 define void @foo(i32* nocapture %var1, i32* nocapture %var2, i32 %itr) #0 {
 entry:
diff --git a/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll b/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
index cbba5300b9c..c78bcdd1721 100644
--- a/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
+++ b/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
@@ -3,9 +3,23 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-;CHECK-LABEL: @foo(
-;CHECK-NOT: <4 x i32>
-;CHECK: ret void
+; CHECK-LABEL: @foo(
+; CHECK: <4 x i32>
+; CHECK: ret void
+
+; PR15794
+; incorrect addition of llvm.mem.parallel_loop_access metadata is undefined
+; behaviour. Vectorizer ignores the memory dependency checks and goes ahead and
+; vectorizes this loop with uniform stores which has an output dependency.
+
+; void foo(int *a, int *b, int k, int m) {
+;   for (int i = 0; i < m; i++) {
+;     for (int j = 0; j < m; j++) {
+;       a[i] = a[i + j + k] + 1; <<<
+;     }
+;     b[i] = b[i] + 3;
+;   }
+; }
 
 ; Function Attrs: nounwind uwtable 
 define void @foo(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 {
@@ -48,6 +62,53 @@ for.end15:                                        ; preds = %for.end.us, %entry
   ret void
 }
 
+; Same test as above, but without the invalid parallel_loop_access metadata.
+
+; Here we can see the vectorizer does the mem dep checks and decides it is
+; unsafe to vectorize.
+; CHECK-LABEL: no-par-mem-metadata(
+; CHECK-NOT: <4 x i32>
+; CHECK:     ret void
+define void @no-par-mem-metadata(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 {
+entry:
+  %cmp27 = icmp sgt i32 %m, 0
+  br i1 %cmp27, label %for.body3.lr.ph.us, label %for.end15
+
+for.end.us:                                       ; preds = %for.body3.us
+  %arrayidx9.us = getelementptr inbounds i32, i32* %b, i64 %indvars.iv33
+  %0 = load i32, i32* %arrayidx9.us, align 4
+  %add10.us = add nsw i32 %0, 3
+  store i32 %add10.us, i32* %arrayidx9.us, align 4
+  %indvars.iv.next34 = add i64 %indvars.iv33, 1
+  %lftr.wideiv35 = trunc i64 %indvars.iv.next34 to i32
+  %exitcond36 = icmp eq i32 %lftr.wideiv35, %m
+  br i1 %exitcond36, label %for.end15, label %for.body3.lr.ph.us, !llvm.loop !5
+
+for.body3.us:                                     ; preds = %for.body3.us, %for.body3.lr.ph.us
+  %indvars.iv29 = phi i64 [ 0, %for.body3.lr.ph.us ], [ %indvars.iv.next30, %for.body3.us ]
+  %1 = trunc i64 %indvars.iv29 to i32
+  %add4.us = add i32 %add.us, %1
+  %idxprom.us = sext i32 %add4.us to i64
+  %arrayidx.us = getelementptr inbounds i32, i32* %a, i64 %idxprom.us
+  %2 = load i32, i32* %arrayidx.us, align 4
+  %add5.us = add nsw i32 %2, 1
+  store i32 %add5.us, i32* %arrayidx7.us, align 4
+  %indvars.iv.next30 = add i64 %indvars.iv29, 1
+  %lftr.wideiv31 = trunc i64 %indvars.iv.next30 to i32
+  %exitcond32 = icmp eq i32 %lftr.wideiv31, %m
+  br i1 %exitcond32, label %for.end.us, label %for.body3.us, !llvm.loop !4
+
+for.body3.lr.ph.us:                               ; preds = %for.end.us, %entry
+  %indvars.iv33 = phi i64 [ %indvars.iv.next34, %for.end.us ], [ 0, %entry ]
+  %3 = trunc i64 %indvars.iv33 to i32
+  %add.us = add i32 %3, %k
+  %arrayidx7.us = getelementptr inbounds i32, i32* %a, i64 %indvars.iv33
+  br label %for.body3.us
+
+for.end15:                                        ; preds = %for.end.us, %entry
+  ret void
+}
+
 attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
 
 !3 = !{!4, !5}
diff --git a/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll b/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
index e8c369da3be..9428a6d6f74 100644
--- a/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
+++ b/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
@@ -130,3 +130,108 @@ latch:
 for.end:                                          ; preds = %for.body
   ret void
 }
+
+define void @variant_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32* %b, i32* %c, i32 %k) {
+; CHECK-LABEL: @variant_val_store_to_inv_address_conditional(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 16
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[C5:%.*]] = bitcast i32* [[C:%.*]] to i8*
+; CHECK-NEXT:    [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8*
+; CHECK-NEXT:    [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8*
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT:    [[SMAX2:%.*]] = select i1 [[TMP1]], i64 [[N]], i64 1
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX2]]
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, i8* [[A4]], i64 1
+; CHECK-NEXT:    [[SCEVGEP6:%.*]] = getelementptr i32, i32* [[C]], i64 [[SMAX2]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i8* [[UGLYGEP]], [[B1]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    [[BOUND08:%.*]] = icmp ugt i32* [[SCEVGEP6]], [[B]]
+; CHECK-NEXT:    [[BOUND19:%.*]] = icmp ugt i32* [[SCEVGEP]], [[C]]
+; CHECK-NEXT:    [[FOUND_CONFLICT10:%.*]] = and i1 [[BOUND08]], [[BOUND19]]
+; CHECK-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT10]]
+; CHECK-NEXT:    [[BOUND012:%.*]] = icmp ugt i32* [[SCEVGEP6]], [[A]]
+; CHECK-NEXT:    [[BOUND113:%.*]] = icmp ugt i8* [[UGLYGEP]], [[C5]]
+; CHECK-NEXT:    [[FOUND_CONFLICT14:%.*]] = and i1 [[BOUND012]], [[BOUND113]]
+; CHECK-NEXT:    [[CONFLICT_RDX15:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT14]]
+; CHECK-NEXT:    br i1 [[CONFLICT_RDX15]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775792
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT16:%.*]] = insertelement <16 x i32> undef, i32 [[K:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT17:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT16]], <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT18:%.*]] = insertelement <16 x i32> undef, i32 [[NTRUNC]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT19:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT18]], <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT20:%.*]] = insertelement <16 x i32*> undef, i32* [[A]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT21:%.*]] = shufflevector <16 x i32*> [[BROADCAST_SPLATINSERT20]], <16 x i32*> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP3]], align 8, !alias.scope !15, !noalias !18
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <16 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT17]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>*
+; CHECK-NEXT:    store <16 x i32> [[BROADCAST_SPLAT19]], <16 x i32>* [[TMP5]], align 4, !alias.scope !15, !noalias !18
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <16 x i32>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP7]], i32 8, <16 x i1> [[TMP4]], <16 x i32> undef), !alias.scope !21
+; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> [[WIDE_MASKED_LOAD]], <16 x i32*> [[BROADCAST_SPLAT21]], i32 4, <16 x i1> [[TMP4]]), !alias.scope !22, !noalias !21
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !23
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 8
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP2]], [[K]]
+; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[TMP1]], align 4
+; CHECK-NEXT:    br i1 [[CMP]], label [[COND_STORE:%.*]], label [[LATCH]]
+; CHECK:       cond_store:
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[I]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 8
+; CHECK-NEXT:    store i32 [[TMP4]], i32* [[A]], align 4
+; CHECK-NEXT:    br label [[LATCH]]
+; CHECK:       latch:
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !24
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %ntrunc = trunc i64 %n to i32
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  %cmp = icmp eq i32 %tmp2, %k
+  store i32 %ntrunc, i32* %tmp1
+  br i1 %cmp, label %cond_store, label %latch
+
+cond_store:
+  %tmp3 = getelementptr inbounds i32, i32* %c, i64 %i
+  %tmp4 = load i32, i32* %tmp3, align 8
+  store i32 %tmp4, i32* %a
+  br label %latch
+
+latch:
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
diff --git a/test/Transforms/LoopVectorize/invariant-store-vectorization.ll b/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
index 7bdfd405455..69e202f8889 100644
--- a/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
+++ b/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
@@ -162,10 +162,74 @@ for.end:                                          ; preds = %for.body
 
 ; Instcombine'd version of above test. Now the store is no longer of invariant
 ; value.
-; TODO: We should be able to vectorize this loop once we support vectorizing
-; stores of variant values to invariant addresses.
+; scalar store the value extracted from the last element of the vector value.
 ; CHECK-LABEL: inv_val_store_to_inv_address_conditional_diff_values_ic
-; CHECK-NOT:   <4 x
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8*
+; CHECK-NEXT:    [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8*
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT:    [[SMAX2:%.*]] = select i1 [[TMP1]], i64 [[N]], i64 1
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX2]]
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, i8* [[A4]], i64 1
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i8* [[UGLYGEP]], [[B1]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775804
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i32> undef, i32 [[K:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT5]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i32> undef, i32 [[NTRUNC]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT7]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT6]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[BROADCAST_SPLAT8]], <4 x i32>* [[TMP5]], align 4
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> [[BROADCAST_SPLAT8]], <4 x i32> [[BROADCAST_SPLAT6]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[PREDPHI]], i32 3
+; CHECK-NEXT:    store i32 [[TMP6]], i32* [[A]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 8
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP2]], [[K]]
+; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[TMP1]], align 4
+; CHECK-NEXT:    br i1 [[CMP]], label [[COND_STORE:%.*]], label [[COND_STORE_K:%.*]]
+; CHECK:       cond_store:
+; CHECK-NEXT:    br label [[LATCH]]
+; CHECK:       cond_store_k:
+; CHECK-NEXT:    br label [[LATCH]]
+; CHECK:       latch:
+; CHECK-NEXT:    [[STOREVAL:%.*]] = phi i32 [ [[NTRUNC]], [[COND_STORE]] ], [ [[K]], [[COND_STORE_K]] ]
+; CHECK-NEXT:    store i32 [[STOREVAL]], i32* [[A]], align 4
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK:       for.end.loopexit:
+; CHECK-NEXT:    br label [[FOR_END]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
 define void @inv_val_store_to_inv_address_conditional_diff_values_ic(i32* %a, i64 %n, i32* %b, i32 %k) {
 entry:
   %ntrunc = trunc i64 %n to i32
@@ -199,10 +263,75 @@ for.end:                                          ; preds = %for.body
 ; invariant val stored to invariant address predicated on invariant condition
 ; This is not treated as a predicated store since the block the store belongs to
 ; is the latch block (which doesn't need to be predicated).
-; TODO: We should vectorize this loop once we relax the check for
 ; variant/invariant values being stored to invariant address.
+; test checks that the last element of the phi is extracted and scalar stored
+; into the uniform address within the loop.
+; Since the condition and the phi is loop invariant, they are LICM'ed after
+; vectorization.
 ; CHECK-LABEL: inv_val_store_to_inv_address_conditional_inv
-; CHECK-NOT: <4 x
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[NTRUNC]], [[K:%.*]]
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8*
+; CHECK-NEXT:    [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8*
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT:    [[SMAX2:%.*]] = select i1 [[TMP1]], i64 [[N]], i64 1
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX2]]
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, i8* [[A4]], i64 1
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i8* [[UGLYGEP]], [[B1]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775804
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i32> undef, i32 [[NTRUNC]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT5]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i1> undef, i1 [[CMP]], i32 3
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[K]], i32 3
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i1> [[TMP2]], <i1 undef, i1 undef, i1 undef, i1 true>
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> [[TMP3]], <4 x i32> [[BROADCAST_SPLAT6]]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[PREDPHI]], i32 3
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[BROADCAST_SPLAT6]], <4 x i32>* [[TMP7]], align 4
+; CHECK-NEXT:    store i32 [[TMP5]], i32* [[A]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
+; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[TMP1]], align 4
+; CHECK-NEXT:    br i1 [[CMP]], label [[COND_STORE:%.*]], label [[COND_STORE_K:%.*]]
+; CHECK:       cond_store:
+; CHECK-NEXT:    br label [[LATCH]]
+; CHECK:       cond_store_k:
+; CHECK-NEXT:    br label [[LATCH]]
+; CHECK:       latch:
+; CHECK-NEXT:    [[STOREVAL:%.*]] = phi i32 [ [[NTRUNC]], [[COND_STORE]] ], [ [[K]], [[COND_STORE_K]] ]
+; CHECK-NEXT:    store i32 [[STOREVAL]], i32* [[A]], align 4
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK:       for.end.loopexit:
+; CHECK-NEXT:    br label [[FOR_END]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
 define void @inv_val_store_to_inv_address_conditional_inv(i32* %a, i64 %n, i32* %b, i32 %k) {
 entry:
   %ntrunc = trunc i64 %n to i32
@@ -233,10 +362,67 @@ for.end:                                          ; preds = %for.body
   ret void
 }
 
-; TODO: This loop can be vectorized once we support variant value being
-; stored into invariant address.
+; variant value stored to uniform address tests that the code gen extracts the
+; last element from the variant vector and scalar stores it into the uniform
+; address.
 ; CHECK-LABEL: variant_val_store_to_inv_address
-; CHECK-NOT: <4 x i32>
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i64 [[N:%.*]], 1
+; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[B2:%.*]] = bitcast i32* [[B:%.*]] to i8*
+; CHECK-NEXT:    [[A1:%.*]] = bitcast i32* [[A:%.*]] to i8*
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, i8* [[A1]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT:    [[SMAX3:%.*]] = select i1 [[TMP1]], i64 [[N]], i64 1
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX3]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt i8* [[UGLYGEP]], [[B2]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775804
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
+; CHECK-NEXT:    store i32 [[TMP4]], i32* [[A]], align 4
+; CHECK-NEXT:    [[TMP5]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi <4 x i32> [ [[TMP5]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[DOTLCSSA]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[DOTLCSSA]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF5:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX6:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[BIN_RDX6]], i32 0
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[TMP3:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 8
+; CHECK-NEXT:    store i32 [[TMP2]], i32* [[A]], align 4
+; CHECK-NEXT:    [[TMP3]] = add i32 [[TMP0]], [[TMP2]]
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK:       for.end.loopexit:
+; CHECK-NEXT:    [[TMP3_LCSSA:%.*]] = phi i32 [ [[TMP3]], [[FOR_BODY]] ]
+; CHECK-NEXT:    br label [[FOR_END]]
 define i32 @variant_val_store_to_inv_address(i32* %a, i64 %n, i32* %b, i32 %k) {
 entry:
   %ntrunc = trunc i64 %n to i32
@@ -255,6 +441,112 @@ for.body:                                         ; preds = %for.body, %entry
   br i1 %cond, label %for.body, label %for.end
 
 for.end:                                          ; preds = %for.body
-  %rdx.lcssa = phi i32 [ %tmp0, %for.body ]
+  %rdx.lcssa = phi i32 [ %tmp3, %for.body ]
   ret i32 %rdx.lcssa
 }
+
+; Multiple variant stores to the same uniform address
+; We do not vectorize such loops currently.
+;  for(; i < itr; i++) {
+;    for(; j < itr; j++) {
+;      var1[i] = var2[j] + var1[i];
+;      var1[i]++;
+;    }
+;  }
+
+; CHECK-LABEL: multiple_uniform_stores
+; CHECK-NOT:     <4 x i32>
+define i32 @multiple_uniform_stores(i32* nocapture %var1, i32* nocapture readonly %var2, i32 %itr) #0 {
+entry:
+  %cmp20 = icmp eq i32 %itr, 0
+  br i1 %cmp20, label %for.end10, label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %entry, %for.inc8
+  %indvars.iv23 = phi i64 [ %indvars.iv.next24, %for.inc8 ], [ 0, %entry ]
+  %j.022 = phi i32 [ %j.1.lcssa, %for.inc8 ], [ 0, %entry ]
+  %cmp218 = icmp ult i32 %j.022, %itr
+  br i1 %cmp218, label %for.body3.lr.ph, label %for.inc8
+
+for.body3.lr.ph:                                  ; preds = %for.cond1.preheader
+  %arrayidx5 = getelementptr inbounds i32, i32* %var1, i64 %indvars.iv23
+  %0 = zext i32 %j.022 to i64
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.body3.lr.ph
+  %indvars.iv = phi i64 [ %0, %for.body3.lr.ph ], [ %indvars.iv.next, %for.body3 ]
+  %arrayidx = getelementptr inbounds i32, i32* %var2, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx, align 4
+  %2 = load i32, i32* %arrayidx5, align 4
+  %add = add nsw i32 %2, %1
+  store i32 %add, i32* %arrayidx5, align 4
+  %3 = load i32, i32* %arrayidx5, align 4
+  %4 = add nsw i32 %3, 1
+  store i32 %4, i32* %arrayidx5, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %itr
+  br i1 %exitcond, label %for.inc8, label %for.body3
+
+for.inc8:                                         ; preds = %for.body3, %for.cond1.preheader
+  %j.1.lcssa = phi i32 [ %j.022, %for.cond1.preheader ], [ %itr, %for.body3 ]
+  %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1
+  %lftr.wideiv25 = trunc i64 %indvars.iv.next24 to i32
+  %exitcond26 = icmp eq i32 %lftr.wideiv25, %itr
+  br i1 %exitcond26, label %for.end10, label %for.cond1.preheader
+
+for.end10:                                        ; preds = %for.inc8, %entry
+  ret i32 undef
+}
+
+; second uniform store to the same address is conditional.
+; we do not vectorize this.
+; CHECK-LABEL: multiple_uniform_stores_conditional
+; CHECK-NOT:    <4 x i32>
+define i32 @multiple_uniform_stores_conditional(i32* nocapture %var1, i32* nocapture readonly %var2, i32 %itr) #0 {
+entry:
+  %cmp20 = icmp eq i32 %itr, 0
+  br i1 %cmp20, label %for.end10, label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %entry, %for.inc8
+  %indvars.iv23 = phi i64 [ %indvars.iv.next24, %for.inc8 ], [ 0, %entry ]
+  %j.022 = phi i32 [ %j.1.lcssa, %for.inc8 ], [ 0, %entry ]
+  %cmp218 = icmp ult i32 %j.022, %itr
+  br i1 %cmp218, label %for.body3.lr.ph, label %for.inc8
+
+for.body3.lr.ph:                                  ; preds = %for.cond1.preheader
+  %arrayidx5 = getelementptr inbounds i32, i32* %var1, i64 %indvars.iv23
+  %0 = zext i32 %j.022 to i64
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.body3.lr.ph
+  %indvars.iv = phi i64 [ %0, %for.body3.lr.ph ], [ %indvars.iv.next, %latch ]
+  %arrayidx = getelementptr inbounds i32, i32* %var2, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx, align 4
+  %2 = load i32, i32* %arrayidx5, align 4
+  %add = add nsw i32 %2, %1
+  store i32 %add, i32* %arrayidx5, align 4
+  %3 = load i32, i32* %arrayidx5, align 4
+  %4 = add nsw i32 %3, 1
+  %5 = icmp ugt i32 %3, 42
+  br i1 %5, label %cond_store, label %latch
+
+cond_store:
+  store i32 %4, i32* %arrayidx5, align 4
+  br label %latch
+
+latch:
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %itr
+  br i1 %exitcond, label %for.inc8, label %for.body3
+
+for.inc8:                                         ; preds = %for.body3, %for.cond1.preheader
+  %j.1.lcssa = phi i32 [ %j.022, %for.cond1.preheader ], [ %itr, %latch ]
+  %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1
+  %lftr.wideiv25 = trunc i64 %indvars.iv.next24 to i32
+  %exitcond26 = icmp eq i32 %lftr.wideiv25, %itr
+  br i1 %exitcond26, label %for.end10, label %for.cond1.preheader
+
+for.end10:                                        ; preds = %for.inc8, %entry
+  ret i32 undef
+}
-- 
GitLab


From 6d4ce2a09e20bafba20b5fe2364d6c8014428f5b Mon Sep 17 00:00:00 2001
From: Chris Lattner <sabre@nondot.org>
Date: Tue, 16 Oct 2018 16:54:10 +0000
Subject: [PATCH 0250/1116] fix an out of date paragraph noticed by Bryce
 Lelbach

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344621 91177308-0d34-0410-b5e6-96231b3b80d8
---
 docs/DeveloperPolicy.rst | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/docs/DeveloperPolicy.rst b/docs/DeveloperPolicy.rst
index 4eda6c77b9f..9125197a73a 100644
--- a/docs/DeveloperPolicy.rst
+++ b/docs/DeveloperPolicy.rst
@@ -730,10 +730,6 @@ code already distributed under a more liberal license (like the UIUC license),
 and GPL-containing subprojects are kept in separate SVN repositories whose
 LICENSE.txt files specifically indicate that they contain GPL code.
 
-We have no plans to change the license of LLVM.  If you have questions or
-comments about the license, please contact the `LLVM Developer's Mailing
-List <mailto:llvm-dev@lists.llvm.org>`_.
-
 Patents
 -------
 
-- 
GitLab


From f0b741493a2fb00e483ce425723151dc41317a07 Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.menezes@samsung.com>
Date: Tue, 16 Oct 2018 17:19:28 +0000
Subject: [PATCH 0251/1116] [NFC][AArch64] Refactor macro fusion

Simplify API of checking functions.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344624 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64MacroFusion.cpp | 166 ++++++++++++----------
 1 file changed, 90 insertions(+), 76 deletions(-)

diff --git a/lib/Target/AArch64/AArch64MacroFusion.cpp b/lib/Target/AArch64/AArch64MacroFusion.cpp
index 43ebcfd9893..3d1ca7c4804 100644
--- a/lib/Target/AArch64/AArch64MacroFusion.cpp
+++ b/lib/Target/AArch64/AArch64MacroFusion.cpp
@@ -21,14 +21,16 @@ using namespace llvm;
 namespace {
 
 /// CMN, CMP, TST followed by Bcc
-static bool isArithmeticBccPair(unsigned FirstOpcode, unsigned SecondOpcode,
-                                const MachineInstr *FirstMI) {
-  if (SecondOpcode != AArch64::Bcc)
+static bool isArithmeticBccPair(const MachineInstr *FirstMI,
+                                const MachineInstr &SecondMI) {
+  if (SecondMI.getOpcode() != AArch64::Bcc)
     return false;
 
-  switch (FirstOpcode) {
-  case AArch64::INSTRUCTION_LIST_END:
+  // Assume the 1st instr to be a wildcard if it is unspecified.
+  if (FirstMI == nullptr)
     return true;
+
+  switch (FirstMI->getOpcode()) {
   case AArch64::ADDSWri:
   case AArch64::ADDSWrr:
   case AArch64::ADDSXri:
@@ -55,21 +57,28 @@ static bool isArithmeticBccPair(unsigned FirstOpcode, unsigned SecondOpcode,
     // Shift value can be 0 making these behave like the "rr" variant...
     return !AArch64InstrInfo::hasShiftedReg(*FirstMI);
   }
+
   return false;
 }
 
 /// ALU operations followed by CBZ/CBNZ.
-static bool isArithmeticCbzPair(unsigned FirstOpcode, unsigned SecondOpcode,
-                                const MachineInstr *FirstMI) {
-  if (SecondOpcode != AArch64::CBNZW &&
-      SecondOpcode != AArch64::CBNZX &&
-      SecondOpcode != AArch64::CBZW &&
-      SecondOpcode != AArch64::CBZX)
+static bool isArithmeticCbzPair(const MachineInstr *FirstMI,
+                                const MachineInstr &SecondMI) {
+  switch (SecondMI.getOpcode()) {
+  default:
     return false;
+  case AArch64::CBNZW:
+  case AArch64::CBNZX:
+  case AArch64::CBZW:
+  case AArch64::CBZX:
+    LLVM_FALLTHROUGH;
+  }
 
-  switch (FirstOpcode) {
-  case AArch64::INSTRUCTION_LIST_END:
+  // Assume the 1st instr to be a wildcard if it is unspecified.
+  if (FirstMI == nullptr)
     return true;
+
+  switch (FirstMI->getOpcode()) {
   case AArch64::ADDWri:
   case AArch64::ADDWrr:
   case AArch64::ADDXri:
@@ -102,34 +111,39 @@ static bool isArithmeticCbzPair(unsigned FirstOpcode, unsigned SecondOpcode,
     // Shift value can be 0 making these behave like the "rr" variant...
     return !AArch64InstrInfo::hasShiftedReg(*FirstMI);
   }
+
   return false;
 }
 
 /// AES crypto encoding or decoding.
-static bool isAESPair(unsigned FirstOpcode, unsigned SecondOpcode) {
+static bool isAESPair(const MachineInstr *FirstMI,
+                      const MachineInstr &SecondMI) {
+  // Assume the 1st instr to be a wildcard if it is unspecified.
+  switch (SecondMI.getOpcode()) {
   // AES encode.
-  if ((FirstOpcode == AArch64::INSTRUCTION_LIST_END ||
-       FirstOpcode == AArch64::AESErr) &&
-      (SecondOpcode == AArch64::AESMCrr ||
-       SecondOpcode == AArch64::AESMCrrTied))
-    return true;
+  case AArch64::AESMCrr:
+  case AArch64::AESMCrrTied:
+    return FirstMI == nullptr || FirstMI->getOpcode() == AArch64::AESErr;
   // AES decode.
-  else if ((FirstOpcode == AArch64::INSTRUCTION_LIST_END ||
-            FirstOpcode == AArch64::AESDrr) &&
-           (SecondOpcode == AArch64::AESIMCrr ||
-            SecondOpcode == AArch64::AESIMCrrTied))
-    return true;
+  case AArch64::AESIMCrr:
+  case AArch64::AESIMCrrTied:
+    return FirstMI == nullptr || FirstMI->getOpcode() == AArch64::AESDrr;
+  }
 
   return false;
 }
 
 /// AESE/AESD/PMULL + EOR.
-static bool isCryptoEORPair(unsigned FirstOpcode, unsigned SecondOpcode) {
-  if (SecondOpcode != AArch64::EORv16i8)
+static bool isCryptoEORPair(const MachineInstr *FirstMI,
+                            const MachineInstr &SecondMI) {
+  if (SecondMI.getOpcode() != AArch64::EORv16i8)
     return false;
 
-  switch (FirstOpcode) {
-  case AArch64::INSTRUCTION_LIST_END:
+  // Assume the 1st instr to be a wildcard if it is unspecified.
+  if (FirstMI == nullptr)
+    return true;
+
+  switch (FirstMI->getOpcode()) {
   case AArch64::AESErr:
   case AArch64::AESDrr:
   case AArch64::PMULLv16i8:
@@ -138,45 +152,47 @@ static bool isCryptoEORPair(unsigned FirstOpcode, unsigned SecondOpcode) {
   case AArch64::PMULLv2i64:
     return true;
   }
+
   return false;
 }
 
 /// Literal generation.
-static bool isLiteralsPair(unsigned FirstOpcode, unsigned SecondOpcode,
-                           const MachineInstr *FirstMI,
+static bool isLiteralsPair(const MachineInstr *FirstMI,
                            const MachineInstr &SecondMI) {
+  // Assume the 1st instr to be a wildcard if it is unspecified.
+
   // PC relative address.
-  if ((FirstOpcode == AArch64::INSTRUCTION_LIST_END ||
-       FirstOpcode == AArch64::ADRP) &&
-      SecondOpcode == AArch64::ADDXri)
+  if ((FirstMI == nullptr || FirstMI->getOpcode() == AArch64::ADRP) &&
+      SecondMI.getOpcode() == AArch64::ADDXri)
     return true;
+
   // 32 bit immediate.
-  else if ((FirstOpcode == AArch64::INSTRUCTION_LIST_END ||
-            FirstOpcode == AArch64::MOVZWi) &&
-           (SecondOpcode == AArch64::MOVKWi &&
-            SecondMI.getOperand(3).getImm() == 16))
+  if ((FirstMI == nullptr || FirstMI->getOpcode() == AArch64::MOVZWi) &&
+      (SecondMI.getOpcode() == AArch64::MOVKWi &&
+       SecondMI.getOperand(3).getImm() == 16))
     return true;
+
   // Lower half of 64 bit immediate.
-  else if((FirstOpcode == AArch64::INSTRUCTION_LIST_END ||
-           FirstOpcode == AArch64::MOVZXi) &&
-          (SecondOpcode == AArch64::MOVKXi &&
-           SecondMI.getOperand(3).getImm() == 16))
+  if((FirstMI == nullptr || FirstMI->getOpcode() == AArch64::MOVZXi) &&
+     (SecondMI.getOpcode() == AArch64::MOVKXi &&
+      SecondMI.getOperand(3).getImm() == 16))
     return true;
+
   // Upper half of 64 bit immediate.
-  else if ((FirstOpcode == AArch64::INSTRUCTION_LIST_END ||
-            (FirstOpcode == AArch64::MOVKXi &&
-             FirstMI->getOperand(3).getImm() == 32)) &&
-           (SecondOpcode == AArch64::MOVKXi &&
-            SecondMI.getOperand(3).getImm() == 48))
+  if ((FirstMI == nullptr ||
+       (FirstMI->getOpcode() == AArch64::MOVKXi &&
+        FirstMI->getOperand(3).getImm() == 32)) &&
+      (SecondMI.getOpcode() == AArch64::MOVKXi &&
+       SecondMI.getOperand(3).getImm() == 48))
     return true;
 
   return false;
 }
 
-// Fuse address generation and loads or stores.
-static bool isAddressLdStPair(unsigned FirstOpcode, unsigned SecondOpcode,
+/// Fuse address generation and loads or stores.
+static bool isAddressLdStPair(const MachineInstr *FirstMI,
                               const MachineInstr &SecondMI) {
-  switch (SecondOpcode) {
+  switch (SecondMI.getOpcode()) {
   case AArch64::STRBBui:
   case AArch64::STRBui:
   case AArch64::STRDui:
@@ -200,29 +216,32 @@ static bool isAddressLdStPair(unsigned FirstOpcode, unsigned SecondOpcode,
   case AArch64::LDRSHWui:
   case AArch64::LDRSHXui:
   case AArch64::LDRSWui:
-    switch (FirstOpcode) {
-    case AArch64::INSTRUCTION_LIST_END:
+    // Assume the 1st instr to be a wildcard if it is unspecified.
+    if (FirstMI == nullptr)
       return true;
+
+   switch (FirstMI->getOpcode()) {
     case AArch64::ADR:
       return SecondMI.getOperand(2).getImm() == 0;
     case AArch64::ADRP:
       return true;
     }
   }
+
   return false;
 }
 
-// Compare and conditional select.
-static bool isCCSelectPair(unsigned FirstOpcode, unsigned SecondOpcode,
-                           const MachineInstr *FirstMI) {
+/// Compare and conditional select.
+static bool isCCSelectPair(const MachineInstr *FirstMI,
+                           const MachineInstr &SecondMI) {
   // 32 bits
-  if (SecondOpcode == AArch64::CSELWr) {
+  if (SecondMI.getOpcode() == AArch64::CSELWr) {
     // Assume the 1st instr to be a wildcard if it is unspecified.
-    if (FirstOpcode == AArch64::INSTRUCTION_LIST_END)
+    if (FirstMI == nullptr)
       return true;
 
     if (FirstMI->definesRegister(AArch64::WZR))
-      switch (FirstOpcode) {
+      switch (FirstMI->getOpcode()) {
       case AArch64::SUBSWrs:
         return !AArch64InstrInfo::hasShiftedReg(*FirstMI);
       case AArch64::SUBSWrx:
@@ -232,14 +251,15 @@ static bool isCCSelectPair(unsigned FirstOpcode, unsigned SecondOpcode,
         return true;
       }
   }
+
   // 64 bits
-  else if (SecondOpcode == AArch64::CSELXr) {
+  if (SecondMI.getOpcode() == AArch64::CSELXr) {
     // Assume the 1st instr to be a wildcard if it is unspecified.
-    if (FirstOpcode == AArch64::INSTRUCTION_LIST_END)
+    if (FirstMI == nullptr)
       return true;
 
     if (FirstMI->definesRegister(AArch64::XZR))
-      switch (FirstOpcode) {
+      switch (FirstMI->getOpcode()) {
       case AArch64::SUBSXrs:
         return !AArch64InstrInfo::hasShiftedReg(*FirstMI);
       case AArch64::SUBSXrx:
@@ -250,6 +270,7 @@ static bool isCCSelectPair(unsigned FirstOpcode, unsigned SecondOpcode,
         return true;
       }
   }
+
   return false;
 }
 
@@ -262,28 +283,21 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
                                    const MachineInstr &SecondMI) {
   const AArch64Subtarget &ST = static_cast<const AArch64Subtarget&>(TSI);
 
-  // Assume the 1st instr to be a wildcard if it is unspecified.
-  unsigned FirstOpc =
-      FirstMI ? FirstMI->getOpcode()
-              : static_cast<unsigned>(AArch64::INSTRUCTION_LIST_END);
-  unsigned SecondOpc = SecondMI.getOpcode();
-
-  if (ST.hasArithmeticBccFusion() &&
-      isArithmeticBccPair(FirstOpc, SecondOpc, FirstMI))
+  // All checking functions assume that the 1st instr is a wildcard if it is
+  // unspecified.
+  if (ST.hasArithmeticBccFusion() && isArithmeticBccPair(FirstMI, SecondMI))
     return true;
-  if (ST.hasArithmeticCbzFusion() &&
-      isArithmeticCbzPair(FirstOpc, SecondOpc, FirstMI))
+  if (ST.hasArithmeticCbzFusion() && isArithmeticCbzPair(FirstMI, SecondMI))
     return true;
-  if (ST.hasFuseAES() && isAESPair(FirstOpc, SecondOpc))
+  if (ST.hasFuseAES() && isAESPair(FirstMI, SecondMI))
     return true;
-  if (ST.hasFuseCryptoEOR() && isCryptoEORPair(FirstOpc, SecondOpc))
+  if (ST.hasFuseCryptoEOR() && isCryptoEORPair(FirstMI, SecondMI))
     return true;
-  if (ST.hasFuseLiterals() &&
-      isLiteralsPair(FirstOpc, SecondOpc, FirstMI, SecondMI))
+  if (ST.hasFuseLiterals() && isLiteralsPair(FirstMI, SecondMI))
     return true;
-  if (ST.hasFuseAddress() && isAddressLdStPair(FirstOpc, SecondOpc, SecondMI))
+  if (ST.hasFuseAddress() && isAddressLdStPair(FirstMI, SecondMI))
     return true;
-  if (ST.hasFuseCCSelect() && isCCSelectPair(FirstOpc, SecondOpc, FirstMI))
+  if (ST.hasFuseCCSelect() && isCCSelectPair(FirstMI, SecondMI))
     return true;
 
   return false;
-- 
GitLab


From ae40630a7b6c1932bf5a3c0a66265afb52a402dc Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.menezes@samsung.com>
Date: Tue, 16 Oct 2018 17:19:51 +0000
Subject: [PATCH 0252/1116] [NFC][ARM] Refactor macro fusion

Simplify code for wildcards.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344625 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/ARM/ARMMacroFusion.cpp | 24 +++++-------------------
 1 file changed, 5 insertions(+), 19 deletions(-)

diff --git a/lib/Target/ARM/ARMMacroFusion.cpp b/lib/Target/ARM/ARMMacroFusion.cpp
index d11fe9d5c50..df1da9d8e47 100644
--- a/lib/Target/ARM/ARMMacroFusion.cpp
+++ b/lib/Target/ARM/ARMMacroFusion.cpp
@@ -23,20 +23,13 @@ namespace llvm {
 static bool isAESPair(const MachineInstr *FirstMI,
                       const MachineInstr &SecondMI) {
   // Assume the 1st instr to be a wildcard if it is unspecified.
-  unsigned FirstOpcode =
-      FirstMI ? FirstMI->getOpcode()
-              : static_cast<unsigned>(ARM::INSTRUCTION_LIST_END);
-  unsigned SecondOpcode = SecondMI.getOpcode();
-
-  switch(SecondOpcode) {
+  switch(SecondMI.getOpcode()) {
   // AES encode.
   case ARM::AESMC :
-    return FirstOpcode == ARM::AESE ||
-           FirstOpcode == ARM::INSTRUCTION_LIST_END;
+    return FirstMI == nullptr || FirstMI->getOpcode() == ARM::AESE;
   // AES decode.
   case ARM::AESIMC:
-    return FirstOpcode == ARM::AESD ||
-           FirstOpcode == ARM::INSTRUCTION_LIST_END;
+    return FirstMI == nullptr || FirstMI->getOpcode() == ARM::AESD;
   }
 
   return false;
@@ -46,15 +39,8 @@ static bool isAESPair(const MachineInstr *FirstMI,
 static bool isLiteralsPair(const MachineInstr *FirstMI,
                            const MachineInstr &SecondMI) {
   // Assume the 1st instr to be a wildcard if it is unspecified.
-  unsigned FirstOpcode =
-      FirstMI ? FirstMI->getOpcode()
-              : static_cast<unsigned>(ARM::INSTRUCTION_LIST_END);
-  unsigned SecondOpcode = SecondMI.getOpcode();
-
-  // 32 bit immediate.
-  if ((FirstOpcode == ARM::INSTRUCTION_LIST_END ||
-       FirstOpcode == ARM::MOVi16) &&
-      SecondOpcode == ARM::MOVTi16)
+  if ((FirstMI == nullptr || FirstMI->getOpcode() == ARM::MOVi16) &&
+      SecondMI.getOpcode() == ARM::MOVTi16)
     return true;
 
   return false;
-- 
GitLab


From b325eb110cae91331ba16fabe7517790f4b4f545 Mon Sep 17 00:00:00 2001
From: Leonard Chan <leonardchan@google.com>
Date: Tue, 16 Oct 2018 17:35:41 +0000
Subject: [PATCH 0253/1116] [Intrinsic] Signed Saturation Addition Intrinsic

Add an intrinsic that takes 2 integers and perform saturation addition on them.

This is a part of implementing fixed point arithmetic in clang where some of
the more complex operations will be implemented as intrinsics.

Differential Revision: https://reviews.llvm.org/D53053

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344629 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/ISDOpcodes.h             |   8 +
 include/llvm/CodeGen/TargetLowering.h         |   5 +
 include/llvm/IR/Intrinsics.td                 |   6 +
 include/llvm/Target/TargetSelectionDAG.td     |   2 +
 lib/CodeGen/SelectionDAG/LegalizeDAG.cpp      |   8 +
 .../SelectionDAG/LegalizeIntegerTypes.cpp     |  39 +++
 lib/CodeGen/SelectionDAG/LegalizeTypes.h      |   2 +
 .../SelectionDAG/LegalizeVectorOps.cpp        |   1 +
 .../SelectionDAG/LegalizeVectorTypes.cpp      |   3 +
 .../SelectionDAG/SelectionDAGBuilder.cpp      |   6 +
 .../SelectionDAG/SelectionDAGDumper.cpp       |   2 +
 lib/CodeGen/SelectionDAG/TargetLowering.cpp   |  43 +++
 lib/CodeGen/TargetLoweringBase.cpp            |   1 +
 lib/IR/Verifier.cpp                           |   9 +
 test/CodeGen/X86/sadd_sat.ll                  | 267 ++++++++++++++++++
 15 files changed, 402 insertions(+)
 create mode 100644 test/CodeGen/X86/sadd_sat.ll

diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h
index d9a513fe247..535fc4f0bf4 100644
--- a/include/llvm/CodeGen/ISDOpcodes.h
+++ b/include/llvm/CodeGen/ISDOpcodes.h
@@ -256,6 +256,14 @@ namespace ISD {
     /// Same for multiplication.
     SMULO, UMULO,
 
+    /// RESULT = SADDSAT(LHS, RHS) - Perform signed saturation addition on 2
+    /// integers with the same bit width (W). If the true value of LHS + RHS
+    /// exceeds the largest signed value that can be represented by W bits, the
+    /// resulting value is this maximum value. Otherwise, if this value is less
+    /// than the smallest signed value that can be represented by W bits, the
+    /// resulting value is this minimum value.
+    SADDSAT,
+
     /// Simple binary floating point operators.
     FADD, FSUB, FMUL, FDIV, FREM,
 
diff --git a/include/llvm/CodeGen/TargetLowering.h b/include/llvm/CodeGen/TargetLowering.h
index a5939070476..d22f707d259 100644
--- a/include/llvm/CodeGen/TargetLowering.h
+++ b/include/llvm/CodeGen/TargetLowering.h
@@ -3681,6 +3681,11 @@ public:
   SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT,
                                   SDValue Index) const;
 
+  /// Method for building the DAG expansion of ISD::SADDSAT. This method accepts
+  /// integers or vectors of integers as its arguments.
+  SDValue getExpandedSignedSaturationAddition(SDNode *Node,
+                                              SelectionDAG &DAG) const;
+
   //===--------------------------------------------------------------------===//
   // Instruction Emitting Hooks
   //
diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td
index 410e35f9acb..978f471f7ea 100644
--- a/include/llvm/IR/Intrinsics.td
+++ b/include/llvm/IR/Intrinsics.td
@@ -708,6 +708,12 @@ def int_umul_with_overflow : Intrinsic<[llvm_anyint_ty, llvm_i1_ty],
                                        [LLVMMatchType<0>, LLVMMatchType<0>],
                                        [IntrNoMem, IntrSpeculatable]>;
 
+//===------------------------- Fixed Point Intrinsics ---------------------===//
+//
+def int_sadd_sat : Intrinsic<[llvm_anyint_ty],
+                             [LLVMMatchType<0>, LLVMMatchType<0>],
+                             [IntrNoMem, IntrSpeculatable, Commutative]>;
+
 //===------------------------- Memory Use Markers -------------------------===//
 //
 def int_lifetime_start  : Intrinsic<[],
diff --git a/include/llvm/Target/TargetSelectionDAG.td b/include/llvm/Target/TargetSelectionDAG.td
index 4e463b9281d..1ea370d39e9 100644
--- a/include/llvm/Target/TargetSelectionDAG.td
+++ b/include/llvm/Target/TargetSelectionDAG.td
@@ -373,6 +373,8 @@ def umin       : SDNode<"ISD::UMIN"      , SDTIntBinOp,
 def umax       : SDNode<"ISD::UMAX"      , SDTIntBinOp,
                                   [SDNPCommutative, SDNPAssociative]>;
 
+def saddsat    : SDNode<"ISD::SADDSAT"   , SDTIntBinOp, [SDNPCommutative]>;
+
 def sext_inreg : SDNode<"ISD::SIGN_EXTEND_INREG", SDTExtInreg>;
 def sext_invec : SDNode<"ISD::SIGN_EXTEND_VECTOR_INREG", SDTExtInvec>;
 def zext_invec : SDNode<"ISD::ZERO_EXTEND_VECTOR_INREG", SDTExtInvec>;
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 07a37a5092a..71d124c74ce 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1115,6 +1115,10 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
     Action = TLI.getStrictFPOperationAction(Node->getOpcode(),
                                             Node->getValueType(0));
     break;
+  case ISD::SADDSAT: {
+    Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
+    break;
+  }
   case ISD::MSCATTER:
     Action = TLI.getOperationAction(Node->getOpcode(),
                     cast<MaskedScatterSDNode>(Node)->getValue().getValueType());
@@ -3451,6 +3455,10 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     }
     break;
   }
+  case ISD::SADDSAT: {
+    Results.push_back(TLI.getExpandedSignedSaturationAddition(Node, DAG));
+    break;
+  }
   case ISD::SADDO:
   case ISD::SSUBO: {
     SDValue LHS = Node->getOperand(0);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 064e9e5875b..fffebaf194e 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -141,6 +141,8 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
   case ISD::ADDCARRY:
   case ISD::SUBCARRY:    Res = PromoteIntRes_ADDSUBCARRY(N, ResNo); break;
 
+  case ISD::SADDSAT:     Res = PromoteIntRes_SADDSAT(N); break;
+
   case ISD::ATOMIC_LOAD:
     Res = PromoteIntRes_Atomic0(cast<AtomicSDNode>(N)); break;
 
@@ -546,6 +548,35 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Overflow(SDNode *N) {
   return SDValue(Res.getNode(), 1);
 }
 
+SDValue DAGTypeLegalizer::PromoteIntRes_SADDSAT(SDNode *N) {
+  // For promoting iN -> iM, this can be expanded by
+  // 1. ANY_EXTEND iN to iM
+  // 2. SHL by M-N
+  // 3. SADDSAT
+  // 4. ASHR by M-N
+  SDLoc dl(N);
+  SDValue Op1 = N->getOperand(0);
+  SDValue Op2 = N->getOperand(1);
+  unsigned OldBits = Op1.getValueSizeInBits();
+
+  SDValue Op1Promoted = GetPromotedInteger(Op1);
+  SDValue Op2Promoted = GetPromotedInteger(Op2);
+
+  EVT PromotedType = Op1Promoted.getValueType();
+  unsigned NewBits = Op1Promoted.getValueSizeInBits();
+  unsigned SHLAmount = NewBits - OldBits;
+  EVT SHVT = TLI.getShiftAmountTy(PromotedType, DAG.getDataLayout());
+  SDValue ShiftAmount = DAG.getConstant(SHLAmount, dl, SHVT);
+  Op1Promoted =
+      DAG.getNode(ISD::SHL, dl, PromotedType, Op1Promoted, ShiftAmount);
+  Op2Promoted =
+      DAG.getNode(ISD::SHL, dl, PromotedType, Op2Promoted, ShiftAmount);
+
+  SDValue Result =
+      DAG.getNode(ISD::SADDSAT, dl, PromotedType, Op1Promoted, Op2Promoted);
+  return DAG.getNode(ISD::SRA, dl, PromotedType, Result, ShiftAmount);
+}
+
 SDValue DAGTypeLegalizer::PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo) {
   if (ResNo == 1)
     return PromoteIntRes_Overflow(N);
@@ -1466,6 +1497,8 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
   case ISD::USUBO: ExpandIntRes_UADDSUBO(N, Lo, Hi); break;
   case ISD::UMULO:
   case ISD::SMULO: ExpandIntRes_XMULO(N, Lo, Hi); break;
+
+  case ISD::SADDSAT: ExpandIntRes_SADDSAT(N, Lo, Hi); break;
   }
 
   // If Lo/Hi is null, the sub-method took care of registering results etc.
@@ -2428,6 +2461,12 @@ void DAGTypeLegalizer::ExpandIntRes_READCYCLECOUNTER(SDNode *N, SDValue &Lo,
   ReplaceValueWith(SDValue(N, 1), R.getValue(2));
 }
 
+void DAGTypeLegalizer::ExpandIntRes_SADDSAT(SDNode *N, SDValue &Lo,
+                                            SDValue &Hi) {
+  SDValue Result = TLI.getExpandedSignedSaturationAddition(N, DAG);
+  SplitInteger(Result, Lo, Hi);
+}
+
 void DAGTypeLegalizer::ExpandIntRes_SADDSUBO(SDNode *Node,
                                              SDValue &Lo, SDValue &Hi) {
   SDValue LHS = Node->getOperand(0);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 3c93563440b..83429ec6e98 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -330,6 +330,7 @@ private:
   SDValue PromoteIntRes_UNDEF(SDNode *N);
   SDValue PromoteIntRes_VAARG(SDNode *N);
   SDValue PromoteIntRes_XMULO(SDNode *N, unsigned ResNo);
+  SDValue PromoteIntRes_SADDSAT(SDNode *N);
 
   // Integer Operand Promotion.
   bool PromoteIntegerOperand(SDNode *N, unsigned OpNo);
@@ -414,6 +415,7 @@ private:
   void ExpandIntRes_SADDSUBO          (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_UADDSUBO          (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_XMULO             (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_SADDSAT           (SDNode *N, SDValue &Lo, SDValue &Hi);
 
   void ExpandIntRes_ATOMIC_LOAD       (SDNode *N, SDValue &Lo, SDValue &Hi);
 
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 58d86e8e52e..2c1a4942f68 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -386,6 +386,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
   case ISD::SMUL_LOHI:
   case ISD::UMUL_LOHI:
   case ISD::FCANONICALIZE:
+  case ISD::SADDSAT:
     Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
     break;
   case ISD::FP_ROUND_INREG:
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index a08a41ccaf2..8d00b3249d1 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -120,6 +120,8 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::UMIN:
   case ISD::UMAX:
 
+  case ISD::SADDSAT:
+
   case ISD::FPOW:
   case ISD::FREM:
   case ISD::FSUB:
@@ -800,6 +802,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::SMAX:
   case ISD::UMIN:
   case ISD::UMAX:
+  case ISD::SADDSAT:
     SplitVecRes_BinOp(N, Lo, Hi);
     break;
   case ISD::FMA:
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 3907f647142..2e0456edef7 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5771,6 +5771,12 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     setValue(&I, DAG.getSelect(sdl, VT, IsZeroShift, IsFSHL ? X : Y, Or));
     return nullptr;
   }
+  case Intrinsic::sadd_sat: {
+    SDValue Op1 = getValue(I.getArgOperand(0));
+    SDValue Op2 = getValue(I.getArgOperand(1));
+    setValue(&I, DAG.getNode(ISD::SADDSAT, sdl, Op1.getValueType(), Op1, Op2));
+    return nullptr;
+  }
   case Intrinsic::stacksave: {
     SDValue Op = getRoot();
     Res = DAG.getNode(
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 594a587e412..9967f0eba10 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -282,6 +282,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   case ISD::SRA_PARTS:                  return "sra_parts";
   case ISD::SRL_PARTS:                  return "srl_parts";
 
+  case ISD::SADDSAT:                    return "saddsat";
+
   // Conversion operators.
   case ISD::SIGN_EXTEND:                return "sign_extend";
   case ISD::ZERO_EXTEND:                return "zero_extend";
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 150d22cffa7..b9b99b386af 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -4651,3 +4651,46 @@ SDValue TargetLowering::lowerCmpEqZeroToCtlzSrl(SDValue Op,
   }
   return SDValue();
 }
+
+SDValue
+TargetLowering::getExpandedSignedSaturationAddition(SDNode *Node,
+                                                    SelectionDAG &DAG) const {
+  assert(Node->getOpcode() == ISD::SADDSAT &&
+         "Expected method to receive SADDSAT node.");
+  assert(Node->getNumOperands() == 2 &&
+         "Expected SADDSAT node to have 2 operands.");
+
+  SDLoc dl(Node);
+  SDValue LHS = Node->getOperand(0);
+  SDValue RHS = Node->getOperand(1);
+  assert(LHS.getValueType().isScalarInteger() &&
+         "Expected operands to be integers. Vector of int arguments should "
+         "already be unrolled.");
+  assert(RHS.getValueType().isScalarInteger() &&
+         "Expected operands to be integers. Vector of int arguments should "
+         "already be unrolled.");
+  assert(LHS.getValueType() == RHS.getValueType() &&
+         "Expected both operands of SADDSAT to be the same type");
+
+  unsigned BitWidth = LHS.getValueSizeInBits();
+  EVT ResultType = LHS.getValueType();
+  EVT BoolVT =
+      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), ResultType);
+  SDValue Result =
+      DAG.getNode(ISD::SADDO, dl, DAG.getVTList(ResultType, BoolVT), LHS, RHS);
+  SDValue Sum = Result.getValue(0);
+  SDValue Overflow = Result.getValue(1);
+
+  // SatMax -> Overflow && Sum < 0
+  // SatMin -> Overflow && Sum > 0
+  SDValue Zero = DAG.getConstant(0, dl, LHS.getValueType());
+
+  SDValue SumNeg = DAG.getSetCC(dl, BoolVT, Sum, Zero, ISD::SETLT);
+  APInt MinVal = APInt::getSignedMinValue(BitWidth);
+  APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
+  SDValue SatMin = DAG.getConstant(MinVal, dl, ResultType);
+  SDValue SatMax = DAG.getConstant(MaxVal, dl, ResultType);
+
+  Result = DAG.getSelect(dl, ResultType, SumNeg, SatMax, SatMin);
+  return DAG.getSelect(dl, ResultType, Overflow, Result, Sum);
+}
diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp
index b785fdc42a3..03a29a3edf6 100644
--- a/lib/CodeGen/TargetLoweringBase.cpp
+++ b/lib/CodeGen/TargetLoweringBase.cpp
@@ -608,6 +608,7 @@ void TargetLoweringBase::initActions() {
     setOperationAction(ISD::UMIN, VT, Expand);
     setOperationAction(ISD::UMAX, VT, Expand);
     setOperationAction(ISD::ABS, VT, Expand);
+    setOperationAction(ISD::SADDSAT, VT, Expand);
 
     // Overflow operations default to expand
     setOperationAction(ISD::SADDO, VT, Expand);
diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp
index 6e0bb5ad358..dc6c1f663d6 100644
--- a/lib/IR/Verifier.cpp
+++ b/lib/IR/Verifier.cpp
@@ -4474,6 +4474,15 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) {
 
     break;
   }
+  case Intrinsic::sadd_sat: {
+    Value *Op1 = CS.getArgOperand(0);
+    Value *Op2 = CS.getArgOperand(1);
+    Assert(Op1->getType()->isIntOrIntVectorTy(),
+           "first operand of sadd_sat must be an int type or vector of ints");
+    Assert(Op2->getType()->isIntOrIntVectorTy(),
+           "second operand of sadd_sat must be an int type or vector of ints");
+    break;
+  }
   };
 }
 
diff --git a/test/CodeGen/X86/sadd_sat.ll b/test/CodeGen/X86/sadd_sat.ll
new file mode 100644
index 00000000000..39788e86cc7
--- /dev/null
+++ b/test/CodeGen/X86/sadd_sat.ll
@@ -0,0 +1,267 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux | FileCheck %s
+; RUN: llc < %s -mcpu=generic -mtriple=i686 -mattr=cmov | FileCheck %s --check-prefix=CHECK32
+
+declare  i4  @llvm.sadd.sat.i4   (i4,  i4)
+declare  i32 @llvm.sadd.sat.i32  (i32, i32)
+declare  i64 @llvm.sadd.sat.i64  (i64, i64)
+declare  <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32>, <4 x i32>)
+
+define i32 @func(i32 %x, i32 %y) {
+; CHECK-LABEL: func:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    movl %edi, %ecx
+; CHECK-NEXT:    addl %esi, %ecx
+; CHECK-NEXT:    setns %al
+; CHECK-NEXT:    addl $2147483647, %eax # imm = 0x7FFFFFFF
+; CHECK-NEXT:    addl %esi, %edi
+; CHECK-NEXT:    cmovnol %edi, %eax
+; CHECK-NEXT:    retq
+;
+; CHECK32-LABEL: func:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    pushl %esi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 8
+; CHECK32-NEXT:    .cfi_offset %esi, -8
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK32-NEXT:    xorl %ecx, %ecx
+; CHECK32-NEXT:    movl %eax, %esi
+; CHECK32-NEXT:    addl %edx, %esi
+; CHECK32-NEXT:    setns %cl
+; CHECK32-NEXT:    addl $2147483647, %ecx # imm = 0x7FFFFFFF
+; CHECK32-NEXT:    addl %edx, %eax
+; CHECK32-NEXT:    cmovol %ecx, %eax
+; CHECK32-NEXT:    popl %esi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 4
+; CHECK32-NEXT:    retl
+  %tmp = call i32 @llvm.sadd.sat.i32(i32 %x, i32 %y);
+  ret i32 %tmp;
+}
+
+define i64 @func2(i64 %x, i64 %y) {
+; CHECK-LABEL: func2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    addq %rsi, %rax
+; CHECK-NEXT:    setns %cl
+; CHECK-NEXT:    movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
+; CHECK-NEXT:    addq %rcx, %rax
+; CHECK-NEXT:    addq %rsi, %rdi
+; CHECK-NEXT:    cmovnoq %rdi, %rax
+; CHECK-NEXT:    retq
+;
+; CHECK32-LABEL: func2:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    pushl %ebp
+; CHECK32-NEXT:    .cfi_def_cfa_offset 8
+; CHECK32-NEXT:    pushl %ebx
+; CHECK32-NEXT:    .cfi_def_cfa_offset 12
+; CHECK32-NEXT:    pushl %edi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK32-NEXT:    pushl %esi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 20
+; CHECK32-NEXT:    .cfi_offset %esi, -20
+; CHECK32-NEXT:    .cfi_offset %edi, -16
+; CHECK32-NEXT:    .cfi_offset %ebx, -12
+; CHECK32-NEXT:    .cfi_offset %ebp, -8
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; CHECK32-NEXT:    addl {{[0-9]+}}(%esp), %edi
+; CHECK32-NEXT:    movl %ebx, %ebp
+; CHECK32-NEXT:    adcl %esi, %ebp
+; CHECK32-NEXT:    movl %ebp, %eax
+; CHECK32-NEXT:    sarl $31, %eax
+; CHECK32-NEXT:    xorl %ecx, %ecx
+; CHECK32-NEXT:    testl %ebp, %ebp
+; CHECK32-NEXT:    setns %cl
+; CHECK32-NEXT:    movl %ecx, %edx
+; CHECK32-NEXT:    addl $2147483647, %edx # imm = 0x7FFFFFFF
+; CHECK32-NEXT:    testl %ebx, %ebx
+; CHECK32-NEXT:    setns %bl
+; CHECK32-NEXT:    cmpb %cl, %bl
+; CHECK32-NEXT:    setne %cl
+; CHECK32-NEXT:    testl %esi, %esi
+; CHECK32-NEXT:    setns %ch
+; CHECK32-NEXT:    cmpb %ch, %bl
+; CHECK32-NEXT:    sete %ch
+; CHECK32-NEXT:    testb %cl, %ch
+; CHECK32-NEXT:    cmovel %ebp, %edx
+; CHECK32-NEXT:    cmovel %edi, %eax
+; CHECK32-NEXT:    popl %esi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK32-NEXT:    popl %edi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 12
+; CHECK32-NEXT:    popl %ebx
+; CHECK32-NEXT:    .cfi_def_cfa_offset 8
+; CHECK32-NEXT:    popl %ebp
+; CHECK32-NEXT:    .cfi_def_cfa_offset 4
+; CHECK32-NEXT:    retl
+  %tmp = call i64 @llvm.sadd.sat.i64(i64 %x, i64 %y);
+  ret i64 %tmp;
+}
+
+define i4 @func3(i4 %x, i4 %y) {
+; CHECK-LABEL: func3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    shlb $4, %sil
+; CHECK-NEXT:    shlb $4, %al
+; CHECK-NEXT:    movl %eax, %ecx
+; CHECK-NEXT:    addb %sil, %cl
+; CHECK-NEXT:    setns %cl
+; CHECK-NEXT:    addb %sil, %al
+; CHECK-NEXT:    jno .LBB2_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    addb $127, %cl
+; CHECK-NEXT:    movl %ecx, %eax
+; CHECK-NEXT:  .LBB2_2:
+; CHECK-NEXT:    sarb $4, %al
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    retq
+;
+; CHECK32-LABEL: func3:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; CHECK32-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; CHECK32-NEXT:    shlb $4, %dl
+; CHECK32-NEXT:    shlb $4, %al
+; CHECK32-NEXT:    movl %eax, %ecx
+; CHECK32-NEXT:    addb %dl, %cl
+; CHECK32-NEXT:    setns %cl
+; CHECK32-NEXT:    addb %dl, %al
+; CHECK32-NEXT:    jno .LBB2_2
+; CHECK32-NEXT:  # %bb.1:
+; CHECK32-NEXT:    addb $127, %cl
+; CHECK32-NEXT:    movl %ecx, %eax
+; CHECK32-NEXT:  .LBB2_2:
+; CHECK32-NEXT:    sarb $4, %al
+; CHECK32-NEXT:    retl
+  %tmp = call i4 @llvm.sadd.sat.i4(i4 %x, i4 %y);
+  ret i4 %tmp;
+}
+
+define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vec:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
+; CHECK-NEXT:    movd %xmm2, %ecx
+; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; CHECK-NEXT:    movd %xmm2, %r8d
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    movl %r8d, %esi
+; CHECK-NEXT:    addl %ecx, %esi
+; CHECK-NEXT:    setns %dl
+; CHECK-NEXT:    addl $2147483647, %edx # imm = 0x7FFFFFFF
+; CHECK-NEXT:    addl %ecx, %r8d
+; CHECK-NEXT:    cmovol %edx, %r8d
+; CHECK-NEXT:    movd %xmm1, %edx
+; CHECK-NEXT:    movd %xmm0, %ecx
+; CHECK-NEXT:    xorl %esi, %esi
+; CHECK-NEXT:    movl %ecx, %edi
+; CHECK-NEXT:    addl %edx, %edi
+; CHECK-NEXT:    setns %sil
+; CHECK-NEXT:    addl $2147483647, %esi # imm = 0x7FFFFFFF
+; CHECK-NEXT:    addl %edx, %ecx
+; CHECK-NEXT:    cmovol %esi, %ecx
+; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; CHECK-NEXT:    movd %xmm2, %edx
+; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; CHECK-NEXT:    movd %xmm2, %eax
+; CHECK-NEXT:    xorl %edi, %edi
+; CHECK-NEXT:    movl %eax, %esi
+; CHECK-NEXT:    addl %edx, %esi
+; CHECK-NEXT:    setns %dil
+; CHECK-NEXT:    addl $2147483647, %edi # imm = 0x7FFFFFFF
+; CHECK-NEXT:    addl %edx, %eax
+; CHECK-NEXT:    cmovol %edi, %eax
+; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; CHECK-NEXT:    movd %xmm1, %r9d
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; CHECK-NEXT:    movd %xmm0, %edx
+; CHECK-NEXT:    xorl %edi, %edi
+; CHECK-NEXT:    movl %edx, %esi
+; CHECK-NEXT:    addl %r9d, %esi
+; CHECK-NEXT:    setns %dil
+; CHECK-NEXT:    addl $2147483647, %edi # imm = 0x7FFFFFFF
+; CHECK-NEXT:    addl %r9d, %edx
+; CHECK-NEXT:    cmovol %edi, %edx
+; CHECK-NEXT:    movd %edx, %xmm0
+; CHECK-NEXT:    movd %eax, %xmm1
+; CHECK-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    movd %ecx, %xmm0
+; CHECK-NEXT:    movd %r8d, %xmm2
+; CHECK-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; CHECK-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    retq
+;
+; CHECK32-LABEL: vec:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    pushl %ebp
+; CHECK32-NEXT:    .cfi_def_cfa_offset 8
+; CHECK32-NEXT:    pushl %ebx
+; CHECK32-NEXT:    .cfi_def_cfa_offset 12
+; CHECK32-NEXT:    pushl %edi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK32-NEXT:    pushl %esi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 20
+; CHECK32-NEXT:    .cfi_offset %esi, -20
+; CHECK32-NEXT:    .cfi_offset %edi, -16
+; CHECK32-NEXT:    .cfi_offset %ebx, -12
+; CHECK32-NEXT:    .cfi_offset %ebp, -8
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK32-NEXT:    xorl %eax, %eax
+; CHECK32-NEXT:    movl %ecx, %esi
+; CHECK32-NEXT:    addl %edx, %esi
+; CHECK32-NEXT:    setns %al
+; CHECK32-NEXT:    addl $2147483647, %eax # imm = 0x7FFFFFFF
+; CHECK32-NEXT:    addl %edx, %ecx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK32-NEXT:    cmovol %eax, %ecx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK32-NEXT:    xorl %eax, %eax
+; CHECK32-NEXT:    movl %edx, %edi
+; CHECK32-NEXT:    addl %esi, %edi
+; CHECK32-NEXT:    setns %al
+; CHECK32-NEXT:    addl $2147483647, %eax # imm = 0x7FFFFFFF
+; CHECK32-NEXT:    addl %esi, %edx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK32-NEXT:    cmovol %eax, %edx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; CHECK32-NEXT:    xorl %eax, %eax
+; CHECK32-NEXT:    movl %esi, %ebx
+; CHECK32-NEXT:    addl %edi, %ebx
+; CHECK32-NEXT:    setns %al
+; CHECK32-NEXT:    addl $2147483647, %eax # imm = 0x7FFFFFFF
+; CHECK32-NEXT:    addl %edi, %esi
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; CHECK32-NEXT:    cmovol %eax, %esi
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    xorl %ebx, %ebx
+; CHECK32-NEXT:    movl %edi, %ebp
+; CHECK32-NEXT:    addl %eax, %ebp
+; CHECK32-NEXT:    setns %bl
+; CHECK32-NEXT:    addl $2147483647, %ebx # imm = 0x7FFFFFFF
+; CHECK32-NEXT:    addl %eax, %edi
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    cmovol %ebx, %edi
+; CHECK32-NEXT:    movl %ecx, 12(%eax)
+; CHECK32-NEXT:    movl %edx, 8(%eax)
+; CHECK32-NEXT:    movl %esi, 4(%eax)
+; CHECK32-NEXT:    movl %edi, (%eax)
+; CHECK32-NEXT:    popl %esi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK32-NEXT:    popl %edi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 12
+; CHECK32-NEXT:    popl %ebx
+; CHECK32-NEXT:    .cfi_def_cfa_offset 8
+; CHECK32-NEXT:    popl %ebp
+; CHECK32-NEXT:    .cfi_def_cfa_offset 4
+; CHECK32-NEXT:    retl $4
+  %tmp = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %x, <4 x i32> %y);
+  ret <4 x i32> %tmp;
+}
-- 
GitLab


From db949a772162f6002928f2113b66b04238df06ae Mon Sep 17 00:00:00 2001
From: Teresa Johnson <tejohnson@google.com>
Date: Tue, 16 Oct 2018 17:37:45 +0000
Subject: [PATCH 0254/1116] [LTO] Call InitLLVM from llvm-lto2

Summary:
D45602 added this to most tools, including llvm-lto, but not to
llvm-lto2. Add it there and test that it works in both lto tools.

Reviewers: ruiu

Subscribers: mehdi_amini, inglorion, eraman, steven_wu, dexonsmith, llvm-commits

Differential Revision: https://reviews.llvm.org/D53330

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344631 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/ThinLTO/X86/deadstrip.ll | 8 +++++---
 tools/llvm-lto2/llvm-lto2.cpp | 2 ++
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/test/ThinLTO/X86/deadstrip.ll b/test/ThinLTO/X86/deadstrip.ll
index ed52222f43a..76dc09e8a9e 100644
--- a/test/ThinLTO/X86/deadstrip.ll
+++ b/test/ThinLTO/X86/deadstrip.ll
@@ -8,10 +8,10 @@
 ; RUN: llvm-lto -exported-symbol=_main -thinlto-action=promote %t1.bc -thinlto-index=%t.index.bc -o - | llvm-lto -exported-symbol=_main -thinlto-action=internalize -thinlto-index %t.index.bc -thinlto-module-id=%t1.bc - -o - | llvm-dis -o - | FileCheck %s
 ; RUN: llvm-lto -exported-symbol=_main -thinlto-action=promote %t2.bc -thinlto-index=%t.index.bc -o - | llvm-lto -exported-symbol=_main -thinlto-action=internalize -thinlto-index %t.index.bc -thinlto-module-id=%t2.bc - -o - | llvm-dis -o - | FileCheck %s --check-prefix=CHECK2
 
-; RUN: llvm-lto -exported-symbol=_main -thinlto-action=run %t1.bc %t2.bc
+; RUN: llvm-lto -exported-symbol=_main -thinlto-action=run -stats %t1.bc %t2.bc 2>&1 | FileCheck %s --check-prefix=STATS
 ; RUN: llvm-nm %t1.bc.thinlto.o | FileCheck %s --check-prefix=CHECK-NM
 
-; RUN: llvm-lto2 run %t1.bc %t2.bc -o %t.out -save-temps \
+; RUN: llvm-lto2 run %t1.bc %t2.bc -o %t.out -save-temps -stats \
 ; RUN:   -r %t1.bc,_main,plx \
 ; RUN:   -r %t1.bc,_bar,pl \
 ; RUN:   -r %t1.bc,_dead_func,pl \
@@ -25,7 +25,7 @@
 ; RUN:   -r %t2.bc,_dead_func,l \
 ; RUN:   -r %t2.bc,_another_dead_func,pl \
 ; RUN:   -thinlto-threads=1 \
-; RUN:	 -debug-only=function-import 2>&1 | FileCheck %s --check-prefix=DEBUG
+; RUN:	 -debug-only=function-import 2>&1 | FileCheck %s --check-prefix=DEBUG --check-prefix=STATS
 ; RUN: llvm-dis < %t.out.1.3.import.bc | FileCheck %s --check-prefix=LTO2
 ; RUN: llvm-dis < %t.out.2.3.import.bc | FileCheck %s --check-prefix=LTO2-CHECK2
 ; RUN: llvm-nm %t.out.1 | FileCheck %s --check-prefix=CHECK2-NM
@@ -89,6 +89,8 @@
 ; DEBUG-DAG: Initialize import for 15611644523426561710 (boo)
 ; DEBUG-DAG: Ignores Dead GUID: 2384416018110111308 (another_dead_func)
 
+; STATS: 3 function-import  - Number of dead stripped symbols in index
+
 ; Next test the case where Inputs/deadstrip.ll does not get a module index,
 ; which will cause it to be handled by regular LTO in the new LTO API.
 ; In that case there are uses of @dead_func in the regular LTO partition
diff --git a/tools/llvm-lto2/llvm-lto2.cpp b/tools/llvm-lto2/llvm-lto2.cpp
index 442973f9020..26426367e25 100644
--- a/tools/llvm-lto2/llvm-lto2.cpp
+++ b/tools/llvm-lto2/llvm-lto2.cpp
@@ -23,6 +23,7 @@
 #include "llvm/LTO/LTO.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FileSystem.h"
+#include "llvm/Support/InitLLVM.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Support/Threading.h"
 
@@ -388,6 +389,7 @@ static int dumpSymtab(int argc, char **argv) {
 }
 
 int main(int argc, char **argv) {
+  InitLLVM X(argc, argv);
   InitializeAllTargets();
   InitializeAllTargetMCs();
   InitializeAllAsmPrinters();
-- 
GitLab


From 93dff1efc9f7b5e5392a48fa1e76a774454df69e Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.menezes@samsung.com>
Date: Tue, 16 Oct 2018 17:41:45 +0000
Subject: [PATCH 0255/1116] [PATCH] [NFC][AArch64] Fix refactoring of macro
 fusion

Fix compiler error.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344632 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64MacroFusion.cpp | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/lib/Target/AArch64/AArch64MacroFusion.cpp b/lib/Target/AArch64/AArch64MacroFusion.cpp
index 3d1ca7c4804..fb8a339dc4d 100644
--- a/lib/Target/AArch64/AArch64MacroFusion.cpp
+++ b/lib/Target/AArch64/AArch64MacroFusion.cpp
@@ -64,15 +64,11 @@ static bool isArithmeticBccPair(const MachineInstr *FirstMI,
 /// ALU operations followed by CBZ/CBNZ.
 static bool isArithmeticCbzPair(const MachineInstr *FirstMI,
                                 const MachineInstr &SecondMI) {
-  switch (SecondMI.getOpcode()) {
-  default:
+  if (SecondMI.getOpcode() != AArch64::CBZW &&
+      SecondMI.getOpcode() != AArch64::CBZX &&
+      SecondMI.getOpcode() != AArch64::CBNZW &&
+      SecondMI.getOpcode() != AArch64::CBNZX)
     return false;
-  case AArch64::CBNZW:
-  case AArch64::CBNZX:
-  case AArch64::CBZW:
-  case AArch64::CBZX:
-    LLVM_FALLTHROUGH;
-  }
 
   // Assume the 1st instr to be a wildcard if it is unspecified.
   if (FirstMI == nullptr)
-- 
GitLab


From 4998e62d5745cca132cf92cec718be0746e70bcf Mon Sep 17 00:00:00 2001
From: Krasimir Georgiev <krasimir@google.com>
Date: Tue, 16 Oct 2018 18:50:09 +0000
Subject: [PATCH 0256/1116] Revert "[WebAssembly] LSDA info generation"

This reverts commit r344575.
Newly introduced test eh-lsda.ll.test fails with use-after-free under
ASAN build.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344639 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/MachineFunction.h        |  22 +-
 include/llvm/IR/IntrinsicsWebAssembly.td      |   3 +-
 lib/CodeGen/AsmPrinter/AsmPrinter.cpp         |   3 +-
 lib/CodeGen/AsmPrinter/CMakeLists.txt         |   1 -
 lib/CodeGen/AsmPrinter/EHStreamer.cpp         |  10 +-
 lib/CodeGen/AsmPrinter/EHStreamer.h           |  11 +-
 lib/CodeGen/AsmPrinter/WasmException.cpp      |  81 ------
 lib/CodeGen/AsmPrinter/WasmException.h        |  42 ---
 lib/CodeGen/MachineFunction.cpp               |  45 ++--
 .../SelectionDAG/SelectionDAGBuilder.cpp      |  10 +-
 lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp |  83 ++----
 lib/CodeGen/TargetLoweringObjectFileImpl.cpp  |   4 -
 lib/CodeGen/WasmEHPrepare.cpp                 |   2 +-
 lib/MC/MCObjectFileInfo.cpp                   |   6 -
 lib/MC/WasmObjectWriter.cpp                   |   4 +-
 .../WebAssembly/WebAssemblyISelLowering.cpp   |  15 +-
 .../WebAssembly/WebAssemblyInstrInfo.td       |   2 -
 .../WebAssembly/WebAssemblyMCInstLower.cpp    |   7 -
 test/CodeGen/WebAssembly/eh-lsda.ll           | 239 ------------------
 test/CodeGen/WebAssembly/wasmehprepare.ll     |   6 +-
 20 files changed, 67 insertions(+), 529 deletions(-)
 delete mode 100644 lib/CodeGen/AsmPrinter/WasmException.cpp
 delete mode 100644 lib/CodeGen/AsmPrinter/WasmException.h
 delete mode 100644 test/CodeGen/WebAssembly/eh-lsda.ll

diff --git a/include/llvm/CodeGen/MachineFunction.h b/include/llvm/CodeGen/MachineFunction.h
index bc81e485a80..7471b314846 100644
--- a/include/llvm/CodeGen/MachineFunction.h
+++ b/include/llvm/CodeGen/MachineFunction.h
@@ -316,9 +316,6 @@ class MachineFunction {
   /// Map a landing pad's EH symbol to the call site indexes.
   DenseMap<MCSymbol*, SmallVector<unsigned, 4>> LPadToCallSiteMap;
 
-  /// Map a landing pad to its index.
-  DenseMap<const MachineBasicBlock *, unsigned> WasmLPadToIndexMap;
-
   /// Map of invoke call site index values to associated begin EH_LABEL.
   DenseMap<MCSymbol*, unsigned> CallSiteMap;
 
@@ -813,8 +810,7 @@ public:
   LandingPadInfo &getOrCreateLandingPadInfo(MachineBasicBlock *LandingPad);
 
   /// Remap landing pad labels and remove any deleted landing pads.
-  void tidyLandingPads(DenseMap<MCSymbol *, uintptr_t> *LPMap = nullptr,
-                       bool TidyIfNoBeginLabels = true);
+  void tidyLandingPads(DenseMap<MCSymbol*, uintptr_t> *LPMap = nullptr);
 
   /// Return a reference to the landing pad info for the current function.
   const std::vector<LandingPadInfo> &getLandingPads() const {
@@ -857,22 +853,6 @@ public:
   /// Map the landing pad's EH symbol to the call site indexes.
   void setCallSiteLandingPad(MCSymbol *Sym, ArrayRef<unsigned> Sites);
 
-  /// Map the landing pad to its index. Used for Wasm exception handling.
-  void setWasmLandingPadIndex(const MachineBasicBlock *LPad, unsigned Index) {
-    WasmLPadToIndexMap[LPad] = Index;
-  }
-
-  /// Returns true if the landing pad has an associate index in wasm EH.
-  bool hasWasmLandingPadIndex(const MachineBasicBlock *LPad) const {
-    return WasmLPadToIndexMap.count(LPad);
-  }
-
-  /// Get the index in wasm EH for a given landing pad.
-  unsigned getWasmLandingPadIndex(const MachineBasicBlock *LPad) const {
-    assert(hasWasmLandingPadIndex(LPad));
-    return WasmLPadToIndexMap.lookup(LPad);
-  }
-
   /// Get the call site indexes for a landing pad EH symbol.
   SmallVectorImpl<unsigned> &getCallSiteLandingPad(MCSymbol *Sym) {
     assert(hasCallSiteLandingPad(Sym) &&
diff --git a/include/llvm/IR/IntrinsicsWebAssembly.td b/include/llvm/IR/IntrinsicsWebAssembly.td
index 9aa2a4ebeca..adf7cb0ba0e 100644
--- a/include/llvm/IR/IntrinsicsWebAssembly.td
+++ b/include/llvm/IR/IntrinsicsWebAssembly.td
@@ -71,8 +71,7 @@ def int_wasm_catch : Intrinsic<[llvm_ptr_ty], [llvm_i32_ty],
 // WebAssembly EH must maintain the landingpads in the order assigned to them
 // by WasmEHPrepare pass to generate landingpad table in EHStreamer. This is
 // used in order to give them the indices in WasmEHPrepare.
-def int_wasm_landingpad_index: Intrinsic<[], [llvm_token_ty, llvm_i32_ty],
-                                         [IntrNoMem]>;
+def int_wasm_landingpad_index: Intrinsic<[], [llvm_i32_ty], [IntrNoMem]>;
 
 // Returns LSDA address of the current function.
 def int_wasm_lsda : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>;
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 526f7ce3083..63c5b262edc 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -16,7 +16,6 @@
 #include "CodeViewDebug.h"
 #include "DwarfDebug.h"
 #include "DwarfException.h"
-#include "WasmException.h"
 #include "WinCFGuard.h"
 #include "WinException.h"
 #include "llvm/ADT/APFloat.h"
@@ -357,7 +356,7 @@ bool AsmPrinter::doInitialization(Module &M) {
     }
     break;
   case ExceptionHandling::Wasm:
-    ES = new WasmException(this);
+    // TODO to prevent warning
     break;
   }
   if (ES)
diff --git a/lib/CodeGen/AsmPrinter/CMakeLists.txt b/lib/CodeGen/AsmPrinter/CMakeLists.txt
index 3fb088ab6f0..6cba4a0d4b8 100644
--- a/lib/CodeGen/AsmPrinter/CMakeLists.txt
+++ b/lib/CodeGen/AsmPrinter/CMakeLists.txt
@@ -23,7 +23,6 @@ add_llvm_library(LLVMAsmPrinter
   WinCFGuard.cpp
   WinException.cpp
   CodeViewDebug.cpp
-  WasmException.cpp
 
   DEPENDS
   intrinsics_gen
diff --git a/lib/CodeGen/AsmPrinter/EHStreamer.cpp b/lib/CodeGen/AsmPrinter/EHStreamer.cpp
index 7599121de2b..be04b9a6e8c 100644
--- a/lib/CodeGen/AsmPrinter/EHStreamer.cpp
+++ b/lib/CodeGen/AsmPrinter/EHStreamer.cpp
@@ -345,9 +345,7 @@ computeCallSiteTable(SmallVectorImpl<CallSiteEntry> &CallSites,
 ///     unwound and handling continues.
 ///  3. Type ID table contains references to all the C++ typeinfo for all
 ///     catches in the function.  This tables is reverse indexed base 1.
-///
-/// Returns the starting symbol of an exception table.
-MCSymbol *EHStreamer::emitExceptionTable() {
+void EHStreamer::emitExceptionTable() {
   const MachineFunction *MF = Asm->MF;
   const std::vector<const GlobalValue *> &TypeInfos = MF->getTypeInfos();
   const std::vector<unsigned> &FilterIds = MF->getFilterIds();
@@ -377,7 +375,6 @@ MCSymbol *EHStreamer::emitExceptionTable() {
   computeCallSiteTable(CallSites, LandingPads, FirstActions);
 
   bool IsSJLJ = Asm->MAI->getExceptionHandlingType() == ExceptionHandling::SjLj;
-  bool IsWasm = Asm->MAI->getExceptionHandlingType() == ExceptionHandling::Wasm;
   unsigned CallSiteEncoding =
       IsSJLJ ? dwarf::DW_EH_PE_udata4 : dwarf::DW_EH_PE_uleb128;
   bool HaveTTData = !TypeInfos.empty() || !FilterIds.empty();
@@ -460,8 +457,8 @@ MCSymbol *EHStreamer::emitExceptionTable() {
   Asm->EmitLabelDifferenceAsULEB128(CstEndLabel, CstBeginLabel);
   Asm->OutStreamer->EmitLabel(CstBeginLabel);
 
-  // SjLj / Wasm Exception handling
-  if (IsSJLJ || IsWasm) {
+  // SjLj Exception handling
+  if (IsSJLJ) {
     unsigned idx = 0;
     for (SmallVectorImpl<CallSiteEntry>::const_iterator
          I = CallSites.begin(), E = CallSites.end(); I != E; ++I, ++idx) {
@@ -607,7 +604,6 @@ MCSymbol *EHStreamer::emitExceptionTable() {
   }
 
   Asm->EmitAlignment(2);
-  return GCCETSym;
 }
 
 void EHStreamer::emitTypeInfos(unsigned TTypeEncoding, MCSymbol *TTBaseLabel) {
diff --git a/lib/CodeGen/AsmPrinter/EHStreamer.h b/lib/CodeGen/AsmPrinter/EHStreamer.h
index e3a6f8e9d58..b89421a1e06 100644
--- a/lib/CodeGen/AsmPrinter/EHStreamer.h
+++ b/lib/CodeGen/AsmPrinter/EHStreamer.h
@@ -85,10 +85,9 @@ protected:
   /// zero for the landing pad and the action.  Calls marked 'nounwind' have
   /// no entry and must not be contained in the try-range of any entry - they
   /// form gaps in the table.  Entries must be ordered by try-range address.
-  virtual void computeCallSiteTable(
-      SmallVectorImpl<CallSiteEntry> &CallSites,
-      const SmallVectorImpl<const LandingPadInfo *> &LandingPads,
-      const SmallVectorImpl<unsigned> &FirstActions);
+  void computeCallSiteTable(SmallVectorImpl<CallSiteEntry> &CallSites,
+                            const SmallVectorImpl<const LandingPadInfo *> &LandingPads,
+                            const SmallVectorImpl<unsigned> &FirstActions);
 
   /// Emit landing pads and actions.
   ///
@@ -109,9 +108,7 @@ protected:
   ///     found the frame is unwound and handling continues.
   ///  3. Type id table contains references to all the C++ typeinfo for all
   ///     catches in the function.  This tables is reversed indexed base 1.
-  ///
-  /// Returns the starting symbol of an exception table.
-  MCSymbol *emitExceptionTable();
+  void emitExceptionTable();
 
   virtual void emitTypeInfos(unsigned TTypeEncoding, MCSymbol *TTBaseLabel);
 
diff --git a/lib/CodeGen/AsmPrinter/WasmException.cpp b/lib/CodeGen/AsmPrinter/WasmException.cpp
deleted file mode 100644
index 46745d08c9f..00000000000
--- a/lib/CodeGen/AsmPrinter/WasmException.cpp
+++ /dev/null
@@ -1,81 +0,0 @@
-//===-- CodeGen/AsmPrinter/WasmException.cpp - Wasm Exception Impl --------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains support for writing WebAssembly exception info into asm
-// files.
-//
-//===----------------------------------------------------------------------===//
-
-#include "WasmException.h"
-#include "llvm/MC/MCStreamer.h"
-using namespace llvm;
-
-void WasmException::markFunctionEnd() {
-  // Get rid of any dead landing pads.
-  if (!Asm->MF->getLandingPads().empty()) {
-    auto *NonConstMF = const_cast<MachineFunction *>(Asm->MF);
-    // Wasm does not set BeginLabel and EndLabel information for landing pads,
-    // so we should set the second argument false.
-    NonConstMF->tidyLandingPads(nullptr, /* TidyIfNoBeginLabels */ false);
-  }
-}
-
-void WasmException::endFunction(const MachineFunction *MF) {
-  bool ShouldEmitExceptionTable = false;
-  for (const LandingPadInfo &Info : MF->getLandingPads()) {
-    if (MF->hasWasmLandingPadIndex(Info.LandingPadBlock)) {
-      ShouldEmitExceptionTable = true;
-      break;
-    }
-  }
-  if (!ShouldEmitExceptionTable)
-    return;
-  MCSymbol *LSDALabel = emitExceptionTable();
-  assert(LSDALabel && ".GCC_exception_table has not been emitted!");
-
-  // Wasm requires every data section symbol to have a .size set. So we emit an
-  // end marker and set the size as the difference between the start end the end
-  // marker.
-  MCSymbol *LSDAEndLabel = Asm->createTempSymbol("GCC_except_table_end");
-  Asm->OutStreamer->EmitLabel(LSDAEndLabel);
-  MCContext &OutContext = Asm->OutStreamer->getContext();
-  const MCExpr *SizeExp = MCBinaryExpr::createSub(
-      MCSymbolRefExpr::create(LSDAEndLabel, OutContext),
-      MCSymbolRefExpr::create(LSDALabel, OutContext), OutContext);
-  Asm->OutStreamer->emitELFSize(LSDALabel, SizeExp);
-}
-
-// Compute the call-site table for wasm EH. Even though we use the same function
-// name to share the common routines, a call site entry in the table corresponds
-// to not a call site for possibly-throwing functions but a landing pad. In wasm
-// EH the VM is responsible for stack unwinding. After an exception occurs and
-// the stack is unwound, the control flow is transferred to wasm 'catch'
-// instruction by the VM, after which the personality function is called from
-// the compiler-generated code. Refer to WasmEHPrepare pass for more
-// information.
-void WasmException::computeCallSiteTable(
-    SmallVectorImpl<CallSiteEntry> &CallSites,
-    const SmallVectorImpl<const LandingPadInfo *> &LandingPads,
-    const SmallVectorImpl<unsigned> &FirstActions) {
-  MachineFunction &MF = *Asm->MF;
-  for (unsigned I = 0, N = LandingPads.size(); I < N; ++I) {
-    const LandingPadInfo *Info = LandingPads[I];
-    MachineBasicBlock *LPad = Info->LandingPadBlock;
-    // We don't emit LSDA for single catch (...).
-    if (!MF.hasWasmLandingPadIndex(LPad))
-      continue;
-    // Wasm EH must maintain the EH pads in the order assigned to them by the
-    // WasmEHPrepare pass.
-    unsigned LPadIndex = MF.getWasmLandingPadIndex(LPad);
-    CallSiteEntry Site = {nullptr, nullptr, Info, FirstActions[I]};
-    if (CallSites.size() < LPadIndex + 1)
-      CallSites.resize(LPadIndex + 1);
-    CallSites[LPadIndex] = Site;
-  }
-}
diff --git a/lib/CodeGen/AsmPrinter/WasmException.h b/lib/CodeGen/AsmPrinter/WasmException.h
deleted file mode 100644
index 09a9a25ce8d..00000000000
--- a/lib/CodeGen/AsmPrinter/WasmException.h
+++ /dev/null
@@ -1,42 +0,0 @@
-//===-- WasmException.h - Wasm Exception Framework -------------*- C++ -*--===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains support for writing WebAssembly exception info into asm
-// files.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_WASMEXCEPTION_H
-#define LLVM_LIB_CODEGEN_ASMPRINTER_WASMEXCEPTION_H
-
-#include "EHStreamer.h"
-#include "llvm/CodeGen/AsmPrinter.h"
-
-namespace llvm {
-
-class LLVM_LIBRARY_VISIBILITY WasmException : public EHStreamer {
-public:
-  WasmException(AsmPrinter *A) : EHStreamer(A) {}
-
-  void endModule() override {}
-  void beginFunction(const MachineFunction *MF) override {}
-  virtual void markFunctionEnd() override;
-  void endFunction(const MachineFunction *MF) override;
-
-protected:
-  // Compute the call site table for wasm EH.
-  void computeCallSiteTable(
-      SmallVectorImpl<CallSiteEntry> &CallSites,
-      const SmallVectorImpl<const LandingPadInfo *> &LandingPads,
-      const SmallVectorImpl<unsigned> &FirstActions) override;
-};
-
-} // End of namespace llvm
-
-#endif
diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp
index 9e4963c4bdb..431484f078b 100644
--- a/lib/CodeGen/MachineFunction.cpp
+++ b/lib/CodeGen/MachineFunction.cpp
@@ -661,11 +661,8 @@ MCSymbol *MachineFunction::addLandingPad(MachineBasicBlock *LandingPad) {
       }
     }
 
-  } else if (const auto *CPI = dyn_cast<CatchPadInst>(FirstI)) {
-    for (unsigned I = CPI->getNumArgOperands(); I != 0; --I) {
-      Value *TypeInfo = CPI->getArgOperand(I - 1)->stripPointerCasts();
-      addCatchTypeInfo(LandingPad, dyn_cast<GlobalValue>(TypeInfo));
-    }
+  } else if (isa<CatchPadInst>(FirstI)) {
+    // TODO
 
   } else {
     assert(isa<CleanupPadInst>(FirstI) && "Invalid landingpad!");
@@ -690,8 +687,7 @@ void MachineFunction::addFilterTypeInfo(MachineBasicBlock *LandingPad,
   LP.TypeIds.push_back(getFilterIDFor(IdsInFilter));
 }
 
-void MachineFunction::tidyLandingPads(DenseMap<MCSymbol *, uintptr_t> *LPMap,
-                                      bool TidyIfNoBeginLabels) {
+void MachineFunction::tidyLandingPads(DenseMap<MCSymbol*, uintptr_t> *LPMap) {
   for (unsigned i = 0; i != LandingPads.size(); ) {
     LandingPadInfo &LandingPad = LandingPads[i];
     if (LandingPad.LandingPadLabel &&
@@ -706,25 +702,24 @@ void MachineFunction::tidyLandingPads(DenseMap<MCSymbol *, uintptr_t> *LPMap,
       continue;
     }
 
-    if (TidyIfNoBeginLabels) {
-      for (unsigned j = 0, e = LandingPads[i].BeginLabels.size(); j != e; ++j) {
-        MCSymbol *BeginLabel = LandingPad.BeginLabels[j];
-        MCSymbol *EndLabel = LandingPad.EndLabels[j];
-        if ((BeginLabel->isDefined() || (LPMap && (*LPMap)[BeginLabel] != 0)) &&
-            (EndLabel->isDefined() || (LPMap && (*LPMap)[EndLabel] != 0)))
-          continue;
-
-        LandingPad.BeginLabels.erase(LandingPad.BeginLabels.begin() + j);
-        LandingPad.EndLabels.erase(LandingPad.EndLabels.begin() + j);
-        --j;
-        --e;
-      }
+    for (unsigned j = 0, e = LandingPads[i].BeginLabels.size(); j != e; ++j) {
+      MCSymbol *BeginLabel = LandingPad.BeginLabels[j];
+      MCSymbol *EndLabel = LandingPad.EndLabels[j];
+      if ((BeginLabel->isDefined() ||
+           (LPMap && (*LPMap)[BeginLabel] != 0)) &&
+          (EndLabel->isDefined() ||
+           (LPMap && (*LPMap)[EndLabel] != 0))) continue;
+
+      LandingPad.BeginLabels.erase(LandingPad.BeginLabels.begin() + j);
+      LandingPad.EndLabels.erase(LandingPad.EndLabels.begin() + j);
+      --j;
+      --e;
+    }
 
-      // Remove landing pads with no try-ranges.
-      if (LandingPads[i].BeginLabels.empty()) {
-        LandingPads.erase(LandingPads.begin() + i);
-        continue;
-      }
+    // Remove landing pads with no try-ranges.
+    if (LandingPads[i].BeginLabels.empty()) {
+      LandingPads.erase(LandingPads.begin() + i);
+      continue;
     }
 
     // If there is no landing pad, ensure that the list of typeids is empty.
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 2e0456edef7..be4a219efe5 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6288,12 +6288,12 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     return nullptr;
   }
 
-  case Intrinsic::wasm_landingpad_index:
-    // Information this intrinsic contained has been transferred to
-    // MachineFunction in SelectionDAGISel::PrepareEHLandingPad. We can safely
-    // delete it now.
+  case Intrinsic::wasm_landingpad_index: {
+    // TODO store landing pad index in a map, which will be used when generating
+    // LSDA information
     return nullptr;
   }
+  }
 }
 
 void SelectionDAGBuilder::visitConstrainedFPIntrinsic(
@@ -6450,7 +6450,7 @@ SelectionDAGBuilder::lowerInvokable(TargetLowering::CallLoweringInfo &CLI,
       WinEHFuncInfo *EHInfo = DAG.getMachineFunction().getWinEHFuncInfo();
       EHInfo->addIPToStateRange(cast<InvokeInst>(CLI.CS.getInstruction()),
                                 BeginLabel, EndLabel);
-    } else if (!isScopedEHPersonality(Pers)) {
+    } else {
       MF.addInvoke(FuncInfo.MBBMap[EHPadBB], BeginLabel, EndLabel);
     }
   }
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 90bcaa653c3..2b4a590f19f 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -27,7 +27,6 @@
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/BranchProbabilityInfo.h"
 #include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/EHPersonalities.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
@@ -1129,36 +1128,6 @@ static bool hasExceptionPointerOrCodeUser(const CatchPadInst *CPI) {
   return false;
 }
 
-// wasm.landingpad.index intrinsic is for associating a landing pad index number
-// with a catchpad instruction. Retrieve the landing pad index in the intrinsic
-// and store the mapping in the function.
-static void mapWasmLandingPadIndex(MachineBasicBlock *MBB,
-                                   const CatchPadInst *CPI) {
-  MachineFunction *MF = MBB->getParent();
-  // In case of single catch (...), we don't emit LSDA, so we don't need
-  // this information.
-  bool IsSingleCatchAllClause =
-      CPI->getNumArgOperands() == 1 &&
-      cast<Constant>(CPI->getArgOperand(0))->isNullValue();
-  if (!IsSingleCatchAllClause) {
-    // Create a mapping from landing pad label to landing pad index.
-    bool IntrFound = false;
-    for (const User *U : CPI->users()) {
-      if (const auto *Call = dyn_cast<IntrinsicInst>(U)) {
-        Intrinsic::ID IID = Call->getIntrinsicID();
-        if (IID == Intrinsic::wasm_landingpad_index) {
-          Value *IndexArg = Call->getArgOperand(1);
-          int Index = cast<ConstantInt>(IndexArg)->getZExtValue();
-          MF->setWasmLandingPadIndex(MBB, Index);
-          IntrFound = true;
-          break;
-        }
-      }
-    }
-    assert(IntrFound && "wasm.landingpad.index intrinsic not found!");
-  }
-}
-
 /// PrepareEHLandingPad - Emit an EH_LABEL, set up live-in registers, and
 /// do other setup for EH landing-pad blocks.
 bool SelectionDAGISel::PrepareEHLandingPad() {
@@ -1168,48 +1137,44 @@ bool SelectionDAGISel::PrepareEHLandingPad() {
   const TargetRegisterClass *PtrRC =
       TLI->getRegClassFor(TLI->getPointerTy(CurDAG->getDataLayout()));
 
-  auto Pers = classifyEHPersonality(PersonalityFn);
-
   // Catchpads have one live-in register, which typically holds the exception
   // pointer or code.
-  if (isFuncletEHPersonality(Pers)) {
-    if (const auto *CPI = dyn_cast<CatchPadInst>(LLVMBB->getFirstNonPHI())) {
-      if (hasExceptionPointerOrCodeUser(CPI)) {
-        // Get or create the virtual register to hold the pointer or code.  Mark
-        // the live in physreg and copy into the vreg.
-        MCPhysReg EHPhysReg = TLI->getExceptionPointerRegister(PersonalityFn);
-        assert(EHPhysReg && "target lacks exception pointer register");
-        MBB->addLiveIn(EHPhysReg);
-        unsigned VReg = FuncInfo->getCatchPadExceptionPointerVReg(CPI, PtrRC);
-        BuildMI(*MBB, FuncInfo->InsertPt, SDB->getCurDebugLoc(),
-                TII->get(TargetOpcode::COPY), VReg)
-            .addReg(EHPhysReg, RegState::Kill);
-      }
+  if (const auto *CPI = dyn_cast<CatchPadInst>(LLVMBB->getFirstNonPHI())) {
+    if (hasExceptionPointerOrCodeUser(CPI)) {
+      // Get or create the virtual register to hold the pointer or code.  Mark
+      // the live in physreg and copy into the vreg.
+      MCPhysReg EHPhysReg = TLI->getExceptionPointerRegister(PersonalityFn);
+      assert(EHPhysReg && "target lacks exception pointer register");
+      MBB->addLiveIn(EHPhysReg);
+      unsigned VReg = FuncInfo->getCatchPadExceptionPointerVReg(CPI, PtrRC);
+      BuildMI(*MBB, FuncInfo->InsertPt, SDB->getCurDebugLoc(),
+              TII->get(TargetOpcode::COPY), VReg)
+          .addReg(EHPhysReg, RegState::Kill);
     }
     return true;
   }
 
+  if (!LLVMBB->isLandingPad())
+    return true;
+
   // Add a label to mark the beginning of the landing pad.  Deletion of the
   // landing pad can thus be detected via the MachineModuleInfo.
   MCSymbol *Label = MF->addLandingPad(MBB);
 
+  // Assign the call site to the landing pad's begin label.
+  MF->setCallSiteLandingPad(Label, SDB->LPadToCallSiteMap[MBB]);
+
   const MCInstrDesc &II = TII->get(TargetOpcode::EH_LABEL);
   BuildMI(*MBB, FuncInfo->InsertPt, SDB->getCurDebugLoc(), II)
     .addSym(Label);
 
-  if (Pers == EHPersonality::Wasm_CXX) {
-    if (const auto *CPI = dyn_cast<CatchPadInst>(LLVMBB->getFirstNonPHI()))
-      mapWasmLandingPadIndex(MBB, CPI);
-  } else {
-    // Assign the call site to the landing pad's begin label.
-    MF->setCallSiteLandingPad(Label, SDB->LPadToCallSiteMap[MBB]);
-    // Mark exception register as live in.
-    if (unsigned Reg = TLI->getExceptionPointerRegister(PersonalityFn))
-      FuncInfo->ExceptionPointerVirtReg = MBB->addLiveIn(Reg, PtrRC);
-    // Mark exception selector register as live in.
-    if (unsigned Reg = TLI->getExceptionSelectorRegister(PersonalityFn))
-      FuncInfo->ExceptionSelectorVirtReg = MBB->addLiveIn(Reg, PtrRC);
-  }
+  // Mark exception register as live in.
+  if (unsigned Reg = TLI->getExceptionPointerRegister(PersonalityFn))
+    FuncInfo->ExceptionPointerVirtReg = MBB->addLiveIn(Reg, PtrRC);
+
+  // Mark exception selector register as live in.
+  if (unsigned Reg = TLI->getExceptionSelectorRegister(PersonalityFn))
+    FuncInfo->ExceptionSelectorVirtReg = MBB->addLiveIn(Reg, PtrRC);
 
   return true;
 }
diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index 341ab927861..b046cd81d6c 100644
--- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -1748,10 +1748,6 @@ const MCExpr *TargetLoweringObjectFileWasm::lowerRelativeReference(
 void TargetLoweringObjectFileWasm::InitializeWasm() {
   StaticCtorSection =
       getContext().getWasmSection(".init_array", SectionKind::getData());
-
-  // We don't use PersonalityEncoding and LSDAEncoding because we don't emit
-  // .cfi directives. We use TTypeEncoding to encode typeinfo global variables.
-  TTypeEncoding = dwarf::DW_EH_PE_absptr;
 }
 
 MCSection *TargetLoweringObjectFileWasm::getStaticCtorSection(
diff --git a/lib/CodeGen/WasmEHPrepare.cpp b/lib/CodeGen/WasmEHPrepare.cpp
index 6f02a05f561..83d04da5dd0 100644
--- a/lib/CodeGen/WasmEHPrepare.cpp
+++ b/lib/CodeGen/WasmEHPrepare.cpp
@@ -300,7 +300,7 @@ void WasmEHPrepare::prepareEHPad(BasicBlock *BB, unsigned Index) {
   // This is to create a map of <landingpad EH label, landingpad index> in
   // SelectionDAGISel, which is to be used in EHStreamer to emit LSDA tables.
   // Pseudocode: wasm.landingpad.index(Index);
-  IRB.CreateCall(LPadIndexF, {FPI, IRB.getInt32(Index)});
+  IRB.CreateCall(LPadIndexF, IRB.getInt32(Index));
 
   // Pseudocode: __wasm_lpad_context.lpad_index = index;
   IRB.CreateStore(IRB.getInt32(Index), LPadIndexField);
diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp
index b1e03f8efee..edfccfcb9ed 100644
--- a/lib/MC/MCObjectFileInfo.cpp
+++ b/lib/MC/MCObjectFileInfo.cpp
@@ -743,12 +743,6 @@ void MCObjectFileInfo::initWasmMCObjectFileInfo(const Triple &T) {
   DwarfPubNamesSection = Ctx->getWasmSection(".debug_pubnames", SectionKind::getMetadata());
   DwarfPubTypesSection = Ctx->getWasmSection(".debug_pubtypes", SectionKind::getMetadata());
 
-  // Wasm use data section for LSDA.
-  // TODO Consider putting each function's exception table in a separate
-  // section, as in -function-sections, to facilitate lld's --gc-section.
-  LSDASection = Ctx->getWasmSection(".rodata.gcc_except_table",
-                                    SectionKind::getReadOnlyWithRel());
-
   // TODO: Define more sections.
 }
 
diff --git a/lib/MC/WasmObjectWriter.cpp b/lib/MC/WasmObjectWriter.cpp
index f9318ad5801..cbbe161ae82 100644
--- a/lib/MC/WasmObjectWriter.cpp
+++ b/lib/MC/WasmObjectWriter.cpp
@@ -635,12 +635,10 @@ static void addData(SmallVectorImpl<char> &DataBytes,
         llvm_unreachable("The fill should be an assembler constant");
       DataBytes.insert(DataBytes.end(), Fill->getValueSize() * NumValues,
                        Fill->getValue());
-    } else if (auto *LEB = dyn_cast<MCLEBFragment>(&Frag)) {
-      const SmallVectorImpl<char> &Contents = LEB->getContents();
-      DataBytes.insert(DataBytes.end(), Contents.begin(), Contents.end());
     } else {
       const auto &DataFrag = cast<MCDataFragment>(Frag);
       const SmallVectorImpl<char> &Contents = DataFrag.getContents();
+
       DataBytes.insert(DataBytes.end(), Contents.begin(), Contents.end());
     }
   }
diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 080bfe771a4..30c2e843408 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -21,7 +21,6 @@
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineJumpTableInfo.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/IR/DiagnosticInfo.h"
@@ -967,17 +966,9 @@ WebAssemblyTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   default:
     return {}; // Don't custom lower most intrinsics.
 
-  case Intrinsic::wasm_lsda: {
-    MachineFunction &MF = DAG.getMachineFunction();
-    EVT VT = Op.getValueType();
-    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-    MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
-    auto &Context = MF.getMMI().getContext();
-    MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
-                                            Twine(MF.getFunctionNumber()));
-    return DAG.getNode(WebAssemblyISD::Wrapper, DL, VT,
-                       DAG.getMCSymbol(S, PtrVT));
-  }
+  case Intrinsic::wasm_lsda:
+    // TODO For now, just return 0 not to crash
+    return DAG.getConstant(0, DL, Op.getValueType());
   }
 }
 
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
index 4acad5f5943..8d98510c67d 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -269,8 +269,6 @@ def : Pat<(i32 (WebAssemblywrapper tglobaladdr:$addr)),
           (CONST_I32 tglobaladdr:$addr)>;
 def : Pat<(i32 (WebAssemblywrapper texternalsym:$addr)),
           (CONST_I32 texternalsym:$addr)>;
-def : Pat<(i32 (WebAssemblywrapper mcsym:$sym)), (CONST_I32 mcsym:$sym)>;
-def : Pat<(i64 (WebAssemblywrapper mcsym:$sym)), (CONST_I64 mcsym:$sym)>;
 
 //===----------------------------------------------------------------------===//
 // Additional sets of instructions.
diff --git a/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
index 15b3da4c8b8..e9a0cf51905 100644
--- a/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
@@ -226,13 +226,6 @@ void WebAssemblyMCInstLower::Lower(const MachineInstr *MI,
           (MO.getTargetFlags() & WebAssemblyII::MO_SYMBOL_FUNCTION) != 0,
           (MO.getTargetFlags() & WebAssemblyII::MO_SYMBOL_GLOBAL) != 0);
       break;
-    case MachineOperand::MO_MCSymbol:
-      // This is currently used only for LSDA symbols (GCC_except_table),
-      // because global addresses or other external symbols are handled above.
-      assert(MO.getTargetFlags() == 0 &&
-             "WebAssembly does not use target flags on MCSymbol");
-      MCOp = LowerSymbolOperand(MO.getMCSymbol(), /*Offset=*/0, false, false);
-      break;
     }
 
     OutMI.addOperand(MCOp);
diff --git a/test/CodeGen/WebAssembly/eh-lsda.ll b/test/CodeGen/WebAssembly/eh-lsda.ll
deleted file mode 100644
index fd550938c42..00000000000
--- a/test/CodeGen/WebAssembly/eh-lsda.ll
+++ /dev/null
@@ -1,239 +0,0 @@
-; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-keep-registers -exception-model=wasm -mattr=+exception-handling | FileCheck -allow-deprecated-dag-overlap %s
-target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
-
-@_ZTIi = external constant i8*
-@_ZTIf = external constant i8*
-@_ZTId = external constant i8*
-
-; Single catch (...) does not need an exception table.
-;
-; try {
-;   may_throw();
-; } catch (...) {
-; }
-; CHECK-LABEL: test0:
-; CHECK-NOT: GCC_except_table
-define void @test0() personality i8* bitcast (i32 (...)* @__gxx_wasm_personality_v0 to i8*) {
-entry:
-  invoke void @may_throw()
-          to label %try.cont unwind label %catch.dispatch
-
-catch.dispatch:                                   ; preds = %entry
-  %0 = catchswitch within none [label %catch.start] unwind to caller
-
-catch.start:                                      ; preds = %catch.dispatch
-  %1 = catchpad within %0 [i8* null]
-  %2 = call i8* @llvm.wasm.get.exception(token %1)
-  %3 = call i32 @llvm.wasm.get.ehselector(token %1)
-  %4 = call i8* @__cxa_begin_catch(i8* %2) [ "funclet"(token %1) ]
-  call void @__cxa_end_catch() [ "funclet"(token %1) ]
-  catchret from %1 to label %try.cont
-
-try.cont:                                         ; preds = %entry, %catch.start
-  ret void
-}
-
-; Exception table generation + shared action test.
-;
-; try {
-;   may_throw();
-; } catch (int) {
-; } catch (float) {
-; } catch (double) {
-; } catch (...) {
-; }
-;
-; try {
-;   may_throw();
-; } catch (double) {
-; } catch (...) {
-; }
-;
-; try {
-;   may_throw();
-; } catch (int) {
-; } catch (float) {
-; }
-;
-; There are three landing pads. The second landing pad should share action table
-; entries with the first landing pad because they end with the same sequence
-; (double -> ...). But the third landing table cannot share action table entries
-; with others, so it should create its own entries.
-; CHECK-LABEL: test1:
-; CHECK: .section  .rodata.gcc_except_table,"",@
-; CHECK-NEXT:   .p2align  2
-; CHECK-NEXT: GCC_except_table[[START:[0-9]+]]:
-; CHECK-NEXT: .Lexception0:
-; CHECK-NEXT:   .int8  255                     # @LPStart Encoding = omit
-; CHECK-NEXT:   .int8  0                       # @TType Encoding = absptr
-; CHECK-NEXT:   .uleb128 .Lttbase0-.Lttbaseref0
-; CHECK-NEXT: .Lttbaseref0:
-; CHECK-NEXT:   .int8  1                       # Call site Encoding = uleb128
-; CHECK-NEXT:   .uleb128 .Lcst_end0-.Lcst_begin0
-; CHECK-NEXT: .Lcst_begin0:
-; CHECK-NEXT:   .int8  0                       # >> Call Site 0 <<
-; CHECK-NEXT:                                  #   On exception at call site 0
-; CHECK-NEXT:   .int8  7                       #   Action: 4
-; CHECK-NEXT:   .int8  1                       # >> Call Site 1 <<
-; CHECK-NEXT:                                  #   On exception at call site 1
-; CHECK-NEXT:   .int8  3                       #   Action: 2
-; CHECK-NEXT:   .int8  2                       # >> Call Site 2 <<
-; CHECK-NEXT:                                  #   On exception at call site 2
-; CHECK-NEXT:   .int8  11                      #   Action: 6
-; CHECK-NEXT: .Lcst_end0:
-; CHECK-NEXT:   .int8  1                       # >> Action Record 1 <<
-; CHECK-NEXT:                                  #   Catch TypeInfo 1
-; CHECK-NEXT:   .int8  0                       #   No further actions
-; CHECK-NEXT:   .int8  2                       # >> Action Record 2 <<
-; CHECK-NEXT:                                  #   Catch TypeInfo 2
-; CHECK-NEXT:   .int8  125                     #   Continue to action 1
-; CHECK-NEXT:   .int8  3                       # >> Action Record 3 <<
-; CHECK-NEXT:                                  #   Catch TypeInfo 3
-; CHECK-NEXT:   .int8  125                     #   Continue to action 2
-; CHECK-NEXT:   .int8  4                       # >> Action Record 4 <<
-; CHECK-NEXT:                                  #   Catch TypeInfo 4
-; CHECK-NEXT:   .int8  125                     #   Continue to action 3
-; CHECK-NEXT:   .int8  3                       # >> Action Record 5 <<
-; CHECK-NEXT:                                  #   Catch TypeInfo 3
-; CHECK-NEXT:   .int8  0                       #   No further actions
-; CHECK-NEXT:   .int8  4                       # >> Action Record 6 <<
-; CHECK-NEXT:                                  #   Catch TypeInfo 4
-; CHECK-NEXT:   .int8  125                     #   Continue to action 5
-; CHECK-NEXT:   .p2align  2
-; CHECK-NEXT:                                  # >> Catch TypeInfos <<
-; CHECK-NEXT:   .int32  _ZTIi                  # TypeInfo 4
-; CHECK-NEXT:   .int32  _ZTIf                  # TypeInfo 3
-; CHECK-NEXT:   .int32  _ZTId                  # TypeInfo 2
-; CHECK-NEXT:   .int32  0                      # TypeInfo 1
-; CHECK-NEXT: .Lttbase0:
-; CHECK-NEXT:   .p2align  2
-; CHECK-NEXT: .LGCC_except_table_end[[END:[0-9]+]]:
-; CHECK-NEXT:   .size  GCC_except_table[[START]], .LGCC_except_table_end[[END]]-GCC_except_table[[START]]
-define void @test1() personality i8* bitcast (i32 (...)* @__gxx_wasm_personality_v0 to i8*) {
-entry:
-  invoke void @may_throw()
-          to label %try.cont unwind label %catch.dispatch
-
-catch.dispatch:                                   ; preds = %entry
-  %0 = catchswitch within none [label %catch.start] unwind to caller
-
-catch.start:                                      ; preds = %catch.dispatch
-  %1 = catchpad within %0 [i8* bitcast (i8** @_ZTIi to i8*), i8* bitcast (i8** @_ZTIf to i8*), i8* bitcast (i8** @_ZTId to i8*), i8* null]
-  %2 = call i8* @llvm.wasm.get.exception(token %1)
-  %3 = call i32 @llvm.wasm.get.ehselector(token %1)
-  %4 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*))
-  %matches = icmp eq i32 %3, %4
-  br i1 %matches, label %catch10, label %catch.fallthrough
-
-catch10:                                          ; preds = %catch.start
-  %5 = call i8* @__cxa_begin_catch(i8* %2) [ "funclet"(token %1) ]
-  %6 = bitcast i8* %5 to i32*
-  %7 = load i32, i32* %6, align 4
-  call void @__cxa_end_catch() [ "funclet"(token %1) ]
-  catchret from %1 to label %try.cont
-
-try.cont:                                         ; preds = %entry, %catch, %catch4, %catch7, %catch10
-  invoke void @may_throw()
-          to label %try.cont23 unwind label %catch.dispatch14
-
-catch.dispatch14:                                 ; preds = %try.cont
-  %8 = catchswitch within none [label %catch.start15] unwind to caller
-
-catch.start15:                                    ; preds = %catch.dispatch14
-  %9 = catchpad within %8 [i8* bitcast (i8** @_ZTId to i8*), i8* null]
-  %10 = call i8* @llvm.wasm.get.exception(token %9)
-  %11 = call i32 @llvm.wasm.get.ehselector(token %9)
-  %12 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTId to i8*))
-  %matches16 = icmp eq i32 %11, %12
-  %13 = call i8* @__cxa_begin_catch(i8* %10) [ "funclet"(token %9) ]
-  br i1 %matches16, label %catch20, label %catch17
-
-catch20:                                          ; preds = %catch.start15
-  %14 = bitcast i8* %13 to double*
-  %15 = load double, double* %14, align 8
-  call void @__cxa_end_catch() [ "funclet"(token %9) ]
-  catchret from %9 to label %try.cont23
-
-try.cont23:                                       ; preds = %try.cont, %catch17, %catch20
-  invoke void @may_throw()
-          to label %try.cont36 unwind label %catch.dispatch25
-
-catch.dispatch25:                                 ; preds = %try.cont23
-  %16 = catchswitch within none [label %catch.start26] unwind to caller
-
-catch.start26:                                    ; preds = %catch.dispatch25
-  %17 = catchpad within %16 [i8* bitcast (i8** @_ZTIi to i8*), i8* bitcast (i8** @_ZTIf to i8*)]
-  %18 = call i8* @llvm.wasm.get.exception(token %17)
-  %19 = call i32 @llvm.wasm.get.ehselector(token %17)
-  %20 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*))
-  %matches27 = icmp eq i32 %19, %20
-  br i1 %matches27, label %catch33, label %catch.fallthrough28
-
-catch33:                                          ; preds = %catch.start26
-  %21 = call i8* @__cxa_begin_catch(i8* %18) [ "funclet"(token %17) ]
-  %22 = bitcast i8* %21 to i32*
-  %23 = load i32, i32* %22, align 4
-  call void @__cxa_end_catch() [ "funclet"(token %17) ]
-  catchret from %17 to label %try.cont36
-
-catch.fallthrough28:                              ; preds = %catch.start26
-  %24 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIf to i8*))
-  %matches29 = icmp eq i32 %19, %24
-  br i1 %matches29, label %catch30, label %rethrow
-
-catch30:                                          ; preds = %catch.fallthrough28
-  %25 = call i8* @__cxa_begin_catch(i8* %18) [ "funclet"(token %17) ]
-  %26 = bitcast i8* %25 to float*
-  %27 = load float, float* %26, align 4
-  call void @__cxa_end_catch() [ "funclet"(token %17) ]
-  catchret from %17 to label %try.cont36
-
-rethrow:                                          ; preds = %catch.fallthrough28
-  call void @__cxa_rethrow() [ "funclet"(token %17) ]
-  unreachable
-
-try.cont36:                                       ; preds = %try.cont23, %catch30, %catch33
-  ret void
-
-catch17:                                          ; preds = %catch.start15
-  call void @__cxa_end_catch() [ "funclet"(token %9) ]
-  catchret from %9 to label %try.cont23
-
-catch.fallthrough:                                ; preds = %catch.start
-  %28 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIf to i8*))
-  %matches1 = icmp eq i32 %3, %28
-  br i1 %matches1, label %catch7, label %catch.fallthrough2
-
-catch7:                                           ; preds = %catch.fallthrough
-  %29 = call i8* @__cxa_begin_catch(i8* %2) [ "funclet"(token %1) ]
-  %30 = bitcast i8* %29 to float*
-  %31 = load float, float* %30, align 4
-  call void @__cxa_end_catch() [ "funclet"(token %1) ]
-  catchret from %1 to label %try.cont
-
-catch.fallthrough2:                               ; preds = %catch.fallthrough
-  %32 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTId to i8*))
-  %matches3 = icmp eq i32 %3, %32
-  %33 = call i8* @__cxa_begin_catch(i8* %2) [ "funclet"(token %1) ]
-  br i1 %matches3, label %catch4, label %catch
-
-catch4:                                           ; preds = %catch.fallthrough2
-  %34 = bitcast i8* %33 to double*
-  %35 = load double, double* %34, align 8
-  call void @__cxa_end_catch() [ "funclet"(token %1) ]
-  catchret from %1 to label %try.cont
-
-catch:                                            ; preds = %catch.fallthrough2
-  call void @__cxa_end_catch() [ "funclet"(token %1) ]
-  catchret from %1 to label %try.cont
-}
-
-declare void @may_throw()
-declare i32 @llvm.eh.typeid.for(i8*)
-declare i8* @llvm.wasm.get.exception(token)
-declare i32 @llvm.wasm.get.ehselector(token)
-declare void @__cxa_rethrow()
-declare i8* @__cxa_begin_catch(i8*)
-declare void @__cxa_end_catch()
-declare i32 @__gxx_wasm_personality_v0(...)
diff --git a/test/CodeGen/WebAssembly/wasmehprepare.ll b/test/CodeGen/WebAssembly/wasmehprepare.ll
index 67e198eb058..e6005e34057 100644
--- a/test/CodeGen/WebAssembly/wasmehprepare.ll
+++ b/test/CodeGen/WebAssembly/wasmehprepare.ll
@@ -30,7 +30,7 @@ catch.start:                                      ; preds = %catch.dispatch
 ; CHECK: catch.start:
 ; CHECK-NEXT:   %[[CATCHPAD:.*]] = catchpad
 ; CHECK-NEXT:   %[[EXN:.*]] = call i8* @llvm.wasm.catch(i32 0)
-; CHECK-NEXT:   call void @llvm.wasm.landingpad.index(token %[[CATCHPAD]], i32 0)
+; CHECK-NEXT:   call void @llvm.wasm.landingpad.index(i32 0)
 ; CHECK-NEXT:   store i32 0, i32* getelementptr inbounds ({ i32, i8*, i32 }, { i32, i8*, i32 }* @__wasm_lpad_context, i32 0, i32 0)
 ; CHECK-NEXT:   %[[LSDA:.*]] = call i8* @llvm.wasm.lsda()
 ; CHECK-NEXT:   store i8* %[[LSDA]], i8** getelementptr inbounds ({ i32, i8*, i32 }, { i32, i8*, i32 }* @__wasm_lpad_context, i32 0, i32 1)
@@ -98,7 +98,7 @@ catch.start3:                                     ; preds = %catch.dispatch2
   %matches = icmp eq i32 %8, %9
   br i1 %matches, label %catch4, label %rethrow
 ; CHECK: catch.start3:
-; CHECK:   call void @llvm.wasm.landingpad.index(token %{{.+}}, i32 0)
+; CHECK:   call void @llvm.wasm.landingpad.index(i32 0)
 
 catch4:                                           ; preds = %catch.start3
   %10 = call i8* @__cxa_begin_catch(i8* %7) [ "funclet"(token %6) ]
@@ -311,7 +311,7 @@ declare void @__cxa_rethrow()
 declare void @__clang_call_terminate(i8*)
 
 ; CHECK-DAG: declare i8* @llvm.wasm.catch(i32)
-; CHECK-DAG: declare void @llvm.wasm.landingpad.index(token, i32)
+; CHECK-DAG: declare void @llvm.wasm.landingpad.index(i32)
 ; CHECK-DAG: declare i8* @llvm.wasm.lsda()
 ; CHECK-DAG: declare i32 @_Unwind_CallPersonality(i8*)
 
-- 
GitLab


From e87aaa1093866a4508421db59822a9c2a6b50c4a Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Tue, 16 Oct 2018 20:13:06 +0000
Subject: [PATCH 0257/1116] [ORC] Make the VModuleKey optional, propagate it
 via MaterializationUnit and MaterializationResponsibility.

VModuleKeys are intended to enable selective removal of modules from a JIT
session, however for a wide variety of use cases selective removal is not
needed and introduces unnecessary overhead. As of this commit, the default
constructed VModuleKey value is reserved as a "do not track" value, and
becomes the default when adding a new module to the JIT.

This commit also changes the propagation of VModuleKeys. They were passed
alongside the MaterializationResponsibity instance in XXLayer::emit methods,
but are now propagated as part of the MaterializationResponsibility instance
itself (and as part of MaterializationUnit when stored in a JITDylib).
Associating VModuleKeys with MaterializationUnits in this way should allow
for a thread-safe module removal mechanism in the future, even when a module
is in the process of being compiled, by having the
MaterializationResponsibility object check in on its VModuleKey's state
before commiting its results to the JITDylib.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344643 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../Orc/CompileOnDemandLayer.h                |  3 +-
 include/llvm/ExecutionEngine/Orc/Core.h       | 40 ++++++++++++-------
 .../llvm/ExecutionEngine/Orc/IRCompileLayer.h |  3 +-
 .../ExecutionEngine/Orc/IRTransformLayer.h    |  3 +-
 include/llvm/ExecutionEngine/Orc/LLJIT.h      |  2 -
 include/llvm/ExecutionEngine/Orc/Layer.h      | 18 +++++----
 .../llvm/ExecutionEngine/Orc/LazyReexports.h  |  8 ++--
 .../Orc/ObjectTransformLayer.h                |  2 +-
 .../Orc/RTDyldObjectLinkingLayer.h            |  6 +--
 .../Orc/CompileOnDemandLayer.cpp              | 20 +++++-----
 lib/ExecutionEngine/Orc/Core.cpp              | 25 +++++++-----
 lib/ExecutionEngine/Orc/IRCompileLayer.cpp    |  8 ++--
 lib/ExecutionEngine/Orc/IRTransformLayer.cpp  |  6 +--
 lib/ExecutionEngine/Orc/IndirectionUtils.cpp  |  8 ++--
 lib/ExecutionEngine/Orc/LLJIT.cpp             | 24 +++++------
 lib/ExecutionEngine/Orc/Layer.cpp             | 27 +++++++------
 lib/ExecutionEngine/Orc/LazyReexports.cpp     |  4 +-
 .../Orc/ObjectTransformLayer.cpp              |  4 +-
 .../Orc/RTDyldObjectLinkingLayer.cpp          | 15 ++-----
 unittests/ExecutionEngine/Orc/OrcTestCommon.h |  2 +-
 .../Orc/RTDyldObjectLinkingLayerTest.cpp      | 13 +++---
 21 files changed, 124 insertions(+), 117 deletions(-)

diff --git a/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h b/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h
index 7721f74fe0c..884878925cd 100644
--- a/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h
+++ b/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h
@@ -94,8 +94,7 @@ public:
 
   /// Emits the given module. This should not be called by clients: it will be
   /// called by the JIT when a definition added via the add method is requested.
-  void emit(MaterializationResponsibility R, VModuleKey K,
-            ThreadSafeModule TSM) override;
+  void emit(MaterializationResponsibility R, ThreadSafeModule TSM) override;
 
 private:
   struct PerDylibResources {
diff --git a/include/llvm/ExecutionEngine/Orc/Core.h b/include/llvm/ExecutionEngine/Orc/Core.h
index 86c5ebb6d27..2e56854340c 100644
--- a/include/llvm/ExecutionEngine/Orc/Core.h
+++ b/include/llvm/ExecutionEngine/Orc/Core.h
@@ -168,6 +168,9 @@ public:
   ///        into.
   JITDylib &getTargetJITDylib() const { return JD; }
 
+  /// Returns the VModuleKey for this instance.
+  VModuleKey getVModuleKey() const { return K; }
+
   /// Returns the symbol flags map for this responsibility instance.
   /// Note: The returned flags may have transient flags (Lazy, Materializing)
   /// set. These should be stripped with JITSymbolFlags::stripTransientFlags
@@ -218,7 +221,8 @@ public:
   /// Delegates responsibility for the given symbols to the returned
   /// materialization responsibility. Useful for breaking up work between
   /// threads, or different kinds of materialization processes.
-  MaterializationResponsibility delegate(const SymbolNameSet &Symbols);
+  MaterializationResponsibility delegate(const SymbolNameSet &Symbols,
+                                         VModuleKey NewKey = VModuleKey());
 
   void addDependencies(const SymbolStringPtr &Name,
                        const SymbolDependenceMap &Dependencies);
@@ -229,10 +233,12 @@ public:
 private:
   /// Create a MaterializationResponsibility for the given JITDylib and
   ///        initial symbols.
-  MaterializationResponsibility(JITDylib &JD, SymbolFlagsMap SymbolFlags);
+  MaterializationResponsibility(JITDylib &JD, SymbolFlagsMap SymbolFlags,
+                                VModuleKey K);
 
   JITDylib &JD;
   SymbolFlagsMap SymbolFlags;
+  VModuleKey K;
 };
 
 /// A MaterializationUnit represents a set of symbol definitions that can
@@ -245,8 +251,8 @@ private:
 /// stronger definition is added or already present.
 class MaterializationUnit {
 public:
-  MaterializationUnit(SymbolFlagsMap InitalSymbolFlags)
-      : SymbolFlags(std::move(InitalSymbolFlags)) {}
+  MaterializationUnit(SymbolFlagsMap InitalSymbolFlags, VModuleKey K)
+      : SymbolFlags(std::move(InitalSymbolFlags)), K(std::move(K)) {}
 
   virtual ~MaterializationUnit() {}
 
@@ -261,7 +267,8 @@ public:
   /// ExecutionSession::DispatchMaterializationFunction) to trigger
   /// materialization of this MaterializationUnit.
   void doMaterialize(JITDylib &JD) {
-    materialize(MaterializationResponsibility(JD, std::move(SymbolFlags)));
+    materialize(MaterializationResponsibility(JD, std::move(SymbolFlags),
+                                              std::move(K)));
   }
 
   /// Called by JITDylibs to notify MaterializationUnits that the given symbol
@@ -273,6 +280,7 @@ public:
 
 protected:
   SymbolFlagsMap SymbolFlags;
+  VModuleKey K;
 
 private:
   virtual void anchor();
@@ -298,7 +306,7 @@ using MaterializationUnitList =
 /// materialized.
 class AbsoluteSymbolsMaterializationUnit : public MaterializationUnit {
 public:
-  AbsoluteSymbolsMaterializationUnit(SymbolMap Symbols);
+  AbsoluteSymbolsMaterializationUnit(SymbolMap Symbols, VModuleKey K);
 
   StringRef getName() const override;
 
@@ -321,9 +329,9 @@ private:
 /// \endcode
 ///
 inline std::unique_ptr<AbsoluteSymbolsMaterializationUnit>
-absoluteSymbols(SymbolMap Symbols) {
+absoluteSymbols(SymbolMap Symbols, VModuleKey K = VModuleKey()) {
   return llvm::make_unique<AbsoluteSymbolsMaterializationUnit>(
-      std::move(Symbols));
+      std::move(Symbols), std::move(K));
 }
 
 struct SymbolAliasMapEntry {
@@ -349,7 +357,8 @@ public:
   /// Note: Care must be taken that no sets of aliases form a cycle, as such
   ///       a cycle will result in a deadlock when any symbol in the cycle is
   ///       resolved.
-  ReExportsMaterializationUnit(JITDylib *SourceJD, SymbolAliasMap Aliases);
+  ReExportsMaterializationUnit(JITDylib *SourceJD, SymbolAliasMap Aliases,
+                               VModuleKey K);
 
   StringRef getName() const override;
 
@@ -374,17 +383,18 @@ private:
 ///     return Err;
 /// \endcode
 inline std::unique_ptr<ReExportsMaterializationUnit>
-symbolAliases(SymbolAliasMap Aliases) {
-  return llvm::make_unique<ReExportsMaterializationUnit>(nullptr,
-                                                         std::move(Aliases));
+symbolAliases(SymbolAliasMap Aliases, VModuleKey K = VModuleKey()) {
+  return llvm::make_unique<ReExportsMaterializationUnit>(
+      nullptr, std::move(Aliases), std::move(K));
 }
 
 /// Create a materialization unit for re-exporting symbols from another JITDylib
 /// with alternative names/flags.
 inline std::unique_ptr<ReExportsMaterializationUnit>
-reexports(JITDylib &SourceJD, SymbolAliasMap Aliases) {
-  return llvm::make_unique<ReExportsMaterializationUnit>(&SourceJD,
-                                                         std::move(Aliases));
+reexports(JITDylib &SourceJD, SymbolAliasMap Aliases,
+          VModuleKey K = VModuleKey()) {
+  return llvm::make_unique<ReExportsMaterializationUnit>(
+      &SourceJD, std::move(Aliases), std::move(K));
 }
 
 /// Build a SymbolAliasMap for the common case where you want to re-export
diff --git a/include/llvm/ExecutionEngine/Orc/IRCompileLayer.h b/include/llvm/ExecutionEngine/Orc/IRCompileLayer.h
index a62d8be2fa6..30d71e69cd7 100644
--- a/include/llvm/ExecutionEngine/Orc/IRCompileLayer.h
+++ b/include/llvm/ExecutionEngine/Orc/IRCompileLayer.h
@@ -41,8 +41,7 @@ public:
 
   void setNotifyCompiled(NotifyCompiledFunction NotifyCompiled);
 
-  void emit(MaterializationResponsibility R, VModuleKey K,
-            ThreadSafeModule TSM) override;
+  void emit(MaterializationResponsibility R, ThreadSafeModule TSM) override;
 
 private:
   mutable std::mutex IRLayerMutex;
diff --git a/include/llvm/ExecutionEngine/Orc/IRTransformLayer.h b/include/llvm/ExecutionEngine/Orc/IRTransformLayer.h
index 55a1ce4c930..49e65b9f2a8 100644
--- a/include/llvm/ExecutionEngine/Orc/IRTransformLayer.h
+++ b/include/llvm/ExecutionEngine/Orc/IRTransformLayer.h
@@ -35,8 +35,7 @@ public:
     this->Transform = std::move(Transform);
   }
 
-  void emit(MaterializationResponsibility R, VModuleKey K,
-            ThreadSafeModule TSM) override;
+  void emit(MaterializationResponsibility R, ThreadSafeModule TSM) override;
 
   static ThreadSafeModule
   identityTransform(ThreadSafeModule TSM,
diff --git a/include/llvm/ExecutionEngine/Orc/LLJIT.h b/include/llvm/ExecutionEngine/Orc/LLJIT.h
index 05a566fedb6..8b6465e1f02 100644
--- a/include/llvm/ExecutionEngine/Orc/LLJIT.h
+++ b/include/llvm/ExecutionEngine/Orc/LLJIT.h
@@ -111,8 +111,6 @@ protected:
   LLJIT(std::unique_ptr<ExecutionSession> ES, JITTargetMachineBuilder JTMB,
         DataLayout DL, unsigned NumCompileThreads);
 
-  std::unique_ptr<RuntimeDyld::MemoryManager> getMemoryManager(VModuleKey K);
-
   std::string mangle(StringRef UnmangledName);
 
   Error applyDataLayout(Module &M);
diff --git a/include/llvm/ExecutionEngine/Orc/Layer.h b/include/llvm/ExecutionEngine/Orc/Layer.h
index be5d9653dd8..cd797445a2e 100644
--- a/include/llvm/ExecutionEngine/Orc/Layer.h
+++ b/include/llvm/ExecutionEngine/Orc/Layer.h
@@ -49,11 +49,11 @@ public:
 
   /// Adds a MaterializationUnit representing the given IR to the given
   /// JITDylib.
-  virtual Error add(JITDylib &JD, VModuleKey K, ThreadSafeModule TSM);
+  virtual Error add(JITDylib &JD, ThreadSafeModule TSM,
+                    VModuleKey K = VModuleKey());
 
   /// Emit should materialize the given IR.
-  virtual void emit(MaterializationResponsibility R, VModuleKey K,
-                    ThreadSafeModule TSM) = 0;
+  virtual void emit(MaterializationResponsibility R, ThreadSafeModule TSM) = 0;
 
 private:
   bool CloneToNewContextOnEmit = false;
@@ -70,14 +70,16 @@ public:
 
   /// Create an IRMaterializationLayer. Scans the module to build the
   /// SymbolFlags and SymbolToDefinition maps.
-  IRMaterializationUnit(ExecutionSession &ES, ThreadSafeModule TSM);
+  IRMaterializationUnit(ExecutionSession &ES, ThreadSafeModule TSM,
+                        VModuleKey K);
 
   /// Create an IRMaterializationLayer from a module, and pre-existing
   /// SymbolFlags and SymbolToDefinition maps. The maps must provide
   /// entries for each definition in M.
   /// This constructor is useful for delegating work from one
   /// IRMaterializationUnit to another.
-  IRMaterializationUnit(ThreadSafeModule TSM, SymbolFlagsMap SymbolFlags,
+  IRMaterializationUnit(ThreadSafeModule TSM, VModuleKey K,
+                        SymbolFlagsMap SymbolFlags,
                         SymbolNameToDefinitionMap SymbolToDefinition);
 
   /// Return the ModuleIdentifier as the name for this MaterializationUnit.
@@ -119,10 +121,11 @@ public:
 
   /// Adds a MaterializationUnit representing the given IR to the given
   /// JITDylib.
-  virtual Error add(JITDylib &JD, VModuleKey K, std::unique_ptr<MemoryBuffer> O);
+  virtual Error add(JITDylib &JD, std::unique_ptr<MemoryBuffer> O,
+                    VModuleKey K = VModuleKey());
 
   /// Emit should materialize the given IR.
-  virtual void emit(MaterializationResponsibility R, VModuleKey K,
+  virtual void emit(MaterializationResponsibility R,
                     std::unique_ptr<MemoryBuffer> O) = 0;
 
 private:
@@ -149,7 +152,6 @@ private:
   void discard(const JITDylib &JD, const SymbolStringPtr &Name) override;
 
   ObjectLayer &L;
-  VModuleKey K;
   std::unique_ptr<MemoryBuffer> O;
 };
 
diff --git a/include/llvm/ExecutionEngine/Orc/LazyReexports.h b/include/llvm/ExecutionEngine/Orc/LazyReexports.h
index 8f897009ac2..b5041325bce 100644
--- a/include/llvm/ExecutionEngine/Orc/LazyReexports.h
+++ b/include/llvm/ExecutionEngine/Orc/LazyReexports.h
@@ -159,7 +159,8 @@ public:
   LazyReexportsMaterializationUnit(LazyCallThroughManager &LCTManager,
                                    IndirectStubsManager &ISManager,
                                    JITDylib &SourceJD,
-                                   SymbolAliasMap CallableAliases);
+                                   SymbolAliasMap CallableAliases,
+                                   VModuleKey K);
 
   StringRef getName() const override;
 
@@ -182,9 +183,10 @@ private:
 inline std::unique_ptr<LazyReexportsMaterializationUnit>
 lazyReexports(LazyCallThroughManager &LCTManager,
               IndirectStubsManager &ISManager, JITDylib &SourceJD,
-              SymbolAliasMap CallableAliases) {
+              SymbolAliasMap CallableAliases, VModuleKey K = VModuleKey()) {
   return llvm::make_unique<LazyReexportsMaterializationUnit>(
-      LCTManager, ISManager, SourceJD, std::move(CallableAliases));
+      LCTManager, ISManager, SourceJD, std::move(CallableAliases),
+      std::move(K));
 }
 
 } // End namespace orc
diff --git a/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h b/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h
index 6cd688ad58a..44d6b490e19 100644
--- a/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h
+++ b/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h
@@ -32,7 +32,7 @@ public:
   ObjectTransformLayer(ExecutionSession &ES, ObjectLayer &BaseLayer,
                        TransformFunction Transform);
 
-  void emit(MaterializationResponsibility R, VModuleKey K,
+  void emit(MaterializationResponsibility R,
             std::unique_ptr<MemoryBuffer> O) override;
 
 private:
diff --git a/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h b/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h
index bbd782fdece..401f6e3fa81 100644
--- a/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h
+++ b/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h
@@ -47,7 +47,7 @@ public:
   using NotifyEmittedFunction = std::function<void(VModuleKey)>;
 
   using GetMemoryManagerFunction =
-      std::function<std::unique_ptr<RuntimeDyld::MemoryManager>(VModuleKey)>;
+      std::function<std::unique_ptr<RuntimeDyld::MemoryManager>()>;
 
   /// Construct an ObjectLinkingLayer with the given NotifyLoaded,
   ///        and NotifyEmitted functors.
@@ -57,7 +57,7 @@ public:
       NotifyEmittedFunction NotifyEmitted = NotifyEmittedFunction());
 
   /// Emit the object.
-  void emit(MaterializationResponsibility R, VModuleKey K,
+  void emit(MaterializationResponsibility R,
             std::unique_ptr<MemoryBuffer> O) override;
 
   /// Set the 'ProcessAllSections' flag.
@@ -118,7 +118,7 @@ private:
   bool ProcessAllSections = false;
   bool OverrideObjectFlags = false;
   bool AutoClaimObjectSymbols = false;
-  std::map<VModuleKey, std::shared_ptr<RuntimeDyld::MemoryManager>> MemMgrs;
+  std::vector<std::unique_ptr<RuntimeDyld::MemoryManager>> MemMgrs;
 };
 
 class LegacyRTDyldObjectLinkingLayerBase {
diff --git a/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp b/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp
index f27a814f33f..de1fa079dde 100644
--- a/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp
+++ b/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp
@@ -68,14 +68,16 @@ namespace orc {
 class PartitioningIRMaterializationUnit : public IRMaterializationUnit {
 public:
   PartitioningIRMaterializationUnit(ExecutionSession &ES, ThreadSafeModule TSM,
-                                    CompileOnDemandLayer &Parent)
-      : IRMaterializationUnit(ES, std::move(TSM)), Parent(Parent) {}
+                                    VModuleKey K, CompileOnDemandLayer &Parent)
+      : IRMaterializationUnit(ES, std::move(TSM), std::move(K)),
+        Parent(Parent) {}
 
   PartitioningIRMaterializationUnit(
       ThreadSafeModule TSM, SymbolFlagsMap SymbolFlags,
       SymbolNameToDefinitionMap SymbolToDefinition,
       CompileOnDemandLayer &Parent)
-      : IRMaterializationUnit(std::move(TSM), std::move(SymbolFlags),
+      : IRMaterializationUnit(std::move(TSM), std::move(K),
+                              std::move(SymbolFlags),
                               std::move(SymbolToDefinition)),
         Parent(Parent) {}
 
@@ -116,8 +118,8 @@ void CompileOnDemandLayer::setPartitionFunction(PartitionFunction Partition) {
   this->Partition = std::move(Partition);
 }
 
-void CompileOnDemandLayer::emit(MaterializationResponsibility R, VModuleKey K,
-                                 ThreadSafeModule TSM) {
+void CompileOnDemandLayer::emit(MaterializationResponsibility R,
+                                ThreadSafeModule TSM) {
   assert(TSM.getModule() && "Null module");
 
   auto &ES = getExecutionSession();
@@ -149,7 +151,7 @@ void CompileOnDemandLayer::emit(MaterializationResponsibility R, VModuleKey K,
   // implementation dylib.
   if (auto Err = PDR.getImplDylib().define(
           llvm::make_unique<PartitioningIRMaterializationUnit>(
-              ES, std::move(TSM), *this))) {
+              ES, std::move(TSM), R.getVModuleKey(), *this))) {
     ES.reportError(std::move(Err));
     R.failMaterialization();
     return;
@@ -245,7 +247,7 @@ void CompileOnDemandLayer::emitPartition(
   // unmodified to the base layer.
   if (GVsToExtract == None) {
     Defs.clear();
-    BaseLayer.emit(std::move(R), ES.allocateVModule(), std::move(TSM));
+    BaseLayer.emit(std::move(R), std::move(TSM));
     return;
   }
 
@@ -285,9 +287,9 @@ void CompileOnDemandLayer::emitPartition(
 
   auto ExtractedTSM = extractSubModule(TSM, ".submodule", ShouldExtract);
   R.replace(llvm::make_unique<PartitioningIRMaterializationUnit>(
-      ES, std::move(TSM), *this));
+      ES, std::move(TSM), R.getVModuleKey(), *this));
 
-  BaseLayer.emit(std::move(R), ES.allocateVModule(), std::move(ExtractedTSM));
+  BaseLayer.emit(std::move(R), std::move(ExtractedTSM));
 }
 
 } // end namespace orc
diff --git a/lib/ExecutionEngine/Orc/Core.cpp b/lib/ExecutionEngine/Orc/Core.cpp
index d477ca523d8..5e31e448c7d 100644
--- a/lib/ExecutionEngine/Orc/Core.cpp
+++ b/lib/ExecutionEngine/Orc/Core.cpp
@@ -368,8 +368,8 @@ void AsynchronousSymbolQuery::detach() {
 }
 
 MaterializationResponsibility::MaterializationResponsibility(
-    JITDylib &JD, SymbolFlagsMap SymbolFlags)
-    : JD(JD), SymbolFlags(std::move(SymbolFlags)) {
+    JITDylib &JD, SymbolFlagsMap SymbolFlags, VModuleKey K)
+    : JD(JD), SymbolFlags(std::move(SymbolFlags)), K(std::move(K)) {
   assert(!this->SymbolFlags.empty() && "Materializing nothing?");
 
 #ifndef NDEBUG
@@ -461,7 +461,12 @@ void MaterializationResponsibility::replace(
 }
 
 MaterializationResponsibility
-MaterializationResponsibility::delegate(const SymbolNameSet &Symbols) {
+MaterializationResponsibility::delegate(const SymbolNameSet &Symbols,
+                                        VModuleKey NewKey) {
+
+  if (NewKey == VModuleKey())
+    NewKey = K;
+
   SymbolFlagsMap DelegatedFlags;
 
   for (auto &Name : Symbols) {
@@ -474,7 +479,8 @@ MaterializationResponsibility::delegate(const SymbolNameSet &Symbols) {
     SymbolFlags.erase(I);
   }
 
-  return MaterializationResponsibility(JD, std::move(DelegatedFlags));
+  return MaterializationResponsibility(JD, std::move(DelegatedFlags),
+                                       std::move(NewKey));
 }
 
 void MaterializationResponsibility::addDependencies(
@@ -491,8 +497,9 @@ void MaterializationResponsibility::addDependenciesForAll(
 }
 
 AbsoluteSymbolsMaterializationUnit::AbsoluteSymbolsMaterializationUnit(
-    SymbolMap Symbols)
-    : MaterializationUnit(extractFlags(Symbols)), Symbols(std::move(Symbols)) {}
+    SymbolMap Symbols, VModuleKey K)
+    : MaterializationUnit(extractFlags(Symbols), std::move(K)),
+      Symbols(std::move(Symbols)) {}
 
 StringRef AbsoluteSymbolsMaterializationUnit::getName() const {
   return "<Absolute Symbols>";
@@ -519,9 +526,9 @@ AbsoluteSymbolsMaterializationUnit::extractFlags(const SymbolMap &Symbols) {
 }
 
 ReExportsMaterializationUnit::ReExportsMaterializationUnit(
-    JITDylib *SourceJD, SymbolAliasMap Aliases)
-    : MaterializationUnit(extractFlags(Aliases)), SourceJD(SourceJD),
-      Aliases(std::move(Aliases)) {}
+    JITDylib *SourceJD, SymbolAliasMap Aliases, VModuleKey K)
+    : MaterializationUnit(extractFlags(Aliases), std::move(K)),
+      SourceJD(SourceJD), Aliases(std::move(Aliases)) {}
 
 StringRef ReExportsMaterializationUnit::getName() const {
   return "<Reexports>";
diff --git a/lib/ExecutionEngine/Orc/IRCompileLayer.cpp b/lib/ExecutionEngine/Orc/IRCompileLayer.cpp
index 6d029e16ba9..d952d1be70d 100644
--- a/lib/ExecutionEngine/Orc/IRCompileLayer.cpp
+++ b/lib/ExecutionEngine/Orc/IRCompileLayer.cpp
@@ -21,19 +21,19 @@ void IRCompileLayer::setNotifyCompiled(NotifyCompiledFunction NotifyCompiled) {
   this->NotifyCompiled = std::move(NotifyCompiled);
 }
 
-void IRCompileLayer::emit(MaterializationResponsibility R, VModuleKey K,
-                           ThreadSafeModule TSM) {
+void IRCompileLayer::emit(MaterializationResponsibility R,
+                          ThreadSafeModule TSM) {
   assert(TSM.getModule() && "Module must not be null");
 
   if (auto Obj = Compile(*TSM.getModule())) {
     {
       std::lock_guard<std::mutex> Lock(IRLayerMutex);
       if (NotifyCompiled)
-        NotifyCompiled(K, std::move(TSM));
+        NotifyCompiled(R.getVModuleKey(), std::move(TSM));
       else
         TSM = ThreadSafeModule();
     }
-    BaseLayer.emit(std::move(R), std::move(K), std::move(*Obj));
+    BaseLayer.emit(std::move(R), std::move(*Obj));
   } else {
     R.failMaterialization();
     getExecutionSession().reportError(Obj.takeError());
diff --git a/lib/ExecutionEngine/Orc/IRTransformLayer.cpp b/lib/ExecutionEngine/Orc/IRTransformLayer.cpp
index acba7916d40..7bc0d696e3a 100644
--- a/lib/ExecutionEngine/Orc/IRTransformLayer.cpp
+++ b/lib/ExecutionEngine/Orc/IRTransformLayer.cpp
@@ -18,12 +18,12 @@ IRTransformLayer::IRTransformLayer(ExecutionSession &ES,
                                      TransformFunction Transform)
     : IRLayer(ES), BaseLayer(BaseLayer), Transform(std::move(Transform)) {}
 
-void IRTransformLayer::emit(MaterializationResponsibility R, VModuleKey K,
-                             ThreadSafeModule TSM) {
+void IRTransformLayer::emit(MaterializationResponsibility R,
+                            ThreadSafeModule TSM) {
   assert(TSM.getModule() && "Module must not be null");
 
   if (auto TransformedTSM = Transform(std::move(TSM), R))
-    BaseLayer.emit(std::move(R), std::move(K), std::move(*TransformedTSM));
+    BaseLayer.emit(std::move(R), std::move(*TransformedTSM));
   else {
     R.failMaterialization();
     getExecutionSession().reportError(TransformedTSM.takeError());
diff --git a/lib/ExecutionEngine/Orc/IndirectionUtils.cpp b/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
index 6bc33c90cbc..c10d15ab117 100644
--- a/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
+++ b/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
@@ -27,8 +27,9 @@ public:
   using CompileFunction = JITCompileCallbackManager::CompileFunction;
 
   CompileCallbackMaterializationUnit(SymbolStringPtr Name,
-                                     CompileFunction Compile)
-      : MaterializationUnit(SymbolFlagsMap({{Name, JITSymbolFlags::Exported}})),
+                                     CompileFunction Compile, VModuleKey K)
+      : MaterializationUnit(SymbolFlagsMap({{Name, JITSymbolFlags::Exported}}),
+                            std::move(K)),
         Name(std::move(Name)), Compile(std::move(Compile)) {}
 
   StringRef getName() const override { return "<Compile Callbacks>"; }
@@ -67,7 +68,8 @@ JITCompileCallbackManager::getCompileCallback(CompileFunction Compile) {
     AddrToSymbol[*TrampolineAddr] = CallbackName;
     cantFail(CallbacksJD.define(
         llvm::make_unique<CompileCallbackMaterializationUnit>(
-            std::move(CallbackName), std::move(Compile))));
+            std::move(CallbackName), std::move(Compile),
+            ES.allocateVModule())));
     return *TrampolineAddr;
   } else
     return TrampolineAddr.takeError();
diff --git a/lib/ExecutionEngine/Orc/LLJIT.cpp b/lib/ExecutionEngine/Orc/LLJIT.cpp
index e464da267ae..ac71a5e7673 100644
--- a/lib/ExecutionEngine/Orc/LLJIT.cpp
+++ b/lib/ExecutionEngine/Orc/LLJIT.cpp
@@ -65,15 +65,13 @@ Error LLJIT::addIRModule(JITDylib &JD, ThreadSafeModule TSM) {
   if (auto Err = applyDataLayout(*TSM.getModule()))
     return Err;
 
-  auto K = ES->allocateVModule();
-  return CompileLayer.add(JD, K, std::move(TSM));
+  return CompileLayer.add(JD, std::move(TSM), ES->allocateVModule());
 }
 
 Error LLJIT::addObjectFile(JITDylib &JD, std::unique_ptr<MemoryBuffer> Obj) {
   assert(Obj && "Can not add null object");
 
-  auto K = ES->allocateVModule();
-  return ObjLinkingLayer.add(JD, K, std::move(Obj));
+  return ObjLinkingLayer.add(JD, std::move(Obj), ES->allocateVModule());
 }
 
 Expected<JITEvaluatedSymbol> LLJIT::lookupLinkerMangled(JITDylib &JD,
@@ -84,8 +82,9 @@ Expected<JITEvaluatedSymbol> LLJIT::lookupLinkerMangled(JITDylib &JD,
 LLJIT::LLJIT(std::unique_ptr<ExecutionSession> ES,
              std::unique_ptr<TargetMachine> TM, DataLayout DL)
     : ES(std::move(ES)), Main(this->ES->getMainJITDylib()), DL(std::move(DL)),
-      ObjLinkingLayer(*this->ES,
-                      [this](VModuleKey K) { return getMemoryManager(K); }),
+      ObjLinkingLayer(
+          *this->ES,
+          []() { return llvm::make_unique<SectionMemoryManager>(); }),
       CompileLayer(*this->ES, ObjLinkingLayer,
                    TMOwningSimpleCompiler(std::move(TM))),
       CtorRunner(Main), DtorRunner(Main) {}
@@ -93,8 +92,9 @@ LLJIT::LLJIT(std::unique_ptr<ExecutionSession> ES,
 LLJIT::LLJIT(std::unique_ptr<ExecutionSession> ES, JITTargetMachineBuilder JTMB,
              DataLayout DL, unsigned NumCompileThreads)
     : ES(std::move(ES)), Main(this->ES->getMainJITDylib()), DL(std::move(DL)),
-      ObjLinkingLayer(*this->ES,
-                      [this](VModuleKey K) { return getMemoryManager(K); }),
+      ObjLinkingLayer(
+          *this->ES,
+          []() { return llvm::make_unique<SectionMemoryManager>(); }),
       CompileLayer(*this->ES, ObjLinkingLayer,
                    ConcurrentIRCompiler(std::move(JTMB))),
       CtorRunner(Main), DtorRunner(Main) {
@@ -117,11 +117,6 @@ LLJIT::LLJIT(std::unique_ptr<ExecutionSession> ES, JITTargetMachineBuilder JTMB,
       });
 }
 
-std::unique_ptr<RuntimeDyld::MemoryManager>
-LLJIT::getMemoryManager(VModuleKey K) {
-  return llvm::make_unique<SectionMemoryManager>();
-}
-
 std::string LLJIT::mangle(StringRef UnmangledName) {
   std::string MangledName;
   {
@@ -187,8 +182,7 @@ Error LLLazyJIT::addLazyIRModule(JITDylib &JD, ThreadSafeModule TSM) {
 
   recordCtorDtors(*TSM.getModule());
 
-  auto K = ES->allocateVModule();
-  return CODLayer.add(JD, K, std::move(TSM));
+  return CODLayer.add(JD, std::move(TSM), ES->allocateVModule());
 }
 
 LLLazyJIT::LLLazyJIT(
diff --git a/lib/ExecutionEngine/Orc/Layer.cpp b/lib/ExecutionEngine/Orc/Layer.cpp
index 22dbf5c26d1..11af76825e9 100644
--- a/lib/ExecutionEngine/Orc/Layer.cpp
+++ b/lib/ExecutionEngine/Orc/Layer.cpp
@@ -19,14 +19,14 @@ namespace orc {
 IRLayer::IRLayer(ExecutionSession &ES) : ES(ES) {}
 IRLayer::~IRLayer() {}
 
-Error IRLayer::add(JITDylib &JD, VModuleKey K, ThreadSafeModule TSM) {
+Error IRLayer::add(JITDylib &JD, ThreadSafeModule TSM, VModuleKey K) {
   return JD.define(llvm::make_unique<BasicIRLayerMaterializationUnit>(
       *this, std::move(K), std::move(TSM)));
 }
 
 IRMaterializationUnit::IRMaterializationUnit(ExecutionSession &ES,
-                                             ThreadSafeModule TSM)
-    : MaterializationUnit(SymbolFlagsMap()), TSM(std::move(TSM)) {
+                                             ThreadSafeModule TSM, VModuleKey K)
+    : MaterializationUnit(SymbolFlagsMap(), std::move(K)), TSM(std::move(TSM)) {
 
   assert(this->TSM && "Module must not be null");
 
@@ -42,10 +42,10 @@ IRMaterializationUnit::IRMaterializationUnit(ExecutionSession &ES,
 }
 
 IRMaterializationUnit::IRMaterializationUnit(
-    ThreadSafeModule TSM, SymbolFlagsMap SymbolFlags,
+    ThreadSafeModule TSM, VModuleKey K, SymbolFlagsMap SymbolFlags,
     SymbolNameToDefinitionMap SymbolToDefinition)
-    : MaterializationUnit(std::move(SymbolFlags)), TSM(std::move(TSM)),
-      SymbolToDefinition(std::move(SymbolToDefinition)) {}
+    : MaterializationUnit(std::move(SymbolFlags), std::move(K)),
+      TSM(std::move(TSM)), SymbolToDefinition(std::move(SymbolToDefinition)) {}
 
 StringRef IRMaterializationUnit::getName() const {
   if (TSM.getModule())
@@ -71,8 +71,9 @@ void IRMaterializationUnit::discard(const JITDylib &JD,
 
 BasicIRLayerMaterializationUnit::BasicIRLayerMaterializationUnit(
     IRLayer &L, VModuleKey K, ThreadSafeModule TSM)
-    : IRMaterializationUnit(L.getExecutionSession(), std::move(TSM)), L(L),
-      K(std::move(K)) {}
+    : IRMaterializationUnit(L.getExecutionSession(), std::move(TSM),
+                            std::move(K)),
+      L(L), K(std::move(K)) {}
 
 void BasicIRLayerMaterializationUnit::materialize(
     MaterializationResponsibility R) {
@@ -94,7 +95,7 @@ void BasicIRLayerMaterializationUnit::materialize(
     dbgs() << "Emitting, for " << R.getTargetJITDylib().getName() << ", "
            << *this << "\n";
   }););
-  L.emit(std::move(R), std::move(K), std::move(TSM));
+  L.emit(std::move(R), std::move(TSM));
   LLVM_DEBUG(ES.runSessionLocked([&]() {
     dbgs() << "Finished emitting, for " << R.getTargetJITDylib().getName()
            << ", " << *this << "\n";
@@ -105,8 +106,8 @@ ObjectLayer::ObjectLayer(ExecutionSession &ES) : ES(ES) {}
 
 ObjectLayer::~ObjectLayer() {}
 
-Error ObjectLayer::add(JITDylib &JD, VModuleKey K,
-                       std::unique_ptr<MemoryBuffer> O) {
+Error ObjectLayer::add(JITDylib &JD, std::unique_ptr<MemoryBuffer> O,
+                       VModuleKey K) {
   auto ObjMU = BasicObjectLayerMaterializationUnit::Create(*this, std::move(K),
                                                            std::move(O));
   if (!ObjMU)
@@ -131,7 +132,7 @@ BasicObjectLayerMaterializationUnit::Create(ObjectLayer &L, VModuleKey K,
 BasicObjectLayerMaterializationUnit::BasicObjectLayerMaterializationUnit(
     ObjectLayer &L, VModuleKey K, std::unique_ptr<MemoryBuffer> O,
     SymbolFlagsMap SymbolFlags)
-    : MaterializationUnit(std::move(SymbolFlags)), L(L), K(std::move(K)),
+    : MaterializationUnit(std::move(SymbolFlags), std::move(K)), L(L),
       O(std::move(O)) {}
 
 StringRef BasicObjectLayerMaterializationUnit::getName() const {
@@ -142,7 +143,7 @@ StringRef BasicObjectLayerMaterializationUnit::getName() const {
 
 void BasicObjectLayerMaterializationUnit::materialize(
     MaterializationResponsibility R) {
-  L.emit(std::move(R), std::move(K), std::move(O));
+  L.emit(std::move(R), std::move(O));
 }
 
 void BasicObjectLayerMaterializationUnit::discard(const JITDylib &JD,
diff --git a/lib/ExecutionEngine/Orc/LazyReexports.cpp b/lib/ExecutionEngine/Orc/LazyReexports.cpp
index 1cce0c6cd2c..af4c508d7f1 100644
--- a/lib/ExecutionEngine/Orc/LazyReexports.cpp
+++ b/lib/ExecutionEngine/Orc/LazyReexports.cpp
@@ -125,8 +125,8 @@ createLocalLazyCallThroughManager(const Triple &T, ExecutionSession &ES,
 
 LazyReexportsMaterializationUnit::LazyReexportsMaterializationUnit(
     LazyCallThroughManager &LCTManager, IndirectStubsManager &ISManager,
-    JITDylib &SourceJD, SymbolAliasMap CallableAliases)
-    : MaterializationUnit(extractFlags(CallableAliases)),
+    JITDylib &SourceJD, SymbolAliasMap CallableAliases, VModuleKey K)
+    : MaterializationUnit(extractFlags(CallableAliases), std::move(K)),
       LCTManager(LCTManager), ISManager(ISManager), SourceJD(SourceJD),
       CallableAliases(std::move(CallableAliases)),
       NotifyResolved(LazyCallThroughManager::createNotifyResolvedFunction(
diff --git a/lib/ExecutionEngine/Orc/ObjectTransformLayer.cpp b/lib/ExecutionEngine/Orc/ObjectTransformLayer.cpp
index 0be23f2e1a4..825f5320473 100644
--- a/lib/ExecutionEngine/Orc/ObjectTransformLayer.cpp
+++ b/lib/ExecutionEngine/Orc/ObjectTransformLayer.cpp
@@ -18,12 +18,12 @@ ObjectTransformLayer::ObjectTransformLayer(ExecutionSession &ES,
                                             TransformFunction Transform)
     : ObjectLayer(ES), BaseLayer(BaseLayer), Transform(std::move(Transform)) {}
 
-void ObjectTransformLayer::emit(MaterializationResponsibility R, VModuleKey K,
+void ObjectTransformLayer::emit(MaterializationResponsibility R,
                                 std::unique_ptr<MemoryBuffer> O) {
   assert(O && "Module must not be null");
 
   if (auto TransformedObj = Transform(std::move(O)))
-    BaseLayer.emit(std::move(R), std::move(K), std::move(*TransformedObj));
+    BaseLayer.emit(std::move(R), std::move(*TransformedObj));
   else {
     R.failMaterialization();
     getExecutionSession().reportError(TransformedObj.takeError());
diff --git a/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp b/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp
index fa574140d48..8511e41c4f2 100644
--- a/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp
+++ b/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp
@@ -84,8 +84,7 @@ RTDyldObjectLinkingLayer::RTDyldObjectLinkingLayer(
       NotifyEmitted(std::move(NotifyEmitted)) {}
 
 void RTDyldObjectLinkingLayer::emit(MaterializationResponsibility R,
-                                     VModuleKey K,
-                                     std::unique_ptr<MemoryBuffer> O) {
+                                    std::unique_ptr<MemoryBuffer> O) {
   assert(O && "Object must not be null");
 
   // This method launches an asynchronous link step that will fulfill our
@@ -121,15 +120,9 @@ void RTDyldObjectLinkingLayer::emit(MaterializationResponsibility R,
     }
   }
 
-  auto MemoryManager = GetMemoryManager(K);
-  auto &MemMgr = *MemoryManager;
-  {
-    std::lock_guard<std::mutex> Lock(RTDyldLayerMutex);
-
-    assert(!MemMgrs.count(K) &&
-           "A memory manager already exists for this key?");
-    MemMgrs[K] = std::move(MemoryManager);
-  }
+  auto K = R.getVModuleKey();
+  MemMgrs.push_back(GetMemoryManager());
+  auto &MemMgr = *MemMgrs.back();
 
   JITDylibSearchOrderResolver Resolver(*SharedR);
 
diff --git a/unittests/ExecutionEngine/Orc/OrcTestCommon.h b/unittests/ExecutionEngine/Orc/OrcTestCommon.h
index 284a1e37f10..e76d2fae5e3 100644
--- a/unittests/ExecutionEngine/Orc/OrcTestCommon.h
+++ b/unittests/ExecutionEngine/Orc/OrcTestCommon.h
@@ -97,7 +97,7 @@ public:
       orc::SymbolFlagsMap SymbolFlags, MaterializeFunction Materialize,
       DiscardFunction Discard = DiscardFunction(),
       DestructorFunction Destructor = DestructorFunction())
-      : MaterializationUnit(std::move(SymbolFlags)),
+      : MaterializationUnit(std::move(SymbolFlags), orc::VModuleKey()),
         Materialize(std::move(Materialize)), Discard(std::move(Discard)),
         Destructor(std::move(Destructor)) {}
 
diff --git a/unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayerTest.cpp b/unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayerTest.cpp
index 75ccfc9ab0d..1660670ae63 100644
--- a/unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayerTest.cpp
+++ b/unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayerTest.cpp
@@ -54,7 +54,7 @@ static bool testSetProcessAllSections(std::unique_ptr<MemoryBuffer> Obj,
   auto &JD = ES.createJITDylib("main");
   auto Foo = ES.intern("foo");
 
-  RTDyldObjectLinkingLayer ObjLayer(ES, [&DebugSectionSeen](VModuleKey) {
+  RTDyldObjectLinkingLayer ObjLayer(ES, [&DebugSectionSeen]() {
     return llvm::make_unique<MemoryManagerWrapper>(DebugSectionSeen);
   });
 
@@ -65,8 +65,7 @@ static bool testSetProcessAllSections(std::unique_ptr<MemoryBuffer> Obj,
   auto OnReadyDoNothing = [](Error Err) { cantFail(std::move(Err)); };
 
   ObjLayer.setProcessAllSections(ProcessAllSections);
-  auto K = ES.allocateVModule();
-  cantFail(ObjLayer.add(JD, K, std::move(Obj)));
+  cantFail(ObjLayer.add(JD, std::move(Obj), ES.allocateVModule()));
   ES.lookup({&JD}, {Foo}, OnResolveDoNothing, OnReadyDoNothing,
             NoDependenciesToRegister);
   return DebugSectionSeen;
@@ -152,12 +151,12 @@ TEST(RTDyldObjectLinkingLayerTest, TestOverrideObjectFlags) {
   auto &JD = ES.createJITDylib("main");
   auto Foo = ES.intern("foo");
   RTDyldObjectLinkingLayer ObjLayer(
-      ES, [](VModuleKey) { return llvm::make_unique<SectionMemoryManager>(); });
+      ES, []() { return llvm::make_unique<SectionMemoryManager>(); });
   IRCompileLayer CompileLayer(ES, ObjLayer, FunkySimpleCompiler(*TM));
 
   ObjLayer.setOverrideObjectFlagsWithResponsibilityFlags(true);
 
-  cantFail(CompileLayer.add(JD, ES.allocateVModule(), std::move(M)));
+  cantFail(CompileLayer.add(JD, std::move(M), ES.allocateVModule()));
   ES.lookup({&JD}, {Foo}, [](Expected<SymbolMap> R) { cantFail(std::move(R)); },
             [](Error Err) { cantFail(std::move(Err)); },
             NoDependenciesToRegister);
@@ -214,12 +213,12 @@ TEST(RTDyldObjectLinkingLayerTest, TestAutoClaimResponsibilityForSymbols) {
   auto &JD = ES.createJITDylib("main");
   auto Foo = ES.intern("foo");
   RTDyldObjectLinkingLayer ObjLayer(
-      ES, [](VModuleKey) { return llvm::make_unique<SectionMemoryManager>(); });
+      ES, []() { return llvm::make_unique<SectionMemoryManager>(); });
   IRCompileLayer CompileLayer(ES, ObjLayer, FunkySimpleCompiler(*TM));
 
   ObjLayer.setAutoClaimResponsibilityForObjectSymbols(true);
 
-  cantFail(CompileLayer.add(JD, ES.allocateVModule(), std::move(M)));
+  cantFail(CompileLayer.add(JD, std::move(M), ES.allocateVModule()));
   ES.lookup({&JD}, {Foo}, [](Expected<SymbolMap> R) { cantFail(std::move(R)); },
             [](Error Err) { cantFail(std::move(Err)); },
             NoDependenciesToRegister);
-- 
GitLab


From 2914ea59ea6f03329456a09743adca0f96e87bf3 Mon Sep 17 00:00:00 2001
From: David Bolvansky <david.bolvansky@gmail.com>
Date: Tue, 16 Oct 2018 21:18:31 +0000
Subject: [PATCH 0258/1116] [InstCombine] Cleanup libfunc attribute inferring

Reviewers: efriedma

Reviewed By: efriedma

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D53338

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344645 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Transforms/Utils/BuildLibCalls.h |   3 +-
 lib/Transforms/IPO/InferFunctionAttrs.cpp     |   2 +-
 lib/Transforms/Scalar/LoopIdiomRecognize.cpp  |   5 +-
 lib/Transforms/Utils/BuildLibCalls.cpp        | 121 ++++++++++--------
 lib/Transforms/Utils/SimplifyLibCalls.cpp     |   2 +-
 test/Transforms/InstCombine/pr39177.ll        |  13 +-
 6 files changed, 88 insertions(+), 58 deletions(-)

diff --git a/include/llvm/Transforms/Utils/BuildLibCalls.h b/include/llvm/Transforms/Utils/BuildLibCalls.h
index ab7d22c024c..eafe07f4928 100644
--- a/include/llvm/Transforms/Utils/BuildLibCalls.h
+++ b/include/llvm/Transforms/Utils/BuildLibCalls.h
@@ -28,7 +28,8 @@ namespace llvm {
   /// If the library function is unavailable, this doesn't modify it.
   ///
   /// Returns true if any attributes were set and false otherwise.
-  bool inferLibFuncAttributes(Function *Func, const TargetLibraryInfo &TLI);
+  bool inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI);
+  bool inferLibFuncAttributes(Module *M, StringRef Name, const TargetLibraryInfo &TLI);
 
   /// Check whether the overloaded unary floating point function
   /// corresponding to \a Ty is available.
diff --git a/lib/Transforms/IPO/InferFunctionAttrs.cpp b/lib/Transforms/IPO/InferFunctionAttrs.cpp
index c53a9b5e819..470f97b8ba6 100644
--- a/lib/Transforms/IPO/InferFunctionAttrs.cpp
+++ b/lib/Transforms/IPO/InferFunctionAttrs.cpp
@@ -27,7 +27,7 @@ static bool inferAllPrototypeAttributes(Module &M,
     // We only infer things using the prototype and the name; we don't need
     // definitions.
     if (F.isDeclaration() && !F.hasFnAttribute((Attribute::OptimizeNone)))
-      Changed |= inferLibFuncAttributes(&F, TLI);
+      Changed |= inferLibFuncAttributes(F, TLI);
 
   return Changed;
 }
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 4b375956a12..241dbed30e1 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -928,10 +928,11 @@ bool LoopIdiomRecognize::processLoopStridedStore(
     Type *Int8PtrTy = DestInt8PtrTy;
 
     Module *M = TheStore->getModule();
+    StringRef FuncName = "memset_pattern16";
     Value *MSP =
-        M->getOrInsertFunction("memset_pattern16", Builder.getVoidTy(),
+        M->getOrInsertFunction(FuncName, Builder.getVoidTy(),
                                Int8PtrTy, Int8PtrTy, IntPtr);
-    inferLibFuncAttributes(M->getFunction("memset_pattern16"), *TLI);
+    inferLibFuncAttributes(M, FuncName, *TLI);
 
     // Otherwise we should form a memset_pattern16.  PatternValue is known to be
     // an constant array of 16-bytes.  Plop the value into a mergable global.
diff --git a/lib/Transforms/Utils/BuildLibCalls.cpp b/lib/Transforms/Utils/BuildLibCalls.cpp
index 234449b2bf8..06d197be095 100644
--- a/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -121,11 +121,15 @@ static bool setNonLazyBind(Function &F) {
   return true;
 }
 
-bool llvm::inferLibFuncAttributes(Function *Func,
+bool llvm::inferLibFuncAttributes(Module *M, StringRef Name,
                                   const TargetLibraryInfo &TLI) {
-  if (!Func)
+  Function *F = M->getFunction(Name);
+  if (!F)
     return false;
-  Function &F = *Func;
+  return inferLibFuncAttributes(*F, TLI);
+}
+
+bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
   LibFunc TheLibFunc;
   if (!(TLI.getLibFunc(F, TheLibFunc) && TLI.has(TheLibFunc)))
     return false;
@@ -774,11 +778,12 @@ Value *llvm::emitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout &DL,
     return nullptr;
 
   Module *M = B.GetInsertBlock()->getModule();
+  StringRef StrlenName = TLI->getName(LibFunc_strlen);
   LLVMContext &Context = B.GetInsertBlock()->getContext();
-  Constant *StrLen = M->getOrInsertFunction("strlen", DL.getIntPtrType(Context),
+  Constant *StrLen = M->getOrInsertFunction(StrlenName, DL.getIntPtrType(Context),
                                             B.getInt8PtrTy());
-  inferLibFuncAttributes(M->getFunction("strlen"), *TLI);
-  CallInst *CI = B.CreateCall(StrLen, castToCStr(Ptr, B), "strlen");
+  inferLibFuncAttributes(M, StrlenName, *TLI);
+  CallInst *CI = B.CreateCall(StrLen, castToCStr(Ptr, B), StrlenName);
   if (const Function *F = dyn_cast<Function>(StrLen->stripPointerCasts()))
     CI->setCallingConv(F->getCallingConv());
 
@@ -791,13 +796,14 @@ Value *llvm::emitStrChr(Value *Ptr, char C, IRBuilder<> &B,
     return nullptr;
 
   Module *M = B.GetInsertBlock()->getModule();
+  StringRef StrChrName = TLI->getName(LibFunc_strchr);
   Type *I8Ptr = B.getInt8PtrTy();
   Type *I32Ty = B.getInt32Ty();
   Constant *StrChr =
-      M->getOrInsertFunction("strchr", I8Ptr, I8Ptr, I32Ty);
-  inferLibFuncAttributes(M->getFunction("strchr"), *TLI);
+      M->getOrInsertFunction(StrChrName, I8Ptr, I8Ptr, I32Ty);
+  inferLibFuncAttributes(M, StrChrName, *TLI);
   CallInst *CI = B.CreateCall(
-      StrChr, {castToCStr(Ptr, B), ConstantInt::get(I32Ty, C)}, "strchr");
+      StrChr, {castToCStr(Ptr, B), ConstantInt::get(I32Ty, C)}, StrChrName);
   if (const Function *F = dyn_cast<Function>(StrChr->stripPointerCasts()))
     CI->setCallingConv(F->getCallingConv());
   return CI;
@@ -809,13 +815,14 @@ Value *llvm::emitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B,
     return nullptr;
 
   Module *M = B.GetInsertBlock()->getModule();
+  StringRef StrNCmpName = TLI->getName(LibFunc_strncmp);
   LLVMContext &Context = B.GetInsertBlock()->getContext();
-  Value *StrNCmp = M->getOrInsertFunction("strncmp", B.getInt32Ty(),
+  Value *StrNCmp = M->getOrInsertFunction(StrNCmpName, B.getInt32Ty(),
                                           B.getInt8PtrTy(), B.getInt8PtrTy(),
                                           DL.getIntPtrType(Context));
-  inferLibFuncAttributes(M->getFunction("strncmp"), *TLI);
+  inferLibFuncAttributes(M, StrNCmpName, *TLI);
   CallInst *CI = B.CreateCall(
-      StrNCmp, {castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, "strncmp");
+      StrNCmp, {castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, StrNCmpName);
 
   if (const Function *F = dyn_cast<Function>(StrNCmp->stripPointerCasts()))
     CI->setCallingConv(F->getCallingConv());
@@ -831,7 +838,7 @@ Value *llvm::emitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B,
   Module *M = B.GetInsertBlock()->getModule();
   Type *I8Ptr = B.getInt8PtrTy();
   Value *StrCpy = M->getOrInsertFunction(Name, I8Ptr, I8Ptr, I8Ptr);
-  inferLibFuncAttributes(M->getFunction(Name), *TLI);
+  inferLibFuncAttributes(M, Name, *TLI);
   CallInst *CI =
       B.CreateCall(StrCpy, {castToCStr(Dst, B), castToCStr(Src, B)}, Name);
   if (const Function *F = dyn_cast<Function>(StrCpy->stripPointerCasts()))
@@ -848,9 +855,9 @@ Value *llvm::emitStrNCpy(Value *Dst, Value *Src, Value *Len, IRBuilder<> &B,
   Type *I8Ptr = B.getInt8PtrTy();
   Value *StrNCpy = M->getOrInsertFunction(Name, I8Ptr, I8Ptr, I8Ptr,
                                           Len->getType());
-  inferLibFuncAttributes(M->getFunction(Name), *TLI);
+  inferLibFuncAttributes(M, Name, *TLI);
   CallInst *CI = B.CreateCall(
-      StrNCpy, {castToCStr(Dst, B), castToCStr(Src, B), Len}, "strncpy");
+      StrNCpy, {castToCStr(Dst, B), castToCStr(Src, B), Len}, Name);
   if (const Function *F = dyn_cast<Function>(StrNCpy->stripPointerCasts()))
     CI->setCallingConv(F->getCallingConv());
   return CI;
@@ -885,12 +892,13 @@ Value *llvm::emitMemChr(Value *Ptr, Value *Val, Value *Len, IRBuilder<> &B,
     return nullptr;
 
   Module *M = B.GetInsertBlock()->getModule();
+  StringRef MemChrName = TLI->getName(LibFunc_memchr);
   LLVMContext &Context = B.GetInsertBlock()->getContext();
-  Value *MemChr = M->getOrInsertFunction("memchr", B.getInt8PtrTy(),
+  Value *MemChr = M->getOrInsertFunction(MemChrName, B.getInt8PtrTy(),
                                          B.getInt8PtrTy(), B.getInt32Ty(),
                                          DL.getIntPtrType(Context));
-  inferLibFuncAttributes(M->getFunction("memchr"), *TLI);
-  CallInst *CI = B.CreateCall(MemChr, {castToCStr(Ptr, B), Val, Len}, "memchr");
+  inferLibFuncAttributes(M, MemChrName, *TLI);
+  CallInst *CI = B.CreateCall(MemChr, {castToCStr(Ptr, B), Val, Len}, MemChrName);
 
   if (const Function *F = dyn_cast<Function>(MemChr->stripPointerCasts()))
     CI->setCallingConv(F->getCallingConv());
@@ -904,13 +912,14 @@ Value *llvm::emitMemCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B,
     return nullptr;
 
   Module *M = B.GetInsertBlock()->getModule();
+  StringRef MemCmpName = TLI->getName(LibFunc_memcmp);
   LLVMContext &Context = B.GetInsertBlock()->getContext();
-  Value *MemCmp = M->getOrInsertFunction("memcmp", B.getInt32Ty(),
+  Value *MemCmp = M->getOrInsertFunction(MemCmpName, B.getInt32Ty(),
                                          B.getInt8PtrTy(), B.getInt8PtrTy(),
                                          DL.getIntPtrType(Context));
-  inferLibFuncAttributes(M->getFunction("memcmp"), *TLI);
+  inferLibFuncAttributes(M, MemCmpName, *TLI);
   CallInst *CI = B.CreateCall(
-      MemCmp, {castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, "memcmp");
+      MemCmp, {castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, MemCmpName);
 
   if (const Function *F = dyn_cast<Function>(MemCmp->stripPointerCasts()))
     CI->setCallingConv(F->getCallingConv());
@@ -977,14 +986,15 @@ Value *llvm::emitPutChar(Value *Char, IRBuilder<> &B,
     return nullptr;
 
   Module *M = B.GetInsertBlock()->getModule();
-  Value *PutChar = M->getOrInsertFunction("putchar", B.getInt32Ty(), B.getInt32Ty());
-  inferLibFuncAttributes(M->getFunction("putchar"), *TLI);
+  StringRef PutCharName = TLI->getName(LibFunc_putchar);
+  Value *PutChar = M->getOrInsertFunction(PutCharName, B.getInt32Ty(), B.getInt32Ty());
+  inferLibFuncAttributes(M, PutCharName, *TLI);
   CallInst *CI = B.CreateCall(PutChar,
                               B.CreateIntCast(Char,
                               B.getInt32Ty(),
                               /*isSigned*/true,
                               "chari"),
-                              "putchar");
+                              PutCharName);
 
   if (const Function *F = dyn_cast<Function>(PutChar->stripPointerCasts()))
     CI->setCallingConv(F->getCallingConv());
@@ -997,10 +1007,11 @@ Value *llvm::emitPutS(Value *Str, IRBuilder<> &B,
     return nullptr;
 
   Module *M = B.GetInsertBlock()->getModule();
+  StringRef PutsName = TLI->getName(LibFunc_puts);
   Value *PutS =
-      M->getOrInsertFunction("puts", B.getInt32Ty(), B.getInt8PtrTy());
-  inferLibFuncAttributes(M->getFunction("puts"), *TLI);
-  CallInst *CI = B.CreateCall(PutS, castToCStr(Str, B), "puts");
+      M->getOrInsertFunction(PutsName, B.getInt32Ty(), B.getInt8PtrTy());
+  inferLibFuncAttributes(M, PutsName, *TLI);
+  CallInst *CI = B.CreateCall(PutS, castToCStr(Str, B), PutsName);
   if (const Function *F = dyn_cast<Function>(PutS->stripPointerCasts()))
     CI->setCallingConv(F->getCallingConv());
   return CI;
@@ -1012,13 +1023,14 @@ Value *llvm::emitFPutC(Value *Char, Value *File, IRBuilder<> &B,
     return nullptr;
 
   Module *M = B.GetInsertBlock()->getModule();
-  Constant *F = M->getOrInsertFunction("fputc", B.getInt32Ty(), B.getInt32Ty(),
+  StringRef FPutcName = TLI->getName(LibFunc_fputc);
+  Constant *F = M->getOrInsertFunction(FPutcName, B.getInt32Ty(), B.getInt32Ty(),
                                        File->getType());
   if (File->getType()->isPointerTy())
-    inferLibFuncAttributes(M->getFunction("fputc"), *TLI);
+    inferLibFuncAttributes(M, FPutcName, *TLI);
   Char = B.CreateIntCast(Char, B.getInt32Ty(), /*isSigned*/true,
                          "chari");
-  CallInst *CI = B.CreateCall(F, {Char, File}, "fputc");
+  CallInst *CI = B.CreateCall(F, {Char, File}, FPutcName);
 
   if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
     CI->setCallingConv(Fn->getCallingConv());
@@ -1031,12 +1043,13 @@ Value *llvm::emitFPutCUnlocked(Value *Char, Value *File, IRBuilder<> &B,
     return nullptr;
 
   Module *M = B.GetInsertBlock()->getModule();
-  Constant *F = M->getOrInsertFunction("fputc_unlocked", B.getInt32Ty(),
+  StringRef FPutcUnlockedName = TLI->getName(LibFunc_fputc_unlocked);
+  Constant *F = M->getOrInsertFunction(FPutcUnlockedName, B.getInt32Ty(),
                                        B.getInt32Ty(), File->getType());
   if (File->getType()->isPointerTy())
-    inferLibFuncAttributes(M->getFunction("fputc_unlocked"), *TLI);
+    inferLibFuncAttributes(M, FPutcUnlockedName, *TLI);
   Char = B.CreateIntCast(Char, B.getInt32Ty(), /*isSigned*/ true, "chari");
-  CallInst *CI = B.CreateCall(F, {Char, File}, "fputc_unlocked");
+  CallInst *CI = B.CreateCall(F, {Char, File}, FPutcUnlockedName);
 
   if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
     CI->setCallingConv(Fn->getCallingConv());
@@ -1053,8 +1066,8 @@ Value *llvm::emitFPutS(Value *Str, Value *File, IRBuilder<> &B,
   Constant *F = M->getOrInsertFunction(
       FPutsName, B.getInt32Ty(), B.getInt8PtrTy(), File->getType());
   if (File->getType()->isPointerTy())
-    inferLibFuncAttributes(M->getFunction(FPutsName), *TLI);
-  CallInst *CI = B.CreateCall(F, {castToCStr(Str, B), File}, "fputs");
+    inferLibFuncAttributes(M, FPutsName, *TLI);
+  CallInst *CI = B.CreateCall(F, {castToCStr(Str, B), File}, FPutsName);
 
   if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
     CI->setCallingConv(Fn->getCallingConv());
@@ -1071,8 +1084,8 @@ Value *llvm::emitFPutSUnlocked(Value *Str, Value *File, IRBuilder<> &B,
   Constant *F = M->getOrInsertFunction(FPutsUnlockedName, B.getInt32Ty(),
                                        B.getInt8PtrTy(), File->getType());
   if (File->getType()->isPointerTy())
-    inferLibFuncAttributes(M->getFunction(FPutsUnlockedName), *TLI);
-  CallInst *CI = B.CreateCall(F, {castToCStr(Str, B), File}, "fputs_unlocked");
+    inferLibFuncAttributes(M, FPutsUnlockedName, *TLI);
+  CallInst *CI = B.CreateCall(F, {castToCStr(Str, B), File}, FPutsUnlockedName);
 
   if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
     CI->setCallingConv(Fn->getCallingConv());
@@ -1092,7 +1105,7 @@ Value *llvm::emitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilder<> &B,
       DL.getIntPtrType(Context), DL.getIntPtrType(Context), File->getType());
 
   if (File->getType()->isPointerTy())
-    inferLibFuncAttributes(M->getFunction(FWriteName), *TLI);
+    inferLibFuncAttributes(M, FWriteName, *TLI);
   CallInst *CI =
       B.CreateCall(F, {castToCStr(Ptr, B), Size,
                        ConstantInt::get(DL.getIntPtrType(Context), 1), File});
@@ -1108,11 +1121,12 @@ Value *llvm::emitMalloc(Value *Num, IRBuilder<> &B, const DataLayout &DL,
     return nullptr;
 
   Module *M = B.GetInsertBlock()->getModule();
+  StringRef MallocName = TLI->getName(LibFunc_malloc);
   LLVMContext &Context = B.GetInsertBlock()->getContext();
-  Value *Malloc = M->getOrInsertFunction("malloc", B.getInt8PtrTy(),
+  Value *Malloc = M->getOrInsertFunction(MallocName, B.getInt8PtrTy(),
                                          DL.getIntPtrType(Context));
-  inferLibFuncAttributes(M->getFunction("malloc"), *TLI);
-  CallInst *CI = B.CreateCall(Malloc, Num, "malloc");
+  inferLibFuncAttributes(M, MallocName, *TLI);
+  CallInst *CI = B.CreateCall(Malloc, Num, MallocName);
 
   if (const Function *F = dyn_cast<Function>(Malloc->stripPointerCasts()))
     CI->setCallingConv(F->getCallingConv());
@@ -1126,12 +1140,13 @@ Value *llvm::emitCalloc(Value *Num, Value *Size, const AttributeList &Attrs,
     return nullptr;
 
   Module *M = B.GetInsertBlock()->getModule();
+  StringRef CallocName = TLI.getName(LibFunc_calloc);
   const DataLayout &DL = M->getDataLayout();
   IntegerType *PtrType = DL.getIntPtrType((B.GetInsertBlock()->getContext()));
-  Value *Calloc = M->getOrInsertFunction("calloc", Attrs, B.getInt8PtrTy(),
+  Value *Calloc = M->getOrInsertFunction(CallocName, Attrs, B.getInt8PtrTy(),
                                          PtrType, PtrType);
-  inferLibFuncAttributes(M->getFunction("calloc"), TLI);
-  CallInst *CI = B.CreateCall(Calloc, {Num, Size}, "calloc");
+  inferLibFuncAttributes(M, CallocName, TLI);
+  CallInst *CI = B.CreateCall(Calloc, {Num, Size}, CallocName);
 
   if (const auto *F = dyn_cast<Function>(Calloc->stripPointerCasts()))
     CI->setCallingConv(F->getCallingConv());
@@ -1153,7 +1168,7 @@ Value *llvm::emitFWriteUnlocked(Value *Ptr, Value *Size, Value *N, Value *File,
       DL.getIntPtrType(Context), DL.getIntPtrType(Context), File->getType());
 
   if (File->getType()->isPointerTy())
-    inferLibFuncAttributes(M->getFunction(FWriteUnlockedName), *TLI);
+    inferLibFuncAttributes(M, FWriteUnlockedName, *TLI);
   CallInst *CI = B.CreateCall(F, {castToCStr(Ptr, B), Size, N, File});
 
   if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
@@ -1167,11 +1182,12 @@ Value *llvm::emitFGetCUnlocked(Value *File, IRBuilder<> &B,
     return nullptr;
 
   Module *M = B.GetInsertBlock()->getModule();
+  StringRef FGetCUnlockedName = TLI->getName(LibFunc_fgetc_unlocked);
   Constant *F =
-      M->getOrInsertFunction("fgetc_unlocked", B.getInt32Ty(), File->getType());
+      M->getOrInsertFunction(FGetCUnlockedName, B.getInt32Ty(), File->getType());
   if (File->getType()->isPointerTy())
-    inferLibFuncAttributes(M->getFunction("fgetc_unlocked"), *TLI);
-  CallInst *CI = B.CreateCall(F, File, "fgetc_unlocked");
+    inferLibFuncAttributes(M, FGetCUnlockedName, *TLI);
+  CallInst *CI = B.CreateCall(F, File, FGetCUnlockedName);
 
   if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
     CI->setCallingConv(Fn->getCallingConv());
@@ -1184,12 +1200,13 @@ Value *llvm::emitFGetSUnlocked(Value *Str, Value *Size, Value *File,
     return nullptr;
 
   Module *M = B.GetInsertBlock()->getModule();
+  StringRef FGetSUnlockedName = TLI->getName(LibFunc_fgets_unlocked);
   Constant *F =
-      M->getOrInsertFunction("fgets_unlocked", B.getInt8PtrTy(),
+      M->getOrInsertFunction(FGetSUnlockedName, B.getInt8PtrTy(),
                              B.getInt8PtrTy(), B.getInt32Ty(), File->getType());
-  inferLibFuncAttributes(M->getFunction("fgets_unlocked"), *TLI);
+  inferLibFuncAttributes(M, FGetSUnlockedName, *TLI);
   CallInst *CI =
-      B.CreateCall(F, {castToCStr(Str, B), Size, File}, "fgets_unlocked");
+      B.CreateCall(F, {castToCStr(Str, B), Size, File}, FGetSUnlockedName);
 
   if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
     CI->setCallingConv(Fn->getCallingConv());
@@ -1210,7 +1227,7 @@ Value *llvm::emitFReadUnlocked(Value *Ptr, Value *Size, Value *N, Value *File,
       DL.getIntPtrType(Context), DL.getIntPtrType(Context), File->getType());
 
   if (File->getType()->isPointerTy())
-    inferLibFuncAttributes(M->getFunction(FReadUnlockedName), *TLI);
+    inferLibFuncAttributes(M, FReadUnlockedName, *TLI);
   CallInst *CI = B.CreateCall(F, {castToCStr(Ptr, B), Size, N, File});
 
   if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 6f24dc10e1e..41a495a0484 100644
--- a/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -145,7 +145,7 @@ static bool isLocallyOpenedFile(Value *File, CallInst *CI, IRBuilder<> &B,
       Func != LibFunc_fopen)
     return false;
 
-  inferLibFuncAttributes(CI->getCalledFunction(), *TLI);
+  inferLibFuncAttributes(*CI->getCalledFunction(), *TLI);
   if (PointerMayBeCaptured(File, true, true))
     return false;
 
diff --git a/test/Transforms/InstCombine/pr39177.ll b/test/Transforms/InstCombine/pr39177.ll
index a047a079f58..35c5ce0d3f6 100644
--- a/test/Transforms/InstCombine/pr39177.ll
+++ b/test/Transforms/InstCombine/pr39177.ll
@@ -1,4 +1,5 @@
-; RUN: opt < %s -instcombine -S
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
 
 %struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] }
 %struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 }
@@ -9,6 +10,10 @@
 @fwrite = alias i64 (i8*, i64, i64, %struct._IO_FILE*), i64 (i8*, i64, i64, %struct._IO_FILE*)* @__fwrite_alias
 
 define i64 @__fwrite_alias(i8* %ptr, i64 %size, i64 %n, %struct._IO_FILE* %s) {
+; CHECK-LABEL: @__fwrite_alias(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret i64 0
+;
 entry:
   %ptr.addr = alloca i8*, align 8
   %size.addr = alloca i64, align 8
@@ -22,6 +27,12 @@ entry:
 }
 
 define void @foo() {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @__fwrite_alias(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i64 0, i64 0), i64 7, i64 1, %struct._IO_FILE* [[TMP0]])
+; CHECK-NEXT:    ret void
+;
 entry:
   %retval = alloca i32, align 4
   store i32 0, i32* %retval, align 4
-- 
GitLab


From 3b4af70c0920f9f190df5fd79a23ea315c8d8b15 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Tue, 16 Oct 2018 22:29:36 +0000
Subject: [PATCH 0259/1116] [X86] Match (cmp (and (shr X, C), mask), 0) to
 BEXTR+TEST.

Without this we match the CMP+AND to a TEST and then match the SHR separately. I'm trusting analyzeCompare to remove the TEST during the peephole pass. Otherwise we need to check the flag users to see if they only use the Z flag.

This recovers a case lost by r344270.

Differential Revision: https://reviews.llvm.org/D53310

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344649 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelDAGToDAG.cpp | 47 ++++++++++++++++++++----------
 test/CodeGen/X86/tbm_patterns.ll   |  6 ++--
 2 files changed, 34 insertions(+), 19 deletions(-)

diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index ede1c0bd7df..c06ad11589d 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -454,7 +454,7 @@ namespace {
     }
 
     bool foldLoadStoreIntoMemOperand(SDNode *Node);
-    bool matchBEXTRFromAndImm(SDNode *Node);
+    MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
     bool matchBEXTR(SDNode *Node);
     bool shrinkAndImmediate(SDNode *N);
     bool isMaskZeroExtended(SDNode *N) const;
@@ -2714,7 +2714,7 @@ bool X86DAGToDAGISel::matchBEXTR(SDNode *Node) {
 }
 
 // See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
-bool X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
+MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
   MVT NVT = Node->getSimpleValueType(0);
   SDLoc dl(Node);
 
@@ -2729,30 +2729,30 @@ bool X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
   // BEXTR?
   if (!Subtarget->hasTBM() &&
       !(Subtarget->hasBMI() && Subtarget->hasFastBEXTR()))
-    return false;
+    return nullptr;
 
   // Must have a shift right.
   if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
-    return false;
+    return nullptr;
 
   // Shift can't have additional users.
   if (!N0->hasOneUse())
-    return false;
+    return nullptr;
 
   // Only supported for 32 and 64 bits.
   if (NVT != MVT::i32 && NVT != MVT::i64)
-    return false;
+    return nullptr;
 
   // Shift amount and RHS of and must be constant.
   ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(N1);
   ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
   if (!MaskCst || !ShiftCst)
-    return false;
+    return nullptr;
 
   // And RHS must be a mask.
   uint64_t Mask = MaskCst->getZExtValue();
   if (!isMask_64(Mask))
-    return false;
+    return nullptr;
 
   uint64_t Shift = ShiftCst->getZExtValue();
   uint64_t MaskSize = countPopulation(Mask);
@@ -2760,12 +2760,12 @@ bool X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
   // Don't interfere with something that can be handled by extracting AH.
   // TODO: If we are able to fold a load, BEXTR might still be better than AH.
   if (Shift == 8 && MaskSize == 8)
-    return false;
+    return nullptr;
 
   // Make sure we are only using bits that were in the original value, not
   // shifted in.
   if (Shift + MaskSize > NVT.getSizeInBits())
-    return false;
+    return nullptr;
 
   SDValue New = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
   unsigned ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
@@ -2794,9 +2794,7 @@ bool X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
     NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, Input, New);
   }
 
-  ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
-  CurDAG->RemoveDeadNode(Node);
-  return true;
+  return NewNode;
 }
 
 // Emit a PCMISTR(I/M) instruction.
@@ -3106,8 +3104,11 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
     break;
 
   case ISD::AND:
-    if (matchBEXTRFromAndImm(Node))
+    if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
+      ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
+      CurDAG->RemoveDeadNode(Node);
       return;
+    }
     if (matchBEXTR(Node))
       return;
     if (AndImmShrink && shrinkAndImmediate(Node))
@@ -3551,6 +3552,22 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
     // Save the original VT of the compare.
     MVT CmpVT = N0.getSimpleValueType();
 
+    // If we are comparing (and (shr X, C, Mask) with 0, emit a BEXTR followed
+    // by a test instruction. The test should be removed later by
+    // analyzeCompare if we are using only the zero flag.
+    // TODO: Should we check the users and use the BEXTR flags directly?
+    if (isNullConstant(N1) && N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
+      if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) {
+        unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
+                                             : X86::TEST32rr;
+        SDValue BEXTR = SDValue(NewNode, 0);
+        NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
+        ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
+        CurDAG->RemoveDeadNode(Node);
+        return;
+      }
+    }
+
     // We can peek through truncates, but we need to be careful below.
     if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
       N0 = N0.getOperand(0);
@@ -3561,7 +3578,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
     if (N0.getOpcode() == ISD::AND &&
         N0.getNode()->hasOneUse() &&
         N0.getValueType() != MVT::i8 &&
-        X86::isZeroNode(N1)) {
+        isNullConstant(N1)) {
       ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
       if (!C) break;
       uint64_t Mask = C->getZExtValue();
diff --git a/test/CodeGen/X86/tbm_patterns.ll b/test/CodeGen/X86/tbm_patterns.ll
index 6865cc5a0ef..2b335ea4268 100644
--- a/test/CodeGen/X86/tbm_patterns.ll
+++ b/test/CodeGen/X86/tbm_patterns.ll
@@ -53,8 +53,7 @@ define i32 @test_x86_tbm_bextri_u32_z2(i32 %a, i32 %b, i32 %c) nounwind {
 ; CHECK-LABEL: test_x86_tbm_bextri_u32_z2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %esi, %eax
-; CHECK-NEXT:    shrl $4, %edi
-; CHECK-NEXT:    testl $4095, %edi # imm = 0xFFF
+; CHECK-NEXT:    bextrl $3076, %edi, %ecx # imm = 0xC04
 ; CHECK-NEXT:    cmovnel %edx, %eax
 ; CHECK-NEXT:    retq
   %t0 = lshr i32 %a, 4
@@ -114,8 +113,7 @@ define i64 @test_x86_tbm_bextri_u64_z2(i64 %a, i64 %b, i64 %c) nounwind {
 ; CHECK-LABEL: test_x86_tbm_bextri_u64_z2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq %rsi, %rax
-; CHECK-NEXT:    shrl $4, %edi
-; CHECK-NEXT:    testl $4095, %edi # imm = 0xFFF
+; CHECK-NEXT:    bextrl $3076, %edi, %ecx # imm = 0xC04
 ; CHECK-NEXT:    cmovneq %rdx, %rax
 ; CHECK-NEXT:    retq
   %t0 = lshr i64 %a, 4
-- 
GitLab


From 97dd9835befe8db3403bb3c7321a842f74e7c1e6 Mon Sep 17 00:00:00 2001
From: Jonathan Metzman <metzman@chromium.org>
Date: Tue, 16 Oct 2018 23:43:57 +0000
Subject: [PATCH 0260/1116] [SanitizerCoverage] Don't duplicate code to get
 section pointers

Summary:
Merge code used to get section start and section end pointers
for SanitizerCoverage constructors. This includes code that handles
getting the start pointers when targeting MSVC.

Reviewers: kcc, morehouse

Reviewed By: morehouse

Subscribers: kcc, hiraditya

Differential Revision: https://reviews.llvm.org/D53211

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344657 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../Instrumentation/SanitizerCoverage.cpp     | 48 ++++++-------------
 1 file changed, 15 insertions(+), 33 deletions(-)

diff --git a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
index b3450728f04..074ae1347f1 100644
--- a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
+++ b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -212,8 +212,8 @@ private:
                              bool IsLeafFunc = true);
   Function *CreateInitCallsForSections(Module &M, const char *InitFunctionName,
                                        Type *Ty, const char *Section);
-  std::pair<GlobalVariable *, GlobalVariable *>
-  CreateSecStartEnd(Module &M, const char *Section, Type *Ty);
+  std::pair<Value *, Value *> CreateSecStartEnd(Module &M, const char *Section,
+                                                Type *Ty);
 
   void SetNoSanitizeMetadata(Instruction *I) {
     I->setMetadata(I->getModule()->getMDKindID("nosanitize"),
@@ -251,7 +251,7 @@ private:
 
 } // namespace
 
-std::pair<GlobalVariable *, GlobalVariable *>
+std::pair<Value *, Value *>
 SanitizerCoverageModule::CreateSecStartEnd(Module &M, const char *Section,
                                            Type *Ty) {
   GlobalVariable *SecStart =
@@ -262,33 +262,28 @@ SanitizerCoverageModule::CreateSecStartEnd(Module &M, const char *Section,
       new GlobalVariable(M, Ty, false, GlobalVariable::ExternalLinkage,
                          nullptr, getSectionEnd(Section));
   SecEnd->setVisibility(GlobalValue::HiddenVisibility);
+  IRBuilder<> IRB(M.getContext());
+  Value *SecEndPtr = IRB.CreatePointerCast(SecEnd, Ty);
+  if (TargetTriple.getObjectFormat() != Triple::COFF)
+    return std::make_pair(IRB.CreatePointerCast(SecStart, Ty), SecEndPtr);
 
-  return std::make_pair(SecStart, SecEnd);
+  // Account for the fact that on windows-msvc __start_* symbols actually
+  // point to a uint64_t before the start of the array.
+  auto SecStartI8Ptr = IRB.CreatePointerCast(SecStart, Int8PtrTy);
+  auto GEP = IRB.CreateGEP(SecStartI8Ptr,
+                           ConstantInt::get(IntptrTy, sizeof(uint64_t)));
+  return std::make_pair(IRB.CreatePointerCast(GEP, Ty), SecEndPtr);
 }
 
-
 Function *SanitizerCoverageModule::CreateInitCallsForSections(
     Module &M, const char *InitFunctionName, Type *Ty,
     const char *Section) {
-  IRBuilder<> IRB(M.getContext());
   auto SecStartEnd = CreateSecStartEnd(M, Section, Ty);
   auto SecStart = SecStartEnd.first;
   auto SecEnd = SecStartEnd.second;
   Function *CtorFunc;
-  Value *SecStartPtr = nullptr;
-  // Account for the fact that on windows-msvc __start_* symbols actually
-  // point to a uint64_t before the start of the array.
-  if (TargetTriple.getObjectFormat() == Triple::COFF) {
-    auto SecStartI8Ptr = IRB.CreatePointerCast(SecStart, Int8PtrTy);
-    auto GEP = IRB.CreateGEP(SecStartI8Ptr,
-                             ConstantInt::get(IntptrTy, sizeof(uint64_t)));
-    SecStartPtr = IRB.CreatePointerCast(GEP, Ty);
-  } else {
-    SecStartPtr = IRB.CreatePointerCast(SecStart, Ty);
-  }
   std::tie(CtorFunc, std::ignore) = createSanitizerCtorAndInitFunctions(
-      M, SanCovModuleCtorName, InitFunctionName, {Ty, Ty},
-      {SecStartPtr, IRB.CreatePointerCast(SecEnd, Ty)});
+      M, SanCovModuleCtorName, InitFunctionName, {Ty, Ty}, {SecStart, SecEnd});
 
   if (TargetTriple.supportsCOMDAT()) {
     // Use comdat to dedup CtorFunc.
@@ -431,20 +426,7 @@ bool SanitizerCoverageModule::runOnModule(Module &M) {
     Function *InitFunction = declareSanitizerInitFunction(
         M, SanCovPCsInitName, {IntptrPtrTy, IntptrPtrTy});
     IRBuilder<> IRBCtor(Ctor->getEntryBlock().getTerminator());
-    Value *SecStartPtr = nullptr;
-    // Account for the fact that on windows-msvc __start_pc_table actually
-    // points to a uint64_t before the start of the PC table.
-    if (TargetTriple.getObjectFormat() == Triple::COFF) {
-      auto SecStartI8Ptr = IRB.CreatePointerCast(SecStartEnd.first, Int8PtrTy);
-      auto GEP = IRB.CreateGEP(SecStartI8Ptr,
-                               ConstantInt::get(IntptrTy, sizeof(uint64_t)));
-      SecStartPtr = IRB.CreatePointerCast(GEP, IntptrPtrTy);
-    } else {
-      SecStartPtr = IRB.CreatePointerCast(SecStartEnd.first, IntptrPtrTy);
-    }
-    IRBCtor.CreateCall(
-        InitFunction,
-        {SecStartPtr, IRB.CreatePointerCast(SecStartEnd.second, IntptrPtrTy)});
+    IRBCtor.CreateCall(InitFunction, {SecStartEnd.first, SecStartEnd.second});
   }
   // We don't reference these arrays directly in any of our runtime functions,
   // so we need to prevent them from being dead stripped.
-- 
GitLab


From 6d3a501fba9820488aa5b1b27f1a9014a09eadb3 Mon Sep 17 00:00:00 2001
From: Teresa Johnson <tejohnson@google.com>
Date: Tue, 16 Oct 2018 23:49:50 +0000
Subject: [PATCH 0261/1116] [ThinLTO] Add importing stats to thin link

Summary:
Previously we could only get the number of imported functions and
variables from the backend. This adds stats to the thin link where the
importing is decided.

Reviewers: wmi

Subscribers: inglorion, dexonsmith, llvm-commits

Differential Revision: https://reviews.llvm.org/D53337

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344658 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/IPO/FunctionImport.cpp         | 32 ++++++++--
 .../FunctionImport/Inputs/import_stats.ll     | 13 ++++
 .../Transforms/FunctionImport/import_stats.ll | 64 +++++++++++++++++++
 3 files changed, 104 insertions(+), 5 deletions(-)
 create mode 100644 test/Transforms/FunctionImport/Inputs/import_stats.ll
 create mode 100644 test/Transforms/FunctionImport/import_stats.ll

diff --git a/lib/Transforms/IPO/FunctionImport.cpp b/lib/Transforms/IPO/FunctionImport.cpp
index 366ac2b95f4..16a3d112b29 100644
--- a/lib/Transforms/IPO/FunctionImport.cpp
+++ b/lib/Transforms/IPO/FunctionImport.cpp
@@ -60,8 +60,17 @@ using namespace llvm;
 
 #define DEBUG_TYPE "function-import"
 
-STATISTIC(NumImportedFunctions, "Number of functions imported");
-STATISTIC(NumImportedGlobalVars, "Number of global variables imported");
+STATISTIC(NumImportedFunctionsThinLink,
+          "Number of functions thin link decided to import");
+STATISTIC(NumImportedHotFunctionsThinLink,
+          "Number of hot functions thin link decided to import");
+STATISTIC(NumImportedCriticalFunctionsThinLink,
+          "Number of critical functions thin link decided to import");
+STATISTIC(NumImportedGlobalVarsThinLink,
+          "Number of global variables thin link decided to import");
+STATISTIC(NumImportedFunctions, "Number of functions imported in backend");
+STATISTIC(NumImportedGlobalVars,
+          "Number of global variables imported in backend");
 STATISTIC(NumImportedModules, "Number of modules imported from");
 STATISTIC(NumDeadSymbols, "Number of dead stripped symbols in index");
 STATISTIC(NumLiveSymbols, "Number of live symbols in index");
@@ -281,7 +290,10 @@ static void computeImportForReferencedGlobals(
           !RefSummary->notEligibleToImport() &&
           !GlobalValue::isInterposableLinkage(RefSummary->linkage()) &&
           RefSummary->refs().empty()) {
-        ImportList[RefSummary->modulePath()].insert(VI.getGUID());
+        auto ILI = ImportList[RefSummary->modulePath()].insert(VI.getGUID());
+        // Only update stat if we haven't already imported this variable.
+        if (ILI.second)
+          NumImportedGlobalVarsThinLink++;
         if (ExportLists)
           (*ExportLists)[RefSummary->modulePath()].insert(VI.getGUID());
         break;
@@ -363,6 +375,11 @@ static void computeImportForFunction(
     auto &CalleeSummary = std::get<1>(IT.first->second);
     auto &FailureInfo = std::get<2>(IT.first->second);
 
+    bool IsHotCallsite =
+        Edge.second.getHotness() == CalleeInfo::HotnessType::Hot;
+    bool IsCriticalCallsite =
+        Edge.second.getHotness() == CalleeInfo::HotnessType::Critical;
+
     const FunctionSummary *ResolvedCalleeSummary = nullptr;
     if (CalleeSummary) {
       assert(PreviouslyVisited);
@@ -434,6 +451,13 @@ static void computeImportForFunction(
       // We previously decided to import this GUID definition if it was already
       // inserted in the set of imports from the exporting module.
       bool PreviouslyImported = !ILI.second;
+      if (!PreviouslyImported) {
+        NumImportedFunctionsThinLink++;
+        if (IsHotCallsite)
+          NumImportedHotFunctionsThinLink++;
+        if (IsCriticalCallsite)
+          NumImportedCriticalFunctionsThinLink++;
+      }
 
       // Make exports in the source module.
       if (ExportLists) {
@@ -467,8 +491,6 @@ static void computeImportForFunction(
       return Threshold * ImportInstrFactor;
     };
 
-    bool IsHotCallsite =
-        Edge.second.getHotness() == CalleeInfo::HotnessType::Hot;
     const auto AdjThreshold = GetAdjustedThreshold(Threshold, IsHotCallsite);
 
     ImportCount++;
diff --git a/test/Transforms/FunctionImport/Inputs/import_stats.ll b/test/Transforms/FunctionImport/Inputs/import_stats.ll
new file mode 100644
index 00000000000..4313883d926
--- /dev/null
+++ b/test/Transforms/FunctionImport/Inputs/import_stats.ll
@@ -0,0 +1,13 @@
+; ModuleID = 'import_stats2.ll'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @hot() {
+  ret void
+}
+define void @critical() {
+  ret void
+}
+define void @none() {
+  ret void
+}
diff --git a/test/Transforms/FunctionImport/import_stats.ll b/test/Transforms/FunctionImport/import_stats.ll
new file mode 100644
index 00000000000..43540856bd3
--- /dev/null
+++ b/test/Transforms/FunctionImport/import_stats.ll
@@ -0,0 +1,64 @@
+; Test to check thin link importing stats
+
+; RUN: opt -module-summary %s -o %t.bc
+; RUN: opt -module-summary %p/Inputs/import_stats.ll -o %t2.bc
+
+; Test thin link stats with both new and old LTO
+; RUN: llvm-lto -thinlto-action=run -stats %t.bc %t2.bc \
+; RUN:		2>&1 | FileCheck %s --check-prefix=THINLINKSTATS
+; RUN: llvm-lto2 run -stats -o %t3 %t.bc %t2.bc \
+; RUN:          -r %t.bc,hot_function,plx \
+; RUN:          -r %t.bc,hot, \
+; RUN:          -r %t.bc,critical, \
+; RUN:          -r %t.bc,none, \
+; RUN:          -r %t2.bc,hot,plx \
+; RUN:          -r %t2.bc,critical,plx \
+; RUN:          -r %t2.bc,none,plx \
+; RUN:          2>&1 | FileCheck %s --check-prefix=THINLINKSTATS
+
+; THINLINKSTATS-DAG: 1 function-import  - Number of critical functions thin link decided to import
+; THINLINKSTATS-DAG: 3 function-import  - Number of functions thin link decided to import
+; THINLINKSTATS-DAG: 1 function-import  - Number of hot functions thin link decided to import
+
+; ModuleID = 'import_stats.ll'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; This function has a high profile count, so entry block is hot.
+define void @hot_function(i1 %a) !prof !20 {
+entry:
+  call void @hot()
+  call void @critical()
+  br i1 %a, label %None1, label %None2, !prof !42
+None1:          ; half goes here
+  call void @none()
+  br label %exit
+None2:          ; half goes here
+  br label %exit
+exit:
+  ret void
+}
+
+declare void @hot()
+declare void @none()
+declare void @critical()
+
+!42 = !{!"branch_weights", i32 1, i32 1}
+
+!llvm.module.flags = !{!1}
+!20 = !{!"function_entry_count", i64 100, i64 696010031887058302}
+
+!1 = !{i32 1, !"ProfileSummary", !2}
+!2 = !{!3, !4, !5, !6, !7, !8, !9, !10}
+!3 = !{!"ProfileFormat", !"InstrProf"}
+!4 = !{!"TotalCount", i64 300}
+!5 = !{!"MaxCount", i64 100}
+!6 = !{!"MaxInternalCount", i64 100}
+!7 = !{!"MaxFunctionCount", i64 100}
+!8 = !{!"NumCounts", i64 4}
+!9 = !{!"NumFunctions", i64 1}
+!10 = !{!"DetailedSummary", !11}
+!11 = !{!12, !13, !14}
+!12 = !{i32 10000, i64 100, i32 1}
+!13 = !{i32 999000, i64 100, i32 1}
+!14 = !{i32 999999, i64 1, i32 4}
-- 
GitLab


From ef4467e35f5ca0444ea8b48c3bbb525fbc9ea0d5 Mon Sep 17 00:00:00 2001
From: Leonard Chan <leonardchan@google.com>
Date: Wed, 17 Oct 2018 00:16:07 +0000
Subject: [PATCH 0262/1116] [Sanitizer][PassManager] Fix for failing ASan tests
 on arm-linux-gnueabihf

Forgot to initialize the legacy pass in it's constructor.

Differential Revision: https://reviews.llvm.org/D53350

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344659 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Instrumentation/AddressSanitizer.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index ad07b608934..dcbaf7a62f2 100644
--- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -1056,7 +1056,9 @@ public:
                                       bool Recover = false,
                                       bool UseAfterScope = false)
       : FunctionPass(ID), CompileKernel(CompileKernel), Recover(Recover),
-        UseAfterScope(UseAfterScope) {}
+        UseAfterScope(UseAfterScope) {
+    initializeAddressSanitizerLegacyPassPass(*PassRegistry::getPassRegistry());
+  }
 
   StringRef getPassName() const override {
     return "AddressSanitizerFunctionPass";
-- 
GitLab


From 125615443086943d62648a64ea6b3a3c47533745 Mon Sep 17 00:00:00 2001
From: Teresa Johnson <tejohnson@google.com>
Date: Wed, 17 Oct 2018 00:19:21 +0000
Subject: [PATCH 0263/1116] [ThinLTO] Fix test to require asserts

New test added in r344658 requires asserts due to -stats.

While here, augment it to test new global variable importing
message as well.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344660 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Transforms/FunctionImport/Inputs/import_stats.ll | 3 +++
 test/Transforms/FunctionImport/import_stats.ll        | 5 +++++
 2 files changed, 8 insertions(+)

diff --git a/test/Transforms/FunctionImport/Inputs/import_stats.ll b/test/Transforms/FunctionImport/Inputs/import_stats.ll
index 4313883d926..818fbf20d6f 100644
--- a/test/Transforms/FunctionImport/Inputs/import_stats.ll
+++ b/test/Transforms/FunctionImport/Inputs/import_stats.ll
@@ -2,7 +2,10 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
+@globalvar = global i32 1, align 4
+
 define void @hot() {
+  store i32 0, i32* @globalvar, align 4
   ret void
 }
 define void @critical() {
diff --git a/test/Transforms/FunctionImport/import_stats.ll b/test/Transforms/FunctionImport/import_stats.ll
index 43540856bd3..03f5f8f0198 100644
--- a/test/Transforms/FunctionImport/import_stats.ll
+++ b/test/Transforms/FunctionImport/import_stats.ll
@@ -1,5 +1,8 @@
 ; Test to check thin link importing stats
 
+; -stats requires asserts
+; REQUIRES: asserts
+
 ; RUN: opt -module-summary %s -o %t.bc
 ; RUN: opt -module-summary %p/Inputs/import_stats.ll -o %t2.bc
 
@@ -14,8 +17,10 @@
 ; RUN:          -r %t2.bc,hot,plx \
 ; RUN:          -r %t2.bc,critical,plx \
 ; RUN:          -r %t2.bc,none,plx \
+; RUN:          -r %t2.bc,globalvar,plx \
 ; RUN:          2>&1 | FileCheck %s --check-prefix=THINLINKSTATS
 
+; THINLINKSTATS-DAG: 1 function-import   - Number of global variables thin link decided to import
 ; THINLINKSTATS-DAG: 1 function-import  - Number of critical functions thin link decided to import
 ; THINLINKSTATS-DAG: 3 function-import  - Number of functions thin link decided to import
 ; THINLINKSTATS-DAG: 1 function-import  - Number of hot functions thin link decided to import
-- 
GitLab


From ea56a5932c312b9064e5e6b1f6f189b570964b99 Mon Sep 17 00:00:00 2001
From: Teresa Johnson <tejohnson@google.com>
Date: Wed, 17 Oct 2018 00:59:14 +0000
Subject: [PATCH 0264/1116] New test requires x86-registered-target

New test added in r344658 also requires x86-registered-target.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344662 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Transforms/FunctionImport/import_stats.ll | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test/Transforms/FunctionImport/import_stats.ll b/test/Transforms/FunctionImport/import_stats.ll
index 03f5f8f0198..2cb415d1e96 100644
--- a/test/Transforms/FunctionImport/import_stats.ll
+++ b/test/Transforms/FunctionImport/import_stats.ll
@@ -3,6 +3,8 @@
 ; -stats requires asserts
 ; REQUIRES: asserts
 
+; REQUIRES: x86-registered-target
+
 ; RUN: opt -module-summary %s -o %t.bc
 ; RUN: opt -module-summary %p/Inputs/import_stats.ll -o %t2.bc
 
-- 
GitLab


From 73405ef1630bd1a2b1c34c9f0cc63cb5847fcf09 Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Wed, 17 Oct 2018 03:34:09 +0000
Subject: [PATCH 0265/1116] [BuildingAJIT] Update chapter 1 to use the ORCv2
 APIs.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344667 91177308-0d34-0410-b5e6-96231b3b80d8
---
 docs/tutorial/BuildingAJIT1.rst               | 395 +++++++-----------
 .../BuildingAJIT/Chapter1/KaleidoscopeJIT.h   |  87 ++--
 .../BuildingAJIT/Chapter1/toy.cpp             | 154 ++++---
 3 files changed, 272 insertions(+), 364 deletions(-)

diff --git a/docs/tutorial/BuildingAJIT1.rst b/docs/tutorial/BuildingAJIT1.rst
index 2b83df42fc2..f1e93bf12b3 100644
--- a/docs/tutorial/BuildingAJIT1.rst
+++ b/docs/tutorial/BuildingAJIT1.rst
@@ -8,18 +8,19 @@ Building a JIT: Starting out with KaleidoscopeJIT
 Chapter 1 Introduction
 ======================
 
-**Warning: This text is currently out of date due to ORC API updates.**
+**Warning: This tutorial is currently being updated to account for ORC API
+changes. Only Chapter 1 is up-to-date.**
 
-**The example code has been updated and can be used. The text will be updated
-once the API churn dies down.**
+**Example code from Chapters 2 to 4 will compile and run, but has not been
+updated**
 
 Welcome to Chapter 1 of the "Building an ORC-based JIT in LLVM" tutorial. This
 tutorial runs through the implementation of a JIT compiler using LLVM's
 On-Request-Compilation (ORC) APIs. It begins with a simplified version of the
 KaleidoscopeJIT class used in the
 `Implementing a language with LLVM <LangImpl01.html>`_ tutorials and then
-introduces new features like optimization, lazy compilation and remote
-execution.
+introduces new features like concurrent compilation, optimization, lazy
+compilation and remote execution.
 
 The goal of this tutorial is to introduce you to LLVM's ORC JIT APIs, show how
 these APIs interact with other parts of LLVM, and to teach you how to recombine
@@ -45,11 +46,9 @@ The structure of the tutorial is:
 - `Chapter #5 <BuildingAJIT5.html>`_: Add process isolation by JITing code into
   a remote process with reduced privileges using the JIT Remote APIs.
 
-To provide input for our JIT we will use the Kaleidoscope REPL from
-`Chapter 7 <LangImpl07.html>`_ of the "Implementing a language in LLVM tutorial",
-with one minor modification: We will remove the FunctionPassManager from the
-code for that chapter and replace it with optimization support in our JIT class
-in Chapter #2.
+To provide input for our JIT we will use a lightly modified version of the
+Kaleidoscope REPL from `Chapter 7 <LangImpl07.html>`_ of the "Implementing a
+language in LLVM tutorial".
 
 Finally, a word on API generations: ORC is the 3rd generation of LLVM JIT API.
 It was preceded by MCJIT, and before that by the (now deleted) legacy JIT.
@@ -63,14 +62,13 @@ JIT API Basics
 
 The purpose of a JIT compiler is to compile code "on-the-fly" as it is needed,
 rather than compiling whole programs to disk ahead of time as a traditional
-compiler does. To support that aim our initial, bare-bones JIT API will be:
+compiler does. To support that aim our initial, bare-bones JIT API will have
+just two functions:
 
 1. Handle addModule(Module &M) -- Make the given IR module available for
    execution.
-2. JITSymbol findSymbol(const std::string &Name) -- Search for pointers to
+2. Expected<JITSymbol> lookup() -- Search for pointers to
    symbols (functions or variables) that have been added to the JIT.
-3. void removeModule(Handle H) -- Remove a module from the JIT, releasing any
-   memory that had been used for the compiled code.
 
 A basic use-case for this API, executing the 'main' function from a module,
 will look like:
@@ -79,16 +77,15 @@ will look like:
 
   std::unique_ptr<Module> M = buildModule();
   JIT J;
-  Handle H = J.addModule(*M);
-  int (*Main)(int, char*[]) = (int(*)(int, char*[]))J.getSymbolAddress("main");
+  J.addModule(*M);
+  auto *Main = (int(*)(int, char*[]))J.lookup("main");.getAddress();
   int Result = Main();
-  J.removeModule(H);
 
 The APIs that we build in these tutorials will all be variations on this simple
-theme. Behind the API we will refine the implementation of the JIT to add
-support for optimization and lazy compilation. Eventually we will extend the
-API itself to allow higher-level program representations (e.g. ASTs) to be
-added to the JIT.
+theme. Behind this API we will refine the implementation of the JIT to add
+support for concurrent compilation, optimization and lazy compilation.
+Eventually we will extend the API itself to allow higher-level program
+representations (e.g. ASTs) to be added to the JIT.
 
 KaleidoscopeJIT
 ===============
@@ -100,12 +97,10 @@ the REPL code from `Chapter 7 <LangImpl07.html>`_ of that tutorial to supply the
 input for our JIT: Each time the user enters an expression the REPL will add a
 new IR module containing the code for that expression to the JIT. If the
 expression is a top-level expression like '1+1' or 'sin(x)', the REPL will also
-use the findSymbol method of our JIT class find and execute the code for the
-expression, and then use the removeModule method to remove the code again
-(since there's no way to re-invoke an anonymous expression). In later chapters
-of this tutorial we'll modify the REPL to enable new interactions with our JIT
-class, but for now we will take this setup for granted and focus our attention on
-the implementation of our JIT itself.
+use the lookup method of our JIT class find and execute the code for the
+expression. In later chapters of this tutorial we will modify the REPL to enable
+new interactions with our JIT class, but for now we will take this setup for
+granted and focus our attention on the implementation of our JIT itself.
 
 Our KaleidoscopeJIT class is defined in the KaleidoscopeJIT.h header. After the
 usual include guards and #includes [2]_, we get to the definition of our class:
@@ -115,216 +110,154 @@ usual include guards and #includes [2]_, we get to the definition of our class:
   #ifndef LLVM_EXECUTIONENGINE_ORC_KALEIDOSCOPEJIT_H
   #define LLVM_EXECUTIONENGINE_ORC_KALEIDOSCOPEJIT_H
 
-  #include "llvm/ADT/STLExtras.h"
-  #include "llvm/ExecutionEngine/ExecutionEngine.h"
+  #include "llvm/ADT/StringRef.h"
   #include "llvm/ExecutionEngine/JITSymbol.h"
-  #include "llvm/ExecutionEngine/RTDyldMemoryManager.h"
-  #include "llvm/ExecutionEngine/SectionMemoryManager.h"
   #include "llvm/ExecutionEngine/Orc/CompileUtils.h"
+  #include "llvm/ExecutionEngine/Orc/Core.h"
+  #include "llvm/ExecutionEngine/Orc/ExecutionUtils.h"
   #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
-  #include "llvm/ExecutionEngine/Orc/LambdaResolver.h"
+  #include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h"
   #include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
+  #include "llvm/ExecutionEngine/SectionMemoryManager.h"
   #include "llvm/IR/DataLayout.h"
-  #include "llvm/IR/Mangler.h"
-  #include "llvm/Support/DynamicLibrary.h"
-  #include "llvm/Support/raw_ostream.h"
-  #include "llvm/Target/TargetMachine.h"
-  #include <algorithm>
+  #include "llvm/IR/LLVMContext.h"
   #include <memory>
-  #include <string>
-  #include <vector>
 
   namespace llvm {
   namespace orc {
 
   class KaleidoscopeJIT {
   private:
-    std::unique_ptr<TargetMachine> TM;
-    const DataLayout DL;
-    RTDyldObjectLinkingLayer ObjectLayer;
-    IRCompileLayer<decltype(ObjectLayer), SimpleCompiler> CompileLayer;
 
-  public:
-    using ModuleHandle = decltype(CompileLayer)::ModuleHandleT;
-
-Our class begins with four members: A TargetMachine, TM, which will be used to
-build our LLVM compiler instance; A DataLayout, DL, which will be used for
-symbol mangling (more on that later), and two ORC *layers*: an
-RTDyldObjectLinkingLayer and a CompileLayer. We'll be talking more about layers
-in the next chapter, but for now you can think of them as analogous to LLVM
-Passes: they wrap up useful JIT utilities behind an easy to compose interface.
-The first layer, ObjectLayer, is the foundation of our JIT: it takes in-memory
-object files produced by a compiler and links them on the fly to make them
-executable. This JIT-on-top-of-a-linker design was introduced in MCJIT, however
-the linker was hidden inside the MCJIT class. In ORC we expose the linker so
-that clients can access and configure it directly if they need to. In this
-tutorial our ObjectLayer will just be used to support the next layer in our
-stack: the CompileLayer, which will be responsible for taking LLVM IR, compiling
-it, and passing the resulting in-memory object files down to the object linking
-layer below.
-
-That's it for member variables, after that we have a single typedef:
-ModuleHandle. This is the handle type that will be returned from our JIT's
-addModule method, and can be passed to the removeModule method to remove a
-module. The IRCompileLayer class already provides a convenient handle type
-(IRCompileLayer::ModuleHandleT), so we just alias our ModuleHandle to this.
+    ExecutionSession ES;
+    RTDyldObjectLinkingLayer ObjectLayer{ES, getMemoryMgr};
+    IRCompileLayer CompileLayer{ES, ObjectLayer,
+                                ConcurrentIRCompiler(getJTMB())};
+    DataLayout DL{cantFail(getJTMB().getDefaultDataLayoutForTarget())};
+    MangleAndInterner Mangle{ES, DL};
+    ThreadSafeContext Ctx{llvm::make_unique<LLVMContext>()};
+
+    static JITTargetMachineBuilder getJTMB() {
+      return cantFail(JITTargetMachineBuilder::detectHost());
+    }
+
+    static std::unique_ptr<SectionMemoryManager> getMemoryMgr(VModuleKey) {
+      return llvm::make_unique<SectionMemoryManager>();
+    }
+
+We begin with the ExecutionSession member, ``ES``, which provides context for
+our running JIT'd code. It holds the string pool for symbol names, the global
+mutex that guards the critical sections of JIT operations, error logging
+facilities, and other utilities. For basic use cases such as this, a default
+constructed ExecutionSession is all we will need. We will investigate more
+advanced uses of ExecutionSession in later chapters. Following our
+ExecutionSession we have two ORC *layers*: an RTDyldObjectLinkingLayer and an
+IRCompileLayer. We will be talking more about layers in the next chapter, but
+for now you can think of them as analogous to LLVM Passes: they wrap up useful
+JIT utilities behind an easy to compose interface. The first layer, ObjectLayer,
+is the foundation of our JIT: it takes in-memory object files produced by a
+compiler and links them on the fly to make them executable. This
+JIT-on-top-of-a-linker design was introduced in MCJIT, however the linker was
+hidden inside the MCJIT class. In ORC we expose the linker so that clients can
+access and configure it directly if they need to. In this tutorial our
+ObjectLayer will just be used to support the next layer in our stack: the
+CompileLayer, which will be responsible for taking LLVM IR, compiling it, and
+passing the resulting in-memory object files down to the object linking layer
+below. Our ObjectLayer is constructed with a reference to the ExecutionSession
+and the getMemoryMgr utility function, which it uses to generate a new memory
+manager for each object file as it is added. Next up is our CompileLayer, which
+is initialized with a reference to the ExecutionSession, a reference to the
+ObjectLayer (where it will send the objects produced by the compiler), and an IR
+compiler instance. In this case we are using the ConcurrentIRCompiler class
+which is constructed with a JITTargetMachineBuilder and can be called to compile
+IR concurrently from several threads (though in this chapter we will only use
+one).
+
+Following the ExecutionSession and layers we have three supporting member
+variables. The DataLayout, ``DL``; and MangleAndInterner, ``Mangle`` members are
+used to support portable lookups based on IR symbol names (more on that when we
+get to our ``lookup`` function below), and the ThreadSafeContext member,
+``Ctx``, manages an LLVMContext that can be used while building IR Modules for
+the JIT.
+
+After that, we have two static utility functions. The ``getJTMB()`` function
+returns a JITTargetMachineBuilder, which is a factory for building LLVM
+TargetMachine instances that are used by the compiler. In this first tutorial we
+will only need one (implicitly created) TargetMachine, but in future tutorials
+that enable concurrent compilation we will need one per thread. This is why we
+use a target machine builder, rather than a single TargetMachine. (note: Older
+LLVM JIT APIs that did not support concurrent compilation were constructed with
+a single TargetMachines). The ``getMemoryMgr()`` function constructs instances
+of RuntimeDyld::MemoryManager, and is used by the linking layer to generate a
+new memory manager for each object file.
 
 .. code-block:: c++
 
-  KaleidoscopeJIT()
-      : TM(EngineBuilder().selectTarget()), DL(TM->createDataLayout()),
-        ObjectLayer([]() { return std::make_shared<SectionMemoryManager>(); }),
-        CompileLayer(ObjectLayer, SimpleCompiler(*TM)) {
-    llvm::sys::DynamicLibrary::LoadLibraryPermanently(nullptr);
-  }
+  public:
 
-  TargetMachine &getTargetMachine() { return *TM; }
-
-Next up we have our class constructor. We begin by initializing TM using the
-EngineBuilder::selectTarget helper method which constructs a TargetMachine for
-the current process. Then we use our newly created TargetMachine to initialize
-DL, our DataLayout. After that we need to initialize our ObjectLayer. The
-ObjectLayer requires a function object that will build a JIT memory manager for
-each module that is added (a JIT memory manager manages memory allocations,
-memory permissions, and registration of exception handlers for JIT'd code). For
-this we use a lambda that returns a SectionMemoryManager, an off-the-shelf
-utility that provides all the basic memory management functionality required for
-this chapter. Next we initialize our CompileLayer. The CompileLayer needs two
-things: (1) A reference to our object layer, and (2) a compiler instance to use
-to perform the actual compilation from IR to object files. We use the
-off-the-shelf SimpleCompiler instance for now. Finally, in the body of the
-constructor, we call the DynamicLibrary::LoadLibraryPermanently method with a
-nullptr argument. Normally the LoadLibraryPermanently method is called with the
-path of a dynamic library to load, but when passed a null pointer it will 'load'
-the host process itself, making its exported symbols available for execution.
+    KaleidoscopeJIT() {
+      ES.getMainJITDylib().setGenerator(
+        cantFail(DynamicLibrarySearchGenerator::GetForCurrentProcess(DL)));
+    }
 
-.. code-block:: c++
+    const DataLayout &getDataLayout() const { return DL; }
 
-  ModuleHandle addModule(std::unique_ptr<Module> M) {
-    // Build our symbol resolver:
-    // Lambda 1: Look back into the JIT itself to find symbols that are part of
-    //           the same "logical dylib".
-    // Lambda 2: Search for external symbols in the host process.
-    auto Resolver = createLambdaResolver(
-        [&](const std::string &Name) {
-          if (auto Sym = CompileLayer.findSymbol(Name, false))
-            return Sym;
-          return JITSymbol(nullptr);
-        },
-        [](const std::string &Name) {
-          if (auto SymAddr =
-                RTDyldMemoryManager::getSymbolAddressInProcess(Name))
-            return JITSymbol(SymAddr, JITSymbolFlags::Exported);
-          return JITSymbol(nullptr);
-        });
-
-    // Add the set to the JIT with the resolver we created above and a newly
-    // created SectionMemoryManager.
-    return cantFail(CompileLayer.addModule(std::move(M),
-                                           std::move(Resolver)));
-  }
+    LLVMContext &getContext() { return *Ctx.getContext(); }
 
-Now we come to the first of our JIT API methods: addModule. This method is
-responsible for adding IR to the JIT and making it available for execution. In
-this initial implementation of our JIT we will make our modules "available for
-execution" by adding them straight to the CompileLayer, which will immediately
-compile them. In later chapters we will teach our JIT to defer compilation
-of individual functions until they're actually called.
-
-To add our module to the CompileLayer we need to supply both the module and a
-symbol resolver. The symbol resolver is responsible for supplying the JIT with
-an address for each *external symbol* in the module we are adding. External
-symbols are any symbol not defined within the module itself, including calls to
-functions outside the JIT and calls to functions defined in other modules that
-have already been added to the JIT. (It may seem as though modules added to the
-JIT should know about one another by default, but since we would still have to
-supply a symbol resolver for references to code outside the JIT it turns out to
-be easier to re-use this one mechanism for all symbol resolution.) This has the
-added benefit that the user has full control over the symbol resolution
-process. Should we search for definitions within the JIT first, then fall back
-on external definitions? Or should we prefer external definitions where
-available and only JIT code if we don't already have an available
-implementation? By using a single symbol resolution scheme we are free to choose
-whatever makes the most sense for any given use case.
-
-Building a symbol resolver is made especially easy by the *createLambdaResolver*
-function. This function takes two lambdas [3]_ and returns a JITSymbolResolver
-instance. The first lambda is used as the implementation of the resolver's
-findSymbolInLogicalDylib method, which searches for symbol definitions that
-should be thought of as being part of the same "logical" dynamic library as this
-Module. If you are familiar with static linking: this means that
-findSymbolInLogicalDylib should expose symbols with common linkage and hidden
-visibility. If all this sounds foreign you can ignore the details and just
-remember that this is the first method that the linker will use to try to find a
-symbol definition. If the findSymbolInLogicalDylib method returns a null result
-then the linker will call the second symbol resolver method, called findSymbol,
-which searches for symbols that should be thought of as external to (but
-visibile from) the module and its logical dylib. In this tutorial we will adopt
-the following simple scheme: All modules added to the JIT will behave as if they
-were linked into a single, ever-growing logical dylib. To implement this our
-first lambda (the one defining findSymbolInLogicalDylib) will just search for
-JIT'd code by calling the CompileLayer's findSymbol method. If we don't find a
-symbol in the JIT itself we'll fall back to our second lambda, which implements
-findSymbol. This will use the RTDyldMemoryManager::getSymbolAddressInProcess
-method to search for the symbol within the program itself. If we can't find a
-symbol definition via either of these paths, the JIT will refuse to accept our
-module, returning a "symbol not found" error.
-
-Now that we've built our symbol resolver, we're ready to add our module to the
-JIT. We do this by calling the CompileLayer's addModule method. The addModule
-method returns an ``Expected<CompileLayer::ModuleHandle>``, since in more
-advanced JIT configurations it could fail. In our basic configuration we know
-that it will always succeed so we use the cantFail utility to assert that no
-error occurred, and extract the handle value. Since we have already typedef'd
-our ModuleHandle type to be the same as the CompileLayer's handle type, we can
-return the unwrapped handle directly.
+Next up we have our class constructor. Our members have already been
+initialized, so the one thing that remains to do is to tweak the configuration
+of the *JITDylib* that we will store our code in. We want to modify this dylib
+to contain not only the symbols that we add to it, but also the symbols from
+our REPL process as well. We do this by attaching a
+``DynamicLibrarySearchGenerator`` instance using the
+``DynamicLibrarySearchGenerator::GetForCurrentProcess`` method.
 
-.. code-block:: c++
+Following the constructor we have the ``getDataLayout()`` and ``getContext()``
+methods. These are used to make data structures created and managed by the JIT
+(especially the LLVMContext) available to the REPL code that will build our
+IR modules.
 
-  JITSymbol findSymbol(const std::string Name) {
-    std::string MangledName;
-    raw_string_ostream MangledNameStream(MangledName);
-    Mangler::getNameWithPrefix(MangledNameStream, Name, DL);
-    return CompileLayer.findSymbol(MangledNameStream.str(), true);
-  }
+.. code-block:: c++
 
-  JITTargetAddress getSymbolAddress(const std::string Name) {
-    return cantFail(findSymbol(Name).getAddress());
+  void addModule(std::unique_ptr<Module> M) {
+    cantFail(CompileLayer.add(ES.getMainJITDylib(),
+                              ThreadSafeModule(std::move(M), Ctx)));
   }
 
-  void removeModule(ModuleHandle H) {
-    cantFail(CompileLayer.removeModule(H));
+  Expected<JITEvaluatedSymbol> lookup(StringRef Name) {
+    return ES.lookup({&ES.getMainJITDylib()}, Mangle(Name.str()));
   }
 
-Now that we can add code to our JIT, we need a way to find the symbols we've
-added to it. To do that we call the findSymbol method on our CompileLayer, but
-with a twist: We have to *mangle* the name of the symbol we're searching for
-first. The ORC JIT components use mangled symbols internally the same way a
-static compiler and linker would, rather than using plain IR symbol names. This
-allows JIT'd code to interoperate easily with precompiled code in the
-application or shared libraries. The kind of mangling will depend on the
-DataLayout, which in turn depends on the target platform. To allow us to remain
-portable and search based on the un-mangled name, we just re-produce this
-mangling ourselves.
-
-Next we have a convenience function, getSymbolAddress, which returns the address
-of a given symbol. Like CompileLayer's addModule function, JITSymbol's getAddress
-function is allowed to fail [4]_, however we know that it will not in our simple
-example, so we wrap it in a call to cantFail.
-
-We now come to the last method in our JIT API: removeModule. This method is
-responsible for destructing the MemoryManager and SymbolResolver that were
-added with a given module, freeing any resources they were using in the
-process. In our Kaleidoscope demo we rely on this method to remove the module
-representing the most recent top-level expression, preventing it from being
-treated as a duplicate definition when the next top-level expression is
-entered. It is generally good to free any module that you know you won't need
-to call further, just to free up the resources dedicated to it. However, you
-don't strictly need to do this: All resources will be cleaned up when your
-JIT class is destructed, if they haven't been freed before then. Like
-``CompileLayer::addModule`` and ``JITSymbol::getAddress``, removeModule may
-fail in general but will never fail in our example, so we wrap it in a call to
-cantFail.
+Now we come to the first of our JIT API methods: addModule. This method is
+responsible for adding IR to the JIT and making it available for execution. In
+this initial implementation of our JIT we will make our modules "available for
+execution" by adding them to the CompileLayer, which will it turn store the
+Module in the main JITDylib. This process will create new symbol table entries
+in the JITDylib for each definition in the module, and will defer compilation of
+the module until any of its definitions is looked up. Note that this is not lazy
+compilation: just referencing a definition, even if it is never used, will be
+enough to trigger compilation. In later chapters we will teach our JIT to defer
+compilation of functions until they're actually called.  To add our Module we
+must first wrap it in a ThreadSafeModule instance, which manages the lifetime of
+the Module's LLVMContext (our Ctx member) in a thread-friendly way. In our
+example, all modules will share the Ctx member, which will exist for the
+duration of the JIT. Once we switch to concurrent compilation in later chapters
+we will use a new context per module.
+
+Our last method is ``lookup``, which allows us to look up addresses for
+function and variable definitions added to the JIT based on their symbol names.
+As noted above, lookup will implicitly trigger compilation for any symbol
+that has not already been compiled. Our lookup method calls through to
+`ExecutionSession::lookup`, passing in a list of dylibs to search (in our case
+just the main dylib), and the symbol name to search for, with a twist: We have
+to *mangle* the name of the symbol we're searching for first. The ORC JIT
+components use mangled symbols internally the same way a static compiler and
+linker would, rather than using plain IR symbol names. This allows JIT'd code
+to interoperate easily with precompiled code in the application or shared
+libraries. The kind of mangling will depend on the DataLayout, which in turn
+depends on the target platform. To allow us to remain portable and search based
+on the un-mangled name, we just re-produce this mangling ourselves using our
+``Mangle`` member function object.
 
 This brings us to the end of Chapter 1 of Building a JIT. You now have a basic
 but fully functioning JIT stack that you can use to take LLVM IR and make it
@@ -362,42 +295,26 @@ Here is the code:
 .. [2] +-----------------------------+-----------------------------------------------+
        |         File                |               Reason for inclusion            |
        +=============================+===============================================+
-       |      STLExtras.h            | LLVM utilities that are useful when working   |
-       |                             | with the STL.                                 |
-       +-----------------------------+-----------------------------------------------+
-       |   ExecutionEngine.h         | Access to the EngineBuilder::selectTarget     |
-       |                             | method.                                       |
+       |        JITSymbol.h          | Defines the lookup result type                |
+       |                             | JITEvaluatedSymbol                            |
        +-----------------------------+-----------------------------------------------+
-       |                             | Access to the                                 |
-       | RTDyldMemoryManager.h       | RTDyldMemoryManager::getSymbolAddressInProcess|
-       |                             | method.                                       |
+       |       CompileUtils.h        | Provides the SimpleCompiler class.            |
        +-----------------------------+-----------------------------------------------+
-       |    CompileUtils.h           | Provides the SimpleCompiler class.            |
+       |           Core.h            | Core utilities such as ExecutionSession and   |
+       |                             | JITDylib.                                     |
        +-----------------------------+-----------------------------------------------+
-       |   IRCompileLayer.h          | Provides the IRCompileLayer class.            |
+       |      ExecutionUtils.h       | Provides the DynamicLibrarySearchGenerator    |
+       |                             | class.                                        |
        +-----------------------------+-----------------------------------------------+
-       |                             | Access the createLambdaResolver function,     |
-       |   LambdaResolver.h          | which provides easy construction of symbol    |
-       |                             | resolvers.                                    |
+       |      IRCompileLayer.h       | Provides the IRCompileLayer class.            |
        +-----------------------------+-----------------------------------------------+
-       |  RTDyldObjectLinkingLayer.h | Provides the RTDyldObjectLinkingLayer class.  |
+       |  JITTargetMachineBuilder.h  | Provides the JITTargetMachineBuilder class.   |
        +-----------------------------+-----------------------------------------------+
-       |       Mangler.h             | Provides the Mangler class for platform       |
-       |                             | specific name-mangling.                       |
+       | RTDyldObjectLinkingLayer.h  | Provides the RTDyldObjectLinkingLayer class.  |
        +-----------------------------+-----------------------------------------------+
-       |   DynamicLibrary.h          | Provides the DynamicLibrary class, which      |
-       |                             | makes symbols in the host process searchable. |
+       |   SectionMemoryManager.h    | Provides the SectionMemoryManager class.      |
        +-----------------------------+-----------------------------------------------+
-       |                             | A fast output stream class. We use the        |
-       |     raw_ostream.h           | raw_string_ostream subclass for symbol        |
-       |                             | mangling                                      |
+       |        DataLayout.h         | Provides the DataLayout class.                |
        +-----------------------------+-----------------------------------------------+
-       |   TargetMachine.h           | LLVM target machine description class.        |
+       |        LLVMContext.h        | Provides the LLVMContext class.               |
        +-----------------------------+-----------------------------------------------+
-
-.. [3] Actually they don't have to be lambdas, any object with a call operator
-       will do, including plain old functions or std::functions.
-
-.. [4] ``JITSymbol::getAddress`` will force the JIT to compile the definition of
-       the symbol if it hasn't already been compiled, and since the compilation
-       process could fail getAddress must be able to return this failure.
diff --git a/examples/Kaleidoscope/BuildingAJIT/Chapter1/KaleidoscopeJIT.h b/examples/Kaleidoscope/BuildingAJIT/Chapter1/KaleidoscopeJIT.h
index 8c1af40be15..d9e320f5478 100644
--- a/examples/Kaleidoscope/BuildingAJIT/Chapter1/KaleidoscopeJIT.h
+++ b/examples/Kaleidoscope/BuildingAJIT/Chapter1/KaleidoscopeJIT.h
@@ -14,84 +14,59 @@
 #ifndef LLVM_EXECUTIONENGINE_ORC_KALEIDOSCOPEJIT_H
 #define LLVM_EXECUTIONENGINE_ORC_KALEIDOSCOPEJIT_H
 
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/ExecutionEngine/JITSymbol.h"
-#include "llvm/ExecutionEngine/RTDyldMemoryManager.h"
-#include "llvm/ExecutionEngine/SectionMemoryManager.h"
 #include "llvm/ExecutionEngine/Orc/CompileUtils.h"
+#include "llvm/ExecutionEngine/Orc/Core.h"
+#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h"
 #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
-#include "llvm/ExecutionEngine/Orc/LambdaResolver.h"
+#include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h"
 #include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
+#include "llvm/ExecutionEngine/SectionMemoryManager.h"
 #include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Mangler.h"
-#include "llvm/Support/DynamicLibrary.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
-#include <algorithm>
+#include "llvm/IR/LLVMContext.h"
 #include <memory>
-#include <string>
-#include <vector>
 
 namespace llvm {
 namespace orc {
 
 class KaleidoscopeJIT {
 private:
+
   ExecutionSession ES;
-  std::shared_ptr<SymbolResolver> Resolver;
-  std::unique_ptr<TargetMachine> TM;
-  const DataLayout DL;
-  LegacyRTDyldObjectLinkingLayer ObjectLayer;
-  LegacyIRCompileLayer<decltype(ObjectLayer), SimpleCompiler> CompileLayer;
+  RTDyldObjectLinkingLayer ObjectLayer{ES, getMemoryMgr};
+  IRCompileLayer CompileLayer{ES, ObjectLayer,
+                              ConcurrentIRCompiler(getJTMB())};
+  DataLayout DL{cantFail(getJTMB().getDefaultDataLayoutForTarget())};
+  MangleAndInterner Mangle{ES, DL};
+  ThreadSafeContext Ctx{llvm::make_unique<LLVMContext>()};
 
-public:
-  KaleidoscopeJIT()
-      : Resolver(createLegacyLookupResolver(
-            ES,
-            [this](const std::string &Name) -> JITSymbol {
-              if (auto Sym = CompileLayer.findSymbol(Name, false))
-                return Sym;
-              else if (auto Err = Sym.takeError())
-                return std::move(Err);
-              if (auto SymAddr =
-                      RTDyldMemoryManager::getSymbolAddressInProcess(Name))
-                return JITSymbol(SymAddr, JITSymbolFlags::Exported);
-              return nullptr;
-            },
-            [](Error Err) { cantFail(std::move(Err), "lookupFlags failed"); })),
-        TM(EngineBuilder().selectTarget()), DL(TM->createDataLayout()),
-        ObjectLayer(ES,
-                    [this](VModuleKey) {
-                      return LegacyRTDyldObjectLinkingLayer::Resources{
-                          std::make_shared<SectionMemoryManager>(), Resolver};
-                    }),
-        CompileLayer(ObjectLayer, SimpleCompiler(*TM)) {
-    llvm::sys::DynamicLibrary::LoadLibraryPermanently(nullptr);
+  static JITTargetMachineBuilder getJTMB() {
+    return cantFail(JITTargetMachineBuilder::detectHost());
   }
 
-  TargetMachine &getTargetMachine() { return *TM; }
-
-  VModuleKey addModule(std::unique_ptr<Module> M) {
-    // Add the module to the JIT with a new VModuleKey.
-    auto K = ES.allocateVModule();
-    cantFail(CompileLayer.addModule(K, std::move(M)));
-    return K;
+  static std::unique_ptr<SectionMemoryManager> getMemoryMgr() {
+    return llvm::make_unique<SectionMemoryManager>();
   }
 
-  JITSymbol findSymbol(const std::string Name) {
-    std::string MangledName;
-    raw_string_ostream MangledNameStream(MangledName);
-    Mangler::getNameWithPrefix(MangledNameStream, Name, DL);
-    return CompileLayer.findSymbol(MangledNameStream.str(), true);
+public:
+
+  KaleidoscopeJIT() {
+    ES.getMainJITDylib().setGenerator(
+      cantFail(DynamicLibrarySearchGenerator::GetForCurrentProcess(DL)));
   }
 
-  JITTargetAddress getSymbolAddress(const std::string Name) {
-    return cantFail(findSymbol(Name).getAddress());
+  const DataLayout &getDataLayout() const { return DL; }
+
+  LLVMContext &getContext() { return *Ctx.getContext(); }
+
+  void addModule(std::unique_ptr<Module> M) {
+    cantFail(CompileLayer.add(ES.getMainJITDylib(),
+                              ThreadSafeModule(std::move(M), Ctx)));
   }
 
-  void removeModule(VModuleKey K) {
-    cantFail(CompileLayer.removeModule(K));
+  Expected<JITEvaluatedSymbol> lookup(StringRef Name) {
+    return ES.lookup({&ES.getMainJITDylib()}, Mangle(Name.str()));
   }
 };
 
diff --git a/examples/Kaleidoscope/BuildingAJIT/Chapter1/toy.cpp b/examples/Kaleidoscope/BuildingAJIT/Chapter1/toy.cpp
index 7652e80c69a..1d0730f99ef 100644
--- a/examples/Kaleidoscope/BuildingAJIT/Chapter1/toy.cpp
+++ b/examples/Kaleidoscope/BuildingAJIT/Chapter1/toy.cpp
@@ -676,10 +676,11 @@ static std::unique_ptr<FunctionAST> ParseDefinition() {
 }
 
 /// toplevelexpr ::= expression
-static std::unique_ptr<FunctionAST> ParseTopLevelExpr() {
+static std::unique_ptr<FunctionAST> ParseTopLevelExpr(unsigned ExprCount) {
   if (auto E = ParseExpression()) {
     // Make an anonymous proto.
-    auto Proto = llvm::make_unique<PrototypeAST>("__anon_expr",
+    auto Proto = llvm::make_unique<PrototypeAST>(("__anon_expr" +
+                                                  Twine(ExprCount)).str(),
                                                  std::vector<std::string>());
     return llvm::make_unique<FunctionAST>(std::move(Proto), std::move(E));
   }
@@ -696,11 +697,11 @@ static std::unique_ptr<PrototypeAST> ParseExtern() {
 // Code Generation
 //===----------------------------------------------------------------------===//
 
-static LLVMContext TheContext;
-static IRBuilder<> Builder(TheContext);
+static std::unique_ptr<KaleidoscopeJIT> TheJIT;
+static LLVMContext *TheContext;
+static std::unique_ptr<IRBuilder<>> Builder;
 static std::unique_ptr<Module> TheModule;
 static std::map<std::string, AllocaInst *> NamedValues;
-static std::unique_ptr<KaleidoscopeJIT> TheJIT;
 static std::map<std::string, std::unique_ptr<PrototypeAST>> FunctionProtos;
 
 Value *LogErrorV(const char *Str) {
@@ -729,11 +730,11 @@ static AllocaInst *CreateEntryBlockAlloca(Function *TheFunction,
                                           const std::string &VarName) {
   IRBuilder<> TmpB(&TheFunction->getEntryBlock(),
                    TheFunction->getEntryBlock().begin());
-  return TmpB.CreateAlloca(Type::getDoubleTy(TheContext), nullptr, VarName);
+  return TmpB.CreateAlloca(Type::getDoubleTy(*TheContext), nullptr, VarName);
 }
 
 Value *NumberExprAST::codegen() {
-  return ConstantFP::get(TheContext, APFloat(Val));
+  return ConstantFP::get(*TheContext, APFloat(Val));
 }
 
 Value *VariableExprAST::codegen() {
@@ -743,7 +744,7 @@ Value *VariableExprAST::codegen() {
     return LogErrorV("Unknown variable name");
 
   // Load the value.
-  return Builder.CreateLoad(V, Name.c_str());
+  return Builder->CreateLoad(V, Name.c_str());
 }
 
 Value *UnaryExprAST::codegen() {
@@ -755,7 +756,7 @@ Value *UnaryExprAST::codegen() {
   if (!F)
     return LogErrorV("Unknown unary operator");
 
-  return Builder.CreateCall(F, OperandV, "unop");
+  return Builder->CreateCall(F, OperandV, "unop");
 }
 
 Value *BinaryExprAST::codegen() {
@@ -778,7 +779,7 @@ Value *BinaryExprAST::codegen() {
     if (!Variable)
       return LogErrorV("Unknown variable name");
 
-    Builder.CreateStore(Val, Variable);
+    Builder->CreateStore(Val, Variable);
     return Val;
   }
 
@@ -789,15 +790,15 @@ Value *BinaryExprAST::codegen() {
 
   switch (Op) {
   case '+':
-    return Builder.CreateFAdd(L, R, "addtmp");
+    return Builder->CreateFAdd(L, R, "addtmp");
   case '-':
-    return Builder.CreateFSub(L, R, "subtmp");
+    return Builder->CreateFSub(L, R, "subtmp");
   case '*':
-    return Builder.CreateFMul(L, R, "multmp");
+    return Builder->CreateFMul(L, R, "multmp");
   case '<':
-    L = Builder.CreateFCmpULT(L, R, "cmptmp");
+    L = Builder->CreateFCmpULT(L, R, "cmptmp");
     // Convert bool 0/1 to double 0.0 or 1.0
-    return Builder.CreateUIToFP(L, Type::getDoubleTy(TheContext), "booltmp");
+    return Builder->CreateUIToFP(L, Type::getDoubleTy(*TheContext), "booltmp");
   default:
     break;
   }
@@ -808,7 +809,7 @@ Value *BinaryExprAST::codegen() {
   assert(F && "binary operator not found!");
 
   Value *Ops[] = {L, R};
-  return Builder.CreateCall(F, Ops, "binop");
+  return Builder->CreateCall(F, Ops, "binop");
 }
 
 Value *CallExprAST::codegen() {
@@ -828,7 +829,7 @@ Value *CallExprAST::codegen() {
       return nullptr;
   }
 
-  return Builder.CreateCall(CalleeF, ArgsV, "calltmp");
+  return Builder->CreateCall(CalleeF, ArgsV, "calltmp");
 }
 
 Value *IfExprAST::codegen() {
@@ -837,46 +838,46 @@ Value *IfExprAST::codegen() {
     return nullptr;
 
   // Convert condition to a bool by comparing equal to 0.0.
-  CondV = Builder.CreateFCmpONE(
-      CondV, ConstantFP::get(TheContext, APFloat(0.0)), "ifcond");
+  CondV = Builder->CreateFCmpONE(
+      CondV, ConstantFP::get(*TheContext, APFloat(0.0)), "ifcond");
 
-  Function *TheFunction = Builder.GetInsertBlock()->getParent();
+  Function *TheFunction = Builder->GetInsertBlock()->getParent();
 
   // Create blocks for the then and else cases.  Insert the 'then' block at the
   // end of the function.
-  BasicBlock *ThenBB = BasicBlock::Create(TheContext, "then", TheFunction);
-  BasicBlock *ElseBB = BasicBlock::Create(TheContext, "else");
-  BasicBlock *MergeBB = BasicBlock::Create(TheContext, "ifcont");
+  BasicBlock *ThenBB = BasicBlock::Create(*TheContext, "then", TheFunction);
+  BasicBlock *ElseBB = BasicBlock::Create(*TheContext, "else");
+  BasicBlock *MergeBB = BasicBlock::Create(*TheContext, "ifcont");
 
-  Builder.CreateCondBr(CondV, ThenBB, ElseBB);
+  Builder->CreateCondBr(CondV, ThenBB, ElseBB);
 
   // Emit then value.
-  Builder.SetInsertPoint(ThenBB);
+  Builder->SetInsertPoint(ThenBB);
 
   Value *ThenV = Then->codegen();
   if (!ThenV)
     return nullptr;
 
-  Builder.CreateBr(MergeBB);
+  Builder->CreateBr(MergeBB);
   // Codegen of 'Then' can change the current block, update ThenBB for the PHI.
-  ThenBB = Builder.GetInsertBlock();
+  ThenBB = Builder->GetInsertBlock();
 
   // Emit else block.
   TheFunction->getBasicBlockList().push_back(ElseBB);
-  Builder.SetInsertPoint(ElseBB);
+  Builder->SetInsertPoint(ElseBB);
 
   Value *ElseV = Else->codegen();
   if (!ElseV)
     return nullptr;
 
-  Builder.CreateBr(MergeBB);
+  Builder->CreateBr(MergeBB);
   // Codegen of 'Else' can change the current block, update ElseBB for the PHI.
-  ElseBB = Builder.GetInsertBlock();
+  ElseBB = Builder->GetInsertBlock();
 
   // Emit merge block.
   TheFunction->getBasicBlockList().push_back(MergeBB);
-  Builder.SetInsertPoint(MergeBB);
-  PHINode *PN = Builder.CreatePHI(Type::getDoubleTy(TheContext), 2, "iftmp");
+  Builder->SetInsertPoint(MergeBB);
+  PHINode *PN = Builder->CreatePHI(Type::getDoubleTy(*TheContext), 2, "iftmp");
 
   PN->addIncoming(ThenV, ThenBB);
   PN->addIncoming(ElseV, ElseBB);
@@ -903,7 +904,7 @@ Value *IfExprAST::codegen() {
 //   br endcond, loop, endloop
 // outloop:
 Value *ForExprAST::codegen() {
-  Function *TheFunction = Builder.GetInsertBlock()->getParent();
+  Function *TheFunction = Builder->GetInsertBlock()->getParent();
 
   // Create an alloca for the variable in the entry block.
   AllocaInst *Alloca = CreateEntryBlockAlloca(TheFunction, VarName);
@@ -914,17 +915,17 @@ Value *ForExprAST::codegen() {
     return nullptr;
 
   // Store the value into the alloca.
-  Builder.CreateStore(StartVal, Alloca);
+  Builder->CreateStore(StartVal, Alloca);
 
   // Make the new basic block for the loop header, inserting after current
   // block.
-  BasicBlock *LoopBB = BasicBlock::Create(TheContext, "loop", TheFunction);
+  BasicBlock *LoopBB = BasicBlock::Create(*TheContext, "loop", TheFunction);
 
   // Insert an explicit fall through from the current block to the LoopBB.
-  Builder.CreateBr(LoopBB);
+  Builder->CreateBr(LoopBB);
 
   // Start insertion in LoopBB.
-  Builder.SetInsertPoint(LoopBB);
+  Builder->SetInsertPoint(LoopBB);
 
   // Within the loop, the variable is defined equal to the PHI node.  If it
   // shadows an existing variable, we have to restore it, so save it now.
@@ -945,7 +946,7 @@ Value *ForExprAST::codegen() {
       return nullptr;
   } else {
     // If not specified, use 1.0.
-    StepVal = ConstantFP::get(TheContext, APFloat(1.0));
+    StepVal = ConstantFP::get(*TheContext, APFloat(1.0));
   }
 
   // Compute the end condition.
@@ -955,23 +956,23 @@ Value *ForExprAST::codegen() {
 
   // Reload, increment, and restore the alloca.  This handles the case where
   // the body of the loop mutates the variable.
-  Value *CurVar = Builder.CreateLoad(Alloca, VarName.c_str());
-  Value *NextVar = Builder.CreateFAdd(CurVar, StepVal, "nextvar");
-  Builder.CreateStore(NextVar, Alloca);
+  Value *CurVar = Builder->CreateLoad(Alloca, VarName.c_str());
+  Value *NextVar = Builder->CreateFAdd(CurVar, StepVal, "nextvar");
+  Builder->CreateStore(NextVar, Alloca);
 
   // Convert condition to a bool by comparing equal to 0.0.
-  EndCond = Builder.CreateFCmpONE(
-      EndCond, ConstantFP::get(TheContext, APFloat(0.0)), "loopcond");
+  EndCond = Builder->CreateFCmpONE(
+      EndCond, ConstantFP::get(*TheContext, APFloat(0.0)), "loopcond");
 
   // Create the "after loop" block and insert it.
   BasicBlock *AfterBB =
-      BasicBlock::Create(TheContext, "afterloop", TheFunction);
+      BasicBlock::Create(*TheContext, "afterloop", TheFunction);
 
   // Insert the conditional branch into the end of LoopEndBB.
-  Builder.CreateCondBr(EndCond, LoopBB, AfterBB);
+  Builder->CreateCondBr(EndCond, LoopBB, AfterBB);
 
   // Any new code will be inserted in AfterBB.
-  Builder.SetInsertPoint(AfterBB);
+  Builder->SetInsertPoint(AfterBB);
 
   // Restore the unshadowed variable.
   if (OldVal)
@@ -980,13 +981,13 @@ Value *ForExprAST::codegen() {
     NamedValues.erase(VarName);
 
   // for expr always returns 0.0.
-  return Constant::getNullValue(Type::getDoubleTy(TheContext));
+  return Constant::getNullValue(Type::getDoubleTy(*TheContext));
 }
 
 Value *VarExprAST::codegen() {
   std::vector<AllocaInst *> OldBindings;
 
-  Function *TheFunction = Builder.GetInsertBlock()->getParent();
+  Function *TheFunction = Builder->GetInsertBlock()->getParent();
 
   // Register all variables and emit their initializer.
   for (unsigned i = 0, e = VarNames.size(); i != e; ++i) {
@@ -1004,11 +1005,11 @@ Value *VarExprAST::codegen() {
       if (!InitVal)
         return nullptr;
     } else { // If not specified, use 0.0.
-      InitVal = ConstantFP::get(TheContext, APFloat(0.0));
+      InitVal = ConstantFP::get(*TheContext, APFloat(0.0));
     }
 
     AllocaInst *Alloca = CreateEntryBlockAlloca(TheFunction, VarName);
-    Builder.CreateStore(InitVal, Alloca);
+    Builder->CreateStore(InitVal, Alloca);
 
     // Remember the old variable binding so that we can restore the binding when
     // we unrecurse.
@@ -1033,9 +1034,9 @@ Value *VarExprAST::codegen() {
 
 Function *PrototypeAST::codegen() {
   // Make the function type:  double(double,double) etc.
-  std::vector<Type *> Doubles(Args.size(), Type::getDoubleTy(TheContext));
+  std::vector<Type *> Doubles(Args.size(), Type::getDoubleTy(*TheContext));
   FunctionType *FT =
-      FunctionType::get(Type::getDoubleTy(TheContext), Doubles, false);
+      FunctionType::get(Type::getDoubleTy(*TheContext), Doubles, false);
 
   Function *F =
       Function::Create(FT, Function::ExternalLinkage, Name, TheModule.get());
@@ -1062,8 +1063,8 @@ Function *FunctionAST::codegen() {
     BinopPrecedence[P.getOperatorName()] = P.getBinaryPrecedence();
 
   // Create a new basic block to start insertion into.
-  BasicBlock *BB = BasicBlock::Create(TheContext, "entry", TheFunction);
-  Builder.SetInsertPoint(BB);
+  BasicBlock *BB = BasicBlock::Create(*TheContext, "entry", TheFunction);
+  Builder->SetInsertPoint(BB);
 
   // Record the function arguments in the NamedValues map.
   NamedValues.clear();
@@ -1072,7 +1073,7 @@ Function *FunctionAST::codegen() {
     AllocaInst *Alloca = CreateEntryBlockAlloca(TheFunction, Arg.getName());
 
     // Store the initial value into the alloca.
-    Builder.CreateStore(&Arg, Alloca);
+    Builder->CreateStore(&Arg, Alloca);
 
     // Add arguments to variable symbol table.
     NamedValues[Arg.getName()] = Alloca;
@@ -1080,7 +1081,7 @@ Function *FunctionAST::codegen() {
 
   if (Value *RetVal = Body->codegen()) {
     // Finish off the function.
-    Builder.CreateRet(RetVal);
+    Builder->CreateRet(RetVal);
 
     // Validate the generated code, checking for consistency.
     verifyFunction(*TheFunction);
@@ -1102,8 +1103,11 @@ Function *FunctionAST::codegen() {
 
 static void InitializeModule() {
   // Open a new module.
-  TheModule = llvm::make_unique<Module>("my cool jit", TheContext);
-  TheModule->setDataLayout(TheJIT->getTargetMachine().createDataLayout());
+  TheModule = llvm::make_unique<Module>("my cool jit", *TheContext);
+  TheModule->setDataLayout(TheJIT->getDataLayout());
+
+  // Create a new builder for the module.
+  Builder = llvm::make_unique<IRBuilder<>>(*TheContext);
 }
 
 static void HandleDefinition() {
@@ -1136,23 +1140,34 @@ static void HandleExtern() {
 }
 
 static void HandleTopLevelExpression() {
+  static unsigned ExprCount = 0;
+
+  // Update ExprCount. This number will be added to anonymous expressions to
+  // prevent them from clashing.
+  ++ExprCount;
+
   // Evaluate a top-level expression into an anonymous function.
-  if (auto FnAST = ParseTopLevelExpr()) {
+  if (auto FnAST = ParseTopLevelExpr(ExprCount)) {
     if (FnAST->codegen()) {
       // JIT the module containing the anonymous expression, keeping a handle so
       // we can free it later.
-      auto H = TheJIT->addModule(std::move(TheModule));
+      TheJIT->addModule(std::move(TheModule));
       InitializeModule();
 
-      // Get the anonymous expression's address and cast it to the right type,
-      // double(*)(), so we can call it as a native function.
-      double (*FP)() =
-        (double (*)())(intptr_t)TheJIT->getSymbolAddress("__anon_expr");
-      assert(FP && "Failed to codegen function");
-      fprintf(stderr, "Evaluated to %f\n", FP());
-
-      // Delete the anonymous expression module from the JIT.
-      TheJIT->removeModule(H);
+      // Get the anonymous expression's JITSymbol.
+      auto Sym =  TheJIT->lookup(("__anon_expr" + Twine(ExprCount)).str());
+
+      if (Sym) {
+        // If the lookup succeeded, cast the symbol's address to a function
+        // pointer then call it.
+        auto *FP = (double (*)())(intptr_t)Sym->getAddress();
+        assert(FP && "Failed to codegen function");
+        fprintf(stderr, "Evaluated to %f\n", FP());
+      } else {
+        // Otherwise log the reason the symbol lookup failed.
+        logAllUnhandledErrors(Sym.takeError(), errs(),
+                              "Could not evaluate: ");
+      }
     }
   } else {
     // Skip token for error recovery.
@@ -1221,6 +1236,7 @@ int main() {
   getNextToken();
 
   TheJIT = llvm::make_unique<KaleidoscopeJIT>();
+  TheContext = &TheJIT->getContext();
 
   InitializeModule();
 
-- 
GitLab


From 5090f032e2f9038457214297ac111d454f410ae0 Mon Sep 17 00:00:00 2001
From: Sylvestre Ledru <sylvestre@debian.org>
Date: Wed, 17 Oct 2018 06:35:10 +0000
Subject: [PATCH 0266/1116] Document the behavior of option passing when using
 -DCLANG_ENABLE_BOOTSTRAP=On Also document -DCLANG_BOOTSTRAP_PASSTHROUGH

Reviewers: ecbeckmann

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D53018

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344669 91177308-0d34-0410-b5e6-96231b3b80d8
---
 docs/AdvancedBuilds.rst | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/docs/AdvancedBuilds.rst b/docs/AdvancedBuilds.rst
index c559bdeb280..d2a2ef58b23 100644
--- a/docs/AdvancedBuilds.rst
+++ b/docs/AdvancedBuilds.rst
@@ -41,6 +41,16 @@ This command itself isn't terribly useful because it assumes default
 configurations for each stage. The next series of examples utilize CMake cache
 scripts to provide more complex options.
 
+By default, only a few CMake options will be passed between stages.
+The list, called _BOOTSTRAP_DEFAULT_PASSTHROUGH, is defined in clang/CMakeLists.txt.
+To force the passing of the variables between stages, use the -DCLANG_BOOTSTRAP_PASSTHROUGH
+CMake option, each variable separated by a ";". As example:
+
+.. code-block:: console
+
+  $ cmake -G Ninja -DCLANG_ENABLE_BOOTSTRAP=On -DCLANG_BOOTSTRAP_PASSTHROUGH="CMAKE_INSTALL_PREFIX;CMAKE_VERBOSE_MAKEFILE" <path to source>
+  $ ninja stage2
+
 The clang build system refers to builds as stages. A stage1 build is a standard
 build using the compiler installed on the host, and a stage2 build is built
 using the stage1 compiler. This nomenclature holds up to more stages too. In
-- 
GitLab


From 5e40c8ba6a0e0c18e3444071dd6be63db070c443 Mon Sep 17 00:00:00 2001
From: Sjoerd Meijer <sjoerd.meijer@arm.com>
Date: Wed, 17 Oct 2018 07:26:35 +0000
Subject: [PATCH 0267/1116] [ARM][NFCI] Do not fuse VADD and VMUL, continued
 (1/2)

This is a follow up of rL342874, which stopped fusing muls and adds into VMLAs
for performance reasons on the Cortex-M4 and Cortex-M33.  This is a serie of 2
patches, that is trying to achieve the same for VFMA.  The second column in the
table below shows what we were generating before rL342874, the third column
what changed with rL342874, and the last column what we want to achieve with
these 2 patches:

 --------------------------------------------------------
 | Opt   |  < rL342874   |  >= rL342874   |             |
 |------------------------------------------------------|
 |-O3    |     vmla      |      vmul      |     vmul    |
 |       |               |      vadd      |     vadd    |
 |------------------------------------------------------|
 |-Ofast |     vfma      |      vfma      |     vmul    |
 |       |               |                |     vadd    |
 |------------------------------------------------------|
 |-Oz    |     vmla      |      vmla      |     vmla    |
 --------------------------------------------------------

This patch 1/2, is a cleanup of the spaghetti predicate logic on the different
VMLA and VFMA codegen rules, so that we can make the final functional change in
patch 2/2.  This also fixes a typo in the regression test added in rL342874.

Differential revision: https://reviews.llvm.org/D53314


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344671 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/ARM/ARMInstrInfo.td |  9 +++---
 lib/Target/ARM/ARMInstrNEON.td | 20 ++++++-------
 lib/Target/ARM/ARMInstrVFP.td  | 54 +++++++++++++++++-----------------
 test/CodeGen/ARM/fmacs.ll      |  7 +++--
 4 files changed, 45 insertions(+), 45 deletions(-)

diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index e1a077ef166..8aa05fac8a3 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -357,7 +357,10 @@ let RecomputePerFunction = 1 in {
   def DontUseMovt      : Predicate<"!Subtarget->useMovt(*MF)">;
   def UseMovtInPic     : Predicate<"Subtarget->useMovt(*MF) && Subtarget->allowPositionIndependentMovt()">;
   def DontUseMovtInPic : Predicate<"!Subtarget->useMovt(*MF) || !Subtarget->allowPositionIndependentMovt()">;
-  def UseFPVMLx        : Predicate<"Subtarget->useFPVMLx() || MF->getFunction().optForMinSize()">;
+
+  def UseFPVMLx: Predicate<"((Subtarget->useFPVMLx() &&"
+                           " !TM.Options.AllowFPOpFusion == FPOpFusion::Fast) ||"
+                           "MF->getFunction().optForMinSize())">;
 }
 def UseMulOps        : Predicate<"Subtarget->useMulOps()">;
 
@@ -368,10 +371,6 @@ def UseFusedMAC      : Predicate<"(TM.Options.AllowFPOpFusion =="
                                  " FPOpFusion::Fast && "
                                  " Subtarget->hasVFP4()) && "
                                  "!Subtarget->isTargetDarwin()">;
-def DontUseFusedMAC  : Predicate<"!(TM.Options.AllowFPOpFusion =="
-                                 " FPOpFusion::Fast &&"
-                                 " Subtarget->hasVFP4()) || "
-                                 "Subtarget->isTargetDarwin()">;
 
 def HasFastVGETLNi32 : Predicate<"!Subtarget->hasSlowVGETLNi32()">;
 def HasSlowVGETLNi32 : Predicate<"Subtarget->hasSlowVGETLNi32()">;
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index a7bb32d31f6..2085507056b 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -4402,16 +4402,16 @@ defm VMLA     : N3VMulOp_QHS<0, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
                              IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>;
 def  VMLAfd   : N3VDMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACD, "vmla", "f32",
                           v2f32, fmul_su, fadd_mlx>,
-                Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>;
+                Requires<[HasNEON, UseFPVMLx]>;
 def  VMLAfq   : N3VQMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACQ, "vmla", "f32",
                           v4f32, fmul_su, fadd_mlx>,
-                Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>;
+                Requires<[HasNEON, UseFPVMLx]>;
 def  VMLAhd   : N3VDMulOp<0, 0, 0b01, 0b1101, 1, IIC_VMACD, "vmla", "f16",
                           v4f16, fmul_su, fadd_mlx>,
-                Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>;
+                Requires<[HasNEON, HasFullFP16, UseFPVMLx]>;
 def  VMLAhq   : N3VQMulOp<0, 0, 0b01, 0b1101, 1, IIC_VMACQ, "vmla", "f16",
                           v8f16, fmul_su, fadd_mlx>,
-                Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>;
+                Requires<[HasNEON, HasFullFP16, UseFPVMLx]>;
 defm VMLAsl   : N3VMulOpSL_HS<0b0000, IIC_VMACi16D, IIC_VMACi32D,
                               IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>;
 def  VMLAslfd : N3VDMulOpSL<0b10, 0b0001, IIC_VMACD, "vmla", "f32",
@@ -4632,16 +4632,16 @@ defm VMLS     : N3VMulOp_QHS<1, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
                              IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>;
 def  VMLSfd   : N3VDMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACD, "vmls", "f32",
                           v2f32, fmul_su, fsub_mlx>,
-                Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>;
+                Requires<[HasNEON, UseFPVMLx]>;
 def  VMLSfq   : N3VQMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACQ, "vmls", "f32",
                           v4f32, fmul_su, fsub_mlx>,
-                Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>;
+                Requires<[HasNEON, UseFPVMLx]>;
 def  VMLShd   : N3VDMulOp<0, 0, 0b11, 0b1101, 1, IIC_VMACD, "vmls", "f16",
                           v4f16, fmul, fsub>,
-                Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>;
+                Requires<[HasNEON, HasFullFP16, UseFPVMLx]>;
 def  VMLShq   : N3VQMulOp<0, 0, 0b11, 0b1101, 1, IIC_VMACQ, "vmls", "f16",
                           v8f16, fmul, fsub>,
-                Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>;
+                Requires<[HasNEON, HasFullFP16, UseFPVMLx]>;
 defm VMLSsl   : N3VMulOpSL_HS<0b0100, IIC_VMACi16D, IIC_VMACi32D,
                               IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>;
 def  VMLSslfd : N3VDMulOpSL<0b10, 0b0101, IIC_VMACD, "vmls", "f32",
@@ -7084,9 +7084,9 @@ def : N3VSPat<fadd, VADDfd>;
 def : N3VSPat<fsub, VSUBfd>;
 def : N3VSPat<fmul, VMULfd>;
 def : N3VSMulOpPat<fmul, fadd, VMLAfd>,
-      Requires<[HasNEON, UseNEONForFP, UseFPVMLx, DontUseFusedMAC]>;
+      Requires<[HasNEON, UseNEONForFP, UseFPVMLx]>;
 def : N3VSMulOpPat<fmul, fsub, VMLSfd>,
-      Requires<[HasNEON, UseNEONForFP, UseFPVMLx, DontUseFusedMAC]>;
+      Requires<[HasNEON, UseNEONForFP, UseFPVMLx]>;
 def : N3VSMulOpPat<fmul, fadd, VFMAfd>,
       Requires<[HasVFP4, UseNEONForFP, UseFusedMAC]>;
 def : N3VSMulOpPat<fmul, fsub, VFMSfd>,
diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td
index b4e28b90747..b58730c452f 100644
--- a/lib/Target/ARM/ARMInstrVFP.td
+++ b/lib/Target/ARM/ARMInstrVFP.td
@@ -1814,7 +1814,7 @@ def VMLAD : ADbI<0b11100, 0b00, 0, 0,
                  [(set DPR:$Dd, (fadd_mlx (fmul_su DPR:$Dn, DPR:$Dm),
                                           (f64 DPR:$Ddin)))]>,
               RegConstraint<"$Ddin = $Dd">,
-              Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>,
+              Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>,
               Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
 
 def VMLAS : ASbIn<0b11100, 0b00, 0, 0,
@@ -1823,7 +1823,7 @@ def VMLAS : ASbIn<0b11100, 0b00, 0, 0,
                   [(set SPR:$Sd, (fadd_mlx (fmul_su SPR:$Sn, SPR:$Sm),
                                            SPR:$Sdin))]>,
               RegConstraint<"$Sdin = $Sd">,
-              Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>,
+              Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>,
               Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> {
   // Some single precision VFP instructions may be executed on both NEON and
   // VFP pipelines on A8.
@@ -1836,17 +1836,17 @@ def VMLAH : AHbI<0b11100, 0b00, 0, 0,
                   [(set HPR:$Sd, (fadd_mlx (fmul_su HPR:$Sn, HPR:$Sm),
                                            HPR:$Sdin))]>,
               RegConstraint<"$Sdin = $Sd">,
-              Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
+              Requires<[HasFullFP16,UseFPVMLx]>;
 
 def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
           (VMLAD DPR:$dstin, DPR:$a, DPR:$b)>,
-          Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+          Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>;
 def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
           (VMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
-          Requires<[HasVFP2,DontUseNEONForFP, UseFPVMLx,DontUseFusedMAC]>;
+          Requires<[HasVFP2,DontUseNEONForFP, UseFPVMLx]>;
 def : Pat<(fadd_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)),
           (VMLAH HPR:$dstin, HPR:$a, HPR:$b)>,
-          Requires<[HasFullFP16,DontUseNEONForFP, UseFPVMLx,DontUseFusedMAC]>;
+          Requires<[HasFullFP16,DontUseNEONForFP, UseFPVMLx]>;
 
 
 def VMLSD : ADbI<0b11100, 0b00, 1, 0,
@@ -1855,7 +1855,7 @@ def VMLSD : ADbI<0b11100, 0b00, 1, 0,
                  [(set DPR:$Dd, (fadd_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
                                           (f64 DPR:$Ddin)))]>,
               RegConstraint<"$Ddin = $Dd">,
-              Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>,
+              Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>,
               Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
 
 def VMLSS : ASbIn<0b11100, 0b00, 1, 0,
@@ -1864,7 +1864,7 @@ def VMLSS : ASbIn<0b11100, 0b00, 1, 0,
                   [(set SPR:$Sd, (fadd_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)),
                                            SPR:$Sdin))]>,
               RegConstraint<"$Sdin = $Sd">,
-              Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>,
+              Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>,
               Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> {
   // Some single precision VFP instructions may be executed on both NEON and
   // VFP pipelines on A8.
@@ -1877,17 +1877,17 @@ def VMLSH : AHbI<0b11100, 0b00, 1, 0,
                   [(set HPR:$Sd, (fadd_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)),
                                            HPR:$Sdin))]>,
               RegConstraint<"$Sdin = $Sd">,
-              Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
+              Requires<[HasFullFP16,UseFPVMLx]>;
 
 def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
           (VMLSD DPR:$dstin, DPR:$a, DPR:$b)>,
-          Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+          Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>;
 def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
           (VMLSS SPR:$dstin, SPR:$a, SPR:$b)>,
-          Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+          Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>;
 def : Pat<(fsub_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)),
           (VMLSH HPR:$dstin, HPR:$a, HPR:$b)>,
-          Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+          Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>;
 
 def VNMLAD : ADbI<0b11100, 0b01, 1, 0,
                   (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
@@ -1895,7 +1895,7 @@ def VNMLAD : ADbI<0b11100, 0b01, 1, 0,
                   [(set DPR:$Dd,(fsub_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
                                           (f64 DPR:$Ddin)))]>,
                 RegConstraint<"$Ddin = $Dd">,
-                Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>,
+                Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>,
                 Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
 
 def VNMLAS : ASbI<0b11100, 0b01, 1, 0,
@@ -1904,7 +1904,7 @@ def VNMLAS : ASbI<0b11100, 0b01, 1, 0,
                   [(set SPR:$Sd, (fsub_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)),
                                            SPR:$Sdin))]>,
                 RegConstraint<"$Sdin = $Sd">,
-                Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>,
+                Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>,
                 Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> {
   // Some single precision VFP instructions may be executed on both NEON and
   // VFP pipelines on A8.
@@ -1917,29 +1917,29 @@ def VNMLAH : AHbI<0b11100, 0b01, 1, 0,
                   [(set HPR:$Sd, (fsub_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)),
                                            HPR:$Sdin))]>,
                 RegConstraint<"$Sdin = $Sd">,
-                Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
+                Requires<[HasFullFP16,UseFPVMLx]>;
 
 // (-(a * b) - dst) -> -(dst + (a * b))
 def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin),
           (VNMLAD DPR:$dstin, DPR:$a, DPR:$b)>,
-          Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+          Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>;
 def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin),
           (VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
-          Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+          Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>;
 def : Pat<(fsub_mlx (fneg (fmul_su HPR:$a, HPR:$b)), HPR:$dstin),
           (VNMLAH HPR:$dstin, HPR:$a, HPR:$b)>,
-          Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+          Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>;
 
 // (-dst - (a * b)) -> -(dst + (a * b))
 def : Pat<(fsub_mlx (fneg DPR:$dstin), (fmul_su DPR:$a, (f64 DPR:$b))),
           (VNMLAD DPR:$dstin, DPR:$a, DPR:$b)>,
-          Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+          Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>;
 def : Pat<(fsub_mlx (fneg SPR:$dstin), (fmul_su SPR:$a, SPR:$b)),
           (VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
-          Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+          Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>;
 def : Pat<(fsub_mlx (fneg HPR:$dstin), (fmul_su HPR:$a, HPR:$b)),
           (VNMLAH HPR:$dstin, HPR:$a, HPR:$b)>,
-          Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+          Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>;
 
 def VNMLSD : ADbI<0b11100, 0b01, 0, 0,
                   (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
@@ -1947,7 +1947,7 @@ def VNMLSD : ADbI<0b11100, 0b01, 0, 0,
                   [(set DPR:$Dd, (fsub_mlx (fmul_su DPR:$Dn, DPR:$Dm),
                                            (f64 DPR:$Ddin)))]>,
                RegConstraint<"$Ddin = $Dd">,
-               Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>,
+               Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>,
                Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
 
 def VNMLSS : ASbI<0b11100, 0b01, 0, 0,
@@ -1955,7 +1955,7 @@ def VNMLSS : ASbI<0b11100, 0b01, 0, 0,
                   IIC_fpMAC32, "vnmls", ".f32\t$Sd, $Sn, $Sm",
              [(set SPR:$Sd, (fsub_mlx (fmul_su SPR:$Sn, SPR:$Sm), SPR:$Sdin))]>,
                          RegConstraint<"$Sdin = $Sd">,
-                Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>,
+                Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>,
              Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> {
   // Some single precision VFP instructions may be executed on both NEON and
   // VFP pipelines on A8.
@@ -1967,17 +1967,17 @@ def VNMLSH : AHbI<0b11100, 0b01, 0, 0,
                   IIC_fpMAC16, "vnmls", ".f16\t$Sd, $Sn, $Sm",
              [(set HPR:$Sd, (fsub_mlx (fmul_su HPR:$Sn, HPR:$Sm), HPR:$Sdin))]>,
                          RegConstraint<"$Sdin = $Sd">,
-                Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
+                Requires<[HasFullFP16,UseFPVMLx]>;
 
 def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin),
           (VNMLSD DPR:$dstin, DPR:$a, DPR:$b)>,
-          Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+          Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>;
 def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin),
           (VNMLSS SPR:$dstin, SPR:$a, SPR:$b)>,
-          Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+          Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>;
 def : Pat<(fsub_mlx (fmul_su HPR:$a, HPR:$b), HPR:$dstin),
           (VNMLSH HPR:$dstin, HPR:$a, HPR:$b)>,
-          Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+          Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>;
 
 //===----------------------------------------------------------------------===//
 // Fused FP Multiply-Accumulate Operations.
diff --git a/test/CodeGen/ARM/fmacs.ll b/test/CodeGen/ARM/fmacs.ll
index 027991ef2c9..140ab933d0c 100644
--- a/test/CodeGen/ARM/fmacs.ll
+++ b/test/CodeGen/ARM/fmacs.ll
@@ -27,10 +27,11 @@ entry:
 	ret float %1
 }
 
-define float @vlma_minsize(float %acc, float %a, float %b) #0 {
+define float @vmla_minsize(float %acc, float %a, float %b) #0 {
 entry:
-; VMLA-LABEL: vlma_minsize:
-; VLMA:       vmla.f32  s0, s1, s2
+; VMLA-LABEL: vmla_minsize:
+; VMLA:       vmla.f32  s0, s1, s2
+; VMLA-NEXT:  bx  lr
 
   %0 = fmul float %a, %b
   %1 = fadd float %acc, %0
-- 
GitLab


From 860a0bdd39a9ed3922830ce59284a152b6b2fd51 Mon Sep 17 00:00:00 2001
From: Sjoerd Meijer <sjoerd.meijer@arm.com>
Date: Wed, 17 Oct 2018 07:51:24 +0000
Subject: [PATCH 0268/1116] [ARM] Follow up of rL344671, attempt to pacify a
 buildbot

It was rightfully complaining about an unpretty logical expression.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344677 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/ARM/ARMInstrInfo.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index 8aa05fac8a3..529446ce809 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -359,7 +359,7 @@ let RecomputePerFunction = 1 in {
   def DontUseMovtInPic : Predicate<"!Subtarget->useMovt(*MF) || !Subtarget->allowPositionIndependentMovt()">;
 
   def UseFPVMLx: Predicate<"((Subtarget->useFPVMLx() &&"
-                           " !TM.Options.AllowFPOpFusion == FPOpFusion::Fast) ||"
+                           "  TM.Options.AllowFPOpFusion != FPOpFusion::Fast) ||"
                            "MF->getFunction().optForMinSize())">;
 }
 def UseMulOps        : Predicate<"Subtarget->useMulOps()">;
-- 
GitLab


From edcfeaeb8e6b38087d5740bd20dadd5546bf31fd Mon Sep 17 00:00:00 2001
From: Fedor Sergeev <fedor.sergeev@azul.com>
Date: Wed, 17 Oct 2018 09:02:54 +0000
Subject: [PATCH 0269/1116] [LoopPredication] add some simple stats

Just adding some useful statistics to LoopPredication pass
which was lacking any of these.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344681 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/LoopPredication.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/lib/Transforms/Scalar/LoopPredication.cpp b/lib/Transforms/Scalar/LoopPredication.cpp
index cbb6594cf8f..ccaf10142d5 100644
--- a/lib/Transforms/Scalar/LoopPredication.cpp
+++ b/lib/Transforms/Scalar/LoopPredication.cpp
@@ -178,6 +178,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Scalar/LoopPredication.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/BranchProbabilityInfo.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
@@ -196,6 +197,9 @@
 
 #define DEBUG_TYPE "loop-predication"
 
+STATISTIC(TotalConsidered, "Number of guards considered");
+STATISTIC(TotalWidened, "Number of checks widened");
+
 using namespace llvm;
 
 static cl::opt<bool> EnableIVTruncation("loop-predication-enable-iv-truncation",
@@ -574,6 +578,8 @@ bool LoopPredication::widenGuardConditions(IntrinsicInst *Guard,
   LLVM_DEBUG(dbgs() << "Processing guard:\n");
   LLVM_DEBUG(Guard->dump());
 
+  TotalConsidered++;
+
   IRBuilder<> Builder(cast<Instruction>(Preheader->getTerminator()));
 
   // The guard condition is expected to be in form of:
@@ -615,6 +621,8 @@ bool LoopPredication::widenGuardConditions(IntrinsicInst *Guard,
   if (NumWidened == 0)
     return false;
 
+  TotalWidened += NumWidened;
+
   // Emit the new guard condition
   Builder.SetInsertPoint(Guard);
   Value *LastCheck = nullptr;
-- 
GitLab


From 573330f677f4651417e60374899c68691c3c7bb9 Mon Sep 17 00:00:00 2001
From: Sjoerd Meijer <sjoerd.meijer@arm.com>
Date: Wed, 17 Oct 2018 10:05:44 +0000
Subject: [PATCH 0270/1116] [ARM] Do not fuse VADD and VMUL, continued (2/2)

This is patch 2/2, following up on D53314, and is the functional change
to prevent fusing mul + add sequences into VFMAs.

Differential revision: https://reviews.llvm.org/D53315


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344683 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/ARM/ARMInstrInfo.td | 6 ++++--
 test/CodeGen/ARM/fusedMAC.ll   | 9 +++++++++
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index 529446ce809..fc8ed95ce8b 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -365,12 +365,14 @@ let RecomputePerFunction = 1 in {
 def UseMulOps        : Predicate<"Subtarget->useMulOps()">;
 
 // Prefer fused MAC for fp mul + add over fp VMLA / VMLS if they are available.
-// But only select them if more precision in FP computation is allowed.
+// But only select them if more precision in FP computation is allowed, and when
+// they are not slower than a mul + add sequence.
 // Do not use them for Darwin platforms.
 def UseFusedMAC      : Predicate<"(TM.Options.AllowFPOpFusion =="
                                  " FPOpFusion::Fast && "
                                  " Subtarget->hasVFP4()) && "
-                                 "!Subtarget->isTargetDarwin()">;
+                                 "!Subtarget->isTargetDarwin() &&"
+                                 "Subtarget->useFPVMLx()">;
 
 def HasFastVGETLNi32 : Predicate<"!Subtarget->hasSlowVGETLNi32()">;
 def HasSlowVGETLNi32 : Predicate<"Subtarget->hasSlowVGETLNi32()">;
diff --git a/test/CodeGen/ARM/fusedMAC.ll b/test/CodeGen/ARM/fusedMAC.ll
index 6f6cdc11491..6b922895b00 100644
--- a/test/CodeGen/ARM/fusedMAC.ll
+++ b/test/CodeGen/ARM/fusedMAC.ll
@@ -1,4 +1,8 @@
 ; RUN: llc < %s -mtriple=armv7-eabi -mattr=+neon,+vfp4 -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -mtriple=arm-arm-eabi -mcpu=cortex-m7  -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -mtriple=arm-arm-eabi -mcpu=cortex-m4  -fp-contract=fast | FileCheck %s -check-prefix=DONT-FUSE
+; RUN: llc < %s -mtriple=arm-arm-eabi -mcpu=cortex-m33 -fp-contract=fast | FileCheck %s -check-prefix=DONT-FUSE
+
 ; Check generated fused MAC and MLS.
 
 define double @fusedMACTest1(double %d1, double %d2, double %d3) {
@@ -12,6 +16,11 @@ define double @fusedMACTest1(double %d1, double %d2, double %d3) {
 define float @fusedMACTest2(float %f1, float %f2, float %f3) {
 ;CHECK-LABEL: fusedMACTest2:
 ;CHECK: vfma.f32
+
+;DONT-FUSE-LABEL: fusedMACTest2:
+;DONT-FUSE:       vmul.f32
+;DONT-FUSE-NEXT:  vadd.f32
+
   %1 = fmul float %f1, %f2
   %2 = fadd float %1, %f3
   ret float %2
-- 
GitLab


From c99132a7275bcecf84c29bc7ddb548645c52209d Mon Sep 17 00:00:00 2001
From: Petar Jovanovic <petar.jovanovic@mips.com>
Date: Wed, 17 Oct 2018 10:30:03 +0000
Subject: [PATCH 0271/1116] [MIPS GlobalISel] Legalize constants

Legalize s1, s8, s16 and s64 G_CONSTANT for MIPS32.

Patch by Petar Avramovic.

Differential Revision: https://reviews.llvm.org/D53077


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344684 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/Mips/MipsLegalizerInfo.cpp         |  25 ++-
 .../Mips/GlobalISel/legalizer/constants.mir   | 164 ++++++++++++++++++
 .../Mips/GlobalISel/llvm-ir/constants.ll      | 108 ++++++++++++
 3 files changed, 296 insertions(+), 1 deletion(-)
 create mode 100644 test/CodeGen/Mips/GlobalISel/legalizer/constants.mir
 create mode 100644 test/CodeGen/Mips/GlobalISel/llvm-ir/constants.ll

diff --git a/lib/Target/Mips/MipsLegalizerInfo.cpp b/lib/Target/Mips/MipsLegalizerInfo.cpp
index 6a16e7955a1..525f2143190 100644
--- a/lib/Target/Mips/MipsLegalizerInfo.cpp
+++ b/lib/Target/Mips/MipsLegalizerInfo.cpp
@@ -40,7 +40,9 @@ MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) {
       .minScalar(0, s32);
 
   getActionDefinitionsBuilder(G_CONSTANT)
-      .legalFor({s32});
+      .legalFor({s32})
+      .minScalar(0, s32)
+      .customFor({s64});
 
   getActionDefinitionsBuilder(G_GEP)
       .legalFor({{p0, s32}});
@@ -91,6 +93,27 @@ bool MipsLegalizerInfo::legalizeCustom(MachineInstr &MI,
     MI.eraseFromParent();
     break;
   }
+  case G_CONSTANT: {
+
+    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+    const LLT sHalf = LLT::scalar(Size / 2);
+
+    const APInt &CImmValue = MI.getOperand(1).getCImm()->getValue();
+
+    unsigned ResLow = MRI.createGenericVirtualRegister(sHalf);
+    unsigned ResHigh = MRI.createGenericVirtualRegister(sHalf);
+    MIRBuilder.buildConstant(
+        ResLow, *ConstantInt::get(MI.getMF()->getFunction().getContext(),
+                                  CImmValue.trunc(Size / 2)));
+    MIRBuilder.buildConstant(
+        ResHigh, *ConstantInt::get(MI.getMF()->getFunction().getContext(),
+                                   CImmValue.lshr(Size / 2).trunc(Size / 2)));
+
+    MIRBuilder.buildMerge(MI.getOperand(0).getReg(), {ResHigh, ResLow});
+
+    MI.eraseFromParent();
+    break;
+  }
   default:
     return false;
   }
diff --git a/test/CodeGen/Mips/GlobalISel/legalizer/constants.mir b/test/CodeGen/Mips/GlobalISel/legalizer/constants.mir
new file mode 100644
index 00000000000..4ed50f2d7ef
--- /dev/null
+++ b/test/CodeGen/Mips/GlobalISel/legalizer/constants.mir
@@ -0,0 +1,164 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -O0 -mtriple=mipsel-linux-gnu -run-pass=legalizer -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=MIPS32
+--- |
+
+  define void @any_i64() {entry: ret void}
+  define void @any_i32() {entry: ret void}
+  define void @signed_i16() {entry: ret void}
+  define void @signed_i8() {entry: ret void}
+  define void @unsigned_i16() {entry: ret void}
+  define void @unsigned_i8() {entry: ret void}
+  define void @i1_true() {entry: ret void}
+  define void @i1_false() {entry: ret void}
+
+...
+---
+name:            any_i64
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    ; MIPS32-LABEL: name: any_i64
+    ; MIPS32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; MIPS32: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648
+    ; MIPS32: $v0 = COPY [[C]](s32)
+    ; MIPS32: $v1 = COPY [[C1]](s32)
+    ; MIPS32: RetRA implicit $v0, implicit $v1
+    %0:_(s64) = G_CONSTANT i64 -9223372036854775808
+    %1:_(s32), %2:_(s32) = G_UNMERGE_VALUES %0(s64)
+    $v0 = COPY %2(s32)
+    $v1 = COPY %1(s32)
+    RetRA implicit $v0, implicit $v1
+
+...
+---
+name:            any_i32
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    ; MIPS32-LABEL: name: any_i32
+    ; MIPS32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648
+    ; MIPS32: $v0 = COPY [[C]](s32)
+    ; MIPS32: RetRA implicit $v0
+    %0:_(s32) = G_CONSTANT i32 -2147483648
+    $v0 = COPY %0(s32)
+    RetRA implicit $v0
+
+...
+---
+name:            signed_i16
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    ; MIPS32-LABEL: name: signed_i16
+    ; MIPS32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -32768
+    ; MIPS32: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; MIPS32: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C1]]
+    ; MIPS32: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SHL]], [[C1]]
+    ; MIPS32: $v0 = COPY [[ASHR]](s32)
+    ; MIPS32: RetRA implicit $v0
+    %0:_(s16) = G_CONSTANT i16 -32768
+    %1:_(s32) = G_SEXT %0(s16)
+    $v0 = COPY %1(s32)
+    RetRA implicit $v0
+
+...
+---
+name:            signed_i8
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    ; MIPS32-LABEL: name: signed_i8
+    ; MIPS32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -128
+    ; MIPS32: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; MIPS32: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C1]]
+    ; MIPS32: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SHL]], [[C1]]
+    ; MIPS32: $v0 = COPY [[ASHR]](s32)
+    ; MIPS32: RetRA implicit $v0
+    %0:_(s8) = G_CONSTANT i8 -128
+    %1:_(s32) = G_SEXT %0(s8)
+    $v0 = COPY %1(s32)
+    RetRA implicit $v0
+
+...
+---
+name:            unsigned_i16
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    ; MIPS32-LABEL: name: unsigned_i16
+    ; MIPS32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -32768
+    ; MIPS32: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; MIPS32: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C1]]
+    ; MIPS32: $v0 = COPY [[AND]](s32)
+    ; MIPS32: RetRA implicit $v0
+    %0:_(s16) = G_CONSTANT i16 -32768
+    %1:_(s32) = G_ZEXT %0(s16)
+    $v0 = COPY %1(s32)
+    RetRA implicit $v0
+
+...
+---
+name:            unsigned_i8
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    ; MIPS32-LABEL: name: unsigned_i8
+    ; MIPS32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -128
+    ; MIPS32: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; MIPS32: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C1]]
+    ; MIPS32: $v0 = COPY [[AND]](s32)
+    ; MIPS32: RetRA implicit $v0
+    %0:_(s8) = G_CONSTANT i8 -128
+    %1:_(s32) = G_ZEXT %0(s8)
+    $v0 = COPY %1(s32)
+    RetRA implicit $v0
+
+...
+---
+name:            i1_true
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    ; MIPS32-LABEL: name: i1_true
+    ; MIPS32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; MIPS32: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; MIPS32: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C1]]
+    ; MIPS32: $v0 = COPY [[AND]](s32)
+    ; MIPS32: RetRA implicit $v0
+    %0:_(s1) = G_CONSTANT i1 true
+    %1:_(s32) = G_ZEXT %0(s1)
+    $v0 = COPY %1(s32)
+    RetRA implicit $v0
+
+...
+---
+name:            i1_false
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    ; MIPS32-LABEL: name: i1_false
+    ; MIPS32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; MIPS32: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; MIPS32: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C1]]
+    ; MIPS32: $v0 = COPY [[AND]](s32)
+    ; MIPS32: RetRA implicit $v0
+    %0:_(s1) = G_CONSTANT i1 false
+    %1:_(s32) = G_ZEXT %0(s1)
+    $v0 = COPY %1(s32)
+    RetRA implicit $v0
+
+...
diff --git a/test/CodeGen/Mips/GlobalISel/llvm-ir/constants.ll b/test/CodeGen/Mips/GlobalISel/llvm-ir/constants.ll
new file mode 100644
index 00000000000..ef7600402e0
--- /dev/null
+++ b/test/CodeGen/Mips/GlobalISel/llvm-ir/constants.ll
@@ -0,0 +1,108 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc  -O0 -mtriple=mipsel-linux-gnu -global-isel  -verify-machineinstrs %s -o -| FileCheck %s -check-prefixes=MIPS32
+
+define i64 @any_i64() {
+; MIPS32-LABEL: any_i64:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $1, 0
+; MIPS32-NEXT:    ori $2, $1, 0
+; MIPS32-NEXT:    lui $1, 32768
+; MIPS32-NEXT:    ori $3, $1, 0
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  ret i64 -9223372036854775808
+}
+
+define i32 @any_i32() {
+; MIPS32-LABEL: any_i32:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $1, 32768
+; MIPS32-NEXT:    ori $2, $1, 0
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  ret i32 -2147483648
+}
+
+define signext i16 @signed_i16() {
+; MIPS32-LABEL: signed_i16:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $1, 65535
+; MIPS32-NEXT:    ori $1, $1, 32768
+; MIPS32-NEXT:    sll $1, $1, 16
+; MIPS32-NEXT:    sra $2, $1, 16
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  ret i16 -32768
+}
+
+define signext i8 @signed_i8() {
+; MIPS32-LABEL: signed_i8:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $1, 65535
+; MIPS32-NEXT:    ori $1, $1, 65408
+; MIPS32-NEXT:    sll $1, $1, 24
+; MIPS32-NEXT:    sra $2, $1, 24
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  ret i8 -128
+}
+
+define zeroext i16 @unsigned_i16() {
+; MIPS32-LABEL: unsigned_i16:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $1, 65535
+; MIPS32-NEXT:    ori $1, $1, 32768
+; MIPS32-NEXT:    lui $2, 0
+; MIPS32-NEXT:    ori $2, $2, 65535
+; MIPS32-NEXT:    and $2, $1, $2
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  ret i16 -32768
+}
+
+define zeroext i8 @unsigned_i8() {
+; MIPS32-LABEL: unsigned_i8:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $1, 65535
+; MIPS32-NEXT:    ori $1, $1, 65408
+; MIPS32-NEXT:    lui $2, 0
+; MIPS32-NEXT:    ori $2, $2, 255
+; MIPS32-NEXT:    and $2, $1, $2
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  ret i8 -128
+}
+
+define zeroext i1 @i1_true() {
+; MIPS32-LABEL: i1_true:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $1, 65535
+; MIPS32-NEXT:    ori $1, $1, 65535
+; MIPS32-NEXT:    lui $2, 0
+; MIPS32-NEXT:    ori $2, $2, 1
+; MIPS32-NEXT:    and $2, $1, $2
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  ret i1 true
+}
+
+define zeroext i1 @i1_false() {
+; MIPS32-LABEL: i1_false:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $1, 0
+; MIPS32-NEXT:    ori $1, $1, 0
+; MIPS32-NEXT:    lui $2, 0
+; MIPS32-NEXT:    ori $2, $2, 1
+; MIPS32-NEXT:    and $2, $1, $2
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  ret i1 false
+}
-- 
GitLab


From ff88649977a3c1b1d4c2cd289db4443b024b1dbe Mon Sep 17 00:00:00 2001
From: Fedor Sergeev <fedor.sergeev@azul.com>
Date: Wed, 17 Oct 2018 10:36:23 +0000
Subject: [PATCH 0272/1116] [NewPM] teach -passes= to emit meaningful error
 messages

All the PassBuilder::parse interfaces now return descriptive StringError
instead of a plain bool. It allows to make -passes/aa-pipeline parsing
errors context-specific and thus less confusing.

TODO: ideally we should also make suggestions for misspelled pass names,
but that requires some extensions to PassBuilder.

Reviewed By: philip.pfaffe, chandlerc
Differential Revision: https://reviews.llvm.org/D53246

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344685 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Passes/PassBuilder.h          |  51 +--
 lib/LTO/LTOBackend.cpp                     |  14 +-
 lib/Passes/PassBuilder.cpp                 | 366 ++++++++++++---------
 test/Other/pass-pipeline-parsing.ll        |  83 ++++-
 test/tools/llvm-lto2/X86/pipeline.ll       |   4 +-
 test/tools/llvm-opt-fuzzer/command-line.ll |   2 +-
 tools/llvm-opt-fuzzer/llvm-opt-fuzzer.cpp  |  11 +-
 tools/opt/NewPMDriver.cpp                  |  90 ++---
 unittests/IR/CMakeLists.txt                |   2 +
 unittests/IR/PassBuilderCallbacksTest.cpp  |  37 ++-
 unittests/Passes/CMakeLists.txt            |   1 +
 unittests/Passes/PluginsTest.cpp           |   5 +-
 12 files changed, 398 insertions(+), 268 deletions(-)

diff --git a/include/llvm/Passes/PassBuilder.h b/include/llvm/Passes/PassBuilder.h
index 91314430a96..22e5eb0caa0 100644
--- a/include/llvm/Passes/PassBuilder.h
+++ b/include/llvm/Passes/PassBuilder.h
@@ -19,6 +19,7 @@
 #include "llvm/ADT/Optional.h"
 #include "llvm/Analysis/CGSCCPassManager.h"
 #include "llvm/IR/PassManager.h"
+#include "llvm/Support/Error.h"
 #include "llvm/Transforms/Instrumentation.h"
 #include "llvm/Transforms/Scalar/LoopPassManager.h"
 #include <vector>
@@ -384,8 +385,9 @@ public:
   /// If the sequence of passes aren't all the exact same kind of pass, it will
   /// be an error. You cannot mix different levels implicitly, you must
   /// explicitly form a pass manager in which to nest passes.
-  bool parsePassPipeline(ModulePassManager &MPM, StringRef PipelineText,
-                         bool VerifyEachPass = true, bool DebugLogging = false);
+  Error parsePassPipeline(ModulePassManager &MPM, StringRef PipelineText,
+                          bool VerifyEachPass = true,
+                          bool DebugLogging = false);
 
   /// {{@ Parse a textual pass pipeline description into a specific PassManager
   ///
@@ -394,12 +396,15 @@ public:
   /// this is the valid pipeline text:
   ///
   ///   function(lpass)
-  bool parsePassPipeline(CGSCCPassManager &CGPM, StringRef PipelineText,
-                         bool VerifyEachPass = true, bool DebugLogging = false);
-  bool parsePassPipeline(FunctionPassManager &FPM, StringRef PipelineText,
-                         bool VerifyEachPass = true, bool DebugLogging = false);
-  bool parsePassPipeline(LoopPassManager &LPM, StringRef PipelineText,
-                         bool VerifyEachPass = true, bool DebugLogging = false);
+  Error parsePassPipeline(CGSCCPassManager &CGPM, StringRef PipelineText,
+                          bool VerifyEachPass = true,
+                          bool DebugLogging = false);
+  Error parsePassPipeline(FunctionPassManager &FPM, StringRef PipelineText,
+                          bool VerifyEachPass = true,
+                          bool DebugLogging = false);
+  Error parsePassPipeline(LoopPassManager &LPM, StringRef PipelineText,
+                          bool VerifyEachPass = true,
+                          bool DebugLogging = false);
   /// @}}
 
   /// Parse a textual alias analysis pipeline into the provided AA manager.
@@ -417,7 +422,7 @@ public:
   /// Returns false if the text cannot be parsed cleanly. The specific state of
   /// the \p AA manager is unspecified if such an error is encountered and this
   /// returns false.
-  bool parseAAPipeline(AAManager &AA, StringRef PipelineText);
+  Error parseAAPipeline(AAManager &AA, StringRef PipelineText);
 
   /// Register a callback for a default optimizer pipeline extension
   /// point
@@ -565,28 +570,28 @@ private:
   static Optional<std::vector<PipelineElement>>
   parsePipelineText(StringRef Text);
 
-  bool parseModulePass(ModulePassManager &MPM, const PipelineElement &E,
+  Error parseModulePass(ModulePassManager &MPM, const PipelineElement &E,
+                        bool VerifyEachPass, bool DebugLogging);
+  Error parseCGSCCPass(CGSCCPassManager &CGPM, const PipelineElement &E,
                        bool VerifyEachPass, bool DebugLogging);
-  bool parseCGSCCPass(CGSCCPassManager &CGPM, const PipelineElement &E,
+  Error parseFunctionPass(FunctionPassManager &FPM, const PipelineElement &E,
+                          bool VerifyEachPass, bool DebugLogging);
+  Error parseLoopPass(LoopPassManager &LPM, const PipelineElement &E,
                       bool VerifyEachPass, bool DebugLogging);
-  bool parseFunctionPass(FunctionPassManager &FPM, const PipelineElement &E,
-                     bool VerifyEachPass, bool DebugLogging);
-  bool parseLoopPass(LoopPassManager &LPM, const PipelineElement &E,
-                     bool VerifyEachPass, bool DebugLogging);
   bool parseAAPassName(AAManager &AA, StringRef Name);
 
-  bool parseLoopPassPipeline(LoopPassManager &LPM,
-                             ArrayRef<PipelineElement> Pipeline,
-                             bool VerifyEachPass, bool DebugLogging);
-  bool parseFunctionPassPipeline(FunctionPassManager &FPM,
-                                 ArrayRef<PipelineElement> Pipeline,
-                                 bool VerifyEachPass, bool DebugLogging);
-  bool parseCGSCCPassPipeline(CGSCCPassManager &CGPM,
+  Error parseLoopPassPipeline(LoopPassManager &LPM,
                               ArrayRef<PipelineElement> Pipeline,
                               bool VerifyEachPass, bool DebugLogging);
-  bool parseModulePassPipeline(ModulePassManager &MPM,
+  Error parseFunctionPassPipeline(FunctionPassManager &FPM,
+                                  ArrayRef<PipelineElement> Pipeline,
+                                  bool VerifyEachPass, bool DebugLogging);
+  Error parseCGSCCPassPipeline(CGSCCPassManager &CGPM,
                                ArrayRef<PipelineElement> Pipeline,
                                bool VerifyEachPass, bool DebugLogging);
+  Error parseModulePassPipeline(ModulePassManager &MPM,
+                                ArrayRef<PipelineElement> Pipeline,
+                                bool VerifyEachPass, bool DebugLogging);
 
   void addPGOInstrPasses(ModulePassManager &MPM, bool DebugLogging,
                          OptimizationLevel Level, bool RunProfileGen,
diff --git a/lib/LTO/LTOBackend.cpp b/lib/LTO/LTOBackend.cpp
index 20fc40de4b9..1f9d60a5bdf 100644
--- a/lib/LTO/LTOBackend.cpp
+++ b/lib/LTO/LTOBackend.cpp
@@ -162,7 +162,7 @@ static void runNewPMPasses(Config &Conf, Module &Mod, TargetMachine *TM,
   AAManager AA;
 
   // Parse a custom AA pipeline if asked to.
-  if (!PB.parseAAPipeline(AA, "default"))
+  if (auto Err = PB.parseAAPipeline(AA, "default"))
     report_fatal_error("Error parsing default AA pipeline");
 
   LoopAnalysisManager LAM(Conf.DebugPassManager);
@@ -221,9 +221,9 @@ static void runNewPMCustomPasses(Module &Mod, TargetMachine *TM,
 
   // Parse a custom AA pipeline if asked to.
   if (!AAPipelineDesc.empty())
-    if (!PB.parseAAPipeline(AA, AAPipelineDesc))
-      report_fatal_error("unable to parse AA pipeline description: " +
-                         AAPipelineDesc);
+    if (auto Err = PB.parseAAPipeline(AA, AAPipelineDesc))
+      report_fatal_error("unable to parse AA pipeline description '" +
+                         AAPipelineDesc + "': " + toString(std::move(Err)));
 
   LoopAnalysisManager LAM;
   FunctionAnalysisManager FAM;
@@ -246,9 +246,9 @@ static void runNewPMCustomPasses(Module &Mod, TargetMachine *TM,
   MPM.addPass(VerifierPass());
 
   // Now, add all the passes we've been requested to.
-  if (!PB.parsePassPipeline(MPM, PipelineDesc))
-    report_fatal_error("unable to parse pass pipeline description: " +
-                       PipelineDesc);
+  if (auto Err = PB.parsePassPipeline(MPM, PipelineDesc))
+    report_fatal_error("unable to parse pass pipeline description '" +
+                       PipelineDesc + "': " + toString(std::move(Err)));
 
   if (!DisableVerify)
     MPM.addPass(VerifierPass());
diff --git a/lib/Passes/PassBuilder.cpp b/lib/Passes/PassBuilder.cpp
index 09758dc5651..f6313d23e2d 100644
--- a/lib/Passes/PassBuilder.cpp
+++ b/lib/Passes/PassBuilder.cpp
@@ -58,6 +58,7 @@
 #include "llvm/IR/PassManager.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/Regex.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h"
@@ -1402,9 +1403,9 @@ PassBuilder::parsePipelineText(StringRef Text) {
   return {std::move(ResultPipeline)};
 }
 
-bool PassBuilder::parseModulePass(ModulePassManager &MPM,
-                                  const PipelineElement &E, bool VerifyEachPass,
-                                  bool DebugLogging) {
+Error PassBuilder::parseModulePass(ModulePassManager &MPM,
+                                   const PipelineElement &E,
+                                   bool VerifyEachPass, bool DebugLogging) {
   auto &Name = E.Name;
   auto &InnerPipeline = E.InnerPipeline;
 
@@ -1412,50 +1413,56 @@ bool PassBuilder::parseModulePass(ModulePassManager &MPM,
   if (!InnerPipeline.empty()) {
     if (Name == "module") {
       ModulePassManager NestedMPM(DebugLogging);
-      if (!parseModulePassPipeline(NestedMPM, InnerPipeline, VerifyEachPass,
-                                   DebugLogging))
-        return false;
+      if (auto Err = parseModulePassPipeline(NestedMPM, InnerPipeline,
+                                             VerifyEachPass, DebugLogging))
+        return Err;
       MPM.addPass(std::move(NestedMPM));
-      return true;
+      return Error::success();
     }
     if (Name == "cgscc") {
       CGSCCPassManager CGPM(DebugLogging);
-      if (!parseCGSCCPassPipeline(CGPM, InnerPipeline, VerifyEachPass,
-                                  DebugLogging))
-        return false;
+      if (auto Err = parseCGSCCPassPipeline(CGPM, InnerPipeline, VerifyEachPass,
+                                            DebugLogging))
+        return Err;
       MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM)));
-      return true;
+      return Error::success();
     }
     if (Name == "function") {
       FunctionPassManager FPM(DebugLogging);
-      if (!parseFunctionPassPipeline(FPM, InnerPipeline, VerifyEachPass,
-                                     DebugLogging))
-        return false;
+      if (auto Err = parseFunctionPassPipeline(FPM, InnerPipeline,
+                                               VerifyEachPass, DebugLogging))
+        return Err;
       MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
-      return true;
+      return Error::success();
     }
     if (auto Count = parseRepeatPassName(Name)) {
       ModulePassManager NestedMPM(DebugLogging);
-      if (!parseModulePassPipeline(NestedMPM, InnerPipeline, VerifyEachPass,
-                                   DebugLogging))
-        return false;
+      if (auto Err = parseModulePassPipeline(NestedMPM, InnerPipeline,
+                                             VerifyEachPass, DebugLogging))
+        return Err;
       MPM.addPass(createRepeatedPass(*Count, std::move(NestedMPM)));
-      return true;
+      return Error::success();
     }
 
     for (auto &C : ModulePipelineParsingCallbacks)
       if (C(Name, MPM, InnerPipeline))
-        return true;
+        return Error::success();
 
     // Normal passes can't have pipelines.
-    return false;
+    return make_error<StringError>(
+        formatv("invalid use of '{0}' pass as module pipeline", Name).str(),
+        inconvertibleErrorCode());
+    ;
   }
 
   // Manually handle aliases for pre-configured pipeline fragments.
   if (startsWithDefaultPipelineAliasPrefix(Name)) {
     SmallVector<StringRef, 3> Matches;
     if (!DefaultAliasRegex.match(Name, &Matches))
-      return false;
+      return make_error<StringError>(
+          formatv("unknown default pipeline alias '{0}'", Name).str(),
+          inconvertibleErrorCode());
+
     assert(Matches.size() == 3 && "Must capture two matched strings!");
 
     OptimizationLevel L = StringSwitch<OptimizationLevel>(Matches[2])
@@ -1467,7 +1474,7 @@ bool PassBuilder::parseModulePass(ModulePassManager &MPM,
                               .Case("Oz", Oz);
     if (L == O0)
       // At O0 we do nothing at all!
-      return true;
+      return Error::success();
 
     if (Matches[1] == "default") {
       MPM.addPass(buildPerModuleDefaultPipeline(L, DebugLogging));
@@ -1481,38 +1488,40 @@ bool PassBuilder::parseModulePass(ModulePassManager &MPM,
       assert(Matches[1] == "lto" && "Not one of the matched options!");
       MPM.addPass(buildLTODefaultPipeline(L, DebugLogging, nullptr));
     }
-    return true;
+    return Error::success();
   }
 
   // Finally expand the basic registered passes from the .inc file.
 #define MODULE_PASS(NAME, CREATE_PASS)                                         \
   if (Name == NAME) {                                                          \
     MPM.addPass(CREATE_PASS);                                                  \
-    return true;                                                               \
+    return Error::success();                                                   \
   }
 #define MODULE_ANALYSIS(NAME, CREATE_PASS)                                     \
   if (Name == "require<" NAME ">") {                                           \
     MPM.addPass(                                                               \
         RequireAnalysisPass<                                                   \
             std::remove_reference<decltype(CREATE_PASS)>::type, Module>());    \
-    return true;                                                               \
+    return Error::success();                                                   \
   }                                                                            \
   if (Name == "invalidate<" NAME ">") {                                        \
     MPM.addPass(InvalidateAnalysisPass<                                        \
                 std::remove_reference<decltype(CREATE_PASS)>::type>());        \
-    return true;                                                               \
+    return Error::success();                                                   \
   }
 #include "PassRegistry.def"
 
   for (auto &C : ModulePipelineParsingCallbacks)
     if (C(Name, MPM, InnerPipeline))
-      return true;
-  return false;
+      return Error::success();
+  return make_error<StringError>(
+      formatv("unknown module pass '{0}'", Name).str(),
+      inconvertibleErrorCode());
 }
 
-bool PassBuilder::parseCGSCCPass(CGSCCPassManager &CGPM,
-                                 const PipelineElement &E, bool VerifyEachPass,
-                                 bool DebugLogging) {
+Error PassBuilder::parseCGSCCPass(CGSCCPassManager &CGPM,
+                                  const PipelineElement &E, bool VerifyEachPass,
+                                  bool DebugLogging) {
   auto &Name = E.Name;
   auto &InnerPipeline = E.InnerPipeline;
 
@@ -1520,53 +1529,55 @@ bool PassBuilder::parseCGSCCPass(CGSCCPassManager &CGPM,
   if (!InnerPipeline.empty()) {
     if (Name == "cgscc") {
       CGSCCPassManager NestedCGPM(DebugLogging);
-      if (!parseCGSCCPassPipeline(NestedCGPM, InnerPipeline, VerifyEachPass,
-                                  DebugLogging))
-        return false;
+      if (auto Err = parseCGSCCPassPipeline(NestedCGPM, InnerPipeline,
+                                            VerifyEachPass, DebugLogging))
+        return Err;
       // Add the nested pass manager with the appropriate adaptor.
       CGPM.addPass(std::move(NestedCGPM));
-      return true;
+      return Error::success();
     }
     if (Name == "function") {
       FunctionPassManager FPM(DebugLogging);
-      if (!parseFunctionPassPipeline(FPM, InnerPipeline, VerifyEachPass,
-                                     DebugLogging))
-        return false;
+      if (auto Err = parseFunctionPassPipeline(FPM, InnerPipeline,
+                                               VerifyEachPass, DebugLogging))
+        return Err;
       // Add the nested pass manager with the appropriate adaptor.
       CGPM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
-      return true;
+      return Error::success();
     }
     if (auto Count = parseRepeatPassName(Name)) {
       CGSCCPassManager NestedCGPM(DebugLogging);
-      if (!parseCGSCCPassPipeline(NestedCGPM, InnerPipeline, VerifyEachPass,
-                                  DebugLogging))
-        return false;
+      if (auto Err = parseCGSCCPassPipeline(NestedCGPM, InnerPipeline,
+                                            VerifyEachPass, DebugLogging))
+        return Err;
       CGPM.addPass(createRepeatedPass(*Count, std::move(NestedCGPM)));
-      return true;
+      return Error::success();
     }
     if (auto MaxRepetitions = parseDevirtPassName(Name)) {
       CGSCCPassManager NestedCGPM(DebugLogging);
-      if (!parseCGSCCPassPipeline(NestedCGPM, InnerPipeline, VerifyEachPass,
-                                  DebugLogging))
-        return false;
+      if (auto Err = parseCGSCCPassPipeline(NestedCGPM, InnerPipeline,
+                                            VerifyEachPass, DebugLogging))
+        return Err;
       CGPM.addPass(
           createDevirtSCCRepeatedPass(std::move(NestedCGPM), *MaxRepetitions));
-      return true;
+      return Error::success();
     }
 
     for (auto &C : CGSCCPipelineParsingCallbacks)
       if (C(Name, CGPM, InnerPipeline))
-        return true;
+        return Error::success();
 
     // Normal passes can't have pipelines.
-    return false;
+    return make_error<StringError>(
+        formatv("invalid use of '{0}' pass as cgscc pipeline", Name).str(),
+        inconvertibleErrorCode());
   }
 
 // Now expand the basic registered passes from the .inc file.
 #define CGSCC_PASS(NAME, CREATE_PASS)                                          \
   if (Name == NAME) {                                                          \
     CGPM.addPass(CREATE_PASS);                                                 \
-    return true;                                                               \
+    return Error::success();                                                   \
   }
 #define CGSCC_ANALYSIS(NAME, CREATE_PASS)                                      \
   if (Name == "require<" NAME ">") {                                           \
@@ -1574,24 +1585,26 @@ bool PassBuilder::parseCGSCCPass(CGSCCPassManager &CGPM,
                  std::remove_reference<decltype(CREATE_PASS)>::type,           \
                  LazyCallGraph::SCC, CGSCCAnalysisManager, LazyCallGraph &,    \
                  CGSCCUpdateResult &>());                                      \
-    return true;                                                               \
+    return Error::success();                                                   \
   }                                                                            \
   if (Name == "invalidate<" NAME ">") {                                        \
     CGPM.addPass(InvalidateAnalysisPass<                                       \
                  std::remove_reference<decltype(CREATE_PASS)>::type>());       \
-    return true;                                                               \
+    return Error::success();                                                   \
   }
 #include "PassRegistry.def"
 
   for (auto &C : CGSCCPipelineParsingCallbacks)
     if (C(Name, CGPM, InnerPipeline))
-      return true;
-  return false;
+      return Error::success();
+  return make_error<StringError>(
+      formatv("unknown cgscc pass '{0}'", Name).str(),
+      inconvertibleErrorCode());
 }
 
-bool PassBuilder::parseFunctionPass(FunctionPassManager &FPM,
-                                    const PipelineElement &E,
-                                    bool VerifyEachPass, bool DebugLogging) {
+Error PassBuilder::parseFunctionPass(FunctionPassManager &FPM,
+                                     const PipelineElement &E,
+                                     bool VerifyEachPass, bool DebugLogging) {
   auto &Name = E.Name;
   auto &InnerPipeline = E.InnerPipeline;
 
@@ -1599,68 +1612,72 @@ bool PassBuilder::parseFunctionPass(FunctionPassManager &FPM,
   if (!InnerPipeline.empty()) {
     if (Name == "function") {
       FunctionPassManager NestedFPM(DebugLogging);
-      if (!parseFunctionPassPipeline(NestedFPM, InnerPipeline, VerifyEachPass,
-                                     DebugLogging))
-        return false;
+      if (auto Err = parseFunctionPassPipeline(NestedFPM, InnerPipeline,
+                                               VerifyEachPass, DebugLogging))
+        return Err;
       // Add the nested pass manager with the appropriate adaptor.
       FPM.addPass(std::move(NestedFPM));
-      return true;
+      return Error::success();
     }
     if (Name == "loop") {
       LoopPassManager LPM(DebugLogging);
-      if (!parseLoopPassPipeline(LPM, InnerPipeline, VerifyEachPass,
-                                 DebugLogging))
-        return false;
+      if (auto Err = parseLoopPassPipeline(LPM, InnerPipeline, VerifyEachPass,
+                                           DebugLogging))
+        return Err;
       // Add the nested pass manager with the appropriate adaptor.
       FPM.addPass(
           createFunctionToLoopPassAdaptor(std::move(LPM), DebugLogging));
-      return true;
+      return Error::success();
     }
     if (auto Count = parseRepeatPassName(Name)) {
       FunctionPassManager NestedFPM(DebugLogging);
-      if (!parseFunctionPassPipeline(NestedFPM, InnerPipeline, VerifyEachPass,
-                                     DebugLogging))
-        return false;
+      if (auto Err = parseFunctionPassPipeline(NestedFPM, InnerPipeline,
+                                               VerifyEachPass, DebugLogging))
+        return Err;
       FPM.addPass(createRepeatedPass(*Count, std::move(NestedFPM)));
-      return true;
+      return Error::success();
     }
 
     for (auto &C : FunctionPipelineParsingCallbacks)
       if (C(Name, FPM, InnerPipeline))
-        return true;
+        return Error::success();
 
     // Normal passes can't have pipelines.
-    return false;
+    return make_error<StringError>(
+        formatv("invalid use of '{0}' pass as function pipeline", Name).str(),
+        inconvertibleErrorCode());
   }
 
 // Now expand the basic registered passes from the .inc file.
 #define FUNCTION_PASS(NAME, CREATE_PASS)                                       \
   if (Name == NAME) {                                                          \
     FPM.addPass(CREATE_PASS);                                                  \
-    return true;                                                               \
+    return Error::success();                                                   \
   }
 #define FUNCTION_ANALYSIS(NAME, CREATE_PASS)                                   \
   if (Name == "require<" NAME ">") {                                           \
     FPM.addPass(                                                               \
         RequireAnalysisPass<                                                   \
             std::remove_reference<decltype(CREATE_PASS)>::type, Function>());  \
-    return true;                                                               \
+    return Error::success();                                                   \
   }                                                                            \
   if (Name == "invalidate<" NAME ">") {                                        \
     FPM.addPass(InvalidateAnalysisPass<                                        \
                 std::remove_reference<decltype(CREATE_PASS)>::type>());        \
-    return true;                                                               \
+    return Error::success();                                                   \
   }
 #include "PassRegistry.def"
 
   for (auto &C : FunctionPipelineParsingCallbacks)
     if (C(Name, FPM, InnerPipeline))
-      return true;
-  return false;
+      return Error::success();
+  return make_error<StringError>(
+      formatv("unknown function pass '{0}'", Name).str(),
+      inconvertibleErrorCode());
 }
 
-bool PassBuilder::parseLoopPass(LoopPassManager &LPM, const PipelineElement &E,
-                                bool VerifyEachPass, bool DebugLogging) {
+Error PassBuilder::parseLoopPass(LoopPassManager &LPM, const PipelineElement &E,
+                                 bool VerifyEachPass, bool DebugLogging) {
   StringRef Name = E.Name;
   auto &InnerPipeline = E.InnerPipeline;
 
@@ -1668,35 +1685,37 @@ bool PassBuilder::parseLoopPass(LoopPassManager &LPM, const PipelineElement &E,
   if (!InnerPipeline.empty()) {
     if (Name == "loop") {
       LoopPassManager NestedLPM(DebugLogging);
-      if (!parseLoopPassPipeline(NestedLPM, InnerPipeline, VerifyEachPass,
-                                 DebugLogging))
-        return false;
+      if (auto Err = parseLoopPassPipeline(NestedLPM, InnerPipeline,
+                                           VerifyEachPass, DebugLogging))
+        return Err;
       // Add the nested pass manager with the appropriate adaptor.
       LPM.addPass(std::move(NestedLPM));
-      return true;
+      return Error::success();
     }
     if (auto Count = parseRepeatPassName(Name)) {
       LoopPassManager NestedLPM(DebugLogging);
-      if (!parseLoopPassPipeline(NestedLPM, InnerPipeline, VerifyEachPass,
-                                 DebugLogging))
-        return false;
+      if (auto Err = parseLoopPassPipeline(NestedLPM, InnerPipeline,
+                                           VerifyEachPass, DebugLogging))
+        return Err;
       LPM.addPass(createRepeatedPass(*Count, std::move(NestedLPM)));
-      return true;
+      return Error::success();
     }
 
     for (auto &C : LoopPipelineParsingCallbacks)
       if (C(Name, LPM, InnerPipeline))
-        return true;
+        return Error::success();
 
     // Normal passes can't have pipelines.
-    return false;
+    return make_error<StringError>(
+        formatv("invalid use of '{0}' pass as loop pipeline", Name).str(),
+        inconvertibleErrorCode());
   }
 
 // Now expand the basic registered passes from the .inc file.
 #define LOOP_PASS(NAME, CREATE_PASS)                                           \
   if (Name == NAME) {                                                          \
     LPM.addPass(CREATE_PASS);                                                  \
-    return true;                                                               \
+    return Error::success();                                                   \
   }
 #define LOOP_ANALYSIS(NAME, CREATE_PASS)                                       \
   if (Name == "require<" NAME ">") {                                           \
@@ -1704,19 +1723,20 @@ bool PassBuilder::parseLoopPass(LoopPassManager &LPM, const PipelineElement &E,
                 std::remove_reference<decltype(CREATE_PASS)>::type, Loop,      \
                 LoopAnalysisManager, LoopStandardAnalysisResults &,            \
                 LPMUpdater &>());                                              \
-    return true;                                                               \
+    return Error::success();                                                   \
   }                                                                            \
   if (Name == "invalidate<" NAME ">") {                                        \
     LPM.addPass(InvalidateAnalysisPass<                                        \
                 std::remove_reference<decltype(CREATE_PASS)>::type>());        \
-    return true;                                                               \
+    return Error::success();                                                   \
   }
 #include "PassRegistry.def"
 
   for (auto &C : LoopPipelineParsingCallbacks)
     if (C(Name, LPM, InnerPipeline))
-      return true;
-  return false;
+      return Error::success();
+  return make_error<StringError>(formatv("unknown loop pass '{0}'", Name).str(),
+                                 inconvertibleErrorCode());
 }
 
 bool PassBuilder::parseAAPassName(AAManager &AA, StringRef Name) {
@@ -1740,41 +1760,42 @@ bool PassBuilder::parseAAPassName(AAManager &AA, StringRef Name) {
   return false;
 }
 
-bool PassBuilder::parseLoopPassPipeline(LoopPassManager &LPM,
-                                        ArrayRef<PipelineElement> Pipeline,
-                                        bool VerifyEachPass,
-                                        bool DebugLogging) {
+Error PassBuilder::parseLoopPassPipeline(LoopPassManager &LPM,
+                                         ArrayRef<PipelineElement> Pipeline,
+                                         bool VerifyEachPass,
+                                         bool DebugLogging) {
   for (const auto &Element : Pipeline) {
-    if (!parseLoopPass(LPM, Element, VerifyEachPass, DebugLogging))
-      return false;
+    if (auto Err = parseLoopPass(LPM, Element, VerifyEachPass, DebugLogging))
+      return Err;
     // FIXME: No verifier support for Loop passes!
   }
-  return true;
+  return Error::success();
 }
 
-bool PassBuilder::parseFunctionPassPipeline(FunctionPassManager &FPM,
-                                            ArrayRef<PipelineElement> Pipeline,
-                                            bool VerifyEachPass,
-                                            bool DebugLogging) {
+Error PassBuilder::parseFunctionPassPipeline(FunctionPassManager &FPM,
+                                             ArrayRef<PipelineElement> Pipeline,
+                                             bool VerifyEachPass,
+                                             bool DebugLogging) {
   for (const auto &Element : Pipeline) {
-    if (!parseFunctionPass(FPM, Element, VerifyEachPass, DebugLogging))
-      return false;
+    if (auto Err =
+            parseFunctionPass(FPM, Element, VerifyEachPass, DebugLogging))
+      return Err;
     if (VerifyEachPass)
       FPM.addPass(VerifierPass());
   }
-  return true;
+  return Error::success();
 }
 
-bool PassBuilder::parseCGSCCPassPipeline(CGSCCPassManager &CGPM,
-                                         ArrayRef<PipelineElement> Pipeline,
-                                         bool VerifyEachPass,
-                                         bool DebugLogging) {
+Error PassBuilder::parseCGSCCPassPipeline(CGSCCPassManager &CGPM,
+                                          ArrayRef<PipelineElement> Pipeline,
+                                          bool VerifyEachPass,
+                                          bool DebugLogging) {
   for (const auto &Element : Pipeline) {
-    if (!parseCGSCCPass(CGPM, Element, VerifyEachPass, DebugLogging))
-      return false;
+    if (auto Err = parseCGSCCPass(CGPM, Element, VerifyEachPass, DebugLogging))
+      return Err;
     // FIXME: No verifier support for CGSCC passes!
   }
-  return true;
+  return Error::success();
 }
 
 void PassBuilder::crossRegisterProxies(LoopAnalysisManager &LAM,
@@ -1790,28 +1811,30 @@ void PassBuilder::crossRegisterProxies(LoopAnalysisManager &LAM,
   LAM.registerPass([&] { return FunctionAnalysisManagerLoopProxy(FAM); });
 }
 
-bool PassBuilder::parseModulePassPipeline(ModulePassManager &MPM,
-                                          ArrayRef<PipelineElement> Pipeline,
-                                          bool VerifyEachPass,
-                                          bool DebugLogging) {
+Error PassBuilder::parseModulePassPipeline(ModulePassManager &MPM,
+                                           ArrayRef<PipelineElement> Pipeline,
+                                           bool VerifyEachPass,
+                                           bool DebugLogging) {
   for (const auto &Element : Pipeline) {
-    if (!parseModulePass(MPM, Element, VerifyEachPass, DebugLogging))
-      return false;
+    if (auto Err = parseModulePass(MPM, Element, VerifyEachPass, DebugLogging))
+      return Err;
     if (VerifyEachPass)
       MPM.addPass(VerifierPass());
   }
-  return true;
+  return Error::success();
 }
 
 // Primary pass pipeline description parsing routine for a \c ModulePassManager
 // FIXME: Should this routine accept a TargetMachine or require the caller to
 // pre-populate the analysis managers with target-specific stuff?
-bool PassBuilder::parsePassPipeline(ModulePassManager &MPM,
-                                    StringRef PipelineText, bool VerifyEachPass,
-                                    bool DebugLogging) {
+Error PassBuilder::parsePassPipeline(ModulePassManager &MPM,
+                                     StringRef PipelineText,
+                                     bool VerifyEachPass, bool DebugLogging) {
   auto Pipeline = parsePipelineText(PipelineText);
   if (!Pipeline || Pipeline->empty())
-    return false;
+    return make_error<StringError>(
+        formatv("invalid pipeline '{0}'", PipelineText).str(),
+        inconvertibleErrorCode());
 
   // If the first name isn't at the module layer, wrap the pipeline up
   // automatically.
@@ -1828,73 +1851,106 @@ bool PassBuilder::parsePassPipeline(ModulePassManager &MPM,
     } else {
       for (auto &C : TopLevelPipelineParsingCallbacks)
         if (C(MPM, *Pipeline, VerifyEachPass, DebugLogging))
-          return true;
-
-      // Unknown pass name!
-      return false;
+          return Error::success();
+
+      // Unknown pass or pipeline name!
+      auto &InnerPipeline = Pipeline->front().InnerPipeline;
+      return make_error<StringError>(
+          formatv("unknown {0} name '{1}'",
+                  (InnerPipeline.empty() ? "pass" : "pipeline"), FirstName)
+              .str(),
+          inconvertibleErrorCode());
     }
   }
 
-  return parseModulePassPipeline(MPM, *Pipeline, VerifyEachPass, DebugLogging);
+  if (auto Err =
+          parseModulePassPipeline(MPM, *Pipeline, VerifyEachPass, DebugLogging))
+    return Err;
+  return Error::success();
 }
 
 // Primary pass pipeline description parsing routine for a \c CGSCCPassManager
-bool PassBuilder::parsePassPipeline(CGSCCPassManager &CGPM,
-                                    StringRef PipelineText, bool VerifyEachPass,
-                                    bool DebugLogging) {
+Error PassBuilder::parsePassPipeline(CGSCCPassManager &CGPM,
+                                     StringRef PipelineText,
+                                     bool VerifyEachPass, bool DebugLogging) {
   auto Pipeline = parsePipelineText(PipelineText);
   if (!Pipeline || Pipeline->empty())
-    return false;
+    return make_error<StringError>(
+        formatv("invalid pipeline '{0}'", PipelineText).str(),
+        inconvertibleErrorCode());
 
   StringRef FirstName = Pipeline->front().Name;
   if (!isCGSCCPassName(FirstName, CGSCCPipelineParsingCallbacks))
-    return false;
-
-  return parseCGSCCPassPipeline(CGPM, *Pipeline, VerifyEachPass, DebugLogging);
+    return make_error<StringError>(
+        formatv("unknown cgscc pass '{0}' in pipeline '{1}'", FirstName,
+                PipelineText)
+            .str(),
+        inconvertibleErrorCode());
+
+  if (auto Err =
+          parseCGSCCPassPipeline(CGPM, *Pipeline, VerifyEachPass, DebugLogging))
+    return Err;
+  return Error::success();
 }
 
 // Primary pass pipeline description parsing routine for a \c
 // FunctionPassManager
-bool PassBuilder::parsePassPipeline(FunctionPassManager &FPM,
-                                    StringRef PipelineText, bool VerifyEachPass,
-                                    bool DebugLogging) {
+Error PassBuilder::parsePassPipeline(FunctionPassManager &FPM,
+                                     StringRef PipelineText,
+                                     bool VerifyEachPass, bool DebugLogging) {
   auto Pipeline = parsePipelineText(PipelineText);
   if (!Pipeline || Pipeline->empty())
-    return false;
+    return make_error<StringError>(
+        formatv("invalid pipeline '{0}'", PipelineText).str(),
+        inconvertibleErrorCode());
 
   StringRef FirstName = Pipeline->front().Name;
   if (!isFunctionPassName(FirstName, FunctionPipelineParsingCallbacks))
-    return false;
-
-  return parseFunctionPassPipeline(FPM, *Pipeline, VerifyEachPass,
-                                   DebugLogging);
+    return make_error<StringError>(
+        formatv("unknown function pass '{0}' in pipeline '{1}'", FirstName,
+                PipelineText)
+            .str(),
+        inconvertibleErrorCode());
+
+  if (auto Err = parseFunctionPassPipeline(FPM, *Pipeline, VerifyEachPass,
+                                           DebugLogging))
+    return Err;
+  return Error::success();
 }
 
 // Primary pass pipeline description parsing routine for a \c LoopPassManager
-bool PassBuilder::parsePassPipeline(LoopPassManager &CGPM,
-                                    StringRef PipelineText, bool VerifyEachPass,
-                                    bool DebugLogging) {
+Error PassBuilder::parsePassPipeline(LoopPassManager &CGPM,
+                                     StringRef PipelineText,
+                                     bool VerifyEachPass, bool DebugLogging) {
   auto Pipeline = parsePipelineText(PipelineText);
   if (!Pipeline || Pipeline->empty())
-    return false;
+    return make_error<StringError>(
+        formatv("invalid pipeline '{0}'", PipelineText).str(),
+        inconvertibleErrorCode());
 
-  return parseLoopPassPipeline(CGPM, *Pipeline, VerifyEachPass, DebugLogging);
+  if (auto Err =
+          parseLoopPassPipeline(CGPM, *Pipeline, VerifyEachPass, DebugLogging))
+    return Err;
+
+  return Error::success();
 }
 
-bool PassBuilder::parseAAPipeline(AAManager &AA, StringRef PipelineText) {
+Error PassBuilder::parseAAPipeline(AAManager &AA, StringRef PipelineText) {
   // If the pipeline just consists of the word 'default' just replace the AA
   // manager with our default one.
   if (PipelineText == "default") {
     AA = buildDefaultAAPipeline();
-    return true;
+    return Error::success();
   }
 
   while (!PipelineText.empty()) {
     StringRef Name;
     std::tie(Name, PipelineText) = PipelineText.split(',');
     if (!parseAAPassName(AA, Name))
-      return false;
+      return make_error<StringError>(
+          formatv("unknown alias analysis name '{0}'", Name).str(),
+          inconvertibleErrorCode());
   }
 
-  return true;
+  return Error::success();
 }
diff --git a/test/Other/pass-pipeline-parsing.ll b/test/Other/pass-pipeline-parsing.ll
index b303318c796..d26d000ec8d 100644
--- a/test/Other/pass-pipeline-parsing.ll
+++ b/test/Other/pass-pipeline-parsing.ll
@@ -54,52 +54,52 @@
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='no-op-module)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-UNBALANCED1
-; CHECK-UNBALANCED1: unable to parse pass pipeline description
+; CHECK-UNBALANCED1: invalid pipeline 'no-op-module)'
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='module(no-op-module))' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-UNBALANCED2
-; CHECK-UNBALANCED2: unable to parse pass pipeline description
+; CHECK-UNBALANCED2: invalid pipeline 'module(no-op-module))'
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='module(no-op-module' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-UNBALANCED3
-; CHECK-UNBALANCED3: unable to parse pass pipeline description
+; CHECK-UNBALANCED3: invalid pipeline 'module(no-op-module'
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='no-op-function)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-UNBALANCED4
-; CHECK-UNBALANCED4: unable to parse pass pipeline description
+; CHECK-UNBALANCED4: invalid pipeline 'no-op-function)'
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='function(no-op-function))' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-UNBALANCED5
-; CHECK-UNBALANCED5: unable to parse pass pipeline description
+; CHECK-UNBALANCED5: invalid pipeline 'function(no-op-function))'
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='function(function(no-op-function)))' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-UNBALANCED6
-; CHECK-UNBALANCED6: unable to parse pass pipeline description
+; CHECK-UNBALANCED6: invalid pipeline 'function(function(no-op-function)))'
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='function(no-op-function' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-UNBALANCED7
-; CHECK-UNBALANCED7: unable to parse pass pipeline description
+; CHECK-UNBALANCED7: invalid pipeline 'function(no-op-function'
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='function(function(no-op-function)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-UNBALANCED8
-; CHECK-UNBALANCED8: unable to parse pass pipeline description
+; CHECK-UNBALANCED8: invalid pipeline 'function(function(no-op-function)'
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='no-op-module,)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-UNBALANCED9
-; CHECK-UNBALANCED9: unable to parse pass pipeline description
+; CHECK-UNBALANCED9: invalid pipeline 'no-op-module,)'
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='no-op-function,)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-UNBALANCED10
-; CHECK-UNBALANCED10: unable to parse pass pipeline description
+; CHECK-UNBALANCED10: invalid pipeline 'no-op-function,)'
 
 ; RUN: opt -disable-output -debug-pass-manager \
 ; RUN:     -passes=no-op-cgscc,no-op-cgscc %s 2>&1 \
@@ -176,37 +176,86 @@
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='function(no-op-function)function(no-op-function)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-MISSING-COMMA1
-; CHECK-MISSING-COMMA1: unable to parse pass pipeline description
+; CHECK-MISSING-COMMA1: invalid pipeline 'function(no-op-function)function(no-op-function)'
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='function()' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-EMPTY-INNER-PIPELINE
-; CHECK-EMPTY-INNER-PIPELINE: unable to parse pass pipeline description
+; CHECK-EMPTY-INNER-PIPELINE: unknown function pass ''
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='no-op-module(no-op-module,whatever)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-PIPELINE-ON-MODULE-PASS
-; CHECK-PIPELINE-ON-MODULE-PASS: unable to parse pass pipeline description
+; CHECK-PIPELINE-ON-MODULE-PASS: invalid use of 'no-op-module' pass as module pipeline
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='no-op-cgscc(no-op-cgscc,whatever)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-PIPELINE-ON-CGSCC-PASS
-; CHECK-PIPELINE-ON-CGSCC-PASS: unable to parse pass pipeline description
+; CHECK-PIPELINE-ON-CGSCC-PASS: invalid use of 'no-op-cgscc' pass as cgscc pipeline
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='no-op-function(no-op-function,whatever)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-PIPELINE-ON-FUNCTION-PASS
-; CHECK-PIPELINE-ON-FUNCTION-PASS: unable to parse pass pipeline description
+; CHECK-PIPELINE-ON-FUNCTION-PASS: invalid use of 'no-op-function' pass as function pipeline
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='no-op-loop(no-op-loop,whatever)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-PIPELINE-ON-LOOP-PASS
-; CHECK-PIPELINE-ON-LOOP-PASS: unable to parse pass pipeline description
+; CHECK-PIPELINE-ON-LOOP-PASS: invalid use of 'no-op-loop' pass as loop pipeline
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='no-op-function()' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-EMPTY-PIPELINE-ON-PASS
-; CHECK-EMPTY-PIPELINE-ON-PASS: unable to parse pass pipeline description
+; CHECK-EMPTY-PIPELINE-ON-PASS: invalid use of 'no-op-function' pass as function pipeline
+
+; RUN: not opt -passes='no-op-module,bad' \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-MODULE
+; CHECK-UNKNOWN-MODULE: opt: unknown module pass 'bad'
+
+; RUN: not opt -passes='no-op-loop,bad' \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-LOOP
+; CHECK-UNKNOWN-LOOP: opt: unknown loop pass 'bad'
+
+; RUN: not opt -passes='no-op-cgscc,bad' \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-CGSCC
+; CHECK-UNKNOWN-CGSCC: opt: unknown cgscc pass 'bad'
+
+; RUN: not opt -passes='no-op-function,bad' \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-FUNCTION
+; RUN: not opt -passes='function(bad,pipeline,text)' \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-FUNCTION
+; RUN: not opt -passes='module(no-op-module,function(bad,pipeline,text))' \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-FUNCTION
+; RUN: not opt -passes='no-op-module,function(bad,pipeline,text)' \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-FUNCTION
+; RUN: not opt -passes='module(cgscc(function(bad,pipeline,text)))' \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-FUNCTION
+; CHECK-UNKNOWN-FUNCTION: opt: unknown function pass 'bad'
+
+; RUN: not opt -aa-pipeline=bad -passes=no-op-function \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=AA-PIPELINE-ERR
+; AA-PIPELINE-ERR: unknown alias analysis name 'bad'
+; RUN: opt -passes-ep-peephole=bad -passes=no-op-function \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-PEEPHOLE-ERR
+; PASSES-EP-PEEPHOLE-ERR: Could not parse -passes-ep-peephole pipeline: unknown function pass 'bad'
+; RUN: opt -passes-ep-late-loop-optimizations=bad -passes=no-op-function \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-LATELOOPOPT-ERR
+; PASSES-EP-LATELOOPOPT-ERR: Could not parse -passes-ep-late-loop-optimizations pipeline: unknown loop pass 'bad'
+; RUN: opt -passes-ep-loop-optimizer-end=bad -passes=no-op-function \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-LOOPOPTEND-ERR
+; PASSES-EP-LOOPOPTEND-ERR: Could not parse -passes-ep-loop-optimizer-end pipeline: unknown loop pass 'bad'
+; RUN: opt -passes-ep-scalar-optimizer-late=bad -passes=no-op-function \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-SCALAROPTLATE-ERR
+; PASSES-EP-SCALAROPTLATE-ERR: Could not parse -passes-ep-scalar-optimizer-late pipeline: unknown function pass 'bad'
+; RUN: opt -passes-ep-cgscc-optimizer-late=bad -passes=no-op-function \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-CGSCCOPTLATE-ERR
+; PASSES-EP-CGSCCOPTLATE-ERR: Could not parse -passes-ep-cgscc-optimizer-late pipeline: unknown cgscc pass 'bad'
+; RUN: opt -passes-ep-vectorizer-start=bad -passes=no-op-function \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-VECTORIZERSTART-ERR
+; PASSES-EP-VECTORIZERSTART-ERR: Could not parse -passes-ep-vectorizer-start pipeline: unknown function pass 'bad'
+; RUN: opt -passes-ep-pipeline-start=bad -passes=no-op-function \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-PIPELINESTART-ERR
+; PASSES-EP-PIPELINESTART-ERR: Could not parse -passes-ep-pipeline-start pipeline: unknown pass name 'bad'
 
 define void @f() {
 entry:
diff --git a/test/tools/llvm-lto2/X86/pipeline.ll b/test/tools/llvm-lto2/X86/pipeline.ll
index 29276d8d13a..9ab81ac70a7 100644
--- a/test/tools/llvm-lto2/X86/pipeline.ll
+++ b/test/tools/llvm-lto2/X86/pipeline.ll
@@ -32,11 +32,11 @@ define void @patatino() {
 ; RUN:  -r %t1.bc,patatino,px -opt-pipeline foogoo 2>&1 | \
 ; RUN:  FileCheck %s --check-prefix=ERR
 
-; ERR: LLVM ERROR: unable to parse pass pipeline description: foogoo
+; ERR: LLVM ERROR: unable to parse pass pipeline description 'foogoo': unknown pass name 'foogoo'
 
 ; RUN: not llvm-lto2 run %t1.bc -o %t.o \
 ; RUN:  -r %t1.bc,patatino,px -aa-pipeline patatino \
 ; RUN:  -opt-pipeline loweratomic 2>&1 | \
 ; RUN:  FileCheck %s --check-prefix=AAERR
 
-; AAERR: LLVM ERROR: unable to parse AA pipeline description: patatino
+; AAERR: LLVM ERROR: unable to parse AA pipeline description 'patatino': unknown alias analysis name 'patatino'
diff --git a/test/tools/llvm-opt-fuzzer/command-line.ll b/test/tools/llvm-opt-fuzzer/command-line.ll
index f747bba431b..8c3f6b60154 100644
--- a/test/tools/llvm-opt-fuzzer/command-line.ll
+++ b/test/tools/llvm-opt-fuzzer/command-line.ll
@@ -13,7 +13,7 @@
 
 ; Don't start with incorrect passes specified
 ; RUN: not llvm-opt-fuzzer %t -ignore_remaining_args=1 -mtriple x86_64 -passes no-pass 2>&1 | FileCheck -check-prefix=PIPELINE %s
-; PIPELINE: can't parse pass pipeline
+; PIPELINE: unknown pass name 'no-pass'
 
 ; Correct command line
 ; RUN: llvm-opt-fuzzer %t -ignore_remaining_args=1 -mtriple x86_64 -passes instcombine 2>&1 | FileCheck -check-prefix=CORRECT %s
diff --git a/tools/llvm-opt-fuzzer/llvm-opt-fuzzer.cpp b/tools/llvm-opt-fuzzer/llvm-opt-fuzzer.cpp
index 98d5428ddd1..57e75b1db9e 100644
--- a/tools/llvm-opt-fuzzer/llvm-opt-fuzzer.cpp
+++ b/tools/llvm-opt-fuzzer/llvm-opt-fuzzer.cpp
@@ -144,9 +144,10 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
   PB.registerLoopAnalyses(LAM);
   PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
 
-  bool Ok = PB.parsePassPipeline(MPM, PassPipeline, false, false);
-  assert(Ok && "Should have been checked during fuzzer initialization");
-  (void)Ok; // silence unused variable warning on release builds
+  auto Err = PB.parsePassPipeline(MPM, PassPipeline, false, false);
+  assert(!Err && "Should have been checked during fuzzer initialization");
+  // Only fail with assert above, otherwise ignore the parsing error.
+  consumeError(std::move(Err));
 
   // Run passes which we need to test
   //
@@ -235,8 +236,8 @@ extern "C" LLVM_ATTRIBUTE_USED int LLVMFuzzerInitialize(
 
   PassBuilder PB(TM.get());
   ModulePassManager MPM;
-  if (!PB.parsePassPipeline(MPM, PassPipeline, false, false)) {
-    errs() << *argv[0] << ": can't parse pass pipeline\n";
+  if (auto Err = PB.parsePassPipeline(MPM, PassPipeline, false, false)) {
+    errs() << *argv[0] << ": " << toString(std::move(Err)) << "\n";
     exit(1);
   }
 
diff --git a/tools/opt/NewPMDriver.cpp b/tools/opt/NewPMDriver.cpp
index e63547a79d0..e2f9a06523a 100644
--- a/tools/opt/NewPMDriver.cpp
+++ b/tools/opt/NewPMDriver.cpp
@@ -118,18 +118,20 @@ static cl::opt<bool> DebugInfoForProfiling(
 /// @}}
 
 template <typename PassManagerT>
-bool tryParsePipelineText(PassBuilder &PB, StringRef PipelineText) {
-  if (PipelineText.empty())
+bool tryParsePipelineText(PassBuilder &PB,
+                          const cl::opt<std::string> &PipelineOpt) {
+  if (PipelineOpt.empty())
     return false;
 
   // Verify the pipeline is parseable:
   PassManagerT PM;
-  if (PB.parsePassPipeline(PM, PipelineText))
-    return true;
-
-  errs() << "Could not parse pipeline '" << PipelineText
-         << "'. I'm going to igore it.\n";
-  return false;
+  if (auto Err = PB.parsePassPipeline(PM, PipelineOpt)) {
+    errs() << "Could not parse -" << PipelineOpt.ArgStr
+           << " pipeline: " << toString(std::move(Err))
+           << "... I'm going to ignore it.\n";
+    return false;
+  }
+  return true;
 }
 
 /// If one of the EPPipeline command line options was given, register callbacks
@@ -137,50 +139,61 @@ bool tryParsePipelineText(PassBuilder &PB, StringRef PipelineText) {
 static void registerEPCallbacks(PassBuilder &PB, bool VerifyEachPass,
                                 bool DebugLogging) {
   if (tryParsePipelineText<FunctionPassManager>(PB, PeepholeEPPipeline))
-    PB.registerPeepholeEPCallback([&PB, VerifyEachPass, DebugLogging](
-        FunctionPassManager &PM, PassBuilder::OptimizationLevel Level) {
-      PB.parsePassPipeline(PM, PeepholeEPPipeline, VerifyEachPass,
-                           DebugLogging);
-    });
+    PB.registerPeepholeEPCallback(
+        [&PB, VerifyEachPass, DebugLogging](
+            FunctionPassManager &PM, PassBuilder::OptimizationLevel Level) {
+          ExitOnError Err("Unable to parse PeepholeEP pipeline: ");
+          Err(PB.parsePassPipeline(PM, PeepholeEPPipeline, VerifyEachPass,
+                                   DebugLogging));
+        });
   if (tryParsePipelineText<LoopPassManager>(PB,
                                             LateLoopOptimizationsEPPipeline))
     PB.registerLateLoopOptimizationsEPCallback(
         [&PB, VerifyEachPass, DebugLogging](
             LoopPassManager &PM, PassBuilder::OptimizationLevel Level) {
-          PB.parsePassPipeline(PM, LateLoopOptimizationsEPPipeline,
-                               VerifyEachPass, DebugLogging);
+          ExitOnError Err("Unable to parse LateLoopOptimizationsEP pipeline: ");
+          Err(PB.parsePassPipeline(PM, LateLoopOptimizationsEPPipeline,
+                                   VerifyEachPass, DebugLogging));
         });
   if (tryParsePipelineText<LoopPassManager>(PB, LoopOptimizerEndEPPipeline))
-    PB.registerLoopOptimizerEndEPCallback([&PB, VerifyEachPass, DebugLogging](
-        LoopPassManager &PM, PassBuilder::OptimizationLevel Level) {
-      PB.parsePassPipeline(PM, LoopOptimizerEndEPPipeline, VerifyEachPass,
-                           DebugLogging);
-    });
+    PB.registerLoopOptimizerEndEPCallback(
+        [&PB, VerifyEachPass, DebugLogging](
+            LoopPassManager &PM, PassBuilder::OptimizationLevel Level) {
+          ExitOnError Err("Unable to parse LoopOptimizerEndEP pipeline: ");
+          Err(PB.parsePassPipeline(PM, LoopOptimizerEndEPPipeline,
+                                   VerifyEachPass, DebugLogging));
+        });
   if (tryParsePipelineText<FunctionPassManager>(PB,
                                                 ScalarOptimizerLateEPPipeline))
     PB.registerScalarOptimizerLateEPCallback(
         [&PB, VerifyEachPass, DebugLogging](
             FunctionPassManager &PM, PassBuilder::OptimizationLevel Level) {
-          PB.parsePassPipeline(PM, ScalarOptimizerLateEPPipeline,
-                               VerifyEachPass, DebugLogging);
+          ExitOnError Err("Unable to parse ScalarOptimizerLateEP pipeline: ");
+          Err(PB.parsePassPipeline(PM, ScalarOptimizerLateEPPipeline,
+                                   VerifyEachPass, DebugLogging));
         });
   if (tryParsePipelineText<CGSCCPassManager>(PB, CGSCCOptimizerLateEPPipeline))
-    PB.registerCGSCCOptimizerLateEPCallback([&PB, VerifyEachPass, DebugLogging](
-        CGSCCPassManager &PM, PassBuilder::OptimizationLevel Level) {
-      PB.parsePassPipeline(PM, CGSCCOptimizerLateEPPipeline, VerifyEachPass,
-                           DebugLogging);
-    });
+    PB.registerCGSCCOptimizerLateEPCallback(
+        [&PB, VerifyEachPass, DebugLogging](
+            CGSCCPassManager &PM, PassBuilder::OptimizationLevel Level) {
+          ExitOnError Err("Unable to parse CGSCCOptimizerLateEP pipeline: ");
+          Err(PB.parsePassPipeline(PM, CGSCCOptimizerLateEPPipeline,
+                                   VerifyEachPass, DebugLogging));
+        });
   if (tryParsePipelineText<FunctionPassManager>(PB, VectorizerStartEPPipeline))
-    PB.registerVectorizerStartEPCallback([&PB, VerifyEachPass, DebugLogging](
-        FunctionPassManager &PM, PassBuilder::OptimizationLevel Level) {
-      PB.parsePassPipeline(PM, VectorizerStartEPPipeline, VerifyEachPass,
-                           DebugLogging);
-    });
+    PB.registerVectorizerStartEPCallback(
+        [&PB, VerifyEachPass, DebugLogging](
+            FunctionPassManager &PM, PassBuilder::OptimizationLevel Level) {
+          ExitOnError Err("Unable to parse VectorizerStartEP pipeline: ");
+          Err(PB.parsePassPipeline(PM, VectorizerStartEPPipeline,
+                                   VerifyEachPass, DebugLogging));
+        });
   if (tryParsePipelineText<ModulePassManager>(PB, PipelineStartEPPipeline))
     PB.registerPipelineStartEPCallback(
         [&PB, VerifyEachPass, DebugLogging](ModulePassManager &PM) {
-          PB.parsePassPipeline(PM, PipelineStartEPPipeline, VerifyEachPass,
-                               DebugLogging);
+          ExitOnError Err("Unable to parse PipelineStartEP pipeline: ");
+          Err(PB.parsePassPipeline(PM, PipelineStartEPPipeline, VerifyEachPass,
+                                   DebugLogging));
         });
 }
 
@@ -258,8 +271,8 @@ bool llvm::runPassPipeline(StringRef Arg0, Module &M, TargetMachine *TM,
   // Specially handle the alias analysis manager so that we can register
   // a custom pipeline of AA passes with it.
   AAManager AA;
-  if (!PB.parseAAPipeline(AA, AAPipeline)) {
-    errs() << Arg0 << ": unable to parse AA pipeline description.\n";
+  if (auto Err = PB.parseAAPipeline(AA, AAPipeline)) {
+    errs() << Arg0 << ": " << toString(std::move(Err)) << "\n";
     return false;
   }
 
@@ -284,8 +297,9 @@ bool llvm::runPassPipeline(StringRef Arg0, Module &M, TargetMachine *TM,
   if (EnableDebugify)
     MPM.addPass(NewPMDebugifyPass());
 
-  if (!PB.parsePassPipeline(MPM, PassPipeline, VerifyEachPass, DebugPM)) {
-    errs() << Arg0 << ": unable to parse pass pipeline description.\n";
+  if (auto Err =
+          PB.parsePassPipeline(MPM, PassPipeline, VerifyEachPass, DebugPM)) {
+    errs() << Arg0 << ": " << toString(std::move(Err)) << "\n";
     return false;
   }
 
diff --git a/unittests/IR/CMakeLists.txt b/unittests/IR/CMakeLists.txt
index 211ab109131..7498983b260 100644
--- a/unittests/IR/CMakeLists.txt
+++ b/unittests/IR/CMakeLists.txt
@@ -40,3 +40,5 @@ add_llvm_unittest(IRTests
   VerifierTest.cpp
   WaymarkTest.cpp
   )
+
+target_link_libraries(IRTests PRIVATE LLVMTestingSupport)
diff --git a/unittests/IR/PassBuilderCallbacksTest.cpp b/unittests/IR/PassBuilderCallbacksTest.cpp
index 97bbb81a6b0..20c47b045e7 100644
--- a/unittests/IR/PassBuilderCallbacksTest.cpp
+++ b/unittests/IR/PassBuilderCallbacksTest.cpp
@@ -7,6 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/Testing/Support/Error.h"
 #include <functional>
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
@@ -460,7 +461,7 @@ TEST_F(ModuleCallbacksTest, Passes) {
       .WillOnce(Invoke(getAnalysisResult));
 
   StringRef PipelineText = "test-transform";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
 
   PM.run(*M, AM);
@@ -494,7 +495,7 @@ TEST_F(ModuleCallbacksTest, InstrumentedPasses) {
       .InSequence(PISequence);
 
   StringRef PipelineText = "test-transform";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
 
   PM.run(*M, AM);
@@ -525,7 +526,7 @@ TEST_F(ModuleCallbacksTest, InstrumentedSkippedPasses) {
       .Times(0);
 
   StringRef PipelineText = "test-transform";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
 
   PM.run(*M, AM);
@@ -537,7 +538,7 @@ TEST_F(FunctionCallbacksTest, Passes) {
       .WillOnce(Invoke(getAnalysisResult));
 
   StringRef PipelineText = "test-transform";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -571,7 +572,7 @@ TEST_F(FunctionCallbacksTest, InstrumentedPasses) {
       .InSequence(PISequence);
 
   StringRef PipelineText = "test-transform";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -604,7 +605,7 @@ TEST_F(FunctionCallbacksTest, InstrumentedSkippedPasses) {
       .Times(0);
 
   StringRef PipelineText = "test-transform";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -615,7 +616,7 @@ TEST_F(LoopCallbacksTest, Passes) {
       .WillOnce(WithArgs<0, 1, 2>(Invoke(getAnalysisResult)));
 
   StringRef PipelineText = "test-transform";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -650,7 +651,7 @@ TEST_F(LoopCallbacksTest, InstrumentedPasses) {
       .InSequence(PISequence);
 
   StringRef PipelineText = "test-transform";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -682,7 +683,7 @@ TEST_F(LoopCallbacksTest, InstrumentedSkippedPasses) {
       .Times(0);
 
   StringRef PipelineText = "test-transform";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -693,7 +694,7 @@ TEST_F(CGSCCCallbacksTest, Passes) {
       .WillOnce(WithArgs<0, 1, 2>(Invoke(getAnalysisResult)));
 
   StringRef PipelineText = "test-transform";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -727,7 +728,7 @@ TEST_F(CGSCCCallbacksTest, InstrumentedPasses) {
       .InSequence(PISequence);
 
   StringRef PipelineText = "test-transform";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -759,7 +760,7 @@ TEST_F(CGSCCCallbacksTest, InstrumentedSkippedPasses) {
       .Times(0);
 
   StringRef PipelineText = "test-transform";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -774,7 +775,7 @@ TEST_F(ModuleCallbacksTest, AnalysisUtilities) {
   EXPECT_CALL(AnalysisHandle, invalidate(HasName("<string>"), _, _));
 
   StringRef PipelineText = "require<test-analysis>,invalidate<test-analysis>";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -784,7 +785,7 @@ TEST_F(CGSCCCallbacksTest, PassUtilities) {
   EXPECT_CALL(AnalysisHandle, invalidate(HasName("(foo)"), _, _));
 
   StringRef PipelineText = "require<test-analysis>,invalidate<test-analysis>";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -794,7 +795,7 @@ TEST_F(FunctionCallbacksTest, AnalysisUtilities) {
   EXPECT_CALL(AnalysisHandle, invalidate(HasName("foo"), _, _));
 
   StringRef PipelineText = "require<test-analysis>,invalidate<test-analysis>";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -805,7 +806,7 @@ TEST_F(LoopCallbacksTest, PassUtilities) {
 
   StringRef PipelineText = "require<test-analysis>,invalidate<test-analysis>";
 
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -845,13 +846,13 @@ TEST_F(ModuleCallbacksTest, ParseTopLevelPipeline) {
 
   StringRef PipelineText =
       "another-pipeline(test-transform,invalidate<test-analysis>)";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 
   /// Test the negative case
   PipelineText = "another-pipeline(instcombine)";
-  ASSERT_FALSE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Failed())
       << "Pipeline was: " << PipelineText;
 }
 } // end anonymous namespace
diff --git a/unittests/Passes/CMakeLists.txt b/unittests/Passes/CMakeLists.txt
index d90df209d4e..415f3a71734 100644
--- a/unittests/Passes/CMakeLists.txt
+++ b/unittests/Passes/CMakeLists.txt
@@ -12,6 +12,7 @@ add_llvm_unittest(PluginsTests
   PluginsTest.cpp
   )
 export_executable_symbols(PluginsTests)
+target_link_libraries(PluginsTests PRIVATE LLVMTestingSupport)
 
 set(LLVM_LINK_COMPONENTS)
 add_llvm_loadable_module(TestPlugin
diff --git a/unittests/Passes/PluginsTest.cpp b/unittests/Passes/PluginsTest.cpp
index 726978714e8..abb7b57ee0c 100644
--- a/unittests/Passes/PluginsTest.cpp
+++ b/unittests/Passes/PluginsTest.cpp
@@ -15,6 +15,7 @@
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/Path.h"
+#include "llvm/Testing/Support/Error.h"
 #include "llvm/Transforms/Scalar/LoopPassManager.h"
 #include "gtest/gtest.h"
 
@@ -54,8 +55,8 @@ TEST(PluginsTests, LoadPlugin) {
 
   PassBuilder PB;
   ModulePassManager PM;
-  ASSERT_FALSE(PB.parsePassPipeline(PM, "plugin-pass"));
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, "plugin-pass"), Failed());
 
   Plugin->registerPassBuilderCallbacks(PB);
-  ASSERT_TRUE(PB.parsePassPipeline(PM, "plugin-pass"));
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, "plugin-pass"), Succeeded());
 }
-- 
GitLab


From 422c9c46fe56c92f2909237ba551ecf211d1fc18 Mon Sep 17 00:00:00 2001
From: Fedor Sergeev <fedor.sergeev@azul.com>
Date: Wed, 17 Oct 2018 11:01:15 +0000
Subject: [PATCH 0273/1116] [NewPM] Fixing test failure on Windows - removed
 opt binary name from pattern

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344686 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Other/pass-pipeline-parsing.ll | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/Other/pass-pipeline-parsing.ll b/test/Other/pass-pipeline-parsing.ll
index d26d000ec8d..2e8bc7c8730 100644
--- a/test/Other/pass-pipeline-parsing.ll
+++ b/test/Other/pass-pipeline-parsing.ll
@@ -210,15 +210,15 @@
 
 ; RUN: not opt -passes='no-op-module,bad' \
 ; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-MODULE
-; CHECK-UNKNOWN-MODULE: opt: unknown module pass 'bad'
+; CHECK-UNKNOWN-MODULE: unknown module pass 'bad'
 
 ; RUN: not opt -passes='no-op-loop,bad' \
 ; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-LOOP
-; CHECK-UNKNOWN-LOOP: opt: unknown loop pass 'bad'
+; CHECK-UNKNOWN-LOOP: unknown loop pass 'bad'
 
 ; RUN: not opt -passes='no-op-cgscc,bad' \
 ; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-CGSCC
-; CHECK-UNKNOWN-CGSCC: opt: unknown cgscc pass 'bad'
+; CHECK-UNKNOWN-CGSCC: unknown cgscc pass 'bad'
 
 ; RUN: not opt -passes='no-op-function,bad' \
 ; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-FUNCTION
@@ -230,7 +230,7 @@
 ; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-FUNCTION
 ; RUN: not opt -passes='module(cgscc(function(bad,pipeline,text)))' \
 ; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-FUNCTION
-; CHECK-UNKNOWN-FUNCTION: opt: unknown function pass 'bad'
+; CHECK-UNKNOWN-FUNCTION: unknown function pass 'bad'
 
 ; RUN: not opt -aa-pipeline=bad -passes=no-op-function \
 ; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=AA-PIPELINE-ERR
-- 
GitLab


From 1e04734fb63189fd334a0d67bc6cd1cff95a5f4b Mon Sep 17 00:00:00 2001
From: Max Kazantsev <max.kazantsev@azul.com>
Date: Wed, 17 Oct 2018 11:16:25 +0000
Subject: [PATCH 0274/1116] [NFC] Remove GOTO from SCEV

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344687 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Analysis/ScalarEvolution.cpp | 34 +++++++++++++-------------------
 1 file changed, 14 insertions(+), 20 deletions(-)

diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index 60cd1cb4127..8fe500f150b 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -8808,7 +8808,13 @@ bool ScalarEvolution::SimplifyICmpOperands(ICmpInst::Predicate &Pred,
                                            const SCEV *&LHS, const SCEV *&RHS,
                                            unsigned Depth) {
   bool Changed = false;
-
+  // Simplifies ICMP to trivial true or false by turning it into '0 == 0' or
+  // '0 != 0'.
+  auto TrivialCase = [&](bool TriviallyTrue) {
+    LHS = RHS = getConstant(ConstantInt::getFalse(getContext()));
+    Pred = TriviallyTrue ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE;
+    return true;
+  };
   // If we hit the max recursion limit bail out.
   if (Depth >= 3)
     return false;
@@ -8820,9 +8826,9 @@ bool ScalarEvolution::SimplifyICmpOperands(ICmpInst::Predicate &Pred,
       if (ConstantExpr::getICmp(Pred,
                                 LHSC->getValue(),
                                 RHSC->getValue())->isNullValue())
-        goto trivially_false;
+        return TrivialCase(false);
       else
-        goto trivially_true;
+        return TrivialCase(true);
     }
     // Otherwise swap the operands to put the constant on the right.
     std::swap(LHS, RHS);
@@ -8852,9 +8858,9 @@ bool ScalarEvolution::SimplifyICmpOperands(ICmpInst::Predicate &Pred,
     if (!ICmpInst::isEquality(Pred)) {
       ConstantRange ExactCR = ConstantRange::makeExactICmpRegion(Pred, RA);
       if (ExactCR.isFullSet())
-        goto trivially_true;
+        return TrivialCase(true);
       else if (ExactCR.isEmptySet())
-        goto trivially_false;
+        return TrivialCase(false);
 
       APInt NewRHS;
       CmpInst::Predicate NewPred;
@@ -8890,7 +8896,7 @@ bool ScalarEvolution::SimplifyICmpOperands(ICmpInst::Predicate &Pred,
         // The "Should have been caught earlier!" messages refer to the fact
         // that the ExactCR.isFullSet() or ExactCR.isEmptySet() check above
         // should have fired on the corresponding cases, and canonicalized the
-        // check to trivially_true or trivially_false.
+        // check to trivial case.
 
       case ICmpInst::ICMP_UGE:
         assert(!RA.isMinValue() && "Should have been caught earlier!");
@@ -8923,9 +8929,9 @@ bool ScalarEvolution::SimplifyICmpOperands(ICmpInst::Predicate &Pred,
   // Check for obvious equality.
   if (HasSameValue(LHS, RHS)) {
     if (ICmpInst::isTrueWhenEqual(Pred))
-      goto trivially_true;
+      return TrivialCase(true);
     if (ICmpInst::isFalseWhenEqual(Pred))
-      goto trivially_false;
+      return TrivialCase(false);
   }
 
   // If possible, canonicalize GE/LE comparisons to GT/LT comparisons, by
@@ -8993,18 +8999,6 @@ bool ScalarEvolution::SimplifyICmpOperands(ICmpInst::Predicate &Pred,
     return SimplifyICmpOperands(Pred, LHS, RHS, Depth+1);
 
   return Changed;
-
-trivially_true:
-  // Return 0 == 0.
-  LHS = RHS = getConstant(ConstantInt::getFalse(getContext()));
-  Pred = ICmpInst::ICMP_EQ;
-  return true;
-
-trivially_false:
-  // Return 0 != 0.
-  LHS = RHS = getConstant(ConstantInt::getFalse(getContext()));
-  Pred = ICmpInst::ICMP_NE;
-  return true;
 }
 
 bool ScalarEvolution::isKnownNegative(const SCEV *S) {
-- 
GitLab


From 91e6826692e9bbf42880df69c51226f46b0a9167 Mon Sep 17 00:00:00 2001
From: Guillaume Chatelet <gchatelet@google.com>
Date: Wed, 17 Oct 2018 11:37:28 +0000
Subject: [PATCH 0275/1116] [llvm-exegeis] Computing Latency configuration
 upfront so we can generate many CodeTemplates at once.

Summary: LatencyGenerator now computes all possible mode of serial execution for an Instruction upfront and generates CodeTemplate for the ones that give the best results (e.g. no need to generate a two instructions snippet when repeating a single one would do). The next step is to generate even more configurations for cases (e.g. for XOR we should generate "XOR EAX, EAX, EAX" and "XOR EAX, EAX, EBX")

Reviewers: courbet

Reviewed By: courbet

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D53320

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344689 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-exegesis/lib/CodeTemplate.cpp      |  50 ++++
 tools/llvm-exegesis/lib/CodeTemplate.h        |  60 ++++-
 tools/llvm-exegesis/lib/Latency.cpp           | 155 +++++++++---
 tools/llvm-exegesis/lib/MCInstrDescView.cpp   |  16 +-
 tools/llvm-exegesis/lib/MCInstrDescView.h     |   8 +-
 tools/llvm-exegesis/lib/SnippetGenerator.cpp  |   2 +-
 tools/llvm-exegesis/lib/SnippetGenerator.h    |   2 +-
 tools/llvm-exegesis/lib/Uops.cpp              |   8 +-
 .../X86/SnippetGeneratorTest.cpp              | 237 +++++++++++++-----
 9 files changed, 426 insertions(+), 112 deletions(-)

diff --git a/tools/llvm-exegesis/lib/CodeTemplate.cpp b/tools/llvm-exegesis/lib/CodeTemplate.cpp
index 34433daa231..df9d18b94bb 100644
--- a/tools/llvm-exegesis/lib/CodeTemplate.cpp
+++ b/tools/llvm-exegesis/lib/CodeTemplate.cpp
@@ -65,4 +65,54 @@ llvm::MCInst InstructionTemplate::build() const {
   return Result;
 }
 
+bool isEnumValue(ExecutionMode Execution) {
+  return llvm::isPowerOf2_32(static_cast<uint32_t>(Execution));
+}
+
+llvm::StringRef getName(ExecutionMode Bit) {
+  assert(isEnumValue(Bit) && "Bit must be a power of two");
+  switch (Bit) {
+  case ExecutionMode::UNKNOWN:
+    return "UNKNOWN";
+  case ExecutionMode::ALWAYS_SERIAL_IMPLICIT_REGS_ALIAS:
+    return "ALWAYS_SERIAL_IMPLICIT_REGS_ALIAS";
+  case ExecutionMode::ALWAYS_SERIAL_TIED_REGS_ALIAS:
+    return "ALWAYS_SERIAL_TIED_REGS_ALIAS";
+  case ExecutionMode::SERIAL_VIA_MEMORY_INSTR:
+    return "SERIAL_VIA_MEMORY_INSTR";
+  case ExecutionMode::SERIAL_VIA_EXPLICIT_REGS:
+    return "SERIAL_VIA_EXPLICIT_REGS";
+  case ExecutionMode::SERIAL_VIA_NON_MEMORY_INSTR:
+    return "SERIAL_VIA_NON_MEMORY_INSTR";
+  case ExecutionMode::ALWAYS_PARALLEL_MISSING_USE_OR_DEF:
+    return "ALWAYS_PARALLEL_MISSING_USE_OR_DEF";
+  case ExecutionMode::PARALLEL_VIA_EXPLICIT_REGS:
+    return "PARALLEL_VIA_EXPLICIT_REGS";
+  }
+  llvm_unreachable("Missing enum case");
+}
+
+static const ExecutionMode kAllExecutionModeBits[] = {
+    ExecutionMode::ALWAYS_SERIAL_IMPLICIT_REGS_ALIAS,
+    ExecutionMode::ALWAYS_SERIAL_TIED_REGS_ALIAS,
+    ExecutionMode::SERIAL_VIA_MEMORY_INSTR,
+    ExecutionMode::SERIAL_VIA_EXPLICIT_REGS,
+    ExecutionMode::SERIAL_VIA_NON_MEMORY_INSTR,
+    ExecutionMode::ALWAYS_PARALLEL_MISSING_USE_OR_DEF,
+    ExecutionMode::PARALLEL_VIA_EXPLICIT_REGS,
+};
+
+llvm::ArrayRef<ExecutionMode> getAllExecutionBits() {
+  return kAllExecutionModeBits;
+}
+
+llvm::SmallVector<ExecutionMode, 4>
+getExecutionModeBits(ExecutionMode Execution) {
+  llvm::SmallVector<ExecutionMode, 4> Result;
+  for (const auto Bit : getAllExecutionBits())
+    if ((Execution & Bit) == Bit)
+      Result.push_back(Bit);
+  return Result;
+}
+
 } // namespace exegesis
diff --git a/tools/llvm-exegesis/lib/CodeTemplate.h b/tools/llvm-exegesis/lib/CodeTemplate.h
index e5006eb74c9..734992f0afa 100644
--- a/tools/llvm-exegesis/lib/CodeTemplate.h
+++ b/tools/llvm-exegesis/lib/CodeTemplate.h
@@ -17,6 +17,7 @@
 #define LLVM_TOOLS_LLVM_EXEGESIS_CODETEMPLATE_H
 
 #include "MCInstrDescView.h"
+#include "llvm/ADT/BitmaskEnum.h"
 
 namespace exegesis {
 
@@ -45,9 +46,65 @@ struct InstructionTemplate {
   llvm::SmallVector<llvm::MCOperand, 4> VariableValues;
 };
 
+enum class ExecutionMode : uint8_t {
+  UNKNOWN = 0U,
+  // The instruction is always serial because implicit Use and Def alias.
+  // e.g. AAA (alias via EFLAGS)
+  ALWAYS_SERIAL_IMPLICIT_REGS_ALIAS = 1u << 0,
+
+  // The instruction is always serial because one Def is tied to a Use.
+  // e.g. AND32ri (alias via tied GR32)
+  ALWAYS_SERIAL_TIED_REGS_ALIAS = 1u << 1,
+
+  // The execution can be made serial by inserting a second instruction that
+  // clobbers/reads memory.
+  // e.g. MOV8rm
+  SERIAL_VIA_MEMORY_INSTR = 1u << 2,
+
+  // The execution can be made serial by picking one Def that aliases with one
+  // Use.
+  // e.g. VXORPSrr XMM1, XMM1, XMM2
+  SERIAL_VIA_EXPLICIT_REGS = 1u << 3,
+
+  // The execution can be made serial by inserting a second instruction that
+  // uses one of the Defs and defs one of the Uses.
+  // e.g.
+  // 1st instruction: MMX_PMOVMSKBrr ECX, MM7
+  // 2nd instruction: MMX_MOVD64rr MM7, ECX
+  //  or instruction: MMX_MOVD64to64rr MM7, ECX
+  //  or instruction: MMX_PINSRWrr MM7, MM7, ECX, 1
+  SERIAL_VIA_NON_MEMORY_INSTR = 1u << 4,
+
+  // The execution is always parallel because the instruction is missing Use or
+  // Def operands.
+  ALWAYS_PARALLEL_MISSING_USE_OR_DEF = 1u << 5,
+
+  // The execution can be made parallel by repeating the same instruction but
+  // making sure that Defs of one instruction do not alias with Uses of the
+  // second one.
+  PARALLEL_VIA_EXPLICIT_REGS = 1u << 6,
+
+  LLVM_MARK_AS_BITMASK_ENUM(/*Largest*/ PARALLEL_VIA_EXPLICIT_REGS)
+};
+
+// Returns whether Execution is one of the values defined in the enum above.
+bool isEnumValue(ExecutionMode Execution);
+
+// Returns a human readable string for the enum.
+llvm::StringRef getName(ExecutionMode Execution);
+
+// Returns a sequence of increasing powers of two corresponding to all the
+// Execution flags.
+llvm::ArrayRef<ExecutionMode> getAllExecutionBits();
+
+// Decomposes Execution into individual set bits.
+llvm::SmallVector<ExecutionMode, 4> getExecutionModeBits(ExecutionMode);
+
+LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
+
 // A CodeTemplate is a set of InstructionTemplates that may not be fully
 // specified (i.e. some variables are not yet set). This allows the
-// BenchmarkRunner to instantiate it many times with specific values to study
+// SnippetGenerator to instantiate it many times with specific values to study
 // their impact on instruction's performance.
 struct CodeTemplate {
   CodeTemplate() = default;
@@ -57,6 +114,7 @@ struct CodeTemplate {
   CodeTemplate(const CodeTemplate &) = delete;
   CodeTemplate &operator=(const CodeTemplate &) = delete;
 
+  ExecutionMode Execution = ExecutionMode::UNKNOWN;
   // Some information about how this template has been created.
   std::string Info;
   // The list of the instructions for this template.
diff --git a/tools/llvm-exegesis/lib/Latency.cpp b/tools/llvm-exegesis/lib/Latency.cpp
index 040b42b53e2..7b991a452aa 100644
--- a/tools/llvm-exegesis/lib/Latency.cpp
+++ b/tools/llvm-exegesis/lib/Latency.cpp
@@ -20,53 +20,148 @@
 
 namespace exegesis {
 
-LatencySnippetGenerator::~LatencySnippetGenerator() = default;
+struct ExecutionClass {
+  ExecutionMode Mask;
+  const char *Description;
+} static const kExecutionClasses[] = {
+    {ExecutionMode::ALWAYS_SERIAL_IMPLICIT_REGS_ALIAS |
+         ExecutionMode::ALWAYS_SERIAL_TIED_REGS_ALIAS,
+     "Repeating a single implicitly serial instruction"},
+    {ExecutionMode::SERIAL_VIA_EXPLICIT_REGS,
+     "Repeating a single explicitly serial instruction"},
+    {ExecutionMode::SERIAL_VIA_MEMORY_INSTR |
+         ExecutionMode::SERIAL_VIA_NON_MEMORY_INSTR,
+     "Repeating two instructions"},
+};
 
-llvm::Expected<std::vector<CodeTemplate>>
-generateTwoInstructionPrototypes(const LLVMState &State,
-                                 const Instruction &Instr) {
+static constexpr size_t kMaxAliasingInstructions = 10;
+
+static std::vector<Instruction>
+computeAliasingInstructions(const LLVMState &State, const Instruction &Instr,
+                            size_t MaxAliasingInstructions) {
+  // Randomly iterate the set of instructions.
   std::vector<unsigned> Opcodes;
   Opcodes.resize(State.getInstrInfo().getNumOpcodes());
   std::iota(Opcodes.begin(), Opcodes.end(), 0U);
   std::shuffle(Opcodes.begin(), Opcodes.end(), randomGenerator());
+
+  std::vector<Instruction> AliasingInstructions;
   for (const unsigned OtherOpcode : Opcodes) {
-    if (OtherOpcode == Instr.Description->Opcode)
+    if (OtherOpcode == Instr.Description->getOpcode())
       continue;
     const Instruction OtherInstr(State, OtherOpcode);
     if (OtherInstr.hasMemoryOperands())
       continue;
-    const AliasingConfigurations Forward(Instr, OtherInstr);
-    const AliasingConfigurations Back(OtherInstr, Instr);
-    if (Forward.empty() || Back.empty())
-      continue;
-    InstructionTemplate ThisIT(Instr);
-    InstructionTemplate OtherIT(OtherInstr);
-    if (!Forward.hasImplicitAliasing())
-      setRandomAliasing(Forward, ThisIT, OtherIT);
-    if (!Back.hasImplicitAliasing())
-      setRandomAliasing(Back, OtherIT, ThisIT);
+    if (Instr.hasAliasingRegistersThrough(OtherInstr))
+      AliasingInstructions.push_back(std::move(OtherInstr));
+    if (AliasingInstructions.size() >= MaxAliasingInstructions)
+      break;
+  }
+  return AliasingInstructions;
+}
+
+static ExecutionMode getExecutionModes(const Instruction &Instr) {
+  ExecutionMode EM;
+  if (Instr.hasAliasingImplicitRegisters())
+    EM |= ExecutionMode::ALWAYS_SERIAL_IMPLICIT_REGS_ALIAS;
+  if (Instr.hasTiedRegisters())
+    EM |= ExecutionMode::ALWAYS_SERIAL_TIED_REGS_ALIAS;
+  if (Instr.hasMemoryOperands())
+    EM |= ExecutionMode::SERIAL_VIA_MEMORY_INSTR;
+  else {
+    if (Instr.hasAliasingRegisters())
+      EM |= ExecutionMode::SERIAL_VIA_EXPLICIT_REGS;
+    if (Instr.hasOneUseOrOneDef())
+      EM |= ExecutionMode::SERIAL_VIA_NON_MEMORY_INSTR;
+  }
+  return EM;
+}
+
+static void appendCodeTemplates(const LLVMState &State,
+                                const Instruction &Instr,
+                                ExecutionMode ExecutionModeBit,
+                                llvm::StringRef ExecutionClassDescription,
+                                std::vector<CodeTemplate> &CodeTemplates) {
+  assert(isEnumValue(ExecutionModeBit) && "Bit must be a power of two");
+  switch (ExecutionModeBit) {
+  case ExecutionMode::ALWAYS_SERIAL_IMPLICIT_REGS_ALIAS:
+    // Nothing to do, the instruction is always serial.
+    LLVM_FALLTHROUGH;
+  case ExecutionMode::ALWAYS_SERIAL_TIED_REGS_ALIAS: {
+    // Picking whatever value for the tied variable will make the instruction
+    // serial.
     CodeTemplate CT;
-    CT.Info = llvm::formatv("creating cycle through {0}.",
-                            State.getInstrInfo().getName(OtherOpcode));
-    CT.Instructions.push_back(std::move(ThisIT));
-    CT.Instructions.push_back(std::move(OtherIT));
-    return getSingleton(CT);
+    CT.Execution = ExecutionModeBit;
+    CT.Info = ExecutionClassDescription;
+    CT.Instructions.push_back(Instr);
+    CodeTemplates.push_back(std::move(CT));
+    return;
+  }
+  case ExecutionMode::SERIAL_VIA_MEMORY_INSTR: {
+    // Select back-to-back memory instruction.
+    // TODO: Implement me.
+    return;
+  }
+  case ExecutionMode::SERIAL_VIA_EXPLICIT_REGS: {
+    // Making the execution of this instruction serial by selecting one def
+    // register to alias with one use register.
+    const AliasingConfigurations SelfAliasing(Instr, Instr);
+    assert(!SelfAliasing.empty() && !SelfAliasing.hasImplicitAliasing() &&
+           "Instr must alias itself explicitly");
+    InstructionTemplate IT(Instr);
+    // This is a self aliasing instruction so defs and uses are from the same
+    // instance, hence twice IT in the following call.
+    setRandomAliasing(SelfAliasing, IT, IT);
+    CodeTemplate CT;
+    CT.Execution = ExecutionModeBit;
+    CT.Info = ExecutionClassDescription;
+    CT.Instructions.push_back(std::move(IT));
+    CodeTemplates.push_back(std::move(CT));
+    return;
+  }
+  case ExecutionMode::SERIAL_VIA_NON_MEMORY_INSTR: {
+    // Select back-to-back non-memory instruction.
+    for (const auto OtherInstr :
+         computeAliasingInstructions(State, Instr, kMaxAliasingInstructions)) {
+      const AliasingConfigurations Forward(Instr, OtherInstr);
+      const AliasingConfigurations Back(OtherInstr, Instr);
+      InstructionTemplate ThisIT(Instr);
+      InstructionTemplate OtherIT(OtherInstr);
+      if (!Forward.hasImplicitAliasing())
+        setRandomAliasing(Forward, ThisIT, OtherIT);
+      if (!Back.hasImplicitAliasing())
+        setRandomAliasing(Back, OtherIT, ThisIT);
+      CodeTemplate CT;
+      CT.Execution = ExecutionModeBit;
+      CT.Info = ExecutionClassDescription;
+      CT.Instructions.push_back(std::move(ThisIT));
+      CT.Instructions.push_back(std::move(OtherIT));
+      CodeTemplates.push_back(std::move(CT));
+    }
+    return;
+  }
+  default:
+    llvm_unreachable("Unhandled enum value");
   }
-  return llvm::make_error<BenchmarkFailure>(
-      "Infeasible : Didn't find any scheme to make the instruction serial");
 }
 
+LatencySnippetGenerator::~LatencySnippetGenerator() = default;
+
 llvm::Expected<std::vector<CodeTemplate>>
 LatencySnippetGenerator::generateCodeTemplates(const Instruction &Instr) const {
-  if (Instr.hasMemoryOperands())
+  std::vector<CodeTemplate> Results;
+  const ExecutionMode EM = getExecutionModes(Instr);
+  for (const auto EC : kExecutionClasses) {
+    for (const auto ExecutionModeBit : getExecutionModeBits(EM & EC.Mask))
+      appendCodeTemplates(State, Instr, ExecutionModeBit, EC.Description,
+                          Results);
+    if (!Results.empty())
+      break;
+  }
+  if (Results.empty())
     return llvm::make_error<BenchmarkFailure>(
-        "Infeasible : has memory operands");
-  return llvm::handleExpected( //
-      generateSelfAliasingCodeTemplates(Instr),
-      [this, &Instr]() {
-        return generateTwoInstructionPrototypes(State, Instr);
-      },
-      [](const BenchmarkFailure &) { /*Consume Error*/ });
+        "No strategy found to make the execution serial");
+  return std::move(Results);
 }
 
 const char *LatencyBenchmarkRunner::getCounterName() const {
diff --git a/tools/llvm-exegesis/lib/MCInstrDescView.cpp b/tools/llvm-exegesis/lib/MCInstrDescView.cpp
index fa9378856f4..59f56520efc 100644
--- a/tools/llvm-exegesis/lib/MCInstrDescView.cpp
+++ b/tools/llvm-exegesis/lib/MCInstrDescView.cpp
@@ -27,7 +27,14 @@ unsigned Variable::getPrimaryOperandIndex() const {
   return TiedOperands[0];
 }
 
-bool Variable::hasTiedOperands() const { return TiedOperands.size() > 1; }
+bool Variable::hasTiedOperands() const {
+  assert(TiedOperands.size() <= 2 &&
+         "No more than two operands can be tied together");
+  // By definition only Use and Def operands can be tied together.
+  // TiedOperands[0] is the Def operand (LLVM stores defs first).
+  // TiedOperands[1] is the Use operand.
+  return TiedOperands.size() > 1;
+}
 
 unsigned Operand::getIndex() const {
   assert(Index >= 0 && "Index must be set");
@@ -197,6 +204,10 @@ bool Instruction::hasAliasingRegisters() const {
   return AllDefRegs.anyCommon(AllUseRegs);
 }
 
+bool Instruction::hasOneUseOrOneDef() const {
+  return AllDefRegs.count() || AllUseRegs.count();
+}
+
 void Instruction::dump(const llvm::MCRegisterInfo &RegInfo,
                        llvm::raw_ostream &Stream) const {
   Stream << "- " << Name << "\n";
@@ -288,8 +299,7 @@ bool AliasingConfigurations::hasImplicitAliasing() const {
 }
 
 AliasingConfigurations::AliasingConfigurations(
-    const Instruction &DefInstruction, const Instruction &UseInstruction)
-    : DefInstruction(DefInstruction), UseInstruction(UseInstruction) {
+    const Instruction &DefInstruction, const Instruction &UseInstruction) {
   if (UseInstruction.AllUseRegs.anyCommon(DefInstruction.AllDefRegs)) {
     auto CommonRegisters = UseInstruction.AllUseRegs;
     CommonRegisters &= DefInstruction.AllDefRegs;
diff --git a/tools/llvm-exegesis/lib/MCInstrDescView.h b/tools/llvm-exegesis/lib/MCInstrDescView.h
index 6910538a31f..17f3e2b930d 100644
--- a/tools/llvm-exegesis/lib/MCInstrDescView.h
+++ b/tools/llvm-exegesis/lib/MCInstrDescView.h
@@ -125,6 +125,11 @@ struct Instruction {
   // reads or write the same memory region.
   bool hasMemoryOperands() const;
 
+  // Returns whether this instruction as at least one use or one def.
+  // Repeating this instruction may execute sequentially by adding an
+  // instruction that aliases one of these.
+  bool hasOneUseOrOneDef() const;
+
   // Convenient function to help with debugging.
   void dump(const llvm::MCRegisterInfo &RegInfo,
             llvm::raw_ostream &Stream) const;
@@ -174,10 +179,7 @@ struct AliasingConfigurations {
 
   bool empty() const; // True if no aliasing configuration is found.
   bool hasImplicitAliasing() const;
-  void setExplicitAliasing() const;
 
-  const Instruction &DefInstruction;
-  const Instruction &UseInstruction;
   llvm::SmallVector<AliasingRegisterOperands, 32> Configurations;
 };
 
diff --git a/tools/llvm-exegesis/lib/SnippetGenerator.cpp b/tools/llvm-exegesis/lib/SnippetGenerator.cpp
index feee61d113c..cdf54a32e4f 100644
--- a/tools/llvm-exegesis/lib/SnippetGenerator.cpp
+++ b/tools/llvm-exegesis/lib/SnippetGenerator.cpp
@@ -22,7 +22,7 @@
 
 namespace exegesis {
 
-std::vector<CodeTemplate> getSingleton(CodeTemplate &CT) {
+std::vector<CodeTemplate> getSingleton(CodeTemplate &&CT) {
   std::vector<CodeTemplate> Result;
   Result.push_back(std::move(CT));
   return Result;
diff --git a/tools/llvm-exegesis/lib/SnippetGenerator.h b/tools/llvm-exegesis/lib/SnippetGenerator.h
index e48cf0cfeb0..4b307fd75ac 100644
--- a/tools/llvm-exegesis/lib/SnippetGenerator.h
+++ b/tools/llvm-exegesis/lib/SnippetGenerator.h
@@ -30,7 +30,7 @@
 
 namespace exegesis {
 
-std::vector<CodeTemplate> getSingleton(CodeTemplate &CT);
+std::vector<CodeTemplate> getSingleton(CodeTemplate &&CT);
 
 // Generates code templates that has a self-dependency.
 llvm::Expected<std::vector<CodeTemplate>>
diff --git a/tools/llvm-exegesis/lib/Uops.cpp b/tools/llvm-exegesis/lib/Uops.cpp
index a3ada77ef8c..d8065adbdb2 100644
--- a/tools/llvm-exegesis/lib/Uops.cpp
+++ b/tools/llvm-exegesis/lib/Uops.cpp
@@ -153,13 +153,13 @@ UopsSnippetGenerator::generateCodeTemplates(const Instruction &Instr) const {
     CT.Info = "instruction is parallel, repeating a random one.";
     CT.Instructions.push_back(std::move(IT));
     instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
-    return getSingleton(CT);
+    return getSingleton(std::move(CT));
   }
   if (SelfAliasing.hasImplicitAliasing()) {
     CT.Info = "instruction is serial, repeating a random one.";
     CT.Instructions.push_back(std::move(IT));
     instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
-    return getSingleton(CT);
+    return getSingleton(std::move(CT));
   }
   const auto TiedVariables = getVariablesWithTiedOperands(Instr);
   if (!TiedVariables.empty()) {
@@ -181,7 +181,7 @@ UopsSnippetGenerator::generateCodeTemplates(const Instruction &Instr) const {
       CT.Instructions.push_back(std::move(TmpIT));
     }
     instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
-    return getSingleton(CT);
+    return getSingleton(std::move(CT));
   }
   const auto &ReservedRegisters = State.getRATC().reservedRegisters();
   // No tied variables, we pick random values for defs.
@@ -218,7 +218,7 @@ UopsSnippetGenerator::generateCodeTemplates(const Instruction &Instr) const {
       "instruction has no tied variables picking Uses different from defs";
   CT.Instructions.push_back(std::move(IT));
   instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
-  return getSingleton(CT);
+  return getSingleton(std::move(CT));
 }
 
 std::vector<BenchmarkMeasure>
diff --git a/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp b/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp
index 6cc24a02cfc..4b3fa5455a3 100644
--- a/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp
+++ b/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp
@@ -25,6 +25,7 @@ namespace {
 
 using testing::AnyOf;
 using testing::ElementsAre;
+using testing::Gt;
 using testing::HasSubstr;
 using testing::Not;
 using testing::SizeIs;
@@ -57,14 +58,12 @@ class SnippetGeneratorTest : public X86SnippetGeneratorTest {
 protected:
   SnippetGeneratorTest() : Generator(State) {}
 
-  CodeTemplate checkAndGetCodeTemplate(unsigned Opcode) {
+  std::vector<CodeTemplate> checkAndGetCodeTemplates(unsigned Opcode) {
     randomGenerator().seed(0); // Initialize seed.
     const Instruction Instr(State, Opcode);
     auto CodeTemplateOrError = Generator.generateCodeTemplates(Instr);
     EXPECT_FALSE(CodeTemplateOrError.takeError()); // Valid configuration.
-    auto &CodeTemplate = CodeTemplateOrError.get();
-    EXPECT_EQ(CodeTemplate.size(), 1U);
-    return std::move(CodeTemplate.front());
+    return std::move(CodeTemplateOrError.get());
   }
 
   SnippetGeneratorT Generator;
@@ -75,21 +74,25 @@ using LatencySnippetGeneratorTest =
 
 using UopsSnippetGeneratorTest = SnippetGeneratorTest<UopsSnippetGenerator>;
 
-TEST_F(LatencySnippetGeneratorTest, ImplicitSelfDependency) {
-  // ADC16i16 self alias because of implicit use and def.
-
-  // explicit use 0       : imm
-  // implicit def         : AX
-  // implicit def         : EFLAGS
-  // implicit use         : AX
-  // implicit use         : EFLAGS
+TEST_F(LatencySnippetGeneratorTest, ImplicitSelfDependencyThroughImplicitReg) {
+  // - ADC16i16
+  // - Op0 Explicit Use Immediate
+  // - Op1 Implicit Def Reg(AX)
+  // - Op2 Implicit Def Reg(EFLAGS)
+  // - Op3 Implicit Use Reg(AX)
+  // - Op4 Implicit Use Reg(EFLAGS)
+  // - Var0 [Op0]
+  // - hasAliasingImplicitRegisters (execution is always serial)
+  // - hasAliasingRegisters
   const unsigned Opcode = llvm::X86::ADC16i16;
   EXPECT_THAT(MCInstrInfo.get(Opcode).getImplicitDefs()[0], llvm::X86::AX);
   EXPECT_THAT(MCInstrInfo.get(Opcode).getImplicitDefs()[1], llvm::X86::EFLAGS);
   EXPECT_THAT(MCInstrInfo.get(Opcode).getImplicitUses()[0], llvm::X86::AX);
   EXPECT_THAT(MCInstrInfo.get(Opcode).getImplicitUses()[1], llvm::X86::EFLAGS);
-  const CodeTemplate CT = checkAndGetCodeTemplate(Opcode);
-  EXPECT_THAT(CT.Info, HasSubstr("implicit"));
+  const auto CodeTemplates = checkAndGetCodeTemplates(Opcode);
+  ASSERT_THAT(CodeTemplates, SizeIs(1));
+  const auto &CT = CodeTemplates[0];
+  EXPECT_THAT(CT.Execution, ExecutionMode::ALWAYS_SERIAL_IMPLICIT_REGS_ALIAS);
   ASSERT_THAT(CT.Instructions, SizeIs(1));
   const InstructionTemplate &IT = CT.Instructions[0];
   EXPECT_THAT(IT.getOpcode(), Opcode);
@@ -97,63 +100,105 @@ TEST_F(LatencySnippetGeneratorTest, ImplicitSelfDependency) {
   EXPECT_THAT(IT.VariableValues[0], IsInvalid()) << "Immediate is not set";
 }
 
-TEST_F(LatencySnippetGeneratorTest, ExplicitSelfDependency) {
-  // ADD16ri self alias because Op0 and Op1 are tied together.
-
-  // explicit def 0       : reg RegClass=GR16
-  // explicit use 1       : reg RegClass=GR16 | TIED_TO:0
-  // explicit use 2       : imm
-  // implicit def         : EFLAGS
+TEST_F(LatencySnippetGeneratorTest, ImplicitSelfDependencyThroughTiedRegs) {
+  // - ADD16ri
+  // - Op0 Explicit Def RegClass(GR16)
+  // - Op1 Explicit Use RegClass(GR16) TiedToOp0
+  // - Op2 Explicit Use Immediate
+  // - Op3 Implicit Def Reg(EFLAGS)
+  // - Var0 [Op0,Op1]
+  // - Var1 [Op2]
+  // - hasTiedRegisters (execution is always serial)
+  // - hasAliasingRegisters
   const unsigned Opcode = llvm::X86::ADD16ri;
   EXPECT_THAT(MCInstrInfo.get(Opcode).getImplicitDefs()[0], llvm::X86::EFLAGS);
-  const CodeTemplate CT = checkAndGetCodeTemplate(Opcode);
-  EXPECT_THAT(CT.Info, HasSubstr("explicit"));
+  const auto CodeTemplates = checkAndGetCodeTemplates(Opcode);
+  ASSERT_THAT(CodeTemplates, SizeIs(1));
+  const auto &CT = CodeTemplates[0];
+  EXPECT_THAT(CT.Execution, ExecutionMode::ALWAYS_SERIAL_TIED_REGS_ALIAS);
   ASSERT_THAT(CT.Instructions, SizeIs(1));
   const InstructionTemplate &IT = CT.Instructions[0];
   EXPECT_THAT(IT.getOpcode(), Opcode);
   ASSERT_THAT(IT.VariableValues, SizeIs(2));
-  EXPECT_THAT(IT.VariableValues[0], IsReg()) << "Operand 0 and 1";
+  EXPECT_THAT(IT.VariableValues[0], IsInvalid()) << "Operand 1 is not set";
   EXPECT_THAT(IT.VariableValues[1], IsInvalid()) << "Operand 2 is not set";
 }
 
-TEST_F(LatencySnippetGeneratorTest, DependencyThroughOtherOpcode) {
-  // CMP64rr
-  // explicit use 0       : reg RegClass=GR64
-  // explicit use 1       : reg RegClass=GR64
-  // implicit def         : EFLAGS
-
-  const unsigned Opcode = llvm::X86::CMP64rr;
-  const CodeTemplate CT = checkAndGetCodeTemplate(Opcode);
-  EXPECT_THAT(CT.Info, HasSubstr("cycle through"));
-  ASSERT_THAT(CT.Instructions, SizeIs(2));
+TEST_F(LatencySnippetGeneratorTest, ImplicitSelfDependencyThroughExplicitRegs) {
+  // - VXORPSrr
+  // - Op0 Explicit Def RegClass(VR128)
+  // - Op1 Explicit Use RegClass(VR128)
+  // - Op2 Explicit Use RegClass(VR128)
+  // - Var0 [Op0]
+  // - Var1 [Op1]
+  // - Var2 [Op2]
+  // - hasAliasingRegisters
+  const unsigned Opcode = llvm::X86::VXORPSrr;
+  const auto CodeTemplates = checkAndGetCodeTemplates(Opcode);
+  ASSERT_THAT(CodeTemplates, SizeIs(1));
+  const auto &CT = CodeTemplates[0];
+  EXPECT_THAT(CT.Execution, ExecutionMode::SERIAL_VIA_EXPLICIT_REGS);
+  ASSERT_THAT(CT.Instructions, SizeIs(1));
   const InstructionTemplate &IT = CT.Instructions[0];
   EXPECT_THAT(IT.getOpcode(), Opcode);
-  ASSERT_THAT(IT.VariableValues, SizeIs(2));
-  EXPECT_THAT(IT.VariableValues, AnyOf(ElementsAre(IsReg(), IsInvalid()),
-                                       ElementsAre(IsInvalid(), IsReg())));
-  EXPECT_THAT(CT.Instructions[1].getOpcode(), Not(Opcode));
-  // TODO: check that the two instructions alias each other.
+  ASSERT_THAT(IT.VariableValues, SizeIs(3));
+  EXPECT_THAT(IT.VariableValues,
+              AnyOf(ElementsAre(IsReg(), IsInvalid(), IsReg()),
+                    ElementsAre(IsReg(), IsReg(), IsInvalid())))
+      << "Op0 is either set to Op1 or to Op2";
+}
+
+TEST_F(LatencySnippetGeneratorTest, DependencyThroughOtherOpcode) {
+  // - CMP64rr
+  // - Op0 Explicit Use RegClass(GR64)
+  // - Op1 Explicit Use RegClass(GR64)
+  // - Op2 Implicit Def Reg(EFLAGS)
+  // - Var0 [Op0]
+  // - Var1 [Op1]
+  const unsigned Opcode = llvm::X86::CMP64rr;
+  const auto CodeTemplates = checkAndGetCodeTemplates(Opcode);
+  ASSERT_THAT(CodeTemplates, SizeIs(Gt(1U))) << "Many templates are available";
+  for (const auto &CT : CodeTemplates) {
+    EXPECT_THAT(CT.Execution, ExecutionMode::SERIAL_VIA_NON_MEMORY_INSTR);
+    ASSERT_THAT(CT.Instructions, SizeIs(2));
+    const InstructionTemplate &IT = CT.Instructions[0];
+    EXPECT_THAT(IT.getOpcode(), Opcode);
+    ASSERT_THAT(IT.VariableValues, SizeIs(2));
+    EXPECT_THAT(IT.VariableValues, AnyOf(ElementsAre(IsReg(), IsInvalid()),
+                                         ElementsAre(IsInvalid(), IsReg())));
+    EXPECT_THAT(CT.Instructions[1].getOpcode(), Not(Opcode));
+    // TODO: check that the two instructions alias each other.
+  }
 }
 
 TEST_F(LatencySnippetGeneratorTest, LAHF) {
+  // - LAHF
+  // - Op0 Implicit Def Reg(AH)
+  // - Op1 Implicit Use Reg(EFLAGS)
   const unsigned Opcode = llvm::X86::LAHF;
-  const CodeTemplate CT = checkAndGetCodeTemplate(Opcode);
-  EXPECT_THAT(CT.Info, HasSubstr("cycle through"));
-  ASSERT_THAT(CT.Instructions, SizeIs(2));
-  const InstructionTemplate &IT = CT.Instructions[0];
-  EXPECT_THAT(IT.getOpcode(), Opcode);
-  ASSERT_THAT(IT.VariableValues, SizeIs(0));
+  const auto CodeTemplates = checkAndGetCodeTemplates(Opcode);
+  ASSERT_THAT(CodeTemplates, SizeIs(Gt(1U))) << "Many templates are available";
+  for (const auto &CT : CodeTemplates) {
+    EXPECT_THAT(CT.Execution, ExecutionMode::SERIAL_VIA_NON_MEMORY_INSTR);
+    ASSERT_THAT(CT.Instructions, SizeIs(2));
+    const InstructionTemplate &IT = CT.Instructions[0];
+    EXPECT_THAT(IT.getOpcode(), Opcode);
+    ASSERT_THAT(IT.VariableValues, SizeIs(0));
+  }
 }
 
 TEST_F(UopsSnippetGeneratorTest, ParallelInstruction) {
-  // BNDCL32rr is parallel no matter what.
-
-  // explicit use 0       : reg RegClass=BNDR
-  // explicit use 1       : reg RegClass=GR32
-
+  // - BNDCL32rr
+  // - Op0 Explicit Use RegClass(BNDR)
+  // - Op1 Explicit Use RegClass(GR32)
+  // - Var0 [Op0]
+  // - Var1 [Op1]
   const unsigned Opcode = llvm::X86::BNDCL32rr;
-  const CodeTemplate CT = checkAndGetCodeTemplate(Opcode);
+  const auto CodeTemplates = checkAndGetCodeTemplates(Opcode);
+  ASSERT_THAT(CodeTemplates, SizeIs(1));
+  const auto &CT = CodeTemplates[0];
   EXPECT_THAT(CT.Info, HasSubstr("parallel"));
+  EXPECT_THAT(CT.Execution, ExecutionMode::UNKNOWN);
   ASSERT_THAT(CT.Instructions, SizeIs(1));
   const InstructionTemplate &IT = CT.Instructions[0];
   EXPECT_THAT(IT.getOpcode(), Opcode);
@@ -163,14 +208,18 @@ TEST_F(UopsSnippetGeneratorTest, ParallelInstruction) {
 }
 
 TEST_F(UopsSnippetGeneratorTest, SerialInstruction) {
-  // CDQ is serial no matter what.
-
-  // implicit def         : EAX
-  // implicit def         : EDX
-  // implicit use         : EAX
+  // - CDQ
+  // - Op0 Implicit Def Reg(EAX)
+  // - Op1 Implicit Def Reg(EDX)
+  // - Op2 Implicit Use Reg(EAX)
+  // - hasAliasingImplicitRegisters (execution is always serial)
+  // - hasAliasingRegisters
   const unsigned Opcode = llvm::X86::CDQ;
-  const CodeTemplate CT = checkAndGetCodeTemplate(Opcode);
+  const auto CodeTemplates = checkAndGetCodeTemplates(Opcode);
+  ASSERT_THAT(CodeTemplates, SizeIs(1));
+  const auto &CT = CodeTemplates[0];
   EXPECT_THAT(CT.Info, HasSubstr("serial"));
+  EXPECT_THAT(CT.Execution, ExecutionMode::UNKNOWN);
   ASSERT_THAT(CT.Instructions, SizeIs(1));
   const InstructionTemplate &IT = CT.Instructions[0];
   EXPECT_THAT(IT.getOpcode(), Opcode);
@@ -181,13 +230,21 @@ TEST_F(UopsSnippetGeneratorTest, StaticRenaming) {
   // CMOVA32rr has tied variables, we enumerate the possible values to execute
   // as many in parallel as possible.
 
-  // explicit def 0       : reg RegClass=GR32
-  // explicit use 1       : reg RegClass=GR32 | TIED_TO:0
-  // explicit use 2       : reg RegClass=GR32
-  // implicit use         : EFLAGS
+  // - CMOVA32rr
+  // - Op0 Explicit Def RegClass(GR32)
+  // - Op1 Explicit Use RegClass(GR32) TiedToOp0
+  // - Op2 Explicit Use RegClass(GR32)
+  // - Op3 Implicit Use Reg(EFLAGS)
+  // - Var0 [Op0,Op1]
+  // - Var1 [Op2]
+  // - hasTiedRegisters (execution is always serial)
+  // - hasAliasingRegisters
   const unsigned Opcode = llvm::X86::CMOVA32rr;
-  const CodeTemplate CT = checkAndGetCodeTemplate(Opcode);
+  const auto CodeTemplates = checkAndGetCodeTemplates(Opcode);
+  ASSERT_THAT(CodeTemplates, SizeIs(1));
+  const auto &CT = CodeTemplates[0];
   EXPECT_THAT(CT.Info, HasSubstr("static renaming"));
+  EXPECT_THAT(CT.Execution, ExecutionMode::UNKNOWN);
   constexpr const unsigned kInstructionCount = 15;
   ASSERT_THAT(CT.Instructions, SizeIs(kInstructionCount));
   std::unordered_set<unsigned> AllDefRegisters;
@@ -203,14 +260,23 @@ TEST_F(UopsSnippetGeneratorTest, NoTiedVariables) {
   // CMOV_GR32 has no tied variables, we make sure def and use are different
   // from each other.
 
-  // explicit def 0       : reg RegClass=GR32
-  // explicit use 1       : reg RegClass=GR32
-  // explicit use 2       : reg RegClass=GR32
-  // explicit use 3       : imm
-  // implicit use         : EFLAGS
+  // - CMOV_GR32
+  // - Op0 Explicit Def RegClass(GR32)
+  // - Op1 Explicit Use RegClass(GR32)
+  // - Op2 Explicit Use RegClass(GR32)
+  // - Op3 Explicit Use Immediate
+  // - Op4 Implicit Use Reg(EFLAGS)
+  // - Var0 [Op0]
+  // - Var1 [Op1]
+  // - Var2 [Op2]
+  // - Var3 [Op3]
+  // - hasAliasingRegisters
   const unsigned Opcode = llvm::X86::CMOV_GR32;
-  const CodeTemplate CT = checkAndGetCodeTemplate(Opcode);
+  const auto CodeTemplates = checkAndGetCodeTemplates(Opcode);
+  ASSERT_THAT(CodeTemplates, SizeIs(1));
+  const auto &CT = CodeTemplates[0];
   EXPECT_THAT(CT.Info, HasSubstr("no tied variables"));
+  EXPECT_THAT(CT.Execution, ExecutionMode::UNKNOWN);
   ASSERT_THAT(CT.Instructions, SizeIs(1));
   const InstructionTemplate &IT = CT.Instructions[0];
   EXPECT_THAT(IT.getOpcode(), Opcode);
@@ -224,9 +290,27 @@ TEST_F(UopsSnippetGeneratorTest, NoTiedVariables) {
 
 TEST_F(UopsSnippetGeneratorTest, MemoryUse) {
   // Mov32rm reads from memory.
+  // - MOV32rm
+  // - Op0 Explicit Def RegClass(GR32)
+  // - Op1 Explicit Use Memory RegClass(GR8)
+  // - Op2 Explicit Use Memory
+  // - Op3 Explicit Use Memory RegClass(GRH8)
+  // - Op4 Explicit Use Memory
+  // - Op5 Explicit Use Memory RegClass(SEGMENT_REG)
+  // - Var0 [Op0]
+  // - Var1 [Op1]
+  // - Var2 [Op2]
+  // - Var3 [Op3]
+  // - Var4 [Op4]
+  // - Var5 [Op5]
+  // - hasMemoryOperands
+  // - hasAliasingRegisters
   const unsigned Opcode = llvm::X86::MOV32rm;
-  const CodeTemplate CT = checkAndGetCodeTemplate(Opcode);
+  const auto CodeTemplates = checkAndGetCodeTemplates(Opcode);
+  ASSERT_THAT(CodeTemplates, SizeIs(1));
+  const auto &CT = CodeTemplates[0];
   EXPECT_THAT(CT.Info, HasSubstr("no tied variables"));
+  EXPECT_THAT(CT.Execution, ExecutionMode::UNKNOWN);
   ASSERT_THAT(CT.Instructions,
               SizeIs(UopsSnippetGenerator::kMinNumDifferentAddresses));
   const InstructionTemplate &IT = CT.Instructions[0];
@@ -240,6 +324,21 @@ TEST_F(UopsSnippetGeneratorTest, MemoryUse) {
 
 TEST_F(UopsSnippetGeneratorTest, MemoryUse_Movsb) {
   // MOVSB writes to scratch memory register.
+  // - MOVSB
+  // - Op0 Explicit Use Memory RegClass(GR8)
+  // - Op1 Explicit Use Memory RegClass(GR8)
+  // - Op2 Explicit Use Memory RegClass(SEGMENT_REG)
+  // - Op3 Implicit Def Reg(EDI)
+  // - Op4 Implicit Def Reg(ESI)
+  // - Op5 Implicit Use Reg(EDI)
+  // - Op6 Implicit Use Reg(ESI)
+  // - Op7 Implicit Use Reg(DF)
+  // - Var0 [Op0]
+  // - Var1 [Op1]
+  // - Var2 [Op2]
+  // - hasMemoryOperands
+  // - hasAliasingImplicitRegisters (execution is always serial)
+  // - hasAliasingRegisters
   const unsigned Opcode = llvm::X86::MOVSB;
   const Instruction Instr(State, Opcode);
   auto Error = Generator.generateCodeTemplates(Instr).takeError();
-- 
GitLab


From 2a8a161371d5c55dff1e664cd070de2165f23b92 Mon Sep 17 00:00:00 2001
From: Guillaume Chatelet <gchatelet@google.com>
Date: Wed, 17 Oct 2018 12:09:21 +0000
Subject: [PATCH 0276/1116] BuildBot fix, compiler complains about array decay
 to pointer

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344690 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-exegesis/lib/CodeTemplate.cpp | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/tools/llvm-exegesis/lib/CodeTemplate.cpp b/tools/llvm-exegesis/lib/CodeTemplate.cpp
index df9d18b94bb..614d4749b1f 100644
--- a/tools/llvm-exegesis/lib/CodeTemplate.cpp
+++ b/tools/llvm-exegesis/lib/CodeTemplate.cpp
@@ -92,18 +92,17 @@ llvm::StringRef getName(ExecutionMode Bit) {
   llvm_unreachable("Missing enum case");
 }
 
-static const ExecutionMode kAllExecutionModeBits[] = {
-    ExecutionMode::ALWAYS_SERIAL_IMPLICIT_REGS_ALIAS,
-    ExecutionMode::ALWAYS_SERIAL_TIED_REGS_ALIAS,
-    ExecutionMode::SERIAL_VIA_MEMORY_INSTR,
-    ExecutionMode::SERIAL_VIA_EXPLICIT_REGS,
-    ExecutionMode::SERIAL_VIA_NON_MEMORY_INSTR,
-    ExecutionMode::ALWAYS_PARALLEL_MISSING_USE_OR_DEF,
-    ExecutionMode::PARALLEL_VIA_EXPLICIT_REGS,
-};
-
 llvm::ArrayRef<ExecutionMode> getAllExecutionBits() {
-  return kAllExecutionModeBits;
+  static const ExecutionMode kAllExecutionModeBits[] = {
+      ExecutionMode::ALWAYS_SERIAL_IMPLICIT_REGS_ALIAS,
+      ExecutionMode::ALWAYS_SERIAL_TIED_REGS_ALIAS,
+      ExecutionMode::SERIAL_VIA_MEMORY_INSTR,
+      ExecutionMode::SERIAL_VIA_EXPLICIT_REGS,
+      ExecutionMode::SERIAL_VIA_NON_MEMORY_INSTR,
+      ExecutionMode::ALWAYS_PARALLEL_MISSING_USE_OR_DEF,
+      ExecutionMode::PARALLEL_VIA_EXPLICIT_REGS,
+  };
+  return llvm::makeArrayRef(kAllExecutionModeBits);
 }
 
 llvm::SmallVector<ExecutionMode, 4>
-- 
GitLab


From e20d360976af48e5d2af4a9f4662afea5cb71d50 Mon Sep 17 00:00:00 2001
From: Nicolai Haehnle <nhaehnle@gmail.com>
Date: Wed, 17 Oct 2018 12:14:26 +0000
Subject: [PATCH 0277/1116] AMDGPU: Remove dead TableGen code

Summary: Change-Id: Ic1f2c1d0cf9e90a0baa9fc6bacd0d3c386069fb0

Reviewers: tpr

Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, t-tye, llvm-commits

Differential Revision: https://reviews.llvm.org/D53318

Change-Id: Ib4d143c898801e5cf6cb9999a495d62c91ae77fb

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344691 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/BUFInstructions.td | 2 --
 1 file changed, 2 deletions(-)

diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td
index 7b2dc3494ab..18a59729faa 100644
--- a/lib/Target/AMDGPU/BUFInstructions.td
+++ b/lib/Target/AMDGPU/BUFInstructions.td
@@ -17,8 +17,6 @@ def MUBUFScratchOffset : ComplexPattern<i64, 3, "SelectMUBUFScratchOffset", [],
 def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">;
 def MUBUFOffsetNoGLC : ComplexPattern<i64, 3, "SelectMUBUFOffset">;
 def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">;
-def MUBUFIntrinsicOffset : ComplexPattern<i32, 2, "SelectMUBUFIntrinsicOffset">;
-def MUBUFIntrinsicVOffset : ComplexPattern<i32, 3, "SelectMUBUFIntrinsicVOffset">;
 
 class MubufLoad <SDPatternOperator op> : PatFrag <
   (ops node:$ptr), (op node:$ptr), [{
-- 
GitLab


From d259ba70025f6c87f9e01d9adadc7da208f9bed2 Mon Sep 17 00:00:00 2001
From: Guillaume Chatelet <gchatelet@google.com>
Date: Wed, 17 Oct 2018 12:27:46 +0000
Subject: [PATCH 0278/1116] Fix uninitialized variable

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344692 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-exegesis/lib/Latency.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/llvm-exegesis/lib/Latency.cpp b/tools/llvm-exegesis/lib/Latency.cpp
index 7b991a452aa..0dd84cc0958 100644
--- a/tools/llvm-exegesis/lib/Latency.cpp
+++ b/tools/llvm-exegesis/lib/Latency.cpp
@@ -61,7 +61,7 @@ computeAliasingInstructions(const LLVMState &State, const Instruction &Instr,
 }
 
 static ExecutionMode getExecutionModes(const Instruction &Instr) {
-  ExecutionMode EM;
+  ExecutionMode EM = ExecutionMode::UNKNOWN;
   if (Instr.hasAliasingImplicitRegisters())
     EM |= ExecutionMode::ALWAYS_SERIAL_IMPLICIT_REGS_ALIAS;
   if (Instr.hasTiedRegisters())
-- 
GitLab


From 0cb92ac202acd34252952c9c04b89d7cee74cfe2 Mon Sep 17 00:00:00 2001
From: Sam Parker <sam.parker@arm.com>
Date: Wed, 17 Oct 2018 13:02:48 +0000
Subject: [PATCH 0279/1116] [ARM] bottom-top mul support in ARMParallelDSP

Previously reverted in rL343082.

Original commit message:

On failing to find sequences that can be converted into dual macs,
try to find sequential 16-bit loads that are used by muls which we
can then use smultb, smulbt, smultt with a wide load.

Differential Revision: https://reviews.llvm.org/D51983


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344693 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/ARM/ARMParallelDSP.cpp             | 221 +++++++++++++--
 test/CodeGen/ARM/{ => ParallelDSP}/smlad0.ll  |   0
 test/CodeGen/ARM/{ => ParallelDSP}/smlad1.ll  |   0
 test/CodeGen/ARM/{ => ParallelDSP}/smlad10.ll |   0
 test/CodeGen/ARM/{ => ParallelDSP}/smlad11.ll |   0
 test/CodeGen/ARM/{ => ParallelDSP}/smlad12.ll |   0
 test/CodeGen/ARM/{ => ParallelDSP}/smlad2.ll  |   0
 test/CodeGen/ARM/{ => ParallelDSP}/smlad3.ll  |   0
 test/CodeGen/ARM/{ => ParallelDSP}/smlad4.ll  |   0
 test/CodeGen/ARM/{ => ParallelDSP}/smlad5.ll  |   0
 test/CodeGen/ARM/{ => ParallelDSP}/smlad6.ll  |   0
 test/CodeGen/ARM/{ => ParallelDSP}/smlad7.ll  |   0
 test/CodeGen/ARM/{ => ParallelDSP}/smlad8.ll  |   0
 test/CodeGen/ARM/{ => ParallelDSP}/smlad9.ll  |   0
 .../CodeGen/ARM/{ => ParallelDSP}/smladx-1.ll |   0
 test/CodeGen/ARM/{ => ParallelDSP}/smlald0.ll |   0
 test/CodeGen/ARM/{ => ParallelDSP}/smlald1.ll |   0
 test/CodeGen/ARM/{ => ParallelDSP}/smlald2.ll |   0
 .../ARM/{ => ParallelDSP}/smlaldx-1.ll        |   0
 .../ARM/{ => ParallelDSP}/smlaldx-2.ll        |   0
 .../ARM/ParallelDSP/top-bottom-multi-use.ll   |  74 +++++
 .../ARM/ParallelDSP/top-bottom-neg-vec.ll     |  98 +++++++
 .../CodeGen/ARM/ParallelDSP/top-bottom-neg.ll | 210 +++++++++++++++
 .../ARM/ParallelDSP/top-bottom-order.ll       |  54 ++++
 test/CodeGen/ARM/ParallelDSP/top-bottom.ll    | 252 ++++++++++++++++++
 25 files changed, 882 insertions(+), 27 deletions(-)
 rename test/CodeGen/ARM/{ => ParallelDSP}/smlad0.ll (100%)
 rename test/CodeGen/ARM/{ => ParallelDSP}/smlad1.ll (100%)
 rename test/CodeGen/ARM/{ => ParallelDSP}/smlad10.ll (100%)
 rename test/CodeGen/ARM/{ => ParallelDSP}/smlad11.ll (100%)
 rename test/CodeGen/ARM/{ => ParallelDSP}/smlad12.ll (100%)
 rename test/CodeGen/ARM/{ => ParallelDSP}/smlad2.ll (100%)
 rename test/CodeGen/ARM/{ => ParallelDSP}/smlad3.ll (100%)
 rename test/CodeGen/ARM/{ => ParallelDSP}/smlad4.ll (100%)
 rename test/CodeGen/ARM/{ => ParallelDSP}/smlad5.ll (100%)
 rename test/CodeGen/ARM/{ => ParallelDSP}/smlad6.ll (100%)
 rename test/CodeGen/ARM/{ => ParallelDSP}/smlad7.ll (100%)
 rename test/CodeGen/ARM/{ => ParallelDSP}/smlad8.ll (100%)
 rename test/CodeGen/ARM/{ => ParallelDSP}/smlad9.ll (100%)
 rename test/CodeGen/ARM/{ => ParallelDSP}/smladx-1.ll (100%)
 rename test/CodeGen/ARM/{ => ParallelDSP}/smlald0.ll (100%)
 rename test/CodeGen/ARM/{ => ParallelDSP}/smlald1.ll (100%)
 rename test/CodeGen/ARM/{ => ParallelDSP}/smlald2.ll (100%)
 rename test/CodeGen/ARM/{ => ParallelDSP}/smlaldx-1.ll (100%)
 rename test/CodeGen/ARM/{ => ParallelDSP}/smlaldx-2.ll (100%)
 create mode 100644 test/CodeGen/ARM/ParallelDSP/top-bottom-multi-use.ll
 create mode 100644 test/CodeGen/ARM/ParallelDSP/top-bottom-neg-vec.ll
 create mode 100644 test/CodeGen/ARM/ParallelDSP/top-bottom-neg.ll
 create mode 100644 test/CodeGen/ARM/ParallelDSP/top-bottom-order.ll
 create mode 100644 test/CodeGen/ARM/ParallelDSP/top-bottom.ll

diff --git a/lib/Target/ARM/ARMParallelDSP.cpp b/lib/Target/ARM/ARMParallelDSP.cpp
index 3ab9298c110..e5f6a61852e 100644
--- a/lib/Target/ARM/ARMParallelDSP.cpp
+++ b/lib/Target/ARM/ARMParallelDSP.cpp
@@ -55,6 +55,7 @@ namespace {
   using ReductionList   = SmallVector<Reduction, 8>;
   using ValueList       = SmallVector<Value*, 8>;
   using MemInstList     = SmallVector<Instruction*, 8>;
+  using LoadInstList    = SmallVector<LoadInst*, 8>;
   using PMACPair        = std::pair<BinOpChain*,BinOpChain*>;
   using PMACPairList    = SmallVector<PMACPair, 8>;
   using Instructions    = SmallVector<Instruction*,16>;
@@ -63,7 +64,8 @@ namespace {
   struct OpChain {
     Instruction   *Root;
     ValueList     AllValues;
-    MemInstList   VecLd;    // List of all load instructions.
+    MemInstList   VecLd;    // List of all sequential load instructions.
+    LoadInstList  Loads;    // List of all load instructions.
     MemLocList    MemLocs;  // All memory locations read by this tree.
     bool          ReadOnly = true;
 
@@ -76,8 +78,10 @@ namespace {
         if (auto *I = dyn_cast<Instruction>(V)) {
           if (I->mayWriteToMemory())
             ReadOnly = false;
-          if (auto *Ld = dyn_cast<LoadInst>(V))
+          if (auto *Ld = dyn_cast<LoadInst>(V)) {
             MemLocs.push_back(MemoryLocation(Ld->getPointerOperand(), Size));
+            Loads.push_back(Ld);
+          }
         }
       }
     }
@@ -135,6 +139,7 @@ namespace {
     /// exchange the halfwords of the second operand before performing the
     /// arithmetic.
     bool MatchSMLAD(Function &F);
+    bool MatchTopBottomMuls(BasicBlock *LoopBody);
 
   public:
     static char ID;
@@ -203,6 +208,8 @@ namespace {
       LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
       LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");
       Changes = MatchSMLAD(F);
+      if (!Changes)
+        Changes = MatchTopBottomMuls(Header);
       return Changes;
     }
   };
@@ -496,10 +503,10 @@ static void MatchReductions(Function &F, Loop *TheLoop, BasicBlock *Header,
   );
 }
 
-static void AddMACCandidate(OpChainList &Candidates,
+static void AddMulCandidate(OpChainList &Candidates,
                             Instruction *Mul,
                             Value *MulOp0, Value *MulOp1) {
-  LLVM_DEBUG(dbgs() << "OK, found acc mul:\t"; Mul->dump());
+  LLVM_DEBUG(dbgs() << "OK, found mul:\t"; Mul->dump());
   assert(Mul->getOpcode() == Instruction::Mul &&
          "expected mul instruction");
   ValueList LHS;
@@ -533,14 +540,14 @@ static void MatchParallelMACSequences(Reduction &R,
       break;
     case Instruction::Mul:
       if (match (I, (m_Mul(m_Value(MulOp0), m_Value(MulOp1))))) {
-        AddMACCandidate(Candidates, I, MulOp0, MulOp1);
+        AddMulCandidate(Candidates, I, MulOp0, MulOp1);
         return false;
       }
       break;
     case Instruction::SExt:
       if (match (I, (m_SExt(m_Mul(m_Value(MulOp0), m_Value(MulOp1)))))) {
         Instruction *Mul = cast<Instruction>(I->getOperand(0));
-        AddMACCandidate(Candidates, Mul, MulOp0, MulOp1);
+        AddMulCandidate(Candidates, Mul, MulOp0, MulOp1);
         return false;
       }
       break;
@@ -569,23 +576,24 @@ static void AliasCandidates(BasicBlock *Header, Instructions &Reads,
 // the memory locations accessed by the MAC-chains.
 // TODO: we need the read statements when we accept more complicated chains.
 static bool AreAliased(AliasAnalysis *AA, Instructions &Reads,
-                       Instructions &Writes, OpChainList &MACCandidates) {
+                       Instructions &Writes, OpChainList &Candidates) {
   LLVM_DEBUG(dbgs() << "Alias checks:\n");
-  for (auto &MAC : MACCandidates) {
-    LLVM_DEBUG(dbgs() << "mul: "; MAC->Root->dump());
+  for (auto &Candidate : Candidates) {
+    LLVM_DEBUG(dbgs() << "mul: "; Candidate->Root->dump());
+    Candidate->SetMemoryLocations();
 
     // At the moment, we allow only simple chains that only consist of reads,
     // accumulate their result with an integer add, and thus that don't write
     // memory, and simply bail if they do.
-    if (!MAC->ReadOnly)
+    if (!Candidate->ReadOnly)
       return true;
 
     // Now for all writes in the basic block, check that they don't alias with
     // the memory locations accessed by our MAC-chain:
     for (auto *I : Writes) {
       LLVM_DEBUG(dbgs() << "- "; I->dump());
-      assert(MAC->MemLocs.size() >= 2 && "expecting at least 2 memlocs");
-      for (auto &MemLoc : MAC->MemLocs) {
+      assert(Candidate->MemLocs.size() >= 2 && "expecting at least 2 memlocs");
+      for (auto &MemLoc : Candidate->MemLocs) {
         if (isModOrRefSet(intersectModRef(AA->getModRefInfo(I, MemLoc),
                                           ModRefInfo::ModRef))) {
           LLVM_DEBUG(dbgs() << "Yes, aliases found\n");
@@ -599,7 +607,7 @@ static bool AreAliased(AliasAnalysis *AA, Instructions &Reads,
   return false;
 }
 
-static bool CheckMACMemory(OpChainList &Candidates) {
+static bool CheckMulMemory(OpChainList &Candidates) {
   for (auto &C : Candidates) {
     // A mul has 2 operands, and a narrow op consist of sext and a load; thus
     // we expect at least 4 items in this operand value list.
@@ -607,7 +615,6 @@ static bool CheckMACMemory(OpChainList &Candidates) {
       LLVM_DEBUG(dbgs() << "Operand list too short.\n");
       return false;
     }
-    C->SetMemoryLocations();
     ValueList &LHS = static_cast<BinOpChain*>(C.get())->LHS;
     ValueList &RHS = static_cast<BinOpChain*>(C.get())->RHS;
 
@@ -620,6 +627,173 @@ static bool CheckMACMemory(OpChainList &Candidates) {
   return true;
 }
 
+static LoadInst *CreateLoadIns(IRBuilder<NoFolder> &IRB, LoadInst *BaseLoad,
+                               const Type *LoadTy) {
+  const unsigned AddrSpace = BaseLoad->getPointerAddressSpace();
+ 
+  Value *VecPtr = IRB.CreateBitCast(BaseLoad->getPointerOperand(),
+                                     LoadTy->getPointerTo(AddrSpace));
+  return IRB.CreateAlignedLoad(VecPtr, BaseLoad->getAlignment());
+}
+
+/// Given two instructions, return the one that comes first in the basic block.
+/// A work around for not being able to do > or < on bb iterators.
+static Instruction* GetFirst(Instruction *A, Instruction *B) {
+  BasicBlock::iterator First(A);
+  BasicBlock::iterator Second(B);
+
+  BasicBlock *BB = A->getParent();
+  assert(BB == B->getParent() &&
+         "Can't compare instructions in different blocks");
+  BasicBlock::iterator Last = BB->end();
+
+  // Iterate through the block, if the 'First' iterator is found, then return
+  // Second.
+  while (Second != Last) {
+    if (Second == First)
+      return B;
+    ++Second;
+  }
+  return A;
+}
+
+/// Attempt to widen loads and use smulbb, smulbt, smultb and smultt muls.
+// TODO: This, like smlad generation, expects the leave operands to be loads
+// that are sign extended. We should be able to handle scalar values as well
+// performing these muls on word x half types to generate smulwb and smulwt.
+bool ARMParallelDSP::MatchTopBottomMuls(BasicBlock *LoopBody) {
+  LLVM_DEBUG(dbgs() << "Attempting to find BT|TB muls.\n");
+
+  OpChainList Candidates;
+  for (auto &I : *LoopBody) {
+    if (I.getOpcode() == Instruction::Mul) {
+      Type *Ty = I.getType();
+      if (Ty->isIntegerTy() &&
+          (Ty->getScalarSizeInBits() == 32 ||
+           Ty->getScalarSizeInBits() == 64))
+      AddMulCandidate(Candidates, &I, I.getOperand(0), I.getOperand(1));
+    }
+  }
+
+  if (Candidates.empty())
+    return false;
+
+  Instructions Reads;
+  Instructions Writes;
+  AliasCandidates(LoopBody, Reads, Writes);
+
+  if (AreAliased(AA, Reads, Writes, Candidates))
+    return false;
+
+  DenseMap<LoadInst*, LoadInst*> SeqLoads;
+  SmallPtrSet<LoadInst*, 8> OffsetLoads;
+
+  for (unsigned i = 0; i < Candidates.size(); ++i) {
+    for (unsigned j = 0; j < Candidates.size(); ++j) {
+      if (i == j)
+        continue;
+
+      OpChain *MulChain0 = Candidates[i].get();
+      OpChain *MulChain1 = Candidates[j].get();
+
+      for (auto *Ld0 : MulChain0->Loads) {
+        if (SeqLoads.count(Ld0) || OffsetLoads.count(Ld0))
+          continue;
+
+        for (auto *Ld1 : MulChain1->Loads) {
+          if (SeqLoads.count(Ld1) || OffsetLoads.count(Ld1))
+            continue;
+
+          MemInstList VecMem;
+          if (AreSequentialLoads(Ld0, Ld1, VecMem)) {
+            SeqLoads[Ld0] = Ld1;
+            OffsetLoads.insert(Ld1);
+          }
+        }
+      }
+    }
+  }
+
+  if (SeqLoads.empty())
+    return false;
+
+  IRBuilder<NoFolder> IRB(LoopBody);
+  const Type *Ty = IntegerType::get(M->getContext(), 32);
+
+  auto IsUserMul = [](Use &U) {
+    auto *Mul = cast<Instruction>(U.getUser());
+    return Mul->getOpcode() == Instruction::Mul;
+  };
+
+  LLVM_DEBUG(dbgs() << "Found some sequential loads, now widening:\n");
+  for (auto &Pair : SeqLoads) {
+    LoadInst *BaseLd = Pair.first;
+    LoadInst *OffsetLd = Pair.second;
+
+    // Check that all the base users are muls.
+    auto *BaseSExt = cast<Instruction>(BaseLd->user_back());
+    for (Use &U : BaseSExt->uses()) {
+      if (!IsUserMul(U))
+        return false;
+    }
+
+    // Check that all the offset users are muls.
+    // TODO We exit early on finding a sext user which isn't a mul, but many
+    // arm instructions would be able to perform the necessary shift too.
+    auto *OffsetSExt = cast<Instruction>(OffsetLd->user_back());
+    for (Use &U : OffsetSExt->uses()) {
+      if (!IsUserMul(U))
+        return false;
+    }
+
+    LLVM_DEBUG(dbgs() << " - with base load: " << *BaseLd << "\n");
+    LLVM_DEBUG(dbgs() << " - with offset load: " << *OffsetLd << "\n");
+    Instruction *InsertPt = GetFirst(BaseLd, OffsetLd);
+    IRB.SetInsertPoint(InsertPt);
+    LoadInst *WideLd = CreateLoadIns(IRB, BaseLd, Ty);
+    LLVM_DEBUG(dbgs() << " - created wide load: " << *WideLd << "\n");
+
+    // Move the pointer operands before their users.
+    std::function<void(Instruction*, Instruction*)> MoveBefore =
+      [&MoveBefore](Instruction *Source, Instruction *Sink) -> void {
+      Source->moveBefore(Sink);
+      for (Use &U : Source->operands()) {
+        Value *Op = U.get();
+        if (auto *I = dyn_cast<Instruction>(Op)) {
+          if (isa<PHINode>(I) || I->getParent() != Source->getParent())
+            continue;
+          MoveBefore(I, Source);
+        }
+      }
+    };
+
+    // If we're inserting the load before BaseLd, we probably need to move the
+    // the pointer operand too. This operand is cast to an i32* in
+    // CreateLoadIns.
+    if (InsertPt != BaseLd) {
+      if (auto *GEP = dyn_cast<GetElementPtrInst>(BaseLd->getPointerOperand()))
+        MoveBefore(GEP, cast<Instruction>(WideLd->getPointerOperand()));
+    }
+
+    // BaseUser needs to: (asr (shl WideLoad, 16), 16)
+    // OffsetUser needs to: (asr WideLoad, 16)
+    auto *Top = cast<Instruction>(IRB.CreateAShr(WideLd, 16));
+    auto *Shl = cast<Instruction>(IRB.CreateShl(WideLd, 16));
+    auto *Bottom = cast<Instruction>(IRB.CreateAShr(Shl, 16));
+
+    BaseSExt->replaceAllUsesWith(Bottom);
+    OffsetSExt->replaceAllUsesWith(Top);
+
+    BaseSExt->eraseFromParent();
+    OffsetSExt->eraseFromParent();
+    BaseLd->eraseFromParent();
+    OffsetLd->eraseFromParent();
+  }
+  LLVM_DEBUG(dbgs() << "Block after top bottom mul replacements:\n"
+             << *LoopBody << "\n");
+  return true;
+}
+
 // Loop Pass that needs to identify integer add/sub reductions of 16-bit vector
 // multiplications.
 // To use SMLAD:
@@ -658,14 +832,15 @@ bool ARMParallelDSP::MatchSMLAD(Function &F) {
              dbgs() << "Header block:\n"; Header->dump();
              dbgs() << "Loop info:\n\n"; L->dump());
 
-  bool Changed = false;
   ReductionList Reductions;
   MatchReductions(F, L, Header, Reductions);
+  if (Reductions.empty())
+    return false;
 
   for (auto &R : Reductions) {
     OpChainList MACCandidates;
     MatchParallelMACSequences(R, MACCandidates);
-    if (!CheckMACMemory(MACCandidates))
+    if (!CheckMulMemory(MACCandidates))
       continue;
 
     R.MACCandidates = std::move(MACCandidates);
@@ -682,6 +857,7 @@ bool ARMParallelDSP::MatchSMLAD(Function &F) {
   Instructions Reads, Writes;
   AliasCandidates(Header, Reads, Writes);
 
+  bool Changed = false;
   for (auto &R : Reductions) {
     if (AreAliased(AA, Reads, Writes, R.MACCandidates))
       return false;
@@ -693,15 +869,6 @@ bool ARMParallelDSP::MatchSMLAD(Function &F) {
   return Changed;
 }
 
-static LoadInst *CreateLoadIns(IRBuilder<NoFolder> &IRB, LoadInst &BaseLoad,
-                               const Type *LoadTy) {
-  const unsigned AddrSpace = BaseLoad.getPointerAddressSpace();
-
-  Value *VecPtr = IRB.CreateBitCast(BaseLoad.getPointerOperand(),
-                                    LoadTy->getPointerTo(AddrSpace));
-  return IRB.CreateAlignedLoad(VecPtr, BaseLoad.getAlignment());
-}
-
 Instruction *ARMParallelDSP::CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
                                              Instruction *Acc, bool Exchange,
                                              Instruction *InsertAfter) {
@@ -716,8 +883,8 @@ Instruction *ARMParallelDSP::CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
 
   // Replace the reduction chain with an intrinsic call
   const Type *Ty = IntegerType::get(M->getContext(), 32);
-  LoadInst *NewLd0 = CreateLoadIns(Builder, VecLd0[0], Ty);
-  LoadInst *NewLd1 = CreateLoadIns(Builder, VecLd1[0], Ty);
+  LoadInst *NewLd0 = CreateLoadIns(Builder, &VecLd0[0], Ty);
+  LoadInst *NewLd1 = CreateLoadIns(Builder, &VecLd1[0], Ty);
   Value* Args[] = { NewLd0, NewLd1, Acc };
   Function *SMLAD = nullptr;
   if (Exchange)
diff --git a/test/CodeGen/ARM/smlad0.ll b/test/CodeGen/ARM/ParallelDSP/smlad0.ll
similarity index 100%
rename from test/CodeGen/ARM/smlad0.ll
rename to test/CodeGen/ARM/ParallelDSP/smlad0.ll
diff --git a/test/CodeGen/ARM/smlad1.ll b/test/CodeGen/ARM/ParallelDSP/smlad1.ll
similarity index 100%
rename from test/CodeGen/ARM/smlad1.ll
rename to test/CodeGen/ARM/ParallelDSP/smlad1.ll
diff --git a/test/CodeGen/ARM/smlad10.ll b/test/CodeGen/ARM/ParallelDSP/smlad10.ll
similarity index 100%
rename from test/CodeGen/ARM/smlad10.ll
rename to test/CodeGen/ARM/ParallelDSP/smlad10.ll
diff --git a/test/CodeGen/ARM/smlad11.ll b/test/CodeGen/ARM/ParallelDSP/smlad11.ll
similarity index 100%
rename from test/CodeGen/ARM/smlad11.ll
rename to test/CodeGen/ARM/ParallelDSP/smlad11.ll
diff --git a/test/CodeGen/ARM/smlad12.ll b/test/CodeGen/ARM/ParallelDSP/smlad12.ll
similarity index 100%
rename from test/CodeGen/ARM/smlad12.ll
rename to test/CodeGen/ARM/ParallelDSP/smlad12.ll
diff --git a/test/CodeGen/ARM/smlad2.ll b/test/CodeGen/ARM/ParallelDSP/smlad2.ll
similarity index 100%
rename from test/CodeGen/ARM/smlad2.ll
rename to test/CodeGen/ARM/ParallelDSP/smlad2.ll
diff --git a/test/CodeGen/ARM/smlad3.ll b/test/CodeGen/ARM/ParallelDSP/smlad3.ll
similarity index 100%
rename from test/CodeGen/ARM/smlad3.ll
rename to test/CodeGen/ARM/ParallelDSP/smlad3.ll
diff --git a/test/CodeGen/ARM/smlad4.ll b/test/CodeGen/ARM/ParallelDSP/smlad4.ll
similarity index 100%
rename from test/CodeGen/ARM/smlad4.ll
rename to test/CodeGen/ARM/ParallelDSP/smlad4.ll
diff --git a/test/CodeGen/ARM/smlad5.ll b/test/CodeGen/ARM/ParallelDSP/smlad5.ll
similarity index 100%
rename from test/CodeGen/ARM/smlad5.ll
rename to test/CodeGen/ARM/ParallelDSP/smlad5.ll
diff --git a/test/CodeGen/ARM/smlad6.ll b/test/CodeGen/ARM/ParallelDSP/smlad6.ll
similarity index 100%
rename from test/CodeGen/ARM/smlad6.ll
rename to test/CodeGen/ARM/ParallelDSP/smlad6.ll
diff --git a/test/CodeGen/ARM/smlad7.ll b/test/CodeGen/ARM/ParallelDSP/smlad7.ll
similarity index 100%
rename from test/CodeGen/ARM/smlad7.ll
rename to test/CodeGen/ARM/ParallelDSP/smlad7.ll
diff --git a/test/CodeGen/ARM/smlad8.ll b/test/CodeGen/ARM/ParallelDSP/smlad8.ll
similarity index 100%
rename from test/CodeGen/ARM/smlad8.ll
rename to test/CodeGen/ARM/ParallelDSP/smlad8.ll
diff --git a/test/CodeGen/ARM/smlad9.ll b/test/CodeGen/ARM/ParallelDSP/smlad9.ll
similarity index 100%
rename from test/CodeGen/ARM/smlad9.ll
rename to test/CodeGen/ARM/ParallelDSP/smlad9.ll
diff --git a/test/CodeGen/ARM/smladx-1.ll b/test/CodeGen/ARM/ParallelDSP/smladx-1.ll
similarity index 100%
rename from test/CodeGen/ARM/smladx-1.ll
rename to test/CodeGen/ARM/ParallelDSP/smladx-1.ll
diff --git a/test/CodeGen/ARM/smlald0.ll b/test/CodeGen/ARM/ParallelDSP/smlald0.ll
similarity index 100%
rename from test/CodeGen/ARM/smlald0.ll
rename to test/CodeGen/ARM/ParallelDSP/smlald0.ll
diff --git a/test/CodeGen/ARM/smlald1.ll b/test/CodeGen/ARM/ParallelDSP/smlald1.ll
similarity index 100%
rename from test/CodeGen/ARM/smlald1.ll
rename to test/CodeGen/ARM/ParallelDSP/smlald1.ll
diff --git a/test/CodeGen/ARM/smlald2.ll b/test/CodeGen/ARM/ParallelDSP/smlald2.ll
similarity index 100%
rename from test/CodeGen/ARM/smlald2.ll
rename to test/CodeGen/ARM/ParallelDSP/smlald2.ll
diff --git a/test/CodeGen/ARM/smlaldx-1.ll b/test/CodeGen/ARM/ParallelDSP/smlaldx-1.ll
similarity index 100%
rename from test/CodeGen/ARM/smlaldx-1.ll
rename to test/CodeGen/ARM/ParallelDSP/smlaldx-1.ll
diff --git a/test/CodeGen/ARM/smlaldx-2.ll b/test/CodeGen/ARM/ParallelDSP/smlaldx-2.ll
similarity index 100%
rename from test/CodeGen/ARM/smlaldx-2.ll
rename to test/CodeGen/ARM/ParallelDSP/smlaldx-2.ll
diff --git a/test/CodeGen/ARM/ParallelDSP/top-bottom-multi-use.ll b/test/CodeGen/ARM/ParallelDSP/top-bottom-multi-use.ll
new file mode 100644
index 00000000000..ed2b3fedbb6
--- /dev/null
+++ b/test/CodeGen/ARM/ParallelDSP/top-bottom-multi-use.ll
@@ -0,0 +1,74 @@
+; RUN: opt -mtriple=thumbv8m.main -mcpu=cortex-m33 -S -arm-parallel-dsp %s -o - | FileCheck %s
+; RUN: opt -mtriple=thumbv7a-linux-android -arm-parallel-dsp -S %s -o - | FileCheck %s
+
+; CHECK-LABEL: sext_multi_use_undef
+define void @sext_multi_use_undef() {
+entry:
+  br label %for.body
+
+for.body:
+  %0 = load i16, i16* undef, align 2
+  %conv3 = sext i16 %0 to i32
+  %1 = load i16, i16* undef, align 2
+  %conv7 = sext i16 %1 to i32
+  %mul8 = mul nsw i32 %conv7, %conv3
+  %x.addr.180 = getelementptr inbounds i16, i16* undef, i32 1
+  %2 = load i16, i16* %x.addr.180, align 2
+  %conv1582 = sext i16 %2 to i32
+  %mul.i7284 = mul nsw i32 %conv7, %conv1582
+  br label %for.body
+}
+
+; CHECK-LABEL: sext_multi_use
+; CHECK: [[PtrA:%[^ ]+]] = bitcast i16* %a to i32*
+; CHECK: [[DataA:%[^ ]+]] = load i32, i32* [[PtrA]], align 2
+; CHECK: [[Top:%[^ ]+]] = ashr i32 [[DataA]], 16
+; CHECK: [[Shl:%[^ ]+]] = shl i32 [[DataA]], 16
+; CHECK: [[Bottom:%[^ ]+]] = ashr i32 [[Shl]], 16
+; CHECK: [[DataB:%[^ ]+]] = load i16, i16* %b, align 2
+; CHECK: [[SextB:%[^ ]+]] = sext i16 [[DataB]] to i32
+; CHECK: [[Mul0:%[^ ]+]] = mul nsw i32 [[SextB]], [[Bottom]]
+; CHECK: [[Mul1:%[^ ]+]] = mul nsw i32 [[SextB]], [[Top]]
+define void @sext_multi_use(i16* %a, i16* %b) {
+entry:
+  br label %for.body
+
+for.body:
+  %0 = load i16, i16* %a, align 2
+  %conv3 = sext i16 %0 to i32
+  %1 = load i16, i16* %b, align 2
+  %conv7 = sext i16 %1 to i32
+  %mul8 = mul nsw i32 %conv7, %conv3
+  %x.addr.180 = getelementptr inbounds i16, i16* %a, i32 1
+  %2 = load i16, i16* %x.addr.180, align 2
+  %conv1582 = sext i16 %2 to i32
+  %mul.i7284 = mul nsw i32 %conv7, %conv1582
+  br label %for.body
+}
+
+; CHECK-LABEL: sext_multi_use_reorder
+; CHECK: [[PtrA:%[^ ]+]] = bitcast i16* %a to i32*
+; CHECK: [[DataA:%[^ ]+]] = load i32, i32* [[PtrA]], align 2
+; CHECK: [[Top:%[^ ]+]] = ashr i32 [[DataA]], 16
+; CHECK: [[Shl:%[^ ]+]] = shl i32 [[DataA]], 16
+; CHECK: [[Bottom:%[^ ]+]] = ashr i32 [[Shl]], 16
+; CHECK: [[Mul0:%[^ ]+]] = mul nsw i32 [[Top]], [[Bottom]]
+; CHECK: [[DataB:%[^ ]+]] = load i16, i16* %b, align 2
+; CHECK: [[SextB:%[^ ]+]] = sext i16 [[DataB]] to i32
+; CHECK: [[Mul1:%[^ ]+]] = mul nsw i32 [[Top]], [[SextB]]
+define void @sext_multi_use_reorder(i16* %a, i16* %b) {
+entry:
+  br label %for.body
+
+for.body:
+  %0 = load i16, i16* %a, align 2
+  %conv3 = sext i16 %0 to i32
+  %x.addr.180 = getelementptr inbounds i16, i16* %a, i32 1
+  %1 = load i16, i16* %x.addr.180, align 2
+  %conv7 = sext i16 %1 to i32
+  %mul8 = mul nsw i32 %conv7, %conv3
+  %2 = load i16, i16* %b, align 2
+  %conv1582 = sext i16 %2 to i32
+  %mul.i7284 = mul nsw i32 %conv7, %conv1582
+  br label %for.body
+}
diff --git a/test/CodeGen/ARM/ParallelDSP/top-bottom-neg-vec.ll b/test/CodeGen/ARM/ParallelDSP/top-bottom-neg-vec.ll
new file mode 100644
index 00000000000..ea60c656a06
--- /dev/null
+++ b/test/CodeGen/ARM/ParallelDSP/top-bottom-neg-vec.ll
@@ -0,0 +1,98 @@
+; RUN: opt -mtriple=thumbv7-unknown-linux-android -arm-parallel-dsp -S %s -o - | FileCheck %s
+
+@a = local_unnamed_addr global i32 0, align 4
+@b = local_unnamed_addr global i8* null, align 4
+@c = local_unnamed_addr global i8 0, align 1
+@d = local_unnamed_addr global i16* null, align 4
+
+; CHECK-LABEL: @convolve
+; CHECK-NOT: bitcast i16* [[ANY:%[^ ]+]] to i32*
+define void @convolve() local_unnamed_addr #0 {
+entry:
+  br label %for.cond
+
+for.cond:
+  %e.0 = phi i32 [ undef, %entry ], [ %e.1.lcssa, %for.end ]
+  %f.0 = phi i32 [ undef, %entry ], [ %f.1.lcssa, %for.end ]
+  %g.0 = phi i32 [ undef, %entry ], [ %g.1.lcssa, %for.end ]
+  %cmp13 = icmp slt i32 %g.0, 1
+  br i1 %cmp13, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+  %0 = load i16*, i16** @d, align 4
+  %1 = load i8*, i8** @b, align 4
+  %2 = load i32, i32* @a, align 4
+  %3 = sub i32 1, %g.0
+  %min.iters.check = icmp ugt i32 %3, 3
+  %ident.check = icmp eq i32 %2, 1
+  %or.cond = and i1 %min.iters.check, %ident.check
+  br i1 %or.cond, label %vector.ph, label %for.body.preheader
+
+vector.ph:
+  %n.vec = and i32 %3, -4
+  %ind.end = add i32 %g.0, %n.vec
+  %4 = mul i32 %2, %n.vec
+  %ind.end20 = add i32 %f.0, %4
+  %5 = insertelement <4 x i32> <i32 undef, i32 0, i32 0, i32 0>, i32 %e.0, i32 0
+  br label %vector.body
+
+vector.body:
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.phi = phi <4 x i32> [ %5, %vector.ph ], [ %14, %vector.body ]
+  %offset.idx = add i32 %g.0, %index
+  %6 = mul i32 %2, %index
+  %offset.idx21 = add i32 %f.0, %6
+  %7 = getelementptr inbounds i16, i16* %0, i32 %offset.idx
+  %8 = bitcast i16* %7 to <4 x i16>*
+  %wide.load = load <4 x i16>, <4 x i16>* %8, align 2
+  %9 = sext <4 x i16> %wide.load to <4 x i32>
+  %10 = getelementptr inbounds i8, i8* %1, i32 %offset.idx21
+  %11 = bitcast i8* %10 to <4 x i8>*
+  %wide.load25 = load <4 x i8>, <4 x i8>* %11, align 1
+  %12 = zext <4 x i8> %wide.load25 to <4 x i32>
+  %13 = mul nsw <4 x i32> %12, %9
+  %14 = add nsw <4 x i32> %13, %vec.phi
+  %index.next = add i32 %index, 4
+  %15 = icmp eq i32 %index.next, %n.vec
+  br i1 %15, label %middle.block, label %vector.body
+
+middle.block:
+  %rdx.shuf = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %bin.rdx = add <4 x i32> %14, %rdx.shuf
+  %rdx.shuf26 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %bin.rdx27 = add <4 x i32> %bin.rdx, %rdx.shuf26
+  %16 = extractelement <4 x i32> %bin.rdx27, i32 0
+  %cmp.n = icmp eq i32 %3, %n.vec
+  br i1 %cmp.n, label %for.end, label %for.body.preheader
+
+for.body.preheader:
+  %g.116.ph = phi i32 [ %g.0, %for.body.lr.ph ], [ %ind.end, %middle.block ]
+  %f.115.ph = phi i32 [ %f.0, %for.body.lr.ph ], [ %ind.end20, %middle.block ]
+  %e.114.ph = phi i32 [ %e.0, %for.body.lr.ph ], [ %16, %middle.block ]
+  br label %for.body
+
+for.body:
+  %g.116 = phi i32 [ %inc, %for.body ], [ %g.116.ph, %for.body.preheader ]
+  %f.115 = phi i32 [ %add4, %for.body ], [ %f.115.ph, %for.body.preheader ]
+  %e.114 = phi i32 [ %add, %for.body ], [ %e.114.ph, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %0, i32 %g.116
+  %17 = load i16, i16* %arrayidx, align 2
+  %conv = sext i16 %17 to i32
+  %arrayidx2 = getelementptr inbounds i8, i8* %1, i32 %f.115
+  %18 = load i8, i8* %arrayidx2, align 1
+  %conv3 = zext i8 %18 to i32
+  %mul = mul nsw i32 %conv3, %conv
+  %add = add nsw i32 %mul, %e.114
+  %inc = add nsw i32 %g.116, 1
+  %add4 = add nsw i32 %2, %f.115
+  %cmp = icmp slt i32 %g.116, 0
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:
+  %e.1.lcssa = phi i32 [ %e.0, %for.cond ], [ %16, %middle.block ], [ %add, %for.body ]
+  %f.1.lcssa = phi i32 [ %f.0, %for.cond ], [ %ind.end20, %middle.block ], [ %add4, %for.body ]
+  %g.1.lcssa = phi i32 [ %g.0, %for.cond ], [ %ind.end, %middle.block ], [ %inc, %for.body ]
+  %conv5 = trunc i32 %e.1.lcssa to i8
+  store i8 %conv5, i8* @c, align 1
+  br label %for.cond
+}
diff --git a/test/CodeGen/ARM/ParallelDSP/top-bottom-neg.ll b/test/CodeGen/ARM/ParallelDSP/top-bottom-neg.ll
new file mode 100644
index 00000000000..0c4aaeee7cc
--- /dev/null
+++ b/test/CodeGen/ARM/ParallelDSP/top-bottom-neg.ll
@@ -0,0 +1,210 @@
+; RUN: opt -mtriple=arm-arm-eabi -mcpu=cortex-m33 < %s -arm-parallel-dsp -S | FileCheck %s
+; RUN: opt -mtriple=thumbv7a-linux-android -arm-parallel-dsp -S %s -o - | FileCheck %s
+
+; CHECK-LABEL: topbottom_mul_alias
+; CHECK-NOT: bitcast i16*
+define void @topbottom_mul_alias(i32 %N, i32* nocapture readnone %Out, i16* nocapture readonly %In1, i16* nocapture readonly %In2) {
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
+  %count = phi i32 [ %N, %entry ], [ %count.next, %for.body ]
+  %PIn1.0 = getelementptr inbounds i16, i16* %In1, i32 %iv
+  %In1.0 = load i16, i16* %PIn1.0, align 2
+  %SIn1.0 = sext i16 %In1.0 to i32
+  %PIn2.0 = getelementptr inbounds i16, i16* %In2, i32 %iv
+  %In2.0 = load i16, i16* %PIn2.0, align 2
+  %SIn2.0 = sext i16 %In2.0 to i32
+  %mul5.us.i.i = mul nsw i32 %SIn1.0, %SIn2.0
+  %Out.0 = getelementptr inbounds i32, i32* %Out, i32 %iv
+  store i32 %mul5.us.i.i, i32* %Out.0, align 4
+  %iv.1 = or i32 %iv, 1
+  %PIn1.1 = getelementptr inbounds i16, i16* %In1, i32 %iv.1
+  %In1.1 = load i16, i16* %PIn1.1, align 2
+  %SIn1.1 = sext i16 %In1.1 to i32
+  %PIn2.1 = getelementptr inbounds i16, i16* %In2, i32 %iv.1
+  %In2.1 = load i16, i16* %PIn2.1, align 2
+  %SIn2.1 = sext i16 %In2.1 to i32
+  %mul5.us.i.1.i = mul nsw i32 %SIn1.1, %SIn2.1
+  %Out.1 = getelementptr inbounds i32, i32* %Out, i32 %iv.1
+  store i32 %mul5.us.i.1.i, i32* %Out.1, align 4
+  %iv.2 = or i32 %iv, 2
+  %PIn1.2 = getelementptr inbounds i16, i16* %In1, i32 %iv.2
+  %In1.2 = load i16, i16* %PIn1.2, align 2
+  %SIn1.2 = sext i16 %In1.2 to i32
+  %PIn2.2 = getelementptr inbounds i16, i16* %In2, i32 %iv.2
+  %In2.2 = load i16, i16* %PIn2.2, align 2
+  %SIn2.2 = sext i16 %In2.2 to i32
+  %mul5.us.i.2.i = mul nsw i32 %SIn1.2, %SIn2.2
+  %Out.2 = getelementptr inbounds i32, i32* %Out, i32 %iv.2
+  store i32 %mul5.us.i.2.i, i32* %Out.2, align 4
+  %iv.3 = or i32 %iv, 3
+  %PIn1.3 = getelementptr inbounds i16, i16* %In1, i32 %iv.3
+  %In1.3 = load i16, i16* %PIn1.3, align 2
+  %SIn1.3 = sext i16 %In1.3 to i32
+  %PIn2.3 = getelementptr inbounds i16, i16* %In2, i32 %iv.3
+  %In2.3 = load i16, i16* %PIn2.3, align 2
+  %SIn2.3 = sext i16 %In2.3 to i32
+  %mul5.us.i.3.i = mul nsw i32 %SIn1.3, %SIn2.3
+  %Out.3 = getelementptr inbounds i32, i32* %Out, i32 %iv.3
+  store i32 %mul5.us.i.3.i, i32* %Out.3, align 4
+  %iv.next = add i32 %iv, 4
+  %count.next = add i32 %count, -4
+  %niter375.ncmp.3.i = icmp eq i32 %count.next, 0
+  br i1 %niter375.ncmp.3.i, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+; TODO: We should be able to handle this by splatting the const value.
+; CHECK-LABEL: topbottom_mul_const
+; CHECK-NOT: bitcast i16*
+define void @topbottom_mul_const(i32 %N, i32* noalias nocapture readnone %Out, i16* nocapture readonly %In, i16 signext %const) {
+entry:
+  %conv4.i.i = sext i16 %const to i32
+  br label %for.body
+
+for.body:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
+  %count = phi i32 [ %N, %entry ], [ %count.next, %for.body ]
+  %PIn.0 = getelementptr inbounds i16, i16* %In, i32 %iv
+  %In.0 = load i16, i16* %PIn.0, align 2
+  %conv.us.i144.i = sext i16 %In.0 to i32
+  %mul5.us.i.i = mul nsw i32 %conv.us.i144.i, %conv4.i.i
+  %Out.0 = getelementptr inbounds i32, i32* %Out, i32 %iv
+  store i32 %mul5.us.i.i, i32* %Out.0, align 4
+  %iv.1 = or i32 %iv, 1
+  %PIn.1 = getelementptr inbounds i16, i16* %In, i32 %iv.1
+  %In.1 = load i16, i16* %PIn.1, align 2
+  %conv.us.i144.1.i = sext i16 %In.1 to i32
+  %mul5.us.i.1.i = mul nsw i32 %conv.us.i144.1.i, %conv4.i.i
+  %Out.1 = getelementptr inbounds i32, i32* %Out, i32 %iv.1
+  store i32 %mul5.us.i.1.i, i32* %Out.1, align 4
+  %iv.2 = or i32 %iv, 2
+  %PIn.2 = getelementptr inbounds i16, i16* %In, i32 %iv.2
+  %In.3 = load i16, i16* %PIn.2, align 2
+  %conv.us.i144.2.i = sext i16 %In.3 to i32
+  %mul5.us.i.2.i = mul nsw i32 %conv.us.i144.2.i, %conv4.i.i
+  %Out.2 = getelementptr inbounds i32, i32* %Out, i32 %iv.2
+  store i32 %mul5.us.i.2.i, i32* %Out.2, align 4
+  %iv.3 = or i32 %iv, 3
+  %PIn.3 = getelementptr inbounds i16, i16* %In, i32 %iv.3
+  %In.4 = load i16, i16* %PIn.3, align 2
+  %conv.us.i144.3.i = sext i16 %In.4 to i32
+  %mul5.us.i.3.i = mul nsw i32 %conv.us.i144.3.i, %conv4.i.i
+  %Out.3 = getelementptr inbounds i32, i32* %Out, i32 %iv.3
+  store i32 %mul5.us.i.3.i, i32* %Out.3, align 4
+  %iv.next = add i32 %iv, 4
+  %count.next = add i32 %count, -4
+  %niter375.ncmp.3.i = icmp eq i32 %count.next, 0
+  br i1 %niter375.ncmp.3.i, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+; TODO: We should be able to handle this and use smulwt and smulwb.
+; CHECK-LABEL: topbottom_mul_word_load_const
+; CHECK-NOT: bitcast i16*
+define void @topbottom_mul_word_load_const(i32 %N, i32* noalias nocapture readnone %Out, i16* nocapture readonly %In, i32* %C) {
+entry:
+  %const = load i32, i32* %C
+  br label %for.body
+
+for.body:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
+  %count = phi i32 [ %N, %entry ], [ %count.next, %for.body ]
+  %PIn.0 = getelementptr inbounds i16, i16* %In, i32 %iv
+  %In.0 = load i16, i16* %PIn.0, align 2
+  %conv.us.i144.i = sext i16 %In.0 to i32
+  %mul5.us.i.i = mul nsw i32 %conv.us.i144.i, %const
+  %Out.0 = getelementptr inbounds i32, i32* %Out, i32 %iv
+  store i32 %mul5.us.i.i, i32* %Out.0, align 4
+  %iv.1 = or i32 %iv, 1
+  %PIn.1 = getelementptr inbounds i16, i16* %In, i32 %iv.1
+  %In.1 = load i16, i16* %PIn.1, align 2
+  %conv.us.i144.1.i = sext i16 %In.1 to i32
+  %mul5.us.i.1.i = mul nsw i32 %conv.us.i144.1.i, %const
+  %Out.1 = getelementptr inbounds i32, i32* %Out, i32 %iv.1
+  store i32 %mul5.us.i.1.i, i32* %Out.1, align 4
+  %iv.2 = or i32 %iv, 2
+  %PIn.2 = getelementptr inbounds i16, i16* %In, i32 %iv.2
+  %In.3 = load i16, i16* %PIn.2, align 2
+  %conv.us.i144.2.i = sext i16 %In.3 to i32
+  %mul5.us.i.2.i = mul nsw i32 %conv.us.i144.2.i, %const
+  %Out.2 = getelementptr inbounds i32, i32* %Out, i32 %iv.2
+  store i32 %mul5.us.i.2.i, i32* %Out.2, align 4
+  %iv.3 = or i32 %iv, 3
+  %PIn.3 = getelementptr inbounds i16, i16* %In, i32 %iv.3
+  %In.4 = load i16, i16* %PIn.3, align 2
+  %conv.us.i144.3.i = sext i16 %In.4 to i32
+  %mul5.us.i.3.i = mul nsw i32 %conv.us.i144.3.i, %const
+  %Out.3 = getelementptr inbounds i32, i32* %Out, i32 %iv.3
+  store i32 %mul5.us.i.3.i, i32* %Out.3, align 4
+  %iv.next = add i32 %iv, 4
+  %count.next = add i32 %count, -4
+  %niter375.ncmp.3.i = icmp eq i32 %count.next, 0
+  br i1 %niter375.ncmp.3.i, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+; CHECK-LABEL: topbottom_mul_8
+; CHECK-NOT: bitcast i16*
+define void @topbottom_mul_8(i32 %N, i32* noalias nocapture readnone %Out, i8* nocapture readonly %In1, i8* nocapture readonly %In2) {
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
+  %count = phi i32 [ %N, %entry ], [ %count.next, %for.body ]
+  %PIn1.0 = getelementptr inbounds i8, i8* %In1, i32 %iv
+  %In1.0 = load i8, i8* %PIn1.0, align 1
+  %SIn1.0 = sext i8 %In1.0 to i32
+  %PIn2.0 = getelementptr inbounds i8, i8* %In2, i32 %iv
+  %In2.0 = load i8, i8* %PIn2.0, align 1
+  %SIn2.0 = sext i8 %In2.0 to i32
+  %mul5.us.i.i = mul nsw i32 %SIn1.0, %SIn2.0
+  %Out.0 = getelementptr inbounds i32, i32* %Out, i32 %iv
+  store i32 %mul5.us.i.i, i32* %Out.0, align 4
+  %iv.1 = or i32 %iv, 1
+  %PIn1.1 = getelementptr inbounds i8, i8* %In1, i32 %iv.1
+  %In1.1 = load i8, i8* %PIn1.1, align 1
+  %SIn1.1 = sext i8 %In1.1 to i32
+  %PIn2.1 = getelementptr inbounds i8, i8* %In2, i32 %iv.1
+  %In2.1 = load i8, i8* %PIn2.1, align 1
+  %SIn2.1 = sext i8 %In2.1 to i32
+  %mul5.us.i.1.i = mul nsw i32 %SIn1.1, %SIn2.1
+  %Out.1 = getelementptr inbounds i32, i32* %Out, i32 %iv.1
+  store i32 %mul5.us.i.1.i, i32* %Out.1, align 4
+  %iv.2 = or i32 %iv, 2
+  %PIn1.2 = getelementptr inbounds i8, i8* %In1, i32 %iv.2
+  %In1.2 = load i8, i8* %PIn1.2, align 1
+  %SIn1.2 = sext i8 %In1.2 to i32
+  %PIn2.2 = getelementptr inbounds i8, i8* %In2, i32 %iv.2
+  %In2.2 = load i8, i8* %PIn2.2, align 1
+  %SIn2.2 = sext i8 %In2.2 to i32
+  %mul5.us.i.2.i = mul nsw i32 %SIn1.2, %SIn2.2
+  %Out.2 = getelementptr inbounds i32, i32* %Out, i32 %iv.2
+  store i32 %mul5.us.i.2.i, i32* %Out.2, align 4
+  %iv.3 = or i32 %iv, 3
+  %PIn1.3 = getelementptr inbounds i8, i8* %In1, i32 %iv.3
+  %In1.3 = load i8, i8* %PIn1.3, align 1
+  %SIn1.3 = sext i8 %In1.3 to i32
+  %PIn2.3 = getelementptr inbounds i8, i8* %In2, i32 %iv.3
+  %In2.3 = load i8, i8* %PIn2.3, align 1
+  %SIn2.3 = sext i8 %In2.3 to i32
+  %mul5.us.i.3.i = mul nsw i32 %SIn1.3, %SIn2.3
+  %Out.3 = getelementptr inbounds i32, i32* %Out, i32 %iv.3
+  store i32 %mul5.us.i.3.i, i32* %Out.3, align 4
+  %iv.next = add i32 %iv, 4
+  %count.next = add i32 %count, -4
+  %niter375.ncmp.3.i = icmp eq i32 %count.next, 0
+  br i1 %niter375.ncmp.3.i, label %exit, label %for.body
+
+exit:
+  ret void
+}
diff --git a/test/CodeGen/ARM/ParallelDSP/top-bottom-order.ll b/test/CodeGen/ARM/ParallelDSP/top-bottom-order.ll
new file mode 100644
index 00000000000..e78afc80f15
--- /dev/null
+++ b/test/CodeGen/ARM/ParallelDSP/top-bottom-order.ll
@@ -0,0 +1,54 @@
+; RUN: opt -mtriple=thumbv8m.main -mcpu=cortex-m33 -arm-parallel-dsp -S %s -o - | FileCheck %s
+; RUN: opt -mtriple=thumbv7a-linux-android -arm-parallel-dsp -S %s -o - | FileCheck %s
+
+; CHECK-LABEL: reorder_gep_arguments 
+; CHECK: [[Sub:%[^ ]+]] = xor i32 %iv, -1
+; CHECK: [[IdxPtr:%[^ ]+]] = getelementptr inbounds i16, i16* %arrayidx.us, i32 [[Sub]]
+; CHECK: [[IdxPtrCast:%[^ ]+]] = bitcast i16* [[IdxPtr]] to i32*
+; CHECK: [[Idx:%[^ ]+]] = load i32, i32* [[IdxPtrCast]], align 2
+; CHECK: [[Top:%[^ ]+]] = ashr i32 [[Idx]], 16
+; CHECK: [[Shl:%[^ ]+]] = shl i32 [[Idx]], 16
+; CHECK: [[Bottom:%[^ ]+]] = ashr i32 [[Shl]], 16
+; CHECK: [[BPtr:%[^ ]+]] = getelementptr inbounds i16, i16* %B, i32 %iv
+; CHECK: [[BData:%[^ ]+]] = load i16, i16* [[BPtr]], align 2
+; CHECK: [[BSext:%[^ ]+]] = sext i16 [[BData]] to i32
+; CHECK: [[Mul0:%[^ ]+]] = mul nsw i32 [[BSext]], [[Top]]
+; CHECK: [[BPtr1:%[^ ]+]] = getelementptr inbounds i16, i16* %B, i32 %add48.us
+; CHECK: [[BData1:%[^ ]+]] = load i16, i16* [[BPtr1]], align 2
+; CHECK: [[B1Sext:%[^ ]+]] = sext i16 [[BData1]] to i32
+; CHECK: [[Mul1:%[^ ]+]] = mul nsw i32 [[B1Sext]], [[Bottom]]
+
+define i32 @reorder_gep_arguments(i16* %B, i16* %arrayidx.us, i32 %d) {
+entry:
+  br label %for.body36.us
+
+for.body36.us:
+  %iv = phi i32 [ %add53.us, %for.body36.us ], [ 5, %entry ]
+  %out32_Q12.0114.us = phi i32 [ %add52.us, %for.body36.us ], [ 0, %entry ]
+  %sub37.us = sub nsw i32 0, %iv
+  %arrayidx38.us = getelementptr inbounds i16, i16* %arrayidx.us, i32 %sub37.us
+  %0 = load i16, i16* %arrayidx38.us, align 2
+  %conv39.us = sext i16 %0 to i32
+  %arrayidx40.us = getelementptr inbounds i16, i16* %B, i32 %iv
+  %1 = load i16, i16* %arrayidx40.us, align 2
+  %conv41.us = sext i16 %1 to i32
+  %mul42.us = mul nsw i32 %conv41.us, %conv39.us
+  %add43.us = add i32 %mul42.us, %out32_Q12.0114.us
+  %sub45.us = xor i32 %iv, -1
+  %arrayidx46.us = getelementptr inbounds i16, i16* %arrayidx.us, i32 %sub45.us
+  %2 = load i16, i16* %arrayidx46.us, align 2
+  %conv47.us = sext i16 %2 to i32
+  %add48.us = or i32 %iv, 1
+  %arrayidx49.us = getelementptr inbounds i16, i16* %B, i32 %add48.us
+  %3 = load i16, i16* %arrayidx49.us, align 2
+  %conv50.us = sext i16 %3 to i32
+  %mul51.us = mul nsw i32 %conv50.us, %conv47.us
+  %add52.us = add i32 %add43.us, %mul51.us
+  %add53.us = add nuw nsw i32 %iv, 2
+  %cmp34.us = icmp slt i32 %add53.us, %d
+  br i1 %cmp34.us, label %for.body36.us, label %exit
+
+exit:
+  ret i32 %add52.us
+}
+
diff --git a/test/CodeGen/ARM/ParallelDSP/top-bottom.ll b/test/CodeGen/ARM/ParallelDSP/top-bottom.ll
new file mode 100644
index 00000000000..e82a5d4e1c9
--- /dev/null
+++ b/test/CodeGen/ARM/ParallelDSP/top-bottom.ll
@@ -0,0 +1,252 @@
+; RUN: opt -mtriple=arm-arm-eabi -mcpu=cortex-m33 < %s -arm-parallel-dsp -S | FileCheck %s
+; RUN: opt -mtriple=thumbv7a-linux-android -arm-parallel-dsp -S %s -o - | FileCheck %s
+
+; CHECK-LABEL: topbottom_mul
+define void @topbottom_mul(i32 %N, i32* noalias nocapture readnone %Out, i16* nocapture readonly %In1, i16* nocapture readonly %In2) {
+entry:
+  br label %for.body
+
+; CHECK: for.body:
+; CHECK: [[Cast_PIn1_0:%[^ ]+]] = bitcast i16* %PIn1.0 to i32*
+; CHECK: [[PIn1_01:%[^ ]+]] = load i32, i32* [[Cast_PIn1_0]], align 2
+; CHECK: [[PIn1_1:%[^ ]+]] = ashr i32 [[PIn1_01]], 16
+; CHECK: [[PIn1_01_shl:%[^ ]+]] = shl i32 [[PIn1_01]], 16
+; CHECK: [[PIn1_0:%[^ ]+]] = ashr i32 [[PIn1_01_shl]], 16
+
+; CHECK: [[Cast_PIn2_0:%[^ ]+]] = bitcast i16* %PIn2.0 to i32*
+; CHECK: [[PIn2_01:%[^ ]+]] = load i32, i32* [[Cast_PIn2_0]], align 2
+; CHECK: [[PIn2_1:%[^ ]+]] = ashr i32 [[PIn2_01]], 16
+; CHECK: [[PIn2_01_shl:%[^ ]+]] = shl i32 [[PIn2_01]], 16
+; CHECK: [[PIn2_0:%[^ ]+]] = ashr i32 [[PIn2_01_shl]], 16
+
+; CHECK: mul nsw i32 [[PIn1_0]], [[PIn2_0]]
+; CHECK: mul nsw i32 [[PIn1_1]], [[PIn2_1]]
+
+; CHECK: [[Cast_PIn1_2:%[^ ]+]] = bitcast i16* %PIn1.2 to i32*
+; CHECK: [[PIn1_23:%[^ ]+]] = load i32, i32* [[Cast_PIn1_2]], align 2
+; CHECK: [[PIn1_3:%[^ ]+]] = ashr i32 [[PIn1_23]], 16
+; CHECK: [[PIn1_23_shl:%[^ ]+]] = shl i32 [[PIn1_23]], 16
+; CHECK: [[PIn1_2:%[^ ]+]] = ashr i32 [[PIn1_23_shl]], 16
+
+; CHECK: [[Cast_PIn2_2:%[^ ]+]] = bitcast i16* %PIn2.2 to i32*
+; CHECK: [[PIn2_23:%[^ ]+]] = load i32, i32* [[Cast_PIn2_2]], align 2
+; CHECK: [[PIn2_3:%[^ ]+]] = ashr i32 [[PIn2_23]], 16
+; CHECK: [[PIn2_23_shl:%[^ ]+]] = shl i32 [[PIn2_23]], 16
+; CHECK: [[PIn2_2:%[^ ]+]] = ashr i32 [[PIn2_23_shl]], 16
+
+; CHECK: mul nsw i32 [[PIn1_2]], [[PIn2_2]]
+; CHECK: mul nsw i32 [[PIn1_3]], [[PIn2_3]]
+
+for.body:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
+  %count = phi i32 [ %N, %entry ], [ %count.next, %for.body ]
+  %PIn1.0 = getelementptr inbounds i16, i16* %In1, i32 %iv
+  %In1.0 = load i16, i16* %PIn1.0, align 2
+  %SIn1.0 = sext i16 %In1.0 to i32
+  %PIn2.0 = getelementptr inbounds i16, i16* %In2, i32 %iv
+  %In2.0 = load i16, i16* %PIn2.0, align 2
+  %SIn2.0 = sext i16 %In2.0 to i32
+  %mul5.us.i.i = mul nsw i32 %SIn1.0, %SIn2.0
+  %Out.0 = getelementptr inbounds i32, i32* %Out, i32 %iv
+  store i32 %mul5.us.i.i, i32* %Out.0, align 4
+  %iv.1 = or i32 %iv, 1
+  %PIn1.1 = getelementptr inbounds i16, i16* %In1, i32 %iv.1
+  %In1.1 = load i16, i16* %PIn1.1, align 2
+  %SIn1.1 = sext i16 %In1.1 to i32
+  %PIn2.1 = getelementptr inbounds i16, i16* %In2, i32 %iv.1
+  %In2.1 = load i16, i16* %PIn2.1, align 2
+  %SIn2.1 = sext i16 %In2.1 to i32
+  %mul5.us.i.1.i = mul nsw i32 %SIn1.1, %SIn2.1
+  %Out.1 = getelementptr inbounds i32, i32* %Out, i32 %iv.1
+  store i32 %mul5.us.i.1.i, i32* %Out.1, align 4
+  %iv.2 = or i32 %iv, 2
+  %PIn1.2 = getelementptr inbounds i16, i16* %In1, i32 %iv.2
+  %In1.2 = load i16, i16* %PIn1.2, align 2
+  %SIn1.2 = sext i16 %In1.2 to i32
+  %PIn2.2 = getelementptr inbounds i16, i16* %In2, i32 %iv.2
+  %In2.2 = load i16, i16* %PIn2.2, align 2
+  %SIn2.2 = sext i16 %In2.2 to i32
+  %mul5.us.i.2.i = mul nsw i32 %SIn1.2, %SIn2.2
+  %Out.2 = getelementptr inbounds i32, i32* %Out, i32 %iv.2
+  store i32 %mul5.us.i.2.i, i32* %Out.2, align 4
+  %iv.3 = or i32 %iv, 3
+  %PIn1.3 = getelementptr inbounds i16, i16* %In1, i32 %iv.3
+  %In1.3 = load i16, i16* %PIn1.3, align 2
+  %SIn1.3 = sext i16 %In1.3 to i32
+  %PIn2.3 = getelementptr inbounds i16, i16* %In2, i32 %iv.3
+  %In2.3 = load i16, i16* %PIn2.3, align 2
+  %SIn2.3 = sext i16 %In2.3 to i32
+  %mul5.us.i.3.i = mul nsw i32 %SIn1.3, %SIn2.3
+  %Out.3 = getelementptr inbounds i32, i32* %Out, i32 %iv.3
+  store i32 %mul5.us.i.3.i, i32* %Out.3, align 4
+  %iv.next = add i32 %iv, 4
+  %count.next = add i32 %count, -4
+  %niter375.ncmp.3.i = icmp eq i32 %count.next, 0
+  br i1 %niter375.ncmp.3.i, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+; CHECK-LABEL: topbottom_mul_load_const
+define void @topbottom_mul_load_const(i32 %N, i32* noalias nocapture readnone %Out, i16* nocapture readonly %In, i16* %C) {
+entry:
+  %const = load i16, i16* %C
+  %conv4.i.i = sext i16 %const to i32
+  br label %for.body
+
+; CHECK: for.body:
+; CHECK: [[Cast_PIn_0:%[^ ]+]] = bitcast i16* %PIn.0 to i32*
+; CHECK: [[PIn_01:%[^ ]+]] = load i32, i32* [[Cast_PIn_0]], align 2
+; CHECK: [[PIn_1:%[^ ]+]] = ashr i32 [[PIn_01]], 16
+; CHECK: [[PIn_01_shl:%[^ ]+]] = shl i32 [[PIn_01]], 16
+; CHECK: [[PIn_0:%[^ ]+]] = ashr i32 [[PIn_01_shl]], 16
+
+; CHECK: mul nsw i32 [[PIn_0]], %conv4.i.i
+; CHECK: mul nsw i32 [[PIn_1]], %conv4.i.i
+
+; CHECK: [[Cast_PIn_2:%[^ ]+]] = bitcast i16* %PIn.2 to i32*
+; CHECK: [[PIn_23:%[^ ]+]] = load i32, i32* [[Cast_PIn_2]], align 2
+; CHECK: [[PIn_3:%[^ ]+]] = ashr i32 [[PIn_23]], 16
+; CHECK: [[PIn_23_shl:%[^ ]+]] = shl i32 [[PIn_23]], 16
+; CHECK: [[PIn_2:%[^ ]+]] = ashr i32 [[PIn_23_shl]], 16
+
+; CHECK: mul nsw i32 [[PIn_2]], %conv4.i.i
+; CHECK: mul nsw i32 [[PIn_3]], %conv4.i.i
+
+for.body:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
+  %count = phi i32 [ %N, %entry ], [ %count.next, %for.body ]
+  %PIn.0 = getelementptr inbounds i16, i16* %In, i32 %iv
+  %In.0 = load i16, i16* %PIn.0, align 2
+  %conv.us.i144.i = sext i16 %In.0 to i32
+  %mul5.us.i.i = mul nsw i32 %conv.us.i144.i, %conv4.i.i
+  %Out.0 = getelementptr inbounds i32, i32* %Out, i32 %iv
+  store i32 %mul5.us.i.i, i32* %Out.0, align 4
+  %iv.1 = or i32 %iv, 1
+  %PIn.1 = getelementptr inbounds i16, i16* %In, i32 %iv.1
+  %In.1 = load i16, i16* %PIn.1, align 2
+  %conv.us.i144.1.i = sext i16 %In.1 to i32
+  %mul5.us.i.1.i = mul nsw i32 %conv.us.i144.1.i, %conv4.i.i
+  %Out.1 = getelementptr inbounds i32, i32* %Out, i32 %iv.1
+  store i32 %mul5.us.i.1.i, i32* %Out.1, align 4
+  %iv.2 = or i32 %iv, 2
+  %PIn.2 = getelementptr inbounds i16, i16* %In, i32 %iv.2
+  %In.3 = load i16, i16* %PIn.2, align 2
+  %conv.us.i144.2.i = sext i16 %In.3 to i32
+  %mul5.us.i.2.i = mul nsw i32 %conv.us.i144.2.i, %conv4.i.i
+  %Out.2 = getelementptr inbounds i32, i32* %Out, i32 %iv.2
+  store i32 %mul5.us.i.2.i, i32* %Out.2, align 4
+  %iv.3 = or i32 %iv, 3
+  %PIn.3 = getelementptr inbounds i16, i16* %In, i32 %iv.3
+  %In.4 = load i16, i16* %PIn.3, align 2
+  %conv.us.i144.3.i = sext i16 %In.4 to i32
+  %mul5.us.i.3.i = mul nsw i32 %conv.us.i144.3.i, %conv4.i.i
+  %Out.3 = getelementptr inbounds i32, i32* %Out, i32 %iv.3
+  store i32 %mul5.us.i.3.i, i32* %Out.3, align 4
+  %iv.next = add i32 %iv, 4
+  %count.next = add i32 %count, -4
+  %niter375.ncmp.3.i = icmp eq i32 %count.next, 0
+  br i1 %niter375.ncmp.3.i, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+; CHECK-LABEL: topbottom_mul_64
+define void @topbottom_mul_64(i32 %N, i64* noalias nocapture readnone %Out, i16* nocapture readonly %In1, i16* nocapture readonly %In2) {
+entry:
+  br label %for.body
+
+; CHECK: for.body:
+; CHECK: [[Cast_PIn1_0:%[^ ]+]] = bitcast i16* %PIn1.0 to i32*
+; CHECK: [[PIn1_01:%[^ ]+]] = load i32, i32* [[Cast_PIn1_0]], align 2
+; CHECK: [[PIn1_1:%[^ ]+]] = ashr i32 [[PIn1_01]], 16
+; CHECK: [[PIn1_01_shl:%[^ ]+]] = shl i32 [[PIn1_01]], 16
+; CHECK: [[PIn1_0:%[^ ]+]] = ashr i32 [[PIn1_01_shl]], 16
+
+; CHECK: [[Cast_PIn2_0:%[^ ]+]] = bitcast i16* %PIn2.0 to i32*
+; CHECK: [[PIn2_01:%[^ ]+]] = load i32, i32* [[Cast_PIn2_0]], align 2
+; CHECK: [[PIn2_1:%[^ ]+]] = ashr i32 [[PIn2_01]], 16
+; CHECK: [[PIn2_01_shl:%[^ ]+]] = shl i32 [[PIn2_01]], 16
+; CHECK: [[PIn2_0:%[^ ]+]] = ashr i32 [[PIn2_01_shl]], 16
+
+; CHECK: [[Mul0:%[^ ]+]] = mul nsw i32 [[PIn1_0]], [[PIn2_0]]
+; CHECK: [[SMul0:%[^ ]+]] = sext i32 [[Mul0]] to i64
+; CHECK: [[Mul1:%[^ ]+]] = mul nsw i32 [[PIn1_1]], [[PIn2_1]]
+; CHECK: [[SMul1:%[^ ]+]] = sext i32 [[Mul1]] to i64
+; CHECK: add i64 [[SMul0]], [[SMul1]]
+
+; CHECK: [[Cast_PIn1_2:%[^ ]+]] = bitcast i16* %PIn1.2 to i32*
+; CHECK: [[PIn1_23:%[^ ]+]] = load i32, i32* [[Cast_PIn1_2]], align 2
+; CHECK: [[PIn1_3:%[^ ]+]] = ashr i32 [[PIn1_23]], 16
+; CHECK: [[PIn1_23_shl:%[^ ]+]] = shl i32 [[PIn1_23]], 16
+; CHECK: [[PIn1_2:%[^ ]+]] = ashr i32 [[PIn1_23_shl]], 16
+
+; CHECK: [[Cast_PIn2_2:%[^ ]+]] = bitcast i16* %PIn2.2 to i32*
+; CHECK: [[PIn2_23:%[^ ]+]] = load i32, i32* [[Cast_PIn2_2]], align 2
+; CHECK: [[PIn2_3:%[^ ]+]] = ashr i32 [[PIn2_23]], 16
+; CHECK: [[PIn2_23_shl:%[^ ]+]] = shl i32 [[PIn2_23]], 16
+; CHECK: [[PIn2_2:%[^ ]+]] = ashr i32 [[PIn2_23_shl]], 16
+
+; CHECK: [[Mul2:%[^ ]+]] = mul nsw i32 [[PIn1_2]], [[PIn2_2]]
+; CHECK: [[SMul2:%[^ ]+]] = sext i32 [[Mul2]] to i64
+; CHECK: [[Mul3:%[^ ]+]] = mul nsw i32 [[PIn1_3]], [[PIn2_3]]
+; CHECK: [[SMul3:%[^ ]+]] = sext i32 [[Mul3]] to i64
+; CHECK: add i64 [[SMul2]], [[SMul3]]
+
+for.body:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
+  %iv.out = phi i32 [ 0, %entry] , [ %iv.out.next, %for.body ]
+  %count = phi i32 [ %N, %entry ], [ %count.next, %for.body ]
+  %PIn1.0 = getelementptr inbounds i16, i16* %In1, i32 %iv
+  %In1.0 = load i16, i16* %PIn1.0, align 2
+  %SIn1.0 = sext i16 %In1.0 to i32
+  %PIn2.0 = getelementptr inbounds i16, i16* %In2, i32 %iv
+  %In2.0 = load i16, i16* %PIn2.0, align 2
+  %SIn2.0 = sext i16 %In2.0 to i32
+  %mul5.us.i.i = mul nsw i32 %SIn1.0, %SIn2.0
+  %sext.0 = sext i32 %mul5.us.i.i to i64
+  %iv.1 = or i32 %iv, 1
+  %PIn1.1 = getelementptr inbounds i16, i16* %In1, i32 %iv.1
+  %In1.1 = load i16, i16* %PIn1.1, align 2
+  %SIn1.1 = sext i16 %In1.1 to i32
+  %PIn2.1 = getelementptr inbounds i16, i16* %In2, i32 %iv.1
+  %In2.1 = load i16, i16* %PIn2.1, align 2
+  %SIn2.1 = sext i16 %In2.1 to i32
+  %mul5.us.i.1.i = mul nsw i32 %SIn1.1, %SIn2.1
+  %sext.1 = sext i32 %mul5.us.i.1.i to i64
+  %mac.0 = add i64 %sext.0, %sext.1
+  %Out.0 = getelementptr inbounds i64, i64* %Out, i32 %iv.out
+  store i64 %mac.0, i64* %Out.0, align 4
+  %iv.2 = or i32 %iv, 2
+  %PIn1.2 = getelementptr inbounds i16, i16* %In1, i32 %iv.2
+  %In1.2 = load i16, i16* %PIn1.2, align 2
+  %SIn1.2 = sext i16 %In1.2 to i32
+  %PIn2.2 = getelementptr inbounds i16, i16* %In2, i32 %iv.2
+  %In2.2 = load i16, i16* %PIn2.2, align 2
+  %SIn2.2 = sext i16 %In2.2 to i32
+  %mul5.us.i.2.i = mul nsw i32 %SIn1.2, %SIn2.2
+  %sext.2 = sext i32 %mul5.us.i.2.i to i64
+  %iv.3 = or i32 %iv, 3
+  %PIn1.3 = getelementptr inbounds i16, i16* %In1, i32 %iv.3
+  %In1.3 = load i16, i16* %PIn1.3, align 2
+  %SIn1.3 = sext i16 %In1.3 to i32
+  %PIn2.3 = getelementptr inbounds i16, i16* %In2, i32 %iv.3
+  %In2.3 = load i16, i16* %PIn2.3, align 2
+  %SIn2.3 = sext i16 %In2.3 to i32
+  %mul5.us.i.3.i = mul nsw i32 %SIn1.3, %SIn2.3
+  %sext.3 = sext i32 %mul5.us.i.3.i to i64
+  %mac.1 = add i64 %sext.2, %sext.3
+  %iv.out.1 = or i32 %iv.out, 1
+  %Out.1 = getelementptr inbounds i64, i64* %Out, i32 %iv.out.1
+  store i64 %mac.1, i64* %Out.1, align 4
+  %iv.next = add i32 %iv, 4
+  %iv.out.next = add i32 %iv.out, 2
+  %count.next = add i32 %count, -4
+  %niter375.ncmp.3.i = icmp eq i32 %count.next, 0
+  br i1 %niter375.ncmp.3.i, label %exit, label %for.body
+
+exit:
+  ret void
+}
-- 
GitLab


From 0a1aef0095570ff65451c9bae9507672583881cf Mon Sep 17 00:00:00 2001
From: Clement Courbet <courbet@google.com>
Date: Wed, 17 Oct 2018 15:04:15 +0000
Subject: [PATCH 0280/1116] [llvm-exegesis] Allow measuring several
 instructions in a single run.

Summary:
We try to recover gracefully on instructions that would crash the
program.

This includes some refactoring of runMeasurement() implementations.

Reviewers: gchatelet

Subscribers: tschuett, llvm-commits

Differential Revision: https://reviews.llvm.org/D53371

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344695 91177308-0d34-0410-b5e6-96231b3b80d8
---
 docs/CommandGuide/llvm-exegesis.rst         |  5 +-
 tools/llvm-exegesis/lib/BenchmarkRunner.cpp | 65 +++++++++++++++--
 tools/llvm-exegesis/lib/BenchmarkRunner.h   | 14 +++-
 tools/llvm-exegesis/lib/Latency.cpp         | 29 ++++----
 tools/llvm-exegesis/lib/Latency.h           |  5 +-
 tools/llvm-exegesis/lib/Uops.cpp            | 43 ++++-------
 tools/llvm-exegesis/lib/Uops.h              |  5 +-
 tools/llvm-exegesis/llvm-exegesis.cpp       | 80 +++++++++++++++------
 8 files changed, 160 insertions(+), 86 deletions(-)

diff --git a/docs/CommandGuide/llvm-exegesis.rst b/docs/CommandGuide/llvm-exegesis.rst
index 4181a998721..bf21563722f 100644
--- a/docs/CommandGuide/llvm-exegesis.rst
+++ b/docs/CommandGuide/llvm-exegesis.rst
@@ -175,9 +175,10 @@ OPTIONS
  Specify the opcode to measure, by index. See example 1 for details.
  Either `opcode-index`, `opcode-name` or `snippets-file` must be set.
 
-.. option:: -opcode-name=<LLVM opcode name>
+.. option:: -opcode-name=<opcode name 1>,<opcode name 2>,...
 
- Specify the opcode to measure, by name. See example 1 for details.
+ Specify the opcode to measure, by name. Several opcodes can be specified as
+ a comma-separated list. See example 1 for details.
  Either `opcode-index`, `opcode-name` or `snippets-file` must be set.
 
  .. option:: -snippets-file=<filename>
diff --git a/tools/llvm-exegesis/lib/BenchmarkRunner.cpp b/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
index 7addb0acd7e..4eb5f1e880c 100644
--- a/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
+++ b/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
@@ -13,9 +13,11 @@
 #include "Assembler.h"
 #include "BenchmarkRunner.h"
 #include "MCInstrDescView.h"
+#include "PerfHelper.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/Support/CrashRecoveryContext.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Program.h"
@@ -43,6 +45,54 @@ GenerateInstructions(const BenchmarkCode &BC, const size_t MinInstructions) {
   return Code;
 }
 
+namespace {
+class FunctionExecutorImpl : public BenchmarkRunner::FunctionExecutor {
+public:
+  FunctionExecutorImpl(const LLVMState &State,
+                       llvm::object::OwningBinary<llvm::object::ObjectFile> Obj,
+                       BenchmarkRunner::ScratchSpace *Scratch)
+      : Function(State.createTargetMachine(), std::move(Obj)),
+        Scratch(Scratch) {}
+
+private:
+  llvm::Expected<int64_t> runAndMeasure(const char *Counters) const override {
+    // We sum counts when there are several counters for a single ProcRes
+    // (e.g. P23 on SandyBridge).
+    int64_t CounterValue = 0;
+    llvm::SmallVector<llvm::StringRef, 2> CounterNames;
+    llvm::StringRef(Counters).split(CounterNames, ',');
+    char *const ScratchPtr = Scratch->ptr();
+    for (const auto &CounterName : CounterNames) {
+      pfm::PerfEvent PerfEvent(CounterName);
+      if (!PerfEvent.valid())
+        llvm::report_fatal_error(
+            llvm::Twine("invalid perf event ").concat(Counters));
+      pfm::Counter Counter(PerfEvent);
+      Scratch->clear();
+      {
+        llvm::CrashRecoveryContext CRC;
+        llvm::CrashRecoveryContext::Enable();
+        const bool Crashed = !CRC.RunSafely([this, &Counter, ScratchPtr]() {
+          Counter.start();
+          Function(ScratchPtr);
+          Counter.stop();
+        });
+        llvm::CrashRecoveryContext::Disable();
+        // FIXME: Better diagnosis.
+        if (Crashed)
+          return llvm::make_error<BenchmarkFailure>(
+              "snippet crashed while running");
+      }
+      CounterValue += Counter.read();
+    }
+    return CounterValue;
+  }
+
+  const ExecutableFunction Function;
+  BenchmarkRunner::ScratchSpace *const Scratch;
+};
+} // namespace
+
 InstructionBenchmark
 BenchmarkRunner::runConfiguration(const BenchmarkCode &BC,
                                   unsigned NumRepetitions) const {
@@ -86,16 +136,21 @@ BenchmarkRunner::runConfiguration(const BenchmarkCode &BC,
   }
   llvm::outs() << "Check generated assembly with: /usr/bin/objdump -d "
                << *ObjectFilePath << "\n";
-  const ExecutableFunction EF(State.createTargetMachine(),
-                              getObjectFromFile(*ObjectFilePath));
-  InstrBenchmark.Measurements = runMeasurements(EF, *Scratch);
+  const FunctionExecutorImpl Executor(State, getObjectFromFile(*ObjectFilePath),
+                                      Scratch.get());
+  auto Measurements = runMeasurements(Executor);
+  if (llvm::Error E = Measurements.takeError()) {
+    InstrBenchmark.Error = llvm::toString(std::move(E));
+    return InstrBenchmark;
+  }
+  InstrBenchmark.Measurements = std::move(*Measurements);
   assert(InstrBenchmark.NumRepetitions > 0 && "invalid NumRepetitions");
   for (BenchmarkMeasure &BM : InstrBenchmark.Measurements) {
     // Scale the measurements by instruction.
     BM.PerInstructionValue /= InstrBenchmark.NumRepetitions;
     // Scale the measurements by snippet.
     BM.PerSnippetValue *= static_cast<double>(BC.Instructions.size()) /
-                   InstrBenchmark.NumRepetitions;
+                          InstrBenchmark.NumRepetitions;
   }
 
   return InstrBenchmark;
@@ -115,4 +170,6 @@ BenchmarkRunner::writeObjectFile(const BenchmarkCode &BC,
   return ResultPath.str();
 }
 
+BenchmarkRunner::FunctionExecutor::~FunctionExecutor() {}
+
 } // namespace exegesis
diff --git a/tools/llvm-exegesis/lib/BenchmarkRunner.h b/tools/llvm-exegesis/lib/BenchmarkRunner.h
index e5b567f2463..46405898954 100644
--- a/tools/llvm-exegesis/lib/BenchmarkRunner.h
+++ b/tools/llvm-exegesis/lib/BenchmarkRunner.h
@@ -64,13 +64,21 @@ public:
     char *const AlignedPtr;
   };
 
+  // A helper to measure counters while executing a function in a sandboxed
+  // context.
+  class FunctionExecutor {
+  public:
+    ~FunctionExecutor();
+    virtual llvm::Expected<int64_t>
+    runAndMeasure(const char *Counters) const = 0;
+  };
+
 protected:
   const LLVMState &State;
 
 private:
-  virtual std::vector<BenchmarkMeasure>
-  runMeasurements(const ExecutableFunction &EF,
-                  ScratchSpace &Scratch) const = 0;
+  virtual llvm::Expected<std::vector<BenchmarkMeasure>>
+  runMeasurements(const FunctionExecutor &Executor) const = 0;
 
   llvm::Expected<std::string>
   writeObjectFile(const BenchmarkCode &Configuration,
diff --git a/tools/llvm-exegesis/lib/Latency.cpp b/tools/llvm-exegesis/lib/Latency.cpp
index 0dd84cc0958..7d68d60c48b 100644
--- a/tools/llvm-exegesis/lib/Latency.cpp
+++ b/tools/llvm-exegesis/lib/Latency.cpp
@@ -12,7 +12,6 @@
 #include "Assembler.h"
 #include "BenchmarkRunner.h"
 #include "MCInstrDescView.h"
-#include "PerfHelper.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstBuilder.h"
@@ -178,30 +177,26 @@ const char *LatencyBenchmarkRunner::getCounterName() const {
 
 LatencyBenchmarkRunner::~LatencyBenchmarkRunner() = default;
 
-std::vector<BenchmarkMeasure>
-LatencyBenchmarkRunner::runMeasurements(const ExecutableFunction &Function,
-                                        ScratchSpace &Scratch) const {
+llvm::Expected<std::vector<BenchmarkMeasure>>
+LatencyBenchmarkRunner::runMeasurements(
+    const FunctionExecutor &Executor) const {
   // Cycle measurements include some overhead from the kernel. Repeat the
   // measure several times and take the minimum value.
   constexpr const int NumMeasurements = 30;
-  int64_t MinLatency = std::numeric_limits<int64_t>::max();
+  int64_t MinValue = std::numeric_limits<int64_t>::max();
   const char *CounterName = getCounterName();
   if (!CounterName)
     llvm::report_fatal_error("could not determine cycle counter name");
-  const pfm::PerfEvent CyclesPerfEvent(CounterName);
-  if (!CyclesPerfEvent.valid())
-    llvm::report_fatal_error("invalid perf event");
   for (size_t I = 0; I < NumMeasurements; ++I) {
-    pfm::Counter Counter(CyclesPerfEvent);
-    Scratch.clear();
-    Counter.start();
-    Function(Scratch.ptr());
-    Counter.stop();
-    const int64_t Value = Counter.read();
-    if (Value < MinLatency)
-      MinLatency = Value;
+    auto ExpectedCounterValue = Executor.runAndMeasure(CounterName);
+    if (!ExpectedCounterValue)
+      return ExpectedCounterValue.takeError();
+    if (*ExpectedCounterValue < MinValue)
+      MinValue = *ExpectedCounterValue;
   }
-  return {BenchmarkMeasure::Create("latency", MinLatency)};
+  std::vector<BenchmarkMeasure> Result = {
+      BenchmarkMeasure::Create("latency", MinValue)};
+  return std::move(Result);
 }
 
 } // namespace exegesis
diff --git a/tools/llvm-exegesis/lib/Latency.h b/tools/llvm-exegesis/lib/Latency.h
index f78f12615c7..cb55f340a5a 100644
--- a/tools/llvm-exegesis/lib/Latency.h
+++ b/tools/llvm-exegesis/lib/Latency.h
@@ -37,9 +37,8 @@ public:
   ~LatencyBenchmarkRunner() override;
 
 private:
-  std::vector<BenchmarkMeasure>
-  runMeasurements(const ExecutableFunction &EF,
-                  ScratchSpace &Scratch) const override;
+  llvm::Expected<std::vector<BenchmarkMeasure>>
+  runMeasurements(const FunctionExecutor &Executor) const override;
 
   virtual const char *getCounterName() const;
 };
diff --git a/tools/llvm-exegesis/lib/Uops.cpp b/tools/llvm-exegesis/lib/Uops.cpp
index d8065adbdb2..50be707feb2 100644
--- a/tools/llvm-exegesis/lib/Uops.cpp
+++ b/tools/llvm-exegesis/lib/Uops.cpp
@@ -12,7 +12,6 @@
 #include "Assembler.h"
 #include "BenchmarkRunner.h"
 #include "MCInstrDescView.h"
-#include "PerfHelper.h"
 #include "Target.h"
 
 // FIXME: Load constants into registers (e.g. with fld1) to not break
@@ -221,33 +220,10 @@ UopsSnippetGenerator::generateCodeTemplates(const Instruction &Instr) const {
   return getSingleton(std::move(CT));
 }
 
-std::vector<BenchmarkMeasure>
-UopsBenchmarkRunner::runMeasurements(const ExecutableFunction &Function,
-                                     ScratchSpace &Scratch) const {
+llvm::Expected<std::vector<BenchmarkMeasure>>
+UopsBenchmarkRunner::runMeasurements(const FunctionExecutor &Executor) const {
   const auto &SchedModel = State.getSubtargetInfo().getSchedModel();
 
-  const auto RunMeasurement = [&Function,
-                               &Scratch](const char *const Counters) {
-    // We sum counts when there are several counters for a single ProcRes
-    // (e.g. P23 on SandyBridge).
-    int64_t CounterValue = 0;
-    llvm::SmallVector<llvm::StringRef, 2> CounterNames;
-    llvm::StringRef(Counters).split(CounterNames, ',');
-    for (const auto &CounterName : CounterNames) {
-      pfm::PerfEvent UopPerfEvent(CounterName);
-      if (!UopPerfEvent.valid())
-        llvm::report_fatal_error(
-            llvm::Twine("invalid perf event ").concat(Counters));
-      pfm::Counter Counter(UopPerfEvent);
-      Scratch.clear();
-      Counter.start();
-      Function(Scratch.ptr());
-      Counter.stop();
-      CounterValue += Counter.read();
-    }
-    return CounterValue;
-  };
-
   std::vector<BenchmarkMeasure> Result;
   const auto &PfmCounters = SchedModel.getExtraProcessorInfo().PfmCounters;
   // Uops per port.
@@ -256,16 +232,21 @@ UopsBenchmarkRunner::runMeasurements(const ExecutableFunction &Function,
     const char *const Counters = PfmCounters.IssueCounters[ProcResIdx];
     if (!Counters)
       continue;
-    const double CounterValue = RunMeasurement(Counters);
+    auto ExpectedCounterValue = Executor.runAndMeasure(Counters);
+    if (!ExpectedCounterValue)
+      return ExpectedCounterValue.takeError();
     Result.push_back(BenchmarkMeasure::Create(
-        SchedModel.getProcResource(ProcResIdx)->Name, CounterValue));
+        SchedModel.getProcResource(ProcResIdx)->Name, *ExpectedCounterValue));
   }
   // NumMicroOps.
   if (const char *const UopsCounter = PfmCounters.UopsCounter) {
-    const double CounterValue = RunMeasurement(UopsCounter);
-    Result.push_back(BenchmarkMeasure::Create("NumMicroOps", CounterValue));
+    auto ExpectedCounterValue = Executor.runAndMeasure(UopsCounter);
+    if (!ExpectedCounterValue)
+      return ExpectedCounterValue.takeError();
+    Result.push_back(
+        BenchmarkMeasure::Create("NumMicroOps", *ExpectedCounterValue));
   }
-  return Result;
+  return std::move(Result);
 }
 
 constexpr const size_t UopsSnippetGenerator::kMinNumDifferentAddresses;
diff --git a/tools/llvm-exegesis/lib/Uops.h b/tools/llvm-exegesis/lib/Uops.h
index e6f6d4a09cb..f75f2edd552 100644
--- a/tools/llvm-exegesis/lib/Uops.h
+++ b/tools/llvm-exegesis/lib/Uops.h
@@ -68,9 +68,8 @@ public:
   static constexpr const size_t kMinNumDifferentAddresses = 6;
 
 private:
-  std::vector<BenchmarkMeasure>
-  runMeasurements(const ExecutableFunction &EF,
-                  ScratchSpace &Scratch) const override;
+  llvm::Expected<std::vector<BenchmarkMeasure>>
+  runMeasurements(const FunctionExecutor &Executor) const override;
 };
 
 } // namespace exegesis
diff --git a/tools/llvm-exegesis/llvm-exegesis.cpp b/tools/llvm-exegesis/llvm-exegesis.cpp
index b4891f1f1db..39044d48b4b 100644
--- a/tools/llvm-exegesis/llvm-exegesis.cpp
+++ b/tools/llvm-exegesis/llvm-exegesis.cpp
@@ -38,13 +38,14 @@
 #include <algorithm>
 #include <string>
 
-static llvm::cl::opt<unsigned>
+static llvm::cl::opt<int>
     OpcodeIndex("opcode-index", llvm::cl::desc("opcode to measure, by index"),
                 llvm::cl::init(0));
 
-static llvm::cl::opt<std::string>
-    OpcodeName("opcode-name", llvm::cl::desc("opcode to measure, by name"),
-               llvm::cl::init(""));
+static llvm::cl::opt<std::string> OpcodeNames(
+    "opcode-name",
+    llvm::cl::desc("comma-separated list of opcodes to measure, by name"),
+    llvm::cl::init(""));
 
 static llvm::cl::opt<std::string>
     SnippetsFile("snippets-file", llvm::cl::desc("code snippets to measure"),
@@ -99,11 +100,12 @@ static llvm::ExitOnError ExitOnErr;
 void LLVM_EXEGESIS_INITIALIZE_NATIVE_TARGET();
 #endif
 
-// Checks that only one of OpcodeName, OpcodeIndex or SnippetsFile is provided,
-// and returns the opcode index or 0 if snippets should be read from
+// Checks that only one of OpcodeNames, OpcodeIndex or SnippetsFile is provided,
+// and returns the opcode indices or {} if snippets should be read from
 // `SnippetsFile`.
-static unsigned getOpcodeOrDie(const llvm::MCInstrInfo &MCInstrInfo) {
-  const size_t NumSetFlags = (OpcodeName.empty() ? 0 : 1) +
+static std::vector<unsigned>
+getOpcodesOrDie(const llvm::MCInstrInfo &MCInstrInfo) {
+  const size_t NumSetFlags = (OpcodeNames.empty() ? 0 : 1) +
                              (OpcodeIndex == 0 ? 0 : 1) +
                              (SnippetsFile.empty() ? 0 : 1);
   if (NumSetFlags != 1)
@@ -111,14 +113,35 @@ static unsigned getOpcodeOrDie(const llvm::MCInstrInfo &MCInstrInfo) {
         "please provide one and only one of 'opcode-index', 'opcode-name' or "
         "'snippets-file'");
   if (!SnippetsFile.empty())
-    return 0;
+    return {};
   if (OpcodeIndex > 0)
-    return OpcodeIndex;
+    return {static_cast<unsigned>(OpcodeIndex)};
+  if (OpcodeIndex < 0) {
+    std::vector<unsigned> Result;
+    for (unsigned I = 1, E = MCInstrInfo.getNumOpcodes(); I <= E; ++I)
+      Result.push_back(I);
+    return Result;
+  }
   // Resolve opcode name -> opcode.
-  for (unsigned I = 0, E = MCInstrInfo.getNumOpcodes(); I < E; ++I)
-    if (MCInstrInfo.getName(I) == OpcodeName)
-      return I;
-  llvm::report_fatal_error(llvm::Twine("unknown opcode ").concat(OpcodeName));
+  const auto ResolveName =
+      [&MCInstrInfo](llvm::StringRef OpcodeName) -> unsigned {
+    for (unsigned I = 1, E = MCInstrInfo.getNumOpcodes(); I < E; ++I)
+      if (MCInstrInfo.getName(I) == OpcodeName)
+        return I;
+    return 0u;
+  };
+  llvm::SmallVector<llvm::StringRef, 2> Pieces;
+  llvm::StringRef(OpcodeNames.getValue())
+      .split(Pieces, ",", /* MaxSplit */ -1, /* KeepEmpty */ false);
+  std::vector<unsigned> Result;
+  for (const llvm::StringRef OpcodeName : Pieces) {
+    if (unsigned Opcode = ResolveName(OpcodeName))
+      Result.push_back(Opcode);
+    else
+      llvm::report_fatal_error(
+          llvm::Twine("unknown opcode ").concat(OpcodeName));
+  }
+  return Result;
 }
 
 // Generates code snippets for opcode `Opcode`.
@@ -299,18 +322,29 @@ void benchmarkMain() {
 #endif
 
   const LLVMState State;
-  const auto Opcode = getOpcodeOrDie(State.getInstrInfo());
+  const auto Opcodes = getOpcodesOrDie(State.getInstrInfo());
 
   std::vector<BenchmarkCode> Configurations;
-  if (Opcode > 0) {
-    // Ignore instructions without a sched class if -ignore-invalid-sched-class
-    // is passed.
-    if (IgnoreInvalidSchedClass &&
-        State.getInstrInfo().get(Opcode).getSchedClass() == 0) {
-      llvm::errs() << "ignoring instruction without sched class\n";
-      return;
+  if (!Opcodes.empty()) {
+    for (const unsigned Opcode : Opcodes) {
+      // Ignore instructions without a sched class if
+      // -ignore-invalid-sched-class is passed.
+      if (IgnoreInvalidSchedClass &&
+          State.getInstrInfo().get(Opcode).getSchedClass() == 0) {
+        llvm::errs() << State.getInstrInfo().getName(Opcode)
+                     << ": ignoring instruction without sched class\n";
+        continue;
+      }
+      auto ConfigsForInstr = generateSnippets(State, Opcode);
+      if (!ConfigsForInstr) {
+        llvm::logAllUnhandledErrors(
+            ConfigsForInstr.takeError(), llvm::errs(),
+            llvm::Twine(State.getInstrInfo().getName(Opcode)).concat(": "));
+        continue;
+      }
+      std::move(ConfigsForInstr->begin(), ConfigsForInstr->end(),
+                std::back_inserter(Configurations));
     }
-    Configurations = ExitOnErr(generateSnippets(State, Opcode));
   } else {
     Configurations = ExitOnErr(readSnippets(State, SnippetsFile));
   }
-- 
GitLab


From cc436fd26637b0629b95fd8e60fde61cec4b421f Mon Sep 17 00:00:00 2001
From: Nicolai Haehnle <nhaehnle@gmail.com>
Date: Wed, 17 Oct 2018 15:37:30 +0000
Subject: [PATCH 0281/1116] AMDGPU: Divergence-driven selection of scalar
 buffer load intrinsics

Summary:
Moving SMRD to VMEM in SIFixSGPRCopies is rather bad for performance if
the load is really uniform. So select the scalar load intrinsics directly
to either VMEM or SMRD buffer loads based on divergence analysis.

If an offset happens to end up in a VGPR -- either because a floating
point calculation was involved, or due to other remaining deficiencies
in SIFixSGPRCopies -- we use v_readfirstlane.

There is some unrelated churn in tests since we now select MUBUF offsets
in a unified way with non-scalar buffer loads.

Change-Id: I170e6816323beb1348677b358c9d380865cd1a19

Reviewers: arsenm, alex-t, rampitec, tpr

Subscribers: kzhuravl, jvesely, wdng, yaxunl, dstuttard, t-tye, llvm-commits

Differential Revision: https://reviews.llvm.org/D53283

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344696 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/SIISelLowering.cpp       | 107 ++++++++----
 lib/Target/AMDGPU/SIISelLowering.h         |   4 +-
 lib/Target/AMDGPU/SIInstrInfo.cpp          | 185 +--------------------
 lib/Target/AMDGPU/SIInstrInfo.h            |   2 -
 lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp |   7 +-
 lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h   |   5 +-
 test/CodeGen/AMDGPU/smrd-fold-offset.mir   |   8 +-
 test/CodeGen/AMDGPU/smrd.ll                |  67 +++++++-
 8 files changed, 157 insertions(+), 228 deletions(-)

diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 06223287396..c2cf30763c2 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4796,6 +4796,70 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
   return SDValue(NewNode, 0);
 }
 
+SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
+                                       SDValue Offset, SDValue GLC,
+                                       SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo(),
+      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+          MachineMemOperand::MOInvariant,
+      VT.getStoreSize(), VT.getStoreSize());
+
+  if (!Offset->isDivergent()) {
+    SDValue Ops[] = {
+        Rsrc,
+        Offset, // Offset
+        GLC     // glc
+    };
+    return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
+                                   DAG.getVTList(VT), Ops, VT, MMO);
+  }
+
+  // We have a divergent offset. Emit a MUBUF buffer load instead. We can
+  // assume that the buffer is unswizzled.
+  SmallVector<SDValue, 4> Loads;
+  unsigned NumLoads = 1;
+  MVT LoadVT = VT.getSimpleVT();
+
+  assert(LoadVT == MVT::i32 || LoadVT == MVT::v2i32 || LoadVT == MVT::v4i32 ||
+         LoadVT == MVT::v8i32 || LoadVT == MVT::v16i32);
+
+  if (VT == MVT::v8i32 || VT == MVT::v16i32) {
+    NumLoads = VT == MVT::v16i32 ? 4 : 2;
+    LoadVT = MVT::v4i32;
+  }
+
+  SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
+  unsigned CachePolicy = cast<ConstantSDNode>(GLC)->getZExtValue();
+  SDValue Ops[] = {
+      DAG.getEntryNode(),                         // Chain
+      Rsrc,                                       // rsrc
+      DAG.getConstant(0, DL, MVT::i32),           // vindex
+      {},                                         // voffset
+      {},                                         // soffset
+      {},                                         // offset
+      DAG.getConstant(CachePolicy, DL, MVT::i32), // cachepolicy
+      DAG.getConstant(0, DL, MVT::i1),            // idxen
+  };
+
+  // Use the alignment to ensure that the required offsets will fit into the
+  // immediate offsets.
+  setBufferOffsets(Offset, DAG, &Ops[3], NumLoads > 1 ? 16 * NumLoads : 4);
+
+  uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue();
+  for (unsigned i = 0; i < NumLoads; ++i) {
+    Ops[5] = DAG.getConstant(InstOffset + 16 * i, DL, MVT::i32);
+    Loads.push_back(DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList,
+                                            Ops, LoadVT, MMO));
+  }
+
+  if (VT == MVT::v8i32 || VT == MVT::v16i32)
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
+
+  return Loads[0];
+}
+
 SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                   SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
@@ -4951,38 +5015,15 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                           SDLoc(DAG.getEntryNode()),
                           MFI->getArgInfo().WorkItemIDZ);
   case AMDGPUIntrinsic::SI_load_const: {
-    SDValue Ops[] = {
-      Op.getOperand(1),   // Ptr
-      Op.getOperand(2),   // Offset
-      DAG.getTargetConstant(0, DL, MVT::i1) // glc
-    };
-
-    MachineMemOperand *MMO = MF.getMachineMemOperand(
-        MachinePointerInfo(),
-        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
-            MachineMemOperand::MOInvariant,
-        VT.getStoreSize(), 4);
-    SDVTList VTList = DAG.getVTList(MVT::i32);
-    SDValue Load = DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
-                                           VTList, Ops, MVT::i32, MMO);
-
+    SDValue Load =
+        lowerSBuffer(MVT::i32, DL, Op.getOperand(1), Op.getOperand(2),
+                     DAG.getTargetConstant(0, DL, MVT::i1), DAG);
     return DAG.getNode(ISD::BITCAST, DL, MVT::f32, Load);
   }
   case Intrinsic::amdgcn_s_buffer_load: {
     unsigned Cache = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
-    SDValue Ops[] = {
-      Op.getOperand(1), // Ptr
-      Op.getOperand(2), // Offset
-      DAG.getTargetConstant(Cache & 1, DL, MVT::i1) // glc
-    };
-
-    MachineMemOperand *MMO = MF.getMachineMemOperand(
-        MachinePointerInfo(),
-        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
-            MachineMemOperand::MOInvariant,
-        VT.getStoreSize(), VT.getStoreSize());
-    return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
-                                   Op->getVTList(), Ops, VT, MMO);
+    return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
+                        DAG.getTargetConstant(Cache & 1, DL, MVT::i1), DAG);
   }
   case Intrinsic::amdgcn_fdiv_fast:
     return lowerFDIV_FAST(Op, DAG);
@@ -6017,13 +6058,13 @@ std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
 // three offsets (voffset, soffset and instoffset) into the SDValue[3] array
 // pointed to by Offsets.
 void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
-                                        SelectionDAG &DAG,
-                                        SDValue *Offsets) const {
+                                        SelectionDAG &DAG, SDValue *Offsets,
+                                        unsigned Align) const {
   SDLoc DL(CombinedOffset);
   if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
     uint32_t Imm = C->getZExtValue();
     uint32_t SOffset, ImmOffset;
-    if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget)) {
+    if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, Align)) {
       Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
       Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
       Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
@@ -6035,8 +6076,8 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
     SDValue N1 = CombinedOffset.getOperand(1);
     uint32_t SOffset, ImmOffset;
     int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
-    if (Offset >= 0
-        && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset, Subtarget)) {
+    if (Offset >= 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
+                                                Subtarget, Align)) {
       Offsets[0] = N0;
       Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
       Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index 6c02688483b..1b0cb06a9b0 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -60,6 +60,8 @@ private:
                                  MVT VT, unsigned Offset) const;
   SDValue lowerImage(SDValue Op, const AMDGPU::ImageDimIntrinsicInfo *Intr,
                      SelectionDAG &DAG) const;
+  SDValue lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, SDValue Offset,
+                       SDValue GLC, SelectionDAG &DAG) const;
 
   SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
@@ -190,7 +192,7 @@ private:
   // three offsets (voffset, soffset and instoffset) into the SDValue[3] array
   // pointed to by Offsets.
   void setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG,
-                        SDValue *Offsets) const;
+                        SDValue *Offsets, unsigned Align = 4) const;
 
 public:
   SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI);
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index 61a0030aea2..aa79ad8b9b3 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3576,8 +3576,13 @@ void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
   // pointer value is uniform.
   MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
   if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
-      unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
-      SBase->setReg(SGPR);
+    unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
+    SBase->setReg(SGPR);
+  }
+  MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff);
+  if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
+    unsigned SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
+    SOff->setReg(SGPR);
   }
 }
 
@@ -4206,115 +4211,6 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
       Inst.eraseFromParent();
       continue;
-
-    case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
-    case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
-    case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
-    case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
-    case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR: {
-      unsigned VDst;
-      unsigned NewOpcode;
-
-      switch(Opcode) {
-      case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
-        NewOpcode = AMDGPU::BUFFER_LOAD_DWORD_OFFEN;
-        VDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-        break;
-      case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
-        NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
-        VDst = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
-        break;
-      case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
-        NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
-        VDst = MRI.createVirtualRegister(&AMDGPU::VReg_128RegClass);
-        break;
-      case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
-      case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR:
-        splitScalarBuffer(Worklist, Inst);
-        Inst.eraseFromParent();
-        continue;
-      }
-
-      const MachineOperand *VAddr = getNamedOperand(Inst, AMDGPU::OpName::soff);
-      auto Add = MRI.getUniqueVRegDef(VAddr->getReg());
-      unsigned Offset = 0;
-
-      // FIXME: This isn't safe because the addressing mode doesn't work
-      // correctly if vaddr is negative.
-      //
-      // FIXME: Should probably be done somewhere else, maybe SIFoldOperands.
-      //
-      // See if we can extract an immediate offset by recognizing one of these:
-      //   V_ADD_I32_e32 dst, imm, src1
-      //   V_ADD_I32_e32 dst, (S_MOV_B32 imm), src1
-      // V_ADD will be removed by "Remove dead machine instructions".
-      if (Add &&
-          (Add->getOpcode() == AMDGPU::V_ADD_I32_e32 ||
-           Add->getOpcode() == AMDGPU::V_ADD_U32_e32 ||
-           Add->getOpcode() == AMDGPU::V_ADD_U32_e64)) {
-        static const unsigned SrcNames[2] = {
-          AMDGPU::OpName::src0,
-          AMDGPU::OpName::src1,
-        };
-
-        // Find a literal offset in one of source operands.
-        for (int i = 0; i < 2; i++) {
-          const MachineOperand *Src =
-            getNamedOperand(*Add, SrcNames[i]);
-
-          if (Src->isReg()) {
-            MachineInstr *Def = MRI.getUniqueVRegDef(Src->getReg());
-            if (Def) {
-              if (Def->isMoveImmediate())
-                Src = &Def->getOperand(1);
-              else if (Def->isCopy()) {
-                auto Mov = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
-                if (Mov && Mov->isMoveImmediate()) {
-                  Src = &Mov->getOperand(1);
-                }
-              }
-            }
-          }
-
-          if (Src) {
-            if (Src->isImm())
-              Offset = Src->getImm();
-            else if (Src->isCImm())
-              Offset = Src->getCImm()->getZExtValue();
-          }
-
-          if (Offset && isLegalMUBUFImmOffset(Offset)) {
-            VAddr = getNamedOperand(*Add, SrcNames[!i]);
-            break;
-          }
-
-          Offset = 0;
-        }
-      }
-
-      MachineInstr *NewInstr =
-          BuildMI(*MBB, Inst, Inst.getDebugLoc(),
-                  get(NewOpcode), VDst)
-              .add(*VAddr)                                        // vaddr
-              .add(*getNamedOperand(Inst, AMDGPU::OpName::sbase)) // srsrc
-              .addImm(0)                                          // soffset
-              .addImm(Offset)                                     // offset
-              .addImm(getNamedOperand(Inst, AMDGPU::OpName::glc)->getImm())
-              .addImm(0) // slc
-              .addImm(0) // tfe
-              .cloneMemRefs(Inst)
-              .getInstr();
-
-      MRI.replaceRegWith(getNamedOperand(Inst, AMDGPU::OpName::sdst)->getReg(),
-                         VDst);
-      addUsersToMoveToVALUWorklist(VDst, MRI, Worklist);
-      Inst.eraseFromParent();
-
-      // Legalize all operands other than the offset. Notably, convert the srsrc
-      // into SGPRs using v_readfirstlane if needed.
-      legalizeOperands(*NewInstr, MDT);
-      continue;
-    }
     }
 
     if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
@@ -4796,73 +4692,6 @@ void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
 }
 
-void SIInstrInfo::splitScalarBuffer(SetVectorType &Worklist,
-                                    MachineInstr &Inst) const {
-  MachineBasicBlock &MBB = *Inst.getParent();
-  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
-
-  MachineBasicBlock::iterator MII = Inst;
-  auto &DL = Inst.getDebugLoc();
-
-  MachineOperand &Dest = *getNamedOperand(Inst, AMDGPU::OpName::sdst);;
-  MachineOperand &Rsrc = *getNamedOperand(Inst, AMDGPU::OpName::sbase);
-  MachineOperand &Offset = *getNamedOperand(Inst, AMDGPU::OpName::soff);
-  MachineOperand &Glc = *getNamedOperand(Inst, AMDGPU::OpName::glc);
-
-  unsigned Opcode = Inst.getOpcode();
-  unsigned NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
-  unsigned Count = 0;
-  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
-  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
-
-  switch(Opcode) {
-  default:
-    return;
-  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
-    Count = 2;
-    break;
-  case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR:
-    Count = 4;
-    break;
-  }
-
-  // FIXME: Should also attempt to build VAddr and Offset like the non-split
-  // case (see call site for this function)
-
-  // Create a vector of result registers
-  SmallVector<unsigned, 8> ResultRegs;
-  for (unsigned i = 0; i < Count ; ++i) {
-    unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_128RegClass);
-    MachineInstr &NewMI = *BuildMI(MBB, MII, DL, get(NewOpcode), ResultReg)
-      .addReg(Offset.getReg())  // offset
-      .addReg(Rsrc.getReg())    // rsrc
-      .addImm(0)                // soffset
-      .addImm(i << 4)           // inst_offset
-      .addImm(Glc.getImm())     // glc
-      .addImm(0)                // slc
-      .addImm(0)                // tfe
-      .addMemOperand(*Inst.memoperands_begin());
-    // Extract the 4 32 bit sub-registers from the result to add into the final REG_SEQUENCE
-    auto &NewDestOp = NewMI.getOperand(0);
-    for (unsigned i = 0 ; i < 4 ; i++)
-      ResultRegs.push_back(buildExtractSubReg(MII, MRI, NewDestOp, &AMDGPU::VReg_128RegClass,
-                                              RI.getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass));
-  }
-  // Create a new combined result to replace original with
-  unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
-  MachineInstrBuilder CombinedResBuilder = BuildMI(MBB, MII, DL,
-                                  get(TargetOpcode::REG_SEQUENCE), FullDestReg);
-
-  for (unsigned i = 0 ; i < Count * 4 ; ++i) {
-    CombinedResBuilder
-      .addReg(ResultRegs[i])
-      .addImm(RI.getSubRegFromChannel(i));
-  }
-
-  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
-  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
-}
-
 void SIInstrInfo::addUsersToMoveToVALUWorklist(
   unsigned DstReg,
   MachineRegisterInfo &MRI,
diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h
index 2f51b199950..34cac88cbf1 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@@ -103,8 +103,6 @@ private:
                             MachineInstr &Inst) const;
   void splitScalar64BitBFE(SetVectorType &Worklist,
                            MachineInstr &Inst) const;
-  void splitScalarBuffer(SetVectorType &Worklist,
-                         MachineInstr &Inst) const;
   void movePackToVALU(SetVectorType &Worklist,
                       MachineRegisterInfo &MRI,
                       MachineInstr &Inst) const;
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index b242345c52a..00e9ff7abfd 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -888,9 +888,12 @@ bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) {
 // Given Imm, split it into the values to put into the SOffset and ImmOffset
 // fields in an MUBUF instruction. Return false if it is not possible (due to a
 // hardware bug needing a workaround).
+//
+// The required alignment ensures that individual address components remain
+// aligned if they are aligned to begin with. It also ensures that additional
+// offsets within the given alignment can be added to the resulting ImmOffset.
 bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
-                      const GCNSubtarget *Subtarget) {
-  const uint32_t Align = 4;
+                      const GCNSubtarget *Subtarget, uint32_t Align) {
   const uint32_t MaxImm = alignDown(4095, Align);
   uint32_t Overflow = 0;
 
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index da004a6a841..699b17061d7 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -440,11 +440,8 @@ int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
 /// not the encoded offset.
 bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
 
-// Given Imm, split it into the values to put into the SOffset and ImmOffset
-// fields in an MUBUF instruction. Return false if it is not possible (due to a
-// hardware bug needing a workaround).
 bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
-                      const GCNSubtarget *Subtarget);
+                      const GCNSubtarget *Subtarget, uint32_t Align = 4);
 
 /// \returns true if the intrinsic is divergent
 bool isIntrinsicSourceOfDivergence(unsigned IntrID);
diff --git a/test/CodeGen/AMDGPU/smrd-fold-offset.mir b/test/CodeGen/AMDGPU/smrd-fold-offset.mir
index 44954f06523..10601ccaeb7 100644
--- a/test/CodeGen/AMDGPU/smrd-fold-offset.mir
+++ b/test/CodeGen/AMDGPU/smrd-fold-offset.mir
@@ -1,6 +1,8 @@
 # RUN: llc -march=amdgcn -run-pass si-fix-sgpr-copies -o - %s | FileCheck -check-prefix=GCN %s
 
-# GCN: BUFFER_LOAD_DWORD_OFFEN %{{[0-9]+}}, killed %{{[0-9]+}}, 0, 4095
+# GCN-LABEL: name: smrd_vgpr_offset_imm
+# GCN: V_READFIRSTLANE_B32
+# GCN: S_BUFFER_LOAD_DWORD_SGPR
 ---
 name:            smrd_vgpr_offset_imm
 body:             |
@@ -22,7 +24,9 @@ body:             |
     SI_RETURN_TO_EPILOG $vgpr0
 ...
 
-# GCN: BUFFER_LOAD_DWORD_OFFEN %{{[0-9]+}}, killed %{{[0-9]+}}, 0, 4095
+# GCN-LABEL: name: smrd_vgpr_offset_imm_add_u32
+# GCN: V_READFIRSTLANE_B32
+# GCN: S_BUFFER_LOAD_DWORD_SGPR
 ---
 name:            smrd_vgpr_offset_imm_add_u32
 body:             |
diff --git a/test/CodeGen/AMDGPU/smrd.ll b/test/CodeGen/AMDGPU/smrd.ll
index b4220c25f00..0e89f8516bf 100644
--- a/test/CodeGen/AMDGPU/smrd.ll
+++ b/test/CodeGen/AMDGPU/smrd.ll
@@ -292,18 +292,19 @@ main_body:
 
 ; GCN-LABEL: {{^}}smrd_vgpr_offset_imm:
 ; GCN-NEXT: %bb.
-; GCN-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen offset:4095 ;
+; GCN-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen offset:4092 ;
 define amdgpu_ps float @smrd_vgpr_offset_imm(<4 x i32> inreg %desc, i32 %offset) #0 {
 main_body:
-  %off = add i32 %offset, 4095
+  %off = add i32 %offset, 4092
   %r = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %off)
   ret float %r
 }
 
 ; GCN-LABEL: {{^}}smrd_vgpr_offset_imm_too_large:
 ; GCN-NEXT: %bb.
-; GCN-NEXT: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}0x1000, v0
-; GCN-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen ;
+; SICI-NEXT: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}0x1000, v0
+; SICI-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen ;
+; VIGFX9-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 4 offen offset:4092 ;
 define amdgpu_ps float @smrd_vgpr_offset_imm_too_large(<4 x i32> inreg %desc, i32 %offset) #0 {
 main_body:
   %off = add i32 %offset, 4096
@@ -495,6 +496,59 @@ main_body:
   ret void
 }
 
+; SMRD load with a non-const non-uniform offset of > 4 dwords (requires splitting)
+; GCN-LABEL: {{^}}smrd_load_nonconst3:
+; GCN-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
+; GCN-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
+; GCN-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
+; GCN-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
+; GCN: ; return to shader part epilog
+define amdgpu_ps <16 x float> @smrd_load_nonconst3(<4 x i32> inreg %rsrc, i32 %off) #0 {
+main_body:
+  %ld = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %rsrc, i32 %off, i32 0)
+  %bc = bitcast <16 x i32> %ld to <16 x float>
+  ret <16 x float> %bc
+}
+
+; GCN-LABEL: {{^}}smrd_load_nonconst4:
+; SICI: v_add_i32_e32 v{{[0-9]+}}, vcc, 0xff8, v0 ;
+; SICI-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
+; SICI-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
+; SICI-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
+; SICI-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
+; VIGFX9-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 56 offen offset:4032 ;
+; VIGFX9-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 56 offen offset:4048 ;
+; VIGFX9-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 56 offen offset:4064 ;
+; VIGFX9-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 56 offen offset:4080 ;
+; GCN: ; return to shader part epilog
+define amdgpu_ps <16 x float> @smrd_load_nonconst4(<4 x i32> inreg %rsrc, i32 %off) #0 {
+main_body:
+  %off.2 = add i32 %off, 4088
+  %ld = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %rsrc, i32 %off.2, i32 0)
+  %bc = bitcast <16 x i32> %ld to <16 x float>
+  ret <16 x float> %bc
+}
+
+; GCN-LABEL: {{^}}smrd_load_nonconst5:
+; SICI: v_add_i32_e32 v{{[0-9]+}}, vcc, 0x1004, v0
+; SICI-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
+; SICI-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
+; SICI-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
+; SICI-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
+; VIGFX9: s_movk_i32 s4, 0xfc0
+; VIGFX9-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], s4 offen offset:68 ;
+; VIGFX9-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], s4 offen offset:84 ;
+; VIGFX9-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], s4 offen offset:100 ;
+; VIGFX9-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], s4 offen offset:116 ;
+; GCN: ; return to shader part epilog
+define amdgpu_ps <16 x float> @smrd_load_nonconst5(<4 x i32> inreg %rsrc, i32 %off) #0 {
+main_body:
+  %off.2 = add i32 %off, 4100
+  %ld = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %rsrc, i32 %off.2, i32 0)
+  %bc = bitcast <16 x i32> %ld to <16 x float>
+  ret <16 x float> %bc
+}
+
 ; SMRD load dwordx2
 ; GCN-LABEL: {{^}}smrd_load_dwordx2:
 ; SIVIGFX9: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
@@ -513,9 +567,10 @@ main_body:
 
 ; GCN-LABEL: {{^}}smrd_uniform_loop:
 ;
-; TODO: this should use an s_buffer_load
+; TODO: we should keep the loop counter in an SGPR
 ;
-; GCN: buffer_load_dword
+; GCN: v_readfirstlane_b32
+; GCN: s_buffer_load_dword
 define amdgpu_ps float @smrd_uniform_loop(<4 x i32> inreg %desc, i32 %bound) #0 {
 main_body:
   br label %loop
-- 
GitLab


From 2026bfdfbd1d01f3835abfb597169dd8241b78c2 Mon Sep 17 00:00:00 2001
From: Nicolai Haehnle <nhaehnle@gmail.com>
Date: Wed, 17 Oct 2018 15:37:41 +0000
Subject: [PATCH 0282/1116] StructurizeCFG: Simplify inserted PHI nodes

Summary:
This improves subsequent divergence analysis in some cases.

Change-Id: I5e95e7ec7fd3fa80d414d1a53a02fea23e3d67d3

Reviewers: arsenm, rampitec

Subscribers: jvesely, wdng, llvm-commits

Differential Revision: https://reviews.llvm.org/D53316

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344697 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/StructurizeCFG.cpp      | 24 ++++++++++++++++++-
 .../AMDGPU/multi-divergent-exit-region.ll     | 11 ++++-----
 test/CodeGen/AMDGPU/multilevel-break.ll       |  3 +++
 test/CodeGen/AMDGPU/smrd.ll                   |  7 +++---
 .../StructurizeCFG/invert-constantexpr.ll     |  7 +++---
 .../StructurizeCFG/loop-continue-phi.ll       | 15 ++++--------
 .../one-loop-multiple-backedges.ll            | 19 ++++++++-------
 7 files changed, 54 insertions(+), 32 deletions(-)

diff --git a/lib/Transforms/Scalar/StructurizeCFG.cpp b/lib/Transforms/Scalar/StructurizeCFG.cpp
index 2bfd9927411..0db762d846f 100644
--- a/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -13,6 +13,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/RegionInfo.h"
@@ -596,7 +597,8 @@ void StructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) {
 
 /// Add the real PHI value as soon as everything is set up
 void StructurizeCFG::setPhiValues() {
-  SSAUpdater Updater;
+  SmallVector<PHINode *, 8> InsertedPhis;
+  SSAUpdater Updater(&InsertedPhis);
   for (const auto &AddedPhi : AddedPhis) {
     BasicBlock *To = AddedPhi.first;
     const BBVector &From = AddedPhi.second;
@@ -632,6 +634,26 @@ void StructurizeCFG::setPhiValues() {
     DeletedPhis.erase(To);
   }
   assert(DeletedPhis.empty());
+
+  // Simplify any phis inserted by the SSAUpdater if possible
+  bool Changed;
+  do {
+    Changed = false;
+
+    SimplifyQuery Q(Func->getParent()->getDataLayout());
+    Q.DT = DT;
+    for (size_t i = 0; i < InsertedPhis.size(); ++i) {
+      PHINode *Phi = InsertedPhis[i];
+      if (Value *V = SimplifyInstruction(Phi, Q)) {
+        Phi->replaceAllUsesWith(V);
+        Phi->eraseFromParent();
+        InsertedPhis[i] = InsertedPhis.back();
+        InsertedPhis.pop_back();
+        i--;
+        Changed = true;
+      }
+    }
+  } while (Changed);
 }
 
 /// Remove phi values from all successors and then remove the terminator.
diff --git a/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
index 9e6efc565e4..fbdf9832b29 100644
--- a/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
+++ b/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -312,13 +312,12 @@ exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 
 ; IR-LABEL: @multi_divergent_region_exit_ret_ret_return_value(
 ; IR: Flow2:
-; IR: %11 = phi float [ 2.000000e+00, %exit1 ], [ undef, %Flow1 ]
-; IR: %12 = phi i1 [ false, %exit1 ], [ %16, %Flow1 ]
-; IR: call void @llvm.amdgcn.end.cf(i64 %20)
+; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %19)
 
 ; IR: UnifiedReturnBlock:
-; IR: %UnifiedRetVal = phi float [ %11, %Flow2 ], [ 1.000000e+00, %exit0 ]
-; IR: call void @llvm.amdgcn.end.cf(i64 %15)
+; IR: %UnifiedRetVal = phi float [ 2.000000e+00, %Flow2 ], [ 1.000000e+00, %exit0 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %14)
 ; IR: ret float %UnifiedRetVal
 define amdgpu_ps float @multi_divergent_region_exit_ret_ret_return_value(i32 %vgpr) #0 {
 entry:
@@ -353,8 +352,8 @@ exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 ; GCN: {{^}}[[FLOW]]:
 ; GCN: s_cbranch_vccnz [[FLOW1:BB[0-9]+]]
 
-; GCN: v_mov_b32_e32 v0, 2.0
 ; GCN: s_or_b64 exec, exec
+; GCN: v_mov_b32_e32 v0, 2.0
 ; GCN-NOT: s_and_b64 exec, exec
 ; GCN: v_mov_b32_e32 v0, 1.0
 
diff --git a/test/CodeGen/AMDGPU/multilevel-break.ll b/test/CodeGen/AMDGPU/multilevel-break.ll
index 3f7df7b6005..216ca1973b5 100644
--- a/test/CodeGen/AMDGPU/multilevel-break.ll
+++ b/test/CodeGen/AMDGPU/multilevel-break.ll
@@ -33,6 +33,8 @@
 ; GCN-NEXT: s_mov_b64
 ; GCN-NEXT: s_and_b64 [[MASKED_SAVE_BREAK:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_BREAK]]
 ; GCN-NEXT: s_or_b64 [[OR_BREAK:s\[[0-9]+:[0-9]+\]]], [[MASKED_SAVE_BREAK]], s{{\[[0-9]+:[0-9]+\]}}
+; TODO: get rid of redundant loop counter moves
+; GCN-NEXT: v_mov_b32_e32
 ; GCN-NEXT: s_andn2_b64 exec, exec, [[OR_BREAK]]
 ; GCN-NEXT: s_cbranch_execnz [[INNER_LOOP]]
 
@@ -43,6 +45,7 @@
 ; GCN-NEXT: s_or_b64 exec, exec, [[OR_BREAK]]
 ; GCN-NEXT: s_and_b64 [[MASKED2_SAVE_BREAK:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_BREAK]]
 ; GCN-NEXT: s_or_b64 [[OUTER_OR_BREAK:s\[[0-9]+:[0-9]+\]]], [[MASKED2_SAVE_BREAK]], s{{\[[0-9]+:[0-9]+\]}}
+; GCN-NEXT: v_mov_b32_e32
 ; GCN-NEXT: s_andn2_b64 exec, exec, [[OUTER_OR_BREAK]]
 ; GCN-NEXT: s_cbranch_execnz [[OUTER_LOOP]]
 define amdgpu_vs void @multi_else_break(<4 x float> %vec, i32 %ub, i32 %cont) {
diff --git a/test/CodeGen/AMDGPU/smrd.ll b/test/CodeGen/AMDGPU/smrd.ll
index 0e89f8516bf..f453cfdbd1f 100644
--- a/test/CodeGen/AMDGPU/smrd.ll
+++ b/test/CodeGen/AMDGPU/smrd.ll
@@ -592,11 +592,12 @@ exit:
 
 ; GCN-LABEL: {{^}}smrd_uniform_loop2:
 ; (this test differs from smrd_uniform_loop by the more complex structure of phis,
-; which currently confuses the DivergenceAnalysis after structurization)
+; which used to confuse the DivergenceAnalysis after structurization)
 ;
-; TODO: this should use an s_buffer_load
+; TODO: we should keep the loop counter in an SGPR
 ;
-; GCN: buffer_load_dword
+; GCN: v_readfirstlane_b32
+; GCN: s_buffer_load_dword
 define amdgpu_ps float @smrd_uniform_loop2(<4 x i32> inreg %desc, i32 %bound, i32 %bound.a) #0 {
 main_body:
   br label %loop
diff --git a/test/Transforms/StructurizeCFG/invert-constantexpr.ll b/test/Transforms/StructurizeCFG/invert-constantexpr.ll
index ac12b5d6b65..61482bb73ad 100644
--- a/test/Transforms/StructurizeCFG/invert-constantexpr.ll
+++ b/test/Transforms/StructurizeCFG/invert-constantexpr.ll
@@ -12,13 +12,12 @@ define void @invert_constantexpr_condition(i32 %arg, i32 %arg1) #0 {
 ; CHECK:       bb2:
 ; CHECK-NEXT:    br label [[FLOW]]
 ; CHECK:       bb3:
-; CHECK-NEXT:    [[TMP4:%.*]] = phi i1 [ [[TMP1:%.*]], [[FLOW]] ], [ [[TMP7:%.*]], [[BB6:%.*]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i1 [ undef, [[FLOW]] ], [ [[TMP7:%.*]], [[BB6:%.*]] ]
 ; CHECK-NEXT:    [[TMP5:%.*]] = or i1 [[TMP4]], icmp eq (i32 ptrtoint (i32* @g to i32), i32 0)
 ; CHECK-NEXT:    br label [[BB8:%.*]]
 ; CHECK:       Flow:
-; CHECK-NEXT:    [[TMP1]] = phi i1 [ undef, [[BB2]] ], [ undef, [[BB:%.*]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = phi i1 [ [[TMP0]], [[BB2]] ], [ icmp ne (i32 ptrtoint (i32* @g to i32), i32 0), [[BB]] ]
-; CHECK-NEXT:    br i1 [[TMP2]], label [[BB6]], label [[BB3:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i1 [ [[TMP0]], [[BB2]] ], [ icmp ne (i32 ptrtoint (i32* @g to i32), i32 0), [[BB:%.*]] ]
+; CHECK-NEXT:    br i1 [[TMP1]], label [[BB6]], label [[BB3:%.*]]
 ; CHECK:       bb6:
 ; CHECK-NEXT:    [[TMP7]] = icmp slt i32 [[ARG]], [[ARG1:%.*]]
 ; CHECK-NEXT:    br label [[BB3]]
diff --git a/test/Transforms/StructurizeCFG/loop-continue-phi.ll b/test/Transforms/StructurizeCFG/loop-continue-phi.ll
index 7e1c0b9413f..2300aea077f 100644
--- a/test/Transforms/StructurizeCFG/loop-continue-phi.ll
+++ b/test/Transforms/StructurizeCFG/loop-continue-phi.ll
@@ -1,28 +1,23 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -o - -structurizecfg < %s | FileCheck %s
 
-;
-; TODO: eliminate redundant phis for the loop counter
-;
 define void @test1() {
 ; CHECK-LABEL: @test1(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       Flow:
-; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[CTR_NEXT:%.*]], [[LOOP_B:%.*]] ], [ [[CTR_NEXT]], [[LOOP_A:%.*]] ]
 ; CHECK-NEXT:    br label [[FLOW1:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[CTR:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP1:%.*]], [[FLOW1]] ]
+; CHECK-NEXT:    [[CTR:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[CTR_NEXT:%.*]], [[FLOW1]] ]
 ; CHECK-NEXT:    [[CTR_NEXT]] = add i32 [[CTR]], 1
-; CHECK-NEXT:    br i1 undef, label [[LOOP_A]], label [[FLOW1]]
+; CHECK-NEXT:    br i1 undef, label [[LOOP_A:%.*]], label [[FLOW1]]
 ; CHECK:       loop.a:
-; CHECK-NEXT:    br i1 undef, label [[LOOP_B]], label [[FLOW:%.*]]
+; CHECK-NEXT:    br i1 undef, label [[LOOP_B:%.*]], label [[FLOW:%.*]]
 ; CHECK:       loop.b:
 ; CHECK-NEXT:    br label [[FLOW]]
 ; CHECK:       Flow1:
-; CHECK-NEXT:    [[TMP1]] = phi i32 [ [[TMP0]], [[FLOW]] ], [ undef, [[LOOP]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = phi i1 [ false, [[FLOW]] ], [ true, [[LOOP]] ]
-; CHECK-NEXT:    br i1 [[TMP2]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi i1 [ false, [[FLOW]] ], [ true, [[LOOP]] ]
+; CHECK-NEXT:    br i1 [[TMP0]], label [[EXIT:%.*]], label [[LOOP]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/test/Transforms/StructurizeCFG/one-loop-multiple-backedges.ll b/test/Transforms/StructurizeCFG/one-loop-multiple-backedges.ll
index 668a1e99d81..0af25d61b92 100644
--- a/test/Transforms/StructurizeCFG/one-loop-multiple-backedges.ll
+++ b/test/Transforms/StructurizeCFG/one-loop-multiple-backedges.ll
@@ -8,33 +8,36 @@ bb:
   br label %bb3
 
 ; CHECK: bb3:
+; CHECK:   %0 = xor i1 %tmp4, true
+; CHECK:   br i1 %0, label %bb5, label %Flow
 bb3:                                              ; preds = %bb7, %bb
   %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb7 ]
   %tmp4 = fcmp ult float %arg1, 3.500000e+00
-; CHECK: %0 = xor i1 %tmp4, true
-; CHECK: br i1 %0, label %bb5, label %Flow
   br i1 %tmp4, label %bb7, label %bb5
 
 ; CHECK: bb5:
+; CHECK:   %1 = xor i1 %tmp6, true
+; CHECK:   br label %Flow
 bb5:                                              ; preds = %bb3
   %tmp6 = fcmp olt float 0.000000e+00, %arg2
-; CHECK: br label %Flow
   br i1 %tmp6, label %bb10, label %bb7
 
 ; CHECK: Flow:
-; CHECK: br i1 %3, label %bb7, label %Flow1
+; CHECK:   %2 = phi i1 [ %1, %bb5 ], [ %tmp4, %bb3 ]
+; CHECK:   br i1 %2, label %bb7, label %Flow1
 
-; CHECK: bb7
+; CHECK: bb7:
+; CHECK:   br label %Flow1
 bb7:                                              ; preds = %bb5, %bb3
   %tmp8 = add nuw nsw i64 %tmp, 1
   %tmp9 = icmp slt i64 %tmp8, 5
-; CHECK: br label %Flow1
   br i1 %tmp9, label %bb3, label %bb10
 
 ; CHECK: Flow1:
-; CHECK: br i1 %7, label %bb10, label %bb3
+; CHECK:   %6 = phi i1 [ %3, %bb7 ], [ true, %Flow ]
+; CHECK:   br i1 %6, label %bb10, label %bb3
 
-; CHECK: bb10
+; CHECK: bb10:
 bb10:                                             ; preds = %bb7, %bb5
   %tmp11 = phi i32 [ 15, %bb5 ], [ 255, %bb7 ]
   store i32 %tmp11, i32 addrspace(1)* %arg, align 4
-- 
GitLab


From 1db6c096861a48b487395f69a6857248a9a58a06 Mon Sep 17 00:00:00 2001
From: Nicolai Haehnle <nhaehnle@gmail.com>
Date: Wed, 17 Oct 2018 15:37:48 +0000
Subject: [PATCH 0283/1116] AMDGPU: Avoid selecting ds_{read,write}2_b32 on SI

Summary:
To workaround a hardware issue in the (base + offset) calculation
when base is negative. The impact on code quality should be limited
since SILoadStoreOptimizer still runs afterwards and is able to
combine loads/stores based on known sign information.

This fixes visible corruption in Hitman on SI (easily reproducible
by running benchmark mode).

Change-Id: Ia178d207a5e2ac38ae7cd98b532ea2ae74704e5f
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=99923

Reviewers: arsenm, mareko

Subscribers: jholewinski, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits

Differential Revision: https://reviews.llvm.org/D53160

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344698 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp |   2 -
 lib/Target/AMDGPU/DSInstructions.td      |   4 +-
 lib/Target/AMDGPU/SIISelLowering.cpp     |  23 ++++
 test/CodeGen/AMDGPU/lds-bounds.ll        | 129 +++++++++++++++++++++++
 4 files changed, 155 insertions(+), 3 deletions(-)
 create mode 100644 test/CodeGen/AMDGPU/lds-bounds.ll

diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 4010b77172c..025e2de742d 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -978,8 +978,6 @@ bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
 
   // default case
 
-  // FIXME: This is broken on SI where we still need to check if the base
-  // pointer is positive here.
   Base = Addr;
   Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8);
   Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8);
diff --git a/lib/Target/AMDGPU/DSInstructions.td b/lib/Target/AMDGPU/DSInstructions.td
index cdc6ab9412e..31d2ebef481 100644
--- a/lib/Target/AMDGPU/DSInstructions.td
+++ b/lib/Target/AMDGPU/DSInstructions.td
@@ -728,7 +728,9 @@ class DS64Bit4ByteAlignedWritePat<DS_Pseudo inst, PatFrag frag> : GCNPat<
               (i1 0))
 >;
 
-let OtherPredicates = [LDSRequiresM0Init] in {
+// v2i32 loads are split into i32 loads on SI during lowering, due to a bug
+// related to bounds checking.
+let OtherPredicates = [LDSRequiresM0Init, isCIVI] in {
 def : DS64Bit4ByteAlignedReadPat<DS_READ2_B32, load_local_m0>;
 def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32, store_local_m0>;
 }
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index c2cf30763c2..81ff640f704 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6292,6 +6292,17 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
 
     if (NumElements > 2)
       return SplitVectorLoad(Op, DAG);
+
+    // SI has a hardware bug in the LDS / GDS boounds checking: if the base
+    // address is negative, then the instruction is incorrectly treated as
+    // out-of-bounds even if base + offsets is in bounds. Split vectorized
+    // loads here to avoid emitting ds_read2_b32. We may re-combine the
+    // load later in the SILoadStoreOptimizer.
+    if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
+        NumElements == 2 && MemVT.getStoreSize() == 8 &&
+        Load->getAlignment() < 8) {
+      return SplitVectorLoad(Op, DAG);
+    }
   }
   return SDValue();
 }
@@ -6694,6 +6705,18 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
 
     if (NumElements > 2)
       return SplitVectorStore(Op, DAG);
+
+    // SI has a hardware bug in the LDS / GDS boounds checking: if the base
+    // address is negative, then the instruction is incorrectly treated as
+    // out-of-bounds even if base + offsets is in bounds. Split vectorized
+    // stores here to avoid emitting ds_write2_b32. We may re-combine the
+    // store later in the SILoadStoreOptimizer.
+    if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
+        NumElements == 2 && VT.getStoreSize() == 8 &&
+        Store->getAlignment() < 8) {
+      return SplitVectorStore(Op, DAG);
+    }
+
     return SDValue();
   } else {
     llvm_unreachable("unhandled address space");
diff --git a/test/CodeGen/AMDGPU/lds-bounds.ll b/test/CodeGen/AMDGPU/lds-bounds.ll
new file mode 100644
index 00000000000..80a26281216
--- /dev/null
+++ b/test/CodeGen/AMDGPU/lds-bounds.ll
@@ -0,0 +1,129 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
+; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOSI %s
+
+@compute_lds = external addrspace(3) global [512 x i32], align 16
+
+; GCN-LABEL: {{^}}store_aligned:
+; GCN: ds_write_b64
+define amdgpu_cs void @store_aligned(i32 addrspace(3)* %ptr) #0 {
+entry:
+  %ptr.gep.1 = getelementptr i32, i32 addrspace(3)* %ptr, i32 1
+
+  store i32 42, i32 addrspace(3)* %ptr, align 8
+  store i32 43, i32 addrspace(3)* %ptr.gep.1
+  ret void
+}
+
+
+; GCN-LABEL: {{^}}load_aligned:
+; GCN: ds_read_b64
+define amdgpu_cs <2 x float> @load_aligned(i32 addrspace(3)* %ptr) #0 {
+entry:
+  %ptr.gep.1 = getelementptr i32, i32 addrspace(3)* %ptr, i32 1
+
+  %v.0 = load i32, i32 addrspace(3)* %ptr, align 8
+  %v.1 = load i32, i32 addrspace(3)* %ptr.gep.1
+
+  %r.0 = insertelement <2 x i32> undef, i32 %v.0, i32 0
+  %r.1 = insertelement <2 x i32> %r.0, i32 %v.1, i32 1
+  %bc = bitcast <2 x i32> %r.1 to <2 x float>
+  ret <2 x float> %bc
+}
+
+
+; GCN-LABEL: {{^}}store_global_const_idx:
+; GCN: ds_write2_b32
+define amdgpu_cs void @store_global_const_idx() #0 {
+entry:
+  %ptr.a = getelementptr [512 x i32], [512 x i32] addrspace(3)* @compute_lds, i32 0, i32 3
+  %ptr.b = getelementptr [512 x i32], [512 x i32] addrspace(3)* @compute_lds, i32 0, i32 4
+
+  store i32 42, i32 addrspace(3)* %ptr.a
+  store i32 43, i32 addrspace(3)* %ptr.b
+  ret void
+}
+
+
+; GCN-LABEL: {{^}}load_global_const_idx:
+; GCN: ds_read2_b32
+define amdgpu_cs <2 x float> @load_global_const_idx() #0 {
+entry:
+  %ptr.a = getelementptr [512 x i32], [512 x i32] addrspace(3)* @compute_lds, i32 0, i32 3
+  %ptr.b = getelementptr [512 x i32], [512 x i32] addrspace(3)* @compute_lds, i32 0, i32 4
+
+  %v.0 = load i32, i32 addrspace(3)* %ptr.a
+  %v.1 = load i32, i32 addrspace(3)* %ptr.b
+
+  %r.0 = insertelement <2 x i32> undef, i32 %v.0, i32 0
+  %r.1 = insertelement <2 x i32> %r.0, i32 %v.1, i32 1
+  %bc = bitcast <2 x i32> %r.1 to <2 x float>
+  ret <2 x float> %bc
+}
+
+
+; GCN-LABEL: {{^}}store_global_var_idx_case1:
+; SI: ds_write_b32
+; SI: ds_write_b32
+; NONSI: ds_write2_b32
+define amdgpu_cs void @store_global_var_idx_case1(i32 %idx) #0 {
+entry:
+  %ptr.a = getelementptr [512 x i32], [512 x i32] addrspace(3)* @compute_lds, i32 0, i32 %idx
+  %ptr.b = getelementptr i32, i32 addrspace(3)* %ptr.a, i32 1
+
+  store i32 42, i32 addrspace(3)* %ptr.a
+  store i32 43, i32 addrspace(3)* %ptr.b
+  ret void
+}
+
+
+; GCN-LABEL: {{^}}load_global_var_idx_case1:
+; SI: ds_read_b32
+; SI: ds_read_b32
+; NONSI: ds_read2_b32
+define amdgpu_cs <2 x float> @load_global_var_idx_case1(i32 %idx) #0 {
+entry:
+  %ptr.a = getelementptr [512 x i32], [512 x i32] addrspace(3)* @compute_lds, i32 0, i32 %idx
+  %ptr.b = getelementptr i32, i32 addrspace(3)* %ptr.a, i32 1
+
+  %v.0 = load i32, i32 addrspace(3)* %ptr.a
+  %v.1 = load i32, i32 addrspace(3)* %ptr.b
+
+  %r.0 = insertelement <2 x i32> undef, i32 %v.0, i32 0
+  %r.1 = insertelement <2 x i32> %r.0, i32 %v.1, i32 1
+  %bc = bitcast <2 x i32> %r.1 to <2 x float>
+  ret <2 x float> %bc
+}
+
+
+; GCN-LABEL: {{^}}store_global_var_idx_case2:
+; GCN: ds_write2_b32
+define amdgpu_cs void @store_global_var_idx_case2(i32 %idx) #0 {
+entry:
+  %idx.and = and i32 %idx, 255
+  %ptr.a = getelementptr [512 x i32], [512 x i32] addrspace(3)* @compute_lds, i32 0, i32 %idx.and
+  %ptr.b = getelementptr i32, i32 addrspace(3)* %ptr.a, i32 1
+
+  store i32 42, i32 addrspace(3)* %ptr.a
+  store i32 43, i32 addrspace(3)* %ptr.b
+  ret void
+}
+
+
+; GCN-LABEL: {{^}}load_global_var_idx_case2:
+; GCN: ds_read2_b32
+define amdgpu_cs <2 x float> @load_global_var_idx_case2(i32 %idx) #0 {
+entry:
+  %idx.and = and i32 %idx, 255
+  %ptr.a = getelementptr [512 x i32], [512 x i32] addrspace(3)* @compute_lds, i32 0, i32 %idx.and
+  %ptr.b = getelementptr i32, i32 addrspace(3)* %ptr.a, i32 1
+
+  %v.0 = load i32, i32 addrspace(3)* %ptr.a
+  %v.1 = load i32, i32 addrspace(3)* %ptr.b
+
+  %r.0 = insertelement <2 x i32> undef, i32 %v.0, i32 0
+  %r.1 = insertelement <2 x i32> %r.0, i32 %v.1, i32 1
+  %bc = bitcast <2 x i32> %r.1 to <2 x float>
+  ret <2 x float> %bc
+}
+
+attributes #0 = { nounwind }
-- 
GitLab


From 97b215a77328e768822f560f5e38386d9a7503dd Mon Sep 17 00:00:00 2001
From: Pavel Labath <pavel@labath.sk>
Date: Wed, 17 Oct 2018 18:50:25 +0000
Subject: [PATCH 0284/1116] Port libcxxabi r344607 into llvm

Summary:
The original commit message was:
    This uses CRTP (for performance reasons) to allow a user the override
    demangler functions to implement custom parsing logic. The motivation
    for this is LLDB, which needs to occasionaly modify the mangled names.
    One such instance is already implemented via the TypeCallback member,
    but this is very specific functionality which does not help with any
    other use case. Currently we have a use case for modifying the
    constructor flavours, which would require adding another callback. This
    approach does not scale.

    With CRTP, the user (LLDB) can override any function it needs without
    any special support from the demangler library. After LLDB is ported to
    use this instead of the TypeCallback mechanism, the callback can be
    removed.

The only difference here is the addition of a unit test which exercises
the CRTP mechanism to override a function in the parser.

Reviewers: erik.pilkington, rsmith, EricWF

Subscribers: mgorny, kristina, llvm-commits

Differential Revision: https://reviews.llvm.org/D53300

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344703 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Demangle/ItaniumDemangle.h      | 664 ++++++++++---------
 lib/Demangle/ItaniumDemangle.cpp             |   2 +-
 lib/Support/ItaniumManglingCanonicalizer.cpp |   3 +-
 unittests/Demangle/CMakeLists.txt            |   2 +
 unittests/Demangle/ItaniumDemangleTest.cpp   |  54 ++
 5 files changed, 421 insertions(+), 304 deletions(-)
 create mode 100644 unittests/Demangle/ItaniumDemangleTest.cpp

diff --git a/include/llvm/Demangle/ItaniumDemangle.h b/include/llvm/Demangle/ItaniumDemangle.h
index bc60bc3454e..a465921843f 100644
--- a/include/llvm/Demangle/ItaniumDemangle.h
+++ b/include/llvm/Demangle/ItaniumDemangle.h
@@ -2134,8 +2134,7 @@ public:
   }
 };
 
-template <typename Alloc>
-struct Db {
+template <typename Derived, typename Alloc> struct AbstractManglingParser {
   const char *First;
   const char *Last;
 
@@ -2167,7 +2166,10 @@ struct Db {
 
   Alloc ASTAllocator;
 
-  Db(const char *First_, const char *Last_) : First(First_), Last(Last_) {}
+  AbstractManglingParser(const char *First_, const char *Last_)
+      : First(First_), Last(Last_) {}
+
+  Derived &getDerived() { return static_cast<Derived &>(*this); }
 
   void reset(const char *First_, const char *Last_) {
     First = First_;
@@ -2274,7 +2276,7 @@ struct Db {
     FunctionRefQual ReferenceQualifier = FrefQualNone;
     size_t ForwardTemplateRefsBegin;
 
-    NameState(Db *Enclosing)
+    NameState(AbstractManglingParser *Enclosing)
         : ForwardTemplateRefsBegin(Enclosing->ForwardTemplateRefs.size()) {}
   };
 
@@ -2324,35 +2326,36 @@ const char* parse_discriminator(const char* first, const char* last);
 //
 // <unscoped-template-name> ::= <unscoped-name>
 //                          ::= <substitution>
-template<typename Alloc> Node *Db<Alloc>::parseName(NameState *State) {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseName(NameState *State) {
   consumeIf('L'); // extension
 
   if (look() == 'N')
-    return parseNestedName(State);
+    return getDerived().parseNestedName(State);
   if (look() == 'Z')
-    return parseLocalName(State);
+    return getDerived().parseLocalName(State);
 
   //        ::= <unscoped-template-name> <template-args>
   if (look() == 'S' && look(1) != 't') {
-    Node *S = parseSubstitution();
+    Node *S = getDerived().parseSubstitution();
     if (S == nullptr)
       return nullptr;
     if (look() != 'I')
       return nullptr;
-    Node *TA = parseTemplateArgs(State != nullptr);
+    Node *TA = getDerived().parseTemplateArgs(State != nullptr);
     if (TA == nullptr)
       return nullptr;
     if (State) State->EndsWithTemplateArgs = true;
     return make<NameWithTemplateArgs>(S, TA);
   }
 
-  Node *N = parseUnscopedName(State);
+  Node *N = getDerived().parseUnscopedName(State);
   if (N == nullptr)
     return nullptr;
   //        ::= <unscoped-template-name> <template-args>
   if (look() == 'I') {
     Subs.push_back(N);
-    Node *TA = parseTemplateArgs(State != nullptr);
+    Node *TA = getDerived().parseTemplateArgs(State != nullptr);
     if (TA == nullptr)
       return nullptr;
     if (State) State->EndsWithTemplateArgs = true;
@@ -2365,10 +2368,11 @@ template<typename Alloc> Node *Db<Alloc>::parseName(NameState *State) {
 // <local-name> := Z <function encoding> E <entity name> [<discriminator>]
 //              := Z <function encoding> E s [<discriminator>]
 //              := Z <function encoding> Ed [ <parameter number> ] _ <entity name>
-template<typename Alloc> Node *Db<Alloc>::parseLocalName(NameState *State) {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseLocalName(NameState *State) {
   if (!consumeIf('Z'))
     return nullptr;
-  Node *Encoding = parseEncoding();
+  Node *Encoding = getDerived().parseEncoding();
   if (Encoding == nullptr || !consumeIf('E'))
     return nullptr;
 
@@ -2384,13 +2388,13 @@ template<typename Alloc> Node *Db<Alloc>::parseLocalName(NameState *State) {
     parseNumber(true);
     if (!consumeIf('_'))
       return nullptr;
-    Node *N = parseName(State);
+    Node *N = getDerived().parseName(State);
     if (N == nullptr)
       return nullptr;
     return make<LocalName>(Encoding, N);
   }
 
-  Node *Entity = parseName(State);
+  Node *Entity = getDerived().parseName(State);
   if (Entity == nullptr)
     return nullptr;
   First = parse_discriminator(First, Last);
@@ -2400,14 +2404,16 @@ template<typename Alloc> Node *Db<Alloc>::parseLocalName(NameState *State) {
 // <unscoped-name> ::= <unqualified-name>
 //                 ::= St <unqualified-name>   # ::std::
 // extension       ::= StL<unqualified-name>
-template<typename Alloc> Node *Db<Alloc>::parseUnscopedName(NameState *State) {
- if (consumeIf("StL") || consumeIf("St")) {
-   Node *R = parseUnqualifiedName(State);
-   if (R == nullptr)
-     return nullptr;
-   return make<StdQualifiedName>(R);
- }
- return parseUnqualifiedName(State);
+template <typename Derived, typename Alloc>
+Node *
+AbstractManglingParser<Derived, Alloc>::parseUnscopedName(NameState *State) {
+  if (consumeIf("StL") || consumeIf("St")) {
+    Node *R = getDerived().parseUnqualifiedName(State);
+    if (R == nullptr)
+      return nullptr;
+    return make<StdQualifiedName>(R);
+  }
+  return getDerived().parseUnqualifiedName(State);
 }
 
 // <unqualified-name> ::= <operator-name> [abi-tags]
@@ -2415,27 +2421,28 @@ template<typename Alloc> Node *Db<Alloc>::parseUnscopedName(NameState *State) {
 //                    ::= <source-name>
 //                    ::= <unnamed-type-name>
 //                    ::= DC <source-name>+ E      # structured binding declaration
-template<typename Alloc>
-Node *Db<Alloc>::parseUnqualifiedName(NameState *State) {
+template <typename Derived, typename Alloc>
+Node *
+AbstractManglingParser<Derived, Alloc>::parseUnqualifiedName(NameState *State) {
   // <ctor-dtor-name>s are special-cased in parseNestedName().
   Node *Result;
   if (look() == 'U')
-    Result = parseUnnamedTypeName(State);
+    Result = getDerived().parseUnnamedTypeName(State);
   else if (look() >= '1' && look() <= '9')
-    Result = parseSourceName(State);
+    Result = getDerived().parseSourceName(State);
   else if (consumeIf("DC")) {
     size_t BindingsBegin = Names.size();
     do {
-      Node *Binding = parseSourceName(State);
+      Node *Binding = getDerived().parseSourceName(State);
       if (Binding == nullptr)
         return nullptr;
       Names.push_back(Binding);
     } while (!consumeIf('E'));
     Result = make<StructuredBindingName>(popTrailingNodeArray(BindingsBegin));
   } else
-    Result = parseOperatorName(State);
+    Result = getDerived().parseOperatorName(State);
   if (Result != nullptr)
-    Result = parseAbiTags(Result);
+    Result = getDerived().parseAbiTags(Result);
   return Result;
 }
 
@@ -2445,7 +2452,9 @@ Node *Db<Alloc>::parseUnqualifiedName(NameState *State) {
 // <closure-type-name> ::= Ul <lambda-sig> E [ <nonnegative number> ] _
 //
 // <lambda-sig> ::= <parameter type>+  # Parameter types or "v" if the lambda has no parameters
-template<typename Alloc> Node *Db<Alloc>::parseUnnamedTypeName(NameState *) {
+template <typename Derived, typename Alloc>
+Node *
+AbstractManglingParser<Derived, Alloc>::parseUnnamedTypeName(NameState *) {
   if (consumeIf("Ut")) {
     StringView Count = parseNumber();
     if (!consumeIf('_'))
@@ -2458,7 +2467,7 @@ template<typename Alloc> Node *Db<Alloc>::parseUnnamedTypeName(NameState *) {
     if (!consumeIf("vE")) {
       size_t ParamsBegin = Names.size();
       do {
-        Node *P = parseType();
+        Node *P = getDerived().parseType();
         if (P == nullptr)
           return nullptr;
         Names.push_back(P);
@@ -2474,7 +2483,8 @@ template<typename Alloc> Node *Db<Alloc>::parseUnnamedTypeName(NameState *) {
 }
 
 // <source-name> ::= <positive length number> <identifier>
-template<typename Alloc> Node *Db<Alloc>::parseSourceName(NameState *) {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseSourceName(NameState *) {
   size_t Length = 0;
   if (parsePositiveInteger(&Length))
     return nullptr;
@@ -2538,7 +2548,9 @@ template<typename Alloc> Node *Db<Alloc>::parseSourceName(NameState *) {
 //                   ::= rS    # >>=
 //                   ::= ss    # <=> C++2a
 //                   ::= v <digit> <source-name>        # vendor extended operator
-template<typename Alloc> Node *Db<Alloc>::parseOperatorName(NameState *State) {
+template <typename Derived, typename Alloc>
+Node *
+AbstractManglingParser<Derived, Alloc>::parseOperatorName(NameState *State) {
   switch (look()) {
   case 'a':
     switch (look(1)) {
@@ -2578,7 +2590,7 @@ template<typename Alloc> Node *Db<Alloc>::parseOperatorName(NameState *State) {
       SwapAndRestore<bool> SavePermit(PermitForwardTemplateReferences,
                                       PermitForwardTemplateReferences ||
                                           State != nullptr);
-      Node* Ty = parseType();
+      Node *Ty = getDerived().parseType();
       if (Ty == nullptr)
         return nullptr;
       if (State) State->CtorDtorConversion = true;
@@ -2642,7 +2654,7 @@ template<typename Alloc> Node *Db<Alloc>::parseOperatorName(NameState *State) {
     //                   ::= li <source-name>  # operator ""
     case 'i': {
       First += 2;
-      Node *SN = parseSourceName(State);
+      Node *SN = getDerived().parseSourceName(State);
       if (SN == nullptr)
         return nullptr;
       return make<LiteralOperator>(SN);
@@ -2763,7 +2775,7 @@ template<typename Alloc> Node *Db<Alloc>::parseOperatorName(NameState *State) {
   case 'v':
     if (std::isdigit(look(1))) {
       First += 2;
-      Node *SN = parseSourceName(State);
+      Node *SN = getDerived().parseSourceName(State);
       if (SN == nullptr)
         return nullptr;
       return make<ConversionOperatorType>(SN);
@@ -2781,8 +2793,10 @@ template<typename Alloc> Node *Db<Alloc>::parseOperatorName(NameState *State) {
 //                  ::= D1  # complete object destructor
 //                  ::= D2  # base object destructor
 //   extension      ::= D5    # ?
-template<typename Alloc>
-Node *Db<Alloc>::parseCtorDtorName(Node *&SoFar, NameState *State) {
+template <typename Derived, typename Alloc>
+Node *
+AbstractManglingParser<Derived, Alloc>::parseCtorDtorName(Node *&SoFar,
+                                                          NameState *State) {
   if (SoFar->getKind() == Node::KSpecialSubstitution) {
     auto SSK = static_cast<SpecialSubstitution *>(SoFar)->SSK;
     switch (SSK) {
@@ -2806,7 +2820,7 @@ Node *Db<Alloc>::parseCtorDtorName(Node *&SoFar, NameState *State) {
     ++First;
     if (State) State->CtorDtorConversion = true;
     if (IsInherited) {
-      if (parseName(State) == nullptr)
+      if (getDerived().parseName(State) == nullptr)
         return nullptr;
     }
     return make<CtorDtorName>(SoFar, false, Variant);
@@ -2840,7 +2854,9 @@ Node *Db<Alloc>::parseCtorDtorName(Node *&SoFar, NameState *State) {
 // <template-prefix> ::= <prefix> <template unqualified-name>
 //                   ::= <template-param>
 //                   ::= <substitution>
-template<typename Alloc> Node *Db<Alloc>::parseNestedName(NameState *State) {
+template <typename Derived, typename Alloc>
+Node *
+AbstractManglingParser<Derived, Alloc>::parseNestedName(NameState *State) {
   if (!consumeIf('N'))
     return nullptr;
 
@@ -2881,7 +2897,7 @@ template<typename Alloc> Node *Db<Alloc>::parseNestedName(NameState *State) {
 
     //          ::= <template-param>
     if (look() == 'T') {
-      if (!PushComponent(parseTemplateParam()))
+      if (!PushComponent(getDerived().parseTemplateParam()))
         return nullptr;
       Subs.push_back(SoFar);
       continue;
@@ -2889,7 +2905,7 @@ template<typename Alloc> Node *Db<Alloc>::parseNestedName(NameState *State) {
 
     //          ::= <template-prefix> <template-args>
     if (look() == 'I') {
-      Node *TA = parseTemplateArgs(State != nullptr);
+      Node *TA = getDerived().parseTemplateArgs(State != nullptr);
       if (TA == nullptr || SoFar == nullptr)
         return nullptr;
       SoFar = make<NameWithTemplateArgs>(SoFar, TA);
@@ -2902,7 +2918,7 @@ template<typename Alloc> Node *Db<Alloc>::parseNestedName(NameState *State) {
 
     //          ::= <decltype>
     if (look() == 'D' && (look(1) == 't' || look(1) == 'T')) {
-      if (!PushComponent(parseDecltype()))
+      if (!PushComponent(getDerived().parseDecltype()))
         return nullptr;
       Subs.push_back(SoFar);
       continue;
@@ -2910,7 +2926,7 @@ template<typename Alloc> Node *Db<Alloc>::parseNestedName(NameState *State) {
 
     //          ::= <substitution>
     if (look() == 'S' && look(1) != 't') {
-      Node *S = parseSubstitution();
+      Node *S = getDerived().parseSubstitution();
       if (!PushComponent(S))
         return nullptr;
       if (SoFar != S)
@@ -2922,9 +2938,9 @@ template<typename Alloc> Node *Db<Alloc>::parseNestedName(NameState *State) {
     if (look() == 'C' || (look() == 'D' && look(1) != 'C')) {
       if (SoFar == nullptr)
         return nullptr;
-      if (!PushComponent(parseCtorDtorName(SoFar, State)))
+      if (!PushComponent(getDerived().parseCtorDtorName(SoFar, State)))
         return nullptr;
-      SoFar = parseAbiTags(SoFar);
+      SoFar = getDerived().parseAbiTags(SoFar);
       if (SoFar == nullptr)
         return nullptr;
       Subs.push_back(SoFar);
@@ -2932,7 +2948,7 @@ template<typename Alloc> Node *Db<Alloc>::parseNestedName(NameState *State) {
     }
 
     //          ::= <prefix> <unqualified-name>
-    if (!PushComponent(parseUnqualifiedName(State)))
+    if (!PushComponent(getDerived().parseUnqualifiedName(State)))
       return nullptr;
     Subs.push_back(SoFar);
   }
@@ -2945,12 +2961,13 @@ template<typename Alloc> Node *Db<Alloc>::parseNestedName(NameState *State) {
 }
 
 // <simple-id> ::= <source-name> [ <template-args> ]
-template<typename Alloc> Node *Db<Alloc>::parseSimpleId() {
-  Node *SN = parseSourceName(/*NameState=*/nullptr);
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseSimpleId() {
+  Node *SN = getDerived().parseSourceName(/*NameState=*/nullptr);
   if (SN == nullptr)
     return nullptr;
   if (look() == 'I') {
-    Node *TA = parseTemplateArgs();
+    Node *TA = getDerived().parseTemplateArgs();
     if (TA == nullptr)
       return nullptr;
     return make<NameWithTemplateArgs>(SN, TA);
@@ -2960,12 +2977,13 @@ template<typename Alloc> Node *Db<Alloc>::parseSimpleId() {
 
 // <destructor-name> ::= <unresolved-type>  # e.g., ~T or ~decltype(f())
 //                   ::= <simple-id>        # e.g., ~A<2*N>
-template<typename Alloc> Node *Db<Alloc>::parseDestructorName() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseDestructorName() {
   Node *Result;
   if (std::isdigit(look()))
-    Result = parseSimpleId();
+    Result = getDerived().parseSimpleId();
   else
-    Result = parseUnresolvedType();
+    Result = getDerived().parseUnresolvedType();
   if (Result == nullptr)
     return nullptr;
   return make<DtorName>(Result);
@@ -2974,22 +2992,23 @@ template<typename Alloc> Node *Db<Alloc>::parseDestructorName() {
 // <unresolved-type> ::= <template-param>
 //                   ::= <decltype>
 //                   ::= <substitution>
-template<typename Alloc> Node *Db<Alloc>::parseUnresolvedType() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseUnresolvedType() {
   if (look() == 'T') {
-    Node *TP = parseTemplateParam();
+    Node *TP = getDerived().parseTemplateParam();
     if (TP == nullptr)
       return nullptr;
     Subs.push_back(TP);
     return TP;
   }
   if (look() == 'D') {
-    Node *DT = parseDecltype();
+    Node *DT = getDerived().parseDecltype();
     if (DT == nullptr)
       return nullptr;
     Subs.push_back(DT);
     return DT;
   }
-  return parseSubstitution();
+  return getDerived().parseSubstitution();
 }
 
 // <base-unresolved-name> ::= <simple-id>                                # unresolved name
@@ -2999,20 +3018,21 @@ template<typename Alloc> Node *Db<Alloc>::parseUnresolvedType() {
 //                        ::= on <operator-name> <template-args>         # unresolved operator template-id
 //                        ::= dn <destructor-name>                       # destructor or pseudo-destructor;
 //                                                                         # e.g. ~X or ~X<N-1>
-template<typename Alloc> Node *Db<Alloc>::parseBaseUnresolvedName() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseBaseUnresolvedName() {
   if (std::isdigit(look()))
-    return parseSimpleId();
+    return getDerived().parseSimpleId();
 
   if (consumeIf("dn"))
-    return parseDestructorName();
+    return getDerived().parseDestructorName();
 
   consumeIf("on");
 
-  Node *Oper = parseOperatorName(/*NameState=*/nullptr);
+  Node *Oper = getDerived().parseOperatorName(/*NameState=*/nullptr);
   if (Oper == nullptr)
     return nullptr;
   if (look() == 'I') {
-    Node *TA = parseTemplateArgs();
+    Node *TA = getDerived().parseTemplateArgs();
     if (TA == nullptr)
       return nullptr;
     return make<NameWithTemplateArgs>(Oper, TA);
@@ -3031,18 +3051,19 @@ template<typename Alloc> Node *Db<Alloc>::parseBaseUnresolvedName() {
 //  (ignored)        ::= srN <unresolved-type>  <unresolved-qualifier-level>+ E <base-unresolved-name>
 //
 // <unresolved-qualifier-level> ::= <simple-id>
-template<typename Alloc> Node *Db<Alloc>::parseUnresolvedName() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseUnresolvedName() {
   Node *SoFar = nullptr;
 
   // srN <unresolved-type> [<template-args>] <unresolved-qualifier-level>* E <base-unresolved-name>
   // srN <unresolved-type>                   <unresolved-qualifier-level>+ E <base-unresolved-name>
   if (consumeIf("srN")) {
-    SoFar = parseUnresolvedType();
+    SoFar = getDerived().parseUnresolvedType();
     if (SoFar == nullptr)
       return nullptr;
 
     if (look() == 'I') {
-      Node *TA = parseTemplateArgs();
+      Node *TA = getDerived().parseTemplateArgs();
       if (TA == nullptr)
         return nullptr;
       SoFar = make<NameWithTemplateArgs>(SoFar, TA);
@@ -3051,7 +3072,7 @@ template<typename Alloc> Node *Db<Alloc>::parseUnresolvedName() {
     }
 
     while (!consumeIf('E')) {
-      Node *Qual = parseSimpleId();
+      Node *Qual = getDerived().parseSimpleId();
       if (Qual == nullptr)
         return nullptr;
       SoFar = make<QualifiedName>(SoFar, Qual);
@@ -3059,7 +3080,7 @@ template<typename Alloc> Node *Db<Alloc>::parseUnresolvedName() {
         return nullptr;
     }
 
-    Node *Base = parseBaseUnresolvedName();
+    Node *Base = getDerived().parseBaseUnresolvedName();
     if (Base == nullptr)
       return nullptr;
     return make<QualifiedName>(SoFar, Base);
@@ -3069,7 +3090,7 @@ template<typename Alloc> Node *Db<Alloc>::parseUnresolvedName() {
 
   // [gs] <base-unresolved-name>                     # x or (with "gs") ::x
   if (!consumeIf("sr")) {
-    SoFar = parseBaseUnresolvedName();
+    SoFar = getDerived().parseBaseUnresolvedName();
     if (SoFar == nullptr)
       return nullptr;
     if (Global)
@@ -3080,7 +3101,7 @@ template<typename Alloc> Node *Db<Alloc>::parseUnresolvedName() {
   // [gs] sr <unresolved-qualifier-level>+ E   <base-unresolved-name>
   if (std::isdigit(look())) {
     do {
-      Node *Qual = parseSimpleId();
+      Node *Qual = getDerived().parseSimpleId();
       if (Qual == nullptr)
         return nullptr;
       if (SoFar)
@@ -3096,12 +3117,12 @@ template<typename Alloc> Node *Db<Alloc>::parseUnresolvedName() {
   //      sr <unresolved-type>                 <base-unresolved-name>
   //      sr <unresolved-type> <template-args> <base-unresolved-name>
   else {
-    SoFar = parseUnresolvedType();
+    SoFar = getDerived().parseUnresolvedType();
     if (SoFar == nullptr)
       return nullptr;
 
     if (look() == 'I') {
-      Node *TA = parseTemplateArgs();
+      Node *TA = getDerived().parseTemplateArgs();
       if (TA == nullptr)
         return nullptr;
       SoFar = make<NameWithTemplateArgs>(SoFar, TA);
@@ -3112,7 +3133,7 @@ template<typename Alloc> Node *Db<Alloc>::parseUnresolvedName() {
 
   assert(SoFar != nullptr);
 
-  Node *Base = parseBaseUnresolvedName();
+  Node *Base = getDerived().parseBaseUnresolvedName();
   if (Base == nullptr)
     return nullptr;
   return make<QualifiedName>(SoFar, Base);
@@ -3120,7 +3141,8 @@ template<typename Alloc> Node *Db<Alloc>::parseUnresolvedName() {
 
 // <abi-tags> ::= <abi-tag> [<abi-tags>]
 // <abi-tag> ::= B <source-name>
-template<typename Alloc> Node *Db<Alloc>::parseAbiTags(Node *N) {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseAbiTags(Node *N) {
   while (consumeIf('B')) {
     StringView SN = parseBareSourceName();
     if (SN.empty())
@@ -3133,8 +3155,9 @@ template<typename Alloc> Node *Db<Alloc>::parseAbiTags(Node *N) {
 }
 
 // <number> ::= [n] <non-negative decimal integer>
-template<typename Alloc>
-StringView Db<Alloc>::parseNumber(bool AllowNegative) {
+template <typename Alloc, typename Derived>
+StringView
+AbstractManglingParser<Alloc, Derived>::parseNumber(bool AllowNegative) {
   const char *Tmp = First;
   if (AllowNegative)
     consumeIf('n');
@@ -3146,7 +3169,8 @@ StringView Db<Alloc>::parseNumber(bool AllowNegative) {
 }
 
 // <positive length number> ::= [0-9]*
-template<typename Alloc> bool Db<Alloc>::parsePositiveInteger(size_t *Out) {
+template <typename Alloc, typename Derived>
+bool AbstractManglingParser<Alloc, Derived>::parsePositiveInteger(size_t *Out) {
   *Out = 0;
   if (look() < '0' || look() > '9')
     return true;
@@ -3157,7 +3181,8 @@ template<typename Alloc> bool Db<Alloc>::parsePositiveInteger(size_t *Out) {
   return false;
 }
 
-template<typename Alloc> StringView Db<Alloc>::parseBareSourceName() {
+template <typename Alloc, typename Derived>
+StringView AbstractManglingParser<Alloc, Derived>::parseBareSourceName() {
   size_t Int = 0;
   if (parsePositiveInteger(&Int) || numLeft() < Int)
     return StringView();
@@ -3174,7 +3199,8 @@ template<typename Alloc> StringView Db<Alloc>::parseBareSourceName() {
 //
 // <ref-qualifier> ::= R                   # & ref-qualifier
 // <ref-qualifier> ::= O                   # && ref-qualifier
-template<typename Alloc> Node *Db<Alloc>::parseFunctionType() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseFunctionType() {
   Qualifiers CVQuals = parseCVQualifiers();
 
   Node *ExceptionSpec = nullptr;
@@ -3183,7 +3209,7 @@ template<typename Alloc> Node *Db<Alloc>::parseFunctionType() {
     if (!ExceptionSpec)
       return nullptr;
   } else if (consumeIf("DO")) {
-    Node *E = parseExpr();
+    Node *E = getDerived().parseExpr();
     if (E == nullptr || !consumeIf('E'))
       return nullptr;
     ExceptionSpec = make<NoexceptSpec>(E);
@@ -3192,7 +3218,7 @@ template<typename Alloc> Node *Db<Alloc>::parseFunctionType() {
   } else if (consumeIf("Dw")) {
     size_t SpecsBegin = Names.size();
     while (!consumeIf('E')) {
-      Node *T = parseType();
+      Node *T = getDerived().parseType();
       if (T == nullptr)
         return nullptr;
       Names.push_back(T);
@@ -3208,7 +3234,7 @@ template<typename Alloc> Node *Db<Alloc>::parseFunctionType() {
   if (!consumeIf('F'))
     return nullptr;
   consumeIf('Y'); // extern "C"
-  Node *ReturnType = parseType();
+  Node *ReturnType = getDerived().parseType();
   if (ReturnType == nullptr)
     return nullptr;
 
@@ -3227,7 +3253,7 @@ template<typename Alloc> Node *Db<Alloc>::parseFunctionType() {
       ReferenceQualifier = FrefQualRValue;
       break;
     }
-    Node *T = parseType();
+    Node *T = getDerived().parseType();
     if (T == nullptr)
       return nullptr;
     Names.push_back(T);
@@ -3243,7 +3269,8 @@ template<typename Alloc> Node *Db<Alloc>::parseFunctionType() {
 //                         ::= Dv [<dimension expression>] _ <element type>
 // <extended element type> ::= <element type>
 //                         ::= p # AltiVec vector pixel
-template<typename Alloc> Node *Db<Alloc>::parseVectorType() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseVectorType() {
   if (!consumeIf("Dv"))
     return nullptr;
   if (look() >= '1' && look() <= '9') {
@@ -3252,24 +3279,24 @@ template<typename Alloc> Node *Db<Alloc>::parseVectorType() {
       return nullptr;
     if (consumeIf('p'))
       return make<PixelVectorType>(DimensionNumber);
-    Node *ElemType = parseType();
+    Node *ElemType = getDerived().parseType();
     if (ElemType == nullptr)
       return nullptr;
     return make<VectorType>(ElemType, DimensionNumber);
   }
 
   if (!consumeIf('_')) {
-    Node *DimExpr = parseExpr();
+    Node *DimExpr = getDerived().parseExpr();
     if (!DimExpr)
       return nullptr;
     if (!consumeIf('_'))
       return nullptr;
-    Node *ElemType = parseType();
+    Node *ElemType = getDerived().parseType();
     if (!ElemType)
       return nullptr;
     return make<VectorType>(ElemType, DimExpr);
   }
-  Node *ElemType = parseType();
+  Node *ElemType = getDerived().parseType();
   if (!ElemType)
     return nullptr;
   return make<VectorType>(ElemType, StringView());
@@ -3277,12 +3304,13 @@ template<typename Alloc> Node *Db<Alloc>::parseVectorType() {
 
 // <decltype>  ::= Dt <expression> E  # decltype of an id-expression or class member access (C++0x)
 //             ::= DT <expression> E  # decltype of an expression (C++0x)
-template<typename Alloc> Node *Db<Alloc>::parseDecltype() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseDecltype() {
   if (!consumeIf('D'))
     return nullptr;
   if (!consumeIf('t') && !consumeIf('T'))
     return nullptr;
-  Node *E = parseExpr();
+  Node *E = getDerived().parseExpr();
   if (E == nullptr)
     return nullptr;
   if (!consumeIf('E'))
@@ -3292,7 +3320,8 @@ template<typename Alloc> Node *Db<Alloc>::parseDecltype() {
 
 // <array-type> ::= A <positive dimension number> _ <element type>
 //              ::= A [<dimension expression>] _ <element type>
-template<typename Alloc> Node *Db<Alloc>::parseArrayType() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseArrayType() {
   if (!consumeIf('A'))
     return nullptr;
 
@@ -3303,7 +3332,7 @@ template<typename Alloc> Node *Db<Alloc>::parseArrayType() {
     if (!consumeIf('_'))
       return nullptr;
   } else if (!consumeIf('_')) {
-    Node *DimExpr = parseExpr();
+    Node *DimExpr = getDerived().parseExpr();
     if (DimExpr == nullptr)
       return nullptr;
     if (!consumeIf('_'))
@@ -3311,20 +3340,21 @@ template<typename Alloc> Node *Db<Alloc>::parseArrayType() {
     Dimension = DimExpr;
   }
 
-  Node *Ty = parseType();
+  Node *Ty = getDerived().parseType();
   if (Ty == nullptr)
     return nullptr;
   return make<ArrayType>(Ty, Dimension);
 }
 
 // <pointer-to-member-type> ::= M <class type> <member type>
-template<typename Alloc> Node *Db<Alloc>::parsePointerToMemberType() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parsePointerToMemberType() {
   if (!consumeIf('M'))
     return nullptr;
-  Node *ClassType = parseType();
+  Node *ClassType = getDerived().parseType();
   if (ClassType == nullptr)
     return nullptr;
-  Node *MemberType = parseType();
+  Node *MemberType = getDerived().parseType();
   if (MemberType == nullptr)
     return nullptr;
   return make<PointerToMemberType>(ClassType, MemberType);
@@ -3334,7 +3364,8 @@ template<typename Alloc> Node *Db<Alloc>::parsePointerToMemberType() {
 //                   ::= Ts <name>  # dependent elaborated type specifier using 'struct' or 'class'
 //                   ::= Tu <name>  # dependent elaborated type specifier using 'union'
 //                   ::= Te <name>  # dependent elaborated type specifier using 'enum'
-template<typename Alloc> Node *Db<Alloc>::parseClassEnumType() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseClassEnumType() {
   StringView ElabSpef;
   if (consumeIf("Ts"))
     ElabSpef = "struct";
@@ -3343,7 +3374,7 @@ template<typename Alloc> Node *Db<Alloc>::parseClassEnumType() {
   else if (consumeIf("Te"))
     ElabSpef = "enum";
 
-  Node *Name = parseName();
+  Node *Name = getDerived().parseName();
   if (Name == nullptr)
     return nullptr;
 
@@ -3356,7 +3387,8 @@ template<typename Alloc> Node *Db<Alloc>::parseClassEnumType() {
 // <qualified-type>     ::= <qualifiers> <type>
 // <qualifiers> ::= <extended-qualifier>* <CV-qualifiers>
 // <extended-qualifier> ::= U <source-name> [<template-args>] # vendor extended type qualifier
-template<typename Alloc> Node *Db<Alloc>::parseQualifiedType() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseQualifiedType() {
   if (consumeIf('U')) {
     StringView Qual = parseBareSourceName();
     if (Qual.empty())
@@ -3375,20 +3407,20 @@ template<typename Alloc> Node *Db<Alloc>::parseQualifiedType() {
       }
       if (Proto.empty())
         return nullptr;
-      Node *Child = parseQualifiedType();
+      Node *Child = getDerived().parseQualifiedType();
       if (Child == nullptr)
         return nullptr;
       return make<ObjCProtoName>(Child, Proto);
     }
 
-    Node *Child = parseQualifiedType();
+    Node *Child = getDerived().parseQualifiedType();
     if (Child == nullptr)
       return nullptr;
     return make<VendorExtQualType>(Child, Qual);
   }
 
   Qualifiers Quals = parseCVQualifiers();
-  Node *Ty = parseType();
+  Node *Ty = getDerived().parseType();
   if (Ty == nullptr)
     return nullptr;
   if (Quals != QualNone)
@@ -3416,7 +3448,8 @@ template<typename Alloc> Node *Db<Alloc>::parseQualifiedType() {
 //
 // <objc-name> ::= <k0 number> objcproto <k1 number> <identifier>  # k0 = 9 + <number of digits in k1> + k1
 // <objc-type> ::= <source-name>  # PU<11+>objcproto 11objc_object<source-name> 11objc_object -> id<source-name>
-template<typename Alloc> Node *Db<Alloc>::parseType() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseType() {
   Node *Result = nullptr;
 
   if (TypeCallback != nullptr)
@@ -3436,13 +3469,13 @@ template<typename Alloc> Node *Db<Alloc>::parseType() {
         (look(AfterQuals) == 'D' &&
          (look(AfterQuals + 1) == 'o' || look(AfterQuals + 1) == 'O' ||
           look(AfterQuals + 1) == 'w' || look(AfterQuals + 1) == 'x'))) {
-      Result = parseFunctionType();
+      Result = getDerived().parseFunctionType();
       break;
     }
     LLVM_FALLTHROUGH;
   }
   case 'U': {
-    Result = parseQualifiedType();
+    Result = getDerived().parseQualifiedType();
     break;
   }
   // <builtin-type> ::= v    # void
@@ -3580,18 +3613,18 @@ template<typename Alloc> Node *Db<Alloc>::parseType() {
     //             ::= <decltype>
     case 't':
     case 'T': {
-      Result = parseDecltype();
+      Result = getDerived().parseDecltype();
       break;
     }
     // extension   ::= <vector-type> # <vector-type> starts with Dv
     case 'v': {
-      Result = parseVectorType();
+      Result = getDerived().parseVectorType();
       break;
     }
     //           ::= Dp <type>       # pack expansion (C++0x)
     case 'p': {
       First += 2;
-      Node *Child = parseType();
+      Node *Child = getDerived().parseType();
       if (!Child)
         return nullptr;
       Result = make<ParameterPackExpansion>(Child);
@@ -3603,34 +3636,34 @@ template<typename Alloc> Node *Db<Alloc>::parseType() {
     case 'w':
     // Transaction safe function type.
     case 'x':
-      Result = parseFunctionType();
+      Result = getDerived().parseFunctionType();
       break;
     }
     break;
   //             ::= <function-type>
   case 'F': {
-    Result = parseFunctionType();
+    Result = getDerived().parseFunctionType();
     break;
   }
   //             ::= <array-type>
   case 'A': {
-    Result = parseArrayType();
+    Result = getDerived().parseArrayType();
     break;
   }
   //             ::= <pointer-to-member-type>
   case 'M': {
-    Result = parsePointerToMemberType();
+    Result = getDerived().parsePointerToMemberType();
     break;
   }
   //             ::= <template-param>
   case 'T': {
     // This could be an elaborate type specifier on a <class-enum-type>.
     if (look(1) == 's' || look(1) == 'u' || look(1) == 'e') {
-      Result = parseClassEnumType();
+      Result = getDerived().parseClassEnumType();
       break;
     }
 
-    Result = parseTemplateParam();
+    Result = getDerived().parseTemplateParam();
     if (Result == nullptr)
       return nullptr;
 
@@ -3645,7 +3678,7 @@ template<typename Alloc> Node *Db<Alloc>::parseType() {
     // parse them, take the second production.
 
     if (TryToParseTemplateArgs && look() == 'I') {
-      Node *TA = parseTemplateArgs();
+      Node *TA = getDerived().parseTemplateArgs();
       if (TA == nullptr)
         return nullptr;
       Result = make<NameWithTemplateArgs>(Result, TA);
@@ -3655,7 +3688,7 @@ template<typename Alloc> Node *Db<Alloc>::parseType() {
   //             ::= P <type>        # pointer
   case 'P': {
     ++First;
-    Node *Ptr = parseType();
+    Node *Ptr = getDerived().parseType();
     if (Ptr == nullptr)
       return nullptr;
     Result = make<PointerType>(Ptr);
@@ -3664,7 +3697,7 @@ template<typename Alloc> Node *Db<Alloc>::parseType() {
   //             ::= R <type>        # l-value reference
   case 'R': {
     ++First;
-    Node *Ref = parseType();
+    Node *Ref = getDerived().parseType();
     if (Ref == nullptr)
       return nullptr;
     Result = make<ReferenceType>(Ref, ReferenceKind::LValue);
@@ -3673,7 +3706,7 @@ template<typename Alloc> Node *Db<Alloc>::parseType() {
   //             ::= O <type>        # r-value reference (C++11)
   case 'O': {
     ++First;
-    Node *Ref = parseType();
+    Node *Ref = getDerived().parseType();
     if (Ref == nullptr)
       return nullptr;
     Result = make<ReferenceType>(Ref, ReferenceKind::RValue);
@@ -3682,7 +3715,7 @@ template<typename Alloc> Node *Db<Alloc>::parseType() {
   //             ::= C <type>        # complex pair (C99)
   case 'C': {
     ++First;
-    Node *P = parseType();
+    Node *P = getDerived().parseType();
     if (P == nullptr)
       return nullptr;
     Result = make<PostfixQualifiedType>(P, " complex");
@@ -3691,7 +3724,7 @@ template<typename Alloc> Node *Db<Alloc>::parseType() {
   //             ::= G <type>        # imaginary (C99)
   case 'G': {
     ++First;
-    Node *P = parseType();
+    Node *P = getDerived().parseType();
     if (P == nullptr)
       return P;
     Result = make<PostfixQualifiedType>(P, " imaginary");
@@ -3700,7 +3733,7 @@ template<typename Alloc> Node *Db<Alloc>::parseType() {
   //             ::= <substitution>  # See Compression below
   case 'S': {
     if (look(1) && look(1) != 't') {
-      Node *Sub = parseSubstitution();
+      Node *Sub = getDerived().parseSubstitution();
       if (Sub == nullptr)
         return nullptr;
 
@@ -3715,7 +3748,7 @@ template<typename Alloc> Node *Db<Alloc>::parseType() {
       // parse them, take the second production.
 
       if (TryToParseTemplateArgs && look() == 'I') {
-        Node *TA = parseTemplateArgs();
+        Node *TA = getDerived().parseTemplateArgs();
         if (TA == nullptr)
           return nullptr;
         Result = make<NameWithTemplateArgs>(Sub, TA);
@@ -3730,7 +3763,7 @@ template<typename Alloc> Node *Db<Alloc>::parseType() {
   }
   //        ::= <class-enum-type>
   default: {
-    Result = parseClassEnumType();
+    Result = getDerived().parseClassEnumType();
     break;
   }
   }
@@ -3743,24 +3776,28 @@ template<typename Alloc> Node *Db<Alloc>::parseType() {
   return Result;
 }
 
-template<typename Alloc> Node *Db<Alloc>::parsePrefixExpr(StringView Kind) {
-  Node *E = parseExpr();
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parsePrefixExpr(StringView Kind) {
+  Node *E = getDerived().parseExpr();
   if (E == nullptr)
     return nullptr;
   return make<PrefixExpr>(Kind, E);
 }
 
-template<typename Alloc> Node *Db<Alloc>::parseBinaryExpr(StringView Kind) {
-  Node *LHS = parseExpr();
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseBinaryExpr(StringView Kind) {
+  Node *LHS = getDerived().parseExpr();
   if (LHS == nullptr)
     return nullptr;
-  Node *RHS = parseExpr();
+  Node *RHS = getDerived().parseExpr();
   if (RHS == nullptr)
     return nullptr;
   return make<BinaryExpr>(LHS, Kind, RHS);
 }
 
-template<typename Alloc> Node *Db<Alloc>::parseIntegerLiteral(StringView Lit) {
+template <typename Derived, typename Alloc>
+Node *
+AbstractManglingParser<Derived, Alloc>::parseIntegerLiteral(StringView Lit) {
   StringView Tmp = parseNumber(true);
   if (!Tmp.empty() && consumeIf('E'))
     return make<IntegerLiteral>(Lit, Tmp);
@@ -3768,7 +3805,8 @@ template<typename Alloc> Node *Db<Alloc>::parseIntegerLiteral(StringView Lit) {
 }
 
 // <CV-Qualifiers> ::= [r] [V] [K]
-template<typename Alloc> Qualifiers Db<Alloc>::parseCVQualifiers() {
+template <typename Alloc, typename Derived>
+Qualifiers AbstractManglingParser<Alloc, Derived>::parseCVQualifiers() {
   Qualifiers CVR = QualNone;
   if (consumeIf('r'))
     CVR |= QualRestrict;
@@ -3783,7 +3821,8 @@ template<typename Alloc> Qualifiers Db<Alloc>::parseCVQualifiers() {
 //                  ::= fp <top-level CV-Qualifiers> <parameter-2 non-negative number> _   # L == 0, second and later parameters
 //                  ::= fL <L-1 non-negative number> p <top-level CV-Qualifiers> _         # L > 0, first parameter
 //                  ::= fL <L-1 non-negative number> p <top-level CV-Qualifiers> <parameter-2 non-negative number> _   # L > 0, second and later parameters
-template<typename Alloc> Node *Db<Alloc>::parseFunctionParam() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseFunctionParam() {
   if (consumeIf("fp")) {
     parseCVQualifiers();
     StringView Num = parseNumber();
@@ -3810,26 +3849,27 @@ template<typename Alloc> Node *Db<Alloc>::parseFunctionParam() {
 // [gs] na <expression>* _ <type> E                     # new[] (expr-list) type
 // [gs] na <expression>* _ <type> <initializer>         # new[] (expr-list) type (init)
 // <initializer> ::= pi <expression>* E                 # parenthesized initialization
-template<typename Alloc> Node *Db<Alloc>::parseNewExpr() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseNewExpr() {
   bool Global = consumeIf("gs");
   bool IsArray = look(1) == 'a';
   if (!consumeIf("nw") && !consumeIf("na"))
     return nullptr;
   size_t Exprs = Names.size();
   while (!consumeIf('_')) {
-    Node *Ex = parseExpr();
+    Node *Ex = getDerived().parseExpr();
     if (Ex == nullptr)
       return nullptr;
     Names.push_back(Ex);
   }
   NodeArray ExprList = popTrailingNodeArray(Exprs);
-  Node *Ty = parseType();
+  Node *Ty = getDerived().parseType();
   if (Ty == nullptr)
     return Ty;
   if (consumeIf("pi")) {
     size_t InitsBegin = Names.size();
     while (!consumeIf('E')) {
-      Node *Init = parseExpr();
+      Node *Init = getDerived().parseExpr();
       if (Init == nullptr)
         return Init;
       Names.push_back(Init);
@@ -3843,13 +3883,14 @@ template<typename Alloc> Node *Db<Alloc>::parseNewExpr() {
 
 // cv <type> <expression>                               # conversion with one argument
 // cv <type> _ <expression>* E                          # conversion with a different number of arguments
-template<typename Alloc> Node *Db<Alloc>::parseConversionExpr() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseConversionExpr() {
   if (!consumeIf("cv"))
     return nullptr;
   Node *Ty;
   {
     SwapAndRestore<bool> SaveTemp(TryToParseTemplateArgs, false);
-    Ty = parseType();
+    Ty = getDerived().parseType();
   }
 
   if (Ty == nullptr)
@@ -3858,7 +3899,7 @@ template<typename Alloc> Node *Db<Alloc>::parseConversionExpr() {
   if (consumeIf('_')) {
     size_t ExprsBegin = Names.size();
     while (!consumeIf('E')) {
-      Node *E = parseExpr();
+      Node *E = getDerived().parseExpr();
       if (E == nullptr)
         return E;
       Names.push_back(E);
@@ -3867,7 +3908,7 @@ template<typename Alloc> Node *Db<Alloc>::parseConversionExpr() {
     return make<ConversionExpr>(Ty, Exprs);
   }
 
-  Node *E[1] = {parseExpr()};
+  Node *E[1] = {getDerived().parseExpr()};
   if (E[0] == nullptr)
     return nullptr;
   return make<ConversionExpr>(Ty, makeNodeArray(E, E + 1));
@@ -3879,13 +3920,14 @@ template<typename Alloc> Node *Db<Alloc>::parseConversionExpr() {
 //                ::= L <nullptr type> E                                 # nullptr literal (i.e., "LDnE")
 // FIXME:         ::= L <type> <real-part float> _ <imag-part float> E   # complex floating point literal (C 2000)
 //                ::= L <mangled-name> E                                 # external name
-template<typename Alloc> Node *Db<Alloc>::parseExprPrimary() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseExprPrimary() {
   if (!consumeIf('L'))
     return nullptr;
   switch (look()) {
   case 'w':
     ++First;
-    return parseIntegerLiteral("wchar_t");
+    return getDerived().parseIntegerLiteral("wchar_t");
   case 'b':
     if (consumeIf("b0E"))
       return make<BoolExpr>(0);
@@ -3894,55 +3936,55 @@ template<typename Alloc> Node *Db<Alloc>::parseExprPrimary() {
     return nullptr;
   case 'c':
     ++First;
-    return parseIntegerLiteral("char");
+    return getDerived().parseIntegerLiteral("char");
   case 'a':
     ++First;
-    return parseIntegerLiteral("signed char");
+    return getDerived().parseIntegerLiteral("signed char");
   case 'h':
     ++First;
-    return parseIntegerLiteral("unsigned char");
+    return getDerived().parseIntegerLiteral("unsigned char");
   case 's':
     ++First;
-    return parseIntegerLiteral("short");
+    return getDerived().parseIntegerLiteral("short");
   case 't':
     ++First;
-    return parseIntegerLiteral("unsigned short");
+    return getDerived().parseIntegerLiteral("unsigned short");
   case 'i':
     ++First;
-    return parseIntegerLiteral("");
+    return getDerived().parseIntegerLiteral("");
   case 'j':
     ++First;
-    return parseIntegerLiteral("u");
+    return getDerived().parseIntegerLiteral("u");
   case 'l':
     ++First;
-    return parseIntegerLiteral("l");
+    return getDerived().parseIntegerLiteral("l");
   case 'm':
     ++First;
-    return parseIntegerLiteral("ul");
+    return getDerived().parseIntegerLiteral("ul");
   case 'x':
     ++First;
-    return parseIntegerLiteral("ll");
+    return getDerived().parseIntegerLiteral("ll");
   case 'y':
     ++First;
-    return parseIntegerLiteral("ull");
+    return getDerived().parseIntegerLiteral("ull");
   case 'n':
     ++First;
-    return parseIntegerLiteral("__int128");
+    return getDerived().parseIntegerLiteral("__int128");
   case 'o':
     ++First;
-    return parseIntegerLiteral("unsigned __int128");
+    return getDerived().parseIntegerLiteral("unsigned __int128");
   case 'f':
     ++First;
-    return parseFloatingLiteral<float>();
+    return getDerived().template parseFloatingLiteral<float>();
   case 'd':
     ++First;
-    return parseFloatingLiteral<double>();
+    return getDerived().template parseFloatingLiteral<double>();
   case 'e':
     ++First;
-    return parseFloatingLiteral<long double>();
+    return getDerived().template parseFloatingLiteral<long double>();
   case '_':
     if (consumeIf("_Z")) {
-      Node *R = parseEncoding();
+      Node *R = getDerived().parseEncoding();
       if (R != nullptr && consumeIf('E'))
         return R;
     }
@@ -3953,7 +3995,7 @@ template<typename Alloc> Node *Db<Alloc>::parseExprPrimary() {
     return nullptr;
   default: {
     // might be named type
-    Node *T = parseType();
+    Node *T = getDerived().parseType();
     if (T == nullptr)
       return nullptr;
     StringView N = parseNumber();
@@ -3973,45 +4015,46 @@ template<typename Alloc> Node *Db<Alloc>::parseExprPrimary() {
 //                     ::= di <field source-name> <braced-expression>    # .name = expr
 //                     ::= dx <index expression> <braced-expression>     # [expr] = expr
 //                     ::= dX <range begin expression> <range end expression> <braced-expression>
-template<typename Alloc> Node *Db<Alloc>::parseBracedExpr() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseBracedExpr() {
   if (look() == 'd') {
     switch (look(1)) {
     case 'i': {
       First += 2;
-      Node *Field = parseSourceName(/*NameState=*/nullptr);
+      Node *Field = getDerived().parseSourceName(/*NameState=*/nullptr);
       if (Field == nullptr)
         return nullptr;
-      Node *Init = parseBracedExpr();
+      Node *Init = getDerived().parseBracedExpr();
       if (Init == nullptr)
         return nullptr;
       return make<BracedExpr>(Field, Init, /*isArray=*/false);
     }
     case 'x': {
       First += 2;
-      Node *Index = parseExpr();
+      Node *Index = getDerived().parseExpr();
       if (Index == nullptr)
         return nullptr;
-      Node *Init = parseBracedExpr();
+      Node *Init = getDerived().parseBracedExpr();
       if (Init == nullptr)
         return nullptr;
       return make<BracedExpr>(Index, Init, /*isArray=*/true);
     }
     case 'X': {
       First += 2;
-      Node *RangeBegin = parseExpr();
+      Node *RangeBegin = getDerived().parseExpr();
       if (RangeBegin == nullptr)
         return nullptr;
-      Node *RangeEnd = parseExpr();
+      Node *RangeEnd = getDerived().parseExpr();
       if (RangeEnd == nullptr)
         return nullptr;
-      Node *Init = parseBracedExpr();
+      Node *Init = getDerived().parseBracedExpr();
       if (Init == nullptr)
         return nullptr;
       return make<BracedRangeExpr>(RangeBegin, RangeEnd, Init);
     }
     }
   }
-  return parseExpr();
+  return getDerived().parseExpr();
 }
 
 // (not yet in the spec)
@@ -4019,7 +4062,8 @@ template<typename Alloc> Node *Db<Alloc>::parseBracedExpr() {
 //             ::= fR <binary-operator-name> <expression> <expression>
 //             ::= fl <binary-operator-name> <expression>
 //             ::= fr <binary-operator-name> <expression>
-template<typename Alloc> Node *Db<Alloc>::parseFoldExpr() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseFoldExpr() {
   if (!consumeIf('f'))
     return nullptr;
 
@@ -4069,11 +4113,11 @@ template<typename Alloc> Node *Db<Alloc>::parseFoldExpr() {
   else if (consumeIf("rS")) OperatorName = ">>=";
   else return nullptr;
 
-  Node *Pack = parseExpr(), *Init = nullptr;
+  Node *Pack = getDerived().parseExpr(), *Init = nullptr;
   if (Pack == nullptr)
     return nullptr;
   if (HasInitializer) {
-    Init = parseExpr();
+    Init = getDerived().parseExpr();
     if (Init == nullptr)
       return nullptr;
   }
@@ -4128,49 +4172,50 @@ template<typename Alloc> Node *Db<Alloc>::parseFoldExpr() {
 //              ::= fl <binary-operator-name> <expression>
 //              ::= fr <binary-operator-name> <expression>
 //              ::= <expr-primary>
-template<typename Alloc> Node *Db<Alloc>::parseExpr() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseExpr() {
   bool Global = consumeIf("gs");
   if (numLeft() < 2)
     return nullptr;
 
   switch (*First) {
   case 'L':
-    return parseExprPrimary();
+    return getDerived().parseExprPrimary();
   case 'T':
-    return parseTemplateParam();
+    return getDerived().parseTemplateParam();
   case 'f': {
     // Disambiguate a fold expression from a <function-param>.
     if (look(1) == 'p' || (look(1) == 'L' && std::isdigit(look(2))))
-      return parseFunctionParam();
-    return parseFoldExpr();
+      return getDerived().parseFunctionParam();
+    return getDerived().parseFoldExpr();
   }
   case 'a':
     switch (First[1]) {
     case 'a':
       First += 2;
-      return parseBinaryExpr("&&");
+      return getDerived().parseBinaryExpr("&&");
     case 'd':
       First += 2;
-      return parsePrefixExpr("&");
+      return getDerived().parsePrefixExpr("&");
     case 'n':
       First += 2;
-      return parseBinaryExpr("&");
+      return getDerived().parseBinaryExpr("&");
     case 'N':
       First += 2;
-      return parseBinaryExpr("&=");
+      return getDerived().parseBinaryExpr("&=");
     case 'S':
       First += 2;
-      return parseBinaryExpr("=");
+      return getDerived().parseBinaryExpr("=");
     case 't': {
       First += 2;
-      Node *Ty = parseType();
+      Node *Ty = getDerived().parseType();
       if (Ty == nullptr)
         return nullptr;
       return make<EnclosingExpr>("alignof (", Ty, ")");
     }
     case 'z': {
       First += 2;
-      Node *Ty = parseExpr();
+      Node *Ty = getDerived().parseExpr();
       if (Ty == nullptr)
         return nullptr;
       return make<EnclosingExpr>("alignof (", Ty, ")");
@@ -4182,10 +4227,10 @@ template<typename Alloc> Node *Db<Alloc>::parseExpr() {
     // cc <type> <expression>                               # const_cast<type>(expression)
     case 'c': {
       First += 2;
-      Node *Ty = parseType();
+      Node *Ty = getDerived().parseType();
       if (Ty == nullptr)
         return Ty;
-      Node *Ex = parseExpr();
+      Node *Ex = getDerived().parseExpr();
       if (Ex == nullptr)
         return Ex;
       return make<CastExpr>("const_cast", Ty, Ex);
@@ -4193,12 +4238,12 @@ template<typename Alloc> Node *Db<Alloc>::parseExpr() {
     // cl <expression>+ E                                   # call
     case 'l': {
       First += 2;
-      Node *Callee = parseExpr();
+      Node *Callee = getDerived().parseExpr();
       if (Callee == nullptr)
         return Callee;
       size_t ExprsBegin = Names.size();
       while (!consumeIf('E')) {
-        Node *E = parseExpr();
+        Node *E = getDerived().parseExpr();
         if (E == nullptr)
           return E;
         Names.push_back(E);
@@ -4207,104 +4252,104 @@ template<typename Alloc> Node *Db<Alloc>::parseExpr() {
     }
     case 'm':
       First += 2;
-      return parseBinaryExpr(",");
+      return getDerived().parseBinaryExpr(",");
     case 'o':
       First += 2;
-      return parsePrefixExpr("~");
+      return getDerived().parsePrefixExpr("~");
     case 'v':
-      return parseConversionExpr();
+      return getDerived().parseConversionExpr();
     }
     return nullptr;
   case 'd':
     switch (First[1]) {
     case 'a': {
       First += 2;
-      Node *Ex = parseExpr();
+      Node *Ex = getDerived().parseExpr();
       if (Ex == nullptr)
         return Ex;
       return make<DeleteExpr>(Ex, Global, /*is_array=*/true);
     }
     case 'c': {
       First += 2;
-      Node *T = parseType();
+      Node *T = getDerived().parseType();
       if (T == nullptr)
         return T;
-      Node *Ex = parseExpr();
+      Node *Ex = getDerived().parseExpr();
       if (Ex == nullptr)
         return Ex;
       return make<CastExpr>("dynamic_cast", T, Ex);
     }
     case 'e':
       First += 2;
-      return parsePrefixExpr("*");
+      return getDerived().parsePrefixExpr("*");
     case 'l': {
       First += 2;
-      Node *E = parseExpr();
+      Node *E = getDerived().parseExpr();
       if (E == nullptr)
         return E;
       return make<DeleteExpr>(E, Global, /*is_array=*/false);
     }
     case 'n':
-      return parseUnresolvedName();
+      return getDerived().parseUnresolvedName();
     case 's': {
       First += 2;
-      Node *LHS = parseExpr();
+      Node *LHS = getDerived().parseExpr();
       if (LHS == nullptr)
         return nullptr;
-      Node *RHS = parseExpr();
+      Node *RHS = getDerived().parseExpr();
       if (RHS == nullptr)
         return nullptr;
       return make<MemberExpr>(LHS, ".*", RHS);
     }
     case 't': {
       First += 2;
-      Node *LHS = parseExpr();
+      Node *LHS = getDerived().parseExpr();
       if (LHS == nullptr)
         return LHS;
-      Node *RHS = parseExpr();
+      Node *RHS = getDerived().parseExpr();
       if (RHS == nullptr)
         return nullptr;
       return make<MemberExpr>(LHS, ".", RHS);
     }
     case 'v':
       First += 2;
-      return parseBinaryExpr("/");
+      return getDerived().parseBinaryExpr("/");
     case 'V':
       First += 2;
-      return parseBinaryExpr("/=");
+      return getDerived().parseBinaryExpr("/=");
     }
     return nullptr;
   case 'e':
     switch (First[1]) {
     case 'o':
       First += 2;
-      return parseBinaryExpr("^");
+      return getDerived().parseBinaryExpr("^");
     case 'O':
       First += 2;
-      return parseBinaryExpr("^=");
+      return getDerived().parseBinaryExpr("^=");
     case 'q':
       First += 2;
-      return parseBinaryExpr("==");
+      return getDerived().parseBinaryExpr("==");
     }
     return nullptr;
   case 'g':
     switch (First[1]) {
     case 'e':
       First += 2;
-      return parseBinaryExpr(">=");
+      return getDerived().parseBinaryExpr(">=");
     case 't':
       First += 2;
-      return parseBinaryExpr(">");
+      return getDerived().parseBinaryExpr(">");
     }
     return nullptr;
   case 'i':
     switch (First[1]) {
     case 'x': {
       First += 2;
-      Node *Base = parseExpr();
+      Node *Base = getDerived().parseExpr();
       if (Base == nullptr)
         return nullptr;
-      Node *Index = parseExpr();
+      Node *Index = getDerived().parseExpr();
       if (Index == nullptr)
         return Index;
       return make<ArraySubscriptExpr>(Base, Index);
@@ -4313,7 +4358,7 @@ template<typename Alloc> Node *Db<Alloc>::parseExpr() {
       First += 2;
       size_t InitsBegin = Names.size();
       while (!consumeIf('E')) {
-        Node *E = parseBracedExpr();
+        Node *E = getDerived().parseBracedExpr();
         if (E == nullptr)
           return nullptr;
         Names.push_back(E);
@@ -4326,37 +4371,37 @@ template<typename Alloc> Node *Db<Alloc>::parseExpr() {
     switch (First[1]) {
     case 'e':
       First += 2;
-      return parseBinaryExpr("<=");
+      return getDerived().parseBinaryExpr("<=");
     case 's':
       First += 2;
-      return parseBinaryExpr("<<");
+      return getDerived().parseBinaryExpr("<<");
     case 'S':
       First += 2;
-      return parseBinaryExpr("<<=");
+      return getDerived().parseBinaryExpr("<<=");
     case 't':
       First += 2;
-      return parseBinaryExpr("<");
+      return getDerived().parseBinaryExpr("<");
     }
     return nullptr;
   case 'm':
     switch (First[1]) {
     case 'i':
       First += 2;
-      return parseBinaryExpr("-");
+      return getDerived().parseBinaryExpr("-");
     case 'I':
       First += 2;
-      return parseBinaryExpr("-=");
+      return getDerived().parseBinaryExpr("-=");
     case 'l':
       First += 2;
-      return parseBinaryExpr("*");
+      return getDerived().parseBinaryExpr("*");
     case 'L':
       First += 2;
-      return parseBinaryExpr("*=");
+      return getDerived().parseBinaryExpr("*=");
     case 'm':
       First += 2;
       if (consumeIf('_'))
-        return parsePrefixExpr("--");
-      Node *Ex = parseExpr();
+        return getDerived().parsePrefixExpr("--");
+      Node *Ex = getDerived().parseExpr();
       if (Ex == nullptr)
         return nullptr;
       return make<PostfixExpr>(Ex, "--");
@@ -4366,19 +4411,19 @@ template<typename Alloc> Node *Db<Alloc>::parseExpr() {
     switch (First[1]) {
     case 'a':
     case 'w':
-      return parseNewExpr();
+      return getDerived().parseNewExpr();
     case 'e':
       First += 2;
-      return parseBinaryExpr("!=");
+      return getDerived().parseBinaryExpr("!=");
     case 'g':
       First += 2;
-      return parsePrefixExpr("-");
+      return getDerived().parsePrefixExpr("-");
     case 't':
       First += 2;
-      return parsePrefixExpr("!");
+      return getDerived().parsePrefixExpr("!");
     case 'x':
       First += 2;
-      Node *Ex = parseExpr();
+      Node *Ex = getDerived().parseExpr();
       if (Ex == nullptr)
         return Ex;
       return make<EnclosingExpr>("noexcept (", Ex, ")");
@@ -4387,47 +4432,47 @@ template<typename Alloc> Node *Db<Alloc>::parseExpr() {
   case 'o':
     switch (First[1]) {
     case 'n':
-      return parseUnresolvedName();
+      return getDerived().parseUnresolvedName();
     case 'o':
       First += 2;
-      return parseBinaryExpr("||");
+      return getDerived().parseBinaryExpr("||");
     case 'r':
       First += 2;
-      return parseBinaryExpr("|");
+      return getDerived().parseBinaryExpr("|");
     case 'R':
       First += 2;
-      return parseBinaryExpr("|=");
+      return getDerived().parseBinaryExpr("|=");
     }
     return nullptr;
   case 'p':
     switch (First[1]) {
     case 'm':
       First += 2;
-      return parseBinaryExpr("->*");
+      return getDerived().parseBinaryExpr("->*");
     case 'l':
       First += 2;
-      return parseBinaryExpr("+");
+      return getDerived().parseBinaryExpr("+");
     case 'L':
       First += 2;
-      return parseBinaryExpr("+=");
+      return getDerived().parseBinaryExpr("+=");
     case 'p': {
       First += 2;
       if (consumeIf('_'))
-        return parsePrefixExpr("++");
-      Node *Ex = parseExpr();
+        return getDerived().parsePrefixExpr("++");
+      Node *Ex = getDerived().parseExpr();
       if (Ex == nullptr)
         return Ex;
       return make<PostfixExpr>(Ex, "++");
     }
     case 's':
       First += 2;
-      return parsePrefixExpr("+");
+      return getDerived().parsePrefixExpr("+");
     case 't': {
       First += 2;
-      Node *L = parseExpr();
+      Node *L = getDerived().parseExpr();
       if (L == nullptr)
         return nullptr;
-      Node *R = parseExpr();
+      Node *R = getDerived().parseExpr();
       if (R == nullptr)
         return nullptr;
       return make<MemberExpr>(L, "->", R);
@@ -4437,13 +4482,13 @@ template<typename Alloc> Node *Db<Alloc>::parseExpr() {
   case 'q':
     if (First[1] == 'u') {
       First += 2;
-      Node *Cond = parseExpr();
+      Node *Cond = getDerived().parseExpr();
       if (Cond == nullptr)
         return nullptr;
-      Node *LHS = parseExpr();
+      Node *LHS = getDerived().parseExpr();
       if (LHS == nullptr)
         return nullptr;
-      Node *RHS = parseExpr();
+      Node *RHS = getDerived().parseExpr();
       if (RHS == nullptr)
         return nullptr;
       return make<ConditionalExpr>(Cond, LHS, RHS);
@@ -4453,59 +4498,59 @@ template<typename Alloc> Node *Db<Alloc>::parseExpr() {
     switch (First[1]) {
     case 'c': {
       First += 2;
-      Node *T = parseType();
+      Node *T = getDerived().parseType();
       if (T == nullptr)
         return T;
-      Node *Ex = parseExpr();
+      Node *Ex = getDerived().parseExpr();
       if (Ex == nullptr)
         return Ex;
       return make<CastExpr>("reinterpret_cast", T, Ex);
     }
     case 'm':
       First += 2;
-      return parseBinaryExpr("%");
+      return getDerived().parseBinaryExpr("%");
     case 'M':
       First += 2;
-      return parseBinaryExpr("%=");
+      return getDerived().parseBinaryExpr("%=");
     case 's':
       First += 2;
-      return parseBinaryExpr(">>");
+      return getDerived().parseBinaryExpr(">>");
     case 'S':
       First += 2;
-      return parseBinaryExpr(">>=");
+      return getDerived().parseBinaryExpr(">>=");
     }
     return nullptr;
   case 's':
     switch (First[1]) {
     case 'c': {
       First += 2;
-      Node *T = parseType();
+      Node *T = getDerived().parseType();
       if (T == nullptr)
         return T;
-      Node *Ex = parseExpr();
+      Node *Ex = getDerived().parseExpr();
       if (Ex == nullptr)
         return Ex;
       return make<CastExpr>("static_cast", T, Ex);
     }
     case 'p': {
       First += 2;
-      Node *Child = parseExpr();
+      Node *Child = getDerived().parseExpr();
       if (Child == nullptr)
         return nullptr;
       return make<ParameterPackExpansion>(Child);
     }
     case 'r':
-      return parseUnresolvedName();
+      return getDerived().parseUnresolvedName();
     case 't': {
       First += 2;
-      Node *Ty = parseType();
+      Node *Ty = getDerived().parseType();
       if (Ty == nullptr)
         return Ty;
       return make<EnclosingExpr>("sizeof (", Ty, ")");
     }
     case 'z': {
       First += 2;
-      Node *Ex = parseExpr();
+      Node *Ex = getDerived().parseExpr();
       if (Ex == nullptr)
         return Ex;
       return make<EnclosingExpr>("sizeof (", Ex, ")");
@@ -4513,12 +4558,12 @@ template<typename Alloc> Node *Db<Alloc>::parseExpr() {
     case 'Z':
       First += 2;
       if (look() == 'T') {
-        Node *R = parseTemplateParam();
+        Node *R = getDerived().parseTemplateParam();
         if (R == nullptr)
           return nullptr;
         return make<SizeofParamPackExpr>(R);
       } else if (look() == 'f') {
-        Node *FP = parseFunctionParam();
+        Node *FP = getDerived().parseFunctionParam();
         if (FP == nullptr)
           return nullptr;
         return make<EnclosingExpr>("sizeof... (", FP, ")");
@@ -4528,7 +4573,7 @@ template<typename Alloc> Node *Db<Alloc>::parseExpr() {
       First += 2;
       size_t ArgsBegin = Names.size();
       while (!consumeIf('E')) {
-        Node *Arg = parseTemplateArg();
+        Node *Arg = getDerived().parseTemplateArg();
         if (Arg == nullptr)
           return nullptr;
         Names.push_back(Arg);
@@ -4544,26 +4589,26 @@ template<typename Alloc> Node *Db<Alloc>::parseExpr() {
     switch (First[1]) {
     case 'e': {
       First += 2;
-      Node *Ex = parseExpr();
+      Node *Ex = getDerived().parseExpr();
       if (Ex == nullptr)
         return Ex;
       return make<EnclosingExpr>("typeid (", Ex, ")");
     }
     case 'i': {
       First += 2;
-      Node *Ty = parseType();
+      Node *Ty = getDerived().parseType();
       if (Ty == nullptr)
         return Ty;
       return make<EnclosingExpr>("typeid (", Ty, ")");
     }
     case 'l': {
       First += 2;
-      Node *Ty = parseType();
+      Node *Ty = getDerived().parseType();
       if (Ty == nullptr)
         return nullptr;
       size_t InitsBegin = Names.size();
       while (!consumeIf('E')) {
-        Node *E = parseBracedExpr();
+        Node *E = getDerived().parseBracedExpr();
         if (E == nullptr)
           return nullptr;
         Names.push_back(E);
@@ -4575,7 +4620,7 @@ template<typename Alloc> Node *Db<Alloc>::parseExpr() {
       return make<NameType>("throw");
     case 'w': {
       First += 2;
-      Node *Ex = parseExpr();
+      Node *Ex = getDerived().parseExpr();
       if (Ex == nullptr)
         return nullptr;
       return make<ThrowExpr>(Ex);
@@ -4591,7 +4636,7 @@ template<typename Alloc> Node *Db<Alloc>::parseExpr() {
   case '7':
   case '8':
   case '9':
-    return parseUnresolvedName();
+    return getDerived().parseUnresolvedName();
   }
   return nullptr;
 }
@@ -4604,7 +4649,8 @@ template<typename Alloc> Node *Db<Alloc>::parseExpr() {
 //
 // <v-offset>  ::= <offset number> _ <virtual offset number>
 //               # virtual base override, with vcall offset
-template<typename Alloc> bool Db<Alloc>::parseCallOffset() {
+template <typename Alloc, typename Derived>
+bool AbstractManglingParser<Alloc, Derived>::parseCallOffset() {
   // Just scan through the call offset, we never add this information into the
   // output.
   if (consumeIf('h'))
@@ -4633,14 +4679,15 @@ template<typename Alloc> bool Db<Alloc>::parseCallOffset() {
 //                ::= GR <object name> <seq-id> _    # Subsequent temporaries
 //      extension ::= TC <first type> <number> _ <second type> # construction vtable for second-in-first
 //      extension ::= GR <object name> # reference temporary for object
-template<typename Alloc> Node *Db<Alloc>::parseSpecialName() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseSpecialName() {
   switch (look()) {
   case 'T':
     switch (look(1)) {
     // TV <type>    # virtual table
     case 'V': {
       First += 2;
-      Node *Ty = parseType();
+      Node *Ty = getDerived().parseType();
       if (Ty == nullptr)
         return nullptr;
       return make<SpecialName>("vtable for ", Ty);
@@ -4648,7 +4695,7 @@ template<typename Alloc> Node *Db<Alloc>::parseSpecialName() {
     // TT <type>    # VTT structure (construction vtable index)
     case 'T': {
       First += 2;
-      Node *Ty = parseType();
+      Node *Ty = getDerived().parseType();
       if (Ty == nullptr)
         return nullptr;
       return make<SpecialName>("VTT for ", Ty);
@@ -4656,7 +4703,7 @@ template<typename Alloc> Node *Db<Alloc>::parseSpecialName() {
     // TI <type>    # typeinfo structure
     case 'I': {
       First += 2;
-      Node *Ty = parseType();
+      Node *Ty = getDerived().parseType();
       if (Ty == nullptr)
         return nullptr;
       return make<SpecialName>("typeinfo for ", Ty);
@@ -4664,7 +4711,7 @@ template<typename Alloc> Node *Db<Alloc>::parseSpecialName() {
     // TS <type>    # typeinfo name (null-terminated byte string)
     case 'S': {
       First += 2;
-      Node *Ty = parseType();
+      Node *Ty = getDerived().parseType();
       if (Ty == nullptr)
         return nullptr;
       return make<SpecialName>("typeinfo name for ", Ty);
@@ -4674,7 +4721,7 @@ template<typename Alloc> Node *Db<Alloc>::parseSpecialName() {
       First += 2;
       if (parseCallOffset() || parseCallOffset())
         return nullptr;
-      Node *Encoding = parseEncoding();
+      Node *Encoding = getDerived().parseEncoding();
       if (Encoding == nullptr)
         return nullptr;
       return make<SpecialName>("covariant return thunk to ", Encoding);
@@ -4683,12 +4730,12 @@ template<typename Alloc> Node *Db<Alloc>::parseSpecialName() {
     //               # construction vtable for second-in-first
     case 'C': {
       First += 2;
-      Node *FirstType = parseType();
+      Node *FirstType = getDerived().parseType();
       if (FirstType == nullptr)
         return nullptr;
       if (parseNumber(true).empty() || !consumeIf('_'))
         return nullptr;
-      Node *SecondType = parseType();
+      Node *SecondType = getDerived().parseType();
       if (SecondType == nullptr)
         return nullptr;
       return make<CtorVtableSpecialName>(SecondType, FirstType);
@@ -4696,7 +4743,7 @@ template<typename Alloc> Node *Db<Alloc>::parseSpecialName() {
     // TW <object name> # Thread-local wrapper
     case 'W': {
       First += 2;
-      Node *Name = parseName();
+      Node *Name = getDerived().parseName();
       if (Name == nullptr)
         return nullptr;
       return make<SpecialName>("thread-local wrapper routine for ", Name);
@@ -4704,7 +4751,7 @@ template<typename Alloc> Node *Db<Alloc>::parseSpecialName() {
     // TH <object name> # Thread-local initialization
     case 'H': {
       First += 2;
-      Node *Name = parseName();
+      Node *Name = getDerived().parseName();
       if (Name == nullptr)
         return nullptr;
       return make<SpecialName>("thread-local initialization routine for ", Name);
@@ -4715,7 +4762,7 @@ template<typename Alloc> Node *Db<Alloc>::parseSpecialName() {
       bool IsVirt = look() == 'v';
       if (parseCallOffset())
         return nullptr;
-      Node *BaseEncoding = parseEncoding();
+      Node *BaseEncoding = getDerived().parseEncoding();
       if (BaseEncoding == nullptr)
         return nullptr;
       if (IsVirt)
@@ -4729,7 +4776,7 @@ template<typename Alloc> Node *Db<Alloc>::parseSpecialName() {
     // GV <object name> # Guard variable for one-time initialization
     case 'V': {
       First += 2;
-      Node *Name = parseName();
+      Node *Name = getDerived().parseName();
       if (Name == nullptr)
         return nullptr;
       return make<SpecialName>("guard variable for ", Name);
@@ -4739,7 +4786,7 @@ template<typename Alloc> Node *Db<Alloc>::parseSpecialName() {
     // GR <object name> <seq-id> _    # Subsequent temporaries
     case 'R': {
       First += 2;
-      Node *Name = parseName();
+      Node *Name = getDerived().parseName();
       if (Name == nullptr)
         return nullptr;
       size_t Count;
@@ -4756,9 +4803,10 @@ template<typename Alloc> Node *Db<Alloc>::parseSpecialName() {
 // <encoding> ::= <function name> <bare-function-type>
 //            ::= <data name>
 //            ::= <special-name>
-template<typename Alloc> Node *Db<Alloc>::parseEncoding() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseEncoding() {
   if (look() == 'G' || look() == 'T')
-    return parseSpecialName();
+    return getDerived().parseSpecialName();
 
   auto IsEndOfEncoding = [&] {
     // The set of chars that can potentially follow an <encoding> (none of which
@@ -4768,7 +4816,7 @@ template<typename Alloc> Node *Db<Alloc>::parseEncoding() {
   };
 
   NameState NameInfo(this);
-  Node *Name = parseName(&NameInfo);
+  Node *Name = getDerived().parseName(&NameInfo);
   if (Name == nullptr)
     return nullptr;
 
@@ -4782,7 +4830,7 @@ template<typename Alloc> Node *Db<Alloc>::parseEncoding() {
   if (consumeIf("Ua9enable_ifI")) {
     size_t BeforeArgs = Names.size();
     while (!consumeIf('E')) {
-      Node *Arg = parseTemplateArg();
+      Node *Arg = getDerived().parseTemplateArg();
       if (Arg == nullptr)
         return nullptr;
       Names.push_back(Arg);
@@ -4794,7 +4842,7 @@ template<typename Alloc> Node *Db<Alloc>::parseEncoding() {
 
   Node *ReturnType = nullptr;
   if (!NameInfo.CtorDtorConversion && NameInfo.EndsWithTemplateArgs) {
-    ReturnType = parseType();
+    ReturnType = getDerived().parseType();
     if (ReturnType == nullptr)
       return nullptr;
   }
@@ -4806,7 +4854,7 @@ template<typename Alloc> Node *Db<Alloc>::parseEncoding() {
 
   size_t ParamsBegin = Names.size();
   do {
-    Node *Ty = parseType();
+    Node *Ty = getDerived().parseType();
     if (Ty == nullptr)
       return nullptr;
     Names.push_back(Ty);
@@ -4852,9 +4900,9 @@ struct FloatData<long double>
     static constexpr const char *spec = "%LaL";
 };
 
-template<typename Alloc>
-template<class Float>
-Node *Db<Alloc>::parseFloatingLiteral() {
+template <typename Alloc, typename Derived>
+template <class Float>
+Node *AbstractManglingParser<Alloc, Derived>::parseFloatingLiteral() {
   const size_t N = FloatData<Float>::mangled_size;
   if (numLeft() <= N)
     return nullptr;
@@ -4869,7 +4917,8 @@ Node *Db<Alloc>::parseFloatingLiteral() {
 }
 
 // <seq-id> ::= <0-9A-Z>+
-template<typename Alloc> bool Db<Alloc>::parseSeqId(size_t *Out) {
+template <typename Alloc, typename Derived>
+bool AbstractManglingParser<Alloc, Derived>::parseSeqId(size_t *Out) {
   if (!(look() >= '0' && look() <= '9') &&
       !(look() >= 'A' && look() <= 'Z'))
     return true;
@@ -4900,7 +4949,8 @@ template<typename Alloc> bool Db<Alloc>::parseSeqId(size_t *Out) {
 // <substitution> ::= Si # ::std::basic_istream<char,  std::char_traits<char> >
 // <substitution> ::= So # ::std::basic_ostream<char,  std::char_traits<char> >
 // <substitution> ::= Sd # ::std::basic_iostream<char, std::char_traits<char> >
-template<typename Alloc> Node *Db<Alloc>::parseSubstitution() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseSubstitution() {
   if (!consumeIf('S'))
     return nullptr;
 
@@ -4939,7 +4989,7 @@ template<typename Alloc> Node *Db<Alloc>::parseSubstitution() {
     // Itanium C++ ABI 5.1.2: If a name that would use a built-in <substitution>
     // has ABI tags, the tags are appended to the substitution; the result is a
     // substitutable component.
-    Node *WithTags = parseAbiTags(SpecialSub);
+    Node *WithTags = getDerived().parseAbiTags(SpecialSub);
     if (WithTags != SpecialSub) {
       Subs.push_back(WithTags);
       SpecialSub = WithTags;
@@ -4966,7 +5016,8 @@ template<typename Alloc> Node *Db<Alloc>::parseSubstitution() {
 
 // <template-param> ::= T_    # first template parameter
 //                  ::= T <parameter-2 non-negative number> _
-template<typename Alloc> Node *Db<Alloc>::parseTemplateParam() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseTemplateParam() {
   if (!consumeIf('T'))
     return nullptr;
 
@@ -5007,11 +5058,12 @@ template<typename Alloc> Node *Db<Alloc>::parseTemplateParam() {
 //                ::= <expr-primary>            # simple expressions
 //                ::= J <template-arg>* E       # argument pack
 //                ::= LZ <encoding> E           # extension
-template<typename Alloc> Node *Db<Alloc>::parseTemplateArg() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseTemplateArg() {
   switch (look()) {
   case 'X': {
     ++First;
-    Node *Arg = parseExpr();
+    Node *Arg = getDerived().parseExpr();
     if (Arg == nullptr || !consumeIf('E'))
       return nullptr;
     return Arg;
@@ -5020,7 +5072,7 @@ template<typename Alloc> Node *Db<Alloc>::parseTemplateArg() {
     ++First;
     size_t ArgsBegin = Names.size();
     while (!consumeIf('E')) {
-      Node *Arg = parseTemplateArg();
+      Node *Arg = getDerived().parseTemplateArg();
       if (Arg == nullptr)
         return nullptr;
       Names.push_back(Arg);
@@ -5032,23 +5084,24 @@ template<typename Alloc> Node *Db<Alloc>::parseTemplateArg() {
     //                ::= LZ <encoding> E           # extension
     if (look(1) == 'Z') {
       First += 2;
-      Node *Arg = parseEncoding();
+      Node *Arg = getDerived().parseEncoding();
       if (Arg == nullptr || !consumeIf('E'))
         return nullptr;
       return Arg;
     }
     //                ::= <expr-primary>            # simple expressions
-    return parseExprPrimary();
+    return getDerived().parseExprPrimary();
   }
   default:
-    return parseType();
+    return getDerived().parseType();
   }
 }
 
 // <template-args> ::= I <template-arg>* E
 //     extension, the abi says <template-arg>+
-template <typename Alloc>
-Node *Db<Alloc>::parseTemplateArgs(bool TagTemplates) {
+template <typename Derived, typename Alloc>
+Node *
+AbstractManglingParser<Derived, Alloc>::parseTemplateArgs(bool TagTemplates) {
   if (!consumeIf('I'))
     return nullptr;
 
@@ -5061,7 +5114,7 @@ Node *Db<Alloc>::parseTemplateArgs(bool TagTemplates) {
   while (!consumeIf('E')) {
     if (TagTemplates) {
       auto OldParams = std::move(TemplateParams);
-      Node *Arg = parseTemplateArg();
+      Node *Arg = getDerived().parseTemplateArg();
       TemplateParams = std::move(OldParams);
       if (Arg == nullptr)
         return nullptr;
@@ -5075,7 +5128,7 @@ Node *Db<Alloc>::parseTemplateArgs(bool TagTemplates) {
       }
       TemplateParams.push_back(TableEntry);
     } else {
-      Node *Arg = parseTemplateArg();
+      Node *Arg = getDerived().parseTemplateArg();
       if (Arg == nullptr)
         return nullptr;
       Names.push_back(Arg);
@@ -5089,9 +5142,10 @@ Node *Db<Alloc>::parseTemplateArgs(bool TagTemplates) {
 // extension      ::= ___Z <encoding> _block_invoke
 // extension      ::= ___Z <encoding> _block_invoke<decimal-digit>+
 // extension      ::= ___Z <encoding> _block_invoke_<decimal-digit>+
-template<typename Alloc> Node *Db<Alloc>::parse() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parse() {
   if (consumeIf("_Z")) {
-    Node *Encoding = parseEncoding();
+    Node *Encoding = getDerived().parseEncoding();
     if (Encoding == nullptr)
       return nullptr;
     if (look() == '.') {
@@ -5104,7 +5158,7 @@ template<typename Alloc> Node *Db<Alloc>::parse() {
   }
 
   if (consumeIf("___Z")) {
-    Node *Encoding = parseEncoding();
+    Node *Encoding = getDerived().parseEncoding();
     if (Encoding == nullptr || !consumeIf("_block_invoke"))
       return nullptr;
     bool RequireNumber = consumeIf('_');
@@ -5117,12 +5171,18 @@ template<typename Alloc> Node *Db<Alloc>::parse() {
     return make<SpecialName>("invocation function for block in ", Encoding);
   }
 
-  Node *Ty = parseType();
+  Node *Ty = getDerived().parseType();
   if (numLeft() != 0)
     return nullptr;
   return Ty;
 }
 
+template <typename Alloc>
+struct ManglingParser : AbstractManglingParser<ManglingParser<Alloc>, Alloc> {
+  using AbstractManglingParser<ManglingParser<Alloc>,
+                               Alloc>::AbstractManglingParser;
+};
+
 }  // namespace itanium_demangle
 }  // namespace llvm
 
diff --git a/lib/Demangle/ItaniumDemangle.cpp b/lib/Demangle/ItaniumDemangle.cpp
index 8d132c7580f..b6b11dbddf2 100644
--- a/lib/Demangle/ItaniumDemangle.cpp
+++ b/lib/Demangle/ItaniumDemangle.cpp
@@ -322,7 +322,7 @@ public:
 // Code beyond this point should not be synchronized with libc++abi.
 //===----------------------------------------------------------------------===//
 
-using Demangler = itanium_demangle::Db<DefaultAllocator>;
+using Demangler = itanium_demangle::ManglingParser<DefaultAllocator>;
 
 char *llvm::itaniumDemangle(const char *MangledName, char *Buf,
                             size_t *N, int *Status) {
diff --git a/lib/Support/ItaniumManglingCanonicalizer.cpp b/lib/Support/ItaniumManglingCanonicalizer.cpp
index ca63c6d1c7d..e55dcd76180 100644
--- a/lib/Support/ItaniumManglingCanonicalizer.cpp
+++ b/lib/Support/ItaniumManglingCanonicalizer.cpp
@@ -221,7 +221,8 @@ struct CanonicalizerAllocator::MakeNodeImpl<
 
 // FIXME: Also expand built-in substitutions?
 
-using CanonicalizingDemangler = itanium_demangle::Db<CanonicalizerAllocator>;
+using CanonicalizingDemangler =
+    itanium_demangle::ManglingParser<CanonicalizerAllocator>;
 }
 
 struct ItaniumManglingCanonicalizer::Impl {
diff --git a/unittests/Demangle/CMakeLists.txt b/unittests/Demangle/CMakeLists.txt
index 48d959c0852..2f9d71a37e9 100644
--- a/unittests/Demangle/CMakeLists.txt
+++ b/unittests/Demangle/CMakeLists.txt
@@ -1,8 +1,10 @@
 set(LLVM_LINK_COMPONENTS
   Demangle
+  Support
 )
 
 add_llvm_unittest(DemangleTests
+  ItaniumDemangleTest.cpp
   PartialDemangleTest.cpp
   FindTypesInMangledNameTest.cpp
 )
diff --git a/unittests/Demangle/ItaniumDemangleTest.cpp b/unittests/Demangle/ItaniumDemangleTest.cpp
new file mode 100644
index 00000000000..abb690c626a
--- /dev/null
+++ b/unittests/Demangle/ItaniumDemangleTest.cpp
@@ -0,0 +1,54 @@
+//===------------------ ItaniumDemangleTest.cpp ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Demangle/ItaniumDemangle.h"
+#include "llvm/Support/Allocator.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include <cstdlib>
+#include <vector>
+
+using namespace llvm;
+using namespace llvm::itanium_demangle;
+
+namespace {
+class TestAllocator {
+  BumpPtrAllocator Alloc;
+
+public:
+  void reset() { Alloc.Reset(); }
+
+  template <typename T, typename... Args> T *makeNode(Args &&... args) {
+    return new (Alloc.Allocate(sizeof(T), alignof(T)))
+        T(std::forward<Args>(args)...);
+  }
+
+  void *allocateNodeArray(size_t sz) {
+    return Alloc.Allocate(sizeof(Node *) * sz, alignof(Node *));
+  }
+};
+} // namespace
+
+TEST(ItaniumDemangle, MethodOverride) {
+  struct TestParser : AbstractManglingParser<TestParser, TestAllocator> {
+    std::vector<char> Types;
+
+    TestParser(const char *Str)
+        : AbstractManglingParser(Str, Str + strlen(Str)) {}
+
+    Node *parseType() {
+      Types.push_back(*First);
+      return AbstractManglingParser<TestParser, TestAllocator>::parseType();
+    }
+  };
+
+  TestParser Parser("_Z1fIiEjl");
+  ASSERT_NE(nullptr, Parser.parse());
+  EXPECT_THAT(Parser.Types, testing::ElementsAre('i', 'j', 'l'));
+}
-- 
GitLab


From 8728549da335c22ef3c39551c30df55506f534a8 Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Wed, 17 Oct 2018 19:35:38 +0000
Subject: [PATCH 0285/1116] [BuildingAJIT] Fix a function signature in the
 documentation.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344705 91177308-0d34-0410-b5e6-96231b3b80d8
---
 docs/tutorial/BuildingAJIT1.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/tutorial/BuildingAJIT1.rst b/docs/tutorial/BuildingAJIT1.rst
index f1e93bf12b3..1342c10a9d1 100644
--- a/docs/tutorial/BuildingAJIT1.rst
+++ b/docs/tutorial/BuildingAJIT1.rst
@@ -65,8 +65,8 @@ rather than compiling whole programs to disk ahead of time as a traditional
 compiler does. To support that aim our initial, bare-bones JIT API will have
 just two functions:
 
-1. Handle addModule(Module &M) -- Make the given IR module available for
-   execution.
+1. void addModule(std::unique_ptr<Module> M) -- Make the given IR module
+   available for execution.
 2. Expected<JITSymbol> lookup() -- Search for pointers to
    symbols (functions or variables) that have been added to the JIT.
 
-- 
GitLab


From 986e22dd7d3f68b96dda6c50a1c9ce04521d8c8b Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Wed, 17 Oct 2018 22:27:09 +0000
Subject: [PATCH 0286/1116] [BuildingAJIT] Simplify a tutorial example and fix
 a syntax error.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344712 91177308-0d34-0410-b5e6-96231b3b80d8
---
 docs/tutorial/BuildingAJIT1.rst | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/docs/tutorial/BuildingAJIT1.rst b/docs/tutorial/BuildingAJIT1.rst
index 1342c10a9d1..7960ffe0bac 100644
--- a/docs/tutorial/BuildingAJIT1.rst
+++ b/docs/tutorial/BuildingAJIT1.rst
@@ -75,10 +75,9 @@ will look like:
 
 .. code-block:: c++
 
-  std::unique_ptr<Module> M = buildModule();
   JIT J;
-  J.addModule(*M);
-  auto *Main = (int(*)(int, char*[]))J.lookup("main");.getAddress();
+  J.addModule(buildModule());
+  auto *Main = (int(*)(int, char*[]))J.lookup("main").getAddress();
   int Result = Main();
 
 The APIs that we build in these tutorials will all be variations on this simple
-- 
GitLab


From 5b21ab8321ccfc4464e5379461d5ddadf1648a17 Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Thu, 18 Oct 2018 00:36:15 +0000
Subject: [PATCH 0287/1116] [TI removal] Switch an analysis to just use
 Instruction.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344713 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Analysis/LegacyDivergenceAnalysis.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/lib/Analysis/LegacyDivergenceAnalysis.cpp b/lib/Analysis/LegacyDivergenceAnalysis.cpp
index c417862524e..2089d1c53d0 100644
--- a/lib/Analysis/LegacyDivergenceAnalysis.cpp
+++ b/lib/Analysis/LegacyDivergenceAnalysis.cpp
@@ -93,7 +93,7 @@ private:
   // A helper function that explores data dependents of V.
   void exploreDataDependency(Value *V);
   // A helper function that explores sync dependents of TI.
-  void exploreSyncDependency(TerminatorInst *TI);
+  void exploreSyncDependency(Instruction *TI);
   // Computes the influence region from Start to End. This region includes all
   // basic blocks on any simple path from Start to End.
   void computeInfluenceRegion(BasicBlock *Start, BasicBlock *End,
@@ -128,7 +128,7 @@ void DivergencePropagator::populateWithSourcesOfDivergence() {
   }
 }
 
-void DivergencePropagator::exploreSyncDependency(TerminatorInst *TI) {
+void DivergencePropagator::exploreSyncDependency(Instruction *TI) {
   // Propagation rule 1: if branch TI is divergent, all PHINodes in TI's
   // immediate post dominator are divergent. This rule handles if-then-else
   // patterns. For example,
@@ -252,11 +252,11 @@ void DivergencePropagator::propagate() {
   while (!Worklist.empty()) {
     Value *V = Worklist.back();
     Worklist.pop_back();
-    if (TerminatorInst *TI = dyn_cast<TerminatorInst>(V)) {
+    if (Instruction *I = dyn_cast<Instruction>(V)) {
       // Terminators with less than two successors won't introduce sync
       // dependency. Ignore them.
-      if (TI->getNumSuccessors() > 1)
-        exploreSyncDependency(TI);
+      if (I->isTerminator() && I->getNumSuccessors() > 1)
+        exploreSyncDependency(I);
     }
     exploreDataDependency(V);
   }
-- 
GitLab


From c01b38a2d4af72e948ee9adcdd2fc56d435e35f8 Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Thu, 18 Oct 2018 00:37:37 +0000
Subject: [PATCH 0288/1116] [TI removal] Switch MergeFunctions to directly use
 Instruction API.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344714 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/IPO/MergeFunctions.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp
index 3bebb96c6d3..e8056e6cc61 100644
--- a/lib/Transforms/IPO/MergeFunctions.cpp
+++ b/lib/Transforms/IPO/MergeFunctions.cpp
@@ -608,7 +608,7 @@ void MergeFunctions::filterInstsUnrelatedToPDI(
         LLVM_DEBUG(BI->print(dbgs()));
         LLVM_DEBUG(dbgs() << "\n");
       }
-    } else if (dyn_cast<TerminatorInst>(BI) == GEntryBlock->getTerminator()) {
+    } else if (BI->isTerminator() && &*BI == GEntryBlock->getTerminator()) {
       LLVM_DEBUG(dbgs() << " Will Include Terminator: ");
       LLVM_DEBUG(BI->print(dbgs()));
       LLVM_DEBUG(dbgs() << "\n");
-- 
GitLab


From cb0797beaa93523eaf3241aee1cc516131596f8c Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Thu, 18 Oct 2018 00:38:34 +0000
Subject: [PATCH 0289/1116] [TI removal] Switch ObjCARC code to directly use
 the nice range-based successors API or directly build the iterators out of
 the terminator instruction and avoid requiring a TerminatorInst variable.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344715 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/ObjCARC/DependencyAnalysis.cpp |  5 +----
 lib/Transforms/ObjCARC/ObjCARCOpts.cpp        | 20 ++++++++-----------
 2 files changed, 9 insertions(+), 16 deletions(-)

diff --git a/lib/Transforms/ObjCARC/DependencyAnalysis.cpp b/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
index 464805051c6..52a5e8c96ab 100644
--- a/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
+++ b/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
@@ -266,13 +266,10 @@ llvm::objcarc::FindDependencies(DependenceKind Flavor,
   for (const BasicBlock *BB : Visited) {
     if (BB == StartBB)
       continue;
-    const TerminatorInst *TI = cast<TerminatorInst>(&BB->back());
-    for (succ_const_iterator SI(TI), SE(TI, false); SI != SE; ++SI) {
-      const BasicBlock *Succ = *SI;
+    for (const BasicBlock *Succ : successors(BB))
       if (Succ != StartBB && !Visited.count(Succ)) {
         DependingInsts.insert(reinterpret_cast<Instruction *>(-1));
         return;
       }
-    }
   }
 }
diff --git a/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
index 21e2848030f..6ffaadc2b5f 100644
--- a/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
+++ b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
@@ -914,8 +914,8 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
           GetRCIdentityRoot(PN->getIncomingValue(i));
         if (IsNullOrUndef(Incoming))
           HasNull = true;
-        else if (cast<TerminatorInst>(PN->getIncomingBlock(i)->back())
-                   .getNumSuccessors() != 1) {
+        else if (PN->getIncomingBlock(i)->getTerminator()->getNumSuccessors() !=
+                 1) {
           HasCriticalEdges = true;
           break;
         }
@@ -1084,18 +1084,15 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB,
            "Unknown top down sequence state.");
 
     const Value *Arg = I->first;
-    const TerminatorInst *TI = cast<TerminatorInst>(&BB->back());
     bool SomeSuccHasSame = false;
     bool AllSuccsHaveSame = true;
     bool NotAllSeqEqualButKnownSafe = false;
 
-    succ_const_iterator SI(TI), SE(TI, false);
-
-    for (; SI != SE; ++SI) {
+    for (const BasicBlock *Succ : successors(BB)) {
       // If VisitBottomUp has pointer information for this successor, take
       // what we know about it.
       const DenseMap<const BasicBlock *, BBState>::iterator BBI =
-        BBStates.find(*SI);
+          BBStates.find(Succ);
       assert(BBI != BBStates.end());
       const BottomUpPtrState &SuccS = BBI->second.getPtrBottomUpState(Arg);
       const Sequence SuccSSeq = SuccS.GetSeq();
@@ -1414,21 +1411,20 @@ ComputePostOrders(Function &F,
   BasicBlock *EntryBB = &F.getEntryBlock();
   BBState &MyStates = BBStates[EntryBB];
   MyStates.SetAsEntry();
-  TerminatorInst *EntryTI = cast<TerminatorInst>(&EntryBB->back());
+  Instruction *EntryTI = EntryBB->getTerminator();
   SuccStack.push_back(std::make_pair(EntryBB, succ_iterator(EntryTI)));
   Visited.insert(EntryBB);
   OnStack.insert(EntryBB);
   do {
   dfs_next_succ:
     BasicBlock *CurrBB = SuccStack.back().first;
-    TerminatorInst *TI = cast<TerminatorInst>(&CurrBB->back());
-    succ_iterator SE(TI, false);
+    succ_iterator SE(CurrBB->getTerminator(), false);
 
     while (SuccStack.back().second != SE) {
       BasicBlock *SuccBB = *SuccStack.back().second++;
       if (Visited.insert(SuccBB).second) {
-        TerminatorInst *TI = cast<TerminatorInst>(&SuccBB->back());
-        SuccStack.push_back(std::make_pair(SuccBB, succ_iterator(TI)));
+        SuccStack.push_back(
+            std::make_pair(SuccBB, succ_iterator(SuccBB->getTerminator())));
         BBStates[CurrBB].addSucc(SuccBB);
         BBState &SuccStates = BBStates[SuccBB];
         SuccStates.addPred(CurrBB);
-- 
GitLab


From 7d0753a00130b75bbf4977f2d6998cf69402592c Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Thu, 18 Oct 2018 00:38:54 +0000
Subject: [PATCH 0290/1116] [TI removal] Update CodeExtractor to use
 Instruction directly.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344716 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Utils/CodeExtractor.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp
index 7b45b1799c4..27b982578c4 100644
--- a/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/lib/Transforms/Utils/CodeExtractor.cpp
@@ -808,10 +808,10 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
   for (unsigned i = 0, e = Users.size(); i != e; ++i)
     // The BasicBlock which contains the branch is not in the region
     // modify the branch target to a new block
-    if (TerminatorInst *TI = dyn_cast<TerminatorInst>(Users[i]))
-      if (!Blocks.count(TI->getParent()) &&
-          TI->getParent()->getParent() == oldFunction)
-        TI->replaceUsesOfWith(header, newHeader);
+    if (Instruction *I = dyn_cast<Instruction>(Users[i]))
+      if (I->isTerminator() && !Blocks.count(I->getParent()) &&
+          I->getParent()->getParent() == oldFunction)
+        I->replaceUsesOfWith(header, newHeader);
 
   return newFunction;
 }
-- 
GitLab


From 17441525816ed9b23aff04608381d0f8b62e3f89 Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Thu, 18 Oct 2018 00:39:18 +0000
Subject: [PATCH 0291/1116] [TI removal] Use `Instruction` instead of
 `TerminatorInst` for a variable's type.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344717 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Utils/Local.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index 04db1c8c4c7..1153f3cbd15 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -2183,7 +2183,7 @@ void llvm::removeUnwindEdge(BasicBlock *BB, DomTreeUpdater *DTU) {
     return;
   }
 
-  TerminatorInst *NewTI;
+  Instruction *NewTI;
   BasicBlock *UnwindDest;
 
   if (auto *CRI = dyn_cast<CleanupReturnInst>(TI)) {
@@ -2260,7 +2260,7 @@ bool llvm::removeUnreachableBlocks(Function &F, LazyValueInfo *LVI,
       continue;
     }
     if (DTU) {
-      // Remove the TerminatorInst of BB to clear the successor list of BB.
+      // Remove the terminator of BB to clear the successor list of BB.
       if (BB->getTerminator())
         BB->getInstList().pop_back();
       new UnreachableInst(BB->getContext(), BB);
-- 
GitLab


From a2da1d0155683a592f192155722217357f7f41f6 Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Thu, 18 Oct 2018 00:39:46 +0000
Subject: [PATCH 0292/1116] [TI removal] Switch NewGVN to directly use
 `Instruction`.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344718 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/NewGVN.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp
index ed9f868af61..f5c1493781d 100644
--- a/lib/Transforms/Scalar/NewGVN.cpp
+++ b/lib/Transforms/Scalar/NewGVN.cpp
@@ -777,7 +777,7 @@ private:
 
   // Reachability handling.
   void updateReachableEdge(BasicBlock *, BasicBlock *);
-  void processOutgoingEdges(TerminatorInst *, BasicBlock *);
+  void processOutgoingEdges(Instruction *, BasicBlock *);
   Value *findConditionEquivalence(Value *) const;
 
   // Elimination.
@@ -2483,7 +2483,7 @@ Value *NewGVN::findConditionEquivalence(Value *Cond) const {
 }
 
 // Process the outgoing edges of a block for reachability.
-void NewGVN::processOutgoingEdges(TerminatorInst *TI, BasicBlock *B) {
+void NewGVN::processOutgoingEdges(Instruction *TI, BasicBlock *B) {
   // Evaluate reachability of terminator instruction.
   BranchInst *BR;
   if ((BR = dyn_cast<BranchInst>(TI)) && BR->isConditional()) {
@@ -3133,7 +3133,7 @@ void NewGVN::valueNumberInstruction(Instruction *I) {
       auto *Symbolized = createUnknownExpression(I);
       performCongruenceFinding(I, Symbolized);
     }
-    processOutgoingEdges(dyn_cast<TerminatorInst>(I), I->getParent());
+    processOutgoingEdges(I, I->getParent());
   }
 }
 
-- 
GitLab


From 56c3851517623166e74af01fc9564f6ea413ff2b Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Thu, 18 Oct 2018 00:40:26 +0000
Subject: [PATCH 0293/1116] [TI removal] Switch simple loop unswitch to
 `Instruction`.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344719 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/SimpleLoopUnswitch.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 6c4773aa92e..96db249584e 100644
--- a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -1793,7 +1793,7 @@ void visitDomSubTree(DominatorTree &DT, BasicBlock *BB, CallableT Callable) {
 }
 
 static bool unswitchNontrivialInvariants(
-    Loop &L, TerminatorInst &TI, ArrayRef<Value *> Invariants,
+    Loop &L, Instruction &TI, ArrayRef<Value *> Invariants,
     DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC,
     function_ref<void(bool, ArrayRef<Loop *>)> UnswitchCB,
     ScalarEvolution *SE) {
@@ -2188,7 +2188,7 @@ unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI,
                       ScalarEvolution *SE) {
   // Collect all invariant conditions within this loop (as opposed to an inner
   // loop which would be handled when visiting that inner loop).
-  SmallVector<std::pair<TerminatorInst *, TinyPtrVector<Value *>>, 4>
+  SmallVector<std::pair<Instruction *, TinyPtrVector<Value *>>, 4>
       UnswitchCandidates;
   for (auto *BB : L.blocks()) {
     if (LI.getLoopFor(BB) != &L)
@@ -2298,7 +2298,7 @@ unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI,
   SmallDenseMap<DomTreeNode *, int, 4> DTCostMap;
   // Given a terminator which might be unswitched, computes the non-duplicated
   // cost for that terminator.
-  auto ComputeUnswitchedCost = [&](TerminatorInst &TI, bool FullUnswitch) {
+  auto ComputeUnswitchedCost = [&](Instruction &TI, bool FullUnswitch) {
     BasicBlock &BB = *TI.getParent();
     SmallPtrSet<BasicBlock *, 4> Visited;
 
@@ -2349,11 +2349,11 @@ unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI,
            "Cannot unswitch a condition without multiple distinct successors!");
     return Cost * (Visited.size() - 1);
   };
-  TerminatorInst *BestUnswitchTI = nullptr;
+  Instruction *BestUnswitchTI = nullptr;
   int BestUnswitchCost;
   ArrayRef<Value *> BestUnswitchInvariants;
   for (auto &TerminatorAndInvariants : UnswitchCandidates) {
-    TerminatorInst &TI = *TerminatorAndInvariants.first;
+    Instruction &TI = *TerminatorAndInvariants.first;
     ArrayRef<Value *> Invariants = TerminatorAndInvariants.second;
     BranchInst *BI = dyn_cast<BranchInst>(&TI);
     int CandidateCost = ComputeUnswitchedCost(
-- 
GitLab


From 009a7621a37a96407c63a43eec6702a4234019c7 Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Thu, 18 Oct 2018 00:51:38 +0000
Subject: [PATCH 0294/1116] [BuildingAJIT] Update the Ch1 KaleidoscopeJIT class
 to expose errors to clients.

Returning the error to clients provides an opportunity to introduce readers to
the Expected and Error APIs and makes the tutorial more useful as a starting
point for a real JIT class, while only slightly complicating the code.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344720 91177308-0d34-0410-b5e6-96231b3b80d8
---
 docs/tutorial/BuildingAJIT1.rst               | 164 +++++++++---------
 .../BuildingAJIT/Chapter1/KaleidoscopeJIT.h   |  48 ++---
 .../BuildingAJIT/Chapter1/toy.cpp             |  26 ++-
 3 files changed, 122 insertions(+), 116 deletions(-)

diff --git a/docs/tutorial/BuildingAJIT1.rst b/docs/tutorial/BuildingAJIT1.rst
index 7960ffe0bac..fcb755bd286 100644
--- a/docs/tutorial/BuildingAJIT1.rst
+++ b/docs/tutorial/BuildingAJIT1.rst
@@ -9,9 +9,9 @@ Chapter 1 Introduction
 ======================
 
 **Warning: This tutorial is currently being updated to account for ORC API
-changes. Only Chapter 1 is up-to-date.**
+changes. Only Chapters 1 and 2 are up-to-date.**
 
-**Example code from Chapters 2 to 4 will compile and run, but has not been
+**Example code from Chapters 3 to 5 will compile and run, but has not been
 updated**
 
 Welcome to Chapter 1 of the "Building an ORC-based JIT in LLVM" tutorial. This
@@ -65,9 +65,9 @@ rather than compiling whole programs to disk ahead of time as a traditional
 compiler does. To support that aim our initial, bare-bones JIT API will have
 just two functions:
 
-1. void addModule(std::unique_ptr<Module> M) -- Make the given IR module
+1. ``Error addModule(std::unique_ptr<Module> M)``: Make the given IR module
    available for execution.
-2. Expected<JITSymbol> lookup() -- Search for pointers to
+2. ``Expected<JITEvaluatedSymbol> lookup()``: Search for pointers to
    symbols (functions or variables) that have been added to the JIT.
 
 A basic use-case for this API, executing the 'main' function from a module,
@@ -127,94 +127,95 @@ usual include guards and #includes [2]_, we get to the definition of our class:
 
   class KaleidoscopeJIT {
   private:
-
     ExecutionSession ES;
-    RTDyldObjectLinkingLayer ObjectLayer{ES, getMemoryMgr};
-    IRCompileLayer CompileLayer{ES, ObjectLayer,
-                                ConcurrentIRCompiler(getJTMB())};
-    DataLayout DL{cantFail(getJTMB().getDefaultDataLayoutForTarget())};
-    MangleAndInterner Mangle{ES, DL};
-    ThreadSafeContext Ctx{llvm::make_unique<LLVMContext>()};
-
-    static JITTargetMachineBuilder getJTMB() {
-      return cantFail(JITTargetMachineBuilder::detectHost());
-    }
+    RTDyldObjectLinkingLayer ObjectLayer;
+    IRCompileLayer CompileLayer;
+
+    DataLayout DL;
+    MangleAndInterner Mangle;
+    ThreadSafeContext Ctx;
 
-    static std::unique_ptr<SectionMemoryManager> getMemoryMgr(VModuleKey) {
-      return llvm::make_unique<SectionMemoryManager>();
+  public:
+    KaleidoscopeJIT(JITTargetMachineBuilder JTMB, DataLayout DL)
+        : ObjectLayer(ES,
+                      []() { return llvm::make_unique<SectionMemoryManager>(); }),
+          CompileLayer(ES, ObjectLayer, ConcurrentIRCompiler(std::move(JTMB))),
+          DL(std::move(DL)), Mangle(ES, this->DL),
+          Ctx(llvm::make_unique<LLVMContext>()) {
+      ES.getMainJITDylib().setGenerator(
+          cantFail(DynamicLibrarySearchGenerator::GetForCurrentProcess(DL)));
     }
 
-We begin with the ExecutionSession member, ``ES``, which provides context for
-our running JIT'd code. It holds the string pool for symbol names, the global
-mutex that guards the critical sections of JIT operations, error logging
-facilities, and other utilities. For basic use cases such as this, a default
-constructed ExecutionSession is all we will need. We will investigate more
-advanced uses of ExecutionSession in later chapters. Following our
-ExecutionSession we have two ORC *layers*: an RTDyldObjectLinkingLayer and an
-IRCompileLayer. We will be talking more about layers in the next chapter, but
-for now you can think of them as analogous to LLVM Passes: they wrap up useful
-JIT utilities behind an easy to compose interface. The first layer, ObjectLayer,
-is the foundation of our JIT: it takes in-memory object files produced by a
-compiler and links them on the fly to make them executable. This
-JIT-on-top-of-a-linker design was introduced in MCJIT, however the linker was
-hidden inside the MCJIT class. In ORC we expose the linker so that clients can
-access and configure it directly if they need to. In this tutorial our
-ObjectLayer will just be used to support the next layer in our stack: the
-CompileLayer, which will be responsible for taking LLVM IR, compiling it, and
-passing the resulting in-memory object files down to the object linking layer
-below. Our ObjectLayer is constructed with a reference to the ExecutionSession
-and the getMemoryMgr utility function, which it uses to generate a new memory
-manager for each object file as it is added. Next up is our CompileLayer, which
-is initialized with a reference to the ExecutionSession, a reference to the
-ObjectLayer (where it will send the objects produced by the compiler), and an IR
-compiler instance. In this case we are using the ConcurrentIRCompiler class
-which is constructed with a JITTargetMachineBuilder and can be called to compile
-IR concurrently from several threads (though in this chapter we will only use
-one).
-
-Following the ExecutionSession and layers we have three supporting member
-variables. The DataLayout, ``DL``; and MangleAndInterner, ``Mangle`` members are
-used to support portable lookups based on IR symbol names (more on that when we
-get to our ``lookup`` function below), and the ThreadSafeContext member,
-``Ctx``, manages an LLVMContext that can be used while building IR Modules for
-the JIT.
-
-After that, we have two static utility functions. The ``getJTMB()`` function
-returns a JITTargetMachineBuilder, which is a factory for building LLVM
-TargetMachine instances that are used by the compiler. In this first tutorial we
-will only need one (implicitly created) TargetMachine, but in future tutorials
-that enable concurrent compilation we will need one per thread. This is why we
-use a target machine builder, rather than a single TargetMachine. (note: Older
-LLVM JIT APIs that did not support concurrent compilation were constructed with
-a single TargetMachines). The ``getMemoryMgr()`` function constructs instances
-of RuntimeDyld::MemoryManager, and is used by the linking layer to generate a
-new memory manager for each object file.
+Our class begins with six member variables: An ExecutionSession member, ``ES``,
+which provides context for our running JIT'd code (including the string pool,
+global mutex, and error reporting facilities); An RTDyldObjectLinkingLayer,
+``ObjectLayer``, that can be used to add object files to our JIT (though we will
+not use it directly); An IRCompileLayer, ``CompileLayer``, that can be used to
+add LLVM Modules to our JIT (and which builds on the ObjectLayer), A DataLayout
+and MangleAndInterner, ``DL`` and ``Mangle``, that will be used for symbol mangling
+(more on that later); and finally an LLVMContext that clients will use when
+building IR files for the JIT.
+
+Next up we have our class constructor, which takes a `JITTargetMachineBuilder``
+that will be used by our IRCompiler, and a ``DataLayout`` that we will use to
+initialize our DL member. The constructor begins by initializing our
+ObjectLayer.  The ObjectLayer requires a reference to the ExecutionSession, and
+a function object that will build a JIT memory manager for each module that is
+added (a JIT memory manager manages memory allocations, memory permissions, and
+registration of exception handlers for JIT'd code). For this we use a lambda
+that returns a SectionMemoryManager, an off-the-shelf utility that provides all
+the basic memory management functionality required for this chapter. Next we
+initialize our CompileLayer. The CompileLayer needs three things: (1) A
+reference to the ExecutionSession, (2) A reference to our object layer, and (3)
+a compiler instance to use to perform the actual compilation from IR to object
+files. We use the off-the-shelf ConcurrentIRCompiler utility as our compiler,
+which we construct using this constructor's JITTargetMachineBuilder argument.
+The ConcurrentIRCompiler utility will use the JITTargetMachineBuilder to build
+llvm TargetMachines (which are not thread safe) as needed for compiles. After
+this, we initialize our supporting members: ``DL``, ``Mangler`` and ``Ctx`` with
+the input DataLayout, the ExecutionSession and DL member, and a new default
+constucted LLVMContext respectively. Now that our members have been initialized,
+so the one thing that remains to do is to tweak the configuration of the
+*JITDylib* that we will store our code in. We want to modify this dylib to
+contain not only the symbols that we add to it, but also the symbols from our
+REPL process as well. We do this by attaching a
+``DynamicLibrarySearchGenerator`` instance using the
+``DynamicLibrarySearchGenerator::GetForCurrentProcess`` method.
+
 
 .. code-block:: c++
 
-  public:
+  static Expected<std::unique_ptr<KaleidoscopeJIT>> Create() {
+    auto JTMB = JITTargetMachineBuilder::detectHost();
 
-    KaleidoscopeJIT() {
-      ES.getMainJITDylib().setGenerator(
-        cantFail(DynamicLibrarySearchGenerator::GetForCurrentProcess(DL)));
-    }
+    if (!JTMB)
+      return JTMB.takeError();
 
-    const DataLayout &getDataLayout() const { return DL; }
+    auto DL = JTMB->getDefaultDataLayoutForTarget();
+    if (!DL)
+      return DL.takeError();
 
-    LLVMContext &getContext() { return *Ctx.getContext(); }
+    return llvm::make_unique<KaleidoscopeJIT>(std::move(*JTMB), std::move(*DL));
+  }
 
-Next up we have our class constructor. Our members have already been
-initialized, so the one thing that remains to do is to tweak the configuration
-of the *JITDylib* that we will store our code in. We want to modify this dylib
-to contain not only the symbols that we add to it, but also the symbols from
-our REPL process as well. We do this by attaching a
-``DynamicLibrarySearchGenerator`` instance using the
-``DynamicLibrarySearchGenerator::GetForCurrentProcess`` method.
+  const DataLayout &getDataLayout() const { return DL; }
+
+  LLVMContext &getContext() { return *Ctx.getContext(); }
 
-Following the constructor we have the ``getDataLayout()`` and ``getContext()``
-methods. These are used to make data structures created and managed by the JIT
-(especially the LLVMContext) available to the REPL code that will build our
-IR modules.
+Next we have a named constructor, ``Create``, which will build a KaleidoscopeJIT
+instance that is configured to generate code for our host process. It does this
+by first generating a JITTargetMachineBuilder instance using that clases's
+detectHost method and then using that instance to generate a datalayout for
+the target process. Each of these operations can fail, so each returns its
+result wrapped in an Expected value [3]_ that we must check for error before
+continuing. If both operations succeed we can unwrap their results (using the
+dereference operator) and pass them into KaleidoscopeJIT's constructor on the
+last line of the function.
+
+Following the named constructor we have the ``getDataLayout()`` and
+``getContext()`` methods. These are used to make data structures created and
+managed by the JIT (especially the LLVMContext) available to the REPL code that
+will build our IR modules.
 
 .. code-block:: c++
 
@@ -317,3 +318,6 @@ Here is the code:
        +-----------------------------+-----------------------------------------------+
        |        LLVMContext.h        | Provides the LLVMContext class.               |
        +-----------------------------+-----------------------------------------------+
+
+.. [3] See the ErrorHandling section in the LLVM Programmer's Manual
+       (http://llvm.org/docs/ProgrammersManual.html#error-handling)
\ No newline at end of file
diff --git a/examples/Kaleidoscope/BuildingAJIT/Chapter1/KaleidoscopeJIT.h b/examples/Kaleidoscope/BuildingAJIT/Chapter1/KaleidoscopeJIT.h
index d9e320f5478..1df5aff0869 100644
--- a/examples/Kaleidoscope/BuildingAJIT/Chapter1/KaleidoscopeJIT.h
+++ b/examples/Kaleidoscope/BuildingAJIT/Chapter1/KaleidoscopeJIT.h
@@ -32,37 +32,45 @@ namespace orc {
 
 class KaleidoscopeJIT {
 private:
-
   ExecutionSession ES;
-  RTDyldObjectLinkingLayer ObjectLayer{ES, getMemoryMgr};
-  IRCompileLayer CompileLayer{ES, ObjectLayer,
-                              ConcurrentIRCompiler(getJTMB())};
-  DataLayout DL{cantFail(getJTMB().getDefaultDataLayoutForTarget())};
-  MangleAndInterner Mangle{ES, DL};
-  ThreadSafeContext Ctx{llvm::make_unique<LLVMContext>()};
-
-  static JITTargetMachineBuilder getJTMB() {
-    return cantFail(JITTargetMachineBuilder::detectHost());
-  }
+  RTDyldObjectLinkingLayer ObjectLayer;
+  IRCompileLayer CompileLayer;
 
-  static std::unique_ptr<SectionMemoryManager> getMemoryMgr() {
-    return llvm::make_unique<SectionMemoryManager>();
-  }
+  DataLayout DL;
+  MangleAndInterner Mangle;
+  ThreadSafeContext Ctx;
 
 public:
-
-  KaleidoscopeJIT() {
+  KaleidoscopeJIT(JITTargetMachineBuilder JTMB, DataLayout DL)
+      : ObjectLayer(ES,
+                    []() { return llvm::make_unique<SectionMemoryManager>(); }),
+        CompileLayer(ES, ObjectLayer, ConcurrentIRCompiler(std::move(JTMB))),
+        DL(std::move(DL)), Mangle(ES, this->DL),
+        Ctx(llvm::make_unique<LLVMContext>()) {
     ES.getMainJITDylib().setGenerator(
-      cantFail(DynamicLibrarySearchGenerator::GetForCurrentProcess(DL)));
+        cantFail(DynamicLibrarySearchGenerator::GetForCurrentProcess(DL)));
+  }
+
+  static Expected<std::unique_ptr<KaleidoscopeJIT>> Create() {
+    auto JTMB = JITTargetMachineBuilder::detectHost();
+
+    if (!JTMB)
+      return JTMB.takeError();
+
+    auto DL = JTMB->getDefaultDataLayoutForTarget();
+    if (!DL)
+      return DL.takeError();
+
+    return llvm::make_unique<KaleidoscopeJIT>(std::move(*JTMB), std::move(*DL));
   }
 
   const DataLayout &getDataLayout() const { return DL; }
 
   LLVMContext &getContext() { return *Ctx.getContext(); }
 
-  void addModule(std::unique_ptr<Module> M) {
-    cantFail(CompileLayer.add(ES.getMainJITDylib(),
-                              ThreadSafeModule(std::move(M), Ctx)));
+  Error addModule(std::unique_ptr<Module> M) {
+    return CompileLayer.add(ES.getMainJITDylib(),
+                            ThreadSafeModule(std::move(M), Ctx));
   }
 
   Expected<JITEvaluatedSymbol> lookup(StringRef Name) {
diff --git a/examples/Kaleidoscope/BuildingAJIT/Chapter1/toy.cpp b/examples/Kaleidoscope/BuildingAJIT/Chapter1/toy.cpp
index 1d0730f99ef..5a66b367c27 100644
--- a/examples/Kaleidoscope/BuildingAJIT/Chapter1/toy.cpp
+++ b/examples/Kaleidoscope/BuildingAJIT/Chapter1/toy.cpp
@@ -703,6 +703,7 @@ static std::unique_ptr<IRBuilder<>> Builder;
 static std::unique_ptr<Module> TheModule;
 static std::map<std::string, AllocaInst *> NamedValues;
 static std::map<std::string, std::unique_ptr<PrototypeAST>> FunctionProtos;
+static ExitOnError ExitOnErr;
 
 Value *LogErrorV(const char *Str) {
   LogError(Str);
@@ -1116,7 +1117,7 @@ static void HandleDefinition() {
       fprintf(stderr, "Read function definition:");
       FnIR->print(errs());
       fprintf(stderr, "\n");
-      TheJIT->addModule(std::move(TheModule));
+      ExitOnErr(TheJIT->addModule(std::move(TheModule)));
       InitializeModule();
     }
   } else {
@@ -1151,23 +1152,16 @@ static void HandleTopLevelExpression() {
     if (FnAST->codegen()) {
       // JIT the module containing the anonymous expression, keeping a handle so
       // we can free it later.
-      TheJIT->addModule(std::move(TheModule));
+      ExitOnErr(TheJIT->addModule(std::move(TheModule)));
       InitializeModule();
 
       // Get the anonymous expression's JITSymbol.
-      auto Sym =  TheJIT->lookup(("__anon_expr" + Twine(ExprCount)).str());
-
-      if (Sym) {
-        // If the lookup succeeded, cast the symbol's address to a function
-        // pointer then call it.
-        auto *FP = (double (*)())(intptr_t)Sym->getAddress();
-        assert(FP && "Failed to codegen function");
-        fprintf(stderr, "Evaluated to %f\n", FP());
-      } else {
-        // Otherwise log the reason the symbol lookup failed.
-        logAllUnhandledErrors(Sym.takeError(), errs(),
-                              "Could not evaluate: ");
-      }
+      auto Sym =
+        ExitOnErr(TheJIT->lookup(("__anon_expr" + Twine(ExprCount)).str()));
+
+      auto *FP = (double (*)())(intptr_t)Sym.getAddress();
+      assert(FP && "Failed to codegen function");
+      fprintf(stderr, "Evaluated to %f\n", FP());
     }
   } else {
     // Skip token for error recovery.
@@ -1235,7 +1229,7 @@ int main() {
   fprintf(stderr, "ready> ");
   getNextToken();
 
-  TheJIT = llvm::make_unique<KaleidoscopeJIT>();
+  TheJIT = ExitOnErr(KaleidoscopeJIT::Create());
   TheContext = &TheJIT->getContext();
 
   InitializeModule();
-- 
GitLab


From d2a4fba6e8bd8266f8e2844f49af91faf1c63db9 Mon Sep 17 00:00:00 2001
From: Krasimir Georgiev <krasimir@google.com>
Date: Thu, 18 Oct 2018 02:06:16 +0000
Subject: [PATCH 0295/1116] [llvm-exegesis] Mark destructor virtual after
 r344695

This was causing a -Wnon-virtual-dtor warning.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344721 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-exegesis/lib/BenchmarkRunner.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/llvm-exegesis/lib/BenchmarkRunner.h b/tools/llvm-exegesis/lib/BenchmarkRunner.h
index 46405898954..517155dbdfb 100644
--- a/tools/llvm-exegesis/lib/BenchmarkRunner.h
+++ b/tools/llvm-exegesis/lib/BenchmarkRunner.h
@@ -68,7 +68,7 @@ public:
   // context.
   class FunctionExecutor {
   public:
-    ~FunctionExecutor();
+    virtual ~FunctionExecutor();
     virtual llvm::Expected<int64_t>
     runAndMeasure(const char *Counters) const = 0;
   };
-- 
GitLab


From e97176ffbab1ac950d9ad57cdafc38a0e069ff11 Mon Sep 17 00:00:00 2001
From: Kristina Brooks <kristina@nym.hush.com>
Date: Thu, 18 Oct 2018 03:14:37 +0000
Subject: [PATCH 0296/1116] [X86] Support for the mno-tls-direct-seg-refs flag

Allows to disable direct TLS segment access (%fs or %gs). GCC supports
a similar flag, it can be useful in some circumstances, e.g. when a thread
context block needs to be updated directly from user space. More info
and specific use cases: https://bugs.llvm.org/show_bug.cgi?id=16145

There is another revision for clang as well.
Related: D53102

All X86 CodeGen tests appear to pass:
```
[46/47] Running lit suite /SourceCache/llvm-trunk-8.0/test/CodeGen
Testing Time: 23.17s
  Expected Passes    : 3801
  Expected Failures  : 15
  Unsupported Tests  : 8021
```

Reviewed by: Craig Topper.

Patch by nruslan (Ruslan Nikolaev).

Differential Revision: https://reviews.llvm.org/D53103


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344723 91177308-0d34-0410-b5e6-96231b3b80d8
---
 docs/LangRef.rst                   |  4 +++
 lib/Target/X86/X86ISelDAGToDAG.cpp |  6 ++++
 test/CodeGen/X86/tls.ll            | 58 ++++++++++++++++++++++++++++++
 3 files changed, 68 insertions(+)

diff --git a/docs/LangRef.rst b/docs/LangRef.rst
index e977657d1cb..d396e3f1cbf 100644
--- a/docs/LangRef.rst
+++ b/docs/LangRef.rst
@@ -1450,6 +1450,10 @@ example:
 ``noredzone``
     This attribute indicates that the code generator should not use a
     red zone, even if the target-specific ABI normally permits it.
+``indirect-tls-seg-refs``
+    This attribute indicates that the code generator should not use
+    direct TLS access through segment registers, even if the
+    target-specific ABI normally permits it.
 ``noreturn``
     This function attribute indicates that the function never returns
     normally. This produces undefined behavior at runtime if the
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index c06ad11589d..d6bcdcdf149 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -165,6 +165,9 @@ namespace {
     /// If true, selector should try to optimize for minimum code size.
     bool OptForMinSize;
 
+    /// Disable direct TLS access through segment registers.
+    bool IndirectTlsSegRefs;
+
   public:
     explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
         : SelectionDAGISel(tm, OptLevel), OptForSize(false),
@@ -177,6 +180,8 @@ namespace {
     bool runOnMachineFunction(MachineFunction &MF) override {
       // Reset the subtarget each time through.
       Subtarget = &MF.getSubtarget<X86Subtarget>();
+      IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
+                             "indirect-tls-seg-refs");
       SelectionDAGISel::runOnMachineFunction(MF);
       return true;
     }
@@ -981,6 +986,7 @@ bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
   // For more information see http://people.redhat.com/drepper/tls.pdf
   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address))
     if (C->getSExtValue() == 0 && AM.Segment.getNode() == nullptr &&
+        !IndirectTlsSegRefs &&
         (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
          Subtarget->isTargetFuchsia()))
       switch (N->getPointerInfo().getAddrSpace()) {
diff --git a/test/CodeGen/X86/tls.ll b/test/CodeGen/X86/tls.ll
index ddfebcd0b66..759f3d7c855 100644
--- a/test/CodeGen/X86/tls.ll
+++ b/test/CodeGen/X86/tls.ll
@@ -1,5 +1,7 @@
 ; RUN: llc < %s -mtriple=i386-linux-gnu | FileCheck -check-prefix=X86_LINUX %s
 ; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck -check-prefix=X64_LINUX %s
+; RUN: llc < %s -mtriple=i386-linux-gnu -fast-isel | FileCheck -check-prefix=X86_ISEL_LINUX %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel | FileCheck -check-prefix=X64_ISEL_LINUX %s
 ; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck -check-prefix=X86_WIN %s
 ; RUN: llc < %s -mtriple=x86_64-pc-win32 | FileCheck -check-prefix=X64_WIN %s
 ; RUN: llc < %s -mtriple=i686-pc-windows-gnu | FileCheck -check-prefix=MINGW32 %s
@@ -453,3 +455,59 @@ define i32* @f16() {
 
   ret i32* @i6
 }
+
+; NOTE: Similar to f1() but with direct TLS segment access disabled
+define i32 @f17() #0 {
+; X86_LINUX-LABEL: f17:
+; X86_LINUX:      movl %gs:0, %eax
+; X86_LINUX-NEXT: movl i1@NTPOFF(%eax), %eax
+; X86_LINUX-NEXT: ret
+; X64_LINUX-LABEL: f17:
+; X64_LINUX:      movq %fs:0, %rax
+; X64_LINUX-NEXT: movl i1@TPOFF(%rax), %eax
+; X64_LINUX-NEXT: ret
+; X86_ISEL_LINUX-LABEL: f17:
+; X86_ISEL_LINUX:      movl %gs:0, %eax
+; X86_ISEL_LINUX-NEXT: movl i1@NTPOFF(%eax), %eax
+; X86_ISEL_LINUX-NEXT: ret
+; X64_ISEL_LINUX-LABEL: f17:
+; X64_ISEL_LINUX:      movq %fs:0, %rax
+; X64_ISEL_LINUX-NEXT: movl i1@TPOFF(%rax), %eax
+; X64_ISEL_LINUX-NEXT: ret
+
+entry:
+	%tmp1 = load i32, i32* @i1
+	ret i32 %tmp1
+}
+
+; NOTE: Similar to f3() but with direct TLS segment access disabled
+define i32 @f18() #1 {
+; X86_LINUX-LABEL: f18:
+; X86_LINUX:      movl i2@INDNTPOFF, %eax
+; X86_LINUX-NEXT: movl %gs:0, %ecx
+; X86_LINUX-NEXT: movl (%ecx,%eax), %eax
+; X86_LINUX-NEXT: ret
+; X64_LINUX-LABEL: f18:
+; X64_LINUX:      movq i2@GOTTPOFF(%rip), %rax
+; X64_LINUX-NEXT: movq %fs:0, %rcx
+; X64_LINUX-NEXT: movl (%rcx,%rax), %eax
+; X64_LINUX-NEXT: ret
+; X86_ISEL_LINUX-LABEL: f18:
+; X86_ISEL_LINUX:      movl i2@INDNTPOFF, %eax
+; X86_ISEL_LINUX-NEXT: movl %gs:0, %ecx
+; X86_ISEL_LINUX-NEXT: movl (%ecx,%eax), %eax
+; X86_ISEL_LINUX-NEXT: ret
+; X64_ISEL_LINUX-LABEL: f18:
+; X64_ISEL_LINUX:      movq i2@GOTTPOFF(%rip), %rax
+; X64_ISEL_LINUX-NEXT: movq %fs:0, %rcx
+; X64_ISEL_LINUX-NEXT: movl (%rcx,%rax), %eax
+; X64_ISEL_LINUX-NEXT: ret
+
+
+entry:
+	%tmp1 = load i32, i32* @i2
+	ret i32 %tmp1
+}
+
+attributes #0 = { "indirect-tls-seg-refs" }
+attributes #1 = { nounwind "indirect-tls-seg-refs" }
-- 
GitLab


From 577c9cec20a08d178592dad641b5c4753e3bbf88 Mon Sep 17 00:00:00 2001
From: Mikael Holmen <mikael.holmen@ericsson.com>
Date: Thu, 18 Oct 2018 06:27:53 +0000
Subject: [PATCH 0297/1116] Add a emitUnaryFloatFnCall version that fetches the
 function name from TLI

Summary:
In several places in the code we use the following pattern:

  if (hasUnaryFloatFn(&TLI, Ty, LibFunc_tan, LibFunc_tanf, LibFunc_tanl)) {
    [...]
    Value *Res = emitUnaryFloatFnCall(X, TLI.getName(LibFunc_tan), B, Attrs);
    [...]
  }

In short, we check if there is a lib-function for a certain type, and then
we _always_ fetch the name of the "double" version of the lib function and
construct a call to the appropriate function, that we just checked exists,
using that "double" name as a basis.

This is of course a problem in cases where the target doesn't support the
"double" version, but e.g. only the "float" version.

In that case TLI.getName(LibFunc_tan) returns "", and
emitUnaryFloatFnCall happily appends an "f" to "", and we erroneously end
up with a call to a function called "f".

To solve this, the above pattern is changed to

  if (hasUnaryFloatFn(&TLI, Ty, LibFunc_tan, LibFunc_tanf, LibFunc_tanl)) {
    [...]
    Value *Res = emitUnaryFloatFnCall(X, &TLI, LibFunc_tan, LibFunc_tanf,
                                      LibFunc_tanl, B, Attrs);
    [...]
  }

I.e instead of first fetching the name of the "double" version and then
letting emitUnaryFloatFnCall() add the final "f" or "l", we let
emitUnaryFloatFnCall() fetch the right name from TLI.

Reviewers: eli.friedman, efriedma

Reviewed By: efriedma

Subscribers: efriedma, bjope, llvm-commits

Differential Revision: https://reviews.llvm.org/D53370

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344725 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Transforms/Utils/BuildLibCalls.h | 13 +++++
 .../InstCombine/InstCombineMulDivRem.cpp      |  3 +-
 lib/Transforms/Utils/BuildLibCalls.cpp        | 47 +++++++++++++++++--
 lib/Transforms/Utils/SimplifyLibCalls.cpp     | 26 +++++++---
 4 files changed, 78 insertions(+), 11 deletions(-)

diff --git a/include/llvm/Transforms/Utils/BuildLibCalls.h b/include/llvm/Transforms/Utils/BuildLibCalls.h
index eafe07f4928..28efce6ac3f 100644
--- a/include/llvm/Transforms/Utils/BuildLibCalls.h
+++ b/include/llvm/Transforms/Utils/BuildLibCalls.h
@@ -37,6 +37,12 @@ namespace llvm {
                        LibFunc DoubleFn, LibFunc FloatFn,
                        LibFunc LongDoubleFn);
 
+  /// Get the name of the overloaded unary floating point function
+  /// corresponding to \a Ty.
+  StringRef getUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty,
+                            LibFunc DoubleFn, LibFunc FloatFn,
+                            LibFunc LongDoubleFn);
+
   /// Return V if it is an i8*, otherwise cast it to i8*.
   Value *castToCStr(Value *V, IRBuilder<> &B);
 
@@ -94,6 +100,13 @@ namespace llvm {
   Value *emitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilder<> &B,
                               const AttributeList &Attrs);
 
+  /// Emit a call to the unary function DoubleFn, FloatFn or LongDoubleFn,
+  /// depending of the type of Op.
+  Value *emitUnaryFloatFnCall(Value *Op, const TargetLibraryInfo *TLI,
+                              LibFunc DoubleFn, LibFunc FloatFn,
+                              LibFunc LongDoubleFn, IRBuilder<> &B,
+                              const AttributeList &Attrs);
+
   /// Emit a call to the binary function named 'Name' (e.g. 'fmin'). This
   /// function is known to take type matching 'Op1' and 'Op2' and return one
   /// value with the same type. If 'Op1/Op2' are long double, 'l' is added as
diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index 6427c818e02..c348aecb2d4 100644
--- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -1157,7 +1157,8 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) {
       IRBuilder<>::FastMathFlagGuard FMFGuard(B);
       B.setFastMathFlags(I.getFastMathFlags());
       AttributeList Attrs = CallSite(Op0).getCalledFunction()->getAttributes();
-      Value *Res = emitUnaryFloatFnCall(X, TLI.getName(LibFunc_tan), B, Attrs);
+      Value *Res = emitUnaryFloatFnCall(X, &TLI, LibFunc_tan, LibFunc_tanf,
+                                        LibFunc_tanl, B, Attrs);
       if (IsCot)
         Res = B.CreateFDiv(ConstantFP::get(I.getType(), 1.0), Res);
       return replaceInstUsesWith(I, Res);
diff --git a/lib/Transforms/Utils/BuildLibCalls.cpp b/lib/Transforms/Utils/BuildLibCalls.cpp
index 06d197be095..3466dedd323 100644
--- a/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -765,6 +765,24 @@ bool llvm::hasUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty,
   }
 }
 
+StringRef llvm::getUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty,
+                                LibFunc DoubleFn, LibFunc FloatFn,
+                                LibFunc LongDoubleFn) {
+  assert(hasUnaryFloatFn(TLI, Ty, DoubleFn, FloatFn, LongDoubleFn) &&
+         "Cannot get name for unavailable function!");
+
+  switch (Ty->getTypeID()) {
+  case Type::HalfTyID:
+    llvm_unreachable("No name for HalfTy!");
+  case Type::FloatTyID:
+    return TLI->getName(FloatFn);
+  case Type::DoubleTyID:
+    return TLI->getName(DoubleFn);
+  default:
+    return TLI->getName(LongDoubleFn);
+  }
+}
+
 //- Emit LibCalls ------------------------------------------------------------//
 
 Value *llvm::castToCStr(Value *V, IRBuilder<> &B) {
@@ -942,10 +960,10 @@ static void appendTypeSuffix(Value *Op, StringRef &Name,
   }
 }
 
-Value *llvm::emitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilder<> &B,
-                                  const AttributeList &Attrs) {
-  SmallString<20> NameBuffer;
-  appendTypeSuffix(Op, Name, NameBuffer);
+static Value *emitUnaryFloatFnCallHelper(Value *Op, StringRef Name,
+                                         IRBuilder<> &B,
+                                         const AttributeList &Attrs) {
+  assert((Name != "") && "Must specify Name to emitUnaryFloatFnCall");
 
   Module *M = B.GetInsertBlock()->getModule();
   Value *Callee = M->getOrInsertFunction(Name, Op->getType(),
@@ -964,8 +982,29 @@ Value *llvm::emitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilder<> &B,
   return CI;
 }
 
+Value *llvm::emitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilder<> &B,
+                                  const AttributeList &Attrs) {
+  SmallString<20> NameBuffer;
+  appendTypeSuffix(Op, Name, NameBuffer);
+
+  return emitUnaryFloatFnCallHelper(Op, Name, B, Attrs);
+}
+
+Value *llvm::emitUnaryFloatFnCall(Value *Op, const TargetLibraryInfo *TLI,
+                                  LibFunc DoubleFn, LibFunc FloatFn,
+                                  LibFunc LongDoubleFn, IRBuilder<> &B,
+                                  const AttributeList &Attrs) {
+  // Get the name of the function according to TLI.
+  StringRef Name = getUnaryFloatFn(TLI, Op->getType(),
+                                   DoubleFn, FloatFn, LongDoubleFn);
+
+  return emitUnaryFloatFnCallHelper(Op, Name, B, Attrs);
+}
+
 Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, StringRef Name,
                                    IRBuilder<> &B, const AttributeList &Attrs) {
+  assert((Name != "") && "Must specify Name to emitBinaryFloatFnCall");
+
   SmallString<20> NameBuffer;
   appendTypeSuffix(Op1, Name, NameBuffer);
 
diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 41a495a0484..63229bf0399 100644
--- a/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -1219,17 +1219,26 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilder<> &B) {
       StringRef ExpName;
       Intrinsic::ID ID;
       Value *ExpFn;
+      LibFunc LibFnFloat;
+      LibFunc LibFnDouble;
+      LibFunc LibFnLongDouble;
 
       switch (LibFn) {
       default:
         return nullptr;
       case LibFunc_expf:  case LibFunc_exp:  case LibFunc_expl:
-        ExpName = TLI->getName(LibFunc_exp);
+        ExpName = "exp";
         ID = Intrinsic::exp;
+        LibFnFloat = LibFunc_expf;
+        LibFnDouble = LibFunc_exp;
+        LibFnLongDouble = LibFunc_expl;
         break;
       case LibFunc_exp2f: case LibFunc_exp2: case LibFunc_exp2l:
-        ExpName = TLI->getName(LibFunc_exp2);
+        ExpName = "exp2";
         ID = Intrinsic::exp2;
+        LibFnFloat = LibFunc_exp2f;
+        LibFnDouble = LibFunc_exp2;
+        LibFnLongDouble = LibFunc_exp2l;
         break;
       }
 
@@ -1238,7 +1247,9 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilder<> &B) {
       ExpFn = BaseFn->doesNotAccessMemory()
               ? B.CreateCall(Intrinsic::getDeclaration(Mod, ID, Ty),
                              FMul, ExpName)
-              : emitUnaryFloatFnCall(FMul, ExpName, B, BaseFn->getAttributes());
+              : emitUnaryFloatFnCall(FMul, TLI, LibFnDouble, LibFnFloat,
+                                     LibFnLongDouble, B,
+                                     BaseFn->getAttributes());
 
       // Since the new exp{,2}() is different from the original one, dead code
       // elimination cannot be trusted to remove it, since it may have side
@@ -1275,7 +1286,8 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilder<> &B) {
         return B.CreateCall(Intrinsic::getDeclaration(Mod, Intrinsic::exp2, Ty),
                             FMul, "exp2");
       else
-        return emitUnaryFloatFnCall(FMul, TLI->getName(LibFunc_exp2), B, Attrs);
+        return emitUnaryFloatFnCall(FMul, TLI, LibFunc_exp2, LibFunc_exp2f,
+                                    LibFunc_exp2l, B, Attrs);
     }
   }
 
@@ -1283,7 +1295,8 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilder<> &B) {
   // TODO: There is no exp10() intrinsic yet, but some day there shall be one.
   if (match(Base, m_SpecificFP(10.0)) &&
       hasUnaryFloatFn(TLI, Ty, LibFunc_exp10, LibFunc_exp10f, LibFunc_exp10l))
-    return emitUnaryFloatFnCall(Expo, TLI->getName(LibFunc_exp10), B, Attrs);
+    return emitUnaryFloatFnCall(Expo, TLI, LibFunc_exp10, LibFunc_exp10f,
+                                LibFunc_exp10l, B, Attrs);
 
   return nullptr;
 }
@@ -1304,7 +1317,8 @@ static Value *getSqrtCall(Value *V, AttributeList Attrs, bool NoErrno,
     // TODO: We also should check that the target can in fact lower the sqrt()
     // libcall. We currently have no way to ask this question, so we ask if
     // the target has a sqrt() libcall, which is not exactly the same.
-    return emitUnaryFloatFnCall(V, TLI->getName(LibFunc_sqrt), B, Attrs);
+    return emitUnaryFloatFnCall(V, TLI, LibFunc_sqrt, LibFunc_sqrtf,
+                                LibFunc_sqrtl, B, Attrs);
 
   return nullptr;
 }
-- 
GitLab


From 17fa14ff8e012b623c4ae37545c39830648c5147 Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Thu, 18 Oct 2018 07:40:03 +0000
Subject: [PATCH 0298/1116] [TI removal] Remove TerminatorInst references from
 bindings.

For the Go bindings, this just removes the no longer useful "isa"-style
wrapper. If there is a user that is interested, they can add a wrapper
for `Instruction::isTerminator`.

For the OCaml bindings, this is just a documentation update.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344726 91177308-0d34-0410-b5e6-96231b3b80d8
---
 bindings/go/llvm/ir.go       | 1 -
 bindings/ocaml/llvm/llvm.mli | 6 +++---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/bindings/go/llvm/ir.go b/bindings/go/llvm/ir.go
index 0f4877429cc..cad21814cd7 100644
--- a/bindings/go/llvm/ir.go
+++ b/bindings/go/llvm/ir.go
@@ -739,7 +739,6 @@ func (v Value) IsAPHINode() (rv Value)             { rv.C = C.LLVMIsAPHINode(v.C
 func (v Value) IsASelectInst() (rv Value)          { rv.C = C.LLVMIsASelectInst(v.C); return }
 func (v Value) IsAShuffleVectorInst() (rv Value)   { rv.C = C.LLVMIsAShuffleVectorInst(v.C); return }
 func (v Value) IsAStoreInst() (rv Value)           { rv.C = C.LLVMIsAStoreInst(v.C); return }
-func (v Value) IsATerminatorInst() (rv Value)      { rv.C = C.LLVMIsATerminatorInst(v.C); return }
 func (v Value) IsABranchInst() (rv Value)          { rv.C = C.LLVMIsABranchInst(v.C); return }
 func (v Value) IsAInvokeInst() (rv Value)          { rv.C = C.LLVMIsAInvokeInst(v.C); return }
 func (v Value) IsAReturnInst() (rv Value)          { rv.C = C.LLVMIsAReturnInst(v.C); return }
diff --git a/bindings/ocaml/llvm/llvm.mli b/bindings/ocaml/llvm/llvm.mli
index 97b6a695fa2..f12eb6efa61 100644
--- a/bindings/ocaml/llvm/llvm.mli
+++ b/bindings/ocaml/llvm/llvm.mli
@@ -1887,16 +1887,16 @@ val set_volatile : bool -> llvalue -> unit
 val is_terminator : llvalue -> bool
 
 (** [successor v i] returns the successor at index [i] for the value [v].
-    See the method [llvm::TerminatorInst::getSuccessor]. *)
+    See the method [llvm::Instruction::getSuccessor]. *)
 val successor : llvalue -> int -> llbasicblock
 
 (** [set_successor v i o] sets the successor of the value [v] at the index [i] to
     the value [o].
-    See the method [llvm::TerminatorInst::setSuccessor]. *)
+    See the method [llvm::Instruction::setSuccessor]. *)
 val set_successor : llvalue -> int -> llbasicblock -> unit
 
 (** [num_successors v] returns the number of successors for the value [v].
-    See the method [llvm::TerminatorInst::getNumSuccessors]. *)
+    See the method [llvm::Instruction::getNumSuccessors]. *)
 val num_successors : llvalue -> int
 
 (** [successors v] returns the successors of [v]. *)
-- 
GitLab


From d95ef31baaa927fabeff6af2ad64aba897b69f26 Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Thu, 18 Oct 2018 07:40:24 +0000
Subject: [PATCH 0299/1116] [TI removal] Remove discussion of `TerminatorInst`
 from the LLVM documentation.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344727 91177308-0d34-0410-b5e6-96231b3b80d8
---
 docs/BranchWeightMetadata.rst |  8 ++++----
 docs/ProgrammersManual.rst    | 11 ++---------
 2 files changed, 6 insertions(+), 13 deletions(-)

diff --git a/docs/BranchWeightMetadata.rst b/docs/BranchWeightMetadata.rst
index 9bd8bd4ae74..e09587179ec 100644
--- a/docs/BranchWeightMetadata.rst
+++ b/docs/BranchWeightMetadata.rst
@@ -9,10 +9,10 @@ Introduction
 ============
 
 Branch Weight Metadata represents branch weights as its likeliness to be taken
-(see :doc:`BlockFrequencyTerminology`). Metadata is assigned to the
-``TerminatorInst`` as a ``MDNode`` of the ``MD_prof`` kind. The first operator
-is always a ``MDString`` node with the string "branch_weights".  Number of
-operators depends on the terminator type.
+(see :doc:`BlockFrequencyTerminology`). Metadata is assigned to an
+``Instruction`` that is a terminator as a ``MDNode`` of the ``MD_prof`` kind.
+The first operator is always a ``MDString`` node with the string
+"branch_weights".  Number of operators depends on the terminator type.
 
 Branch weights might be fetch from the profiling file, or generated based on
 `__builtin_expect`_ instruction.
diff --git a/docs/ProgrammersManual.rst b/docs/ProgrammersManual.rst
index 64b7de5be15..88c56700eb3 100644
--- a/docs/ProgrammersManual.rst
+++ b/docs/ProgrammersManual.rst
@@ -3736,13 +3736,6 @@ Important Subclasses of the ``Instruction`` class
   `ICmpInst <LangRef.html#i_icmp>`_ (integer opreands), and
   `FCmpInst <LangRef.html#i_fcmp>`_ (floating point operands).
 
-.. _TerminatorInst:
-
-* ``TerminatorInst``
-
-  This subclass is the parent of all terminator instructions (those which can
-  terminate a block).
-
 .. _m_Instruction:
 
 Important Public Members of the ``Instruction`` class
@@ -4068,7 +4061,7 @@ This class represents a single entry single exit section of the code, commonly
 known as a basic block by the compiler community.  The ``BasicBlock`` class
 maintains a list of Instruction_\ s, which form the body of the block.  Matching
 the language definition, the last element of this list of instructions is always
-a terminator instruction (a subclass of the TerminatorInst_ class).
+a terminator instruction.
 
 In addition to tracking the list of instructions that make up the block, the
 ``BasicBlock`` class also keeps track of the :ref:`Function <c_Function>` that
@@ -4119,7 +4112,7 @@ Important Public Members of the ``BasicBlock`` class
   Returns a pointer to :ref:`Function <c_Function>` the block is embedded into,
   or a null pointer if it is homeless.
 
-* ``TerminatorInst *getTerminator()``
+* ``Instruction *getTerminator()``
 
   Returns a pointer to the terminator instruction that appears at the end of the
   ``BasicBlock``.  If there is no terminator instruction, or if the last
-- 
GitLab


From acedb9c3916e017a4595f2c57f781bb0cfb84d61 Mon Sep 17 00:00:00 2001
From: Guillaume Chatelet <gchatelet@google.com>
Date: Thu, 18 Oct 2018 08:20:50 +0000
Subject: [PATCH 0300/1116] [llvm-exegesis] Fix off by one error

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344731 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-exegesis/llvm-exegesis.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/llvm-exegesis/llvm-exegesis.cpp b/tools/llvm-exegesis/llvm-exegesis.cpp
index 39044d48b4b..bbc1c9ba28c 100644
--- a/tools/llvm-exegesis/llvm-exegesis.cpp
+++ b/tools/llvm-exegesis/llvm-exegesis.cpp
@@ -118,7 +118,7 @@ getOpcodesOrDie(const llvm::MCInstrInfo &MCInstrInfo) {
     return {static_cast<unsigned>(OpcodeIndex)};
   if (OpcodeIndex < 0) {
     std::vector<unsigned> Result;
-    for (unsigned I = 1, E = MCInstrInfo.getNumOpcodes(); I <= E; ++I)
+    for (unsigned I = 1, E = MCInstrInfo.getNumOpcodes(); I < E; ++I)
       Result.push_back(I);
     return Result;
   }
-- 
GitLab


From 6071e3bbc0027598049dbd9b6379d88491ca597d Mon Sep 17 00:00:00 2001
From: Sam McCall <sam.mccall@gmail.com>
Date: Thu, 18 Oct 2018 08:47:24 +0000
Subject: [PATCH 0301/1116] [Support] json::Value construction from
 std::vector<T> and std::map<string,T>.

Summary: Previously this required a conversion to json::Array/json::Object first.

Reviewers: ioeric

Subscribers: kristina, llvm-commits

Differential Revision: https://reviews.llvm.org/D53385

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344732 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Support/JSON.h    | 4 ++++
 unittests/Support/JSONTest.cpp | 2 ++
 2 files changed, 6 insertions(+)

diff --git a/include/llvm/Support/JSON.h b/include/llvm/Support/JSON.h
index 2fc0e7ddb90..7a04fd52bc5 100644
--- a/include/llvm/Support/JSON.h
+++ b/include/llvm/Support/JSON.h
@@ -294,9 +294,13 @@ public:
   Value(json::Array &&Elements) : Type(T_Array) {
     create<json::Array>(std::move(Elements));
   }
+  template <typename Elt>
+  Value(const std::vector<Elt> &C) : Value(json::Array(C)) {}
   Value(json::Object &&Properties) : Type(T_Object) {
     create<json::Object>(std::move(Properties));
   }
+  template <typename Elt>
+  Value(const std::map<std::string, Elt> &C) : Value(json::Object(C)) {}
   // Strings: types with value semantics. Must be valid UTF-8.
   Value(std::string V) : Type(T_String) {
     if (LLVM_UNLIKELY(!isUTF8(V))) {
diff --git a/unittests/Support/JSONTest.cpp b/unittests/Support/JSONTest.cpp
index 64a2bb97bd8..9f2d47b9aa9 100644
--- a/unittests/Support/JSONTest.cpp
+++ b/unittests/Support/JSONTest.cpp
@@ -47,6 +47,8 @@ TEST(JSONTest, Constructors) {
             s(Object{{"A", Object{{"B", Object{{"X", "Y"}}}}}}));
   EXPECT_EQ("null", s(llvm::Optional<double>()));
   EXPECT_EQ("2.5", s(llvm::Optional<double>(2.5)));
+  EXPECT_EQ("[[2.5,null]]", s(std::vector<std::vector<llvm::Optional<double>>>{
+                                 {2.5, llvm::None}}));
 }
 
 TEST(JSONTest, StringOwnership) {
-- 
GitLab


From 981ceb83bd0a6e998dca372c3d0660604b21a974 Mon Sep 17 00:00:00 2001
From: Nicolai Haehnle <nhaehnle@gmail.com>
Date: Thu, 18 Oct 2018 09:38:44 +0000
Subject: [PATCH 0302/1116] [DA] DivergenceAnalysis for unstructured, reducible
 CFGs

Summary:
This is patch 2 of the new DivergenceAnalysis (https://reviews.llvm.org/D50433).

This patch contains a generic divergence analysis implementation for
unstructured, reducible Control-Flow Graphs. It contains two new classes.
The `SyncDependenceAnalysis` class lazily computes sync dependences, which
relate divergent branches to points of joining divergent control. The
`DivergenceAnalysis` class contains the generic divergence analysis
implementation.

Reviewers: nhaehnle

Reviewed By: nhaehnle

Subscribers: sameerds, kristina, nhaehnle, xbolva00, tschuett, mgorny, llvm-commits

Differential Revision: https://reviews.llvm.org/D51491

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344734 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/ADT/PostOrderIterator.h          |   3 +
 include/llvm/Analysis/DivergenceAnalysis.h    | 178 ++++++++
 .../llvm/Analysis/SyncDependenceAnalysis.h    |  88 ++++
 lib/Analysis/CMakeLists.txt                   |   2 +
 lib/Analysis/DivergenceAnalysis.cpp           | 425 +++++++++++++++++
 lib/Analysis/SyncDependenceAnalysis.cpp       | 380 +++++++++++++++
 unittests/Analysis/CMakeLists.txt             |   1 +
 unittests/Analysis/DivergenceAnalysisTest.cpp | 431 ++++++++++++++++++
 8 files changed, 1508 insertions(+)
 create mode 100644 include/llvm/Analysis/DivergenceAnalysis.h
 create mode 100644 include/llvm/Analysis/SyncDependenceAnalysis.h
 create mode 100644 lib/Analysis/DivergenceAnalysis.cpp
 create mode 100644 lib/Analysis/SyncDependenceAnalysis.cpp
 create mode 100644 unittests/Analysis/DivergenceAnalysisTest.cpp

diff --git a/include/llvm/ADT/PostOrderIterator.h b/include/llvm/ADT/PostOrderIterator.h
index dc8a9b6e78b..d77b12228cb 100644
--- a/include/llvm/ADT/PostOrderIterator.h
+++ b/include/llvm/ADT/PostOrderIterator.h
@@ -296,12 +296,15 @@ class ReversePostOrderTraversal {
 
 public:
   using rpo_iterator = typename std::vector<NodeRef>::reverse_iterator;
+  using const_rpo_iterator = typename std::vector<NodeRef>::const_reverse_iterator;
 
   ReversePostOrderTraversal(GraphT G) { Initialize(GT::getEntryNode(G)); }
 
   // Because we want a reverse post order, use reverse iterators from the vector
   rpo_iterator begin() { return Blocks.rbegin(); }
+  const_rpo_iterator begin() const { return Blocks.crbegin(); }
   rpo_iterator end() { return Blocks.rend(); }
+  const_rpo_iterator end() const { return Blocks.crend(); }
 };
 
 } // end namespace llvm
diff --git a/include/llvm/Analysis/DivergenceAnalysis.h b/include/llvm/Analysis/DivergenceAnalysis.h
new file mode 100644
index 00000000000..356c144e7e5
--- /dev/null
+++ b/include/llvm/Analysis/DivergenceAnalysis.h
@@ -0,0 +1,178 @@
+//===- llvm/Analysis/DivergenceAnalysis.h - Divergence Analysis -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// \file
+// The divergence analysis determines which instructions and branches are
+// divergent given a set of divergent source instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H
+#define LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H
+
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/Analysis/SyncDependenceAnalysis.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Pass.h"
+#include <vector>
+
+namespace llvm {
+class Module;
+class Value;
+class Instruction;
+class Loop;
+class raw_ostream;
+class TargetTransformInfo;
+
+/// \brief Generic divergence analysis for reducible CFGs.
+///
+/// This analysis propagates divergence in a data-parallel context from sources
+/// of divergence to all users. It requires reducible CFGs. All assignments
+/// should be in SSA form.
+class DivergenceAnalysis {
+public:
+  /// \brief This instance will analyze the whole function \p F or the loop \p
+  /// RegionLoop.
+  ///
+  /// \param RegionLoop if non-null the analysis is restricted to \p RegionLoop.
+  /// Otherwise the whole function is analyzed.
+  /// \param IsLCSSAForm whether the analysis may assume that the IR in the
+  /// region in in LCSSA form.
+  DivergenceAnalysis(const Function &F, const Loop *RegionLoop,
+                     const DominatorTree &DT, const LoopInfo &LI,
+                     SyncDependenceAnalysis &SDA, bool IsLCSSAForm);
+
+  /// \brief The loop that defines the analyzed region (if any).
+  const Loop *getRegionLoop() const { return RegionLoop; }
+  const Function &getFunction() const { return F; }
+
+  /// \brief Whether \p BB is part of the region.
+  bool inRegion(const BasicBlock &BB) const;
+  /// \brief Whether \p I is part of the region.
+  bool inRegion(const Instruction &I) const;
+
+  /// \brief Mark \p UniVal as a value that is always uniform.
+  void addUniformOverride(const Value &UniVal);
+
+  /// \brief Mark \p DivVal as a value that is always divergent.
+  void markDivergent(const Value &DivVal);
+
+  /// \brief Propagate divergence to all instructions in the region.
+  /// Divergence is seeded by calls to \p markDivergent.
+  void compute();
+
+  /// \brief Whether any value was marked or analyzed to be divergent.
+  bool hasDetectedDivergence() const { return !DivergentValues.empty(); }
+
+  /// \brief Whether \p Val will always return a uniform value regardless of its
+  /// operands
+  bool isAlwaysUniform(const Value &Val) const;
+
+  /// \brief Whether \p Val is a divergent value
+  bool isDivergent(const Value &Val) const;
+
+  void print(raw_ostream &OS, const Module *) const;
+
+private:
+  bool updateTerminator(const TerminatorInst &Term) const;
+  bool updatePHINode(const PHINode &Phi) const;
+
+  /// \brief Computes whether \p Inst is divergent based on the
+  /// divergence of its operands.
+  ///
+  /// \returns Whether \p Inst is divergent.
+  ///
+  /// This should only be called for non-phi, non-terminator instructions.
+  bool updateNormalInstruction(const Instruction &Inst) const;
+
+  /// \brief Mark users of live-out users as divergent.
+  ///
+  /// \param LoopHeader the header of the divergent loop.
+  ///
+  /// Marks all users of live-out values of the loop headed by \p LoopHeader
+  /// as divergent and puts them on the worklist.
+  void taintLoopLiveOuts(const BasicBlock &LoopHeader);
+
+  /// \brief Push all users of \p Val (in the region) to the worklist
+  void pushUsers(const Value &I);
+
+  /// \brief Push all phi nodes in @block to the worklist
+  void pushPHINodes(const BasicBlock &Block);
+
+  /// \brief Mark \p Block as join divergent
+  ///
+  /// A block is join divergent if two threads may reach it from different
+  /// incoming blocks at the same time.
+  void markBlockJoinDivergent(const BasicBlock &Block) {
+    DivergentJoinBlocks.insert(&Block);
+  }
+
+  /// \brief Whether \p Val is divergent when read in \p ObservingBlock.
+  bool isTemporalDivergent(const BasicBlock &ObservingBlock,
+                           const Value &Val) const;
+
+  /// \brief Whether \p Block is join divergent
+  ///
+  /// (see markBlockJoinDivergent).
+  bool isJoinDivergent(const BasicBlock &Block) const {
+    return DivergentJoinBlocks.find(&Block) != DivergentJoinBlocks.end();
+  }
+
+  /// \brief Propagate control-induced divergence to users (phi nodes and
+  /// instructions).
+  //
+  // \param JoinBlock is a divergent loop exit or join point of two disjoint
+  // paths.
+  // \returns Whether \p JoinBlock is a divergent loop exit of \p TermLoop.
+  bool propagateJoinDivergence(const BasicBlock &JoinBlock,
+                               const Loop *TermLoop);
+
+  /// \brief Propagate induced value divergence due to control divergence in \p
+  /// Term.
+  void propagateBranchDivergence(const TerminatorInst &Term);
+
+  /// \brief Propagate divergent caused by a divergent loop exit.
+  ///
+  /// \param ExitingLoop is a divergent loop.
+  void propagateLoopDivergence(const Loop &ExitingLoop);
+
+private:
+  const Function &F;
+  // If regionLoop != nullptr, analysis is only performed within \p RegionLoop.
+  // Otw, analyze the whole function
+  const Loop *RegionLoop;
+
+  const DominatorTree &DT;
+  const LoopInfo &LI;
+
+  // Recognized divergent loops
+  DenseSet<const Loop *> DivergentLoops;
+
+  // The SDA links divergent branches to divergent control-flow joins.
+  SyncDependenceAnalysis &SDA;
+
+  // Use simplified code path for LCSSA form.
+  bool IsLCSSAForm;
+
+  // Set of known-uniform values.
+  DenseSet<const Value *> UniformOverrides;
+
+  // Blocks with joining divergent control from different predecessors.
+  DenseSet<const BasicBlock *> DivergentJoinBlocks;
+
+  // Detected/marked divergent values.
+  DenseSet<const Value *> DivergentValues;
+
+  // Internal worklist for divergence propagation.
+  std::vector<const Instruction *> Worklist;
+};
+
+} // namespace llvm
+
+#endif // LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H
diff --git a/include/llvm/Analysis/SyncDependenceAnalysis.h b/include/llvm/Analysis/SyncDependenceAnalysis.h
new file mode 100644
index 00000000000..f464c4d3e9e
--- /dev/null
+++ b/include/llvm/Analysis/SyncDependenceAnalysis.h
@@ -0,0 +1,88 @@
+//===- SyncDependenceAnalysis.h - Divergent Branch Dependence -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// \file
+// This file defines the SyncDependenceAnalysis class, which computes for
+// every divergent branch the set of phi nodes that the branch will make
+// divergent.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_SYNC_DEPENDENCE_ANALYSIS_H
+#define LLVM_ANALYSIS_SYNC_DEPENDENCE_ANALYSIS_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include <memory>
+
+namespace llvm {
+
+class BasicBlock;
+class DominatorTree;
+class Loop;
+class PostDominatorTree;
+class TerminatorInst;
+class TerminatorInst;
+
+using ConstBlockSet = SmallPtrSet<const BasicBlock *, 4>;
+
+/// \brief Relates points of divergent control to join points in
+/// reducible CFGs.
+///
+/// This analysis relates points of divergent control to points of converging
+/// divergent control. The analysis requires all loops to be reducible.
+class SyncDependenceAnalysis {
+  void visitSuccessor(const BasicBlock &succBlock, const Loop *termLoop,
+                      const BasicBlock *defBlock);
+
+public:
+  bool inRegion(const BasicBlock &BB) const;
+
+  ~SyncDependenceAnalysis();
+  SyncDependenceAnalysis(const DominatorTree &DT, const PostDominatorTree &PDT,
+                         const LoopInfo &LI);
+
+  /// \brief Computes divergent join points and loop exits caused by branch
+  /// divergence in \p Term.
+  ///
+  /// The set of blocks which are reachable by disjoint paths from \p Term.
+  /// The set also contains loop exits if there two disjoint paths:
+  /// one from \p Term to the loop exit and another from \p Term to the loop
+  /// header. Those exit blocks are added to the returned set.
+  /// If L is the parent loop of \p Term and an exit of L is in the returned
+  /// set then L is a divergent loop.
+  const ConstBlockSet &join_blocks(const TerminatorInst &Term);
+
+  /// \brief Computes divergent join points and loop exits (in the surrounding
+  /// loop) caused by the divergent loop exits of\p Loop.
+  ///
+  /// The set of blocks which are reachable by disjoint paths from the
+  /// loop exits of \p Loop.
+  /// This treats the loop as a single node in \p Loop's parent loop.
+  /// The returned set has the same properties as for join_blocks(TermInst&).
+  const ConstBlockSet &join_blocks(const Loop &Loop);
+
+private:
+  static ConstBlockSet EmptyBlockSet;
+
+  ReversePostOrderTraversal<const Function *> FuncRPOT;
+  const DominatorTree &DT;
+  const PostDominatorTree &PDT;
+  const LoopInfo &LI;
+
+  std::map<const Loop *, std::unique_ptr<ConstBlockSet>> CachedLoopExitJoins;
+  std::map<const TerminatorInst *, std::unique_ptr<ConstBlockSet>>
+      CachedBranchJoins;
+};
+
+} // namespace llvm
+
+#endif // LLVM_ANALYSIS_SYNC_DEPENDENCE_ANALYSIS_H
diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt
index 6fdbda4e03f..c33e2a88127 100644
--- a/lib/Analysis/CMakeLists.txt
+++ b/lib/Analysis/CMakeLists.txt
@@ -25,6 +25,7 @@ add_llvm_library(LLVMAnalysis
   Delinearization.cpp
   DemandedBits.cpp
   DependenceAnalysis.cpp
+  DivergenceAnalysis.cpp
   DomPrinter.cpp
   DominanceFrontier.cpp
   EHPersonalities.cpp
@@ -80,6 +81,7 @@ add_llvm_library(LLVMAnalysis
   ScalarEvolutionAliasAnalysis.cpp
   ScalarEvolutionExpander.cpp
   ScalarEvolutionNormalization.cpp
+  SyncDependenceAnalysis.cpp
   SyntheticCountsUtils.cpp
   TargetLibraryInfo.cpp
   TargetTransformInfo.cpp
diff --git a/lib/Analysis/DivergenceAnalysis.cpp b/lib/Analysis/DivergenceAnalysis.cpp
new file mode 100644
index 00000000000..9453f680110
--- /dev/null
+++ b/lib/Analysis/DivergenceAnalysis.cpp
@@ -0,0 +1,425 @@
+//===- DivergenceAnalysis.cpp --------- Divergence Analysis Implementation -==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a general divergence analysis for loop vectorization
+// and GPU programs. It determines which branches and values in a loop or GPU
+// program are divergent. It can help branch optimizations such as jump
+// threading and loop unswitching to make better decisions.
+//
+// GPU programs typically use the SIMD execution model, where multiple threads
+// in the same execution group have to execute in lock-step. Therefore, if the
+// code contains divergent branches (i.e., threads in a group do not agree on
+// which path of the branch to take), the group of threads has to execute all
+// the paths from that branch with different subsets of threads enabled until
+// they re-converge.
+//
+// Due to this execution model, some optimizations such as jump
+// threading and loop unswitching can interfere with thread re-convergence.
+// Therefore, an analysis that computes which branches in a GPU program are
+// divergent can help the compiler to selectively run these optimizations.
+//
+// This implementation is derived from the Vectorization Analysis of the
+// Region Vectorizer (RV). That implementation in turn is based on the approach
+// described in
+//
+//   Improving Performance of OpenCL on CPUs
+//   Ralf Karrenberg and Sebastian Hack
+//   CC '12
+//
+// This DivergenceAnalysis implementation is generic in the sense that it does
+// not itself identify original sources of divergence.
+// Instead specialized adapter classes, (LoopDivergenceAnalysis) for loops and
+// (GPUDivergenceAnalysis) for GPU programs, identify the sources of divergence
+// (e.g., special variables that hold the thread ID or the iteration variable).
+//
+// The generic implementation propagates divergence to variables that are data
+// or sync dependent on a source of divergence.
+//
+// While data dependency is a well-known concept, the notion of sync dependency
+// is worth more explanation. Sync dependence characterizes the control flow
+// aspect of the propagation of branch divergence. For example,
+//
+//   %cond = icmp slt i32 %tid, 10
+//   br i1 %cond, label %then, label %else
+// then:
+//   br label %merge
+// else:
+//   br label %merge
+// merge:
+//   %a = phi i32 [ 0, %then ], [ 1, %else ]
+//
+// Suppose %tid holds the thread ID. Although %a is not data dependent on %tid
+// because %tid is not on its use-def chains, %a is sync dependent on %tid
+// because the branch "br i1 %cond" depends on %tid and affects which value %a
+// is assigned to.
+//
+// The sync dependence detection (which branch induces divergence in which join
+// points) is implemented in the SyncDependenceAnalysis.
+//
+// The current DivergenceAnalysis implementation has the following limitations:
+// 1. intra-procedural. It conservatively considers the arguments of a
+//    non-kernel-entry function and the return value of a function call as
+//    divergent.
+// 2. memory as black box. It conservatively considers values loaded from
+//    generic or local address as divergent. This can be improved by leveraging
+//    pointer analysis and/or by modelling non-escaping memory objects in SSA
+//    as done in RV.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "divergence-analysis"
+
+// class DivergenceAnalysis
+DivergenceAnalysis::DivergenceAnalysis(
+    const Function &F, const Loop *RegionLoop, const DominatorTree &DT,
+    const LoopInfo &LI, SyncDependenceAnalysis &SDA, bool IsLCSSAForm)
+    : F(F), RegionLoop(RegionLoop), DT(DT), LI(LI), SDA(SDA),
+      IsLCSSAForm(IsLCSSAForm) {}
+
+void DivergenceAnalysis::markDivergent(const Value &DivVal) {
+  assert(isa<Instruction>(DivVal) || isa<Argument>(DivVal));
+  assert(!isAlwaysUniform(DivVal) && "cannot be a divergent");
+  DivergentValues.insert(&DivVal);
+}
+
+void DivergenceAnalysis::addUniformOverride(const Value &UniVal) {
+  UniformOverrides.insert(&UniVal);
+}
+
+bool DivergenceAnalysis::updateTerminator(const TerminatorInst &Term) const {
+  if (Term.getNumSuccessors() <= 1)
+    return false;
+  if (auto *BranchTerm = dyn_cast<BranchInst>(&Term)) {
+    assert(BranchTerm->isConditional());
+    return isDivergent(*BranchTerm->getCondition());
+  }
+  if (auto *SwitchTerm = dyn_cast<SwitchInst>(&Term)) {
+    return isDivergent(*SwitchTerm->getCondition());
+  }
+  if (isa<InvokeInst>(Term)) {
+    return false; // ignore abnormal executions through landingpad
+  }
+
+  llvm_unreachable("unexpected terminator");
+}
+
+bool DivergenceAnalysis::updateNormalInstruction(const Instruction &I) const {
+  // TODO function calls with side effects, etc
+  for (const auto &Op : I.operands()) {
+    if (isDivergent(*Op))
+      return true;
+  }
+  return false;
+}
+
+bool DivergenceAnalysis::isTemporalDivergent(const BasicBlock &ObservingBlock,
+                                             const Value &Val) const {
+  const auto *Inst = dyn_cast<const Instruction>(&Val);
+  if (!Inst)
+    return false;
+  // check whether any divergent loop carrying Val terminates before control
+  // proceeds to ObservingBlock
+  for (const auto *Loop = LI.getLoopFor(Inst->getParent());
+       Loop != RegionLoop && !Loop->contains(&ObservingBlock);
+       Loop = Loop->getParentLoop()) {
+    if (DivergentLoops.find(Loop) != DivergentLoops.end())
+      return true;
+  }
+
+  return false;
+}
+
+bool DivergenceAnalysis::updatePHINode(const PHINode &Phi) const {
+  // joining divergent disjoint path in Phi parent block
+  if (!Phi.hasConstantOrUndefValue() && isJoinDivergent(*Phi.getParent())) {
+    return true;
+  }
+
+  // An incoming value could be divergent by itself.
+  // Otherwise, an incoming value could be uniform within the loop
+  // that carries its definition but it may appear divergent
+  // from outside the loop. This happens when divergent loop exits
+  // drop definitions of that uniform value in different iterations.
+  //
+  // for (int i = 0; i < n; ++i) { // 'i' is uniform inside the loop
+  //   if (i % thread_id == 0) break;    // divergent loop exit
+  // }
+  // int divI = i;                 // divI is divergent
+  for (size_t i = 0; i < Phi.getNumIncomingValues(); ++i) {
+    const auto *InVal = Phi.getIncomingValue(i);
+    if (isDivergent(*Phi.getIncomingValue(i)) ||
+        isTemporalDivergent(*Phi.getParent(), *InVal)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool DivergenceAnalysis::inRegion(const Instruction &I) const {
+  return I.getParent() && inRegion(*I.getParent());
+}
+
+bool DivergenceAnalysis::inRegion(const BasicBlock &BB) const {
+  return (!RegionLoop && BB.getParent() == &F) || RegionLoop->contains(&BB);
+}
+
+// marks all users of loop-carried values of the loop headed by LoopHeader as
+// divergent
+void DivergenceAnalysis::taintLoopLiveOuts(const BasicBlock &LoopHeader) {
+  auto *DivLoop = LI.getLoopFor(&LoopHeader);
+  assert(DivLoop && "loopHeader is not actually part of a loop");
+
+  SmallVector<BasicBlock *, 8> TaintStack;
+  DivLoop->getExitBlocks(TaintStack);
+
+  // Otherwise potential users of loop-carried values could be anywhere in the
+  // dominance region of DivLoop (including its fringes for phi nodes)
+  DenseSet<const BasicBlock *> Visited;
+  for (auto *Block : TaintStack) {
+    Visited.insert(Block);
+  }
+  Visited.insert(&LoopHeader);
+
+  while (!TaintStack.empty()) {
+    auto *UserBlock = TaintStack.back();
+    TaintStack.pop_back();
+
+    // don't spread divergence beyond the region
+    if (!inRegion(*UserBlock))
+      continue;
+
+    assert(!DivLoop->contains(UserBlock) &&
+           "irreducible control flow detected");
+
+    // phi nodes at the fringes of the dominance region
+    if (!DT.dominates(&LoopHeader, UserBlock)) {
+      // all PHI nodes of UserBlock become divergent
+      for (auto &Phi : UserBlock->phis()) {
+        Worklist.push_back(&Phi);
+      }
+      continue;
+    }
+
+    // taint outside users of values carried by DivLoop
+    for (auto &I : *UserBlock) {
+      if (isAlwaysUniform(I))
+        continue;
+      if (isDivergent(I))
+        continue;
+
+      for (auto &Op : I.operands()) {
+        auto *OpInst = dyn_cast<Instruction>(&Op);
+        if (!OpInst)
+          continue;
+        if (DivLoop->contains(OpInst->getParent())) {
+          markDivergent(I);
+          pushUsers(I);
+          break;
+        }
+      }
+    }
+
+    // visit all blocks in the dominance region
+    for (auto *SuccBlock : successors(UserBlock)) {
+      if (!Visited.insert(SuccBlock).second) {
+        continue;
+      }
+      TaintStack.push_back(SuccBlock);
+    }
+  }
+}
+
+void DivergenceAnalysis::pushPHINodes(const BasicBlock &Block) {
+  for (const auto &Phi : Block.phis()) {
+    if (isDivergent(Phi))
+      continue;
+    Worklist.push_back(&Phi);
+  }
+}
+
+void DivergenceAnalysis::pushUsers(const Value &V) {
+  for (const auto *User : V.users()) {
+    const auto *UserInst = dyn_cast<const Instruction>(User);
+    if (!UserInst)
+      continue;
+
+    if (isDivergent(*UserInst))
+      continue;
+
+    // only compute divergent inside loop
+    if (!inRegion(*UserInst))
+      continue;
+    Worklist.push_back(UserInst);
+  }
+}
+
+bool DivergenceAnalysis::propagateJoinDivergence(const BasicBlock &JoinBlock,
+                                                 const Loop *BranchLoop) {
+  LLVM_DEBUG(dbgs() << "\tpropJoinDiv " << JoinBlock.getName() << "\n");
+
+  // ignore divergence outside the region
+  if (!inRegion(JoinBlock)) {
+    return false;
+  }
+
+  // push non-divergent phi nodes in JoinBlock to the worklist
+  pushPHINodes(JoinBlock);
+
+  // JoinBlock is a divergent loop exit
+  if (BranchLoop && !BranchLoop->contains(&JoinBlock)) {
+    return true;
+  }
+
+  // disjoint-paths divergent at JoinBlock
+  markBlockJoinDivergent(JoinBlock);
+  return false;
+}
+
+void DivergenceAnalysis::propagateBranchDivergence(const TerminatorInst &Term) {
+  LLVM_DEBUG(dbgs() << "propBranchDiv " << Term.getParent()->getName() << "\n");
+
+  markDivergent(Term);
+
+  const auto *BranchLoop = LI.getLoopFor(Term.getParent());
+
+  // whether there is a divergent loop exit from BranchLoop (if any)
+  bool IsBranchLoopDivergent = false;
+
+  // iterate over all blocks reachable by disjoint from Term within the loop
+  // also iterates over loop exits that become divergent due to Term.
+  for (const auto *JoinBlock : SDA.join_blocks(Term)) {
+    IsBranchLoopDivergent |= propagateJoinDivergence(*JoinBlock, BranchLoop);
+  }
+
+  // Branch loop is a divergent loop due to the divergent branch in Term
+  if (IsBranchLoopDivergent) {
+    assert(BranchLoop);
+    if (!DivergentLoops.insert(BranchLoop).second) {
+      return;
+    }
+    propagateLoopDivergence(*BranchLoop);
+  }
+}
+
+void DivergenceAnalysis::propagateLoopDivergence(const Loop &ExitingLoop) {
+  LLVM_DEBUG(dbgs() << "propLoopDiv " << ExitingLoop.getName() << "\n");
+
+  // don't propagate beyond region
+  if (!inRegion(*ExitingLoop.getHeader()))
+    return;
+
+  const auto *BranchLoop = ExitingLoop.getParentLoop();
+
+  // Uses of loop-carried values could occur anywhere
+  // within the dominance region of the definition. All loop-carried
+  // definitions are dominated by the loop header (reducible control).
+  // Thus all users have to be in the dominance region of the loop header,
+  // except PHI nodes that can also live at the fringes of the dom region
+  // (incoming defining value).
+  if (!IsLCSSAForm)
+    taintLoopLiveOuts(*ExitingLoop.getHeader());
+
+  // whether there is a divergent loop exit from BranchLoop (if any)
+  bool IsBranchLoopDivergent = false;
+
+  // iterate over all blocks reachable by disjoint paths from exits of
+  // ExitingLoop also iterates over loop exits (of BranchLoop) that in turn
+  // become divergent.
+  for (const auto *JoinBlock : SDA.join_blocks(ExitingLoop)) {
+    IsBranchLoopDivergent |= propagateJoinDivergence(*JoinBlock, BranchLoop);
+  }
+
+  // Branch loop is a divergent due to divergent loop exit in ExitingLoop
+  if (IsBranchLoopDivergent) {
+    assert(BranchLoop);
+    if (!DivergentLoops.insert(BranchLoop).second) {
+      return;
+    }
+    propagateLoopDivergence(*BranchLoop);
+  }
+}
+
+void DivergenceAnalysis::compute() {
+  for (auto *DivVal : DivergentValues) {
+    pushUsers(*DivVal);
+  }
+
+  // propagate divergence
+  while (!Worklist.empty()) {
+    const Instruction &I = *Worklist.back();
+    Worklist.pop_back();
+
+    // maintain uniformity of overrides
+    if (isAlwaysUniform(I))
+      continue;
+
+    bool WasDivergent = isDivergent(I);
+    if (WasDivergent)
+      continue;
+
+    // propagate divergence caused by terminator
+    if (isa<TerminatorInst>(I)) {
+      auto &Term = cast<TerminatorInst>(I);
+      if (updateTerminator(Term)) {
+        // propagate control divergence to affected instructions
+        propagateBranchDivergence(Term);
+        continue;
+      }
+    }
+
+    // update divergence of I due to divergent operands
+    bool DivergentUpd = false;
+    const auto *Phi = dyn_cast<const PHINode>(&I);
+    if (Phi) {
+      DivergentUpd = updatePHINode(*Phi);
+    } else {
+      DivergentUpd = updateNormalInstruction(I);
+    }
+
+    // propagate value divergence to users
+    if (DivergentUpd) {
+      markDivergent(I);
+      pushUsers(I);
+    }
+  }
+}
+
+bool DivergenceAnalysis::isAlwaysUniform(const Value &V) const {
+  return UniformOverrides.find(&V) != UniformOverrides.end();
+}
+
+bool DivergenceAnalysis::isDivergent(const Value &V) const {
+  return DivergentValues.find(&V) != DivergentValues.end();
+}
+
+void DivergenceAnalysis::print(raw_ostream &OS, const Module *) const {
+  if (DivergentValues.empty())
+    return;
+  // iterate instructions using instructions() to ensure a deterministic order.
+  for (auto &I : instructions(F)) {
+    if (isDivergent(I))
+      OS << "DIVERGENT:" << I << '\n';
+  }
+}
diff --git a/lib/Analysis/SyncDependenceAnalysis.cpp b/lib/Analysis/SyncDependenceAnalysis.cpp
new file mode 100644
index 00000000000..9c40ffe0cc7
--- /dev/null
+++ b/lib/Analysis/SyncDependenceAnalysis.cpp
@@ -0,0 +1,380 @@
+//===- SyncDependenceAnalysis.cpp - Divergent Branch Dependence Calculation
+//--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements an algorithm that returns for a divergent branch
+// the set of basic blocks whose phi nodes become divergent due to divergent
+// control. These are the blocks that are reachable by two disjoint paths from
+// the branch or loop exits that have a reaching path that is disjoint from a
+// path to the loop latch.
+//
+// The SyncDependenceAnalysis is used in the DivergenceAnalysis to model
+// control-induced divergence in phi nodes.
+//
+// -- Summary --
+// The SyncDependenceAnalysis lazily computes sync dependences [3].
+// The analysis evaluates the disjoint path criterion [2] by a reduction
+// to SSA construction. The SSA construction algorithm is implemented as
+// a simple data-flow analysis [1].
+//
+// [1] "A Simple, Fast Dominance Algorithm", SPI '01, Cooper, Harvey and Kennedy
+// [2] "Efficiently Computing Static Single Assignment Form
+//     and the Control Dependence Graph", TOPLAS '91,
+//           Cytron, Ferrante, Rosen, Wegman and Zadeck
+// [3] "Improving Performance of OpenCL on CPUs", CC '12, Karrenberg and Hack
+// [4] "Divergence Analysis", TOPLAS '13, Sampaio, Souza, Collange and Pereira
+//
+// -- Sync dependence --
+// Sync dependence [4] characterizes the control flow aspect of the
+// propagation of branch divergence. For example,
+//
+//   %cond = icmp slt i32 %tid, 10
+//   br i1 %cond, label %then, label %else
+// then:
+//   br label %merge
+// else:
+//   br label %merge
+// merge:
+//   %a = phi i32 [ 0, %then ], [ 1, %else ]
+//
+// Suppose %tid holds the thread ID. Although %a is not data dependent on %tid
+// because %tid is not on its use-def chains, %a is sync dependent on %tid
+// because the branch "br i1 %cond" depends on %tid and affects which value %a
+// is assigned to.
+//
+// -- Reduction to SSA construction --
+// There are two disjoint paths from A to X, if a certain variant of SSA
+// construction places a phi node in X under the following set-up scheme [2].
+//
+// This variant of SSA construction ignores incoming undef values.
+// That is paths from the entry without a definition do not result in
+// phi nodes.
+//
+//       entry
+//     /      \
+//    A        \
+//  /   \       Y
+// B     C     /
+//  \   /  \  /
+//    D     E
+//     \   /
+//       F
+// Assume that A contains a divergent branch. We are interested
+// in the set of all blocks where each block is reachable from A
+// via two disjoint paths. This would be the set {D, F} in this
+// case.
+// To generally reduce this query to SSA construction we introduce
+// a virtual variable x and assign to x different values in each
+// successor block of A.
+//           entry
+//         /      \
+//        A        \
+//      /   \       Y
+// x = 0   x = 1   /
+//      \  /   \  /
+//        D     E
+//         \   /
+//           F
+// Our flavor of SSA construction for x will construct the following
+//            entry
+//          /      \
+//         A        \
+//       /   \       Y
+// x0 = 0   x1 = 1  /
+//       \   /   \ /
+//      x2=phi    E
+//         \     /
+//          x3=phi
+// The blocks D and F contain phi nodes and are thus each reachable
+// by two disjoins paths from A.
+//
+// -- Remarks --
+// In case of loop exits we need to check the disjoint path criterion for loops
+// [2]. To this end, we check whether the definition of x differs between the
+// loop exit and the loop header (_after_ SSA construction).
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/SyncDependenceAnalysis.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+
+#include <stack>
+#include <unordered_set>
+
+#define DEBUG_TYPE "sync-dependence"
+
+namespace llvm {
+
+ConstBlockSet SyncDependenceAnalysis::EmptyBlockSet;
+
+SyncDependenceAnalysis::SyncDependenceAnalysis(const DominatorTree &DT,
+                                               const PostDominatorTree &PDT,
+                                               const LoopInfo &LI)
+    : FuncRPOT(DT.getRoot()->getParent()), DT(DT), PDT(PDT), LI(LI) {}
+
+SyncDependenceAnalysis::~SyncDependenceAnalysis() {}
+
+using FunctionRPOT = ReversePostOrderTraversal<const Function *>;
+
+// divergence propagator for reducible CFGs
+struct DivergencePropagator {
+  const FunctionRPOT &FuncRPOT;
+  const DominatorTree &DT;
+  const PostDominatorTree &PDT;
+  const LoopInfo &LI;
+
+  // identified join points
+  std::unique_ptr<ConstBlockSet> JoinBlocks;
+
+  // reached loop exits (by a path disjoint to a path to the loop header)
+  SmallPtrSet<const BasicBlock *, 4> ReachedLoopExits;
+
+  // if DefMap[B] == C then C is the dominating definition at block B
+  // if DefMap[B] ~ undef then we haven't seen B yet
+  // if DefMap[B] == B then B is a join point of disjoint paths from X or B is
+  // an immediate successor of X (initial value).
+  using DefiningBlockMap = std::map<const BasicBlock *, const BasicBlock *>;
+  DefiningBlockMap DefMap;
+
+  // all blocks with pending visits
+  std::unordered_set<const BasicBlock *> PendingUpdates;
+
+  DivergencePropagator(const FunctionRPOT &FuncRPOT, const DominatorTree &DT,
+                       const PostDominatorTree &PDT, const LoopInfo &LI)
+      : FuncRPOT(FuncRPOT), DT(DT), PDT(PDT), LI(LI),
+        JoinBlocks(new ConstBlockSet) {}
+
+  // set the definition at @block and mark @block as pending for a visit
+  void addPending(const BasicBlock &Block, const BasicBlock &DefBlock) {
+    bool WasAdded = DefMap.emplace(&Block, &DefBlock).second;
+    if (WasAdded)
+      PendingUpdates.insert(&Block);
+  }
+
+  void printDefs(raw_ostream &Out) {
+    Out << "Propagator::DefMap {\n";
+    for (const auto *Block : FuncRPOT) {
+      auto It = DefMap.find(Block);
+      Out << Block->getName() << " : ";
+      if (It == DefMap.end()) {
+        Out << "\n";
+      } else {
+        const auto *DefBlock = It->second;
+        Out << (DefBlock ? DefBlock->getName() : "<null>") << "\n";
+      }
+    }
+    Out << "}\n";
+  }
+
+  // process @succBlock with reaching definition @defBlock
+  // the original divergent branch was in @parentLoop (if any)
+  void visitSuccessor(const BasicBlock &SuccBlock, const Loop *ParentLoop,
+                      const BasicBlock &DefBlock) {
+
+    // @succBlock is a loop exit
+    if (ParentLoop && !ParentLoop->contains(&SuccBlock)) {
+      DefMap.emplace(&SuccBlock, &DefBlock);
+      ReachedLoopExits.insert(&SuccBlock);
+      return;
+    }
+
+    // first reaching def?
+    auto ItLastDef = DefMap.find(&SuccBlock);
+    if (ItLastDef == DefMap.end()) {
+      addPending(SuccBlock, DefBlock);
+      return;
+    }
+
+    // a join of at least two definitions
+    if (ItLastDef->second != &DefBlock) {
+      // do we know this join already?
+      if (!JoinBlocks->insert(&SuccBlock).second)
+        return;
+
+      // update the definition
+      addPending(SuccBlock, SuccBlock);
+    }
+  }
+
+  // find all blocks reachable by two disjoint paths from @rootTerm.
+  // This method works for both divergent TerminatorInsts and loops with
+  // divergent exits.
+  // @rootBlock is either the block containing the branch or the header of the
+  // divergent loop.
+  // @nodeSuccessors is the set of successors of the node (Loop or Terminator)
+  // headed by @rootBlock.
+  // @parentLoop is the parent loop of the Loop or the loop that contains the
+  // Terminator.
+  template <typename SuccessorIterable>
+  std::unique_ptr<ConstBlockSet>
+  computeJoinPoints(const BasicBlock &RootBlock,
+                    SuccessorIterable NodeSuccessors, const Loop *ParentLoop) {
+    assert(JoinBlocks);
+
+    // immediate post dominator (no join block beyond that block)
+    const auto *PdNode = PDT.getNode(const_cast<BasicBlock *>(&RootBlock));
+    const auto *IpdNode = PdNode->getIDom();
+    const auto *PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr;
+
+    // bootstrap with branch targets
+    for (const auto *SuccBlock : NodeSuccessors) {
+      DefMap.emplace(SuccBlock, SuccBlock);
+
+      if (ParentLoop && !ParentLoop->contains(SuccBlock)) {
+        // immediate loop exit from node.
+        ReachedLoopExits.insert(SuccBlock);
+        continue;
+      } else {
+        // regular successor
+        PendingUpdates.insert(SuccBlock);
+      }
+    }
+
+    auto ItBeginRPO = FuncRPOT.begin();
+
+    // skip until term (TODO RPOT won't let us start at @term directly)
+    for (; *ItBeginRPO != &RootBlock; ++ItBeginRPO) {}
+
+    auto ItEndRPO = FuncRPOT.end();
+    assert(ItBeginRPO != ItEndRPO);
+
+    // propagate definitions at the immediate successors of the node in RPO
+    auto ItBlockRPO = ItBeginRPO;
+    while (++ItBlockRPO != ItEndRPO && *ItBlockRPO != PdBoundBlock) {
+      const auto *Block = *ItBlockRPO;
+
+      // skip @block if not pending update
+      auto ItPending = PendingUpdates.find(Block);
+      if (ItPending == PendingUpdates.end())
+        continue;
+      PendingUpdates.erase(ItPending);
+
+      // propagate definition at @block to its successors
+      auto ItDef = DefMap.find(Block);
+      const auto *DefBlock = ItDef->second;
+      assert(DefBlock);
+
+      auto *BlockLoop = LI.getLoopFor(Block);
+      if (ParentLoop &&
+          (ParentLoop != BlockLoop && ParentLoop->contains(BlockLoop))) {
+        // if the successor is the header of a nested loop pretend its a
+        // single node with the loop's exits as successors
+        SmallVector<BasicBlock *, 4> BlockLoopExits;
+        BlockLoop->getExitBlocks(BlockLoopExits);
+        for (const auto *BlockLoopExit : BlockLoopExits) {
+          visitSuccessor(*BlockLoopExit, ParentLoop, *DefBlock);
+        }
+
+      } else {
+        // the successors are either on the same loop level or loop exits
+        for (const auto *SuccBlock : successors(Block)) {
+          visitSuccessor(*SuccBlock, ParentLoop, *DefBlock);
+        }
+      }
+    }
+
+    // We need to know the definition at the parent loop header to decide
+    // whether the definition at the header is different from the definition at
+    // the loop exits, which would indicate a divergent loop exits.
+    //
+    // A // loop header
+    // |
+    // B // nested loop header
+    // |
+    // C -> X (exit from B loop) -..-> (A latch)
+    // |
+    // D -> back to B (B latch)
+    // |
+    // proper exit from both loops
+    //
+    // D post-dominates B as it is the only proper exit from the "A loop".
+    // If C has a divergent branch, propagation will therefore stop at D.
+    // That implies that B will never receive a definition.
+    // But that definition can only be the same as at D (D itself in thise case)
+    // because all paths to anywhere have to pass through D.
+    //
+    const BasicBlock *ParentLoopHeader =
+        ParentLoop ? ParentLoop->getHeader() : nullptr;
+    if (ParentLoop && ParentLoop->contains(PdBoundBlock)) {
+      DefMap[ParentLoopHeader] = DefMap[PdBoundBlock];
+    }
+
+    // analyze reached loop exits
+    if (!ReachedLoopExits.empty()) {
+      assert(ParentLoop);
+      const auto *HeaderDefBlock = DefMap[ParentLoopHeader];
+      LLVM_DEBUG(printDefs(dbgs()));
+      assert(HeaderDefBlock && "no definition in header of carrying loop");
+
+      for (const auto *ExitBlock : ReachedLoopExits) {
+        auto ItExitDef = DefMap.find(ExitBlock);
+        assert((ItExitDef != DefMap.end()) &&
+               "no reaching def at reachable loop exit");
+        if (ItExitDef->second != HeaderDefBlock) {
+          JoinBlocks->insert(ExitBlock);
+        }
+      }
+    }
+
+    return std::move(JoinBlocks);
+  }
+};
+
+const ConstBlockSet &SyncDependenceAnalysis::join_blocks(const Loop &Loop) {
+  using LoopExitVec = SmallVector<BasicBlock *, 4>;
+  LoopExitVec LoopExits;
+  Loop.getExitBlocks(LoopExits);
+  if (LoopExits.size() < 1) {
+    return EmptyBlockSet;
+  }
+
+  // already available in cache?
+  auto ItCached = CachedLoopExitJoins.find(&Loop);
+  if (ItCached != CachedLoopExitJoins.end())
+    return *ItCached->second;
+
+  // compute all join points
+  DivergencePropagator Propagator{FuncRPOT, DT, PDT, LI};
+  auto JoinBlocks = Propagator.computeJoinPoints<const LoopExitVec &>(
+      *Loop.getHeader(), LoopExits, Loop.getParentLoop());
+
+  auto ItInserted = CachedLoopExitJoins.emplace(&Loop, std::move(JoinBlocks));
+  assert(ItInserted.second);
+  return *ItInserted.first->second;
+}
+
+const ConstBlockSet &
+SyncDependenceAnalysis::join_blocks(const TerminatorInst &Term) {
+  // trivial case
+  if (Term.getNumSuccessors() < 1) {
+    return EmptyBlockSet;
+  }
+
+  // already available in cache?
+  auto ItCached = CachedBranchJoins.find(&Term);
+  if (ItCached != CachedBranchJoins.end())
+    return *ItCached->second;
+
+  // compute all join points
+  DivergencePropagator Propagator{FuncRPOT, DT, PDT, LI};
+  const auto &TermBlock = *Term.getParent();
+  auto JoinBlocks = Propagator.computeJoinPoints<succ_const_range>(
+      TermBlock, successors(Term.getParent()), LI.getLoopFor(&TermBlock));
+
+  auto ItInserted = CachedBranchJoins.emplace(&Term, std::move(JoinBlocks));
+  assert(ItInserted.second);
+  return *ItInserted.first->second;
+}
+
+} // namespace llvm
diff --git a/unittests/Analysis/CMakeLists.txt b/unittests/Analysis/CMakeLists.txt
index cf1c072fdc3..7d4fd33716e 100644
--- a/unittests/Analysis/CMakeLists.txt
+++ b/unittests/Analysis/CMakeLists.txt
@@ -14,6 +14,7 @@ add_llvm_unittest(AnalysisTests
   CallGraphTest.cpp
   CFGTest.cpp
   CGSCCPassManagerTest.cpp
+  DivergenceAnalysisTest.cpp
   GlobalsModRefTest.cpp
   ValueLatticeTest.cpp
   LazyCallGraphTest.cpp
diff --git a/unittests/Analysis/DivergenceAnalysisTest.cpp b/unittests/Analysis/DivergenceAnalysisTest.cpp
new file mode 100644
index 00000000000..8afd4bf4e66
--- /dev/null
+++ b/unittests/Analysis/DivergenceAnalysisTest.cpp
@@ -0,0 +1,431 @@
+//===- DivergenceAnalysisTest.cpp - DivergenceAnalysis unit tests ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/SyncDependenceAnalysis.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/AsmParser/Parser.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Support/SourceMgr.h"
+#include "gtest/gtest.h"
+
+namespace llvm {
+namespace {
+
+BasicBlock *GetBlockByName(StringRef BlockName, Function &F) {
+  for (auto &BB : F) {
+    if (BB.getName() != BlockName)
+      continue;
+    return &BB;
+  }
+  return nullptr;
+}
+
+// We use this fixture to ensure that we clean up DivergenceAnalysis before
+// deleting the PassManager.
+class DivergenceAnalysisTest : public testing::Test {
+protected:
+  LLVMContext Context;
+  Module M;
+  TargetLibraryInfoImpl TLII;
+  TargetLibraryInfo TLI;
+
+  std::unique_ptr<DominatorTree> DT;
+  std::unique_ptr<PostDominatorTree> PDT;
+  std::unique_ptr<LoopInfo> LI;
+  std::unique_ptr<SyncDependenceAnalysis> SDA;
+
+  DivergenceAnalysisTest() : M("", Context), TLII(), TLI(TLII) {}
+
+  DivergenceAnalysis buildDA(Function &F, bool IsLCSSA) {
+    DT.reset(new DominatorTree(F));
+    PDT.reset(new PostDominatorTree(F));
+    LI.reset(new LoopInfo(*DT));
+    SDA.reset(new SyncDependenceAnalysis(*DT, *PDT, *LI));
+    return DivergenceAnalysis(F, nullptr, *DT, *LI, *SDA, IsLCSSA);
+  }
+
+  void runWithDA(
+      Module &M, StringRef FuncName, bool IsLCSSA,
+      function_ref<void(Function &F, LoopInfo &LI, DivergenceAnalysis &DA)>
+          Test) {
+    auto *F = M.getFunction(FuncName);
+    ASSERT_NE(F, nullptr) << "Could not find " << FuncName;
+    DivergenceAnalysis DA = buildDA(*F, IsLCSSA);
+    Test(*F, *LI, DA);
+  }
+};
+
+// Simple initial state test
+TEST_F(DivergenceAnalysisTest, DAInitialState) {
+  IntegerType *IntTy = IntegerType::getInt32Ty(Context);
+  FunctionType *FTy =
+      FunctionType::get(Type::getVoidTy(Context), {IntTy}, false);
+  Function *F = cast<Function>(M.getOrInsertFunction("f", FTy));
+  BasicBlock *BB = BasicBlock::Create(Context, "entry", F);
+  ReturnInst::Create(Context, nullptr, BB);
+
+  DivergenceAnalysis DA = buildDA(*F, false);
+
+  // Whole function region
+  EXPECT_EQ(DA.getRegionLoop(), nullptr);
+
+  // No divergence in initial state
+  EXPECT_FALSE(DA.hasDetectedDivergence());
+
+  // No spurious divergence
+  DA.compute();
+  EXPECT_FALSE(DA.hasDetectedDivergence());
+
+  // Detected divergence after marking
+  Argument &arg = *F->arg_begin();
+  DA.markDivergent(arg);
+
+  EXPECT_TRUE(DA.hasDetectedDivergence());
+  EXPECT_TRUE(DA.isDivergent(arg));
+
+  DA.compute();
+  EXPECT_TRUE(DA.hasDetectedDivergence());
+  EXPECT_TRUE(DA.isDivergent(arg));
+}
+
+TEST_F(DivergenceAnalysisTest, DANoLCSSA) {
+  LLVMContext C;
+  SMDiagnostic Err;
+
+  std::unique_ptr<Module> M = parseAssemblyString(
+      "target datalayout = \"e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128\" "
+      " "
+      "define i32 @f_1(i8* nocapture %arr, i32 %n, i32* %A, i32* %B) "
+      "    local_unnamed_addr { "
+      "entry: "
+      "  br label %loop.ph "
+      " "
+      "loop.ph: "
+      "  br label %loop "
+      " "
+      "loop: "
+      "  %iv0 = phi i32 [ %iv0.inc, %loop ], [ 0, %loop.ph ] "
+      "  %iv1 = phi i32 [ %iv1.inc, %loop ], [ -2147483648, %loop.ph ] "
+      "  %iv0.inc = add i32 %iv0, 1 "
+      "  %iv1.inc = add i32 %iv1, 3 "
+      "  %cond.cont = icmp slt i32 %iv0, %n "
+      "  br i1 %cond.cont, label %loop, label %for.end.loopexit "
+      " "
+      "for.end.loopexit: "
+      "  ret i32 %iv0 "
+      "} ",
+      Err, C);
+
+  Function *F = M->getFunction("f_1");
+  DivergenceAnalysis DA = buildDA(*F, false);
+  EXPECT_FALSE(DA.hasDetectedDivergence());
+
+  auto ItArg = F->arg_begin();
+  ItArg++;
+  auto &NArg = *ItArg;
+
+  // Seed divergence in argument %n
+  DA.markDivergent(NArg);
+
+  DA.compute();
+  EXPECT_TRUE(DA.hasDetectedDivergence());
+
+  // Verify that "ret %iv.0" is divergent
+  auto ItBlock = F->begin();
+  std::advance(ItBlock, 3);
+  auto &ExitBlock = *GetBlockByName("for.end.loopexit", *F);
+  auto &RetInst = *cast<ReturnInst>(ExitBlock.begin());
+  EXPECT_TRUE(DA.isDivergent(RetInst));
+}
+
+TEST_F(DivergenceAnalysisTest, DALCSSA) {
+  LLVMContext C;
+  SMDiagnostic Err;
+
+  std::unique_ptr<Module> M = parseAssemblyString(
+      "target datalayout = \"e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128\" "
+      " "
+      "define i32 @f_lcssa(i8* nocapture %arr, i32 %n, i32* %A, i32* %B) "
+      "    local_unnamed_addr { "
+      "entry: "
+      "  br label %loop.ph "
+      " "
+      "loop.ph: "
+      "  br label %loop "
+      " "
+      "loop: "
+      "  %iv0 = phi i32 [ %iv0.inc, %loop ], [ 0, %loop.ph ] "
+      "  %iv1 = phi i32 [ %iv1.inc, %loop ], [ -2147483648, %loop.ph ] "
+      "  %iv0.inc = add i32 %iv0, 1 "
+      "  %iv1.inc = add i32 %iv1, 3 "
+      "  %cond.cont = icmp slt i32 %iv0, %n "
+      "  br i1 %cond.cont, label %loop, label %for.end.loopexit "
+      " "
+      "for.end.loopexit: "
+      "  %val.ret = phi i32 [ %iv0, %loop ] "
+      "  br label %detached.return "
+      " "
+      "detached.return: "
+      "  ret i32 %val.ret "
+      "} ",
+      Err, C);
+
+  Function *F = M->getFunction("f_lcssa");
+  DivergenceAnalysis DA = buildDA(*F, true);
+  EXPECT_FALSE(DA.hasDetectedDivergence());
+
+  auto ItArg = F->arg_begin();
+  ItArg++;
+  auto &NArg = *ItArg;
+
+  // Seed divergence in argument %n
+  DA.markDivergent(NArg);
+
+  DA.compute();
+  EXPECT_TRUE(DA.hasDetectedDivergence());
+
+  // Verify that "ret %iv.0" is divergent
+  auto ItBlock = F->begin();
+  std::advance(ItBlock, 4);
+  auto &ExitBlock = *GetBlockByName("detached.return", *F);
+  auto &RetInst = *cast<ReturnInst>(ExitBlock.begin());
+  EXPECT_TRUE(DA.isDivergent(RetInst));
+}
+
+TEST_F(DivergenceAnalysisTest, DAJoinDivergence) {
+  LLVMContext C;
+  SMDiagnostic Err;
+
+  std::unique_ptr<Module> M = parseAssemblyString(
+      "target datalayout = \"e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128\" "
+      " "
+      "define void @f_1(i1 %a, i1 %b, i1 %c) "
+      "    local_unnamed_addr { "
+      "A: "
+      "  br i1 %a, label %B, label %C "
+      " "
+      "B: "
+      "  br i1 %b, label %C, label %D "
+      " "
+      "C: "
+      "  %c.join = phi i32 [ 0, %A ], [ 1, %B ] "
+      "  br i1 %c, label %D, label %E "
+      " "
+      "D: "
+      "  %d.join = phi i32 [ 0, %B ], [ 1, %C ] "
+      "  br label %E "
+      " "
+      "E: "
+      "  %e.join = phi i32 [ 0, %C ], [ 1, %D ] "
+      "  ret void "
+      "} "
+      " "
+      "define void @f_2(i1 %a, i1 %b, i1 %c) "
+      "    local_unnamed_addr { "
+      "A: "
+      "  br i1 %a, label %B, label %E "
+      " "
+      "B: "
+      "  br i1 %b, label %C, label %D "
+      " "
+      "C: "
+      "  br label %D "
+      " "
+      "D: "
+      "  %d.join = phi i32 [ 0, %B ], [ 1, %C ] "
+      "  br label %E "
+      " "
+      "E: "
+      "  %e.join = phi i32 [ 0, %A ], [ 1, %D ] "
+      "  ret void "
+      "} "
+      " "
+      "define void @f_3(i1 %a, i1 %b, i1 %c)"
+      "    local_unnamed_addr { "
+      "A: "
+      "  br i1 %a, label %B, label %C "
+      " "
+      "B: "
+      "  br label %C "
+      " "
+      "C: "
+      "  %c.join = phi i32 [ 0, %A ], [ 1, %B ] "
+      "  br i1 %c, label %D, label %E "
+      " "
+      "D: "
+      "  br label %E "
+      " "
+      "E: "
+      "  %e.join = phi i32 [ 0, %C ], [ 1, %D ] "
+      "  ret void "
+      "} ",
+      Err, C);
+
+  // Maps divergent conditions to the basic blocks whose Phi nodes become
+  // divergent. Blocks need to be listed in IR order.
+  using SmallBlockVec = SmallVector<const BasicBlock *, 4>;
+  using InducedDivJoinMap = std::map<const Value *, SmallBlockVec>;
+
+  // Actual function performing the checks.
+  auto CheckDivergenceFunc = [this](Function &F,
+                                    InducedDivJoinMap &ExpectedDivJoins) {
+    for (auto &ItCase : ExpectedDivJoins) {
+      auto *DivVal = ItCase.first;
+      auto DA = buildDA(F, false);
+      DA.markDivergent(*DivVal);
+      DA.compute();
+
+      // List of basic blocks that shall host divergent Phi nodes.
+      auto ItDivJoins = ItCase.second.begin();
+
+      for (auto &BB : F) {
+        auto *Phi = dyn_cast<PHINode>(BB.begin());
+        if (!Phi)
+          continue;
+
+        if (&BB == *ItDivJoins) {
+          EXPECT_TRUE(DA.isDivergent(*Phi));
+          // Advance to next block with expected divergent PHI node.
+          ++ItDivJoins;
+        } else {
+          EXPECT_FALSE(DA.isDivergent(*Phi));
+        }
+      }
+    }
+  };
+
+  {
+    auto *F = M->getFunction("f_1");
+    auto ItBlocks = F->begin();
+    ItBlocks++; // Skip A
+    ItBlocks++; // Skip B
+    auto *C = &*ItBlocks++;
+    auto *D = &*ItBlocks++;
+    auto *E = &*ItBlocks;
+
+    auto ItArg = F->arg_begin();
+    auto *AArg = &*ItArg++;
+    auto *BArg = &*ItArg++;
+    auto *CArg = &*ItArg;
+
+    InducedDivJoinMap DivJoins;
+    DivJoins.emplace(AArg, SmallBlockVec({C, D, E}));
+    DivJoins.emplace(BArg, SmallBlockVec({D, E}));
+    DivJoins.emplace(CArg, SmallBlockVec({E}));
+
+    CheckDivergenceFunc(*F, DivJoins);
+  }
+
+  {
+    auto *F = M->getFunction("f_2");
+    auto ItBlocks = F->begin();
+    ItBlocks++; // Skip A
+    ItBlocks++; // Skip B
+    ItBlocks++; // Skip C
+    auto *D = &*ItBlocks++;
+    auto *E = &*ItBlocks;
+
+    auto ItArg = F->arg_begin();
+    auto *AArg = &*ItArg++;
+    auto *BArg = &*ItArg++;
+    auto *CArg = &*ItArg;
+
+    InducedDivJoinMap DivJoins;
+    DivJoins.emplace(AArg, SmallBlockVec({E}));
+    DivJoins.emplace(BArg, SmallBlockVec({D}));
+    DivJoins.emplace(CArg, SmallBlockVec({}));
+
+    CheckDivergenceFunc(*F, DivJoins);
+  }
+
+  {
+    auto *F = M->getFunction("f_3");
+    auto ItBlocks = F->begin();
+    ItBlocks++; // Skip A
+    ItBlocks++; // Skip B
+    auto *C = &*ItBlocks++;
+    ItBlocks++; // Skip D
+    auto *E = &*ItBlocks;
+
+    auto ItArg = F->arg_begin();
+    auto *AArg = &*ItArg++;
+    auto *BArg = &*ItArg++;
+    auto *CArg = &*ItArg;
+
+    InducedDivJoinMap DivJoins;
+    DivJoins.emplace(AArg, SmallBlockVec({C}));
+    DivJoins.emplace(BArg, SmallBlockVec({}));
+    DivJoins.emplace(CArg, SmallBlockVec({E}));
+
+    CheckDivergenceFunc(*F, DivJoins);
+  }
+}
+
+TEST_F(DivergenceAnalysisTest, DASwitchUnreachableDefault) {
+  LLVMContext C;
+  SMDiagnostic Err;
+
+  std::unique_ptr<Module> M = parseAssemblyString(
+      "target datalayout = \"e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128\" "
+      " "
+      "define void @switch_unreachable_default(i32 %cond) local_unnamed_addr { "
+      "entry: "
+      "  switch i32 %cond, label %sw.default [ "
+      "    i32 0, label %sw.bb0 "
+      "    i32 1, label %sw.bb1 "
+      "  ] "
+      " "
+      "sw.bb0: "
+      "  br label %sw.epilog "
+      " "
+      "sw.bb1: "
+      "  br label %sw.epilog "
+      " "
+      "sw.default: "
+      "  unreachable "
+      " "
+      "sw.epilog: "
+      "  %div.dbl = phi double [ 0.0, %sw.bb0], [ -1.0, %sw.bb1 ] "
+      "  ret void "
+      "}",
+      Err, C);
+
+  auto *F = M->getFunction("switch_unreachable_default");
+  auto &CondArg = *F->arg_begin();
+  auto DA = buildDA(*F, false);
+
+  EXPECT_FALSE(DA.hasDetectedDivergence());
+
+  DA.markDivergent(CondArg);
+  DA.compute();
+
+  // Still %CondArg is divergent.
+  EXPECT_TRUE(DA.hasDetectedDivergence());
+
+  // The join uni.dbl is not divergent (see D52221)
+  auto &ExitBlock = *GetBlockByName("sw.epilog", *F);
+  auto &DivDblPhi = *cast<PHINode>(ExitBlock.begin());
+  EXPECT_TRUE(DA.isDivergent(DivDblPhi));
+}
+
+} // end anonymous namespace
+} // end namespace llvm
-- 
GitLab


From c7fe2166e90fd8b96b791042a0f0b14455059444 Mon Sep 17 00:00:00 2001
From: Nicolai Haehnle <nhaehnle@gmail.com>
Date: Thu, 18 Oct 2018 12:54:39 +0000
Subject: [PATCH 0303/1116] DivergenceAnalysisTest: fix use of uninitialized
 memory

Thanks to Simon Moll for chasing it down.

Change-Id: If188f07c4aaec217f40a7a2ca029818f9202f1cb

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344738 91177308-0d34-0410-b5e6-96231b3b80d8
---
 unittests/Analysis/DivergenceAnalysisTest.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unittests/Analysis/DivergenceAnalysisTest.cpp b/unittests/Analysis/DivergenceAnalysisTest.cpp
index 8afd4bf4e66..97dbd18af51 100644
--- a/unittests/Analysis/DivergenceAnalysisTest.cpp
+++ b/unittests/Analysis/DivergenceAnalysisTest.cpp
@@ -302,7 +302,7 @@ TEST_F(DivergenceAnalysisTest, DAJoinDivergence) {
         if (!Phi)
           continue;
 
-        if (&BB == *ItDivJoins) {
+        if (ItDivJoins != ItCase.second.end() && &BB == *ItDivJoins) {
           EXPECT_TRUE(DA.isDivergent(*Phi));
           // Advance to next block with expected divergent PHI node.
           ++ItDivJoins;
-- 
GitLab


From 9e75857c9293153e2dfea8c8bbd0cba2d24f6614 Mon Sep 17 00:00:00 2001
From: Ayal Zaks <ayal.zaks@intel.com>
Date: Thu, 18 Oct 2018 15:03:15 +0000
Subject: [PATCH 0304/1116] [LV] Fold tail by masking to vectorize loops of
 arbitrary trip count under opt for size

When optimizing for size, a loop is vectorized only if the resulting vector loop
completely replaces the original scalar loop. This holds if no runtime guards
are needed, if the original trip-count TC does not overflow, and if TC is a
known constant that is a multiple of the VF. The last two TC-related conditions
can be overcome by
1. rounding the trip-count of the vector loop up from TC to a multiple of VF;
2. masking the vector body under a newly introduced "if (i <= TC-1)" condition.

The patch allows loops with arbitrary trip counts to be vectorized under -Os,
subject to the existing cost model considerations. It also applies to loops with
small trip counts (under -O2) which are currently handled as if under -Os.

The patch does not handle loops with reductions, live-outs, or w/o a primary
induction variable, and disallows interleave groups.

(Third, final and main part of -)
Differential Revision: https://reviews.llvm.org/D50480


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344743 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/VectorUtils.h           |  21 ++-
 .../Vectorize/LoopVectorizationLegality.h     |   4 +
 .../Vectorize/LoopVectorizationLegality.cpp   |  55 ++++++
 lib/Transforms/Vectorize/LoopVectorize.cpp    | 126 +++++++++----
 lib/Transforms/Vectorize/VPlan.cpp            |  24 ++-
 lib/Transforms/Vectorize/VPlan.h              |  21 ++-
 test/Transforms/LoopVectorize/X86/optsize.ll  |  85 ++++++++-
 .../LoopVectorize/X86/small-size.ll           | 172 ++++++++++++++++--
 .../X86/vect.omp.force.small-tc.ll            |  47 ++++-
 9 files changed, 489 insertions(+), 66 deletions(-)

diff --git a/include/llvm/Analysis/VectorUtils.h b/include/llvm/Analysis/VectorUtils.h
index 2ac49f67662..937a52fb968 100644
--- a/include/llvm/Analysis/VectorUtils.h
+++ b/include/llvm/Analysis/VectorUtils.h
@@ -345,20 +345,29 @@ public:
                         const LoopAccessInfo *LAI)
       : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(LAI) {}
 
-  ~InterleavedAccessInfo() {
+  ~InterleavedAccessInfo() { reset(); }
+
+  /// Analyze the interleaved accesses and collect them in interleave
+  /// groups. Substitute symbolic strides using \p Strides.
+  /// Consider also predicated loads/stores in the analysis if
+  /// \p EnableMaskedInterleavedGroup is true.
+  void analyzeInterleaving(bool EnableMaskedInterleavedGroup);
+
+  /// Invalidate groups, e.g., in case all blocks in loop will be predicated
+  /// contrary to original assumption. Although we currently prevent group
+  /// formation for predicated accesses, we may be able to relax this limitation
+  /// in the future once we handle more complicated blocks.
+  void reset() {
     SmallPtrSet<InterleaveGroup *, 4> DelSet;
     // Avoid releasing a pointer twice.
     for (auto &I : InterleaveGroupMap)
       DelSet.insert(I.second);
     for (auto *Ptr : DelSet)
       delete Ptr;
+    InterleaveGroupMap.clear();
+    RequiresScalarEpilogue = false;
   }
 
-  /// Analyze the interleaved accesses and collect them in interleave
-  /// groups. Substitute symbolic strides using \p Strides.
-  /// Consider also predicated loads/stores in the analysis if
-  /// \p EnableMaskedInterleavedGroup is true.
-  void analyzeInterleaving(bool EnableMaskedInterleavedGroup);
 
   /// Check if \p Instr belongs to any interleave group.
   bool isInterleaved(Instruction *Instr) const {
diff --git a/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index 2a6242099b2..ceb660daa28 100644
--- a/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -241,6 +241,10 @@ public:
   /// If false, good old LV code.
   bool canVectorize(bool UseVPlanNativePath);
 
+  /// Return true if we can vectorize this loop while folding its tail by
+  /// masking.
+  bool canFoldTailByMasking();
+
   /// Returns the primary induction variable.
   PHINode *getPrimaryInduction() { return PrimaryInduction; }
 
diff --git a/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index bde90a71b41..755ad32a7bf 100644
--- a/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1134,4 +1134,59 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
   return Result;
 }
 
+bool LoopVectorizationLegality::canFoldTailByMasking() {
+
+  LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");
+
+  if (!PrimaryInduction) {
+    ORE->emit(createMissedAnalysis("NoPrimaryInduction")
+              << "Missing a primary induction variable in the loop, which is "
+              << "needed in order to fold tail by masking as required.");
+    LLVM_DEBUG(dbgs() << "LV: No primary induction, cannot fold tail by "
+                      << "masking.\n");
+    return false;
+  }
+
+  // TODO: handle reductions when tail is folded by masking.
+  if (!Reductions.empty()) {
+    ORE->emit(createMissedAnalysis("ReductionFoldingTailByMasking")
+              << "Cannot fold tail by masking in the presence of reductions.");
+    LLVM_DEBUG(dbgs() << "LV: Loop has reductions, cannot fold tail by "
+                      << "masking.\n");
+    return false;
+  }
+
+  // TODO: handle outside users when tail is folded by masking.
+  for (auto *AE : AllowedExit) {
+    // Check that all users of allowed exit values are inside the loop.
+    for (User *U : AE->users()) {
+      Instruction *UI = cast<Instruction>(U);
+      if (TheLoop->contains(UI))
+        continue;
+      ORE->emit(createMissedAnalysis("LiveOutFoldingTailByMasking")
+                << "Cannot fold tail by masking in the presence of live outs.");
+      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking, loop has an "
+                        << "outside user for : " << *UI << '\n');
+      return false;
+    }
+  }
+
+  // The list of pointers that we can safely read and write to remains empty.
+  SmallPtrSet<Value *, 8> SafePointers;
+
+  // Check and mark all blocks for predication, including those that ordinarily
+  // do not need predication such as the header block.
+  for (BasicBlock *BB : TheLoop->blocks()) {
+    if (!blockCanBePredicated(BB, SafePointers)) {
+      ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator())
+                << "control flow cannot be substituted for a select");
+      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking as required.\n");
+      return false;
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");
+  return true;
+}
+
 } // namespace llvm
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 5a11c5a54ae..a395183398d 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1105,7 +1105,7 @@ public:
   // through scalar predication or masked load/store or masked gather/scatter.
   // Superset of instructions that return true for isScalarWithPredication.
   bool isPredicatedInst(Instruction *I) {
-    if (!Legal->blockNeedsPredication(I->getParent()))
+    if (!blockNeedsPredication(I->getParent()))
       return false;
     // Loads and stores that need some form of masked operation are predicated
     // instructions.
@@ -1139,6 +1139,13 @@ public:
     return InterleaveInfo.requiresScalarEpilogue();
   }
 
+  /// Returns true if all loop blocks should be masked to fold tail loop.
+  bool foldTailByMasking() const { return FoldTailByMasking; }
+
+  bool blockNeedsPredication(BasicBlock *BB) {
+    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
+  }
+
 private:
   unsigned NumPredStores = 0;
 
@@ -1222,6 +1229,9 @@ private:
   /// vectorization as a predicated block.
   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
 
+  /// All blocks of loop are to be masked to fold tail of scalar iterations.
+  bool FoldTailByMasking = false;
+
   /// A map holding scalar costs for different vectorization factors. The
   /// presence of a cost for an instruction in the mapping indicates that the
   /// instruction will be scalarized when vectorizing with the associated
@@ -2339,6 +2349,7 @@ Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
   if (TripCount)
     return TripCount;
 
+  assert(L && "Create Trip Count for null loop.");
   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
   // Find the loop boundaries.
   ScalarEvolution *SE = PSE.getSE();
@@ -2388,12 +2399,26 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
   Value *TC = getOrCreateTripCount(L);
   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
 
+  Type *Ty = TC->getType();
+  Constant *Step = ConstantInt::get(Ty, VF * UF);
+
+  // If the tail is to be folded by masking, round the number of iterations N
+  // up to a multiple of Step instead of rounding down. This is done by first
+  // adding Step-1 and then rounding down. Note that it's ok if this addition
+  // overflows: the vector induction variable will eventually wrap to zero given
+  // that it starts at zero and its Step is a power of two; the loop will then
+  // exit, with the last early-exit vector comparison also producing all-true.
+  if (Cost->foldTailByMasking()) {
+    assert(isPowerOf2_32(VF * UF) &&
+           "VF*UF must be a power of 2 when folding tail by masking");
+    TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
+  }
+
   // Now we need to generate the expression for the part of the loop that the
   // vectorized body will execute. This is equal to N - (N % Step) if scalar
   // iterations are not required for correctness, or N - Step, otherwise. Step
   // is equal to the vectorization factor (number of SIMD elements) times the
   // unroll factor (number of SIMD instructions).
-  Constant *Step = ConstantInt::get(TC->getType(), VF * UF);
   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
 
   // If there is a non-reversed interleaved group that may speculatively access
@@ -2456,8 +2481,13 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
   // of zero. In this case we will also jump to the scalar loop.
   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
                                           : ICmpInst::ICMP_ULT;
-  Value *CheckMinIters = Builder.CreateICmp(
-      P, Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check");
+
+  // If tail is to be folded, vector loop takes care of all iterations.
+  Value *CheckMinIters = Builder.getFalse();
+  if (!Cost->foldTailByMasking())
+    CheckMinIters = Builder.CreateICmp(
+        P, Count, ConstantInt::get(Count->getType(), VF * UF),
+        "min.iters.check");
 
   BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
   // Update dominator tree immediately if the generated block is a
@@ -2486,6 +2516,7 @@ void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
     if (C->isZero())
       return;
 
+  assert(!Cost->foldTailByMasking() && "Cannot check stride when folding tail");
   // Create a new block containing the stride check.
   BB->setName("vector.scevcheck");
   auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
@@ -2518,6 +2549,7 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
   if (!MemRuntimeCheck)
     return;
 
+  assert(!Cost->foldTailByMasking() && "Cannot check memory when folding tail");
   // Create a new block containing the memory check.
   BB->setName("vector.memcheck");
   auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
@@ -2786,9 +2818,12 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
   // Add a check in the middle block to see if we have completed
   // all of the iterations in the first vector loop.
   // If (N - N%VF) == N, then we *don't* need to run the remainder.
-  Value *CmpN =
-      CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
-                      CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
+  // If tail is to be folded, we know we don't need to run the remainder.
+  Value *CmpN = Builder.getTrue();
+  if (!Cost->foldTailByMasking())
+    CmpN =
+        CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
+                        CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
   ReplaceInstWithInst(MiddleBlock->getTerminator(),
                       BranchInst::Create(ExitBlock, ScalarPH, CmpN));
 
@@ -4262,7 +4297,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
 }
 
 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
-  if (!Legal->blockNeedsPredication(I->getParent()))
+  if (!blockNeedsPredication(I->getParent()))
     return false;
   switch(I->getOpcode()) {
   default:
@@ -4564,36 +4599,36 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
     return None;
   }
 
-  // If we don't know the precise trip count, don't try to vectorize.
+  unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC);
+
+  if (TC > 0 && TC % MaxVF == 0) {
+    LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
+    return MaxVF;
+  }
+
+  // If we don't know the precise trip count, or if the trip count that we
+  // found modulo the vectorization factor is not zero, try to fold the tail
+  // by masking.
+  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
+  // FIXME: return None if loop requiresScalarEpilog(<MaxVF>), or look for a
+  //        smaller MaxVF that does not require a scalar epilog.
+  if (Legal->canFoldTailByMasking()) {
+    FoldTailByMasking = true;
+    return MaxVF;
+  }
+
   if (TC == 0) {
     ORE->emit(
         createMissedAnalysis("UnknownLoopCountComplexCFG")
         << "unable to calculate the loop count due to complex control flow");
-    LLVM_DEBUG(
-        dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
     return None;
   }
 
-  unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC);
-
-  if (TC % MaxVF != 0) {
-    // If the trip count that we found modulo the vectorization factor is not
-    // zero then we require a tail.
-    // FIXME: look for a smaller MaxVF that does divide TC rather than give up.
-    // FIXME: return None if loop requiresScalarEpilog(<MaxVF>), or look for a
-    //        smaller MaxVF that does not require a scalar epilog.
-
-    ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize")
-              << "cannot optimize for size and vectorize at the "
-                 "same time. Enable vectorization of this loop "
-                 "with '#pragma clang loop vectorize(enable)' "
-                 "when compiling with -Os/-Oz");
-    LLVM_DEBUG(
-        dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
-    return None;
-  }
-
-  return MaxVF;
+  ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize")
+            << "cannot optimize for size and vectorize at the same time. "
+               "Enable vectorization of this loop with '#pragma clang loop "
+               "vectorize(enable)' when compiling with -Os/-Oz");
+  return None;
 }
 
 unsigned
@@ -4831,6 +4866,9 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
   // fit without causing spills. All of this is rounded down if necessary to be
   // a power of two. We want power of two interleave count to simplify any
   // addressing operations or alignment considerations.
+  // We also want power of two interleave counts to ensure that the induction
+  // variable of the vector loop wraps to zero, when tail is folded by masking;
+  // this currently happens when OptForSize, in which case IC is set to 1 above.
   unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
                               R.MaxLocalUsers);
 
@@ -5117,7 +5155,7 @@ void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
   // determine if it would be better to not if-convert the blocks they are in.
   // If so, we also record the instructions to scalarize.
   for (BasicBlock *BB : TheLoop->blocks()) {
-    if (!Legal->blockNeedsPredication(BB))
+    if (!blockNeedsPredication(BB))
       continue;
     for (Instruction &I : *BB)
       if (isScalarWithPredication(&I)) {
@@ -5282,7 +5320,7 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) {
     // unconditionally executed. For the scalar case, we may not always execute
     // the predicated block. Thus, scale the block's cost by the probability of
     // executing it.
-    if (VF == 1 && Legal->blockNeedsPredication(BB))
+    if (VF == 1 && blockNeedsPredication(BB))
       BlockCost.first /= getReciprocalPredBlockProb();
 
     Cost.first += BlockCost.first;
@@ -5973,6 +6011,10 @@ LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
   if (!MaybeMaxVF.hasValue()) // Cases considered too costly to vectorize.
     return NoVectorization;
 
+  // Invalidate interleave groups if all blocks of loop will be predicated.
+  if (CM.blockNeedsPredication(OrigLoop->getHeader()))
+    CM.InterleaveInfo.reset();
+
   if (UserVF) {
     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
@@ -6029,6 +6071,7 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
                          DT,     ILV.Builder, ILV.VectorLoopValueMap,
                          &ILV,   CallbackILV};
   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
+  State.TripCount = ILV.getOrCreateTripCount(nullptr);
 
   //===------------------------------------------------===//
   //
@@ -6209,9 +6252,17 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
   // load/store/gather/scatter. Initialize BlockMask to no-mask.
   VPValue *BlockMask = nullptr;
 
-  // Loop incoming mask is all-one.
-  if (OrigLoop->getHeader() == BB)
+  if (OrigLoop->getHeader() == BB) {
+    if (!CM.blockNeedsPredication(BB))
+      return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
+
+    // Introduce the early-exit compare IV <= BTC to form header block mask.
+    // This is used instead of IV < TC because TC may wrap, unlike BTC.
+    VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
+    VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
+    BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
     return BlockMaskCache[BB] = BlockMask;
+  }
 
   // This is the block mask. We OR all incoming edges.
   for (auto *Predecessor : predecessors(BB)) {
@@ -6577,6 +6628,11 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
       NeedDef.insert(Branch->getCondition());
   }
 
+  // If the tail is to be folded by masking, the primary induction variable
+  // needs to be represented in VPlan for it to model early-exit masking.
+  if (CM.foldTailByMasking())
+    NeedDef.insert(Legal->getPrimaryInduction());
+
   // Collect instructions from the original loop that will become trivially dead
   // in the vectorized loop. We don't need to vectorize these instructions. For
   // example, original induction update instructions can become dead because we
diff --git a/lib/Transforms/Vectorize/VPlan.cpp b/lib/Transforms/Vectorize/VPlan.cpp
index 39cb4e9ec68..a3c15a36b05 100644
--- a/lib/Transforms/Vectorize/VPlan.cpp
+++ b/lib/Transforms/Vectorize/VPlan.cpp
@@ -303,6 +303,13 @@ void VPInstruction::generateInstruction(VPTransformState &State,
     State.set(this, V, Part);
     break;
   }
+  case VPInstruction::ICmpULE: {
+    Value *IV = State.get(getOperand(0), Part);
+    Value *TC = State.get(getOperand(1), Part);
+    Value *V = Builder.CreateICmpULE(IV, TC);
+    State.set(this, V, Part);
+    break;
+  }
   default:
     llvm_unreachable("Unsupported opcode for instruction");
   }
@@ -328,6 +335,9 @@ void VPInstruction::print(raw_ostream &O) const {
   case VPInstruction::Not:
     O << "not";
     break;
+  case VPInstruction::ICmpULE:
+    O << "icmp ule";
+    break;
   default:
     O << Instruction::getOpcodeName(getOpcode());
   }
@@ -342,6 +352,15 @@ void VPInstruction::print(raw_ostream &O) const {
 /// LoopVectorBody basic-block was created for this. Introduce additional
 /// basic-blocks as needed, and fill them all.
 void VPlan::execute(VPTransformState *State) {
+  // -1. Check if the backedge taken count is needed, and if so build it.
+  if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) {
+    Value *TC = State->TripCount;
+    IRBuilder<> Builder(State->CFG.PrevBB->getTerminator());
+    auto *TCMO = Builder.CreateSub(TC, ConstantInt::get(TC->getType(), 1),
+                                   "trip.count.minus.1");
+    Value2VPValue[TCMO] = BackedgeTakenCount;
+  }
+
   // 0. Set the reverse mapping from VPValues to Values for code generation.
   for (auto &Entry : Value2VPValue)
     State->VPValue2Value[Entry.second] = Entry.first;
@@ -469,8 +488,11 @@ void VPlanPrinter::dump() {
   OS << "graph [labelloc=t, fontsize=30; label=\"Vectorization Plan";
   if (!Plan.getName().empty())
     OS << "\\n" << DOT::EscapeString(Plan.getName());
-  if (!Plan.Value2VPValue.empty()) {
+  if (!Plan.Value2VPValue.empty() || Plan.BackedgeTakenCount) {
     OS << ", where:";
+    if (Plan.BackedgeTakenCount)
+      OS << "\\n"
+         << *Plan.getOrCreateBackedgeTakenCount() << " := BackedgeTakenCount";
     for (auto Entry : Plan.Value2VPValue) {
       OS << "\\n" << *Entry.second;
       OS << DOT::EscapeString(" := ");
diff --git a/lib/Transforms/Vectorize/VPlan.h b/lib/Transforms/Vectorize/VPlan.h
index 81b1986c97d..9daaea1acde 100644
--- a/lib/Transforms/Vectorize/VPlan.h
+++ b/lib/Transforms/Vectorize/VPlan.h
@@ -317,6 +317,9 @@ struct VPTransformState {
   /// Values they correspond to.
   VPValue2ValueTy VPValue2Value;
 
+  /// Hold the trip count of the scalar loop.
+  Value *TripCount = nullptr;
+
   /// Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
   InnerLoopVectorizer *ILV;
 
@@ -607,7 +610,7 @@ class VPInstruction : public VPUser, public VPRecipeBase {
 
 public:
   /// VPlan opcodes, extending LLVM IR with idiomatics instructions.
-  enum { Not = Instruction::OtherOpsEnd + 1 };
+  enum { Not = Instruction::OtherOpsEnd + 1, ICmpULE };
 
 private:
   typedef unsigned char OpcodeTy;
@@ -1115,6 +1118,10 @@ private:
   // (operators '==' and '<').
   SmallPtrSet<VPValue *, 16> VPExternalDefs;
 
+  /// Represents the backedge taken count of the original loop, for folding
+  /// the tail.
+  VPValue *BackedgeTakenCount = nullptr;
+
   /// Holds a mapping between Values and their corresponding VPValue inside
   /// VPlan.
   Value2VPValueTy Value2VPValue;
@@ -1132,7 +1139,10 @@ public:
     if (Entry)
       VPBlockBase::deleteCFG(Entry);
     for (auto &MapEntry : Value2VPValue)
-      delete MapEntry.second;
+      if (MapEntry.second != BackedgeTakenCount)
+        delete MapEntry.second;
+    if (BackedgeTakenCount)
+      delete BackedgeTakenCount; // Delete once, if in Value2VPValue or not.
     for (VPValue *Def : VPExternalDefs)
       delete Def;
     for (VPValue *CBV : VPCBVs)
@@ -1147,6 +1157,13 @@ public:
 
   VPBlockBase *setEntry(VPBlockBase *Block) { return Entry = Block; }
 
+  /// The backedge taken count of the original loop.
+  VPValue *getOrCreateBackedgeTakenCount() {
+    if (!BackedgeTakenCount)
+      BackedgeTakenCount = new VPValue();
+    return BackedgeTakenCount;
+  }
+
   void addVF(unsigned VF) { VFs.insert(VF); }
 
   bool hasVF(unsigned VF) { return VFs.count(VF); }
diff --git a/test/Transforms/LoopVectorize/X86/optsize.ll b/test/Transforms/LoopVectorize/X86/optsize.ll
index 057c72044d9..508823475ea 100644
--- a/test/Transforms/LoopVectorize/X86/optsize.ll
+++ b/test/Transforms/LoopVectorize/X86/optsize.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; This test verifies that the loop vectorizer will NOT vectorize loops that
 ; will produce a tail loop with the optimize for size or the minimize size
 ; attributes. This is a target-dependent version of the test.
@@ -9,7 +10,47 @@ target datalayout = "E-m:e-p:32:32-i64:32-f64:32:64-a:0:32-n32-S128"
 
 define i32 @foo_optsize() #0 {
 ; CHECK-LABEL: @foo_optsize(
-; CHECK-NOT: x i8>
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <64 x i32> undef, i32 [[INDEX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <64 x i32> [[BROADCAST_SPLATINSERT]], <64 x i32> undef, <64 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <64 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ule <64 x i32> [[INDUCTION]], <i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202>
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* [[TMP4]], i32 1, <64 x i1> [[TMP2]], <64 x i8> undef)
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq <64 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <64 x i1> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>*
+; CHECK-NEXT:    call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> [[TMP7]], <64 x i8>* [[TMP8]], i32 1, <64 x i1> [[TMP2]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 64
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP10]], 0
+; CHECK-NEXT:    [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
+; CHECK-NEXT:    store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_08]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !2
+; CHECK:       for.end:
+; CHECK-NEXT:    ret i32 0
+;
 
 entry:
   br label %for.body
@@ -33,7 +74,47 @@ attributes #0 = { optsize }
 
 define i32 @foo_minsize() #1 {
 ; CHECK-LABEL: @foo_minsize(
-; CHECK-NOT: x i8>
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <64 x i32> undef, i32 [[INDEX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <64 x i32> [[BROADCAST_SPLATINSERT]], <64 x i32> undef, <64 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <64 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ule <64 x i32> [[INDUCTION]], <i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202>
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* [[TMP4]], i32 1, <64 x i1> [[TMP2]], <64 x i8> undef)
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq <64 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <64 x i1> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>*
+; CHECK-NEXT:    call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> [[TMP7]], <64 x i8>* [[TMP8]], i32 1, <64 x i1> [[TMP2]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 64
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP10]], 0
+; CHECK-NEXT:    [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
+; CHECK-NEXT:    store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_08]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !5
+; CHECK:       for.end:
+; CHECK-NEXT:    ret i32 0
+;
 
 entry:
   br label %for.body
diff --git a/test/Transforms/LoopVectorize/X86/small-size.ll b/test/Transforms/LoopVectorize/X86/small-size.ll
index 8af7b2e7df9..2027963f071 100644
--- a/test/Transforms/LoopVectorize/X86/small-size.ll
+++ b/test/Transforms/LoopVectorize/X86/small-size.ll
@@ -68,11 +68,68 @@ define void @example1() optsize {
   ret void
 }
 
-; Can't vectorize in 'optsize' mode because we need a tail.
-;CHECK-LABEL: @example2(
-;CHECK-NOT: store <4 x i32>
-;CHECK: ret void
+; Can vectorize in 'optsize' mode by masking the needed tail.
 define void @example2(i32 %n, i32 %x) optsize {
+; CHECK-LABEL: @example2(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[TMP1]], label [[DOTLR_PH5_PREHEADER:%.*]], label [[DOTPREHEADER:%.*]]
+; CHECK:       .lr.ph5.preheader:
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i32 [[TMP2]] to i64
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add nuw nsw i64 [[TMP3]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = and i32 [[TMP2]], 3
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = zext i32 [[TMP4]] to i64
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub nsw i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> undef, i64 [[TMP3]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE8:%.*]] ]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = or i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP7:%.*]] = or i64 [[INDEX]], 3
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ule <4 x i64> [[INDUCTION]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    br i1 [[TMP9]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK:       pred.store.if:
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[INDEX]]
+; CHECK-NEXT:    store i32 [[X:%.*]], i32* [[TMP10]], align 16
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; CHECK:       pred.store.continue:
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i1> [[TMP8]], i32 1
+; CHECK-NEXT:    br i1 [[TMP11]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
+; CHECK:       pred.store.if3:
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP5]]
+; CHECK-NEXT:    store i32 [[X]], i32* [[TMP12]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE4]]
+; CHECK:       pred.store.continue4:
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i1> [[TMP8]], i32 2
+; CHECK-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
+; CHECK:       pred.store.if5:
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP6]]
+; CHECK-NEXT:    store i32 [[X]], i32* [[TMP14]], align 8
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE6]]
+; CHECK:       pred.store.continue6:
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i1> [[TMP8]], i32 3
+; CHECK-NEXT:    br i1 [[TMP15]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8]]
+; CHECK:       pred.store.if7:
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP7]]
+; CHECK-NEXT:    store i32 [[X]], i32* [[TMP16]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE8]]
+; CHECK:       pred.store.continue8:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[DOT_PREHEADER_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
+; CHECK:       ._crit_edge:
+; CHECK-NEXT:    ret void
+;
   %1 = icmp sgt i32 %n, 0
   br i1 %1, label %.lr.ph5, label %.preheader
 
@@ -113,7 +170,8 @@ define void @example2(i32 %n, i32 %x) optsize {
   ret void
 }
 
-; N is unknown, we need a tail. Can't vectorize.
+; N is unknown, we need a tail. Can't vectorize because loop has no primary
+; induction.
 ;CHECK-LABEL: @example3(
 ;CHECK-NOT: <4 x i32>
 ;CHECK: ret void
@@ -181,12 +239,12 @@ define void @example23b(i16* noalias nocapture %src, i32* noalias nocapture %dst
 ; CHECK-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[TMP7:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[TMP6:%.*]]
-; CHECK:         br i1 undef, label [[TMP7]], label [[TMP6]], !llvm.loop !5
+; CHECK:         br i1 undef, label [[TMP7]], label [[TMP6]], !llvm.loop !7
 ; CHECK:         ret void
 ;
   br label %1
@@ -209,11 +267,102 @@ define void @example23b(i16* noalias nocapture %src, i32* noalias nocapture %dst
   ret void
 }
 
-; We CAN'T vectorize this example because it would entail a tail.
+; We CAN vectorize this example by folding the tail it entails.
 define void @example23c(i16* noalias nocapture %src, i32* noalias nocapture %dst) optsize {
 ; CHECK-LABEL: @example23c(
-; CHECK-NOT: <4 x
-; CHECK: ret void
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE22:%.*]] ]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult <4 x i64> [[INDUCTION]], <i64 257, i64 257, i64 257, i64 257>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
+; CHECK-NEXT:    br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; CHECK:       pred.load.if:
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i16, i16* [[SRC:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load i16, i16* [[NEXT_GEP]], align 2
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; CHECK:       pred.load.continue:
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i16 [ undef, [[VECTOR_BODY]] ], [ [[TMP3]], [[PRED_LOAD_IF]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1
+; CHECK-NEXT:    br i1 [[TMP5]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]]
+; CHECK:       pred.load.if11:
+; CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[INDEX]], 1
+; CHECK-NEXT:    [[NEXT_GEP4:%.*]] = getelementptr i16, i16* [[SRC]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i16, i16* [[NEXT_GEP4]], align 2
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE12]]
+; CHECK:       pred.load.continue12:
+; CHECK-NEXT:    [[TMP8:%.*]] = phi i16 [ undef, [[PRED_LOAD_CONTINUE]] ], [ [[TMP7]], [[PRED_LOAD_IF11]] ]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
+; CHECK-NEXT:    br i1 [[TMP9]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]]
+; CHECK:       pred.load.if13:
+; CHECK-NEXT:    [[TMP10:%.*]] = or i64 [[INDEX]], 2
+; CHECK-NEXT:    [[NEXT_GEP5:%.*]] = getelementptr i16, i16* [[SRC]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load i16, i16* [[NEXT_GEP5]], align 2
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE14]]
+; CHECK:       pred.load.continue14:
+; CHECK-NEXT:    [[TMP12:%.*]] = phi i16 [ undef, [[PRED_LOAD_CONTINUE12]] ], [ [[TMP11]], [[PRED_LOAD_IF13]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3
+; CHECK-NEXT:    br i1 [[TMP13]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]]
+; CHECK:       pred.load.if15:
+; CHECK-NEXT:    [[TMP14:%.*]] = or i64 [[INDEX]], 3
+; CHECK-NEXT:    [[NEXT_GEP6:%.*]] = getelementptr i16, i16* [[SRC]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load i16, i16* [[NEXT_GEP6]], align 2
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE16]]
+; CHECK:       pred.load.continue16:
+; CHECK-NEXT:    [[TMP16:%.*]] = phi i16 [ undef, [[PRED_LOAD_CONTINUE14]] ], [ [[TMP15]], [[PRED_LOAD_IF15]] ]
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
+; CHECK-NEXT:    br i1 [[TMP17]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK:       pred.store.if:
+; CHECK-NEXT:    [[TMP18:%.*]] = zext i16 [[TMP4]] to i32
+; CHECK-NEXT:    [[TMP19:%.*]] = shl nuw nsw i32 [[TMP18]], 7
+; CHECK-NEXT:    [[NEXT_GEP7:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    store i32 [[TMP19]], i32* [[NEXT_GEP7]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; CHECK:       pred.store.continue:
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1
+; CHECK-NEXT:    br i1 [[TMP20]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]]
+; CHECK:       pred.store.if17:
+; CHECK-NEXT:    [[TMP21:%.*]] = zext i16 [[TMP8]] to i32
+; CHECK-NEXT:    [[TMP22:%.*]] = shl nuw nsw i32 [[TMP21]], 7
+; CHECK-NEXT:    [[TMP23:%.*]] = or i64 [[INDEX]], 1
+; CHECK-NEXT:    [[NEXT_GEP8:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP23]]
+; CHECK-NEXT:    store i32 [[TMP22]], i32* [[NEXT_GEP8]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE18]]
+; CHECK:       pred.store.continue18:
+; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
+; CHECK-NEXT:    br i1 [[TMP24]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]]
+; CHECK:       pred.store.if19:
+; CHECK-NEXT:    [[TMP25:%.*]] = zext i16 [[TMP12]] to i32
+; CHECK-NEXT:    [[TMP26:%.*]] = shl nuw nsw i32 [[TMP25]], 7
+; CHECK-NEXT:    [[TMP27:%.*]] = or i64 [[INDEX]], 2
+; CHECK-NEXT:    [[NEXT_GEP9:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP27]]
+; CHECK-NEXT:    store i32 [[TMP26]], i32* [[NEXT_GEP9]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE20]]
+; CHECK:       pred.store.continue20:
+; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3
+; CHECK-NEXT:    br i1 [[TMP28]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22]]
+; CHECK:       pred.store.if21:
+; CHECK-NEXT:    [[TMP29:%.*]] = zext i16 [[TMP16]] to i32
+; CHECK-NEXT:    [[TMP30:%.*]] = shl nuw nsw i32 [[TMP29]], 7
+; CHECK-NEXT:    [[TMP31:%.*]] = or i64 [[INDEX]], 3
+; CHECK-NEXT:    [[NEXT_GEP10:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP31]]
+; CHECK-NEXT:    store i32 [[TMP30]], i32* [[NEXT_GEP10]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE22]]
+; CHECK:       pred.store.continue22:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
+; CHECK-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[TMP34:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    br label [[TMP33:%.*]]
+; CHECK:         br i1 undef, label [[TMP34]], label [[TMP33]], !llvm.loop !9
+; CHECK:         ret void
+;
   br label %1
 
 ; <label>:1                                       ; preds = %1, %0
@@ -234,7 +383,8 @@ define void @example23c(i16* noalias nocapture %src, i32* noalias nocapture %dst
   ret void
 }
 
-; We CAN'T vectorize this example because it would entail a tail.
+; We CAN'T vectorize this example because it would entail a tail and an
+; induction is used outside the loop.
 define i64 @example23d(i16* noalias nocapture %src, i32* noalias nocapture %dst) optsize {
 ;CHECK-LABEL: @example23d(
 ; CHECK-NOT: <4 x
diff --git a/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll b/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
index 2db08b0363a..81f3113bf22 100644
--- a/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
+++ b/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
@@ -85,12 +85,41 @@ for.end:
 !2 = !{!"llvm.loop.vectorize.enable", i1 true}
 
 ;
-; This loop will not be vectorized as the trip count is below the threshold.
+; This loop will be vectorized as the trip count is below the threshold but no
+; scalar iterations are needed thanks to folding its tail.
 ;
-define void @not_vectorized(float* noalias nocapture %A, float* noalias nocapture readonly %B) {
-; CHECK-LABEL: @not_vectorized(
-; CHECK-NOT:   x float>
+define void @vectorized1(float* noalias nocapture %A, float* noalias nocapture readonly %B) {
+; CHECK-LABEL: @vectorized1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ule <8 x i64> [[INDUCTION]], <i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
+; CHECK-NEXT:    call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP7]], <8 x float>* [[TMP9]], i32 4, <8 x i1> [[TMP8]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
 ; CHECK:       for.end:
+; CHECK-NEXT:    ret void
 ;
 entry:
   br label %for.body
@@ -142,7 +171,7 @@ define void @vectorized2(float* noalias nocapture %A, float* noalias nocapture r
 ; CHECK-NEXT:    store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !7
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !9
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 16, 16
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -152,14 +181,14 @@ define void @vectorized2(float* noalias nocapture %A, float* noalias nocapture r
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP10:%.*]] = load float, float* [[ARRAYIDX]], align 4, !llvm.mem.parallel_loop_access !6
+; CHECK-NEXT:    [[TMP10:%.*]] = load float, float* [[ARRAYIDX]], align 4, !llvm.mem.parallel_loop_access !7
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP11:%.*]] = load float, float* [[ARRAYIDX2]], align 4, !llvm.mem.parallel_loop_access !6
+; CHECK-NEXT:    [[TMP11:%.*]] = load float, float* [[ARRAYIDX2]], align 4, !llvm.mem.parallel_loop_access !7
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP10]], [[TMP11]]
-; CHECK-NEXT:    store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.mem.parallel_loop_access !6
+; CHECK-NEXT:    store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.mem.parallel_loop_access !7
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 16
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !8
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !10
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
-- 
GitLab


From 39197f38a5a2ac0f67a224abf26d873ac2321dcc Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni <sgundapa@codeaurora.org>
Date: Thu, 18 Oct 2018 15:51:16 +0000
Subject: [PATCH 0305/1116] [Pipeliner] copyToPhi DAG Mutation to improve
 scheduling.

In a loop, create artificial dependences between the source of a
COPY/REG_SEQUENCE to the use in next iteration.

Eg:
SRC ----Data Dep--> COPY
COPY ---Anti Dep--> PHI (implies, to be used in next iteration)
PHI ----Data Dep--> USE

This patches creates
USE ----Artificial Dep---> SRC

This will effectively schedule the COPY late to eliminate additional copies.
Before this patch, the schedule can be
SRC, COPY, USE : The COPY is used in next iteration and it needs to be
preserved.

After this patch, the schedule can be
USE, SRC, COPY : The COPY is used in next iteration and the live interval is
reduced.

Differential Revision: https://reviews.llvm.org/D53303


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344748 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/MachinePipeliner.cpp          | 96 ++++++++++++++++++++++-
 test/CodeGen/Hexagon/swp-copytophi-dag.ll | 72 +++++++++++++++++
 2 files changed, 167 insertions(+), 1 deletion(-)
 create mode 100644 test/CodeGen/Hexagon/swp-copytophi-dag.ll

diff --git a/lib/CodeGen/MachinePipeliner.cpp b/lib/CodeGen/MachinePipeliner.cpp
index 02344225391..a341aac227a 100644
--- a/lib/CodeGen/MachinePipeliner.cpp
+++ b/lib/CodeGen/MachinePipeliner.cpp
@@ -102,6 +102,7 @@
 #include "llvm/MC/MCInstrItineraries.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
@@ -171,6 +172,12 @@ static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
                                      cl::ReallyHidden, cl::init(false),
                                      cl::ZeroOrMore, cl::desc("Ignore RecMII"));
 
+// A command line option to enable the CopyToPhi DAG mutation.
+static cl::opt<bool>
+    SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden,
+                       cl::init(true), cl::ZeroOrMore,
+                       cl::desc("Enable CopyToPhi DAG Mutation"));
+
 namespace {
 
 class NodeSet;
@@ -307,12 +314,18 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
     void unblock(int U);
   };
 
+  struct CopyToPhiMutation : public ScheduleDAGMutation {
+    void apply(ScheduleDAGInstrs *DAG) override;
+  };
+
 public:
   SwingSchedulerDAG(MachinePipeliner &P, MachineLoop &L, LiveIntervals &lis,
                     const RegisterClassInfo &rci)
       : ScheduleDAGInstrs(*P.MF, P.MLI, false), Pass(P), Loop(L), LIS(lis),
         RegClassInfo(rci), Topo(SUnits, &ExitSU) {
     P.MF->getSubtarget().getSMSMutations(Mutations);
+    if (SwpEnableCopyToPhi)
+      Mutations.push_back(llvm::make_unique<CopyToPhiMutation>());
   }
 
   void schedule() override;
@@ -391,6 +404,8 @@ public:
     Mutations.push_back(std::move(Mutation));
   }
 
+  static bool classof(const ScheduleDAGInstrs *DAG) { return true; }
+
 private:
   void addLoopCarriedDependences(AliasAnalysis *AA);
   void updatePhiDependences();
@@ -893,8 +908,8 @@ void SwingSchedulerDAG::schedule() {
   addLoopCarriedDependences(AA);
   updatePhiDependences();
   Topo.InitDAGTopologicalSorting();
-  postprocessDAG();
   changeDependences();
+  postprocessDAG();
   LLVM_DEBUG(dump());
 
   NodeSetType NodeSets;
@@ -1624,6 +1639,85 @@ void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
   swapAntiDependences(SUnits);
 }
 
+// Create artificial dependencies between the source of COPY/REG_SEQUENCE that
+// is loop-carried to the USE in next iteration. This will help pipeliner avoid
+// additional copies that are needed across iterations. An artificial dependence
+// edge is added from USE to SOURCE of COPY/REG_SEQUENCE.
+
+// PHI-------Anti-Dep-----> COPY/REG_SEQUENCE (loop-carried)
+// SRCOfCopY------True-Dep---> COPY/REG_SEQUENCE
+// PHI-------True-Dep------> USEOfPhi
+
+// The mutation creates
+// USEOfPHI -------Artificial-Dep---> SRCOfCopy
+
+// This overall will ensure, the USEOfPHI is scheduled before SRCOfCopy
+// (since USE is a predecessor), implies, the COPY/ REG_SEQUENCE is scheduled
+// late  to avoid additional copies across iterations. The possible scheduling
+// order would be
+// USEOfPHI --- SRCOfCopy---  COPY/REG_SEQUENCE.
+
+void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
+  for (SUnit &SU : DAG->SUnits) {
+    // Find the COPY/REG_SEQUENCE instruction.
+    if (!SU.getInstr()->isCopy() && !SU.getInstr()->isRegSequence())
+      continue;
+
+    // Record the loop carried PHIs.
+    SmallVector<SUnit *, 4> PHISUs;
+    // Record the SrcSUs that feed the COPY/REG_SEQUENCE instructions.
+    SmallVector<SUnit *, 4> SrcSUs;
+
+    for (auto &Dep : SU.Preds) {
+      SUnit *TmpSU = Dep.getSUnit();
+      MachineInstr *TmpMI = TmpSU->getInstr();
+      SDep::Kind DepKind = Dep.getKind();
+      // Save the loop carried PHI.
+      if (DepKind == SDep::Anti && TmpMI->isPHI())
+        PHISUs.push_back(TmpSU);
+      // Save the source of COPY/REG_SEQUENCE.
+      // If the source has no pre-decessors, we will end up creating cycles.
+      else if (DepKind == SDep::Data && !TmpMI->isPHI() && TmpSU->NumPreds > 0)
+        SrcSUs.push_back(TmpSU);
+    }
+
+    if (PHISUs.size() == 0 || SrcSUs.size() == 0)
+      continue;
+
+    // Find the USEs of PHI. If the use is a PHI or REG_SEQUENCE, push back this
+    // SUnit to the container.
+    SmallVector<SUnit *, 8> UseSUs;
+    for (auto I = PHISUs.begin(); I != PHISUs.end(); ++I) {
+      for (auto &Dep : (*I)->Succs) {
+        if (Dep.getKind() != SDep::Data)
+          continue;
+
+        SUnit *TmpSU = Dep.getSUnit();
+        MachineInstr *TmpMI = TmpSU->getInstr();
+        if (TmpMI->isPHI() || TmpMI->isRegSequence()) {
+          PHISUs.push_back(TmpSU);
+          continue;
+        }
+        UseSUs.push_back(TmpSU);
+      }
+    }
+
+    if (UseSUs.size() == 0)
+      continue;
+
+    SwingSchedulerDAG *SDAG = cast<SwingSchedulerDAG>(DAG);
+    // Add the artificial dependencies if it does not form a cycle.
+    for (auto I : UseSUs) {
+      for (auto Src : SrcSUs) {
+        if (!SDAG->Topo.IsReachable(I, Src) && Src != I) {
+          Src->addPred(SDep(I, SDep::Artificial));
+          SDAG->Topo.AddPred(Src, I);
+        }
+      }
+    }
+  }
+}
+
 /// Return true for DAG nodes that we ignore when computing the cost functions.
 /// We ignore the back-edge recurrence in order to avoid unbounded recursion
 /// in the calculation of the ASAP, ALAP, etc functions.
diff --git a/test/CodeGen/Hexagon/swp-copytophi-dag.ll b/test/CodeGen/Hexagon/swp-copytophi-dag.ll
new file mode 100644
index 00000000000..a239baae141
--- /dev/null
+++ b/test/CodeGen/Hexagon/swp-copytophi-dag.ll
@@ -0,0 +1,72 @@
+; RUN: llc -march=hexagon -enable-pipeliner=true -debug-only=pipeliner < %s \
+; RUN: 2>&1 | FileCheck %s
+
+; Test that the artificial dependence is created as a result of
+; CopyToPhi DAG mutation.
+; CHECK: Ord  Latency=0 Artificial
+target triple = "hexagon"
+
+; Function Attrs: nounwind
+define void @foo(i64* nocapture readonly %r64, i16 zeroext %n, i16 zeroext %s, i64* nocapture %p64) #0 {
+entry:
+  %conv = zext i16 %n to i32
+  %cmp = icmp eq i16 %n, 0
+  br i1 %cmp, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  %tmp = load i64, i64* %r64, align 8
+  %v.sroa.0.0.extract.trunc = trunc i64 %tmp to i16
+  %v.sroa.4.0.extract.shift = lshr i64 %tmp, 16
+  %v.sroa.4.0.extract.trunc = trunc i64 %v.sroa.4.0.extract.shift to i16
+  %v.sroa.5.0.extract.shift = lshr i64 %tmp, 32
+  %v.sroa.5.0.extract.trunc = trunc i64 %v.sroa.5.0.extract.shift to i16
+  %v.sroa.6.0.extract.shift = lshr i64 %tmp, 48
+  %v.sroa.6.0.extract.trunc = trunc i64 %v.sroa.6.0.extract.shift to i16
+  %tmp1 = bitcast i64* %p64 to i16*
+  %conv2 = zext i16 %s to i32
+  %add.ptr = getelementptr inbounds i16, i16* %tmp1, i32 %conv2
+  %add.ptr.sum = add nuw nsw i32 %conv2, 1
+  %add.ptr3 = getelementptr inbounds i16, i16* %tmp1, i32 %add.ptr.sum
+  %add.ptr.sum50 = add nuw nsw i32 %conv2, 2
+  %add.ptr4 = getelementptr inbounds i16, i16* %tmp1, i32 %add.ptr.sum50
+  %add.ptr.sum51 = add nuw nsw i32 %conv2, 3
+  %add.ptr5 = getelementptr inbounds i16, i16* %tmp1, i32 %add.ptr.sum51
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %add.ptr11.phi = phi i16* [ %add.ptr11.inc, %for.body ], [ %add.ptr, %for.body.preheader ]
+  %add.ptr16.phi = phi i16* [ %add.ptr16.inc, %for.body ], [ %add.ptr3, %for.body.preheader ]
+  %add.ptr21.phi = phi i16* [ %add.ptr21.inc, %for.body ], [ %add.ptr4, %for.body.preheader ]
+  %add.ptr26.phi = phi i16* [ %add.ptr26.inc, %for.body ], [ %add.ptr5, %for.body.preheader ]
+  %i.058.pmt = phi i32 [ %inc.pmt, %for.body ], [ 0, %for.body.preheader ]
+  %v.sroa.0.157 = phi i16 [ %v.sroa.0.0.extract.trunc34, %for.body ], [ %v.sroa.0.0.extract.trunc, %for.body.preheader ]
+  %v.sroa.4.156 = phi i16 [ %v.sroa.4.0.extract.trunc36, %for.body ], [ %v.sroa.4.0.extract.trunc, %for.body.preheader ]
+  %v.sroa.5.155 = phi i16 [ %v.sroa.5.0.extract.trunc38, %for.body ], [ %v.sroa.5.0.extract.trunc, %for.body.preheader ]
+  %v.sroa.6.154 = phi i16 [ %v.sroa.6.0.extract.trunc40, %for.body ], [ %v.sroa.6.0.extract.trunc, %for.body.preheader ]
+  %q64.153.pn = phi i64* [ %q64.153, %for.body ], [ %r64, %for.body.preheader ]
+  %q64.153 = getelementptr inbounds i64, i64* %q64.153.pn, i32 1
+  store i16 %v.sroa.0.157, i16* %add.ptr11.phi, align 2
+  store i16 %v.sroa.4.156, i16* %add.ptr16.phi, align 2
+  store i16 %v.sroa.5.155, i16* %add.ptr21.phi, align 2
+  store i16 %v.sroa.6.154, i16* %add.ptr26.phi, align 2
+  %tmp2 = load i64, i64* %q64.153, align 8
+  %v.sroa.0.0.extract.trunc34 = trunc i64 %tmp2 to i16
+  %v.sroa.4.0.extract.shift35 = lshr i64 %tmp2, 16
+  %v.sroa.4.0.extract.trunc36 = trunc i64 %v.sroa.4.0.extract.shift35 to i16
+  %v.sroa.5.0.extract.shift37 = lshr i64 %tmp2, 32
+  %v.sroa.5.0.extract.trunc38 = trunc i64 %v.sroa.5.0.extract.shift37 to i16
+  %v.sroa.6.0.extract.shift39 = lshr i64 %tmp2, 48
+  %v.sroa.6.0.extract.trunc40 = trunc i64 %v.sroa.6.0.extract.shift39 to i16
+  %inc.pmt = add i32 %i.058.pmt, 1
+  %cmp8 = icmp slt i32 %inc.pmt, %conv
+  %add.ptr11.inc = getelementptr i16, i16* %add.ptr11.phi, i32 4
+  %add.ptr16.inc = getelementptr i16, i16* %add.ptr16.phi, i32 4
+  %add.ptr21.inc = getelementptr i16, i16* %add.ptr21.phi, i32 4
+  %add.ptr26.inc = getelementptr i16, i16* %add.ptr26.phi, i32 4
+  br i1 %cmp8, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+attributes #0 = { nounwind "target-cpu"="hexagonv65" }
-- 
GitLab


From 14914d033ea5989383d5a165e6e2fbd4d1d85bfc Mon Sep 17 00:00:00 2001
From: Eli Friedman <efriedma@codeaurora.org>
Date: Thu, 18 Oct 2018 19:34:30 +0000
Subject: [PATCH 0306/1116] Revert r344693 ("[ARM] bottom-top mul support in
 ARMParallelDSP")

Still causing failures on the polly-aosp buildbot; I'll follow up
with a reduced testcase.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344752 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/ARM/ARMParallelDSP.cpp             | 221 ++-------------
 .../ARM/ParallelDSP/top-bottom-multi-use.ll   |  74 -----
 .../ARM/ParallelDSP/top-bottom-neg-vec.ll     |  98 -------
 .../CodeGen/ARM/ParallelDSP/top-bottom-neg.ll | 210 ---------------
 .../ARM/ParallelDSP/top-bottom-order.ll       |  54 ----
 test/CodeGen/ARM/ParallelDSP/top-bottom.ll    | 252 ------------------
 test/CodeGen/ARM/{ParallelDSP => }/smlad0.ll  |   0
 test/CodeGen/ARM/{ParallelDSP => }/smlad1.ll  |   0
 test/CodeGen/ARM/{ParallelDSP => }/smlad10.ll |   0
 test/CodeGen/ARM/{ParallelDSP => }/smlad11.ll |   0
 test/CodeGen/ARM/{ParallelDSP => }/smlad12.ll |   0
 test/CodeGen/ARM/{ParallelDSP => }/smlad2.ll  |   0
 test/CodeGen/ARM/{ParallelDSP => }/smlad3.ll  |   0
 test/CodeGen/ARM/{ParallelDSP => }/smlad4.ll  |   0
 test/CodeGen/ARM/{ParallelDSP => }/smlad5.ll  |   0
 test/CodeGen/ARM/{ParallelDSP => }/smlad6.ll  |   0
 test/CodeGen/ARM/{ParallelDSP => }/smlad7.ll  |   0
 test/CodeGen/ARM/{ParallelDSP => }/smlad8.ll  |   0
 test/CodeGen/ARM/{ParallelDSP => }/smlad9.ll  |   0
 .../CodeGen/ARM/{ParallelDSP => }/smladx-1.ll |   0
 test/CodeGen/ARM/{ParallelDSP => }/smlald0.ll |   0
 test/CodeGen/ARM/{ParallelDSP => }/smlald1.ll |   0
 test/CodeGen/ARM/{ParallelDSP => }/smlald2.ll |   0
 .../ARM/{ParallelDSP => }/smlaldx-1.ll        |   0
 .../ARM/{ParallelDSP => }/smlaldx-2.ll        |   0
 25 files changed, 27 insertions(+), 882 deletions(-)
 delete mode 100644 test/CodeGen/ARM/ParallelDSP/top-bottom-multi-use.ll
 delete mode 100644 test/CodeGen/ARM/ParallelDSP/top-bottom-neg-vec.ll
 delete mode 100644 test/CodeGen/ARM/ParallelDSP/top-bottom-neg.ll
 delete mode 100644 test/CodeGen/ARM/ParallelDSP/top-bottom-order.ll
 delete mode 100644 test/CodeGen/ARM/ParallelDSP/top-bottom.ll
 rename test/CodeGen/ARM/{ParallelDSP => }/smlad0.ll (100%)
 rename test/CodeGen/ARM/{ParallelDSP => }/smlad1.ll (100%)
 rename test/CodeGen/ARM/{ParallelDSP => }/smlad10.ll (100%)
 rename test/CodeGen/ARM/{ParallelDSP => }/smlad11.ll (100%)
 rename test/CodeGen/ARM/{ParallelDSP => }/smlad12.ll (100%)
 rename test/CodeGen/ARM/{ParallelDSP => }/smlad2.ll (100%)
 rename test/CodeGen/ARM/{ParallelDSP => }/smlad3.ll (100%)
 rename test/CodeGen/ARM/{ParallelDSP => }/smlad4.ll (100%)
 rename test/CodeGen/ARM/{ParallelDSP => }/smlad5.ll (100%)
 rename test/CodeGen/ARM/{ParallelDSP => }/smlad6.ll (100%)
 rename test/CodeGen/ARM/{ParallelDSP => }/smlad7.ll (100%)
 rename test/CodeGen/ARM/{ParallelDSP => }/smlad8.ll (100%)
 rename test/CodeGen/ARM/{ParallelDSP => }/smlad9.ll (100%)
 rename test/CodeGen/ARM/{ParallelDSP => }/smladx-1.ll (100%)
 rename test/CodeGen/ARM/{ParallelDSP => }/smlald0.ll (100%)
 rename test/CodeGen/ARM/{ParallelDSP => }/smlald1.ll (100%)
 rename test/CodeGen/ARM/{ParallelDSP => }/smlald2.ll (100%)
 rename test/CodeGen/ARM/{ParallelDSP => }/smlaldx-1.ll (100%)
 rename test/CodeGen/ARM/{ParallelDSP => }/smlaldx-2.ll (100%)

diff --git a/lib/Target/ARM/ARMParallelDSP.cpp b/lib/Target/ARM/ARMParallelDSP.cpp
index e5f6a61852e..3ab9298c110 100644
--- a/lib/Target/ARM/ARMParallelDSP.cpp
+++ b/lib/Target/ARM/ARMParallelDSP.cpp
@@ -55,7 +55,6 @@ namespace {
   using ReductionList   = SmallVector<Reduction, 8>;
   using ValueList       = SmallVector<Value*, 8>;
   using MemInstList     = SmallVector<Instruction*, 8>;
-  using LoadInstList    = SmallVector<LoadInst*, 8>;
   using PMACPair        = std::pair<BinOpChain*,BinOpChain*>;
   using PMACPairList    = SmallVector<PMACPair, 8>;
   using Instructions    = SmallVector<Instruction*,16>;
@@ -64,8 +63,7 @@ namespace {
   struct OpChain {
     Instruction   *Root;
     ValueList     AllValues;
-    MemInstList   VecLd;    // List of all sequential load instructions.
-    LoadInstList  Loads;    // List of all load instructions.
+    MemInstList   VecLd;    // List of all load instructions.
     MemLocList    MemLocs;  // All memory locations read by this tree.
     bool          ReadOnly = true;
 
@@ -78,10 +76,8 @@ namespace {
         if (auto *I = dyn_cast<Instruction>(V)) {
           if (I->mayWriteToMemory())
             ReadOnly = false;
-          if (auto *Ld = dyn_cast<LoadInst>(V)) {
+          if (auto *Ld = dyn_cast<LoadInst>(V))
             MemLocs.push_back(MemoryLocation(Ld->getPointerOperand(), Size));
-            Loads.push_back(Ld);
-          }
         }
       }
     }
@@ -139,7 +135,6 @@ namespace {
     /// exchange the halfwords of the second operand before performing the
     /// arithmetic.
     bool MatchSMLAD(Function &F);
-    bool MatchTopBottomMuls(BasicBlock *LoopBody);
 
   public:
     static char ID;
@@ -208,8 +203,6 @@ namespace {
       LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
       LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");
       Changes = MatchSMLAD(F);
-      if (!Changes)
-        Changes = MatchTopBottomMuls(Header);
       return Changes;
     }
   };
@@ -503,10 +496,10 @@ static void MatchReductions(Function &F, Loop *TheLoop, BasicBlock *Header,
   );
 }
 
-static void AddMulCandidate(OpChainList &Candidates,
+static void AddMACCandidate(OpChainList &Candidates,
                             Instruction *Mul,
                             Value *MulOp0, Value *MulOp1) {
-  LLVM_DEBUG(dbgs() << "OK, found mul:\t"; Mul->dump());
+  LLVM_DEBUG(dbgs() << "OK, found acc mul:\t"; Mul->dump());
   assert(Mul->getOpcode() == Instruction::Mul &&
          "expected mul instruction");
   ValueList LHS;
@@ -540,14 +533,14 @@ static void MatchParallelMACSequences(Reduction &R,
       break;
     case Instruction::Mul:
       if (match (I, (m_Mul(m_Value(MulOp0), m_Value(MulOp1))))) {
-        AddMulCandidate(Candidates, I, MulOp0, MulOp1);
+        AddMACCandidate(Candidates, I, MulOp0, MulOp1);
         return false;
       }
       break;
     case Instruction::SExt:
       if (match (I, (m_SExt(m_Mul(m_Value(MulOp0), m_Value(MulOp1)))))) {
         Instruction *Mul = cast<Instruction>(I->getOperand(0));
-        AddMulCandidate(Candidates, Mul, MulOp0, MulOp1);
+        AddMACCandidate(Candidates, Mul, MulOp0, MulOp1);
         return false;
       }
       break;
@@ -576,24 +569,23 @@ static void AliasCandidates(BasicBlock *Header, Instructions &Reads,
 // the memory locations accessed by the MAC-chains.
 // TODO: we need the read statements when we accept more complicated chains.
 static bool AreAliased(AliasAnalysis *AA, Instructions &Reads,
-                       Instructions &Writes, OpChainList &Candidates) {
+                       Instructions &Writes, OpChainList &MACCandidates) {
   LLVM_DEBUG(dbgs() << "Alias checks:\n");
-  for (auto &Candidate : Candidates) {
-    LLVM_DEBUG(dbgs() << "mul: "; Candidate->Root->dump());
-    Candidate->SetMemoryLocations();
+  for (auto &MAC : MACCandidates) {
+    LLVM_DEBUG(dbgs() << "mul: "; MAC->Root->dump());
 
     // At the moment, we allow only simple chains that only consist of reads,
     // accumulate their result with an integer add, and thus that don't write
     // memory, and simply bail if they do.
-    if (!Candidate->ReadOnly)
+    if (!MAC->ReadOnly)
       return true;
 
     // Now for all writes in the basic block, check that they don't alias with
     // the memory locations accessed by our MAC-chain:
     for (auto *I : Writes) {
       LLVM_DEBUG(dbgs() << "- "; I->dump());
-      assert(Candidate->MemLocs.size() >= 2 && "expecting at least 2 memlocs");
-      for (auto &MemLoc : Candidate->MemLocs) {
+      assert(MAC->MemLocs.size() >= 2 && "expecting at least 2 memlocs");
+      for (auto &MemLoc : MAC->MemLocs) {
         if (isModOrRefSet(intersectModRef(AA->getModRefInfo(I, MemLoc),
                                           ModRefInfo::ModRef))) {
           LLVM_DEBUG(dbgs() << "Yes, aliases found\n");
@@ -607,7 +599,7 @@ static bool AreAliased(AliasAnalysis *AA, Instructions &Reads,
   return false;
 }
 
-static bool CheckMulMemory(OpChainList &Candidates) {
+static bool CheckMACMemory(OpChainList &Candidates) {
   for (auto &C : Candidates) {
     // A mul has 2 operands, and a narrow op consist of sext and a load; thus
     // we expect at least 4 items in this operand value list.
@@ -615,6 +607,7 @@ static bool CheckMulMemory(OpChainList &Candidates) {
       LLVM_DEBUG(dbgs() << "Operand list too short.\n");
       return false;
     }
+    C->SetMemoryLocations();
     ValueList &LHS = static_cast<BinOpChain*>(C.get())->LHS;
     ValueList &RHS = static_cast<BinOpChain*>(C.get())->RHS;
 
@@ -627,173 +620,6 @@ static bool CheckMulMemory(OpChainList &Candidates) {
   return true;
 }
 
-static LoadInst *CreateLoadIns(IRBuilder<NoFolder> &IRB, LoadInst *BaseLoad,
-                               const Type *LoadTy) {
-  const unsigned AddrSpace = BaseLoad->getPointerAddressSpace();
- 
-  Value *VecPtr = IRB.CreateBitCast(BaseLoad->getPointerOperand(),
-                                     LoadTy->getPointerTo(AddrSpace));
-  return IRB.CreateAlignedLoad(VecPtr, BaseLoad->getAlignment());
-}
-
-/// Given two instructions, return the one that comes first in the basic block.
-/// A work around for not being able to do > or < on bb iterators.
-static Instruction* GetFirst(Instruction *A, Instruction *B) {
-  BasicBlock::iterator First(A);
-  BasicBlock::iterator Second(B);
-
-  BasicBlock *BB = A->getParent();
-  assert(BB == B->getParent() &&
-         "Can't compare instructions in different blocks");
-  BasicBlock::iterator Last = BB->end();
-
-  // Iterate through the block, if the 'First' iterator is found, then return
-  // Second.
-  while (Second != Last) {
-    if (Second == First)
-      return B;
-    ++Second;
-  }
-  return A;
-}
-
-/// Attempt to widen loads and use smulbb, smulbt, smultb and smultt muls.
-// TODO: This, like smlad generation, expects the leave operands to be loads
-// that are sign extended. We should be able to handle scalar values as well
-// performing these muls on word x half types to generate smulwb and smulwt.
-bool ARMParallelDSP::MatchTopBottomMuls(BasicBlock *LoopBody) {
-  LLVM_DEBUG(dbgs() << "Attempting to find BT|TB muls.\n");
-
-  OpChainList Candidates;
-  for (auto &I : *LoopBody) {
-    if (I.getOpcode() == Instruction::Mul) {
-      Type *Ty = I.getType();
-      if (Ty->isIntegerTy() &&
-          (Ty->getScalarSizeInBits() == 32 ||
-           Ty->getScalarSizeInBits() == 64))
-      AddMulCandidate(Candidates, &I, I.getOperand(0), I.getOperand(1));
-    }
-  }
-
-  if (Candidates.empty())
-    return false;
-
-  Instructions Reads;
-  Instructions Writes;
-  AliasCandidates(LoopBody, Reads, Writes);
-
-  if (AreAliased(AA, Reads, Writes, Candidates))
-    return false;
-
-  DenseMap<LoadInst*, LoadInst*> SeqLoads;
-  SmallPtrSet<LoadInst*, 8> OffsetLoads;
-
-  for (unsigned i = 0; i < Candidates.size(); ++i) {
-    for (unsigned j = 0; j < Candidates.size(); ++j) {
-      if (i == j)
-        continue;
-
-      OpChain *MulChain0 = Candidates[i].get();
-      OpChain *MulChain1 = Candidates[j].get();
-
-      for (auto *Ld0 : MulChain0->Loads) {
-        if (SeqLoads.count(Ld0) || OffsetLoads.count(Ld0))
-          continue;
-
-        for (auto *Ld1 : MulChain1->Loads) {
-          if (SeqLoads.count(Ld1) || OffsetLoads.count(Ld1))
-            continue;
-
-          MemInstList VecMem;
-          if (AreSequentialLoads(Ld0, Ld1, VecMem)) {
-            SeqLoads[Ld0] = Ld1;
-            OffsetLoads.insert(Ld1);
-          }
-        }
-      }
-    }
-  }
-
-  if (SeqLoads.empty())
-    return false;
-
-  IRBuilder<NoFolder> IRB(LoopBody);
-  const Type *Ty = IntegerType::get(M->getContext(), 32);
-
-  auto IsUserMul = [](Use &U) {
-    auto *Mul = cast<Instruction>(U.getUser());
-    return Mul->getOpcode() == Instruction::Mul;
-  };
-
-  LLVM_DEBUG(dbgs() << "Found some sequential loads, now widening:\n");
-  for (auto &Pair : SeqLoads) {
-    LoadInst *BaseLd = Pair.first;
-    LoadInst *OffsetLd = Pair.second;
-
-    // Check that all the base users are muls.
-    auto *BaseSExt = cast<Instruction>(BaseLd->user_back());
-    for (Use &U : BaseSExt->uses()) {
-      if (!IsUserMul(U))
-        return false;
-    }
-
-    // Check that all the offset users are muls.
-    // TODO We exit early on finding a sext user which isn't a mul, but many
-    // arm instructions would be able to perform the necessary shift too.
-    auto *OffsetSExt = cast<Instruction>(OffsetLd->user_back());
-    for (Use &U : OffsetSExt->uses()) {
-      if (!IsUserMul(U))
-        return false;
-    }
-
-    LLVM_DEBUG(dbgs() << " - with base load: " << *BaseLd << "\n");
-    LLVM_DEBUG(dbgs() << " - with offset load: " << *OffsetLd << "\n");
-    Instruction *InsertPt = GetFirst(BaseLd, OffsetLd);
-    IRB.SetInsertPoint(InsertPt);
-    LoadInst *WideLd = CreateLoadIns(IRB, BaseLd, Ty);
-    LLVM_DEBUG(dbgs() << " - created wide load: " << *WideLd << "\n");
-
-    // Move the pointer operands before their users.
-    std::function<void(Instruction*, Instruction*)> MoveBefore =
-      [&MoveBefore](Instruction *Source, Instruction *Sink) -> void {
-      Source->moveBefore(Sink);
-      for (Use &U : Source->operands()) {
-        Value *Op = U.get();
-        if (auto *I = dyn_cast<Instruction>(Op)) {
-          if (isa<PHINode>(I) || I->getParent() != Source->getParent())
-            continue;
-          MoveBefore(I, Source);
-        }
-      }
-    };
-
-    // If we're inserting the load before BaseLd, we probably need to move the
-    // the pointer operand too. This operand is cast to an i32* in
-    // CreateLoadIns.
-    if (InsertPt != BaseLd) {
-      if (auto *GEP = dyn_cast<GetElementPtrInst>(BaseLd->getPointerOperand()))
-        MoveBefore(GEP, cast<Instruction>(WideLd->getPointerOperand()));
-    }
-
-    // BaseUser needs to: (asr (shl WideLoad, 16), 16)
-    // OffsetUser needs to: (asr WideLoad, 16)
-    auto *Top = cast<Instruction>(IRB.CreateAShr(WideLd, 16));
-    auto *Shl = cast<Instruction>(IRB.CreateShl(WideLd, 16));
-    auto *Bottom = cast<Instruction>(IRB.CreateAShr(Shl, 16));
-
-    BaseSExt->replaceAllUsesWith(Bottom);
-    OffsetSExt->replaceAllUsesWith(Top);
-
-    BaseSExt->eraseFromParent();
-    OffsetSExt->eraseFromParent();
-    BaseLd->eraseFromParent();
-    OffsetLd->eraseFromParent();
-  }
-  LLVM_DEBUG(dbgs() << "Block after top bottom mul replacements:\n"
-             << *LoopBody << "\n");
-  return true;
-}
-
 // Loop Pass that needs to identify integer add/sub reductions of 16-bit vector
 // multiplications.
 // To use SMLAD:
@@ -832,15 +658,14 @@ bool ARMParallelDSP::MatchSMLAD(Function &F) {
              dbgs() << "Header block:\n"; Header->dump();
              dbgs() << "Loop info:\n\n"; L->dump());
 
+  bool Changed = false;
   ReductionList Reductions;
   MatchReductions(F, L, Header, Reductions);
-  if (Reductions.empty())
-    return false;
 
   for (auto &R : Reductions) {
     OpChainList MACCandidates;
     MatchParallelMACSequences(R, MACCandidates);
-    if (!CheckMulMemory(MACCandidates))
+    if (!CheckMACMemory(MACCandidates))
       continue;
 
     R.MACCandidates = std::move(MACCandidates);
@@ -857,7 +682,6 @@ bool ARMParallelDSP::MatchSMLAD(Function &F) {
   Instructions Reads, Writes;
   AliasCandidates(Header, Reads, Writes);
 
-  bool Changed = false;
   for (auto &R : Reductions) {
     if (AreAliased(AA, Reads, Writes, R.MACCandidates))
       return false;
@@ -869,6 +693,15 @@ bool ARMParallelDSP::MatchSMLAD(Function &F) {
   return Changed;
 }
 
+static LoadInst *CreateLoadIns(IRBuilder<NoFolder> &IRB, LoadInst &BaseLoad,
+                               const Type *LoadTy) {
+  const unsigned AddrSpace = BaseLoad.getPointerAddressSpace();
+
+  Value *VecPtr = IRB.CreateBitCast(BaseLoad.getPointerOperand(),
+                                    LoadTy->getPointerTo(AddrSpace));
+  return IRB.CreateAlignedLoad(VecPtr, BaseLoad.getAlignment());
+}
+
 Instruction *ARMParallelDSP::CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
                                              Instruction *Acc, bool Exchange,
                                              Instruction *InsertAfter) {
@@ -883,8 +716,8 @@ Instruction *ARMParallelDSP::CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
 
   // Replace the reduction chain with an intrinsic call
   const Type *Ty = IntegerType::get(M->getContext(), 32);
-  LoadInst *NewLd0 = CreateLoadIns(Builder, &VecLd0[0], Ty);
-  LoadInst *NewLd1 = CreateLoadIns(Builder, &VecLd1[0], Ty);
+  LoadInst *NewLd0 = CreateLoadIns(Builder, VecLd0[0], Ty);
+  LoadInst *NewLd1 = CreateLoadIns(Builder, VecLd1[0], Ty);
   Value* Args[] = { NewLd0, NewLd1, Acc };
   Function *SMLAD = nullptr;
   if (Exchange)
diff --git a/test/CodeGen/ARM/ParallelDSP/top-bottom-multi-use.ll b/test/CodeGen/ARM/ParallelDSP/top-bottom-multi-use.ll
deleted file mode 100644
index ed2b3fedbb6..00000000000
--- a/test/CodeGen/ARM/ParallelDSP/top-bottom-multi-use.ll
+++ /dev/null
@@ -1,74 +0,0 @@
-; RUN: opt -mtriple=thumbv8m.main -mcpu=cortex-m33 -S -arm-parallel-dsp %s -o - | FileCheck %s
-; RUN: opt -mtriple=thumbv7a-linux-android -arm-parallel-dsp -S %s -o - | FileCheck %s
-
-; CHECK-LABEL: sext_multi_use_undef
-define void @sext_multi_use_undef() {
-entry:
-  br label %for.body
-
-for.body:
-  %0 = load i16, i16* undef, align 2
-  %conv3 = sext i16 %0 to i32
-  %1 = load i16, i16* undef, align 2
-  %conv7 = sext i16 %1 to i32
-  %mul8 = mul nsw i32 %conv7, %conv3
-  %x.addr.180 = getelementptr inbounds i16, i16* undef, i32 1
-  %2 = load i16, i16* %x.addr.180, align 2
-  %conv1582 = sext i16 %2 to i32
-  %mul.i7284 = mul nsw i32 %conv7, %conv1582
-  br label %for.body
-}
-
-; CHECK-LABEL: sext_multi_use
-; CHECK: [[PtrA:%[^ ]+]] = bitcast i16* %a to i32*
-; CHECK: [[DataA:%[^ ]+]] = load i32, i32* [[PtrA]], align 2
-; CHECK: [[Top:%[^ ]+]] = ashr i32 [[DataA]], 16
-; CHECK: [[Shl:%[^ ]+]] = shl i32 [[DataA]], 16
-; CHECK: [[Bottom:%[^ ]+]] = ashr i32 [[Shl]], 16
-; CHECK: [[DataB:%[^ ]+]] = load i16, i16* %b, align 2
-; CHECK: [[SextB:%[^ ]+]] = sext i16 [[DataB]] to i32
-; CHECK: [[Mul0:%[^ ]+]] = mul nsw i32 [[SextB]], [[Bottom]]
-; CHECK: [[Mul1:%[^ ]+]] = mul nsw i32 [[SextB]], [[Top]]
-define void @sext_multi_use(i16* %a, i16* %b) {
-entry:
-  br label %for.body
-
-for.body:
-  %0 = load i16, i16* %a, align 2
-  %conv3 = sext i16 %0 to i32
-  %1 = load i16, i16* %b, align 2
-  %conv7 = sext i16 %1 to i32
-  %mul8 = mul nsw i32 %conv7, %conv3
-  %x.addr.180 = getelementptr inbounds i16, i16* %a, i32 1
-  %2 = load i16, i16* %x.addr.180, align 2
-  %conv1582 = sext i16 %2 to i32
-  %mul.i7284 = mul nsw i32 %conv7, %conv1582
-  br label %for.body
-}
-
-; CHECK-LABEL: sext_multi_use_reorder
-; CHECK: [[PtrA:%[^ ]+]] = bitcast i16* %a to i32*
-; CHECK: [[DataA:%[^ ]+]] = load i32, i32* [[PtrA]], align 2
-; CHECK: [[Top:%[^ ]+]] = ashr i32 [[DataA]], 16
-; CHECK: [[Shl:%[^ ]+]] = shl i32 [[DataA]], 16
-; CHECK: [[Bottom:%[^ ]+]] = ashr i32 [[Shl]], 16
-; CHECK: [[Mul0:%[^ ]+]] = mul nsw i32 [[Top]], [[Bottom]]
-; CHECK: [[DataB:%[^ ]+]] = load i16, i16* %b, align 2
-; CHECK: [[SextB:%[^ ]+]] = sext i16 [[DataB]] to i32
-; CHECK: [[Mul1:%[^ ]+]] = mul nsw i32 [[Top]], [[SextB]]
-define void @sext_multi_use_reorder(i16* %a, i16* %b) {
-entry:
-  br label %for.body
-
-for.body:
-  %0 = load i16, i16* %a, align 2
-  %conv3 = sext i16 %0 to i32
-  %x.addr.180 = getelementptr inbounds i16, i16* %a, i32 1
-  %1 = load i16, i16* %x.addr.180, align 2
-  %conv7 = sext i16 %1 to i32
-  %mul8 = mul nsw i32 %conv7, %conv3
-  %2 = load i16, i16* %b, align 2
-  %conv1582 = sext i16 %2 to i32
-  %mul.i7284 = mul nsw i32 %conv7, %conv1582
-  br label %for.body
-}
diff --git a/test/CodeGen/ARM/ParallelDSP/top-bottom-neg-vec.ll b/test/CodeGen/ARM/ParallelDSP/top-bottom-neg-vec.ll
deleted file mode 100644
index ea60c656a06..00000000000
--- a/test/CodeGen/ARM/ParallelDSP/top-bottom-neg-vec.ll
+++ /dev/null
@@ -1,98 +0,0 @@
-; RUN: opt -mtriple=thumbv7-unknown-linux-android -arm-parallel-dsp -S %s -o - | FileCheck %s
-
-@a = local_unnamed_addr global i32 0, align 4
-@b = local_unnamed_addr global i8* null, align 4
-@c = local_unnamed_addr global i8 0, align 1
-@d = local_unnamed_addr global i16* null, align 4
-
-; CHECK-LABEL: @convolve
-; CHECK-NOT: bitcast i16* [[ANY:%[^ ]+]] to i32*
-define void @convolve() local_unnamed_addr #0 {
-entry:
-  br label %for.cond
-
-for.cond:
-  %e.0 = phi i32 [ undef, %entry ], [ %e.1.lcssa, %for.end ]
-  %f.0 = phi i32 [ undef, %entry ], [ %f.1.lcssa, %for.end ]
-  %g.0 = phi i32 [ undef, %entry ], [ %g.1.lcssa, %for.end ]
-  %cmp13 = icmp slt i32 %g.0, 1
-  br i1 %cmp13, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:
-  %0 = load i16*, i16** @d, align 4
-  %1 = load i8*, i8** @b, align 4
-  %2 = load i32, i32* @a, align 4
-  %3 = sub i32 1, %g.0
-  %min.iters.check = icmp ugt i32 %3, 3
-  %ident.check = icmp eq i32 %2, 1
-  %or.cond = and i1 %min.iters.check, %ident.check
-  br i1 %or.cond, label %vector.ph, label %for.body.preheader
-
-vector.ph:
-  %n.vec = and i32 %3, -4
-  %ind.end = add i32 %g.0, %n.vec
-  %4 = mul i32 %2, %n.vec
-  %ind.end20 = add i32 %f.0, %4
-  %5 = insertelement <4 x i32> <i32 undef, i32 0, i32 0, i32 0>, i32 %e.0, i32 0
-  br label %vector.body
-
-vector.body:
-  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %vec.phi = phi <4 x i32> [ %5, %vector.ph ], [ %14, %vector.body ]
-  %offset.idx = add i32 %g.0, %index
-  %6 = mul i32 %2, %index
-  %offset.idx21 = add i32 %f.0, %6
-  %7 = getelementptr inbounds i16, i16* %0, i32 %offset.idx
-  %8 = bitcast i16* %7 to <4 x i16>*
-  %wide.load = load <4 x i16>, <4 x i16>* %8, align 2
-  %9 = sext <4 x i16> %wide.load to <4 x i32>
-  %10 = getelementptr inbounds i8, i8* %1, i32 %offset.idx21
-  %11 = bitcast i8* %10 to <4 x i8>*
-  %wide.load25 = load <4 x i8>, <4 x i8>* %11, align 1
-  %12 = zext <4 x i8> %wide.load25 to <4 x i32>
-  %13 = mul nsw <4 x i32> %12, %9
-  %14 = add nsw <4 x i32> %13, %vec.phi
-  %index.next = add i32 %index, 4
-  %15 = icmp eq i32 %index.next, %n.vec
-  br i1 %15, label %middle.block, label %vector.body
-
-middle.block:
-  %rdx.shuf = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-  %bin.rdx = add <4 x i32> %14, %rdx.shuf
-  %rdx.shuf26 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-  %bin.rdx27 = add <4 x i32> %bin.rdx, %rdx.shuf26
-  %16 = extractelement <4 x i32> %bin.rdx27, i32 0
-  %cmp.n = icmp eq i32 %3, %n.vec
-  br i1 %cmp.n, label %for.end, label %for.body.preheader
-
-for.body.preheader:
-  %g.116.ph = phi i32 [ %g.0, %for.body.lr.ph ], [ %ind.end, %middle.block ]
-  %f.115.ph = phi i32 [ %f.0, %for.body.lr.ph ], [ %ind.end20, %middle.block ]
-  %e.114.ph = phi i32 [ %e.0, %for.body.lr.ph ], [ %16, %middle.block ]
-  br label %for.body
-
-for.body:
-  %g.116 = phi i32 [ %inc, %for.body ], [ %g.116.ph, %for.body.preheader ]
-  %f.115 = phi i32 [ %add4, %for.body ], [ %f.115.ph, %for.body.preheader ]
-  %e.114 = phi i32 [ %add, %for.body ], [ %e.114.ph, %for.body.preheader ]
-  %arrayidx = getelementptr inbounds i16, i16* %0, i32 %g.116
-  %17 = load i16, i16* %arrayidx, align 2
-  %conv = sext i16 %17 to i32
-  %arrayidx2 = getelementptr inbounds i8, i8* %1, i32 %f.115
-  %18 = load i8, i8* %arrayidx2, align 1
-  %conv3 = zext i8 %18 to i32
-  %mul = mul nsw i32 %conv3, %conv
-  %add = add nsw i32 %mul, %e.114
-  %inc = add nsw i32 %g.116, 1
-  %add4 = add nsw i32 %2, %f.115
-  %cmp = icmp slt i32 %g.116, 0
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:
-  %e.1.lcssa = phi i32 [ %e.0, %for.cond ], [ %16, %middle.block ], [ %add, %for.body ]
-  %f.1.lcssa = phi i32 [ %f.0, %for.cond ], [ %ind.end20, %middle.block ], [ %add4, %for.body ]
-  %g.1.lcssa = phi i32 [ %g.0, %for.cond ], [ %ind.end, %middle.block ], [ %inc, %for.body ]
-  %conv5 = trunc i32 %e.1.lcssa to i8
-  store i8 %conv5, i8* @c, align 1
-  br label %for.cond
-}
diff --git a/test/CodeGen/ARM/ParallelDSP/top-bottom-neg.ll b/test/CodeGen/ARM/ParallelDSP/top-bottom-neg.ll
deleted file mode 100644
index 0c4aaeee7cc..00000000000
--- a/test/CodeGen/ARM/ParallelDSP/top-bottom-neg.ll
+++ /dev/null
@@ -1,210 +0,0 @@
-; RUN: opt -mtriple=arm-arm-eabi -mcpu=cortex-m33 < %s -arm-parallel-dsp -S | FileCheck %s
-; RUN: opt -mtriple=thumbv7a-linux-android -arm-parallel-dsp -S %s -o - | FileCheck %s
-
-; CHECK-LABEL: topbottom_mul_alias
-; CHECK-NOT: bitcast i16*
-define void @topbottom_mul_alias(i32 %N, i32* nocapture readnone %Out, i16* nocapture readonly %In1, i16* nocapture readonly %In2) {
-entry:
-  br label %for.body
-
-for.body:
-  %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
-  %count = phi i32 [ %N, %entry ], [ %count.next, %for.body ]
-  %PIn1.0 = getelementptr inbounds i16, i16* %In1, i32 %iv
-  %In1.0 = load i16, i16* %PIn1.0, align 2
-  %SIn1.0 = sext i16 %In1.0 to i32
-  %PIn2.0 = getelementptr inbounds i16, i16* %In2, i32 %iv
-  %In2.0 = load i16, i16* %PIn2.0, align 2
-  %SIn2.0 = sext i16 %In2.0 to i32
-  %mul5.us.i.i = mul nsw i32 %SIn1.0, %SIn2.0
-  %Out.0 = getelementptr inbounds i32, i32* %Out, i32 %iv
-  store i32 %mul5.us.i.i, i32* %Out.0, align 4
-  %iv.1 = or i32 %iv, 1
-  %PIn1.1 = getelementptr inbounds i16, i16* %In1, i32 %iv.1
-  %In1.1 = load i16, i16* %PIn1.1, align 2
-  %SIn1.1 = sext i16 %In1.1 to i32
-  %PIn2.1 = getelementptr inbounds i16, i16* %In2, i32 %iv.1
-  %In2.1 = load i16, i16* %PIn2.1, align 2
-  %SIn2.1 = sext i16 %In2.1 to i32
-  %mul5.us.i.1.i = mul nsw i32 %SIn1.1, %SIn2.1
-  %Out.1 = getelementptr inbounds i32, i32* %Out, i32 %iv.1
-  store i32 %mul5.us.i.1.i, i32* %Out.1, align 4
-  %iv.2 = or i32 %iv, 2
-  %PIn1.2 = getelementptr inbounds i16, i16* %In1, i32 %iv.2
-  %In1.2 = load i16, i16* %PIn1.2, align 2
-  %SIn1.2 = sext i16 %In1.2 to i32
-  %PIn2.2 = getelementptr inbounds i16, i16* %In2, i32 %iv.2
-  %In2.2 = load i16, i16* %PIn2.2, align 2
-  %SIn2.2 = sext i16 %In2.2 to i32
-  %mul5.us.i.2.i = mul nsw i32 %SIn1.2, %SIn2.2
-  %Out.2 = getelementptr inbounds i32, i32* %Out, i32 %iv.2
-  store i32 %mul5.us.i.2.i, i32* %Out.2, align 4
-  %iv.3 = or i32 %iv, 3
-  %PIn1.3 = getelementptr inbounds i16, i16* %In1, i32 %iv.3
-  %In1.3 = load i16, i16* %PIn1.3, align 2
-  %SIn1.3 = sext i16 %In1.3 to i32
-  %PIn2.3 = getelementptr inbounds i16, i16* %In2, i32 %iv.3
-  %In2.3 = load i16, i16* %PIn2.3, align 2
-  %SIn2.3 = sext i16 %In2.3 to i32
-  %mul5.us.i.3.i = mul nsw i32 %SIn1.3, %SIn2.3
-  %Out.3 = getelementptr inbounds i32, i32* %Out, i32 %iv.3
-  store i32 %mul5.us.i.3.i, i32* %Out.3, align 4
-  %iv.next = add i32 %iv, 4
-  %count.next = add i32 %count, -4
-  %niter375.ncmp.3.i = icmp eq i32 %count.next, 0
-  br i1 %niter375.ncmp.3.i, label %exit, label %for.body
-
-exit:
-  ret void
-}
-
-; TODO: We should be able to handle this by splatting the const value.
-; CHECK-LABEL: topbottom_mul_const
-; CHECK-NOT: bitcast i16*
-define void @topbottom_mul_const(i32 %N, i32* noalias nocapture readnone %Out, i16* nocapture readonly %In, i16 signext %const) {
-entry:
-  %conv4.i.i = sext i16 %const to i32
-  br label %for.body
-
-for.body:
-  %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
-  %count = phi i32 [ %N, %entry ], [ %count.next, %for.body ]
-  %PIn.0 = getelementptr inbounds i16, i16* %In, i32 %iv
-  %In.0 = load i16, i16* %PIn.0, align 2
-  %conv.us.i144.i = sext i16 %In.0 to i32
-  %mul5.us.i.i = mul nsw i32 %conv.us.i144.i, %conv4.i.i
-  %Out.0 = getelementptr inbounds i32, i32* %Out, i32 %iv
-  store i32 %mul5.us.i.i, i32* %Out.0, align 4
-  %iv.1 = or i32 %iv, 1
-  %PIn.1 = getelementptr inbounds i16, i16* %In, i32 %iv.1
-  %In.1 = load i16, i16* %PIn.1, align 2
-  %conv.us.i144.1.i = sext i16 %In.1 to i32
-  %mul5.us.i.1.i = mul nsw i32 %conv.us.i144.1.i, %conv4.i.i
-  %Out.1 = getelementptr inbounds i32, i32* %Out, i32 %iv.1
-  store i32 %mul5.us.i.1.i, i32* %Out.1, align 4
-  %iv.2 = or i32 %iv, 2
-  %PIn.2 = getelementptr inbounds i16, i16* %In, i32 %iv.2
-  %In.3 = load i16, i16* %PIn.2, align 2
-  %conv.us.i144.2.i = sext i16 %In.3 to i32
-  %mul5.us.i.2.i = mul nsw i32 %conv.us.i144.2.i, %conv4.i.i
-  %Out.2 = getelementptr inbounds i32, i32* %Out, i32 %iv.2
-  store i32 %mul5.us.i.2.i, i32* %Out.2, align 4
-  %iv.3 = or i32 %iv, 3
-  %PIn.3 = getelementptr inbounds i16, i16* %In, i32 %iv.3
-  %In.4 = load i16, i16* %PIn.3, align 2
-  %conv.us.i144.3.i = sext i16 %In.4 to i32
-  %mul5.us.i.3.i = mul nsw i32 %conv.us.i144.3.i, %conv4.i.i
-  %Out.3 = getelementptr inbounds i32, i32* %Out, i32 %iv.3
-  store i32 %mul5.us.i.3.i, i32* %Out.3, align 4
-  %iv.next = add i32 %iv, 4
-  %count.next = add i32 %count, -4
-  %niter375.ncmp.3.i = icmp eq i32 %count.next, 0
-  br i1 %niter375.ncmp.3.i, label %exit, label %for.body
-
-exit:
-  ret void
-}
-
-; TODO: We should be able to handle this and use smulwt and smulwb.
-; CHECK-LABEL: topbottom_mul_word_load_const
-; CHECK-NOT: bitcast i16*
-define void @topbottom_mul_word_load_const(i32 %N, i32* noalias nocapture readnone %Out, i16* nocapture readonly %In, i32* %C) {
-entry:
-  %const = load i32, i32* %C
-  br label %for.body
-
-for.body:
-  %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
-  %count = phi i32 [ %N, %entry ], [ %count.next, %for.body ]
-  %PIn.0 = getelementptr inbounds i16, i16* %In, i32 %iv
-  %In.0 = load i16, i16* %PIn.0, align 2
-  %conv.us.i144.i = sext i16 %In.0 to i32
-  %mul5.us.i.i = mul nsw i32 %conv.us.i144.i, %const
-  %Out.0 = getelementptr inbounds i32, i32* %Out, i32 %iv
-  store i32 %mul5.us.i.i, i32* %Out.0, align 4
-  %iv.1 = or i32 %iv, 1
-  %PIn.1 = getelementptr inbounds i16, i16* %In, i32 %iv.1
-  %In.1 = load i16, i16* %PIn.1, align 2
-  %conv.us.i144.1.i = sext i16 %In.1 to i32
-  %mul5.us.i.1.i = mul nsw i32 %conv.us.i144.1.i, %const
-  %Out.1 = getelementptr inbounds i32, i32* %Out, i32 %iv.1
-  store i32 %mul5.us.i.1.i, i32* %Out.1, align 4
-  %iv.2 = or i32 %iv, 2
-  %PIn.2 = getelementptr inbounds i16, i16* %In, i32 %iv.2
-  %In.3 = load i16, i16* %PIn.2, align 2
-  %conv.us.i144.2.i = sext i16 %In.3 to i32
-  %mul5.us.i.2.i = mul nsw i32 %conv.us.i144.2.i, %const
-  %Out.2 = getelementptr inbounds i32, i32* %Out, i32 %iv.2
-  store i32 %mul5.us.i.2.i, i32* %Out.2, align 4
-  %iv.3 = or i32 %iv, 3
-  %PIn.3 = getelementptr inbounds i16, i16* %In, i32 %iv.3
-  %In.4 = load i16, i16* %PIn.3, align 2
-  %conv.us.i144.3.i = sext i16 %In.4 to i32
-  %mul5.us.i.3.i = mul nsw i32 %conv.us.i144.3.i, %const
-  %Out.3 = getelementptr inbounds i32, i32* %Out, i32 %iv.3
-  store i32 %mul5.us.i.3.i, i32* %Out.3, align 4
-  %iv.next = add i32 %iv, 4
-  %count.next = add i32 %count, -4
-  %niter375.ncmp.3.i = icmp eq i32 %count.next, 0
-  br i1 %niter375.ncmp.3.i, label %exit, label %for.body
-
-exit:
-  ret void
-}
-
-; CHECK-LABEL: topbottom_mul_8
-; CHECK-NOT: bitcast i16*
-define void @topbottom_mul_8(i32 %N, i32* noalias nocapture readnone %Out, i8* nocapture readonly %In1, i8* nocapture readonly %In2) {
-entry:
-  br label %for.body
-
-for.body:
-  %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
-  %count = phi i32 [ %N, %entry ], [ %count.next, %for.body ]
-  %PIn1.0 = getelementptr inbounds i8, i8* %In1, i32 %iv
-  %In1.0 = load i8, i8* %PIn1.0, align 1
-  %SIn1.0 = sext i8 %In1.0 to i32
-  %PIn2.0 = getelementptr inbounds i8, i8* %In2, i32 %iv
-  %In2.0 = load i8, i8* %PIn2.0, align 1
-  %SIn2.0 = sext i8 %In2.0 to i32
-  %mul5.us.i.i = mul nsw i32 %SIn1.0, %SIn2.0
-  %Out.0 = getelementptr inbounds i32, i32* %Out, i32 %iv
-  store i32 %mul5.us.i.i, i32* %Out.0, align 4
-  %iv.1 = or i32 %iv, 1
-  %PIn1.1 = getelementptr inbounds i8, i8* %In1, i32 %iv.1
-  %In1.1 = load i8, i8* %PIn1.1, align 1
-  %SIn1.1 = sext i8 %In1.1 to i32
-  %PIn2.1 = getelementptr inbounds i8, i8* %In2, i32 %iv.1
-  %In2.1 = load i8, i8* %PIn2.1, align 1
-  %SIn2.1 = sext i8 %In2.1 to i32
-  %mul5.us.i.1.i = mul nsw i32 %SIn1.1, %SIn2.1
-  %Out.1 = getelementptr inbounds i32, i32* %Out, i32 %iv.1
-  store i32 %mul5.us.i.1.i, i32* %Out.1, align 4
-  %iv.2 = or i32 %iv, 2
-  %PIn1.2 = getelementptr inbounds i8, i8* %In1, i32 %iv.2
-  %In1.2 = load i8, i8* %PIn1.2, align 1
-  %SIn1.2 = sext i8 %In1.2 to i32
-  %PIn2.2 = getelementptr inbounds i8, i8* %In2, i32 %iv.2
-  %In2.2 = load i8, i8* %PIn2.2, align 1
-  %SIn2.2 = sext i8 %In2.2 to i32
-  %mul5.us.i.2.i = mul nsw i32 %SIn1.2, %SIn2.2
-  %Out.2 = getelementptr inbounds i32, i32* %Out, i32 %iv.2
-  store i32 %mul5.us.i.2.i, i32* %Out.2, align 4
-  %iv.3 = or i32 %iv, 3
-  %PIn1.3 = getelementptr inbounds i8, i8* %In1, i32 %iv.3
-  %In1.3 = load i8, i8* %PIn1.3, align 1
-  %SIn1.3 = sext i8 %In1.3 to i32
-  %PIn2.3 = getelementptr inbounds i8, i8* %In2, i32 %iv.3
-  %In2.3 = load i8, i8* %PIn2.3, align 1
-  %SIn2.3 = sext i8 %In2.3 to i32
-  %mul5.us.i.3.i = mul nsw i32 %SIn1.3, %SIn2.3
-  %Out.3 = getelementptr inbounds i32, i32* %Out, i32 %iv.3
-  store i32 %mul5.us.i.3.i, i32* %Out.3, align 4
-  %iv.next = add i32 %iv, 4
-  %count.next = add i32 %count, -4
-  %niter375.ncmp.3.i = icmp eq i32 %count.next, 0
-  br i1 %niter375.ncmp.3.i, label %exit, label %for.body
-
-exit:
-  ret void
-}
diff --git a/test/CodeGen/ARM/ParallelDSP/top-bottom-order.ll b/test/CodeGen/ARM/ParallelDSP/top-bottom-order.ll
deleted file mode 100644
index e78afc80f15..00000000000
--- a/test/CodeGen/ARM/ParallelDSP/top-bottom-order.ll
+++ /dev/null
@@ -1,54 +0,0 @@
-; RUN: opt -mtriple=thumbv8m.main -mcpu=cortex-m33 -arm-parallel-dsp -S %s -o - | FileCheck %s
-; RUN: opt -mtriple=thumbv7a-linux-android -arm-parallel-dsp -S %s -o - | FileCheck %s
-
-; CHECK-LABEL: reorder_gep_arguments 
-; CHECK: [[Sub:%[^ ]+]] = xor i32 %iv, -1
-; CHECK: [[IdxPtr:%[^ ]+]] = getelementptr inbounds i16, i16* %arrayidx.us, i32 [[Sub]]
-; CHECK: [[IdxPtrCast:%[^ ]+]] = bitcast i16* [[IdxPtr]] to i32*
-; CHECK: [[Idx:%[^ ]+]] = load i32, i32* [[IdxPtrCast]], align 2
-; CHECK: [[Top:%[^ ]+]] = ashr i32 [[Idx]], 16
-; CHECK: [[Shl:%[^ ]+]] = shl i32 [[Idx]], 16
-; CHECK: [[Bottom:%[^ ]+]] = ashr i32 [[Shl]], 16
-; CHECK: [[BPtr:%[^ ]+]] = getelementptr inbounds i16, i16* %B, i32 %iv
-; CHECK: [[BData:%[^ ]+]] = load i16, i16* [[BPtr]], align 2
-; CHECK: [[BSext:%[^ ]+]] = sext i16 [[BData]] to i32
-; CHECK: [[Mul0:%[^ ]+]] = mul nsw i32 [[BSext]], [[Top]]
-; CHECK: [[BPtr1:%[^ ]+]] = getelementptr inbounds i16, i16* %B, i32 %add48.us
-; CHECK: [[BData1:%[^ ]+]] = load i16, i16* [[BPtr1]], align 2
-; CHECK: [[B1Sext:%[^ ]+]] = sext i16 [[BData1]] to i32
-; CHECK: [[Mul1:%[^ ]+]] = mul nsw i32 [[B1Sext]], [[Bottom]]
-
-define i32 @reorder_gep_arguments(i16* %B, i16* %arrayidx.us, i32 %d) {
-entry:
-  br label %for.body36.us
-
-for.body36.us:
-  %iv = phi i32 [ %add53.us, %for.body36.us ], [ 5, %entry ]
-  %out32_Q12.0114.us = phi i32 [ %add52.us, %for.body36.us ], [ 0, %entry ]
-  %sub37.us = sub nsw i32 0, %iv
-  %arrayidx38.us = getelementptr inbounds i16, i16* %arrayidx.us, i32 %sub37.us
-  %0 = load i16, i16* %arrayidx38.us, align 2
-  %conv39.us = sext i16 %0 to i32
-  %arrayidx40.us = getelementptr inbounds i16, i16* %B, i32 %iv
-  %1 = load i16, i16* %arrayidx40.us, align 2
-  %conv41.us = sext i16 %1 to i32
-  %mul42.us = mul nsw i32 %conv41.us, %conv39.us
-  %add43.us = add i32 %mul42.us, %out32_Q12.0114.us
-  %sub45.us = xor i32 %iv, -1
-  %arrayidx46.us = getelementptr inbounds i16, i16* %arrayidx.us, i32 %sub45.us
-  %2 = load i16, i16* %arrayidx46.us, align 2
-  %conv47.us = sext i16 %2 to i32
-  %add48.us = or i32 %iv, 1
-  %arrayidx49.us = getelementptr inbounds i16, i16* %B, i32 %add48.us
-  %3 = load i16, i16* %arrayidx49.us, align 2
-  %conv50.us = sext i16 %3 to i32
-  %mul51.us = mul nsw i32 %conv50.us, %conv47.us
-  %add52.us = add i32 %add43.us, %mul51.us
-  %add53.us = add nuw nsw i32 %iv, 2
-  %cmp34.us = icmp slt i32 %add53.us, %d
-  br i1 %cmp34.us, label %for.body36.us, label %exit
-
-exit:
-  ret i32 %add52.us
-}
-
diff --git a/test/CodeGen/ARM/ParallelDSP/top-bottom.ll b/test/CodeGen/ARM/ParallelDSP/top-bottom.ll
deleted file mode 100644
index e82a5d4e1c9..00000000000
--- a/test/CodeGen/ARM/ParallelDSP/top-bottom.ll
+++ /dev/null
@@ -1,252 +0,0 @@
-; RUN: opt -mtriple=arm-arm-eabi -mcpu=cortex-m33 < %s -arm-parallel-dsp -S | FileCheck %s
-; RUN: opt -mtriple=thumbv7a-linux-android -arm-parallel-dsp -S %s -o - | FileCheck %s
-
-; CHECK-LABEL: topbottom_mul
-define void @topbottom_mul(i32 %N, i32* noalias nocapture readnone %Out, i16* nocapture readonly %In1, i16* nocapture readonly %In2) {
-entry:
-  br label %for.body
-
-; CHECK: for.body:
-; CHECK: [[Cast_PIn1_0:%[^ ]+]] = bitcast i16* %PIn1.0 to i32*
-; CHECK: [[PIn1_01:%[^ ]+]] = load i32, i32* [[Cast_PIn1_0]], align 2
-; CHECK: [[PIn1_1:%[^ ]+]] = ashr i32 [[PIn1_01]], 16
-; CHECK: [[PIn1_01_shl:%[^ ]+]] = shl i32 [[PIn1_01]], 16
-; CHECK: [[PIn1_0:%[^ ]+]] = ashr i32 [[PIn1_01_shl]], 16
-
-; CHECK: [[Cast_PIn2_0:%[^ ]+]] = bitcast i16* %PIn2.0 to i32*
-; CHECK: [[PIn2_01:%[^ ]+]] = load i32, i32* [[Cast_PIn2_0]], align 2
-; CHECK: [[PIn2_1:%[^ ]+]] = ashr i32 [[PIn2_01]], 16
-; CHECK: [[PIn2_01_shl:%[^ ]+]] = shl i32 [[PIn2_01]], 16
-; CHECK: [[PIn2_0:%[^ ]+]] = ashr i32 [[PIn2_01_shl]], 16
-
-; CHECK: mul nsw i32 [[PIn1_0]], [[PIn2_0]]
-; CHECK: mul nsw i32 [[PIn1_1]], [[PIn2_1]]
-
-; CHECK: [[Cast_PIn1_2:%[^ ]+]] = bitcast i16* %PIn1.2 to i32*
-; CHECK: [[PIn1_23:%[^ ]+]] = load i32, i32* [[Cast_PIn1_2]], align 2
-; CHECK: [[PIn1_3:%[^ ]+]] = ashr i32 [[PIn1_23]], 16
-; CHECK: [[PIn1_23_shl:%[^ ]+]] = shl i32 [[PIn1_23]], 16
-; CHECK: [[PIn1_2:%[^ ]+]] = ashr i32 [[PIn1_23_shl]], 16
-
-; CHECK: [[Cast_PIn2_2:%[^ ]+]] = bitcast i16* %PIn2.2 to i32*
-; CHECK: [[PIn2_23:%[^ ]+]] = load i32, i32* [[Cast_PIn2_2]], align 2
-; CHECK: [[PIn2_3:%[^ ]+]] = ashr i32 [[PIn2_23]], 16
-; CHECK: [[PIn2_23_shl:%[^ ]+]] = shl i32 [[PIn2_23]], 16
-; CHECK: [[PIn2_2:%[^ ]+]] = ashr i32 [[PIn2_23_shl]], 16
-
-; CHECK: mul nsw i32 [[PIn1_2]], [[PIn2_2]]
-; CHECK: mul nsw i32 [[PIn1_3]], [[PIn2_3]]
-
-for.body:
-  %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
-  %count = phi i32 [ %N, %entry ], [ %count.next, %for.body ]
-  %PIn1.0 = getelementptr inbounds i16, i16* %In1, i32 %iv
-  %In1.0 = load i16, i16* %PIn1.0, align 2
-  %SIn1.0 = sext i16 %In1.0 to i32
-  %PIn2.0 = getelementptr inbounds i16, i16* %In2, i32 %iv
-  %In2.0 = load i16, i16* %PIn2.0, align 2
-  %SIn2.0 = sext i16 %In2.0 to i32
-  %mul5.us.i.i = mul nsw i32 %SIn1.0, %SIn2.0
-  %Out.0 = getelementptr inbounds i32, i32* %Out, i32 %iv
-  store i32 %mul5.us.i.i, i32* %Out.0, align 4
-  %iv.1 = or i32 %iv, 1
-  %PIn1.1 = getelementptr inbounds i16, i16* %In1, i32 %iv.1
-  %In1.1 = load i16, i16* %PIn1.1, align 2
-  %SIn1.1 = sext i16 %In1.1 to i32
-  %PIn2.1 = getelementptr inbounds i16, i16* %In2, i32 %iv.1
-  %In2.1 = load i16, i16* %PIn2.1, align 2
-  %SIn2.1 = sext i16 %In2.1 to i32
-  %mul5.us.i.1.i = mul nsw i32 %SIn1.1, %SIn2.1
-  %Out.1 = getelementptr inbounds i32, i32* %Out, i32 %iv.1
-  store i32 %mul5.us.i.1.i, i32* %Out.1, align 4
-  %iv.2 = or i32 %iv, 2
-  %PIn1.2 = getelementptr inbounds i16, i16* %In1, i32 %iv.2
-  %In1.2 = load i16, i16* %PIn1.2, align 2
-  %SIn1.2 = sext i16 %In1.2 to i32
-  %PIn2.2 = getelementptr inbounds i16, i16* %In2, i32 %iv.2
-  %In2.2 = load i16, i16* %PIn2.2, align 2
-  %SIn2.2 = sext i16 %In2.2 to i32
-  %mul5.us.i.2.i = mul nsw i32 %SIn1.2, %SIn2.2
-  %Out.2 = getelementptr inbounds i32, i32* %Out, i32 %iv.2
-  store i32 %mul5.us.i.2.i, i32* %Out.2, align 4
-  %iv.3 = or i32 %iv, 3
-  %PIn1.3 = getelementptr inbounds i16, i16* %In1, i32 %iv.3
-  %In1.3 = load i16, i16* %PIn1.3, align 2
-  %SIn1.3 = sext i16 %In1.3 to i32
-  %PIn2.3 = getelementptr inbounds i16, i16* %In2, i32 %iv.3
-  %In2.3 = load i16, i16* %PIn2.3, align 2
-  %SIn2.3 = sext i16 %In2.3 to i32
-  %mul5.us.i.3.i = mul nsw i32 %SIn1.3, %SIn2.3
-  %Out.3 = getelementptr inbounds i32, i32* %Out, i32 %iv.3
-  store i32 %mul5.us.i.3.i, i32* %Out.3, align 4
-  %iv.next = add i32 %iv, 4
-  %count.next = add i32 %count, -4
-  %niter375.ncmp.3.i = icmp eq i32 %count.next, 0
-  br i1 %niter375.ncmp.3.i, label %exit, label %for.body
-
-exit:
-  ret void
-}
-
-; CHECK-LABEL: topbottom_mul_load_const
-define void @topbottom_mul_load_const(i32 %N, i32* noalias nocapture readnone %Out, i16* nocapture readonly %In, i16* %C) {
-entry:
-  %const = load i16, i16* %C
-  %conv4.i.i = sext i16 %const to i32
-  br label %for.body
-
-; CHECK: for.body:
-; CHECK: [[Cast_PIn_0:%[^ ]+]] = bitcast i16* %PIn.0 to i32*
-; CHECK: [[PIn_01:%[^ ]+]] = load i32, i32* [[Cast_PIn_0]], align 2
-; CHECK: [[PIn_1:%[^ ]+]] = ashr i32 [[PIn_01]], 16
-; CHECK: [[PIn_01_shl:%[^ ]+]] = shl i32 [[PIn_01]], 16
-; CHECK: [[PIn_0:%[^ ]+]] = ashr i32 [[PIn_01_shl]], 16
-
-; CHECK: mul nsw i32 [[PIn_0]], %conv4.i.i
-; CHECK: mul nsw i32 [[PIn_1]], %conv4.i.i
-
-; CHECK: [[Cast_PIn_2:%[^ ]+]] = bitcast i16* %PIn.2 to i32*
-; CHECK: [[PIn_23:%[^ ]+]] = load i32, i32* [[Cast_PIn_2]], align 2
-; CHECK: [[PIn_3:%[^ ]+]] = ashr i32 [[PIn_23]], 16
-; CHECK: [[PIn_23_shl:%[^ ]+]] = shl i32 [[PIn_23]], 16
-; CHECK: [[PIn_2:%[^ ]+]] = ashr i32 [[PIn_23_shl]], 16
-
-; CHECK: mul nsw i32 [[PIn_2]], %conv4.i.i
-; CHECK: mul nsw i32 [[PIn_3]], %conv4.i.i
-
-for.body:
-  %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
-  %count = phi i32 [ %N, %entry ], [ %count.next, %for.body ]
-  %PIn.0 = getelementptr inbounds i16, i16* %In, i32 %iv
-  %In.0 = load i16, i16* %PIn.0, align 2
-  %conv.us.i144.i = sext i16 %In.0 to i32
-  %mul5.us.i.i = mul nsw i32 %conv.us.i144.i, %conv4.i.i
-  %Out.0 = getelementptr inbounds i32, i32* %Out, i32 %iv
-  store i32 %mul5.us.i.i, i32* %Out.0, align 4
-  %iv.1 = or i32 %iv, 1
-  %PIn.1 = getelementptr inbounds i16, i16* %In, i32 %iv.1
-  %In.1 = load i16, i16* %PIn.1, align 2
-  %conv.us.i144.1.i = sext i16 %In.1 to i32
-  %mul5.us.i.1.i = mul nsw i32 %conv.us.i144.1.i, %conv4.i.i
-  %Out.1 = getelementptr inbounds i32, i32* %Out, i32 %iv.1
-  store i32 %mul5.us.i.1.i, i32* %Out.1, align 4
-  %iv.2 = or i32 %iv, 2
-  %PIn.2 = getelementptr inbounds i16, i16* %In, i32 %iv.2
-  %In.3 = load i16, i16* %PIn.2, align 2
-  %conv.us.i144.2.i = sext i16 %In.3 to i32
-  %mul5.us.i.2.i = mul nsw i32 %conv.us.i144.2.i, %conv4.i.i
-  %Out.2 = getelementptr inbounds i32, i32* %Out, i32 %iv.2
-  store i32 %mul5.us.i.2.i, i32* %Out.2, align 4
-  %iv.3 = or i32 %iv, 3
-  %PIn.3 = getelementptr inbounds i16, i16* %In, i32 %iv.3
-  %In.4 = load i16, i16* %PIn.3, align 2
-  %conv.us.i144.3.i = sext i16 %In.4 to i32
-  %mul5.us.i.3.i = mul nsw i32 %conv.us.i144.3.i, %conv4.i.i
-  %Out.3 = getelementptr inbounds i32, i32* %Out, i32 %iv.3
-  store i32 %mul5.us.i.3.i, i32* %Out.3, align 4
-  %iv.next = add i32 %iv, 4
-  %count.next = add i32 %count, -4
-  %niter375.ncmp.3.i = icmp eq i32 %count.next, 0
-  br i1 %niter375.ncmp.3.i, label %exit, label %for.body
-
-exit:
-  ret void
-}
-
-; CHECK-LABEL: topbottom_mul_64
-define void @topbottom_mul_64(i32 %N, i64* noalias nocapture readnone %Out, i16* nocapture readonly %In1, i16* nocapture readonly %In2) {
-entry:
-  br label %for.body
-
-; CHECK: for.body:
-; CHECK: [[Cast_PIn1_0:%[^ ]+]] = bitcast i16* %PIn1.0 to i32*
-; CHECK: [[PIn1_01:%[^ ]+]] = load i32, i32* [[Cast_PIn1_0]], align 2
-; CHECK: [[PIn1_1:%[^ ]+]] = ashr i32 [[PIn1_01]], 16
-; CHECK: [[PIn1_01_shl:%[^ ]+]] = shl i32 [[PIn1_01]], 16
-; CHECK: [[PIn1_0:%[^ ]+]] = ashr i32 [[PIn1_01_shl]], 16
-
-; CHECK: [[Cast_PIn2_0:%[^ ]+]] = bitcast i16* %PIn2.0 to i32*
-; CHECK: [[PIn2_01:%[^ ]+]] = load i32, i32* [[Cast_PIn2_0]], align 2
-; CHECK: [[PIn2_1:%[^ ]+]] = ashr i32 [[PIn2_01]], 16
-; CHECK: [[PIn2_01_shl:%[^ ]+]] = shl i32 [[PIn2_01]], 16
-; CHECK: [[PIn2_0:%[^ ]+]] = ashr i32 [[PIn2_01_shl]], 16
-
-; CHECK: [[Mul0:%[^ ]+]] = mul nsw i32 [[PIn1_0]], [[PIn2_0]]
-; CHECK: [[SMul0:%[^ ]+]] = sext i32 [[Mul0]] to i64
-; CHECK: [[Mul1:%[^ ]+]] = mul nsw i32 [[PIn1_1]], [[PIn2_1]]
-; CHECK: [[SMul1:%[^ ]+]] = sext i32 [[Mul1]] to i64
-; CHECK: add i64 [[SMul0]], [[SMul1]]
-
-; CHECK: [[Cast_PIn1_2:%[^ ]+]] = bitcast i16* %PIn1.2 to i32*
-; CHECK: [[PIn1_23:%[^ ]+]] = load i32, i32* [[Cast_PIn1_2]], align 2
-; CHECK: [[PIn1_3:%[^ ]+]] = ashr i32 [[PIn1_23]], 16
-; CHECK: [[PIn1_23_shl:%[^ ]+]] = shl i32 [[PIn1_23]], 16
-; CHECK: [[PIn1_2:%[^ ]+]] = ashr i32 [[PIn1_23_shl]], 16
-
-; CHECK: [[Cast_PIn2_2:%[^ ]+]] = bitcast i16* %PIn2.2 to i32*
-; CHECK: [[PIn2_23:%[^ ]+]] = load i32, i32* [[Cast_PIn2_2]], align 2
-; CHECK: [[PIn2_3:%[^ ]+]] = ashr i32 [[PIn2_23]], 16
-; CHECK: [[PIn2_23_shl:%[^ ]+]] = shl i32 [[PIn2_23]], 16
-; CHECK: [[PIn2_2:%[^ ]+]] = ashr i32 [[PIn2_23_shl]], 16
-
-; CHECK: [[Mul2:%[^ ]+]] = mul nsw i32 [[PIn1_2]], [[PIn2_2]]
-; CHECK: [[SMul2:%[^ ]+]] = sext i32 [[Mul2]] to i64
-; CHECK: [[Mul3:%[^ ]+]] = mul nsw i32 [[PIn1_3]], [[PIn2_3]]
-; CHECK: [[SMul3:%[^ ]+]] = sext i32 [[Mul3]] to i64
-; CHECK: add i64 [[SMul2]], [[SMul3]]
-
-for.body:
-  %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
-  %iv.out = phi i32 [ 0, %entry] , [ %iv.out.next, %for.body ]
-  %count = phi i32 [ %N, %entry ], [ %count.next, %for.body ]
-  %PIn1.0 = getelementptr inbounds i16, i16* %In1, i32 %iv
-  %In1.0 = load i16, i16* %PIn1.0, align 2
-  %SIn1.0 = sext i16 %In1.0 to i32
-  %PIn2.0 = getelementptr inbounds i16, i16* %In2, i32 %iv
-  %In2.0 = load i16, i16* %PIn2.0, align 2
-  %SIn2.0 = sext i16 %In2.0 to i32
-  %mul5.us.i.i = mul nsw i32 %SIn1.0, %SIn2.0
-  %sext.0 = sext i32 %mul5.us.i.i to i64
-  %iv.1 = or i32 %iv, 1
-  %PIn1.1 = getelementptr inbounds i16, i16* %In1, i32 %iv.1
-  %In1.1 = load i16, i16* %PIn1.1, align 2
-  %SIn1.1 = sext i16 %In1.1 to i32
-  %PIn2.1 = getelementptr inbounds i16, i16* %In2, i32 %iv.1
-  %In2.1 = load i16, i16* %PIn2.1, align 2
-  %SIn2.1 = sext i16 %In2.1 to i32
-  %mul5.us.i.1.i = mul nsw i32 %SIn1.1, %SIn2.1
-  %sext.1 = sext i32 %mul5.us.i.1.i to i64
-  %mac.0 = add i64 %sext.0, %sext.1
-  %Out.0 = getelementptr inbounds i64, i64* %Out, i32 %iv.out
-  store i64 %mac.0, i64* %Out.0, align 4
-  %iv.2 = or i32 %iv, 2
-  %PIn1.2 = getelementptr inbounds i16, i16* %In1, i32 %iv.2
-  %In1.2 = load i16, i16* %PIn1.2, align 2
-  %SIn1.2 = sext i16 %In1.2 to i32
-  %PIn2.2 = getelementptr inbounds i16, i16* %In2, i32 %iv.2
-  %In2.2 = load i16, i16* %PIn2.2, align 2
-  %SIn2.2 = sext i16 %In2.2 to i32
-  %mul5.us.i.2.i = mul nsw i32 %SIn1.2, %SIn2.2
-  %sext.2 = sext i32 %mul5.us.i.2.i to i64
-  %iv.3 = or i32 %iv, 3
-  %PIn1.3 = getelementptr inbounds i16, i16* %In1, i32 %iv.3
-  %In1.3 = load i16, i16* %PIn1.3, align 2
-  %SIn1.3 = sext i16 %In1.3 to i32
-  %PIn2.3 = getelementptr inbounds i16, i16* %In2, i32 %iv.3
-  %In2.3 = load i16, i16* %PIn2.3, align 2
-  %SIn2.3 = sext i16 %In2.3 to i32
-  %mul5.us.i.3.i = mul nsw i32 %SIn1.3, %SIn2.3
-  %sext.3 = sext i32 %mul5.us.i.3.i to i64
-  %mac.1 = add i64 %sext.2, %sext.3
-  %iv.out.1 = or i32 %iv.out, 1
-  %Out.1 = getelementptr inbounds i64, i64* %Out, i32 %iv.out.1
-  store i64 %mac.1, i64* %Out.1, align 4
-  %iv.next = add i32 %iv, 4
-  %iv.out.next = add i32 %iv.out, 2
-  %count.next = add i32 %count, -4
-  %niter375.ncmp.3.i = icmp eq i32 %count.next, 0
-  br i1 %niter375.ncmp.3.i, label %exit, label %for.body
-
-exit:
-  ret void
-}
diff --git a/test/CodeGen/ARM/ParallelDSP/smlad0.ll b/test/CodeGen/ARM/smlad0.ll
similarity index 100%
rename from test/CodeGen/ARM/ParallelDSP/smlad0.ll
rename to test/CodeGen/ARM/smlad0.ll
diff --git a/test/CodeGen/ARM/ParallelDSP/smlad1.ll b/test/CodeGen/ARM/smlad1.ll
similarity index 100%
rename from test/CodeGen/ARM/ParallelDSP/smlad1.ll
rename to test/CodeGen/ARM/smlad1.ll
diff --git a/test/CodeGen/ARM/ParallelDSP/smlad10.ll b/test/CodeGen/ARM/smlad10.ll
similarity index 100%
rename from test/CodeGen/ARM/ParallelDSP/smlad10.ll
rename to test/CodeGen/ARM/smlad10.ll
diff --git a/test/CodeGen/ARM/ParallelDSP/smlad11.ll b/test/CodeGen/ARM/smlad11.ll
similarity index 100%
rename from test/CodeGen/ARM/ParallelDSP/smlad11.ll
rename to test/CodeGen/ARM/smlad11.ll
diff --git a/test/CodeGen/ARM/ParallelDSP/smlad12.ll b/test/CodeGen/ARM/smlad12.ll
similarity index 100%
rename from test/CodeGen/ARM/ParallelDSP/smlad12.ll
rename to test/CodeGen/ARM/smlad12.ll
diff --git a/test/CodeGen/ARM/ParallelDSP/smlad2.ll b/test/CodeGen/ARM/smlad2.ll
similarity index 100%
rename from test/CodeGen/ARM/ParallelDSP/smlad2.ll
rename to test/CodeGen/ARM/smlad2.ll
diff --git a/test/CodeGen/ARM/ParallelDSP/smlad3.ll b/test/CodeGen/ARM/smlad3.ll
similarity index 100%
rename from test/CodeGen/ARM/ParallelDSP/smlad3.ll
rename to test/CodeGen/ARM/smlad3.ll
diff --git a/test/CodeGen/ARM/ParallelDSP/smlad4.ll b/test/CodeGen/ARM/smlad4.ll
similarity index 100%
rename from test/CodeGen/ARM/ParallelDSP/smlad4.ll
rename to test/CodeGen/ARM/smlad4.ll
diff --git a/test/CodeGen/ARM/ParallelDSP/smlad5.ll b/test/CodeGen/ARM/smlad5.ll
similarity index 100%
rename from test/CodeGen/ARM/ParallelDSP/smlad5.ll
rename to test/CodeGen/ARM/smlad5.ll
diff --git a/test/CodeGen/ARM/ParallelDSP/smlad6.ll b/test/CodeGen/ARM/smlad6.ll
similarity index 100%
rename from test/CodeGen/ARM/ParallelDSP/smlad6.ll
rename to test/CodeGen/ARM/smlad6.ll
diff --git a/test/CodeGen/ARM/ParallelDSP/smlad7.ll b/test/CodeGen/ARM/smlad7.ll
similarity index 100%
rename from test/CodeGen/ARM/ParallelDSP/smlad7.ll
rename to test/CodeGen/ARM/smlad7.ll
diff --git a/test/CodeGen/ARM/ParallelDSP/smlad8.ll b/test/CodeGen/ARM/smlad8.ll
similarity index 100%
rename from test/CodeGen/ARM/ParallelDSP/smlad8.ll
rename to test/CodeGen/ARM/smlad8.ll
diff --git a/test/CodeGen/ARM/ParallelDSP/smlad9.ll b/test/CodeGen/ARM/smlad9.ll
similarity index 100%
rename from test/CodeGen/ARM/ParallelDSP/smlad9.ll
rename to test/CodeGen/ARM/smlad9.ll
diff --git a/test/CodeGen/ARM/ParallelDSP/smladx-1.ll b/test/CodeGen/ARM/smladx-1.ll
similarity index 100%
rename from test/CodeGen/ARM/ParallelDSP/smladx-1.ll
rename to test/CodeGen/ARM/smladx-1.ll
diff --git a/test/CodeGen/ARM/ParallelDSP/smlald0.ll b/test/CodeGen/ARM/smlald0.ll
similarity index 100%
rename from test/CodeGen/ARM/ParallelDSP/smlald0.ll
rename to test/CodeGen/ARM/smlald0.ll
diff --git a/test/CodeGen/ARM/ParallelDSP/smlald1.ll b/test/CodeGen/ARM/smlald1.ll
similarity index 100%
rename from test/CodeGen/ARM/ParallelDSP/smlald1.ll
rename to test/CodeGen/ARM/smlald1.ll
diff --git a/test/CodeGen/ARM/ParallelDSP/smlald2.ll b/test/CodeGen/ARM/smlald2.ll
similarity index 100%
rename from test/CodeGen/ARM/ParallelDSP/smlald2.ll
rename to test/CodeGen/ARM/smlald2.ll
diff --git a/test/CodeGen/ARM/ParallelDSP/smlaldx-1.ll b/test/CodeGen/ARM/smlaldx-1.ll
similarity index 100%
rename from test/CodeGen/ARM/ParallelDSP/smlaldx-1.ll
rename to test/CodeGen/ARM/smlaldx-1.ll
diff --git a/test/CodeGen/ARM/ParallelDSP/smlaldx-2.ll b/test/CodeGen/ARM/smlaldx-2.ll
similarity index 100%
rename from test/CodeGen/ARM/ParallelDSP/smlaldx-2.ll
rename to test/CodeGen/ARM/smlaldx-2.ll
-- 
GitLab


From 4aeb7d033aef4e4345e52330160ea21ade905859 Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin@google.com>
Date: Thu, 18 Oct 2018 19:49:44 +0000
Subject: [PATCH 0307/1116] Make Function::getInstructionCount const

Summary: Function::getInstructionCount can be const.

Reviewers: davidxl, paquette

Reviewed By: davidxl

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D53378

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344754 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/IR/Function.h | 2 +-
 lib/IR/Function.cpp        | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/llvm/IR/Function.h b/include/llvm/IR/Function.h
index 1b91537c5d9..630f47e8bb5 100644
--- a/include/llvm/IR/Function.h
+++ b/include/llvm/IR/Function.h
@@ -158,7 +158,7 @@ public:
   /// Returns the number of non-debug IR instructions in this function.
   /// This is equivalent to the sum of the sizes of each basic block contained
   /// within this function.
-  unsigned getInstructionCount();
+  unsigned getInstructionCount() const;
 
   /// Returns the FunctionType for me.
   FunctionType *getFunctionType() const {
diff --git a/lib/IR/Function.cpp b/lib/IR/Function.cpp
index 36ba8d0721f..ec094812ceb 100644
--- a/lib/IR/Function.cpp
+++ b/lib/IR/Function.cpp
@@ -195,9 +195,9 @@ LLVMContext &Function::getContext() const {
   return getType()->getContext();
 }
 
-unsigned Function::getInstructionCount() {
+unsigned Function::getInstructionCount() const {
   unsigned NumInstrs = 0;
-  for (BasicBlock &BB : BasicBlocks)
+  for (const BasicBlock &BB : BasicBlocks)
     NumInstrs += std::distance(BB.instructionsWithoutDebug().begin(),
                                BB.instructionsWithoutDebug().end());
   return NumInstrs;
-- 
GitLab


From 7d144ecd33f0186b947dcb700d55ae4ee14f7c74 Mon Sep 17 00:00:00 2001
From: Sylvestre Ledru <sylvestre@debian.org>
Date: Thu, 18 Oct 2018 20:07:44 +0000
Subject: [PATCH 0308/1116] Support of hurd in llvm-shlib

Svante Signell


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344756 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-shlib/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/llvm-shlib/CMakeLists.txt b/tools/llvm-shlib/CMakeLists.txt
index dab1893c687..187066e5ded 100644
--- a/tools/llvm-shlib/CMakeLists.txt
+++ b/tools/llvm-shlib/CMakeLists.txt
@@ -44,6 +44,7 @@ if(LLVM_BUILD_LLVM_DYLIB)
   list(REMOVE_DUPLICATES LIB_NAMES)
   if(("${CMAKE_SYSTEM_NAME}" STREQUAL "Linux") OR (MINGW) OR (HAIKU)
      OR ("${CMAKE_SYSTEM_NAME}" STREQUAL "FreeBSD")
+     OR ("${CMAKE_SYSTEM_NAME}" STREQUAL "GNU")
      OR ("${CMAKE_SYSTEM_NAME}" STREQUAL "OpenBSD")
      OR ("${CMAKE_SYSTEM_NAME}" STREQUAL "Fuchsia")
      OR ("${CMAKE_SYSTEM_NAME}" STREQUAL "DragonFly")
-- 
GitLab


From 356cab04d8da672ee39a6527cd04f59ffae59fe1 Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Thu, 18 Oct 2018 22:42:32 +0000
Subject: [PATCH 0309/1116] [ORC] Add a createJITDylib method to LLJIT.

Because I'm about to get on stage at the dev meeting and claim that it exists.

This method creates a JITDylib instance with the given name and returns a
reference to it.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344763 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/ExecutionEngine/Orc/LLJIT.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/include/llvm/ExecutionEngine/Orc/LLJIT.h b/include/llvm/ExecutionEngine/Orc/LLJIT.h
index 8b6465e1f02..b7ef8834706 100644
--- a/include/llvm/ExecutionEngine/Orc/LLJIT.h
+++ b/include/llvm/ExecutionEngine/Orc/LLJIT.h
@@ -49,6 +49,11 @@ public:
   /// Returns a reference to the JITDylib representing the JIT'd main program.
   JITDylib &getMainJITDylib() { return Main; }
 
+  /// Create a new JITDylib with the given name and return a reference to it.
+  JITDylib &createJITDylib(std::string Name) {
+    return ES->createJITDylib(std::move(Name));
+  }
+
   /// Convenience method for defining an absolute symbol.
   Error defineAbsolute(StringRef Name, JITEvaluatedSymbol Address);
 
-- 
GitLab


From 8a6d7347bab8107e51ab5e3f8293102736d5e72f Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Thu, 18 Oct 2018 23:03:55 +0000
Subject: [PATCH 0310/1116] [TI removal] Update the C API for the move away
 from `TerminatorInst`.

This updates the C API for the removal of `TerminatorInst`. It converts
the type query to a predicate query and moves the generic methods to
work on `Instruction` instances that satisfy this predicate rather than
requiring a specific type. It also clarifies that the C API wrapping
`BasicBlock::getTerminator` just returns an `Instruction`. Because this
was always wrapped opaquely as a value and the functions consuming these
values will work on `Instruction` objects, this shouldn't break any
clients.

This is a completely compatible change to the C API.

Differential Revision: https://reviews.llvm.org/D52968

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344764 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm-c/Core.h | 40 ++++++++++++++++++++++++----------------
 lib/IR/Core.cpp       | 11 ++++++++---
 2 files changed, 32 insertions(+), 19 deletions(-)

diff --git a/include/llvm-c/Core.h b/include/llvm-c/Core.h
index 2e8c29c23bf..0c274b62567 100644
--- a/include/llvm-c/Core.h
+++ b/include/llvm-c/Core.h
@@ -1539,16 +1539,15 @@ LLVMTypeRef LLVMX86MMXType(void);
       macro(SelectInst)                     \
       macro(ShuffleVectorInst)              \
       macro(StoreInst)                      \
-      macro(TerminatorInst)                 \
-        macro(BranchInst)                   \
-        macro(IndirectBrInst)               \
-        macro(InvokeInst)                   \
-        macro(ReturnInst)                   \
-        macro(SwitchInst)                   \
-        macro(UnreachableInst)              \
-        macro(ResumeInst)                   \
-        macro(CleanupReturnInst)            \
-        macro(CatchReturnInst)              \
+      macro(BranchInst)                     \
+      macro(IndirectBrInst)                 \
+      macro(InvokeInst)                     \
+      macro(ReturnInst)                     \
+      macro(SwitchInst)                     \
+      macro(UnreachableInst)                \
+      macro(ResumeInst)                     \
+      macro(CleanupReturnInst)              \
+      macro(CatchReturnInst)                \
       macro(FuncletPadInst)                 \
         macro(CatchPadInst)                 \
         macro(CleanupPadInst)               \
@@ -2679,7 +2678,7 @@ LLVMValueRef LLVMGetBasicBlockParent(LLVMBasicBlockRef BB);
  * If the basic block does not have a terminator (it is not well-formed
  * if it doesn't), then NULL is returned.
  *
- * The returned LLVMValueRef corresponds to a llvm::TerminatorInst.
+ * The returned LLVMValueRef corresponds to an llvm::Instruction.
  *
  * @see llvm::BasicBlock::getTerminator()
  */
@@ -2951,6 +2950,15 @@ LLVMRealPredicate LLVMGetFCmpPredicate(LLVMValueRef Inst);
  */
 LLVMValueRef LLVMInstructionClone(LLVMValueRef Inst);
 
+/**
+ * Determine whether an instruction is a terminator. This routine is named to
+ * be compatible with historical functions that did this by querying the
+ * underlying C++ type.
+ *
+ * @see llvm::Instruction::isTerminator()
+ */
+LLVMValueRef LLVMIsATerminatorInst(LLVMValueRef Inst);
+
 /**
  * @defgroup LLVMCCoreValueInstructionCall Call Sites and Invocations
  *
@@ -3091,8 +3099,8 @@ void LLVMSetUnwindDest(LLVMValueRef InvokeInst, LLVMBasicBlockRef B);
 /**
  * @defgroup LLVMCCoreValueInstructionTerminator Terminators
  *
- * Functions in this group only apply to instructions that map to
- * llvm::TerminatorInst instances.
+ * Functions in this group only apply to instructions for which
+ * LLVMIsATerminatorInst returns true.
  *
  * @{
  */
@@ -3100,21 +3108,21 @@ void LLVMSetUnwindDest(LLVMValueRef InvokeInst, LLVMBasicBlockRef B);
 /**
  * Return the number of successors that this terminator has.
  *
- * @see llvm::TerminatorInst::getNumSuccessors
+ * @see llvm::Instruction::getNumSuccessors
  */
 unsigned LLVMGetNumSuccessors(LLVMValueRef Term);
 
 /**
  * Return the specified successor.
  *
- * @see llvm::TerminatorInst::getSuccessor
+ * @see llvm::Instruction::getSuccessor
  */
 LLVMBasicBlockRef LLVMGetSuccessor(LLVMValueRef Term, unsigned i);
 
 /**
  * Update the specified successor to point at the provided block.
  *
- * @see llvm::TerminatorInst::setSuccessor
+ * @see llvm::Instruction::setSuccessor
  */
 void LLVMSetSuccessor(LLVMValueRef Term, unsigned i, LLVMBasicBlockRef block);
 
diff --git a/lib/IR/Core.cpp b/lib/IR/Core.cpp
index 639b6b4489a..27906e68636 100644
--- a/lib/IR/Core.cpp
+++ b/lib/IR/Core.cpp
@@ -2595,6 +2595,11 @@ LLVMValueRef LLVMInstructionClone(LLVMValueRef Inst) {
   return nullptr;
 }
 
+LLVMValueRef LLVMIsATerminatorInst(LLVMValueRef Inst) {
+  Instruction *I = dyn_cast<Instruction>(unwrap(Inst));
+  return (I && I->isTerminator()) ? wrap(I) : nullptr;
+}
+
 unsigned LLVMGetNumArgOperands(LLVMValueRef Instr) {
   if (FuncletPadInst *FPI = dyn_cast<FuncletPadInst>(unwrap(Instr))) {
     return FPI->getNumArgOperands();
@@ -2710,15 +2715,15 @@ void LLVMSetUnwindDest(LLVMValueRef Invoke, LLVMBasicBlockRef B) {
 /*--.. Operations on terminators ...........................................--*/
 
 unsigned LLVMGetNumSuccessors(LLVMValueRef Term) {
-  return unwrap<TerminatorInst>(Term)->getNumSuccessors();
+  return unwrap<Instruction>(Term)->getNumSuccessors();
 }
 
 LLVMBasicBlockRef LLVMGetSuccessor(LLVMValueRef Term, unsigned i) {
-  return wrap(unwrap<TerminatorInst>(Term)->getSuccessor(i));
+  return wrap(unwrap<Instruction>(Term)->getSuccessor(i));
 }
 
 void LLVMSetSuccessor(LLVMValueRef Term, unsigned i, LLVMBasicBlockRef block) {
-  return unwrap<TerminatorInst>(Term)->setSuccessor(i,unwrap(block));
+  return unwrap<Instruction>(Term)->setSuccessor(i, unwrap(block));
 }
 
 /*--.. Operations on branch instructions (only) ............................--*/
-- 
GitLab


From 39edf631ef871343d29a6ad450c4c40add9f188e Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Fri, 19 Oct 2018 00:22:10 +0000
Subject: [PATCH 0311/1116] [TI removal] Switch some newly added code over to
 use `Instruction` directly.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344768 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/DivergenceAnalysis.h     |  4 ++--
 include/llvm/Analysis/SyncDependenceAnalysis.h |  6 ++----
 lib/Analysis/DivergenceAnalysis.cpp            | 11 +++++------
 lib/Analysis/SyncDependenceAnalysis.cpp        |  4 ++--
 4 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/include/llvm/Analysis/DivergenceAnalysis.h b/include/llvm/Analysis/DivergenceAnalysis.h
index 356c144e7e5..9fadf52288b 100644
--- a/include/llvm/Analysis/DivergenceAnalysis.h
+++ b/include/llvm/Analysis/DivergenceAnalysis.h
@@ -80,7 +80,7 @@ public:
   void print(raw_ostream &OS, const Module *) const;
 
 private:
-  bool updateTerminator(const TerminatorInst &Term) const;
+  bool updateTerminator(const Instruction &Term) const;
   bool updatePHINode(const PHINode &Phi) const;
 
   /// \brief Computes whether \p Inst is divergent based on the
@@ -135,7 +135,7 @@ private:
 
   /// \brief Propagate induced value divergence due to control divergence in \p
   /// Term.
-  void propagateBranchDivergence(const TerminatorInst &Term);
+  void propagateBranchDivergence(const Instruction &Term);
 
   /// \brief Propagate divergent caused by a divergent loop exit.
   ///
diff --git a/include/llvm/Analysis/SyncDependenceAnalysis.h b/include/llvm/Analysis/SyncDependenceAnalysis.h
index f464c4d3e9e..df693d9d8e8 100644
--- a/include/llvm/Analysis/SyncDependenceAnalysis.h
+++ b/include/llvm/Analysis/SyncDependenceAnalysis.h
@@ -29,8 +29,6 @@ class BasicBlock;
 class DominatorTree;
 class Loop;
 class PostDominatorTree;
-class TerminatorInst;
-class TerminatorInst;
 
 using ConstBlockSet = SmallPtrSet<const BasicBlock *, 4>;
 
@@ -59,7 +57,7 @@ public:
   /// header. Those exit blocks are added to the returned set.
   /// If L is the parent loop of \p Term and an exit of L is in the returned
   /// set then L is a divergent loop.
-  const ConstBlockSet &join_blocks(const TerminatorInst &Term);
+  const ConstBlockSet &join_blocks(const Instruction &Term);
 
   /// \brief Computes divergent join points and loop exits (in the surrounding
   /// loop) caused by the divergent loop exits of\p Loop.
@@ -79,7 +77,7 @@ private:
   const LoopInfo &LI;
 
   std::map<const Loop *, std::unique_ptr<ConstBlockSet>> CachedLoopExitJoins;
-  std::map<const TerminatorInst *, std::unique_ptr<ConstBlockSet>>
+  std::map<const Instruction *, std::unique_ptr<ConstBlockSet>>
       CachedBranchJoins;
 };
 
diff --git a/lib/Analysis/DivergenceAnalysis.cpp b/lib/Analysis/DivergenceAnalysis.cpp
index 9453f680110..de47445c5e0 100644
--- a/lib/Analysis/DivergenceAnalysis.cpp
+++ b/lib/Analysis/DivergenceAnalysis.cpp
@@ -108,7 +108,7 @@ void DivergenceAnalysis::addUniformOverride(const Value &UniVal) {
   UniformOverrides.insert(&UniVal);
 }
 
-bool DivergenceAnalysis::updateTerminator(const TerminatorInst &Term) const {
+bool DivergenceAnalysis::updateTerminator(const Instruction &Term) const {
   if (Term.getNumSuccessors() <= 1)
     return false;
   if (auto *BranchTerm = dyn_cast<BranchInst>(&Term)) {
@@ -297,7 +297,7 @@ bool DivergenceAnalysis::propagateJoinDivergence(const BasicBlock &JoinBlock,
   return false;
 }
 
-void DivergenceAnalysis::propagateBranchDivergence(const TerminatorInst &Term) {
+void DivergenceAnalysis::propagateBranchDivergence(const Instruction &Term) {
   LLVM_DEBUG(dbgs() << "propBranchDiv " << Term.getParent()->getName() << "\n");
 
   markDivergent(Term);
@@ -380,11 +380,10 @@ void DivergenceAnalysis::compute() {
       continue;
 
     // propagate divergence caused by terminator
-    if (isa<TerminatorInst>(I)) {
-      auto &Term = cast<TerminatorInst>(I);
-      if (updateTerminator(Term)) {
+    if (I.isTerminator()) {
+      if (updateTerminator(I)) {
         // propagate control divergence to affected instructions
-        propagateBranchDivergence(Term);
+        propagateBranchDivergence(I);
         continue;
       }
     }
diff --git a/lib/Analysis/SyncDependenceAnalysis.cpp b/lib/Analysis/SyncDependenceAnalysis.cpp
index 9c40ffe0cc7..e1a7e4476d1 100644
--- a/lib/Analysis/SyncDependenceAnalysis.cpp
+++ b/lib/Analysis/SyncDependenceAnalysis.cpp
@@ -208,7 +208,7 @@ struct DivergencePropagator {
   }
 
   // find all blocks reachable by two disjoint paths from @rootTerm.
-  // This method works for both divergent TerminatorInsts and loops with
+  // This method works for both divergent terminators and loops with
   // divergent exits.
   // @rootBlock is either the block containing the branch or the header of the
   // divergent loop.
@@ -355,7 +355,7 @@ const ConstBlockSet &SyncDependenceAnalysis::join_blocks(const Loop &Loop) {
 }
 
 const ConstBlockSet &
-SyncDependenceAnalysis::join_blocks(const TerminatorInst &Term) {
+SyncDependenceAnalysis::join_blocks(const Instruction &Term) {
   // trivial case
   if (Term.getNumSuccessors() < 1) {
     return EmptyBlockSet;
-- 
GitLab


From 09ebf7a44c378e6f7aea41e497d9b725d394def8 Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Fri, 19 Oct 2018 00:22:37 +0000
Subject: [PATCH 0312/1116] [TI removal] Remove `TerminatorInst` from the IR
 type system!

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344769 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/IR/InstrTypes.h   |  28 ------
 include/llvm/IR/Instructions.h |  34 +++-----
 lib/IR/Instructions.cpp        | 153 ++++++++++++++++-----------------
 3 files changed, 84 insertions(+), 131 deletions(-)

diff --git a/include/llvm/IR/InstrTypes.h b/include/llvm/IR/InstrTypes.h
index 95cdb70e4fd..e42bfc3afce 100644
--- a/include/llvm/IR/InstrTypes.h
+++ b/include/llvm/IR/InstrTypes.h
@@ -45,34 +45,6 @@
 
 namespace llvm {
 
-//===----------------------------------------------------------------------===//
-//                            TerminatorInst Class
-//===----------------------------------------------------------------------===//
-
-/// Subclasses of this class are all able to terminate a basic
-/// block. Thus, these are all the flow control type of operations.
-///
-class TerminatorInst : public Instruction {
-protected:
-  TerminatorInst(Type *Ty, Instruction::TermOps iType,
-                 Use *Ops, unsigned NumOps,
-                 Instruction *InsertBefore = nullptr)
-    : Instruction(Ty, iType, Ops, NumOps, InsertBefore) {}
-
-  TerminatorInst(Type *Ty, Instruction::TermOps iType,
-                 Use *Ops, unsigned NumOps, BasicBlock *InsertAtEnd)
-    : Instruction(Ty, iType, Ops, NumOps, InsertAtEnd) {}
-
-public:
-  // Methods for support type inquiry through isa, cast, and dyn_cast:
-  static bool classof(const Instruction *I) {
-    return I->isTerminator();
-  }
-  static bool classof(const Value *V) {
-    return isa<Instruction>(V) && classof(cast<Instruction>(V));
-  }
-};
-
 //===----------------------------------------------------------------------===//
 //                          UnaryInstruction Class
 //===----------------------------------------------------------------------===//
diff --git a/include/llvm/IR/Instructions.h b/include/llvm/IR/Instructions.h
index 8bdc935425d..faea2973773 100644
--- a/include/llvm/IR/Instructions.h
+++ b/include/llvm/IR/Instructions.h
@@ -1357,8 +1357,6 @@ class InvokeInst;
 
 template <class T> struct CallBaseParent { using type = Instruction; };
 
-template <> struct CallBaseParent<InvokeInst> { using type = TerminatorInst; };
-
 //===----------------------------------------------------------------------===//
 /// Base class for all callable instructions (InvokeInst and CallInst)
 /// Holds everything related to calling a function, abstracting from the base
@@ -3265,7 +3263,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(LandingPadInst, Value)
 /// Return a value (possibly void), from a function.  Execution
 /// does not continue in this function any longer.
 ///
-class ReturnInst : public TerminatorInst {
+class ReturnInst : public Instruction {
   ReturnInst(const ReturnInst &RI);
 
 private:
@@ -3325,8 +3323,6 @@ public:
   }
 
 private:
-  friend TerminatorInst;
-
   BasicBlock *getSuccessor(unsigned idx) const {
     llvm_unreachable("ReturnInst has no successors!");
   }
@@ -3349,7 +3345,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ReturnInst, Value)
 //===---------------------------------------------------------------------------
 /// Conditional or Unconditional Branch instruction.
 ///
-class BranchInst : public TerminatorInst {
+class BranchInst : public Instruction {
   /// Ops list - Branches are strange.  The operands are ordered:
   ///  [Cond, FalseDest,] TrueDest.  This makes some accessors faster because
   /// they don't have to check for cond/uncond branchness. These are mostly
@@ -3493,7 +3489,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(BranchInst, Value)
 //===---------------------------------------------------------------------------
 /// Multiway switch
 ///
-class SwitchInst : public TerminatorInst {
+class SwitchInst : public Instruction {
   unsigned ReservedSpace;
 
   // Operand[0]    = Value to switch on
@@ -3576,7 +3572,7 @@ public:
     /// Returns number of current case.
     unsigned getCaseIndex() const { return Index; }
 
-    /// Returns TerminatorInst's successor index for current case successor.
+    /// Returns successor index for current case successor.
     unsigned getSuccessorIndex() const {
       assert(((unsigned)Index == DefaultPseudoIndex ||
               (unsigned)Index < SI->getNumCases()) &&
@@ -3632,7 +3628,7 @@ public:
     CaseIteratorImpl(SwitchInstT *SI, unsigned CaseNum) : Case(SI, CaseNum) {}
 
     /// Initializes case iterator for given SwitchInst and for given
-    /// TerminatorInst's successor index.
+    /// successor index.
     static CaseIteratorImpl fromSuccessorIndex(SwitchInstT *SI,
                                                unsigned SuccessorIndex) {
       assert(SuccessorIndex < SI->getNumSuccessors() &&
@@ -3850,7 +3846,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(SwitchInst, Value)
 //===---------------------------------------------------------------------------
 /// Indirect Branch Instruction.
 ///
-class IndirectBrInst : public TerminatorInst {
+class IndirectBrInst : public Instruction {
   unsigned ReservedSpace;
 
   // Operand[0]   = Address to jump to
@@ -4226,7 +4222,7 @@ InvokeInst::InvokeInst(Value *Func, BasicBlock *IfNormal,
 //===---------------------------------------------------------------------------
 /// Resume the propagation of an exception.
 ///
-class ResumeInst : public TerminatorInst {
+class ResumeInst : public Instruction {
   ResumeInst(const ResumeInst &RI);
 
   explicit ResumeInst(Value *Exn, Instruction *InsertBefore=nullptr);
@@ -4264,8 +4260,6 @@ public:
   }
 
 private:
-  friend TerminatorInst;
-
   BasicBlock *getSuccessor(unsigned idx) const {
     llvm_unreachable("ResumeInst has no successors!");
   }
@@ -4285,7 +4279,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ResumeInst, Value)
 //===----------------------------------------------------------------------===//
 //                         CatchSwitchInst Class
 //===----------------------------------------------------------------------===//
-class CatchSwitchInst : public TerminatorInst {
+class CatchSwitchInst : public Instruction {
   /// The number of operands actually allocated.  NumOperands is
   /// the number actually in use.
   unsigned ReservedSpace;
@@ -4551,7 +4545,7 @@ public:
 //                               CatchReturnInst Class
 //===----------------------------------------------------------------------===//
 
-class CatchReturnInst : public TerminatorInst {
+class CatchReturnInst : public Instruction {
   CatchReturnInst(const CatchReturnInst &RI);
   CatchReturnInst(Value *CatchPad, BasicBlock *BB, Instruction *InsertBefore);
   CatchReturnInst(Value *CatchPad, BasicBlock *BB, BasicBlock *InsertAtEnd);
@@ -4611,8 +4605,6 @@ public:
   }
 
 private:
-  friend TerminatorInst;
-
   BasicBlock *getSuccessor(unsigned Idx) const {
     assert(Idx < getNumSuccessors() && "Successor # out of range for catchret!");
     return getSuccessor();
@@ -4634,7 +4626,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(CatchReturnInst, Value)
 //                               CleanupReturnInst Class
 //===----------------------------------------------------------------------===//
 
-class CleanupReturnInst : public TerminatorInst {
+class CleanupReturnInst : public Instruction {
 private:
   CleanupReturnInst(const CleanupReturnInst &RI);
   CleanupReturnInst(Value *CleanupPad, BasicBlock *UnwindBB, unsigned Values,
@@ -4707,8 +4699,6 @@ public:
   }
 
 private:
-  friend TerminatorInst;
-
   BasicBlock *getSuccessor(unsigned Idx) const {
     assert(Idx == 0);
     return getUnwindDest();
@@ -4741,7 +4731,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(CleanupReturnInst, Value)
 /// presence of this instruction indicates some higher level knowledge that the
 /// end of the block cannot be reached.
 ///
-class UnreachableInst : public TerminatorInst {
+class UnreachableInst : public Instruction {
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
@@ -4768,8 +4758,6 @@ public:
   }
 
 private:
-  friend TerminatorInst;
-
   BasicBlock *getSuccessor(unsigned idx) const {
     llvm_unreachable("UnreachableInst has no successors!");
   }
diff --git a/lib/IR/Instructions.cpp b/lib/IR/Instructions.cpp
index 126a96635ee..e1d1c0f2a6b 100644
--- a/lib/IR/Instructions.cpp
+++ b/lib/IR/Instructions.cpp
@@ -622,55 +622,53 @@ LandingPadInst *InvokeInst::getLandingPadInst() const {
 //===----------------------------------------------------------------------===//
 
 ReturnInst::ReturnInst(const ReturnInst &RI)
-  : TerminatorInst(Type::getVoidTy(RI.getContext()), Instruction::Ret,
-                   OperandTraits<ReturnInst>::op_end(this) -
-                     RI.getNumOperands(),
-                   RI.getNumOperands()) {
+    : Instruction(Type::getVoidTy(RI.getContext()), Instruction::Ret,
+                  OperandTraits<ReturnInst>::op_end(this) - RI.getNumOperands(),
+                  RI.getNumOperands()) {
   if (RI.getNumOperands())
     Op<0>() = RI.Op<0>();
   SubclassOptionalData = RI.SubclassOptionalData;
 }
 
 ReturnInst::ReturnInst(LLVMContext &C, Value *retVal, Instruction *InsertBefore)
-  : TerminatorInst(Type::getVoidTy(C), Instruction::Ret,
-                   OperandTraits<ReturnInst>::op_end(this) - !!retVal, !!retVal,
-                   InsertBefore) {
+    : Instruction(Type::getVoidTy(C), Instruction::Ret,
+                  OperandTraits<ReturnInst>::op_end(this) - !!retVal, !!retVal,
+                  InsertBefore) {
   if (retVal)
     Op<0>() = retVal;
 }
 
 ReturnInst::ReturnInst(LLVMContext &C, Value *retVal, BasicBlock *InsertAtEnd)
-  : TerminatorInst(Type::getVoidTy(C), Instruction::Ret,
-                   OperandTraits<ReturnInst>::op_end(this) - !!retVal, !!retVal,
-                   InsertAtEnd) {
+    : Instruction(Type::getVoidTy(C), Instruction::Ret,
+                  OperandTraits<ReturnInst>::op_end(this) - !!retVal, !!retVal,
+                  InsertAtEnd) {
   if (retVal)
     Op<0>() = retVal;
 }
 
 ReturnInst::ReturnInst(LLVMContext &Context, BasicBlock *InsertAtEnd)
-  : TerminatorInst(Type::getVoidTy(Context), Instruction::Ret,
-                   OperandTraits<ReturnInst>::op_end(this), 0, InsertAtEnd) {
-}
+    : Instruction(Type::getVoidTy(Context), Instruction::Ret,
+                  OperandTraits<ReturnInst>::op_end(this), 0, InsertAtEnd) {}
 
 //===----------------------------------------------------------------------===//
 //                        ResumeInst Implementation
 //===----------------------------------------------------------------------===//
 
 ResumeInst::ResumeInst(const ResumeInst &RI)
-  : TerminatorInst(Type::getVoidTy(RI.getContext()), Instruction::Resume,
-                   OperandTraits<ResumeInst>::op_begin(this), 1) {
+    : Instruction(Type::getVoidTy(RI.getContext()), Instruction::Resume,
+                  OperandTraits<ResumeInst>::op_begin(this), 1) {
   Op<0>() = RI.Op<0>();
 }
 
 ResumeInst::ResumeInst(Value *Exn, Instruction *InsertBefore)
-  : TerminatorInst(Type::getVoidTy(Exn->getContext()), Instruction::Resume,
-                   OperandTraits<ResumeInst>::op_begin(this), 1, InsertBefore) {
+    : Instruction(Type::getVoidTy(Exn->getContext()), Instruction::Resume,
+                  OperandTraits<ResumeInst>::op_begin(this), 1, InsertBefore) {
   Op<0>() = Exn;
 }
 
 ResumeInst::ResumeInst(Value *Exn, BasicBlock *InsertAtEnd)
-  : TerminatorInst(Type::getVoidTy(Exn->getContext()), Instruction::Resume,
-                   OperandTraits<ResumeInst>::op_begin(this), 1, InsertAtEnd) {
+    : Instruction(Type::getVoidTy(Exn->getContext()), Instruction::Resume,
+                  OperandTraits<ResumeInst>::op_begin(this), 1, InsertAtEnd) {
   Op<0>() = Exn;
 }
 
@@ -679,10 +677,10 @@ ResumeInst::ResumeInst(Value *Exn, BasicBlock *InsertAtEnd)
 //===----------------------------------------------------------------------===//
 
 CleanupReturnInst::CleanupReturnInst(const CleanupReturnInst &CRI)
-    : TerminatorInst(CRI.getType(), Instruction::CleanupRet,
-                     OperandTraits<CleanupReturnInst>::op_end(this) -
-                         CRI.getNumOperands(),
-                     CRI.getNumOperands()) {
+    : Instruction(CRI.getType(), Instruction::CleanupRet,
+                  OperandTraits<CleanupReturnInst>::op_end(this) -
+                      CRI.getNumOperands(),
+                  CRI.getNumOperands()) {
   setInstructionSubclassData(CRI.getSubclassDataFromInstruction());
   Op<0>() = CRI.Op<0>();
   if (CRI.hasUnwindDest())
@@ -700,19 +698,19 @@ void CleanupReturnInst::init(Value *CleanupPad, BasicBlock *UnwindBB) {
 
 CleanupReturnInst::CleanupReturnInst(Value *CleanupPad, BasicBlock *UnwindBB,
                                      unsigned Values, Instruction *InsertBefore)
-    : TerminatorInst(Type::getVoidTy(CleanupPad->getContext()),
-                     Instruction::CleanupRet,
-                     OperandTraits<CleanupReturnInst>::op_end(this) - Values,
-                     Values, InsertBefore) {
+    : Instruction(Type::getVoidTy(CleanupPad->getContext()),
+                  Instruction::CleanupRet,
+                  OperandTraits<CleanupReturnInst>::op_end(this) - Values,
+                  Values, InsertBefore) {
   init(CleanupPad, UnwindBB);
 }
 
 CleanupReturnInst::CleanupReturnInst(Value *CleanupPad, BasicBlock *UnwindBB,
                                      unsigned Values, BasicBlock *InsertAtEnd)
-    : TerminatorInst(Type::getVoidTy(CleanupPad->getContext()),
-                     Instruction::CleanupRet,
-                     OperandTraits<CleanupReturnInst>::op_end(this) - Values,
-                     Values, InsertAtEnd) {
+    : Instruction(Type::getVoidTy(CleanupPad->getContext()),
+                  Instruction::CleanupRet,
+                  OperandTraits<CleanupReturnInst>::op_end(this) - Values,
+                  Values, InsertAtEnd) {
   init(CleanupPad, UnwindBB);
 }
 
@@ -725,25 +723,25 @@ void CatchReturnInst::init(Value *CatchPad, BasicBlock *BB) {
 }
 
 CatchReturnInst::CatchReturnInst(const CatchReturnInst &CRI)
-    : TerminatorInst(Type::getVoidTy(CRI.getContext()), Instruction::CatchRet,
-                     OperandTraits<CatchReturnInst>::op_begin(this), 2) {
+    : Instruction(Type::getVoidTy(CRI.getContext()), Instruction::CatchRet,
+                  OperandTraits<CatchReturnInst>::op_begin(this), 2) {
   Op<0>() = CRI.Op<0>();
   Op<1>() = CRI.Op<1>();
 }
 
 CatchReturnInst::CatchReturnInst(Value *CatchPad, BasicBlock *BB,
                                  Instruction *InsertBefore)
-    : TerminatorInst(Type::getVoidTy(BB->getContext()), Instruction::CatchRet,
-                     OperandTraits<CatchReturnInst>::op_begin(this), 2,
-                     InsertBefore) {
+    : Instruction(Type::getVoidTy(BB->getContext()), Instruction::CatchRet,
+                  OperandTraits<CatchReturnInst>::op_begin(this), 2,
+                  InsertBefore) {
   init(CatchPad, BB);
 }
 
 CatchReturnInst::CatchReturnInst(Value *CatchPad, BasicBlock *BB,
                                  BasicBlock *InsertAtEnd)
-    : TerminatorInst(Type::getVoidTy(BB->getContext()), Instruction::CatchRet,
-                     OperandTraits<CatchReturnInst>::op_begin(this), 2,
-                     InsertAtEnd) {
+    : Instruction(Type::getVoidTy(BB->getContext()), Instruction::CatchRet,
+                  OperandTraits<CatchReturnInst>::op_begin(this), 2,
+                  InsertAtEnd) {
   init(CatchPad, BB);
 }
 
@@ -755,8 +753,8 @@ CatchSwitchInst::CatchSwitchInst(Value *ParentPad, BasicBlock *UnwindDest,
                                  unsigned NumReservedValues,
                                  const Twine &NameStr,
                                  Instruction *InsertBefore)
-    : TerminatorInst(ParentPad->getType(), Instruction::CatchSwitch, nullptr, 0,
-                     InsertBefore) {
+    : Instruction(ParentPad->getType(), Instruction::CatchSwitch, nullptr, 0,
+                  InsertBefore) {
   if (UnwindDest)
     ++NumReservedValues;
   init(ParentPad, UnwindDest, NumReservedValues + 1);
@@ -766,8 +764,8 @@ CatchSwitchInst::CatchSwitchInst(Value *ParentPad, BasicBlock *UnwindDest,
 CatchSwitchInst::CatchSwitchInst(Value *ParentPad, BasicBlock *UnwindDest,
                                  unsigned NumReservedValues,
                                  const Twine &NameStr, BasicBlock *InsertAtEnd)
-    : TerminatorInst(ParentPad->getType(), Instruction::CatchSwitch, nullptr, 0,
-                     InsertAtEnd) {
+    : Instruction(ParentPad->getType(), Instruction::CatchSwitch, nullptr, 0,
+                  InsertAtEnd) {
   if (UnwindDest)
     ++NumReservedValues;
   init(ParentPad, UnwindDest, NumReservedValues + 1);
@@ -775,8 +773,8 @@ CatchSwitchInst::CatchSwitchInst(Value *ParentPad, BasicBlock *UnwindDest,
 }
 
 CatchSwitchInst::CatchSwitchInst(const CatchSwitchInst &CSI)
-    : TerminatorInst(CSI.getType(), Instruction::CatchSwitch, nullptr,
-                     CSI.getNumOperands()) {
+    : Instruction(CSI.getType(), Instruction::CatchSwitch, nullptr,
+                  CSI.getNumOperands()) {
   init(CSI.getParentPad(), CSI.getUnwindDest(), CSI.getNumOperands());
   setNumHungOffUseOperands(ReservedSpace);
   Use *OL = getOperandList();
@@ -874,13 +872,11 @@ FuncletPadInst::FuncletPadInst(Instruction::FuncletPadOps Op, Value *ParentPad,
 
 UnreachableInst::UnreachableInst(LLVMContext &Context,
                                  Instruction *InsertBefore)
-  : TerminatorInst(Type::getVoidTy(Context), Instruction::Unreachable,
-                   nullptr, 0, InsertBefore) {
-}
+    : Instruction(Type::getVoidTy(Context), Instruction::Unreachable, nullptr,
+                  0, InsertBefore) {}
 UnreachableInst::UnreachableInst(LLVMContext &Context, BasicBlock *InsertAtEnd)
-  : TerminatorInst(Type::getVoidTy(Context), Instruction::Unreachable,
-                   nullptr, 0, InsertAtEnd) {
-}
+    : Instruction(Type::getVoidTy(Context), Instruction::Unreachable, nullptr,
+                  0, InsertAtEnd) {}
 
 //===----------------------------------------------------------------------===//
 //                        BranchInst Implementation
@@ -893,18 +889,18 @@ void BranchInst::AssertOK() {
 }
 
 BranchInst::BranchInst(BasicBlock *IfTrue, Instruction *InsertBefore)
-  : TerminatorInst(Type::getVoidTy(IfTrue->getContext()), Instruction::Br,
-                   OperandTraits<BranchInst>::op_end(this) - 1,
-                   1, InsertBefore) {
+    : Instruction(Type::getVoidTy(IfTrue->getContext()), Instruction::Br,
+                  OperandTraits<BranchInst>::op_end(this) - 1, 1,
+                  InsertBefore) {
   assert(IfTrue && "Branch destination may not be null!");
   Op<-1>() = IfTrue;
 }
 
 BranchInst::BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond,
                        Instruction *InsertBefore)
-  : TerminatorInst(Type::getVoidTy(IfTrue->getContext()), Instruction::Br,
-                   OperandTraits<BranchInst>::op_end(this) - 3,
-                   3, InsertBefore) {
+    : Instruction(Type::getVoidTy(IfTrue->getContext()), Instruction::Br,
+                  OperandTraits<BranchInst>::op_end(this) - 3, 3,
+                  InsertBefore) {
   Op<-1>() = IfTrue;
   Op<-2>() = IfFalse;
   Op<-3>() = Cond;
@@ -914,18 +910,16 @@ BranchInst::BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond,
 }
 
 BranchInst::BranchInst(BasicBlock *IfTrue, BasicBlock *InsertAtEnd)
-  : TerminatorInst(Type::getVoidTy(IfTrue->getContext()), Instruction::Br,
-                   OperandTraits<BranchInst>::op_end(this) - 1,
-                   1, InsertAtEnd) {
+    : Instruction(Type::getVoidTy(IfTrue->getContext()), Instruction::Br,
+                  OperandTraits<BranchInst>::op_end(this) - 1, 1, InsertAtEnd) {
   assert(IfTrue && "Branch destination may not be null!");
   Op<-1>() = IfTrue;
 }
 
 BranchInst::BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond,
-           BasicBlock *InsertAtEnd)
-  : TerminatorInst(Type::getVoidTy(IfTrue->getContext()), Instruction::Br,
-                   OperandTraits<BranchInst>::op_end(this) - 3,
-                   3, InsertAtEnd) {
+                       BasicBlock *InsertAtEnd)
+    : Instruction(Type::getVoidTy(IfTrue->getContext()), Instruction::Br,
+                  OperandTraits<BranchInst>::op_end(this) - 3, 3, InsertAtEnd) {
   Op<-1>() = IfTrue;
   Op<-2>() = IfFalse;
   Op<-3>() = Cond;
@@ -934,10 +928,10 @@ BranchInst::BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond,
 #endif
 }
 
-BranchInst::BranchInst(const BranchInst &BI) :
-  TerminatorInst(Type::getVoidTy(BI.getContext()), Instruction::Br,
-                 OperandTraits<BranchInst>::op_end(this) - BI.getNumOperands(),
-                 BI.getNumOperands()) {
+BranchInst::BranchInst(const BranchInst &BI)
+    : Instruction(Type::getVoidTy(BI.getContext()), Instruction::Br,
+                  OperandTraits<BranchInst>::op_end(this) - BI.getNumOperands(),
+                  BI.getNumOperands()) {
   Op<-1>() = BI.Op<-1>();
   if (BI.getNumOperands() != 1) {
     assert(BI.getNumOperands() == 3 && "BR can have 1 or 3 operands!");
@@ -3567,8 +3561,8 @@ void SwitchInst::init(Value *Value, BasicBlock *Default, unsigned NumReserved) {
 /// constructor can also autoinsert before another instruction.
 SwitchInst::SwitchInst(Value *Value, BasicBlock *Default, unsigned NumCases,
                        Instruction *InsertBefore)
-  : TerminatorInst(Type::getVoidTy(Value->getContext()), Instruction::Switch,
-                   nullptr, 0, InsertBefore) {
+    : Instruction(Type::getVoidTy(Value->getContext()), Instruction::Switch,
+                  nullptr, 0, InsertBefore) {
   init(Value, Default, 2+NumCases*2);
 }
 
@@ -3578,13 +3572,13 @@ SwitchInst::SwitchInst(Value *Value, BasicBlock *Default, unsigned NumCases,
 /// constructor also autoinserts at the end of the specified BasicBlock.
 SwitchInst::SwitchInst(Value *Value, BasicBlock *Default, unsigned NumCases,
                        BasicBlock *InsertAtEnd)
-  : TerminatorInst(Type::getVoidTy(Value->getContext()), Instruction::Switch,
-                   nullptr, 0, InsertAtEnd) {
+    : Instruction(Type::getVoidTy(Value->getContext()), Instruction::Switch,
+                  nullptr, 0, InsertAtEnd) {
   init(Value, Default, 2+NumCases*2);
 }
 
 SwitchInst::SwitchInst(const SwitchInst &SI)
-  : TerminatorInst(SI.getType(), Instruction::Switch, nullptr, 0) {
+    : Instruction(SI.getType(), Instruction::Switch, nullptr, 0) {
   init(SI.getCondition(), SI.getDefaultDest(), SI.getNumOperands());
   setNumHungOffUseOperands(SI.getNumOperands());
   Use *OL = getOperandList();
@@ -3596,7 +3590,6 @@ SwitchInst::SwitchInst(const SwitchInst &SI)
   SubclassOptionalData = SI.SubclassOptionalData;
 }
 
-
 /// addCase - Add an entry to the switch instruction...
 ///
 void SwitchInst::addCase(ConstantInt *OnVal, BasicBlock *Dest) {
@@ -3675,21 +3668,21 @@ void IndirectBrInst::growOperands() {
 
 IndirectBrInst::IndirectBrInst(Value *Address, unsigned NumCases,
                                Instruction *InsertBefore)
-: TerminatorInst(Type::getVoidTy(Address->getContext()),Instruction::IndirectBr,
-                 nullptr, 0, InsertBefore) {
+    : Instruction(Type::getVoidTy(Address->getContext()),
+                  Instruction::IndirectBr, nullptr, 0, InsertBefore) {
   init(Address, NumCases);
 }
 
 IndirectBrInst::IndirectBrInst(Value *Address, unsigned NumCases,
                                BasicBlock *InsertAtEnd)
-: TerminatorInst(Type::getVoidTy(Address->getContext()),Instruction::IndirectBr,
-                 nullptr, 0, InsertAtEnd) {
+    : Instruction(Type::getVoidTy(Address->getContext()),
+                  Instruction::IndirectBr, nullptr, 0, InsertAtEnd) {
   init(Address, NumCases);
 }
 
 IndirectBrInst::IndirectBrInst(const IndirectBrInst &IBI)
-    : TerminatorInst(Type::getVoidTy(IBI.getContext()), Instruction::IndirectBr,
-                     nullptr, IBI.getNumOperands()) {
+    : Instruction(Type::getVoidTy(IBI.getContext()), Instruction::IndirectBr,
+                  nullptr, IBI.getNumOperands()) {
   allocHungoffUses(IBI.getNumOperands());
   Use *OL = getOperandList();
   const Use *InOL = IBI.getOperandList();
-- 
GitLab


From deadb20e811a6441410237b0f8804973190411f8 Mon Sep 17 00:00:00 2001
From: Hsiangkai Wang <hsiangkai@gmail.com>
Date: Fri, 19 Oct 2018 01:52:54 +0000
Subject: [PATCH 0313/1116] [CodeGen] Fix for PR39094.

When using MachineInstr to get SlotIndex, the MI could not be a debug
instruction. mi2iMap does not contain debug instructions in it.

After enabling DBG_LABEL in the generated code, the first instruction in
the bundle may be a debug instruction. In this patch, I use the first
non-debug instruction in the bundle to query SlotIndex in mi2iMap.

Bugzilla report: https://bugs.llvm.org/show_bug.cgi?id=39094

Differential revision: https://reviews.llvm.org/D52927

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344770 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/SlotIndexes.h | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/include/llvm/CodeGen/SlotIndexes.h b/include/llvm/CodeGen/SlotIndexes.h
index 55082222b7a..b6e5088b986 100644
--- a/include/llvm/CodeGen/SlotIndexes.h
+++ b/include/llvm/CodeGen/SlotIndexes.h
@@ -413,10 +413,14 @@ class raw_ostream;
     /// Returns the base index for the given instruction.
     SlotIndex getInstructionIndex(const MachineInstr &MI) const {
       // Instructions inside a bundle have the same number as the bundle itself.
-      const MachineInstr &BundleStart = *getBundleStart(MI.getIterator());
-      assert(!BundleStart.isDebugInstr() &&
+      auto BundleStart = getBundleStart(MI.getIterator());
+      auto BundleEnd = getBundleEnd(MI.getIterator());
+      // Use the first non-debug instruction in the bundle to get SlotIndex.
+      const MachineInstr &BundleNonDebug =
+          *skipDebugInstructionsForward(BundleStart, BundleEnd);
+      assert(!BundleNonDebug.isDebugInstr() &&
              "Could not use a debug instruction to query mi2iMap.");
-      Mi2IndexMap::const_iterator itr = mi2iMap.find(&BundleStart);
+      Mi2IndexMap::const_iterator itr = mi2iMap.find(&BundleNonDebug);
       assert(itr != mi2iMap.end() && "Instruction not found in maps.");
       return itr->second;
     }
-- 
GitLab


From 7d3ea70f0019ed32f818fa9cb68e95b64df1c9b1 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Fri, 19 Oct 2018 06:12:02 +0000
Subject: [PATCH 0314/1116] Use llvm::{all,any,none}_of instead
 std::{all,any,none}_of. NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344774 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/AsmPrinter/DwarfExpression.cpp    |  7 +++---
 lib/Target/AArch64/AArch64InstrInfo.cpp       | 19 ++++++---------
 .../Hexagon/HexagonLoopIdiomRecognition.cpp   |  2 +-
 lib/Transforms/InstCombine/InstCombinePHI.cpp | 23 ++++++++-----------
 lib/Transforms/Scalar/CallSiteSplitting.cpp   |  7 +++---
 tools/llvm-dwarfdump/llvm-dwarfdump.cpp       |  2 +-
 tools/llvm-exegesis/lib/Analysis.cpp          | 10 ++++----
 tools/llvm-exegesis/lib/MCInstrDescView.cpp   |  2 +-
 .../lib/HardwareUnits/ResourceManager.cpp     | 13 +++++------
 utils/TableGen/AsmMatcherEmitter.cpp          |  7 +++---
 10 files changed, 40 insertions(+), 52 deletions(-)

diff --git a/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
index af51d276634..19c350afbf1 100644
--- a/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
@@ -254,10 +254,9 @@ bool DwarfExpression::addMachineRegExpression(const TargetRegisterInfo &TRI,
 
   // Don't emit locations that cannot be expressed without DW_OP_stack_value.
   if (DwarfVersion < 4)
-    if (std::any_of(ExprCursor.begin(), ExprCursor.end(),
-                    [](DIExpression::ExprOperand Op) -> bool {
-                      return Op.getOp() == dwarf::DW_OP_stack_value;
-                    })) {
+    if (any_of(ExprCursor, [](DIExpression::ExprOperand Op) -> bool {
+          return Op.getOp() == dwarf::DW_OP_stack_value;
+        })) {
       DwarfRegs.clear();
       LocationKind = Unknown;
       return false;
diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index f0f5bfa351d..2452d6a0298 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -5084,12 +5084,9 @@ AArch64InstrInfo::getOutliningCandidateInfo(
   unsigned FrameID = MachineOutlinerDefault;
   unsigned NumBytesToCreateFrame = 4;
 
-  bool HasBTI =
-      std::any_of(RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
-                  [](outliner::Candidate &C) {
-                    return C.getMF()->getFunction().hasFnAttribute(
-                        "branch-target-enforcement");
-                  });
+  bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
+    return C.getMF()->getFunction().hasFnAttribute("branch-target-enforcement");
+  });
 
   // If the last instruction in any candidate is a terminator, then we should
   // tail call all of the candidates.
@@ -5124,10 +5121,9 @@ AArch64InstrInfo::getOutliningCandidateInfo(
   // LR is live, so we need to save it. Decide whether it should be saved to
   // the stack, or if it can be saved to a register.
   else {
-    if (std::all_of(RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
-                    [this](outliner::Candidate &C) {
-                      return findRegisterToSaveLRTo(C);
-                    })) {
+    if (all_of(RepeatedSequenceLocs, [this](outliner::Candidate &C) {
+          return findRegisterToSaveLRTo(C);
+        })) {
       // Every candidate has an available callee-saved register for the save.
       // We can save LR to a register.
       FrameID = MachineOutlinerRegSave;
@@ -5195,8 +5191,7 @@ AArch64InstrInfo::getMachineOutlinerMBBFlags(MachineBasicBlock &MBB) const {
   unsigned Flags = 0x0;
   // Check if there's a call inside this MachineBasicBlock. If there is, then
   // set a flag.
-  if (std::any_of(MBB.begin(), MBB.end(),
-                  [](MachineInstr &MI) { return MI.isCall(); }))
+  if (any_of(MBB, [](MachineInstr &MI) { return MI.isCall(); }))
     Flags |= MachineOutlinerMBBFlags::HasCalls;
 
   // Check if LR is available through all of the MBB. If it's not, then set
diff --git a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
index f38992bef69..702d68fad9b 100644
--- a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
+++ b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
@@ -2360,7 +2360,7 @@ bool HexagonLoopIdiomRecognize::runOnLoopBlock(Loop *CurLoop, BasicBlock *BB,
   auto DominatedByBB = [this,BB] (BasicBlock *EB) -> bool {
     return DT->dominates(BB, EB);
   };
-  if (!std::all_of(ExitBlocks.begin(), ExitBlocks.end(), DominatedByBB))
+  if (!all_of(ExitBlocks, DominatedByBB))
     return false;
 
   bool MadeChange = false;
diff --git a/lib/Transforms/InstCombine/InstCombinePHI.cpp b/lib/Transforms/InstCombine/InstCombinePHI.cpp
index 94745094c15..a71ebdcd346 100644
--- a/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ b/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -211,23 +211,20 @@ Instruction *InstCombiner::FoldIntegerTypedPHI(PHINode &PN) {
   }
 
   // If it requires a conversion for every PHI operand, do not do it.
-  if (std::all_of(AvailablePtrVals.begin(), AvailablePtrVals.end(),
-                  [&](Value *V) {
-                    return (V->getType() != IntToPtr->getType()) ||
-                           isa<IntToPtrInst>(V);
-                  }))
+  if (all_of(AvailablePtrVals, [&](Value *V) {
+        return (V->getType() != IntToPtr->getType()) || isa<IntToPtrInst>(V);
+      }))
     return nullptr;
 
   // If any of the operand that requires casting is a terminator
   // instruction, do not do it.
-  if (std::any_of(AvailablePtrVals.begin(), AvailablePtrVals.end(),
-                  [&](Value *V) {
-                    if (V->getType() == IntToPtr->getType())
-                      return false;
-
-                    auto *Inst = dyn_cast<Instruction>(V);
-                    return Inst && Inst->isTerminator();
-                  }))
+  if (any_of(AvailablePtrVals, [&](Value *V) {
+        if (V->getType() == IntToPtr->getType())
+          return false;
+
+        auto *Inst = dyn_cast<Instruction>(V);
+        return Inst && Inst->isTerminator();
+      }))
     return nullptr;
 
   PHINode *NewPtrPHI = PHINode::Create(
diff --git a/lib/Transforms/Scalar/CallSiteSplitting.cpp b/lib/Transforms/Scalar/CallSiteSplitting.cpp
index e82682e08ab..bac6ef99f03 100644
--- a/lib/Transforms/Scalar/CallSiteSplitting.cpp
+++ b/lib/Transforms/Scalar/CallSiteSplitting.cpp
@@ -461,10 +461,9 @@ static bool tryToSplitOnPredicatedArgument(CallSite CS, DominatorTree *DT) {
     PredsCS.push_back({Pred, Conditions});
   }
 
-  if (std::all_of(PredsCS.begin(), PredsCS.end(),
-                  [](const std::pair<BasicBlock *, ConditionsTy> &P) {
-                    return P.second.empty();
-                  }))
+  if (all_of(PredsCS, [](const std::pair<BasicBlock *, ConditionsTy> &P) {
+        return P.second.empty();
+      }))
     return false;
 
   splitCallSite(CS, PredsCS, DT);
diff --git a/tools/llvm-dwarfdump/llvm-dwarfdump.cpp b/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
index 42992641eb7..af21a41a108 100644
--- a/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
+++ b/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
@@ -624,7 +624,7 @@ int main(int argc, char **argv) {
 
   if (Verify) {
     // If we encountered errors during verify, exit with a non-zero exit status.
-    if (!std::all_of(Objects.begin(), Objects.end(), [&](std::string Object) {
+    if (!all_of(Objects, [&](std::string Object) {
           return handleFile(Object, verifyObjectFile, OS);
         }))
       exit(1);
diff --git a/tools/llvm-exegesis/lib/Analysis.cpp b/tools/llvm-exegesis/lib/Analysis.cpp
index eaacb5b1d65..73c54f53225 100644
--- a/tools/llvm-exegesis/lib/Analysis.cpp
+++ b/tools/llvm-exegesis/lib/Analysis.cpp
@@ -657,11 +657,11 @@ llvm::Error Analysis::run<Analysis::PrintSchedClassInconsistencies>(
 
     // Print any scheduling class that has at least one cluster that does not
     // match the checked-in data.
-    if (std::all_of(SchedClassClusters.begin(), SchedClassClusters.end(),
-                    [this, &RSCAndPoints](const SchedClassCluster &C) {
-                      return C.measurementsMatch(*SubtargetInfo_,
-                                                 RSCAndPoints.RSC, Clustering_);
-                    }))
+    if (llvm::all_of(SchedClassClusters,
+                     [this, &RSCAndPoints](const SchedClassCluster &C) {
+                       return C.measurementsMatch(
+                           *SubtargetInfo_, RSCAndPoints.RSC, Clustering_);
+                     }))
       continue; // Nothing weird.
 
     OS << "<div class=\"inconsistency\"><p>Sched Class <span "
diff --git a/tools/llvm-exegesis/lib/MCInstrDescView.cpp b/tools/llvm-exegesis/lib/MCInstrDescView.cpp
index 59f56520efc..6fdb5a68419 100644
--- a/tools/llvm-exegesis/lib/MCInstrDescView.cpp
+++ b/tools/llvm-exegesis/lib/MCInstrDescView.cpp
@@ -174,7 +174,7 @@ const Operand &Instruction::getPrimaryOperand(const Variable &Var) const {
 }
 
 bool Instruction::hasMemoryOperands() const {
-  return std::any_of(Operands.begin(), Operands.end(), [](const Operand &Op) {
+  return any_of(Operands, [](const Operand &Op) {
     return Op.isReg() && Op.isExplicit() && Op.isMemory();
   });
 }
diff --git a/tools/llvm-mca/lib/HardwareUnits/ResourceManager.cpp b/tools/llvm-mca/lib/HardwareUnits/ResourceManager.cpp
index 46a374c2102..bb6ed309c26 100644
--- a/tools/llvm-mca/lib/HardwareUnits/ResourceManager.cpp
+++ b/tools/llvm-mca/lib/HardwareUnits/ResourceManager.cpp
@@ -218,13 +218,12 @@ void ResourceManager::releaseBuffers(ArrayRef<uint64_t> Buffers) {
 }
 
 bool ResourceManager::canBeIssued(const InstrDesc &Desc) const {
-  return std::all_of(Desc.Resources.begin(), Desc.Resources.end(),
-                     [&](const std::pair<uint64_t, const ResourceUsage> &E) {
-                       unsigned NumUnits =
-                           E.second.isReserved() ? 0U : E.second.NumUnits;
-                       unsigned Index = getResourceStateIndex(E.first);
-                       return Resources[Index]->isReady(NumUnits);
-                     });
+  return all_of(
+      Desc.Resources, [&](const std::pair<uint64_t, const ResourceUsage> &E) {
+        unsigned NumUnits = E.second.isReserved() ? 0U : E.second.NumUnits;
+        unsigned Index = getResourceStateIndex(E.first);
+        return Resources[Index]->isReady(NumUnits);
+      });
 }
 
 // Returns true if all resources are in-order, and there is at least one
diff --git a/utils/TableGen/AsmMatcherEmitter.cpp b/utils/TableGen/AsmMatcherEmitter.cpp
index e808661b7a5..5b4229e6468 100644
--- a/utils/TableGen/AsmMatcherEmitter.cpp
+++ b/utils/TableGen/AsmMatcherEmitter.cpp
@@ -2415,10 +2415,9 @@ static void emitOperandMatchErrorDiagStrings(AsmMatcherInfo &Info, raw_ostream &
 static void emitRegisterMatchErrorFunc(AsmMatcherInfo &Info, raw_ostream &OS) {
   OS << "static unsigned getDiagKindFromRegisterClass(MatchClassKind "
         "RegisterClass) {\n";
-  if (std::none_of(Info.Classes.begin(), Info.Classes.end(),
-                   [](const ClassInfo &CI) {
-                     return CI.isRegisterClass() && !CI.DiagnosticType.empty();
-                   })) {
+  if (none_of(Info.Classes, [](const ClassInfo &CI) {
+        return CI.isRegisterClass() && !CI.DiagnosticType.empty();
+      })) {
     OS << "  return MCTargetAsmParser::Match_InvalidOperand;\n";
   } else {
     OS << "  switch (RegisterClass) {\n";
-- 
GitLab


From 5eae65a2bb3c2d8c86a7c911d1bd433ccd14ddd3 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Fri, 19 Oct 2018 06:20:01 +0000
Subject: [PATCH 0315/1116] [pipeliner] Fix test added in rL344748 to require
 asserts

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344775 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/Hexagon/swp-copytophi-dag.ll | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test/CodeGen/Hexagon/swp-copytophi-dag.ll b/test/CodeGen/Hexagon/swp-copytophi-dag.ll
index a239baae141..69743407c14 100644
--- a/test/CodeGen/Hexagon/swp-copytophi-dag.ll
+++ b/test/CodeGen/Hexagon/swp-copytophi-dag.ll
@@ -1,3 +1,5 @@
+; REQUIRES: asserts
+;
 ; RUN: llc -march=hexagon -enable-pipeliner=true -debug-only=pipeliner < %s \
 ; RUN: 2>&1 | FileCheck %s
 
-- 
GitLab


From d6be509b7567ffd83b2616bd308068b85bd275cc Mon Sep 17 00:00:00 2001
From: Clement Courbet <courbet@google.com>
Date: Fri, 19 Oct 2018 09:56:54 +0000
Subject: [PATCH 0316/1116] [llvm-exegesis] X87 RFP setup code.

Summary:
This was lost during refactoring in rL342644.

Fix and simplify simplify value size handling: always go through a 80 bit value,
because the value can be 1 byte). Add unit tests.

Reviewers: gchatelet

Subscribers: tschuett, llvm-commits

Differential Revision: https://reviews.llvm.org/D53423

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344779 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-exegesis/lib/X86/Target.cpp        | 40 +++++++++-----
 .../tools/llvm-exegesis/X86/TargetTest.cpp    | 54 ++++++++++++++++---
 2 files changed, 74 insertions(+), 20 deletions(-)

diff --git a/tools/llvm-exegesis/lib/X86/Target.cpp b/tools/llvm-exegesis/lib/X86/Target.cpp
index 20bb65ebde5..ae5c2e8d25e 100644
--- a/tools/llvm-exegesis/lib/X86/Target.cpp
+++ b/tools/llvm-exegesis/lib/X86/Target.cpp
@@ -182,12 +182,10 @@ struct ConstantInliner {
     return std::move(Instructions);
   }
 
-  std::vector<llvm::MCInst>
-  loadX87AndFinalize(unsigned Reg, unsigned RegBitWidth, unsigned Opcode) {
-    assert((RegBitWidth & 7) == 0 &&
-           "RegBitWidth must be a multiple of 8 bits");
-    initStack(RegBitWidth / 8);
-    add(llvm::MCInstBuilder(Opcode)
+  std::vector<llvm::MCInst> loadX87STAndFinalize(unsigned Reg) {
+    initStack(kF80Bytes);
+    add(llvm::MCInstBuilder(llvm::X86::LD_F80m)
+            // Address = ESP
             .addReg(llvm::X86::RSP) // BaseReg
             .addImm(1)              // ScaleAmt
             .addReg(0)              // IndexReg
@@ -195,7 +193,21 @@ struct ConstantInliner {
             .addReg(0));            // Segment
     if (Reg != llvm::X86::ST0)
       add(llvm::MCInstBuilder(llvm::X86::ST_Frr).addReg(Reg));
-    add(releaseStackSpace(RegBitWidth / 8));
+    add(releaseStackSpace(kF80Bytes));
+    return std::move(Instructions);
+  }
+
+  std::vector<llvm::MCInst> loadX87FPAndFinalize(unsigned Reg) {
+    initStack(kF80Bytes);
+    add(llvm::MCInstBuilder(llvm::X86::LD_Fp80m)
+            .addReg(Reg)
+            // Address = ESP
+            .addReg(llvm::X86::RSP) // BaseReg
+            .addImm(1)              // ScaleAmt
+            .addReg(0)              // IndexReg
+            .addImm(0)              // Disp
+            .addReg(0));            // Segment
+    add(releaseStackSpace(kF80Bytes));
     return std::move(Instructions);
   }
 
@@ -206,6 +218,8 @@ struct ConstantInliner {
   }
 
 private:
+  static constexpr const unsigned kF80Bytes = 10; // 80 bits.
+
   ConstantInliner &add(const llvm::MCInst &Inst) {
     Instructions.push_back(Inst);
     return *this;
@@ -318,12 +332,12 @@ class ExegesisX86Target : public ExegesisTarget {
       if (STI.getFeatureBits()[llvm::X86::FeatureAVX512])
         return CI.loadAndFinalize(Reg, 512, llvm::X86::VMOVDQU32Zrm);
     if (llvm::X86::RSTRegClass.contains(Reg)) {
-      if (Value.getBitWidth() == 32)
-        return CI.loadX87AndFinalize(Reg, 32, llvm::X86::LD_F32m);
-      if (Value.getBitWidth() == 64)
-        return CI.loadX87AndFinalize(Reg, 64, llvm::X86::LD_F64m);
-      if (Value.getBitWidth() == 80)
-        return CI.loadX87AndFinalize(Reg, 80, llvm::X86::LD_F80m);
+      return CI.loadX87STAndFinalize(Reg);
+    }
+    if (llvm::X86::RFP32RegClass.contains(Reg) ||
+        llvm::X86::RFP64RegClass.contains(Reg) ||
+        llvm::X86::RFP80RegClass.contains(Reg)) {
+      return CI.loadX87FPAndFinalize(Reg);
     }
     if (Reg == llvm::X86::EFLAGS)
       return CI.popFlagAndFinalize();
diff --git a/unittests/tools/llvm-exegesis/X86/TargetTest.cpp b/unittests/tools/llvm-exegesis/X86/TargetTest.cpp
index 6e7554c8445..5ada03b2e9e 100644
--- a/unittests/tools/llvm-exegesis/X86/TargetTest.cpp
+++ b/unittests/tools/llvm-exegesis/X86/TargetTest.cpp
@@ -296,12 +296,17 @@ TEST_F(Core2Avx512TargetTest, SetRegToVR512Value) {
            IsStackDeallocate(64)}));
 }
 
+// Note: We always put 80 bits on the stack independently of the size of the
+// value. This uses a bit more space but makes the code simpler.
+
 TEST_F(Core2TargetTest, SetRegToST0_32Bits) {
   EXPECT_THAT(
       setRegTo(llvm::X86::ST0, APInt(32, 0x11112222ULL)),
-      ElementsAre(IsStackAllocate(4),
+      ElementsAre(IsStackAllocate(10),
                   IsMovValueToStack(llvm::X86::MOV32mi, 0x11112222UL, 0),
-                  OpcodeIs(llvm::X86::LD_F32m), IsStackDeallocate(4)));
+                  IsMovValueToStack(llvm::X86::MOV32mi, 0x00000000UL, 4),
+                  IsMovValueToStack(llvm::X86::MOV16mi, 0x0000UL, 8),
+                  OpcodeIs(llvm::X86::LD_F80m), IsStackDeallocate(10)));
 }
 
 TEST_F(Core2TargetTest, SetRegToST1_32Bits) {
@@ -309,19 +314,22 @@ TEST_F(Core2TargetTest, SetRegToST1_32Bits) {
       llvm::MCInstBuilder(llvm::X86::ST_Frr).addReg(llvm::X86::ST1);
   EXPECT_THAT(
       setRegTo(llvm::X86::ST1, APInt(32, 0x11112222ULL)),
-      ElementsAre(IsStackAllocate(4),
+      ElementsAre(IsStackAllocate(10),
                   IsMovValueToStack(llvm::X86::MOV32mi, 0x11112222UL, 0),
-                  OpcodeIs(llvm::X86::LD_F32m), CopySt0ToSt1,
-                  IsStackDeallocate(4)));
+                  IsMovValueToStack(llvm::X86::MOV32mi, 0x00000000UL, 4),
+                  IsMovValueToStack(llvm::X86::MOV16mi, 0x0000UL, 8),
+                  OpcodeIs(llvm::X86::LD_F80m), CopySt0ToSt1,
+                  IsStackDeallocate(10)));
 }
 
 TEST_F(Core2TargetTest, SetRegToST0_64Bits) {
   EXPECT_THAT(
       setRegTo(llvm::X86::ST0, APInt(64, 0x1111222233334444ULL)),
-      ElementsAre(IsStackAllocate(8),
+      ElementsAre(IsStackAllocate(10),
                   IsMovValueToStack(llvm::X86::MOV32mi, 0x33334444UL, 0),
                   IsMovValueToStack(llvm::X86::MOV32mi, 0x11112222UL, 4),
-                  OpcodeIs(llvm::X86::LD_F64m), IsStackDeallocate(8)));
+                  IsMovValueToStack(llvm::X86::MOV16mi, 0x0000UL, 8),
+                  OpcodeIs(llvm::X86::LD_F80m), IsStackDeallocate(10)));
 }
 
 TEST_F(Core2TargetTest, SetRegToST0_80Bits) {
@@ -334,5 +342,37 @@ TEST_F(Core2TargetTest, SetRegToST0_80Bits) {
                   OpcodeIs(llvm::X86::LD_F80m), IsStackDeallocate(10)));
 }
 
+TEST_F(Core2TargetTest, SetRegToFP0_80Bits) {
+  EXPECT_THAT(
+      setRegTo(llvm::X86::FP0, APInt(80, "11112222333344445555", 16)),
+      ElementsAre(IsStackAllocate(10),
+                  IsMovValueToStack(llvm::X86::MOV32mi, 0x44445555UL, 0),
+                  IsMovValueToStack(llvm::X86::MOV32mi, 0x22223333UL, 4),
+                  IsMovValueToStack(llvm::X86::MOV16mi, 0x1111UL, 8),
+                  OpcodeIs(llvm::X86::LD_Fp80m), IsStackDeallocate(10)));
+}
+
+TEST_F(Core2TargetTest, SetRegToFP1_32Bits) {
+  EXPECT_THAT(
+      setRegTo(llvm::X86::FP1, APInt(32, 0x11112222ULL)),
+      ElementsAre(IsStackAllocate(10),
+                  IsMovValueToStack(llvm::X86::MOV32mi, 0x11112222UL, 0),
+                  IsMovValueToStack(llvm::X86::MOV32mi, 0x00000000UL, 4),
+                  IsMovValueToStack(llvm::X86::MOV16mi, 0x0000UL, 8),
+                  OpcodeIs(llvm::X86::LD_Fp80m),
+                  IsStackDeallocate(10)));
+}
+
+TEST_F(Core2TargetTest, SetRegToFP1_4Bits) {
+  EXPECT_THAT(
+      setRegTo(llvm::X86::FP1, APInt(4, 0x1ULL)),
+      ElementsAre(IsStackAllocate(10),
+                  IsMovValueToStack(llvm::X86::MOV32mi, 0x00000001UL, 0),
+                  IsMovValueToStack(llvm::X86::MOV32mi, 0x00000000UL, 4),
+                  IsMovValueToStack(llvm::X86::MOV16mi, 0x0000UL, 8),
+                  OpcodeIs(llvm::X86::LD_Fp80m),
+                  IsStackDeallocate(10)));
+}
+
 } // namespace
 } // namespace exegesis
-- 
GitLab


From cbce2985b72452ca55a84487a20bb9a9763c3118 Mon Sep 17 00:00:00 2001
From: Clement Courbet <courbet@google.com>
Date: Fri, 19 Oct 2018 12:08:05 +0000
Subject: [PATCH 0317/1116] [llvm-exegesis] Re-enable liveliness tracker.

Reviewers: gchatelet

Subscribers: tschuett, llvm-commits

Differential Revision: https://reviews.llvm.org/D53429

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344780 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-exegesis/lib/Assembler.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/llvm-exegesis/lib/Assembler.cpp b/tools/llvm-exegesis/lib/Assembler.cpp
index 2b67682cde7..527b10146a3 100644
--- a/tools/llvm-exegesis/lib/Assembler.cpp
+++ b/tools/llvm-exegesis/lib/Assembler.cpp
@@ -33,6 +33,7 @@ generateSnippetSetupCode(const ExegesisTarget &ET,
                          const llvm::MCSubtargetInfo *const MSI,
                          llvm::ArrayRef<RegisterValue> RegisterInitialValues,
                          bool &IsSnippetSetupComplete) {
+  IsSnippetSetupComplete = true;
   std::vector<llvm::MCInst> Result;
   for (const RegisterValue &RV : RegisterInitialValues) {
     // Load a constant in the register.
@@ -170,7 +171,7 @@ void assembleToStream(const ExegesisTarget &ET,
   for (const unsigned Reg : LiveIns)
     MF.getRegInfo().addLiveIn(Reg);
 
-  bool IsSnippetSetupComplete = false;
+  bool IsSnippetSetupComplete;
   std::vector<llvm::MCInst> Code =
       generateSnippetSetupCode(ET, TM->getMCSubtargetInfo(),
                                RegisterInitialValues, IsSnippetSetupComplete);
-- 
GitLab


From ddc35926a19e8ec4141629c813fa5f29289c852a Mon Sep 17 00:00:00 2001
From: Kristina Brooks <kristina@nym.hush.com>
Date: Fri, 19 Oct 2018 12:14:30 +0000
Subject: [PATCH 0318/1116] [MC][DWARF][AsmParser] Ensure nested CFI frames are
 diagnosed.

This avoids a crash (with asserts) or bad codegen (without asserts)
in Dwarf streamer later on. This patch fixes this condition in
MCStreamer and propogates SMLoc down when it's available with an
added bonus of source locations for those specific types of errors.

Further patches could use similar improvements as currently most
non-Windows CFI directives lack an SMLoc parameter.

Modified an existing test to verify source location propogation and
added an object-file version of it to verify that it does not crash in
addition to a standalone test to only ensure it does not crash.

Differential Revision: https://reviews.llvm.org/D51695


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344781 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/MC/MCStreamer.h                |  2 +-
 lib/MC/MCParser/AsmParser.cpp               |  9 +++++++--
 lib/MC/MCStreamer.cpp                       |  6 +++---
 test/MC/X86/cfi-open-within-another-crash.s | 18 ++++++++++++++++++
 test/MC/X86/cfi-scope-errors.s              | 15 +++++++++++----
 5 files changed, 40 insertions(+), 10 deletions(-)
 create mode 100644 test/MC/X86/cfi-open-within-another-crash.s

diff --git a/include/llvm/MC/MCStreamer.h b/include/llvm/MC/MCStreamer.h
index 91fb4e537b4..2e9a9d61c67 100644
--- a/include/llvm/MC/MCStreamer.h
+++ b/include/llvm/MC/MCStreamer.h
@@ -870,7 +870,7 @@ public:
 
   virtual MCSymbol *getDwarfLineTableSymbol(unsigned CUID);
   virtual void EmitCFISections(bool EH, bool Debug);
-  void EmitCFIStartProc(bool IsSimple);
+  void EmitCFIStartProc(bool IsSimple, SMLoc Loc = SMLoc());
   void EmitCFIEndProc();
   virtual void EmitCFIDefCfa(int64_t Register, int64_t Offset);
   virtual void EmitCFIDefCfaOffset(int64_t Offset);
diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp
index 6eb7fd0d0b6..529f16525fe 100644
--- a/lib/MC/MCParser/AsmParser.cpp
+++ b/lib/MC/MCParser/AsmParser.cpp
@@ -3919,8 +3919,13 @@ bool AsmParser::parseDirectiveCFIStartProc() {
         parseToken(AsmToken::EndOfStatement))
       return addErrorSuffix(" in '.cfi_startproc' directive");
   }
-
-  getStreamer().EmitCFIStartProc(!Simple.empty());
+  
+  // TODO(kristina): Deal with a corner case of incorrect diagnostic context
+  // being produced if this directive is emitted as part of preprocessor macro
+  // expansion which can *ONLY* happen if Clang's cc1as is the API consumer.
+  // Tools like llvm-mc on the other hand are not affected by it, and report
+  // correct context information.
+  getStreamer().EmitCFIStartProc(!Simple.empty(), Lexer.getLoc());
   return false;
 }
 
diff --git a/lib/MC/MCStreamer.cpp b/lib/MC/MCStreamer.cpp
index fa0d1f46cbb..bfcf6d47a78 100644
--- a/lib/MC/MCStreamer.cpp
+++ b/lib/MC/MCStreamer.cpp
@@ -347,10 +347,10 @@ void MCStreamer::EmitCFISections(bool EH, bool Debug) {
   assert(EH || Debug);
 }
 
-void MCStreamer::EmitCFIStartProc(bool IsSimple) {
+void MCStreamer::EmitCFIStartProc(bool IsSimple, SMLoc Loc) {
   if (hasUnfinishedDwarfFrameInfo())
-    getContext().reportError(
-        SMLoc(), "starting new .cfi frame before finishing the previous one");
+    return getContext().reportError(
+        Loc, "starting new .cfi frame before finishing the previous one");
 
   MCDwarfFrameInfo Frame;
   Frame.IsSimple = IsSimple;
diff --git a/test/MC/X86/cfi-open-within-another-crash.s b/test/MC/X86/cfi-open-within-another-crash.s
new file mode 100644
index 00000000000..81627f4459c
--- /dev/null
+++ b/test/MC/X86/cfi-open-within-another-crash.s
@@ -0,0 +1,18 @@
+# Test for D51695 ensuring there is no crash when two .cfi_startproc are opened
+# without the first one being closed.
+
+# RUN: not llvm-mc %s -filetype=obj -triple=x86_64-unknown-linux -o /dev/null 2>&1 | FileCheck %s
+
+.text
+.globl proc_one
+proc_one:
+ .cfi_startproc
+ 
+.text
+.globl proc_two
+proc_two:
+ .cfi_startproc
+ 
+ .cfi_endproc
+
+# CHECK: error: starting new .cfi frame before finishing the previous one
diff --git a/test/MC/X86/cfi-scope-errors.s b/test/MC/X86/cfi-scope-errors.s
index a61f817f741..a7d6a8a157a 100644
--- a/test/MC/X86/cfi-scope-errors.s
+++ b/test/MC/X86/cfi-scope-errors.s
@@ -1,6 +1,5 @@
-# RUN: not llvm-mc %s -triple x86_64-linux -o /dev/null 2>&1 | FileCheck %s --implicit-check-not=error:
-
-# FIXME: Push source locations into diagnostics.
+# RUN: not llvm-mc %s -triple x86_64-linux -o /dev/null 2>&1 | FileCheck %s
+# RUN: not llvm-mc %s -triple x86_64-linux -filetype=obj -o /dev/null 2>&1 | FileCheck %s
 
 .text
 .cfi_def_cfa rsp, 8
@@ -9,8 +8,16 @@
 .cfi_startproc
 nop
 
+# TODO(kristina): As Reid suggested, this now supports source locations as a side effect
+# of another patch aimed at fixing the crash that would occur here, however the other
+# ones do not unfortunately. Will address it in a further patch propogating SMLoc down to
+# other CFI directives at which point more LINE checks can be added to ensure proper source
+# location reporting.
+
+# This tests source location correctness as well as the error and it not crashing.
+# CHECK: [[@LINE+2]]:1: error: starting new .cfi frame before finishing the previous one
 .cfi_startproc
-# CHECK: error: starting new .cfi frame before finishing the previous one
+
 nop
 .cfi_endproc
 
-- 
GitLab


From 0624b8fbe06bc69928a056f5449fd2e15ca2a4bc Mon Sep 17 00:00:00 2001
From: Clement Courbet <courbet@google.com>
Date: Fri, 19 Oct 2018 12:24:49 +0000
Subject: [PATCH 0319/1116] [llvm-exegesis] Mark second-form X87 instructions
 as unsupported.

Summary:
We only support the first form because we rely on information that is
only available there.

Reviewers: gchatelet

Subscribers: tschuett, llvm-commits

Differential Revision: https://reviews.llvm.org/D53430

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344782 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-exegesis/lib/X86/Target.cpp | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/tools/llvm-exegesis/lib/X86/Target.cpp b/tools/llvm-exegesis/lib/X86/Target.cpp
index ae5c2e8d25e..1a7d290b1af 100644
--- a/tools/llvm-exegesis/lib/X86/Target.cpp
+++ b/tools/llvm-exegesis/lib/X86/Target.cpp
@@ -26,7 +26,14 @@ static llvm::Error IsInvalidOpcode(const Instruction &Instr) {
   if (OpcodeName.startswith("POPF") || OpcodeName.startswith("PUSHF") ||
       OpcodeName.startswith("ADJCALLSTACK"))
     return llvm::make_error<BenchmarkFailure>(
-        "Unsupported opcode: Push/Pop/AdjCallStack");
+        "unsupported opcode: Push/Pop/AdjCallStack");
+  // We do not handle second-form X87 instructions. We only handle first-form
+  // ones (_Fp), see comment in X86InstrFPStack.td.
+  for (const Operand &Op : Instr.Operands)
+    if (Op.isReg() && Op.isExplicit() &&
+        Op.getExplicitOperandInfo().RegClass == llvm::X86::RSTRegClassID)
+      return llvm::make_error<BenchmarkFailure>(
+          "unsupported second-form X87 instruction");
   return llvm::Error::success();
 }
 
-- 
GitLab


From 42984f10a2371cc00a5aef7464556b1f91eea259 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Fri, 19 Oct 2018 17:26:22 +0000
Subject: [PATCH 0320/1116] [InstCombine] move/add tests for sub/neg; NFC

These should all be handled using "dyn_castNegVal",
but that misses vectors with undef elements.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344790 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Transforms/InstCombine/sub.ll  | 227 +++++++++++++++++++++-------
 test/Transforms/InstSimplify/sub.ll |  53 +++++++
 2 files changed, 228 insertions(+), 52 deletions(-)
 create mode 100644 test/Transforms/InstSimplify/sub.ll

diff --git a/test/Transforms/InstCombine/sub.ll b/test/Transforms/InstCombine/sub.ll
index 8a568602f2b..299633b25ac 100644
--- a/test/Transforms/InstCombine/sub.ll
+++ b/test/Transforms/InstCombine/sub.ll
@@ -1,43 +1,175 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
 target datalayout = "e-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 
-; Optimize subtracts.
+define i32 @sub_constant(i32 %x) {
+; CHECK-LABEL: @sub_constant(
+; CHECK-NEXT:    [[R:%.*]] = add i32 [[X:%.*]], -42
+; CHECK-NEXT:    ret i32 [[R]]
 ;
-; RUN: opt < %s -instcombine -S | FileCheck %s
+  %r = sub i32 %x, 42
+  ret i32 %r
+}
+
+@g = global i32 0
 
-define i32 @test1(i32 %A) {
-; CHECK-LABEL: @test1(
-; CHECK-NEXT:    ret i32 0
+define i32 @sub_constant_expression(i32 %x) {
+; CHECK-LABEL: @sub_constant_expression(
+; CHECK-NEXT:    [[R:%.*]] = sub i32 [[X:%.*]], ptrtoint (i32* @g to i32)
+; CHECK-NEXT:    ret i32 [[R]]
 ;
-  %B = sub i32 %A, %A
-  ret i32 %B
+  %r = sub i32 %x, ptrtoint (i32* @g to i32)
+  ret i32 %r
 }
 
-define i32 @test2(i32 %A) {
-; CHECK-LABEL: @test2(
-; CHECK-NEXT:    ret i32 [[A:%.*]]
+define <2 x i32> @sub_constant_vec(<2 x i32> %x) {
+; CHECK-LABEL: @sub_constant_vec(
+; CHECK-NEXT:    [[R:%.*]] = add <2 x i32> [[X:%.*]], <i32 -42, i32 12>
+; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
-  %B = sub i32 %A, 0
-  ret i32 %B
+  %r = sub <2 x i32> %x, <i32 42, i32 -12>
+  ret <2 x i32> %r
 }
 
-define i32 @test3(i32 %A) {
-; CHECK-LABEL: @test3(
-; CHECK-NEXT:    ret i32 [[A:%.*]]
+define <3 x i33> @sub_constant_vec_weird_type(<3 x i33> %x) {
+; CHECK-LABEL: @sub_constant_vec_weird_type(
+; CHECK-NEXT:    [[R:%.*]] = add <3 x i33> [[X:%.*]], <i33 42, i33 -42, i33 12>
+; CHECK-NEXT:    ret <3 x i33> [[R]]
 ;
-  %B = sub i32 0, %A
-  %C = sub i32 0, %B
-  ret i32 %C
+  %r = sub <3 x i33> %x, <i33 -42, i33 42, i33 -12>
+  ret <3 x i33> %r
 }
 
-define i32 @test4(i32 %A, i32 %x) {
-; CHECK-LABEL: @test4(
-; CHECK-NEXT:    [[C:%.*]] = add i32 [[X:%.*]], [[A:%.*]]
-; CHECK-NEXT:    ret i32 [[C]]
+define <4 x i32> @sub_constant_expression_vec(<4 x i32> %x) {
+; CHECK-LABEL: @sub_constant_expression_vec(
+; CHECK-NEXT:    [[R:%.*]] = sub <4 x i32> [[X:%.*]], bitcast (i128 ptrtoint (i32* @g to i128) to <4 x i32>)
+; CHECK-NEXT:    ret <4 x i32> [[R]]
 ;
-  %B = sub i32 0, %A
-  %C = sub i32 %x, %B
-  ret i32 %C
+  %r = sub <4 x i32> %x, bitcast (i128 ptrtoint (i32* @g to i128) to <4 x i32>)
+  ret <4 x i32> %r
+}
+
+define i32 @neg_sub(i32 %x, i32 %y) {
+; CHECK-LABEL: @neg_sub(
+; CHECK-NEXT:    [[R:%.*]] = add i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %neg = sub i32 0, %x
+  %r = sub i32 %y, %neg
+  ret i32 %r
+}
+
+define i32 @neg_nsw_sub(i32 %x, i32 %y) {
+; CHECK-LABEL: @neg_nsw_sub(
+; CHECK-NEXT:    [[R:%.*]] = add i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %neg = sub nsw i32 0, %x
+  %r = sub i32 %y, %neg
+  ret i32 %r
+}
+
+define i32 @neg_sub_nsw(i32 %x, i32 %y) {
+; CHECK-LABEL: @neg_sub_nsw(
+; CHECK-NEXT:    [[R:%.*]] = add i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %neg = sub i32 0, %x
+  %r = sub nsw i32 %y, %neg
+  ret i32 %r
+}
+
+define i32 @neg_nsw_sub_nsw(i32 %x, i32 %y) {
+; CHECK-LABEL: @neg_nsw_sub_nsw(
+; CHECK-NEXT:    [[R:%.*]] = add nsw i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %neg = sub nsw i32 0, %x
+  %r = sub nsw i32 %y, %neg
+  ret i32 %r
+}
+
+define <2 x i32> @neg_sub_vec(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: @neg_sub_vec(
+; CHECK-NEXT:    [[R:%.*]] = add <2 x i32> [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    ret <2 x i32> [[R]]
+;
+  %neg = sub <2 x i32> zeroinitializer, %x
+  %r = sub <2 x i32> %y, %neg
+  ret <2 x i32> %r
+}
+
+define <2 x i32> @neg_nsw_sub_vec(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: @neg_nsw_sub_vec(
+; CHECK-NEXT:    [[R:%.*]] = add <2 x i32> [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    ret <2 x i32> [[R]]
+;
+  %neg = sub nsw <2 x i32> zeroinitializer, %x
+  %r = sub <2 x i32> %y, %neg
+  ret <2 x i32> %r
+}
+
+define <2 x i32> @neg_sub_nsw_vec(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: @neg_sub_nsw_vec(
+; CHECK-NEXT:    [[R:%.*]] = add <2 x i32> [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    ret <2 x i32> [[R]]
+;
+  %neg = sub <2 x i32> zeroinitializer, %x
+  %r = sub nsw <2 x i32> %y, %neg
+  ret <2 x i32> %r
+}
+
+define <2 x i32> @neg_nsw_sub_nsw_vec(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: @neg_nsw_sub_nsw_vec(
+; CHECK-NEXT:    [[R:%.*]] = add nsw <2 x i32> [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    ret <2 x i32> [[R]]
+;
+  %neg = sub nsw <2 x i32> zeroinitializer, %x
+  %r = sub nsw <2 x i32> %y, %neg
+  ret <2 x i32> %r
+}
+
+define <2 x i32> @neg_sub_vec_undef(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: @neg_sub_vec_undef(
+; CHECK-NEXT:    [[R:%.*]] = add <2 x i32> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret <2 x i32> [[R]]
+;
+  %neg = sub <2 x i32> <i32 0, i32 undef>, %x
+  %r = sub <2 x i32> %y, %neg
+  ret <2 x i32> %r
+}
+
+define <2 x i32> @neg_nsw_sub_vec_undef(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: @neg_nsw_sub_vec_undef(
+; CHECK-NEXT:    [[R:%.*]] = add <2 x i32> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret <2 x i32> [[R]]
+;
+  %neg = sub nsw <2 x i32> <i32 undef, i32 0>, %x
+  %r = sub <2 x i32> %y, %neg
+  ret <2 x i32> %r
+}
+
+define <2 x i32> @neg_sub_nsw_vec_undef(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: @neg_sub_nsw_vec_undef(
+; CHECK-NEXT:    [[R:%.*]] = add <2 x i32> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret <2 x i32> [[R]]
+;
+  %neg = sub <2 x i32> <i32 undef, i32 0>, %x
+  %r = sub nsw <2 x i32> %y, %neg
+  ret <2 x i32> %r
+}
+
+; TODO: This should not drop 'nsw'.
+
+define <2 x i32> @neg_nsw_sub_nsw_vec_undef(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: @neg_nsw_sub_nsw_vec_undef(
+; CHECK-NEXT:    [[R:%.*]] = add <2 x i32> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret <2 x i32> [[R]]
+;
+  %neg = sub nsw <2 x i32> <i32 0, i32 undef>, %x
+  %r = sub nsw <2 x i32> %y, %neg
+  ret <2 x i32> %r
 }
 
 ; (~X) - (~Y) --> Y - X
@@ -499,7 +631,7 @@ define <2 x i32> @test27commutedvecmixed(<2 x i32> %x, <2 x i32> %y) {
 
 define i32 @test28(i32 %x, i32 %y, i32 %z) {
 ; CHECK-LABEL: @test28(
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[Z:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[Y:%.*]], [[Z:%.*]]
 ; CHECK-NEXT:    [[SUB:%.*]] = add i32 [[TMP1]], [[X:%.*]]
 ; CHECK-NEXT:    ret i32 [[SUB]]
 ;
@@ -626,16 +758,6 @@ define i32 @test38(i32 %A) {
   ret i32 %sub
 }
 
-define i32 @test39(i32 %A, i32 %x) {
-; CHECK-LABEL: @test39(
-; CHECK-NEXT:    [[C:%.*]] = add i32 [[X:%.*]], [[A:%.*]]
-; CHECK-NEXT:    ret i32 [[C]]
-;
-  %B = sub i32 0, %A
-  %C = sub nsw i32 %x, %B
-  ret i32 %C
-}
-
 define i16 @test40(i16 %a, i16 %b) {
 ; CHECK-LABEL: @test40(
 ; CHECK-NEXT:    [[ASHR:%.*]] = ashr i16 [[A:%.*]], 1
@@ -921,7 +1043,8 @@ define i32 @test56(i32 %A, i32 %B) {
 ;
   %X = add i32 %A, %B
   %Y = sub i32 %A, %X
-  ret i32 %Y                                                                                                                                                                                                                                             }
+  ret i32 %Y
+}
 
 define i32 @test57(i32 %A, i32 %B) {
 ; CHECK-LABEL: @test57(
@@ -930,22 +1053,22 @@ define i32 @test57(i32 %A, i32 %B) {
 ;
   %X = add i32 %B, %A
   %Y = sub i32 %A, %X
-  ret i32 %Y                                                                                                                                                                                                                                             }
+  ret i32 %Y
+}
 
 @dummy_global1 = external global i8*
 @dummy_global2 = external global i8*
 
 define i64 @test58([100 x [100 x i8]]* %foo, i64 %i, i64 %j) {
-; CHECK-LABEL: @test58(
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[J:%.*]], 4200
-; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[I:%.*]], 4200
-; CHECK-NEXT:    [[TMP3:%.*]] = sub i64 [[TMP2:%.*]] [[TMP1:%.*]]
-; CHECK-NEXT:    ret i64 [[TMP3]]
-;
 ; Note the reassociate pass and another instcombine pass will further optimize this to
 ; "%sub = i64 %i, %j, ret i64 %sub"
-;
 ; gep1 and gep2 have only one use
+; CHECK-LABEL: @test58(
+; CHECK-NEXT:    [[GEP2_OFFS:%.*]] = add i64 [[J:%.*]], 4200
+; CHECK-NEXT:    [[GEP1_OFFS:%.*]] = add i64 [[I:%.*]], 4200
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 [[GEP1_OFFS]], [[GEP2_OFFS]]
+; CHECK-NEXT:    ret i64 [[TMP1]]
+;
   %gep1 = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* %foo, i64 0, i64 42, i64 %i
   %gep2 = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* %foo, i64 0, i64 42, i64 %j
   %cast1 = ptrtoint i8* %gep1 to i64
@@ -956,11 +1079,11 @@ define i64 @test58([100 x [100 x i8]]* %foo, i64 %i, i64 %j) {
 
 define i64 @test59([100 x [100 x i8]]* %foo, i64 %i) {
 ; CHECK-LABEL: @test59(
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* %foo, i64 0, i64 42, i64 %i
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* %foo, i64 0, i64 42, i64 0
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* [[FOO:%.*]], i64 0, i64 42, i64 [[I:%.*]]
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* [[FOO]], i64 0, i64 42, i64 0
 ; CHECK-NEXT:    store i8* [[GEP1]], i8** @dummy_global1, align 8
 ; CHECK-NEXT:    store i8* [[GEP2]], i8** @dummy_global2, align 8
-; CHECK-NEXT:    ret i64 %i
+; CHECK-NEXT:    ret i64 [[I]]
 ;
 ; gep1 and gep2 have more than one uses
   %gep1 = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* %foo, i64 0, i64 42, i64 %i
@@ -975,8 +1098,8 @@ define i64 @test59([100 x [100 x i8]]* %foo, i64 %i) {
 
 define i64 @test60([100 x [100 x i8]]* %foo, i64 %i, i64 %j) {
 ; CHECK-LABEL: @test60(
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* %foo, i64 0, i64 %j, i64 %i
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* %foo, i64 0, i64 42, i64 0
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* [[FOO:%.*]], i64 0, i64 [[J:%.*]], i64 [[I:%.*]]
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* [[FOO]], i64 0, i64 42, i64 0
 ; CHECK-NEXT:    [[CAST1:%.*]] = ptrtoint i8* [[GEP1]] to i64
 ; CHECK-NEXT:    [[CAST2:%.*]] = ptrtoint i8* [[GEP2]] to i64
 ; CHECK-NEXT:    [[SUB:%.*]] = sub i64 [[CAST1]], [[CAST2]]
@@ -995,8 +1118,8 @@ define i64 @test60([100 x [100 x i8]]* %foo, i64 %i, i64 %j) {
 
 define i64 @test61([100 x [100 x i8]]* %foo, i64 %i, i64 %j) {
 ; CHECK-LABEL: @test61(
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* %foo, i64 0, i64 42, i64 0
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* %foo, i64 0, i64 %j, i64 %i
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* [[FOO:%.*]], i64 0, i64 42, i64 0
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* [[FOO]], i64 0, i64 [[J:%.*]], i64 [[I:%.*]]
 ; CHECK-NEXT:    [[CAST1:%.*]] = ptrtoint i8* [[GEP1]] to i64
 ; CHECK-NEXT:    [[CAST2:%.*]] = ptrtoint i8* [[GEP2]] to i64
 ; CHECK-NEXT:    [[SUB:%.*]] = sub i64 [[CAST1]], [[CAST2]]
diff --git a/test/Transforms/InstSimplify/sub.ll b/test/Transforms/InstSimplify/sub.ll
new file mode 100644
index 00000000000..4e2064527c4
--- /dev/null
+++ b/test/Transforms/InstSimplify/sub.ll
@@ -0,0 +1,53 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instsimplify -S | FileCheck %s
+
+define i32 @sub_self(i32 %A) {
+; CHECK-LABEL: @sub_self(
+; CHECK-NEXT:    ret i32 0
+;
+  %B = sub i32 %A, %A
+  ret i32 %B
+}
+
+define <2 x i32> @sub_self_vec(<2 x i32> %A) {
+; CHECK-LABEL: @sub_self_vec(
+; CHECK-NEXT:    ret <2 x i32> zeroinitializer
+;
+  %B = sub <2 x i32> %A, %A
+  ret <2 x i32> %B
+}
+
+define i32 @sub_zero(i32 %A) {
+; CHECK-LABEL: @sub_zero(
+; CHECK-NEXT:    ret i32 [[A:%.*]]
+;
+  %B = sub i32 %A, 0
+  ret i32 %B
+}
+
+define <2 x i32> @sub_zero_vec(<2 x i32> %A) {
+; CHECK-LABEL: @sub_zero_vec(
+; CHECK-NEXT:    ret <2 x i32> [[A:%.*]]
+;
+  %B = sub <2 x i32> %A, <i32 0, i32 undef>
+  ret <2 x i32> %B
+}
+
+define i32 @neg_neg(i32 %A) {
+; CHECK-LABEL: @neg_neg(
+; CHECK-NEXT:    ret i32 [[A:%.*]]
+;
+  %B = sub i32 0, %A
+  %C = sub i32 0, %B
+  ret i32 %C
+}
+
+define <2 x i32> @neg_neg_vec(<2 x i32> %A) {
+; CHECK-LABEL: @neg_neg_vec(
+; CHECK-NEXT:    ret <2 x i32> [[A:%.*]]
+;
+  %B = sub <2 x i32> <i32 0, i32 undef>, %A
+  %C = sub <2 x i32> <i32 0, i32 undef>, %B
+  ret <2 x i32> %C
+}
+
-- 
GitLab


From 012966636778f5053198a33b442a6779542398c5 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <kparzysz@codeaurora.org>
Date: Fri, 19 Oct 2018 17:31:11 +0000
Subject: [PATCH 0321/1116] [Hexagon] Remove support for V4

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344791 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/Hexagon/CMakeLists.txt             |   1 -
 lib/Target/Hexagon/Hexagon.td                 |  18 +-
 lib/Target/Hexagon/HexagonCopyToCombine.cpp   |   3 +-
 lib/Target/Hexagon/HexagonDepArch.h           |   2 +-
 lib/Target/Hexagon/HexagonDepArch.td          |   3 -
 lib/Target/Hexagon/HexagonDepInstrInfo.td     | 140 ++--
 lib/Target/Hexagon/HexagonISelLowering.cpp    | 151 +---
 lib/Target/Hexagon/HexagonInstrFormats.td     |  16 +-
 ...rFormatsV4.td => HexagonInstrFormatsV5.td} |   4 +-
 lib/Target/Hexagon/HexagonIntrinsics.td       |   2 -
 lib/Target/Hexagon/HexagonIntrinsicsV3.td     |  27 -
 lib/Target/Hexagon/HexagonIntrinsicsV4.td     | 305 --------
 lib/Target/Hexagon/HexagonIntrinsicsV5.td     | 308 +++++++-
 lib/Target/Hexagon/HexagonPatterns.td         | 226 +++---
 lib/Target/Hexagon/HexagonRegisterInfo.cpp    |  13 +-
 lib/Target/Hexagon/HexagonSchedule.td         |   4 +-
 ...agonScheduleV4.td => HexagonScheduleV5.td} |  22 +-
 lib/Target/Hexagon/HexagonSubtarget.cpp       |   1 -
 lib/Target/Hexagon/HexagonSubtarget.h         |   6 +-
 lib/Target/Hexagon/HexagonVLIWPacketizer.cpp  |  31 +-
 lib/Target/Hexagon/HexagonVLIWPacketizer.h    |   2 +-
 .../MCTargetDesc/HexagonMCDuplexInfo.cpp      |   3 +-
 .../MCTargetDesc/HexagonMCTargetDesc.cpp      |  36 +-
 test/CodeGen/Hexagon/cfi-late.ll              |   4 +-
 test/CodeGen/Hexagon/double.ll                |  36 +-
 test/CodeGen/Hexagon/float.ll                 |  40 +-
 .../Hexagon/floatconvert-ieee-rnd-near.ll     |  40 +-
 test/CodeGen/Hexagon/gp-plus-offset-load.ll   |  72 +-
 test/CodeGen/Hexagon/gp-plus-offset-store.ll  |  45 +-
 test/CodeGen/Hexagon/gp-rel.ll                |  55 +-
 test/CodeGen/Hexagon/hwloop-cleanup.ll        | 106 +--
 test/CodeGen/Hexagon/hwloop-const.ll          |  40 +-
 test/CodeGen/Hexagon/hwloop-dbg.ll            | 105 +--
 test/CodeGen/Hexagon/hwloop-le.ll             | 694 +++++++++---------
 test/CodeGen/Hexagon/hwloop-ne.ll             | 694 +++++++++---------
 test/CodeGen/Hexagon/i16_VarArg.ll            |  68 +-
 test/CodeGen/Hexagon/i1_VarArg.ll             |  76 +-
 test/CodeGen/Hexagon/i8_VarArg.ll             |  68 +-
 test/CodeGen/Hexagon/macint.ll                |  15 +-
 test/CodeGen/Hexagon/misaligned-access.ll     |  27 +-
 test/CodeGen/Hexagon/mpy.ll                   |  32 +-
 test/CodeGen/Hexagon/newvaluejump.ll          |  57 +-
 test/CodeGen/Hexagon/packetize_cond_inst.ll   |  33 +-
 test/CodeGen/Hexagon/postinc-load.ll          |  43 +-
 test/CodeGen/Hexagon/postinc-store.ll         |  43 +-
 test/CodeGen/Hexagon/pred-gp.ll               |  42 +-
 test/CodeGen/Hexagon/pred-instrs.ll           |  44 +-
 test/CodeGen/Hexagon/predicate-copy.ll        |   8 +-
 test/CodeGen/Hexagon/remove_lsr.ll            | 112 +--
 test/CodeGen/Hexagon/simpletailcall.ll        |  18 +-
 test/CodeGen/Hexagon/union-1.ll               |  30 +-
 test/CodeGen/Hexagon/vaddh.ll                 |  23 +-
 test/CodeGen/Hexagon/validate-offset.ll       |  56 +-
 test/MC/Hexagon/elf-flags.s                   |   2 -
 54 files changed, 1919 insertions(+), 2133 deletions(-)
 rename lib/Target/Hexagon/{HexagonInstrFormatsV4.td => HexagonInstrFormatsV5.td} (95%)
 delete mode 100644 lib/Target/Hexagon/HexagonIntrinsicsV3.td
 delete mode 100644 lib/Target/Hexagon/HexagonIntrinsicsV4.td
 rename lib/Target/Hexagon/{HexagonScheduleV4.td => HexagonScheduleV5.td} (70%)

diff --git a/lib/Target/Hexagon/CMakeLists.txt b/lib/Target/Hexagon/CMakeLists.txt
index 59377f4f359..3536aa81fb2 100644
--- a/lib/Target/Hexagon/CMakeLists.txt
+++ b/lib/Target/Hexagon/CMakeLists.txt
@@ -73,4 +73,3 @@ add_subdirectory(AsmParser)
 add_subdirectory(Disassembler)
 add_subdirectory(MCTargetDesc)
 add_subdirectory(TargetInfo)
-
diff --git a/lib/Target/Hexagon/Hexagon.td b/lib/Target/Hexagon/Hexagon.td
index 69e263a425f..8853dd6d550 100644
--- a/lib/Target/Hexagon/Hexagon.td
+++ b/lib/Target/Hexagon/Hexagon.td
@@ -323,31 +323,27 @@ class Proc<string Name, SchedMachineModel Model,
  : ProcessorModel<Name, Model, Features>;
 
 def : Proc<"generic", HexagonModelV60,
-           [ArchV4, ArchV5, ArchV55, ArchV60,
+           [ArchV5, ArchV55, ArchV60,
             FeatureDuplex, FeatureMemops, FeatureNVJ, FeatureNVS,
             FeaturePackets, FeatureSmallData]>;
-def : Proc<"hexagonv4",  HexagonModelV4,
-           [ArchV4,
-            FeatureDuplex, FeatureMemops, FeatureNVJ, FeatureNVS,
-            FeaturePackets, FeatureSmallData]>;
-def : Proc<"hexagonv5",  HexagonModelV4,
-           [ArchV4, ArchV5,
+def : Proc<"hexagonv5",  HexagonModelV5,
+           [ArchV5,
             FeatureDuplex, FeatureMemops, FeatureNVJ, FeatureNVS,
             FeaturePackets, FeatureSmallData]>;
 def : Proc<"hexagonv55", HexagonModelV55,
-           [ArchV4, ArchV5, ArchV55,
+           [ArchV5, ArchV55,
             FeatureDuplex, FeatureMemops, FeatureNVJ, FeatureNVS,
             FeaturePackets, FeatureSmallData]>;
 def : Proc<"hexagonv60", HexagonModelV60,
-           [ArchV4, ArchV5, ArchV55, ArchV60,
+           [ArchV5, ArchV55, ArchV60,
             FeatureDuplex, FeatureMemops, FeatureNVJ, FeatureNVS,
             FeaturePackets, FeatureSmallData]>;
 def : Proc<"hexagonv62", HexagonModelV62,
-           [ArchV4, ArchV5, ArchV55, ArchV60, ArchV62,
+           [ArchV5, ArchV55, ArchV60, ArchV62,
             FeatureDuplex, FeatureMemops, FeatureNVJ, FeatureNVS,
             FeaturePackets, FeatureSmallData]>;
 def : Proc<"hexagonv65", HexagonModelV65,
-           [ArchV4, ArchV5, ArchV55, ArchV60, ArchV62, ArchV65,
+           [ArchV5, ArchV55, ArchV60, ArchV62, ArchV65,
             FeatureDuplex, FeatureMemNoShuf, FeatureMemops, FeatureNVJ,
             FeatureNVS, FeaturePackets, FeatureSmallData]>;
 
diff --git a/lib/Target/Hexagon/HexagonCopyToCombine.cpp b/lib/Target/Hexagon/HexagonCopyToCombine.cpp
index fccde96d8a3..28965b69e28 100644
--- a/lib/Target/Hexagon/HexagonCopyToCombine.cpp
+++ b/lib/Target/Hexagon/HexagonCopyToCombine.cpp
@@ -555,8 +555,7 @@ MachineInstr *HexagonCopyToCombine::findPairable(MachineInstr &I1,
     if ((!IsI1LowReg && !IsI2LowReg) || !isEvenReg(FirstRegIndex))
       continue;
 
-    // Check that the two instructions are combinable. V4 allows more
-    // instructions to be merged into a combine.
+    // Check that the two instructions are combinable.
     // The order matters because in a A2_tfrsi we might can encode a int8 as
     // the hi reg operand but only a uint6 as the low reg operand.
     if ((IsI2LowReg && !areCombinableOperations(TRI, I1, *I2, AllowC64)) ||
diff --git a/lib/Target/Hexagon/HexagonDepArch.h b/lib/Target/Hexagon/HexagonDepArch.h
index dc75f8f6340..1bcf4022061 100644
--- a/lib/Target/Hexagon/HexagonDepArch.h
+++ b/lib/Target/Hexagon/HexagonDepArch.h
@@ -15,7 +15,7 @@
 #define HEXAGON_DEP_ARCH_H
 namespace llvm {
 namespace Hexagon {
-enum class ArchEnum { V4,V5,V55,V60,V62,V65 };
+enum class ArchEnum { NoArch,Generic,V5,V55,V60,V62,V65 };
 } // namespace Hexagon
 } // namespace llvm;
 #endif // HEXAGON_DEP_ARCH_H
diff --git a/lib/Target/Hexagon/HexagonDepArch.td b/lib/Target/Hexagon/HexagonDepArch.td
index 3594379aa84..ce795692610 100644
--- a/lib/Target/Hexagon/HexagonDepArch.td
+++ b/lib/Target/Hexagon/HexagonDepArch.td
@@ -18,7 +18,4 @@ def ArchV60: SubtargetFeature<"v60", "HexagonArchVersion", "Hexagon::ArchEnum::V
 def HasV60 : Predicate<"HST->hasV60Ops()">, AssemblerPredicate<"ArchV60">;
 def ArchV55: SubtargetFeature<"v55", "HexagonArchVersion", "Hexagon::ArchEnum::V55", "Enable Hexagon V55 architecture">;
 def HasV55 : Predicate<"HST->hasV55Ops()">, AssemblerPredicate<"ArchV55">;
-def ArchV4: SubtargetFeature<"v4", "HexagonArchVersion", "Hexagon::ArchEnum::V4", "Enable Hexagon V4 architecture">;
-def HasV4 : Predicate<"HST->hasV4Ops()">, AssemblerPredicate<"ArchV4">;
 def ArchV5: SubtargetFeature<"v5", "HexagonArchVersion", "Hexagon::ArchEnum::V5", "Enable Hexagon V5 architecture">;
-def HasV5 : Predicate<"HST->hasV5Ops()">, AssemblerPredicate<"ArchV5">;
diff --git a/lib/Target/Hexagon/HexagonDepInstrInfo.td b/lib/Target/Hexagon/HexagonDepInstrInfo.td
index 5c9ed271cea..0b5efda933d 100644
--- a/lib/Target/Hexagon/HexagonDepInstrInfo.td
+++ b/lib/Target/Hexagon/HexagonDepInstrInfo.td
@@ -991,7 +991,7 @@ def A2_roundsat : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32),
 "$Rd32 = round($Rss32):sat",
-tc_c2f7d806, TypeS_2op>, Enc_90cd8b, Requires<[HasV5]> {
+tc_c2f7d806, TypeS_2op>, Enc_90cd8b {
 let Inst{13-5} = 0b000000001;
 let Inst{31-21} = 0b10001000110;
 let hasNewValue = 1;
@@ -3314,7 +3314,7 @@ def A5_vaddhubs : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rd32 = vaddhub($Rss32,$Rtt32):sat",
-tc_2b6f77c6, TypeS_3op>, Enc_d2216a, Requires<[HasV5]> {
+tc_2b6f77c6, TypeS_3op>, Enc_d2216a {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000001010;
@@ -4059,7 +4059,7 @@ def F2_conv_d2df : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32),
 "$Rdd32 = convert_d2df($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb {
 let Inst{13-5} = 0b000000011;
 let Inst{31-21} = 0b10000000111;
 let isFP = 1;
@@ -4069,7 +4069,7 @@ def F2_conv_d2sf : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32),
 "$Rd32 = convert_d2sf($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_90cd8b, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_90cd8b {
 let Inst{13-5} = 0b000000001;
 let Inst{31-21} = 0b10001000010;
 let hasNewValue = 1;
@@ -4081,7 +4081,7 @@ def F2_conv_df2d : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32),
 "$Rdd32 = convert_df2d($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b10000000111;
 let isFP = 1;
@@ -4091,7 +4091,7 @@ def F2_conv_df2d_chop : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32),
 "$Rdd32 = convert_df2d($Rss32):chop",
-tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb {
 let Inst{13-5} = 0b000000110;
 let Inst{31-21} = 0b10000000111;
 let isFP = 1;
@@ -4101,7 +4101,7 @@ def F2_conv_df2sf : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32),
 "$Rd32 = convert_df2sf($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_90cd8b, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_90cd8b {
 let Inst{13-5} = 0b000000001;
 let Inst{31-21} = 0b10001000000;
 let hasNewValue = 1;
@@ -4113,7 +4113,7 @@ def F2_conv_df2ud : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32),
 "$Rdd32 = convert_df2ud($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb {
 let Inst{13-5} = 0b000000001;
 let Inst{31-21} = 0b10000000111;
 let isFP = 1;
@@ -4123,7 +4123,7 @@ def F2_conv_df2ud_chop : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32),
 "$Rdd32 = convert_df2ud($Rss32):chop",
-tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb {
 let Inst{13-5} = 0b000000111;
 let Inst{31-21} = 0b10000000111;
 let isFP = 1;
@@ -4133,7 +4133,7 @@ def F2_conv_df2uw : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32),
 "$Rd32 = convert_df2uw($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_90cd8b, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_90cd8b {
 let Inst{13-5} = 0b000000001;
 let Inst{31-21} = 0b10001000011;
 let hasNewValue = 1;
@@ -4145,7 +4145,7 @@ def F2_conv_df2uw_chop : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32),
 "$Rd32 = convert_df2uw($Rss32):chop",
-tc_f3eaa14b, TypeS_2op>, Enc_90cd8b, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_90cd8b {
 let Inst{13-5} = 0b000000001;
 let Inst{31-21} = 0b10001000101;
 let hasNewValue = 1;
@@ -4157,7 +4157,7 @@ def F2_conv_df2w : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32),
 "$Rd32 = convert_df2w($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_90cd8b, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_90cd8b {
 let Inst{13-5} = 0b000000001;
 let Inst{31-21} = 0b10001000100;
 let hasNewValue = 1;
@@ -4169,7 +4169,7 @@ def F2_conv_df2w_chop : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32),
 "$Rd32 = convert_df2w($Rss32):chop",
-tc_f3eaa14b, TypeS_2op>, Enc_90cd8b, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_90cd8b {
 let Inst{13-5} = 0b000000001;
 let Inst{31-21} = 0b10001000111;
 let hasNewValue = 1;
@@ -4181,7 +4181,7 @@ def F2_conv_sf2d : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32),
 "$Rdd32 = convert_sf2d($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_3a3d62, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_3a3d62 {
 let Inst{13-5} = 0b000000100;
 let Inst{31-21} = 0b10000100100;
 let isFP = 1;
@@ -4191,7 +4191,7 @@ def F2_conv_sf2d_chop : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32),
 "$Rdd32 = convert_sf2d($Rs32):chop",
-tc_f3eaa14b, TypeS_2op>, Enc_3a3d62, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_3a3d62 {
 let Inst{13-5} = 0b000000110;
 let Inst{31-21} = 0b10000100100;
 let isFP = 1;
@@ -4201,7 +4201,7 @@ def F2_conv_sf2df : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32),
 "$Rdd32 = convert_sf2df($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_3a3d62, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_3a3d62 {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b10000100100;
 let isFP = 1;
@@ -4211,7 +4211,7 @@ def F2_conv_sf2ud : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32),
 "$Rdd32 = convert_sf2ud($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_3a3d62, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_3a3d62 {
 let Inst{13-5} = 0b000000011;
 let Inst{31-21} = 0b10000100100;
 let isFP = 1;
@@ -4221,7 +4221,7 @@ def F2_conv_sf2ud_chop : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32),
 "$Rdd32 = convert_sf2ud($Rs32):chop",
-tc_f3eaa14b, TypeS_2op>, Enc_3a3d62, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_3a3d62 {
 let Inst{13-5} = 0b000000101;
 let Inst{31-21} = 0b10000100100;
 let isFP = 1;
@@ -4231,7 +4231,7 @@ def F2_conv_sf2uw : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = convert_sf2uw($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_5e2823, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_5e2823 {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b10001011011;
 let hasNewValue = 1;
@@ -4243,7 +4243,7 @@ def F2_conv_sf2uw_chop : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = convert_sf2uw($Rs32):chop",
-tc_f3eaa14b, TypeS_2op>, Enc_5e2823, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_5e2823 {
 let Inst{13-5} = 0b000000001;
 let Inst{31-21} = 0b10001011011;
 let hasNewValue = 1;
@@ -4255,7 +4255,7 @@ def F2_conv_sf2w : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = convert_sf2w($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_5e2823, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_5e2823 {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b10001011100;
 let hasNewValue = 1;
@@ -4267,7 +4267,7 @@ def F2_conv_sf2w_chop : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = convert_sf2w($Rs32):chop",
-tc_f3eaa14b, TypeS_2op>, Enc_5e2823, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_5e2823 {
 let Inst{13-5} = 0b000000001;
 let Inst{31-21} = 0b10001011100;
 let hasNewValue = 1;
@@ -4279,7 +4279,7 @@ def F2_conv_ud2df : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32),
 "$Rdd32 = convert_ud2df($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb {
 let Inst{13-5} = 0b000000010;
 let Inst{31-21} = 0b10000000111;
 let isFP = 1;
@@ -4289,7 +4289,7 @@ def F2_conv_ud2sf : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32),
 "$Rd32 = convert_ud2sf($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_90cd8b, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_90cd8b {
 let Inst{13-5} = 0b000000001;
 let Inst{31-21} = 0b10001000001;
 let hasNewValue = 1;
@@ -4301,7 +4301,7 @@ def F2_conv_uw2df : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32),
 "$Rdd32 = convert_uw2df($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_3a3d62, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_3a3d62 {
 let Inst{13-5} = 0b000000001;
 let Inst{31-21} = 0b10000100100;
 let isFP = 1;
@@ -4311,7 +4311,7 @@ def F2_conv_uw2sf : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = convert_uw2sf($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_5e2823, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_5e2823 {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b10001011001;
 let hasNewValue = 1;
@@ -4323,7 +4323,7 @@ def F2_conv_w2df : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32),
 "$Rdd32 = convert_w2df($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_3a3d62, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_3a3d62 {
 let Inst{13-5} = 0b000000010;
 let Inst{31-21} = 0b10000100100;
 let isFP = 1;
@@ -4333,7 +4333,7 @@ def F2_conv_w2sf : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = convert_w2sf($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_5e2823, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_5e2823 {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b10001011010;
 let hasNewValue = 1;
@@ -4345,7 +4345,7 @@ def F2_dfclass : HInst<
 (outs PredRegs:$Pd4),
 (ins DoubleRegs:$Rss32, u5_0Imm:$Ii),
 "$Pd4 = dfclass($Rss32,#$Ii)",
-tc_7a830544, TypeALU64>, Enc_1f19b5, Requires<[HasV5]> {
+tc_7a830544, TypeALU64>, Enc_1f19b5 {
 let Inst{4-2} = 0b100;
 let Inst{13-10} = 0b0000;
 let Inst{31-21} = 0b11011100100;
@@ -4356,7 +4356,7 @@ def F2_dfcmpeq : HInst<
 (outs PredRegs:$Pd4),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Pd4 = dfcmp.eq($Rss32,$Rtt32)",
-tc_1e856f58, TypeALU64>, Enc_fcf7a7, Requires<[HasV5]> {
+tc_1e856f58, TypeALU64>, Enc_fcf7a7 {
 let Inst{7-2} = 0b000000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010010111;
@@ -4368,7 +4368,7 @@ def F2_dfcmpge : HInst<
 (outs PredRegs:$Pd4),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Pd4 = dfcmp.ge($Rss32,$Rtt32)",
-tc_1e856f58, TypeALU64>, Enc_fcf7a7, Requires<[HasV5]> {
+tc_1e856f58, TypeALU64>, Enc_fcf7a7 {
 let Inst{7-2} = 0b010000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010010111;
@@ -4380,7 +4380,7 @@ def F2_dfcmpgt : HInst<
 (outs PredRegs:$Pd4),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Pd4 = dfcmp.gt($Rss32,$Rtt32)",
-tc_1e856f58, TypeALU64>, Enc_fcf7a7, Requires<[HasV5]> {
+tc_1e856f58, TypeALU64>, Enc_fcf7a7 {
 let Inst{7-2} = 0b001000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010010111;
@@ -4392,7 +4392,7 @@ def F2_dfcmpuo : HInst<
 (outs PredRegs:$Pd4),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Pd4 = dfcmp.uo($Rss32,$Rtt32)",
-tc_1e856f58, TypeALU64>, Enc_fcf7a7, Requires<[HasV5]> {
+tc_1e856f58, TypeALU64>, Enc_fcf7a7 {
 let Inst{7-2} = 0b011000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010010111;
@@ -4404,7 +4404,7 @@ def F2_dfimm_n : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins u10_0Imm:$Ii),
 "$Rdd32 = dfmake(#$Ii):neg",
-tc_234a11a5, TypeALU64>, Enc_e6c957, Requires<[HasV5]> {
+tc_234a11a5, TypeALU64>, Enc_e6c957 {
 let Inst{20-16} = 0b00000;
 let Inst{31-22} = 0b1101100101;
 let prefersSlot3 = 1;
@@ -4413,7 +4413,7 @@ def F2_dfimm_p : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins u10_0Imm:$Ii),
 "$Rdd32 = dfmake(#$Ii):pos",
-tc_234a11a5, TypeALU64>, Enc_e6c957, Requires<[HasV5]> {
+tc_234a11a5, TypeALU64>, Enc_e6c957 {
 let Inst{20-16} = 0b00000;
 let Inst{31-22} = 0b1101100100;
 let prefersSlot3 = 1;
@@ -4422,7 +4422,7 @@ def F2_sfadd : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = sfadd($Rs32,$Rt32)",
-tc_6792d5ff, TypeM>, Enc_5ab2be, Requires<[HasV5]> {
+tc_6792d5ff, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101011000;
@@ -4436,7 +4436,7 @@ def F2_sfclass : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, u5_0Imm:$Ii),
 "$Pd4 = sfclass($Rs32,#$Ii)",
-tc_7a830544, TypeS_2op>, Enc_83ee64, Requires<[HasV5]> {
+tc_7a830544, TypeS_2op>, Enc_83ee64 {
 let Inst{7-2} = 0b000000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10000101111;
@@ -4447,7 +4447,7 @@ def F2_sfcmpeq : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Pd4 = sfcmp.eq($Rs32,$Rt32)",
-tc_1e856f58, TypeS_3op>, Enc_c2b48e, Requires<[HasV5]> {
+tc_1e856f58, TypeS_3op>, Enc_c2b48e {
 let Inst{7-2} = 0b011000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000111111;
@@ -4459,7 +4459,7 @@ def F2_sfcmpge : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Pd4 = sfcmp.ge($Rs32,$Rt32)",
-tc_1e856f58, TypeS_3op>, Enc_c2b48e, Requires<[HasV5]> {
+tc_1e856f58, TypeS_3op>, Enc_c2b48e {
 let Inst{7-2} = 0b000000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000111111;
@@ -4471,7 +4471,7 @@ def F2_sfcmpgt : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Pd4 = sfcmp.gt($Rs32,$Rt32)",
-tc_1e856f58, TypeS_3op>, Enc_c2b48e, Requires<[HasV5]> {
+tc_1e856f58, TypeS_3op>, Enc_c2b48e {
 let Inst{7-2} = 0b100000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000111111;
@@ -4483,7 +4483,7 @@ def F2_sfcmpuo : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Pd4 = sfcmp.uo($Rs32,$Rt32)",
-tc_1e856f58, TypeS_3op>, Enc_c2b48e, Requires<[HasV5]> {
+tc_1e856f58, TypeS_3op>, Enc_c2b48e {
 let Inst{7-2} = 0b001000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000111111;
@@ -4495,7 +4495,7 @@ def F2_sffixupd : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = sffixupd($Rs32,$Rt32)",
-tc_6792d5ff, TypeM>, Enc_5ab2be, Requires<[HasV5]> {
+tc_6792d5ff, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101011110;
@@ -4507,7 +4507,7 @@ def F2_sffixupn : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = sffixupn($Rs32,$Rt32)",
-tc_6792d5ff, TypeM>, Enc_5ab2be, Requires<[HasV5]> {
+tc_6792d5ff, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101011110;
@@ -4519,7 +4519,7 @@ def F2_sffixupr : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = sffixupr($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_5e2823, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_5e2823 {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b10001011101;
 let hasNewValue = 1;
@@ -4530,7 +4530,7 @@ def F2_sffma : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 += sfmpy($Rs32,$Rt32)",
-tc_d580173f, TypeM>, Enc_2ae154, Requires<[HasV5]> {
+tc_d580173f, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101111000;
@@ -4544,7 +4544,7 @@ def F2_sffma_lib : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 += sfmpy($Rs32,$Rt32):lib",
-tc_d580173f, TypeM>, Enc_2ae154, Requires<[HasV5]> {
+tc_d580173f, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101111000;
@@ -4558,7 +4558,7 @@ def F2_sffma_sc : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32, PredRegs:$Pu4),
 "$Rx32 += sfmpy($Rs32,$Rt32,$Pu4):scale",
-tc_038a1342, TypeM>, Enc_437f33, Requires<[HasV5]> {
+tc_038a1342, TypeM>, Enc_437f33 {
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101111011;
@@ -4572,7 +4572,7 @@ def F2_sffms : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 -= sfmpy($Rs32,$Rt32)",
-tc_d580173f, TypeM>, Enc_2ae154, Requires<[HasV5]> {
+tc_d580173f, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101111000;
@@ -4586,7 +4586,7 @@ def F2_sffms_lib : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 -= sfmpy($Rs32,$Rt32):lib",
-tc_d580173f, TypeM>, Enc_2ae154, Requires<[HasV5]> {
+tc_d580173f, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101111000;
@@ -4600,7 +4600,7 @@ def F2_sfimm_n : HInst<
 (outs IntRegs:$Rd32),
 (ins u10_0Imm:$Ii),
 "$Rd32 = sfmake(#$Ii):neg",
-tc_234a11a5, TypeALU64>, Enc_6c9440, Requires<[HasV5]> {
+tc_234a11a5, TypeALU64>, Enc_6c9440 {
 let Inst{20-16} = 0b00000;
 let Inst{31-22} = 0b1101011001;
 let hasNewValue = 1;
@@ -4611,7 +4611,7 @@ def F2_sfimm_p : HInst<
 (outs IntRegs:$Rd32),
 (ins u10_0Imm:$Ii),
 "$Rd32 = sfmake(#$Ii):pos",
-tc_234a11a5, TypeALU64>, Enc_6c9440, Requires<[HasV5]> {
+tc_234a11a5, TypeALU64>, Enc_6c9440 {
 let Inst{20-16} = 0b00000;
 let Inst{31-22} = 0b1101011000;
 let hasNewValue = 1;
@@ -4622,7 +4622,7 @@ def F2_sfinvsqrta : HInst<
 (outs IntRegs:$Rd32, PredRegs:$Pe4),
 (ins IntRegs:$Rs32),
 "$Rd32,$Pe4 = sfinvsqrta($Rs32)",
-tc_4d99bca9, TypeS_2op>, Enc_890909, Requires<[HasV5]> {
+tc_4d99bca9, TypeS_2op>, Enc_890909 {
 let Inst{13-7} = 0b0000000;
 let Inst{31-21} = 0b10001011111;
 let hasNewValue = 1;
@@ -4634,7 +4634,7 @@ def F2_sfmax : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = sfmax($Rs32,$Rt32)",
-tc_976ddc4f, TypeM>, Enc_5ab2be, Requires<[HasV5]> {
+tc_976ddc4f, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101011100;
@@ -4648,7 +4648,7 @@ def F2_sfmin : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = sfmin($Rs32,$Rt32)",
-tc_976ddc4f, TypeM>, Enc_5ab2be, Requires<[HasV5]> {
+tc_976ddc4f, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101011100;
@@ -4662,7 +4662,7 @@ def F2_sfmpy : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = sfmpy($Rs32,$Rt32)",
-tc_6792d5ff, TypeM>, Enc_5ab2be, Requires<[HasV5]> {
+tc_6792d5ff, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101011010;
@@ -4676,7 +4676,7 @@ def F2_sfrecipa : HInst<
 (outs IntRegs:$Rd32, PredRegs:$Pe4),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32,$Pe4 = sfrecipa($Rs32,$Rt32)",
-tc_9c00ce8d, TypeM>, Enc_a94f3b, Requires<[HasV5]> {
+tc_9c00ce8d, TypeM>, Enc_a94f3b {
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101011111;
@@ -4689,7 +4689,7 @@ def F2_sfsub : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = sfsub($Rs32,$Rt32)",
-tc_6792d5ff, TypeM>, Enc_5ab2be, Requires<[HasV5]> {
+tc_6792d5ff, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101011000;
@@ -16981,7 +16981,7 @@ def M4_cmpyi_whc : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rd32 = cmpyiwh($Rss32,$Rt32*):<<1:rnd:sat",
-tc_8fd5f294, TypeS_3op>, Enc_3d5b28, Requires<[HasV5]> {
+tc_8fd5f294, TypeS_3op>, Enc_3d5b28 {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000101000;
@@ -17007,7 +17007,7 @@ def M4_cmpyr_whc : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rd32 = cmpyrwh($Rss32,$Rt32*):<<1:rnd:sat",
-tc_8fd5f294, TypeS_3op>, Enc_3d5b28, Requires<[HasV5]> {
+tc_8fd5f294, TypeS_3op>, Enc_3d5b28 {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000101000;
@@ -17360,7 +17360,7 @@ def M5_vdmacbsu : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32 += vdmpybsu($Rss32,$Rtt32):sat",
-tc_e913dc32, TypeM>, Enc_88c16c, Requires<[HasV5]> {
+tc_e913dc32, TypeM>, Enc_88c16c {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101010001;
@@ -17372,7 +17372,7 @@ def M5_vdmpybsu : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vdmpybsu($Rss32,$Rtt32):sat",
-tc_8fd5f294, TypeM>, Enc_a56825, Requires<[HasV5]> {
+tc_8fd5f294, TypeM>, Enc_a56825 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000101;
@@ -18207,7 +18207,7 @@ def S2_asr_i_p_rnd : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, u6_0Imm:$Ii),
 "$Rdd32 = asr($Rss32,#$Ii):rnd",
-tc_2b6f77c6, TypeS_2op>, Enc_5eac98, Requires<[HasV5]> {
+tc_2b6f77c6, TypeS_2op>, Enc_5eac98 {
 let Inst{7-5} = 0b111;
 let Inst{31-21} = 0b10000000110;
 let prefersSlot3 = 1;
@@ -18216,7 +18216,7 @@ def S2_asr_i_p_rnd_goodsyntax : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, u6_0Imm:$Ii),
 "$Rdd32 = asrrnd($Rss32,#$Ii)",
-tc_2b6f77c6, TypeS_2op>, Requires<[HasV5]> {
+tc_2b6f77c6, TypeS_2op> {
 let isPseudo = 1;
 }
 def S2_asr_i_r : HInst<
@@ -25151,7 +25151,7 @@ def S5_asrhub_rnd_sat : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
 "$Rd32 = vasrhub($Rss32,#$Ii):raw",
-tc_2b6f77c6, TypeS_2op>, Enc_11a146, Requires<[HasV5]> {
+tc_2b6f77c6, TypeS_2op>, Enc_11a146 {
 let Inst{7-5} = 0b100;
 let Inst{13-12} = 0b00;
 let Inst{31-21} = 0b10001000011;
@@ -25164,7 +25164,7 @@ def S5_asrhub_rnd_sat_goodsyntax : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
 "$Rd32 = vasrhub($Rss32,#$Ii):rnd:sat",
-tc_2b6f77c6, TypeS_2op>, Requires<[HasV5]> {
+tc_2b6f77c6, TypeS_2op> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -25173,7 +25173,7 @@ def S5_asrhub_sat : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
 "$Rd32 = vasrhub($Rss32,#$Ii):sat",
-tc_2b6f77c6, TypeS_2op>, Enc_11a146, Requires<[HasV5]> {
+tc_2b6f77c6, TypeS_2op>, Enc_11a146 {
 let Inst{7-5} = 0b101;
 let Inst{13-12} = 0b00;
 let Inst{31-21} = 0b10001000011;
@@ -25186,7 +25186,7 @@ def S5_popcountp : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32),
 "$Rd32 = popcount($Rss32)",
-tc_00afc57e, TypeS_2op>, Enc_90cd8b, Requires<[HasV5]> {
+tc_00afc57e, TypeS_2op>, Enc_90cd8b {
 let Inst{13-5} = 0b000000011;
 let Inst{31-21} = 0b10001000011;
 let hasNewValue = 1;
@@ -25197,7 +25197,7 @@ def S5_vasrhrnd : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
 "$Rdd32 = vasrh($Rss32,#$Ii):raw",
-tc_2b6f77c6, TypeS_2op>, Enc_12b6e9, Requires<[HasV5]> {
+tc_2b6f77c6, TypeS_2op>, Enc_12b6e9 {
 let Inst{7-5} = 0b000;
 let Inst{13-12} = 0b00;
 let Inst{31-21} = 0b10000000001;
@@ -25207,7 +25207,7 @@ def S5_vasrhrnd_goodsyntax : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
 "$Rdd32 = vasrh($Rss32,#$Ii):rnd",
-tc_2b6f77c6, TypeS_2op>, Requires<[HasV5]> {
+tc_2b6f77c6, TypeS_2op> {
 let isPseudo = 1;
 }
 def S6_allocframe_to_raw : HInst<
@@ -37007,7 +37007,7 @@ def Y5_l2fetch : HInst<
 (outs),
 (ins IntRegs:$Rs32, DoubleRegs:$Rtt32),
 "l2fetch($Rs32,$Rtt32)",
-tc_daa058fa, TypeST>, Enc_e6abcf, Requires<[HasV5]> {
+tc_daa058fa, TypeST>, Enc_e6abcf {
 let Inst{7-0} = 0b00000000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10100110100;
diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp
index aad457fc051..f2c27e5e39b 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -1228,7 +1228,6 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
                                              const HexagonSubtarget &ST)
     : TargetLowering(TM), HTM(static_cast<const HexagonTargetMachine&>(TM)),
       Subtarget(ST) {
-  bool IsV4 = !Subtarget.hasV5Ops();
   auto &HRI = *Subtarget.getRegisterInfo();
 
   setPrefLoopAlignment(4);
@@ -1270,10 +1269,8 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
   addRegisterClass(MVT::v4i16, &Hexagon::DoubleRegsRegClass);
   addRegisterClass(MVT::v2i32, &Hexagon::DoubleRegsRegClass);
 
-  if (Subtarget.hasV5Ops()) {
-    addRegisterClass(MVT::f32, &Hexagon::IntRegsRegClass);
-    addRegisterClass(MVT::f64, &Hexagon::DoubleRegsRegClass);
-  }
+  addRegisterClass(MVT::f32, &Hexagon::IntRegsRegClass);
+  addRegisterClass(MVT::f64, &Hexagon::DoubleRegsRegClass);
 
   //
   // Handling of scalar operations.
@@ -1351,8 +1348,7 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::CTTZ, MVT::i8,  Promote);
   setOperationAction(ISD::CTTZ, MVT::i16, Promote);
 
-  // In V5, popcount can count # of 1s in i64 but returns i32.
-  // On V4 it will be expanded (set later).
+  // Popcount can count # of 1s in i64 but returns i32.
   setOperationAction(ISD::CTPOP, MVT::i8,  Promote);
   setOperationAction(ISD::CTPOP, MVT::i16, Promote);
   setOperationAction(ISD::CTPOP, MVT::i32, Promote);
@@ -1515,57 +1511,28 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::ROTL, MVT::i32, Custom);
     setOperationAction(ISD::ROTL, MVT::i64, Custom);
   }
-  if (Subtarget.hasV5Ops()) {
-    setOperationAction(ISD::FMA,  MVT::f64, Expand);
-    setOperationAction(ISD::FADD, MVT::f64, Expand);
-    setOperationAction(ISD::FSUB, MVT::f64, Expand);
-    setOperationAction(ISD::FMUL, MVT::f64, Expand);
-
-    setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
-    setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
-
-    setOperationAction(ISD::FP_TO_UINT, MVT::i1,  Promote);
-    setOperationAction(ISD::FP_TO_UINT, MVT::i8,  Promote);
-    setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
-    setOperationAction(ISD::FP_TO_SINT, MVT::i1,  Promote);
-    setOperationAction(ISD::FP_TO_SINT, MVT::i8,  Promote);
-    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
-    setOperationAction(ISD::UINT_TO_FP, MVT::i1,  Promote);
-    setOperationAction(ISD::UINT_TO_FP, MVT::i8,  Promote);
-    setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
-    setOperationAction(ISD::SINT_TO_FP, MVT::i1,  Promote);
-    setOperationAction(ISD::SINT_TO_FP, MVT::i8,  Promote);
-    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
-  } else { // V4
-    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);
-    setOperationAction(ISD::SINT_TO_FP, MVT::i64, Expand);
-    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
-    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
-    setOperationAction(ISD::FP_TO_SINT, MVT::f64, Expand);
-    setOperationAction(ISD::FP_TO_SINT, MVT::f32, Expand);
-    setOperationAction(ISD::FP_EXTEND,  MVT::f32, Expand);
-    setOperationAction(ISD::FP_ROUND,   MVT::f64, Expand);
-    setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
-
-    setOperationAction(ISD::CTPOP, MVT::i8,  Expand);
-    setOperationAction(ISD::CTPOP, MVT::i16, Expand);
-    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
-    setOperationAction(ISD::CTPOP, MVT::i64, Expand);
-
-    // Expand these operations for both f32 and f64:
-    for (unsigned FPExpOpV4 :
-         {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FABS, ISD::FNEG, ISD::FMA}) {
-      setOperationAction(FPExpOpV4, MVT::f32, Expand);
-      setOperationAction(FPExpOpV4, MVT::f64, Expand);
-    }
 
-    for (ISD::CondCode FPExpCCV4 :
-         {ISD::SETOEQ, ISD::SETOGT, ISD::SETOLT, ISD::SETOGE, ISD::SETOLE,
-          ISD::SETUO,  ISD::SETO}) {
-      setCondCodeAction(FPExpCCV4, MVT::f32, Expand);
-      setCondCodeAction(FPExpCCV4, MVT::f64, Expand);
-    }
-  }
+  // V5+.
+  setOperationAction(ISD::FMA,  MVT::f64, Expand);
+  setOperationAction(ISD::FADD, MVT::f64, Expand);
+  setOperationAction(ISD::FSUB, MVT::f64, Expand);
+  setOperationAction(ISD::FMUL, MVT::f64, Expand);
+
+  setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
+  setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
+
+  setOperationAction(ISD::FP_TO_UINT, MVT::i1,  Promote);
+  setOperationAction(ISD::FP_TO_UINT, MVT::i8,  Promote);
+  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
+  setOperationAction(ISD::FP_TO_SINT, MVT::i1,  Promote);
+  setOperationAction(ISD::FP_TO_SINT, MVT::i8,  Promote);
+  setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
+  setOperationAction(ISD::UINT_TO_FP, MVT::i1,  Promote);
+  setOperationAction(ISD::UINT_TO_FP, MVT::i8,  Promote);
+  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
+  setOperationAction(ISD::SINT_TO_FP, MVT::i1,  Promote);
+  setOperationAction(ISD::SINT_TO_FP, MVT::i8,  Promote);
+  setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
 
   // Handling of indexed loads/stores: default is "expand".
   //
@@ -1601,42 +1568,18 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
   setLibcallName(RTLIB::FPTOSINT_F32_I128, "__hexagon_fixsfti");
   setLibcallName(RTLIB::FPTOSINT_F64_I128, "__hexagon_fixdfti");
 
-  if (IsV4) {
-    // Handle single-precision floating point operations on V4.
-    if (FastMath) {
-      setLibcallName(RTLIB::ADD_F32, "__hexagon_fast_addsf3");
-      setLibcallName(RTLIB::SUB_F32, "__hexagon_fast_subsf3");
-      setLibcallName(RTLIB::MUL_F32, "__hexagon_fast_mulsf3");
-      setLibcallName(RTLIB::OGT_F32, "__hexagon_fast_gtsf2");
-      setLibcallName(RTLIB::OLT_F32, "__hexagon_fast_ltsf2");
-      // Double-precision compares.
-      setLibcallName(RTLIB::OGT_F64, "__hexagon_fast_gtdf2");
-      setLibcallName(RTLIB::OLT_F64, "__hexagon_fast_ltdf2");
-    } else {
-      setLibcallName(RTLIB::ADD_F32, "__hexagon_addsf3");
-      setLibcallName(RTLIB::SUB_F32, "__hexagon_subsf3");
-      setLibcallName(RTLIB::MUL_F32, "__hexagon_mulsf3");
-      setLibcallName(RTLIB::OGT_F32, "__hexagon_gtsf2");
-      setLibcallName(RTLIB::OLT_F32, "__hexagon_ltsf2");
-      // Double-precision compares.
-      setLibcallName(RTLIB::OGT_F64, "__hexagon_gtdf2");
-      setLibcallName(RTLIB::OLT_F64, "__hexagon_ltdf2");
-    }
-  }
-
   // This is the only fast library function for sqrtd.
   if (FastMath)
     setLibcallName(RTLIB::SQRT_F64, "__hexagon_fast2_sqrtdf2");
 
   // Prefix is: nothing  for "slow-math",
-  //            "fast2_" for V4 fast-math and V5+ fast-math double-precision
+  //            "fast2_" for V5+ fast-math double-precision
   // (actually, keep fast-math and fast-math2 separate for now)
   if (FastMath) {
     setLibcallName(RTLIB::ADD_F64, "__hexagon_fast_adddf3");
     setLibcallName(RTLIB::SUB_F64, "__hexagon_fast_subdf3");
     setLibcallName(RTLIB::MUL_F64, "__hexagon_fast_muldf3");
     setLibcallName(RTLIB::DIV_F64, "__hexagon_fast_divdf3");
-    // Calling __hexagon_fast2_divsf3 with fast-math on V5 (ok).
     setLibcallName(RTLIB::DIV_F32, "__hexagon_fast_divsf3");
   } else {
     setLibcallName(RTLIB::ADD_F64, "__hexagon_adddf3");
@@ -1646,44 +1589,10 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
     setLibcallName(RTLIB::DIV_F32, "__hexagon_divsf3");
   }
 
-  if (Subtarget.hasV5Ops()) {
-    if (FastMath)
-      setLibcallName(RTLIB::SQRT_F32, "__hexagon_fast2_sqrtf");
-    else
-      setLibcallName(RTLIB::SQRT_F32, "__hexagon_sqrtf");
-  } else {
-    // V4
-    setLibcallName(RTLIB::SINTTOFP_I32_F32, "__hexagon_floatsisf");
-    setLibcallName(RTLIB::SINTTOFP_I32_F64, "__hexagon_floatsidf");
-    setLibcallName(RTLIB::SINTTOFP_I64_F32, "__hexagon_floatdisf");
-    setLibcallName(RTLIB::SINTTOFP_I64_F64, "__hexagon_floatdidf");
-    setLibcallName(RTLIB::UINTTOFP_I32_F32, "__hexagon_floatunsisf");
-    setLibcallName(RTLIB::UINTTOFP_I32_F64, "__hexagon_floatunsidf");
-    setLibcallName(RTLIB::UINTTOFP_I64_F32, "__hexagon_floatundisf");
-    setLibcallName(RTLIB::UINTTOFP_I64_F64, "__hexagon_floatundidf");
-    setLibcallName(RTLIB::FPTOUINT_F32_I32, "__hexagon_fixunssfsi");
-    setLibcallName(RTLIB::FPTOUINT_F32_I64, "__hexagon_fixunssfdi");
-    setLibcallName(RTLIB::FPTOUINT_F64_I32, "__hexagon_fixunsdfsi");
-    setLibcallName(RTLIB::FPTOUINT_F64_I64, "__hexagon_fixunsdfdi");
-    setLibcallName(RTLIB::FPTOSINT_F32_I32, "__hexagon_fixsfsi");
-    setLibcallName(RTLIB::FPTOSINT_F32_I64, "__hexagon_fixsfdi");
-    setLibcallName(RTLIB::FPTOSINT_F64_I32, "__hexagon_fixdfsi");
-    setLibcallName(RTLIB::FPTOSINT_F64_I64, "__hexagon_fixdfdi");
-    setLibcallName(RTLIB::FPEXT_F32_F64,    "__hexagon_extendsfdf2");
-    setLibcallName(RTLIB::FPROUND_F64_F32,  "__hexagon_truncdfsf2");
-    setLibcallName(RTLIB::OEQ_F32, "__hexagon_eqsf2");
-    setLibcallName(RTLIB::OEQ_F64, "__hexagon_eqdf2");
-    setLibcallName(RTLIB::OGE_F32, "__hexagon_gesf2");
-    setLibcallName(RTLIB::OGE_F64, "__hexagon_gedf2");
-    setLibcallName(RTLIB::OLE_F32, "__hexagon_lesf2");
-    setLibcallName(RTLIB::OLE_F64, "__hexagon_ledf2");
-    setLibcallName(RTLIB::UNE_F32, "__hexagon_nesf2");
-    setLibcallName(RTLIB::UNE_F64, "__hexagon_nedf2");
-    setLibcallName(RTLIB::UO_F32,  "__hexagon_unordsf2");
-    setLibcallName(RTLIB::UO_F64,  "__hexagon_unorddf2");
-    setLibcallName(RTLIB::O_F32,   "__hexagon_unordsf2");
-    setLibcallName(RTLIB::O_F64,   "__hexagon_unorddf2");
-  }
+  if (FastMath)
+    setLibcallName(RTLIB::SQRT_F32, "__hexagon_fast2_sqrtf");
+  else
+    setLibcallName(RTLIB::SQRT_F32, "__hexagon_sqrtf");
 
   // These cause problems when the shift amount is non-constant.
   setLibcallName(RTLIB::SHL_I128, nullptr);
@@ -3007,7 +2916,7 @@ HexagonTargetLowering::getRegForInlineAsmConstraint(
 /// specified FP immediate natively. If false, the legalizer will
 /// materialize the FP immediate as a load from a constant pool.
 bool HexagonTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
-  return Subtarget.hasV5Ops();
+  return true;
 }
 
 /// isLegalAddressingMode - Return true if the addressing mode represented by
diff --git a/lib/Target/Hexagon/HexagonInstrFormats.td b/lib/Target/Hexagon/HexagonInstrFormats.td
index 1bb3bc1ea31..a1082e7a777 100644
--- a/lib/Target/Hexagon/HexagonInstrFormats.td
+++ b/lib/Target/Hexagon/HexagonInstrFormats.td
@@ -194,8 +194,6 @@ class HInst<dag outs, dag ins, string asmstr, InstrItinClass itin, IType type> :
 //                         Instruction Classes Definitions +
 //===----------------------------------------------------------------------===//
 
-// LD Instruction Class in V2/V3/V4.
-// Definition of the instruction class NOT CHANGED.
 let mayLoad = 1 in
 class LDInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
              string cstr = "", InstrItinClass itin = LD_tc_ld_SLOT01>
@@ -205,9 +203,6 @@ class CONSTLDInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
              string cstr = "", InstrItinClass itin = LD_tc_ld_SLOT01>
   : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeLD>, OpcodeHexagon;
 
-// ST Instruction Class in V2/V3 can take SLOT0 only.
-// ST Instruction Class in V4    can take SLOT0 & SLOT1.
-// Definition of the instruction class CHANGED from V2/V3 to V4.
 let mayStore = 1 in
 class STInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
              string cstr = "", InstrItinClass itin = ST_tc_st_SLOT01>
@@ -235,15 +230,6 @@ class PseudoM<dag outs, dag ins, string asmstr, list<dag> pattern = [],
 //                         Instruction Classes Definitions -
 //===----------------------------------------------------------------------===//
 
-//===----------------------------------------------------------------------===//
-// V4 Instruction Format Definitions +
-//===----------------------------------------------------------------------===//
-
-include "HexagonInstrFormatsV4.td"
-
-//===----------------------------------------------------------------------===//
-// V60+ Instruction Format Definitions +
-//===----------------------------------------------------------------------===//
-
+include "HexagonInstrFormatsV5.td"
 include "HexagonInstrFormatsV60.td"
 include "HexagonInstrFormatsV65.td"
diff --git a/lib/Target/Hexagon/HexagonInstrFormatsV4.td b/lib/Target/Hexagon/HexagonInstrFormatsV5.td
similarity index 95%
rename from lib/Target/Hexagon/HexagonInstrFormatsV4.td
rename to lib/Target/Hexagon/HexagonInstrFormatsV5.td
index c5fa2599521..482688ab90a 100644
--- a/lib/Target/Hexagon/HexagonInstrFormatsV4.td
+++ b/lib/Target/Hexagon/HexagonInstrFormatsV5.td
@@ -1,4 +1,4 @@
-//==- HexagonInstrFormatsV4.td - Hexagon Instruction Formats --*- tablegen -==//
+//==- HexagonInstrFormatsV5.td - Hexagon Instruction Formats --*- tablegen -==//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file describes the Hexagon V4 instruction classes in TableGen format.
+// This file describes the Hexagon V5 instruction classes in TableGen format.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/lib/Target/Hexagon/HexagonIntrinsics.td b/lib/Target/Hexagon/HexagonIntrinsics.td
index b25e316709c..206e74983d2 100644
--- a/lib/Target/Hexagon/HexagonIntrinsics.td
+++ b/lib/Target/Hexagon/HexagonIntrinsics.td
@@ -1398,7 +1398,5 @@ def: T_R_pat<Y2_dczeroa,     int_hexagon_Y2_dczeroa>;
 def: T_RR_pat<Y4_l2fetch,    int_hexagon_Y4_l2fetch>;
 def: T_RP_pat<Y5_l2fetch,    int_hexagon_Y5_l2fetch>;
 
-include "HexagonIntrinsicsV3.td"
-include "HexagonIntrinsicsV4.td"
 include "HexagonIntrinsicsV5.td"
 include "HexagonIntrinsicsV60.td"
diff --git a/lib/Target/Hexagon/HexagonIntrinsicsV3.td b/lib/Target/Hexagon/HexagonIntrinsicsV3.td
deleted file mode 100644
index 6152cb09882..00000000000
--- a/lib/Target/Hexagon/HexagonIntrinsicsV3.td
+++ /dev/null
@@ -1,27 +0,0 @@
-//=- HexagonIntrinsicsV3.td - Target Description for Hexagon -*- tablegen -*-=//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the Hexagon V3 Compiler Intrinsics in TableGen format.
-//
-//===----------------------------------------------------------------------===//
-
-// Vector reduce complex multiply real or imaginary
-def : T_PR_pat <M2_vrcmpys_s1,     int_hexagon_M2_vrcmpys_s1>;
-def : T_PPR_pat<M2_vrcmpys_acc_s1, int_hexagon_M2_vrcmpys_acc_s1>;
-def : T_PR_pat <M2_vrcmpys_s1rp,   int_hexagon_M2_vrcmpys_s1rp>;
-
-// Vector reduce add unsigned halfwords
-def : T_PP_pat<M2_vradduh, int_hexagon_M2_vradduh>;
-
-def: T_RP_pat<A2_addsp,   int_hexagon_A2_addsp>;
-def: T_PP_pat<A2_addpsat, int_hexagon_A2_addpsat>;
-def: T_PP_pat<A2_minp,    int_hexagon_A2_minp>;
-def: T_PP_pat<A2_minup,   int_hexagon_A2_minup>;
-def: T_PP_pat<A2_maxp,    int_hexagon_A2_maxp>;
-def: T_PP_pat<A2_maxup,   int_hexagon_A2_maxup>;
diff --git a/lib/Target/Hexagon/HexagonIntrinsicsV4.td b/lib/Target/Hexagon/HexagonIntrinsicsV4.td
deleted file mode 100644
index 2affe531515..00000000000
--- a/lib/Target/Hexagon/HexagonIntrinsicsV4.td
+++ /dev/null
@@ -1,305 +0,0 @@
-//===- HexagonIntrinsicsV4.td - V4 Instruction intrinsics --*- tablegen -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-// This is populated based on the following specs:
-// Hexagon V4 Architecture Extensions
-// Application-Level Specification
-// 80-V9418-12 Rev. A
-// June 15, 2010
-
-// Vector reduce multiply word by signed half (32x16)
-//Rdd=vrmpyweh(Rss,Rtt)[:<<1]
-def : T_PP_pat <M4_vrmpyeh_s0, int_hexagon_M4_vrmpyeh_s0>;
-def : T_PP_pat <M4_vrmpyeh_s1, int_hexagon_M4_vrmpyeh_s1>;
-
-//Rdd=vrmpywoh(Rss,Rtt)[:<<1]
-def : T_PP_pat <M4_vrmpyoh_s0, int_hexagon_M4_vrmpyoh_s0>;
-def : T_PP_pat <M4_vrmpyoh_s1, int_hexagon_M4_vrmpyoh_s1>;
-
-//Rdd+=vrmpyweh(Rss,Rtt)[:<<1]
-def : T_PPP_pat <M4_vrmpyeh_acc_s0, int_hexagon_M4_vrmpyeh_acc_s0>;
-def : T_PPP_pat <M4_vrmpyeh_acc_s1, int_hexagon_M4_vrmpyeh_acc_s1>;
-
-//Rdd=vrmpywoh(Rss,Rtt)[:<<1]
-def : T_PPP_pat <M4_vrmpyoh_acc_s0, int_hexagon_M4_vrmpyoh_acc_s0>;
-def : T_PPP_pat <M4_vrmpyoh_acc_s1, int_hexagon_M4_vrmpyoh_acc_s1>;
-
-// Vector multiply halfwords, signed by unsigned
-// Rdd=vmpyhsu(Rs,Rt)[:<<1]:sat
-def : T_RR_pat <M2_vmpy2su_s0, int_hexagon_M2_vmpy2su_s0>;
-def : T_RR_pat <M2_vmpy2su_s1, int_hexagon_M2_vmpy2su_s1>;
-
-// Rxx+=vmpyhsu(Rs,Rt)[:<<1]:sat
-def : T_PRR_pat <M2_vmac2su_s0, int_hexagon_M2_vmac2su_s0>;
-def : T_PRR_pat <M2_vmac2su_s1, int_hexagon_M2_vmac2su_s1>;
-
-// Vector polynomial multiply halfwords
-// Rdd=vpmpyh(Rs,Rt)
-def : T_RR_pat <M4_vpmpyh, int_hexagon_M4_vpmpyh>;
-// Rxx[^]=vpmpyh(Rs,Rt)
-def : T_PRR_pat <M4_vpmpyh_acc, int_hexagon_M4_vpmpyh_acc>;
-
-// Polynomial multiply words
-// Rdd=pmpyw(Rs,Rt)
-def : T_RR_pat <M4_pmpyw, int_hexagon_M4_pmpyw>;
-// Rxx^=pmpyw(Rs,Rt)
-def : T_PRR_pat <M4_pmpyw_acc, int_hexagon_M4_pmpyw_acc>;
-
-//Rxx^=asr(Rss,Rt)
-def : T_PPR_pat <S2_asr_r_p_xor, int_hexagon_S2_asr_r_p_xor>;
-//Rxx^=asl(Rss,Rt)
-def : T_PPR_pat <S2_asl_r_p_xor, int_hexagon_S2_asl_r_p_xor>;
-//Rxx^=lsr(Rss,Rt)
-def : T_PPR_pat <S2_lsr_r_p_xor, int_hexagon_S2_lsr_r_p_xor>;
-//Rxx^=lsl(Rss,Rt)
-def : T_PPR_pat <S2_lsl_r_p_xor, int_hexagon_S2_lsl_r_p_xor>;
-
-// Multiply and use upper result
-def : T_RR_pat <M2_mpysu_up, int_hexagon_M2_mpysu_up>;
-def : T_RR_pat <M2_mpy_up_s1, int_hexagon_M2_mpy_up_s1>;
-def : T_RR_pat <M2_hmmpyh_s1, int_hexagon_M2_hmmpyh_s1>;
-def : T_RR_pat <M2_hmmpyl_s1, int_hexagon_M2_hmmpyl_s1>;
-def : T_RR_pat <M2_mpy_up_s1_sat, int_hexagon_M2_mpy_up_s1_sat>;
-
-def : T_PP_pat <A2_vaddub, int_hexagon_A2_vaddb_map>;
-def : T_PP_pat <A2_vsubub, int_hexagon_A2_vsubb_map>;
-
-// Vector reduce add unsigned halfwords
-def : T_PP_pat <M2_vraddh, int_hexagon_M2_vraddh>;
-
-def: T_P_pat<S2_brevp, int_hexagon_S2_brevp>;
-def: T_P_pat<S2_ct0p,  int_hexagon_S2_ct0p>;
-def: T_P_pat<S2_ct1p,  int_hexagon_S2_ct1p>;
-
-def: T_Q_RR_pat<C4_nbitsset,  int_hexagon_C4_nbitsset>;
-def: T_Q_RR_pat<C4_nbitsclr,  int_hexagon_C4_nbitsclr>;
-def: T_Q_RI_pat<C4_nbitsclri, int_hexagon_C4_nbitsclri>;
-
-def : T_Q_PI_pat<A4_vcmpbeqi,     int_hexagon_A4_vcmpbeqi>;
-def : T_Q_PI_pat<A4_vcmpbgti,     int_hexagon_A4_vcmpbgti>;
-def : T_Q_PI_pat<A4_vcmpbgtui,    int_hexagon_A4_vcmpbgtui>;
-def : T_Q_PI_pat<A4_vcmpheqi,     int_hexagon_A4_vcmpheqi>;
-def : T_Q_PI_pat<A4_vcmphgti,     int_hexagon_A4_vcmphgti>;
-def : T_Q_PI_pat<A4_vcmphgtui,    int_hexagon_A4_vcmphgtui>;
-def : T_Q_PI_pat<A4_vcmpweqi,     int_hexagon_A4_vcmpweqi>;
-def : T_Q_PI_pat<A4_vcmpwgti,     int_hexagon_A4_vcmpwgti>;
-def : T_Q_PI_pat<A4_vcmpwgtui,    int_hexagon_A4_vcmpwgtui>;
-def : T_Q_PP_pat<A4_vcmpbeq_any,  int_hexagon_A4_vcmpbeq_any>;
-
-def : T_Q_RR_pat<A4_cmpbeq,   int_hexagon_A4_cmpbeq>;
-def : T_Q_RR_pat<A4_cmpbgt,   int_hexagon_A4_cmpbgt>;
-def : T_Q_RR_pat<A4_cmpbgtu,  int_hexagon_A4_cmpbgtu>;
-def : T_Q_RR_pat<A4_cmpheq,   int_hexagon_A4_cmpheq>;
-def : T_Q_RR_pat<A4_cmphgt,   int_hexagon_A4_cmphgt>;
-def : T_Q_RR_pat<A4_cmphgtu,  int_hexagon_A4_cmphgtu>;
-
-def : T_Q_RI_pat<A4_cmpbeqi,  int_hexagon_A4_cmpbeqi>;
-def : T_Q_RI_pat<A4_cmpbgti,  int_hexagon_A4_cmpbgti>;
-def : T_Q_RI_pat<A4_cmpbgtui, int_hexagon_A4_cmpbgtui>;
-
-def : T_Q_RI_pat<A4_cmpheqi,  int_hexagon_A4_cmpheqi>;
-def : T_Q_RI_pat<A4_cmphgti,  int_hexagon_A4_cmphgti>;
-def : T_Q_RI_pat<A4_cmphgtui, int_hexagon_A4_cmphgtui>;
-
-def : T_Q_RP_pat<A4_boundscheck, int_hexagon_A4_boundscheck>;
-def : T_Q_PR_pat<A4_tlbmatch,    int_hexagon_A4_tlbmatch>;
-
-def : T_RRR_pat <M4_mpyrr_addr,    int_hexagon_M4_mpyrr_addr>;
-def : T_IRR_pat <M4_mpyrr_addi,    int_hexagon_M4_mpyrr_addi>;
-def : T_IRI_pat <M4_mpyri_addi,    int_hexagon_M4_mpyri_addi>;
-def : T_RIR_pat <M4_mpyri_addr_u2, int_hexagon_M4_mpyri_addr_u2>;
-def : T_RRI_pat <M4_mpyri_addr,    int_hexagon_M4_mpyri_addr>;
-def : T_RRR_pat <M4_mac_up_s1_sat, int_hexagon_M4_mac_up_s1_sat>;
-def : T_RRR_pat <M4_nac_up_s1_sat, int_hexagon_M4_nac_up_s1_sat>;
-
-// Complex multiply 32x16
-def : T_PR_pat <M4_cmpyi_wh, int_hexagon_M4_cmpyi_wh>;
-def : T_PR_pat <M4_cmpyr_wh, int_hexagon_M4_cmpyr_wh>;
-
-def : T_PR_pat <M4_cmpyi_whc, int_hexagon_M4_cmpyi_whc>;
-def : T_PR_pat <M4_cmpyr_whc, int_hexagon_M4_cmpyr_whc>;
-
-def : T_PP_pat<A4_andnp, int_hexagon_A4_andnp>;
-def : T_PP_pat<A4_ornp,  int_hexagon_A4_ornp>;
-
-// Complex add/sub halfwords/words
-def : T_PP_pat <S4_vxaddsubw, int_hexagon_S4_vxaddsubw>;
-def : T_PP_pat <S4_vxsubaddw, int_hexagon_S4_vxsubaddw>;
-def : T_PP_pat <S4_vxaddsubh, int_hexagon_S4_vxaddsubh>;
-def : T_PP_pat <S4_vxsubaddh, int_hexagon_S4_vxsubaddh>;
-
-def : T_PP_pat <S4_vxaddsubhr, int_hexagon_S4_vxaddsubhr>;
-def : T_PP_pat <S4_vxsubaddhr, int_hexagon_S4_vxsubaddhr>;
-
-// Extract bitfield
-def : T_PP_pat  <S4_extractp_rp, int_hexagon_S4_extractp_rp>;
-def : T_RP_pat  <S4_extract_rp, int_hexagon_S4_extract_rp>;
-def : T_PII_pat <S4_extractp, int_hexagon_S4_extractp>;
-def : T_RII_pat <S4_extract, int_hexagon_S4_extract>;
-
-// Vector conditional negate
-// Rdd=vcnegh(Rss,Rt)
-def : T_PR_pat <S2_vcnegh, int_hexagon_S2_vcnegh>;
-
-// Shift an immediate left by register amount
-def : T_IR_pat<S4_lsli, int_hexagon_S4_lsli>;
-
-// Vector reduce maximum halfwords
-def : T_PPR_pat <A4_vrmaxh, int_hexagon_A4_vrmaxh>;
-def : T_PPR_pat <A4_vrmaxuh, int_hexagon_A4_vrmaxuh>;
-
-// Vector reduce maximum words
-def : T_PPR_pat <A4_vrmaxw, int_hexagon_A4_vrmaxw>;
-def : T_PPR_pat <A4_vrmaxuw, int_hexagon_A4_vrmaxuw>;
-
-// Vector reduce minimum halfwords
-def : T_PPR_pat <A4_vrminh, int_hexagon_A4_vrminh>;
-def : T_PPR_pat <A4_vrminuh, int_hexagon_A4_vrminuh>;
-
-// Vector reduce minimum words
-def : T_PPR_pat <A4_vrminw, int_hexagon_A4_vrminw>;
-def : T_PPR_pat <A4_vrminuw, int_hexagon_A4_vrminuw>;
-
-// Rotate and reduce bytes
-def : Pat <(int_hexagon_S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2,
-                                     u2_0ImmPred:$src3),
-           (S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2, u2_0ImmPred:$src3)>;
-
-// Rotate and reduce bytes with accumulation
-// Rxx+=vrcrotate(Rss,Rt,#u2)
-def : Pat <(int_hexagon_S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2,
-                                         IntRegs:$src3, u2_0ImmPred:$src4),
-           (S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2,
-                             IntRegs:$src3, u2_0ImmPred:$src4)>;
-
-// Vector conditional negate
-def : T_PPR_pat<S2_vrcnegh, int_hexagon_S2_vrcnegh>;
-
-// Logical xor with xor accumulation
-def : T_PPP_pat<M4_xor_xacc, int_hexagon_M4_xor_xacc>;
-
-// ALU64 - Vector min/max byte
-def : T_PP_pat <A2_vminb, int_hexagon_A2_vminb>;
-def : T_PP_pat <A2_vmaxb, int_hexagon_A2_vmaxb>;
-
-// Shift and add/sub/and/or
-def : T_IRI_pat <S4_andi_asl_ri, int_hexagon_S4_andi_asl_ri>;
-def : T_IRI_pat <S4_ori_asl_ri,  int_hexagon_S4_ori_asl_ri>;
-def : T_IRI_pat <S4_addi_asl_ri, int_hexagon_S4_addi_asl_ri>;
-def : T_IRI_pat <S4_subi_asl_ri, int_hexagon_S4_subi_asl_ri>;
-def : T_IRI_pat <S4_andi_lsr_ri, int_hexagon_S4_andi_lsr_ri>;
-def : T_IRI_pat <S4_ori_lsr_ri,  int_hexagon_S4_ori_lsr_ri>;
-def : T_IRI_pat <S4_addi_lsr_ri, int_hexagon_S4_addi_lsr_ri>;
-def : T_IRI_pat <S4_subi_lsr_ri, int_hexagon_S4_subi_lsr_ri>;
-
-// Split bitfield
-def : T_RI_pat <A4_bitspliti, int_hexagon_A4_bitspliti>;
-def : T_RR_pat <A4_bitsplit,  int_hexagon_A4_bitsplit>;
-
-def: T_RR_pat<S4_parity,      int_hexagon_S4_parity>;
-
-def: T_Q_RI_pat<S4_ntstbit_i, int_hexagon_S4_ntstbit_i>;
-def: T_Q_RR_pat<S4_ntstbit_r, int_hexagon_S4_ntstbit_r>;
-
-def: T_RI_pat<S4_clbaddi,     int_hexagon_S4_clbaddi>;
-def: T_PI_pat<S4_clbpaddi,    int_hexagon_S4_clbpaddi>;
-def: T_P_pat <S4_clbpnorm,    int_hexagon_S4_clbpnorm>;
-
-//*******************************************************************
-//            ALU32/ALU
-//*******************************************************************
-
-// ALU32 / ALU / Logical Operations.
-def: T_RR_pat<A4_andn, int_hexagon_A4_andn>;
-def: T_RR_pat<A4_orn,  int_hexagon_A4_orn>;
-
-//*******************************************************************
-//            ALU32/PERM
-//*******************************************************************
-
-// Combine Words Into Doublewords.
-def: T_RI_pat<A4_combineri, int_hexagon_A4_combineri, s32_0ImmPred>;
-def: T_IR_pat<A4_combineir, int_hexagon_A4_combineir, s32_0ImmPred>;
-
-//*******************************************************************
-//           ALU32/PRED
-//*******************************************************************
-
-// Compare
-def : T_Q_RI_pat<C4_cmpneqi, int_hexagon_C4_cmpneqi, s32_0ImmPred>;
-def : T_Q_RI_pat<C4_cmpltei, int_hexagon_C4_cmpltei, s32_0ImmPred>;
-def : T_Q_RI_pat<C4_cmplteui, int_hexagon_C4_cmplteui, u32_0ImmPred>;
-
-// Compare To General Register.
-def: T_Q_RR_pat<C4_cmpneq,  int_hexagon_C4_cmpneq>;
-def: T_Q_RR_pat<C4_cmplte,  int_hexagon_C4_cmplte>;
-def: T_Q_RR_pat<C4_cmplteu, int_hexagon_C4_cmplteu>;
-
-def: T_RR_pat<A4_rcmpeq,  int_hexagon_A4_rcmpeq>;
-def: T_RR_pat<A4_rcmpneq, int_hexagon_A4_rcmpneq>;
-
-def: T_RI_pat<A4_rcmpeqi,  int_hexagon_A4_rcmpeqi>;
-def: T_RI_pat<A4_rcmpneqi, int_hexagon_A4_rcmpneqi>;
-
-//*******************************************************************
-//           CR
-//*******************************************************************
-
-// CR / Logical Operations On Predicates.
-def: T_Q_QQQ_pat<C4_and_and,  int_hexagon_C4_and_and>;
-def: T_Q_QQQ_pat<C4_and_andn, int_hexagon_C4_and_andn>;
-def: T_Q_QQQ_pat<C4_and_or,   int_hexagon_C4_and_or>;
-def: T_Q_QQQ_pat<C4_and_orn,  int_hexagon_C4_and_orn>;
-def: T_Q_QQQ_pat<C4_or_and,   int_hexagon_C4_or_and>;
-def: T_Q_QQQ_pat<C4_or_andn,  int_hexagon_C4_or_andn>;
-def: T_Q_QQQ_pat<C4_or_or,    int_hexagon_C4_or_or>;
-def: T_Q_QQQ_pat<C4_or_orn,   int_hexagon_C4_or_orn>;
-
-//*******************************************************************
-//           XTYPE/ALU
-//*******************************************************************
-
-// Add And Accumulate.
-
-def : T_RRI_pat <S4_addaddi, int_hexagon_S4_addaddi>;
-def : T_RIR_pat <S4_subaddi, int_hexagon_S4_subaddi>;
-
-
-// XTYPE / ALU / Logical-logical Words.
-def : T_RRR_pat <M4_or_xor,   int_hexagon_M4_or_xor>;
-def : T_RRR_pat <M4_and_xor,  int_hexagon_M4_and_xor>;
-def : T_RRR_pat <M4_or_and,   int_hexagon_M4_or_and>;
-def : T_RRR_pat <M4_and_and,  int_hexagon_M4_and_and>;
-def : T_RRR_pat <M4_xor_and,  int_hexagon_M4_xor_and>;
-def : T_RRR_pat <M4_or_or,    int_hexagon_M4_or_or>;
-def : T_RRR_pat <M4_and_or,   int_hexagon_M4_and_or>;
-def : T_RRR_pat <M4_xor_or,   int_hexagon_M4_xor_or>;
-def : T_RRR_pat <M4_or_andn,  int_hexagon_M4_or_andn>;
-def : T_RRR_pat <M4_and_andn, int_hexagon_M4_and_andn>;
-def : T_RRR_pat <M4_xor_andn, int_hexagon_M4_xor_andn>;
-
-def : T_RRI_pat <S4_or_andi, int_hexagon_S4_or_andi>;
-def : T_RRI_pat <S4_or_andix,  int_hexagon_S4_or_andix>;
-def : T_RRI_pat <S4_or_ori, int_hexagon_S4_or_ori>;
-
-// Modulo wrap.
-def : T_RR_pat <A4_modwrapu, int_hexagon_A4_modwrapu>;
-
-// Arithmetic/Convergent round
-// Rd=[cround|round](Rs,Rt)[:sat]
-// Rd=[cround|round](Rs,#u5)[:sat]
-def : T_RI_pat <A4_cround_ri, int_hexagon_A4_cround_ri>;
-def : T_RR_pat <A4_cround_rr, int_hexagon_A4_cround_rr>;
-
-def : T_RI_pat <A4_round_ri, int_hexagon_A4_round_ri>;
-def : T_RR_pat <A4_round_rr, int_hexagon_A4_round_rr>;
-
-def : T_RI_pat <A4_round_ri_sat, int_hexagon_A4_round_ri_sat>;
-def : T_RR_pat <A4_round_rr_sat, int_hexagon_A4_round_rr_sat>;
-
-def : T_P_pat <A2_roundsat, int_hexagon_A2_roundsat>;
diff --git a/lib/Target/Hexagon/HexagonIntrinsicsV5.td b/lib/Target/Hexagon/HexagonIntrinsicsV5.td
index 29f67cffcf8..a852394f216 100644
--- a/lib/Target/Hexagon/HexagonIntrinsicsV5.td
+++ b/lib/Target/Hexagon/HexagonIntrinsicsV5.td
@@ -7,9 +7,314 @@
 //
 //===----------------------------------------------------------------------===//
 
+def : T_PR_pat <M2_vrcmpys_s1,     int_hexagon_M2_vrcmpys_s1>;
+def : T_PPR_pat<M2_vrcmpys_acc_s1, int_hexagon_M2_vrcmpys_acc_s1>;
+def : T_PR_pat <M2_vrcmpys_s1rp,   int_hexagon_M2_vrcmpys_s1rp>;
+
+// Vector reduce add unsigned halfwords
+def : T_PP_pat<M2_vradduh, int_hexagon_M2_vradduh>;
+
+def: T_RP_pat<A2_addsp,   int_hexagon_A2_addsp>;
+def: T_PP_pat<A2_addpsat, int_hexagon_A2_addpsat>;
+def: T_PP_pat<A2_minp,    int_hexagon_A2_minp>;
+def: T_PP_pat<A2_minup,   int_hexagon_A2_minup>;
+def: T_PP_pat<A2_maxp,    int_hexagon_A2_maxp>;
+def: T_PP_pat<A2_maxup,   int_hexagon_A2_maxup>;
+
+// Vector reduce multiply word by signed half (32x16)
+//Rdd=vrmpyweh(Rss,Rtt)[:<<1]
+def : T_PP_pat <M4_vrmpyeh_s0, int_hexagon_M4_vrmpyeh_s0>;
+def : T_PP_pat <M4_vrmpyeh_s1, int_hexagon_M4_vrmpyeh_s1>;
+
+//Rdd=vrmpywoh(Rss,Rtt)[:<<1]
+def : T_PP_pat <M4_vrmpyoh_s0, int_hexagon_M4_vrmpyoh_s0>;
+def : T_PP_pat <M4_vrmpyoh_s1, int_hexagon_M4_vrmpyoh_s1>;
+
+//Rdd+=vrmpyweh(Rss,Rtt)[:<<1]
+def : T_PPP_pat <M4_vrmpyeh_acc_s0, int_hexagon_M4_vrmpyeh_acc_s0>;
+def : T_PPP_pat <M4_vrmpyeh_acc_s1, int_hexagon_M4_vrmpyeh_acc_s1>;
+
+//Rdd=vrmpywoh(Rss,Rtt)[:<<1]
+def : T_PPP_pat <M4_vrmpyoh_acc_s0, int_hexagon_M4_vrmpyoh_acc_s0>;
+def : T_PPP_pat <M4_vrmpyoh_acc_s1, int_hexagon_M4_vrmpyoh_acc_s1>;
+
+// Vector multiply halfwords, signed by unsigned
+// Rdd=vmpyhsu(Rs,Rt)[:<<1]:sat
+def : T_RR_pat <M2_vmpy2su_s0, int_hexagon_M2_vmpy2su_s0>;
+def : T_RR_pat <M2_vmpy2su_s1, int_hexagon_M2_vmpy2su_s1>;
+
+// Rxx+=vmpyhsu(Rs,Rt)[:<<1]:sat
+def : T_PRR_pat <M2_vmac2su_s0, int_hexagon_M2_vmac2su_s0>;
+def : T_PRR_pat <M2_vmac2su_s1, int_hexagon_M2_vmac2su_s1>;
+
+// Vector polynomial multiply halfwords
+// Rdd=vpmpyh(Rs,Rt)
+def : T_RR_pat <M4_vpmpyh, int_hexagon_M4_vpmpyh>;
+// Rxx[^]=vpmpyh(Rs,Rt)
+def : T_PRR_pat <M4_vpmpyh_acc, int_hexagon_M4_vpmpyh_acc>;
+
+// Polynomial multiply words
+// Rdd=pmpyw(Rs,Rt)
+def : T_RR_pat <M4_pmpyw, int_hexagon_M4_pmpyw>;
+// Rxx^=pmpyw(Rs,Rt)
+def : T_PRR_pat <M4_pmpyw_acc, int_hexagon_M4_pmpyw_acc>;
+
+//Rxx^=asr(Rss,Rt)
+def : T_PPR_pat <S2_asr_r_p_xor, int_hexagon_S2_asr_r_p_xor>;
+//Rxx^=asl(Rss,Rt)
+def : T_PPR_pat <S2_asl_r_p_xor, int_hexagon_S2_asl_r_p_xor>;
+//Rxx^=lsr(Rss,Rt)
+def : T_PPR_pat <S2_lsr_r_p_xor, int_hexagon_S2_lsr_r_p_xor>;
+//Rxx^=lsl(Rss,Rt)
+def : T_PPR_pat <S2_lsl_r_p_xor, int_hexagon_S2_lsl_r_p_xor>;
+
+// Multiply and use upper result
+def : T_RR_pat <M2_mpysu_up, int_hexagon_M2_mpysu_up>;
+def : T_RR_pat <M2_mpy_up_s1, int_hexagon_M2_mpy_up_s1>;
+def : T_RR_pat <M2_hmmpyh_s1, int_hexagon_M2_hmmpyh_s1>;
+def : T_RR_pat <M2_hmmpyl_s1, int_hexagon_M2_hmmpyl_s1>;
+def : T_RR_pat <M2_mpy_up_s1_sat, int_hexagon_M2_mpy_up_s1_sat>;
+
+def : T_PP_pat <A2_vaddub, int_hexagon_A2_vaddb_map>;
+def : T_PP_pat <A2_vsubub, int_hexagon_A2_vsubb_map>;
+
+// Vector reduce add unsigned halfwords
+def : T_PP_pat <M2_vraddh, int_hexagon_M2_vraddh>;
+
+def: T_P_pat<S2_brevp, int_hexagon_S2_brevp>;
+def: T_P_pat<S2_ct0p,  int_hexagon_S2_ct0p>;
+def: T_P_pat<S2_ct1p,  int_hexagon_S2_ct1p>;
+
+def: T_Q_RR_pat<C4_nbitsset,  int_hexagon_C4_nbitsset>;
+def: T_Q_RR_pat<C4_nbitsclr,  int_hexagon_C4_nbitsclr>;
+def: T_Q_RI_pat<C4_nbitsclri, int_hexagon_C4_nbitsclri>;
+
+def : T_Q_PI_pat<A4_vcmpbeqi,     int_hexagon_A4_vcmpbeqi>;
+def : T_Q_PI_pat<A4_vcmpbgti,     int_hexagon_A4_vcmpbgti>;
+def : T_Q_PI_pat<A4_vcmpbgtui,    int_hexagon_A4_vcmpbgtui>;
+def : T_Q_PI_pat<A4_vcmpheqi,     int_hexagon_A4_vcmpheqi>;
+def : T_Q_PI_pat<A4_vcmphgti,     int_hexagon_A4_vcmphgti>;
+def : T_Q_PI_pat<A4_vcmphgtui,    int_hexagon_A4_vcmphgtui>;
+def : T_Q_PI_pat<A4_vcmpweqi,     int_hexagon_A4_vcmpweqi>;
+def : T_Q_PI_pat<A4_vcmpwgti,     int_hexagon_A4_vcmpwgti>;
+def : T_Q_PI_pat<A4_vcmpwgtui,    int_hexagon_A4_vcmpwgtui>;
+def : T_Q_PP_pat<A4_vcmpbeq_any,  int_hexagon_A4_vcmpbeq_any>;
+
+def : T_Q_RR_pat<A4_cmpbeq,   int_hexagon_A4_cmpbeq>;
+def : T_Q_RR_pat<A4_cmpbgt,   int_hexagon_A4_cmpbgt>;
+def : T_Q_RR_pat<A4_cmpbgtu,  int_hexagon_A4_cmpbgtu>;
+def : T_Q_RR_pat<A4_cmpheq,   int_hexagon_A4_cmpheq>;
+def : T_Q_RR_pat<A4_cmphgt,   int_hexagon_A4_cmphgt>;
+def : T_Q_RR_pat<A4_cmphgtu,  int_hexagon_A4_cmphgtu>;
+
+def : T_Q_RI_pat<A4_cmpbeqi,  int_hexagon_A4_cmpbeqi>;
+def : T_Q_RI_pat<A4_cmpbgti,  int_hexagon_A4_cmpbgti>;
+def : T_Q_RI_pat<A4_cmpbgtui, int_hexagon_A4_cmpbgtui>;
+
+def : T_Q_RI_pat<A4_cmpheqi,  int_hexagon_A4_cmpheqi>;
+def : T_Q_RI_pat<A4_cmphgti,  int_hexagon_A4_cmphgti>;
+def : T_Q_RI_pat<A4_cmphgtui, int_hexagon_A4_cmphgtui>;
+
+def : T_Q_RP_pat<A4_boundscheck, int_hexagon_A4_boundscheck>;
+def : T_Q_PR_pat<A4_tlbmatch,    int_hexagon_A4_tlbmatch>;
+
+def : T_RRR_pat <M4_mpyrr_addr,    int_hexagon_M4_mpyrr_addr>;
+def : T_IRR_pat <M4_mpyrr_addi,    int_hexagon_M4_mpyrr_addi>;
+def : T_IRI_pat <M4_mpyri_addi,    int_hexagon_M4_mpyri_addi>;
+def : T_RIR_pat <M4_mpyri_addr_u2, int_hexagon_M4_mpyri_addr_u2>;
+def : T_RRI_pat <M4_mpyri_addr,    int_hexagon_M4_mpyri_addr>;
+def : T_RRR_pat <M4_mac_up_s1_sat, int_hexagon_M4_mac_up_s1_sat>;
+def : T_RRR_pat <M4_nac_up_s1_sat, int_hexagon_M4_nac_up_s1_sat>;
+
+// Complex multiply 32x16
+def : T_PR_pat <M4_cmpyi_wh, int_hexagon_M4_cmpyi_wh>;
+def : T_PR_pat <M4_cmpyr_wh, int_hexagon_M4_cmpyr_wh>;
+
+def : T_PR_pat <M4_cmpyi_whc, int_hexagon_M4_cmpyi_whc>;
+def : T_PR_pat <M4_cmpyr_whc, int_hexagon_M4_cmpyr_whc>;
+
+def : T_PP_pat<A4_andnp, int_hexagon_A4_andnp>;
+def : T_PP_pat<A4_ornp,  int_hexagon_A4_ornp>;
+
+// Complex add/sub halfwords/words
+def : T_PP_pat <S4_vxaddsubw, int_hexagon_S4_vxaddsubw>;
+def : T_PP_pat <S4_vxsubaddw, int_hexagon_S4_vxsubaddw>;
+def : T_PP_pat <S4_vxaddsubh, int_hexagon_S4_vxaddsubh>;
+def : T_PP_pat <S4_vxsubaddh, int_hexagon_S4_vxsubaddh>;
+
+def : T_PP_pat <S4_vxaddsubhr, int_hexagon_S4_vxaddsubhr>;
+def : T_PP_pat <S4_vxsubaddhr, int_hexagon_S4_vxsubaddhr>;
+
+// Extract bitfield
+def : T_PP_pat  <S4_extractp_rp, int_hexagon_S4_extractp_rp>;
+def : T_RP_pat  <S4_extract_rp, int_hexagon_S4_extract_rp>;
+def : T_PII_pat <S4_extractp, int_hexagon_S4_extractp>;
+def : T_RII_pat <S4_extract, int_hexagon_S4_extract>;
+
+// Vector conditional negate
+// Rdd=vcnegh(Rss,Rt)
+def : T_PR_pat <S2_vcnegh, int_hexagon_S2_vcnegh>;
+
+// Shift an immediate left by register amount
+def : T_IR_pat<S4_lsli, int_hexagon_S4_lsli>;
+
+// Vector reduce maximum halfwords
+def : T_PPR_pat <A4_vrmaxh, int_hexagon_A4_vrmaxh>;
+def : T_PPR_pat <A4_vrmaxuh, int_hexagon_A4_vrmaxuh>;
+
+// Vector reduce maximum words
+def : T_PPR_pat <A4_vrmaxw, int_hexagon_A4_vrmaxw>;
+def : T_PPR_pat <A4_vrmaxuw, int_hexagon_A4_vrmaxuw>;
+
+// Vector reduce minimum halfwords
+def : T_PPR_pat <A4_vrminh, int_hexagon_A4_vrminh>;
+def : T_PPR_pat <A4_vrminuh, int_hexagon_A4_vrminuh>;
+
+// Vector reduce minimum words
+def : T_PPR_pat <A4_vrminw, int_hexagon_A4_vrminw>;
+def : T_PPR_pat <A4_vrminuw, int_hexagon_A4_vrminuw>;
+
+// Rotate and reduce bytes
+def : Pat <(int_hexagon_S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2,
+                                     u2_0ImmPred:$src3),
+           (S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2, u2_0ImmPred:$src3)>;
+
+// Rotate and reduce bytes with accumulation
+// Rxx+=vrcrotate(Rss,Rt,#u2)
+def : Pat <(int_hexagon_S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2,
+                                         IntRegs:$src3, u2_0ImmPred:$src4),
+           (S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2,
+                             IntRegs:$src3, u2_0ImmPred:$src4)>;
+
+// Vector conditional negate
+def : T_PPR_pat<S2_vrcnegh, int_hexagon_S2_vrcnegh>;
+
+// Logical xor with xor accumulation
+def : T_PPP_pat<M4_xor_xacc, int_hexagon_M4_xor_xacc>;
+
+// ALU64 - Vector min/max byte
+def : T_PP_pat <A2_vminb, int_hexagon_A2_vminb>;
+def : T_PP_pat <A2_vmaxb, int_hexagon_A2_vmaxb>;
+
+// Shift and add/sub/and/or
+def : T_IRI_pat <S4_andi_asl_ri, int_hexagon_S4_andi_asl_ri>;
+def : T_IRI_pat <S4_ori_asl_ri,  int_hexagon_S4_ori_asl_ri>;
+def : T_IRI_pat <S4_addi_asl_ri, int_hexagon_S4_addi_asl_ri>;
+def : T_IRI_pat <S4_subi_asl_ri, int_hexagon_S4_subi_asl_ri>;
+def : T_IRI_pat <S4_andi_lsr_ri, int_hexagon_S4_andi_lsr_ri>;
+def : T_IRI_pat <S4_ori_lsr_ri,  int_hexagon_S4_ori_lsr_ri>;
+def : T_IRI_pat <S4_addi_lsr_ri, int_hexagon_S4_addi_lsr_ri>;
+def : T_IRI_pat <S4_subi_lsr_ri, int_hexagon_S4_subi_lsr_ri>;
+
+// Split bitfield
+def : T_RI_pat <A4_bitspliti, int_hexagon_A4_bitspliti>;
+def : T_RR_pat <A4_bitsplit,  int_hexagon_A4_bitsplit>;
+
+def: T_RR_pat<S4_parity,      int_hexagon_S4_parity>;
+
+def: T_Q_RI_pat<S4_ntstbit_i, int_hexagon_S4_ntstbit_i>;
+def: T_Q_RR_pat<S4_ntstbit_r, int_hexagon_S4_ntstbit_r>;
+
+def: T_RI_pat<S4_clbaddi,     int_hexagon_S4_clbaddi>;
+def: T_PI_pat<S4_clbpaddi,    int_hexagon_S4_clbpaddi>;
+def: T_P_pat <S4_clbpnorm,    int_hexagon_S4_clbpnorm>;
+
+//*******************************************************************
+//            ALU32/ALU
+//*******************************************************************
+
+// ALU32 / ALU / Logical Operations.
+def: T_RR_pat<A4_andn, int_hexagon_A4_andn>;
+def: T_RR_pat<A4_orn,  int_hexagon_A4_orn>;
+
+//*******************************************************************
+//            ALU32/PERM
+//*******************************************************************
+
+// Combine Words Into Doublewords.
+def: T_RI_pat<A4_combineri, int_hexagon_A4_combineri, s32_0ImmPred>;
+def: T_IR_pat<A4_combineir, int_hexagon_A4_combineir, s32_0ImmPred>;
+
+//*******************************************************************
+//           ALU32/PRED
+//*******************************************************************
+
+// Compare
+def : T_Q_RI_pat<C4_cmpneqi, int_hexagon_C4_cmpneqi, s32_0ImmPred>;
+def : T_Q_RI_pat<C4_cmpltei, int_hexagon_C4_cmpltei, s32_0ImmPred>;
+def : T_Q_RI_pat<C4_cmplteui, int_hexagon_C4_cmplteui, u32_0ImmPred>;
+
+// Compare To General Register.
+def: T_Q_RR_pat<C4_cmpneq,  int_hexagon_C4_cmpneq>;
+def: T_Q_RR_pat<C4_cmplte,  int_hexagon_C4_cmplte>;
+def: T_Q_RR_pat<C4_cmplteu, int_hexagon_C4_cmplteu>;
+
+def: T_RR_pat<A4_rcmpeq,  int_hexagon_A4_rcmpeq>;
+def: T_RR_pat<A4_rcmpneq, int_hexagon_A4_rcmpneq>;
+
+def: T_RI_pat<A4_rcmpeqi,  int_hexagon_A4_rcmpeqi>;
+def: T_RI_pat<A4_rcmpneqi, int_hexagon_A4_rcmpneqi>;
+
+//*******************************************************************
+//           CR
+//*******************************************************************
+
+// CR / Logical Operations On Predicates.
+def: T_Q_QQQ_pat<C4_and_and,  int_hexagon_C4_and_and>;
+def: T_Q_QQQ_pat<C4_and_andn, int_hexagon_C4_and_andn>;
+def: T_Q_QQQ_pat<C4_and_or,   int_hexagon_C4_and_or>;
+def: T_Q_QQQ_pat<C4_and_orn,  int_hexagon_C4_and_orn>;
+def: T_Q_QQQ_pat<C4_or_and,   int_hexagon_C4_or_and>;
+def: T_Q_QQQ_pat<C4_or_andn,  int_hexagon_C4_or_andn>;
+def: T_Q_QQQ_pat<C4_or_or,    int_hexagon_C4_or_or>;
+def: T_Q_QQQ_pat<C4_or_orn,   int_hexagon_C4_or_orn>;
+
+//*******************************************************************
+//           XTYPE/ALU
+//*******************************************************************
+
+// Add And Accumulate.
+
+def : T_RRI_pat <S4_addaddi, int_hexagon_S4_addaddi>;
+def : T_RIR_pat <S4_subaddi, int_hexagon_S4_subaddi>;
+
+
+// XTYPE / ALU / Logical-logical Words.
+def : T_RRR_pat <M4_or_xor,   int_hexagon_M4_or_xor>;
+def : T_RRR_pat <M4_and_xor,  int_hexagon_M4_and_xor>;
+def : T_RRR_pat <M4_or_and,   int_hexagon_M4_or_and>;
+def : T_RRR_pat <M4_and_and,  int_hexagon_M4_and_and>;
+def : T_RRR_pat <M4_xor_and,  int_hexagon_M4_xor_and>;
+def : T_RRR_pat <M4_or_or,    int_hexagon_M4_or_or>;
+def : T_RRR_pat <M4_and_or,   int_hexagon_M4_and_or>;
+def : T_RRR_pat <M4_xor_or,   int_hexagon_M4_xor_or>;
+def : T_RRR_pat <M4_or_andn,  int_hexagon_M4_or_andn>;
+def : T_RRR_pat <M4_and_andn, int_hexagon_M4_and_andn>;
+def : T_RRR_pat <M4_xor_andn, int_hexagon_M4_xor_andn>;
+
+def : T_RRI_pat <S4_or_andi, int_hexagon_S4_or_andi>;
+def : T_RRI_pat <S4_or_andix,  int_hexagon_S4_or_andix>;
+def : T_RRI_pat <S4_or_ori, int_hexagon_S4_or_ori>;
+
+// Modulo wrap.
+def : T_RR_pat <A4_modwrapu, int_hexagon_A4_modwrapu>;
+
+// Arithmetic/Convergent round
+// Rd=[cround|round](Rs,Rt)[:sat]
+// Rd=[cround|round](Rs,#u5)[:sat]
+def : T_RI_pat <A4_cround_ri, int_hexagon_A4_cround_ri>;
+def : T_RR_pat <A4_cround_rr, int_hexagon_A4_cround_rr>;
+
+def : T_RI_pat <A4_round_ri, int_hexagon_A4_round_ri>;
+def : T_RR_pat <A4_round_rr, int_hexagon_A4_round_rr>;
+
+def : T_RI_pat <A4_round_ri_sat, int_hexagon_A4_round_ri_sat>;
+def : T_RR_pat <A4_round_rr_sat, int_hexagon_A4_round_rr_sat>;
+
+def : T_P_pat <A2_roundsat, int_hexagon_A2_roundsat>;
+
 //Rdd[+]=vrmpybsu(Rss,Rtt)
 //Rdd[+]=vrmpybuu(Rss,Rtt)
-let Predicates = [HasV5]  in {
 def : T_PP_pat  <M5_vrmpybsu, int_hexagon_M5_vrmpybsu>;
 def : T_PP_pat  <M5_vrmpybuu, int_hexagon_M5_vrmpybuu>;
 
@@ -31,7 +336,6 @@ def : T_PRR_pat <M5_vmacbuu, int_hexagon_M5_vmacbuu>;
 
 // Rd=vaddhub(Rss,Rtt):sat
 def : T_PP_pat <A5_vaddhubs, int_hexagon_A5_vaddhubs>;
-}
 
 def : T_FF_pat<F2_sfadd, int_hexagon_F2_sfadd>;
 def : T_FF_pat<F2_sfsub, int_hexagon_F2_sfsub>;
diff --git a/lib/Target/Hexagon/HexagonPatterns.td b/lib/Target/Hexagon/HexagonPatterns.td
index f671238ec12..ddf5a9ca364 100644
--- a/lib/Target/Hexagon/HexagonPatterns.td
+++ b/lib/Target/Hexagon/HexagonPatterns.td
@@ -365,38 +365,34 @@ def ToI32: OutPatFrag<(ops node:$V), (A2_tfrsi $V)>;
 // --(2) Type cast -------------------------------------------------------
 //
 
-let Predicates = [HasV5] in {
-  def: OpR_R_pat<F2_conv_sf2df,      pf1<fpextend>,   f64, F32>;
-  def: OpR_R_pat<F2_conv_df2sf,      pf1<fpround>,    f32, F64>;
+def: OpR_R_pat<F2_conv_sf2df,      pf1<fpextend>,   f64, F32>;
+def: OpR_R_pat<F2_conv_df2sf,      pf1<fpround>,    f32, F64>;
 
-  def: OpR_R_pat<F2_conv_w2sf,       pf1<sint_to_fp>, f32, I32>;
-  def: OpR_R_pat<F2_conv_d2sf,       pf1<sint_to_fp>, f32, I64>;
-  def: OpR_R_pat<F2_conv_w2df,       pf1<sint_to_fp>, f64, I32>;
-  def: OpR_R_pat<F2_conv_d2df,       pf1<sint_to_fp>, f64, I64>;
+def: OpR_R_pat<F2_conv_w2sf,       pf1<sint_to_fp>, f32, I32>;
+def: OpR_R_pat<F2_conv_d2sf,       pf1<sint_to_fp>, f32, I64>;
+def: OpR_R_pat<F2_conv_w2df,       pf1<sint_to_fp>, f64, I32>;
+def: OpR_R_pat<F2_conv_d2df,       pf1<sint_to_fp>, f64, I64>;
 
-  def: OpR_R_pat<F2_conv_uw2sf,      pf1<uint_to_fp>, f32, I32>;
-  def: OpR_R_pat<F2_conv_ud2sf,      pf1<uint_to_fp>, f32, I64>;
-  def: OpR_R_pat<F2_conv_uw2df,      pf1<uint_to_fp>, f64, I32>;
-  def: OpR_R_pat<F2_conv_ud2df,      pf1<uint_to_fp>, f64, I64>;
+def: OpR_R_pat<F2_conv_uw2sf,      pf1<uint_to_fp>, f32, I32>;
+def: OpR_R_pat<F2_conv_ud2sf,      pf1<uint_to_fp>, f32, I64>;
+def: OpR_R_pat<F2_conv_uw2df,      pf1<uint_to_fp>, f64, I32>;
+def: OpR_R_pat<F2_conv_ud2df,      pf1<uint_to_fp>, f64, I64>;
 
-  def: OpR_R_pat<F2_conv_sf2w_chop,  pf1<fp_to_sint>, i32, F32>;
-  def: OpR_R_pat<F2_conv_df2w_chop,  pf1<fp_to_sint>, i32, F64>;
-  def: OpR_R_pat<F2_conv_sf2d_chop,  pf1<fp_to_sint>, i64, F32>;
-  def: OpR_R_pat<F2_conv_df2d_chop,  pf1<fp_to_sint>, i64, F64>;
+def: OpR_R_pat<F2_conv_sf2w_chop,  pf1<fp_to_sint>, i32, F32>;
+def: OpR_R_pat<F2_conv_df2w_chop,  pf1<fp_to_sint>, i32, F64>;
+def: OpR_R_pat<F2_conv_sf2d_chop,  pf1<fp_to_sint>, i64, F32>;
+def: OpR_R_pat<F2_conv_df2d_chop,  pf1<fp_to_sint>, i64, F64>;
 
-  def: OpR_R_pat<F2_conv_sf2uw_chop, pf1<fp_to_uint>, i32, F32>;
-  def: OpR_R_pat<F2_conv_df2uw_chop, pf1<fp_to_uint>, i32, F64>;
-  def: OpR_R_pat<F2_conv_sf2ud_chop, pf1<fp_to_uint>, i64, F32>;
-  def: OpR_R_pat<F2_conv_df2ud_chop, pf1<fp_to_uint>, i64, F64>;
-}
+def: OpR_R_pat<F2_conv_sf2uw_chop, pf1<fp_to_uint>, i32, F32>;
+def: OpR_R_pat<F2_conv_df2uw_chop, pf1<fp_to_uint>, i32, F64>;
+def: OpR_R_pat<F2_conv_sf2ud_chop, pf1<fp_to_uint>, i64, F32>;
+def: OpR_R_pat<F2_conv_df2ud_chop, pf1<fp_to_uint>, i64, F64>;
 
 // Bitcast is different than [fp|sint|uint]_to_[sint|uint|fp].
-let Predicates = [HasV5] in {
-  def: Pat<(i32 (bitconvert F32:$v)), (I32:$v)>;
-  def: Pat<(f32 (bitconvert I32:$v)), (F32:$v)>;
-  def: Pat<(i64 (bitconvert F64:$v)), (I64:$v)>;
-  def: Pat<(f64 (bitconvert I64:$v)), (F64:$v)>;
-}
+def: Pat<(i32 (bitconvert F32:$v)), (I32:$v)>;
+def: Pat<(f32 (bitconvert I32:$v)), (F32:$v)>;
+def: Pat<(i64 (bitconvert F64:$v)), (I64:$v)>;
+def: Pat<(f64 (bitconvert I64:$v)), (F64:$v)>;
 
 multiclass Cast_pat<ValueType Ta, ValueType Tb, RegisterClass RC> {
   def: Pat<(Tb (bitconvert (Ta RC:$Rs))), (Tb RC:$Rs)>;
@@ -599,31 +595,29 @@ def: OpR_RR_pat<A2_vcmpwgtu,  RevCmp<setult>, v2i1, V2I32>;
 def: OpR_RR_pat<A2_vcmpwgtu,  setugt,         i1,   V2I32>;
 def: OpR_RR_pat<A2_vcmpwgtu,  setugt,         v2i1, V2I32>;
 
-let Predicates = [HasV5] in {
-  def: OpR_RR_pat<F2_sfcmpeq,   seteq,          i1, F32>;
-  def: OpR_RR_pat<F2_sfcmpgt,   setgt,          i1, F32>;
-  def: OpR_RR_pat<F2_sfcmpge,   setge,          i1, F32>;
-  def: OpR_RR_pat<F2_sfcmpeq,   setoeq,         i1, F32>;
-  def: OpR_RR_pat<F2_sfcmpgt,   setogt,         i1, F32>;
-  def: OpR_RR_pat<F2_sfcmpge,   setoge,         i1, F32>;
-  def: OpR_RR_pat<F2_sfcmpgt,   RevCmp<setolt>, i1, F32>;
-  def: OpR_RR_pat<F2_sfcmpge,   RevCmp<setole>, i1, F32>;
-  def: OpR_RR_pat<F2_sfcmpgt,   RevCmp<setlt>,  i1, F32>;
-  def: OpR_RR_pat<F2_sfcmpge,   RevCmp<setle>,  i1, F32>;
-  def: OpR_RR_pat<F2_sfcmpuo,   setuo,          i1, F32>;
-
-  def: OpR_RR_pat<F2_dfcmpeq,   seteq,          i1, F64>;
-  def: OpR_RR_pat<F2_dfcmpgt,   setgt,          i1, F64>;
-  def: OpR_RR_pat<F2_dfcmpge,   setge,          i1, F64>;
-  def: OpR_RR_pat<F2_dfcmpeq,   setoeq,         i1, F64>;
-  def: OpR_RR_pat<F2_dfcmpgt,   setogt,         i1, F64>;
-  def: OpR_RR_pat<F2_dfcmpge,   setoge,         i1, F64>;
-  def: OpR_RR_pat<F2_dfcmpgt,   RevCmp<setolt>, i1, F64>;
-  def: OpR_RR_pat<F2_dfcmpge,   RevCmp<setole>, i1, F64>;
-  def: OpR_RR_pat<F2_dfcmpgt,   RevCmp<setlt>,  i1, F64>;
-  def: OpR_RR_pat<F2_dfcmpge,   RevCmp<setle>,  i1, F64>;
-  def: OpR_RR_pat<F2_dfcmpuo,   setuo,          i1, F64>;
-}
+def: OpR_RR_pat<F2_sfcmpeq,   seteq,          i1, F32>;
+def: OpR_RR_pat<F2_sfcmpgt,   setgt,          i1, F32>;
+def: OpR_RR_pat<F2_sfcmpge,   setge,          i1, F32>;
+def: OpR_RR_pat<F2_sfcmpeq,   setoeq,         i1, F32>;
+def: OpR_RR_pat<F2_sfcmpgt,   setogt,         i1, F32>;
+def: OpR_RR_pat<F2_sfcmpge,   setoge,         i1, F32>;
+def: OpR_RR_pat<F2_sfcmpgt,   RevCmp<setolt>, i1, F32>;
+def: OpR_RR_pat<F2_sfcmpge,   RevCmp<setole>, i1, F32>;
+def: OpR_RR_pat<F2_sfcmpgt,   RevCmp<setlt>,  i1, F32>;
+def: OpR_RR_pat<F2_sfcmpge,   RevCmp<setle>,  i1, F32>;
+def: OpR_RR_pat<F2_sfcmpuo,   setuo,          i1, F32>;
+
+def: OpR_RR_pat<F2_dfcmpeq,   seteq,          i1, F64>;
+def: OpR_RR_pat<F2_dfcmpgt,   setgt,          i1, F64>;
+def: OpR_RR_pat<F2_dfcmpge,   setge,          i1, F64>;
+def: OpR_RR_pat<F2_dfcmpeq,   setoeq,         i1, F64>;
+def: OpR_RR_pat<F2_dfcmpgt,   setogt,         i1, F64>;
+def: OpR_RR_pat<F2_dfcmpge,   setoge,         i1, F64>;
+def: OpR_RR_pat<F2_dfcmpgt,   RevCmp<setolt>, i1, F64>;
+def: OpR_RR_pat<F2_dfcmpge,   RevCmp<setole>, i1, F64>;
+def: OpR_RR_pat<F2_dfcmpgt,   RevCmp<setlt>,  i1, F64>;
+def: OpR_RR_pat<F2_dfcmpge,   RevCmp<setle>,  i1, F64>;
+def: OpR_RR_pat<F2_dfcmpuo,   setuo,          i1, F64>;
 
 // Avoid C4_cmpneqi, C4_cmpltei, C4_cmplteui, since they cannot form compounds.
 
@@ -746,32 +740,28 @@ class Cmpud<InstHexagon MI>:  T3<C2_or,  F2_dfcmpuo, MI>;
 class Cmpufn<InstHexagon MI>: T3<C2_orn, F2_sfcmpuo, MI>;
 class Cmpudn<InstHexagon MI>: T3<C2_orn, F2_dfcmpuo, MI>;
 
-let Predicates = [HasV5] in {
-  def: OpmR_RR_pat<Cmpuf<F2_sfcmpeq>,  setueq,         i1, F32>;
-  def: OpmR_RR_pat<Cmpuf<F2_sfcmpge>,  setuge,         i1, F32>;
-  def: OpmR_RR_pat<Cmpuf<F2_sfcmpgt>,  setugt,         i1, F32>;
-  def: OpmR_RR_pat<Cmpuf<F2_sfcmpge>,  RevCmp<setule>, i1, F32>;
-  def: OpmR_RR_pat<Cmpuf<F2_sfcmpgt>,  RevCmp<setult>, i1, F32>;
-  def: OpmR_RR_pat<Cmpufn<F2_sfcmpeq>, setune,         i1, F32>;
+def: OpmR_RR_pat<Cmpuf<F2_sfcmpeq>,  setueq,         i1, F32>;
+def: OpmR_RR_pat<Cmpuf<F2_sfcmpge>,  setuge,         i1, F32>;
+def: OpmR_RR_pat<Cmpuf<F2_sfcmpgt>,  setugt,         i1, F32>;
+def: OpmR_RR_pat<Cmpuf<F2_sfcmpge>,  RevCmp<setule>, i1, F32>;
+def: OpmR_RR_pat<Cmpuf<F2_sfcmpgt>,  RevCmp<setult>, i1, F32>;
+def: OpmR_RR_pat<Cmpufn<F2_sfcmpeq>, setune,         i1, F32>;
 
-  def: OpmR_RR_pat<Cmpud<F2_dfcmpeq>,  setueq,         i1, F64>;
-  def: OpmR_RR_pat<Cmpud<F2_dfcmpge>,  setuge,         i1, F64>;
-  def: OpmR_RR_pat<Cmpud<F2_dfcmpgt>,  setugt,         i1, F64>;
-  def: OpmR_RR_pat<Cmpud<F2_dfcmpge>,  RevCmp<setule>, i1, F64>;
-  def: OpmR_RR_pat<Cmpud<F2_dfcmpgt>,  RevCmp<setult>, i1, F64>;
-  def: OpmR_RR_pat<Cmpudn<F2_dfcmpeq>, setune,         i1, F64>;
-}
+def: OpmR_RR_pat<Cmpud<F2_dfcmpeq>,  setueq,         i1, F64>;
+def: OpmR_RR_pat<Cmpud<F2_dfcmpge>,  setuge,         i1, F64>;
+def: OpmR_RR_pat<Cmpud<F2_dfcmpgt>,  setugt,         i1, F64>;
+def: OpmR_RR_pat<Cmpud<F2_dfcmpge>,  RevCmp<setule>, i1, F64>;
+def: OpmR_RR_pat<Cmpud<F2_dfcmpgt>,  RevCmp<setult>, i1, F64>;
+def: OpmR_RR_pat<Cmpudn<F2_dfcmpeq>, setune,         i1, F64>;
 
-let Predicates = [HasV5] in {
-  def: OpmR_RR_pat<Outn<F2_sfcmpeq>, setone, i1, F32>;
-  def: OpmR_RR_pat<Outn<F2_sfcmpeq>, setne,  i1, F32>;
+def: OpmR_RR_pat<Outn<F2_sfcmpeq>, setone, i1, F32>;
+def: OpmR_RR_pat<Outn<F2_sfcmpeq>, setne,  i1, F32>;
 
-  def: OpmR_RR_pat<Outn<F2_dfcmpeq>, setone, i1, F64>;
-  def: OpmR_RR_pat<Outn<F2_dfcmpeq>, setne,  i1, F64>;
+def: OpmR_RR_pat<Outn<F2_dfcmpeq>, setone, i1, F64>;
+def: OpmR_RR_pat<Outn<F2_dfcmpeq>, setne,  i1, F64>;
 
-  def: OpmR_RR_pat<Outn<F2_sfcmpuo>, seto,   i1, F32>;
-  def: OpmR_RR_pat<Outn<F2_dfcmpuo>, seto,   i1, F64>;
-}
+def: OpmR_RR_pat<Outn<F2_sfcmpuo>, seto,   i1, F32>;
+def: OpmR_RR_pat<Outn<F2_dfcmpuo>, seto,   i1, F64>;
 
 
 // --(6) Select ----------------------------------------------------------
@@ -801,27 +791,25 @@ def: Pat<(select I1:$Pu, I64:$Rs, I64:$Rt),
          (Combinew (C2_mux I1:$Pu, (HiReg $Rs), (HiReg $Rt)),
                    (C2_mux I1:$Pu, (LoReg $Rs), (LoReg $Rt)))>;
 
-let Predicates = [HasV5] in {
-  def: Pat<(select I1:$Pu, F32:$Rs, f32ImmPred:$I),
-           (C2_muxir I1:$Pu, F32:$Rs, (ftoi $I))>;
-  def: Pat<(select I1:$Pu, f32ImmPred:$I, F32:$Rt),
-           (C2_muxri I1:$Pu, (ftoi $I), F32:$Rt)>;
-  def: Pat<(select I1:$Pu, F32:$Rs, F32:$Rt),
-           (C2_mux I1:$Pu, F32:$Rs, F32:$Rt)>;
-  def: Pat<(select I1:$Pu, F64:$Rs, F64:$Rt),
-           (Combinew (C2_mux I1:$Pu, (HiReg $Rs), (HiReg $Rt)),
-                     (C2_mux I1:$Pu, (LoReg $Rs), (LoReg $Rt)))>;
+def: Pat<(select I1:$Pu, F32:$Rs, f32ImmPred:$I),
+         (C2_muxir I1:$Pu, F32:$Rs, (ftoi $I))>;
+def: Pat<(select I1:$Pu, f32ImmPred:$I, F32:$Rt),
+         (C2_muxri I1:$Pu, (ftoi $I), F32:$Rt)>;
+def: Pat<(select I1:$Pu, F32:$Rs, F32:$Rt),
+         (C2_mux I1:$Pu, F32:$Rs, F32:$Rt)>;
+def: Pat<(select I1:$Pu, F64:$Rs, F64:$Rt),
+         (Combinew (C2_mux I1:$Pu, (HiReg $Rs), (HiReg $Rt)),
+                   (C2_mux I1:$Pu, (LoReg $Rs), (LoReg $Rt)))>;
 
-  def: Pat<(select (i1 (setult F32:$Ra, F32:$Rb)), F32:$Rs, F32:$Rt),
-           (C2_mux (F2_sfcmpgt F32:$Rb, F32:$Ra), F32:$Rs, F32:$Rt)>;
-  def: Pat<(select (i1 (setult F64:$Ra, F64:$Rb)), F64:$Rs, F64:$Rt),
-           (C2_vmux (F2_dfcmpgt F64:$Rb, F64:$Ra), F64:$Rs, F64:$Rt)>;
+def: Pat<(select (i1 (setult F32:$Ra, F32:$Rb)), F32:$Rs, F32:$Rt),
+         (C2_mux (F2_sfcmpgt F32:$Rb, F32:$Ra), F32:$Rs, F32:$Rt)>;
+def: Pat<(select (i1 (setult F64:$Ra, F64:$Rb)), F64:$Rs, F64:$Rt),
+         (C2_vmux (F2_dfcmpgt F64:$Rb, F64:$Ra), F64:$Rs, F64:$Rt)>;
 
-  def: Pat<(select (not I1:$Pu), f32ImmPred:$I, F32:$Rs),
-           (C2_muxir I1:$Pu, F32:$Rs, (ftoi $I))>;
-  def: Pat<(select (not I1:$Pu), F32:$Rt, f32ImmPred:$I),
-           (C2_muxri I1:$Pu, (ftoi $I), F32:$Rt)>;
-}
+def: Pat<(select (not I1:$Pu), f32ImmPred:$I, F32:$Rs),
+         (C2_muxir I1:$Pu, F32:$Rs, (ftoi $I))>;
+def: Pat<(select (not I1:$Pu), F32:$Rt, f32ImmPred:$I),
+         (C2_muxri I1:$Pu, (ftoi $I), F32:$Rt)>;
 
 def: Pat<(select I1:$Pu, V4I8:$Rs, V4I8:$Rt),
          (LoReg (C2_vmux I1:$Pu, (ToZext64 $Rs), (ToZext64 $Rt)))>;
@@ -889,7 +877,7 @@ let AddedComplexity = 200 in {
   defm: SelMinMax_pats<setult, I64, A2_minup, A2_maxup>;
 }
 
-let AddedComplexity = 100, Predicates = [HasV5] in {
+let AddedComplexity = 100 in {
   defm: SelMinMax_pats<setolt, F32, F2_sfmin, F2_sfmax>;
   defm: SelMinMax_pats<setole, F32, F2_sfmin, F2_sfmax>;
   defm: SelMinMax_pats<setogt, F32, F2_sfmax, F2_sfmin>;
@@ -1014,7 +1002,7 @@ let Predicates = [HasV60] in {
 def: Pat<(sra (add (sra I32:$Rs, u5_0ImmPred:$u5), 1), (i32 1)),
          (S2_asr_i_r_rnd I32:$Rs, imm:$u5)>;
 def: Pat<(sra (add (sra I64:$Rs, u6_0ImmPred:$u6), 1), (i32 1)),
-         (S2_asr_i_p_rnd I64:$Rs, imm:$u6)>, Requires<[HasV5]>;
+         (S2_asr_i_p_rnd I64:$Rs, imm:$u6)>;
 
 // Prefer S2_addasl_rrri over S2_asl_i_r_acc.
 let AddedComplexity = 120 in
@@ -1191,17 +1179,15 @@ def: Pat<(not  I32:$Rs), (A2_subri -1, I32:$Rs)>;
 def: Pat<(not  I64:$Rs), (A2_notp  I64:$Rs)>;
 def: Pat<(ineg I64:$Rs), (A2_negp  I64:$Rs)>;
 
-let Predicates = [HasV5] in {
-  def: Pat<(fabs F32:$Rs), (S2_clrbit_i    F32:$Rs, 31)>;
-  def: Pat<(fneg F32:$Rs), (S2_togglebit_i F32:$Rs, 31)>;
+def: Pat<(fabs F32:$Rs), (S2_clrbit_i    F32:$Rs, 31)>;
+def: Pat<(fneg F32:$Rs), (S2_togglebit_i F32:$Rs, 31)>;
 
-  def: Pat<(fabs F64:$Rs),
-           (Combinew (S2_clrbit_i (HiReg $Rs), 31),
-                     (i32 (LoReg $Rs)))>;
-  def: Pat<(fneg F64:$Rs),
-           (Combinew (S2_togglebit_i (HiReg $Rs), 31),
-                     (i32 (LoReg $Rs)))>;
-}
+def: Pat<(fabs F64:$Rs),
+         (Combinew (S2_clrbit_i (HiReg $Rs), 31),
+                   (i32 (LoReg $Rs)))>;
+def: Pat<(fneg F64:$Rs),
+         (Combinew (S2_togglebit_i (HiReg $Rs), 31),
+                   (i32 (LoReg $Rs)))>;
 
 def: Pat<(add I32:$Rs, anyimm:$s16),   (A2_addi   I32:$Rs,  imm:$s16)>;
 def: Pat<(or  I32:$Rs, anyimm:$s10),   (A2_orir   I32:$Rs,  imm:$s10)>;
@@ -1267,13 +1253,11 @@ def: OpR_RR_pat<C2_and,       Mul,        v2i1,  V2I1>;
 def: OpR_RR_pat<C2_and,       Mul,        v4i1,  V4I1>;
 def: OpR_RR_pat<C2_and,       Mul,        v8i1,  V8I1>;
 
-let Predicates = [HasV5] in {
-  def: OpR_RR_pat<F2_sfadd,     pf2<fadd>,    f32, F32>;
-  def: OpR_RR_pat<F2_sfsub,     pf2<fsub>,    f32, F32>;
-  def: OpR_RR_pat<F2_sfmpy,     pf2<fmul>,    f32, F32>;
-  def: OpR_RR_pat<F2_sfmin,     pf2<fminnum>, f32, F32>;
-  def: OpR_RR_pat<F2_sfmax,     pf2<fmaxnum>, f32, F32>;
-}
+def: OpR_RR_pat<F2_sfadd,     pf2<fadd>,    f32, F32>;
+def: OpR_RR_pat<F2_sfsub,     pf2<fsub>,    f32, F32>;
+def: OpR_RR_pat<F2_sfmpy,     pf2<fmul>,    f32, F32>;
+def: OpR_RR_pat<F2_sfmin,     pf2<fminnum>, f32, F32>;
+def: OpR_RR_pat<F2_sfmax,     pf2<fmaxnum>, f32, F32>;
 
 // In expressions like a0*b0 + a1*b1 + ..., prefer to generate multiply-add,
 // over add-add with individual multiplies as inputs.
@@ -1506,14 +1490,12 @@ def: Pat<(add I32:$Ru, (Su<Mul> I32:$Ry, I32:$Rs)),
          (M4_mpyrr_addr IntRegs:$Ru, IntRegs:$Ry, IntRegs:$Rs)>;
 
 
-let Predicates = [HasV5] in {
-  def: Pat<(fma F32:$Rs, F32:$Rt, F32:$Rx),
-           (F2_sffma F32:$Rx, F32:$Rs, F32:$Rt)>;
-  def: Pat<(fma (fneg F32:$Rs), F32:$Rt, F32:$Rx),
-           (F2_sffms F32:$Rx, F32:$Rs, F32:$Rt)>;
-  def: Pat<(fma F32:$Rs, (fneg F32:$Rt), F32:$Rx),
-           (F2_sffms F32:$Rx, F32:$Rs, F32:$Rt)>;
-}
+def: Pat<(fma F32:$Rs, F32:$Rt, F32:$Rx),
+         (F2_sffma F32:$Rx, F32:$Rs, F32:$Rt)>;
+def: Pat<(fma (fneg F32:$Rs), F32:$Rt, F32:$Rx),
+         (F2_sffms F32:$Rx, F32:$Rs, F32:$Rt)>;
+def: Pat<(fma F32:$Rs, (fneg F32:$Rt), F32:$Rx),
+         (F2_sffms F32:$Rx, F32:$Rs, F32:$Rt)>;
 
 
 def: Pat<(mul V2I32:$Rs, V2I32:$Rt),
@@ -1540,14 +1522,12 @@ def: Pat<(v4i16 (mul V4I16:$Rs, V4I16:$Rt)),
 
 // Multiplies two v4i8 vectors.
 def: Pat<(v4i8 (mul V4I8:$Rs, V4I8:$Rt)),
-         (S2_vtrunehb (M5_vmpybuu V4I8:$Rs, V4I8:$Rt))>,
-     Requires<[HasV5]>;
+         (S2_vtrunehb (M5_vmpybuu V4I8:$Rs, V4I8:$Rt))>;
 
 // Multiplies two v8i8 vectors.
 def: Pat<(v8i8 (mul V8I8:$Rs, V8I8:$Rt)),
          (Combinew (S2_vtrunehb (M5_vmpybuu (HiReg $Rs), (HiReg $Rt))),
-                   (S2_vtrunehb (M5_vmpybuu (LoReg $Rs), (LoReg $Rt))))>,
-     Requires<[HasV5]>;
+                   (S2_vtrunehb (M5_vmpybuu (LoReg $Rs), (LoReg $Rt))))>;
 
 
 // --(10) Bit ------------------------------------------------------------
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
index 2e11f875c0f..545def45a1c 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.cpp
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
@@ -118,18 +118,7 @@ HexagonRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
 
   bool HasEHReturn = MF->getInfo<HexagonMachineFunctionInfo>()->hasEHReturn();
 
-  switch (MF->getSubtarget<HexagonSubtarget>().getHexagonArchVersion()) {
-  case Hexagon::ArchEnum::V4:
-  case Hexagon::ArchEnum::V5:
-  case Hexagon::ArchEnum::V55:
-  case Hexagon::ArchEnum::V60:
-  case Hexagon::ArchEnum::V62:
-  case Hexagon::ArchEnum::V65:
-    return HasEHReturn ? CalleeSavedRegsV3EHReturn : CalleeSavedRegsV3;
-  }
-
-  llvm_unreachable("Callee saved registers requested for unknown architecture "
-                   "version");
+  return HasEHReturn ? CalleeSavedRegsV3EHReturn : CalleeSavedRegsV3;
 }
 
 
diff --git a/lib/Target/Hexagon/HexagonSchedule.td b/lib/Target/Hexagon/HexagonSchedule.td
index a1dfb66017a..fa4f9ca639c 100644
--- a/lib/Target/Hexagon/HexagonSchedule.td
+++ b/lib/Target/Hexagon/HexagonSchedule.td
@@ -57,10 +57,10 @@ include "HexagonDepIICScalar.td"
 include "HexagonDepIICHVX.td"
 
 //===----------------------------------------------------------------------===//
-// V4 Machine Info +
+// V5 Machine Info +
 //===----------------------------------------------------------------------===//
 
-include "HexagonScheduleV4.td"
+include "HexagonScheduleV5.td"
 
 // V55 Machine Info +
 include "HexagonScheduleV55.td"
diff --git a/lib/Target/Hexagon/HexagonScheduleV4.td b/lib/Target/Hexagon/HexagonScheduleV5.td
similarity index 70%
rename from lib/Target/Hexagon/HexagonScheduleV4.td
rename to lib/Target/Hexagon/HexagonScheduleV5.td
index 69b704a805b..9a893f6dde0 100644
--- a/lib/Target/Hexagon/HexagonScheduleV4.td
+++ b/lib/Target/Hexagon/HexagonScheduleV5.td
@@ -1,4 +1,4 @@
-//=-HexagonScheduleV4.td - HexagonV4 Scheduling Definitions --*- tablegen -*-=//
+//=-HexagonScheduleV5.td - HexagonV5 Scheduling Definitions --*- tablegen -*-=//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -10,8 +10,8 @@
 def LD_tc_ld_SLOT01 : InstrItinClass;
 def ST_tc_st_SLOT01 : InstrItinClass;
 
-class HexagonV4PseudoItin {
-  list<InstrItinData> V4PseudoItin_list = [
+class HexagonV5PseudoItin {
+  list<InstrItinData> V5PseudoItin_list = [
     InstrItinData<PSEUDO,     [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
     InstrItinData<PSEUDOM,    [InstrStage<1, [SLOT2, SLOT3], 0>,
                                InstrStage<1, [SLOT2, SLOT3]>]>,
@@ -20,27 +20,27 @@ class HexagonV4PseudoItin {
   ];
 }
 
-def HexagonV4ItinList : DepScalarItinV4, HexagonV4PseudoItin {
-  list<InstrItinData> V4Itin_list = [
+def HexagonV5ItinList : DepScalarItinV5, HexagonV5PseudoItin {
+  list<InstrItinData> V5Itin_list = [
     InstrItinData<LD_tc_ld_SLOT01, [InstrStage<1, [SLOT0, SLOT1]>]>,
     InstrItinData<ST_tc_st_SLOT01, [InstrStage<1, [SLOT0, SLOT1]>]>
   ];
   list<InstrItinData> ItinList =
-    !listconcat(V4Itin_list, DepScalarItinV4_list, V4PseudoItin_list);
+    !listconcat(V5Itin_list, DepScalarItinV5_list, V5PseudoItin_list);
 }
 
-def HexagonItinerariesV4 :
+def HexagonItinerariesV5 :
       ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP],
-                           [Hex_FWD], HexagonV4ItinList.ItinList>;
+                           [Hex_FWD], HexagonV5ItinList.ItinList>;
 
-def HexagonModelV4 : SchedMachineModel {
+def HexagonModelV5 : SchedMachineModel {
   // Max issue per cycle == bundle width.
   let IssueWidth = 4;
-  let Itineraries = HexagonItinerariesV4;
+  let Itineraries = HexagonItinerariesV5;
   let LoadLatency = 1;
   let CompleteModel = 0;
 }
 
 //===----------------------------------------------------------------------===//
-// Hexagon V4 Resource Definitions -
+// Hexagon V5 Resource Definitions -
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonSubtarget.cpp b/lib/Target/Hexagon/HexagonSubtarget.cpp
index 0686d6eb611..68e276be0f6 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.cpp
+++ b/lib/Target/Hexagon/HexagonSubtarget.cpp
@@ -93,7 +93,6 @@ HexagonSubtarget &
 HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
   static std::map<StringRef, Hexagon::ArchEnum> CpuTable{
       {"generic", Hexagon::ArchEnum::V60},
-      {"hexagonv4", Hexagon::ArchEnum::V4},
       {"hexagonv5", Hexagon::ArchEnum::V5},
       {"hexagonv55", Hexagon::ArchEnum::V55},
       {"hexagonv60", Hexagon::ArchEnum::V60},
diff --git a/lib/Target/Hexagon/HexagonSubtarget.h b/lib/Target/Hexagon/HexagonSubtarget.h
index dc8d173a505..eaae4db6ba9 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.h
+++ b/lib/Target/Hexagon/HexagonSubtarget.h
@@ -59,7 +59,7 @@ class HexagonSubtarget : public HexagonGenSubtargetInfo {
 
 public:
   Hexagon::ArchEnum HexagonArchVersion;
-  Hexagon::ArchEnum HexagonHVXVersion = Hexagon::ArchEnum::V4;
+  Hexagon::ArchEnum HexagonHVXVersion = Hexagon::ArchEnum::NoArch;
   CodeGenOpt::Level OptLevel;
   /// True if the target should use Back-Skip-Back scheduling. This is the
   /// default for V60.
@@ -158,7 +158,9 @@ public:
   bool useNewValueStores() const { return UseNewValueStores; }
   bool useSmallData() const { return UseSmallData; }
 
-  bool useHVXOps() const { return HexagonHVXVersion > Hexagon::ArchEnum::V4; }
+  bool useHVXOps() const {
+    return HexagonHVXVersion > Hexagon::ArchEnum::NoArch;
+  }
   bool useHVX128BOps() const { return useHVXOps() && UseHVX128BOps; }
   bool useHVX64BOps() const { return useHVXOps() && UseHVX64BOps; }
 
diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
index a896700df1b..93b5bedbb38 100644
--- a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
+++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -768,7 +768,7 @@ bool HexagonPacketizerList::canPromoteToNewValueStore(const MachineInstr &MI,
 
   // Make sure that for non-POST_INC stores:
   // 1. The only use of reg is DepReg and no other registers.
-  //    This handles V4 base+index registers.
+  //    This handles base+index registers.
   //    The following store can not be dot new.
   //    Eg.   r0 = add(r0, #3)
   //          memw(r1+r0<<#2) = r0
@@ -838,11 +838,7 @@ static bool isImplicitDependency(const MachineInstr &I, bool CheckDef,
   return false;
 }
 
-// Check to see if an instruction can be dot new
-// There are three kinds.
-// 1. dot new on predicate - V2/V3/V4
-// 2. dot new on stores NV/ST - V4
-// 3. dot new on jump NV/J - V4 -- This is generated in a pass.
+// Check to see if an instruction can be dot new.
 bool HexagonPacketizerList::canPromoteToDotNew(const MachineInstr &MI,
       const SUnit *PacketSU, unsigned DepReg, MachineBasicBlock::iterator &MII,
       const TargetRegisterClass* RC) {
@@ -1075,9 +1071,6 @@ bool HexagonPacketizerList::isSoloInstruction(const MachineInstr &MI) {
   if (MI.isInlineAsm() && !ScheduleInlineAsm)
     return true;
 
-  // From Hexagon V4 Programmer's Reference Manual 3.4.4 Grouping constraints:
-  // trap, pause, barrier, icinva, isync, and syncht are solo instructions.
-  // They must not be grouped with other instructions in a packet.
   if (isSchedBarrier(MI))
     return true;
 
@@ -1289,8 +1282,8 @@ bool HexagonPacketizerList::hasRegMaskDependence(const MachineInstr &I,
   return false;
 }
 
-bool HexagonPacketizerList::hasV4SpecificDependence(const MachineInstr &I,
-                                                    const MachineInstr &J) {
+bool HexagonPacketizerList::hasDualStoreDependence(const MachineInstr &I,
+                                                   const MachineInstr &J) {
   bool SysI = isSystemInstr(I), SysJ = isSystemInstr(J);
   bool StoreI = I.mayStore(), StoreJ = J.mayStore();
   if ((SysI && StoreJ) || (SysJ && StoreI))
@@ -1343,10 +1336,10 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
   if (Dependence)
     return false;
 
-  // V4 allows dual stores. It does not allow second store, if the first
-  // store is not in SLOT0. New value store, new value jump, dealloc_return
-  // and memop always take SLOT0. Arch spec 3.4.4.2.
-  Dependence = hasV4SpecificDependence(I, J);
+  // Dual-store does not allow second store, if the first store is not
+  // in SLOT0. New value store, new value jump, dealloc_return and memop
+  // always take SLOT0. Arch spec 3.4.4.2.
+  Dependence = hasDualStoreDependence(I, J);
   if (Dependence)
     return false;
 
@@ -1505,10 +1498,10 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
     }
 
     // For Order dependences:
-    // 1. On V4 or later, volatile loads/stores can be packetized together,
-    //    unless other rules prevent is.
+    // 1. Volatile loads/stores can be packetized together, unless other
+    //    rules prevent is.
     // 2. Store followed by a load is not allowed.
-    // 3. Store followed by a store is only valid on V4 or later.
+    // 3. Store followed by a store is valid.
     // 4. Load followed by any memory operation is allowed.
     if (DepType == SDep::Order) {
       if (!PacketizeVolatiles) {
@@ -1555,7 +1548,7 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
       continue;
     }
 
-    // For V4, special case ALLOCFRAME. Even though there is dependency
+    // Special case for ALLOCFRAME: even though there is dependency
     // between ALLOCFRAME and subsequent store, allow it to be packetized
     // in a same packet. This implies that the store is using the caller's
     // SP. Hence, offset needs to be updated accordingly.
diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.h b/lib/Target/Hexagon/HexagonVLIWPacketizer.h
index d54dd7050e1..ca70cf967a4 100644
--- a/lib/Target/Hexagon/HexagonVLIWPacketizer.h
+++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.h
@@ -149,7 +149,7 @@ protected:
   bool hasDeadDependence(const MachineInstr &I, const MachineInstr &J);
   bool hasControlDependence(const MachineInstr &I, const MachineInstr &J);
   bool hasRegMaskDependence(const MachineInstr &I, const MachineInstr &J);
-  bool hasV4SpecificDependence(const MachineInstr &I, const MachineInstr &J);
+  bool hasDualStoreDependence(const MachineInstr &I, const MachineInstr &J);
   bool producesStall(const MachineInstr &MI);
 };
 
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
index b208a366812..c707dcb0316 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
@@ -634,8 +634,7 @@ bool HexagonMCInstrInfo::isOrderedDuplexPair(MCInstrInfo const &MCII,
       return false;
   }
 
-  if (STI.getCPU().equals_lower("hexagonv4") ||
-      STI.getCPU().equals_lower("hexagonv5") ||
+  if (STI.getCPU().equals_lower("hexagonv5") ||
       STI.getCPU().equals_lower("hexagonv55") ||
       STI.getCPU().equals_lower("hexagonv60")) {
     // If a store appears, it must be in slot 0 (MIa) 1st, and then slot 1 (MIb);
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index b211a81524f..8f3c09e7204 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -61,8 +61,6 @@ cl::opt<bool> llvm::HexagonDisableDuplex
    cl::desc("Disable looking for duplex instructions for Hexagon"));
 
 namespace { // These flags are to be deprecated
-cl::opt<bool> MV4("mv4", cl::Hidden, cl::desc("Build for Hexagon V4"),
-                  cl::init(false));
 cl::opt<bool> MV5("mv5", cl::Hidden, cl::desc("Build for Hexagon V5"),
                   cl::init(false));
 cl::opt<bool> MV55("mv55", cl::Hidden, cl::desc("Build for Hexagon V55"),
@@ -83,18 +81,18 @@ cl::opt<Hexagon::ArchEnum>
         clEnumValN(Hexagon::ArchEnum::V62, "v62", "Build for HVX v62"),
         clEnumValN(Hexagon::ArchEnum::V65, "v65", "Build for HVX v65"),
         // Sentinal for no value specified
-        clEnumValN(Hexagon::ArchEnum::V5, "", "")),
+        clEnumValN(Hexagon::ArchEnum::Generic, "", "")),
       // Sentinal for flag not present
-      cl::init(Hexagon::ArchEnum::V4), cl::ValueOptional);
+      cl::init(Hexagon::ArchEnum::NoArch), cl::ValueOptional);
+
 static cl::opt<bool>
-  DisableHVX("mno-hvx", cl::Hidden, cl::desc("Disable Hexagon Vector eXtensions"));
+  DisableHVX("mno-hvx", cl::Hidden,
+             cl::desc("Disable Hexagon Vector eXtensions"));
 
 
 static StringRef DefaultArch = "hexagonv60";
 
 static StringRef HexagonGetArchVariant() {
-  if (MV4)
-    return "hexagonv4";
   if (MV5)
     return "hexagonv5";
   if (MV55)
@@ -123,7 +121,7 @@ StringRef Hexagon_MC::selectHexagonCPU(StringRef CPU) {
   return ArchV;
 }
 
-unsigned llvm::HexagonGetLastSlot() { return HexagonItinerariesV4FU::SLOT3; }
+unsigned llvm::HexagonGetLastSlot() { return HexagonItinerariesV5FU::SLOT3; }
 
 namespace {
 
@@ -279,6 +277,7 @@ std::string selectHexagonFS(StringRef CPU, StringRef FS) {
     Result.push_back(FS);
 
   switch (EnableHVX) {
+  case Hexagon::ArchEnum::V5:
   case Hexagon::ArchEnum::V55:
     break;
   case Hexagon::ArchEnum::V60:
@@ -290,14 +289,14 @@ std::string selectHexagonFS(StringRef CPU, StringRef FS) {
   case Hexagon::ArchEnum::V65:
     Result.push_back("+hvxv65");
     break;
-  case Hexagon::ArchEnum::V5:{
+  case Hexagon::ArchEnum::Generic:{
     Result.push_back(StringSwitch<StringRef>(CPU)
              .Case("hexagonv60", "+hvxv60")
              .Case("hexagonv62", "+hvxv62")
              .Case("hexagonv65", "+hvxv65"));
     break;
   }
-  case Hexagon::ArchEnum::V4:
+  case Hexagon::ArchEnum::NoArch:
     // Sentinal if -mhvx isn't specified
     break;
   }
@@ -307,15 +306,9 @@ std::string selectHexagonFS(StringRef CPU, StringRef FS) {
 
 static bool isCPUValid(std::string CPU)
 {
-  std::vector<std::string> table
-  {
-    "generic",
-    "hexagonv4",
-    "hexagonv5",
-    "hexagonv55",
-    "hexagonv60",
-    "hexagonv62",
-    "hexagonv65",
+  std::vector<std::string> table {
+    "generic",    "hexagonv5",  "hexagonv55", "hexagonv60",
+    "hexagonv62", "hexagonv65",
   };
 
   return std::find(table.begin(), table.end(), CPU) != table.end();
@@ -336,8 +329,8 @@ FeatureBitset Hexagon_MC::completeHVXFeatures(const FeatureBitset &S) {
   // Make sure that +hvx-length turns hvx on, and that "hvx" alone
   // turns on hvxvNN, corresponding to the existing ArchVNN.
   FeatureBitset FB = S;
-  unsigned CpuArch = ArchV4;
-  for (unsigned F : {ArchV65, ArchV62, ArchV60, ArchV55, ArchV5, ArchV4}) {
+  unsigned CpuArch = ArchV5;
+  for (unsigned F : {ArchV65, ArchV62, ArchV60, ArchV55, ArchV5}) {
     if (!FB.test(F))
       continue;
     CpuArch = F;
@@ -402,7 +395,6 @@ MCSubtargetInfo *Hexagon_MC::createHexagonMCSubtargetInfo(const Triple &TT,
 
 unsigned Hexagon_MC::GetELFFlags(const MCSubtargetInfo &STI) {
   static std::map<StringRef,unsigned> ElfFlags = {
-    {"hexagonv4",  ELF::EF_HEXAGON_MACH_V4},
     {"hexagonv5",  ELF::EF_HEXAGON_MACH_V5},
     {"hexagonv55", ELF::EF_HEXAGON_MACH_V55},
     {"hexagonv60", ELF::EF_HEXAGON_MACH_V60},
diff --git a/test/CodeGen/Hexagon/cfi-late.ll b/test/CodeGen/Hexagon/cfi-late.ll
index b5bdb59cc15..460b645b4a4 100644
--- a/test/CodeGen/Hexagon/cfi-late.ll
+++ b/test/CodeGen/Hexagon/cfi-late.ll
@@ -32,8 +32,8 @@ declare i32 @bar(i32, i32) #1
 ; Function Attrs: nounwind readnone
 declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2
 
-attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv4" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv4" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
+attributes #1 = { "target-cpu"="hexagonv5" }
 attributes #2 = { nounwind readnone }
 attributes #3 = { nounwind }
 
diff --git a/test/CodeGen/Hexagon/double.ll b/test/CodeGen/Hexagon/double.ll
index b4d025cd7fd..336f32fee61 100644
--- a/test/CodeGen/Hexagon/double.ll
+++ b/test/CodeGen/Hexagon/double.ll
@@ -1,22 +1,24 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; RUN: llc -march=hexagon < %s | FileCheck %s
 ; CHECK: __hexagon_adddf3
 ; CHECK: __hexagon_subdf3
 
-define void @foo(double* %acc, double %num, double %num2) nounwind {
-entry:
-  %acc.addr = alloca double*, align 4
-  %num.addr = alloca double, align 8
-  %num2.addr = alloca double, align 8
-  store double* %acc, double** %acc.addr, align 4
-  store double %num, double* %num.addr, align 8
-  store double %num2, double* %num2.addr, align 8
-  %0 = load double*, double** %acc.addr, align 4
-  %1 = load double, double* %0
-  %2 = load double, double* %num.addr, align 8
-  %add = fadd double %1, %2
-  %3 = load double, double* %num2.addr, align 8
-  %sub = fsub double %add, %3
-  %4 = load double*, double** %acc.addr, align 4
-  store double %sub, double* %4
+define void @f0(double* %a0, double %a1, double %a2) #0 {
+b0:
+  %v0 = alloca double*, align 4
+  %v1 = alloca double, align 8
+  %v2 = alloca double, align 8
+  store double* %a0, double** %v0, align 4
+  store double %a1, double* %v1, align 8
+  store double %a2, double* %v2, align 8
+  %v3 = load double*, double** %v0, align 4
+  %v4 = load double, double* %v3
+  %v5 = load double, double* %v1, align 8
+  %v6 = fadd double %v4, %v5
+  %v7 = load double, double* %v2, align 8
+  %v8 = fsub double %v6, %v7
+  %v9 = load double*, double** %v0, align 4
+  store double %v8, double* %v9
   ret void
 }
+
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/float.ll b/test/CodeGen/Hexagon/float.ll
index 03d1fbf44cb..cc024a76d03 100644
--- a/test/CodeGen/Hexagon/float.ll
+++ b/test/CodeGen/Hexagon/float.ll
@@ -1,22 +1,24 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
-; CHECK: __hexagon_addsf3
-; CHECK: __hexagon_subsf3
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK: sfadd
+; CHECK: sfsub
 
-define void @foo(float* %acc, float %num, float %num2) nounwind {
-entry:
-  %acc.addr = alloca float*, align 4
-  %num.addr = alloca float, align 4
-  %num2.addr = alloca float, align 4
-  store float* %acc, float** %acc.addr, align 4
-  store float %num, float* %num.addr, align 4
-  store float %num2, float* %num2.addr, align 4
-  %0 = load float*, float** %acc.addr, align 4
-  %1 = load float, float* %0
-  %2 = load float, float* %num.addr, align 4
-  %add = fadd float %1, %2
-  %3 = load float, float* %num2.addr, align 4
-  %sub = fsub float %add, %3
-  %4 = load float*, float** %acc.addr, align 4
-  store float %sub, float* %4
+define void @f0(float* %a0, float %a1, float %a2) #0 {
+b0:
+  %v0 = alloca float*, align 4
+  %v1 = alloca float, align 4
+  %v2 = alloca float, align 4
+  store float* %a0, float** %v0, align 4
+  store float %a1, float* %v1, align 4
+  store float %a2, float* %v2, align 4
+  %v3 = load float*, float** %v0, align 4
+  %v4 = load float, float* %v3
+  %v5 = load float, float* %v1, align 4
+  %v6 = fadd float %v4, %v5
+  %v7 = load float, float* %v2, align 4
+  %v8 = fsub float %v6, %v7
+  %v9 = load float*, float** %v0, align 4
+  store float %v8, float* %v9
   ret void
 }
+
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/floatconvert-ieee-rnd-near.ll b/test/CodeGen/Hexagon/floatconvert-ieee-rnd-near.ll
index 03d1fbf44cb..cc024a76d03 100644
--- a/test/CodeGen/Hexagon/floatconvert-ieee-rnd-near.ll
+++ b/test/CodeGen/Hexagon/floatconvert-ieee-rnd-near.ll
@@ -1,22 +1,24 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
-; CHECK: __hexagon_addsf3
-; CHECK: __hexagon_subsf3
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK: sfadd
+; CHECK: sfsub
 
-define void @foo(float* %acc, float %num, float %num2) nounwind {
-entry:
-  %acc.addr = alloca float*, align 4
-  %num.addr = alloca float, align 4
-  %num2.addr = alloca float, align 4
-  store float* %acc, float** %acc.addr, align 4
-  store float %num, float* %num.addr, align 4
-  store float %num2, float* %num2.addr, align 4
-  %0 = load float*, float** %acc.addr, align 4
-  %1 = load float, float* %0
-  %2 = load float, float* %num.addr, align 4
-  %add = fadd float %1, %2
-  %3 = load float, float* %num2.addr, align 4
-  %sub = fsub float %add, %3
-  %4 = load float*, float** %acc.addr, align 4
-  store float %sub, float* %4
+define void @f0(float* %a0, float %a1, float %a2) #0 {
+b0:
+  %v0 = alloca float*, align 4
+  %v1 = alloca float, align 4
+  %v2 = alloca float, align 4
+  store float* %a0, float** %v0, align 4
+  store float %a1, float* %v1, align 4
+  store float %a2, float* %v2, align 4
+  %v3 = load float*, float** %v0, align 4
+  %v4 = load float, float* %v3
+  %v5 = load float, float* %v1, align 4
+  %v6 = fadd float %v4, %v5
+  %v7 = load float, float* %v2, align 4
+  %v8 = fsub float %v6, %v7
+  %v9 = load float*, float** %v0, align 4
+  store float %v8, float* %v9
   ret void
 }
+
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/gp-plus-offset-load.ll b/test/CodeGen/Hexagon/gp-plus-offset-load.ll
index 57783d421a4..2514d4109c0 100644
--- a/test/CodeGen/Hexagon/gp-plus-offset-load.ll
+++ b/test/CodeGen/Hexagon/gp-plus-offset-load.ll
@@ -1,51 +1,57 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; RUN: llc -march=hexagon < %s | FileCheck %s
 ; Check that we generate load instructions with global + offset
 
-%struct.struc = type { i8, i8, i16, i32 }
 
-@foo = common global %struct.struc zeroinitializer, align 4
+%s.0 = type { i8, i8, i16, i32 }
 
-define void @loadWord(i32 %val1, i32 %val2, i32* nocapture %ival) nounwind {
-; CHECK: r{{[0-9]+}} = memw(##foo+4)
-entry:
-  %cmp = icmp sgt i32 %val1, %val2
-  br i1 %cmp, label %if.then, label %if.end
+@g0 = common global %s.0 zeroinitializer, align 4
 
-if.then:                                          ; preds = %entry
-  %0 = load i32, i32* getelementptr inbounds (%struct.struc, %struct.struc* @foo, i32 0, i32 3), align 4
-  store i32 %0, i32* %ival, align 4
-  br label %if.end
+; CHECK-LABEL: f0:
+; CHECK: r{{[0-9]+}} = memw(##g0+4)
+define void @f0(i32 %a0, i32 %a1, i32* nocapture %a2) #0 {
+b0:
+  %v0 = icmp sgt i32 %a0, %a1
+  br i1 %v0, label %b1, label %b2
 
-if.end:                                           ; preds = %if.then, %entry
+b1:                                               ; preds = %b0
+  %v1 = load i32, i32* getelementptr inbounds (%s.0, %s.0* @g0, i32 0, i32 3), align 4
+  store i32 %v1, i32* %a2, align 4
+  br label %b2
+
+b2:                                               ; preds = %b1, %b0
   ret void
 }
 
-define void @loadByte(i32 %val1, i32 %val2, i8* nocapture %ival) nounwind {
-; CHECK: r{{[0-9]+}} = memub(##foo+1)
-entry:
-  %cmp = icmp sgt i32 %val1, %val2
-  br i1 %cmp, label %if.then, label %if.end
+; CHECK-LABEL: f1:
+; CHECK: r{{[0-9]+}} = memub(##g0+1)
+define void @f1(i32 %a0, i32 %a1, i8* nocapture %a2) #0 {
+b0:
+  %v0 = icmp sgt i32 %a0, %a1
+  br i1 %v0, label %b1, label %b2
 
-if.then:                                          ; preds = %entry
-  %0 = load i8, i8* getelementptr inbounds (%struct.struc, %struct.struc* @foo, i32 0, i32 1), align 1
-  store i8 %0, i8* %ival, align 1
-  br label %if.end
+b1:                                               ; preds = %b0
+  %v1 = load i8, i8* getelementptr inbounds (%s.0, %s.0* @g0, i32 0, i32 1), align 1
+  store i8 %v1, i8* %a2, align 1
+  br label %b2
 
-if.end:                                           ; preds = %if.then, %entry
+b2:                                               ; preds = %b1, %b0
   ret void
 }
 
-define void @loadHWord(i32 %val1, i32 %val2, i16* %ival) nounwind {
-; CHECK: r{{[0-9]+}} = memuh(##foo+2)
-entry:
-  %cmp = icmp sgt i32 %val1, %val2
-  br i1 %cmp, label %if.then, label %if.end
+; CHECK-LABEL: f2:
+; CHECK: r{{[0-9]+}} = memuh(##g0+2)
+define void @f2(i32 %a0, i32 %a1, i16* %a2) #0 {
+b0:
+  %v0 = icmp sgt i32 %a0, %a1
+  br i1 %v0, label %b1, label %b2
 
-if.then:                                          ; preds = %entry
-  %0 = load i16, i16* getelementptr inbounds (%struct.struc, %struct.struc* @foo, i32 0, i32 2), align 2
-  store i16 %0, i16* %ival, align 2
-  br label %if.end
+b1:                                               ; preds = %b0
+  %v1 = load i16, i16* getelementptr inbounds (%s.0, %s.0* @g0, i32 0, i32 2), align 2
+  store i16 %v1, i16* %a2, align 2
+  br label %b2
 
-if.end:                                           ; preds = %if.then, %entry
+b2:                                               ; preds = %b1, %b0
   ret void
 }
+
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/gp-plus-offset-store.ll b/test/CodeGen/Hexagon/gp-plus-offset-store.ll
index 66391b954d0..91e412f7c13 100644
--- a/test/CodeGen/Hexagon/gp-plus-offset-store.ll
+++ b/test/CodeGen/Hexagon/gp-plus-offset-store.ll
@@ -1,35 +1,38 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; RUN: llc -march=hexagon < %s | FileCheck %s
 ; Check that we generate store instructions with global + offset
 
-%struct.struc = type { i8, i8, i16, i32 }
+%s.0 = type { i8, i8, i16, i32 }
 
-@foo = common global %struct.struc zeroinitializer, align 4
+@g0 = common global %s.0 zeroinitializer, align 4
 
-define void @storeByte(i32 %val1, i32 %val2, i8 zeroext %ival) nounwind {
-; CHECK: memb(##foo+1) = r{{[0-9]+}}
-entry:
-  %cmp = icmp sgt i32 %val1, %val2
-  br i1 %cmp, label %if.then, label %if.end
+; CHECK-LABEL: f0:
+; CHECK: memb(##g0+1) = r{{[0-9]+}}
+define void @f0(i32 %a0, i32 %a1, i8 zeroext %a2) #0 {
+b0:
+  %v0 = icmp sgt i32 %a0, %a1
+  br i1 %v0, label %b1, label %b2
 
-if.then:                                          ; preds = %entry
-  store i8 %ival, i8* getelementptr inbounds (%struct.struc, %struct.struc* @foo, i32 0, i32 1), align 1
-  br label %if.end
+b1:                                               ; preds = %b0
+  store i8 %a2, i8* getelementptr inbounds (%s.0, %s.0* @g0, i32 0, i32 1), align 1
+  br label %b2
 
-if.end:                                           ; preds = %if.then, %entry
+b2:                                               ; preds = %b1, %b0
   ret void
 }
 
-define void @storeHW(i32 %val1, i32 %val2, i16 signext %ival) nounwind {
-; CHECK: memh(##foo+2) = r{{[0-9]+}}
-entry:
-  %cmp = icmp sgt i32 %val1, %val2
-  br i1 %cmp, label %if.then, label %if.end
+; CHECK-LABEL: f1:
+; CHECK: memh(##g0+2) = r{{[0-9]+}}
+define void @f1(i32 %a0, i32 %a1, i16 signext %a2) #0 {
+b0:
+  %v0 = icmp sgt i32 %a0, %a1
+  br i1 %v0, label %b1, label %b2
 
-if.then:                                          ; preds = %entry
-  store i16 %ival, i16* getelementptr inbounds (%struct.struc, %struct.struc* @foo, i32 0, i32 2), align 2
-  br label %if.end
+b1:                                               ; preds = %b0
+  store i16 %a2, i16* getelementptr inbounds (%s.0, %s.0* @g0, i32 0, i32 2), align 2
+  br label %b2
 
-if.end:                                           ; preds = %if.then, %entry
+b2:                                               ; preds = %b1, %b0
   ret void
 }
 
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/gp-rel.ll b/test/CodeGen/Hexagon/gp-rel.ll
index ef913134f7c..3ce40bb5470 100644
--- a/test/CodeGen/Hexagon/gp-rel.ll
+++ b/test/CodeGen/Hexagon/gp-rel.ll
@@ -1,33 +1,36 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; RUN: llc -march=hexagon < %s | FileCheck %s
 ; Check that gp-relative instructions are being generated.
 
-@a = common global i32 0, align 4
-@b = common global i32 0, align 4
-@c = common global i32 0, align 4
+; CHECK: r{{[0-9]+}} = memw(gp+#g0)
+; CHECK: r{{[0-9]+}} = memw(gp+#g1)
+; CHECK: if (p{{[0-3]}}) memw(##g2) = r{{[0-9]+}}
 
-define i32 @foo(i32 %p) #0 {
-entry:
-; CHECK: r{{[0-9]+}} = memw(gp+#a)
-; CHECK: r{{[0-9]+}} = memw(gp+#b)
-; CHECK: if (p{{[0-3]}}) memw(##c) = r{{[0-9]+}}
-  %0 = load i32, i32* @a, align 4
-  %1 = load i32, i32* @b, align 4
-  %add = add nsw i32 %1, %0
-  %cmp = icmp eq i32 %0, %1
-  br i1 %cmp, label %if.then, label %entry.if.end_crit_edge
+@g0 = common global i32 0, align 4
+@g1 = common global i32 0, align 4
+@g2 = common global i32 0, align 4
 
-entry.if.end_crit_edge:
-  %.pre = load i32, i32* @c, align 4
-  br label %if.end
+define i32 @f0(i32 %a0) #0 {
+b0:
+  %v0 = load i32, i32* @g0, align 4
+  %v1 = load i32, i32* @g1, align 4
+  %v2 = add nsw i32 %v1, %v0
+  %v3 = icmp eq i32 %v0, %v1
+  br i1 %v3, label %b2, label %b1
 
-if.then:
-  %add1 = add nsw i32 %add, %0
-  store i32 %add1, i32* @c, align 4
-  br label %if.end
+b1:                                               ; preds = %b0
+  %v4 = load i32, i32* @g2, align 4
+  br label %b3
 
-if.end:
-  %2 = phi i32 [ %.pre, %entry.if.end_crit_edge ], [ %add1, %if.then ]
-  %cmp2 = icmp eq i32 %add, %2
-  %sel1 = select i1 %cmp2, i32 %2, i32 %1
-  ret i32 %sel1
+b2:                                               ; preds = %b0
+  %v5 = add nsw i32 %v2, %v0
+  store i32 %v5, i32* @g2, align 4
+  br label %b3
+
+b3:                                               ; preds = %b2, %b1
+  %v6 = phi i32 [ %v4, %b1 ], [ %v5, %b2 ]
+  %v7 = icmp eq i32 %v2, %v6
+  %v8 = select i1 %v7, i32 %v6, i32 %v1
+  ret i32 %v8
 }
+
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/hwloop-cleanup.ll b/test/CodeGen/Hexagon/hwloop-cleanup.ll
index 56a6fedf81e..71e1bf10fe6 100644
--- a/test/CodeGen/Hexagon/hwloop-cleanup.ll
+++ b/test/CodeGen/Hexagon/hwloop-cleanup.ll
@@ -1,87 +1,91 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 -no-phi-elim-live-out-early-exit \
-; RUN:    < %s | FileCheck %s
+; RUN: llc -march=hexagon -no-phi-elim-live-out-early-exit < %s | FileCheck %s
 ; Check that we remove the compare and induction variable instructions
 ; after generating hardware loops.
 ; Bug 6685.
 
+; CHECK-LABEL: f0:
 ; CHECK: loop0
 ; CHECK-NOT: r{{[0-9]+}} = add(r{{[0-9]+}},#-1)
 ; CHECK-NOT: cmp.eq
 ; CHECK: endloop0
 
-define i32 @test1(i32* nocapture %b, i32 %n) nounwind readonly {
-entry:
-  %cmp1 = icmp sgt i32 %n, 0
-  br i1 %cmp1, label %for.body.preheader, label %for.end
+define i32 @f0(i32* nocapture %a0, i32 %a1) #0 {
+b0:
+  %v0 = icmp sgt i32 %a1, 0
+  br i1 %v0, label %b1, label %b4
 
-for.body.preheader:
-  br label %for.body
+b1:                                               ; preds = %b0
+  br label %b2
 
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %sum.03 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
-  %arrayidx.phi = phi i32* [ %arrayidx.inc, %for.body ], [ %b, %for.body.preheader ]
-  %i.02 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
-  %0 = load i32, i32* %arrayidx.phi, align 4
-  %add = add nsw i32 %0, %sum.03
-  %inc = add nsw i32 %i.02, 1
-  %exitcond = icmp eq i32 %inc, %n
-  %arrayidx.inc = getelementptr i32, i32* %arrayidx.phi, i32 1
-  br i1 %exitcond, label %for.end.loopexit, label %for.body
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ %v5, %b2 ], [ 0, %b1 ]
+  %v2 = phi i32* [ %v8, %b2 ], [ %a0, %b1 ]
+  %v3 = phi i32 [ %v6, %b2 ], [ 0, %b1 ]
+  %v4 = load i32, i32* %v2, align 4
+  %v5 = add nsw i32 %v4, %v1
+  %v6 = add nsw i32 %v3, 1
+  %v7 = icmp eq i32 %v6, %a1
+  %v8 = getelementptr i32, i32* %v2, i32 1
+  br i1 %v7, label %b3, label %b2
 
-for.end.loopexit:
-  br label %for.end
+b3:                                               ; preds = %b2
+  br label %b4
 
-for.end:
-  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.end.loopexit ]
-  ret i32 %sum.0.lcssa
+b4:                                               ; preds = %b3, %b0
+  %v9 = phi i32 [ 0, %b0 ], [ %v5, %b3 ]
+  ret i32 %v9
 }
 
 ; This test checks that that initial loop count value is removed.
+; CHECK-LABEL: f1:
 ; CHECK-NOT: ={{.}}#40
 ; CHECK: loop0
 ; CHECK-NOT: r{{[0-9]+}} = add(r{{[0-9]+}},#-1)
 ; CHECK-NOT: cmp.eq
 ; CHECK: endloop0
 
-define i32 @test2(i32* nocapture %b) nounwind readonly {
-entry:
-  br label %for.body
+define i32 @f1(i32* nocapture %a0) #0 {
+b0:
+  br label %b1
 
-for.body:
-  %sum.02 = phi i32 [ 0, %entry ], [ %add, %for.body ]
-  %arrayidx.phi = phi i32* [ %b, %entry ], [ %arrayidx.inc, %for.body ]
-  %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
-  %0 = load i32, i32* %arrayidx.phi, align 4
-  %add = add nsw i32 %0, %sum.02
-  %inc = add nsw i32 %i.01, 1
-  %exitcond = icmp eq i32 %inc, 40
-  %arrayidx.inc = getelementptr i32, i32* %arrayidx.phi, i32 1
-  br i1 %exitcond, label %for.end, label %for.body
+b1:                                               ; preds = %b1, %b0
+  %v0 = phi i32 [ 0, %b0 ], [ %v4, %b1 ]
+  %v1 = phi i32* [ %a0, %b0 ], [ %v7, %b1 ]
+  %v2 = phi i32 [ 0, %b0 ], [ %v5, %b1 ]
+  %v3 = load i32, i32* %v1, align 4
+  %v4 = add nsw i32 %v3, %v0
+  %v5 = add nsw i32 %v2, 1
+  %v6 = icmp eq i32 %v5, 40
+  %v7 = getelementptr i32, i32* %v1, i32 1
+  br i1 %v6, label %b2, label %b1
 
-for.end:
-  ret i32 %add
+b2:                                               ; preds = %b1
+  ret i32 %v4
 }
 
 ; This test checks that we don't remove the induction variable since it's used.
+; CHECK-LABEL: f2:
 ; CHECK: loop0
 ; CHECK: r{{[0-9]+}} = add(r{{[0-9]+}},#1)
 ; CHECK-NOT: cmp.eq
 ; CHECK: endloop0
-define i32 @test3(i32* nocapture %b) nounwind {
-entry:
-  br label %for.body
 
-for.body:
-  %arrayidx.phi = phi i32* [ %b, %entry ], [ %arrayidx.inc, %for.body ]
-  %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
-  store i32 %i.01, i32* %arrayidx.phi, align 4
-  %inc = add nsw i32 %i.01, 1
-  %exitcond = icmp eq i32 %inc, 40
-  %arrayidx.inc = getelementptr i32, i32* %arrayidx.phi, i32 1
-  br i1 %exitcond, label %for.end, label %for.body
+define i32 @f2(i32* nocapture %a0) #1 {
+b0:
+  br label %b1
 
-for.end:
+b1:                                               ; preds = %b1, %b0
+  %v0 = phi i32* [ %a0, %b0 ], [ %v4, %b1 ]
+  %v1 = phi i32 [ 0, %b0 ], [ %v2, %b1 ]
+  store i32 %v1, i32* %v0, align 4
+  %v2 = add nsw i32 %v1, 1
+  %v3 = icmp eq i32 %v2, 40
+  %v4 = getelementptr i32, i32* %v0, i32 1
+  br i1 %v3, label %b2, label %b1
+
+b2:                                               ; preds = %b1
   ret i32 0
 }
 
-
+attributes #0 = { nounwind readonly "target-cpu"="hexagonv5" }
+attributes #1 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/hwloop-const.ll b/test/CodeGen/Hexagon/hwloop-const.ll
index d549c1fef8c..eb105a33768 100644
--- a/test/CodeGen/Hexagon/hwloop-const.ll
+++ b/test/CodeGen/Hexagon/hwloop-const.ll
@@ -1,27 +1,27 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 -O2 < %s | FileCheck %s
-; ModuleID = 'hwloop-const.c'
-target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32"
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK: endloop
+
 target triple = "hexagon-unknown-linux-gnu"
 
-@b = common global [25000 x i32] zeroinitializer, align 8
-@a = common global [25000 x i32] zeroinitializer, align 8
-@c = common global [25000 x i32] zeroinitializer, align 8
+@g0 = common global [25000 x i32] zeroinitializer, align 8
+@g1 = common global [25000 x i32] zeroinitializer, align 8
 
-define i32 @hwloop_bug() nounwind {
-entry:
-  br label %for.body
+define i32 @f0() #0 {
+b0:
+  br label %b1
 
-; CHECK: endloop
-for.body:                                         ; preds = %for.body, %entry
-  %i.02 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds [25000 x i32], [25000 x i32]* @b, i32 0, i32 %i.02
-  store i32 %i.02, i32* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds [25000 x i32], [25000 x i32]* @a, i32 0, i32 %i.02
-  store i32 %i.02, i32* %arrayidx1, align 4
-  %inc = add nsw i32 %i.02, 1
-  %exitcond = icmp eq i32 %inc, 25000
-  br i1 %exitcond, label %for.end, label %for.body
+b1:                                               ; preds = %b1, %b0
+  %v0 = phi i32 [ 0, %b0 ], [ %v3, %b1 ]
+  %v1 = getelementptr inbounds [25000 x i32], [25000 x i32]* @g0, i32 0, i32 %v0
+  store i32 %v0, i32* %v1, align 4
+  %v2 = getelementptr inbounds [25000 x i32], [25000 x i32]* @g1, i32 0, i32 %v0
+  store i32 %v0, i32* %v2, align 4
+  %v3 = add nsw i32 %v0, 1
+  %v4 = icmp eq i32 %v3, 25000
+  br i1 %v4, label %b2, label %b1
 
-for.end:                                          ; preds = %for.body
+b2:                                               ; preds = %b1
   ret i32 0
 }
+
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/hwloop-dbg.ll b/test/CodeGen/Hexagon/hwloop-dbg.ll
index 10f3af73de1..443e4b59e9d 100644
--- a/test/CodeGen/Hexagon/hwloop-dbg.ll
+++ b/test/CodeGen/Hexagon/hwloop-dbg.ll
@@ -1,63 +1,64 @@
-; RUN: llc < %s -march=hexagon -mcpu=hexagonv4 -O2 -disable-lsr | FileCheck %s
-; ModuleID = 'hwloop-dbg.o'
-target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32"
-target triple = "hexagon"
-
-define void @foo(i32* nocapture %a, i32* nocapture %b) nounwind !dbg !5 {
-entry:
-  tail call void @llvm.dbg.value(metadata i32* %a, i64 0, metadata !13, metadata !DIExpression()), !dbg !17
-  tail call void @llvm.dbg.value(metadata i32* %b, i64 0, metadata !14, metadata !DIExpression()), !dbg !18
-  tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !15, metadata !DIExpression()), !dbg !19
-  br label %for.body, !dbg !19
+; RUN: llc < %s -march=hexagon -disable-lsr | FileCheck %s
 
-for.body:                                         ; preds = %for.body, %entry
 ; CHECK:     loop0(
 ; CHECK-NOT: add({{r[0-9]*}}, #
 ; CHECK:     endloop0
-  %arrayidx.phi = phi i32* [ %a, %entry ], [ %arrayidx.inc, %for.body ]
-  %i.02 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
-  %b.addr.01 = phi i32* [ %b, %entry ], [ %incdec.ptr, %for.body ]
-  %incdec.ptr = getelementptr inbounds i32, i32* %b.addr.01, i32 1, !dbg !21
-  tail call void @llvm.dbg.value(metadata i32* %incdec.ptr, i64 0, metadata !14, metadata !DIExpression()), !dbg !21
-  %0 = load i32, i32* %b.addr.01, align 4, !dbg !21
-  store i32 %0, i32* %arrayidx.phi, align 4, !dbg !21
-  %inc = add nsw i32 %i.02, 1, !dbg !26
-  tail call void @llvm.dbg.value(metadata i32 %inc, i64 0, metadata !15, metadata !DIExpression()), !dbg !26
-  %exitcond = icmp eq i32 %inc, 10, !dbg !19
-  %arrayidx.inc = getelementptr i32, i32* %arrayidx.phi, i32 1
-  br i1 %exitcond, label %for.end, label %for.body, !dbg !19
-
-for.end:                                          ; preds = %for.body
-  ret void, !dbg !27
+
+target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32"
+target triple = "hexagon"
+
+define void @f0(i32* nocapture %a0, i32* nocapture %a1) #0 !dbg !4 {
+b0:
+  call void @llvm.dbg.value(metadata i32* %a0, metadata !10, metadata !DIExpression()), !dbg !14
+  call void @llvm.dbg.value(metadata i32* %a1, metadata !11, metadata !DIExpression()), !dbg !15
+  call void @llvm.dbg.value(metadata i32 0, metadata !12, metadata !DIExpression()), !dbg !16
+  br label %b1, !dbg !16
+
+b1:                                               ; preds = %b1, %b0
+  %v0 = phi i32* [ %a0, %b0 ], [ %v7, %b1 ]
+  %v1 = phi i32 [ 0, %b0 ], [ %v5, %b1 ]
+  %v2 = phi i32* [ %a1, %b0 ], [ %v3, %b1 ]
+  %v3 = getelementptr inbounds i32, i32* %v2, i32 1, !dbg !18
+  call void @llvm.dbg.value(metadata i32* %v3, metadata !11, metadata !DIExpression()), !dbg !18
+  %v4 = load i32, i32* %v2, align 4, !dbg !18
+  store i32 %v4, i32* %v0, align 4, !dbg !18
+  %v5 = add nsw i32 %v1, 1, !dbg !20
+  call void @llvm.dbg.value(metadata i32 %v5, metadata !12, metadata !DIExpression()), !dbg !20
+  %v6 = icmp eq i32 %v5, 10, !dbg !16
+  %v7 = getelementptr i32, i32* %v0, i32 1
+  br i1 %v6, label %b2, label %b1, !dbg !16
+
+b2:                                               ; preds = %b1
+  ret void, !dbg !21
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, metadata, metadata) #1
 
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
+attributes #1 = { nounwind readnone speculatable }
 
 !llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!29}
+!llvm.module.flags = !{!3}
 
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "QuIC LLVM Hexagon Clang version 6.1-pre-unknown, (git://git-hexagon-aus.quicinc.com/llvm/clang-mainline.git e9382867661454cdf44addb39430741578e9765c) (llvm/llvm-mainline.git 36412bb1fcf03ed426d4437b41198bae066675ac)", isOptimized: true, emissionKind: FullDebug, file: !28, enums: !2, retainedTypes: !2, globals: !2)
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "QuIC LLVM Hexagon Clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !2, globals: !2)
+!1 = !DIFile(filename: "hwloop-dbg.c", directory: "/test")
 !2 = !{}
-!5 = distinct !DISubprogram(name: "foo", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 1, file: !28, scope: null, type: !7, retainedNodes: !11)
-!6 = !DIFile(filename: "hwloop-dbg.c", directory: "/usr2/kparzysz/s.hex/t")
-!7 = !DISubroutineType(types: !8)
-!8 = !{null, !9, !9}
-!9 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 32, align: 32, baseType: !10)
-!10 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
-!11 = !{!13, !14, !15}
-!13 = !DILocalVariable(name: "a", line: 1, arg: 1, scope: !5, file: !6, type: !9)
-!14 = !DILocalVariable(name: "b", line: 1, arg: 2, scope: !5, file: !6, type: !9)
-!15 = !DILocalVariable(name: "i", line: 2, scope: !16, file: !6, type: !10)
-!16 = distinct !DILexicalBlock(line: 1, column: 26, file: !28, scope: !5)
-!17 = !DILocation(line: 1, column: 15, scope: !5)
-!18 = !DILocation(line: 1, column: 23, scope: !5)
-!19 = !DILocation(line: 3, column: 8, scope: !20)
-!20 = distinct !DILexicalBlock(line: 3, column: 3, file: !28, scope: !16)
-!21 = !DILocation(line: 4, column: 5, scope: !22)
-!22 = distinct !DILexicalBlock(line: 3, column: 28, file: !28, scope: !20)
-!26 = !DILocation(line: 3, column: 23, scope: !20)
-!27 = !DILocation(line: 6, column: 1, scope: !16)
-!28 = !DIFile(filename: "hwloop-dbg.c", directory: "/usr2/kparzysz/s.hex/t")
-!29 = !{i32 1, !"Debug Info Version", i32 3}
-!30 = !{i32 0}
+!3 = !{i32 1, !"Debug Info Version", i32 3}
+!4 = distinct !DISubprogram(name: "foo", scope: null, file: !1, line: 1, type: !5, isLocal: false, isDefinition: true, scopeLine: 1, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !9)
+!5 = !DISubroutineType(types: !6)
+!6 = !{null, !7, !7}
+!7 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !8, size: 32, align: 32)
+!8 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!9 = !{!10, !11, !12}
+!10 = !DILocalVariable(name: "a", arg: 1, scope: !4, file: !1, line: 1, type: !7)
+!11 = !DILocalVariable(name: "b", arg: 2, scope: !4, file: !1, line: 1, type: !7)
+!12 = !DILocalVariable(name: "i", scope: !13, file: !1, line: 2, type: !8)
+!13 = distinct !DILexicalBlock(scope: !4, file: !1, line: 1, column: 26)
+!14 = !DILocation(line: 1, column: 15, scope: !4)
+!15 = !DILocation(line: 1, column: 23, scope: !4)
+!16 = !DILocation(line: 3, column: 8, scope: !17)
+!17 = distinct !DILexicalBlock(scope: !13, file: !1, line: 3, column: 3)
+!18 = !DILocation(line: 4, column: 5, scope: !19)
+!19 = distinct !DILexicalBlock(scope: !17, file: !1, line: 3, column: 28)
+!20 = !DILocation(line: 3, column: 23, scope: !17)
+!21 = !DILocation(line: 6, column: 1, scope: !13)
diff --git a/test/CodeGen/Hexagon/hwloop-le.ll b/test/CodeGen/Hexagon/hwloop-le.ll
index 85a1b3db673..d78b234d4ec 100644
--- a/test/CodeGen/Hexagon/hwloop-le.ll
+++ b/test/CodeGen/Hexagon/hwloop-le.ll
@@ -1,438 +1,408 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 -O3 < %s | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv5 -O3 < %s | FileCheck %s
 
-
-; CHECK: test_pos1_ir_sle
+; CHECK-LABEL: f0:
 ; CHECK: loop0
 ; a < b
-define void @test_pos1_ir_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp sle i32 28395, %b
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ 28395, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 1
-  %cmp = icmp sle i32 %inc, %b
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f0(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp sle i32 28395, %a2
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ 28395, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 1
+  %v8 = icmp sle i32 %v7, %a2
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos2_ir_sle
+; CHECK-LABEL: f1:
 ; CHECK: loop0
 ; a < b
-define void @test_pos2_ir_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp sle i32 9073, %b
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ 9073, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 2
-  %cmp = icmp sle i32 %inc, %b
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f1(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp sle i32 9073, %a2
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ 9073, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 2
+  %v8 = icmp sle i32 %v7, %a2
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos4_ir_sle
+; CHECK-LABEL: f2:
 ; CHECK: loop0
 ; a < b
-define void @test_pos4_ir_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp sle i32 21956, %b
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ 21956, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 4
-  %cmp = icmp sle i32 %inc, %b
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f2(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp sle i32 21956, %a2
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ 21956, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 4
+  %v8 = icmp sle i32 %v7, %a2
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos8_ir_sle
+; CHECK-LABEL: f3:
 ; CHECK: loop0
 ; a < b
-define void @test_pos8_ir_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp sle i32 16782, %b
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ 16782, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 8
-  %cmp = icmp sle i32 %inc, %b
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f3(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp sle i32 16782, %a2
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ 16782, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 8
+  %v8 = icmp sle i32 %v7, %a2
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos16_ir_sle
+; CHECK-LABEL: f4:
 ; CHECK: loop0
 ; a < b
-define void @test_pos16_ir_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp sle i32 19097, %b
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ 19097, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 16
-  %cmp = icmp sle i32 %inc, %b
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f4(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp sle i32 19097, %a2
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ 19097, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 16
+  %v8 = icmp sle i32 %v7, %a2
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos1_ri_sle
+; CHECK-LABEL: f5:
 ; CHECK: loop0
 ; a < b
-define void @test_pos1_ri_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp sle i32 %a, 14040
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 1
-  %cmp = icmp sle i32 %inc, 14040
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f5(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp sle i32 %a1, 14040
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ %a1, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 1
+  %v8 = icmp sle i32 %v7, 14040
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos2_ri_sle
+; CHECK-LABEL: f6:
 ; CHECK: loop0
 ; a < b
-define void @test_pos2_ri_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp sle i32 %a, 13710
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 2
-  %cmp = icmp sle i32 %inc, 13710
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f6(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp sle i32 %a1, 13710
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ %a1, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 2
+  %v8 = icmp sle i32 %v7, 13710
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos4_ri_sle
+; CHECK-LABEL: f7:
 ; CHECK: loop0
 ; a < b
-define void @test_pos4_ri_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp sle i32 %a, 9920
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 4
-  %cmp = icmp sle i32 %inc, 9920
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f7(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp sle i32 %a1, 9920
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ %a1, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 4
+  %v8 = icmp sle i32 %v7, 9920
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos8_ri_sle
+; CHECK-LABEL: f8:
 ; CHECK: loop0
 ; a < b
-define void @test_pos8_ri_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp sle i32 %a, 18924
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 8
-  %cmp = icmp sle i32 %inc, 18924
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f8(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp sle i32 %a1, 18924
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ %a1, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 8
+  %v8 = icmp sle i32 %v7, 18924
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos16_ri_sle
+; CHECK-LABEL: f9:
 ; CHECK: loop0
 ; a < b
-define void @test_pos16_ri_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp sle i32 %a, 11812
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 16
-  %cmp = icmp sle i32 %inc, 11812
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f9(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp sle i32 %a1, 11812
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ %a1, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 16
+  %v8 = icmp sle i32 %v7, 11812
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos1_rr_sle
+; CHECK-LABEL: f10:
 ; CHECK: loop0
 ; a < b
-define void @test_pos1_rr_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp sle i32 %a, %b
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 1
-  %cmp = icmp sle i32 %inc, %b
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f10(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp sle i32 %a1, %a2
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ %a1, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 1
+  %v8 = icmp sle i32 %v7, %a2
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos2_rr_sle
+; CHECK-LABEL: f11:
 ; CHECK: loop0
 ; a < b
-define void @test_pos2_rr_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp sle i32 %a, %b
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 2
-  %cmp = icmp sle i32 %inc, %b
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f11(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp sle i32 %a1, %a2
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ %a1, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 2
+  %v8 = icmp sle i32 %v7, %a2
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos4_rr_sle
+; CHECK-LABEL: f12:
 ; CHECK: loop0
 ; a < b
-define void @test_pos4_rr_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp sle i32 %a, %b
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 4
-  %cmp = icmp sle i32 %inc, %b
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f12(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp sle i32 %a1, %a2
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ %a1, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 4
+  %v8 = icmp sle i32 %v7, %a2
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos8_rr_sle
+; CHECK-LABEL: f13:
 ; CHECK: loop0
 ; a < b
-define void @test_pos8_rr_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp sle i32 %a, %b
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 8
-  %cmp = icmp sle i32 %inc, %b
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f13(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp sle i32 %a1, %a2
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ %a1, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 8
+  %v8 = icmp sle i32 %v7, %a2
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos16_rr_sle
+; CHECK-LABEL: f14:
 ; CHECK: loop0
 ; a < b
-define void @test_pos16_rr_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp sle i32 %a, %b
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 16
-  %cmp = icmp sle i32 %inc, %b
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f14(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp sle i32 %a1, %a2
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ %a1, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 16
+  %v8 = icmp sle i32 %v7, %a2
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/hwloop-ne.ll b/test/CodeGen/Hexagon/hwloop-ne.ll
index 12ef3b5dd0b..301a31a7c0b 100644
--- a/test/CodeGen/Hexagon/hwloop-ne.ll
+++ b/test/CodeGen/Hexagon/hwloop-ne.ll
@@ -1,438 +1,408 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 -O3 < %s | FileCheck %s
+; RUN: llc -march=hexagon -O3 < %s | FileCheck %s
 
-
-; CHECK: test_pos1_ir_ne
+; CHECK-LABEL: f0:
 ; CHECK: loop0
 ; a < b
-define void @test_pos1_ir_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp slt i32 32623, %b
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ 32623, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 1
-  %cmp = icmp ne i32 %inc, %b
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f0(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp slt i32 32623, %a2
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ 32623, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 1
+  %v8 = icmp ne i32 %v7, %a2
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos2_ir_ne
+; CHECK-LABEL: f1:
 ; CHECK: loop0
 ; a < b
-define void @test_pos2_ir_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp slt i32 29554, %b
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ 29554, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 2
-  %cmp = icmp ne i32 %inc, %b
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f1(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp slt i32 29554, %a2
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ 29554, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 2
+  %v8 = icmp ne i32 %v7, %a2
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos4_ir_ne
+; CHECK-LABEL: f2:
 ; CHECK: loop0
 ; a < b
-define void @test_pos4_ir_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp slt i32 15692, %b
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ 15692, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 4
-  %cmp = icmp ne i32 %inc, %b
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f2(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp slt i32 15692, %a2
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ 15692, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 4
+  %v8 = icmp ne i32 %v7, %a2
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos8_ir_ne
+; CHECK-LABEL: f3:
 ; CHECK: loop0
 ; a < b
-define void @test_pos8_ir_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp slt i32 10449, %b
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ 10449, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 8
-  %cmp = icmp ne i32 %inc, %b
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f3(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp slt i32 10449, %a2
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ 10449, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 8
+  %v8 = icmp ne i32 %v7, %a2
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos16_ir_ne
+; CHECK-LABEL: f4:
 ; CHECK: loop0
 ; a < b
-define void @test_pos16_ir_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp slt i32 32087, %b
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ 32087, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 16
-  %cmp = icmp ne i32 %inc, %b
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f4(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp slt i32 32087, %a2
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ 32087, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 16
+  %v8 = icmp ne i32 %v7, %a2
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos1_ri_ne
+; CHECK-LABEL: f5:
 ; CHECK: loop0
 ; a < b
-define void @test_pos1_ri_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp slt i32 %a, 3472
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 1
-  %cmp = icmp ne i32 %inc, 3472
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f5(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp slt i32 %a1, 3472
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ %a1, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 1
+  %v8 = icmp ne i32 %v7, 3472
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos2_ri_ne
+; CHECK-LABEL: f6:
 ; CHECK: loop0
 ; a < b
-define void @test_pos2_ri_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp slt i32 %a, 8730
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 2
-  %cmp = icmp ne i32 %inc, 8730
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f6(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp slt i32 %a1, 8730
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ %a1, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 2
+  %v8 = icmp ne i32 %v7, 8730
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos4_ri_ne
+; CHECK-LABEL: f7:
 ; CHECK: loop0
 ; a < b
-define void @test_pos4_ri_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp slt i32 %a, 1493
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 4
-  %cmp = icmp ne i32 %inc, 1493
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f7(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp slt i32 %a1, 1493
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ %a1, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 4
+  %v8 = icmp ne i32 %v7, 1493
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos8_ri_ne
+; CHECK-LABEL: f8:
 ; CHECK: loop0
 ; a < b
-define void @test_pos8_ri_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp slt i32 %a, 1706
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 8
-  %cmp = icmp ne i32 %inc, 1706
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f8(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp slt i32 %a1, 1706
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ %a1, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 8
+  %v8 = icmp ne i32 %v7, 1706
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos16_ri_ne
+; CHECK-LABEL: f9:
 ; CHECK: loop0
 ; a < b
-define void @test_pos16_ri_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp slt i32 %a, 1886
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 16
-  %cmp = icmp ne i32 %inc, 1886
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f9(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp slt i32 %a1, 1886
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ %a1, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 16
+  %v8 = icmp ne i32 %v7, 1886
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos1_rr_ne
+; CHECK-LABEL: f10:
 ; CHECK: loop0
 ; a < b
-define void @test_pos1_rr_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp slt i32 %a, %b
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 1
-  %cmp = icmp ne i32 %inc, %b
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f10(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp slt i32 %a1, %a2
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ %a1, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 1
+  %v8 = icmp ne i32 %v7, %a2
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos2_rr_ne
+; CHECK-LABEL: f11:
 ; CHECK: loop0
 ; a < b
-define void @test_pos2_rr_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp slt i32 %a, %b
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 2
-  %cmp = icmp ne i32 %inc, %b
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f11(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp slt i32 %a1, %a2
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ %a1, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 2
+  %v8 = icmp ne i32 %v7, %a2
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos4_rr_ne
+; CHECK-LABEL: f12:
 ; CHECK: loop0
 ; a < b
-define void @test_pos4_rr_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp slt i32 %a, %b
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 4
-  %cmp = icmp ne i32 %inc, %b
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f12(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp slt i32 %a1, %a2
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ %a1, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 4
+  %v8 = icmp ne i32 %v7, %a2
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos8_rr_ne
+; CHECK-LABEL: f13
 ; CHECK: loop0
 ; a < b
-define void @test_pos8_rr_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp slt i32 %a, %b
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 8
-  %cmp = icmp ne i32 %inc, %b
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f13(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp slt i32 %a1, %a2
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ %a1, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 8
+  %v8 = icmp ne i32 %v7, %a2
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos16_rr_ne
+; CHECK-LABEL: f14
 ; CHECK: loop0
 ; a < b
-define void @test_pos16_rr_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp slt i32 %a, %b
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 16
-  %cmp = icmp ne i32 %inc, %b
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f14(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp slt i32 %a1, %a2
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ %a1, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 16
+  %v8 = icmp ne i32 %v7, %a2
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/i16_VarArg.ll b/test/CodeGen/Hexagon/i16_VarArg.ll
index 74d066e4936..af2682edc4b 100644
--- a/test/CodeGen/Hexagon/i16_VarArg.ll
+++ b/test/CodeGen/Hexagon/i16_VarArg.ll
@@ -1,40 +1,36 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
-; CHECK: call __hexagon_{{[A-Z_a-z0-9]+}}
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK: dfcmp
 
-@a_str = internal constant [8 x i8] c"a = %f\0A\00"
-@b_str = internal constant [8 x i8] c"b = %f\0A\00"
-@add_str = internal constant [12 x i8] c"a + b = %f\0A\00"
-@sub_str = internal constant [12 x i8] c"a - b = %f\0A\00"
-@mul_str = internal constant [12 x i8] c"a * b = %f\0A\00"
-@div_str = internal constant [12 x i8] c"b / a = %f\0A\00"
-@rem_str = internal constant [13 x i8] c"b %% a = %f\0A\00"
-@lt_str = internal constant [12 x i8] c"a < b = %d\0A\00"
-@le_str = internal constant [13 x i8] c"a <= b = %d\0A\00"
-@gt_str = internal constant [12 x i8] c"a > b = %d\0A\00"
-@ge_str = internal constant [13 x i8] c"a >= b = %d\0A\00"
-@eq_str = internal constant [13 x i8] c"a == b = %d\0A\00"
-@ne_str = internal constant [13 x i8] c"a != b = %d\0A\00"
-@A = global double 2.000000e+00
-@B = global double 5.000000e+00
+@g0 = internal constant [12 x i8] c"a < b = %d\0A\00"
+@g1 = internal constant [13 x i8] c"a <= b = %d\0A\00"
+@g2 = internal constant [12 x i8] c"a > b = %d\0A\00"
+@g3 = internal constant [13 x i8] c"a >= b = %d\0A\00"
+@g4 = internal constant [13 x i8] c"a == b = %d\0A\00"
+@g5 = internal constant [13 x i8] c"a != b = %d\0A\00"
+@g6 = global double 2.000000e+00
+@g7 = global double 5.000000e+00
 
-declare i32 @printf(i8*, ...)
+declare i32 @f0(i8*, ...) #0
 
-define i32 @main() {
-        %a = load double, double* @A
-        %b = load double, double* @B
-        %lt_r = fcmp olt double %a, %b
-        %le_r = fcmp ole double %a, %b
-        %gt_r = fcmp ogt double %a, %b
-        %ge_r = fcmp oge double %a, %b
-        %eq_r = fcmp oeq double %a, %b
-        %ne_r = fcmp une double %a, %b
-        %val1 = zext i1 %lt_r to i16
-        %lt_s = getelementptr [12 x i8], [12 x i8]* @lt_str, i64 0, i64 0
-        %le_s = getelementptr [13 x i8], [13 x i8]* @le_str, i64 0, i64 0
-        %gt_s = getelementptr [12 x i8], [12 x i8]* @gt_str, i64 0, i64 0
-        %ge_s = getelementptr [13 x i8], [13 x i8]* @ge_str, i64 0, i64 0
-        %eq_s = getelementptr [13 x i8], [13 x i8]* @eq_str, i64 0, i64 0
-        %ne_s = getelementptr [13 x i8], [13 x i8]* @ne_str, i64 0, i64 0
-        call i32 (i8*, ...) @printf( i8* %lt_s, i16 %val1 )
-        ret i32 0
+define i32 @f1() #0 {
+b0:
+  %v0 = load double, double* @g6
+  %v1 = load double, double* @g7
+  %v2 = fcmp olt double %v0, %v1
+  %v3 = fcmp ole double %v0, %v1
+  %v4 = fcmp ogt double %v0, %v1
+  %v5 = fcmp oge double %v0, %v1
+  %v6 = fcmp oeq double %v0, %v1
+  %v7 = fcmp une double %v0, %v1
+  %v8 = zext i1 %v2 to i16
+  %v9 = getelementptr [12 x i8], [12 x i8]* @g0, i64 0, i64 0
+  %v10 = getelementptr [13 x i8], [13 x i8]* @g1, i64 0, i64 0
+  %v11 = getelementptr [12 x i8], [12 x i8]* @g2, i64 0, i64 0
+  %v12 = getelementptr [13 x i8], [13 x i8]* @g3, i64 0, i64 0
+  %v13 = getelementptr [13 x i8], [13 x i8]* @g4, i64 0, i64 0
+  %v14 = getelementptr [13 x i8], [13 x i8]* @g5, i64 0, i64 0
+  %v15 = call i32 (i8*, ...) @f0(i8* %v9, i16 %v8)
+  ret i32 0
 }
+
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/i1_VarArg.ll b/test/CodeGen/Hexagon/i1_VarArg.ll
index 4078c0f3f00..01619bc5424 100644
--- a/test/CodeGen/Hexagon/i1_VarArg.ll
+++ b/test/CodeGen/Hexagon/i1_VarArg.ll
@@ -1,44 +1,40 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
-; CHECK: call __hexagon_{{[_A-Za-z0-9]+}}
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK: dfcmp
 
-@a_str = internal constant [8 x i8] c"a = %f\0A\00"
-@b_str = internal constant [8 x i8] c"b = %f\0A\00"
-@add_str = internal constant [12 x i8] c"a + b = %f\0A\00"
-@sub_str = internal constant [12 x i8] c"a - b = %f\0A\00"
-@mul_str = internal constant [12 x i8] c"a * b = %f\0A\00"
-@div_str = internal constant [12 x i8] c"b / a = %f\0A\00"
-@rem_str = internal constant [13 x i8] c"b %% a = %f\0A\00"
-@lt_str = internal constant [12 x i8] c"a < b = %d\0A\00"
-@le_str = internal constant [13 x i8] c"a <= b = %d\0A\00"
-@gt_str = internal constant [12 x i8] c"a > b = %d\0A\00"
-@ge_str = internal constant [13 x i8] c"a >= b = %d\0A\00"
-@eq_str = internal constant [13 x i8] c"a == b = %d\0A\00"
-@ne_str = internal constant [13 x i8] c"a != b = %d\0A\00"
-@A = global double 2.000000e+00
-@B = global double 5.000000e+00
+@g0 = internal constant [12 x i8] c"a < b = %d\0A\00"
+@g1 = internal constant [13 x i8] c"a <= b = %d\0A\00"
+@g2 = internal constant [12 x i8] c"a > b = %d\0A\00"
+@g3 = internal constant [13 x i8] c"a >= b = %d\0A\00"
+@g4 = internal constant [13 x i8] c"a == b = %d\0A\00"
+@g5 = internal constant [13 x i8] c"a != b = %d\0A\00"
+@g6 = global double 2.000000e+00
+@g7 = global double 5.000000e+00
 
-declare i32 @printf(i8*, ...)
+declare i32 @f0(i8*, ...) #0
 
-define i32 @main() {
-        %a = load double, double* @A
-        %b = load double, double* @B
-        %lt_r = fcmp olt double %a, %b
-        %le_r = fcmp ole double %a, %b
-        %gt_r = fcmp ogt double %a, %b
-        %ge_r = fcmp oge double %a, %b
-        %eq_r = fcmp oeq double %a, %b
-        %ne_r = fcmp une double %a, %b
-        %lt_s = getelementptr [12 x i8], [12 x i8]* @lt_str, i64 0, i64 0
-        %le_s = getelementptr [13 x i8], [13 x i8]* @le_str, i64 0, i64 0
-        %gt_s = getelementptr [12 x i8], [12 x i8]* @gt_str, i64 0, i64 0
-        %ge_s = getelementptr [13 x i8], [13 x i8]* @ge_str, i64 0, i64 0
-        %eq_s = getelementptr [13 x i8], [13 x i8]* @eq_str, i64 0, i64 0
-        %ne_s = getelementptr [13 x i8], [13 x i8]* @ne_str, i64 0, i64 0
-        call i32 (i8*, ...) @printf( i8* %lt_s, i1 %lt_r )
-        call i32 (i8*, ...) @printf( i8* %le_s, i1 %le_r )
-        call i32 (i8*, ...) @printf( i8* %gt_s, i1 %gt_r )
-        call i32 (i8*, ...) @printf( i8* %ge_s, i1 %ge_r )
-        call i32 (i8*, ...) @printf( i8* %eq_s, i1 %eq_r )
-        call i32 (i8*, ...) @printf( i8* %ne_s, i1 %ne_r )
-        ret i32 0
+define i32 @f1() #0 {
+b0:
+  %v0 = load double, double* @g6
+  %v1 = load double, double* @g7
+  %v2 = fcmp olt double %v0, %v1
+  %v3 = fcmp ole double %v0, %v1
+  %v4 = fcmp ogt double %v0, %v1
+  %v5 = fcmp oge double %v0, %v1
+  %v6 = fcmp oeq double %v0, %v1
+  %v7 = fcmp une double %v0, %v1
+  %v8 = getelementptr [12 x i8], [12 x i8]* @g0, i64 0, i64 0
+  %v9 = getelementptr [13 x i8], [13 x i8]* @g1, i64 0, i64 0
+  %v10 = getelementptr [12 x i8], [12 x i8]* @g2, i64 0, i64 0
+  %v11 = getelementptr [13 x i8], [13 x i8]* @g3, i64 0, i64 0
+  %v12 = getelementptr [13 x i8], [13 x i8]* @g4, i64 0, i64 0
+  %v13 = getelementptr [13 x i8], [13 x i8]* @g5, i64 0, i64 0
+  %v14 = call i32 (i8*, ...) @f0(i8* %v8, i1 %v2)
+  %v15 = call i32 (i8*, ...) @f0(i8* %v9, i1 %v3)
+  %v16 = call i32 (i8*, ...) @f0(i8* %v10, i1 %v4)
+  %v17 = call i32 (i8*, ...) @f0(i8* %v11, i1 %v5)
+  %v18 = call i32 (i8*, ...) @f0(i8* %v12, i1 %v6)
+  %v19 = call i32 (i8*, ...) @f0(i8* %v13, i1 %v7)
+  ret i32 0
 }
+
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/i8_VarArg.ll b/test/CodeGen/Hexagon/i8_VarArg.ll
index 1353de47a97..247952d0c5c 100644
--- a/test/CodeGen/Hexagon/i8_VarArg.ll
+++ b/test/CodeGen/Hexagon/i8_VarArg.ll
@@ -1,40 +1,36 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
-; CHECK: call __hexagon_{{[A-Z_a-z0-9]+}}
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK: dfcmp
 
-@a_str = internal constant [8 x i8] c"a = %f\0A\00"
-@b_str = internal constant [8 x i8] c"b = %f\0A\00"
-@add_str = internal constant [12 x i8] c"a + b = %f\0A\00"
-@sub_str = internal constant [12 x i8] c"a - b = %f\0A\00"
-@mul_str = internal constant [12 x i8] c"a * b = %f\0A\00"
-@div_str = internal constant [12 x i8] c"b / a = %f\0A\00"
-@rem_str = internal constant [13 x i8] c"b %% a = %f\0A\00"
-@lt_str = internal constant [12 x i8] c"a < b = %d\0A\00"
-@le_str = internal constant [13 x i8] c"a <= b = %d\0A\00"
-@gt_str = internal constant [12 x i8] c"a > b = %d\0A\00"
-@ge_str = internal constant [13 x i8] c"a >= b = %d\0A\00"
-@eq_str = internal constant [13 x i8] c"a == b = %d\0A\00"
-@ne_str = internal constant [13 x i8] c"a != b = %d\0A\00"
-@A = global double 2.000000e+00
-@B = global double 5.000000e+00
+@g0 = internal constant [12 x i8] c"a < b = %d\0A\00"
+@g1 = internal constant [13 x i8] c"a <= b = %d\0A\00"
+@g2 = internal constant [12 x i8] c"a > b = %d\0A\00"
+@g3 = internal constant [13 x i8] c"a >= b = %d\0A\00"
+@g4 = internal constant [13 x i8] c"a == b = %d\0A\00"
+@g5 = internal constant [13 x i8] c"a != b = %d\0A\00"
+@g6 = global double 2.000000e+00
+@g7 = global double 5.000000e+00
 
-declare i32 @printf(i8*, ...)
+declare i32 @f0(i8*, ...) #0
 
-define i32 @main() {
-        %a = load double, double* @A
-        %b = load double, double* @B
-        %lt_r = fcmp olt double %a, %b
-        %le_r = fcmp ole double %a, %b
-        %gt_r = fcmp ogt double %a, %b
-        %ge_r = fcmp oge double %a, %b
-        %eq_r = fcmp oeq double %a, %b
-        %ne_r = fcmp une double %a, %b
-        %val1 = zext i1 %lt_r to i8
-        %lt_s = getelementptr [12 x i8], [12 x i8]* @lt_str, i64 0, i64 0
-        %le_s = getelementptr [13 x i8], [13 x i8]* @le_str, i64 0, i64 0
-        %gt_s = getelementptr [12 x i8], [12 x i8]* @gt_str, i64 0, i64 0
-        %ge_s = getelementptr [13 x i8], [13 x i8]* @ge_str, i64 0, i64 0
-        %eq_s = getelementptr [13 x i8], [13 x i8]* @eq_str, i64 0, i64 0
-        %ne_s = getelementptr [13 x i8], [13 x i8]* @ne_str, i64 0, i64 0
-        call i32 (i8*, ...) @printf( i8* %lt_s, i8 %val1 )
-        ret i32 0
+define i32 @f1() #0 {
+b0:
+  %v0 = load double, double* @g6
+  %v1 = load double, double* @g7
+  %v2 = fcmp olt double %v0, %v1
+  %v3 = fcmp ole double %v0, %v1
+  %v4 = fcmp ogt double %v0, %v1
+  %v5 = fcmp oge double %v0, %v1
+  %v6 = fcmp oeq double %v0, %v1
+  %v7 = fcmp une double %v0, %v1
+  %v8 = zext i1 %v2 to i8
+  %v9 = getelementptr [12 x i8], [12 x i8]* @g0, i64 0, i64 0
+  %v10 = getelementptr [13 x i8], [13 x i8]* @g1, i64 0, i64 0
+  %v11 = getelementptr [12 x i8], [12 x i8]* @g2, i64 0, i64 0
+  %v12 = getelementptr [13 x i8], [13 x i8]* @g3, i64 0, i64 0
+  %v13 = getelementptr [13 x i8], [13 x i8]* @g4, i64 0, i64 0
+  %v14 = getelementptr [13 x i8], [13 x i8]* @g5, i64 0, i64 0
+  %v15 = call i32 (i8*, ...) @f0(i8* %v9, i8 %v8)
+  ret i32 0
 }
+
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/macint.ll b/test/CodeGen/Hexagon/macint.ll
index 514ba5b9130..47856f2fcb5 100644
--- a/test/CodeGen/Hexagon/macint.ll
+++ b/test/CodeGen/Hexagon/macint.ll
@@ -1,14 +1,15 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4  < %s | FileCheck %s
+; RUN: llc -march=hexagon < %s | FileCheck %s
 ; Check that we generate integer multiply accumulate.
 
 ; CHECK: r{{[0-9]+}} {{\+|\-}}= mpyi(r{{[0-9]+}},
 
-define i32 @main(i32* %a, i32* %b) nounwind {
-  entry:
-  %0 = load i32, i32* %a, align 4
-  %div = udiv i32 %0, 10000
-  %rem = urem i32 %div, 10
-  store i32 %rem, i32* %b, align 4
+define i32 @f0(i32* %a0, i32* %a1) #0 {
+b0:
+  %v0 = load i32, i32* %a0, align 4
+  %v1 = udiv i32 %v0, 10000
+  %v2 = urem i32 %v1, 10
+  store i32 %v2, i32* %a1, align 4
   ret i32 0
 }
 
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/misaligned-access.ll b/test/CodeGen/Hexagon/misaligned-access.ll
index f4b0cb9cb1e..7eb85ffcc22 100644
--- a/test/CodeGen/Hexagon/misaligned-access.ll
+++ b/test/CodeGen/Hexagon/misaligned-access.ll
@@ -1,16 +1,19 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s
+; RUN: llc -march=hexagon < %s
 ; Check that the mis-aligned load doesn't cause compiler to assert.
 
-declare i32 @_hi(i64) #1
-@temp1 = common global i32 0, align 4
+@g0 = common global i32 0, align 4
 
-define i32 @CSDRSEARCH_executeSearchManager() #0 {
-entry:
-  %temp = alloca i32, align 4
-  %0 = load i32, i32* @temp1, align 4
-  store i32 %0, i32* %temp, align 4
-  %1 = bitcast i32* %temp to i64*
-  %2 = load i64, i64* %1, align 8
-  %call = call i32 @_hi(i64 %2)
-  ret i32 %call
+declare i32 @f0(i64) #0
+
+define i32 @f1() #0 {
+b0:
+  %v0 = alloca i32, align 4
+  %v1 = load i32, i32* @g0, align 4
+  store i32 %v1, i32* %v0, align 4
+  %v2 = bitcast i32* %v0 to i64*
+  %v3 = load i64, i64* %v2, align 8
+  %v4 = call i32 @f0(i64 %v3)
+  ret i32 %v4
 }
+
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/mpy.ll b/test/CodeGen/Hexagon/mpy.ll
index 3ecf7d46ccb..7c1e8c8d3f0 100644
--- a/test/CodeGen/Hexagon/mpy.ll
+++ b/test/CodeGen/Hexagon/mpy.ll
@@ -1,19 +1,21 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; RUN: llc -march=hexagon < %s | FileCheck %s
 ; CHECK: += mpyi
 
-define void @foo(i32 %acc, i32 %num, i32 %num2) nounwind {
-entry:
-  %acc.addr = alloca i32, align 4
-  %num.addr = alloca i32, align 4
-  %num2.addr = alloca i32, align 4
-  store i32 %acc, i32* %acc.addr, align 4
-  store i32 %num, i32* %num.addr, align 4
-  store i32 %num2, i32* %num2.addr, align 4
-  %0 = load i32, i32* %num.addr, align 4
-  %1 = load i32, i32* %acc.addr, align 4
-  %mul = mul nsw i32 %0, %1
-  %2 = load i32, i32* %num2.addr, align 4
-  %add = add nsw i32 %mul, %2
-  store i32 %add, i32* %num.addr, align 4
+define void @f0(i32 %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = alloca i32, align 4
+  %v1 = alloca i32, align 4
+  %v2 = alloca i32, align 4
+  store i32 %a0, i32* %v0, align 4
+  store i32 %a1, i32* %v1, align 4
+  store i32 %a2, i32* %v2, align 4
+  %v3 = load i32, i32* %v1, align 4
+  %v4 = load i32, i32* %v0, align 4
+  %v5 = mul nsw i32 %v3, %v4
+  %v6 = load i32, i32* %v2, align 4
+  %v7 = add nsw i32 %v5, %v6
+  store i32 %v7, i32* %v1, align 4
   ret void
 }
+
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/newvaluejump.ll b/test/CodeGen/Hexagon/newvaluejump.ll
index e1437f369c8..0697d297d71 100644
--- a/test/CodeGen/Hexagon/newvaluejump.ll
+++ b/test/CodeGen/Hexagon/newvaluejump.ll
@@ -1,33 +1,36 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; RUN: llc -march=hexagon < %s | FileCheck %s
 ; Check that we generate new value jump.
 
-@i = global i32 0, align 4
-@j = global i32 10, align 4
-
-define i32 @foo(i32 %a) nounwind {
-entry:
 ; CHECK: if (cmp.eq(r{{[0-9]+}}.new,#0)) jump{{.}}
-  %addr1 = alloca i32, align 4
-  %addr2 = alloca i32, align 4
-  %0 = load i32, i32* @i, align 4
-  store i32 %0, i32* %addr1, align 4
-  call void @bar(i32 1, i32 2)
-  %1 = load i32, i32* @j, align 4
-  %tobool = icmp ne i32 %1, 0
-  br i1 %tobool, label %if.then, label %if.else
-
-if.then:
-  call void @baz(i32 1, i32 2)
-  br label %if.end
-
-if.else:
-  call void @guy(i32 10, i32 20)
-  br label %if.end
-
-if.end:
+
+@g0 = global i32 0, align 4
+@g1 = global i32 10, align 4
+
+define i32 @f0(i32 %a0) #0 {
+b0:
+  %v0 = alloca i32, align 4
+  %v1 = alloca i32, align 4
+  %v2 = load i32, i32* @g0, align 4
+  store i32 %v2, i32* %v0, align 4
+  call void @f2(i32 1, i32 2)
+  %v3 = load i32, i32* @g1, align 4
+  %v4 = icmp ne i32 %v3, 0
+  br i1 %v4, label %b1, label %b2
+
+b1:                                               ; preds = %b0
+  call void @f3(i32 1, i32 2)
+  br label %b3
+
+b2:                                               ; preds = %b0
+  call void @f1(i32 10, i32 20)
+  br label %b3
+
+b3:                                               ; preds = %b2, %b1
   ret i32 0
 }
 
-declare void @guy(i32, i32)
-declare void @bar(i32, i32)
-declare void @baz(i32, i32)
+declare void @f1(i32, i32) #0
+declare void @f2(i32, i32) #0
+declare void @f3(i32, i32) #0
+
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/packetize_cond_inst.ll b/test/CodeGen/Hexagon/packetize_cond_inst.ll
index 1fc6e82959e..8dca8f28114 100644
--- a/test/CodeGen/Hexagon/packetize_cond_inst.ll
+++ b/test/CodeGen/Hexagon/packetize_cond_inst.ll
@@ -1,10 +1,8 @@
-; RUN: llc -mcpu=hexagonv4 -tail-dup-size=1 < %s | FileCheck %s
+; RUN: llc -march=hexagon -tail-dup-size=1 < %s | FileCheck %s
 
-target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32"
 target triple = "hexagon-unknown--elf"
 
 ; Make sure we put the two conditionally executed adds in a packet.
-; ifcnv_add:
 ;     {
 ;       p0 = cmp.gt(r2, r1)
 ;       if (!p0.new) r0 = add(r2, r1)
@@ -13,20 +11,23 @@ target triple = "hexagon-unknown--elf"
 ; CHECK: cmp
 ; CHECK-NEXT: add
 ; CHECK-NEXT: add
-define i32 @ifcnv_add(i32, i32, i32) nounwind readnone {
-  %4 = icmp sgt i32 %2, %1
-  br i1 %4, label %5, label %7
+define i32 @f0(i32 %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp sgt i32 %a2, %a1
+  br i1 %v0, label %b1, label %b2
 
-; <label>:5                                       ; preds = %3
-  %6 = add nsw i32 %0, 10
-  br label %9
+b1:                                               ; preds = %b0
+  %v1 = add nsw i32 %a0, 10
+  br label %b3
 
-; <label>:7                                       ; preds = %3
-  %8 = add nsw i32 %2, %1
-  br label %9
+b2:                                               ; preds = %b0
+  %v2 = add nsw i32 %a2, %a1
+  br label %b3
 
-; <label>:9                                       ; preds = %7, %5
-  %10 = phi i32 [ %6, %5 ], [ %8, %7 ]
-  %11 = add nsw i32 %10, 1
-  ret i32 %11
+b3:                                               ; preds = %b2, %b1
+  %v3 = phi i32 [ %v1, %b1 ], [ %v2, %b2 ]
+  %v4 = add nsw i32 %v3, 1
+  ret i32 %v4
 }
+
+attributes #0 = { nounwind readnone "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/postinc-load.ll b/test/CodeGen/Hexagon/postinc-load.ll
index 8d8c93d76bf..825e16976a5 100644
--- a/test/CodeGen/Hexagon/postinc-load.ll
+++ b/test/CodeGen/Hexagon/postinc-load.ll
@@ -1,29 +1,30 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; RUN: llc -march=hexagon < %s | FileCheck %s
 
 ; Check that post-increment load instructions are being generated.
 ; CHECK: r{{[0-9]+}} = memw(r{{[0-9]+}}++#4)
 
-define i32 @sum(i32* nocapture %a, i16* nocapture %b, i32 %n) nounwind {
-entry:
-  br label %for.body
+define i32 @f0(i32* nocapture %a0, i16* nocapture %a1, i32 %a2) #0 {
+b0:
+  br label %b1
 
-for.body:
-  %lsr.iv = phi i32 [ %lsr.iv.next, %for.body ], [ 10, %entry ]
-  %arrayidx.phi = phi i32* [ %a, %entry ], [ %arrayidx.inc, %for.body ]
-  %arrayidx1.phi = phi i16* [ %b, %entry ], [ %arrayidx1.inc, %for.body ]
-  %sum.03 = phi i32 [ 0, %entry ], [ %add2, %for.body ]
-  %0 = load i32, i32* %arrayidx.phi, align 4
-  %1 = load i16, i16* %arrayidx1.phi, align 2
-  %conv = sext i16 %1 to i32
-  %add = add i32 %0, %sum.03
-  %add2 = add i32 %add, %conv
-  %arrayidx.inc = getelementptr i32, i32* %arrayidx.phi, i32 1
-  %arrayidx1.inc = getelementptr i16, i16* %arrayidx1.phi, i32 1
-  %lsr.iv.next = add i32 %lsr.iv, -1
-  %exitcond = icmp eq i32 %lsr.iv.next, 0
-  br i1 %exitcond, label %for.end, label %for.body
+b1:                                               ; preds = %b1, %b0
+  %v0 = phi i32 [ %v11, %b1 ], [ 10, %b0 ]
+  %v1 = phi i32* [ %a0, %b0 ], [ %v9, %b1 ]
+  %v2 = phi i16* [ %a1, %b0 ], [ %v10, %b1 ]
+  %v3 = phi i32 [ 0, %b0 ], [ %v8, %b1 ]
+  %v4 = load i32, i32* %v1, align 4
+  %v5 = load i16, i16* %v2, align 2
+  %v6 = sext i16 %v5 to i32
+  %v7 = add i32 %v4, %v3
+  %v8 = add i32 %v7, %v6
+  %v9 = getelementptr i32, i32* %v1, i32 1
+  %v10 = getelementptr i16, i16* %v2, i32 1
+  %v11 = add i32 %v0, -1
+  %v12 = icmp eq i32 %v11, 0
+  br i1 %v12, label %b2, label %b1
 
-for.end:
-  ret i32 %add2
+b2:                                               ; preds = %b1
+  ret i32 %v8
 }
 
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/postinc-store.ll b/test/CodeGen/Hexagon/postinc-store.ll
index 276a7d8e0ff..2dabc7991e3 100644
--- a/test/CodeGen/Hexagon/postinc-store.ll
+++ b/test/CodeGen/Hexagon/postinc-store.ll
@@ -1,29 +1,30 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; RUN: llc -march=hexagon < %s | FileCheck %s
 
 ; Check that post-increment store instructions are being generated.
 ; CHECK: memw(r{{[0-9]+}}++#4) = r{{[0-9]+}}
 
-define i32 @sum(i32* nocapture %a, i16* nocapture %b, i32 %n) nounwind {
-entry:
-  br label %for.body
+define i32 @f0(i32* nocapture %a0, i16* nocapture %a1, i32 %a2) #0 {
+b0:
+  br label %b1
 
-for.body:                                         ; preds = %for.body, %entry
-  %lsr.iv = phi i32 [ %lsr.iv.next, %for.body ], [ 10, %entry ]
-  %arrayidx.phi = phi i32* [ %a, %entry ], [ %arrayidx.inc, %for.body ]
-  %arrayidx1.phi = phi i16* [ %b, %entry ], [ %arrayidx1.inc, %for.body ]
-  %0 = load i32, i32* %arrayidx.phi, align 4
-  %1 = load i16, i16* %arrayidx1.phi, align 2
-  %conv = sext i16 %1 to i32
-  %factor = mul i32 %0, 2
-  %add3 = add i32 %factor, %conv
-  store i32 %add3, i32* %arrayidx.phi, align 4
+b1:                                               ; preds = %b1, %b0
+  %v0 = phi i32 [ %v10, %b1 ], [ 10, %b0 ]
+  %v1 = phi i32* [ %a0, %b0 ], [ %v8, %b1 ]
+  %v2 = phi i16* [ %a1, %b0 ], [ %v9, %b1 ]
+  %v3 = load i32, i32* %v1, align 4
+  %v4 = load i16, i16* %v2, align 2
+  %v5 = sext i16 %v4 to i32
+  %v6 = mul i32 %v3, 2
+  %v7 = add i32 %v6, %v5
+  store i32 %v7, i32* %v1, align 4
+  %v8 = getelementptr i32, i32* %v1, i32 1
+  %v9 = getelementptr i16, i16* %v2, i32 1
+  %v10 = add i32 %v0, -1
+  %v11 = icmp eq i32 %v10, 0
+  br i1 %v11, label %b2, label %b1
 
-  %arrayidx.inc = getelementptr i32, i32* %arrayidx.phi, i32 1
-  %arrayidx1.inc = getelementptr i16, i16* %arrayidx1.phi, i32 1
-  %lsr.iv.next = add i32 %lsr.iv, -1
-  %exitcond = icmp eq i32 %lsr.iv.next, 0
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body
+b2:                                               ; preds = %b1
   ret i32 0
 }
+
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/pred-gp.ll b/test/CodeGen/Hexagon/pred-gp.ll
index 76a621699b6..4d50abf6283 100644
--- a/test/CodeGen/Hexagon/pred-gp.ll
+++ b/test/CodeGen/Hexagon/pred-gp.ll
@@ -1,28 +1,30 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; RUN: llc -march=hexagon < %s | FileCheck %s
 ; Check that we are able to predicate instructions with gp-relative
 ; addressing mode.
 
-@d = external global i32
-@c = common global i32 0, align 4
+; CHECK: if ({{!?}}p{{[0-3]+}}{{(.new)?}}) r{{[0-9]+}} = memw(##g{{[01]}})
+; CHECK: if ({{!?}}p{{[0-3]+}}) r{{[0-9]+}} = memw(##g{{[01]}})
 
-; Function Attrs: nounwind
-define i32 @test2(i8 zeroext %a, i8 zeroext %b) #0 {
-; CHECK: if ({{!?}}p{{[0-3]+}}{{(.new)?}}) r{{[0-9]+}} = memw(##{{[cd]}})
-; CHECK: if ({{!?}}p{{[0-3]+}}) r{{[0-9]+}} = memw(##{{[cd]}})
-entry:
-  %cmp = icmp eq i8 %a, %b
-  br i1 %cmp, label %if.then, label %entry.if.end_crit_edge
+@g0 = external global i32
+@g1 = common global i32 0, align 4
 
-entry.if.end_crit_edge:
-  %.pre = load i32, i32* @c, align 4
-  br label %if.end
+define i32 @f0(i8 zeroext %a0, i8 zeroext %a1) #0 {
+b0:
+  %v0 = icmp eq i8 %a0, %a1
+  br i1 %v0, label %b2, label %b1
 
-if.then:
-  %0 = load i32, i32* @d, align 4
-  store i32 %0, i32* @c, align 4
-  br label %if.end
+b1:                                               ; preds = %b0
+  %v1 = load i32, i32* @g1, align 4
+  br label %b3
 
-if.end:
-  %1 = phi i32 [ %.pre, %entry.if.end_crit_edge ], [ %0, %if.then ]
-  ret i32 %1
+b2:                                               ; preds = %b0
+  %v2 = load i32, i32* @g0, align 4
+  store i32 %v2, i32* @g1, align 4
+  br label %b3
+
+b3:                                               ; preds = %b2, %b1
+  %v3 = phi i32 [ %v1, %b1 ], [ %v2, %b2 ]
+  ret i32 %v3
 }
+
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/pred-instrs.ll b/test/CodeGen/Hexagon/pred-instrs.ll
index da8ace98a0b..27986f872d9 100644
--- a/test/CodeGen/Hexagon/pred-instrs.ll
+++ b/test/CodeGen/Hexagon/pred-instrs.ll
@@ -1,30 +1,32 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; RUN: llc -march=hexagon < %s | FileCheck %s
 ; Check that we are able to predicate instructions.
 
 ; CHECK: if ({{!?}}p{{[0-3]}}{{(.new)?}}) r{{[0-9]+}} = {{and|aslh}}
 ; CHECK: if ({{!?}}p{{[0-3]}}{{(.new)?}}) r{{[0-9]+}} = {{and|aslh}}
-@a = external global i32
-@d = external global i32
 
-; Function Attrs: nounwind
-define i32 @test1(i8 zeroext %la, i8 zeroext %lb) {
-entry:
-  %cmp = icmp eq i8 %la, %lb
-  br i1 %cmp, label %if.then, label %if.else
+@g0 = external global i32
+@g1 = external global i32
 
-if.then:                                          ; preds = %entry
-  %conv1 = zext i8 %la to i32
-  %shl = shl nuw nsw i32 %conv1, 16
-  br label %if.end
+define i32 @f0(i8 zeroext %a0, i8 zeroext %a1) #0 {
+b0:
+  %v0 = icmp eq i8 %a0, %a1
+  br i1 %v0, label %b1, label %b2
 
-if.else:                                          ; preds = %entry
-  %and8 = and i8 %lb, %la
-  %and = zext i8 %and8 to i32
-  br label %if.end
+b1:                                               ; preds = %b0
+  %v1 = zext i8 %a0 to i32
+  %v2 = shl nuw nsw i32 %v1, 16
+  br label %b3
 
-if.end:                                           ; preds = %if.else, %if.then
-  %storemerge = phi i32 [ %and, %if.else ], [ %shl, %if.then ]
-  store i32 %storemerge, i32* @a, align 4
-  %0 = load i32, i32* @d, align 4
-  ret i32 %0
+b2:                                               ; preds = %b0
+  %v3 = and i8 %a1, %a0
+  %v4 = zext i8 %v3 to i32
+  br label %b3
+
+b3:                                               ; preds = %b2, %b1
+  %v5 = phi i32 [ %v4, %b2 ], [ %v2, %b1 ]
+  store i32 %v5, i32* @g0, align 4
+  %v6 = load i32, i32* @g1, align 4
+  ret i32 %v6
 }
+
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/predicate-copy.ll b/test/CodeGen/Hexagon/predicate-copy.ll
index 552b6879419..1b58ec9e790 100644
--- a/test/CodeGen/Hexagon/predicate-copy.ll
+++ b/test/CodeGen/Hexagon/predicate-copy.ll
@@ -1,8 +1,10 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 -O3 < %s | FileCheck %s
+; RUN: llc -march=hexagon -O3 < %s | FileCheck %s
 
 ; CHECK: r{{[0-9]+}} = p{{[0-9]+}}
-define i1 @foo() {
-entry:
+
+define i1 @f0() #0 {
+b0:
   ret i1 false
 }
 
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/remove_lsr.ll b/test/CodeGen/Hexagon/remove_lsr.ll
index 3b85c486348..dee384520e5 100644
--- a/test/CodeGen/Hexagon/remove_lsr.ll
+++ b/test/CodeGen/Hexagon/remove_lsr.ll
@@ -1,6 +1,6 @@
 ; Test fix for PR-13709.
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
-; CHECK: foo
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK: f0
 ; CHECK-NOT: lsr(r{{[0-9]+}}:{{[0-9]+}}, #32)
 ; CHECK-NOT: lsr(r{{[0-9]+}}:{{[0-9]+}}, #32)
 
@@ -13,64 +13,64 @@
 ; This makes the lsr instruction dead and it gets removed subsequently
 ; by a dead code removal pass.
 
-%union.vect64 = type { i64 }
-%union.vect32 = type { i32 }
 
-define void @foo(%union.vect64* nocapture %sss_extracted_bit_rx_data_ptr,
- %union.vect32* nocapture %s_even, %union.vect32* nocapture %s_odd,
- i8* nocapture %scr_s_even_code_ptr, i8* nocapture %scr_s_odd_code_ptr)
- nounwind {
-entry:
-  %scevgep = getelementptr %union.vect64, %union.vect64* %sss_extracted_bit_rx_data_ptr, i32 1
-  %scevgep28 = getelementptr %union.vect32, %union.vect32* %s_odd, i32 1
-  %scevgep32 = getelementptr %union.vect32, %union.vect32* %s_even, i32 1
-  %scevgep36 = getelementptr i8, i8* %scr_s_odd_code_ptr, i32 1
-  %scevgep39 = getelementptr i8, i8* %scr_s_even_code_ptr, i32 1
-  br label %for.body
+%s.0 = type { i64 }
+%s.1 = type { i32 }
 
-for.body:                                         ; preds = %for.body, %entry
-  %lsr.iv42 = phi i32 [ %lsr.iv.next, %for.body ], [ 2, %entry ]
-  %lsr.iv40 = phi i8* [ %scevgep41, %for.body ], [ %scevgep39, %entry ]
-  %lsr.iv37 = phi i8* [ %scevgep38, %for.body ], [ %scevgep36, %entry ]
-  %lsr.iv33 = phi %union.vect32* [ %scevgep34, %for.body ], [ %scevgep32, %entry ]
-  %lsr.iv29 = phi %union.vect32* [ %scevgep30, %for.body ], [ %scevgep28, %entry ]
-  %lsr.iv = phi %union.vect64* [ %scevgep26, %for.body ], [ %scevgep, %entry ]
-  %predicate_1.023 = phi i8 [ undef, %entry ], [ %10, %for.body ]
-  %predicate.022 = phi i8 [ undef, %entry ], [ %9, %for.body ]
-  %val.021 = phi i64 [ undef, %entry ], [ %srcval, %for.body ]
-  %lsr.iv3335 = bitcast %union.vect32* %lsr.iv33 to i32*
-  %lsr.iv2931 = bitcast %union.vect32* %lsr.iv29 to i32*
-  %lsr.iv27 = bitcast %union.vect64* %lsr.iv to i64*
-  %0 = tail call i64 @llvm.hexagon.A2.vsubhs(i64 0, i64 %val.021)
-  %conv3 = sext i8 %predicate.022 to i32
-  %1 = trunc i64 %val.021 to i32
-  %2 = trunc i64 %0 to i32
-  %3 = tail call i32 @llvm.hexagon.C2.mux(i32 %conv3, i32 %1, i32 %2)
-  store i32 %3, i32* %lsr.iv3335, align 4
-  %conv8 = sext i8 %predicate_1.023 to i32
-  %4 = lshr i64 %val.021, 32
-  %5 = trunc i64 %4 to i32
-  %6 = lshr i64 %0, 32
-  %7 = trunc i64 %6 to i32
-  %8 = tail call i32 @llvm.hexagon.C2.mux(i32 %conv8, i32 %5, i32 %7)
-  store i32 %8, i32* %lsr.iv2931, align 4
-  %srcval = load i64, i64* %lsr.iv27, align 8
-  %9 = load i8, i8* %lsr.iv40, align 1
-  %10 = load i8, i8* %lsr.iv37, align 1
-  %lftr.wideiv = trunc i32 %lsr.iv42 to i8
-  %exitcond = icmp eq i8 %lftr.wideiv, 32
-  %scevgep26 = getelementptr %union.vect64, %union.vect64* %lsr.iv, i32 1
-  %scevgep30 = getelementptr %union.vect32, %union.vect32* %lsr.iv29, i32 1
-  %scevgep34 = getelementptr %union.vect32, %union.vect32* %lsr.iv33, i32 1
-  %scevgep38 = getelementptr i8, i8* %lsr.iv37, i32 1
-  %scevgep41 = getelementptr i8, i8* %lsr.iv40, i32 1
-  %lsr.iv.next = add i32 %lsr.iv42, 1
-  br i1 %exitcond, label %for.end, label %for.body
+define void @f0(%s.0* nocapture %a0, %s.1* nocapture %a1, %s.1* nocapture %a2, i8* nocapture %a3, i8* nocapture %a4) #0 {
+b0:
+  %v0 = getelementptr %s.0, %s.0* %a0, i32 1
+  %v1 = getelementptr %s.1, %s.1* %a2, i32 1
+  %v2 = getelementptr %s.1, %s.1* %a1, i32 1
+  %v3 = getelementptr i8, i8* %a4, i32 1
+  %v4 = getelementptr i8, i8* %a3, i32 1
+  br label %b1
 
-for.end:                                          ; preds = %for.body
+b1:                                               ; preds = %b1, %b0
+  %v5 = phi i32 [ %v38, %b1 ], [ 2, %b0 ]
+  %v6 = phi i8* [ %v37, %b1 ], [ %v4, %b0 ]
+  %v7 = phi i8* [ %v36, %b1 ], [ %v3, %b0 ]
+  %v8 = phi %s.1* [ %v35, %b1 ], [ %v2, %b0 ]
+  %v9 = phi %s.1* [ %v34, %b1 ], [ %v1, %b0 ]
+  %v10 = phi %s.0* [ %v33, %b1 ], [ %v0, %b0 ]
+  %v11 = phi i8 [ undef, %b0 ], [ %v30, %b1 ]
+  %v12 = phi i8 [ undef, %b0 ], [ %v29, %b1 ]
+  %v13 = phi i64 [ undef, %b0 ], [ %v28, %b1 ]
+  %v14 = bitcast %s.1* %v8 to i32*
+  %v15 = bitcast %s.1* %v9 to i32*
+  %v16 = bitcast %s.0* %v10 to i64*
+  %v17 = tail call i64 @llvm.hexagon.A2.vsubhs(i64 0, i64 %v13)
+  %v18 = sext i8 %v12 to i32
+  %v19 = trunc i64 %v13 to i32
+  %v20 = trunc i64 %v17 to i32
+  %v21 = tail call i32 @llvm.hexagon.C2.mux(i32 %v18, i32 %v19, i32 %v20)
+  store i32 %v21, i32* %v14, align 4
+  %v22 = sext i8 %v11 to i32
+  %v23 = lshr i64 %v13, 32
+  %v24 = trunc i64 %v23 to i32
+  %v25 = lshr i64 %v17, 32
+  %v26 = trunc i64 %v25 to i32
+  %v27 = tail call i32 @llvm.hexagon.C2.mux(i32 %v22, i32 %v24, i32 %v26)
+  store i32 %v27, i32* %v15, align 4
+  %v28 = load i64, i64* %v16, align 8
+  %v29 = load i8, i8* %v6, align 1
+  %v30 = load i8, i8* %v7, align 1
+  %v31 = trunc i32 %v5 to i8
+  %v32 = icmp eq i8 %v31, 32
+  %v33 = getelementptr %s.0, %s.0* %v10, i32 1
+  %v34 = getelementptr %s.1, %s.1* %v9, i32 1
+  %v35 = getelementptr %s.1, %s.1* %v8, i32 1
+  %v36 = getelementptr i8, i8* %v7, i32 1
+  %v37 = getelementptr i8, i8* %v6, i32 1
+  %v38 = add i32 %v5, 1
+  br i1 %v32, label %b2, label %b1
+
+b2:                                               ; preds = %b1
   ret void
 }
 
-declare i64 @llvm.hexagon.A2.vsubhs(i64, i64) nounwind readnone
+declare i64 @llvm.hexagon.A2.vsubhs(i64, i64) #1
+declare i32 @llvm.hexagon.C2.mux(i32, i32, i32) #1
 
-declare i32 @llvm.hexagon.C2.mux(i32, i32, i32) nounwind readnone
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
+attributes #1 = { nounwind readnone "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/simpletailcall.ll b/test/CodeGen/Hexagon/simpletailcall.ll
index 287640489a5..76854bc1981 100644
--- a/test/CodeGen/Hexagon/simpletailcall.ll
+++ b/test/CodeGen/Hexagon/simpletailcall.ll
@@ -1,14 +1,16 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
-; CHECK: foo_empty
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK: f0
 ; CHECK-NOT: allocframe
 ; CHECK-NOT: memd(r29
-; CHECK: jump bar_empty
+; CHECK: jump f1
 
-define void @foo_empty(i32 %h) nounwind {
-entry:
-  %add = add nsw i32 %h, 3
-  %call = tail call i32 bitcast (i32 (...)* @bar_empty to i32 (i32)*)(i32 %add) nounwind
+define void @f0(i32 %a0) #0 {
+b0:
+  %v0 = add nsw i32 %a0, 3
+  %v1 = tail call i32 bitcast (i32 (...)* @f1 to i32 (i32)*)(i32 %v0) #0
   ret void
 }
 
-declare i32 @bar_empty(...)
+declare i32 @f1(...) #0
+
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/union-1.ll b/test/CodeGen/Hexagon/union-1.ll
index 8f2ff28b381..970ded79deb 100644
--- a/test/CodeGen/Hexagon/union-1.ll
+++ b/test/CodeGen/Hexagon/union-1.ll
@@ -1,19 +1,21 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
-; CHECK: word
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK: f0
 ; CHECK-NOT: combine(#0
-; CHECK: jump bar
+; CHECK: jump f1
 
-define void @word(i32* nocapture %a) nounwind {
-entry:
-  %0 = load i32, i32* %a, align 4
-  %1 = zext i32 %0 to i64
-  %add.ptr = getelementptr inbounds i32, i32* %a, i32 1
-  %2 = load i32, i32* %add.ptr, align 4
-  %3 = zext i32 %2 to i64
-  %4 = shl nuw i64 %3, 32
-  %ins = or i64 %4, %1
-  tail call void @bar(i64 %ins) nounwind
+define void @f0(i32* nocapture %a0) #0 {
+b0:
+  %v0 = load i32, i32* %a0, align 4
+  %v1 = zext i32 %v0 to i64
+  %v2 = getelementptr inbounds i32, i32* %a0, i32 1
+  %v3 = load i32, i32* %v2, align 4
+  %v4 = zext i32 %v3 to i64
+  %v5 = shl nuw i64 %v4, 32
+  %v6 = or i64 %v5, %v1
+  tail call void @f1(i64 %v6) #0
   ret void
 }
 
-declare void @bar(i64)
+declare void @f1(i64) #0
+
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/vaddh.ll b/test/CodeGen/Hexagon/vaddh.ll
index a4fb33de4ac..f139c288bb5 100644
--- a/test/CodeGen/Hexagon/vaddh.ll
+++ b/test/CodeGen/Hexagon/vaddh.ll
@@ -1,16 +1,19 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; RUN: llc -march=hexagon < %s | FileCheck %s
 ; CHECK: vaddh(r{{[0-9]+}},r{{[0-9]+}})
 
-@j = external global i32
-@k = external global i32
+@g0 = external global i32
+@g1 = external global i32
 
-define void @foo() nounwind {
-entry:
-  %0 = load i32, i32* @j, align 4
-  %1 = load i32, i32* @k, align 4
-  %2 = call i32 @llvm.hexagon.A2.svaddh(i32 %0, i32 %1)
-  store i32 %2, i32* @k, align 4
+define void @f0() #0 {
+b0:
+  %v0 = load i32, i32* @g0, align 4
+  %v1 = load i32, i32* @g1, align 4
+  %v2 = call i32 @llvm.hexagon.A2.svaddh(i32 %v0, i32 %v1)
+  store i32 %v2, i32* @g1, align 4
   ret void
 }
 
-declare i32 @llvm.hexagon.A2.svaddh(i32, i32) nounwind readnone
+declare i32 @llvm.hexagon.A2.svaddh(i32, i32) #1
+
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
+attributes #1 = { nounwind readnone "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/validate-offset.ll b/test/CodeGen/Hexagon/validate-offset.ll
index 8de006c80b1..ed98f281e4b 100644
--- a/test/CodeGen/Hexagon/validate-offset.ll
+++ b/test/CodeGen/Hexagon/validate-offset.ll
@@ -1,36 +1,38 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s -O0
+; RUN: llc -march=hexagon -O0 < %s
 
 ; This is a regression test which makes sure that the offset check
 ; is available for STRiw_indexed instruction. This is required
 ; by 'Hexagon Expand Predicate Spill Code' pass.
 
-define i32 @f(i32 %a, i32 %b) nounwind {
-entry:
-  %retval = alloca i32, align 4
-  %a.addr = alloca i32, align 4
-  %b.addr = alloca i32, align 4
-  store i32 %a, i32* %a.addr, align 4
-  store i32 %b, i32* %b.addr, align 4
-  %0 = load i32, i32* %a.addr, align 4
-  %1 = load i32, i32* %b.addr, align 4
-  %cmp = icmp sgt i32 %0, %1
-  br i1 %cmp, label %if.then, label %if.else
+define i32 @f0(i32 %a0, i32 %a1) #0 {
+b0:
+  %v0 = alloca i32, align 4
+  %v1 = alloca i32, align 4
+  %v2 = alloca i32, align 4
+  store i32 %a0, i32* %v1, align 4
+  store i32 %a1, i32* %v2, align 4
+  %v3 = load i32, i32* %v1, align 4
+  %v4 = load i32, i32* %v2, align 4
+  %v5 = icmp sgt i32 %v3, %v4
+  br i1 %v5, label %b1, label %b2
 
-if.then:
-  %2 = load i32, i32* %a.addr, align 4
-  %3 = load i32, i32* %b.addr, align 4
-  %add = add nsw i32 %2, %3
-  store i32 %add, i32* %retval
-  br label %return
+b1:                                               ; preds = %b0
+  %v6 = load i32, i32* %v1, align 4
+  %v7 = load i32, i32* %v2, align 4
+  %v8 = add nsw i32 %v6, %v7
+  store i32 %v8, i32* %v0
+  br label %b3
 
-if.else:
-  %4 = load i32, i32* %a.addr, align 4
-  %5 = load i32, i32* %b.addr, align 4
-  %sub = sub nsw i32 %4, %5
-  store i32 %sub, i32* %retval
-  br label %return
+b2:                                               ; preds = %b0
+  %v9 = load i32, i32* %v1, align 4
+  %v10 = load i32, i32* %v2, align 4
+  %v11 = sub nsw i32 %v9, %v10
+  store i32 %v11, i32* %v0
+  br label %b3
 
-return:
-  %6 = load i32, i32* %retval
-  ret i32 %6
+b3:                                               ; preds = %b2, %b1
+  %v12 = load i32, i32* %v0
+  ret i32 %v12
 }
+
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/MC/Hexagon/elf-flags.s b/test/MC/Hexagon/elf-flags.s
index 0d2f007cb3d..e5c4a8d93fe 100644
--- a/test/MC/Hexagon/elf-flags.s
+++ b/test/MC/Hexagon/elf-flags.s
@@ -1,10 +1,8 @@
-# RUN: llvm-mc -arch=hexagon -mcpu=hexagonv4 --filetype=obj %s -o - | llvm-readobj -file-headers -elf-output-style=GNU | FileCheck --check-prefix=CHECK-V4 %s
 # RUN: llvm-mc -arch=hexagon -mcpu=hexagonv5 --filetype=obj %s -o - | llvm-readobj -file-headers -elf-output-style=GNU | FileCheck --check-prefix=CHECK-V5 %s
 # RUN: llvm-mc -arch=hexagon -mcpu=hexagonv55 --filetype=obj %s -o - | llvm-readobj -file-headers -elf-output-style=GNU | FileCheck --check-prefix=CHECK-V55 %s
 # RUN: llvm-mc -arch=hexagon -mcpu=hexagonv60 --filetype=obj %s -o - | llvm-readobj -file-headers -elf-output-style=GNU | FileCheck --check-prefix=CHECK-V60 %s
 # RUN: llvm-mc -arch=hexagon -mcpu=hexagonv62 --filetype=obj %s -o - | llvm-readobj -file-headers -elf-output-style=GNU | FileCheck --check-prefix=CHECK-V62 %s
 
-# CHECK-V4: Flags: 0x3
 # CHECK-V5: Flags: 0x4
 # CHECK-V55: Flags: 0x5
 # CHECK-V60: Flags: 0x60
-- 
GitLab


From 69b6ec046a58fe2e4d2867d52e06ca77d4af13ea Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Fri, 19 Oct 2018 17:54:53 +0000
Subject: [PATCH 0322/1116] [InstCombine] use m_Neg() in dyn_castNegVal() to
 match vectors with undef elts

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344793 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/InstCombine/InstructionCombining.cpp |  5 +++--
 test/Transforms/InstCombine/sub.ll                  | 10 +++++-----
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
index ae7d08149c6..34a5e1955b6 100644
--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -747,8 +747,9 @@ Value *InstCombiner::SimplifySelectsFeedingBinaryOp(BinaryOperator &I,
 /// Given a 'sub' instruction, return the RHS of the instruction if the LHS is a
 /// constant zero (which is the 'negate' form).
 Value *InstCombiner::dyn_castNegVal(Value *V) const {
-  if (BinaryOperator::isNeg(V))
-    return BinaryOperator::getNegArgument(V);
+  Value *NegV;
+  if (match(V, m_Neg(m_Value(NegV))))
+    return NegV;
 
   // Constants can be considered to be negated values if they can be folded.
   if (ConstantInt *C = dyn_cast<ConstantInt>(V))
diff --git a/test/Transforms/InstCombine/sub.ll b/test/Transforms/InstCombine/sub.ll
index 299633b25ac..dd9fadf2023 100644
--- a/test/Transforms/InstCombine/sub.ll
+++ b/test/Transforms/InstCombine/sub.ll
@@ -132,7 +132,7 @@ define <2 x i32> @neg_nsw_sub_nsw_vec(<2 x i32> %x, <2 x i32> %y) {
 
 define <2 x i32> @neg_sub_vec_undef(<2 x i32> %x, <2 x i32> %y) {
 ; CHECK-LABEL: @neg_sub_vec_undef(
-; CHECK-NEXT:    [[R:%.*]] = add <2 x i32> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = add <2 x i32> [[Y:%.*]], [[X:%.*]]
 ; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
   %neg = sub <2 x i32> <i32 0, i32 undef>, %x
@@ -142,7 +142,7 @@ define <2 x i32> @neg_sub_vec_undef(<2 x i32> %x, <2 x i32> %y) {
 
 define <2 x i32> @neg_nsw_sub_vec_undef(<2 x i32> %x, <2 x i32> %y) {
 ; CHECK-LABEL: @neg_nsw_sub_vec_undef(
-; CHECK-NEXT:    [[R:%.*]] = add <2 x i32> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = add <2 x i32> [[Y:%.*]], [[X:%.*]]
 ; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
   %neg = sub nsw <2 x i32> <i32 undef, i32 0>, %x
@@ -152,7 +152,7 @@ define <2 x i32> @neg_nsw_sub_vec_undef(<2 x i32> %x, <2 x i32> %y) {
 
 define <2 x i32> @neg_sub_nsw_vec_undef(<2 x i32> %x, <2 x i32> %y) {
 ; CHECK-LABEL: @neg_sub_nsw_vec_undef(
-; CHECK-NEXT:    [[R:%.*]] = add <2 x i32> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = add <2 x i32> [[Y:%.*]], [[X:%.*]]
 ; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
   %neg = sub <2 x i32> <i32 undef, i32 0>, %x
@@ -160,11 +160,11 @@ define <2 x i32> @neg_sub_nsw_vec_undef(<2 x i32> %x, <2 x i32> %y) {
   ret <2 x i32> %r
 }
 
-; TODO: This should not drop 'nsw'.
+; This should not drop 'nsw'.
 
 define <2 x i32> @neg_nsw_sub_nsw_vec_undef(<2 x i32> %x, <2 x i32> %y) {
 ; CHECK-LABEL: @neg_nsw_sub_nsw_vec_undef(
-; CHECK-NEXT:    [[R:%.*]] = add <2 x i32> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = add nsw <2 x i32> [[Y:%.*]], [[X:%.*]]
 ; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
   %neg = sub nsw <2 x i32> <i32 0, i32 undef>, %x
-- 
GitLab


From 65b8c0fe7a58b803fdf8cea386bb04e87c47aaef Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Fri, 19 Oct 2018 17:57:53 +0000
Subject: [PATCH 0323/1116] [dwarfdump] Hide ranges in diff-mode.

llvm-dwarfdump --diff should not print DW_AT_ranges. This patch fixes
that.

Differential revision: https://reviews.llvm.org/D53353

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344794 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/DebugInfo/DWARF/DWARFDie.cpp                  |  4 +++-
 test/DebugInfo/X86/dwarfdump-ranges-unrelocated.s | 10 ++++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/lib/DebugInfo/DWARF/DWARFDie.cpp b/lib/DebugInfo/DWARF/DWARFDie.cpp
index 35567d0f67a..cf10c1134a7 100644
--- a/lib/DebugInfo/DWARF/DWARFDie.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDie.cpp
@@ -59,12 +59,14 @@ static void dumpRanges(const DWARFObject &Obj, raw_ostream &OS,
                        const DWARFAddressRangesVector &Ranges,
                        unsigned AddressSize, unsigned Indent,
                        const DIDumpOptions &DumpOpts) {
+  if (!DumpOpts.ShowAddresses)
+    return;
+
   ArrayRef<SectionName> SectionNames;
   if (DumpOpts.Verbose)
     SectionNames = Obj.getSectionNames();
 
   for (const DWARFAddressRange &R : Ranges) {
-
     OS << '\n';
     OS.indent(Indent);
     R.dump(OS, AddressSize);
diff --git a/test/DebugInfo/X86/dwarfdump-ranges-unrelocated.s b/test/DebugInfo/X86/dwarfdump-ranges-unrelocated.s
index 2bb46707cc8..d779dac1a41 100644
--- a/test/DebugInfo/X86/dwarfdump-ranges-unrelocated.s
+++ b/test/DebugInfo/X86/dwarfdump-ranges-unrelocated.s
@@ -21,6 +21,16 @@
 # BRIEF-NEXT:  [0x0000000000000000, 0x0000000000000002)
 # BRIEF-NEXT:  [0x0000000000000000, 0x0000000000000003))
 
+# RUN: llvm-dwarfdump -diff %t | FileCheck %s --check-prefix=DIFF
+# DIFF: DW_TAG_compile_unit
+# DIFF-NEXT: DW_AT_producer	()
+# DIFF-NEXT: DW_AT_language	(DW_LANG_C_plus_plus)
+# DIFF-NEXT: DW_AT_name	()
+# DIFF-NEXT: DW_AT_stmt_list	()
+# DIFF-NEXT: DW_AT_comp_dir	()
+# DIFF-NEXT: DW_AT_low_pc	()
+# DIFF-NEXT: DW_AT_ranges	()
+
 ## Asm code for testcase is a reduced and modified output from next
 ## invocation and source:
 # clang test.cpp -S -o test.s -gmlt -ffunction-sections
-- 
GitLab


From aff56dc8f1ecb9cef81f8168e3ebbe5c125da7d2 Mon Sep 17 00:00:00 2001
From: Thomas Lively <tlively@google.com>
Date: Fri, 19 Oct 2018 18:15:32 +0000
Subject: [PATCH 0324/1116] [ConstantFolding] Constant fold minimum and maximum
 intrinsics

Summary: Depends on D52764

Reviewers: aheejin, dschuff

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D52765

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344796 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Analysis/ConstantFolding.cpp         |  14 +++
 test/Analysis/ConstantFolding/min-max.ll | 136 +++++++++++++++++++++++
 2 files changed, 150 insertions(+)
 create mode 100644 test/Analysis/ConstantFolding/min-max.ll

diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp
index c73250a3845..9ae8f1728c2 100644
--- a/lib/Analysis/ConstantFolding.cpp
+++ b/lib/Analysis/ConstantFolding.cpp
@@ -1363,6 +1363,8 @@ bool llvm::canConstantFoldCallTo(ImmutableCallSite CS, const Function *F) {
   case Intrinsic::fabs:
   case Intrinsic::minnum:
   case Intrinsic::maxnum:
+  case Intrinsic::minimum:
+  case Intrinsic::maximum:
   case Intrinsic::log:
   case Intrinsic::log2:
   case Intrinsic::log10:
@@ -1912,6 +1914,18 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty,
           return ConstantFP::get(Ty->getContext(), maxnum(C1, C2));
         }
 
+        if (IntrinsicID == Intrinsic::minimum) {
+          const APFloat &C1 = Op1->getValueAPF();
+          const APFloat &C2 = Op2->getValueAPF();
+          return ConstantFP::get(Ty->getContext(), minimum(C1, C2));
+        }
+
+        if (IntrinsicID == Intrinsic::maximum) {
+          const APFloat &C1 = Op1->getValueAPF();
+          const APFloat &C2 = Op2->getValueAPF();
+          return ConstantFP::get(Ty->getContext(), maximum(C1, C2));
+        }
+
         if (!TLI)
           return nullptr;
         if ((Name == "pow" && TLI->has(LibFunc_pow)) ||
diff --git a/test/Analysis/ConstantFolding/min-max.ll b/test/Analysis/ConstantFolding/min-max.ll
new file mode 100644
index 00000000000..b872e4a1aca
--- /dev/null
+++ b/test/Analysis/ConstantFolding/min-max.ll
@@ -0,0 +1,136 @@
+; RUN: opt -instcombine -S -o - %s | FileCheck %s
+; Tests that constant folding of min and max operations works as expected.
+
+declare float @llvm.minnum.f32(float, float)
+declare float @llvm.maxnum.f32(float, float)
+declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>)
+declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>)
+
+declare float @llvm.minimum.f32(float, float)
+declare float @llvm.maximum.f32(float, float)
+declare <4 x float> @llvm.minimum.v4f32(<4 x float>, <4 x float>)
+declare <4 x float> @llvm.maximum.v4f32(<4 x float>, <4 x float>)
+
+; CHECK: define float @minnum_float() {
+define float @minnum_float() {
+  ; CHECK-NEXT: ret float 5.000000e+00
+  %1 = call float @llvm.minnum.f32(float 5.0, float 42.0)
+  ret float %1
+}
+
+; Check that minnum constant folds to propagate non-NaN or smaller argument
+; CHECK: define <4 x float> @minnum_float_vec() {
+define <4 x float> @minnum_float_vec() {
+  ; CHECK-NEXT: ret <4 x float> <float 0x7FF8000000000000, float 5.000000e+00,
+  ; CHECK-SAME:                  float 4.200000e+01, float 5.000000e+00>
+  %1 = call <4 x float> @llvm.minnum.v4f32(
+    <4 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000, float 42., float 42.>,
+    <4 x float> <float 0x7FF8000000000000, float 5., float 0x7FF8000000000000, float 5.>
+  )
+  ret <4 x float> %1
+}
+
+; Check that minnum constant folds to propagate one of its argument zeros
+; CHECK: define <4 x float> @minnum_float_zeros_vec() {
+define <4 x float> @minnum_float_zeros_vec() {
+  ; CHECK-NEXT: ret <4 x float> <float 0.000000e+00, float {{-?}}0.000000e+00,
+  ; CHECK-SAME:                  float {{-?}}0.000000e+00, float -0.000000e+00>
+  %1 = call <4 x float> @llvm.minnum.v4f32(
+    <4 x float> <float 0.0, float -0.0, float 0.0, float -0.0>,
+    <4 x float> <float 0.0, float 0.0, float -0.0, float -0.0>
+  )
+  ret <4 x float> %1
+}
+
+; CHECK: define float @maxnum_float() {
+define float @maxnum_float() {
+  ; CHECK-NEXT: ret float 4.200000e+01
+  %1 = call float @llvm.maxnum.f32(float 5.0, float 42.0)
+  ret float %1
+}
+
+; Check that maxnum constant folds to propagate non-NaN or greater argument
+; CHECK: define <4 x float> @maxnum_float_vec() {
+define <4 x float> @maxnum_float_vec() {
+  ; CHECK-NEXT: ret <4 x float> <float 0x7FF8000000000000, float 5.000000e+00,
+  ; CHECK-SAME:                  float 4.200000e+01, float 4.200000e+01>
+  %1 = call <4 x float> @llvm.maxnum.v4f32(
+    <4 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000, float 42., float 42.>,
+    <4 x float> <float 0x7FF8000000000000, float 5., float 0x7FF8000000000000, float 5.>
+  )
+  ret <4 x float> %1
+}
+
+; Check that maxnum constant folds to propagate one of its argument zeros
+; CHECK: define <4 x float> @maxnum_float_zeros_vec() {
+define <4 x float> @maxnum_float_zeros_vec() {
+  ; CHECK-NEXT: ret <4 x float> <float 0.000000e+00, float {{-?}}0.000000e+00,
+  ; CHECK-SAME:                  float {{-?}}0.000000e+00, float -0.000000e+00>
+  %1 = call <4 x float> @llvm.maxnum.v4f32(
+    <4 x float> <float 0.0, float -0.0, float 0.0, float -0.0>,
+    <4 x float> <float 0.0, float 0.0, float -0.0, float -0.0>
+  )
+  ret <4 x float> %1
+}
+
+; CHECK: define float @minimum_float() {
+define float @minimum_float() {
+  ; CHECK-NEXT: ret float 5.000000e+00
+  %1 = call float @llvm.minimum.f32(float 5.0, float 42.0)
+  ret float %1
+}
+
+; Check that minimum propagates its NaN or smaller argument
+; CHECK: define <4 x float> @minimum_float_vec() {
+define <4 x float> @minimum_float_vec() {
+  ; CHECK-NEXT: ret <4 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000,
+  ; CHECK-SAME:                  float 0x7FF8000000000000, float 5.000000e+00>
+  %1 = call <4 x float> @llvm.minimum.v4f32(
+    <4 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000, float 42., float 42.>,
+    <4 x float> <float 0x7FF8000000000000, float 5., float 0x7FF8000000000000, float 5.>
+  )
+  ret <4 x float> %1
+}
+
+; Check that minimum treats -0.0 as smaller than 0.0 while constant folding
+; CHECK: define <4 x float> @minimum_float_zeros_vec() {
+define <4 x float> @minimum_float_zeros_vec() {
+  ; CHECK-NEXT: ret <4 x float> <float 0.000000e+00, float -0.000000e+00,
+  ; CHECK-SAME:                  float -0.000000e+00, float -0.000000e+00>
+  %1 = call <4 x float> @llvm.minimum.v4f32(
+    <4 x float> <float 0.0, float -0.0, float 0.0, float -0.0>,
+    <4 x float> <float 0.0, float 0.0, float -0.0, float -0.0>
+  )
+  ret <4 x float> %1
+}
+
+; CHECK: define float @maximum_float() {
+define float @maximum_float() {
+  ; CHECK-NEXT: ret float 4.200000e+01
+  %1 = call float @llvm.maximum.f32(float 5.0, float 42.0)
+  ret float %1
+}
+
+; Check that maximum propagates its NaN or greater argument
+; CHECK: define <4 x float> @maximum_float_vec() {
+define <4 x float> @maximum_float_vec() {
+  ; CHECK-NEXT: ret <4 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000,
+  ; CHECK-SAME:                  float 0x7FF8000000000000, float 4.200000e+01>
+  %1 = call <4 x float> @llvm.maximum.v4f32(
+    <4 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000, float 42., float 42.>,
+    <4 x float> <float 0x7FF8000000000000, float 5., float 0x7FF8000000000000, float 5.>
+  )
+  ret <4 x float> %1
+}
+
+; Check that maximum treats -0.0 as smaller than 0.0 while constant folding
+; CHECK: define <4 x float> @maximum_float_zeros_vec() {
+define <4 x float> @maximum_float_zeros_vec() {
+  ; CHECK-NEXT: ret <4 x float> <float 0.000000e+00, float 0.000000e+00,
+  ; CHECK-SAME:                  float 0.000000e+00, float -0.000000e+00>
+  %1 = call <4 x float> @llvm.maximum.v4f32(
+    <4 x float> <float 0.0, float -0.0, float 0.0, float -0.0>,
+    <4 x float> <float 0.0, float 0.0, float -0.0, float -0.0>
+  )
+  ret <4 x float> %1
+}
-- 
GitLab


From 37f8c97acd3eb1b0b1f1dfa7d1b45ceda029acca Mon Sep 17 00:00:00 2001
From: Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>
Date: Fri, 19 Oct 2018 18:39:29 +0000
Subject: [PATCH 0325/1116] [llvm-mca] Remove a stale TODO comment. NFC

Starting from revision r344334, we can now describe optimizable
register-register moves in the machine scheduling models.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344797 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-mca/include/Instruction.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tools/llvm-mca/include/Instruction.h b/tools/llvm-mca/include/Instruction.h
index 0d7db11795e..ca84b86d70d 100644
--- a/tools/llvm-mca/include/Instruction.h
+++ b/tools/llvm-mca/include/Instruction.h
@@ -342,8 +342,6 @@ class Instruction {
   // This field is set for instructions that are candidates for move
   // elimination. For more information about move elimination, see the
   // definition of RegisterMappingTracker in RegisterFile.h
-  //
-  // TODO: Teach subtargets how to describe optimizable register moves.
   bool IsOptimizableMove;
 
   using UniqueDef = std::unique_ptr<WriteState>;
-- 
GitLab


From 9e79a4ff275325b155e277ee8eb16190d8cc0b9e Mon Sep 17 00:00:00 2001
From: Thomas Lively <tlively@google.com>
Date: Fri, 19 Oct 2018 19:01:26 +0000
Subject: [PATCH 0326/1116] [InstCombine] InstCombine and InstSimplify for
 minimum and maximum

Summary: Depends on D52765

Reviewers: aheejin, dschuff

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D52766

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344799 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Analysis/InstructionSimplify.cpp          |  25 +-
 lib/Analysis/ValueTracking.cpp                |   6 +
 .../InstCombine/InstCombineCalls.cpp          |  27 +-
 test/Transforms/InstCombine/maximum.ll        | 238 +++++++++++++
 test/Transforms/InstCombine/minimum.ll        | 263 +++++++++++++++
 .../InstSimplify/floating-point-arithmetic.ll | 317 +++++++++++++++++-
 .../InstSimplify/floating-point-compare.ll    |  14 +-
 test/Transforms/LICM/hoist-round.ll           |  12 +-
 test/Transforms/SimplifyCFG/speculate-math.ll |  36 ++
 9 files changed, 920 insertions(+), 18 deletions(-)
 create mode 100644 test/Transforms/InstCombine/maximum.ll
 create mode 100644 test/Transforms/InstCombine/minimum.ll

diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp
index 86f5652f830..6ff72638512 100644
--- a/lib/Analysis/InstructionSimplify.cpp
+++ b/lib/Analysis/InstructionSimplify.cpp
@@ -4827,13 +4827,24 @@ static Value *simplifyBinaryIntrinsic(Function *F, Value *Op0, Value *Op1,
     }
     break;
   case Intrinsic::maxnum:
-  case Intrinsic::minnum: {
+  case Intrinsic::minnum:
+  case Intrinsic::maximum:
+  case Intrinsic::minimum: {
     // If the arguments are the same, this is a no-op.
     if (Op0 == Op1) return Op0;
 
-    // If one argument is NaN or undef, return the other argument.
-    if (match(Op0, m_CombineOr(m_NaN(), m_Undef()))) return Op1;
-    if (match(Op1, m_CombineOr(m_NaN(), m_Undef()))) return Op0;
+    // If one argument is undef, return the other argument.
+    if (match(Op0, m_Undef()))
+      return Op1;
+    if (match(Op1, m_Undef()))
+      return Op0;
+
+    // If one argument is NaN, return other or NaN appropriately.
+    bool PropagateNaN = IID == Intrinsic::minimum || IID == Intrinsic::maximum;
+    if (match(Op0, m_NaN()))
+      return PropagateNaN ? Op0 : Op1;
+    if (match(Op1, m_NaN()))
+      return PropagateNaN ? Op1 : Op0;
 
     // Min/max of the same operation with common operand:
     // m(m(X, Y)), X --> m(X, Y) (4 commuted variants)
@@ -4846,9 +4857,9 @@ static Value *simplifyBinaryIntrinsic(Function *F, Value *Op0, Value *Op1,
           (M1->getOperand(0) == Op0 || M1->getOperand(1) == Op0))
         return Op1;
 
-    // minnum(X, -Inf) --> -Inf (and commuted variant)
-    // maxnum(X, +Inf) --> +Inf (and commuted variant)
-    bool UseNegInf = IID == Intrinsic::minnum;
+    // min(X, -Inf) --> -Inf (and commuted variant)
+    // max(X, +Inf) --> +Inf (and commuted variant)
+    bool UseNegInf = IID == Intrinsic::minnum || IID == Intrinsic::minimum;
     const APFloat *C;
     if ((match(Op0, m_APFloat(C)) && C->isInfinity() &&
          C->isNegative() == UseNegInf) ||
diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp
index 02d071717f0..b7ff81f9d54 100644
--- a/lib/Analysis/ValueTracking.cpp
+++ b/lib/Analysis/ValueTracking.cpp
@@ -2898,7 +2898,13 @@ static bool cannotBeOrderedLessThanZeroImpl(const Value *V,
               cannotBeOrderedLessThanZeroImpl(I->getOperand(1), TLI,
                                               SignBitOnly, Depth + 1));
 
+    case Intrinsic::maximum:
+      return cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI, SignBitOnly,
+                                             Depth + 1) ||
+             cannotBeOrderedLessThanZeroImpl(I->getOperand(1), TLI, SignBitOnly,
+                                             Depth + 1);
     case Intrinsic::minnum:
+    case Intrinsic::minimum:
       return cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI, SignBitOnly,
                                              Depth + 1) &&
              cannotBeOrderedLessThanZeroImpl(I->getOperand(1), TLI, SignBitOnly,
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 6d2ac2274de..116b11386f0 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -2020,7 +2020,9 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
   }
 
   case Intrinsic::minnum:
-  case Intrinsic::maxnum: {
+  case Intrinsic::maxnum:
+  case Intrinsic::minimum:
+  case Intrinsic::maximum: {
     Value *Arg0 = II->getArgOperand(0);
     Value *Arg1 = II->getArgOperand(1);
     // Canonicalize constants to the RHS.
@@ -2034,10 +2036,25 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
     if (match(Arg0, m_FNeg(m_Value(X))) && match(Arg1, m_FNeg(m_Value(Y))) &&
         (Arg0->hasOneUse() || Arg1->hasOneUse())) {
       // If both operands are negated, invert the call and negate the result:
-      // minnum(-X, -Y) --> -(maxnum(X, Y))
-      // maxnum(-X, -Y) --> -(minnum(X, Y))
-      Intrinsic::ID NewIID = II->getIntrinsicID() == Intrinsic::maxnum ?
-          Intrinsic::minnum : Intrinsic::maxnum;
+      // min(-X, -Y) --> -(max(X, Y))
+      // max(-X, -Y) --> -(min(X, Y))
+      Intrinsic::ID NewIID;
+      switch (II->getIntrinsicID()) {
+      case Intrinsic::maxnum:
+        NewIID = Intrinsic::minnum;
+        break;
+      case Intrinsic::minnum:
+        NewIID = Intrinsic::maxnum;
+        break;
+      case Intrinsic::maximum:
+        NewIID = Intrinsic::minimum;
+        break;
+      case Intrinsic::minimum:
+        NewIID = Intrinsic::maximum;
+        break;
+      default:
+        llvm_unreachable("unexpected intrinsic ID");
+      }
       Value *NewCall = Builder.CreateBinaryIntrinsic(NewIID, X, Y, II);
       Instruction *FNeg = BinaryOperator::CreateFNeg(NewCall);
       FNeg->copyIRFlags(II);
diff --git a/test/Transforms/InstCombine/maximum.ll b/test/Transforms/InstCombine/maximum.ll
new file mode 100644
index 00000000000..302b21cf626
--- /dev/null
+++ b/test/Transforms/InstCombine/maximum.ll
@@ -0,0 +1,238 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+declare float @llvm.maximum.f32(float, float)
+declare <2 x float> @llvm.maximum.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.maximum.v4f32(<4 x float>, <4 x float>)
+
+declare double @llvm.maximum.f64(double, double)
+declare <2 x double> @llvm.maximum.v2f64(<2 x double>, <2 x double>)
+
+define float @constant_fold_maximum_f32() {
+; CHECK-LABEL: @constant_fold_maximum_f32(
+; CHECK-NEXT:    ret float 2.000000e+00
+;
+  %x = call float @llvm.maximum.f32(float 1.0, float 2.0)
+  ret float %x
+}
+
+define float @constant_fold_maximum_f32_inv() {
+; CHECK-LABEL: @constant_fold_maximum_f32_inv(
+; CHECK-NEXT:    ret float 2.000000e+00
+;
+  %x = call float @llvm.maximum.f32(float 2.0, float 1.0)
+  ret float %x
+}
+
+define float @constant_fold_maximum_f32_nan0() {
+; CHECK-LABEL: @constant_fold_maximum_f32_nan0(
+; CHECK-NEXT:    ret float 0x7FF8000000000000
+;
+  %x = call float @llvm.maximum.f32(float 0x7FF8000000000000, float 2.0)
+  ret float %x
+}
+
+define float @constant_fold_maximum_f32_nan1() {
+; CHECK-LABEL: @constant_fold_maximum_f32_nan1(
+; CHECK-NEXT:    ret float 0x7FF8000000000000
+;
+  %x = call float @llvm.maximum.f32(float 2.0, float 0x7FF8000000000000)
+  ret float %x
+}
+
+define float @constant_fold_maximum_f32_nan_nan() {
+; CHECK-LABEL: @constant_fold_maximum_f32_nan_nan(
+; CHECK-NEXT:    ret float 0x7FF8000000000000
+;
+  %x = call float @llvm.maximum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000)
+  ret float %x
+}
+
+define float @constant_fold_maximum_f32_p0_p0() {
+; CHECK-LABEL: @constant_fold_maximum_f32_p0_p0(
+; CHECK-NEXT:    ret float 0.000000e+00
+;
+  %x = call float @llvm.maximum.f32(float 0.0, float 0.0)
+  ret float %x
+}
+
+define float @constant_fold_maximum_f32_p0_n0() {
+; CHECK-LABEL: @constant_fold_maximum_f32_p0_n0(
+; CHECK-NEXT:    ret float 0.000000e+00
+;
+  %x = call float @llvm.maximum.f32(float 0.0, float -0.0)
+  ret float %x
+}
+
+define float @constant_fold_maximum_f32_n0_p0() {
+; CHECK-LABEL: @constant_fold_maximum_f32_n0_p0(
+; CHECK-NEXT:    ret float 0.000000e+00
+;
+  %x = call float @llvm.maximum.f32(float -0.0, float 0.0)
+  ret float %x
+}
+
+define float @constant_fold_maximum_f32_n0_n0() {
+; CHECK-LABEL: @constant_fold_maximum_f32_n0_n0(
+; CHECK-NEXT:    ret float -0.000000e+00
+;
+  %x = call float @llvm.maximum.f32(float -0.0, float -0.0)
+  ret float %x
+}
+
+define <4 x float> @constant_fold_maximum_v4f32() {
+; CHECK-LABEL: @constant_fold_maximum_v4f32(
+; CHECK-NEXT:    ret <4 x float> <float 2.000000e+00, float 8.000000e+00, float 1.000000e+01, float 9.000000e+00>
+;
+  %x = call <4 x float> @llvm.maximum.v4f32(<4 x float> <float 1.0, float 8.0, float 3.0, float 9.0>, <4 x float> <float 2.0, float 2.0, float 10.0, float 5.0>)
+  ret <4 x float> %x
+}
+
+define double @constant_fold_maximum_f64() {
+; CHECK-LABEL: @constant_fold_maximum_f64(
+; CHECK-NEXT:    ret double 2.000000e+00
+;
+  %x = call double @llvm.maximum.f64(double 1.0, double 2.0)
+  ret double %x
+}
+
+define double @constant_fold_maximum_f64_nan0() {
+; CHECK-LABEL: @constant_fold_maximum_f64_nan0(
+; CHECK-NEXT:    ret double 0x7FF8000000000000
+;
+  %x = call double @llvm.maximum.f64(double 0x7FF8000000000000, double 2.0)
+  ret double %x
+}
+
+define double @constant_fold_maximum_f64_nan1() {
+; CHECK-LABEL: @constant_fold_maximum_f64_nan1(
+; CHECK-NEXT:    ret double 0x7FF8000000000000
+;
+  %x = call double @llvm.maximum.f64(double 2.0, double 0x7FF8000000000000)
+  ret double %x
+}
+
+define double @constant_fold_maximum_f64_nan_nan() {
+; CHECK-LABEL: @constant_fold_maximum_f64_nan_nan(
+; CHECK-NEXT:    ret double 0x7FF8000000000000
+;
+  %x = call double @llvm.maximum.f64(double 0x7FF8000000000000, double 0x7FF8000000000000)
+  ret double %x
+}
+
+define float @canonicalize_constant_maximum_f32(float %x) {
+; CHECK-LABEL: @canonicalize_constant_maximum_f32(
+; CHECK-NEXT:    [[Y:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float 1.000000e+00)
+; CHECK-NEXT:    ret float [[Y]]
+;
+  %y = call float @llvm.maximum.f32(float 1.0, float %x)
+  ret float %y
+}
+
+define float @maximum_f32_nan_val(float %x) {
+; CHECK-LABEL: @maximum_f32_nan_val(
+; CHECK-NEXT:    ret float 0x7FF8000000000000
+;
+  %y = call float @llvm.maximum.f32(float 0x7FF8000000000000, float %x)
+  ret float %y
+}
+
+define float @maximum_f32_val_nan(float %x) {
+; CHECK-LABEL: @maximum_f32_val_nan(
+; CHECK-NEXT:    ret float 0x7FF8000000000000
+;
+  %y = call float @llvm.maximum.f32(float %x, float 0x7FF8000000000000)
+  ret float %y
+}
+
+define float @maximum4(float %x, float %y, float %z, float %w) {
+; CHECK-LABEL: @maximum4(
+; CHECK-NEXT:    [[A:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float [[Y:%.*]])
+; CHECK-NEXT:    [[B:%.*]] = call float @llvm.maximum.f32(float [[Z:%.*]], float [[W:%.*]])
+; CHECK-NEXT:    [[C:%.*]] = call float @llvm.maximum.f32(float [[A]], float [[B]])
+; CHECK-NEXT:    ret float [[C]]
+;
+  %a = call float @llvm.maximum.f32(float %x, float %y)
+  %b = call float @llvm.maximum.f32(float %z, float %w)
+  %c = call float @llvm.maximum.f32(float %a, float %b)
+  ret float %c
+}
+
+; PR37404 - https://bugs.llvm.org/show_bug.cgi?id=37404
+
+define <2 x float> @neg_neg(<2 x float> %x, <2 x float> %y) {
+; CHECK-LABEL: @neg_neg(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x float> @llvm.minimum.v2f32(<2 x float> [[X:%.*]], <2 x float> [[Y:%.*]])
+; CHECK-NEXT:    [[R:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, [[TMP1]]
+; CHECK-NEXT:    ret <2 x float> [[R]]
+;
+  %negx = fsub <2 x float> <float -0.0, float -0.0>, %x
+  %negy = fsub <2 x float> <float -0.0, float -0.0>, %y
+  %r = call <2 x float> @llvm.maximum.v2f32(<2 x float> %negx, <2 x float> %negy)
+  ret <2 x float> %r
+}
+
+; FMF is not required, but it should be propagated from the intrinsic (not the fnegs).
+
+define float @neg_neg_vec_fmf(float %x, float %y) {
+; CHECK-LABEL: @neg_neg_vec_fmf(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast float @llvm.minimum.f32(float [[X:%.*]], float [[Y:%.*]])
+; CHECK-NEXT:    [[R:%.*]] = fsub fast float -0.000000e+00, [[TMP1]]
+; CHECK-NEXT:    ret float [[R]]
+;
+  %negx = fsub arcp float -0.0, %x
+  %negy = fsub afn float -0.0, %y
+  %r = call fast float @llvm.maximum.f32(float %negx, float %negy)
+  ret float %r
+}
+
+; 1 extra use of an intermediate value should still allow the fold,
+; but 2 would require more instructions than we started with.
+
+declare void @use(float)
+define float @neg_neg_extra_use_x(float %x, float %y) {
+; CHECK-LABEL: @neg_neg_extra_use_x(
+; CHECK-NEXT:    [[NEGX:%.*]] = fsub float -0.000000e+00, [[X:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.minimum.f32(float [[X]], float [[Y:%.*]])
+; CHECK-NEXT:    [[R:%.*]] = fsub float -0.000000e+00, [[TMP1]]
+; CHECK-NEXT:    call void @use(float [[NEGX]])
+; CHECK-NEXT:    ret float [[R]]
+;
+  %negx = fsub float -0.0, %x
+  %negy = fsub float -0.0, %y
+  %r = call float @llvm.maximum.f32(float %negx, float %negy)
+  call void @use(float %negx)
+  ret float %r
+}
+
+define float @neg_neg_extra_use_y(float %x, float %y) {
+; CHECK-LABEL: @neg_neg_extra_use_y(
+; CHECK-NEXT:    [[NEGY:%.*]] = fsub float -0.000000e+00, [[Y:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float [[Y]])
+; CHECK-NEXT:    [[R:%.*]] = fsub float -0.000000e+00, [[TMP1]]
+; CHECK-NEXT:    call void @use(float [[NEGY]])
+; CHECK-NEXT:    ret float [[R]]
+;
+  %negx = fsub float -0.0, %x
+  %negy = fsub float -0.0, %y
+  %r = call float @llvm.maximum.f32(float %negx, float %negy)
+  call void @use(float %negy)
+  ret float %r
+}
+
+define float @neg_neg_extra_use_x_and_y(float %x, float %y) {
+; CHECK-LABEL: @neg_neg_extra_use_x_and_y(
+; CHECK-NEXT:    [[NEGX:%.*]] = fsub float -0.000000e+00, [[X:%.*]]
+; CHECK-NEXT:    [[NEGY:%.*]] = fsub float -0.000000e+00, [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = call float @llvm.maximum.f32(float [[NEGX]], float [[NEGY]])
+; CHECK-NEXT:    call void @use(float [[NEGX]])
+; CHECK-NEXT:    call void @use(float [[NEGY]])
+; CHECK-NEXT:    ret float [[R]]
+;
+  %negx = fsub float -0.0, %x
+  %negy = fsub float -0.0, %y
+  %r = call float @llvm.maximum.f32(float %negx, float %negy)
+  call void @use(float %negx)
+  call void @use(float %negy)
+  ret float %r
+}
diff --git a/test/Transforms/InstCombine/minimum.ll b/test/Transforms/InstCombine/minimum.ll
new file mode 100644
index 00000000000..858a3c1d377
--- /dev/null
+++ b/test/Transforms/InstCombine/minimum.ll
@@ -0,0 +1,263 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+declare float @llvm.minimum.f32(float, float)
+declare float @llvm.minimum.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.minimum.v4f32(<4 x float>, <4 x float>)
+
+declare double @llvm.minimum.f64(double, double)
+declare <2 x double> @llvm.minimum.v2f64(<2 x double>, <2 x double>)
+
+declare float @llvm.maximum.f32(float, float)
+
+define float @constant_fold_minimum_f32() {
+; CHECK-LABEL: @constant_fold_minimum_f32(
+; CHECK-NEXT:    ret float 1.000000e+00
+;
+  %x = call float @llvm.minimum.f32(float 1.0, float 2.0)
+  ret float %x
+}
+
+define float @constant_fold_minimum_f32_inv() {
+; CHECK-LABEL: @constant_fold_minimum_f32_inv(
+; CHECK-NEXT:    ret float 1.000000e+00
+;
+  %x = call float @llvm.minimum.f32(float 2.0, float 1.0)
+  ret float %x
+}
+
+define float @constant_fold_minimum_f32_nan0() {
+; CHECK-LABEL: @constant_fold_minimum_f32_nan0(
+; CHECK-NEXT:    ret float 0x7FF8000000000000
+;
+  %x = call float @llvm.minimum.f32(float 0x7FF8000000000000, float 2.0)
+  ret float %x
+}
+
+define float @constant_fold_minimum_f32_nan1() {
+; CHECK-LABEL: @constant_fold_minimum_f32_nan1(
+; CHECK-NEXT:    ret float 0x7FF8000000000000
+;
+  %x = call float @llvm.minimum.f32(float 2.0, float 0x7FF8000000000000)
+  ret float %x
+}
+
+define float @constant_fold_minimum_f32_nan_nan() {
+; CHECK-LABEL: @constant_fold_minimum_f32_nan_nan(
+; CHECK-NEXT:    ret float 0x7FF8000000000000
+;
+  %x = call float @llvm.minimum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000)
+  ret float %x
+}
+
+define float @constant_fold_minimum_f32_p0_p0() {
+; CHECK-LABEL: @constant_fold_minimum_f32_p0_p0(
+; CHECK-NEXT:    ret float 0.000000e+00
+;
+  %x = call float @llvm.minimum.f32(float 0.0, float 0.0)
+  ret float %x
+}
+
+define float @constant_fold_minimum_f32_p0_n0() {
+; CHECK-LABEL: @constant_fold_minimum_f32_p0_n0(
+; CHECK-NEXT:    ret float -0.000000e+00
+;
+  %x = call float @llvm.minimum.f32(float 0.0, float -0.0)
+  ret float %x
+}
+
+define float @constant_fold_minimum_f32_n0_p0() {
+; CHECK-LABEL: @constant_fold_minimum_f32_n0_p0(
+; CHECK-NEXT:    ret float -0.000000e+00
+;
+  %x = call float @llvm.minimum.f32(float -0.0, float 0.0)
+  ret float %x
+}
+
+define float @constant_fold_minimum_f32_n0_n0() {
+; CHECK-LABEL: @constant_fold_minimum_f32_n0_n0(
+; CHECK-NEXT:    ret float -0.000000e+00
+;
+  %x = call float @llvm.minimum.f32(float -0.0, float -0.0)
+  ret float %x
+}
+
+define <4 x float> @constant_fold_minimum_v4f32() {
+; CHECK-LABEL: @constant_fold_minimum_v4f32(
+; CHECK-NEXT:    ret <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 5.000000e+00>
+;
+  %x = call <4 x float> @llvm.minimum.v4f32(<4 x float> <float 1.0, float 8.0, float 3.0, float 9.0>, <4 x float> <float 2.0, float 2.0, float 10.0, float 5.0>)
+  ret <4 x float> %x
+}
+
+define double @constant_fold_minimum_f64() {
+; CHECK-LABEL: @constant_fold_minimum_f64(
+; CHECK-NEXT:    ret double 1.000000e+00
+;
+  %x = call double @llvm.minimum.f64(double 1.0, double 2.0)
+  ret double %x
+}
+
+define double @constant_fold_minimum_f64_nan0() {
+; CHECK-LABEL: @constant_fold_minimum_f64_nan0(
+; CHECK-NEXT:    ret double 0x7FF8000000000000
+;
+  %x = call double @llvm.minimum.f64(double 0x7FF8000000000000, double 2.0)
+  ret double %x
+}
+
+define double @constant_fold_minimum_f64_nan1() {
+; CHECK-LABEL: @constant_fold_minimum_f64_nan1(
+; CHECK-NEXT:    ret double 0x7FF8000000000000
+;
+  %x = call double @llvm.minimum.f64(double 2.0, double 0x7FF8000000000000)
+  ret double %x
+}
+
+define double @constant_fold_minimum_f64_nan_nan() {
+; CHECK-LABEL: @constant_fold_minimum_f64_nan_nan(
+; CHECK-NEXT:    ret double 0x7FF8000000000000
+;
+  %x = call double @llvm.minimum.f64(double 0x7FF8000000000000, double 0x7FF8000000000000)
+  ret double %x
+}
+
+define float @canonicalize_constant_minimum_f32(float %x) {
+; CHECK-LABEL: @canonicalize_constant_minimum_f32(
+; CHECK-NEXT:    [[Y:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float 1.000000e+00)
+; CHECK-NEXT:    ret float [[Y]]
+;
+  %y = call float @llvm.minimum.f32(float 1.0, float %x)
+  ret float %y
+}
+
+define float @minimum_f32_nan_val(float %x) {
+; CHECK-LABEL: @minimum_f32_nan_val(
+; CHECK-NEXT:    ret float 0x7FF8000000000000
+;
+  %y = call float @llvm.minimum.f32(float 0x7FF8000000000000, float %x)
+  ret float %y
+}
+
+define float @minimum_f32_val_nan(float %x) {
+; CHECK-LABEL: @minimum_f32_val_nan(
+; CHECK-NEXT:    ret float 0x7FF8000000000000
+;
+  %y = call float @llvm.minimum.f32(float %x, float 0x7FF8000000000000)
+  ret float %y
+}
+
+define float @minimum4(float %x, float %y, float %z, float %w) {
+; CHECK-LABEL: @minimum4(
+; CHECK-NEXT:    [[A:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float [[Y:%.*]])
+; CHECK-NEXT:    [[B:%.*]] = call float @llvm.minimum.f32(float [[Z:%.*]], float [[W:%.*]])
+; CHECK-NEXT:    [[C:%.*]] = call float @llvm.minimum.f32(float [[A]], float [[B]])
+; CHECK-NEXT:    ret float [[C]]
+;
+  %a = call float @llvm.minimum.f32(float %x, float %y)
+  %b = call float @llvm.minimum.f32(float %z, float %w)
+  %c = call float @llvm.minimum.f32(float %a, float %b)
+  ret float %c
+}
+
+define float @minimum_x_maximum_x_y(float %x, float %y) {
+; CHECK-LABEL: @minimum_x_maximum_x_y(
+; CHECK-NEXT:    [[A:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float [[Y:%.*]])
+; CHECK-NEXT:    [[B:%.*]] = call float @llvm.minimum.f32(float [[X]], float [[A]])
+; CHECK-NEXT:    ret float [[B]]
+;
+  %a = call float @llvm.maximum.f32(float %x, float %y)
+  %b = call float @llvm.minimum.f32(float %x, float %a)
+  ret float %b
+}
+
+define float @maximum_x_minimum_x_y(float %x, float %y) {
+; CHECK-LABEL: @maximum_x_minimum_x_y(
+; CHECK-NEXT:    [[A:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float [[Y:%.*]])
+; CHECK-NEXT:    [[B:%.*]] = call float @llvm.maximum.f32(float [[X]], float [[A]])
+; CHECK-NEXT:    ret float [[B]]
+;
+  %a = call float @llvm.minimum.f32(float %x, float %y)
+  %b = call float @llvm.maximum.f32(float %x, float %a)
+  ret float %b
+}
+
+; PR37405 - https://bugs.llvm.org/show_bug.cgi?id=37405
+
+define double @neg_neg(double %x, double %y) {
+; CHECK-LABEL: @neg_neg(
+; CHECK-NEXT:    [[TMP1:%.*]] = call double @llvm.maximum.f64(double [[X:%.*]], double [[Y:%.*]])
+; CHECK-NEXT:    [[R:%.*]] = fsub double -0.000000e+00, [[TMP1]]
+; CHECK-NEXT:    ret double [[R]]
+;
+  %negx = fsub double -0.0, %x
+  %negy = fsub double -0.0, %y
+  %r = call double @llvm.minimum.f64(double %negx, double %negy)
+  ret double %r
+}
+
+; FMF is not required, but it should be propagated from the intrinsic (not the fnegs).
+; Also, make sure this works with vectors.
+
+define <2 x double> @neg_neg_vec_fmf(<2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: @neg_neg_vec_fmf(
+; CHECK-NEXT:    [[TMP1:%.*]] = call nnan ninf <2 x double> @llvm.maximum.v2f64(<2 x double> [[X:%.*]], <2 x double> [[Y:%.*]])
+; CHECK-NEXT:    [[R:%.*]] = fsub nnan ninf <2 x double> <double -0.000000e+00, double -0.000000e+00>, [[TMP1]]
+; CHECK-NEXT:    ret <2 x double> [[R]]
+;
+  %negx = fsub reassoc <2 x double> <double -0.0, double -0.0>, %x
+  %negy = fsub fast <2 x double> <double -0.0, double -0.0>, %y
+  %r = call nnan ninf <2 x double> @llvm.minimum.v2f64(<2 x double> %negx, <2 x double> %negy)
+  ret <2 x double> %r
+}
+
+; 1 extra use of an intermediate value should still allow the fold,
+; but 2 would require more instructions than we started with.
+
+declare void @use(double)
+define double @neg_neg_extra_use_x(double %x, double %y) {
+; CHECK-LABEL: @neg_neg_extra_use_x(
+; CHECK-NEXT:    [[NEGX:%.*]] = fsub double -0.000000e+00, [[X:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call double @llvm.maximum.f64(double [[X]], double [[Y:%.*]])
+; CHECK-NEXT:    [[R:%.*]] = fsub double -0.000000e+00, [[TMP1]]
+; CHECK-NEXT:    call void @use(double [[NEGX]])
+; CHECK-NEXT:    ret double [[R]]
+;
+  %negx = fsub double -0.0, %x
+  %negy = fsub double -0.0, %y
+  %r = call double @llvm.minimum.f64(double %negx, double %negy)
+  call void @use(double %negx)
+  ret double %r
+}
+
+define double @neg_neg_extra_use_y(double %x, double %y) {
+; CHECK-LABEL: @neg_neg_extra_use_y(
+; CHECK-NEXT:    [[NEGY:%.*]] = fsub double -0.000000e+00, [[Y:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call double @llvm.maximum.f64(double [[X:%.*]], double [[Y]])
+; CHECK-NEXT:    [[R:%.*]] = fsub double -0.000000e+00, [[TMP1]]
+; CHECK-NEXT:    call void @use(double [[NEGY]])
+; CHECK-NEXT:    ret double [[R]]
+;
+  %negx = fsub double -0.0, %x
+  %negy = fsub double -0.0, %y
+  %r = call double @llvm.minimum.f64(double %negx, double %negy)
+  call void @use(double %negy)
+  ret double %r
+}
+
+define double @neg_neg_extra_use_x_and_y(double %x, double %y) {
+; CHECK-LABEL: @neg_neg_extra_use_x_and_y(
+; CHECK-NEXT:    [[NEGX:%.*]] = fsub double -0.000000e+00, [[X:%.*]]
+; CHECK-NEXT:    [[NEGY:%.*]] = fsub double -0.000000e+00, [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = call double @llvm.minimum.f64(double [[NEGX]], double [[NEGY]])
+; CHECK-NEXT:    call void @use(double [[NEGX]])
+; CHECK-NEXT:    call void @use(double [[NEGY]])
+; CHECK-NEXT:    ret double [[R]]
+;
+  %negx = fsub double -0.0, %x
+  %negy = fsub double -0.0, %y
+  %r = call double @llvm.minimum.f64(double %negx, double %negy)
+  call void @use(double %negx)
+  call void @use(double %negy)
+  ret double %r
+}
diff --git a/test/Transforms/InstSimplify/floating-point-arithmetic.ll b/test/Transforms/InstSimplify/floating-point-arithmetic.ll
index 6b6ae48f516..acc24d9ba60 100644
--- a/test/Transforms/InstSimplify/floating-point-arithmetic.ll
+++ b/test/Transforms/InstSimplify/floating-point-arithmetic.ll
@@ -790,6 +790,322 @@ define float @maxnum_neginf(float %x) {
   ret float %val
 }
 
+declare float @llvm.minimum.f32(float, float)
+declare float @llvm.maximum.f32(float, float)
+declare double @llvm.minimum.f64(double, double)
+declare double @llvm.maximum.f64(double, double)
+declare <2 x double> @llvm.minimum.v2f64(<2 x double>, <2 x double>)
+declare <2 x double> @llvm.maximum.v2f64(<2 x double>, <2 x double>)
+
+; From the LangRef for minimum/maximum:
+; "If either operand is a NaN, returns NaN."
+
+define double @maximum_nan_op0(double %x) {
+; CHECK-LABEL: @maximum_nan_op0(
+; CHECK-NEXT:    ret double 0x7FF8000000000000
+;
+  %r = call double @llvm.maximum.f64(double 0x7ff8000000000000, double %x)
+  ret double %r
+}
+
+define double @maximum_nan_op1(double %x) {
+; CHECK-LABEL: @maximum_nan_op1(
+; CHECK-NEXT:    ret double 0x7FF800000000DEAD
+;
+  %r = call double @llvm.maximum.f64(double %x, double 0x7ff800000000dead)
+  ret double %r
+}
+
+define double @minimum_nan_op0(double %x) {
+; CHECK-LABEL: @minimum_nan_op0(
+; CHECK-NEXT:    ret double 0x7FF8000DEAD00000
+;
+  %r = call double @llvm.minimum.f64(double 0x7ff8000dead00000, double %x)
+  ret double %r
+}
+
+define double @minimum_nan_op1(double %x) {
+; CHECK-LABEL: @minimum_nan_op1(
+; CHECK-NEXT:    ret double 0x7FF800DEAD00DEAD
+;
+  %r = call double @llvm.minimum.f64(double %x, double 0x7ff800dead00dead)
+  ret double %r
+}
+
+define <2 x double> @maximum_nan_op0_vec(<2 x double> %x) {
+; CHECK-LABEL: @maximum_nan_op0_vec(
+; CHECK-NEXT:    ret <2 x double> <double 0x7FF8000000000000, double undef>
+;
+  %r = call <2 x double> @llvm.maximum.v2f64(<2 x double> <double 0x7ff8000000000000, double undef>, <2 x double> %x)
+  ret <2 x double> %r
+}
+
+define <2 x double> @maximum_nan_op1_vec(<2 x double> %x) {
+; CHECK-LABEL: @maximum_nan_op1_vec(
+; CHECK-NEXT:    ret <2 x double> <double 0x7FF800000000DEAD, double 0x7FF8FFFFFFFFFFFF>
+;
+  %r = call <2 x double> @llvm.maximum.v2f64(<2 x double> %x, <2 x double> <double 0x7ff800000000dead, double 0x7ff8ffffffffffff>)
+  ret <2 x double> %r
+}
+
+define <2 x double> @minimum_nan_op0_vec(<2 x double> %x) {
+; CHECK-LABEL: @minimum_nan_op0_vec(
+; CHECK-NEXT:    ret <2 x double> <double undef, double 0x7FF8000DEAD00000>
+;
+  %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> <double undef, double 0x7ff8000dead00000>, <2 x double> %x)
+  ret <2 x double> %r
+}
+
+define <2 x double> @minimum_nan_op1_vec(<2 x double> %x) {
+; CHECK-LABEL: @minimum_nan_op1_vec(
+; CHECK-NEXT:    ret <2 x double> <double 0x7FF800DEAD00DEAD, double 0x7FF800DEAD00DEAD>
+;
+  %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> %x, <2 x double> <double 0x7ff800dead00dead, double 0x7ff800dead00dead>)
+  ret <2 x double> %r
+}
+
+define float @maximum_undef_op1(float %x) {
+; CHECK-LABEL: @maximum_undef_op1(
+; CHECK-NEXT:    ret float [[X:%.*]]
+;
+  %val = call float @llvm.maximum.f32(float %x, float undef)
+  ret float %val
+}
+
+define float @maximum_undef_op0(float %x) {
+; CHECK-LABEL: @maximum_undef_op0(
+; CHECK-NEXT:    ret float [[X:%.*]]
+;
+  %val = call float @llvm.maximum.f32(float undef, float %x)
+  ret float %val
+}
+
+define float @minimum_undef_op1(float %x) {
+; CHECK-LABEL: @minimum_undef_op1(
+; CHECK-NEXT:    ret float [[X:%.*]]
+;
+  %val = call float @llvm.minimum.f32(float %x, float undef)
+  ret float %val
+}
+
+define float @minimum_undef_op0(float %x) {
+; CHECK-LABEL: @minimum_undef_op0(
+; CHECK-NEXT:    ret float [[X:%.*]]
+;
+  %val = call float @llvm.minimum.f32(float undef, float %x)
+  ret float %val
+}
+
+define float @minimum_undef_undef(float %x) {
+; CHECK-LABEL: @minimum_undef_undef(
+; CHECK-NEXT:    ret float undef
+;
+  %val = call float @llvm.minimum.f32(float undef, float undef)
+  ret float %val
+}
+
+define float @maximum_undef_undef(float %x) {
+; CHECK-LABEL: @maximum_undef_undef(
+; CHECK-NEXT:    ret float undef
+;
+  %val = call float @llvm.maximum.f32(float undef, float undef)
+  ret float %val
+}
+
+define float @minimum_same_args(float %x) {
+; CHECK-LABEL: @minimum_same_args(
+; CHECK-NEXT:    ret float [[X:%.*]]
+;
+  %y = call float @llvm.minimum.f32(float %x, float %x)
+  ret float %y
+}
+
+define float @maximum_same_args(float %x) {
+; CHECK-LABEL: @maximum_same_args(
+; CHECK-NEXT:    ret float [[X:%.*]]
+;
+  %y = call float @llvm.maximum.f32(float %x, float %x)
+  ret float %y
+}
+
+define float @minimum_x_minimum_x_y(float %x, float %y) {
+; CHECK-LABEL: @minimum_x_minimum_x_y(
+; CHECK-NEXT:    [[A:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float [[Y:%.*]])
+; CHECK-NEXT:    ret float [[A]]
+;
+  %a = call float @llvm.minimum.f32(float %x, float %y)
+  %b = call float @llvm.minimum.f32(float %x, float %a)
+  ret float %b
+}
+
+define float @minimum_y_minimum_x_y(float %x, float %y) {
+; CHECK-LABEL: @minimum_y_minimum_x_y(
+; CHECK-NEXT:    [[A:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float [[Y:%.*]])
+; CHECK-NEXT:    ret float [[A]]
+;
+  %a = call float @llvm.minimum.f32(float %x, float %y)
+  %b = call float @llvm.minimum.f32(float %y, float %a)
+  ret float %b
+}
+
+define float @minimum_x_y_minimum_x(float %x, float %y) {
+; CHECK-LABEL: @minimum_x_y_minimum_x(
+; CHECK-NEXT:    [[A:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float [[Y:%.*]])
+; CHECK-NEXT:    ret float [[A]]
+;
+  %a = call float @llvm.minimum.f32(float %x, float %y)
+  %b = call float @llvm.minimum.f32(float %a, float %x)
+  ret float %b
+}
+
+define float @minimum_x_y_minimum_y(float %x, float %y) {
+; CHECK-LABEL: @minimum_x_y_minimum_y(
+; CHECK-NEXT:    [[A:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float [[Y:%.*]])
+; CHECK-NEXT:    ret float [[A]]
+;
+  %a = call float @llvm.minimum.f32(float %x, float %y)
+  %b = call float @llvm.minimum.f32(float %a, float %y)
+  ret float %b
+}
+
+; negative test
+
+define float @minimum_z_minimum_x_y(float %x, float %y, float %z) {
+; CHECK-LABEL: @minimum_z_minimum_x_y(
+; CHECK-NEXT:    [[A:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float [[Y:%.*]])
+; CHECK-NEXT:    [[B:%.*]] = call float @llvm.minimum.f32(float [[Z:%.*]], float [[A]])
+; CHECK-NEXT:    ret float [[B]]
+;
+  %a = call float @llvm.minimum.f32(float %x, float %y)
+  %b = call float @llvm.minimum.f32(float %z, float %a)
+  ret float %b
+}
+
+; negative test
+
+define float @minimum_x_y_minimum_z(float %x, float %y, float %z) {
+; CHECK-LABEL: @minimum_x_y_minimum_z(
+; CHECK-NEXT:    [[A:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float [[Y:%.*]])
+; CHECK-NEXT:    [[B:%.*]] = call float @llvm.minimum.f32(float [[A]], float [[Z:%.*]])
+; CHECK-NEXT:    ret float [[B]]
+;
+  %a = call float @llvm.minimum.f32(float %x, float %y)
+  %b = call float @llvm.minimum.f32(float %a, float %z)
+  ret float %b
+}
+
+; minimum(X, -INF) --> -INF
+
+define float @minimum_neginf(float %x) {
+; CHECK-LABEL: @minimum_neginf(
+; CHECK-NEXT:    ret float 0xFFF0000000000000
+;
+  %val = call float @llvm.minimum.f32(float %x, float 0xFFF0000000000000)
+  ret float %val
+}
+
+define <2 x double> @minimum_neginf_commute_vec(<2 x double> %x) {
+; CHECK-LABEL: @minimum_neginf_commute_vec(
+; CHECK-NEXT:    ret <2 x double> <double 0xFFF0000000000000, double 0xFFF0000000000000>
+;
+  %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> <double 0xFFF0000000000000, double 0xFFF0000000000000>, <2 x double> %x)
+  ret <2 x double> %r
+}
+
+; negative test
+
+define float @minimum_inf(float %x) {
+; CHECK-LABEL: @minimum_inf(
+; CHECK-NEXT:    [[VAL:%.*]] = call float @llvm.minimum.f32(float 0x7FF0000000000000, float [[X:%.*]])
+; CHECK-NEXT:    ret float [[VAL]]
+;
+  %val = call float @llvm.minimum.f32(float 0x7FF0000000000000, float %x)
+  ret float %val
+}
+define float @maximum_x_maximum_x_y(float %x, float %y) {
+; CHECK-LABEL: @maximum_x_maximum_x_y(
+; CHECK-NEXT:    [[A:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float [[Y:%.*]])
+; CHECK-NEXT:    ret float [[A]]
+;
+  %a = call float @llvm.maximum.f32(float %x, float %y)
+  %b = call float @llvm.maximum.f32(float %x, float %a)
+  ret float %b
+}
+
+define float @maximum_y_maximum_x_y(float %x, float %y) {
+; CHECK-LABEL: @maximum_y_maximum_x_y(
+; CHECK-NEXT:    [[A:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float [[Y:%.*]])
+; CHECK-NEXT:    ret float [[A]]
+;
+  %a = call float @llvm.maximum.f32(float %x, float %y)
+  %b = call float @llvm.maximum.f32(float %y, float %a)
+  ret float %b
+}
+
+define float @maximum_x_y_maximum_x(float %x, float %y) {
+; CHECK-LABEL: @maximum_x_y_maximum_x(
+; CHECK-NEXT:    [[A:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float [[Y:%.*]])
+; CHECK-NEXT:    ret float [[A]]
+;
+  %a = call float @llvm.maximum.f32(float %x, float %y)
+  %b = call float @llvm.maximum.f32(float %a, float %x)
+  ret float %b
+}
+
+define float @maximum_x_y_maximum_y(float %x, float %y) {
+; CHECK-LABEL: @maximum_x_y_maximum_y(
+; CHECK-NEXT:    [[A:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float [[Y:%.*]])
+; CHECK-NEXT:    ret float [[A]]
+;
+  %a = call float @llvm.maximum.f32(float %x, float %y)
+  %b = call float @llvm.maximum.f32(float %a, float %y)
+  ret float %b
+}
+
+; negative test
+
+define float @maximum_z_maximum_x_y(float %x, float %y, float %z) {
+; CHECK-LABEL: @maximum_z_maximum_x_y(
+; CHECK-NEXT:    [[A:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float [[Y:%.*]])
+; CHECK-NEXT:    [[B:%.*]] = call float @llvm.maximum.f32(float [[Z:%.*]], float [[A]])
+; CHECK-NEXT:    ret float [[B]]
+;
+  %a = call float @llvm.maximum.f32(float %x, float %y)
+  %b = call float @llvm.maximum.f32(float %z, float %a)
+  ret float %b
+}
+
+; negative test
+
+define float @maximum_x_y_maximum_z(float %x, float %y, float %z) {
+; CHECK-LABEL: @maximum_x_y_maximum_z(
+; CHECK-NEXT:    [[A:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float [[Y:%.*]])
+; CHECK-NEXT:    [[B:%.*]] = call float @llvm.maximum.f32(float [[A]], float [[Z:%.*]])
+; CHECK-NEXT:    ret float [[B]]
+;
+  %a = call float @llvm.maximum.f32(float %x, float %y)
+  %b = call float @llvm.maximum.f32(float %a, float %z)
+  ret float %b
+}
+
+; maximum(X, INF) --> INF
+
+define <2 x double> @maximum_inf(<2 x double> %x) {
+; CHECK-LABEL: @maximum_inf(
+; CHECK-NEXT:    ret <2 x double> <double 0x7FF0000000000000, double 0x7FF0000000000000>
+;
+  %val = call <2 x double> @llvm.maximum.v2f64(<2 x double> %x, <2 x double><double 0x7FF0000000000000, double 0x7FF0000000000000>)
+  ret <2 x double> %val
+}
+
+define float @maximum_inf_commute(float %x) {
+; CHECK-LABEL: @maximum_inf_commute(
+; CHECK-NEXT:    ret float 0x7FF0000000000000
+;
+  %val = call float @llvm.maximum.f32(float 0x7FF0000000000000, float %x)
+  ret float %val
+}
+
 ; Y - (Y - X) --> X
 
 define float @fsub_fsub_common_op(float %x, float %y) {
@@ -951,4 +1267,3 @@ define float @fsub_fadd_common_op_wrong_commute_commute(float %x, float %y) {
   %r = fadd reassoc nsz float %s, %y
   ret float %r
 }
-
diff --git a/test/Transforms/InstSimplify/floating-point-compare.ll b/test/Transforms/InstSimplify/floating-point-compare.ll
index bc5c58a698e..eeae34c3059 100644
--- a/test/Transforms/InstSimplify/floating-point-compare.ll
+++ b/test/Transforms/InstSimplify/floating-point-compare.ll
@@ -179,6 +179,7 @@ declare double @llvm.powi.f64(double,i32)
 declare float @llvm.exp.f32(float)
 declare float @llvm.minnum.f32(float, float)
 declare float @llvm.maxnum.f32(float, float)
+declare float @llvm.maximum.f32(float, float)
 declare double @llvm.exp2.f64(double)
 declare float @llvm.fma.f32(float,float,float)
 
@@ -282,6 +283,18 @@ define i1 @orderedLessZeroMaxNum(float, float) {
   ret i1 %uge
 }
 
+; But using maximum, we can simplify, since the NaN would be propagated
+
+define i1 @orderedLessZeroMaximum(float, float) {
+; CHECK-LABEL: @orderedLessZeroMaximum(
+; CHECK-NEXT:    ret i1 true
+;
+  %a = call float @llvm.exp.f32(float %0)
+  %b = call float @llvm.maximum.f32(float %a, float %1)
+  %uge = fcmp uge float %b, 0.000000e+00
+  ret i1 %uge
+}
+
 define i1 @known_positive_olt_with_negative_constant(double %a) {
 ; CHECK-LABEL: @known_positive_olt_with_negative_constant(
 ; CHECK-NEXT:    ret i1 false
@@ -375,4 +388,3 @@ define <2 x i1> @unorderedCompareWithNaNVector_undef_elt(<2 x double> %A) {
   %cmp = fcmp ult <2 x double> %A, <double undef, double 0xFFFFFFFFFFFFFFFF>
   ret <2 x i1> %cmp
 }
-
diff --git a/test/Transforms/LICM/hoist-round.ll b/test/Transforms/LICM/hoist-round.ll
index 87a7050668d..35851f39d25 100644
--- a/test/Transforms/LICM/hoist-round.ll
+++ b/test/Transforms/LICM/hoist-round.ll
@@ -4,8 +4,8 @@
 target datalayout = "E-m:e-p:32:32-i8:8:8-i16:16:16-i64:32:32-f64:32:32-v64:32:32-v128:32:32-a0:0:32-n32"
 
 ; This test verifies that ceil, floor, nearbyint, trunc, rint, round,
-; copysign, minnum, maxnum and fabs intrinsics are considered safe
-; to speculate.
+; copysign, minnum, maxnum, minimum, maximum, and fabs intrinsics are
+; considered safe to speculate.
 
 ; CHECK-LABEL: @test
 ; CHECK: call float @llvm.ceil.f32
@@ -41,8 +41,10 @@ for.body:
   %tmp.8 = call float @llvm.copysign.f32(float %tmp.7, float %arg2)
   %tmp.9 = call float @llvm.minnum.f32(float %tmp.8, float %arg2)
   %tmp.10 = call float @llvm.maxnum.f32(float %tmp.9, float %arg2)
-  %tmp.11 = call float @llvm.powi.f32(float %tmp.10, i32 4)
-  call void @consume(float %tmp.11)
+  %tmp.11 = call float @llvm.minimum.f32(float %tmp.10, float %arg2)
+  %tmp.12 = call float @llvm.maximum.f32(float %tmp.11, float %arg2)
+  %tmp.13 = call float @llvm.powi.f32(float %tmp.12, i32 4)
+  call void @consume(float %tmp.13)
   %IND.new = add i32 %IND, 1
   br label %for.head
 
@@ -62,4 +64,6 @@ declare float @llvm.fabs.f32(float)
 declare float @llvm.copysign.f32(float, float)
 declare float @llvm.minnum.f32(float, float)
 declare float @llvm.maxnum.f32(float, float)
+declare float @llvm.minimum.f32(float, float)
+declare float @llvm.maximum.f32(float, float)
 declare float @llvm.powi.f32(float, i32)
diff --git a/test/Transforms/SimplifyCFG/speculate-math.ll b/test/Transforms/SimplifyCFG/speculate-math.ll
index 5655d5d7882..87e01663edf 100644
--- a/test/Transforms/SimplifyCFG/speculate-math.ll
+++ b/test/Transforms/SimplifyCFG/speculate-math.ll
@@ -7,6 +7,8 @@ declare float @llvm.fmuladd.f32(float, float, float) nounwind readonly
 declare float @llvm.fabs.f32(float) nounwind readonly
 declare float @llvm.minnum.f32(float, float) nounwind readonly
 declare float @llvm.maxnum.f32(float, float) nounwind readonly
+declare float @llvm.minimum.f32(float, float) nounwind readonly
+declare float @llvm.maximum.f32(float, float) nounwind readonly
 
 ; ALL-LABEL: @fdiv_test(
 ; EXPENSIVE: select i1 %cmp, double %div, double 0.0
@@ -127,3 +129,37 @@ test_maxnum.exit:                                   ; preds = %cond.else.i, %ent
   store float %cond.i, float addrspace(1)* %out, align 4
   ret void
 }
+
+; ALL-LABEL: @minimum_test(
+; ALL: select
+define void @minimum_test(float addrspace(1)* noalias nocapture %out, float %a, float %b) nounwind {
+entry:
+  %cmp.i = fcmp olt float %a, 0.000000e+00
+  br i1 %cmp.i, label %test_minimum.exit, label %cond.else.i
+
+cond.else.i:                                      ; preds = %entry
+  %0 = tail call float @llvm.minimum.f32(float %a, float %b) nounwind readnone
+  br label %test_minimum.exit
+
+test_minimum.exit:                                   ; preds = %cond.else.i, %entry
+  %cond.i = phi float [ %0, %cond.else.i ], [ 0x7FF8000000000000, %entry ]
+  store float %cond.i, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; ALL-LABEL: @maximum_test(
+; ALL: select
+define void @maximum_test(float addrspace(1)* noalias nocapture %out, float %a, float %b) nounwind {
+entry:
+  %cmp.i = fcmp olt float %a, 0.000000e+00
+  br i1 %cmp.i, label %test_maximum.exit, label %cond.else.i
+
+cond.else.i:                                      ; preds = %entry
+  %0 = tail call float @llvm.maximum.f32(float %a, float %b) nounwind readnone
+  br label %test_maximum.exit
+
+test_maximum.exit:                                   ; preds = %cond.else.i, %entry
+  %cond.i = phi float [ %0, %cond.else.i ], [ 0x7FF8000000000000, %entry ]
+  store float %cond.i, float addrspace(1)* %out, align 4
+  ret void
+}
-- 
GitLab


From 4806c82bb2a4022c1c69aecc9cb66d53b8cbe46e Mon Sep 17 00:00:00 2001
From: Krzysztof Pszeniczny <krzysztof.pszeniczny@gmail.com>
Date: Fri, 19 Oct 2018 19:02:16 +0000
Subject: [PATCH 0327/1116] Fix a use-after-RAUW bug in large GEP splitting

Summary:
Large GEP splitting, introduced in rL332015, uses a `DenseMap<AssertingVH<Value>, ...>`. This causes an assertion to fail (in debug builds) or undefined behaviour to occur (in release builds) when a value is RAUWed.

This manifested itself in the 7zip benchmark from the llvm test suite built on ARM with `-fstrict-vtable-pointers` enabled while RAUWing invariant group launders and splits in CodeGenPrepare.

This patch merges the large offsets of the argument and the result of an invariant.group strip/launder intrinsic before RAUWing.

Reviewers: Prazek, javed.absar, haicheng, efriedma

Reviewed By: Prazek, efriedma

Subscribers: kristof.beyls, hiraditya, llvm-commits

Differential Revision: https://reviews.llvm.org/D51936

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344802 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/CodeGenPrepare.cpp                | 17 +++++++++--
 .../AArch64/large-offset-gep.ll               | 28 +++++++++++++++++++
 2 files changed, 42 insertions(+), 3 deletions(-)

diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index dfbfae85a86..fa5cc4dc969 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -1721,11 +1721,22 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
       return true;
     }
     case Intrinsic::launder_invariant_group:
-    case Intrinsic::strip_invariant_group:
-      II->replaceAllUsesWith(II->getArgOperand(0));
+    case Intrinsic::strip_invariant_group: {
+      Value *ArgVal = II->getArgOperand(0);
+      auto it = LargeOffsetGEPMap.find(II);
+      if (it != LargeOffsetGEPMap.end()) {
+          // Merge entries in LargeOffsetGEPMap to reflect the RAUW.
+          // Make sure not to have to deal with iterator invalidation
+          // after possibly adding ArgVal to LargeOffsetGEPMap.
+          auto GEPs = std::move(it->second);
+          LargeOffsetGEPMap[ArgVal].append(GEPs.begin(), GEPs.end());
+          LargeOffsetGEPMap.erase(II);
+      }
+
+      II->replaceAllUsesWith(ArgVal);
       II->eraseFromParent();
       return true;
-
+    }
     case Intrinsic::cttz:
     case Intrinsic::ctlz:
       // If counting zeros is expensive, try to avoid it.
diff --git a/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll b/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll
index a5878e00e03..5cc00b75962 100644
--- a/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll
+++ b/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll
@@ -145,3 +145,31 @@ while_body:
 while_end:
   ret void
 }
+
+declare i8* @llvm.strip.invariant.group.p0i8(i8*)
+
+define void @test_invariant_group(i32) {
+; CHECK-LABEL: test_invariant_group
+  br i1 undef, label %8, label %7
+
+; <label>:2:                                      ; preds = %8, %2
+  br i1 undef, label %2, label %7
+
+; <label>:3:                                      ; preds = %8
+  %4 = getelementptr inbounds i8, i8* %9, i32 40000
+  %5 = bitcast i8* %4 to i64*
+  br i1 undef, label %7, label %6
+
+; <label>:6:                                      ; preds = %3
+  store i64 1, i64* %5, align 8
+  br label %7
+
+; <label>:7:                                      ; preds = %6, %3, %2, %1
+  ret void
+
+; <label>:8:                                      ; preds = %1
+  %9 = call i8* @llvm.strip.invariant.group.p0i8(i8* nonnull undef)
+  %10 = icmp eq i32 %0, 0
+  br i1 %10, label %3, label %2
+}
+
-- 
GitLab


From 203691d83a7283fec25121e672042a922166feb4 Mon Sep 17 00:00:00 2001
From: Thomas Lively <tlively@google.com>
Date: Fri, 19 Oct 2018 19:08:06 +0000
Subject: [PATCH 0328/1116] [WebAssembly] Handle undefined lane indices in SIMD
 patterns

Summary:
Undefined indices in shuffles can be used when not all lanes of the
output vector will be used. This happens for example in the expansion
of vector reduce operations. Regardless, undefs are legal as lane
indices in IR and should be supported.

Reviewers: aheejin, dschuff

Subscribers: sbc100, jgravelle-google, sunfish, llvm-commits

Differential Revision: https://reviews.llvm.org/D53057

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344803 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../WebAssembly/WebAssemblyISelLowering.cpp   |   6 +-
 .../WebAssembly/WebAssemblyInstrSIMD.td       |  36 +++
 test/CodeGen/WebAssembly/simd.ll              | 266 ++++++++++++++++++
 3 files changed, 306 insertions(+), 2 deletions(-)

diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 30c2e843408..6ca619c910a 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -990,8 +990,10 @@ WebAssemblyTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
   // Expand mask indices to byte indices and materialize them as operands
   for (size_t I = 0, Lanes = Mask.size(); I < Lanes; ++I) {
     for (size_t J = 0; J < LaneBytes; ++J) {
-      Ops[OpIdx++] =
-          DAG.getConstant((uint64_t)Mask[I] * LaneBytes + J, DL, MVT::i32);
+      // Lower undefs (represented by -1 in mask) to zero
+      uint64_t ByteIndex =
+          Mask[I] == -1 ? 0 : (uint64_t)Mask[I] * LaneBytes + J;
+      Ops[OpIdx++] = DAG.getConstant(ByteIndex, DL, MVT::i32);
     }
   }
 
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index b0fd6cab229..95c87266273 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -181,6 +181,28 @@ def : Pat<(i32 (vector_extract (v16i8 V128:$vec), (i32 LaneIdx16:$idx))),
 def : Pat<(i32 (vector_extract (v8i16 V128:$vec), (i32 LaneIdx8:$idx))),
           (EXTRACT_LANE_v8i16_u V128:$vec, (i32 LaneIdx8:$idx))>;
 
+// Lower undef lane indices to zero
+def : Pat<(and (i32 (vector_extract (v16i8 V128:$vec), undef)), (i32 0xff)),
+          (EXTRACT_LANE_v16i8_u V128:$vec, 0)>;
+def : Pat<(and (i32 (vector_extract (v8i16 V128:$vec), undef)), (i32 0xffff)),
+          (EXTRACT_LANE_v8i16_u V128:$vec, 0)>;
+def : Pat<(i32 (vector_extract (v16i8 V128:$vec), undef)),
+          (EXTRACT_LANE_v16i8_u V128:$vec, 0)>;
+def : Pat<(i32 (vector_extract (v8i16 V128:$vec), undef)),
+          (EXTRACT_LANE_v8i16_u V128:$vec, 0)>;
+def : Pat<(sext_inreg (i32 (vector_extract (v16i8 V128:$vec), undef)), i8),
+          (EXTRACT_LANE_v16i8_s V128:$vec, 0)>;
+def : Pat<(sext_inreg (i32 (vector_extract (v8i16 V128:$vec), undef)), i16),
+          (EXTRACT_LANE_v8i16_s V128:$vec, 0)>;
+def : Pat<(vector_extract (v4i32 V128:$vec), undef),
+          (EXTRACT_LANE_v4i32 V128:$vec, 0)>;
+def : Pat<(vector_extract (v2i64 V128:$vec), undef),
+          (EXTRACT_LANE_v2i64 V128:$vec, 0)>;
+def : Pat<(vector_extract (v4f32 V128:$vec), undef),
+          (EXTRACT_LANE_v4f32 V128:$vec, 0)>;
+def : Pat<(vector_extract (v2f64 V128:$vec), undef),
+          (EXTRACT_LANE_v2f64 V128:$vec, 0)>;
+
 // Replace lane value: replace_lane
 multiclass ReplaceLane<ValueType vec_t, string vec, ImmLeaf imm_t,
                        WebAssemblyRegClass reg_t, ValueType lane_t,
@@ -201,6 +223,20 @@ defm "" : ReplaceLane<v2i64, "i64x2", LaneIdx2, I64, i64, 20>;
 defm "" : ReplaceLane<v4f32, "f32x4", LaneIdx4, F32, f32, 21>;
 defm "" : ReplaceLane<v2f64, "f64x2", LaneIdx2, F64, f64, 22>;
 
+// Lower undef lane indices to zero
+def : Pat<(vector_insert (v16i8 V128:$vec), I32:$x, undef),
+          (REPLACE_LANE_v16i8 V128:$vec, 0, I32:$x)>;
+def : Pat<(vector_insert (v8i16 V128:$vec), I32:$x, undef),
+          (REPLACE_LANE_v8i16 V128:$vec, 0, I32:$x)>;
+def : Pat<(vector_insert (v4i32 V128:$vec), I32:$x, undef),
+          (REPLACE_LANE_v4i32 V128:$vec, 0, I32:$x)>;
+def : Pat<(vector_insert (v2i64 V128:$vec), I64:$x, undef),
+          (REPLACE_LANE_v2i64 V128:$vec, 0, I64:$x)>;
+def : Pat<(vector_insert (v4f32 V128:$vec), F32:$x, undef),
+          (REPLACE_LANE_v4f32 V128:$vec, 0, F32:$x)>;
+def : Pat<(vector_insert (v2f64 V128:$vec), F64:$x, undef),
+          (REPLACE_LANE_v2f64 V128:$vec, 0, F64:$x)>;
+
 // Arbitrary other BUILD_VECTOR patterns
 def : Pat<(v16i8 (build_vector
             (i32 I32:$x0), (i32 I32:$x1), (i32 I32:$x2), (i32 I32:$x3),
diff --git a/test/CodeGen/WebAssembly/simd.ll b/test/CodeGen/WebAssembly/simd.ll
index 193e3120b9e..1e1feeb35df 100644
--- a/test/CodeGen/WebAssembly/simd.ll
+++ b/test/CodeGen/WebAssembly/simd.ll
@@ -54,6 +54,18 @@ define i32 @extract_v16i8_s(<16 x i8> %v) {
   ret i32 %a
 }
 
+; CHECK-LABEL: extract_undef_v16i8_s:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: i8x16.extract_lane_s $push[[R:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define i32 @extract_undef_v16i8_s(<16 x i8> %v) {
+  %elem = extractelement <16 x i8> %v, i8 undef
+  %a = sext i8 %elem to i32
+  ret i32 %a
+}
+
 ; CHECK-LABEL: extract_v16i8_u:
 ; NO-SIMD128-NOT: i8x16
 ; SIMD128-NEXT: .param v128{{$}}
@@ -66,6 +78,18 @@ define i32 @extract_v16i8_u(<16 x i8> %v) {
   ret i32 %a
 }
 
+; CHECK-LABEL: extract_undef_v16i8_u:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: i8x16.extract_lane_u $push[[R:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define i32 @extract_undef_v16i8_u(<16 x i8> %v) {
+  %elem = extractelement <16 x i8> %v, i8 undef
+  %a = zext i8 %elem to i32
+  ret i32 %a
+}
+
 ; CHECK-LABEL: extract_v16i8:
 ; NO-SIMD128-NOT: i8x16
 ; SIMD128-NEXT: .param v128{{$}}
@@ -77,6 +101,17 @@ define i8 @extract_v16i8(<16 x i8> %v) {
   ret i8 %elem
 }
 
+; CHECK-LABEL: extract_undef_v16i8:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: i8x16.extract_lane_u $push[[R:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define i8 @extract_undef_v16i8(<16 x i8> %v) {
+  %elem = extractelement <16 x i8> %v, i8 undef
+  ret i8 %elem
+}
+
 ; CHECK-LABEL: replace_v16i8:
 ; NO-SIMD128-NOT: i8x16
 ; SIMD128-NEXT: .param v128, i32{{$}}
@@ -88,6 +123,17 @@ define <16 x i8> @replace_v16i8(<16 x i8> %v, i8 %x) {
   ret <16 x i8> %res
 }
 
+; CHECK-LABEL: replace_undef_v16i8:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i8x16.replace_lane $push[[R:[0-9]+]]=, $0, 0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @replace_undef_v16i8(<16 x i8> %v, i8 %x) {
+  %res = insertelement <16 x i8> %v, i8 %x, i32 undef
+  ret <16 x i8> %res
+}
+
 ; CHECK-LABEL: shuffle_v16i8:
 ; NO-SIMD128-NOT: v8x16
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -102,6 +148,22 @@ define <16 x i8> @shuffle_v16i8(<16 x i8> %x, <16 x i8> %y) {
   ret <16 x i8> %res
 }
 
+; CHECK-LABEL: shuffle_undef_v16i8:
+; NO-SIMD128-NOT: v8x16
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: v8x16.shuffle $push[[R:[0-9]+]]=, $0, $0,
+; SIMD128-SAME: 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @shuffle_undef_v16i8(<16 x i8> %x, <16 x i8> %y) {
+  %res = shufflevector <16 x i8> %x, <16 x i8> %y,
+    <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <16 x i8> %res
+}
+
 ; CHECK-LABEL: build_v16i8:
 ; NO-SIMD128-NOT: i8x16
 ; SIMD128-NEXT: .param i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32{{$}}
@@ -190,6 +252,18 @@ define i32 @extract_v8i16_s(<8 x i16> %v) {
   ret i32 %a
 }
 
+; CHECK-LABEL: extract_undef_v8i16_s:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: i16x8.extract_lane_s $push[[R:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define i32 @extract_undef_v8i16_s(<8 x i16> %v) {
+  %elem = extractelement <8 x i16> %v, i16 undef
+  %a = sext i16 %elem to i32
+  ret i32 %a
+}
+
 ; CHECK-LABEL: extract_v8i16_u:
 ; NO-SIMD128-NOT: i16x8
 ; SIMD128-NEXT: .param v128{{$}}
@@ -202,6 +276,18 @@ define i32 @extract_v8i16_u(<8 x i16> %v) {
   ret i32 %a
 }
 
+; CHECK-LABEL: extract_undef_v8i16_u:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: i16x8.extract_lane_u $push[[R:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define i32 @extract_undef_v8i16_u(<8 x i16> %v) {
+  %elem = extractelement <8 x i16> %v, i16 undef
+  %a = zext i16 %elem to i32
+  ret i32 %a
+}
+
 ; CHECK-LABEL: extract_v8i16:
 ; NO-SIMD128-NOT: i16x8
 ; SIMD128-NEXT: .param v128{{$}}
@@ -213,6 +299,17 @@ define i16 @extract_v8i16(<8 x i16> %v) {
   ret i16 %elem
 }
 
+; CHECK-LABEL: extract_undef_v8i16:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: i16x8.extract_lane_u $push[[R:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define i16 @extract_undef_v8i16(<8 x i16> %v) {
+  %elem = extractelement <8 x i16> %v, i16 undef
+  ret i16 %elem
+}
+
 ; CHECK-LABEL: replace_v8i16:
 ; NO-SIMD128-NOT: i16x8
 ; SIMD128-NEXT: .param v128, i32{{$}}
@@ -224,6 +321,17 @@ define <8 x i16> @replace_v8i16(<8 x i16> %v, i16 %x) {
   ret <8 x i16> %res
 }
 
+; CHECK-LABEL: replace_undef_v8i16:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i16x8.replace_lane $push[[R:[0-9]+]]=, $0, 0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @replace_undef_v8i16(<8 x i16> %v, i16 %x) {
+  %res = insertelement <8 x i16> %v, i16 %x, i32 undef
+  ret <8 x i16> %res
+}
+
 ; CHECK-LABEL: shuffle_v8i16:
 ; NO-SIMD128-NOT: v8x16
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -237,6 +345,20 @@ define <8 x i16> @shuffle_v8i16(<8 x i16> %x, <8 x i16> %y) {
   ret <8 x i16> %res
 }
 
+; CHECK-LABEL: shuffle_undef_v8i16:
+; NO-SIMD128-NOT: v8x16
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: v8x16.shuffle $push[[R:[0-9]+]]=, $0, $0,
+; SIMD128-SAME: 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @shuffle_undef_v8i16(<8 x i16> %x, <8 x i16> %y) {
+  %res = shufflevector <8 x i16> %x, <8 x i16> %y,
+    <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef,
+               i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x i16> %res
+}
+
 ; CHECK-LABEL: build_v8i16:
 ; NO-SIMD128-NOT: i16x8
 ; SIMD128-NEXT: .param i32, i32, i32, i32, i32, i32, i32, i32{{$}}
@@ -305,6 +427,17 @@ define i32 @extract_v4i32(<4 x i32> %v) {
   ret i32 %elem
 }
 
+; CHECK-LABEL: extract_undef_v4i32:
+; NO-SIMD128-NOT: i32x4
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: i32x4.extract_lane $push[[R:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define i32 @extract_undef_v4i32(<4 x i32> %v) {
+  %elem = extractelement <4 x i32> %v, i32 undef
+  ret i32 %elem
+}
+
 ; CHECK-LABEL: replace_v4i32:
 ; NO-SIMD128-NOT: i32x4
 ; SIMD128-NEXT: .param v128, i32{{$}}
@@ -316,6 +449,17 @@ define <4 x i32> @replace_v4i32(<4 x i32> %v, i32 %x) {
   ret <4 x i32> %res
 }
 
+; CHECK-LABEL: replace_undef_v4i32:
+; NO-SIMD128-NOT: i32x4
+; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32x4.replace_lane $push[[R:[0-9]+]]=, $0, 0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @replace_undef_v4i32(<4 x i32> %v, i32 %x) {
+  %res = insertelement <4 x i32> %v, i32 %x, i32 undef
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: shuffle_v4i32:
 ; NO-SIMD128-NOT: v8x16
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -329,6 +473,19 @@ define <4 x i32> @shuffle_v4i32(<4 x i32> %x, <4 x i32> %y) {
   ret <4 x i32> %res
 }
 
+; CHECK-LABEL: shuffle_undef_v4i32:
+; NO-SIMD128-NOT: v8x16
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: v8x16.shuffle $push[[R:[0-9]+]]=, $0, $0,
+; SIMD128-SAME: 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @shuffle_undef_v4i32(<4 x i32> %x, <4 x i32> %y) {
+  %res = shufflevector <4 x i32> %x, <4 x i32> %y,
+    <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: build_v4i32:
 ; NO-SIMD128-NOT: i32x4
 ; SIMD128-NEXT: .param i32, i32, i32, i32{{$}}
@@ -390,6 +547,18 @@ define i64 @extract_v2i64(<2 x i64> %v) {
   ret i64 %elem
 }
 
+; CHECK-LABEL: extract_undef_v2i64:
+; NO-SIMD128-NOT: i64x2
+; SIMD128-VM-NOT: i64x2
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result i64{{$}}
+; SIMD128-NEXT: i64x2.extract_lane $push[[R:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define i64 @extract_undef_v2i64(<2 x i64> %v) {
+  %elem = extractelement <2 x i64> %v, i64 undef
+  ret i64 %elem
+}
+
 ; CHECK-LABEL: replace_v2i64:
 ; NO-SIMD128-NOT: i64x2
 ; SIMD128-VM-NOT: i64x2
@@ -402,6 +571,18 @@ define <2 x i64> @replace_v2i64(<2 x i64> %v, i64 %x) {
   ret <2 x i64> %res
 }
 
+; CHECK-LABEL: replace_undef_v2i64:
+; NO-SIMD128-NOT: i64x2
+; SIMD128-VM-NOT: i64x2
+; SIMD128-NEXT: .param v128, i64{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i64x2.replace_lane $push[[R:[0-9]+]]=, $0, 0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @replace_undef_v2i64(<2 x i64> %v, i64 %x) {
+  %res = insertelement <2 x i64> %v, i64 %x, i32 undef
+  ret <2 x i64> %res
+}
+
 ; CHECK-LABEL: shuffle_v2i64:
 ; NO-SIMD128-NOT: v8x16
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -414,6 +595,19 @@ define <2 x i64> @shuffle_v2i64(<2 x i64> %x, <2 x i64> %y) {
   ret <2 x i64> %res
 }
 
+; CHECK-LABEL: shuffle_undef_v2i64:
+; NO-SIMD128-NOT: v8x16
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: v8x16.shuffle $push[[R:[0-9]+]]=, $0, $0,
+; SIMD128-SAME: 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @shuffle_undef_v2i64(<2 x i64> %x, <2 x i64> %y) {
+  %res = shufflevector <2 x i64> %x, <2 x i64> %y,
+    <2 x i32> <i32 1, i32 undef>
+  ret <2 x i64> %res
+}
+
 ; CHECK-LABEL: build_v2i64:
 ; NO-SIMD128-NOT: i64x2
 ; SIMD128-VM-NOT: i64x2
@@ -472,6 +666,17 @@ define float @extract_v4f32(<4 x float> %v) {
   ret float %elem
 }
 
+; CHECK-LABEL: extract_undef_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result f32{{$}}
+; SIMD128-NEXT: f32x4.extract_lane $push[[R:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define float @extract_undef_v4f32(<4 x float> %v) {
+  %elem = extractelement <4 x float> %v, i32 undef
+  ret float %elem
+}
+
 ; CHECK-LABEL: replace_v4f32:
 ; NO-SIMD128-NOT: f32x4
 ; SIMD128-NEXT: .param v128, f32{{$}}
@@ -483,6 +688,17 @@ define <4 x float> @replace_v4f32(<4 x float> %v, float %x) {
   ret <4 x float> %res
 }
 
+; CHECK-LABEL: replace_undef_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128, f32{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32x4.replace_lane $push[[R:[0-9]+]]=, $0, 0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x float> @replace_undef_v4f32(<4 x float> %v, float %x) {
+  %res = insertelement <4 x float> %v, float %x, i32 undef
+  ret <4 x float> %res
+}
+
 ; CHECK-LABEL: shuffle_v4f32:
 ; NO-SIMD128-NOT: v8x16
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -496,6 +712,19 @@ define <4 x float> @shuffle_v4f32(<4 x float> %x, <4 x float> %y) {
   ret <4 x float> %res
 }
 
+; CHECK-LABEL: shuffle_undef_v4f32:
+; NO-SIMD128-NOT: v8x16
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: v8x16.shuffle $push[[R:[0-9]+]]=, $0, $0,
+; SIMD128-SAME: 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x float> @shuffle_undef_v4f32(<4 x float> %x, <4 x float> %y) {
+  %res = shufflevector <4 x float> %x, <4 x float> %y,
+    <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  ret <4 x float> %res
+}
+
 ; CHECK-LABEL: build_v4f32:
 ; NO-SIMD128-NOT: f32x4
 ; SIMD128-NEXT: .param f32, f32, f32, f32{{$}}
@@ -556,6 +785,18 @@ define double @extract_v2f64(<2 x double> %v) {
   ret double %elem
 }
 
+; CHECK-LABEL: extract_undef_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-VM-NOT: f64x2
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result f64{{$}}
+; SIMD128-NEXT: f64x2.extract_lane $push[[R:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define double @extract_undef_v2f64(<2 x double> %v) {
+  %elem = extractelement <2 x double> %v, i32 undef
+  ret double %elem
+}
+
 ; CHECK-LABEL: replace_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
@@ -568,6 +809,18 @@ define <2 x double> @replace_v2f64(<2 x double> %v, double %x) {
   ret <2 x double> %res
 }
 
+; CHECK-LABEL: replace_undef_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-VM-NOT: f64x2
+; SIMD128-NEXT: .param v128, f64{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64x2.replace_lane $push[[R:[0-9]+]]=, $0, 0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x double> @replace_undef_v2f64(<2 x double> %v, double %x) {
+  %res = insertelement <2 x double> %v, double %x, i32 undef
+  ret <2 x double> %res
+}
+
 ; CHECK-LABEL: shuffle_v2f64:
 ; NO-SIMD128-NOT: v8x16
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -581,6 +834,19 @@ define <2 x double> @shuffle_v2f64(<2 x double> %x, <2 x double> %y) {
   ret <2 x double> %res
 }
 
+; CHECK-LABEL: shuffle_undef_v2f64:
+; NO-SIMD128-NOT: v8x16
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: v8x16.shuffle $push[[R:[0-9]+]]=, $0, $0,
+; SIMD128-SAME: 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x double> @shuffle_undef_v2f64(<2 x double> %x, <2 x double> %y) {
+  %res = shufflevector <2 x double> %x, <2 x double> %y,
+    <2 x i32> <i32 1, i32 undef>
+  ret <2 x double> %res
+}
+
 ; CHECK-LABEL: build_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
-- 
GitLab


From 3b86c68554418cc2e731a8521b7329feed6f21a3 Mon Sep 17 00:00:00 2001
From: Wolfgang Pieb <Wolfgang.Pieb@sony.com>
Date: Fri, 19 Oct 2018 19:23:16 +0000
Subject: [PATCH 0329/1116] [DWARF] Make llvm-dwarfdump display location lists
 in a .dwp file correctly. Fixes PR38990.

Considers the index when extracting location lists from a .dwp file.
Majority of the patch by David Blaikie.

Reviewers: dblaikie

Differential revision: https://reviews.llvm.org/D53155


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344807 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../llvm/DebugInfo/DWARF/DWARFCompileUnit.h   |  8 +--
 include/llvm/DebugInfo/DWARF/DWARFTypeUnit.h  |  8 +--
 include/llvm/DebugInfo/DWARF/DWARFUnit.h      | 23 ++++---
 lib/DebugInfo/DWARF/DWARFDie.cpp              | 10 ++-
 lib/DebugInfo/DWARF/DWARFUnit.cpp             | 48 ++++++++------
 lib/DebugInfo/DWARF/DWARFVerifier.cpp         | 14 +++--
 test/DebugInfo/Inputs/loclists-dwp-b.ll       | 32 ++++++++++
 test/DebugInfo/X86/loclists-dwp.ll            | 62 +++++++++++++++++++
 8 files changed, 160 insertions(+), 45 deletions(-)
 create mode 100644 test/DebugInfo/Inputs/loclists-dwp-b.ll
 create mode 100644 test/DebugInfo/X86/loclists-dwp.ll

diff --git a/include/llvm/DebugInfo/DWARF/DWARFCompileUnit.h b/include/llvm/DebugInfo/DWARF/DWARFCompileUnit.h
index 27d56d72f0a..33797419a7b 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFCompileUnit.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFCompileUnit.h
@@ -18,13 +18,13 @@ namespace llvm {
 class DWARFCompileUnit : public DWARFUnit {
 public:
   DWARFCompileUnit(DWARFContext &Context, const DWARFSection &Section,
-                   const DWARFUnitHeader &Header,
-                   const DWARFDebugAbbrev *DA, const DWARFSection *RS,
+                   const DWARFUnitHeader &Header, const DWARFDebugAbbrev *DA,
+                   const DWARFSection *RS, const DWARFSection *LocSection,
                    StringRef SS, const DWARFSection &SOS,
                    const DWARFSection *AOS, const DWARFSection &LS, bool LE,
                    bool IsDWO, const DWARFUnitVector &UnitVector)
-      : DWARFUnit(Context, Section, Header, DA, RS, SS, SOS, AOS, LS, LE, IsDWO,
-                  UnitVector) {}
+      : DWARFUnit(Context, Section, Header, DA, RS, LocSection, SS, SOS, AOS,
+                  LS, LE, IsDWO, UnitVector) {}
 
   /// VTable anchor.
   ~DWARFCompileUnit() override;
diff --git a/include/llvm/DebugInfo/DWARF/DWARFTypeUnit.h b/include/llvm/DebugInfo/DWARF/DWARFTypeUnit.h
index 0a5a1aaa79d..8ca5ba13fc2 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFTypeUnit.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFTypeUnit.h
@@ -26,13 +26,13 @@ class raw_ostream;
 class DWARFTypeUnit : public DWARFUnit {
 public:
   DWARFTypeUnit(DWARFContext &Context, const DWARFSection &Section,
-                const DWARFUnitHeader &Header,
-                const DWARFDebugAbbrev *DA, const DWARFSection *RS,
+                const DWARFUnitHeader &Header, const DWARFDebugAbbrev *DA,
+                const DWARFSection *RS, const DWARFSection *LocSection,
                 StringRef SS, const DWARFSection &SOS, const DWARFSection *AOS,
                 const DWARFSection &LS, bool LE, bool IsDWO,
                 const DWARFUnitVector &UnitVector)
-      : DWARFUnit(Context, Section, Header, DA, RS, SS, SOS, AOS, LS, LE, IsDWO,
-                  UnitVector) {}
+      : DWARFUnit(Context, Section, Header, DA, RS, LocSection, SS, SOS, AOS,
+                  LS, LE, IsDWO, UnitVector) {}
 
   uint64_t getTypeHash() const { return getHeader().getTypeHash(); }
   uint32_t getTypeOffset() const { return getHeader().getTypeOffset(); }
diff --git a/include/llvm/DebugInfo/DWARF/DWARFUnit.h b/include/llvm/DebugInfo/DWARF/DWARFUnit.h
index 39d43b91485..c267cf173d1 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFUnit.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFUnit.h
@@ -153,10 +153,10 @@ public:
 private:
   void addUnitsImpl(DWARFContext &Context, const DWARFObject &Obj,
                     const DWARFSection &Section, const DWARFDebugAbbrev *DA,
-                    const DWARFSection *RS, StringRef SS,
-                    const DWARFSection &SOS, const DWARFSection *AOS,
-                    const DWARFSection &LS, bool LE, bool IsDWO, bool Lazy,
-                    DWARFSectionKind SectionKind);
+                    const DWARFSection *RS, const DWARFSection *LocSection,
+                    StringRef SS, const DWARFSection &SOS,
+                    const DWARFSection *AOS, const DWARFSection &LS, bool LE,
+                    bool IsDWO, bool Lazy, DWARFSectionKind SectionKind);
 };
 
 /// Represents base address of the CU.
@@ -198,6 +198,12 @@ class DWARFUnit {
   const DWARFDebugAbbrev *Abbrev;
   const DWARFSection *RangeSection;
   uint32_t RangeSectionBase;
+  /// We either keep track of the location list section or its data, depending
+  /// on whether we are handling a split DWARF section or not.
+  union {
+    const DWARFSection *LocSection;
+    StringRef LocSectionData;
+  };
   const DWARFSection &LineSection;
   StringRef StringSection;
   const DWARFSection &StringOffsetSection;
@@ -258,16 +264,19 @@ protected:
 
 public:
   DWARFUnit(DWARFContext &Context, const DWARFSection &Section,
-            const DWARFUnitHeader &Header,
-            const DWARFDebugAbbrev *DA, const DWARFSection *RS, StringRef SS,
-            const DWARFSection &SOS, const DWARFSection *AOS,
+            const DWARFUnitHeader &Header, const DWARFDebugAbbrev *DA,
+            const DWARFSection *RS, const DWARFSection *LocSection,
+            StringRef SS, const DWARFSection &SOS, const DWARFSection *AOS,
             const DWARFSection &LS, bool LE, bool IsDWO,
             const DWARFUnitVector &UnitVector);
 
   virtual ~DWARFUnit();
 
+  bool isDWOUnit() const { return isDWO; }
   DWARFContext& getContext() const { return Context; }
   const DWARFSection &getInfoSection() const { return InfoSection; }
+  const DWARFSection *getLocSection() const { return LocSection; }
+  StringRef getLocSectionData() const { return LocSectionData; }
   uint32_t getOffset() const { return Header.getOffset(); }
   const dwarf::FormParams &getFormParams() const {
     return Header.getFormParams();
diff --git a/lib/DebugInfo/DWARF/DWARFDie.cpp b/lib/DebugInfo/DWARF/DWARFDie.cpp
index cf10c1134a7..6b69b822aad 100644
--- a/lib/DebugInfo/DWARF/DWARFDie.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDie.cpp
@@ -101,12 +101,10 @@ static void dumpLocation(raw_ostream &OS, DWARFFormValue &FormValue,
 
   FormValue.dump(OS, DumpOpts);
   if (FormValue.isFormClass(DWARFFormValue::FC_SectionOffset)) {
-    const DWARFSection &LocSection = Obj.getLocSection();
-    const DWARFSection &LocDWOSection = Obj.getLocDWOSection();
     uint32_t Offset = *FormValue.getAsSectionOffset();
-    if (!LocSection.Data.empty()) {
+    if (!U->isDWOUnit()) {
       DWARFDebugLoc DebugLoc;
-      DWARFDataExtractor Data(Obj, LocSection, Ctx.isLittleEndian(),
+      DWARFDataExtractor Data(Obj, *U->getLocSection(), Ctx.isLittleEndian(),
                               Obj.getAddressSize());
       auto LL = DebugLoc.parseOneLocationList(Data, &Offset);
       if (LL) {
@@ -117,8 +115,8 @@ static void dumpLocation(raw_ostream &OS, DWARFFormValue &FormValue,
                  Indent);
       } else
         OS << "error extracting location list.";
-    } else if (!LocDWOSection.Data.empty()) {
-      DataExtractor Data(LocDWOSection.Data, Ctx.isLittleEndian(), 0);
+    } else {
+      DataExtractor Data(U->getLocSectionData(), Ctx.isLittleEndian(), 0);
       auto LL = DWARFDebugLocDWO::parseOneLocationList(Data, &Offset);
       if (LL)
         LL->dump(OS, Ctx.isLittleEndian(), Obj.getAddressSize(), MRI, Indent);
diff --git a/lib/DebugInfo/DWARF/DWARFUnit.cpp b/lib/DebugInfo/DWARF/DWARFUnit.cpp
index 6c3c62d86ef..dbac5a82b57 100644
--- a/lib/DebugInfo/DWARF/DWARFUnit.cpp
+++ b/lib/DebugInfo/DWARF/DWARFUnit.cpp
@@ -39,9 +39,10 @@ void DWARFUnitVector::addUnitsForSection(DWARFContext &C,
                                          DWARFSectionKind SectionKind) {
   const DWARFObject &D = C.getDWARFObj();
   addUnitsImpl(C, D, Section, C.getDebugAbbrev(), &D.getRangeSection(),
-               D.getStringSection(), D.getStringOffsetSection(),
-               &D.getAddrSection(), D.getLineSection(), D.isLittleEndian(),
-               false, false, SectionKind);
+               &D.getLocSection(), D.getStringSection(),
+               D.getStringOffsetSection(), &D.getAddrSection(),
+               D.getLineSection(), D.isLittleEndian(), false, false,
+               SectionKind);
 }
 
 void DWARFUnitVector::addUnitsForDWOSection(DWARFContext &C,
@@ -50,16 +51,18 @@ void DWARFUnitVector::addUnitsForDWOSection(DWARFContext &C,
                                             bool Lazy) {
   const DWARFObject &D = C.getDWARFObj();
   addUnitsImpl(C, D, DWOSection, C.getDebugAbbrevDWO(), &D.getRangeDWOSection(),
-               D.getStringDWOSection(), D.getStringOffsetDWOSection(),
-               &D.getAddrSection(), D.getLineDWOSection(), C.isLittleEndian(),
-               true, Lazy, SectionKind);
+               &D.getLocDWOSection(), D.getStringDWOSection(),
+               D.getStringOffsetDWOSection(), &D.getAddrSection(),
+               D.getLineDWOSection(), C.isLittleEndian(), true, Lazy,
+               SectionKind);
 }
 
 void DWARFUnitVector::addUnitsImpl(
     DWARFContext &Context, const DWARFObject &Obj, const DWARFSection &Section,
-    const DWARFDebugAbbrev *DA, const DWARFSection *RS, StringRef SS,
-    const DWARFSection &SOS, const DWARFSection *AOS, const DWARFSection &LS,
-    bool LE, bool IsDWO, bool Lazy, DWARFSectionKind SectionKind) {
+    const DWARFDebugAbbrev *DA, const DWARFSection *RS,
+    const DWARFSection *LocSection, StringRef SS, const DWARFSection &SOS,
+    const DWARFSection *AOS, const DWARFSection &LS, bool LE, bool IsDWO,
+    bool Lazy, DWARFSectionKind SectionKind) {
   DWARFDataExtractor Data(Obj, Section, LE, 0);
   // Lazy initialization of Parser, now that we have all section info.
   if (!Parser) {
@@ -79,12 +82,12 @@ void DWARFUnitVector::addUnitsImpl(
       std::unique_ptr<DWARFUnit> U;
       if (Header.isTypeUnit())
         U = llvm::make_unique<DWARFTypeUnit>(Context, InfoSection, Header, DA,
-                                             RS, SS, SOS, AOS, LS, LE, IsDWO,
-                                             *this);
+                                             RS, LocSection, SS, SOS, AOS, LS,
+                                             LE, IsDWO, *this);
       else
         U = llvm::make_unique<DWARFCompileUnit>(Context, InfoSection, Header,
-                                                DA, RS, SS, SOS, AOS, LS, LE,
-                                                IsDWO, *this);
+                                                DA, RS, LocSection, SS, SOS,
+                                                AOS, LS, LE, IsDWO, *this);
       return U;
     };
   }
@@ -164,16 +167,25 @@ DWARFUnitVector::getUnitForIndexEntry(const DWARFUnitIndex::Entry &E) {
 }
 
 DWARFUnit::DWARFUnit(DWARFContext &DC, const DWARFSection &Section,
-                     const DWARFUnitHeader &Header,
-                     const DWARFDebugAbbrev *DA, const DWARFSection *RS,
+                     const DWARFUnitHeader &Header, const DWARFDebugAbbrev *DA,
+                     const DWARFSection *RS, const DWARFSection *LocSection,
                      StringRef SS, const DWARFSection &SOS,
                      const DWARFSection *AOS, const DWARFSection &LS, bool LE,
                      bool IsDWO, const DWARFUnitVector &UnitVector)
     : Context(DC), InfoSection(Section), Header(Header), Abbrev(DA),
-      RangeSection(RS), LineSection(LS), StringSection(SS),
-      StringOffsetSection(SOS),  AddrOffsetSection(AOS), isLittleEndian(LE),
-      isDWO(IsDWO), UnitVector(UnitVector) {
+      RangeSection(RS), LocSection(LocSection), LineSection(LS),
+      StringSection(SS), StringOffsetSection(SOS), AddrOffsetSection(AOS),
+      isLittleEndian(LE), isDWO(IsDWO), UnitVector(UnitVector) {
   clear();
+  // For split DWARF we only need to keep track of the location list section's
+  // data (no relocations), and if we are reading a package file, we need to
+  // adjust the location list data based on the index entries.
+  if (IsDWO) {
+    LocSectionData = LocSection->Data;
+    if (auto *IndexEntry = Header.getIndexEntry())
+      if (const auto *C = IndexEntry->getOffset(DW_SECT_LOC))
+        LocSectionData = LocSectionData.substr(C->Offset, C->Length);
+  }
 }
 
 DWARFUnit::~DWARFUnit() = default;
diff --git a/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/lib/DebugInfo/DWARF/DWARFVerifier.cpp
index e78e13bf4af..d30600accd0 100644
--- a/lib/DebugInfo/DWARF/DWARFVerifier.cpp
+++ b/lib/DebugInfo/DWARF/DWARFVerifier.cpp
@@ -325,9 +325,10 @@ unsigned DWARFVerifier::verifyUnitSection(const DWARFSection &S,
       case dwarf::DW_UT_split_type: {
         Unit = TypeUnitVector.addUnit(llvm::make_unique<DWARFTypeUnit>(
             DCtx, S, Header, DCtx.getDebugAbbrev(), &DObj.getRangeSection(),
-            DObj.getStringSection(), DObj.getStringOffsetSection(),
-            &DObj.getAppleObjCSection(), DObj.getLineSection(),
-            DCtx.isLittleEndian(), false, TypeUnitVector));
+            &DObj.getLocSection(), DObj.getStringSection(),
+            DObj.getStringOffsetSection(), &DObj.getAppleObjCSection(),
+            DObj.getLineSection(), DCtx.isLittleEndian(), false,
+            TypeUnitVector));
         break;
       }
       case dwarf::DW_UT_skeleton:
@@ -338,9 +339,10 @@ unsigned DWARFVerifier::verifyUnitSection(const DWARFSection &S,
       case 0: {
         Unit = CompileUnitVector.addUnit(llvm::make_unique<DWARFCompileUnit>(
             DCtx, S, Header, DCtx.getDebugAbbrev(), &DObj.getRangeSection(),
-            DObj.getStringSection(), DObj.getStringOffsetSection(),
-            &DObj.getAppleObjCSection(), DObj.getLineSection(),
-            DCtx.isLittleEndian(), false, CompileUnitVector));
+            &DObj.getLocSection(), DObj.getStringSection(),
+            DObj.getStringOffsetSection(), &DObj.getAppleObjCSection(),
+            DObj.getLineSection(), DCtx.isLittleEndian(), false,
+            CompileUnitVector));
         break;
       }
       default: { llvm_unreachable("Invalid UnitType."); }
diff --git a/test/DebugInfo/Inputs/loclists-dwp-b.ll b/test/DebugInfo/Inputs/loclists-dwp-b.ll
new file mode 100644
index 00000000000..77081bd7c28
--- /dev/null
+++ b/test/DebugInfo/Inputs/loclists-dwp-b.ll
@@ -0,0 +1,32 @@
+target triple = "x86_64-unknown-linux-gnu"
+
+define dso_local void @_Z1bi(i32 %i) local_unnamed_addr !dbg !7 {
+entry:
+  call void @llvm.dbg.value(metadata i32 %i, metadata !12, metadata !DIExpression()), !dbg !13
+  tail call void asm sideeffect "", "~{rdi},~{dirflag},~{fpsr},~{flags}"() , !dbg !14, !srcloc !15
+  ret void, !dbg !16
+}
+
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 8.0.0 (https://git.llvm.org/git/clang.git/ 41055c6168135fe539801799e5c5636247cf0302) (https://git.llvm.org/git/llvm.git/ de0558be123ffbb5b5bd692c17dbd57a75fe684f)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None)
+!1 = !DIFile(filename: "b.cpp", directory: "/home/test/PRs/PR38990")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{!"clang version 8.0.0 (https://git.llvm.org/git/clang.git/ 41055c6168135fe539801799e5c5636247cf0302) (https://git.llvm.org/git/llvm.git/ de0558be123ffbb5b5bd692c17dbd57a75fe684f)"}
+!7 = distinct !DISubprogram(name: "b", linkageName: "_Z1bi", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !11)
+!8 = !DISubroutineType(types: !9)
+!9 = !{null, !10}
+!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!11 = !{!12}
+!12 = !DILocalVariable(name: "i", arg: 1, scope: !7, file: !1, line: 1, type: !10)
+!13 = !DILocation(line: 1, column: 12, scope: !7)
+!14 = !DILocation(line: 1, column: 17, scope: !7)
+!15 = !{i32 22}
+!16 = !DILocation(line: 1, column: 38, scope: !7)
diff --git a/test/DebugInfo/X86/loclists-dwp.ll b/test/DebugInfo/X86/loclists-dwp.ll
new file mode 100644
index 00000000000..a5ce92270d6
--- /dev/null
+++ b/test/DebugInfo/X86/loclists-dwp.ll
@@ -0,0 +1,62 @@
+; RUN: llc -split-dwarf-file=%t1.dwo -filetype=obj -o %t1.o < %s
+; RUN: llc -split-dwarf-file=%t2.dwo -filetype=obj -o %t2.o < %p/../Inputs/loclists-dwp-b.ll 
+; RUN: llvm-dwp %t1.o %t2.o -o %t.dwp
+; RUN: llvm-dwarfdump -v %t.dwp | FileCheck %s
+
+; Make sure that 2 location lists from different units within a dwp file are 
+; dumped correctly. The 2 location lists differ in the length of their address
+; ranges.
+; 
+; Generate both .ll files with clang -S -emit-llvm from the following sources:
+; a.cpp:
+; void y();
+; void a(int i) {
+;   y();
+;   asm("" : : : "rdi");
+; }
+;
+; b.cpp:
+; void b(int i) { asm("" : : : "rdi"); }
+
+; CHECK:      DW_AT_location [DW_FORM_sec_offset]   (0x00000000
+; CHECK-NEXT: Addr idx 0 (w/ length 6): DW_OP_reg5 RDI)
+
+; CHECK:      DW_AT_location [DW_FORM_sec_offset]   (0x00000000
+; CHECK-NEXT: Addr idx 0 (w/ length 0): DW_OP_reg5 RDI)
+
+target triple = "x86_64-unknown-linux-gnu"
+
+define dso_local void @_Z1ai(i32 %i) local_unnamed_addr !dbg !7 {
+entry:
+  call void @llvm.dbg.value(metadata i32 %i, metadata !12, metadata !DIExpression()), !dbg !13
+  tail call void @_Z1yv(), !dbg !14
+  tail call void asm sideeffect "", "~{rdi},~{dirflag},~{fpsr},~{flags}"(), !dbg !15, !srcloc !16
+  ret void, !dbg !17
+}
+
+declare dso_local void @_Z1yv() local_unnamed_addr
+
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 8.0.0 (https://git.llvm.org/git/clang.git/ 41055c6168135fe539801799e5c5636247cf0302) (https://git.llvm.org/git/llvm.git/ de0558be123ffbb5b5bd692c17dbd57a75fe684f)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None)
+!1 = !DIFile(filename: "a.cpp", directory: "/home/test/PRs/PR38990")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{!"clang version 8.0.0 (https://git.llvm.org/git/clang.git/ 41055c6168135fe539801799e5c5636247cf0302) (https://git.llvm.org/git/llvm.git/ de0558be123ffbb5b5bd692c17dbd57a75fe684f)"}
+!7 = distinct !DISubprogram(name: "a", linkageName: "_Z1ai", scope: !1, file: !1, line: 2, type: !8, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !11)
+!8 = !DISubroutineType(types: !9)
+!9 = !{null, !10}
+!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!11 = !{!12}
+!12 = !DILocalVariable(name: "i", arg: 1, scope: !7, file: !1, line: 2, type: !10)
+!13 = !DILocation(line: 2, column: 12, scope: !7)
+!14 = !DILocation(line: 3, column: 3, scope: !7)
+!15 = !DILocation(line: 4, column: 3, scope: !7)
+!16 = !{i32 41}
+!17 = !DILocation(line: 5, column: 1, scope: !7)
-- 
GitLab


From 582cb39ac306b0eb355c582df76f77424dbc1e1c Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Fri, 19 Oct 2018 19:24:42 +0000
Subject: [PATCH 0330/1116] [X86] In PostprocessISelDAG, start from
 allnodes_end, not the root.

There is no guarantee the root is at the end if isel created any nodes without morphing them. This includes the nodes created by manual isel from C++ code in X86ISelDAGToDAG.

This is similar to r333415 from PowerPC which is where I originally stole the peephole loop from.

I don't have a test case, but without this a future patch doesn't work which is how I found it.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344808 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelDAGToDAG.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index d6bcdcdf149..efd6349871f 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -849,8 +849,7 @@ void X86DAGToDAGISel::PostprocessISelDAG() {
 
   // Attempt to remove vectors moves that were inserted to zero upper bits.
 
-  SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
-  ++Position;
+  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
 
   while (Position != CurDAG->allnodes_begin()) {
     SDNode *N = &*--Position;
-- 
GitLab


From 4505dff9b286ed80786b69e5a9705b8869723071 Mon Sep 17 00:00:00 2001
From: Aditya Nandakumar <aditya_nandakumar@apple.com>
Date: Fri, 19 Oct 2018 20:11:52 +0000
Subject: [PATCH 0331/1116] [GISel]: Allow PHIs to be DCEd

https://reviews.llvm.org/D53304

Currently dead phis are not cleaned up during DCE. This patch allows
dead PHI and G_PHI insts to be deleted.

Reviewed by: dsanders

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344811 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/GlobalISel/Utils.cpp                 | 2 +-
 test/CodeGen/AArch64/GlobalISel/legalize-phi.mir | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/CodeGen/GlobalISel/Utils.cpp b/lib/CodeGen/GlobalISel/Utils.cpp
index 1a5f88743d5..4d3a3753559 100644
--- a/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/lib/CodeGen/GlobalISel/Utils.cpp
@@ -137,7 +137,7 @@ bool llvm::isTriviallyDead(const MachineInstr &MI,
   // If we can move an instruction, we can remove it.  Otherwise, it has
   // a side-effect of some sort.
   bool SawStore = false;
-  if (!MI.isSafeToMove(/*AA=*/nullptr, SawStore))
+  if (!MI.isSafeToMove(/*AA=*/nullptr, SawStore) && !MI.isPHI())
     return false;
 
   // Instructions without side-effects are dead iff they only define dead vregs.
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-phi.mir b/test/CodeGen/AArch64/GlobalISel/legalize-phi.mir
index 7c4bbfcc63f..d8f2542d907 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-phi.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-phi.mir
@@ -121,6 +121,7 @@ body:             |
 
   bb.3:
     %9(s1) = G_PHI %5(s1), %bb.1, %8(s1), %bb.2
+    %11:_(s1) = G_PHI %5(s1), %bb.1, %8(s1), %bb.2
     %10(s32) = G_ZEXT %9(s1)
     $w0 = COPY %10(s32)
     RET_ReallyLR implicit $w0
-- 
GitLab


From 1c40da960520e63515ecca7de06adc35b7c237d4 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Fri, 19 Oct 2018 20:17:05 +0000
Subject: [PATCH 0332/1116] Fix typos in assert message

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344812 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/SlotIndexes.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/llvm/CodeGen/SlotIndexes.h b/include/llvm/CodeGen/SlotIndexes.h
index b6e5088b986..8c8a7be459f 100644
--- a/include/llvm/CodeGen/SlotIndexes.h
+++ b/include/llvm/CodeGen/SlotIndexes.h
@@ -448,7 +448,7 @@ class raw_ostream;
     /// MI is not required to have an index.
     SlotIndex getIndexBefore(const MachineInstr &MI) const {
       const MachineBasicBlock *MBB = MI.getParent();
-      assert(MBB && "MI must be inserted inna basic block");
+      assert(MBB && "MI must be inserted in a basic block");
       MachineBasicBlock::const_iterator I = MI, B = MBB->begin();
       while (true) {
         if (I == B)
@@ -465,7 +465,7 @@ class raw_ostream;
     /// MI is not required to have an index.
     SlotIndex getIndexAfter(const MachineInstr &MI) const {
       const MachineBasicBlock *MBB = MI.getParent();
-      assert(MBB && "MI must be inserted inna basic block");
+      assert(MBB && "MI must be inserted in a basic block");
       MachineBasicBlock::const_iterator I = MI, E = MBB->end();
       while (true) {
         ++I;
-- 
GitLab


From 87f7bbe2f2383d650114a5494af1de02afcb1b0d Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Fri, 19 Oct 2018 20:44:33 +0000
Subject: [PATCH 0333/1116] [X86] Remove some left over code from when MVT:i1
 was a legal type for AVX512.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344813 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86FastISel.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index 888c43afd8a..a49ad8bd59d 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -3734,9 +3734,6 @@ unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) {
   switch (VT.SimpleTy) {
   default: llvm_unreachable("Unexpected value type");
   case MVT::i1:
-    // TODO: Support this properly.
-    if (Subtarget->hasAVX512())
-      return 0;
     VT = MVT::i8;
     LLVM_FALLTHROUGH;
   case MVT::i8:  Opc = X86::MOV8ri;  break;
-- 
GitLab


From 71c6b614f3dddacdc98e62b6e2a682b44d3bee32 Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.menezes@samsung.com>
Date: Fri, 19 Oct 2018 20:57:45 +0000
Subject: [PATCH 0334/1116] [NFC][InstCombine] Undo stray change

Undo stray change introduced by r344725.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344814 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Utils/SimplifyLibCalls.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 63229bf0399..a50575b0256 100644
--- a/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -1227,14 +1227,14 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilder<> &B) {
       default:
         return nullptr;
       case LibFunc_expf:  case LibFunc_exp:  case LibFunc_expl:
-        ExpName = "exp";
+        ExpName = TLI->getName(LibFunc_exp);
         ID = Intrinsic::exp;
         LibFnFloat = LibFunc_expf;
         LibFnDouble = LibFunc_exp;
         LibFnLongDouble = LibFunc_expl;
         break;
       case LibFunc_exp2f: case LibFunc_exp2: case LibFunc_exp2l:
-        ExpName = "exp2";
+        ExpName = TLI->getName(LibFunc_exp2);
         ID = Intrinsic::exp2;
         LibFnFloat = LibFunc_exp2f;
         LibFnDouble = LibFunc_exp2;
-- 
GitLab


From 595770b8c9c156a48b2783b17695114bda3de826 Mon Sep 17 00:00:00 2001
From: Changpeng Fang <changpeng.fang@gmail.com>
Date: Fri, 19 Oct 2018 21:09:21 +0000
Subject: [PATCH 0335/1116] AMDGPU: Add support pattern for SUB of one bit

Summary:
  Add selection patterns to support one bit Sub.

Reviewers:
  rampitec, arsenm

Differential Revision:
  https://reviews.llvm.org/D52946

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344815 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/SIInstructions.td | 10 ++++++
 test/CodeGen/AMDGPU/add_i1.ll       | 26 ++++++++++++++++
 test/CodeGen/AMDGPU/sub_i1.ll       | 47 +++++++++++++++++++++++++++++
 3 files changed, 83 insertions(+)
 create mode 100644 test/CodeGen/AMDGPU/sub_i1.ll

diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index c6043ea1c24..1336a576e84 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -1327,11 +1327,21 @@ def : GCNPat <
   (S_XOR_B64 $src0, $src1)
 >;
 
+def : GCNPat <
+  (i1 (sub i1:$src0, i1:$src1)),
+  (S_XOR_B64 $src0, $src1)
+>;
+
 let AddedComplexity = 1 in {
 def : GCNPat <
   (i1 (add i1:$src0, (i1 -1))),
   (S_NOT_B64 $src0)
 >;
+
+def : GCNPat <
+  (i1 (sub i1:$src0, (i1 -1))),
+  (S_NOT_B64 $src0)
+>;
 }
 
 def : GCNPat <
diff --git a/test/CodeGen/AMDGPU/add_i1.ll b/test/CodeGen/AMDGPU/add_i1.ll
index 1f44940018c..fb3b69ca3bd 100644
--- a/test/CodeGen/AMDGPU/add_i1.ll
+++ b/test/CodeGen/AMDGPU/add_i1.ll
@@ -19,3 +19,29 @@ define amdgpu_kernel void @add_var_imm_i1(i1 addrspace(1)* %out, i1 addrspace(1)
   store i1 %add, i1 addrspace(1)* %out
   ret void
 }
+
+; GCN-LABEL: {{^}}add_i1_cf:
+; GCN: v_cmp_ne_u32_e32 vcc, 0, {{v[0-9]+}}
+; GCN-NEXT: s_not_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc
+define amdgpu_kernel void @add_i1_cf(i1 addrspace(1)* %out, i1 addrspace(1)* %a, i1 addrspace(1)* %b) {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %d_cmp = icmp ult i32 %tid, 16
+  br i1 %d_cmp, label %if, label %else
+
+if:
+  %0 = load volatile i1, i1 addrspace(1)* %a
+  br label %endif
+
+else:
+  %1 = load volatile i1, i1 addrspace(1)* %b
+  br label %endif
+
+endif:
+  %2 = phi i1 [%0, %if], [%1, %else]
+  %3 = add i1 %2, -1
+  store i1 %3, i1 addrspace(1)* %out
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/test/CodeGen/AMDGPU/sub_i1.ll b/test/CodeGen/AMDGPU/sub_i1.ll
new file mode 100644
index 00000000000..70562a59f0a
--- /dev/null
+++ b/test/CodeGen/AMDGPU/sub_i1.ll
@@ -0,0 +1,47 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+
+; GCN-LABEL: {{^}}sub_var_var_i1:
+; GCN: s_xor_b64
+define amdgpu_kernel void @sub_var_var_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) {
+  %a = load volatile i1, i1 addrspace(1)* %in0
+  %b = load volatile i1, i1 addrspace(1)* %in1
+  %sub = sub i1 %a, %b
+  store i1 %sub, i1 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}sub_var_imm_i1:
+; GCN: s_not_b64
+define amdgpu_kernel void @sub_var_imm_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) {
+  %a = load volatile i1, i1 addrspace(1)* %in
+  %sub = sub i1 %a, 1
+  store i1 %sub, i1 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}sub_i1_cf:
+; GCN: v_cmp_ne_u32_e32 vcc, 0, {{v[0-9]+}}
+; GCN-NEXT: s_not_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc
+define amdgpu_kernel void @sub_i1_cf(i1 addrspace(1)* %out, i1 addrspace(1)* %a, i1 addrspace(1)* %b) {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %d_cmp = icmp ult i32 %tid, 16
+  br i1 %d_cmp, label %if, label %else
+
+if:
+  %0 = load volatile i1, i1 addrspace(1)* %a
+  br label %endif
+
+else:
+  %1 = load volatile i1, i1 addrspace(1)* %b
+  br label %endif
+
+endif:
+  %2 = phi i1 [%0, %if], [%1, %else]
+  %3 = sub i1 %2, -1
+  store i1 %3, i1 addrspace(1)* %out
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
-- 
GitLab


From 97a6779252c2613585a8519f64468e77c420daa3 Mon Sep 17 00:00:00 2001
From: Thomas Lively <tlively@google.com>
Date: Fri, 19 Oct 2018 21:11:43 +0000
Subject: [PATCH 0336/1116] [LoopVectorize] Loop vectorization for minimum and
 maximum

Summary: Depends on D52766.

Reviewers: aheejin, dschuff

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D52767

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344816 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Analysis/VectorUtils.cpp               |  2 +
 test/Transforms/LoopVectorize/intrinsic.ll | 56 ++++++++++++++++++++++
 test/Transforms/Scalarizer/intrinsics.ll   | 24 ++++++++++
 3 files changed, 82 insertions(+)

diff --git a/lib/Analysis/VectorUtils.cpp b/lib/Analysis/VectorUtils.cpp
index e14449b8838..5fd6fe0ef31 100644
--- a/lib/Analysis/VectorUtils.cpp
+++ b/lib/Analysis/VectorUtils.cpp
@@ -54,6 +54,8 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
   case Intrinsic::fabs:
   case Intrinsic::minnum:
   case Intrinsic::maxnum:
+  case Intrinsic::minimum:
+  case Intrinsic::maximum:
   case Intrinsic::copysign:
   case Intrinsic::floor:
   case Intrinsic::ceil:
diff --git a/test/Transforms/LoopVectorize/intrinsic.ll b/test/Transforms/LoopVectorize/intrinsic.ll
index 178d602e7f3..203c4435c88 100644
--- a/test/Transforms/LoopVectorize/intrinsic.ll
+++ b/test/Transforms/LoopVectorize/intrinsic.ll
@@ -1247,3 +1247,59 @@ for.body:                                         ; preds = %entry, %for.body
 for.end:                                          ; preds = %for.body, %entry
   ret void
 }
+
+declare float @llvm.minimum.f32(float, float) nounwind readnone
+
+;CHECK-LABEL: @minimum_f32(
+;CHECK: llvm.minimum.v4f32
+;CHECK: ret void
+define void @minimum_f32(i32 %n, float* noalias %y, float* noalias %x, float* noalias %z) nounwind uwtable {
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %z, i64 %indvars.iv
+  %1 = load float, float* %arrayidx2, align 4
+  %call = tail call float @llvm.minimum.f32(float %0, float %1) nounwind readnone
+  %arrayidx4 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx4, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare float @llvm.maximum.f32(float, float) nounwind readnone
+
+;CHECK-LABEL: @maximum_f32(
+;CHECK: llvm.maximum.v4f32
+;CHECK: ret void
+define void @maximum_f32(i32 %n, float* noalias %y, float* noalias %x, float* noalias %z) nounwind uwtable {
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %z, i64 %indvars.iv
+  %1 = load float, float* %arrayidx2, align 4
+  %call = tail call float @llvm.maximum.f32(float %0, float %1) nounwind readnone
+  %arrayidx4 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx4, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
diff --git a/test/Transforms/Scalarizer/intrinsics.ll b/test/Transforms/Scalarizer/intrinsics.ll
index 6c85ac3d092..7cebdffab7c 100644
--- a/test/Transforms/Scalarizer/intrinsics.ll
+++ b/test/Transforms/Scalarizer/intrinsics.ll
@@ -5,6 +5,8 @@ declare <2 x float> @llvm.sqrt.v2f32(<2 x float>)
 
 ; Binary fp
 declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>)
+declare <2 x float> @llvm.minimum.v2f32(<2 x float>, <2 x float>)
+declare <2 x float> @llvm.maximum.v2f32(<2 x float>, <2 x float>)
 
 ; Ternary fp
 declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
@@ -40,6 +42,28 @@ define <2 x float> @scalarize_minnum_v2f32(<2 x float> %x, <2 x float> %y) #0 {
   ret <2 x float> %minnum
 }
 
+; CHECK-LABEL: @scalarize_minimum_v2f32(
+; CHECK: %minimum.i0 = call float @llvm.minimum.f32(float %x.i0, float %y.i0)
+; CHECK: %minimum.i1 = call float @llvm.minimum.f32(float %x.i1, float %y.i1)
+; CHECK: %minimum.upto0 = insertelement <2 x float> undef, float %minimum.i0, i32 0
+; CHECK: %minimum = insertelement <2 x float> %minimum.upto0, float %minimum.i1, i32 1
+; CHECK: ret <2 x float> %minimum
+define <2 x float> @scalarize_minimum_v2f32(<2 x float> %x, <2 x float> %y) #0 {
+  %minimum = call <2 x float> @llvm.minimum.v2f32(<2 x float> %x, <2 x float> %y)
+  ret <2 x float> %minimum
+}
+
+; CHECK-LABEL: @scalarize_maximum_v2f32(
+; CHECK: %maximum.i0 = call float @llvm.maximum.f32(float %x.i0, float %y.i0)
+; CHECK: %maximum.i1 = call float @llvm.maximum.f32(float %x.i1, float %y.i1)
+; CHECK: %maximum.upto0 = insertelement <2 x float> undef, float %maximum.i0, i32 0
+; CHECK: %maximum = insertelement <2 x float> %maximum.upto0, float %maximum.i1, i32 1
+; CHECK: ret <2 x float> %maximum
+define <2 x float> @scalarize_maximum_v2f32(<2 x float> %x, <2 x float> %y) #0 {
+  %maximum = call <2 x float> @llvm.maximum.v2f32(<2 x float> %x, <2 x float> %y)
+  ret <2 x float> %maximum
+}
+
 ; CHECK-LABEL: @scalarize_fma_v2f32(
 ; CHECK: %fma.i0 = call float @llvm.fma.f32(float %x.i0, float %y.i0, float %z.i0)
 ; CHECK: %fma.i1 = call float @llvm.fma.f32(float %x.i1, float %y.i1, float %z.i1)
-- 
GitLab


From 95eb91571b2a31d02953a88b9a3fefb618ba2f3e Mon Sep 17 00:00:00 2001
From: Petar Jovanovic <petar.jovanovic@mips.com>
Date: Fri, 19 Oct 2018 22:16:49 +0000
Subject: [PATCH 0337/1116] [llvm-objdump] Fix --file-headers (-f) option

Changed the format call to match the surrounding code. Previously it was
printing an unsigned int while the return type being printed was
long unsigned int or wider. This caused problems for big-endian systems
which were discovered on mips64.
Also, the printed address had less characters than it should because the
character count was directly obtained from the number of bytes in the
address.
The tests were adapted to fit this fix and now use longer addresses.

Patch by Milos Stojanovic.

Differential Revision: https://reviews.llvm.org/D53403


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344818 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/tools/llvm-objdump/file-headers-coff.test | 2 +-
 test/tools/llvm-objdump/file-headers-elf.test  | 4 ++--
 test/tools/llvm-objdump/file-headers-pe.test   | 4 ++--
 tools/llvm-objdump/llvm-objdump.cpp            | 5 ++++-
 4 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/test/tools/llvm-objdump/file-headers-coff.test b/test/tools/llvm-objdump/file-headers-coff.test
index 784b0124a2d..144532d6fd9 100644
--- a/test/tools/llvm-objdump/file-headers-coff.test
+++ b/test/tools/llvm-objdump/file-headers-coff.test
@@ -10,4 +10,4 @@ sections:
 symbols:
 
 # CHECK: architecture: i386
-# CHECK: start address: 0x0000
+# CHECK: start address: 0x00000000
diff --git a/test/tools/llvm-objdump/file-headers-elf.test b/test/tools/llvm-objdump/file-headers-elf.test
index ade59cf05da..397b9035bd4 100644
--- a/test/tools/llvm-objdump/file-headers-elf.test
+++ b/test/tools/llvm-objdump/file-headers-elf.test
@@ -8,7 +8,7 @@ FileHeader:
   Data:            ELFDATA2LSB
   Type:            ET_REL
   Machine:         EM_X86_64
-  Entry:           0x123456
+  Entry:           0x123456789abcde
 
 # CHECK: architecture: x86_64
-# CHECK: start address: 0x00123456
+# CHECK: start address: 0x00123456789abcde
diff --git a/test/tools/llvm-objdump/file-headers-pe.test b/test/tools/llvm-objdump/file-headers-pe.test
index 1e2fb2c4c3d..68c086163bb 100644
--- a/test/tools/llvm-objdump/file-headers-pe.test
+++ b/test/tools/llvm-objdump/file-headers-pe.test
@@ -7,7 +7,7 @@ header: !Header
   Machine: IMAGE_FILE_MACHINE_I386
   Characteristics: [ IMAGE_FILE_DEBUG_STRIPPED ]
 OptionalHeader:
-  AddressOfEntryPoint: 0x1234
+  AddressOfEntryPoint: 0x123456
 # Unfortunately, all these flags are mandatory to set AddressOfEntryPoint.
 # All the values are randomly picked. They can't interfere in what
 # we are testing here.
@@ -30,4 +30,4 @@ sections:
 symbols:
 
 # CHECK: architecture: i386
-# CHECK: start address: 0x1234
+# CHECK: start address: 0x00123456
diff --git a/tools/llvm-objdump/llvm-objdump.cpp b/tools/llvm-objdump/llvm-objdump.cpp
index 64c3823295c..7107966b18d 100644
--- a/tools/llvm-objdump/llvm-objdump.cpp
+++ b/tools/llvm-objdump/llvm-objdump.cpp
@@ -2220,8 +2220,11 @@ static void printFileHeaders(const ObjectFile *o) {
   Expected<uint64_t> StartAddrOrErr = o->getStartAddress();
   if (!StartAddrOrErr)
     report_error(o->getFileName(), StartAddrOrErr.takeError());
+
+  StringRef Fmt = o->getBytesInAddress() > 4 ? "%016" PRIx64 : "%08" PRIx64;
+  uint64_t Address = StartAddrOrErr.get();
   outs() << "start address: "
-         << format("0x%0*x", o->getBytesInAddress(), StartAddrOrErr.get())
+         << "0x" << format(Fmt.data(), Address)
          << "\n";
 }
 
-- 
GitLab


From 315f6cf87cde1b71ce91fc3108f173dad114bf6d Mon Sep 17 00:00:00 2001
From: Roman Tereshin <rtereshin@apple.com>
Date: Sat, 20 Oct 2018 00:06:15 +0000
Subject: [PATCH 0338/1116] [MachineCSE][GlobalISel] Making sure MachineCSE
 works mid-GlobalISel (again)

Change of approach, it looks like it's a much better idea to deal with
the vregs that have LLTs and reg classes both properly, than trying to
avoid creating those across all GlobalISel passes and all targets.

The change mostly touches MachineRegisterInfo::constrainRegClass,
which is apparently only used by MachineCSE. The changes are NFC for
any pipeline but one that contains MachineCSE mid-GlobalISel.

NOTE on isCallerPreservedOrConstPhysReg change in MachineCSE:

    There is no test covering it as the only way to insert a new pass
(MachineCSE) from a command line I know of is llc's -run-pass option,
which only works with MIR, but MIRParser freezes reserved registers upon
MachineFunctions creation, making it impossible to reproduce the state
that exposes the issue.

Reviwed By: aditya_nandakumar

Differential Revision: https://reviews.llvm.org/D53144

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344822 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/MachineRegisterInfo.h    | 15 +++--
 lib/CodeGen/MachineCSE.cpp                    | 17 +++++-
 lib/CodeGen/MachineRegisterInfo.cpp           | 55 ++++++++-----------
 .../GlobalISel/machine-cse-mid-pipeline.mir   | 38 +++++++++++--
 4 files changed, 77 insertions(+), 48 deletions(-)

diff --git a/include/llvm/CodeGen/MachineRegisterInfo.h b/include/llvm/CodeGen/MachineRegisterInfo.h
index a6836a53f04..fef010a23ef 100644
--- a/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -689,15 +689,14 @@ public:
                                                unsigned MinNumRegs = 0);
 
   /// Constrain the register class or the register bank of the virtual register
-  /// \p Reg to be a common subclass and a common bank of both registers
-  /// provided respectively. Do nothing if any of the attributes (classes,
-  /// banks, or low-level types) of the registers are deemed incompatible, or if
-  /// the resulting register will have a class smaller than before and of size
-  /// less than \p MinNumRegs. Return true if such register attributes exist,
-  /// false otherwise.
+  /// \p Reg (and low-level type) to be a common subclass or a common bank of
+  /// both registers provided respectively (and a common low-level type). Do
+  /// nothing if any of the attributes (classes, banks, or low-level types) of
+  /// the registers are deemed incompatible, or if the resulting register will
+  /// have a class smaller than before and of size less than \p MinNumRegs.
+  /// Return true if such register attributes exist, false otherwise.
   ///
-  /// \note Assumes that each register has either a low-level type or a class
-  /// assigned, but not both. Use this method instead of constrainRegClass and
+  /// \note Use this method instead of constrainRegClass and
   /// RegisterBankInfo::constrainGenericRegister everywhere but SelectionDAG
   /// ISel / FastISel and GlobalISel's InstructionSelect pass respectively.
   bool constrainRegAttrs(unsigned Reg, unsigned ConstrainingReg,
diff --git a/lib/CodeGen/MachineCSE.cpp b/lib/CodeGen/MachineCSE.cpp
index dcb6f7cca4f..6ee8571c28a 100644
--- a/lib/CodeGen/MachineCSE.cpp
+++ b/lib/CodeGen/MachineCSE.cpp
@@ -235,6 +235,21 @@ MachineCSE::isPhysDefTriviallyDead(unsigned Reg,
   return false;
 }
 
+static bool isCallerPreservedOrConstPhysReg(unsigned Reg,
+                                            const MachineFunction &MF,
+                                            const TargetRegisterInfo &TRI) {
+  // MachineRegisterInfo::isConstantPhysReg directly called by
+  // MachineRegisterInfo::isCallerPreservedOrConstPhysReg expects the
+  // reserved registers to be frozen. That doesn't cause a problem  post-ISel as
+  // most (if not all) targets freeze reserved registers right after ISel.
+  //
+  // It does cause issues mid-GlobalISel, however, hence the additional
+  // reservedRegsFrozen check.
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  return TRI.isCallerPreservedPhysReg(Reg, MF) ||
+         (MRI.reservedRegsFrozen() && MRI.isConstantPhysReg(Reg));
+}
+
 /// hasLivePhysRegDefUses - Return true if the specified instruction read/write
 /// physical registers (except for dead defs of physical registers). It also
 /// returns the physical register def by reference if it's the only one and the
@@ -254,7 +269,7 @@ bool MachineCSE::hasLivePhysRegDefUses(const MachineInstr *MI,
     if (TargetRegisterInfo::isVirtualRegister(Reg))
       continue;
     // Reading either caller preserved or constant physregs is ok.
-    if (!MRI->isCallerPreservedOrConstPhysReg(Reg))
+    if (!isCallerPreservedOrConstPhysReg(Reg, *MI->getMF(), *TRI))
       for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
         PhysRefs.insert(*AI);
   }
diff --git a/lib/CodeGen/MachineRegisterInfo.cpp b/lib/CodeGen/MachineRegisterInfo.cpp
index 1da99d91760..6e5ca45d5e5 100644
--- a/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/lib/CodeGen/MachineRegisterInfo.cpp
@@ -93,36 +93,29 @@ bool
 MachineRegisterInfo::constrainRegAttrs(unsigned Reg,
                                        unsigned ConstrainingReg,
                                        unsigned MinNumRegs) {
-  auto const *OldRC = getRegClassOrNull(Reg);
-  auto const *RC = getRegClassOrNull(ConstrainingReg);
-  // A virtual register at any point must have either a low-level type
-  // or a class assigned, but not both. The only exception is the internals of
-  // GlobalISel's instruction selection pass, which is allowed to temporarily
-  // introduce registers with types and classes both.
-  assert((OldRC || getType(Reg).isValid()) && "Reg has neither class nor type");
-  assert((!OldRC || !getType(Reg).isValid()) && "Reg has class and type both");
-  assert((RC || getType(ConstrainingReg).isValid()) &&
-         "ConstrainingReg has neither class nor type");
-  assert((!RC || !getType(ConstrainingReg).isValid()) &&
-         "ConstrainingReg has class and type both");
-  if (OldRC && RC)
-    return ::constrainRegClass(*this, Reg, OldRC, RC, MinNumRegs);
-  // If one of the virtual registers is generic (used in generic machine
-  // instructions, has a low-level type, doesn't have a class), and the other is
-  // concrete (used in target specific instructions, doesn't have a low-level
-  // type, has a class), we can not unify them.
-  if (OldRC || RC)
+  const LLT RegTy = getType(Reg);
+  const LLT ConstrainingRegTy = getType(ConstrainingReg);
+  if (RegTy.isValid() && ConstrainingRegTy.isValid() &&
+      RegTy != ConstrainingRegTy)
     return false;
-  // At this point, both registers are guaranteed to have a valid low-level
-  // type, and they must agree.
-  if (getType(Reg) != getType(ConstrainingReg))
-    return false;
-  auto const *OldRB = getRegBankOrNull(Reg);
-  auto const *RB = getRegBankOrNull(ConstrainingReg);
-  if (OldRB)
-    return !RB || RB == OldRB;
-  if (RB)
-    setRegBank(Reg, *RB);
+  const auto ConstrainingRegCB = getRegClassOrRegBank(ConstrainingReg);
+  if (!ConstrainingRegCB.isNull()) {
+    const auto RegCB = getRegClassOrRegBank(Reg);
+    if (RegCB.isNull())
+      setRegClassOrRegBank(Reg, ConstrainingRegCB);
+    else if (RegCB.is<const TargetRegisterClass *>() !=
+             ConstrainingRegCB.is<const TargetRegisterClass *>())
+      return false;
+    else if (RegCB.is<const TargetRegisterClass *>()) {
+      if (!::constrainRegClass(
+              *this, Reg, RegCB.get<const TargetRegisterClass *>(),
+              ConstrainingRegCB.get<const TargetRegisterClass *>(), MinNumRegs))
+        return false;
+    } else if (RegCB != ConstrainingRegCB)
+      return false;
+  }
+  if (ConstrainingRegTy.isValid())
+    setType(Reg, ConstrainingRegTy);
   return true;
 }
 
@@ -188,10 +181,6 @@ unsigned MachineRegisterInfo::cloneVirtualRegister(unsigned VReg,
 }
 
 void MachineRegisterInfo::setType(unsigned VReg, LLT Ty) {
-  // Check that VReg doesn't have a class.
-  assert((getRegClassOrRegBank(VReg).isNull() ||
-         !getRegClassOrRegBank(VReg).is<const TargetRegisterClass *>()) &&
-         "Can't set the size of a non-generic virtual register");
   VRegToType.grow(VReg);
   VRegToType[VReg] = Ty;
 }
diff --git a/test/CodeGen/AArch64/GlobalISel/machine-cse-mid-pipeline.mir b/test/CodeGen/AArch64/GlobalISel/machine-cse-mid-pipeline.mir
index 8ca81a3bd40..667a7690466 100644
--- a/test/CodeGen/AArch64/GlobalISel/machine-cse-mid-pipeline.mir
+++ b/test/CodeGen/AArch64/GlobalISel/machine-cse-mid-pipeline.mir
@@ -130,9 +130,8 @@ regBankSelected: false
 selected:        false
 body:             |
   ; CHECK-LABEL: name: generic_to_concrete_copy
-  ; CHECK:      %[[S1:[0-9]+]]:_(s32) = G_ADD %{{[0-9]+}}, %{{[0-9]+}}
-  ; CHECK-NEXT: %[[S2:[0-9]+]]:gpr32 = COPY %[[S1]](s32)
-  ; CHECK-NEXT: %{{[0-9]+}}:gpr32 = ADDWrr %[[S2]], %[[S2]]
+  ; CHECK:      %[[S1:[0-9]+]]:gpr32(s32) = G_ADD %{{[0-9]+}}, %{{[0-9]+}}
+  ; CHECK-NEXT: %{{[0-9]+}}:gpr32 = ADDWrr %[[S1]](s32), %[[S1]](s32)
   bb.0:
     %0:_(s32) = COPY $w0
     %1:_(s32) = COPY $w1
@@ -149,9 +148,8 @@ regBankSelected: false
 selected:        false
 body:             |
   ; CHECK-LABEL: name: concrete_to_generic_copy
-  ; CHECK:      %[[S1:[0-9]+]]:gpr32 = ADDWrr %{{[0-9]+}}, %{{[0-9]+}}
-  ; CHECK-NEXT: %[[S2:[0-9]+]]:_(s32) = COPY %[[S1]]
-  ; CHECK-NEXT: %{{[0-9]+}}:_(s32) = G_ADD %[[S2]], %[[S2]]
+  ; CHECK:      %[[S1:[0-9]+]]:gpr32(s32) = ADDWrr %{{[0-9]+}}, %{{[0-9]+}}
+  ; CHECK-NEXT: %{{[0-9]+}}:_(s32) = G_ADD %[[S1]], %[[S1]]
   bb.0:
     %0:gpr32 = COPY $w0
     %1:gpr32 = COPY $w1
@@ -278,3 +276,31 @@ body:             |
     $w0 = COPY %23(s32)
     RET_ReallyLR implicit $w0
 ...
+---
+name:            variadic_defs_unmerge_vector_constraints_mix
+legalized:       true
+regBankSelected: false
+selected:        false
+body:             |
+  ; CHECK-LABEL: name: variadic_defs_unmerge_vector_constraints_mix
+  ; CHECK:      [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+  ; CHECK-NEXT: [[UV0:%[0-9]+]]:gpr(s32), [[UV1:%[0-9]+]]:gpr(s32), [[UV2:%[0-9]+]]:gpr32(s32), [[UV3:%[0-9]+]]:gpr32(s32) = G_UNMERGE_VALUES [[COPY]](<4 x s32>)
+  ; CHECK-NEXT: [[ADD0:%[0-9]+]]:_(s32) = G_ADD [[UV0]], [[UV1]]
+  ; CHECK-NEXT: [[ADD1:%[0-9]+]]:gpr32(s32) = ADDWrr [[UV2]](s32), [[UV3]](s32)
+  ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD0]], [[ADD1]]
+  ; CHECK-NEXT: $w0 = COPY [[ADD2]](s32)
+  ; CHECK-NEXT: RET_ReallyLR implicit $w0
+  bb.0:
+    %0 :_(<4 x s32>) = COPY $q0
+    %1 :_(s32), %2 : _ (s32), %3 :_(s32), %4 :  _  (s32) = G_UNMERGE_VALUES %0(<4 x s32>)
+    %5 :_(s32), %6 :gpr(s32), %7 :_(s32), %8 :  _  (s32) = G_UNMERGE_VALUES %0(<4 x s32>)
+    %9 :_(s32), %10: _ (s32), %11:_(s32), %12:  _  (s32) = G_UNMERGE_VALUES %0(<4 x s32>)
+    %13:_(s32), %14: _ (s32), %15:_(s32), %16:gpr32(s32) = G_UNMERGE_VALUES %0(<4 x s32>)
+    %21:gpr(s32) = COPY %1(s32)
+    %17:_(s32) = G_ADD %21, %6
+    %18:gpr32 = COPY %11(s32)
+    %19:gpr32(s32) = ADDWrr %18, %16
+    %20:_(s32) = G_ADD %17, %19
+    $w0 = COPY %20(s32)
+    RET_ReallyLR implicit $w0
+...
-- 
GitLab


From 7366ef73c4802c89c54e19d619bca572428cc4ac Mon Sep 17 00:00:00 2001
From: Thomas Lively <tlively@google.com>
Date: Sat, 20 Oct 2018 01:31:18 +0000
Subject: [PATCH 0339/1116] [WebAssembly] Custom lower i64x2 constant shifts to
 avoid wrap

Summary: Depends on D53057.

Reviewers: aheejin, dschuff

Subscribers: sbc100, jgravelle-google, sunfish, llvm-commits

Differential Revision: https://reviews.llvm.org/D53251

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344825 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/WebAssembly/WebAssemblyISD.def     |  3 ++
 .../WebAssembly/WebAssemblyISelLowering.cpp   | 38 +++++++++++++++++++
 .../WebAssembly/WebAssemblyISelLowering.h     |  1 +
 .../WebAssembly/WebAssemblyInstrSIMD.td       | 13 +++++++
 test/CodeGen/WebAssembly/simd-arith.ll        | 29 ++++++++++++--
 5 files changed, 81 insertions(+), 3 deletions(-)

diff --git a/lib/Target/WebAssembly/WebAssemblyISD.def b/lib/Target/WebAssembly/WebAssemblyISD.def
index 3c44d04598c..444a087605e 100644
--- a/lib/Target/WebAssembly/WebAssemblyISD.def
+++ b/lib/Target/WebAssembly/WebAssemblyISD.def
@@ -22,5 +22,8 @@ HANDLE_NODETYPE(Wrapper)
 HANDLE_NODETYPE(BR_IF)
 HANDLE_NODETYPE(BR_TABLE)
 HANDLE_NODETYPE(SHUFFLE)
+HANDLE_NODETYPE(VEC_SHL)
+HANDLE_NODETYPE(VEC_SHR_S)
+HANDLE_NODETYPE(VEC_SHR_U)
 
 // add memory opcodes starting at ISD::FIRST_TARGET_MEMORY_OPCODE here...
diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 6ca619c910a..e6fe1f85487 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -137,6 +137,11 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
     }
   }
 
+  // Custom lowering to avoid having to emit a wrap for 2xi64 constant shifts
+  if (Subtarget->hasSIMD128() && EnableUnimplementedWasmSIMDInstrs)
+    for (auto Op : {ISD::SHL, ISD::SRA, ISD::SRL})
+      setOperationAction(Op, MVT::v2i64, Custom);
+
   // As a special case, these operators use the type to mean the type to
   // sign-extend from.
   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
@@ -823,6 +828,10 @@ SDValue WebAssemblyTargetLowering::LowerOperation(SDValue Op,
     return LowerINTRINSIC_WO_CHAIN(Op, DAG);
   case ISD::VECTOR_SHUFFLE:
     return LowerVECTOR_SHUFFLE(Op, DAG);
+  case ISD::SHL:
+  case ISD::SRA:
+  case ISD::SRL:
+    return LowerShift(Op, DAG);
   }
 }
 
@@ -1000,6 +1009,35 @@ WebAssemblyTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
   return DAG.getNode(WebAssemblyISD::SHUFFLE, DL, MVT::v16i8, Ops);
 }
 
+SDValue WebAssemblyTargetLowering::LowerShift(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  auto *ShiftVec = dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
+  APInt SplatValue, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (!ShiftVec || !ShiftVec->isConstantSplat(SplatValue, SplatUndef,
+                                              SplatBitSize, HasAnyUndefs))
+    return Op;
+  unsigned Opcode;
+  switch (Op.getOpcode()) {
+  case ISD::SHL:
+    Opcode = WebAssemblyISD::VEC_SHL;
+    break;
+  case ISD::SRA:
+    Opcode = WebAssemblyISD::VEC_SHR_S;
+    break;
+  case ISD::SRL:
+    Opcode = WebAssemblyISD::VEC_SHR_U;
+    break;
+  default:
+    llvm_unreachable("unexpected opcode");
+    return Op;
+  }
+  return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(0),
+                     DAG.getConstant(SplatValue.trunc(32), DL, MVT::i32));
+}
+
 //===----------------------------------------------------------------------===//
 //                          WebAssembly Optimization Hooks
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/lib/Target/WebAssembly/WebAssemblyISelLowering.h
index 7b22651ff6d..61e78c71f2e 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.h
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.h
@@ -99,6 +99,7 @@ private:
   SDValue LowerCopyToReg(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const;
 };
 
 namespace WebAssembly {
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 95c87266273..0b09da7dcef 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -515,6 +515,19 @@ foreach shifts = [[shl, SHL_v2i64], [sra, SHR_S_v2i64], [srl, SHR_U_v2i64]] in
 def : Pat<(v2i64 (shifts[0] (v2i64 V128:$vec), (v2i64 (splat2 I64:$x)))),
           (v2i64 (shifts[1] (v2i64 V128:$vec), (I32_WRAP_I64 I64:$x)))>;
 
+// 2xi64 shifts with constant shift amounts are custom lowered to avoid wrapping
+def wasm_shift_t : SDTypeProfile<1, 2,
+  [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisVT<2, i32>]
+>;
+def wasm_shl : SDNode<"WebAssemblyISD::VEC_SHL", wasm_shift_t>;
+def wasm_shr_s : SDNode<"WebAssemblyISD::VEC_SHR_S", wasm_shift_t>;
+def wasm_shr_u : SDNode<"WebAssemblyISD::VEC_SHR_U", wasm_shift_t>;
+foreach shifts = [[wasm_shl, SHL_v2i64],
+                  [wasm_shr_s, SHR_S_v2i64],
+                  [wasm_shr_u, SHR_U_v2i64]] in
+def : Pat<(v2i64 (shifts[0] (v2i64 V128:$vec), I32:$x)),
+          (v2i64 (shifts[1] (v2i64 V128:$vec), I32:$x))>;
+
 //===----------------------------------------------------------------------===//
 // Bitwise operations
 //===----------------------------------------------------------------------===//
diff --git a/test/CodeGen/WebAssembly/simd-arith.ll b/test/CodeGen/WebAssembly/simd-arith.ll
index 973f78b30dc..689853fa7bb 100644
--- a/test/CodeGen/WebAssembly/simd-arith.ll
+++ b/test/CodeGen/WebAssembly/simd-arith.ll
@@ -605,9 +605,8 @@ define <2 x i64> @shl_nozext_v2i64(<2 x i64> %v, i64 %x) {
 ; NO-SIMD128-NOT: i64x2
 ; SIMD128-NEXT: .param v128{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: i64.const $push[[L0:[0-9]+]]=, 5{{$}}
-; SIMD128-NEXT: i32.wrap/i64 $push[[L1:[0-9]+]]=, $pop[[L0]]{{$}}
-; SIMD128-NEXT: i64x2.shl $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}}
+; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 5{{$}}
+; SIMD128-NEXT: i64x2.shl $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @shl_const_v2i64(<2 x i64> %v) {
   %a = shl <2 x i64> %v, <i64 5, i64 5>
@@ -642,6 +641,18 @@ define <2 x i64> @shr_s_nozext_v2i64(<2 x i64> %v, i64 %x) {
   ret <2 x i64> %a
 }
 
+; CHECK-LABEL: shr_s_const_v2i64:
+; NO-SIMD128-NOT: i64x2
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 5{{$}}
+; SIMD128-NEXT: i64x2.shr_s $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @shr_s_const_v2i64(<2 x i64> %v) {
+  %a = ashr <2 x i64> %v, <i64 5, i64 5>
+  ret <2 x i64> %a
+}
+
 ; CHECK-LABEL: shr_u_v2i64:
 ; NO-SIMD128-NOT: i64x2
 ; SIMD128-NEXT: .param v128, i32{{$}}
@@ -670,6 +681,18 @@ define <2 x i64> @shr_u_nozext_v2i64(<2 x i64> %v, i64 %x) {
   ret <2 x i64> %a
 }
 
+; CHECK-LABEL: shr_u_const_v2i64:
+; NO-SIMD128-NOT: i64x2
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 5{{$}}
+; SIMD128-NEXT: i64x2.shr_u $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @shr_u_const_v2i64(<2 x i64> %v) {
+  %a = lshr <2 x i64> %v, <i64 5, i64 5>
+  ret <2 x i64> %a
+}
+
 ; CHECK-LABEL: and_v2i64:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
-- 
GitLab


From 1a048fad673a557227b261d29d0459b843bd584c Mon Sep 17 00:00:00 2001
From: Thomas Lively <tlively@google.com>
Date: Sat, 20 Oct 2018 01:35:23 +0000
Subject: [PATCH 0340/1116] [WebAssembly] Implement vector sext_inreg and tests
 with comparisons

Summary: Depends on D53251.

Reviewers: aheejin, dschuff

Subscribers: sbc100, jgravelle-google, sunfish, llvm-commits

Differential Revision: https://reviews.llvm.org/D53252

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344826 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../WebAssembly/WebAssemblyISelLowering.cpp   |   4 +
 .../WebAssembly/WebAssemblyInstrSIMD.td       |   9 +-
 test/CodeGen/WebAssembly/simd-comparisons.ll  | 830 +++++++++++++++++-
 test/CodeGen/WebAssembly/simd-sext-inreg.ll   |  65 ++
 4 files changed, 884 insertions(+), 24 deletions(-)
 create mode 100644 test/CodeGen/WebAssembly/simd-sext-inreg.ll

diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index e6fe1f85487..1da66af5560 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -48,6 +48,8 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
 
   // Booleans always contain 0 or 1.
   setBooleanContents(ZeroOrOneBooleanContent);
+  // Except in SIMD vectors
+  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
   // WebAssembly does not produce floating-point exceptions on normal floating
   // point operations.
   setHasFloatingPointExceptions(false);
@@ -149,6 +151,8 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
     for (auto T : {MVT::i8, MVT::i16, MVT::i32})
       setOperationAction(ISD::SIGN_EXTEND_INREG, T, Expand);
   }
+  for (auto T : MVT::integer_vector_valuetypes())
+    setOperationAction(ISD::SIGN_EXTEND_INREG, T, Expand);
 
   // Dynamic stack allocation: use the default expansion.
   setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 0b09da7dcef..711d42a219e 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -613,7 +613,8 @@ multiclass SIMDCondition<ValueType vec_t, ValueType out_t, string vec,
   defm _#vec_t :
     SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs), (outs), (ins),
            [(set (out_t V128:$dst),
-             (setcc (vec_t V128:$lhs), (vec_t V128:$rhs), cond))],
+             (setcc (vec_t V128:$lhs), (vec_t V128:$rhs), cond)
+           )],
            vec#"."#name#"\t$dst, $lhs, $rhs", vec#"."#name, simdop>;
 }
 
@@ -621,15 +622,15 @@ multiclass SIMDConditionInt<string name, CondCode cond, bits<32> baseInst,
                             int step = 1> {
   defm "" : SIMDCondition<v16i8, v16i8, "i8x16", name, cond, baseInst>;
   defm "" : SIMDCondition<v8i16, v8i16, "i16x8", name, cond,
-                              !add(baseInst, step)>;
+                          !add(baseInst, step)>;
   defm "" : SIMDCondition<v4i32, v4i32, "i32x4", name, cond,
-                              !add(!add(baseInst, step), step)>;
+                          !add(!add(baseInst, step), step)>;
 }
 
 multiclass SIMDConditionFP<string name, CondCode cond, bits<32> baseInst> {
   defm "" : SIMDCondition<v4f32, v4i32, "f32x4", name, cond, baseInst>;
   defm "" : SIMDCondition<v2f64, v2i64, "f64x2", name, cond,
-                              !add(baseInst, 1)>;
+                          !add(baseInst, 1)>;
 }
 
 // Equality: eq
diff --git a/test/CodeGen/WebAssembly/simd-comparisons.ll b/test/CodeGen/WebAssembly/simd-comparisons.ll
index 790bbb70646..5f0a1e93b45 100644
--- a/test/CodeGen/WebAssembly/simd-comparisons.ll
+++ b/test/CodeGen/WebAssembly/simd-comparisons.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers -wasm-disable-explicit-locals -wasm-enable-unimplemented-simd -mattr=+simd128,+sign-ext --show-mc-encoding | FileCheck %s --check-prefixes CHECK,SIMD128
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers -wasm-disable-explicit-locals -mattr=+simd128,+sign-ext --show-mc-encoding | FileCheck %s --check-prefixes CHECK,SIMD128-VM
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers -wasm-disable-explicit-locals -mattr=-simd128,+sign-ext --show-mc-encoding | FileCheck %s --check-prefixes CHECK,NO-SIMD128
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers -wasm-disable-explicit-locals -wasm-enable-unimplemented-simd -mattr=+simd128,+sign-ext | FileCheck %s --check-prefixes CHECK,SIMD128
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers -wasm-disable-explicit-locals -mattr=+simd128,+sign-ext | FileCheck %s --check-prefixes CHECK,SIMD128-VM
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers -wasm-disable-explicit-locals -mattr=-simd128,+sign-ext | FileCheck %s --check-prefixes CHECK,NO-SIMD128
 
 ; Test SIMD comparison operators
 
@@ -18,6 +18,18 @@ define <16 x i1> @compare_eq_v16i8 (<16 x i8> %x, <16 x i8> %y) {
   ret <16 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_eq_v16i8:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i8x16.eq $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @compare_sext_eq_v16i8 (<16 x i8> %x, <16 x i8> %y) {
+  %cmp = icmp eq <16 x i8> %x, %y
+  %res = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %res
+}
+
 ; CHECK-LABEL: compare_ne_v16i8:
 ; NO-SIMD128-NOT: i8x16
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -29,6 +41,18 @@ define <16 x i1> @compare_ne_v16i8 (<16 x i8> %x, <16 x i8> %y) {
   ret <16 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ne_v16i8:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i8x16.ne $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @compare_sext_ne_v16i8 (<16 x i8> %x, <16 x i8> %y) {
+  %cmp = icmp ne <16 x i8> %x, %y
+  %res = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %res
+}
+
 ; CHECK-LABEL: compare_slt_v16i8:
 ; NO-SIMD128-NOT: i8x16
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -40,6 +64,18 @@ define <16 x i1> @compare_slt_v16i8 (<16 x i8> %x, <16 x i8> %y) {
   ret <16 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_slt_v16i8:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i8x16.lt_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @compare_sext_slt_v16i8 (<16 x i8> %x, <16 x i8> %y) {
+  %cmp = icmp slt <16 x i8> %x, %y
+  %res = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %res
+}
+
 ; CHECK-LABEL: compare_ult_v16i8:
 ; NO-SIMD128-NOT: i8x16
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -51,6 +87,18 @@ define <16 x i1> @compare_ult_v16i8 (<16 x i8> %x, <16 x i8> %y) {
   ret <16 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ult_v16i8:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i8x16.lt_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @compare_sext_ult_v16i8 (<16 x i8> %x, <16 x i8> %y) {
+  %cmp = icmp ult <16 x i8> %x, %y
+  %res = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %res
+}
+
 ; CHECK-LABEL: compare_sle_v16i8:
 ; NO-SIMD128-NOT: i8x16
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -62,6 +110,18 @@ define <16 x i1> @compare_sle_v16i8 (<16 x i8> %x, <16 x i8> %y) {
   ret <16 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_sle_v16i8:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i8x16.le_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @compare_sext_sle_v16i8 (<16 x i8> %x, <16 x i8> %y) {
+  %cmp = icmp sle <16 x i8> %x, %y
+  %res = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %res
+}
+
 ; CHECK-LABEL: compare_ule_v16i8:
 ; NO-SIMD128-NOT: i8x16
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -73,6 +133,18 @@ define <16 x i1> @compare_ule_v16i8 (<16 x i8> %x, <16 x i8> %y) {
   ret <16 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ule_v16i8:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i8x16.le_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @compare_sext_ule_v16i8 (<16 x i8> %x, <16 x i8> %y) {
+  %cmp = icmp ule <16 x i8> %x, %y
+  %res = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %res
+}
+
 ; CHECK-LABEL: compare_sgt_v16i8:
 ; NO-SIMD128-NOT: i8x16
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -84,6 +156,18 @@ define <16 x i1> @compare_sgt_v16i8 (<16 x i8> %x, <16 x i8> %y) {
   ret <16 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_sgt_v16i8:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i8x16.gt_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @compare_sext_sgt_v16i8 (<16 x i8> %x, <16 x i8> %y) {
+  %cmp = icmp sgt <16 x i8> %x, %y
+  %res = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %res
+}
+
 ; CHECK-LABEL: compare_ugt_v16i8:
 ; NO-SIMD128-NOT: i8x16
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -95,6 +179,18 @@ define <16 x i1> @compare_ugt_v16i8 (<16 x i8> %x, <16 x i8> %y) {
   ret <16 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ugt_v16i8:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i8x16.gt_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @compare_sext_ugt_v16i8 (<16 x i8> %x, <16 x i8> %y) {
+  %cmp = icmp ugt <16 x i8> %x, %y
+  %res = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %res
+}
+
 ; CHECK-LABEL: compare_sge_v16i8:
 ; NO-SIMD128-NOT: i8x16
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -106,6 +202,18 @@ define <16 x i1> @compare_sge_v16i8 (<16 x i8> %x, <16 x i8> %y) {
   ret <16 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_sge_v16i8:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i8x16.ge_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @compare_sext_sge_v16i8 (<16 x i8> %x, <16 x i8> %y) {
+  %cmp = icmp sge <16 x i8> %x, %y
+  %res = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %res
+}
+
 ; CHECK-LABEL: compare_uge_v16i8:
 ; NO-SIMD128-NOT: i8x16
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -117,6 +225,18 @@ define <16 x i1> @compare_uge_v16i8 (<16 x i8> %x, <16 x i8> %y) {
   ret <16 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_uge_v16i8:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i8x16.ge_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @compare_sext_uge_v16i8 (<16 x i8> %x, <16 x i8> %y) {
+  %cmp = icmp uge <16 x i8> %x, %y
+  %res = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %res
+}
+
 ; CHECK-LABEL: compare_eq_v8i16:
 ; NO-SIMD128-NOT: i16x8
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -128,6 +248,18 @@ define <8 x i1> @compare_eq_v8i16 (<8 x i16> %x, <8 x i16> %y) {
   ret <8 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_eq_v8i16:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i16x8.eq $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @compare_sext_eq_v8i16 (<8 x i16> %x, <8 x i16> %y) {
+  %cmp = icmp eq <8 x i16> %x, %y
+  %res = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %res
+}
+
 ; CHECK-LABEL: compare_ne_v8i16:
 ; NO-SIMD128-NOT: i16x8
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -139,6 +271,18 @@ define <8 x i1> @compare_ne_v8i16 (<8 x i16> %x, <8 x i16> %y) {
   ret <8 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ne_v8i16:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i16x8.ne $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @compare_sext_ne_v8i16 (<8 x i16> %x, <8 x i16> %y) {
+  %cmp = icmp ne <8 x i16> %x, %y
+  %res = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %res
+}
+
 ; CHECK-LABEL: compare_slt_v8i16:
 ; NO-SIMD128-NOT: i16x8
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -150,6 +294,18 @@ define <8 x i1> @compare_slt_v8i16 (<8 x i16> %x, <8 x i16> %y) {
   ret <8 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_slt_v8i16:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i16x8.lt_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @compare_sext_slt_v8i16 (<8 x i16> %x, <8 x i16> %y) {
+  %cmp = icmp slt <8 x i16> %x, %y
+  %res = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %res
+}
+
 ; CHECK-LABEL: compare_ult_v8i16:
 ; NO-SIMD128-NOT: i16x8
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -161,6 +317,18 @@ define <8 x i1> @compare_ult_v8i16 (<8 x i16> %x, <8 x i16> %y) {
   ret <8 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ult_v8i16:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i16x8.lt_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @compare_sext_ult_v8i16 (<8 x i16> %x, <8 x i16> %y) {
+  %cmp = icmp ult <8 x i16> %x, %y
+  %res = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %res
+}
+
 ; CHECK-LABEL: compare_sle_v8i16:
 ; NO-SIMD128-NOT: i16x8
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -172,6 +340,18 @@ define <8 x i1> @compare_sle_v8i16 (<8 x i16> %x, <8 x i16> %y) {
   ret <8 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_sle_v8i16:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i16x8.le_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @compare_sext_sle_v8i16 (<8 x i16> %x, <8 x i16> %y) {
+  %cmp = icmp sle <8 x i16> %x, %y
+  %res = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %res
+}
+
 ; CHECK-LABEL: compare_ule_v8i16:
 ; NO-SIMD128-NOT: i16x8
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -183,6 +363,18 @@ define <8 x i1> @compare_ule_v8i16 (<8 x i16> %x, <8 x i16> %y) {
   ret <8 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ule_v8i16:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i16x8.le_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @compare_sext_ule_v8i16 (<8 x i16> %x, <8 x i16> %y) {
+  %cmp = icmp ule <8 x i16> %x, %y
+  %res = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %res
+}
+
 ; CHECK-LABEL: compare_sgt_v8i16:
 ; NO-SIMD128-NOT: i16x8
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -194,6 +386,18 @@ define <8 x i1> @compare_sgt_v8i16 (<8 x i16> %x, <8 x i16> %y) {
   ret <8 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_sgt_v8i16:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i16x8.gt_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @compare_sext_sgt_v8i16 (<8 x i16> %x, <8 x i16> %y) {
+  %cmp = icmp sgt <8 x i16> %x, %y
+  %res = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %res
+}
+
 ; CHECK-LABEL: compare_ugt_v8i16:
 ; NO-SIMD128-NOT: i16x8
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -205,6 +409,18 @@ define <8 x i1> @compare_ugt_v8i16 (<8 x i16> %x, <8 x i16> %y) {
   ret <8 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ugt_v8i16:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i16x8.gt_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @compare_sext_ugt_v8i16 (<8 x i16> %x, <8 x i16> %y) {
+  %cmp = icmp ugt <8 x i16> %x, %y
+  %res = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %res
+}
+
 ; CHECK-LABEL: compare_sge_v8i16:
 ; NO-SIMD128-NOT: i16x8
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -216,6 +432,18 @@ define <8 x i1> @compare_sge_v8i16 (<8 x i16> %x, <8 x i16> %y) {
   ret <8 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_sge_v8i16:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i16x8.ge_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @compare_sext_sge_v8i16 (<8 x i16> %x, <8 x i16> %y) {
+  %cmp = icmp sge <8 x i16> %x, %y
+  %res = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %res
+}
+
 ; CHECK-LABEL: compare_uge_v8i16:
 ; NO-SIMD128-NOT: i16x8
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -227,6 +455,18 @@ define <8 x i1> @compare_uge_v8i16 (<8 x i16> %x, <8 x i16> %y) {
   ret <8 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_uge_v8i16:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i16x8.ge_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @compare_sext_uge_v8i16 (<8 x i16> %x, <8 x i16> %y) {
+  %cmp = icmp uge <8 x i16> %x, %y
+  %res = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %res
+}
+
 ; CHECK-LABEL: compare_eq_v4i32:
 ; NO-SIMD128-NOT: i32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -238,6 +478,18 @@ define <4 x i1> @compare_eq_v4i32 (<4 x i32> %x, <4 x i32> %y) {
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_eq_v4i32:
+; NO-SIMD128-NOT: i32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32x4.eq $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_eq_v4i32 (<4 x i32> %x, <4 x i32> %y) {
+  %cmp = icmp eq <4 x i32> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_ne_v4i32:
 ; NO-SIMD128-NOT: i32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -249,6 +501,18 @@ define <4 x i1> @compare_ne_v4i32 (<4 x i32> %x, <4 x i32> %y) {
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ne_v4i32:
+; NO-SIMD128-NOT: i32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32x4.ne $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_ne_v4i32 (<4 x i32> %x, <4 x i32> %y) {
+  %cmp = icmp ne <4 x i32> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_slt_v4i32:
 ; NO-SIMD128-NOT: i32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -260,6 +524,18 @@ define <4 x i1> @compare_slt_v4i32 (<4 x i32> %x, <4 x i32> %y) {
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_slt_v4i32:
+; NO-SIMD128-NOT: i32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32x4.lt_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_slt_v4i32 (<4 x i32> %x, <4 x i32> %y) {
+  %cmp = icmp slt <4 x i32> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_ult_v4i32:
 ; NO-SIMD128-NOT: i32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -271,6 +547,18 @@ define <4 x i1> @compare_ult_v4i32 (<4 x i32> %x, <4 x i32> %y) {
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ult_v4i32:
+; NO-SIMD128-NOT: i32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32x4.lt_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_ult_v4i32 (<4 x i32> %x, <4 x i32> %y) {
+  %cmp = icmp ult <4 x i32> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_sle_v4i32:
 ; NO-SIMD128-NOT: i32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -282,6 +570,18 @@ define <4 x i1> @compare_sle_v4i32 (<4 x i32> %x, <4 x i32> %y) {
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_sle_v4i32:
+; NO-SIMD128-NOT: i32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32x4.le_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_sle_v4i32 (<4 x i32> %x, <4 x i32> %y) {
+  %cmp = icmp sle <4 x i32> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_ule_v4i32:
 ; NO-SIMD128-NOT: i32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -293,6 +593,18 @@ define <4 x i1> @compare_ule_v4i32 (<4 x i32> %x, <4 x i32> %y) {
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ule_v4i32:
+; NO-SIMD128-NOT: i32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32x4.le_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_ule_v4i32 (<4 x i32> %x, <4 x i32> %y) {
+  %cmp = icmp ule <4 x i32> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_sgt_v4i32:
 ; NO-SIMD128-NOT: i32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -304,6 +616,18 @@ define <4 x i1> @compare_sgt_v4i32 (<4 x i32> %x, <4 x i32> %y) {
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_sgt_v4i32:
+; NO-SIMD128-NOT: i32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32x4.gt_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_sgt_v4i32 (<4 x i32> %x, <4 x i32> %y) {
+  %cmp = icmp sgt <4 x i32> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_ugt_v4i32:
 ; NO-SIMD128-NOT: i32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -315,6 +639,18 @@ define <4 x i1> @compare_ugt_v4i32 (<4 x i32> %x, <4 x i32> %y) {
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ugt_v4i32:
+; NO-SIMD128-NOT: i32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32x4.gt_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_ugt_v4i32 (<4 x i32> %x, <4 x i32> %y) {
+  %cmp = icmp ugt <4 x i32> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_sge_v4i32:
 ; NO-SIMD128-NOT: i32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -326,6 +662,18 @@ define <4 x i1> @compare_sge_v4i32 (<4 x i32> %x, <4 x i32> %y) {
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_sge_v4i32:
+; NO-SIMD128-NOT: i32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32x4.ge_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_sge_v4i32 (<4 x i32> %x, <4 x i32> %y) {
+  %cmp = icmp sge <4 x i32> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_uge_v4i32:
 ; NO-SIMD128-NOT: i32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -337,6 +685,18 @@ define <4 x i1> @compare_uge_v4i32 (<4 x i32> %x, <4 x i32> %y) {
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_uge_v4i32:
+; NO-SIMD128-NOT: i32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32x4.ge_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_uge_v4i32 (<4 x i32> %x, <4 x i32> %y) {
+  %cmp = icmp uge <4 x i32> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_oeq_v4f32:
 ; NO-SIMD128-NOT: f32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -348,6 +708,18 @@ define <4 x i1> @compare_oeq_v4f32 (<4 x float> %x, <4 x float> %y) {
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_oeq_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32x4.eq $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_oeq_v4f32 (<4 x float> %x, <4 x float> %y) {
+  %cmp = fcmp oeq <4 x float> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_ogt_v4f32:
 ; NO-SIMD128-NOT: f32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -359,6 +731,18 @@ define <4 x i1> @compare_ogt_v4f32 (<4 x float> %x, <4 x float> %y) {
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ogt_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32x4.gt $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_ogt_v4f32 (<4 x float> %x, <4 x float> %y) {
+  %cmp = fcmp ogt <4 x float> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_oge_v4f32:
 ; NO-SIMD128-NOT: f32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -370,6 +754,18 @@ define <4 x i1> @compare_oge_v4f32 (<4 x float> %x, <4 x float> %y) {
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_oge_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32x4.ge $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_oge_v4f32 (<4 x float> %x, <4 x float> %y) {
+  %cmp = fcmp oge <4 x float> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_olt_v4f32:
 ; NO-SIMD128-NOT: f32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -381,6 +777,18 @@ define <4 x i1> @compare_olt_v4f32 (<4 x float> %x, <4 x float> %y) {
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_olt_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32x4.lt $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_olt_v4f32 (<4 x float> %x, <4 x float> %y) {
+  %cmp = fcmp olt <4 x float> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_ole_v4f32:
 ; NO-SIMD128-NOT: f32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -392,97 +800,257 @@ define <4 x i1> @compare_ole_v4f32 (<4 x float> %x, <4 x float> %y) {
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ole_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32x4.le $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_ole_v4f32 (<4 x float> %x, <4 x float> %y) {
+  %cmp = fcmp ole <4 x float> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_one_v4f32:
 ; NO-SIMD128-NOT: f32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: f32x4.ne
+; SIMD128-NEXT: f32x4.ne $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: f32x4.eq $push[[T1:[0-9]+]]=, $0, $0{{$}}
+; SIMD128-NEXT: f32x4.eq $push[[T2:[0-9]+]]=, $1, $1{{$}}
+; SIMD128-NEXT: v128.and $push[[T3:[0-9]+]]=, $pop[[T1]], $pop[[T2]]{{$}}
+; SIMD128-NEXT: v128.and $push[[R:[0-9]+]]=, $pop[[T0]], $pop[[T3]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i1> @compare_one_v4f32 (<4 x float> %x, <4 x float> %y) {
   %res = fcmp one <4 x float> %x, %y
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_one_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32x4.ne $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: f32x4.eq $push[[T1:[0-9]+]]=, $0, $0{{$}}
+; SIMD128-NEXT: f32x4.eq $push[[T2:[0-9]+]]=, $1, $1{{$}}
+; SIMD128-NEXT: v128.and $push[[T3:[0-9]+]]=, $pop[[T1]], $pop[[T2]]{{$}}
+; SIMD128-NEXT: v128.and $push[[R:[0-9]+]]=, $pop[[T0]], $pop[[T3]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_one_v4f32 (<4 x float> %x, <4 x float> %y) {
+  %cmp = fcmp one <4 x float> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_ord_v4f32:
 ; NO-SIMD128-NOT: f32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: f32x4.eq
+; SIMD128-NEXT: f32x4.eq $push[[T0:[0-9]+]]=, $0, $0{{$}}
+; SIMD128-NEXT: f32x4.eq $push[[T1:[0-9]+]]=, $1, $1{{$}}
+; SIMD128-NEXT: v128.and $push[[R:[0-9]+]]=, $pop[[T0]], $pop[[T1]]{{$}}
+; SIMD128-NEXT: return   $pop[[R]]{{$}}
 define <4 x i1> @compare_ord_v4f32 (<4 x float> %x, <4 x float> %y) {
   %res = fcmp ord <4 x float> %x, %y
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ord_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32x4.eq $push[[T0:[0-9]+]]=, $0, $0{{$}}
+; SIMD128-NEXT: f32x4.eq $push[[T1:[0-9]+]]=, $1, $1{{$}}
+; SIMD128-NEXT: v128.and $push[[R:[0-9]+]]=, $pop[[T0]], $pop[[T1]]{{$}}
+; SIMD128-NEXT: return   $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_ord_v4f32 (<4 x float> %x, <4 x float> %y) {
+  %cmp = fcmp ord <4 x float> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_ueq_v4f32:
 ; NO-SIMD128-NOT: f32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: f32x4.eq
+; SIMD128-NEXT: f32x4.eq $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: f32x4.ne $push[[T1:[0-9]+]]=, $0, $0{{$}}
+; SIMD128-NEXT: f32x4.ne $push[[T2:[0-9]+]]=, $1, $1{{$}}
+; SIMD128-NEXT: v128.or $push[[T3:[0-9]+]]=, $pop[[T1]], $pop[[T2]]{{$}}
+; SIMD128-NEXT: v128.or $push[[R:[0-9]+]]=, $pop[[T0]], $pop[[T3]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i1> @compare_ueq_v4f32 (<4 x float> %x, <4 x float> %y) {
   %res = fcmp ueq <4 x float> %x, %y
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ueq_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32x4.eq $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: f32x4.ne $push[[T1:[0-9]+]]=, $0, $0{{$}}
+; SIMD128-NEXT: f32x4.ne $push[[T2:[0-9]+]]=, $1, $1{{$}}
+; SIMD128-NEXT: v128.or $push[[T3:[0-9]+]]=, $pop[[T1]], $pop[[T2]]{{$}}
+; SIMD128-NEXT: v128.or $push[[R:[0-9]+]]=, $pop[[T0]], $pop[[T3]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]
+define <4 x i32> @compare_sext_ueq_v4f32 (<4 x float> %x, <4 x float> %y) {
+  %cmp = fcmp ueq <4 x float> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_ugt_v4f32:
 ; NO-SIMD128-NOT: f32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: f32x4.le
+; SIMD128-NEXT: f32x4.le $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i1> @compare_ugt_v4f32 (<4 x float> %x, <4 x float> %y) {
   %res = fcmp ugt <4 x float> %x, %y
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ugt_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32x4.le $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_ugt_v4f32 (<4 x float> %x, <4 x float> %y) {
+  %cmp = fcmp ugt <4 x float> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_uge_v4f32:
 ; NO-SIMD128-NOT: f32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: f32x4.lt
+; SIMD128-NEXT: f32x4.lt $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i1> @compare_uge_v4f32 (<4 x float> %x, <4 x float> %y) {
   %res = fcmp uge <4 x float> %x, %y
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_uge_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32x4.lt $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_uge_v4f32 (<4 x float> %x, <4 x float> %y) {
+  %cmp = fcmp uge <4 x float> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_ult_v4f32:
 ; NO-SIMD128-NOT: f32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: f32x4.ge
+; SIMD128-NEXT: f32x4.ge $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i1> @compare_ult_v4f32 (<4 x float> %x, <4 x float> %y) {
   %res = fcmp ult <4 x float> %x, %y
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ult_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32x4.ge $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_ult_v4f32 (<4 x float> %x, <4 x float> %y) {
+  %cmp = fcmp ult <4 x float> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_ule_v4f32:
 ; NO-SIMD128-NOT: f32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: f32x4.gt
+; SIMD128-NEXT: f32x4.gt $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i1> @compare_ule_v4f32 (<4 x float> %x, <4 x float> %y) {
   %res = fcmp ule <4 x float> %x, %y
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ule_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32x4.gt $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_ule_v4f32 (<4 x float> %x, <4 x float> %y) {
+  %cmp = fcmp ule <4 x float> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_une_v4f32:
 ; NO-SIMD128-NOT: f32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: f32x4.ne $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: f32x4.ne $push[[R:[0-9]+]]=, $0, $1{{$}}{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i1> @compare_une_v4f32 (<4 x float> %x, <4 x float> %y) {
   %res = fcmp une <4 x float> %x, %y
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_une_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32x4.ne $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_une_v4f32 (<4 x float> %x, <4 x float> %y) {
+  %cmp = fcmp une <4 x float> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_uno_v4f32:
 ; NO-SIMD128-NOT: f32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: f32x4.ne
+; SIMD128-NEXT: f32x4.ne $push[[T0:[0-9]+]]=, $0, $0{{$}}
+; SIMD128-NEXT: f32x4.ne $push[[T1:[0-9]+]]=, $1, $1{{$}}
+; SIMD128-NEXT: v128.or $push[[R:[0-9]+]]=, $pop[[T0]], $pop[[T1]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i1> @compare_uno_v4f32 (<4 x float> %x, <4 x float> %y) {
   %res = fcmp uno <4 x float> %x, %y
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_uno_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32x4.ne $push[[T0:[0-9]+]]=, $0, $0{{$}}
+; SIMD128-NEXT: f32x4.ne $push[[T1:[0-9]+]]=, $1, $1{{$}}
+; SIMD128-NEXT: v128.or $push[[R:[0-9]+]]=, $pop[[T0]], $pop[[T1]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_uno_v4f32 (<4 x float> %x, <4 x float> %y) {
+  %cmp = fcmp uno <4 x float> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_oeq_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
@@ -495,6 +1063,19 @@ define <2 x i1> @compare_oeq_v2f64 (<2 x double> %x, <2 x double> %y) {
   ret <2 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_oeq_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-VM-NOT: f64x2
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64x2.eq $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @compare_sext_oeq_v2f64 (<2 x double> %x, <2 x double> %y) {
+  %cmp = fcmp oeq <2 x double> %x, %y
+  %res = sext <2 x i1> %cmp to <2 x i64>
+  ret <2 x i64> %res
+}
+
 ; CHECK-LABEL: compare_ogt_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
@@ -507,6 +1088,19 @@ define <2 x i1> @compare_ogt_v2f64 (<2 x double> %x, <2 x double> %y) {
   ret <2 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ogt_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-VM-NOT: f64x2
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64x2.gt $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @compare_sext_ogt_v2f64 (<2 x double> %x, <2 x double> %y) {
+  %cmp = fcmp ogt <2 x double> %x, %y
+  %res = sext <2 x i1> %cmp to <2 x i64>
+  ret <2 x i64> %res
+}
+
 ; CHECK-LABEL: compare_oge_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
@@ -519,6 +1113,19 @@ define <2 x i1> @compare_oge_v2f64 (<2 x double> %x, <2 x double> %y) {
   ret <2 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_oge_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-VM-NOT: f64x2
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64x2.ge $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @compare_sext_oge_v2f64 (<2 x double> %x, <2 x double> %y) {
+  %cmp = fcmp oge <2 x double> %x, %y
+  %res = sext <2 x i1> %cmp to <2 x i64>
+  ret <2 x i64> %res
+}
+
 ; CHECK-LABEL: compare_olt_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
@@ -531,6 +1138,19 @@ define <2 x i1> @compare_olt_v2f64 (<2 x double> %x, <2 x double> %y) {
   ret <2 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_olt_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-VM-NOT: f64x2
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64x2.lt $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @compare_sext_olt_v2f64 (<2 x double> %x, <2 x double> %y) {
+  %cmp = fcmp olt <2 x double> %x, %y
+  %res = sext <2 x i1> %cmp to <2 x i64>
+  ret <2 x i64> %res
+}
+
 ; CHECK-LABEL: compare_ole_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
@@ -543,83 +1163,222 @@ define <2 x i1> @compare_ole_v2f64 (<2 x double> %x, <2 x double> %y) {
   ret <2 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ole_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-VM-NOT: f64x2
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64x2.le $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @compare_sext_ole_v2f64 (<2 x double> %x, <2 x double> %y) {
+  %cmp = fcmp ole <2 x double> %x, %y
+  %res = sext <2 x i1> %cmp to <2 x i64>
+  ret <2 x i64> %res
+}
+
 ; CHECK-LABEL: compare_one_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
 ; SIMD128-NEXT: .param v128, v128{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: f64x2.ne
+; SIMD128-NEXT: f64x2.ne $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: f64x2.eq $push[[T1:[0-9]+]]=, $0, $0{{$}}
+; SIMD128-NEXT: f64x2.eq $push[[T2:[0-9]+]]=, $1, $1{{$}}
+; SIMD128-NEXT: v128.and $push[[T3:[0-9]+]]=, $pop[[T1]], $pop[[T2]]{{$}}
+; SIMD128-NEXT: v128.and $push[[R:[0-9]+]]=, $pop[[T0]], $pop[[T3]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i1> @compare_one_v2f64 (<2 x double> %x, <2 x double> %y) {
   %res = fcmp one <2 x double> %x, %y
   ret <2 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_one_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-VM-NOT: f64x2
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64x2.ne $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: f64x2.eq $push[[T1:[0-9]+]]=, $0, $0{{$}}
+; SIMD128-NEXT: f64x2.eq $push[[T2:[0-9]+]]=, $1, $1{{$}}
+; SIMD128-NEXT: v128.and $push[[T3:[0-9]+]]=, $pop[[T1]], $pop[[T2]]{{$}}
+; SIMD128-NEXT: v128.and $push[[R:[0-9]+]]=, $pop[[T0]], $pop[[T3]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @compare_sext_one_v2f64 (<2 x double> %x, <2 x double> %y) {
+  %cmp = fcmp one <2 x double> %x, %y
+  %res = sext <2 x i1> %cmp to <2 x i64>
+  ret <2 x i64> %res
+}
+
 ; CHECK-LABEL: compare_ord_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
 ; SIMD128-NEXT: .param v128, v128{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: f64x2.eq
+; SIMD128-NEXT: f64x2.eq $push[[T0:[0-9]+]]=, $0, $0{{$}}
+; SIMD128-NEXT: f64x2.eq $push[[T1:[0-9]+]]=, $1, $1{{$}}
+; SIMD128-NEXT: v128.and $push[[R:[0-9]+]]=, $pop[[T0]], $pop[[T1]]{{$}}
+; SIMD128-NEXT: return   $pop[[R]]{{$}}
 define <2 x i1> @compare_ord_v2f64 (<2 x double> %x, <2 x double> %y) {
   %res = fcmp ord <2 x double> %x, %y
   ret <2 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ord_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-VM-NOT: f64x2
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64x2.eq $push[[T0:[0-9]+]]=, $0, $0{{$}}
+; SIMD128-NEXT: f64x2.eq $push[[T1:[0-9]+]]=, $1, $1{{$}}
+; SIMD128-NEXT: v128.and $push[[R:[0-9]+]]=, $pop[[T0]], $pop[[T1]]{{$}}
+; SIMD128-NEXT: return   $pop[[R]]{{$}}
+define <2 x i64> @compare_sext_ord_v2f64 (<2 x double> %x, <2 x double> %y) {
+  %cmp = fcmp ord <2 x double> %x, %y
+  %res = sext <2 x i1> %cmp to <2 x i64>
+  ret <2 x i64> %res
+}
+
 ; CHECK-LABEL: compare_ueq_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
 ; SIMD128-NEXT: .param v128, v128{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: f64x2.eq
+; SIMD128-NEXT: f64x2.eq $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: f64x2.ne $push[[T1:[0-9]+]]=, $0, $0{{$}}
+; SIMD128-NEXT: f64x2.ne $push[[T2:[0-9]+]]=, $1, $1{{$}}
+; SIMD128-NEXT: v128.or $push[[T3:[0-9]+]]=, $pop[[T1]], $pop[[T2]]{{$}}
+; SIMD128-NEXT: v128.or $push[[R:[0-9]+]]=, $pop[[T0]], $pop[[T3]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i1> @compare_ueq_v2f64 (<2 x double> %x, <2 x double> %y) {
   %res = fcmp ueq <2 x double> %x, %y
   ret <2 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ueq_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-VM-NOT: f64x2
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64x2.eq $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: f64x2.ne $push[[T1:[0-9]+]]=, $0, $0{{$}}
+; SIMD128-NEXT: f64x2.ne $push[[T2:[0-9]+]]=, $1, $1{{$}}
+; SIMD128-NEXT: v128.or $push[[T3:[0-9]+]]=, $pop[[T1]], $pop[[T2]]{{$}}
+; SIMD128-NEXT: v128.or $push[[R:[0-9]+]]=, $pop[[T0]], $pop[[T3]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @compare_sext_ueq_v2f64 (<2 x double> %x, <2 x double> %y) {
+  %cmp = fcmp ueq <2 x double> %x, %y
+  %res = sext <2 x i1> %cmp to <2 x i64>
+  ret <2 x i64> %res
+}
+
 ; CHECK-LABEL: compare_ugt_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
 ; SIMD128-NEXT: .param v128, v128{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: f64x2.le
+; SIMD128-NEXT: f64x2.le $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i1> @compare_ugt_v2f64 (<2 x double> %x, <2 x double> %y) {
   %res = fcmp ugt <2 x double> %x, %y
   ret <2 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ugt_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-VM-NOT: f64x2
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64x2.le $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @compare_sext_ugt_v2f64 (<2 x double> %x, <2 x double> %y) {
+  %cmp = fcmp ugt <2 x double> %x, %y
+  %res = sext <2 x i1> %cmp to <2 x i64>
+  ret <2 x i64> %res
+}
+
 ; CHECK-LABEL: compare_uge_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
 ; SIMD128-NEXT: .param v128, v128{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: f64x2.lt
+; SIMD128-NEXT: f64x2.lt $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i1> @compare_uge_v2f64 (<2 x double> %x, <2 x double> %y) {
   %res = fcmp uge <2 x double> %x, %y
   ret <2 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_uge_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-VM-NOT: f64x2
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64x2.lt $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @compare_sext_uge_v2f64 (<2 x double> %x, <2 x double> %y) {
+  %cmp = fcmp uge <2 x double> %x, %y
+  %res = sext <2 x i1> %cmp to <2 x i64>
+  ret <2 x i64> %res
+}
+
 ; CHECK-LABEL: compare_ult_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
 ; SIMD128-NEXT: .param v128, v128{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: f64x2.ge
+; SIMD128-NEXT: f64x2.ge $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i1> @compare_ult_v2f64 (<2 x double> %x, <2 x double> %y) {
   %res = fcmp ult <2 x double> %x, %y
   ret <2 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ult_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-VM-NOT: f64x2
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64x2.ge $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @compare_sext_ult_v2f64 (<2 x double> %x, <2 x double> %y) {
+  %cmp = fcmp ult <2 x double> %x, %y
+  %res = sext <2 x i1> %cmp to <2 x i64>
+  ret <2 x i64> %res
+}
+
 ; CHECK-LABEL: compare_ule_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
 ; SIMD128-NEXT: .param v128, v128{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: f64x2.gt
+; SIMD128-NEXT: f64x2.gt $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i1> @compare_ule_v2f64 (<2 x double> %x, <2 x double> %y) {
   %res = fcmp ule <2 x double> %x, %y
   ret <2 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ule_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-VM-NOT: f64x2
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64x2.gt $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @compare_sext_ule_v2f64 (<2 x double> %x, <2 x double> %y) {
+  %cmp = fcmp ule <2 x double> %x, %y
+  %res = sext <2 x i1> %cmp to <2 x i64>
+  ret <2 x i64> %res
+}
+
 ; CHECK-LABEL: compare_une_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
@@ -632,13 +1391,44 @@ define <2 x i1> @compare_une_v2f64 (<2 x double> %x, <2 x double> %y) {
   ret <2 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_une_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-VM-NOT: f64x2
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64x2.ne $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @compare_sext_une_v2f64 (<2 x double> %x, <2 x double> %y) {
+  %cmp = fcmp une <2 x double> %x, %y
+  %res = sext <2 x i1> %cmp to <2 x i64>
+  ret <2 x i64> %res
+}
+
 ; CHECK-LABEL: compare_uno_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
 ; SIMD128-NEXT: .param v128, v128{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: f64x2.ne
+; SIMD128-NEXT: f64x2.ne $push[[T0:[0-9]+]]=, $0, $0{{$}}
+; SIMD128-NEXT: f64x2.ne $push[[T1:[0-9]+]]=, $1, $1{{$}}
+; SIMD128-NEXT: v128.or $push[[R:[0-9]+]]=, $pop[[T0]], $pop[[T1]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i1> @compare_uno_v2f64 (<2 x double> %x, <2 x double> %y) {
   %res = fcmp uno <2 x double> %x, %y
   ret <2 x i1> %res
 }
+
+; CHECK-LABEL: compare_sext_uno_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-VM-NOT: f64x2
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64x2.ne $push[[T0:[0-9]+]]=, $0, $0{{$}}
+; SIMD128-NEXT: f64x2.ne $push[[T1:[0-9]+]]=, $1, $1{{$}}
+; SIMD128-NEXT: v128.or $push[[R:[0-9]+]]=, $pop[[T0]], $pop[[T1]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @compare_sext_uno_v2f64 (<2 x double> %x, <2 x double> %y) {
+  %cmp = fcmp uno <2 x double> %x, %y
+  %res = sext <2 x i1> %cmp to <2 x i64>
+  ret <2 x i64> %res
+}
diff --git a/test/CodeGen/WebAssembly/simd-sext-inreg.ll b/test/CodeGen/WebAssembly/simd-sext-inreg.ll
new file mode 100644
index 00000000000..1001d0db168
--- /dev/null
+++ b/test/CodeGen/WebAssembly/simd-sext-inreg.ll
@@ -0,0 +1,65 @@
+; RUN: llc < %s -asm-verbose=false -wasm-keep-registers -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-enable-unimplemented-simd -mattr=+simd128,+sign-ext | FileCheck %s --check-prefixes CHECK,SIMD128
+; RUN: llc < %s -asm-verbose=false -wasm-keep-registers -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -mattr=+simd128,+sign-ext | FileCheck %s --check-prefixes CHECK,SIMD128-VM
+; RUN: llc < %s -asm-verbose=false -wasm-keep-registers -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -mattr=-simd128,+sign-ext | FileCheck %s --check-prefixes CHECK,NO-SIMD128
+
+; Test that vector sign extensions lower to shifts
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+; CHECK-LABEL: sext_inreg_v16i8:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32.const $push[[T0:[0-9]+]]=, 7{{$}}
+; SIMD128-NEXT: i8x16.shl $push[[T1:[0-9]+]]=, $0, $pop[[T0]]{{$}}
+; SIMD128-NEXT: i32.const $push[[T2:[0-9]+]]=, 7{{$}}
+; SIMD128-NEXT: i8x16.shr_s $push[[R:[0-9]+]]=, $pop[[T1]], $pop[[T2]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @sext_inreg_v16i8(<16 x i1> %x) {
+  %res = sext <16 x i1> %x to <16 x i8>
+  ret <16 x i8> %res
+}
+
+; CHECK-LABEL: sext_inreg_v8i16:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32.const $push[[T0:[0-9]+]]=, 15{{$}}
+; SIMD128-NEXT: i16x8.shl $push[[T1:[0-9]+]]=, $0, $pop[[T0]]{{$}}
+; SIMD128-NEXT: i32.const $push[[T2:[0-9]+]]=, 15{{$}}
+; SIMD128-NEXT: i16x8.shr_s $push[[R:[0-9]+]]=, $pop[[T1]], $pop[[T2]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @sext_inreg_v8i16(<8 x i1> %x) {
+  %res = sext <8 x i1> %x to <8 x i16>
+  ret <8 x i16> %res
+}
+
+; CHECK-LABEL: sext_inreg_v4i32:
+; NO-SIMD128-NOT: i32x4
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32.const $push[[T0:[0-9]+]]=, 31{{$}}
+; SIMD128-NEXT: i32x4.shl $push[[T1:[0-9]+]]=, $0, $pop[[T0]]{{$}}
+; SIMD128-NEXT: i32.const $push[[T2:[0-9]+]]=, 31{{$}}
+; SIMD128-NEXT: i32x4.shr_s $push[[R:[0-9]+]]=, $pop[[T1]], $pop[[T2]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @sext_inreg_v4i32(<4 x i1> %x) {
+  %res = sext <4 x i1> %x to <4 x i32>
+  ret <4 x i32> %res
+}
+
+; CHECK-LABEL: sext_inreg_v2i64:
+; NO-SIMD128-NOT: i64x2
+; SDIM128-VM-NOT: i64x2
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32.const $push[[T0:[0-9]+]]=, 63{{$}}
+; SIMD128-NEXT: i64x2.shl $push[[T1:[0-9]+]]=, $0, $pop[[T0]]{{$}}
+; SIMD128-NEXT: i32.const $push[[T2:[0-9]+]]=, 63{{$}}
+; SIMD128-NEXT: i64x2.shr_s $push[[R:[0-9]+]]=, $pop[[T1]], $pop[[T2]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @sext_inreg_v2i64(<2 x i1> %x) {
+  %res = sext <2 x i1> %x to <2 x i64>
+  ret <2 x i64> %res
+}
-- 
GitLab


From 616537d9aa742af7fa934fb29cc6fcb3b88b6a44 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Sat, 20 Oct 2018 03:51:43 +0000
Subject: [PATCH 0341/1116] [X86] Add additional CPUs and features to Host.cpp
 and X86TargetParser.def to match compiler-rt and enable
 __builtin_cpu_supports/__builtin_cpu_is support in clang

Summary: This matches LLVM to D53461 for compiler-rt.

Reviewers: echristo, erichkeane

Reviewed By: echristo

Subscribers: dberris, llvm-commits

Differential Revision: https://reviews.llvm.org/D53462

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344831 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Support/X86TargetParser.def |  45 +++++----
 lib/Support/Host.cpp                     | 123 ++++++++++++++---------
 2 files changed, 99 insertions(+), 69 deletions(-)

diff --git a/include/llvm/Support/X86TargetParser.def b/include/llvm/Support/X86TargetParser.def
index e4af0657a35..eb45ed6a76a 100644
--- a/include/llvm/Support/X86TargetParser.def
+++ b/include/llvm/Support/X86TargetParser.def
@@ -34,17 +34,20 @@ X86_VENDOR(VENDOR_AMD,   "amd")
 #ifndef X86_CPU_TYPE
 #define X86_CPU_TYPE(ARCHNAME, ENUM)
 #endif
-X86_CPU_TYPE_COMPAT_WITH_ALIAS("bonnell",    INTEL_BONNELL,    "bonnell", "atom")
-X86_CPU_TYPE_COMPAT           ("core2",      INTEL_CORE2,      "core2")
-X86_CPU_TYPE_COMPAT           ("nehalem",    INTEL_COREI7,     "corei7")
-X86_CPU_TYPE_COMPAT_WITH_ALIAS("amdfam10",   AMDFAM10H,        "amdfam10h", "amdfam10")
-X86_CPU_TYPE_COMPAT_WITH_ALIAS("bdver1",     AMDFAM15H,        "amdfam15h", "amdfam15")
-X86_CPU_TYPE_COMPAT_WITH_ALIAS("silvermont", INTEL_SILVERMONT, "silvermont", "slm")
-X86_CPU_TYPE_COMPAT           ("knl",        INTEL_KNL,        "knl")
-X86_CPU_TYPE_COMPAT           ("btver1",     AMD_BTVER1,       "btver1")
-X86_CPU_TYPE_COMPAT           ("btver2",     AMD_BTVER2,       "btver2")
-X86_CPU_TYPE_COMPAT           ("znver1",     AMDFAM17H,        "amdfam17h")
-X86_CPU_TYPE_COMPAT           ("knm",        INTEL_KNM,        "knm")
+X86_CPU_TYPE_COMPAT_WITH_ALIAS("bonnell",       INTEL_BONNELL,       "bonnell", "atom")
+X86_CPU_TYPE_COMPAT           ("core2",         INTEL_CORE2,         "core2")
+X86_CPU_TYPE_COMPAT           ("nehalem",       INTEL_COREI7,        "corei7")
+X86_CPU_TYPE_COMPAT_WITH_ALIAS("amdfam10",      AMDFAM10H,           "amdfam10h", "amdfam10")
+X86_CPU_TYPE_COMPAT_WITH_ALIAS("bdver1",        AMDFAM15H,           "amdfam15h", "amdfam15")
+X86_CPU_TYPE_COMPAT_WITH_ALIAS("silvermont",    INTEL_SILVERMONT,    "silvermont", "slm")
+X86_CPU_TYPE_COMPAT           ("knl",           INTEL_KNL,           "knl")
+X86_CPU_TYPE_COMPAT           ("btver1",        AMD_BTVER1,          "btver1")
+X86_CPU_TYPE_COMPAT           ("btver2",        AMD_BTVER2,          "btver2")
+X86_CPU_TYPE_COMPAT           ("znver1",        AMDFAM17H,           "amdfam17h")
+X86_CPU_TYPE_COMPAT           ("knm",           INTEL_KNM,           "knm")
+X86_CPU_TYPE_COMPAT           ("goldmont",      INTEL_GOLDMONT,      "goldmont")
+X86_CPU_TYPE_COMPAT           ("goldmont-plus", INTEL_GOLDMONT_PLUS, "goldmont-plus")
+X86_CPU_TYPE_COMPAT           ("tremont",       INTEL_TREMONT,       "tremont")
 // Entries below this are not in libgcc/compiler-rt.
 X86_CPU_TYPE                  ("i386",        INTEL_i386)
 X86_CPU_TYPE                  ("i486",        INTEL_i486)
@@ -64,9 +67,6 @@ X86_CPU_TYPE                  ("athlon",      AMD_ATHLON)
 X86_CPU_TYPE                  ("athlon-xp",   AMD_ATHLON_XP)
 X86_CPU_TYPE                  ("k8",          AMD_K8)
 X86_CPU_TYPE                  ("k8-sse3",     AMD_K8SSE3)
-X86_CPU_TYPE                  ("goldmont",    INTEL_GOLDMONT)
-X86_CPU_TYPE                  ("goldmont-plus", INTEL_GOLDMONT_PLUS)
-X86_CPU_TYPE                  ("tremont",     INTEL_TREMONT)
 #undef X86_CPU_TYPE_COMPAT_WITH_ALIAS
 #undef X86_CPU_TYPE_COMPAT
 #undef X86_CPU_TYPE
@@ -97,6 +97,8 @@ X86_CPU_SUBTYPE_COMPAT("broadwell",      INTEL_COREI7_BROADWELL,      "broadwell
 X86_CPU_SUBTYPE_COMPAT("skylake",        INTEL_COREI7_SKYLAKE,        "skylake")
 X86_CPU_SUBTYPE_COMPAT("skylake-avx512", INTEL_COREI7_SKYLAKE_AVX512, "skylake-avx512")
 X86_CPU_SUBTYPE_COMPAT("cannonlake",     INTEL_COREI7_CANNONLAKE,     "cannonlake")
+X86_CPU_SUBTYPE_COMPAT("icelake-client", INTEL_COREI7_ICELAKE_CLIENT, "icelake-client")
+X86_CPU_SUBTYPE_COMPAT("icelake-server", INTEL_COREI7_ICELAKE_SERVER, "icelake-server")
 // Entries below this are not in libgcc/compiler-rt.
 X86_CPU_SUBTYPE       ("core2",          INTEL_CORE2_65)
 X86_CPU_SUBTYPE       ("penryn",         INTEL_CORE2_45)
@@ -147,11 +149,16 @@ X86_FEATURE_COMPAT(27, FEATURE_AVX512IFMA,      "avx512ifma")
 X86_FEATURE_COMPAT(28, FEATURE_AVX5124VNNIW,    "avx5124vnniw")
 X86_FEATURE_COMPAT(29, FEATURE_AVX5124FMAPS,    "avx5124fmaps")
 X86_FEATURE_COMPAT(30, FEATURE_AVX512VPOPCNTDQ, "avx512vpopcntdq")
+X86_FEATURE_COMPAT(31, FEATURE_AVX512VBMI2,     "avx512vbmi2")
+X86_FEATURE_COMPAT(32, FEATURE_GFNI,            "gfni")
+X86_FEATURE_COMPAT(33, FEATURE_VPCLMULQDQ,      "vpclmulqdq")
+X86_FEATURE_COMPAT(34, FEATURE_AVX512VNNI,      "avx512vnni")
+X86_FEATURE_COMPAT(35, FEATURE_AVX512BITALG,    "avx512bitalg")
 // Features below here are not in libgcc/compiler-rt.
-X86_FEATURE       (32, FEATURE_MOVBE)
-X86_FEATURE       (33, FEATURE_ADX)
-X86_FEATURE       (34, FEATURE_EM64T)
-X86_FEATURE       (35, FEATURE_CLFLUSHOPT)
-X86_FEATURE       (36, FEATURE_SHA)
+X86_FEATURE       (64, FEATURE_MOVBE)
+X86_FEATURE       (65, FEATURE_ADX)
+X86_FEATURE       (66, FEATURE_EM64T)
+X86_FEATURE       (67, FEATURE_CLFLUSHOPT)
+X86_FEATURE       (68, FEATURE_SHA)
 #undef X86_FEATURE_COMPAT
 #undef X86_FEATURE
diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp
index 7de2a9e3fbb..ebf03cc176f 100644
--- a/lib/Support/Host.cpp
+++ b/lib/Support/Host.cpp
@@ -511,8 +511,8 @@ static void detectX86FamilyModel(unsigned EAX, unsigned *Family,
 static void
 getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
                                 unsigned Brand_id, unsigned Features,
-                                unsigned Features2, unsigned *Type,
-                                unsigned *Subtype) {
+                                unsigned Features2, unsigned Features3,
+                                unsigned *Type, unsigned *Subtype) {
   if (Brand_id != 0)
     return;
   switch (Family) {
@@ -696,8 +696,8 @@ getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
         break;
       }
 
-      if (Features2 & (1 << (X86::FEATURE_CLFLUSHOPT - 32))) {
-        if (Features2 & (1 << (X86::FEATURE_SHA - 32))) {
+      if (Features3 & (1 << (X86::FEATURE_CLFLUSHOPT - 64))) {
+        if (Features3 & (1 << (X86::FEATURE_SHA - 64))) {
           *Type = X86::INTEL_GOLDMONT;
         } else {
           *Type = X86::INTEL_COREI7;
@@ -705,7 +705,7 @@ getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
         }
         break;
       }
-      if (Features2 & (1 << (X86::FEATURE_ADX - 32))) {
+      if (Features3 & (1 << (X86::FEATURE_ADX - 64))) {
         *Type = X86::INTEL_COREI7;
         *Subtype = X86::INTEL_COREI7_BROADWELL;
         break;
@@ -721,7 +721,7 @@ getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
         break;
       }
       if (Features & (1 << X86::FEATURE_SSE4_2)) {
-        if (Features2 & (1 << (X86::FEATURE_MOVBE - 32))) {
+        if (Features3 & (1 << (X86::FEATURE_MOVBE - 64))) {
           *Type = X86::INTEL_SILVERMONT;
         } else {
           *Type = X86::INTEL_COREI7;
@@ -735,7 +735,7 @@ getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
         break;
       }
       if (Features & (1 << X86::FEATURE_SSSE3)) {
-        if (Features2 & (1 << (X86::FEATURE_MOVBE - 32))) {
+        if (Features3 & (1 << (X86::FEATURE_MOVBE - 64))) {
           *Type = X86::INTEL_BONNELL; // "bonnell"
         } else {
           *Type = X86::INTEL_CORE2; // "core2"
@@ -743,7 +743,7 @@ getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
         }
         break;
       }
-      if (Features2 & (1 << (X86::FEATURE_EM64T - 32))) {
+      if (Features3 & (1 << (X86::FEATURE_EM64T - 64))) {
         *Type = X86::INTEL_CORE2; // "core2"
         *Subtype = X86::INTEL_CORE2_65;
         break;
@@ -769,7 +769,7 @@ getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
     }
     break;
   case 15: {
-    if (Features2 & (1 << (X86::FEATURE_EM64T - 32))) {
+    if (Features3 & (1 << (X86::FEATURE_EM64T - 64))) {
       *Type = X86::INTEL_NOCONA;
       break;
     }
@@ -877,40 +877,51 @@ static void getAMDProcessorTypeAndSubtype(unsigned Family, unsigned Model,
 }
 
 static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf,
-                                 unsigned *FeaturesOut,
-                                 unsigned *Features2Out) {
+                                 unsigned *FeaturesOut, unsigned *Features2Out,
+                                 unsigned *Features3Out) {
   unsigned Features = 0;
   unsigned Features2 = 0;
+  unsigned Features3 = 0;
   unsigned EAX, EBX;
 
+#define setFeature(F)              \
+  do {                             \
+    if (F < 32)                    \
+      Features |= 1 << F;          \
+    else if (F < 64)               \
+      Features2 |= 1 << (F - 32);  \
+    else if (F < 96)               \
+      Features3 |= 1 << (F - 64);  \
+  } while (0)
+
   if ((EDX >> 15) & 1)
-    Features |= 1 << X86::FEATURE_CMOV;
+    setFeature(X86::FEATURE_CMOV);
   if ((EDX >> 23) & 1)
-    Features |= 1 << X86::FEATURE_MMX;
+    setFeature(X86::FEATURE_MMX);
   if ((EDX >> 25) & 1)
-    Features |= 1 << X86::FEATURE_SSE;
+    setFeature(X86::FEATURE_SSE);
   if ((EDX >> 26) & 1)
-    Features |= 1 << X86::FEATURE_SSE2;
+    setFeature(X86::FEATURE_SSE2);
 
   if ((ECX >> 0) & 1)
-    Features |= 1 << X86::FEATURE_SSE3;
+    setFeature(X86::FEATURE_SSE3);
   if ((ECX >> 1) & 1)
-    Features |= 1 << X86::FEATURE_PCLMUL;
+    setFeature(X86::FEATURE_PCLMUL);
   if ((ECX >> 9) & 1)
-    Features |= 1 << X86::FEATURE_SSSE3;
+    setFeature(X86::FEATURE_SSSE3);
   if ((ECX >> 12) & 1)
-    Features |= 1 << X86::FEATURE_FMA;
+    setFeature(X86::FEATURE_FMA);
   if ((ECX >> 19) & 1)
-    Features |= 1 << X86::FEATURE_SSE4_1;
+    setFeature(X86::FEATURE_SSE4_1);
   if ((ECX >> 20) & 1)
-    Features |= 1 << X86::FEATURE_SSE4_2;
+    setFeature(X86::FEATURE_SSE4_2);
   if ((ECX >> 23) & 1)
-    Features |= 1 << X86::FEATURE_POPCNT;
+    setFeature(X86::FEATURE_POPCNT);
   if ((ECX >> 25) & 1)
-    Features |= 1 << X86::FEATURE_AES;
+    setFeature(X86::FEATURE_AES);
 
   if ((ECX >> 22) & 1)
-    Features2 |= 1 << (X86::FEATURE_MOVBE - 32);
+    setFeature(X86::FEATURE_MOVBE);
 
   // If CPUID indicates support for XSAVE, XRESTORE and AVX, and XGETBV
   // indicates that the AVX registers will be saved and restored on context
@@ -921,49 +932,59 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf,
   bool HasAVX512Save = HasAVX && ((EAX & 0xe0) == 0xe0);
 
   if (HasAVX)
-    Features |= 1 << X86::FEATURE_AVX;
+    setFeature(X86::FEATURE_AVX);
 
   bool HasLeaf7 =
       MaxLeaf >= 0x7 && !getX86CpuIDAndInfoEx(0x7, 0x0, &EAX, &EBX, &ECX, &EDX);
 
   if (HasLeaf7 && ((EBX >> 3) & 1))
-    Features |= 1 << X86::FEATURE_BMI;
+    setFeature(X86::FEATURE_BMI);
   if (HasLeaf7 && ((EBX >> 5) & 1) && HasAVX)
-    Features |= 1 << X86::FEATURE_AVX2;
+    setFeature(X86::FEATURE_AVX2);
   if (HasLeaf7 && ((EBX >> 9) & 1))
-    Features |= 1 << X86::FEATURE_BMI2;
+    setFeature(X86::FEATURE_BMI2);
   if (HasLeaf7 && ((EBX >> 16) & 1) && HasAVX512Save)
-    Features |= 1 << X86::FEATURE_AVX512F;
+    setFeature(X86::FEATURE_AVX512F);
   if (HasLeaf7 && ((EBX >> 17) & 1) && HasAVX512Save)
-    Features |= 1 << X86::FEATURE_AVX512DQ;
+    setFeature(X86::FEATURE_AVX512DQ);
   if (HasLeaf7 && ((EBX >> 19) & 1))
-    Features2 |= 1 << (X86::FEATURE_ADX - 32);
+    setFeature(X86::FEATURE_ADX);
   if (HasLeaf7 && ((EBX >> 21) & 1) && HasAVX512Save)
-    Features |= 1 << X86::FEATURE_AVX512IFMA;
+    setFeature(X86::FEATURE_AVX512IFMA);
   if (HasLeaf7 && ((EBX >> 23) & 1))
-    Features2 |= 1 << (X86::FEATURE_CLFLUSHOPT - 32);
+    setFeature(X86::FEATURE_CLFLUSHOPT);
   if (HasLeaf7 && ((EBX >> 26) & 1) && HasAVX512Save)
-    Features |= 1 << X86::FEATURE_AVX512PF;
+    setFeature(X86::FEATURE_AVX512PF);
   if (HasLeaf7 && ((EBX >> 27) & 1) && HasAVX512Save)
-    Features |= 1 << X86::FEATURE_AVX512ER;
+    setFeature(X86::FEATURE_AVX512ER);
   if (HasLeaf7 && ((EBX >> 28) & 1) && HasAVX512Save)
-    Features |= 1 << X86::FEATURE_AVX512CD;
+    setFeature(X86::FEATURE_AVX512CD);
   if (HasLeaf7 && ((EBX >> 29) & 1))
-    Features2 |= 1 << (X86::FEATURE_SHA - 32);
+    setFeature(X86::FEATURE_SHA);
   if (HasLeaf7 && ((EBX >> 30) & 1) && HasAVX512Save)
-    Features |= 1 << X86::FEATURE_AVX512BW;
+    setFeature(X86::FEATURE_AVX512BW);
   if (HasLeaf7 && ((EBX >> 31) & 1) && HasAVX512Save)
-    Features |= 1 << X86::FEATURE_AVX512VL;
+    setFeature(X86::FEATURE_AVX512VL);
 
   if (HasLeaf7 && ((ECX >> 1) & 1) && HasAVX512Save)
-    Features |= 1 << X86::FEATURE_AVX512VBMI;
+    setFeature(X86::FEATURE_AVX512VBMI);
+  if (HasLeaf7 && ((ECX >> 6) & 1) && HasAVX512Save)
+    setFeature(X86::FEATURE_AVX512VBMI2);
+  if (HasLeaf7 && ((ECX >> 8) & 1))
+    setFeature(X86::FEATURE_GFNI);
+  if (HasLeaf7 && ((ECX >> 10) & 1) && HasAVX)
+    setFeature(X86::FEATURE_VPCLMULQDQ);
+  if (HasLeaf7 && ((ECX >> 11) & 1) && HasAVX512Save)
+    setFeature(X86::FEATURE_AVX512VNNI);
+  if (HasLeaf7 && ((ECX >> 12) & 1) && HasAVX512Save)
+    setFeature(X86::FEATURE_AVX512BITALG);
   if (HasLeaf7 && ((ECX >> 14) & 1) && HasAVX512Save)
-    Features |= 1 << X86::FEATURE_AVX512VPOPCNTDQ;
+    setFeature(X86::FEATURE_AVX512VPOPCNTDQ);
 
   if (HasLeaf7 && ((EDX >> 2) & 1) && HasAVX512Save)
-    Features |= 1 << X86::FEATURE_AVX5124VNNIW;
+    setFeature(X86::FEATURE_AVX5124VNNIW);
   if (HasLeaf7 && ((EDX >> 3) & 1) && HasAVX512Save)
-    Features |= 1 << X86::FEATURE_AVX5124FMAPS;
+    setFeature(X86::FEATURE_AVX5124FMAPS);
 
   unsigned MaxExtLevel;
   getX86CpuIDAndInfo(0x80000000, &MaxExtLevel, &EBX, &ECX, &EDX);
@@ -971,17 +992,19 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf,
   bool HasExtLeaf1 = MaxExtLevel >= 0x80000001 &&
                      !getX86CpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX);
   if (HasExtLeaf1 && ((ECX >> 6) & 1))
-    Features |= 1 << X86::FEATURE_SSE4_A;
+    setFeature(X86::FEATURE_SSE4_A);
   if (HasExtLeaf1 && ((ECX >> 11) & 1))
-    Features |= 1 << X86::FEATURE_XOP;
+    setFeature(X86::FEATURE_XOP);
   if (HasExtLeaf1 && ((ECX >> 16) & 1))
-    Features |= 1 << X86::FEATURE_FMA4;
+    setFeature(X86::FEATURE_FMA4);
 
   if (HasExtLeaf1 && ((EDX >> 29) & 1))
-    Features2 |= 1 << (X86::FEATURE_EM64T - 32);
+    setFeature(X86::FEATURE_EM64T);
 
   *FeaturesOut  = Features;
   *Features2Out = Features2;
+  *Features3Out = Features3;
+#undef setFeature
 }
 
 StringRef sys::getHostCPUName() {
@@ -1002,16 +1025,16 @@ StringRef sys::getHostCPUName() {
 
   unsigned Brand_id = EBX & 0xff;
   unsigned Family = 0, Model = 0;
-  unsigned Features = 0, Features2 = 0;
+  unsigned Features = 0, Features2 = 0, Features3 = 0;
   detectX86FamilyModel(EAX, &Family, &Model);
-  getAvailableFeatures(ECX, EDX, MaxLeaf, &Features, &Features2);
+  getAvailableFeatures(ECX, EDX, MaxLeaf, &Features, &Features2, &Features3);
 
   unsigned Type = 0;
   unsigned Subtype = 0;
 
   if (Vendor == SIG_INTEL) {
     getIntelProcessorTypeAndSubtype(Family, Model, Brand_id, Features,
-                                    Features2, &Type, &Subtype);
+                                    Features2, Features3, &Type, &Subtype);
   } else if (Vendor == SIG_AMD) {
     getAMDProcessorTypeAndSubtype(Family, Model, Features, &Type, &Subtype);
   }
-- 
GitLab


From b5d5e5ebf64f442e1d5d451986c6f86db1b3bf7e Mon Sep 17 00:00:00 2001
From: David Blaikie <dblaikie@gmail.com>
Date: Sat, 20 Oct 2018 06:02:15 +0000
Subject: [PATCH 0342/1116] DebugInfo: Use debug_addr for non-dwo addresses in
 DWARF 5

Putting addresses in the address pool, even with non-fission, can reduce
relocations - reusing the addresses from debug_info and debug_rnglists
(the latter coming soon)

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344834 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../DebugInfo/DWARF/DWARFDebugRangeList.h     |  3 +-
 .../llvm/DebugInfo/DWARF/DWARFDebugRnglists.h |  2 +-
 include/llvm/DebugInfo/DWARF/DWARFFormValue.h |  8 ++++++
 include/llvm/DebugInfo/DWARF/DWARFSection.h   |  5 ++++
 include/llvm/DebugInfo/DWARF/DWARFUnit.h      | 11 ++------
 lib/CodeGen/AsmPrinter/AddressPool.cpp        |  6 ++--
 lib/CodeGen/AsmPrinter/DIE.cpp                |  2 ++
 lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp   |  6 ++--
 lib/CodeGen/AsmPrinter/DwarfDebug.cpp         | 16 ++++++-----
 lib/CodeGen/AsmPrinter/DwarfUnit.cpp          |  6 ++--
 lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp   |  2 +-
 lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp    |  2 +-
 lib/DebugInfo/DWARF/DWARFDie.cpp              |  2 +-
 lib/DebugInfo/DWARF/DWARFFormValue.cpp        | 28 +++++++++++++------
 lib/DebugInfo/DWARF/DWARFUnit.cpp             | 23 +++++++--------
 test/DebugInfo/X86/debug_addr.ll              |  3 +-
 test/DebugInfo/X86/fission-ranges.ll          |  1 +
 17 files changed, 78 insertions(+), 48 deletions(-)

diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h b/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h
index ce7436d9faa..bc26edf0064 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h
@@ -18,7 +18,6 @@
 
 namespace llvm {
 
-struct BaseAddress;
 class raw_ostream;
 
 class DWARFDebugRangeList {
@@ -78,7 +77,7 @@ public:
   /// list. Has to be passed base address of the compile unit referencing this
   /// range list.
   DWARFAddressRangesVector
-  getAbsoluteRanges(llvm::Optional<BaseAddress> BaseAddr) const;
+  getAbsoluteRanges(llvm::Optional<SectionedAddress> BaseAddr) const;
 };
 
 } // end namespace llvm
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugRnglists.h b/include/llvm/DebugInfo/DWARF/DWARFDebugRnglists.h
index e2e8ab5ed21..0615bbf0308 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFDebugRnglists.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugRnglists.h
@@ -44,7 +44,7 @@ class DWARFDebugRnglist : public DWARFListType<RangeListEntry> {
 public:
   /// Build a DWARFAddressRangesVector from a rangelist.
   DWARFAddressRangesVector
-  getAbsoluteRanges(llvm::Optional<BaseAddress> BaseAddr) const;
+  getAbsoluteRanges(llvm::Optional<SectionedAddress> BaseAddr) const;
 };
 
 class DWARFDebugRnglistTable : public DWARFListTableBase<DWARFDebugRnglist> {
diff --git a/include/llvm/DebugInfo/DWARF/DWARFFormValue.h b/include/llvm/DebugInfo/DWARF/DWARFFormValue.h
index 1b5f71c946f..edf9442acd0 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFFormValue.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFFormValue.h
@@ -101,6 +101,7 @@ public:
   Optional<int64_t> getAsSignedConstant() const;
   Optional<const char *> getAsCString() const;
   Optional<uint64_t> getAsAddress() const;
+  Optional<SectionedAddress> getAsSectionedAddress() const;
   Optional<uint64_t> getAsSectionOffset() const;
   Optional<ArrayRef<uint8_t>> getAsBlock() const;
   Optional<uint64_t> getAsCStringOffset() const;
@@ -238,6 +239,13 @@ inline Optional<uint64_t> toAddress(const Optional<DWARFFormValue> &V) {
   return None;
 }
 
+inline Optional<SectionedAddress>
+toSectionedAddress(const Optional<DWARFFormValue> &V) {
+  if (V)
+    return V->getAsSectionedAddress();
+  return None;
+}
+
 /// Take an optional DWARFFormValue and extract a address.
 ///
 /// \param V and optional DWARFFormValue to attempt to extract the value from.
diff --git a/include/llvm/DebugInfo/DWARF/DWARFSection.h b/include/llvm/DebugInfo/DWARF/DWARFSection.h
index 77045f0794a..7f823596529 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFSection.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFSection.h
@@ -23,6 +23,11 @@ struct SectionName {
   bool IsNameUnique;
 };
 
+struct SectionedAddress {
+  uint64_t Address;
+  uint64_t SectionIndex;
+};
+
 } // end namespace llvm
 
 #endif // LLVM_DEBUGINFO_DWARF_DWARFSECTION_H
diff --git a/include/llvm/DebugInfo/DWARF/DWARFUnit.h b/include/llvm/DebugInfo/DWARF/DWARFUnit.h
index c267cf173d1..ae0e8cc8db1 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFUnit.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFUnit.h
@@ -160,11 +160,6 @@ private:
 };
 
 /// Represents base address of the CU.
-struct BaseAddress {
-  uint64_t Address;
-  uint64_t SectionIndex;
-};
-
 /// Represents a unit's contribution to the string offsets table.
 struct StrOffsetsContributionDescriptor {
   uint64_t Base = 0;
@@ -221,7 +216,7 @@ class DWARFUnit {
   Optional<DWARFDebugRnglistTable> RngListTable;
 
   mutable const DWARFAbbreviationDeclarationSet *Abbrevs;
-  llvm::Optional<BaseAddress> BaseAddr;
+  llvm::Optional<SectionedAddress> BaseAddr;
   /// The compile unit debug information entry items.
   std::vector<DWARFDebugInfoEntry> DieArray;
 
@@ -310,7 +305,7 @@ public:
     RangeSectionBase = Base;
   }
 
-  bool getAddrOffsetSectionItem(uint32_t Index, uint64_t &Result) const;
+  Optional<SectionedAddress> getAddrOffsetSectionItem(uint32_t Index) const;
   bool getStringOffsetSectionItem(uint32_t Index, uint64_t &Result) const;
 
   DWARFDataExtractor getDebugInfoExtractor() const;
@@ -381,7 +376,7 @@ public:
     llvm_unreachable("Invalid UnitType.");
   }
 
-  llvm::Optional<BaseAddress> getBaseAddress();
+  llvm::Optional<SectionedAddress> getBaseAddress();
 
   DWARFDie getUnitDIE(bool ExtractUnitDIEOnly = true) {
     extractDIEsIfNeeded(ExtractUnitDIEOnly);
diff --git a/lib/CodeGen/AsmPrinter/AddressPool.cpp b/lib/CodeGen/AsmPrinter/AddressPool.cpp
index c21616766fa..f8143b903d5 100644
--- a/lib/CodeGen/AsmPrinter/AddressPool.cpp
+++ b/lib/CodeGen/AsmPrinter/AddressPool.cpp
@@ -39,6 +39,9 @@ void AddressPool::emitHeader(AsmPrinter &Asm, MCSection *Section) {
 
 // Emit addresses into the section given.
 void AddressPool::emit(AsmPrinter &Asm, MCSection *AddrSection) {
+  if (isEmpty())
+    return;
+
   // Start the dwarf addr section.
   Asm.OutStreamer->SwitchSection(AddrSection);
 
@@ -49,9 +52,6 @@ void AddressPool::emit(AsmPrinter &Asm, MCSection *AddrSection) {
   // It is referenced via DW_AT_addr_base.
   Asm.OutStreamer->EmitLabel(AddressTableBaseSym);
 
-  if (Pool.empty())
-    return;
-
   // Order the address pool entries by ID
   SmallVector<const MCExpr *, 64> Entries(Pool.size());
 
diff --git a/lib/CodeGen/AsmPrinter/DIE.cpp b/lib/CodeGen/AsmPrinter/DIE.cpp
index 570424a79c8..6ffb6123509 100644
--- a/lib/CodeGen/AsmPrinter/DIE.cpp
+++ b/lib/CodeGen/AsmPrinter/DIE.cpp
@@ -414,6 +414,7 @@ void DIEInteger::EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const {
   case dwarf::DW_FORM_GNU_addr_index:
   case dwarf::DW_FORM_ref_udata:
   case dwarf::DW_FORM_strx:
+  case dwarf::DW_FORM_addrx:
   case dwarf::DW_FORM_udata:
     Asm->EmitULEB128(Integer);
     return;
@@ -440,6 +441,7 @@ unsigned DIEInteger::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {
   case dwarf::DW_FORM_GNU_addr_index:
   case dwarf::DW_FORM_ref_udata:
   case dwarf::DW_FORM_strx:
+  case dwarf::DW_FORM_addrx:
   case dwarf::DW_FORM_udata:
     return getULEB128Size(Integer);
   case dwarf::DW_FORM_sdata:
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index 443c8879f13..5731541e595 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -69,14 +69,16 @@ void DwarfCompileUnit::addLabelAddress(DIE &Die, dwarf::Attribute Attribute,
   // pool from the skeleton - maybe even in non-fission (possibly fewer
   // relocations by sharing them in the pool, but we have other ideas about how
   // to reduce the number of relocations as well/instead).
-  if (!DD->useSplitDwarf() || !Skeleton)
+  if ((!DD->useSplitDwarf() || !Skeleton) && DD->getDwarfVersion() < 5)
     return addLocalLabelAddress(Die, Attribute, Label);
 
   if (Label)
     DD->addArangeLabel(SymbolCU(this, Label));
 
   unsigned idx = DD->getAddressPool().getIndex(Label);
-  Die.addValue(DIEValueAllocator, Attribute, dwarf::DW_FORM_GNU_addr_index,
+  Die.addValue(DIEValueAllocator, Attribute,
+               DD->getDwarfVersion() >= 5 ? dwarf::DW_FORM_addrx
+                                          : dwarf::DW_FORM_GNU_addr_index,
                DIEInteger(idx));
 }
 
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 94e12658cfe..e16ca4c4608 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -851,10 +851,6 @@ void DwarfDebug::finalizeModuleInfo() {
         SkCU->addUInt(SkCU->getUnitDie(), dwarf::DW_AT_GNU_dwo_id,
                       dwarf::DW_FORM_data8, ID);
       }
-      // We don't keep track of which addresses are used in which CU so this
-      // is a bit pessimistic under LTO.
-      if (!AddrPool.isEmpty())
-        SkCU->addAddrTableBase();
 
       if (getDwarfVersion() < 5 && !SkCU->getRangeLists().empty()) {
         const MCSymbol *Sym = TLOF.getDwarfRangesSection()->getBeginSymbol();
@@ -870,6 +866,12 @@ void DwarfDebug::finalizeModuleInfo() {
     // .subsections_via_symbols in mach-o. This would mean turning on
     // ranges for all subprogram DIEs for mach-o.
     DwarfCompileUnit &U = SkCU ? *SkCU : TheCU;
+
+    // We don't keep track of which addresses are used in which CU so this
+    // is a bit pessimistic under LTO.
+    if (!AddrPool.isEmpty())
+      U.addAddrTableBase();
+
     if (unsigned NumRanges = TheCU.getRanges().size()) {
       if (NumRanges > 1 && useRangesSection())
         // A DW_AT_low_pc attribute may also be specified in combination with
@@ -948,9 +950,10 @@ void DwarfDebug::endModule() {
     emitDebugInfoDWO();
     emitDebugAbbrevDWO();
     emitDebugLineDWO();
-    emitDebugAddr();
   }
 
+  emitDebugAddr();
+
   // Emit info into the dwarf accelerator table sections.
   switch (getAccelTableKind()) {
   case AccelTableKind::Apple:
@@ -2439,9 +2442,8 @@ void DwarfDebug::emitDebugStrDWO() {
                          OffSec, /* UseRelativeOffsets = */ false);
 }
 
-// Emit DWO addresses.
+// Emit address pool.
 void DwarfDebug::emitDebugAddr() {
-  assert(useSplitDwarf() && "No split dwarf?");
   AddrPool.emit(*Asm, Asm->getObjFileLowering().getDwarfAddrSection());
 }
 
diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index 14e59c3df27..8a168f4845d 100644
--- a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -1652,6 +1652,8 @@ void DwarfUnit::addRnglistsBase() {
 void DwarfUnit::addAddrTableBase() {
   const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
   MCSymbol *Label = DD->getAddressPool().getLabel();
-  addSectionLabel(getUnitDie(), dwarf::DW_AT_GNU_addr_base, Label,
-                  TLOF.getDwarfAddrSection()->getBeginSymbol());
+  addSectionLabel(getUnitDie(),
+                  getDwarfVersion() >= 5 ? dwarf::DW_AT_addr_base
+                                         : dwarf::DW_AT_GNU_addr_base,
+                  Label, TLOF.getDwarfAddrSection()->getBeginSymbol());
 }
diff --git a/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp b/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp
index 84e3c634f54..dfb913000a4 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp
@@ -69,7 +69,7 @@ void DWARFDebugRangeList::dump(raw_ostream &OS) const {
 }
 
 DWARFAddressRangesVector DWARFDebugRangeList::getAbsoluteRanges(
-    llvm::Optional<BaseAddress> BaseAddr) const {
+    llvm::Optional<SectionedAddress> BaseAddr) const {
   DWARFAddressRangesVector Res;
   for (const RangeListEntry &RLE : Entries) {
     if (RLE.isBaseAddressSelectionEntry(AddressSize)) {
diff --git a/lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp b/lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp
index eeb85edf5b2..72f84159957 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp
@@ -101,7 +101,7 @@ Error RangeListEntry::extract(DWARFDataExtractor Data, uint32_t End,
 }
 
 DWARFAddressRangesVector DWARFDebugRnglist::getAbsoluteRanges(
-    llvm::Optional<BaseAddress> BaseAddr) const {
+    llvm::Optional<SectionedAddress> BaseAddr) const {
   DWARFAddressRangesVector Res;
   for (const RangeListEntry &RLE : Entries) {
     if (RLE.EntryKind == dwarf::DW_RLE_end_of_list)
diff --git a/lib/DebugInfo/DWARF/DWARFDie.cpp b/lib/DebugInfo/DWARF/DWARFDie.cpp
index 6b69b822aad..b4413653290 100644
--- a/lib/DebugInfo/DWARF/DWARFDie.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDie.cpp
@@ -109,7 +109,7 @@ static void dumpLocation(raw_ostream &OS, DWARFFormValue &FormValue,
       auto LL = DebugLoc.parseOneLocationList(Data, &Offset);
       if (LL) {
         uint64_t BaseAddr = 0;
-        if (Optional<BaseAddress> BA = U->getBaseAddress())
+        if (Optional<SectionedAddress> BA = U->getBaseAddress())
           BaseAddr = BA->Address;
         LL->dump(OS, Ctx.isLittleEndian(), Obj.getAddressSize(), MRI, BaseAddr,
                  Indent);
diff --git a/lib/DebugInfo/DWARF/DWARFFormValue.cpp b/lib/DebugInfo/DWARF/DWARFFormValue.cpp
index 27895da8058..ed510a0e4cd 100644
--- a/lib/DebugInfo/DWARF/DWARFFormValue.cpp
+++ b/lib/DebugInfo/DWARF/DWARFFormValue.cpp
@@ -308,6 +308,7 @@ bool DWARFFormValue::extractValue(const DWARFDataExtractor &Data,
       break;
     case DW_FORM_GNU_addr_index:
     case DW_FORM_GNU_str_index:
+    case DW_FORM_addrx:
     case DW_FORM_strx:
       Value.uval = Data.getULEB128(OffsetPtr);
       break;
@@ -340,13 +341,17 @@ void DWARFFormValue::dump(raw_ostream &OS, DIDumpOptions DumpOpts) const {
   case DW_FORM_addr:
     AddrOS << format("0x%016" PRIx64, UValue);
     break;
+  case DW_FORM_addrx:
+  case DW_FORM_addrx1:
+  case DW_FORM_addrx2:
+  case DW_FORM_addrx3:
+  case DW_FORM_addrx4:
   case DW_FORM_GNU_addr_index: {
     AddrOS << format(" indexed (%8.8x) address = ", (uint32_t)UValue);
-    uint64_t Address;
     if (U == nullptr)
       OS << "<invalid dwarf unit>";
-    else if (U->getAddrOffsetSectionItem(UValue, Address))
-      AddrOS << format("0x%016" PRIx64, Address);
+    else if (Optional<SectionedAddress> A = U->getAddrOffsetSectionItem(UValue))
+      AddrOS << format("0x%016" PRIx64, A->Address);
     else
       OS << "<no .debug_addr section>";
     break;
@@ -555,16 +560,23 @@ Optional<const char *> DWARFFormValue::getAsCString() const {
 }
 
 Optional<uint64_t> DWARFFormValue::getAsAddress() const {
+  if (auto SA = getAsSectionedAddress())
+    return SA->Address;
+  return None;
+}
+Optional<SectionedAddress> DWARFFormValue::getAsSectionedAddress() const {
   if (!isFormClass(FC_Address))
     return None;
-  if (Form == DW_FORM_GNU_addr_index) {
+  if (Form == DW_FORM_GNU_addr_index || Form == DW_FORM_addrx) {
     uint32_t Index = Value.uval;
-    uint64_t Result;
-    if (!U || !U->getAddrOffsetSectionItem(Index, Result))
+    if (!U)
+      return None;
+    Optional<SectionedAddress> SA = U->getAddrOffsetSectionItem(Index);
+    if (!SA)
       return None;
-    return Result;
+    return SA;
   }
-  return Value.uval;
+  return {{Value.uval, Value.SectionIndex}};
 }
 
 Optional<uint64_t> DWARFFormValue::getAsReference() const {
diff --git a/lib/DebugInfo/DWARF/DWARFUnit.cpp b/lib/DebugInfo/DWARF/DWARFUnit.cpp
index dbac5a82b57..081163ba61f 100644
--- a/lib/DebugInfo/DWARF/DWARFUnit.cpp
+++ b/lib/DebugInfo/DWARF/DWARFUnit.cpp
@@ -195,15 +195,16 @@ DWARFDataExtractor DWARFUnit::getDebugInfoExtractor() const {
                             getAddressByteSize());
 }
 
-bool DWARFUnit::getAddrOffsetSectionItem(uint32_t Index,
-                                                uint64_t &Result) const {
+Optional<SectionedAddress>
+DWARFUnit::getAddrOffsetSectionItem(uint32_t Index) const {
   uint32_t Offset = AddrOffsetSectionBase + Index * getAddressByteSize();
   if (AddrOffsetSection->Data.size() < Offset + getAddressByteSize())
-    return false;
+    return None;
   DWARFDataExtractor DA(Context.getDWARFObj(), *AddrOffsetSection,
                         isLittleEndian, getAddressByteSize());
-  Result = DA.getRelocatedAddress(&Offset);
-  return true;
+  uint64_t Section;
+  uint64_t Address = DA.getRelocatedAddress(&Offset, &Section);
+  return {{Address, Section}};
 }
 
 bool DWARFUnit::getStringOffsetSectionItem(uint32_t Index,
@@ -401,8 +402,10 @@ size_t DWARFUnit::extractDIEsIfNeeded(bool CUDieOnly) {
     if (!isDWO) {
       assert(AddrOffsetSectionBase == 0);
       assert(RangeSectionBase == 0);
-      AddrOffsetSectionBase =
-          toSectionOffset(UnitDie.find(DW_AT_GNU_addr_base), 0);
+      AddrOffsetSectionBase = toSectionOffset(UnitDie.find(DW_AT_addr_base), 0);
+      if (!AddrOffsetSectionBase)
+        AddrOffsetSectionBase =
+            toSectionOffset(UnitDie.find(DW_AT_GNU_addr_base), 0);
       RangeSectionBase = toSectionOffset(UnitDie.find(DW_AT_rnglists_base), 0);
     }
 
@@ -760,15 +763,13 @@ const DWARFAbbreviationDeclarationSet *DWARFUnit::getAbbreviations() const {
   return Abbrevs;
 }
 
-llvm::Optional<BaseAddress> DWARFUnit::getBaseAddress() {
+llvm::Optional<SectionedAddress> DWARFUnit::getBaseAddress() {
   if (BaseAddr)
     return BaseAddr;
 
   DWARFDie UnitDie = getUnitDIE();
   Optional<DWARFFormValue> PC = UnitDie.find({DW_AT_low_pc, DW_AT_entry_pc});
-  if (Optional<uint64_t> Addr = toAddress(PC))
-    BaseAddr = {*Addr, PC->getSectionIndex()};
-
+  BaseAddr = toSectionedAddress(PC);
   return BaseAddr;
 }
 
diff --git a/test/DebugInfo/X86/debug_addr.ll b/test/DebugInfo/X86/debug_addr.ll
index ea7c8bda7bc..b50428a282c 100644
--- a/test/DebugInfo/X86/debug_addr.ll
+++ b/test/DebugInfo/X86/debug_addr.ll
@@ -31,7 +31,8 @@
 ; DWARF5: DW_TAG_compile_unit
 ; DWARF5-NOT: DW_TAG_{{.*}}
 ; DWARF5: DW_AT_GNU_dwo_name{{.*}}test.dwo
-; DWARF5: DW_AT_GNU_addr_base{{.*}}0x00000008
+; DWARF5: DW_AT_addr_base{{.*}}0x00000008
+; DWARF5: DW_AT_low_pc [DW_FORM_addrx] ( indexed (00000000) address = 0x0000000000000000)
 ; DWARF5: .debug_addr contents:
 ; DWARF5-NEXT: 0x00000000: Addr Section: length = 0x0000000c, version = 0x0005, addr_size = 0x04, seg_size = 0x00
 ; DWARF5-NEXT: Addrs: [
diff --git a/test/DebugInfo/X86/fission-ranges.ll b/test/DebugInfo/X86/fission-ranges.ll
index 3ea5aa2e350..e7402787dc3 100644
--- a/test/DebugInfo/X86/fission-ranges.ll
+++ b/test/DebugInfo/X86/fission-ranges.ll
@@ -11,6 +11,7 @@
 ; CHECK-NEXT: DW_AT_GNU_dwo_name
 ; CHECK-NEXT: DW_AT_comp_dir
 ; CHECK-NEXT: DW_AT_GNU_dwo_id
+; CHECK-NEXT: DW_AT_GNU_ranges_base
 ; CHECK-NEXT: DW_AT_GNU_addr_base [DW_FORM_sec_offset]                   (0x00000000)
 
 ; CHECK: .debug_info.dwo contents:
-- 
GitLab


From 3a1bc560ac4b0244d0f1967fa9e254fcf0150005 Mon Sep 17 00:00:00 2001
From: David Blaikie <dblaikie@gmail.com>
Date: Sat, 20 Oct 2018 06:16:25 +0000
Subject: [PATCH 0343/1116] llvm-dwarfdump: Support RLE_addressx and
 RLE_startx_length in .debug_rnglists

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344835 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../llvm/DebugInfo/DWARF/DWARFDebugRnglists.h |  9 ++-
 include/llvm/DebugInfo/DWARF/DWARFListTable.h | 14 +++-
 lib/DebugInfo/DWARF/DWARFContext.cpp          | 22 +++--
 lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp    | 80 +++++++++++++++----
 lib/DebugInfo/DWARF/DWARFUnit.cpp             |  2 +-
 .../tools/llvm-dwarfdump/X86/debug_rnglists.s | 27 ++++++-
 6 files changed, 123 insertions(+), 31 deletions(-)

diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugRnglists.h b/include/llvm/DebugInfo/DWARF/DWARFDebugRnglists.h
index 0615bbf0308..5cc8d789e59 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFDebugRnglists.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugRnglists.h
@@ -10,6 +10,7 @@
 #ifndef LLVM_DEBUGINFO_DWARFDEBUGRNGLISTS_H
 #define LLVM_DEBUGINFO_DWARFDEBUGRNGLISTS_H
 
+#include "llvm/ADT/Optional.h"
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/DebugInfo/DIContext.h"
 #include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h"
@@ -23,6 +24,7 @@ namespace llvm {
 
 class Error;
 class raw_ostream;
+class DWARFUnit;
 
 /// A class representing a single range list entry.
 struct RangeListEntry : public DWARFListEntryBase {
@@ -35,7 +37,9 @@ struct RangeListEntry : public DWARFListEntryBase {
 
   Error extract(DWARFDataExtractor Data, uint32_t End, uint32_t *OffsetPtr);
   void dump(raw_ostream &OS, uint8_t AddrSize, uint8_t MaxEncodingStringLength,
-            uint64_t &CurrentBase, DIDumpOptions DumpOpts) const;
+            uint64_t &CurrentBase, DIDumpOptions DumpOpts,
+            llvm::function_ref<Optional<SectionedAddress>(uint32_t)>
+                LookupPooledAddress) const;
   bool isSentinel() const { return EntryKind == dwarf::DW_RLE_end_of_list; }
 };
 
@@ -44,7 +48,8 @@ class DWARFDebugRnglist : public DWARFListType<RangeListEntry> {
 public:
   /// Build a DWARFAddressRangesVector from a rangelist.
   DWARFAddressRangesVector
-  getAbsoluteRanges(llvm::Optional<SectionedAddress> BaseAddr) const;
+  getAbsoluteRanges(llvm::Optional<SectionedAddress> BaseAddr,
+                    DWARFUnit &U) const;
 };
 
 class DWARFDebugRnglistTable : public DWARFListTableBase<DWARFDebugRnglist> {
diff --git a/include/llvm/DebugInfo/DWARF/DWARFListTable.h b/include/llvm/DebugInfo/DWARF/DWARFListTable.h
index 8cf9e400892..8c15d9d58d4 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFListTable.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFListTable.h
@@ -156,7 +156,10 @@ public:
   uint32_t getHeaderOffset() const { return Header.getHeaderOffset(); }
   uint8_t getAddrSize() const { return Header.getAddrSize(); }
 
-  void dump(raw_ostream &OS, DIDumpOptions DumpOpts = {}) const;
+  void dump(raw_ostream &OS,
+            llvm::function_ref<Optional<SectionedAddress>(uint32_t)>
+                LookupPooledAddress,
+            DIDumpOptions DumpOpts = {}) const;
 
   /// Return the contents of the offset entry designated by a given index.
   Optional<uint32_t> getOffsetEntry(uint32_t Index) const {
@@ -229,8 +232,11 @@ Error DWARFListType<ListEntryType>::extract(DWARFDataExtractor Data,
 }
 
 template <typename DWARFListType>
-void DWARFListTableBase<DWARFListType>::dump(raw_ostream &OS,
-                                             DIDumpOptions DumpOpts) const {
+void DWARFListTableBase<DWARFListType>::dump(
+    raw_ostream &OS,
+    llvm::function_ref<Optional<SectionedAddress>(uint32_t)>
+        LookupPooledAddress,
+    DIDumpOptions DumpOpts) const {
   Header.dump(OS, DumpOpts);
   OS << HeaderString << "\n";
 
@@ -249,7 +255,7 @@ void DWARFListTableBase<DWARFListType>::dump(raw_ostream &OS,
   for (const auto &List : ListMap)
     for (const auto &Entry : List.second.getEntries())
       Entry.dump(OS, getAddrSize(), MaxEncodingStringLength, CurrentBase,
-                 DumpOpts);
+                 DumpOpts, LookupPooledAddress);
 }
 
 template <typename DWARFListType>
diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp
index ddabc7a4652..18ec8476e9d 100644
--- a/lib/DebugInfo/DWARF/DWARFContext.cpp
+++ b/lib/DebugInfo/DWARF/DWARFContext.cpp
@@ -269,9 +269,11 @@ static void dumpAddrSection(raw_ostream &OS, DWARFDataExtractor &AddrData,
 }
 
 // Dump the .debug_rnglists or .debug_rnglists.dwo section (DWARF v5).
-static void dumpRnglistsSection(raw_ostream &OS,
-                                DWARFDataExtractor &rnglistData,
-                                DIDumpOptions DumpOpts) {
+static void
+dumpRnglistsSection(raw_ostream &OS, DWARFDataExtractor &rnglistData,
+                    llvm::function_ref<Optional<SectionedAddress>(uint32_t)>
+                        LookupPooledAddress,
+                    DIDumpOptions DumpOpts) {
   uint32_t Offset = 0;
   while (rnglistData.isValidOffset(Offset)) {
     llvm::DWARFDebugRnglistTable Rnglists;
@@ -285,7 +287,7 @@ static void dumpRnglistsSection(raw_ostream &OS,
         break;
       Offset = TableOffset + Length;
     } else {
-      Rnglists.dump(OS, DumpOpts);
+      Rnglists.dump(OS, LookupPooledAddress, DumpOpts);
     }
   }
 }
@@ -495,18 +497,26 @@ void DWARFContext::dump(
     }
   }
 
+  auto LookupPooledAddress = [&](uint32_t Index) -> Optional<SectionedAddress> {
+    const auto &CUs = compile_units();
+    auto I = CUs.begin();
+    if (I == CUs.end())
+      return None;
+    return (*I)->getAddrOffsetSectionItem(Index);
+  };
+
   if (shouldDump(Explicit, ".debug_rnglists", DIDT_ID_DebugRnglists,
                  DObj->getRnglistsSection().Data)) {
     DWARFDataExtractor RnglistData(*DObj, DObj->getRnglistsSection(),
                                    isLittleEndian(), 0);
-    dumpRnglistsSection(OS, RnglistData, DumpOpts);
+    dumpRnglistsSection(OS, RnglistData, LookupPooledAddress, DumpOpts);
   }
 
   if (shouldDump(ExplicitDWO, ".debug_rnglists.dwo", DIDT_ID_DebugRnglists,
                  DObj->getRnglistsDWOSection().Data)) {
     DWARFDataExtractor RnglistData(*DObj, DObj->getRnglistsDWOSection(),
                                    isLittleEndian(), 0);
-    dumpRnglistsSection(OS, RnglistData, DumpOpts);
+    dumpRnglistsSection(OS, RnglistData, LookupPooledAddress, DumpOpts);
   }
 
   if (shouldDump(Explicit, ".debug_pubnames", DIDT_ID_DebugPubnames,
diff --git a/lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp b/lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp
index 72f84159957..cb5fb0d49da 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp
@@ -32,21 +32,34 @@ Error RangeListEntry::extract(DWARFDataExtractor Data, uint32_t End,
     Value0 = Value1 = 0;
     break;
   // TODO: Support other encodings.
-  case dwarf::DW_RLE_base_addressx:
-    return createStringError(errc::not_supported,
-                       "unsupported rnglists encoding DW_RLE_base_addressx "
-                       "at offset 0x%" PRIx32,
-                       *OffsetPtr - 1);
+  case dwarf::DW_RLE_base_addressx: {
+    uint32_t PreviousOffset = *OffsetPtr - 1;
+    Value0 = Data.getULEB128(OffsetPtr);
+    if (End < *OffsetPtr)
+      return createStringError(
+          errc::invalid_argument,
+          "read past end of table when reading "
+          "DW_RLE_base_addressx encoding at offset 0x%" PRIx32,
+          PreviousOffset);
+    break;
+  }
   case dwarf::DW_RLE_startx_endx:
     return createStringError(errc::not_supported,
                        "unsupported rnglists encoding DW_RLE_startx_endx at "
                        "offset 0x%" PRIx32,
                        *OffsetPtr - 1);
-  case dwarf::DW_RLE_startx_length:
-    return createStringError(errc::not_supported,
-                       "unsupported rnglists encoding DW_RLE_startx_length "
-                       "at offset 0x%" PRIx32,
-                       *OffsetPtr - 1);
+  case dwarf::DW_RLE_startx_length: {
+    uint32_t PreviousOffset = *OffsetPtr - 1;
+    Value0 = Data.getULEB128(OffsetPtr);
+    Value1 = Data.getULEB128(OffsetPtr);
+    if (End < *OffsetPtr)
+      return createStringError(
+          errc::invalid_argument,
+          "read past end of table when reading "
+          "DW_RLE_startx_length encoding at offset 0x%" PRIx32,
+          PreviousOffset);
+    break;
+  }
   case dwarf::DW_RLE_offset_pair: {
     uint32_t PreviousOffset = *OffsetPtr - 1;
     Value0 = Data.getULEB128(OffsetPtr);
@@ -100,12 +113,19 @@ Error RangeListEntry::extract(DWARFDataExtractor Data, uint32_t End,
   return Error::success();
 }
 
-DWARFAddressRangesVector DWARFDebugRnglist::getAbsoluteRanges(
-    llvm::Optional<SectionedAddress> BaseAddr) const {
+DWARFAddressRangesVector
+DWARFDebugRnglist::getAbsoluteRanges(llvm::Optional<SectionedAddress> BaseAddr,
+                                     DWARFUnit &U) const {
   DWARFAddressRangesVector Res;
   for (const RangeListEntry &RLE : Entries) {
     if (RLE.EntryKind == dwarf::DW_RLE_end_of_list)
       break;
+    if (RLE.EntryKind == dwarf::DW_RLE_base_addressx) {
+      BaseAddr = U.getAddrOffsetSectionItem(RLE.Value0);
+      if (!BaseAddr)
+        BaseAddr = {RLE.Value0, 0};
+      continue;
+    }
     if (RLE.EntryKind == dwarf::DW_RLE_base_address) {
       BaseAddr = {RLE.Value0, RLE.SectionIndex};
       continue;
@@ -133,6 +153,15 @@ DWARFAddressRangesVector DWARFDebugRnglist::getAbsoluteRanges(
       E.LowPC = RLE.Value0;
       E.HighPC = E.LowPC + RLE.Value1;
       break;
+    case dwarf::DW_RLE_startx_length: {
+      auto Start = U.getAddrOffsetSectionItem(RLE.Value0);
+      if (!Start)
+        Start = {0, 0};
+      E.SectionIndex = Start->SectionIndex;
+      E.LowPC = Start->Address;
+      E.HighPC = E.LowPC + RLE.Value1;
+      break;
+    }
     default:
       // Unsupported encodings should have been reported during extraction,
       // so we should not run into any here.
@@ -143,9 +172,11 @@ DWARFAddressRangesVector DWARFDebugRnglist::getAbsoluteRanges(
   return Res;
 }
 
-void RangeListEntry::dump(raw_ostream &OS, uint8_t AddrSize,
-                          uint8_t MaxEncodingStringLength,
-                          uint64_t &CurrentBase, DIDumpOptions DumpOpts) const {
+void RangeListEntry::dump(
+    raw_ostream &OS, uint8_t AddrSize, uint8_t MaxEncodingStringLength,
+    uint64_t &CurrentBase, DIDumpOptions DumpOpts,
+    llvm::function_ref<Optional<SectionedAddress>(uint32_t)>
+        LookupPooledAddress) const {
   auto PrintRawEntry = [](raw_ostream &OS, const RangeListEntry &Entry,
                           uint8_t AddrSize, DIDumpOptions DumpOpts) {
     if (DumpOpts.Verbose) {
@@ -172,6 +203,17 @@ void RangeListEntry::dump(raw_ostream &OS, uint8_t AddrSize,
   case dwarf::DW_RLE_end_of_list:
     OS << (DumpOpts.Verbose ? "" : "<End of list>");
     break;
+    //  case dwarf::DW_RLE_base_addressx:
+  case dwarf::DW_RLE_base_addressx: {
+    if (auto SA = LookupPooledAddress(Value0))
+      CurrentBase = SA->Address;
+    else
+      CurrentBase = Value0;
+    if (!DumpOpts.Verbose)
+      return;
+    OS << format(" 0x%*.*" PRIx64, AddrSize * 2, AddrSize * 2, Value0);
+    break;
+  }
   case dwarf::DW_RLE_base_address:
     // In non-verbose mode we do not print anything for this entry.
     CurrentBase = Value0;
@@ -191,6 +233,14 @@ void RangeListEntry::dump(raw_ostream &OS, uint8_t AddrSize,
   case dwarf::DW_RLE_start_end:
     DWARFAddressRange(Value0, Value1).dump(OS, AddrSize, DumpOpts);
     break;
+  case dwarf::DW_RLE_startx_length: {
+    PrintRawEntry(OS, *this, AddrSize, DumpOpts);
+    uint64_t Start = 0;
+    if (auto SA = LookupPooledAddress(Value0))
+      Start = SA->Address;
+    DWARFAddressRange(Start, Start + Value1).dump(OS, AddrSize, DumpOpts);
+    break;
+  } break;
   default:
     llvm_unreachable("Unsupported range list encoding");
   }
diff --git a/lib/DebugInfo/DWARF/DWARFUnit.cpp b/lib/DebugInfo/DWARF/DWARFUnit.cpp
index 081163ba61f..4cf1f938c3e 100644
--- a/lib/DebugInfo/DWARF/DWARFUnit.cpp
+++ b/lib/DebugInfo/DWARF/DWARFUnit.cpp
@@ -539,7 +539,7 @@ DWARFUnit::findRnglistFromOffset(uint32_t Offset) {
                                   isLittleEndian, RngListTable->getAddrSize());
     auto RangeListOrError = RngListTable->findList(RangesData, Offset);
     if (RangeListOrError)
-      return RangeListOrError.get().getAbsoluteRanges(getBaseAddress());
+      return RangeListOrError.get().getAbsoluteRanges(getBaseAddress(), *this);
     return RangeListOrError.takeError();
   }
 
diff --git a/test/tools/llvm-dwarfdump/X86/debug_rnglists.s b/test/tools/llvm-dwarfdump/X86/debug_rnglists.s
index 8f718b699f5..60533ca2721 100644
--- a/test/tools/llvm-dwarfdump/X86/debug_rnglists.s
+++ b/test/tools/llvm-dwarfdump/X86/debug_rnglists.s
@@ -57,6 +57,29 @@
 # BOTH:         ranges:
 # BOTH-NOT:     [
 
+# TERSE-NEXT:   range list header: length = 0x0000000b, version = 0x0005, addr_size = 0x08, seg_size = 0x00, offset_entry_count = 0x00000000
+
+# VERBOSE-NEXT: 0x{{[0-9a-f]*}}:
+# VERBOSE-SAME: range list header: length = 0x0000000b, version = 0x0005, addr_size = 0x08, seg_size = 0x00, offset_entry_count = 0x00000000
+
+# BOTH-NEXT:    ranges:
+# TERSE-NEXT:   <End of list>
+
+# VERBOSE-NEXT: 0x00000082: [DW_RLE_base_addressx]:  0x0000000000000000
+# VERBOSE-NEXT: 0x00000084: [DW_RLE_end_of_list ]
+
+# TERSE-NEXT:   range list header: length = 0x0000000c, version = 0x0005, addr_size = 0x08, seg_size = 0x00, offset_entry_count = 0x00000000
+
+# VERBOSE-NEXT: 0x{{[0-9a-f]*}}:
+# VERBOSE-SAME: range list header: length = 0x0000000c, version = 0x0005, addr_size = 0x08, seg_size = 0x00, offset_entry_count = 0x00000000
+
+# BOTH-NEXT:    ranges:
+# TERSE-NEXT:   [0x0000000000000000, 0x000000000000002a)
+# TERSE-NEXT:   <End of list>
+
+# VERBOSE-NEXT: 0x000000a1: [DW_RLE_startx_length]:  0x0000000000000002, 0x000000000000002a => [0x0000000000000000, 0x000000000000002a)
+# VERBOSE-NEXT: 0x000000a4: [DW_RLE_end_of_list ]
+
 # TERSE-NEXT:   range list header: length = 0x0000000e, version = 0x0005, addr_size = 0x08, seg_size = 0x00, offset_entry_count = 0x00000000
 
 # VERBOSE-NEXT: 0x{{[0-9a-f]*}}:
@@ -87,9 +110,7 @@
 # BOTH-NOT:     range list header:
 
 # ERR-NOT:  error:
-# ERR:      error: unsupported rnglists encoding DW_RLE_base_addressx at offset 0x82
-# ERR-NEXT: error: unsupported rnglists encoding DW_RLE_startx_endx at offset 0x91
-# ERR-NEXT: error: unsupported rnglists encoding DW_RLE_startx_length at offset 0xa1
+# ERR: error: unsupported rnglists encoding DW_RLE_startx_endx at offset 0x91
 # ERR-NOT:  error:
 
 .section .debug_rnglists,"",@progbits
-- 
GitLab


From 9a16d2deaa0aa55126cc1fb7fc7b7fda93152fbe Mon Sep 17 00:00:00 2001
From: David Blaikie <dblaikie@gmail.com>
Date: Sat, 20 Oct 2018 07:36:39 +0000
Subject: [PATCH 0344/1116] DebugInfo: Use address pool forms in debug_rnglists

Save no relocations by reusing addresses from the address pool.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344836 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/AsmPrinter/DIE.cpp              |   2 +
 lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp |  26 +++--
 lib/CodeGen/AsmPrinter/DwarfCompileUnit.h   |  11 +-
 lib/CodeGen/AsmPrinter/DwarfDebug.cpp       | 109 +++++++++-----------
 lib/CodeGen/AsmPrinter/DwarfDebug.h         |   4 +-
 lib/CodeGen/AsmPrinter/DwarfFile.cpp        |   8 ++
 lib/CodeGen/AsmPrinter/DwarfFile.h          |  43 ++++++++
 lib/CodeGen/AsmPrinter/DwarfUnit.h          |  27 -----
 test/DebugInfo/X86/fission-ranges.ll        |  11 +-
 test/DebugInfo/X86/range_reloc.ll           |  16 +--
 test/DebugInfo/X86/rnglists_curanges.ll     |   4 +-
 11 files changed, 140 insertions(+), 121 deletions(-)

diff --git a/lib/CodeGen/AsmPrinter/DIE.cpp b/lib/CodeGen/AsmPrinter/DIE.cpp
index 6ffb6123509..ca3a7506789 100644
--- a/lib/CodeGen/AsmPrinter/DIE.cpp
+++ b/lib/CodeGen/AsmPrinter/DIE.cpp
@@ -415,6 +415,7 @@ void DIEInteger::EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const {
   case dwarf::DW_FORM_ref_udata:
   case dwarf::DW_FORM_strx:
   case dwarf::DW_FORM_addrx:
+  case dwarf::DW_FORM_rnglistx:
   case dwarf::DW_FORM_udata:
     Asm->EmitULEB128(Integer);
     return;
@@ -442,6 +443,7 @@ unsigned DIEInteger::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {
   case dwarf::DW_FORM_ref_udata:
   case dwarf::DW_FORM_strx:
   case dwarf::DW_FORM_addrx:
+  case dwarf::DW_FORM_rnglistx:
   case dwarf::DW_FORM_udata:
     return getULEB128Size(Integer);
   case dwarf::DW_FORM_sdata:
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index 5731541e595..f638087b50f 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -424,24 +424,30 @@ void DwarfCompileUnit::addScopeRangeList(DIE &ScopeDIE,
           ? TLOF.getDwarfRnglistsSection()->getBeginSymbol()
           : TLOF.getDwarfRangesSection()->getBeginSymbol();
 
-  RangeSpanList List(Asm->createTempSymbol("debug_ranges"), std::move(Range));
+  HasRangeLists = true;
+
+  // Add the range list to the set of ranges to be emitted.
+  auto IndexAndList =
+      (DD->getDwarfVersion() < 5 && Skeleton ? Skeleton->DU : DU)
+          ->addRange((Skeleton ? Skeleton->BaseAddress : BaseAddress),
+                     std::move(Range));
+
+  uint32_t Index = IndexAndList.first;
+  auto &List = *IndexAndList.second;
 
   // Under fission, ranges are specified by constant offsets relative to the
   // CU's DW_AT_GNU_ranges_base.
   // FIXME: For DWARF v5, do not generate the DW_AT_ranges attribute under
   // fission until we support the forms using the .debug_addr section
   // (DW_RLE_startx_endx etc.).
-  if (isDwoUnit()) {
-    if (DD->getDwarfVersion() < 5)
-      addSectionDelta(ScopeDIE, dwarf::DW_AT_ranges, List.getSym(),
-                      RangeSectionSym);
-  } else {
+  if (isDwoUnit())
+    addSectionDelta(ScopeDIE, dwarf::DW_AT_ranges, List.getSym(),
+                    RangeSectionSym);
+  else if (DD->getDwarfVersion() >= 5)
+    addUInt(ScopeDIE, dwarf::DW_AT_ranges, dwarf::DW_FORM_rnglistx, Index);
+  else
     addSectionLabel(ScopeDIE, dwarf::DW_AT_ranges, List.getSym(),
                     RangeSectionSym);
-  }
-
-  // Add the range list to the set of ranges to be emitted.
-  (Skeleton ? Skeleton : this)->CURangeLists.push_back(std::move(List));
 }
 
 void DwarfCompileUnit::attachRangesOrLowHighPC(
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
index 6389ccd686d..97a944e9b95 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
@@ -44,6 +44,7 @@ class MDNode;
 class DwarfCompileUnit final : public DwarfUnit {
   /// A numeric ID unique among all CUs in the module
   unsigned UniqueID;
+  bool HasRangeLists = false;
 
   /// The attribute index of DW_AT_stmt_list in the compile unit DIE, avoiding
   /// the need to search for it in applyStmtList.
@@ -69,10 +70,6 @@ class DwarfCompileUnit final : public DwarfUnit {
   /// GlobalTypes - A map of globally visible types for this unit.
   StringMap<const DIE *> GlobalTypes;
 
-  // List of range lists for a given compile unit, separate from the ranges for
-  // the CU itself.
-  SmallVector<RangeSpanList, 1> CURangeLists;
-
   // List of ranges for a given compile unit.
   SmallVector<RangeSpan, 2> CURanges;
 
@@ -108,6 +105,7 @@ public:
   DwarfCompileUnit(unsigned UID, const DICompileUnit *Node, AsmPrinter *A,
                    DwarfDebug *DW, DwarfFile *DWU);
 
+  bool hasRangeLists() const { return HasRangeLists; }
   unsigned getUniqueID() const { return UniqueID; }
 
   DwarfCompileUnit *getSkeleton() const {
@@ -299,11 +297,6 @@ public:
 
   void applyLabelAttributes(const DbgLabel &Label, DIE &LabelDie);
 
-  /// getRangeLists - Get the vector of range lists.
-  const SmallVectorImpl<RangeSpanList> &getRangeLists() const {
-    return (Skeleton ? Skeleton : this)->CURangeLists;
-  }
-
   /// getRanges - Get the list of ranges for this unit.
   const SmallVectorImpl<RangeSpan> &getRanges() const { return CURanges; }
   SmallVector<RangeSpan, 2> takeRanges() { return std::move(CURanges); }
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index e16ca4c4608..e3805bf72b2 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -728,9 +728,13 @@ void DwarfDebug::beginModule() {
 
   // Create the symbol that designates the start of the DWARF v5 range list
   // table. It is located past the header and before the offsets table.
-  if (getDwarfVersion() >= 5)
+  if (getDwarfVersion() >= 5) {
     (useSplitDwarf() ? SkeletonHolder : InfoHolder)
         .setRnglistsTableBaseSym(Asm->createTempSymbol("rnglists_table_base"));
+    if (useSplitDwarf())
+      InfoHolder.setRnglistsTableBaseSym(
+          Asm->createTempSymbol("rnglists_dwo_table_base"));
+  }
 
   // Create the symbol that points to the first entry following the debug
   // address table (.debug_addr) header.
@@ -852,7 +856,7 @@ void DwarfDebug::finalizeModuleInfo() {
                       dwarf::DW_FORM_data8, ID);
       }
 
-      if (getDwarfVersion() < 5 && !SkCU->getRangeLists().empty()) {
+      if (getDwarfVersion() < 5 && !SkeletonHolder.getRangeLists().empty()) {
         const MCSymbol *Sym = TLOF.getDwarfRangesSection()->getBeginSymbol();
         SkCU->addSectionLabel(SkCU->getUnitDie(), dwarf::DW_AT_GNU_ranges_base,
                               Sym, Sym);
@@ -884,8 +888,7 @@ void DwarfDebug::finalizeModuleInfo() {
       U.attachRangesOrLowHighPC(U.getUnitDie(), TheCU.takeRanges());
     }
 
-    if (getDwarfVersion() >= 5 && !useSplitDwarf() &&
-        !U.getRangeLists().empty())
+    if (getDwarfVersion() >= 5 && U.hasRangeLists())
       U.addRnglistsBase();
 
     auto *CUNode = cast<DICompileUnit>(P.first);
@@ -2134,10 +2137,10 @@ void DwarfDebug::emitDebugARanges() {
 }
 
 /// Emit a single range list. We handle both DWARF v5 and earlier.
-static void emitRangeList(AsmPrinter *Asm, DwarfCompileUnit *CU,
+static void emitRangeList(DwarfDebug &DD, AsmPrinter *Asm,
                           const RangeSpanList &List) {
 
-  auto DwarfVersion = CU->getDwarfVersion();
+  auto DwarfVersion = DD.getDwarfVersion();
   // Emit our symbol so we can find the beginning of the range.
   Asm->OutStreamer->EmitLabel(List.getSym());
   // Gather all the ranges that apply to the same section so they can share
@@ -2149,7 +2152,7 @@ static void emitRangeList(AsmPrinter *Asm, DwarfCompileUnit *CU,
   for (const RangeSpan &Range : List.getRanges())
     SectionRanges[&Range.getStart()->getSection()].push_back(&Range);
 
-  auto *CUBase = CU->getBaseAddress();
+  const MCSymbol *CUBase = List.getBaseAddress();
   bool BaseIsSet = false;
   for (const auto &P : SectionRanges) {
     // Don't bother with a base address entry if there's only one range in
@@ -2166,12 +2169,15 @@ static void emitRangeList(AsmPrinter *Asm, DwarfCompileUnit *CU,
       // the lowest address/range in this object.
       Base = P.second.front()->getStart();
       if (DwarfVersion >= 5) {
-        Asm->OutStreamer->AddComment("DW_RLE_base_address");
-        Asm->OutStreamer->EmitIntValue(dwarf::DW_RLE_base_address, 1);
-      } else
+        Asm->OutStreamer->AddComment("DW_RLE_base_addressx");
+        Asm->OutStreamer->EmitIntValue(dwarf::DW_RLE_base_addressx, 1);
+        Asm->OutStreamer->AddComment("  base address index");
+        Asm->EmitULEB128(DD.getAddressPool().getIndex(Base));
+      } else {
         Asm->OutStreamer->EmitIntValue(-1, Size);
-      Asm->OutStreamer->AddComment("  base address");
-      Asm->OutStreamer->EmitSymbolValue(Base, Size);
+        Asm->OutStreamer->AddComment("  base address");
+        Asm->OutStreamer->EmitSymbolValue(Base, Size);
+      }
     } else if (BaseIsSet && DwarfVersion < 5) {
       BaseIsSet = false;
       assert(!Base);
@@ -2198,10 +2204,10 @@ static void emitRangeList(AsmPrinter *Asm, DwarfCompileUnit *CU,
           Asm->EmitLabelDifference(End, Base, Size);
         }
       } else if (DwarfVersion >= 5) {
-        Asm->OutStreamer->AddComment("DW_RLE_start_length");
-        Asm->OutStreamer->EmitIntValue(dwarf::DW_RLE_start_length, 1);
-        Asm->OutStreamer->AddComment("  start");
-        Asm->OutStreamer->EmitSymbolValue(Begin, Size);
+        Asm->OutStreamer->AddComment("DW_RLE_startx_length");
+        Asm->OutStreamer->EmitIntValue(dwarf::DW_RLE_startx_length, 1);
+        Asm->OutStreamer->AddComment("  start index");
+        Asm->EmitULEB128(DD.getAddressPool().getIndex(Begin));
         Asm->OutStreamer->AddComment("  length");
         Asm->EmitLabelDifferenceAsULEB128(End, Begin);
       } else {
@@ -2223,85 +2229,72 @@ static void emitRangeList(AsmPrinter *Asm, DwarfCompileUnit *CU,
 // Emit the header of a DWARF 5 range list table. Returns the symbol that
 // designates the end of the table for the caller to emit when the table is
 // complete.
-static MCSymbol *emitRnglistsTableHeader(AsmPrinter *Asm, DwarfFile &Holder) {
+static MCSymbol *emitRnglistsTableHeader(AsmPrinter *Asm,
+                                         const DwarfFile &Holder) {
   // The length is described by a starting label right after the length field
   // and an end label.
   MCSymbol *TableStart = Asm->createTempSymbol("debug_rnglist_table_start");
   MCSymbol *TableEnd = Asm->createTempSymbol("debug_rnglist_table_end");
   // Build the range table header, which starts with the length field.
+  Asm->OutStreamer->AddComment("Length");
   Asm->EmitLabelDifference(TableEnd, TableStart, 4);
   Asm->OutStreamer->EmitLabel(TableStart);
   // Version number (DWARF v5 and later).
+  Asm->OutStreamer->AddComment("Version");
   Asm->emitInt16(Asm->OutStreamer->getContext().getDwarfVersion());
-  // Address size.
+  Asm->OutStreamer->AddComment("Address size");
   Asm->emitInt8(Asm->MAI->getCodePointerSize());
-  // Segment selector size.
+  Asm->OutStreamer->AddComment("Segment selector size");
   Asm->emitInt8(0);
 
-  MCSymbol *RnglistTableBaseSym = Holder.getRnglistsTableBaseSym();
+  MCSymbol *RnglistsTableBaseSym = Holder.getRnglistsTableBaseSym();
 
   // FIXME: Generate the offsets table and use DW_FORM_rnglistx with the
   // DW_AT_ranges attribute. Until then set the number of offsets to 0.
-  Asm->emitInt32(0);
-  Asm->OutStreamer->EmitLabel(RnglistTableBaseSym);
+  Asm->OutStreamer->AddComment("Offset entry count");
+  Asm->emitInt32(Holder.getRangeLists().size());
+  Asm->OutStreamer->EmitLabel(RnglistsTableBaseSym);
+  for (const RangeSpanList &List : Holder.getRangeLists())
+    Asm->EmitLabelDifference(List.getSym(), RnglistsTableBaseSym, 4);
   return TableEnd;
 }
 
+void emitDebugRangesImpl(DwarfDebug &DD, AsmPrinter *Asm,
+                         const DwarfFile &Holder, MCSymbol *TableEnd) {
+  for (const RangeSpanList &List : Holder.getRangeLists())
+    emitRangeList(DD, Asm, List);
+
+  if (TableEnd)
+    Asm->OutStreamer->EmitLabel(TableEnd);
+}
+
 /// Emit address ranges into the .debug_ranges section or into the DWARF v5
 /// .debug_rnglists section.
 void DwarfDebug::emitDebugRanges() {
   if (CUMap.empty())
     return;
 
-  auto NoRangesPresent = [this]() {
-    return llvm::all_of(
-        CUMap, [](const decltype(CUMap)::value_type &Pair) {
-          return Pair.second->getRangeLists().empty();
-        });
-  };
-
-  if (llvm::all_of(CUMap, [](const decltype(CUMap)::value_type &Pair) {
-        return Pair.second->getCUNode()->isDebugDirectivesOnly();
-      })) {
-    assert(NoRangesPresent() && "No debug ranges expected.");
-    return;
-  }
+  const auto &Holder = useSplitDwarf() ? SkeletonHolder : InfoHolder;
 
-  if (!useRangesSection()) {
-    assert(NoRangesPresent() && "No debug ranges expected.");
+  if (Holder.getRangeLists().empty())
     return;
-  }
 
-  if (NoRangesPresent())
-    return;
+  assert(useRangesSection());
+  assert(llvm::none_of(CUMap, [](const decltype(CUMap)::value_type &Pair) {
+    return Pair.second->getCUNode()->isDebugDirectivesOnly();
+  }));
 
   // Start the dwarf ranges section.
   MCSymbol *TableEnd = nullptr;
   if (getDwarfVersion() >= 5) {
     Asm->OutStreamer->SwitchSection(
         Asm->getObjFileLowering().getDwarfRnglistsSection());
-    TableEnd = emitRnglistsTableHeader(Asm, useSplitDwarf() ? SkeletonHolder
-                                                            : InfoHolder);
+    TableEnd = emitRnglistsTableHeader(Asm, Holder);
   } else
     Asm->OutStreamer->SwitchSection(
         Asm->getObjFileLowering().getDwarfRangesSection());
 
-  // Grab the specific ranges for the compile units in the module.
-  for (const auto &I : CUMap) {
-    DwarfCompileUnit *TheCU = I.second;
-    if (TheCU->getCUNode()->isDebugDirectivesOnly())
-      continue;
-
-    if (auto *Skel = TheCU->getSkeleton())
-      TheCU = Skel;
-
-    // Iterate over the misc ranges for the compile units in the module.
-    for (const RangeSpanList &List : TheCU->getRangeLists())
-      emitRangeList(Asm, TheCU, List);
-  }
-
-  if (TableEnd)
-    Asm->OutStreamer->EmitLabel(TableEnd);
+  emitDebugRangesImpl(*this, Asm, Holder, TableEnd);
 }
 
 void DwarfDebug::handleMacroNodes(DIMacroNodeArray Nodes, DwarfCompileUnit &U) {
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h
index fecf8056765..b98d9267455 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -490,9 +490,7 @@ class DwarfDebug : public DebugHandlerBase {
 
   /// Emit address ranges into a debug ranges section.
   void emitDebugRanges();
-
-  /// Emit range lists into a DWARF v5 debug rnglists section.
-  void emitDebugRnglists();
+  void emitDebugRangesDWO();
 
   /// Emit macros into a debug macinfo section.
   void emitDebugMacinfo();
diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.cpp b/lib/CodeGen/AsmPrinter/DwarfFile.cpp
index 0ab9ea87c23..1e5b7f18958 100644
--- a/lib/CodeGen/AsmPrinter/DwarfFile.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfFile.cpp
@@ -109,3 +109,11 @@ void DwarfFile::addScopeLabel(LexicalScope *LS, DbgLabel *Label) {
   SmallVectorImpl<DbgLabel *> &Labels = ScopeLabels[LS];
   Labels.push_back(Label);
 }
+
+std::pair<uint32_t, RangeSpanList *>
+DwarfFile::addRange(const MCSymbol *&CUBaseAddress,
+                    SmallVector<RangeSpan, 2> R) {
+  CURangeLists.push_back(RangeSpanList(Asm->createTempSymbol("debug_ranges"),
+                                       CUBaseAddress, std::move(R)));
+  return std::make_pair(CURangeLists.size() - 1, &CURangeLists.back());
+}
diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.h b/lib/CodeGen/AsmPrinter/DwarfFile.h
index c315f44a8d8..c764c6c5afb 100644
--- a/lib/CodeGen/AsmPrinter/DwarfFile.h
+++ b/lib/CodeGen/AsmPrinter/DwarfFile.h
@@ -32,6 +32,37 @@ class DwarfUnit;
 class LexicalScope;
 class MCSection;
 
+// Data structure to hold a range for range lists.
+class RangeSpan {
+public:
+  RangeSpan(MCSymbol *S, MCSymbol *E) : Start(S), End(E) {}
+  const MCSymbol *getStart() const { return Start; }
+  const MCSymbol *getEnd() const { return End; }
+  void setEnd(const MCSymbol *E) { End = E; }
+
+private:
+  const MCSymbol *Start, *End;
+};
+
+class RangeSpanList {
+private:
+  // Index for locating within the debug_range section this particular span.
+  MCSymbol *RangeSym;
+  const MCSymbol **CUBaseAddress;
+  // List of ranges.
+  SmallVector<RangeSpan, 2> Ranges;
+
+public:
+  RangeSpanList(MCSymbol *Sym, const MCSymbol *&CUBaseAddress,
+                SmallVector<RangeSpan, 2> Ranges)
+      : RangeSym(Sym), CUBaseAddress(&CUBaseAddress),
+        Ranges(std::move(Ranges)) {}
+  MCSymbol *getSym() const { return RangeSym; }
+  const MCSymbol *&getBaseAddress() const { return *CUBaseAddress; }
+  const SmallVectorImpl<RangeSpan> &getRanges() const { return Ranges; }
+  void addRange(RangeSpan Range) { Ranges.push_back(Range); }
+};
+
 class DwarfFile {
   // Target of Dwarf emission, used for sizing of abbreviations.
   AsmPrinter *Asm;
@@ -46,6 +77,10 @@ class DwarfFile {
 
   DwarfStringPool StrPool;
 
+  // List of range lists for a given compile unit, separate from the ranges for
+  // the CU itself.
+  SmallVector<RangeSpanList, 1> CURangeLists;
+
   /// DWARF v5: The symbol that designates the start of the contribution to
   /// the string offsets table. The contribution is shared by all units.
   MCSymbol *StringOffsetsStartSym = nullptr;
@@ -84,6 +119,14 @@ public:
     return CUs;
   }
 
+  std::pair<uint32_t, RangeSpanList *> addRange(const MCSymbol *&CUBaseAddress,
+                                                SmallVector<RangeSpan, 2> R);
+
+  /// getRangeLists - Get the vector of range lists.
+  const SmallVectorImpl<RangeSpanList> &getRangeLists() const {
+    return CURangeLists;
+  }
+
   /// Compute the size and offset of a DIE given an incoming Offset.
   unsigned computeSizeAndOffset(DIE &Die, unsigned Offset);
 
diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.h b/lib/CodeGen/AsmPrinter/DwarfUnit.h
index 6e2bd273cb6..1a36ea9ec55 100644
--- a/lib/CodeGen/AsmPrinter/DwarfUnit.h
+++ b/lib/CodeGen/AsmPrinter/DwarfUnit.h
@@ -35,33 +35,6 @@ class ConstantFP;
 class DbgVariable;
 class DwarfCompileUnit;
 
-// Data structure to hold a range for range lists.
-class RangeSpan {
-public:
-  RangeSpan(MCSymbol *S, MCSymbol *E) : Start(S), End(E) {}
-  const MCSymbol *getStart() const { return Start; }
-  const MCSymbol *getEnd() const { return End; }
-  void setEnd(const MCSymbol *E) { End = E; }
-
-private:
-  const MCSymbol *Start, *End;
-};
-
-class RangeSpanList {
-private:
-  // Index for locating within the debug_range section this particular span.
-  MCSymbol *RangeSym;
-  // List of ranges.
-  SmallVector<RangeSpan, 2> Ranges;
-
-public:
-  RangeSpanList(MCSymbol *Sym, SmallVector<RangeSpan, 2> Ranges)
-      : RangeSym(Sym), Ranges(std::move(Ranges)) {}
-  MCSymbol *getSym() const { return RangeSym; }
-  const SmallVectorImpl<RangeSpan> &getRanges() const { return Ranges; }
-  void addRange(RangeSpan Range) { Ranges.push_back(Range); }
-};
-
 //===----------------------------------------------------------------------===//
 /// This dwarf writer support class manages information associated with a
 /// source file.
diff --git a/test/DebugInfo/X86/fission-ranges.ll b/test/DebugInfo/X86/fission-ranges.ll
index e7402787dc3..5883d2b0c40 100644
--- a/test/DebugInfo/X86/fission-ranges.ll
+++ b/test/DebugInfo/X86/fission-ranges.ll
@@ -56,11 +56,14 @@
 ; V5RNGLISTS-NOT:  DW_TAG
 ; V5RNGLISTS:      DW_AT_rnglists_base [DW_FORM_sec_offset]  (0x0000000c)
 ; V5RNGLISTS:      .debug_rnglists contents:
-; V5RNGLISTS-NEXT: 0x00000000: range list header: length = 0x00000015, version = 0x0005,
-; V5RNGLISTS-SAME: addr_size = 0x08, seg_size = 0x00, offset_entry_count = 0x00000000
+; V5RNGLISTS-NEXT: 0x00000000: range list header: length = 0x00000019, version = 0x0005,
+; V5RNGLISTS-SAME: addr_size = 0x08, seg_size = 0x00, offset_entry_count = 0x00000001
+; V5RNGLISTS-NEXT: offsets: [
+; V5RNGLISTS-NEXT: => 0x00000010
+; V5RNGLISTS-NEXT: ]
 ; V5RNGLISTS-NEXT: ranges:
-; V5RNGLISTS-NEXT: 0x0000000c: [DW_RLE_offset_pair]:
-; V5RNGLISTS-NEXT: 0x0000000f: [DW_RLE_offset_pair]:
+; V5RNGLISTS-NEXT: 0x00000010: [DW_RLE_offset_pair]:
+; V5RNGLISTS-NEXT: 0x00000013: [DW_RLE_offset_pair]:
 ; V5RNGLISTS:      0x{{[0-9a-f]+}}: [DW_RLE_end_of_list]
 
 ; From the code:
diff --git a/test/DebugInfo/X86/range_reloc.ll b/test/DebugInfo/X86/range_reloc.ll
index d1479e232f6..bcb11cc504e 100644
--- a/test/DebugInfo/X86/range_reloc.ll
+++ b/test/DebugInfo/X86/range_reloc.ll
@@ -38,22 +38,22 @@
 ; COMMON-NEXT:   .quad   0
 
 ; DWARF5: {{^.Ldebug_ranges0}}
-; DWARF5-NEXT:                                      # DW_RLE_start_length
-; DWARF5-NEXT: .quad    .Lfunc_begin0               #   start
+; DWARF5-NEXT:                                      # DW_RLE_startx_length
+; DWARF5-NEXT: .byte 0                              #   start index
 ; DWARF5-NEXT: .uleb128 .Lfunc_end0-.Lfunc_begin0   #   length
-; DWARF5-NEXT:                                      # DW_RLE_base_address
-; DWARF5-NEXT: .quad    .Lfunc_begin1               #   base address
+; DWARF5-NEXT:                                      # DW_RLE_base_addressx
+; DWARF5-NEXT: .byte 1                              #   base address index
 ; DWARF5-NEXT:                                      # DW_RLE_offset_pair
 ; DWARF5-NEXT: .uleb128 .Lfunc_begin1-.Lfunc_begin1 #   starting offset
 ; DWARF5-NEXT: .uleb128 .Lfunc_end1-.Lfunc_begin1   #   ending offset
 ; DWARF5-NEXT:                                      # DW_RLE_offset_pair
 ; DWARF5-NEXT: .uleb128 .Lfunc_begin3-.Lfunc_begin1 #   starting offset
 ; DWARF5-NEXT: .uleb128 .Lfunc_end3-.Lfunc_begin1   #   ending offset
-; DWARF5-NEXT:                                      # DW_RLE_start_length
-; DWARF5-NEXT: .quad	   .Lfunc_begin4               #   start
+; DWARF5-NEXT:                                      # DW_RLE_startx_length
+; DWARF5-NEXT: .byte 3                              #   start index
 ; DWARF5-NEXT: .uleb128 .Lfunc_end4-.Lfunc_begin4   #   length
-; DWARF5-NEXT:                                      # DW_RLE_start_length
-; DWARF5-NEXT: .quad	   .Lfunc_begin5               #   start
+; DWARF5-NEXT:                                      # DW_RLE_startx_length
+; DWARF5-NEXT: .byte 4                              #   start index
 ; DWARF5-NEXT: .uleb128 .Lfunc_end5-.Lfunc_begin5   #   length
 ; DWARF5-NEXT:                                      # DW_RLE_end_of_list
 
diff --git a/test/DebugInfo/X86/rnglists_curanges.ll b/test/DebugInfo/X86/rnglists_curanges.ll
index aac0ef59eee..05206a7b5b2 100644
--- a/test/DebugInfo/X86/rnglists_curanges.ll
+++ b/test/DebugInfo/X86/rnglists_curanges.ll
@@ -16,8 +16,8 @@
 ; CHECK-NOT:  DW_TAG
 ; CHECK:      DW_AT_rnglists_base [DW_FORM_sec_offset]                   (0x0000000c)
 ; CHECK:      .debug_rnglists contents:
-; CHECK:      0x00000000: range list header: length = 0x0000001d, version = 0x0005,
-; CHECK-SAME: addr_size = 0x08, seg_size = 0x00, offset_entry_count = 0x00000000
+; CHECK:      0x00000000: range list header: length = 0x00000013, version = 0x0005,
+; CHECK-SAME: addr_size = 0x08, seg_size = 0x00, offset_entry_count = 0x00000001
 
 ; Function Attrs: noinline nounwind optnone uwtable
 define dso_local void @f1() section "text.foo" !dbg !7 {
-- 
GitLab


From 9d207e6821d69aaa104f854aacb3140837e36637 Mon Sep 17 00:00:00 2001
From: David Blaikie <dblaikie@gmail.com>
Date: Sat, 20 Oct 2018 08:12:36 +0000
Subject: [PATCH 0345/1116] DebugInfo: Implement debug_rnglists.dwo

Save space/relocations in .o files by keeping dwo ranges in the dwo
file rather than the .o file.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344837 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp |  6 +-
 lib/CodeGen/AsmPrinter/DwarfDebug.cpp       | 26 +++++++
 test/DebugInfo/X86/split-dwarf-v5-ranges.ll | 78 +++++++++++++++++++++
 3 files changed, 107 insertions(+), 3 deletions(-)
 create mode 100644 test/DebugInfo/X86/split-dwarf-v5-ranges.ll

diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index f638087b50f..81eb0c2aa9e 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -440,11 +440,11 @@ void DwarfCompileUnit::addScopeRangeList(DIE &ScopeDIE,
   // FIXME: For DWARF v5, do not generate the DW_AT_ranges attribute under
   // fission until we support the forms using the .debug_addr section
   // (DW_RLE_startx_endx etc.).
-  if (isDwoUnit())
+  if (DD->getDwarfVersion() >= 5)
+    addUInt(ScopeDIE, dwarf::DW_AT_ranges, dwarf::DW_FORM_rnglistx, Index);
+  else if (isDwoUnit())
     addSectionDelta(ScopeDIE, dwarf::DW_AT_ranges, List.getSym(),
                     RangeSectionSym);
-  else if (DD->getDwarfVersion() >= 5)
-    addUInt(ScopeDIE, dwarf::DW_AT_ranges, dwarf::DW_FORM_rnglistx, Index);
   else
     addSectionLabel(ScopeDIE, dwarf::DW_AT_ranges, List.getSym(),
                     RangeSectionSym);
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index e3805bf72b2..3d842e70138 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -953,6 +953,7 @@ void DwarfDebug::endModule() {
     emitDebugInfoDWO();
     emitDebugAbbrevDWO();
     emitDebugLineDWO();
+    emitDebugRangesDWO();
   }
 
   emitDebugAddr();
@@ -2297,6 +2298,31 @@ void DwarfDebug::emitDebugRanges() {
   emitDebugRangesImpl(*this, Asm, Holder, TableEnd);
 }
 
+void DwarfDebug::emitDebugRangesDWO() {
+  assert(useSplitDwarf());
+
+  if (CUMap.empty())
+    return;
+
+  const auto &Holder = InfoHolder;
+
+  if (Holder.getRangeLists().empty())
+    return;
+
+  assert(getDwarfVersion() >= 5);
+  assert(useRangesSection());
+  assert(llvm::none_of(CUMap, [](const decltype(CUMap)::value_type &Pair) {
+    return Pair.second->getCUNode()->isDebugDirectivesOnly();
+  }));
+
+  // Start the dwarf ranges section.
+  Asm->OutStreamer->SwitchSection(
+      Asm->getObjFileLowering().getDwarfRnglistsDWOSection());
+  MCSymbol *TableEnd = emitRnglistsTableHeader(Asm, Holder);
+
+  emitDebugRangesImpl(*this, Asm, Holder, TableEnd);
+}
+
 void DwarfDebug::handleMacroNodes(DIMacroNodeArray Nodes, DwarfCompileUnit &U) {
   for (auto *MN : Nodes) {
     if (auto *M = dyn_cast<DIMacro>(MN))
diff --git a/test/DebugInfo/X86/split-dwarf-v5-ranges.ll b/test/DebugInfo/X86/split-dwarf-v5-ranges.ll
new file mode 100644
index 00000000000..4404d5c3639
--- /dev/null
+++ b/test/DebugInfo/X86/split-dwarf-v5-ranges.ll
@@ -0,0 +1,78 @@
+; RUN: llc -split-dwarf-file=foo.dwo -mtriple=x86_64-unknown-linux-gnu -filetype=obj < %s \
+; RUN: 	    | llvm-dwarfdump -v -debug-info -debug-rnglists - | FileCheck %s
+
+; CHECK: .debug_info contents:
+; CHECK: .debug_info.dwo contents:
+; CHECK: DW_AT_ranges [DW_FORM_rnglistx] (indexed (0x0) rangelist = 0x00000004
+; CHECK:          [0x0000000000000000, 0x000000000000000b) "x"
+; CHECK:          [0x000000000000000d, 0x0000000000000012) "x")
+
+; CHECK: .debug_rnglists.dwo contents:
+; CHECK: 0x00000000: range list header: length = 0x00000015, version = 0x0005, addr_size = 0x08, seg_size = 0x00, offset_entry_count = 0x00000001
+; CHECK: offsets: [
+; CHECK: 0x00000004 => 0x00000010
+; CHECK: ]
+; CHECK: ranges:
+; CHECK: 0x00000010: [DW_RLE_base_addressx]:  0x0000000000000002
+; CHECK: 0x00000012: [DW_RLE_offset_pair  ]:  0x0000000000000000, 0x000000000000000b => [0x0000000000000001, 0x000000000000000c)
+; CHECK: 0x00000015: [DW_RLE_offset_pair  ]:  0x000000000000000d, 0x0000000000000012 => [0x000000000000000e, 0x0000000000000013)
+; CHECK: 0x00000018: [DW_RLE_end_of_list  ]
+
+; Function Attrs: noinline optnone uwtable
+define dso_local void @_Z2f3v() !dbg !7 {
+entry:
+  %x = alloca i32, align 4
+  call void @llvm.dbg.declare(metadata i32* %x, metadata !10, metadata !DIExpression()), !dbg !13
+  %call = call i32 @_Z2f2v(), !dbg !14
+  store i32 %call, i32* %x, align 4, !dbg !13
+  %0 = load i32, i32* %x, align 4, !dbg !13
+  %tobool = icmp ne i32 %0, 0, !dbg !13
+  br i1 %tobool, label %if.then, label %if.end, !dbg !15
+
+if.then:                                          ; preds = %entry
+  call void @_Z2f1v(), !dbg !16
+  br label %if.end, !dbg !18
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void, !dbg !19
+}
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
+
+declare dso_local i32 @_Z2f2v()
+
+declare dso_local void @_Z2f1v()
+
+; Function Attrs: noinline nounwind optnone uwtable
+define dso_local void @_Z2f4v() #3 section "x" !dbg !20 {
+entry:
+  ret void, !dbg !21
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 8.0.0 (trunk 344806) (llvm/trunk 344835)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: GNU)
+!1 = !DIFile(filename: "ranges.cpp", directory: "/usr/local/google/home/blaikie/dev/scratch", checksumkind: CSK_MD5, checksum: "a1e825b91fba21d696f05eb06d440aa3")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 5}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{!"clang version 8.0.0 (trunk 344806) (llvm/trunk 344835)"}
+!7 = distinct !DISubprogram(name: "f3", linkageName: "_Z2f3v", scope: !1, file: !1, line: 3, type: !8, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
+!8 = !DISubroutineType(types: !9)
+!9 = !{null}
+!10 = !DILocalVariable(name: "x", scope: !11, file: !1, line: 4, type: !12)
+!11 = distinct !DILexicalBlock(scope: !7, file: !1, line: 4, column: 11)
+!12 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!13 = !DILocation(line: 4, column: 11, scope: !11)
+!14 = !DILocation(line: 4, column: 15, scope: !11)
+!15 = !DILocation(line: 4, column: 11, scope: !7)
+!16 = !DILocation(line: 5, column: 5, scope: !17)
+!17 = distinct !DILexicalBlock(scope: !11, file: !1, line: 4, column: 21)
+!18 = !DILocation(line: 6, column: 3, scope: !17)
+!19 = !DILocation(line: 7, column: 1, scope: !7)
+!20 = distinct !DISubprogram(name: "f4", linkageName: "_Z2f4v", scope: !1, file: !1, line: 8, type: !8, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
+!21 = !DILocation(line: 8, column: 42, scope: !20)
-- 
GitLab


From b772afaaa9e655cd59115d63a0bbc7bd7c3e1980 Mon Sep 17 00:00:00 2001
From: David Blaikie <dblaikie@gmail.com>
Date: Sat, 20 Oct 2018 08:54:05 +0000
Subject: [PATCH 0346/1116] DebugInfo: Use DW_OP_addrx in DWARFv5

Reuse addresses in the address pool, even in non-split cases.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344838 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/AsmPrinter/DwarfUnit.cpp    | 15 ++++++++++----
 lib/DebugInfo/DWARF/DWARFExpression.cpp |  1 +
 test/CodeGen/X86/dwarf-headers.ll       |  4 ++--
 test/DebugInfo/X86/v5-loc.ll            | 27 +++++++++++++++++++++++++
 4 files changed, 41 insertions(+), 6 deletions(-)
 create mode 100644 test/DebugInfo/X86/v5-loc.ll

diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index 8a168f4845d..42aa0c933ef 100644
--- a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -315,14 +315,21 @@ unsigned DwarfTypeUnit::getOrCreateSourceID(const DIFile *File) {
 }
 
 void DwarfUnit::addOpAddress(DIELoc &Die, const MCSymbol *Sym) {
-  if (!DD->useSplitDwarf()) {
-    addUInt(Die, dwarf::DW_FORM_data1, dwarf::DW_OP_addr);
-    addLabel(Die, dwarf::DW_FORM_udata, Sym);
-  } else {
+  if (DD->getDwarfVersion() >= 5) {
+    addUInt(Die, dwarf::DW_FORM_data1, dwarf::DW_OP_addrx);
+    addUInt(Die, dwarf::DW_FORM_addrx, DD->getAddressPool().getIndex(Sym));
+    return;
+  }
+
+  if (DD->useSplitDwarf()) {
     addUInt(Die, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_addr_index);
     addUInt(Die, dwarf::DW_FORM_GNU_addr_index,
             DD->getAddressPool().getIndex(Sym));
+    return;
   }
+
+  addUInt(Die, dwarf::DW_FORM_data1, dwarf::DW_OP_addr);
+  addLabel(Die, dwarf::DW_FORM_udata, Sym);
 }
 
 void DwarfUnit::addLabelDelta(DIE &Die, dwarf::Attribute Attribute,
diff --git a/lib/DebugInfo/DWARF/DWARFExpression.cpp b/lib/DebugInfo/DWARF/DWARFExpression.cpp
index a9ea26c476c..2df4456053f 100644
--- a/lib/DebugInfo/DWARF/DWARFExpression.cpp
+++ b/lib/DebugInfo/DWARF/DWARFExpression.cpp
@@ -94,6 +94,7 @@ static DescVector getDescriptions() {
       Desc(Op::Dwarf3, Op::SizeLEB, Op::SizeBlock);
   Descriptions[DW_OP_stack_value] = Desc(Op::Dwarf3);
   Descriptions[DW_OP_GNU_push_tls_address] = Desc(Op::Dwarf3);
+  Descriptions[DW_OP_addrx] = Desc(Op::Dwarf4, Op::SizeLEB);
   Descriptions[DW_OP_GNU_addr_index] = Desc(Op::Dwarf4, Op::SizeLEB);
   Descriptions[DW_OP_GNU_const_index] = Desc(Op::Dwarf4, Op::SizeLEB);
   return Descriptions;
diff --git a/test/CodeGen/X86/dwarf-headers.ll b/test/CodeGen/X86/dwarf-headers.ll
index b2a3115cf5e..fa2080d1e1d 100644
--- a/test/CodeGen/X86/dwarf-headers.ll
+++ b/test/CodeGen/X86/dwarf-headers.ll
@@ -74,12 +74,12 @@
 ;
 ; O-5: .debug_info contents:
 ; O-5: 0x00000000: Compile Unit: {{.*}} version = 0x0005 unit_type = DW_UT_skeleton abbr_offset
-; O-5-SAME:        DWO_id = 0x4ed74084f749d96b
+; O-5-SAME:        DWO_id = 0xccd7e58ef8bf4aa6
 ; O-5: 0x00000014: DW_TAG_compile_unit
 ;
 ; DWO-5: .debug_info.dwo contents:
 ; DWO-5: 0x00000000: Compile Unit: {{.*}} version = 0x0005 unit_type = DW_UT_split_compile abbr_offset
-; DWO-5-SAME:        DWO_id = 0x4ed74084f749d96b
+; DWO-5-SAME:        DWO_id = 0xccd7e58ef8bf4aa6
 ; DWO-5: 0x00000014: DW_TAG_compile_unit
 ;
 ; FIXME: V5 wants type units in .debug_info.dwo not .debug_types.dwo.
diff --git a/test/DebugInfo/X86/v5-loc.ll b/test/DebugInfo/X86/v5-loc.ll
new file mode 100644
index 00000000000..08789a51813
--- /dev/null
+++ b/test/DebugInfo/X86/v5-loc.ll
@@ -0,0 +1,27 @@
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -filetype=obj < %s \
+; RUN: 	    | llvm-dwarfdump -v -debug-info - | FileCheck %s
+
+; CHECK: DW_AT_location [DW_FORM_exprloc] (DW_OP_addrx 0x0)
+
+%struct.foo = type { i32 }
+
+@f = dso_local global %struct.foo zeroinitializer, align 4, !dbg !0
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!10, !11, !12}
+!llvm.ident = !{!13}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "f", scope: !2, file: !3, line: 5, type: !6, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !3, producer: "clang version 8.0.0 (trunk 344833) (llvm/trunk 344837)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5, nameTableKind: GNU)
+!3 = !DIFile(filename: "loc.cpp", directory: "/usr/local/google/home/blaikie/dev/scratch", checksumkind: CSK_MD5, checksum: "e579a1a06fae14a4526216e905198a01")
+!4 = !{}
+!5 = !{!0}
+!6 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "foo", file: !3, line: 1, size: 32, flags: DIFlagTypePassByValue | DIFlagTrivial, elements: !7, identifier: "_ZTS3foo")
+!7 = !{!8}
+!8 = !DIDerivedType(tag: DW_TAG_member, name: "i", scope: !6, file: !3, line: 2, baseType: !9, size: 32)
+!9 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!10 = !{i32 2, !"Dwarf Version", i32 5}
+!11 = !{i32 2, !"Debug Info Version", i32 3}
+!12 = !{i32 1, !"wchar_size", i32 4}
+!13 = !{!"clang version 8.0.0 (trunk 344833) (llvm/trunk 344837)"}
-- 
GitLab


From cbf08ad2291bc9d9f12a75897fe0c43371a7e0fb Mon Sep 17 00:00:00 2001
From: David Blaikie <dblaikie@gmail.com>
Date: Sat, 20 Oct 2018 08:55:51 +0000
Subject: [PATCH 0347/1116] Add missed file from previous commit (r344838)

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344839 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/MC/MCObjectFileInfo.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp
index edfccfcb9ed..465b62ce8c9 100644
--- a/lib/MC/MCObjectFileInfo.cpp
+++ b/lib/MC/MCObjectFileInfo.cpp
@@ -254,6 +254,9 @@ void MCObjectFileInfo::initMachOMCObjectFileInfo(const Triple &T) {
   DwarfStrOffSection =
       Ctx->getMachOSection("__DWARF", "__debug_str_offs", MachO::S_ATTR_DEBUG,
                            SectionKind::getMetadata(), "section_str_off");
+  DwarfAddrSection =
+      Ctx->getMachOSection("__DWARF", "__debug_addr", MachO::S_ATTR_DEBUG,
+                           SectionKind::getMetadata(), "section_info");
   DwarfLocSection =
       Ctx->getMachOSection("__DWARF", "__debug_loc", MachO::S_ATTR_DEBUG,
                            SectionKind::getMetadata(), "section_debug_loc");
-- 
GitLab


From 101b0b8556c2416fc8e52f917460d12233a095a7 Mon Sep 17 00:00:00 2001
From: David Blaikie <dblaikie@gmail.com>
Date: Sat, 20 Oct 2018 09:16:49 +0000
Subject: [PATCH 0348/1116] DebugInfo: Use base address specifiers more
 aggressively

Using a base address specifier even for a single-element range is a size
win for object files (7 words versus 8 words - more significant savings
if the debug info is compressed (since it's 3 words of uncompressable
reloc + 4 compressable words compared to 6 uncompressable reloc + 2
compressable words) - does trade off executable size increase though.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344841 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/AsmPrinter/DwarfDebug.cpp |  2 +-
 test/DebugInfo/X86/range_reloc.ll     | 33 ++++++++++++++++++---------
 2 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 3d842e70138..2f14f5464fd 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -2163,7 +2163,7 @@ static void emitRangeList(DwarfDebug &DD, AsmPrinter *Asm,
     // or optnone where there may be holes in a single CU's section
     // contributions.
     auto *Base = CUBase;
-    if (!Base && P.second.size() > 1 &&
+    if (!Base && (P.second.size() > 1 || DwarfVersion < 5) &&
         (UseDwarfRangesBaseAddressSpecifier || DwarfVersion >= 5)) {
       BaseIsSet = true;
       // FIXME/use care: This may not be a useful base address if it's not
diff --git a/test/DebugInfo/X86/range_reloc.ll b/test/DebugInfo/X86/range_reloc.ll
index bcb11cc504e..5c40cf31944 100644
--- a/test/DebugInfo/X86/range_reloc.ll
+++ b/test/DebugInfo/X86/range_reloc.ll
@@ -16,8 +16,21 @@
 ; smaller (the growth of debug_ranges itself would be more significant).
 
 ; COMMON: {{^.Ldebug_ranges0}}
-; COMMON-NEXT:   .quad   .Lfunc_begin0
-; COMMON-NEXT:   .quad   .Lfunc_end0
+; NOBASE-NEXT:   .quad   .Lfunc_begin0
+; NOBASE-NEXT:   .quad   .Lfunc_end0
+; NOBASE-NEXT:   .quad   .Lfunc_begin1
+; NOBASE-NEXT:   .quad   .Lfunc_end1
+; NOBASE-NEXT:   .quad   .Lfunc_begin3
+; NOBASE-NEXT:   .quad   .Lfunc_end3
+; NOBASE-NEXT:   .quad   .Lfunc_begin4
+; NOBASE-NEXT:   .quad   .Lfunc_end4
+; NOBASE-NEXT:   .quad   .Lfunc_begin5
+; NOBASE-NEXT:   .quad   .Lfunc_end5
+
+; BASE-NEXT:   .quad   -1
+; BASE-NEXT:   .quad   .Lfunc_begin0
+; BASE-NEXT:   .quad   .Lfunc_begin0-.Lfunc_begin0
+; BASE-NEXT:   .quad   .Lfunc_end0-.Lfunc_begin0
 ; BASE-NEXT:   .quad   -1
 ; BASE-NEXT:   .quad   .Lfunc_begin1
 ; BASE-NEXT:   .quad   .Lfunc_begin1-.Lfunc_begin1
@@ -25,15 +38,13 @@
 ; BASE-NEXT:   .quad   .Lfunc_begin3-.Lfunc_begin1
 ; BASE-NEXT:   .quad   .Lfunc_end3-.Lfunc_begin1
 ; BASE-NEXT:   .quad   -1
-; BASE-NEXT:   .quad   0
-; NOBASE-NEXT:   .quad   .Lfunc_begin1
-; NOBASE-NEXT:   .quad   .Lfunc_end1
-; NOBASE-NEXT:   .quad   .Lfunc_begin3
-; NOBASE-NEXT:   .quad   .Lfunc_end3
-; COMMON-NEXT:   .quad   .Lfunc_begin4
-; COMMON-NEXT:   .quad   .Lfunc_end4
-; COMMON-NEXT:   .quad   .Lfunc_begin5
-; COMMON-NEXT:   .quad   .Lfunc_end5
+; BASE-NEXT:   .quad   .Lfunc_begin4
+; BASE-NEXT:   .quad   .Lfunc_begin4-.Lfunc_begin4
+; BASE-NEXT:   .quad   .Lfunc_end4-.Lfunc_begin4
+; BASE-NEXT:   .quad   -1
+; BASE-NEXT:   .quad   .Lfunc_begin5
+; BASE-NEXT:   .quad   .Lfunc_begin5-.Lfunc_begin5
+; BASE-NEXT:   .quad   .Lfunc_end5-.Lfunc_begin5
 ; COMMON-NEXT:   .quad   0
 ; COMMON-NEXT:   .quad   0
 
-- 
GitLab


From 6493c695297ecb8c90d8b5592ed9c01eb8239378 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sat, 20 Oct 2018 13:16:31 +0000
Subject: [PATCH 0349/1116] Replace setFeature macro with lambda to fix MSVC
 "shift count negative or too big" warnings. NFCI.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344843 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Support/Host.cpp | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp
index ebf03cc176f..91e98a33b37 100644
--- a/lib/Support/Host.cpp
+++ b/lib/Support/Host.cpp
@@ -884,15 +884,16 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf,
   unsigned Features3 = 0;
   unsigned EAX, EBX;
 
-#define setFeature(F)              \
-  do {                             \
-    if (F < 32)                    \
-      Features |= 1 << F;          \
-    else if (F < 64)               \
-      Features2 |= 1 << (F - 32);  \
-    else if (F < 96)               \
-      Features3 |= 1 << (F - 64);  \
-  } while (0)
+  auto setFeature = [&](unsigned F) {
+    if (F < 32)
+      Features |= 1 << F;
+    else if (F < 64)
+      Features2 |= 1 << (F - 32);
+    else if (F < 96)
+      Features3 |= 1 << (F - 64);
+    else
+      llvm_unreachable("Unexpected FeatureBit");
+  };
 
   if ((EDX >> 15) & 1)
     setFeature(X86::FEATURE_CMOV);
@@ -1004,7 +1005,6 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf,
   *FeaturesOut  = Features;
   *Features2Out = Features2;
   *Features3Out = Features3;
-#undef setFeature
 }
 
 StringRef sys::getHostCPUName() {
-- 
GitLab


From bc54072bd317a837f947fa15a6ac8d12a3a1fb7e Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sat, 20 Oct 2018 14:29:59 +0000
Subject: [PATCH 0350/1116] [CostModel][X86] Add integer vector reduction cost
 tests

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344846 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Analysis/CostModel/X86/reduce-add.ll  | 277 +++++++++++++++++++
 test/Analysis/CostModel/X86/reduce-and.ll  | 273 +++++++++++++++++++
 test/Analysis/CostModel/X86/reduce-mul.ll  | 293 +++++++++++++++++++++
 test/Analysis/CostModel/X86/reduce-or.ll   | 273 +++++++++++++++++++
 test/Analysis/CostModel/X86/reduce-smax.ll | 293 +++++++++++++++++++++
 test/Analysis/CostModel/X86/reduce-smin.ll | 293 +++++++++++++++++++++
 test/Analysis/CostModel/X86/reduce-umax.ll | 293 +++++++++++++++++++++
 test/Analysis/CostModel/X86/reduce-umin.ll | 293 +++++++++++++++++++++
 test/Analysis/CostModel/X86/reduce-xor.ll  | 273 +++++++++++++++++++
 9 files changed, 2561 insertions(+)
 create mode 100644 test/Analysis/CostModel/X86/reduce-add.ll
 create mode 100644 test/Analysis/CostModel/X86/reduce-and.ll
 create mode 100644 test/Analysis/CostModel/X86/reduce-mul.ll
 create mode 100644 test/Analysis/CostModel/X86/reduce-or.ll
 create mode 100644 test/Analysis/CostModel/X86/reduce-smax.ll
 create mode 100644 test/Analysis/CostModel/X86/reduce-smin.ll
 create mode 100644 test/Analysis/CostModel/X86/reduce-umax.ll
 create mode 100644 test/Analysis/CostModel/X86/reduce-umin.ll
 create mode 100644 test/Analysis/CostModel/X86/reduce-xor.ll

diff --git a/test/Analysis/CostModel/X86/reduce-add.ll b/test/Analysis/CostModel/X86/reduce-add.ll
new file mode 100644
index 00000000000..046aaf04e33
--- /dev/null
+++ b/test/Analysis/CostModel/X86/reduce-add.ll
@@ -0,0 +1,277 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+
+define i32 @reduce_i64(i32 %arg) {
+; SSE2-LABEL: 'reduce_i64'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.i64.v1i64(<1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i64'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.i64.v1i64(<1 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i64'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.i64.v1i64(<1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'reduce_i64'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.i64.v1i64(<1 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.i64.v1i64(<1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V1  = call i64 @llvm.experimental.vector.reduce.add.i64.v1i64(<1 x i64> undef)
+  %V2  = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> undef)
+  %V4  = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> undef)
+  %V8  = call i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64> undef)
+  %V16 = call i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i32(i32 %arg) {
+; SSE2-LABEL: 'reduce_i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i32'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'reduce_i32'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2  = call i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32> undef)
+  %V4  = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> undef)
+  %V8  = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> undef)
+  %V16 = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> undef)
+  %V32 = call i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i16(i32 %arg) {
+; SSE2-LABEL: 'reduce_i16'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V4  = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
+  %V8  = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
+  %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
+  %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
+  %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i8(i32 %arg) {
+; SSE2-LABEL: 'reduce_i8'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V8   = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
+  %V16  = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
+  %V32  = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
+  %V64  = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
+  %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
+  ret i32 undef
+}
+
+declare i64 @llvm.experimental.vector.reduce.add.i64.v1i64(<1 x i64>)
+declare i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64>)
+declare i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64>)
+declare i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64>)
+
+declare i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32>)
+declare i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32>)
+declare i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32>)
+declare i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32>)
+declare i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32>)
+
+declare i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16>)
+declare i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16>)
+declare i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16>)
+declare i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16>)
+declare i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16>)
+
+declare i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8>)
+declare i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8>)
+declare i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8>)
+declare i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8>)
+declare i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8>)
diff --git a/test/Analysis/CostModel/X86/reduce-and.ll b/test/Analysis/CostModel/X86/reduce-and.ll
new file mode 100644
index 00000000000..18abdd4a6dc
--- /dev/null
+++ b/test/Analysis/CostModel/X86/reduce-and.ll
@@ -0,0 +1,273 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+
+define i32 @reduce_i64(i32 %arg) {
+; SSE-LABEL: 'reduce_i64'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.i64.v1i64(<1 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'reduce_i64'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.i64.v1i64(<1 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.i64.v1i64(<1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V1  = call i64 @llvm.experimental.vector.reduce.and.i64.v1i64(<1 x i64> undef)
+  %V2  = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> undef)
+  %V4  = call i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64> undef)
+  %V8  = call i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64> undef)
+  %V16 = call i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i32(i32 %arg) {
+; SSE-LABEL: 'reduce_i32'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'reduce_i32'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2  = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> undef)
+  %V4  = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> undef)
+  %V8  = call i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32> undef)
+  %V16 = call i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32> undef)
+  %V32 = call i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i16(i32 %arg) {
+; SSE-LABEL: 'reduce_i16'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'reduce_i16'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V4  = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
+  %V8  = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
+  %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
+  %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
+  %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i8(i32 %arg) {
+; SSE-LABEL: 'reduce_i8'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'reduce_i8'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V8   = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
+  %V16  = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
+  %V32  = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
+  %V64  = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
+  %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i1(i32 %arg) {
+; SSE-LABEL: 'reduce_i1'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'reduce_i1'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i1'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i1'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i1'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V1   = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
+  %V2   = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
+  %V4   = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
+  %V8   = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
+  %V16  = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
+  %V32  = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
+  %V64  = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
+  %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
+  ret i32 undef
+}
+
+declare i64 @llvm.experimental.vector.reduce.and.i64.v1i64(<1 x i64>)
+declare i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64>)
+declare i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64>)
+declare i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64>)
+
+declare i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32>)
+declare i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32>)
+declare i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32>)
+declare i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32>)
+declare i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32>)
+
+declare i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16>)
+declare i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16>)
+declare i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16>)
+declare i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16>)
+declare i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16>)
+
+declare i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8>)
+declare i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8>)
+declare i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8>)
+declare i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8>)
+declare i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8>)
+
+declare i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1>)
+declare i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1>)
+declare i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1>)
+declare i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1>)
+declare i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1>)
+declare i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1>)
+declare i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1>)
+declare i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1>)
diff --git a/test/Analysis/CostModel/X86/reduce-mul.ll b/test/Analysis/CostModel/X86/reduce-mul.ll
new file mode 100644
index 00000000000..1e659a180fb
--- /dev/null
+++ b/test/Analysis/CostModel/X86/reduce-mul.ll
@@ -0,0 +1,293 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+
+define i32 @reduce_i64(i32 %arg) {
+; SSE-LABEL: 'reduce_i64'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 152 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i64'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i64'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i64'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V1  = call i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64> undef)
+  %V2  = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef)
+  %V4  = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef)
+  %V8  = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef)
+  %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i32(i32 %arg) {
+; SSE2-LABEL: 'reduce_i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 105 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i32'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 105 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i32'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i32'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i32'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2  = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef)
+  %V4  = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef)
+  %V8  = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef)
+  %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef)
+  %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i16(i32 %arg) {
+; SSE2-LABEL: 'reduce_i16'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V4  = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
+  %V8  = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
+  %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
+  %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
+  %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i8(i32 %arg) {
+; SSE-LABEL: 'reduce_i8'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 239 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 167 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 220 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 325 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 122 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 157 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 226 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 129 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 182 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 159 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 129 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 182 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V8   = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
+  %V16  = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
+  %V32  = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
+  %V64  = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
+  %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
+  ret i32 undef
+}
+
+declare i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64>)
+declare i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64>)
+declare i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64>)
+declare i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64>)
+
+declare i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32>)
+declare i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32>)
+declare i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32>)
+declare i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32>)
+declare i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32>)
+
+declare i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16>)
+declare i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16>)
+declare i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16>)
+declare i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16>)
+declare i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16>)
+
+declare i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8>)
+declare i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8>)
+declare i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8>)
+declare i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8>)
+declare i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8>)
diff --git a/test/Analysis/CostModel/X86/reduce-or.ll b/test/Analysis/CostModel/X86/reduce-or.ll
new file mode 100644
index 00000000000..47e473147a9
--- /dev/null
+++ b/test/Analysis/CostModel/X86/reduce-or.ll
@@ -0,0 +1,273 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+
+define i32 @reduce_i64(i32 %arg) {
+; SSE-LABEL: 'reduce_i64'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.i64.v1i64(<1 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'reduce_i64'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.i64.v1i64(<1 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.i64.v1i64(<1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V1  = call i64 @llvm.experimental.vector.reduce.or.i64.v1i64(<1 x i64> undef)
+  %V2  = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> undef)
+  %V4  = call i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64> undef)
+  %V8  = call i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64> undef)
+  %V16 = call i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i32(i32 %arg) {
+; SSE-LABEL: 'reduce_i32'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'reduce_i32'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2  = call i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32> undef)
+  %V4  = call i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> undef)
+  %V8  = call i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32> undef)
+  %V16 = call i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32> undef)
+  %V32 = call i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i16(i32 %arg) {
+; SSE-LABEL: 'reduce_i16'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'reduce_i16'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V4  = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
+  %V8  = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
+  %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
+  %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
+  %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i8(i32 %arg) {
+; SSE-LABEL: 'reduce_i8'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'reduce_i8'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V8   = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
+  %V16  = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
+  %V32  = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
+  %V64  = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
+  %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i1(i32 %arg) {
+; SSE-LABEL: 'reduce_i1'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'reduce_i1'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i1'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i1'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i1'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V1   = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
+  %V2   = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
+  %V4   = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
+  %V8   = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
+  %V16  = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
+  %V32  = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
+  %V64  = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
+  %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
+  ret i32 undef
+}
+
+declare i64 @llvm.experimental.vector.reduce.or.i64.v1i64(<1 x i64>)
+declare i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64>)
+declare i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64>)
+declare i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64>)
+
+declare i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32>)
+declare i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32>)
+declare i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32>)
+declare i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32>)
+declare i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32>)
+
+declare i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16>)
+declare i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16>)
+declare i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16>)
+declare i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16>)
+declare i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16>)
+
+declare i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8>)
+declare i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8>)
+declare i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8>)
+declare i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8>)
+declare i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8>)
+
+declare i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1>)
+declare i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1>)
+declare i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1>)
+declare i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1>)
+declare i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1>)
+declare i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1>)
+declare i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1>)
+declare i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1>)
diff --git a/test/Analysis/CostModel/X86/reduce-smax.ll b/test/Analysis/CostModel/X86/reduce-smax.ll
new file mode 100644
index 00000000000..23d8b2c5c18
--- /dev/null
+++ b/test/Analysis/CostModel/X86/reduce-smax.ll
@@ -0,0 +1,293 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+
+define i32 @reduce_i64(i32 %arg) {
+; SSE2-LABEL: 'reduce_i64'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 146 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i64'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 146 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i64'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V1  = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef)
+  %V2  = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef)
+  %V4  = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef)
+  %V8  = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef)
+  %V16 = call i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i32(i32 %arg) {
+; SSE2-LABEL: 'reduce_i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i32'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2  = call i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32> undef)
+  %V4  = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> undef)
+  %V8  = call i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32> undef)
+  %V16 = call i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32> undef)
+  %V32 = call i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i16(i32 %arg) {
+; SSE2-LABEL: 'reduce_i16'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 105 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 117 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V4  = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef)
+  %V8  = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef)
+  %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef)
+  %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef)
+  %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i8(i32 %arg) {
+; SSE2-LABEL: 'reduce_i8'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 159 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 211 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V8   = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef)
+  %V16  = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
+  %V32  = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
+  %V64  = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
+  %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
+  ret i32 undef
+}
+
+declare i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64>)
+declare i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64>)
+declare i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64>)
+declare i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64>)
+
+declare i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32>)
+declare i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32>)
+declare i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32>)
+declare i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32>)
+declare i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32>)
+
+declare i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16>)
+declare i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16>)
+declare i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16>)
+declare i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16>)
+declare i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16>)
+
+declare i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8>)
diff --git a/test/Analysis/CostModel/X86/reduce-smin.ll b/test/Analysis/CostModel/X86/reduce-smin.ll
new file mode 100644
index 00000000000..0b3c72b9daa
--- /dev/null
+++ b/test/Analysis/CostModel/X86/reduce-smin.ll
@@ -0,0 +1,293 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+
+define i32 @reduce_i64(i32 %arg) {
+; SSE2-LABEL: 'reduce_i64'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 146 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i64'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 146 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i64'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V1  = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef)
+  %V2  = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef)
+  %V4  = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef)
+  %V8  = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef)
+  %V16 = call i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i32(i32 %arg) {
+; SSE2-LABEL: 'reduce_i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i32'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2  = call i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32> undef)
+  %V4  = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> undef)
+  %V8  = call i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32> undef)
+  %V16 = call i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32> undef)
+  %V32 = call i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i16(i32 %arg) {
+; SSE2-LABEL: 'reduce_i16'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 105 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 117 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V4  = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef)
+  %V8  = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef)
+  %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef)
+  %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef)
+  %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i8(i32 %arg) {
+; SSE2-LABEL: 'reduce_i8'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 159 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 211 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V8   = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef)
+  %V16  = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
+  %V32  = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
+  %V64  = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
+  %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
+  ret i32 undef
+}
+
+declare i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64>)
+declare i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64>)
+declare i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64>)
+declare i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64>)
+
+declare i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32>)
+declare i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32>)
+declare i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32>)
+declare i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32>)
+declare i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32>)
+
+declare i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16>)
+declare i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16>)
+declare i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16>)
+declare i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16>)
+declare i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16>)
+
+declare i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8>)
diff --git a/test/Analysis/CostModel/X86/reduce-umax.ll b/test/Analysis/CostModel/X86/reduce-umax.ll
new file mode 100644
index 00000000000..ae542a07dd3
--- /dev/null
+++ b/test/Analysis/CostModel/X86/reduce-umax.ll
@@ -0,0 +1,293 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+
+define i32 @reduce_i64(i32 %arg) {
+; SSE2-LABEL: 'reduce_i64'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 146 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i64'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 146 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i64'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V1  = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef)
+  %V2  = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef)
+  %V4  = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef)
+  %V8  = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef)
+  %V16 = call i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i32(i32 %arg) {
+; SSE2-LABEL: 'reduce_i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i32'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2  = call i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32> undef)
+  %V4  = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> undef)
+  %V8  = call i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32> undef)
+  %V16 = call i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> undef)
+  %V32 = call i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i16(i32 %arg) {
+; SSE2-LABEL: 'reduce_i16'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 105 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 117 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V4  = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef)
+  %V8  = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef)
+  %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef)
+  %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef)
+  %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i8(i32 %arg) {
+; SSE2-LABEL: 'reduce_i8'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 159 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 211 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V8   = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef)
+  %V16  = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
+  %V32  = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
+  %V64  = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
+  %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
+  ret i32 undef
+}
+
+declare i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64>)
+declare i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64>)
+declare i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64>)
+declare i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64>)
+
+declare i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32>)
+declare i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32>)
+declare i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32>)
+declare i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32>)
+declare i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32>)
+
+declare i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16>)
+declare i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16>)
+declare i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16>)
+declare i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16>)
+declare i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16>)
+
+declare i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8>)
diff --git a/test/Analysis/CostModel/X86/reduce-umin.ll b/test/Analysis/CostModel/X86/reduce-umin.ll
new file mode 100644
index 00000000000..3462c6ec0c6
--- /dev/null
+++ b/test/Analysis/CostModel/X86/reduce-umin.ll
@@ -0,0 +1,293 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+
+define i32 @reduce_i64(i32 %arg) {
+; SSE2-LABEL: 'reduce_i64'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 146 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i64'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 146 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i64'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V1  = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef)
+  %V2  = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef)
+  %V4  = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef)
+  %V8  = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef)
+  %V16 = call i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i32(i32 %arg) {
+; SSE2-LABEL: 'reduce_i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i32'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2  = call i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32> undef)
+  %V4  = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> undef)
+  %V8  = call i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32> undef)
+  %V16 = call i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32> undef)
+  %V32 = call i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i16(i32 %arg) {
+; SSE2-LABEL: 'reduce_i16'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 105 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 117 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V4  = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef)
+  %V8  = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef)
+  %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef)
+  %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef)
+  %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i8(i32 %arg) {
+; SSE2-LABEL: 'reduce_i8'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 159 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 211 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V8   = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef)
+  %V16  = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
+  %V32  = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
+  %V64  = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
+  %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
+  ret i32 undef
+}
+
+declare i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64>)
+declare i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64>)
+declare i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64>)
+declare i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64>)
+
+declare i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32>)
+declare i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32>)
+declare i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32>)
+declare i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32>)
+declare i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32>)
+
+declare i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16>)
+declare i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16>)
+declare i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16>)
+declare i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16>)
+declare i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16>)
+
+declare i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8>)
diff --git a/test/Analysis/CostModel/X86/reduce-xor.ll b/test/Analysis/CostModel/X86/reduce-xor.ll
new file mode 100644
index 00000000000..e7eb295f63f
--- /dev/null
+++ b/test/Analysis/CostModel/X86/reduce-xor.ll
@@ -0,0 +1,273 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+
+define i32 @reduce_i64(i32 %arg) {
+; SSE-LABEL: 'reduce_i64'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.xor.i64.v1i64(<1 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'reduce_i64'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.xor.i64.v1i64(<1 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.xor.i64.v1i64(<1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V1  = call i64 @llvm.experimental.vector.reduce.xor.i64.v1i64(<1 x i64> undef)
+  %V2  = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> undef)
+  %V4  = call i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64> undef)
+  %V8  = call i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64> undef)
+  %V16 = call i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i32(i32 %arg) {
+; SSE-LABEL: 'reduce_i32'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'reduce_i32'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2  = call i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32> undef)
+  %V4  = call i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> undef)
+  %V8  = call i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32> undef)
+  %V16 = call i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32> undef)
+  %V32 = call i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i16(i32 %arg) {
+; SSE-LABEL: 'reduce_i16'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'reduce_i16'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V4  = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
+  %V8  = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
+  %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
+  %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
+  %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i8(i32 %arg) {
+; SSE-LABEL: 'reduce_i8'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'reduce_i8'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V8   = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
+  %V16  = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
+  %V32  = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
+  %V64  = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
+  %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i1(i32 %arg) {
+; SSE-LABEL: 'reduce_i1'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'reduce_i1'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i1'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i1'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i1'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V1   = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
+  %V2   = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
+  %V4   = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
+  %V8   = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
+  %V16  = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
+  %V32  = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
+  %V64  = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
+  %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
+  ret i32 undef
+}
+
+declare i64 @llvm.experimental.vector.reduce.xor.i64.v1i64(<1 x i64>)
+declare i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64>)
+declare i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64>)
+declare i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64>)
+
+declare i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32>)
+declare i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32>)
+declare i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32>)
+declare i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32>)
+declare i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32>)
+
+declare i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16>)
+declare i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16>)
+declare i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16>)
+declare i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16>)
+declare i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16>)
+
+declare i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8>)
+declare i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8>)
+declare i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8>)
+declare i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8>)
+declare i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8>)
+
+declare i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1>)
+declare i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1>)
+declare i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1>)
+declare i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1>)
+declare i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1>)
+declare i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1>)
+declare i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1>)
+declare i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1>)
-- 
GitLab


From 40da074bdf4812668f0b7703b001b15b9dd457f3 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Sat, 20 Oct 2018 14:53:07 +0000
Subject: [PATCH 0351/1116] [SLPVectorizer] regenerate test checks; NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344848 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../SLPVectorizer/AArch64/transpose.ll        | 61 ++++++++++---------
 1 file changed, 31 insertions(+), 30 deletions(-)

diff --git a/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/test/Transforms/SLPVectorizer/AArch64/transpose.ll
index 780665d94ed..9e9f40825ef 100644
--- a/test/Transforms/SLPVectorizer/AArch64/transpose.ll
+++ b/test/Transforms/SLPVectorizer/AArch64/transpose.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -slp-vectorizer -instcombine -S | FileCheck %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
@@ -5,10 +6,10 @@ target triple = "aarch64--linux-gnu"
 
 define <2 x i64> @build_vec_v2i64(<2 x i64> %v0, <2 x i64> %v1) {
 ; CHECK-LABEL: @build_vec_v2i64(
-; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <2 x i64> %v0, i32 0
-; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <2 x i64> %v0, i32 1
-; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <2 x i64> %v1, i32 0
-; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <2 x i64> %v1, i32 1
+; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <2 x i64> [[V0:%.*]], i32 0
+; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <2 x i64> [[V0]], i32 1
+; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <2 x i64> [[V1:%.*]], i32 0
+; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <2 x i64> [[V1]], i32 1
 ; CHECK-NEXT:    [[TMP0_0:%.*]] = add i64 [[V0_0]], [[V1_0]]
 ; CHECK-NEXT:    [[TMP0_1:%.*]] = add i64 [[V0_1]], [[V1_1]]
 ; CHECK-NEXT:    [[TMP1_0:%.*]] = sub i64 [[V0_0]], [[V1_0]]
@@ -36,12 +37,12 @@ define <2 x i64> @build_vec_v2i64(<2 x i64> %v0, <2 x i64> %v1) {
 
 define void @store_chain_v2i64(i64* %a, i64* %b, i64* %c) {
 ; CHECK-LABEL: @store_chain_v2i64(
-; CHECK-NEXT:    [[A_1:%.*]] = getelementptr i64, i64* %a, i64 1
-; CHECK-NEXT:    [[B_1:%.*]] = getelementptr i64, i64* %b, i64 1
-; CHECK-NEXT:    [[C_1:%.*]] = getelementptr i64, i64* %c, i64 1
-; CHECK-NEXT:    [[V0_0:%.*]] = load i64, i64* %a, align 8
+; CHECK-NEXT:    [[A_1:%.*]] = getelementptr i64, i64* [[A:%.*]], i64 1
+; CHECK-NEXT:    [[B_1:%.*]] = getelementptr i64, i64* [[B:%.*]], i64 1
+; CHECK-NEXT:    [[C_1:%.*]] = getelementptr i64, i64* [[C:%.*]], i64 1
+; CHECK-NEXT:    [[V0_0:%.*]] = load i64, i64* [[A]], align 8
 ; CHECK-NEXT:    [[V0_1:%.*]] = load i64, i64* [[A_1]], align 8
-; CHECK-NEXT:    [[V1_0:%.*]] = load i64, i64* %b, align 8
+; CHECK-NEXT:    [[V1_0:%.*]] = load i64, i64* [[B]], align 8
 ; CHECK-NEXT:    [[V1_1:%.*]] = load i64, i64* [[B_1]], align 8
 ; CHECK-NEXT:    [[TMP0_0:%.*]] = add i64 [[V0_0]], [[V1_0]]
 ; CHECK-NEXT:    [[TMP0_1:%.*]] = add i64 [[V0_1]], [[V1_1]]
@@ -49,7 +50,7 @@ define void @store_chain_v2i64(i64* %a, i64* %b, i64* %c) {
 ; CHECK-NEXT:    [[TMP1_1:%.*]] = sub i64 [[V0_1]], [[V1_1]]
 ; CHECK-NEXT:    [[TMP2_0:%.*]] = add i64 [[TMP0_0]], [[TMP0_1]]
 ; CHECK-NEXT:    [[TMP2_1:%.*]] = add i64 [[TMP1_0]], [[TMP1_1]]
-; CHECK-NEXT:    store i64 [[TMP2_0]], i64* %c, align 8
+; CHECK-NEXT:    store i64 [[TMP2_0]], i64* [[C]], align 8
 ; CHECK-NEXT:    store i64 [[TMP2_1]], i64* [[C_1]], align 8
 ; CHECK-NEXT:    ret void
 ;
@@ -76,16 +77,16 @@ define void @store_chain_v2i64(i64* %a, i64* %b, i64* %c) {
 
 define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> %v0, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> undef, <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> %v1, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> undef, <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[SHUFFLE]], [[SHUFFLE1]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[SHUFFLE]], [[SHUFFLE1]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> %v0, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> undef, <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> %v1, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[V1]], <4 x i32> undef, <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:    [[SHUFFLE3:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[SHUFFLE2]], [[SHUFFLE3]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = sub <4 x i32> [[SHUFFLE2]], [[SHUFFLE3]]
@@ -122,13 +123,13 @@ define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
 
 define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32_reuse_0(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> %v0, <2 x i32> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> %v1, <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub <2 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> %v0, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> %v1, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[V0]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[V1]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP8:%.*]] = add <2 x i32> [[TMP6]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = sub <2 x i32> [[TMP6]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> <i32 0, i32 3>
@@ -155,10 +156,10 @@ define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) {
 
 define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32_reuse_1(
-; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <2 x i32> %v0, i32 0
-; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <2 x i32> %v0, i32 1
-; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <2 x i32> %v1, i32 0
-; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <2 x i32> %v1, i32 1
+; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 0
+; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <2 x i32> [[V0]], i32 1
+; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 0
+; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i32 1
 ; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]]
 ; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]]
 ; CHECK-NEXT:    [[TMP0_2:%.*]] = xor i32 [[V0_0]], [[V1_0]]
@@ -197,13 +198,13 @@ define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
 
 define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32_3_binops(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> %v0, <2 x i32> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> %v1, <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul <2 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> %v0, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> %v1, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[V0]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[V1]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP8:%.*]] = add <2 x i32> [[TMP6]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = mul <2 x i32> [[TMP6]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> <i32 0, i32 3>
@@ -239,16 +240,16 @@ define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) {
 
 define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
 ; CHECK-LABEL: @reduction_v4i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> %v0, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> undef, <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> %v1, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> undef, <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP3:%.*]] = sub <4 x i32> [[SHUFFLE]], [[SHUFFLE1]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[SHUFFLE]], [[SHUFFLE1]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> %v0, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> undef, <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> %v1, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[V1]], <4 x i32> undef, <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:    [[SHUFFLE3:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP8:%.*]] = sub <4 x i32> [[SHUFFLE2]], [[SHUFFLE3]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[SHUFFLE2]], [[SHUFFLE3]]
-- 
GitLab


From db6c8767cd779688c964b4e23a7669a2607e5a63 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sat, 20 Oct 2018 15:17:27 +0000
Subject: [PATCH 0352/1116] [SLPVectorizer][X86] Add mul/and/or/xor unrolled
 reduction tests

We miss arithmetic reduction for everything but Add/FAdd (I assume because that's the only cases which x86 has horizontal ops for.....)

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344849 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../SLPVectorizer/X86/reduction_unrolled.ll   | 358 +++++++++++++++++-
 1 file changed, 351 insertions(+), 7 deletions(-)

diff --git a/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll b/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll
index 8ee37df4e90..b5a96025764 100644
--- a/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll
+++ b/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll
@@ -3,19 +3,19 @@
 ; RUN: opt -slp-vectorizer -slp-vectorize-hor -S -mtriple=x86_64-unknown-linux-gnu -mcpu=core2 -debug < %s 2>&1 | FileCheck --check-prefix=SSE2 %s
 ; REQUIRES: asserts
 
-; int test(unsigned int *p) {
-;   int sum = 0;
+; int test_add(unsigned int *p) {
+;   int result = 0;
 ;   for (int i = 0; i < 8; i++)
-;     sum += p[i];
-;   return sum;
+;     result += p[i];
+;   return result;
 ; }
 
 ; Vector cost is 5, Scalar cost is 7
 ; CHECK: Adding cost -2 for reduction that starts with   %7 = load i32, i32* %arrayidx.7, align 4 (It is a splitting reduction)
 ; Vector cost is 11, Scalar cost is 7
 ; SSE2:  Adding cost 4 for reduction that starts with   %7 = load i32, i32* %arrayidx.7, align 4 (It is a splitting reduction)
-define i32 @test(i32* nocapture readonly %p) {
-; CHECK-LABEL: @test(
+define i32 @test_add(i32* nocapture readonly %p) {
+; CHECK-LABEL: @test_add(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
 ; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
@@ -42,7 +42,7 @@ define i32 @test(i32* nocapture readonly %p) {
 ; CHECK-NEXT:    [[MUL_714:%.*]] = add i32 undef, [[MUL_613]]
 ; CHECK-NEXT:    ret i32 [[TMP2]]
 ;
-; SSE2-LABEL: @test(
+; SSE2-LABEL: @test_add(
 ; SSE2-NEXT:  entry:
 ; SSE2-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
 ; SSE2-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
@@ -94,3 +94,347 @@ entry:
   %mul.714 = add i32 %7, %mul.613
   ret i32 %mul.714
 }
+
+; int test_mul(unsigned int *p) {
+;   int result = 0;
+;   for (int i = 0; i < 8; i++)
+;     result *= p[i];
+;   return result;
+; }
+
+define i32 @test_mul(i32* nocapture readonly %p) {
+; CHECK-LABEL: @test_mul(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[P:%.*]], align 4
+; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; CHECK-NEXT:    [[MUL_18:%.*]] = mul i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; CHECK-NEXT:    [[MUL_29:%.*]] = mul i32 [[TMP2]], [[MUL_18]]
+; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; CHECK-NEXT:    [[MUL_310:%.*]] = mul i32 [[TMP3]], [[MUL_29]]
+; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4
+; CHECK-NEXT:    [[MUL_411:%.*]] = mul i32 [[TMP4]], [[MUL_310]]
+; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_5]], align 4
+; CHECK-NEXT:    [[MUL_512:%.*]] = mul i32 [[TMP5]], [[MUL_411]]
+; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_6]], align 4
+; CHECK-NEXT:    [[MUL_613:%.*]] = mul i32 [[TMP6]], [[MUL_512]]
+; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_7]], align 4
+; CHECK-NEXT:    [[MUL_714:%.*]] = mul i32 [[TMP7]], [[MUL_613]]
+; CHECK-NEXT:    ret i32 [[MUL_714]]
+;
+; SSE2-LABEL: @test_mul(
+; SSE2-NEXT:  entry:
+; SSE2-NEXT:    [[TMP0:%.*]] = load i32, i32* [[P:%.*]], align 4
+; SSE2-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1
+; SSE2-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; SSE2-NEXT:    [[MUL_18:%.*]] = mul i32 [[TMP1]], [[TMP0]]
+; SSE2-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
+; SSE2-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; SSE2-NEXT:    [[MUL_29:%.*]] = mul i32 [[TMP2]], [[MUL_18]]
+; SSE2-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
+; SSE2-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; SSE2-NEXT:    [[MUL_310:%.*]] = mul i32 [[TMP3]], [[MUL_29]]
+; SSE2-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
+; SSE2-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4
+; SSE2-NEXT:    [[MUL_411:%.*]] = mul i32 [[TMP4]], [[MUL_310]]
+; SSE2-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
+; SSE2-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_5]], align 4
+; SSE2-NEXT:    [[MUL_512:%.*]] = mul i32 [[TMP5]], [[MUL_411]]
+; SSE2-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
+; SSE2-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_6]], align 4
+; SSE2-NEXT:    [[MUL_613:%.*]] = mul i32 [[TMP6]], [[MUL_512]]
+; SSE2-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
+; SSE2-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_7]], align 4
+; SSE2-NEXT:    [[MUL_714:%.*]] = mul i32 [[TMP7]], [[MUL_613]]
+; SSE2-NEXT:    ret i32 [[MUL_714]]
+;
+entry:
+  %0 = load i32, i32* %p, align 4
+  %arrayidx.1 = getelementptr inbounds i32, i32* %p, i64 1
+  %1 = load i32, i32* %arrayidx.1, align 4
+  %mul.18 = mul i32 %1, %0
+  %arrayidx.2 = getelementptr inbounds i32, i32* %p, i64 2
+  %2 = load i32, i32* %arrayidx.2, align 4
+  %mul.29 = mul i32 %2, %mul.18
+  %arrayidx.3 = getelementptr inbounds i32, i32* %p, i64 3
+  %3 = load i32, i32* %arrayidx.3, align 4
+  %mul.310 = mul i32 %3, %mul.29
+  %arrayidx.4 = getelementptr inbounds i32, i32* %p, i64 4
+  %4 = load i32, i32* %arrayidx.4, align 4
+  %mul.411 = mul i32 %4, %mul.310
+  %arrayidx.5 = getelementptr inbounds i32, i32* %p, i64 5
+  %5 = load i32, i32* %arrayidx.5, align 4
+  %mul.512 = mul i32 %5, %mul.411
+  %arrayidx.6 = getelementptr inbounds i32, i32* %p, i64 6
+  %6 = load i32, i32* %arrayidx.6, align 4
+  %mul.613 = mul i32 %6, %mul.512
+  %arrayidx.7 = getelementptr inbounds i32, i32* %p, i64 7
+  %7 = load i32, i32* %arrayidx.7, align 4
+  %mul.714 = mul i32 %7, %mul.613
+  ret i32 %mul.714
+}
+
+; int test_and(unsigned int *p) {
+;   int result = 0;
+;   for (int i = 0; i < 8; i++)
+;     result &= p[i];
+;   return result;
+; }
+
+define i32 @test_and(i32* nocapture readonly %p) {
+; CHECK-LABEL: @test_and(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[P:%.*]], align 4
+; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; CHECK-NEXT:    [[MUL_18:%.*]] = and i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; CHECK-NEXT:    [[MUL_29:%.*]] = and i32 [[TMP2]], [[MUL_18]]
+; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; CHECK-NEXT:    [[MUL_310:%.*]] = and i32 [[TMP3]], [[MUL_29]]
+; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4
+; CHECK-NEXT:    [[MUL_411:%.*]] = and i32 [[TMP4]], [[MUL_310]]
+; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_5]], align 4
+; CHECK-NEXT:    [[MUL_512:%.*]] = and i32 [[TMP5]], [[MUL_411]]
+; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_6]], align 4
+; CHECK-NEXT:    [[MUL_613:%.*]] = and i32 [[TMP6]], [[MUL_512]]
+; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_7]], align 4
+; CHECK-NEXT:    [[MUL_714:%.*]] = and i32 [[TMP7]], [[MUL_613]]
+; CHECK-NEXT:    ret i32 [[MUL_714]]
+;
+; SSE2-LABEL: @test_and(
+; SSE2-NEXT:  entry:
+; SSE2-NEXT:    [[TMP0:%.*]] = load i32, i32* [[P:%.*]], align 4
+; SSE2-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1
+; SSE2-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; SSE2-NEXT:    [[MUL_18:%.*]] = and i32 [[TMP1]], [[TMP0]]
+; SSE2-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
+; SSE2-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; SSE2-NEXT:    [[MUL_29:%.*]] = and i32 [[TMP2]], [[MUL_18]]
+; SSE2-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
+; SSE2-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; SSE2-NEXT:    [[MUL_310:%.*]] = and i32 [[TMP3]], [[MUL_29]]
+; SSE2-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
+; SSE2-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4
+; SSE2-NEXT:    [[MUL_411:%.*]] = and i32 [[TMP4]], [[MUL_310]]
+; SSE2-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
+; SSE2-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_5]], align 4
+; SSE2-NEXT:    [[MUL_512:%.*]] = and i32 [[TMP5]], [[MUL_411]]
+; SSE2-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
+; SSE2-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_6]], align 4
+; SSE2-NEXT:    [[MUL_613:%.*]] = and i32 [[TMP6]], [[MUL_512]]
+; SSE2-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
+; SSE2-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_7]], align 4
+; SSE2-NEXT:    [[MUL_714:%.*]] = and i32 [[TMP7]], [[MUL_613]]
+; SSE2-NEXT:    ret i32 [[MUL_714]]
+;
+entry:
+  %0 = load i32, i32* %p, align 4
+  %arrayidx.1 = getelementptr inbounds i32, i32* %p, i64 1
+  %1 = load i32, i32* %arrayidx.1, align 4
+  %mul.18 = and i32 %1, %0
+  %arrayidx.2 = getelementptr inbounds i32, i32* %p, i64 2
+  %2 = load i32, i32* %arrayidx.2, align 4
+  %mul.29 = and i32 %2, %mul.18
+  %arrayidx.3 = getelementptr inbounds i32, i32* %p, i64 3
+  %3 = load i32, i32* %arrayidx.3, align 4
+  %mul.310 = and i32 %3, %mul.29
+  %arrayidx.4 = getelementptr inbounds i32, i32* %p, i64 4
+  %4 = load i32, i32* %arrayidx.4, align 4
+  %mul.411 = and i32 %4, %mul.310
+  %arrayidx.5 = getelementptr inbounds i32, i32* %p, i64 5
+  %5 = load i32, i32* %arrayidx.5, align 4
+  %mul.512 = and i32 %5, %mul.411
+  %arrayidx.6 = getelementptr inbounds i32, i32* %p, i64 6
+  %6 = load i32, i32* %arrayidx.6, align 4
+  %mul.613 = and i32 %6, %mul.512
+  %arrayidx.7 = getelementptr inbounds i32, i32* %p, i64 7
+  %7 = load i32, i32* %arrayidx.7, align 4
+  %mul.714 = and i32 %7, %mul.613
+  ret i32 %mul.714
+}
+
+; int test_or(unsigned int *p) {
+;   int result = 0;
+;   for (int i = 0; i < 8; i++)
+;     result |= p[i];
+;   return result;
+; }
+
+define i32 @test_or(i32* nocapture readonly %p) {
+; CHECK-LABEL: @test_or(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[P:%.*]], align 4
+; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; CHECK-NEXT:    [[MUL_18:%.*]] = or i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; CHECK-NEXT:    [[MUL_29:%.*]] = or i32 [[TMP2]], [[MUL_18]]
+; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; CHECK-NEXT:    [[MUL_310:%.*]] = or i32 [[TMP3]], [[MUL_29]]
+; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4
+; CHECK-NEXT:    [[MUL_411:%.*]] = or i32 [[TMP4]], [[MUL_310]]
+; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_5]], align 4
+; CHECK-NEXT:    [[MUL_512:%.*]] = or i32 [[TMP5]], [[MUL_411]]
+; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_6]], align 4
+; CHECK-NEXT:    [[MUL_613:%.*]] = or i32 [[TMP6]], [[MUL_512]]
+; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_7]], align 4
+; CHECK-NEXT:    [[MUL_714:%.*]] = or i32 [[TMP7]], [[MUL_613]]
+; CHECK-NEXT:    ret i32 [[MUL_714]]
+;
+; SSE2-LABEL: @test_or(
+; SSE2-NEXT:  entry:
+; SSE2-NEXT:    [[TMP0:%.*]] = load i32, i32* [[P:%.*]], align 4
+; SSE2-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1
+; SSE2-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; SSE2-NEXT:    [[MUL_18:%.*]] = or i32 [[TMP1]], [[TMP0]]
+; SSE2-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
+; SSE2-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; SSE2-NEXT:    [[MUL_29:%.*]] = or i32 [[TMP2]], [[MUL_18]]
+; SSE2-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
+; SSE2-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; SSE2-NEXT:    [[MUL_310:%.*]] = or i32 [[TMP3]], [[MUL_29]]
+; SSE2-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
+; SSE2-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4
+; SSE2-NEXT:    [[MUL_411:%.*]] = or i32 [[TMP4]], [[MUL_310]]
+; SSE2-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
+; SSE2-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_5]], align 4
+; SSE2-NEXT:    [[MUL_512:%.*]] = or i32 [[TMP5]], [[MUL_411]]
+; SSE2-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
+; SSE2-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_6]], align 4
+; SSE2-NEXT:    [[MUL_613:%.*]] = or i32 [[TMP6]], [[MUL_512]]
+; SSE2-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
+; SSE2-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_7]], align 4
+; SSE2-NEXT:    [[MUL_714:%.*]] = or i32 [[TMP7]], [[MUL_613]]
+; SSE2-NEXT:    ret i32 [[MUL_714]]
+;
+entry:
+  %0 = load i32, i32* %p, align 4
+  %arrayidx.1 = getelementptr inbounds i32, i32* %p, i64 1
+  %1 = load i32, i32* %arrayidx.1, align 4
+  %mul.18 = or i32 %1, %0
+  %arrayidx.2 = getelementptr inbounds i32, i32* %p, i64 2
+  %2 = load i32, i32* %arrayidx.2, align 4
+  %mul.29 = or i32 %2, %mul.18
+  %arrayidx.3 = getelementptr inbounds i32, i32* %p, i64 3
+  %3 = load i32, i32* %arrayidx.3, align 4
+  %mul.310 = or i32 %3, %mul.29
+  %arrayidx.4 = getelementptr inbounds i32, i32* %p, i64 4
+  %4 = load i32, i32* %arrayidx.4, align 4
+  %mul.411 = or i32 %4, %mul.310
+  %arrayidx.5 = getelementptr inbounds i32, i32* %p, i64 5
+  %5 = load i32, i32* %arrayidx.5, align 4
+  %mul.512 = or i32 %5, %mul.411
+  %arrayidx.6 = getelementptr inbounds i32, i32* %p, i64 6
+  %6 = load i32, i32* %arrayidx.6, align 4
+  %mul.613 = or i32 %6, %mul.512
+  %arrayidx.7 = getelementptr inbounds i32, i32* %p, i64 7
+  %7 = load i32, i32* %arrayidx.7, align 4
+  %mul.714 = or i32 %7, %mul.613
+  ret i32 %mul.714
+}
+
+; int test_xor(unsigned int *p) {
+;   int result = 0;
+;   for (int i = 0; i < 8; i++)
+;     result ^= p[i];
+;   return result;
+; }
+
+define i32 @test_xor(i32* nocapture readonly %p) {
+; CHECK-LABEL: @test_xor(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[P:%.*]], align 4
+; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; CHECK-NEXT:    [[MUL_18:%.*]] = xor i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; CHECK-NEXT:    [[MUL_29:%.*]] = xor i32 [[TMP2]], [[MUL_18]]
+; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; CHECK-NEXT:    [[MUL_310:%.*]] = xor i32 [[TMP3]], [[MUL_29]]
+; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4
+; CHECK-NEXT:    [[MUL_411:%.*]] = xor i32 [[TMP4]], [[MUL_310]]
+; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_5]], align 4
+; CHECK-NEXT:    [[MUL_512:%.*]] = xor i32 [[TMP5]], [[MUL_411]]
+; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_6]], align 4
+; CHECK-NEXT:    [[MUL_613:%.*]] = xor i32 [[TMP6]], [[MUL_512]]
+; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_7]], align 4
+; CHECK-NEXT:    [[MUL_714:%.*]] = xor i32 [[TMP7]], [[MUL_613]]
+; CHECK-NEXT:    ret i32 [[MUL_714]]
+;
+; SSE2-LABEL: @test_xor(
+; SSE2-NEXT:  entry:
+; SSE2-NEXT:    [[TMP0:%.*]] = load i32, i32* [[P:%.*]], align 4
+; SSE2-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1
+; SSE2-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; SSE2-NEXT:    [[MUL_18:%.*]] = xor i32 [[TMP1]], [[TMP0]]
+; SSE2-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
+; SSE2-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; SSE2-NEXT:    [[MUL_29:%.*]] = xor i32 [[TMP2]], [[MUL_18]]
+; SSE2-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
+; SSE2-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; SSE2-NEXT:    [[MUL_310:%.*]] = xor i32 [[TMP3]], [[MUL_29]]
+; SSE2-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
+; SSE2-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4
+; SSE2-NEXT:    [[MUL_411:%.*]] = xor i32 [[TMP4]], [[MUL_310]]
+; SSE2-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
+; SSE2-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_5]], align 4
+; SSE2-NEXT:    [[MUL_512:%.*]] = xor i32 [[TMP5]], [[MUL_411]]
+; SSE2-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
+; SSE2-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_6]], align 4
+; SSE2-NEXT:    [[MUL_613:%.*]] = xor i32 [[TMP6]], [[MUL_512]]
+; SSE2-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
+; SSE2-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_7]], align 4
+; SSE2-NEXT:    [[MUL_714:%.*]] = xor i32 [[TMP7]], [[MUL_613]]
+; SSE2-NEXT:    ret i32 [[MUL_714]]
+;
+entry:
+  %0 = load i32, i32* %p, align 4
+  %arrayidx.1 = getelementptr inbounds i32, i32* %p, i64 1
+  %1 = load i32, i32* %arrayidx.1, align 4
+  %mul.18 = xor i32 %1, %0
+  %arrayidx.2 = getelementptr inbounds i32, i32* %p, i64 2
+  %2 = load i32, i32* %arrayidx.2, align 4
+  %mul.29 = xor i32 %2, %mul.18
+  %arrayidx.3 = getelementptr inbounds i32, i32* %p, i64 3
+  %3 = load i32, i32* %arrayidx.3, align 4
+  %mul.310 = xor i32 %3, %mul.29
+  %arrayidx.4 = getelementptr inbounds i32, i32* %p, i64 4
+  %4 = load i32, i32* %arrayidx.4, align 4
+  %mul.411 = xor i32 %4, %mul.310
+  %arrayidx.5 = getelementptr inbounds i32, i32* %p, i64 5
+  %5 = load i32, i32* %arrayidx.5, align 4
+  %mul.512 = xor i32 %5, %mul.411
+  %arrayidx.6 = getelementptr inbounds i32, i32* %p, i64 6
+  %6 = load i32, i32* %arrayidx.6, align 4
+  %mul.613 = xor i32 %6, %mul.512
+  %arrayidx.7 = getelementptr inbounds i32, i32* %p, i64 7
+  %7 = load i32, i32* %arrayidx.7, align 4
+  %mul.714 = xor i32 %7, %mul.613
+  ret i32 %mul.714
+}
-- 
GitLab


From 04e0dbca23f3d089ac2d41991f309970ac693da6 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Sat, 20 Oct 2018 16:25:55 +0000
Subject: [PATCH 0353/1116] [InstCombine] add explanatory comment for strange
 vector logic; NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344852 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstCombine/InstCombineVectorOps.cpp         | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index bdd8fe3eead..bcf2a25aefc 100644
--- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -900,6 +900,22 @@ Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) {
 
       // If this insertelement isn't used by some other insertelement, turn it
       // (and any insertelements it points to), into one big shuffle.
+
+      // TODO: Looking at the user(s) to determine if this insert is a
+      // fold-to-shuffle opportunity does not match the usual instcombine
+      // constraints. We should decide if the transform is worthy based only
+      // on this instruction and its operands, but that may not work currently.
+      //
+      // Here, we are trying to avoid creating shuffles before reaching
+      // the end of a chain of extract-insert pairs. This is complicated because
+      // we do not generally form arbitrary shuffle masks in instcombine
+      // (because those may codegen poorly), but collectShuffleElements() does
+      // exactly that.
+      //
+      // The rules for determining what is an acceptable target-independent
+      // shuffle mask are fuzzy because they evolve based on the backend's
+      // capabilities and real-world impact.
+
       if (!IE.hasOneUse() || !isa<InsertElementInst>(IE.user_back())) {
         SmallVector<Constant*, 16> Mask;
         ShuffleOps LR = collectShuffleElements(&IE, Mask, nullptr, *this);
-- 
GitLab


From f521828b267a4013d0570da3937b9f9b3f369904 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Sat, 20 Oct 2018 16:58:27 +0000
Subject: [PATCH 0354/1116] [InstCombine] make code more flexible with lambda;
 NFC

I couldn't tell from svn history when these checks were added,
but it pre-dates the split of instcombine into its own directory
at rL92459.

The motivation for changing the check is partly shown by the
code in PR34724:
https://bugs.llvm.org/show_bug.cgi?id=34724

There are also existing regression tests for SLPVectorizer with
sequences of extract+insert that are likely assumed to become
shuffles by the vectorizer cost models.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344854 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstCombine/InstCombineVectorOps.cpp           | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index bcf2a25aefc..0c8d64bff43 100644
--- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -898,9 +898,6 @@ Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) {
       if (EI->getOperand(0) == VecOp && ExtractedIdx == InsertedIdx)
         return replaceInstUsesWith(IE, VecOp);
 
-      // If this insertelement isn't used by some other insertelement, turn it
-      // (and any insertelements it points to), into one big shuffle.
-
       // TODO: Looking at the user(s) to determine if this insert is a
       // fold-to-shuffle opportunity does not match the usual instcombine
       // constraints. We should decide if the transform is worthy based only
@@ -915,8 +912,17 @@ Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) {
       // The rules for determining what is an acceptable target-independent
       // shuffle mask are fuzzy because they evolve based on the backend's
       // capabilities and real-world impact.
+      auto isShuffleRootCandidate = [](InsertElementInst &Insert) {
+        if (!Insert.hasOneUse())
+          return true;
+        auto *InsertUser = dyn_cast<InsertElementInst>(Insert.user_back());
+        if (!InsertUser)
+          return true;
+        return false;
+      };
 
-      if (!IE.hasOneUse() || !isa<InsertElementInst>(IE.user_back())) {
+      // Try to form a shuffle from a chain of extract-insert ops.
+      if (isShuffleRootCandidate(IE)) {
         SmallVector<Constant*, 16> Mask;
         ShuffleOps LR = collectShuffleElements(&IE, Mask, nullptr, *this);
 
-- 
GitLab


From 7f64213612b8af7f23787655170422cfc74de6b1 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Sat, 20 Oct 2018 17:15:57 +0000
Subject: [PATCH 0355/1116] [InstCombine] use 'match' to simplify code; NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344855 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstCombine/InstCombineVectorOps.cpp      | 115 +++++++++---------
 1 file changed, 56 insertions(+), 59 deletions(-)

diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index 0c8d64bff43..75f77779ab7 100644
--- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -876,65 +876,62 @@ Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) {
   if (isa<UndefValue>(ScalarOp) || isa<UndefValue>(IdxOp))
     replaceInstUsesWith(IE, VecOp);
 
-  // If the inserted element was extracted from some other vector, and if the
-  // indexes are constant, try to turn this into a shufflevector operation.
-  if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)) {
-    if (isa<ConstantInt>(EI->getOperand(1)) && isa<ConstantInt>(IdxOp)) {
-      unsigned NumInsertVectorElts = IE.getType()->getNumElements();
-      unsigned NumExtractVectorElts =
-          EI->getOperand(0)->getType()->getVectorNumElements();
-      unsigned ExtractedIdx =
-        cast<ConstantInt>(EI->getOperand(1))->getZExtValue();
-      unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue();
-
-      if (ExtractedIdx >= NumExtractVectorElts) // Out of range extract.
-        return replaceInstUsesWith(IE, VecOp);
-
-      if (InsertedIdx >= NumInsertVectorElts)  // Out of range insert.
-        return replaceInstUsesWith(IE, UndefValue::get(IE.getType()));
-
-      // If we are extracting a value from a vector, then inserting it right
-      // back into the same place, just use the input vector.
-      if (EI->getOperand(0) == VecOp && ExtractedIdx == InsertedIdx)
-        return replaceInstUsesWith(IE, VecOp);
-
-      // TODO: Looking at the user(s) to determine if this insert is a
-      // fold-to-shuffle opportunity does not match the usual instcombine
-      // constraints. We should decide if the transform is worthy based only
-      // on this instruction and its operands, but that may not work currently.
-      //
-      // Here, we are trying to avoid creating shuffles before reaching
-      // the end of a chain of extract-insert pairs. This is complicated because
-      // we do not generally form arbitrary shuffle masks in instcombine
-      // (because those may codegen poorly), but collectShuffleElements() does
-      // exactly that.
-      //
-      // The rules for determining what is an acceptable target-independent
-      // shuffle mask are fuzzy because they evolve based on the backend's
-      // capabilities and real-world impact.
-      auto isShuffleRootCandidate = [](InsertElementInst &Insert) {
-        if (!Insert.hasOneUse())
-          return true;
-        auto *InsertUser = dyn_cast<InsertElementInst>(Insert.user_back());
-        if (!InsertUser)
-          return true;
-        return false;
-      };
-
-      // Try to form a shuffle from a chain of extract-insert ops.
-      if (isShuffleRootCandidate(IE)) {
-        SmallVector<Constant*, 16> Mask;
-        ShuffleOps LR = collectShuffleElements(&IE, Mask, nullptr, *this);
-
-        // The proposed shuffle may be trivial, in which case we shouldn't
-        // perform the combine.
-        if (LR.first != &IE && LR.second != &IE) {
-          // We now have a shuffle of LHS, RHS, Mask.
-          if (LR.second == nullptr)
-            LR.second = UndefValue::get(LR.first->getType());
-          return new ShuffleVectorInst(LR.first, LR.second,
-                                       ConstantVector::get(Mask));
-        }
+  // If the inserted element was extracted from some other vector and both
+  // indexes are constant, try to turn this into a shuffle.
+  uint64_t InsertedIdx, ExtractedIdx;
+  Value *ExtVecOp;
+  if (match(IdxOp, m_ConstantInt(InsertedIdx)) &&
+      match(ScalarOp, m_ExtractElement(m_Value(ExtVecOp),
+                                       m_ConstantInt(ExtractedIdx)))) {
+    unsigned NumInsertVectorElts = IE.getType()->getNumElements();
+    unsigned NumExtractVectorElts = ExtVecOp->getType()->getVectorNumElements();
+    if (ExtractedIdx >= NumExtractVectorElts) // Out of range extract.
+      return replaceInstUsesWith(IE, VecOp);
+
+    if (InsertedIdx >= NumInsertVectorElts)  // Out of range insert.
+      return replaceInstUsesWith(IE, UndefValue::get(IE.getType()));
+
+    // If we are extracting a value from a vector, then inserting it right
+    // back into the same place, just use the input vector.
+    if (ExtVecOp == VecOp && ExtractedIdx == InsertedIdx)
+      return replaceInstUsesWith(IE, VecOp);
+
+    // TODO: Looking at the user(s) to determine if this insert is a
+    // fold-to-shuffle opportunity does not match the usual instcombine
+    // constraints. We should decide if the transform is worthy based only
+    // on this instruction and its operands, but that may not work currently.
+    //
+    // Here, we are trying to avoid creating shuffles before reaching
+    // the end of a chain of extract-insert pairs. This is complicated because
+    // we do not generally form arbitrary shuffle masks in instcombine
+    // (because those may codegen poorly), but collectShuffleElements() does
+    // exactly that.
+    //
+    // The rules for determining what is an acceptable target-independent
+    // shuffle mask are fuzzy because they evolve based on the backend's
+    // capabilities and real-world impact.
+    auto isShuffleRootCandidate = [](InsertElementInst &Insert) {
+      if (!Insert.hasOneUse())
+        return true;
+      auto *InsertUser = dyn_cast<InsertElementInst>(Insert.user_back());
+      if (!InsertUser)
+        return true;
+      return false;
+    };
+
+    // Try to form a shuffle from a chain of extract-insert ops.
+    if (isShuffleRootCandidate(IE)) {
+      SmallVector<Constant*, 16> Mask;
+      ShuffleOps LR = collectShuffleElements(&IE, Mask, nullptr, *this);
+
+      // The proposed shuffle may be trivial, in which case we shouldn't
+      // perform the combine.
+      if (LR.first != &IE && LR.second != &IE) {
+        // We now have a shuffle of LHS, RHS, Mask.
+        if (LR.second == nullptr)
+          LR.second = UndefValue::get(LR.first->getType());
+        return new ShuffleVectorInst(LR.first, LR.second,
+                                     ConstantVector::get(Mask));
       }
     }
   }
-- 
GitLab


From a4ef13eefb62677c1536b3b5b5bf14e4d92a0422 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sat, 20 Oct 2018 17:38:33 +0000
Subject: [PATCH 0356/1116] [CostModel][X86] Add some initial extract/insert
 subvector shuffle cost tests

Just f64/i64 tests initially to demonstrate PR39368

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344857 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../X86/shuffle-extract_subvector.ll          |  91 ++++++++++
 .../CostModel/X86/shuffle-insert_subvector.ll | 161 ++++++++++++++++++
 2 files changed, 252 insertions(+)
 create mode 100644 test/Analysis/CostModel/X86/shuffle-extract_subvector.ll
 create mode 100644 test/Analysis/CostModel/X86/shuffle-insert_subvector.ll

diff --git a/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll b/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll
new file mode 100644
index 00000000000..5bb2e1a756d
--- /dev/null
+++ b/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll
@@ -0,0 +1,91 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+sse2 | FileCheck %s -check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+ssse3 | FileCheck %s -check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+sse4.2 | FileCheck %s -check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx | FileCheck %s -check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx2 | FileCheck %s -check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512f,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=CHECK,AVX512
+;
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mcpu=btver2 | FileCheck %s --check-prefixes=BTVER2
+
+;
+; Verify the cost model for extract_subector style shuffles.
+;
+
+define void @test_vXf64(<4 x double> %src256, <8 x double> %src512) {
+; CHECK-LABEL: 'test_vXf64'
+; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'test_vXf64'
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> undef, <2 x i32> <i32 0, i32 1>
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> undef, <2 x i32> <i32 2, i32 3>
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 0, i32 1>
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 2, i32 3>
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 4, i32 5>
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 6, i32 7>
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %V256_01 = shufflevector <4 x double> %src256, <4 x double> undef, <2 x i32> <i32 0, i32 1>
+  %V256_23 = shufflevector <4 x double> %src256, <4 x double> undef, <2 x i32> <i32 2, i32 3>
+  %V512_01 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 0, i32 1>
+  %V512_23 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 2, i32 3>
+  %V512_45 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 4, i32 5>
+  %V512_67 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 6, i32 7>
+  %V512_0123 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  %V512_4567 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  ret void
+}
+
+define void @test_vXfi64(<4 x i64> %src256, <8 x i64> %src512) {
+; CHECK-LABEL: 'test_vXfi64'
+; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'test_vXfi64'
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 2, i32 3>
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 4, i32 5>
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 6, i32 7>
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+  %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+  %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
+  %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 2, i32 3>
+  %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 4, i32 5>
+  %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 6, i32 7>
+  %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  ret void
+}
diff --git a/test/Analysis/CostModel/X86/shuffle-insert_subvector.ll b/test/Analysis/CostModel/X86/shuffle-insert_subvector.ll
new file mode 100644
index 00000000000..94e56643472
--- /dev/null
+++ b/test/Analysis/CostModel/X86/shuffle-insert_subvector.ll
@@ -0,0 +1,161 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+sse2 | FileCheck %s -check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+ssse3 | FileCheck %s -check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+sse4.2 | FileCheck %s -check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx | FileCheck %s -check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx2 | FileCheck %s -check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512f,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=CHECK,AVX512
+;
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mcpu=btver2 | FileCheck %s --check-prefixes=BTVER2
+
+;
+; Verify the cost model for insert_subector style shuffles.
+;
+
+define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> %src512) {
+; SSE-LABEL: 'test_vXf64'
+; SSE-NEXT:  Cost Model: Unknown cost for instruction: %src128_256 = shufflevector <2 x double> %src128, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; SSE-NEXT:  Cost Model: Unknown cost for instruction: %src128_512 = shufflevector <2 x double> %src128, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:  Cost Model: Unknown cost for instruction: %src256_512 = shufflevector <4 x double> %src256, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX-LABEL: 'test_vXf64'
+; AVX-NEXT:  Cost Model: Unknown cost for instruction: %src128_256 = shufflevector <2 x double> %src128, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; AVX-NEXT:  Cost Model: Unknown cost for instruction: %src128_512 = shufflevector <2 x double> %src128, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:  Cost Model: Unknown cost for instruction: %src256_512 = shufflevector <4 x double> %src256, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'test_vXf64'
+; AVX512-NEXT:  Cost Model: Unknown cost for instruction: %src128_256 = shufflevector <2 x double> %src128, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; AVX512-NEXT:  Cost Model: Unknown cost for instruction: %src128_512 = shufflevector <2 x double> %src128, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512-NEXT:  Cost Model: Unknown cost for instruction: %src256_512 = shufflevector <4 x double> %src256, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'test_vXf64'
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %src128_256 = shufflevector <2 x double> %src128, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %src128_512 = shufflevector <2 x double> %src128, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %src256_512 = shufflevector <4 x double> %src256, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %src128_256 = shufflevector <2 x double> %src128, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %src128_512 = shufflevector <2 x double> %src128, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %src256_512 = shufflevector <4 x double> %src256, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+
+  %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  %V256_23 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %V512_23 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+  %V512_45 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+  %V512_67 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+  %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+  %V512_4567 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+  ret void
+}
+
+define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512) {
+; SSE-LABEL: 'test_vXi64'
+; SSE-NEXT:  Cost Model: Unknown cost for instruction: %src128_256 = shufflevector <2 x i64> %src128, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; SSE-NEXT:  Cost Model: Unknown cost for instruction: %src128_512 = shufflevector <2 x i64> %src128, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:  Cost Model: Unknown cost for instruction: %src256_512 = shufflevector <4 x i64> %src256, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX-LABEL: 'test_vXi64'
+; AVX-NEXT:  Cost Model: Unknown cost for instruction: %src128_256 = shufflevector <2 x i64> %src128, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; AVX-NEXT:  Cost Model: Unknown cost for instruction: %src128_512 = shufflevector <2 x i64> %src128, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:  Cost Model: Unknown cost for instruction: %src256_512 = shufflevector <4 x i64> %src256, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'test_vXi64'
+; AVX512-NEXT:  Cost Model: Unknown cost for instruction: %src128_256 = shufflevector <2 x i64> %src128, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; AVX512-NEXT:  Cost Model: Unknown cost for instruction: %src128_512 = shufflevector <2 x i64> %src128, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512-NEXT:  Cost Model: Unknown cost for instruction: %src256_512 = shufflevector <4 x i64> %src256, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'test_vXi64'
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %src128_256 = shufflevector <2 x i64> %src128, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %src128_512 = shufflevector <2 x i64> %src128, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %src256_512 = shufflevector <4 x i64> %src256, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %src128_256 = shufflevector <2 x i64> %src128, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %src128_512 = shufflevector <2 x i64> %src128, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %src256_512 = shufflevector <4 x i64> %src256, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+
+  %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+  %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+  %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+  %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+  %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+  ret void
+}
-- 
GitLab


From c4cfe4768cded87ed18267dda379a854b714d684 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Sat, 20 Oct 2018 18:18:55 +0000
Subject: [PATCH 0357/1116] [InstCombine] add test for possible shuffle fold;
 NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344860 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstCombine/insert-extract-shuffle.ll     | 82 ++++++++++++-------
 1 file changed, 51 insertions(+), 31 deletions(-)

diff --git a/test/Transforms/InstCombine/insert-extract-shuffle.ll b/test/Transforms/InstCombine/insert-extract-shuffle.ll
index fb25c234279..2a0b9be218a 100644
--- a/test/Transforms/InstCombine/insert-extract-shuffle.ll
+++ b/test/Transforms/InstCombine/insert-extract-shuffle.ll
@@ -3,7 +3,7 @@
 
 define <1 x i8> @test1(<8 x i8> %in) {
 ; CHECK-LABEL: @test1(
-; CHECK-NEXT:    [[VEC:%.*]] = shufflevector <8 x i8> %in, <8 x i8> undef, <1 x i32> <i32 5>
+; CHECK-NEXT:    [[VEC:%.*]] = shufflevector <8 x i8> [[IN:%.*]], <8 x i8> undef, <1 x i32> <i32 5>
 ; CHECK-NEXT:    ret <1 x i8> [[VEC]]
 ;
   %val = extractelement <8 x i8> %in, i32 5
@@ -13,7 +13,7 @@ define <1 x i8> @test1(<8 x i8> %in) {
 
 define <4 x i16> @test2(<8 x i16> %in, <8 x i16> %in2) {
 ; CHECK-LABEL: @test2(
-; CHECK-NEXT:    [[VEC_3:%.*]] = shufflevector <8 x i16> %in2, <8 x i16> %in, <4 x i32> <i32 11, i32 9, i32 0, i32 10>
+; CHECK-NEXT:    [[VEC_3:%.*]] = shufflevector <8 x i16> [[IN2:%.*]], <8 x i16> [[IN:%.*]], <4 x i32> <i32 11, i32 9, i32 0, i32 10>
 ; CHECK-NEXT:    ret <4 x i16> [[VEC_3]]
 ;
   %elt0 = extractelement <8 x i16> %in, i32 3
@@ -31,8 +31,8 @@ define <4 x i16> @test2(<8 x i16> %in, <8 x i16> %in2) {
 
 define <2 x i64> @test_vcopyq_lane_p64(<2 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: @test_vcopyq_lane_p64(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <1 x i64> %b, <1 x i64> undef, <2 x i32> <i32 0, i32 undef>
-; CHECK-NEXT:    [[RES:%.*]] = shufflevector <2 x i64> %a, <2 x i64> [[TMP1]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <1 x i64> [[B:%.*]], <1 x i64> undef, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[RES:%.*]] = shufflevector <2 x i64> [[A:%.*]], <2 x i64> [[TMP1]], <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    ret <2 x i64> [[RES]]
 ;
   %elt = extractelement <1 x i64> %b, i32 0
@@ -44,8 +44,8 @@ define <2 x i64> @test_vcopyq_lane_p64(<2 x i64> %a, <1 x i64> %b) {
 
 define <4 x float> @widen_extract2(<4 x float> %ins, <2 x float> %ext) {
 ; CHECK-LABEL: @widen_extract2(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x float> %ext, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; CHECK-NEXT:    [[I2:%.*]] = shufflevector <4 x float> %ins, <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 4, i32 2, i32 5>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x float> [[EXT:%.*]], <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[I2:%.*]] = shufflevector <4 x float> [[INS:%.*]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 4, i32 2, i32 5>
 ; CHECK-NEXT:    ret <4 x float> [[I2]]
 ;
   %e1 = extractelement <2 x float> %ext, i32 0
@@ -57,8 +57,8 @@ define <4 x float> @widen_extract2(<4 x float> %ins, <2 x float> %ext) {
 
 define <4 x float> @widen_extract3(<4 x float> %ins, <3 x float> %ext) {
 ; CHECK-LABEL: @widen_extract3(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <3 x float> %ext, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
-; CHECK-NEXT:    [[I3:%.*]] = shufflevector <4 x float> %ins, <4 x float> [[TMP1]], <4 x i32> <i32 6, i32 5, i32 4, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <3 x float> [[EXT:%.*]], <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
+; CHECK-NEXT:    [[I3:%.*]] = shufflevector <4 x float> [[INS:%.*]], <4 x float> [[TMP1]], <4 x i32> <i32 6, i32 5, i32 4, i32 3>
 ; CHECK-NEXT:    ret <4 x float> [[I3]]
 ;
   %e1 = extractelement <3 x float> %ext, i32 0
@@ -72,8 +72,8 @@ define <4 x float> @widen_extract3(<4 x float> %ins, <3 x float> %ext) {
 
 define <8 x float> @widen_extract4(<8 x float> %ins, <2 x float> %ext) {
 ; CHECK-LABEL: @widen_extract4(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x float> %ext, <2 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[I1:%.*]] = shufflevector <8 x float> %ins, <8 x float> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 8, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x float> [[EXT:%.*]], <2 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[I1:%.*]] = shufflevector <8 x float> [[INS:%.*]], <8 x float> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 8, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    ret <8 x float> [[I1]]
 ;
   %e1 = extractelement <2 x float> %ext, i32 0
@@ -86,7 +86,7 @@ define <8 x float> @widen_extract4(<8 x float> %ins, <2 x float> %ext) {
 
 define <8 x i16> @pr26015(<4 x i16> %t0) {
 ; CHECK-LABEL: @pr26015(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> %t0, <4 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[T0:%.*]], <4 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[T5:%.*]] = shufflevector <8 x i16> <i16 0, i16 0, i16 0, i16 undef, i16 0, i16 0, i16 0, i16 undef>, <8 x i16> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 10, i32 4, i32 5, i32 6, i32 11>
 ; CHECK-NEXT:    ret <8 x i16> [[T5]]
 ;
@@ -103,10 +103,10 @@ define <8 x i16> @pr26015(<4 x i16> %t0) {
 
 define <8 x i16> @pr25999(<4 x i16> %t0, i1 %b) {
 ; CHECK-LABEL: @pr25999(
-; CHECK-NEXT:    [[T1:%.*]] = extractelement <4 x i16> %t0, i32 2
-; CHECK-NEXT:    br i1 %b, label %if, label %end
+; CHECK-NEXT:    [[T1:%.*]] = extractelement <4 x i16> [[T0:%.*]], i32 2
+; CHECK-NEXT:    br i1 [[B:%.*]], label [[IF:%.*]], label [[END:%.*]]
 ; CHECK:       if:
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> %t0, <4 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[T0]], <4 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[T3:%.*]] = insertelement <8 x i16> <i16 0, i16 0, i16 0, i16 undef, i16 0, i16 0, i16 0, i16 undef>, i16 [[T1]], i32 3
 ; CHECK-NEXT:    [[T5:%.*]] = shufflevector <8 x i16> [[T3]], <8 x i16> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 11>
 ; CHECK-NEXT:    ret <8 x i16> [[T5]]
@@ -137,13 +137,13 @@ end:
 define <4 x double> @pr25999_phis1(i1 %c, <2 x double> %a, <4 x double> %b) {
 ; CHECK-LABEL: @pr25999_phis1(
 ; CHECK-NEXT:  bb1:
-; CHECK-NEXT:    br i1 %c, label %bb2, label %bb3
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[BB2:%.*]], label [[BB3:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[R:%.*]] = call <2 x double> @dummy(<2 x double> %a)
-; CHECK-NEXT:    br label %bb3
+; CHECK-NEXT:    [[R:%.*]] = call <2 x double> @dummy(<2 x double> [[A:%.*]])
+; CHECK-NEXT:    br label [[BB3]]
 ; CHECK:       bb3:
-; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x double> [ %a, %bb1 ], [ [[R]], %bb2 ]
-; CHECK-NEXT:    [[TMP2:%.*]] = phi <4 x double> [ %b, %bb1 ], [ zeroinitializer, %bb2 ]
+; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x double> [ [[A]], [[BB1:%.*]] ], [ [[R]], [[BB2]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi <4 x double> [ [[B:%.*]], [[BB1]] ], [ zeroinitializer, [[BB2]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP0]], <4 x i32> <i32 0, i32 1, i32 4, i32 3>
 ; CHECK-NEXT:    ret <4 x double> [[TMP4]]
@@ -168,13 +168,13 @@ declare <2 x double> @dummy(<2 x double>)
 define <4 x double> @pr25999_phis2(i1 %c, <2 x double> %a, <4 x double> %b) {
 ; CHECK-LABEL: @pr25999_phis2(
 ; CHECK-NEXT:  bb1:
-; CHECK-NEXT:    br i1 %c, label %bb2, label %bb3
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[BB2:%.*]], label [[BB3:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[R:%.*]] = call <2 x double> @dummy(<2 x double> %a)
-; CHECK-NEXT:    br label %bb3
+; CHECK-NEXT:    [[R:%.*]] = call <2 x double> @dummy(<2 x double> [[A:%.*]])
+; CHECK-NEXT:    br label [[BB3]]
 ; CHECK:       bb3:
-; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x double> [ %a, %bb1 ], [ [[R]], %bb2 ]
-; CHECK-NEXT:    [[TMP2:%.*]] = phi <4 x double> [ %b, %bb1 ], [ zeroinitializer, %bb2 ]
+; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x double> [ [[A]], [[BB1:%.*]] ], [ [[R]], [[BB2]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi <4 x double> [ [[B:%.*]], [[BB1]] ], [ zeroinitializer, [[BB2]] ]
 ; CHECK-NEXT:    [[D:%.*]] = fadd <2 x double> [[TMP1]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <2 x double> [[D]], <2 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP0]], <4 x i32> <i32 0, i32 1, i32 4, i32 3>
@@ -202,15 +202,15 @@ bb3:
 define double @pr26354(<2 x double>* %tmp, i1 %B) {
 ; CHECK-LABEL: @pr26354(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[LD:%.*]] = load <2 x double>, <2 x double>* %tmp, align 16
+; CHECK-NEXT:    [[LD:%.*]] = load <2 x double>, <2 x double>* [[TMP:%.*]], align 16
 ; CHECK-NEXT:    [[E1:%.*]] = extractelement <2 x double> [[LD]], i32 0
-; CHECK-NEXT:    br i1 %B, label %if, label %end
+; CHECK-NEXT:    br i1 [[B:%.*]], label [[IF:%.*]], label [[END:%.*]]
 ; CHECK:       if:
 ; CHECK-NEXT:    [[E2:%.*]] = extractelement <2 x double> [[LD]], i32 1
 ; CHECK-NEXT:    [[I1:%.*]] = insertelement <4 x double> <double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double undef>, double [[E2]], i32 3
-; CHECK-NEXT:    br label %end
+; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
-; CHECK-NEXT:    [[PH:%.*]] = phi <4 x double> [ undef, %entry ], [ [[I1]], %if ]
+; CHECK-NEXT:    [[PH:%.*]] = phi <4 x double> [ undef, [[ENTRY:%.*]] ], [ [[I1]], [[IF]] ]
 ; CHECK-NEXT:    [[E3:%.*]] = extractelement <4 x double> [[PH]], i32 1
 ; CHECK-NEXT:    [[MU:%.*]] = fmul double [[E1]], [[E3]]
 ; CHECK-NEXT:    ret double [[MU]]
@@ -239,11 +239,11 @@ end:
 define <4 x float> @PR30923(<2 x float> %x) {
 ; CHECK-LABEL: @PR30923(
 ; CHECK-NEXT:  bb1:
-; CHECK-NEXT:    [[EXT1:%.*]] = extractelement <2 x float> %x, i32 1
+; CHECK-NEXT:    [[EXT1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 1
 ; CHECK-NEXT:    store float [[EXT1]], float* undef, align 4
-; CHECK-NEXT:    br label %bb2
+; CHECK-NEXT:    br label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[EXT2:%.*]] = extractelement <2 x float> %x, i32 0
+; CHECK-NEXT:    [[EXT2:%.*]] = extractelement <2 x float> [[X]], i32 0
 ; CHECK-NEXT:    [[INS1:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float undef, float undef>, float [[EXT2]], i32 2
 ; CHECK-NEXT:    [[INS2:%.*]] = insertelement <4 x float> [[INS1]], float [[EXT1]], i32 3
 ; CHECK-NEXT:    ret <4 x float> [[INS2]]
@@ -283,3 +283,23 @@ entry:
   %ret = select i1 %e, <4 x i32> %b, <4 x i32> zeroinitializer
   ret <4 x i32> %ret
 }
+
+; PR34724: https://bugs.llvm.org/show_bug.cgi?id=34724
+
+define <4 x float> @collectShuffleElts(<2 x float> %x, float %y) {
+; CHECK-LABEL: @collectShuffleElts(
+; CHECK-NEXT:    [[X0:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0
+; CHECK-NEXT:    [[X1:%.*]] = extractelement <2 x float> [[X]], i32 1
+; CHECK-NEXT:    [[V1:%.*]] = insertelement <4 x float> undef, float [[X0]], i32 1
+; CHECK-NEXT:    [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[X1]], i32 2
+; CHECK-NEXT:    [[V3:%.*]] = insertelement <4 x float> [[V2]], float [[Y:%.*]], i32 3
+; CHECK-NEXT:    ret <4 x float> [[V3]]
+;
+  %x0 = extractelement <2 x float> %x, i32 0
+  %x1 = extractelement <2 x float> %x, i32 1
+  %v1 = insertelement <4 x float> undef, float %x0, i32 1
+  %v2 = insertelement <4 x float> %v1, float %x1, i32 2
+  %v3 = insertelement <4 x float> %v2, float %y, i32 3
+  ret <4 x float> %v3
+}
+
-- 
GitLab


From dae6a45f280443691a29132af1d63b8eef027d95 Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Sat, 20 Oct 2018 20:39:53 +0000
Subject: [PATCH 0358/1116] [ORC] Add some more basic sanity tests for the
 LLJIT.

minimal.ll contains a main function that returns zero, and
single-function-call.ll contains a main function that calls a foo function that
returns zero. These minimal tests can help to rule out some trivial JIT bugs
when other tests fail.

This commit also renames hello.ll to global-ctors-and-dtors.ll, which better
reflects what it is actually testing.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344863 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../{hello.ll => global-ctors-and-dtors.ll}       |  2 ++
 test/ExecutionEngine/OrcLazy/minimal.ll           |  8 ++++++++
 .../OrcLazy/single-function-call.ll               | 15 +++++++++++++++
 3 files changed, 25 insertions(+)
 rename test/ExecutionEngine/OrcLazy/{hello.ll => global-ctors-and-dtors.ll} (95%)
 create mode 100644 test/ExecutionEngine/OrcLazy/minimal.ll
 create mode 100644 test/ExecutionEngine/OrcLazy/single-function-call.ll

diff --git a/test/ExecutionEngine/OrcLazy/hello.ll b/test/ExecutionEngine/OrcLazy/global-ctors-and-dtors.ll
similarity index 95%
rename from test/ExecutionEngine/OrcLazy/hello.ll
rename to test/ExecutionEngine/OrcLazy/global-ctors-and-dtors.ll
index 86d9a9a4b31..00b54fbf73f 100644
--- a/test/ExecutionEngine/OrcLazy/hello.ll
+++ b/test/ExecutionEngine/OrcLazy/global-ctors-and-dtors.ll
@@ -1,5 +1,7 @@
 ; RUN: lli -jit-kind=orc-lazy -orc-lazy-debug=funcs-to-stdout %s | FileCheck %s
 ;
+; Test that global constructors and destructors are run.
+;
 ; CHECK: Hello
 ; CHECK: [ {{.*}}main{{.*}} ]
 ; CHECK: Goodbye
diff --git a/test/ExecutionEngine/OrcLazy/minimal.ll b/test/ExecutionEngine/OrcLazy/minimal.ll
new file mode 100644
index 00000000000..86087bc3878
--- /dev/null
+++ b/test/ExecutionEngine/OrcLazy/minimal.ll
@@ -0,0 +1,8 @@
+; RUN: lli -jit-kind=orc-lazy %s
+;
+; Basic sanity check: A module with a single no-op main function runs.
+
+define i32 @main(i32 %argc, i8** nocapture readnone %argv) {
+entry:
+  ret i32 0
+}
diff --git a/test/ExecutionEngine/OrcLazy/single-function-call.ll b/test/ExecutionEngine/OrcLazy/single-function-call.ll
new file mode 100644
index 00000000000..fba52026660
--- /dev/null
+++ b/test/ExecutionEngine/OrcLazy/single-function-call.ll
@@ -0,0 +1,15 @@
+; RUN: lli -jit-kind=orc-lazy %s
+;
+; Basic sanity check: We can make a call inside lazily JIT'd code.
+; Compared to minimal.ll, this demonstrates that we can call through a stub.
+
+define i32 @foo() {
+entry:
+  ret i32 0
+}
+
+define i32 @main(i32 %argc, i8** nocapture readnone %argv) {
+entry:
+  %0 = call i32() @foo()
+  ret i32 %0
+}
-- 
GitLab


From ef0e188cb54443dc8e026677d35340a6b472462c Mon Sep 17 00:00:00 2001
From: Heejin Ahn <aheejin@gmail.com>
Date: Sun, 21 Oct 2018 11:16:50 +0000
Subject: [PATCH 0359/1116] [WebAssembly] Change tabs to spaces in
 basic-assembly.s

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344866 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/MC/WebAssembly/basic-assembly.s | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/MC/WebAssembly/basic-assembly.s b/test/MC/WebAssembly/basic-assembly.s
index cc60143639f..a22fe7962d4 100644
--- a/test/MC/WebAssembly/basic-assembly.s
+++ b/test/MC/WebAssembly/basic-assembly.s
@@ -47,7 +47,7 @@ test0:
     #i32x4.trunc_s/f32x4:sat
     i32.trunc_s/f32
     #i32.trunc_s:sat/f32
-    get_global	__stack_pointer@GLOBAL
+    get_global  __stack_pointer@GLOBAL
     end_function
 
 
@@ -88,5 +88,5 @@ test0:
 # CHECK-NEXT:      get_local   5
 # CHECK-NEXT:      f32x4.add
 # CHECK-NEXT:      i32.trunc_s/f32
-# CHECK-NEXT:      get_global	__stack_pointer@GLOBAL
+# CHECK-NEXT:      get_global  __stack_pointer@GLOBAL
 # CHECK-NEXT:      end_function
-- 
GitLab


From 593972ff78a8542e4c9d415c52a1988c1d76a5d8 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sun, 21 Oct 2018 11:55:56 +0000
Subject: [PATCH 0360/1116] [X86] Only extract constant pool shuffle mask data
 with zero offsets

D53306 exposes an issue where we sometimes use constant pool data from bigger vectors than the target shuffle mask. This should be safe to do, but we have to be certain that we're using the bottom most part of the vector as the shuffle mask decoders have no way to peek into subvectors with non-zero offsets.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344867 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 2 +-
 lib/Target/X86/X86MCInstLower.cpp  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index be6f9ed2188..8dc9f624554 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -5536,7 +5536,7 @@ static const Constant *getTargetConstantFromNode(SDValue Op) {
     Ptr = Ptr->getOperand(0);
 
   auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
-  if (!CNode || CNode->isMachineConstantPoolEntry())
+  if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
     return nullptr;
 
   return dyn_cast<Constant>(CNode->getConstVal());
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index 76f0dd4837b..58b1c505944 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -1379,7 +1379,7 @@ PrevCrossBBInst(MachineBasicBlock::const_iterator MBBI) {
 
 static const Constant *getConstantFromPool(const MachineInstr &MI,
                                            const MachineOperand &Op) {
-  if (!Op.isCPI())
+  if (!Op.isCPI() || Op.getOffset() != 0)
     return nullptr;
 
   ArrayRef<MachineConstantPoolEntry> Constants =
-- 
GitLab


From e24f4ecbb050628588bb4743fb57b6ebe85b25ea Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sun, 21 Oct 2018 17:07:50 +0000
Subject: [PATCH 0361/1116] [X86][AVX] Enable
 lowerVectorShuffleAsLanePermuteAndPermute v16i16/v32i8 unary shuffle lowering

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344868 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp            |  14 ++-
 .../X86/bitcast-int-to-vector-bool-sext.ll    |  83 ++++----------
 .../X86/bitcast-int-to-vector-bool-zext.ll    | 104 +++++-------------
 .../CodeGen/X86/bitcast-int-to-vector-bool.ll |   8 +-
 test/CodeGen/X86/vector-shuffle-256-v16.ll    |  10 +-
 test/CodeGen/X86/vector-shuffle-256-v32.ll    |  18 +--
 6 files changed, 69 insertions(+), 168 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 8dc9f624554..9e431162083 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -14647,9 +14647,14 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   if (V2.isUndef()) {
     // There are no generalized cross-lane shuffle operations available on i16
     // element types.
-    if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
+    if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
+      if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute(
+              DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
+        return V;
+
       return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
                                                      Mask, DAG, Subtarget);
+    }
 
     SmallVector<int, 8> RepeatedMask;
     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
@@ -14742,9 +14747,14 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
 
   // There are no generalized cross-lane shuffle operations available on i8
   // element types.
-  if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
+  if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
+    if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute(
+            DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
+      return V;
+
     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
                                                    DAG, Subtarget);
+  }
 
   if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
           DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
index c022d7908a1..2964c905946 100644
--- a/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
+++ b/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
@@ -381,31 +381,15 @@ define <32 x i8> @ext_i32_32i8(i32 %a0) {
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
-; AVX2-SLOW-LABEL: ext_i32_32i8:
-; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vmovd %edi, %xmm0
-; AVX2-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7]
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7]
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
-; AVX2-SLOW-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    retq
-;
-; AVX2-FAST-LABEL: ext_i32_32i8:
-; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vmovd %edi, %xmm0
-; AVX2-FAST-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3]
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,6,7,6,7,6,7,6,7]
-; AVX2-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
-; AVX2-FAST-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT:    retq
+; AVX2-LABEL: ext_i32_32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovd %edi, %xmm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: ext_i32_32i8:
 ; AVX512:       # %bb.0:
@@ -697,43 +681,18 @@ define <64 x i8> @ext_i64_64i8(i64 %a0) {
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT:    retq
 ;
-; AVX2-SLOW-LABEL: ext_i64_64i8:
-; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vmovq %rdi, %xmm0
-; AVX2-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7]
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm1[2,2,3,3,4,5,6,7]
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
-; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
-; AVX2-SLOW-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,5,5]
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
-; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,7,7]
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm3, %ymm1
-; AVX2-SLOW-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX2-SLOW-NEXT:    vpcmpeqb %ymm2, %ymm1, %ymm1
-; AVX2-SLOW-NEXT:    retq
-;
-; AVX2-FAST-LABEL: ext_i64_64i8:
-; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vmovq %rdi, %xmm0
-; AVX2-FAST-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm1[0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3]
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[4,5,4,5,4,5,4,5,6,7,6,7,6,7,6,7]
-; AVX2-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
-; AVX2-FAST-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm3 = xmm1[8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11]
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15]
-; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm3, %ymm1
-; AVX2-FAST-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX2-FAST-NEXT:    vpcmpeqb %ymm2, %ymm1, %ymm1
-; AVX2-FAST-NEXT:    retq
+; AVX2-LABEL: ext_i64_64i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovq %rdi, %xmm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,1,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,22,22,22,22,22,22,22,22,23,23,23,23,23,23,23,23]
+; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpcmpeqb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: ext_i64_64i8:
 ; AVX512:       # %bb.0:
diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
index 75b5b701113..139fabd25c9 100644
--- a/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
+++ b/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
@@ -485,35 +485,17 @@ define <32 x i8> @ext_i32_32i8(i32 %a0) {
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
-; AVX2-SLOW-LABEL: ext_i32_32i8:
-; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vmovd %edi, %xmm0
-; AVX2-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7]
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7]
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
-; AVX2-SLOW-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vpsrlw $7, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    retq
-;
-; AVX2-FAST-LABEL: ext_i32_32i8:
-; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vmovd %edi, %xmm0
-; AVX2-FAST-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3]
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,6,7,6,7,6,7,6,7]
-; AVX2-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
-; AVX2-FAST-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vpsrlw $7, %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-FAST-NEXT:    retq
+; AVX2-LABEL: ext_i32_32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovd %edi, %xmm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $7, %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: ext_i32_32i8:
 ; AVX512F:       # %bb.0:
@@ -896,53 +878,23 @@ define <64 x i8> @ext_i64_64i8(i64 %a0) {
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT:    retq
 ;
-; AVX2-SLOW-LABEL: ext_i64_64i8:
-; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vmovq %rdi, %xmm0
-; AVX2-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7]
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm1[2,2,3,3,4,5,6,7]
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
-; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
-; AVX2-SLOW-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vpsrlw $7, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX2-SLOW-NEXT:    vpand %ymm3, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,5,5]
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
-; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,7,7]
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm4, %ymm1
-; AVX2-SLOW-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX2-SLOW-NEXT:    vpcmpeqb %ymm2, %ymm1, %ymm1
-; AVX2-SLOW-NEXT:    vpsrlw $7, %ymm1, %ymm1
-; AVX2-SLOW-NEXT:    vpand %ymm3, %ymm1, %ymm1
-; AVX2-SLOW-NEXT:    retq
-;
-; AVX2-FAST-LABEL: ext_i64_64i8:
-; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vmovq %rdi, %xmm0
-; AVX2-FAST-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm1[0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3]
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[4,5,4,5,4,5,4,5,6,7,6,7,6,7,6,7]
-; AVX2-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
-; AVX2-FAST-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vpsrlw $7, %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX2-FAST-NEXT:    vpand %ymm3, %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm4 = xmm1[8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11]
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15]
-; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm4, %ymm1
-; AVX2-FAST-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX2-FAST-NEXT:    vpcmpeqb %ymm2, %ymm1, %ymm1
-; AVX2-FAST-NEXT:    vpsrlw $7, %ymm1, %ymm1
-; AVX2-FAST-NEXT:    vpand %ymm3, %ymm1, %ymm1
-; AVX2-FAST-NEXT:    retq
+; AVX2-LABEL: ext_i64_64i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovq %rdi, %xmm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,1,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $7, %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,22,22,22,22,22,22,22,22,23,23,23,23,23,23,23,23]
+; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpcmpeqb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrlw $7, %ymm1, %ymm1
+; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: ext_i64_64i8:
 ; AVX512F:       # %bb.0:
diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
index 3deac92d9ed..6a8726b3a2a 100644
--- a/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
+++ b/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
@@ -226,12 +226,8 @@ define <32 x i1> @bitcast_i32_32i1(i32 %a0) {
 ; AVX2-LABEL: bitcast_i32_32i1:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vmovd %edi, %xmm0
-; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19]
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll
index 2ade0c5c646..461246d80a8 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -1956,14 +1956,8 @@ define <16 x i16> @shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_z
 ;
 ; AVX2-LABEL: shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_zz:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,9],zero,zero,zero,zero,zero,zero,ymm0[10,11],zero,zero,zero,zero,zero,zero,ymm0[28,29],zero,zero,zero,zero,zero,zero,ymm0[30,31],zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_zz:
diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll
index 9dfbb6af075..a391387923a 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -2728,26 +2728,16 @@ define <32 x i8> @shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz
 ;
 ; AVX2-LABEL: shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8],zero,zero,zero,ymm0[9],zero,zero,zero,ymm0[10],zero,zero,zero,ymm0[11],zero,zero,zero,ymm0[28],zero,zero,zero,ymm0[29],zero,zero,zero,ymm0[30],zero,zero,zero,ymm0[31],zero,zero,zero
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512VLBW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512VLBW-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX512VLBW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512VLBW-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX512VLBW-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512VLBW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
 ; AVX512VLBW-NEXT:    movl $286331153, %eax # imm = 0x11111111
 ; AVX512VLBW-NEXT:    kmovd %eax, %k1
-; AVX512VLBW-NEXT:    vmovdqu8 %ymm0, %ymm0 {%k1} {z}
+; AVX512VLBW-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[8,u,u,u,9,u,u,u,10,u,u,u,11,u,u,u,28,u,u,u,29,u,u,u,30,u,u,u,31,u,u,u]
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VLVBMI-LABEL: shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz:
-- 
GitLab


From 0ee7db4eac135df5bb44ce5770041082833181c7 Mon Sep 17 00:00:00 2001
From: Aditya Kumar <hiraditya@msn.com>
Date: Sun, 21 Oct 2018 18:11:56 +0000
Subject: [PATCH 0362/1116] Schedule Hot Cold Splitting pass after most
 optimization passes

Summary:
In the new+old pass manager, hot cold splitting was schedule too early.
Thanks to Vedant for pointing this out.

Reviewers: sebpop, vsk

Reviewed By: sebpop, vsk

Subscribers: mehdi_amini, llvm-commits

Differential Revision: https://reviews.llvm.org/D53437

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344869 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Passes/PassBuilder.cpp                |   6 +-
 lib/Transforms/IPO/PassManagerBuilder.cpp |   6 +-
 test/Other/opt-hot-cold-split.ll          | 292 ++++++++++++++++++++++
 3 files changed, 298 insertions(+), 6 deletions(-)
 create mode 100644 test/Other/opt-hot-cold-split.ll

diff --git a/lib/Passes/PassBuilder.cpp b/lib/Passes/PassBuilder.cpp
index f6313d23e2d..8b333b7f8a6 100644
--- a/lib/Passes/PassBuilder.cpp
+++ b/lib/Passes/PassBuilder.cpp
@@ -621,9 +621,6 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
                                            true));
   }
 
-  if (EnableHotColdSplit)
-    MPM.addPass(HotColdSplittingPass());
-
   // Interprocedural constant propagation now that basic cleanup has occurred
   // and prior to optimizing globals.
   // FIXME: This position in the pipeline hasn't been carefully considered in
@@ -713,6 +710,9 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
   MainCGPipeline.addPass(createCGSCCToFunctionPassAdaptor(
       buildFunctionSimplificationPipeline(Level, Phase, DebugLogging)));
 
+  if (EnableHotColdSplit)
+    MPM.addPass(HotColdSplittingPass());
+
   for (auto &C : CGSCCOptimizerLateEPCallbacks)
     C(MainCGPipeline, Level);
 
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index 8b00a60f521..19ff2a21cd2 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -499,9 +499,6 @@ void PassManagerBuilder::populateModulePassManager(
   // Infer attributes about declarations if possible.
   MPM.add(createInferFunctionAttrsLegacyPass());
 
-  if (EnableHotColdSplit)
-    MPM.add(createHotColdSplittingPass());
-
   addExtensionsToPM(EP_ModuleOptimizerEarly, MPM);
 
   if (OptLevel > 2)
@@ -735,6 +732,9 @@ void PassManagerBuilder::populateModulePassManager(
   // flattening of blocks.
   MPM.add(createDivRemPairsPass());
 
+  if (EnableHotColdSplit)
+    MPM.add(createHotColdSplittingPass());
+
   // LoopSink (and other loop passes since the last simplifyCFG) might have
   // resulted in single-entry-single-exit or empty blocks. Clean up the CFG.
   MPM.add(createCFGSimplificationPass());
diff --git a/test/Other/opt-hot-cold-split.ll b/test/Other/opt-hot-cold-split.ll
new file mode 100644
index 00000000000..ba92ccab636
--- /dev/null
+++ b/test/Other/opt-hot-cold-split.ll
@@ -0,0 +1,292 @@
+; RUN: opt -mtriple=x86_64-- -Os -hotcoldsplit -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; CHECK-LABEL: Pass Arguments:
+; CHECK-NEXT: Target Transform Information
+; CHECK-NEXT: Type-Based Alias Analysis
+; CHECK-NEXT: Scoped NoAlias Alias Analysis
+; CHECK-NEXT: Assumption Cache Tracker
+; CHECK-NEXT: Target Library Information
+; CHECK-NEXT:   FunctionPass Manager
+; CHECK-NEXT:     Module Verifier
+; CHECK-NEXT:     Instrument function entry/exit with calls to e.g. mcount() (pre inlining)
+; CHECK-NEXT:     Simplify the CFG
+; CHECK-NEXT:     Dominator Tree Construction
+; CHECK-NEXT:     SROA
+; CHECK-NEXT:     Early CSE
+; CHECK-NEXT:     Lower 'expect' Intrinsics
+; CHECK-NEXT:  Pass Arguments:
+; CHECK-NEXT:  Target Library Information
+; CHECK-NEXT:  Target Transform Information
+; CHECK-NEXT:  Target Pass Configuration
+; CHECK-NEXT:  Type-Based Alias Analysis
+; CHECK-NEXT:  Scoped NoAlias Alias Analysis
+; CHECK-NEXT:  Assumption Cache Tracker
+; CHECK-NEXT:  Profile summary info
+; CHECK-NEXT:    ModulePass Manager
+; CHECK-NEXT:      Force set function attributes
+; CHECK-NEXT:      Infer set function attributes
+; CHECK-NEXT:      Interprocedural Sparse Conditional Constant Propagation
+; CHECK-NEXT:        Unnamed pass: implement Pass::getPassName()
+; CHECK-NEXT:     Called Value Propagation
+; CHECK-NEXT:     Global Variable Optimizer
+; CHECK-NEXT:       Unnamed pass: implement Pass::getPassName()
+; CHECK-NEXT:     FunctionPass Manager
+; CHECK-NEXT:       Dominator Tree Construction
+; CHECK-NEXT:       Promote Memory to Register
+; CHECK-NEXT:     Dead Argument Elimination
+; CHECK-NEXT:     FunctionPass Manager
+; CHECK-NEXT:       Dominator Tree Construction
+; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT:       Function Alias Analysis Results
+; CHECK-NEXT:       Natural Loop Information
+; CHECK-NEXT:       Lazy Branch Probability Analysis
+; CHECK-NEXT:       Lazy Block Frequency Analysis
+; CHECK-NEXT:       Optimization Remark Emitter
+; CHECK-NEXT:       Combine redundant instructions
+; CHECK-NEXT:       Simplify the CFG
+; CHECK-NEXT:     CallGraph Construction
+; CHECK-NEXT:     Globals Alias Analysis
+; CHECK-NEXT:     Call Graph SCC Pass Manager
+; CHECK-NEXT:       Remove unused exception handling info
+; CHECK-NEXT:       Function Integration/Inlining
+; CHECK-NEXT:       Deduce function attributes
+; CHECK-NEXT:       FunctionPass Manager
+; CHECK-NEXT:         Dominator Tree Construction
+; CHECK-NEXT:         SROA
+; CHECK-NEXT:         Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT:         Function Alias Analysis Results
+; CHECK-NEXT:         Memory SSA
+; CHECK-NEXT:         Early CSE w/ MemorySSA
+; CHECK-NEXT:         Speculatively execute instructions if target has divergent branches
+; CHECK-NEXT:         Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT:         Function Alias Analysis Results
+; CHECK-NEXT:         Lazy Value Information Analysis
+; CHECK-NEXT:         Jump Threading
+; CHECK-NEXT:         Value Propagation
+; CHECK-NEXT:         Simplify the CFG
+; CHECK-NEXT:         Dominator Tree Construction
+; CHECK-NEXT:         Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT:         Function Alias Analysis Results
+; CHECK-NEXT:         Natural Loop Information
+; CHECK-NEXT:         Lazy Branch Probability Analysis
+; CHECK-NEXT:         Lazy Block Frequency Analysis
+; CHECK-NEXT:         Optimization Remark Emitter
+; CHECK-NEXT:         Combine redundant instructions
+; CHECK-NEXT:         Optimization Remark Emitter
+; CHECK-NEXT:         Tail Call Elimination
+; CHECK-NEXT:         Simplify the CFG
+; CHECK-NEXT:         Reassociate expressions
+; CHECK-NEXT:         Dominator Tree Construction
+; CHECK-NEXT:         Natural Loop Information
+; CHECK-NEXT:         Canonicalize natural loops
+; CHECK-NEXT:         LCSSA Verifier
+; CHECK-NEXT:         Loop-Closed SSA Form Pass
+; CHECK-NEXT:         Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT:         Function Alias Analysis Results
+; CHECK-NEXT:         Scalar Evolution Analysis
+; CHECK-NEXT:         Loop Pass Manager
+; CHECK-NEXT:           Rotate Loops
+; CHECK-NEXT:           Loop Invariant Code Motion
+; CHECK-NEXT:           Unswitch loops
+; CHECK-NEXT:         Simplify the CFG
+; CHECK-NEXT:         Dominator Tree Construction
+; CHECK-NEXT:         Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT:         Function Alias Analysis Results
+; CHECK-NEXT:         Natural Loop Information
+; CHECK-NEXT:         Lazy Branch Probability Analysis
+; CHECK-NEXT:         Lazy Block Frequency Analysis
+; CHECK-NEXT:         Optimization Remark Emitter
+; CHECK-NEXT:         Combine redundant instructions
+; CHECK-NEXT:         Canonicalize natural loops
+; CHECK-NEXT:         LCSSA Verifier
+; CHECK-NEXT:         Loop-Closed SSA Form Pass
+; CHECK-NEXT:         Scalar Evolution Analysis
+; CHECK-NEXT:         Loop Pass Manager
+; CHECK-NEXT:           Induction Variable Simplification
+; CHECK-NEXT:           Recognize loop idioms
+; CHECK-NEXT:           Delete dead loops
+; CHECK-NEXT:           Unroll loops
+; CHECK-NEXT:         MergedLoadStoreMotion
+; CHECK-NEXT:         Phi Values Analysis
+; CHECK-NEXT:         Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT:         Function Alias Analysis Results
+; CHECK-NEXT:         Memory Dependence Analysis
+; CHECK-NEXT:         Lazy Branch Probability Analysis
+; CHECK-NEXT:         Lazy Block Frequency Analysis
+; CHECK-NEXT:         Optimization Remark Emitter
+; CHECK-NEXT:         Global Value Numbering
+; CHECK-NEXT:         Phi Values Analysis
+; CHECK-NEXT:         Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT:         Function Alias Analysis Results
+; CHECK-NEXT:         Memory Dependence Analysis
+; CHECK-NEXT:         MemCpy Optimization
+; CHECK-NEXT:         Sparse Conditional Constant Propagation
+; CHECK-NEXT:         Demanded bits analysis
+; CHECK-NEXT:         Bit-Tracking Dead Code Elimination
+; CHECK-NEXT:         Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT:         Function Alias Analysis Results
+; CHECK-NEXT:         Natural Loop Information
+; CHECK-NEXT:         Lazy Branch Probability Analysis
+; CHECK-NEXT:         Lazy Block Frequency Analysis
+; CHECK-NEXT:         Optimization Remark Emitter
+; CHECK-NEXT:         Combine redundant instructions
+; CHECK-NEXT:         Lazy Value Information Analysis
+; CHECK-NEXT:         Jump Threading
+; CHECK-NEXT:         Value Propagation
+; CHECK-NEXT:         Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT:         Function Alias Analysis Results
+; CHECK-NEXT:         Phi Values Analysis
+; CHECK-NEXT:         Memory Dependence Analysis
+; CHECK-NEXT:         Dead Store Elimination
+; CHECK-NEXT:         Natural Loop Information
+; CHECK-NEXT:         Canonicalize natural loops
+; CHECK-NEXT:         LCSSA Verifier
+; CHECK-NEXT:         Loop-Closed SSA Form Pass
+; CHECK-NEXT:         Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT:         Function Alias Analysis Results
+; CHECK-NEXT:         Scalar Evolution Analysis
+; CHECK-NEXT:         Loop Pass Manager
+; CHECK-NEXT:           Loop Invariant Code Motion
+; CHECK-NEXT:         Post-Dominator Tree Construction
+; CHECK-NEXT:         Aggressive Dead Code Elimination
+; CHECK-NEXT:         Simplify the CFG
+; CHECK-NEXT:         Dominator Tree Construction
+; CHECK-NEXT:         Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT:         Function Alias Analysis Results
+; CHECK-NEXT:         Natural Loop Information
+; CHECK-NEXT:         Lazy Branch Probability Analysis
+; CHECK-NEXT:         Lazy Block Frequency Analysis
+; CHECK-NEXT:         Optimization Remark Emitter
+; CHECK-NEXT:         Combine redundant instructions
+; CHECK-NEXT:     A No-Op Barrier Pass
+; CHECK-NEXT:     Eliminate Available Externally Globals
+; CHECK-NEXT:     CallGraph Construction
+; CHECK-NEXT:     Deduce function attributes in RPO
+; CHECK-NEXT:     Global Variable Optimizer
+; CHECK-NEXT:       Unnamed pass: implement Pass::getPassName()
+; CHECK-NEXT:     Dead Global Elimination
+; CHECK-NEXT:     CallGraph Construction
+; CHECK-NEXT:     Globals Alias Analysis
+; CHECK-NEXT:     FunctionPass Manager
+; CHECK-NEXT:       Float to int
+; CHECK-NEXT:       Dominator Tree Construction
+; CHECK-NEXT:       Natural Loop Information
+; CHECK-NEXT:       Canonicalize natural loops
+; CHECK-NEXT:       LCSSA Verifier
+; CHECK-NEXT:       Loop-Closed SSA Form Pass
+; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT:       Function Alias Analysis Results
+; CHECK-NEXT:       Scalar Evolution Analysis
+; CHECK-NEXT:       Loop Pass Manager
+; CHECK-NEXT:         Rotate Loops
+; CHECK-NEXT:       Loop Access Analysis
+; CHECK-NEXT:       Lazy Branch Probability Analysis
+; CHECK-NEXT:       Lazy Block Frequency Analysis
+; CHECK-NEXT:       Optimization Remark Emitter
+; CHECK-NEXT:       Loop Distribution
+; CHECK-NEXT:       Branch Probability Analysis
+; CHECK-NEXT:       Block Frequency Analysis
+; CHECK-NEXT:       Scalar Evolution Analysis
+; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT:       Function Alias Analysis Results
+; CHECK-NEXT:       Loop Access Analysis
+; CHECK-NEXT:       Demanded bits analysis
+; CHECK-NEXT:       Lazy Branch Probability Analysis
+; CHECK-NEXT:       Lazy Block Frequency Analysis
+; CHECK-NEXT:       Optimization Remark Emitter
+; CHECK-NEXT:       Loop Vectorization
+; CHECK-NEXT:       Canonicalize natural loops
+; CHECK-NEXT:       Scalar Evolution Analysis
+; CHECK-NEXT:       Function Alias Analysis Results
+; CHECK-NEXT:       Loop Access Analysis
+; CHECK-NEXT:       Loop Load Elimination
+; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT:       Function Alias Analysis Results
+; CHECK-NEXT:       Lazy Branch Probability Analysis
+; CHECK-NEXT:       Lazy Block Frequency Analysis
+; CHECK-NEXT:       Optimization Remark Emitter
+; CHECK-NEXT:       Combine redundant instructions
+; CHECK-NEXT:       Simplify the CFG
+; CHECK-NEXT:       Dominator Tree Construction
+; CHECK-NEXT:       Natural Loop Information
+; CHECK-NEXT:       Scalar Evolution Analysis
+; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT:       Function Alias Analysis Results
+; CHECK-NEXT:       Demanded bits analysis
+; CHECK-NEXT:       Lazy Branch Probability Analysis
+; CHECK-NEXT:       Lazy Block Frequency Analysis
+; CHECK-NEXT:       Optimization Remark Emitter
+; CHECK-NEXT:       SLP Vectorizer
+; CHECK-NEXT:       Optimization Remark Emitter
+; CHECK-NEXT:       Combine redundant instructions
+; CHECK-NEXT:       Canonicalize natural loops
+; CHECK-NEXT:       LCSSA Verifier
+; CHECK-NEXT:       Loop-Closed SSA Form Pass
+; CHECK-NEXT:       Scalar Evolution Analysis
+; CHECK-NEXT:       Loop Pass Manager
+; CHECK-NEXT:         Unroll loops
+; CHECK-NEXT:       Lazy Branch Probability Analysis
+; CHECK-NEXT:       Lazy Block Frequency Analysis
+; CHECK-NEXT:       Optimization Remark Emitter
+; CHECK-NEXT:       Combine redundant instructions
+; CHECK-NEXT:       Canonicalize natural loops
+; CHECK-NEXT:       LCSSA Verifier
+; CHECK-NEXT:       Loop-Closed SSA Form Pass
+; CHECK-NEXT:       Scalar Evolution Analysis
+; CHECK-NEXT:       Loop Pass Manager
+; CHECK-NEXT:         Loop Invariant Code Motion
+; CHECK-NEXT:       Alignment from assumptions
+; CHECK-NEXT:     Strip Unused Function Prototypes
+; CHECK-NEXT:     Dead Global Elimination
+; CHECK-NEXT:     Merge Duplicate Global Constants
+; CHECK-NEXT:     FunctionPass Manager
+; CHECK-NEXT:       Dominator Tree Construction
+; CHECK-NEXT:       Natural Loop Information
+; CHECK-NEXT:       Branch Probability Analysis
+; CHECK-NEXT:       Block Frequency Analysis
+; CHECK-NEXT:       Canonicalize natural loops
+; CHECK-NEXT:       LCSSA Verifier
+; CHECK-NEXT:       Loop-Closed SSA Form Pass
+; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT:       Function Alias Analysis Results
+; CHECK-NEXT:       Scalar Evolution Analysis
+; CHECK-NEXT:       Branch Probability Analysis
+; CHECK-NEXT:       Block Frequency Analysis
+; CHECK-NEXT:       Loop Pass Manager
+; CHECK-NEXT:         Loop Sink
+; CHECK-NEXT:       Lazy Branch Probability Analysis
+; CHECK-NEXT:       Lazy Block Frequency Analysis
+; CHECK-NEXT:       Optimization Remark Emitter
+; CHECK-NEXT:       Remove redundant instructions
+; CHECK-NEXT:       Hoist/decompose integer division and remainder
+; CHECK-NEXT:       Simplify the CFG
+; CHECK-NEXT:     Hot Cold Splitting
+; CHECK-NEXT:       Unnamed pass: implement Pass::getPassName()
+; CHECK-NEXT:     FunctionPass Manager
+; CHECK-NEXT:       Module Verifier
+; CHECK-NEXT:     Bitcode Writer
+; CHECK-NEXT: Pass Arguments:  -domtree
+; CHECK-NEXT:   FunctionPass Manager
+; CHECK-NEXT:     Dominator Tree Construction
+; CHECK-NEXT: Pass Arguments:  -targetlibinfo -domtree -loops -branch-prob -block-freq
+; CHECK-NEXT: Target Library Information
+; CHECK-NEXT:   FunctionPass Manager
+; CHECK-NEXT:     Dominator Tree Construction
+; CHECK-NEXT:     Natural Loop Information
+; CHECK-NEXT:     Branch Probability Analysis
+; CHECK-NEXT:     Block Frequency Analysis
+; CHECK-NEXT: Pass Arguments:  -targetlibinfo -domtree -loops -branch-prob -block-freq
+; CHECK-NEXT: Target Library Information
+; CHECK-NEXT:   FunctionPass Manager
+; CHECK-NEXT:     Dominator Tree Construction
+; CHECK-NEXT:     Natural Loop Information
+; CHECK-NEXT:     Branch Probability Analysis
+; CHECK-NEXT:     Block Frequency Analysis
+; CHECK-NEXT: Pass Arguments:  -targetlibinfo -domtree -loops -branch-prob -block-freq
+; CHECK-NEXT: Target Library Information
+; CHECK-NEXT:   FunctionPass Manager
+; CHECK-NEXT:     Dominator Tree Construction
+; CHECK-NEXT:     Natural Loop Information
+; CHECK-NEXT:     Branch Probability Analysis
+; CHECK-NEXT:     Block Frequency Analysis
-- 
GitLab


From e69e0d92a0b6771a4cfa366ded00b78d8a7bade0 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Sun, 21 Oct 2018 20:13:29 +0000
Subject: [PATCH 0363/1116] [DAGCombiner] reduce insert+bitcast+extract vector
 ops to truncate (PR39016)

This is a late backend subset of the IR transform added with:
D52439

We can confirm that the conversion to a 'trunc' is correct by running:
$ opt -instcombine -data-layout="e"
(assuming the IR transforms are correct; change "e" to "E" for big-endian)

As discussed in PR39016:
https://bugs.llvm.org/show_bug.cgi?id=39016
...the pattern may emerge during legalization, so that's we are waiting for an
insertelement to become a scalar_to_vector in the pattern matching here.

The DAG allows for fun variations that are not possible in IR. Result types for
extracts and scalar_to_vector don't necessarily match input types, so that means
we have to be a bit more careful in the transform (see code comments).

The tests show that we don't handle cases that require a shift (as we did in the
IR version). I've left that as a potential follow-up because I'm not sure if
that's a real concern at this late stage.

Differential Revision: https://reviews.llvm.org/D53201


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344872 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 33 +++++++++++++++++---
 test/CodeGen/AArch64/extract-insert.ll   | 38 ++++++++++++++----------
 test/CodeGen/X86/extract-insert.ll       |  9 +++---
 test/CodeGen/X86/mmx-coalescing.ll       |  9 +++---
 4 files changed, 61 insertions(+), 28 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 11cc699ffe1..381efb9cb94 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15503,16 +15503,41 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
     // converts.
   }
 
-  if (ConstEltNo && InVec.getOpcode() == ISD::BITCAST) {
+  // TODO: These transforms should not require the 'hasOneUse' restriction, but
+  // there are regressions on multiple targets without it. We can end up with a
+  // mess of scalar and vector code if we reduce only part of the DAG to scalar.
+  if (ConstEltNo && InVec.getOpcode() == ISD::BITCAST && VT.isInteger() &&
+      InVec.hasOneUse()) {
     // The vector index of the LSBs of the source depend on the endian-ness.
     bool IsLE = DAG.getDataLayout().isLittleEndian();
-
+    unsigned ExtractIndex = ConstEltNo->getZExtValue();
     // extract_elt (v2i32 (bitcast i64:x)), BCTruncElt -> i32 (trunc i64:x)
     unsigned BCTruncElt = IsLE ? 0 : VT.getVectorNumElements() - 1;
     SDValue BCSrc = InVec.getOperand(0);
-    if (InVec.hasOneUse() && ConstEltNo->getZExtValue() == BCTruncElt &&
-        VT.isInteger() && BCSrc.getValueType().isScalarInteger())
+    if (ExtractIndex == BCTruncElt && BCSrc.getValueType().isScalarInteger())
       return DAG.getNode(ISD::TRUNCATE, SDLoc(N), NVT, BCSrc);
+
+    if (LegalTypes && BCSrc.getValueType().isInteger() &&
+        BCSrc.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+      // ext_elt (bitcast (scalar_to_vec i64 X to v2i64) to v4i32), TruncElt -->
+      // trunc i64 X to i32
+      SDValue X = BCSrc.getOperand(0);
+      assert(X.getValueType().isScalarInteger() && NVT.isScalarInteger() &&
+             "Extract element and scalar to vector can't change element type "
+             "from FP to integer.");
+      unsigned XBitWidth = X.getValueSizeInBits();
+      unsigned VecEltBitWidth = VT.getScalarSizeInBits();
+      BCTruncElt = IsLE ? 0 : XBitWidth / VecEltBitWidth - 1;
+
+      // An extract element return value type can be wider than its vector
+      // operand element type. In that case, the high bits are undefined, so
+      // it's possible that we may need to extend rather than truncate.
+      if (ExtractIndex == BCTruncElt && XBitWidth > VecEltBitWidth) {
+        assert(XBitWidth % VecEltBitWidth == 0 &&
+               "Scalar bitwidth must be a multiple of vector element bitwidth");
+        return DAG.getAnyExtOrTrunc(X, SDLoc(N), NVT);
+      }
+    }
   }
 
   // extract_vector_elt (insert_vector_elt vec, val, idx), idx) -> val
diff --git a/test/CodeGen/AArch64/extract-insert.ll b/test/CodeGen/AArch64/extract-insert.ll
index 91f6518edd8..077e5f3d042 100644
--- a/test/CodeGen/AArch64/extract-insert.ll
+++ b/test/CodeGen/AArch64/extract-insert.ll
@@ -12,8 +12,7 @@ define i32 @trunc_i64_to_i32_le(i64 %x) {
 ;
 ; LE-LABEL: trunc_i64_to_i32_le:
 ; LE:       // %bb.0:
-; LE-NEXT:    fmov d0, x0
-; LE-NEXT:    fmov w0, s0
+; LE-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; LE-NEXT:    ret
   %ins = insertelement <2 x i64> undef, i64 %x, i32 0
   %bc = bitcast <2 x i64> %ins to <4 x i32>
@@ -24,9 +23,7 @@ define i32 @trunc_i64_to_i32_le(i64 %x) {
 define i32 @trunc_i64_to_i32_be(i64 %x) {
 ; BE-LABEL: trunc_i64_to_i32_be:
 ; BE:       // %bb.0:
-; BE-NEXT:    fmov d0, x0
-; BE-NEXT:    rev64 v0.4s, v0.4s
-; BE-NEXT:    mov w0, v0.s[1]
+; BE-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; BE-NEXT:    ret
 ;
 ; LE-LABEL: trunc_i64_to_i32_be:
@@ -50,8 +47,7 @@ define i16 @trunc_i64_to_i16_le(i64 %x) {
 ;
 ; LE-LABEL: trunc_i64_to_i16_le:
 ; LE:       // %bb.0:
-; LE-NEXT:    fmov d0, x0
-; LE-NEXT:    umov w0, v0.h[0]
+; LE-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; LE-NEXT:    ret
   %ins = insertelement <2 x i64> undef, i64 %x, i32 0
   %bc = bitcast <2 x i64> %ins to <8 x i16>
@@ -62,9 +58,7 @@ define i16 @trunc_i64_to_i16_le(i64 %x) {
 define i16 @trunc_i64_to_i16_be(i64 %x) {
 ; BE-LABEL: trunc_i64_to_i16_be:
 ; BE:       // %bb.0:
-; BE-NEXT:    fmov d0, x0
-; BE-NEXT:    rev64 v0.8h, v0.8h
-; BE-NEXT:    umov w0, v0.h[3]
+; BE-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; BE-NEXT:    ret
 ;
 ; LE-LABEL: trunc_i64_to_i16_be:
@@ -88,8 +82,6 @@ define i8 @trunc_i32_to_i8_le(i32 %x) {
 ;
 ; LE-LABEL: trunc_i32_to_i8_le:
 ; LE:       // %bb.0:
-; LE-NEXT:    fmov s0, w0
-; LE-NEXT:    umov w0, v0.b[0]
 ; LE-NEXT:    ret
   %ins = insertelement <4 x i32> undef, i32 %x, i32 0
   %bc = bitcast <4 x i32> %ins to <16 x i8>
@@ -100,9 +92,6 @@ define i8 @trunc_i32_to_i8_le(i32 %x) {
 define i8 @trunc_i32_to_i8_be(i32 %x) {
 ; BE-LABEL: trunc_i32_to_i8_be:
 ; BE:       // %bb.0:
-; BE-NEXT:    fmov s0, w0
-; BE-NEXT:    rev32 v0.16b, v0.16b
-; BE-NEXT:    umov w0, v0.b[3]
 ; BE-NEXT:    ret
 ;
 ; LE-LABEL: trunc_i32_to_i8_be:
@@ -116,3 +105,22 @@ define i8 @trunc_i32_to_i8_be(i32 %x) {
   ret i8 %ext
 }
 
+; Weird type (non-power-of-2 vector) is ok.
+
+define i8 @trunc_i64_to_i8_be(i64 %x) {
+; BE-LABEL: trunc_i64_to_i8_be:
+; BE:       // %bb.0:
+; BE-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; BE-NEXT:    ret
+;
+; LE-LABEL: trunc_i64_to_i8_be:
+; LE:       // %bb.0:
+; LE-NEXT:    fmov d0, x0
+; LE-NEXT:    umov w0, v0.b[7]
+; LE-NEXT:    ret
+  %ins = insertelement <3 x i64> undef, i64 %x, i32 0
+  %bc = bitcast <3 x i64> %ins to <24 x i8>
+  %ext = extractelement <24 x i8> %bc, i32 7
+  ret i8 %ext
+}
+
diff --git a/test/CodeGen/X86/extract-insert.ll b/test/CodeGen/X86/extract-insert.ll
index 2393e32ebf6..be5f9ed24fb 100644
--- a/test/CodeGen/X86/extract-insert.ll
+++ b/test/CodeGen/X86/extract-insert.ll
@@ -68,8 +68,8 @@ define i32 @trunc_i64_to_i32_le(i64 %x) {
 ;
 ; X64-LABEL: trunc_i64_to_i32_le:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq %rdi, %xmm0
-; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-NEXT:    retq
   %ins = insertelement <2 x i64> undef, i64 %x, i32 0
   %bc = bitcast <2 x i64> %ins to <4 x i32>
@@ -86,9 +86,8 @@ define i16 @trunc_i64_to_i16_le(i64 %x) {
 ;
 ; X64-LABEL: trunc_i64_to_i16_le:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq %rdi, %xmm0
-; X64-NEXT:    movd %xmm0, %eax
-; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    # kill: def $ax killed $ax killed $rax
 ; X64-NEXT:    retq
   %ins = insertelement <2 x i64> undef, i64 %x, i32 0
   %bc = bitcast <2 x i64> %ins to <8 x i16>
diff --git a/test/CodeGen/X86/mmx-coalescing.ll b/test/CodeGen/X86/mmx-coalescing.ll
index 8f9204a4a85..8cd57aa8c53 100644
--- a/test/CodeGen/X86/mmx-coalescing.ll
+++ b/test/CodeGen/X86/mmx-coalescing.ll
@@ -16,16 +16,17 @@ define i32 @test(%SA* %pSA, i16* %A, i32 %B, i32 %C, i32 %D, i8* %E) {
 ; CHECK-NEXT:  # %bb.2: # %if.B
 ; CHECK-NEXT:    pshufw $238, %mm0, %mm0 # mm0 = mm0[2,3,2,3]
 ; CHECK-NEXT:    movq %mm0, %rax
-; CHECK-NEXT:    jmp .LBB0_3
+; CHECK-NEXT:    testl %eax, %eax
+; CHECK-NEXT:    jne .LBB0_4
 ; CHECK-NEXT:  .LBB0_1: # %if.A
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    movd %edx, %mm1
 ; CHECK-NEXT:    psllq %mm1, %mm0
 ; CHECK-NEXT:    movq %mm0, %rax
 ; CHECK-NEXT:    testq %rax, %rax
 ; CHECK-NEXT:    jne .LBB0_4
-; CHECK-NEXT:  .LBB0_3: # %if.C
-; CHECK-NEXT:    movq %rax, %xmm0
-; CHECK-NEXT:    movd %xmm0, %eax
+; CHECK-NEXT:  # %bb.3: # %if.C
+; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    testl %eax, %eax
 ; CHECK-NEXT:    je .LBB0_1
 ; CHECK-NEXT:  .LBB0_4: # %merge
-- 
GitLab


From 75cb0ad4ddf84b94818454d1dca4ec6c5dec7924 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Sun, 21 Oct 2018 21:07:25 +0000
Subject: [PATCH 0364/1116] foo

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344873 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp            | 12 ++--
 lib/Target/X86/X86MCInstLower.cpp             | 24 ++++++--
 .../X86/X86ShuffleDecodeConstantPool.cpp      | 58 +++++++++----------
 lib/Target/X86/X86ShuffleDecodeConstantPool.h | 13 +++--
 4 files changed, 62 insertions(+), 45 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 9e431162083..26a7c004e1f 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -6046,7 +6046,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
       break;
     }
     if (auto *C = getTargetConstantFromNode(MaskNode)) {
-      DecodeVPERMILPMask(C, MaskEltSize, Mask);
+      DecodeVPERMILPMask(C, MaskEltSize, VT.getSizeInBits(), Mask);
       break;
     }
     return false;
@@ -6063,7 +6063,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
       break;
     }
     if (auto *C = getTargetConstantFromNode(MaskNode)) {
-      DecodePSHUFBMask(C, Mask);
+      DecodePSHUFBMask(C, VT.getSizeInBits(), Mask);
       break;
     }
     return false;
@@ -6128,7 +6128,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
         break;
       }
       if (auto *C = getTargetConstantFromNode(MaskNode)) {
-        DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
+        DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, VT.getSizeInBits(), Mask);
         break;
       }
     }
@@ -6145,7 +6145,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
       break;
     }
     if (auto *C = getTargetConstantFromNode(MaskNode)) {
-      DecodeVPPERMMask(C, Mask);
+      DecodeVPPERMMask(C, VT.getSizeInBits(), Mask);
       break;
     }
     return false;
@@ -6163,7 +6163,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
       break;
     }
     if (auto *C = getTargetConstantFromNode(MaskNode)) {
-      DecodeVPERMVMask(C, MaskEltSize, Mask);
+      DecodeVPERMVMask(C, MaskEltSize, VT.getSizeInBits(), Mask);
       break;
     }
     return false;
@@ -6178,7 +6178,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
     SDValue MaskNode = N->getOperand(1);
     unsigned MaskEltSize = VT.getScalarSizeInBits();
     if (auto *C = getTargetConstantFromNode(MaskNode)) {
-      DecodeVPERMV3Mask(C, MaskEltSize, Mask);
+      DecodeVPERMV3Mask(C, MaskEltSize, VT.getSizeInBits(), Mask);
       break;
     }
     return false;
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index 58b1c505944..9c278116d7e 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -1594,6 +1594,18 @@ void X86AsmPrinter::EmitSEHInstruction(const MachineInstr *MI) {
   }
 }
 
+static unsigned getRegisterWidth(const MCOperandInfo &Info) {
+  if (Info.RegClass == X86::VR128RegClassID ||
+      Info.RegClass == X86::VR128XRegClassID)
+    return 128;
+  if (Info.RegClass == X86::VR256RegClassID ||
+      Info.RegClass == X86::VR256XRegClassID)
+    return 256;
+  if (Info.RegClass == X86::VR512RegClassID)
+    return 512;
+  llvm_unreachable("Unknown register class!");
+}
+
 void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
   X86MCInstLower MCInstLowering(*MF, *this);
   const X86RegisterInfo *RI =
@@ -1879,8 +1891,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
 
     const MachineOperand &MaskOp = MI->getOperand(MaskIdx);
     if (auto *C = getConstantFromPool(*MI, MaskOp)) {
+      unsigned Width = getRegisterWidth(MI->getDesc().OpInfo[0]);
       SmallVector<int, 64> Mask;
-      DecodePSHUFBMask(C, Mask);
+      DecodePSHUFBMask(C, Width, Mask);
       if (!Mask.empty())
         OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask),
                                 !EnablePrintSchedInfo);
@@ -1951,8 +1964,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
 
     const MachineOperand &MaskOp = MI->getOperand(MaskIdx);
     if (auto *C = getConstantFromPool(*MI, MaskOp)) {
+      unsigned Width = getRegisterWidth(MI->getDesc().OpInfo[0]);
       SmallVector<int, 16> Mask;
-      DecodeVPERMILPMask(C, ElSize, Mask);
+      DecodeVPERMILPMask(C, ElSize, Width, Mask);
       if (!Mask.empty())
         OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask),
                                 !EnablePrintSchedInfo);
@@ -1982,8 +1996,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
 
     const MachineOperand &MaskOp = MI->getOperand(6);
     if (auto *C = getConstantFromPool(*MI, MaskOp)) {
+      unsigned Width = getRegisterWidth(MI->getDesc().OpInfo[0]);
       SmallVector<int, 16> Mask;
-      DecodeVPERMIL2PMask(C, (unsigned)CtrlOp.getImm(), ElSize, Mask);
+      DecodeVPERMIL2PMask(C, (unsigned)CtrlOp.getImm(), ElSize, Width, Mask);
       if (!Mask.empty())
         OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask),
                                 !EnablePrintSchedInfo);
@@ -1999,8 +2014,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
 
     const MachineOperand &MaskOp = MI->getOperand(6);
     if (auto *C = getConstantFromPool(*MI, MaskOp)) {
+      unsigned Width = getRegisterWidth(MI->getDesc().OpInfo[0]);
       SmallVector<int, 16> Mask;
-      DecodeVPPERMMask(C, Mask);
+      DecodeVPPERMMask(C, Width, Mask);
       if (!Mask.empty())
         OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask),
                                 !EnablePrintSchedInfo);
diff --git a/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp b/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
index c7ddf93f8e8..720be8afa62 100644
--- a/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
+++ b/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
@@ -112,11 +112,10 @@ static bool extractConstantMask(const Constant *C, unsigned MaskEltSizeInBits,
   return true;
 }
 
-void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
-  Type *MaskTy = C->getType();
-  unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
-  (void)MaskTySize;
-  assert((MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512) &&
+void DecodePSHUFBMask(const Constant *C, unsigned Width,
+                      SmallVectorImpl<int> &ShuffleMask) {
+  assert((Width == 128 || Width == 256 || Width == 512) &&
+         C->getType()->getPrimitiveSizeInBits() >= Width &&
          "Unexpected vector size.");
 
   // The shuffle mask requires a byte vector.
@@ -125,7 +124,7 @@ void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
   if (!extractConstantMask(C, 8, UndefElts, RawMask))
     return;
 
-  unsigned NumElts = RawMask.size();
+  unsigned NumElts = Width / 8;
   assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
          "Unexpected number of vector elements.");
 
@@ -151,12 +150,10 @@ void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
   }
 }
 
-void DecodeVPERMILPMask(const Constant *C, unsigned ElSize,
+void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, unsigned Width,
                         SmallVectorImpl<int> &ShuffleMask) {
-  Type *MaskTy = C->getType();
-  unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
-  (void)MaskTySize;
-  assert((MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512) &&
+  assert((Width == 128 || Width == 256 || Width == 512) &&
+         C->getType()->getPrimitiveSizeInBits() >= Width &&
          "Unexpected vector size.");
   assert((ElSize == 32 || ElSize == 64) && "Unexpected vector element size.");
 
@@ -166,7 +163,7 @@ void DecodeVPERMILPMask(const Constant *C, unsigned ElSize,
   if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
     return;
 
-  unsigned NumElts = RawMask.size();
+  unsigned NumElts = Width / ElSize;
   unsigned NumEltsPerLane = 128 / ElSize;
   assert((NumElts == 2 || NumElts == 4 || NumElts == 8 || NumElts == 16) &&
          "Unexpected number of vector elements.");
@@ -189,11 +186,13 @@ void DecodeVPERMILPMask(const Constant *C, unsigned ElSize,
 }
 
 void DecodeVPERMIL2PMask(const Constant *C, unsigned M2Z, unsigned ElSize,
+                         unsigned Width,
                          SmallVectorImpl<int> &ShuffleMask) {
   Type *MaskTy = C->getType();
   unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
   (void)MaskTySize;
-  assert((MaskTySize == 128 || MaskTySize == 256) && "Unexpected vector size.");
+  assert((MaskTySize == 128 || MaskTySize == 256) &&
+         Width >= MaskTySize && "Unexpected vector size.");
 
   // The shuffle mask requires elements the same size as the target.
   APInt UndefElts;
@@ -201,7 +200,7 @@ void DecodeVPERMIL2PMask(const Constant *C, unsigned M2Z, unsigned ElSize,
   if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
     return;
 
-  unsigned NumElts = RawMask.size();
+  unsigned NumElts = Width / ElSize;
   unsigned NumEltsPerLane = 128 / ElSize;
   assert((NumElts == 2 || NumElts == 4 || NumElts == 8) &&
          "Unexpected number of vector elements.");
@@ -242,9 +241,12 @@ void DecodeVPERMIL2PMask(const Constant *C, unsigned M2Z, unsigned ElSize,
   }
 }
 
-void DecodeVPPERMMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
-  assert(C->getType()->getPrimitiveSizeInBits() == 128 &&
-         "Unexpected vector size.");
+void DecodeVPPERMMask(const Constant *C, unsigned Width,
+                      SmallVectorImpl<int> &ShuffleMask) {
+  Type *MaskTy = C->getType();
+  unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
+  (void)MaskTySize;
+  assert(Width == 128 && Width >= MaskTySize && "Unexpected vector size.");
 
   // The shuffle mask requires a byte vector.
   APInt UndefElts;
@@ -252,7 +254,7 @@ void DecodeVPPERMMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
   if (!extractConstantMask(C, 8, UndefElts, RawMask))
     return;
 
-  unsigned NumElts = RawMask.size();
+  unsigned NumElts = Width / 8;
   assert(NumElts == 16 && "Unexpected number of vector elements.");
 
   for (unsigned i = 0; i != NumElts; ++i) {
@@ -291,12 +293,10 @@ void DecodeVPPERMMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
   }
 }
 
-void DecodeVPERMVMask(const Constant *C, unsigned ElSize,
+void DecodeVPERMVMask(const Constant *C, unsigned ElSize, unsigned Width,
                       SmallVectorImpl<int> &ShuffleMask) {
-  Type *MaskTy = C->getType();
-  unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
-  (void)MaskTySize;
-  assert((MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512) &&
+  assert((Width == 128 || Width == 256 || Width == 512) &&
+         C->getType()->getPrimitiveSizeInBits() >= Width &&
          "Unexpected vector size.");
   assert((ElSize == 8 || ElSize == 16 || ElSize == 32 || ElSize == 64) &&
          "Unexpected vector element size.");
@@ -307,7 +307,7 @@ void DecodeVPERMVMask(const Constant *C, unsigned ElSize,
   if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
     return;
 
-  unsigned NumElts = RawMask.size();
+  unsigned NumElts = Width / ElSize;
 
   for (unsigned i = 0; i != NumElts; ++i) {
     if (UndefElts[i]) {
@@ -319,12 +319,10 @@ void DecodeVPERMVMask(const Constant *C, unsigned ElSize,
   }
 }
 
-void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize,
+void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize, unsigned Width,
                        SmallVectorImpl<int> &ShuffleMask) {
-  Type *MaskTy = C->getType();
-  unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
-  (void)MaskTySize;
-  assert((MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512) &&
+  assert((Width == 128 || Width == 256 || Width == 512) &&
+         C->getType()->getPrimitiveSizeInBits() >= Width &&
          "Unexpected vector size.");
   assert((ElSize == 8 || ElSize == 16 || ElSize == 32 || ElSize == 64) &&
          "Unexpected vector element size.");
@@ -335,7 +333,7 @@ void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize,
   if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
     return;
 
-  unsigned NumElts = RawMask.size();
+  unsigned NumElts = Width / ElSize;
 
   for (unsigned i = 0; i != NumElts; ++i) {
     if (UndefElts[i]) {
diff --git a/lib/Target/X86/X86ShuffleDecodeConstantPool.h b/lib/Target/X86/X86ShuffleDecodeConstantPool.h
index b703cbbd2b2..b08c31935d2 100644
--- a/lib/Target/X86/X86ShuffleDecodeConstantPool.h
+++ b/lib/Target/X86/X86ShuffleDecodeConstantPool.h
@@ -26,25 +26,28 @@ class Constant;
 class MVT;
 
 /// Decode a PSHUFB mask from an IR-level vector constant.
-void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask);
+void DecodePSHUFBMask(const Constant *C, unsigned Width,
+                      SmallVectorImpl<int> &ShuffleMask);
 
 /// Decode a VPERMILP variable mask from an IR-level vector constant.
-void DecodeVPERMILPMask(const Constant *C, unsigned ElSize,
+void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, unsigned Width,
                         SmallVectorImpl<int> &ShuffleMask);
 
 /// Decode a VPERMILP2 variable mask from an IR-level vector constant.
 void DecodeVPERMIL2PMask(const Constant *C, unsigned MatchImm, unsigned ElSize,
+                         unsigned Width,
                          SmallVectorImpl<int> &ShuffleMask);
 
 /// Decode a VPPERM variable mask from an IR-level vector constant.
-void DecodeVPPERMMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask);
+void DecodeVPPERMMask(const Constant *C, unsigned Width,
+                      SmallVectorImpl<int> &ShuffleMask);
 
 /// Decode a VPERM W/D/Q/PS/PD mask from an IR-level vector constant.
-void DecodeVPERMVMask(const Constant *C, unsigned ElSize,
+void DecodeVPERMVMask(const Constant *C, unsigned ElSize, unsigned Width,
                       SmallVectorImpl<int> &ShuffleMask);
 
 /// Decode a VPERMT2 W/D/Q/PS/PD mask from an IR-level vector constant.
-void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize,
+void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize, unsigned Width,
                        SmallVectorImpl<int> &ShuffleMask);
 
 } // llvm namespace
-- 
GitLab


From 80da74b74940c9abe2d2863cd12c8410101c5382 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Sun, 21 Oct 2018 21:07:27 +0000
Subject: [PATCH 0365/1116] [X86] Remove SDIVREM8_SEXT_HREG/UDIVREM8_ZEXT_HREG
 and their associated DAG combine and target bits support. Use a post isel
 peephole instead.

Summary:
These nodes exist to overcome an isel problem where we can generate a zero extend of an AH register followed by an extract subreg, and another zero extend. The first zero extend exists to avoid a partial register update copying the AH register into the low 8-bits. The second zero extend exists if the user wanted the remainder zero extended.

To make this work we had a DAG combine to morph the DIVREM opcode to a special opcode that included the extend. But then we had to add the new node to computeKnownBits and computeNumSignBits to process the extension portion.

This patch instead removes all of that and adds a late peephole to detect the two extends.

Reviewers: RKSimon, spatel

Reviewed By: RKSimon

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D53449

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344874 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelDAGToDAG.cpp | 72 ++++++++++++++++++++++--------
 lib/Target/X86/X86ISelLowering.cpp | 51 ---------------------
 lib/Target/X86/X86ISelLowering.h   |  4 --
 3 files changed, 54 insertions(+), 73 deletions(-)

diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index efd6349871f..2034d85458a 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -470,6 +470,8 @@ namespace {
     MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
                                 const SDLoc &dl, MVT VT, SDNode *Node,
                                 SDValue &InFlag);
+
+    bool tryOptimizeRem8Extend(SDNode *N);
   };
 }
 
@@ -841,22 +843,63 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
   }
 }
 
+// Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
+bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
+  unsigned Opc = N->getMachineOpcode();
+  if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
+      Opc != X86::MOVSX64rr8)
+    return false;
+
+  SDValue N0 = N->getOperand(0);
+
+  // We need to be extracting the lower bit of an extend.
+  if (!N0.isMachineOpcode() ||
+      N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
+      N0.getConstantOperandVal(1) != X86::sub_8bit)
+    return false;
+
+  // We're looking for either a movsx or movzx to match the original opcode.
+  unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
+                                                : X86::MOVSX32rr8_NOREX;
+  SDValue N00 = N0.getOperand(0);
+  if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
+    return false;
+
+  if (Opc == X86::MOVSX64rr8) {
+    // If we had a sign extend from 8 to 64 bits. We still need to go from 32
+    // to 64.
+    MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N),
+                                                   MVT::i64, N00);
+    ReplaceUses(N, Extend);
+  } else {
+    // Ok we can drop this extend and just use the original extend.
+    ReplaceUses(N, N00.getNode());
+  }
+
+  return true;
+}
 
 void X86DAGToDAGISel::PostprocessISelDAG() {
   // Skip peepholes at -O0.
   if (TM.getOptLevel() == CodeGenOpt::None)
     return;
 
-  // Attempt to remove vectors moves that were inserted to zero upper bits.
-
   SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
 
+  bool MadeChange = false;
   while (Position != CurDAG->allnodes_begin()) {
     SDNode *N = &*--Position;
     // Skip dead nodes and any non-machine opcodes.
     if (N->use_empty() || !N->isMachineOpcode())
       continue;
 
+    if (tryOptimizeRem8Extend(N)) {
+      MadeChange = true;
+      continue;
+    }
+
+    // Attempt to remove vectors moves that were inserted to zero upper bits.
+
     if (N->getMachineOpcode() != TargetOpcode::SUBREG_TO_REG)
       continue;
 
@@ -905,11 +948,11 @@ void X86DAGToDAGISel::PostprocessISelDAG() {
     // Producing instruction is another vector instruction. We can drop the
     // move.
     CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));
-
-    // If the move is now dead, delete it.
-    if (Move.getNode()->use_empty())
-      CurDAG->RemoveDeadNode(Move.getNode());
+    MadeChange = true;
   }
+
+  if (MadeChange)
+    CurDAG->RemoveDeadNodes();
 }
 
 
@@ -3370,15 +3413,12 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
   }
 
   case ISD::SDIVREM:
-  case ISD::UDIVREM:
-  case X86ISD::SDIVREM8_SEXT_HREG:
-  case X86ISD::UDIVREM8_ZEXT_HREG: {
+  case ISD::UDIVREM: {
     SDValue N0 = Node->getOperand(0);
     SDValue N1 = Node->getOperand(1);
 
     unsigned Opc, MOpc;
-    bool isSigned = (Opcode == ISD::SDIVREM ||
-                     Opcode == X86ISD::SDIVREM8_SEXT_HREG);
+    bool isSigned = Opcode == ISD::SDIVREM;
     if (!isSigned) {
       switch (NVT.SimpleTy) {
       default: llvm_unreachable("Unsupported VT!");
@@ -3517,13 +3557,9 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
       SDValue Result(RNode, 0);
       InFlag = SDValue(RNode, 1);
 
-      if (Opcode == X86ISD::UDIVREM8_ZEXT_HREG ||
-          Opcode == X86ISD::SDIVREM8_SEXT_HREG) {
-        assert(Node->getValueType(1) == MVT::i32 && "Unexpected result type!");
-      } else {
-        Result =
-            CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
-      }
+      Result =
+          CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
+
       ReplaceUses(SDValue(Node, 1), Result);
       LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
                  dbgs() << '\n');
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 26a7c004e1f..23fc773b08e 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -26638,8 +26638,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::UMUL:               return "X86ISD::UMUL";
   case X86ISD::SMUL8:              return "X86ISD::SMUL8";
   case X86ISD::UMUL8:              return "X86ISD::UMUL8";
-  case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
-  case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
   case X86ISD::INC:                return "X86ISD::INC";
   case X86ISD::DEC:                return "X86ISD::DEC";
   case X86ISD::OR:                 return "X86ISD::OR";
@@ -29583,13 +29581,6 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
     Known.Zero &= Known2.Zero;
     break;
   }
-  case X86ISD::UDIVREM8_ZEXT_HREG:
-    // TODO: Support more than just the zero extended bits?
-    if (Op.getResNo() != 1)
-      break;
-    // The remainder is zero extended.
-    Known.Zero.setBitsFrom(8);
-    break;
   }
 
   // Handle target shuffles.
@@ -29720,12 +29711,6 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
     unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
     return std::min(Tmp0, Tmp1);
   }
-  case X86ISD::SDIVREM8_SEXT_HREG:
-    // TODO: Support more than just the sign extended bits?
-    if (Op.getResNo() != 1)
-      break;
-    // The remainder is sign extended.
-    return VTBits - 7;
   }
 
   // Fallback case.
@@ -38242,36 +38227,6 @@ static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
   return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
 }
 
-/// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) ->
-/// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)
-/// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
-/// extends from AH (which we otherwise need to do contortions to access).
-static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
-  SDValue N0 = N->getOperand(0);
-  auto OpcodeN = N->getOpcode();
-  auto OpcodeN0 = N0.getOpcode();
-  if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
-        (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
-    return SDValue();
-
-  EVT VT = N->getValueType(0);
-  EVT InVT = N0.getValueType();
-  if (N0.getResNo() != 1 || InVT != MVT::i8 ||
-      !(VT == MVT::i32 || VT == MVT::i64))
-    return SDValue();
-
-  SDVTList NodeTys = DAG.getVTList(MVT::i8, MVT::i32);
-  auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
-                                               : X86ISD::UDIVREM8_ZEXT_HREG;
-  SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
-                          N0.getOperand(1));
-  DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
-  // If this was a 64-bit extend, complete it.
-  if (VT == MVT::i64)
-    return DAG.getNode(OpcodeN, SDLoc(N), VT, R.getValue(1));
-  return R.getValue(1);
-}
-
 // If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
 // operands and the result of CMOV is not used anywhere else - promote CMOV
 // itself instead of promoting its result. This could be beneficial, because:
@@ -38572,9 +38527,6 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
   EVT InVT = N0.getValueType();
   SDLoc DL(N);
 
-  if (SDValue DivRem8 = getDivRem8(N, DAG))
-    return DivRem8;
-
   if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
     return NewCMov;
 
@@ -38775,9 +38727,6 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
     if (SDValue R = WidenMaskArithmetic(N, DAG, Subtarget))
       return R;
 
-  if (SDValue DivRem8 = getDivRem8(N, DAG))
-    return DivRem8;
-
   if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
     return NewAdd;
 
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index b5e9eb3b86f..3e6c8929a9b 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -361,10 +361,6 @@ namespace llvm {
       // 8-bit SMUL/UMUL - AX, FLAGS = smul8/umul8 AL, RHS.
       SMUL8, UMUL8,
 
-      // 8-bit divrem that zero-extend the high result (AH).
-      UDIVREM8_ZEXT_HREG,
-      SDIVREM8_SEXT_HREG,
-
       // X86-specific multiply by immediate.
       MUL_IMM,
 
-- 
GitLab


From de6038d9af3596ac564c948b8da86349c82f6b65 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Sun, 21 Oct 2018 21:08:37 +0000
Subject: [PATCH 0366/1116] Revert r344873 "foo"

Rebase gone wrong left this in my tree.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344875 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp            | 12 ++--
 lib/Target/X86/X86MCInstLower.cpp             | 24 ++------
 .../X86/X86ShuffleDecodeConstantPool.cpp      | 58 ++++++++++---------
 lib/Target/X86/X86ShuffleDecodeConstantPool.h | 13 ++---
 4 files changed, 45 insertions(+), 62 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 23fc773b08e..69288018fe5 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -6046,7 +6046,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
       break;
     }
     if (auto *C = getTargetConstantFromNode(MaskNode)) {
-      DecodeVPERMILPMask(C, MaskEltSize, VT.getSizeInBits(), Mask);
+      DecodeVPERMILPMask(C, MaskEltSize, Mask);
       break;
     }
     return false;
@@ -6063,7 +6063,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
       break;
     }
     if (auto *C = getTargetConstantFromNode(MaskNode)) {
-      DecodePSHUFBMask(C, VT.getSizeInBits(), Mask);
+      DecodePSHUFBMask(C, Mask);
       break;
     }
     return false;
@@ -6128,7 +6128,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
         break;
       }
       if (auto *C = getTargetConstantFromNode(MaskNode)) {
-        DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, VT.getSizeInBits(), Mask);
+        DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
         break;
       }
     }
@@ -6145,7 +6145,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
       break;
     }
     if (auto *C = getTargetConstantFromNode(MaskNode)) {
-      DecodeVPPERMMask(C, VT.getSizeInBits(), Mask);
+      DecodeVPPERMMask(C, Mask);
       break;
     }
     return false;
@@ -6163,7 +6163,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
       break;
     }
     if (auto *C = getTargetConstantFromNode(MaskNode)) {
-      DecodeVPERMVMask(C, MaskEltSize, VT.getSizeInBits(), Mask);
+      DecodeVPERMVMask(C, MaskEltSize, Mask);
       break;
     }
     return false;
@@ -6178,7 +6178,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
     SDValue MaskNode = N->getOperand(1);
     unsigned MaskEltSize = VT.getScalarSizeInBits();
     if (auto *C = getTargetConstantFromNode(MaskNode)) {
-      DecodeVPERMV3Mask(C, MaskEltSize, VT.getSizeInBits(), Mask);
+      DecodeVPERMV3Mask(C, MaskEltSize, Mask);
       break;
     }
     return false;
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index 9c278116d7e..58b1c505944 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -1594,18 +1594,6 @@ void X86AsmPrinter::EmitSEHInstruction(const MachineInstr *MI) {
   }
 }
 
-static unsigned getRegisterWidth(const MCOperandInfo &Info) {
-  if (Info.RegClass == X86::VR128RegClassID ||
-      Info.RegClass == X86::VR128XRegClassID)
-    return 128;
-  if (Info.RegClass == X86::VR256RegClassID ||
-      Info.RegClass == X86::VR256XRegClassID)
-    return 256;
-  if (Info.RegClass == X86::VR512RegClassID)
-    return 512;
-  llvm_unreachable("Unknown register class!");
-}
-
 void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
   X86MCInstLower MCInstLowering(*MF, *this);
   const X86RegisterInfo *RI =
@@ -1891,9 +1879,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
 
     const MachineOperand &MaskOp = MI->getOperand(MaskIdx);
     if (auto *C = getConstantFromPool(*MI, MaskOp)) {
-      unsigned Width = getRegisterWidth(MI->getDesc().OpInfo[0]);
       SmallVector<int, 64> Mask;
-      DecodePSHUFBMask(C, Width, Mask);
+      DecodePSHUFBMask(C, Mask);
       if (!Mask.empty())
         OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask),
                                 !EnablePrintSchedInfo);
@@ -1964,9 +1951,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
 
     const MachineOperand &MaskOp = MI->getOperand(MaskIdx);
     if (auto *C = getConstantFromPool(*MI, MaskOp)) {
-      unsigned Width = getRegisterWidth(MI->getDesc().OpInfo[0]);
       SmallVector<int, 16> Mask;
-      DecodeVPERMILPMask(C, ElSize, Width, Mask);
+      DecodeVPERMILPMask(C, ElSize, Mask);
       if (!Mask.empty())
         OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask),
                                 !EnablePrintSchedInfo);
@@ -1996,9 +1982,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
 
     const MachineOperand &MaskOp = MI->getOperand(6);
     if (auto *C = getConstantFromPool(*MI, MaskOp)) {
-      unsigned Width = getRegisterWidth(MI->getDesc().OpInfo[0]);
       SmallVector<int, 16> Mask;
-      DecodeVPERMIL2PMask(C, (unsigned)CtrlOp.getImm(), ElSize, Width, Mask);
+      DecodeVPERMIL2PMask(C, (unsigned)CtrlOp.getImm(), ElSize, Mask);
       if (!Mask.empty())
         OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask),
                                 !EnablePrintSchedInfo);
@@ -2014,9 +1999,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
 
     const MachineOperand &MaskOp = MI->getOperand(6);
     if (auto *C = getConstantFromPool(*MI, MaskOp)) {
-      unsigned Width = getRegisterWidth(MI->getDesc().OpInfo[0]);
       SmallVector<int, 16> Mask;
-      DecodeVPPERMMask(C, Width, Mask);
+      DecodeVPPERMMask(C, Mask);
       if (!Mask.empty())
         OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask),
                                 !EnablePrintSchedInfo);
diff --git a/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp b/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
index 720be8afa62..c7ddf93f8e8 100644
--- a/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
+++ b/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
@@ -112,10 +112,11 @@ static bool extractConstantMask(const Constant *C, unsigned MaskEltSizeInBits,
   return true;
 }
 
-void DecodePSHUFBMask(const Constant *C, unsigned Width,
-                      SmallVectorImpl<int> &ShuffleMask) {
-  assert((Width == 128 || Width == 256 || Width == 512) &&
-         C->getType()->getPrimitiveSizeInBits() >= Width &&
+void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
+  Type *MaskTy = C->getType();
+  unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
+  (void)MaskTySize;
+  assert((MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512) &&
          "Unexpected vector size.");
 
   // The shuffle mask requires a byte vector.
@@ -124,7 +125,7 @@ void DecodePSHUFBMask(const Constant *C, unsigned Width,
   if (!extractConstantMask(C, 8, UndefElts, RawMask))
     return;
 
-  unsigned NumElts = Width / 8;
+  unsigned NumElts = RawMask.size();
   assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
          "Unexpected number of vector elements.");
 
@@ -150,10 +151,12 @@ void DecodePSHUFBMask(const Constant *C, unsigned Width,
   }
 }
 
-void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, unsigned Width,
+void DecodeVPERMILPMask(const Constant *C, unsigned ElSize,
                         SmallVectorImpl<int> &ShuffleMask) {
-  assert((Width == 128 || Width == 256 || Width == 512) &&
-         C->getType()->getPrimitiveSizeInBits() >= Width &&
+  Type *MaskTy = C->getType();
+  unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
+  (void)MaskTySize;
+  assert((MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512) &&
          "Unexpected vector size.");
   assert((ElSize == 32 || ElSize == 64) && "Unexpected vector element size.");
 
@@ -163,7 +166,7 @@ void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, unsigned Width,
   if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
     return;
 
-  unsigned NumElts = Width / ElSize;
+  unsigned NumElts = RawMask.size();
   unsigned NumEltsPerLane = 128 / ElSize;
   assert((NumElts == 2 || NumElts == 4 || NumElts == 8 || NumElts == 16) &&
          "Unexpected number of vector elements.");
@@ -186,13 +189,11 @@ void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, unsigned Width,
 }
 
 void DecodeVPERMIL2PMask(const Constant *C, unsigned M2Z, unsigned ElSize,
-                         unsigned Width,
                          SmallVectorImpl<int> &ShuffleMask) {
   Type *MaskTy = C->getType();
   unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
   (void)MaskTySize;
-  assert((MaskTySize == 128 || MaskTySize == 256) &&
-         Width >= MaskTySize && "Unexpected vector size.");
+  assert((MaskTySize == 128 || MaskTySize == 256) && "Unexpected vector size.");
 
   // The shuffle mask requires elements the same size as the target.
   APInt UndefElts;
@@ -200,7 +201,7 @@ void DecodeVPERMIL2PMask(const Constant *C, unsigned M2Z, unsigned ElSize,
   if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
     return;
 
-  unsigned NumElts = Width / ElSize;
+  unsigned NumElts = RawMask.size();
   unsigned NumEltsPerLane = 128 / ElSize;
   assert((NumElts == 2 || NumElts == 4 || NumElts == 8) &&
          "Unexpected number of vector elements.");
@@ -241,12 +242,9 @@ void DecodeVPERMIL2PMask(const Constant *C, unsigned M2Z, unsigned ElSize,
   }
 }
 
-void DecodeVPPERMMask(const Constant *C, unsigned Width,
-                      SmallVectorImpl<int> &ShuffleMask) {
-  Type *MaskTy = C->getType();
-  unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
-  (void)MaskTySize;
-  assert(Width == 128 && Width >= MaskTySize && "Unexpected vector size.");
+void DecodeVPPERMMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
+  assert(C->getType()->getPrimitiveSizeInBits() == 128 &&
+         "Unexpected vector size.");
 
   // The shuffle mask requires a byte vector.
   APInt UndefElts;
@@ -254,7 +252,7 @@ void DecodeVPPERMMask(const Constant *C, unsigned Width,
   if (!extractConstantMask(C, 8, UndefElts, RawMask))
     return;
 
-  unsigned NumElts = Width / 8;
+  unsigned NumElts = RawMask.size();
   assert(NumElts == 16 && "Unexpected number of vector elements.");
 
   for (unsigned i = 0; i != NumElts; ++i) {
@@ -293,10 +291,12 @@ void DecodeVPPERMMask(const Constant *C, unsigned Width,
   }
 }
 
-void DecodeVPERMVMask(const Constant *C, unsigned ElSize, unsigned Width,
+void DecodeVPERMVMask(const Constant *C, unsigned ElSize,
                       SmallVectorImpl<int> &ShuffleMask) {
-  assert((Width == 128 || Width == 256 || Width == 512) &&
-         C->getType()->getPrimitiveSizeInBits() >= Width &&
+  Type *MaskTy = C->getType();
+  unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
+  (void)MaskTySize;
+  assert((MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512) &&
          "Unexpected vector size.");
   assert((ElSize == 8 || ElSize == 16 || ElSize == 32 || ElSize == 64) &&
          "Unexpected vector element size.");
@@ -307,7 +307,7 @@ void DecodeVPERMVMask(const Constant *C, unsigned ElSize, unsigned Width,
   if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
     return;
 
-  unsigned NumElts = Width / ElSize;
+  unsigned NumElts = RawMask.size();
 
   for (unsigned i = 0; i != NumElts; ++i) {
     if (UndefElts[i]) {
@@ -319,10 +319,12 @@ void DecodeVPERMVMask(const Constant *C, unsigned ElSize, unsigned Width,
   }
 }
 
-void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize, unsigned Width,
+void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize,
                        SmallVectorImpl<int> &ShuffleMask) {
-  assert((Width == 128 || Width == 256 || Width == 512) &&
-         C->getType()->getPrimitiveSizeInBits() >= Width &&
+  Type *MaskTy = C->getType();
+  unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
+  (void)MaskTySize;
+  assert((MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512) &&
          "Unexpected vector size.");
   assert((ElSize == 8 || ElSize == 16 || ElSize == 32 || ElSize == 64) &&
          "Unexpected vector element size.");
@@ -333,7 +335,7 @@ void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize, unsigned Width,
   if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
     return;
 
-  unsigned NumElts = Width / ElSize;
+  unsigned NumElts = RawMask.size();
 
   for (unsigned i = 0; i != NumElts; ++i) {
     if (UndefElts[i]) {
diff --git a/lib/Target/X86/X86ShuffleDecodeConstantPool.h b/lib/Target/X86/X86ShuffleDecodeConstantPool.h
index b08c31935d2..b703cbbd2b2 100644
--- a/lib/Target/X86/X86ShuffleDecodeConstantPool.h
+++ b/lib/Target/X86/X86ShuffleDecodeConstantPool.h
@@ -26,28 +26,25 @@ class Constant;
 class MVT;
 
 /// Decode a PSHUFB mask from an IR-level vector constant.
-void DecodePSHUFBMask(const Constant *C, unsigned Width,
-                      SmallVectorImpl<int> &ShuffleMask);
+void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask);
 
 /// Decode a VPERMILP variable mask from an IR-level vector constant.
-void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, unsigned Width,
+void DecodeVPERMILPMask(const Constant *C, unsigned ElSize,
                         SmallVectorImpl<int> &ShuffleMask);
 
 /// Decode a VPERMILP2 variable mask from an IR-level vector constant.
 void DecodeVPERMIL2PMask(const Constant *C, unsigned MatchImm, unsigned ElSize,
-                         unsigned Width,
                          SmallVectorImpl<int> &ShuffleMask);
 
 /// Decode a VPPERM variable mask from an IR-level vector constant.
-void DecodeVPPERMMask(const Constant *C, unsigned Width,
-                      SmallVectorImpl<int> &ShuffleMask);
+void DecodeVPPERMMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask);
 
 /// Decode a VPERM W/D/Q/PS/PD mask from an IR-level vector constant.
-void DecodeVPERMVMask(const Constant *C, unsigned ElSize, unsigned Width,
+void DecodeVPERMVMask(const Constant *C, unsigned ElSize,
                       SmallVectorImpl<int> &ShuffleMask);
 
 /// Decode a VPERMT2 W/D/Q/PS/PD mask from an IR-level vector constant.
-void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize, unsigned Width,
+void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize,
                        SmallVectorImpl<int> &ShuffleMask);
 
 } // llvm namespace
-- 
GitLab


From e01c86dd475439245e4bda88f03555f2377b9613 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Sun, 21 Oct 2018 21:30:26 +0000
Subject: [PATCH 0367/1116] [X86] Stop promoting integer loads to vXi64

Summary:
Theoretically this was done to simplify the amount of isel patterns that were needed. But it also meant a substantial number of our isel patterns have to match an explicit bitcast. By making the vXi32/vXi16/vXi8 types legal for loads, DAG combiner should be able to change the load type to remove the bitcast.

I had to add some additional plain load instruction patterns and a few other special cases, but overall the isel table has reduced in size by ~12000 bytes. So it looks like this promotion was hurting us more than helping.

I still have one crash in vector-trunc.ll that I'm hoping @RKSimon can help with. It seems to relate to using getTargetConstantFromNode on a load that was shrunk due to an extract_subvector combine after the constant pool entry was created. So we end up decoding more mask elements than the load size.

I'm hoping this patch will simplify the number of patterns needed to remove the and/or/xor promotion.

Reviewers: RKSimon, spatel

Reviewed By: RKSimon

Subscribers: llvm-commits, RKSimon

Differential Revision: https://reviews.llvm.org/D53306

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344877 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelDAGToDAG.cpp            |  28 +-
 lib/Target/X86/X86ISelLowering.cpp            |  28 +-
 lib/Target/X86/X86InstrAVX512.td              | 243 ++++---
 lib/Target/X86/X86InstrFragmentsSIMD.td       |  54 +-
 lib/Target/X86/X86InstrSSE.td                 | 627 ++++++++++--------
 lib/Target/X86/X86InstrXOP.td                 |  75 ++-
 lib/Target/X86/X86MCInstLower.cpp             |  26 +-
 .../X86/X86ShuffleDecodeConstantPool.cpp      |  58 +-
 lib/Target/X86/X86ShuffleDecodeConstantPool.h |  13 +-
 test/CodeGen/X86/avx-vperm2x128.ll            |   2 +-
 test/CodeGen/X86/oddshuffles.ll               |  24 +-
 test/CodeGen/X86/pshufb-mask-comments.ll      |   6 +-
 test/CodeGen/X86/vector-extend-inreg.ll       |   2 +-
 test/CodeGen/X86/vector-idiv-v2i32.ll         |  18 +-
 test/CodeGen/X86/widened-broadcast.ll         |  95 +--
 15 files changed, 710 insertions(+), 589 deletions(-)

diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 2034d85458a..5e9fbf83f90 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -2855,21 +2855,17 @@ MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
   const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
   Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
 
-  // If there is a load, it will be behind a bitcast. We don't need to check
-  // alignment on this load.
+  // Try to fold a load. No need to check alignment.
   SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
-  if (MayFoldLoad && N1->getOpcode() == ISD::BITCAST && N1->hasOneUse() &&
-      tryFoldLoad(Node, N1.getNode(), N1.getOperand(0), Tmp0, Tmp1, Tmp2,
-                  Tmp3, Tmp4)) {
-    SDValue Load = N1.getOperand(0);
+  if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
     SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
-                      Load.getOperand(0) };
+                      N1.getOperand(0) };
     SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
     MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
     // Update the chain.
-    ReplaceUses(Load.getValue(1), SDValue(CNode, 2));
+    ReplaceUses(N1.getValue(1), SDValue(CNode, 2));
     // Record the mem-refs
-    CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(Load)->getMemOperand()});
+    CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
     return CNode;
   }
 
@@ -2892,22 +2888,18 @@ MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
   const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
   Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
 
-  // If there is a load, it will be behind a bitcast. We don't need to check
-  // alignment on this load.
+  // Try to fold a load. No need to check alignment.
   SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
-  if (MayFoldLoad && N2->getOpcode() == ISD::BITCAST && N2->hasOneUse() &&
-      tryFoldLoad(Node, N2.getNode(), N2.getOperand(0), Tmp0, Tmp1, Tmp2,
-                  Tmp3, Tmp4)) {
-    SDValue Load = N2.getOperand(0);
+  if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
     SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
-                      Load.getOperand(0), InFlag };
+                      N2.getOperand(0), InFlag };
     SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
     MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
     InFlag = SDValue(CNode, 3);
     // Update the chain.
-    ReplaceUses(Load.getValue(1), SDValue(CNode, 2));
+    ReplaceUses(N2.getValue(1), SDValue(CNode, 2));
     // Record the mem-refs
-    CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(Load)->getMemOperand()});
+    CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()});
     return CNode;
   }
 
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 69288018fe5..ae5795db4ab 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -869,11 +869,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
     }
 
-    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
-    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
-      setOperationPromotedToType(ISD::LOAD,   VT, MVT::v2i64);
-    }
-
     // Custom lower v2i64 and v2f64 selects.
     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
@@ -1178,11 +1173,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     if (HasInt256)
       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
 
-    // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
-    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
-      setOperationPromotedToType(ISD::LOAD,   VT, MVT::v4i64);
-    }
-
     if (HasInt256) {
       // Custom legalize 2x32 to get a little better code.
       setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
@@ -1419,10 +1409,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::MGATHER,             VT, Custom);
       setOperationAction(ISD::MSCATTER,            VT, Custom);
     }
-    for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
-      setOperationPromotedToType(ISD::LOAD,   VT, MVT::v8i64);
-    }
-
     // Need to custom split v32i16/v64i8 bitcasts.
     if (!Subtarget.hasBWI()) {
       setOperationAction(ISD::BITCAST, MVT::v32i16, Custom);
@@ -5539,7 +5525,7 @@ static const Constant *getTargetConstantFromNode(SDValue Op) {
   if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
     return nullptr;
 
-  return dyn_cast<Constant>(CNode->getConstVal());
+  return CNode->getConstVal();
 }
 
 // Extract raw constant bits from constant pools.
@@ -6046,7 +6032,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
       break;
     }
     if (auto *C = getTargetConstantFromNode(MaskNode)) {
-      DecodeVPERMILPMask(C, MaskEltSize, Mask);
+      DecodeVPERMILPMask(C, MaskEltSize, VT.getSizeInBits(), Mask);
       break;
     }
     return false;
@@ -6063,7 +6049,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
       break;
     }
     if (auto *C = getTargetConstantFromNode(MaskNode)) {
-      DecodePSHUFBMask(C, Mask);
+      DecodePSHUFBMask(C, VT.getSizeInBits(), Mask);
       break;
     }
     return false;
@@ -6128,7 +6114,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
         break;
       }
       if (auto *C = getTargetConstantFromNode(MaskNode)) {
-        DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
+        DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, VT.getSizeInBits(), Mask);
         break;
       }
     }
@@ -6145,7 +6131,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
       break;
     }
     if (auto *C = getTargetConstantFromNode(MaskNode)) {
-      DecodeVPPERMMask(C, Mask);
+      DecodeVPPERMMask(C, VT.getSizeInBits(), Mask);
       break;
     }
     return false;
@@ -6163,7 +6149,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
       break;
     }
     if (auto *C = getTargetConstantFromNode(MaskNode)) {
-      DecodeVPERMVMask(C, MaskEltSize, Mask);
+      DecodeVPERMVMask(C, MaskEltSize, VT.getSizeInBits(), Mask);
       break;
     }
     return false;
@@ -6178,7 +6164,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
     SDValue MaskNode = N->getOperand(1);
     unsigned MaskEltSize = VT.getScalarSizeInBits();
     if (auto *C = getTargetConstantFromNode(MaskNode)) {
-      DecodeVPERMV3Mask(C, MaskEltSize, Mask);
+      DecodeVPERMV3Mask(C, MaskEltSize, VT.getSizeInBits(), Mask);
       break;
     }
     return false;
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index f617de7dd7d..4c4c7e75ffc 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -66,21 +66,16 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
                            !if (!eq (EltTypeName, "f64"), !cast<Operand>("sdmem"), ?));
 
   // Load patterns
-  // Note: For 128/256-bit integer VT we choose loadv2i64/loadv4i64
-  //       due to load promotion during legalization
-  PatFrag LdFrag = !cast<PatFrag>("load" #
-                                  !if (!eq (TypeVariantName, "i"),
-                                       !if (!eq (Size, 128), "v2i64",
-                                       !if (!eq (Size, 256), "v4i64",
-                                       !if (!eq (Size, 512), "v8i64",
-                                            VTName))), VTName));
-
-  PatFrag AlignedLdFrag = !cast<PatFrag>("alignedload" #
-                                         !if (!eq (TypeVariantName, "i"),
-                                               !if (!eq (Size, 128), "v2i64",
-                                               !if (!eq (Size, 256), "v4i64",
-                                               !if (!eq (Size, 512), "v8i64",
-                                                   VTName))), VTName));
+  PatFrag LdFrag = !cast<PatFrag>("load" # VTName);
+
+  PatFrag i64LdFrag = !cast<PatFrag>("load" #
+                                     !if (!eq (TypeVariantName, "i"),
+                                          !if (!eq (Size, 128), "v2i64",
+                                          !if (!eq (Size, 256), "v4i64",
+                                          !if (!eq (Size, 512), "v8i64",
+                                               VTName))), VTName));
+
+  PatFrag AlignedLdFrag = !cast<PatFrag>("alignedload" # VTName);
 
   PatFrag ScalarLdFrag = !cast<PatFrag>("load" # EltVT);
 
@@ -518,10 +513,10 @@ multiclass vinsert_for_size_split<int Opcode, X86VectorVTInfo From,
                    "vinsert" # From.EltTypeName # "x" # From.NumElts,
                    "$src3, $src2, $src1", "$src1, $src2, $src3",
                    (vinsert_insert:$src3 (To.VT To.RC:$src1),
-                               (From.VT (bitconvert (From.LdFrag addr:$src2))),
+                               (From.VT (From.LdFrag addr:$src2)),
                                (iPTR imm)),
                    (vinsert_for_mask:$src3 (To.VT To.RC:$src1),
-                               (From.VT (bitconvert (From.LdFrag addr:$src2))),
+                               (From.VT (From.LdFrag addr:$src2)),
                                (iPTR imm))>, AVX512AIi8Base, EVEX_4V,
                    EVEX_CD8<From.EltSize, From.CD8TupleForm>,
                    Sched<[sched.Folded, sched.ReadAfterFold]>;
@@ -547,7 +542,7 @@ multiclass vinsert_for_size_lowering<string InstrStr, X86VectorVTInfo From,
 
     def : Pat<(vinsert_insert:$ins
                   (To.VT To.RC:$src1),
-                  (From.VT (bitconvert (From.LdFrag addr:$src2))),
+                  (From.VT (From.LdFrag addr:$src2)),
                   (iPTR imm)),
               (To.VT (!cast<Instruction>(InstrStr#"rm")
                   To.RC:$src1, addr:$src2,
@@ -680,9 +675,7 @@ let Predicates = p in {
              (vselect Cast.KRCWM:$mask,
                       (bitconvert
                        (vinsert_insert:$ins (To.VT To.RC:$src1),
-                                            (From.VT
-                                             (bitconvert
-                                              (From.LdFrag addr:$src2))),
+                                            (From.VT (From.LdFrag addr:$src2)),
                                             (iPTR imm))),
                       Cast.ImmAllZerosV)),
             (!cast<Instruction>(InstrStr#"rmkz")
@@ -1374,7 +1367,7 @@ multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr,
   defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
                            (ins _Src.MemOp:$src), OpcodeStr, "$src", "$src",
                            (_Dst.VT (X86SubVBroadcast
-                             (_Src.VT (bitconvert (_Src.LdFrag addr:$src)))))>,
+                             (_Src.VT (_Src.LdFrag addr:$src))))>,
                            Sched<[SchedWriteShuffle.YMM.Folded]>,
                            AVX5128IBase, EVEX;
 }
@@ -1389,7 +1382,7 @@ multiclass avx512_subvec_broadcast_rm_dq<bits<8> opc, string OpcodeStr,
                            (ins _Src.MemOp:$src), OpcodeStr, "$src", "$src",
                            (null_frag),
                            (_Dst.VT (X86SubVBroadcast
-                             (_Src.VT (bitconvert (_Src.LdFrag addr:$src)))))>,
+                             (_Src.VT (_Src.LdFrag addr:$src))))>,
                            Sched<[SchedWriteShuffle.YMM.Folded]>,
                            AVX5128IBase, EVEX;
 }
@@ -1442,11 +1435,11 @@ defm VBROADCASTF64X4 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf64x4",
 let Predicates = [HasAVX512] in {
 def : Pat<(v16f32 (X86SubVBroadcast (loadv8f32 addr:$src))),
           (VBROADCASTF64X4rm addr:$src)>;
-def : Pat<(v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src)))),
+def : Pat<(v16i32 (X86SubVBroadcast (loadv8i32 addr:$src))),
           (VBROADCASTI64X4rm addr:$src)>;
-def : Pat<(v32i16 (X86SubVBroadcast (bc_v16i16 (loadv4i64 addr:$src)))),
+def : Pat<(v32i16 (X86SubVBroadcast (loadv16i16 addr:$src))),
           (VBROADCASTI64X4rm addr:$src)>;
-def : Pat<(v64i8 (X86SubVBroadcast (bc_v32i8 (loadv4i64 addr:$src)))),
+def : Pat<(v64i8 (X86SubVBroadcast (loadv32i8 addr:$src))),
           (VBROADCASTI64X4rm addr:$src)>;
 
 // Provide fallback in case the load node that is used in the patterns above
@@ -1474,9 +1467,9 @@ def : Pat<(v8f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
           (VBROADCASTF32X4rm addr:$src)>;
 def : Pat<(v8i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
           (VBROADCASTI32X4rm addr:$src)>;
-def : Pat<(v32i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
+def : Pat<(v32i16 (X86SubVBroadcast (loadv8i16 addr:$src))),
           (VBROADCASTI32X4rm addr:$src)>;
-def : Pat<(v64i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
+def : Pat<(v64i8 (X86SubVBroadcast (loadv16i8 addr:$src))),
           (VBROADCASTI32X4rm addr:$src)>;
 
 // Patterns for selects of bitcasted operations.
@@ -1506,11 +1499,11 @@ def : Pat<(vselect VK8WM:$mask,
                    VR512:$src0),
           (VBROADCASTF64X4rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
 def : Pat<(vselect VK8WM:$mask,
-                   (bc_v8i64 (v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src))))),
+                   (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv8i32 addr:$src)))),
                    (bc_v8i64 (v16i32 immAllZerosV))),
           (VBROADCASTI64X4rmkz VK8WM:$mask, addr:$src)>;
 def : Pat<(vselect VK8WM:$mask,
-                   (bc_v8i64 (v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src))))),
+                   (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv8i32 addr:$src)))),
                    VR512:$src0),
           (VBROADCASTI64X4rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
 }
@@ -1527,9 +1520,9 @@ def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
           (VBROADCASTF32X4Z256rm addr:$src)>;
 def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
           (VBROADCASTI32X4Z256rm addr:$src)>;
-def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
+def : Pat<(v16i16 (X86SubVBroadcast (loadv8i16 addr:$src))),
           (VBROADCASTI32X4Z256rm addr:$src)>;
-def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
+def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))),
           (VBROADCASTI32X4Z256rm addr:$src)>;
 
 // Patterns for selects of bitcasted operations.
@@ -1591,11 +1584,11 @@ def : Pat<(vselect VK4WM:$mask,
                    VR256X:$src0),
           (VBROADCASTF64X2Z128rmk VR256X:$src0, VK4WM:$mask, addr:$src)>;
 def : Pat<(vselect VK4WM:$mask,
-                   (bc_v4i64 (v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src))))),
+                   (bc_v4i64 (v8i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
                    (bc_v4i64 (v8i32 immAllZerosV))),
           (VBROADCASTI64X2Z128rmkz VK4WM:$mask, addr:$src)>;
 def : Pat<(vselect VK4WM:$mask,
-                   (bc_v4i64 (v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src))))),
+                   (bc_v4i64 (v8i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
                    VR256X:$src0),
           (VBROADCASTI64X2Z128rmk VR256X:$src0, VK4WM:$mask, addr:$src)>;
 }
@@ -1641,11 +1634,11 @@ def : Pat<(vselect VK8WM:$mask,
                    VR512:$src0),
           (VBROADCASTF64X2rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
 def : Pat<(vselect VK8WM:$mask,
-                   (bc_v8i64 (v16i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src))))),
+                   (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
                    (bc_v8i64 (v16i32 immAllZerosV))),
           (VBROADCASTI64X2rmkz VK8WM:$mask, addr:$src)>;
 def : Pat<(vselect VK8WM:$mask,
-                   (bc_v8i64 (v16i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src))))),
+                   (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
                    VR512:$src0),
           (VBROADCASTI64X2rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
 }
@@ -1741,7 +1734,7 @@ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain,
             (ins _.RC:$src2, _.MemOp:$src3),
             OpcodeStr, "$src3, $src2", "$src2, $src3",
             (_.VT (X86VPermt2 _.RC:$src2, IdxVT.RC:$src1,
-                   (_.VT (bitconvert (_.LdFrag addr:$src3))))), 1>,
+                   (_.VT (_.LdFrag addr:$src3)))), 1>,
             EVEX_4V, AVX5128IBase, Sched<[sched.Folded, sched.ReadAfterFold]>;
   }
 }
@@ -1859,7 +1852,7 @@ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
             (ins IdxVT.RC:$src2, _.MemOp:$src3),
             OpcodeStr, "$src3, $src2", "$src2, $src3",
             (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2,
-                   (bitconvert (_.LdFrag addr:$src3)))), 1>,
+                   (_.LdFrag addr:$src3))), 1>,
             EVEX_4V, AVX5128IBase, Sched<[sched.Folded, sched.ReadAfterFold]>;
   }
 }
@@ -2149,7 +2142,7 @@ multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, PatFrag OpNode,
              (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2),
              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
-                                       (_.VT (bitconvert (_.LdFrag addr:$src2)))))]>,
+                                       (_.VT (_.LdFrag addr:$src2))))]>,
              EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
   let isCommutable = IsCommutable in
   def rrk : AVX512BI<opc, MRMSrcReg,
@@ -2165,8 +2158,7 @@ multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, PatFrag OpNode,
                           "$dst {${mask}}, $src1, $src2}"),
               [(set _.KRC:$dst, (and _.KRCWM:$mask,
                                    (OpNode (_.VT _.RC:$src1),
-                                       (_.VT (bitconvert
-                                              (_.LdFrag addr:$src2))))))]>,
+                                       (_.VT (_.LdFrag addr:$src2)))))]>,
               EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
@@ -2291,7 +2283,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag,
              [(set _.KRC:$dst, (_.KVT
                                 (Frag:$cc
                                  (_.VT _.RC:$src1),
-                                 (_.VT (bitconvert (_.LdFrag addr:$src2))),
+                                 (_.VT (_.LdFrag addr:$src2)),
                                  cond)))]>,
              EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
   let isCommutable = 1 in
@@ -2316,8 +2308,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag,
                                      (_.KVT
                                       (Frag:$cc
                                        (_.VT _.RC:$src1),
-                                       (_.VT (bitconvert
-                                              (_.LdFrag addr:$src2))),
+                                       (_.VT (_.LdFrag addr:$src2)),
                                        cond))))]>,
               EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
 
@@ -2352,13 +2343,13 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag,
                NotMemoryFoldable;
   }
 
-  def : Pat<(_.KVT (CommFrag:$cc (bitconvert (_.LdFrag addr:$src2)),
+  def : Pat<(_.KVT (CommFrag:$cc (_.LdFrag addr:$src2),
                                  (_.VT _.RC:$src1), cond)),
             (!cast<Instruction>(Name#_.ZSuffix#"rmi")
              _.RC:$src1, addr:$src2, (CommFrag.OperandTransform $cc))>;
 
   def : Pat<(and _.KRCWM:$mask,
-                 (_.KVT (CommFrag:$cc (bitconvert (_.LdFrag addr:$src2)),
+                 (_.KVT (CommFrag:$cc (_.LdFrag addr:$src2),
                                       (_.VT _.RC:$src1), cond))),
             (!cast<Instruction>(Name#_.ZSuffix#"rmik")
              _.KRCWM:$mask, _.RC:$src1, addr:$src2,
@@ -2544,7 +2535,7 @@ multiclass avx512_vcmp_common<X86FoldableSchedWrite sched, X86VectorVTInfo _,
                 "vcmp${cc}"#_.Suffix,
                 "$src2, $src1", "$src1, $src2",
                 (X86cmpm (_.VT _.RC:$src1),
-                        (_.VT (bitconvert (_.LdFrag addr:$src2))),
+                        (_.VT (_.LdFrag addr:$src2)),
                         imm:$cc)>,
                 Sched<[sched.Folded, sched.ReadAfterFold]>;
 
@@ -2732,7 +2723,7 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
                     OpcodeStr##_.Suffix##mem#
                     "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(set _.KRC:$dst,(OpNode
-                                     (_.VT (bitconvert (_.LdFrag addr:$src1))),
+                                     (_.VT (_.LdFrag addr:$src1)),
                                      (i32 imm:$src2)))]>,
                     Sched<[sched.Folded, sched.ReadAfterFold]>;
   def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
@@ -2740,7 +2731,7 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
                     OpcodeStr##_.Suffix##mem#
                     "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
                     [(set _.KRC:$dst, (and _.KRCWM:$mask, (OpNode
-                                  (_.VT (bitconvert (_.LdFrag addr:$src1))),
+                                  (_.VT (_.LdFrag addr:$src1)),
                                   (i32 imm:$src2))))]>,
                     EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
   def rmb : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
@@ -3353,7 +3344,7 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, string Name,
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     !if(NoRMPattern, [],
                         [(set _.RC:$dst,
-                          (_.VT (bitconvert (ld_frag addr:$src))))]),
+                          (_.VT (ld_frag addr:$src)))]),
                     _.ExeDomain>, EVEX, Sched<[Sched.RM]>,
                     EVEX2VEXOverride<EVEX2VEXOvrd#"rm">;
 
@@ -3372,7 +3363,7 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, string Name,
                       "${dst} {${mask}}, $src1}"),
                      [(set _.RC:$dst, (_.VT
                          (vselect _.KRCWM:$mask,
-                          (_.VT (bitconvert (ld_frag addr:$src1))),
+                          (_.VT (ld_frag addr:$src1)),
                            (_.VT _.RC:$src0))))], _.ExeDomain>,
                      EVEX, EVEX_K, Sched<[Sched.RM]>;
   }
@@ -3381,7 +3372,7 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, string Name,
                   OpcodeStr #"\t{$src, ${dst} {${mask}} {z}|"#
                                 "${dst} {${mask}} {z}, $src}",
                   [(set _.RC:$dst, (_.VT (vselect _.KRCWM:$mask,
-                    (_.VT (bitconvert (ld_frag addr:$src))), _.ImmAllZerosV)))],
+                    (_.VT (ld_frag addr:$src)), _.ImmAllZerosV)))],
                   _.ExeDomain>, EVEX, EVEX_KZ, Sched<[Sched.RM]>;
   }
   def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, undef)),
@@ -3681,6 +3672,20 @@ let Predicates = [HasBWI, NoVLX] in {
 }
 
 let Predicates = [HasAVX512] in {
+  // 512-bit load.
+  def : Pat<(alignedloadv16i32 addr:$src),
+            (VMOVDQA64Zrm addr:$src)>;
+  def : Pat<(alignedloadv32i16 addr:$src),
+            (VMOVDQA64Zrm addr:$src)>;
+  def : Pat<(alignedloadv64i8 addr:$src),
+            (VMOVDQA64Zrm addr:$src)>;
+  def : Pat<(loadv16i32 addr:$src),
+            (VMOVDQU64Zrm addr:$src)>;
+  def : Pat<(loadv32i16 addr:$src),
+            (VMOVDQU64Zrm addr:$src)>;
+  def : Pat<(loadv64i8 addr:$src),
+            (VMOVDQU64Zrm addr:$src)>;
+
   // 512-bit store.
   def : Pat<(alignedstore (v16i32 VR512:$src), addr:$dst),
             (VMOVDQA64Zmr addr:$dst, VR512:$src)>;
@@ -3697,6 +3702,20 @@ let Predicates = [HasAVX512] in {
 }
 
 let Predicates = [HasVLX] in {
+  // 128-bit load.
+  def : Pat<(alignedloadv4i32 addr:$src),
+            (VMOVDQA64Z128rm addr:$src)>;
+  def : Pat<(alignedloadv8i16 addr:$src),
+            (VMOVDQA64Z128rm addr:$src)>;
+  def : Pat<(alignedloadv16i8 addr:$src),
+            (VMOVDQA64Z128rm addr:$src)>;
+  def : Pat<(loadv4i32 addr:$src),
+            (VMOVDQU64Z128rm addr:$src)>;
+  def : Pat<(loadv8i16 addr:$src),
+            (VMOVDQU64Z128rm addr:$src)>;
+  def : Pat<(loadv16i8 addr:$src),
+            (VMOVDQU64Z128rm addr:$src)>;
+
   // 128-bit store.
   def : Pat<(alignedstore (v4i32 VR128X:$src), addr:$dst),
             (VMOVDQA64Z128mr addr:$dst, VR128X:$src)>;
@@ -3711,6 +3730,20 @@ let Predicates = [HasVLX] in {
   def : Pat<(store (v16i8 VR128X:$src), addr:$dst),
             (VMOVDQU64Z128mr addr:$dst, VR128X:$src)>;
 
+  // 256-bit load.
+  def : Pat<(alignedloadv8i32 addr:$src),
+            (VMOVDQA64Z256rm addr:$src)>;
+  def : Pat<(alignedloadv16i16 addr:$src),
+            (VMOVDQA64Z256rm addr:$src)>;
+  def : Pat<(alignedloadv32i8 addr:$src),
+            (VMOVDQA64Z256rm addr:$src)>;
+  def : Pat<(loadv8i32 addr:$src),
+            (VMOVDQU64Z256rm addr:$src)>;
+  def : Pat<(loadv16i16 addr:$src),
+            (VMOVDQU64Z256rm addr:$src)>;
+  def : Pat<(loadv32i8 addr:$src),
+            (VMOVDQU64Z256rm addr:$src)>;
+
   // 256-bit store.
   def : Pat<(alignedstore (v8i32 VR256X:$src), addr:$dst),
             (VMOVDQA64Z256mr addr:$dst, VR256X:$src)>;
@@ -4495,7 +4528,7 @@ let Predicates = [HasAVX512] in {
             (VMOVDI2PDIZrm addr:$src)>;
   def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
             (VMOVDI2PDIZrm addr:$src)>;
-  def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
             (VMOVDI2PDIZrm addr:$src)>;
   def : Pat<(v4i32 (X86vzload addr:$src)),
             (VMOVDI2PDIZrm addr:$src)>;
@@ -4591,6 +4624,12 @@ let Predicates = [HasAVX512], AddedComplexity = 400 in {
             (VMOVNTDQAZrm addr:$src)>;
   def : Pat<(v8i64 (alignednontemporalload addr:$src)),
             (VMOVNTDQAZrm addr:$src)>;
+  def : Pat<(v16i32 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAZrm addr:$src)>;
+  def : Pat<(v32i16 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAZrm addr:$src)>;
+  def : Pat<(v64i8 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAZrm addr:$src)>;
 }
 
 let Predicates = [HasVLX], AddedComplexity = 400 in {
@@ -4607,6 +4646,12 @@ let Predicates = [HasVLX], AddedComplexity = 400 in {
             (VMOVNTDQAZ256rm addr:$src)>;
   def : Pat<(v4i64 (alignednontemporalload addr:$src)),
             (VMOVNTDQAZ256rm addr:$src)>;
+  def : Pat<(v8i32 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAZ256rm addr:$src)>;
+  def : Pat<(v16i16 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAZ256rm addr:$src)>;
+  def : Pat<(v32i8 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAZ256rm addr:$src)>;
 
   def : Pat<(alignednontemporalstore (v4i32 VR128X:$src), addr:$dst),
             (VMOVNTDQZ128mr addr:$dst, VR128X:$src)>;
@@ -4621,6 +4666,12 @@ let Predicates = [HasVLX], AddedComplexity = 400 in {
             (VMOVNTDQAZ128rm addr:$src)>;
   def : Pat<(v2i64 (alignednontemporalload addr:$src)),
             (VMOVNTDQAZ128rm addr:$src)>;
+  def : Pat<(v4i32 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAZ128rm addr:$src)>;
+  def : Pat<(v8i16 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAZ128rm addr:$src)>;
+  def : Pat<(v16i8 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAZ128rm addr:$src)>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -4639,8 +4690,7 @@ multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
   defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
                   (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
                   "$src2, $src1", "$src1, $src2",
-                  (_.VT (OpNode _.RC:$src1,
-                                (bitconvert (_.LdFrag addr:$src2))))>,
+                  (_.VT (OpNode _.RC:$src1, (_.LdFrag addr:$src2)))>,
                   AVX512BIBase, EVEX_4V,
                   Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
@@ -4771,7 +4821,7 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr,
                         (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr,
                         "$src2, $src1", "$src1, $src2",
                         (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1),
-                                      (bitconvert (_Src.LdFrag addr:$src2))))>,
+                                      (_Src.LdFrag addr:$src2)))>,
                         AVX512BIBase, EVEX_4V,
                         Sched<[sched.Folded, sched.ReadAfterFold]>;
 
@@ -4876,7 +4926,7 @@ multiclass avx512_packs_rm<bits<8> opc, string OpcodeStr,
                         (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr,
                         "$src2, $src1", "$src1, $src2",
                         (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1),
-                                      (bitconvert (_Src.LdFrag addr:$src2))))>,
+                                      (_Src.LdFrag addr:$src2)))>,
                          EVEX_4V, EVEX_CD8<_Src.EltSize, CD8VF>,
                          Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
@@ -5068,7 +5118,7 @@ multiclass avx512_logic_rm<bits<8> opc, string OpcodeStr,
                   (_.i64VT (OpNode (bitconvert (_.VT _.RC:$src1)),
                                    (bitconvert (_.LdFrag addr:$src2)))),
                   (_.VT (bitconvert (_.i64VT (OpNodeMsk _.RC:$src1,
-                                     (bitconvert (_.LdFrag addr:$src2))))))>,
+                                     (_.i64LdFrag addr:$src2)))))>,
                   AVX512BIBase, EVEX_4V,
                   Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
@@ -5641,7 +5691,7 @@ multiclass avx512_vptest<bits<8> opc, string OpcodeStr, PatFrag OpNode,
                        "$src2, $src1", "$src1, $src2",
                    (OpNode (bitconvert
                             (_.i64VT (and _.RC:$src1,
-                                          (bitconvert (_.LdFrag addr:$src2))))),
+                                          (_.i64LdFrag addr:$src2)))),
                            _.ImmAllZerosV)>,
                    EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
                    Sched<[sched.Folded, sched.ReadAfterFold]>;
@@ -5805,7 +5855,7 @@ multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM,
   defm mi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst),
                    (ins _.MemOp:$src1, u8imm:$src2), OpcodeStr,
                        "$src2, $src1", "$src1, $src2",
-                   (_.VT (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
+                   (_.VT (OpNode (_.VT (_.LdFrag addr:$src1)),
                           (i8 imm:$src2)))>,
                    Sched<[sched.Folded]>;
   }
@@ -5835,8 +5885,7 @@ multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode,
   defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
                    (ins _.RC:$src1, i128mem:$src2), OpcodeStr,
                        "$src2, $src1", "$src1, $src2",
-                   (_.VT (OpNode _.RC:$src1,
-                                 (SrcVT (bitconvert (loadv2i64 addr:$src2)))))>,
+                   (_.VT (OpNode _.RC:$src1, (SrcVT (load addr:$src2))))>,
                    AVX512BIBase,
                    EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
   }
@@ -5990,7 +6039,7 @@ multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
                    (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
                        "$src2, $src1", "$src1, $src2",
                    (_.VT (OpNode _.RC:$src1,
-                   (_.VT (bitconvert (_.LdFrag addr:$src2)))))>,
+                   (_.VT (_.LdFrag addr:$src2))))>,
                    AVX5128IBase, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
                    Sched<[sched.Folded, sched.ReadAfterFold]>;
   }
@@ -6090,7 +6139,7 @@ multiclass avx512_var_shift_int_lowering<string InstrStr, X86VectorVTInfo _,
     def : Pat<(_.VT (X86vsrav _.RC:$src1, _.RC:$src2)),
               (!cast<Instruction>(InstrStr#_.ZSuffix#rr) _.RC:$src1,
                _.RC:$src2)>;
-    def : Pat<(_.VT (X86vsrav _.RC:$src1, (bitconvert (_.LdFrag addr:$src2)))),
+    def : Pat<(_.VT (X86vsrav _.RC:$src1, (_.LdFrag addr:$src2))),
               (!cast<Instruction>(InstrStr#_.ZSuffix##rm)
                _.RC:$src1, addr:$src2)>;
     def : Pat<(_.VT (vselect _.KRCWM:$mask,
@@ -6098,7 +6147,7 @@ multiclass avx512_var_shift_int_lowering<string InstrStr, X86VectorVTInfo _,
               (!cast<Instruction>(InstrStr#_.ZSuffix#rrk) _.RC:$src0,
                _.KRC:$mask, _.RC:$src1, _.RC:$src2)>;
     def : Pat<(_.VT (vselect _.KRCWM:$mask,
-                     (X86vsrav _.RC:$src1, (bitconvert (_.LdFrag addr:$src2))),
+                     (X86vsrav _.RC:$src1, (_.LdFrag addr:$src2)),
                      _.RC:$src0)),
               (!cast<Instruction>(InstrStr#_.ZSuffix##rmk) _.RC:$src0,
                _.KRC:$mask, _.RC:$src1, addr:$src2)>;
@@ -6107,7 +6156,7 @@ multiclass avx512_var_shift_int_lowering<string InstrStr, X86VectorVTInfo _,
               (!cast<Instruction>(InstrStr#_.ZSuffix#rrkz) _.KRC:$mask,
                _.RC:$src1, _.RC:$src2)>;
     def : Pat<(_.VT (vselect _.KRCWM:$mask,
-                     (X86vsrav _.RC:$src1, (bitconvert (_.LdFrag addr:$src2))),
+                     (X86vsrav _.RC:$src1, (_.LdFrag addr:$src2)),
                      _.ImmAllZerosV)),
               (!cast<Instruction>(InstrStr#_.ZSuffix##rmkz) _.KRC:$mask,
                _.RC:$src1, addr:$src2)>;
@@ -6332,7 +6381,7 @@ multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode,
                   "$src2, $src1", "$src1, $src2",
                   (_.VT (OpNode
                            _.RC:$src1,
-                           (Ctrl.VT (bitconvert(Ctrl.LdFrag addr:$src2)))))>,
+                           (Ctrl.VT (Ctrl.LdFrag addr:$src2))))>,
                   T8PD, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
                   Sched<[sched.Folded, sched.ReadAfterFold]>;
   defm rmb: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst),
@@ -7618,7 +7667,7 @@ multiclass avx512_vcvt_fp<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
   defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
                          (ins MemOp:$src), OpcodeStr#Alias, "$src", "$src",
                          (_.VT (OpNode (_Src.VT
-                             (bitconvert (_Src.LdFrag addr:$src)))))>,
+                             (_Src.LdFrag addr:$src))))>,
                          EVEX, Sched<[sched.Folded]>;
 
   defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
@@ -8325,8 +8374,7 @@ multiclass avx512_cvtph2ps<X86VectorVTInfo _dest, X86VectorVTInfo _src,
   defm rm : AVX512_maskable<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst),
                             (ins x86memop:$src), "vcvtph2ps", "$src", "$src",
                             (X86cvtph2ps (_src.VT
-                                          (bitconvert
-                                           (ld_frag addr:$src))))>,
+                                          (ld_frag addr:$src)))>,
                             T8PD, Sched<[sched.Folded]>;
 }
 
@@ -8341,17 +8389,17 @@ multiclass avx512_cvtph2ps_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src,
 }
 
 let Predicates = [HasAVX512] in
-  defm VCVTPH2PSZ : avx512_cvtph2ps<v16f32_info, v16i16x_info, f256mem, loadv4i64,
+  defm VCVTPH2PSZ : avx512_cvtph2ps<v16f32_info, v16i16x_info, f256mem, load,
                                     WriteCvtPH2PSZ>,
                     avx512_cvtph2ps_sae<v16f32_info, v16i16x_info, WriteCvtPH2PSZ>,
                     EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>;
 
 let Predicates = [HasVLX] in {
   defm VCVTPH2PSZ256 : avx512_cvtph2ps<v8f32x_info, v8i16x_info, f128mem,
-                       loadv2i64, WriteCvtPH2PSY>, EVEX, EVEX_V256,
+                       load, WriteCvtPH2PSY>, EVEX, EVEX_V256,
                        EVEX_CD8<32, CD8VH>;
   defm VCVTPH2PSZ128 : avx512_cvtph2ps<v4f32x_info, v8i16x_info, f64mem,
-                       loadv2i64, WriteCvtPH2PS>, EVEX, EVEX_V128,
+                       load, WriteCvtPH2PS>, EVEX, EVEX_V128,
                        EVEX_CD8<32, CD8VH>;
 
   // Pattern match vcvtph2ps of a scalar i64 load.
@@ -9295,7 +9343,7 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
             (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
   def : Pat<(v8i16 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
-  def : Pat<(v8i16 (InVecOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v8i16 (InVecOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
   }
   let Predicates = [HasVLX] in {
@@ -9305,7 +9353,7 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
             (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
   def : Pat<(v4i32 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
-  def : Pat<(v4i32 (InVecOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i32 (InVecOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
 
   def : Pat<(v2i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
@@ -9314,7 +9362,7 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
             (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
   def : Pat<(v2i64 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
-  def : Pat<(v2i64 (InVecOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v2i64 (InVecOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
 
   def : Pat<(v4i32 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
@@ -9325,7 +9373,7 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
             (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
   def : Pat<(v4i32 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
-  def : Pat<(v4i32 (InVecOp (bc_v8i16 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i32 (InVecOp (loadv8i16 addr:$src))),
             (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
 
   def : Pat<(v2i64 (InVecOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
@@ -9334,7 +9382,7 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
             (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
   def : Pat<(v2i64 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
-  def : Pat<(v2i64 (InVecOp (bc_v8i16 (loadv2i64 addr:$src)))),
+  def : Pat<(v2i64 (InVecOp (loadv8i16 addr:$src))),
             (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
 
   def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
@@ -9345,12 +9393,12 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
             (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
   def : Pat<(v2i64 (InVecOp (v4i32 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
-  def : Pat<(v2i64 (InVecOp (bc_v4i32 (loadv2i64 addr:$src)))),
+  def : Pat<(v2i64 (InVecOp (loadv4i32 addr:$src))),
             (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
   }
   // 256-bit patterns
   let Predicates = [HasVLX, HasBWI] in {
-  def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BWZ256rm) addr:$src)>;
   def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BWZ256rm) addr:$src)>;
@@ -9364,7 +9412,7 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
             (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
   def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
-  def : Pat<(v8i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v8i32 (ExtOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
 
   def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
@@ -9373,10 +9421,10 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
             (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
   def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i64 (ExtOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
 
-  def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+  def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
             (!cast<I>(OpcPrefix#WDZ256rm) addr:$src)>;
   def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WDZ256rm) addr:$src)>;
@@ -9389,10 +9437,10 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
             (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
   def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i64 (ExtOp (loadv8i16 addr:$src))),
             (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
 
-  def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
             (!cast<I>(OpcPrefix#DQZ256rm) addr:$src)>;
   def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#DQZ256rm) addr:$src)>;
@@ -9401,25 +9449,25 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
   }
   // 512-bit patterns
   let Predicates = [HasBWI] in {
-  def : Pat<(v32i16 (ExtOp (bc_v32i8 (loadv4i64 addr:$src)))),
+  def : Pat<(v32i16 (ExtOp (loadv32i8 addr:$src))),
             (!cast<I>(OpcPrefix#BWZrm) addr:$src)>;
   }
   let Predicates = [HasAVX512] in {
-  def : Pat<(v16i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v16i32 (ExtOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BDZrm) addr:$src)>;
 
   def : Pat<(v8i64 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
             (!cast<I>(OpcPrefix#BQZrm) addr:$src)>;
-  def : Pat<(v8i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v8i64 (ExtOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BQZrm) addr:$src)>;
 
-  def : Pat<(v16i32 (ExtOp (bc_v16i16 (loadv4i64 addr:$src)))),
+  def : Pat<(v16i32 (ExtOp (loadv16i16 addr:$src))),
             (!cast<I>(OpcPrefix#WDZrm) addr:$src)>;
 
-  def : Pat<(v8i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+  def : Pat<(v8i64 (ExtOp (loadv8i16 addr:$src))),
             (!cast<I>(OpcPrefix#WQZrm) addr:$src)>;
 
-  def : Pat<(v8i64 (ExtOp (bc_v8i32 (loadv4i64 addr:$src)))),
+  def : Pat<(v8i64 (ExtOp (loadv8i32 addr:$src))),
             (!cast<I>(OpcPrefix#DQZrm) addr:$src)>;
   }
 }
@@ -10324,7 +10372,7 @@ multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr,
                 (_.VT
                  (bitconvert
                   (CastInfo.VT (X86Shuf128 _.RC:$src1,
-                                           (bitconvert (_.LdFrag addr:$src2)),
+                                           (CastInfo.LdFrag addr:$src2),
                                            (i8 imm:$src3)))))>,
                 Sched<[sched.Folded, sched.ReadAfterFold]>,
                 EVEX2VEXOverride<EVEX2VEXOvrd#"rm">;
@@ -10490,7 +10538,7 @@ multiclass avx512_vpalign_mask_lowering<string OpcodeStr, SDNode OpNode,
   def : Pat<(To.VT (vselect To.KRCWM:$mask,
                             (bitconvert
                              (From.VT (OpNode From.RC:$src1,
-                                      (bitconvert (To.LdFrag addr:$src2)),
+                                              (From.LdFrag addr:$src2),
                                       imm:$src3))),
                             To.RC:$src0)),
             (!cast<Instruction>(OpcodeStr#"rmik") To.RC:$src0, To.KRCWM:$mask,
@@ -10500,7 +10548,7 @@ multiclass avx512_vpalign_mask_lowering<string OpcodeStr, SDNode OpNode,
   def : Pat<(To.VT (vselect To.KRCWM:$mask,
                             (bitconvert
                              (From.VT (OpNode From.RC:$src1,
-                                      (bitconvert (To.LdFrag addr:$src2)),
+                                              (From.LdFrag addr:$src2),
                                       imm:$src3))),
                             To.ImmAllZerosV)),
             (!cast<Instruction>(OpcodeStr#"rmikz") To.KRCWM:$mask,
@@ -11644,7 +11692,7 @@ multiclass VBMI2_shift_var_rm<bits<8> Op, string OpStr, SDNode OpNode,
                 (ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr,
                 "$src3, $src2", "$src2, $src3",
                 (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2,
-                        (VTI.VT (bitconvert (VTI.LdFrag addr:$src3)))))>,
+                        (VTI.VT (VTI.LdFrag addr:$src3))))>,
                 AVX512FMA3Base,
                 Sched<[sched.Folded, sched.ReadAfterFold]>;
   }
@@ -11747,8 +11795,7 @@ multiclass VNNI_rmb<bits<8> Op, string OpStr, SDNode OpNode,
                                    (ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr,
                                    "$src3, $src2", "$src2, $src3",
                                    (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2,
-                                            (VTI.VT (bitconvert
-                                                     (VTI.LdFrag addr:$src3)))))>,
+                                            (VTI.VT (VTI.LdFrag addr:$src3))))>,
                                    EVEX_4V, EVEX_CD8<32, CD8VF>, T8PD,
                                    Sched<[sched.Folded, sched.ReadAfterFold]>;
   defm mb :   AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
@@ -11804,7 +11851,7 @@ multiclass VPSHUFBITQMB_rm<X86FoldableSchedWrite sched, X86VectorVTInfo VTI> {
                                 "vpshufbitqmb",
                                 "$src2, $src1", "$src1, $src2",
                                 (X86Vpshufbitqmb (VTI.VT VTI.RC:$src1),
-                                (VTI.VT (bitconvert (VTI.LdFrag addr:$src2))))>,
+                                (VTI.VT (VTI.LdFrag addr:$src2)))>,
                                 EVEX_4V, EVEX_CD8<8, CD8VF>, T8PD,
                                 Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index f750fe3ee0c..7e31527a877 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -648,21 +648,28 @@ def sdmem : Operand<v2f64> {
 //===----------------------------------------------------------------------===//
 
 // 128-bit load pattern fragments
-// NOTE: all 128-bit integer vector loads are promoted to v2i64
 def loadv4f32    : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>;
 def loadv2f64    : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>;
 def loadv2i64    : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>;
+def loadv4i32    : PatFrag<(ops node:$ptr), (v4i32 (load node:$ptr))>;
+def loadv8i16    : PatFrag<(ops node:$ptr), (v8i16 (load node:$ptr))>;
+def loadv16i8    : PatFrag<(ops node:$ptr), (v16i8 (load node:$ptr))>;
 
 // 256-bit load pattern fragments
-// NOTE: all 256-bit integer vector loads are promoted to v4i64
-def loadv8f32    : PatFrag<(ops node:$ptr), (v8f32 (load node:$ptr))>;
-def loadv4f64    : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>;
-def loadv4i64    : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>;
+def loadv8f32    : PatFrag<(ops node:$ptr), (v8f32  (load node:$ptr))>;
+def loadv4f64    : PatFrag<(ops node:$ptr), (v4f64  (load node:$ptr))>;
+def loadv4i64    : PatFrag<(ops node:$ptr), (v4i64  (load node:$ptr))>;
+def loadv8i32    : PatFrag<(ops node:$ptr), (v8i32  (load node:$ptr))>;
+def loadv16i16   : PatFrag<(ops node:$ptr), (v16i16 (load node:$ptr))>;
+def loadv32i8    : PatFrag<(ops node:$ptr), (v32i8  (load node:$ptr))>;
 
 // 512-bit load pattern fragments
 def loadv16f32   : PatFrag<(ops node:$ptr), (v16f32 (load node:$ptr))>;
-def loadv8f64    : PatFrag<(ops node:$ptr), (v8f64 (load node:$ptr))>;
-def loadv8i64    : PatFrag<(ops node:$ptr), (v8i64 (load node:$ptr))>;
+def loadv8f64    : PatFrag<(ops node:$ptr), (v8f64  (load node:$ptr))>;
+def loadv8i64    : PatFrag<(ops node:$ptr), (v8i64  (load node:$ptr))>;
+def loadv16i32   : PatFrag<(ops node:$ptr), (v16i32 (load node:$ptr))>;
+def loadv32i16   : PatFrag<(ops node:$ptr), (v32i16 (load node:$ptr))>;
+def loadv64i8    : PatFrag<(ops node:$ptr), (v64i8  (load node:$ptr))>;
 
 // 128-/256-/512-bit extload pattern fragments
 def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>;
@@ -690,15 +697,27 @@ def alignedloadv2f64 : PatFrag<(ops node:$ptr),
                                (v2f64 (alignedload node:$ptr))>;
 def alignedloadv2i64 : PatFrag<(ops node:$ptr),
                                (v2i64 (alignedload node:$ptr))>;
+def alignedloadv4i32 : PatFrag<(ops node:$ptr),
+                               (v4i32 (alignedload node:$ptr))>;
+def alignedloadv8i16 : PatFrag<(ops node:$ptr),
+                               (v8i16 (alignedload node:$ptr))>;
+def alignedloadv16i8 : PatFrag<(ops node:$ptr),
+                               (v16i8 (alignedload node:$ptr))>;
 
 // 256-bit aligned load pattern fragments
 // NOTE: all 256-bit integer vector loads are promoted to v4i64
-def alignedloadv8f32 : PatFrag<(ops node:$ptr),
-                               (v8f32 (alignedload node:$ptr))>;
-def alignedloadv4f64 : PatFrag<(ops node:$ptr),
-                               (v4f64 (alignedload node:$ptr))>;
-def alignedloadv4i64 : PatFrag<(ops node:$ptr),
-                               (v4i64 (alignedload node:$ptr))>;
+def alignedloadv8f32  : PatFrag<(ops node:$ptr),
+                                (v8f32  (alignedload node:$ptr))>;
+def alignedloadv4f64  : PatFrag<(ops node:$ptr),
+                                (v4f64  (alignedload node:$ptr))>;
+def alignedloadv4i64  : PatFrag<(ops node:$ptr),
+                                (v4i64  (alignedload node:$ptr))>;
+def alignedloadv8i32  : PatFrag<(ops node:$ptr),
+                                (v8i32  (alignedload node:$ptr))>;
+def alignedloadv16i16 : PatFrag<(ops node:$ptr),
+                                (v16i16 (alignedload node:$ptr))>;
+def alignedloadv32i8  : PatFrag<(ops node:$ptr),
+                                (v32i8  (alignedload node:$ptr))>;
 
 // 512-bit aligned load pattern fragments
 def alignedloadv16f32 : PatFrag<(ops node:$ptr),
@@ -707,6 +726,12 @@ def alignedloadv8f64  : PatFrag<(ops node:$ptr),
                                 (v8f64  (alignedload node:$ptr))>;
 def alignedloadv8i64  : PatFrag<(ops node:$ptr),
                                 (v8i64  (alignedload node:$ptr))>;
+def alignedloadv16i32 : PatFrag<(ops node:$ptr),
+                                (v16i32 (alignedload node:$ptr))>;
+def alignedloadv32i16 : PatFrag<(ops node:$ptr),
+                                (v32i16 (alignedload node:$ptr))>;
+def alignedloadv64i8  : PatFrag<(ops node:$ptr),
+                                (v64i8  (alignedload node:$ptr))>;
 
 // Like 'load', but uses special alignment checks suitable for use in
 // memory operands in most SSE instructions, which are required to
@@ -725,6 +750,9 @@ def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{
 def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>;
 def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>;
 def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>;
+def memopv4i32 : PatFrag<(ops node:$ptr), (v4i32 (memop node:$ptr))>;
+def memopv8i16 : PatFrag<(ops node:$ptr), (v8i16 (memop node:$ptr))>;
+def memopv16i8 : PatFrag<(ops node:$ptr), (v16i8 (memop node:$ptr))>;
 
 def X86masked_gather : SDNode<"X86ISD::MGATHER",
                               SDTypeProfile<2, 3, [SDTCisVec<0>,
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 8a836d8c173..6c90a8898f6 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -589,8 +589,21 @@ let Predicates = [HasAVX, NoVLX] in {
   // available and changing the domain is beneficial.
   def : Pat<(alignedloadv4i64 addr:$src),
             (VMOVAPSYrm addr:$src)>;
+  def : Pat<(alignedloadv8i32 addr:$src),
+            (VMOVAPSYrm addr:$src)>;
+  def : Pat<(alignedloadv16i16 addr:$src),
+            (VMOVAPSYrm addr:$src)>;
+  def : Pat<(alignedloadv32i8 addr:$src),
+            (VMOVAPSYrm addr:$src)>;
   def : Pat<(loadv4i64 addr:$src),
             (VMOVUPSYrm addr:$src)>;
+  def : Pat<(loadv8i32 addr:$src),
+            (VMOVUPSYrm addr:$src)>;
+  def : Pat<(loadv16i16 addr:$src),
+            (VMOVUPSYrm addr:$src)>;
+  def : Pat<(loadv32i8 addr:$src),
+            (VMOVUPSYrm addr:$src)>;
+
   def : Pat<(alignedstore (v4i64 VR256:$src), addr:$dst),
             (VMOVAPSYmr addr:$dst, VR256:$src)>;
   def : Pat<(alignedstore (v8i32 VR256:$src), addr:$dst),
@@ -615,8 +628,20 @@ let Predicates = [HasAVX, NoVLX] in {
 let Predicates = [UseSSE1] in {
   def : Pat<(alignedloadv2i64 addr:$src),
             (MOVAPSrm addr:$src)>;
+  def : Pat<(alignedloadv4i32 addr:$src),
+            (MOVAPSrm addr:$src)>;
+  def : Pat<(alignedloadv8i16 addr:$src),
+            (MOVAPSrm addr:$src)>;
+  def : Pat<(alignedloadv16i8 addr:$src),
+            (MOVAPSrm addr:$src)>;
   def : Pat<(loadv2i64 addr:$src),
             (MOVUPSrm addr:$src)>;
+  def : Pat<(loadv4i32 addr:$src),
+            (MOVUPSrm addr:$src)>;
+  def : Pat<(loadv8i16 addr:$src),
+            (MOVUPSrm addr:$src)>;
+  def : Pat<(loadv16i8 addr:$src),
+            (MOVUPSrm addr:$src)>;
 
   def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
             (MOVAPSmr addr:$dst, VR128:$src)>;
@@ -841,7 +866,7 @@ let hasSideEffects = 0 in {
   let mayLoad = 1 in
   def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm,
              [(set RC:$dst, (DstTy (sint_to_fp
-                                    (SrcTy (bitconvert (ld_frag addr:$src))))))], d>,
+                                    (SrcTy (ld_frag addr:$src)))))], d>,
              Sched<[sched.Folded]>;
 }
 }
@@ -1104,16 +1129,16 @@ defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
                                  ssmem, sse_load_f32, "cvtss2si",
                                  WriteCvtSS2I>, XS, REX_W;
 
-defm VCVTDQ2PS   : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, loadv2i64,
+defm VCVTDQ2PS   : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, load,
                                "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                                SSEPackedSingle, WriteCvtI2PS>,
                                PS, VEX, Requires<[HasAVX, NoVLX]>, VEX_WIG;
-defm VCVTDQ2PSY  : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, loadv4i64,
+defm VCVTDQ2PSY  : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, load,
                                "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                                SSEPackedSingle, WriteCvtI2PSY>,
                                PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, VEX_WIG;
 
-defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memopv2i64,
+defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memop,
                             "cvtdq2ps\t{$src, $dst|$dst, $src}",
                             SSEPackedSingle, WriteCvtI2PS>,
                             PS, Requires<[UseSSE2]>;
@@ -1672,7 +1697,7 @@ let hasSideEffects = 0, mayLoad = 1 in
 def VCVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                         "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
-                          (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))]>,
+                          (v2f64 (X86VSintToFP (loadv4i32 addr:$src))))]>,
                         VEX, Sched<[WriteCvtI2PDLd]>, VEX_WIG;
 def VCVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "vcvtdq2pd\t{$src, $dst|$dst, $src}",
@@ -1682,7 +1707,7 @@ def VCVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
 def VCVTDQ2PDYrm  : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
                          "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst,
-                           (v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))))]>,
+                           (v4f64 (sint_to_fp (loadv4i32 addr:$src))))]>,
                          VEX, VEX_L, Sched<[WriteCvtI2PDYLd]>,
                          VEX_WIG;
 def VCVTDQ2PDYrr  : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
@@ -1696,7 +1721,7 @@ let hasSideEffects = 0, mayLoad = 1 in
 def CVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                        "cvtdq2pd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
-                         (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))]>,
+                         (v2f64 (X86VSintToFP (loadv4i32 addr:$src))))]>,
                        Sched<[WriteCvtI2PDLd]>;
 def CVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvtdq2pd\t{$src, $dst|$dst, $src}",
@@ -2151,54 +2176,54 @@ multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
 }
 
 let Predicates = [HasAVX, NoVLX] in {
-defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32,
+defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, load,
       VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
-defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, loadv2f64,
+defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, load,
       VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX_4V, VEX_WIG;
-defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, loadv4f32,
+defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, load,
       VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
-defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, loadv2f64,
+defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, load,
       VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
 
-defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, loadv8f32,
+defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, load,
       VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
-defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, loadv4f64,
+defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, load,
       VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
-defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32,
+defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, load,
       VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
-defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64,
+defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, load,
       VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
 }// Predicates = [HasAVX, NoVLX]
 
 let Constraints = "$src1 = $dst" in {
-  defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32,
+  defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memop,
         VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
                        SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
-  defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64,
+  defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memop,
         VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
                        SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
-  defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32,
+  defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memop,
         VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
                        SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
-  defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64,
+  defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memop,
         VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
                        SchedWriteFShuffle.XMM, SSEPackedDouble>, PD;
 } // Constraints = "$src1 = $dst"
 
 let Predicates = [HasAVX1Only] in {
-  def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
+  def : Pat<(v8i32 (X86Unpckl VR256:$src1, (loadv8i32 addr:$src2))),
             (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
   def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
             (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
-  def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
+  def : Pat<(v8i32 (X86Unpckh VR256:$src1, (loadv8i32 addr:$src2))),
             (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
   def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
             (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
@@ -2284,8 +2309,7 @@ multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
        !if(Is2Addr,
            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-       [(set RC:$dst, (OpVT (OpNode RC:$src1,
-                                     (bitconvert (memop_frag addr:$src2)))))]>,
+       [(set RC:$dst, (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 } // ExeDomain = SSEPackedInt
@@ -2296,16 +2320,16 @@ multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
                          Predicate prd> {
 let Predicates = [HasAVX, prd] in
   defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
-                             VR128, loadv2i64, i128mem, sched.XMM,
+                             VR128, load, i128mem, sched.XMM,
                              IsCommutable, 0>, VEX_4V, VEX_WIG;
 
 let Constraints = "$src1 = $dst" in
   defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
-                           memopv2i64, i128mem, sched.XMM, IsCommutable, 1>;
+                           memop, i128mem, sched.XMM, IsCommutable, 1>;
 
 let Predicates = [HasAVX2, prd] in
   defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
-                               OpVT256, VR256, loadv4i64, i256mem, sched.YMM,
+                               OpVT256, VR256, load, i256mem, sched.YMM,
                                IsCommutable, 0>, VEX_4V, VEX_L, VEX_WIG;
 }
 
@@ -3306,6 +3330,19 @@ def : InstAlias<"movdqu.s\t{$src, $dst|$dst, $src}",
 
 let Predicates = [HasAVX, NoVLX] in {
   // Additional patterns for other integer sizes.
+  def : Pat<(alignedloadv4i32 addr:$src),
+            (VMOVDQArm addr:$src)>;
+  def : Pat<(alignedloadv8i16 addr:$src),
+            (VMOVDQArm addr:$src)>;
+  def : Pat<(alignedloadv16i8 addr:$src),
+            (VMOVDQArm addr:$src)>;
+  def : Pat<(loadv4i32 addr:$src),
+            (VMOVDQUrm addr:$src)>;
+  def : Pat<(loadv8i16 addr:$src),
+            (VMOVDQUrm addr:$src)>;
+  def : Pat<(loadv16i8 addr:$src),
+            (VMOVDQUrm addr:$src)>;
+
   def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
             (VMOVDQAmr addr:$dst, VR128:$src)>;
   def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
@@ -3345,7 +3382,7 @@ multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
        [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
-                                     (bitconvert (memop_frag addr:$src2)))))]>,
+                                     (memop_frag addr:$src2))))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 } // ExeDomain = SSEPackedInt
@@ -3405,28 +3442,28 @@ defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64,
 
 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
 defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
-                              loadv2i64, i128mem, SchedWriteVecIMul.XMM, 0>,
+                              load, i128mem, SchedWriteVecIMul.XMM, 0>,
                               VEX_4V, VEX_WIG;
 
 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
 defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16,
-                               VR256, loadv4i64, i256mem, SchedWriteVecIMul.YMM,
+                               VR256, load, i256mem, SchedWriteVecIMul.YMM,
                                0>, VEX_4V, VEX_L, VEX_WIG;
 let Constraints = "$src1 = $dst" in
 defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
-                             memopv2i64, i128mem, SchedWriteVecIMul.XMM>;
+                             memop, i128mem, SchedWriteVecIMul.XMM>;
 
 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
 defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
-                             loadv2i64, i128mem, SchedWritePSADBW.XMM, 0>,
+                             load, i128mem, SchedWritePSADBW.XMM, 0>,
                              VEX_4V, VEX_WIG;
 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
 defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
-                             loadv4i64, i256mem, SchedWritePSADBW.YMM, 0>,
+                             load, i256mem, SchedWritePSADBW.YMM, 0>,
                              VEX_4V, VEX_L, VEX_WIG;
 let Constraints = "$src1 = $dst" in
 defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
-                            memopv2i64, i128mem, SchedWritePSADBW.XMM>;
+                            memop, i128mem, SchedWritePSADBW.XMM>;
 
 //===---------------------------------------------------------------------===//
 // SSE2 - Packed Integer Logical Instructions
@@ -3453,7 +3490,7 @@ multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
        [(set RC:$dst, (DstVT (OpNode RC:$src1,
-                       (SrcVT (bitconvert (ld_frag addr:$src2))))))]>,
+                       (SrcVT (ld_frag addr:$src2)))))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
   def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
        (ins RC:$src1, u8imm:$src2),
@@ -3473,16 +3510,16 @@ multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm,
 let Predicates = [HasAVX, prd] in
   defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
                               OpNode, OpNode2, VR128, sched.XMM, schedImm.XMM,
-                              DstVT128, SrcVT, loadv2i64, 0>, VEX_4V, VEX_WIG;
+                              DstVT128, SrcVT, load, 0>, VEX_4V, VEX_WIG;
 let Predicates = [HasAVX2, prd] in
   defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
                                 OpNode, OpNode2, VR256, sched.YMM, schedImm.YMM,
-                                DstVT256, SrcVT, loadv2i64, 0>, VEX_4V, VEX_L,
+                                DstVT256, SrcVT, load, 0>, VEX_4V, VEX_L,
                                 VEX_WIG;
 let Constraints = "$src1 = $dst" in
   defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2,
                             VR128, sched.XMM, schedImm.XMM, DstVT128, SrcVT,
-                            memopv2i64>;
+                            memop>;
 }
 
 multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr,
@@ -3582,7 +3619,7 @@ let Predicates = [HasAVX, prd] in {
                       !strconcat("v", OpcodeStr,
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set VR128:$dst,
-                       (vt128 (OpNode (bitconvert (loadv2i64 addr:$src1)),
+                       (vt128 (OpNode (load addr:$src1),
                         (i8 imm:$src2))))]>, VEX,
                   Sched<[sched.XMM.Folded]>, VEX_WIG;
 }
@@ -3600,7 +3637,7 @@ let Predicates = [HasAVX2, prd] in {
                        !strconcat("v", OpcodeStr,
                                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR256:$dst,
-                        (vt256 (OpNode (bitconvert (loadv4i64 addr:$src1)),
+                        (vt256 (OpNode (load addr:$src1),
                          (i8 imm:$src2))))]>, VEX, VEX_L,
                    Sched<[sched.YMM.Folded]>, VEX_WIG;
 }
@@ -3618,7 +3655,7 @@ let Predicates = [UseSSE2] in {
                !strconcat(OpcodeStr,
                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                [(set VR128:$dst,
-                 (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)),
+                 (vt128 (OpNode (memop addr:$src1),
                         (i8 imm:$src2))))]>,
                Sched<[sched.XMM.Folded]>;
 }
@@ -3658,7 +3695,7 @@ multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
                               "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                [(set RC:$dst,
                      (OutVT (OpNode (ArgVT RC:$src1),
-                                    (bitconvert (ld_frag addr:$src2)))))]>,
+                                    (ld_frag addr:$src2))))]>,
                Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
@@ -3683,53 +3720,53 @@ multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                  [(set RC:$dst,
                        (OutVT (OpNode (ArgVT RC:$src1),
-                                      (bitconvert (ld_frag addr:$src2)))))]>,
+                                      (ld_frag addr:$src2))))]>,
                  Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
   defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, VR128,
-                             i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+                             i128mem, SchedWriteShuffle.XMM, load, 0>,
                              VEX_4V, VEX_WIG;
   defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, VR128,
-                             i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+                             i128mem, SchedWriteShuffle.XMM, load, 0>,
                              VEX_4V, VEX_WIG;
 
   defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, VR128,
-                             i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+                             i128mem, SchedWriteShuffle.XMM, load, 0>,
                              VEX_4V, VEX_WIG;
   defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128,
-                             i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+                             i128mem, SchedWriteShuffle.XMM, load, 0>,
                              VEX_4V;
 }
 
 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
   defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss, VR256,
-                              i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
+                              i256mem, SchedWriteShuffle.YMM, load, 0>,
                               VEX_4V, VEX_L, VEX_WIG;
   defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, VR256,
-                              i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
+                              i256mem, SchedWriteShuffle.YMM, load, 0>,
                               VEX_4V, VEX_L, VEX_WIG;
 
   defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus, VR256,
-                              i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
+                              i256mem, SchedWriteShuffle.YMM, load, 0>,
                               VEX_4V, VEX_L, VEX_WIG;
   defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, VR256,
-                              i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
+                              i256mem, SchedWriteShuffle.YMM, load, 0>,
                               VEX_4V, VEX_L;
 }
 
 let Constraints = "$src1 = $dst" in {
   defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, VR128,
-                            i128mem, SchedWriteShuffle.XMM, memopv2i64>;
+                            i128mem, SchedWriteShuffle.XMM, memop>;
   defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, VR128,
-                            i128mem, SchedWriteShuffle.XMM, memopv2i64>;
+                            i128mem, SchedWriteShuffle.XMM, memop>;
 
   defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, VR128,
-                            i128mem, SchedWriteShuffle.XMM, memopv2i64>;
+                            i128mem, SchedWriteShuffle.XMM, memop>;
 
   defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, VR128,
-                            i128mem, SchedWriteShuffle.XMM, memopv2i64>;
+                            i128mem, SchedWriteShuffle.XMM, memop>;
 }
 } // ExeDomain = SSEPackedInt
 
@@ -3754,89 +3791,88 @@ multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
       !if(Is2Addr,
           !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-      [(set RC:$dst, (vt (OpNode RC:$src1,
-                                  (bitconvert (ld_frag addr:$src2)))))]>,
+      [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
   defm VPUNPCKLBW  : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, VR128,
-                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
                                  VEX_4V, VEX_WIG;
   defm VPUNPCKLWD  : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, VR128,
-                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
                                  VEX_4V, VEX_WIG;
   defm VPUNPCKHBW  : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, VR128,
-                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
                                  VEX_4V, VEX_WIG;
   defm VPUNPCKHWD  : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, VR128,
-                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
                                  VEX_4V, VEX_WIG;
 }
 
 let Predicates = [HasAVX, NoVLX] in {
   defm VPUNPCKLDQ  : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, VR128,
-                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
                                  VEX_4V, VEX_WIG;
   defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, VR128,
-                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
                                  VEX_4V, VEX_WIG;
   defm VPUNPCKHDQ  : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, VR128,
-                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
                                  VEX_4V, VEX_WIG;
   defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, VR128,
-                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
                                  VEX_4V, VEX_WIG;
 }
 
 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
   defm VPUNPCKLBWY  : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl, VR256,
-                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
+                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPUNPCKLWDY  : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl, VR256,
-                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
+                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPUNPCKHBWY  : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh, VR256,
-                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
+                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPUNPCKHWDY  : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh, VR256,
-                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
+                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
 }
 
 let Predicates = [HasAVX2, NoVLX] in {
   defm VPUNPCKLDQY  : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl, VR256,
-                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
+                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, VR256,
-                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
+                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPUNPCKHDQY  : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh, VR256,
-                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
+                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPUNPCKHQDQY : sse2_unpack<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, VR256,
-                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
+                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
 }
 
 let Constraints = "$src1 = $dst" in {
   defm PUNPCKLBW  : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, VR128,
-                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
+                                i128mem, SchedWriteShuffle.XMM, memop>;
   defm PUNPCKLWD  : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, VR128,
-                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
+                                i128mem, SchedWriteShuffle.XMM, memop>;
   defm PUNPCKLDQ  : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, VR128,
-                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
+                                i128mem, SchedWriteShuffle.XMM, memop>;
   defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, VR128,
-                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
+                                i128mem, SchedWriteShuffle.XMM, memop>;
 
   defm PUNPCKHBW  : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, VR128,
-                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
+                                i128mem, SchedWriteShuffle.XMM, memop>;
   defm PUNPCKHWD  : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, VR128,
-                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
+                                i128mem, SchedWriteShuffle.XMM, memop>;
   defm PUNPCKHDQ  : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, VR128,
-                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
+                                i128mem, SchedWriteShuffle.XMM, memop>;
   defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, VR128,
-                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
+                                i128mem, SchedWriteShuffle.XMM, memop>;
 }
 } // ExeDomain = SSEPackedInt
 
@@ -4155,7 +4191,7 @@ let Predicates = [UseAVX] in {
             (VMOVDI2PDIrm addr:$src)>;
   def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
             (VMOVDI2PDIrm addr:$src)>;
-  def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
             (VMOVDI2PDIrm addr:$src)>;
   def : Pat<(v4i32 (X86vzload addr:$src)),
             (VMOVDI2PDIrm addr:$src)>;
@@ -4180,7 +4216,7 @@ let Predicates = [UseSSE2] in {
             (MOVDI2PDIrm addr:$src)>;
   def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
             (MOVDI2PDIrm addr:$src)>;
-  def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
             (MOVDI2PDIrm addr:$src)>;
   def : Pat<(v4i32 (X86vzload addr:$src)),
             (MOVDI2PDIrm addr:$src)>;
@@ -4335,30 +4371,30 @@ defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
 let Predicates = [HasAVX, NoVLX] in {
   def : Pat<(v4i32 (X86Movshdup VR128:$src)),
             (VMOVSHDUPrr VR128:$src)>;
-  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i32 (X86Movshdup (load addr:$src))),
             (VMOVSHDUPrm addr:$src)>;
   def : Pat<(v4i32 (X86Movsldup VR128:$src)),
             (VMOVSLDUPrr VR128:$src)>;
-  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i32 (X86Movsldup (load addr:$src))),
             (VMOVSLDUPrm addr:$src)>;
   def : Pat<(v8i32 (X86Movshdup VR256:$src)),
             (VMOVSHDUPYrr VR256:$src)>;
-  def : Pat<(v8i32 (X86Movshdup (bc_v8i32 (loadv4i64 addr:$src)))),
+  def : Pat<(v8i32 (X86Movshdup (load addr:$src))),
             (VMOVSHDUPYrm addr:$src)>;
   def : Pat<(v8i32 (X86Movsldup VR256:$src)),
             (VMOVSLDUPYrr VR256:$src)>;
-  def : Pat<(v8i32 (X86Movsldup (bc_v8i32 (loadv4i64 addr:$src)))),
+  def : Pat<(v8i32 (X86Movsldup (load addr:$src))),
             (VMOVSLDUPYrm addr:$src)>;
 }
 
 let Predicates = [UseSSE3] in {
   def : Pat<(v4i32 (X86Movshdup VR128:$src)),
             (MOVSHDUPrr VR128:$src)>;
-  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))),
+  def : Pat<(v4i32 (X86Movshdup (memop addr:$src))),
             (MOVSHDUPrm addr:$src)>;
   def : Pat<(v4i32 (X86Movsldup VR128:$src)),
             (MOVSLDUPrr VR128:$src)>;
-  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))),
+  def : Pat<(v4i32 (X86Movsldup (memop addr:$src))),
             (MOVSLDUPrm addr:$src)>;
 }
 
@@ -4580,7 +4616,7 @@ multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt,
                  (ins i128mem:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR128:$dst,
-                   (vt (OpNode (bitconvert (ld_frag addr:$src)))))]>,
+                   (vt (OpNode (ld_frag addr:$src))))]>,
                  Sched<[sched.XMM.Folded]>;
 }
 
@@ -4597,19 +4633,19 @@ multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
                   (ins i256mem:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(set VR256:$dst,
-                    (vt (OpNode (bitconvert (loadv4i64 addr:$src)))))]>,
+                    (vt (OpNode (load addr:$src))))]>,
                   Sched<[sched.YMM.Folded]>;
 }
 
 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
   defm VPABSB  : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, SchedWriteVecALU,
-                              loadv2i64>, VEX, VEX_WIG;
+                              load>, VEX, VEX_WIG;
   defm VPABSW  : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, SchedWriteVecALU,
-                              loadv2i64>, VEX, VEX_WIG;
+                              load>, VEX, VEX_WIG;
 }
 let Predicates = [HasAVX, NoVLX] in {
   defm VPABSD  : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, SchedWriteVecALU,
-                              loadv2i64>, VEX, VEX_WIG;
+                              load>, VEX, VEX_WIG;
 }
 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
   defm VPABSB  : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs, SchedWriteVecALU>,
@@ -4623,11 +4659,11 @@ let Predicates = [HasAVX2, NoVLX] in {
 }
 
 defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, SchedWriteVecALU,
-                          memopv2i64>;
+                          memop>;
 defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, SchedWriteVecALU,
-                          memopv2i64>;
+                          memop>;
 defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, SchedWriteVecALU,
-                          memopv2i64>;
+                          memop>;
 
 //===---------------------------------------------------------------------===//
 // SSSE3 - Packed Binary Operator Instructions
@@ -4652,8 +4688,7 @@ multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
        [(set RC:$dst,
-         (DstVT (OpNode (OpVT RC:$src1),
-          (bitconvert (memop_frag addr:$src2)))))]>,
+         (DstVT (OpNode (OpVT RC:$src1), (memop_frag addr:$src2))))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
@@ -4675,8 +4710,7 @@ multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
        [(set VR128:$dst,
-         (IntId128 VR128:$src1,
-          (bitconvert (ld_frag addr:$src2))))]>,
+         (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
@@ -4693,83 +4727,83 @@ multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
        (ins VR256:$src1, i256mem:$src2),
        !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
        [(set VR256:$dst,
-         (IntId256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2))))]>,
+         (IntId256 VR256:$src1, (load addr:$src2)))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
 let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
 let isCommutable = 0 in {
   defm VPSHUFB    : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8,
-                                  VR128, loadv2i64, i128mem,
+                                  VR128, load, i128mem,
                                   SchedWriteVarShuffle.XMM, 0>, VEX_4V, VEX_WIG;
   defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16,
-                                  v16i8, VR128, loadv2i64, i128mem,
+                                  v16i8, VR128, load, i128mem,
                                   SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
 }
 defm VPMULHRSW    : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16,
-                                  VR128, loadv2i64, i128mem,
+                                  VR128, load, i128mem,
                                   SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
 }
 
 let ImmT = NoImm, Predicates = [HasAVX] in {
 let isCommutable = 0 in {
   defm VPHADDW    : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128,
-                                  loadv2i64, i128mem,
+                                  load, i128mem,
                                   SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
   defm VPHADDD    : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128,
-                                  loadv2i64, i128mem,
+                                  load, i128mem,
                                   SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
   defm VPHSUBW    : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128,
-                                  loadv2i64, i128mem,
+                                  load, i128mem,
                                   SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
   defm VPHSUBD    : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128,
-                                  loadv2i64, i128mem,
+                                  load, i128mem,
                                   SchedWritePHAdd.XMM, 0>, VEX_4V;
   defm VPSIGNB    : SS3I_binop_rm_int<0x08, "vpsignb",
                                       int_x86_ssse3_psign_b_128,
-                                      SchedWriteVecALU.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG;
+                                      SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
   defm VPSIGNW    : SS3I_binop_rm_int<0x09, "vpsignw",
                                       int_x86_ssse3_psign_w_128,
-                                      SchedWriteVecALU.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG;
+                                      SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
   defm VPSIGND    : SS3I_binop_rm_int<0x0A, "vpsignd",
                                       int_x86_ssse3_psign_d_128,
-                                      SchedWriteVecALU.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG;
+                                      SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
   defm VPHADDSW   : SS3I_binop_rm_int<0x03, "vphaddsw",
                                       int_x86_ssse3_phadd_sw_128,
-                                      SchedWritePHAdd.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG;
+                                      SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
   defm VPHSUBSW   : SS3I_binop_rm_int<0x07, "vphsubsw",
                                       int_x86_ssse3_phsub_sw_128,
-                                      SchedWritePHAdd.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG;
+                                      SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
 }
 }
 
 let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
 let isCommutable = 0 in {
   defm VPSHUFBY   : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8,
-                                  VR256, loadv4i64, i256mem,
+                                  VR256, load, i256mem,
                                   SchedWriteVarShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
   defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16,
-                                   v32i8, VR256, loadv4i64, i256mem,
+                                   v32i8, VR256, load, i256mem,
                                    SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
 }
 defm VPMULHRSWY   : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16,
-                                  VR256, loadv4i64, i256mem,
+                                  VR256, load, i256mem,
                                   SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
 }
 
 let ImmT = NoImm, Predicates = [HasAVX2] in {
 let isCommutable = 0 in {
   defm VPHADDWY   : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16,
-                                  VR256, loadv4i64, i256mem,
+                                  VR256, load, i256mem,
                                   SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
   defm VPHADDDY   : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256,
-                                  loadv4i64, i256mem,
+                                  load, i256mem,
                                   SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
   defm VPHSUBWY   : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16,
-                                  VR256, loadv4i64, i256mem,
+                                  VR256, load, i256mem,
                                   SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
   defm VPHSUBDY   : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256,
-                                  loadv4i64, i256mem,
+                                  load, i256mem,
                                   SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L;
   defm VPSIGNB   : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
                                        SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
@@ -4790,33 +4824,33 @@ let isCommutable = 0 in {
 let ImmT = NoImm, Constraints = "$src1 = $dst" in {
 let isCommutable = 0 in {
   defm PHADDW    : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128,
-                                 memopv2i64, i128mem, SchedWritePHAdd.XMM>;
+                                 memop, i128mem, SchedWritePHAdd.XMM>;
   defm PHADDD    : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128,
-                                 memopv2i64, i128mem, SchedWritePHAdd.XMM>;
+                                 memop, i128mem, SchedWritePHAdd.XMM>;
   defm PHSUBW    : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128,
-                                 memopv2i64, i128mem, SchedWritePHAdd.XMM>;
+                                 memop, i128mem, SchedWritePHAdd.XMM>;
   defm PHSUBD    : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128,
-                                 memopv2i64, i128mem, SchedWritePHAdd.XMM>;
+                                 memop, i128mem, SchedWritePHAdd.XMM>;
   defm PSIGNB    : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128,
-                                     SchedWriteVecALU.XMM, memopv2i64>;
+                                     SchedWriteVecALU.XMM, memop>;
   defm PSIGNW    : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128,
-                                     SchedWriteVecALU.XMM, memopv2i64>;
+                                     SchedWriteVecALU.XMM, memop>;
   defm PSIGND    : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128,
-                                     SchedWriteVecALU.XMM, memopv2i64>;
+                                     SchedWriteVecALU.XMM, memop>;
   defm PSHUFB    : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
-                                 memopv2i64, i128mem, SchedWriteVarShuffle.XMM>;
+                                 memop, i128mem, SchedWriteVarShuffle.XMM>;
   defm PHADDSW   : SS3I_binop_rm_int<0x03, "phaddsw",
                                      int_x86_ssse3_phadd_sw_128,
-                                     SchedWritePHAdd.XMM, memopv2i64>;
+                                     SchedWritePHAdd.XMM, memop>;
   defm PHSUBSW   : SS3I_binop_rm_int<0x07, "phsubsw",
                                      int_x86_ssse3_phsub_sw_128,
-                                     SchedWritePHAdd.XMM, memopv2i64>;
+                                     SchedWritePHAdd.XMM, memop>;
   defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
-                                 v16i8, VR128, memopv2i64, i128mem,
+                                 v16i8, VR128, memop, i128mem,
                                  SchedWriteVecIMul.XMM>;
 }
 defm PMULHRSW    : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
-                                 VR128, memopv2i64, i128mem, SchedWriteVecIMul.XMM>;
+                                 VR128, memop, i128mem, SchedWriteVecIMul.XMM>;
 }
 
 //===---------------------------------------------------------------------===//
@@ -4843,20 +4877,20 @@ multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
         !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
       [(set RC:$dst, (VT (X86PAlignr RC:$src1,
-                                     (bitconvert (memop_frag addr:$src2)),
+                                     (memop_frag addr:$src2),
                                      (i8 imm:$src3))))]>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
   }
 }
 
 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
-  defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, loadv2i64, i128mem,
+  defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, load, i128mem,
                                 SchedWriteShuffle.XMM, 0>, VEX_4V, VEX_WIG;
 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
-  defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, loadv4i64, i256mem,
+  defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, load, i256mem,
                                  SchedWriteShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
 let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
-  defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memopv2i64, i128mem,
+  defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memop, i128mem,
                                SchedWriteShuffle.XMM>;
 
 //===---------------------------------------------------------------------===//
@@ -4980,7 +5014,7 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtO
 
   // AVX2 Register-Memory patterns
   let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
-  def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
   def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
@@ -4994,7 +5028,7 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtO
             (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
   def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
-  def : Pat<(v8i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v8i32 (ExtOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
 
   def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
@@ -5003,10 +5037,10 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtO
             (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
   def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i64 (ExtOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
 
-  def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+  def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
             (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
   def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
@@ -5019,10 +5053,10 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtO
             (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
   def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i64 (ExtOp (loadv8i16 addr:$src))),
             (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
 
-  def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
             (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
   def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
@@ -5082,7 +5116,7 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
             (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
   def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
-  def : Pat<(v8i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v8i16 (ExtOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
   }
   let Predicates = [HasAVX, NoVLX] in {
@@ -5092,7 +5126,7 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
             (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
   def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
-  def : Pat<(v4i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i32 (ExtOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
 
   def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
@@ -5101,7 +5135,7 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
             (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
   def : Pat<(v2i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
-  def : Pat<(v2i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v2i64 (ExtOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
 
   def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
@@ -5112,7 +5146,7 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
             (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
   def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
-  def : Pat<(v4i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i32 (ExtOp (loadv8i16 addr:$src))),
             (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
 
   def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
@@ -5121,7 +5155,7 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
             (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
   def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
-  def : Pat<(v2i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+  def : Pat<(v2i64 (ExtOp (loadv8i16 addr:$src))),
             (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
 
   def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
@@ -5132,7 +5166,7 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
             (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
   def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
-  def : Pat<(v2i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
+  def : Pat<(v2i64 (ExtOp (loadv4i32 addr:$src))),
             (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
   }
 }
@@ -5950,7 +5984,7 @@ multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
                   (ins i128mem:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(set VR128:$dst,
-                    (v8i16 (OpNode (v8i16 (bitconvert (ld_frag addr:$src))))))]>,
+                    (v8i16 (OpNode (ld_frag addr:$src))))]>,
                  Sched<[Sched.Folded]>;
 }
 
@@ -5958,10 +5992,10 @@ multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
 // model, although the naming is misleading.
 let Predicates = [HasAVX] in
 defm VPHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "vphminposuw",
-                                         X86phminpos, loadv2i64,
+                                         X86phminpos, load,
                                          WritePHMINPOS>, VEX, VEX_WIG;
 defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw",
-                                         X86phminpos, memopv2i64,
+                                         X86phminpos, memop,
                                          WritePHMINPOS>;
 
 /// SS48I_binop_rm - Simple SSE41 binary operator.
@@ -5983,118 +6017,118 @@ multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
        [(set RC:$dst,
-         (OpVT (OpNode RC:$src1, (bitconvert (memop_frag addr:$src2)))))]>,
+         (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
 let Predicates = [HasAVX, NoVLX] in {
   defm VPMINSD   : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
-                                  loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
+                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                   VEX_4V, VEX_WIG;
   defm VPMINUD   : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
-                                  loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
+                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                   VEX_4V, VEX_WIG;
   defm VPMAXSD   : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
-                                  loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
+                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                   VEX_4V, VEX_WIG;
   defm VPMAXUD   : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
-                                  loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
+                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                   VEX_4V, VEX_WIG;
   defm VPMULDQ   : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128,
-                                  loadv2i64, i128mem, SchedWriteVecIMul.XMM, 0>,
+                                  load, i128mem, SchedWriteVecIMul.XMM, 0>,
                                   VEX_4V, VEX_WIG;
 }
 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
   defm VPMINSB   : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
-                                  loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
+                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                   VEX_4V, VEX_WIG;
   defm VPMINUW   : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
-                                  loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
+                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                   VEX_4V, VEX_WIG;
   defm VPMAXSB   : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
-                                  loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
+                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                   VEX_4V, VEX_WIG;
   defm VPMAXUW   : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
-                                  loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
+                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                   VEX_4V, VEX_WIG;
 }
 
 let Predicates = [HasAVX2, NoVLX] in {
   defm VPMINSDY  : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
-                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
+                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPMINUDY  : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
-                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
+                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPMAXSDY  : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
-                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
+                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPMAXUDY  : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
-                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
+                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPMULDQY  : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256,
-                                  loadv4i64, i256mem, SchedWriteVecIMul.YMM, 0>,
+                                  load, i256mem, SchedWriteVecIMul.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
 }
 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
   defm VPMINSBY  : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
-                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
+                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPMINUWY  : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
-                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
+                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPMAXSBY  : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
-                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
+                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPMAXUWY  : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
-                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
+                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
 }
 
 let Constraints = "$src1 = $dst" in {
   defm PMINSB   : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128,
-                                 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
+                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
   defm PMINSD   : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128,
-                                 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
+                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
   defm PMINUD   : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128,
-                                 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
+                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
   defm PMINUW   : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128,
-                                 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
+                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
   defm PMAXSB   : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128,
-                                 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
+                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
   defm PMAXSD   : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128,
-                                 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
+                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
   defm PMAXUD   : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128,
-                                 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
+                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
   defm PMAXUW   : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128,
-                                 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
+                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
   defm PMULDQ   : SS48I_binop_rm<0x28, "pmuldq", X86pmuldq, v2i64, VR128,
-                                 memopv2i64, i128mem, SchedWriteVecIMul.XMM, 1>;
+                                 memop, i128mem, SchedWriteVecIMul.XMM, 1>;
 }
 
 let Predicates = [HasAVX, NoVLX] in
   defm VPMULLD  : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
-                                 loadv2i64, i128mem, SchedWritePMULLD.XMM, 0>,
+                                 load, i128mem, SchedWritePMULLD.XMM, 0>,
                                  VEX_4V, VEX_WIG;
 let Predicates = [HasAVX] in
   defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
-                                 loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
+                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                                  VEX_4V, VEX_WIG;
 
 let Predicates = [HasAVX2, NoVLX] in
   defm VPMULLDY  : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
-                                  loadv4i64, i256mem, SchedWritePMULLD.YMM, 0>,
+                                  load, i256mem, SchedWritePMULLD.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
 let Predicates = [HasAVX2] in
   defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
-                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
+                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
 
 let Constraints = "$src1 = $dst" in {
   defm PMULLD  : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
-                                memopv2i64, i128mem, SchedWritePMULLD.XMM, 1>;
+                                memop, i128mem, SchedWritePMULLD.XMM, 1>;
   defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
-                                memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
+                                memop, i128mem, SchedWriteVecALU.XMM, 1>;
 }
 
 /// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
@@ -6120,8 +6154,7 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
             !strconcat(OpcodeStr,
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
         [(set RC:$dst,
-          (IntId RC:$src1,
-           (bitconvert (memop_frag addr:$src2)), imm:$src3))]>,
+          (IntId RC:$src1, (memop_frag addr:$src2), imm:$src3))]>,
         Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
@@ -6148,8 +6181,7 @@ multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
             !strconcat(OpcodeStr,
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
         [(set RC:$dst,
-          (OpVT (OpNode RC:$src1,
-                 (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>,
+          (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), imm:$src3)))]>,
         Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
@@ -6171,28 +6203,28 @@ def BlendCommuteImm8 : SDNodeXForm<imm, [{
 let Predicates = [HasAVX] in {
   let isCommutable = 0 in {
     defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
-                                        VR128, loadv2i64, i128mem, 0,
+                                        VR128, load, i128mem, 0,
                                         SchedWriteMPSAD.XMM>, VEX_4V, VEX_WIG;
   }
 
   let ExeDomain = SSEPackedSingle in
   defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
-                                   VR128, loadv4f32, f128mem, 0,
+                                   VR128, load, f128mem, 0,
                                    SchedWriteDPPS.XMM>, VEX_4V, VEX_WIG;
   let ExeDomain = SSEPackedDouble in
   defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
-                                   VR128, loadv2f64, f128mem, 0,
+                                   VR128, load, f128mem, 0,
                                    SchedWriteDPPD.XMM>, VEX_4V, VEX_WIG;
   let ExeDomain = SSEPackedSingle in
   defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
-                                    VR256, loadv8f32, i256mem, 0,
+                                    VR256, load, i256mem, 0,
                                     SchedWriteDPPS.YMM>, VEX_4V, VEX_L, VEX_WIG;
 }
 
 let Predicates = [HasAVX2] in {
   let isCommutable = 0 in {
   defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
-                                  VR256, loadv4i64, i256mem, 0,
+                                  VR256, load, i256mem, 0,
                                   SchedWriteMPSAD.YMM>, VEX_4V, VEX_L, VEX_WIG;
   }
 }
@@ -6200,17 +6232,17 @@ let Predicates = [HasAVX2] in {
 let Constraints = "$src1 = $dst" in {
   let isCommutable = 0 in {
   defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
-                                     VR128, memopv2i64, i128mem, 1,
+                                     VR128, memop, i128mem, 1,
                                      SchedWriteMPSAD.XMM>;
   }
 
   let ExeDomain = SSEPackedSingle in
   defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
-                                  VR128, memopv4f32, f128mem, 1,
+                                  VR128, memop, f128mem, 1,
                                   SchedWriteDPPS.XMM>;
   let ExeDomain = SSEPackedDouble in
   defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
-                                  VR128, memopv2f64, f128mem, 1,
+                                  VR128, memop, f128mem, 1,
                                   SchedWriteDPPD.XMM>;
 }
 
@@ -6238,56 +6270,54 @@ let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
             !strconcat(OpcodeStr,
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
         [(set RC:$dst,
-          (OpVT (OpNode RC:$src1,
-                 (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>,
+          (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), imm:$src3)))]>,
         Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
   // Pattern to commute if load is in first source.
-  def : Pat<(OpVT (OpNode (bitconvert (memop_frag addr:$src2)),
-                          RC:$src1, imm:$src3)),
+  def : Pat<(OpVT (OpNode (memop_frag addr:$src2), RC:$src1, imm:$src3)),
             (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
                                             (commuteXForm imm:$src3))>;
 }
 
 let Predicates = [HasAVX] in {
   defm VBLENDPS : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32,
-                                  VR128, loadv4f32, f128mem, 0, SSEPackedSingle,
+                                  VR128, load, f128mem, 0, SSEPackedSingle,
                                   SchedWriteFBlend.XMM, BlendCommuteImm4>,
                                   VEX_4V, VEX_WIG;
   defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32,
-                                   VR256, loadv8f32, f256mem, 0, SSEPackedSingle,
+                                   VR256, load, f256mem, 0, SSEPackedSingle,
                                    SchedWriteFBlend.YMM, BlendCommuteImm8>,
                                    VEX_4V, VEX_L, VEX_WIG;
   defm VBLENDPD : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
-                                  VR128, loadv2f64, f128mem, 0, SSEPackedDouble,
+                                  VR128, load, f128mem, 0, SSEPackedDouble,
                                   SchedWriteFBlend.XMM, BlendCommuteImm2>,
                                   VEX_4V, VEX_WIG;
   defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
-                                   VR256, loadv4f64, f256mem, 0, SSEPackedDouble,
+                                   VR256, load, f256mem, 0, SSEPackedDouble,
                                    SchedWriteFBlend.YMM, BlendCommuteImm4>,
                                    VEX_4V, VEX_L, VEX_WIG;
   defm VPBLENDW : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
-                                  VR128, loadv2i64, i128mem, 0, SSEPackedInt,
+                                  VR128, load, i128mem, 0, SSEPackedInt,
                                   SchedWriteBlend.XMM, BlendCommuteImm8>,
                                   VEX_4V, VEX_WIG;
 }
 
 let Predicates = [HasAVX2] in {
   defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
-                                   VR256, loadv4i64, i256mem, 0, SSEPackedInt,
+                                   VR256, load, i256mem, 0, SSEPackedInt,
                                    SchedWriteBlend.YMM, BlendCommuteImm8>,
                                    VEX_4V, VEX_L, VEX_WIG;
 }
 
 defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32,
-                               VR128, memopv4f32, f128mem, 1, SSEPackedSingle,
+                               VR128, memop, f128mem, 1, SSEPackedSingle,
                                SchedWriteFBlend.XMM, BlendCommuteImm4>;
 defm BLENDPD : SS41I_blend_rmi<0x0D, "blendpd", X86Blendi, v2f64,
-                               VR128, memopv2f64, f128mem, 1, SSEPackedDouble,
+                               VR128, memop, f128mem, 1, SSEPackedDouble,
                                SchedWriteFBlend.XMM, BlendCommuteImm2>;
 defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16,
-                               VR128, memopv2i64, i128mem, 1, SSEPackedInt,
+                               VR128, memop, i128mem, 1, SSEPackedInt,
                                SchedWriteBlend.XMM, BlendCommuteImm8>;
 
 // For insertion into the zero index (low half) of a 256-bit vector, it is
@@ -6321,7 +6351,7 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
                   !strconcat(OpcodeStr,
                     "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                   [(set RC:$dst,
-                        (IntId RC:$src1, (bitconvert (mem_frag addr:$src2)),
+                        (IntId RC:$src1, (mem_frag addr:$src2),
                                RC:$src3))], SSEPackedInt>, TAPD, VEX_4V,
                 Sched<[sched.Folded, sched.ReadAfterFold,
                        // x86memop:$src2
@@ -6334,7 +6364,7 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
 let Predicates = [HasAVX] in {
 let ExeDomain = SSEPackedDouble in {
 defm VBLENDVPD  : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem,
-                                           loadv2f64, int_x86_sse41_blendvpd,
+                                           load, int_x86_sse41_blendvpd,
                                            SchedWriteFVarBlend.XMM>;
 defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
                                   loadv4f64, int_x86_avx_blendv_pd_256,
@@ -6342,20 +6372,20 @@ defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
 } // ExeDomain = SSEPackedDouble
 let ExeDomain = SSEPackedSingle in {
 defm VBLENDVPS  : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem,
-                                           loadv4f32, int_x86_sse41_blendvps,
+                                           load, int_x86_sse41_blendvps,
                                            SchedWriteFVarBlend.XMM>;
 defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem,
                                   loadv8f32, int_x86_avx_blendv_ps_256,
                                   SchedWriteFVarBlend.YMM>, VEX_L;
 } // ExeDomain = SSEPackedSingle
 defm VPBLENDVB  : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem,
-                                           loadv2i64, int_x86_sse41_pblendvb,
+                                           load, int_x86_sse41_pblendvb,
                                            SchedWriteVarBlend.XMM>;
 }
 
 let Predicates = [HasAVX2] in {
 defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem,
-                                      loadv4i64, int_x86_avx2_pblendvb,
+                                      load, int_x86_avx2_pblendvb,
                                       SchedWriteVarBlend.YMM>, VEX_L;
 }
 
@@ -6486,18 +6516,18 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in {
                      "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
                     [(set VR128:$dst,
                       (IntId VR128:$src1,
-                       (bitconvert (mem_frag addr:$src2)), XMM0))]>,
+                       (mem_frag addr:$src2), XMM0))]>,
                     Sched<[sched.Folded, sched.ReadAfterFold]>;
   }
 }
 
 let ExeDomain = SSEPackedDouble in
-defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, f128mem,
+defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memop, f128mem,
                                   int_x86_sse41_blendvpd, SchedWriteFVarBlend.XMM>;
 let ExeDomain = SSEPackedSingle in
-defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, f128mem,
+defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memop, f128mem,
                                   int_x86_sse41_blendvps, SchedWriteFVarBlend.XMM>;
-defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem,
+defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memop, i128mem,
                                   int_x86_sse41_pblendvb, SchedWriteVarBlend.XMM>;
 
 // Aliases with the implicit xmm0 argument
@@ -6553,6 +6583,12 @@ let Predicates = [HasAVX2, NoVLX] in {
             (VMOVNTDQAYrm addr:$src)>;
   def : Pat<(v4i64 (alignednontemporalload addr:$src)),
             (VMOVNTDQAYrm addr:$src)>;
+  def : Pat<(v8i32 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAYrm addr:$src)>;
+  def : Pat<(v16i16 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAYrm addr:$src)>;
+  def : Pat<(v32i8 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAYrm addr:$src)>;
 }
 
 let Predicates = [HasAVX, NoVLX] in {
@@ -6562,6 +6598,12 @@ let Predicates = [HasAVX, NoVLX] in {
             (VMOVNTDQArm addr:$src)>;
   def : Pat<(v2i64 (alignednontemporalload addr:$src)),
             (VMOVNTDQArm addr:$src)>;
+  def : Pat<(v4i32 (alignednontemporalload addr:$src)),
+            (VMOVNTDQArm addr:$src)>;
+  def : Pat<(v8i16 (alignednontemporalload addr:$src)),
+            (VMOVNTDQArm addr:$src)>;
+  def : Pat<(v16i8 (alignednontemporalload addr:$src)),
+            (VMOVNTDQArm addr:$src)>;
 }
 
 let Predicates = [UseSSE41] in {
@@ -6571,6 +6613,12 @@ let Predicates = [UseSSE41] in {
             (MOVNTDQArm addr:$src)>;
   def : Pat<(v2i64 (alignednontemporalload addr:$src)),
             (MOVNTDQArm addr:$src)>;
+  def : Pat<(v4i32 (alignednontemporalload addr:$src)),
+            (MOVNTDQArm addr:$src)>;
+  def : Pat<(v8i16 (alignednontemporalload addr:$src)),
+            (MOVNTDQArm addr:$src)>;
+  def : Pat<(v16i8 (alignednontemporalload addr:$src)),
+            (MOVNTDQArm addr:$src)>;
 }
 
 } // AddedComplexity
@@ -6603,17 +6651,17 @@ multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
 
 let Predicates = [HasAVX] in
   defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
-                                 loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
+                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                                  VEX_4V, VEX_WIG;
 
 let Predicates = [HasAVX2] in
   defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
-                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
+                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
 
 let Constraints = "$src1 = $dst" in
   defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
-                                memopv2i64, i128mem, SchedWriteVecALU.XMM>;
+                                memop, i128mem, SchedWriteVecALU.XMM>;
 
 //===----------------------------------------------------------------------===//
 // SSE4.2 - String/text Processing Instructions
@@ -6764,9 +6812,9 @@ multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
              [!if(UsesXMM0,
                   (set VR128:$dst, (IntId VR128:$src1,
-                    (bc_v4i32 (memopv2i64 addr:$src2)), XMM0)),
+                    (memop addr:$src2), XMM0)),
                   (set VR128:$dst, (IntId VR128:$src1,
-                    (bc_v4i32 (memopv2i64 addr:$src2)))))]>, T8,
+                    (memop addr:$src2))))]>, T8,
              Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
@@ -6783,7 +6831,7 @@ let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
                          "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                          [(set VR128:$dst,
                            (int_x86_sha1rnds4 VR128:$src1,
-                            (bc_v4i32 (memopv2i64 addr:$src2)),
+                            (memop addr:$src2),
                             (i8 imm:$src3)))]>, TA,
                          Sched<[SchedWriteVecIMul.XMM.Folded,
                                 SchedWriteVecIMul.XMM.ReadAfterFold]>;
@@ -6836,39 +6884,39 @@ multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
 // Perform One Round of an AES Encryption/Decryption Flow
 let Predicates = [HasAVX, NoVLX_Or_NoVAES, HasAES] in {
   defm VAESENC          : AESI_binop_rm_int<0xDC, "vaesenc",
-                         int_x86_aesni_aesenc, loadv2i64>, VEX_4V, VEX_WIG;
+                         int_x86_aesni_aesenc, load>, VEX_4V, VEX_WIG;
   defm VAESENCLAST      : AESI_binop_rm_int<0xDD, "vaesenclast",
-                         int_x86_aesni_aesenclast, loadv2i64>, VEX_4V, VEX_WIG;
+                         int_x86_aesni_aesenclast, load>, VEX_4V, VEX_WIG;
   defm VAESDEC          : AESI_binop_rm_int<0xDE, "vaesdec",
-                         int_x86_aesni_aesdec, loadv2i64>, VEX_4V, VEX_WIG;
+                         int_x86_aesni_aesdec, load>, VEX_4V, VEX_WIG;
   defm VAESDECLAST      : AESI_binop_rm_int<0xDF, "vaesdeclast",
-                         int_x86_aesni_aesdeclast, loadv2i64>, VEX_4V, VEX_WIG;
+                         int_x86_aesni_aesdeclast, load>, VEX_4V, VEX_WIG;
 }
 
 let Predicates = [NoVLX, HasVAES] in {
   defm VAESENCY         : AESI_binop_rm_int<0xDC, "vaesenc",
-                         int_x86_aesni_aesenc_256, loadv4i64, 0, VR256,
+                         int_x86_aesni_aesenc_256, load, 0, VR256,
                          i256mem>, VEX_4V, VEX_L, VEX_WIG;
   defm VAESENCLASTY     : AESI_binop_rm_int<0xDD, "vaesenclast",
-                         int_x86_aesni_aesenclast_256, loadv4i64, 0, VR256,
+                         int_x86_aesni_aesenclast_256, load, 0, VR256,
                          i256mem>, VEX_4V, VEX_L, VEX_WIG;
   defm VAESDECY         : AESI_binop_rm_int<0xDE, "vaesdec",
-                         int_x86_aesni_aesdec_256, loadv4i64, 0, VR256,
+                         int_x86_aesni_aesdec_256, load, 0, VR256,
                          i256mem>, VEX_4V, VEX_L, VEX_WIG;
   defm VAESDECLASTY     : AESI_binop_rm_int<0xDF, "vaesdeclast",
-                         int_x86_aesni_aesdeclast_256, loadv4i64, 0, VR256,
+                         int_x86_aesni_aesdeclast_256, load, 0, VR256,
                          i256mem>, VEX_4V, VEX_L, VEX_WIG;
 }
 
 let Constraints = "$src1 = $dst" in {
   defm AESENC          : AESI_binop_rm_int<0xDC, "aesenc",
-                         int_x86_aesni_aesenc, memopv2i64, 1>;
+                         int_x86_aesni_aesenc, memop, 1>;
   defm AESENCLAST      : AESI_binop_rm_int<0xDD, "aesenclast",
-                         int_x86_aesni_aesenclast, memopv2i64, 1>;
+                         int_x86_aesni_aesenclast, memop, 1>;
   defm AESDEC          : AESI_binop_rm_int<0xDE, "aesdec",
-                         int_x86_aesni_aesdec, memopv2i64, 1>;
+                         int_x86_aesni_aesdec, memop, 1>;
   defm AESDECLAST      : AESI_binop_rm_int<0xDF, "aesdeclast",
-                         int_x86_aesni_aesdeclast, memopv2i64, 1>;
+                         int_x86_aesni_aesdeclast, memop, 1>;
 }
 
 // Perform the AES InvMixColumn Transformation
@@ -6882,7 +6930,7 @@ let Predicates = [HasAVX, HasAES] in {
   def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
       (ins i128mem:$src1),
       "vaesimc\t{$src1, $dst|$dst, $src1}",
-      [(set VR128:$dst, (int_x86_aesni_aesimc (loadv2i64 addr:$src1)))]>,
+      [(set VR128:$dst, (int_x86_aesni_aesimc (load addr:$src1)))]>,
       Sched<[WriteAESIMC.Folded]>, VEX, VEX_WIG;
 }
 def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
@@ -6893,7 +6941,7 @@ def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
 def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
   (ins i128mem:$src1),
   "aesimc\t{$src1, $dst|$dst, $src1}",
-  [(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>,
+  [(set VR128:$dst, (int_x86_aesni_aesimc (memop addr:$src1)))]>,
   Sched<[WriteAESIMC.Folded]>;
 
 // AES Round Key Generation Assist
@@ -6908,7 +6956,7 @@ let Predicates = [HasAVX, HasAES] in {
       (ins i128mem:$src1, u8imm:$src2),
       "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
       [(set VR128:$dst,
-        (int_x86_aesni_aeskeygenassist (loadv2i64 addr:$src1), imm:$src2))]>,
+        (int_x86_aesni_aeskeygenassist (load addr:$src1), imm:$src2))]>,
       Sched<[WriteAESKeyGen.Folded]>, VEX, VEX_WIG;
 }
 def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
@@ -6921,7 +6969,7 @@ def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
   (ins i128mem:$src1, u8imm:$src2),
   "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
   [(set VR128:$dst,
-    (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>,
+    (int_x86_aesni_aeskeygenassist (memop addr:$src1), imm:$src2))]>,
   Sched<[WriteAESKeyGen.Folded]>;
 
 //===----------------------------------------------------------------------===//
@@ -6949,12 +6997,12 @@ let Predicates = [NoAVX, HasPCLMUL] in {
               (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
               "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
               [(set VR128:$dst,
-                 (int_x86_pclmulqdq VR128:$src1, (memopv2i64 addr:$src2),
+                 (int_x86_pclmulqdq VR128:$src1, (memop addr:$src2),
                   imm:$src3))]>,
               Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
   } // Constraints = "$src1 = $dst"
 
-  def : Pat<(int_x86_pclmulqdq (memopv2i64 addr:$src2), VR128:$src1,
+  def : Pat<(int_x86_pclmulqdq (memop addr:$src2), VR128:$src1,
                                 (i8 imm:$src3)),
             (PCLMULQDQrm VR128:$src1, addr:$src2,
                           (PCLMULCommuteImm imm:$src3))>;
@@ -6997,11 +7045,11 @@ multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp,
 }
 
 let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in
-defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, loadv2i64,
+defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, load,
                              int_x86_pclmulqdq>, VEX_4V, VEX_WIG;
 
 let Predicates = [NoVLX, HasVPCLMULQDQ] in
-defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, loadv4i64,
+defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, load,
                               int_x86_pclmulqdq_256>, VEX_4V, VEX_L, VEX_WIG;
 
 multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC,
@@ -7157,11 +7205,11 @@ def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
 let Predicates = [HasAVX2, NoVLX] in {
 def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
           (VBROADCASTI128 addr:$src)>;
-def : Pat<(v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src)))),
+def : Pat<(v8i32 (X86SubVBroadcast (loadv4i32 addr:$src))),
           (VBROADCASTI128 addr:$src)>;
-def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
+def : Pat<(v16i16 (X86SubVBroadcast (loadv8i16 addr:$src))),
           (VBROADCASTI128 addr:$src)>;
-def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
+def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))),
           (VBROADCASTI128 addr:$src)>;
 }
 
@@ -7175,11 +7223,11 @@ def : Pat<(v8f32 (X86SubVBroadcast (loadv4f32 addr:$src))),
 let Predicates = [HasAVX1Only] in {
 def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
           (VBROADCASTF128 addr:$src)>;
-def : Pat<(v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src)))),
+def : Pat<(v8i32 (X86SubVBroadcast (loadv4i32 addr:$src))),
           (VBROADCASTF128 addr:$src)>;
-def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
+def : Pat<(v16i16 (X86SubVBroadcast (loadv8i16 addr:$src))),
           (VBROADCASTF128 addr:$src)>;
-def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
+def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))),
           (VBROADCASTF128 addr:$src)>;
 }
 
@@ -7212,7 +7260,7 @@ multiclass vinsert_lowering<string InstrStr, ValueType From, ValueType To,
             (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR128:$src2,
                                        (INSERT_get_vinsert128_imm VR256:$ins))>;
   def : Pat<(vinsert128_insert:$ins (To VR256:$src1),
-                                    (From (bitconvert (memop_frag addr:$src2))),
+                                    (From (memop_frag addr:$src2)),
                                     (iPTR imm)),
             (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
                                        (INSERT_get_vinsert128_imm VR256:$ins))>;
@@ -7225,9 +7273,9 @@ let Predicates = [HasAVX, NoVLX] in {
 
 let Predicates = [HasAVX1Only] in {
   defm : vinsert_lowering<"VINSERTF128", v2i64, v4i64,  loadv2i64>;
-  defm : vinsert_lowering<"VINSERTF128", v4i32, v8i32,  loadv2i64>;
-  defm : vinsert_lowering<"VINSERTF128", v8i16, v16i16, loadv2i64>;
-  defm : vinsert_lowering<"VINSERTF128", v16i8, v32i8,  loadv2i64>;
+  defm : vinsert_lowering<"VINSERTF128", v4i32, v8i32,  loadv4i32>;
+  defm : vinsert_lowering<"VINSERTF128", v8i16, v16i16, loadv8i16>;
+  defm : vinsert_lowering<"VINSERTF128", v16i8, v32i8,  loadv16i8>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -7316,7 +7364,7 @@ defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
 
 multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
                       RegisterClass RC, X86MemOperand x86memop_f,
-                      X86MemOperand x86memop_i, PatFrag i_frag,
+                      X86MemOperand x86memop_i,
                       ValueType f_vt, ValueType i_vt,
                       X86FoldableSchedWrite sched,
                       X86FoldableSchedWrite varsched> {
@@ -7330,7 +7378,7 @@ multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
                (ins RC:$src1, x86memop_i:$src2),
                !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1,
-                              (i_vt (bitconvert (i_frag addr:$src2))))))]>, VEX_4V,
+                              (i_vt (load addr:$src2)))))]>, VEX_4V,
                Sched<[varsched.Folded, sched.ReadAfterFold]>;
 
     def ri  : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
@@ -7349,18 +7397,18 @@ multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
 
 let ExeDomain = SSEPackedSingle in {
   defm VPERMILPS  : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
-                               loadv2i64, v4f32, v4i32, SchedWriteFShuffle.XMM,
+                               v4f32, v4i32, SchedWriteFShuffle.XMM,
                                SchedWriteFVarShuffle.XMM>;
   defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
-                               loadv4i64, v8f32, v8i32, SchedWriteFShuffle.YMM,
+                               v8f32, v8i32, SchedWriteFShuffle.YMM,
                                SchedWriteFVarShuffle.YMM>, VEX_L;
 }
 let ExeDomain = SSEPackedDouble in {
   defm VPERMILPD  : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
-                               loadv2i64, v2f64, v2i64, SchedWriteFShuffle.XMM,
+                               v2f64, v2i64, SchedWriteFShuffle.XMM,
                                SchedWriteFVarShuffle.XMM>;
   defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
-                               loadv4i64, v4f64, v4i64, SchedWriteFShuffle.YMM,
+                               v4f64, v4i64, SchedWriteFShuffle.YMM,
                                SchedWriteFVarShuffle.YMM>, VEX_L;
 }
 
@@ -7441,8 +7489,7 @@ multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop,
   let hasSideEffects = 0, mayLoad = 1 in
   def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
              "vcvtph2ps\t{$src, $dst|$dst, $src}",
-             [(set RC:$dst, (X86cvtph2ps (bc_v8i16
-                                          (loadv2i64 addr:$src))))]>,
+             [(set RC:$dst, (X86cvtph2ps (loadv8i16 addr:$src)))]>,
              T8PD, VEX, Sched<[sched.Folded]>;
 }
 
@@ -7516,7 +7563,7 @@ let Predicates = [HasF16C, NoVLX] in {
 /// AVX2_blend_rmi - AVX2 blend with 8-bit immediate
 multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           ValueType OpVT, X86FoldableSchedWrite sched,
-                          RegisterClass RC, PatFrag memop_frag,
+                          RegisterClass RC,
                           X86MemOperand x86memop, SDNodeXForm commuteXForm> {
   let isCommutable = 1 in
   def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
@@ -7530,22 +7577,20 @@ multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
         !strconcat(OpcodeStr,
             "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
         [(set RC:$dst,
-          (OpVT (OpNode RC:$src1,
-           (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>,
+          (OpVT (OpNode RC:$src1, (load addr:$src2), imm:$src3)))]>,
         Sched<[sched.Folded, sched.ReadAfterFold]>, VEX_4V;
 
   // Pattern to commute if load is in first source.
-  def : Pat<(OpVT (OpNode (bitconvert (memop_frag addr:$src2)),
-                          RC:$src1, imm:$src3)),
+  def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, imm:$src3)),
             (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
                                             (commuteXForm imm:$src3))>;
 }
 
 defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32,
-                               SchedWriteBlend.XMM, VR128, loadv2i64, i128mem,
+                               SchedWriteBlend.XMM, VR128, i128mem,
                                BlendCommuteImm4>;
 defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32,
-                                SchedWriteBlend.YMM, VR256, loadv4i64, i256mem,
+                                SchedWriteBlend.YMM, VR256, i256mem,
                                 BlendCommuteImm8>, VEX_L;
 
 // For insertion into the zero index (low half) of a 256-bit vector, it is
@@ -7779,7 +7824,7 @@ let Predicates = [HasAVX1Only] in {
 // VPERM - Permute instructions
 //
 
-multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
+multiclass avx2_perm<bits<8> opc, string OpcodeStr,
                      ValueType OpVT, X86FoldableSchedWrite Sched,
                      X86MemOperand memOp> {
   let Predicates = [HasAVX2, NoVLX] in {
@@ -7796,16 +7841,14 @@ multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set VR256:$dst,
                        (OpVT (X86VPermv VR256:$src1,
-                              (bitconvert (mem_frag addr:$src2)))))]>,
+                              (load addr:$src2))))]>,
                      Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX_4V, VEX_L;
   }
 }
 
-defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32, WriteVarShuffle256,
-                        i256mem>;
+defm VPERMD : avx2_perm<0x36, "vpermd", v8i32, WriteVarShuffle256, i256mem>;
 let ExeDomain = SSEPackedSingle in
-defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFVarShuffle256,
-                        f256mem>;
+defm VPERMPS : avx2_perm<0x16, "vpermps", v8f32, WriteFVarShuffle256, f256mem>;
 
 multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                          ValueType OpVT, X86FoldableSchedWrite Sched,
@@ -7875,9 +7918,9 @@ def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
 
 let Predicates = [HasAVX2, NoVLX] in {
   defm : vinsert_lowering<"VINSERTI128", v2i64, v4i64,  loadv2i64>;
-  defm : vinsert_lowering<"VINSERTI128", v4i32, v8i32,  loadv2i64>;
-  defm : vinsert_lowering<"VINSERTI128", v8i16, v16i16, loadv2i64>;
-  defm : vinsert_lowering<"VINSERTI128", v16i8, v32i8,  loadv2i64>;
+  defm : vinsert_lowering<"VINSERTI128", v4i32, v8i32,  loadv4i32>;
+  defm : vinsert_lowering<"VINSERTI128", v8i16, v16i16, loadv8i16>;
+  defm : vinsert_lowering<"VINSERTI128", v16i8, v32i8,  loadv16i8>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -8036,7 +8079,7 @@ multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set VR128:$dst,
                (vt128 (OpNode VR128:$src1,
-                       (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>,
+                       (vt128 (load addr:$src2)))))]>,
              VEX_4V, Sched<[SchedWriteVarVecShift.XMM.Folded,
                             SchedWriteVarVecShift.XMM.ReadAfterFold]>;
   def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
@@ -8050,7 +8093,7 @@ multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set VR256:$dst,
                (vt256 (OpNode VR256:$src1,
-                       (vt256 (bitconvert (loadv4i64 addr:$src2))))))]>,
+                       (vt256 (load addr:$src2)))))]>,
              VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded,
                                    SchedWriteVarVecShift.YMM.ReadAfterFold]>;
 }
@@ -8064,13 +8107,11 @@ let Predicates = [HasAVX2, NoVLX] in {
 
   def : Pat<(v4i32 (X86vsrav VR128:$src1, VR128:$src2)),
             (VPSRAVDrr VR128:$src1, VR128:$src2)>;
-  def : Pat<(v4i32 (X86vsrav VR128:$src1,
-                    (bitconvert (loadv2i64 addr:$src2)))),
+  def : Pat<(v4i32 (X86vsrav VR128:$src1, (load addr:$src2))),
             (VPSRAVDrm VR128:$src1, addr:$src2)>;
   def : Pat<(v8i32 (X86vsrav VR256:$src1, VR256:$src2)),
             (VPSRAVDYrr VR256:$src1, VR256:$src2)>;
-  def : Pat<(v8i32 (X86vsrav VR256:$src1,
-                    (bitconvert (loadv4i64 addr:$src2)))),
+  def : Pat<(v8i32 (X86vsrav VR256:$src1, (load addr:$src2))),
             (VPSRAVDYrm VR256:$src1, addr:$src2)>;
 }
 
@@ -8152,7 +8193,7 @@ multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT,
 
     def rm : PDI<0xCF, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2), "",
                  [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1,
-                                 (bitconvert (MemOpFrag addr:$src2)))))]>,
+                                 (MemOpFrag addr:$src2))))]>,
              Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>, T8PD;
   }
 }
@@ -8170,7 +8211,7 @@ multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT,
   def rmi : Ii8<Op, MRMSrcMem, (outs RC:$dst),
               (ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "",
               [(set RC:$dst, (OpVT (OpNode RC:$src1,
-                                    (bitconvert (MemOpFrag addr:$src2)),
+                                    (MemOpFrag addr:$src2),
                               imm:$src3)))], SSEPackedInt>,
               Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>;
   }
@@ -8180,24 +8221,24 @@ multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> {
   let Constraints = "$src1 = $dst",
       Predicates  = [HasGFNI, UseSSE2] in
   defm NAME         : GF2P8AFFINE_rmi<Op, OpStr, v16i8, OpNode,
-                                      VR128, loadv2i64, i128mem, 1>;
+                                      VR128, load, i128mem, 1>;
   let Predicates  = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
     defm V##NAME    : GF2P8AFFINE_rmi<Op, "v"##OpStr, v16i8, OpNode, VR128,
-                                      loadv2i64, i128mem>, VEX_4V, VEX_W;
+                                      load, i128mem>, VEX_4V, VEX_W;
     defm V##NAME##Y : GF2P8AFFINE_rmi<Op, "v"##OpStr, v32i8, OpNode, VR256,
-                                      loadv4i64, i256mem>, VEX_4V, VEX_L, VEX_W;
+                                      load, i256mem>, VEX_4V, VEX_L, VEX_W;
   }
 }
 
 // GF2P8MULB
 let Constraints = "$src1 = $dst",
     Predicates  = [HasGFNI, UseSSE2] in
-defm GF2P8MULB      : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memopv2i64,
+defm GF2P8MULB      : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memop,
                                     i128mem, 1>;
 let Predicates  = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
-  defm VGF2P8MULB   : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, loadv2i64,
+  defm VGF2P8MULB   : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, load,
                                    i128mem>, VEX_4V;
-  defm VGF2P8MULBY  : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, loadv4i64,
+  defm VGF2P8MULBY  : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, load,
                                    i256mem>, VEX_4V, VEX_L;
 }
 // GF2P8AFFINEINVQB, GF2P8AFFINEQB
diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td
index a8013e38e63..39f50c10ae1 100644
--- a/lib/Target/X86/X86InstrXOP.td
+++ b/lib/Target/X86/X86InstrXOP.td
@@ -11,32 +11,32 @@
 //
 //===----------------------------------------------------------------------===//
 
-multiclass xop2op<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> {
+multiclass xop2op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
   def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
            !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
            [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[SchedWritePHAdd.XMM]>;
   def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
            !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-           [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP,
+           [(set VR128:$dst, (Int (load addr:$src)))]>, XOP,
            Sched<[SchedWritePHAdd.XMM.Folded, SchedWritePHAdd.XMM.ReadAfterFold]>;
 }
 
 let ExeDomain = SSEPackedInt in {
-  defm VPHSUBWD  : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd, loadv2i64>;
-  defm VPHSUBDQ  : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq, loadv2i64>;
-  defm VPHSUBBW  : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw, loadv2i64>;
-  defm VPHADDWQ  : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq, loadv2i64>;
-  defm VPHADDWD  : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd, loadv2i64>;
-  defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq, loadv2i64>;
-  defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd, loadv2i64>;
-  defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq, loadv2i64>;
-  defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw, loadv2i64>;
-  defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq, loadv2i64>;
-  defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd, loadv2i64>;
-  defm VPHADDDQ  : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq, loadv2i64>;
-  defm VPHADDBW  : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw, loadv2i64>;
-  defm VPHADDBQ  : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq, loadv2i64>;
-  defm VPHADDBD  : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd, loadv2i64>;
+  defm VPHSUBWD  : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd>;
+  defm VPHSUBDQ  : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq>;
+  defm VPHSUBBW  : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw>;
+  defm VPHADDWQ  : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq>;
+  defm VPHADDWD  : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd>;
+  defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq>;
+  defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd>;
+  defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq>;
+  defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw>;
+  defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq>;
+  defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd>;
+  defm VPHADDDQ  : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq>;
+  defm VPHADDBW  : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw>;
+  defm VPHADDBQ  : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq>;
+  defm VPHADDBD  : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd>;
 }
 
 // Scalar load 2 addr operand instructions
@@ -48,47 +48,47 @@ multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int,
            [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[sched]>;
   def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins memop:$src),
            !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-           [(set VR128:$dst, (Int (bitconvert mem_cpat:$src)))]>, XOP,
+           [(set VR128:$dst, (Int mem_cpat:$src))]>, XOP,
            Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
 multiclass xop2op128<bits<8> opc, string OpcodeStr, Intrinsic Int,
-                     PatFrag memop, X86FoldableSchedWrite sched> {
+                     X86FoldableSchedWrite sched> {
   def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
            !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
            [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[sched]>;
   def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
            !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-           [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP,
+           [(set VR128:$dst, (Int (load addr:$src)))]>, XOP,
            Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
 multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int,
-                     PatFrag memop, X86FoldableSchedWrite sched> {
+                     X86FoldableSchedWrite sched> {
   def Yrr : IXOP<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
            !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
            [(set VR256:$dst, (Int VR256:$src))]>, XOP, VEX_L, Sched<[sched]>;
   def Yrm : IXOP<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
            !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-           [(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP, VEX_L,
+           [(set VR256:$dst, (Int (load addr:$src)))]>, XOP, VEX_L,
            Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
 let ExeDomain = SSEPackedSingle in {
   defm VFRCZSS : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss,
                            ssmem, sse_load_f32, SchedWriteFRnd.Scl>;
-  defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, loadv4f32,
+  defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps,
                            SchedWriteFRnd.XMM>;
-  defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256, loadv8f32,
+  defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256,
                            SchedWriteFRnd.YMM>;
 }
 
 let ExeDomain = SSEPackedDouble in {
   defm VFRCZSD : xop2opsld<0x83, "vfrczsd", int_x86_xop_vfrcz_sd,
                            sdmem, sse_load_f64, SchedWriteFRnd.Scl>;
-  defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, loadv2f64,
+  defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd,
                            SchedWriteFRnd.XMM>;
-  defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, loadv4f64,
+  defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256,
                            SchedWriteFRnd.YMM>;
 }
 
@@ -105,13 +105,13 @@ multiclass xop3op<bits<8> opc, string OpcodeStr, SDNode OpNode,
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [(set VR128:$dst,
               (vt128 (OpNode (vt128 VR128:$src1),
-                             (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>,
+                             (vt128 (load addr:$src2)))))]>,
            XOP_4V, VEX_W, Sched<[sched.Folded, sched.ReadAfterFold]>;
   def mr : IXOP<opc, MRMSrcMem4VOp3, (outs VR128:$dst),
            (ins i128mem:$src1, VR128:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [(set VR128:$dst,
-              (vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))),
+              (vt128 (OpNode (vt128 (load addr:$src1)),
                              (vt128 VR128:$src2))))]>,
              XOP, Sched<[sched.Folded, sched.ReadAfterFold]>;
   // For disassembler
@@ -150,7 +150,7 @@ multiclass xop3opimm<bits<8> opc, string OpcodeStr, SDNode OpNode,
            (ins i128mem:$src1, u8imm:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [(set VR128:$dst,
-              (vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))), imm:$src2)))]>,
+              (vt128 (OpNode (vt128 (load addr:$src1)), imm:$src2)))]>,
            XOP, Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
@@ -181,7 +181,7 @@ multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int,
            !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
            [(set VR128:$dst,
-              (Int VR128:$src1, (bitconvert (loadv2i64 addr:$src2)),
+              (Int VR128:$src1, (load addr:$src2),
               VR128:$src3))]>, XOP_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
@@ -260,7 +260,7 @@ multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128,
              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set VR128:$dst,
                 (vt128 (OpNode (vt128 VR128:$src1),
-                               (vt128 (bitconvert (loadv2i64 addr:$src2))),
+                               (vt128 (load addr:$src2)),
                                 imm:$cc)))]>,
              XOP_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
     let isAsmParserOnly = 1, hasSideEffects = 0 in {
@@ -279,7 +279,7 @@ multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128,
     }
   }
 
-  def : Pat<(OpNode (bitconvert (loadv2i64 addr:$src2)),
+  def : Pat<(OpNode (load addr:$src2),
                     (vt128 VR128:$src1), imm:$cc),
             (!cast<Instruction>(NAME#"mi") VR128:$src1, addr:$src2,
                                            (CommuteVPCOMCC imm:$cc))>;
@@ -310,14 +310,14 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, SDNode OpNode,
             "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
             [(set VR128:$dst,
               (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
-                             (vt128 (bitconvert (loadv2i64 addr:$src3))))))]>,
+                             (vt128 (load addr:$src3)))))]>,
             XOP_4V, VEX_W, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
   def rmr : IXOPi8Reg<opc, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2, VR128:$src3),
             !strconcat(OpcodeStr,
             "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
             [(set VR128:$dst,
-              (v16i8 (OpNode (vt128 VR128:$src1), (vt128 (bitconvert (loadv2i64 addr:$src2))),
+              (v16i8 (OpNode (vt128 VR128:$src1), (vt128 (load addr:$src2)),
                              (vt128 VR128:$src3))))]>,
             XOP_4V, Sched<[sched.Folded, sched.ReadAfterFold,
                            // 128mem:$src2
@@ -401,8 +401,7 @@ multiclass xop_vpermil2<bits<8> Opc, string OpcodeStr, RegisterClass RC,
         !strconcat(OpcodeStr,
         "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
         [(set RC:$dst,
-          (VT (X86vpermil2 RC:$src1, RC:$src2,
-                           (bitconvert (IntLdFrag addr:$src3)),
+          (VT (X86vpermil2 RC:$src1, RC:$src2, (IntLdFrag addr:$src3),
                            (i8 imm:$src4))))]>, VEX_W,
         Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
   def mr : IXOP5<Opc, MRMSrcMem, (outs RC:$dst),
@@ -437,10 +436,10 @@ let ExeDomain = SSEPackedDouble in {
 
 let ExeDomain = SSEPackedSingle in {
   defm VPERMIL2PS : xop_vpermil2<0x48, "vpermil2ps", VR128, i128mem, f128mem,
-                                 v4f32, loadv4f32, loadv2i64,
+                                 v4f32, loadv4f32, loadv4i32,
                                  SchedWriteFVarShuffle.XMM>;
   defm VPERMIL2PSY : xop_vpermil2<0x48, "vpermil2ps", VR256, i256mem, f256mem,
-                                  v8f32, loadv8f32, loadv4i64,
+                                  v8f32, loadv8f32, loadv8i32,
                                   SchedWriteFVarShuffle.YMM>, VEX_L;
 }
 
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index 58b1c505944..b5fd9f4a785 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -1391,7 +1391,7 @@ static const Constant *getConstantFromPool(const MachineInstr &MI,
   if (ConstantEntry.isMachineConstantPoolEntry())
     return nullptr;
 
-  auto *C = dyn_cast<Constant>(ConstantEntry.Val.ConstVal);
+  const Constant *C = ConstantEntry.Val.ConstVal;
   assert((!C || ConstantEntry.getType() == C->getType()) &&
          "Expected a constant of the same type!");
   return C;
@@ -1594,6 +1594,18 @@ void X86AsmPrinter::EmitSEHInstruction(const MachineInstr *MI) {
   }
 }
 
+static unsigned getRegisterWidth(const MCOperandInfo &Info) {
+  if (Info.RegClass == X86::VR128RegClassID ||
+      Info.RegClass == X86::VR128XRegClassID)
+    return 128;
+  if (Info.RegClass == X86::VR256RegClassID ||
+      Info.RegClass == X86::VR256XRegClassID)
+    return 256;
+  if (Info.RegClass == X86::VR512RegClassID)
+    return 512;
+  llvm_unreachable("Unknown register class!");
+}
+
 void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
   X86MCInstLower MCInstLowering(*MF, *this);
   const X86RegisterInfo *RI =
@@ -1879,8 +1891,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
 
     const MachineOperand &MaskOp = MI->getOperand(MaskIdx);
     if (auto *C = getConstantFromPool(*MI, MaskOp)) {
+      unsigned Width = getRegisterWidth(MI->getDesc().OpInfo[0]);
       SmallVector<int, 64> Mask;
-      DecodePSHUFBMask(C, Mask);
+      DecodePSHUFBMask(C, Width, Mask);
       if (!Mask.empty())
         OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask),
                                 !EnablePrintSchedInfo);
@@ -1951,8 +1964,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
 
     const MachineOperand &MaskOp = MI->getOperand(MaskIdx);
     if (auto *C = getConstantFromPool(*MI, MaskOp)) {
+      unsigned Width = getRegisterWidth(MI->getDesc().OpInfo[0]);
       SmallVector<int, 16> Mask;
-      DecodeVPERMILPMask(C, ElSize, Mask);
+      DecodeVPERMILPMask(C, ElSize, Width, Mask);
       if (!Mask.empty())
         OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask),
                                 !EnablePrintSchedInfo);
@@ -1982,8 +1996,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
 
     const MachineOperand &MaskOp = MI->getOperand(6);
     if (auto *C = getConstantFromPool(*MI, MaskOp)) {
+      unsigned Width = getRegisterWidth(MI->getDesc().OpInfo[0]);
       SmallVector<int, 16> Mask;
-      DecodeVPERMIL2PMask(C, (unsigned)CtrlOp.getImm(), ElSize, Mask);
+      DecodeVPERMIL2PMask(C, (unsigned)CtrlOp.getImm(), ElSize, Width, Mask);
       if (!Mask.empty())
         OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask),
                                 !EnablePrintSchedInfo);
@@ -1999,8 +2014,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
 
     const MachineOperand &MaskOp = MI->getOperand(6);
     if (auto *C = getConstantFromPool(*MI, MaskOp)) {
+      unsigned Width = getRegisterWidth(MI->getDesc().OpInfo[0]);
       SmallVector<int, 16> Mask;
-      DecodeVPPERMMask(C, Mask);
+      DecodeVPPERMMask(C, Width, Mask);
       if (!Mask.empty())
         OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask),
                                 !EnablePrintSchedInfo);
diff --git a/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp b/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
index c7ddf93f8e8..720be8afa62 100644
--- a/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
+++ b/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
@@ -112,11 +112,10 @@ static bool extractConstantMask(const Constant *C, unsigned MaskEltSizeInBits,
   return true;
 }
 
-void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
-  Type *MaskTy = C->getType();
-  unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
-  (void)MaskTySize;
-  assert((MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512) &&
+void DecodePSHUFBMask(const Constant *C, unsigned Width,
+                      SmallVectorImpl<int> &ShuffleMask) {
+  assert((Width == 128 || Width == 256 || Width == 512) &&
+         C->getType()->getPrimitiveSizeInBits() >= Width &&
          "Unexpected vector size.");
 
   // The shuffle mask requires a byte vector.
@@ -125,7 +124,7 @@ void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
   if (!extractConstantMask(C, 8, UndefElts, RawMask))
     return;
 
-  unsigned NumElts = RawMask.size();
+  unsigned NumElts = Width / 8;
   assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
          "Unexpected number of vector elements.");
 
@@ -151,12 +150,10 @@ void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
   }
 }
 
-void DecodeVPERMILPMask(const Constant *C, unsigned ElSize,
+void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, unsigned Width,
                         SmallVectorImpl<int> &ShuffleMask) {
-  Type *MaskTy = C->getType();
-  unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
-  (void)MaskTySize;
-  assert((MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512) &&
+  assert((Width == 128 || Width == 256 || Width == 512) &&
+         C->getType()->getPrimitiveSizeInBits() >= Width &&
          "Unexpected vector size.");
   assert((ElSize == 32 || ElSize == 64) && "Unexpected vector element size.");
 
@@ -166,7 +163,7 @@ void DecodeVPERMILPMask(const Constant *C, unsigned ElSize,
   if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
     return;
 
-  unsigned NumElts = RawMask.size();
+  unsigned NumElts = Width / ElSize;
   unsigned NumEltsPerLane = 128 / ElSize;
   assert((NumElts == 2 || NumElts == 4 || NumElts == 8 || NumElts == 16) &&
          "Unexpected number of vector elements.");
@@ -189,11 +186,13 @@ void DecodeVPERMILPMask(const Constant *C, unsigned ElSize,
 }
 
 void DecodeVPERMIL2PMask(const Constant *C, unsigned M2Z, unsigned ElSize,
+                         unsigned Width,
                          SmallVectorImpl<int> &ShuffleMask) {
   Type *MaskTy = C->getType();
   unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
   (void)MaskTySize;
-  assert((MaskTySize == 128 || MaskTySize == 256) && "Unexpected vector size.");
+  assert((MaskTySize == 128 || MaskTySize == 256) &&
+         Width >= MaskTySize && "Unexpected vector size.");
 
   // The shuffle mask requires elements the same size as the target.
   APInt UndefElts;
@@ -201,7 +200,7 @@ void DecodeVPERMIL2PMask(const Constant *C, unsigned M2Z, unsigned ElSize,
   if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
     return;
 
-  unsigned NumElts = RawMask.size();
+  unsigned NumElts = Width / ElSize;
   unsigned NumEltsPerLane = 128 / ElSize;
   assert((NumElts == 2 || NumElts == 4 || NumElts == 8) &&
          "Unexpected number of vector elements.");
@@ -242,9 +241,12 @@ void DecodeVPERMIL2PMask(const Constant *C, unsigned M2Z, unsigned ElSize,
   }
 }
 
-void DecodeVPPERMMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
-  assert(C->getType()->getPrimitiveSizeInBits() == 128 &&
-         "Unexpected vector size.");
+void DecodeVPPERMMask(const Constant *C, unsigned Width,
+                      SmallVectorImpl<int> &ShuffleMask) {
+  Type *MaskTy = C->getType();
+  unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
+  (void)MaskTySize;
+  assert(Width == 128 && Width >= MaskTySize && "Unexpected vector size.");
 
   // The shuffle mask requires a byte vector.
   APInt UndefElts;
@@ -252,7 +254,7 @@ void DecodeVPPERMMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
   if (!extractConstantMask(C, 8, UndefElts, RawMask))
     return;
 
-  unsigned NumElts = RawMask.size();
+  unsigned NumElts = Width / 8;
   assert(NumElts == 16 && "Unexpected number of vector elements.");
 
   for (unsigned i = 0; i != NumElts; ++i) {
@@ -291,12 +293,10 @@ void DecodeVPPERMMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
   }
 }
 
-void DecodeVPERMVMask(const Constant *C, unsigned ElSize,
+void DecodeVPERMVMask(const Constant *C, unsigned ElSize, unsigned Width,
                       SmallVectorImpl<int> &ShuffleMask) {
-  Type *MaskTy = C->getType();
-  unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
-  (void)MaskTySize;
-  assert((MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512) &&
+  assert((Width == 128 || Width == 256 || Width == 512) &&
+         C->getType()->getPrimitiveSizeInBits() >= Width &&
          "Unexpected vector size.");
   assert((ElSize == 8 || ElSize == 16 || ElSize == 32 || ElSize == 64) &&
          "Unexpected vector element size.");
@@ -307,7 +307,7 @@ void DecodeVPERMVMask(const Constant *C, unsigned ElSize,
   if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
     return;
 
-  unsigned NumElts = RawMask.size();
+  unsigned NumElts = Width / ElSize;
 
   for (unsigned i = 0; i != NumElts; ++i) {
     if (UndefElts[i]) {
@@ -319,12 +319,10 @@ void DecodeVPERMVMask(const Constant *C, unsigned ElSize,
   }
 }
 
-void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize,
+void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize, unsigned Width,
                        SmallVectorImpl<int> &ShuffleMask) {
-  Type *MaskTy = C->getType();
-  unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
-  (void)MaskTySize;
-  assert((MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512) &&
+  assert((Width == 128 || Width == 256 || Width == 512) &&
+         C->getType()->getPrimitiveSizeInBits() >= Width &&
          "Unexpected vector size.");
   assert((ElSize == 8 || ElSize == 16 || ElSize == 32 || ElSize == 64) &&
          "Unexpected vector element size.");
@@ -335,7 +333,7 @@ void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize,
   if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
     return;
 
-  unsigned NumElts = RawMask.size();
+  unsigned NumElts = Width / ElSize;
 
   for (unsigned i = 0; i != NumElts; ++i) {
     if (UndefElts[i]) {
diff --git a/lib/Target/X86/X86ShuffleDecodeConstantPool.h b/lib/Target/X86/X86ShuffleDecodeConstantPool.h
index b703cbbd2b2..b08c31935d2 100644
--- a/lib/Target/X86/X86ShuffleDecodeConstantPool.h
+++ b/lib/Target/X86/X86ShuffleDecodeConstantPool.h
@@ -26,25 +26,28 @@ class Constant;
 class MVT;
 
 /// Decode a PSHUFB mask from an IR-level vector constant.
-void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask);
+void DecodePSHUFBMask(const Constant *C, unsigned Width,
+                      SmallVectorImpl<int> &ShuffleMask);
 
 /// Decode a VPERMILP variable mask from an IR-level vector constant.
-void DecodeVPERMILPMask(const Constant *C, unsigned ElSize,
+void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, unsigned Width,
                         SmallVectorImpl<int> &ShuffleMask);
 
 /// Decode a VPERMILP2 variable mask from an IR-level vector constant.
 void DecodeVPERMIL2PMask(const Constant *C, unsigned MatchImm, unsigned ElSize,
+                         unsigned Width,
                          SmallVectorImpl<int> &ShuffleMask);
 
 /// Decode a VPPERM variable mask from an IR-level vector constant.
-void DecodeVPPERMMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask);
+void DecodeVPPERMMask(const Constant *C, unsigned Width,
+                      SmallVectorImpl<int> &ShuffleMask);
 
 /// Decode a VPERM W/D/Q/PS/PD mask from an IR-level vector constant.
-void DecodeVPERMVMask(const Constant *C, unsigned ElSize,
+void DecodeVPERMVMask(const Constant *C, unsigned ElSize, unsigned Width,
                       SmallVectorImpl<int> &ShuffleMask);
 
 /// Decode a VPERMT2 W/D/Q/PS/PD mask from an IR-level vector constant.
-void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize,
+void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize, unsigned Width,
                        SmallVectorImpl<int> &ShuffleMask);
 
 } // llvm namespace
diff --git a/test/CodeGen/X86/avx-vperm2x128.ll b/test/CodeGen/X86/avx-vperm2x128.ll
index 75a11845b1e..0c501ea6895 100644
--- a/test/CodeGen/X86/avx-vperm2x128.ll
+++ b/test/CodeGen/X86/avx-vperm2x128.ll
@@ -224,7 +224,7 @@ entry:
 define <16 x i16> @shuffle_v16i16_4501_mem(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readnone ssp {
 ; AVX1-LABEL: shuffle_v16i16_4501_mem:
 ; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[0,1],ymm0[0,1]
diff --git a/test/CodeGen/X86/oddshuffles.ll b/test/CodeGen/X86/oddshuffles.ll
index 6affef33932..9216cad5882 100644
--- a/test/CodeGen/X86/oddshuffles.ll
+++ b/test/CodeGen/X86/oddshuffles.ll
@@ -1630,7 +1630,7 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2,
 ; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7]
 ; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3]
 ; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7]
-; AVX2-SLOW-NEXT:    vbroadcastsd 24(%rsi), %ymm5
+; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm5 = ymm0[0,3,3,3]
 ; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
 ; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
 ; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2]
@@ -1654,19 +1654,19 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2,
 ; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
 ; AVX2-FAST-NEXT:    vbroadcastsd %xmm2, %ymm4
 ; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
-; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
-; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm4 = ymm2[1,1,2,2]
-; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3],ymm0[4],ymm4[5,6],ymm0[7]
-; AVX2-FAST-NEXT:    vpermilps {{.*#+}} ymm4 = ymm1[0,0,3,3,4,4,7,7]
-; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7]
-; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm4 = [5,6,5,6,5,6,7,7]
-; AVX2-FAST-NEXT:    vpermps %ymm1, %ymm4, %ymm1
+; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm4 = ymm0[1,1,2,2]
+; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm5 = ymm2[1,1,2,2]
+; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7]
+; AVX2-FAST-NEXT:    vpermilps {{.*#+}} ymm5 = ymm1[0,0,3,3,4,4,7,7]
+; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
+; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm5 = [5,6,5,6,5,6,7,7]
+; AVX2-FAST-NEXT:    vpermps %ymm1, %ymm5, %ymm1
 ; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3]
 ; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
-; AVX2-FAST-NEXT:    vbroadcastsd 24(%rsi), %ymm2
-; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
-; AVX2-FAST-NEXT:    vmovups %ymm1, 64(%rdi)
-; AVX2-FAST-NEXT:    vmovups %ymm0, 32(%rdi)
+; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,3,3]
+; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX2-FAST-NEXT:    vmovups %ymm0, 64(%rdi)
+; AVX2-FAST-NEXT:    vmovups %ymm4, 32(%rdi)
 ; AVX2-FAST-NEXT:    vmovups %ymm3, (%rdi)
 ; AVX2-FAST-NEXT:    vzeroupper
 ; AVX2-FAST-NEXT:    retq
diff --git a/test/CodeGen/X86/pshufb-mask-comments.ll b/test/CodeGen/X86/pshufb-mask-comments.ll
index 0900fdccb49..d0ed99f92f3 100644
--- a/test/CodeGen/X86/pshufb-mask-comments.ll
+++ b/test/CodeGen/X86/pshufb-mask-comments.ll
@@ -57,9 +57,9 @@ define <16 x i8> @test5(<16 x i8> %V) {
 ; CHECK-NEXT:    movl $1, %eax
 ; CHECK-NEXT:    movq %rax, %xmm1
 ; CHECK-NEXT:    movdqa %xmm1, (%rax)
-; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [1,1]
-; CHECK-NEXT:    movdqa %xmm1, (%rax)
-; CHECK-NEXT:    pshufb %xmm1, %xmm0
+; CHECK-NEXT:    movaps {{.*#+}} xmm1 = [1,1]
+; CHECK-NEXT:    movaps %xmm1, (%rax)
+; CHECK-NEXT:    pshufb (%rax), %xmm0
 ; CHECK-NEXT:    retq
   store <2 x i64> <i64 1, i64 0>, <2 x i64>* undef, align 16
   %l = load <2 x i64>, <2 x i64>* undef, align 16
diff --git a/test/CodeGen/X86/vector-extend-inreg.ll b/test/CodeGen/X86/vector-extend-inreg.ll
index 86bb13f57eb..d790cb54b61 100644
--- a/test/CodeGen/X86/vector-extend-inreg.ll
+++ b/test/CodeGen/X86/vector-extend-inreg.ll
@@ -13,6 +13,7 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) noun
 ; X32-SSE-NEXT:    subl $384, %esp # imm = 0x180
 ; X32-SSE-NEXT:    movl 88(%ebp), %ecx
 ; X32-SSE-NEXT:    movdqa 72(%ebp), %xmm0
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
 ; X32-SSE-NEXT:    xorps %xmm1, %xmm1
 ; X32-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X32-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
@@ -21,7 +22,6 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) noun
 ; X32-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X32-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X32-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
 ; X32-SSE-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp)
 ; X32-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X32-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
diff --git a/test/CodeGen/X86/vector-idiv-v2i32.ll b/test/CodeGen/X86/vector-idiv-v2i32.ll
index 49e29ac17a5..00126d67532 100644
--- a/test/CodeGen/X86/vector-idiv-v2i32.ll
+++ b/test/CodeGen/X86/vector-idiv-v2i32.ll
@@ -693,20 +693,20 @@ define void @test_sdiv_pow2_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind {
 ; X86-NEXT:    movdqa %xmm0, %xmm1
 ; X86-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X86-NEXT:    movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
-; X86-NEXT:    movdqa {{.*#+}} xmm3 = [31,0,31,0]
-; X86-NEXT:    movdqa %xmm2, %xmm4
-; X86-NEXT:    psrlq %xmm3, %xmm4
+; X86-NEXT:    movdqa {{.*#+}} xmm2 = [31,0,31,0]
+; X86-NEXT:    movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
+; X86-NEXT:    movdqa %xmm3, %xmm4
+; X86-NEXT:    psrlq %xmm2, %xmm4
 ; X86-NEXT:    movl $31, %ecx
 ; X86-NEXT:    movd %ecx, %xmm5
-; X86-NEXT:    psrlq %xmm5, %xmm2
-; X86-NEXT:    movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1]
+; X86-NEXT:    psrlq %xmm5, %xmm3
+; X86-NEXT:    movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
 ; X86-NEXT:    movdqa %xmm1, %xmm4
-; X86-NEXT:    psrlq %xmm3, %xmm4
+; X86-NEXT:    psrlq %xmm2, %xmm4
 ; X86-NEXT:    psrlq %xmm5, %xmm1
 ; X86-NEXT:    movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1]
-; X86-NEXT:    xorpd %xmm2, %xmm1
-; X86-NEXT:    psubq %xmm2, %xmm1
+; X86-NEXT:    xorpd %xmm3, %xmm1
+; X86-NEXT:    psubq %xmm3, %xmm1
 ; X86-NEXT:    pand {{\.LCPI.*}}, %xmm1
 ; X86-NEXT:    psrlq $29, %xmm1
 ; X86-NEXT:    paddq %xmm0, %xmm1
diff --git a/test/CodeGen/X86/widened-broadcast.ll b/test/CodeGen/X86/widened-broadcast.ll
index ce99d22dbbd..167128ae002 100644
--- a/test/CodeGen/X86/widened-broadcast.ll
+++ b/test/CodeGen/X86/widened-broadcast.ll
@@ -121,10 +121,21 @@ define <8 x i32> @load_splat_8i32_4i32_01010101(<4 x i32>* %ptr) nounwind uwtabl
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: load_splat_8i32_4i32_01010101:
-; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vbroadcastsd (%rdi), %ymm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: load_splat_8i32_4i32_01010101:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_splat_8i32_4i32_01010101:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: load_splat_8i32_4i32_01010101:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vbroadcastsd (%rdi), %ymm0
+; AVX512-NEXT:    retq
 entry:
   %ld = load <4 x i32>, <4 x i32>* %ptr
   %ret = shufflevector <4 x i32> %ld, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -138,21 +149,10 @@ define <8 x i32> @load_splat_8i32_8i32_01010101(<8 x i32>* %ptr) nounwind uwtabl
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: load_splat_8i32_8i32_01010101:
-; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: load_splat_8i32_8i32_01010101:
-; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: load_splat_8i32_8i32_01010101:
-; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vbroadcastsd (%rdi), %ymm0
-; AVX512-NEXT:    retq
+; AVX-LABEL: load_splat_8i32_8i32_01010101:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vbroadcastsd (%rdi), %ymm0
+; AVX-NEXT:    retq
 entry:
   %ld = load <8 x i32>, <8 x i32>* %ptr
   %ret = shufflevector <8 x i32> %ld, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -246,10 +246,21 @@ define <16 x i16> @load_splat_16i16_8i16_0123012301230123(<8 x i16>* %ptr) nounw
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: load_splat_16i16_8i16_0123012301230123:
-; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vbroadcastsd (%rdi), %ymm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: load_splat_16i16_8i16_0123012301230123:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_splat_16i16_8i16_0123012301230123:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: load_splat_16i16_8i16_0123012301230123:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vbroadcastsd (%rdi), %ymm0
+; AVX512-NEXT:    retq
 entry:
   %ld = load <8 x i16>, <8 x i16>* %ptr
   %ret = shufflevector <8 x i16> %ld, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3,i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
@@ -263,21 +274,10 @@ define <16 x i16> @load_splat_16i16_16i16_0101010101010101(<16 x i16>* %ptr) nou
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: load_splat_16i16_16i16_0101010101010101:
-; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,0,0,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: load_splat_16i16_16i16_0101010101010101:
-; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vbroadcastss (%rdi), %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: load_splat_16i16_16i16_0101010101010101:
-; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vbroadcastss (%rdi), %ymm0
-; AVX512-NEXT:    retq
+; AVX-LABEL: load_splat_16i16_16i16_0101010101010101:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vbroadcastss (%rdi), %ymm0
+; AVX-NEXT:    retq
 entry:
   %ld = load <16 x i16>, <16 x i16>* %ptr
   %ret = shufflevector <16 x i16> %ld, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -446,10 +446,21 @@ define <32 x i8> @load_splat_32i8_16i8_01234567012345670123456701234567(<16 x i8
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: load_splat_32i8_16i8_01234567012345670123456701234567:
-; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vbroadcastsd (%rdi), %ymm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: load_splat_32i8_16i8_01234567012345670123456701234567:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_splat_32i8_16i8_01234567012345670123456701234567:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: load_splat_32i8_16i8_01234567012345670123456701234567:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vbroadcastsd (%rdi), %ymm0
+; AVX512-NEXT:    retq
 entry:
   %ld = load <16 x i8>, <16 x i8>* %ptr
   %ret = shufflevector <16 x i8> %ld, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-- 
GitLab


From c7a8ddb84930e6030d1498e87bae5ff928d8c4fb Mon Sep 17 00:00:00 2001
From: Dorit Nuzman <dorit.nuzman@intel.com>
Date: Mon, 22 Oct 2018 06:17:09 +0000
Subject: [PATCH 0368/1116] [IAI,LV] Avoid creating a scalar epilogue due to
 gaps in interleave-groups when optimizing for size

LV is careful to respect -Os and not to create a scalar epilog in all cases
(runtime tests, trip-counts that require a remainder loop) except for peeling
due to gaps in interleave-groups. This patch fixes that; -Os will now have us
invalidate such interleave-groups and vectorize without an epilog.

The patch also removes a related FIXME comment that is now obsolete, and was
also inaccurate:
"FIXME: return None if loop requiresScalarEpilog(<MaxVF>), or look for a smaller
MaxVF that does not require a scalar epilog."
(requiresScalarEpilog() has nothing to do with VF).

Reviewers: Ayal, hsaito, dcaballe, fhahn

Reviewed By: Ayal

Differential Revision: https://reviews.llvm.org/D53420


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344883 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/VectorUtils.h           |  22 ++++
 lib/Analysis/VectorUtils.cpp                  |  24 ++++
 lib/Transforms/Vectorize/LoopVectorize.cpp    |  10 +-
 .../x86-interleaved-accesses-masked-group.ll  | 114 +++++++++++++++++-
 4 files changed, 166 insertions(+), 4 deletions(-)

diff --git a/include/llvm/Analysis/VectorUtils.h b/include/llvm/Analysis/VectorUtils.h
index 937a52fb968..2f562ec3d30 100644
--- a/include/llvm/Analysis/VectorUtils.h
+++ b/include/llvm/Analysis/VectorUtils.h
@@ -308,6 +308,23 @@ public:
     propagateMetadata(NewInst, VL);
   }
 
+  /// Returns true if this Group requires a scalar iteration to handle gaps.
+  bool requiresScalarEpilogue() const {
+    // If Group has no gaps, or has gaps but the last member exists, then a
+    // scalar epilog is not needed for this group.
+    if (getNumMembers() == getFactor() || getMember(getFactor() - 1))
+      return false;
+
+    // We have a group with gaps. It therefore cannot be a group of stores,
+    // and it can't be a reversed access, because such groups get invalidated.
+    assert(!getMember(0)->mayWriteToMemory() &&
+           "Group should have been invalidated");
+    assert(!isReverse() && "Group should have been invalidated");
+
+    // This is a group of loads, with gaps, and without a last-member
+    return true;
+  }
+
 private:
   unsigned Factor; // Interleave Factor.
   bool Reverse;
@@ -388,6 +405,11 @@ public:
   /// out-of-bounds requires a scalar epilogue iteration for correctness.
   bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }
 
+  /// Invalidate groups that require a scalar epilogue (due to gaps). This can
+  /// happen when we optimize for size and don't allow creating a scalar
+  /// epilogue.
+  void invalidateGroupsRequiringScalarEpilogue();
+
 private:
   /// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
   /// Simplifies SCEV expressions in the context of existing SCEV assumptions.
diff --git a/lib/Analysis/VectorUtils.cpp b/lib/Analysis/VectorUtils.cpp
index 5fd6fe0ef31..8b6702c8544 100644
--- a/lib/Analysis/VectorUtils.cpp
+++ b/lib/Analysis/VectorUtils.cpp
@@ -919,3 +919,27 @@ void InterleavedAccessInfo::analyzeInterleaving(
     }
   }
 }
+
+void InterleavedAccessInfo::invalidateGroupsRequiringScalarEpilogue() {
+  // If no group had triggered the requirement to create an epilogue loop,
+  // there is nothing to do.
+  if (!requiresScalarEpilogue())
+    return;
+
+  // Avoid releasing a Group twice.
+  SmallPtrSet<InterleaveGroup *, 4> DelSet;
+  for (auto &I : InterleaveGroupMap) {
+    InterleaveGroup *Group = I.second;
+    if (Group->requiresScalarEpilogue())
+      DelSet.insert(Group);
+  }
+  for (auto *Ptr : DelSet) {
+    LLVM_DEBUG(
+        dbgs() 
+        << "LV: Invalidate candidate interleaved group due to gaps that "
+           "require a scalar epilogue.\n");
+    releaseGroup(Ptr);
+  }
+
+  RequiresScalarEpilogue = false;
+}
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index a395183398d..daaa1e27c8e 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4599,6 +4599,14 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
     return None;
   }
 
+  // Record that scalar epilogue is not allowed.
+  LLVM_DEBUG(dbgs() << "LV: Not inserting scalar epilogue for access with gaps "
+                       "due to -Os/-Oz.\n");
+
+  // We don't create an epilogue when optimizing for size.
+  // Invalidate interleave groups that require an epilogue.
+  InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
+
   unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC);
 
   if (TC > 0 && TC % MaxVF == 0) {
@@ -4610,8 +4618,6 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
   // found modulo the vectorization factor is not zero, try to fold the tail
   // by masking.
   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
-  // FIXME: return None if loop requiresScalarEpilog(<MaxVF>), or look for a
-  //        smaller MaxVF that does not require a scalar epilog.
   if (Legal->canFoldTailByMasking()) {
     FoldTailByMasking = true;
     return MaxVF;
diff --git a/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
index b1163d0a199..61a2e2ca003 100644
--- a/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
+++ b/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
@@ -1,5 +1,5 @@
-; RUN: opt -mcpu=skx -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=DISABLED_MASKED_STRIDED 
-; RUN: opt -mcpu=skx -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses  -enable-masked-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=ENABLED_MASKED_STRIDED 
+; RUN: opt -mcpu=skx -S -loop-vectorize -instcombine -simplifycfg -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=DISABLED_MASKED_STRIDED 
+; RUN: opt -mcpu=skx -S -loop-vectorize -instcombine -simplifycfg -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses  -enable-masked-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=ENABLED_MASKED_STRIDED 
 
 target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
 target triple = "i386-unknown-linux-gnu"
@@ -9,9 +9,13 @@ target triple = "i386-unknown-linux-gnu"
 ; interleaved-group but rather as a scalarized accesses.
 ; (For SKX, Gather is not supported by the compiler for chars, therefore
 ;  the only remaining alternative is to scalarize).
+; In this case a scalar epilogue is not needed.
+;
 ; When  masked-interleave-group is enabled we expect to find the proper mask
 ; shuffling code, feeding the wide masked load for an interleave-group (with
 ; a single member).
+; Since the last (second) member of the load-group is a gap, peeling is used,
+; so we also expect to find a scalar epilogue loop.
 ;
 ; void masked_strided1(const unsigned char* restrict p,
 ;                      unsigned char* restrict q,
@@ -38,6 +42,8 @@ target triple = "i386-unknown-linux-gnu"
 ;DISABLED_MASKED_STRIDED-NOT:   %interleaved.mask =
 ;DISABLED_MASKED_STRIDED-NOT:   call void @llvm.masked.load.
 ;DISABLED_MASKED_STRIDED-NOT:   %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;DISABLED_MASKED_STRIDED-NOT: for.body:
+;DISABLED_MASKED_STRIDED:     for.end:
 
 ;ENABLED_MASKED_STRIDED-LABEL: @masked_strided1(
 ;ENABLED_MASKED_STRIDED: vector.body:
@@ -47,6 +53,7 @@ target triple = "i386-unknown-linux-gnu"
 ;ENABLED_MASKED_STRIDED:       %interleaved.mask = shufflevector <8 x i1> %[[VMASK]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
 ;ENABLED_MASKED_STRIDED-NEXT:  %[[WIDEMASKEDLOAD:.+]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask, <16 x i8> undef)
 ;ENABLED_MASKED_STRIDED-NEXT:  %[[STRIDEDVEC:.+]] = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;ENABLED_MASKED_STRIDED: for.body:
 
 define dso_local void @masked_strided1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr {
 entry:
@@ -75,6 +82,109 @@ for.end:
   ret void
 }
 
+; Exactly the same scenario except we are now optimizing for size, therefore
+; we check that no scalar epilogue is created. Since we can't create an epilog
+; the interleave-group is invalidated because is has gaps, so we end up
+; scalarizing.
+; (Before the fix that this test checks, we used to create an epilogue despite
+; optsize, and vectorized the access as an interleaved-group. This is now fixed,
+; and we make sure that a scalar epilogue does not exist).
+
+;ENABLED_MASKED_STRIDED-LABEL: @masked_strided1_optsize(
+;ENABLED_MASKED_STRIDED: vector.body:
+;ENABLED_MASKED_STRIDED-NEXT:  %index = phi i32 
+;ENABLED_MASKED_STRIDED-NEXT:  %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+;ENABLED_MASKED_STRIDED-NOT:   %interleaved.mask = 
+;ENABLED_MASKED_STRIDED-NOT:   call <16 x i8> @llvm.masked.load.v16i8.p0v16i8
+;ENABLED_MASKED_STRIDED:       %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
+;ENABLED_MASKED_STRIDED-NEXT:  %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+;ENABLED_MASKED_STRIDED-NEXT:  %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0
+;ENABLED_MASKED_STRIDED-NEXT:  br i1 %[[M]], label %pred.load.if, label %pred.load.continue
+;ENABLED_MASKED_STRIDED-NOT:   %interleaved.mask = 
+;ENABLED_MASKED_STRIDED-NOT:   call <16 x i8> @llvm.masked.load.v16i8.p0v16i8
+;ENABLED_MASKED_STRIDED-NOT: for.body:
+;ENABLED_MASKED_STRIDED:     for.end:
+
+define dso_local void @masked_strided1_optsize(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr optsize {
+entry:
+  %conv = zext i8 %guard to i32
+  br label %for.body
+
+for.body:
+  %ix.09 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp1 = icmp ugt i32 %ix.09, %conv
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %mul = shl nuw nsw i32 %ix.09, 1
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.09
+  store i8 %0, i8* %arrayidx3, align 1
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %ix.09, 1
+  %exitcond = icmp eq i32 %inc, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; Same, but the load/store are not predicated. The interleave-group is
+; invalidated here as well because we have gaps and we can't create an epilog.
+; The access is thus scalarized.
+; (Before the fix that this test checks, we used to create an epilogue despite
+; optsize, and vectorized the access as an interleaved-group. This is now fixed,
+; and we make sure that a scalar epilogue does not exist).
+; Since enable-masked-interleaved-accesses currently only affects predicated
+; accesses, the behavior is the same with this switch set/unset.
+
+
+; void unconditional_strided1_optsize(const unsigned char* restrict p,
+;                                unsigned char* restrict q,
+;                                unsigned char guard) {
+;   for(ix=0; ix < 1024; ++ix) {
+;         char t = p[2*ix];
+;         q[ix] = t;
+;   }
+; }
+
+;DISABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize(
+;DISABLED_MASKED_STRIDED: vector.body:
+;DISABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.v16i8.p0v16i8
+;DISABLED_MASKED_STRIDED:     %{{.*}} = extractelement <8 x i32> %{{.*}}, i32 0       
+;DISABLED_MASKED_STRIDED-NOT: for.body:
+;DISABLED_MASKED_STRIDED:     for.end:
+
+;ENABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize(
+;ENABLED_MASKED_STRIDED: vector.body:
+;ENABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.v16i8.p0v16i8
+;ENABLED_MASKED_STRIDED:     %{{.*}} = extractelement <8 x i32> %{{.*}}, i32 0       
+;ENABLED_MASKED_STRIDED-NOT: for.body:
+;ENABLED_MASKED_STRIDED:     for.end:
+
+define dso_local void @unconditional_strided1_optsize(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr optsize {
+entry:
+  br label %for.body
+
+for.body:
+  %ix.06 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %mul = shl nuw nsw i32 %ix.06, 1
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, i8* %q, i32 %ix.06
+  store i8 %0, i8* %arrayidx1, align 1
+  %inc = add nuw nsw i32 %ix.06, 1
+  %exitcond = icmp eq i32 %inc, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+
 ; Check also a scenario with full interleave-groups (no gaps) as well as both
 ; load and store groups. We check that when masked-interleave-group is disabled
 ; the predicated loads (and stores) are not vectorized as an
-- 
GitLab


From c2ec04c61ca82a2557128831f9195d42a296f08e Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Mon, 22 Oct 2018 06:30:22 +0000
Subject: [PATCH 0369/1116] [X86] Add patterns for vector and/or/xor/andn with
 other types than vXi64.

This makes fast isel treat all legal vector types the same way. Previously only vXi64 was in the fast-isel tables.

This unfortunately prevents matching of andn by fast-isel for these types since the requires SelectionDAG. But we already had this issue for vXi64. So at least we're consistent now.

Interestinly it looks like fast-isel can't handle instructions with constant vector arguments so the the not part of the andn patterns is selected with SelectionDAG. This explains why VPTERNLOG shows up in some of the tests.

This is a subset of D53268. As I make progress on that, I will try to reduce the number of lines in the tablegen files.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344884 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86InstrAVX512.td              |  88 +++++++++++++
 lib/Target/X86/X86InstrSSE.td                 | 117 ++++++++++++++++++
 test/CodeGen/X86/avx-intrinsics-fast-isel.ll  |   5 +-
 test/CodeGen/X86/sse-intrinsics-fast-isel.ll  |   7 +-
 test/CodeGen/X86/sse2-intrinsics-fast-isel.ll |  11 +-
 5 files changed, 222 insertions(+), 6 deletions(-)

diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 4c4c7e75ffc..b2d0ce2bcd3 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -5184,6 +5184,94 @@ defm VPXOR : avx512_logic_rm_vl_dq<0xEF, 0xEF, "vpxor", xor,
 defm VPANDN : avx512_logic_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp,
                                     SchedWriteVecLogic>;
 
+let Predicates = [HasVLX] in {
+  def : Pat<(v16i8 (and VR128X:$src1, VR128X:$src2)),
+            (VPANDQZ128rr VR128X:$src1, VR128X:$src2)>;
+  def : Pat<(v8i16 (and VR128X:$src1, VR128X:$src2)),
+            (VPANDQZ128rr VR128X:$src1, VR128X:$src2)>;
+  def : Pat<(v4i32 (and VR128X:$src1, VR128X:$src2)),
+            (VPANDQZ128rr VR128X:$src1, VR128X:$src2)>;
+
+  def : Pat<(v16i8 (or VR128X:$src1, VR128X:$src2)),
+            (VPORQZ128rr VR128X:$src1, VR128X:$src2)>;
+  def : Pat<(v8i16 (or VR128X:$src1, VR128X:$src2)),
+            (VPORQZ128rr VR128X:$src1, VR128X:$src2)>;
+  def : Pat<(v4i32 (or VR128X:$src1, VR128X:$src2)),
+            (VPORQZ128rr VR128X:$src1, VR128X:$src2)>;
+
+  def : Pat<(v16i8 (xor VR128X:$src1, VR128X:$src2)),
+            (VPXORQZ128rr VR128X:$src1, VR128X:$src2)>;
+  def : Pat<(v8i16 (xor VR128X:$src1, VR128X:$src2)),
+            (VPXORQZ128rr VR128X:$src1, VR128X:$src2)>;
+  def : Pat<(v4i32 (xor VR128X:$src1, VR128X:$src2)),
+            (VPXORQZ128rr VR128X:$src1, VR128X:$src2)>;
+
+  def : Pat<(v16i8 (X86andnp VR128X:$src1, VR128X:$src2)),
+            (VPANDNQZ128rr VR128X:$src1, VR128X:$src2)>;
+  def : Pat<(v8i16 (X86andnp VR128X:$src1, VR128X:$src2)),
+            (VPANDNQZ128rr VR128X:$src1, VR128X:$src2)>;
+  def : Pat<(v4i32 (X86andnp VR128X:$src1, VR128X:$src2)),
+            (VPANDNQZ128rr VR128X:$src1, VR128X:$src2)>;
+
+  def : Pat<(v32i8 (and VR256X:$src1, VR256X:$src2)),
+            (VPANDQZ256rr VR256X:$src1, VR256X:$src2)>;
+  def : Pat<(v16i16 (and VR256X:$src1, VR256X:$src2)),
+            (VPANDQZ256rr VR256X:$src1, VR256X:$src2)>;
+  def : Pat<(v8i32 (and VR256X:$src1, VR256X:$src2)),
+            (VPANDQZ256rr VR256X:$src1, VR256X:$src2)>;
+
+  def : Pat<(v32i8 (or VR256X:$src1, VR256X:$src2)),
+            (VPORQZ256rr VR256X:$src1, VR256X:$src2)>;
+  def : Pat<(v16i16 (or VR256X:$src1, VR256X:$src2)),
+            (VPORQZ256rr VR256X:$src1, VR256X:$src2)>;
+  def : Pat<(v8i32 (or VR256X:$src1, VR256X:$src2)),
+            (VPORQZ256rr VR256X:$src1, VR256X:$src2)>;
+
+  def : Pat<(v32i8 (xor VR256X:$src1, VR256X:$src2)),
+            (VPXORQZ256rr VR256X:$src1, VR256X:$src2)>;
+  def : Pat<(v16i16 (xor VR256X:$src1, VR256X:$src2)),
+            (VPXORQZ256rr VR256X:$src1, VR256X:$src2)>;
+  def : Pat<(v8i32 (xor VR256X:$src1, VR256X:$src2)),
+            (VPXORQZ256rr VR256X:$src1, VR256X:$src2)>;
+
+  def : Pat<(v32i8 (X86andnp VR256X:$src1, VR256X:$src2)),
+            (VPANDNQZ256rr VR256X:$src1, VR256X:$src2)>;
+  def : Pat<(v16i16 (X86andnp VR256X:$src1, VR256X:$src2)),
+            (VPANDNQZ256rr VR256X:$src1, VR256X:$src2)>;
+  def : Pat<(v8i32 (X86andnp VR256X:$src1, VR256X:$src2)),
+            (VPANDNQZ256rr VR256X:$src1, VR256X:$src2)>;
+}
+
+let Predicates = [HasAVX512] in {
+  def : Pat<(v64i8 (and VR512:$src1, VR512:$src2)),
+            (VPANDQZrr VR512:$src1, VR512:$src2)>;
+  def : Pat<(v32i16 (and VR512:$src1, VR512:$src2)),
+            (VPANDQZrr VR512:$src1, VR512:$src2)>;
+  def : Pat<(v16i32 (and VR512:$src1, VR512:$src2)),
+            (VPANDQZrr VR512:$src1, VR512:$src2)>;
+
+  def : Pat<(v64i8 (or VR512:$src1, VR512:$src2)),
+            (VPORQZrr VR512:$src1, VR512:$src2)>;
+  def : Pat<(v32i16 (or VR512:$src1, VR512:$src2)),
+            (VPORQZrr VR512:$src1, VR512:$src2)>;
+  def : Pat<(v16i32 (or VR512:$src1, VR512:$src2)),
+            (VPORQZrr VR512:$src1, VR512:$src2)>;
+
+  def : Pat<(v64i8 (xor VR512:$src1, VR512:$src2)),
+            (VPXORQZrr VR512:$src1, VR512:$src2)>;
+  def : Pat<(v32i16 (xor VR512:$src1, VR512:$src2)),
+            (VPXORQZrr VR512:$src1, VR512:$src2)>;
+  def : Pat<(v16i32 (xor VR512:$src1, VR512:$src2)),
+            (VPXORQZrr VR512:$src1, VR512:$src2)>;
+
+  def : Pat<(v64i8 (X86andnp VR512:$src1, VR512:$src2)),
+            (VPANDNQZrr VR512:$src1, VR512:$src2)>;
+  def : Pat<(v32i16 (X86andnp VR512:$src1, VR512:$src2)),
+            (VPANDNQZrr VR512:$src1, VR512:$src2)>;
+  def : Pat<(v16i32 (X86andnp VR512:$src1, VR512:$src2)),
+            (VPANDNQZrr VR512:$src1, VR512:$src2)>;
+}
+
 //===----------------------------------------------------------------------===//
 // AVX-512  FP arithmetic
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 6c90a8898f6..8f97ce37068 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -2389,15 +2389,72 @@ defm XOR  : sse12_fp_packed_logical<0x57, "xor", xor, SchedWriteFLogic>;
 let isCommutable = 0 in
   defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp, SchedWriteFLogic>;
 
+let Predicates = [HasAVX2, NoVLX] in {
+  def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
+            (VPANDYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
+            (VPANDYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
+            (VPANDYrr VR256:$src1, VR256:$src2)>;
+
+  def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
+            (VPORYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
+            (VPORYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
+            (VPORYrr VR256:$src1, VR256:$src2)>;
+
+  def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
+            (VPXORYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
+            (VPXORYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
+            (VPXORYrr VR256:$src1, VR256:$src2)>;
+
+  def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
+            (VPANDNYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
+            (VPANDNYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
+            (VPANDNYrr VR256:$src1, VR256:$src2)>;
+}
+
 // If only AVX1 is supported, we need to handle integer operations with
 // floating point instructions since the integer versions aren't available.
 let Predicates = [HasAVX1Only] in {
+  def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
+            (VANDPSYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
+            (VANDPSYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
+            (VANDPSYrr VR256:$src1, VR256:$src2)>;
   def : Pat<(v4i64 (and VR256:$src1, VR256:$src2)),
             (VANDPSYrr VR256:$src1, VR256:$src2)>;
+
+  def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
+            (VORPSYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
+            (VORPSYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
+            (VORPSYrr VR256:$src1, VR256:$src2)>;
   def : Pat<(v4i64 (or VR256:$src1, VR256:$src2)),
             (VORPSYrr VR256:$src1, VR256:$src2)>;
+
+  def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
+            (VXORPSYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
+            (VXORPSYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
+            (VXORPSYrr VR256:$src1, VR256:$src2)>;
   def : Pat<(v4i64 (xor VR256:$src1, VR256:$src2)),
             (VXORPSYrr VR256:$src1, VR256:$src2)>;
+
+  def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
+            (VANDNPSYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
+            (VANDNPSYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
+            (VANDNPSYrr VR256:$src1, VR256:$src2)>;
   def : Pat<(v4i64 (X86andnp VR256:$src1, VR256:$src2)),
             (VANDNPSYrr VR256:$src1, VR256:$src2)>;
 
@@ -2504,6 +2561,66 @@ let Predicates = [UseSSE2] in {
              FR64)>;
 }
 
+let Predicates = [HasAVX, NoVLX] in {
+  def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
+            (VPANDrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
+            (VPANDrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
+            (VPANDrr VR128:$src1, VR128:$src2)>;
+
+  def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
+            (VPORrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
+            (VPORrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
+            (VPORrr VR128:$src1, VR128:$src2)>;
+
+  def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
+            (VPXORrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
+            (VPXORrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
+            (VPXORrr VR128:$src1, VR128:$src2)>;
+
+  def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
+            (VPANDNrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
+            (VPANDNrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
+            (VPANDNrr VR128:$src1, VR128:$src2)>;
+}
+
+let Predicates = [UseSSE2] in {
+  def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
+            (PANDrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
+            (PANDrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
+            (PANDrr VR128:$src1, VR128:$src2)>;
+
+  def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
+            (PORrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
+            (PORrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
+            (PORrr VR128:$src1, VR128:$src2)>;
+
+  def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
+            (PXORrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
+            (PXORrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
+            (PXORrr VR128:$src1, VR128:$src2)>;
+
+  def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
+            (PANDNrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
+            (PANDNrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
+            (PANDNrr VR128:$src1, VR128:$src2)>;
+}
+
 // Patterns for packed operations when we don't have integer type available.
 def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)),
           (ANDPSrr VR128:$src1, VR128:$src2)>;
diff --git a/test/CodeGen/X86/avx-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
index 6e58ffe0962..84b3b007310 100644
--- a/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
@@ -85,7 +85,10 @@ define <4 x double> @test_mm256_andnot_pd(<4 x double> %a0, <4 x double> %a1) no
 define <8 x float> @test_mm256_andnot_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
 ; CHECK-LABEL: test_mm256_andnot_ps:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vandnps %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmptrueps %ymm2, %ymm2, %ymm2
+; CHECK-NEXT:    vxorps %ymm2, %ymm0, %ymm0
+; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    ret{{[l|q]}}
   %1 = bitcast <8 x float> %a0 to <8 x i32>
   %2 = bitcast <8 x float> %a1 to <8 x i32>
diff --git a/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
index 1ccd586c453..76623a2be22 100644
--- a/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
@@ -79,12 +79,15 @@ define <4 x float> @test_mm_andnot_ps(<4 x float> %a0, <4 x float> %a1) nounwind
 ;
 ; AVX1-LABEL: test_mm_andnot_ps:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vandnps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x55,0xc1]
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0x76,0xd2]
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xef,0xc2]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xdb,0xc1]
 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
 ;
 ; AVX512-LABEL: test_mm_andnot_ps:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vandnps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x55,0xc1]
+; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0 # encoding: [0x62,0xf3,0xfd,0x08,0x25,0xc0,0x0f]
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0xc1]
 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %arg0 = bitcast <4 x float> %a0 to <4 x i32>
   %arg1 = bitcast <4 x float> %a1 to <4 x i32>
diff --git a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
index 83d3a0e0b95..23d0d66acfb 100644
--- a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -272,17 +272,22 @@ define <2 x i64> @test_mm_and_si128(<2 x i64> %a0, <2 x i64> %a1) nounwind {
 define <2 x double> @test_mm_andnot_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
 ; SSE-LABEL: test_mm_andnot_pd:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    andnps %xmm1, %xmm0 # encoding: [0x0f,0x55,0xc1]
+; SSE-NEXT:    pcmpeqd %xmm2, %xmm2 # encoding: [0x66,0x0f,0x76,0xd2]
+; SSE-NEXT:    pxor %xmm2, %xmm0 # encoding: [0x66,0x0f,0xef,0xc2]
+; SSE-NEXT:    pand %xmm1, %xmm0 # encoding: [0x66,0x0f,0xdb,0xc1]
 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
 ;
 ; AVX1-LABEL: test_mm_andnot_pd:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vandnps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x55,0xc1]
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0x76,0xd2]
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xef,0xc2]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xdb,0xc1]
 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
 ;
 ; AVX512-LABEL: test_mm_andnot_pd:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vandnps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x55,0xc1]
+; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0 # encoding: [0x62,0xf3,0xfd,0x08,0x25,0xc0,0x0f]
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0xc1]
 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %arg0 = bitcast <2 x double> %a0 to <4 x i32>
   %arg1 = bitcast <2 x double> %a1 to <4 x i32>
-- 
GitLab


From b5c7e2f9a4dbb34e3667c4bb4972735eadd3247a Mon Sep 17 00:00:00 2001
From: Aleksandr Urakov <aleksandr.urakov@jetbrains.com>
Date: Mon, 22 Oct 2018 07:18:08 +0000
Subject: [PATCH 0370/1116] [PDB] Extend IPDBSession's interface to retrieve
 frame data

Summary:
This patch just extends the `IPDBSession` interface to allow retrieving
of frame data through it, and adds an implementation over DIA. It is needed
for an implementation (for now with DIA) of the conversion from FPO programs
to DWARF expressions mentioned in D53086.

Reviewers: zturner, asmith, rnk

Reviewed By: asmith

Subscribers: mgorny, aprantl, JDevlieghere, llvm-commits

Differential Revision: https://reviews.llvm.org/D53324

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344886 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../llvm/DebugInfo/PDB/DIA/DIAEnumFrameData.h | 40 ++++++++++++++
 include/llvm/DebugInfo/PDB/DIA/DIAFrameData.h | 41 ++++++++++++++
 include/llvm/DebugInfo/PDB/DIA/DIASession.h   |  1 +
 include/llvm/DebugInfo/PDB/IPDBFrameData.h    | 36 +++++++++++++
 include/llvm/DebugInfo/PDB/IPDBSession.h      |  3 ++
 .../llvm/DebugInfo/PDB/Native/NativeSession.h |  2 +
 include/llvm/DebugInfo/PDB/PDBTypes.h         |  2 +
 lib/DebugInfo/PDB/CMakeLists.txt              |  2 +
 lib/DebugInfo/PDB/DIA/DIAEnumFrameData.cpp    | 43 +++++++++++++++
 lib/DebugInfo/PDB/DIA/DIAFrameData.cpp        | 54 +++++++++++++++++++
 lib/DebugInfo/PDB/DIA/DIASession.cpp          | 11 ++++
 lib/DebugInfo/PDB/Native/NativeSession.cpp    |  5 ++
 lib/DebugInfo/PDB/PDBInterfaceAnchors.cpp     |  3 ++
 unittests/DebugInfo/PDB/PDBApiTest.cpp        |  4 ++
 14 files changed, 247 insertions(+)
 create mode 100644 include/llvm/DebugInfo/PDB/DIA/DIAEnumFrameData.h
 create mode 100644 include/llvm/DebugInfo/PDB/DIA/DIAFrameData.h
 create mode 100644 include/llvm/DebugInfo/PDB/IPDBFrameData.h
 create mode 100644 lib/DebugInfo/PDB/DIA/DIAEnumFrameData.cpp
 create mode 100644 lib/DebugInfo/PDB/DIA/DIAFrameData.cpp

diff --git a/include/llvm/DebugInfo/PDB/DIA/DIAEnumFrameData.h b/include/llvm/DebugInfo/PDB/DIA/DIAEnumFrameData.h
new file mode 100644
index 00000000000..e17ba2ce59b
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/DIA/DIAEnumFrameData.h
@@ -0,0 +1,40 @@
+//==- DIAEnumFrameData.h --------------------------------------- -*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_DIA_DIAENUMFRAMEDATA_H
+#define LLVM_DEBUGINFO_PDB_DIA_DIAENUMFRAMEDATA_H
+
+#include "DIASupport.h"
+#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
+#include "llvm/DebugInfo/PDB/IPDBFrameData.h"
+
+namespace llvm {
+namespace pdb {
+
+class DIASession;
+
+class DIAEnumFrameData : public IPDBEnumChildren<IPDBFrameData> {
+public:
+  explicit DIAEnumFrameData(const DIASession &PDBSession,
+                            CComPtr<IDiaEnumFrameData> DiaEnumerator);
+
+  uint32_t getChildCount() const override;
+  ChildTypePtr getChildAtIndex(uint32_t Index) const override;
+  ChildTypePtr getNext() override;
+  void reset() override;
+
+private:
+  const DIASession &Session;
+  CComPtr<IDiaEnumFrameData> Enumerator;
+};
+
+} // namespace pdb
+} // namespace llvm
+
+#endif
diff --git a/include/llvm/DebugInfo/PDB/DIA/DIAFrameData.h b/include/llvm/DebugInfo/PDB/DIA/DIAFrameData.h
new file mode 100644
index 00000000000..7564c3b7a5a
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/DIA/DIAFrameData.h
@@ -0,0 +1,41 @@
+//===- DIAFrameData.h - DIA Impl. of IPDBFrameData ---------------- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_DIA_DIAFRAMEDATA_H
+#define LLVM_DEBUGINFO_PDB_DIA_DIAFRAMEDATA_H
+
+#include "DIASupport.h"
+#include "llvm/DebugInfo/PDB/IPDBFrameData.h"
+
+namespace llvm {
+namespace pdb {
+
+class DIASession;
+
+class DIAFrameData : public IPDBFrameData {
+public:
+  explicit DIAFrameData(const DIASession &PDBSession,
+                        CComPtr<IDiaFrameData> DiaFrameData);
+
+  uint32_t getAddressOffset() const override;
+  uint32_t getAddressSection() const override;
+  uint32_t getLengthBlock() const override;
+  std::string getProgram() const override;
+  uint32_t getRelativeVirtualAddress() const override;
+  uint64_t getVirtualAddress() const override;
+
+private:
+  const DIASession &Session;
+  CComPtr<IDiaFrameData> FrameData;
+};
+
+} // namespace pdb
+} // namespace llvm
+
+#endif
diff --git a/include/llvm/DebugInfo/PDB/DIA/DIASession.h b/include/llvm/DebugInfo/PDB/DIA/DIASession.h
index e355605c296..592e061a8d8 100644
--- a/include/llvm/DebugInfo/PDB/DIA/DIASession.h
+++ b/include/llvm/DebugInfo/PDB/DIA/DIASession.h
@@ -85,6 +85,7 @@ public:
 
   std::unique_ptr<IPDBEnumSectionContribs> getSectionContribs() const override;
 
+  std::unique_ptr<IPDBEnumFrameData> getFrameData() const override;
 private:
   CComPtr<IDiaSession> Session;
 };
diff --git a/include/llvm/DebugInfo/PDB/IPDBFrameData.h b/include/llvm/DebugInfo/PDB/IPDBFrameData.h
new file mode 100644
index 00000000000..74679215b88
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/IPDBFrameData.h
@@ -0,0 +1,36 @@
+//===- IPDBFrameData.h - base interface for frame data ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_IPDBFRAMEDATA_H
+#define LLVM_DEBUGINFO_PDB_IPDBFRAMEDATA_H
+
+#include <cstdint>
+#include <string>
+
+namespace llvm {
+namespace pdb {
+
+/// IPDBFrameData defines an interface used to represent a frame data of some
+/// code block.
+class IPDBFrameData {
+public:
+  virtual ~IPDBFrameData();
+
+  virtual uint32_t getAddressOffset() const = 0;
+  virtual uint32_t getAddressSection() const = 0;
+  virtual uint32_t getLengthBlock() const = 0;
+  virtual std::string getProgram() const = 0;
+  virtual uint32_t getRelativeVirtualAddress() const = 0;
+  virtual uint64_t getVirtualAddress() const = 0;
+};
+
+} // namespace pdb
+} // namespace llvm
+
+#endif
diff --git a/include/llvm/DebugInfo/PDB/IPDBSession.h b/include/llvm/DebugInfo/PDB/IPDBSession.h
index 24573cdb779..88fd02c0a34 100644
--- a/include/llvm/DebugInfo/PDB/IPDBSession.h
+++ b/include/llvm/DebugInfo/PDB/IPDBSession.h
@@ -91,6 +91,9 @@ public:
 
   virtual std::unique_ptr<IPDBEnumSectionContribs>
   getSectionContribs() const = 0;
+
+  virtual std::unique_ptr<IPDBEnumFrameData>
+  getFrameData() const = 0;
 };
 } // namespace pdb
 } // namespace llvm
diff --git a/include/llvm/DebugInfo/PDB/Native/NativeSession.h b/include/llvm/DebugInfo/PDB/Native/NativeSession.h
index 07ce85ef820..4878e47d312 100644
--- a/include/llvm/DebugInfo/PDB/Native/NativeSession.h
+++ b/include/llvm/DebugInfo/PDB/Native/NativeSession.h
@@ -93,6 +93,8 @@ public:
 
   std::unique_ptr<IPDBEnumSectionContribs> getSectionContribs() const override;
 
+  std::unique_ptr<IPDBEnumFrameData> getFrameData() const override;
+
   PDBFile &getPDBFile() { return *Pdb; }
   const PDBFile &getPDBFile() const { return *Pdb; }
 
diff --git a/include/llvm/DebugInfo/PDB/PDBTypes.h b/include/llvm/DebugInfo/PDB/PDBTypes.h
index 6247018ce0f..917f3ed7391 100644
--- a/include/llvm/DebugInfo/PDB/PDBTypes.h
+++ b/include/llvm/DebugInfo/PDB/PDBTypes.h
@@ -12,6 +12,7 @@
 
 #include "llvm/DebugInfo/CodeView/CodeView.h"
 #include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
+#include "llvm/DebugInfo/PDB/IPDBFrameData.h"
 #include "llvm/DebugInfo/PDB/Native/RawTypes.h"
 #include <cctype>
 #include <cstddef>
@@ -71,6 +72,7 @@ using IPDBEnumLineNumbers = IPDBEnumChildren<IPDBLineNumber>;
 using IPDBEnumTables = IPDBEnumChildren<IPDBTable>;
 using IPDBEnumInjectedSources = IPDBEnumChildren<IPDBInjectedSource>;
 using IPDBEnumSectionContribs = IPDBEnumChildren<IPDBSectionContrib>;
+using IPDBEnumFrameData = IPDBEnumChildren<IPDBFrameData>;
 
 /// Specifies which PDB reader implementation is to be used.  Only a value
 /// of PDB_ReaderType::DIA is currently supported, but Native is in the works.
diff --git a/lib/DebugInfo/PDB/CMakeLists.txt b/lib/DebugInfo/PDB/CMakeLists.txt
index 86dcfdaa163..d9d379f6d09 100644
--- a/lib/DebugInfo/PDB/CMakeLists.txt
+++ b/lib/DebugInfo/PDB/CMakeLists.txt
@@ -14,6 +14,7 @@ if(LLVM_ENABLE_DIA_SDK)
   add_pdb_impl_folder(DIA
     DIA/DIADataStream.cpp
     DIA/DIAEnumDebugStreams.cpp
+    DIA/DIAEnumFrameData.cpp
     DIA/DIAEnumInjectedSources.cpp
     DIA/DIAEnumLineNumbers.cpp
     DIA/DIAEnumSectionContribs.cpp
@@ -21,6 +22,7 @@ if(LLVM_ENABLE_DIA_SDK)
     DIA/DIAEnumSymbols.cpp
     DIA/DIAEnumTables.cpp
     DIA/DIAError.cpp
+    DIA/DIAFrameData.cpp
     DIA/DIAInjectedSource.cpp
     DIA/DIALineNumber.cpp
     DIA/DIARawSymbol.cpp
diff --git a/lib/DebugInfo/PDB/DIA/DIAEnumFrameData.cpp b/lib/DebugInfo/PDB/DIA/DIAEnumFrameData.cpp
new file mode 100644
index 00000000000..77514483e04
--- /dev/null
+++ b/lib/DebugInfo/PDB/DIA/DIAEnumFrameData.cpp
@@ -0,0 +1,43 @@
+//==- DIAEnumFrameData.cpp ---------------------------------------*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/DIA/DIAEnumFrameData.h"
+#include "llvm/DebugInfo/PDB/DIA/DIAFrameData.h"
+#include "llvm/DebugInfo/PDB/DIA/DIASession.h"
+
+using namespace llvm::pdb;
+
+DIAEnumFrameData::DIAEnumFrameData(const DIASession &PDBSession,
+                                   CComPtr<IDiaEnumFrameData> DiaEnumerator)
+    : Session(PDBSession), Enumerator(DiaEnumerator) {}
+
+uint32_t DIAEnumFrameData::getChildCount() const {
+  LONG Count = 0;
+  return (S_OK == Enumerator->get_Count(&Count)) ? Count : 0;
+}
+
+std::unique_ptr<IPDBFrameData>
+DIAEnumFrameData::getChildAtIndex(uint32_t Index) const {
+  CComPtr<IDiaFrameData> Item;
+  if (S_OK != Enumerator->Item(Index, &Item))
+    return nullptr;
+
+  return std::unique_ptr<IPDBFrameData>(new DIAFrameData(Session, Item));
+}
+
+std::unique_ptr<IPDBFrameData> DIAEnumFrameData::getNext() {
+  CComPtr<IDiaFrameData> Item;
+  ULONG NumFetched = 0;
+  if (S_OK != Enumerator->Next(1, &Item, &NumFetched))
+    return nullptr;
+
+  return std::unique_ptr<IPDBFrameData>(new DIAFrameData(Session, Item));
+}
+
+void DIAEnumFrameData::reset() { Enumerator->Reset(); }
diff --git a/lib/DebugInfo/PDB/DIA/DIAFrameData.cpp b/lib/DebugInfo/PDB/DIA/DIAFrameData.cpp
new file mode 100644
index 00000000000..b904a2ff60a
--- /dev/null
+++ b/lib/DebugInfo/PDB/DIA/DIAFrameData.cpp
@@ -0,0 +1,54 @@
+//===- DIAFrameData.cpp - DIA impl. of IPDBFrameData -------------- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/DIA/DIAFrameData.h"
+#include "llvm/DebugInfo/PDB/DIA/DIASession.h"
+#include "llvm/DebugInfo/PDB/DIA/DIAUtils.h"
+
+using namespace llvm::pdb;
+
+DIAFrameData::DIAFrameData(const DIASession &PDBSession,
+                           CComPtr<IDiaFrameData> DiaFrameData)
+    : Session(PDBSession), FrameData(DiaFrameData) {}
+
+template <typename ArgType>
+ArgType
+PrivateGetDIAValue(IDiaFrameData *FrameData,
+                   HRESULT (__stdcall IDiaFrameData::*Method)(ArgType *)) {
+  ArgType Value;
+  if (S_OK == (FrameData->*Method)(&Value))
+    return static_cast<ArgType>(Value);
+
+  return ArgType();
+}
+
+uint32_t DIAFrameData::getAddressOffset() const {
+  return PrivateGetDIAValue(FrameData, &IDiaFrameData::get_addressOffset);
+}
+
+uint32_t DIAFrameData::getAddressSection() const {
+  return PrivateGetDIAValue(FrameData, &IDiaFrameData::get_addressSection);
+}
+
+uint32_t DIAFrameData::getLengthBlock() const {
+  return PrivateGetDIAValue(FrameData, &IDiaFrameData::get_lengthBlock);
+}
+
+std::string DIAFrameData::getProgram() const {
+  return invokeBstrMethod(*FrameData, &IDiaFrameData::get_program);
+}
+
+uint32_t DIAFrameData::getRelativeVirtualAddress() const {
+  return PrivateGetDIAValue(FrameData,
+                            &IDiaFrameData::get_relativeVirtualAddress);
+}
+
+uint64_t DIAFrameData::getVirtualAddress() const {
+  return PrivateGetDIAValue(FrameData, &IDiaFrameData::get_virtualAddress);
+}
diff --git a/lib/DebugInfo/PDB/DIA/DIASession.cpp b/lib/DebugInfo/PDB/DIA/DIASession.cpp
index 7726fe13264..b89ca9a858f 100644
--- a/lib/DebugInfo/PDB/DIA/DIASession.cpp
+++ b/lib/DebugInfo/PDB/DIA/DIASession.cpp
@@ -9,6 +9,7 @@
 #include "llvm/DebugInfo/PDB/DIA/DIASession.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/DebugInfo/PDB/DIA/DIAEnumDebugStreams.h"
+#include "llvm/DebugInfo/PDB/DIA/DIAEnumFrameData.h"
 #include "llvm/DebugInfo/PDB/DIA/DIAEnumInjectedSources.h"
 #include "llvm/DebugInfo/PDB/DIA/DIAEnumLineNumbers.h"
 #include "llvm/DebugInfo/PDB/DIA/DIAEnumSectionContribs.h"
@@ -419,3 +420,13 @@ DIASession::getSectionContribs() const {
 
   return llvm::make_unique<DIAEnumSectionContribs>(*this, Sections);
 }
+
+std::unique_ptr<IPDBEnumFrameData>
+DIASession::getFrameData() const {
+  CComPtr<IDiaEnumFrameData> FD =
+      getTableEnumerator<IDiaEnumFrameData>(*Session);
+  if (!FD)
+    return nullptr;
+
+  return llvm::make_unique<DIAEnumFrameData>(*this, FD);
+}
diff --git a/lib/DebugInfo/PDB/Native/NativeSession.cpp b/lib/DebugInfo/PDB/Native/NativeSession.cpp
index baab0a2399c..7807e312365 100644
--- a/lib/DebugInfo/PDB/Native/NativeSession.cpp
+++ b/lib/DebugInfo/PDB/Native/NativeSession.cpp
@@ -200,6 +200,11 @@ NativeSession::getSectionContribs() const {
   return nullptr;
 }
 
+std::unique_ptr<IPDBEnumFrameData>
+NativeSession::getFrameData() const {
+  return nullptr;
+}
+
 void NativeSession::initializeExeSymbol() {
   if (ExeSymbol == 0)
     ExeSymbol = Cache.createSymbol<NativeExeSymbol>();
diff --git a/lib/DebugInfo/PDB/PDBInterfaceAnchors.cpp b/lib/DebugInfo/PDB/PDBInterfaceAnchors.cpp
index c62796507a0..951909295d1 100644
--- a/lib/DebugInfo/PDB/PDBInterfaceAnchors.cpp
+++ b/lib/DebugInfo/PDB/PDBInterfaceAnchors.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/DebugInfo/PDB/IPDBDataStream.h"
+#include "llvm/DebugInfo/PDB/IPDBFrameData.h"
 #include "llvm/DebugInfo/PDB/IPDBInjectedSource.h"
 #include "llvm/DebugInfo/PDB/IPDBLineNumber.h"
 #include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
@@ -35,3 +36,5 @@ IPDBTable::~IPDBTable() = default;
 IPDBInjectedSource::~IPDBInjectedSource() = default;
 
 IPDBSectionContrib::~IPDBSectionContrib() = default;
+
+IPDBFrameData::~IPDBFrameData() = default;
diff --git a/unittests/DebugInfo/PDB/PDBApiTest.cpp b/unittests/DebugInfo/PDB/PDBApiTest.cpp
index 948bde1bf72..007ea904085 100644
--- a/unittests/DebugInfo/PDB/PDBApiTest.cpp
+++ b/unittests/DebugInfo/PDB/PDBApiTest.cpp
@@ -159,6 +159,10 @@ class MockSession : public IPDBSession {
   std::unique_ptr<IPDBEnumSectionContribs> getSectionContribs() const override {
     return nullptr;
   }
+
+  std::unique_ptr<IPDBEnumFrameData> getFrameData() const override {
+    return nullptr;
+  }
 };
 
 class MockRawSymbol : public IPDBRawSymbol {
-- 
GitLab


From 084d5e1748d32de6955d90ff9aeedb84b4afd883 Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <benny.kra@googlemail.com>
Date: Mon, 22 Oct 2018 10:51:34 +0000
Subject: [PATCH 0371/1116] [CGProfile] Turn constant-size SmallVector into
 array

No functionality change.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344893 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Instrumentation/CGProfile.cpp | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/lib/Transforms/Instrumentation/CGProfile.cpp b/lib/Transforms/Instrumentation/CGProfile.cpp
index 9606b3da247..cdcd0172690 100644
--- a/lib/Transforms/Instrumentation/CGProfile.cpp
+++ b/lib/Transforms/Instrumentation/CGProfile.cpp
@@ -88,11 +88,10 @@ void CGProfilePass::addModuleFlags(
   std::vector<Metadata *> Nodes;
 
   for (auto E : Counts) {
-    SmallVector<Metadata *, 3> Vals;
-    Vals.push_back(ValueAsMetadata::get(E.first.first));
-    Vals.push_back(ValueAsMetadata::get(E.first.second));
-    Vals.push_back(MDB.createConstant(
-        ConstantInt::get(Type::getInt64Ty(Context), E.second)));
+    Metadata *Vals[] = {ValueAsMetadata::get(E.first.first),
+                        ValueAsMetadata::get(E.first.second),
+                        MDB.createConstant(ConstantInt::get(
+                            Type::getInt64Ty(Context), E.second))};
     Nodes.push_back(MDNode::get(Context, Vals));
   }
 
-- 
GitLab


From 872c921c7d4729b354e984234cd04baeda8a7469 Mon Sep 17 00:00:00 2001
From: Nemanja Ivanovic <nemanja.i.ibm@gmail.com>
Date: Mon, 22 Oct 2018 11:22:59 +0000
Subject: [PATCH 0372/1116] [PowerPC][NFC] Fix bugs in r+r to r+i conversion

The D-Form VSX loads introduced in ISA 3.0 are not direct D-Form equivalent of
the corresponding X-Forms since they only target the Altivec registers.
Namely LXSSPX can load into any of the 64 VSX registers whereas LXSSP can only
load into the upper 32 VSX registers. Similarly with the remaining affected
instructions.

There is currently no way that I can see to trigger the bug, but as we add other
ways of exploiting these instructions, there may very well be instances that do.

This is an NFC patch in practical terms since the changes it introduces can not
be triggered without an MIR test.

Differential revision: https://reviews.llvm.org/D53323


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344894 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/PowerPC/PPCInstrInfo.cpp           | 68 +++++++++++++++----
 lib/Target/PowerPC/PPCInstrInfo.h             |  3 +-
 .../PowerPC/convert-rr-to-ri-instrs.mir       | 24 +++----
 3 files changed, 70 insertions(+), 25 deletions(-)

diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp
index 883f8390b7d..559ed59bec9 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -2319,7 +2319,7 @@ MachineInstr *PPCInstrInfo::getForwardingDefMI(
       Opc == PPC::RLDICL_32 || Opc == PPC::RLDICL_32_64 ||
       Opc == PPC::RLWINM || Opc == PPC::RLWINMo ||
       Opc == PPC::RLWINM8 || Opc == PPC::RLWINM8o;
-    if (!instrHasImmForm(MI, III) && !ConvertibleImmForm)
+    if (!instrHasImmForm(MI, III, true) && !ConvertibleImmForm)
       return nullptr;
 
     // Don't convert or %X, %Y, %Y since that's just a register move.
@@ -2421,7 +2421,7 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI,
     *KilledDef = DefMI;
 
   ImmInstrInfo III;
-  bool HasImmForm = instrHasImmForm(MI, III);
+  bool HasImmForm = instrHasImmForm(MI, III, PostRA);
   // If this is a reg+reg instruction that has a reg+imm form,
   // and one of the operands is produced by an add-immediate,
   // try to convert it.
@@ -2644,8 +2644,12 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI,
   return false;
 }
 
+static bool isVFReg(unsigned Reg) {
+  return PPC::VFRCRegClass.contains(Reg);
+}
+
 bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI,
-                                   ImmInstrInfo &III) const {
+                                   ImmInstrInfo &III, bool PostRA) const {
   unsigned Opc = MI.getOpcode();
   // The vast majority of the instructions would need their operand 2 replaced
   // with an immediate when switching to the reg+imm form. A marked exception
@@ -2946,13 +2950,20 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI,
     case PPC::STFDUX: III.ImmOpcode = PPC::STFDU; break;
     }
     break;
-  // Power9 only.
+  // Power9 and up only. For some of these, the X-Form version has access to all
+  // 64 VSR's whereas the D-Form only has access to the VR's. We replace those
+  // with pseudo-ops pre-ra and for post-ra, we check that the register loaded
+  // into or stored from is one of the VR registers.
   case PPC::LXVX:
   case PPC::LXSSPX:
   case PPC::LXSDX:
   case PPC::STXVX:
   case PPC::STXSSPX:
   case PPC::STXSDX:
+  case PPC::XFLOADf32:
+  case PPC::XFLOADf64:
+  case PPC::XFSTOREf32:
+  case PPC::XFSTOREf64:
     if (!Subtarget.hasP9Vector())
       return false;
     III.SignedImm = true;
@@ -2962,6 +2973,7 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI,
     III.IsSummingOperands = true;
     III.ImmOpNo = 1;
     III.OpNoForForwarding = 2;
+    III.ImmMustBeMultipleOf = 4;
     switch(Opc) {
     default: llvm_unreachable("Unknown opcode");
     case PPC::LXVX:
@@ -2969,24 +2981,56 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI,
       III.ImmMustBeMultipleOf = 16;
       break;
     case PPC::LXSSPX:
-      III.ImmOpcode = PPC::LXSSP;
-      III.ImmMustBeMultipleOf = 4;
+      if (PostRA) {
+        if (isVFReg(MI.getOperand(0).getReg()))
+          III.ImmOpcode = PPC::LXSSP;
+        else
+          III.ImmOpcode = PPC::LFS;
+        break;
+      }
+      LLVM_FALLTHROUGH;
+    case PPC::XFLOADf32:
+      III.ImmOpcode = PPC::DFLOADf32;
       break;
     case PPC::LXSDX:
-      III.ImmOpcode = PPC::LXSD;
-      III.ImmMustBeMultipleOf = 4;
+      if (PostRA) {
+        if (isVFReg(MI.getOperand(0).getReg()))
+          III.ImmOpcode = PPC::LXSD;
+        else
+          III.ImmOpcode = PPC::LFD;
+        break;
+      }
+      LLVM_FALLTHROUGH;
+    case PPC::XFLOADf64:
+      III.ImmOpcode = PPC::DFLOADf64;
       break;
     case PPC::STXVX:
       III.ImmOpcode = PPC::STXV;
       III.ImmMustBeMultipleOf = 16;
       break;
     case PPC::STXSSPX:
-      III.ImmOpcode = PPC::STXSSP;
-      III.ImmMustBeMultipleOf = 4;
+      if (PostRA) {
+        if (isVFReg(MI.getOperand(0).getReg()))
+          III.ImmOpcode = PPC::STXSSP;
+        else
+          III.ImmOpcode = PPC::STFS;
+        break;
+      }
+      LLVM_FALLTHROUGH;
+    case PPC::XFSTOREf32:
+      III.ImmOpcode = PPC::DFSTOREf32;
       break;
     case PPC::STXSDX:
-      III.ImmOpcode = PPC::STXSD;
-      III.ImmMustBeMultipleOf = 4;
+      if (PostRA) {
+        if (isVFReg(MI.getOperand(0).getReg()))
+          III.ImmOpcode = PPC::STXSD;
+        else
+          III.ImmOpcode = PPC::STFD;
+        break;
+      }
+      LLVM_FALLTHROUGH;
+    case PPC::XFSTOREf64:
+      III.ImmOpcode = PPC::DFSTOREf64;
       break;
     }
     break;
diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h
index 8a062daab55..9c556e32496 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/lib/Target/PowerPC/PPCInstrInfo.h
@@ -414,7 +414,8 @@ public:
                               MachineInstr **KilledDef = nullptr) const;
   void replaceInstrWithLI(MachineInstr &MI, const LoadImmediateInfo &LII) const;
 
-  bool instrHasImmForm(const MachineInstr &MI, ImmInstrInfo &III) const;
+  bool instrHasImmForm(const MachineInstr &MI, ImmInstrInfo &III,
+                       bool PostRA) const;
 
   /// getRegNumForOperand - some operands use different numbering schemes
   /// for the same registers. For example, a VSX instruction may have any of
diff --git a/test/CodeGen/PowerPC/convert-rr-to-ri-instrs.mir b/test/CodeGen/PowerPC/convert-rr-to-ri-instrs.mir
index c9038e87af5..e210ec5c523 100644
--- a/test/CodeGen/PowerPC/convert-rr-to-ri-instrs.mir
+++ b/test/CodeGen/PowerPC/convert-rr-to-ri-instrs.mir
@@ -3265,15 +3265,15 @@ body:             |
     %4 = INSERT_SUBREG %5, killed %3, 1
     %6 = LI8 100
     %7 = LXSDX %0, killed %6, implicit $rm :: (load 8 from %ir.arrayidx, !tbaa !12)
-    ; CHECK: LXSD 100, %0
-    ; CHECK-LATE: lxsd 0, 100(3)
+    ; CHECK: DFLOADf64 100, %0
+    ; CHECK-LATE: lfd 0, 100(3)
     %8 = ADDI %2, 2
     %10 = IMPLICIT_DEF
     %9 = INSERT_SUBREG %10, killed %8, 1
     %11 = LI8 -120
     %12 = LXSDX %0, killed %11, implicit $rm :: (load 8 from %ir.arrayidx3, !tbaa !12)
-    ; CHECK: LXSD -120, %0
-    ; CHECK-LATE: lxsd 1, -120(3)
+    ; CHECK: DFLOADf64 -120, %0
+    ; CHECK-LATE: lfd 1, -120(3)
     %13 = XSADDDP killed %7, killed %12, implicit $rm
     $f1 = COPY %13
     BLR8 implicit $lr8, implicit $rm, implicit $f1
@@ -3338,15 +3338,15 @@ body:             |
     %4 = INSERT_SUBREG %5, killed %3, 1
     %6 = LI8 96
     %7 = LXSSPX %0, killed %6 :: (load 4 from %ir.arrayidx, !tbaa !14)
-    ; CHECK: LXSSP 96, %0
-    ; CHECK-LATE: lxssp 0, 96(3)
+    ; CHECK: DFLOADf32 96, %0
+    ; CHECK-LATE: lfs 0, 96(3)
     %8 = ADDI %2, 2
     %10 = IMPLICIT_DEF
     %9 = INSERT_SUBREG %10, killed %8, 1
     %11 = LI8 -92
     %12 = LXSSPX %0, killed %11 :: (load 4 from %ir.arrayidx3, !tbaa !14)
-    ; CHECK: LXSSP -92, %0
-    ; CHECK-LATE: lxssp 1, -92(3)
+    ; CHECK: DFLOADf32 -92, %0
+    ; CHECK-LATE: lfs 1, -92(3)
     %13 = XSADDSP killed %7, killed %12
     $f1 = COPY %13
     BLR8 implicit $lr8, implicit $rm, implicit $f1
@@ -6031,8 +6031,8 @@ body:             |
     %0 = COPY $x3
     %3 = LI8 444
     STXSSPX %1, %0, killed %3 :: (store 4 into %ir.arrayidx, !tbaa !14)
-    ; CHECK: STXSSP %1, 444, %0
-    ; CHECK-LATE: stxssp 1, 444(3)
+    ; CHECK: DFSTOREf32 %1, 444, %0
+    ; CHECK-LATE: stfs 1, 444(3)
     BLR8 implicit $lr8, implicit $rm
 
 ...
@@ -6083,8 +6083,8 @@ body:             |
     %0 = COPY $x3
     %3 = LI8 4
     STXSDX %1, %0, killed %3, implicit $rm :: (store 8 into %ir.arrayidx, !tbaa !12)
-    ; CHECK: STXSD %1, 4, %0
-    ; CHECK-LATE: stxsd 1, 4(3)
+    ; CHECK: DFSTOREf64 %1, 4, %0
+    ; CHECK-LATE: stfd 1, 4(3)
     BLR8 implicit $lr8, implicit $rm
 
 ...
-- 
GitLab


From 879e2ef2a6771a2f8a6285977142be5871de265d Mon Sep 17 00:00:00 2001
From: George Rimar <grimar@accesssoftek.com>
Date: Mon, 22 Oct 2018 11:30:54 +0000
Subject: [PATCH 0373/1116] [llvm-dwarfdump] - Add the support of parsing
 .debug_loclists.

This teaches llvm-dwarfdump to dump the content of .debug_loclists sections.

It converts the DWARFDebugLocDWO class to DWARFDebugLoclists,
teaches llvm-dwarfdump about .debug_loclists section and
adds the implementation for parsing the DW_LLE_offset_pair entries.

Differential revision: https://reviews.llvm.org/D53364

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344895 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/BinaryFormat/Dwarf.def           |   1 +
 include/llvm/DebugInfo/DWARF/DWARFContext.h   |   4 +-
 include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h  |  14 +-
 include/llvm/DebugInfo/DWARF/DWARFObject.h    |   1 +
 lib/DebugInfo/DWARF/DWARFContext.cpp          |  36 +++-
 lib/DebugInfo/DWARF/DWARFDebugLoc.cpp         |  78 +++++---
 lib/DebugInfo/DWARF/DWARFDie.cpp              |  22 ++-
 .../X86/dwarfdump-debug-loclists.test         | 168 ++++++++++++++++++
 8 files changed, 288 insertions(+), 36 deletions(-)
 create mode 100644 test/DebugInfo/X86/dwarfdump-debug-loclists.test

diff --git a/include/llvm/BinaryFormat/Dwarf.def b/include/llvm/BinaryFormat/Dwarf.def
index 6b7a7412f4d..512cc64926d 100644
--- a/include/llvm/BinaryFormat/Dwarf.def
+++ b/include/llvm/BinaryFormat/Dwarf.def
@@ -873,6 +873,7 @@ HANDLE_DWARF_SECTION(DebugTypes, ".debug_types", "debug-types")
 HANDLE_DWARF_SECTION(DebugLine, ".debug_line", "debug-line")
 HANDLE_DWARF_SECTION(DebugLineStr, ".debug_line_str", "debug-line-str")
 HANDLE_DWARF_SECTION(DebugLoc, ".debug_loc", "debug-loc")
+HANDLE_DWARF_SECTION(DebugLoclists, ".debug_loclists", "debug-loclists")
 HANDLE_DWARF_SECTION(DebugFrame, ".debug_frame", "debug-frame")
 HANDLE_DWARF_SECTION(DebugMacro, ".debug_macro", "debug-macro")
 HANDLE_DWARF_SECTION(DebugNames, ".debug_names", "debug-names")
diff --git a/include/llvm/DebugInfo/DWARF/DWARFContext.h b/include/llvm/DebugInfo/DWARF/DWARFContext.h
index c5b98ea5a2a..221f1f79698 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFContext.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFContext.h
@@ -76,7 +76,7 @@ class DWARFContext : public DIContext {
 
   DWARFUnitVector DWOUnits;
   std::unique_ptr<DWARFDebugAbbrev> AbbrevDWO;
-  std::unique_ptr<DWARFDebugLocDWO> LocDWO;
+  std::unique_ptr<DWARFDebugLoclists> LocDWO;
 
   /// The maximum DWARF version of all units.
   unsigned MaxVersion = 0;
@@ -262,7 +262,7 @@ public:
   const DWARFDebugAbbrev *getDebugAbbrevDWO();
 
   /// Get a pointer to the parsed DebugLoc object.
-  const DWARFDebugLocDWO *getDebugLocDWO();
+  const DWARFDebugLoclists *getDebugLocDWO();
 
   /// Get a pointer to the parsed DebugAranges object.
   const DWARFDebugAranges *getDebugAranges();
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h b/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h
index 9a73745fb6b..ad44c2c83fb 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h
@@ -73,19 +73,21 @@ public:
                                               uint32_t *Offset);
 };
 
-class DWARFDebugLocDWO {
+class DWARFDebugLoclists {
 public:
   struct Entry {
-    uint64_t Start;
-    uint32_t Length;
+    uint8_t Kind;
+    uint64_t Value0;
+    uint64_t Value1;
     SmallVector<char, 4> Loc;
   };
 
   struct LocationList {
     unsigned Offset;
     SmallVector<Entry, 2> Entries;
-    void dump(raw_ostream &OS, bool IsLittleEndian, unsigned AddressSize,
-              const MCRegisterInfo *RegInfo, unsigned Indent) const;
+    void dump(raw_ostream &OS, uint64_t BaseAddr, bool IsLittleEndian,
+              unsigned AddressSize, const MCRegisterInfo *RegInfo,
+              unsigned Indent) const;
   };
 
 private:
@@ -99,7 +101,7 @@ private:
 
 public:
   void parse(DataExtractor data);
-  void dump(raw_ostream &OS, const MCRegisterInfo *RegInfo,
+  void dump(raw_ostream &OS, uint64_t BaseAddr, const MCRegisterInfo *RegInfo,
             Optional<uint64_t> Offset) const;
 
   /// Return the location list at the given offset or nullptr.
diff --git a/include/llvm/DebugInfo/DWARF/DWARFObject.h b/include/llvm/DebugInfo/DWARF/DWARFObject.h
index 6e8f370f4ae..8e582da3172 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFObject.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFObject.h
@@ -38,6 +38,7 @@ public:
   forEachTypesSections(function_ref<void(const DWARFSection &)> F) const {}
   virtual StringRef getAbbrevSection() const { return ""; }
   virtual const DWARFSection &getLocSection() const { return Dummy; }
+  virtual const DWARFSection &getLoclistsSection() const { return Dummy; }
   virtual StringRef getARangeSection() const { return ""; }
   virtual StringRef getDebugFrameSection() const { return ""; }
   virtual StringRef getEHFrameSection() const { return ""; }
diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp
index 18ec8476e9d..1f3753809a2 100644
--- a/lib/DebugInfo/DWARF/DWARFContext.cpp
+++ b/lib/DebugInfo/DWARF/DWARFContext.cpp
@@ -292,6 +292,27 @@ dumpRnglistsSection(raw_ostream &OS, DWARFDataExtractor &rnglistData,
   }
 }
 
+static void dumpLoclistsSection(raw_ostream &OS, DIDumpOptions DumpOpts,
+                                DWARFDataExtractor Data,
+                                const MCRegisterInfo *MRI,
+                                Optional<uint64_t> DumpOffset) {
+  uint32_t Offset = 0;
+  DWARFDebugLoclists Loclists;
+
+  DWARFListTableHeader Header(".debug_loclists", "locations");
+  if (Error E = Header.extract(Data, &Offset)) {
+    WithColor::error() << toString(std::move(E)) << '\n';
+    return;
+  }
+
+  Header.dump(OS, DumpOpts);
+  DataExtractor LocData(Data.getData().drop_front(Offset),
+                        Data.isLittleEndian(), Header.getAddrSize());
+
+  Loclists.parse(LocData);
+  Loclists.dump(OS, 0, MRI, DumpOffset);
+}
+
 void DWARFContext::dump(
     raw_ostream &OS, DIDumpOptions DumpOpts,
     std::array<Optional<uint64_t>, DIDT_ID_Count> DumpOffsets) {
@@ -366,9 +387,15 @@ void DWARFContext::dump(
                  DObj->getLocSection().Data)) {
     getDebugLoc()->dump(OS, getRegisterInfo(), DumpOffset);
   }
+  if (shouldDump(Explicit, ".debug_loclists", DIDT_ID_DebugLoclists,
+                 DObj->getLoclistsSection().Data)) {
+    DWARFDataExtractor Data(*DObj, DObj->getLoclistsSection(), isLittleEndian(),
+                            0);
+    dumpLoclistsSection(OS, DumpOpts, Data, getRegisterInfo(), DumpOffset);
+  }
   if (shouldDump(ExplicitDWO, ".debug_loc.dwo", DIDT_ID_DebugLoc,
                  DObj->getLocDWOSection().Data)) {
-    getDebugLocDWO()->dump(OS, getRegisterInfo(), DumpOffset);
+    getDebugLocDWO()->dump(OS, 0, getRegisterInfo(), DumpOffset);
   }
 
   if (shouldDump(Explicit, ".debug_frame", DIDT_ID_DebugFrame,
@@ -696,11 +723,11 @@ const DWARFDebugLoc *DWARFContext::getDebugLoc() {
   return Loc.get();
 }
 
-const DWARFDebugLocDWO *DWARFContext::getDebugLocDWO() {
+const DWARFDebugLoclists *DWARFContext::getDebugLocDWO() {
   if (LocDWO)
     return LocDWO.get();
 
-  LocDWO.reset(new DWARFDebugLocDWO());
+  LocDWO.reset(new DWARFDebugLoclists());
   // Assume all compile units have the same address byte size.
   // FIXME: We don't need AddressSize for split DWARF since relocatable
   // addresses cannot appear there. At the moment DWARFExpression requires it.
@@ -1213,6 +1240,7 @@ class DWARFObjInMemory final : public DWARFObject {
 
   DWARFSectionMap InfoSection;
   DWARFSectionMap LocSection;
+  DWARFSectionMap LocListsSection;
   DWARFSectionMap LineSection;
   DWARFSectionMap RangeSection;
   DWARFSectionMap RnglistsSection;
@@ -1234,6 +1262,7 @@ class DWARFObjInMemory final : public DWARFObject {
     return StringSwitch<DWARFSectionMap *>(Name)
         .Case("debug_info", &InfoSection)
         .Case("debug_loc", &LocSection)
+        .Case("debug_loclists", &LocListsSection)
         .Case("debug_line", &LineSection)
         .Case("debug_str_offsets", &StringOffsetSection)
         .Case("debug_ranges", &RangeSection)
@@ -1529,6 +1558,7 @@ public:
 
   StringRef getAbbrevSection() const override { return AbbrevSection; }
   const DWARFSection &getLocSection() const override { return LocSection; }
+  const DWARFSection &getLoclistsSection() const override { return LocListsSection; }
   StringRef getARangeSection() const override { return ARangeSection; }
   StringRef getDebugFrameSection() const override { return DebugFrameSection; }
   StringRef getEHFrameSection() const override { return EHFrameSection; }
diff --git a/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp b/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
index 617b914ecce..bfcf799d230 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
@@ -144,24 +144,39 @@ void DWARFDebugLoc::parse(const DWARFDataExtractor &data) {
     WithColor::error() << "failed to consume entire .debug_loc section\n";
 }
 
-Optional<DWARFDebugLocDWO::LocationList>
-DWARFDebugLocDWO::parseOneLocationList(DataExtractor Data, unsigned *Offset) {
+Optional<DWARFDebugLoclists::LocationList>
+DWARFDebugLoclists::parseOneLocationList(DataExtractor Data, unsigned *Offset) {
   LocationList LL;
   LL.Offset = *Offset;
 
   // dwarf::DW_LLE_end_of_list_entry is 0 and indicates the end of the list.
   while (auto Kind =
              static_cast<dwarf::LocationListEntry>(Data.getU8(Offset))) {
-    if (Kind != dwarf::DW_LLE_startx_length) {
+
+    Entry E;
+    E.Kind = Kind;
+    switch (Kind) {
+    case dwarf::DW_LLE_startx_length:
+      E.Value0 = Data.getULEB128(Offset);
+      E.Value1 = Data.getU32(Offset);
+      break;
+    case dwarf::DW_LLE_start_length:
+      E.Value0 = Data.getAddress(Offset);
+      E.Value1 = Data.getULEB128(Offset);
+      break;
+    case dwarf::DW_LLE_offset_pair:
+      E.Value0 = Data.getULEB128(Offset);
+      E.Value1 = Data.getULEB128(Offset);
+      break;
+    case dwarf::DW_LLE_base_address:
+      E.Value0 = Data.getAddress(Offset);
+      break;
+    default:
       WithColor::error() << "dumping support for LLE of kind " << (int)Kind
                          << " not implemented\n";
       return None;
     }
 
-    Entry E;
-    E.Start = Data.getULEB128(Offset);
-    E.Length = Data.getU32(Offset);
-
     unsigned Bytes = Data.getU16(Offset);
     // A single location description describing the location of the object...
     StringRef str = Data.getData().substr(*Offset, Bytes);
@@ -174,7 +189,7 @@ DWARFDebugLocDWO::parseOneLocationList(DataExtractor Data, unsigned *Offset) {
   return LL;
 }
 
-void DWARFDebugLocDWO::parse(DataExtractor data) {
+void DWARFDebugLoclists::parse(DataExtractor data) {
   IsLittleEndian = data.isLittleEndian();
   AddressSize = data.getAddressSize();
 
@@ -187,8 +202,8 @@ void DWARFDebugLocDWO::parse(DataExtractor data) {
   }
 }
 
-DWARFDebugLocDWO::LocationList const *
-DWARFDebugLocDWO::getLocationListAtOffset(uint64_t Offset) const {
+DWARFDebugLoclists::LocationList const *
+DWARFDebugLoclists::getLocationListAtOffset(uint64_t Offset) const {
   auto It = std::lower_bound(
       Locations.begin(), Locations.end(), Offset,
       [](const LocationList &L, uint64_t Offset) { return L.Offset < Offset; });
@@ -197,23 +212,46 @@ DWARFDebugLocDWO::getLocationListAtOffset(uint64_t Offset) const {
   return nullptr;
 }
 
-void DWARFDebugLocDWO::LocationList::dump(raw_ostream &OS, bool IsLittleEndian,
-                                          unsigned AddressSize,
-                                          const MCRegisterInfo *MRI,
-                                          unsigned Indent) const {
+void DWARFDebugLoclists::LocationList::dump(raw_ostream &OS, uint64_t BaseAddr,
+                                            bool IsLittleEndian,
+                                            unsigned AddressSize,
+                                            const MCRegisterInfo *MRI,
+                                            unsigned Indent) const {
   for (const Entry &E : Entries) {
-    OS << '\n';
-    OS.indent(Indent);
-    OS << "Addr idx " << E.Start << " (w/ length " << E.Length << "): ";
+    switch (E.Kind) {
+    case dwarf::DW_LLE_startx_length:
+      OS << '\n';
+      OS.indent(Indent);
+      OS << "Addr idx " << E.Value0 << " (w/ length " << E.Value1 << "): ";
+      break;
+    case dwarf::DW_LLE_start_length:
+      OS << '\n';
+      OS.indent(Indent);
+      OS << format("[0x%8.8x, 0x%8.8x): ", E.Value0, E.Value0 + E.Value1);
+      break;
+    case dwarf::DW_LLE_offset_pair:
+      OS << '\n';
+      OS.indent(Indent);
+      OS << format("[0x%8.8x, 0x%8.8x): ", BaseAddr + E.Value0,
+                   BaseAddr + E.Value1);
+      break;
+    case dwarf::DW_LLE_base_address:
+      BaseAddr = E.Value0;
+      break;
+    default:
+      llvm_unreachable("unreachable locations list kind");
+    }
+
     dumpExpression(OS, E.Loc, IsLittleEndian, AddressSize, MRI);
   }
 }
 
-void DWARFDebugLocDWO::dump(raw_ostream &OS, const MCRegisterInfo *MRI,
-                            Optional<uint64_t> Offset) const {
+void DWARFDebugLoclists::dump(raw_ostream &OS, uint64_t BaseAddr,
+                              const MCRegisterInfo *MRI,
+                              Optional<uint64_t> Offset) const {
   auto DumpLocationList = [&](const LocationList &L) {
     OS << format("0x%8.8x: ", L.Offset);
-    L.dump(OS, IsLittleEndian, AddressSize, MRI, /*Indent=*/12);
+    L.dump(OS, BaseAddr, IsLittleEndian, AddressSize, MRI, /*Indent=*/12);
     OS << "\n\n";
   };
 
diff --git a/lib/DebugInfo/DWARF/DWARFDie.cpp b/lib/DebugInfo/DWARF/DWARFDie.cpp
index b4413653290..76430b41f18 100644
--- a/lib/DebugInfo/DWARF/DWARFDie.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDie.cpp
@@ -102,7 +102,7 @@ static void dumpLocation(raw_ostream &OS, DWARFFormValue &FormValue,
   FormValue.dump(OS, DumpOpts);
   if (FormValue.isFormClass(DWARFFormValue::FC_SectionOffset)) {
     uint32_t Offset = *FormValue.getAsSectionOffset();
-    if (!U->isDWOUnit()) {
+    if (!U->isDWOUnit() && !U->getLocSection()->Data.empty()) {
       DWARFDebugLoc DebugLoc;
       DWARFDataExtractor Data(Obj, *U->getLocSection(), Ctx.isLittleEndian(),
                               Obj.getAddressSize());
@@ -115,11 +115,23 @@ static void dumpLocation(raw_ostream &OS, DWARFFormValue &FormValue,
                  Indent);
       } else
         OS << "error extracting location list.";
-    } else {
-      DataExtractor Data(U->getLocSectionData(), Ctx.isLittleEndian(), 0);
-      auto LL = DWARFDebugLocDWO::parseOneLocationList(Data, &Offset);
+      return;
+    }
+
+    StringRef LoclistsSectionData =
+        U->isDWOUnit() ? U->getLocSectionData() : Obj.getLoclistsSection().Data;
+    if (!LoclistsSectionData.empty()) {
+      DataExtractor Data(LoclistsSectionData, Ctx.isLittleEndian(),
+                         Obj.getAddressSize());
+      auto LL = DWARFDebugLoclists::parseOneLocationList(Data, &Offset);
+
+      uint64_t BaseAddr = 0;
+      if (Optional<SectionedAddress> BA = U->getBaseAddress())
+        BaseAddr = BA->Address;
+
       if (LL)
-        LL->dump(OS, Ctx.isLittleEndian(), Obj.getAddressSize(), MRI, Indent);
+        LL->dump(OS, BaseAddr, Ctx.isLittleEndian(), Obj.getAddressSize(), MRI,
+                 Indent);
       else
         OS << "error extracting location list.";
     }
diff --git a/test/DebugInfo/X86/dwarfdump-debug-loclists.test b/test/DebugInfo/X86/dwarfdump-debug-loclists.test
new file mode 100644
index 00000000000..de8cb15596a
--- /dev/null
+++ b/test/DebugInfo/X86/dwarfdump-debug-loclists.test
@@ -0,0 +1,168 @@
+# RUN: llvm-mc %s -filetype obj -triple x86_64-pc-linux -o %t.o
+# RUN: llvm-dwarfdump -v %t.o | FileCheck %s
+
+# CHECK:      .debug_info
+# CHECK:       DW_AT_name{{.*}}"stub"
+# CHECK:       DW_AT_location [DW_FORM_sec_offset]   (0x0000000c
+# CHECK-NEXT:    [0x00000010, 0x00000020): DW_OP_breg5 RDI+0
+# CHECK-NEXT:    [0x00000530, 0x00000540): DW_OP_breg6 RBP-8, DW_OP_deref
+# CHECK-NEXT:    [0x00000700, 0x00000710): DW_OP_breg5 RDI+0
+
+# CHECK:      .debug_loclists contents:
+# CHECK-NEXT: 0x00000000: locations list header: length = 0x00000031, version = 0x0005, addr_size = 0x08, seg_size = 0x00, offset_entry_count = 0x00000000
+# CHECK-NEXT: 0x00000000:
+# CHECK-NEXT:   [0x00000000, 0x00000010): DW_OP_breg5 RDI+0
+# CHECK-NEXT:   [0x00000530, 0x00000540): DW_OP_breg6 RBP-8, DW_OP_deref
+# CHECK-NEXT:   [0x00000700, 0x00000710): DW_OP_breg5 RDI+0
+
+.section  .debug_str,"MS",@progbits,1
+  .asciz  "stub"
+
+.section  .debug_str_offsets,"",@progbits
+  .long  68
+  .short  5
+  .short  0
+.Lstr_offsets_base0:
+  .zero 64
+
+.section  .debug_loclists,"",@progbits
+  .long  .Ldebug_loclist_table_end0-.Ldebug_loclist_table_start0
+.Ldebug_loclist_table_start0:
+ .short 5                        # Version.
+ .byte 8                         # Address size.
+ .byte 0                         # Segmen selector size.
+ .long 0                         # Offset entry count.
+.Lloclists_table_base0:
+.Ldebug_loc0:
+  .byte  4                       # DW_LLE_offset_pair
+  .uleb128 0x0                   #   starting offset
+  .uleb128 0x10                  #   ending offset
+  .short  2                      # Loc expr size
+  .byte  117                     # DW_OP_breg5
+  .byte  0                       # 0
+  
+  .byte  6                       # DW_LLE_base_address
+  .quad  0x500                   # Some address
+  .short  0                      # Loc expr size = 0.
+  
+  .byte  4                       # DW_LLE_offset_pair
+  .uleb128 0x30                  #   starting offset
+  .uleb128 0x40                  #   ending offset
+  .short  3                      # Loc expr size
+  .byte  118                     # DW_OP_breg6
+  .byte  120                     # -8
+  .byte  6                       # DW_OP_deref
+
+  .byte  8                       # DW_LLE_start_length
+  .quad  0x700                   # Some address
+  .uleb128 0x10                  #   length
+  .short  2                      # Loc expr size
+  .byte  117                     # DW_OP_breg5
+  .byte  0                       # 0
+  
+  .byte  0                       # DW_LLE_end_of_list
+
+.Ldebug_loclist_table_end0:
+
+.section  .debug_abbrev,"",@progbits
+  .byte  1                       # Abbreviation Code
+  .byte  17                      # DW_TAG_compile_unit
+  .byte  1                       # DW_CHILDREN_yes
+  .byte  37                      # DW_AT_producer
+  .byte  37                      # DW_FORM_strx1
+  .byte  19                      # DW_AT_language
+  .byte  5                       # DW_FORM_data2
+  .byte  3                       # DW_AT_name
+  .byte  37                      # DW_FORM_strx1
+  .byte  114                     # DW_AT_str_offsets_base
+  .byte  23                      # DW_FORM_sec_offset
+  .byte  16                      # DW_AT_stmt_list
+  .byte  23                      # DW_FORM_sec_offset
+  .byte  27                      # DW_AT_comp_dir
+  .byte  37                      # DW_FORM_strx1
+  .byte  17                      # DW_AT_low_pc
+  .byte  1                       # DW_FORM_addr
+  .byte  18                      # DW_AT_high_pc
+  .byte  6                       # DW_FORM_data4
+  .ascii  "\214\001"             # DW_AT_loclists_base
+  .byte  23                      # DW_FORM_sec_offset
+  .byte  0                       # EOM(1)
+  .byte  0                       # EOM(2)
+  .byte  2                       # Abbreviation Code
+  .byte  46                      # DW_TAG_subprogram
+  .byte  1                       # DW_CHILDREN_yes
+  .byte  17                      # DW_AT_low_pc
+  .byte  1                       # DW_FORM_addr
+  .byte  18                      # DW_AT_high_pc
+  .byte  6                       # DW_FORM_data4
+  .byte  64                      # DW_AT_frame_base
+  .byte  24                      # DW_FORM_exprloc
+  .byte  110                     # DW_AT_linkage_name
+  .byte  37                      # DW_FORM_strx1
+  .byte  3                       # DW_AT_name
+  .byte  37                      # DW_FORM_strx1
+  .byte  58                      # DW_AT_decl_file
+  .byte  11                      # DW_FORM_data1
+  .byte  59                      # DW_AT_decl_line
+  .byte  11                      # DW_FORM_data1
+  .byte  63                      # DW_AT_external
+  .byte  25                      # DW_FORM_flag_present
+  .byte  0                       # EOM(1)
+  .byte  0                       # EOM(2)
+  .byte  3                       # Abbreviation Code
+  .byte  52                      # DW_TAG_variable
+  .byte  0                       # DW_CHILDREN_no
+  .byte  2                       # DW_AT_location
+  .byte  23                      # DW_FORM_sec_offset
+  .byte  3                       # DW_AT_name
+  .byte  37                      # DW_FORM_strx1
+  .byte  58                      # DW_AT_decl_file
+  .byte  11                      # DW_FORM_data1
+  .byte  59                      # DW_AT_decl_line
+  .byte  11                      # DW_FORM_data1
+  .byte  73                      # DW_AT_type
+  .byte  19                      # DW_FORM_ref4
+  .byte  0                       # EOM(1)
+  .byte  0                       # EOM(2)
+  .byte  0                       # EOM(3)
+
+.section  .debug_info,"",@progbits
+.Lcu_begin0:
+  .long  70                      # Length of Unit
+  .short  5                      # DWARF version number
+  .byte  1                       # DWARF Unit Type
+  .byte  8                       # Address Size (in bytes)
+  .long  .debug_abbrev           # Offset Into Abbrev. Section
+  .byte  1                       # Abbrev [1] 0xc:0xef DW_TAG_compile_unit
+  .byte  0                       # DW_AT_producer
+  .short  4                      # DW_AT_language
+  .byte  1                       # DW_AT_name
+  .long  .Lstr_offsets_base0     # DW_AT_str_offsets_base
+  .long  .Lline_table_start0     # DW_AT_stmt_list
+  .byte  2                       # DW_AT_comp_dir
+  .quad  0x10                    # DW_AT_low_pc
+  .long  0                       # DW_AT_high_pc
+  .long  .Lloclists_table_base0  # DW_AT_loclists_base
+  .byte  2                       # Abbrev [2] 0x2a:0x20 DW_TAG_subprogram
+  .quad  0                       # DW_AT_low_pc
+  .long  0                       # DW_AT_high_pc
+  .byte  1                       # DW_AT_frame_base
+  .byte  86
+  .byte  11                      # DW_AT_linkage_name
+  .byte  12                      # DW_AT_name
+  .byte  1                       # DW_AT_decl_file
+  .byte  6                       # DW_AT_decl_line
+                                 # DW_AT_external
+  .byte 3                        # Abbrev [3] 0x40:0xb DW_TAG_variable
+  .long .Ldebug_loc0             # DW_AT_location
+  .byte 7                        # DW_AT_name
+  .byte 1                        # DW_AT_decl_file
+  .byte 6                        # DW_AT_decl_line
+  .long 76                       # DW_AT_type
+  .byte 0                        # End Of Children Mark
+  .byte 0                        # End Of Children Mark
+  .byte  0                       # End Of Children Mark
+
+.section .debug_line,"",@progbits
+.Lline_table_start0:
+
-- 
GitLab


From ce9fb23fb1f1c43c0b1476962c0291eafc1a56aa Mon Sep 17 00:00:00 2001
From: George Rimar <grimar@accesssoftek.com>
Date: Mon, 22 Oct 2018 12:18:30 +0000
Subject: [PATCH 0374/1116] [llvm-dwarfdump] - Fix win10 build bot failture.

Bot failed:
http://lab.llvm.org:8011/builders/llvm-clang-lld-x86_64-scei-ps4-windows10pro-fast/builds/20877/steps/test/logs/stdio

This was broken after the
r344895 "[llvm-dwarfdump] - Add the support of parsing .debug_loclists."
because of wrong formatting specifiers used.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344896 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/DebugInfo/DWARF/DWARFDebugLoc.cpp            |  9 ++++++---
 test/DebugInfo/X86/dwarfdump-debug-loclists.test | 12 ++++++------
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp b/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
index bfcf799d230..b4bdaaac0c2 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
@@ -227,13 +227,16 @@ void DWARFDebugLoclists::LocationList::dump(raw_ostream &OS, uint64_t BaseAddr,
     case dwarf::DW_LLE_start_length:
       OS << '\n';
       OS.indent(Indent);
-      OS << format("[0x%8.8x, 0x%8.8x): ", E.Value0, E.Value0 + E.Value1);
+      OS << format("[0x%*.*" PRIx64 ", 0x%*.*x): ", AddressSize * 2,
+                   AddressSize * 2, E.Value0, AddressSize * 2, AddressSize * 2,
+                   E.Value0 + E.Value1);
       break;
     case dwarf::DW_LLE_offset_pair:
       OS << '\n';
       OS.indent(Indent);
-      OS << format("[0x%8.8x, 0x%8.8x): ", BaseAddr + E.Value0,
-                   BaseAddr + E.Value1);
+      OS << format("[0x%*.*" PRIx64 ", 0x%*.*x): ", AddressSize * 2,
+                   AddressSize * 2, BaseAddr + E.Value0, AddressSize * 2,
+                   AddressSize * 2, BaseAddr + E.Value1);
       break;
     case dwarf::DW_LLE_base_address:
       BaseAddr = E.Value0;
diff --git a/test/DebugInfo/X86/dwarfdump-debug-loclists.test b/test/DebugInfo/X86/dwarfdump-debug-loclists.test
index de8cb15596a..e5f7fb0c1c1 100644
--- a/test/DebugInfo/X86/dwarfdump-debug-loclists.test
+++ b/test/DebugInfo/X86/dwarfdump-debug-loclists.test
@@ -4,16 +4,16 @@
 # CHECK:      .debug_info
 # CHECK:       DW_AT_name{{.*}}"stub"
 # CHECK:       DW_AT_location [DW_FORM_sec_offset]   (0x0000000c
-# CHECK-NEXT:    [0x00000010, 0x00000020): DW_OP_breg5 RDI+0
-# CHECK-NEXT:    [0x00000530, 0x00000540): DW_OP_breg6 RBP-8, DW_OP_deref
-# CHECK-NEXT:    [0x00000700, 0x00000710): DW_OP_breg5 RDI+0
+# CHECK-NEXT:    [0x0000000000000010, 0x0000000000000020): DW_OP_breg5 RDI+0
+# CHECK-NEXT:    [0x0000000000000530, 0x0000000000000540): DW_OP_breg6 RBP-8, DW_OP_deref
+# CHECK-NEXT:    [0x0000000000000700, 0x0000000000000710): DW_OP_breg5 RDI+0
 
 # CHECK:      .debug_loclists contents:
 # CHECK-NEXT: 0x00000000: locations list header: length = 0x00000031, version = 0x0005, addr_size = 0x08, seg_size = 0x00, offset_entry_count = 0x00000000
 # CHECK-NEXT: 0x00000000:
-# CHECK-NEXT:   [0x00000000, 0x00000010): DW_OP_breg5 RDI+0
-# CHECK-NEXT:   [0x00000530, 0x00000540): DW_OP_breg6 RBP-8, DW_OP_deref
-# CHECK-NEXT:   [0x00000700, 0x00000710): DW_OP_breg5 RDI+0
+# CHECK-NEXT:   [0x0000000000000000, 0x0000000000000010): DW_OP_breg5 RDI+0
+# CHECK-NEXT:   [0x0000000000000530, 0x0000000000000540): DW_OP_breg6 RBP-8, DW_OP_deref
+# CHECK-NEXT:   [0x0000000000000700, 0x0000000000000710): DW_OP_breg5 RDI+0
 
 .section  .debug_str,"MS",@progbits,1
   .asciz  "stub"
-- 
GitLab


From ca81ba500b01cd713c491104cbd4e8262223d915 Mon Sep 17 00:00:00 2001
From: Petar Avramovic <Petar.Avramovic@rt-rk.com>
Date: Mon, 22 Oct 2018 13:27:50 +0000
Subject: [PATCH 0375/1116] Test commit: change comment.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344900 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/Mips/MipsCallLowering.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/Target/Mips/MipsCallLowering.h b/lib/Target/Mips/MipsCallLowering.h
index 389db3a3b68..a0d4464e2c0 100644
--- a/lib/Target/Mips/MipsCallLowering.h
+++ b/lib/Target/Mips/MipsCallLowering.h
@@ -81,7 +81,7 @@ private:
                                       SmallVectorImpl<T> &ISDArgs) const;
 
   /// Split structures and arrays, save original argument indices since
-  /// Mips calling conv needs info about original argument type.
+  /// Mips calling convention needs info about original argument type.
   void splitToValueTypes(const ArgInfo &OrigArg, unsigned OriginalIndex,
                          SmallVectorImpl<ArgInfo> &SplitArgs,
                          SmallVectorImpl<unsigned> &SplitArgsOrigIndices) const;
-- 
GitLab


From 090b8892dc16d42a59cc3f9e61c7467e11eb8fc0 Mon Sep 17 00:00:00 2001
From: Roman Lebedev <lebedev.ri@gmail.com>
Date: Mon, 22 Oct 2018 13:54:17 +0000
Subject: [PATCH 0376/1116] [X86][BMI1]: X86DAGToDAGISel: select BEXTR from  x
 & ((1 << nbits) + (-1))  pattern

Summary:
Trivial continuation of D52304.
While this pattern is not canonical, we do select it in the BZHI case,
so this should not be any different.

Reviewers: RKSimon, craig.topper, spatel

Reviewed By: RKSimon

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D52348

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344902 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelDAGToDAG.cpp  |  25 +++-
 test/CodeGen/X86/extract-bits.ll    | 207 ++++++++++------------------
 test/CodeGen/X86/extract-lowbits.ll | 122 +++++-----------
 3 files changed, 130 insertions(+), 224 deletions(-)

diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 5e9fbf83f90..b6503754497 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -2700,6 +2700,24 @@ bool X86DAGToDAGISel::matchBEXTR(SDNode *Node) {
 
   SDValue NBits;
 
+  // a) x & ((1 << nbits) + (-1))
+  auto matchPatternA = [&NBits](SDValue Mask) -> bool {
+    // Match `add`. Must only have one use!
+    if (Mask->getOpcode() != ISD::ADD || !Mask->hasOneUse())
+      return false;
+    // We should be adding all-ones constant (i.e. subtracting one.)
+    if (!isAllOnesConstant(Mask->getOperand(1)))
+      return false;
+    // Match `1 << nbits`. Must only have one use!
+    SDValue M0 = Mask->getOperand(0);
+    if (M0->getOpcode() != ISD::SHL || !M0->hasOneUse())
+      return false;
+    if (!isOneConstant(M0->getOperand(0)))
+      return false;
+    NBits = M0->getOperand(1);
+    return true;
+  };
+
   // b) x & ~(-1 << nbits)
   auto matchPatternB = [&NBits](SDValue Mask) -> bool {
     // Match `~()`. Must only have one use!
@@ -2715,9 +2733,10 @@ bool X86DAGToDAGISel::matchBEXTR(SDNode *Node) {
     return true;
   };
 
-  auto matchLowBitMask = [&matchPatternB](SDValue Mask) -> bool {
-    // FIXME: patterns a, c, d.
-    return matchPatternB(Mask);
+  auto matchLowBitMask = [&matchPatternA,
+                          &matchPatternB](SDValue Mask) -> bool {
+    // FIXME: patterns c, d.
+    return matchPatternA(Mask) || matchPatternB(Mask);
   };
 
   SDValue X = Node->getOperand(0);
diff --git a/test/CodeGen/X86/extract-bits.ll b/test/CodeGen/X86/extract-bits.ll
index 06f316b14d0..a7d91ede4be 100644
--- a/test/CodeGen/X86/extract-bits.ll
+++ b/test/CodeGen/X86/extract-bits.ll
@@ -48,17 +48,12 @@ define i32 @bextr32_a0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_a0:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    pushl %esi
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
-; X86-BMI1NOTBM-NEXT:    movl $1, %eax
-; X86-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    decl %eax
-; X86-BMI1NOTBM-NEXT:    andl %esi, %eax
-; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_a0:
@@ -86,11 +81,8 @@ define i32 @bextr32_a0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
-; X64-BMI1NOTBM-NEXT:    movl $1, %eax
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    decl %eax
-; X64-BMI1NOTBM-NEXT:    andl %edi, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_a0:
@@ -123,17 +115,12 @@ define i32 @bextr32_a1_indexzext(i32 %val, i8 zeroext %numskipbits, i8 zeroext %
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_a1_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    pushl %esi
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
-; X86-BMI1NOTBM-NEXT:    movl $1, %eax
-; X86-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    decl %eax
-; X86-BMI1NOTBM-NEXT:    andl %esi, %eax
-; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_a1_indexzext:
@@ -161,11 +148,8 @@ define i32 @bextr32_a1_indexzext(i32 %val, i8 zeroext %numskipbits, i8 zeroext %
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
-; X64-BMI1NOTBM-NEXT:    movl $1, %eax
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    decl %eax
-; X64-BMI1NOTBM-NEXT:    andl %edi, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_a1_indexzext:
@@ -201,18 +185,13 @@ define i32 @bextr32_a2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_a2_load:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    pushl %esi
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl (%eax), %esi
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
-; X86-BMI1NOTBM-NEXT:    movl $1, %eax
-; X86-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    decl %eax
-; X86-BMI1NOTBM-NEXT:    andl %esi, %eax
-; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    movl (%edx), %edx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_a2_load:
@@ -240,14 +219,11 @@ define i32 @bextr32_a2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind
 ; X64-BMI1NOTBM-LABEL: bextr32_a2_load:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl (%rdi), %esi
+; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %esi
-; X64-BMI1NOTBM-NEXT:    movl $1, %eax
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    decl %eax
-; X64-BMI1NOTBM-NEXT:    andl %esi, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %eax, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_a2_load:
@@ -282,18 +258,13 @@ define i32 @bextr32_a3_load_indexzext(i32* %w, i8 zeroext %numskipbits, i8 zeroe
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_a3_load_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    pushl %esi
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl (%eax), %esi
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
-; X86-BMI1NOTBM-NEXT:    movl $1, %eax
-; X86-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    decl %eax
-; X86-BMI1NOTBM-NEXT:    andl %esi, %eax
-; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    movl (%edx), %edx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_a3_load_indexzext:
@@ -321,14 +292,11 @@ define i32 @bextr32_a3_load_indexzext(i32* %w, i8 zeroext %numskipbits, i8 zeroe
 ; X64-BMI1NOTBM-LABEL: bextr32_a3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl (%rdi), %esi
+; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %esi
-; X64-BMI1NOTBM-NEXT:    movl $1, %eax
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    decl %eax
-; X64-BMI1NOTBM-NEXT:    andl %esi, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %eax, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_a3_load_indexzext:
@@ -364,17 +332,12 @@ define i32 @bextr32_a4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits)
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_a4_commutative:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    pushl %esi
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
-; X86-BMI1NOTBM-NEXT:    movl $1, %eax
-; X86-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    decl %eax
-; X86-BMI1NOTBM-NEXT:    andl %esi, %eax
-; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_a4_commutative:
@@ -402,11 +365,8 @@ define i32 @bextr32_a4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits)
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
-; X64-BMI1NOTBM-NEXT:    movl $1, %eax
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    decl %eax
-; X64-BMI1NOTBM-NEXT:    andl %edi, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_a4_commutative:
@@ -447,25 +407,19 @@ define i32 @bextr32_a5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_a5_skipextrauses:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    pushl %edi
 ; X86-BMI1NOTBM-NEXT:    pushl %esi
-; X86-BMI1NOTBM-NEXT:    pushl %eax
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %dl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl %eax, %ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
-; X86-BMI1NOTBM-NEXT:    movl $1, %esi
-; X86-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %esi
-; X86-BMI1NOTBM-NEXT:    decl %esi
-; X86-BMI1NOTBM-NEXT:    andl %edi, %esi
-; X86-BMI1NOTBM-NEXT:    movl %eax, (%esp)
+; X86-BMI1NOTBM-NEXT:    subl $8, %esp
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %esi
+; X86-BMI1NOTBM-NEXT:    movl %ecx, (%esp)
 ; X86-BMI1NOTBM-NEXT:    calll use32
 ; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
-; X86-BMI1NOTBM-NEXT:    addl $4, %esp
+; X86-BMI1NOTBM-NEXT:    addl $8, %esp
 ; X86-BMI1NOTBM-NEXT:    popl %esi
-; X86-BMI1NOTBM-NEXT:    popl %edi
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_a5_skipextrauses:
@@ -504,11 +458,8 @@ define i32 @bextr32_a5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X64-BMI1NOTBM-NEXT:    pushq %rbx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
-; X64-BMI1NOTBM-NEXT:    movl $1, %ebx
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %ebx
-; X64-BMI1NOTBM-NEXT:    decl %ebx
-; X64-BMI1NOTBM-NEXT:    andl %edi, %ebx
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %ebx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %edi
 ; X64-BMI1NOTBM-NEXT:    callq use32
 ; X64-BMI1NOTBM-NEXT:    movl %ebx, %eax
@@ -662,11 +613,8 @@ define i64 @bextr64_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
-; X64-BMI1NOTBM-NEXT:    movl $1, %eax
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    decq %rax
-; X64-BMI1NOTBM-NEXT:    andq %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_a0:
@@ -805,14 +753,12 @@ define i64 @bextr64_a1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %
 ;
 ; X64-BMI1NOTBM-LABEL: bextr64_a1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    # kill: def $edx killed $edx def $rdx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
-; X64-BMI1NOTBM-NEXT:    movl $1, %eax
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    decq %rax
-; X64-BMI1NOTBM-NEXT:    andq %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_a1_indexzext:
@@ -960,14 +906,11 @@ define i64 @bextr64_a2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X64-BMI1NOTBM-LABEL: bextr64_a2_load:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rsi
+; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rsi
-; X64-BMI1NOTBM-NEXT:    movl $1, %eax
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    decq %rax
-; X64-BMI1NOTBM-NEXT:    andq %rsi, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rax, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_a2_load:
@@ -1111,15 +1054,13 @@ define i64 @bextr64_a3_load_indexzext(i64* %w, i8 zeroext %numskipbits, i8 zeroe
 ;
 ; X64-BMI1NOTBM-LABEL: bextr64_a3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    # kill: def $edx killed $edx def $rdx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rsi
+; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rsi
-; X64-BMI1NOTBM-NEXT:    movl $1, %eax
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    decq %rax
-; X64-BMI1NOTBM-NEXT:    andq %rsi, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rax, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_a3_load_indexzext:
@@ -1266,11 +1207,8 @@ define i64 @bextr64_a4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
-; X64-BMI1NOTBM-NEXT:    movl $1, %eax
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    decq %rax
-; X64-BMI1NOTBM-NEXT:    andq %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_a4_commutative:
@@ -1454,13 +1392,10 @@ define i64 @bextr64_a5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X64-BMI1NOTBM-LABEL: bextr64_a5_skipextrauses:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    pushq %rbx
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
+; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
-; X64-BMI1NOTBM-NEXT:    movl $1, %ebx
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rbx
-; X64-BMI1NOTBM-NEXT:    decq %rbx
-; X64-BMI1NOTBM-NEXT:    andq %rdi, %rbx
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rbx
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rdi
 ; X64-BMI1NOTBM-NEXT:    callq use64
 ; X64-BMI1NOTBM-NEXT:    movq %rbx, %rax
diff --git a/test/CodeGen/X86/extract-lowbits.ll b/test/CodeGen/X86/extract-lowbits.ll
index 43df34000d4..eae52441dfc 100644
--- a/test/CodeGen/X86/extract-lowbits.ll
+++ b/test/CodeGen/X86/extract-lowbits.ll
@@ -39,11 +39,9 @@ define i32 @bzhi32_a0(i32 %val, i32 %numlowbits) nounwind {
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi32_a0:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl $1, %eax
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    decl %eax
-; X86-BMI1NOTBM-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_a0:
@@ -64,12 +62,8 @@ define i32 @bzhi32_a0(i32 %val, i32 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_a0:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl $1, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    decl %eax
-; X64-BMI1NOTBM-NEXT:    andl %edi, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %esi
+; X64-BMI1NOTBM-NEXT:    bextrl %esi, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_a0:
@@ -94,11 +88,9 @@ define i32 @bzhi32_a1_indexzext(i32 %val, i8 zeroext %numlowbits) nounwind {
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi32_a1_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl $1, %eax
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    decl %eax
-; X86-BMI1NOTBM-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_a1_indexzext:
@@ -119,12 +111,8 @@ define i32 @bzhi32_a1_indexzext(i32 %val, i8 zeroext %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_a1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl $1, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    decl %eax
-; X64-BMI1NOTBM-NEXT:    andl %edi, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %esi
+; X64-BMI1NOTBM-NEXT:    bextrl %esi, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_a1_indexzext:
@@ -151,12 +139,10 @@ define i32 @bzhi32_a2_load(i32* %w, i32 %numlowbits) nounwind {
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi32_a2_load:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl $1, %eax
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    decl %eax
-; X86-BMI1NOTBM-NEXT:    andl (%edx), %eax
+; X86-BMI1NOTBM-NEXT:    shll $8, %ecx
+; X86-BMI1NOTBM-NEXT:    bextrl %ecx, (%eax), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_a2_load:
@@ -178,12 +164,8 @@ define i32 @bzhi32_a2_load(i32* %w, i32 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_a2_load:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl $1, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    decl %eax
-; X64-BMI1NOTBM-NEXT:    andl (%rdi), %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %esi
+; X64-BMI1NOTBM-NEXT:    bextrl %esi, (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_a2_load:
@@ -210,12 +192,10 @@ define i32 @bzhi32_a3_load_indexzext(i32* %w, i8 zeroext %numlowbits) nounwind {
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi32_a3_load_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl $1, %eax
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    decl %eax
-; X86-BMI1NOTBM-NEXT:    andl (%edx), %eax
+; X86-BMI1NOTBM-NEXT:    shll $8, %ecx
+; X86-BMI1NOTBM-NEXT:    bextrl %ecx, (%eax), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_a3_load_indexzext:
@@ -237,12 +217,8 @@ define i32 @bzhi32_a3_load_indexzext(i32* %w, i8 zeroext %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_a3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl $1, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    decl %eax
-; X64-BMI1NOTBM-NEXT:    andl (%rdi), %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %esi
+; X64-BMI1NOTBM-NEXT:    bextrl %esi, (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_a3_load_indexzext:
@@ -269,11 +245,9 @@ define i32 @bzhi32_a4_commutative(i32 %val, i32 %numlowbits) nounwind {
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi32_a4_commutative:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl $1, %eax
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    decl %eax
-; X86-BMI1NOTBM-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_a4_commutative:
@@ -294,12 +268,8 @@ define i32 @bzhi32_a4_commutative(i32 %val, i32 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_a4_commutative:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl $1, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    decl %eax
-; X64-BMI1NOTBM-NEXT:    andl %edi, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %esi
+; X64-BMI1NOTBM-NEXT:    bextrl %esi, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_a4_commutative:
@@ -384,12 +354,8 @@ define i64 @bzhi64_a0(i64 %val, i64 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_a0:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    movl $1, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    decq %rax
-; X64-BMI1NOTBM-NEXT:    andq %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
+; X64-BMI1NOTBM-NEXT:    bextrq %rsi, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_a0:
@@ -472,12 +438,9 @@ define i64 @bzhi64_a1_indexzext(i64 %val, i8 zeroext %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_a1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl $1, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    decq %rax
-; X64-BMI1NOTBM-NEXT:    andq %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
+; X64-BMI1NOTBM-NEXT:    bextrq %rsi, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_a1_indexzext:
@@ -571,12 +534,8 @@ define i64 @bzhi64_a2_load(i64* %w, i64 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_a2_load:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    movl $1, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    decq %rax
-; X64-BMI1NOTBM-NEXT:    andq (%rdi), %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
+; X64-BMI1NOTBM-NEXT:    bextrq %rsi, (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_a2_load:
@@ -669,12 +628,9 @@ define i64 @bzhi64_a3_load_indexzext(i64* %w, i8 zeroext %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_a3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl $1, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    decq %rax
-; X64-BMI1NOTBM-NEXT:    andq (%rdi), %rax
+; X64-BMI1NOTBM-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
+; X64-BMI1NOTBM-NEXT:    bextrq %rsi, (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_a3_load_indexzext:
@@ -760,12 +716,8 @@ define i64 @bzhi64_a4_commutative(i64 %val, i64 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_a4_commutative:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    movl $1, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    decq %rax
-; X64-BMI1NOTBM-NEXT:    andq %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
+; X64-BMI1NOTBM-NEXT:    bextrq %rsi, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_a4_commutative:
-- 
GitLab


From 9b7ef04cfb21269c461a442c96a365c3bdaf8f06 Mon Sep 17 00:00:00 2001
From: David Greene <greened@obbligato.org>
Date: Mon, 22 Oct 2018 14:04:13 +0000
Subject: [PATCH 0377/1116] Document bisect-skip-count

Provide an example of how to use bisect-skip count to find bugs.

Differential revision: https://reviews.llvm.org/D52314


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344903 91177308-0d34-0410-b5e6-96231b3b80d8
---
 utils/bisect-skip-count | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/utils/bisect-skip-count b/utils/bisect-skip-count
index b18b4f41481..f4f8ddcec79 100755
--- a/utils/bisect-skip-count
+++ b/utils/bisect-skip-count
@@ -1,6 +1,25 @@
 #!/usr/bin/env python
 # This script is used to bisect skip and count arguments for --debug-counter.
 # It is similar to bisect, except it understands how to increase skip and decrease count
+#
+# Typical usage:
+#
+# bisect-skip-count bisect-command.sh "%(skip)d" "%(count)d" 2>&1 | tee bisect.out
+#
+# bisect-command.sh is something like this:
+# #! /bin/bash
+#
+# skip=$1
+# count=$2
+#
+# opt -debug-counter=my-counter-skip=${skip},my-counter-count=${count}
+# ... Test output of opt and exit zero for pass, non-zero for fail
+#
+# Examine bisect.out to look for "Last good skip" and "Last good
+# count" to find the values of the counter that produce a passing
+# result.  Incrementing the last good count by one or decrementing the
+# last good skip by one should produce a failure.
+#
 import os
 import sys
 import argparse
-- 
GitLab


From a1fe5fbd7ae26effe99a24c54b7c8ef8f7d76700 Mon Sep 17 00:00:00 2001
From: Roman Lebedev <lebedev.ri@gmail.com>
Date: Mon, 22 Oct 2018 14:12:44 +0000
Subject: [PATCH 0378/1116] [X86] X86DAGToDAGISel: handle BZHI selection too,
 not just BEXTR.

Summary:
As discussed in D52304 / IRC, we now have pattern matching for
'bit extract' in two places - tablegen and `X86DAGToDAGISel`.
There are 4 patterns.
And we will have a problem with `x &  (-1 >> (32 - y))` pattern.
* If the mask is one-use, then it is always unfolded into `x << (32 - y) >> (32 - y)` first.
  Thus, the existing test coverage is already broken.
* If it is not one-use, then it is not unfolded, and is matched as BZHI.
* If it is not one-use, we will not match it as BEXTR. And if it is one-use, it will have been unfolded already.
So we will either not handle that pattern for BEXTR, or not have test coverage for it.
This is bad.

As discussed with @craig.topper, let's unify this matching, and do everything in `X86DAGToDAGISel`.
Then we will not have code duplication, and will have proper test coverage.

This indeed does not affect any tests, and this is great.
It means that for these two patterns, the `X86DAGToDAGISel` is identical to the tablegen version.

Please review carefully, i'm not fully sure about that intrinsic change, and introduction of the new `X86ISD` opcode.

Reviewers: craig.topper, RKSimon, spatel

Reviewed By: craig.topper

Subscribers: llvm-commits, craig.topper

Differential Revision: https://reviews.llvm.org/D53164

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344904 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelDAGToDAG.cpp | 40 +++++++++++++++++++++---------
 lib/Target/X86/X86ISelLowering.cpp |  1 +
 lib/Target/X86/X86ISelLowering.h   |  3 +++
 lib/Target/X86/X86InstrInfo.td     | 18 +++-----------
 lib/Target/X86/X86IntrinsicsInfo.h |  2 ++
 5 files changed, 38 insertions(+), 26 deletions(-)

diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index b6503754497..d3aa5c89adc 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -460,7 +460,7 @@ namespace {
 
     bool foldLoadStoreIntoMemOperand(SDNode *Node);
     MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
-    bool matchBEXTR(SDNode *Node);
+    bool matchBitExtract(SDNode *Node);
     bool shrinkAndImmediate(SDNode *N);
     bool isMaskZeroExtended(SDNode *N) const;
     bool tryShiftAmountMod(SDNode *N);
@@ -2681,15 +2681,15 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
   return true;
 }
 
-// See if this is an  X & Mask  that we can match to BEXTR.
+// See if this is an  X & Mask  that we can match to BEXTR/BZHI.
 // Where Mask is one of the following patterns:
 //   a) x &  (1 << nbits) - 1
 //   b) x & ~(-1 << nbits)
 //   c) x &  (-1 >> (32 - y))
 //   d) x << (32 - y) >> (32 - y)
-bool X86DAGToDAGISel::matchBEXTR(SDNode *Node) {
-  // BEXTR is BMI instruction. However, if we have BMI2, we prefer BZHI.
-  if (!Subtarget->hasBMI() || Subtarget->hasBMI2())
+bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
+  // BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one.
+  if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
     return false;
 
   MVT NVT = Node->getSimpleValueType(0);
@@ -2700,17 +2700,24 @@ bool X86DAGToDAGISel::matchBEXTR(SDNode *Node) {
 
   SDValue NBits;
 
+  // If we have BMI2's BZHI, we are ok with muti-use patterns.
+  // Else, if we only have BMI1's BEXTR, we require one-use.
+  const bool CanHaveExtraUses = Subtarget->hasBMI2();
+  auto checkOneUse = [CanHaveExtraUses](SDValue Op) {
+    return CanHaveExtraUses || Op.hasOneUse();
+  };
+
   // a) x & ((1 << nbits) + (-1))
-  auto matchPatternA = [&NBits](SDValue Mask) -> bool {
+  auto matchPatternA = [&checkOneUse, &NBits](SDValue Mask) -> bool {
     // Match `add`. Must only have one use!
-    if (Mask->getOpcode() != ISD::ADD || !Mask->hasOneUse())
+    if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
       return false;
     // We should be adding all-ones constant (i.e. subtracting one.)
     if (!isAllOnesConstant(Mask->getOperand(1)))
       return false;
     // Match `1 << nbits`. Must only have one use!
     SDValue M0 = Mask->getOperand(0);
-    if (M0->getOpcode() != ISD::SHL || !M0->hasOneUse())
+    if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
       return false;
     if (!isOneConstant(M0->getOperand(0)))
       return false;
@@ -2719,13 +2726,13 @@ bool X86DAGToDAGISel::matchBEXTR(SDNode *Node) {
   };
 
   // b) x & ~(-1 << nbits)
-  auto matchPatternB = [&NBits](SDValue Mask) -> bool {
+  auto matchPatternB = [&checkOneUse, &NBits](SDValue Mask) -> bool {
     // Match `~()`. Must only have one use!
-    if (!isBitwiseNot(Mask) || !Mask->hasOneUse())
+    if (!isBitwiseNot(Mask) || !checkOneUse(Mask))
       return false;
     // Match `-1 << nbits`. Must only have one use!
     SDValue M0 = Mask->getOperand(0);
-    if (M0->getOpcode() != ISD::SHL || !M0->hasOneUse())
+    if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
       return false;
     if (!isAllOnesConstant(M0->getOperand(0)))
       return false;
@@ -2761,6 +2768,15 @@ bool X86DAGToDAGISel::matchBEXTR(SDNode *Node) {
   NBits = CurDAG->getTargetInsertSubreg(X86::sub_8bit, DL, NVT, ImplDef, NBits);
   insertDAGNode(*CurDAG, OrigNBits, NBits);
 
+  if (Subtarget->hasBMI2()) {
+    // Great, just emit the the BZHI..
+    SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);
+    ReplaceNode(Node, Extract.getNode());
+    SelectCode(Extract.getNode());
+    return true;
+  }
+
+  // Else, emitting BEXTR requires one more step.
   // The 'control' of BEXTR has the pattern of:
   // [15...8 bit][ 7...0 bit] location
   // [ bit count][     shift] name
@@ -3168,7 +3184,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
       CurDAG->RemoveDeadNode(Node);
       return;
     }
-    if (matchBEXTR(Node))
+    if (matchBitExtract(Node))
       return;
     if (AndImmShrink && shrinkAndImmediate(Node))
       return;
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index ae5795db4ab..cdd76e1f03d 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -26630,6 +26630,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::XOR:                return "X86ISD::XOR";
   case X86ISD::AND:                return "X86ISD::AND";
   case X86ISD::BEXTR:              return "X86ISD::BEXTR";
+  case X86ISD::BZHI:               return "X86ISD::BZHI";
   case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
   case X86ISD::MOVMSK:             return "X86ISD::MOVMSK";
   case X86ISD::PTEST:              return "X86ISD::PTEST";
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 3e6c8929a9b..15321b12ff6 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -355,6 +355,9 @@ namespace llvm {
       // Bit field extract.
       BEXTR,
 
+      // Zero High Bits Starting with Specified Bit Position.
+      BZHI,
+
       // LOW, HI, FLAGS = umul LHS, RHS.
       UMUL,
 
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 15ed435244e..39c3bbfd90e 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -291,6 +291,8 @@ def X86lock_dec  : SDNode<"X86ISD::LDEC",  SDTLockUnaryArithWithFlags,
 
 def X86bextr  : SDNode<"X86ISD::BEXTR",  SDTIntBinOp>;
 
+def X86bzhi   : SDNode<"X86ISD::BZHI",   SDTIntBinOp>;
+
 def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>;
 
 def X86WinAlloca : SDNode<"X86ISD::WIN_ALLOCA", SDT_X86WIN_ALLOCA,
@@ -2454,9 +2456,9 @@ multiclass bmi_bzhi<bits<8> opc, string mnemonic, RegisterClass RC,
 
 let Predicates = [HasBMI2], Defs = [EFLAGS] in {
   defm BZHI32 : bmi_bzhi<0xF5, "bzhi{l}", GR32, i32mem,
-                         int_x86_bmi_bzhi_32, loadi32, WriteBZHI>;
+                         X86bzhi, loadi32, WriteBZHI>;
   defm BZHI64 : bmi_bzhi<0xF5, "bzhi{q}", GR64, i64mem,
-                         int_x86_bmi_bzhi_64, loadi64, WriteBZHI>, VEX_W;
+                         X86bzhi, loadi64, WriteBZHI>, VEX_W;
 }
 
 def CountTrailingOnes : SDNodeXForm<imm, [{
@@ -2512,18 +2514,6 @@ let Predicates = [HasBMI2] in {
   multiclass bmi_bzhi_patterns<RegisterClass RC, int bitwidth, ValueType VT,
                                Instruction DstInst, X86MemOperand x86memop,
                                Instruction DstMemInst> {
-    // x & ((1 << y) - 1)
-    defm : _bmi_bzhi_pattern<(and RC:$src, (add (shl 1, GR8:$lz), -1)),
-                             (and (x86memop addr:$src),
-                                  (add (shl 1, GR8:$lz), -1)),
-                             RC, VT, DstInst, DstMemInst>;
-
-    // x & ~(-1 << y)
-    defm : _bmi_bzhi_pattern<(and RC:$src, (xor (shl -1, GR8:$lz), -1)),
-                             (and (x86memop addr:$src),
-                                  (xor (shl -1, GR8:$lz), -1)),
-                             RC, VT, DstInst, DstMemInst>;
-
     // x & (-1 >> (bitwidth - y))
     defm : _bmi_bzhi_pattern<(and RC:$src, (srl -1, (sub bitwidth, GR8:$lz))),
                              (and (x86memop addr:$src),
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index 84c7878de61..252d64808f0 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -1120,6 +1120,8 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_vpshrd_w_512, INTR_TYPE_3OP_IMM8, X86ISD::VSHRD, 0),
   X86_INTRINSIC_DATA(bmi_bextr_32,         INTR_TYPE_2OP, X86ISD::BEXTR, 0),
   X86_INTRINSIC_DATA(bmi_bextr_64,         INTR_TYPE_2OP, X86ISD::BEXTR, 0),
+  X86_INTRINSIC_DATA(bmi_bzhi_32,          INTR_TYPE_2OP, X86ISD::BZHI, 0),
+  X86_INTRINSIC_DATA(bmi_bzhi_64,          INTR_TYPE_2OP, X86ISD::BZHI, 0),
   X86_INTRINSIC_DATA(sse_cmp_ps,        INTR_TYPE_3OP, X86ISD::CMPP, 0),
   X86_INTRINSIC_DATA(sse_comieq_ss,     COMI, X86ISD::COMI, ISD::SETEQ),
   X86_INTRINSIC_DATA(sse_comige_ss,     COMI, X86ISD::COMI, ISD::SETGE),
-- 
GitLab


From 84a4ea35b3b0544100580bf5a7cefbfc4dd5a627 Mon Sep 17 00:00:00 2001
From: Guillaume Chatelet <gchatelet@google.com>
Date: Mon, 22 Oct 2018 14:46:08 +0000
Subject: [PATCH 0379/1116] [llvm-exegesis] Reject x86 instructions that use
 non uniform memory accesses

Reviewers: courbet

Subscribers: tschuett, llvm-commits

Differential Revision: https://reviews.llvm.org/D53438

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344905 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-exegesis/lib/X86/Target.cpp | 89 ++++++++++++++++++--------
 1 file changed, 64 insertions(+), 25 deletions(-)

diff --git a/tools/llvm-exegesis/lib/X86/Target.cpp b/tools/llvm-exegesis/lib/X86/Target.cpp
index 1a7d290b1af..b7548f8f3c3 100644
--- a/tools/llvm-exegesis/lib/X86/Target.cpp
+++ b/tools/llvm-exegesis/lib/X86/Target.cpp
@@ -21,12 +21,74 @@ namespace exegesis {
 
 namespace {
 
+// A chunk of instruction's operands that represents a single memory access.
+struct MemoryOperandRange {
+  MemoryOperandRange(llvm::ArrayRef<Operand> Operands) : Ops(Operands) {}
+
+  // Setup InstructionTemplate so the memory access represented by this object
+  // points to [reg] + offset.
+  void fillOrDie(InstructionTemplate &IT, unsigned Reg, unsigned Offset) {
+    switch (Ops.size()) {
+    case 5:
+      IT.getValueFor(Ops[0]) = llvm::MCOperand::createReg(Reg);    // BaseReg
+      IT.getValueFor(Ops[1]) = llvm::MCOperand::createImm(1);      // ScaleAmt
+      IT.getValueFor(Ops[2]) = llvm::MCOperand::createReg(0);      // IndexReg
+      IT.getValueFor(Ops[3]) = llvm::MCOperand::createImm(Offset); // Disp
+      IT.getValueFor(Ops[4]) = llvm::MCOperand::createReg(0);      // Segment
+      break;
+    default:
+      llvm::errs() << Ops.size() << "-op are not handled right now ("
+                   << IT.Instr.Name << ")\n";
+      llvm_unreachable("Invalid memory configuration");
+    }
+  }
+
+  // Returns whether Range can be filled.
+  static bool isValid(const MemoryOperandRange &Range) {
+    return Range.Ops.size() == 5;
+  }
+
+  // Returns whether Op is a valid memory operand.
+  static bool isMemoryOperand(const Operand &Op) {
+    return Op.isMemory() && Op.isExplicit();
+  }
+
+  llvm::ArrayRef<Operand> Ops;
+};
+
+// X86 memory access involve non constant number of operands, this function
+// extracts contiguous memory operands into MemoryOperandRange so it's easier to
+// check and fill.
+static std::vector<MemoryOperandRange>
+getMemoryOperandRanges(llvm::ArrayRef<Operand> Operands) {
+  std::vector<MemoryOperandRange> Result;
+  while (!Operands.empty()) {
+    Operands = Operands.drop_until(MemoryOperandRange::isMemoryOperand);
+    auto MemoryOps = Operands.take_while(MemoryOperandRange::isMemoryOperand);
+    if (!MemoryOps.empty())
+      Result.push_back(MemoryOps);
+    Operands = Operands.drop_front(MemoryOps.size());
+  }
+  return Result;
+}
+
 static llvm::Error IsInvalidOpcode(const Instruction &Instr) {
   const auto OpcodeName = Instr.Name;
   if (OpcodeName.startswith("POPF") || OpcodeName.startswith("PUSHF") ||
       OpcodeName.startswith("ADJCALLSTACK"))
     return llvm::make_error<BenchmarkFailure>(
         "unsupported opcode: Push/Pop/AdjCallStack");
+  const bool ValidMemoryOperands = llvm::all_of(
+      getMemoryOperandRanges(Instr.Operands), MemoryOperandRange::isValid);
+  if (!ValidMemoryOperands)
+    return llvm::make_error<BenchmarkFailure>(
+        "unsupported opcode: non uniform memory access");
+  // We do not handle instructions with OPERAND_PCREL.
+  for (const Operand &Op : Instr.Operands)
+    if (Op.isExplicit() &&
+        Op.getExplicitOperandInfo().OperandType == llvm::MCOI::OPERAND_PCREL)
+      return llvm::make_error<BenchmarkFailure>(
+          "unsupported opcode: PC relative operand");
   // We do not handle second-form X87 instructions. We only handle first-form
   // ones (_Fp), see comment in X86InstrFPStack.td.
   for (const Operand &Op : Instr.Operands)
@@ -281,31 +343,8 @@ class ExegesisX86Target : public ExegesisTarget {
                           unsigned Offset) const override {
     // FIXME: For instructions that read AND write to memory, we use the same
     // value for input and output.
-    for (size_t I = 0, E = IT.Instr.Operands.size(); I < E; ++I) {
-      const Operand *Op = &IT.Instr.Operands[I];
-      if (Op->isExplicit() && Op->isMemory()) {
-        // Case 1: 5-op memory.
-        assert((I + 5 <= E) && "x86 memory references are always 5 ops");
-        IT.getValueFor(*Op) = llvm::MCOperand::createReg(Reg); // BaseReg
-        Op = &IT.Instr.Operands[++I];
-        assert(Op->isMemory());
-        assert(Op->isExplicit());
-        IT.getValueFor(*Op) = llvm::MCOperand::createImm(1); // ScaleAmt
-        Op = &IT.Instr.Operands[++I];
-        assert(Op->isMemory());
-        assert(Op->isExplicit());
-        IT.getValueFor(*Op) = llvm::MCOperand::createReg(0); // IndexReg
-        Op = &IT.Instr.Operands[++I];
-        assert(Op->isMemory());
-        assert(Op->isExplicit());
-        IT.getValueFor(*Op) = llvm::MCOperand::createImm(Offset); // Disp
-        Op = &IT.Instr.Operands[++I];
-        assert(Op->isMemory());
-        assert(Op->isExplicit());
-        IT.getValueFor(*Op) = llvm::MCOperand::createReg(0); // Segment
-        // Case2: segment:index addressing. We assume that ES is 0.
-      }
-    }
+    for (auto &MemoryRange : getMemoryOperandRanges(IT.Instr.Operands))
+      MemoryRange.fillOrDie(IT, Reg, Offset);
   }
 
   std::vector<llvm::MCInst> setRegTo(const llvm::MCSubtargetInfo &STI,
-- 
GitLab


From 66e9f9ca3b56428648bd444b787419658194da01 Mon Sep 17 00:00:00 2001
From: Guillaume Chatelet <gchatelet@google.com>
Date: Mon, 22 Oct 2018 14:55:43 +0000
Subject: [PATCH 0380/1116] [llvm-exegesis] Mark x86 segment register
 instructions as unsupported.

Reviewers: courbet

Subscribers: tschuett, llvm-commits

Differential Revision: https://reviews.llvm.org/D53499

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344906 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-exegesis/lib/X86/Target.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tools/llvm-exegesis/lib/X86/Target.cpp b/tools/llvm-exegesis/lib/X86/Target.cpp
index b7548f8f3c3..db1a23b74cc 100644
--- a/tools/llvm-exegesis/lib/X86/Target.cpp
+++ b/tools/llvm-exegesis/lib/X86/Target.cpp
@@ -89,6 +89,12 @@ static llvm::Error IsInvalidOpcode(const Instruction &Instr) {
         Op.getExplicitOperandInfo().OperandType == llvm::MCOI::OPERAND_PCREL)
       return llvm::make_error<BenchmarkFailure>(
           "unsupported opcode: PC relative operand");
+  for (const Operand &Op : Instr.Operands)
+    if (Op.isReg() && Op.isExplicit() &&
+        Op.getExplicitOperandInfo().RegClass ==
+            llvm::X86::SEGMENT_REGRegClassID)
+      return llvm::make_error<BenchmarkFailure>(
+          "unsupported opcode: access segment memory");
   // We do not handle second-form X87 instructions. We only handle first-form
   // ones (_Fp), see comment in X86InstrFPStack.td.
   for (const Operand &Op : Instr.Operands)
-- 
GitLab


From 502027b7197108eccb6d0ee6190eabd93a609b8e Mon Sep 17 00:00:00 2001
From: Guillaume Chatelet <gchatelet@google.com>
Date: Mon, 22 Oct 2018 15:06:10 +0000
Subject: [PATCH 0381/1116] [llvm-exegesis] Crash when assembling invalid
 Operand

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344907 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-exegesis/lib/Assembler.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/llvm-exegesis/lib/Assembler.cpp b/tools/llvm-exegesis/lib/Assembler.cpp
index 527b10146a3..1576cfe58e0 100644
--- a/tools/llvm-exegesis/lib/Assembler.cpp
+++ b/tools/llvm-exegesis/lib/Assembler.cpp
@@ -110,6 +110,8 @@ static void fillMachineFunction(llvm::MachineFunction &MF,
         Builder.addReg(Op.getReg(), Flags);
       } else if (Op.isImm()) {
         Builder.addImm(Op.getImm());
+      } else if (!Op.isValid()) {
+        llvm_unreachable("Operand is not set");
       } else {
         llvm_unreachable("Not yet implemented");
       }
-- 
GitLab


From affca96420598449b96500ce46e31420ec70b5b5 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Mon, 22 Oct 2018 15:26:27 +0000
Subject: [PATCH 0382/1116] [InstCombine] add tests for shuffle+insert folds;
 NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344908 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstCombine/insert-extract-shuffle.ll     | 123 ++++++++++++++++++
 1 file changed, 123 insertions(+)

diff --git a/test/Transforms/InstCombine/insert-extract-shuffle.ll b/test/Transforms/InstCombine/insert-extract-shuffle.ll
index 2a0b9be218a..c76d88a8e9e 100644
--- a/test/Transforms/InstCombine/insert-extract-shuffle.ll
+++ b/test/Transforms/InstCombine/insert-extract-shuffle.ll
@@ -303,3 +303,126 @@ define <4 x float> @collectShuffleElts(<2 x float> %x, float %y) {
   ret <4 x float> %v3
 }
 
+; TODO: Simplest case - insert scalar into undef, then shuffle that value in place into another vector.
+
+define <4 x float> @insert_shuffle(float %x, <4 x float> %y) {
+; CHECK-LABEL: @insert_shuffle(
+; CHECK-NEXT:    [[XV:%.*]] = insertelement <4 x float> undef, float [[X:%.*]], i32 0
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[XV]], <4 x float> [[Y:%.*]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %xv = insertelement <4 x float> undef, float %x, i32 0
+  %r = shufflevector <4 x float> %xv, <4 x float> %y, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %r
+}
+
+; TODO: Insert scalar into some element of a dummy vector, then move it to a different element in another vector.
+
+define <4 x float> @insert_shuffle_translate(float %x, <4 x float> %y) {
+; CHECK-LABEL: @insert_shuffle_translate(
+; CHECK-NEXT:    [[XV:%.*]] = insertelement <4 x float> undef, float [[X:%.*]], i32 0
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[XV]], <4 x float> [[Y:%.*]], <4 x i32> <i32 4, i32 0, i32 6, i32 7>
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %xv = insertelement <4 x float> undef, float %x, i32 0
+  %r = shufflevector <4 x float> %xv, <4 x float> %y, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
+  ret <4 x float> %r
+}
+
+; TODO: The vector operand of the insert is irrelevant.
+
+define <4 x float> @insert_not_undef_shuffle_translate(float %x, <4 x float> %y, <4 x float> %q) {
+; CHECK-LABEL: @insert_not_undef_shuffle_translate(
+; CHECK-NEXT:    [[XV:%.*]] = insertelement <4 x float> undef, float [[X:%.*]], i32 3
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[XV]], <4 x float> [[Y:%.*]], <4 x i32> <i32 4, i32 5, i32 3, i32 7>
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %xv = insertelement <4 x float> %q, float %x, i32 3
+  %r = shufflevector <4 x float> %xv, <4 x float> %y, <4 x i32> <i32 4, i32 5, i32 3, i32 7>
+  ret <4 x float> %r
+}
+
+; TODO: The insert may be the 2nd operand of the shuffle. The shuffle mask can include undef elements.
+
+define <4 x float> @insert_not_undef_shuffle_translate_commute(float %x, <4 x float> %y, <4 x float> %q) {
+; CHECK-LABEL: @insert_not_undef_shuffle_translate_commute(
+; CHECK-NEXT:    [[XV:%.*]] = insertelement <4 x float> undef, float [[X:%.*]], i32 2
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[XV]], <4 x i32> <i32 0, i32 6, i32 2, i32 undef>
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %xv = insertelement <4 x float> %q, float %x, i32 2
+  %r = shufflevector <4 x float> %y, <4 x float> %xv, <4 x i32> <i32 0, i32 6, i32 2, i32 undef>
+  ret <4 x float> %r
+}
+
+; TODO: Both shuffle operands may be inserts - choose the correct side.
+
+define <4 x float> @insert_insert_shuffle_translate(float %x1, float %x2, <4 x float> %q) {
+; CHECK-LABEL: @insert_insert_shuffle_translate(
+; CHECK-NEXT:    [[XV1:%.*]] = insertelement <4 x float> undef, float [[X1:%.*]], i32 0
+; CHECK-NEXT:    [[XV2:%.*]] = insertelement <4 x float> [[Q:%.*]], float [[X2:%.*]], i32 2
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[XV1]], <4 x float> [[XV2]], <4 x i32> <i32 4, i32 0, i32 6, i32 7>
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %xv1 = insertelement <4 x float> %q, float %x1, i32 0
+  %xv2 = insertelement <4 x float> %q, float %x2, i32 2
+  %r = shufflevector <4 x float> %xv1, <4 x float> %xv2, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
+  ret <4 x float> %r
+}
+
+; TODO: Both shuffle operands may be inserts - choose the correct side.
+
+define <4 x float> @insert_insert_shuffle_translate_commute(float %x1, float %x2, <4 x float> %q) {
+; CHECK-LABEL: @insert_insert_shuffle_translate_commute(
+; CHECK-NEXT:    [[XV1:%.*]] = insertelement <4 x float> [[Q:%.*]], float [[X1:%.*]], i32 0
+; CHECK-NEXT:    [[XV2:%.*]] = insertelement <4 x float> undef, float [[X2:%.*]], i32 2
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[XV1]], <4 x float> [[XV2]], <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %xv1 = insertelement <4 x float> %q, float %x1, i32 0
+  %xv2 = insertelement <4 x float> %q, float %x2, i32 2
+  %r = shufflevector <4 x float> %xv1, <4 x float> %xv2, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+  ret <4 x float> %r
+}
+
+define <4 x float> @insert_insert_shuffle_translate_wrong_mask(float %x1, float %x2, <4 x float> %q) {
+; CHECK-LABEL: @insert_insert_shuffle_translate_wrong_mask(
+; CHECK-NEXT:    [[XV1:%.*]] = insertelement <4 x float> [[Q:%.*]], float [[X1:%.*]], i32 0
+; CHECK-NEXT:    [[XV2:%.*]] = insertelement <4 x float> [[Q]], float [[X2:%.*]], i32 2
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[XV1]], <4 x float> [[XV2]], <4 x i32> <i32 0, i32 6, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %xv1 = insertelement <4 x float> %q, float %x1, i32 0
+  %xv2 = insertelement <4 x float> %q, float %x2, i32 2
+  %r = shufflevector <4 x float> %xv1, <4 x float> %xv2, <4 x i32> <i32 0, i32 6, i32 2, i32 7>
+  ret <4 x float> %r
+}
+
+; TODO: The insert may have other uses.
+
+declare void @use(<4 x float>)
+
+define <4 x float> @insert_not_undef_shuffle_translate_commute_uses(float %x, <4 x float> %y, <4 x float> %q) {
+; CHECK-LABEL: @insert_not_undef_shuffle_translate_commute_uses(
+; CHECK-NEXT:    [[XV:%.*]] = insertelement <4 x float> [[Q:%.*]], float [[X:%.*]], i32 2
+; CHECK-NEXT:    call void @use(<4 x float> [[XV]])
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[XV]], <4 x i32> <i32 6, i32 undef, i32 2, i32 3>
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %xv = insertelement <4 x float> %q, float %x, i32 2
+  call void @use(<4 x float> %xv)
+  %r = shufflevector <4 x float> %y, <4 x float> %xv, <4 x i32> <i32 6, i32 undef, i32 2, i32 3>
+  ret <4 x float> %r
+}
+
+define <5 x float> @insert_not_undef_shuffle_translate_commute_lengthen(float %x, <4 x float> %y, <4 x float> %q) {
+; CHECK-LABEL: @insert_not_undef_shuffle_translate_commute_lengthen(
+; CHECK-NEXT:    [[XV:%.*]] = insertelement <4 x float> undef, float [[X:%.*]], i32 2
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[XV]], <5 x i32> <i32 0, i32 6, i32 2, i32 undef, i32 undef>
+; CHECK-NEXT:    ret <5 x float> [[R]]
+;
+  %xv = insertelement <4 x float> %q, float %x, i32 2
+  %r = shufflevector <4 x float> %y, <4 x float> %xv, <5 x i32> <i32 0, i32 6, i32 2, i32 undef, i32 undef>
+  ret <5 x float> %r
+}
+
-- 
GitLab


From 466ce67d6ec444962e5cc0136243c16a453190c0 Mon Sep 17 00:00:00 2001
From: Aleksandr Urakov <aleksandr.urakov@jetbrains.com>
Date: Mon, 22 Oct 2018 15:30:48 +0000
Subject: [PATCH 0383/1116] Revert "[PDB] Extend IPDBSession's interface to
 retrieve frame data"

This reverts commit b5c7e2f9a4dbb34e3667c4bb4972735eadd3247a.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344909 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../llvm/DebugInfo/PDB/DIA/DIAEnumFrameData.h | 40 --------------
 include/llvm/DebugInfo/PDB/DIA/DIAFrameData.h | 41 --------------
 include/llvm/DebugInfo/PDB/DIA/DIASession.h   |  1 -
 include/llvm/DebugInfo/PDB/IPDBFrameData.h    | 36 -------------
 include/llvm/DebugInfo/PDB/IPDBSession.h      |  3 --
 .../llvm/DebugInfo/PDB/Native/NativeSession.h |  2 -
 include/llvm/DebugInfo/PDB/PDBTypes.h         |  2 -
 lib/DebugInfo/PDB/CMakeLists.txt              |  2 -
 lib/DebugInfo/PDB/DIA/DIAEnumFrameData.cpp    | 43 ---------------
 lib/DebugInfo/PDB/DIA/DIAFrameData.cpp        | 54 -------------------
 lib/DebugInfo/PDB/DIA/DIASession.cpp          | 11 ----
 lib/DebugInfo/PDB/Native/NativeSession.cpp    |  5 --
 lib/DebugInfo/PDB/PDBInterfaceAnchors.cpp     |  3 --
 unittests/DebugInfo/PDB/PDBApiTest.cpp        |  4 --
 14 files changed, 247 deletions(-)
 delete mode 100644 include/llvm/DebugInfo/PDB/DIA/DIAEnumFrameData.h
 delete mode 100644 include/llvm/DebugInfo/PDB/DIA/DIAFrameData.h
 delete mode 100644 include/llvm/DebugInfo/PDB/IPDBFrameData.h
 delete mode 100644 lib/DebugInfo/PDB/DIA/DIAEnumFrameData.cpp
 delete mode 100644 lib/DebugInfo/PDB/DIA/DIAFrameData.cpp

diff --git a/include/llvm/DebugInfo/PDB/DIA/DIAEnumFrameData.h b/include/llvm/DebugInfo/PDB/DIA/DIAEnumFrameData.h
deleted file mode 100644
index e17ba2ce59b..00000000000
--- a/include/llvm/DebugInfo/PDB/DIA/DIAEnumFrameData.h
+++ /dev/null
@@ -1,40 +0,0 @@
-//==- DIAEnumFrameData.h --------------------------------------- -*- C++ -*-==//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_DEBUGINFO_PDB_DIA_DIAENUMFRAMEDATA_H
-#define LLVM_DEBUGINFO_PDB_DIA_DIAENUMFRAMEDATA_H
-
-#include "DIASupport.h"
-#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
-#include "llvm/DebugInfo/PDB/IPDBFrameData.h"
-
-namespace llvm {
-namespace pdb {
-
-class DIASession;
-
-class DIAEnumFrameData : public IPDBEnumChildren<IPDBFrameData> {
-public:
-  explicit DIAEnumFrameData(const DIASession &PDBSession,
-                            CComPtr<IDiaEnumFrameData> DiaEnumerator);
-
-  uint32_t getChildCount() const override;
-  ChildTypePtr getChildAtIndex(uint32_t Index) const override;
-  ChildTypePtr getNext() override;
-  void reset() override;
-
-private:
-  const DIASession &Session;
-  CComPtr<IDiaEnumFrameData> Enumerator;
-};
-
-} // namespace pdb
-} // namespace llvm
-
-#endif
diff --git a/include/llvm/DebugInfo/PDB/DIA/DIAFrameData.h b/include/llvm/DebugInfo/PDB/DIA/DIAFrameData.h
deleted file mode 100644
index 7564c3b7a5a..00000000000
--- a/include/llvm/DebugInfo/PDB/DIA/DIAFrameData.h
+++ /dev/null
@@ -1,41 +0,0 @@
-//===- DIAFrameData.h - DIA Impl. of IPDBFrameData ---------------- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_DEBUGINFO_PDB_DIA_DIAFRAMEDATA_H
-#define LLVM_DEBUGINFO_PDB_DIA_DIAFRAMEDATA_H
-
-#include "DIASupport.h"
-#include "llvm/DebugInfo/PDB/IPDBFrameData.h"
-
-namespace llvm {
-namespace pdb {
-
-class DIASession;
-
-class DIAFrameData : public IPDBFrameData {
-public:
-  explicit DIAFrameData(const DIASession &PDBSession,
-                        CComPtr<IDiaFrameData> DiaFrameData);
-
-  uint32_t getAddressOffset() const override;
-  uint32_t getAddressSection() const override;
-  uint32_t getLengthBlock() const override;
-  std::string getProgram() const override;
-  uint32_t getRelativeVirtualAddress() const override;
-  uint64_t getVirtualAddress() const override;
-
-private:
-  const DIASession &Session;
-  CComPtr<IDiaFrameData> FrameData;
-};
-
-} // namespace pdb
-} // namespace llvm
-
-#endif
diff --git a/include/llvm/DebugInfo/PDB/DIA/DIASession.h b/include/llvm/DebugInfo/PDB/DIA/DIASession.h
index 592e061a8d8..e355605c296 100644
--- a/include/llvm/DebugInfo/PDB/DIA/DIASession.h
+++ b/include/llvm/DebugInfo/PDB/DIA/DIASession.h
@@ -85,7 +85,6 @@ public:
 
   std::unique_ptr<IPDBEnumSectionContribs> getSectionContribs() const override;
 
-  std::unique_ptr<IPDBEnumFrameData> getFrameData() const override;
 private:
   CComPtr<IDiaSession> Session;
 };
diff --git a/include/llvm/DebugInfo/PDB/IPDBFrameData.h b/include/llvm/DebugInfo/PDB/IPDBFrameData.h
deleted file mode 100644
index 74679215b88..00000000000
--- a/include/llvm/DebugInfo/PDB/IPDBFrameData.h
+++ /dev/null
@@ -1,36 +0,0 @@
-//===- IPDBFrameData.h - base interface for frame data ----------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_DEBUGINFO_PDB_IPDBFRAMEDATA_H
-#define LLVM_DEBUGINFO_PDB_IPDBFRAMEDATA_H
-
-#include <cstdint>
-#include <string>
-
-namespace llvm {
-namespace pdb {
-
-/// IPDBFrameData defines an interface used to represent a frame data of some
-/// code block.
-class IPDBFrameData {
-public:
-  virtual ~IPDBFrameData();
-
-  virtual uint32_t getAddressOffset() const = 0;
-  virtual uint32_t getAddressSection() const = 0;
-  virtual uint32_t getLengthBlock() const = 0;
-  virtual std::string getProgram() const = 0;
-  virtual uint32_t getRelativeVirtualAddress() const = 0;
-  virtual uint64_t getVirtualAddress() const = 0;
-};
-
-} // namespace pdb
-} // namespace llvm
-
-#endif
diff --git a/include/llvm/DebugInfo/PDB/IPDBSession.h b/include/llvm/DebugInfo/PDB/IPDBSession.h
index 88fd02c0a34..24573cdb779 100644
--- a/include/llvm/DebugInfo/PDB/IPDBSession.h
+++ b/include/llvm/DebugInfo/PDB/IPDBSession.h
@@ -91,9 +91,6 @@ public:
 
   virtual std::unique_ptr<IPDBEnumSectionContribs>
   getSectionContribs() const = 0;
-
-  virtual std::unique_ptr<IPDBEnumFrameData>
-  getFrameData() const = 0;
 };
 } // namespace pdb
 } // namespace llvm
diff --git a/include/llvm/DebugInfo/PDB/Native/NativeSession.h b/include/llvm/DebugInfo/PDB/Native/NativeSession.h
index 4878e47d312..07ce85ef820 100644
--- a/include/llvm/DebugInfo/PDB/Native/NativeSession.h
+++ b/include/llvm/DebugInfo/PDB/Native/NativeSession.h
@@ -93,8 +93,6 @@ public:
 
   std::unique_ptr<IPDBEnumSectionContribs> getSectionContribs() const override;
 
-  std::unique_ptr<IPDBEnumFrameData> getFrameData() const override;
-
   PDBFile &getPDBFile() { return *Pdb; }
   const PDBFile &getPDBFile() const { return *Pdb; }
 
diff --git a/include/llvm/DebugInfo/PDB/PDBTypes.h b/include/llvm/DebugInfo/PDB/PDBTypes.h
index 917f3ed7391..6247018ce0f 100644
--- a/include/llvm/DebugInfo/PDB/PDBTypes.h
+++ b/include/llvm/DebugInfo/PDB/PDBTypes.h
@@ -12,7 +12,6 @@
 
 #include "llvm/DebugInfo/CodeView/CodeView.h"
 #include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
-#include "llvm/DebugInfo/PDB/IPDBFrameData.h"
 #include "llvm/DebugInfo/PDB/Native/RawTypes.h"
 #include <cctype>
 #include <cstddef>
@@ -72,7 +71,6 @@ using IPDBEnumLineNumbers = IPDBEnumChildren<IPDBLineNumber>;
 using IPDBEnumTables = IPDBEnumChildren<IPDBTable>;
 using IPDBEnumInjectedSources = IPDBEnumChildren<IPDBInjectedSource>;
 using IPDBEnumSectionContribs = IPDBEnumChildren<IPDBSectionContrib>;
-using IPDBEnumFrameData = IPDBEnumChildren<IPDBFrameData>;
 
 /// Specifies which PDB reader implementation is to be used.  Only a value
 /// of PDB_ReaderType::DIA is currently supported, but Native is in the works.
diff --git a/lib/DebugInfo/PDB/CMakeLists.txt b/lib/DebugInfo/PDB/CMakeLists.txt
index d9d379f6d09..86dcfdaa163 100644
--- a/lib/DebugInfo/PDB/CMakeLists.txt
+++ b/lib/DebugInfo/PDB/CMakeLists.txt
@@ -14,7 +14,6 @@ if(LLVM_ENABLE_DIA_SDK)
   add_pdb_impl_folder(DIA
     DIA/DIADataStream.cpp
     DIA/DIAEnumDebugStreams.cpp
-    DIA/DIAEnumFrameData.cpp
     DIA/DIAEnumInjectedSources.cpp
     DIA/DIAEnumLineNumbers.cpp
     DIA/DIAEnumSectionContribs.cpp
@@ -22,7 +21,6 @@ if(LLVM_ENABLE_DIA_SDK)
     DIA/DIAEnumSymbols.cpp
     DIA/DIAEnumTables.cpp
     DIA/DIAError.cpp
-    DIA/DIAFrameData.cpp
     DIA/DIAInjectedSource.cpp
     DIA/DIALineNumber.cpp
     DIA/DIARawSymbol.cpp
diff --git a/lib/DebugInfo/PDB/DIA/DIAEnumFrameData.cpp b/lib/DebugInfo/PDB/DIA/DIAEnumFrameData.cpp
deleted file mode 100644
index 77514483e04..00000000000
--- a/lib/DebugInfo/PDB/DIA/DIAEnumFrameData.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-//==- DIAEnumFrameData.cpp ---------------------------------------*- C++ -*-==//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/DebugInfo/PDB/DIA/DIAEnumFrameData.h"
-#include "llvm/DebugInfo/PDB/DIA/DIAFrameData.h"
-#include "llvm/DebugInfo/PDB/DIA/DIASession.h"
-
-using namespace llvm::pdb;
-
-DIAEnumFrameData::DIAEnumFrameData(const DIASession &PDBSession,
-                                   CComPtr<IDiaEnumFrameData> DiaEnumerator)
-    : Session(PDBSession), Enumerator(DiaEnumerator) {}
-
-uint32_t DIAEnumFrameData::getChildCount() const {
-  LONG Count = 0;
-  return (S_OK == Enumerator->get_Count(&Count)) ? Count : 0;
-}
-
-std::unique_ptr<IPDBFrameData>
-DIAEnumFrameData::getChildAtIndex(uint32_t Index) const {
-  CComPtr<IDiaFrameData> Item;
-  if (S_OK != Enumerator->Item(Index, &Item))
-    return nullptr;
-
-  return std::unique_ptr<IPDBFrameData>(new DIAFrameData(Session, Item));
-}
-
-std::unique_ptr<IPDBFrameData> DIAEnumFrameData::getNext() {
-  CComPtr<IDiaFrameData> Item;
-  ULONG NumFetched = 0;
-  if (S_OK != Enumerator->Next(1, &Item, &NumFetched))
-    return nullptr;
-
-  return std::unique_ptr<IPDBFrameData>(new DIAFrameData(Session, Item));
-}
-
-void DIAEnumFrameData::reset() { Enumerator->Reset(); }
diff --git a/lib/DebugInfo/PDB/DIA/DIAFrameData.cpp b/lib/DebugInfo/PDB/DIA/DIAFrameData.cpp
deleted file mode 100644
index b904a2ff60a..00000000000
--- a/lib/DebugInfo/PDB/DIA/DIAFrameData.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-//===- DIAFrameData.cpp - DIA impl. of IPDBFrameData -------------- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/DebugInfo/PDB/DIA/DIAFrameData.h"
-#include "llvm/DebugInfo/PDB/DIA/DIASession.h"
-#include "llvm/DebugInfo/PDB/DIA/DIAUtils.h"
-
-using namespace llvm::pdb;
-
-DIAFrameData::DIAFrameData(const DIASession &PDBSession,
-                           CComPtr<IDiaFrameData> DiaFrameData)
-    : Session(PDBSession), FrameData(DiaFrameData) {}
-
-template <typename ArgType>
-ArgType
-PrivateGetDIAValue(IDiaFrameData *FrameData,
-                   HRESULT (__stdcall IDiaFrameData::*Method)(ArgType *)) {
-  ArgType Value;
-  if (S_OK == (FrameData->*Method)(&Value))
-    return static_cast<ArgType>(Value);
-
-  return ArgType();
-}
-
-uint32_t DIAFrameData::getAddressOffset() const {
-  return PrivateGetDIAValue(FrameData, &IDiaFrameData::get_addressOffset);
-}
-
-uint32_t DIAFrameData::getAddressSection() const {
-  return PrivateGetDIAValue(FrameData, &IDiaFrameData::get_addressSection);
-}
-
-uint32_t DIAFrameData::getLengthBlock() const {
-  return PrivateGetDIAValue(FrameData, &IDiaFrameData::get_lengthBlock);
-}
-
-std::string DIAFrameData::getProgram() const {
-  return invokeBstrMethod(*FrameData, &IDiaFrameData::get_program);
-}
-
-uint32_t DIAFrameData::getRelativeVirtualAddress() const {
-  return PrivateGetDIAValue(FrameData,
-                            &IDiaFrameData::get_relativeVirtualAddress);
-}
-
-uint64_t DIAFrameData::getVirtualAddress() const {
-  return PrivateGetDIAValue(FrameData, &IDiaFrameData::get_virtualAddress);
-}
diff --git a/lib/DebugInfo/PDB/DIA/DIASession.cpp b/lib/DebugInfo/PDB/DIA/DIASession.cpp
index b89ca9a858f..7726fe13264 100644
--- a/lib/DebugInfo/PDB/DIA/DIASession.cpp
+++ b/lib/DebugInfo/PDB/DIA/DIASession.cpp
@@ -9,7 +9,6 @@
 #include "llvm/DebugInfo/PDB/DIA/DIASession.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/DebugInfo/PDB/DIA/DIAEnumDebugStreams.h"
-#include "llvm/DebugInfo/PDB/DIA/DIAEnumFrameData.h"
 #include "llvm/DebugInfo/PDB/DIA/DIAEnumInjectedSources.h"
 #include "llvm/DebugInfo/PDB/DIA/DIAEnumLineNumbers.h"
 #include "llvm/DebugInfo/PDB/DIA/DIAEnumSectionContribs.h"
@@ -420,13 +419,3 @@ DIASession::getSectionContribs() const {
 
   return llvm::make_unique<DIAEnumSectionContribs>(*this, Sections);
 }
-
-std::unique_ptr<IPDBEnumFrameData>
-DIASession::getFrameData() const {
-  CComPtr<IDiaEnumFrameData> FD =
-      getTableEnumerator<IDiaEnumFrameData>(*Session);
-  if (!FD)
-    return nullptr;
-
-  return llvm::make_unique<DIAEnumFrameData>(*this, FD);
-}
diff --git a/lib/DebugInfo/PDB/Native/NativeSession.cpp b/lib/DebugInfo/PDB/Native/NativeSession.cpp
index 7807e312365..baab0a2399c 100644
--- a/lib/DebugInfo/PDB/Native/NativeSession.cpp
+++ b/lib/DebugInfo/PDB/Native/NativeSession.cpp
@@ -200,11 +200,6 @@ NativeSession::getSectionContribs() const {
   return nullptr;
 }
 
-std::unique_ptr<IPDBEnumFrameData>
-NativeSession::getFrameData() const {
-  return nullptr;
-}
-
 void NativeSession::initializeExeSymbol() {
   if (ExeSymbol == 0)
     ExeSymbol = Cache.createSymbol<NativeExeSymbol>();
diff --git a/lib/DebugInfo/PDB/PDBInterfaceAnchors.cpp b/lib/DebugInfo/PDB/PDBInterfaceAnchors.cpp
index 951909295d1..c62796507a0 100644
--- a/lib/DebugInfo/PDB/PDBInterfaceAnchors.cpp
+++ b/lib/DebugInfo/PDB/PDBInterfaceAnchors.cpp
@@ -12,7 +12,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/DebugInfo/PDB/IPDBDataStream.h"
-#include "llvm/DebugInfo/PDB/IPDBFrameData.h"
 #include "llvm/DebugInfo/PDB/IPDBInjectedSource.h"
 #include "llvm/DebugInfo/PDB/IPDBLineNumber.h"
 #include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
@@ -36,5 +35,3 @@ IPDBTable::~IPDBTable() = default;
 IPDBInjectedSource::~IPDBInjectedSource() = default;
 
 IPDBSectionContrib::~IPDBSectionContrib() = default;
-
-IPDBFrameData::~IPDBFrameData() = default;
diff --git a/unittests/DebugInfo/PDB/PDBApiTest.cpp b/unittests/DebugInfo/PDB/PDBApiTest.cpp
index 007ea904085..948bde1bf72 100644
--- a/unittests/DebugInfo/PDB/PDBApiTest.cpp
+++ b/unittests/DebugInfo/PDB/PDBApiTest.cpp
@@ -159,10 +159,6 @@ class MockSession : public IPDBSession {
   std::unique_ptr<IPDBEnumSectionContribs> getSectionContribs() const override {
     return nullptr;
   }
-
-  std::unique_ptr<IPDBEnumFrameData> getFrameData() const override {
-    return nullptr;
-  }
 };
 
 class MockRawSymbol : public IPDBRawSymbol {
-- 
GitLab


From b7afff3a69abadd191e03837465daf50ab3489fc Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 22 Oct 2018 15:33:30 +0000
Subject: [PATCH 0384/1116] [X86][SSE] getTargetShuffleMask - pull out repeated
 shuffle mask element size. NFCI.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344910 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 51 +++++++++++++-----------------
 1 file changed, 22 insertions(+), 29 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index cdd76e1f03d..51ad0bdf00e 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -5871,6 +5871,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
                                  SmallVectorImpl<SDValue> &Ops,
                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
   unsigned NumElems = VT.getVectorNumElements();
+  unsigned MaskEltSize = VT.getScalarSizeInBits();
   SDValue ImmN;
 
   assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
@@ -5878,26 +5879,26 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
 
   IsUnary = false;
   bool IsFakeUnary = false;
-  switch(N->getOpcode()) {
+  switch (N->getOpcode()) {
   case X86ISD::BLENDI:
     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
-    ImmN = N->getOperand(N->getNumOperands()-1);
+    ImmN = N->getOperand(N->getNumOperands() - 1);
     DecodeBLENDMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
     break;
   case X86ISD::SHUFP:
     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
-    ImmN = N->getOperand(N->getNumOperands()-1);
-    DecodeSHUFPMask(NumElems, VT.getScalarSizeInBits(),
+    ImmN = N->getOperand(N->getNumOperands() - 1);
+    DecodeSHUFPMask(NumElems, MaskEltSize,
                     cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
     break;
   case X86ISD::INSERTPS:
     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
-    ImmN = N->getOperand(N->getNumOperands()-1);
+    ImmN = N->getOperand(N->getNumOperands() - 1);
     DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
     break;
@@ -5907,8 +5908,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
         isa<ConstantSDNode>(N->getOperand(2))) {
       int BitLen = N->getConstantOperandVal(1);
       int BitIdx = N->getConstantOperandVal(2);
-      DecodeEXTRQIMask(NumElems, VT.getScalarSizeInBits(), BitLen, BitIdx,
-                       Mask);
+      DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
       IsUnary = true;
     }
     break;
@@ -5919,21 +5919,20 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
         isa<ConstantSDNode>(N->getOperand(3))) {
       int BitLen = N->getConstantOperandVal(2);
       int BitIdx = N->getConstantOperandVal(3);
-      DecodeINSERTQIMask(NumElems, VT.getScalarSizeInBits(), BitLen, BitIdx,
-                         Mask);
+      DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
       IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
     }
     break;
   case X86ISD::UNPCKH:
     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
-    DecodeUNPCKHMask(NumElems, VT.getScalarSizeInBits(), Mask);
+    DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
     break;
   case X86ISD::UNPCKL:
     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
-    DecodeUNPCKLMask(NumElems, VT.getScalarSizeInBits(), Mask);
+    DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
     break;
   case X86ISD::MOVHLPS:
@@ -5952,7 +5951,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
     assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
-    ImmN = N->getOperand(N->getNumOperands()-1);
+    ImmN = N->getOperand(N->getNumOperands() - 1);
     DecodePALIGNRMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
                       Mask);
     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
@@ -5978,21 +5977,21 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
   case X86ISD::PSHUFD:
   case X86ISD::VPERMILPI:
     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
-    ImmN = N->getOperand(N->getNumOperands()-1);
-    DecodePSHUFMask(NumElems, VT.getScalarSizeInBits(),
+    ImmN = N->getOperand(N->getNumOperands() - 1);
+    DecodePSHUFMask(NumElems, MaskEltSize,
                     cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
     IsUnary = true;
     break;
   case X86ISD::PSHUFHW:
     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
-    ImmN = N->getOperand(N->getNumOperands()-1);
+    ImmN = N->getOperand(N->getNumOperands() - 1);
     DecodePSHUFHWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
                       Mask);
     IsUnary = true;
     break;
   case X86ISD::PSHUFLW:
     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
-    ImmN = N->getOperand(N->getNumOperands()-1);
+    ImmN = N->getOperand(N->getNumOperands() - 1);
     DecodePSHUFLWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
                       Mask);
     IsUnary = true;
@@ -6025,10 +6024,9 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
     IsUnary = true;
     SDValue MaskNode = N->getOperand(1);
-    unsigned MaskEltSize = VT.getScalarSizeInBits();
     SmallVector<uint64_t, 32> RawMask;
     if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
-      DecodeVPERMILPMask(NumElems, VT.getScalarSizeInBits(), RawMask, Mask);
+      DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, Mask);
       break;
     }
     if (auto *C = getTargetConstantFromNode(MaskNode)) {
@@ -6056,7 +6054,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
   }
   case X86ISD::VPERMI:
     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
-    ImmN = N->getOperand(N->getNumOperands()-1);
+    ImmN = N->getOperand(N->getNumOperands() - 1);
     DecodeVPERMMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
     IsUnary = true;
     break;
@@ -6069,7 +6067,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
   case X86ISD::VPERM2X128:
     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
-    ImmN = N->getOperand(N->getNumOperands()-1);
+    ImmN = N->getOperand(N->getNumOperands() - 1);
     DecodeVPERM2X128Mask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
                          Mask);
     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
@@ -6077,10 +6075,9 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
   case X86ISD::SHUF128:
     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
-    ImmN = N->getOperand(N->getNumOperands()-1);
-    decodeVSHUF64x2FamilyMask(NumElems, VT.getScalarSizeInBits(),
-                              cast<ConstantSDNode>(ImmN)->getZExtValue(),
-                              Mask);
+    ImmN = N->getOperand(N->getNumOperands() - 1);
+    decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize,
+                              cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
     break;
   case X86ISD::MOVSLDUP:
@@ -6102,15 +6099,13 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
-    unsigned MaskEltSize = VT.getScalarSizeInBits();
     SDValue MaskNode = N->getOperand(2);
     SDValue CtrlNode = N->getOperand(3);
     if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
       unsigned CtrlImm = CtrlOp->getZExtValue();
       SmallVector<uint64_t, 32> RawMask;
       if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
-        DecodeVPERMIL2PMask(NumElems, VT.getScalarSizeInBits(), CtrlImm,
-                            RawMask, Mask);
+        DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, Mask);
         break;
       }
       if (auto *C = getTargetConstantFromNode(MaskNode)) {
@@ -6143,7 +6138,6 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
     Ops.push_back(N->getOperand(1));
     SDValue MaskNode = N->getOperand(0);
     SmallVector<uint64_t, 32> RawMask;
-    unsigned MaskEltSize = VT.getScalarSizeInBits();
     if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
       DecodeVPERMVMask(RawMask, Mask);
       break;
@@ -6162,7 +6156,6 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
     Ops.push_back(N->getOperand(0));
     Ops.push_back(N->getOperand(2));
     SDValue MaskNode = N->getOperand(1);
-    unsigned MaskEltSize = VT.getScalarSizeInBits();
     if (auto *C = getTargetConstantFromNode(MaskNode)) {
       DecodeVPERMV3Mask(C, MaskEltSize, VT.getSizeInBits(), Mask);
       break;
-- 
GitLab


From 2ec3239eb109c3713751cdefeb05d0d660237160 Mon Sep 17 00:00:00 2001
From: Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>
Date: Mon, 22 Oct 2018 15:36:15 +0000
Subject: [PATCH 0385/1116] [llvm-mca] Use llvm::ArrayRef in class SourceMgr.
 NFCI

Class SourceMgr now uses type ArrayRef<MCInst> to reference the
sequence of code from a "CodeRegion".


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344911 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-mca/CodeRegion.cpp            |  6 +++---
 tools/llvm-mca/CodeRegion.h              | 24 ++++++++++--------------
 tools/llvm-mca/include/SourceMgr.h       | 14 +++++++-------
 tools/llvm-mca/lib/Stages/FetchStage.cpp |  2 +-
 tools/llvm-mca/llvm-mca.cpp              | 18 ++++++++++--------
 5 files changed, 31 insertions(+), 33 deletions(-)

diff --git a/tools/llvm-mca/CodeRegion.cpp b/tools/llvm-mca/CodeRegion.cpp
index 89686599650..c26658a6cf5 100644
--- a/tools/llvm-mca/CodeRegion.cpp
+++ b/tools/llvm-mca/CodeRegion.cpp
@@ -52,15 +52,15 @@ void CodeRegions::endRegion(SMLoc Loc) {
   CurrentRegion.setEndLocation(Loc);
 }
 
-void CodeRegions::addInstruction(std::unique_ptr<const MCInst> Instruction) {
-  const SMLoc &Loc = Instruction->getLoc();
+void CodeRegions::addInstruction(const MCInst &Instruction) {
+  const SMLoc &Loc = Instruction.getLoc();
   const auto It =
       std::find_if(Regions.rbegin(), Regions.rend(),
                    [Loc](const std::unique_ptr<CodeRegion> &Region) {
                      return Region->isLocInRange(Loc);
                    });
   if (It != Regions.rend())
-    (*It)->addInstruction(std::move(Instruction));
+    (*It)->addInstruction(Instruction);
 }
 
 } // namespace mca
diff --git a/tools/llvm-mca/CodeRegion.h b/tools/llvm-mca/CodeRegion.h
index 7f0025e4884..21ca8da9b53 100644
--- a/tools/llvm-mca/CodeRegion.h
+++ b/tools/llvm-mca/CodeRegion.h
@@ -34,6 +34,7 @@
 #ifndef LLVM_TOOLS_LLVM_MCA_CODEREGION_H
 #define LLVM_TOOLS_LLVM_MCA_CODEREGION_H
 
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/Support/SMLoc.h"
@@ -49,7 +50,7 @@ class CodeRegion {
   // An optional descriptor for this region.
   llvm::StringRef Description;
   // Instructions that form this region.
-  std::vector<std::unique_ptr<const llvm::MCInst>> Instructions;
+  std::vector<llvm::MCInst> Instructions;
   // Source location range.
   llvm::SMLoc RangeStart;
   llvm::SMLoc RangeEnd;
@@ -61,8 +62,8 @@ public:
   CodeRegion(llvm::StringRef Desc, llvm::SMLoc Start)
       : Description(Desc), RangeStart(Start), RangeEnd() {}
 
-  void addInstruction(std::unique_ptr<const llvm::MCInst> Instruction) {
-    Instructions.emplace_back(std::move(Instruction));
+  void addInstruction(const llvm::MCInst &Instruction) {
+    Instructions.emplace_back(Instruction);
   }
 
   llvm::SMLoc startLoc() const { return RangeStart; }
@@ -72,10 +73,7 @@ public:
   bool empty() const { return Instructions.empty(); }
   bool isLocInRange(llvm::SMLoc Loc) const;
 
-  const std::vector<std::unique_ptr<const llvm::MCInst>> &
-  getInstructions() const {
-    return Instructions;
-  }
+  llvm::ArrayRef<llvm::MCInst> getInstructions() const { return Instructions; }
 
   llvm::StringRef getDescription() const { return Description; }
 };
@@ -106,23 +104,21 @@ public:
 
   void beginRegion(llvm::StringRef Description, llvm::SMLoc Loc);
   void endRegion(llvm::SMLoc Loc);
-  void addInstruction(std::unique_ptr<const llvm::MCInst> Instruction);
+  void addInstruction(const llvm::MCInst &Instruction);
 
   CodeRegions(llvm::SourceMgr &S) : SM(S) {
     // Create a default region for the input code sequence.
     addRegion("Default", llvm::SMLoc());
   }
 
-  const std::vector<std::unique_ptr<const llvm::MCInst>> &
-  getInstructionSequence(unsigned Idx) const {
+  llvm::ArrayRef<llvm::MCInst> getInstructionSequence(unsigned Idx) const {
     return Regions[Idx]->getInstructions();
   }
 
   bool empty() const {
-    return std::all_of(Regions.begin(), Regions.end(),
-                       [](const std::unique_ptr<CodeRegion> &Region) {
-                         return Region->empty();
-                       });
+    return llvm::all_of(Regions, [](const std::unique_ptr<CodeRegion> &Region) {
+      return Region->empty();
+    });
   }
 };
 
diff --git a/tools/llvm-mca/include/SourceMgr.h b/tools/llvm-mca/include/SourceMgr.h
index 573ca7a9a00..89412836360 100644
--- a/tools/llvm-mca/include/SourceMgr.h
+++ b/tools/llvm-mca/include/SourceMgr.h
@@ -16,29 +16,29 @@
 #ifndef LLVM_TOOLS_LLVM_MCA_SOURCEMGR_H
 #define LLVM_TOOLS_LLVM_MCA_SOURCEMGR_H
 
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/MC/MCInst.h"
 #include <vector>
 
 namespace mca {
 
-typedef std::pair<unsigned, const llvm::MCInst *> SourceRef;
+typedef std::pair<unsigned, const llvm::MCInst &> SourceRef;
 
 class SourceMgr {
-  using InstVec = std::vector<std::unique_ptr<const llvm::MCInst>>;
-  const InstVec &Sequence;
+  llvm::ArrayRef<llvm::MCInst> Sequence;
   unsigned Current;
   unsigned Iterations;
   static const unsigned DefaultIterations = 100;
 
 public:
-  SourceMgr(const InstVec &MCInstSequence, unsigned NumIterations)
+  SourceMgr(llvm::ArrayRef<llvm::MCInst> MCInstSequence, unsigned NumIterations)
       : Sequence(MCInstSequence), Current(0),
         Iterations(NumIterations ? NumIterations : DefaultIterations) {}
 
   unsigned getCurrentIteration() const { return Current / Sequence.size(); }
   unsigned getNumIterations() const { return Iterations; }
   unsigned size() const { return Sequence.size(); }
-  const InstVec &getSequence() const { return Sequence; }
+  llvm::ArrayRef<llvm::MCInst> getSequence() const { return Sequence; }
 
   bool hasNext() const { return Current < (Iterations * size()); }
   void updateNext() { Current++; }
@@ -46,7 +46,7 @@ public:
   const SourceRef peekNext() const {
     assert(hasNext() && "Already at end of sequence!");
     unsigned Index = getCurrentInstructionIndex();
-    return SourceRef(Current, Sequence[Index].get());
+    return SourceRef(Current, Sequence[Index]);
   }
 
   unsigned getCurrentInstructionIndex() const {
@@ -54,7 +54,7 @@ public:
   }
 
   const llvm::MCInst &getMCInstFromIndex(unsigned Index) const {
-    return *Sequence[Index % size()];
+    return Sequence[Index % size()];
   }
 
   bool isEmpty() const { return size() == 0; }
diff --git a/tools/llvm-mca/lib/Stages/FetchStage.cpp b/tools/llvm-mca/lib/Stages/FetchStage.cpp
index e2cdad37ee1..8bd0bd9e3a7 100644
--- a/tools/llvm-mca/lib/Stages/FetchStage.cpp
+++ b/tools/llvm-mca/lib/Stages/FetchStage.cpp
@@ -36,7 +36,7 @@ llvm::Error FetchStage::getNextInstruction() {
     return llvm::ErrorSuccess();
   const SourceRef SR = SM.peekNext();
   llvm::Expected<std::unique_ptr<Instruction>> InstOrErr =
-      IB.createInstruction(*SR.second);
+      IB.createInstruction(SR.second);
   if (!InstOrErr)
     return InstOrErr.takeError();
   CurrentInstruction = std::move(InstOrErr.get());
diff --git a/tools/llvm-mca/llvm-mca.cpp b/tools/llvm-mca/llvm-mca.cpp
index 9466ae7e84d..59b78ff1545 100644
--- a/tools/llvm-mca/llvm-mca.cpp
+++ b/tools/llvm-mca/llvm-mca.cpp
@@ -68,13 +68,15 @@ static cl::opt<std::string> OutputFilename("o", cl::desc("Output filename"),
                                            cl::value_desc("filename"));
 
 static cl::opt<std::string>
-    ArchName("march", cl::desc("Target arch to assemble for, "
-                               "see -version for available targets"),
+    ArchName("march",
+             cl::desc("Target arch to assemble for, "
+                      "see -version for available targets"),
              cl::cat(ToolOptions));
 
 static cl::opt<std::string>
-    TripleName("mtriple", cl::desc("Target triple to assemble for, "
-                                   "see -version for available targets"),
+    TripleName("mtriple",
+               cl::desc("Target triple to assemble for, "
+                        "see -version for available targets"),
                cl::cat(ToolOptions));
 
 static cl::opt<std::string>
@@ -270,9 +272,10 @@ public:
       : MCStreamer(Context), Regions(R) {}
 
   // We only want to intercept the emission of new instructions.
-  virtual void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+  virtual void EmitInstruction(const MCInst &Inst,
+                               const MCSubtargetInfo & /* unused */,
                                bool /* unused */) override {
-    Regions.addInstruction(llvm::make_unique<const MCInst>(Inst));
+    Regions.addInstruction(Inst);
   }
 
   bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override {
@@ -290,8 +293,7 @@ public:
   void EmitCOFFSymbolType(int Type) override {}
   void EndCOFFSymbolDef() override {}
 
-  const std::vector<std::unique_ptr<const MCInst>> &
-  GetInstructionSequence(unsigned Index) const {
+  ArrayRef<MCInst> GetInstructionSequence(unsigned Index) const {
     return Regions.getInstructionSequence(Index);
   }
 };
-- 
GitLab


From c1d5ac81906ed3aa1117d9bae045aaef8a89e645 Mon Sep 17 00:00:00 2001
From: Zachary Turner <zturner@google.com>
Date: Mon, 22 Oct 2018 16:19:07 +0000
Subject: [PATCH 0386/1116] Some cleanups to the native pdb plugin [NFC].

This is mostly some cleanup done in the process of implementing
some basic support for types.  I tried to split up the patch a
bit to get some of the NFC portion of the patch out into a separate
commit, and this is the result of that.  It moves some code around,
deletes some spurious namespace qualifications, removes some
unnecessary header includes, forward declarations, etc.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344913 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/DebugInfo/CodeView/TypeRecord.h  |  4 ++++
 include/llvm/DebugInfo/PDB/Native/TpiStream.h |  4 ++++
 lib/DebugInfo/PDB/Native/TpiStream.cpp        | 19 +++++++++++++++++++
 3 files changed, 27 insertions(+)

diff --git a/include/llvm/DebugInfo/CodeView/TypeRecord.h b/include/llvm/DebugInfo/CodeView/TypeRecord.h
index 9a06a6a3344..76f1f98ab66 100644
--- a/include/llvm/DebugInfo/CodeView/TypeRecord.h
+++ b/include/llvm/DebugInfo/CodeView/TypeRecord.h
@@ -429,6 +429,10 @@ public:
     return (Options & ClassOptions::ForwardReference) != ClassOptions::None;
   }
 
+  bool isScoped() const {
+    return (Options & ClassOptions::Scoped) != ClassOptions::None;
+  }
+
   uint16_t getMemberCount() const { return MemberCount; }
   ClassOptions getOptions() const { return Options; }
   TypeIndex getFieldList() const { return FieldList; }
diff --git a/include/llvm/DebugInfo/PDB/Native/TpiStream.h b/include/llvm/DebugInfo/PDB/Native/TpiStream.h
index 00cc720336c..b76576a7a26 100644
--- a/include/llvm/DebugInfo/PDB/Native/TpiStream.h
+++ b/include/llvm/DebugInfo/PDB/Native/TpiStream.h
@@ -61,6 +61,10 @@ public:
   Expected<codeview::TypeIndex>
   findFullDeclForForwardRef(codeview::TypeIndex ForwardRefTI) const;
 
+  std::vector<codeview::TypeIndex> findRecordsByName(StringRef Name) const;
+
+  codeview::CVType getType(codeview::TypeIndex Index);
+
   BinarySubstreamRef getTypeRecordsSubstream() const;
 
   Error commit();
diff --git a/lib/DebugInfo/PDB/Native/TpiStream.cpp b/lib/DebugInfo/PDB/Native/TpiStream.cpp
index 96221f7d6ec..44781705bfa 100644
--- a/lib/DebugInfo/PDB/Native/TpiStream.cpp
+++ b/lib/DebugInfo/PDB/Native/TpiStream.cpp
@@ -11,6 +11,7 @@
 
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h"
+#include "llvm/DebugInfo/CodeView/RecordName.h"
 #include "llvm/DebugInfo/CodeView/TypeRecord.h"
 #include "llvm/DebugInfo/CodeView/TypeRecordHelpers.h"
 #include "llvm/DebugInfo/MSF/MappedBlockStream.h"
@@ -158,6 +159,20 @@ void TpiStream::buildHashMap() {
   }
 }
 
+std::vector<TypeIndex> TpiStream::findRecordsByName(StringRef Name) const {
+  uint32_t Bucket = hashStringV1(Name) % Header->NumHashBuckets;
+  if (Bucket > HashMap.size())
+    return {};
+
+  std::vector<TypeIndex> Result;
+  for (TypeIndex TI : HashMap[Bucket]) {
+    std::string ThisName = computeTypeName(*Types, TI);
+    if (ThisName == Name)
+      Result.push_back(TI);
+  }
+  return Result;
+}
+
 bool TpiStream::supportsTypeLookup() const { return !HashMap.empty(); }
 
 Expected<TypeIndex>
@@ -199,6 +214,10 @@ TpiStream::findFullDeclForForwardRef(TypeIndex ForwardRefTI) const {
   return ForwardRefTI;
 }
 
+codeview::CVType TpiStream::getType(codeview::TypeIndex Index) {
+  return Types->getType(Index);
+}
+
 BinarySubstreamRef TpiStream::getTypeRecordsSubstream() const {
   return TypeRecordsSubstream;
 }
-- 
GitLab


From c0db9a7416db56822997b6cde2062830ad0f4b99 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 22 Oct 2018 16:27:27 +0000
Subject: [PATCH 0387/1116] DAG: Change behavior of fminnum/fmaxnum nodes

Introduce new versions that follow the IEEE semantics
to help with legalization that may need quieted inputs.

There are some regressions from inserting unnecessary
canonicalizes when these are matched from fast math
fcmp + select which should be fixed in a future commit.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344914 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/ISDOpcodes.h             |  11 +-
 include/llvm/CodeGen/TargetLowering.h         |   3 +
 include/llvm/Target/TargetSelectionDAG.td     |   5 +
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp      |  11 +
 lib/CodeGen/SelectionDAG/LegalizeDAG.cpp      |   7 +-
 .../SelectionDAG/LegalizeVectorOps.cpp        |  12 +
 .../SelectionDAG/LegalizeVectorTypes.cpp      |   2 +
 lib/CodeGen/SelectionDAG/SelectionDAG.cpp     |  26 +-
 .../SelectionDAG/SelectionDAGDumper.cpp       |   3 +
 lib/CodeGen/SelectionDAG/TargetLowering.cpp   |  29 +
 lib/CodeGen/TargetLoweringBase.cpp            |   2 +
 lib/Target/AMDGPU/AMDGPUISelLowering.cpp      |   8 +
 lib/Target/AMDGPU/AMDGPUISelLowering.h        |   1 +
 lib/Target/AMDGPU/AMDGPUInstructions.td       |  28 +
 lib/Target/AMDGPU/SIISelLowering.cpp          | 166 +++-
 lib/Target/AMDGPU/SIISelLowering.h            |   6 +
 lib/Target/AMDGPU/SIInstructions.td           |  13 +-
 lib/Target/AMDGPU/VOP2Instructions.td         |   8 +-
 lib/Target/AMDGPU/VOP3Instructions.td         |   4 +-
 lib/Target/AMDGPU/VOP3PInstructions.td        |   4 +-
 test/CodeGen/AMDGPU/clamp.ll                  |  20 +-
 .../AMDGPU/fcanonicalize-elimination.ll       |  77 +-
 test/CodeGen/AMDGPU/fmax3.f64.ll              |  11 +-
 test/CodeGen/AMDGPU/fmax3.ll                  |  44 +-
 test/CodeGen/AMDGPU/fmax_legacy.f16.ll        |  16 +-
 test/CodeGen/AMDGPU/fmax_legacy.ll            |  97 +-
 test/CodeGen/AMDGPU/fmaxnum.ll                |  56 +-
 test/CodeGen/AMDGPU/fmin3.ll                  |  35 +-
 .../CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll |  49 +-
 test/CodeGen/AMDGPU/fmin_legacy.f16.ll        |  16 +-
 test/CodeGen/AMDGPU/fmin_legacy.ll            | 126 ++-
 test/CodeGen/AMDGPU/fminnum.f64.ll            |  36 +-
 test/CodeGen/AMDGPU/fminnum.ll                |  77 +-
 test/CodeGen/AMDGPU/fneg-combines.ll          | 311 +++++--
 test/CodeGen/AMDGPU/known-never-snan.ll       |  69 +-
 test/CodeGen/AMDGPU/llvm.maxnum.f16.ll        | 782 +++++++++++++---
 test/CodeGen/AMDGPU/llvm.minnum.f16.ll        | 836 +++++++++++++++---
 test/CodeGen/AMDGPU/reduction.ll              | 112 ++-
 38 files changed, 2486 insertions(+), 633 deletions(-)

diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h
index 535fc4f0bf4..2424ae6e630 100644
--- a/include/llvm/CodeGen/ISDOpcodes.h
+++ b/include/llvm/CodeGen/ISDOpcodes.h
@@ -564,10 +564,19 @@ namespace ISD {
     FCEIL, FTRUNC, FRINT, FNEARBYINT, FROUND, FFLOOR,
     /// FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two
     /// values.
-    /// In the case where a single input is NaN, the non-NaN input is returned.
+    //
+    /// In the case where a single input is a NaN (either signaling or quiet),
+    /// the non-NaN input is returned.
     ///
     /// The return value of (FMINNUM 0.0, -0.0) could be either 0.0 or -0.0.
     FMINNUM, FMAXNUM,
+
+    /// FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimum or maximum on
+    /// two values, following the IEEE-754 2008 definition. This differs from
+    /// FMINNUM/FMAXNUM in the handling of signaling NaNs. If one input is a
+    /// signaling NaN, returns a quiet NaN.
+    FMINNUM_IEEE, FMAXNUM_IEEE,
+
     /// FMINNAN/FMAXNAN - NaN-propagating minimum/maximum that also treat -0.0
     /// as less than 0.0. While FMINNUM/FMAXNUM follow IEEE 754-2008 semantics,
     /// FMINNAN/FMAXNAN follow IEEE 754-2018 draft semantics.
diff --git a/include/llvm/CodeGen/TargetLowering.h b/include/llvm/CodeGen/TargetLowering.h
index d22f707d259..d56684d8558 100644
--- a/include/llvm/CodeGen/TargetLowering.h
+++ b/include/llvm/CodeGen/TargetLowering.h
@@ -3644,6 +3644,9 @@ public:
   /// \returns True, if the expansion was successful, false otherwise
   bool expandFP_TO_SINT(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;
 
+  /// Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
+  SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const;
+
   /// Turn load of vector type into a load of the individual elements.
   /// \param LD load to expand
   /// \returns MERGE_VALUEs of the scalar loads with their chains.
diff --git a/include/llvm/Target/TargetSelectionDAG.td b/include/llvm/Target/TargetSelectionDAG.td
index 1ea370d39e9..918d0b1d37d 100644
--- a/include/llvm/Target/TargetSelectionDAG.td
+++ b/include/llvm/Target/TargetSelectionDAG.td
@@ -408,6 +408,11 @@ def fminnum    : SDNode<"ISD::FMINNUM"    , SDTFPBinOp,
                                   [SDNPCommutative, SDNPAssociative]>;
 def fmaxnum    : SDNode<"ISD::FMAXNUM"    , SDTFPBinOp,
                                   [SDNPCommutative, SDNPAssociative]>;
+def fminnum_ieee : SDNode<"ISD::FMINNUM_IEEE", SDTFPBinOp,
+                          [SDNPCommutative]>;
+def fmaxnum_ieee  : SDNode<"ISD::FMAXNUM_IEEE", SDTFPBinOp,
+                           [SDNPCommutative]>;
+
 def fminnan    : SDNode<"ISD::FMINNAN"    , SDTFPBinOp>;
 def fmaxnan    : SDNode<"ISD::FMAXNAN"    , SDTFPBinOp>;
 def fgetsign   : SDNode<"ISD::FGETSIGN"   , SDTFPToIntOp>;
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 381efb9cb94..f560f0e1a9c 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -7097,6 +7097,13 @@ static SDValue combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS,
   case ISD::SETLE:
   case ISD::SETULT:
   case ISD::SETULE: {
+    // Since it's known never nan to get here already, either fminnum or
+    // fminnum_ieee are OK. Try the ieee version first, since it's fminnum is
+    // expanded in terms of it.
+    unsigned IEEEOpcode = (LHS == True) ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
+    if (TLI.isOperationLegalOrCustom(IEEEOpcode, VT))
+      return DAG.getNode(IEEEOpcode, DL, VT, LHS, RHS);
+
     unsigned Opcode = (LHS == True) ? ISD::FMINNUM : ISD::FMAXNUM;
     if (TLI.isOperationLegalOrCustom(Opcode, TransformVT))
       return DAG.getNode(Opcode, DL, VT, LHS, RHS);
@@ -7108,6 +7115,10 @@ static SDValue combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS,
   case ISD::SETGE:
   case ISD::SETUGT:
   case ISD::SETUGE: {
+    unsigned IEEEOpcode = (LHS == True) ? ISD::FMAXNUM_IEEE : ISD::FMINNUM_IEEE;
+    if (TLI.isOperationLegalOrCustom(IEEEOpcode, VT))
+      return DAG.getNode(IEEEOpcode, DL, VT, LHS, RHS);
+
     unsigned Opcode = (LHS == True) ? ISD::FMAXNUM : ISD::FMINNUM;
     if (TLI.isOperationLegalOrCustom(Opcode, TransformVT))
       return DAG.getNode(Opcode, DL, VT, LHS, RHS);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 71d124c74ce..b73fc106a6b 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -3247,7 +3247,12 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     Results.push_back(Tmp1);
     break;
   }
-
+  case ISD::FMINNUM:
+  case ISD::FMAXNUM: {
+    if (SDValue Expanded = TLI.expandFMINNUM_FMAXNUM(Node, DAG))
+      Results.push_back(Expanded);
+    break;
+  }
   case ISD::FSIN:
   case ISD::FCOS: {
     EVT VT = Node->getValueType(0);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 2c1a4942f68..e7edc0ef860 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -130,6 +130,7 @@ class VectorLegalizer {
   SDValue ExpandBITREVERSE(SDValue Op);
   SDValue ExpandCTLZ(SDValue Op);
   SDValue ExpandCTTZ(SDValue Op);
+  SDValue ExpandFMINNUM_FMAXNUM(SDValue Op);
   SDValue ExpandStrictFPOp(SDValue Op);
 
   /// Implements vector promotion.
@@ -353,6 +354,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
   case ISD::FABS:
   case ISD::FMINNUM:
   case ISD::FMAXNUM:
+  case ISD::FMINNUM_IEEE:
+  case ISD::FMAXNUM_IEEE:
   case ISD::FMINNAN:
   case ISD::FMAXNAN:
   case ISD::FCOPYSIGN:
@@ -721,6 +724,9 @@ SDValue VectorLegalizer::Expand(SDValue Op) {
   case ISD::CTTZ:
   case ISD::CTTZ_ZERO_UNDEF:
     return ExpandCTTZ(Op);
+  case ISD::FMINNUM:
+  case ISD::FMAXNUM:
+    return ExpandFMINNUM_FMAXNUM(Op);
   case ISD::STRICT_FADD:
   case ISD::STRICT_FSUB:
   case ISD::STRICT_FMUL:
@@ -1120,6 +1126,12 @@ SDValue VectorLegalizer::ExpandCTTZ(SDValue Op) {
   return DAG.UnrollVectorOp(Op.getNode());
 }
 
+SDValue VectorLegalizer::ExpandFMINNUM_FMAXNUM(SDValue Op) {
+  if (SDValue Expanded = TLI.expandFMINNUM_FMAXNUM(Op.getNode(), DAG))
+    return Expanded;
+  return DAG.UnrollVectorOp(Op.getNode());
+}
+
 SDValue VectorLegalizer::ExpandStrictFPOp(SDValue Op) {
   EVT VT = Op.getValueType();
   EVT EltVT = VT.getVectorElementType();
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 8d00b3249d1..2b5fd8d75f4 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -113,6 +113,8 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::FMUL:
   case ISD::FMINNUM:
   case ISD::FMAXNUM:
+  case ISD::FMINNUM_IEEE:
+  case ISD::FMAXNUM_IEEE:
   case ISD::FMINNAN:
   case ISD::FMAXNAN:
   case ISD::SMIN:
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 0f8bd080867..1f0f7325c9d 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3712,9 +3712,31 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, bool SNaN, unsigned Depth) const
     // TODO: Refine on operand
     return false;
   }
+  case ISD::FMINNUM:
+  case ISD::FMAXNUM: {
+    // Only one needs to be known not-nan, since it will be returned if the
+    // other ends up being one.
+    return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) ||
+           isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
+  }
+  case ISD::FMINNUM_IEEE:
+  case ISD::FMAXNUM_IEEE: {
+    if (SNaN)
+      return true;
+    // This can return a NaN if either operand is an sNaN, or if both operands
+    // are NaN.
+    return (isKnownNeverNaN(Op.getOperand(0), false, Depth + 1) &&
+            isKnownNeverSNaN(Op.getOperand(1), Depth + 1)) ||
+           (isKnownNeverNaN(Op.getOperand(1), false, Depth + 1) &&
+            isKnownNeverSNaN(Op.getOperand(0), Depth + 1));
+  }
+  case ISD::FMINNAN:
+  case ISD::FMAXNAN: {
+    // TODO: Does this quiet or return the origina NaN as-is?
+    return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
+           isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
 
-  // TODO: Handle FMINNUM/FMAXNUM/FMINNAN/FMAXNAN when there is an agreement on
-  // what they should do.
+  }
   case ISD::EXTRACT_VECTOR_ELT: {
     return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
   }
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 9967f0eba10..64a9764cce2 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -176,6 +176,9 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   case ISD::FABS:                       return "fabs";
   case ISD::FMINNUM:                    return "fminnum";
   case ISD::FMAXNUM:                    return "fmaxnum";
+  case ISD::FMINNUM_IEEE:               return "fminnum_ieee";
+  case ISD::FMAXNUM_IEEE:               return "fmaxnum_ieee";
+
   case ISD::FMINNAN:                    return "fminnan";
   case ISD::FMAXNAN:                    return "fmaxnan";
   case ISD::FNEG:                       return "fneg";
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index b9b99b386af..ceedd06da1d 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -4113,6 +4113,35 @@ bool TargetLowering::expandFP_TO_SINT(SDNode *Node, SDValue &Result,
   return true;
 }
 
+SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node,
+                                              SelectionDAG &DAG) const {
+  SDLoc dl(Node);
+  unsigned NewOp = Node->getOpcode() == ISD::FMINNUM ?
+    ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
+  EVT VT = Node->getValueType(0);
+  if (isOperationLegalOrCustom(NewOp, VT)) {
+    SDValue Quiet0 = Node->getOperand(0);
+    SDValue Quiet1 = Node->getOperand(1);
+
+    if (!Node->getFlags().hasNoNaNs()) {
+      // Insert canonicalizes if it's possible we need to quiet to get correct
+      // sNaN behavior.
+      if (!DAG.isKnownNeverSNaN(Quiet0)) {
+        Quiet0 = DAG.getNode(ISD::FCANONICALIZE, dl, VT, Quiet0,
+                             Node->getFlags());
+      }
+      if (!DAG.isKnownNeverSNaN(Quiet1)) {
+        Quiet1 = DAG.getNode(ISD::FCANONICALIZE, dl, VT, Quiet1,
+                             Node->getFlags());
+      }
+    }
+
+    return DAG.getNode(NewOp, dl, VT, Quiet0, Quiet1, Node->getFlags());
+  }
+
+  return SDValue();
+}
+
 SDValue TargetLowering::scalarizeVectorLoad(LoadSDNode *LD,
                                             SelectionDAG &DAG) const {
   SDLoc SL(LD);
diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp
index 03a29a3edf6..ddd5fc1df75 100644
--- a/lib/CodeGen/TargetLoweringBase.cpp
+++ b/lib/CodeGen/TargetLoweringBase.cpp
@@ -600,6 +600,8 @@ void TargetLoweringBase::initActions() {
     setOperationAction(ISD::CONCAT_VECTORS, VT, Expand);
     setOperationAction(ISD::FMINNUM, VT, Expand);
     setOperationAction(ISD::FMAXNUM, VT, Expand);
+    setOperationAction(ISD::FMINNUM_IEEE, VT, Expand);
+    setOperationAction(ISD::FMAXNUM_IEEE, VT, Expand);
     setOperationAction(ISD::FMINNAN, VT, Expand);
     setOperationAction(ISD::FMAXNAN, VT, Expand);
     setOperationAction(ISD::FMAD, VT, Expand);
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index ae6b925800b..a1b9198f945 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -552,6 +552,8 @@ static bool fnegFoldsIntoOp(unsigned Opc) {
   case ISD::FMAD:
   case ISD::FMINNUM:
   case ISD::FMAXNUM:
+  case ISD::FMINNUM_IEEE:
+  case ISD::FMAXNUM_IEEE:
   case ISD::FSIN:
   case ISD::FTRUNC:
   case ISD::FRINT:
@@ -3512,6 +3514,10 @@ static unsigned inverseMinMax(unsigned Opc) {
     return ISD::FMINNUM;
   case ISD::FMINNUM:
     return ISD::FMAXNUM;
+  case ISD::FMAXNUM_IEEE:
+    return ISD::FMINNUM_IEEE;
+  case ISD::FMINNUM_IEEE:
+    return ISD::FMAXNUM_IEEE;
   case AMDGPUISD::FMAX_LEGACY:
     return AMDGPUISD::FMIN_LEGACY;
   case AMDGPUISD::FMIN_LEGACY:
@@ -3617,6 +3623,8 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
   }
   case ISD::FMAXNUM:
   case ISD::FMINNUM:
+  case ISD::FMAXNUM_IEEE:
+  case ISD::FMINNUM_IEEE:
   case AMDGPUISD::FMAX_LEGACY:
   case AMDGPUISD::FMIN_LEGACY: {
     // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 92d8991e582..0d22cb2e3e2 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -360,6 +360,7 @@ enum NodeType : unsigned {
   SIN_HW,
   FMAX_LEGACY,
   FMIN_LEGACY,
+
   FMAX3,
   SMAX3,
   UMAX3,
diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td
index ab00b1d6326..b7d1575ca89 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -152,8 +152,14 @@ def smax_oneuse : HasOneUseBinOp<smax>;
 def smin_oneuse : HasOneUseBinOp<smin>;
 def umax_oneuse : HasOneUseBinOp<umax>;
 def umin_oneuse : HasOneUseBinOp<umin>;
+
 def fminnum_oneuse : HasOneUseBinOp<fminnum>;
 def fmaxnum_oneuse : HasOneUseBinOp<fmaxnum>;
+
+def fminnum_ieee_oneuse : HasOneUseBinOp<fminnum_ieee>;
+def fmaxnum_ieee_oneuse : HasOneUseBinOp<fmaxnum_ieee>;
+
+
 def and_oneuse : HasOneUseBinOp<and>;
 def or_oneuse : HasOneUseBinOp<or>;
 def xor_oneuse : HasOneUseBinOp<xor>;
@@ -837,3 +843,25 @@ class RsqPat<Instruction RsqInst, ValueType vt> : AMDGPUPat <
   (AMDGPUrcp (fsqrt vt:$src)),
   (RsqInst $src)
 >;
+
+// Instructions which select to the same v_min_f*
+def fminnum_like : PatFrags<(ops node:$src0, node:$src1),
+  [(fminnum_ieee node:$src0, node:$src1),
+   (fminnum node:$src0, node:$src1)]
+>;
+
+// Instructions which select to the same v_max_f*
+def fmaxnum_like : PatFrags<(ops node:$src0, node:$src1),
+  [(fmaxnum_ieee node:$src0, node:$src1),
+   (fmaxnum node:$src0, node:$src1)]
+>;
+
+def fminnum_like_oneuse : PatFrags<(ops node:$src0, node:$src1),
+  [(fminnum_ieee_oneuse node:$src0, node:$src1),
+   (fminnum_oneuse node:$src0, node:$src1)]
+>;
+
+def fmaxnum_like_oneuse : PatFrags<(ops node:$src0, node:$src1),
+  [(fmaxnum_ieee_oneuse node:$src0, node:$src1),
+   (fmaxnum_oneuse node:$src0, node:$src1)]
+>;
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 81ff640f704..3ba04831d15 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -384,8 +384,20 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   if (Subtarget->hasBFE())
     setHasExtractBitsInsn(true);
 
-  setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
-  setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
+  setOperationAction(ISD::FMINNUM, MVT::f32, Custom);
+  setOperationAction(ISD::FMAXNUM, MVT::f32, Custom);
+  setOperationAction(ISD::FMINNUM, MVT::f64, Custom);
+  setOperationAction(ISD::FMAXNUM, MVT::f64, Custom);
+
+
+  // These are really only legal for ieee_mode functions. We should be avoiding
+  // them for functions that don't have ieee_mode enabled, so just say they are
+  // legal.
+  setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal);
+  setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal);
+  setOperationAction(ISD::FMINNUM_IEEE, MVT::f64, Legal);
+  setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal);
+
 
   if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
     setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
@@ -474,8 +486,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     // F16 - VOP2 Actions.
     setOperationAction(ISD::BR_CC, MVT::f16, Expand);
     setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
-    setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
-    setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
+
     setOperationAction(ISD::FDIV, MVT::f16, Custom);
 
     // F16 - VOP3 Actions.
@@ -558,6 +569,17 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     // This isn't really legal, but this avoids the legalizer unrolling it (and
     // allows matching fneg (fabs x) patterns)
     setOperationAction(ISD::FABS, MVT::v2f16, Legal);
+
+    setOperationAction(ISD::FMAXNUM, MVT::f16, Custom);
+    setOperationAction(ISD::FMINNUM, MVT::f16, Custom);
+    setOperationAction(ISD::FMAXNUM_IEEE, MVT::f16, Legal);
+    setOperationAction(ISD::FMINNUM_IEEE, MVT::f16, Legal);
+
+    setOperationAction(ISD::FMINNUM_IEEE, MVT::v4f16, Custom);
+    setOperationAction(ISD::FMAXNUM_IEEE, MVT::v4f16, Custom);
+
+    setOperationAction(ISD::FMINNUM, MVT::v4f16, Expand);
+    setOperationAction(ISD::FMAXNUM, MVT::v4f16, Expand);
   }
 
   if (Subtarget->hasVOP3PInsts()) {
@@ -575,8 +597,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FADD, MVT::v2f16, Legal);
     setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
     setOperationAction(ISD::FMA, MVT::v2f16, Legal);
-    setOperationAction(ISD::FMINNUM, MVT::v2f16, Legal);
-    setOperationAction(ISD::FMAXNUM, MVT::v2f16, Legal);
+
+    setOperationAction(ISD::FMINNUM_IEEE, MVT::v2f16, Legal);
+    setOperationAction(ISD::FMAXNUM_IEEE, MVT::v2f16, Legal);
+
     setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal);
 
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
@@ -596,6 +620,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 
     setOperationAction(ISD::FADD, MVT::v4f16, Custom);
     setOperationAction(ISD::FMUL, MVT::v4f16, Custom);
+
+    setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom);
+    setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom);
+
     setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom);
     setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom);
     setOperationAction(ISD::FCANONICALIZE, MVT::v4f16, Custom);
@@ -634,6 +662,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::FSUB);
   setTargetDAGCombine(ISD::FMINNUM);
   setTargetDAGCombine(ISD::FMAXNUM);
+  setTargetDAGCombine(ISD::FMINNUM_IEEE);
+  setTargetDAGCombine(ISD::FMAXNUM_IEEE);
   setTargetDAGCombine(ISD::FMA);
   setTargetDAGCombine(ISD::SMIN);
   setTargetDAGCombine(ISD::SMAX);
@@ -3580,6 +3610,9 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::FNEG:
   case ISD::FCANONICALIZE:
     return splitUnaryVectorOp(Op, DAG);
+  case ISD::FMINNUM:
+  case ISD::FMAXNUM:
+    return lowerFMINNUM_FMAXNUM(Op, DAG);
   case ISD::SHL:
   case ISD::SRA:
   case ISD::SRL:
@@ -3590,10 +3623,10 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::SMAX:
   case ISD::UMIN:
   case ISD::UMAX:
-  case ISD::FMINNUM:
-  case ISD::FMAXNUM:
   case ISD::FADD:
   case ISD::FMUL:
+  case ISD::FMINNUM_IEEE:
+  case ISD::FMAXNUM_IEEE:
     return splitBinaryVectorOp(Op, DAG);
   }
   return SDValue();
@@ -4048,6 +4081,23 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
   return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
 }
 
+SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
+
+  // FIXME: Assert during eslection that this is only selected for
+  // ieee_mode. Currently a combine can produce the ieee version for non-ieee
+  // mode functions, but this happens to be OK since it's only done in cases
+  // where there is known no sNaN.
+  if (IsIEEEMode)
+    return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
+
+  if (VT == MVT::v4f16)
+    return splitBinaryVectorOp(Op, DAG);
+  return Op;
+}
+
 SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
   SDLoc SL(Op);
   SDValue Chain = Op.getOperand(0);
@@ -7521,37 +7571,32 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
 
   case ISD::FMINNUM:
   case ISD::FMAXNUM:
+  case ISD::FMINNUM_IEEE:
+  case ISD::FMAXNUM_IEEE:
   case AMDGPUISD::CLAMP:
   case AMDGPUISD::FMED3:
   case AMDGPUISD::FMAX3:
   case AMDGPUISD::FMIN3: {
     // FIXME: Shouldn't treat the generic operations different based these.
-    bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
-    if (IsIEEEMode) {
-      // snans will be quieted, so we only need to worry about denormals.
-      if (Subtarget->supportsMinMaxDenormModes() ||
-          denormalsEnabledForType(Op.getValueType()))
-        return true;
-
-      // Flushing may be required.
-      // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
-      // targets need to check their input recursively.
-      return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
-             isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
-    }
+    // However, we aren't really required to flush the result from
+    // minnum/maxnum..
 
+    // snans will be quieted, so we only need to worry about denormals.
     if (Subtarget->supportsMinMaxDenormModes() ||
-        denormalsEnabledForType(Op.getValueType())) {
-      // Only quieting may be necessary.
-      return DAG.isKnownNeverSNaN(Op.getOperand(0)) &&
-             DAG.isKnownNeverSNaN(Op.getOperand(1));
+        denormalsEnabledForType(Op.getValueType()))
+      return true;
+
+    // Flushing may be required.
+    // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
+    // targets need to check their input recursively.
+
+    // FIXME: Does this apply with clamp? It's implemented with max.
+    for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
+      if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
+        return false;
     }
 
-    // Flushing and quieting may be necessary
-    // With ieee_mode off, the nan is returned as-is, so if it is an sNaN it
-    // needs to be quieted.
-    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
-           isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
+    return true;
   }
   case ISD::SELECT: {
     return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
@@ -7578,6 +7623,21 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
     // Could be anything.
     return false;
 
+  case ISD::BITCAST: {
+    // Hack round the mess we make when legalizing extract_vector_elt
+    SDValue Src = Op.getOperand(0);
+    if (Src.getValueType() == MVT::i16 &&
+        Src.getOpcode() == ISD::TRUNCATE) {
+      SDValue TruncSrc = Src.getOperand(0);
+      if (TruncSrc.getValueType() == MVT::i32 &&
+          TruncSrc.getOpcode() == ISD::BITCAST &&
+          TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
+        return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
+      }
+    }
+
+    return false;
+  }
   case ISD::INTRINSIC_WO_CHAIN: {
     unsigned IntrinsicID
       = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
@@ -7603,7 +7663,6 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
 }
 
 // Constant fold canonicalize.
-
 SDValue SITargetLowering::getCanonicalConstantFP(
   SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
   // Flush denormals to 0 if not enabled.
@@ -7699,18 +7758,40 @@ SDValue SITargetLowering::performFCanonicalizeCombine(
     }
   }
 
+  unsigned SrcOpc = N0.getOpcode();
+
+  // If it's free to do so, push canonicalizes further up the source, which may
+  // find a canonical source.
+  //
+  // TODO: More opcodes. Note this is unsafe for the the _ieee minnum/maxnum for
+  // sNaNs.
+  if (SrcOpc == ISD::FMINNUM || SrcOpc == ISD::FMAXNUM) {
+    auto *CRHS = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
+    if (CRHS && N0.hasOneUse()) {
+      SDLoc SL(N);
+      SDValue Canon0 = DAG.getNode(ISD::FCANONICALIZE, SL, VT,
+                                   N0.getOperand(0));
+      SDValue Canon1 = getCanonicalConstantFP(DAG, SL, VT, CRHS->getValueAPF());
+      DCI.AddToWorklist(Canon0.getNode());
+
+      return DAG.getNode(N0.getOpcode(), SL, VT, Canon0, Canon1);
+    }
+  }
+
   return isCanonicalized(DAG, N0) ? N0 : SDValue();
 }
 
 static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
   switch (Opc) {
   case ISD::FMAXNUM:
+  case ISD::FMAXNUM_IEEE:
     return AMDGPUISD::FMAX3;
   case ISD::SMAX:
     return AMDGPUISD::SMAX3;
   case ISD::UMAX:
     return AMDGPUISD::UMAX3;
   case ISD::FMINNUM:
+  case ISD::FMINNUM_IEEE:
     return AMDGPUISD::FMIN3;
   case ISD::SMIN:
     return AMDGPUISD::SMIN3;
@@ -7877,6 +7958,7 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
 
   // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
   if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
+       (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
        (Opc == AMDGPUISD::FMIN_LEGACY &&
         Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
       (VT == MVT::f32 || VT == MVT::f64 ||
@@ -7995,7 +8077,9 @@ SDValue SITargetLowering::performExtractVectorEltCombine(
     case ISD::SMIN:
     case ISD::SMAX:
     case ISD::FMAXNUM:
-    case ISD::FMINNUM: {
+    case ISD::FMINNUM:
+    case ISD::FMAXNUM_IEEE:
+    case ISD::FMINNUM_IEEE: {
       SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
                                  Vec.getOperand(0), Idx);
       SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
@@ -8595,13 +8679,15 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
     return performSetCCCombine(N, DCI);
   case ISD::FMAXNUM:
   case ISD::FMINNUM:
+  case ISD::FMAXNUM_IEEE:
+  case ISD::FMINNUM_IEEE:
   case ISD::SMAX:
   case ISD::SMIN:
   case ISD::UMAX:
   case ISD::UMIN:
   case AMDGPUISD::FMIN_LEGACY:
   case AMDGPUISD::FMAX_LEGACY: {
-    if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
+    if (//DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
         getTargetMachine().getOptLevel() > CodeGenOpt::None)
       return performMinMaxCombine(N, DCI);
     break;
@@ -9320,3 +9406,17 @@ bool SITargetLowering::denormalsEnabledForType(EVT VT) const {
     return false;
   }
 }
+
+bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
+                                                    const SelectionDAG &DAG,
+                                                    bool SNaN,
+                                                    unsigned Depth) const {
+  if (Op.getOpcode() == AMDGPUISD::CLAMP) {
+    if (Subtarget->enableDX10Clamp())
+      return true; // Clamped to 0.
+    return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
+  }
+
+  return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG,
+                                                            SNaN, Depth);
+}
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index 1b0cb06a9b0..bcb46ec41d1 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -110,6 +110,7 @@ private:
 
   /// Custom lowering for ISD::FP_ROUND for MVT::f16.
   SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue getSegmentAperture(unsigned AS, const SDLoc &DL,
                              SelectionDAG &DAG) const;
@@ -346,6 +347,11 @@ public:
   bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
                        unsigned MaxDepth = 5) const;
   bool denormalsEnabledForType(EVT VT) const;
+
+  bool isKnownNeverNaNForTargetNode(SDValue Op,
+                                    const SelectionDAG &DAG,
+                                    bool SNaN = false,
+                                    unsigned Depth = 0) const override;
 };
 
 } // End namespace llvm
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index 1336a576e84..67aea73d1ca 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -1645,10 +1645,11 @@ def : IntMed3Pat<V_MED3_U32, umax, umax_oneuse, umin_oneuse>;
 // This matches 16 permutations of
 // max(min(x, y), min(max(x, y), z))
 class FPMed3Pat<ValueType vt,
+                //SDPatternOperator max, SDPatternOperator min,
                 Instruction med3Inst> : GCNPat<
-  (fmaxnum (fminnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
+  (fmaxnum_like (fminnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
                            (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
-           (fminnum_oneuse (fmaxnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
+           (fminnum_like_oneuse (fmaxnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
                                            (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
                            (vt (VOP3Mods_nnan vt:$src2, i32:$src2_mods)))),
   (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
@@ -1656,10 +1657,10 @@ class FPMed3Pat<ValueType vt,
 
 class FP16Med3Pat<ValueType vt,
                 Instruction med3Inst> : GCNPat<
-  (fmaxnum (fminnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
-                           (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
-           (fminnum_oneuse (fmaxnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
-                                           (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
+  (fmaxnum_like (fminnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
+                                     (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
+           (fminnum_like_oneuse (fmaxnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
+                                                     (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
                            (vt (VOP3Mods_nnan vt:$src2, i32:$src2_mods)))),
   (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE)
 >;
diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td
index e9d12ba83f3..db031be7e55 100644
--- a/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/lib/Target/AMDGPU/VOP2Instructions.td
@@ -393,8 +393,8 @@ defm V_MUL_I32_I24 : VOP2Inst <"v_mul_i32_i24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>,
 defm V_MUL_HI_I32_I24 : VOP2Inst <"v_mul_hi_i32_i24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>, AMDGPUmulhi_i24>;
 defm V_MUL_U32_U24 : VOP2Inst <"v_mul_u32_u24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>, AMDGPUmul_u24>;
 defm V_MUL_HI_U32_U24 : VOP2Inst <"v_mul_hi_u32_u24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>, AMDGPUmulhi_u24>;
-defm V_MIN_F32 : VOP2Inst <"v_min_f32", VOP_F32_F32_F32, fminnum>;
-defm V_MAX_F32 : VOP2Inst <"v_max_f32", VOP_F32_F32_F32, fmaxnum>;
+defm V_MIN_F32 : VOP2Inst <"v_min_f32", VOP_F32_F32_F32, fminnum_like>;
+defm V_MAX_F32 : VOP2Inst <"v_max_f32", VOP_F32_F32_F32, fmaxnum_like>;
 defm V_MIN_I32 : VOP2Inst <"v_min_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, smin>;
 defm V_MAX_I32 : VOP2Inst <"v_max_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, smax>;
 defm V_MIN_U32 : VOP2Inst <"v_min_u32", VOP_PAT_GEN<VOP_I32_I32_I32>, umin>;
@@ -556,8 +556,8 @@ defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16>;
 defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16>;
 defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16, null_frag, "v_sub_u16">;
 defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16>;
-defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum>;
-defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum>;
+defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum_like>;
+defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum_like>;
 defm V_MAX_U16 : VOP2Inst <"v_max_u16", VOP_I16_I16_I16>;
 defm V_MAX_I16 : VOP2Inst <"v_max_i16", VOP_I16_I16_I16>;
 defm V_MIN_U16 : VOP2Inst <"v_min_u16", VOP_I16_I16_I16>;
diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td
index 96b233b5a38..51bee3efeb2 100644
--- a/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/lib/Target/AMDGPU/VOP3Instructions.td
@@ -295,8 +295,8 @@ let SchedRW = [WriteDoubleAdd] in {
 def V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, fma>;
 def V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile<VOP_F64_F64_F64>, fadd, 1>;
 def V_MUL_F64 : VOP3Inst <"v_mul_f64", VOP3_Profile<VOP_F64_F64_F64>, fmul, 1>;
-def V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum, 1>;
-def V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum, 1>;
+def V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum_like, 1>;
+def V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum_like, 1>;
 } // End SchedRW = [WriteDoubleAdd]
 
 let SchedRW = [WriteQuarterRate32] in {
diff --git a/lib/Target/AMDGPU/VOP3PInstructions.td b/lib/Target/AMDGPU/VOP3PInstructions.td
index 41e21c116a9..c91d911a283 100644
--- a/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -48,8 +48,8 @@ def V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_
 
 def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fadd>;
 def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmul>;
-def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum>;
-def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum>;
+def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum_like>;
+def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum_like>;
 
 def V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, add>;
 def V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
diff --git a/test/CodeGen/AMDGPU/clamp.ll b/test/CodeGen/AMDGPU/clamp.ll
index e73f28604b5..d98b56062cd 100644
--- a/test/CodeGen/AMDGPU/clamp.ll
+++ b/test/CodeGen/AMDGPU/clamp.ll
@@ -74,7 +74,8 @@ define amdgpu_kernel void @v_clamp_negzero_f32(float addrspace(1)* %out, float a
 
 ; GCN-LABEL: {{^}}v_clamp_negzero_maybe_snan_f32:
 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
-; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0x80000000, [[A]]
+; GCN: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[A]]
+; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0x80000000, [[QUIET]]
 ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]]
 define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -90,8 +91,17 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(float addrspace(1)* %o
 
 ; GCN-LABEL: {{^}}v_clamp_multi_use_max_f32:
 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
-; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]]
+; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]]
+; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[QUIET_A]]
+; GCN: v_min_f32_e32 [[MED:v[0-9]+]], 1.0, [[QUIET_A]]
+; GCN-NOT: [[MAX]]
+; GCN-NOT: [[MED]]
+
+; SI: buffer_store_dword [[MED]]
+; SI: buffer_store_dword [[MAX]]
+
+; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MED]]
+; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MAX]]
 define amdgpu_kernel void @v_clamp_multi_use_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
@@ -406,8 +416,8 @@ define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(float addrspace(1)* %out,
 
 ; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp:
 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
-; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]]
+; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]]
+; GCN: v_med3_f32 {{v[0-9]+}}, [[QUIET_A]], 0, 1.0
 define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
diff --git a/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll b/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
index 4d4e065ba56..e2741c25382 100644
--- a/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
+++ b/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
@@ -455,14 +455,13 @@ define amdgpu_kernel void @test_fold_canonicalize_qNaN_value_f32(float addrspace
 }
 
 ; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32_ieee_mode:
-; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
-; GFX9-NOT: v_max
-; GFX9-NOT: v_mul
-
-; VI-DENORM-NOT: v_max_f32
-; VI-DENORM-NOT: v_mul_f32
+; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
+; GCN-FLUSH: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[VAL]]
+; GCN-DENORM: v_max_f32_e32 [[QUIET:v[0-9]+]], [[VAL]], [[VAL]]
+; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, [[QUIET]]
 
-; VI-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
+; GCN-NOT: v_max
+; GCN-NOT: v_mul
 
 ; GFX9: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
 define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_ieee_mode(float addrspace(1)* %arg) {
@@ -476,15 +475,13 @@ define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_iee
 }
 
 ; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode:
-; GCN: v_min_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
-
-; GFX9-NOT: v_max
-; GFX9-NOT: v_mul
-
-
-; VI-DENORM-NOT: v_max
-; VI-DENORM-NOT: v_mul
 ; VI-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
+; GCN-DENORM-NOT: v_max
+; GCN-DENORM-NOT: v_mul
+
+; GCN: v_min_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; GCN-DENORM-NOT: v_max
+; GCN-DENORM-NOT: v_mul
 
 ; GFX9: {{flat|global}}_store_dword v[{{[0-9:]+}}]
 define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode(float addrspace(1)* %arg) #1 {
@@ -530,13 +527,19 @@ define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(float addrspace
 }
 
 ; GCN-LABEL: test_fold_canonicalize_denorm_value_f32:
-; GFX9:  v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
+; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
+
+; GFX9-DENORM: v_max_f32_e32 [[QUIET:v[0-9]+]], [[VAL]], [[VAL]]
+; GFX9-DENORM: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, [[QUIET]]
 
-; VI-FLUSH: v_min_f32_e32 [[V0:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
-; VI-FLUSH: v_mul_f32_e32 [[RESULT:v[0-9]+]], 1.0, [[V0]]
+; GFX9-FLUSH: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[VAL]]
+; GFX9-FLUSH: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET]]
 
-; VI-DENORM: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
 
+; VI-FLUSH: v_mul_f32_e32 [[QUIET_V0:v[0-9]+]], 1.0, [[VAL]]
+; VI-FLUSH: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_V0]]
+
+; VI-DENORM: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, [[VAL]]
 
 ; GCN-NOT: v_mul
 ; GCN-NOT: v_max
@@ -552,11 +555,14 @@ define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(float addrspa
 }
 
 ; GCN-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32_ieee_mode:
-; GFX9:  v_max_f32_e32 [[RESULT:v[0-9]+]], 0, v{{[0-9]+}}
-; VI-FLUSH:    v_max_f32_e32 [[V0:v[0-9]+]], 0, v{{[0-9]+}}
-; VI-FLUSH:    v_mul_f32_e32 [[RESULT:v[0-9]+]], 1.0, [[V0]]
+; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
+
+; GFX9:  v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[VAL]]
+
+; VI-FLUSH:    v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[VAL]]
+; VI-FLUSH:    v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET]]
 
-; VI-DENORM: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, v{{[0-9]+}}
+; VI-DENORM: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[VAL]]
 
 ; GCN-NOT: v_mul
 ; GCN-NOT: v_max
@@ -707,16 +713,21 @@ define amdgpu_kernel void @test_fold_canonicalize_select_value_f32(float addrspa
 
 ; Need to quiet the nan with a separate instruction since it will be
 ; passed through the minnum.
+; FIXME: canonicalize doens't work correctly without ieee_mode
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_minnum_value_no_ieee_mode:
+; GFX9-NOT: v0
+; GFX9-NOT: v1
 ; GFX9: v_min_f32_e32 v0, v0, v1
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX9-DENORM-NEXT: v_max_f32_e32 v0, v0, v0
 ; GFX9-NEXT: ; return to shader
 
-; VI: v_min_f32_e32 v0, v0, v1
-; VI-FLUSH: v_mul_f32_e32 v0, 1.0, v0
-; VI-DENORM: v_max_f32_e32 v0, v0, v0
+; VI-FLUSH: v_min_f32_e32 v0, v0, v1
+; VI-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; VI-FLUSH-NEXT: ; return
+
+; VI-DENORM-NOT: v0
+; VI-DENORM: v_min_f32_e32 v0, v0, v1
+; VI-DENORM-NEXT: ; return
 define amdgpu_ps float @test_fold_canonicalize_minnum_value_no_ieee_mode(float %arg0, float %arg1) {
   %v = tail call float @llvm.minnum.f32(float %arg0, float %arg1)
   %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
@@ -727,8 +738,14 @@ define amdgpu_ps float @test_fold_canonicalize_minnum_value_no_ieee_mode(float %
 ; GFX9: v_min_f32_e32 v0, v0, v1
 ; GFX9-NEXT: s_setpc_b64
 
-; VI: v_min_f32_e32 v0, v0, v1
-; VI-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; VI-FLUSH-DAG: v_mul_f32_e32 v0, 1.0, v0
+; VI-FLUSH-DAG: v_mul_f32_e32 v1, 1.0, v1
+; VI-FLUSH: v_min_f32_e32 v0, v0, v1
+
+; VI-DENORM-DAG: v_max_f32_e32 v0, v0, v0
+; VI-DENORM-DAG: v_max_f32_e32 v1, v1, v1
+; VI-DENORM: v_min_f32_e32 v0, v0, v1
+
 ; VI-NEXT: s_setpc_b64
 define float @test_fold_canonicalize_minnum_value_ieee_mode(float %arg0, float %arg1) {
   %v = tail call float @llvm.minnum.f32(float %arg0, float %arg1)
diff --git a/test/CodeGen/AMDGPU/fmax3.f64.ll b/test/CodeGen/AMDGPU/fmax3.f64.ll
index a56a5866aad..fe0e4409f16 100644
--- a/test/CodeGen/AMDGPU/fmax3.f64.ll
+++ b/test/CodeGen/AMDGPU/fmax3.f64.ll
@@ -4,11 +4,14 @@
 declare double @llvm.maxnum.f64(double, double) nounwind readnone
 
 ; SI-LABEL: {{^}}test_fmax3_f64:
-; SI-DAG: buffer_load_dwordx2 [[REGA:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0{{$}}
-; SI-DAG: buffer_load_dwordx2 [[REGB:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8
-; SI: v_max_f64 [[REGA]], [[REGA]], [[REGB]]
+; SI: buffer_load_dwordx2 [[REGA:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0{{$}}
+; SI: buffer_load_dwordx2 [[REGB:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8
 ; SI: buffer_load_dwordx2 [[REGC:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:16
-; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[REGA]], [[REGC]]
+; SI: v_max_f64 [[QUIET_A:v\[[0-9]+:[0-9]+\]]], [[REGA]], [[REGA]]
+; SI: v_max_f64 [[QUIET_B:v\[[0-9]+:[0-9]+\]]], [[REGB]], [[REGB]]
+; SI: v_max_f64 [[MAX0:v\[[0-9]+:[0-9]+\]]], [[QUIET_A]], [[QUIET_B]]
+; SI: v_max_f64 [[QUIET_C:v\[[0-9]+:[0-9]+\]]], [[REGC]], [[REGC]]
+; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[MAX0]], [[QUIET_C]]
 ; SI: buffer_store_dwordx2 [[RESULT]],
 ; SI: s_endpgm
 define amdgpu_kernel void @test_fmax3_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind {
diff --git a/test/CodeGen/AMDGPU/fmax3.ll b/test/CodeGen/AMDGPU/fmax3.ll
index 1f67ace72df..5a92eac7f32 100644
--- a/test/CodeGen/AMDGPU/fmax3.ll
+++ b/test/CodeGen/AMDGPU/fmax3.ll
@@ -48,8 +48,11 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(float addrspace(1)* %out, float
 ; SI: v_max3_f32 [[RESULT_F32:v[0-9]+]], [[CVT_A]], [[CVT_B]], [[CVT_C]]
 ; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT_F32]]
 
-; VI: v_max_f16_e32
-; VI: v_max_f16_e32 [[RESULT:v[0-9]+]],
+; VI-DAG: v_max_f16_e32 [[QUIET_A:v[0-9]+]], [[REGA]], [[REGA]]
+; VI-DAG: v_max_f16_e32 [[QUIET_B:v[0-9]+]], [[REGB]], [[REGB]]
+; VI: v_max_f16_e32 [[MAX0:v[0-9]+]], [[QUIET_A]], [[QUIET_B]]
+; VI: v_max_f16_e32 [[QUIET_C:v[0-9]+]], [[REGC]], [[REGC]]
+; VI: v_max_f16_e32 [[RESULT:v[0-9]+]], [[MAX0]], [[QUIET_C]]
 
 ; GFX9: v_max3_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]
 ; GCN: buffer_store_short [[RESULT]],
@@ -75,8 +78,11 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(half addrspace(1)* %out, half ad
 ; SI: v_max3_f32 [[RESULT_F32:v[0-9]+]], [[CVT_C]], [[CVT_A]], [[CVT_B]]
 ; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT_F32]]
 
-; VI: v_max_f16_e32
-; VI: v_max_f16_e32 [[RESULT:v[0-9]+]],
+; VI-DAG: v_max_f16_e32 [[QUIET_A:v[0-9]+]], [[REGA]], [[REGA]]
+; VI-DAG: v_max_f16_e32 [[QUIET_B:v[0-9]+]], [[REGB]], [[REGB]]
+; VI: v_max_f16_e32 [[MAX0:v[0-9]+]], [[QUIET_A]], [[QUIET_B]]
+; VI: v_max_f16_e32 [[QUIET_C:v[0-9]+]], [[REGC]], [[REGC]]
+; VI: v_max_f16_e32 [[RESULT:v[0-9]+]], [[QUIET_C]], [[MAX0]]
 
 ; GFX9: v_max3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGA]], [[REGB]]
 ; GCN: buffer_store_short [[RESULT]],
@@ -100,22 +106,25 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(half addrspace(1)* %out, half ad
 ; SI-NEXT: v_max3_f32
 ; SI-NEXT: v_max3_f32
 
-; VI: v_max_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI: v_max_f16_e32 v0, v0, v1
-; VI: v_max_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI: v_max_f16_e32 v0, v2, v0
-; VI: v_max_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI: v_max_f16_e32 v0, v0, v3
-; VI: v_or_b32_e32 v0, v0, v1
-
-; GFX9: v_pk_max_f16
+; VI: s_waitcnt
+; VI-NEXT: v_max_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_max_f16_e32 v0, v0, v1
+; VI-NEXT: v_max_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_max_f16_e32 v0, v2, v0
+; VI-NEXT: v_max_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT: v_max_f16_e32 v0, v0, v3
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: s_setpc_b64
+
+; GFX9: s_waitcnt
+; GFX9-NEXT: v_pk_max_f16
 ; GFX9-NEXT: v_pk_max_f16
 ; GFX9-NEXT: v_pk_max_f16
-define <2 x half> @no_fmax3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) {
+define <2 x half> @no_fmax3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) #2 {
 entry:
-  %max = tail call fast <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
-  %max1 = tail call fast <2 x half> @llvm.maxnum.v2f16(<2 x half> %c, <2 x half> %max)
-  %res = tail call fast <2 x half> @llvm.maxnum.v2f16(<2 x half> %max1, <2 x half> %d)
+  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
+  %max1 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %c, <2 x half> %max)
+  %res = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %max1, <2 x half> %d)
   ret <2 x half> %res
 }
 
@@ -126,3 +135,4 @@ declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>)
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone speculatable }
+attributes #2 = { nounwind "no-nans-fp-math"="true" }
diff --git a/test/CodeGen/AMDGPU/fmax_legacy.f16.ll b/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
index e06d93f5dc6..e7f3f53685c 100644
--- a/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
+++ b/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
@@ -97,7 +97,7 @@ define <2 x half> @test_fmax_legacy_ugt_v2f16(<2 x half> %a, <2 x half> %b) #0 {
 ; VI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NNAN-NEXT:    v_max_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NNAN-NEXT:    v_max_f16_e32 v0, v0, v1
-; VI-NNAN-NEXT:    v_or_b32_e32 v0, v0, v2
+; VI-NNAN-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SAFE-LABEL: test_fmax_legacy_ugt_v2f16:
@@ -178,7 +178,7 @@ define <3 x half> @test_fmax_legacy_ugt_v3f16(<3 x half> %a, <3 x half> %b) #0 {
 ; VI-NNAN-NEXT:    v_max_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NNAN-NEXT:    v_max_f16_e32 v0, v0, v2
 ; VI-NNAN-NEXT:    v_max_f16_e32 v1, v1, v3
-; VI-NNAN-NEXT:    v_or_b32_e32 v0, v0, v4
+; VI-NNAN-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SAFE-LABEL: test_fmax_legacy_ugt_v3f16:
@@ -283,8 +283,8 @@ define <4 x half> @test_fmax_legacy_ugt_v4f16(<4 x half> %a, <4 x half> %b) #0 {
 ; VI-NNAN-NEXT:    v_max_f16_e32 v1, v1, v3
 ; VI-NNAN-NEXT:    v_max_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NNAN-NEXT:    v_max_f16_e32 v0, v0, v2
-; VI-NNAN-NEXT:    v_or_b32_e32 v0, v0, v5
-; VI-NNAN-NEXT:    v_or_b32_e32 v1, v1, v4
+; VI-NNAN-NEXT:    v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NNAN-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SAFE-LABEL: test_fmax_legacy_ugt_v4f16:
@@ -437,10 +437,10 @@ define <8 x half> @test_fmax_legacy_ugt_v8f16(<8 x half> %a, <8 x half> %b) #0 {
 ; VI-NNAN-NEXT:    v_max_f16_e32 v1, v1, v5
 ; VI-NNAN-NEXT:    v_max_f16_sdwa v11, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NNAN-NEXT:    v_max_f16_e32 v0, v0, v4
-; VI-NNAN-NEXT:    v_or_b32_e32 v0, v0, v11
-; VI-NNAN-NEXT:    v_or_b32_e32 v1, v1, v10
-; VI-NNAN-NEXT:    v_or_b32_e32 v2, v2, v9
-; VI-NNAN-NEXT:    v_or_b32_e32 v3, v3, v8
+; VI-NNAN-NEXT:    v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NNAN-NEXT:    v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NNAN-NEXT:    v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NNAN-NEXT:    v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SAFE-LABEL: test_fmax_legacy_ugt_v8f16:
diff --git a/test/CodeGen/AMDGPU/fmax_legacy.ll b/test/CodeGen/AMDGPU/fmax_legacy.ll
index 6a1f7966c30..1fd1556de74 100644
--- a/test/CodeGen/AMDGPU/fmax_legacy.ll
+++ b/test/CodeGen/AMDGPU/fmax_legacy.ll
@@ -1,13 +1,22 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-SAFE -check-prefix=FUNC %s
-; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN-NONAN -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN-SAFE,SI-SAFE,GCN,FUNC %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI-NONAN,GCN-NONAN,GCN,FUNC %s
+
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-SAFE,GCN-SAFE,GCN,FUNC %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-NONAN,GCN-NONAN,GCN,FUNC %s
+
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s
 
 declare i32 @llvm.r600.read.tidig.x() #1
 
 ; FUNC-LABEL: {{^}}test_fmax_legacy_uge_f32:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+
+; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+
+; VI-SAFE: v_cmp_nlt_f32_e32 vcc, [[A]], [[B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
+
 ; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 
 ; EG: MAX
@@ -26,12 +35,16 @@ define amdgpu_kernel void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, fl
 }
 
 ; FUNC-LABEL: {{^}}test_fmax_legacy_uge_f32_nnan_src:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN-DAG: v_add_f32_e32 [[ADD_A:v[0-9]+]], 1.0, [[A]]
 ; GCN-DAG: v_add_f32_e32 [[ADD_B:v[0-9]+]], 2.0, [[B]]
 
-; GCN-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]]
+; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]]
+
+; VI-SAFE: v_cmp_nlt_f32_e32 vcc, [[ADD_A]], [[ADD_B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[ADD_B]], [[ADD_A]]
+
 ; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[ADD_A]], [[ADD_B]]
 
 ; EG: MAX
@@ -52,9 +65,14 @@ define amdgpu_kernel void @test_fmax_legacy_uge_f32_nnan_src(float addrspace(1)*
 }
 
 ; FUNC-LABEL: {{^}}test_fmax_legacy_oge_f32:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+
+; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+
+; VI-SAFE: v_cmp_ge_f32_e32 vcc, [[A]], [[B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
+
 ; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 ; EG: MAX
 define amdgpu_kernel void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
@@ -72,9 +90,15 @@ define amdgpu_kernel void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, fl
 }
 
 ; FUNC-LABEL: {{^}}test_fmax_legacy_ugt_f32:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+
+; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+
+; VI-SAFE: v_cmp_nle_f32_e32 vcc, [[A]], [[B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
+
+
 ; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 ; EG: MAX
 define amdgpu_kernel void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
@@ -92,9 +116,14 @@ define amdgpu_kernel void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, fl
 }
 
 ; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_f32:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+
+; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+
+; VI-SAFE: v_cmp_gt_f32_e32 vcc, [[A]], [[B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
+
 ; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 ; EG: MAX
 define amdgpu_kernel void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
@@ -112,9 +141,15 @@ define amdgpu_kernel void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, fl
 }
 
 ; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_v1f32:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+
+; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+
+; VI-SAFE: v_cmp_gt_f32_e32 vcc, [[A]], [[B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
+
+
 ; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 ; EG: MAX
 define amdgpu_kernel void @test_fmax_legacy_ogt_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 {
@@ -132,12 +167,24 @@ define amdgpu_kernel void @test_fmax_legacy_ogt_v1f32(<1 x float> addrspace(1)*
 }
 
 ; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_v3f32:
-; GCN-SAFE: v_max_legacy_f32_e32
-; GCN-SAFE: v_max_legacy_f32_e32
-; GCN-SAFE: v_max_legacy_f32_e32
+; SI-SAFE: v_max_legacy_f32_e32
+; SI-SAFE: v_max_legacy_f32_e32
+; SI-SAFE: v_max_legacy_f32_e32
+
+; VI-SAFE: v_cmp_gt_f32_e32
+; VI-SAFE: v_cndmask_b32_e32
+; VI-SAFE: v_cmp_gt_f32_e32
+; VI-SAFE: v_cndmask_b32_e32
+; VI-SAFE: v_cmp_gt_f32_e32
+; VI-SAFE: v_cndmask_b32_e32
+; VI-SAFE-NOT: v_cmp
+; VI-SAFE-NOT: v_cndmask
+
 ; GCN-NONAN: v_max_f32_e32
 ; GCN-NONAN: v_max_f32_e32
 ; GCN-NONAN: v_max_f32_e32
+
+; GCN-NOT: v_max
 define amdgpu_kernel void @test_fmax_legacy_ogt_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr <3 x float>, <3 x float> addrspace(1)* %in, i32 %tid
@@ -153,8 +200,8 @@ define amdgpu_kernel void @test_fmax_legacy_ogt_v3f32(<3 x float> addrspace(1)*
 }
 
 ; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_f32_multi_use:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN-NOT: v_max_
 ; GCN: v_cmp_gt_f32
 ; GCN-NEXT: v_cndmask_b32
diff --git a/test/CodeGen/AMDGPU/fmaxnum.ll b/test/CodeGen/AMDGPU/fmaxnum.ll
index 58b5b5282b0..7e16d1b883a 100644
--- a/test/CodeGen/AMDGPU/fmaxnum.ll
+++ b/test/CodeGen/AMDGPU/fmaxnum.ll
@@ -1,14 +1,26 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 
-; GCN-LABEL: {{^}}test_fmax_f32:
-; GCN: v_max_f32_e32
-define amdgpu_kernel void @test_fmax_f32(float addrspace(1)* %out, float %a, float %b) #0 {
-  %val = call float @llvm.maxnum.f32(float %a, float %b)
+; GCN-LABEL: {{^}}test_fmax_f32_ieee_mode_on:
+; GCN: v_mul_f32_e64 [[QUIET0:v[0-9]+]], 1.0, s{{[0-9]+}}
+; GCN: v_mul_f32_e64 [[QUIET1:v[0-9]+]], 1.0, s{{[0-9]+}}
+; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], [[QUIET1]], [[QUIET0]]
+; GCN-NOT: [[RESULT]]
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @test_fmax_f32_ieee_mode_on(float addrspace(1)* %out, float %a, float %b) #0 {
+  %val = call float @llvm.maxnum.f32(float %a, float %b) #1
   store float %val, float addrspace(1)* %out, align 4
   ret void
 }
 
+; GCN-LABEL: {{^}}test_fmax_f32_ieee_mode_off:
+; GCN: v_max_f32_e32 v0, v0, v1
+; GCN-NEXT: ; return
+define amdgpu_ps float @test_fmax_f32_ieee_mode_off(float %a, float %b) #0 {
+  %val = call float @llvm.maxnum.f32(float %a, float %b) #1
+  ret float %val
+}
+
 ; GCN-LABEL: {{^}}test_fmax_v2f32:
 ; GCN: v_max_f32_e32
 ; GCN: v_max_f32_e32
@@ -158,38 +170,34 @@ define amdgpu_kernel void @constant_fold_fmax_f32_n0_n0(float addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}fmax_var_immediate_f32:
+; GCN-LABEL: {{^}}fmax_var_immediate_f32_no_ieee:
 ; GCN: v_max_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, 2.0
-define amdgpu_kernel void @fmax_var_immediate_f32(float addrspace(1)* %out, float %a) #0 {
-  %val = call float @llvm.maxnum.f32(float %a, float 2.0)
-  store float %val, float addrspace(1)* %out, align 4
-  ret void
+define amdgpu_ps float @fmax_var_immediate_f32_no_ieee(float inreg %a) #0 {
+  %val = call float @llvm.maxnum.f32(float %a, float 2.0) #0
+  ret float %val
 }
 
-; GCN-LABEL: {{^}}fmax_immediate_var_f32:
+; GCN-LABEL: {{^}}fmax_immediate_var_f32_no_ieee:
 ; GCN: v_max_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, 2.0
-define amdgpu_kernel void @fmax_immediate_var_f32(float addrspace(1)* %out, float %a) #0 {
-  %val = call float @llvm.maxnum.f32(float 2.0, float %a)
-  store float %val, float addrspace(1)* %out, align 4
-  ret void
+define amdgpu_ps float @fmax_immediate_var_f32_no_ieee(float inreg %a) #0 {
+  %val = call float @llvm.maxnum.f32(float 2.0, float %a) #0
+  ret float %val
 }
 
-; GCN-LABEL: {{^}}fmax_var_literal_f32:
+; GCN-LABEL: {{^}}fmax_var_literal_f32_no_ieee:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
 ; GCN: v_max_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
-define amdgpu_kernel void @fmax_var_literal_f32(float addrspace(1)* %out, float %a) #0 {
-  %val = call float @llvm.maxnum.f32(float %a, float 99.0)
-  store float %val, float addrspace(1)* %out, align 4
-  ret void
+define amdgpu_ps float @fmax_var_literal_f32_no_ieee(float inreg %a) #0 {
+  %val = call float @llvm.maxnum.f32(float %a, float 99.0) #0
+  ret float %val
 }
 
-; GCN-LABEL: {{^}}fmax_literal_var_f32:
+; GCN-LABEL: {{^}}fmax_literal_var_f32_no_ieee:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
 ; GCN: v_max_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
-define amdgpu_kernel void @fmax_literal_var_f32(float addrspace(1)* %out, float %a) #0 {
-  %val = call float @llvm.maxnum.f32(float 99.0, float %a)
-  store float %val, float addrspace(1)* %out, align 4
-  ret void
+define amdgpu_ps float @fmax_literal_var_f32_no_ieee(float inreg %a) #0 {
+  %val = call float @llvm.maxnum.f32(float 99.0, float %a) #0
+  ret float %val
 }
 
 ; GCN-LABEL: {{^}}test_func_fmax_v3f32:
diff --git a/test/CodeGen/AMDGPU/fmin3.ll b/test/CodeGen/AMDGPU/fmin3.ll
index fa93fbcfb91..48d0eedba5b 100644
--- a/test/CodeGen/AMDGPU/fmin3.ll
+++ b/test/CodeGen/AMDGPU/fmin3.ll
@@ -95,22 +95,26 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(half addrspace(1)* %out, half ad
 ; SI-NEXT: v_min3_f32
 ; SI-NEXT: v_min3_f32
 
-; VI: v_min_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI: v_min_f16_e32 v0, v0, v1
-; VI: v_min_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI: v_min_f16_e32 v0, v2, v0
-; VI: v_min_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI: v_min_f16_e32 v0, v0, v3
-; VI: v_or_b32_e32 v0, v0, v1
-
-; GFX9: v_pk_min_f16
-; GFX9: v_pk_min_f16
-; GFX9: v_pk_min_f16
-define <2 x half> @no_fmin3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) {
+; VI: s_waitcnt
+; VI-NEXT: v_min_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_min_f16_e32 v0, v0, v1
+; VI-NEXT: v_min_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_min_f16_e32 v0, v2, v0
+; VI-NEXT: v_min_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT: v_min_f16_e32 v0, v0, v3
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: s_setpc_b64
+
+; GFX9: s_waitcnt
+; GFX9-NEXT: v_pk_min_f16 v0, v0, v1
+; GFX9-NEXT: v_pk_min_f16 v0, v2, v0
+; GFX9-NEXT: v_pk_min_f16 v0, v0, v3
+; GFX9-NEXT: s_setpc_b64
+define <2 x half> @no_fmin3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) #2 {
 entry:
-  %min = tail call fast <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
-  %min1 = tail call fast <2 x half> @llvm.minnum.v2f16(<2 x half> %c, <2 x half> %min)
-  %res = tail call fast <2 x half> @llvm.minnum.v2f16(<2 x half> %min1, <2 x half> %d)
+  %min = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
+  %min1 = call <2 x half> @llvm.minnum.v2f16(<2 x half> %c, <2 x half> %min)
+  %res = call <2 x half> @llvm.minnum.v2f16(<2 x half> %min1, <2 x half> %d)
   ret <2 x half> %res
 }
 
@@ -121,3 +125,4 @@ declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>)
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone speculatable }
+attributes #2 = { nounwind "no-nans-fp-math"="true" }
diff --git a/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll b/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll
index af1dabaa01e..731204eeaf6 100644
--- a/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll
+++ b/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll
@@ -1,9 +1,19 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN %s
-; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-NONAN -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI-SAFE,GCN,SI %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI-NONAN,GCN,SI %s
+
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-SAFE,GCN,VI %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-NONAN,GCN,VI %s
 
 ; GCN-LABEL: {{^}}min_fneg_select_regression_0:
-; GCN-SAFE: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0
-; GCN-NONAN: v_max_f32_e64 v{{[0-9]+}}, -v0, -1.0
+; GCN-NOT: v_mul
+
+; SI-SAFE: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0
+
+; VI-SAFE: v_cmp_nle_f32_e32 vcc, 1.0, v0
+; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
+; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+
+; GCN-NONAN: v_max_f32_e64 v0, -v0, -1.0
 define amdgpu_ps float @min_fneg_select_regression_0(float %a, float %b) #0 {
   %fneg.a = fsub float -0.0, %a
   %cmp.a = fcmp ult float %a, 1.0
@@ -12,7 +22,14 @@ define amdgpu_ps float @min_fneg_select_regression_0(float %a, float %b) #0 {
 }
 
 ; GCN-LABEL: {{^}}min_fneg_select_regression_posk_0:
-; GCN-SAFE: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0
+; GCN-NOT: v_mul
+
+; SI-SAFE: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0
+
+; VI-SAFE: v_cmp_nle_f32_e32 vcc, -1.0, v0
+; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, -1.0, v0, vcc
+; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+
 ; GCN-NONAN: v_max_f32_e64 v{{[0-9]+}}, -v0, 1.0
 define amdgpu_ps float @min_fneg_select_regression_posk_0(float %a, float %b) #0 {
   %fneg.a = fsub float -0.0, %a
@@ -22,9 +39,16 @@ define amdgpu_ps float @min_fneg_select_regression_posk_0(float %a, float %b) #0
 }
 
 ; GCN-LABEL: {{^}}max_fneg_select_regression_0:
-; GCN-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0
+; GCN-NOT: v_mul
+
+; SI-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0
+
+; VI-SAFE: v_cmp_nge_f32_e32 vcc, 1.0, v0
+; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
+; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+
 ; GCN-NONAN: v_min_f32_e64 [[MIN:v[0-9]+]], -v0, -1.0
-define amdgpu_ps float @max_fneg_select_regression_0(float %a, float %b) #0 {
+define amdgpu_ps float @max_fneg_select_regression_0(float %a) #0 {
   %fneg.a = fsub float -0.0, %a
   %cmp.a = fcmp ugt float %a, 1.0
   %min.a = select i1 %cmp.a, float %fneg.a, float -1.0
@@ -32,9 +56,16 @@ define amdgpu_ps float @max_fneg_select_regression_0(float %a, float %b) #0 {
 }
 
 ; GCN-LABEL: {{^}}max_fneg_select_regression_posk_0:
-; GCN-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0
+; GCN-NOT: v_mul
+
+; SI-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0
+
+; VI-SAFE: v_cmp_nge_f32_e32 vcc, -1.0, v0
+; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, -1.0, v0, vcc
+; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+
 ; GCN-NONAN: v_min_f32_e64 [[MIN:v[0-9]+]], -v0, 1.0
-define amdgpu_ps float @max_fneg_select_regression_posk_0(float %a, float %b) #0 {
+define amdgpu_ps float @max_fneg_select_regression_posk_0(float %a) #0 {
   %fneg.a = fsub float -0.0, %a
   %cmp.a = fcmp ugt float %a, -1.0
   %min.a = select i1 %cmp.a, float %fneg.a, float 1.0
diff --git a/test/CodeGen/AMDGPU/fmin_legacy.f16.ll b/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
index 20057307354..19d4c316ec6 100644
--- a/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
+++ b/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
@@ -98,7 +98,7 @@ define <2 x half> @test_fmin_legacy_ule_v2f16(<2 x half> %a, <2 x half> %b) #0 {
 ; VI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NNAN-NEXT:    v_min_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NNAN-NEXT:    v_min_f16_e32 v0, v0, v1
-; VI-NNAN-NEXT:    v_or_b32_e32 v0, v0, v2
+; VI-NNAN-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SAFE-LABEL: test_fmin_legacy_ule_v2f16:
@@ -179,7 +179,7 @@ define <3 x half> @test_fmin_legacy_ule_v3f16(<3 x half> %a, <3 x half> %b) #0 {
 ; VI-NNAN-NEXT:    v_min_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NNAN-NEXT:    v_min_f16_e32 v0, v0, v2
 ; VI-NNAN-NEXT:    v_min_f16_e32 v1, v1, v3
-; VI-NNAN-NEXT:    v_or_b32_e32 v0, v0, v4
+; VI-NNAN-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SAFE-LABEL: test_fmin_legacy_ule_v3f16:
@@ -284,8 +284,8 @@ define <4 x half> @test_fmin_legacy_ule_v4f16(<4 x half> %a, <4 x half> %b) #0 {
 ; VI-NNAN-NEXT:    v_min_f16_e32 v1, v1, v3
 ; VI-NNAN-NEXT:    v_min_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NNAN-NEXT:    v_min_f16_e32 v0, v0, v2
-; VI-NNAN-NEXT:    v_or_b32_e32 v0, v0, v5
-; VI-NNAN-NEXT:    v_or_b32_e32 v1, v1, v4
+; VI-NNAN-NEXT:    v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NNAN-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SAFE-LABEL: test_fmin_legacy_ule_v4f16:
@@ -438,10 +438,10 @@ define <8 x half> @test_fmin_legacy_ule_v8f16(<8 x half> %a, <8 x half> %b) #0 {
 ; VI-NNAN-NEXT:    v_min_f16_e32 v1, v1, v5
 ; VI-NNAN-NEXT:    v_min_f16_sdwa v11, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NNAN-NEXT:    v_min_f16_e32 v0, v0, v4
-; VI-NNAN-NEXT:    v_or_b32_e32 v0, v0, v11
-; VI-NNAN-NEXT:    v_or_b32_e32 v1, v1, v10
-; VI-NNAN-NEXT:    v_or_b32_e32 v2, v2, v9
-; VI-NNAN-NEXT:    v_or_b32_e32 v3, v3, v8
+; VI-NNAN-NEXT:    v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NNAN-NEXT:    v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NNAN-NEXT:    v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NNAN-NEXT:    v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SAFE-LABEL: test_fmin_legacy_ule_v8f16:
diff --git a/test/CodeGen/AMDGPU/fmin_legacy.ll b/test/CodeGen/AMDGPU/fmin_legacy.ll
index e0acbaf59db..ca80c4edbfb 100644
--- a/test/CodeGen/AMDGPU/fmin_legacy.ll
+++ b/test/CodeGen/AMDGPU/fmin_legacy.ll
@@ -1,5 +1,9 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN-NONAN -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN-SAFE,SI-SAFE,GCN,FUNC %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI-NONAN,GCN-NONAN,GCN,FUNC %s
+
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-SAFE,GCN-SAFE,GCN,FUNC %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-NONAN,GCN-NONAN,GCN,FUNC %s
+
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s
 
 declare i32 @llvm.r600.read.tidig.x() #1
@@ -10,8 +14,13 @@ declare i32 @llvm.r600.read.tidig.x() #1
 
 ; FUNC-LABEL: {{^}}s_test_fmin_legacy_subreg_inputs_f32:
 ; EG: MIN *
-; GCN-SAFE: v_min_legacy_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-; GCN-NONAN: v_min_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+; SI-SAFE: v_min_legacy_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+
+; SI-NONAN: v_min_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+
+; VI-SAFE: v_cmp_nlt_f32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
+
+; VI-NONAN: v_min_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
 define amdgpu_kernel void @s_test_fmin_legacy_subreg_inputs_f32(float addrspace(1)* %out, <4 x float> %reg0) #0 {
    %r0 = extractelement <4 x float> %reg0, i32 0
    %r1 = extractelement <4 x float> %reg0, i32 1
@@ -22,13 +31,17 @@ define amdgpu_kernel void @s_test_fmin_legacy_subreg_inputs_f32(float addrspace(
 }
 
 ; FUNC-LABEL: {{^}}s_test_fmin_legacy_ule_f32:
-; GCN-DAG: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; GCN-DAG: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
 
 ; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], s[[B]]
 
-; GCN-SAFE: v_min_legacy_f32_e64 {{v[0-9]+}}, [[VB]], s[[A]]
-; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, s[[A]], [[VB]]
+; SI-SAFE: v_min_legacy_f32_e64 {{v[0-9]+}}, [[VB]], s[[A]]
+
+; VI-SAFE: v_mov_b32_e32 [[VA:v[0-9]+]], s[[A]]
+; VI-SAFE: v_cmp_ngt_f32_e32 vcc, s[[A]], [[VB]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[VB]], [[VA]]
 
+; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, s[[A]], [[VB]]
 define amdgpu_kernel void @s_test_fmin_legacy_ule_f32(float addrspace(1)* %out, float %a, float %b) #0 {
   %cmp = fcmp ule float %a, %b
   %val = select i1 %cmp, float %a, float %b
@@ -36,13 +49,19 @@ define amdgpu_kernel void @s_test_fmin_legacy_ule_f32(float addrspace(1)* %out,
   ret void
 }
 
+; Nsz also needed
+; FIXME: Should separate tests
 ; GCN-LABEL: {{^}}s_test_fmin_legacy_ule_f32_nnan_src:
-; GCN: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; GCN: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
 
 ; GCN-DAG: v_add_f32_e64 [[ADD_A:v[0-9]+]], s[[A]], 1.0
 ; GCN-DAG: v_add_f32_e64 [[ADD_B:v[0-9]+]], s[[B]], 2.0
 
-; GCN-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]]
+; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]]
+
+; VI-SAFE: v_cmp_ngt_f32_e32 vcc, [[ADD_A]], [[ADD_B]]
+; VI-SAFE: v_cndmask_b32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]], vcc
+
 ; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[ADD_A]], [[ADD_B]]
 define amdgpu_kernel void @s_test_fmin_legacy_ule_f32_nnan_src(float addrspace(1)* %out, float %a, float %b) #0 {
   %a.nnan = fadd nnan float %a, 1.0
@@ -54,9 +73,14 @@ define amdgpu_kernel void @s_test_fmin_legacy_ule_f32_nnan_src(float addrspace(1
 }
 
 ; FUNC-LABEL: {{^}}test_fmin_legacy_ule_f32:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+
+; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+
+; VI-SAFE: v_cmp_ngt_f32_e32 vcc, [[A]], [[B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
+
 ; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 define amdgpu_kernel void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
@@ -73,9 +97,14 @@ define amdgpu_kernel void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, fl
 }
 
 ; FUNC-LABEL: {{^}}test_fmin_legacy_ole_f32:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+
+; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+
+; VI-SAFE v_cmp_le_f32_e32 vcc, [[A]], [[B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
+
 ; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 define amdgpu_kernel void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
@@ -92,9 +121,14 @@ define amdgpu_kernel void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, fl
 }
 
 ; FUNC-LABEL: {{^}}test_fmin_legacy_olt_f32:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+
+; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+
+; VI-SAFE v_cmp_lt_f32_e32 vcc, [[A]], [[B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
+
 ; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 define amdgpu_kernel void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
@@ -111,9 +145,14 @@ define amdgpu_kernel void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, fl
 }
 
 ; FUNC-LABEL: {{^}}test_fmin_legacy_ult_f32:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+
+; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+
+; VI-SAFE v_cmp_lt_f32_e32 vcc, [[A]], [[B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
+
 ; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 define amdgpu_kernel void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
@@ -130,9 +169,14 @@ define amdgpu_kernel void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, fl
 }
 
 ; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v1f32:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+
+; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+
+; VI-SAFE v_cmp_lt_f32_e32 vcc, [[A]], [[B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
+
 ; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 define amdgpu_kernel void @test_fmin_legacy_ult_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
@@ -149,10 +193,15 @@ define amdgpu_kernel void @test_fmin_legacy_ult_v1f32(<1 x float> addrspace(1)*
 }
 
 ; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v2f32:
-; GCN: buffer_load_dwordx2
-; GCN: buffer_load_dwordx2
-; GCN-SAFE: v_min_legacy_f32_e32
-; GCN-SAFE: v_min_legacy_f32_e32
+; GCN: {{buffer|flat}}_load_dwordx2
+; GCN: {{buffer|flat}}_load_dwordx2
+; SI-SAFE: v_min_legacy_f32_e32
+; SI-SAFE: v_min_legacy_f32_e32
+
+; VI-SAFE v_cmp_lt_f32_e32
+; VI-SAFE: v_cndmask_b32_e32
+; VI-SAFE v_cmp_lt_f32_e32
+; VI-SAFE: v_cndmask_b32_e32
 
 ; GCN-NONAN: v_min_f32_e32
 ; GCN-NONAN: v_min_f32_e32
@@ -171,13 +220,24 @@ define amdgpu_kernel void @test_fmin_legacy_ult_v2f32(<2 x float> addrspace(1)*
 }
 
 ; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v3f32:
-; GCN-SAFE: v_min_legacy_f32_e32
-; GCN-SAFE: v_min_legacy_f32_e32
-; GCN-SAFE: v_min_legacy_f32_e32
+; SI-SAFE: v_min_legacy_f32_e32
+; SI-SAFE: v_min_legacy_f32_e32
+; SI-SAFE: v_min_legacy_f32_e32
+; SI-SAFE-NOT: v_min_
+
+; VI-SAFE: v_cmp_nge_f32_e32
+; VI-SAFE: v_cndmask_b32_e32
+; VI-SAFE: v_cmp_nge_f32_e32
+; VI-SAFE: v_cndmask_b32_e32
+; VI-SAFE: v_cmp_nge_f32_e32
+; VI-SAFE: v_cndmask_b32_e32
+; VI-NOT: v_cmp
+; VI-NOT: v_cndmask
 
 ; GCN-NONAN: v_min_f32_e32
 ; GCN-NONAN: v_min_f32_e32
 ; GCN-NONAN: v_min_f32_e32
+; GCN-NONAN-NOT: v_min_
 define amdgpu_kernel void @test_fmin_legacy_ult_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr <3 x float>, <3 x float> addrspace(1)* %in, i32 %tid
@@ -193,8 +253,8 @@ define amdgpu_kernel void @test_fmin_legacy_ult_v3f32(<3 x float> addrspace(1)*
 }
 
 ; FUNC-LABEL: {{^}}test_fmin_legacy_ole_f32_multi_use:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN-NOT: v_min
 ; GCN: v_cmp_le_f32
 ; GCN-NEXT: v_cndmask_b32
diff --git a/test/CodeGen/AMDGPU/fminnum.f64.ll b/test/CodeGen/AMDGPU/fminnum.f64.ll
index 475615e52cb..e37a1cead47 100644
--- a/test/CodeGen/AMDGPU/fminnum.f64.ll
+++ b/test/CodeGen/AMDGPU/fminnum.f64.ll
@@ -7,15 +7,35 @@ declare <4 x double> @llvm.minnum.v4f64(<4 x double>, <4 x double>) #0
 declare <8 x double> @llvm.minnum.v8f64(<8 x double>, <8 x double>) #0
 declare <16 x double> @llvm.minnum.v16f64(<16 x double>, <16 x double>) #0
 
-; FUNC-LABEL: @test_fmin_f64
-; SI: v_min_f64
-define amdgpu_kernel void @test_fmin_f64(double addrspace(1)* %out, double %a, double %b) nounwind {
+; FUNC-LABEL: {{^}}test_fmin_f64_ieee:
+; SI: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]]
+; SI: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]]
+; SI-DAG: v_max_f64 [[QUIETA:v\[[0-9]+:[0-9]+\]]], [[A]], [[A]]
+; SI-DAG: v_max_f64 [[QUIETB:v\[[0-9]+:[0-9]+\]]], [[B]], [[B]]
+; SI: v_min_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[QUIETA]], [[QUIETB]]
+define amdgpu_kernel void @test_fmin_f64_ieee([8 x i32], double %a, [8 x i32], double %b) nounwind {
+  %val = call double @llvm.minnum.f64(double %a, double %b) #0
+  store double %val, double addrspace(1)* undef, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_fmin_f64_no_ieee:
+; SI: ds_read_b64 [[VAL0:v\[[0-9]+:[0-9]+\]]]
+; SI: ds_read_b64 [[VAL1:v\[[0-9]+:[0-9]+\]]]
+; SI-NOT: [[VAL0]]
+; SI-NOT: [[VAL1]]
+; SI: v_min_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VAL0]], [[VAL1]]
+; SI-NOT: [[RESULT]]
+; SI: ds_write_b64 v{{[0-9]+}}, [[RESULT]]
+define amdgpu_ps void @test_fmin_f64_no_ieee() nounwind {
+  %a = load volatile double, double addrspace(3)* undef
+  %b = load volatile double, double addrspace(3)* undef
   %val = call double @llvm.minnum.f64(double %a, double %b) #0
-  store double %val, double addrspace(1)* %out, align 8
+  store volatile double %val, double addrspace(3)* undef
   ret void
 }
 
-; FUNC-LABEL: @test_fmin_v2f64
+; FUNC-LABEL: {{^}}test_fmin_v2f64:
 ; SI: v_min_f64
 ; SI: v_min_f64
 define amdgpu_kernel void @test_fmin_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
@@ -24,7 +44,7 @@ define amdgpu_kernel void @test_fmin_v2f64(<2 x double> addrspace(1)* %out, <2 x
   ret void
 }
 
-; FUNC-LABEL: @test_fmin_v4f64
+; FUNC-LABEL: {{^}}test_fmin_v4f64:
 ; SI: v_min_f64
 ; SI: v_min_f64
 ; SI: v_min_f64
@@ -35,7 +55,7 @@ define amdgpu_kernel void @test_fmin_v4f64(<4 x double> addrspace(1)* %out, <4 x
   ret void
 }
 
-; FUNC-LABEL: @test_fmin_v8f64
+; FUNC-LABEL: {{^}}test_fmin_v8f64:
 ; SI: v_min_f64
 ; SI: v_min_f64
 ; SI: v_min_f64
@@ -50,7 +70,7 @@ define amdgpu_kernel void @test_fmin_v8f64(<8 x double> addrspace(1)* %out, <8 x
   ret void
 }
 
-; FUNC-LABEL: @test_fmin_v16f64
+; FUNC-LABEL: {{^}}test_fmin_v16f64:
 ; SI: v_min_f64
 ; SI: v_min_f64
 ; SI: v_min_f64
diff --git a/test/CodeGen/AMDGPU/fminnum.ll b/test/CodeGen/AMDGPU/fminnum.ll
index a0642e211f1..a8574b288f5 100644
--- a/test/CodeGen/AMDGPU/fminnum.ll
+++ b/test/CodeGen/AMDGPU/fminnum.ll
@@ -1,14 +1,45 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 
-; GCN-LABEL: {{^}}test_fmin_f32:
-; GCN: v_min_f32_e32
-define amdgpu_kernel void @test_fmin_f32(float addrspace(1)* %out, float %a, float %b) #0 {
-  %val = call float @llvm.minnum.f32(float %a, float %b)
+; GCN-LABEL: {{^}}test_fmin_f32_ieee_mode_on:
+; GCN: v_mul_f32_e64 [[QUIET0:v[0-9]+]], 1.0, s{{[0-9]+}}
+; GCN: v_mul_f32_e64 [[QUIET1:v[0-9]+]], 1.0, s{{[0-9]+}}
+; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], [[QUIET1]], [[QUIET0]]
+; GCN-NOT: [[RESULT]]
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @test_fmin_f32_ieee_mode_on(float addrspace(1)* %out, float %a, float %b) #0 {
+  %val = call float @llvm.minnum.f32(float %a, float %b) #1
   store float %val, float addrspace(1)* %out, align 4
   ret void
 }
 
+; GCN-LABEL: {{^}}test_fmin_nnan_f32_ieee_mode_on:
+; GCN: s_waitcnt
+; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: s_setpc_b64
+define float @test_fmin_nnan_f32_ieee_mode_on(float %a, float %b) #0 {
+  %val = call nnan float @llvm.minnum.f32(float %a, float %b) #1
+  ret float %val
+}
+
+; GCN-LABEL: {{^}}test_fmin_nnan_f32_ieee_mode_off:
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: ; return
+define amdgpu_ps float @test_fmin_nnan_f32_ieee_mode_off(float %a, float %b) #0 {
+  %val = call nnan float @llvm.minnum.f32(float %a, float %b) #1
+  ret float %val
+}
+
+; GCN-LABEL: {{^}}test_fmin_f32_ieee_mode_off:
+; GCN: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: ; return
+define amdgpu_ps float @test_fmin_f32_ieee_mode_off(float %a, float %b) #0 {
+  %val = call float @llvm.minnum.f32(float %a, float %b) #1
+  ret float %val
+}
+
 ; GCN-LABEL: {{^}}test_fmin_v2f32:
 ; GCN: v_min_f32_e32
 ; GCN: v_min_f32_e32
@@ -147,38 +178,34 @@ define amdgpu_kernel void @constant_fold_fmin_f32_n0_n0(float addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}fmin_var_immediate_f32:
-; GCN: v_min_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, 2.0
-define amdgpu_kernel void @fmin_var_immediate_f32(float addrspace(1)* %out, float %a) #0 {
-  %val = call float @llvm.minnum.f32(float %a, float 2.0)
-  store float %val, float addrspace(1)* %out, align 4
-  ret void
+; GCN-LABEL: {{^}}fmin_var_immediate_f32_no_ieee:
+; GCN: v_min_f32_e32 v0, 2.0, v0
+define amdgpu_ps float @fmin_var_immediate_f32_no_ieee(float %a) #0 {
+  %val = call float @llvm.minnum.f32(float %a, float 2.0) #1
+  ret float %val
 }
 
-; GCN-LABEL: {{^}}fmin_immediate_var_f32:
+; GCN-LABEL: {{^}}fmin_immediate_var_f32_no_ieee:
 ; GCN: v_min_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, 2.0
-define amdgpu_kernel void @fmin_immediate_var_f32(float addrspace(1)* %out, float %a) #0 {
-  %val = call float @llvm.minnum.f32(float 2.0, float %a)
-  store float %val, float addrspace(1)* %out, align 4
-  ret void
+define amdgpu_ps float @fmin_immediate_var_f32_no_ieee(float inreg %a) #0 {
+  %val = call float @llvm.minnum.f32(float 2.0, float %a) #1
+  ret float %val
 }
 
-; GCN-LABEL: {{^}}fmin_var_literal_f32:
+; GCN-LABEL: {{^}}fmin_var_literal_f32_no_ieee:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
 ; GCN: v_min_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
-define amdgpu_kernel void @fmin_var_literal_f32(float addrspace(1)* %out, float %a) #0 {
-  %val = call float @llvm.minnum.f32(float %a, float 99.0)
-  store float %val, float addrspace(1)* %out, align 4
-  ret void
+define amdgpu_ps float @fmin_var_literal_f32_no_ieee(float inreg %a) #0 {
+  %val = call float @llvm.minnum.f32(float %a, float 99.0) #1
+  ret float %val
 }
 
-; GCN-LABEL: {{^}}fmin_literal_var_f32:
+; GCN-LABEL: {{^}}fmin_literal_var_f32_no_ieee:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
 ; GCN: v_min_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
-define amdgpu_kernel void @fmin_literal_var_f32(float addrspace(1)* %out, float %a) #0 {
-  %val = call float @llvm.minnum.f32(float 99.0, float %a)
-  store float %val, float addrspace(1)* %out, align 4
-  ret void
+define amdgpu_ps float @fmin_literal_var_f32_no_ieee(float inreg %a) #0 {
+  %val = call float @llvm.minnum.f32(float 99.0, float %a) #1
+  ret float %val
 }
 
 ; GCN-LABEL: {{^}}test_func_fmin_v3f32:
diff --git a/test/CodeGen/AMDGPU/fneg-combines.ll b/test/CodeGen/AMDGPU/fneg-combines.ll
index 23e86351028..e57ebc9c061 100644
--- a/test/CodeGen/AMDGPU/fneg-combines.ll
+++ b/test/CodeGen/AMDGPU/fneg-combines.ll
@@ -396,12 +396,14 @@ define amdgpu_kernel void @v_fneg_mul_multi_use_fneg_x_f32(float addrspace(1)* %
 ; fminnum tests
 ; --------------------------------------------------------------------------------
 
-; GCN-LABEL: {{^}}v_fneg_minnum_f32:
+; GCN-LABEL: {{^}}v_fneg_minnum_f32_ieee:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[B]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]]
+; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -415,11 +417,23 @@ define amdgpu_kernel void @v_fneg_minnum_f32(float addrspace(1)* %out, float add
   ret void
 }
 
-; GCN-LABEL: {{^}}v_fneg_self_minnum_f32:
+; GCN-LABEL: {{^}}v_fneg_minnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN: v_max_f32_e64 v0, -v0, -v1
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_minnum_f32_no_ieee(float %a, float %b) #0 {
+  %min = call float @llvm.minnum.f32(float %a, float %b)
+  %fneg = fsub float -0.000000e+00, %min
+  ret float %fneg
+}
+
+; GCN-LABEL: {{^}}v_fneg_self_minnum_f32_ieee:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
-; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[A]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
+; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_A]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_self_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_self_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -431,11 +445,22 @@ define amdgpu_kernel void @v_fneg_self_minnum_f32(float addrspace(1)* %out, floa
   ret void
 }
 
-; GCN-LABEL: {{^}}v_fneg_posk_minnum_f32:
+; GCN-LABEL: {{^}}v_fneg_self_minnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN: v_max_f32_e64 v0, -v0, -v0
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_self_minnum_f32_no_ieee(float %a) #0 {
+  %min = call float @llvm.minnum.f32(float %a, float %a)
+  %min.fneg = fsub float -0.0, %min
+  ret float %min.fneg
+}
+
+; GCN-LABEL: {{^}}v_fneg_posk_minnum_f32_ieee:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
-; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -4.0
+; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
+; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], -4.0, [[QUIET_NEG_A]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_posk_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_posk_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -447,11 +472,22 @@ define amdgpu_kernel void @v_fneg_posk_minnum_f32(float addrspace(1)* %out, floa
   ret void
 }
 
-; GCN-LABEL: {{^}}v_fneg_negk_minnum_f32:
+; GCN-LABEL: {{^}}v_fneg_posk_minnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN: v_max_f32_e64 v0, -v0, -4.0
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_posk_minnum_f32_no_ieee(float %a) #0 {
+  %min = call float @llvm.minnum.f32(float 4.0, float %a)
+  %fneg = fsub float -0.000000e+00, %min
+  ret float %fneg
+}
+
+; GCN-LABEL: {{^}}v_fneg_negk_minnum_f32_ieee:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
-; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 4.0
+; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
+; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], 4.0, [[QUIET_NEG_A]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_negk_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_negk_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -463,6 +499,16 @@ define amdgpu_kernel void @v_fneg_negk_minnum_f32(float addrspace(1)* %out, floa
   ret void
 }
 
+; GCN-LABEL: {{^}}v_fneg_negk_minnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN: v_max_f32_e64 v0, -v0, 4.0
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_negk_minnum_f32_no_ieee(float %a) #0 {
+  %min = call float @llvm.minnum.f32(float -4.0, float %a)
+  %fneg = fsub float -0.000000e+00, %min
+  ret float %fneg
+}
+
 ; GCN-LABEL: {{^}}v_fneg_0_minnum_f32:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[A]]
@@ -479,11 +525,12 @@ define amdgpu_kernel void @v_fneg_0_minnum_f32(float addrspace(1)* %out, float a
   ret void
 }
 
-; GCN-LABEL: {{^}}v_fneg_neg0_minnum_f32:
+; GCN-LABEL: {{^}}v_fneg_neg0_minnum_f32_ieee:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
-; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 0
+; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
+; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_NEG_A]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_neg0_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_neg0_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -498,10 +545,11 @@ define amdgpu_kernel void @v_fneg_neg0_minnum_f32(float addrspace(1)* %out, floa
 ; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_f32:
 ; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 
-; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0xbe22f983
-; SI: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[K]]
+; SI-DAG: v_mul_f32_e32 [[QUIET_NEG:v[0-9]+]], -1.0, [[A]]
+; SI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0xbe22f983, [[QUIET_NEG]]
 
-; VI: v_min_f32_e32 [[MAX:v[0-9]+]], 0.15915494, [[A]]
+; VI: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[A]]
+; VI: v_min_f32_e32 [[MAX:v[0-9]+]], 0.15915494, [[QUIET]]
 ; VI: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, [[MAX]]
 
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -520,10 +568,11 @@ define amdgpu_kernel void @v_fneg_inv2pi_minnum_f32(float addrspace(1)* %out, fl
 ; GCN-LABEL: {{^}}v_fneg_neg_inv2pi_minnum_f32:
 ; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 
-; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x3e22f983
-; SI: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[K]]
+; SI: v_mul_f32_e32 [[NEG_QUIET:v[0-9]+]], -1.0, [[A]]
+; SI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0x3e22f983, [[NEG_QUIET]]
 
-; VI: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 0.15915494
+; VI: v_mul_f32_e32 [[NEG_QUIET:v[0-9]+]], -1.0, [[A]]
+; VI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0.15915494, [[NEG_QUIET]]
 
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
 define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
@@ -545,7 +594,8 @@ define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f32(float addrspace(1)* %out
 ; SI: v_max_f32_e32 [[MAX:v[0-9]+]], 0xbe230000, [[CVT]]
 ; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[MAX]]
 
-; VI: v_min_f16_e32 [[MAX:v[0-9]+]], 0.15915494, [[A]]
+; VI: v_max_f16_e32 [[QUIET:v[0-9]+]], [[A]], [[A]]
+; VI: v_min_f16_e32 [[MAX:v[0-9]+]], 0.15915494, [[QUIET]]
 ; VI: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x8000, [[MAX]]
 
 ; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -568,7 +618,8 @@ define amdgpu_kernel void @v_fneg_inv2pi_minnum_f16(half addrspace(1)* %out, hal
 ; SI: v_max_f32_e32 [[MAX:v[0-9]+]], 0x3e230000, [[CVT]]
 ; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[MAX]]
 
-; VI: v_max_f16_e64 [[RESULT:v[0-9]+]], -[[A]], 0.15915494
+; VI: v_max_f16_e64 [[NEG_QUIET:v[0-9]+]], -[[A]], -[[A]]
+; VI: v_max_f16_e32 [[RESULT:v[0-9]+]], 0.15915494, [[NEG_QUIET]]
 
 ; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
 define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f16(half addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
@@ -588,7 +639,8 @@ define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f16(half addrspace(1)* %out,
 
 ; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0xbfc45f30
 ; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0x6dc9c882
-; SI: v_max_f64 v{{\[}}[[RESULT_LO:[0-9]+]]:[[RESULT_HI:[0-9]+]]{{\]}}, -[[A]], s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
+; SI-DAG: v_max_f64 [[NEG_QUIET:v\[[0-9]+:[0-9]+\]]], -[[A]], -[[A]]
+; SI: v_max_f64 v{{\[}}[[RESULT_LO:[0-9]+]]:[[RESULT_HI:[0-9]+]]{{\]}}, [[NEG_QUIET]], s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
 
 ; VI: v_min_f64 v{{\[}}[[RESULT_LO:[0-9]+]]:[[RESULT_HI:[0-9]+]]{{\]}}, [[A]], 0.15915494
 ; VI: v_xor_b32_e32 v[[RESULT_HI]], 0x80000000, v[[RESULT_HI]]
@@ -611,9 +663,11 @@ define amdgpu_kernel void @v_fneg_inv2pi_minnum_f64(double addrspace(1)* %out, d
 
 ; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0x3fc45f30
 ; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0x6dc9c882
-; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
+; SI-DAG: v_max_f64 [[NEG_QUIET:v\[[0-9]+:[0-9]+\]]], -[[A]], -[[A]]
+; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[NEG_QUIET]], s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
 
-; VI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], 0.15915494
+; VI: v_max_f64 [[NEG_QUIET:v\[[0-9]+:[0-9]+\]]], -[[A]], -[[A]]
+; VI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[NEG_QUIET]], 0.15915494
 
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
 define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f64(double addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
@@ -638,13 +692,14 @@ define amdgpu_ps float @v_fneg_neg0_minnum_f32_no_ieee(float %a) #0 {
   ret float %fneg
 }
 
-; GCN-LABEL: {{^}}v_fneg_0_minnum_foldable_use_f32:
+; GCN-LABEL: {{^}}v_fneg_0_minnum_foldable_use_f32_ieee:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, [[A]]
+; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]]
+; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, [[QUIET_A]]
 ; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], [[B]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_0_minnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_0_minnum_foldable_use_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -660,15 +715,16 @@ define amdgpu_kernel void @v_fneg_0_minnum_foldable_use_f32(float addrspace(1)*
 }
 
 ; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_foldable_use_f32:
-; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0xbe22f983
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 
+; SI: v_mul_f32_e32 [[QUIET_NEG:v[0-9]+]], -1.0, [[A]]
 
-; SI: v_max_f32_e64 [[MIN:v[0-9]+]], -[[A]], [[K]]
+; SI: v_max_f32_e32 [[MIN:v[0-9]+]], 0xbe22f983, [[QUIET_NEG]]
 ; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[MIN]], [[B]]
 
-; VI: v_min_f32_e32 [[MIN:v[0-9]+]], 0.15915494, [[A]]
+; VI: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[A]]
+; VI: v_min_f32_e32 [[MIN:v[0-9]+]], 0.15915494, [[QUIET]]
 ; VI: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], [[B]]
 
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -687,14 +743,29 @@ define amdgpu_kernel void @v_fneg_inv2pi_minnum_foldable_use_f32(float addrspace
   ret void
 }
 
-; GCN-LABEL: {{^}}v_fneg_minnum_multi_use_minnum_f32:
+; GCN-LABEL: {{^}}v_fneg_0_minnum_foldable_use_f32_no_ieee:
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, v0
+; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], v1
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_0_minnum_foldable_use_f32_no_ieee(float %a, float %b) #0 {
+  %min = call float @llvm.minnum.f32(float 0.0, float %a)
+  %fneg = fsub float -0.000000e+00, %min
+  %mul = fmul float %fneg, %b
+  ret float %mul
+}
+
+; GCN-LABEL: {{^}}v_fneg_minnum_multi_use_minnum_f32_ieee:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_max_f32_e64 [[MAX0:v[0-9]+]], -[[A]], -[[B]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]]
+; GCN: v_max_f32_e32 [[MAX0:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
 ; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MAX0]]
 ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MAX0]]
 ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
-define amdgpu_kernel void @v_fneg_minnum_multi_use_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_minnum_multi_use_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -710,16 +781,34 @@ define amdgpu_kernel void @v_fneg_minnum_multi_use_minnum_f32(float addrspace(1)
   ret void
 }
 
+; GCN-LABEL: {{^}}v_fneg_minnum_multi_use_minnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN: v_max_f32_e64 v0, -v0, -v1
+; GCN-NEXT: v_mul_f32_e32 v1, -4.0, v0
+; GCN-NEXT: ; return
+define amdgpu_ps <2 x float> @v_fneg_minnum_multi_use_minnum_f32_no_ieee(float %a, float %b) #0 {
+  %min = call float @llvm.minnum.f32(float %a, float %b)
+  %fneg = fsub float -0.000000e+00, %min
+  %use1 = fmul float %min, 4.0
+  %ins0 = insertelement <2 x float> undef, float %fneg, i32 0
+  %ins1 = insertelement <2 x float> %ins0, float %use1, i32 1
+  ret <2 x float> %ins1
+}
+
 ; --------------------------------------------------------------------------------
 ; fmaxnum tests
 ; --------------------------------------------------------------------------------
 
-; GCN-LABEL: {{^}}v_fneg_maxnum_f32:
+
+; GCN-LABEL: {{^}}v_fneg_maxnum_f32_ieee:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[B]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]]
+; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -727,60 +816,104 @@ define amdgpu_kernel void @v_fneg_maxnum_f32(float addrspace(1)* %out, float add
   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
   %a = load volatile float, float addrspace(1)* %a.gep
   %b = load volatile float, float addrspace(1)* %b.gep
-  %min = call float @llvm.maxnum.f32(float %a, float %b)
-  %fneg = fsub float -0.000000e+00, %min
+  %max = call float @llvm.maxnum.f32(float %a, float %b)
+  %fneg = fsub float -0.000000e+00, %max
   store float %fneg, float addrspace(1)* %out.gep
   ret void
 }
 
-; GCN-LABEL: {{^}}v_fneg_self_maxnum_f32:
+; GCN-LABEL: {{^}}v_fneg_maxnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN: v_min_f32_e64 v0, -v0, -v1
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_maxnum_f32_no_ieee(float %a, float %b) #0 {
+  %max = call float @llvm.maxnum.f32(float %a, float %b)
+  %fneg = fsub float -0.000000e+00, %max
+  ret float %fneg
+}
+
+; GCN-LABEL: {{^}}v_fneg_self_maxnum_f32_ieee:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
-; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[A]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
+; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_A]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_self_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_self_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
   %a = load volatile float, float addrspace(1)* %a.gep
-  %min = call float @llvm.maxnum.f32(float %a, float %a)
-  %min.fneg = fsub float -0.0, %min
-  store float %min.fneg, float addrspace(1)* %out.gep
+  %max = call float @llvm.maxnum.f32(float %a, float %a)
+  %max.fneg = fsub float -0.0, %max
+  store float %max.fneg, float addrspace(1)* %out.gep
   ret void
 }
 
-; GCN-LABEL: {{^}}v_fneg_posk_maxnum_f32:
+; GCN-LABEL: {{^}}v_fneg_self_maxnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN: v_min_f32_e64 v0, -v0, -v0
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_self_maxnum_f32_no_ieee(float %a) #0 {
+  %max = call float @llvm.maxnum.f32(float %a, float %a)
+  %max.fneg = fsub float -0.0, %max
+  ret float %max.fneg
+}
+
+; GCN-LABEL: {{^}}v_fneg_posk_maxnum_f32_ieee:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
-; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -4.0
+; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
+; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], -4.0, [[QUIET_NEG_A]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_posk_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_posk_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
   %a = load volatile float, float addrspace(1)* %a.gep
-  %min = call float @llvm.maxnum.f32(float 4.0, float %a)
-  %fneg = fsub float -0.000000e+00, %min
+  %max = call float @llvm.maxnum.f32(float 4.0, float %a)
+  %fneg = fsub float -0.000000e+00, %max
   store float %fneg, float addrspace(1)* %out.gep
   ret void
 }
 
-; GCN-LABEL: {{^}}v_fneg_negk_maxnum_f32:
+; GCN-LABEL: {{^}}v_fneg_posk_maxnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN: v_min_f32_e64 v0, -v0, -4.0
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_posk_maxnum_f32_no_ieee(float %a) #0 {
+  %max = call float @llvm.maxnum.f32(float 4.0, float %a)
+  %fneg = fsub float -0.000000e+00, %max
+  ret float %fneg
+}
+
+; GCN-LABEL: {{^}}v_fneg_negk_maxnum_f32_ieee:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
-; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 4.0
+; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
+; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], 4.0, [[QUIET_NEG_A]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_negk_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_negk_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
   %a = load volatile float, float addrspace(1)* %a.gep
-  %min = call float @llvm.maxnum.f32(float -4.0, float %a)
-  %fneg = fsub float -0.000000e+00, %min
+  %max = call float @llvm.maxnum.f32(float -4.0, float %a)
+  %fneg = fsub float -0.000000e+00, %max
   store float %fneg, float addrspace(1)* %out.gep
   ret void
 }
 
+; GCN-LABEL: {{^}}v_fneg_negk_maxnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN: v_min_f32_e64 v0, -v0, 4.0
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_negk_maxnum_f32_no_ieee(float %a) #0 {
+  %max = call float @llvm.maxnum.f32(float -4.0, float %a)
+  %fneg = fsub float -0.000000e+00, %max
+  ret float %fneg
+}
+
 ; GCN-LABEL: {{^}}v_fneg_0_maxnum_f32:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[A]]
@@ -797,11 +930,12 @@ define amdgpu_kernel void @v_fneg_0_maxnum_f32(float addrspace(1)* %out, float a
   ret void
 }
 
-; GCN-LABEL: {{^}}v_fneg_neg0_maxnum_f32:
+; GCN-LABEL: {{^}}v_fneg_neg0_maxnum_f32_ieee:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
-; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 0
+; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
+; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_NEG_A]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_neg0_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_neg0_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -813,13 +947,24 @@ define amdgpu_kernel void @v_fneg_neg0_maxnum_f32(float addrspace(1)* %out, floa
   ret void
 }
 
-; GCN-LABEL: {{^}}v_fneg_0_maxnum_foldable_use_f32:
+; GCN-LABEL: {{^}}v_fneg_neg0_maxnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN: v_min_f32_e64 v0, -v0, 0{{$}}
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_neg0_maxnum_f32_no_ieee(float %a) #0 {
+  %max = call float @llvm.maxnum.f32(float -0.0, float %a)
+  %fneg = fsub float -0.000000e+00, %max
+  ret float %fneg
+}
+
+; GCN-LABEL: {{^}}v_fneg_0_maxnum_foldable_use_f32_ieee:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]]
+; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]]
+; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[QUIET_A]]
 ; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MAX]], [[B]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_0_maxnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_0_maxnum_foldable_use_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -834,14 +979,29 @@ define amdgpu_kernel void @v_fneg_0_maxnum_foldable_use_f32(float addrspace(1)*
   ret void
 }
 
-; GCN-LABEL: {{^}}v_fneg_maxnum_multi_use_maxnum_f32:
+; GCN-LABEL: {{^}}v_fneg_0_maxnum_foldable_use_f32_no_ieee:
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, v0
+; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MAX]], v1
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_0_maxnum_foldable_use_f32_no_ieee(float %a, float %b) #0 {
+  %max = call float @llvm.maxnum.f32(float 0.0, float %a)
+  %fneg = fsub float -0.000000e+00, %max
+  %mul = fmul float %fneg, %b
+  ret float %mul
+}
+
+; GCN-LABEL: {{^}}v_fneg_maxnum_multi_use_maxnum_f32_ieee:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_min_f32_e64 [[MAX0:v[0-9]+]], -[[A]], -[[B]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]]
+; GCN: v_min_f32_e32 [[MAX0:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
 ; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MAX0]]
 ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MAX0]]
 ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
-define amdgpu_kernel void @v_fneg_maxnum_multi_use_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_maxnum_multi_use_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -849,14 +1009,29 @@ define amdgpu_kernel void @v_fneg_maxnum_multi_use_maxnum_f32(float addrspace(1)
   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
   %a = load volatile float, float addrspace(1)* %a.gep
   %b = load volatile float, float addrspace(1)* %b.gep
-  %min = call float @llvm.maxnum.f32(float %a, float %b)
-  %fneg = fsub float -0.000000e+00, %min
-  %use1 = fmul float %min, 4.0
+  %max = call float @llvm.maxnum.f32(float %a, float %b)
+  %fneg = fsub float -0.000000e+00, %max
+  %use1 = fmul float %max, 4.0
   store volatile float %fneg, float addrspace(1)* %out
   store volatile float %use1, float addrspace(1)* %out
   ret void
 }
 
+; GCN-LABEL: {{^}}v_fneg_maxnum_multi_use_maxnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN: v_min_f32_e64 v0, -v0, -v1
+; GCN-NEXT: v_mul_f32_e32 v1, -4.0, v0
+; GCN-NEXT: ; return
+define amdgpu_ps <2 x float> @v_fneg_maxnum_multi_use_maxnum_f32_no_ieee(float %a, float %b) #0 {
+  %max = call float @llvm.maxnum.f32(float %a, float %b)
+  %fneg = fsub float -0.000000e+00, %max
+  %use1 = fmul float %max, 4.0
+  %ins0 = insertelement <2 x float> undef, float %fneg, i32 0
+  %ins1 = insertelement <2 x float> %ins0, float %use1, i32 1
+  ret <2 x float> %ins1
+}
+
 ; --------------------------------------------------------------------------------
 ; fma tests
 ; --------------------------------------------------------------------------------
diff --git a/test/CodeGen/AMDGPU/known-never-snan.ll b/test/CodeGen/AMDGPU/known-never-snan.ll
index 864cc745373..abf9b3ecefa 100644
--- a/test/CodeGen/AMDGPU/known-never-snan.ll
+++ b/test/CodeGen/AMDGPU/known-never-snan.ll
@@ -99,8 +99,7 @@ define float @v_test_known_not_snan_minnum_input_fmed3_r_i_i_f32(float %a, float
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
-; GCN-NEXT:    v_max_f32_e32 v0, 2.0, v0
-; GCN-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %a.nnan.add = fdiv nnan float 1.0, %a
   %b.nnan.add = fadd nnan float %b, 1.0
@@ -110,14 +109,46 @@ define float @v_test_known_not_snan_minnum_input_fmed3_r_i_i_f32(float %a, float
   ret float %med
 }
 
+define float @v_test_known_not_minnum_maybe_nan_src0_input_fmed3_r_i_i_f32(float %a, float %b) #0 {
+; GCN-LABEL: v_test_known_not_minnum_maybe_nan_src0_input_fmed3_r_i_i_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %b.nsnan = fadd float %b, 1.0
+  %known.not.snan = call float @llvm.minnum.f32(float %a, float %b.nsnan)
+  %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
+  %med = call float @llvm.minnum.f32(float %max, float 4.0)
+  ret float %med
+}
+
+define float @v_test_known_not_minnum_maybe_nan_src1_input_fmed3_r_i_i_f32(float %a, float %b) #0 {
+; GCN-LABEL: v_test_known_not_minnum_maybe_nan_src1_input_fmed3_r_i_i_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %a.nsnan = fadd float %a, 1.0
+  %known.not.snan = call float @llvm.minnum.f32(float %a.nsnan, float %b)
+  %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
+  %med = call float @llvm.minnum.f32(float %max, float 4.0)
+  ret float %med
+}
+
 define float @v_minnum_possible_nan_lhs_input_fmed3_r_i_i_f32(float %a, float %b) #0 {
 ; GCN-LABEL: v_minnum_possible_nan_lhs_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
-; GCN-NEXT:    v_max_f32_e32 v0, 2.0, v0
-; GCN-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %b.nnan.add = fadd nnan float %b, 1.0
   %known.not.snan = call float @llvm.minnum.f32(float %a, float %b.nnan.add)
@@ -131,9 +162,9 @@ define float @v_minnum_possible_nan_rhs_input_fmed3_r_i_i_f32(float %a, float %b
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
-; GCN-NEXT:    v_max_f32_e32 v0, 2.0, v0
-; GCN-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %a.nnan.add = fdiv nnan float 1.0, %a
   %known.not.snan = call float @llvm.minnum.f32(float %a.nnan.add, float %b)
@@ -148,8 +179,8 @@ define float @v_test_known_not_snan_maxnum_input_fmed3_r_i_i_f32(float %a, float
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
-; GCN-NEXT:    v_max3_f32 v0, v0, v1, 2.0
-; GCN-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GCN-NEXT:    v_max_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %a.nnan.add = fdiv nnan float 1.0, %a
   %b.nnan.add = fadd nnan float %b, 1.0
@@ -164,8 +195,9 @@ define float @v_maxnum_possible_nan_lhs_input_fmed3_r_i_i_f32(float %a, float %b
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
-; GCN-NEXT:    v_max3_f32 v0, v0, v1, 2.0
-; GCN-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_max_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %b.nnan.add = fadd nnan float %b, 1.0
   %known.not.snan = call float @llvm.maxnum.f32(float %a, float %b.nnan.add)
@@ -179,8 +211,9 @@ define float @v_maxnum_possible_nan_rhs_input_fmed3_r_i_i_f32(float %a, float %b
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
-; GCN-NEXT:    v_max3_f32 v0, v0, v1, 2.0
-; GCN-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_max_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %a.nnan.add = fdiv nnan float 1.0, %a
   %known.not.snan = call float @llvm.maxnum.f32(float %a.nnan.add, float %b)
@@ -215,8 +248,8 @@ define float @v_select_possible_nan_lhs_input_fmed3_r_i_i_f32(float %a, float %b
 ; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GCN-NEXT:    v_max_f32_e32 v0, 2.0, v0
-; GCN-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %b.nnan.add = fadd nnan float %b, 1.0
   %cmp = icmp eq i32 %c, 0
@@ -233,8 +266,8 @@ define float @v_select_possible_nan_rhs_input_fmed3_r_i_i_f32(float %a, float %b
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GCN-NEXT:    v_max_f32_e32 v0, 2.0, v0
-; GCN-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %a.nnan.add = fdiv nnan float 1.0, %a
   %cmp = icmp eq i32 %c, 0
@@ -494,6 +527,7 @@ define float @v_test_known_not_snan_fmed3_input_fmed3_r_i_i_f32(float %a, float
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_med3_f32 v0, v0, v1, v2
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %known.not.snan = call float @llvm.amdgcn.fmed3.f32(float %a, float %b, float %c)
@@ -507,8 +541,7 @@ define float @v_test_known_not_snan_fmin3_input_fmed3_r_i_i_f32(float %a, float
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min3_f32 v0, v0, v1, v2
-; GCN-NEXT:    v_max_f32_e32 v0, 2.0, v0
-; GCN-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %min0 = call float @llvm.minnum.f32(float %a, float %b)
   %known.not.snan = call float @llvm.minnum.f32(float %min0, float %c)
diff --git a/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
index 13fdd288f9d..12573a5fee3 100644
--- a/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
@@ -1,23 +1,91 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89,SIVI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,SIVI %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
 
 declare half @llvm.maxnum.f16(half %a, half %b)
 declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
 declare <3 x half> @llvm.maxnum.v3f16(<3 x half> %a, <3 x half> %b)
 declare <4 x half> @llvm.maxnum.v4f16(<4 x half> %a, <4 x half> %b)
 
-; GCN-LABEL: {{^}}maxnum_f16:
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
-; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI:  v_max_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; GFX89: v_max_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]]
-; GCN: buffer_store_short v[[R_F16]]
-; GCN: s_endpgm
 define amdgpu_kernel void @maxnum_f16(
+; SI-LABEL: maxnum_f16:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT:    s_mov_b32 s11, 0xf000
+; SI-NEXT:    s_mov_b32 s10, -1
+; SI-NEXT:    s_mov_b32 s2, s10
+; SI-NEXT:    s_mov_b32 s3, s11
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s12, s6
+; SI-NEXT:    s_mov_b32 s13, s7
+; SI-NEXT:    s_mov_b32 s14, s10
+; SI-NEXT:    s_mov_b32 s15, s11
+; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
+; SI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
+; SI-NEXT:    s_mov_b32 s8, s4
+; SI-NEXT:    s_mov_b32 s9, s5
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_max_f32_e32 v0, v0, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: maxnum_f16:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s10, s2
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_mov_b32 s4, s6
+; VI-NEXT:    s_mov_b32 s5, s7
+; VI-NEXT:    s_mov_b32 s11, s3
+; VI-NEXT:    s_mov_b32 s6, s2
+; VI-NEXT:    s_mov_b32 s7, s3
+; VI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_max_f16_e32 v0, v0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_max_f16_e32 v1, v1, v1
+; VI-NEXT:    v_max_f16_e32 v0, v0, v1
+; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: maxnum_f16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s4, s6
+; GFX9-NEXT:    s_mov_b32 s5, s7
+; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_mov_b32 s6, s2
+; GFX9-NEXT:    s_mov_b32 s7, s3
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GFX9-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX9-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -29,15 +97,65 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}maxnum_f16_imm_a:
-; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI:  v_max_f32_e32 v[[R_F32:[0-9]+]], 0x40400000, v[[B_F32]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; GFX89: v_max_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]]
-; GCN: buffer_store_short v[[R_F16]]
-; GCN: s_endpgm
 define amdgpu_kernel void @maxnum_f16_imm_a(
+; SI-LABEL: maxnum_f16_imm_a:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_mov_b32 s10, s2
+; SI-NEXT:    s_mov_b32 s11, s3
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s8, s6
+; SI-NEXT:    s_mov_b32 s9, s7
+; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_max_f32_e32 v0, 0x40400000, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: maxnum_f16_imm_a:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_mov_b32 s4, s6
+; VI-NEXT:    s_mov_b32 s5, s7
+; VI-NEXT:    s_mov_b32 s6, s2
+; VI-NEXT:    s_mov_b32 s7, s3
+; VI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_max_f16_e32 v0, v0, v0
+; VI-NEXT:    v_max_f16_e32 v0, 0x4200, v0
+; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: maxnum_f16_imm_a:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s4, s6
+; GFX9-NEXT:    s_mov_b32 s5, s7
+; GFX9-NEXT:    s_mov_b32 s6, s2
+; GFX9-NEXT:    s_mov_b32 s7, s3
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT:    v_max_f16_e32 v0, 0x4200, v0
+; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     half addrspace(1)* %r,
     half addrspace(1)* %b) {
 entry:
@@ -47,15 +165,65 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}maxnum_f16_imm_b:
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI:  v_max_f32_e32 v[[R_F32:[0-9]+]], 4.0, v[[A_F32]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; GFX89: v_max_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]]
-; GCN: buffer_store_short v[[R_F16]]
-; GCN: s_endpgm
 define amdgpu_kernel void @maxnum_f16_imm_b(
+; SI-LABEL: maxnum_f16_imm_b:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_mov_b32 s10, s2
+; SI-NEXT:    s_mov_b32 s11, s3
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s8, s6
+; SI-NEXT:    s_mov_b32 s9, s7
+; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: maxnum_f16_imm_b:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_mov_b32 s4, s6
+; VI-NEXT:    s_mov_b32 s5, s7
+; VI-NEXT:    s_mov_b32 s6, s2
+; VI-NEXT:    s_mov_b32 s7, s3
+; VI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_max_f16_e32 v0, v0, v0
+; VI-NEXT:    v_max_f16_e32 v0, 4.0, v0
+; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: maxnum_f16_imm_b:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s4, s6
+; GFX9-NEXT:    s_mov_b32 s5, s7
+; GFX9-NEXT:    s_mov_b32 s6, s2
+; GFX9-NEXT:    s_mov_b32 s7, s3
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT:    v_max_f16_e32 v0, 4.0, v0
+; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -65,34 +233,79 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}maxnum_v2f16:
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-
-; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI-DAG: v_max_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
-; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; SI:     v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SI-NOT: and
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
-
-; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
-; VI-DAG: v_max_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NOT: and
-; VI:    v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
-
-; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
-
-; GCN: buffer_store_dword v[[R_V2_F16]]
-; GCN: s_endpgm
 define amdgpu_kernel void @maxnum_v2f16(
+; SI-LABEL: maxnum_v2f16:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_load_dword s6, s[6:7], 0x0
+; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_lshr_b32 s1, s6, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s0
+; SI-NEXT:    s_lshr_b32 s0, s0, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, s0
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, s1
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s6
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_max_f32_e32 v2, v3, v2
+; SI-NEXT:    v_max_f32_e32 v0, v0, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: maxnum_v2f16:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_load_dword s4, s[6:7], 0x0
+; VI-NEXT:    s_load_dword s5, s[8:9], 0x0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e64 v1, s4, s4
+; VI-NEXT:    v_max_f16_e64 v0, s5, s5
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
+; VI-NEXT:    s_lshr_b32 s5, s5, 16
+; VI-NEXT:    v_max_f16_e32 v0, v1, v0
+; VI-NEXT:    v_max_f16_e64 v1, s5, s5
+; VI-NEXT:    v_max_f16_e64 v2, s4, s4
+; VI-NEXT:    v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: maxnum_v2f16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v1, s4, s4
+; GFX9-NEXT:    v_pk_max_f16 v0, s5, s5
+; GFX9-NEXT:    v_pk_max_f16 v0, v1, v0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -104,29 +317,64 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}maxnum_v2f16_imm_a:
-; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI-DAG: v_max_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
-; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI-DAG:  v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400
-; VI-DAG:  v_max_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-DAG:  v_max_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
-
-; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SIVI-NOT: and
-; SIVI:  v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
-
-
-; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x44004200
-; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], [[K]]
-
-; GCN: buffer_store_dword v[[R_V2_F16]]
 define amdgpu_kernel void @maxnum_v2f16_imm_a(
+; SI-LABEL: maxnum_v2f16_imm_a:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
+; SI-NEXT:    s_lshr_b32 s2, s2, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_max_f32_e32 v0, 0x40400000, v0
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_max_f32_e32 v1, 4.0, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: maxnum_v2f16_imm_a:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0x4400
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_load_dword s4, s[6:7], 0x0
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e64 v0, s4, s4
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
+; VI-NEXT:    v_max_f16_e64 v1, s4, s4
+; VI-NEXT:    v_max_f16_e32 v0, 0x4200, v0
+; VI-NEXT:    v_max_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: maxnum_v2f16_imm_a:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
+; GFX9-NEXT:    s_mov_b32 s4, 0x44004200
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %b) {
 entry:
@@ -136,31 +384,64 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}maxnum_v2f16_imm_b:
-; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI-DAG: v_max_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-
-; VI-DAG:  v_mov_b32_e32 [[CONST3:v[0-9]+]], 0x4200
-; VI-DAG:  v_max_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], [[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-DAG:  v_max_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
-
-; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-
-
-; SIVI-NOT: and
-; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
-
-; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x42004400
-; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]]
-
-; GCN: buffer_store_dword v[[R_V2_F16]]
 define amdgpu_kernel void @maxnum_v2f16_imm_b(
+; SI-LABEL: maxnum_v2f16_imm_b:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
+; SI-NEXT:    s_lshr_b32 s2, s2, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_max_f32_e32 v1, 0x40400000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: maxnum_v2f16_imm_b:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0x4200
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_load_dword s4, s[6:7], 0x0
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e64 v0, s4, s4
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
+; VI-NEXT:    v_max_f16_e64 v1, s4, s4
+; VI-NEXT:    v_max_f16_e32 v0, 4.0, v0
+; VI-NEXT:    v_max_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: maxnum_v2f16_imm_b:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
+; GFX9-NEXT:    s_mov_b32 s4, 0x42004400
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
@@ -171,10 +452,94 @@ entry:
 }
 
 ; FIXME: Scalarize with undef half
-; GCN-LABEL: {{^}}maxnum_v3f16:
-; GFX9: v_pk_max_f16
-; GFX9: v_pk_max_f16
 define amdgpu_kernel void @maxnum_v3f16(
+; SI-LABEL: maxnum_v3f16:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_lshr_b32 s1, s6, 16
+; SI-NEXT:    s_lshr_b32 s4, s8, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, s1
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, s4
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s6
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, s8
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s7
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, s9
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT:    v_max_f32_e32 v2, v3, v2
+; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v5
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_max_f32_e32 v1, v1, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v4
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_max_f32_e32 v0, v0, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: maxnum_v3f16:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e64 v1, s4, s4
+; VI-NEXT:    v_max_f16_e64 v0, s6, s6
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
+; VI-NEXT:    s_lshr_b32 s6, s6, 16
+; VI-NEXT:    v_max_f16_e32 v0, v1, v0
+; VI-NEXT:    v_max_f16_e64 v1, s6, s6
+; VI-NEXT:    v_max_f16_e64 v2, s4, s4
+; VI-NEXT:    v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_max_f16_e64 v1, s7, s7
+; VI-NEXT:    v_max_f16_e64 v2, s5, s5
+; VI-NEXT:    v_max_f16_e32 v1, v2, v1
+; VI-NEXT:    buffer_store_short v1, off, s[0:3], 0 offset:4
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: maxnum_v3f16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v1, s4, s4
+; GFX9-NEXT:    v_pk_max_f16 v0, s6, s6
+; GFX9-NEXT:    v_pk_max_f16 v0, v1, v0
+; GFX9-NEXT:    v_pk_max_f16 v2, s7, s7
+; GFX9-NEXT:    v_pk_max_f16 v1, s5, s5
+; GFX9-NEXT:    v_pk_max_f16 v1, v1, v2
+; GFX9-NEXT:    buffer_store_short v1, off, s[0:3], 0 offset:4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     <3 x half> addrspace(1)* %r,
     <3 x half> addrspace(1)* %a,
     <3 x half> addrspace(1)* %b) {
@@ -186,13 +551,107 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}maxnum_v4f16:
-; GFX89: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
-; GFX89: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}
-; GFX9-DAG: v_pk_max_f16 v[[MAX_LO:[0-9]+]], v[[B_LO]], v[[A_LO]]
-; GFX9-DAG: v_pk_max_f16 v[[MAX_HI:[0-9]+]], v[[B_HI]], v[[A_HI]]
-; GFX9: buffer_store_dwordx2 v{{\[}}[[MAX_LO]]:[[MAX_HI]]{{\]}}
 define amdgpu_kernel void @maxnum_v4f16(
+; SI-LABEL: maxnum_v4f16:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; SI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
+; SI-NEXT:    s_lshr_b32 s4, s4, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, s4
+; SI-NEXT:    s_lshr_b32 s4, s5, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, s4
+; SI-NEXT:    s_lshr_b32 s4, s7, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, s4
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
+; SI-NEXT:    s_lshr_b32 s4, s6, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, s7
+; SI-NEXT:    v_cvt_f32_f16_e32 v6, s4
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, s6
+; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT:    v_max_f32_e32 v3, v3, v5
+; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v7
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_max_f32_e32 v1, v1, v5
+; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v6
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT:    v_max_f32_e32 v2, v2, v5
+; SI-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_max_f32_e32 v0, v0, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_or_b32_e32 v0, v0, v2
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: maxnum_v4f16:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e64 v1, s5, s5
+; VI-NEXT:    v_max_f16_e64 v0, s7, s7
+; VI-NEXT:    s_lshr_b32 s5, s5, 16
+; VI-NEXT:    s_lshr_b32 s7, s7, 16
+; VI-NEXT:    v_max_f16_e32 v0, v1, v0
+; VI-NEXT:    v_max_f16_e64 v2, s5, s5
+; VI-NEXT:    v_max_f16_e64 v1, s7, s7
+; VI-NEXT:    v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_max_f16_e64 v2, s4, s4
+; VI-NEXT:    v_max_f16_e64 v0, s6, s6
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
+; VI-NEXT:    s_lshr_b32 s5, s6, 16
+; VI-NEXT:    v_max_f16_e32 v0, v2, v0
+; VI-NEXT:    v_max_f16_e64 v2, s5, s5
+; VI-NEXT:    v_max_f16_e64 v3, s4, s4
+; VI-NEXT:    v_max_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: maxnum_v4f16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v1, s5, s5
+; GFX9-NEXT:    v_pk_max_f16 v0, s7, s7
+; GFX9-NEXT:    v_pk_max_f16 v1, v1, v0
+; GFX9-NEXT:    v_pk_max_f16 v2, s6, s6
+; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     <4 x half> addrspace(1)* %r,
     <4 x half> addrspace(1)* %a,
     <4 x half> addrspace(1)* %b) {
@@ -204,28 +663,87 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}fmax_v4f16_imm_a:
-; GFX89-DAG: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
-; GFX9-DAG: s_mov_b32 [[K1:s[0-9]+]], 0x44004200
-; GFX9-DAG: s_mov_b32 [[K0:s[0-9]+]], 0x40004800
-
-; GFX9-DAG: v_pk_max_f16 v[[MAX_LO:[0-9]+]], v[[A_LO]], [[K0]]
-; GFX9-DAG: v_pk_max_f16 v[[MAX_HI:[0-9]+]], v[[A_HI]], [[K1]]
-; GFX9: buffer_store_dwordx2 v{{\[}}[[MAX_LO]]:[[MAX_HI]]{{\]}}
-
-; VI-DAG: v_mov_b32_e32 [[K2:v[0-9]+]], 0x4000
-; VI-DAG: v_mov_b32_e32 [[K4:v[0-9]+]], 0x4400
-
-; VI-DAG: v_max_f16_sdwa v[[MAX_HI_HI:[0-9]+]], v[[A_HI]], [[K4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-DAG: v_max_f16_e32 v[[MAX_HI_LO:[0-9]+]], 0x4200, v[[A_HI]]
-; VI-DAG: v_max_f16_sdwa v[[MAX_LO_HI:[0-9]+]], v[[A_LO]], [[K2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-DAG: v_max_f16_e32 v[[MAX_LO_LO:[0-9]+]], 0x4800, v[[A_LO]]
-
-; VI-DAG: v_or_b32_e32 v[[OR0:[0-9]+]], v[[MAX_LO_LO]], v[[MAX_LO_HI]]
-; VI-DAG: v_or_b32_e32 v[[OR1:[0-9]+]], v[[MAX_HI_LO]], v[[MAX_HI_HI]]
-
-; VI: buffer_store_dwordx2 v{{\[}}[[OR0]]:[[OR1]]{{\]}}
 define amdgpu_kernel void @fmax_v4f16_imm_a(
+; SI-LABEL: fmax_v4f16_imm_a:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
+; SI-NEXT:    s_lshr_b32 s5, s5, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, s5
+; SI-NEXT:    s_lshr_b32 s4, s4, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, s4
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT:    v_max_f32_e32 v2, 4.0, v2
+; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT:    v_max_f32_e32 v1, 0x40400000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_max_f32_e32 v3, 2.0, v3
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_max_f32_e32 v0, 0x41000000, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; SI-NEXT:    v_or_b32_e32 v0, v0, v2
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: fmax_v4f16_imm_a:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v0, 0x4400
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e64 v1, s5, s5
+; VI-NEXT:    s_lshr_b32 s5, s5, 16
+; VI-NEXT:    v_max_f16_e64 v3, s5, s5
+; VI-NEXT:    v_max_f16_e64 v2, s4, s4
+; VI-NEXT:    v_max_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_max_f16_e32 v1, 0x4200, v1
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
+; VI-NEXT:    v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_max_f16_e32 v0, 0x4800, v2
+; VI-NEXT:    v_max_f16_e64 v2, s4, s4
+; VI-NEXT:    v_mov_b32_e32 v3, 0x4000
+; VI-NEXT:    v_max_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: fmax_v4f16_imm_a:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s8, 0x44004200
+; GFX9-NEXT:    s_mov_b32 s9, 0x40004800
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, s5, s5
+; GFX9-NEXT:    v_pk_max_f16 v2, s4, s4
+; GFX9-NEXT:    v_pk_max_f16 v1, v0, s8
+; GFX9-NEXT:    v_pk_max_f16 v0, v2, s9
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     <4 x half> addrspace(1)* %r,
     <4 x half> addrspace(1)* %b) {
 entry:
diff --git a/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
index b34ad6f6890..cdf05094f69 100644
--- a/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
@@ -1,23 +1,91 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89,SIVI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9 %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
 
 declare half @llvm.minnum.f16(half %a, half %b)
 declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
 declare <3 x half> @llvm.minnum.v3f16(<3 x half> %a, <3 x half> %b)
 declare <4 x half> @llvm.minnum.v4f16(<4 x half> %a, <4 x half> %b)
 
-; GCN-LABEL: {{^}}minnum_f16:
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
-; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI:  v_min_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; GFX89:  v_min_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]]
-; GCN: buffer_store_short v[[R_F16]]
-; GCN: s_endpgm
-define amdgpu_kernel void @minnum_f16(
+define amdgpu_kernel void @minnum_f16_ieee(
+; SI-LABEL: minnum_f16_ieee:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT:    s_mov_b32 s11, 0xf000
+; SI-NEXT:    s_mov_b32 s10, -1
+; SI-NEXT:    s_mov_b32 s2, s10
+; SI-NEXT:    s_mov_b32 s3, s11
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s12, s6
+; SI-NEXT:    s_mov_b32 s13, s7
+; SI-NEXT:    s_mov_b32 s14, s10
+; SI-NEXT:    s_mov_b32 s15, s11
+; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
+; SI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
+; SI-NEXT:    s_mov_b32 s8, s4
+; SI-NEXT:    s_mov_b32 s9, s5
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_min_f32_e32 v0, v0, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: minnum_f16_ieee:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s10, s2
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_mov_b32 s4, s6
+; VI-NEXT:    s_mov_b32 s5, s7
+; VI-NEXT:    s_mov_b32 s11, s3
+; VI-NEXT:    s_mov_b32 s6, s2
+; VI-NEXT:    s_mov_b32 s7, s3
+; VI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_max_f16_e32 v0, v0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_max_f16_e32 v1, v1, v1
+; VI-NEXT:    v_min_f16_e32 v0, v0, v1
+; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: minnum_f16_ieee:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s4, s6
+; GFX9-NEXT:    s_mov_b32 s5, s7
+; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_mov_b32 s6, s2
+; GFX9-NEXT:    s_mov_b32 s7, s3
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GFX9-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX9-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -29,15 +97,88 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}minnum_f16_imm_a:
-; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI:  v_min_f32_e32 v[[R_F32:[0-9]+]], 0x40400000, v[[B_F32]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; GFX89:  v_min_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]]
-; GCN: buffer_store_short v[[R_F16]]
-; GCN: s_endpgm
+define amdgpu_ps half @minnum_f16_no_ieee(half %a, half %b) {
+; SI-LABEL: minnum_f16_no_ieee:
+; SI:       ; %bb.0:
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_min_f32_e32 v0, v0, v1
+; SI-NEXT:    ; return to shader part epilog
+;
+; VI-LABEL: minnum_f16_no_ieee:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_min_f16_e32 v0, v0, v1
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: minnum_f16_no_ieee:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX9-NEXT:    ; return to shader part epilog
+  %r.val = call half @llvm.minnum.f16(half %a, half %b)
+  ret half %r.val
+}
+
 define amdgpu_kernel void @minnum_f16_imm_a(
+; SI-LABEL: minnum_f16_imm_a:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_mov_b32 s10, s2
+; SI-NEXT:    s_mov_b32 s11, s3
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s8, s6
+; SI-NEXT:    s_mov_b32 s9, s7
+; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_min_f32_e32 v0, 0x40400000, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: minnum_f16_imm_a:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_mov_b32 s4, s6
+; VI-NEXT:    s_mov_b32 s5, s7
+; VI-NEXT:    s_mov_b32 s6, s2
+; VI-NEXT:    s_mov_b32 s7, s3
+; VI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_max_f16_e32 v0, v0, v0
+; VI-NEXT:    v_min_f16_e32 v0, 0x4200, v0
+; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: minnum_f16_imm_a:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s4, s6
+; GFX9-NEXT:    s_mov_b32 s5, s7
+; GFX9-NEXT:    s_mov_b32 s6, s2
+; GFX9-NEXT:    s_mov_b32 s7, s3
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT:    v_min_f16_e32 v0, 0x4200, v0
+; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     half addrspace(1)* %r,
     half addrspace(1)* %b) {
 entry:
@@ -47,15 +188,65 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}minnum_f16_imm_b:
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI:  v_min_f32_e32 v[[R_F32:[0-9]+]], 4.0, v[[A_F32]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; GFX89:  v_min_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]]
-; GCN: buffer_store_short v[[R_F16]]
-; GCN: s_endpgm
 define amdgpu_kernel void @minnum_f16_imm_b(
+; SI-LABEL: minnum_f16_imm_b:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_mov_b32 s10, s2
+; SI-NEXT:    s_mov_b32 s11, s3
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s8, s6
+; SI-NEXT:    s_mov_b32 s9, s7
+; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: minnum_f16_imm_b:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_mov_b32 s4, s6
+; VI-NEXT:    s_mov_b32 s5, s7
+; VI-NEXT:    s_mov_b32 s6, s2
+; VI-NEXT:    s_mov_b32 s7, s3
+; VI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_max_f16_e32 v0, v0, v0
+; VI-NEXT:    v_min_f16_e32 v0, 4.0, v0
+; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: minnum_f16_imm_b:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s4, s6
+; GFX9-NEXT:    s_mov_b32 s5, s7
+; GFX9-NEXT:    s_mov_b32 s6, s2
+; GFX9-NEXT:    s_mov_b32 s7, s3
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT:    v_min_f16_e32 v0, 4.0, v0
+; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -65,33 +256,79 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}minnum_v2f16:
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-
-; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI-DAG: v_min_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
-; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; SI:     v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SI-NOT: and
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
-
-; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
-; VI-DAG: v_min_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NOT: and
-; VI:    v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
-
-; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
-
-; GCN: buffer_store_dword v[[R_V2_F16]]
-define amdgpu_kernel void @minnum_v2f16(
+define amdgpu_kernel void @minnum_v2f16_ieee(
+; SI-LABEL: minnum_v2f16_ieee:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_load_dword s6, s[6:7], 0x0
+; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_lshr_b32 s1, s6, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s0
+; SI-NEXT:    s_lshr_b32 s0, s0, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, s0
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, s1
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s6
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_min_f32_e32 v2, v3, v2
+; SI-NEXT:    v_min_f32_e32 v0, v0, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: minnum_v2f16_ieee:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_load_dword s4, s[6:7], 0x0
+; VI-NEXT:    s_load_dword s5, s[8:9], 0x0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e64 v1, s4, s4
+; VI-NEXT:    v_max_f16_e64 v0, s5, s5
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
+; VI-NEXT:    s_lshr_b32 s5, s5, 16
+; VI-NEXT:    v_min_f16_e32 v0, v1, v0
+; VI-NEXT:    v_max_f16_e64 v1, s5, s5
+; VI-NEXT:    v_max_f16_e64 v2, s4, s4
+; VI-NEXT:    v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: minnum_v2f16_ieee:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v1, s4, s4
+; GFX9-NEXT:    v_pk_max_f16 v0, s5, s5
+; GFX9-NEXT:    v_pk_min_f16 v0, v1, v0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -103,29 +340,94 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}minnum_v2f16_imm_a:
-; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI-DAG: v_min_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
-; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI-DAG:  v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400
-; VI-DAG:  v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-DAG:  v_min_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
-
-; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SIVI-NOT: and
-; SIVI:  v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
-
-
-; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x44004200
-; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], [[K]]
+define amdgpu_ps <2 x half> @minnum_v2f16_no_ieee(<2 x half> %a, <2 x half> %b) {
+; SI-LABEL: minnum_v2f16_no_ieee:
+; SI:       ; %bb.0:
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_min_f32_e32 v0, v0, v2
+; SI-NEXT:    v_min_f32_e32 v1, v1, v3
+; SI-NEXT:    ; return to shader part epilog
+;
+; VI-LABEL: minnum_v2f16_no_ieee:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_min_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_min_f16_e32 v0, v0, v1
+; VI-NEXT:    v_or_b32_e32 v0, v0, v2
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: minnum_v2f16_no_ieee:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX9-NEXT:    ; return to shader part epilog
+  %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
+  ret <2 x half> %r.val
+}
 
-; GCN: buffer_store_dword v[[R_V2_F16]]
 define amdgpu_kernel void @minnum_v2f16_imm_a(
+; SI-LABEL: minnum_v2f16_imm_a:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
+; SI-NEXT:    s_lshr_b32 s2, s2, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_min_f32_e32 v0, 0x40400000, v0
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_min_f32_e32 v1, 4.0, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: minnum_v2f16_imm_a:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0x4400
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_load_dword s4, s[6:7], 0x0
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e64 v0, s4, s4
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
+; VI-NEXT:    v_max_f16_e64 v1, s4, s4
+; VI-NEXT:    v_min_f16_e32 v0, 0x4200, v0
+; VI-NEXT:    v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: minnum_v2f16_imm_a:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
+; GFX9-NEXT:    s_mov_b32 s4, 0x44004200
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %b) {
 entry:
@@ -135,31 +437,64 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}minnum_v2f16_imm_b:
-; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI-DAG: v_min_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-
-; VI-DAG:  v_mov_b32_e32 [[CONST3:v[0-9]+]], 0x4200
-; VI-DAG:  v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], [[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-DAG:  v_min_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
-
-; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-
-
-; SIVI-NOT: and
-; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
-
-; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x42004400
-; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]]
-
-; GCN: buffer_store_dword v[[R_V2_F16]]
 define amdgpu_kernel void @minnum_v2f16_imm_b(
+; SI-LABEL: minnum_v2f16_imm_b:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
+; SI-NEXT:    s_lshr_b32 s2, s2, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_min_f32_e32 v1, 0x40400000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: minnum_v2f16_imm_b:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0x4200
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_load_dword s4, s[6:7], 0x0
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e64 v0, s4, s4
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
+; VI-NEXT:    v_max_f16_e64 v1, s4, s4
+; VI-NEXT:    v_min_f16_e32 v0, 4.0, v0
+; VI-NEXT:    v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: minnum_v2f16_imm_b:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
+; GFX9-NEXT:    s_mov_b32 s4, 0x42004400
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
@@ -170,10 +505,94 @@ entry:
 }
 
 ; FIXME: Scalarize with undef half
-; GCN-LABEL: {{^}}minnum_v3f16:
-; GFX9: v_pk_min_f16
-; GFX9: v_pk_min_f16
 define amdgpu_kernel void @minnum_v3f16(
+; SI-LABEL: minnum_v3f16:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_lshr_b32 s1, s6, 16
+; SI-NEXT:    s_lshr_b32 s4, s8, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, s1
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, s4
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s6
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, s8
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s7
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, s9
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT:    v_min_f32_e32 v2, v3, v2
+; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v5
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_min_f32_e32 v1, v1, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v4
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_min_f32_e32 v0, v0, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: minnum_v3f16:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e64 v1, s4, s4
+; VI-NEXT:    v_max_f16_e64 v0, s6, s6
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
+; VI-NEXT:    s_lshr_b32 s6, s6, 16
+; VI-NEXT:    v_min_f16_e32 v0, v1, v0
+; VI-NEXT:    v_max_f16_e64 v1, s6, s6
+; VI-NEXT:    v_max_f16_e64 v2, s4, s4
+; VI-NEXT:    v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_max_f16_e64 v1, s7, s7
+; VI-NEXT:    v_max_f16_e64 v2, s5, s5
+; VI-NEXT:    v_min_f16_e32 v1, v2, v1
+; VI-NEXT:    buffer_store_short v1, off, s[0:3], 0 offset:4
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: minnum_v3f16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v1, s4, s4
+; GFX9-NEXT:    v_pk_max_f16 v0, s6, s6
+; GFX9-NEXT:    v_pk_min_f16 v0, v1, v0
+; GFX9-NEXT:    v_pk_max_f16 v2, s7, s7
+; GFX9-NEXT:    v_pk_max_f16 v1, s5, s5
+; GFX9-NEXT:    v_pk_min_f16 v1, v1, v2
+; GFX9-NEXT:    buffer_store_short v1, off, s[0:3], 0 offset:4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     <3 x half> addrspace(1)* %r,
     <3 x half> addrspace(1)* %a,
     <3 x half> addrspace(1)* %b) {
@@ -185,13 +604,107 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}minnum_v4f16:
-; GFX89: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
-; GFX89: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}
-; GFX9-DAG: v_pk_min_f16 v[[MIN_LO:[0-9]+]], v[[B_LO]], v[[A_LO]]
-; GFX9-DAG: v_pk_min_f16 v[[MIN_HI:[0-9]+]], v[[B_HI]], v[[A_HI]]
-; GFX9: buffer_store_dwordx2 v{{\[}}[[MIN_LO]]:[[MIN_HI]]{{\]}}
 define amdgpu_kernel void @minnum_v4f16(
+; SI-LABEL: minnum_v4f16:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; SI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
+; SI-NEXT:    s_lshr_b32 s4, s4, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, s4
+; SI-NEXT:    s_lshr_b32 s4, s5, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, s4
+; SI-NEXT:    s_lshr_b32 s4, s7, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, s4
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
+; SI-NEXT:    s_lshr_b32 s4, s6, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, s7
+; SI-NEXT:    v_cvt_f32_f16_e32 v6, s4
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, s6
+; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT:    v_min_f32_e32 v3, v3, v5
+; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v7
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_min_f32_e32 v1, v1, v5
+; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v6
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT:    v_min_f32_e32 v2, v2, v5
+; SI-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_min_f32_e32 v0, v0, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_or_b32_e32 v0, v0, v2
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: minnum_v4f16:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e64 v1, s5, s5
+; VI-NEXT:    v_max_f16_e64 v0, s7, s7
+; VI-NEXT:    s_lshr_b32 s5, s5, 16
+; VI-NEXT:    s_lshr_b32 s7, s7, 16
+; VI-NEXT:    v_min_f16_e32 v0, v1, v0
+; VI-NEXT:    v_max_f16_e64 v2, s5, s5
+; VI-NEXT:    v_max_f16_e64 v1, s7, s7
+; VI-NEXT:    v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_max_f16_e64 v2, s4, s4
+; VI-NEXT:    v_max_f16_e64 v0, s6, s6
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
+; VI-NEXT:    s_lshr_b32 s5, s6, 16
+; VI-NEXT:    v_min_f16_e32 v0, v2, v0
+; VI-NEXT:    v_max_f16_e64 v2, s5, s5
+; VI-NEXT:    v_max_f16_e64 v3, s4, s4
+; VI-NEXT:    v_min_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: minnum_v4f16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v1, s5, s5
+; GFX9-NEXT:    v_pk_max_f16 v0, s7, s7
+; GFX9-NEXT:    v_pk_min_f16 v1, v1, v0
+; GFX9-NEXT:    v_pk_max_f16 v2, s6, s6
+; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     <4 x half> addrspace(1)* %r,
     <4 x half> addrspace(1)* %a,
     <4 x half> addrspace(1)* %b) {
@@ -203,28 +716,87 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}fmin_v4f16_imm_a:
-; GFX89-DAG: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
-; GFX9-DAG: s_mov_b32 [[K1:s[0-9]+]], 0x44004200
-; GFX9-DAG: s_mov_b32 [[K0:s[0-9]+]], 0x40004800
-
-; GFX9-DAG: v_pk_min_f16 v[[MIN_LO:[0-9]+]], v[[A_LO]], [[K0]]
-; GFX9-DAG: v_pk_min_f16 v[[MIN_HI:[0-9]+]], v[[A_HI]], [[K1]]
-; GFX9: buffer_store_dwordx2 v{{\[}}[[MIN_LO]]:[[MIN_HI]]{{\]}}
-
-; VI-DAG: v_mov_b32_e32 [[K2:v[0-9]+]], 0x4000
-; VI-DAG: v_mov_b32_e32 [[K4:v[0-9]+]], 0x4400
-
-; VI-DAG: v_min_f16_sdwa v[[MIN_HI_HI:[0-9]+]], v[[A_HI]], [[K4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-DAG: v_min_f16_e32 v[[MIN_HI_LO:[0-9]+]], 0x4200, v[[A_HI]]
-; VI-DAG: v_min_f16_sdwa v[[MIN_LO_HI:[0-9]+]], v[[A_LO]], [[K2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-DAG: v_min_f16_e32 v[[MIN_LO_LO:[0-9]+]], 0x4800, v[[A_LO]]
-
-; VI-DAG: v_or_b32_e32 v[[OR0:[0-9]+]], v[[MIN_LO_LO]], v[[MIN_LO_HI]]
-; VI-DAG: v_or_b32_e32 v[[OR1:[0-9]+]], v[[MIN_HI_LO]], v[[MIN_HI_HI]]
-
-; VI: buffer_store_dwordx2 v{{\[}}[[OR0]]:[[OR1]]{{\]}}
 define amdgpu_kernel void @fmin_v4f16_imm_a(
+; SI-LABEL: fmin_v4f16_imm_a:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
+; SI-NEXT:    s_lshr_b32 s5, s5, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, s5
+; SI-NEXT:    s_lshr_b32 s4, s4, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, s4
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT:    v_min_f32_e32 v2, 4.0, v2
+; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT:    v_min_f32_e32 v1, 0x40400000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_min_f32_e32 v3, 2.0, v3
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_min_f32_e32 v0, 0x41000000, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; SI-NEXT:    v_or_b32_e32 v0, v0, v2
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: fmin_v4f16_imm_a:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v0, 0x4400
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e64 v1, s5, s5
+; VI-NEXT:    s_lshr_b32 s5, s5, 16
+; VI-NEXT:    v_max_f16_e64 v3, s5, s5
+; VI-NEXT:    v_max_f16_e64 v2, s4, s4
+; VI-NEXT:    v_min_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_min_f16_e32 v1, 0x4200, v1
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
+; VI-NEXT:    v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_min_f16_e32 v0, 0x4800, v2
+; VI-NEXT:    v_max_f16_e64 v2, s4, s4
+; VI-NEXT:    v_mov_b32_e32 v3, 0x4000
+; VI-NEXT:    v_min_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: fmin_v4f16_imm_a:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s8, 0x44004200
+; GFX9-NEXT:    s_mov_b32 s9, 0x40004800
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, s5, s5
+; GFX9-NEXT:    v_pk_max_f16 v2, s4, s4
+; GFX9-NEXT:    v_pk_min_f16 v1, v0, s8
+; GFX9-NEXT:    v_pk_min_f16 v0, v2, s9
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     <4 x half> addrspace(1)* %r,
     <4 x half> addrspace(1)* %b) {
 entry:
diff --git a/test/CodeGen/AMDGPU/reduction.ll b/test/CodeGen/AMDGPU/reduction.ll
index 74ca4a668f9..0c605f79d98 100644
--- a/test/CodeGen/AMDGPU/reduction.ll
+++ b/test/CodeGen/AMDGPU/reduction.ll
@@ -434,12 +434,23 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}reduction_maxnum_v4f16:
-; GFX9:      v_pk_max_f16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
-; GFX9-NEXT: v_max_f16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-
-; VI:      v_max_f16_sdwa
-; VI-NEXT: v_max_f16_e32
-; VI-NEXT: v_max_f16_e32
+; GFX9: s_waitcnt
+; GFX9-NEXT: v_pk_max_f16 [[CANON1:v[0-9]+]], v1, v1
+; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0
+; GFX9-NEXT: v_pk_max_f16 [[MAX:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}}
+
+; FIXME: Extra canonicalize leftover
+; GFX9-NEXT: v_max_f16_sdwa [[TMP:v[0-9]+]], [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_max_f16_e32 v0, [[MAX]], [[TMP]]
+
+; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0
+; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1
+
+; VI-DAG: v_max_f16_e32 [[MAX0:v[0-9]+]], [[CANON1]], [[CANON3]]
+; VI-DAG: v_max_f16_e32 [[MAX1:v[0-9]+]], [[CANON0]], [[CANON2]]
+; VI: v_max_f16_e32 v0, [[MAX1]], [[MAX0]]
 define half @reduction_maxnum_v4f16(<4 x half> %vec4) {
 entry:
   %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
@@ -451,12 +462,24 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}reduction_minnum_v4f16:
-; GFX9:      v_pk_min_f16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
-; GFX9-NEXT: v_min_f16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9: s_waitcnt
+; GFX9-NEXT: v_pk_max_f16 [[CANON1:v[0-9]+]], v1, v1
+; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0
+; GFX9-NEXT: v_pk_min_f16 [[MIN:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}}
+
+; FIXME: Extra canonicalize leftover
+; GFX9-NEXT: v_max_f16_sdwa [[TMP:v[0-9]+]], [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_min_f16_e32 v0, [[MIN]], [[TMP]]
 
-; VI:      v_min_f16_sdwa
-; VI-NEXT: v_min_f16_e32
-; VI-NEXT: v_min_f16_e32
+
+; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0
+; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1
+
+; VI-DAG: v_min_f16_e32 [[MAX0:v[0-9]+]], [[CANON1]], [[CANON3]]
+; VI-DAG: v_min_f16_e32 [[MAX1:v[0-9]+]], [[CANON0]], [[CANON2]]
+; VI: v_min_f16_e32 v0, [[MAX1]], [[MAX0]]
 define half @reduction_minnum_v4f16(<4 x half> %vec4) {
 entry:
   %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
@@ -467,13 +490,36 @@ entry:
   ret half %res
 }
 
+; FIXME: Need to preserve fast math flags when fmaxnum matched
+; directly from the IR to avoid unnecessary quieting.
+
 ; GCN-LABEL: {{^}}reduction_fast_max_pattern_v4f16:
-; GFX9:      v_pk_max_f16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
-; GFX9-NEXT: v_max_f16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; XGFX9:      v_pk_max_f16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
+; XGFX9-NEXT: v_max_f16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+
+; XVI: s_waitcnt
+; XVI-NEXT: v_max_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; XVI-NEXT: v_max_f16_e32 v0, v0, v1
+; XVI-NEXT: v_max_f16_e32 v0, v0, v2
+; XVI-NEXT: s_setpc_b64
 
-; VI:      v_max_f16_sdwa
-; VI-NEXT: v_max_f16_e32
-; VI-NEXT: v_max_f16_e32
+; GFX9: s_waitcnt
+; GFX9-NEXT: v_pk_max_f16 [[CANON1:v[0-9]+]], v1, v1
+; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0
+; GFX9-NEXT: v_pk_max_f16 [[MAX:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}}
+
+; FIXME: Extra canonicalize leftover
+; GFX9-NEXT: v_max_f16_sdwa [[TMP:v[0-9]+]], [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_max_f16_e32 v0, [[MAX]], [[TMP]]
+
+; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0
+; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1
+
+; VI-DAG: v_max_f16_e32 [[MAX0:v[0-9]+]], [[CANON1]], [[CANON3]]
+; VI-DAG: v_max_f16_e32 [[MAX1:v[0-9]+]], [[CANON0]], [[CANON2]]
+; VI: v_max_f16_e32 v0, [[MAX1]], [[MAX0]]
 define half @reduction_fast_max_pattern_v4f16(<4 x half> %vec4) {
 entry:
   %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
@@ -486,13 +532,37 @@ entry:
   ret half %res
 }
 
+; FIXME: Need to preserve fast math flags when fmaxnum matched
+; directly from the IR to avoid unnecessary quieting.
+
 ; GCN-LABEL: {{^}}reduction_fast_min_pattern_v4f16:
-; GFX9:      v_pk_min_f16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
-; GFX9-NEXT: v_min_f16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; XGFX9:      v_pk_min_f16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
+; XGFX9-NEXT: v_min_f16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+
+; XVI: s_waitcnt
+; XVI-NEXT: v_min_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; XVI-NEXT: v_min_f16_e32 v0, v0, v1
+; XVI-NEXT: v_min_f16_e32 v0, v0, v2
+; XVI-NEXT: s_setpc_b64
+
+; GFX9: s_waitcnt
+; GFX9-NEXT: v_pk_max_f16 [[CANON1:v[0-9]+]], v1, v1
+; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0
+; GFX9-NEXT: v_pk_min_f16 [[MIN:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}}
+
+; FIXME: Extra canonicalize leftover
+; GFX9-NEXT: v_max_f16_sdwa [[TMP:v[0-9]+]], [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_min_f16_e32 v0, [[MIN]], [[TMP]]
+
+
+; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0
+; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1
 
-; VI:      v_min_f16_sdwa
-; VI-NEXT: v_min_f16_e32
-; VI-NEXT: v_min_f16_e32
+; VI-DAG: v_min_f16_e32 [[MAX0:v[0-9]+]], [[CANON1]], [[CANON3]]
+; VI-DAG: v_min_f16_e32 [[MAX1:v[0-9]+]], [[CANON0]], [[CANON2]]
+; VI: v_min_f16_e32 v0, [[MAX1]], [[MAX0]]
 define half @reduction_fast_min_pattern_v4f16(<4 x half> %vec4) {
 entry:
   %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-- 
GitLab


From b582972288e8c94ef1ead61090d5d67d9bffd130 Mon Sep 17 00:00:00 2001
From: Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>
Date: Mon, 22 Oct 2018 16:28:07 +0000
Subject: [PATCH 0388/1116] [llvm-mca] Remove a couple of using directives and
 a bunch of redundant namespace llvm prefixes. NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344916 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-mca/CodeRegion.cpp                   | 17 ++++++++---------
 tools/llvm-mca/PipelinePrinter.cpp              |  2 --
 tools/llvm-mca/Views/DispatchStatistics.cpp     |  2 +-
 tools/llvm-mca/Views/RegisterFileStatistics.cpp |  2 +-
 .../Views/RetireControlUnitStatistics.cpp       |  2 +-
 tools/llvm-mca/Views/SchedulerStatistics.cpp    |  2 +-
 tools/llvm-mca/Views/SummaryView.cpp            |  2 +-
 7 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/tools/llvm-mca/CodeRegion.cpp b/tools/llvm-mca/CodeRegion.cpp
index c26658a6cf5..591c45feb6d 100644
--- a/tools/llvm-mca/CodeRegion.cpp
+++ b/tools/llvm-mca/CodeRegion.cpp
@@ -14,11 +14,9 @@
 
 #include "CodeRegion.h"
 
-using namespace llvm;
-
 namespace mca {
 
-bool CodeRegion::isLocInRange(SMLoc Loc) const {
+bool CodeRegion::isLocInRange(llvm::SMLoc Loc) const {
   if (RangeEnd.isValid() && Loc.getPointer() > RangeEnd.getPointer())
     return false;
   if (RangeStart.isValid() && Loc.getPointer() < RangeStart.getPointer())
@@ -26,11 +24,11 @@ bool CodeRegion::isLocInRange(SMLoc Loc) const {
   return true;
 }
 
-void CodeRegions::beginRegion(StringRef Description, SMLoc Loc) {
+void CodeRegions::beginRegion(llvm::StringRef Description, llvm::SMLoc Loc) {
   assert(!Regions.empty() && "Missing Default region");
   const CodeRegion &CurrentRegion = *Regions.back();
   if (CurrentRegion.startLoc().isValid() && !CurrentRegion.endLoc().isValid()) {
-    SM.PrintMessage(Loc, SourceMgr::DK_Warning,
+    SM.PrintMessage(Loc, llvm::SourceMgr::DK_Warning,
                     "Ignoring invalid region start");
     return;
   }
@@ -41,19 +39,20 @@ void CodeRegions::beginRegion(StringRef Description, SMLoc Loc) {
   addRegion(Description, Loc);
 }
 
-void CodeRegions::endRegion(SMLoc Loc) {
+void CodeRegions::endRegion(llvm::SMLoc Loc) {
   assert(!Regions.empty() && "Missing Default region");
   CodeRegion &CurrentRegion = *Regions.back();
   if (CurrentRegion.endLoc().isValid()) {
-    SM.PrintMessage(Loc, SourceMgr::DK_Warning, "Ignoring invalid region end");
+    SM.PrintMessage(Loc, llvm::SourceMgr::DK_Warning,
+                    "Ignoring invalid region end");
     return;
   }
 
   CurrentRegion.setEndLocation(Loc);
 }
 
-void CodeRegions::addInstruction(const MCInst &Instruction) {
-  const SMLoc &Loc = Instruction.getLoc();
+void CodeRegions::addInstruction(const llvm::MCInst &Instruction) {
+  const llvm::SMLoc &Loc = Instruction.getLoc();
   const auto It =
       std::find_if(Regions.rbegin(), Regions.rend(),
                    [Loc](const std::unique_ptr<CodeRegion> &Region) {
diff --git a/tools/llvm-mca/PipelinePrinter.cpp b/tools/llvm-mca/PipelinePrinter.cpp
index 619f22cc810..8b2157a8eb6 100644
--- a/tools/llvm-mca/PipelinePrinter.cpp
+++ b/tools/llvm-mca/PipelinePrinter.cpp
@@ -17,8 +17,6 @@
 
 namespace mca {
 
-using namespace llvm;
-
 void PipelinePrinter::printReport(llvm::raw_ostream &OS) const {
   for (const auto &V : Views)
     V->printView(OS);
diff --git a/tools/llvm-mca/Views/DispatchStatistics.cpp b/tools/llvm-mca/Views/DispatchStatistics.cpp
index cccb09a9fa7..98adcfb450d 100644
--- a/tools/llvm-mca/Views/DispatchStatistics.cpp
+++ b/tools/llvm-mca/Views/DispatchStatistics.cpp
@@ -33,7 +33,7 @@ void DispatchStatistics::onEvent(const HWInstructionEvent &Event) {
   NumDispatched += DE.MicroOpcodes;
 }
 
-void DispatchStatistics::printDispatchHistogram(llvm::raw_ostream &OS) const {
+void DispatchStatistics::printDispatchHistogram(raw_ostream &OS) const {
   std::string Buffer;
   raw_string_ostream TempStream(Buffer);
   TempStream << "\n\nDispatch Logic - "
diff --git a/tools/llvm-mca/Views/RegisterFileStatistics.cpp b/tools/llvm-mca/Views/RegisterFileStatistics.cpp
index cd540e9dc60..2697f528a0a 100644
--- a/tools/llvm-mca/Views/RegisterFileStatistics.cpp
+++ b/tools/llvm-mca/Views/RegisterFileStatistics.cpp
@@ -19,7 +19,7 @@ using namespace llvm;
 
 namespace mca {
 
-RegisterFileStatistics::RegisterFileStatistics(const llvm::MCSubtargetInfo &sti)
+RegisterFileStatistics::RegisterFileStatistics(const MCSubtargetInfo &sti)
     : STI(sti) {
   const MCSchedModel &SM = STI.getSchedModel();
   RegisterFileUsage Empty = {0, 0, 0};
diff --git a/tools/llvm-mca/Views/RetireControlUnitStatistics.cpp b/tools/llvm-mca/Views/RetireControlUnitStatistics.cpp
index d5aab396b4c..a9a4ac9a33d 100644
--- a/tools/llvm-mca/Views/RetireControlUnitStatistics.cpp
+++ b/tools/llvm-mca/Views/RetireControlUnitStatistics.cpp
@@ -24,7 +24,7 @@ void RetireControlUnitStatistics::onEvent(const HWInstructionEvent &Event) {
     ++NumRetired;
 }
 
-void RetireControlUnitStatistics::printView(llvm::raw_ostream &OS) const {
+void RetireControlUnitStatistics::printView(raw_ostream &OS) const {
   std::string Buffer;
   raw_string_ostream TempStream(Buffer);
   TempStream << "\n\nRetire Control Unit - "
diff --git a/tools/llvm-mca/Views/SchedulerStatistics.cpp b/tools/llvm-mca/Views/SchedulerStatistics.cpp
index bc91bf04a81..958b3b548f4 100644
--- a/tools/llvm-mca/Views/SchedulerStatistics.cpp
+++ b/tools/llvm-mca/Views/SchedulerStatistics.cpp
@@ -121,7 +121,7 @@ void SchedulerStatistics::printSchedulerUsage(raw_ostream &OS) const {
   FOS.flush();
 }
 
-void SchedulerStatistics::printView(llvm::raw_ostream &OS) const {
+void SchedulerStatistics::printView(raw_ostream &OS) const {
   printSchedulerStats(OS);
   printSchedulerUsage(OS);
 }
diff --git a/tools/llvm-mca/Views/SummaryView.cpp b/tools/llvm-mca/Views/SummaryView.cpp
index eb4c50c5d1f..98f3410d61b 100644
--- a/tools/llvm-mca/Views/SummaryView.cpp
+++ b/tools/llvm-mca/Views/SummaryView.cpp
@@ -24,7 +24,7 @@ namespace mca {
 
 using namespace llvm;
 
-SummaryView::SummaryView(const llvm::MCSchedModel &Model, const SourceMgr &S,
+SummaryView::SummaryView(const MCSchedModel &Model, const SourceMgr &S,
                          unsigned Width)
     : SM(Model), Source(S), DispatchWidth(Width), TotalCycles(0),
       NumMicroOps(0), ProcResourceUsage(Model.getNumProcResourceKinds(), 0),
-- 
GitLab


From 19bc740d06f6318daea26a6e8b6d44e2d19589e1 Mon Sep 17 00:00:00 2001
From: Vedant Kumar <vsk@apple.com>
Date: Mon, 22 Oct 2018 16:50:24 +0000
Subject: [PATCH 0389/1116] [test] Relax test/Other/opt-hot-cold-split.ll

On some ARM bots, 'Target Pass Configuration' does not run after 'Target
Transform Info'. Relax this pipeline test to allow that.

This is the same fix as in r328167.

Bot URL: http://lab.llvm.org:8011/builders/clang-cmake-armv7-quick/builds/4611

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344919 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Other/opt-hot-cold-split.ll | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/Other/opt-hot-cold-split.ll b/test/Other/opt-hot-cold-split.ll
index ba92ccab636..a3fbdeffb2f 100644
--- a/test/Other/opt-hot-cold-split.ll
+++ b/test/Other/opt-hot-cold-split.ll
@@ -18,8 +18,8 @@
 ; CHECK-NEXT:  Pass Arguments:
 ; CHECK-NEXT:  Target Library Information
 ; CHECK-NEXT:  Target Transform Information
-; CHECK-NEXT:  Target Pass Configuration
-; CHECK-NEXT:  Type-Based Alias Analysis
+;              Target Pass Configuration
+; CHECK:       Type-Based Alias Analysis
 ; CHECK-NEXT:  Scoped NoAlias Alias Analysis
 ; CHECK-NEXT:  Assumption Cache Tracker
 ; CHECK-NEXT:  Profile summary info
-- 
GitLab


From 2af624687d14f3508020781ad76b85d938ff40b8 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Mon, 22 Oct 2018 16:59:24 +0000
Subject: [PATCH 0390/1116] Revert r344877 "[X86] Stop promoting integer loads
 to vXi64"

Sam McCall reported miscompiles in some tensorflow code. Reverting while I try to figure out.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344921 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelDAGToDAG.cpp            |  28 +-
 lib/Target/X86/X86ISelLowering.cpp            |  28 +-
 lib/Target/X86/X86InstrAVX512.td              | 243 +++----
 lib/Target/X86/X86InstrFragmentsSIMD.td       |  54 +-
 lib/Target/X86/X86InstrSSE.td                 | 627 ++++++++----------
 lib/Target/X86/X86InstrXOP.td                 |  75 +--
 lib/Target/X86/X86MCInstLower.cpp             |  26 +-
 .../X86/X86ShuffleDecodeConstantPool.cpp      |  58 +-
 lib/Target/X86/X86ShuffleDecodeConstantPool.h |  13 +-
 test/CodeGen/X86/avx-vperm2x128.ll            |   2 +-
 test/CodeGen/X86/oddshuffles.ll               |  24 +-
 test/CodeGen/X86/pshufb-mask-comments.ll      |   6 +-
 test/CodeGen/X86/vector-extend-inreg.ll       |   2 +-
 test/CodeGen/X86/vector-idiv-v2i32.ll         |  18 +-
 test/CodeGen/X86/widened-broadcast.ll         |  95 ++-
 15 files changed, 589 insertions(+), 710 deletions(-)

diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index d3aa5c89adc..9dbc2761a6c 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -2890,17 +2890,21 @@ MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
   const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
   Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
 
-  // Try to fold a load. No need to check alignment.
+  // If there is a load, it will be behind a bitcast. We don't need to check
+  // alignment on this load.
   SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
-  if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
+  if (MayFoldLoad && N1->getOpcode() == ISD::BITCAST && N1->hasOneUse() &&
+      tryFoldLoad(Node, N1.getNode(), N1.getOperand(0), Tmp0, Tmp1, Tmp2,
+                  Tmp3, Tmp4)) {
+    SDValue Load = N1.getOperand(0);
     SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
-                      N1.getOperand(0) };
+                      Load.getOperand(0) };
     SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
     MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
     // Update the chain.
-    ReplaceUses(N1.getValue(1), SDValue(CNode, 2));
+    ReplaceUses(Load.getValue(1), SDValue(CNode, 2));
     // Record the mem-refs
-    CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
+    CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(Load)->getMemOperand()});
     return CNode;
   }
 
@@ -2923,18 +2927,22 @@ MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
   const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
   Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
 
-  // Try to fold a load. No need to check alignment.
+  // If there is a load, it will be behind a bitcast. We don't need to check
+  // alignment on this load.
   SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
-  if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
+  if (MayFoldLoad && N2->getOpcode() == ISD::BITCAST && N2->hasOneUse() &&
+      tryFoldLoad(Node, N2.getNode(), N2.getOperand(0), Tmp0, Tmp1, Tmp2,
+                  Tmp3, Tmp4)) {
+    SDValue Load = N2.getOperand(0);
     SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
-                      N2.getOperand(0), InFlag };
+                      Load.getOperand(0), InFlag };
     SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
     MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
     InFlag = SDValue(CNode, 3);
     // Update the chain.
-    ReplaceUses(N2.getValue(1), SDValue(CNode, 2));
+    ReplaceUses(Load.getValue(1), SDValue(CNode, 2));
     // Record the mem-refs
-    CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()});
+    CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(Load)->getMemOperand()});
     return CNode;
   }
 
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 51ad0bdf00e..8f4e2ad5ed6 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -869,6 +869,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
     }
 
+    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
+    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
+      setOperationPromotedToType(ISD::LOAD,   VT, MVT::v2i64);
+    }
+
     // Custom lower v2i64 and v2f64 selects.
     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
@@ -1173,6 +1178,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     if (HasInt256)
       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
 
+    // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
+    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
+      setOperationPromotedToType(ISD::LOAD,   VT, MVT::v4i64);
+    }
+
     if (HasInt256) {
       // Custom legalize 2x32 to get a little better code.
       setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
@@ -1409,6 +1419,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::MGATHER,             VT, Custom);
       setOperationAction(ISD::MSCATTER,            VT, Custom);
     }
+    for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
+      setOperationPromotedToType(ISD::LOAD,   VT, MVT::v8i64);
+    }
+
     // Need to custom split v32i16/v64i8 bitcasts.
     if (!Subtarget.hasBWI()) {
       setOperationAction(ISD::BITCAST, MVT::v32i16, Custom);
@@ -5525,7 +5539,7 @@ static const Constant *getTargetConstantFromNode(SDValue Op) {
   if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
     return nullptr;
 
-  return CNode->getConstVal();
+  return dyn_cast<Constant>(CNode->getConstVal());
 }
 
 // Extract raw constant bits from constant pools.
@@ -6030,7 +6044,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
       break;
     }
     if (auto *C = getTargetConstantFromNode(MaskNode)) {
-      DecodeVPERMILPMask(C, MaskEltSize, VT.getSizeInBits(), Mask);
+      DecodeVPERMILPMask(C, MaskEltSize, Mask);
       break;
     }
     return false;
@@ -6047,7 +6061,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
       break;
     }
     if (auto *C = getTargetConstantFromNode(MaskNode)) {
-      DecodePSHUFBMask(C, VT.getSizeInBits(), Mask);
+      DecodePSHUFBMask(C, Mask);
       break;
     }
     return false;
@@ -6109,7 +6123,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
         break;
       }
       if (auto *C = getTargetConstantFromNode(MaskNode)) {
-        DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, VT.getSizeInBits(), Mask);
+        DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
         break;
       }
     }
@@ -6126,7 +6140,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
       break;
     }
     if (auto *C = getTargetConstantFromNode(MaskNode)) {
-      DecodeVPPERMMask(C, VT.getSizeInBits(), Mask);
+      DecodeVPPERMMask(C, Mask);
       break;
     }
     return false;
@@ -6143,7 +6157,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
       break;
     }
     if (auto *C = getTargetConstantFromNode(MaskNode)) {
-      DecodeVPERMVMask(C, MaskEltSize, VT.getSizeInBits(), Mask);
+      DecodeVPERMVMask(C, MaskEltSize, Mask);
       break;
     }
     return false;
@@ -6157,7 +6171,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
     Ops.push_back(N->getOperand(2));
     SDValue MaskNode = N->getOperand(1);
     if (auto *C = getTargetConstantFromNode(MaskNode)) {
-      DecodeVPERMV3Mask(C, MaskEltSize, VT.getSizeInBits(), Mask);
+      DecodeVPERMV3Mask(C, MaskEltSize, Mask);
       break;
     }
     return false;
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index b2d0ce2bcd3..72dd4ec8034 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -66,16 +66,21 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
                            !if (!eq (EltTypeName, "f64"), !cast<Operand>("sdmem"), ?));
 
   // Load patterns
-  PatFrag LdFrag = !cast<PatFrag>("load" # VTName);
-
-  PatFrag i64LdFrag = !cast<PatFrag>("load" #
-                                     !if (!eq (TypeVariantName, "i"),
-                                          !if (!eq (Size, 128), "v2i64",
-                                          !if (!eq (Size, 256), "v4i64",
-                                          !if (!eq (Size, 512), "v8i64",
-                                               VTName))), VTName));
-
-  PatFrag AlignedLdFrag = !cast<PatFrag>("alignedload" # VTName);
+  // Note: For 128/256-bit integer VT we choose loadv2i64/loadv4i64
+  //       due to load promotion during legalization
+  PatFrag LdFrag = !cast<PatFrag>("load" #
+                                  !if (!eq (TypeVariantName, "i"),
+                                       !if (!eq (Size, 128), "v2i64",
+                                       !if (!eq (Size, 256), "v4i64",
+                                       !if (!eq (Size, 512), "v8i64",
+                                            VTName))), VTName));
+
+  PatFrag AlignedLdFrag = !cast<PatFrag>("alignedload" #
+                                         !if (!eq (TypeVariantName, "i"),
+                                               !if (!eq (Size, 128), "v2i64",
+                                               !if (!eq (Size, 256), "v4i64",
+                                               !if (!eq (Size, 512), "v8i64",
+                                                   VTName))), VTName));
 
   PatFrag ScalarLdFrag = !cast<PatFrag>("load" # EltVT);
 
@@ -513,10 +518,10 @@ multiclass vinsert_for_size_split<int Opcode, X86VectorVTInfo From,
                    "vinsert" # From.EltTypeName # "x" # From.NumElts,
                    "$src3, $src2, $src1", "$src1, $src2, $src3",
                    (vinsert_insert:$src3 (To.VT To.RC:$src1),
-                               (From.VT (From.LdFrag addr:$src2)),
+                               (From.VT (bitconvert (From.LdFrag addr:$src2))),
                                (iPTR imm)),
                    (vinsert_for_mask:$src3 (To.VT To.RC:$src1),
-                               (From.VT (From.LdFrag addr:$src2)),
+                               (From.VT (bitconvert (From.LdFrag addr:$src2))),
                                (iPTR imm))>, AVX512AIi8Base, EVEX_4V,
                    EVEX_CD8<From.EltSize, From.CD8TupleForm>,
                    Sched<[sched.Folded, sched.ReadAfterFold]>;
@@ -542,7 +547,7 @@ multiclass vinsert_for_size_lowering<string InstrStr, X86VectorVTInfo From,
 
     def : Pat<(vinsert_insert:$ins
                   (To.VT To.RC:$src1),
-                  (From.VT (From.LdFrag addr:$src2)),
+                  (From.VT (bitconvert (From.LdFrag addr:$src2))),
                   (iPTR imm)),
               (To.VT (!cast<Instruction>(InstrStr#"rm")
                   To.RC:$src1, addr:$src2,
@@ -675,7 +680,9 @@ let Predicates = p in {
              (vselect Cast.KRCWM:$mask,
                       (bitconvert
                        (vinsert_insert:$ins (To.VT To.RC:$src1),
-                                            (From.VT (From.LdFrag addr:$src2)),
+                                            (From.VT
+                                             (bitconvert
+                                              (From.LdFrag addr:$src2))),
                                             (iPTR imm))),
                       Cast.ImmAllZerosV)),
             (!cast<Instruction>(InstrStr#"rmkz")
@@ -1367,7 +1374,7 @@ multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr,
   defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
                            (ins _Src.MemOp:$src), OpcodeStr, "$src", "$src",
                            (_Dst.VT (X86SubVBroadcast
-                             (_Src.VT (_Src.LdFrag addr:$src))))>,
+                             (_Src.VT (bitconvert (_Src.LdFrag addr:$src)))))>,
                            Sched<[SchedWriteShuffle.YMM.Folded]>,
                            AVX5128IBase, EVEX;
 }
@@ -1382,7 +1389,7 @@ multiclass avx512_subvec_broadcast_rm_dq<bits<8> opc, string OpcodeStr,
                            (ins _Src.MemOp:$src), OpcodeStr, "$src", "$src",
                            (null_frag),
                            (_Dst.VT (X86SubVBroadcast
-                             (_Src.VT (_Src.LdFrag addr:$src))))>,
+                             (_Src.VT (bitconvert (_Src.LdFrag addr:$src)))))>,
                            Sched<[SchedWriteShuffle.YMM.Folded]>,
                            AVX5128IBase, EVEX;
 }
@@ -1435,11 +1442,11 @@ defm VBROADCASTF64X4 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf64x4",
 let Predicates = [HasAVX512] in {
 def : Pat<(v16f32 (X86SubVBroadcast (loadv8f32 addr:$src))),
           (VBROADCASTF64X4rm addr:$src)>;
-def : Pat<(v16i32 (X86SubVBroadcast (loadv8i32 addr:$src))),
+def : Pat<(v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src)))),
           (VBROADCASTI64X4rm addr:$src)>;
-def : Pat<(v32i16 (X86SubVBroadcast (loadv16i16 addr:$src))),
+def : Pat<(v32i16 (X86SubVBroadcast (bc_v16i16 (loadv4i64 addr:$src)))),
           (VBROADCASTI64X4rm addr:$src)>;
-def : Pat<(v64i8 (X86SubVBroadcast (loadv32i8 addr:$src))),
+def : Pat<(v64i8 (X86SubVBroadcast (bc_v32i8 (loadv4i64 addr:$src)))),
           (VBROADCASTI64X4rm addr:$src)>;
 
 // Provide fallback in case the load node that is used in the patterns above
@@ -1467,9 +1474,9 @@ def : Pat<(v8f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
           (VBROADCASTF32X4rm addr:$src)>;
 def : Pat<(v8i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
           (VBROADCASTI32X4rm addr:$src)>;
-def : Pat<(v32i16 (X86SubVBroadcast (loadv8i16 addr:$src))),
+def : Pat<(v32i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
           (VBROADCASTI32X4rm addr:$src)>;
-def : Pat<(v64i8 (X86SubVBroadcast (loadv16i8 addr:$src))),
+def : Pat<(v64i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
           (VBROADCASTI32X4rm addr:$src)>;
 
 // Patterns for selects of bitcasted operations.
@@ -1499,11 +1506,11 @@ def : Pat<(vselect VK8WM:$mask,
                    VR512:$src0),
           (VBROADCASTF64X4rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
 def : Pat<(vselect VK8WM:$mask,
-                   (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv8i32 addr:$src)))),
+                   (bc_v8i64 (v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src))))),
                    (bc_v8i64 (v16i32 immAllZerosV))),
           (VBROADCASTI64X4rmkz VK8WM:$mask, addr:$src)>;
 def : Pat<(vselect VK8WM:$mask,
-                   (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv8i32 addr:$src)))),
+                   (bc_v8i64 (v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src))))),
                    VR512:$src0),
           (VBROADCASTI64X4rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
 }
@@ -1520,9 +1527,9 @@ def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
           (VBROADCASTF32X4Z256rm addr:$src)>;
 def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
           (VBROADCASTI32X4Z256rm addr:$src)>;
-def : Pat<(v16i16 (X86SubVBroadcast (loadv8i16 addr:$src))),
+def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
           (VBROADCASTI32X4Z256rm addr:$src)>;
-def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))),
+def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
           (VBROADCASTI32X4Z256rm addr:$src)>;
 
 // Patterns for selects of bitcasted operations.
@@ -1584,11 +1591,11 @@ def : Pat<(vselect VK4WM:$mask,
                    VR256X:$src0),
           (VBROADCASTF64X2Z128rmk VR256X:$src0, VK4WM:$mask, addr:$src)>;
 def : Pat<(vselect VK4WM:$mask,
-                   (bc_v4i64 (v8i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
+                   (bc_v4i64 (v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src))))),
                    (bc_v4i64 (v8i32 immAllZerosV))),
           (VBROADCASTI64X2Z128rmkz VK4WM:$mask, addr:$src)>;
 def : Pat<(vselect VK4WM:$mask,
-                   (bc_v4i64 (v8i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
+                   (bc_v4i64 (v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src))))),
                    VR256X:$src0),
           (VBROADCASTI64X2Z128rmk VR256X:$src0, VK4WM:$mask, addr:$src)>;
 }
@@ -1634,11 +1641,11 @@ def : Pat<(vselect VK8WM:$mask,
                    VR512:$src0),
           (VBROADCASTF64X2rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
 def : Pat<(vselect VK8WM:$mask,
-                   (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
+                   (bc_v8i64 (v16i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src))))),
                    (bc_v8i64 (v16i32 immAllZerosV))),
           (VBROADCASTI64X2rmkz VK8WM:$mask, addr:$src)>;
 def : Pat<(vselect VK8WM:$mask,
-                   (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
+                   (bc_v8i64 (v16i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src))))),
                    VR512:$src0),
           (VBROADCASTI64X2rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
 }
@@ -1734,7 +1741,7 @@ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain,
             (ins _.RC:$src2, _.MemOp:$src3),
             OpcodeStr, "$src3, $src2", "$src2, $src3",
             (_.VT (X86VPermt2 _.RC:$src2, IdxVT.RC:$src1,
-                   (_.VT (_.LdFrag addr:$src3)))), 1>,
+                   (_.VT (bitconvert (_.LdFrag addr:$src3))))), 1>,
             EVEX_4V, AVX5128IBase, Sched<[sched.Folded, sched.ReadAfterFold]>;
   }
 }
@@ -1852,7 +1859,7 @@ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
             (ins IdxVT.RC:$src2, _.MemOp:$src3),
             OpcodeStr, "$src3, $src2", "$src2, $src3",
             (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2,
-                   (_.LdFrag addr:$src3))), 1>,
+                   (bitconvert (_.LdFrag addr:$src3)))), 1>,
             EVEX_4V, AVX5128IBase, Sched<[sched.Folded, sched.ReadAfterFold]>;
   }
 }
@@ -2142,7 +2149,7 @@ multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, PatFrag OpNode,
              (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2),
              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
-                                       (_.VT (_.LdFrag addr:$src2))))]>,
+                                       (_.VT (bitconvert (_.LdFrag addr:$src2)))))]>,
              EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
   let isCommutable = IsCommutable in
   def rrk : AVX512BI<opc, MRMSrcReg,
@@ -2158,7 +2165,8 @@ multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, PatFrag OpNode,
                           "$dst {${mask}}, $src1, $src2}"),
               [(set _.KRC:$dst, (and _.KRCWM:$mask,
                                    (OpNode (_.VT _.RC:$src1),
-                                       (_.VT (_.LdFrag addr:$src2)))))]>,
+                                       (_.VT (bitconvert
+                                              (_.LdFrag addr:$src2))))))]>,
               EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
@@ -2283,7 +2291,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag,
              [(set _.KRC:$dst, (_.KVT
                                 (Frag:$cc
                                  (_.VT _.RC:$src1),
-                                 (_.VT (_.LdFrag addr:$src2)),
+                                 (_.VT (bitconvert (_.LdFrag addr:$src2))),
                                  cond)))]>,
              EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
   let isCommutable = 1 in
@@ -2308,7 +2316,8 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag,
                                      (_.KVT
                                       (Frag:$cc
                                        (_.VT _.RC:$src1),
-                                       (_.VT (_.LdFrag addr:$src2)),
+                                       (_.VT (bitconvert
+                                              (_.LdFrag addr:$src2))),
                                        cond))))]>,
               EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
 
@@ -2343,13 +2352,13 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag,
                NotMemoryFoldable;
   }
 
-  def : Pat<(_.KVT (CommFrag:$cc (_.LdFrag addr:$src2),
+  def : Pat<(_.KVT (CommFrag:$cc (bitconvert (_.LdFrag addr:$src2)),
                                  (_.VT _.RC:$src1), cond)),
             (!cast<Instruction>(Name#_.ZSuffix#"rmi")
              _.RC:$src1, addr:$src2, (CommFrag.OperandTransform $cc))>;
 
   def : Pat<(and _.KRCWM:$mask,
-                 (_.KVT (CommFrag:$cc (_.LdFrag addr:$src2),
+                 (_.KVT (CommFrag:$cc (bitconvert (_.LdFrag addr:$src2)),
                                       (_.VT _.RC:$src1), cond))),
             (!cast<Instruction>(Name#_.ZSuffix#"rmik")
              _.KRCWM:$mask, _.RC:$src1, addr:$src2,
@@ -2535,7 +2544,7 @@ multiclass avx512_vcmp_common<X86FoldableSchedWrite sched, X86VectorVTInfo _,
                 "vcmp${cc}"#_.Suffix,
                 "$src2, $src1", "$src1, $src2",
                 (X86cmpm (_.VT _.RC:$src1),
-                        (_.VT (_.LdFrag addr:$src2)),
+                        (_.VT (bitconvert (_.LdFrag addr:$src2))),
                         imm:$cc)>,
                 Sched<[sched.Folded, sched.ReadAfterFold]>;
 
@@ -2723,7 +2732,7 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
                     OpcodeStr##_.Suffix##mem#
                     "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(set _.KRC:$dst,(OpNode
-                                     (_.VT (_.LdFrag addr:$src1)),
+                                     (_.VT (bitconvert (_.LdFrag addr:$src1))),
                                      (i32 imm:$src2)))]>,
                     Sched<[sched.Folded, sched.ReadAfterFold]>;
   def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
@@ -2731,7 +2740,7 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
                     OpcodeStr##_.Suffix##mem#
                     "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
                     [(set _.KRC:$dst, (and _.KRCWM:$mask, (OpNode
-                                  (_.VT (_.LdFrag addr:$src1)),
+                                  (_.VT (bitconvert (_.LdFrag addr:$src1))),
                                   (i32 imm:$src2))))]>,
                     EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
   def rmb : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
@@ -3344,7 +3353,7 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, string Name,
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     !if(NoRMPattern, [],
                         [(set _.RC:$dst,
-                          (_.VT (ld_frag addr:$src)))]),
+                          (_.VT (bitconvert (ld_frag addr:$src))))]),
                     _.ExeDomain>, EVEX, Sched<[Sched.RM]>,
                     EVEX2VEXOverride<EVEX2VEXOvrd#"rm">;
 
@@ -3363,7 +3372,7 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, string Name,
                       "${dst} {${mask}}, $src1}"),
                      [(set _.RC:$dst, (_.VT
                          (vselect _.KRCWM:$mask,
-                          (_.VT (ld_frag addr:$src1)),
+                          (_.VT (bitconvert (ld_frag addr:$src1))),
                            (_.VT _.RC:$src0))))], _.ExeDomain>,
                      EVEX, EVEX_K, Sched<[Sched.RM]>;
   }
@@ -3372,7 +3381,7 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, string Name,
                   OpcodeStr #"\t{$src, ${dst} {${mask}} {z}|"#
                                 "${dst} {${mask}} {z}, $src}",
                   [(set _.RC:$dst, (_.VT (vselect _.KRCWM:$mask,
-                    (_.VT (ld_frag addr:$src)), _.ImmAllZerosV)))],
+                    (_.VT (bitconvert (ld_frag addr:$src))), _.ImmAllZerosV)))],
                   _.ExeDomain>, EVEX, EVEX_KZ, Sched<[Sched.RM]>;
   }
   def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, undef)),
@@ -3672,20 +3681,6 @@ let Predicates = [HasBWI, NoVLX] in {
 }
 
 let Predicates = [HasAVX512] in {
-  // 512-bit load.
-  def : Pat<(alignedloadv16i32 addr:$src),
-            (VMOVDQA64Zrm addr:$src)>;
-  def : Pat<(alignedloadv32i16 addr:$src),
-            (VMOVDQA64Zrm addr:$src)>;
-  def : Pat<(alignedloadv64i8 addr:$src),
-            (VMOVDQA64Zrm addr:$src)>;
-  def : Pat<(loadv16i32 addr:$src),
-            (VMOVDQU64Zrm addr:$src)>;
-  def : Pat<(loadv32i16 addr:$src),
-            (VMOVDQU64Zrm addr:$src)>;
-  def : Pat<(loadv64i8 addr:$src),
-            (VMOVDQU64Zrm addr:$src)>;
-
   // 512-bit store.
   def : Pat<(alignedstore (v16i32 VR512:$src), addr:$dst),
             (VMOVDQA64Zmr addr:$dst, VR512:$src)>;
@@ -3702,20 +3697,6 @@ let Predicates = [HasAVX512] in {
 }
 
 let Predicates = [HasVLX] in {
-  // 128-bit load.
-  def : Pat<(alignedloadv4i32 addr:$src),
-            (VMOVDQA64Z128rm addr:$src)>;
-  def : Pat<(alignedloadv8i16 addr:$src),
-            (VMOVDQA64Z128rm addr:$src)>;
-  def : Pat<(alignedloadv16i8 addr:$src),
-            (VMOVDQA64Z128rm addr:$src)>;
-  def : Pat<(loadv4i32 addr:$src),
-            (VMOVDQU64Z128rm addr:$src)>;
-  def : Pat<(loadv8i16 addr:$src),
-            (VMOVDQU64Z128rm addr:$src)>;
-  def : Pat<(loadv16i8 addr:$src),
-            (VMOVDQU64Z128rm addr:$src)>;
-
   // 128-bit store.
   def : Pat<(alignedstore (v4i32 VR128X:$src), addr:$dst),
             (VMOVDQA64Z128mr addr:$dst, VR128X:$src)>;
@@ -3730,20 +3711,6 @@ let Predicates = [HasVLX] in {
   def : Pat<(store (v16i8 VR128X:$src), addr:$dst),
             (VMOVDQU64Z128mr addr:$dst, VR128X:$src)>;
 
-  // 256-bit load.
-  def : Pat<(alignedloadv8i32 addr:$src),
-            (VMOVDQA64Z256rm addr:$src)>;
-  def : Pat<(alignedloadv16i16 addr:$src),
-            (VMOVDQA64Z256rm addr:$src)>;
-  def : Pat<(alignedloadv32i8 addr:$src),
-            (VMOVDQA64Z256rm addr:$src)>;
-  def : Pat<(loadv8i32 addr:$src),
-            (VMOVDQU64Z256rm addr:$src)>;
-  def : Pat<(loadv16i16 addr:$src),
-            (VMOVDQU64Z256rm addr:$src)>;
-  def : Pat<(loadv32i8 addr:$src),
-            (VMOVDQU64Z256rm addr:$src)>;
-
   // 256-bit store.
   def : Pat<(alignedstore (v8i32 VR256X:$src), addr:$dst),
             (VMOVDQA64Z256mr addr:$dst, VR256X:$src)>;
@@ -4528,7 +4495,7 @@ let Predicates = [HasAVX512] in {
             (VMOVDI2PDIZrm addr:$src)>;
   def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
             (VMOVDI2PDIZrm addr:$src)>;
-  def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
+  def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
             (VMOVDI2PDIZrm addr:$src)>;
   def : Pat<(v4i32 (X86vzload addr:$src)),
             (VMOVDI2PDIZrm addr:$src)>;
@@ -4624,12 +4591,6 @@ let Predicates = [HasAVX512], AddedComplexity = 400 in {
             (VMOVNTDQAZrm addr:$src)>;
   def : Pat<(v8i64 (alignednontemporalload addr:$src)),
             (VMOVNTDQAZrm addr:$src)>;
-  def : Pat<(v16i32 (alignednontemporalload addr:$src)),
-            (VMOVNTDQAZrm addr:$src)>;
-  def : Pat<(v32i16 (alignednontemporalload addr:$src)),
-            (VMOVNTDQAZrm addr:$src)>;
-  def : Pat<(v64i8 (alignednontemporalload addr:$src)),
-            (VMOVNTDQAZrm addr:$src)>;
 }
 
 let Predicates = [HasVLX], AddedComplexity = 400 in {
@@ -4646,12 +4607,6 @@ let Predicates = [HasVLX], AddedComplexity = 400 in {
             (VMOVNTDQAZ256rm addr:$src)>;
   def : Pat<(v4i64 (alignednontemporalload addr:$src)),
             (VMOVNTDQAZ256rm addr:$src)>;
-  def : Pat<(v8i32 (alignednontemporalload addr:$src)),
-            (VMOVNTDQAZ256rm addr:$src)>;
-  def : Pat<(v16i16 (alignednontemporalload addr:$src)),
-            (VMOVNTDQAZ256rm addr:$src)>;
-  def : Pat<(v32i8 (alignednontemporalload addr:$src)),
-            (VMOVNTDQAZ256rm addr:$src)>;
 
   def : Pat<(alignednontemporalstore (v4i32 VR128X:$src), addr:$dst),
             (VMOVNTDQZ128mr addr:$dst, VR128X:$src)>;
@@ -4666,12 +4621,6 @@ let Predicates = [HasVLX], AddedComplexity = 400 in {
             (VMOVNTDQAZ128rm addr:$src)>;
   def : Pat<(v2i64 (alignednontemporalload addr:$src)),
             (VMOVNTDQAZ128rm addr:$src)>;
-  def : Pat<(v4i32 (alignednontemporalload addr:$src)),
-            (VMOVNTDQAZ128rm addr:$src)>;
-  def : Pat<(v8i16 (alignednontemporalload addr:$src)),
-            (VMOVNTDQAZ128rm addr:$src)>;
-  def : Pat<(v16i8 (alignednontemporalload addr:$src)),
-            (VMOVNTDQAZ128rm addr:$src)>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -4690,7 +4639,8 @@ multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
   defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
                   (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
                   "$src2, $src1", "$src1, $src2",
-                  (_.VT (OpNode _.RC:$src1, (_.LdFrag addr:$src2)))>,
+                  (_.VT (OpNode _.RC:$src1,
+                                (bitconvert (_.LdFrag addr:$src2))))>,
                   AVX512BIBase, EVEX_4V,
                   Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
@@ -4821,7 +4771,7 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr,
                         (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr,
                         "$src2, $src1", "$src1, $src2",
                         (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1),
-                                      (_Src.LdFrag addr:$src2)))>,
+                                      (bitconvert (_Src.LdFrag addr:$src2))))>,
                         AVX512BIBase, EVEX_4V,
                         Sched<[sched.Folded, sched.ReadAfterFold]>;
 
@@ -4926,7 +4876,7 @@ multiclass avx512_packs_rm<bits<8> opc, string OpcodeStr,
                         (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr,
                         "$src2, $src1", "$src1, $src2",
                         (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1),
-                                      (_Src.LdFrag addr:$src2)))>,
+                                      (bitconvert (_Src.LdFrag addr:$src2))))>,
                          EVEX_4V, EVEX_CD8<_Src.EltSize, CD8VF>,
                          Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
@@ -5118,7 +5068,7 @@ multiclass avx512_logic_rm<bits<8> opc, string OpcodeStr,
                   (_.i64VT (OpNode (bitconvert (_.VT _.RC:$src1)),
                                    (bitconvert (_.LdFrag addr:$src2)))),
                   (_.VT (bitconvert (_.i64VT (OpNodeMsk _.RC:$src1,
-                                     (_.i64LdFrag addr:$src2)))))>,
+                                     (bitconvert (_.LdFrag addr:$src2))))))>,
                   AVX512BIBase, EVEX_4V,
                   Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
@@ -5779,7 +5729,7 @@ multiclass avx512_vptest<bits<8> opc, string OpcodeStr, PatFrag OpNode,
                        "$src2, $src1", "$src1, $src2",
                    (OpNode (bitconvert
                             (_.i64VT (and _.RC:$src1,
-                                          (_.i64LdFrag addr:$src2)))),
+                                          (bitconvert (_.LdFrag addr:$src2))))),
                            _.ImmAllZerosV)>,
                    EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
                    Sched<[sched.Folded, sched.ReadAfterFold]>;
@@ -5943,7 +5893,7 @@ multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM,
   defm mi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst),
                    (ins _.MemOp:$src1, u8imm:$src2), OpcodeStr,
                        "$src2, $src1", "$src1, $src2",
-                   (_.VT (OpNode (_.VT (_.LdFrag addr:$src1)),
+                   (_.VT (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
                           (i8 imm:$src2)))>,
                    Sched<[sched.Folded]>;
   }
@@ -5973,7 +5923,8 @@ multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode,
   defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
                    (ins _.RC:$src1, i128mem:$src2), OpcodeStr,
                        "$src2, $src1", "$src1, $src2",
-                   (_.VT (OpNode _.RC:$src1, (SrcVT (load addr:$src2))))>,
+                   (_.VT (OpNode _.RC:$src1,
+                                 (SrcVT (bitconvert (loadv2i64 addr:$src2)))))>,
                    AVX512BIBase,
                    EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
   }
@@ -6127,7 +6078,7 @@ multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
                    (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
                        "$src2, $src1", "$src1, $src2",
                    (_.VT (OpNode _.RC:$src1,
-                   (_.VT (_.LdFrag addr:$src2))))>,
+                   (_.VT (bitconvert (_.LdFrag addr:$src2)))))>,
                    AVX5128IBase, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
                    Sched<[sched.Folded, sched.ReadAfterFold]>;
   }
@@ -6227,7 +6178,7 @@ multiclass avx512_var_shift_int_lowering<string InstrStr, X86VectorVTInfo _,
     def : Pat<(_.VT (X86vsrav _.RC:$src1, _.RC:$src2)),
               (!cast<Instruction>(InstrStr#_.ZSuffix#rr) _.RC:$src1,
                _.RC:$src2)>;
-    def : Pat<(_.VT (X86vsrav _.RC:$src1, (_.LdFrag addr:$src2))),
+    def : Pat<(_.VT (X86vsrav _.RC:$src1, (bitconvert (_.LdFrag addr:$src2)))),
               (!cast<Instruction>(InstrStr#_.ZSuffix##rm)
                _.RC:$src1, addr:$src2)>;
     def : Pat<(_.VT (vselect _.KRCWM:$mask,
@@ -6235,7 +6186,7 @@ multiclass avx512_var_shift_int_lowering<string InstrStr, X86VectorVTInfo _,
               (!cast<Instruction>(InstrStr#_.ZSuffix#rrk) _.RC:$src0,
                _.KRC:$mask, _.RC:$src1, _.RC:$src2)>;
     def : Pat<(_.VT (vselect _.KRCWM:$mask,
-                     (X86vsrav _.RC:$src1, (_.LdFrag addr:$src2)),
+                     (X86vsrav _.RC:$src1, (bitconvert (_.LdFrag addr:$src2))),
                      _.RC:$src0)),
               (!cast<Instruction>(InstrStr#_.ZSuffix##rmk) _.RC:$src0,
                _.KRC:$mask, _.RC:$src1, addr:$src2)>;
@@ -6244,7 +6195,7 @@ multiclass avx512_var_shift_int_lowering<string InstrStr, X86VectorVTInfo _,
               (!cast<Instruction>(InstrStr#_.ZSuffix#rrkz) _.KRC:$mask,
                _.RC:$src1, _.RC:$src2)>;
     def : Pat<(_.VT (vselect _.KRCWM:$mask,
-                     (X86vsrav _.RC:$src1, (_.LdFrag addr:$src2)),
+                     (X86vsrav _.RC:$src1, (bitconvert (_.LdFrag addr:$src2))),
                      _.ImmAllZerosV)),
               (!cast<Instruction>(InstrStr#_.ZSuffix##rmkz) _.KRC:$mask,
                _.RC:$src1, addr:$src2)>;
@@ -6469,7 +6420,7 @@ multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode,
                   "$src2, $src1", "$src1, $src2",
                   (_.VT (OpNode
                            _.RC:$src1,
-                           (Ctrl.VT (Ctrl.LdFrag addr:$src2))))>,
+                           (Ctrl.VT (bitconvert(Ctrl.LdFrag addr:$src2)))))>,
                   T8PD, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
                   Sched<[sched.Folded, sched.ReadAfterFold]>;
   defm rmb: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst),
@@ -7755,7 +7706,7 @@ multiclass avx512_vcvt_fp<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
   defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
                          (ins MemOp:$src), OpcodeStr#Alias, "$src", "$src",
                          (_.VT (OpNode (_Src.VT
-                             (_Src.LdFrag addr:$src))))>,
+                             (bitconvert (_Src.LdFrag addr:$src)))))>,
                          EVEX, Sched<[sched.Folded]>;
 
   defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
@@ -8462,7 +8413,8 @@ multiclass avx512_cvtph2ps<X86VectorVTInfo _dest, X86VectorVTInfo _src,
   defm rm : AVX512_maskable<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst),
                             (ins x86memop:$src), "vcvtph2ps", "$src", "$src",
                             (X86cvtph2ps (_src.VT
-                                          (ld_frag addr:$src)))>,
+                                          (bitconvert
+                                           (ld_frag addr:$src))))>,
                             T8PD, Sched<[sched.Folded]>;
 }
 
@@ -8477,17 +8429,17 @@ multiclass avx512_cvtph2ps_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src,
 }
 
 let Predicates = [HasAVX512] in
-  defm VCVTPH2PSZ : avx512_cvtph2ps<v16f32_info, v16i16x_info, f256mem, load,
+  defm VCVTPH2PSZ : avx512_cvtph2ps<v16f32_info, v16i16x_info, f256mem, loadv4i64,
                                     WriteCvtPH2PSZ>,
                     avx512_cvtph2ps_sae<v16f32_info, v16i16x_info, WriteCvtPH2PSZ>,
                     EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>;
 
 let Predicates = [HasVLX] in {
   defm VCVTPH2PSZ256 : avx512_cvtph2ps<v8f32x_info, v8i16x_info, f128mem,
-                       load, WriteCvtPH2PSY>, EVEX, EVEX_V256,
+                       loadv2i64, WriteCvtPH2PSY>, EVEX, EVEX_V256,
                        EVEX_CD8<32, CD8VH>;
   defm VCVTPH2PSZ128 : avx512_cvtph2ps<v4f32x_info, v8i16x_info, f64mem,
-                       load, WriteCvtPH2PS>, EVEX, EVEX_V128,
+                       loadv2i64, WriteCvtPH2PS>, EVEX, EVEX_V128,
                        EVEX_CD8<32, CD8VH>;
 
   // Pattern match vcvtph2ps of a scalar i64 load.
@@ -9431,7 +9383,7 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
             (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
   def : Pat<(v8i16 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
-  def : Pat<(v8i16 (InVecOp (loadv16i8 addr:$src))),
+  def : Pat<(v8i16 (InVecOp (bc_v16i8 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
   }
   let Predicates = [HasVLX] in {
@@ -9441,7 +9393,7 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
             (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
   def : Pat<(v4i32 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
-  def : Pat<(v4i32 (InVecOp (loadv16i8 addr:$src))),
+  def : Pat<(v4i32 (InVecOp (bc_v16i8 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
 
   def : Pat<(v2i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
@@ -9450,7 +9402,7 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
             (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
   def : Pat<(v2i64 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
-  def : Pat<(v2i64 (InVecOp (loadv16i8 addr:$src))),
+  def : Pat<(v2i64 (InVecOp (bc_v16i8 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
 
   def : Pat<(v4i32 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
@@ -9461,7 +9413,7 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
             (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
   def : Pat<(v4i32 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
-  def : Pat<(v4i32 (InVecOp (loadv8i16 addr:$src))),
+  def : Pat<(v4i32 (InVecOp (bc_v8i16 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
 
   def : Pat<(v2i64 (InVecOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
@@ -9470,7 +9422,7 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
             (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
   def : Pat<(v2i64 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
-  def : Pat<(v2i64 (InVecOp (loadv8i16 addr:$src))),
+  def : Pat<(v2i64 (InVecOp (bc_v8i16 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
 
   def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
@@ -9481,12 +9433,12 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
             (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
   def : Pat<(v2i64 (InVecOp (v4i32 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
-  def : Pat<(v2i64 (InVecOp (loadv4i32 addr:$src))),
+  def : Pat<(v2i64 (InVecOp (bc_v4i32 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
   }
   // 256-bit patterns
   let Predicates = [HasVLX, HasBWI] in {
-  def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
+  def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BWZ256rm) addr:$src)>;
   def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BWZ256rm) addr:$src)>;
@@ -9500,7 +9452,7 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
             (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
   def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
-  def : Pat<(v8i32 (ExtOp (loadv16i8 addr:$src))),
+  def : Pat<(v8i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
 
   def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
@@ -9509,10 +9461,10 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
             (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
   def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (loadv16i8 addr:$src))),
+  def : Pat<(v4i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
 
-  def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
+  def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WDZ256rm) addr:$src)>;
   def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WDZ256rm) addr:$src)>;
@@ -9525,10 +9477,10 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
             (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
   def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (loadv8i16 addr:$src))),
+  def : Pat<(v4i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
 
-  def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
+  def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#DQZ256rm) addr:$src)>;
   def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#DQZ256rm) addr:$src)>;
@@ -9537,25 +9489,25 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
   }
   // 512-bit patterns
   let Predicates = [HasBWI] in {
-  def : Pat<(v32i16 (ExtOp (loadv32i8 addr:$src))),
+  def : Pat<(v32i16 (ExtOp (bc_v32i8 (loadv4i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BWZrm) addr:$src)>;
   }
   let Predicates = [HasAVX512] in {
-  def : Pat<(v16i32 (ExtOp (loadv16i8 addr:$src))),
+  def : Pat<(v16i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BDZrm) addr:$src)>;
 
   def : Pat<(v8i64 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
             (!cast<I>(OpcPrefix#BQZrm) addr:$src)>;
-  def : Pat<(v8i64 (ExtOp (loadv16i8 addr:$src))),
+  def : Pat<(v8i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BQZrm) addr:$src)>;
 
-  def : Pat<(v16i32 (ExtOp (loadv16i16 addr:$src))),
+  def : Pat<(v16i32 (ExtOp (bc_v16i16 (loadv4i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WDZrm) addr:$src)>;
 
-  def : Pat<(v8i64 (ExtOp (loadv8i16 addr:$src))),
+  def : Pat<(v8i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WQZrm) addr:$src)>;
 
-  def : Pat<(v8i64 (ExtOp (loadv8i32 addr:$src))),
+  def : Pat<(v8i64 (ExtOp (bc_v8i32 (loadv4i64 addr:$src)))),
             (!cast<I>(OpcPrefix#DQZrm) addr:$src)>;
   }
 }
@@ -10460,7 +10412,7 @@ multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr,
                 (_.VT
                  (bitconvert
                   (CastInfo.VT (X86Shuf128 _.RC:$src1,
-                                           (CastInfo.LdFrag addr:$src2),
+                                           (bitconvert (_.LdFrag addr:$src2)),
                                            (i8 imm:$src3)))))>,
                 Sched<[sched.Folded, sched.ReadAfterFold]>,
                 EVEX2VEXOverride<EVEX2VEXOvrd#"rm">;
@@ -10626,7 +10578,7 @@ multiclass avx512_vpalign_mask_lowering<string OpcodeStr, SDNode OpNode,
   def : Pat<(To.VT (vselect To.KRCWM:$mask,
                             (bitconvert
                              (From.VT (OpNode From.RC:$src1,
-                                              (From.LdFrag addr:$src2),
+                                      (bitconvert (To.LdFrag addr:$src2)),
                                       imm:$src3))),
                             To.RC:$src0)),
             (!cast<Instruction>(OpcodeStr#"rmik") To.RC:$src0, To.KRCWM:$mask,
@@ -10636,7 +10588,7 @@ multiclass avx512_vpalign_mask_lowering<string OpcodeStr, SDNode OpNode,
   def : Pat<(To.VT (vselect To.KRCWM:$mask,
                             (bitconvert
                              (From.VT (OpNode From.RC:$src1,
-                                              (From.LdFrag addr:$src2),
+                                      (bitconvert (To.LdFrag addr:$src2)),
                                       imm:$src3))),
                             To.ImmAllZerosV)),
             (!cast<Instruction>(OpcodeStr#"rmikz") To.KRCWM:$mask,
@@ -11780,7 +11732,7 @@ multiclass VBMI2_shift_var_rm<bits<8> Op, string OpStr, SDNode OpNode,
                 (ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr,
                 "$src3, $src2", "$src2, $src3",
                 (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2,
-                        (VTI.VT (VTI.LdFrag addr:$src3))))>,
+                        (VTI.VT (bitconvert (VTI.LdFrag addr:$src3)))))>,
                 AVX512FMA3Base,
                 Sched<[sched.Folded, sched.ReadAfterFold]>;
   }
@@ -11883,7 +11835,8 @@ multiclass VNNI_rmb<bits<8> Op, string OpStr, SDNode OpNode,
                                    (ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr,
                                    "$src3, $src2", "$src2, $src3",
                                    (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2,
-                                            (VTI.VT (VTI.LdFrag addr:$src3))))>,
+                                            (VTI.VT (bitconvert
+                                                     (VTI.LdFrag addr:$src3)))))>,
                                    EVEX_4V, EVEX_CD8<32, CD8VF>, T8PD,
                                    Sched<[sched.Folded, sched.ReadAfterFold]>;
   defm mb :   AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
@@ -11939,7 +11892,7 @@ multiclass VPSHUFBITQMB_rm<X86FoldableSchedWrite sched, X86VectorVTInfo VTI> {
                                 "vpshufbitqmb",
                                 "$src2, $src1", "$src1, $src2",
                                 (X86Vpshufbitqmb (VTI.VT VTI.RC:$src1),
-                                (VTI.VT (VTI.LdFrag addr:$src2)))>,
+                                (VTI.VT (bitconvert (VTI.LdFrag addr:$src2))))>,
                                 EVEX_4V, EVEX_CD8<8, CD8VF>, T8PD,
                                 Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 7e31527a877..f750fe3ee0c 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -648,28 +648,21 @@ def sdmem : Operand<v2f64> {
 //===----------------------------------------------------------------------===//
 
 // 128-bit load pattern fragments
+// NOTE: all 128-bit integer vector loads are promoted to v2i64
 def loadv4f32    : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>;
 def loadv2f64    : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>;
 def loadv2i64    : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>;
-def loadv4i32    : PatFrag<(ops node:$ptr), (v4i32 (load node:$ptr))>;
-def loadv8i16    : PatFrag<(ops node:$ptr), (v8i16 (load node:$ptr))>;
-def loadv16i8    : PatFrag<(ops node:$ptr), (v16i8 (load node:$ptr))>;
 
 // 256-bit load pattern fragments
-def loadv8f32    : PatFrag<(ops node:$ptr), (v8f32  (load node:$ptr))>;
-def loadv4f64    : PatFrag<(ops node:$ptr), (v4f64  (load node:$ptr))>;
-def loadv4i64    : PatFrag<(ops node:$ptr), (v4i64  (load node:$ptr))>;
-def loadv8i32    : PatFrag<(ops node:$ptr), (v8i32  (load node:$ptr))>;
-def loadv16i16   : PatFrag<(ops node:$ptr), (v16i16 (load node:$ptr))>;
-def loadv32i8    : PatFrag<(ops node:$ptr), (v32i8  (load node:$ptr))>;
+// NOTE: all 256-bit integer vector loads are promoted to v4i64
+def loadv8f32    : PatFrag<(ops node:$ptr), (v8f32 (load node:$ptr))>;
+def loadv4f64    : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>;
+def loadv4i64    : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>;
 
 // 512-bit load pattern fragments
 def loadv16f32   : PatFrag<(ops node:$ptr), (v16f32 (load node:$ptr))>;
-def loadv8f64    : PatFrag<(ops node:$ptr), (v8f64  (load node:$ptr))>;
-def loadv8i64    : PatFrag<(ops node:$ptr), (v8i64  (load node:$ptr))>;
-def loadv16i32   : PatFrag<(ops node:$ptr), (v16i32 (load node:$ptr))>;
-def loadv32i16   : PatFrag<(ops node:$ptr), (v32i16 (load node:$ptr))>;
-def loadv64i8    : PatFrag<(ops node:$ptr), (v64i8  (load node:$ptr))>;
+def loadv8f64    : PatFrag<(ops node:$ptr), (v8f64 (load node:$ptr))>;
+def loadv8i64    : PatFrag<(ops node:$ptr), (v8i64 (load node:$ptr))>;
 
 // 128-/256-/512-bit extload pattern fragments
 def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>;
@@ -697,27 +690,15 @@ def alignedloadv2f64 : PatFrag<(ops node:$ptr),
                                (v2f64 (alignedload node:$ptr))>;
 def alignedloadv2i64 : PatFrag<(ops node:$ptr),
                                (v2i64 (alignedload node:$ptr))>;
-def alignedloadv4i32 : PatFrag<(ops node:$ptr),
-                               (v4i32 (alignedload node:$ptr))>;
-def alignedloadv8i16 : PatFrag<(ops node:$ptr),
-                               (v8i16 (alignedload node:$ptr))>;
-def alignedloadv16i8 : PatFrag<(ops node:$ptr),
-                               (v16i8 (alignedload node:$ptr))>;
 
 // 256-bit aligned load pattern fragments
 // NOTE: all 256-bit integer vector loads are promoted to v4i64
-def alignedloadv8f32  : PatFrag<(ops node:$ptr),
-                                (v8f32  (alignedload node:$ptr))>;
-def alignedloadv4f64  : PatFrag<(ops node:$ptr),
-                                (v4f64  (alignedload node:$ptr))>;
-def alignedloadv4i64  : PatFrag<(ops node:$ptr),
-                                (v4i64  (alignedload node:$ptr))>;
-def alignedloadv8i32  : PatFrag<(ops node:$ptr),
-                                (v8i32  (alignedload node:$ptr))>;
-def alignedloadv16i16 : PatFrag<(ops node:$ptr),
-                                (v16i16 (alignedload node:$ptr))>;
-def alignedloadv32i8  : PatFrag<(ops node:$ptr),
-                                (v32i8  (alignedload node:$ptr))>;
+def alignedloadv8f32 : PatFrag<(ops node:$ptr),
+                               (v8f32 (alignedload node:$ptr))>;
+def alignedloadv4f64 : PatFrag<(ops node:$ptr),
+                               (v4f64 (alignedload node:$ptr))>;
+def alignedloadv4i64 : PatFrag<(ops node:$ptr),
+                               (v4i64 (alignedload node:$ptr))>;
 
 // 512-bit aligned load pattern fragments
 def alignedloadv16f32 : PatFrag<(ops node:$ptr),
@@ -726,12 +707,6 @@ def alignedloadv8f64  : PatFrag<(ops node:$ptr),
                                 (v8f64  (alignedload node:$ptr))>;
 def alignedloadv8i64  : PatFrag<(ops node:$ptr),
                                 (v8i64  (alignedload node:$ptr))>;
-def alignedloadv16i32 : PatFrag<(ops node:$ptr),
-                                (v16i32 (alignedload node:$ptr))>;
-def alignedloadv32i16 : PatFrag<(ops node:$ptr),
-                                (v32i16 (alignedload node:$ptr))>;
-def alignedloadv64i8  : PatFrag<(ops node:$ptr),
-                                (v64i8  (alignedload node:$ptr))>;
 
 // Like 'load', but uses special alignment checks suitable for use in
 // memory operands in most SSE instructions, which are required to
@@ -750,9 +725,6 @@ def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{
 def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>;
 def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>;
 def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>;
-def memopv4i32 : PatFrag<(ops node:$ptr), (v4i32 (memop node:$ptr))>;
-def memopv8i16 : PatFrag<(ops node:$ptr), (v8i16 (memop node:$ptr))>;
-def memopv16i8 : PatFrag<(ops node:$ptr), (v16i8 (memop node:$ptr))>;
 
 def X86masked_gather : SDNode<"X86ISD::MGATHER",
                               SDTypeProfile<2, 3, [SDTCisVec<0>,
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 8f97ce37068..ced93f8d253 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -589,21 +589,8 @@ let Predicates = [HasAVX, NoVLX] in {
   // available and changing the domain is beneficial.
   def : Pat<(alignedloadv4i64 addr:$src),
             (VMOVAPSYrm addr:$src)>;
-  def : Pat<(alignedloadv8i32 addr:$src),
-            (VMOVAPSYrm addr:$src)>;
-  def : Pat<(alignedloadv16i16 addr:$src),
-            (VMOVAPSYrm addr:$src)>;
-  def : Pat<(alignedloadv32i8 addr:$src),
-            (VMOVAPSYrm addr:$src)>;
   def : Pat<(loadv4i64 addr:$src),
             (VMOVUPSYrm addr:$src)>;
-  def : Pat<(loadv8i32 addr:$src),
-            (VMOVUPSYrm addr:$src)>;
-  def : Pat<(loadv16i16 addr:$src),
-            (VMOVUPSYrm addr:$src)>;
-  def : Pat<(loadv32i8 addr:$src),
-            (VMOVUPSYrm addr:$src)>;
-
   def : Pat<(alignedstore (v4i64 VR256:$src), addr:$dst),
             (VMOVAPSYmr addr:$dst, VR256:$src)>;
   def : Pat<(alignedstore (v8i32 VR256:$src), addr:$dst),
@@ -628,20 +615,8 @@ let Predicates = [HasAVX, NoVLX] in {
 let Predicates = [UseSSE1] in {
   def : Pat<(alignedloadv2i64 addr:$src),
             (MOVAPSrm addr:$src)>;
-  def : Pat<(alignedloadv4i32 addr:$src),
-            (MOVAPSrm addr:$src)>;
-  def : Pat<(alignedloadv8i16 addr:$src),
-            (MOVAPSrm addr:$src)>;
-  def : Pat<(alignedloadv16i8 addr:$src),
-            (MOVAPSrm addr:$src)>;
   def : Pat<(loadv2i64 addr:$src),
             (MOVUPSrm addr:$src)>;
-  def : Pat<(loadv4i32 addr:$src),
-            (MOVUPSrm addr:$src)>;
-  def : Pat<(loadv8i16 addr:$src),
-            (MOVUPSrm addr:$src)>;
-  def : Pat<(loadv16i8 addr:$src),
-            (MOVUPSrm addr:$src)>;
 
   def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
             (MOVAPSmr addr:$dst, VR128:$src)>;
@@ -866,7 +841,7 @@ let hasSideEffects = 0 in {
   let mayLoad = 1 in
   def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm,
              [(set RC:$dst, (DstTy (sint_to_fp
-                                    (SrcTy (ld_frag addr:$src)))))], d>,
+                                    (SrcTy (bitconvert (ld_frag addr:$src))))))], d>,
              Sched<[sched.Folded]>;
 }
 }
@@ -1129,16 +1104,16 @@ defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
                                  ssmem, sse_load_f32, "cvtss2si",
                                  WriteCvtSS2I>, XS, REX_W;
 
-defm VCVTDQ2PS   : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, load,
+defm VCVTDQ2PS   : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, loadv2i64,
                                "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                                SSEPackedSingle, WriteCvtI2PS>,
                                PS, VEX, Requires<[HasAVX, NoVLX]>, VEX_WIG;
-defm VCVTDQ2PSY  : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, load,
+defm VCVTDQ2PSY  : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, loadv4i64,
                                "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                                SSEPackedSingle, WriteCvtI2PSY>,
                                PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, VEX_WIG;
 
-defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memop,
+defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memopv2i64,
                             "cvtdq2ps\t{$src, $dst|$dst, $src}",
                             SSEPackedSingle, WriteCvtI2PS>,
                             PS, Requires<[UseSSE2]>;
@@ -1697,7 +1672,7 @@ let hasSideEffects = 0, mayLoad = 1 in
 def VCVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                         "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
-                          (v2f64 (X86VSintToFP (loadv4i32 addr:$src))))]>,
+                          (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))]>,
                         VEX, Sched<[WriteCvtI2PDLd]>, VEX_WIG;
 def VCVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "vcvtdq2pd\t{$src, $dst|$dst, $src}",
@@ -1707,7 +1682,7 @@ def VCVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
 def VCVTDQ2PDYrm  : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
                          "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst,
-                           (v4f64 (sint_to_fp (loadv4i32 addr:$src))))]>,
+                           (v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))))]>,
                          VEX, VEX_L, Sched<[WriteCvtI2PDYLd]>,
                          VEX_WIG;
 def VCVTDQ2PDYrr  : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
@@ -1721,7 +1696,7 @@ let hasSideEffects = 0, mayLoad = 1 in
 def CVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                        "cvtdq2pd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
-                         (v2f64 (X86VSintToFP (loadv4i32 addr:$src))))]>,
+                         (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))]>,
                        Sched<[WriteCvtI2PDLd]>;
 def CVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvtdq2pd\t{$src, $dst|$dst, $src}",
@@ -2176,54 +2151,54 @@ multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
 }
 
 let Predicates = [HasAVX, NoVLX] in {
-defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, load,
+defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32,
       VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
-defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, load,
+defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, loadv2f64,
       VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX_4V, VEX_WIG;
-defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, load,
+defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, loadv4f32,
       VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
-defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, load,
+defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, loadv2f64,
       VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
 
-defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, load,
+defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, loadv8f32,
       VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
-defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, load,
+defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, loadv4f64,
       VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
-defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, load,
+defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32,
       VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
-defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, load,
+defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64,
       VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
 }// Predicates = [HasAVX, NoVLX]
 
 let Constraints = "$src1 = $dst" in {
-  defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memop,
+  defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32,
         VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
                        SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
-  defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memop,
+  defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64,
         VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
                        SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
-  defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memop,
+  defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32,
         VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
                        SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
-  defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memop,
+  defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64,
         VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
                        SchedWriteFShuffle.XMM, SSEPackedDouble>, PD;
 } // Constraints = "$src1 = $dst"
 
 let Predicates = [HasAVX1Only] in {
-  def : Pat<(v8i32 (X86Unpckl VR256:$src1, (loadv8i32 addr:$src2))),
+  def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
             (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
   def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
             (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
-  def : Pat<(v8i32 (X86Unpckh VR256:$src1, (loadv8i32 addr:$src2))),
+  def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
             (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
   def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
             (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
@@ -2309,7 +2284,8 @@ multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
        !if(Is2Addr,
            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-       [(set RC:$dst, (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
+       [(set RC:$dst, (OpVT (OpNode RC:$src1,
+                                     (bitconvert (memop_frag addr:$src2)))))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 } // ExeDomain = SSEPackedInt
@@ -2320,16 +2296,16 @@ multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
                          Predicate prd> {
 let Predicates = [HasAVX, prd] in
   defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
-                             VR128, load, i128mem, sched.XMM,
+                             VR128, loadv2i64, i128mem, sched.XMM,
                              IsCommutable, 0>, VEX_4V, VEX_WIG;
 
 let Constraints = "$src1 = $dst" in
   defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
-                           memop, i128mem, sched.XMM, IsCommutable, 1>;
+                           memopv2i64, i128mem, sched.XMM, IsCommutable, 1>;
 
 let Predicates = [HasAVX2, prd] in
   defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
-                               OpVT256, VR256, load, i256mem, sched.YMM,
+                               OpVT256, VR256, loadv4i64, i256mem, sched.YMM,
                                IsCommutable, 0>, VEX_4V, VEX_L, VEX_WIG;
 }
 
@@ -3447,19 +3423,6 @@ def : InstAlias<"movdqu.s\t{$src, $dst|$dst, $src}",
 
 let Predicates = [HasAVX, NoVLX] in {
   // Additional patterns for other integer sizes.
-  def : Pat<(alignedloadv4i32 addr:$src),
-            (VMOVDQArm addr:$src)>;
-  def : Pat<(alignedloadv8i16 addr:$src),
-            (VMOVDQArm addr:$src)>;
-  def : Pat<(alignedloadv16i8 addr:$src),
-            (VMOVDQArm addr:$src)>;
-  def : Pat<(loadv4i32 addr:$src),
-            (VMOVDQUrm addr:$src)>;
-  def : Pat<(loadv8i16 addr:$src),
-            (VMOVDQUrm addr:$src)>;
-  def : Pat<(loadv16i8 addr:$src),
-            (VMOVDQUrm addr:$src)>;
-
   def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
             (VMOVDQAmr addr:$dst, VR128:$src)>;
   def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
@@ -3499,7 +3462,7 @@ multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
        [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
-                                     (memop_frag addr:$src2))))]>,
+                                     (bitconvert (memop_frag addr:$src2)))))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 } // ExeDomain = SSEPackedInt
@@ -3559,28 +3522,28 @@ defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64,
 
 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
 defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
-                              load, i128mem, SchedWriteVecIMul.XMM, 0>,
+                              loadv2i64, i128mem, SchedWriteVecIMul.XMM, 0>,
                               VEX_4V, VEX_WIG;
 
 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
 defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16,
-                               VR256, load, i256mem, SchedWriteVecIMul.YMM,
+                               VR256, loadv4i64, i256mem, SchedWriteVecIMul.YMM,
                                0>, VEX_4V, VEX_L, VEX_WIG;
 let Constraints = "$src1 = $dst" in
 defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
-                             memop, i128mem, SchedWriteVecIMul.XMM>;
+                             memopv2i64, i128mem, SchedWriteVecIMul.XMM>;
 
 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
 defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
-                             load, i128mem, SchedWritePSADBW.XMM, 0>,
+                             loadv2i64, i128mem, SchedWritePSADBW.XMM, 0>,
                              VEX_4V, VEX_WIG;
 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
 defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
-                             load, i256mem, SchedWritePSADBW.YMM, 0>,
+                             loadv4i64, i256mem, SchedWritePSADBW.YMM, 0>,
                              VEX_4V, VEX_L, VEX_WIG;
 let Constraints = "$src1 = $dst" in
 defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
-                            memop, i128mem, SchedWritePSADBW.XMM>;
+                            memopv2i64, i128mem, SchedWritePSADBW.XMM>;
 
 //===---------------------------------------------------------------------===//
 // SSE2 - Packed Integer Logical Instructions
@@ -3607,7 +3570,7 @@ multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
        [(set RC:$dst, (DstVT (OpNode RC:$src1,
-                       (SrcVT (ld_frag addr:$src2)))))]>,
+                       (SrcVT (bitconvert (ld_frag addr:$src2))))))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
   def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
        (ins RC:$src1, u8imm:$src2),
@@ -3627,16 +3590,16 @@ multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm,
 let Predicates = [HasAVX, prd] in
   defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
                               OpNode, OpNode2, VR128, sched.XMM, schedImm.XMM,
-                              DstVT128, SrcVT, load, 0>, VEX_4V, VEX_WIG;
+                              DstVT128, SrcVT, loadv2i64, 0>, VEX_4V, VEX_WIG;
 let Predicates = [HasAVX2, prd] in
   defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
                                 OpNode, OpNode2, VR256, sched.YMM, schedImm.YMM,
-                                DstVT256, SrcVT, load, 0>, VEX_4V, VEX_L,
+                                DstVT256, SrcVT, loadv2i64, 0>, VEX_4V, VEX_L,
                                 VEX_WIG;
 let Constraints = "$src1 = $dst" in
   defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2,
                             VR128, sched.XMM, schedImm.XMM, DstVT128, SrcVT,
-                            memop>;
+                            memopv2i64>;
 }
 
 multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr,
@@ -3736,7 +3699,7 @@ let Predicates = [HasAVX, prd] in {
                       !strconcat("v", OpcodeStr,
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set VR128:$dst,
-                       (vt128 (OpNode (load addr:$src1),
+                       (vt128 (OpNode (bitconvert (loadv2i64 addr:$src1)),
                         (i8 imm:$src2))))]>, VEX,
                   Sched<[sched.XMM.Folded]>, VEX_WIG;
 }
@@ -3754,7 +3717,7 @@ let Predicates = [HasAVX2, prd] in {
                        !strconcat("v", OpcodeStr,
                                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR256:$dst,
-                        (vt256 (OpNode (load addr:$src1),
+                        (vt256 (OpNode (bitconvert (loadv4i64 addr:$src1)),
                          (i8 imm:$src2))))]>, VEX, VEX_L,
                    Sched<[sched.YMM.Folded]>, VEX_WIG;
 }
@@ -3772,7 +3735,7 @@ let Predicates = [UseSSE2] in {
                !strconcat(OpcodeStr,
                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                [(set VR128:$dst,
-                 (vt128 (OpNode (memop addr:$src1),
+                 (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)),
                         (i8 imm:$src2))))]>,
                Sched<[sched.XMM.Folded]>;
 }
@@ -3812,7 +3775,7 @@ multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
                               "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                [(set RC:$dst,
                      (OutVT (OpNode (ArgVT RC:$src1),
-                                    (ld_frag addr:$src2))))]>,
+                                    (bitconvert (ld_frag addr:$src2)))))]>,
                Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
@@ -3837,53 +3800,53 @@ multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                  [(set RC:$dst,
                        (OutVT (OpNode (ArgVT RC:$src1),
-                                      (ld_frag addr:$src2))))]>,
+                                      (bitconvert (ld_frag addr:$src2)))))]>,
                  Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
   defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, VR128,
-                             i128mem, SchedWriteShuffle.XMM, load, 0>,
+                             i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
                              VEX_4V, VEX_WIG;
   defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, VR128,
-                             i128mem, SchedWriteShuffle.XMM, load, 0>,
+                             i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
                              VEX_4V, VEX_WIG;
 
   defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, VR128,
-                             i128mem, SchedWriteShuffle.XMM, load, 0>,
+                             i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
                              VEX_4V, VEX_WIG;
   defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128,
-                             i128mem, SchedWriteShuffle.XMM, load, 0>,
+                             i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
                              VEX_4V;
 }
 
 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
   defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss, VR256,
-                              i256mem, SchedWriteShuffle.YMM, load, 0>,
+                              i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
                               VEX_4V, VEX_L, VEX_WIG;
   defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, VR256,
-                              i256mem, SchedWriteShuffle.YMM, load, 0>,
+                              i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
                               VEX_4V, VEX_L, VEX_WIG;
 
   defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus, VR256,
-                              i256mem, SchedWriteShuffle.YMM, load, 0>,
+                              i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
                               VEX_4V, VEX_L, VEX_WIG;
   defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, VR256,
-                              i256mem, SchedWriteShuffle.YMM, load, 0>,
+                              i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
                               VEX_4V, VEX_L;
 }
 
 let Constraints = "$src1 = $dst" in {
   defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, VR128,
-                            i128mem, SchedWriteShuffle.XMM, memop>;
+                            i128mem, SchedWriteShuffle.XMM, memopv2i64>;
   defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, VR128,
-                            i128mem, SchedWriteShuffle.XMM, memop>;
+                            i128mem, SchedWriteShuffle.XMM, memopv2i64>;
 
   defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, VR128,
-                            i128mem, SchedWriteShuffle.XMM, memop>;
+                            i128mem, SchedWriteShuffle.XMM, memopv2i64>;
 
   defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, VR128,
-                            i128mem, SchedWriteShuffle.XMM, memop>;
+                            i128mem, SchedWriteShuffle.XMM, memopv2i64>;
 }
 } // ExeDomain = SSEPackedInt
 
@@ -3908,88 +3871,89 @@ multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
       !if(Is2Addr,
           !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-      [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
+      [(set RC:$dst, (vt (OpNode RC:$src1,
+                                  (bitconvert (ld_frag addr:$src2)))))]>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
   defm VPUNPCKLBW  : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, VR128,
-                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
+                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
                                  VEX_4V, VEX_WIG;
   defm VPUNPCKLWD  : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, VR128,
-                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
+                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
                                  VEX_4V, VEX_WIG;
   defm VPUNPCKHBW  : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, VR128,
-                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
+                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
                                  VEX_4V, VEX_WIG;
   defm VPUNPCKHWD  : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, VR128,
-                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
+                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
                                  VEX_4V, VEX_WIG;
 }
 
 let Predicates = [HasAVX, NoVLX] in {
   defm VPUNPCKLDQ  : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, VR128,
-                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
+                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
                                  VEX_4V, VEX_WIG;
   defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, VR128,
-                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
+                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
                                  VEX_4V, VEX_WIG;
   defm VPUNPCKHDQ  : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, VR128,
-                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
+                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
                                  VEX_4V, VEX_WIG;
   defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, VR128,
-                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
+                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
                                  VEX_4V, VEX_WIG;
 }
 
 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
   defm VPUNPCKLBWY  : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl, VR256,
-                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
+                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPUNPCKLWDY  : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl, VR256,
-                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
+                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPUNPCKHBWY  : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh, VR256,
-                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
+                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPUNPCKHWDY  : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh, VR256,
-                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
+                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
 }
 
 let Predicates = [HasAVX2, NoVLX] in {
   defm VPUNPCKLDQY  : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl, VR256,
-                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
+                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, VR256,
-                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
+                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPUNPCKHDQY  : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh, VR256,
-                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
+                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPUNPCKHQDQY : sse2_unpack<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, VR256,
-                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
+                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
 }
 
 let Constraints = "$src1 = $dst" in {
   defm PUNPCKLBW  : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, VR128,
-                                i128mem, SchedWriteShuffle.XMM, memop>;
+                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
   defm PUNPCKLWD  : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, VR128,
-                                i128mem, SchedWriteShuffle.XMM, memop>;
+                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
   defm PUNPCKLDQ  : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, VR128,
-                                i128mem, SchedWriteShuffle.XMM, memop>;
+                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
   defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, VR128,
-                                i128mem, SchedWriteShuffle.XMM, memop>;
+                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
 
   defm PUNPCKHBW  : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, VR128,
-                                i128mem, SchedWriteShuffle.XMM, memop>;
+                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
   defm PUNPCKHWD  : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, VR128,
-                                i128mem, SchedWriteShuffle.XMM, memop>;
+                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
   defm PUNPCKHDQ  : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, VR128,
-                                i128mem, SchedWriteShuffle.XMM, memop>;
+                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
   defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, VR128,
-                                i128mem, SchedWriteShuffle.XMM, memop>;
+                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
 }
 } // ExeDomain = SSEPackedInt
 
@@ -4308,7 +4272,7 @@ let Predicates = [UseAVX] in {
             (VMOVDI2PDIrm addr:$src)>;
   def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
             (VMOVDI2PDIrm addr:$src)>;
-  def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
+  def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
             (VMOVDI2PDIrm addr:$src)>;
   def : Pat<(v4i32 (X86vzload addr:$src)),
             (VMOVDI2PDIrm addr:$src)>;
@@ -4333,7 +4297,7 @@ let Predicates = [UseSSE2] in {
             (MOVDI2PDIrm addr:$src)>;
   def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
             (MOVDI2PDIrm addr:$src)>;
-  def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
+  def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
             (MOVDI2PDIrm addr:$src)>;
   def : Pat<(v4i32 (X86vzload addr:$src)),
             (MOVDI2PDIrm addr:$src)>;
@@ -4488,30 +4452,30 @@ defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
 let Predicates = [HasAVX, NoVLX] in {
   def : Pat<(v4i32 (X86Movshdup VR128:$src)),
             (VMOVSHDUPrr VR128:$src)>;
-  def : Pat<(v4i32 (X86Movshdup (load addr:$src))),
+  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (loadv2i64 addr:$src)))),
             (VMOVSHDUPrm addr:$src)>;
   def : Pat<(v4i32 (X86Movsldup VR128:$src)),
             (VMOVSLDUPrr VR128:$src)>;
-  def : Pat<(v4i32 (X86Movsldup (load addr:$src))),
+  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (loadv2i64 addr:$src)))),
             (VMOVSLDUPrm addr:$src)>;
   def : Pat<(v8i32 (X86Movshdup VR256:$src)),
             (VMOVSHDUPYrr VR256:$src)>;
-  def : Pat<(v8i32 (X86Movshdup (load addr:$src))),
+  def : Pat<(v8i32 (X86Movshdup (bc_v8i32 (loadv4i64 addr:$src)))),
             (VMOVSHDUPYrm addr:$src)>;
   def : Pat<(v8i32 (X86Movsldup VR256:$src)),
             (VMOVSLDUPYrr VR256:$src)>;
-  def : Pat<(v8i32 (X86Movsldup (load addr:$src))),
+  def : Pat<(v8i32 (X86Movsldup (bc_v8i32 (loadv4i64 addr:$src)))),
             (VMOVSLDUPYrm addr:$src)>;
 }
 
 let Predicates = [UseSSE3] in {
   def : Pat<(v4i32 (X86Movshdup VR128:$src)),
             (MOVSHDUPrr VR128:$src)>;
-  def : Pat<(v4i32 (X86Movshdup (memop addr:$src))),
+  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))),
             (MOVSHDUPrm addr:$src)>;
   def : Pat<(v4i32 (X86Movsldup VR128:$src)),
             (MOVSLDUPrr VR128:$src)>;
-  def : Pat<(v4i32 (X86Movsldup (memop addr:$src))),
+  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))),
             (MOVSLDUPrm addr:$src)>;
 }
 
@@ -4733,7 +4697,7 @@ multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt,
                  (ins i128mem:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR128:$dst,
-                   (vt (OpNode (ld_frag addr:$src))))]>,
+                   (vt (OpNode (bitconvert (ld_frag addr:$src)))))]>,
                  Sched<[sched.XMM.Folded]>;
 }
 
@@ -4750,19 +4714,19 @@ multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
                   (ins i256mem:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(set VR256:$dst,
-                    (vt (OpNode (load addr:$src))))]>,
+                    (vt (OpNode (bitconvert (loadv4i64 addr:$src)))))]>,
                   Sched<[sched.YMM.Folded]>;
 }
 
 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
   defm VPABSB  : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, SchedWriteVecALU,
-                              load>, VEX, VEX_WIG;
+                              loadv2i64>, VEX, VEX_WIG;
   defm VPABSW  : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, SchedWriteVecALU,
-                              load>, VEX, VEX_WIG;
+                              loadv2i64>, VEX, VEX_WIG;
 }
 let Predicates = [HasAVX, NoVLX] in {
   defm VPABSD  : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, SchedWriteVecALU,
-                              load>, VEX, VEX_WIG;
+                              loadv2i64>, VEX, VEX_WIG;
 }
 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
   defm VPABSB  : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs, SchedWriteVecALU>,
@@ -4776,11 +4740,11 @@ let Predicates = [HasAVX2, NoVLX] in {
 }
 
 defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, SchedWriteVecALU,
-                          memop>;
+                          memopv2i64>;
 defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, SchedWriteVecALU,
-                          memop>;
+                          memopv2i64>;
 defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, SchedWriteVecALU,
-                          memop>;
+                          memopv2i64>;
 
 //===---------------------------------------------------------------------===//
 // SSSE3 - Packed Binary Operator Instructions
@@ -4805,7 +4769,8 @@ multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
        [(set RC:$dst,
-         (DstVT (OpNode (OpVT RC:$src1), (memop_frag addr:$src2))))]>,
+         (DstVT (OpNode (OpVT RC:$src1),
+          (bitconvert (memop_frag addr:$src2)))))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
@@ -4827,7 +4792,8 @@ multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
        [(set VR128:$dst,
-         (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>,
+         (IntId128 VR128:$src1,
+          (bitconvert (ld_frag addr:$src2))))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
@@ -4844,83 +4810,83 @@ multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
        (ins VR256:$src1, i256mem:$src2),
        !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
        [(set VR256:$dst,
-         (IntId256 VR256:$src1, (load addr:$src2)))]>,
+         (IntId256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2))))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
 let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
 let isCommutable = 0 in {
   defm VPSHUFB    : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8,
-                                  VR128, load, i128mem,
+                                  VR128, loadv2i64, i128mem,
                                   SchedWriteVarShuffle.XMM, 0>, VEX_4V, VEX_WIG;
   defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16,
-                                  v16i8, VR128, load, i128mem,
+                                  v16i8, VR128, loadv2i64, i128mem,
                                   SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
 }
 defm VPMULHRSW    : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16,
-                                  VR128, load, i128mem,
+                                  VR128, loadv2i64, i128mem,
                                   SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
 }
 
 let ImmT = NoImm, Predicates = [HasAVX] in {
 let isCommutable = 0 in {
   defm VPHADDW    : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128,
-                                  load, i128mem,
+                                  loadv2i64, i128mem,
                                   SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
   defm VPHADDD    : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128,
-                                  load, i128mem,
+                                  loadv2i64, i128mem,
                                   SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
   defm VPHSUBW    : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128,
-                                  load, i128mem,
+                                  loadv2i64, i128mem,
                                   SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
   defm VPHSUBD    : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128,
-                                  load, i128mem,
+                                  loadv2i64, i128mem,
                                   SchedWritePHAdd.XMM, 0>, VEX_4V;
   defm VPSIGNB    : SS3I_binop_rm_int<0x08, "vpsignb",
                                       int_x86_ssse3_psign_b_128,
-                                      SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
+                                      SchedWriteVecALU.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG;
   defm VPSIGNW    : SS3I_binop_rm_int<0x09, "vpsignw",
                                       int_x86_ssse3_psign_w_128,
-                                      SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
+                                      SchedWriteVecALU.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG;
   defm VPSIGND    : SS3I_binop_rm_int<0x0A, "vpsignd",
                                       int_x86_ssse3_psign_d_128,
-                                      SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
+                                      SchedWriteVecALU.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG;
   defm VPHADDSW   : SS3I_binop_rm_int<0x03, "vphaddsw",
                                       int_x86_ssse3_phadd_sw_128,
-                                      SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
+                                      SchedWritePHAdd.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG;
   defm VPHSUBSW   : SS3I_binop_rm_int<0x07, "vphsubsw",
                                       int_x86_ssse3_phsub_sw_128,
-                                      SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
+                                      SchedWritePHAdd.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG;
 }
 }
 
 let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
 let isCommutable = 0 in {
   defm VPSHUFBY   : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8,
-                                  VR256, load, i256mem,
+                                  VR256, loadv4i64, i256mem,
                                   SchedWriteVarShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
   defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16,
-                                   v32i8, VR256, load, i256mem,
+                                   v32i8, VR256, loadv4i64, i256mem,
                                    SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
 }
 defm VPMULHRSWY   : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16,
-                                  VR256, load, i256mem,
+                                  VR256, loadv4i64, i256mem,
                                   SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
 }
 
 let ImmT = NoImm, Predicates = [HasAVX2] in {
 let isCommutable = 0 in {
   defm VPHADDWY   : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16,
-                                  VR256, load, i256mem,
+                                  VR256, loadv4i64, i256mem,
                                   SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
   defm VPHADDDY   : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256,
-                                  load, i256mem,
+                                  loadv4i64, i256mem,
                                   SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
   defm VPHSUBWY   : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16,
-                                  VR256, load, i256mem,
+                                  VR256, loadv4i64, i256mem,
                                   SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
   defm VPHSUBDY   : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256,
-                                  load, i256mem,
+                                  loadv4i64, i256mem,
                                   SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L;
   defm VPSIGNB   : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
                                        SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
@@ -4941,33 +4907,33 @@ let isCommutable = 0 in {
 let ImmT = NoImm, Constraints = "$src1 = $dst" in {
 let isCommutable = 0 in {
   defm PHADDW    : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128,
-                                 memop, i128mem, SchedWritePHAdd.XMM>;
+                                 memopv2i64, i128mem, SchedWritePHAdd.XMM>;
   defm PHADDD    : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128,
-                                 memop, i128mem, SchedWritePHAdd.XMM>;
+                                 memopv2i64, i128mem, SchedWritePHAdd.XMM>;
   defm PHSUBW    : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128,
-                                 memop, i128mem, SchedWritePHAdd.XMM>;
+                                 memopv2i64, i128mem, SchedWritePHAdd.XMM>;
   defm PHSUBD    : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128,
-                                 memop, i128mem, SchedWritePHAdd.XMM>;
+                                 memopv2i64, i128mem, SchedWritePHAdd.XMM>;
   defm PSIGNB    : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128,
-                                     SchedWriteVecALU.XMM, memop>;
+                                     SchedWriteVecALU.XMM, memopv2i64>;
   defm PSIGNW    : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128,
-                                     SchedWriteVecALU.XMM, memop>;
+                                     SchedWriteVecALU.XMM, memopv2i64>;
   defm PSIGND    : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128,
-                                     SchedWriteVecALU.XMM, memop>;
+                                     SchedWriteVecALU.XMM, memopv2i64>;
   defm PSHUFB    : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
-                                 memop, i128mem, SchedWriteVarShuffle.XMM>;
+                                 memopv2i64, i128mem, SchedWriteVarShuffle.XMM>;
   defm PHADDSW   : SS3I_binop_rm_int<0x03, "phaddsw",
                                      int_x86_ssse3_phadd_sw_128,
-                                     SchedWritePHAdd.XMM, memop>;
+                                     SchedWritePHAdd.XMM, memopv2i64>;
   defm PHSUBSW   : SS3I_binop_rm_int<0x07, "phsubsw",
                                      int_x86_ssse3_phsub_sw_128,
-                                     SchedWritePHAdd.XMM, memop>;
+                                     SchedWritePHAdd.XMM, memopv2i64>;
   defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
-                                 v16i8, VR128, memop, i128mem,
+                                 v16i8, VR128, memopv2i64, i128mem,
                                  SchedWriteVecIMul.XMM>;
 }
 defm PMULHRSW    : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
-                                 VR128, memop, i128mem, SchedWriteVecIMul.XMM>;
+                                 VR128, memopv2i64, i128mem, SchedWriteVecIMul.XMM>;
 }
 
 //===---------------------------------------------------------------------===//
@@ -4994,20 +4960,20 @@ multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
         !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
       [(set RC:$dst, (VT (X86PAlignr RC:$src1,
-                                     (memop_frag addr:$src2),
+                                     (bitconvert (memop_frag addr:$src2)),
                                      (i8 imm:$src3))))]>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
   }
 }
 
 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
-  defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, load, i128mem,
+  defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, loadv2i64, i128mem,
                                 SchedWriteShuffle.XMM, 0>, VEX_4V, VEX_WIG;
 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
-  defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, load, i256mem,
+  defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, loadv4i64, i256mem,
                                  SchedWriteShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
 let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
-  defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memop, i128mem,
+  defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memopv2i64, i128mem,
                                SchedWriteShuffle.XMM>;
 
 //===---------------------------------------------------------------------===//
@@ -5131,7 +5097,7 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtO
 
   // AVX2 Register-Memory patterns
   let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
-  def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
+  def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
   def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
@@ -5145,7 +5111,7 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtO
             (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
   def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
-  def : Pat<(v8i32 (ExtOp (loadv16i8 addr:$src))),
+  def : Pat<(v8i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
 
   def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
@@ -5154,10 +5120,10 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtO
             (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
   def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (loadv16i8 addr:$src))),
+  def : Pat<(v4i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
 
-  def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
+  def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
   def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
@@ -5170,10 +5136,10 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtO
             (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
   def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (loadv8i16 addr:$src))),
+  def : Pat<(v4i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
 
-  def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
+  def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
   def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
@@ -5233,7 +5199,7 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
             (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
   def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
-  def : Pat<(v8i16 (ExtOp (loadv16i8 addr:$src))),
+  def : Pat<(v8i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
   }
   let Predicates = [HasAVX, NoVLX] in {
@@ -5243,7 +5209,7 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
             (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
   def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
-  def : Pat<(v4i32 (ExtOp (loadv16i8 addr:$src))),
+  def : Pat<(v4i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
 
   def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
@@ -5252,7 +5218,7 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
             (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
   def : Pat<(v2i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
-  def : Pat<(v2i64 (ExtOp (loadv16i8 addr:$src))),
+  def : Pat<(v2i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
 
   def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
@@ -5263,7 +5229,7 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
             (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
   def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
-  def : Pat<(v4i32 (ExtOp (loadv8i16 addr:$src))),
+  def : Pat<(v4i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
 
   def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
@@ -5272,7 +5238,7 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
             (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
   def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
-  def : Pat<(v2i64 (ExtOp (loadv8i16 addr:$src))),
+  def : Pat<(v2i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
 
   def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
@@ -5283,7 +5249,7 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
             (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
   def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
-  def : Pat<(v2i64 (ExtOp (loadv4i32 addr:$src))),
+  def : Pat<(v2i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
   }
 }
@@ -6101,7 +6067,7 @@ multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
                   (ins i128mem:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(set VR128:$dst,
-                    (v8i16 (OpNode (ld_frag addr:$src))))]>,
+                    (v8i16 (OpNode (v8i16 (bitconvert (ld_frag addr:$src))))))]>,
                  Sched<[Sched.Folded]>;
 }
 
@@ -6109,10 +6075,10 @@ multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
 // model, although the naming is misleading.
 let Predicates = [HasAVX] in
 defm VPHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "vphminposuw",
-                                         X86phminpos, load,
+                                         X86phminpos, loadv2i64,
                                          WritePHMINPOS>, VEX, VEX_WIG;
 defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw",
-                                         X86phminpos, memop,
+                                         X86phminpos, memopv2i64,
                                          WritePHMINPOS>;
 
 /// SS48I_binop_rm - Simple SSE41 binary operator.
@@ -6134,118 +6100,118 @@ multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
        [(set RC:$dst,
-         (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
+         (OpVT (OpNode RC:$src1, (bitconvert (memop_frag addr:$src2)))))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
 let Predicates = [HasAVX, NoVLX] in {
   defm VPMINSD   : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
-                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
+                                  loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
                                   VEX_4V, VEX_WIG;
   defm VPMINUD   : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
-                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
+                                  loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
                                   VEX_4V, VEX_WIG;
   defm VPMAXSD   : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
-                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
+                                  loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
                                   VEX_4V, VEX_WIG;
   defm VPMAXUD   : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
-                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
+                                  loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
                                   VEX_4V, VEX_WIG;
   defm VPMULDQ   : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128,
-                                  load, i128mem, SchedWriteVecIMul.XMM, 0>,
+                                  loadv2i64, i128mem, SchedWriteVecIMul.XMM, 0>,
                                   VEX_4V, VEX_WIG;
 }
 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
   defm VPMINSB   : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
-                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
+                                  loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
                                   VEX_4V, VEX_WIG;
   defm VPMINUW   : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
-                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
+                                  loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
                                   VEX_4V, VEX_WIG;
   defm VPMAXSB   : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
-                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
+                                  loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
                                   VEX_4V, VEX_WIG;
   defm VPMAXUW   : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
-                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
+                                  loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
                                   VEX_4V, VEX_WIG;
 }
 
 let Predicates = [HasAVX2, NoVLX] in {
   defm VPMINSDY  : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
-                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
+                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPMINUDY  : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
-                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
+                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPMAXSDY  : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
-                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
+                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPMAXUDY  : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
-                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
+                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPMULDQY  : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256,
-                                  load, i256mem, SchedWriteVecIMul.YMM, 0>,
+                                  loadv4i64, i256mem, SchedWriteVecIMul.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
 }
 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
   defm VPMINSBY  : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
-                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
+                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPMINUWY  : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
-                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
+                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPMAXSBY  : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
-                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
+                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPMAXUWY  : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
-                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
+                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
 }
 
 let Constraints = "$src1 = $dst" in {
   defm PMINSB   : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128,
-                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
+                                 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
   defm PMINSD   : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128,
-                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
+                                 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
   defm PMINUD   : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128,
-                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
+                                 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
   defm PMINUW   : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128,
-                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
+                                 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
   defm PMAXSB   : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128,
-                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
+                                 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
   defm PMAXSD   : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128,
-                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
+                                 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
   defm PMAXUD   : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128,
-                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
+                                 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
   defm PMAXUW   : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128,
-                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
+                                 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
   defm PMULDQ   : SS48I_binop_rm<0x28, "pmuldq", X86pmuldq, v2i64, VR128,
-                                 memop, i128mem, SchedWriteVecIMul.XMM, 1>;
+                                 memopv2i64, i128mem, SchedWriteVecIMul.XMM, 1>;
 }
 
 let Predicates = [HasAVX, NoVLX] in
   defm VPMULLD  : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
-                                 load, i128mem, SchedWritePMULLD.XMM, 0>,
+                                 loadv2i64, i128mem, SchedWritePMULLD.XMM, 0>,
                                  VEX_4V, VEX_WIG;
 let Predicates = [HasAVX] in
   defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
-                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
+                                 loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
                                  VEX_4V, VEX_WIG;
 
 let Predicates = [HasAVX2, NoVLX] in
   defm VPMULLDY  : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
-                                  load, i256mem, SchedWritePMULLD.YMM, 0>,
+                                  loadv4i64, i256mem, SchedWritePMULLD.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
 let Predicates = [HasAVX2] in
   defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
-                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
+                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
 
 let Constraints = "$src1 = $dst" in {
   defm PMULLD  : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
-                                memop, i128mem, SchedWritePMULLD.XMM, 1>;
+                                memopv2i64, i128mem, SchedWritePMULLD.XMM, 1>;
   defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
-                                memop, i128mem, SchedWriteVecALU.XMM, 1>;
+                                memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
 }
 
 /// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
@@ -6271,7 +6237,8 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
             !strconcat(OpcodeStr,
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
         [(set RC:$dst,
-          (IntId RC:$src1, (memop_frag addr:$src2), imm:$src3))]>,
+          (IntId RC:$src1,
+           (bitconvert (memop_frag addr:$src2)), imm:$src3))]>,
         Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
@@ -6298,7 +6265,8 @@ multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
             !strconcat(OpcodeStr,
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
         [(set RC:$dst,
-          (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), imm:$src3)))]>,
+          (OpVT (OpNode RC:$src1,
+                 (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>,
         Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
@@ -6320,28 +6288,28 @@ def BlendCommuteImm8 : SDNodeXForm<imm, [{
 let Predicates = [HasAVX] in {
   let isCommutable = 0 in {
     defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
-                                        VR128, load, i128mem, 0,
+                                        VR128, loadv2i64, i128mem, 0,
                                         SchedWriteMPSAD.XMM>, VEX_4V, VEX_WIG;
   }
 
   let ExeDomain = SSEPackedSingle in
   defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
-                                   VR128, load, f128mem, 0,
+                                   VR128, loadv4f32, f128mem, 0,
                                    SchedWriteDPPS.XMM>, VEX_4V, VEX_WIG;
   let ExeDomain = SSEPackedDouble in
   defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
-                                   VR128, load, f128mem, 0,
+                                   VR128, loadv2f64, f128mem, 0,
                                    SchedWriteDPPD.XMM>, VEX_4V, VEX_WIG;
   let ExeDomain = SSEPackedSingle in
   defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
-                                    VR256, load, i256mem, 0,
+                                    VR256, loadv8f32, i256mem, 0,
                                     SchedWriteDPPS.YMM>, VEX_4V, VEX_L, VEX_WIG;
 }
 
 let Predicates = [HasAVX2] in {
   let isCommutable = 0 in {
   defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
-                                  VR256, load, i256mem, 0,
+                                  VR256, loadv4i64, i256mem, 0,
                                   SchedWriteMPSAD.YMM>, VEX_4V, VEX_L, VEX_WIG;
   }
 }
@@ -6349,17 +6317,17 @@ let Predicates = [HasAVX2] in {
 let Constraints = "$src1 = $dst" in {
   let isCommutable = 0 in {
   defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
-                                     VR128, memop, i128mem, 1,
+                                     VR128, memopv2i64, i128mem, 1,
                                      SchedWriteMPSAD.XMM>;
   }
 
   let ExeDomain = SSEPackedSingle in
   defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
-                                  VR128, memop, f128mem, 1,
+                                  VR128, memopv4f32, f128mem, 1,
                                   SchedWriteDPPS.XMM>;
   let ExeDomain = SSEPackedDouble in
   defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
-                                  VR128, memop, f128mem, 1,
+                                  VR128, memopv2f64, f128mem, 1,
                                   SchedWriteDPPD.XMM>;
 }
 
@@ -6387,54 +6355,56 @@ let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
             !strconcat(OpcodeStr,
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
         [(set RC:$dst,
-          (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), imm:$src3)))]>,
+          (OpVT (OpNode RC:$src1,
+                 (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>,
         Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
   // Pattern to commute if load is in first source.
-  def : Pat<(OpVT (OpNode (memop_frag addr:$src2), RC:$src1, imm:$src3)),
+  def : Pat<(OpVT (OpNode (bitconvert (memop_frag addr:$src2)),
+                          RC:$src1, imm:$src3)),
             (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
                                             (commuteXForm imm:$src3))>;
 }
 
 let Predicates = [HasAVX] in {
   defm VBLENDPS : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32,
-                                  VR128, load, f128mem, 0, SSEPackedSingle,
+                                  VR128, loadv4f32, f128mem, 0, SSEPackedSingle,
                                   SchedWriteFBlend.XMM, BlendCommuteImm4>,
                                   VEX_4V, VEX_WIG;
   defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32,
-                                   VR256, load, f256mem, 0, SSEPackedSingle,
+                                   VR256, loadv8f32, f256mem, 0, SSEPackedSingle,
                                    SchedWriteFBlend.YMM, BlendCommuteImm8>,
                                    VEX_4V, VEX_L, VEX_WIG;
   defm VBLENDPD : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
-                                  VR128, load, f128mem, 0, SSEPackedDouble,
+                                  VR128, loadv2f64, f128mem, 0, SSEPackedDouble,
                                   SchedWriteFBlend.XMM, BlendCommuteImm2>,
                                   VEX_4V, VEX_WIG;
   defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
-                                   VR256, load, f256mem, 0, SSEPackedDouble,
+                                   VR256, loadv4f64, f256mem, 0, SSEPackedDouble,
                                    SchedWriteFBlend.YMM, BlendCommuteImm4>,
                                    VEX_4V, VEX_L, VEX_WIG;
   defm VPBLENDW : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
-                                  VR128, load, i128mem, 0, SSEPackedInt,
+                                  VR128, loadv2i64, i128mem, 0, SSEPackedInt,
                                   SchedWriteBlend.XMM, BlendCommuteImm8>,
                                   VEX_4V, VEX_WIG;
 }
 
 let Predicates = [HasAVX2] in {
   defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
-                                   VR256, load, i256mem, 0, SSEPackedInt,
+                                   VR256, loadv4i64, i256mem, 0, SSEPackedInt,
                                    SchedWriteBlend.YMM, BlendCommuteImm8>,
                                    VEX_4V, VEX_L, VEX_WIG;
 }
 
 defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32,
-                               VR128, memop, f128mem, 1, SSEPackedSingle,
+                               VR128, memopv4f32, f128mem, 1, SSEPackedSingle,
                                SchedWriteFBlend.XMM, BlendCommuteImm4>;
 defm BLENDPD : SS41I_blend_rmi<0x0D, "blendpd", X86Blendi, v2f64,
-                               VR128, memop, f128mem, 1, SSEPackedDouble,
+                               VR128, memopv2f64, f128mem, 1, SSEPackedDouble,
                                SchedWriteFBlend.XMM, BlendCommuteImm2>;
 defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16,
-                               VR128, memop, i128mem, 1, SSEPackedInt,
+                               VR128, memopv2i64, i128mem, 1, SSEPackedInt,
                                SchedWriteBlend.XMM, BlendCommuteImm8>;
 
 // For insertion into the zero index (low half) of a 256-bit vector, it is
@@ -6468,7 +6438,7 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
                   !strconcat(OpcodeStr,
                     "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                   [(set RC:$dst,
-                        (IntId RC:$src1, (mem_frag addr:$src2),
+                        (IntId RC:$src1, (bitconvert (mem_frag addr:$src2)),
                                RC:$src3))], SSEPackedInt>, TAPD, VEX_4V,
                 Sched<[sched.Folded, sched.ReadAfterFold,
                        // x86memop:$src2
@@ -6481,7 +6451,7 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
 let Predicates = [HasAVX] in {
 let ExeDomain = SSEPackedDouble in {
 defm VBLENDVPD  : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem,
-                                           load, int_x86_sse41_blendvpd,
+                                           loadv2f64, int_x86_sse41_blendvpd,
                                            SchedWriteFVarBlend.XMM>;
 defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
                                   loadv4f64, int_x86_avx_blendv_pd_256,
@@ -6489,20 +6459,20 @@ defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
 } // ExeDomain = SSEPackedDouble
 let ExeDomain = SSEPackedSingle in {
 defm VBLENDVPS  : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem,
-                                           load, int_x86_sse41_blendvps,
+                                           loadv4f32, int_x86_sse41_blendvps,
                                            SchedWriteFVarBlend.XMM>;
 defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem,
                                   loadv8f32, int_x86_avx_blendv_ps_256,
                                   SchedWriteFVarBlend.YMM>, VEX_L;
 } // ExeDomain = SSEPackedSingle
 defm VPBLENDVB  : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem,
-                                           load, int_x86_sse41_pblendvb,
+                                           loadv2i64, int_x86_sse41_pblendvb,
                                            SchedWriteVarBlend.XMM>;
 }
 
 let Predicates = [HasAVX2] in {
 defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem,
-                                      load, int_x86_avx2_pblendvb,
+                                      loadv4i64, int_x86_avx2_pblendvb,
                                       SchedWriteVarBlend.YMM>, VEX_L;
 }
 
@@ -6633,18 +6603,18 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in {
                      "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
                     [(set VR128:$dst,
                       (IntId VR128:$src1,
-                       (mem_frag addr:$src2), XMM0))]>,
+                       (bitconvert (mem_frag addr:$src2)), XMM0))]>,
                     Sched<[sched.Folded, sched.ReadAfterFold]>;
   }
 }
 
 let ExeDomain = SSEPackedDouble in
-defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memop, f128mem,
+defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, f128mem,
                                   int_x86_sse41_blendvpd, SchedWriteFVarBlend.XMM>;
 let ExeDomain = SSEPackedSingle in
-defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memop, f128mem,
+defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, f128mem,
                                   int_x86_sse41_blendvps, SchedWriteFVarBlend.XMM>;
-defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memop, i128mem,
+defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem,
                                   int_x86_sse41_pblendvb, SchedWriteVarBlend.XMM>;
 
 // Aliases with the implicit xmm0 argument
@@ -6700,12 +6670,6 @@ let Predicates = [HasAVX2, NoVLX] in {
             (VMOVNTDQAYrm addr:$src)>;
   def : Pat<(v4i64 (alignednontemporalload addr:$src)),
             (VMOVNTDQAYrm addr:$src)>;
-  def : Pat<(v8i32 (alignednontemporalload addr:$src)),
-            (VMOVNTDQAYrm addr:$src)>;
-  def : Pat<(v16i16 (alignednontemporalload addr:$src)),
-            (VMOVNTDQAYrm addr:$src)>;
-  def : Pat<(v32i8 (alignednontemporalload addr:$src)),
-            (VMOVNTDQAYrm addr:$src)>;
 }
 
 let Predicates = [HasAVX, NoVLX] in {
@@ -6715,12 +6679,6 @@ let Predicates = [HasAVX, NoVLX] in {
             (VMOVNTDQArm addr:$src)>;
   def : Pat<(v2i64 (alignednontemporalload addr:$src)),
             (VMOVNTDQArm addr:$src)>;
-  def : Pat<(v4i32 (alignednontemporalload addr:$src)),
-            (VMOVNTDQArm addr:$src)>;
-  def : Pat<(v8i16 (alignednontemporalload addr:$src)),
-            (VMOVNTDQArm addr:$src)>;
-  def : Pat<(v16i8 (alignednontemporalload addr:$src)),
-            (VMOVNTDQArm addr:$src)>;
 }
 
 let Predicates = [UseSSE41] in {
@@ -6730,12 +6688,6 @@ let Predicates = [UseSSE41] in {
             (MOVNTDQArm addr:$src)>;
   def : Pat<(v2i64 (alignednontemporalload addr:$src)),
             (MOVNTDQArm addr:$src)>;
-  def : Pat<(v4i32 (alignednontemporalload addr:$src)),
-            (MOVNTDQArm addr:$src)>;
-  def : Pat<(v8i16 (alignednontemporalload addr:$src)),
-            (MOVNTDQArm addr:$src)>;
-  def : Pat<(v16i8 (alignednontemporalload addr:$src)),
-            (MOVNTDQArm addr:$src)>;
 }
 
 } // AddedComplexity
@@ -6768,17 +6720,17 @@ multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
 
 let Predicates = [HasAVX] in
   defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
-                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
+                                 loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
                                  VEX_4V, VEX_WIG;
 
 let Predicates = [HasAVX2] in
   defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
-                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
+                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
 
 let Constraints = "$src1 = $dst" in
   defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
-                                memop, i128mem, SchedWriteVecALU.XMM>;
+                                memopv2i64, i128mem, SchedWriteVecALU.XMM>;
 
 //===----------------------------------------------------------------------===//
 // SSE4.2 - String/text Processing Instructions
@@ -6929,9 +6881,9 @@ multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
              [!if(UsesXMM0,
                   (set VR128:$dst, (IntId VR128:$src1,
-                    (memop addr:$src2), XMM0)),
+                    (bc_v4i32 (memopv2i64 addr:$src2)), XMM0)),
                   (set VR128:$dst, (IntId VR128:$src1,
-                    (memop addr:$src2))))]>, T8,
+                    (bc_v4i32 (memopv2i64 addr:$src2)))))]>, T8,
              Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
@@ -6948,7 +6900,7 @@ let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
                          "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                          [(set VR128:$dst,
                            (int_x86_sha1rnds4 VR128:$src1,
-                            (memop addr:$src2),
+                            (bc_v4i32 (memopv2i64 addr:$src2)),
                             (i8 imm:$src3)))]>, TA,
                          Sched<[SchedWriteVecIMul.XMM.Folded,
                                 SchedWriteVecIMul.XMM.ReadAfterFold]>;
@@ -7001,39 +6953,39 @@ multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
 // Perform One Round of an AES Encryption/Decryption Flow
 let Predicates = [HasAVX, NoVLX_Or_NoVAES, HasAES] in {
   defm VAESENC          : AESI_binop_rm_int<0xDC, "vaesenc",
-                         int_x86_aesni_aesenc, load>, VEX_4V, VEX_WIG;
+                         int_x86_aesni_aesenc, loadv2i64>, VEX_4V, VEX_WIG;
   defm VAESENCLAST      : AESI_binop_rm_int<0xDD, "vaesenclast",
-                         int_x86_aesni_aesenclast, load>, VEX_4V, VEX_WIG;
+                         int_x86_aesni_aesenclast, loadv2i64>, VEX_4V, VEX_WIG;
   defm VAESDEC          : AESI_binop_rm_int<0xDE, "vaesdec",
-                         int_x86_aesni_aesdec, load>, VEX_4V, VEX_WIG;
+                         int_x86_aesni_aesdec, loadv2i64>, VEX_4V, VEX_WIG;
   defm VAESDECLAST      : AESI_binop_rm_int<0xDF, "vaesdeclast",
-                         int_x86_aesni_aesdeclast, load>, VEX_4V, VEX_WIG;
+                         int_x86_aesni_aesdeclast, loadv2i64>, VEX_4V, VEX_WIG;
 }
 
 let Predicates = [NoVLX, HasVAES] in {
   defm VAESENCY         : AESI_binop_rm_int<0xDC, "vaesenc",
-                         int_x86_aesni_aesenc_256, load, 0, VR256,
+                         int_x86_aesni_aesenc_256, loadv4i64, 0, VR256,
                          i256mem>, VEX_4V, VEX_L, VEX_WIG;
   defm VAESENCLASTY     : AESI_binop_rm_int<0xDD, "vaesenclast",
-                         int_x86_aesni_aesenclast_256, load, 0, VR256,
+                         int_x86_aesni_aesenclast_256, loadv4i64, 0, VR256,
                          i256mem>, VEX_4V, VEX_L, VEX_WIG;
   defm VAESDECY         : AESI_binop_rm_int<0xDE, "vaesdec",
-                         int_x86_aesni_aesdec_256, load, 0, VR256,
+                         int_x86_aesni_aesdec_256, loadv4i64, 0, VR256,
                          i256mem>, VEX_4V, VEX_L, VEX_WIG;
   defm VAESDECLASTY     : AESI_binop_rm_int<0xDF, "vaesdeclast",
-                         int_x86_aesni_aesdeclast_256, load, 0, VR256,
+                         int_x86_aesni_aesdeclast_256, loadv4i64, 0, VR256,
                          i256mem>, VEX_4V, VEX_L, VEX_WIG;
 }
 
 let Constraints = "$src1 = $dst" in {
   defm AESENC          : AESI_binop_rm_int<0xDC, "aesenc",
-                         int_x86_aesni_aesenc, memop, 1>;
+                         int_x86_aesni_aesenc, memopv2i64, 1>;
   defm AESENCLAST      : AESI_binop_rm_int<0xDD, "aesenclast",
-                         int_x86_aesni_aesenclast, memop, 1>;
+                         int_x86_aesni_aesenclast, memopv2i64, 1>;
   defm AESDEC          : AESI_binop_rm_int<0xDE, "aesdec",
-                         int_x86_aesni_aesdec, memop, 1>;
+                         int_x86_aesni_aesdec, memopv2i64, 1>;
   defm AESDECLAST      : AESI_binop_rm_int<0xDF, "aesdeclast",
-                         int_x86_aesni_aesdeclast, memop, 1>;
+                         int_x86_aesni_aesdeclast, memopv2i64, 1>;
 }
 
 // Perform the AES InvMixColumn Transformation
@@ -7047,7 +6999,7 @@ let Predicates = [HasAVX, HasAES] in {
   def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
       (ins i128mem:$src1),
       "vaesimc\t{$src1, $dst|$dst, $src1}",
-      [(set VR128:$dst, (int_x86_aesni_aesimc (load addr:$src1)))]>,
+      [(set VR128:$dst, (int_x86_aesni_aesimc (loadv2i64 addr:$src1)))]>,
       Sched<[WriteAESIMC.Folded]>, VEX, VEX_WIG;
 }
 def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
@@ -7058,7 +7010,7 @@ def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
 def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
   (ins i128mem:$src1),
   "aesimc\t{$src1, $dst|$dst, $src1}",
-  [(set VR128:$dst, (int_x86_aesni_aesimc (memop addr:$src1)))]>,
+  [(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>,
   Sched<[WriteAESIMC.Folded]>;
 
 // AES Round Key Generation Assist
@@ -7073,7 +7025,7 @@ let Predicates = [HasAVX, HasAES] in {
       (ins i128mem:$src1, u8imm:$src2),
       "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
       [(set VR128:$dst,
-        (int_x86_aesni_aeskeygenassist (load addr:$src1), imm:$src2))]>,
+        (int_x86_aesni_aeskeygenassist (loadv2i64 addr:$src1), imm:$src2))]>,
       Sched<[WriteAESKeyGen.Folded]>, VEX, VEX_WIG;
 }
 def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
@@ -7086,7 +7038,7 @@ def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
   (ins i128mem:$src1, u8imm:$src2),
   "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
   [(set VR128:$dst,
-    (int_x86_aesni_aeskeygenassist (memop addr:$src1), imm:$src2))]>,
+    (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>,
   Sched<[WriteAESKeyGen.Folded]>;
 
 //===----------------------------------------------------------------------===//
@@ -7114,12 +7066,12 @@ let Predicates = [NoAVX, HasPCLMUL] in {
               (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
               "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
               [(set VR128:$dst,
-                 (int_x86_pclmulqdq VR128:$src1, (memop addr:$src2),
+                 (int_x86_pclmulqdq VR128:$src1, (memopv2i64 addr:$src2),
                   imm:$src3))]>,
               Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
   } // Constraints = "$src1 = $dst"
 
-  def : Pat<(int_x86_pclmulqdq (memop addr:$src2), VR128:$src1,
+  def : Pat<(int_x86_pclmulqdq (memopv2i64 addr:$src2), VR128:$src1,
                                 (i8 imm:$src3)),
             (PCLMULQDQrm VR128:$src1, addr:$src2,
                           (PCLMULCommuteImm imm:$src3))>;
@@ -7162,11 +7114,11 @@ multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp,
 }
 
 let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in
-defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, load,
+defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, loadv2i64,
                              int_x86_pclmulqdq>, VEX_4V, VEX_WIG;
 
 let Predicates = [NoVLX, HasVPCLMULQDQ] in
-defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, load,
+defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, loadv4i64,
                               int_x86_pclmulqdq_256>, VEX_4V, VEX_L, VEX_WIG;
 
 multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC,
@@ -7322,11 +7274,11 @@ def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
 let Predicates = [HasAVX2, NoVLX] in {
 def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
           (VBROADCASTI128 addr:$src)>;
-def : Pat<(v8i32 (X86SubVBroadcast (loadv4i32 addr:$src))),
+def : Pat<(v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src)))),
           (VBROADCASTI128 addr:$src)>;
-def : Pat<(v16i16 (X86SubVBroadcast (loadv8i16 addr:$src))),
+def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
           (VBROADCASTI128 addr:$src)>;
-def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))),
+def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
           (VBROADCASTI128 addr:$src)>;
 }
 
@@ -7340,11 +7292,11 @@ def : Pat<(v8f32 (X86SubVBroadcast (loadv4f32 addr:$src))),
 let Predicates = [HasAVX1Only] in {
 def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
           (VBROADCASTF128 addr:$src)>;
-def : Pat<(v8i32 (X86SubVBroadcast (loadv4i32 addr:$src))),
+def : Pat<(v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src)))),
           (VBROADCASTF128 addr:$src)>;
-def : Pat<(v16i16 (X86SubVBroadcast (loadv8i16 addr:$src))),
+def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
           (VBROADCASTF128 addr:$src)>;
-def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))),
+def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
           (VBROADCASTF128 addr:$src)>;
 }
 
@@ -7377,7 +7329,7 @@ multiclass vinsert_lowering<string InstrStr, ValueType From, ValueType To,
             (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR128:$src2,
                                        (INSERT_get_vinsert128_imm VR256:$ins))>;
   def : Pat<(vinsert128_insert:$ins (To VR256:$src1),
-                                    (From (memop_frag addr:$src2)),
+                                    (From (bitconvert (memop_frag addr:$src2))),
                                     (iPTR imm)),
             (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
                                        (INSERT_get_vinsert128_imm VR256:$ins))>;
@@ -7390,9 +7342,9 @@ let Predicates = [HasAVX, NoVLX] in {
 
 let Predicates = [HasAVX1Only] in {
   defm : vinsert_lowering<"VINSERTF128", v2i64, v4i64,  loadv2i64>;
-  defm : vinsert_lowering<"VINSERTF128", v4i32, v8i32,  loadv4i32>;
-  defm : vinsert_lowering<"VINSERTF128", v8i16, v16i16, loadv8i16>;
-  defm : vinsert_lowering<"VINSERTF128", v16i8, v32i8,  loadv16i8>;
+  defm : vinsert_lowering<"VINSERTF128", v4i32, v8i32,  loadv2i64>;
+  defm : vinsert_lowering<"VINSERTF128", v8i16, v16i16, loadv2i64>;
+  defm : vinsert_lowering<"VINSERTF128", v16i8, v32i8,  loadv2i64>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -7481,7 +7433,7 @@ defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
 
 multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
                       RegisterClass RC, X86MemOperand x86memop_f,
-                      X86MemOperand x86memop_i,
+                      X86MemOperand x86memop_i, PatFrag i_frag,
                       ValueType f_vt, ValueType i_vt,
                       X86FoldableSchedWrite sched,
                       X86FoldableSchedWrite varsched> {
@@ -7495,7 +7447,7 @@ multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
                (ins RC:$src1, x86memop_i:$src2),
                !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1,
-                              (i_vt (load addr:$src2)))))]>, VEX_4V,
+                              (i_vt (bitconvert (i_frag addr:$src2))))))]>, VEX_4V,
                Sched<[varsched.Folded, sched.ReadAfterFold]>;
 
     def ri  : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
@@ -7514,18 +7466,18 @@ multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
 
 let ExeDomain = SSEPackedSingle in {
   defm VPERMILPS  : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
-                               v4f32, v4i32, SchedWriteFShuffle.XMM,
+                               loadv2i64, v4f32, v4i32, SchedWriteFShuffle.XMM,
                                SchedWriteFVarShuffle.XMM>;
   defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
-                               v8f32, v8i32, SchedWriteFShuffle.YMM,
+                               loadv4i64, v8f32, v8i32, SchedWriteFShuffle.YMM,
                                SchedWriteFVarShuffle.YMM>, VEX_L;
 }
 let ExeDomain = SSEPackedDouble in {
   defm VPERMILPD  : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
-                               v2f64, v2i64, SchedWriteFShuffle.XMM,
+                               loadv2i64, v2f64, v2i64, SchedWriteFShuffle.XMM,
                                SchedWriteFVarShuffle.XMM>;
   defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
-                               v4f64, v4i64, SchedWriteFShuffle.YMM,
+                               loadv4i64, v4f64, v4i64, SchedWriteFShuffle.YMM,
                                SchedWriteFVarShuffle.YMM>, VEX_L;
 }
 
@@ -7606,7 +7558,8 @@ multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop,
   let hasSideEffects = 0, mayLoad = 1 in
   def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
              "vcvtph2ps\t{$src, $dst|$dst, $src}",
-             [(set RC:$dst, (X86cvtph2ps (loadv8i16 addr:$src)))]>,
+             [(set RC:$dst, (X86cvtph2ps (bc_v8i16
+                                          (loadv2i64 addr:$src))))]>,
              T8PD, VEX, Sched<[sched.Folded]>;
 }
 
@@ -7680,7 +7633,7 @@ let Predicates = [HasF16C, NoVLX] in {
 /// AVX2_blend_rmi - AVX2 blend with 8-bit immediate
 multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           ValueType OpVT, X86FoldableSchedWrite sched,
-                          RegisterClass RC,
+                          RegisterClass RC, PatFrag memop_frag,
                           X86MemOperand x86memop, SDNodeXForm commuteXForm> {
   let isCommutable = 1 in
   def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
@@ -7694,20 +7647,22 @@ multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
         !strconcat(OpcodeStr,
             "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
         [(set RC:$dst,
-          (OpVT (OpNode RC:$src1, (load addr:$src2), imm:$src3)))]>,
+          (OpVT (OpNode RC:$src1,
+           (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>,
         Sched<[sched.Folded, sched.ReadAfterFold]>, VEX_4V;
 
   // Pattern to commute if load is in first source.
-  def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, imm:$src3)),
+  def : Pat<(OpVT (OpNode (bitconvert (memop_frag addr:$src2)),
+                          RC:$src1, imm:$src3)),
             (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
                                             (commuteXForm imm:$src3))>;
 }
 
 defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32,
-                               SchedWriteBlend.XMM, VR128, i128mem,
+                               SchedWriteBlend.XMM, VR128, loadv2i64, i128mem,
                                BlendCommuteImm4>;
 defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32,
-                                SchedWriteBlend.YMM, VR256, i256mem,
+                                SchedWriteBlend.YMM, VR256, loadv4i64, i256mem,
                                 BlendCommuteImm8>, VEX_L;
 
 // For insertion into the zero index (low half) of a 256-bit vector, it is
@@ -7941,7 +7896,7 @@ let Predicates = [HasAVX1Only] in {
 // VPERM - Permute instructions
 //
 
-multiclass avx2_perm<bits<8> opc, string OpcodeStr,
+multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                      ValueType OpVT, X86FoldableSchedWrite Sched,
                      X86MemOperand memOp> {
   let Predicates = [HasAVX2, NoVLX] in {
@@ -7958,14 +7913,16 @@ multiclass avx2_perm<bits<8> opc, string OpcodeStr,
                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set VR256:$dst,
                        (OpVT (X86VPermv VR256:$src1,
-                              (load addr:$src2))))]>,
+                              (bitconvert (mem_frag addr:$src2)))))]>,
                      Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX_4V, VEX_L;
   }
 }
 
-defm VPERMD : avx2_perm<0x36, "vpermd", v8i32, WriteVarShuffle256, i256mem>;
+defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32, WriteVarShuffle256,
+                        i256mem>;
 let ExeDomain = SSEPackedSingle in
-defm VPERMPS : avx2_perm<0x16, "vpermps", v8f32, WriteFVarShuffle256, f256mem>;
+defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFVarShuffle256,
+                        f256mem>;
 
 multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                          ValueType OpVT, X86FoldableSchedWrite Sched,
@@ -8035,9 +7992,9 @@ def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
 
 let Predicates = [HasAVX2, NoVLX] in {
   defm : vinsert_lowering<"VINSERTI128", v2i64, v4i64,  loadv2i64>;
-  defm : vinsert_lowering<"VINSERTI128", v4i32, v8i32,  loadv4i32>;
-  defm : vinsert_lowering<"VINSERTI128", v8i16, v16i16, loadv8i16>;
-  defm : vinsert_lowering<"VINSERTI128", v16i8, v32i8,  loadv16i8>;
+  defm : vinsert_lowering<"VINSERTI128", v4i32, v8i32,  loadv2i64>;
+  defm : vinsert_lowering<"VINSERTI128", v8i16, v16i16, loadv2i64>;
+  defm : vinsert_lowering<"VINSERTI128", v16i8, v32i8,  loadv2i64>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -8196,7 +8153,7 @@ multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set VR128:$dst,
                (vt128 (OpNode VR128:$src1,
-                       (vt128 (load addr:$src2)))))]>,
+                       (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>,
              VEX_4V, Sched<[SchedWriteVarVecShift.XMM.Folded,
                             SchedWriteVarVecShift.XMM.ReadAfterFold]>;
   def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
@@ -8210,7 +8167,7 @@ multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set VR256:$dst,
                (vt256 (OpNode VR256:$src1,
-                       (vt256 (load addr:$src2)))))]>,
+                       (vt256 (bitconvert (loadv4i64 addr:$src2))))))]>,
              VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded,
                                    SchedWriteVarVecShift.YMM.ReadAfterFold]>;
 }
@@ -8224,11 +8181,13 @@ let Predicates = [HasAVX2, NoVLX] in {
 
   def : Pat<(v4i32 (X86vsrav VR128:$src1, VR128:$src2)),
             (VPSRAVDrr VR128:$src1, VR128:$src2)>;
-  def : Pat<(v4i32 (X86vsrav VR128:$src1, (load addr:$src2))),
+  def : Pat<(v4i32 (X86vsrav VR128:$src1,
+                    (bitconvert (loadv2i64 addr:$src2)))),
             (VPSRAVDrm VR128:$src1, addr:$src2)>;
   def : Pat<(v8i32 (X86vsrav VR256:$src1, VR256:$src2)),
             (VPSRAVDYrr VR256:$src1, VR256:$src2)>;
-  def : Pat<(v8i32 (X86vsrav VR256:$src1, (load addr:$src2))),
+  def : Pat<(v8i32 (X86vsrav VR256:$src1,
+                    (bitconvert (loadv4i64 addr:$src2)))),
             (VPSRAVDYrm VR256:$src1, addr:$src2)>;
 }
 
@@ -8310,7 +8269,7 @@ multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT,
 
     def rm : PDI<0xCF, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2), "",
                  [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1,
-                                 (MemOpFrag addr:$src2))))]>,
+                                 (bitconvert (MemOpFrag addr:$src2)))))]>,
              Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>, T8PD;
   }
 }
@@ -8328,7 +8287,7 @@ multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT,
   def rmi : Ii8<Op, MRMSrcMem, (outs RC:$dst),
               (ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "",
               [(set RC:$dst, (OpVT (OpNode RC:$src1,
-                                    (MemOpFrag addr:$src2),
+                                    (bitconvert (MemOpFrag addr:$src2)),
                               imm:$src3)))], SSEPackedInt>,
               Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>;
   }
@@ -8338,24 +8297,24 @@ multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> {
   let Constraints = "$src1 = $dst",
       Predicates  = [HasGFNI, UseSSE2] in
   defm NAME         : GF2P8AFFINE_rmi<Op, OpStr, v16i8, OpNode,
-                                      VR128, load, i128mem, 1>;
+                                      VR128, loadv2i64, i128mem, 1>;
   let Predicates  = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
     defm V##NAME    : GF2P8AFFINE_rmi<Op, "v"##OpStr, v16i8, OpNode, VR128,
-                                      load, i128mem>, VEX_4V, VEX_W;
+                                      loadv2i64, i128mem>, VEX_4V, VEX_W;
     defm V##NAME##Y : GF2P8AFFINE_rmi<Op, "v"##OpStr, v32i8, OpNode, VR256,
-                                      load, i256mem>, VEX_4V, VEX_L, VEX_W;
+                                      loadv4i64, i256mem>, VEX_4V, VEX_L, VEX_W;
   }
 }
 
 // GF2P8MULB
 let Constraints = "$src1 = $dst",
     Predicates  = [HasGFNI, UseSSE2] in
-defm GF2P8MULB      : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memop,
+defm GF2P8MULB      : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memopv2i64,
                                     i128mem, 1>;
 let Predicates  = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
-  defm VGF2P8MULB   : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, load,
+  defm VGF2P8MULB   : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, loadv2i64,
                                    i128mem>, VEX_4V;
-  defm VGF2P8MULBY  : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, load,
+  defm VGF2P8MULBY  : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, loadv4i64,
                                    i256mem>, VEX_4V, VEX_L;
 }
 // GF2P8AFFINEINVQB, GF2P8AFFINEQB
diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td
index 39f50c10ae1..a8013e38e63 100644
--- a/lib/Target/X86/X86InstrXOP.td
+++ b/lib/Target/X86/X86InstrXOP.td
@@ -11,32 +11,32 @@
 //
 //===----------------------------------------------------------------------===//
 
-multiclass xop2op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
+multiclass xop2op<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> {
   def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
            !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
            [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[SchedWritePHAdd.XMM]>;
   def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
            !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-           [(set VR128:$dst, (Int (load addr:$src)))]>, XOP,
+           [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP,
            Sched<[SchedWritePHAdd.XMM.Folded, SchedWritePHAdd.XMM.ReadAfterFold]>;
 }
 
 let ExeDomain = SSEPackedInt in {
-  defm VPHSUBWD  : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd>;
-  defm VPHSUBDQ  : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq>;
-  defm VPHSUBBW  : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw>;
-  defm VPHADDWQ  : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq>;
-  defm VPHADDWD  : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd>;
-  defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq>;
-  defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd>;
-  defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq>;
-  defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw>;
-  defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq>;
-  defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd>;
-  defm VPHADDDQ  : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq>;
-  defm VPHADDBW  : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw>;
-  defm VPHADDBQ  : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq>;
-  defm VPHADDBD  : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd>;
+  defm VPHSUBWD  : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd, loadv2i64>;
+  defm VPHSUBDQ  : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq, loadv2i64>;
+  defm VPHSUBBW  : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw, loadv2i64>;
+  defm VPHADDWQ  : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq, loadv2i64>;
+  defm VPHADDWD  : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd, loadv2i64>;
+  defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq, loadv2i64>;
+  defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd, loadv2i64>;
+  defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq, loadv2i64>;
+  defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw, loadv2i64>;
+  defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq, loadv2i64>;
+  defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd, loadv2i64>;
+  defm VPHADDDQ  : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq, loadv2i64>;
+  defm VPHADDBW  : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw, loadv2i64>;
+  defm VPHADDBQ  : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq, loadv2i64>;
+  defm VPHADDBD  : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd, loadv2i64>;
 }
 
 // Scalar load 2 addr operand instructions
@@ -48,47 +48,47 @@ multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int,
            [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[sched]>;
   def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins memop:$src),
            !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-           [(set VR128:$dst, (Int mem_cpat:$src))]>, XOP,
+           [(set VR128:$dst, (Int (bitconvert mem_cpat:$src)))]>, XOP,
            Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
 multiclass xop2op128<bits<8> opc, string OpcodeStr, Intrinsic Int,
-                     X86FoldableSchedWrite sched> {
+                     PatFrag memop, X86FoldableSchedWrite sched> {
   def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
            !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
            [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[sched]>;
   def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
            !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-           [(set VR128:$dst, (Int (load addr:$src)))]>, XOP,
+           [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP,
            Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
 multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int,
-                     X86FoldableSchedWrite sched> {
+                     PatFrag memop, X86FoldableSchedWrite sched> {
   def Yrr : IXOP<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
            !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
            [(set VR256:$dst, (Int VR256:$src))]>, XOP, VEX_L, Sched<[sched]>;
   def Yrm : IXOP<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
            !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-           [(set VR256:$dst, (Int (load addr:$src)))]>, XOP, VEX_L,
+           [(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP, VEX_L,
            Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
 let ExeDomain = SSEPackedSingle in {
   defm VFRCZSS : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss,
                            ssmem, sse_load_f32, SchedWriteFRnd.Scl>;
-  defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps,
+  defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, loadv4f32,
                            SchedWriteFRnd.XMM>;
-  defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256,
+  defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256, loadv8f32,
                            SchedWriteFRnd.YMM>;
 }
 
 let ExeDomain = SSEPackedDouble in {
   defm VFRCZSD : xop2opsld<0x83, "vfrczsd", int_x86_xop_vfrcz_sd,
                            sdmem, sse_load_f64, SchedWriteFRnd.Scl>;
-  defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd,
+  defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, loadv2f64,
                            SchedWriteFRnd.XMM>;
-  defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256,
+  defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, loadv4f64,
                            SchedWriteFRnd.YMM>;
 }
 
@@ -105,13 +105,13 @@ multiclass xop3op<bits<8> opc, string OpcodeStr, SDNode OpNode,
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [(set VR128:$dst,
               (vt128 (OpNode (vt128 VR128:$src1),
-                             (vt128 (load addr:$src2)))))]>,
+                             (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>,
            XOP_4V, VEX_W, Sched<[sched.Folded, sched.ReadAfterFold]>;
   def mr : IXOP<opc, MRMSrcMem4VOp3, (outs VR128:$dst),
            (ins i128mem:$src1, VR128:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [(set VR128:$dst,
-              (vt128 (OpNode (vt128 (load addr:$src1)),
+              (vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))),
                              (vt128 VR128:$src2))))]>,
              XOP, Sched<[sched.Folded, sched.ReadAfterFold]>;
   // For disassembler
@@ -150,7 +150,7 @@ multiclass xop3opimm<bits<8> opc, string OpcodeStr, SDNode OpNode,
            (ins i128mem:$src1, u8imm:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [(set VR128:$dst,
-              (vt128 (OpNode (vt128 (load addr:$src1)), imm:$src2)))]>,
+              (vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))), imm:$src2)))]>,
            XOP, Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
@@ -181,7 +181,7 @@ multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int,
            !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
            [(set VR128:$dst,
-              (Int VR128:$src1, (load addr:$src2),
+              (Int VR128:$src1, (bitconvert (loadv2i64 addr:$src2)),
               VR128:$src3))]>, XOP_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
@@ -260,7 +260,7 @@ multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128,
              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set VR128:$dst,
                 (vt128 (OpNode (vt128 VR128:$src1),
-                               (vt128 (load addr:$src2)),
+                               (vt128 (bitconvert (loadv2i64 addr:$src2))),
                                 imm:$cc)))]>,
              XOP_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
     let isAsmParserOnly = 1, hasSideEffects = 0 in {
@@ -279,7 +279,7 @@ multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128,
     }
   }
 
-  def : Pat<(OpNode (load addr:$src2),
+  def : Pat<(OpNode (bitconvert (loadv2i64 addr:$src2)),
                     (vt128 VR128:$src1), imm:$cc),
             (!cast<Instruction>(NAME#"mi") VR128:$src1, addr:$src2,
                                            (CommuteVPCOMCC imm:$cc))>;
@@ -310,14 +310,14 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, SDNode OpNode,
             "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
             [(set VR128:$dst,
               (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
-                             (vt128 (load addr:$src3)))))]>,
+                             (vt128 (bitconvert (loadv2i64 addr:$src3))))))]>,
             XOP_4V, VEX_W, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
   def rmr : IXOPi8Reg<opc, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2, VR128:$src3),
             !strconcat(OpcodeStr,
             "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
             [(set VR128:$dst,
-              (v16i8 (OpNode (vt128 VR128:$src1), (vt128 (load addr:$src2)),
+              (v16i8 (OpNode (vt128 VR128:$src1), (vt128 (bitconvert (loadv2i64 addr:$src2))),
                              (vt128 VR128:$src3))))]>,
             XOP_4V, Sched<[sched.Folded, sched.ReadAfterFold,
                            // 128mem:$src2
@@ -401,7 +401,8 @@ multiclass xop_vpermil2<bits<8> Opc, string OpcodeStr, RegisterClass RC,
         !strconcat(OpcodeStr,
         "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
         [(set RC:$dst,
-          (VT (X86vpermil2 RC:$src1, RC:$src2, (IntLdFrag addr:$src3),
+          (VT (X86vpermil2 RC:$src1, RC:$src2,
+                           (bitconvert (IntLdFrag addr:$src3)),
                            (i8 imm:$src4))))]>, VEX_W,
         Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
   def mr : IXOP5<Opc, MRMSrcMem, (outs RC:$dst),
@@ -436,10 +437,10 @@ let ExeDomain = SSEPackedDouble in {
 
 let ExeDomain = SSEPackedSingle in {
   defm VPERMIL2PS : xop_vpermil2<0x48, "vpermil2ps", VR128, i128mem, f128mem,
-                                 v4f32, loadv4f32, loadv4i32,
+                                 v4f32, loadv4f32, loadv2i64,
                                  SchedWriteFVarShuffle.XMM>;
   defm VPERMIL2PSY : xop_vpermil2<0x48, "vpermil2ps", VR256, i256mem, f256mem,
-                                  v8f32, loadv8f32, loadv8i32,
+                                  v8f32, loadv8f32, loadv4i64,
                                   SchedWriteFVarShuffle.YMM>, VEX_L;
 }
 
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index b5fd9f4a785..58b1c505944 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -1391,7 +1391,7 @@ static const Constant *getConstantFromPool(const MachineInstr &MI,
   if (ConstantEntry.isMachineConstantPoolEntry())
     return nullptr;
 
-  const Constant *C = ConstantEntry.Val.ConstVal;
+  auto *C = dyn_cast<Constant>(ConstantEntry.Val.ConstVal);
   assert((!C || ConstantEntry.getType() == C->getType()) &&
          "Expected a constant of the same type!");
   return C;
@@ -1594,18 +1594,6 @@ void X86AsmPrinter::EmitSEHInstruction(const MachineInstr *MI) {
   }
 }
 
-static unsigned getRegisterWidth(const MCOperandInfo &Info) {
-  if (Info.RegClass == X86::VR128RegClassID ||
-      Info.RegClass == X86::VR128XRegClassID)
-    return 128;
-  if (Info.RegClass == X86::VR256RegClassID ||
-      Info.RegClass == X86::VR256XRegClassID)
-    return 256;
-  if (Info.RegClass == X86::VR512RegClassID)
-    return 512;
-  llvm_unreachable("Unknown register class!");
-}
-
 void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
   X86MCInstLower MCInstLowering(*MF, *this);
   const X86RegisterInfo *RI =
@@ -1891,9 +1879,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
 
     const MachineOperand &MaskOp = MI->getOperand(MaskIdx);
     if (auto *C = getConstantFromPool(*MI, MaskOp)) {
-      unsigned Width = getRegisterWidth(MI->getDesc().OpInfo[0]);
       SmallVector<int, 64> Mask;
-      DecodePSHUFBMask(C, Width, Mask);
+      DecodePSHUFBMask(C, Mask);
       if (!Mask.empty())
         OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask),
                                 !EnablePrintSchedInfo);
@@ -1964,9 +1951,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
 
     const MachineOperand &MaskOp = MI->getOperand(MaskIdx);
     if (auto *C = getConstantFromPool(*MI, MaskOp)) {
-      unsigned Width = getRegisterWidth(MI->getDesc().OpInfo[0]);
       SmallVector<int, 16> Mask;
-      DecodeVPERMILPMask(C, ElSize, Width, Mask);
+      DecodeVPERMILPMask(C, ElSize, Mask);
       if (!Mask.empty())
         OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask),
                                 !EnablePrintSchedInfo);
@@ -1996,9 +1982,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
 
     const MachineOperand &MaskOp = MI->getOperand(6);
     if (auto *C = getConstantFromPool(*MI, MaskOp)) {
-      unsigned Width = getRegisterWidth(MI->getDesc().OpInfo[0]);
       SmallVector<int, 16> Mask;
-      DecodeVPERMIL2PMask(C, (unsigned)CtrlOp.getImm(), ElSize, Width, Mask);
+      DecodeVPERMIL2PMask(C, (unsigned)CtrlOp.getImm(), ElSize, Mask);
       if (!Mask.empty())
         OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask),
                                 !EnablePrintSchedInfo);
@@ -2014,9 +1999,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
 
     const MachineOperand &MaskOp = MI->getOperand(6);
     if (auto *C = getConstantFromPool(*MI, MaskOp)) {
-      unsigned Width = getRegisterWidth(MI->getDesc().OpInfo[0]);
       SmallVector<int, 16> Mask;
-      DecodeVPPERMMask(C, Width, Mask);
+      DecodeVPPERMMask(C, Mask);
       if (!Mask.empty())
         OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask),
                                 !EnablePrintSchedInfo);
diff --git a/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp b/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
index 720be8afa62..c7ddf93f8e8 100644
--- a/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
+++ b/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
@@ -112,10 +112,11 @@ static bool extractConstantMask(const Constant *C, unsigned MaskEltSizeInBits,
   return true;
 }
 
-void DecodePSHUFBMask(const Constant *C, unsigned Width,
-                      SmallVectorImpl<int> &ShuffleMask) {
-  assert((Width == 128 || Width == 256 || Width == 512) &&
-         C->getType()->getPrimitiveSizeInBits() >= Width &&
+void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
+  Type *MaskTy = C->getType();
+  unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
+  (void)MaskTySize;
+  assert((MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512) &&
          "Unexpected vector size.");
 
   // The shuffle mask requires a byte vector.
@@ -124,7 +125,7 @@ void DecodePSHUFBMask(const Constant *C, unsigned Width,
   if (!extractConstantMask(C, 8, UndefElts, RawMask))
     return;
 
-  unsigned NumElts = Width / 8;
+  unsigned NumElts = RawMask.size();
   assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
          "Unexpected number of vector elements.");
 
@@ -150,10 +151,12 @@ void DecodePSHUFBMask(const Constant *C, unsigned Width,
   }
 }
 
-void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, unsigned Width,
+void DecodeVPERMILPMask(const Constant *C, unsigned ElSize,
                         SmallVectorImpl<int> &ShuffleMask) {
-  assert((Width == 128 || Width == 256 || Width == 512) &&
-         C->getType()->getPrimitiveSizeInBits() >= Width &&
+  Type *MaskTy = C->getType();
+  unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
+  (void)MaskTySize;
+  assert((MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512) &&
          "Unexpected vector size.");
   assert((ElSize == 32 || ElSize == 64) && "Unexpected vector element size.");
 
@@ -163,7 +166,7 @@ void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, unsigned Width,
   if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
     return;
 
-  unsigned NumElts = Width / ElSize;
+  unsigned NumElts = RawMask.size();
   unsigned NumEltsPerLane = 128 / ElSize;
   assert((NumElts == 2 || NumElts == 4 || NumElts == 8 || NumElts == 16) &&
          "Unexpected number of vector elements.");
@@ -186,13 +189,11 @@ void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, unsigned Width,
 }
 
 void DecodeVPERMIL2PMask(const Constant *C, unsigned M2Z, unsigned ElSize,
-                         unsigned Width,
                          SmallVectorImpl<int> &ShuffleMask) {
   Type *MaskTy = C->getType();
   unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
   (void)MaskTySize;
-  assert((MaskTySize == 128 || MaskTySize == 256) &&
-         Width >= MaskTySize && "Unexpected vector size.");
+  assert((MaskTySize == 128 || MaskTySize == 256) && "Unexpected vector size.");
 
   // The shuffle mask requires elements the same size as the target.
   APInt UndefElts;
@@ -200,7 +201,7 @@ void DecodeVPERMIL2PMask(const Constant *C, unsigned M2Z, unsigned ElSize,
   if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
     return;
 
-  unsigned NumElts = Width / ElSize;
+  unsigned NumElts = RawMask.size();
   unsigned NumEltsPerLane = 128 / ElSize;
   assert((NumElts == 2 || NumElts == 4 || NumElts == 8) &&
          "Unexpected number of vector elements.");
@@ -241,12 +242,9 @@ void DecodeVPERMIL2PMask(const Constant *C, unsigned M2Z, unsigned ElSize,
   }
 }
 
-void DecodeVPPERMMask(const Constant *C, unsigned Width,
-                      SmallVectorImpl<int> &ShuffleMask) {
-  Type *MaskTy = C->getType();
-  unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
-  (void)MaskTySize;
-  assert(Width == 128 && Width >= MaskTySize && "Unexpected vector size.");
+void DecodeVPPERMMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
+  assert(C->getType()->getPrimitiveSizeInBits() == 128 &&
+         "Unexpected vector size.");
 
   // The shuffle mask requires a byte vector.
   APInt UndefElts;
@@ -254,7 +252,7 @@ void DecodeVPPERMMask(const Constant *C, unsigned Width,
   if (!extractConstantMask(C, 8, UndefElts, RawMask))
     return;
 
-  unsigned NumElts = Width / 8;
+  unsigned NumElts = RawMask.size();
   assert(NumElts == 16 && "Unexpected number of vector elements.");
 
   for (unsigned i = 0; i != NumElts; ++i) {
@@ -293,10 +291,12 @@ void DecodeVPPERMMask(const Constant *C, unsigned Width,
   }
 }
 
-void DecodeVPERMVMask(const Constant *C, unsigned ElSize, unsigned Width,
+void DecodeVPERMVMask(const Constant *C, unsigned ElSize,
                       SmallVectorImpl<int> &ShuffleMask) {
-  assert((Width == 128 || Width == 256 || Width == 512) &&
-         C->getType()->getPrimitiveSizeInBits() >= Width &&
+  Type *MaskTy = C->getType();
+  unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
+  (void)MaskTySize;
+  assert((MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512) &&
          "Unexpected vector size.");
   assert((ElSize == 8 || ElSize == 16 || ElSize == 32 || ElSize == 64) &&
          "Unexpected vector element size.");
@@ -307,7 +307,7 @@ void DecodeVPERMVMask(const Constant *C, unsigned ElSize, unsigned Width,
   if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
     return;
 
-  unsigned NumElts = Width / ElSize;
+  unsigned NumElts = RawMask.size();
 
   for (unsigned i = 0; i != NumElts; ++i) {
     if (UndefElts[i]) {
@@ -319,10 +319,12 @@ void DecodeVPERMVMask(const Constant *C, unsigned ElSize, unsigned Width,
   }
 }
 
-void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize, unsigned Width,
+void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize,
                        SmallVectorImpl<int> &ShuffleMask) {
-  assert((Width == 128 || Width == 256 || Width == 512) &&
-         C->getType()->getPrimitiveSizeInBits() >= Width &&
+  Type *MaskTy = C->getType();
+  unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
+  (void)MaskTySize;
+  assert((MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512) &&
          "Unexpected vector size.");
   assert((ElSize == 8 || ElSize == 16 || ElSize == 32 || ElSize == 64) &&
          "Unexpected vector element size.");
@@ -333,7 +335,7 @@ void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize, unsigned Width,
   if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
     return;
 
-  unsigned NumElts = Width / ElSize;
+  unsigned NumElts = RawMask.size();
 
   for (unsigned i = 0; i != NumElts; ++i) {
     if (UndefElts[i]) {
diff --git a/lib/Target/X86/X86ShuffleDecodeConstantPool.h b/lib/Target/X86/X86ShuffleDecodeConstantPool.h
index b08c31935d2..b703cbbd2b2 100644
--- a/lib/Target/X86/X86ShuffleDecodeConstantPool.h
+++ b/lib/Target/X86/X86ShuffleDecodeConstantPool.h
@@ -26,28 +26,25 @@ class Constant;
 class MVT;
 
 /// Decode a PSHUFB mask from an IR-level vector constant.
-void DecodePSHUFBMask(const Constant *C, unsigned Width,
-                      SmallVectorImpl<int> &ShuffleMask);
+void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask);
 
 /// Decode a VPERMILP variable mask from an IR-level vector constant.
-void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, unsigned Width,
+void DecodeVPERMILPMask(const Constant *C, unsigned ElSize,
                         SmallVectorImpl<int> &ShuffleMask);
 
 /// Decode a VPERMILP2 variable mask from an IR-level vector constant.
 void DecodeVPERMIL2PMask(const Constant *C, unsigned MatchImm, unsigned ElSize,
-                         unsigned Width,
                          SmallVectorImpl<int> &ShuffleMask);
 
 /// Decode a VPPERM variable mask from an IR-level vector constant.
-void DecodeVPPERMMask(const Constant *C, unsigned Width,
-                      SmallVectorImpl<int> &ShuffleMask);
+void DecodeVPPERMMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask);
 
 /// Decode a VPERM W/D/Q/PS/PD mask from an IR-level vector constant.
-void DecodeVPERMVMask(const Constant *C, unsigned ElSize, unsigned Width,
+void DecodeVPERMVMask(const Constant *C, unsigned ElSize,
                       SmallVectorImpl<int> &ShuffleMask);
 
 /// Decode a VPERMT2 W/D/Q/PS/PD mask from an IR-level vector constant.
-void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize, unsigned Width,
+void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize,
                        SmallVectorImpl<int> &ShuffleMask);
 
 } // llvm namespace
diff --git a/test/CodeGen/X86/avx-vperm2x128.ll b/test/CodeGen/X86/avx-vperm2x128.ll
index 0c501ea6895..75a11845b1e 100644
--- a/test/CodeGen/X86/avx-vperm2x128.ll
+++ b/test/CodeGen/X86/avx-vperm2x128.ll
@@ -224,7 +224,7 @@ entry:
 define <16 x i16> @shuffle_v16i16_4501_mem(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readnone ssp {
 ; AVX1-LABEL: shuffle_v16i16_4501_mem:
 ; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[0,1],ymm0[0,1]
diff --git a/test/CodeGen/X86/oddshuffles.ll b/test/CodeGen/X86/oddshuffles.ll
index 9216cad5882..6affef33932 100644
--- a/test/CodeGen/X86/oddshuffles.ll
+++ b/test/CodeGen/X86/oddshuffles.ll
@@ -1630,7 +1630,7 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2,
 ; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7]
 ; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3]
 ; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7]
-; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm5 = ymm0[0,3,3,3]
+; AVX2-SLOW-NEXT:    vbroadcastsd 24(%rsi), %ymm5
 ; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
 ; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
 ; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2]
@@ -1654,19 +1654,19 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2,
 ; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
 ; AVX2-FAST-NEXT:    vbroadcastsd %xmm2, %ymm4
 ; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
-; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm4 = ymm0[1,1,2,2]
-; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm5 = ymm2[1,1,2,2]
-; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7]
-; AVX2-FAST-NEXT:    vpermilps {{.*#+}} ymm5 = ymm1[0,0,3,3,4,4,7,7]
-; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
-; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm5 = [5,6,5,6,5,6,7,7]
-; AVX2-FAST-NEXT:    vpermps %ymm1, %ymm5, %ymm1
+; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
+; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm4 = ymm2[1,1,2,2]
+; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3],ymm0[4],ymm4[5,6],ymm0[7]
+; AVX2-FAST-NEXT:    vpermilps {{.*#+}} ymm4 = ymm1[0,0,3,3,4,4,7,7]
+; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7]
+; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm4 = [5,6,5,6,5,6,7,7]
+; AVX2-FAST-NEXT:    vpermps %ymm1, %ymm4, %ymm1
 ; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3]
 ; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
-; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,3,3]
-; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX2-FAST-NEXT:    vmovups %ymm0, 64(%rdi)
-; AVX2-FAST-NEXT:    vmovups %ymm4, 32(%rdi)
+; AVX2-FAST-NEXT:    vbroadcastsd 24(%rsi), %ymm2
+; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
+; AVX2-FAST-NEXT:    vmovups %ymm1, 64(%rdi)
+; AVX2-FAST-NEXT:    vmovups %ymm0, 32(%rdi)
 ; AVX2-FAST-NEXT:    vmovups %ymm3, (%rdi)
 ; AVX2-FAST-NEXT:    vzeroupper
 ; AVX2-FAST-NEXT:    retq
diff --git a/test/CodeGen/X86/pshufb-mask-comments.ll b/test/CodeGen/X86/pshufb-mask-comments.ll
index d0ed99f92f3..0900fdccb49 100644
--- a/test/CodeGen/X86/pshufb-mask-comments.ll
+++ b/test/CodeGen/X86/pshufb-mask-comments.ll
@@ -57,9 +57,9 @@ define <16 x i8> @test5(<16 x i8> %V) {
 ; CHECK-NEXT:    movl $1, %eax
 ; CHECK-NEXT:    movq %rax, %xmm1
 ; CHECK-NEXT:    movdqa %xmm1, (%rax)
-; CHECK-NEXT:    movaps {{.*#+}} xmm1 = [1,1]
-; CHECK-NEXT:    movaps %xmm1, (%rax)
-; CHECK-NEXT:    pshufb (%rax), %xmm0
+; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [1,1]
+; CHECK-NEXT:    movdqa %xmm1, (%rax)
+; CHECK-NEXT:    pshufb %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   store <2 x i64> <i64 1, i64 0>, <2 x i64>* undef, align 16
   %l = load <2 x i64>, <2 x i64>* undef, align 16
diff --git a/test/CodeGen/X86/vector-extend-inreg.ll b/test/CodeGen/X86/vector-extend-inreg.ll
index d790cb54b61..86bb13f57eb 100644
--- a/test/CodeGen/X86/vector-extend-inreg.ll
+++ b/test/CodeGen/X86/vector-extend-inreg.ll
@@ -13,7 +13,6 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) noun
 ; X32-SSE-NEXT:    subl $384, %esp # imm = 0x180
 ; X32-SSE-NEXT:    movl 88(%ebp), %ecx
 ; X32-SSE-NEXT:    movdqa 72(%ebp), %xmm0
-; X32-SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
 ; X32-SSE-NEXT:    xorps %xmm1, %xmm1
 ; X32-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X32-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
@@ -22,6 +21,7 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) noun
 ; X32-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X32-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X32-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
 ; X32-SSE-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp)
 ; X32-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X32-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
diff --git a/test/CodeGen/X86/vector-idiv-v2i32.ll b/test/CodeGen/X86/vector-idiv-v2i32.ll
index 00126d67532..49e29ac17a5 100644
--- a/test/CodeGen/X86/vector-idiv-v2i32.ll
+++ b/test/CodeGen/X86/vector-idiv-v2i32.ll
@@ -693,20 +693,20 @@ define void @test_sdiv_pow2_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind {
 ; X86-NEXT:    movdqa %xmm0, %xmm1
 ; X86-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X86-NEXT:    movdqa {{.*#+}} xmm2 = [31,0,31,0]
-; X86-NEXT:    movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
-; X86-NEXT:    movdqa %xmm3, %xmm4
-; X86-NEXT:    psrlq %xmm2, %xmm4
+; X86-NEXT:    movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
+; X86-NEXT:    movdqa {{.*#+}} xmm3 = [31,0,31,0]
+; X86-NEXT:    movdqa %xmm2, %xmm4
+; X86-NEXT:    psrlq %xmm3, %xmm4
 ; X86-NEXT:    movl $31, %ecx
 ; X86-NEXT:    movd %ecx, %xmm5
-; X86-NEXT:    psrlq %xmm5, %xmm3
-; X86-NEXT:    movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
+; X86-NEXT:    psrlq %xmm5, %xmm2
+; X86-NEXT:    movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1]
 ; X86-NEXT:    movdqa %xmm1, %xmm4
-; X86-NEXT:    psrlq %xmm2, %xmm4
+; X86-NEXT:    psrlq %xmm3, %xmm4
 ; X86-NEXT:    psrlq %xmm5, %xmm1
 ; X86-NEXT:    movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1]
-; X86-NEXT:    xorpd %xmm3, %xmm1
-; X86-NEXT:    psubq %xmm3, %xmm1
+; X86-NEXT:    xorpd %xmm2, %xmm1
+; X86-NEXT:    psubq %xmm2, %xmm1
 ; X86-NEXT:    pand {{\.LCPI.*}}, %xmm1
 ; X86-NEXT:    psrlq $29, %xmm1
 ; X86-NEXT:    paddq %xmm0, %xmm1
diff --git a/test/CodeGen/X86/widened-broadcast.ll b/test/CodeGen/X86/widened-broadcast.ll
index 167128ae002..ce99d22dbbd 100644
--- a/test/CodeGen/X86/widened-broadcast.ll
+++ b/test/CodeGen/X86/widened-broadcast.ll
@@ -121,21 +121,10 @@ define <8 x i32> @load_splat_8i32_4i32_01010101(<4 x i32>* %ptr) nounwind uwtabl
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: load_splat_8i32_4i32_01010101:
-; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,1,0,1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: load_splat_8i32_4i32_01010101:
-; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: load_splat_8i32_4i32_01010101:
-; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vbroadcastsd (%rdi), %ymm0
-; AVX512-NEXT:    retq
+; AVX-LABEL: load_splat_8i32_4i32_01010101:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vbroadcastsd (%rdi), %ymm0
+; AVX-NEXT:    retq
 entry:
   %ld = load <4 x i32>, <4 x i32>* %ptr
   %ret = shufflevector <4 x i32> %ld, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -149,10 +138,21 @@ define <8 x i32> @load_splat_8i32_8i32_01010101(<8 x i32>* %ptr) nounwind uwtabl
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: load_splat_8i32_8i32_01010101:
-; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vbroadcastsd (%rdi), %ymm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: load_splat_8i32_8i32_01010101:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_splat_8i32_8i32_01010101:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: load_splat_8i32_8i32_01010101:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vbroadcastsd (%rdi), %ymm0
+; AVX512-NEXT:    retq
 entry:
   %ld = load <8 x i32>, <8 x i32>* %ptr
   %ret = shufflevector <8 x i32> %ld, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -246,21 +246,10 @@ define <16 x i16> @load_splat_16i16_8i16_0123012301230123(<8 x i16>* %ptr) nounw
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: load_splat_16i16_8i16_0123012301230123:
-; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,1,0,1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: load_splat_16i16_8i16_0123012301230123:
-; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: load_splat_16i16_8i16_0123012301230123:
-; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vbroadcastsd (%rdi), %ymm0
-; AVX512-NEXT:    retq
+; AVX-LABEL: load_splat_16i16_8i16_0123012301230123:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vbroadcastsd (%rdi), %ymm0
+; AVX-NEXT:    retq
 entry:
   %ld = load <8 x i16>, <8 x i16>* %ptr
   %ret = shufflevector <8 x i16> %ld, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3,i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
@@ -274,10 +263,21 @@ define <16 x i16> @load_splat_16i16_16i16_0101010101010101(<16 x i16>* %ptr) nou
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: load_splat_16i16_16i16_0101010101010101:
-; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vbroadcastss (%rdi), %ymm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: load_splat_16i16_16i16_0101010101010101:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_splat_16i16_16i16_0101010101010101:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vbroadcastss (%rdi), %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: load_splat_16i16_16i16_0101010101010101:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vbroadcastss (%rdi), %ymm0
+; AVX512-NEXT:    retq
 entry:
   %ld = load <16 x i16>, <16 x i16>* %ptr
   %ret = shufflevector <16 x i16> %ld, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -446,21 +446,10 @@ define <32 x i8> @load_splat_32i8_16i8_01234567012345670123456701234567(<16 x i8
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: load_splat_32i8_16i8_01234567012345670123456701234567:
-; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,1,0,1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: load_splat_32i8_16i8_01234567012345670123456701234567:
-; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: load_splat_32i8_16i8_01234567012345670123456701234567:
-; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vbroadcastsd (%rdi), %ymm0
-; AVX512-NEXT:    retq
+; AVX-LABEL: load_splat_32i8_16i8_01234567012345670123456701234567:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vbroadcastsd (%rdi), %ymm0
+; AVX-NEXT:    retq
 entry:
   %ld = load <16 x i8>, <16 x i8>* %ptr
   %ret = shufflevector <16 x i8> %ld, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-- 
GitLab


From e234be51dae0bef7fa94eb1c989fa9a3f9021d1d Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Mon, 22 Oct 2018 17:10:47 +0000
Subject: [PATCH 0391/1116] [llvm-exegesis] Move namespace exegesis inside
 llvm::

Summary:
This allows simplifying references of llvm::foo with foo when the needs
come in the future.

Reviewers: courbet, gchatelet

Reviewed By: gchatelet

Subscribers: javed.absar, tschuett, llvm-commits

Differential Revision: https://reviews.llvm.org/D53455

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344922 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-exegesis/lib/AArch64/Target.cpp    |   2 +
 tools/llvm-exegesis/lib/Analysis.cpp          |   2 +
 tools/llvm-exegesis/lib/Analysis.h            |   2 +
 tools/llvm-exegesis/lib/Assembler.cpp         |   2 +
 tools/llvm-exegesis/lib/Assembler.h           |   2 +
 tools/llvm-exegesis/lib/BenchmarkCode.h       |   2 +
 tools/llvm-exegesis/lib/BenchmarkResult.cpp   |  10 +-
 tools/llvm-exegesis/lib/BenchmarkResult.h     |   2 +
 tools/llvm-exegesis/lib/BenchmarkRunner.cpp   |   2 +
 tools/llvm-exegesis/lib/BenchmarkRunner.h     |   2 +
 tools/llvm-exegesis/lib/Clustering.cpp        |   2 +
 tools/llvm-exegesis/lib/Clustering.h          |   2 +
 tools/llvm-exegesis/lib/CodeTemplate.cpp      |   2 +
 tools/llvm-exegesis/lib/CodeTemplate.h        |   2 +
 tools/llvm-exegesis/lib/Latency.cpp           |   2 +
 tools/llvm-exegesis/lib/Latency.h             |   2 +
 tools/llvm-exegesis/lib/LlvmState.cpp         |   2 +
 tools/llvm-exegesis/lib/LlvmState.h           |   2 +
 tools/llvm-exegesis/lib/MCInstrDescView.cpp   |   2 +
 tools/llvm-exegesis/lib/MCInstrDescView.h     |   2 +
 tools/llvm-exegesis/lib/PerfHelper.cpp        |   2 +
 tools/llvm-exegesis/lib/PerfHelper.h          |   2 +
 tools/llvm-exegesis/lib/RegisterAliasing.cpp  |   2 +
 tools/llvm-exegesis/lib/RegisterAliasing.h    |   2 +
 tools/llvm-exegesis/lib/RegisterValue.cpp     |   2 +
 tools/llvm-exegesis/lib/RegisterValue.h       |   2 +
 tools/llvm-exegesis/lib/SnippetGenerator.cpp  |   2 +
 tools/llvm-exegesis/lib/SnippetGenerator.h    |   2 +
 tools/llvm-exegesis/lib/Target.cpp            |   2 +
 tools/llvm-exegesis/lib/Target.h              |   2 +
 tools/llvm-exegesis/lib/Uops.cpp              |   2 +
 tools/llvm-exegesis/lib/Uops.h                |   2 +
 tools/llvm-exegesis/lib/X86/Target.cpp        |   2 +
 tools/llvm-exegesis/llvm-exegesis.cpp         | 100 +++++++++---------
 .../llvm-exegesis/AArch64/TargetTest.cpp      |   2 +
 .../tools/llvm-exegesis/ARM/AssemblerTest.cpp |   2 +
 .../llvm-exegesis/BenchmarkRunnerTest.cpp     |   2 +
 .../tools/llvm-exegesis/ClusteringTest.cpp    |   2 +
 .../llvm-exegesis/Common/AssemblerUtils.h     |   2 +
 .../tools/llvm-exegesis/PerfHelperTest.cpp    |   2 +
 .../tools/llvm-exegesis/RegisterValueTest.cpp |   2 +
 .../tools/llvm-exegesis/X86/AnalysisTest.cpp  |   2 +
 .../tools/llvm-exegesis/X86/AssemblerTest.cpp |   2 +
 .../llvm-exegesis/X86/BenchmarkResultTest.cpp |   2 +
 .../X86/RegisterAliasingTest.cpp              |   2 +
 .../X86/SnippetGeneratorTest.cpp              |   2 +
 .../tools/llvm-exegesis/X86/TargetTest.cpp    |   2 +
 47 files changed, 148 insertions(+), 52 deletions(-)

diff --git a/tools/llvm-exegesis/lib/AArch64/Target.cpp b/tools/llvm-exegesis/lib/AArch64/Target.cpp
index 90c5927ad29..be8f0b41ede 100644
--- a/tools/llvm-exegesis/lib/AArch64/Target.cpp
+++ b/tools/llvm-exegesis/lib/AArch64/Target.cpp
@@ -11,6 +11,7 @@
 #include "AArch64.h"
 #include "AArch64RegisterInfo.h"
 
+namespace llvm {
 namespace exegesis {
 
 namespace {
@@ -90,3 +91,4 @@ void InitializeAArch64ExegesisTarget() {
 }
 
 } // namespace exegesis
+} // namespace llvm
diff --git a/tools/llvm-exegesis/lib/Analysis.cpp b/tools/llvm-exegesis/lib/Analysis.cpp
index 73c54f53225..0dd6bcbd466 100644
--- a/tools/llvm-exegesis/lib/Analysis.cpp
+++ b/tools/llvm-exegesis/lib/Analysis.cpp
@@ -15,6 +15,7 @@
 #include <unordered_set>
 #include <vector>
 
+namespace llvm {
 namespace exegesis {
 
 static const char kCsvSep = ',';
@@ -796,3 +797,4 @@ std::vector<std::pair<uint16_t, float>> computeIdealizedProcResPressure(
 }
 
 } // namespace exegesis
+} // namespace llvm
diff --git a/tools/llvm-exegesis/lib/Analysis.h b/tools/llvm-exegesis/lib/Analysis.h
index a65a2f1b1da..9ee1493f4e0 100644
--- a/tools/llvm-exegesis/lib/Analysis.h
+++ b/tools/llvm-exegesis/lib/Analysis.h
@@ -30,6 +30,7 @@
 #include <string>
 #include <unordered_map>
 
+namespace llvm {
 namespace exegesis {
 
 // A helper class to analyze benchmark results for a target.
@@ -135,5 +136,6 @@ std::vector<std::pair<uint16_t, float>> computeIdealizedProcResPressure(
     llvm::SmallVector<llvm::MCWriteProcResEntry, 8> WPRS);
 
 } // namespace exegesis
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_EXEGESIS_CLUSTERING_H
diff --git a/tools/llvm-exegesis/lib/Assembler.cpp b/tools/llvm-exegesis/lib/Assembler.cpp
index 1576cfe58e0..771a6e9ad24 100644
--- a/tools/llvm-exegesis/lib/Assembler.cpp
+++ b/tools/llvm-exegesis/lib/Assembler.cpp
@@ -23,6 +23,7 @@
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/Support/MemoryBuffer.h"
 
+namespace llvm {
 namespace exegesis {
 
 static constexpr const char ModuleID[] = "ExegesisInfoTest";
@@ -295,3 +296,4 @@ ExecutableFunction::ExecutableFunction(
 }
 
 } // namespace exegesis
+} // namespace llvm
diff --git a/tools/llvm-exegesis/lib/Assembler.h b/tools/llvm-exegesis/lib/Assembler.h
index f2a77168cb7..ee6bc86f378 100644
--- a/tools/llvm-exegesis/lib/Assembler.h
+++ b/tools/llvm-exegesis/lib/Assembler.h
@@ -32,6 +32,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetMachine.h"
 
+namespace llvm {
 namespace exegesis {
 
 class ExegesisTarget;
@@ -82,5 +83,6 @@ struct ExecutableFunction {
 };
 
 } // namespace exegesis
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_EXEGESIS_ASSEMBLER_H
diff --git a/tools/llvm-exegesis/lib/BenchmarkCode.h b/tools/llvm-exegesis/lib/BenchmarkCode.h
index b10dca5c25e..38bea2519a6 100644
--- a/tools/llvm-exegesis/lib/BenchmarkCode.h
+++ b/tools/llvm-exegesis/lib/BenchmarkCode.h
@@ -15,6 +15,7 @@
 #include <string>
 #include <vector>
 
+namespace llvm {
 namespace exegesis {
 
 // A collection of instructions that are to be assembled, executed and measured.
@@ -35,5 +36,6 @@ struct BenchmarkCode {
 };
 
 } // namespace exegesis
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_EXEGESIS_BENCHMARKCODE_H
diff --git a/tools/llvm-exegesis/lib/BenchmarkResult.cpp b/tools/llvm-exegesis/lib/BenchmarkResult.cpp
index 5d4912ea407..4b91c6c3b3c 100644
--- a/tools/llvm-exegesis/lib/BenchmarkResult.cpp
+++ b/tools/llvm-exegesis/lib/BenchmarkResult.cpp
@@ -22,6 +22,10 @@ static constexpr const char kIntegerPrefix[] = "i_0x";
 static constexpr const char kDoublePrefix[] = "f_";
 static constexpr const char kInvalidOperand[] = "INVALID";
 
+namespace llvm {
+
+namespace {
+
 // A mutable struct holding an LLVMState that can be passed through the
 // serialization process to encode/decode registers and instructions.
 struct YamlContext {
@@ -141,13 +145,13 @@ private:
     return 0;
   }
 
-  const exegesis::LLVMState *State;
+  const llvm::exegesis::LLVMState *State;
   std::string LastError;
   llvm::raw_string_ostream ErrorStream;
 };
+} // namespace
 
 // Defining YAML traits for IO.
-namespace llvm {
 namespace yaml {
 
 static YamlContext &getTypedContext(void *Ctx) {
@@ -294,7 +298,6 @@ struct MappingContextTraits<exegesis::InstructionBenchmark, YamlContext> {
 };
 
 } // namespace yaml
-} // namespace llvm
 
 namespace exegesis {
 
@@ -384,3 +387,4 @@ void PerInstructionStats::push(const BenchmarkMeasure &BM) {
 }
 
 } // namespace exegesis
+} // namespace llvm
diff --git a/tools/llvm-exegesis/lib/BenchmarkResult.h b/tools/llvm-exegesis/lib/BenchmarkResult.h
index 961c07b99dd..773a2e50abc 100644
--- a/tools/llvm-exegesis/lib/BenchmarkResult.h
+++ b/tools/llvm-exegesis/lib/BenchmarkResult.h
@@ -28,6 +28,7 @@
 #include <unordered_map>
 #include <vector>
 
+namespace llvm {
 namespace exegesis {
 
 struct InstructionBenchmarkKey {
@@ -111,5 +112,6 @@ private:
 };
 
 } // namespace exegesis
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_EXEGESIS_BENCHMARKRESULT_H
diff --git a/tools/llvm-exegesis/lib/BenchmarkRunner.cpp b/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
index 4eb5f1e880c..0d372363927 100644
--- a/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
+++ b/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
@@ -22,6 +22,7 @@
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Program.h"
 
+namespace llvm {
 namespace exegesis {
 
 BenchmarkFailure::BenchmarkFailure(const llvm::Twine &S)
@@ -173,3 +174,4 @@ BenchmarkRunner::writeObjectFile(const BenchmarkCode &BC,
 BenchmarkRunner::FunctionExecutor::~FunctionExecutor() {}
 
 } // namespace exegesis
+} // namespace llvm
diff --git a/tools/llvm-exegesis/lib/BenchmarkRunner.h b/tools/llvm-exegesis/lib/BenchmarkRunner.h
index 517155dbdfb..4f77f492ab4 100644
--- a/tools/llvm-exegesis/lib/BenchmarkRunner.h
+++ b/tools/llvm-exegesis/lib/BenchmarkRunner.h
@@ -27,6 +27,7 @@
 #include <memory>
 #include <vector>
 
+namespace llvm {
 namespace exegesis {
 
 // A class representing failures that happened during Benchmark, they are used
@@ -90,5 +91,6 @@ private:
 };
 
 } // namespace exegesis
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_EXEGESIS_BENCHMARKRUNNER_H
diff --git a/tools/llvm-exegesis/lib/Clustering.cpp b/tools/llvm-exegesis/lib/Clustering.cpp
index b63afec945f..761629167bb 100644
--- a/tools/llvm-exegesis/lib/Clustering.cpp
+++ b/tools/llvm-exegesis/lib/Clustering.cpp
@@ -11,6 +11,7 @@
 #include <string>
 #include <unordered_set>
 
+namespace llvm {
 namespace exegesis {
 
 // The clustering problem has the following characteristics:
@@ -170,3 +171,4 @@ InstructionBenchmarkClustering::create(
 }
 
 } // namespace exegesis
+} // namespace llvm
diff --git a/tools/llvm-exegesis/lib/Clustering.h b/tools/llvm-exegesis/lib/Clustering.h
index c811020e0fe..9dc0adffb1e 100644
--- a/tools/llvm-exegesis/lib/Clustering.h
+++ b/tools/llvm-exegesis/lib/Clustering.h
@@ -19,6 +19,7 @@
 #include "llvm/Support/Error.h"
 #include <vector>
 
+namespace llvm {
 namespace exegesis {
 
 class InstructionBenchmarkClustering {
@@ -109,5 +110,6 @@ private:
 };
 
 } // namespace exegesis
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_EXEGESIS_CLUSTERING_H
diff --git a/tools/llvm-exegesis/lib/CodeTemplate.cpp b/tools/llvm-exegesis/lib/CodeTemplate.cpp
index 614d4749b1f..e159b000755 100644
--- a/tools/llvm-exegesis/lib/CodeTemplate.cpp
+++ b/tools/llvm-exegesis/lib/CodeTemplate.cpp
@@ -9,6 +9,7 @@
 
 #include "CodeTemplate.h"
 
+namespace llvm {
 namespace exegesis {
 
 CodeTemplate::CodeTemplate(CodeTemplate &&) = default;
@@ -115,3 +116,4 @@ getExecutionModeBits(ExecutionMode Execution) {
 }
 
 } // namespace exegesis
+} // namespace llvm
diff --git a/tools/llvm-exegesis/lib/CodeTemplate.h b/tools/llvm-exegesis/lib/CodeTemplate.h
index 734992f0afa..4c55487f3d1 100644
--- a/tools/llvm-exegesis/lib/CodeTemplate.h
+++ b/tools/llvm-exegesis/lib/CodeTemplate.h
@@ -19,6 +19,7 @@
 #include "MCInstrDescView.h"
 #include "llvm/ADT/BitmaskEnum.h"
 
+namespace llvm {
 namespace exegesis {
 
 // A template for an Instruction holding values for each of its Variables.
@@ -125,5 +126,6 @@ struct CodeTemplate {
 };
 
 } // namespace exegesis
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_EXEGESIS_CODETEMPLATE_H
diff --git a/tools/llvm-exegesis/lib/Latency.cpp b/tools/llvm-exegesis/lib/Latency.cpp
index 7d68d60c48b..ec92d936de3 100644
--- a/tools/llvm-exegesis/lib/Latency.cpp
+++ b/tools/llvm-exegesis/lib/Latency.cpp
@@ -17,6 +17,7 @@
 #include "llvm/MC/MCInstBuilder.h"
 #include "llvm/Support/FormatVariadic.h"
 
+namespace llvm {
 namespace exegesis {
 
 struct ExecutionClass {
@@ -200,3 +201,4 @@ LatencyBenchmarkRunner::runMeasurements(
 }
 
 } // namespace exegesis
+} // namespace llvm
diff --git a/tools/llvm-exegesis/lib/Latency.h b/tools/llvm-exegesis/lib/Latency.h
index cb55f340a5a..fef72cde5a6 100644
--- a/tools/llvm-exegesis/lib/Latency.h
+++ b/tools/llvm-exegesis/lib/Latency.h
@@ -19,6 +19,7 @@
 #include "MCInstrDescView.h"
 #include "SnippetGenerator.h"
 
+namespace llvm {
 namespace exegesis {
 
 class LatencySnippetGenerator : public SnippetGenerator {
@@ -43,5 +44,6 @@ private:
   virtual const char *getCounterName() const;
 };
 } // namespace exegesis
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_EXEGESIS_LATENCY_H
diff --git a/tools/llvm-exegesis/lib/LlvmState.cpp b/tools/llvm-exegesis/lib/LlvmState.cpp
index 279792e9031..ba786cc97ce 100644
--- a/tools/llvm-exegesis/lib/LlvmState.cpp
+++ b/tools/llvm-exegesis/lib/LlvmState.cpp
@@ -19,6 +19,7 @@
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 
+namespace llvm {
 namespace exegesis {
 
 LLVMState::LLVMState(const std::string &Triple, const std::string &CpuName) {
@@ -71,3 +72,4 @@ bool LLVMState::canAssemble(const llvm::MCInst &Inst) const {
 }
 
 } // namespace exegesis
+} // namespace llvm
diff --git a/tools/llvm-exegesis/lib/LlvmState.h b/tools/llvm-exegesis/lib/LlvmState.h
index aa7705a36a6..f8ef8665f44 100644
--- a/tools/llvm-exegesis/lib/LlvmState.h
+++ b/tools/llvm-exegesis/lib/LlvmState.h
@@ -25,6 +25,7 @@
 #include <memory>
 #include <string>
 
+namespace llvm {
 namespace exegesis {
 
 class ExegesisTarget;
@@ -64,5 +65,6 @@ private:
 };
 
 } // namespace exegesis
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_EXEGESIS_LLVMSTATE_H
diff --git a/tools/llvm-exegesis/lib/MCInstrDescView.cpp b/tools/llvm-exegesis/lib/MCInstrDescView.cpp
index 6fdb5a68419..2b4624b9b64 100644
--- a/tools/llvm-exegesis/lib/MCInstrDescView.cpp
+++ b/tools/llvm-exegesis/lib/MCInstrDescView.cpp
@@ -15,6 +15,7 @@
 
 #include "llvm/ADT/STLExtras.h"
 
+namespace llvm {
 namespace exegesis {
 
 unsigned Variable::getIndex() const {
@@ -343,3 +344,4 @@ void DumpMCInst(const llvm::MCRegisterInfo &MCRegisterInfo,
 }
 
 } // namespace exegesis
+} // namespace llvm
diff --git a/tools/llvm-exegesis/lib/MCInstrDescView.h b/tools/llvm-exegesis/lib/MCInstrDescView.h
index 17f3e2b930d..4e8278ba2b5 100644
--- a/tools/llvm-exegesis/lib/MCInstrDescView.h
+++ b/tools/llvm-exegesis/lib/MCInstrDescView.h
@@ -29,6 +29,7 @@
 #include "llvm/MC/MCInstrDesc.h"
 #include "llvm/MC/MCInstrInfo.h"
 
+namespace llvm {
 namespace exegesis {
 
 // A variable represents the value associated to an Operand or a set of Operands
@@ -191,5 +192,6 @@ void DumpMCInst(const llvm::MCRegisterInfo &MCRegisterInfo,
                 const llvm::MCInst &MCInst, llvm::raw_ostream &OS);
 
 } // namespace exegesis
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_EXEGESIS_MCINSTRDESCVIEW_H
diff --git a/tools/llvm-exegesis/lib/PerfHelper.cpp b/tools/llvm-exegesis/lib/PerfHelper.cpp
index c145ea8404b..c1c242ca88f 100644
--- a/tools/llvm-exegesis/lib/PerfHelper.cpp
+++ b/tools/llvm-exegesis/lib/PerfHelper.cpp
@@ -17,6 +17,7 @@
 #endif
 #include <cassert>
 
+namespace llvm {
 namespace exegesis {
 namespace pfm {
 
@@ -136,3 +137,4 @@ int64_t Counter::read() const { return 42; }
 
 } // namespace pfm
 } // namespace exegesis
+} // namespace llvm
diff --git a/tools/llvm-exegesis/lib/PerfHelper.h b/tools/llvm-exegesis/lib/PerfHelper.h
index 8c3f13e6c5c..2d081084660 100644
--- a/tools/llvm-exegesis/lib/PerfHelper.h
+++ b/tools/llvm-exegesis/lib/PerfHelper.h
@@ -23,6 +23,7 @@
 
 struct perf_event_attr;
 
+namespace llvm {
 namespace exegesis {
 namespace pfm {
 
@@ -102,5 +103,6 @@ void Measure(
 
 } // namespace pfm
 } // namespace exegesis
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_EXEGESIS_PERFHELPER_H
diff --git a/tools/llvm-exegesis/lib/RegisterAliasing.cpp b/tools/llvm-exegesis/lib/RegisterAliasing.cpp
index 039f78db985..54041ca30aa 100644
--- a/tools/llvm-exegesis/lib/RegisterAliasing.cpp
+++ b/tools/llvm-exegesis/lib/RegisterAliasing.cpp
@@ -9,6 +9,7 @@
 
 #include "RegisterAliasing.h"
 
+namespace llvm {
 namespace exegesis {
 
 llvm::BitVector getAliasedBits(const llvm::MCRegisterInfo &RegInfo,
@@ -81,3 +82,4 @@ RegisterAliasingTrackerCache::getRegisterClass(unsigned RegClassIndex) const {
 }
 
 } // namespace exegesis
+} // namespace llvm
diff --git a/tools/llvm-exegesis/lib/RegisterAliasing.h b/tools/llvm-exegesis/lib/RegisterAliasing.h
index 064d9333beb..94a2eb07f49 100644
--- a/tools/llvm-exegesis/lib/RegisterAliasing.h
+++ b/tools/llvm-exegesis/lib/RegisterAliasing.h
@@ -22,6 +22,7 @@
 #include "llvm/ADT/PackedVector.h"
 #include "llvm/MC/MCRegisterInfo.h"
 
+namespace llvm {
 namespace exegesis {
 
 // Returns the registers that are aliased by the ones set in SourceBits.
@@ -104,5 +105,6 @@ private:
 };
 
 } // namespace exegesis
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_EXEGESIS_ALIASINGTRACKER_H
diff --git a/tools/llvm-exegesis/lib/RegisterValue.cpp b/tools/llvm-exegesis/lib/RegisterValue.cpp
index 1982a6c53b2..2bf996cead4 100644
--- a/tools/llvm-exegesis/lib/RegisterValue.cpp
+++ b/tools/llvm-exegesis/lib/RegisterValue.cpp
@@ -10,6 +10,7 @@
 #include "RegisterValue.h"
 #include "llvm/ADT/APFloat.h"
 
+namespace llvm {
 namespace exegesis {
 
 static llvm::APFloat getFloatValue(const llvm::fltSemantics &FltSemantics,
@@ -47,3 +48,4 @@ llvm::APInt bitcastFloatValue(const llvm::fltSemantics &FltSemantics,
 }
 
 } // namespace exegesis
+} // namespace llvm
diff --git a/tools/llvm-exegesis/lib/RegisterValue.h b/tools/llvm-exegesis/lib/RegisterValue.h
index a4ef8e0ba1e..51ea30ac8eb 100644
--- a/tools/llvm-exegesis/lib/RegisterValue.h
+++ b/tools/llvm-exegesis/lib/RegisterValue.h
@@ -17,6 +17,7 @@
 #include <llvm/ADT/APFloat.h>
 #include <llvm/ADT/APInt.h>
 
+namespace llvm {
 namespace exegesis {
 
 // A simple object storing the value for a particular register.
@@ -43,3 +44,4 @@ llvm::APInt bitcastFloatValue(const llvm::fltSemantics &FltSemantics,
                               PredefinedValues Value);
 
 } // namespace exegesis
+} // namespace llvm
diff --git a/tools/llvm-exegesis/lib/SnippetGenerator.cpp b/tools/llvm-exegesis/lib/SnippetGenerator.cpp
index cdf54a32e4f..eb6a8577b57 100644
--- a/tools/llvm-exegesis/lib/SnippetGenerator.cpp
+++ b/tools/llvm-exegesis/lib/SnippetGenerator.cpp
@@ -20,6 +20,7 @@
 #include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/Program.h"
 
+namespace llvm {
 namespace exegesis {
 
 std::vector<CodeTemplate> getSingleton(CodeTemplate &&CT) {
@@ -222,3 +223,4 @@ void randomizeUnsetVariables(const llvm::BitVector &ForbiddenRegs,
 }
 
 } // namespace exegesis
+} // namespace llvm
diff --git a/tools/llvm-exegesis/lib/SnippetGenerator.h b/tools/llvm-exegesis/lib/SnippetGenerator.h
index 4b307fd75ac..967b273182b 100644
--- a/tools/llvm-exegesis/lib/SnippetGenerator.h
+++ b/tools/llvm-exegesis/lib/SnippetGenerator.h
@@ -28,6 +28,7 @@
 #include <memory>
 #include <vector>
 
+namespace llvm {
 namespace exegesis {
 
 std::vector<CodeTemplate> getSingleton(CodeTemplate &&CT);
@@ -92,5 +93,6 @@ void randomizeUnsetVariables(const llvm::BitVector &ForbiddenRegs,
                              InstructionTemplate &IT);
 
 } // namespace exegesis
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_EXEGESIS_SNIPPETGENERATOR_H
diff --git a/tools/llvm-exegesis/lib/Target.cpp b/tools/llvm-exegesis/lib/Target.cpp
index 8baa8499c92..b7828a13da0 100644
--- a/tools/llvm-exegesis/lib/Target.cpp
+++ b/tools/llvm-exegesis/lib/Target.cpp
@@ -11,6 +11,7 @@
 #include "Latency.h"
 #include "Uops.h"
 
+namespace llvm {
 namespace exegesis {
 
 ExegesisTarget::~ExegesisTarget() {} // anchor.
@@ -109,3 +110,4 @@ const ExegesisTarget &ExegesisTarget::getDefault() {
 }
 
 } // namespace exegesis
+} // namespace llvm
diff --git a/tools/llvm-exegesis/lib/Target.h b/tools/llvm-exegesis/lib/Target.h
index dd778d35b72..2e94727d78d 100644
--- a/tools/llvm-exegesis/lib/Target.h
+++ b/tools/llvm-exegesis/lib/Target.h
@@ -28,6 +28,7 @@
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCRegisterInfo.h"
 
+namespace llvm {
 namespace exegesis {
 
 class ExegesisTarget {
@@ -100,5 +101,6 @@ private:
 };
 
 } // namespace exegesis
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_EXEGESIS_TARGET_H
diff --git a/tools/llvm-exegesis/lib/Uops.cpp b/tools/llvm-exegesis/lib/Uops.cpp
index 50be707feb2..5aa726218c7 100644
--- a/tools/llvm-exegesis/lib/Uops.cpp
+++ b/tools/llvm-exegesis/lib/Uops.cpp
@@ -78,6 +78,7 @@
 // In that case we just use a greedy register assignment and hope for the
 // best.
 
+namespace llvm {
 namespace exegesis {
 
 static llvm::SmallVector<const Variable *, 8>
@@ -252,3 +253,4 @@ UopsBenchmarkRunner::runMeasurements(const FunctionExecutor &Executor) const {
 constexpr const size_t UopsSnippetGenerator::kMinNumDifferentAddresses;
 
 } // namespace exegesis
+} // namespace llvm
diff --git a/tools/llvm-exegesis/lib/Uops.h b/tools/llvm-exegesis/lib/Uops.h
index f75f2edd552..b2a5ea177f4 100644
--- a/tools/llvm-exegesis/lib/Uops.h
+++ b/tools/llvm-exegesis/lib/Uops.h
@@ -18,6 +18,7 @@
 #include "BenchmarkRunner.h"
 #include "SnippetGenerator.h"
 
+namespace llvm {
 namespace exegesis {
 
 class UopsSnippetGenerator : public SnippetGenerator {
@@ -73,5 +74,6 @@ private:
 };
 
 } // namespace exegesis
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_EXEGESIS_UOPS_H
diff --git a/tools/llvm-exegesis/lib/X86/Target.cpp b/tools/llvm-exegesis/lib/X86/Target.cpp
index db1a23b74cc..69804849e62 100644
--- a/tools/llvm-exegesis/lib/X86/Target.cpp
+++ b/tools/llvm-exegesis/lib/X86/Target.cpp
@@ -17,6 +17,7 @@
 #include "X86Subtarget.h"
 #include "llvm/MC/MCInstBuilder.h"
 
+namespace llvm {
 namespace exegesis {
 
 namespace {
@@ -423,3 +424,4 @@ void InitializeX86ExegesisTarget() {
 }
 
 } // namespace exegesis
+} // namespace llvm
diff --git a/tools/llvm-exegesis/llvm-exegesis.cpp b/tools/llvm-exegesis/llvm-exegesis.cpp
index bbc1c9ba28c..ea991420039 100644
--- a/tools/llvm-exegesis/llvm-exegesis.cpp
+++ b/tools/llvm-exegesis/llvm-exegesis.cpp
@@ -38,63 +38,63 @@
 #include <algorithm>
 #include <string>
 
-static llvm::cl::opt<int>
-    OpcodeIndex("opcode-index", llvm::cl::desc("opcode to measure, by index"),
-                llvm::cl::init(0));
-
-static llvm::cl::opt<std::string> OpcodeNames(
-    "opcode-name",
-    llvm::cl::desc("comma-separated list of opcodes to measure, by name"),
-    llvm::cl::init(""));
-
-static llvm::cl::opt<std::string>
-    SnippetsFile("snippets-file", llvm::cl::desc("code snippets to measure"),
-                 llvm::cl::init(""));
-
-static llvm::cl::opt<std::string>
-    BenchmarkFile("benchmarks-file", llvm::cl::desc(""), llvm::cl::init(""));
-
-static llvm::cl::opt<exegesis::InstructionBenchmark::ModeE> BenchmarkMode(
-    "mode", llvm::cl::desc("the mode to run"),
-    llvm::cl::values(clEnumValN(exegesis::InstructionBenchmark::Latency,
-                                "latency", "Instruction Latency"),
-                     clEnumValN(exegesis::InstructionBenchmark::Uops, "uops",
-                                "Uop Decomposition"),
-                     // When not asking for a specific benchmark mode, we'll
-                     // analyse the results.
-                     clEnumValN(exegesis::InstructionBenchmark::Unknown,
-                                "analysis", "Analysis")));
-
-static llvm::cl::opt<unsigned>
+namespace llvm {
+namespace exegesis {
+
+static cl::opt<int> OpcodeIndex("opcode-index",
+                                cl::desc("opcode to measure, by index"),
+                                cl::init(0));
+
+static cl::opt<std::string>
+    OpcodeNames("opcode-name",
+                cl::desc("comma-separated list of opcodes to measure, by name"),
+                cl::init(""));
+
+static cl::opt<std::string> SnippetsFile("snippets-file",
+                                         cl::desc("code snippets to measure"),
+                                         cl::init(""));
+
+static cl::opt<std::string> BenchmarkFile("benchmarks-file", cl::desc(""),
+                                          cl::init(""));
+
+static cl::opt<exegesis::InstructionBenchmark::ModeE>
+    BenchmarkMode("mode", cl::desc("the mode to run"),
+                  cl::values(clEnumValN(exegesis::InstructionBenchmark::Latency,
+                                        "latency", "Instruction Latency"),
+                             clEnumValN(exegesis::InstructionBenchmark::Uops,
+                                        "uops", "Uop Decomposition"),
+                             // When not asking for a specific benchmark mode,
+                             // we'll analyse the results.
+                             clEnumValN(exegesis::InstructionBenchmark::Unknown,
+                                        "analysis", "Analysis")));
+
+static cl::opt<unsigned>
     NumRepetitions("num-repetitions",
-                   llvm::cl::desc("number of time to repeat the asm snippet"),
-                   llvm::cl::init(10000));
+                   cl::desc("number of time to repeat the asm snippet"),
+                   cl::init(10000));
 
-static llvm::cl::opt<bool> IgnoreInvalidSchedClass(
+static cl::opt<bool> IgnoreInvalidSchedClass(
     "ignore-invalid-sched-class",
-    llvm::cl::desc("ignore instructions that do not define a sched class"),
-    llvm::cl::init(false));
+    cl::desc("ignore instructions that do not define a sched class"),
+    cl::init(false));
 
-static llvm::cl::opt<unsigned> AnalysisNumPoints(
+static cl::opt<unsigned> AnalysisNumPoints(
     "analysis-numpoints",
-    llvm::cl::desc("minimum number of points in an analysis cluster"),
-    llvm::cl::init(3));
+    cl::desc("minimum number of points in an analysis cluster"), cl::init(3));
 
-static llvm::cl::opt<float>
+static cl::opt<float>
     AnalysisEpsilon("analysis-epsilon",
-                    llvm::cl::desc("dbscan epsilon for analysis clustering"),
-                    llvm::cl::init(0.1));
+                    cl::desc("dbscan epsilon for analysis clustering"),
+                    cl::init(0.1));
 
-static llvm::cl::opt<std::string>
-    AnalysisClustersOutputFile("analysis-clusters-output-file",
-                               llvm::cl::desc(""), llvm::cl::init("-"));
-static llvm::cl::opt<std::string>
+static cl::opt<std::string>
+    AnalysisClustersOutputFile("analysis-clusters-output-file", cl::desc(""),
+                               cl::init("-"));
+static cl::opt<std::string>
     AnalysisInconsistenciesOutputFile("analysis-inconsistencies-output-file",
-                                      llvm::cl::desc(""), llvm::cl::init("-"));
-
-namespace exegesis {
+                                      cl::desc(""), cl::init("-"));
 
-static llvm::ExitOnError ExitOnErr;
+static ExitOnError ExitOnErr;
 
 #ifdef LLVM_EXEGESIS_INITIALIZE_NATIVE_TARGET
 void LLVM_EXEGESIS_INITIALIZE_NATIVE_TARGET();
@@ -430,9 +430,11 @@ static void analysisMain() {
 }
 
 } // namespace exegesis
+} // namespace llvm
 
 int main(int Argc, char **Argv) {
-  llvm::cl::ParseCommandLineOptions(Argc, Argv, "");
+  using namespace llvm;
+  cl::ParseCommandLineOptions(Argc, Argv, "");
 
   exegesis::ExitOnErr.setExitCodeMapper([](const llvm::Error &Err) {
     if (Err.isA<llvm::StringError>())
@@ -440,7 +442,7 @@ int main(int Argc, char **Argv) {
     return EXIT_FAILURE;
   });
 
-  if (BenchmarkMode == exegesis::InstructionBenchmark::Unknown) {
+  if (exegesis::BenchmarkMode == exegesis::InstructionBenchmark::Unknown) {
     exegesis::analysisMain();
   } else {
     exegesis::benchmarkMain();
diff --git a/unittests/tools/llvm-exegesis/AArch64/TargetTest.cpp b/unittests/tools/llvm-exegesis/AArch64/TargetTest.cpp
index 1f394ae2efd..8a519bb2e7b 100644
--- a/unittests/tools/llvm-exegesis/AArch64/TargetTest.cpp
+++ b/unittests/tools/llvm-exegesis/AArch64/TargetTest.cpp
@@ -9,6 +9,7 @@
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
+namespace llvm {
 namespace exegesis {
 
 void InitializeAArch64ExegesisTarget();
@@ -60,3 +61,4 @@ TEST_F(AArch64TargetTest, SetRegToConstant) {
 
 } // namespace
 } // namespace exegesis
+} // namespace llvm
diff --git a/unittests/tools/llvm-exegesis/ARM/AssemblerTest.cpp b/unittests/tools/llvm-exegesis/ARM/AssemblerTest.cpp
index db8b9dfc3b7..a20fa5556bb 100644
--- a/unittests/tools/llvm-exegesis/ARM/AssemblerTest.cpp
+++ b/unittests/tools/llvm-exegesis/ARM/AssemblerTest.cpp
@@ -10,6 +10,7 @@
 #include "../Common/AssemblerUtils.h"
 #include "ARMInstrInfo.h"
 
+namespace llvm {
 namespace exegesis {
 namespace {
 
@@ -47,3 +48,4 @@ TEST_F(ARMMachineFunctionGeneratorTest, DISABLED_JitFunctionADDrr) {
 
 } // namespace
 } // namespace exegesis
+} // namespace llvm
diff --git a/unittests/tools/llvm-exegesis/BenchmarkRunnerTest.cpp b/unittests/tools/llvm-exegesis/BenchmarkRunnerTest.cpp
index 05b36a31b9e..c518491063a 100644
--- a/unittests/tools/llvm-exegesis/BenchmarkRunnerTest.cpp
+++ b/unittests/tools/llvm-exegesis/BenchmarkRunnerTest.cpp
@@ -11,6 +11,7 @@
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
+namespace llvm {
 namespace exegesis {
 
 namespace {
@@ -29,3 +30,4 @@ TEST(ScratchSpaceTest, Works) {
 
 } // namespace
 } // namespace exegesis
+} // namespace llvm
diff --git a/unittests/tools/llvm-exegesis/ClusteringTest.cpp b/unittests/tools/llvm-exegesis/ClusteringTest.cpp
index e1bffd63454..8ea77dcbdde 100644
--- a/unittests/tools/llvm-exegesis/ClusteringTest.cpp
+++ b/unittests/tools/llvm-exegesis/ClusteringTest.cpp
@@ -14,6 +14,7 @@
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
+namespace llvm {
 namespace exegesis {
 
 namespace {
@@ -104,3 +105,4 @@ TEST(ClusteringTest, Ordering) {
 
 } // namespace
 } // namespace exegesis
+} // namespace llvm
diff --git a/unittests/tools/llvm-exegesis/Common/AssemblerUtils.h b/unittests/tools/llvm-exegesis/Common/AssemblerUtils.h
index cc00cee58e3..8a144e5c26f 100644
--- a/unittests/tools/llvm-exegesis/Common/AssemblerUtils.h
+++ b/unittests/tools/llvm-exegesis/Common/AssemblerUtils.h
@@ -24,6 +24,7 @@
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
+namespace llvm {
 namespace exegesis {
 
 class MachineFunctionGeneratorBaseTest : public ::testing::Test {
@@ -89,5 +90,6 @@ private:
 };
 
 } // namespace exegesis
+} // namespace llvm
 
 #endif
diff --git a/unittests/tools/llvm-exegesis/PerfHelperTest.cpp b/unittests/tools/llvm-exegesis/PerfHelperTest.cpp
index a8205f9e3eb..91ed4a60967 100644
--- a/unittests/tools/llvm-exegesis/PerfHelperTest.cpp
+++ b/unittests/tools/llvm-exegesis/PerfHelperTest.cpp
@@ -12,6 +12,7 @@
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
+namespace llvm {
 namespace exegesis {
 namespace pfm {
 namespace {
@@ -45,3 +46,4 @@ TEST(PerfHelperTest, FunctionalTest) {
 } // namespace
 } // namespace pfm
 } // namespace exegesis
+} // namespace llvm
diff --git a/unittests/tools/llvm-exegesis/RegisterValueTest.cpp b/unittests/tools/llvm-exegesis/RegisterValueTest.cpp
index 4ade990382d..8453720dc70 100644
--- a/unittests/tools/llvm-exegesis/RegisterValueTest.cpp
+++ b/unittests/tools/llvm-exegesis/RegisterValueTest.cpp
@@ -11,6 +11,7 @@
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
+namespace llvm {
 namespace exegesis {
 
 namespace {
@@ -69,3 +70,4 @@ TEST(RegisterValueTest, Double) {
 
 } // namespace
 } // namespace exegesis
+} // namespace llvm
diff --git a/unittests/tools/llvm-exegesis/X86/AnalysisTest.cpp b/unittests/tools/llvm-exegesis/X86/AnalysisTest.cpp
index d2d4c152d79..00ac6290aed 100644
--- a/unittests/tools/llvm-exegesis/X86/AnalysisTest.cpp
+++ b/unittests/tools/llvm-exegesis/X86/AnalysisTest.cpp
@@ -8,6 +8,7 @@
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
+namespace llvm {
 namespace exegesis {
 namespace {
 
@@ -100,3 +101,4 @@ TEST_F(AnalysisTest, ComputeIdealizedProcResPressure_1P1_1P05_2P0156) {
 
 } // namespace
 } // namespace exegesis
+} // namespace llvm
diff --git a/unittests/tools/llvm-exegesis/X86/AssemblerTest.cpp b/unittests/tools/llvm-exegesis/X86/AssemblerTest.cpp
index 8e81106db8d..451c3f67e75 100644
--- a/unittests/tools/llvm-exegesis/X86/AssemblerTest.cpp
+++ b/unittests/tools/llvm-exegesis/X86/AssemblerTest.cpp
@@ -10,6 +10,7 @@
 #include "../Common/AssemblerUtils.h"
 #include "X86InstrInfo.h"
 
+namespace llvm {
 namespace exegesis {
 
 void InitializeX86ExegesisTarget();
@@ -63,3 +64,4 @@ TEST_F(X86MachineFunctionGeneratorTest, DISABLED_JitFunctionMOV32ri) {
 
 } // namespace
 } // namespace exegesis
+} // namespace llvm
diff --git a/unittests/tools/llvm-exegesis/X86/BenchmarkResultTest.cpp b/unittests/tools/llvm-exegesis/X86/BenchmarkResultTest.cpp
index b17ae1caff3..f069c21b364 100644
--- a/unittests/tools/llvm-exegesis/X86/BenchmarkResultTest.cpp
+++ b/unittests/tools/llvm-exegesis/X86/BenchmarkResultTest.cpp
@@ -25,6 +25,7 @@ using ::testing::get;
 using ::testing::Pointwise;
 using ::testing::Property;
 
+namespace llvm {
 namespace exegesis {
 
 bool operator==(const BenchmarkMeasure &A, const BenchmarkMeasure &B) {
@@ -136,3 +137,4 @@ TEST(BenchmarkResultTest, PerInstructionStats) {
 }
 } // namespace
 } // namespace exegesis
+} // namespace llvm
diff --git a/unittests/tools/llvm-exegesis/X86/RegisterAliasingTest.cpp b/unittests/tools/llvm-exegesis/X86/RegisterAliasingTest.cpp
index 12f76541d4d..007b0156b1f 100644
--- a/unittests/tools/llvm-exegesis/X86/RegisterAliasingTest.cpp
+++ b/unittests/tools/llvm-exegesis/X86/RegisterAliasingTest.cpp
@@ -9,6 +9,7 @@
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
+namespace llvm {
 namespace exegesis {
 namespace {
 
@@ -89,3 +90,4 @@ TEST_F(RegisterAliasingTest, TrackRegisterClassCache) {
 
 } // namespace
 } // namespace exegesis
+} // namespace llvm
diff --git a/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp b/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp
index 4b3fa5455a3..04517359d8a 100644
--- a/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp
+++ b/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp
@@ -17,6 +17,7 @@
 
 #include <unordered_set>
 
+namespace llvm {
 namespace exegesis {
 
 void InitializeX86ExegesisTarget();
@@ -413,3 +414,4 @@ TEST_F(FakeSnippetGeneratorTest, ComputeRegisterInitialValuesAdd64rr) {
 
 } // namespace
 } // namespace exegesis
+} // namespace llvm
diff --git a/unittests/tools/llvm-exegesis/X86/TargetTest.cpp b/unittests/tools/llvm-exegesis/X86/TargetTest.cpp
index 5ada03b2e9e..2d9d7bcd559 100644
--- a/unittests/tools/llvm-exegesis/X86/TargetTest.cpp
+++ b/unittests/tools/llvm-exegesis/X86/TargetTest.cpp
@@ -35,6 +35,7 @@ bool operator==(const MCInst &a, const MCInst &b) {
 
 } // namespace llvm
 
+namespace llvm {
 namespace exegesis {
 
 void InitializeX86ExegesisTarget();
@@ -376,3 +377,4 @@ TEST_F(Core2TargetTest, SetRegToFP1_4Bits) {
 
 } // namespace
 } // namespace exegesis
+} // namespace llvm
-- 
GitLab


From a6d25f294e7a0dbf3a16297aa88a541fe01f232e Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 22 Oct 2018 17:43:33 +0000
Subject: [PATCH 0392/1116] [X86] getTargetConstantBitsFromNode - handle
 extraction from larger constant pool entries

First step towards removing X86ShuffleDecodeConstantPool usage from X86ISelLowering.cpp

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344924 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 8f4e2ad5ed6..6059a2a09e7 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -5694,11 +5694,12 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
   // Extract constant bits from constant pool vector.
   if (auto *Cst = getTargetConstantFromNode(Op)) {
     Type *CstTy = Cst->getType();
-    if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
+    unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
+    if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
       return false;
 
     unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
-    unsigned NumSrcElts = CstTy->getVectorNumElements();
+    unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
 
     APInt UndefSrcElts(NumSrcElts, 0);
     SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
-- 
GitLab


From 305c774f246ac1fc1103d63834fa23cf5fd29478 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Mon, 22 Oct 2018 17:52:31 +0000
Subject: [PATCH 0393/1116] [llvm-exegesis] Fix name lookup ambiguity in MSVC
 after 344922

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344927 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-exegesis/lib/BenchmarkRunner.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/llvm-exegesis/lib/BenchmarkRunner.cpp b/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
index 0d372363927..3f13c4638ec 100644
--- a/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
+++ b/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
@@ -75,7 +75,7 @@ private:
         llvm::CrashRecoveryContext::Enable();
         const bool Crashed = !CRC.RunSafely([this, &Counter, ScratchPtr]() {
           Counter.start();
-          Function(ScratchPtr);
+          this->Function(ScratchPtr);
           Counter.stop();
         });
         llvm::CrashRecoveryContext::Disable();
-- 
GitLab


From ded7818bb75c33b13ff7402610f95b8277f45218 Mon Sep 17 00:00:00 2001
From: Teresa Johnson <tejohnson@google.com>
Date: Mon, 22 Oct 2018 17:57:02 +0000
Subject: [PATCH 0394/1116] [hot-cold-split] Add missing FileCheck invocations

Summary:
r344558 added some CHECK statements to split-cold-2.ll, but didn't add
any invocations of FileCheck. Add those here.

Reviewers: sebpop

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D53505

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344928 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Transforms/HotColdSplit/split-cold-2.ll | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/Transforms/HotColdSplit/split-cold-2.ll b/test/Transforms/HotColdSplit/split-cold-2.ll
index 3e1a567113a..cdbb67a2d4b 100644
--- a/test/Transforms/HotColdSplit/split-cold-2.ll
+++ b/test/Transforms/HotColdSplit/split-cold-2.ll
@@ -1,5 +1,5 @@
-; RUN: opt -hotcoldsplit -S < %s
-; RUN: opt -passes=hotcoldsplit -S < %s
+; RUN: opt -hotcoldsplit -S < %s | FileCheck %s
+; RUN: opt -passes=hotcoldsplit -S < %s | FileCheck %s
 
 ; Make sure this compiles. This test used to fail with an invalid phi node: the
 ; two predecessors were outlined and the SSA representation was invalid.
-- 
GitLab


From 1aa867a43984f0234b97ceb17decb56321104322 Mon Sep 17 00:00:00 2001
From: "Joel E. Denny" <jdenny.ornl@gmail.com>
Date: Mon, 22 Oct 2018 18:00:49 +0000
Subject: [PATCH 0395/1116] [SourceMgr][FileCheck] Obey -color by extending
 WithColor

While this change specifically targets FileCheck, it affects any tool
using the same SourceMgr facilities.

Previously, -color was documented in FileCheck's -help output, but
-color had no effect.  Now, -color obeys its documentation: it forces
colors to be used in FileCheck diagnostics even when stderr is not a
terminal.

-color is especially helpful when combined with FileCheck's -v, which
can produce a long series of diagnostics that you might wish to pipe
to a pager, such as less -R.  The WithColor extensions here will also
help to clean up color usage in FileCheck's annotated dump of input,
which is proposed in D52999.

Reviewed By: JDevlieghere

Differential Revision: https://reviews.llvm.org/D53419

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344930 91177308-0d34-0410-b5e6-96231b3b80d8
---
 docs/CommandGuide/FileCheck.rst  |   4 ++
 include/llvm/Support/WithColor.h |  63 ++++++++++++++++--
 lib/Support/SourceMgr.cpp        | 108 +++++++++++++------------------
 lib/Support/WithColor.cpp        |  63 +++++++++++++-----
 test/FileCheck/opt-color.txt     |  22 +++++++
 5 files changed, 174 insertions(+), 86 deletions(-)
 create mode 100644 test/FileCheck/opt-color.txt

diff --git a/docs/CommandGuide/FileCheck.rst b/docs/CommandGuide/FileCheck.rst
index 75df8a62268..830b1e00d4e 100644
--- a/docs/CommandGuide/FileCheck.rst
+++ b/docs/CommandGuide/FileCheck.rst
@@ -116,6 +116,10 @@ OPTIONS
   as old tests are migrated to the new non-overlapping ``CHECK-DAG:``
   implementation.
 
+.. option:: --color
+
+  Use colors in output (autodetected by default).
+
 EXIT STATUS
 -----------
 
diff --git a/include/llvm/Support/WithColor.h b/include/llvm/Support/WithColor.h
index 85fc5fa0cf1..76842d1c3dc 100644
--- a/include/llvm/Support/WithColor.h
+++ b/include/llvm/Support/WithColor.h
@@ -29,23 +29,49 @@ enum class HighlightColor {
   Macro,
   Error,
   Warning,
-  Note
+  Note,
+  Remark
 };
 
 /// An RAII object that temporarily switches an output stream to a specific
 /// color.
 class WithColor {
   raw_ostream &OS;
-  /// Determine whether colors should be displayed.
-  bool colorsEnabled(raw_ostream &OS);
+  bool DisableColors;
 
 public:
   /// To be used like this: WithColor(OS, HighlightColor::String) << "text";
-  WithColor(raw_ostream &OS, HighlightColor S);
+  /// @param OS The output stream
+  /// @param S Symbolic name for syntax element to color
+  /// @param DisableColors Whether to ignore color changes regardless of -color
+  /// and support in OS
+  WithColor(raw_ostream &OS, HighlightColor S, bool DisableColors = false);
+  /// To be used like this: WithColor(OS, raw_ostream::Black) << "text";
+  /// @param OS The output stream
+  /// @param Color ANSI color to use, the special SAVEDCOLOR can be used to
+  /// change only the bold attribute, and keep colors untouched
+  /// @param Bold Bold/brighter text, default false
+  /// @param BG If true, change the background, default: change foreground
+  /// @param DisableColors Whether to ignore color changes regardless of -color
+  /// and support in OS
+  WithColor(raw_ostream &OS,
+            raw_ostream::Colors Color = raw_ostream::SAVEDCOLOR,
+            bool Bold = false, bool BG = false, bool DisableColors = false)
+      : OS(OS), DisableColors(DisableColors) {
+    changeColor(Color, Bold, BG);
+  }
   ~WithColor();
 
   raw_ostream &get() { return OS; }
   operator raw_ostream &() { return OS; }
+  template <typename T> WithColor &operator<<(T &O) {
+    OS << O;
+    return *this;
+  }
+  template <typename T> WithColor &operator<<(const T &O) {
+    OS << O;
+    return *this;
+  }
 
   /// Convenience method for printing "error: " to stderr.
   static raw_ostream &error();
@@ -53,13 +79,36 @@ public:
   static raw_ostream &warning();
   /// Convenience method for printing "note: " to stderr.
   static raw_ostream &note();
+  /// Convenience method for printing "remark: " to stderr.
+  static raw_ostream &remark();
 
   /// Convenience method for printing "error: " to the given stream.
-  static raw_ostream &error(raw_ostream &OS, StringRef Prefix = "");
+  static raw_ostream &error(raw_ostream &OS, StringRef Prefix = "",
+                            bool DisableColors = false);
   /// Convenience method for printing "warning: " to the given stream.
-  static raw_ostream &warning(raw_ostream &OS, StringRef Prefix = "");
+  static raw_ostream &warning(raw_ostream &OS, StringRef Prefix = "",
+                              bool DisableColors = false);
   /// Convenience method for printing "note: " to the given stream.
-  static raw_ostream &note(raw_ostream &OS, StringRef Prefix = "");
+  static raw_ostream &note(raw_ostream &OS, StringRef Prefix = "",
+                           bool DisableColors = false);
+  /// Convenience method for printing "remark: " to the given stream.
+  static raw_ostream &remark(raw_ostream &OS, StringRef Prefix = "",
+                             bool DisableColors = false);
+
+  /// Determine whether colors are displayed.
+  bool colorsEnabled();
+
+  /// Change the color of text that will be output from this point forward.
+  /// @param Color ANSI color to use, the special SAVEDCOLOR can be used to
+  /// change only the bold attribute, and keep colors untouched
+  /// @param Bold Bold/brighter text, default false
+  /// @param BG If true, change the background, default: change foreground
+  WithColor &changeColor(raw_ostream::Colors Color, bool Bold = false,
+                         bool BG = false);
+
+  /// Reset the colors to terminal defaults. Call this when you are done
+  /// outputting colored text, or before program exit.
+  WithColor &resetColor();
 };
 
 } // end namespace llvm
diff --git a/lib/Support/SourceMgr.cpp b/lib/Support/SourceMgr.cpp
index 582e2cf6c11..a55ad881d01 100644
--- a/lib/Support/SourceMgr.cpp
+++ b/lib/Support/SourceMgr.cpp
@@ -24,6 +24,7 @@
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/SMLoc.h"
+#include "llvm/Support/WithColor.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 #include <cassert>
@@ -370,65 +371,48 @@ static bool isNonASCII(char c) {
   return c & 0x80;
 }
 
-void SMDiagnostic::print(const char *ProgName, raw_ostream &S, bool ShowColors,
-                         bool ShowKindLabel) const {
-  // Display colors only if OS supports colors.
-  ShowColors &= S.has_colors();
+void SMDiagnostic::print(const char *ProgName, raw_ostream &OS,
+                         bool ShowColors, bool ShowKindLabel) const {
+  {
+    WithColor S(OS, raw_ostream::SAVEDCOLOR, true, false, !ShowColors);
 
-  if (ShowColors)
-    S.changeColor(raw_ostream::SAVEDCOLOR, true);
+    if (ProgName && ProgName[0])
+      S << ProgName << ": ";
 
-  if (ProgName && ProgName[0])
-    S << ProgName << ": ";
+    if (!Filename.empty()) {
+      if (Filename == "-")
+        S << "<stdin>";
+      else
+        S << Filename;
 
-  if (!Filename.empty()) {
-    if (Filename == "-")
-      S << "<stdin>";
-    else
-      S << Filename;
-
-    if (LineNo != -1) {
-      S << ':' << LineNo;
-      if (ColumnNo != -1)
-        S << ':' << (ColumnNo+1);
+      if (LineNo != -1) {
+        S << ':' << LineNo;
+        if (ColumnNo != -1)
+          S << ':' << (ColumnNo + 1);
+      }
+      S << ": ";
     }
-    S << ": ";
   }
 
   if (ShowKindLabel) {
     switch (Kind) {
     case SourceMgr::DK_Error:
-      if (ShowColors)
-        S.changeColor(raw_ostream::RED, true);
-      S << "error: ";
+      WithColor::error(OS, "", !ShowColors);
       break;
     case SourceMgr::DK_Warning:
-      if (ShowColors)
-        S.changeColor(raw_ostream::MAGENTA, true);
-      S << "warning: ";
+      WithColor::warning(OS, "", !ShowColors);
       break;
     case SourceMgr::DK_Note:
-      if (ShowColors)
-        S.changeColor(raw_ostream::BLACK, true);
-      S << "note: ";
+      WithColor::note(OS, "", !ShowColors);
       break;
     case SourceMgr::DK_Remark:
-      if (ShowColors)
-        S.changeColor(raw_ostream::BLUE, true);
-      S << "remark: ";
+      WithColor::remark(OS, "", !ShowColors);
       break;
     }
-
-    if (ShowColors) {
-      S.resetColor();
-      S.changeColor(raw_ostream::SAVEDCOLOR, true);
-    }
   }
 
-  S << Message << '\n';
-
-  if (ShowColors)
-    S.resetColor();
+  WithColor(OS, raw_ostream::SAVEDCOLOR, true, false, !ShowColors)
+      << Message << '\n';
 
   if (LineNo == -1 || ColumnNo == -1)
     return;
@@ -439,7 +423,7 @@ void SMDiagnostic::print(const char *ProgName, raw_ostream &S, bool ShowColors,
   // expanding them later, and bail out rather than show incorrect ranges and
   // misaligned fixits for any other odd characters.
   if (find_if(LineContents, isNonASCII) != LineContents.end()) {
-    printSourceLine(S, LineContents);
+    printSourceLine(OS, LineContents);
     return;
   }
   size_t NumColumns = LineContents.size();
@@ -473,29 +457,27 @@ void SMDiagnostic::print(const char *ProgName, raw_ostream &S, bool ShowColors,
   // least.
   CaretLine.erase(CaretLine.find_last_not_of(' ')+1);
 
-  printSourceLine(S, LineContents);
+  printSourceLine(OS, LineContents);
 
-  if (ShowColors)
-    S.changeColor(raw_ostream::GREEN, true);
+  {
+    WithColor S(OS, raw_ostream::GREEN, true, false, !ShowColors);
 
-  // Print out the caret line, matching tabs in the source line.
-  for (unsigned i = 0, e = CaretLine.size(), OutCol = 0; i != e; ++i) {
-    if (i >= LineContents.size() || LineContents[i] != '\t') {
-      S << CaretLine[i];
-      ++OutCol;
-      continue;
-    }
+    // Print out the caret line, matching tabs in the source line.
+    for (unsigned i = 0, e = CaretLine.size(), OutCol = 0; i != e; ++i) {
+      if (i >= LineContents.size() || LineContents[i] != '\t') {
+        S << CaretLine[i];
+        ++OutCol;
+        continue;
+      }
 
-    // Okay, we have a tab.  Insert the appropriate number of characters.
-    do {
-      S << CaretLine[i];
-      ++OutCol;
-    } while ((OutCol % TabStop) != 0);
+      // Okay, we have a tab.  Insert the appropriate number of characters.
+      do {
+        S << CaretLine[i];
+        ++OutCol;
+      } while ((OutCol % TabStop) != 0);
+    }
+    S << '\n';
   }
-  S << '\n';
-
-  if (ShowColors)
-    S.resetColor();
 
   // Print out the replacement line, matching tabs in the source line.
   if (FixItInsertionLine.empty())
@@ -503,14 +485,14 @@ void SMDiagnostic::print(const char *ProgName, raw_ostream &S, bool ShowColors,
 
   for (size_t i = 0, e = FixItInsertionLine.size(), OutCol = 0; i < e; ++i) {
     if (i >= LineContents.size() || LineContents[i] != '\t') {
-      S << FixItInsertionLine[i];
+      OS << FixItInsertionLine[i];
       ++OutCol;
       continue;
     }
 
     // Okay, we have a tab.  Insert the appropriate number of characters.
     do {
-      S << FixItInsertionLine[i];
+      OS << FixItInsertionLine[i];
       // FIXME: This is trying not to break up replacements, but then to re-sync
       // with the tabs between replacements. This will fail, though, if two
       // fix-it replacements are exactly adjacent, or if a fix-it contains a
@@ -521,5 +503,5 @@ void SMDiagnostic::print(const char *ProgName, raw_ostream &S, bool ShowColors,
       ++OutCol;
     } while (((OutCol % TabStop) != 0) && i != e);
   }
-  S << '\n';
+  OS << '\n';
 }
diff --git a/lib/Support/WithColor.cpp b/lib/Support/WithColor.cpp
index d2e13f0e86d..cf4c10956f2 100644
--- a/lib/Support/WithColor.cpp
+++ b/lib/Support/WithColor.cpp
@@ -19,15 +19,10 @@ static cl::opt<cl::boolOrDefault>
              cl::desc("Use colors in output (default=autodetect)"),
              cl::init(cl::BOU_UNSET));
 
-bool WithColor::colorsEnabled(raw_ostream &OS) {
-  if (UseColor == cl::BOU_UNSET)
-    return OS.has_colors();
-  return UseColor == cl::BOU_TRUE;
-}
-
-WithColor::WithColor(raw_ostream &OS, HighlightColor Color) : OS(OS) {
+WithColor::WithColor(raw_ostream &OS, HighlightColor Color, bool DisableColors)
+    : OS(OS), DisableColors(DisableColors) {
   // Detect color from terminal type unless the user passed the --color option.
-  if (colorsEnabled(OS)) {
+  if (colorsEnabled()) {
     switch (Color) {
     case HighlightColor::Address:
       OS.changeColor(raw_ostream::YELLOW);
@@ -56,6 +51,9 @@ WithColor::WithColor(raw_ostream &OS, HighlightColor Color) : OS(OS) {
     case HighlightColor::Note:
       OS.changeColor(raw_ostream::BLACK, true);
       break;
+    case HighlightColor::Remark:
+      OS.changeColor(raw_ostream::BLUE, true);
+      break;
     }
   }
 }
@@ -66,25 +64,58 @@ raw_ostream &WithColor::warning() { return warning(errs()); }
 
 raw_ostream &WithColor::note() { return note(errs()); }
 
-raw_ostream &WithColor::error(raw_ostream &OS, StringRef Prefix) {
+raw_ostream &WithColor::remark() { return remark(errs()); }
+
+raw_ostream &WithColor::error(raw_ostream &OS, StringRef Prefix,
+                              bool DisableColors) {
   if (!Prefix.empty())
     OS << Prefix << ": ";
-  return WithColor(OS, HighlightColor::Error).get() << "error: ";
+  return WithColor(OS, HighlightColor::Error, DisableColors).get()
+         << "error: ";
 }
 
-raw_ostream &WithColor::warning(raw_ostream &OS, StringRef Prefix) {
+raw_ostream &WithColor::warning(raw_ostream &OS, StringRef Prefix,
+                                bool DisableColors) {
   if (!Prefix.empty())
     OS << Prefix << ": ";
-  return WithColor(OS, HighlightColor::Warning).get() << "warning: ";
+  return WithColor(OS, HighlightColor::Warning, DisableColors).get()
+         << "warning: ";
 }
 
-raw_ostream &WithColor::note(raw_ostream &OS, StringRef Prefix) {
+raw_ostream &WithColor::note(raw_ostream &OS, StringRef Prefix,
+                             bool DisableColors) {
   if (!Prefix.empty())
     OS << Prefix << ": ";
-  return WithColor(OS, HighlightColor::Note).get() << "note: ";
+  return WithColor(OS, HighlightColor::Note, DisableColors).get() << "note: ";
 }
 
-WithColor::~WithColor() {
-  if (colorsEnabled(OS))
+raw_ostream &WithColor::remark(raw_ostream &OS, StringRef Prefix,
+                               bool DisableColors) {
+  if (!Prefix.empty())
+    OS << Prefix << ": ";
+  return WithColor(OS, HighlightColor::Remark, DisableColors).get()
+         << "remark: ";
+}
+
+bool WithColor::colorsEnabled() {
+  if (DisableColors)
+    return false;
+  if (UseColor == cl::BOU_UNSET)
+    return OS.has_colors();
+  return UseColor == cl::BOU_TRUE;
+}
+
+WithColor &WithColor::changeColor(raw_ostream::Colors Color, bool Bold,
+                                  bool BG) {
+  if (colorsEnabled())
+    OS.changeColor(Color, Bold, BG);
+  return *this;
+}
+
+WithColor &WithColor::resetColor() {
+  if (colorsEnabled())
     OS.resetColor();
+  return *this;
 }
+
+WithColor::~WithColor() { resetColor(); }
diff --git a/test/FileCheck/opt-color.txt b/test/FileCheck/opt-color.txt
new file mode 100644
index 00000000000..9430114bf31
--- /dev/null
+++ b/test/FileCheck/opt-color.txt
@@ -0,0 +1,22 @@
+; Create a case that produces a simple diagnostic.
+; RUN: echo foo > %t.in
+; CHECK: bar
+
+; Run without and with -color.  In the former case, FileCheck should suppress
+; color in its diagnostics because stderr is a file.
+; RUN: not FileCheck %s < %t.in 2> %t.no-color
+; RUN: not FileCheck -color %s < %t.in 2> %t.color
+
+; Check whether color was produced.
+; RUN: FileCheck -check-prefix NO-COLOR %s < %t.no-color
+; RUN: FileCheck -check-prefix COLOR %s < %t.color
+
+; Make sure our NO-COLOR and COLOR patterns are sane: they don't match the
+; opposite cases.
+; RUN: not FileCheck -check-prefix COLOR %s < %t.no-color
+; RUN: not FileCheck -check-prefix NO-COLOR %s < %t.color
+
+; I don't know of a good way to check for ANSI color codes, so just make sure
+; some new characters show up where those codes should appear.
+; NO-COLOR: : error: CHECK: expected string not found in input
+; COLOR: : {{.+}}error: {{.+}}CHECK: expected string not found in input
-- 
GitLab


From 3807aafb6e48f1e859c2617cb15a57dd01ecf84f Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 22 Oct 2018 18:09:02 +0000
Subject: [PATCH 0396/1116] [X86][SSE] getTargetShuffleMaskIndices - allow
 opt-in support for whole undef shuffle mask elements

Enable this for PSHUFB constant mask decoding and remove the ConstantPool DecodePSHUFBMask

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344931 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 6059a2a09e7..e2b2191b181 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -5839,20 +5839,23 @@ static bool isConstantSplat(SDValue Op, APInt &SplatVal) {
 
 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
                                         unsigned MaskEltSizeInBits,
-                                        SmallVectorImpl<uint64_t> &RawMask) {
+                                        SmallVectorImpl<uint64_t> &RawMask,
+                                        bool AllowWholeUndefs = false) {
   APInt UndefElts;
   SmallVector<APInt, 64> EltBits;
 
   // Extract the raw target constant bits.
-  // FIXME: We currently don't support UNDEF bits or mask entries.
   if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
-                                     EltBits, /* AllowWholeUndefs */ false,
+                                     EltBits, AllowWholeUndefs,
                                      /* AllowPartialUndefs */ false))
     return false;
 
   // Insert the extracted elements into the mask.
-  for (APInt Elt : EltBits)
-    RawMask.push_back(Elt.getZExtValue());
+  for (int i = 0, e = EltBits.size(); i != e; ++i) {
+    uint64_t M = AllowWholeUndefs && UndefElts[i] ? SM_SentinelUndef
+                                                  : EltBits[i].getZExtValue();
+    RawMask.push_back(M);
+  }
 
   return true;
 }
@@ -6057,14 +6060,10 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
     IsUnary = true;
     SDValue MaskNode = N->getOperand(1);
     SmallVector<uint64_t, 32> RawMask;
-    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
+    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, true)) {
       DecodePSHUFBMask(RawMask, Mask);
       break;
     }
-    if (auto *C = getTargetConstantFromNode(MaskNode)) {
-      DecodePSHUFBMask(C, Mask);
-      break;
-    }
     return false;
   }
   case X86ISD::VPERMI:
-- 
GitLab


From 49bd6e1c1f6cb3d1e7ee97431e19b7e090fd578c Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 22 Oct 2018 18:35:13 +0000
Subject: [PATCH 0397/1116] [X86][SSE] Tidyup DecodeVPERMILPMask shuffle mask
 decoding

Add support for UNDEF raw mask elements and remove the ConstantPool DecodeVPERMILPMask usage in X86ISelLowering.cpp

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344933 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/Utils/X86ShuffleDecode.cpp | 4 ++++
 lib/Target/X86/X86ISelLowering.cpp        | 6 +-----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
index fe567f4cece..4bd5df44abd 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -501,6 +501,10 @@ void DecodeVPERMILPMask(unsigned NumElts, unsigned ScalarBits,
 
   for (unsigned i = 0, e = RawMask.size(); i < e; ++i) {
     uint64_t M = RawMask[i];
+    if (M == (uint64_t)SM_SentinelUndef) {
+      ShuffleMask.push_back(M);
+      continue;
+    }
     M = (ScalarBits == 64 ? ((M >> 1) & 0x1) : (M & 0x3));
     unsigned LaneOffset = i & ~(NumEltsPerLane - 1);
     ShuffleMask.push_back((int)(LaneOffset + M));
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index e2b2191b181..ea95a8aeaea 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -6043,14 +6043,10 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
     IsUnary = true;
     SDValue MaskNode = N->getOperand(1);
     SmallVector<uint64_t, 32> RawMask;
-    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
+    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask, true)) {
       DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, Mask);
       break;
     }
-    if (auto *C = getTargetConstantFromNode(MaskNode)) {
-      DecodeVPERMILPMask(C, MaskEltSize, Mask);
-      break;
-    }
     return false;
   }
   case X86ISD::PSHUFB: {
-- 
GitLab


From 30353731ceb5b41106adde1bc274462d387d3e78 Mon Sep 17 00:00:00 2001
From: Aaron Ballman <aaron@aaronballman.com>
Date: Mon, 22 Oct 2018 18:51:29 +0000
Subject: [PATCH 0398/1116] Revert r344930 as it broke some of the bots on
 Windows.

http://lab.llvm.org:8011/builders/clang-x64-windows-msvc/builds/739

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344935 91177308-0d34-0410-b5e6-96231b3b80d8
---
 docs/CommandGuide/FileCheck.rst  |   4 --
 include/llvm/Support/WithColor.h |  63 ++----------------
 lib/Support/SourceMgr.cpp        | 108 ++++++++++++++++++-------------
 lib/Support/WithColor.cpp        |  63 +++++-------------
 test/FileCheck/opt-color.txt     |  22 -------
 5 files changed, 86 insertions(+), 174 deletions(-)
 delete mode 100644 test/FileCheck/opt-color.txt

diff --git a/docs/CommandGuide/FileCheck.rst b/docs/CommandGuide/FileCheck.rst
index 830b1e00d4e..75df8a62268 100644
--- a/docs/CommandGuide/FileCheck.rst
+++ b/docs/CommandGuide/FileCheck.rst
@@ -116,10 +116,6 @@ OPTIONS
   as old tests are migrated to the new non-overlapping ``CHECK-DAG:``
   implementation.
 
-.. option:: --color
-
-  Use colors in output (autodetected by default).
-
 EXIT STATUS
 -----------
 
diff --git a/include/llvm/Support/WithColor.h b/include/llvm/Support/WithColor.h
index 76842d1c3dc..85fc5fa0cf1 100644
--- a/include/llvm/Support/WithColor.h
+++ b/include/llvm/Support/WithColor.h
@@ -29,49 +29,23 @@ enum class HighlightColor {
   Macro,
   Error,
   Warning,
-  Note,
-  Remark
+  Note
 };
 
 /// An RAII object that temporarily switches an output stream to a specific
 /// color.
 class WithColor {
   raw_ostream &OS;
-  bool DisableColors;
+  /// Determine whether colors should be displayed.
+  bool colorsEnabled(raw_ostream &OS);
 
 public:
   /// To be used like this: WithColor(OS, HighlightColor::String) << "text";
-  /// @param OS The output stream
-  /// @param S Symbolic name for syntax element to color
-  /// @param DisableColors Whether to ignore color changes regardless of -color
-  /// and support in OS
-  WithColor(raw_ostream &OS, HighlightColor S, bool DisableColors = false);
-  /// To be used like this: WithColor(OS, raw_ostream::Black) << "text";
-  /// @param OS The output stream
-  /// @param Color ANSI color to use, the special SAVEDCOLOR can be used to
-  /// change only the bold attribute, and keep colors untouched
-  /// @param Bold Bold/brighter text, default false
-  /// @param BG If true, change the background, default: change foreground
-  /// @param DisableColors Whether to ignore color changes regardless of -color
-  /// and support in OS
-  WithColor(raw_ostream &OS,
-            raw_ostream::Colors Color = raw_ostream::SAVEDCOLOR,
-            bool Bold = false, bool BG = false, bool DisableColors = false)
-      : OS(OS), DisableColors(DisableColors) {
-    changeColor(Color, Bold, BG);
-  }
+  WithColor(raw_ostream &OS, HighlightColor S);
   ~WithColor();
 
   raw_ostream &get() { return OS; }
   operator raw_ostream &() { return OS; }
-  template <typename T> WithColor &operator<<(T &O) {
-    OS << O;
-    return *this;
-  }
-  template <typename T> WithColor &operator<<(const T &O) {
-    OS << O;
-    return *this;
-  }
 
   /// Convenience method for printing "error: " to stderr.
   static raw_ostream &error();
@@ -79,36 +53,13 @@ public:
   static raw_ostream &warning();
   /// Convenience method for printing "note: " to stderr.
   static raw_ostream &note();
-  /// Convenience method for printing "remark: " to stderr.
-  static raw_ostream &remark();
 
   /// Convenience method for printing "error: " to the given stream.
-  static raw_ostream &error(raw_ostream &OS, StringRef Prefix = "",
-                            bool DisableColors = false);
+  static raw_ostream &error(raw_ostream &OS, StringRef Prefix = "");
   /// Convenience method for printing "warning: " to the given stream.
-  static raw_ostream &warning(raw_ostream &OS, StringRef Prefix = "",
-                              bool DisableColors = false);
+  static raw_ostream &warning(raw_ostream &OS, StringRef Prefix = "");
   /// Convenience method for printing "note: " to the given stream.
-  static raw_ostream &note(raw_ostream &OS, StringRef Prefix = "",
-                           bool DisableColors = false);
-  /// Convenience method for printing "remark: " to the given stream.
-  static raw_ostream &remark(raw_ostream &OS, StringRef Prefix = "",
-                             bool DisableColors = false);
-
-  /// Determine whether colors are displayed.
-  bool colorsEnabled();
-
-  /// Change the color of text that will be output from this point forward.
-  /// @param Color ANSI color to use, the special SAVEDCOLOR can be used to
-  /// change only the bold attribute, and keep colors untouched
-  /// @param Bold Bold/brighter text, default false
-  /// @param BG If true, change the background, default: change foreground
-  WithColor &changeColor(raw_ostream::Colors Color, bool Bold = false,
-                         bool BG = false);
-
-  /// Reset the colors to terminal defaults. Call this when you are done
-  /// outputting colored text, or before program exit.
-  WithColor &resetColor();
+  static raw_ostream &note(raw_ostream &OS, StringRef Prefix = "");
 };
 
 } // end namespace llvm
diff --git a/lib/Support/SourceMgr.cpp b/lib/Support/SourceMgr.cpp
index a55ad881d01..582e2cf6c11 100644
--- a/lib/Support/SourceMgr.cpp
+++ b/lib/Support/SourceMgr.cpp
@@ -24,7 +24,6 @@
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/SMLoc.h"
-#include "llvm/Support/WithColor.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 #include <cassert>
@@ -371,48 +370,65 @@ static bool isNonASCII(char c) {
   return c & 0x80;
 }
 
-void SMDiagnostic::print(const char *ProgName, raw_ostream &OS,
-                         bool ShowColors, bool ShowKindLabel) const {
-  {
-    WithColor S(OS, raw_ostream::SAVEDCOLOR, true, false, !ShowColors);
+void SMDiagnostic::print(const char *ProgName, raw_ostream &S, bool ShowColors,
+                         bool ShowKindLabel) const {
+  // Display colors only if OS supports colors.
+  ShowColors &= S.has_colors();
 
-    if (ProgName && ProgName[0])
-      S << ProgName << ": ";
+  if (ShowColors)
+    S.changeColor(raw_ostream::SAVEDCOLOR, true);
 
-    if (!Filename.empty()) {
-      if (Filename == "-")
-        S << "<stdin>";
-      else
-        S << Filename;
+  if (ProgName && ProgName[0])
+    S << ProgName << ": ";
 
-      if (LineNo != -1) {
-        S << ':' << LineNo;
-        if (ColumnNo != -1)
-          S << ':' << (ColumnNo + 1);
-      }
-      S << ": ";
+  if (!Filename.empty()) {
+    if (Filename == "-")
+      S << "<stdin>";
+    else
+      S << Filename;
+
+    if (LineNo != -1) {
+      S << ':' << LineNo;
+      if (ColumnNo != -1)
+        S << ':' << (ColumnNo+1);
     }
+    S << ": ";
   }
 
   if (ShowKindLabel) {
     switch (Kind) {
     case SourceMgr::DK_Error:
-      WithColor::error(OS, "", !ShowColors);
+      if (ShowColors)
+        S.changeColor(raw_ostream::RED, true);
+      S << "error: ";
       break;
     case SourceMgr::DK_Warning:
-      WithColor::warning(OS, "", !ShowColors);
+      if (ShowColors)
+        S.changeColor(raw_ostream::MAGENTA, true);
+      S << "warning: ";
       break;
     case SourceMgr::DK_Note:
-      WithColor::note(OS, "", !ShowColors);
+      if (ShowColors)
+        S.changeColor(raw_ostream::BLACK, true);
+      S << "note: ";
       break;
     case SourceMgr::DK_Remark:
-      WithColor::remark(OS, "", !ShowColors);
+      if (ShowColors)
+        S.changeColor(raw_ostream::BLUE, true);
+      S << "remark: ";
       break;
     }
+
+    if (ShowColors) {
+      S.resetColor();
+      S.changeColor(raw_ostream::SAVEDCOLOR, true);
+    }
   }
 
-  WithColor(OS, raw_ostream::SAVEDCOLOR, true, false, !ShowColors)
-      << Message << '\n';
+  S << Message << '\n';
+
+  if (ShowColors)
+    S.resetColor();
 
   if (LineNo == -1 || ColumnNo == -1)
     return;
@@ -423,7 +439,7 @@ void SMDiagnostic::print(const char *ProgName, raw_ostream &OS,
   // expanding them later, and bail out rather than show incorrect ranges and
   // misaligned fixits for any other odd characters.
   if (find_if(LineContents, isNonASCII) != LineContents.end()) {
-    printSourceLine(OS, LineContents);
+    printSourceLine(S, LineContents);
     return;
   }
   size_t NumColumns = LineContents.size();
@@ -457,27 +473,29 @@ void SMDiagnostic::print(const char *ProgName, raw_ostream &OS,
   // least.
   CaretLine.erase(CaretLine.find_last_not_of(' ')+1);
 
-  printSourceLine(OS, LineContents);
+  printSourceLine(S, LineContents);
 
-  {
-    WithColor S(OS, raw_ostream::GREEN, true, false, !ShowColors);
+  if (ShowColors)
+    S.changeColor(raw_ostream::GREEN, true);
 
-    // Print out the caret line, matching tabs in the source line.
-    for (unsigned i = 0, e = CaretLine.size(), OutCol = 0; i != e; ++i) {
-      if (i >= LineContents.size() || LineContents[i] != '\t') {
-        S << CaretLine[i];
-        ++OutCol;
-        continue;
-      }
-
-      // Okay, we have a tab.  Insert the appropriate number of characters.
-      do {
-        S << CaretLine[i];
-        ++OutCol;
-      } while ((OutCol % TabStop) != 0);
+  // Print out the caret line, matching tabs in the source line.
+  for (unsigned i = 0, e = CaretLine.size(), OutCol = 0; i != e; ++i) {
+    if (i >= LineContents.size() || LineContents[i] != '\t') {
+      S << CaretLine[i];
+      ++OutCol;
+      continue;
     }
-    S << '\n';
+
+    // Okay, we have a tab.  Insert the appropriate number of characters.
+    do {
+      S << CaretLine[i];
+      ++OutCol;
+    } while ((OutCol % TabStop) != 0);
   }
+  S << '\n';
+
+  if (ShowColors)
+    S.resetColor();
 
   // Print out the replacement line, matching tabs in the source line.
   if (FixItInsertionLine.empty())
@@ -485,14 +503,14 @@ void SMDiagnostic::print(const char *ProgName, raw_ostream &OS,
 
   for (size_t i = 0, e = FixItInsertionLine.size(), OutCol = 0; i < e; ++i) {
     if (i >= LineContents.size() || LineContents[i] != '\t') {
-      OS << FixItInsertionLine[i];
+      S << FixItInsertionLine[i];
       ++OutCol;
       continue;
     }
 
     // Okay, we have a tab.  Insert the appropriate number of characters.
     do {
-      OS << FixItInsertionLine[i];
+      S << FixItInsertionLine[i];
       // FIXME: This is trying not to break up replacements, but then to re-sync
       // with the tabs between replacements. This will fail, though, if two
       // fix-it replacements are exactly adjacent, or if a fix-it contains a
@@ -503,5 +521,5 @@ void SMDiagnostic::print(const char *ProgName, raw_ostream &OS,
       ++OutCol;
     } while (((OutCol % TabStop) != 0) && i != e);
   }
-  OS << '\n';
+  S << '\n';
 }
diff --git a/lib/Support/WithColor.cpp b/lib/Support/WithColor.cpp
index cf4c10956f2..d2e13f0e86d 100644
--- a/lib/Support/WithColor.cpp
+++ b/lib/Support/WithColor.cpp
@@ -19,10 +19,15 @@ static cl::opt<cl::boolOrDefault>
              cl::desc("Use colors in output (default=autodetect)"),
              cl::init(cl::BOU_UNSET));
 
-WithColor::WithColor(raw_ostream &OS, HighlightColor Color, bool DisableColors)
-    : OS(OS), DisableColors(DisableColors) {
+bool WithColor::colorsEnabled(raw_ostream &OS) {
+  if (UseColor == cl::BOU_UNSET)
+    return OS.has_colors();
+  return UseColor == cl::BOU_TRUE;
+}
+
+WithColor::WithColor(raw_ostream &OS, HighlightColor Color) : OS(OS) {
   // Detect color from terminal type unless the user passed the --color option.
-  if (colorsEnabled()) {
+  if (colorsEnabled(OS)) {
     switch (Color) {
     case HighlightColor::Address:
       OS.changeColor(raw_ostream::YELLOW);
@@ -51,9 +56,6 @@ WithColor::WithColor(raw_ostream &OS, HighlightColor Color, bool DisableColors)
     case HighlightColor::Note:
       OS.changeColor(raw_ostream::BLACK, true);
       break;
-    case HighlightColor::Remark:
-      OS.changeColor(raw_ostream::BLUE, true);
-      break;
     }
   }
 }
@@ -64,58 +66,25 @@ raw_ostream &WithColor::warning() { return warning(errs()); }
 
 raw_ostream &WithColor::note() { return note(errs()); }
 
-raw_ostream &WithColor::remark() { return remark(errs()); }
-
-raw_ostream &WithColor::error(raw_ostream &OS, StringRef Prefix,
-                              bool DisableColors) {
+raw_ostream &WithColor::error(raw_ostream &OS, StringRef Prefix) {
   if (!Prefix.empty())
     OS << Prefix << ": ";
-  return WithColor(OS, HighlightColor::Error, DisableColors).get()
-         << "error: ";
+  return WithColor(OS, HighlightColor::Error).get() << "error: ";
 }
 
-raw_ostream &WithColor::warning(raw_ostream &OS, StringRef Prefix,
-                                bool DisableColors) {
+raw_ostream &WithColor::warning(raw_ostream &OS, StringRef Prefix) {
   if (!Prefix.empty())
     OS << Prefix << ": ";
-  return WithColor(OS, HighlightColor::Warning, DisableColors).get()
-         << "warning: ";
+  return WithColor(OS, HighlightColor::Warning).get() << "warning: ";
 }
 
-raw_ostream &WithColor::note(raw_ostream &OS, StringRef Prefix,
-                             bool DisableColors) {
+raw_ostream &WithColor::note(raw_ostream &OS, StringRef Prefix) {
   if (!Prefix.empty())
     OS << Prefix << ": ";
-  return WithColor(OS, HighlightColor::Note, DisableColors).get() << "note: ";
+  return WithColor(OS, HighlightColor::Note).get() << "note: ";
 }
 
-raw_ostream &WithColor::remark(raw_ostream &OS, StringRef Prefix,
-                               bool DisableColors) {
-  if (!Prefix.empty())
-    OS << Prefix << ": ";
-  return WithColor(OS, HighlightColor::Remark, DisableColors).get()
-         << "remark: ";
-}
-
-bool WithColor::colorsEnabled() {
-  if (DisableColors)
-    return false;
-  if (UseColor == cl::BOU_UNSET)
-    return OS.has_colors();
-  return UseColor == cl::BOU_TRUE;
-}
-
-WithColor &WithColor::changeColor(raw_ostream::Colors Color, bool Bold,
-                                  bool BG) {
-  if (colorsEnabled())
-    OS.changeColor(Color, Bold, BG);
-  return *this;
-}
-
-WithColor &WithColor::resetColor() {
-  if (colorsEnabled())
+WithColor::~WithColor() {
+  if (colorsEnabled(OS))
     OS.resetColor();
-  return *this;
 }
-
-WithColor::~WithColor() { resetColor(); }
diff --git a/test/FileCheck/opt-color.txt b/test/FileCheck/opt-color.txt
deleted file mode 100644
index 9430114bf31..00000000000
--- a/test/FileCheck/opt-color.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-; Create a case that produces a simple diagnostic.
-; RUN: echo foo > %t.in
-; CHECK: bar
-
-; Run without and with -color.  In the former case, FileCheck should suppress
-; color in its diagnostics because stderr is a file.
-; RUN: not FileCheck %s < %t.in 2> %t.no-color
-; RUN: not FileCheck -color %s < %t.in 2> %t.color
-
-; Check whether color was produced.
-; RUN: FileCheck -check-prefix NO-COLOR %s < %t.no-color
-; RUN: FileCheck -check-prefix COLOR %s < %t.color
-
-; Make sure our NO-COLOR and COLOR patterns are sane: they don't match the
-; opposite cases.
-; RUN: not FileCheck -check-prefix COLOR %s < %t.no-color
-; RUN: not FileCheck -check-prefix NO-COLOR %s < %t.color
-
-; I don't know of a good way to check for ANSI color codes, so just make sure
-; some new characters show up where those codes should appear.
-; NO-COLOR: : error: CHECK: expected string not found in input
-; COLOR: : {{.+}}error: {{.+}}CHECK: expected string not found in input
-- 
GitLab


From 6a96dba94bab6e12d551d770c85a9175ee00968b Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 22 Oct 2018 18:58:32 +0000
Subject: [PATCH 0399/1116] Revert rL344933 from llvm/trunk: [X86][SSE] Tidyup
 DecodeVPERMILPMask shuffle mask decoding We can't safely assume that certain
 RawMask entries are UNDEF as most variable shuffles ignore non-index bits.
 ........ Add support for UNDEF raw mask elements and remove the ConstantPool
 DecodeVPERMILPMask usage in X86ISelLowering.cpp

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344936 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/Utils/X86ShuffleDecode.cpp | 4 ----
 lib/Target/X86/X86ISelLowering.cpp        | 6 +++++-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
index 4bd5df44abd..fe567f4cece 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -501,10 +501,6 @@ void DecodeVPERMILPMask(unsigned NumElts, unsigned ScalarBits,
 
   for (unsigned i = 0, e = RawMask.size(); i < e; ++i) {
     uint64_t M = RawMask[i];
-    if (M == (uint64_t)SM_SentinelUndef) {
-      ShuffleMask.push_back(M);
-      continue;
-    }
     M = (ScalarBits == 64 ? ((M >> 1) & 0x1) : (M & 0x3));
     unsigned LaneOffset = i & ~(NumEltsPerLane - 1);
     ShuffleMask.push_back((int)(LaneOffset + M));
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index ea95a8aeaea..e2b2191b181 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -6043,10 +6043,14 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
     IsUnary = true;
     SDValue MaskNode = N->getOperand(1);
     SmallVector<uint64_t, 32> RawMask;
-    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask, true)) {
+    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
       DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, Mask);
       break;
     }
+    if (auto *C = getTargetConstantFromNode(MaskNode)) {
+      DecodeVPERMILPMask(C, MaskEltSize, Mask);
+      break;
+    }
     return false;
   }
   case X86ISD::PSHUFB: {
-- 
GitLab


From 4762f7cfd743c187e38da7f2431b9f38bd2c09bd Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 22 Oct 2018 19:01:25 +0000
Subject: [PATCH 0400/1116] Revert rL344931 from llvm/trunk: [X86][SSE]
 getTargetShuffleMaskIndices - allow opt-in support for whole undef shuffle
 mask elements We can't safely assume that certain RawMask entries are UNDEF
 as most variable shuffles ignore non-index bits - PSHUFB only works on i8
 elts so it'd be safe to use but I'm intending to come up with an alternative
 approach that works for all. ........ Enable this for PSHUFB constant mask
 decoding and remove the ConstantPool DecodePSHUFBMask

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344937 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index e2b2191b181..6059a2a09e7 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -5839,23 +5839,20 @@ static bool isConstantSplat(SDValue Op, APInt &SplatVal) {
 
 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
                                         unsigned MaskEltSizeInBits,
-                                        SmallVectorImpl<uint64_t> &RawMask,
-                                        bool AllowWholeUndefs = false) {
+                                        SmallVectorImpl<uint64_t> &RawMask) {
   APInt UndefElts;
   SmallVector<APInt, 64> EltBits;
 
   // Extract the raw target constant bits.
+  // FIXME: We currently don't support UNDEF bits or mask entries.
   if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
-                                     EltBits, AllowWholeUndefs,
+                                     EltBits, /* AllowWholeUndefs */ false,
                                      /* AllowPartialUndefs */ false))
     return false;
 
   // Insert the extracted elements into the mask.
-  for (int i = 0, e = EltBits.size(); i != e; ++i) {
-    uint64_t M = AllowWholeUndefs && UndefElts[i] ? SM_SentinelUndef
-                                                  : EltBits[i].getZExtValue();
-    RawMask.push_back(M);
-  }
+  for (APInt Elt : EltBits)
+    RawMask.push_back(Elt.getZExtValue());
 
   return true;
 }
@@ -6060,10 +6057,14 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
     IsUnary = true;
     SDValue MaskNode = N->getOperand(1);
     SmallVector<uint64_t, 32> RawMask;
-    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, true)) {
+    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
       DecodePSHUFBMask(RawMask, Mask);
       break;
     }
+    if (auto *C = getTargetConstantFromNode(MaskNode)) {
+      DecodePSHUFBMask(C, Mask);
+      break;
+    }
     return false;
   }
   case X86ISD::VPERMI:
-- 
GitLab


From 7e3227d6f108792b73234bb313f27c9cd88c40f7 Mon Sep 17 00:00:00 2001
From: Teresa Johnson <tejohnson@google.com>
Date: Mon, 22 Oct 2018 19:06:42 +0000
Subject: [PATCH 0401/1116] [hot-cold-split] Add opt remark on success

Summary: Emit optimization remark on successful hot cold split.

Reviewers: sebpop, hiraditya

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D53512

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344938 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/IPO/HotColdSplitting.cpp      | 8 ++++++++
 test/Transforms/HotColdSplit/split-cold-2.ll | 5 +++--
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/lib/Transforms/IPO/HotColdSplitting.cpp b/lib/Transforms/IPO/HotColdSplitting.cpp
index be4da249955..1d804ccb767 100644
--- a/lib/Transforms/IPO/HotColdSplitting.cpp
+++ b/lib/Transforms/IPO/HotColdSplitting.cpp
@@ -335,6 +335,7 @@ Function *
 HotColdSplitting::extractColdRegion(const SmallVectorImpl<BasicBlock *> &Region,
                                     DominatorTree *DT, BlockFrequencyInfo *BFI,
                                     OptimizationRemarkEmitter &ORE) {
+  assert(!Region.empty());
   LLVM_DEBUG(for (auto *BB : Region)
           llvm::dbgs() << "\nExtracting: " << *BB;);
 
@@ -348,6 +349,7 @@ HotColdSplitting::extractColdRegion(const SmallVectorImpl<BasicBlock *> &Region,
   if (Outputs.size() > 0)
     return nullptr;
 
+  Function *OrigF = Region[0]->getParent();
   if (Function *OutF = CE.extractCodeRegion()) {
     User *U = *OutF->user_begin();
     CallInst *CI = cast<CallInst>(U);
@@ -359,6 +361,12 @@ HotColdSplitting::extractColdRegion(const SmallVectorImpl<BasicBlock *> &Region,
     }
     CI->setIsNoInline();
     LLVM_DEBUG(llvm::dbgs() << "Outlined Region: " << *OutF);
+    ORE.emit([&]() {
+      return OptimizationRemark(DEBUG_TYPE, "HotColdSplit",
+                                &*Region[0]->begin())
+             << ore::NV("Original", OrigF) << " split cold code into "
+             << ore::NV("Split", OutF);
+    });
     return OutF;
   }
 
diff --git a/test/Transforms/HotColdSplit/split-cold-2.ll b/test/Transforms/HotColdSplit/split-cold-2.ll
index cdbb67a2d4b..de0c7655a6e 100644
--- a/test/Transforms/HotColdSplit/split-cold-2.ll
+++ b/test/Transforms/HotColdSplit/split-cold-2.ll
@@ -1,9 +1,10 @@
-; RUN: opt -hotcoldsplit -S < %s | FileCheck %s
-; RUN: opt -passes=hotcoldsplit -S < %s | FileCheck %s
+; RUN: opt -hotcoldsplit -pass-remarks=hotcoldsplit -S < %s 2>&1 | FileCheck %s
+; RUN: opt -passes=hotcoldsplit -pass-remarks=hotcoldsplit -S < %s 2>&1 | FileCheck %s
 
 ; Make sure this compiles. This test used to fail with an invalid phi node: the
 ; two predecessors were outlined and the SSA representation was invalid.
 
+; CHECK: remark: <unknown>:0:0: fun split cold code into fun_if.else
 ; CHECK-LABEL: @fun
 ; CHECK: codeRepl:
 ; CHECK-NEXT: call void @fun_if.else
-- 
GitLab


From db46784d64525059fdaf2dc0111784986f1cd961 Mon Sep 17 00:00:00 2001
From: Justin Bogner <mail@justinbogner.com>
Date: Mon, 22 Oct 2018 19:51:31 +0000
Subject: [PATCH 0402/1116] Reapply "[MachineCopyPropagation] Reimplement
 CopyTracker in terms of register units"

Recommits r342942, which was reverted in r343189, with a fix for an
issue where we would propagate unsafely if we defined only the upper
part of a register.

Original message:

  Change the copy tracker to keep a single map of register units
  instead of 3 maps of registers. This gives a very significant
  compile time performance improvement to the pass. I measured a
  30-40% decrease in time spent in MCP on x86 and AArch64 and much
  more significant improvements on out of tree targets with more
  registers.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344942 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/MachineCopyPropagation.cpp       | 127 ++++++++++---------
 test/CodeGen/AArch64/machine-cp-clobbers.mir |  51 ++++++++
 test/CodeGen/Hexagon/machine-cp-clobbers.mir |  51 ++++++++
 3 files changed, 171 insertions(+), 58 deletions(-)
 create mode 100644 test/CodeGen/AArch64/machine-cp-clobbers.mir
 create mode 100644 test/CodeGen/Hexagon/machine-cp-clobbers.mir

diff --git a/lib/CodeGen/MachineCopyPropagation.cpp b/lib/CodeGen/MachineCopyPropagation.cpp
index dfaa946c913..19879fe8900 100644
--- a/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/lib/CodeGen/MachineCopyPropagation.cpp
@@ -75,98 +75,109 @@ DEBUG_COUNTER(FwdCounter, "machine-cp-fwd",
 namespace {
 
 class CopyTracker {
-  using RegList = SmallVector<unsigned, 4>;
-  using SourceMap = DenseMap<unsigned, RegList>;
-  using Reg2MIMap = DenseMap<unsigned, MachineInstr *>;
+  struct CopyInfo {
+    MachineInstr *MI;
+    SmallVector<unsigned, 4> DefRegs;
+    bool Avail;
+  };
 
-  /// Def -> available copies map.
-  Reg2MIMap AvailCopyMap;
-
-  /// Def -> copies map.
-  Reg2MIMap CopyMap;
-
-  /// Src -> Def map
-  SourceMap SrcMap;
+  DenseMap<unsigned, CopyInfo> Copies;
 
 public:
   /// Mark all of the given registers and their subregisters as unavailable for
   /// copying.
-  void markRegsUnavailable(const RegList &Regs, const TargetRegisterInfo &TRI) {
+  void markRegsUnavailable(ArrayRef<unsigned> Regs,
+                           const TargetRegisterInfo &TRI) {
     for (unsigned Reg : Regs) {
       // Source of copy is no longer available for propagation.
-      for (MCSubRegIterator SR(Reg, &TRI, true); SR.isValid(); ++SR)
-        AvailCopyMap.erase(*SR);
+      for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI) {
+        auto CI = Copies.find(*RUI);
+        if (CI != Copies.end())
+          CI->second.Avail = false;
+      }
     }
   }
 
   /// Clobber a single register, removing it from the tracker's copy maps.
   void clobberRegister(unsigned Reg, const TargetRegisterInfo &TRI) {
-    for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI) {
-      CopyMap.erase(*AI);
-      AvailCopyMap.erase(*AI);
-
-      SourceMap::iterator SI = SrcMap.find(*AI);
-      if (SI != SrcMap.end()) {
-        markRegsUnavailable(SI->second, TRI);
-        SrcMap.erase(SI);
+    for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI) {
+      auto I = Copies.find(*RUI);
+      if (I != Copies.end()) {
+        // When we clobber the source of a copy, we need to clobber everything
+        // it defined.
+        markRegsUnavailable(I->second.DefRegs, TRI);
+        // When we clobber the destination of a copy, we need to clobber the
+        // whole register it defined.
+        if (MachineInstr *MI = I->second.MI)
+          markRegsUnavailable({MI->getOperand(0).getReg()}, TRI);
+        // Now we can erase the copy.
+        Copies.erase(I);
       }
     }
   }
 
   /// Add this copy's registers into the tracker's copy maps.
-  void trackCopy(MachineInstr *Copy, const TargetRegisterInfo &TRI) {
-    assert(Copy->isCopy() && "Tracking non-copy?");
+  void trackCopy(MachineInstr *MI, const TargetRegisterInfo &TRI) {
+    assert(MI->isCopy() && "Tracking non-copy?");
 
-    unsigned Def = Copy->getOperand(0).getReg();
-    unsigned Src = Copy->getOperand(1).getReg();
+    unsigned Def = MI->getOperand(0).getReg();
+    unsigned Src = MI->getOperand(1).getReg();
 
     // Remember Def is defined by the copy.
-    for (MCSubRegIterator SR(Def, &TRI, /*IncludeSelf=*/true); SR.isValid();
-         ++SR) {
-      CopyMap[*SR] = Copy;
-      AvailCopyMap[*SR] = Copy;
-    }
+    for (MCRegUnitIterator RUI(Def, &TRI); RUI.isValid(); ++RUI)
+      Copies[*RUI] = {MI, {}, true};
 
     // Remember source that's copied to Def. Once it's clobbered, then
     // it's no longer available for copy propagation.
-    RegList &DestList = SrcMap[Src];
-    if (!is_contained(DestList, Def))
-      DestList.push_back(Def);
+    for (MCRegUnitIterator RUI(Src, &TRI); RUI.isValid(); ++RUI) {
+      auto I = Copies.insert({*RUI, {nullptr, {}, false}});
+      auto &Copy = I.first->second;
+      if (!is_contained(Copy.DefRegs, Def))
+        Copy.DefRegs.push_back(Def);
+    }
+  }
+
+  bool hasAnyCopies() {
+    return !Copies.empty();
   }
 
-  bool hasAvailableCopies() { return !AvailCopyMap.empty(); }
+  MachineInstr *findCopyForUnit(unsigned RegUnit, const TargetRegisterInfo &TRI,
+                         bool MustBeAvailable = false) {
+    auto CI = Copies.find(RegUnit);
+    if (CI == Copies.end())
+      return nullptr;
+    if (MustBeAvailable && !CI->second.Avail)
+      return nullptr;
+    return CI->second.MI;
+  }
 
-  MachineInstr *findAvailCopy(MachineInstr &DestCopy, unsigned Reg) {
-    auto CI = AvailCopyMap.find(Reg);
-    if (CI == AvailCopyMap.end())
+  MachineInstr *findAvailCopy(MachineInstr &DestCopy, unsigned Reg,
+                              const TargetRegisterInfo &TRI) {
+    // We check the first RegUnit here, since we'll only be interested in the
+    // copy if it copies the entire register anyway.
+    MCRegUnitIterator RUI(Reg, &TRI);
+    MachineInstr *AvailCopy =
+        findCopyForUnit(*RUI, TRI, /*MustBeAvailable=*/true);
+    if (!AvailCopy ||
+        !TRI.isSubRegisterEq(AvailCopy->getOperand(0).getReg(), Reg))
       return nullptr;
-    MachineInstr &AvailCopy = *CI->second;
 
     // Check that the available copy isn't clobbered by any regmasks between
     // itself and the destination.
-    unsigned AvailSrc = AvailCopy.getOperand(1).getReg();
-    unsigned AvailDef = AvailCopy.getOperand(0).getReg();
+    unsigned AvailSrc = AvailCopy->getOperand(1).getReg();
+    unsigned AvailDef = AvailCopy->getOperand(0).getReg();
     for (const MachineInstr &MI :
-         make_range(AvailCopy.getIterator(), DestCopy.getIterator()))
+         make_range(AvailCopy->getIterator(), DestCopy.getIterator()))
       for (const MachineOperand &MO : MI.operands())
         if (MO.isRegMask())
           if (MO.clobbersPhysReg(AvailSrc) || MO.clobbersPhysReg(AvailDef))
             return nullptr;
 
-    return &AvailCopy;
-  }
-
-  MachineInstr *findCopy(unsigned Reg) {
-    auto CI = CopyMap.find(Reg);
-    if (CI != CopyMap.end())
-      return CI->second;
-    return nullptr;
+    return AvailCopy;
   }
 
   void clear() {
-    AvailCopyMap.clear();
-    CopyMap.clear();
-    SrcMap.clear();
+    Copies.clear();
   }
 };
 
@@ -224,8 +235,8 @@ INITIALIZE_PASS(MachineCopyPropagation, DEBUG_TYPE,
 void MachineCopyPropagation::ReadRegister(unsigned Reg) {
   // If 'Reg' is defined by a copy, the copy is no longer a candidate
   // for elimination.
-  for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
-    if (MachineInstr *Copy = Tracker.findCopy(*AI)) {
+  for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) {
+    if (MachineInstr *Copy = Tracker.findCopyForUnit(*RUI, *TRI)) {
       LLVM_DEBUG(dbgs() << "MCP: Copy is used - not dead: "; Copy->dump());
       MaybeDeadCopies.remove(Copy);
     }
@@ -263,7 +274,7 @@ bool MachineCopyPropagation::eraseIfRedundant(MachineInstr &Copy, unsigned Src,
     return false;
 
   // Search for an existing copy.
-  MachineInstr *PrevCopy = Tracker.findAvailCopy(Copy, Def);
+  MachineInstr *PrevCopy = Tracker.findAvailCopy(Copy, Def, *TRI);
   if (!PrevCopy)
     return false;
 
@@ -357,7 +368,7 @@ bool MachineCopyPropagation::hasImplicitOverlap(const MachineInstr &MI,
 /// Look for available copies whose destination register is used by \p MI and
 /// replace the use in \p MI with the copy's source register.
 void MachineCopyPropagation::forwardUses(MachineInstr &MI) {
-  if (!Tracker.hasAvailableCopies())
+  if (!Tracker.hasAnyCopies())
     return;
 
   // Look for non-tied explicit vreg uses that have an active COPY
@@ -384,7 +395,7 @@ void MachineCopyPropagation::forwardUses(MachineInstr &MI) {
     if (!MOUse.isRenamable())
       continue;
 
-    MachineInstr *Copy = Tracker.findAvailCopy(MI, MOUse.getReg());
+    MachineInstr *Copy = Tracker.findAvailCopy(MI, MOUse.getReg(), *TRI);
     if (!Copy)
       continue;
 
diff --git a/test/CodeGen/AArch64/machine-cp-clobbers.mir b/test/CodeGen/AArch64/machine-cp-clobbers.mir
new file mode 100644
index 00000000000..b5c0331d2ef
--- /dev/null
+++ b/test/CodeGen/AArch64/machine-cp-clobbers.mir
@@ -0,0 +1,51 @@
+# RUN: llc -march=aarch64 -o - %s -run-pass=machine-cp | FileCheck %s
+
+---
+name: dont_propagate_past_lower_subreg_kill
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: dont_propagate_past_lower_subreg_kill
+    ; CHECK: HINT 0, implicit-def $q0
+    ; CHECK: HINT 0, implicit-def $d1
+    ; CHECK: HINT 0, implicit killed $d1
+    ; CHECK: $q1 = COPY killed $q0
+    ; CHECK: $q2 = COPY $q1
+    ; CHECK: HINT 0, implicit $q2
+    HINT 0, implicit-def $q0
+    $q1 = COPY killed $q0
+    $q0 = COPY killed $q1
+
+    HINT 0, implicit-def $d1
+    HINT 0, implicit killed $d1
+
+    $q1 = COPY killed $q0
+    $q2 = COPY $q1
+    HINT 0, implicit $q2
+
+...
+
+---
+name: dont_propagate_past_upper_subreg_kill
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: dont_propagate_past_upper_subreg_kill
+    ; CHECK: HINT 0, implicit-def $z0
+    ; CHECK: HINT 0, implicit-def $z1_hi
+    ; CHECK: HINT 0, implicit killed $z1_hi
+    ; CHECK: $z1 = COPY killed $z0
+    ; CHECK: $z2 = COPY $z1
+    ; CHECK: HINT 0, implicit $z2
+    HINT 0, implicit-def $z0
+    $z1 = COPY killed $z0
+    $z0 = COPY killed $z1
+
+    HINT 0, implicit-def $z1_hi
+    HINT 0, implicit killed $z1_hi
+
+    $z1 = COPY killed $z0
+    $z2 = COPY $z1
+    HINT 0, implicit $z2
+
+...
diff --git a/test/CodeGen/Hexagon/machine-cp-clobbers.mir b/test/CodeGen/Hexagon/machine-cp-clobbers.mir
new file mode 100644
index 00000000000..736eccc217e
--- /dev/null
+++ b/test/CodeGen/Hexagon/machine-cp-clobbers.mir
@@ -0,0 +1,51 @@
+# RUN: llc -march=hexagon -o - %s -run-pass=machine-cp | FileCheck %s
+
+---
+name: dont_propagate_past_lower_subreg_kill
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: dont_propagate_past_lower_subreg_kill
+    ; CHECK: A2_nop implicit-def $d0
+    ; CHECK: A2_nop implicit-def $r2
+    ; CHECK: A2_nop implicit killed $r2
+    ; CHECK: $d1 = COPY killed $d0
+    ; CHECK: $d2 = COPY $d1
+    ; CHECK: A2_nop implicit $d2
+    A2_nop implicit-def $d0
+    $d1 = COPY killed $d0
+    $d0 = COPY killed $d1
+
+    A2_nop implicit-def $r2
+    A2_nop implicit killed $r2
+
+    $d1 = COPY killed $d0
+    $d2 = COPY $d1
+    A2_nop implicit $d2
+
+...
+
+---
+name: dont_propagate_past_upper_subreg_kill
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: dont_propagate_past_upper_subreg_kill
+    ; CHECK: A2_nop implicit-def $d0
+    ; CHECK: A2_nop implicit-def $r3
+    ; CHECK: A2_nop implicit killed $r3
+    ; CHECK: $d1 = COPY killed $d0
+    ; CHECK: $d2 = COPY $d1
+    ; CHECK: A2_nop implicit $d2
+    A2_nop implicit-def $d0
+    $d1 = COPY killed $d0
+    $d0 = COPY killed $d1
+
+    A2_nop implicit-def $r3
+    A2_nop implicit killed $r3
+
+    $d1 = COPY killed $d0
+    $d2 = COPY $d1
+    A2_nop implicit $d2
+
+...
-- 
GitLab


From 101d24deace1ab8832afcd79a4e7152d4a889187 Mon Sep 17 00:00:00 2001
From: Tim Northover <tnorthover@apple.com>
Date: Mon, 22 Oct 2018 20:38:13 +0000
Subject: [PATCH 0403/1116] X86: add alias for pushfw/popfw in Intel mode

A while ago we changed pushf and popf in Intel mode to generate pushfq
and popfq. Unfortunately that left us with no way to get the 16-bit
encoding in Intel mode so this patch adds pushfw and popfw as aliases
there.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344949 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86InstrInfo.td      | 4 ++++
 test/MC/X86/intel-syntax-encoding.s | 5 +++++
 2 files changed, 9 insertions(+)

diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 39c3bbfd90e..a12c9e81b05 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -2966,6 +2966,8 @@ def : MnemonicAlias<"popf",  "popfl", "att">, Requires<[In32BitMode]>;
 def : MnemonicAlias<"popf",  "popfq", "att">, Requires<[In64BitMode]>;
 def : MnemonicAlias<"popf",  "popfq", "intel">, Requires<[In64BitMode]>;
 def : MnemonicAlias<"popfd", "popfl", "att">;
+def : MnemonicAlias<"popfw", "popf",  "intel">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"popfw", "popf",  "intel">, Requires<[In64BitMode]>;
 
 // FIXME: This is wrong for "push reg".  "push %bx" should turn into pushw in
 // all modes.  However: "push (addr)" and "push $42" should default to
@@ -2978,6 +2980,8 @@ def : MnemonicAlias<"pushf",  "pushfl", "att">, Requires<[In32BitMode]>;
 def : MnemonicAlias<"pushf",  "pushfq", "att">, Requires<[In64BitMode]>;
 def : MnemonicAlias<"pushf",  "pushfq", "intel">, Requires<[In64BitMode]>;
 def : MnemonicAlias<"pushfd", "pushfl", "att">;
+def : MnemonicAlias<"pushfw", "pushf",  "intel">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"pushfw", "pushf",  "intel">, Requires<[In64BitMode]>;
 
 def : MnemonicAlias<"popad",  "popal",  "intel">, Requires<[Not64BitMode]>;
 def : MnemonicAlias<"pushad", "pushal", "intel">, Requires<[Not64BitMode]>;
diff --git a/test/MC/X86/intel-syntax-encoding.s b/test/MC/X86/intel-syntax-encoding.s
index aedd74447d6..cf1b403e967 100644
--- a/test/MC/X86/intel-syntax-encoding.s
+++ b/test/MC/X86/intel-syntax-encoding.s
@@ -64,6 +64,11 @@
 pushf
 popf
 
+// CHECK: encoding: [0x66,0x9c]
+// CHECK: encoding: [0x66,0x9d]
+pushfw
+popfw
+
 LBB0_3:
 // CHECK: encoding: [0xeb,A]
 	jmp	LBB0_3
-- 
GitLab


From 0f65a4fb0515ecc5bdfecd3c9df6a5575dc73f11 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Mon, 22 Oct 2018 21:11:15 +0000
Subject: [PATCH 0404/1116] [x86] add test for PR25498 and complete checks; NFC

Might as well test the actual codegen instead of just the absence of crashing.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344955 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/scheduler-backtracking.ll | 1227 +++++++++++++++++++-
 1 file changed, 1213 insertions(+), 14 deletions(-)

diff --git a/test/CodeGen/X86/scheduler-backtracking.ll b/test/CodeGen/X86/scheduler-backtracking.ll
index d62f07fa0f7..0926a9814ce 100644
--- a/test/CodeGen/X86/scheduler-backtracking.ll
+++ b/test/CodeGen/X86/scheduler-backtracking.ll
@@ -1,15 +1,462 @@
-; RUN: llc -mtriple=x86_64-- < %s -pre-RA-sched=list-ilp    | FileCheck %s
-; RUN: llc -mtriple=x86_64-- < %s -pre-RA-sched=list-hybrid | FileCheck %s
-; RUN: llc -mtriple=x86_64-- < %s -pre-RA-sched=source      | FileCheck %s
-; RUN: llc -mtriple=x86_64-- < %s -pre-RA-sched=list-burr   | FileCheck %s
-; RUN: llc -mtriple=x86_64-- < %s -pre-RA-sched=linearize   | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-- < %s -pre-RA-sched=list-ilp    | FileCheck %s --check-prefix=ILP
+; RUN: llc -mtriple=x86_64-- < %s -pre-RA-sched=list-hybrid | FileCheck %s --check-prefix=HYBRID
+; RUN: llc -mtriple=x86_64-- < %s -pre-RA-sched=list-burr   | FileCheck %s --check-prefix=BURR
+; RUN: llc -mtriple=x86_64-- < %s -pre-RA-sched=source      | FileCheck %s --check-prefix=SRC
+; RUN: llc -mtriple=x86_64-- < %s -pre-RA-sched=linearize   | FileCheck %s --check-prefix=LIN
 
 ; PR22304 https://llvm.org/bugs/show_bug.cgi?id=22304
 ; Tests checking backtracking in source scheduler. llc used to crash on them.
 
-; CHECK-LABEL: test1
-define i256 @test1(i256 %a) {
-  %b = add i256 %a, 1 
+define i256 @test1(i256 %a) nounwind {
+; ILP-LABEL: test1:
+; ILP:       # %bb.0:
+; ILP-NEXT:    pushq %rbp
+; ILP-NEXT:    pushq %r15
+; ILP-NEXT:    pushq %r14
+; ILP-NEXT:    pushq %r13
+; ILP-NEXT:    pushq %r12
+; ILP-NEXT:    pushq %rbx
+; ILP-NEXT:    movq %rcx, %r9
+; ILP-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; ILP-NEXT:    xorl %eax, %eax
+; ILP-NEXT:    addq $1, %rsi
+; ILP-NEXT:    adcq $0, %rdx
+; ILP-NEXT:    adcq $0, %r9
+; ILP-NEXT:    adcq $0, %r8
+; ILP-NEXT:    leal 1(%rsi,%rsi), %edi
+; ILP-NEXT:    movl $1, %ebp
+; ILP-NEXT:    xorl %r14d, %r14d
+; ILP-NEXT:    movl %edi, %ecx
+; ILP-NEXT:    shldq %cl, %rbp, %r14
+; ILP-NEXT:    movl $1, %r11d
+; ILP-NEXT:    shlq %cl, %r11
+; ILP-NEXT:    movb $-128, %r10b
+; ILP-NEXT:    subb %dil, %r10b
+; ILP-NEXT:    movq %r9, %r13
+; ILP-NEXT:    movl %r10d, %ecx
+; ILP-NEXT:    shlq %cl, %r13
+; ILP-NEXT:    movl $1, %r12d
+; ILP-NEXT:    shrdq %cl, %rax, %r12
+; ILP-NEXT:    xorl %r15d, %r15d
+; ILP-NEXT:    movl %edi, %ecx
+; ILP-NEXT:    shldq %cl, %r15, %r15
+; ILP-NEXT:    movq %rsi, %rbx
+; ILP-NEXT:    shrdq %cl, %rdx, %rbx
+; ILP-NEXT:    shrq %cl, %rdx
+; ILP-NEXT:    addb $-128, %cl
+; ILP-NEXT:    shrdq %cl, %r8, %r9
+; ILP-NEXT:    testb $64, %dil
+; ILP-NEXT:    cmovneq %r11, %r14
+; ILP-NEXT:    cmoveq %rbx, %rdx
+; ILP-NEXT:    cmovneq %rax, %r15
+; ILP-NEXT:    cmovneq %rax, %r11
+; ILP-NEXT:    testb $64, %r10b
+; ILP-NEXT:    cmovneq %rax, %r12
+; ILP-NEXT:    cmovneq %rax, %r13
+; ILP-NEXT:    movl $1, %ebx
+; ILP-NEXT:    shlq %cl, %rbx
+; ILP-NEXT:    orl %edx, %r13d
+; ILP-NEXT:    xorl %edx, %edx
+; ILP-NEXT:    movl $1, %ebp
+; ILP-NEXT:    shldq %cl, %rbp, %rdx
+; ILP-NEXT:    shrq %cl, %r8
+; ILP-NEXT:    testb $64, %cl
+; ILP-NEXT:    cmoveq %r9, %r8
+; ILP-NEXT:    cmovneq %rbx, %rdx
+; ILP-NEXT:    cmovneq %rax, %rbx
+; ILP-NEXT:    testb %dil, %dil
+; ILP-NEXT:    cmovsq %rax, %r14
+; ILP-NEXT:    cmovsq %rax, %r11
+; ILP-NEXT:    jns .LBB0_2
+; ILP-NEXT:  # %bb.1:
+; ILP-NEXT:    movl %r8d, %r13d
+; ILP-NEXT:  .LBB0_2:
+; ILP-NEXT:    je .LBB0_4
+; ILP-NEXT:  # %bb.3:
+; ILP-NEXT:    movl %r13d, %esi
+; ILP-NEXT:  .LBB0_4:
+; ILP-NEXT:    cmovnsq %r12, %rbx
+; ILP-NEXT:    cmoveq %rax, %rbx
+; ILP-NEXT:    cmovnsq %r15, %rdx
+; ILP-NEXT:    cmoveq %rax, %rdx
+; ILP-NEXT:    testb $1, %sil
+; ILP-NEXT:    cmovneq %rax, %rdx
+; ILP-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; ILP-NEXT:    movq %rdx, 24(%rax)
+; ILP-NEXT:    cmovneq %rax, %rbx
+; ILP-NEXT:    movq %rbx, 16(%rax)
+; ILP-NEXT:    cmovneq %rax, %r14
+; ILP-NEXT:    movq %r14, 8(%rax)
+; ILP-NEXT:    cmovneq %rax, %r11
+; ILP-NEXT:    movq %r11, (%rax)
+; ILP-NEXT:    popq %rbx
+; ILP-NEXT:    popq %r12
+; ILP-NEXT:    popq %r13
+; ILP-NEXT:    popq %r14
+; ILP-NEXT:    popq %r15
+; ILP-NEXT:    popq %rbp
+; ILP-NEXT:    retq
+;
+; HYBRID-LABEL: test1:
+; HYBRID:       # %bb.0:
+; HYBRID-NEXT:    pushq %rbp
+; HYBRID-NEXT:    pushq %r15
+; HYBRID-NEXT:    pushq %r14
+; HYBRID-NEXT:    pushq %r13
+; HYBRID-NEXT:    pushq %r12
+; HYBRID-NEXT:    pushq %rbx
+; HYBRID-NEXT:    movq %rcx, %r9
+; HYBRID-NEXT:    movq %rdi, %rax
+; HYBRID-NEXT:    addq $1, %rsi
+; HYBRID-NEXT:    adcq $0, %rdx
+; HYBRID-NEXT:    adcq $0, %r9
+; HYBRID-NEXT:    adcq $0, %r8
+; HYBRID-NEXT:    xorl %r10d, %r10d
+; HYBRID-NEXT:    leal 1(%rsi,%rsi), %edi
+; HYBRID-NEXT:    xorl %r14d, %r14d
+; HYBRID-NEXT:    movl %edi, %ecx
+; HYBRID-NEXT:    shldq %cl, %r14, %r14
+; HYBRID-NEXT:    testb $64, %dil
+; HYBRID-NEXT:    cmovneq %r10, %r14
+; HYBRID-NEXT:    movl $1, %ebp
+; HYBRID-NEXT:    movl $1, %r12d
+; HYBRID-NEXT:    shlq %cl, %r12
+; HYBRID-NEXT:    testb $64, %dil
+; HYBRID-NEXT:    movq %r12, %r11
+; HYBRID-NEXT:    cmovneq %r10, %r11
+; HYBRID-NEXT:    movq %rsi, %rbx
+; HYBRID-NEXT:    shrdq %cl, %rdx, %rbx
+; HYBRID-NEXT:    shrq %cl, %rdx
+; HYBRID-NEXT:    testb $64, %dil
+; HYBRID-NEXT:    cmoveq %rbx, %rdx
+; HYBRID-NEXT:    xorl %r15d, %r15d
+; HYBRID-NEXT:    shldq %cl, %rbp, %r15
+; HYBRID-NEXT:    testb $64, %dil
+; HYBRID-NEXT:    cmovneq %r12, %r15
+; HYBRID-NEXT:    movb $-128, %cl
+; HYBRID-NEXT:    subb %dil, %cl
+; HYBRID-NEXT:    movq %r9, %r13
+; HYBRID-NEXT:    shlq %cl, %r13
+; HYBRID-NEXT:    movl $1, %r12d
+; HYBRID-NEXT:    shrdq %cl, %r10, %r12
+; HYBRID-NEXT:    testb $64, %cl
+; HYBRID-NEXT:    cmovneq %r10, %r12
+; HYBRID-NEXT:    cmovneq %r10, %r13
+; HYBRID-NEXT:    orl %edx, %r13d
+; HYBRID-NEXT:    movl %edi, %ecx
+; HYBRID-NEXT:    addb $-128, %cl
+; HYBRID-NEXT:    shrdq %cl, %r8, %r9
+; HYBRID-NEXT:    shrq %cl, %r8
+; HYBRID-NEXT:    xorl %edx, %edx
+; HYBRID-NEXT:    shldq %cl, %rbp, %rdx
+; HYBRID-NEXT:    shlq %cl, %rbp
+; HYBRID-NEXT:    testb $64, %cl
+; HYBRID-NEXT:    cmovneq %rbp, %rdx
+; HYBRID-NEXT:    cmoveq %r9, %r8
+; HYBRID-NEXT:    cmovneq %r10, %rbp
+; HYBRID-NEXT:    testb %dil, %dil
+; HYBRID-NEXT:    jns .LBB0_2
+; HYBRID-NEXT:  # %bb.1:
+; HYBRID-NEXT:    movl %r8d, %r13d
+; HYBRID-NEXT:  .LBB0_2:
+; HYBRID-NEXT:    je .LBB0_4
+; HYBRID-NEXT:  # %bb.3:
+; HYBRID-NEXT:    movl %r13d, %esi
+; HYBRID-NEXT:  .LBB0_4:
+; HYBRID-NEXT:    cmovsq %r10, %r15
+; HYBRID-NEXT:    cmovnsq %r12, %rbp
+; HYBRID-NEXT:    cmoveq %r10, %rbp
+; HYBRID-NEXT:    cmovnsq %r14, %rdx
+; HYBRID-NEXT:    cmoveq %r10, %rdx
+; HYBRID-NEXT:    cmovsq %r10, %r11
+; HYBRID-NEXT:    testb $1, %sil
+; HYBRID-NEXT:    cmovneq %rax, %rdx
+; HYBRID-NEXT:    movq %rdx, 24(%rax)
+; HYBRID-NEXT:    cmovneq %rax, %rbp
+; HYBRID-NEXT:    movq %rbp, 16(%rax)
+; HYBRID-NEXT:    cmovneq %rax, %r15
+; HYBRID-NEXT:    movq %r15, 8(%rax)
+; HYBRID-NEXT:    cmovneq %rax, %r11
+; HYBRID-NEXT:    movq %r11, (%rax)
+; HYBRID-NEXT:    popq %rbx
+; HYBRID-NEXT:    popq %r12
+; HYBRID-NEXT:    popq %r13
+; HYBRID-NEXT:    popq %r14
+; HYBRID-NEXT:    popq %r15
+; HYBRID-NEXT:    popq %rbp
+; HYBRID-NEXT:    retq
+;
+; BURR-LABEL: test1:
+; BURR:       # %bb.0:
+; BURR-NEXT:    pushq %rbp
+; BURR-NEXT:    pushq %r15
+; BURR-NEXT:    pushq %r14
+; BURR-NEXT:    pushq %r13
+; BURR-NEXT:    pushq %r12
+; BURR-NEXT:    pushq %rbx
+; BURR-NEXT:    movq %rcx, %r9
+; BURR-NEXT:    movq %rdi, %rax
+; BURR-NEXT:    addq $1, %rsi
+; BURR-NEXT:    adcq $0, %rdx
+; BURR-NEXT:    adcq $0, %r9
+; BURR-NEXT:    adcq $0, %r8
+; BURR-NEXT:    xorl %r10d, %r10d
+; BURR-NEXT:    leal 1(%rsi,%rsi), %edi
+; BURR-NEXT:    xorl %r14d, %r14d
+; BURR-NEXT:    movl %edi, %ecx
+; BURR-NEXT:    shldq %cl, %r14, %r14
+; BURR-NEXT:    testb $64, %dil
+; BURR-NEXT:    cmovneq %r10, %r14
+; BURR-NEXT:    movl $1, %ebp
+; BURR-NEXT:    movl $1, %r12d
+; BURR-NEXT:    shlq %cl, %r12
+; BURR-NEXT:    testb $64, %dil
+; BURR-NEXT:    movq %r12, %r11
+; BURR-NEXT:    cmovneq %r10, %r11
+; BURR-NEXT:    movq %rsi, %rbx
+; BURR-NEXT:    shrdq %cl, %rdx, %rbx
+; BURR-NEXT:    shrq %cl, %rdx
+; BURR-NEXT:    testb $64, %dil
+; BURR-NEXT:    cmoveq %rbx, %rdx
+; BURR-NEXT:    xorl %r15d, %r15d
+; BURR-NEXT:    shldq %cl, %rbp, %r15
+; BURR-NEXT:    testb $64, %dil
+; BURR-NEXT:    cmovneq %r12, %r15
+; BURR-NEXT:    movb $-128, %cl
+; BURR-NEXT:    subb %dil, %cl
+; BURR-NEXT:    movq %r9, %r13
+; BURR-NEXT:    shlq %cl, %r13
+; BURR-NEXT:    movl $1, %r12d
+; BURR-NEXT:    shrdq %cl, %r10, %r12
+; BURR-NEXT:    testb $64, %cl
+; BURR-NEXT:    cmovneq %r10, %r12
+; BURR-NEXT:    cmovneq %r10, %r13
+; BURR-NEXT:    orl %edx, %r13d
+; BURR-NEXT:    movl %edi, %ecx
+; BURR-NEXT:    addb $-128, %cl
+; BURR-NEXT:    shrdq %cl, %r8, %r9
+; BURR-NEXT:    xorl %edx, %edx
+; BURR-NEXT:    shldq %cl, %rbp, %rdx
+; BURR-NEXT:    shrq %cl, %r8
+; BURR-NEXT:    shlq %cl, %rbp
+; BURR-NEXT:    testb $64, %cl
+; BURR-NEXT:    cmovneq %rbp, %rdx
+; BURR-NEXT:    cmoveq %r9, %r8
+; BURR-NEXT:    cmovneq %r10, %rbp
+; BURR-NEXT:    testb %dil, %dil
+; BURR-NEXT:    jns .LBB0_2
+; BURR-NEXT:  # %bb.1:
+; BURR-NEXT:    movl %r8d, %r13d
+; BURR-NEXT:  .LBB0_2:
+; BURR-NEXT:    je .LBB0_4
+; BURR-NEXT:  # %bb.3:
+; BURR-NEXT:    movl %r13d, %esi
+; BURR-NEXT:  .LBB0_4:
+; BURR-NEXT:    cmovsq %r10, %r15
+; BURR-NEXT:    cmovnsq %r12, %rbp
+; BURR-NEXT:    cmoveq %r10, %rbp
+; BURR-NEXT:    cmovnsq %r14, %rdx
+; BURR-NEXT:    cmoveq %r10, %rdx
+; BURR-NEXT:    cmovsq %r10, %r11
+; BURR-NEXT:    testb $1, %sil
+; BURR-NEXT:    cmovneq %rax, %rdx
+; BURR-NEXT:    movq %rdx, 24(%rax)
+; BURR-NEXT:    cmovneq %rax, %rbp
+; BURR-NEXT:    movq %rbp, 16(%rax)
+; BURR-NEXT:    cmovneq %rax, %r15
+; BURR-NEXT:    movq %r15, 8(%rax)
+; BURR-NEXT:    cmovneq %rax, %r11
+; BURR-NEXT:    movq %r11, (%rax)
+; BURR-NEXT:    popq %rbx
+; BURR-NEXT:    popq %r12
+; BURR-NEXT:    popq %r13
+; BURR-NEXT:    popq %r14
+; BURR-NEXT:    popq %r15
+; BURR-NEXT:    popq %rbp
+; BURR-NEXT:    retq
+;
+; SRC-LABEL: test1:
+; SRC:       # %bb.0:
+; SRC-NEXT:    pushq %rbp
+; SRC-NEXT:    pushq %r15
+; SRC-NEXT:    pushq %r14
+; SRC-NEXT:    pushq %r13
+; SRC-NEXT:    pushq %r12
+; SRC-NEXT:    pushq %rbx
+; SRC-NEXT:    movq %rcx, %r9
+; SRC-NEXT:    movq %rdi, %rax
+; SRC-NEXT:    addq $1, %rsi
+; SRC-NEXT:    adcq $0, %rdx
+; SRC-NEXT:    adcq $0, %r9
+; SRC-NEXT:    adcq $0, %r8
+; SRC-NEXT:    leal 1(%rsi,%rsi), %r11d
+; SRC-NEXT:    movb $-128, %r10b
+; SRC-NEXT:    subb %r11b, %r10b
+; SRC-NEXT:    movq %r9, %r12
+; SRC-NEXT:    movl %r10d, %ecx
+; SRC-NEXT:    shlq %cl, %r12
+; SRC-NEXT:    movq %rsi, %rbp
+; SRC-NEXT:    movl %r11d, %ecx
+; SRC-NEXT:    shrdq %cl, %rdx, %rbp
+; SRC-NEXT:    shrq %cl, %rdx
+; SRC-NEXT:    xorl %r15d, %r15d
+; SRC-NEXT:    movl $1, %edi
+; SRC-NEXT:    xorl %r14d, %r14d
+; SRC-NEXT:    shldq %cl, %rdi, %r14
+; SRC-NEXT:    xorl %r13d, %r13d
+; SRC-NEXT:    shldq %cl, %r13, %r13
+; SRC-NEXT:    movl $1, %ebx
+; SRC-NEXT:    shlq %cl, %rbx
+; SRC-NEXT:    testb $64, %r11b
+; SRC-NEXT:    cmoveq %rbp, %rdx
+; SRC-NEXT:    cmovneq %rbx, %r14
+; SRC-NEXT:    cmovneq %r15, %rbx
+; SRC-NEXT:    cmovneq %r15, %r13
+; SRC-NEXT:    movl $1, %ebp
+; SRC-NEXT:    movl %r10d, %ecx
+; SRC-NEXT:    shrdq %cl, %r15, %rbp
+; SRC-NEXT:    testb $64, %r10b
+; SRC-NEXT:    cmovneq %r15, %r12
+; SRC-NEXT:    cmovneq %r15, %rbp
+; SRC-NEXT:    orl %edx, %r12d
+; SRC-NEXT:    movl %r11d, %ecx
+; SRC-NEXT:    addb $-128, %cl
+; SRC-NEXT:    shrdq %cl, %r8, %r9
+; SRC-NEXT:    shrq %cl, %r8
+; SRC-NEXT:    xorl %edx, %edx
+; SRC-NEXT:    shldq %cl, %rdi, %rdx
+; SRC-NEXT:    shlq %cl, %rdi
+; SRC-NEXT:    testb $64, %cl
+; SRC-NEXT:    cmoveq %r9, %r8
+; SRC-NEXT:    cmovneq %rdi, %rdx
+; SRC-NEXT:    cmovneq %r15, %rdi
+; SRC-NEXT:    testb %r11b, %r11b
+; SRC-NEXT:    jns .LBB0_2
+; SRC-NEXT:  # %bb.1:
+; SRC-NEXT:    movl %r8d, %r12d
+; SRC-NEXT:  .LBB0_2:
+; SRC-NEXT:    je .LBB0_4
+; SRC-NEXT:  # %bb.3:
+; SRC-NEXT:    movl %r12d, %esi
+; SRC-NEXT:  .LBB0_4:
+; SRC-NEXT:    cmovnsq %r13, %rdx
+; SRC-NEXT:    cmoveq %r15, %rdx
+; SRC-NEXT:    cmovnsq %rbp, %rdi
+; SRC-NEXT:    cmoveq %r15, %rdi
+; SRC-NEXT:    cmovsq %r15, %r14
+; SRC-NEXT:    cmovsq %r15, %rbx
+; SRC-NEXT:    testb $1, %sil
+; SRC-NEXT:    cmovneq %rax, %rbx
+; SRC-NEXT:    cmovneq %rax, %r14
+; SRC-NEXT:    cmovneq %rax, %rdi
+; SRC-NEXT:    cmovneq %rax, %rdx
+; SRC-NEXT:    movq %rdx, 24(%rax)
+; SRC-NEXT:    movq %rdi, 16(%rax)
+; SRC-NEXT:    movq %r14, 8(%rax)
+; SRC-NEXT:    movq %rbx, (%rax)
+; SRC-NEXT:    popq %rbx
+; SRC-NEXT:    popq %r12
+; SRC-NEXT:    popq %r13
+; SRC-NEXT:    popq %r14
+; SRC-NEXT:    popq %r15
+; SRC-NEXT:    popq %rbp
+; SRC-NEXT:    retq
+;
+; LIN-LABEL: test1:
+; LIN:       # %bb.0:
+; LIN-NEXT:    pushq %rbp
+; LIN-NEXT:    pushq %r15
+; LIN-NEXT:    pushq %r14
+; LIN-NEXT:    pushq %r12
+; LIN-NEXT:    pushq %rbx
+; LIN-NEXT:    movq %rcx, %r9
+; LIN-NEXT:    movq %rdi, %rax
+; LIN-NEXT:    xorl %r15d, %r15d
+; LIN-NEXT:    movl $1, %r14d
+; LIN-NEXT:    addq $1, %rsi
+; LIN-NEXT:    leal 1(%rsi,%rsi), %ebp
+; LIN-NEXT:    movl $1, %r12d
+; LIN-NEXT:    movl %ebp, %ecx
+; LIN-NEXT:    shlq %cl, %r12
+; LIN-NEXT:    testb $64, %bpl
+; LIN-NEXT:    movq %r12, %rbx
+; LIN-NEXT:    cmovneq %r15, %rbx
+; LIN-NEXT:    testb %bpl, %bpl
+; LIN-NEXT:    cmovsq %r15, %rbx
+; LIN-NEXT:    adcq $0, %rdx
+; LIN-NEXT:    adcq $0, %r9
+; LIN-NEXT:    adcq $0, %r8
+; LIN-NEXT:    movl %ebp, %r10d
+; LIN-NEXT:    addb $-128, %r10b
+; LIN-NEXT:    movq %r9, %rdi
+; LIN-NEXT:    movl %r10d, %ecx
+; LIN-NEXT:    shrdq %cl, %r8, %rdi
+; LIN-NEXT:    shrq %cl, %r8
+; LIN-NEXT:    testb $64, %r10b
+; LIN-NEXT:    cmoveq %rdi, %r8
+; LIN-NEXT:    movq %rsi, %rdi
+; LIN-NEXT:    movl %ebp, %ecx
+; LIN-NEXT:    shrdq %cl, %rdx, %rdi
+; LIN-NEXT:    shrq %cl, %rdx
+; LIN-NEXT:    cmoveq %rdi, %rdx
+; LIN-NEXT:    movb $-128, %r11b
+; LIN-NEXT:    subb %bpl, %r11b
+; LIN-NEXT:    movl %r11d, %ecx
+; LIN-NEXT:    shlq %cl, %r9
+; LIN-NEXT:    testb $64, %r11b
+; LIN-NEXT:    cmovneq %r15, %r9
+; LIN-NEXT:    orl %edx, %r9d
+; LIN-NEXT:    jns .LBB0_2
+; LIN-NEXT:  # %bb.1:
+; LIN-NEXT:    movl %r8d, %r9d
+; LIN-NEXT:  .LBB0_2:
+; LIN-NEXT:    je .LBB0_4
+; LIN-NEXT:  # %bb.3:
+; LIN-NEXT:    movl %r9d, %esi
+; LIN-NEXT:  .LBB0_4:
+; LIN-NEXT:    testb $1, %sil
+; LIN-NEXT:    cmovneq %rax, %rbx
+; LIN-NEXT:    movq %rbx, (%rax)
+; LIN-NEXT:    xorl %edx, %edx
+; LIN-NEXT:    movl %ebp, %ecx
+; LIN-NEXT:    shldq %cl, %r14, %rdx
+; LIN-NEXT:    cmovneq %r12, %rdx
+; LIN-NEXT:    cmovsq %r15, %rdx
+; LIN-NEXT:    cmovneq %rax, %rdx
+; LIN-NEXT:    movq %rdx, 8(%rax)
+; LIN-NEXT:    movl $1, %edx
+; LIN-NEXT:    movl %r10d, %ecx
+; LIN-NEXT:    shlq %cl, %rdx
+; LIN-NEXT:    movq %rdx, %rsi
+; LIN-NEXT:    cmovneq %r15, %rsi
+; LIN-NEXT:    movl $1, %edi
+; LIN-NEXT:    movl %r11d, %ecx
+; LIN-NEXT:    shrdq %cl, %r15, %rdi
+; LIN-NEXT:    cmovneq %r15, %rdi
+; LIN-NEXT:    cmovsq %rsi, %rdi
+; LIN-NEXT:    cmoveq %r15, %rdi
+; LIN-NEXT:    cmovneq %rax, %rdi
+; LIN-NEXT:    movq %rdi, 16(%rax)
+; LIN-NEXT:    xorl %esi, %esi
+; LIN-NEXT:    movl %r10d, %ecx
+; LIN-NEXT:    shldq %cl, %r14, %rsi
+; LIN-NEXT:    cmovneq %rdx, %rsi
+; LIN-NEXT:    xorl %edx, %edx
+; LIN-NEXT:    movl %ebp, %ecx
+; LIN-NEXT:    shldq %cl, %rdx, %rdx
+; LIN-NEXT:    cmovneq %r15, %rdx
+; LIN-NEXT:    cmovsq %rsi, %rdx
+; LIN-NEXT:    cmoveq %r15, %rdx
+; LIN-NEXT:    cmovneq %rax, %rdx
+; LIN-NEXT:    movq %rdx, 24(%rax)
+; LIN-NEXT:    popq %rbx
+; LIN-NEXT:    popq %r12
+; LIN-NEXT:    popq %r14
+; LIN-NEXT:    popq %r15
+; LIN-NEXT:    popq %rbp
+; LIN-NEXT:    retq
+  %b = add i256 %a, 1
   %m = shl i256 %b, 1
   %p = add i256 %m, 1
   %v = lshr i256 %b, %p
@@ -19,16 +466,436 @@ define i256 @test1(i256 %a) {
   ret i256 %f
 }
 
-; CHECK-LABEL: test2
-define i256 @test2(i256 %a) {
+define i256 @test2(i256 %a) nounwind {
+; ILP-LABEL: test2:
+; ILP:       # %bb.0:
+; ILP-NEXT:    movq %rdi, %rax
+; ILP-NEXT:    xorl %edi, %edi
+; ILP-NEXT:    movq %rsi, %r11
+; ILP-NEXT:    negq %r11
+; ILP-NEXT:    movl $0, %r10d
+; ILP-NEXT:    sbbq %rdx, %r10
+; ILP-NEXT:    movl $0, %r9d
+; ILP-NEXT:    sbbq %rcx, %r9
+; ILP-NEXT:    sbbq %r8, %rdi
+; ILP-NEXT:    andq %rcx, %r9
+; ILP-NEXT:    bsrq %r9, %rcx
+; ILP-NEXT:    xorq $63, %rcx
+; ILP-NEXT:    andq %r8, %rdi
+; ILP-NEXT:    bsrq %rdi, %r8
+; ILP-NEXT:    andq %rdx, %r10
+; ILP-NEXT:    bsrq %r10, %rdx
+; ILP-NEXT:    xorq $63, %r8
+; ILP-NEXT:    addq $64, %rcx
+; ILP-NEXT:    testq %rdi, %rdi
+; ILP-NEXT:    movq $0, 24(%rax)
+; ILP-NEXT:    movq $0, 16(%rax)
+; ILP-NEXT:    movq $0, 8(%rax)
+; ILP-NEXT:    cmovneq %r8, %rcx
+; ILP-NEXT:    xorq $63, %rdx
+; ILP-NEXT:    andq %rsi, %r11
+; ILP-NEXT:    movl $127, %r8d
+; ILP-NEXT:    bsrq %r11, %rsi
+; ILP-NEXT:    cmoveq %r8, %rsi
+; ILP-NEXT:    xorq $63, %rsi
+; ILP-NEXT:    addq $64, %rsi
+; ILP-NEXT:    testq %r10, %r10
+; ILP-NEXT:    cmovneq %rdx, %rsi
+; ILP-NEXT:    subq $-128, %rsi
+; ILP-NEXT:    orq %r9, %rdi
+; ILP-NEXT:    cmovneq %rcx, %rsi
+; ILP-NEXT:    movq %rsi, (%rax)
+; ILP-NEXT:    retq
+;
+; HYBRID-LABEL: test2:
+; HYBRID:       # %bb.0:
+; HYBRID-NEXT:    movq %rdi, %rax
+; HYBRID-NEXT:    xorl %r9d, %r9d
+; HYBRID-NEXT:    movq %rsi, %r11
+; HYBRID-NEXT:    negq %r11
+; HYBRID-NEXT:    movl $0, %r10d
+; HYBRID-NEXT:    sbbq %rdx, %r10
+; HYBRID-NEXT:    movl $0, %edi
+; HYBRID-NEXT:    sbbq %rcx, %rdi
+; HYBRID-NEXT:    sbbq %r8, %r9
+; HYBRID-NEXT:    andq %r8, %r9
+; HYBRID-NEXT:    bsrq %r9, %r8
+; HYBRID-NEXT:    xorq $63, %r8
+; HYBRID-NEXT:    andq %rcx, %rdi
+; HYBRID-NEXT:    bsrq %rdi, %rcx
+; HYBRID-NEXT:    xorq $63, %rcx
+; HYBRID-NEXT:    addq $64, %rcx
+; HYBRID-NEXT:    testq %r9, %r9
+; HYBRID-NEXT:    cmovneq %r8, %rcx
+; HYBRID-NEXT:    andq %rdx, %r10
+; HYBRID-NEXT:    bsrq %r10, %rdx
+; HYBRID-NEXT:    xorq $63, %rdx
+; HYBRID-NEXT:    andq %rsi, %r11
+; HYBRID-NEXT:    movl $127, %r8d
+; HYBRID-NEXT:    bsrq %r11, %rsi
+; HYBRID-NEXT:    cmoveq %r8, %rsi
+; HYBRID-NEXT:    xorq $63, %rsi
+; HYBRID-NEXT:    addq $64, %rsi
+; HYBRID-NEXT:    testq %r10, %r10
+; HYBRID-NEXT:    cmovneq %rdx, %rsi
+; HYBRID-NEXT:    subq $-128, %rsi
+; HYBRID-NEXT:    orq %r9, %rdi
+; HYBRID-NEXT:    cmovneq %rcx, %rsi
+; HYBRID-NEXT:    movq %rsi, (%rax)
+; HYBRID-NEXT:    movq $0, 24(%rax)
+; HYBRID-NEXT:    movq $0, 16(%rax)
+; HYBRID-NEXT:    movq $0, 8(%rax)
+; HYBRID-NEXT:    retq
+;
+; BURR-LABEL: test2:
+; BURR:       # %bb.0:
+; BURR-NEXT:    movq %rdi, %rax
+; BURR-NEXT:    xorl %r9d, %r9d
+; BURR-NEXT:    movq %rsi, %r11
+; BURR-NEXT:    negq %r11
+; BURR-NEXT:    movl $0, %r10d
+; BURR-NEXT:    sbbq %rdx, %r10
+; BURR-NEXT:    movl $0, %edi
+; BURR-NEXT:    sbbq %rcx, %rdi
+; BURR-NEXT:    sbbq %r8, %r9
+; BURR-NEXT:    andq %r8, %r9
+; BURR-NEXT:    bsrq %r9, %r8
+; BURR-NEXT:    xorq $63, %r8
+; BURR-NEXT:    andq %rcx, %rdi
+; BURR-NEXT:    bsrq %rdi, %rcx
+; BURR-NEXT:    xorq $63, %rcx
+; BURR-NEXT:    addq $64, %rcx
+; BURR-NEXT:    testq %r9, %r9
+; BURR-NEXT:    cmovneq %r8, %rcx
+; BURR-NEXT:    andq %rdx, %r10
+; BURR-NEXT:    bsrq %r10, %rdx
+; BURR-NEXT:    xorq $63, %rdx
+; BURR-NEXT:    andq %rsi, %r11
+; BURR-NEXT:    movl $127, %r8d
+; BURR-NEXT:    bsrq %r11, %rsi
+; BURR-NEXT:    cmoveq %r8, %rsi
+; BURR-NEXT:    xorq $63, %rsi
+; BURR-NEXT:    addq $64, %rsi
+; BURR-NEXT:    testq %r10, %r10
+; BURR-NEXT:    cmovneq %rdx, %rsi
+; BURR-NEXT:    subq $-128, %rsi
+; BURR-NEXT:    orq %r9, %rdi
+; BURR-NEXT:    cmovneq %rcx, %rsi
+; BURR-NEXT:    movq %rsi, (%rax)
+; BURR-NEXT:    movq $0, 24(%rax)
+; BURR-NEXT:    movq $0, 16(%rax)
+; BURR-NEXT:    movq $0, 8(%rax)
+; BURR-NEXT:    retq
+;
+; SRC-LABEL: test2:
+; SRC:       # %bb.0:
+; SRC-NEXT:    movq %rdi, %rax
+; SRC-NEXT:    xorl %edi, %edi
+; SRC-NEXT:    movq %rsi, %r11
+; SRC-NEXT:    negq %r11
+; SRC-NEXT:    movl $0, %r10d
+; SRC-NEXT:    sbbq %rdx, %r10
+; SRC-NEXT:    movl $0, %r9d
+; SRC-NEXT:    sbbq %rcx, %r9
+; SRC-NEXT:    sbbq %r8, %rdi
+; SRC-NEXT:    andq %rdx, %r10
+; SRC-NEXT:    andq %rcx, %r9
+; SRC-NEXT:    andq %r8, %rdi
+; SRC-NEXT:    andq %rsi, %r11
+; SRC-NEXT:    bsrq %rdi, %rcx
+; SRC-NEXT:    xorq $63, %rcx
+; SRC-NEXT:    bsrq %r9, %rdx
+; SRC-NEXT:    xorq $63, %rdx
+; SRC-NEXT:    addq $64, %rdx
+; SRC-NEXT:    testq %rdi, %rdi
+; SRC-NEXT:    cmovneq %rcx, %rdx
+; SRC-NEXT:    bsrq %r10, %rcx
+; SRC-NEXT:    xorq $63, %rcx
+; SRC-NEXT:    bsrq %r11, %r8
+; SRC-NEXT:    movl $127, %esi
+; SRC-NEXT:    cmovneq %r8, %rsi
+; SRC-NEXT:    xorq $63, %rsi
+; SRC-NEXT:    addq $64, %rsi
+; SRC-NEXT:    testq %r10, %r10
+; SRC-NEXT:    cmovneq %rcx, %rsi
+; SRC-NEXT:    subq $-128, %rsi
+; SRC-NEXT:    orq %r9, %rdi
+; SRC-NEXT:    cmovneq %rdx, %rsi
+; SRC-NEXT:    movq %rsi, (%rax)
+; SRC-NEXT:    movq $0, 24(%rax)
+; SRC-NEXT:    movq $0, 16(%rax)
+; SRC-NEXT:    movq $0, 8(%rax)
+; SRC-NEXT:    retq
+;
+; LIN-LABEL: test2:
+; LIN:       # %bb.0:
+; LIN-NEXT:    movq %rdi, %rax
+; LIN-NEXT:    movq %rsi, %rdi
+; LIN-NEXT:    negq %rdi
+; LIN-NEXT:    andq %rsi, %rdi
+; LIN-NEXT:    bsrq %rdi, %rsi
+; LIN-NEXT:    movl $127, %edi
+; LIN-NEXT:    cmovneq %rsi, %rdi
+; LIN-NEXT:    xorq $63, %rdi
+; LIN-NEXT:    addq $64, %rdi
+; LIN-NEXT:    xorl %r9d, %r9d
+; LIN-NEXT:    movl $0, %esi
+; LIN-NEXT:    sbbq %rdx, %rsi
+; LIN-NEXT:    andq %rdx, %rsi
+; LIN-NEXT:    bsrq %rsi, %rdx
+; LIN-NEXT:    xorq $63, %rdx
+; LIN-NEXT:    testq %rsi, %rsi
+; LIN-NEXT:    cmoveq %rdi, %rdx
+; LIN-NEXT:    subq $-128, %rdx
+; LIN-NEXT:    movl $0, %esi
+; LIN-NEXT:    sbbq %rcx, %rsi
+; LIN-NEXT:    andq %rcx, %rsi
+; LIN-NEXT:    bsrq %rsi, %rcx
+; LIN-NEXT:    xorq $63, %rcx
+; LIN-NEXT:    addq $64, %rcx
+; LIN-NEXT:    sbbq %r8, %r9
+; LIN-NEXT:    andq %r8, %r9
+; LIN-NEXT:    bsrq %r9, %rdi
+; LIN-NEXT:    xorq $63, %rdi
+; LIN-NEXT:    testq %r9, %r9
+; LIN-NEXT:    cmoveq %rcx, %rdi
+; LIN-NEXT:    orq %rsi, %r9
+; LIN-NEXT:    cmoveq %rdx, %rdi
+; LIN-NEXT:    movq %rdi, (%rax)
+; LIN-NEXT:    movq $0, 8(%rax)
+; LIN-NEXT:    movq $0, 16(%rax)
+; LIN-NEXT:    movq $0, 24(%rax)
+; LIN-NEXT:    retq
   %b = sub i256 0, %a
   %c = and i256 %b, %a
   %d = call i256 @llvm.ctlz.i256(i256 %c, i1 false)
   ret i256 %d
 }
 
-; CHECK-LABEL: test3
-define i256 @test3(i256 %n) {
+define i256 @test3(i256 %n) nounwind {
+; ILP-LABEL: test3:
+; ILP:       # %bb.0:
+; ILP-NEXT:    movq %rdi, %rax
+; ILP-NEXT:    xorl %r10d, %r10d
+; ILP-NEXT:    movq %rsi, %r9
+; ILP-NEXT:    negq %r9
+; ILP-NEXT:    movl $0, %r11d
+; ILP-NEXT:    sbbq %rdx, %r11
+; ILP-NEXT:    movl $0, %edi
+; ILP-NEXT:    sbbq %rcx, %rdi
+; ILP-NEXT:    sbbq %r8, %r10
+; ILP-NEXT:    notq %rcx
+; ILP-NEXT:    andq %rdi, %rcx
+; ILP-NEXT:    bsrq %rcx, %rdi
+; ILP-NEXT:    notq %rdx
+; ILP-NEXT:    andq %r11, %rdx
+; ILP-NEXT:    xorq $63, %rdi
+; ILP-NEXT:    notq %r8
+; ILP-NEXT:    andq %r10, %r8
+; ILP-NEXT:    bsrq %r8, %r10
+; ILP-NEXT:    xorq $63, %r10
+; ILP-NEXT:    addq $64, %rdi
+; ILP-NEXT:    bsrq %rdx, %r11
+; ILP-NEXT:    notq %rsi
+; ILP-NEXT:    testq %r8, %r8
+; ILP-NEXT:    movq $0, 24(%rax)
+; ILP-NEXT:    movq $0, 16(%rax)
+; ILP-NEXT:    movq $0, 8(%rax)
+; ILP-NEXT:    cmovneq %r10, %rdi
+; ILP-NEXT:    xorq $63, %r11
+; ILP-NEXT:    andq %r9, %rsi
+; ILP-NEXT:    movl $127, %r9d
+; ILP-NEXT:    bsrq %rsi, %rsi
+; ILP-NEXT:    cmoveq %r9, %rsi
+; ILP-NEXT:    xorq $63, %rsi
+; ILP-NEXT:    addq $64, %rsi
+; ILP-NEXT:    testq %rdx, %rdx
+; ILP-NEXT:    cmovneq %r11, %rsi
+; ILP-NEXT:    subq $-128, %rsi
+; ILP-NEXT:    orq %rcx, %r8
+; ILP-NEXT:    cmovneq %rdi, %rsi
+; ILP-NEXT:    movq %rsi, (%rax)
+; ILP-NEXT:    retq
+;
+; HYBRID-LABEL: test3:
+; HYBRID:       # %bb.0:
+; HYBRID-NEXT:    pushq %rbx
+; HYBRID-NEXT:    movq %rdi, %rax
+; HYBRID-NEXT:    xorl %edi, %edi
+; HYBRID-NEXT:    movq %rsi, %r9
+; HYBRID-NEXT:    negq %r9
+; HYBRID-NEXT:    movl $0, %r10d
+; HYBRID-NEXT:    sbbq %rdx, %r10
+; HYBRID-NEXT:    movl $0, %r11d
+; HYBRID-NEXT:    sbbq %rcx, %r11
+; HYBRID-NEXT:    sbbq %r8, %rdi
+; HYBRID-NEXT:    notq %r8
+; HYBRID-NEXT:    andq %rdi, %r8
+; HYBRID-NEXT:    bsrq %r8, %rbx
+; HYBRID-NEXT:    xorq $63, %rbx
+; HYBRID-NEXT:    notq %rcx
+; HYBRID-NEXT:    andq %r11, %rcx
+; HYBRID-NEXT:    bsrq %rcx, %rdi
+; HYBRID-NEXT:    xorq $63, %rdi
+; HYBRID-NEXT:    addq $64, %rdi
+; HYBRID-NEXT:    testq %r8, %r8
+; HYBRID-NEXT:    cmovneq %rbx, %rdi
+; HYBRID-NEXT:    notq %rdx
+; HYBRID-NEXT:    andq %r10, %rdx
+; HYBRID-NEXT:    bsrq %rdx, %rbx
+; HYBRID-NEXT:    xorq $63, %rbx
+; HYBRID-NEXT:    notq %rsi
+; HYBRID-NEXT:    andq %r9, %rsi
+; HYBRID-NEXT:    movl $127, %r9d
+; HYBRID-NEXT:    bsrq %rsi, %rsi
+; HYBRID-NEXT:    cmoveq %r9, %rsi
+; HYBRID-NEXT:    xorq $63, %rsi
+; HYBRID-NEXT:    addq $64, %rsi
+; HYBRID-NEXT:    testq %rdx, %rdx
+; HYBRID-NEXT:    cmovneq %rbx, %rsi
+; HYBRID-NEXT:    subq $-128, %rsi
+; HYBRID-NEXT:    orq %r8, %rcx
+; HYBRID-NEXT:    cmovneq %rdi, %rsi
+; HYBRID-NEXT:    movq %rsi, (%rax)
+; HYBRID-NEXT:    movq $0, 24(%rax)
+; HYBRID-NEXT:    movq $0, 16(%rax)
+; HYBRID-NEXT:    movq $0, 8(%rax)
+; HYBRID-NEXT:    popq %rbx
+; HYBRID-NEXT:    retq
+;
+; BURR-LABEL: test3:
+; BURR:       # %bb.0:
+; BURR-NEXT:    pushq %rbx
+; BURR-NEXT:    movq %rdi, %rax
+; BURR-NEXT:    xorl %edi, %edi
+; BURR-NEXT:    movq %rsi, %r9
+; BURR-NEXT:    negq %r9
+; BURR-NEXT:    movl $0, %r10d
+; BURR-NEXT:    sbbq %rdx, %r10
+; BURR-NEXT:    movl $0, %r11d
+; BURR-NEXT:    sbbq %rcx, %r11
+; BURR-NEXT:    sbbq %r8, %rdi
+; BURR-NEXT:    notq %r8
+; BURR-NEXT:    andq %rdi, %r8
+; BURR-NEXT:    bsrq %r8, %rbx
+; BURR-NEXT:    xorq $63, %rbx
+; BURR-NEXT:    notq %rcx
+; BURR-NEXT:    andq %r11, %rcx
+; BURR-NEXT:    bsrq %rcx, %rdi
+; BURR-NEXT:    xorq $63, %rdi
+; BURR-NEXT:    addq $64, %rdi
+; BURR-NEXT:    testq %r8, %r8
+; BURR-NEXT:    cmovneq %rbx, %rdi
+; BURR-NEXT:    notq %rdx
+; BURR-NEXT:    andq %r10, %rdx
+; BURR-NEXT:    bsrq %rdx, %rbx
+; BURR-NEXT:    xorq $63, %rbx
+; BURR-NEXT:    notq %rsi
+; BURR-NEXT:    andq %r9, %rsi
+; BURR-NEXT:    movl $127, %r9d
+; BURR-NEXT:    bsrq %rsi, %rsi
+; BURR-NEXT:    cmoveq %r9, %rsi
+; BURR-NEXT:    xorq $63, %rsi
+; BURR-NEXT:    addq $64, %rsi
+; BURR-NEXT:    testq %rdx, %rdx
+; BURR-NEXT:    cmovneq %rbx, %rsi
+; BURR-NEXT:    subq $-128, %rsi
+; BURR-NEXT:    orq %r8, %rcx
+; BURR-NEXT:    cmovneq %rdi, %rsi
+; BURR-NEXT:    movq %rsi, (%rax)
+; BURR-NEXT:    movq $0, 24(%rax)
+; BURR-NEXT:    movq $0, 16(%rax)
+; BURR-NEXT:    movq $0, 8(%rax)
+; BURR-NEXT:    popq %rbx
+; BURR-NEXT:    retq
+;
+; SRC-LABEL: test3:
+; SRC:       # %bb.0:
+; SRC-NEXT:    movq %rdi, %rax
+; SRC-NEXT:    movq %rsi, %r9
+; SRC-NEXT:    notq %r9
+; SRC-NEXT:    xorl %r10d, %r10d
+; SRC-NEXT:    negq %rsi
+; SRC-NEXT:    movl $0, %r11d
+; SRC-NEXT:    sbbq %rdx, %r11
+; SRC-NEXT:    notq %rdx
+; SRC-NEXT:    movl $0, %edi
+; SRC-NEXT:    sbbq %rcx, %rdi
+; SRC-NEXT:    notq %rcx
+; SRC-NEXT:    sbbq %r8, %r10
+; SRC-NEXT:    notq %r8
+; SRC-NEXT:    andq %r11, %rdx
+; SRC-NEXT:    andq %rdi, %rcx
+; SRC-NEXT:    andq %r10, %r8
+; SRC-NEXT:    andq %r9, %rsi
+; SRC-NEXT:    bsrq %r8, %r9
+; SRC-NEXT:    xorq $63, %r9
+; SRC-NEXT:    bsrq %rcx, %rdi
+; SRC-NEXT:    xorq $63, %rdi
+; SRC-NEXT:    addq $64, %rdi
+; SRC-NEXT:    testq %r8, %r8
+; SRC-NEXT:    cmovneq %r9, %rdi
+; SRC-NEXT:    bsrq %rdx, %r9
+; SRC-NEXT:    xorq $63, %r9
+; SRC-NEXT:    bsrq %rsi, %r10
+; SRC-NEXT:    movl $127, %esi
+; SRC-NEXT:    cmovneq %r10, %rsi
+; SRC-NEXT:    xorq $63, %rsi
+; SRC-NEXT:    addq $64, %rsi
+; SRC-NEXT:    testq %rdx, %rdx
+; SRC-NEXT:    cmovneq %r9, %rsi
+; SRC-NEXT:    subq $-128, %rsi
+; SRC-NEXT:    orq %rcx, %r8
+; SRC-NEXT:    cmovneq %rdi, %rsi
+; SRC-NEXT:    movq %rsi, (%rax)
+; SRC-NEXT:    movq $0, 24(%rax)
+; SRC-NEXT:    movq $0, 16(%rax)
+; SRC-NEXT:    movq $0, 8(%rax)
+; SRC-NEXT:    retq
+;
+; LIN-LABEL: test3:
+; LIN:       # %bb.0:
+; LIN-NEXT:    movq %rdi, %rax
+; LIN-NEXT:    movq %rsi, %rdi
+; LIN-NEXT:    negq %rdi
+; LIN-NEXT:    notq %rsi
+; LIN-NEXT:    andq %rdi, %rsi
+; LIN-NEXT:    bsrq %rsi, %rsi
+; LIN-NEXT:    movl $127, %edi
+; LIN-NEXT:    cmovneq %rsi, %rdi
+; LIN-NEXT:    xorq $63, %rdi
+; LIN-NEXT:    addq $64, %rdi
+; LIN-NEXT:    xorl %r9d, %r9d
+; LIN-NEXT:    movl $0, %esi
+; LIN-NEXT:    sbbq %rdx, %rsi
+; LIN-NEXT:    notq %rdx
+; LIN-NEXT:    andq %rsi, %rdx
+; LIN-NEXT:    bsrq %rdx, %rsi
+; LIN-NEXT:    xorq $63, %rsi
+; LIN-NEXT:    testq %rdx, %rdx
+; LIN-NEXT:    cmoveq %rdi, %rsi
+; LIN-NEXT:    subq $-128, %rsi
+; LIN-NEXT:    movl $0, %edx
+; LIN-NEXT:    sbbq %rcx, %rdx
+; LIN-NEXT:    notq %rcx
+; LIN-NEXT:    andq %rdx, %rcx
+; LIN-NEXT:    bsrq %rcx, %rdx
+; LIN-NEXT:    xorq $63, %rdx
+; LIN-NEXT:    addq $64, %rdx
+; LIN-NEXT:    sbbq %r8, %r9
+; LIN-NEXT:    notq %r8
+; LIN-NEXT:    andq %r9, %r8
+; LIN-NEXT:    bsrq %r8, %rdi
+; LIN-NEXT:    xorq $63, %rdi
+; LIN-NEXT:    testq %r8, %r8
+; LIN-NEXT:    cmoveq %rdx, %rdi
+; LIN-NEXT:    orq %rcx, %r8
+; LIN-NEXT:    cmoveq %rsi, %rdi
+; LIN-NEXT:    movq %rdi, (%rax)
+; LIN-NEXT:    movq $0, 8(%rax)
+; LIN-NEXT:    movq $0, 16(%rax)
+; LIN-NEXT:    movq $0, 24(%rax)
+; LIN-NEXT:    retq
   %m = sub i256 -1, %n
   %x = sub i256 0, %n
   %y = and i256 %x, %m
@@ -38,8 +905,91 @@ define i256 @test3(i256 %n) {
 
 declare i256 @llvm.ctlz.i256(i256, i1) nounwind readnone
 
-; CHECK-LABEL: test4
-define i64 @test4(i64 %a, i64 %b) {
+define i64 @test4(i64 %a, i64 %b) nounwind {
+; ILP-LABEL: test4:
+; ILP:       # %bb.0:
+; ILP-NEXT:    xorl %ecx, %ecx
+; ILP-NEXT:    xorl %edx, %edx
+; ILP-NEXT:    addq $1, %rsi
+; ILP-NEXT:    setb %dl
+; ILP-NEXT:    movl $2, %eax
+; ILP-NEXT:    cmpq %rdi, %rsi
+; ILP-NEXT:    sbbq $0, %rdx
+; ILP-NEXT:    movl $0, %edx
+; ILP-NEXT:    sbbq $0, %rdx
+; ILP-NEXT:    sbbq $0, %rcx
+; ILP-NEXT:    setae %cl
+; ILP-NEXT:    movzbl %cl, %ecx
+; ILP-NEXT:    subq %rcx, %rax
+; ILP-NEXT:    retq
+;
+; HYBRID-LABEL: test4:
+; HYBRID:       # %bb.0:
+; HYBRID-NEXT:    xorl %eax, %eax
+; HYBRID-NEXT:    xorl %ecx, %ecx
+; HYBRID-NEXT:    addq $1, %rsi
+; HYBRID-NEXT:    setb %cl
+; HYBRID-NEXT:    cmpq %rdi, %rsi
+; HYBRID-NEXT:    sbbq $0, %rcx
+; HYBRID-NEXT:    movl $0, %ecx
+; HYBRID-NEXT:    sbbq $0, %rcx
+; HYBRID-NEXT:    sbbq $0, %rax
+; HYBRID-NEXT:    setae %al
+; HYBRID-NEXT:    movzbl %al, %ecx
+; HYBRID-NEXT:    movl $2, %eax
+; HYBRID-NEXT:    subq %rcx, %rax
+; HYBRID-NEXT:    retq
+;
+; BURR-LABEL: test4:
+; BURR:       # %bb.0:
+; BURR-NEXT:    xorl %eax, %eax
+; BURR-NEXT:    xorl %ecx, %ecx
+; BURR-NEXT:    addq $1, %rsi
+; BURR-NEXT:    setb %cl
+; BURR-NEXT:    cmpq %rdi, %rsi
+; BURR-NEXT:    sbbq $0, %rcx
+; BURR-NEXT:    movl $0, %ecx
+; BURR-NEXT:    sbbq $0, %rcx
+; BURR-NEXT:    sbbq $0, %rax
+; BURR-NEXT:    setae %al
+; BURR-NEXT:    movzbl %al, %ecx
+; BURR-NEXT:    movl $2, %eax
+; BURR-NEXT:    subq %rcx, %rax
+; BURR-NEXT:    retq
+;
+; SRC-LABEL: test4:
+; SRC:       # %bb.0:
+; SRC-NEXT:    xorl %eax, %eax
+; SRC-NEXT:    addq $1, %rsi
+; SRC-NEXT:    setb %al
+; SRC-NEXT:    xorl %ecx, %ecx
+; SRC-NEXT:    cmpq %rdi, %rsi
+; SRC-NEXT:    sbbq $0, %rax
+; SRC-NEXT:    movl $0, %eax
+; SRC-NEXT:    sbbq $0, %rax
+; SRC-NEXT:    sbbq $0, %rcx
+; SRC-NEXT:    setae %al
+; SRC-NEXT:    movzbl %al, %ecx
+; SRC-NEXT:    movl $2, %eax
+; SRC-NEXT:    subq %rcx, %rax
+; SRC-NEXT:    retq
+;
+; LIN-LABEL: test4:
+; LIN:       # %bb.0:
+; LIN-NEXT:    movl $2, %eax
+; LIN-NEXT:    xorl %ecx, %ecx
+; LIN-NEXT:    xorl %edx, %edx
+; LIN-NEXT:    addq $1, %rsi
+; LIN-NEXT:    setb %dl
+; LIN-NEXT:    cmpq %rdi, %rsi
+; LIN-NEXT:    sbbq $0, %rdx
+; LIN-NEXT:    movl $0, %edx
+; LIN-NEXT:    sbbq $0, %rdx
+; LIN-NEXT:    sbbq $0, %rcx
+; LIN-NEXT:    setae %cl
+; LIN-NEXT:    movzbl %cl, %ecx
+; LIN-NEXT:    subq %rcx, %rax
+; LIN-NEXT:    retq
   %r = zext i64 %b to i256
   %u = add i256 %r, 1
   %w = and i256 %u, 1461501637330902918203684832716283019655932542975
@@ -49,3 +999,252 @@ define i64 @test4(i64 %a, i64 %b) {
   %z = add i64 %y, 1
   ret i64 %z
 }
+
+define i256 @PR25498(i256 %a) nounwind {
+; ILP-LABEL: PR25498:
+; ILP:       # %bb.0:
+; ILP-NEXT:    pushq %rbx
+; ILP-NEXT:    movq %rdi, %rax
+; ILP-NEXT:    xorl %r9d, %r9d
+; ILP-NEXT:    movq %rsi, %rbx
+; ILP-NEXT:    negq %rbx
+; ILP-NEXT:    movl $0, %r11d
+; ILP-NEXT:    sbbq %rdx, %r11
+; ILP-NEXT:    movl $0, %r10d
+; ILP-NEXT:    sbbq %rcx, %r10
+; ILP-NEXT:    movl $0, %edi
+; ILP-NEXT:    sbbq %r8, %rdi
+; ILP-NEXT:    orq %r8, %rdx
+; ILP-NEXT:    orq %rcx, %rsi
+; ILP-NEXT:    orq %rdx, %rsi
+; ILP-NEXT:    je .LBB4_1
+; ILP-NEXT:  # %bb.2: # %cond.false
+; ILP-NEXT:    bsrq %r11, %rdx
+; ILP-NEXT:    bsrq %rdi, %rcx
+; ILP-NEXT:    xorq $63, %rcx
+; ILP-NEXT:    bsrq %r10, %rsi
+; ILP-NEXT:    xorq $63, %rsi
+; ILP-NEXT:    addq $64, %rsi
+; ILP-NEXT:    testq %rdi, %rdi
+; ILP-NEXT:    cmovneq %rcx, %rsi
+; ILP-NEXT:    xorq $63, %rdx
+; ILP-NEXT:    bsrq %rbx, %rcx
+; ILP-NEXT:    xorq $63, %rcx
+; ILP-NEXT:    addq $64, %rcx
+; ILP-NEXT:    testq %r11, %r11
+; ILP-NEXT:    cmovneq %rdx, %rcx
+; ILP-NEXT:    subq $-128, %rcx
+; ILP-NEXT:    xorl %r9d, %r9d
+; ILP-NEXT:    orq %rdi, %r10
+; ILP-NEXT:    cmovneq %rsi, %rcx
+; ILP-NEXT:    jmp .LBB4_3
+; ILP-NEXT:  .LBB4_1:
+; ILP-NEXT:    movl $256, %ecx # imm = 0x100
+; ILP-NEXT:  .LBB4_3: # %cond.end
+; ILP-NEXT:    movq %rcx, (%rax)
+; ILP-NEXT:    movq %r9, 8(%rax)
+; ILP-NEXT:    movq %r9, 16(%rax)
+; ILP-NEXT:    movq %r9, 24(%rax)
+; ILP-NEXT:    popq %rbx
+; ILP-NEXT:    retq
+;
+; HYBRID-LABEL: PR25498:
+; HYBRID:       # %bb.0:
+; HYBRID-NEXT:    pushq %rbx
+; HYBRID-NEXT:    movq %rdi, %rax
+; HYBRID-NEXT:    xorl %r9d, %r9d
+; HYBRID-NEXT:    movq %rsi, %rbx
+; HYBRID-NEXT:    negq %rbx
+; HYBRID-NEXT:    movl $0, %r11d
+; HYBRID-NEXT:    sbbq %rdx, %r11
+; HYBRID-NEXT:    movl $0, %r10d
+; HYBRID-NEXT:    sbbq %rcx, %r10
+; HYBRID-NEXT:    movl $0, %edi
+; HYBRID-NEXT:    sbbq %r8, %rdi
+; HYBRID-NEXT:    orq %r8, %rdx
+; HYBRID-NEXT:    orq %rcx, %rsi
+; HYBRID-NEXT:    orq %rdx, %rsi
+; HYBRID-NEXT:    je .LBB4_1
+; HYBRID-NEXT:  # %bb.2: # %cond.false
+; HYBRID-NEXT:    bsrq %rdi, %rcx
+; HYBRID-NEXT:    xorq $63, %rcx
+; HYBRID-NEXT:    bsrq %r10, %rdx
+; HYBRID-NEXT:    xorq $63, %rdx
+; HYBRID-NEXT:    addq $64, %rdx
+; HYBRID-NEXT:    testq %rdi, %rdi
+; HYBRID-NEXT:    cmovneq %rcx, %rdx
+; HYBRID-NEXT:    bsrq %r11, %rsi
+; HYBRID-NEXT:    xorq $63, %rsi
+; HYBRID-NEXT:    bsrq %rbx, %rcx
+; HYBRID-NEXT:    xorq $63, %rcx
+; HYBRID-NEXT:    addq $64, %rcx
+; HYBRID-NEXT:    testq %r11, %r11
+; HYBRID-NEXT:    cmovneq %rsi, %rcx
+; HYBRID-NEXT:    subq $-128, %rcx
+; HYBRID-NEXT:    orq %rdi, %r10
+; HYBRID-NEXT:    cmovneq %rdx, %rcx
+; HYBRID-NEXT:    xorl %r9d, %r9d
+; HYBRID-NEXT:    jmp .LBB4_3
+; HYBRID-NEXT:  .LBB4_1:
+; HYBRID-NEXT:    movl $256, %ecx # imm = 0x100
+; HYBRID-NEXT:  .LBB4_3: # %cond.end
+; HYBRID-NEXT:    movq %rcx, (%rax)
+; HYBRID-NEXT:    movq %r9, 8(%rax)
+; HYBRID-NEXT:    movq %r9, 16(%rax)
+; HYBRID-NEXT:    movq %r9, 24(%rax)
+; HYBRID-NEXT:    popq %rbx
+; HYBRID-NEXT:    retq
+;
+; BURR-LABEL: PR25498:
+; BURR:       # %bb.0:
+; BURR-NEXT:    pushq %rbx
+; BURR-NEXT:    movq %rdi, %rax
+; BURR-NEXT:    xorl %r9d, %r9d
+; BURR-NEXT:    movq %rsi, %rbx
+; BURR-NEXT:    negq %rbx
+; BURR-NEXT:    movl $0, %r11d
+; BURR-NEXT:    sbbq %rdx, %r11
+; BURR-NEXT:    movl $0, %r10d
+; BURR-NEXT:    sbbq %rcx, %r10
+; BURR-NEXT:    movl $0, %edi
+; BURR-NEXT:    sbbq %r8, %rdi
+; BURR-NEXT:    orq %r8, %rdx
+; BURR-NEXT:    orq %rcx, %rsi
+; BURR-NEXT:    orq %rdx, %rsi
+; BURR-NEXT:    je .LBB4_1
+; BURR-NEXT:  # %bb.2: # %cond.false
+; BURR-NEXT:    bsrq %rdi, %rcx
+; BURR-NEXT:    xorq $63, %rcx
+; BURR-NEXT:    bsrq %r10, %rdx
+; BURR-NEXT:    xorq $63, %rdx
+; BURR-NEXT:    addq $64, %rdx
+; BURR-NEXT:    testq %rdi, %rdi
+; BURR-NEXT:    cmovneq %rcx, %rdx
+; BURR-NEXT:    bsrq %r11, %rsi
+; BURR-NEXT:    xorq $63, %rsi
+; BURR-NEXT:    bsrq %rbx, %rcx
+; BURR-NEXT:    xorq $63, %rcx
+; BURR-NEXT:    addq $64, %rcx
+; BURR-NEXT:    testq %r11, %r11
+; BURR-NEXT:    cmovneq %rsi, %rcx
+; BURR-NEXT:    subq $-128, %rcx
+; BURR-NEXT:    orq %rdi, %r10
+; BURR-NEXT:    cmovneq %rdx, %rcx
+; BURR-NEXT:    xorl %r9d, %r9d
+; BURR-NEXT:    jmp .LBB4_3
+; BURR-NEXT:  .LBB4_1:
+; BURR-NEXT:    movl $256, %ecx # imm = 0x100
+; BURR-NEXT:  .LBB4_3: # %cond.end
+; BURR-NEXT:    movq %rcx, (%rax)
+; BURR-NEXT:    movq %r9, 8(%rax)
+; BURR-NEXT:    movq %r9, 16(%rax)
+; BURR-NEXT:    movq %r9, 24(%rax)
+; BURR-NEXT:    popq %rbx
+; BURR-NEXT:    retq
+;
+; SRC-LABEL: PR25498:
+; SRC:       # %bb.0:
+; SRC-NEXT:    pushq %rbx
+; SRC-NEXT:    movq %rdi, %rax
+; SRC-NEXT:    xorl %r9d, %r9d
+; SRC-NEXT:    movq %rsi, %rbx
+; SRC-NEXT:    negq %rbx
+; SRC-NEXT:    movl $0, %r11d
+; SRC-NEXT:    sbbq %rdx, %r11
+; SRC-NEXT:    movl $0, %r10d
+; SRC-NEXT:    sbbq %rcx, %r10
+; SRC-NEXT:    movl $0, %edi
+; SRC-NEXT:    sbbq %r8, %rdi
+; SRC-NEXT:    orq %r8, %rdx
+; SRC-NEXT:    orq %rcx, %rsi
+; SRC-NEXT:    orq %rdx, %rsi
+; SRC-NEXT:    je .LBB4_1
+; SRC-NEXT:  # %bb.2: # %cond.false
+; SRC-NEXT:    bsrq %rdi, %rcx
+; SRC-NEXT:    xorq $63, %rcx
+; SRC-NEXT:    bsrq %r10, %rdx
+; SRC-NEXT:    xorq $63, %rdx
+; SRC-NEXT:    addq $64, %rdx
+; SRC-NEXT:    testq %rdi, %rdi
+; SRC-NEXT:    cmovneq %rcx, %rdx
+; SRC-NEXT:    bsrq %r11, %rsi
+; SRC-NEXT:    xorq $63, %rsi
+; SRC-NEXT:    bsrq %rbx, %rcx
+; SRC-NEXT:    xorq $63, %rcx
+; SRC-NEXT:    addq $64, %rcx
+; SRC-NEXT:    testq %r11, %r11
+; SRC-NEXT:    cmovneq %rsi, %rcx
+; SRC-NEXT:    subq $-128, %rcx
+; SRC-NEXT:    orq %rdi, %r10
+; SRC-NEXT:    cmovneq %rdx, %rcx
+; SRC-NEXT:    xorl %r9d, %r9d
+; SRC-NEXT:    jmp .LBB4_3
+; SRC-NEXT:  .LBB4_1:
+; SRC-NEXT:    movl $256, %ecx # imm = 0x100
+; SRC-NEXT:  .LBB4_3: # %cond.end
+; SRC-NEXT:    movq %rcx, (%rax)
+; SRC-NEXT:    movq %r9, 8(%rax)
+; SRC-NEXT:    movq %r9, 16(%rax)
+; SRC-NEXT:    movq %r9, 24(%rax)
+; SRC-NEXT:    popq %rbx
+; SRC-NEXT:    retq
+;
+; LIN-LABEL: PR25498:
+; LIN:       # %bb.0:
+; LIN-NEXT:    pushq %rbx
+; LIN-NEXT:    movq %rdi, %rax
+; LIN-NEXT:    movq %rsi, %rbx
+; LIN-NEXT:    negq %rbx
+; LIN-NEXT:    xorl %r9d, %r9d
+; LIN-NEXT:    movl $0, %edi
+; LIN-NEXT:    sbbq %rdx, %rdi
+; LIN-NEXT:    movl $0, %r10d
+; LIN-NEXT:    sbbq %rcx, %r10
+; LIN-NEXT:    movl $0, %r11d
+; LIN-NEXT:    sbbq %r8, %r11
+; LIN-NEXT:    orq %rcx, %rsi
+; LIN-NEXT:    orq %r8, %rdx
+; LIN-NEXT:    orq %rsi, %rdx
+; LIN-NEXT:    je .LBB4_1
+; LIN-NEXT:  # %bb.2: # %cond.false
+; LIN-NEXT:    bsrq %rbx, %rcx
+; LIN-NEXT:    xorq $63, %rcx
+; LIN-NEXT:    addq $64, %rcx
+; LIN-NEXT:    bsrq %rdi, %rdx
+; LIN-NEXT:    xorq $63, %rdx
+; LIN-NEXT:    testq %rdi, %rdi
+; LIN-NEXT:    cmoveq %rcx, %rdx
+; LIN-NEXT:    subq $-128, %rdx
+; LIN-NEXT:    bsrq %r10, %rsi
+; LIN-NEXT:    xorq $63, %rsi
+; LIN-NEXT:    addq $64, %rsi
+; LIN-NEXT:    bsrq %r11, %rcx
+; LIN-NEXT:    xorq $63, %rcx
+; LIN-NEXT:    testq %r11, %r11
+; LIN-NEXT:    cmoveq %rsi, %rcx
+; LIN-NEXT:    orq %r11, %r10
+; LIN-NEXT:    cmoveq %rdx, %rcx
+; LIN-NEXT:    xorl %r9d, %r9d
+; LIN-NEXT:    jmp .LBB4_3
+; LIN-NEXT:  .LBB4_1:
+; LIN-NEXT:    movl $256, %ecx # imm = 0x100
+; LIN-NEXT:  .LBB4_3: # %cond.end
+; LIN-NEXT:    movq %rcx, (%rax)
+; LIN-NEXT:    movq %r9, 8(%rax)
+; LIN-NEXT:    movq %r9, 16(%rax)
+; LIN-NEXT:    movq %r9, 24(%rax)
+; LIN-NEXT:    popq %rbx
+; LIN-NEXT:    retq
+  %b = sub i256 0, %a
+  %cmpz = icmp eq i256 %b, 0
+  br i1 %cmpz, label %cond.end, label %cond.false
+
+cond.false:
+  %d = call i256 @llvm.ctlz.i256(i256 %b, i1 true)
+  br label %cond.end
+
+cond.end:
+  %ctz = phi i256 [ 256, %0 ], [ %d, %cond.false ]
+  ret i256 %ctz
+}
+
-- 
GitLab


From 56e3b243da925db22245cf529f7393dac8ec6eb6 Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Mon, 22 Oct 2018 21:17:56 +0000
Subject: [PATCH 0405/1116] [ORC] Guard access to the MemMgrs vector in
 RTDyldObjectLinkingLayer.

Otherwise we can end up with a data-race when linking concurrently.

This should fix an intermittent failure in the multiple-compile-threads-basic.ll
testcase.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344956 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../Orc/RTDyldObjectLinkingLayer.cpp                | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp b/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp
index 8511e41c4f2..616251c7e00 100644
--- a/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp
+++ b/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp
@@ -121,8 +121,15 @@ void RTDyldObjectLinkingLayer::emit(MaterializationResponsibility R,
   }
 
   auto K = R.getVModuleKey();
-  MemMgrs.push_back(GetMemoryManager());
-  auto &MemMgr = *MemMgrs.back();
+  RuntimeDyld::MemoryManager *MemMgr = nullptr;
+
+  // Create a record a memory manager for this object.
+  {
+    auto Tmp = GetMemoryManager();
+    std::lock_guard<std::mutex> Lock(RTDyldLayerMutex);
+    MemMgrs.push_back(std::move(Tmp));
+    MemMgr = MemMgrs.back().get();
+  }
 
   JITDylibSearchOrderResolver Resolver(*SharedR);
 
@@ -134,7 +141,7 @@ void RTDyldObjectLinkingLayer::emit(MaterializationResponsibility R,
    * duplicate defs.
    */
   jitLinkForORC(
-      **Obj, std::move(O), MemMgr, Resolver, ProcessAllSections,
+      **Obj, std::move(O), *MemMgr, Resolver, ProcessAllSections,
       [this, K, SharedR, &Obj, InternalSymbols](
           std::unique_ptr<RuntimeDyld::LoadedObjectInfo> LoadedObjInfo,
           std::map<StringRef, JITEvaluatedSymbol> ResolvedSymbols) {
-- 
GitLab


From 353c741e8e2dd610af55553cb460b3c10f0fcaa5 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Mon, 22 Oct 2018 21:37:02 +0000
Subject: [PATCH 0406/1116] [Reassociate] add 'using namespace' to reduce
 bloat; NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344959 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/Reassociate.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp
index 03cd7c10150..78e0fe47be0 100644
--- a/lib/Transforms/Scalar/Reassociate.cpp
+++ b/lib/Transforms/Scalar/Reassociate.cpp
@@ -63,6 +63,7 @@
 
 using namespace llvm;
 using namespace reassociate;
+using namespace PatternMatch;
 
 #define DEBUG_TYPE "reassociate"
 
@@ -125,10 +126,10 @@ XorOpnd::XorOpnd(Value *V) {
     Value *V0 = I->getOperand(0);
     Value *V1 = I->getOperand(1);
     const APInt *C;
-    if (match(V0, PatternMatch::m_APInt(C)))
+    if (match(V0, m_APInt(C)))
       std::swap(V0, V1);
 
-    if (match(V1, PatternMatch::m_APInt(C))) {
+    if (match(V1, m_APInt(C))) {
       ConstPart = *C;
       SymbolicPart = V0;
       isOr = (I->getOpcode() == Instruction::Or);
@@ -1304,7 +1305,7 @@ Value *ReassociatePass::OptimizeXor(Instruction *I,
     Value *V = Ops[i].Op;
     const APInt *C;
     // TODO: Support non-splat vectors.
-    if (match(V, PatternMatch::m_APInt(C))) {
+    if (match(V, m_APInt(C))) {
       ConstOpnd ^= *C;
     } else {
       XorOpnd O(V);
-- 
GitLab


From 8266f03bf7f9c694dbf2965dc0a1a755da483e03 Mon Sep 17 00:00:00 2001
From: Vedant Kumar <vsk@apple.com>
Date: Mon, 22 Oct 2018 21:44:21 +0000
Subject: [PATCH 0407/1116] [DWARF] Use a function-local offset for
 AT_call_return_pc

Logs provided by @stella.stamenova indicate that on Linux, lldb adds a
spurious slide offset to the return PC it loads from AT_call_return_pc
attributes (see the list thread: "[PATCH] D50478: Add support for
artificial tail call frames").

This patch side-steps the issue by getting rid of the load address
calculation in lldb's CallEdge::GetReturnPCAddress.

The idea is to have the DWARF writer emit function-local offsets to the
instruction after a call. I.e. return-pc = label-after-call-insn -
function-entry. LLDB can simply add this offset to the base address of a
function to get the return PC.

Differential Revision: https://reviews.llvm.org/D53469

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344960 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp | 15 +++++++++++++++
 lib/CodeGen/AsmPrinter/DebugHandlerBase.h   |  4 ++++
 lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp | 12 +++++++++---
 lib/CodeGen/AsmPrinter/DwarfCompileUnit.h   |  9 ++++++---
 lib/CodeGen/AsmPrinter/DwarfDebug.cpp       |  7 ++++---
 5 files changed, 38 insertions(+), 9 deletions(-)

diff --git a/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
index 580f682b9a6..a362dd40e3b 100644
--- a/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
+++ b/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
@@ -125,6 +125,21 @@ MCSymbol *DebugHandlerBase::getLabelAfterInsn(const MachineInstr *MI) {
   return LabelsAfterInsn.lookup(MI);
 }
 
+// Return the function-local offset of an instruction.
+const MCExpr *
+DebugHandlerBase::getFunctionLocalOffsetAfterInsn(const MachineInstr *MI) {
+  MCContext &MC = Asm->OutContext;
+
+  MCSymbol *Start = Asm->getFunctionBegin();
+  const auto *StartRef = MCSymbolRefExpr::create(Start, MC);
+
+  MCSymbol *AfterInsn = getLabelAfterInsn(MI);
+  assert(AfterInsn && "Expected label after instruction");
+  const auto *AfterRef = MCSymbolRefExpr::create(AfterInsn, MC);
+
+  return MCBinaryExpr::createSub(AfterRef, StartRef, MC);
+}
+
 /// If this type is derived from a base type then return base type size.
 uint64_t DebugHandlerBase::getBaseTypeSize(const DITypeRef TyRef) {
   DIType *Ty = TyRef.resolve();
diff --git a/lib/CodeGen/AsmPrinter/DebugHandlerBase.h b/lib/CodeGen/AsmPrinter/DebugHandlerBase.h
index 4b0ce0e3f03..cdf8dc72b07 100644
--- a/lib/CodeGen/AsmPrinter/DebugHandlerBase.h
+++ b/lib/CodeGen/AsmPrinter/DebugHandlerBase.h
@@ -125,6 +125,10 @@ public:
   /// Return Label immediately following the instruction.
   MCSymbol *getLabelAfterInsn(const MachineInstr *MI);
 
+  /// Return the function-local offset of an instruction. A label for the
+  /// instruction \p MI should exist (\ref getLabelAfterInsn).
+  const MCExpr *getFunctionLocalOffsetAfterInsn(const MachineInstr *MI);
+
   /// If this type is derived from a base type then return base type size.
   static uint64_t getBaseTypeSize(const DITypeRef TyRef);
 };
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index 81eb0c2aa9e..1d9c1d38a24 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -821,7 +821,7 @@ void DwarfCompileUnit::constructAbstractSubprogramScopeDIE(
 DIE &DwarfCompileUnit::constructCallSiteEntryDIE(DIE &ScopeDIE,
                                                  const DISubprogram &CalleeSP,
                                                  bool IsTail,
-                                                 const MCSymbol *ReturnPC) {
+                                                 const MCExpr *PCOffset) {
   // Insert a call site entry DIE within ScopeDIE.
   DIE &CallSiteDIE =
       createAndAddDIE(dwarf::DW_TAG_call_site, ScopeDIE, nullptr);
@@ -838,8 +838,8 @@ DIE &DwarfCompileUnit::constructCallSiteEntryDIE(DIE &ScopeDIE,
   } else {
     // Attach the return PC to allow the debugger to disambiguate call paths
     // from one function to another.
-    assert(ReturnPC && "Missing return PC information for a call");
-    addLabelAddress(CallSiteDIE, dwarf::DW_AT_call_return_pc, ReturnPC);
+    assert(PCOffset && "Missing return PC information for a call");
+    addAddressExpr(CallSiteDIE, dwarf::DW_AT_call_return_pc, PCOffset);
   }
   return CallSiteDIE;
 }
@@ -1103,6 +1103,12 @@ void DwarfCompileUnit::addExpr(DIELoc &Die, dwarf::Form Form,
   Die.addValue(DIEValueAllocator, (dwarf::Attribute)0, Form, DIEExpr(Expr));
 }
 
+void DwarfCompileUnit::addAddressExpr(DIE &Die, dwarf::Attribute Attribute,
+                                      const MCExpr *Expr) {
+  Die.addValue(DIEValueAllocator, Attribute, dwarf::DW_FORM_addr,
+               DIEExpr(Expr));
+}
+
 void DwarfCompileUnit::applySubprogramAttributesToDefinition(
     const DISubprogram *SP, DIE &SPDie) {
   auto *SPDecl = SP->getDeclaration();
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
index 97a944e9b95..13679c37fe5 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
@@ -210,10 +210,10 @@ public:
 
   /// Construct a call site entry DIE describing a call within \p Scope to a
   /// callee described by \p CalleeSP. \p IsTail specifies whether the call is
-  /// a tail call. \p ReturnPC must be non-null for non-tail calls and point
-  /// to the PC value after the call returns.
+  /// a tail call. \p PCOffset must be non-zero for non-tail calls or be the
+  /// function-local offset to PC value after the call instruction.
   DIE &constructCallSiteEntryDIE(DIE &ScopeDIE, const DISubprogram &CalleeSP,
-                                 bool IsTail, const MCSymbol *ReturnPC);
+                                 bool IsTail, const MCExpr *PCOffset);
 
   /// Construct import_module DIE.
   DIE *constructImportedEntityDIE(const DIImportedEntity *Module);
@@ -292,6 +292,9 @@ public:
   /// Add a Dwarf expression attribute data and value.
   void addExpr(DIELoc &Die, dwarf::Form Form, const MCExpr *Expr);
 
+  /// Add an attribute containing an address expression to \p Die.
+  void addAddressExpr(DIE &Die, dwarf::Attribute Attribute, const MCExpr *Expr);
+
   void applySubprogramAttributesToDefinition(const DISubprogram *SP,
                                              DIE &SPDie);
 
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 2f14f5464fd..5f91674d9f0 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -548,14 +548,15 @@ void DwarfDebug::constructCallSiteEntryDIEs(const DISubprogram &SP,
       // For tail calls, no return PC information is needed. For regular calls,
       // the return PC is needed to disambiguate paths in the call graph which
       // could lead to some target function.
-      const MCSymbol *ReturnPC = IsTail ? nullptr : getLabelAfterInsn(&MI);
+      const MCExpr *PCOffset =
+          IsTail ? nullptr : getFunctionLocalOffsetAfterInsn(&MI);
 
-      assert((IsTail || ReturnPC) && "Call without return PC information");
+      assert((IsTail || PCOffset) && "Call without return PC information");
       LLVM_DEBUG(dbgs() << "CallSiteEntry: " << MF.getName() << " -> "
                         << CalleeDecl->getName() << (IsTail ? " [tail]" : "")
                         << "\n");
       CU.constructCallSiteEntryDIE(ScopeDIE, *CalleeDecl->getSubprogram(),
-                                   IsTail, ReturnPC);
+                                   IsTail, PCOffset);
     }
   }
 }
-- 
GitLab


From fbc926ad9aca9c121b1c5b57c1f2bdece9010494 Mon Sep 17 00:00:00 2001
From: Thomas Lively <tlively@google.com>
Date: Mon, 22 Oct 2018 21:55:26 +0000
Subject: [PATCH 0408/1116] [WebAssembly][NFC] Remove WebAssemblyStackifier
 TableGen backend

Summary:
Replace its functionality with a TableGen InstrInfo relational
instruction mapping. Although arguably more complex than the TableGen
backend, the relational mapping is a smaller maintenance burden than a
TableGen backend.

Reviewers: aardappel, aheejin, dschuff

Subscribers: mgorny, sbc100, jgravelle-google, sunfish, llvm-commits

Differential Revision: https://reviews.llvm.org/D53307

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344962 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/WebAssembly/CMakeLists.txt         |  1 -
 .../WebAssembly/WebAssemblyInstrControl.td    | 10 +++--
 .../WebAssembly/WebAssemblyInstrFormats.td    | 17 ++++---
 .../WebAssembly/WebAssemblyInstrInfo.td       | 13 ++++++
 .../WebAssembly/WebAssemblyMCInstLower.cpp    | 27 +++---------
 utils/TableGen/CMakeLists.txt                 |  1 -
 utils/TableGen/TableGen.cpp                   |  8 +---
 utils/TableGen/TableGenBackends.h             |  1 -
 .../WebAssemblyDisassemblerEmitter.cpp        | 13 +++---
 .../TableGen/WebAssemblyStackifierEmitter.cpp | 44 -------------------
 10 files changed, 44 insertions(+), 91 deletions(-)
 delete mode 100644 utils/TableGen/WebAssemblyStackifierEmitter.cpp

diff --git a/lib/Target/WebAssembly/CMakeLists.txt b/lib/Target/WebAssembly/CMakeLists.txt
index 2af5b9cb23d..549229ad572 100644
--- a/lib/Target/WebAssembly/CMakeLists.txt
+++ b/lib/Target/WebAssembly/CMakeLists.txt
@@ -9,7 +9,6 @@ tablegen(LLVM WebAssemblyGenInstrInfo.inc -gen-instr-info)
 tablegen(LLVM WebAssemblyGenMCCodeEmitter.inc -gen-emitter)
 tablegen(LLVM WebAssemblyGenRegisterInfo.inc -gen-register-info)
 tablegen(LLVM WebAssemblyGenSubtargetInfo.inc -gen-subtarget)
-tablegen(LLVM WebAssemblyGenStackifier.inc -gen-wasm-stackifier)
 
 add_public_tablegen_target(WebAssemblyCommonTableGen)
 
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/lib/Target/WebAssembly/WebAssemblyInstrControl.td
index ed9879ae454..be9cdc59a69 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrControl.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrControl.td
@@ -43,26 +43,28 @@ def : Pat<(brcond (i32 (seteq I32:$cond, 0)), bb:$dst),
 let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in {
 let isCodeGenOnly = 1 in
 def BR_TABLE_I32 : NI<(outs), (ins I32:$index, variable_ops),
-                      [(WebAssemblybr_table I32:$index)], 0,
+                      [(WebAssemblybr_table I32:$index)], "false",
                       "br_table \t$index", 0x0e> {
   let TSFlags{0} = 1;
   let TSFlags{1} = 1;
 }
+let BaseName = "BR_TABLE_I32" in
 def BR_TABLE_I32_S : NI<(outs), (ins variable_ops),
-                        [], 1,
+                        [], "true",
                         "br_table", 0x0e> {
   let TSFlags{0} = 1;
   let TSFlags{1} = 1;
 }
 let isCodeGenOnly = 1 in
 def BR_TABLE_I64 : NI<(outs), (ins I64:$index, variable_ops),
-                      [(WebAssemblybr_table I64:$index)], 0,
+                      [(WebAssemblybr_table I64:$index)], "false",
                       "br_table \t$index"> {
   let TSFlags{0} = 1;
   let TSFlags{1} = 1;
 }
+let BaseName = "BR_TABLE_I64" in
 def BR_TABLE_I64_S : NI<(outs), (ins variable_ops),
-                        [], 1,
+                        [], "true",
                         "br_table"> {
   let TSFlags{0} = 1;
   let TSFlags{1} = 1;
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrFormats.td b/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
index 2d23acfc825..97583ea0e6a 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
@@ -15,17 +15,19 @@
 // WebAssembly Instruction Format.
 // We instantiate 2 of these for every actual instruction (register based
 // and stack based), see below.
-class WebAssemblyInst<bits<32> inst, string asmstr, bit stack> : Instruction {
-  field bits<32> Inst = inst; // Instruction encoding.
-  field bit StackBased = stack;
+class WebAssemblyInst<bits<32> inst, string asmstr, string stack> : StackRel,
+  Instruction {
+  bits<32> Inst = inst; // Instruction encoding.
+  string StackBased = stack;
+  string BaseName = NAME;
   let Namespace   = "WebAssembly";
   let Pattern     = [];
   let AsmString   = asmstr;
 }
 
 // Normal instructions. Default instantiation of a WebAssemblyInst.
-class NI<dag oops, dag iops, list<dag> pattern, bit stack, string asmstr = "",
-         bits<32> inst = -1>
+class NI<dag oops, dag iops, list<dag> pattern, string stack,
+         string asmstr = "", bits<32> inst = -1>
     : WebAssemblyInst<inst, asmstr, stack> {
   dag OutOperandList = oops;
   dag InOperandList  = iops;
@@ -50,8 +52,9 @@ multiclass I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
              list<dag> pattern_r, string asmstr_r = "", string asmstr_s = "",
              bits<32> inst = -1> {
   let isCodeGenOnly = 1 in
-  def "" : NI<oops_r, iops_r, pattern_r, 0, asmstr_r, inst>;
-  def _S : NI<oops_s, iops_s, [], 1, asmstr_s, inst>;
+  def "" : NI<oops_r, iops_r, pattern_r, "false", asmstr_r, inst>;
+  let BaseName = NAME in
+  def _S : NI<oops_s, iops_s, [], "true", asmstr_s, inst>;
 }
 
 // For instructions that have no register ops, so both sets are the same.
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
index 8d98510c67d..c5b41983245 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -153,6 +153,19 @@ def TypeIndex : Operand<i32>;
 
 } // OperandNamespace = "WebAssembly"
 
+//===----------------------------------------------------------------------===//
+// WebAssembly Register to Stack instruction mapping
+//===----------------------------------------------------------------------===//
+
+class StackRel;
+def getStackOpcode : InstrMapping {
+  let FilterClass = "StackRel";
+  let RowFields = ["BaseName"];
+  let ColFields = ["StackBased"];
+  let KeyCol = ["false"];
+  let ValueCols = [["true"]];
+}
+
 //===----------------------------------------------------------------------===//
 // WebAssembly Instruction Format Definitions.
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
index e9a0cf51905..ebd374762ae 100644
--- a/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
@@ -30,6 +30,11 @@
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
+// Defines llvm::WebAssembly::getStackOpcode to convert register instructions to
+// stack instructions
+#define GET_INSTRMAP_INFO 1
+#include "WebAssemblyGenInstrInfo.inc"
+
 // This disables the removal of registers when lowering into MC, as required
 // by some current tests.
 static cl::opt<bool>
@@ -38,7 +43,6 @@ static cl::opt<bool>
                                " instruction output for test purposes only."),
                       cl::init(false));
 
-static unsigned regInstructionToStackInstruction(unsigned OpCode);
 static void removeRegisterOperands(const MachineInstr *MI, MCInst &OutMI);
 
 MCSymbol *
@@ -254,7 +258,8 @@ static void removeRegisterOperands(const MachineInstr *MI, MCInst &OutMI) {
 
   // Transform to _S instruction.
   auto RegOpcode = OutMI.getOpcode();
-  auto StackOpcode = regInstructionToStackInstruction(RegOpcode);
+  auto StackOpcode = WebAssembly::getStackOpcode(RegOpcode);
+  assert(StackOpcode != -1 && "Failed to stackify instruction");
   OutMI.setOpcode(StackOpcode);
 
   // Remove register operands.
@@ -265,21 +270,3 @@ static void removeRegisterOperands(const MachineInstr *MI, MCInst &OutMI) {
     }
   }
 }
-
-static unsigned regInstructionToStackInstruction(unsigned OpCode) {
-  // For most opcodes, this function could have been implemented as "return
-  // OpCode + 1", but since table-gen alphabetically sorts them, this cannot be
-  // guaranteed (see e.g. BR and BR_IF). Instead we use a giant switch statement
-  // generated by a custom TableGen backend (WebAssemblyStackifierEmitter.cpp)
-  // that emits switch cases of the form
-  //
-  //   case WebAssembly::RegisterInstr: return WebAssembly::StackInstr;
-  //
-  // for every pair of equivalent register and stack instructions.
-  switch (OpCode) {
-  default:
-    llvm_unreachable(
-        "unknown WebAssembly instruction in WebAssemblyMCInstLower pass");
-#include "WebAssemblyGenStackifier.inc"
-  }
-}
diff --git a/utils/TableGen/CMakeLists.txt b/utils/TableGen/CMakeLists.txt
index 5ac3eca4c68..0428249f917 100644
--- a/utils/TableGen/CMakeLists.txt
+++ b/utils/TableGen/CMakeLists.txt
@@ -46,7 +46,6 @@ add_tablegen(llvm-tblgen LLVM
   X86ModRMFilters.cpp
   X86RecognizableInstr.cpp
   WebAssemblyDisassemblerEmitter.cpp
-  WebAssemblyStackifierEmitter.cpp
   CTagsEmitter.cpp
   )
 set_target_properties(llvm-tblgen PROPERTIES FOLDER "Tablegenning")
diff --git a/utils/TableGen/TableGen.cpp b/utils/TableGen/TableGen.cpp
index 9e526b6d8f5..b78260625cb 100644
--- a/utils/TableGen/TableGen.cpp
+++ b/utils/TableGen/TableGen.cpp
@@ -53,7 +53,6 @@ enum ActionType {
   GenX86EVEX2VEXTables,
   GenX86FoldTables,
   GenRegisterBank,
-  GenWebAssemblyStackifier,
 };
 
 namespace {
@@ -118,9 +117,7 @@ namespace {
                     clEnumValN(GenX86FoldTables, "gen-x86-fold-tables",
                                "Generate X86 fold tables"),
                     clEnumValN(GenRegisterBank, "gen-register-bank",
-                               "Generate registers bank descriptions"),
-                    clEnumValN(GenWebAssemblyStackifier, "gen-wasm-stackifier",
-                               "Generate WebAssembly stackification cases")));
+                               "Generate registers bank descriptions")));
 
   cl::OptionCategory PrintEnumsCat("Options for -print-enums");
   cl::opt<std::string>
@@ -234,9 +231,6 @@ bool LLVMTableGenMain(raw_ostream &OS, RecordKeeper &Records) {
   case GenX86FoldTables:
     EmitX86FoldTables(Records, OS);
     break;
-  case GenWebAssemblyStackifier:
-    EmitWebAssemblyStackifier(Records, OS);
-    break;
   }
 
   return false;
diff --git a/utils/TableGen/TableGenBackends.h b/utils/TableGen/TableGenBackends.h
index f7ed5cc87d3..1329a6d833f 100644
--- a/utils/TableGen/TableGenBackends.h
+++ b/utils/TableGen/TableGenBackends.h
@@ -89,7 +89,6 @@ void EmitGlobalISel(RecordKeeper &RK, raw_ostream &OS);
 void EmitX86EVEX2VEXTables(RecordKeeper &RK, raw_ostream &OS);
 void EmitX86FoldTables(RecordKeeper &RK, raw_ostream &OS);
 void EmitRegisterBank(RecordKeeper &RK, raw_ostream &OS);
-void EmitWebAssemblyStackifier(RecordKeeper &RK, raw_ostream &OS);
 
 } // End llvm namespace
 
diff --git a/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp b/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp
index f9c3cb12f85..a8edfdc623f 100644
--- a/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp
+++ b/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp
@@ -42,12 +42,13 @@ void emitWebAssemblyDisassemblerTables(
     auto Prefix = Opc >> 8;
     Opc = Opc & 0xFF;
     auto &CGIP = OpcodeTable[Prefix][Opc];
-    // All wasm instructions have a StackBased fieldof type bit, we only want
-    // the instructions for which this is 1.
-    auto Bit = Def.getValue("StackBased")->getValue()->
-                 getCastTo(BitRecTy::get());
-    auto IsStackBased = Bit && reinterpret_cast<const BitInit *>(Bit)
-                                 ->getValue();
+    // All wasm instructions have a StackBased field of type string, we only
+    // want the instructions for which this is "true".
+    auto StackString =
+        Def.getValue("StackBased")->getValue()->getCastTo(StringRecTy::get());
+    auto IsStackBased =
+        StackString &&
+        reinterpret_cast<const StringInit *>(StackString)->getValue() == "true";
     if (IsStackBased && !CGIP.second) {
       // this picks the first of many typed variants, which is
       // currently the except_ref one, though this shouldn't matter for
diff --git a/utils/TableGen/WebAssemblyStackifierEmitter.cpp b/utils/TableGen/WebAssemblyStackifierEmitter.cpp
deleted file mode 100644
index 0b9741d22b8..00000000000
--- a/utils/TableGen/WebAssemblyStackifierEmitter.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-//===- WebAssemblyStackifierEmitter.cpp - Stackifier cases ------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file emits the switch statement cases to translate WebAssembly
-// instructions to their stack forms.
-//
-//===----------------------------------------------------------------------===//
-
-#include "WebAssemblyDisassemblerEmitter.h"
-#include "llvm/TableGen/Record.h"
-
-namespace llvm {
-
-// Find all register WebAssembly instructions and their corresponding stack
-// instructions. For each pair, emit a switch case of the form
-//
-//   case WebAssembly::RegisterInstr: return WebAssembly::StackInstr;
-//
-// For example,
-//
-//   case WebAssembly::ADD_I32: return WebAssembly::ADD_I32_S;
-//
-// This is useful for converting instructions from their register form to their
-// equivalent stack form.
-void EmitWebAssemblyStackifier(RecordKeeper &RK, raw_ostream &OS) {
-  Record *InstrClass = RK.getClass("WebAssemblyInst");
-  for (auto &RecordPair : RK.getDefs()) {
-    if (!RecordPair.second->isSubClassOf(InstrClass))
-      continue;
-    bool IsStackBased = RecordPair.second->getValueAsBit("StackBased");
-    if (IsStackBased)
-      continue;
-    OS << "  case WebAssembly::" << RecordPair.first << ": return "
-       << "WebAssembly::" << RecordPair.first << "_S;\n";
-  }
-}
-
-} // namespace llvm
-- 
GitLab


From 5528b0e4848be430325d982c8239264e38a9b9b9 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Mon, 22 Oct 2018 22:04:13 +0000
Subject: [PATCH 0409/1116] [Reassociate] add vector tests with undef elements;
 NFC

Also, regenerate checks for these files. We should do better
on the vector tests by using the PatternMatch API instead of
BinaryOperator::isNot/isNeg.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344964 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Transforms/Reassociate/inverses.ll  | 77 +++++++++++++++---------
 test/Transforms/Reassociate/negation.ll  | 66 ++++++++++++++++----
 test/Transforms/Reassociate/negation1.ll |  7 ++-
 3 files changed, 107 insertions(+), 43 deletions(-)

diff --git a/test/Transforms/Reassociate/inverses.ll b/test/Transforms/Reassociate/inverses.ll
index 8500cd867fd..15c77206e72 100644
--- a/test/Transforms/Reassociate/inverses.ll
+++ b/test/Transforms/Reassociate/inverses.ll
@@ -1,46 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -reassociate -die -S | FileCheck %s
 
+; (A&B)&~A == 0
 define i32 @test1(i32 %a, i32 %b) {
-	%tmp.2 = and i32 %b, %a
-	%tmp.4 = xor i32 %a, -1
-        ; (A&B)&~A == 0
-	%tmp.5 = and i32 %tmp.2, %tmp.4
-	ret i32 %tmp.5
 ; CHECK-LABEL: @test1(
-; CHECK: ret i32 0
+; CHECK-NEXT:    ret i32 0
+;
+  %t2 = and i32 %b, %a
+  %t4 = xor i32 %a, -1
+  %t5 = and i32 %t2, %t4
+  ret i32 %t5
 }
 
+define <2 x i32> @not_op_vec_undef(<2 x i32> %a, <2 x i32> %b) {
+; CHECK-LABEL: @not_op_vec_undef(
+; CHECK-NEXT:    [[T2:%.*]] = and <2 x i32> [[B:%.*]], [[A:%.*]]
+; CHECK-NEXT:    [[T4:%.*]] = xor <2 x i32> [[A]], <i32 -1, i32 undef>
+; CHECK-NEXT:    [[T5:%.*]] = and <2 x i32> [[T2]], [[T4]]
+; CHECK-NEXT:    ret <2 x i32> [[T5]]
+;
+  %t2 = and <2 x i32> %b, %a
+  %t4 = xor <2 x i32> %a, <i32 -1, i32 undef>
+  %t5 = and <2 x i32> %t2, %t4
+  ret <2 x i32> %t5
+}
+
+; A&~A == 0
 define i32 @test2(i32 %a, i32 %b) {
-	%tmp.1 = and i32 %a, 1234
-	%tmp.2 = and i32 %b, %tmp.1
-	%tmp.4 = xor i32 %a, -1
-	; A&~A == 0
-        %tmp.5 = and i32 %tmp.2, %tmp.4
-	ret i32 %tmp.5
 ; CHECK-LABEL: @test2(
-; CHECK: ret i32 0
+; CHECK-NEXT:    ret i32 0
+;
+  %t1 = and i32 %a, 1234
+  %t2 = and i32 %b, %t1
+  %t4 = xor i32 %a, -1
+  %t5 = and i32 %t2, %t4
+  ret i32 %t5
 }
 
+; (b+(a+1234))+-a -> b+1234
 define i32 @test3(i32 %b, i32 %a) {
-	%tmp.1 = add i32 %a, 1234
-	%tmp.2 = add i32 %b, %tmp.1
-	%tmp.4 = sub i32 0, %a
-        ; (b+(a+1234))+-a -> b+1234
-  	%tmp.5 = add i32 %tmp.2, %tmp.4
-	ret i32 %tmp.5
 ; CHECK-LABEL: @test3(
-; CHECK: %tmp.5 = add i32 %b, 1234
-; CHECK: ret i32 %tmp.5
+; CHECK-NEXT:    [[T5:%.*]] = add i32 [[B:%.*]], 1234
+; CHECK-NEXT:    ret i32 [[T5]]
+;
+  %t1 = add i32 %a, 1234
+  %t2 = add i32 %b, %t1
+  %t4 = sub i32 0, %a
+  %t5 = add i32 %t2, %t4
+  ret i32 %t5
 }
 
+; (b+(a+1234))+~a -> b+1233
 define i32 @test4(i32 %b, i32 %a) {
-        %tmp.1 = add i32 %a, 1234
-        %tmp.2 = add i32 %b, %tmp.1
-        %tmp.4 = xor i32 %a, -1
-        ; (b+(a+1234))+~a -> b+1233
-        %tmp.5 = add i32 %tmp.2, %tmp.4
-        ret i32 %tmp.5
 ; CHECK-LABEL: @test4(
-; CHECK: %tmp.5 = add i32 %b, 1233
-; CHECK: ret i32 %tmp.5
+; CHECK-NEXT:    [[T5:%.*]] = add i32 [[B:%.*]], 1233
+; CHECK-NEXT:    ret i32 [[T5]]
+;
+  %t1 = add i32 %a, 1234
+  %t2 = add i32 %b, %t1
+  %t4 = xor i32 %a, -1
+  %t5 = add i32 %t2, %t4
+  ret i32 %t5
 }
+
diff --git a/test/Transforms/Reassociate/negation.ll b/test/Transforms/Reassociate/negation.ll
index 12d2c86192b..e1f9a421a9c 100644
--- a/test/Transforms/Reassociate/negation.ll
+++ b/test/Transforms/Reassociate/negation.ll
@@ -1,14 +1,15 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -reassociate -instcombine -S | FileCheck %s
 
 ; Test that we can turn things like X*-(Y*Z) -> X*-1*Y*Z.
 
 define i32 @test1(i32 %a, i32 %b, i32 %z) {
-; CHECK-LABEL: test1
-; CHECK-NEXT: %e = mul i32 %a, 12345
-; CHECK-NEXT: %f = mul i32 %e, %b
-; CHECK-NEXT: %g = mul i32 %f, %z
-; CHECK-NEXT: ret i32 %g
-
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    [[E:%.*]] = mul i32 [[A:%.*]], 12345
+; CHECK-NEXT:    [[F:%.*]] = mul i32 [[E]], [[B:%.*]]
+; CHECK-NEXT:    [[G:%.*]] = mul i32 [[F]], [[Z:%.*]]
+; CHECK-NEXT:    ret i32 [[G]]
+;
   %c = sub i32 0, %z
   %d = mul i32 %a, %b
   %e = mul i32 %c, %d
@@ -18,14 +19,57 @@ define i32 @test1(i32 %a, i32 %b, i32 %z) {
 }
 
 define i32 @test2(i32 %a, i32 %b, i32 %z) {
-; CHECK-LABEL: test2
-; CHECK-NEXT: %e = mul i32 %a, 40
-; CHECK-NEXT: %f = mul i32 %e, %z
-; CHECK-NEXT: ret i32 %f
-
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    [[E:%.*]] = mul i32 [[A:%.*]], 40
+; CHECK-NEXT:    [[F:%.*]] = mul i32 [[E]], [[Z:%.*]]
+; CHECK-NEXT:    ret i32 [[F]]
+;
   %d = mul i32 %z, 40
   %c = sub i32 0, %d
   %e = mul i32 %a, %c
   %f = sub i32 0, %e
   ret i32 %f
 }
+
+define <2 x i32> @negate_vec_undefs(<2 x i32> %a, <2 x i32> %b, <2 x i32> %z) {
+; CHECK-LABEL: @negate_vec_undefs(
+; CHECK-NEXT:    [[TMP1:%.*]] = mul <2 x i32> [[Z:%.*]], <i32 -40, i32 -40>
+; CHECK-NEXT:    [[E:%.*]] = mul <2 x i32> [[TMP1]], [[A:%.*]]
+; CHECK-NEXT:    [[F:%.*]] = sub <2 x i32> <i32 0, i32 undef>, [[E]]
+; CHECK-NEXT:    ret <2 x i32> [[F]]
+;
+  %d = mul <2 x i32> %z, <i32 40, i32 40>
+  %c = sub <2 x i32> <i32 0, i32 undef>, %d
+  %e = mul <2 x i32> %a, %c
+  %f = sub <2 x i32> <i32 0, i32 undef>, %e
+  ret <2 x i32> %f
+}
+
+define i32 @not_not(i32 %a, i32 %b, i32 %z) {
+; CHECK-LABEL: @not_not(
+; CHECK-NEXT:    [[D:%.*]] = and i32 [[Z:%.*]], 40
+; CHECK-NEXT:    [[A_NOT:%.*]] = xor i32 [[A:%.*]], -1
+; CHECK-NEXT:    [[F:%.*]] = and i32 [[D]], [[A_NOT]]
+; CHECK-NEXT:    ret i32 [[F]]
+;
+  %d = and i32 %z, 40
+  %c = xor i32 -1, %d
+  %e = or i32 %a, %c
+  %f = xor i32 -1, %e
+  ret i32 %f
+}
+
+define <2 x i32> @not_vec_undefs(<2 x i32> %a, <2 x i32> %b, <2 x i32> %z) {
+; CHECK-LABEL: @not_vec_undefs(
+; CHECK-NEXT:    [[D:%.*]] = or <2 x i32> [[Z:%.*]], <i32 40, i32 40>
+; CHECK-NEXT:    [[A_NOT:%.*]] = xor <2 x i32> [[A:%.*]], <i32 -1, i32 -1>
+; CHECK-NEXT:    [[F:%.*]] = or <2 x i32> [[D]], [[A_NOT]]
+; CHECK-NEXT:    ret <2 x i32> [[F]]
+;
+  %d = or <2 x i32> %z, <i32 40, i32 40>
+  %c = xor <2 x i32> <i32 undef, i32 -1>, %d
+  %e = and <2 x i32> %a, %c
+  %f = xor <2 x i32> <i32 undef, i32 -1>, %e
+  ret <2 x i32> %f
+}
+
diff --git a/test/Transforms/Reassociate/negation1.ll b/test/Transforms/Reassociate/negation1.ll
index 34b943cf496..674e57df956 100644
--- a/test/Transforms/Reassociate/negation1.ll
+++ b/test/Transforms/Reassociate/negation1.ll
@@ -1,11 +1,12 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -reassociate -instcombine -S | FileCheck %s
 
 ; Test that we can turn things like A*B + X - A*B -> X.
 
 define i32 @test1(i32 %a, i32 %b, i32 %x) {
-; CHECK-LABEL: test1
-; CHECK: ret i32 %x
-
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    ret i32 [[X:%.*]]
+;
   %c = mul i32 %a, %b
   %d = add i32 %c, %x
   %c1 = mul i32 %a, %b
-- 
GitLab


From a551fc948486b4dfc2a589a3cac34b8230d0f6b3 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Mon, 22 Oct 2018 22:14:05 +0000
Subject: [PATCH 0410/1116] Recommit r344877 "[X86] Stop promoting integer
 loads to vXi64"

I've included a fix to DAGCombiner::ForwardStoreValueToDirectLoad that I believe will prevent the previous miscompile.

Original commit message:

Theoretically this was done to simplify the amount of isel patterns that were needed. But it also meant a substantial number of our isel patterns have to match an explicit bitcast. By making the vXi32/vXi16/vXi8 types legal for loads, DAG combiner should be able to change the load type to rem

I had to add some additional plain load instruction patterns and a few other special cases, but overall the isel table has reduced in size by ~12000 bytes. So it looks like this promotion was hurting us more than helping.

I still have one crash in vector-trunc.ll that I'm hoping @RKSimon can help with. It seems to relate to using getTargetConstantFromNode on a load that was shrunk due to an extract_subvector combine after the constant pool entry was created. So we end up decoding more mask elements than the lo

I'm hoping this patch will simplify the number of patterns needed to remove the and/or/xor promotion.

Reviewers: RKSimon, spatel

Reviewed By: RKSimon

Subscribers: llvm-commits, RKSimon

Differential Revision: https://reviews.llvm.org/D53306

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344965 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp      |   3 +-
 lib/Target/X86/X86ISelDAGToDAG.cpp            |  28 +-
 lib/Target/X86/X86ISelLowering.cpp            |  28 +-
 lib/Target/X86/X86InstrAVX512.td              | 243 ++++---
 lib/Target/X86/X86InstrFragmentsSIMD.td       |  54 +-
 lib/Target/X86/X86InstrSSE.td                 | 627 ++++++++++--------
 lib/Target/X86/X86InstrXOP.td                 |  75 ++-
 lib/Target/X86/X86MCInstLower.cpp             |  26 +-
 .../X86/X86ShuffleDecodeConstantPool.cpp      |  58 +-
 lib/Target/X86/X86ShuffleDecodeConstantPool.h |  13 +-
 test/CodeGen/X86/avx-vperm2x128.ll            |   2 +-
 test/CodeGen/X86/oddshuffles.ll               |  24 +-
 test/CodeGen/X86/pshufb-mask-comments.ll      |   6 +-
 test/CodeGen/X86/vector-extend-inreg.ll       |   2 +-
 test/CodeGen/X86/vector-idiv-v2i32.ll         |  18 +-
 test/CodeGen/X86/widened-broadcast.ll         |  95 +--
 16 files changed, 712 insertions(+), 590 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index f560f0e1a9c..e606cbd749c 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -12897,7 +12897,8 @@ SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) {
     if (!isTypeLegal(LDMemType))
       continue;
     if (STMemType != LDMemType) {
-      if (numVectorEltsOrZero(STMemType) == numVectorEltsOrZero(LDMemType) &&
+      // TODO: Support vectors? This requires extract_subvector/bitcast.
+      if (!STMemType.isVector() && !LDMemType.isVector() && 
           STMemType.isInteger() && LDMemType.isInteger())
         Val = DAG.getNode(ISD::TRUNCATE, SDLoc(LD), LDMemType, Val);
       else
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 9dbc2761a6c..d3aa5c89adc 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -2890,21 +2890,17 @@ MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
   const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
   Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
 
-  // If there is a load, it will be behind a bitcast. We don't need to check
-  // alignment on this load.
+  // Try to fold a load. No need to check alignment.
   SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
-  if (MayFoldLoad && N1->getOpcode() == ISD::BITCAST && N1->hasOneUse() &&
-      tryFoldLoad(Node, N1.getNode(), N1.getOperand(0), Tmp0, Tmp1, Tmp2,
-                  Tmp3, Tmp4)) {
-    SDValue Load = N1.getOperand(0);
+  if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
     SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
-                      Load.getOperand(0) };
+                      N1.getOperand(0) };
     SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
     MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
     // Update the chain.
-    ReplaceUses(Load.getValue(1), SDValue(CNode, 2));
+    ReplaceUses(N1.getValue(1), SDValue(CNode, 2));
     // Record the mem-refs
-    CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(Load)->getMemOperand()});
+    CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
     return CNode;
   }
 
@@ -2927,22 +2923,18 @@ MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
   const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
   Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
 
-  // If there is a load, it will be behind a bitcast. We don't need to check
-  // alignment on this load.
+  // Try to fold a load. No need to check alignment.
   SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
-  if (MayFoldLoad && N2->getOpcode() == ISD::BITCAST && N2->hasOneUse() &&
-      tryFoldLoad(Node, N2.getNode(), N2.getOperand(0), Tmp0, Tmp1, Tmp2,
-                  Tmp3, Tmp4)) {
-    SDValue Load = N2.getOperand(0);
+  if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
     SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
-                      Load.getOperand(0), InFlag };
+                      N2.getOperand(0), InFlag };
     SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
     MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
     InFlag = SDValue(CNode, 3);
     // Update the chain.
-    ReplaceUses(Load.getValue(1), SDValue(CNode, 2));
+    ReplaceUses(N2.getValue(1), SDValue(CNode, 2));
     // Record the mem-refs
-    CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(Load)->getMemOperand()});
+    CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()});
     return CNode;
   }
 
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 6059a2a09e7..8ba6c9ee018 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -869,11 +869,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
     }
 
-    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
-    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
-      setOperationPromotedToType(ISD::LOAD,   VT, MVT::v2i64);
-    }
-
     // Custom lower v2i64 and v2f64 selects.
     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
@@ -1178,11 +1173,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     if (HasInt256)
       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
 
-    // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
-    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
-      setOperationPromotedToType(ISD::LOAD,   VT, MVT::v4i64);
-    }
-
     if (HasInt256) {
       // Custom legalize 2x32 to get a little better code.
       setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
@@ -1419,10 +1409,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::MGATHER,             VT, Custom);
       setOperationAction(ISD::MSCATTER,            VT, Custom);
     }
-    for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
-      setOperationPromotedToType(ISD::LOAD,   VT, MVT::v8i64);
-    }
-
     // Need to custom split v32i16/v64i8 bitcasts.
     if (!Subtarget.hasBWI()) {
       setOperationAction(ISD::BITCAST, MVT::v32i16, Custom);
@@ -5539,7 +5525,7 @@ static const Constant *getTargetConstantFromNode(SDValue Op) {
   if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
     return nullptr;
 
-  return dyn_cast<Constant>(CNode->getConstVal());
+  return CNode->getConstVal();
 }
 
 // Extract raw constant bits from constant pools.
@@ -6045,7 +6031,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
       break;
     }
     if (auto *C = getTargetConstantFromNode(MaskNode)) {
-      DecodeVPERMILPMask(C, MaskEltSize, Mask);
+      DecodeVPERMILPMask(C, MaskEltSize, VT.getSizeInBits(), Mask);
       break;
     }
     return false;
@@ -6062,7 +6048,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
       break;
     }
     if (auto *C = getTargetConstantFromNode(MaskNode)) {
-      DecodePSHUFBMask(C, Mask);
+      DecodePSHUFBMask(C, VT.getSizeInBits(), Mask);
       break;
     }
     return false;
@@ -6124,7 +6110,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
         break;
       }
       if (auto *C = getTargetConstantFromNode(MaskNode)) {
-        DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
+        DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, VT.getSizeInBits(), Mask);
         break;
       }
     }
@@ -6141,7 +6127,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
       break;
     }
     if (auto *C = getTargetConstantFromNode(MaskNode)) {
-      DecodeVPPERMMask(C, Mask);
+      DecodeVPPERMMask(C, VT.getSizeInBits(), Mask);
       break;
     }
     return false;
@@ -6158,7 +6144,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
       break;
     }
     if (auto *C = getTargetConstantFromNode(MaskNode)) {
-      DecodeVPERMVMask(C, MaskEltSize, Mask);
+      DecodeVPERMVMask(C, MaskEltSize, VT.getSizeInBits(), Mask);
       break;
     }
     return false;
@@ -6172,7 +6158,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
     Ops.push_back(N->getOperand(2));
     SDValue MaskNode = N->getOperand(1);
     if (auto *C = getTargetConstantFromNode(MaskNode)) {
-      DecodeVPERMV3Mask(C, MaskEltSize, Mask);
+      DecodeVPERMV3Mask(C, MaskEltSize, VT.getSizeInBits(), Mask);
       break;
     }
     return false;
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 72dd4ec8034..b2d0ce2bcd3 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -66,21 +66,16 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
                            !if (!eq (EltTypeName, "f64"), !cast<Operand>("sdmem"), ?));
 
   // Load patterns
-  // Note: For 128/256-bit integer VT we choose loadv2i64/loadv4i64
-  //       due to load promotion during legalization
-  PatFrag LdFrag = !cast<PatFrag>("load" #
-                                  !if (!eq (TypeVariantName, "i"),
-                                       !if (!eq (Size, 128), "v2i64",
-                                       !if (!eq (Size, 256), "v4i64",
-                                       !if (!eq (Size, 512), "v8i64",
-                                            VTName))), VTName));
-
-  PatFrag AlignedLdFrag = !cast<PatFrag>("alignedload" #
-                                         !if (!eq (TypeVariantName, "i"),
-                                               !if (!eq (Size, 128), "v2i64",
-                                               !if (!eq (Size, 256), "v4i64",
-                                               !if (!eq (Size, 512), "v8i64",
-                                                   VTName))), VTName));
+  PatFrag LdFrag = !cast<PatFrag>("load" # VTName);
+
+  PatFrag i64LdFrag = !cast<PatFrag>("load" #
+                                     !if (!eq (TypeVariantName, "i"),
+                                          !if (!eq (Size, 128), "v2i64",
+                                          !if (!eq (Size, 256), "v4i64",
+                                          !if (!eq (Size, 512), "v8i64",
+                                               VTName))), VTName));
+
+  PatFrag AlignedLdFrag = !cast<PatFrag>("alignedload" # VTName);
 
   PatFrag ScalarLdFrag = !cast<PatFrag>("load" # EltVT);
 
@@ -518,10 +513,10 @@ multiclass vinsert_for_size_split<int Opcode, X86VectorVTInfo From,
                    "vinsert" # From.EltTypeName # "x" # From.NumElts,
                    "$src3, $src2, $src1", "$src1, $src2, $src3",
                    (vinsert_insert:$src3 (To.VT To.RC:$src1),
-                               (From.VT (bitconvert (From.LdFrag addr:$src2))),
+                               (From.VT (From.LdFrag addr:$src2)),
                                (iPTR imm)),
                    (vinsert_for_mask:$src3 (To.VT To.RC:$src1),
-                               (From.VT (bitconvert (From.LdFrag addr:$src2))),
+                               (From.VT (From.LdFrag addr:$src2)),
                                (iPTR imm))>, AVX512AIi8Base, EVEX_4V,
                    EVEX_CD8<From.EltSize, From.CD8TupleForm>,
                    Sched<[sched.Folded, sched.ReadAfterFold]>;
@@ -547,7 +542,7 @@ multiclass vinsert_for_size_lowering<string InstrStr, X86VectorVTInfo From,
 
     def : Pat<(vinsert_insert:$ins
                   (To.VT To.RC:$src1),
-                  (From.VT (bitconvert (From.LdFrag addr:$src2))),
+                  (From.VT (From.LdFrag addr:$src2)),
                   (iPTR imm)),
               (To.VT (!cast<Instruction>(InstrStr#"rm")
                   To.RC:$src1, addr:$src2,
@@ -680,9 +675,7 @@ let Predicates = p in {
              (vselect Cast.KRCWM:$mask,
                       (bitconvert
                        (vinsert_insert:$ins (To.VT To.RC:$src1),
-                                            (From.VT
-                                             (bitconvert
-                                              (From.LdFrag addr:$src2))),
+                                            (From.VT (From.LdFrag addr:$src2)),
                                             (iPTR imm))),
                       Cast.ImmAllZerosV)),
             (!cast<Instruction>(InstrStr#"rmkz")
@@ -1374,7 +1367,7 @@ multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr,
   defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
                            (ins _Src.MemOp:$src), OpcodeStr, "$src", "$src",
                            (_Dst.VT (X86SubVBroadcast
-                             (_Src.VT (bitconvert (_Src.LdFrag addr:$src)))))>,
+                             (_Src.VT (_Src.LdFrag addr:$src))))>,
                            Sched<[SchedWriteShuffle.YMM.Folded]>,
                            AVX5128IBase, EVEX;
 }
@@ -1389,7 +1382,7 @@ multiclass avx512_subvec_broadcast_rm_dq<bits<8> opc, string OpcodeStr,
                            (ins _Src.MemOp:$src), OpcodeStr, "$src", "$src",
                            (null_frag),
                            (_Dst.VT (X86SubVBroadcast
-                             (_Src.VT (bitconvert (_Src.LdFrag addr:$src)))))>,
+                             (_Src.VT (_Src.LdFrag addr:$src))))>,
                            Sched<[SchedWriteShuffle.YMM.Folded]>,
                            AVX5128IBase, EVEX;
 }
@@ -1442,11 +1435,11 @@ defm VBROADCASTF64X4 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf64x4",
 let Predicates = [HasAVX512] in {
 def : Pat<(v16f32 (X86SubVBroadcast (loadv8f32 addr:$src))),
           (VBROADCASTF64X4rm addr:$src)>;
-def : Pat<(v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src)))),
+def : Pat<(v16i32 (X86SubVBroadcast (loadv8i32 addr:$src))),
           (VBROADCASTI64X4rm addr:$src)>;
-def : Pat<(v32i16 (X86SubVBroadcast (bc_v16i16 (loadv4i64 addr:$src)))),
+def : Pat<(v32i16 (X86SubVBroadcast (loadv16i16 addr:$src))),
           (VBROADCASTI64X4rm addr:$src)>;
-def : Pat<(v64i8 (X86SubVBroadcast (bc_v32i8 (loadv4i64 addr:$src)))),
+def : Pat<(v64i8 (X86SubVBroadcast (loadv32i8 addr:$src))),
           (VBROADCASTI64X4rm addr:$src)>;
 
 // Provide fallback in case the load node that is used in the patterns above
@@ -1474,9 +1467,9 @@ def : Pat<(v8f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
           (VBROADCASTF32X4rm addr:$src)>;
 def : Pat<(v8i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
           (VBROADCASTI32X4rm addr:$src)>;
-def : Pat<(v32i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
+def : Pat<(v32i16 (X86SubVBroadcast (loadv8i16 addr:$src))),
           (VBROADCASTI32X4rm addr:$src)>;
-def : Pat<(v64i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
+def : Pat<(v64i8 (X86SubVBroadcast (loadv16i8 addr:$src))),
           (VBROADCASTI32X4rm addr:$src)>;
 
 // Patterns for selects of bitcasted operations.
@@ -1506,11 +1499,11 @@ def : Pat<(vselect VK8WM:$mask,
                    VR512:$src0),
           (VBROADCASTF64X4rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
 def : Pat<(vselect VK8WM:$mask,
-                   (bc_v8i64 (v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src))))),
+                   (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv8i32 addr:$src)))),
                    (bc_v8i64 (v16i32 immAllZerosV))),
           (VBROADCASTI64X4rmkz VK8WM:$mask, addr:$src)>;
 def : Pat<(vselect VK8WM:$mask,
-                   (bc_v8i64 (v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src))))),
+                   (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv8i32 addr:$src)))),
                    VR512:$src0),
           (VBROADCASTI64X4rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
 }
@@ -1527,9 +1520,9 @@ def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
           (VBROADCASTF32X4Z256rm addr:$src)>;
 def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
           (VBROADCASTI32X4Z256rm addr:$src)>;
-def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
+def : Pat<(v16i16 (X86SubVBroadcast (loadv8i16 addr:$src))),
           (VBROADCASTI32X4Z256rm addr:$src)>;
-def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
+def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))),
           (VBROADCASTI32X4Z256rm addr:$src)>;
 
 // Patterns for selects of bitcasted operations.
@@ -1591,11 +1584,11 @@ def : Pat<(vselect VK4WM:$mask,
                    VR256X:$src0),
           (VBROADCASTF64X2Z128rmk VR256X:$src0, VK4WM:$mask, addr:$src)>;
 def : Pat<(vselect VK4WM:$mask,
-                   (bc_v4i64 (v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src))))),
+                   (bc_v4i64 (v8i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
                    (bc_v4i64 (v8i32 immAllZerosV))),
           (VBROADCASTI64X2Z128rmkz VK4WM:$mask, addr:$src)>;
 def : Pat<(vselect VK4WM:$mask,
-                   (bc_v4i64 (v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src))))),
+                   (bc_v4i64 (v8i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
                    VR256X:$src0),
           (VBROADCASTI64X2Z128rmk VR256X:$src0, VK4WM:$mask, addr:$src)>;
 }
@@ -1641,11 +1634,11 @@ def : Pat<(vselect VK8WM:$mask,
                    VR512:$src0),
           (VBROADCASTF64X2rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
 def : Pat<(vselect VK8WM:$mask,
-                   (bc_v8i64 (v16i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src))))),
+                   (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
                    (bc_v8i64 (v16i32 immAllZerosV))),
           (VBROADCASTI64X2rmkz VK8WM:$mask, addr:$src)>;
 def : Pat<(vselect VK8WM:$mask,
-                   (bc_v8i64 (v16i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src))))),
+                   (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
                    VR512:$src0),
           (VBROADCASTI64X2rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
 }
@@ -1741,7 +1734,7 @@ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain,
             (ins _.RC:$src2, _.MemOp:$src3),
             OpcodeStr, "$src3, $src2", "$src2, $src3",
             (_.VT (X86VPermt2 _.RC:$src2, IdxVT.RC:$src1,
-                   (_.VT (bitconvert (_.LdFrag addr:$src3))))), 1>,
+                   (_.VT (_.LdFrag addr:$src3)))), 1>,
             EVEX_4V, AVX5128IBase, Sched<[sched.Folded, sched.ReadAfterFold]>;
   }
 }
@@ -1859,7 +1852,7 @@ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
             (ins IdxVT.RC:$src2, _.MemOp:$src3),
             OpcodeStr, "$src3, $src2", "$src2, $src3",
             (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2,
-                   (bitconvert (_.LdFrag addr:$src3)))), 1>,
+                   (_.LdFrag addr:$src3))), 1>,
             EVEX_4V, AVX5128IBase, Sched<[sched.Folded, sched.ReadAfterFold]>;
   }
 }
@@ -2149,7 +2142,7 @@ multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, PatFrag OpNode,
              (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2),
              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
-                                       (_.VT (bitconvert (_.LdFrag addr:$src2)))))]>,
+                                       (_.VT (_.LdFrag addr:$src2))))]>,
              EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
   let isCommutable = IsCommutable in
   def rrk : AVX512BI<opc, MRMSrcReg,
@@ -2165,8 +2158,7 @@ multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, PatFrag OpNode,
                           "$dst {${mask}}, $src1, $src2}"),
               [(set _.KRC:$dst, (and _.KRCWM:$mask,
                                    (OpNode (_.VT _.RC:$src1),
-                                       (_.VT (bitconvert
-                                              (_.LdFrag addr:$src2))))))]>,
+                                       (_.VT (_.LdFrag addr:$src2)))))]>,
               EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
@@ -2291,7 +2283,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag,
              [(set _.KRC:$dst, (_.KVT
                                 (Frag:$cc
                                  (_.VT _.RC:$src1),
-                                 (_.VT (bitconvert (_.LdFrag addr:$src2))),
+                                 (_.VT (_.LdFrag addr:$src2)),
                                  cond)))]>,
              EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
   let isCommutable = 1 in
@@ -2316,8 +2308,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag,
                                      (_.KVT
                                       (Frag:$cc
                                        (_.VT _.RC:$src1),
-                                       (_.VT (bitconvert
-                                              (_.LdFrag addr:$src2))),
+                                       (_.VT (_.LdFrag addr:$src2)),
                                        cond))))]>,
               EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
 
@@ -2352,13 +2343,13 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag,
                NotMemoryFoldable;
   }
 
-  def : Pat<(_.KVT (CommFrag:$cc (bitconvert (_.LdFrag addr:$src2)),
+  def : Pat<(_.KVT (CommFrag:$cc (_.LdFrag addr:$src2),
                                  (_.VT _.RC:$src1), cond)),
             (!cast<Instruction>(Name#_.ZSuffix#"rmi")
              _.RC:$src1, addr:$src2, (CommFrag.OperandTransform $cc))>;
 
   def : Pat<(and _.KRCWM:$mask,
-                 (_.KVT (CommFrag:$cc (bitconvert (_.LdFrag addr:$src2)),
+                 (_.KVT (CommFrag:$cc (_.LdFrag addr:$src2),
                                       (_.VT _.RC:$src1), cond))),
             (!cast<Instruction>(Name#_.ZSuffix#"rmik")
              _.KRCWM:$mask, _.RC:$src1, addr:$src2,
@@ -2544,7 +2535,7 @@ multiclass avx512_vcmp_common<X86FoldableSchedWrite sched, X86VectorVTInfo _,
                 "vcmp${cc}"#_.Suffix,
                 "$src2, $src1", "$src1, $src2",
                 (X86cmpm (_.VT _.RC:$src1),
-                        (_.VT (bitconvert (_.LdFrag addr:$src2))),
+                        (_.VT (_.LdFrag addr:$src2)),
                         imm:$cc)>,
                 Sched<[sched.Folded, sched.ReadAfterFold]>;
 
@@ -2732,7 +2723,7 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
                     OpcodeStr##_.Suffix##mem#
                     "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(set _.KRC:$dst,(OpNode
-                                     (_.VT (bitconvert (_.LdFrag addr:$src1))),
+                                     (_.VT (_.LdFrag addr:$src1)),
                                      (i32 imm:$src2)))]>,
                     Sched<[sched.Folded, sched.ReadAfterFold]>;
   def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
@@ -2740,7 +2731,7 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
                     OpcodeStr##_.Suffix##mem#
                     "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
                     [(set _.KRC:$dst, (and _.KRCWM:$mask, (OpNode
-                                  (_.VT (bitconvert (_.LdFrag addr:$src1))),
+                                  (_.VT (_.LdFrag addr:$src1)),
                                   (i32 imm:$src2))))]>,
                     EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
   def rmb : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
@@ -3353,7 +3344,7 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, string Name,
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     !if(NoRMPattern, [],
                         [(set _.RC:$dst,
-                          (_.VT (bitconvert (ld_frag addr:$src))))]),
+                          (_.VT (ld_frag addr:$src)))]),
                     _.ExeDomain>, EVEX, Sched<[Sched.RM]>,
                     EVEX2VEXOverride<EVEX2VEXOvrd#"rm">;
 
@@ -3372,7 +3363,7 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, string Name,
                       "${dst} {${mask}}, $src1}"),
                      [(set _.RC:$dst, (_.VT
                          (vselect _.KRCWM:$mask,
-                          (_.VT (bitconvert (ld_frag addr:$src1))),
+                          (_.VT (ld_frag addr:$src1)),
                            (_.VT _.RC:$src0))))], _.ExeDomain>,
                      EVEX, EVEX_K, Sched<[Sched.RM]>;
   }
@@ -3381,7 +3372,7 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, string Name,
                   OpcodeStr #"\t{$src, ${dst} {${mask}} {z}|"#
                                 "${dst} {${mask}} {z}, $src}",
                   [(set _.RC:$dst, (_.VT (vselect _.KRCWM:$mask,
-                    (_.VT (bitconvert (ld_frag addr:$src))), _.ImmAllZerosV)))],
+                    (_.VT (ld_frag addr:$src)), _.ImmAllZerosV)))],
                   _.ExeDomain>, EVEX, EVEX_KZ, Sched<[Sched.RM]>;
   }
   def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, undef)),
@@ -3681,6 +3672,20 @@ let Predicates = [HasBWI, NoVLX] in {
 }
 
 let Predicates = [HasAVX512] in {
+  // 512-bit load.
+  def : Pat<(alignedloadv16i32 addr:$src),
+            (VMOVDQA64Zrm addr:$src)>;
+  def : Pat<(alignedloadv32i16 addr:$src),
+            (VMOVDQA64Zrm addr:$src)>;
+  def : Pat<(alignedloadv64i8 addr:$src),
+            (VMOVDQA64Zrm addr:$src)>;
+  def : Pat<(loadv16i32 addr:$src),
+            (VMOVDQU64Zrm addr:$src)>;
+  def : Pat<(loadv32i16 addr:$src),
+            (VMOVDQU64Zrm addr:$src)>;
+  def : Pat<(loadv64i8 addr:$src),
+            (VMOVDQU64Zrm addr:$src)>;
+
   // 512-bit store.
   def : Pat<(alignedstore (v16i32 VR512:$src), addr:$dst),
             (VMOVDQA64Zmr addr:$dst, VR512:$src)>;
@@ -3697,6 +3702,20 @@ let Predicates = [HasAVX512] in {
 }
 
 let Predicates = [HasVLX] in {
+  // 128-bit load.
+  def : Pat<(alignedloadv4i32 addr:$src),
+            (VMOVDQA64Z128rm addr:$src)>;
+  def : Pat<(alignedloadv8i16 addr:$src),
+            (VMOVDQA64Z128rm addr:$src)>;
+  def : Pat<(alignedloadv16i8 addr:$src),
+            (VMOVDQA64Z128rm addr:$src)>;
+  def : Pat<(loadv4i32 addr:$src),
+            (VMOVDQU64Z128rm addr:$src)>;
+  def : Pat<(loadv8i16 addr:$src),
+            (VMOVDQU64Z128rm addr:$src)>;
+  def : Pat<(loadv16i8 addr:$src),
+            (VMOVDQU64Z128rm addr:$src)>;
+
   // 128-bit store.
   def : Pat<(alignedstore (v4i32 VR128X:$src), addr:$dst),
             (VMOVDQA64Z128mr addr:$dst, VR128X:$src)>;
@@ -3711,6 +3730,20 @@ let Predicates = [HasVLX] in {
   def : Pat<(store (v16i8 VR128X:$src), addr:$dst),
             (VMOVDQU64Z128mr addr:$dst, VR128X:$src)>;
 
+  // 256-bit load.
+  def : Pat<(alignedloadv8i32 addr:$src),
+            (VMOVDQA64Z256rm addr:$src)>;
+  def : Pat<(alignedloadv16i16 addr:$src),
+            (VMOVDQA64Z256rm addr:$src)>;
+  def : Pat<(alignedloadv32i8 addr:$src),
+            (VMOVDQA64Z256rm addr:$src)>;
+  def : Pat<(loadv8i32 addr:$src),
+            (VMOVDQU64Z256rm addr:$src)>;
+  def : Pat<(loadv16i16 addr:$src),
+            (VMOVDQU64Z256rm addr:$src)>;
+  def : Pat<(loadv32i8 addr:$src),
+            (VMOVDQU64Z256rm addr:$src)>;
+
   // 256-bit store.
   def : Pat<(alignedstore (v8i32 VR256X:$src), addr:$dst),
             (VMOVDQA64Z256mr addr:$dst, VR256X:$src)>;
@@ -4495,7 +4528,7 @@ let Predicates = [HasAVX512] in {
             (VMOVDI2PDIZrm addr:$src)>;
   def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
             (VMOVDI2PDIZrm addr:$src)>;
-  def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
             (VMOVDI2PDIZrm addr:$src)>;
   def : Pat<(v4i32 (X86vzload addr:$src)),
             (VMOVDI2PDIZrm addr:$src)>;
@@ -4591,6 +4624,12 @@ let Predicates = [HasAVX512], AddedComplexity = 400 in {
             (VMOVNTDQAZrm addr:$src)>;
   def : Pat<(v8i64 (alignednontemporalload addr:$src)),
             (VMOVNTDQAZrm addr:$src)>;
+  def : Pat<(v16i32 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAZrm addr:$src)>;
+  def : Pat<(v32i16 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAZrm addr:$src)>;
+  def : Pat<(v64i8 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAZrm addr:$src)>;
 }
 
 let Predicates = [HasVLX], AddedComplexity = 400 in {
@@ -4607,6 +4646,12 @@ let Predicates = [HasVLX], AddedComplexity = 400 in {
             (VMOVNTDQAZ256rm addr:$src)>;
   def : Pat<(v4i64 (alignednontemporalload addr:$src)),
             (VMOVNTDQAZ256rm addr:$src)>;
+  def : Pat<(v8i32 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAZ256rm addr:$src)>;
+  def : Pat<(v16i16 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAZ256rm addr:$src)>;
+  def : Pat<(v32i8 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAZ256rm addr:$src)>;
 
   def : Pat<(alignednontemporalstore (v4i32 VR128X:$src), addr:$dst),
             (VMOVNTDQZ128mr addr:$dst, VR128X:$src)>;
@@ -4621,6 +4666,12 @@ let Predicates = [HasVLX], AddedComplexity = 400 in {
             (VMOVNTDQAZ128rm addr:$src)>;
   def : Pat<(v2i64 (alignednontemporalload addr:$src)),
             (VMOVNTDQAZ128rm addr:$src)>;
+  def : Pat<(v4i32 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAZ128rm addr:$src)>;
+  def : Pat<(v8i16 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAZ128rm addr:$src)>;
+  def : Pat<(v16i8 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAZ128rm addr:$src)>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -4639,8 +4690,7 @@ multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
   defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
                   (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
                   "$src2, $src1", "$src1, $src2",
-                  (_.VT (OpNode _.RC:$src1,
-                                (bitconvert (_.LdFrag addr:$src2))))>,
+                  (_.VT (OpNode _.RC:$src1, (_.LdFrag addr:$src2)))>,
                   AVX512BIBase, EVEX_4V,
                   Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
@@ -4771,7 +4821,7 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr,
                         (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr,
                         "$src2, $src1", "$src1, $src2",
                         (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1),
-                                      (bitconvert (_Src.LdFrag addr:$src2))))>,
+                                      (_Src.LdFrag addr:$src2)))>,
                         AVX512BIBase, EVEX_4V,
                         Sched<[sched.Folded, sched.ReadAfterFold]>;
 
@@ -4876,7 +4926,7 @@ multiclass avx512_packs_rm<bits<8> opc, string OpcodeStr,
                         (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr,
                         "$src2, $src1", "$src1, $src2",
                         (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1),
-                                      (bitconvert (_Src.LdFrag addr:$src2))))>,
+                                      (_Src.LdFrag addr:$src2)))>,
                          EVEX_4V, EVEX_CD8<_Src.EltSize, CD8VF>,
                          Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
@@ -5068,7 +5118,7 @@ multiclass avx512_logic_rm<bits<8> opc, string OpcodeStr,
                   (_.i64VT (OpNode (bitconvert (_.VT _.RC:$src1)),
                                    (bitconvert (_.LdFrag addr:$src2)))),
                   (_.VT (bitconvert (_.i64VT (OpNodeMsk _.RC:$src1,
-                                     (bitconvert (_.LdFrag addr:$src2))))))>,
+                                     (_.i64LdFrag addr:$src2)))))>,
                   AVX512BIBase, EVEX_4V,
                   Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
@@ -5729,7 +5779,7 @@ multiclass avx512_vptest<bits<8> opc, string OpcodeStr, PatFrag OpNode,
                        "$src2, $src1", "$src1, $src2",
                    (OpNode (bitconvert
                             (_.i64VT (and _.RC:$src1,
-                                          (bitconvert (_.LdFrag addr:$src2))))),
+                                          (_.i64LdFrag addr:$src2)))),
                            _.ImmAllZerosV)>,
                    EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
                    Sched<[sched.Folded, sched.ReadAfterFold]>;
@@ -5893,7 +5943,7 @@ multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM,
   defm mi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst),
                    (ins _.MemOp:$src1, u8imm:$src2), OpcodeStr,
                        "$src2, $src1", "$src1, $src2",
-                   (_.VT (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
+                   (_.VT (OpNode (_.VT (_.LdFrag addr:$src1)),
                           (i8 imm:$src2)))>,
                    Sched<[sched.Folded]>;
   }
@@ -5923,8 +5973,7 @@ multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode,
   defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
                    (ins _.RC:$src1, i128mem:$src2), OpcodeStr,
                        "$src2, $src1", "$src1, $src2",
-                   (_.VT (OpNode _.RC:$src1,
-                                 (SrcVT (bitconvert (loadv2i64 addr:$src2)))))>,
+                   (_.VT (OpNode _.RC:$src1, (SrcVT (load addr:$src2))))>,
                    AVX512BIBase,
                    EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
   }
@@ -6078,7 +6127,7 @@ multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
                    (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
                        "$src2, $src1", "$src1, $src2",
                    (_.VT (OpNode _.RC:$src1,
-                   (_.VT (bitconvert (_.LdFrag addr:$src2)))))>,
+                   (_.VT (_.LdFrag addr:$src2))))>,
                    AVX5128IBase, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
                    Sched<[sched.Folded, sched.ReadAfterFold]>;
   }
@@ -6178,7 +6227,7 @@ multiclass avx512_var_shift_int_lowering<string InstrStr, X86VectorVTInfo _,
     def : Pat<(_.VT (X86vsrav _.RC:$src1, _.RC:$src2)),
               (!cast<Instruction>(InstrStr#_.ZSuffix#rr) _.RC:$src1,
                _.RC:$src2)>;
-    def : Pat<(_.VT (X86vsrav _.RC:$src1, (bitconvert (_.LdFrag addr:$src2)))),
+    def : Pat<(_.VT (X86vsrav _.RC:$src1, (_.LdFrag addr:$src2))),
               (!cast<Instruction>(InstrStr#_.ZSuffix##rm)
                _.RC:$src1, addr:$src2)>;
     def : Pat<(_.VT (vselect _.KRCWM:$mask,
@@ -6186,7 +6235,7 @@ multiclass avx512_var_shift_int_lowering<string InstrStr, X86VectorVTInfo _,
               (!cast<Instruction>(InstrStr#_.ZSuffix#rrk) _.RC:$src0,
                _.KRC:$mask, _.RC:$src1, _.RC:$src2)>;
     def : Pat<(_.VT (vselect _.KRCWM:$mask,
-                     (X86vsrav _.RC:$src1, (bitconvert (_.LdFrag addr:$src2))),
+                     (X86vsrav _.RC:$src1, (_.LdFrag addr:$src2)),
                      _.RC:$src0)),
               (!cast<Instruction>(InstrStr#_.ZSuffix##rmk) _.RC:$src0,
                _.KRC:$mask, _.RC:$src1, addr:$src2)>;
@@ -6195,7 +6244,7 @@ multiclass avx512_var_shift_int_lowering<string InstrStr, X86VectorVTInfo _,
               (!cast<Instruction>(InstrStr#_.ZSuffix#rrkz) _.KRC:$mask,
                _.RC:$src1, _.RC:$src2)>;
     def : Pat<(_.VT (vselect _.KRCWM:$mask,
-                     (X86vsrav _.RC:$src1, (bitconvert (_.LdFrag addr:$src2))),
+                     (X86vsrav _.RC:$src1, (_.LdFrag addr:$src2)),
                      _.ImmAllZerosV)),
               (!cast<Instruction>(InstrStr#_.ZSuffix##rmkz) _.KRC:$mask,
                _.RC:$src1, addr:$src2)>;
@@ -6420,7 +6469,7 @@ multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode,
                   "$src2, $src1", "$src1, $src2",
                   (_.VT (OpNode
                            _.RC:$src1,
-                           (Ctrl.VT (bitconvert(Ctrl.LdFrag addr:$src2)))))>,
+                           (Ctrl.VT (Ctrl.LdFrag addr:$src2))))>,
                   T8PD, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
                   Sched<[sched.Folded, sched.ReadAfterFold]>;
   defm rmb: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst),
@@ -7706,7 +7755,7 @@ multiclass avx512_vcvt_fp<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
   defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
                          (ins MemOp:$src), OpcodeStr#Alias, "$src", "$src",
                          (_.VT (OpNode (_Src.VT
-                             (bitconvert (_Src.LdFrag addr:$src)))))>,
+                             (_Src.LdFrag addr:$src))))>,
                          EVEX, Sched<[sched.Folded]>;
 
   defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
@@ -8413,8 +8462,7 @@ multiclass avx512_cvtph2ps<X86VectorVTInfo _dest, X86VectorVTInfo _src,
   defm rm : AVX512_maskable<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst),
                             (ins x86memop:$src), "vcvtph2ps", "$src", "$src",
                             (X86cvtph2ps (_src.VT
-                                          (bitconvert
-                                           (ld_frag addr:$src))))>,
+                                          (ld_frag addr:$src)))>,
                             T8PD, Sched<[sched.Folded]>;
 }
 
@@ -8429,17 +8477,17 @@ multiclass avx512_cvtph2ps_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src,
 }
 
 let Predicates = [HasAVX512] in
-  defm VCVTPH2PSZ : avx512_cvtph2ps<v16f32_info, v16i16x_info, f256mem, loadv4i64,
+  defm VCVTPH2PSZ : avx512_cvtph2ps<v16f32_info, v16i16x_info, f256mem, load,
                                     WriteCvtPH2PSZ>,
                     avx512_cvtph2ps_sae<v16f32_info, v16i16x_info, WriteCvtPH2PSZ>,
                     EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>;
 
 let Predicates = [HasVLX] in {
   defm VCVTPH2PSZ256 : avx512_cvtph2ps<v8f32x_info, v8i16x_info, f128mem,
-                       loadv2i64, WriteCvtPH2PSY>, EVEX, EVEX_V256,
+                       load, WriteCvtPH2PSY>, EVEX, EVEX_V256,
                        EVEX_CD8<32, CD8VH>;
   defm VCVTPH2PSZ128 : avx512_cvtph2ps<v4f32x_info, v8i16x_info, f64mem,
-                       loadv2i64, WriteCvtPH2PS>, EVEX, EVEX_V128,
+                       load, WriteCvtPH2PS>, EVEX, EVEX_V128,
                        EVEX_CD8<32, CD8VH>;
 
   // Pattern match vcvtph2ps of a scalar i64 load.
@@ -9383,7 +9431,7 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
             (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
   def : Pat<(v8i16 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
-  def : Pat<(v8i16 (InVecOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v8i16 (InVecOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
   }
   let Predicates = [HasVLX] in {
@@ -9393,7 +9441,7 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
             (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
   def : Pat<(v4i32 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
-  def : Pat<(v4i32 (InVecOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i32 (InVecOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
 
   def : Pat<(v2i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
@@ -9402,7 +9450,7 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
             (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
   def : Pat<(v2i64 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
-  def : Pat<(v2i64 (InVecOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v2i64 (InVecOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
 
   def : Pat<(v4i32 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
@@ -9413,7 +9461,7 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
             (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
   def : Pat<(v4i32 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
-  def : Pat<(v4i32 (InVecOp (bc_v8i16 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i32 (InVecOp (loadv8i16 addr:$src))),
             (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
 
   def : Pat<(v2i64 (InVecOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
@@ -9422,7 +9470,7 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
             (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
   def : Pat<(v2i64 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
-  def : Pat<(v2i64 (InVecOp (bc_v8i16 (loadv2i64 addr:$src)))),
+  def : Pat<(v2i64 (InVecOp (loadv8i16 addr:$src))),
             (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
 
   def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
@@ -9433,12 +9481,12 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
             (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
   def : Pat<(v2i64 (InVecOp (v4i32 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
-  def : Pat<(v2i64 (InVecOp (bc_v4i32 (loadv2i64 addr:$src)))),
+  def : Pat<(v2i64 (InVecOp (loadv4i32 addr:$src))),
             (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
   }
   // 256-bit patterns
   let Predicates = [HasVLX, HasBWI] in {
-  def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BWZ256rm) addr:$src)>;
   def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BWZ256rm) addr:$src)>;
@@ -9452,7 +9500,7 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
             (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
   def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
-  def : Pat<(v8i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v8i32 (ExtOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
 
   def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
@@ -9461,10 +9509,10 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
             (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
   def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i64 (ExtOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
 
-  def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+  def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
             (!cast<I>(OpcPrefix#WDZ256rm) addr:$src)>;
   def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WDZ256rm) addr:$src)>;
@@ -9477,10 +9525,10 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
             (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
   def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i64 (ExtOp (loadv8i16 addr:$src))),
             (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
 
-  def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
             (!cast<I>(OpcPrefix#DQZ256rm) addr:$src)>;
   def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#DQZ256rm) addr:$src)>;
@@ -9489,25 +9537,25 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
   }
   // 512-bit patterns
   let Predicates = [HasBWI] in {
-  def : Pat<(v32i16 (ExtOp (bc_v32i8 (loadv4i64 addr:$src)))),
+  def : Pat<(v32i16 (ExtOp (loadv32i8 addr:$src))),
             (!cast<I>(OpcPrefix#BWZrm) addr:$src)>;
   }
   let Predicates = [HasAVX512] in {
-  def : Pat<(v16i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v16i32 (ExtOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BDZrm) addr:$src)>;
 
   def : Pat<(v8i64 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
             (!cast<I>(OpcPrefix#BQZrm) addr:$src)>;
-  def : Pat<(v8i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v8i64 (ExtOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BQZrm) addr:$src)>;
 
-  def : Pat<(v16i32 (ExtOp (bc_v16i16 (loadv4i64 addr:$src)))),
+  def : Pat<(v16i32 (ExtOp (loadv16i16 addr:$src))),
             (!cast<I>(OpcPrefix#WDZrm) addr:$src)>;
 
-  def : Pat<(v8i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+  def : Pat<(v8i64 (ExtOp (loadv8i16 addr:$src))),
             (!cast<I>(OpcPrefix#WQZrm) addr:$src)>;
 
-  def : Pat<(v8i64 (ExtOp (bc_v8i32 (loadv4i64 addr:$src)))),
+  def : Pat<(v8i64 (ExtOp (loadv8i32 addr:$src))),
             (!cast<I>(OpcPrefix#DQZrm) addr:$src)>;
   }
 }
@@ -10412,7 +10460,7 @@ multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr,
                 (_.VT
                  (bitconvert
                   (CastInfo.VT (X86Shuf128 _.RC:$src1,
-                                           (bitconvert (_.LdFrag addr:$src2)),
+                                           (CastInfo.LdFrag addr:$src2),
                                            (i8 imm:$src3)))))>,
                 Sched<[sched.Folded, sched.ReadAfterFold]>,
                 EVEX2VEXOverride<EVEX2VEXOvrd#"rm">;
@@ -10578,7 +10626,7 @@ multiclass avx512_vpalign_mask_lowering<string OpcodeStr, SDNode OpNode,
   def : Pat<(To.VT (vselect To.KRCWM:$mask,
                             (bitconvert
                              (From.VT (OpNode From.RC:$src1,
-                                      (bitconvert (To.LdFrag addr:$src2)),
+                                              (From.LdFrag addr:$src2),
                                       imm:$src3))),
                             To.RC:$src0)),
             (!cast<Instruction>(OpcodeStr#"rmik") To.RC:$src0, To.KRCWM:$mask,
@@ -10588,7 +10636,7 @@ multiclass avx512_vpalign_mask_lowering<string OpcodeStr, SDNode OpNode,
   def : Pat<(To.VT (vselect To.KRCWM:$mask,
                             (bitconvert
                              (From.VT (OpNode From.RC:$src1,
-                                      (bitconvert (To.LdFrag addr:$src2)),
+                                              (From.LdFrag addr:$src2),
                                       imm:$src3))),
                             To.ImmAllZerosV)),
             (!cast<Instruction>(OpcodeStr#"rmikz") To.KRCWM:$mask,
@@ -11732,7 +11780,7 @@ multiclass VBMI2_shift_var_rm<bits<8> Op, string OpStr, SDNode OpNode,
                 (ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr,
                 "$src3, $src2", "$src2, $src3",
                 (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2,
-                        (VTI.VT (bitconvert (VTI.LdFrag addr:$src3)))))>,
+                        (VTI.VT (VTI.LdFrag addr:$src3))))>,
                 AVX512FMA3Base,
                 Sched<[sched.Folded, sched.ReadAfterFold]>;
   }
@@ -11835,8 +11883,7 @@ multiclass VNNI_rmb<bits<8> Op, string OpStr, SDNode OpNode,
                                    (ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr,
                                    "$src3, $src2", "$src2, $src3",
                                    (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2,
-                                            (VTI.VT (bitconvert
-                                                     (VTI.LdFrag addr:$src3)))))>,
+                                            (VTI.VT (VTI.LdFrag addr:$src3))))>,
                                    EVEX_4V, EVEX_CD8<32, CD8VF>, T8PD,
                                    Sched<[sched.Folded, sched.ReadAfterFold]>;
   defm mb :   AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
@@ -11892,7 +11939,7 @@ multiclass VPSHUFBITQMB_rm<X86FoldableSchedWrite sched, X86VectorVTInfo VTI> {
                                 "vpshufbitqmb",
                                 "$src2, $src1", "$src1, $src2",
                                 (X86Vpshufbitqmb (VTI.VT VTI.RC:$src1),
-                                (VTI.VT (bitconvert (VTI.LdFrag addr:$src2))))>,
+                                (VTI.VT (VTI.LdFrag addr:$src2)))>,
                                 EVEX_4V, EVEX_CD8<8, CD8VF>, T8PD,
                                 Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index f750fe3ee0c..7e31527a877 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -648,21 +648,28 @@ def sdmem : Operand<v2f64> {
 //===----------------------------------------------------------------------===//
 
 // 128-bit load pattern fragments
-// NOTE: all 128-bit integer vector loads are promoted to v2i64
 def loadv4f32    : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>;
 def loadv2f64    : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>;
 def loadv2i64    : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>;
+def loadv4i32    : PatFrag<(ops node:$ptr), (v4i32 (load node:$ptr))>;
+def loadv8i16    : PatFrag<(ops node:$ptr), (v8i16 (load node:$ptr))>;
+def loadv16i8    : PatFrag<(ops node:$ptr), (v16i8 (load node:$ptr))>;
 
 // 256-bit load pattern fragments
-// NOTE: all 256-bit integer vector loads are promoted to v4i64
-def loadv8f32    : PatFrag<(ops node:$ptr), (v8f32 (load node:$ptr))>;
-def loadv4f64    : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>;
-def loadv4i64    : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>;
+def loadv8f32    : PatFrag<(ops node:$ptr), (v8f32  (load node:$ptr))>;
+def loadv4f64    : PatFrag<(ops node:$ptr), (v4f64  (load node:$ptr))>;
+def loadv4i64    : PatFrag<(ops node:$ptr), (v4i64  (load node:$ptr))>;
+def loadv8i32    : PatFrag<(ops node:$ptr), (v8i32  (load node:$ptr))>;
+def loadv16i16   : PatFrag<(ops node:$ptr), (v16i16 (load node:$ptr))>;
+def loadv32i8    : PatFrag<(ops node:$ptr), (v32i8  (load node:$ptr))>;
 
 // 512-bit load pattern fragments
 def loadv16f32   : PatFrag<(ops node:$ptr), (v16f32 (load node:$ptr))>;
-def loadv8f64    : PatFrag<(ops node:$ptr), (v8f64 (load node:$ptr))>;
-def loadv8i64    : PatFrag<(ops node:$ptr), (v8i64 (load node:$ptr))>;
+def loadv8f64    : PatFrag<(ops node:$ptr), (v8f64  (load node:$ptr))>;
+def loadv8i64    : PatFrag<(ops node:$ptr), (v8i64  (load node:$ptr))>;
+def loadv16i32   : PatFrag<(ops node:$ptr), (v16i32 (load node:$ptr))>;
+def loadv32i16   : PatFrag<(ops node:$ptr), (v32i16 (load node:$ptr))>;
+def loadv64i8    : PatFrag<(ops node:$ptr), (v64i8  (load node:$ptr))>;
 
 // 128-/256-/512-bit extload pattern fragments
 def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>;
@@ -690,15 +697,27 @@ def alignedloadv2f64 : PatFrag<(ops node:$ptr),
                                (v2f64 (alignedload node:$ptr))>;
 def alignedloadv2i64 : PatFrag<(ops node:$ptr),
                                (v2i64 (alignedload node:$ptr))>;
+def alignedloadv4i32 : PatFrag<(ops node:$ptr),
+                               (v4i32 (alignedload node:$ptr))>;
+def alignedloadv8i16 : PatFrag<(ops node:$ptr),
+                               (v8i16 (alignedload node:$ptr))>;
+def alignedloadv16i8 : PatFrag<(ops node:$ptr),
+                               (v16i8 (alignedload node:$ptr))>;
 
 // 256-bit aligned load pattern fragments
 // NOTE: all 256-bit integer vector loads are promoted to v4i64
-def alignedloadv8f32 : PatFrag<(ops node:$ptr),
-                               (v8f32 (alignedload node:$ptr))>;
-def alignedloadv4f64 : PatFrag<(ops node:$ptr),
-                               (v4f64 (alignedload node:$ptr))>;
-def alignedloadv4i64 : PatFrag<(ops node:$ptr),
-                               (v4i64 (alignedload node:$ptr))>;
+def alignedloadv8f32  : PatFrag<(ops node:$ptr),
+                                (v8f32  (alignedload node:$ptr))>;
+def alignedloadv4f64  : PatFrag<(ops node:$ptr),
+                                (v4f64  (alignedload node:$ptr))>;
+def alignedloadv4i64  : PatFrag<(ops node:$ptr),
+                                (v4i64  (alignedload node:$ptr))>;
+def alignedloadv8i32  : PatFrag<(ops node:$ptr),
+                                (v8i32  (alignedload node:$ptr))>;
+def alignedloadv16i16 : PatFrag<(ops node:$ptr),
+                                (v16i16 (alignedload node:$ptr))>;
+def alignedloadv32i8  : PatFrag<(ops node:$ptr),
+                                (v32i8  (alignedload node:$ptr))>;
 
 // 512-bit aligned load pattern fragments
 def alignedloadv16f32 : PatFrag<(ops node:$ptr),
@@ -707,6 +726,12 @@ def alignedloadv8f64  : PatFrag<(ops node:$ptr),
                                 (v8f64  (alignedload node:$ptr))>;
 def alignedloadv8i64  : PatFrag<(ops node:$ptr),
                                 (v8i64  (alignedload node:$ptr))>;
+def alignedloadv16i32 : PatFrag<(ops node:$ptr),
+                                (v16i32 (alignedload node:$ptr))>;
+def alignedloadv32i16 : PatFrag<(ops node:$ptr),
+                                (v32i16 (alignedload node:$ptr))>;
+def alignedloadv64i8  : PatFrag<(ops node:$ptr),
+                                (v64i8  (alignedload node:$ptr))>;
 
 // Like 'load', but uses special alignment checks suitable for use in
 // memory operands in most SSE instructions, which are required to
@@ -725,6 +750,9 @@ def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{
 def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>;
 def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>;
 def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>;
+def memopv4i32 : PatFrag<(ops node:$ptr), (v4i32 (memop node:$ptr))>;
+def memopv8i16 : PatFrag<(ops node:$ptr), (v8i16 (memop node:$ptr))>;
+def memopv16i8 : PatFrag<(ops node:$ptr), (v16i8 (memop node:$ptr))>;
 
 def X86masked_gather : SDNode<"X86ISD::MGATHER",
                               SDTypeProfile<2, 3, [SDTCisVec<0>,
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index ced93f8d253..8f97ce37068 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -589,8 +589,21 @@ let Predicates = [HasAVX, NoVLX] in {
   // available and changing the domain is beneficial.
   def : Pat<(alignedloadv4i64 addr:$src),
             (VMOVAPSYrm addr:$src)>;
+  def : Pat<(alignedloadv8i32 addr:$src),
+            (VMOVAPSYrm addr:$src)>;
+  def : Pat<(alignedloadv16i16 addr:$src),
+            (VMOVAPSYrm addr:$src)>;
+  def : Pat<(alignedloadv32i8 addr:$src),
+            (VMOVAPSYrm addr:$src)>;
   def : Pat<(loadv4i64 addr:$src),
             (VMOVUPSYrm addr:$src)>;
+  def : Pat<(loadv8i32 addr:$src),
+            (VMOVUPSYrm addr:$src)>;
+  def : Pat<(loadv16i16 addr:$src),
+            (VMOVUPSYrm addr:$src)>;
+  def : Pat<(loadv32i8 addr:$src),
+            (VMOVUPSYrm addr:$src)>;
+
   def : Pat<(alignedstore (v4i64 VR256:$src), addr:$dst),
             (VMOVAPSYmr addr:$dst, VR256:$src)>;
   def : Pat<(alignedstore (v8i32 VR256:$src), addr:$dst),
@@ -615,8 +628,20 @@ let Predicates = [HasAVX, NoVLX] in {
 let Predicates = [UseSSE1] in {
   def : Pat<(alignedloadv2i64 addr:$src),
             (MOVAPSrm addr:$src)>;
+  def : Pat<(alignedloadv4i32 addr:$src),
+            (MOVAPSrm addr:$src)>;
+  def : Pat<(alignedloadv8i16 addr:$src),
+            (MOVAPSrm addr:$src)>;
+  def : Pat<(alignedloadv16i8 addr:$src),
+            (MOVAPSrm addr:$src)>;
   def : Pat<(loadv2i64 addr:$src),
             (MOVUPSrm addr:$src)>;
+  def : Pat<(loadv4i32 addr:$src),
+            (MOVUPSrm addr:$src)>;
+  def : Pat<(loadv8i16 addr:$src),
+            (MOVUPSrm addr:$src)>;
+  def : Pat<(loadv16i8 addr:$src),
+            (MOVUPSrm addr:$src)>;
 
   def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
             (MOVAPSmr addr:$dst, VR128:$src)>;
@@ -841,7 +866,7 @@ let hasSideEffects = 0 in {
   let mayLoad = 1 in
   def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm,
              [(set RC:$dst, (DstTy (sint_to_fp
-                                    (SrcTy (bitconvert (ld_frag addr:$src))))))], d>,
+                                    (SrcTy (ld_frag addr:$src)))))], d>,
              Sched<[sched.Folded]>;
 }
 }
@@ -1104,16 +1129,16 @@ defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
                                  ssmem, sse_load_f32, "cvtss2si",
                                  WriteCvtSS2I>, XS, REX_W;
 
-defm VCVTDQ2PS   : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, loadv2i64,
+defm VCVTDQ2PS   : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, load,
                                "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                                SSEPackedSingle, WriteCvtI2PS>,
                                PS, VEX, Requires<[HasAVX, NoVLX]>, VEX_WIG;
-defm VCVTDQ2PSY  : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, loadv4i64,
+defm VCVTDQ2PSY  : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, load,
                                "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                                SSEPackedSingle, WriteCvtI2PSY>,
                                PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, VEX_WIG;
 
-defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memopv2i64,
+defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memop,
                             "cvtdq2ps\t{$src, $dst|$dst, $src}",
                             SSEPackedSingle, WriteCvtI2PS>,
                             PS, Requires<[UseSSE2]>;
@@ -1672,7 +1697,7 @@ let hasSideEffects = 0, mayLoad = 1 in
 def VCVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                         "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
-                          (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))]>,
+                          (v2f64 (X86VSintToFP (loadv4i32 addr:$src))))]>,
                         VEX, Sched<[WriteCvtI2PDLd]>, VEX_WIG;
 def VCVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "vcvtdq2pd\t{$src, $dst|$dst, $src}",
@@ -1682,7 +1707,7 @@ def VCVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
 def VCVTDQ2PDYrm  : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
                          "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst,
-                           (v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))))]>,
+                           (v4f64 (sint_to_fp (loadv4i32 addr:$src))))]>,
                          VEX, VEX_L, Sched<[WriteCvtI2PDYLd]>,
                          VEX_WIG;
 def VCVTDQ2PDYrr  : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
@@ -1696,7 +1721,7 @@ let hasSideEffects = 0, mayLoad = 1 in
 def CVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                        "cvtdq2pd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
-                         (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))]>,
+                         (v2f64 (X86VSintToFP (loadv4i32 addr:$src))))]>,
                        Sched<[WriteCvtI2PDLd]>;
 def CVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvtdq2pd\t{$src, $dst|$dst, $src}",
@@ -2151,54 +2176,54 @@ multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
 }
 
 let Predicates = [HasAVX, NoVLX] in {
-defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32,
+defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, load,
       VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
-defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, loadv2f64,
+defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, load,
       VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX_4V, VEX_WIG;
-defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, loadv4f32,
+defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, load,
       VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
-defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, loadv2f64,
+defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, load,
       VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
 
-defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, loadv8f32,
+defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, load,
       VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
-defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, loadv4f64,
+defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, load,
       VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
-defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32,
+defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, load,
       VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
-defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64,
+defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, load,
       VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
 }// Predicates = [HasAVX, NoVLX]
 
 let Constraints = "$src1 = $dst" in {
-  defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32,
+  defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memop,
         VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
                        SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
-  defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64,
+  defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memop,
         VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
                        SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
-  defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32,
+  defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memop,
         VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
                        SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
-  defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64,
+  defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memop,
         VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
                        SchedWriteFShuffle.XMM, SSEPackedDouble>, PD;
 } // Constraints = "$src1 = $dst"
 
 let Predicates = [HasAVX1Only] in {
-  def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
+  def : Pat<(v8i32 (X86Unpckl VR256:$src1, (loadv8i32 addr:$src2))),
             (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
   def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
             (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
-  def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
+  def : Pat<(v8i32 (X86Unpckh VR256:$src1, (loadv8i32 addr:$src2))),
             (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
   def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
             (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
@@ -2284,8 +2309,7 @@ multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
        !if(Is2Addr,
            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-       [(set RC:$dst, (OpVT (OpNode RC:$src1,
-                                     (bitconvert (memop_frag addr:$src2)))))]>,
+       [(set RC:$dst, (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 } // ExeDomain = SSEPackedInt
@@ -2296,16 +2320,16 @@ multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
                          Predicate prd> {
 let Predicates = [HasAVX, prd] in
   defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
-                             VR128, loadv2i64, i128mem, sched.XMM,
+                             VR128, load, i128mem, sched.XMM,
                              IsCommutable, 0>, VEX_4V, VEX_WIG;
 
 let Constraints = "$src1 = $dst" in
   defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
-                           memopv2i64, i128mem, sched.XMM, IsCommutable, 1>;
+                           memop, i128mem, sched.XMM, IsCommutable, 1>;
 
 let Predicates = [HasAVX2, prd] in
   defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
-                               OpVT256, VR256, loadv4i64, i256mem, sched.YMM,
+                               OpVT256, VR256, load, i256mem, sched.YMM,
                                IsCommutable, 0>, VEX_4V, VEX_L, VEX_WIG;
 }
 
@@ -3423,6 +3447,19 @@ def : InstAlias<"movdqu.s\t{$src, $dst|$dst, $src}",
 
 let Predicates = [HasAVX, NoVLX] in {
   // Additional patterns for other integer sizes.
+  def : Pat<(alignedloadv4i32 addr:$src),
+            (VMOVDQArm addr:$src)>;
+  def : Pat<(alignedloadv8i16 addr:$src),
+            (VMOVDQArm addr:$src)>;
+  def : Pat<(alignedloadv16i8 addr:$src),
+            (VMOVDQArm addr:$src)>;
+  def : Pat<(loadv4i32 addr:$src),
+            (VMOVDQUrm addr:$src)>;
+  def : Pat<(loadv8i16 addr:$src),
+            (VMOVDQUrm addr:$src)>;
+  def : Pat<(loadv16i8 addr:$src),
+            (VMOVDQUrm addr:$src)>;
+
   def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
             (VMOVDQAmr addr:$dst, VR128:$src)>;
   def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
@@ -3462,7 +3499,7 @@ multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
        [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
-                                     (bitconvert (memop_frag addr:$src2)))))]>,
+                                     (memop_frag addr:$src2))))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 } // ExeDomain = SSEPackedInt
@@ -3522,28 +3559,28 @@ defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64,
 
 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
 defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
-                              loadv2i64, i128mem, SchedWriteVecIMul.XMM, 0>,
+                              load, i128mem, SchedWriteVecIMul.XMM, 0>,
                               VEX_4V, VEX_WIG;
 
 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
 defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16,
-                               VR256, loadv4i64, i256mem, SchedWriteVecIMul.YMM,
+                               VR256, load, i256mem, SchedWriteVecIMul.YMM,
                                0>, VEX_4V, VEX_L, VEX_WIG;
 let Constraints = "$src1 = $dst" in
 defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
-                             memopv2i64, i128mem, SchedWriteVecIMul.XMM>;
+                             memop, i128mem, SchedWriteVecIMul.XMM>;
 
 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
 defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
-                             loadv2i64, i128mem, SchedWritePSADBW.XMM, 0>,
+                             load, i128mem, SchedWritePSADBW.XMM, 0>,
                              VEX_4V, VEX_WIG;
 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
 defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
-                             loadv4i64, i256mem, SchedWritePSADBW.YMM, 0>,
+                             load, i256mem, SchedWritePSADBW.YMM, 0>,
                              VEX_4V, VEX_L, VEX_WIG;
 let Constraints = "$src1 = $dst" in
 defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
-                            memopv2i64, i128mem, SchedWritePSADBW.XMM>;
+                            memop, i128mem, SchedWritePSADBW.XMM>;
 
 //===---------------------------------------------------------------------===//
 // SSE2 - Packed Integer Logical Instructions
@@ -3570,7 +3607,7 @@ multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
        [(set RC:$dst, (DstVT (OpNode RC:$src1,
-                       (SrcVT (bitconvert (ld_frag addr:$src2))))))]>,
+                       (SrcVT (ld_frag addr:$src2)))))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
   def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
        (ins RC:$src1, u8imm:$src2),
@@ -3590,16 +3627,16 @@ multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm,
 let Predicates = [HasAVX, prd] in
   defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
                               OpNode, OpNode2, VR128, sched.XMM, schedImm.XMM,
-                              DstVT128, SrcVT, loadv2i64, 0>, VEX_4V, VEX_WIG;
+                              DstVT128, SrcVT, load, 0>, VEX_4V, VEX_WIG;
 let Predicates = [HasAVX2, prd] in
   defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
                                 OpNode, OpNode2, VR256, sched.YMM, schedImm.YMM,
-                                DstVT256, SrcVT, loadv2i64, 0>, VEX_4V, VEX_L,
+                                DstVT256, SrcVT, load, 0>, VEX_4V, VEX_L,
                                 VEX_WIG;
 let Constraints = "$src1 = $dst" in
   defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2,
                             VR128, sched.XMM, schedImm.XMM, DstVT128, SrcVT,
-                            memopv2i64>;
+                            memop>;
 }
 
 multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr,
@@ -3699,7 +3736,7 @@ let Predicates = [HasAVX, prd] in {
                       !strconcat("v", OpcodeStr,
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set VR128:$dst,
-                       (vt128 (OpNode (bitconvert (loadv2i64 addr:$src1)),
+                       (vt128 (OpNode (load addr:$src1),
                         (i8 imm:$src2))))]>, VEX,
                   Sched<[sched.XMM.Folded]>, VEX_WIG;
 }
@@ -3717,7 +3754,7 @@ let Predicates = [HasAVX2, prd] in {
                        !strconcat("v", OpcodeStr,
                                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR256:$dst,
-                        (vt256 (OpNode (bitconvert (loadv4i64 addr:$src1)),
+                        (vt256 (OpNode (load addr:$src1),
                          (i8 imm:$src2))))]>, VEX, VEX_L,
                    Sched<[sched.YMM.Folded]>, VEX_WIG;
 }
@@ -3735,7 +3772,7 @@ let Predicates = [UseSSE2] in {
                !strconcat(OpcodeStr,
                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                [(set VR128:$dst,
-                 (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)),
+                 (vt128 (OpNode (memop addr:$src1),
                         (i8 imm:$src2))))]>,
                Sched<[sched.XMM.Folded]>;
 }
@@ -3775,7 +3812,7 @@ multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
                               "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                [(set RC:$dst,
                      (OutVT (OpNode (ArgVT RC:$src1),
-                                    (bitconvert (ld_frag addr:$src2)))))]>,
+                                    (ld_frag addr:$src2))))]>,
                Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
@@ -3800,53 +3837,53 @@ multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                  [(set RC:$dst,
                        (OutVT (OpNode (ArgVT RC:$src1),
-                                      (bitconvert (ld_frag addr:$src2)))))]>,
+                                      (ld_frag addr:$src2))))]>,
                  Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
   defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, VR128,
-                             i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+                             i128mem, SchedWriteShuffle.XMM, load, 0>,
                              VEX_4V, VEX_WIG;
   defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, VR128,
-                             i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+                             i128mem, SchedWriteShuffle.XMM, load, 0>,
                              VEX_4V, VEX_WIG;
 
   defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, VR128,
-                             i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+                             i128mem, SchedWriteShuffle.XMM, load, 0>,
                              VEX_4V, VEX_WIG;
   defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128,
-                             i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+                             i128mem, SchedWriteShuffle.XMM, load, 0>,
                              VEX_4V;
 }
 
 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
   defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss, VR256,
-                              i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
+                              i256mem, SchedWriteShuffle.YMM, load, 0>,
                               VEX_4V, VEX_L, VEX_WIG;
   defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, VR256,
-                              i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
+                              i256mem, SchedWriteShuffle.YMM, load, 0>,
                               VEX_4V, VEX_L, VEX_WIG;
 
   defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus, VR256,
-                              i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
+                              i256mem, SchedWriteShuffle.YMM, load, 0>,
                               VEX_4V, VEX_L, VEX_WIG;
   defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, VR256,
-                              i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
+                              i256mem, SchedWriteShuffle.YMM, load, 0>,
                               VEX_4V, VEX_L;
 }
 
 let Constraints = "$src1 = $dst" in {
   defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, VR128,
-                            i128mem, SchedWriteShuffle.XMM, memopv2i64>;
+                            i128mem, SchedWriteShuffle.XMM, memop>;
   defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, VR128,
-                            i128mem, SchedWriteShuffle.XMM, memopv2i64>;
+                            i128mem, SchedWriteShuffle.XMM, memop>;
 
   defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, VR128,
-                            i128mem, SchedWriteShuffle.XMM, memopv2i64>;
+                            i128mem, SchedWriteShuffle.XMM, memop>;
 
   defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, VR128,
-                            i128mem, SchedWriteShuffle.XMM, memopv2i64>;
+                            i128mem, SchedWriteShuffle.XMM, memop>;
 }
 } // ExeDomain = SSEPackedInt
 
@@ -3871,89 +3908,88 @@ multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
       !if(Is2Addr,
           !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-      [(set RC:$dst, (vt (OpNode RC:$src1,
-                                  (bitconvert (ld_frag addr:$src2)))))]>,
+      [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
   defm VPUNPCKLBW  : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, VR128,
-                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
                                  VEX_4V, VEX_WIG;
   defm VPUNPCKLWD  : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, VR128,
-                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
                                  VEX_4V, VEX_WIG;
   defm VPUNPCKHBW  : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, VR128,
-                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
                                  VEX_4V, VEX_WIG;
   defm VPUNPCKHWD  : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, VR128,
-                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
                                  VEX_4V, VEX_WIG;
 }
 
 let Predicates = [HasAVX, NoVLX] in {
   defm VPUNPCKLDQ  : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, VR128,
-                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
                                  VEX_4V, VEX_WIG;
   defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, VR128,
-                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
                                  VEX_4V, VEX_WIG;
   defm VPUNPCKHDQ  : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, VR128,
-                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
                                  VEX_4V, VEX_WIG;
   defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, VR128,
-                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
                                  VEX_4V, VEX_WIG;
 }
 
 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
   defm VPUNPCKLBWY  : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl, VR256,
-                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
+                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPUNPCKLWDY  : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl, VR256,
-                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
+                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPUNPCKHBWY  : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh, VR256,
-                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
+                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPUNPCKHWDY  : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh, VR256,
-                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
+                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
 }
 
 let Predicates = [HasAVX2, NoVLX] in {
   defm VPUNPCKLDQY  : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl, VR256,
-                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
+                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, VR256,
-                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
+                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPUNPCKHDQY  : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh, VR256,
-                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
+                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPUNPCKHQDQY : sse2_unpack<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, VR256,
-                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
+                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
 }
 
 let Constraints = "$src1 = $dst" in {
   defm PUNPCKLBW  : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, VR128,
-                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
+                                i128mem, SchedWriteShuffle.XMM, memop>;
   defm PUNPCKLWD  : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, VR128,
-                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
+                                i128mem, SchedWriteShuffle.XMM, memop>;
   defm PUNPCKLDQ  : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, VR128,
-                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
+                                i128mem, SchedWriteShuffle.XMM, memop>;
   defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, VR128,
-                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
+                                i128mem, SchedWriteShuffle.XMM, memop>;
 
   defm PUNPCKHBW  : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, VR128,
-                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
+                                i128mem, SchedWriteShuffle.XMM, memop>;
   defm PUNPCKHWD  : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, VR128,
-                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
+                                i128mem, SchedWriteShuffle.XMM, memop>;
   defm PUNPCKHDQ  : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, VR128,
-                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
+                                i128mem, SchedWriteShuffle.XMM, memop>;
   defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, VR128,
-                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
+                                i128mem, SchedWriteShuffle.XMM, memop>;
 }
 } // ExeDomain = SSEPackedInt
 
@@ -4272,7 +4308,7 @@ let Predicates = [UseAVX] in {
             (VMOVDI2PDIrm addr:$src)>;
   def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
             (VMOVDI2PDIrm addr:$src)>;
-  def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
             (VMOVDI2PDIrm addr:$src)>;
   def : Pat<(v4i32 (X86vzload addr:$src)),
             (VMOVDI2PDIrm addr:$src)>;
@@ -4297,7 +4333,7 @@ let Predicates = [UseSSE2] in {
             (MOVDI2PDIrm addr:$src)>;
   def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
             (MOVDI2PDIrm addr:$src)>;
-  def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
             (MOVDI2PDIrm addr:$src)>;
   def : Pat<(v4i32 (X86vzload addr:$src)),
             (MOVDI2PDIrm addr:$src)>;
@@ -4452,30 +4488,30 @@ defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
 let Predicates = [HasAVX, NoVLX] in {
   def : Pat<(v4i32 (X86Movshdup VR128:$src)),
             (VMOVSHDUPrr VR128:$src)>;
-  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i32 (X86Movshdup (load addr:$src))),
             (VMOVSHDUPrm addr:$src)>;
   def : Pat<(v4i32 (X86Movsldup VR128:$src)),
             (VMOVSLDUPrr VR128:$src)>;
-  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i32 (X86Movsldup (load addr:$src))),
             (VMOVSLDUPrm addr:$src)>;
   def : Pat<(v8i32 (X86Movshdup VR256:$src)),
             (VMOVSHDUPYrr VR256:$src)>;
-  def : Pat<(v8i32 (X86Movshdup (bc_v8i32 (loadv4i64 addr:$src)))),
+  def : Pat<(v8i32 (X86Movshdup (load addr:$src))),
             (VMOVSHDUPYrm addr:$src)>;
   def : Pat<(v8i32 (X86Movsldup VR256:$src)),
             (VMOVSLDUPYrr VR256:$src)>;
-  def : Pat<(v8i32 (X86Movsldup (bc_v8i32 (loadv4i64 addr:$src)))),
+  def : Pat<(v8i32 (X86Movsldup (load addr:$src))),
             (VMOVSLDUPYrm addr:$src)>;
 }
 
 let Predicates = [UseSSE3] in {
   def : Pat<(v4i32 (X86Movshdup VR128:$src)),
             (MOVSHDUPrr VR128:$src)>;
-  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))),
+  def : Pat<(v4i32 (X86Movshdup (memop addr:$src))),
             (MOVSHDUPrm addr:$src)>;
   def : Pat<(v4i32 (X86Movsldup VR128:$src)),
             (MOVSLDUPrr VR128:$src)>;
-  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))),
+  def : Pat<(v4i32 (X86Movsldup (memop addr:$src))),
             (MOVSLDUPrm addr:$src)>;
 }
 
@@ -4697,7 +4733,7 @@ multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt,
                  (ins i128mem:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR128:$dst,
-                   (vt (OpNode (bitconvert (ld_frag addr:$src)))))]>,
+                   (vt (OpNode (ld_frag addr:$src))))]>,
                  Sched<[sched.XMM.Folded]>;
 }
 
@@ -4714,19 +4750,19 @@ multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
                   (ins i256mem:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(set VR256:$dst,
-                    (vt (OpNode (bitconvert (loadv4i64 addr:$src)))))]>,
+                    (vt (OpNode (load addr:$src))))]>,
                   Sched<[sched.YMM.Folded]>;
 }
 
 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
   defm VPABSB  : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, SchedWriteVecALU,
-                              loadv2i64>, VEX, VEX_WIG;
+                              load>, VEX, VEX_WIG;
   defm VPABSW  : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, SchedWriteVecALU,
-                              loadv2i64>, VEX, VEX_WIG;
+                              load>, VEX, VEX_WIG;
 }
 let Predicates = [HasAVX, NoVLX] in {
   defm VPABSD  : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, SchedWriteVecALU,
-                              loadv2i64>, VEX, VEX_WIG;
+                              load>, VEX, VEX_WIG;
 }
 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
   defm VPABSB  : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs, SchedWriteVecALU>,
@@ -4740,11 +4776,11 @@ let Predicates = [HasAVX2, NoVLX] in {
 }
 
 defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, SchedWriteVecALU,
-                          memopv2i64>;
+                          memop>;
 defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, SchedWriteVecALU,
-                          memopv2i64>;
+                          memop>;
 defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, SchedWriteVecALU,
-                          memopv2i64>;
+                          memop>;
 
 //===---------------------------------------------------------------------===//
 // SSSE3 - Packed Binary Operator Instructions
@@ -4769,8 +4805,7 @@ multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
        [(set RC:$dst,
-         (DstVT (OpNode (OpVT RC:$src1),
-          (bitconvert (memop_frag addr:$src2)))))]>,
+         (DstVT (OpNode (OpVT RC:$src1), (memop_frag addr:$src2))))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
@@ -4792,8 +4827,7 @@ multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
        [(set VR128:$dst,
-         (IntId128 VR128:$src1,
-          (bitconvert (ld_frag addr:$src2))))]>,
+         (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
@@ -4810,83 +4844,83 @@ multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
        (ins VR256:$src1, i256mem:$src2),
        !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
        [(set VR256:$dst,
-         (IntId256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2))))]>,
+         (IntId256 VR256:$src1, (load addr:$src2)))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
 let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
 let isCommutable = 0 in {
   defm VPSHUFB    : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8,
-                                  VR128, loadv2i64, i128mem,
+                                  VR128, load, i128mem,
                                   SchedWriteVarShuffle.XMM, 0>, VEX_4V, VEX_WIG;
   defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16,
-                                  v16i8, VR128, loadv2i64, i128mem,
+                                  v16i8, VR128, load, i128mem,
                                   SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
 }
 defm VPMULHRSW    : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16,
-                                  VR128, loadv2i64, i128mem,
+                                  VR128, load, i128mem,
                                   SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
 }
 
 let ImmT = NoImm, Predicates = [HasAVX] in {
 let isCommutable = 0 in {
   defm VPHADDW    : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128,
-                                  loadv2i64, i128mem,
+                                  load, i128mem,
                                   SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
   defm VPHADDD    : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128,
-                                  loadv2i64, i128mem,
+                                  load, i128mem,
                                   SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
   defm VPHSUBW    : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128,
-                                  loadv2i64, i128mem,
+                                  load, i128mem,
                                   SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
   defm VPHSUBD    : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128,
-                                  loadv2i64, i128mem,
+                                  load, i128mem,
                                   SchedWritePHAdd.XMM, 0>, VEX_4V;
   defm VPSIGNB    : SS3I_binop_rm_int<0x08, "vpsignb",
                                       int_x86_ssse3_psign_b_128,
-                                      SchedWriteVecALU.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG;
+                                      SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
   defm VPSIGNW    : SS3I_binop_rm_int<0x09, "vpsignw",
                                       int_x86_ssse3_psign_w_128,
-                                      SchedWriteVecALU.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG;
+                                      SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
   defm VPSIGND    : SS3I_binop_rm_int<0x0A, "vpsignd",
                                       int_x86_ssse3_psign_d_128,
-                                      SchedWriteVecALU.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG;
+                                      SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
   defm VPHADDSW   : SS3I_binop_rm_int<0x03, "vphaddsw",
                                       int_x86_ssse3_phadd_sw_128,
-                                      SchedWritePHAdd.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG;
+                                      SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
   defm VPHSUBSW   : SS3I_binop_rm_int<0x07, "vphsubsw",
                                       int_x86_ssse3_phsub_sw_128,
-                                      SchedWritePHAdd.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG;
+                                      SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
 }
 }
 
 let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
 let isCommutable = 0 in {
   defm VPSHUFBY   : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8,
-                                  VR256, loadv4i64, i256mem,
+                                  VR256, load, i256mem,
                                   SchedWriteVarShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
   defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16,
-                                   v32i8, VR256, loadv4i64, i256mem,
+                                   v32i8, VR256, load, i256mem,
                                    SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
 }
 defm VPMULHRSWY   : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16,
-                                  VR256, loadv4i64, i256mem,
+                                  VR256, load, i256mem,
                                   SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
 }
 
 let ImmT = NoImm, Predicates = [HasAVX2] in {
 let isCommutable = 0 in {
   defm VPHADDWY   : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16,
-                                  VR256, loadv4i64, i256mem,
+                                  VR256, load, i256mem,
                                   SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
   defm VPHADDDY   : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256,
-                                  loadv4i64, i256mem,
+                                  load, i256mem,
                                   SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
   defm VPHSUBWY   : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16,
-                                  VR256, loadv4i64, i256mem,
+                                  VR256, load, i256mem,
                                   SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
   defm VPHSUBDY   : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256,
-                                  loadv4i64, i256mem,
+                                  load, i256mem,
                                   SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L;
   defm VPSIGNB   : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
                                        SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
@@ -4907,33 +4941,33 @@ let isCommutable = 0 in {
 let ImmT = NoImm, Constraints = "$src1 = $dst" in {
 let isCommutable = 0 in {
   defm PHADDW    : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128,
-                                 memopv2i64, i128mem, SchedWritePHAdd.XMM>;
+                                 memop, i128mem, SchedWritePHAdd.XMM>;
   defm PHADDD    : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128,
-                                 memopv2i64, i128mem, SchedWritePHAdd.XMM>;
+                                 memop, i128mem, SchedWritePHAdd.XMM>;
   defm PHSUBW    : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128,
-                                 memopv2i64, i128mem, SchedWritePHAdd.XMM>;
+                                 memop, i128mem, SchedWritePHAdd.XMM>;
   defm PHSUBD    : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128,
-                                 memopv2i64, i128mem, SchedWritePHAdd.XMM>;
+                                 memop, i128mem, SchedWritePHAdd.XMM>;
   defm PSIGNB    : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128,
-                                     SchedWriteVecALU.XMM, memopv2i64>;
+                                     SchedWriteVecALU.XMM, memop>;
   defm PSIGNW    : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128,
-                                     SchedWriteVecALU.XMM, memopv2i64>;
+                                     SchedWriteVecALU.XMM, memop>;
   defm PSIGND    : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128,
-                                     SchedWriteVecALU.XMM, memopv2i64>;
+                                     SchedWriteVecALU.XMM, memop>;
   defm PSHUFB    : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
-                                 memopv2i64, i128mem, SchedWriteVarShuffle.XMM>;
+                                 memop, i128mem, SchedWriteVarShuffle.XMM>;
   defm PHADDSW   : SS3I_binop_rm_int<0x03, "phaddsw",
                                      int_x86_ssse3_phadd_sw_128,
-                                     SchedWritePHAdd.XMM, memopv2i64>;
+                                     SchedWritePHAdd.XMM, memop>;
   defm PHSUBSW   : SS3I_binop_rm_int<0x07, "phsubsw",
                                      int_x86_ssse3_phsub_sw_128,
-                                     SchedWritePHAdd.XMM, memopv2i64>;
+                                     SchedWritePHAdd.XMM, memop>;
   defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
-                                 v16i8, VR128, memopv2i64, i128mem,
+                                 v16i8, VR128, memop, i128mem,
                                  SchedWriteVecIMul.XMM>;
 }
 defm PMULHRSW    : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
-                                 VR128, memopv2i64, i128mem, SchedWriteVecIMul.XMM>;
+                                 VR128, memop, i128mem, SchedWriteVecIMul.XMM>;
 }
 
 //===---------------------------------------------------------------------===//
@@ -4960,20 +4994,20 @@ multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
         !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
       [(set RC:$dst, (VT (X86PAlignr RC:$src1,
-                                     (bitconvert (memop_frag addr:$src2)),
+                                     (memop_frag addr:$src2),
                                      (i8 imm:$src3))))]>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
   }
 }
 
 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
-  defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, loadv2i64, i128mem,
+  defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, load, i128mem,
                                 SchedWriteShuffle.XMM, 0>, VEX_4V, VEX_WIG;
 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
-  defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, loadv4i64, i256mem,
+  defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, load, i256mem,
                                  SchedWriteShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
 let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
-  defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memopv2i64, i128mem,
+  defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memop, i128mem,
                                SchedWriteShuffle.XMM>;
 
 //===---------------------------------------------------------------------===//
@@ -5097,7 +5131,7 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtO
 
   // AVX2 Register-Memory patterns
   let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
-  def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
   def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
@@ -5111,7 +5145,7 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtO
             (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
   def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
-  def : Pat<(v8i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v8i32 (ExtOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
 
   def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
@@ -5120,10 +5154,10 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtO
             (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
   def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i64 (ExtOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
 
-  def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+  def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
             (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
   def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
@@ -5136,10 +5170,10 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtO
             (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
   def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i64 (ExtOp (loadv8i16 addr:$src))),
             (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
 
-  def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
             (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
   def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
@@ -5199,7 +5233,7 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
             (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
   def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
-  def : Pat<(v8i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v8i16 (ExtOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
   }
   let Predicates = [HasAVX, NoVLX] in {
@@ -5209,7 +5243,7 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
             (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
   def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
-  def : Pat<(v4i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i32 (ExtOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
 
   def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
@@ -5218,7 +5252,7 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
             (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
   def : Pat<(v2i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
-  def : Pat<(v2i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v2i64 (ExtOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
 
   def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
@@ -5229,7 +5263,7 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
             (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
   def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
-  def : Pat<(v4i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i32 (ExtOp (loadv8i16 addr:$src))),
             (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
 
   def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
@@ -5238,7 +5272,7 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
             (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
   def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
-  def : Pat<(v2i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+  def : Pat<(v2i64 (ExtOp (loadv8i16 addr:$src))),
             (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
 
   def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
@@ -5249,7 +5283,7 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
             (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
   def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
-  def : Pat<(v2i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
+  def : Pat<(v2i64 (ExtOp (loadv4i32 addr:$src))),
             (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
   }
 }
@@ -6067,7 +6101,7 @@ multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
                   (ins i128mem:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(set VR128:$dst,
-                    (v8i16 (OpNode (v8i16 (bitconvert (ld_frag addr:$src))))))]>,
+                    (v8i16 (OpNode (ld_frag addr:$src))))]>,
                  Sched<[Sched.Folded]>;
 }
 
@@ -6075,10 +6109,10 @@ multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
 // model, although the naming is misleading.
 let Predicates = [HasAVX] in
 defm VPHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "vphminposuw",
-                                         X86phminpos, loadv2i64,
+                                         X86phminpos, load,
                                          WritePHMINPOS>, VEX, VEX_WIG;
 defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw",
-                                         X86phminpos, memopv2i64,
+                                         X86phminpos, memop,
                                          WritePHMINPOS>;
 
 /// SS48I_binop_rm - Simple SSE41 binary operator.
@@ -6100,118 +6134,118 @@ multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
        [(set RC:$dst,
-         (OpVT (OpNode RC:$src1, (bitconvert (memop_frag addr:$src2)))))]>,
+         (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
 let Predicates = [HasAVX, NoVLX] in {
   defm VPMINSD   : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
-                                  loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
+                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                   VEX_4V, VEX_WIG;
   defm VPMINUD   : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
-                                  loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
+                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                   VEX_4V, VEX_WIG;
   defm VPMAXSD   : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
-                                  loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
+                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                   VEX_4V, VEX_WIG;
   defm VPMAXUD   : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
-                                  loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
+                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                   VEX_4V, VEX_WIG;
   defm VPMULDQ   : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128,
-                                  loadv2i64, i128mem, SchedWriteVecIMul.XMM, 0>,
+                                  load, i128mem, SchedWriteVecIMul.XMM, 0>,
                                   VEX_4V, VEX_WIG;
 }
 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
   defm VPMINSB   : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
-                                  loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
+                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                   VEX_4V, VEX_WIG;
   defm VPMINUW   : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
-                                  loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
+                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                   VEX_4V, VEX_WIG;
   defm VPMAXSB   : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
-                                  loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
+                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                   VEX_4V, VEX_WIG;
   defm VPMAXUW   : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
-                                  loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
+                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                   VEX_4V, VEX_WIG;
 }
 
 let Predicates = [HasAVX2, NoVLX] in {
   defm VPMINSDY  : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
-                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
+                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPMINUDY  : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
-                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
+                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPMAXSDY  : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
-                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
+                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPMAXUDY  : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
-                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
+                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPMULDQY  : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256,
-                                  loadv4i64, i256mem, SchedWriteVecIMul.YMM, 0>,
+                                  load, i256mem, SchedWriteVecIMul.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
 }
 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
   defm VPMINSBY  : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
-                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
+                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPMINUWY  : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
-                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
+                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPMAXSBY  : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
-                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
+                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPMAXUWY  : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
-                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
+                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
 }
 
 let Constraints = "$src1 = $dst" in {
   defm PMINSB   : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128,
-                                 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
+                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
   defm PMINSD   : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128,
-                                 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
+                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
   defm PMINUD   : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128,
-                                 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
+                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
   defm PMINUW   : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128,
-                                 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
+                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
   defm PMAXSB   : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128,
-                                 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
+                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
   defm PMAXSD   : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128,
-                                 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
+                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
   defm PMAXUD   : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128,
-                                 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
+                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
   defm PMAXUW   : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128,
-                                 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
+                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
   defm PMULDQ   : SS48I_binop_rm<0x28, "pmuldq", X86pmuldq, v2i64, VR128,
-                                 memopv2i64, i128mem, SchedWriteVecIMul.XMM, 1>;
+                                 memop, i128mem, SchedWriteVecIMul.XMM, 1>;
 }
 
 let Predicates = [HasAVX, NoVLX] in
   defm VPMULLD  : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
-                                 loadv2i64, i128mem, SchedWritePMULLD.XMM, 0>,
+                                 load, i128mem, SchedWritePMULLD.XMM, 0>,
                                  VEX_4V, VEX_WIG;
 let Predicates = [HasAVX] in
   defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
-                                 loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
+                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                                  VEX_4V, VEX_WIG;
 
 let Predicates = [HasAVX2, NoVLX] in
   defm VPMULLDY  : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
-                                  loadv4i64, i256mem, SchedWritePMULLD.YMM, 0>,
+                                  load, i256mem, SchedWritePMULLD.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
 let Predicates = [HasAVX2] in
   defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
-                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
+                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
 
 let Constraints = "$src1 = $dst" in {
   defm PMULLD  : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
-                                memopv2i64, i128mem, SchedWritePMULLD.XMM, 1>;
+                                memop, i128mem, SchedWritePMULLD.XMM, 1>;
   defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
-                                memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
+                                memop, i128mem, SchedWriteVecALU.XMM, 1>;
 }
 
 /// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
@@ -6237,8 +6271,7 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
             !strconcat(OpcodeStr,
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
         [(set RC:$dst,
-          (IntId RC:$src1,
-           (bitconvert (memop_frag addr:$src2)), imm:$src3))]>,
+          (IntId RC:$src1, (memop_frag addr:$src2), imm:$src3))]>,
         Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
@@ -6265,8 +6298,7 @@ multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
             !strconcat(OpcodeStr,
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
         [(set RC:$dst,
-          (OpVT (OpNode RC:$src1,
-                 (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>,
+          (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), imm:$src3)))]>,
         Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
@@ -6288,28 +6320,28 @@ def BlendCommuteImm8 : SDNodeXForm<imm, [{
 let Predicates = [HasAVX] in {
   let isCommutable = 0 in {
     defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
-                                        VR128, loadv2i64, i128mem, 0,
+                                        VR128, load, i128mem, 0,
                                         SchedWriteMPSAD.XMM>, VEX_4V, VEX_WIG;
   }
 
   let ExeDomain = SSEPackedSingle in
   defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
-                                   VR128, loadv4f32, f128mem, 0,
+                                   VR128, load, f128mem, 0,
                                    SchedWriteDPPS.XMM>, VEX_4V, VEX_WIG;
   let ExeDomain = SSEPackedDouble in
   defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
-                                   VR128, loadv2f64, f128mem, 0,
+                                   VR128, load, f128mem, 0,
                                    SchedWriteDPPD.XMM>, VEX_4V, VEX_WIG;
   let ExeDomain = SSEPackedSingle in
   defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
-                                    VR256, loadv8f32, i256mem, 0,
+                                    VR256, load, i256mem, 0,
                                     SchedWriteDPPS.YMM>, VEX_4V, VEX_L, VEX_WIG;
 }
 
 let Predicates = [HasAVX2] in {
   let isCommutable = 0 in {
   defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
-                                  VR256, loadv4i64, i256mem, 0,
+                                  VR256, load, i256mem, 0,
                                   SchedWriteMPSAD.YMM>, VEX_4V, VEX_L, VEX_WIG;
   }
 }
@@ -6317,17 +6349,17 @@ let Predicates = [HasAVX2] in {
 let Constraints = "$src1 = $dst" in {
   let isCommutable = 0 in {
   defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
-                                     VR128, memopv2i64, i128mem, 1,
+                                     VR128, memop, i128mem, 1,
                                      SchedWriteMPSAD.XMM>;
   }
 
   let ExeDomain = SSEPackedSingle in
   defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
-                                  VR128, memopv4f32, f128mem, 1,
+                                  VR128, memop, f128mem, 1,
                                   SchedWriteDPPS.XMM>;
   let ExeDomain = SSEPackedDouble in
   defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
-                                  VR128, memopv2f64, f128mem, 1,
+                                  VR128, memop, f128mem, 1,
                                   SchedWriteDPPD.XMM>;
 }
 
@@ -6355,56 +6387,54 @@ let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
             !strconcat(OpcodeStr,
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
         [(set RC:$dst,
-          (OpVT (OpNode RC:$src1,
-                 (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>,
+          (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), imm:$src3)))]>,
         Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
   // Pattern to commute if load is in first source.
-  def : Pat<(OpVT (OpNode (bitconvert (memop_frag addr:$src2)),
-                          RC:$src1, imm:$src3)),
+  def : Pat<(OpVT (OpNode (memop_frag addr:$src2), RC:$src1, imm:$src3)),
             (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
                                             (commuteXForm imm:$src3))>;
 }
 
 let Predicates = [HasAVX] in {
   defm VBLENDPS : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32,
-                                  VR128, loadv4f32, f128mem, 0, SSEPackedSingle,
+                                  VR128, load, f128mem, 0, SSEPackedSingle,
                                   SchedWriteFBlend.XMM, BlendCommuteImm4>,
                                   VEX_4V, VEX_WIG;
   defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32,
-                                   VR256, loadv8f32, f256mem, 0, SSEPackedSingle,
+                                   VR256, load, f256mem, 0, SSEPackedSingle,
                                    SchedWriteFBlend.YMM, BlendCommuteImm8>,
                                    VEX_4V, VEX_L, VEX_WIG;
   defm VBLENDPD : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
-                                  VR128, loadv2f64, f128mem, 0, SSEPackedDouble,
+                                  VR128, load, f128mem, 0, SSEPackedDouble,
                                   SchedWriteFBlend.XMM, BlendCommuteImm2>,
                                   VEX_4V, VEX_WIG;
   defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
-                                   VR256, loadv4f64, f256mem, 0, SSEPackedDouble,
+                                   VR256, load, f256mem, 0, SSEPackedDouble,
                                    SchedWriteFBlend.YMM, BlendCommuteImm4>,
                                    VEX_4V, VEX_L, VEX_WIG;
   defm VPBLENDW : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
-                                  VR128, loadv2i64, i128mem, 0, SSEPackedInt,
+                                  VR128, load, i128mem, 0, SSEPackedInt,
                                   SchedWriteBlend.XMM, BlendCommuteImm8>,
                                   VEX_4V, VEX_WIG;
 }
 
 let Predicates = [HasAVX2] in {
   defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
-                                   VR256, loadv4i64, i256mem, 0, SSEPackedInt,
+                                   VR256, load, i256mem, 0, SSEPackedInt,
                                    SchedWriteBlend.YMM, BlendCommuteImm8>,
                                    VEX_4V, VEX_L, VEX_WIG;
 }
 
 defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32,
-                               VR128, memopv4f32, f128mem, 1, SSEPackedSingle,
+                               VR128, memop, f128mem, 1, SSEPackedSingle,
                                SchedWriteFBlend.XMM, BlendCommuteImm4>;
 defm BLENDPD : SS41I_blend_rmi<0x0D, "blendpd", X86Blendi, v2f64,
-                               VR128, memopv2f64, f128mem, 1, SSEPackedDouble,
+                               VR128, memop, f128mem, 1, SSEPackedDouble,
                                SchedWriteFBlend.XMM, BlendCommuteImm2>;
 defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16,
-                               VR128, memopv2i64, i128mem, 1, SSEPackedInt,
+                               VR128, memop, i128mem, 1, SSEPackedInt,
                                SchedWriteBlend.XMM, BlendCommuteImm8>;
 
 // For insertion into the zero index (low half) of a 256-bit vector, it is
@@ -6438,7 +6468,7 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
                   !strconcat(OpcodeStr,
                     "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                   [(set RC:$dst,
-                        (IntId RC:$src1, (bitconvert (mem_frag addr:$src2)),
+                        (IntId RC:$src1, (mem_frag addr:$src2),
                                RC:$src3))], SSEPackedInt>, TAPD, VEX_4V,
                 Sched<[sched.Folded, sched.ReadAfterFold,
                        // x86memop:$src2
@@ -6451,7 +6481,7 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
 let Predicates = [HasAVX] in {
 let ExeDomain = SSEPackedDouble in {
 defm VBLENDVPD  : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem,
-                                           loadv2f64, int_x86_sse41_blendvpd,
+                                           load, int_x86_sse41_blendvpd,
                                            SchedWriteFVarBlend.XMM>;
 defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
                                   loadv4f64, int_x86_avx_blendv_pd_256,
@@ -6459,20 +6489,20 @@ defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
 } // ExeDomain = SSEPackedDouble
 let ExeDomain = SSEPackedSingle in {
 defm VBLENDVPS  : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem,
-                                           loadv4f32, int_x86_sse41_blendvps,
+                                           load, int_x86_sse41_blendvps,
                                            SchedWriteFVarBlend.XMM>;
 defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem,
                                   loadv8f32, int_x86_avx_blendv_ps_256,
                                   SchedWriteFVarBlend.YMM>, VEX_L;
 } // ExeDomain = SSEPackedSingle
 defm VPBLENDVB  : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem,
-                                           loadv2i64, int_x86_sse41_pblendvb,
+                                           load, int_x86_sse41_pblendvb,
                                            SchedWriteVarBlend.XMM>;
 }
 
 let Predicates = [HasAVX2] in {
 defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem,
-                                      loadv4i64, int_x86_avx2_pblendvb,
+                                      load, int_x86_avx2_pblendvb,
                                       SchedWriteVarBlend.YMM>, VEX_L;
 }
 
@@ -6603,18 +6633,18 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in {
                      "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
                     [(set VR128:$dst,
                       (IntId VR128:$src1,
-                       (bitconvert (mem_frag addr:$src2)), XMM0))]>,
+                       (mem_frag addr:$src2), XMM0))]>,
                     Sched<[sched.Folded, sched.ReadAfterFold]>;
   }
 }
 
 let ExeDomain = SSEPackedDouble in
-defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, f128mem,
+defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memop, f128mem,
                                   int_x86_sse41_blendvpd, SchedWriteFVarBlend.XMM>;
 let ExeDomain = SSEPackedSingle in
-defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, f128mem,
+defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memop, f128mem,
                                   int_x86_sse41_blendvps, SchedWriteFVarBlend.XMM>;
-defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem,
+defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memop, i128mem,
                                   int_x86_sse41_pblendvb, SchedWriteVarBlend.XMM>;
 
 // Aliases with the implicit xmm0 argument
@@ -6670,6 +6700,12 @@ let Predicates = [HasAVX2, NoVLX] in {
             (VMOVNTDQAYrm addr:$src)>;
   def : Pat<(v4i64 (alignednontemporalload addr:$src)),
             (VMOVNTDQAYrm addr:$src)>;
+  def : Pat<(v8i32 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAYrm addr:$src)>;
+  def : Pat<(v16i16 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAYrm addr:$src)>;
+  def : Pat<(v32i8 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAYrm addr:$src)>;
 }
 
 let Predicates = [HasAVX, NoVLX] in {
@@ -6679,6 +6715,12 @@ let Predicates = [HasAVX, NoVLX] in {
             (VMOVNTDQArm addr:$src)>;
   def : Pat<(v2i64 (alignednontemporalload addr:$src)),
             (VMOVNTDQArm addr:$src)>;
+  def : Pat<(v4i32 (alignednontemporalload addr:$src)),
+            (VMOVNTDQArm addr:$src)>;
+  def : Pat<(v8i16 (alignednontemporalload addr:$src)),
+            (VMOVNTDQArm addr:$src)>;
+  def : Pat<(v16i8 (alignednontemporalload addr:$src)),
+            (VMOVNTDQArm addr:$src)>;
 }
 
 let Predicates = [UseSSE41] in {
@@ -6688,6 +6730,12 @@ let Predicates = [UseSSE41] in {
             (MOVNTDQArm addr:$src)>;
   def : Pat<(v2i64 (alignednontemporalload addr:$src)),
             (MOVNTDQArm addr:$src)>;
+  def : Pat<(v4i32 (alignednontemporalload addr:$src)),
+            (MOVNTDQArm addr:$src)>;
+  def : Pat<(v8i16 (alignednontemporalload addr:$src)),
+            (MOVNTDQArm addr:$src)>;
+  def : Pat<(v16i8 (alignednontemporalload addr:$src)),
+            (MOVNTDQArm addr:$src)>;
 }
 
 } // AddedComplexity
@@ -6720,17 +6768,17 @@ multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
 
 let Predicates = [HasAVX] in
   defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
-                                 loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
+                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                                  VEX_4V, VEX_WIG;
 
 let Predicates = [HasAVX2] in
   defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
-                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
+                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
 
 let Constraints = "$src1 = $dst" in
   defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
-                                memopv2i64, i128mem, SchedWriteVecALU.XMM>;
+                                memop, i128mem, SchedWriteVecALU.XMM>;
 
 //===----------------------------------------------------------------------===//
 // SSE4.2 - String/text Processing Instructions
@@ -6881,9 +6929,9 @@ multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
              [!if(UsesXMM0,
                   (set VR128:$dst, (IntId VR128:$src1,
-                    (bc_v4i32 (memopv2i64 addr:$src2)), XMM0)),
+                    (memop addr:$src2), XMM0)),
                   (set VR128:$dst, (IntId VR128:$src1,
-                    (bc_v4i32 (memopv2i64 addr:$src2)))))]>, T8,
+                    (memop addr:$src2))))]>, T8,
              Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
@@ -6900,7 +6948,7 @@ let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
                          "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                          [(set VR128:$dst,
                            (int_x86_sha1rnds4 VR128:$src1,
-                            (bc_v4i32 (memopv2i64 addr:$src2)),
+                            (memop addr:$src2),
                             (i8 imm:$src3)))]>, TA,
                          Sched<[SchedWriteVecIMul.XMM.Folded,
                                 SchedWriteVecIMul.XMM.ReadAfterFold]>;
@@ -6953,39 +7001,39 @@ multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
 // Perform One Round of an AES Encryption/Decryption Flow
 let Predicates = [HasAVX, NoVLX_Or_NoVAES, HasAES] in {
   defm VAESENC          : AESI_binop_rm_int<0xDC, "vaesenc",
-                         int_x86_aesni_aesenc, loadv2i64>, VEX_4V, VEX_WIG;
+                         int_x86_aesni_aesenc, load>, VEX_4V, VEX_WIG;
   defm VAESENCLAST      : AESI_binop_rm_int<0xDD, "vaesenclast",
-                         int_x86_aesni_aesenclast, loadv2i64>, VEX_4V, VEX_WIG;
+                         int_x86_aesni_aesenclast, load>, VEX_4V, VEX_WIG;
   defm VAESDEC          : AESI_binop_rm_int<0xDE, "vaesdec",
-                         int_x86_aesni_aesdec, loadv2i64>, VEX_4V, VEX_WIG;
+                         int_x86_aesni_aesdec, load>, VEX_4V, VEX_WIG;
   defm VAESDECLAST      : AESI_binop_rm_int<0xDF, "vaesdeclast",
-                         int_x86_aesni_aesdeclast, loadv2i64>, VEX_4V, VEX_WIG;
+                         int_x86_aesni_aesdeclast, load>, VEX_4V, VEX_WIG;
 }
 
 let Predicates = [NoVLX, HasVAES] in {
   defm VAESENCY         : AESI_binop_rm_int<0xDC, "vaesenc",
-                         int_x86_aesni_aesenc_256, loadv4i64, 0, VR256,
+                         int_x86_aesni_aesenc_256, load, 0, VR256,
                          i256mem>, VEX_4V, VEX_L, VEX_WIG;
   defm VAESENCLASTY     : AESI_binop_rm_int<0xDD, "vaesenclast",
-                         int_x86_aesni_aesenclast_256, loadv4i64, 0, VR256,
+                         int_x86_aesni_aesenclast_256, load, 0, VR256,
                          i256mem>, VEX_4V, VEX_L, VEX_WIG;
   defm VAESDECY         : AESI_binop_rm_int<0xDE, "vaesdec",
-                         int_x86_aesni_aesdec_256, loadv4i64, 0, VR256,
+                         int_x86_aesni_aesdec_256, load, 0, VR256,
                          i256mem>, VEX_4V, VEX_L, VEX_WIG;
   defm VAESDECLASTY     : AESI_binop_rm_int<0xDF, "vaesdeclast",
-                         int_x86_aesni_aesdeclast_256, loadv4i64, 0, VR256,
+                         int_x86_aesni_aesdeclast_256, load, 0, VR256,
                          i256mem>, VEX_4V, VEX_L, VEX_WIG;
 }
 
 let Constraints = "$src1 = $dst" in {
   defm AESENC          : AESI_binop_rm_int<0xDC, "aesenc",
-                         int_x86_aesni_aesenc, memopv2i64, 1>;
+                         int_x86_aesni_aesenc, memop, 1>;
   defm AESENCLAST      : AESI_binop_rm_int<0xDD, "aesenclast",
-                         int_x86_aesni_aesenclast, memopv2i64, 1>;
+                         int_x86_aesni_aesenclast, memop, 1>;
   defm AESDEC          : AESI_binop_rm_int<0xDE, "aesdec",
-                         int_x86_aesni_aesdec, memopv2i64, 1>;
+                         int_x86_aesni_aesdec, memop, 1>;
   defm AESDECLAST      : AESI_binop_rm_int<0xDF, "aesdeclast",
-                         int_x86_aesni_aesdeclast, memopv2i64, 1>;
+                         int_x86_aesni_aesdeclast, memop, 1>;
 }
 
 // Perform the AES InvMixColumn Transformation
@@ -6999,7 +7047,7 @@ let Predicates = [HasAVX, HasAES] in {
   def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
       (ins i128mem:$src1),
       "vaesimc\t{$src1, $dst|$dst, $src1}",
-      [(set VR128:$dst, (int_x86_aesni_aesimc (loadv2i64 addr:$src1)))]>,
+      [(set VR128:$dst, (int_x86_aesni_aesimc (load addr:$src1)))]>,
       Sched<[WriteAESIMC.Folded]>, VEX, VEX_WIG;
 }
 def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
@@ -7010,7 +7058,7 @@ def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
 def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
   (ins i128mem:$src1),
   "aesimc\t{$src1, $dst|$dst, $src1}",
-  [(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>,
+  [(set VR128:$dst, (int_x86_aesni_aesimc (memop addr:$src1)))]>,
   Sched<[WriteAESIMC.Folded]>;
 
 // AES Round Key Generation Assist
@@ -7025,7 +7073,7 @@ let Predicates = [HasAVX, HasAES] in {
       (ins i128mem:$src1, u8imm:$src2),
       "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
       [(set VR128:$dst,
-        (int_x86_aesni_aeskeygenassist (loadv2i64 addr:$src1), imm:$src2))]>,
+        (int_x86_aesni_aeskeygenassist (load addr:$src1), imm:$src2))]>,
       Sched<[WriteAESKeyGen.Folded]>, VEX, VEX_WIG;
 }
 def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
@@ -7038,7 +7086,7 @@ def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
   (ins i128mem:$src1, u8imm:$src2),
   "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
   [(set VR128:$dst,
-    (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>,
+    (int_x86_aesni_aeskeygenassist (memop addr:$src1), imm:$src2))]>,
   Sched<[WriteAESKeyGen.Folded]>;
 
 //===----------------------------------------------------------------------===//
@@ -7066,12 +7114,12 @@ let Predicates = [NoAVX, HasPCLMUL] in {
               (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
               "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
               [(set VR128:$dst,
-                 (int_x86_pclmulqdq VR128:$src1, (memopv2i64 addr:$src2),
+                 (int_x86_pclmulqdq VR128:$src1, (memop addr:$src2),
                   imm:$src3))]>,
               Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
   } // Constraints = "$src1 = $dst"
 
-  def : Pat<(int_x86_pclmulqdq (memopv2i64 addr:$src2), VR128:$src1,
+  def : Pat<(int_x86_pclmulqdq (memop addr:$src2), VR128:$src1,
                                 (i8 imm:$src3)),
             (PCLMULQDQrm VR128:$src1, addr:$src2,
                           (PCLMULCommuteImm imm:$src3))>;
@@ -7114,11 +7162,11 @@ multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp,
 }
 
 let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in
-defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, loadv2i64,
+defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, load,
                              int_x86_pclmulqdq>, VEX_4V, VEX_WIG;
 
 let Predicates = [NoVLX, HasVPCLMULQDQ] in
-defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, loadv4i64,
+defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, load,
                               int_x86_pclmulqdq_256>, VEX_4V, VEX_L, VEX_WIG;
 
 multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC,
@@ -7274,11 +7322,11 @@ def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
 let Predicates = [HasAVX2, NoVLX] in {
 def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
           (VBROADCASTI128 addr:$src)>;
-def : Pat<(v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src)))),
+def : Pat<(v8i32 (X86SubVBroadcast (loadv4i32 addr:$src))),
           (VBROADCASTI128 addr:$src)>;
-def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
+def : Pat<(v16i16 (X86SubVBroadcast (loadv8i16 addr:$src))),
           (VBROADCASTI128 addr:$src)>;
-def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
+def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))),
           (VBROADCASTI128 addr:$src)>;
 }
 
@@ -7292,11 +7340,11 @@ def : Pat<(v8f32 (X86SubVBroadcast (loadv4f32 addr:$src))),
 let Predicates = [HasAVX1Only] in {
 def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
           (VBROADCASTF128 addr:$src)>;
-def : Pat<(v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src)))),
+def : Pat<(v8i32 (X86SubVBroadcast (loadv4i32 addr:$src))),
           (VBROADCASTF128 addr:$src)>;
-def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
+def : Pat<(v16i16 (X86SubVBroadcast (loadv8i16 addr:$src))),
           (VBROADCASTF128 addr:$src)>;
-def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
+def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))),
           (VBROADCASTF128 addr:$src)>;
 }
 
@@ -7329,7 +7377,7 @@ multiclass vinsert_lowering<string InstrStr, ValueType From, ValueType To,
             (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR128:$src2,
                                        (INSERT_get_vinsert128_imm VR256:$ins))>;
   def : Pat<(vinsert128_insert:$ins (To VR256:$src1),
-                                    (From (bitconvert (memop_frag addr:$src2))),
+                                    (From (memop_frag addr:$src2)),
                                     (iPTR imm)),
             (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
                                        (INSERT_get_vinsert128_imm VR256:$ins))>;
@@ -7342,9 +7390,9 @@ let Predicates = [HasAVX, NoVLX] in {
 
 let Predicates = [HasAVX1Only] in {
   defm : vinsert_lowering<"VINSERTF128", v2i64, v4i64,  loadv2i64>;
-  defm : vinsert_lowering<"VINSERTF128", v4i32, v8i32,  loadv2i64>;
-  defm : vinsert_lowering<"VINSERTF128", v8i16, v16i16, loadv2i64>;
-  defm : vinsert_lowering<"VINSERTF128", v16i8, v32i8,  loadv2i64>;
+  defm : vinsert_lowering<"VINSERTF128", v4i32, v8i32,  loadv4i32>;
+  defm : vinsert_lowering<"VINSERTF128", v8i16, v16i16, loadv8i16>;
+  defm : vinsert_lowering<"VINSERTF128", v16i8, v32i8,  loadv16i8>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -7433,7 +7481,7 @@ defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
 
 multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
                       RegisterClass RC, X86MemOperand x86memop_f,
-                      X86MemOperand x86memop_i, PatFrag i_frag,
+                      X86MemOperand x86memop_i,
                       ValueType f_vt, ValueType i_vt,
                       X86FoldableSchedWrite sched,
                       X86FoldableSchedWrite varsched> {
@@ -7447,7 +7495,7 @@ multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
                (ins RC:$src1, x86memop_i:$src2),
                !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1,
-                              (i_vt (bitconvert (i_frag addr:$src2))))))]>, VEX_4V,
+                              (i_vt (load addr:$src2)))))]>, VEX_4V,
                Sched<[varsched.Folded, sched.ReadAfterFold]>;
 
     def ri  : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
@@ -7466,18 +7514,18 @@ multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
 
 let ExeDomain = SSEPackedSingle in {
   defm VPERMILPS  : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
-                               loadv2i64, v4f32, v4i32, SchedWriteFShuffle.XMM,
+                               v4f32, v4i32, SchedWriteFShuffle.XMM,
                                SchedWriteFVarShuffle.XMM>;
   defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
-                               loadv4i64, v8f32, v8i32, SchedWriteFShuffle.YMM,
+                               v8f32, v8i32, SchedWriteFShuffle.YMM,
                                SchedWriteFVarShuffle.YMM>, VEX_L;
 }
 let ExeDomain = SSEPackedDouble in {
   defm VPERMILPD  : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
-                               loadv2i64, v2f64, v2i64, SchedWriteFShuffle.XMM,
+                               v2f64, v2i64, SchedWriteFShuffle.XMM,
                                SchedWriteFVarShuffle.XMM>;
   defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
-                               loadv4i64, v4f64, v4i64, SchedWriteFShuffle.YMM,
+                               v4f64, v4i64, SchedWriteFShuffle.YMM,
                                SchedWriteFVarShuffle.YMM>, VEX_L;
 }
 
@@ -7558,8 +7606,7 @@ multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop,
   let hasSideEffects = 0, mayLoad = 1 in
   def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
              "vcvtph2ps\t{$src, $dst|$dst, $src}",
-             [(set RC:$dst, (X86cvtph2ps (bc_v8i16
-                                          (loadv2i64 addr:$src))))]>,
+             [(set RC:$dst, (X86cvtph2ps (loadv8i16 addr:$src)))]>,
              T8PD, VEX, Sched<[sched.Folded]>;
 }
 
@@ -7633,7 +7680,7 @@ let Predicates = [HasF16C, NoVLX] in {
 /// AVX2_blend_rmi - AVX2 blend with 8-bit immediate
 multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           ValueType OpVT, X86FoldableSchedWrite sched,
-                          RegisterClass RC, PatFrag memop_frag,
+                          RegisterClass RC,
                           X86MemOperand x86memop, SDNodeXForm commuteXForm> {
   let isCommutable = 1 in
   def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
@@ -7647,22 +7694,20 @@ multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
         !strconcat(OpcodeStr,
             "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
         [(set RC:$dst,
-          (OpVT (OpNode RC:$src1,
-           (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>,
+          (OpVT (OpNode RC:$src1, (load addr:$src2), imm:$src3)))]>,
         Sched<[sched.Folded, sched.ReadAfterFold]>, VEX_4V;
 
   // Pattern to commute if load is in first source.
-  def : Pat<(OpVT (OpNode (bitconvert (memop_frag addr:$src2)),
-                          RC:$src1, imm:$src3)),
+  def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, imm:$src3)),
             (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
                                             (commuteXForm imm:$src3))>;
 }
 
 defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32,
-                               SchedWriteBlend.XMM, VR128, loadv2i64, i128mem,
+                               SchedWriteBlend.XMM, VR128, i128mem,
                                BlendCommuteImm4>;
 defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32,
-                                SchedWriteBlend.YMM, VR256, loadv4i64, i256mem,
+                                SchedWriteBlend.YMM, VR256, i256mem,
                                 BlendCommuteImm8>, VEX_L;
 
 // For insertion into the zero index (low half) of a 256-bit vector, it is
@@ -7896,7 +7941,7 @@ let Predicates = [HasAVX1Only] in {
 // VPERM - Permute instructions
 //
 
-multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
+multiclass avx2_perm<bits<8> opc, string OpcodeStr,
                      ValueType OpVT, X86FoldableSchedWrite Sched,
                      X86MemOperand memOp> {
   let Predicates = [HasAVX2, NoVLX] in {
@@ -7913,16 +7958,14 @@ multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set VR256:$dst,
                        (OpVT (X86VPermv VR256:$src1,
-                              (bitconvert (mem_frag addr:$src2)))))]>,
+                              (load addr:$src2))))]>,
                      Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX_4V, VEX_L;
   }
 }
 
-defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32, WriteVarShuffle256,
-                        i256mem>;
+defm VPERMD : avx2_perm<0x36, "vpermd", v8i32, WriteVarShuffle256, i256mem>;
 let ExeDomain = SSEPackedSingle in
-defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFVarShuffle256,
-                        f256mem>;
+defm VPERMPS : avx2_perm<0x16, "vpermps", v8f32, WriteFVarShuffle256, f256mem>;
 
 multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                          ValueType OpVT, X86FoldableSchedWrite Sched,
@@ -7992,9 +8035,9 @@ def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
 
 let Predicates = [HasAVX2, NoVLX] in {
   defm : vinsert_lowering<"VINSERTI128", v2i64, v4i64,  loadv2i64>;
-  defm : vinsert_lowering<"VINSERTI128", v4i32, v8i32,  loadv2i64>;
-  defm : vinsert_lowering<"VINSERTI128", v8i16, v16i16, loadv2i64>;
-  defm : vinsert_lowering<"VINSERTI128", v16i8, v32i8,  loadv2i64>;
+  defm : vinsert_lowering<"VINSERTI128", v4i32, v8i32,  loadv4i32>;
+  defm : vinsert_lowering<"VINSERTI128", v8i16, v16i16, loadv8i16>;
+  defm : vinsert_lowering<"VINSERTI128", v16i8, v32i8,  loadv16i8>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -8153,7 +8196,7 @@ multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set VR128:$dst,
                (vt128 (OpNode VR128:$src1,
-                       (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>,
+                       (vt128 (load addr:$src2)))))]>,
              VEX_4V, Sched<[SchedWriteVarVecShift.XMM.Folded,
                             SchedWriteVarVecShift.XMM.ReadAfterFold]>;
   def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
@@ -8167,7 +8210,7 @@ multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set VR256:$dst,
                (vt256 (OpNode VR256:$src1,
-                       (vt256 (bitconvert (loadv4i64 addr:$src2))))))]>,
+                       (vt256 (load addr:$src2)))))]>,
              VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded,
                                    SchedWriteVarVecShift.YMM.ReadAfterFold]>;
 }
@@ -8181,13 +8224,11 @@ let Predicates = [HasAVX2, NoVLX] in {
 
   def : Pat<(v4i32 (X86vsrav VR128:$src1, VR128:$src2)),
             (VPSRAVDrr VR128:$src1, VR128:$src2)>;
-  def : Pat<(v4i32 (X86vsrav VR128:$src1,
-                    (bitconvert (loadv2i64 addr:$src2)))),
+  def : Pat<(v4i32 (X86vsrav VR128:$src1, (load addr:$src2))),
             (VPSRAVDrm VR128:$src1, addr:$src2)>;
   def : Pat<(v8i32 (X86vsrav VR256:$src1, VR256:$src2)),
             (VPSRAVDYrr VR256:$src1, VR256:$src2)>;
-  def : Pat<(v8i32 (X86vsrav VR256:$src1,
-                    (bitconvert (loadv4i64 addr:$src2)))),
+  def : Pat<(v8i32 (X86vsrav VR256:$src1, (load addr:$src2))),
             (VPSRAVDYrm VR256:$src1, addr:$src2)>;
 }
 
@@ -8269,7 +8310,7 @@ multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT,
 
     def rm : PDI<0xCF, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2), "",
                  [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1,
-                                 (bitconvert (MemOpFrag addr:$src2)))))]>,
+                                 (MemOpFrag addr:$src2))))]>,
              Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>, T8PD;
   }
 }
@@ -8287,7 +8328,7 @@ multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT,
   def rmi : Ii8<Op, MRMSrcMem, (outs RC:$dst),
               (ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "",
               [(set RC:$dst, (OpVT (OpNode RC:$src1,
-                                    (bitconvert (MemOpFrag addr:$src2)),
+                                    (MemOpFrag addr:$src2),
                               imm:$src3)))], SSEPackedInt>,
               Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>;
   }
@@ -8297,24 +8338,24 @@ multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> {
   let Constraints = "$src1 = $dst",
       Predicates  = [HasGFNI, UseSSE2] in
   defm NAME         : GF2P8AFFINE_rmi<Op, OpStr, v16i8, OpNode,
-                                      VR128, loadv2i64, i128mem, 1>;
+                                      VR128, load, i128mem, 1>;
   let Predicates  = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
     defm V##NAME    : GF2P8AFFINE_rmi<Op, "v"##OpStr, v16i8, OpNode, VR128,
-                                      loadv2i64, i128mem>, VEX_4V, VEX_W;
+                                      load, i128mem>, VEX_4V, VEX_W;
     defm V##NAME##Y : GF2P8AFFINE_rmi<Op, "v"##OpStr, v32i8, OpNode, VR256,
-                                      loadv4i64, i256mem>, VEX_4V, VEX_L, VEX_W;
+                                      load, i256mem>, VEX_4V, VEX_L, VEX_W;
   }
 }
 
 // GF2P8MULB
 let Constraints = "$src1 = $dst",
     Predicates  = [HasGFNI, UseSSE2] in
-defm GF2P8MULB      : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memopv2i64,
+defm GF2P8MULB      : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memop,
                                     i128mem, 1>;
 let Predicates  = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
-  defm VGF2P8MULB   : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, loadv2i64,
+  defm VGF2P8MULB   : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, load,
                                    i128mem>, VEX_4V;
-  defm VGF2P8MULBY  : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, loadv4i64,
+  defm VGF2P8MULBY  : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, load,
                                    i256mem>, VEX_4V, VEX_L;
 }
 // GF2P8AFFINEINVQB, GF2P8AFFINEQB
diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td
index a8013e38e63..39f50c10ae1 100644
--- a/lib/Target/X86/X86InstrXOP.td
+++ b/lib/Target/X86/X86InstrXOP.td
@@ -11,32 +11,32 @@
 //
 //===----------------------------------------------------------------------===//
 
-multiclass xop2op<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> {
+multiclass xop2op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
   def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
            !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
            [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[SchedWritePHAdd.XMM]>;
   def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
            !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-           [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP,
+           [(set VR128:$dst, (Int (load addr:$src)))]>, XOP,
            Sched<[SchedWritePHAdd.XMM.Folded, SchedWritePHAdd.XMM.ReadAfterFold]>;
 }
 
 let ExeDomain = SSEPackedInt in {
-  defm VPHSUBWD  : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd, loadv2i64>;
-  defm VPHSUBDQ  : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq, loadv2i64>;
-  defm VPHSUBBW  : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw, loadv2i64>;
-  defm VPHADDWQ  : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq, loadv2i64>;
-  defm VPHADDWD  : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd, loadv2i64>;
-  defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq, loadv2i64>;
-  defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd, loadv2i64>;
-  defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq, loadv2i64>;
-  defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw, loadv2i64>;
-  defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq, loadv2i64>;
-  defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd, loadv2i64>;
-  defm VPHADDDQ  : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq, loadv2i64>;
-  defm VPHADDBW  : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw, loadv2i64>;
-  defm VPHADDBQ  : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq, loadv2i64>;
-  defm VPHADDBD  : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd, loadv2i64>;
+  defm VPHSUBWD  : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd>;
+  defm VPHSUBDQ  : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq>;
+  defm VPHSUBBW  : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw>;
+  defm VPHADDWQ  : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq>;
+  defm VPHADDWD  : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd>;
+  defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq>;
+  defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd>;
+  defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq>;
+  defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw>;
+  defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq>;
+  defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd>;
+  defm VPHADDDQ  : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq>;
+  defm VPHADDBW  : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw>;
+  defm VPHADDBQ  : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq>;
+  defm VPHADDBD  : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd>;
 }
 
 // Scalar load 2 addr operand instructions
@@ -48,47 +48,47 @@ multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int,
            [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[sched]>;
   def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins memop:$src),
            !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-           [(set VR128:$dst, (Int (bitconvert mem_cpat:$src)))]>, XOP,
+           [(set VR128:$dst, (Int mem_cpat:$src))]>, XOP,
            Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
 multiclass xop2op128<bits<8> opc, string OpcodeStr, Intrinsic Int,
-                     PatFrag memop, X86FoldableSchedWrite sched> {
+                     X86FoldableSchedWrite sched> {
   def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
            !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
            [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[sched]>;
   def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
            !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-           [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP,
+           [(set VR128:$dst, (Int (load addr:$src)))]>, XOP,
            Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
 multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int,
-                     PatFrag memop, X86FoldableSchedWrite sched> {
+                     X86FoldableSchedWrite sched> {
   def Yrr : IXOP<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
            !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
            [(set VR256:$dst, (Int VR256:$src))]>, XOP, VEX_L, Sched<[sched]>;
   def Yrm : IXOP<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
            !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-           [(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP, VEX_L,
+           [(set VR256:$dst, (Int (load addr:$src)))]>, XOP, VEX_L,
            Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
 let ExeDomain = SSEPackedSingle in {
   defm VFRCZSS : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss,
                            ssmem, sse_load_f32, SchedWriteFRnd.Scl>;
-  defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, loadv4f32,
+  defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps,
                            SchedWriteFRnd.XMM>;
-  defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256, loadv8f32,
+  defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256,
                            SchedWriteFRnd.YMM>;
 }
 
 let ExeDomain = SSEPackedDouble in {
   defm VFRCZSD : xop2opsld<0x83, "vfrczsd", int_x86_xop_vfrcz_sd,
                            sdmem, sse_load_f64, SchedWriteFRnd.Scl>;
-  defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, loadv2f64,
+  defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd,
                            SchedWriteFRnd.XMM>;
-  defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, loadv4f64,
+  defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256,
                            SchedWriteFRnd.YMM>;
 }
 
@@ -105,13 +105,13 @@ multiclass xop3op<bits<8> opc, string OpcodeStr, SDNode OpNode,
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [(set VR128:$dst,
               (vt128 (OpNode (vt128 VR128:$src1),
-                             (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>,
+                             (vt128 (load addr:$src2)))))]>,
            XOP_4V, VEX_W, Sched<[sched.Folded, sched.ReadAfterFold]>;
   def mr : IXOP<opc, MRMSrcMem4VOp3, (outs VR128:$dst),
            (ins i128mem:$src1, VR128:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [(set VR128:$dst,
-              (vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))),
+              (vt128 (OpNode (vt128 (load addr:$src1)),
                              (vt128 VR128:$src2))))]>,
              XOP, Sched<[sched.Folded, sched.ReadAfterFold]>;
   // For disassembler
@@ -150,7 +150,7 @@ multiclass xop3opimm<bits<8> opc, string OpcodeStr, SDNode OpNode,
            (ins i128mem:$src1, u8imm:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [(set VR128:$dst,
-              (vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))), imm:$src2)))]>,
+              (vt128 (OpNode (vt128 (load addr:$src1)), imm:$src2)))]>,
            XOP, Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
@@ -181,7 +181,7 @@ multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int,
            !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
            [(set VR128:$dst,
-              (Int VR128:$src1, (bitconvert (loadv2i64 addr:$src2)),
+              (Int VR128:$src1, (load addr:$src2),
               VR128:$src3))]>, XOP_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
@@ -260,7 +260,7 @@ multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128,
              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set VR128:$dst,
                 (vt128 (OpNode (vt128 VR128:$src1),
-                               (vt128 (bitconvert (loadv2i64 addr:$src2))),
+                               (vt128 (load addr:$src2)),
                                 imm:$cc)))]>,
              XOP_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
     let isAsmParserOnly = 1, hasSideEffects = 0 in {
@@ -279,7 +279,7 @@ multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128,
     }
   }
 
-  def : Pat<(OpNode (bitconvert (loadv2i64 addr:$src2)),
+  def : Pat<(OpNode (load addr:$src2),
                     (vt128 VR128:$src1), imm:$cc),
             (!cast<Instruction>(NAME#"mi") VR128:$src1, addr:$src2,
                                            (CommuteVPCOMCC imm:$cc))>;
@@ -310,14 +310,14 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, SDNode OpNode,
             "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
             [(set VR128:$dst,
               (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
-                             (vt128 (bitconvert (loadv2i64 addr:$src3))))))]>,
+                             (vt128 (load addr:$src3)))))]>,
             XOP_4V, VEX_W, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
   def rmr : IXOPi8Reg<opc, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2, VR128:$src3),
             !strconcat(OpcodeStr,
             "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
             [(set VR128:$dst,
-              (v16i8 (OpNode (vt128 VR128:$src1), (vt128 (bitconvert (loadv2i64 addr:$src2))),
+              (v16i8 (OpNode (vt128 VR128:$src1), (vt128 (load addr:$src2)),
                              (vt128 VR128:$src3))))]>,
             XOP_4V, Sched<[sched.Folded, sched.ReadAfterFold,
                            // 128mem:$src2
@@ -401,8 +401,7 @@ multiclass xop_vpermil2<bits<8> Opc, string OpcodeStr, RegisterClass RC,
         !strconcat(OpcodeStr,
         "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
         [(set RC:$dst,
-          (VT (X86vpermil2 RC:$src1, RC:$src2,
-                           (bitconvert (IntLdFrag addr:$src3)),
+          (VT (X86vpermil2 RC:$src1, RC:$src2, (IntLdFrag addr:$src3),
                            (i8 imm:$src4))))]>, VEX_W,
         Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
   def mr : IXOP5<Opc, MRMSrcMem, (outs RC:$dst),
@@ -437,10 +436,10 @@ let ExeDomain = SSEPackedDouble in {
 
 let ExeDomain = SSEPackedSingle in {
   defm VPERMIL2PS : xop_vpermil2<0x48, "vpermil2ps", VR128, i128mem, f128mem,
-                                 v4f32, loadv4f32, loadv2i64,
+                                 v4f32, loadv4f32, loadv4i32,
                                  SchedWriteFVarShuffle.XMM>;
   defm VPERMIL2PSY : xop_vpermil2<0x48, "vpermil2ps", VR256, i256mem, f256mem,
-                                  v8f32, loadv8f32, loadv4i64,
+                                  v8f32, loadv8f32, loadv8i32,
                                   SchedWriteFVarShuffle.YMM>, VEX_L;
 }
 
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index 58b1c505944..b5fd9f4a785 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -1391,7 +1391,7 @@ static const Constant *getConstantFromPool(const MachineInstr &MI,
   if (ConstantEntry.isMachineConstantPoolEntry())
     return nullptr;
 
-  auto *C = dyn_cast<Constant>(ConstantEntry.Val.ConstVal);
+  const Constant *C = ConstantEntry.Val.ConstVal;
   assert((!C || ConstantEntry.getType() == C->getType()) &&
          "Expected a constant of the same type!");
   return C;
@@ -1594,6 +1594,18 @@ void X86AsmPrinter::EmitSEHInstruction(const MachineInstr *MI) {
   }
 }
 
+static unsigned getRegisterWidth(const MCOperandInfo &Info) {
+  if (Info.RegClass == X86::VR128RegClassID ||
+      Info.RegClass == X86::VR128XRegClassID)
+    return 128;
+  if (Info.RegClass == X86::VR256RegClassID ||
+      Info.RegClass == X86::VR256XRegClassID)
+    return 256;
+  if (Info.RegClass == X86::VR512RegClassID)
+    return 512;
+  llvm_unreachable("Unknown register class!");
+}
+
 void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
   X86MCInstLower MCInstLowering(*MF, *this);
   const X86RegisterInfo *RI =
@@ -1879,8 +1891,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
 
     const MachineOperand &MaskOp = MI->getOperand(MaskIdx);
     if (auto *C = getConstantFromPool(*MI, MaskOp)) {
+      unsigned Width = getRegisterWidth(MI->getDesc().OpInfo[0]);
       SmallVector<int, 64> Mask;
-      DecodePSHUFBMask(C, Mask);
+      DecodePSHUFBMask(C, Width, Mask);
       if (!Mask.empty())
         OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask),
                                 !EnablePrintSchedInfo);
@@ -1951,8 +1964,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
 
     const MachineOperand &MaskOp = MI->getOperand(MaskIdx);
     if (auto *C = getConstantFromPool(*MI, MaskOp)) {
+      unsigned Width = getRegisterWidth(MI->getDesc().OpInfo[0]);
       SmallVector<int, 16> Mask;
-      DecodeVPERMILPMask(C, ElSize, Mask);
+      DecodeVPERMILPMask(C, ElSize, Width, Mask);
       if (!Mask.empty())
         OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask),
                                 !EnablePrintSchedInfo);
@@ -1982,8 +1996,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
 
     const MachineOperand &MaskOp = MI->getOperand(6);
     if (auto *C = getConstantFromPool(*MI, MaskOp)) {
+      unsigned Width = getRegisterWidth(MI->getDesc().OpInfo[0]);
       SmallVector<int, 16> Mask;
-      DecodeVPERMIL2PMask(C, (unsigned)CtrlOp.getImm(), ElSize, Mask);
+      DecodeVPERMIL2PMask(C, (unsigned)CtrlOp.getImm(), ElSize, Width, Mask);
       if (!Mask.empty())
         OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask),
                                 !EnablePrintSchedInfo);
@@ -1999,8 +2014,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
 
     const MachineOperand &MaskOp = MI->getOperand(6);
     if (auto *C = getConstantFromPool(*MI, MaskOp)) {
+      unsigned Width = getRegisterWidth(MI->getDesc().OpInfo[0]);
       SmallVector<int, 16> Mask;
-      DecodeVPPERMMask(C, Mask);
+      DecodeVPPERMMask(C, Width, Mask);
       if (!Mask.empty())
         OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask),
                                 !EnablePrintSchedInfo);
diff --git a/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp b/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
index c7ddf93f8e8..720be8afa62 100644
--- a/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
+++ b/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
@@ -112,11 +112,10 @@ static bool extractConstantMask(const Constant *C, unsigned MaskEltSizeInBits,
   return true;
 }
 
-void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
-  Type *MaskTy = C->getType();
-  unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
-  (void)MaskTySize;
-  assert((MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512) &&
+void DecodePSHUFBMask(const Constant *C, unsigned Width,
+                      SmallVectorImpl<int> &ShuffleMask) {
+  assert((Width == 128 || Width == 256 || Width == 512) &&
+         C->getType()->getPrimitiveSizeInBits() >= Width &&
          "Unexpected vector size.");
 
   // The shuffle mask requires a byte vector.
@@ -125,7 +124,7 @@ void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
   if (!extractConstantMask(C, 8, UndefElts, RawMask))
     return;
 
-  unsigned NumElts = RawMask.size();
+  unsigned NumElts = Width / 8;
   assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
          "Unexpected number of vector elements.");
 
@@ -151,12 +150,10 @@ void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
   }
 }
 
-void DecodeVPERMILPMask(const Constant *C, unsigned ElSize,
+void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, unsigned Width,
                         SmallVectorImpl<int> &ShuffleMask) {
-  Type *MaskTy = C->getType();
-  unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
-  (void)MaskTySize;
-  assert((MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512) &&
+  assert((Width == 128 || Width == 256 || Width == 512) &&
+         C->getType()->getPrimitiveSizeInBits() >= Width &&
          "Unexpected vector size.");
   assert((ElSize == 32 || ElSize == 64) && "Unexpected vector element size.");
 
@@ -166,7 +163,7 @@ void DecodeVPERMILPMask(const Constant *C, unsigned ElSize,
   if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
     return;
 
-  unsigned NumElts = RawMask.size();
+  unsigned NumElts = Width / ElSize;
   unsigned NumEltsPerLane = 128 / ElSize;
   assert((NumElts == 2 || NumElts == 4 || NumElts == 8 || NumElts == 16) &&
          "Unexpected number of vector elements.");
@@ -189,11 +186,13 @@ void DecodeVPERMILPMask(const Constant *C, unsigned ElSize,
 }
 
 void DecodeVPERMIL2PMask(const Constant *C, unsigned M2Z, unsigned ElSize,
+                         unsigned Width,
                          SmallVectorImpl<int> &ShuffleMask) {
   Type *MaskTy = C->getType();
   unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
   (void)MaskTySize;
-  assert((MaskTySize == 128 || MaskTySize == 256) && "Unexpected vector size.");
+  assert((MaskTySize == 128 || MaskTySize == 256) &&
+         Width >= MaskTySize && "Unexpected vector size.");
 
   // The shuffle mask requires elements the same size as the target.
   APInt UndefElts;
@@ -201,7 +200,7 @@ void DecodeVPERMIL2PMask(const Constant *C, unsigned M2Z, unsigned ElSize,
   if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
     return;
 
-  unsigned NumElts = RawMask.size();
+  unsigned NumElts = Width / ElSize;
   unsigned NumEltsPerLane = 128 / ElSize;
   assert((NumElts == 2 || NumElts == 4 || NumElts == 8) &&
          "Unexpected number of vector elements.");
@@ -242,9 +241,12 @@ void DecodeVPERMIL2PMask(const Constant *C, unsigned M2Z, unsigned ElSize,
   }
 }
 
-void DecodeVPPERMMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
-  assert(C->getType()->getPrimitiveSizeInBits() == 128 &&
-         "Unexpected vector size.");
+void DecodeVPPERMMask(const Constant *C, unsigned Width,
+                      SmallVectorImpl<int> &ShuffleMask) {
+  Type *MaskTy = C->getType();
+  unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
+  (void)MaskTySize;
+  assert(Width == 128 && Width >= MaskTySize && "Unexpected vector size.");
 
   // The shuffle mask requires a byte vector.
   APInt UndefElts;
@@ -252,7 +254,7 @@ void DecodeVPPERMMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
   if (!extractConstantMask(C, 8, UndefElts, RawMask))
     return;
 
-  unsigned NumElts = RawMask.size();
+  unsigned NumElts = Width / 8;
   assert(NumElts == 16 && "Unexpected number of vector elements.");
 
   for (unsigned i = 0; i != NumElts; ++i) {
@@ -291,12 +293,10 @@ void DecodeVPPERMMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
   }
 }
 
-void DecodeVPERMVMask(const Constant *C, unsigned ElSize,
+void DecodeVPERMVMask(const Constant *C, unsigned ElSize, unsigned Width,
                       SmallVectorImpl<int> &ShuffleMask) {
-  Type *MaskTy = C->getType();
-  unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
-  (void)MaskTySize;
-  assert((MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512) &&
+  assert((Width == 128 || Width == 256 || Width == 512) &&
+         C->getType()->getPrimitiveSizeInBits() >= Width &&
          "Unexpected vector size.");
   assert((ElSize == 8 || ElSize == 16 || ElSize == 32 || ElSize == 64) &&
          "Unexpected vector element size.");
@@ -307,7 +307,7 @@ void DecodeVPERMVMask(const Constant *C, unsigned ElSize,
   if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
     return;
 
-  unsigned NumElts = RawMask.size();
+  unsigned NumElts = Width / ElSize;
 
   for (unsigned i = 0; i != NumElts; ++i) {
     if (UndefElts[i]) {
@@ -319,12 +319,10 @@ void DecodeVPERMVMask(const Constant *C, unsigned ElSize,
   }
 }
 
-void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize,
+void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize, unsigned Width,
                        SmallVectorImpl<int> &ShuffleMask) {
-  Type *MaskTy = C->getType();
-  unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
-  (void)MaskTySize;
-  assert((MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512) &&
+  assert((Width == 128 || Width == 256 || Width == 512) &&
+         C->getType()->getPrimitiveSizeInBits() >= Width &&
          "Unexpected vector size.");
   assert((ElSize == 8 || ElSize == 16 || ElSize == 32 || ElSize == 64) &&
          "Unexpected vector element size.");
@@ -335,7 +333,7 @@ void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize,
   if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
     return;
 
-  unsigned NumElts = RawMask.size();
+  unsigned NumElts = Width / ElSize;
 
   for (unsigned i = 0; i != NumElts; ++i) {
     if (UndefElts[i]) {
diff --git a/lib/Target/X86/X86ShuffleDecodeConstantPool.h b/lib/Target/X86/X86ShuffleDecodeConstantPool.h
index b703cbbd2b2..b08c31935d2 100644
--- a/lib/Target/X86/X86ShuffleDecodeConstantPool.h
+++ b/lib/Target/X86/X86ShuffleDecodeConstantPool.h
@@ -26,25 +26,28 @@ class Constant;
 class MVT;
 
 /// Decode a PSHUFB mask from an IR-level vector constant.
-void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask);
+void DecodePSHUFBMask(const Constant *C, unsigned Width,
+                      SmallVectorImpl<int> &ShuffleMask);
 
 /// Decode a VPERMILP variable mask from an IR-level vector constant.
-void DecodeVPERMILPMask(const Constant *C, unsigned ElSize,
+void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, unsigned Width,
                         SmallVectorImpl<int> &ShuffleMask);
 
 /// Decode a VPERMILP2 variable mask from an IR-level vector constant.
 void DecodeVPERMIL2PMask(const Constant *C, unsigned MatchImm, unsigned ElSize,
+                         unsigned Width,
                          SmallVectorImpl<int> &ShuffleMask);
 
 /// Decode a VPPERM variable mask from an IR-level vector constant.
-void DecodeVPPERMMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask);
+void DecodeVPPERMMask(const Constant *C, unsigned Width,
+                      SmallVectorImpl<int> &ShuffleMask);
 
 /// Decode a VPERM W/D/Q/PS/PD mask from an IR-level vector constant.
-void DecodeVPERMVMask(const Constant *C, unsigned ElSize,
+void DecodeVPERMVMask(const Constant *C, unsigned ElSize, unsigned Width,
                       SmallVectorImpl<int> &ShuffleMask);
 
 /// Decode a VPERMT2 W/D/Q/PS/PD mask from an IR-level vector constant.
-void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize,
+void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize, unsigned Width,
                        SmallVectorImpl<int> &ShuffleMask);
 
 } // llvm namespace
diff --git a/test/CodeGen/X86/avx-vperm2x128.ll b/test/CodeGen/X86/avx-vperm2x128.ll
index 75a11845b1e..0c501ea6895 100644
--- a/test/CodeGen/X86/avx-vperm2x128.ll
+++ b/test/CodeGen/X86/avx-vperm2x128.ll
@@ -224,7 +224,7 @@ entry:
 define <16 x i16> @shuffle_v16i16_4501_mem(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readnone ssp {
 ; AVX1-LABEL: shuffle_v16i16_4501_mem:
 ; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[0,1],ymm0[0,1]
diff --git a/test/CodeGen/X86/oddshuffles.ll b/test/CodeGen/X86/oddshuffles.ll
index 6affef33932..9216cad5882 100644
--- a/test/CodeGen/X86/oddshuffles.ll
+++ b/test/CodeGen/X86/oddshuffles.ll
@@ -1630,7 +1630,7 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2,
 ; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7]
 ; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3]
 ; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7]
-; AVX2-SLOW-NEXT:    vbroadcastsd 24(%rsi), %ymm5
+; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm5 = ymm0[0,3,3,3]
 ; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
 ; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
 ; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2]
@@ -1654,19 +1654,19 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2,
 ; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
 ; AVX2-FAST-NEXT:    vbroadcastsd %xmm2, %ymm4
 ; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
-; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
-; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm4 = ymm2[1,1,2,2]
-; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3],ymm0[4],ymm4[5,6],ymm0[7]
-; AVX2-FAST-NEXT:    vpermilps {{.*#+}} ymm4 = ymm1[0,0,3,3,4,4,7,7]
-; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7]
-; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm4 = [5,6,5,6,5,6,7,7]
-; AVX2-FAST-NEXT:    vpermps %ymm1, %ymm4, %ymm1
+; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm4 = ymm0[1,1,2,2]
+; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm5 = ymm2[1,1,2,2]
+; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7]
+; AVX2-FAST-NEXT:    vpermilps {{.*#+}} ymm5 = ymm1[0,0,3,3,4,4,7,7]
+; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
+; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm5 = [5,6,5,6,5,6,7,7]
+; AVX2-FAST-NEXT:    vpermps %ymm1, %ymm5, %ymm1
 ; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3]
 ; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
-; AVX2-FAST-NEXT:    vbroadcastsd 24(%rsi), %ymm2
-; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
-; AVX2-FAST-NEXT:    vmovups %ymm1, 64(%rdi)
-; AVX2-FAST-NEXT:    vmovups %ymm0, 32(%rdi)
+; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,3,3]
+; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX2-FAST-NEXT:    vmovups %ymm0, 64(%rdi)
+; AVX2-FAST-NEXT:    vmovups %ymm4, 32(%rdi)
 ; AVX2-FAST-NEXT:    vmovups %ymm3, (%rdi)
 ; AVX2-FAST-NEXT:    vzeroupper
 ; AVX2-FAST-NEXT:    retq
diff --git a/test/CodeGen/X86/pshufb-mask-comments.ll b/test/CodeGen/X86/pshufb-mask-comments.ll
index 0900fdccb49..d0ed99f92f3 100644
--- a/test/CodeGen/X86/pshufb-mask-comments.ll
+++ b/test/CodeGen/X86/pshufb-mask-comments.ll
@@ -57,9 +57,9 @@ define <16 x i8> @test5(<16 x i8> %V) {
 ; CHECK-NEXT:    movl $1, %eax
 ; CHECK-NEXT:    movq %rax, %xmm1
 ; CHECK-NEXT:    movdqa %xmm1, (%rax)
-; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [1,1]
-; CHECK-NEXT:    movdqa %xmm1, (%rax)
-; CHECK-NEXT:    pshufb %xmm1, %xmm0
+; CHECK-NEXT:    movaps {{.*#+}} xmm1 = [1,1]
+; CHECK-NEXT:    movaps %xmm1, (%rax)
+; CHECK-NEXT:    pshufb (%rax), %xmm0
 ; CHECK-NEXT:    retq
   store <2 x i64> <i64 1, i64 0>, <2 x i64>* undef, align 16
   %l = load <2 x i64>, <2 x i64>* undef, align 16
diff --git a/test/CodeGen/X86/vector-extend-inreg.ll b/test/CodeGen/X86/vector-extend-inreg.ll
index 86bb13f57eb..d790cb54b61 100644
--- a/test/CodeGen/X86/vector-extend-inreg.ll
+++ b/test/CodeGen/X86/vector-extend-inreg.ll
@@ -13,6 +13,7 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) noun
 ; X32-SSE-NEXT:    subl $384, %esp # imm = 0x180
 ; X32-SSE-NEXT:    movl 88(%ebp), %ecx
 ; X32-SSE-NEXT:    movdqa 72(%ebp), %xmm0
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
 ; X32-SSE-NEXT:    xorps %xmm1, %xmm1
 ; X32-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X32-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
@@ -21,7 +22,6 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) noun
 ; X32-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X32-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X32-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
 ; X32-SSE-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp)
 ; X32-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X32-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
diff --git a/test/CodeGen/X86/vector-idiv-v2i32.ll b/test/CodeGen/X86/vector-idiv-v2i32.ll
index 49e29ac17a5..00126d67532 100644
--- a/test/CodeGen/X86/vector-idiv-v2i32.ll
+++ b/test/CodeGen/X86/vector-idiv-v2i32.ll
@@ -693,20 +693,20 @@ define void @test_sdiv_pow2_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind {
 ; X86-NEXT:    movdqa %xmm0, %xmm1
 ; X86-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X86-NEXT:    movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
-; X86-NEXT:    movdqa {{.*#+}} xmm3 = [31,0,31,0]
-; X86-NEXT:    movdqa %xmm2, %xmm4
-; X86-NEXT:    psrlq %xmm3, %xmm4
+; X86-NEXT:    movdqa {{.*#+}} xmm2 = [31,0,31,0]
+; X86-NEXT:    movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
+; X86-NEXT:    movdqa %xmm3, %xmm4
+; X86-NEXT:    psrlq %xmm2, %xmm4
 ; X86-NEXT:    movl $31, %ecx
 ; X86-NEXT:    movd %ecx, %xmm5
-; X86-NEXT:    psrlq %xmm5, %xmm2
-; X86-NEXT:    movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1]
+; X86-NEXT:    psrlq %xmm5, %xmm3
+; X86-NEXT:    movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
 ; X86-NEXT:    movdqa %xmm1, %xmm4
-; X86-NEXT:    psrlq %xmm3, %xmm4
+; X86-NEXT:    psrlq %xmm2, %xmm4
 ; X86-NEXT:    psrlq %xmm5, %xmm1
 ; X86-NEXT:    movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1]
-; X86-NEXT:    xorpd %xmm2, %xmm1
-; X86-NEXT:    psubq %xmm2, %xmm1
+; X86-NEXT:    xorpd %xmm3, %xmm1
+; X86-NEXT:    psubq %xmm3, %xmm1
 ; X86-NEXT:    pand {{\.LCPI.*}}, %xmm1
 ; X86-NEXT:    psrlq $29, %xmm1
 ; X86-NEXT:    paddq %xmm0, %xmm1
diff --git a/test/CodeGen/X86/widened-broadcast.ll b/test/CodeGen/X86/widened-broadcast.ll
index ce99d22dbbd..167128ae002 100644
--- a/test/CodeGen/X86/widened-broadcast.ll
+++ b/test/CodeGen/X86/widened-broadcast.ll
@@ -121,10 +121,21 @@ define <8 x i32> @load_splat_8i32_4i32_01010101(<4 x i32>* %ptr) nounwind uwtabl
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: load_splat_8i32_4i32_01010101:
-; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vbroadcastsd (%rdi), %ymm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: load_splat_8i32_4i32_01010101:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_splat_8i32_4i32_01010101:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: load_splat_8i32_4i32_01010101:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vbroadcastsd (%rdi), %ymm0
+; AVX512-NEXT:    retq
 entry:
   %ld = load <4 x i32>, <4 x i32>* %ptr
   %ret = shufflevector <4 x i32> %ld, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -138,21 +149,10 @@ define <8 x i32> @load_splat_8i32_8i32_01010101(<8 x i32>* %ptr) nounwind uwtabl
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: load_splat_8i32_8i32_01010101:
-; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: load_splat_8i32_8i32_01010101:
-; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: load_splat_8i32_8i32_01010101:
-; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vbroadcastsd (%rdi), %ymm0
-; AVX512-NEXT:    retq
+; AVX-LABEL: load_splat_8i32_8i32_01010101:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vbroadcastsd (%rdi), %ymm0
+; AVX-NEXT:    retq
 entry:
   %ld = load <8 x i32>, <8 x i32>* %ptr
   %ret = shufflevector <8 x i32> %ld, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -246,10 +246,21 @@ define <16 x i16> @load_splat_16i16_8i16_0123012301230123(<8 x i16>* %ptr) nounw
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: load_splat_16i16_8i16_0123012301230123:
-; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vbroadcastsd (%rdi), %ymm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: load_splat_16i16_8i16_0123012301230123:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_splat_16i16_8i16_0123012301230123:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: load_splat_16i16_8i16_0123012301230123:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vbroadcastsd (%rdi), %ymm0
+; AVX512-NEXT:    retq
 entry:
   %ld = load <8 x i16>, <8 x i16>* %ptr
   %ret = shufflevector <8 x i16> %ld, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3,i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
@@ -263,21 +274,10 @@ define <16 x i16> @load_splat_16i16_16i16_0101010101010101(<16 x i16>* %ptr) nou
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: load_splat_16i16_16i16_0101010101010101:
-; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,0,0,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: load_splat_16i16_16i16_0101010101010101:
-; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vbroadcastss (%rdi), %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: load_splat_16i16_16i16_0101010101010101:
-; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vbroadcastss (%rdi), %ymm0
-; AVX512-NEXT:    retq
+; AVX-LABEL: load_splat_16i16_16i16_0101010101010101:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vbroadcastss (%rdi), %ymm0
+; AVX-NEXT:    retq
 entry:
   %ld = load <16 x i16>, <16 x i16>* %ptr
   %ret = shufflevector <16 x i16> %ld, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -446,10 +446,21 @@ define <32 x i8> @load_splat_32i8_16i8_01234567012345670123456701234567(<16 x i8
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: load_splat_32i8_16i8_01234567012345670123456701234567:
-; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vbroadcastsd (%rdi), %ymm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: load_splat_32i8_16i8_01234567012345670123456701234567:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_splat_32i8_16i8_01234567012345670123456701234567:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: load_splat_32i8_16i8_01234567012345670123456701234567:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vbroadcastsd (%rdi), %ymm0
+; AVX512-NEXT:    retq
 entry:
   %ld = load <16 x i8>, <16 x i8>* %ptr
   %ret = shufflevector <16 x i8> %ld, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-- 
GitLab


From 4235b9cb6e5f1053c459492af7cd004fdcdb3864 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 22 Oct 2018 22:26:00 +0000
Subject: [PATCH 0411/1116] [ARM] Regenerate reverse shuffle costs

Came about while cleaning up general shuffle costs for PR39368

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344966 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Analysis/CostModel/ARM/shuffle.ll | 36 ++++++++++++--------------
 1 file changed, 17 insertions(+), 19 deletions(-)

diff --git a/test/Analysis/CostModel/ARM/shuffle.ll b/test/Analysis/CostModel/ARM/shuffle.ll
index c92d6688046..7ad9b0286ec 100644
--- a/test/Analysis/CostModel/ARM/shuffle.ll
+++ b/test/Analysis/CostModel/ARM/shuffle.ll
@@ -1,39 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
 ; RUN: opt < %s  -cost-model -analyze -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift | FileCheck %s
 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
 target triple = "thumbv7-apple-ios6.0.0"
 
-; CHECK: shuffle
-define void @shuffle() {
-
-
-  ;; Reverse shuffles should be lowered to vrev and possibly a vext (for
-  ;; quadwords)
-
-    ; Vector values
-  ; CHECK: cost of 1 {{.*}} shuffle
+;; Reverse shuffles should be lowered to vrev and possibly a vext (for quadwords)
+define void @reverse() {
+; CHECK-LABEL: 'reverse'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v7 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v9 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v10 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v11 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v12 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v13 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v14 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v15 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v17 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
   %v7 = shufflevector <2 x i8> undef, <2 x i8>undef, <2 x i32> <i32 1, i32 0>
-  ; CHECK: cost of 1 {{.*}} shuffle
   %v8 = shufflevector <4 x i8> undef, <4 x i8>undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-  ; CHECK: cost of 1 {{.*}} shuffle
   %v9 = shufflevector <8 x i8> undef, <8 x i8>undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-  ; CHECK: cost of 2 {{.*}} shuffle
   %v10 = shufflevector <16 x i8> undef, <16 x i8>undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 
-  ; CHECK: cost of 1 {{.*}} shuffle
   %v11 = shufflevector <2 x i16> undef, <2 x i16>undef, <2 x i32> <i32 1, i32 0>
-  ; CHECK: cost of 1 {{.*}} shuffle
   %v12 = shufflevector <4 x i16> undef, <4 x i16>undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-  ; CHECK: cost of 2 {{.*}} shuffle
   %v13 = shufflevector <8 x i16> undef, <8 x i16>undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 
-  ; CHECK: cost of 1 {{.*}} shuffle
   %v14 = shufflevector <2 x i32> undef, <2 x i32>undef, <2 x i32> <i32 1, i32 0>
-  ; CHECK: cost of 2 {{.*}} shuffle
   %v15 = shufflevector <4 x i32> undef, <4 x i32>undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 
-  ; CHECK: cost of 1 {{.*}} shuffle
   %v16 = shufflevector <2 x float> undef, <2 x float>undef, <2 x i32> <i32 1, i32 0>
-  ; CHECK: cost of 2 {{.*}} shuffle
   %v17 = shufflevector <4 x float> undef, <4 x float>undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 
   ret void
-- 
GitLab


From e2097155c1ca8684e26fecffaa7e10f1ba01376e Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Mon, 22 Oct 2018 22:29:09 +0000
Subject: [PATCH 0412/1116] [MC] Shrink MCAsmParser by grouping bools, add
 const, NFC

I was considering adding another boolean here. I standardized on bools
since they allow default member initializers in the class definition.
This makes ShowParsedOperands protected instead of private, but that's
probably fine.

Reduce the SmallVector size while we're at it, since the common case is
that there is never a pending error.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344967 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/MC/MCParser/MCAsmParser.h | 9 +++++----
 lib/MC/MCParser/MCAsmParser.cpp        | 2 +-
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/include/llvm/MC/MCParser/MCAsmParser.h b/include/llvm/MC/MCParser/MCAsmParser.h
index 0d56f36fbae..b80289878e6 100644
--- a/include/llvm/MC/MCParser/MCAsmParser.h
+++ b/include/llvm/MC/MCParser/MCAsmParser.h
@@ -122,17 +122,18 @@ public:
 private:
   MCTargetAsmParser *TargetParser = nullptr;
 
-  unsigned ShowParsedOperands : 1;
-
 protected: // Can only create subclasses.
   MCAsmParser();
 
+  SmallVector<MCPendingError, 0> PendingErrors;
+
   /// Flag tracking whether any errors have been encountered.
   bool HadError = false;
+
   /// Enable print [latency:throughput] in output file.
   bool EnablePrintSchedInfo = false;
 
-  SmallVector<MCPendingError, 1> PendingErrors;
+  bool ShowParsedOperands = false;
 
 public:
   MCAsmParser(const MCAsmParser &) = delete;
@@ -166,7 +167,7 @@ public:
   void setShowParsedOperands(bool Value) { ShowParsedOperands = Value; }
 
   void setEnablePrintSchedInfo(bool Value) { EnablePrintSchedInfo = Value; }
-  bool shouldPrintSchedInfo() { return EnablePrintSchedInfo; }
+  bool shouldPrintSchedInfo() const { return EnablePrintSchedInfo; }
 
   /// Run the parser on the input source buffer.
   virtual bool Run(bool NoInitialTextSection, bool NoFinalize = false) = 0;
diff --git a/lib/MC/MCParser/MCAsmParser.cpp b/lib/MC/MCParser/MCAsmParser.cpp
index d439734e76f..efedcdc5a31 100644
--- a/lib/MC/MCParser/MCAsmParser.cpp
+++ b/lib/MC/MCParser/MCAsmParser.cpp
@@ -21,7 +21,7 @@
 
 using namespace llvm;
 
-MCAsmParser::MCAsmParser() : ShowParsedOperands(0) {}
+MCAsmParser::MCAsmParser() {}
 
 MCAsmParser::~MCAsmParser() = default;
 
-- 
GitLab


From 8f75264f16f6e02aa2d3462744d96199a8c581de Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Mon, 22 Oct 2018 22:50:27 +0000
Subject: [PATCH 0413/1116] [Reassociate] remove bogus tests; NFC

I was trying to provide test coverage for D53533
with rL344964, but these don't do it...and I don't
think they add any value, so deleting.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344969 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Transforms/Reassociate/negation.ll | 28 -------------------------
 1 file changed, 28 deletions(-)

diff --git a/test/Transforms/Reassociate/negation.ll b/test/Transforms/Reassociate/negation.ll
index e1f9a421a9c..59b7b5dca1d 100644
--- a/test/Transforms/Reassociate/negation.ll
+++ b/test/Transforms/Reassociate/negation.ll
@@ -45,31 +45,3 @@ define <2 x i32> @negate_vec_undefs(<2 x i32> %a, <2 x i32> %b, <2 x i32> %z) {
   ret <2 x i32> %f
 }
 
-define i32 @not_not(i32 %a, i32 %b, i32 %z) {
-; CHECK-LABEL: @not_not(
-; CHECK-NEXT:    [[D:%.*]] = and i32 [[Z:%.*]], 40
-; CHECK-NEXT:    [[A_NOT:%.*]] = xor i32 [[A:%.*]], -1
-; CHECK-NEXT:    [[F:%.*]] = and i32 [[D]], [[A_NOT]]
-; CHECK-NEXT:    ret i32 [[F]]
-;
-  %d = and i32 %z, 40
-  %c = xor i32 -1, %d
-  %e = or i32 %a, %c
-  %f = xor i32 -1, %e
-  ret i32 %f
-}
-
-define <2 x i32> @not_vec_undefs(<2 x i32> %a, <2 x i32> %b, <2 x i32> %z) {
-; CHECK-LABEL: @not_vec_undefs(
-; CHECK-NEXT:    [[D:%.*]] = or <2 x i32> [[Z:%.*]], <i32 40, i32 40>
-; CHECK-NEXT:    [[A_NOT:%.*]] = xor <2 x i32> [[A:%.*]], <i32 -1, i32 -1>
-; CHECK-NEXT:    [[F:%.*]] = or <2 x i32> [[D]], [[A_NOT]]
-; CHECK-NEXT:    ret <2 x i32> [[F]]
-;
-  %d = or <2 x i32> %z, <i32 40, i32 40>
-  %c = xor <2 x i32> <i32 undef, i32 -1>, %d
-  %e = and <2 x i32> %a, %c
-  %f = xor <2 x i32> <i32 undef, i32 -1>, %e
-  ret <2 x i32> %f
-}
-
-- 
GitLab


From d8c89437902108b83006d92d5985fcfd2eb0deae Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze@braunis.de>
Date: Mon, 22 Oct 2018 22:52:23 +0000
Subject: [PATCH 0414/1116] X86: Do not optimize branches with undef eflags
 inputs

analyzeBranch()/insertBranch() etc. do not properly deal with an undef
flag on the eflags input and used to produce invalid MIR.  I don't see
this ever affecting real world inputs (I don't think it is possible to
produce undef flags with llvm IR), so I simply changed the code to bail
out in this case.

rdar://42122367

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344970 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86InstrInfo.cpp   |  5 +++++
 test/CodeGen/X86/undef-eflags.mir | 18 ++++++++++++++++++
 2 files changed, 23 insertions(+)
 create mode 100644 test/CodeGen/X86/undef-eflags.mir

diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 36ef7dca1f3..1eddb27847d 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -2640,6 +2640,11 @@ bool X86InstrInfo::AnalyzeBranchImpl(
     if (BranchCode == X86::COND_INVALID)
       return true;  // Can't handle indirect branch.
 
+    // In practice we should never have an undef eflags operand, if we do
+    // abort here as we are not prepared to preserve the flag.
+    if (I->getOperand(1).isUndef())
+      return true;
+
     // Working from the bottom, handle the first conditional branch.
     if (Cond.empty()) {
       MachineBasicBlock *TargetBB = I->getOperand(0).getMBB();
diff --git a/test/CodeGen/X86/undef-eflags.mir b/test/CodeGen/X86/undef-eflags.mir
new file mode 100644
index 00000000000..e5cf58bac68
--- /dev/null
+++ b/test/CodeGen/X86/undef-eflags.mir
@@ -0,0 +1,18 @@
+# RUN: llc -o - %s -mtriple=x86_64-- -verify-machineinstrs -run-pass branch-folder | FileCheck %s
+# Check that we do not generate invalid MIR when optimizing condjumps with undef
+# flags on the eflags input (currently we should just bail out).
+---
+# CHECK-LABEL: name: fallundef
+name: fallundef
+tracksRegLiveness: true
+body: |
+  bb.0:
+    JE_1 %bb.1, implicit undef $eflags
+    ; CHECK: JE_1 %bb.1, implicit undef $eflags
+    JMP_1 %bb.2
+  bb.1:
+    RET 2, undef $eax
+
+  bb.2:
+    RET 0, undef $eax
+...
-- 
GitLab


From 152aae5d0d5087bed92a2f3d5452095efbabae68 Mon Sep 17 00:00:00 2001
From: Leonard Chan <leonardchan@google.com>
Date: Mon, 22 Oct 2018 23:08:40 +0000
Subject: [PATCH 0415/1116] [Intrinsic] Unigned Saturation Addition Intrinsic

Add an intrinsic that takes 2 integers and perform unsigned saturation
addition on them.

This is a part of implementing fixed point arithmetic in clang where some of
the more complex operations will be implemented as intrinsics.

Differential Revision: https://reviews.llvm.org/D53340

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344971 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/ISDOpcodes.h             |   8 +-
 include/llvm/CodeGen/TargetLowering.h         |   7 +-
 include/llvm/IR/Intrinsics.td                 |   3 +
 include/llvm/Target/TargetSelectionDAG.td     |   1 +
 lib/CodeGen/SelectionDAG/LegalizeDAG.cpp      |   8 +-
 .../SelectionDAG/LegalizeIntegerTypes.cpp     |  27 +--
 lib/CodeGen/SelectionDAG/LegalizeTypes.h      |   4 +-
 .../SelectionDAG/LegalizeVectorOps.cpp        |   1 +
 .../SelectionDAG/LegalizeVectorTypes.cpp      |   2 +
 .../SelectionDAG/SelectionDAGBuilder.cpp      |   6 +
 .../SelectionDAG/SelectionDAGDumper.cpp       |   1 +
 lib/CodeGen/SelectionDAG/TargetLowering.cpp   |  49 +++---
 lib/CodeGen/TargetLoweringBase.cpp            |   1 +
 lib/IR/Verifier.cpp                           |  13 +-
 test/CodeGen/X86/uadd_sat.ll                  | 157 ++++++++++++++++++
 15 files changed, 238 insertions(+), 50 deletions(-)
 create mode 100644 test/CodeGen/X86/uadd_sat.ll

diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h
index 2424ae6e630..75ec0b99a12 100644
--- a/include/llvm/CodeGen/ISDOpcodes.h
+++ b/include/llvm/CodeGen/ISDOpcodes.h
@@ -256,13 +256,13 @@ namespace ISD {
     /// Same for multiplication.
     SMULO, UMULO,
 
-    /// RESULT = SADDSAT(LHS, RHS) - Perform signed saturation addition on 2
+    /// RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2
     /// integers with the same bit width (W). If the true value of LHS + RHS
-    /// exceeds the largest signed value that can be represented by W bits, the
+    /// exceeds the largest value that can be represented by W bits, the
     /// resulting value is this maximum value. Otherwise, if this value is less
-    /// than the smallest signed value that can be represented by W bits, the
+    /// than the smallest value that can be represented by W bits, the
     /// resulting value is this minimum value.
-    SADDSAT,
+    SADDSAT, UADDSAT,
 
     /// Simple binary floating point operators.
     FADD, FSUB, FMUL, FDIV, FREM,
diff --git a/include/llvm/CodeGen/TargetLowering.h b/include/llvm/CodeGen/TargetLowering.h
index d56684d8558..2a02ac1ecb0 100644
--- a/include/llvm/CodeGen/TargetLowering.h
+++ b/include/llvm/CodeGen/TargetLowering.h
@@ -3684,10 +3684,9 @@ public:
   SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT,
                                   SDValue Index) const;
 
-  /// Method for building the DAG expansion of ISD::SADDSAT. This method accepts
-  /// integers or vectors of integers as its arguments.
-  SDValue getExpandedSignedSaturationAddition(SDNode *Node,
-                                              SelectionDAG &DAG) const;
+  /// Method for building the DAG expansion of ISD::[US]ADDSAT. This method
+  /// accepts integers or vectors of integers as its arguments.
+  SDValue getExpandedSaturationAddition(SDNode *Node, SelectionDAG &DAG) const;
 
   //===--------------------------------------------------------------------===//
   // Instruction Emitting Hooks
diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td
index 978f471f7ea..e49fa147709 100644
--- a/include/llvm/IR/Intrinsics.td
+++ b/include/llvm/IR/Intrinsics.td
@@ -713,6 +713,9 @@ def int_umul_with_overflow : Intrinsic<[llvm_anyint_ty, llvm_i1_ty],
 def int_sadd_sat : Intrinsic<[llvm_anyint_ty],
                              [LLVMMatchType<0>, LLVMMatchType<0>],
                              [IntrNoMem, IntrSpeculatable, Commutative]>;
+def int_uadd_sat : Intrinsic<[llvm_anyint_ty],
+                             [LLVMMatchType<0>, LLVMMatchType<0>],
+                             [IntrNoMem, IntrSpeculatable, Commutative]>;
 
 //===------------------------- Memory Use Markers -------------------------===//
 //
diff --git a/include/llvm/Target/TargetSelectionDAG.td b/include/llvm/Target/TargetSelectionDAG.td
index 918d0b1d37d..c235c85e144 100644
--- a/include/llvm/Target/TargetSelectionDAG.td
+++ b/include/llvm/Target/TargetSelectionDAG.td
@@ -374,6 +374,7 @@ def umax       : SDNode<"ISD::UMAX"      , SDTIntBinOp,
                                   [SDNPCommutative, SDNPAssociative]>;
 
 def saddsat    : SDNode<"ISD::SADDSAT"   , SDTIntBinOp, [SDNPCommutative]>;
+def uaddsat    : SDNode<"ISD::UADDSAT"   , SDTIntBinOp, [SDNPCommutative]>;
 
 def sext_inreg : SDNode<"ISD::SIGN_EXTEND_INREG", SDTExtInreg>;
 def sext_invec : SDNode<"ISD::SIGN_EXTEND_VECTOR_INREG", SDTExtInvec>;
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index b73fc106a6b..7b25d9f98ff 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1115,7 +1115,8 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
     Action = TLI.getStrictFPOperationAction(Node->getOpcode(),
                                             Node->getValueType(0));
     break;
-  case ISD::SADDSAT: {
+  case ISD::SADDSAT:
+  case ISD::UADDSAT: {
     Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
     break;
   }
@@ -3460,8 +3461,9 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     }
     break;
   }
-  case ISD::SADDSAT: {
-    Results.push_back(TLI.getExpandedSignedSaturationAddition(Node, DAG));
+  case ISD::SADDSAT:
+  case ISD::UADDSAT: {
+    Results.push_back(TLI.getExpandedSaturationAddition(Node, DAG));
     break;
   }
   case ISD::SADDO:
diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index fffebaf194e..690a64e724b 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -141,7 +141,8 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
   case ISD::ADDCARRY:
   case ISD::SUBCARRY:    Res = PromoteIntRes_ADDSUBCARRY(N, ResNo); break;
 
-  case ISD::SADDSAT:     Res = PromoteIntRes_SADDSAT(N); break;
+  case ISD::SADDSAT:
+  case ISD::UADDSAT:     Res = PromoteIntRes_ADDSAT(N); break;
 
   case ISD::ATOMIC_LOAD:
     Res = PromoteIntRes_Atomic0(cast<AtomicSDNode>(N)); break;
@@ -548,17 +549,22 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Overflow(SDNode *N) {
   return SDValue(Res.getNode(), 1);
 }
 
-SDValue DAGTypeLegalizer::PromoteIntRes_SADDSAT(SDNode *N) {
+SDValue DAGTypeLegalizer::PromoteIntRes_ADDSAT(SDNode *N) {
   // For promoting iN -> iM, this can be expanded by
   // 1. ANY_EXTEND iN to iM
   // 2. SHL by M-N
-  // 3. SADDSAT
-  // 4. ASHR by M-N
+  // 3. U/SADDSAT
+  // 4. L/ASHR by M-N
   SDLoc dl(N);
   SDValue Op1 = N->getOperand(0);
   SDValue Op2 = N->getOperand(1);
   unsigned OldBits = Op1.getValueSizeInBits();
 
+  unsigned Opcode = N->getOpcode();
+  assert((Opcode == ISD::SADDSAT || Opcode == ISD::UADDSAT) &&
+         "Expected opcode to be SADDSAT or UADDSAT");
+  unsigned ShiftOp = Opcode == ISD::SADDSAT ? ISD::SRA : ISD::SRL;
+
   SDValue Op1Promoted = GetPromotedInteger(Op1);
   SDValue Op2Promoted = GetPromotedInteger(Op2);
 
@@ -573,8 +579,8 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SADDSAT(SDNode *N) {
       DAG.getNode(ISD::SHL, dl, PromotedType, Op2Promoted, ShiftAmount);
 
   SDValue Result =
-      DAG.getNode(ISD::SADDSAT, dl, PromotedType, Op1Promoted, Op2Promoted);
-  return DAG.getNode(ISD::SRA, dl, PromotedType, Result, ShiftAmount);
+      DAG.getNode(Opcode, dl, PromotedType, Op1Promoted, Op2Promoted);
+  return DAG.getNode(ShiftOp, dl, PromotedType, Result, ShiftAmount);
 }
 
 SDValue DAGTypeLegalizer::PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo) {
@@ -1498,7 +1504,8 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
   case ISD::UMULO:
   case ISD::SMULO: ExpandIntRes_XMULO(N, Lo, Hi); break;
 
-  case ISD::SADDSAT: ExpandIntRes_SADDSAT(N, Lo, Hi); break;
+  case ISD::SADDSAT:
+  case ISD::UADDSAT: ExpandIntRes_ADDSAT(N, Lo, Hi); break;
   }
 
   // If Lo/Hi is null, the sub-method took care of registering results etc.
@@ -2461,9 +2468,9 @@ void DAGTypeLegalizer::ExpandIntRes_READCYCLECOUNTER(SDNode *N, SDValue &Lo,
   ReplaceValueWith(SDValue(N, 1), R.getValue(2));
 }
 
-void DAGTypeLegalizer::ExpandIntRes_SADDSAT(SDNode *N, SDValue &Lo,
-                                            SDValue &Hi) {
-  SDValue Result = TLI.getExpandedSignedSaturationAddition(N, DAG);
+void DAGTypeLegalizer::ExpandIntRes_ADDSAT(SDNode *N, SDValue &Lo,
+                                           SDValue &Hi) {
+  SDValue Result = TLI.getExpandedSaturationAddition(N, DAG);
   SplitInteger(Result, Lo, Hi);
 }
 
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 83429ec6e98..f31b115bc2d 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -330,7 +330,7 @@ private:
   SDValue PromoteIntRes_UNDEF(SDNode *N);
   SDValue PromoteIntRes_VAARG(SDNode *N);
   SDValue PromoteIntRes_XMULO(SDNode *N, unsigned ResNo);
-  SDValue PromoteIntRes_SADDSAT(SDNode *N);
+  SDValue PromoteIntRes_ADDSAT(SDNode *N);
 
   // Integer Operand Promotion.
   bool PromoteIntegerOperand(SDNode *N, unsigned OpNo);
@@ -415,7 +415,7 @@ private:
   void ExpandIntRes_SADDSUBO          (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_UADDSUBO          (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_XMULO             (SDNode *N, SDValue &Lo, SDValue &Hi);
-  void ExpandIntRes_SADDSAT           (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_ADDSAT            (SDNode *N, SDValue &Lo, SDValue &Hi);
 
   void ExpandIntRes_ATOMIC_LOAD       (SDNode *N, SDValue &Lo, SDValue &Hi);
 
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index e7edc0ef860..787091a7f4c 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -390,6 +390,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
   case ISD::UMUL_LOHI:
   case ISD::FCANONICALIZE:
   case ISD::SADDSAT:
+  case ISD::UADDSAT:
     Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
     break;
   case ISD::FP_ROUND_INREG:
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 2b5fd8d75f4..43b4bf0c497 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -123,6 +123,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::UMAX:
 
   case ISD::SADDSAT:
+  case ISD::UADDSAT:
 
   case ISD::FPOW:
   case ISD::FREM:
@@ -805,6 +806,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::UMIN:
   case ISD::UMAX:
   case ISD::SADDSAT:
+  case ISD::UADDSAT:
     SplitVecRes_BinOp(N, Lo, Hi);
     break;
   case ISD::FMA:
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index be4a219efe5..71814d79098 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5777,6 +5777,12 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     setValue(&I, DAG.getNode(ISD::SADDSAT, sdl, Op1.getValueType(), Op1, Op2));
     return nullptr;
   }
+  case Intrinsic::uadd_sat: {
+    SDValue Op1 = getValue(I.getArgOperand(0));
+    SDValue Op2 = getValue(I.getArgOperand(1));
+    setValue(&I, DAG.getNode(ISD::UADDSAT, sdl, Op1.getValueType(), Op1, Op2));
+    return nullptr;
+  }
   case Intrinsic::stacksave: {
     SDValue Op = getRoot();
     Res = DAG.getNode(
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 64a9764cce2..1b032ce456a 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -286,6 +286,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   case ISD::SRL_PARTS:                  return "srl_parts";
 
   case ISD::SADDSAT:                    return "saddsat";
+  case ISD::UADDSAT:                    return "uaddsat";
 
   // Conversion operators.
   case ISD::SIGN_EXTEND:                return "sign_extend";
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index ceedd06da1d..d31d6344519 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -4681,13 +4681,12 @@ SDValue TargetLowering::lowerCmpEqZeroToCtlzSrl(SDValue Op,
   return SDValue();
 }
 
-SDValue
-TargetLowering::getExpandedSignedSaturationAddition(SDNode *Node,
-                                                    SelectionDAG &DAG) const {
-  assert(Node->getOpcode() == ISD::SADDSAT &&
-         "Expected method to receive SADDSAT node.");
-  assert(Node->getNumOperands() == 2 &&
-         "Expected SADDSAT node to have 2 operands.");
+SDValue TargetLowering::getExpandedSaturationAddition(SDNode *Node,
+                                                      SelectionDAG &DAG) const {
+  unsigned Opcode = Node->getOpcode();
+  assert((Opcode == ISD::SADDSAT || Opcode == ISD::UADDSAT) &&
+         "Expected method to receive SADDSAT or UADDSAT node.");
+  assert(Node->getNumOperands() == 2 && "Expected node to have 2 operands.");
 
   SDLoc dl(Node);
   SDValue LHS = Node->getOperand(0);
@@ -4699,27 +4698,33 @@ TargetLowering::getExpandedSignedSaturationAddition(SDNode *Node,
          "Expected operands to be integers. Vector of int arguments should "
          "already be unrolled.");
   assert(LHS.getValueType() == RHS.getValueType() &&
-         "Expected both operands of SADDSAT to be the same type");
+         "Expected both operands to be the same type");
 
+  unsigned OverflowOp = Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::UADDO;
   unsigned BitWidth = LHS.getValueSizeInBits();
   EVT ResultType = LHS.getValueType();
   EVT BoolVT =
       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), ResultType);
   SDValue Result =
-      DAG.getNode(ISD::SADDO, dl, DAG.getVTList(ResultType, BoolVT), LHS, RHS);
+      DAG.getNode(OverflowOp, dl, DAG.getVTList(ResultType, BoolVT), LHS, RHS);
   SDValue Sum = Result.getValue(0);
   SDValue Overflow = Result.getValue(1);
-
-  // SatMax -> Overflow && Sum < 0
-  // SatMin -> Overflow && Sum > 0
-  SDValue Zero = DAG.getConstant(0, dl, LHS.getValueType());
-
-  SDValue SumNeg = DAG.getSetCC(dl, BoolVT, Sum, Zero, ISD::SETLT);
-  APInt MinVal = APInt::getSignedMinValue(BitWidth);
-  APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
-  SDValue SatMin = DAG.getConstant(MinVal, dl, ResultType);
-  SDValue SatMax = DAG.getConstant(MaxVal, dl, ResultType);
-
-  Result = DAG.getSelect(dl, ResultType, SumNeg, SatMax, SatMin);
-  return DAG.getSelect(dl, ResultType, Overflow, Result, Sum);
+  SDValue Zero = DAG.getConstant(0, dl, ResultType);
+
+  if (Opcode == ISD::SADDSAT) {
+    // SatMax -> Overflow && Sum < 0
+    // SatMin -> Overflow && Sum > 0
+    APInt MinVal = APInt::getSignedMinValue(BitWidth);
+    APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
+    SDValue SatMin = DAG.getConstant(MinVal, dl, ResultType);
+    SDValue SatMax = DAG.getConstant(MaxVal, dl, ResultType);
+    SDValue SumNeg = DAG.getSetCC(dl, BoolVT, Sum, Zero, ISD::SETLT);
+    Result = DAG.getSelect(dl, ResultType, SumNeg, SatMax, SatMin);
+    return DAG.getSelect(dl, ResultType, Overflow, Result, Sum);
+  } else {
+    // Just need to check overflow for SatMax.
+    APInt MaxVal = APInt::getMaxValue(BitWidth);
+    SDValue SatMax = DAG.getConstant(MaxVal, dl, ResultType);
+    return DAG.getSelect(dl, ResultType, Overflow, SatMax, Sum);
+  }
 }
diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp
index ddd5fc1df75..09c5b527956 100644
--- a/lib/CodeGen/TargetLoweringBase.cpp
+++ b/lib/CodeGen/TargetLoweringBase.cpp
@@ -611,6 +611,7 @@ void TargetLoweringBase::initActions() {
     setOperationAction(ISD::UMAX, VT, Expand);
     setOperationAction(ISD::ABS, VT, Expand);
     setOperationAction(ISD::SADDSAT, VT, Expand);
+    setOperationAction(ISD::UADDSAT, VT, Expand);
 
     // Overflow operations default to expand
     setOperationAction(ISD::SADDO, VT, Expand);
diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp
index dc6c1f663d6..ae578c91ae8 100644
--- a/lib/IR/Verifier.cpp
+++ b/lib/IR/Verifier.cpp
@@ -4474,13 +4474,16 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) {
 
     break;
   }
-  case Intrinsic::sadd_sat: {
+  case Intrinsic::sadd_sat:
+  case Intrinsic::uadd_sat: {
     Value *Op1 = CS.getArgOperand(0);
     Value *Op2 = CS.getArgOperand(1);
-    Assert(Op1->getType()->isIntOrIntVectorTy(),
-           "first operand of sadd_sat must be an int type or vector of ints");
-    Assert(Op2->getType()->isIntOrIntVectorTy(),
-           "second operand of sadd_sat must be an int type or vector of ints");
+    Assert(
+        Op1->getType()->isIntOrIntVectorTy(),
+        "first operand of [us]add_sat must be an int type or vector of ints");
+    Assert(
+        Op2->getType()->isIntOrIntVectorTy(),
+        "second operand of [us]add_sat must be an int type or vector of ints");
     break;
   }
   };
diff --git a/test/CodeGen/X86/uadd_sat.ll b/test/CodeGen/X86/uadd_sat.ll
new file mode 100644
index 00000000000..f72d036288d
--- /dev/null
+++ b/test/CodeGen/X86/uadd_sat.ll
@@ -0,0 +1,157 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux | FileCheck %s
+; RUN: llc < %s -mcpu=generic -mtriple=i686 -mattr=cmov | FileCheck %s --check-prefix=CHECK32
+
+declare  i4  @llvm.uadd.sat.i4   (i4,  i4)
+declare  i32 @llvm.uadd.sat.i32  (i32, i32)
+declare  i64 @llvm.uadd.sat.i64  (i64, i64)
+declare  <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32>, <4 x i32>)
+
+define i32 @func(i32 %x, i32 %y) {
+; CHECK-LABEL: func:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addl %esi, %edi
+; CHECK-NEXT:    movl $-1, %eax
+; CHECK-NEXT:    cmovael %edi, %eax
+; CHECK-NEXT:    retq
+;
+; CHECK32-LABEL: func:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK32-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; CHECK32-NEXT:    movl $-1, %eax
+; CHECK32-NEXT:    cmovael %ecx, %eax
+; CHECK32-NEXT:    retl
+  %tmp = call i32 @llvm.uadd.sat.i32(i32 %x, i32 %y);
+  ret i32 %tmp;
+}
+
+define i64 @func2(i64 %x, i64 %y) {
+; CHECK-LABEL: func2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addq %rsi, %rdi
+; CHECK-NEXT:    movq $-1, %rax
+; CHECK-NEXT:    cmovaeq %rdi, %rax
+; CHECK-NEXT:    retq
+;
+; CHECK32-LABEL: func2:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK32-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
+; CHECK32-NEXT:    movl $-1, %ecx
+; CHECK32-NEXT:    cmovbl %ecx, %edx
+; CHECK32-NEXT:    cmovbl %ecx, %eax
+; CHECK32-NEXT:    retl
+  %tmp = call i64 @llvm.uadd.sat.i64(i64 %x, i64 %y);
+  ret i64 %tmp;
+}
+
+define i4 @func3(i4 %x, i4 %y) {
+; CHECK-LABEL: func3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    shlb $4, %sil
+; CHECK-NEXT:    shlb $4, %dil
+; CHECK-NEXT:    addb %sil, %dil
+; CHECK-NEXT:    movb $-1, %al
+; CHECK-NEXT:    jb .LBB2_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:  .LBB2_2:
+; CHECK-NEXT:    shrb $4, %al
+; CHECK-NEXT:    retq
+;
+; CHECK32-LABEL: func3:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; CHECK32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; CHECK32-NEXT:    shlb $4, %al
+; CHECK32-NEXT:    shlb $4, %cl
+; CHECK32-NEXT:    addb %al, %cl
+; CHECK32-NEXT:    movb $-1, %al
+; CHECK32-NEXT:    jb .LBB2_2
+; CHECK32-NEXT:  # %bb.1:
+; CHECK32-NEXT:    movl %ecx, %eax
+; CHECK32-NEXT:  .LBB2_2:
+; CHECK32-NEXT:    shrb $4, %al
+; CHECK32-NEXT:    retl
+  %tmp = call i4 @llvm.uadd.sat.i4(i4 %x, i4 %y);
+  ret i4 %tmp;
+}
+
+define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vec:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
+; CHECK-NEXT:    movd %xmm2, %eax
+; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; CHECK-NEXT:    movd %xmm2, %ecx
+; CHECK-NEXT:    addl %eax, %ecx
+; CHECK-NEXT:    movl $-1, %eax
+; CHECK-NEXT:    cmovbl %eax, %ecx
+; CHECK-NEXT:    movd %ecx, %xmm2
+; CHECK-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; CHECK-NEXT:    movd %xmm3, %ecx
+; CHECK-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; CHECK-NEXT:    movd %xmm3, %edx
+; CHECK-NEXT:    addl %ecx, %edx
+; CHECK-NEXT:    cmovbl %eax, %edx
+; CHECK-NEXT:    movd %edx, %xmm3
+; CHECK-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; CHECK-NEXT:    movd %xmm1, %ecx
+; CHECK-NEXT:    movd %xmm0, %edx
+; CHECK-NEXT:    addl %ecx, %edx
+; CHECK-NEXT:    cmovbl %eax, %edx
+; CHECK-NEXT:    movd %edx, %xmm2
+; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; CHECK-NEXT:    movd %xmm1, %ecx
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; CHECK-NEXT:    movd %xmm0, %edx
+; CHECK-NEXT:    addl %ecx, %edx
+; CHECK-NEXT:    cmovbl %eax, %edx
+; CHECK-NEXT:    movd %edx, %xmm0
+; CHECK-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; CHECK-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; CHECK-NEXT:    movdqa %xmm2, %xmm0
+; CHECK-NEXT:    retq
+;
+; CHECK32-LABEL: vec:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    pushl %ebx
+; CHECK32-NEXT:    .cfi_def_cfa_offset 8
+; CHECK32-NEXT:    pushl %edi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 12
+; CHECK32-NEXT:    pushl %esi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK32-NEXT:    .cfi_offset %esi, -16
+; CHECK32-NEXT:    .cfi_offset %edi, -12
+; CHECK32-NEXT:    .cfi_offset %ebx, -8
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; CHECK32-NEXT:    addl {{[0-9]+}}(%esp), %edi
+; CHECK32-NEXT:    movl $-1, %ebx
+; CHECK32-NEXT:    cmovbl %ebx, %edi
+; CHECK32-NEXT:    addl {{[0-9]+}}(%esp), %esi
+; CHECK32-NEXT:    cmovbl %ebx, %esi
+; CHECK32-NEXT:    addl {{[0-9]+}}(%esp), %edx
+; CHECK32-NEXT:    cmovbl %ebx, %edx
+; CHECK32-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; CHECK32-NEXT:    cmovbl %ebx, %ecx
+; CHECK32-NEXT:    movl %ecx, 12(%eax)
+; CHECK32-NEXT:    movl %edx, 8(%eax)
+; CHECK32-NEXT:    movl %esi, 4(%eax)
+; CHECK32-NEXT:    movl %edi, (%eax)
+; CHECK32-NEXT:    popl %esi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 12
+; CHECK32-NEXT:    popl %edi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 8
+; CHECK32-NEXT:    popl %ebx
+; CHECK32-NEXT:    .cfi_def_cfa_offset 4
+; CHECK32-NEXT:    retl $4
+  %tmp = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %x, <4 x i32> %y);
+  ret <4 x i32> %tmp;
+}
-- 
GitLab


From c06413bb743d30362442ad61c04dbfbb7b662e36 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Mon, 22 Oct 2018 23:14:55 +0000
Subject: [PATCH 0416/1116] [X86] Remove unused entries from the X86ProcFamily
 enum. Add a note to discourage creation of new enum entries.

As we've learned multiple times, a coarse grained enum like this is not scalable and we should be migrating away from it.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344972 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86.td         | 21 ---------------------
 lib/Target/X86/X86Subtarget.h |  9 ++-------
 2 files changed, 2 insertions(+), 28 deletions(-)

diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index d1263a1fb45..6bf6aae95c0 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -460,22 +460,8 @@ def ProcIntelGLP  : SubtargetFeature<"glp", "X86ProcFamily", "IntelGLP",
                     "Intel Goldmont Plus processors">;
 def ProcIntelTRM  : SubtargetFeature<"tremont", "X86ProcFamily", "IntelTRM",
                     "Intel Tremont processors">;
-def ProcIntelHSW  : SubtargetFeature<"haswell", "X86ProcFamily",
-                    "IntelHaswell", "Intel Haswell processors">;
-def ProcIntelBDW  : SubtargetFeature<"broadwell", "X86ProcFamily",
-                    "IntelBroadwell", "Intel Broadwell processors">;
-def ProcIntelSKL  : SubtargetFeature<"skylake", "X86ProcFamily",
-                    "IntelSkylake", "Intel Skylake processors">;
 def ProcIntelKNL  : SubtargetFeature<"knl", "X86ProcFamily",
                     "IntelKNL", "Intel Knights Landing processors">;
-def ProcIntelSKX  : SubtargetFeature<"skx", "X86ProcFamily",
-                    "IntelSKX", "Intel Skylake Server processors">;
-def ProcIntelCNL  : SubtargetFeature<"cannonlake", "X86ProcFamily",
-                    "IntelCannonlake", "Intel Cannonlake processors">;
-def ProcIntelICL  : SubtargetFeature<"icelake-client", "X86ProcFamily",
-                    "IntelIcelakeClient", "Intel Icelake processors">;
-def ProcIntelICX  : SubtargetFeature<"icelake-server", "X86ProcFamily",
-                    "IntelIcelakeServer", "Intel Icelake Server processors">;
 
 class Proc<string Name, list<SubtargetFeature> Features>
  : ProcessorModel<Name, GenericModel, Features>;
@@ -787,7 +773,6 @@ def HSWFeatures : ProcessorFeatures<IVBFeatures.Value, [
 
 class HaswellProc<string Name> : ProcModel<Name, HaswellModel,
                                            HSWFeatures.Value, [
-  ProcIntelHSW,
   FeaturePOPCNTFalseDeps,
   FeatureLZCNTFalseDeps
 ]>;
@@ -801,7 +786,6 @@ def BDWFeatures : ProcessorFeatures<HSWFeatures.Value, [
 ]>;
 class BroadwellProc<string Name> : ProcModel<Name, BroadwellModel,
                                              BDWFeatures.Value, [
-  ProcIntelBDW,
   FeaturePOPCNTFalseDeps,
   FeatureLZCNTFalseDeps
 ]>;
@@ -818,7 +802,6 @@ def SKLFeatures : ProcessorFeatures<BDWFeatures.Value, [
 
 class SkylakeClientProc<string Name> : ProcModel<Name, SkylakeClientModel,
                                                  SKLFeatures.Value, [
-  ProcIntelSKL,
   FeatureHasFastGather,
   FeaturePOPCNTFalseDeps,
   FeatureSGX
@@ -873,7 +856,6 @@ def SKXFeatures : ProcessorFeatures<SKLFeatures.Value, [
 
 class SkylakeServerProc<string Name> : ProcModel<Name, SkylakeServerModel,
                                                  SKXFeatures.Value, [
-  ProcIntelSKX,
   FeatureHasFastGather,
   FeaturePOPCNTFalseDeps
 ]>;
@@ -895,7 +877,6 @@ def CNLFeatures : ProcessorFeatures<SKLFeatures.Value, [
 
 class CannonlakeProc<string Name> : ProcModel<Name, SkylakeServerModel,
                                               CNLFeatures.Value, [
-  ProcIntelCNL,
   FeatureHasFastGather
 ]>;
 def : CannonlakeProc<"cannonlake">;
@@ -914,14 +895,12 @@ def ICLFeatures : ProcessorFeatures<CNLFeatures.Value, [
 
 class IcelakeClientProc<string Name> : ProcModel<Name, SkylakeServerModel,
                                                  ICLFeatures.Value, [
-  ProcIntelICL,
   FeatureHasFastGather
 ]>;
 def : IcelakeClientProc<"icelake-client">;
 
 class IcelakeServerProc<string Name> : ProcModel<Name, SkylakeServerModel,
                                                  ICLFeatures.Value, [
-  ProcIntelICX,
   FeaturePCONFIG,
   FeatureWBNOINVD,
   FeatureHasFastGather
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index 0df3058c374..47b51376e5e 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -52,6 +52,8 @@ enum Style {
 
 class X86Subtarget final : public X86GenSubtargetInfo {
 public:
+  // NOTE: Do not add anything new to this list. Coarse, CPU name based flags
+  // are not a good idea. We should be migrating away from these.
   enum X86ProcFamilyEnum {
     Others,
     IntelAtom,
@@ -59,14 +61,7 @@ public:
     IntelGLM,
     IntelGLP,
     IntelTRM,
-    IntelHaswell,
-    IntelBroadwell,
-    IntelSkylake,
     IntelKNL,
-    IntelSKX,
-    IntelCannonlake,
-    IntelIcelakeClient,
-    IntelIcelakeServer,
   };
 
 protected:
-- 
GitLab


From 4d0646db972a7c2fb2ea0a86bbc153d392c31a74 Mon Sep 17 00:00:00 2001
From: Saleem Abdulrasool <compnerd@compnerd.org>
Date: Mon, 22 Oct 2018 23:34:24 +0000
Subject: [PATCH 0417/1116] X86: fix a comment copy-paste issue (NFC)

The comment was copy-pasted but not updated.  NFC.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344973 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86MCInstLower.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index b5fd9f4a785..74fe85851cb 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -527,7 +527,7 @@ ReSimplify:
   }
 
   case X86::CLEANUPRET: {
-    // Replace CATCHRET with the appropriate RET.
+    // Replace CLEANUPRET with the appropriate RET.
     OutMI = MCInst();
     OutMI.setOpcode(getRetOpcode(AsmPrinter.getSubtarget()));
     break;
-- 
GitLab


From d248358db82ed2344221e4a37c2fdc74c5b1f6bf Mon Sep 17 00:00:00 2001
From: Wouter van Oortmerssen <aardappel@gmail.com>
Date: Tue, 23 Oct 2018 00:12:49 +0000
Subject: [PATCH 0418/1116] [WebAssembly] Added test for inline assembly
 roundtrip.

Summary:
Due to previous work to make WebAssembly MC by default stack-only
inline assembly now "just works" (previously it didn't since it had
no way to know types of registers), so no further work required.

So far we only have tests (in inline-asm.ll) which test with
non-existing instructions, so this adds a test that roundtrips
both the inline assembly and its surrounding code thru the assembler.

Reviewers: dschuff, sunfish

Subscribers: sbc100, jgravelle-google, eraman, aheejin, llvm-commits

Differential Revision: https://reviews.llvm.org/D52914

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344977 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../WebAssembly/inline-asm-roundtrip.ll       | 43 +++++++++++++++++++
 1 file changed, 43 insertions(+)
 create mode 100644 test/CodeGen/WebAssembly/inline-asm-roundtrip.ll

diff --git a/test/CodeGen/WebAssembly/inline-asm-roundtrip.ll b/test/CodeGen/WebAssembly/inline-asm-roundtrip.ll
new file mode 100644
index 00000000000..7fcc3cf276a
--- /dev/null
+++ b/test/CodeGen/WebAssembly/inline-asm-roundtrip.ll
@@ -0,0 +1,43 @@
+; RUN: llc < %s | llvm-mc -triple=wasm32-unknown-unknown | FileCheck --match-full-lines %s
+
+; Test basic inline assembly can actually be assembled by the assembler.
+
+; .ll code below is the result of this code run thru
+; clang -target wasm32-unknown-unknown-wasm -O2 -S -emit-llvm test.c
+
+; int main(int argc, const char *argv[]) {
+;   int src = 1;
+;   int dst;
+;   asm ("i32.const\t2\n"
+;        "\tget_local\t%1\n"
+;        "\ti32.add\n"
+;        "\tset_local\t%0"
+;        : "=r" (dst)
+;        : "r" (src));
+;   return dst != 3;
+; }
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+; CHECK-LABEL: main:
+; CHECK-NEXT:	.param  	i32, i32
+; CHECK-NEXT:	.local  	i32
+; CHECK-NEXT:	i32.const	1
+; CHECK-NEXT:	set_local	[[SRC:[0-9]+]]
+; CHECK-NEXT:	i32.const	2
+; CHECK-NEXT:	get_local	[[SRC]]
+; CHECK-NEXT:	i32.add
+; CHECK-NEXT:	set_local	[[DST:[0-9]+]]
+; CHECK-NEXT:	get_local	[[DST]]
+; CHECK-NEXT:	i32.const	3
+; CHECK-NEXT:	i32.ne
+
+define i32 @main(i32 %argc, i8** nocapture readnone %argv) #0 {
+entry:
+  %0 = tail call i32 asm "i32.const\092\0A\09get_local\09$1\0A\09i32.add\0A\09set_local\09$0", "=r,r"(i32 1) #1
+  %cmp = icmp ne i32 %0, 3
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
-- 
GitLab


From 8ad683cced15373e16b79ae618af7abf4eaadd02 Mon Sep 17 00:00:00 2001
From: Heejin Ahn <aheejin@gmail.com>
Date: Tue, 23 Oct 2018 00:28:14 +0000
Subject: [PATCH 0419/1116] [WebAssembly] Fix assembly printing of br_table

Summary: In `br_table's stack version asm string, \t was missing.

Reviewers: aardappel

Subscribers: dschuff, sbc100, jgravelle-google, sunfish, llvm-commits

Differential Revision: https://reviews.llvm.org/D53516

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344981 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../WebAssembly/WebAssemblyInstrControl.td    |  4 +--
 test/CodeGen/WebAssembly/stack-insts.ll       | 32 +++++++++++++++++++
 2 files changed, 34 insertions(+), 2 deletions(-)
 create mode 100644 test/CodeGen/WebAssembly/stack-insts.ll

diff --git a/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/lib/Target/WebAssembly/WebAssemblyInstrControl.td
index be9cdc59a69..0af94ef8755 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrControl.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrControl.td
@@ -51,7 +51,7 @@ def BR_TABLE_I32 : NI<(outs), (ins I32:$index, variable_ops),
 let BaseName = "BR_TABLE_I32" in
 def BR_TABLE_I32_S : NI<(outs), (ins variable_ops),
                         [], "true",
-                        "br_table", 0x0e> {
+                        "br_table \t", 0x0e> {
   let TSFlags{0} = 1;
   let TSFlags{1} = 1;
 }
@@ -65,7 +65,7 @@ def BR_TABLE_I64 : NI<(outs), (ins I64:$index, variable_ops),
 let BaseName = "BR_TABLE_I64" in
 def BR_TABLE_I64_S : NI<(outs), (ins variable_ops),
                         [], "true",
-                        "br_table"> {
+                        "br_table \t"> {
   let TSFlags{0} = 1;
   let TSFlags{1} = 1;
 }
diff --git a/test/CodeGen/WebAssembly/stack-insts.ll b/test/CodeGen/WebAssembly/stack-insts.ll
new file mode 100644
index 00000000000..0876b4a4279
--- /dev/null
+++ b/test/CodeGen/WebAssembly/stack-insts.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-block-placement -verify-machineinstrs | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+declare void @foo0()
+declare void @foo1()
+
+; Tests if br_table is printed correctly with a tab.
+; CHECK-LABEL: test0:
+; CHECK-NOT: br_table0, 1, 0, 1, 0
+; CHECK: br_table 0, 1, 0, 1, 0
+define void @test0(i32 %n) {
+entry:
+  switch i32 %n, label %sw.epilog [
+    i32 0, label %sw.bb
+    i32 1, label %sw.bb.1
+    i32 2, label %sw.bb
+    i32 3, label %sw.bb.1
+  ]
+
+sw.bb:                                            ; preds = %entry, %entry
+  tail call void @foo0()
+  br label %sw.epilog
+
+sw.bb.1:                                          ; preds = %entry, %entry
+  tail call void @foo1()
+  br label %sw.epilog
+
+sw.epilog:                                        ; preds = %entry, %sw.bb, %sw.bb.1
+  ret void
+}
-- 
GitLab


From 956dada02800e779f422ac788167e1485881aeaf Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Tue, 23 Oct 2018 00:32:22 +0000
Subject: [PATCH 0420/1116] [dsymutil] Improve error reporting when we cannot
 create output file.

Before this patch we were returning an empty string in case we couldn't
create the output file. Now we return an expected string so we can
return and print the proper issue. We now return errors instead of bools
and defer printing to the call site.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344983 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/dsymutil/dsymutil.cpp | 73 +++++++++++++++++++++----------------
 1 file changed, 42 insertions(+), 31 deletions(-)

diff --git a/tools/dsymutil/dsymutil.cpp b/tools/dsymutil/dsymutil.cpp
index c0e6d505941..5fe40678ca9 100644
--- a/tools/dsymutil/dsymutil.cpp
+++ b/tools/dsymutil/dsymutil.cpp
@@ -7,9 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This program is a utility that aims to be a dropin replacement for
-// Darwin's dsymutil.
-//
+// This program is a utility that aims to be a dropin replacement for Darwin's
+// dsymutil.
 //===----------------------------------------------------------------------===//
 
 #include "dsymutil.h"
@@ -165,20 +164,18 @@ static opt<bool>
                        desc("Embed warnings in the linked DWARF debug info."),
                        cat(DsymCategory));
 
-static bool createPlistFile(llvm::StringRef Bin, llvm::StringRef BundleRoot) {
+static Error createPlistFile(llvm::StringRef Bin, llvm::StringRef BundleRoot) {
   if (NoOutput)
-    return true;
+    return Error::success();
 
   // Create plist file to write to.
   llvm::SmallString<128> InfoPlist(BundleRoot);
   llvm::sys::path::append(InfoPlist, "Contents/Info.plist");
   std::error_code EC;
   llvm::raw_fd_ostream PL(InfoPlist, EC, llvm::sys::fs::F_Text);
-  if (EC) {
-    WithColor::error() << "cannot create plist file " << InfoPlist << ": "
-                       << EC.message() << '\n';
-    return false;
-  }
+  if (EC)
+    return make_error<StringError>(
+        "cannot create Plist: " + toString(errorCodeToError(EC)), EC);
 
   CFBundleInfo BI = getBundleInfo(Bin);
 
@@ -230,22 +227,21 @@ static bool createPlistFile(llvm::StringRef Bin, llvm::StringRef BundleRoot) {
      << "</plist>\n";
 
   PL.close();
-  return true;
+  return Error::success();
 }
 
-static bool createBundleDir(llvm::StringRef BundleBase) {
+static Error createBundleDir(llvm::StringRef BundleBase) {
   if (NoOutput)
-    return true;
+    return Error::success();
 
   llvm::SmallString<128> Bundle(BundleBase);
   llvm::sys::path::append(Bundle, "Contents", "Resources", "DWARF");
-  if (std::error_code EC = create_directories(Bundle.str(), true,
-                                              llvm::sys::fs::perms::all_all)) {
-    WithColor::error() << "cannot create directory " << Bundle << ": "
-                       << EC.message() << "\n";
-    return false;
-  }
-  return true;
+  if (std::error_code EC =
+          create_directories(Bundle.str(), true, llvm::sys::fs::perms::all_all))
+    return make_error<StringError>(
+        "cannot create bundle: " + toString(errorCodeToError(EC)), EC);
+
+  return Error::success();
 }
 
 static bool verify(llvm::StringRef OutputFile, llvm::StringRef Arch) {
@@ -257,7 +253,7 @@ static bool verify(llvm::StringRef OutputFile, llvm::StringRef Arch) {
 
   Expected<OwningBinary<Binary>> BinOrErr = createBinary(OutputFile);
   if (!BinOrErr) {
-    errs() << OutputFile << ": " << toString(BinOrErr.takeError());
+    WithColor::error() << OutputFile << ": " << toString(BinOrErr.takeError());
     return false;
   }
 
@@ -276,7 +272,7 @@ static bool verify(llvm::StringRef OutputFile, llvm::StringRef Arch) {
   return false;
 }
 
-static std::string getOutputFileName(llvm::StringRef InputFile) {
+static Expected<std::string> getOutputFileName(llvm::StringRef InputFile) {
   // When updating, do in place replacement.
   if (OutputFileOpt.empty() && Update)
     return InputFile;
@@ -305,8 +301,10 @@ static std::string getOutputFileName(llvm::StringRef InputFile) {
   llvm::SmallString<128> BundleDir(OutputFileOpt);
   if (BundleDir.empty())
     BundleDir = DwarfFile + ".dSYM";
-  if (!createBundleDir(BundleDir) || !createPlistFile(DwarfFile, BundleDir))
-    return "";
+  if (auto E = createBundleDir(BundleDir))
+    return std::move(E);
+  if (auto E = createPlistFile(DwarfFile, BundleDir))
+    return std::move(E);
 
   llvm::sys::path::append(BundleDir, "Contents", "Resources", "DWARF",
                           llvm::sys::path::filename(DwarfFile));
@@ -521,13 +519,20 @@ int main(int argc, char **argv) {
       // Using a std::shared_ptr rather than std::unique_ptr because move-only
       // types don't work with std::bind in the ThreadPool implementation.
       std::shared_ptr<raw_fd_ostream> OS;
-      std::string OutputFile = getOutputFileName(InputFile);
+
+      Expected<std::string> OutputFileOrErr = getOutputFileName(InputFile);
+      if (!OutputFileOrErr) {
+        WithColor::error() << toString(OutputFileOrErr.takeError());
+        return 1;
+      }
+
+      std::string OutputFile = *OutputFileOrErr;
       if (NeedsTempFiles) {
         TempFiles.emplace_back(Map->getTriple().getArchName().str());
 
         auto E = TempFiles.back().createTempFile();
         if (E) {
-          errs() << toString(std::move(E));
+          WithColor::error() << toString(std::move(E));
           return 1;
         }
 
@@ -540,7 +545,7 @@ int main(int argc, char **argv) {
         OS = std::make_shared<raw_fd_ostream>(NoOutput ? "-" : OutputFile, EC,
                                               sys::fs::F_None);
         if (EC) {
-          errs() << OutputFile << ": " << EC.message();
+          WithColor::error() << OutputFile << ": " << EC.message();
           return 1;
         }
       }
@@ -567,10 +572,16 @@ int main(int argc, char **argv) {
     if (!AllOK)
       return 1;
 
-    if (NeedsTempFiles &&
-        !MachOUtils::generateUniversalBinary(
-            TempFiles, getOutputFileName(InputFile), *OptionsOrErr, SDKPath))
-      return 1;
+    if (NeedsTempFiles) {
+      Expected<std::string> OutputFileOrErr = getOutputFileName(InputFile);
+      if (!OutputFileOrErr) {
+        WithColor::error() << toString(OutputFileOrErr.takeError());
+        return 1;
+      }
+      if (!MachOUtils::generateUniversalBinary(TempFiles, *OutputFileOrErr,
+                                               *OptionsOrErr, SDKPath))
+        return 1;
+    }
   }
 
   return 0;
-- 
GitLab


From ac0745ecd05964bb0760bac16b73e83da6cbc68e Mon Sep 17 00:00:00 2001
From: Kostya Serebryany <kcc@google.com>
Date: Tue, 23 Oct 2018 00:50:40 +0000
Subject: [PATCH 0421/1116] [hwasan] add stack frame descriptions.

Summary:
At compile-time, create an array of {PC,HumanReadableStackFrameDescription}
for every function that has an instrumented frame, and pass this array
to the run-time at the module-init time.
Similar to how we handle pc-table in SanitizerCoverage.
The run-time is dummy, will add the actual logic in later commits.

Reviewers: morehouse, eugenis

Reviewed By: eugenis

Subscribers: srhines, llvm-commits, kubamracek

Differential Revision: https://reviews.llvm.org/D53227

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344985 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../Instrumentation/HWAddressSanitizer.cpp    | 76 ++++++++++++++++++-
 .../HWAddressSanitizer/basic.ll               |  1 +
 .../HWAddressSanitizer/frame-descriptor.ll    | 27 +++++++
 .../HWAddressSanitizer/with-calls.ll          |  1 +
 4 files changed, 104 insertions(+), 1 deletion(-)
 create mode 100644 test/Instrumentation/HWAddressSanitizer/frame-descriptor.ll

diff --git a/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index 63bd8ee35c6..34a66296f6f 100644
--- a/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -44,6 +44,7 @@
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
 #include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include <sstream>
 
 using namespace llvm;
 
@@ -146,6 +147,11 @@ static cl::opt<bool>
                          cl::desc("Record stack frames with tagged allocations "
                                   "in a thread-local ring buffer"),
                          cl::Hidden, cl::init(true));
+static cl::opt<bool>
+    ClCreateFrameDescriptions("hwasan-create-frame-descriptions",
+                              cl::desc("create static frame descriptions"),
+                              cl::Hidden, cl::init(true));
+
 namespace {
 
 /// An instrumentation pass implementing detection of addressability bugs
@@ -198,8 +204,27 @@ public:
 
 private:
   LLVMContext *C;
+  std::string CurModuleUniqueId;
   Triple TargetTriple;
 
+  // Frame description is a way to pass names/sizes of local variables
+  // to the run-time w/o adding extra executable code in every function.
+  // We do this by creating a separate section with {PC,Descr} pairs and passing
+  // the section beg/end to __hwasan_init_frames() at module init time.
+  std::string createFrameString(ArrayRef<AllocaInst*> Allocas);
+  void createFrameGlobal(Function &F, const std::string &FrameString);
+  // Get the section name for frame descriptions. Currently ELF-only.
+  const char *getFrameSection() { return "__hwasan_frames"; }
+  const char *getFrameSectionBeg() { return  "__start___hwasan_frames"; }
+  const char *getFrameSectionEnd() { return  "__stop___hwasan_frames"; }
+  GlobalVariable *createFrameSectionBound(Module &M, Type *Ty,
+                                          const char *Name) {
+    auto GV = new GlobalVariable(M, Ty, false, GlobalVariable::ExternalLinkage,
+                                 nullptr, Name);
+    GV->setVisibility(GlobalValue::HiddenVisibility);
+    return GV;
+  }
+
   /// This struct defines the shadow mapping using the rule:
   ///   shadow = (mem >> Scale) + Offset.
   /// If InGlobal is true, then
@@ -207,7 +232,7 @@ private:
   ///   shadow = (mem >> Scale) + &__hwasan_shadow
   /// If InTls is true, then
   ///   extern char *__hwasan_tls;
-  ///   shadow = (mem >> Scale) + align_up(__hwasan_shadow, kShadowBaseAlignment)
+  ///   shadow = (mem>>Scale) + align_up(__hwasan_shadow, kShadowBaseAlignment)
   struct ShadowMapping {
     int Scale;
     uint64_t Offset;
@@ -271,6 +296,7 @@ bool HWAddressSanitizer::doInitialization(Module &M) {
   Mapping.init(TargetTriple);
 
   C = &(M.getContext());
+  CurModuleUniqueId = getUniqueModuleId(&M);
   IRBuilder<> IRB(*C);
   IntptrTy = IRB.getIntPtrTy(DL);
   Int8PtrTy = IRB.getInt8PtrTy();
@@ -285,6 +311,21 @@ bool HWAddressSanitizer::doInitialization(Module &M) {
                                             /*InitArgs=*/{});
     appendToGlobalCtors(M, HwasanCtorFunction, 0);
   }
+
+  // Create a call to __hwasan_init_frames.
+  if (HwasanCtorFunction) {
+    // Create a dummy frame description for the CTOR function.
+    // W/o it we would have to create the call to __hwasan_init_frames after
+    // all functions are instrumented (i.e. need to have a ModulePass).
+    createFrameGlobal(*HwasanCtorFunction, "");
+    IRBuilder<> IRBCtor(HwasanCtorFunction->getEntryBlock().getTerminator());
+    IRBCtor.CreateCall(
+        declareSanitizerInitFunction(M, "__hwasan_init_frames",
+                                     {Int8PtrTy, Int8PtrTy}),
+        {createFrameSectionBound(M, Int8Ty, getFrameSectionBeg()),
+         createFrameSectionBound(M, Int8Ty, getFrameSectionEnd())});
+  }
+
   if (!TargetTriple.isAndroid())
     appendToCompilerUsed(
         M, ThreadPtrGlobal = new GlobalVariable(
@@ -676,6 +717,36 @@ Value *HWAddressSanitizer::getHwasanThreadSlotPtr(IRBuilder<> &IRB, Type *Ty) {
   return nullptr;
 }
 
+// Creates a string with a description of the stack frame (set of Allocas).
+// The string is intended to be human readable.
+// The current form is: Size1 Name1; Size2 Name2; ...
+std::string
+HWAddressSanitizer::createFrameString(ArrayRef<AllocaInst *> Allocas) {
+  std::ostringstream Descr;
+  for (auto AI : Allocas)
+    Descr << getAllocaSizeInBytes(*AI) << " " <<  AI->getName().str() << "; ";
+  return Descr.str();
+}
+
+// Creates a global in the frame section which consists of two pointers:
+// the function PC and the frame string constant.
+void HWAddressSanitizer::createFrameGlobal(Function &F,
+                                           const std::string &FrameString) {
+  Module &M = *F.getParent();
+  auto DescrGV = createPrivateGlobalForString(M, FrameString, true);
+  auto PtrPairTy = StructType::get(F.getType(), DescrGV->getType());
+  auto GV = new GlobalVariable(
+      M, PtrPairTy, /*isConstantGlobal*/ true, GlobalVariable::PrivateLinkage,
+      ConstantStruct::get(PtrPairTy, (Constant *)&F, (Constant *)DescrGV),
+      "__hwasan");
+  GV->setSection(getFrameSection());
+  appendToCompilerUsed(M, GV);
+  // Put GV into the F's Comadat so that if F is deleted GV can be deleted too.
+  if (&F != HwasanCtorFunction)
+    if (auto Comdat = GetOrCreateFunctionComdat(F, CurModuleUniqueId))
+      GV->setComdat(Comdat);
+}
+
 Value *HWAddressSanitizer::emitPrologue(IRBuilder<> &IRB,
                                         bool WithFrameRecord) {
   if (!Mapping.InTls)
@@ -838,6 +909,9 @@ bool HWAddressSanitizer::runOnFunction(Function &F) {
   if (AllocasToInstrument.empty() && ToInstrument.empty())
     return false;
 
+  if (ClCreateFrameDescriptions && !AllocasToInstrument.empty())
+    createFrameGlobal(F, createFrameString(AllocasToInstrument));
+
   initializeCallbacks(*F.getParent());
 
   assert(!LocalDynamicShadow);
diff --git a/test/Instrumentation/HWAddressSanitizer/basic.ll b/test/Instrumentation/HWAddressSanitizer/basic.ll
index e8010992945..8253016d97b 100644
--- a/test/Instrumentation/HWAddressSanitizer/basic.ll
+++ b/test/Instrumentation/HWAddressSanitizer/basic.ll
@@ -354,5 +354,6 @@ entry:
 
 ; CHECK:      define internal void @hwasan.module_ctor() {
 ; CHECK-NEXT:   call void @__hwasan_init()
+; CHECK-NEXT:   call void @__hwasan_init_frames(
 ; CHECK-NEXT:   ret void
 ; CHECK-NEXT: }
diff --git a/test/Instrumentation/HWAddressSanitizer/frame-descriptor.ll b/test/Instrumentation/HWAddressSanitizer/frame-descriptor.ll
new file mode 100644
index 00000000000..3fd4197d3bb
--- /dev/null
+++ b/test/Instrumentation/HWAddressSanitizer/frame-descriptor.ll
@@ -0,0 +1,27 @@
+; Test frame descriptors
+;
+; RUN: opt < %s -hwasan -S | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-android"
+
+declare void @use32(i32*, i64*)
+
+define void @test_alloca() sanitize_hwaddress {
+entry:
+  %XYZ = alloca i32, align 4
+  %ABC = alloca i64, align 4
+  call void @use32(i32* nonnull %XYZ, i64 *nonnull %ABC)
+  ret void
+}
+
+; CHECK: @[[STR:[0-9]*]] = private unnamed_addr constant [15 x i8] c"4 XYZ; 8 ABC; \00", align 1
+; CHECK: private constant { void ()*, [15 x i8]* } { void ()* @test_alloca, [15 x i8]* @[[STR]] }, section "__hwasan_frames", comdat($test_alloca)
+
+; CHECK-LABEL: @test_alloca(
+; CHECK: ret void
+
+; CHECK-LABEL: @hwasan.module_ctor
+; CHECK: call void @__hwasan_init_frames(i8* @__start___hwasan_frames, i8* @__stop___hwasan_frames)
+; CHECK: ret void
+
diff --git a/test/Instrumentation/HWAddressSanitizer/with-calls.ll b/test/Instrumentation/HWAddressSanitizer/with-calls.ll
index 768434c5b55..8d6068c3438 100644
--- a/test/Instrumentation/HWAddressSanitizer/with-calls.ll
+++ b/test/Instrumentation/HWAddressSanitizer/with-calls.ll
@@ -199,5 +199,6 @@ entry:
 
 ; CHECK:      define internal void @hwasan.module_ctor() {
 ; CHECK-NEXT:   call void @__hwasan_init()
+; CHECK-NEXT:   call void @__hwasan_init_frames(
 ; CHECK-NEXT:   ret void
 ; CHECK-NEXT: }
-- 
GitLab


From abbd80eecc4475adbf7a3b352b49f4862aeb611b Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Tue, 23 Oct 2018 01:36:31 +0000
Subject: [PATCH 0422/1116] [ORC] Dump flags for JITDylib symbol table entries.

This can help when debugging flag-specific symbol table issues.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344993 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/ExecutionEngine/Orc/Core.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/ExecutionEngine/Orc/Core.cpp b/lib/ExecutionEngine/Orc/Core.cpp
index 5e31e448c7d..6c7a952c648 100644
--- a/lib/ExecutionEngine/Orc/Core.cpp
+++ b/lib/ExecutionEngine/Orc/Core.cpp
@@ -1386,7 +1386,7 @@ void JITDylib::dump(raw_ostream &OS) {
     for (auto &KV : Symbols) {
       OS << "    \"" << *KV.first << "\": ";
       if (auto Addr = KV.second.getAddress())
-        OS << format("0x%016x", Addr);
+        OS << format("0x%016x", Addr) << ", " << KV.second.getFlags();
       else
         OS << "<not resolved>";
       if (KV.second.getFlags().isLazy() ||
@@ -1400,7 +1400,7 @@ void JITDylib::dump(raw_ostream &OS) {
         }
         if (KV.second.getFlags().isMaterializing())
           OS << " Materializing";
-        OS << " )\n";
+        OS << ", " << KV.second.getFlags() << " )\n";
       } else
         OS << "\n";
     }
-- 
GitLab


From 41fdb6ac056221e583fa52c82e54f67e58d91b53 Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Tue, 23 Oct 2018 01:36:32 +0000
Subject: [PATCH 0423/1116] [ORC] Show JITDylib search order in JITDylib::dump.

This can be helpful in debugging search-order related failures.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344994 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/ExecutionEngine/Orc/Core.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/lib/ExecutionEngine/Orc/Core.cpp b/lib/ExecutionEngine/Orc/Core.cpp
index 6c7a952c648..df4d0028a4a 100644
--- a/lib/ExecutionEngine/Orc/Core.cpp
+++ b/lib/ExecutionEngine/Orc/Core.cpp
@@ -1381,6 +1381,10 @@ void JITDylib::dump(raw_ostream &OS) {
     OS << "JITDylib \"" << JITDylibName
        << "\" (ES: " << format("0x%016x", reinterpret_cast<uintptr_t>(&ES))
        << "):\n"
+       << "Search order: [";
+    for (auto *JD : SearchOrder)
+      OS << " \"" << JD->getName() << "\"";
+    OS << " ]\n"
        << "Symbol table:\n";
 
     for (auto &KV : Symbols) {
-- 
GitLab


From c407a17bdd084ea008046378b90dfceaea2a3a75 Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Tue, 23 Oct 2018 01:36:33 +0000
Subject: [PATCH 0424/1116] [RuntimeDyld][COFF] Skip non-loaded sections when
 calculating ImageBase.

Non-loaded sections (whose unused load-address defaults to zero) should not
be taken into account when calculating ImageBase, or ImageBase will be
incorrectly set to 0.

Patch by Andrew Scheidecker. Thanks Andrew!

https://reviews.llvm.org/D51343

+        // The Sections list may contain sections that weren't loaded for
+        // whatever reason: they may be debug sections, and ProcessAllSections
+        // is false, or they may be sections that contain 0 bytes. If the
+        // section isn't loaded, the load address will be 0, and it should not
+        // be included in the ImageBase calculation.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344995 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../Targets/RuntimeDyldCOFFX86_64.h           |  8 +++++++-
 .../RuntimeDyld/X86/COFF_x86_64_IMGREL.s      |  6 +++---
 tools/llvm-rtdyld/llvm-rtdyld.cpp             | 19 ++++++++++++++-----
 3 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h
index 2d6e5c4aea6..39bdc4b6921 100644
--- a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h
+++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h
@@ -37,7 +37,13 @@ private:
     if (!ImageBase) {
       ImageBase = std::numeric_limits<uint64_t>::max();
       for (const SectionEntry &Section : Sections)
-        ImageBase = std::min(ImageBase, Section.getLoadAddress());
+        // The Sections list may contain sections that weren't loaded for
+        // whatever reason: they may be debug sections, and ProcessAllSections
+        // is false, or they may be sections that contain 0 bytes. If the
+        // section isn't loaded, the load address will be 0, and it should not
+        // be included in the ImageBase calculation.
+        if (Section.getLoadAddress() != 0)
+          ImageBase = std::min(ImageBase, Section.getLoadAddress());
     }
     return ImageBase;
   }
diff --git a/test/ExecutionEngine/RuntimeDyld/X86/COFF_x86_64_IMGREL.s b/test/ExecutionEngine/RuntimeDyld/X86/COFF_x86_64_IMGREL.s
index 8f7e2043218..ac097c44e5f 100644
--- a/test/ExecutionEngine/RuntimeDyld/X86/COFF_x86_64_IMGREL.s
+++ b/test/ExecutionEngine/RuntimeDyld/X86/COFF_x86_64_IMGREL.s
@@ -1,6 +1,6 @@
 # RUN: rm -rf %t && mkdir -p %t
 # RUN: llvm-mc -triple=x86_64-pc-win32 -filetype=obj -o %t/COFF_x86_64_IMGREL.o %s
-# RUN: llvm-rtdyld -triple=x86_64-pc-win32 -verify -check=%s %t/COFF_x86_64_IMGREL.o
+# RUN: llvm-rtdyld -triple=x86_64-pc-win32 -verify -target-addr-start=40960000000000 -check=%s %t/COFF_x86_64_IMGREL.o
 .text
 	.def	 F;
 	.scl	2;
@@ -18,9 +18,9 @@
 	.align	16, 0x90
 
 F:                                      # @F
-# rtdyld-check: decode_operand(inst1, 3) = section_addr(COFF_x86_64_IMGREL.o, .text)+0
+# rtdyld-check: decode_operand(inst1, 3) = section_addr(COFF_x86_64_IMGREL.o, .text)+0-40960000000000
 inst1:
     mov %ebx, F@IMGREL
-# rtdyld-check: decode_operand(inst2, 3) = section_addr(COFF_x86_64_IMGREL.o, .rdata)+5
+# rtdyld-check: decode_operand(inst2, 3) = section_addr(COFF_x86_64_IMGREL.o, .rdata)+5-40960000000000
 inst2:
     mov %ebx, (__constdata@imgrel+5)
diff --git a/tools/llvm-rtdyld/llvm-rtdyld.cpp b/tools/llvm-rtdyld/llvm-rtdyld.cpp
index 54db1ec113f..6ef28236574 100644
--- a/tools/llvm-rtdyld/llvm-rtdyld.cpp
+++ b/tools/llvm-rtdyld/llvm-rtdyld.cpp
@@ -88,25 +88,30 @@ CheckFiles("check",
            cl::desc("File containing RuntimeDyld verifier checks."),
            cl::ZeroOrMore);
 
-static cl::opt<uint64_t>
+// Tracking BUG: 19665
+// http://llvm.org/bugs/show_bug.cgi?id=19665
+//
+// Do not change these options to cl::opt<uint64_t> since this silently breaks
+// argument parsing.
+static cl::opt<unsigned long long>
 PreallocMemory("preallocate",
               cl::desc("Allocate memory upfront rather than on-demand"),
               cl::init(0));
 
-static cl::opt<uint64_t>
+static cl::opt<unsigned long long>
 TargetAddrStart("target-addr-start",
                 cl::desc("For -verify only: start of phony target address "
                          "range."),
                 cl::init(4096), // Start at "page 1" - no allocating at "null".
                 cl::Hidden);
 
-static cl::opt<uint64_t>
+static cl::opt<unsigned long long>
 TargetAddrEnd("target-addr-end",
               cl::desc("For -verify only: end of phony target address range."),
               cl::init(~0ULL),
               cl::Hidden);
 
-static cl::opt<uint64_t>
+static cl::opt<unsigned long long>
 TargetSectionSep("target-section-sep",
                  cl::desc("For -verify only: Separation between sections in "
                           "phony target address space."),
@@ -577,7 +582,11 @@ static void remapSectionsAndSymbols(const llvm::Triple &TargetTriple,
     if (LoadAddr &&
         *LoadAddr != static_cast<uint64_t>(
                        reinterpret_cast<uintptr_t>(Tmp->first))) {
-      AlreadyAllocated[*LoadAddr] = Tmp->second;
+      // A section will have a LoadAddr of 0 if it wasn't loaded for whatever
+      // reason (e.g. zero byte COFF sections). Don't include those sections in
+      // the allocation map.
+      if (*LoadAddr != 0)
+        AlreadyAllocated[*LoadAddr] = Tmp->second;
       Worklist.erase(Tmp);
     }
   }
-- 
GitLab


From 5eb2e5b8a8d712254b3c59cb3e2028a8263b23d2 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Tue, 23 Oct 2018 04:18:08 +0000
Subject: [PATCH 0425/1116] [X86] Regenerate test checks to show fma comments.
 NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344999 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/fma-fneg-combine.ll | 36 ++++++++++++++--------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/test/CodeGen/X86/fma-fneg-combine.ll b/test/CodeGen/X86/fma-fneg-combine.ll
index ce368744562..6a148397336 100644
--- a/test/CodeGen/X86/fma-fneg-combine.ll
+++ b/test/CodeGen/X86/fma-fneg-combine.ll
@@ -8,7 +8,7 @@
 define <16 x float> @test1(<16 x float> %a, <16 x float> %b, <16 x float> %c)  {
 ; CHECK-LABEL: test1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vfmsub213ps %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2
 ; CHECK-NEXT:    retq
 entry:
   %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
@@ -24,7 +24,7 @@ declare <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float>, <16 x fl
 define <16 x float> @test2(<16 x float> %a, <16 x float> %b, <16 x float> %c) {
 ; CHECK-LABEL: test2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vfnmsub213ps %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    vfnmsub213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
 ; CHECK-NEXT:    retq
 entry:
   %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i32 4) #2
@@ -35,7 +35,7 @@ entry:
 define <16 x float> @test3(<16 x float> %a, <16 x float> %b, <16 x float> %c)  {
 ; CHECK-LABEL: test3:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vfmsub213ps %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2
 ; CHECK-NEXT:    retq
 entry:
   %0 = tail call <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 -1, i32 4) #2
@@ -46,7 +46,7 @@ entry:
 define <16 x float> @test4(<16 x float> %a, <16 x float> %b, <16 x float> %c) {
 ; CHECK-LABEL: test4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vfmadd213ps %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
 ; CHECK-NEXT:    retq
 entry:
   %0 = tail call <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 -1, i32 4) #2
@@ -105,7 +105,7 @@ declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x f
 define <8 x double> @test9(<8 x double> %a, <8 x double> %b, <8 x double> %c) {
 ; CHECK-LABEL: test9:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vfnmsub213pd %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    vfnmsub213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
 ; CHECK-NEXT:    retq
 entry:
   %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i32 4) #2
@@ -118,7 +118,7 @@ declare <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %a, <8 x double
 define <2 x double> @test10(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
 ; CHECK-LABEL: test10:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0
+; CHECK-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
 ; CHECK-NEXT:    vxorpd {{.*}}(%rip), %xmm0, %xmm0
 ; CHECK-NEXT:    retq
 entry:
@@ -160,13 +160,13 @@ define <4 x float> @test11b(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 z
 ; SKX-LABEL: test11b:
 ; SKX:       # %bb.0: # %entry
 ; SKX-NEXT:    kmovd %edi, %k1
-; SKX-NEXT:    vfmsub213ss %xmm2, %xmm1, %xmm0 {%k1}
+; SKX-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
 ; SKX-NEXT:    retq
 ;
 ; KNL-LABEL: test11b:
 ; KNL:       # %bb.0: # %entry
 ; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    vfmsub213ss %xmm2, %xmm1, %xmm0 {%k1}
+; KNL-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
 ; KNL-NEXT:    retq
 entry:
   %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
@@ -180,14 +180,14 @@ define <8 x double> @test12(<8 x double> %a, <8 x double> %b, <8 x double> %c, i
 ; SKX-LABEL: test12:
 ; SKX:       # %bb.0: # %entry
 ; SKX-NEXT:    kmovd %edi, %k1
-; SKX-NEXT:    vfmadd132pd %zmm1, %zmm2, %zmm0 {%k1}
+; SKX-NEXT:    vfmadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm2
 ; SKX-NEXT:    vxorpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
 ; SKX-NEXT:    retq
 ;
 ; KNL-LABEL: test12:
 ; KNL:       # %bb.0: # %entry
 ; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    vfmadd132pd %zmm1, %zmm2, %zmm0 {%k1}
+; KNL-NEXT:    vfmadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm2
 ; KNL-NEXT:    vpxorq {{.*}}(%rip){1to8}, %zmm0, %zmm0
 ; KNL-NEXT:    retq
 entry:
@@ -297,13 +297,13 @@ define <8 x double> @test17(<8 x double> %a, <8 x double> %b, <8 x double> %c, i
 ; SKX-LABEL: test17:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    kmovd %edi, %k1
-; SKX-NEXT:    vfmsubadd132pd %zmm1, %zmm2, %zmm0 {%k1}
+; SKX-NEXT:    vfmsubadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) -/+ zmm2
 ; SKX-NEXT:    retq
 ;
 ; KNL-LABEL: test17:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    vfmsubadd132pd %zmm1, %zmm2, %zmm0 {%k1}
+; KNL-NEXT:    vfmsubadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) -/+ zmm2
 ; KNL-NEXT:    retq
   %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %c
   %res = call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %sub.i, i32 4)
@@ -317,13 +317,13 @@ define <4 x float> @test18(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 ze
 ; SKX-LABEL: test18:
 ; SKX:       # %bb.0: # %entry
 ; SKX-NEXT:    kmovd %edi, %k1
-; SKX-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0 {%k1}
+; SKX-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
 ; SKX-NEXT:    retq
 ;
 ; KNL-LABEL: test18:
 ; KNL:       # %bb.0: # %entry
 ; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0 {%k1}
+; KNL-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
 ; KNL-NEXT:    retq
 entry:
   %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
@@ -335,13 +335,13 @@ define <4 x float> @test19(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 ze
 ; SKX-LABEL: test19:
 ; SKX:       # %bb.0: # %entry
 ; SKX-NEXT:    kmovd %edi, %k1
-; SKX-NEXT:    vfnmsub213ss %xmm2, %xmm1, %xmm0 {%k1}
+; SKX-NEXT:    vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
 ; SKX-NEXT:    retq
 ;
 ; KNL-LABEL: test19:
 ; KNL:       # %bb.0: # %entry
 ; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    vfnmsub213ss %xmm2, %xmm1, %xmm0 {%k1}
+; KNL-NEXT:    vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
 ; KNL-NEXT:    retq
 entry:
   %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
@@ -354,14 +354,14 @@ define <4 x float> @test20(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 ze
 ; SKX-LABEL: test20:
 ; SKX:       # %bb.0: # %entry
 ; SKX-NEXT:    kmovd %edi, %k1
-; SKX-NEXT:    vfnmadd231ss %xmm1, %xmm0, %xmm2 {%k1}
+; SKX-NEXT:    vfnmadd231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
 ; SKX-NEXT:    vmovaps %xmm2, %xmm0
 ; SKX-NEXT:    retq
 ;
 ; KNL-LABEL: test20:
 ; KNL:       # %bb.0: # %entry
 ; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    vfnmadd231ss %xmm1, %xmm0, %xmm2 {%k1}
+; KNL-NEXT:    vfnmadd231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
 ; KNL-NEXT:    vmovaps %xmm2, %xmm0
 ; KNL-NEXT:    retq
 entry:
-- 
GitLab


From 2cf3038ef72546910cf9ac1c6f21cc5ad85e5453 Mon Sep 17 00:00:00 2001
From: Sylvestre Ledru <sylvestre@debian.org>
Date: Tue, 23 Oct 2018 07:13:47 +0000
Subject: [PATCH 0426/1116] Add support for GNU Hurd in Path.inc and other
 places

Summary: Patch by Svante Signell & myself

Reviewers: rnk, JDevlieghere, efriedma

Reviewed By: efriedma

Subscribers: efriedma, JDevlieghere, krytarowski, llvm-commits, kristina

Differential Revision: https://reviews.llvm.org/D53409

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345007 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Support/Unix/Path.inc | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/lib/Support/Unix/Path.inc b/lib/Support/Unix/Path.inc
index ec3eecb2947..0f61e94145e 100644
--- a/lib/Support/Unix/Path.inc
+++ b/lib/Support/Unix/Path.inc
@@ -49,6 +49,7 @@
 // For GNU Hurd
 #if defined(__GNU__) && !defined(PATH_MAX)
 # define PATH_MAX 4096
+# define MAXPATHLEN 4096
 #endif
 
 #include <sys/types.h>
@@ -82,7 +83,7 @@
 #define STATVFS_F_FRSIZE(vfs) static_cast<uint64_t>(vfs.f_bsize)
 #endif
 
-#if defined(__NetBSD__)
+#if defined(__NetBSD__) || defined(__GNU__)
 #define STATVFS_F_FLAG(vfs) (vfs).f_flag
 #else
 #define STATVFS_F_FLAG(vfs) (vfs).f_flags
@@ -98,7 +99,7 @@ const file_t kInvalidFile = -1;
 
 #if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) ||     \
     defined(__minix) || defined(__FreeBSD_kernel__) || defined(__linux__) ||   \
-    defined(__CYGWIN__) || defined(__DragonFly__) || defined(_AIX)
+    defined(__CYGWIN__) || defined(__DragonFly__) || defined(_AIX) || defined(__GNU__)
 static int
 test_dir(char ret[PATH_MAX], const char *dir, const char *bin)
 {
@@ -347,7 +348,7 @@ std::error_code remove(const Twine &path, bool IgnoreNonExisting) {
 }
 
 static bool is_local_impl(struct STATVFS &Vfs) {
-#if defined(__linux__)
+#if defined(__linux__) || defined(__GNU__)
 #ifndef NFS_SUPER_MAGIC
 #define NFS_SUPER_MAGIC 0x6969
 #endif
@@ -357,7 +358,11 @@ static bool is_local_impl(struct STATVFS &Vfs) {
 #ifndef CIFS_MAGIC_NUMBER
 #define CIFS_MAGIC_NUMBER 0xFF534D42
 #endif
+#ifdef __GNU__
+  switch ((uint32_t)Vfs.__f_type) {
+#else
   switch ((uint32_t)Vfs.f_type) {
+#endif
   case NFS_SUPER_MAGIC:
   case SMB_SUPER_MAGIC:
   case CIFS_MAGIC_NUMBER:
-- 
GitLab


From 40760b733d9eef841c897338af5e9d81b12551bf Mon Sep 17 00:00:00 2001
From: Lama Saba <lama.saba@intel.com>
Date: Tue, 23 Oct 2018 07:58:41 +0000
Subject: [PATCH 0427/1116] [MachinePipeliner] Split MachinePipeliner code into
 header and cpp files

Split MachinePipeliner code into header and cpp files to allow inheritance from SwingSchedulerDAG

Differential Revision: https://reviews.llvm.org/D53477

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345008 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/MachinePipeliner.h | 614 ++++++++++++++++++++++++
 lib/CodeGen/MachinePipeliner.cpp        | 605 +----------------------
 2 files changed, 621 insertions(+), 598 deletions(-)
 create mode 100644 include/llvm/CodeGen/MachinePipeliner.h

diff --git a/include/llvm/CodeGen/MachinePipeliner.h b/include/llvm/CodeGen/MachinePipeliner.h
new file mode 100644
index 00000000000..2eb63adc7a3
--- /dev/null
+++ b/include/llvm/CodeGen/MachinePipeliner.h
@@ -0,0 +1,614 @@
+//===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
+//
+// Software pipelining (SWP) is an instruction scheduling technique for loops
+// that overlap loop iterations and exploits ILP via a compiler transformation.
+//
+// Swing Modulo Scheduling is an implementation of software pipelining
+// that generates schedules that are near optimal in terms of initiation
+// interval, register requirements, and stage count. See the papers:
+//
+// "Swing Modulo Scheduling: A Lifetime-Sensitive Approach", by J. Llosa,
+// A. Gonzalez, E. Ayguade, and M. Valero. In PACT '96 Proceedings of the 1996
+// Conference on Parallel Architectures and Compilation Techiniques.
+//
+// "Lifetime-Sensitive Modulo Scheduling in a Production Environment", by J.
+// Llosa, E. Ayguade, A. Gonzalez, M. Valero, and J. Eckhardt. In IEEE
+// Transactions on Computers, Vol. 50, No. 3, 2001.
+//
+// "An Implementation of Swing Modulo Scheduling With Extensions for
+// Superblocks", by T. Lattner, Master's Thesis, University of Illinois at
+// Urbana-Chambpain, 2005.
+//
+//
+// The SMS algorithm consists of three main steps after computing the minimal
+// initiation interval (MII).
+// 1) Analyze the dependence graph and compute information about each
+//    instruction in the graph.
+// 2) Order the nodes (instructions) by priority based upon the heuristics
+//    described in the algorithm.
+// 3) Attempt to schedule the nodes in the specified order using the MII.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIB_CODEGEN_MACHINEPIPELINER_H
+#define LLVM_LIB_CODEGEN_MACHINEPIPELINER_H
+
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+
+namespace llvm {
+
+class NodeSet;
+class SMSchedule;
+
+extern cl::opt<bool> SwpEnableCopyToPhi;
+
+/// The main class in the implementation of the target independent
+/// software pipeliner pass.
+class MachinePipeliner : public MachineFunctionPass {
+public:
+  MachineFunction *MF = nullptr;
+  const MachineLoopInfo *MLI = nullptr;
+  const MachineDominatorTree *MDT = nullptr;
+  const InstrItineraryData *InstrItins;
+  const TargetInstrInfo *TII = nullptr;
+  RegisterClassInfo RegClassInfo;
+
+#ifndef NDEBUG
+  static int NumTries;
+#endif
+
+  /// Cache the target analysis information about the loop.
+  struct LoopInfo {
+    MachineBasicBlock *TBB = nullptr;
+    MachineBasicBlock *FBB = nullptr;
+    SmallVector<MachineOperand, 4> BrCond;
+    MachineInstr *LoopInductionVar = nullptr;
+    MachineInstr *LoopCompare = nullptr;
+  };
+  LoopInfo LI;
+
+  static char ID;
+
+  MachinePipeliner() : MachineFunctionPass(ID) {
+    initializeMachinePipelinerPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AAResultsWrapperPass>();
+    AU.addPreserved<AAResultsWrapperPass>();
+    AU.addRequired<MachineLoopInfo>();
+    AU.addRequired<MachineDominatorTree>();
+    AU.addRequired<LiveIntervals>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+private:
+  void preprocessPhiNodes(MachineBasicBlock &B);
+  bool canPipelineLoop(MachineLoop &L);
+  bool scheduleLoop(MachineLoop &L);
+  bool swingModuloScheduler(MachineLoop &L);
+};
+
+/// This class builds the dependence graph for the instructions in a loop,
+/// and attempts to schedule the instructions using the SMS algorithm.
+class SwingSchedulerDAG : public ScheduleDAGInstrs {
+  MachinePipeliner &Pass;
+  /// The minimum initiation interval between iterations for this schedule.
+  unsigned MII = 0;
+  /// Set to true if a valid pipelined schedule is found for the loop.
+  bool Scheduled = false;
+  MachineLoop &Loop;
+  LiveIntervals &LIS;
+  const RegisterClassInfo &RegClassInfo;
+
+  /// A toplogical ordering of the SUnits, which is needed for changing
+  /// dependences and iterating over the SUnits.
+  ScheduleDAGTopologicalSort Topo;
+
+  struct NodeInfo {
+    int ASAP = 0;
+    int ALAP = 0;
+    int ZeroLatencyDepth = 0;
+    int ZeroLatencyHeight = 0;
+
+    NodeInfo() = default;
+  };
+  /// Computed properties for each node in the graph.
+  std::vector<NodeInfo> ScheduleInfo;
+
+  enum OrderKind { BottomUp = 0, TopDown = 1 };
+  /// Computed node ordering for scheduling.
+  SetVector<SUnit *> NodeOrder;
+
+  using NodeSetType = SmallVector<NodeSet, 8>;
+  using ValueMapTy = DenseMap<unsigned, unsigned>;
+  using MBBVectorTy = SmallVectorImpl<MachineBasicBlock *>;
+  using InstrMapTy = DenseMap<MachineInstr *, MachineInstr *>;
+
+  /// Instructions to change when emitting the final schedule.
+  DenseMap<SUnit *, std::pair<unsigned, int64_t>> InstrChanges;
+
+  /// We may create a new instruction, so remember it because it
+  /// must be deleted when the pass is finished.
+  SmallPtrSet<MachineInstr *, 4> NewMIs;
+
+  /// Ordered list of DAG postprocessing steps.
+  std::vector<std::unique_ptr<ScheduleDAGMutation>> Mutations;
+
+  /// Helper class to implement Johnson's circuit finding algorithm.
+  class Circuits {
+    std::vector<SUnit> &SUnits;
+    SetVector<SUnit *> Stack;
+    BitVector Blocked;
+    SmallVector<SmallPtrSet<SUnit *, 4>, 10> B;
+    SmallVector<SmallVector<int, 4>, 16> AdjK;
+    // Node to Index from ScheduleDAGTopologicalSort
+    std::vector<int> *Node2Idx;
+    unsigned NumPaths;
+    static unsigned MaxPaths;
+
+  public:
+    Circuits(std::vector<SUnit> &SUs, ScheduleDAGTopologicalSort &Topo)
+        : SUnits(SUs), Blocked(SUs.size()), B(SUs.size()), AdjK(SUs.size()) {
+      Node2Idx = new std::vector<int>(SUs.size());
+      unsigned Idx = 0;
+      for (const auto &NodeNum : Topo)
+        Node2Idx->at(NodeNum) = Idx++;
+    }
+
+    ~Circuits() { delete Node2Idx; }
+
+    /// Reset the data structures used in the circuit algorithm.
+    void reset() {
+      Stack.clear();
+      Blocked.reset();
+      B.assign(SUnits.size(), SmallPtrSet<SUnit *, 4>());
+      NumPaths = 0;
+    }
+
+    void createAdjacencyStructure(SwingSchedulerDAG *DAG);
+    bool circuit(int V, int S, NodeSetType &NodeSets, bool HasBackedge = false);
+    void unblock(int U);
+  };
+
+  struct CopyToPhiMutation : public ScheduleDAGMutation {
+    void apply(ScheduleDAGInstrs *DAG) override;
+  };
+
+public:
+  SwingSchedulerDAG(MachinePipeliner &P, MachineLoop &L, LiveIntervals &lis,
+                    const RegisterClassInfo &rci)
+      : ScheduleDAGInstrs(*P.MF, P.MLI, false), Pass(P), Loop(L), LIS(lis),
+        RegClassInfo(rci), Topo(SUnits, &ExitSU) {
+    P.MF->getSubtarget().getSMSMutations(Mutations);
+    if (SwpEnableCopyToPhi)
+      Mutations.push_back(llvm::make_unique<CopyToPhiMutation>());
+  }
+
+  void schedule() override;
+  void finishBlock() override;
+
+  /// Return true if the loop kernel has been scheduled.
+  bool hasNewSchedule() { return Scheduled; }
+
+  /// Return the earliest time an instruction may be scheduled.
+  int getASAP(SUnit *Node) { return ScheduleInfo[Node->NodeNum].ASAP; }
+
+  /// Return the latest time an instruction my be scheduled.
+  int getALAP(SUnit *Node) { return ScheduleInfo[Node->NodeNum].ALAP; }
+
+  /// The mobility function, which the number of slots in which
+  /// an instruction may be scheduled.
+  int getMOV(SUnit *Node) { return getALAP(Node) - getASAP(Node); }
+
+  /// The depth, in the dependence graph, for a node.
+  unsigned getDepth(SUnit *Node) { return Node->getDepth(); }
+
+  /// The maximum unweighted length of a path from an arbitrary node to the
+  /// given node in which each edge has latency 0
+  int getZeroLatencyDepth(SUnit *Node) {
+    return ScheduleInfo[Node->NodeNum].ZeroLatencyDepth;
+  }
+
+  /// The height, in the dependence graph, for a node.
+  unsigned getHeight(SUnit *Node) { return Node->getHeight(); }
+
+  /// The maximum unweighted length of a path from the given node to an
+  /// arbitrary node in which each edge has latency 0
+  int getZeroLatencyHeight(SUnit *Node) {
+    return ScheduleInfo[Node->NodeNum].ZeroLatencyHeight;
+  }
+
+  /// Return true if the dependence is a back-edge in the data dependence graph.
+  /// Since the DAG doesn't contain cycles, we represent a cycle in the graph
+  /// using an anti dependence from a Phi to an instruction.
+  bool isBackedge(SUnit *Source, const SDep &Dep) {
+    if (Dep.getKind() != SDep::Anti)
+      return false;
+    return Source->getInstr()->isPHI() || Dep.getSUnit()->getInstr()->isPHI();
+  }
+
+  bool isLoopCarriedDep(SUnit *Source, const SDep &Dep, bool isSucc = true);
+
+  /// The distance function, which indicates that operation V of iteration I
+  /// depends on operations U of iteration I-distance.
+  unsigned getDistance(SUnit *U, SUnit *V, const SDep &Dep) {
+    // Instructions that feed a Phi have a distance of 1. Computing larger
+    // values for arrays requires data dependence information.
+    if (V->getInstr()->isPHI() && Dep.getKind() == SDep::Anti)
+      return 1;
+    return 0;
+  }
+
+  /// Set the Minimum Initiation Interval for this schedule attempt.
+  void setMII(unsigned mii) { MII = mii; }
+
+  void applyInstrChange(MachineInstr *MI, SMSchedule &Schedule);
+
+  void fixupRegisterOverlaps(std::deque<SUnit *> &Instrs);
+
+  /// Return the new base register that was stored away for the changed
+  /// instruction.
+  unsigned getInstrBaseReg(SUnit *SU) {
+    DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
+        InstrChanges.find(SU);
+    if (It != InstrChanges.end())
+      return It->second.first;
+    return 0;
+  }
+
+  void addMutation(std::unique_ptr<ScheduleDAGMutation> Mutation) {
+    Mutations.push_back(std::move(Mutation));
+  }
+
+  static bool classof(const ScheduleDAGInstrs *DAG) { return true; }
+
+private:
+  void addLoopCarriedDependences(AliasAnalysis *AA);
+  void updatePhiDependences();
+  void changeDependences();
+  unsigned calculateResMII();
+  unsigned calculateRecMII(NodeSetType &RecNodeSets);
+  void findCircuits(NodeSetType &NodeSets);
+  void fuseRecs(NodeSetType &NodeSets);
+  void removeDuplicateNodes(NodeSetType &NodeSets);
+  void computeNodeFunctions(NodeSetType &NodeSets);
+  void registerPressureFilter(NodeSetType &NodeSets);
+  void colocateNodeSets(NodeSetType &NodeSets);
+  void checkNodeSets(NodeSetType &NodeSets);
+  void groupRemainingNodes(NodeSetType &NodeSets);
+  void addConnectedNodes(SUnit *SU, NodeSet &NewSet,
+                         SetVector<SUnit *> &NodesAdded);
+  void computeNodeOrder(NodeSetType &NodeSets);
+  void checkValidNodeOrder(const NodeSetType &Circuits) const;
+  bool schedulePipeline(SMSchedule &Schedule);
+  void generatePipelinedLoop(SMSchedule &Schedule);
+  void generateProlog(SMSchedule &Schedule, unsigned LastStage,
+                      MachineBasicBlock *KernelBB, ValueMapTy *VRMap,
+                      MBBVectorTy &PrologBBs);
+  void generateEpilog(SMSchedule &Schedule, unsigned LastStage,
+                      MachineBasicBlock *KernelBB, ValueMapTy *VRMap,
+                      MBBVectorTy &EpilogBBs, MBBVectorTy &PrologBBs);
+  void generateExistingPhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
+                            MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
+                            SMSchedule &Schedule, ValueMapTy *VRMap,
+                            InstrMapTy &InstrMap, unsigned LastStageNum,
+                            unsigned CurStageNum, bool IsLast);
+  void generatePhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
+                    MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
+                    SMSchedule &Schedule, ValueMapTy *VRMap,
+                    InstrMapTy &InstrMap, unsigned LastStageNum,
+                    unsigned CurStageNum, bool IsLast);
+  void removeDeadInstructions(MachineBasicBlock *KernelBB,
+                              MBBVectorTy &EpilogBBs);
+  void splitLifetimes(MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs,
+                      SMSchedule &Schedule);
+  void addBranches(MBBVectorTy &PrologBBs, MachineBasicBlock *KernelBB,
+                   MBBVectorTy &EpilogBBs, SMSchedule &Schedule,
+                   ValueMapTy *VRMap);
+  bool computeDelta(MachineInstr &MI, unsigned &Delta);
+  void updateMemOperands(MachineInstr &NewMI, MachineInstr &OldMI,
+                         unsigned Num);
+  MachineInstr *cloneInstr(MachineInstr *OldMI, unsigned CurStageNum,
+                           unsigned InstStageNum);
+  MachineInstr *cloneAndChangeInstr(MachineInstr *OldMI, unsigned CurStageNum,
+                                    unsigned InstStageNum,
+                                    SMSchedule &Schedule);
+  void updateInstruction(MachineInstr *NewMI, bool LastDef,
+                         unsigned CurStageNum, unsigned InstrStageNum,
+                         SMSchedule &Schedule, ValueMapTy *VRMap);
+  MachineInstr *findDefInLoop(unsigned Reg);
+  unsigned getPrevMapVal(unsigned StageNum, unsigned PhiStage, unsigned LoopVal,
+                         unsigned LoopStage, ValueMapTy *VRMap,
+                         MachineBasicBlock *BB);
+  void rewritePhiValues(MachineBasicBlock *NewBB, unsigned StageNum,
+                        SMSchedule &Schedule, ValueMapTy *VRMap,
+                        InstrMapTy &InstrMap);
+  void rewriteScheduledInstr(MachineBasicBlock *BB, SMSchedule &Schedule,
+                             InstrMapTy &InstrMap, unsigned CurStageNum,
+                             unsigned PhiNum, MachineInstr *Phi,
+                             unsigned OldReg, unsigned NewReg,
+                             unsigned PrevReg = 0);
+  bool canUseLastOffsetValue(MachineInstr *MI, unsigned &BasePos,
+                             unsigned &OffsetPos, unsigned &NewBase,
+                             int64_t &NewOffset);
+  void postprocessDAG();
+};
+
+/// A NodeSet contains a set of SUnit DAG nodes with additional information
+/// that assigns a priority to the set.
+class NodeSet {
+  SetVector<SUnit *> Nodes;
+  bool HasRecurrence = false;
+  unsigned RecMII = 0;
+  int MaxMOV = 0;
+  unsigned MaxDepth = 0;
+  unsigned Colocate = 0;
+  SUnit *ExceedPressure = nullptr;
+  unsigned Latency = 0;
+
+public:
+  using iterator = SetVector<SUnit *>::const_iterator;
+
+  NodeSet() = default;
+  NodeSet(iterator S, iterator E) : Nodes(S, E), HasRecurrence(true) {
+    Latency = 0;
+    for (unsigned i = 0, e = Nodes.size(); i < e; ++i)
+      for (const SDep &Succ : Nodes[i]->Succs)
+        if (Nodes.count(Succ.getSUnit()))
+          Latency += Succ.getLatency();
+  }
+
+  bool insert(SUnit *SU) { return Nodes.insert(SU); }
+
+  void insert(iterator S, iterator E) { Nodes.insert(S, E); }
+
+  template <typename UnaryPredicate> bool remove_if(UnaryPredicate P) {
+    return Nodes.remove_if(P);
+  }
+
+  unsigned count(SUnit *SU) const { return Nodes.count(SU); }
+
+  bool hasRecurrence() { return HasRecurrence; };
+
+  unsigned size() const { return Nodes.size(); }
+
+  bool empty() const { return Nodes.empty(); }
+
+  SUnit *getNode(unsigned i) const { return Nodes[i]; };
+
+  void setRecMII(unsigned mii) { RecMII = mii; };
+
+  void setColocate(unsigned c) { Colocate = c; };
+
+  void setExceedPressure(SUnit *SU) { ExceedPressure = SU; }
+
+  bool isExceedSU(SUnit *SU) { return ExceedPressure == SU; }
+
+  int compareRecMII(NodeSet &RHS) { return RecMII - RHS.RecMII; }
+
+  int getRecMII() { return RecMII; }
+
+  /// Summarize node functions for the entire node set.
+  void computeNodeSetInfo(SwingSchedulerDAG *SSD) {
+    for (SUnit *SU : *this) {
+      MaxMOV = std::max(MaxMOV, SSD->getMOV(SU));
+      MaxDepth = std::max(MaxDepth, SSD->getDepth(SU));
+    }
+  }
+
+  unsigned getLatency() { return Latency; }
+
+  unsigned getMaxDepth() { return MaxDepth; }
+
+  void clear() {
+    Nodes.clear();
+    RecMII = 0;
+    HasRecurrence = false;
+    MaxMOV = 0;
+    MaxDepth = 0;
+    Colocate = 0;
+    ExceedPressure = nullptr;
+  }
+
+  operator SetVector<SUnit *> &() { return Nodes; }
+
+  /// Sort the node sets by importance. First, rank them by recurrence MII,
+  /// then by mobility (least mobile done first), and finally by depth.
+  /// Each node set may contain a colocate value which is used as the first
+  /// tie breaker, if it's set.
+  bool operator>(const NodeSet &RHS) const {
+    if (RecMII == RHS.RecMII) {
+      if (Colocate != 0 && RHS.Colocate != 0 && Colocate != RHS.Colocate)
+        return Colocate < RHS.Colocate;
+      if (MaxMOV == RHS.MaxMOV)
+        return MaxDepth > RHS.MaxDepth;
+      return MaxMOV < RHS.MaxMOV;
+    }
+    return RecMII > RHS.RecMII;
+  }
+
+  bool operator==(const NodeSet &RHS) const {
+    return RecMII == RHS.RecMII && MaxMOV == RHS.MaxMOV &&
+           MaxDepth == RHS.MaxDepth;
+  }
+
+  bool operator!=(const NodeSet &RHS) const { return !operator==(RHS); }
+
+  iterator begin() { return Nodes.begin(); }
+  iterator end() { return Nodes.end(); }
+
+  void print(raw_ostream &os) const {
+    os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
+       << " depth " << MaxDepth << " col " << Colocate << "\n";
+    for (const auto &I : Nodes)
+      os << "   SU(" << I->NodeNum << ") " << *(I->getInstr());
+    os << "\n";
+  }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
+#endif
+};
+
+/// This class represents the scheduled code.  The main data structure is a
+/// map from scheduled cycle to instructions.  During scheduling, the
+/// data structure explicitly represents all stages/iterations.   When
+/// the algorithm finshes, the schedule is collapsed into a single stage,
+/// which represents instructions from different loop iterations.
+///
+/// The SMS algorithm allows negative values for cycles, so the first cycle
+/// in the schedule is the smallest cycle value.
+class SMSchedule {
+private:
+  /// Map from execution cycle to instructions.
+  DenseMap<int, std::deque<SUnit *>> ScheduledInstrs;
+
+  /// Map from instruction to execution cycle.
+  std::map<SUnit *, int> InstrToCycle;
+
+  /// Map for each register and the max difference between its uses and def.
+  /// The first element in the pair is the max difference in stages. The
+  /// second is true if the register defines a Phi value and loop value is
+  /// scheduled before the Phi.
+  std::map<unsigned, std::pair<unsigned, bool>> RegToStageDiff;
+
+  /// Keep track of the first cycle value in the schedule.  It starts
+  /// as zero, but the algorithm allows negative values.
+  int FirstCycle = 0;
+
+  /// Keep track of the last cycle value in the schedule.
+  int LastCycle = 0;
+
+  /// The initiation interval (II) for the schedule.
+  int InitiationInterval = 0;
+
+  /// Target machine information.
+  const TargetSubtargetInfo &ST;
+
+  /// Virtual register information.
+  MachineRegisterInfo &MRI;
+
+  std::unique_ptr<DFAPacketizer> Resources;
+
+public:
+  SMSchedule(MachineFunction *mf)
+      : ST(mf->getSubtarget()), MRI(mf->getRegInfo()),
+        Resources(ST.getInstrInfo()->CreateTargetScheduleState(ST)) {}
+
+  void reset() {
+    ScheduledInstrs.clear();
+    InstrToCycle.clear();
+    RegToStageDiff.clear();
+    FirstCycle = 0;
+    LastCycle = 0;
+    InitiationInterval = 0;
+  }
+
+  /// Set the initiation interval for this schedule.
+  void setInitiationInterval(int ii) { InitiationInterval = ii; }
+
+  /// Return the first cycle in the completed schedule.  This
+  /// can be a negative value.
+  int getFirstCycle() const { return FirstCycle; }
+
+  /// Return the last cycle in the finalized schedule.
+  int getFinalCycle() const { return FirstCycle + InitiationInterval - 1; }
+
+  /// Return the cycle of the earliest scheduled instruction in the dependence
+  /// chain.
+  int earliestCycleInChain(const SDep &Dep);
+
+  /// Return the cycle of the latest scheduled instruction in the dependence
+  /// chain.
+  int latestCycleInChain(const SDep &Dep);
+
+  void computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
+                    int *MinEnd, int *MaxStart, int II, SwingSchedulerDAG *DAG);
+  bool insert(SUnit *SU, int StartCycle, int EndCycle, int II);
+
+  /// Iterators for the cycle to instruction map.
+  using sched_iterator = DenseMap<int, std::deque<SUnit *>>::iterator;
+  using const_sched_iterator =
+      DenseMap<int, std::deque<SUnit *>>::const_iterator;
+
+  /// Return true if the instruction is scheduled at the specified stage.
+  bool isScheduledAtStage(SUnit *SU, unsigned StageNum) {
+    return (stageScheduled(SU) == (int)StageNum);
+  }
+
+  /// Return the stage for a scheduled instruction.  Return -1 if
+  /// the instruction has not been scheduled.
+  int stageScheduled(SUnit *SU) const {
+    std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SU);
+    if (it == InstrToCycle.end())
+      return -1;
+    return (it->second - FirstCycle) / InitiationInterval;
+  }
+
+  /// Return the cycle for a scheduled instruction. This function normalizes
+  /// the first cycle to be 0.
+  unsigned cycleScheduled(SUnit *SU) const {
+    std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SU);
+    assert(it != InstrToCycle.end() && "Instruction hasn't been scheduled.");
+    return (it->second - FirstCycle) % InitiationInterval;
+  }
+
+  /// Return the maximum stage count needed for this schedule.
+  unsigned getMaxStageCount() {
+    return (LastCycle - FirstCycle) / InitiationInterval;
+  }
+
+  /// Return the max. number of stages/iterations that can occur between a
+  /// register definition and its uses.
+  unsigned getStagesForReg(int Reg, unsigned CurStage) {
+    std::pair<unsigned, bool> Stages = RegToStageDiff[Reg];
+    if (CurStage > getMaxStageCount() && Stages.first == 0 && Stages.second)
+      return 1;
+    return Stages.first;
+  }
+
+  /// The number of stages for a Phi is a little different than other
+  /// instructions. The minimum value computed in RegToStageDiff is 1
+  /// because we assume the Phi is needed for at least 1 iteration.
+  /// This is not the case if the loop value is scheduled prior to the
+  /// Phi in the same stage.  This function returns the number of stages
+  /// or iterations needed between the Phi definition and any uses.
+  unsigned getStagesForPhi(int Reg) {
+    std::pair<unsigned, bool> Stages = RegToStageDiff[Reg];
+    if (Stages.second)
+      return Stages.first;
+    return Stages.first - 1;
+  }
+
+  /// Return the instructions that are scheduled at the specified cycle.
+  std::deque<SUnit *> &getInstructions(int cycle) {
+    return ScheduledInstrs[cycle];
+  }
+
+  bool isValidSchedule(SwingSchedulerDAG *SSD);
+  void finalizeSchedule(SwingSchedulerDAG *SSD);
+  void orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
+                       std::deque<SUnit *> &Insts);
+  bool isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi);
+  bool isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD, MachineInstr *Def,
+                             MachineOperand &MO);
+  void print(raw_ostream &os) const;
+  void dump() const;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_CODEGEN_MACHINEPIPELINER_H
diff --git a/lib/CodeGen/MachinePipeliner.cpp b/lib/CodeGen/MachinePipeliner.cpp
index a341aac227a..65805d12cac 100644
--- a/lib/CodeGen/MachinePipeliner.cpp
+++ b/lib/CodeGen/MachinePipeliner.cpp
@@ -9,34 +9,6 @@
 //
 // An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
 //
-// Software pipelining (SWP) is an instruction scheduling technique for loops
-// that overlap loop iterations and exploits ILP via a compiler transformation.
-//
-// Swing Modulo Scheduling is an implementation of software pipelining
-// that generates schedules that are near optimal in terms of initiation
-// interval, register requirements, and stage count. See the papers:
-//
-// "Swing Modulo Scheduling: A Lifetime-Sensitive Approach", by J. Llosa,
-// A. Gonzalez, E. Ayguade, and M. Valero. In PACT '96 Proceedings of the 1996
-// Conference on Parallel Architectures and Compilation Techiniques.
-//
-// "Lifetime-Sensitive Modulo Scheduling in a Production Environment", by J.
-// Llosa, E. Ayguade, A. Gonzalez, M. Valero, and J. Eckhardt. In IEEE
-// Transactions on Computers, Vol. 50, No. 3, 2001.
-//
-// "An Implementation of Swing Modulo Scheduling With Extensions for
-// Superblocks", by T. Lattner, Master's Thesis, University of Illinois at
-// Urbana-Chambpain, 2005.
-//
-//
-// The SMS algorithm consists of three main steps after computing the minimal
-// initiation interval (MII).
-// 1) Analyze the dependence graph and compute information about each
-//    instruction in the graph.
-// 2) Order the nodes (instructions) by priority based upon the heuristics
-//    described in the algorithm.
-// 3) Attempt to schedule the nodes in the specified order using the MII.
-//
 // This SMS implementation is a target-independent back-end pass. When enabled,
 // the pass runs just prior to the register allocation pass, while the machine
 // IR is in SSA form. If software pipelining is successful, then the original
@@ -83,13 +55,11 @@
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachinePipeliner.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/RegisterClassInfo.h"
 #include "llvm/CodeGen/RegisterPressure.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
-#include "llvm/CodeGen/ScheduleDAGInstrs.h"
 #include "llvm/CodeGen/ScheduleDAGMutation.h"
-#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetOpcodes.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
@@ -172,575 +142,14 @@ static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
                                      cl::ReallyHidden, cl::init(false),
                                      cl::ZeroOrMore, cl::desc("Ignore RecMII"));
 
-// A command line option to enable the CopyToPhi DAG mutation.
-static cl::opt<bool>
-    SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden,
-                       cl::init(true), cl::ZeroOrMore,
-                       cl::desc("Enable CopyToPhi DAG Mutation"));
-
-namespace {
-
-class NodeSet;
-class SMSchedule;
-
-/// The main class in the implementation of the target independent
-/// software pipeliner pass.
-class MachinePipeliner : public MachineFunctionPass {
-public:
-  MachineFunction *MF = nullptr;
-  const MachineLoopInfo *MLI = nullptr;
-  const MachineDominatorTree *MDT = nullptr;
-  const InstrItineraryData *InstrItins;
-  const TargetInstrInfo *TII = nullptr;
-  RegisterClassInfo RegClassInfo;
-
-#ifndef NDEBUG
-  static int NumTries;
-#endif
-
-  /// Cache the target analysis information about the loop.
-  struct LoopInfo {
-    MachineBasicBlock *TBB = nullptr;
-    MachineBasicBlock *FBB = nullptr;
-    SmallVector<MachineOperand, 4> BrCond;
-    MachineInstr *LoopInductionVar = nullptr;
-    MachineInstr *LoopCompare = nullptr;
-  };
-  LoopInfo LI;
-
-  static char ID;
-
-  MachinePipeliner() : MachineFunctionPass(ID) {
-    initializeMachinePipelinerPass(*PassRegistry::getPassRegistry());
-  }
+namespace llvm {
 
-  bool runOnMachineFunction(MachineFunction &MF) override;
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<AAResultsWrapperPass>();
-    AU.addPreserved<AAResultsWrapperPass>();
-    AU.addRequired<MachineLoopInfo>();
-    AU.addRequired<MachineDominatorTree>();
-    AU.addRequired<LiveIntervals>();
-    MachineFunctionPass::getAnalysisUsage(AU);
-  }
-
-private:
-  void preprocessPhiNodes(MachineBasicBlock &B);
-  bool canPipelineLoop(MachineLoop &L);
-  bool scheduleLoop(MachineLoop &L);
-  bool swingModuloScheduler(MachineLoop &L);
-};
-
-/// This class builds the dependence graph for the instructions in a loop,
-/// and attempts to schedule the instructions using the SMS algorithm.
-class SwingSchedulerDAG : public ScheduleDAGInstrs {
-  MachinePipeliner &Pass;
-  /// The minimum initiation interval between iterations for this schedule.
-  unsigned MII = 0;
-  /// Set to true if a valid pipelined schedule is found for the loop.
-  bool Scheduled = false;
-  MachineLoop &Loop;
-  LiveIntervals &LIS;
-  const RegisterClassInfo &RegClassInfo;
-
-  /// A toplogical ordering of the SUnits, which is needed for changing
-  /// dependences and iterating over the SUnits.
-  ScheduleDAGTopologicalSort Topo;
-
-  struct NodeInfo {
-    int ASAP = 0;
-    int ALAP = 0;
-    int ZeroLatencyDepth = 0;
-    int ZeroLatencyHeight = 0;
-
-    NodeInfo() = default;
-  };
-  /// Computed properties for each node in the graph.
-  std::vector<NodeInfo> ScheduleInfo;
-
-  enum OrderKind { BottomUp = 0, TopDown = 1 };
-  /// Computed node ordering for scheduling.
-  SetVector<SUnit *> NodeOrder;
-
-  using NodeSetType = SmallVector<NodeSet, 8>;
-  using ValueMapTy = DenseMap<unsigned, unsigned>;
-  using MBBVectorTy = SmallVectorImpl<MachineBasicBlock *>;
-  using InstrMapTy = DenseMap<MachineInstr *, MachineInstr *>;
-
-  /// Instructions to change when emitting the final schedule.
-  DenseMap<SUnit *, std::pair<unsigned, int64_t>> InstrChanges;
-
-  /// We may create a new instruction, so remember it because it
-  /// must be deleted when the pass is finished.
-  SmallPtrSet<MachineInstr *, 4> NewMIs;
-
-  /// Ordered list of DAG postprocessing steps.
-  std::vector<std::unique_ptr<ScheduleDAGMutation>> Mutations;
-
-  /// Helper class to implement Johnson's circuit finding algorithm.
-  class Circuits {
-    std::vector<SUnit> &SUnits;
-    SetVector<SUnit *> Stack;
-    BitVector Blocked;
-    SmallVector<SmallPtrSet<SUnit *, 4>, 10> B;
-    SmallVector<SmallVector<int, 4>, 16> AdjK;
-    // Node to Index from ScheduleDAGTopologicalSort
-    std::vector<int> *Node2Idx;
-    unsigned NumPaths;
-    static unsigned MaxPaths;
-
-  public:
-    Circuits(std::vector<SUnit> &SUs, ScheduleDAGTopologicalSort &Topo)
-        : SUnits(SUs), Blocked(SUs.size()), B(SUs.size()), AdjK(SUs.size()) {
-      Node2Idx = new std::vector<int>(SUs.size());
-      unsigned Idx = 0;
-      for (const auto &NodeNum : Topo)
-        Node2Idx->at(NodeNum) = Idx++;
-    }
-
-    ~Circuits() { delete Node2Idx; }
-
-    /// Reset the data structures used in the circuit algorithm.
-    void reset() {
-      Stack.clear();
-      Blocked.reset();
-      B.assign(SUnits.size(), SmallPtrSet<SUnit *, 4>());
-      NumPaths = 0;
-    }
-
-    void createAdjacencyStructure(SwingSchedulerDAG *DAG);
-    bool circuit(int V, int S, NodeSetType &NodeSets, bool HasBackedge = false);
-    void unblock(int U);
-  };
-
-  struct CopyToPhiMutation : public ScheduleDAGMutation {
-    void apply(ScheduleDAGInstrs *DAG) override;
-  };
-
-public:
-  SwingSchedulerDAG(MachinePipeliner &P, MachineLoop &L, LiveIntervals &lis,
-                    const RegisterClassInfo &rci)
-      : ScheduleDAGInstrs(*P.MF, P.MLI, false), Pass(P), Loop(L), LIS(lis),
-        RegClassInfo(rci), Topo(SUnits, &ExitSU) {
-    P.MF->getSubtarget().getSMSMutations(Mutations);
-    if (SwpEnableCopyToPhi)
-      Mutations.push_back(llvm::make_unique<CopyToPhiMutation>());
-  }
-
-  void schedule() override;
-  void finishBlock() override;
-
-  /// Return true if the loop kernel has been scheduled.
-  bool hasNewSchedule() { return Scheduled; }
-
-  /// Return the earliest time an instruction may be scheduled.
-  int getASAP(SUnit *Node) { return ScheduleInfo[Node->NodeNum].ASAP; }
-
-  /// Return the latest time an instruction my be scheduled.
-  int getALAP(SUnit *Node) { return ScheduleInfo[Node->NodeNum].ALAP; }
-
-  /// The mobility function, which the number of slots in which
-  /// an instruction may be scheduled.
-  int getMOV(SUnit *Node) { return getALAP(Node) - getASAP(Node); }
-
-  /// The depth, in the dependence graph, for a node.
-  unsigned getDepth(SUnit *Node) { return Node->getDepth(); }
-
-  /// The maximum unweighted length of a path from an arbitrary node to the
-  /// given node in which each edge has latency 0
-  int getZeroLatencyDepth(SUnit *Node) {
-    return ScheduleInfo[Node->NodeNum].ZeroLatencyDepth;
-  }
-
-  /// The height, in the dependence graph, for a node.
-  unsigned getHeight(SUnit *Node) { return Node->getHeight(); }
-
-  /// The maximum unweighted length of a path from the given node to an
-  /// arbitrary node in which each edge has latency 0
-  int getZeroLatencyHeight(SUnit *Node) {
-    return ScheduleInfo[Node->NodeNum].ZeroLatencyHeight;
-  }
-
-  /// Return true if the dependence is a back-edge in the data dependence graph.
-  /// Since the DAG doesn't contain cycles, we represent a cycle in the graph
-  /// using an anti dependence from a Phi to an instruction.
-  bool isBackedge(SUnit *Source, const SDep &Dep) {
-    if (Dep.getKind() != SDep::Anti)
-      return false;
-    return Source->getInstr()->isPHI() || Dep.getSUnit()->getInstr()->isPHI();
-  }
-
-  bool isLoopCarriedDep(SUnit *Source, const SDep &Dep, bool isSucc = true);
-
-  /// The distance function, which indicates that operation V of iteration I
-  /// depends on operations U of iteration I-distance.
-  unsigned getDistance(SUnit *U, SUnit *V, const SDep &Dep) {
-    // Instructions that feed a Phi have a distance of 1. Computing larger
-    // values for arrays requires data dependence information.
-    if (V->getInstr()->isPHI() && Dep.getKind() == SDep::Anti)
-      return 1;
-    return 0;
-  }
-
-  /// Set the Minimum Initiation Interval for this schedule attempt.
-  void setMII(unsigned mii) { MII = mii; }
-
-  void applyInstrChange(MachineInstr *MI, SMSchedule &Schedule);
-
-  void fixupRegisterOverlaps(std::deque<SUnit *> &Instrs);
-
-  /// Return the new base register that was stored away for the changed
-  /// instruction.
-  unsigned getInstrBaseReg(SUnit *SU) {
-    DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
-        InstrChanges.find(SU);
-    if (It != InstrChanges.end())
-      return It->second.first;
-    return 0;
-  }
-
-  void addMutation(std::unique_ptr<ScheduleDAGMutation> Mutation) {
-    Mutations.push_back(std::move(Mutation));
-  }
-
-  static bool classof(const ScheduleDAGInstrs *DAG) { return true; }
-
-private:
-  void addLoopCarriedDependences(AliasAnalysis *AA);
-  void updatePhiDependences();
-  void changeDependences();
-  unsigned calculateResMII();
-  unsigned calculateRecMII(NodeSetType &RecNodeSets);
-  void findCircuits(NodeSetType &NodeSets);
-  void fuseRecs(NodeSetType &NodeSets);
-  void removeDuplicateNodes(NodeSetType &NodeSets);
-  void computeNodeFunctions(NodeSetType &NodeSets);
-  void registerPressureFilter(NodeSetType &NodeSets);
-  void colocateNodeSets(NodeSetType &NodeSets);
-  void checkNodeSets(NodeSetType &NodeSets);
-  void groupRemainingNodes(NodeSetType &NodeSets);
-  void addConnectedNodes(SUnit *SU, NodeSet &NewSet,
-                         SetVector<SUnit *> &NodesAdded);
-  void computeNodeOrder(NodeSetType &NodeSets);
-  void checkValidNodeOrder(const NodeSetType &Circuits) const;
-  bool schedulePipeline(SMSchedule &Schedule);
-  void generatePipelinedLoop(SMSchedule &Schedule);
-  void generateProlog(SMSchedule &Schedule, unsigned LastStage,
-                      MachineBasicBlock *KernelBB, ValueMapTy *VRMap,
-                      MBBVectorTy &PrologBBs);
-  void generateEpilog(SMSchedule &Schedule, unsigned LastStage,
-                      MachineBasicBlock *KernelBB, ValueMapTy *VRMap,
-                      MBBVectorTy &EpilogBBs, MBBVectorTy &PrologBBs);
-  void generateExistingPhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
-                            MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
-                            SMSchedule &Schedule, ValueMapTy *VRMap,
-                            InstrMapTy &InstrMap, unsigned LastStageNum,
-                            unsigned CurStageNum, bool IsLast);
-  void generatePhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
-                    MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
-                    SMSchedule &Schedule, ValueMapTy *VRMap,
-                    InstrMapTy &InstrMap, unsigned LastStageNum,
-                    unsigned CurStageNum, bool IsLast);
-  void removeDeadInstructions(MachineBasicBlock *KernelBB,
-                              MBBVectorTy &EpilogBBs);
-  void splitLifetimes(MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs,
-                      SMSchedule &Schedule);
-  void addBranches(MBBVectorTy &PrologBBs, MachineBasicBlock *KernelBB,
-                   MBBVectorTy &EpilogBBs, SMSchedule &Schedule,
-                   ValueMapTy *VRMap);
-  bool computeDelta(MachineInstr &MI, unsigned &Delta);
-  void updateMemOperands(MachineInstr &NewMI, MachineInstr &OldMI,
-                         unsigned Num);
-  MachineInstr *cloneInstr(MachineInstr *OldMI, unsigned CurStageNum,
-                           unsigned InstStageNum);
-  MachineInstr *cloneAndChangeInstr(MachineInstr *OldMI, unsigned CurStageNum,
-                                    unsigned InstStageNum,
-                                    SMSchedule &Schedule);
-  void updateInstruction(MachineInstr *NewMI, bool LastDef,
-                         unsigned CurStageNum, unsigned InstrStageNum,
-                         SMSchedule &Schedule, ValueMapTy *VRMap);
-  MachineInstr *findDefInLoop(unsigned Reg);
-  unsigned getPrevMapVal(unsigned StageNum, unsigned PhiStage, unsigned LoopVal,
-                         unsigned LoopStage, ValueMapTy *VRMap,
-                         MachineBasicBlock *BB);
-  void rewritePhiValues(MachineBasicBlock *NewBB, unsigned StageNum,
-                        SMSchedule &Schedule, ValueMapTy *VRMap,
-                        InstrMapTy &InstrMap);
-  void rewriteScheduledInstr(MachineBasicBlock *BB, SMSchedule &Schedule,
-                             InstrMapTy &InstrMap, unsigned CurStageNum,
-                             unsigned PhiNum, MachineInstr *Phi,
-                             unsigned OldReg, unsigned NewReg,
-                             unsigned PrevReg = 0);
-  bool canUseLastOffsetValue(MachineInstr *MI, unsigned &BasePos,
-                             unsigned &OffsetPos, unsigned &NewBase,
-                             int64_t &NewOffset);
-  void postprocessDAG();
-};
-
-/// A NodeSet contains a set of SUnit DAG nodes with additional information
-/// that assigns a priority to the set.
-class NodeSet {
-  SetVector<SUnit *> Nodes;
-  bool HasRecurrence = false;
-  unsigned RecMII = 0;
-  int MaxMOV = 0;
-  unsigned MaxDepth = 0;
-  unsigned Colocate = 0;
-  SUnit *ExceedPressure = nullptr;
-  unsigned Latency = 0;
-
-public:
-  using iterator = SetVector<SUnit *>::const_iterator;
-
-  NodeSet() = default;
-  NodeSet(iterator S, iterator E) : Nodes(S, E), HasRecurrence(true) {
-    Latency = 0;
-    for (unsigned i = 0, e = Nodes.size(); i < e; ++i)
-      for (const SDep &Succ : Nodes[i]->Succs)
-        if (Nodes.count(Succ.getSUnit()))
-          Latency += Succ.getLatency();
-  }
-
-  bool insert(SUnit *SU) { return Nodes.insert(SU); }
-
-  void insert(iterator S, iterator E) { Nodes.insert(S, E); }
-
-  template <typename UnaryPredicate> bool remove_if(UnaryPredicate P) {
-    return Nodes.remove_if(P);
-  }
-
-  unsigned count(SUnit *SU) const { return Nodes.count(SU); }
-
-  bool hasRecurrence() { return HasRecurrence; };
-
-  unsigned size() const { return Nodes.size(); }
-
-  bool empty() const { return Nodes.empty(); }
-
-  SUnit *getNode(unsigned i) const { return Nodes[i]; };
-
-  void setRecMII(unsigned mii) { RecMII = mii; };
-
-  void setColocate(unsigned c) { Colocate = c; };
-
-  void setExceedPressure(SUnit *SU) { ExceedPressure = SU; }
-
-  bool isExceedSU(SUnit *SU) { return ExceedPressure == SU; }
-
-  int compareRecMII(NodeSet &RHS) { return RecMII - RHS.RecMII; }
-
-  int getRecMII() { return RecMII; }
-
-  /// Summarize node functions for the entire node set.
-  void computeNodeSetInfo(SwingSchedulerDAG *SSD) {
-    for (SUnit *SU : *this) {
-      MaxMOV = std::max(MaxMOV, SSD->getMOV(SU));
-      MaxDepth = std::max(MaxDepth, SSD->getDepth(SU));
-    }
-  }
-
-  unsigned getLatency() { return Latency; }
-
-  unsigned getMaxDepth() { return MaxDepth; }
-
-  void clear() {
-    Nodes.clear();
-    RecMII = 0;
-    HasRecurrence = false;
-    MaxMOV = 0;
-    MaxDepth = 0;
-    Colocate = 0;
-    ExceedPressure = nullptr;
-  }
-
-  operator SetVector<SUnit *> &() { return Nodes; }
-
-  /// Sort the node sets by importance. First, rank them by recurrence MII,
-  /// then by mobility (least mobile done first), and finally by depth.
-  /// Each node set may contain a colocate value which is used as the first
-  /// tie breaker, if it's set.
-  bool operator>(const NodeSet &RHS) const {
-    if (RecMII == RHS.RecMII) {
-      if (Colocate != 0 && RHS.Colocate != 0 && Colocate != RHS.Colocate)
-        return Colocate < RHS.Colocate;
-      if (MaxMOV == RHS.MaxMOV)
-        return MaxDepth > RHS.MaxDepth;
-      return MaxMOV < RHS.MaxMOV;
-    }
-    return RecMII > RHS.RecMII;
-  }
-
-  bool operator==(const NodeSet &RHS) const {
-    return RecMII == RHS.RecMII && MaxMOV == RHS.MaxMOV &&
-           MaxDepth == RHS.MaxDepth;
-  }
-
-  bool operator!=(const NodeSet &RHS) const { return !operator==(RHS); }
-
-  iterator begin() { return Nodes.begin(); }
-  iterator end() { return Nodes.end(); }
-
-  void print(raw_ostream &os) const {
-    os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
-       << " depth " << MaxDepth << " col " << Colocate << "\n";
-    for (const auto &I : Nodes)
-      os << "   SU(" << I->NodeNum << ") " << *(I->getInstr());
-    os << "\n";
-  }
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-  LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
-#endif
-};
-
-/// This class represents the scheduled code.  The main data structure is a
-/// map from scheduled cycle to instructions.  During scheduling, the
-/// data structure explicitly represents all stages/iterations.   When
-/// the algorithm finshes, the schedule is collapsed into a single stage,
-/// which represents instructions from different loop iterations.
-///
-/// The SMS algorithm allows negative values for cycles, so the first cycle
-/// in the schedule is the smallest cycle value.
-class SMSchedule {
-private:
-  /// Map from execution cycle to instructions.
-  DenseMap<int, std::deque<SUnit *>> ScheduledInstrs;
-
-  /// Map from instruction to execution cycle.
-  std::map<SUnit *, int> InstrToCycle;
-
-  /// Map for each register and the max difference between its uses and def.
-  /// The first element in the pair is the max difference in stages. The
-  /// second is true if the register defines a Phi value and loop value is
-  /// scheduled before the Phi.
-  std::map<unsigned, std::pair<unsigned, bool>> RegToStageDiff;
-
-  /// Keep track of the first cycle value in the schedule.  It starts
-  /// as zero, but the algorithm allows negative values.
-  int FirstCycle = 0;
-
-  /// Keep track of the last cycle value in the schedule.
-  int LastCycle = 0;
-
-  /// The initiation interval (II) for the schedule.
-  int InitiationInterval = 0;
-
-  /// Target machine information.
-  const TargetSubtargetInfo &ST;
-
-  /// Virtual register information.
-  MachineRegisterInfo &MRI;
-
-  std::unique_ptr<DFAPacketizer> Resources;
-
-public:
-  SMSchedule(MachineFunction *mf)
-      : ST(mf->getSubtarget()), MRI(mf->getRegInfo()),
-        Resources(ST.getInstrInfo()->CreateTargetScheduleState(ST)) {}
-
-  void reset() {
-    ScheduledInstrs.clear();
-    InstrToCycle.clear();
-    RegToStageDiff.clear();
-    FirstCycle = 0;
-    LastCycle = 0;
-    InitiationInterval = 0;
-  }
-
-  /// Set the initiation interval for this schedule.
-  void setInitiationInterval(int ii) { InitiationInterval = ii; }
-
-  /// Return the first cycle in the completed schedule.  This
-  /// can be a negative value.
-  int getFirstCycle() const { return FirstCycle; }
-
-  /// Return the last cycle in the finalized schedule.
-  int getFinalCycle() const { return FirstCycle + InitiationInterval - 1; }
-
-  /// Return the cycle of the earliest scheduled instruction in the dependence
-  /// chain.
-  int earliestCycleInChain(const SDep &Dep);
-
-  /// Return the cycle of the latest scheduled instruction in the dependence
-  /// chain.
-  int latestCycleInChain(const SDep &Dep);
-
-  void computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
-                    int *MinEnd, int *MaxStart, int II, SwingSchedulerDAG *DAG);
-  bool insert(SUnit *SU, int StartCycle, int EndCycle, int II);
-
-  /// Iterators for the cycle to instruction map.
-  using sched_iterator = DenseMap<int, std::deque<SUnit *>>::iterator;
-  using const_sched_iterator =
-      DenseMap<int, std::deque<SUnit *>>::const_iterator;
-
-  /// Return true if the instruction is scheduled at the specified stage.
-  bool isScheduledAtStage(SUnit *SU, unsigned StageNum) {
-    return (stageScheduled(SU) == (int)StageNum);
-  }
-
-  /// Return the stage for a scheduled instruction.  Return -1 if
-  /// the instruction has not been scheduled.
-  int stageScheduled(SUnit *SU) const {
-    std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SU);
-    if (it == InstrToCycle.end())
-      return -1;
-    return (it->second - FirstCycle) / InitiationInterval;
-  }
-
-  /// Return the cycle for a scheduled instruction. This function normalizes
-  /// the first cycle to be 0.
-  unsigned cycleScheduled(SUnit *SU) const {
-    std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SU);
-    assert(it != InstrToCycle.end() && "Instruction hasn't been scheduled.");
-    return (it->second - FirstCycle) % InitiationInterval;
-  }
-
-  /// Return the maximum stage count needed for this schedule.
-  unsigned getMaxStageCount() {
-    return (LastCycle - FirstCycle) / InitiationInterval;
-  }
-
-  /// Return the max. number of stages/iterations that can occur between a
-  /// register definition and its uses.
-  unsigned getStagesForReg(int Reg, unsigned CurStage) {
-    std::pair<unsigned, bool> Stages = RegToStageDiff[Reg];
-    if (CurStage > getMaxStageCount() && Stages.first == 0 && Stages.second)
-      return 1;
-    return Stages.first;
-  }
-
-  /// The number of stages for a Phi is a little different than other
-  /// instructions. The minimum value computed in RegToStageDiff is 1
-  /// because we assume the Phi is needed for at least 1 iteration.
-  /// This is not the case if the loop value is scheduled prior to the
-  /// Phi in the same stage.  This function returns the number of stages
-  /// or iterations needed between the Phi definition and any uses.
-  unsigned getStagesForPhi(int Reg) {
-    std::pair<unsigned, bool> Stages = RegToStageDiff[Reg];
-    if (Stages.second)
-      return Stages.first;
-    return Stages.first - 1;
-  }
-
-  /// Return the instructions that are scheduled at the specified cycle.
-  std::deque<SUnit *> &getInstructions(int cycle) {
-    return ScheduledInstrs[cycle];
-  }
-
-  bool isValidSchedule(SwingSchedulerDAG *SSD);
-  void finalizeSchedule(SwingSchedulerDAG *SSD);
-  void orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
-                       std::deque<SUnit *> &Insts);
-  bool isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi);
-  bool isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD, MachineInstr *Def,
-                             MachineOperand &MO);
-  void print(raw_ostream &os) const;
-  void dump() const;
-};
+// A command line option to enable the CopyToPhi DAG mutation.
+cl::opt<bool> SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden,
+                                 cl::init(true), cl::ZeroOrMore,
+                                 cl::desc("Enable CopyToPhi DAG Mutation"));
 
-} // end anonymous namespace
+} // end namespace llvm
 
 unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
 char MachinePipeliner::ID = 0;
-- 
GitLab


From 7fc581a267dd2caa08bd30cad95becec9415579b Mon Sep 17 00:00:00 2001
From: Aleksandr Urakov <aleksandr.urakov@jetbrains.com>
Date: Tue, 23 Oct 2018 08:14:53 +0000
Subject: [PATCH 0428/1116] Revert "Revert "[PDB] Extend IPDBSession's
 interface to retrieve frame data""

This reverts commit 466ce67d6ec444962e5cc0136243c16a453190c0.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345010 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../llvm/DebugInfo/PDB/DIA/DIAEnumFrameData.h | 40 ++++++++++++++
 include/llvm/DebugInfo/PDB/DIA/DIAFrameData.h | 41 ++++++++++++++
 include/llvm/DebugInfo/PDB/DIA/DIASession.h   |  1 +
 include/llvm/DebugInfo/PDB/IPDBFrameData.h    | 36 +++++++++++++
 include/llvm/DebugInfo/PDB/IPDBSession.h      |  3 ++
 .../llvm/DebugInfo/PDB/Native/NativeSession.h |  2 +
 include/llvm/DebugInfo/PDB/PDBTypes.h         |  2 +
 lib/DebugInfo/PDB/CMakeLists.txt              |  2 +
 lib/DebugInfo/PDB/DIA/DIAEnumFrameData.cpp    | 43 +++++++++++++++
 lib/DebugInfo/PDB/DIA/DIAFrameData.cpp        | 54 +++++++++++++++++++
 lib/DebugInfo/PDB/DIA/DIASession.cpp          | 11 ++++
 lib/DebugInfo/PDB/Native/NativeSession.cpp    |  5 ++
 lib/DebugInfo/PDB/PDBInterfaceAnchors.cpp     |  3 ++
 unittests/DebugInfo/PDB/PDBApiTest.cpp        |  4 ++
 14 files changed, 247 insertions(+)
 create mode 100644 include/llvm/DebugInfo/PDB/DIA/DIAEnumFrameData.h
 create mode 100644 include/llvm/DebugInfo/PDB/DIA/DIAFrameData.h
 create mode 100644 include/llvm/DebugInfo/PDB/IPDBFrameData.h
 create mode 100644 lib/DebugInfo/PDB/DIA/DIAEnumFrameData.cpp
 create mode 100644 lib/DebugInfo/PDB/DIA/DIAFrameData.cpp

diff --git a/include/llvm/DebugInfo/PDB/DIA/DIAEnumFrameData.h b/include/llvm/DebugInfo/PDB/DIA/DIAEnumFrameData.h
new file mode 100644
index 00000000000..e17ba2ce59b
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/DIA/DIAEnumFrameData.h
@@ -0,0 +1,40 @@
+//==- DIAEnumFrameData.h --------------------------------------- -*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_DIA_DIAENUMFRAMEDATA_H
+#define LLVM_DEBUGINFO_PDB_DIA_DIAENUMFRAMEDATA_H
+
+#include "DIASupport.h"
+#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
+#include "llvm/DebugInfo/PDB/IPDBFrameData.h"
+
+namespace llvm {
+namespace pdb {
+
+class DIASession;
+
+class DIAEnumFrameData : public IPDBEnumChildren<IPDBFrameData> {
+public:
+  explicit DIAEnumFrameData(const DIASession &PDBSession,
+                            CComPtr<IDiaEnumFrameData> DiaEnumerator);
+
+  uint32_t getChildCount() const override;
+  ChildTypePtr getChildAtIndex(uint32_t Index) const override;
+  ChildTypePtr getNext() override;
+  void reset() override;
+
+private:
+  const DIASession &Session;
+  CComPtr<IDiaEnumFrameData> Enumerator;
+};
+
+} // namespace pdb
+} // namespace llvm
+
+#endif
diff --git a/include/llvm/DebugInfo/PDB/DIA/DIAFrameData.h b/include/llvm/DebugInfo/PDB/DIA/DIAFrameData.h
new file mode 100644
index 00000000000..7564c3b7a5a
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/DIA/DIAFrameData.h
@@ -0,0 +1,41 @@
+//===- DIAFrameData.h - DIA Impl. of IPDBFrameData ---------------- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_DIA_DIAFRAMEDATA_H
+#define LLVM_DEBUGINFO_PDB_DIA_DIAFRAMEDATA_H
+
+#include "DIASupport.h"
+#include "llvm/DebugInfo/PDB/IPDBFrameData.h"
+
+namespace llvm {
+namespace pdb {
+
+class DIASession;
+
+class DIAFrameData : public IPDBFrameData {
+public:
+  explicit DIAFrameData(const DIASession &PDBSession,
+                        CComPtr<IDiaFrameData> DiaFrameData);
+
+  uint32_t getAddressOffset() const override;
+  uint32_t getAddressSection() const override;
+  uint32_t getLengthBlock() const override;
+  std::string getProgram() const override;
+  uint32_t getRelativeVirtualAddress() const override;
+  uint64_t getVirtualAddress() const override;
+
+private:
+  const DIASession &Session;
+  CComPtr<IDiaFrameData> FrameData;
+};
+
+} // namespace pdb
+} // namespace llvm
+
+#endif
diff --git a/include/llvm/DebugInfo/PDB/DIA/DIASession.h b/include/llvm/DebugInfo/PDB/DIA/DIASession.h
index e355605c296..592e061a8d8 100644
--- a/include/llvm/DebugInfo/PDB/DIA/DIASession.h
+++ b/include/llvm/DebugInfo/PDB/DIA/DIASession.h
@@ -85,6 +85,7 @@ public:
 
   std::unique_ptr<IPDBEnumSectionContribs> getSectionContribs() const override;
 
+  std::unique_ptr<IPDBEnumFrameData> getFrameData() const override;
 private:
   CComPtr<IDiaSession> Session;
 };
diff --git a/include/llvm/DebugInfo/PDB/IPDBFrameData.h b/include/llvm/DebugInfo/PDB/IPDBFrameData.h
new file mode 100644
index 00000000000..74679215b88
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/IPDBFrameData.h
@@ -0,0 +1,36 @@
+//===- IPDBFrameData.h - base interface for frame data ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_IPDBFRAMEDATA_H
+#define LLVM_DEBUGINFO_PDB_IPDBFRAMEDATA_H
+
+#include <cstdint>
+#include <string>
+
+namespace llvm {
+namespace pdb {
+
+/// IPDBFrameData defines an interface used to represent a frame data of some
+/// code block.
+class IPDBFrameData {
+public:
+  virtual ~IPDBFrameData();
+
+  virtual uint32_t getAddressOffset() const = 0;
+  virtual uint32_t getAddressSection() const = 0;
+  virtual uint32_t getLengthBlock() const = 0;
+  virtual std::string getProgram() const = 0;
+  virtual uint32_t getRelativeVirtualAddress() const = 0;
+  virtual uint64_t getVirtualAddress() const = 0;
+};
+
+} // namespace pdb
+} // namespace llvm
+
+#endif
diff --git a/include/llvm/DebugInfo/PDB/IPDBSession.h b/include/llvm/DebugInfo/PDB/IPDBSession.h
index 24573cdb779..88fd02c0a34 100644
--- a/include/llvm/DebugInfo/PDB/IPDBSession.h
+++ b/include/llvm/DebugInfo/PDB/IPDBSession.h
@@ -91,6 +91,9 @@ public:
 
   virtual std::unique_ptr<IPDBEnumSectionContribs>
   getSectionContribs() const = 0;
+
+  virtual std::unique_ptr<IPDBEnumFrameData>
+  getFrameData() const = 0;
 };
 } // namespace pdb
 } // namespace llvm
diff --git a/include/llvm/DebugInfo/PDB/Native/NativeSession.h b/include/llvm/DebugInfo/PDB/Native/NativeSession.h
index 07ce85ef820..4878e47d312 100644
--- a/include/llvm/DebugInfo/PDB/Native/NativeSession.h
+++ b/include/llvm/DebugInfo/PDB/Native/NativeSession.h
@@ -93,6 +93,8 @@ public:
 
   std::unique_ptr<IPDBEnumSectionContribs> getSectionContribs() const override;
 
+  std::unique_ptr<IPDBEnumFrameData> getFrameData() const override;
+
   PDBFile &getPDBFile() { return *Pdb; }
   const PDBFile &getPDBFile() const { return *Pdb; }
 
diff --git a/include/llvm/DebugInfo/PDB/PDBTypes.h b/include/llvm/DebugInfo/PDB/PDBTypes.h
index 6247018ce0f..917f3ed7391 100644
--- a/include/llvm/DebugInfo/PDB/PDBTypes.h
+++ b/include/llvm/DebugInfo/PDB/PDBTypes.h
@@ -12,6 +12,7 @@
 
 #include "llvm/DebugInfo/CodeView/CodeView.h"
 #include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
+#include "llvm/DebugInfo/PDB/IPDBFrameData.h"
 #include "llvm/DebugInfo/PDB/Native/RawTypes.h"
 #include <cctype>
 #include <cstddef>
@@ -71,6 +72,7 @@ using IPDBEnumLineNumbers = IPDBEnumChildren<IPDBLineNumber>;
 using IPDBEnumTables = IPDBEnumChildren<IPDBTable>;
 using IPDBEnumInjectedSources = IPDBEnumChildren<IPDBInjectedSource>;
 using IPDBEnumSectionContribs = IPDBEnumChildren<IPDBSectionContrib>;
+using IPDBEnumFrameData = IPDBEnumChildren<IPDBFrameData>;
 
 /// Specifies which PDB reader implementation is to be used.  Only a value
 /// of PDB_ReaderType::DIA is currently supported, but Native is in the works.
diff --git a/lib/DebugInfo/PDB/CMakeLists.txt b/lib/DebugInfo/PDB/CMakeLists.txt
index 86dcfdaa163..d9d379f6d09 100644
--- a/lib/DebugInfo/PDB/CMakeLists.txt
+++ b/lib/DebugInfo/PDB/CMakeLists.txt
@@ -14,6 +14,7 @@ if(LLVM_ENABLE_DIA_SDK)
   add_pdb_impl_folder(DIA
     DIA/DIADataStream.cpp
     DIA/DIAEnumDebugStreams.cpp
+    DIA/DIAEnumFrameData.cpp
     DIA/DIAEnumInjectedSources.cpp
     DIA/DIAEnumLineNumbers.cpp
     DIA/DIAEnumSectionContribs.cpp
@@ -21,6 +22,7 @@ if(LLVM_ENABLE_DIA_SDK)
     DIA/DIAEnumSymbols.cpp
     DIA/DIAEnumTables.cpp
     DIA/DIAError.cpp
+    DIA/DIAFrameData.cpp
     DIA/DIAInjectedSource.cpp
     DIA/DIALineNumber.cpp
     DIA/DIARawSymbol.cpp
diff --git a/lib/DebugInfo/PDB/DIA/DIAEnumFrameData.cpp b/lib/DebugInfo/PDB/DIA/DIAEnumFrameData.cpp
new file mode 100644
index 00000000000..77514483e04
--- /dev/null
+++ b/lib/DebugInfo/PDB/DIA/DIAEnumFrameData.cpp
@@ -0,0 +1,43 @@
+//==- DIAEnumFrameData.cpp ---------------------------------------*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/DIA/DIAEnumFrameData.h"
+#include "llvm/DebugInfo/PDB/DIA/DIAFrameData.h"
+#include "llvm/DebugInfo/PDB/DIA/DIASession.h"
+
+using namespace llvm::pdb;
+
+DIAEnumFrameData::DIAEnumFrameData(const DIASession &PDBSession,
+                                   CComPtr<IDiaEnumFrameData> DiaEnumerator)
+    : Session(PDBSession), Enumerator(DiaEnumerator) {}
+
+uint32_t DIAEnumFrameData::getChildCount() const {
+  LONG Count = 0;
+  return (S_OK == Enumerator->get_Count(&Count)) ? Count : 0;
+}
+
+std::unique_ptr<IPDBFrameData>
+DIAEnumFrameData::getChildAtIndex(uint32_t Index) const {
+  CComPtr<IDiaFrameData> Item;
+  if (S_OK != Enumerator->Item(Index, &Item))
+    return nullptr;
+
+  return std::unique_ptr<IPDBFrameData>(new DIAFrameData(Session, Item));
+}
+
+std::unique_ptr<IPDBFrameData> DIAEnumFrameData::getNext() {
+  CComPtr<IDiaFrameData> Item;
+  ULONG NumFetched = 0;
+  if (S_OK != Enumerator->Next(1, &Item, &NumFetched))
+    return nullptr;
+
+  return std::unique_ptr<IPDBFrameData>(new DIAFrameData(Session, Item));
+}
+
+void DIAEnumFrameData::reset() { Enumerator->Reset(); }
diff --git a/lib/DebugInfo/PDB/DIA/DIAFrameData.cpp b/lib/DebugInfo/PDB/DIA/DIAFrameData.cpp
new file mode 100644
index 00000000000..b904a2ff60a
--- /dev/null
+++ b/lib/DebugInfo/PDB/DIA/DIAFrameData.cpp
@@ -0,0 +1,54 @@
+//===- DIAFrameData.cpp - DIA impl. of IPDBFrameData -------------- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/DIA/DIAFrameData.h"
+#include "llvm/DebugInfo/PDB/DIA/DIASession.h"
+#include "llvm/DebugInfo/PDB/DIA/DIAUtils.h"
+
+using namespace llvm::pdb;
+
+DIAFrameData::DIAFrameData(const DIASession &PDBSession,
+                           CComPtr<IDiaFrameData> DiaFrameData)
+    : Session(PDBSession), FrameData(DiaFrameData) {}
+
+template <typename ArgType>
+ArgType
+PrivateGetDIAValue(IDiaFrameData *FrameData,
+                   HRESULT (__stdcall IDiaFrameData::*Method)(ArgType *)) {
+  ArgType Value;
+  if (S_OK == (FrameData->*Method)(&Value))
+    return static_cast<ArgType>(Value);
+
+  return ArgType();
+}
+
+uint32_t DIAFrameData::getAddressOffset() const {
+  return PrivateGetDIAValue(FrameData, &IDiaFrameData::get_addressOffset);
+}
+
+uint32_t DIAFrameData::getAddressSection() const {
+  return PrivateGetDIAValue(FrameData, &IDiaFrameData::get_addressSection);
+}
+
+uint32_t DIAFrameData::getLengthBlock() const {
+  return PrivateGetDIAValue(FrameData, &IDiaFrameData::get_lengthBlock);
+}
+
+std::string DIAFrameData::getProgram() const {
+  return invokeBstrMethod(*FrameData, &IDiaFrameData::get_program);
+}
+
+uint32_t DIAFrameData::getRelativeVirtualAddress() const {
+  return PrivateGetDIAValue(FrameData,
+                            &IDiaFrameData::get_relativeVirtualAddress);
+}
+
+uint64_t DIAFrameData::getVirtualAddress() const {
+  return PrivateGetDIAValue(FrameData, &IDiaFrameData::get_virtualAddress);
+}
diff --git a/lib/DebugInfo/PDB/DIA/DIASession.cpp b/lib/DebugInfo/PDB/DIA/DIASession.cpp
index 7726fe13264..b89ca9a858f 100644
--- a/lib/DebugInfo/PDB/DIA/DIASession.cpp
+++ b/lib/DebugInfo/PDB/DIA/DIASession.cpp
@@ -9,6 +9,7 @@
 #include "llvm/DebugInfo/PDB/DIA/DIASession.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/DebugInfo/PDB/DIA/DIAEnumDebugStreams.h"
+#include "llvm/DebugInfo/PDB/DIA/DIAEnumFrameData.h"
 #include "llvm/DebugInfo/PDB/DIA/DIAEnumInjectedSources.h"
 #include "llvm/DebugInfo/PDB/DIA/DIAEnumLineNumbers.h"
 #include "llvm/DebugInfo/PDB/DIA/DIAEnumSectionContribs.h"
@@ -419,3 +420,13 @@ DIASession::getSectionContribs() const {
 
   return llvm::make_unique<DIAEnumSectionContribs>(*this, Sections);
 }
+
+std::unique_ptr<IPDBEnumFrameData>
+DIASession::getFrameData() const {
+  CComPtr<IDiaEnumFrameData> FD =
+      getTableEnumerator<IDiaEnumFrameData>(*Session);
+  if (!FD)
+    return nullptr;
+
+  return llvm::make_unique<DIAEnumFrameData>(*this, FD);
+}
diff --git a/lib/DebugInfo/PDB/Native/NativeSession.cpp b/lib/DebugInfo/PDB/Native/NativeSession.cpp
index baab0a2399c..7807e312365 100644
--- a/lib/DebugInfo/PDB/Native/NativeSession.cpp
+++ b/lib/DebugInfo/PDB/Native/NativeSession.cpp
@@ -200,6 +200,11 @@ NativeSession::getSectionContribs() const {
   return nullptr;
 }
 
+std::unique_ptr<IPDBEnumFrameData>
+NativeSession::getFrameData() const {
+  return nullptr;
+}
+
 void NativeSession::initializeExeSymbol() {
   if (ExeSymbol == 0)
     ExeSymbol = Cache.createSymbol<NativeExeSymbol>();
diff --git a/lib/DebugInfo/PDB/PDBInterfaceAnchors.cpp b/lib/DebugInfo/PDB/PDBInterfaceAnchors.cpp
index c62796507a0..951909295d1 100644
--- a/lib/DebugInfo/PDB/PDBInterfaceAnchors.cpp
+++ b/lib/DebugInfo/PDB/PDBInterfaceAnchors.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/DebugInfo/PDB/IPDBDataStream.h"
+#include "llvm/DebugInfo/PDB/IPDBFrameData.h"
 #include "llvm/DebugInfo/PDB/IPDBInjectedSource.h"
 #include "llvm/DebugInfo/PDB/IPDBLineNumber.h"
 #include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
@@ -35,3 +36,5 @@ IPDBTable::~IPDBTable() = default;
 IPDBInjectedSource::~IPDBInjectedSource() = default;
 
 IPDBSectionContrib::~IPDBSectionContrib() = default;
+
+IPDBFrameData::~IPDBFrameData() = default;
diff --git a/unittests/DebugInfo/PDB/PDBApiTest.cpp b/unittests/DebugInfo/PDB/PDBApiTest.cpp
index 948bde1bf72..007ea904085 100644
--- a/unittests/DebugInfo/PDB/PDBApiTest.cpp
+++ b/unittests/DebugInfo/PDB/PDBApiTest.cpp
@@ -159,6 +159,10 @@ class MockSession : public IPDBSession {
   std::unique_ptr<IPDBEnumSectionContribs> getSectionContribs() const override {
     return nullptr;
   }
+
+  std::unique_ptr<IPDBEnumFrameData> getFrameData() const override {
+    return nullptr;
+  }
 };
 
 class MockRawSymbol : public IPDBRawSymbol {
-- 
GitLab


From 8e47a8d1a66b89cd59fbc2fdc7e19dbe7a15c6f8 Mon Sep 17 00:00:00 2001
From: Aleksandr Urakov <aleksandr.urakov@jetbrains.com>
Date: Tue, 23 Oct 2018 08:15:00 +0000
Subject: [PATCH 0429/1116] Fix non-Windows build for D53324

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345011 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/module.modulemap | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/llvm/module.modulemap b/include/llvm/module.modulemap
index 9abbab87885..138eb06078d 100644
--- a/include/llvm/module.modulemap
+++ b/include/llvm/module.modulemap
@@ -88,12 +88,14 @@ module LLVM_DebugInfo_PDB {
   // FIXME: There should be a better way to specify this.
   exclude header "DebugInfo/PDB/DIA/DIADataStream.h"
   exclude header "DebugInfo/PDB/DIA/DIAEnumDebugStreams.h"
+  exclude header "DebugInfo/PDB/DIA/DIAEnumFrameData.h"
   exclude header "DebugInfo/PDB/DIA/DIAEnumInjectedSources.h"
   exclude header "DebugInfo/PDB/DIA/DIAEnumLineNumbers.h"
   exclude header "DebugInfo/PDB/DIA/DIAEnumSectionContribs.h"
   exclude header "DebugInfo/PDB/DIA/DIAEnumSourceFiles.h"
   exclude header "DebugInfo/PDB/DIA/DIAEnumSymbols.h"
   exclude header "DebugInfo/PDB/DIA/DIAEnumTables.h"
+  exclude header "DebugInfo/PDB/DIA/DIAFrameData.h"
   exclude header "DebugInfo/PDB/DIA/DIAInjectedSource.h"
   exclude header "DebugInfo/PDB/DIA/DIALineNumber.h"
   exclude header "DebugInfo/PDB/DIA/DIARawSymbol.h"
-- 
GitLab


From 62bea74cfa266d9f2b8a3e5df1c4c84ebac29b03 Mon Sep 17 00:00:00 2001
From: Roman Lebedev <lebedev.ri@gmail.com>
Date: Tue, 23 Oct 2018 09:08:44 +0000
Subject: [PATCH 0430/1116] [X86][BMI1] X86DAGToDAGISel: select BEXTR from x <<
 (32 - y) >> (32 - y) pattern

Summary:
Continuation of D52348.

We also get the `c) x &  (-1 >> (32 - y))` pattern here, because of the D48768.
I will add extra-uses into those tests and follow-up with a patch to handle those patterns too.

Reviewers: RKSimon, craig.topper

Reviewed By: craig.topper

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D53521

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345014 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelDAGToDAG.cpp  |  91 ++++++--
 lib/Target/X86/X86InstrInfo.td      |  26 ---
 test/CodeGen/X86/extract-bits.ll    | 328 +++++++++++-----------------
 test/CodeGen/X86/extract-lowbits.ll | 227 ++++++-------------
 4 files changed, 258 insertions(+), 414 deletions(-)

diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index d3aa5c89adc..288d87ae887 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -2688,6 +2688,10 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
 //   c) x &  (-1 >> (32 - y))
 //   d) x << (32 - y) >> (32 - y)
 bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
+  assert(
+      (Node->getOpcode() == ISD::AND || Node->getOpcode() == ISD::SRL) &&
+      "Should be either an and-mask, or right-shift after clearing high bits.");
+
   // BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one.
   if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
     return false;
@@ -2698,13 +2702,16 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
   if (NVT != MVT::i32 && NVT != MVT::i64)
     return false;
 
+  unsigned Size = NVT.getSizeInBits();
+
   SDValue NBits;
 
   // If we have BMI2's BZHI, we are ok with muti-use patterns.
   // Else, if we only have BMI1's BEXTR, we require one-use.
   const bool CanHaveExtraUses = Subtarget->hasBMI2();
-  auto checkOneUse = [CanHaveExtraUses](SDValue Op) {
-    return CanHaveExtraUses || Op.hasOneUse();
+  auto checkOneUse = [CanHaveExtraUses](SDValue Op, unsigned NUses = 1) {
+    return CanHaveExtraUses ||
+           Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());
   };
 
   // a) x & ((1 << nbits) + (-1))
@@ -2740,31 +2747,73 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
     return true;
   };
 
+  SDValue X;
+
+  // d) x << (32 - y) >> (32 - y)
+  auto matchPatternD = [&checkOneUse, Size, &X, &NBits](SDNode *Node) -> bool {
+    if (Node->getOpcode() != ISD::SRL)
+      return false;
+    SDValue N0 = Node->getOperand(0);
+    if (N0->getOpcode() != ISD::SHL || !checkOneUse(N0))
+      return false;
+    SDValue N1 = Node->getOperand(1);
+    SDValue N01 = N0->getOperand(1);
+    // Both of the shifts must be by the exact same value.
+    // There should not be any uses of the shift amount outside of the pattern.
+    if (N1 != N01 || !checkOneUse(N1, 2))
+      return false;
+    // Skip over a truncate of the shift amount.
+    if (N1->getOpcode() == ISD::TRUNCATE) {
+      N1 = N1->getOperand(0);
+      // The trunc should have been the only user of the real shift amount.
+      if (!checkOneUse(N1))
+        return false;
+    }
+    // Match the shift amount as: (bitwidth - y). It should go away, too.
+    if (N1.getOpcode() != ISD::SUB)
+      return false;
+    auto N10 = dyn_cast<ConstantSDNode>(N1.getOperand(0));
+    if (!N10 || N10->getZExtValue() != Size)
+      return false;
+    X = N0->getOperand(0);
+    NBits = N1.getOperand(1);
+    return true;
+  };
+
   auto matchLowBitMask = [&matchPatternA,
                           &matchPatternB](SDValue Mask) -> bool {
-    // FIXME: patterns c, d.
+    // FIXME: pattern c.
     return matchPatternA(Mask) || matchPatternB(Mask);
   };
 
-  SDValue X = Node->getOperand(0);
-  SDValue Mask = Node->getOperand(1);
+  if (Node->getOpcode() == ISD::AND) {
+    X = Node->getOperand(0);
+    SDValue Mask = Node->getOperand(1);
 
-  if (matchLowBitMask(Mask)) {
-    // Great.
-  } else {
-    std::swap(X, Mask);
-    if (!matchLowBitMask(Mask))
-      return false;
-  }
+    if (matchLowBitMask(Mask)) {
+      // Great.
+    } else {
+      std::swap(X, Mask);
+      if (!matchLowBitMask(Mask))
+        return false;
+    }
+  } else if (!matchPatternD(Node))
+    return false;
 
   SDLoc DL(Node);
 
+  SDValue OrigNBits = NBits;
+  // Do we need to truncate the shift amount?
+  if (NBits.getValueType() != MVT::i8) {
+    NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
+    insertDAGNode(*CurDAG, OrigNBits, NBits);
+  }
+
   // Insert 8-bit NBits into lowest 8 bits of NVT-sized (32 or 64-bit) register.
   // All the other bits are undefined, we do not care about them.
   SDValue ImplDef =
       SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, NVT), 0);
   insertDAGNode(*CurDAG, NBits, ImplDef);
-  SDValue OrigNBits = NBits;
   NBits = CurDAG->getTargetInsertSubreg(X86::sub_8bit, DL, NVT, ImplDef, NBits);
   insertDAGNode(*CurDAG, OrigNBits, NBits);
 
@@ -2963,17 +3012,8 @@ bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
   if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
     ShiftAmt = ShiftAmt->getOperand(0);
 
-  // Special case to avoid messing up a BZHI pattern.
-  // Look for (srl (shl X, (size - y)), (size - y)
-  if (Subtarget->hasBMI2() && (VT == MVT::i32 || VT == MVT::i64) &&
-      N->getOpcode() == ISD::SRL && N->getOperand(0).getOpcode() == ISD::SHL &&
-      // Shift amounts the same?
-      N->getOperand(1) == N->getOperand(0).getOperand(1) &&
-      // Shift amounts size - y?
-      ShiftAmt.getOpcode() == ISD::SUB &&
-      isa<ConstantSDNode>(ShiftAmt.getOperand(0)) &&
-      cast<ConstantSDNode>(ShiftAmt.getOperand(0))->getZExtValue() == Size)
-    return false;
+  // This function is called after X86DAGToDAGISel::matchBitExtract(),
+  // so we are not afraid that we might mess up BZHI/BEXTR pattern.
 
   SDValue NewShiftAmt;
   if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB) {
@@ -3172,6 +3212,9 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
   }
 
   case ISD::SRL:
+    if (matchBitExtract(Node))
+      return;
+    LLVM_FALLTHROUGH;
   case ISD::SRA:
   case ISD::SHL:
     if (tryShiftAmountMod(Node))
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index a12c9e81b05..8d3f7c856d0 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -2519,14 +2519,6 @@ let Predicates = [HasBMI2] in {
                              (and (x86memop addr:$src),
                                   (srl -1, (sub bitwidth, GR8:$lz))),
                              RC, VT, DstInst, DstMemInst>;
-
-    // x << (bitwidth - y) >> (bitwidth - y)
-    defm : _bmi_bzhi_pattern<(srl (shl RC:$src, (sub bitwidth, GR8:$lz)),
-                                  (sub bitwidth, GR8:$lz)),
-                             (srl (shl (x86memop addr:$src),
-                                        (sub bitwidth, GR8:$lz)),
-                                  (sub bitwidth, GR8:$lz)),
-                             RC, VT, DstInst, DstMemInst>;
   }
 
   defm : bmi_bzhi_patterns<GR32, 32, i32, BZHI32rr, loadi32, BZHI32rm>;
@@ -2545,24 +2537,6 @@ let Predicates = [HasBMI2] in {
   def : Pat<(and (loadi64 addr:$src), (srl -1, (i8 (trunc (sub 64, GR32:$lz))))),
             (BZHI64rm addr:$src,
               (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>;
-
-  // x << (32 - y) >> (32 - y)
-  def : Pat<(srl (shl GR32:$src, (i8 (trunc (sub 32, GR32:$lz)))),
-                 (i8 (trunc (sub 32, GR32:$lz)))),
-            (BZHI32rr GR32:$src, GR32:$lz)>;
-  def : Pat<(srl (shl (loadi32 addr:$src), (i8 (trunc (sub 32, GR32:$lz)))),
-                 (i8 (trunc (sub 32, GR32:$lz)))),
-            (BZHI32rm addr:$src, GR32:$lz)>;
-
-  // x << (64 - y) >> (64 - y)
-  def : Pat<(srl (shl GR64:$src, (i8 (trunc (sub 64, GR32:$lz)))),
-                 (i8 (trunc (sub 64, GR32:$lz)))),
-            (BZHI64rr GR64:$src,
-              (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>;
-  def : Pat<(srl (shl (loadi64 addr:$src), (i8 (trunc (sub 64, GR32:$lz)))),
-                 (i8 (trunc (sub 64, GR32:$lz)))),
-            (BZHI64rm addr:$src,
-              (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>;
 } // HasBMI2
 
 multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC,
diff --git a/test/CodeGen/X86/extract-bits.ll b/test/CodeGen/X86/extract-bits.ll
index a7d91ede4be..4c0d62d5279 100644
--- a/test/CodeGen/X86/extract-bits.ll
+++ b/test/CodeGen/X86/extract-bits.ll
@@ -2813,14 +2813,12 @@ define i32 @bextr32_c0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_c0:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_c0:
@@ -2846,13 +2844,10 @@ define i32 @bextr32_c0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X64-BMI1NOTBM-LABEL: bextr32_c0:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X64-BMI1NOTBM-NEXT:    negl %edx
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_c0:
@@ -2882,14 +2877,12 @@ define i32 @bextr32_c1_indexzext(i32 %val, i8 %numskipbits, i8 %numlowbits) noun
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_c1_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_c1_indexzext:
@@ -2915,13 +2908,10 @@ define i32 @bextr32_c1_indexzext(i32 %val, i8 %numskipbits, i8 %numlowbits) noun
 ; X64-BMI1NOTBM-LABEL: bextr32_c1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X64-BMI1NOTBM-NEXT:    negb %dl
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_c1_indexzext:
@@ -2954,15 +2944,13 @@ define i32 @bextr32_c2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_c2_load:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl (%eax), %eax
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    movl (%edx), %edx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_c2_load:
@@ -2992,10 +2980,8 @@ define i32 @bextr32_c2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind
 ; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X64-BMI1NOTBM-NEXT:    negl %edx
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %eax, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_c2_load:
@@ -3027,15 +3013,13 @@ define i32 @bextr32_c3_load_indexzext(i32* %w, i8 %numskipbits, i8 %numlowbits)
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_c3_load_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl (%eax), %eax
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    movl (%edx), %edx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_c3_load_indexzext:
@@ -3065,10 +3049,8 @@ define i32 @bextr32_c3_load_indexzext(i32* %w, i8 %numskipbits, i8 %numlowbits)
 ; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X64-BMI1NOTBM-NEXT:    negb %dl
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %eax, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_c3_load_indexzext:
@@ -3101,14 +3083,12 @@ define i32 @bextr32_c4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits)
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_c4_commutative:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_c4_commutative:
@@ -3134,13 +3114,10 @@ define i32 @bextr32_c4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits)
 ; X64-BMI1NOTBM-LABEL: bextr32_c4_commutative:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X64-BMI1NOTBM-NEXT:    negl %edx
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_c4_commutative:
@@ -3180,16 +3157,13 @@ define i32 @bextr32_c5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    pushl %esi
 ; X86-BMI1NOTBM-NEXT:    subl $8, %esp
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl %eax, %ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %esi
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
-; X86-BMI1NOTBM-NEXT:    movl %eax, (%esp)
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %esi
+; X86-BMI1NOTBM-NEXT:    movl %ecx, (%esp)
 ; X86-BMI1NOTBM-NEXT:    calll use32
 ; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
 ; X86-BMI1NOTBM-NEXT:    addl $8, %esp
@@ -3230,13 +3204,10 @@ define i32 @bextr32_c5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X64-BMI1NOTBM-LABEL: bextr32_c5_skipextrauses:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    pushq %rbx
-; X64-BMI1NOTBM-NEXT:    movl %edi, %ebx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebx
-; X64-BMI1NOTBM-NEXT:    negl %edx
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %ebx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebx
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %ebx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %edi
 ; X64-BMI1NOTBM-NEXT:    callq use32
 ; X64-BMI1NOTBM-NEXT:    movl %ebx, %eax
@@ -3379,13 +3350,10 @@ define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X64-BMI1NOTBM-LABEL: bextr64_c0:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    movq %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    negl %edx
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_c0:
@@ -3515,14 +3483,12 @@ define i64 @bextr64_c1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun
 ;
 ; X64-BMI1NOTBM-LABEL: bextr64_c1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    # kill: def $edx killed $edx def $rdx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movq %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    negb %dl
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_c1_indexzext:
@@ -3663,10 +3629,8 @@ define i64 @bextr64_c2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    negl %edx
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rax, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_c2_load:
@@ -3800,14 +3764,13 @@ define i64 @bextr64_c3_load_indexzext(i64* %w, i8 %numskipbits, i8 %numlowbits)
 ;
 ; X64-BMI1NOTBM-LABEL: bextr64_c3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    # kill: def $edx killed $edx def $rdx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
 ; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    negb %dl
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rax, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_c3_load_indexzext:
@@ -3943,13 +3906,10 @@ define i64 @bextr64_c4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; X64-BMI1NOTBM-LABEL: bextr64_c4_commutative:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    movq %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    negl %edx
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_c4_commutative:
@@ -4129,13 +4089,10 @@ define i64 @bextr64_c5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X64-BMI1NOTBM-LABEL: bextr64_c5_skipextrauses:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    pushq %rbx
-; X64-BMI1NOTBM-NEXT:    movq %rdi, %rbx
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rbx
-; X64-BMI1NOTBM-NEXT:    negl %edx
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rbx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rbx
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rdi
 ; X64-BMI1NOTBM-NEXT:    callq use64
 ; X64-BMI1NOTBM-NEXT:    movq %rbx, %rax
@@ -4179,14 +4136,12 @@ define i32 @bextr32_d0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_d0:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_d0:
@@ -4212,13 +4167,10 @@ define i32 @bextr32_d0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X64-BMI1NOTBM-LABEL: bextr32_d0:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X64-BMI1NOTBM-NEXT:    negl %edx
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_d0:
@@ -4248,14 +4200,12 @@ define i32 @bextr32_d1_indexzext(i32 %val, i8 %numskipbits, i8 %numlowbits) noun
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_d1_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_d1_indexzext:
@@ -4281,13 +4231,10 @@ define i32 @bextr32_d1_indexzext(i32 %val, i8 %numskipbits, i8 %numlowbits) noun
 ; X64-BMI1NOTBM-LABEL: bextr32_d1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X64-BMI1NOTBM-NEXT:    negb %dl
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_d1_indexzext:
@@ -4320,15 +4267,13 @@ define i32 @bextr32_d2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_d2_load:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl (%eax), %eax
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    movl (%edx), %edx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_d2_load:
@@ -4358,10 +4303,8 @@ define i32 @bextr32_d2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind
 ; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X64-BMI1NOTBM-NEXT:    negl %edx
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %eax, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_d2_load:
@@ -4393,15 +4336,13 @@ define i32 @bextr32_d3_load_indexzext(i32* %w, i8 %numskipbits, i8 %numlowbits)
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_d3_load_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl (%eax), %eax
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    movl (%edx), %edx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_d3_load_indexzext:
@@ -4431,10 +4372,8 @@ define i32 @bextr32_d3_load_indexzext(i32* %w, i8 %numskipbits, i8 %numlowbits)
 ; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X64-BMI1NOTBM-NEXT:    negb %dl
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %eax, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_d3_load_indexzext:
@@ -4477,16 +4416,13 @@ define i32 @bextr32_d5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    pushl %esi
 ; X86-BMI1NOTBM-NEXT:    subl $8, %esp
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl %eax, %ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %esi
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
-; X86-BMI1NOTBM-NEXT:    movl %eax, (%esp)
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %esi
+; X86-BMI1NOTBM-NEXT:    movl %ecx, (%esp)
 ; X86-BMI1NOTBM-NEXT:    calll use32
 ; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
 ; X86-BMI1NOTBM-NEXT:    addl $8, %esp
@@ -4527,13 +4463,10 @@ define i32 @bextr32_d5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X64-BMI1NOTBM-LABEL: bextr32_d5_skipextrauses:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    pushq %rbx
-; X64-BMI1NOTBM-NEXT:    movl %edi, %ebx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebx
-; X64-BMI1NOTBM-NEXT:    negl %edx
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %ebx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebx
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %ebx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %edi
 ; X64-BMI1NOTBM-NEXT:    callq use32
 ; X64-BMI1NOTBM-NEXT:    movl %ebx, %eax
@@ -4713,13 +4646,10 @@ define i64 @bextr64_d0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X64-BMI1NOTBM-LABEL: bextr64_d0:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    movq %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    negl %edx
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_d0:
@@ -4886,14 +4816,12 @@ define i64 @bextr64_d1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun
 ;
 ; X64-BMI1NOTBM-LABEL: bextr64_d1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    # kill: def $edx killed $edx def $rdx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movq %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    negb %dl
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_d1_indexzext:
@@ -5071,10 +4999,8 @@ define i64 @bextr64_d2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    negl %edx
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rax, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_d2_load:
@@ -5245,14 +5171,13 @@ define i64 @bextr64_d3_load_indexzext(i64* %w, i8 %numskipbits, i8 %numlowbits)
 ;
 ; X64-BMI1NOTBM-LABEL: bextr64_d3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    # kill: def $edx killed $edx def $rdx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
 ; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    negb %dl
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rax, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_d3_load_indexzext:
@@ -5466,13 +5391,10 @@ define i64 @bextr64_d5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X64-BMI1NOTBM-LABEL: bextr64_d5_skipextrauses:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    pushq %rbx
-; X64-BMI1NOTBM-NEXT:    movq %rdi, %rbx
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rbx
-; X64-BMI1NOTBM-NEXT:    negl %edx
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rbx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rbx
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rdi
 ; X64-BMI1NOTBM-NEXT:    callq use64
 ; X64-BMI1NOTBM-NEXT:    movq %rbx, %rax
diff --git a/test/CodeGen/X86/extract-lowbits.ll b/test/CodeGen/X86/extract-lowbits.ll
index eae52441dfc..f81280a5290 100644
--- a/test/CodeGen/X86/extract-lowbits.ll
+++ b/test/CodeGen/X86/extract-lowbits.ll
@@ -1442,11 +1442,8 @@ define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-LABEL: bzhi32_c0:
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_c0:
@@ -1467,12 +1464,8 @@ define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_c0:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl %edi, %eax
-; X64-BMI1NOTBM-NEXT:    negl %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %esi
+; X64-BMI1NOTBM-NEXT:    bextrl %esi, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_c0:
@@ -1498,12 +1491,9 @@ define i32 @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits) nounwind {
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi32_c1_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_c1_indexzext:
@@ -1524,12 +1514,8 @@ define i32 @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_c1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl %edi, %eax
-; X64-BMI1NOTBM-NEXT:    negb %cl
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %esi
+; X64-BMI1NOTBM-NEXT:    bextrl %esi, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_c1_indexzext:
@@ -1558,19 +1544,16 @@ define i32 @bzhi32_c2_load(i32* %w, i32 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-LABEL: bzhi32_c2_load:
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl (%eax), %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    shll $8, %ecx
+; X86-BMI1NOTBM-NEXT:    bextrl %ecx, (%eax), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_c2_load:
 ; X86-BMI1BMI2:       # %bb.0:
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1BMI2-NEXT:    bzhil %eax, (%ecx), %eax
+; X86-BMI1BMI2-NEXT:    bzhil %ecx, (%eax), %eax
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi32_c2_load:
@@ -1585,12 +1568,8 @@ define i32 @bzhi32_c2_load(i32* %w, i32 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_c2_load:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
-; X64-BMI1NOTBM-NEXT:    negl %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %esi
+; X64-BMI1NOTBM-NEXT:    bextrl %esi, (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_c2_load:
@@ -1619,12 +1598,9 @@ define i32 @bzhi32_c3_load_indexzext(i32* %w, i8 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-LABEL: bzhi32_c3_load_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl (%eax), %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1NOTBM-NEXT:    shll $8, %ecx
+; X86-BMI1NOTBM-NEXT:    bextrl %ecx, (%eax), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_c3_load_indexzext:
@@ -1646,12 +1622,8 @@ define i32 @bzhi32_c3_load_indexzext(i32* %w, i8 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_c3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
-; X64-BMI1NOTBM-NEXT:    negb %cl
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %esi
+; X64-BMI1NOTBM-NEXT:    bextrl %esi, (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_c3_load_indexzext:
@@ -1680,11 +1652,8 @@ define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-LABEL: bzhi32_c4_commutative:
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_c4_commutative:
@@ -1705,12 +1674,8 @@ define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_c4_commutative:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl %edi, %eax
-; X64-BMI1NOTBM-NEXT:    negl %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %esi
+; X64-BMI1NOTBM-NEXT:    bextrl %esi, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_c4_commutative:
@@ -1791,12 +1756,8 @@ define i64 @bzhi64_c0(i64 %val, i64 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_c0:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    movq %rdi, %rax
-; X64-BMI1NOTBM-NEXT:    negl %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
+; X64-BMI1NOTBM-NEXT:    bextrq %rsi, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_c0:
@@ -1875,12 +1836,9 @@ define i64 @bzhi64_c1_indexzext(i64 %val, i8 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_c1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movq %rdi, %rax
-; X64-BMI1NOTBM-NEXT:    negb %cl
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
+; X64-BMI1NOTBM-NEXT:    bextrq %rsi, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_c1_indexzext:
@@ -1970,12 +1928,8 @@ define i64 @bzhi64_c2_load(i64* %w, i64 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_c2_load:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
-; X64-BMI1NOTBM-NEXT:    negl %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
+; X64-BMI1NOTBM-NEXT:    bextrq %rsi, (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_c2_load:
@@ -2064,12 +2018,9 @@ define i64 @bzhi64_c3_load_indexzext(i64* %w, i8 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_c3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
-; X64-BMI1NOTBM-NEXT:    negb %cl
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
+; X64-BMI1NOTBM-NEXT:    bextrq %rsi, (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_c3_load_indexzext:
@@ -2151,12 +2102,8 @@ define i64 @bzhi64_c4_commutative(i64 %val, i64 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_c4_commutative:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    movq %rdi, %rax
-; X64-BMI1NOTBM-NEXT:    negl %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
+; X64-BMI1NOTBM-NEXT:    bextrq %rsi, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_c4_commutative:
@@ -2187,11 +2134,8 @@ define i32 @bzhi32_d0(i32 %val, i32 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-LABEL: bzhi32_d0:
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_d0:
@@ -2212,12 +2156,8 @@ define i32 @bzhi32_d0(i32 %val, i32 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_d0:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl %edi, %eax
-; X64-BMI1NOTBM-NEXT:    negl %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %esi
+; X64-BMI1NOTBM-NEXT:    bextrl %esi, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_d0:
@@ -2243,12 +2183,9 @@ define i32 @bzhi32_d1_indexzext(i32 %val, i8 %numlowbits) nounwind {
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi32_d1_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_d1_indexzext:
@@ -2269,12 +2206,8 @@ define i32 @bzhi32_d1_indexzext(i32 %val, i8 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_d1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl %edi, %eax
-; X64-BMI1NOTBM-NEXT:    negb %cl
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %esi
+; X64-BMI1NOTBM-NEXT:    bextrl %esi, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_d1_indexzext:
@@ -2303,19 +2236,16 @@ define i32 @bzhi32_d2_load(i32* %w, i32 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-LABEL: bzhi32_d2_load:
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl (%eax), %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    shll $8, %ecx
+; X86-BMI1NOTBM-NEXT:    bextrl %ecx, (%eax), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_d2_load:
 ; X86-BMI1BMI2:       # %bb.0:
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1BMI2-NEXT:    bzhil %eax, (%ecx), %eax
+; X86-BMI1BMI2-NEXT:    bzhil %ecx, (%eax), %eax
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi32_d2_load:
@@ -2330,12 +2260,8 @@ define i32 @bzhi32_d2_load(i32* %w, i32 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_d2_load:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
-; X64-BMI1NOTBM-NEXT:    negl %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %esi
+; X64-BMI1NOTBM-NEXT:    bextrl %esi, (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_d2_load:
@@ -2364,12 +2290,9 @@ define i32 @bzhi32_d3_load_indexzext(i32* %w, i8 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-LABEL: bzhi32_d3_load_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl (%eax), %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1NOTBM-NEXT:    shll $8, %ecx
+; X86-BMI1NOTBM-NEXT:    bextrl %ecx, (%eax), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_d3_load_indexzext:
@@ -2391,12 +2314,8 @@ define i32 @bzhi32_d3_load_indexzext(i32* %w, i8 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_d3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
-; X64-BMI1NOTBM-NEXT:    negb %cl
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %esi
+; X64-BMI1NOTBM-NEXT:    bextrl %esi, (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_d3_load_indexzext:
@@ -2536,12 +2455,8 @@ define i64 @bzhi64_d0(i64 %val, i64 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_d0:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    movq %rdi, %rax
-; X64-BMI1NOTBM-NEXT:    negl %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
+; X64-BMI1NOTBM-NEXT:    bextrq %rsi, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_d0:
@@ -2677,12 +2592,9 @@ define i64 @bzhi64_d1_indexzext(i64 %val, i8 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_d1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movq %rdi, %rax
-; X64-BMI1NOTBM-NEXT:    negb %cl
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
+; X64-BMI1NOTBM-NEXT:    bextrq %rsi, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_d1_indexzext:
@@ -2823,12 +2735,8 @@ define i64 @bzhi64_d2_load(i64* %w, i64 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_d2_load:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
-; X64-BMI1NOTBM-NEXT:    negl %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
+; X64-BMI1NOTBM-NEXT:    bextrq %rsi, (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_d2_load:
@@ -2968,12 +2876,9 @@ define i64 @bzhi64_d3_load_indexzext(i64* %w, i8 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_d3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
-; X64-BMI1NOTBM-NEXT:    negb %cl
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
+; X64-BMI1NOTBM-NEXT:    bextrq %rsi, (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_d3_load_indexzext:
-- 
GitLab


From befe74f5035b154f1a4da10db1fbe1f1b3636cab Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 23 Oct 2018 09:42:10 +0000
Subject: [PATCH 0431/1116] [TTI] Add generic cost handling of SK_Reverse
 shuffles

These can be treated as a general permute.

This required a fix for missing reverse patterns on ARM

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345015 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/BasicTTIImpl.h       | 1 +
 lib/Target/ARM/ARMTargetTransformInfo.cpp | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/include/llvm/CodeGen/BasicTTIImpl.h b/include/llvm/CodeGen/BasicTTIImpl.h
index e740fe57172..18c9a61d19b 100644
--- a/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/include/llvm/CodeGen/BasicTTIImpl.h
@@ -555,6 +555,7 @@ public:
                           Type *SubTp) {
     switch (Kind) {
     case TTI::SK_Select:
+    case TTI::SK_Reverse:
     case TTI::SK_Transpose:
     case TTI::SK_PermuteSingleSrc:
     case TTI::SK_PermuteTwoSrc:
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 90e0cd96682..39a72f0edeb 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -412,6 +412,8 @@ int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
         {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
         {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
         {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v8i8,  1},
 
         {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
         {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
-- 
GitLab


From 54030c22de0460fd084e4d8f8d15ffb34a1919ad Mon Sep 17 00:00:00 2001
From: Roman Lebedev <lebedev.ri@gmail.com>
Date: Tue, 23 Oct 2018 10:34:57 +0000
Subject: [PATCH 0432/1116] Revert "[X86][BMI1] X86DAGToDAGISel: select BEXTR
 from x << (32 - y) >> (32 - y) pattern"

*Seems* to be breaking sanitizer-x86_64-linux-fast buildbot,
the ELF/relocatable-versioned.s test:

==17758==MemorySanitizer CHECK failed: /b/sanitizer-x86_64-linux-fast/build/llvm/projects/compiler-rt/lib/sanitizer_common/sanitizer_allocator.cc:191 "((kBlockMagic)) == ((((u64*)addr)[0]))" (0x6a6cb03abcebc041, 0x0)
    #0 0x59716b in MsanCheckFailed(char const*, int, char const*, unsigned long long, unsigned long long) /b/sanitizer-x86_64-linux-fast/build/llvm/projects/compiler-rt/lib/msan/msan.cc:393
    #1 0x586635 in __sanitizer::CheckFailed(char const*, int, char const*, unsigned long long, unsigned long long) /b/sanitizer-x86_64-linux-fast/build/llvm/projects/compiler-rt/lib/sanitizer_common/sanitizer_termination.cc:79
    #2 0x57d5ff in __sanitizer::InternalFree(void*, __sanitizer::SizeClassAllocatorLocalCache<__sanitizer::SizeClassAllocator32<__sanitizer::AP32> >*) /b/sanitizer-x86_64-linux-fast/build/llvm/projects/compiler-rt/lib/sanitizer_common/sanitizer_allocator.cc:191
    #3 0x7fc21b24193f  (/lib/x86_64-linux-gnu/libc.so.6+0x3593f)
    #4 0x7fc21b241999 in exit (/lib/x86_64-linux-gnu/libc.so.6+0x35999)
    #5 0x7fc21b22c2e7 in __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x202e7)
    #6 0x57c039 in _start (/b/sanitizer-x86_64-linux-fast/build/llvm_build_msan/bin/lld+0x57c039)

This reverts commit r345014.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345017 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelDAGToDAG.cpp  |  91 ++------
 lib/Target/X86/X86InstrInfo.td      |  26 +++
 test/CodeGen/X86/extract-bits.ll    | 328 +++++++++++++++++-----------
 test/CodeGen/X86/extract-lowbits.ll | 227 +++++++++++++------
 4 files changed, 414 insertions(+), 258 deletions(-)

diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 288d87ae887..d3aa5c89adc 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -2688,10 +2688,6 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
 //   c) x &  (-1 >> (32 - y))
 //   d) x << (32 - y) >> (32 - y)
 bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
-  assert(
-      (Node->getOpcode() == ISD::AND || Node->getOpcode() == ISD::SRL) &&
-      "Should be either an and-mask, or right-shift after clearing high bits.");
-
   // BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one.
   if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
     return false;
@@ -2702,16 +2698,13 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
   if (NVT != MVT::i32 && NVT != MVT::i64)
     return false;
 
-  unsigned Size = NVT.getSizeInBits();
-
   SDValue NBits;
 
   // If we have BMI2's BZHI, we are ok with muti-use patterns.
   // Else, if we only have BMI1's BEXTR, we require one-use.
   const bool CanHaveExtraUses = Subtarget->hasBMI2();
-  auto checkOneUse = [CanHaveExtraUses](SDValue Op, unsigned NUses = 1) {
-    return CanHaveExtraUses ||
-           Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());
+  auto checkOneUse = [CanHaveExtraUses](SDValue Op) {
+    return CanHaveExtraUses || Op.hasOneUse();
   };
 
   // a) x & ((1 << nbits) + (-1))
@@ -2747,73 +2740,31 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
     return true;
   };
 
-  SDValue X;
-
-  // d) x << (32 - y) >> (32 - y)
-  auto matchPatternD = [&checkOneUse, Size, &X, &NBits](SDNode *Node) -> bool {
-    if (Node->getOpcode() != ISD::SRL)
-      return false;
-    SDValue N0 = Node->getOperand(0);
-    if (N0->getOpcode() != ISD::SHL || !checkOneUse(N0))
-      return false;
-    SDValue N1 = Node->getOperand(1);
-    SDValue N01 = N0->getOperand(1);
-    // Both of the shifts must be by the exact same value.
-    // There should not be any uses of the shift amount outside of the pattern.
-    if (N1 != N01 || !checkOneUse(N1, 2))
-      return false;
-    // Skip over a truncate of the shift amount.
-    if (N1->getOpcode() == ISD::TRUNCATE) {
-      N1 = N1->getOperand(0);
-      // The trunc should have been the only user of the real shift amount.
-      if (!checkOneUse(N1))
-        return false;
-    }
-    // Match the shift amount as: (bitwidth - y). It should go away, too.
-    if (N1.getOpcode() != ISD::SUB)
-      return false;
-    auto N10 = dyn_cast<ConstantSDNode>(N1.getOperand(0));
-    if (!N10 || N10->getZExtValue() != Size)
-      return false;
-    X = N0->getOperand(0);
-    NBits = N1.getOperand(1);
-    return true;
-  };
-
   auto matchLowBitMask = [&matchPatternA,
                           &matchPatternB](SDValue Mask) -> bool {
-    // FIXME: pattern c.
+    // FIXME: patterns c, d.
     return matchPatternA(Mask) || matchPatternB(Mask);
   };
 
-  if (Node->getOpcode() == ISD::AND) {
-    X = Node->getOperand(0);
-    SDValue Mask = Node->getOperand(1);
+  SDValue X = Node->getOperand(0);
+  SDValue Mask = Node->getOperand(1);
 
-    if (matchLowBitMask(Mask)) {
-      // Great.
-    } else {
-      std::swap(X, Mask);
-      if (!matchLowBitMask(Mask))
-        return false;
-    }
-  } else if (!matchPatternD(Node))
-    return false;
+  if (matchLowBitMask(Mask)) {
+    // Great.
+  } else {
+    std::swap(X, Mask);
+    if (!matchLowBitMask(Mask))
+      return false;
+  }
 
   SDLoc DL(Node);
 
-  SDValue OrigNBits = NBits;
-  // Do we need to truncate the shift amount?
-  if (NBits.getValueType() != MVT::i8) {
-    NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
-    insertDAGNode(*CurDAG, OrigNBits, NBits);
-  }
-
   // Insert 8-bit NBits into lowest 8 bits of NVT-sized (32 or 64-bit) register.
   // All the other bits are undefined, we do not care about them.
   SDValue ImplDef =
       SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, NVT), 0);
   insertDAGNode(*CurDAG, NBits, ImplDef);
+  SDValue OrigNBits = NBits;
   NBits = CurDAG->getTargetInsertSubreg(X86::sub_8bit, DL, NVT, ImplDef, NBits);
   insertDAGNode(*CurDAG, OrigNBits, NBits);
 
@@ -3012,8 +2963,17 @@ bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
   if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
     ShiftAmt = ShiftAmt->getOperand(0);
 
-  // This function is called after X86DAGToDAGISel::matchBitExtract(),
-  // so we are not afraid that we might mess up BZHI/BEXTR pattern.
+  // Special case to avoid messing up a BZHI pattern.
+  // Look for (srl (shl X, (size - y)), (size - y)
+  if (Subtarget->hasBMI2() && (VT == MVT::i32 || VT == MVT::i64) &&
+      N->getOpcode() == ISD::SRL && N->getOperand(0).getOpcode() == ISD::SHL &&
+      // Shift amounts the same?
+      N->getOperand(1) == N->getOperand(0).getOperand(1) &&
+      // Shift amounts size - y?
+      ShiftAmt.getOpcode() == ISD::SUB &&
+      isa<ConstantSDNode>(ShiftAmt.getOperand(0)) &&
+      cast<ConstantSDNode>(ShiftAmt.getOperand(0))->getZExtValue() == Size)
+    return false;
 
   SDValue NewShiftAmt;
   if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB) {
@@ -3212,9 +3172,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
   }
 
   case ISD::SRL:
-    if (matchBitExtract(Node))
-      return;
-    LLVM_FALLTHROUGH;
   case ISD::SRA:
   case ISD::SHL:
     if (tryShiftAmountMod(Node))
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 8d3f7c856d0..a12c9e81b05 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -2519,6 +2519,14 @@ let Predicates = [HasBMI2] in {
                              (and (x86memop addr:$src),
                                   (srl -1, (sub bitwidth, GR8:$lz))),
                              RC, VT, DstInst, DstMemInst>;
+
+    // x << (bitwidth - y) >> (bitwidth - y)
+    defm : _bmi_bzhi_pattern<(srl (shl RC:$src, (sub bitwidth, GR8:$lz)),
+                                  (sub bitwidth, GR8:$lz)),
+                             (srl (shl (x86memop addr:$src),
+                                        (sub bitwidth, GR8:$lz)),
+                                  (sub bitwidth, GR8:$lz)),
+                             RC, VT, DstInst, DstMemInst>;
   }
 
   defm : bmi_bzhi_patterns<GR32, 32, i32, BZHI32rr, loadi32, BZHI32rm>;
@@ -2537,6 +2545,24 @@ let Predicates = [HasBMI2] in {
   def : Pat<(and (loadi64 addr:$src), (srl -1, (i8 (trunc (sub 64, GR32:$lz))))),
             (BZHI64rm addr:$src,
               (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>;
+
+  // x << (32 - y) >> (32 - y)
+  def : Pat<(srl (shl GR32:$src, (i8 (trunc (sub 32, GR32:$lz)))),
+                 (i8 (trunc (sub 32, GR32:$lz)))),
+            (BZHI32rr GR32:$src, GR32:$lz)>;
+  def : Pat<(srl (shl (loadi32 addr:$src), (i8 (trunc (sub 32, GR32:$lz)))),
+                 (i8 (trunc (sub 32, GR32:$lz)))),
+            (BZHI32rm addr:$src, GR32:$lz)>;
+
+  // x << (64 - y) >> (64 - y)
+  def : Pat<(srl (shl GR64:$src, (i8 (trunc (sub 64, GR32:$lz)))),
+                 (i8 (trunc (sub 64, GR32:$lz)))),
+            (BZHI64rr GR64:$src,
+              (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>;
+  def : Pat<(srl (shl (loadi64 addr:$src), (i8 (trunc (sub 64, GR32:$lz)))),
+                 (i8 (trunc (sub 64, GR32:$lz)))),
+            (BZHI64rm addr:$src,
+              (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>;
 } // HasBMI2
 
 multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC,
diff --git a/test/CodeGen/X86/extract-bits.ll b/test/CodeGen/X86/extract-bits.ll
index 4c0d62d5279..a7d91ede4be 100644
--- a/test/CodeGen/X86/extract-bits.ll
+++ b/test/CodeGen/X86/extract-bits.ll
@@ -2813,12 +2813,14 @@ define i32 @bextr32_c0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_c0:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shll $8, %eax
-; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
+; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_c0:
@@ -2844,10 +2846,13 @@ define i32 @bextr32_c0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X64-BMI1NOTBM-LABEL: bextr32_c0:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
+; X64-BMI1NOTBM-NEXT:    movl %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
-; X64-BMI1NOTBM-NEXT:    shll $8, %edx
-; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    negl %edx
+; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
+; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_c0:
@@ -2877,12 +2882,14 @@ define i32 @bextr32_c1_indexzext(i32 %val, i8 %numskipbits, i8 %numlowbits) noun
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_c1_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shll $8, %eax
-; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
+; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_c1_indexzext:
@@ -2908,10 +2915,13 @@ define i32 @bextr32_c1_indexzext(i32 %val, i8 %numskipbits, i8 %numlowbits) noun
 ; X64-BMI1NOTBM-LABEL: bextr32_c1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
+; X64-BMI1NOTBM-NEXT:    movl %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
-; X64-BMI1NOTBM-NEXT:    shll $8, %edx
-; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    negb %dl
+; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
+; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_c1_indexzext:
@@ -2944,13 +2954,15 @@ define i32 @bextr32_c2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_c2_load:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1NOTBM-NEXT:    movl (%edx), %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shll $8, %eax
-; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    movl (%eax), %eax
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
+; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_c2_load:
@@ -2980,8 +2992,10 @@ define i32 @bextr32_c2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind
 ; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X64-BMI1NOTBM-NEXT:    shll $8, %edx
-; X64-BMI1NOTBM-NEXT:    bextrl %edx, %eax, %eax
+; X64-BMI1NOTBM-NEXT:    negl %edx
+; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
+; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_c2_load:
@@ -3013,13 +3027,15 @@ define i32 @bextr32_c3_load_indexzext(i32* %w, i8 %numskipbits, i8 %numlowbits)
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_c3_load_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1NOTBM-NEXT:    movl (%edx), %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shll $8, %eax
-; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    movl (%eax), %eax
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
+; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_c3_load_indexzext:
@@ -3049,8 +3065,10 @@ define i32 @bextr32_c3_load_indexzext(i32* %w, i8 %numskipbits, i8 %numlowbits)
 ; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X64-BMI1NOTBM-NEXT:    shll $8, %edx
-; X64-BMI1NOTBM-NEXT:    bextrl %edx, %eax, %eax
+; X64-BMI1NOTBM-NEXT:    negb %dl
+; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
+; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_c3_load_indexzext:
@@ -3083,12 +3101,14 @@ define i32 @bextr32_c4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits)
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_c4_commutative:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shll $8, %eax
-; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
+; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_c4_commutative:
@@ -3114,10 +3134,13 @@ define i32 @bextr32_c4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits)
 ; X64-BMI1NOTBM-LABEL: bextr32_c4_commutative:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
+; X64-BMI1NOTBM-NEXT:    movl %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
-; X64-BMI1NOTBM-NEXT:    shll $8, %edx
-; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    negl %edx
+; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
+; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_c4_commutative:
@@ -3157,13 +3180,16 @@ define i32 @bextr32_c5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    pushl %esi
 ; X86-BMI1NOTBM-NEXT:    subl $8, %esp
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shll $8, %eax
-; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %esi
-; X86-BMI1NOTBM-NEXT:    movl %ecx, (%esp)
+; X86-BMI1NOTBM-NEXT:    movl %eax, %ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
+; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
+; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    shll %cl, %esi
+; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
+; X86-BMI1NOTBM-NEXT:    movl %eax, (%esp)
 ; X86-BMI1NOTBM-NEXT:    calll use32
 ; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
 ; X86-BMI1NOTBM-NEXT:    addl $8, %esp
@@ -3204,10 +3230,13 @@ define i32 @bextr32_c5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X64-BMI1NOTBM-LABEL: bextr32_c5_skipextrauses:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    pushq %rbx
+; X64-BMI1NOTBM-NEXT:    movl %edi, %ebx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
-; X64-BMI1NOTBM-NEXT:    shll $8, %edx
-; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %ebx
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebx
+; X64-BMI1NOTBM-NEXT:    negl %edx
+; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
+; X64-BMI1NOTBM-NEXT:    shll %cl, %ebx
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %edi
 ; X64-BMI1NOTBM-NEXT:    callq use32
 ; X64-BMI1NOTBM-NEXT:    movl %ebx, %eax
@@ -3350,10 +3379,13 @@ define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X64-BMI1NOTBM-LABEL: bextr64_c0:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
+; X64-BMI1NOTBM-NEXT:    movq %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
-; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
-; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    negl %edx
+; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
+; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_c0:
@@ -3483,12 +3515,14 @@ define i64 @bextr64_c1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun
 ;
 ; X64-BMI1NOTBM-LABEL: bextr64_c1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    # kill: def $edx killed $edx def $rdx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
+; X64-BMI1NOTBM-NEXT:    movq %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
-; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
-; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    negb %dl
+; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
+; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_c1_indexzext:
@@ -3629,8 +3663,10 @@ define i64 @bextr64_c2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
-; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rax, %rax
+; X64-BMI1NOTBM-NEXT:    negl %edx
+; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
+; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_c2_load:
@@ -3764,13 +3800,14 @@ define i64 @bextr64_c3_load_indexzext(i64* %w, i8 %numskipbits, i8 %numlowbits)
 ;
 ; X64-BMI1NOTBM-LABEL: bextr64_c3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    # kill: def $edx killed $edx def $rdx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
 ; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
-; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rax, %rax
+; X64-BMI1NOTBM-NEXT:    negb %dl
+; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
+; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_c3_load_indexzext:
@@ -3906,10 +3943,13 @@ define i64 @bextr64_c4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; X64-BMI1NOTBM-LABEL: bextr64_c4_commutative:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
+; X64-BMI1NOTBM-NEXT:    movq %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
-; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
-; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    negl %edx
+; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
+; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_c4_commutative:
@@ -4089,10 +4129,13 @@ define i64 @bextr64_c5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X64-BMI1NOTBM-LABEL: bextr64_c5_skipextrauses:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    pushq %rbx
-; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
-; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
-; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rdi, %rbx
+; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rbx
+; X64-BMI1NOTBM-NEXT:    negl %edx
+; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
+; X64-BMI1NOTBM-NEXT:    shlq %cl, %rbx
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rbx
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rdi
 ; X64-BMI1NOTBM-NEXT:    callq use64
 ; X64-BMI1NOTBM-NEXT:    movq %rbx, %rax
@@ -4136,12 +4179,14 @@ define i32 @bextr32_d0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_d0:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shll $8, %eax
-; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
+; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_d0:
@@ -4167,10 +4212,13 @@ define i32 @bextr32_d0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X64-BMI1NOTBM-LABEL: bextr32_d0:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
+; X64-BMI1NOTBM-NEXT:    movl %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
-; X64-BMI1NOTBM-NEXT:    shll $8, %edx
-; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    negl %edx
+; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
+; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_d0:
@@ -4200,12 +4248,14 @@ define i32 @bextr32_d1_indexzext(i32 %val, i8 %numskipbits, i8 %numlowbits) noun
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_d1_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shll $8, %eax
-; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
+; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_d1_indexzext:
@@ -4231,10 +4281,13 @@ define i32 @bextr32_d1_indexzext(i32 %val, i8 %numskipbits, i8 %numlowbits) noun
 ; X64-BMI1NOTBM-LABEL: bextr32_d1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
+; X64-BMI1NOTBM-NEXT:    movl %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
-; X64-BMI1NOTBM-NEXT:    shll $8, %edx
-; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    negb %dl
+; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
+; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_d1_indexzext:
@@ -4267,13 +4320,15 @@ define i32 @bextr32_d2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_d2_load:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1NOTBM-NEXT:    movl (%edx), %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shll $8, %eax
-; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    movl (%eax), %eax
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
+; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_d2_load:
@@ -4303,8 +4358,10 @@ define i32 @bextr32_d2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind
 ; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X64-BMI1NOTBM-NEXT:    shll $8, %edx
-; X64-BMI1NOTBM-NEXT:    bextrl %edx, %eax, %eax
+; X64-BMI1NOTBM-NEXT:    negl %edx
+; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
+; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_d2_load:
@@ -4336,13 +4393,15 @@ define i32 @bextr32_d3_load_indexzext(i32* %w, i8 %numskipbits, i8 %numlowbits)
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_d3_load_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1NOTBM-NEXT:    movl (%edx), %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shll $8, %eax
-; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    movl (%eax), %eax
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
+; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_d3_load_indexzext:
@@ -4372,8 +4431,10 @@ define i32 @bextr32_d3_load_indexzext(i32* %w, i8 %numskipbits, i8 %numlowbits)
 ; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X64-BMI1NOTBM-NEXT:    shll $8, %edx
-; X64-BMI1NOTBM-NEXT:    bextrl %edx, %eax, %eax
+; X64-BMI1NOTBM-NEXT:    negb %dl
+; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
+; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_d3_load_indexzext:
@@ -4416,13 +4477,16 @@ define i32 @bextr32_d5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    pushl %esi
 ; X86-BMI1NOTBM-NEXT:    subl $8, %esp
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shll $8, %eax
-; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %esi
-; X86-BMI1NOTBM-NEXT:    movl %ecx, (%esp)
+; X86-BMI1NOTBM-NEXT:    movl %eax, %ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
+; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
+; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    shll %cl, %esi
+; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
+; X86-BMI1NOTBM-NEXT:    movl %eax, (%esp)
 ; X86-BMI1NOTBM-NEXT:    calll use32
 ; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
 ; X86-BMI1NOTBM-NEXT:    addl $8, %esp
@@ -4463,10 +4527,13 @@ define i32 @bextr32_d5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X64-BMI1NOTBM-LABEL: bextr32_d5_skipextrauses:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    pushq %rbx
+; X64-BMI1NOTBM-NEXT:    movl %edi, %ebx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
-; X64-BMI1NOTBM-NEXT:    shll $8, %edx
-; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %ebx
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebx
+; X64-BMI1NOTBM-NEXT:    negl %edx
+; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
+; X64-BMI1NOTBM-NEXT:    shll %cl, %ebx
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %edi
 ; X64-BMI1NOTBM-NEXT:    callq use32
 ; X64-BMI1NOTBM-NEXT:    movl %ebx, %eax
@@ -4646,10 +4713,13 @@ define i64 @bextr64_d0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X64-BMI1NOTBM-LABEL: bextr64_d0:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
+; X64-BMI1NOTBM-NEXT:    movq %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
-; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
-; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    negl %edx
+; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
+; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_d0:
@@ -4816,12 +4886,14 @@ define i64 @bextr64_d1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun
 ;
 ; X64-BMI1NOTBM-LABEL: bextr64_d1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    # kill: def $edx killed $edx def $rdx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
+; X64-BMI1NOTBM-NEXT:    movq %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
-; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
-; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    negb %dl
+; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
+; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_d1_indexzext:
@@ -4999,8 +5071,10 @@ define i64 @bextr64_d2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
-; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rax, %rax
+; X64-BMI1NOTBM-NEXT:    negl %edx
+; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
+; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_d2_load:
@@ -5171,13 +5245,14 @@ define i64 @bextr64_d3_load_indexzext(i64* %w, i8 %numskipbits, i8 %numlowbits)
 ;
 ; X64-BMI1NOTBM-LABEL: bextr64_d3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    # kill: def $edx killed $edx def $rdx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
 ; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
-; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rax, %rax
+; X64-BMI1NOTBM-NEXT:    negb %dl
+; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
+; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_d3_load_indexzext:
@@ -5391,10 +5466,13 @@ define i64 @bextr64_d5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X64-BMI1NOTBM-LABEL: bextr64_d5_skipextrauses:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    pushq %rbx
-; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
-; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
-; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rdi, %rbx
+; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rbx
+; X64-BMI1NOTBM-NEXT:    negl %edx
+; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
+; X64-BMI1NOTBM-NEXT:    shlq %cl, %rbx
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rbx
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rdi
 ; X64-BMI1NOTBM-NEXT:    callq use64
 ; X64-BMI1NOTBM-NEXT:    movq %rbx, %rax
diff --git a/test/CodeGen/X86/extract-lowbits.ll b/test/CodeGen/X86/extract-lowbits.ll
index f81280a5290..eae52441dfc 100644
--- a/test/CodeGen/X86/extract-lowbits.ll
+++ b/test/CodeGen/X86/extract-lowbits.ll
@@ -1442,8 +1442,11 @@ define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-LABEL: bzhi32_c0:
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    shll $8, %eax
-; X86-BMI1NOTBM-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
+; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_c0:
@@ -1464,8 +1467,12 @@ define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_c0:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    shll $8, %esi
-; X64-BMI1NOTBM-NEXT:    bextrl %esi, %edi, %eax
+; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
+; X64-BMI1NOTBM-NEXT:    movl %edi, %eax
+; X64-BMI1NOTBM-NEXT:    negl %ecx
+; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_c0:
@@ -1491,9 +1498,12 @@ define i32 @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits) nounwind {
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi32_c1_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X86-BMI1NOTBM-NEXT:    shll $8, %eax
-; X86-BMI1NOTBM-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
+; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_c1_indexzext:
@@ -1514,8 +1524,12 @@ define i32 @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_c1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    shll $8, %esi
-; X64-BMI1NOTBM-NEXT:    bextrl %esi, %edi, %eax
+; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
+; X64-BMI1NOTBM-NEXT:    movl %edi, %eax
+; X64-BMI1NOTBM-NEXT:    negb %cl
+; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_c1_indexzext:
@@ -1544,16 +1558,19 @@ define i32 @bzhi32_c2_load(i32* %w, i32 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-LABEL: bzhi32_c2_load:
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shll $8, %ecx
-; X86-BMI1NOTBM-NEXT:    bextrl %ecx, (%eax), %eax
+; X86-BMI1NOTBM-NEXT:    movl (%eax), %eax
+; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
+; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_c2_load:
 ; X86-BMI1BMI2:       # %bb.0:
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1BMI2-NEXT:    bzhil %ecx, (%eax), %eax
+; X86-BMI1BMI2-NEXT:    bzhil %eax, (%ecx), %eax
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi32_c2_load:
@@ -1568,8 +1585,12 @@ define i32 @bzhi32_c2_load(i32* %w, i32 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_c2_load:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    shll $8, %esi
-; X64-BMI1NOTBM-NEXT:    bextrl %esi, (%rdi), %eax
+; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
+; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
+; X64-BMI1NOTBM-NEXT:    negl %ecx
+; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_c2_load:
@@ -1598,9 +1619,12 @@ define i32 @bzhi32_c3_load_indexzext(i32* %w, i8 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-LABEL: bzhi32_c3_load_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    shll $8, %ecx
-; X86-BMI1NOTBM-NEXT:    bextrl %ecx, (%eax), %eax
+; X86-BMI1NOTBM-NEXT:    movl (%eax), %eax
+; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
+; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_c3_load_indexzext:
@@ -1622,8 +1646,12 @@ define i32 @bzhi32_c3_load_indexzext(i32* %w, i8 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_c3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    shll $8, %esi
-; X64-BMI1NOTBM-NEXT:    bextrl %esi, (%rdi), %eax
+; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
+; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
+; X64-BMI1NOTBM-NEXT:    negb %cl
+; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_c3_load_indexzext:
@@ -1652,8 +1680,11 @@ define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-LABEL: bzhi32_c4_commutative:
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    shll $8, %eax
-; X86-BMI1NOTBM-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
+; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_c4_commutative:
@@ -1674,8 +1705,12 @@ define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_c4_commutative:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    shll $8, %esi
-; X64-BMI1NOTBM-NEXT:    bextrl %esi, %edi, %eax
+; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
+; X64-BMI1NOTBM-NEXT:    movl %edi, %eax
+; X64-BMI1NOTBM-NEXT:    negl %ecx
+; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_c4_commutative:
@@ -1756,8 +1791,12 @@ define i64 @bzhi64_c0(i64 %val, i64 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_c0:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
-; X64-BMI1NOTBM-NEXT:    bextrq %rsi, %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
+; X64-BMI1NOTBM-NEXT:    movq %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    negl %ecx
+; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_c0:
@@ -1836,9 +1875,12 @@ define i64 @bzhi64_c1_indexzext(i64 %val, i8 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_c1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    # kill: def $esi killed $esi def $rsi
-; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
-; X64-BMI1NOTBM-NEXT:    bextrq %rsi, %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
+; X64-BMI1NOTBM-NEXT:    movq %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    negb %cl
+; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_c1_indexzext:
@@ -1928,8 +1970,12 @@ define i64 @bzhi64_c2_load(i64* %w, i64 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_c2_load:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
-; X64-BMI1NOTBM-NEXT:    bextrq %rsi, (%rdi), %rax
+; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
+; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
+; X64-BMI1NOTBM-NEXT:    negl %ecx
+; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_c2_load:
@@ -2018,9 +2064,12 @@ define i64 @bzhi64_c3_load_indexzext(i64* %w, i8 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_c3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    # kill: def $esi killed $esi def $rsi
-; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
-; X64-BMI1NOTBM-NEXT:    bextrq %rsi, (%rdi), %rax
+; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
+; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
+; X64-BMI1NOTBM-NEXT:    negb %cl
+; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_c3_load_indexzext:
@@ -2102,8 +2151,12 @@ define i64 @bzhi64_c4_commutative(i64 %val, i64 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_c4_commutative:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
-; X64-BMI1NOTBM-NEXT:    bextrq %rsi, %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
+; X64-BMI1NOTBM-NEXT:    movq %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    negl %ecx
+; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_c4_commutative:
@@ -2134,8 +2187,11 @@ define i32 @bzhi32_d0(i32 %val, i32 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-LABEL: bzhi32_d0:
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    shll $8, %eax
-; X86-BMI1NOTBM-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
+; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_d0:
@@ -2156,8 +2212,12 @@ define i32 @bzhi32_d0(i32 %val, i32 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_d0:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    shll $8, %esi
-; X64-BMI1NOTBM-NEXT:    bextrl %esi, %edi, %eax
+; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
+; X64-BMI1NOTBM-NEXT:    movl %edi, %eax
+; X64-BMI1NOTBM-NEXT:    negl %ecx
+; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_d0:
@@ -2183,9 +2243,12 @@ define i32 @bzhi32_d1_indexzext(i32 %val, i8 %numlowbits) nounwind {
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi32_d1_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X86-BMI1NOTBM-NEXT:    shll $8, %eax
-; X86-BMI1NOTBM-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
+; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_d1_indexzext:
@@ -2206,8 +2269,12 @@ define i32 @bzhi32_d1_indexzext(i32 %val, i8 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_d1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    shll $8, %esi
-; X64-BMI1NOTBM-NEXT:    bextrl %esi, %edi, %eax
+; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
+; X64-BMI1NOTBM-NEXT:    movl %edi, %eax
+; X64-BMI1NOTBM-NEXT:    negb %cl
+; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_d1_indexzext:
@@ -2236,16 +2303,19 @@ define i32 @bzhi32_d2_load(i32* %w, i32 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-LABEL: bzhi32_d2_load:
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shll $8, %ecx
-; X86-BMI1NOTBM-NEXT:    bextrl %ecx, (%eax), %eax
+; X86-BMI1NOTBM-NEXT:    movl (%eax), %eax
+; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
+; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_d2_load:
 ; X86-BMI1BMI2:       # %bb.0:
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1BMI2-NEXT:    bzhil %ecx, (%eax), %eax
+; X86-BMI1BMI2-NEXT:    bzhil %eax, (%ecx), %eax
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi32_d2_load:
@@ -2260,8 +2330,12 @@ define i32 @bzhi32_d2_load(i32* %w, i32 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_d2_load:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    shll $8, %esi
-; X64-BMI1NOTBM-NEXT:    bextrl %esi, (%rdi), %eax
+; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
+; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
+; X64-BMI1NOTBM-NEXT:    negl %ecx
+; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_d2_load:
@@ -2290,9 +2364,12 @@ define i32 @bzhi32_d3_load_indexzext(i32* %w, i8 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-LABEL: bzhi32_d3_load_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    shll $8, %ecx
-; X86-BMI1NOTBM-NEXT:    bextrl %ecx, (%eax), %eax
+; X86-BMI1NOTBM-NEXT:    movl (%eax), %eax
+; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
+; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_d3_load_indexzext:
@@ -2314,8 +2391,12 @@ define i32 @bzhi32_d3_load_indexzext(i32* %w, i8 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_d3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    shll $8, %esi
-; X64-BMI1NOTBM-NEXT:    bextrl %esi, (%rdi), %eax
+; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
+; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
+; X64-BMI1NOTBM-NEXT:    negb %cl
+; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_d3_load_indexzext:
@@ -2455,8 +2536,12 @@ define i64 @bzhi64_d0(i64 %val, i64 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_d0:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
-; X64-BMI1NOTBM-NEXT:    bextrq %rsi, %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
+; X64-BMI1NOTBM-NEXT:    movq %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    negl %ecx
+; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_d0:
@@ -2592,9 +2677,12 @@ define i64 @bzhi64_d1_indexzext(i64 %val, i8 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_d1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    # kill: def $esi killed $esi def $rsi
-; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
-; X64-BMI1NOTBM-NEXT:    bextrq %rsi, %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
+; X64-BMI1NOTBM-NEXT:    movq %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    negb %cl
+; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_d1_indexzext:
@@ -2735,8 +2823,12 @@ define i64 @bzhi64_d2_load(i64* %w, i64 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_d2_load:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
-; X64-BMI1NOTBM-NEXT:    bextrq %rsi, (%rdi), %rax
+; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
+; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
+; X64-BMI1NOTBM-NEXT:    negl %ecx
+; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_d2_load:
@@ -2876,9 +2968,12 @@ define i64 @bzhi64_d3_load_indexzext(i64* %w, i8 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_d3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    # kill: def $esi killed $esi def $rsi
-; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
-; X64-BMI1NOTBM-NEXT:    bextrq %rsi, (%rdi), %rax
+; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
+; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
+; X64-BMI1NOTBM-NEXT:    negb %cl
+; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_d3_load_indexzext:
-- 
GitLab


From 477d32c48ae3fcf618e72e286804c1b2b5bbecbe Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 23 Oct 2018 11:33:38 +0000
Subject: [PATCH 0433/1116] [X86][SSE] Update raw mask shuffle decoders to
 handle UNDEF mask elts

Matches the approach taken in the constant pool shuffle decoders, and uses an UndefElts mask instead of uint64_t(-1) raw mask values, which doesn't work safely for i32/i64 shuffle mask sizes (as the -1 value is legal).

This allows us to remove the constant pool shuffle decoders from most of the getTargetShuffleMask variable shuffle cases (X86ISD::VPERMV3 will be handled in a future commit).

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345018 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/Utils/X86ShuffleDecode.cpp | 45 ++++++++++++-----
 lib/Target/X86/Utils/X86ShuffleDecode.h   | 13 ++---
 lib/Target/X86/X86ISelLowering.cpp        | 61 ++++++++---------------
 3 files changed, 59 insertions(+), 60 deletions(-)

diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
index fe567f4cece..bed940d0d0e 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -304,12 +304,12 @@ void DecodeVPERM2X128Mask(unsigned NumElts, unsigned Imm,
   }
 }
 
-void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask,
+void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
                       SmallVectorImpl<int> &ShuffleMask) {
   for (int i = 0, e = RawMask.size(); i < e; ++i) {
     uint64_t M = RawMask[i];
-    if (M == (uint64_t)SM_SentinelUndef) {
-      ShuffleMask.push_back(M);
+    if (UndefElts[i]) {
+      ShuffleMask.push_back(SM_SentinelUndef);
       continue;
     }
     // For 256/512-bit vectors the base of the shuffle is the 128-bit
@@ -336,7 +336,7 @@ void DecodeBLENDMask(unsigned NumElts, unsigned Imm,
   }
 }
 
-void DecodeVPPERMMask(ArrayRef<uint64_t> RawMask,
+void DecodeVPPERMMask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
                       SmallVectorImpl<int> &ShuffleMask) {
   assert(RawMask.size() == 16 && "Illegal VPPERM shuffle mask size");
 
@@ -354,12 +354,12 @@ void DecodeVPPERMMask(ArrayRef<uint64_t> RawMask,
   // 6 - Most significant bit of source byte replicated in all bit positions.
   // 7 - Invert most significant bit of source byte and replicate in all bit positions.
   for (int i = 0, e = RawMask.size(); i < e; ++i) {
-    uint64_t M = RawMask[i];
-    if (M == (uint64_t)SM_SentinelUndef) {
-      ShuffleMask.push_back(M);
+    if (UndefElts[i]) {
+      ShuffleMask.push_back(SM_SentinelUndef);
       continue;
     }
 
+    uint64_t M = RawMask[i];
     uint64_t PermuteOp = (M >> 5) & 0x7;
     if (PermuteOp == 4) {
       ShuffleMask.push_back(SM_SentinelZero);
@@ -490,7 +490,7 @@ void DecodeINSERTQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx,
 }
 
 void DecodeVPERMILPMask(unsigned NumElts, unsigned ScalarBits,
-                        ArrayRef<uint64_t> RawMask,
+                        ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
                         SmallVectorImpl<int> &ShuffleMask) {
   unsigned VecSize = NumElts * ScalarBits;
   unsigned NumLanes = VecSize / 128;
@@ -500,6 +500,10 @@ void DecodeVPERMILPMask(unsigned NumElts, unsigned ScalarBits,
   assert((ScalarBits == 32 || ScalarBits == 64) && "Unexpected element size");
 
   for (unsigned i = 0, e = RawMask.size(); i < e; ++i) {
+    if (UndefElts[i]) {
+      ShuffleMask.push_back(SM_SentinelUndef);
+      continue;
+    }
     uint64_t M = RawMask[i];
     M = (ScalarBits == 64 ? ((M >> 1) & 0x1) : (M & 0x3));
     unsigned LaneOffset = i & ~(NumEltsPerLane - 1);
@@ -508,7 +512,7 @@ void DecodeVPERMILPMask(unsigned NumElts, unsigned ScalarBits,
 }
 
 void DecodeVPERMIL2PMask(unsigned NumElts, unsigned ScalarBits, unsigned M2Z,
-                         ArrayRef<uint64_t> RawMask,
+                         ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
                          SmallVectorImpl<int> &ShuffleMask) {
   unsigned VecSize = NumElts * ScalarBits;
   unsigned NumLanes = VecSize / 128;
@@ -518,6 +522,11 @@ void DecodeVPERMIL2PMask(unsigned NumElts, unsigned ScalarBits, unsigned M2Z,
   assert((NumElts == RawMask.size()) && "Unexpected mask size");
 
   for (unsigned i = 0, e = RawMask.size(); i < e; ++i) {
+    if (UndefElts[i]) {
+      ShuffleMask.push_back(SM_SentinelUndef);
+      continue;
+    }
+
     // VPERMIL2 Operation.
     // Bits[3] - Match Bit.
     // Bits[2:1] - (Per Lane) PD Shuffle Mask.
@@ -548,19 +557,29 @@ void DecodeVPERMIL2PMask(unsigned NumElts, unsigned ScalarBits, unsigned M2Z,
   }
 }
 
-void DecodeVPERMVMask(ArrayRef<uint64_t> RawMask,
+void DecodeVPERMVMask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
                       SmallVectorImpl<int> &ShuffleMask) {
   uint64_t EltMaskSize = RawMask.size() - 1;
-  for (auto M : RawMask) {
+  for (int i = 0, e = RawMask.size(); i != e; ++i) {
+    if (UndefElts[i]) {
+      ShuffleMask.push_back(SM_SentinelUndef);
+      continue;
+    }
+    uint64_t M = RawMask[i];
     M &= EltMaskSize;
     ShuffleMask.push_back((int)M);
   }
 }
 
-void DecodeVPERMV3Mask(ArrayRef<uint64_t> RawMask,
+void DecodeVPERMV3Mask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
                       SmallVectorImpl<int> &ShuffleMask) {
   uint64_t EltMaskSize = (RawMask.size() * 2) - 1;
-  for (auto M : RawMask) {
+  for (int i = 0, e = RawMask.size(); i != e; ++i) {
+    if (UndefElts[i]) {
+      ShuffleMask.push_back(SM_SentinelUndef);
+      continue;
+    }
+    uint64_t M = RawMask[i];
     M &= EltMaskSize;
     ShuffleMask.push_back((int)M);
   }
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h
index 6d13bd58a12..85cde14a324 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.h
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.h
@@ -15,6 +15,7 @@
 #ifndef LLVM_LIB_TARGET_X86_UTILS_X86SHUFFLEDECODE_H
 #define LLVM_LIB_TARGET_X86_UTILS_X86SHUFFLEDECODE_H
 
+#include "llvm/ADT/APInt.h"
 #include "llvm/ADT/SmallVector.h"
 
 //===----------------------------------------------------------------------===//
@@ -108,7 +109,7 @@ void DecodeSubVectorBroadcast(unsigned DstNumElts, unsigned SrcNumElts,
 
 /// Decode a PSHUFB mask from a raw array of constants such as from
 /// BUILD_VECTOR.
-void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask,
+void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
                       SmallVectorImpl<int> &ShuffleMask);
 
 /// Decode a BLEND immediate mask into a shuffle mask.
@@ -131,7 +132,7 @@ void DecodeVPERMMask(unsigned NumElts, unsigned Imm,
 /// BUILD_VECTOR.
 /// This can only basic masks (permutes + zeros), not any of the other
 /// operations that VPPERM can perform.
-void DecodeVPPERMMask(ArrayRef<uint64_t> RawMask,
+void DecodeVPPERMMask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
                       SmallVectorImpl<int> &ShuffleMask);
 
 /// Decode a zero extension instruction as a shuffle mask.
@@ -156,20 +157,20 @@ void DecodeINSERTQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx,
 
 /// Decode a VPERMILPD/VPERMILPS variable mask from a raw array of constants.
 void DecodeVPERMILPMask(unsigned NumElts, unsigned ScalarBits,
-                        ArrayRef<uint64_t> RawMask,
+                        ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
                         SmallVectorImpl<int> &ShuffleMask);
 
 /// Decode a VPERMIL2PD/VPERMIL2PS variable mask from a raw array of constants.
 void DecodeVPERMIL2PMask(unsigned NumElts, unsigned ScalarBits, unsigned M2Z,
-                         ArrayRef<uint64_t> RawMask,
+                         ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
                          SmallVectorImpl<int> &ShuffleMask);
 
 /// Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants.
-void DecodeVPERMVMask(ArrayRef<uint64_t> RawMask,
+void DecodeVPERMVMask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
                       SmallVectorImpl<int> &ShuffleMask);
 
 /// Decode a VPERMT2 W/D/Q/PS/PD mask from a raw array of constants.
-void DecodeVPERMV3Mask(ArrayRef<uint64_t> RawMask,
+void DecodeVPERMV3Mask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
                       SmallVectorImpl<int> &ShuffleMask);
 } // llvm namespace
 
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 8ba6c9ee018..cc867070398 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -5825,14 +5825,12 @@ static bool isConstantSplat(SDValue Op, APInt &SplatVal) {
 
 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
                                         unsigned MaskEltSizeInBits,
-                                        SmallVectorImpl<uint64_t> &RawMask) {
-  APInt UndefElts;
-  SmallVector<APInt, 64> EltBits;
-
+                                        SmallVectorImpl<uint64_t> &RawMask,
+                                        APInt &UndefElts) {
   // Extract the raw target constant bits.
-  // FIXME: We currently don't support UNDEF bits or mask entries.
+  SmallVector<APInt, 64> EltBits;
   if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
-                                     EltBits, /* AllowWholeUndefs */ false,
+                                     EltBits, /* AllowWholeUndefs */ true,
                                      /* AllowPartialUndefs */ false))
     return false;
 
@@ -5873,6 +5871,8 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
   unsigned NumElems = VT.getVectorNumElements();
   unsigned MaskEltSize = VT.getScalarSizeInBits();
+  SmallVector<uint64_t, 32> RawMask;
+  APInt RawUndefs;
   SDValue ImmN;
 
   assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
@@ -6025,13 +6025,9 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
     IsUnary = true;
     SDValue MaskNode = N->getOperand(1);
-    SmallVector<uint64_t, 32> RawMask;
-    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
-      DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, Mask);
-      break;
-    }
-    if (auto *C = getTargetConstantFromNode(MaskNode)) {
-      DecodeVPERMILPMask(C, MaskEltSize, VT.getSizeInBits(), Mask);
+    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
+                                    RawUndefs)) {
+      DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
       break;
     }
     return false;
@@ -6042,13 +6038,8 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
     IsUnary = true;
     SDValue MaskNode = N->getOperand(1);
-    SmallVector<uint64_t, 32> RawMask;
-    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
-      DecodePSHUFBMask(RawMask, Mask);
-      break;
-    }
-    if (auto *C = getTargetConstantFromNode(MaskNode)) {
-      DecodePSHUFBMask(C, VT.getSizeInBits(), Mask);
+    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
+      DecodePSHUFBMask(RawMask, RawUndefs, Mask);
       break;
     }
     return false;
@@ -6104,13 +6095,10 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
     SDValue CtrlNode = N->getOperand(3);
     if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
       unsigned CtrlImm = CtrlOp->getZExtValue();
-      SmallVector<uint64_t, 32> RawMask;
-      if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
-        DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, Mask);
-        break;
-      }
-      if (auto *C = getTargetConstantFromNode(MaskNode)) {
-        DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, VT.getSizeInBits(), Mask);
+      if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
+                                      RawUndefs)) {
+        DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
+                            Mask);
         break;
       }
     }
@@ -6121,13 +6109,8 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
     SDValue MaskNode = N->getOperand(2);
-    SmallVector<uint64_t, 32> RawMask;
-    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
-      DecodeVPPERMMask(RawMask, Mask);
-      break;
-    }
-    if (auto *C = getTargetConstantFromNode(MaskNode)) {
-      DecodeVPPERMMask(C, VT.getSizeInBits(), Mask);
+    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
+      DecodeVPPERMMask(RawMask, RawUndefs, Mask);
       break;
     }
     return false;
@@ -6138,13 +6121,9 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
     // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
     Ops.push_back(N->getOperand(1));
     SDValue MaskNode = N->getOperand(0);
-    SmallVector<uint64_t, 32> RawMask;
-    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
-      DecodeVPERMVMask(RawMask, Mask);
-      break;
-    }
-    if (auto *C = getTargetConstantFromNode(MaskNode)) {
-      DecodeVPERMVMask(C, MaskEltSize, VT.getSizeInBits(), Mask);
+    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
+                                    RawUndefs)) {
+      DecodeVPERMVMask(RawMask, RawUndefs, Mask);
       break;
     }
     return false;
-- 
GitLab


From ee6a84b4fee5ae1c731ce767d5cab19a161bf0e9 Mon Sep 17 00:00:00 2001
From: Greg Bedwell <greg_bedwell@sn.scee.net>
Date: Tue, 23 Oct 2018 11:34:04 +0000
Subject: [PATCH 0434/1116] [lit] Only return a found bash executable on
 Windows if it can understand Windows paths

Some versions of bash.exe, for example WSL's version expect paths in the form
/mnt/c/path/to/dir rather than c:\\path\\to\\dir so will cause failures
for any tests that require an external shell if used by lit.  If we're on
Windows and looking for an external shell, check that the found version
of bash is able to parse a native path before returning that version.

This patch also partially reverts the behaviour of r228221 by
restoring the warning if bash cannot be found.  This shouldn't pollute
the lit stderr anymore as we're now using internal shell by default on
Windows.  If someone is explicitly specifying to use an external shell, it's
probably worth alerting them to the fact that bash could not be found.

Differential Revision: https://reviews.llvm.org/D52831

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345019 91177308-0d34-0410-b5e6-96231b3b80d8
---
 utils/lit/lit/LitConfig.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/utils/lit/lit/LitConfig.py b/utils/lit/lit/LitConfig.py
index e8fb1533a86..97c09108581 100644
--- a/utils/lit/lit/LitConfig.py
+++ b/utils/lit/lit/LitConfig.py
@@ -120,6 +120,22 @@ class LitConfig(object):
         if self.bashPath is None:
             self.bashPath = ''
 
+        # Check whether the found version of bash is able to cope with paths in
+        # the host path format. If not, don't return it as it can't be used to
+        # run scripts. For example, WSL's bash.exe requires '/mnt/c/foo' rather
+        # than 'C:\\foo' or 'C:/foo'.
+        if self.isWindows and self.bashPath:
+            command = [self.bashPath, '-c',
+                       '[[ -f "%s" ]]' % self.bashPath.replace('\\', '\\\\')]
+            _, _, exitCode = lit.util.executeCommand(command)
+            if exitCode:
+                self.note('bash command failed: %s' % (
+                    ' '.join('"%s"' % c for c in command)))
+                self.bashPath = ''
+
+        if not self.bashPath:
+            self.warning('Unable to find a usable version of bash.')
+
         return self.bashPath
 
     def getToolsPath(self, dir, paths, tools):
-- 
GitLab


From e7a92994d433d5e0ce7823c63b174a6a1e5ce21b Mon Sep 17 00:00:00 2001
From: Dorit Nuzman <dorit.nuzman@intel.com>
Date: Tue, 23 Oct 2018 11:51:55 +0000
Subject: [PATCH 0435/1116] Leftover bits from https://reviews.llvm.org/D53420
 that were accidentally left out of revision 344883

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345021 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/VectorUtils.h        | 6 +++---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 3 +--
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/include/llvm/Analysis/VectorUtils.h b/include/llvm/Analysis/VectorUtils.h
index 2f562ec3d30..85d1a01e315 100644
--- a/include/llvm/Analysis/VectorUtils.h
+++ b/include/llvm/Analysis/VectorUtils.h
@@ -310,9 +310,9 @@ public:
 
   /// Returns true if this Group requires a scalar iteration to handle gaps.
   bool requiresScalarEpilogue() const {
-    // If Group has no gaps, or has gaps but the last member exists, then a
-    // scalar epilog is not needed for this group.
-    if (getNumMembers() == getFactor() || getMember(getFactor() - 1))
+    // If the last member of the Group exists, then a scalar epilog is not
+    // needed for this group.
+    if (getMember(getFactor() - 1))
       return false;
 
     // We have a group with gaps. It therefore cannot be a group of stores,
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index daaa1e27c8e..ab0e72960dd 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4600,8 +4600,7 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
   }
 
   // Record that scalar epilogue is not allowed.
-  LLVM_DEBUG(dbgs() << "LV: Not inserting scalar epilogue for access with gaps "
-                       "due to -Os/-Oz.\n");
+  LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
 
   // We don't create an epilogue when optimizing for size.
   // Invalidate interleave groups that require an epilogue.
-- 
GitLab


From f3f126519d70d3d175045737f302e71453282e97 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 23 Oct 2018 13:00:22 +0000
Subject: [PATCH 0436/1116] Add BROADCAST shuffle cost tests.

Part of a lot of cleanup necessary before PR39368.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345023 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Analysis/CostModel/ARM/shuffle.ll | 33 ++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/test/Analysis/CostModel/ARM/shuffle.ll b/test/Analysis/CostModel/ARM/shuffle.ll
index 7ad9b0286ec..a6a42352564 100644
--- a/test/Analysis/CostModel/ARM/shuffle.ll
+++ b/test/Analysis/CostModel/ARM/shuffle.ll
@@ -3,6 +3,39 @@
 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
 target triple = "thumbv7-apple-ios6.0.0"
 
+define void @broadcast() {
+; CHECK-LABEL: 'broadcast'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v7 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v9 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v10 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v11 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v12 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v13 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v14 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v15 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v17 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v7 = shufflevector <2 x i8> undef, <2 x i8>undef, <2 x i32> zeroinitializer
+  %v8 = shufflevector <4 x i8> undef, <4 x i8>undef, <4 x i32> zeroinitializer
+  %v9 = shufflevector <8 x i8> undef, <8 x i8>undef, <8 x i32> zeroinitializer
+  %v10 = shufflevector <16 x i8> undef, <16 x i8>undef, <16 x i32> zeroinitializer
+
+  %v11 = shufflevector <2 x i16> undef, <2 x i16>undef, <2 x i32> zeroinitializer
+  %v12 = shufflevector <4 x i16> undef, <4 x i16>undef, <4 x i32> zeroinitializer
+  %v13 = shufflevector <8 x i16> undef, <8 x i16>undef, <8 x i32> zeroinitializer
+
+  %v14 = shufflevector <2 x i32> undef, <2 x i32>undef, <2 x i32> zeroinitializer
+  %v15 = shufflevector <4 x i32> undef, <4 x i32>undef, <4 x i32> zeroinitializer
+
+  %v16 = shufflevector <2 x float> undef, <2 x float>undef, <2 x i32> zeroinitializer
+  %v17 = shufflevector <4 x float> undef, <4 x float>undef, <4 x i32> zeroinitializer
+
+  ret void
+}
+
 ;; Reverse shuffles should be lowered to vrev and possibly a vext (for quadwords)
 define void @reverse() {
 ; CHECK-LABEL: 'reverse'
-- 
GitLab


From bd0ecc17b7973c223ee95a0a328b02767db32ba4 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 23 Oct 2018 13:14:54 +0000
Subject: [PATCH 0437/1116] Add BROADCAST shuffle cost tests.

Part of a lot of cleanup necessary before PR39368.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345025 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../CostModel/AArch64/shuffle-broadcast.ll    | 35 +++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100644 test/Analysis/CostModel/AArch64/shuffle-broadcast.ll

diff --git a/test/Analysis/CostModel/AArch64/shuffle-broadcast.ll b/test/Analysis/CostModel/AArch64/shuffle-broadcast.ll
new file mode 100644
index 00000000000..355ed520575
--- /dev/null
+++ b/test/Analysis/CostModel/AArch64/shuffle-broadcast.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -mtriple=aarch64--linux-gnu -cost-model -analyze | FileCheck %s
+
+define void @broadcast() {
+; CHECK-LABEL: 'broadcast'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v7 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v9 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v10 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v11 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v12 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v13 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v14 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v15 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v17 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v7 = shufflevector <2 x i8> undef, <2 x i8>undef, <2 x i32> zeroinitializer
+  %v8 = shufflevector <4 x i8> undef, <4 x i8>undef, <4 x i32> zeroinitializer
+  %v9 = shufflevector <8 x i8> undef, <8 x i8>undef, <8 x i32> zeroinitializer
+  %v10 = shufflevector <16 x i8> undef, <16 x i8>undef, <16 x i32> zeroinitializer
+
+  %v11 = shufflevector <2 x i16> undef, <2 x i16>undef, <2 x i32> zeroinitializer
+  %v12 = shufflevector <4 x i16> undef, <4 x i16>undef, <4 x i32> zeroinitializer
+  %v13 = shufflevector <8 x i16> undef, <8 x i16>undef, <8 x i32> zeroinitializer
+
+  %v14 = shufflevector <2 x i32> undef, <2 x i32>undef, <2 x i32> zeroinitializer
+  %v15 = shufflevector <4 x i32> undef, <4 x i32>undef, <4 x i32> zeroinitializer
+
+  %v16 = shufflevector <2 x float> undef, <2 x float>undef, <2 x i32> zeroinitializer
+  %v17 = shufflevector <4 x float> undef, <4 x float>undef, <4 x i32> zeroinitializer
+
+  ret void
+}
-- 
GitLab


From 778349e2df2b2102a60d1a5c88f6f4f83328e7c6 Mon Sep 17 00:00:00 2001
From: Roman Lebedev <lebedev.ri@gmail.com>
Date: Tue, 23 Oct 2018 13:19:31 +0000
Subject: [PATCH 0438/1116] Experimental re-land of [X86][BMI1]
 X86DAGToDAGISel: select BEXTR from x << (32 - y) >> (32 - y) pattern

This initially landed in rL345014, but was reverted in rL345017
due to sanitizer-x86_64-linux-fast buildbot failure in
check-lld (ELF/relocatable-versioned.s) test.

While i'm not yet quite sure what is the problem, one obvious
thing here is that extra truncation roundtrip.
Maybe that's it? If not, will re-revert.

Differential Revision: https://reviews.llvm.org/D53521

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345027 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelDAGToDAG.cpp  | 104 ++++++---
 lib/Target/X86/X86InstrInfo.td      |  26 ---
 test/CodeGen/X86/extract-bits.ll    | 328 +++++++++++-----------------
 test/CodeGen/X86/extract-lowbits.ll | 223 ++++++-------------
 4 files changed, 263 insertions(+), 418 deletions(-)

diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index d3aa5c89adc..73abdd80dc6 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -2688,6 +2688,10 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
 //   c) x &  (-1 >> (32 - y))
 //   d) x << (32 - y) >> (32 - y)
 bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
+  assert(
+      (Node->getOpcode() == ISD::AND || Node->getOpcode() == ISD::SRL) &&
+      "Should be either an and-mask, or right-shift after clearing high bits.");
+
   // BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one.
   if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
     return false;
@@ -2698,13 +2702,16 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
   if (NVT != MVT::i32 && NVT != MVT::i64)
     return false;
 
+  unsigned Size = NVT.getSizeInBits();
+
   SDValue NBits;
 
   // If we have BMI2's BZHI, we are ok with muti-use patterns.
   // Else, if we only have BMI1's BEXTR, we require one-use.
   const bool CanHaveExtraUses = Subtarget->hasBMI2();
-  auto checkOneUse = [CanHaveExtraUses](SDValue Op) {
-    return CanHaveExtraUses || Op.hasOneUse();
+  auto checkOneUse = [CanHaveExtraUses](SDValue Op, unsigned NUses = 1) {
+    return CanHaveExtraUses ||
+           Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());
   };
 
   // a) x & ((1 << nbits) + (-1))
@@ -2740,33 +2747,76 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
     return true;
   };
 
+  SDValue X;
+
+  // d) x << (32 - y) >> (32 - y)
+  auto matchPatternD = [&checkOneUse, Size, &X, &NBits](SDNode *Node) -> bool {
+    if (Node->getOpcode() != ISD::SRL)
+      return false;
+    SDValue N0 = Node->getOperand(0);
+    if (N0->getOpcode() != ISD::SHL || !checkOneUse(N0))
+      return false;
+    SDValue N1 = Node->getOperand(1);
+    SDValue N01 = N0->getOperand(1);
+    // Both of the shifts must be by the exact same value.
+    // There should not be any uses of the shift amount outside of the pattern.
+    if (N1 != N01 || !checkOneUse(N1, 2))
+      return false;
+    // Skip over a truncate of the shift amount.
+    if (N1->getOpcode() == ISD::TRUNCATE) {
+      N1 = N1->getOperand(0);
+      // The trunc should have been the only user of the real shift amount.
+      if (!checkOneUse(N1))
+        return false;
+    }
+    // Match the shift amount as: (bitwidth - y). It should go away, too.
+    if (N1.getOpcode() != ISD::SUB)
+      return false;
+    auto N10 = dyn_cast<ConstantSDNode>(N1.getOperand(0));
+    if (!N10 || N10->getZExtValue() != Size)
+      return false;
+    X = N0->getOperand(0);
+    NBits = N1.getOperand(1);
+    return true;
+  };
+
   auto matchLowBitMask = [&matchPatternA,
                           &matchPatternB](SDValue Mask) -> bool {
-    // FIXME: patterns c, d.
+    // FIXME: pattern c.
     return matchPatternA(Mask) || matchPatternB(Mask);
   };
 
-  SDValue X = Node->getOperand(0);
-  SDValue Mask = Node->getOperand(1);
+  if (Node->getOpcode() == ISD::AND) {
+    X = Node->getOperand(0);
+    SDValue Mask = Node->getOperand(1);
 
-  if (matchLowBitMask(Mask)) {
-    // Great.
-  } else {
-    std::swap(X, Mask);
-    if (!matchLowBitMask(Mask))
-      return false;
-  }
+    if (matchLowBitMask(Mask)) {
+      // Great.
+    } else {
+      std::swap(X, Mask);
+      if (!matchLowBitMask(Mask))
+        return false;
+    }
+  } else if (!matchPatternD(Node))
+    return false;
 
   SDLoc DL(Node);
 
-  // Insert 8-bit NBits into lowest 8 bits of NVT-sized (32 or 64-bit) register.
-  // All the other bits are undefined, we do not care about them.
-  SDValue ImplDef =
-      SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, NVT), 0);
-  insertDAGNode(*CurDAG, NBits, ImplDef);
   SDValue OrigNBits = NBits;
-  NBits = CurDAG->getTargetInsertSubreg(X86::sub_8bit, DL, NVT, ImplDef, NBits);
-  insertDAGNode(*CurDAG, OrigNBits, NBits);
+  if (NBits.getValueType() != NVT) {
+    // Truncate the shift amount.
+    NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
+    insertDAGNode(*CurDAG, OrigNBits, NBits);
+
+    // Insert 8-bit NBits into lowest 8 bits of NVT-sized (32 or 64-bit)
+    // register. All the other bits are undefined, we do not care about them.
+    SDValue ImplDef =
+        SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, NVT), 0);
+    insertDAGNode(*CurDAG, OrigNBits, ImplDef);
+    NBits =
+        CurDAG->getTargetInsertSubreg(X86::sub_8bit, DL, NVT, ImplDef, NBits);
+    insertDAGNode(*CurDAG, OrigNBits, NBits);
+  }
 
   if (Subtarget->hasBMI2()) {
     // Great, just emit the the BZHI..
@@ -2963,17 +3013,8 @@ bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
   if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
     ShiftAmt = ShiftAmt->getOperand(0);
 
-  // Special case to avoid messing up a BZHI pattern.
-  // Look for (srl (shl X, (size - y)), (size - y)
-  if (Subtarget->hasBMI2() && (VT == MVT::i32 || VT == MVT::i64) &&
-      N->getOpcode() == ISD::SRL && N->getOperand(0).getOpcode() == ISD::SHL &&
-      // Shift amounts the same?
-      N->getOperand(1) == N->getOperand(0).getOperand(1) &&
-      // Shift amounts size - y?
-      ShiftAmt.getOpcode() == ISD::SUB &&
-      isa<ConstantSDNode>(ShiftAmt.getOperand(0)) &&
-      cast<ConstantSDNode>(ShiftAmt.getOperand(0))->getZExtValue() == Size)
-    return false;
+  // This function is called after X86DAGToDAGISel::matchBitExtract(),
+  // so we are not afraid that we might mess up BZHI/BEXTR pattern.
 
   SDValue NewShiftAmt;
   if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB) {
@@ -3172,6 +3213,9 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
   }
 
   case ISD::SRL:
+    if (matchBitExtract(Node))
+      return;
+    LLVM_FALLTHROUGH;
   case ISD::SRA:
   case ISD::SHL:
     if (tryShiftAmountMod(Node))
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index a12c9e81b05..8d3f7c856d0 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -2519,14 +2519,6 @@ let Predicates = [HasBMI2] in {
                              (and (x86memop addr:$src),
                                   (srl -1, (sub bitwidth, GR8:$lz))),
                              RC, VT, DstInst, DstMemInst>;
-
-    // x << (bitwidth - y) >> (bitwidth - y)
-    defm : _bmi_bzhi_pattern<(srl (shl RC:$src, (sub bitwidth, GR8:$lz)),
-                                  (sub bitwidth, GR8:$lz)),
-                             (srl (shl (x86memop addr:$src),
-                                        (sub bitwidth, GR8:$lz)),
-                                  (sub bitwidth, GR8:$lz)),
-                             RC, VT, DstInst, DstMemInst>;
   }
 
   defm : bmi_bzhi_patterns<GR32, 32, i32, BZHI32rr, loadi32, BZHI32rm>;
@@ -2545,24 +2537,6 @@ let Predicates = [HasBMI2] in {
   def : Pat<(and (loadi64 addr:$src), (srl -1, (i8 (trunc (sub 64, GR32:$lz))))),
             (BZHI64rm addr:$src,
               (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>;
-
-  // x << (32 - y) >> (32 - y)
-  def : Pat<(srl (shl GR32:$src, (i8 (trunc (sub 32, GR32:$lz)))),
-                 (i8 (trunc (sub 32, GR32:$lz)))),
-            (BZHI32rr GR32:$src, GR32:$lz)>;
-  def : Pat<(srl (shl (loadi32 addr:$src), (i8 (trunc (sub 32, GR32:$lz)))),
-                 (i8 (trunc (sub 32, GR32:$lz)))),
-            (BZHI32rm addr:$src, GR32:$lz)>;
-
-  // x << (64 - y) >> (64 - y)
-  def : Pat<(srl (shl GR64:$src, (i8 (trunc (sub 64, GR32:$lz)))),
-                 (i8 (trunc (sub 64, GR32:$lz)))),
-            (BZHI64rr GR64:$src,
-              (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>;
-  def : Pat<(srl (shl (loadi64 addr:$src), (i8 (trunc (sub 64, GR32:$lz)))),
-                 (i8 (trunc (sub 64, GR32:$lz)))),
-            (BZHI64rm addr:$src,
-              (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>;
 } // HasBMI2
 
 multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC,
diff --git a/test/CodeGen/X86/extract-bits.ll b/test/CodeGen/X86/extract-bits.ll
index a7d91ede4be..4c0d62d5279 100644
--- a/test/CodeGen/X86/extract-bits.ll
+++ b/test/CodeGen/X86/extract-bits.ll
@@ -2813,14 +2813,12 @@ define i32 @bextr32_c0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_c0:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_c0:
@@ -2846,13 +2844,10 @@ define i32 @bextr32_c0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X64-BMI1NOTBM-LABEL: bextr32_c0:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X64-BMI1NOTBM-NEXT:    negl %edx
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_c0:
@@ -2882,14 +2877,12 @@ define i32 @bextr32_c1_indexzext(i32 %val, i8 %numskipbits, i8 %numlowbits) noun
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_c1_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_c1_indexzext:
@@ -2915,13 +2908,10 @@ define i32 @bextr32_c1_indexzext(i32 %val, i8 %numskipbits, i8 %numlowbits) noun
 ; X64-BMI1NOTBM-LABEL: bextr32_c1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X64-BMI1NOTBM-NEXT:    negb %dl
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_c1_indexzext:
@@ -2954,15 +2944,13 @@ define i32 @bextr32_c2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_c2_load:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl (%eax), %eax
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    movl (%edx), %edx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_c2_load:
@@ -2992,10 +2980,8 @@ define i32 @bextr32_c2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind
 ; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X64-BMI1NOTBM-NEXT:    negl %edx
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %eax, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_c2_load:
@@ -3027,15 +3013,13 @@ define i32 @bextr32_c3_load_indexzext(i32* %w, i8 %numskipbits, i8 %numlowbits)
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_c3_load_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl (%eax), %eax
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    movl (%edx), %edx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_c3_load_indexzext:
@@ -3065,10 +3049,8 @@ define i32 @bextr32_c3_load_indexzext(i32* %w, i8 %numskipbits, i8 %numlowbits)
 ; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X64-BMI1NOTBM-NEXT:    negb %dl
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %eax, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_c3_load_indexzext:
@@ -3101,14 +3083,12 @@ define i32 @bextr32_c4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits)
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_c4_commutative:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_c4_commutative:
@@ -3134,13 +3114,10 @@ define i32 @bextr32_c4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits)
 ; X64-BMI1NOTBM-LABEL: bextr32_c4_commutative:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X64-BMI1NOTBM-NEXT:    negl %edx
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_c4_commutative:
@@ -3180,16 +3157,13 @@ define i32 @bextr32_c5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    pushl %esi
 ; X86-BMI1NOTBM-NEXT:    subl $8, %esp
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl %eax, %ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %esi
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
-; X86-BMI1NOTBM-NEXT:    movl %eax, (%esp)
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %esi
+; X86-BMI1NOTBM-NEXT:    movl %ecx, (%esp)
 ; X86-BMI1NOTBM-NEXT:    calll use32
 ; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
 ; X86-BMI1NOTBM-NEXT:    addl $8, %esp
@@ -3230,13 +3204,10 @@ define i32 @bextr32_c5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X64-BMI1NOTBM-LABEL: bextr32_c5_skipextrauses:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    pushq %rbx
-; X64-BMI1NOTBM-NEXT:    movl %edi, %ebx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebx
-; X64-BMI1NOTBM-NEXT:    negl %edx
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %ebx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebx
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %ebx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %edi
 ; X64-BMI1NOTBM-NEXT:    callq use32
 ; X64-BMI1NOTBM-NEXT:    movl %ebx, %eax
@@ -3379,13 +3350,10 @@ define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X64-BMI1NOTBM-LABEL: bextr64_c0:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    movq %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    negl %edx
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_c0:
@@ -3515,14 +3483,12 @@ define i64 @bextr64_c1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun
 ;
 ; X64-BMI1NOTBM-LABEL: bextr64_c1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    # kill: def $edx killed $edx def $rdx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movq %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    negb %dl
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_c1_indexzext:
@@ -3663,10 +3629,8 @@ define i64 @bextr64_c2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    negl %edx
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rax, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_c2_load:
@@ -3800,14 +3764,13 @@ define i64 @bextr64_c3_load_indexzext(i64* %w, i8 %numskipbits, i8 %numlowbits)
 ;
 ; X64-BMI1NOTBM-LABEL: bextr64_c3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    # kill: def $edx killed $edx def $rdx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
 ; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    negb %dl
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rax, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_c3_load_indexzext:
@@ -3943,13 +3906,10 @@ define i64 @bextr64_c4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; X64-BMI1NOTBM-LABEL: bextr64_c4_commutative:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    movq %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    negl %edx
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_c4_commutative:
@@ -4129,13 +4089,10 @@ define i64 @bextr64_c5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X64-BMI1NOTBM-LABEL: bextr64_c5_skipextrauses:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    pushq %rbx
-; X64-BMI1NOTBM-NEXT:    movq %rdi, %rbx
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rbx
-; X64-BMI1NOTBM-NEXT:    negl %edx
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rbx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rbx
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rdi
 ; X64-BMI1NOTBM-NEXT:    callq use64
 ; X64-BMI1NOTBM-NEXT:    movq %rbx, %rax
@@ -4179,14 +4136,12 @@ define i32 @bextr32_d0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_d0:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_d0:
@@ -4212,13 +4167,10 @@ define i32 @bextr32_d0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X64-BMI1NOTBM-LABEL: bextr32_d0:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X64-BMI1NOTBM-NEXT:    negl %edx
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_d0:
@@ -4248,14 +4200,12 @@ define i32 @bextr32_d1_indexzext(i32 %val, i8 %numskipbits, i8 %numlowbits) noun
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_d1_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_d1_indexzext:
@@ -4281,13 +4231,10 @@ define i32 @bextr32_d1_indexzext(i32 %val, i8 %numskipbits, i8 %numlowbits) noun
 ; X64-BMI1NOTBM-LABEL: bextr32_d1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X64-BMI1NOTBM-NEXT:    negb %dl
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_d1_indexzext:
@@ -4320,15 +4267,13 @@ define i32 @bextr32_d2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_d2_load:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl (%eax), %eax
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    movl (%edx), %edx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_d2_load:
@@ -4358,10 +4303,8 @@ define i32 @bextr32_d2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind
 ; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X64-BMI1NOTBM-NEXT:    negl %edx
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %eax, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_d2_load:
@@ -4393,15 +4336,13 @@ define i32 @bextr32_d3_load_indexzext(i32* %w, i8 %numskipbits, i8 %numlowbits)
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_d3_load_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl (%eax), %eax
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    movl (%edx), %edx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_d3_load_indexzext:
@@ -4431,10 +4372,8 @@ define i32 @bextr32_d3_load_indexzext(i32* %w, i8 %numskipbits, i8 %numlowbits)
 ; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X64-BMI1NOTBM-NEXT:    negb %dl
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %eax, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_d3_load_indexzext:
@@ -4477,16 +4416,13 @@ define i32 @bextr32_d5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    pushl %esi
 ; X86-BMI1NOTBM-NEXT:    subl $8, %esp
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl %eax, %ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %esi
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
-; X86-BMI1NOTBM-NEXT:    movl %eax, (%esp)
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %esi
+; X86-BMI1NOTBM-NEXT:    movl %ecx, (%esp)
 ; X86-BMI1NOTBM-NEXT:    calll use32
 ; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
 ; X86-BMI1NOTBM-NEXT:    addl $8, %esp
@@ -4527,13 +4463,10 @@ define i32 @bextr32_d5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X64-BMI1NOTBM-LABEL: bextr32_d5_skipextrauses:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    pushq %rbx
-; X64-BMI1NOTBM-NEXT:    movl %edi, %ebx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebx
-; X64-BMI1NOTBM-NEXT:    negl %edx
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %ebx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebx
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %ebx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %edi
 ; X64-BMI1NOTBM-NEXT:    callq use32
 ; X64-BMI1NOTBM-NEXT:    movl %ebx, %eax
@@ -4713,13 +4646,10 @@ define i64 @bextr64_d0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X64-BMI1NOTBM-LABEL: bextr64_d0:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    movq %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    negl %edx
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_d0:
@@ -4886,14 +4816,12 @@ define i64 @bextr64_d1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun
 ;
 ; X64-BMI1NOTBM-LABEL: bextr64_d1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    # kill: def $edx killed $edx def $rdx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movq %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    negb %dl
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_d1_indexzext:
@@ -5071,10 +4999,8 @@ define i64 @bextr64_d2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    negl %edx
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rax, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_d2_load:
@@ -5245,14 +5171,13 @@ define i64 @bextr64_d3_load_indexzext(i64* %w, i8 %numskipbits, i8 %numlowbits)
 ;
 ; X64-BMI1NOTBM-LABEL: bextr64_d3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    # kill: def $edx killed $edx def $rdx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
 ; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    negb %dl
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rax, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_d3_load_indexzext:
@@ -5466,13 +5391,10 @@ define i64 @bextr64_d5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X64-BMI1NOTBM-LABEL: bextr64_d5_skipextrauses:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    pushq %rbx
-; X64-BMI1NOTBM-NEXT:    movq %rdi, %rbx
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rbx
-; X64-BMI1NOTBM-NEXT:    negl %edx
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rbx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rbx
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rdi
 ; X64-BMI1NOTBM-NEXT:    callq use64
 ; X64-BMI1NOTBM-NEXT:    movq %rbx, %rax
diff --git a/test/CodeGen/X86/extract-lowbits.ll b/test/CodeGen/X86/extract-lowbits.ll
index eae52441dfc..59865538585 100644
--- a/test/CodeGen/X86/extract-lowbits.ll
+++ b/test/CodeGen/X86/extract-lowbits.ll
@@ -1442,11 +1442,8 @@ define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-LABEL: bzhi32_c0:
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_c0:
@@ -1467,12 +1464,8 @@ define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_c0:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl %edi, %eax
-; X64-BMI1NOTBM-NEXT:    negl %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %esi
+; X64-BMI1NOTBM-NEXT:    bextrl %esi, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_c0:
@@ -1498,12 +1491,9 @@ define i32 @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits) nounwind {
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi32_c1_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_c1_indexzext:
@@ -1524,12 +1514,8 @@ define i32 @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_c1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl %edi, %eax
-; X64-BMI1NOTBM-NEXT:    negb %cl
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %esi
+; X64-BMI1NOTBM-NEXT:    bextrl %esi, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_c1_indexzext:
@@ -1558,12 +1544,9 @@ define i32 @bzhi32_c2_load(i32* %w, i32 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-LABEL: bzhi32_c2_load:
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl (%eax), %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    shll $8, %ecx
+; X86-BMI1NOTBM-NEXT:    bextrl %ecx, (%eax), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_c2_load:
@@ -1585,12 +1568,8 @@ define i32 @bzhi32_c2_load(i32* %w, i32 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_c2_load:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
-; X64-BMI1NOTBM-NEXT:    negl %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %esi
+; X64-BMI1NOTBM-NEXT:    bextrl %esi, (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_c2_load:
@@ -1619,12 +1598,9 @@ define i32 @bzhi32_c3_load_indexzext(i32* %w, i8 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-LABEL: bzhi32_c3_load_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl (%eax), %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1NOTBM-NEXT:    shll $8, %ecx
+; X86-BMI1NOTBM-NEXT:    bextrl %ecx, (%eax), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_c3_load_indexzext:
@@ -1646,12 +1622,8 @@ define i32 @bzhi32_c3_load_indexzext(i32* %w, i8 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_c3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
-; X64-BMI1NOTBM-NEXT:    negb %cl
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %esi
+; X64-BMI1NOTBM-NEXT:    bextrl %esi, (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_c3_load_indexzext:
@@ -1680,11 +1652,8 @@ define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-LABEL: bzhi32_c4_commutative:
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_c4_commutative:
@@ -1705,12 +1674,8 @@ define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_c4_commutative:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl %edi, %eax
-; X64-BMI1NOTBM-NEXT:    negl %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %esi
+; X64-BMI1NOTBM-NEXT:    bextrl %esi, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_c4_commutative:
@@ -1791,12 +1756,8 @@ define i64 @bzhi64_c0(i64 %val, i64 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_c0:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    movq %rdi, %rax
-; X64-BMI1NOTBM-NEXT:    negl %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
+; X64-BMI1NOTBM-NEXT:    bextrq %rsi, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_c0:
@@ -1875,12 +1836,9 @@ define i64 @bzhi64_c1_indexzext(i64 %val, i8 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_c1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movq %rdi, %rax
-; X64-BMI1NOTBM-NEXT:    negb %cl
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
+; X64-BMI1NOTBM-NEXT:    bextrq %rsi, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_c1_indexzext:
@@ -1970,12 +1928,8 @@ define i64 @bzhi64_c2_load(i64* %w, i64 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_c2_load:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
-; X64-BMI1NOTBM-NEXT:    negl %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
+; X64-BMI1NOTBM-NEXT:    bextrq %rsi, (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_c2_load:
@@ -2064,12 +2018,9 @@ define i64 @bzhi64_c3_load_indexzext(i64* %w, i8 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_c3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
-; X64-BMI1NOTBM-NEXT:    negb %cl
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
+; X64-BMI1NOTBM-NEXT:    bextrq %rsi, (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_c3_load_indexzext:
@@ -2151,12 +2102,8 @@ define i64 @bzhi64_c4_commutative(i64 %val, i64 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_c4_commutative:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    movq %rdi, %rax
-; X64-BMI1NOTBM-NEXT:    negl %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
+; X64-BMI1NOTBM-NEXT:    bextrq %rsi, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_c4_commutative:
@@ -2187,11 +2134,8 @@ define i32 @bzhi32_d0(i32 %val, i32 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-LABEL: bzhi32_d0:
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_d0:
@@ -2212,12 +2156,8 @@ define i32 @bzhi32_d0(i32 %val, i32 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_d0:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl %edi, %eax
-; X64-BMI1NOTBM-NEXT:    negl %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %esi
+; X64-BMI1NOTBM-NEXT:    bextrl %esi, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_d0:
@@ -2243,12 +2183,9 @@ define i32 @bzhi32_d1_indexzext(i32 %val, i8 %numlowbits) nounwind {
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi32_d1_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_d1_indexzext:
@@ -2269,12 +2206,8 @@ define i32 @bzhi32_d1_indexzext(i32 %val, i8 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_d1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl %edi, %eax
-; X64-BMI1NOTBM-NEXT:    negb %cl
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %esi
+; X64-BMI1NOTBM-NEXT:    bextrl %esi, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_d1_indexzext:
@@ -2303,12 +2236,9 @@ define i32 @bzhi32_d2_load(i32* %w, i32 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-LABEL: bzhi32_d2_load:
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl (%eax), %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    shll $8, %ecx
+; X86-BMI1NOTBM-NEXT:    bextrl %ecx, (%eax), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_d2_load:
@@ -2330,12 +2260,8 @@ define i32 @bzhi32_d2_load(i32* %w, i32 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_d2_load:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
-; X64-BMI1NOTBM-NEXT:    negl %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %esi
+; X64-BMI1NOTBM-NEXT:    bextrl %esi, (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_d2_load:
@@ -2364,12 +2290,9 @@ define i32 @bzhi32_d3_load_indexzext(i32* %w, i8 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-LABEL: bzhi32_d3_load_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl (%eax), %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1NOTBM-NEXT:    shll $8, %ecx
+; X86-BMI1NOTBM-NEXT:    bextrl %ecx, (%eax), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_d3_load_indexzext:
@@ -2391,12 +2314,8 @@ define i32 @bzhi32_d3_load_indexzext(i32* %w, i8 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_d3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
-; X64-BMI1NOTBM-NEXT:    negb %cl
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %esi
+; X64-BMI1NOTBM-NEXT:    bextrl %esi, (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_d3_load_indexzext:
@@ -2536,12 +2455,8 @@ define i64 @bzhi64_d0(i64 %val, i64 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_d0:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    movq %rdi, %rax
-; X64-BMI1NOTBM-NEXT:    negl %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
+; X64-BMI1NOTBM-NEXT:    bextrq %rsi, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_d0:
@@ -2677,12 +2592,9 @@ define i64 @bzhi64_d1_indexzext(i64 %val, i8 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_d1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movq %rdi, %rax
-; X64-BMI1NOTBM-NEXT:    negb %cl
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
+; X64-BMI1NOTBM-NEXT:    bextrq %rsi, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_d1_indexzext:
@@ -2823,12 +2735,8 @@ define i64 @bzhi64_d2_load(i64* %w, i64 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_d2_load:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
-; X64-BMI1NOTBM-NEXT:    negl %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
+; X64-BMI1NOTBM-NEXT:    bextrq %rsi, (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_d2_load:
@@ -2968,12 +2876,9 @@ define i64 @bzhi64_d3_load_indexzext(i64* %w, i8 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_d3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
-; X64-BMI1NOTBM-NEXT:    negb %cl
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
+; X64-BMI1NOTBM-NEXT:    bextrq %rsi, (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_d3_load_indexzext:
-- 
GitLab


From 191127eb32cf31478fd7664951bc8de2a6ad630e Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 23 Oct 2018 13:39:40 +0000
Subject: [PATCH 0439/1116] [SLSR] auto-generate full test assertions; NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345028 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../StraightLineStrengthReduce/slsr-add.ll    | 131 +++++++++++-------
 1 file changed, 79 insertions(+), 52 deletions(-)

diff --git a/test/Transforms/StraightLineStrengthReduce/slsr-add.ll b/test/Transforms/StraightLineStrengthReduce/slsr-add.ll
index b4f448ace2a..c3bffb270af 100644
--- a/test/Transforms/StraightLineStrengthReduce/slsr-add.ll
+++ b/test/Transforms/StraightLineStrengthReduce/slsr-add.ll
@@ -1,51 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -slsr -gvn -S | FileCheck %s
 
 target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 
 define void @shl(i32 %b, i32 %s) {
 ; CHECK-LABEL: @shl(
-  %1 = add i32 %b, %s
-; [[BASIS:%[a-zA-Z0-9]+]] = add i32 %b, %s
-  call void @foo(i32 %1)
+; CHECK-NEXT:    [[T1:%.*]] = add i32 [[B:%.*]], [[S:%.*]]
+; CHECK-NEXT:    call void @foo(i32 [[T1]])
+; CHECK-NEXT:    [[T2:%.*]] = add i32 [[T1]], [[S]]
+; CHECK-NEXT:    call void @foo(i32 [[T2]])
+; CHECK-NEXT:    ret void
+;
+  %t1 = add i32 %b, %s
+  call void @foo(i32 %t1)
   %s2 = shl i32 %s, 1
-  %2 = add i32 %b, %s2
-; add i32 [[BASIS]], %s
-  call void @foo(i32 %2)
+  %t2 = add i32 %b, %s2
+  call void @foo(i32 %t2)
   ret void
 }
 
 define void @stride_is_2s(i32 %b, i32 %s) {
 ; CHECK-LABEL: @stride_is_2s(
+; CHECK-NEXT:    [[S2:%.*]] = shl i32 [[S:%.*]], 1
+; CHECK-NEXT:    [[T1:%.*]] = add i32 [[B:%.*]], [[S2]]
+; CHECK-NEXT:    call void @foo(i32 [[T1]])
+; CHECK-NEXT:    [[T2:%.*]] = add i32 [[T1]], [[S2]]
+; CHECK-NEXT:    call void @foo(i32 [[T2]])
+; CHECK-NEXT:    [[T3:%.*]] = add i32 [[T2]], [[S2]]
+; CHECK-NEXT:    call void @foo(i32 [[T3]])
+; CHECK-NEXT:    ret void
+;
   %s2 = shl i32 %s, 1
-; CHECK: %s2 = shl i32 %s, 1
-  %1 = add i32 %b, %s2
-; CHECK: [[t1:%[a-zA-Z0-9]+]] = add i32 %b, %s2
-  call void @foo(i32 %1)
+  %t1 = add i32 %b, %s2
+  call void @foo(i32 %t1)
   %s4 = shl i32 %s, 2
-  %2 = add i32 %b, %s4
-; CHECK: [[t2:%[a-zA-Z0-9]+]] = add i32 [[t1]], %s2
-  call void @foo(i32 %2)
+  %t2 = add i32 %b, %s4
+  call void @foo(i32 %t2)
   %s6 = mul i32 %s, 6
-  %3 = add i32 %b, %s6
-; CHECK: add i32 [[t2]], %s2
-  call void @foo(i32 %3)
+  %t3 = add i32 %b, %s6
+  call void @foo(i32 %t3)
   ret void
 }
 
 define void @stride_is_3s(i32 %b, i32 %s) {
 ; CHECK-LABEL: @stride_is_3s(
-  %1 = add i32 %s, %b
-; CHECK: [[t1:%[a-zA-Z0-9]+]] = add i32 %s, %b
-  call void @foo(i32 %1)
+; CHECK-NEXT:    [[T1:%.*]] = add i32 [[S:%.*]], [[B:%.*]]
+; CHECK-NEXT:    call void @foo(i32 [[T1]])
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[S]], 3
+; CHECK-NEXT:    [[T2:%.*]] = add i32 [[T1]], [[TMP1]]
+; CHECK-NEXT:    call void @foo(i32 [[T2]])
+; CHECK-NEXT:    [[T3:%.*]] = add i32 [[T2]], [[TMP1]]
+; CHECK-NEXT:    call void @foo(i32 [[T3]])
+; CHECK-NEXT:    ret void
+;
+  %t1 = add i32 %s, %b
+  call void @foo(i32 %t1)
   %s4 = shl i32 %s, 2
-  %2 = add i32 %s4, %b
-; CHECK: [[bump:%[a-zA-Z0-9]+]] = mul i32 %s, 3
-; CHECK: [[t2:%[a-zA-Z0-9]+]] = add i32 [[t1]], [[bump]]
-  call void @foo(i32 %2)
+  %t2 = add i32 %s4, %b
+  call void @foo(i32 %t2)
   %s7 = mul i32 %s, 7
-  %3 = add i32 %s7, %b
-; CHECK: add i32 [[t2]], [[bump]]
-  call void @foo(i32 %3)
+  %t3 = add i32 %s7, %b
+  call void @foo(i32 %t3)
   ret void
 }
 
@@ -62,22 +77,25 @@ define void @stride_is_3s(i32 %b, i32 %s) {
 ; foo(t3);
 define void @stride_is_minus_2s(i32 %b, i32 %s) {
 ; CHECK-LABEL: @stride_is_minus_2s(
+; CHECK-NEXT:    [[S6:%.*]] = mul i32 [[S:%.*]], 6
+; CHECK-NEXT:    [[T1:%.*]] = add i32 [[B:%.*]], [[S6]]
+; CHECK-NEXT:    call void @foo(i32 [[T1]])
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i32 [[S]], 1
+; CHECK-NEXT:    [[T2:%.*]] = sub i32 [[T1]], [[TMP1]]
+; CHECK-NEXT:    call void @foo(i32 [[T2]])
+; CHECK-NEXT:    [[T3:%.*]] = sub i32 [[T2]], [[TMP1]]
+; CHECK-NEXT:    call void @foo(i32 [[T3]])
+; CHECK-NEXT:    ret void
+;
   %s6 = mul i32 %s, 6
-  %1 = add i32 %b, %s6
-; CHECK: [[t1:%[a-zA-Z0-9]+]] = add i32 %b, %s6
-; CHECK: call void @foo(i32 [[t1]])
-  call void @foo(i32 %1)
+  %t1 = add i32 %b, %s6
+  call void @foo(i32 %t1)
   %s4 = shl i32 %s, 2
-  %2 = add i32 %b, %s4
-; CHECK: [[bump:%[a-zA-Z0-9]+]] = shl i32 %s, 1
-; CHECK: [[t2:%[a-zA-Z0-9]+]] = sub i32 [[t1]], [[bump]]
-  call void @foo(i32 %2)
-; CHECK: call void @foo(i32 [[t2]])
+  %t2 = add i32 %b, %s4
+  call void @foo(i32 %t2)
   %s2 = shl i32 %s, 1
-  %3 = add i32 %b, %s2
-; CHECK: [[t3:%[a-zA-Z0-9]+]] = sub i32 [[t2]], [[bump]]
-  call void @foo(i32 %3)
-; CHECK: call void @foo(i32 [[t3]])
+  %t3 = add i32 %b, %s2
+  call void @foo(i32 %t3)
   ret void
 }
 
@@ -88,27 +106,36 @@ define void @stride_is_minus_2s(i32 %b, i32 %s) {
 ; do not rewrite b + s to t - 7 * s because the latter is more complicated.
 define void @simple_enough(i32 %b, i32 %s) {
 ; CHECK-LABEL: @simple_enough(
+; CHECK-NEXT:    [[S8:%.*]] = shl i32 [[S:%.*]], 3
+; CHECK-NEXT:    [[T1:%.*]] = add i32 [[B:%.*]], [[S8]]
+; CHECK-NEXT:    call void @foo(i32 [[T1]])
+; CHECK-NEXT:    [[T2:%.*]] = add i32 [[B]], [[S]]
+; CHECK-NEXT:    call void @foo(i32 [[T2]])
+; CHECK-NEXT:    ret void
+;
   %s8 = shl i32 %s, 3
-  %1 = add i32 %b, %s8
-  call void @foo(i32 %1)
-  %2 = add i32 %b, %s
-; CHECK: [[t:%[a-zA-Z0-9]+]] = add i32 %b, %s{{$}}
-  call void @foo(i32 %2)
-; CHECK: call void @foo(i32 [[t]])
+  %t1 = add i32 %b, %s8
+  call void @foo(i32 %t1)
+  %t2 = add i32 %b, %s
+  call void @foo(i32 %t2)
   ret void
 }
 
 define void @slsr_strided_add_128bit(i128 %b, i128 %s) {
 ; CHECK-LABEL: @slsr_strided_add_128bit(
+; CHECK-NEXT:    [[S125:%.*]] = shl i128 [[S:%.*]], 125
+; CHECK-NEXT:    [[T1:%.*]] = add i128 [[B:%.*]], [[S125]]
+; CHECK-NEXT:    call void @bar(i128 [[T1]])
+; CHECK-NEXT:    [[T2:%.*]] = add i128 [[T1]], [[S125]]
+; CHECK-NEXT:    call void @bar(i128 [[T2]])
+; CHECK-NEXT:    ret void
+;
   %s125 = shl i128 %s, 125
   %s126 = shl i128 %s, 126
-  %1 = add i128 %b, %s125
-; CHECK: [[t1:%[a-zA-Z0-9]+]] = add i128 %b, %s125
-  call void @bar(i128 %1)
-  %2 = add i128 %b, %s126
-; CHECK: [[t2:%[a-zA-Z0-9]+]] = add i128 [[t1]], %s125
-  call void @bar(i128 %2)
-; CHECK: call void @bar(i128 [[t2]])
+  %t1 = add i128 %b, %s125
+  call void @bar(i128 %t1)
+  %t2 = add i128 %b, %s126
+  call void @bar(i128 %t2)
   ret void
 }
 
-- 
GitLab


From 8a6e76e6c9fb6701004cf619846aedb722a60ecf Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 23 Oct 2018 14:07:39 +0000
Subject: [PATCH 0440/1116] [SLSR] use 'match' to simplify code; NFC

This pass could probably be modified slightly to allow
vector splat transforms for practically no cost, but
it only works on scalars for now. So the use of the
newer 'match' API should make no functional difference.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345030 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../Scalar/StraightLineStrengthReduce.cpp     |  9 +++---
 .../StraightLineStrengthReduce/slsr-add.ll    | 29 +++++++++++++++++++
 2 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
index 2061db13639..b5089b006bd 100644
--- a/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -640,12 +640,12 @@ void StraightLineStrengthReduce::rewriteCandidateWithBasis(
   Value *Reduced = nullptr; // equivalent to but weaker than C.Ins
   switch (C.CandidateKind) {
   case Candidate::Add:
-  case Candidate::Mul:
+  case Candidate::Mul: {
     // C = Basis + Bump
-    if (BinaryOperator::isNeg(Bump)) {
+    Value *NegBump;
+    if (match(Bump, m_Neg(m_Value(NegBump)))) {
       // If Bump is a neg instruction, emit C = Basis - (-Bump).
-      Reduced =
-          Builder.CreateSub(Basis.Ins, BinaryOperator::getNegArgument(Bump));
+      Reduced = Builder.CreateSub(Basis.Ins, NegBump);
       // We only use the negative argument of Bump, and Bump itself may be
       // trivially dead.
       RecursivelyDeleteTriviallyDeadInstructions(Bump);
@@ -662,6 +662,7 @@ void StraightLineStrengthReduce::rewriteCandidateWithBasis(
       Reduced = Builder.CreateAdd(Basis.Ins, Bump);
     }
     break;
+  }
   case Candidate::GEP:
     {
       Type *IntPtrTy = DL->getIntPtrType(C.Ins->getType());
diff --git a/test/Transforms/StraightLineStrengthReduce/slsr-add.ll b/test/Transforms/StraightLineStrengthReduce/slsr-add.ll
index c3bffb270af..92af617dab8 100644
--- a/test/Transforms/StraightLineStrengthReduce/slsr-add.ll
+++ b/test/Transforms/StraightLineStrengthReduce/slsr-add.ll
@@ -99,6 +99,34 @@ define void @stride_is_minus_2s(i32 %b, i32 %s) {
   ret void
 }
 
+; TODO: This pass is targeted at simple address-calcs, so it is artificially limited to
+; match scalar values. The code could be modified to handle vector types too.
+
+define void @stride_is_minus_2s_vec(<2 x i32> %b, <2 x i32> %s) {
+; CHECK-LABEL: @stride_is_minus_2s_vec(
+; CHECK-NEXT:    [[S6:%.*]] = mul <2 x i32> [[S:%.*]], <i32 6, i32 6>
+; CHECK-NEXT:    [[T1:%.*]] = add <2 x i32> [[B:%.*]], [[S6]]
+; CHECK-NEXT:    call void @voo(<2 x i32> [[T1]])
+; CHECK-NEXT:    [[S4:%.*]] = shl <2 x i32> [[S]], <i32 2, i32 2>
+; CHECK-NEXT:    [[T2:%.*]] = add <2 x i32> [[B]], [[S4]]
+; CHECK-NEXT:    call void @voo(<2 x i32> [[T2]])
+; CHECK-NEXT:    [[S2:%.*]] = shl <2 x i32> [[S]], <i32 1, i32 1>
+; CHECK-NEXT:    [[T3:%.*]] = add <2 x i32> [[B]], [[S2]]
+; CHECK-NEXT:    call void @voo(<2 x i32> [[T3]])
+; CHECK-NEXT:    ret void
+;
+  %s6 = mul <2 x i32> %s, <i32 6, i32 6>
+  %t1 = add <2 x i32> %b, %s6
+  call void @voo(<2 x i32> %t1)
+  %s4 = shl <2 x i32> %s, <i32 2, i32 2>
+  %t2 = add <2 x i32> %b, %s4
+  call void @voo(<2 x i32> %t2)
+  %s2 = shl <2 x i32> %s, <i32 1, i32 1>
+  %t3 = add <2 x i32> %b, %s2
+  call void @voo(<2 x i32> %t3)
+  ret void
+}
+
 ; t = b + (s << 3);
 ; foo(t);
 ; foo(b + s);
@@ -140,4 +168,5 @@ define void @slsr_strided_add_128bit(i128 %b, i128 %s) {
 }
 
 declare void @foo(i32)
+declare void @voo(<2 x i32>)
 declare void @bar(i128)
-- 
GitLab


From d07f87e01a012436a07fad6eb8d7c1ffef8aa650 Mon Sep 17 00:00:00 2001
From: Aleksandr Urakov <aleksandr.urakov@jetbrains.com>
Date: Tue, 23 Oct 2018 14:27:45 +0000
Subject: [PATCH 0441/1116] Revert "[MachinePipeliner] Split MachinePipeliner
 code into header and cpp files"

This reverts commit 40760b733d9eef841c897338af5e9d81b12551bf.
It seems that the commit is a cuse of the build failure.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345032 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/MachinePipeliner.h | 614 ------------------------
 lib/CodeGen/MachinePipeliner.cpp        | 605 ++++++++++++++++++++++-
 2 files changed, 598 insertions(+), 621 deletions(-)
 delete mode 100644 include/llvm/CodeGen/MachinePipeliner.h

diff --git a/include/llvm/CodeGen/MachinePipeliner.h b/include/llvm/CodeGen/MachinePipeliner.h
deleted file mode 100644
index 2eb63adc7a3..00000000000
--- a/include/llvm/CodeGen/MachinePipeliner.h
+++ /dev/null
@@ -1,614 +0,0 @@
-//===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
-//
-// Software pipelining (SWP) is an instruction scheduling technique for loops
-// that overlap loop iterations and exploits ILP via a compiler transformation.
-//
-// Swing Modulo Scheduling is an implementation of software pipelining
-// that generates schedules that are near optimal in terms of initiation
-// interval, register requirements, and stage count. See the papers:
-//
-// "Swing Modulo Scheduling: A Lifetime-Sensitive Approach", by J. Llosa,
-// A. Gonzalez, E. Ayguade, and M. Valero. In PACT '96 Proceedings of the 1996
-// Conference on Parallel Architectures and Compilation Techiniques.
-//
-// "Lifetime-Sensitive Modulo Scheduling in a Production Environment", by J.
-// Llosa, E. Ayguade, A. Gonzalez, M. Valero, and J. Eckhardt. In IEEE
-// Transactions on Computers, Vol. 50, No. 3, 2001.
-//
-// "An Implementation of Swing Modulo Scheduling With Extensions for
-// Superblocks", by T. Lattner, Master's Thesis, University of Illinois at
-// Urbana-Chambpain, 2005.
-//
-//
-// The SMS algorithm consists of three main steps after computing the minimal
-// initiation interval (MII).
-// 1) Analyze the dependence graph and compute information about each
-//    instruction in the graph.
-// 2) Order the nodes (instructions) by priority based upon the heuristics
-//    described in the algorithm.
-// 3) Attempt to schedule the nodes in the specified order using the MII.
-//
-//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_CODEGEN_MACHINEPIPELINER_H
-#define LLVM_LIB_CODEGEN_MACHINEPIPELINER_H
-
-#include "llvm/CodeGen/RegisterClassInfo.h"
-#include "llvm/CodeGen/ScheduleDAGInstrs.h"
-#include "llvm/CodeGen/TargetInstrInfo.h"
-
-namespace llvm {
-
-class NodeSet;
-class SMSchedule;
-
-extern cl::opt<bool> SwpEnableCopyToPhi;
-
-/// The main class in the implementation of the target independent
-/// software pipeliner pass.
-class MachinePipeliner : public MachineFunctionPass {
-public:
-  MachineFunction *MF = nullptr;
-  const MachineLoopInfo *MLI = nullptr;
-  const MachineDominatorTree *MDT = nullptr;
-  const InstrItineraryData *InstrItins;
-  const TargetInstrInfo *TII = nullptr;
-  RegisterClassInfo RegClassInfo;
-
-#ifndef NDEBUG
-  static int NumTries;
-#endif
-
-  /// Cache the target analysis information about the loop.
-  struct LoopInfo {
-    MachineBasicBlock *TBB = nullptr;
-    MachineBasicBlock *FBB = nullptr;
-    SmallVector<MachineOperand, 4> BrCond;
-    MachineInstr *LoopInductionVar = nullptr;
-    MachineInstr *LoopCompare = nullptr;
-  };
-  LoopInfo LI;
-
-  static char ID;
-
-  MachinePipeliner() : MachineFunctionPass(ID) {
-    initializeMachinePipelinerPass(*PassRegistry::getPassRegistry());
-  }
-
-  bool runOnMachineFunction(MachineFunction &MF) override;
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<AAResultsWrapperPass>();
-    AU.addPreserved<AAResultsWrapperPass>();
-    AU.addRequired<MachineLoopInfo>();
-    AU.addRequired<MachineDominatorTree>();
-    AU.addRequired<LiveIntervals>();
-    MachineFunctionPass::getAnalysisUsage(AU);
-  }
-
-private:
-  void preprocessPhiNodes(MachineBasicBlock &B);
-  bool canPipelineLoop(MachineLoop &L);
-  bool scheduleLoop(MachineLoop &L);
-  bool swingModuloScheduler(MachineLoop &L);
-};
-
-/// This class builds the dependence graph for the instructions in a loop,
-/// and attempts to schedule the instructions using the SMS algorithm.
-class SwingSchedulerDAG : public ScheduleDAGInstrs {
-  MachinePipeliner &Pass;
-  /// The minimum initiation interval between iterations for this schedule.
-  unsigned MII = 0;
-  /// Set to true if a valid pipelined schedule is found for the loop.
-  bool Scheduled = false;
-  MachineLoop &Loop;
-  LiveIntervals &LIS;
-  const RegisterClassInfo &RegClassInfo;
-
-  /// A toplogical ordering of the SUnits, which is needed for changing
-  /// dependences and iterating over the SUnits.
-  ScheduleDAGTopologicalSort Topo;
-
-  struct NodeInfo {
-    int ASAP = 0;
-    int ALAP = 0;
-    int ZeroLatencyDepth = 0;
-    int ZeroLatencyHeight = 0;
-
-    NodeInfo() = default;
-  };
-  /// Computed properties for each node in the graph.
-  std::vector<NodeInfo> ScheduleInfo;
-
-  enum OrderKind { BottomUp = 0, TopDown = 1 };
-  /// Computed node ordering for scheduling.
-  SetVector<SUnit *> NodeOrder;
-
-  using NodeSetType = SmallVector<NodeSet, 8>;
-  using ValueMapTy = DenseMap<unsigned, unsigned>;
-  using MBBVectorTy = SmallVectorImpl<MachineBasicBlock *>;
-  using InstrMapTy = DenseMap<MachineInstr *, MachineInstr *>;
-
-  /// Instructions to change when emitting the final schedule.
-  DenseMap<SUnit *, std::pair<unsigned, int64_t>> InstrChanges;
-
-  /// We may create a new instruction, so remember it because it
-  /// must be deleted when the pass is finished.
-  SmallPtrSet<MachineInstr *, 4> NewMIs;
-
-  /// Ordered list of DAG postprocessing steps.
-  std::vector<std::unique_ptr<ScheduleDAGMutation>> Mutations;
-
-  /// Helper class to implement Johnson's circuit finding algorithm.
-  class Circuits {
-    std::vector<SUnit> &SUnits;
-    SetVector<SUnit *> Stack;
-    BitVector Blocked;
-    SmallVector<SmallPtrSet<SUnit *, 4>, 10> B;
-    SmallVector<SmallVector<int, 4>, 16> AdjK;
-    // Node to Index from ScheduleDAGTopologicalSort
-    std::vector<int> *Node2Idx;
-    unsigned NumPaths;
-    static unsigned MaxPaths;
-
-  public:
-    Circuits(std::vector<SUnit> &SUs, ScheduleDAGTopologicalSort &Topo)
-        : SUnits(SUs), Blocked(SUs.size()), B(SUs.size()), AdjK(SUs.size()) {
-      Node2Idx = new std::vector<int>(SUs.size());
-      unsigned Idx = 0;
-      for (const auto &NodeNum : Topo)
-        Node2Idx->at(NodeNum) = Idx++;
-    }
-
-    ~Circuits() { delete Node2Idx; }
-
-    /// Reset the data structures used in the circuit algorithm.
-    void reset() {
-      Stack.clear();
-      Blocked.reset();
-      B.assign(SUnits.size(), SmallPtrSet<SUnit *, 4>());
-      NumPaths = 0;
-    }
-
-    void createAdjacencyStructure(SwingSchedulerDAG *DAG);
-    bool circuit(int V, int S, NodeSetType &NodeSets, bool HasBackedge = false);
-    void unblock(int U);
-  };
-
-  struct CopyToPhiMutation : public ScheduleDAGMutation {
-    void apply(ScheduleDAGInstrs *DAG) override;
-  };
-
-public:
-  SwingSchedulerDAG(MachinePipeliner &P, MachineLoop &L, LiveIntervals &lis,
-                    const RegisterClassInfo &rci)
-      : ScheduleDAGInstrs(*P.MF, P.MLI, false), Pass(P), Loop(L), LIS(lis),
-        RegClassInfo(rci), Topo(SUnits, &ExitSU) {
-    P.MF->getSubtarget().getSMSMutations(Mutations);
-    if (SwpEnableCopyToPhi)
-      Mutations.push_back(llvm::make_unique<CopyToPhiMutation>());
-  }
-
-  void schedule() override;
-  void finishBlock() override;
-
-  /// Return true if the loop kernel has been scheduled.
-  bool hasNewSchedule() { return Scheduled; }
-
-  /// Return the earliest time an instruction may be scheduled.
-  int getASAP(SUnit *Node) { return ScheduleInfo[Node->NodeNum].ASAP; }
-
-  /// Return the latest time an instruction my be scheduled.
-  int getALAP(SUnit *Node) { return ScheduleInfo[Node->NodeNum].ALAP; }
-
-  /// The mobility function, which the number of slots in which
-  /// an instruction may be scheduled.
-  int getMOV(SUnit *Node) { return getALAP(Node) - getASAP(Node); }
-
-  /// The depth, in the dependence graph, for a node.
-  unsigned getDepth(SUnit *Node) { return Node->getDepth(); }
-
-  /// The maximum unweighted length of a path from an arbitrary node to the
-  /// given node in which each edge has latency 0
-  int getZeroLatencyDepth(SUnit *Node) {
-    return ScheduleInfo[Node->NodeNum].ZeroLatencyDepth;
-  }
-
-  /// The height, in the dependence graph, for a node.
-  unsigned getHeight(SUnit *Node) { return Node->getHeight(); }
-
-  /// The maximum unweighted length of a path from the given node to an
-  /// arbitrary node in which each edge has latency 0
-  int getZeroLatencyHeight(SUnit *Node) {
-    return ScheduleInfo[Node->NodeNum].ZeroLatencyHeight;
-  }
-
-  /// Return true if the dependence is a back-edge in the data dependence graph.
-  /// Since the DAG doesn't contain cycles, we represent a cycle in the graph
-  /// using an anti dependence from a Phi to an instruction.
-  bool isBackedge(SUnit *Source, const SDep &Dep) {
-    if (Dep.getKind() != SDep::Anti)
-      return false;
-    return Source->getInstr()->isPHI() || Dep.getSUnit()->getInstr()->isPHI();
-  }
-
-  bool isLoopCarriedDep(SUnit *Source, const SDep &Dep, bool isSucc = true);
-
-  /// The distance function, which indicates that operation V of iteration I
-  /// depends on operations U of iteration I-distance.
-  unsigned getDistance(SUnit *U, SUnit *V, const SDep &Dep) {
-    // Instructions that feed a Phi have a distance of 1. Computing larger
-    // values for arrays requires data dependence information.
-    if (V->getInstr()->isPHI() && Dep.getKind() == SDep::Anti)
-      return 1;
-    return 0;
-  }
-
-  /// Set the Minimum Initiation Interval for this schedule attempt.
-  void setMII(unsigned mii) { MII = mii; }
-
-  void applyInstrChange(MachineInstr *MI, SMSchedule &Schedule);
-
-  void fixupRegisterOverlaps(std::deque<SUnit *> &Instrs);
-
-  /// Return the new base register that was stored away for the changed
-  /// instruction.
-  unsigned getInstrBaseReg(SUnit *SU) {
-    DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
-        InstrChanges.find(SU);
-    if (It != InstrChanges.end())
-      return It->second.first;
-    return 0;
-  }
-
-  void addMutation(std::unique_ptr<ScheduleDAGMutation> Mutation) {
-    Mutations.push_back(std::move(Mutation));
-  }
-
-  static bool classof(const ScheduleDAGInstrs *DAG) { return true; }
-
-private:
-  void addLoopCarriedDependences(AliasAnalysis *AA);
-  void updatePhiDependences();
-  void changeDependences();
-  unsigned calculateResMII();
-  unsigned calculateRecMII(NodeSetType &RecNodeSets);
-  void findCircuits(NodeSetType &NodeSets);
-  void fuseRecs(NodeSetType &NodeSets);
-  void removeDuplicateNodes(NodeSetType &NodeSets);
-  void computeNodeFunctions(NodeSetType &NodeSets);
-  void registerPressureFilter(NodeSetType &NodeSets);
-  void colocateNodeSets(NodeSetType &NodeSets);
-  void checkNodeSets(NodeSetType &NodeSets);
-  void groupRemainingNodes(NodeSetType &NodeSets);
-  void addConnectedNodes(SUnit *SU, NodeSet &NewSet,
-                         SetVector<SUnit *> &NodesAdded);
-  void computeNodeOrder(NodeSetType &NodeSets);
-  void checkValidNodeOrder(const NodeSetType &Circuits) const;
-  bool schedulePipeline(SMSchedule &Schedule);
-  void generatePipelinedLoop(SMSchedule &Schedule);
-  void generateProlog(SMSchedule &Schedule, unsigned LastStage,
-                      MachineBasicBlock *KernelBB, ValueMapTy *VRMap,
-                      MBBVectorTy &PrologBBs);
-  void generateEpilog(SMSchedule &Schedule, unsigned LastStage,
-                      MachineBasicBlock *KernelBB, ValueMapTy *VRMap,
-                      MBBVectorTy &EpilogBBs, MBBVectorTy &PrologBBs);
-  void generateExistingPhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
-                            MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
-                            SMSchedule &Schedule, ValueMapTy *VRMap,
-                            InstrMapTy &InstrMap, unsigned LastStageNum,
-                            unsigned CurStageNum, bool IsLast);
-  void generatePhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
-                    MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
-                    SMSchedule &Schedule, ValueMapTy *VRMap,
-                    InstrMapTy &InstrMap, unsigned LastStageNum,
-                    unsigned CurStageNum, bool IsLast);
-  void removeDeadInstructions(MachineBasicBlock *KernelBB,
-                              MBBVectorTy &EpilogBBs);
-  void splitLifetimes(MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs,
-                      SMSchedule &Schedule);
-  void addBranches(MBBVectorTy &PrologBBs, MachineBasicBlock *KernelBB,
-                   MBBVectorTy &EpilogBBs, SMSchedule &Schedule,
-                   ValueMapTy *VRMap);
-  bool computeDelta(MachineInstr &MI, unsigned &Delta);
-  void updateMemOperands(MachineInstr &NewMI, MachineInstr &OldMI,
-                         unsigned Num);
-  MachineInstr *cloneInstr(MachineInstr *OldMI, unsigned CurStageNum,
-                           unsigned InstStageNum);
-  MachineInstr *cloneAndChangeInstr(MachineInstr *OldMI, unsigned CurStageNum,
-                                    unsigned InstStageNum,
-                                    SMSchedule &Schedule);
-  void updateInstruction(MachineInstr *NewMI, bool LastDef,
-                         unsigned CurStageNum, unsigned InstrStageNum,
-                         SMSchedule &Schedule, ValueMapTy *VRMap);
-  MachineInstr *findDefInLoop(unsigned Reg);
-  unsigned getPrevMapVal(unsigned StageNum, unsigned PhiStage, unsigned LoopVal,
-                         unsigned LoopStage, ValueMapTy *VRMap,
-                         MachineBasicBlock *BB);
-  void rewritePhiValues(MachineBasicBlock *NewBB, unsigned StageNum,
-                        SMSchedule &Schedule, ValueMapTy *VRMap,
-                        InstrMapTy &InstrMap);
-  void rewriteScheduledInstr(MachineBasicBlock *BB, SMSchedule &Schedule,
-                             InstrMapTy &InstrMap, unsigned CurStageNum,
-                             unsigned PhiNum, MachineInstr *Phi,
-                             unsigned OldReg, unsigned NewReg,
-                             unsigned PrevReg = 0);
-  bool canUseLastOffsetValue(MachineInstr *MI, unsigned &BasePos,
-                             unsigned &OffsetPos, unsigned &NewBase,
-                             int64_t &NewOffset);
-  void postprocessDAG();
-};
-
-/// A NodeSet contains a set of SUnit DAG nodes with additional information
-/// that assigns a priority to the set.
-class NodeSet {
-  SetVector<SUnit *> Nodes;
-  bool HasRecurrence = false;
-  unsigned RecMII = 0;
-  int MaxMOV = 0;
-  unsigned MaxDepth = 0;
-  unsigned Colocate = 0;
-  SUnit *ExceedPressure = nullptr;
-  unsigned Latency = 0;
-
-public:
-  using iterator = SetVector<SUnit *>::const_iterator;
-
-  NodeSet() = default;
-  NodeSet(iterator S, iterator E) : Nodes(S, E), HasRecurrence(true) {
-    Latency = 0;
-    for (unsigned i = 0, e = Nodes.size(); i < e; ++i)
-      for (const SDep &Succ : Nodes[i]->Succs)
-        if (Nodes.count(Succ.getSUnit()))
-          Latency += Succ.getLatency();
-  }
-
-  bool insert(SUnit *SU) { return Nodes.insert(SU); }
-
-  void insert(iterator S, iterator E) { Nodes.insert(S, E); }
-
-  template <typename UnaryPredicate> bool remove_if(UnaryPredicate P) {
-    return Nodes.remove_if(P);
-  }
-
-  unsigned count(SUnit *SU) const { return Nodes.count(SU); }
-
-  bool hasRecurrence() { return HasRecurrence; };
-
-  unsigned size() const { return Nodes.size(); }
-
-  bool empty() const { return Nodes.empty(); }
-
-  SUnit *getNode(unsigned i) const { return Nodes[i]; };
-
-  void setRecMII(unsigned mii) { RecMII = mii; };
-
-  void setColocate(unsigned c) { Colocate = c; };
-
-  void setExceedPressure(SUnit *SU) { ExceedPressure = SU; }
-
-  bool isExceedSU(SUnit *SU) { return ExceedPressure == SU; }
-
-  int compareRecMII(NodeSet &RHS) { return RecMII - RHS.RecMII; }
-
-  int getRecMII() { return RecMII; }
-
-  /// Summarize node functions for the entire node set.
-  void computeNodeSetInfo(SwingSchedulerDAG *SSD) {
-    for (SUnit *SU : *this) {
-      MaxMOV = std::max(MaxMOV, SSD->getMOV(SU));
-      MaxDepth = std::max(MaxDepth, SSD->getDepth(SU));
-    }
-  }
-
-  unsigned getLatency() { return Latency; }
-
-  unsigned getMaxDepth() { return MaxDepth; }
-
-  void clear() {
-    Nodes.clear();
-    RecMII = 0;
-    HasRecurrence = false;
-    MaxMOV = 0;
-    MaxDepth = 0;
-    Colocate = 0;
-    ExceedPressure = nullptr;
-  }
-
-  operator SetVector<SUnit *> &() { return Nodes; }
-
-  /// Sort the node sets by importance. First, rank them by recurrence MII,
-  /// then by mobility (least mobile done first), and finally by depth.
-  /// Each node set may contain a colocate value which is used as the first
-  /// tie breaker, if it's set.
-  bool operator>(const NodeSet &RHS) const {
-    if (RecMII == RHS.RecMII) {
-      if (Colocate != 0 && RHS.Colocate != 0 && Colocate != RHS.Colocate)
-        return Colocate < RHS.Colocate;
-      if (MaxMOV == RHS.MaxMOV)
-        return MaxDepth > RHS.MaxDepth;
-      return MaxMOV < RHS.MaxMOV;
-    }
-    return RecMII > RHS.RecMII;
-  }
-
-  bool operator==(const NodeSet &RHS) const {
-    return RecMII == RHS.RecMII && MaxMOV == RHS.MaxMOV &&
-           MaxDepth == RHS.MaxDepth;
-  }
-
-  bool operator!=(const NodeSet &RHS) const { return !operator==(RHS); }
-
-  iterator begin() { return Nodes.begin(); }
-  iterator end() { return Nodes.end(); }
-
-  void print(raw_ostream &os) const {
-    os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
-       << " depth " << MaxDepth << " col " << Colocate << "\n";
-    for (const auto &I : Nodes)
-      os << "   SU(" << I->NodeNum << ") " << *(I->getInstr());
-    os << "\n";
-  }
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-  LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
-#endif
-};
-
-/// This class represents the scheduled code.  The main data structure is a
-/// map from scheduled cycle to instructions.  During scheduling, the
-/// data structure explicitly represents all stages/iterations.   When
-/// the algorithm finshes, the schedule is collapsed into a single stage,
-/// which represents instructions from different loop iterations.
-///
-/// The SMS algorithm allows negative values for cycles, so the first cycle
-/// in the schedule is the smallest cycle value.
-class SMSchedule {
-private:
-  /// Map from execution cycle to instructions.
-  DenseMap<int, std::deque<SUnit *>> ScheduledInstrs;
-
-  /// Map from instruction to execution cycle.
-  std::map<SUnit *, int> InstrToCycle;
-
-  /// Map for each register and the max difference between its uses and def.
-  /// The first element in the pair is the max difference in stages. The
-  /// second is true if the register defines a Phi value and loop value is
-  /// scheduled before the Phi.
-  std::map<unsigned, std::pair<unsigned, bool>> RegToStageDiff;
-
-  /// Keep track of the first cycle value in the schedule.  It starts
-  /// as zero, but the algorithm allows negative values.
-  int FirstCycle = 0;
-
-  /// Keep track of the last cycle value in the schedule.
-  int LastCycle = 0;
-
-  /// The initiation interval (II) for the schedule.
-  int InitiationInterval = 0;
-
-  /// Target machine information.
-  const TargetSubtargetInfo &ST;
-
-  /// Virtual register information.
-  MachineRegisterInfo &MRI;
-
-  std::unique_ptr<DFAPacketizer> Resources;
-
-public:
-  SMSchedule(MachineFunction *mf)
-      : ST(mf->getSubtarget()), MRI(mf->getRegInfo()),
-        Resources(ST.getInstrInfo()->CreateTargetScheduleState(ST)) {}
-
-  void reset() {
-    ScheduledInstrs.clear();
-    InstrToCycle.clear();
-    RegToStageDiff.clear();
-    FirstCycle = 0;
-    LastCycle = 0;
-    InitiationInterval = 0;
-  }
-
-  /// Set the initiation interval for this schedule.
-  void setInitiationInterval(int ii) { InitiationInterval = ii; }
-
-  /// Return the first cycle in the completed schedule.  This
-  /// can be a negative value.
-  int getFirstCycle() const { return FirstCycle; }
-
-  /// Return the last cycle in the finalized schedule.
-  int getFinalCycle() const { return FirstCycle + InitiationInterval - 1; }
-
-  /// Return the cycle of the earliest scheduled instruction in the dependence
-  /// chain.
-  int earliestCycleInChain(const SDep &Dep);
-
-  /// Return the cycle of the latest scheduled instruction in the dependence
-  /// chain.
-  int latestCycleInChain(const SDep &Dep);
-
-  void computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
-                    int *MinEnd, int *MaxStart, int II, SwingSchedulerDAG *DAG);
-  bool insert(SUnit *SU, int StartCycle, int EndCycle, int II);
-
-  /// Iterators for the cycle to instruction map.
-  using sched_iterator = DenseMap<int, std::deque<SUnit *>>::iterator;
-  using const_sched_iterator =
-      DenseMap<int, std::deque<SUnit *>>::const_iterator;
-
-  /// Return true if the instruction is scheduled at the specified stage.
-  bool isScheduledAtStage(SUnit *SU, unsigned StageNum) {
-    return (stageScheduled(SU) == (int)StageNum);
-  }
-
-  /// Return the stage for a scheduled instruction.  Return -1 if
-  /// the instruction has not been scheduled.
-  int stageScheduled(SUnit *SU) const {
-    std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SU);
-    if (it == InstrToCycle.end())
-      return -1;
-    return (it->second - FirstCycle) / InitiationInterval;
-  }
-
-  /// Return the cycle for a scheduled instruction. This function normalizes
-  /// the first cycle to be 0.
-  unsigned cycleScheduled(SUnit *SU) const {
-    std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SU);
-    assert(it != InstrToCycle.end() && "Instruction hasn't been scheduled.");
-    return (it->second - FirstCycle) % InitiationInterval;
-  }
-
-  /// Return the maximum stage count needed for this schedule.
-  unsigned getMaxStageCount() {
-    return (LastCycle - FirstCycle) / InitiationInterval;
-  }
-
-  /// Return the max. number of stages/iterations that can occur between a
-  /// register definition and its uses.
-  unsigned getStagesForReg(int Reg, unsigned CurStage) {
-    std::pair<unsigned, bool> Stages = RegToStageDiff[Reg];
-    if (CurStage > getMaxStageCount() && Stages.first == 0 && Stages.second)
-      return 1;
-    return Stages.first;
-  }
-
-  /// The number of stages for a Phi is a little different than other
-  /// instructions. The minimum value computed in RegToStageDiff is 1
-  /// because we assume the Phi is needed for at least 1 iteration.
-  /// This is not the case if the loop value is scheduled prior to the
-  /// Phi in the same stage.  This function returns the number of stages
-  /// or iterations needed between the Phi definition and any uses.
-  unsigned getStagesForPhi(int Reg) {
-    std::pair<unsigned, bool> Stages = RegToStageDiff[Reg];
-    if (Stages.second)
-      return Stages.first;
-    return Stages.first - 1;
-  }
-
-  /// Return the instructions that are scheduled at the specified cycle.
-  std::deque<SUnit *> &getInstructions(int cycle) {
-    return ScheduledInstrs[cycle];
-  }
-
-  bool isValidSchedule(SwingSchedulerDAG *SSD);
-  void finalizeSchedule(SwingSchedulerDAG *SSD);
-  void orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
-                       std::deque<SUnit *> &Insts);
-  bool isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi);
-  bool isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD, MachineInstr *Def,
-                             MachineOperand &MO);
-  void print(raw_ostream &os) const;
-  void dump() const;
-};
-
-} // end namespace llvm
-
-#endif // LLVM_LIB_CODEGEN_MACHINEPIPELINER_H
diff --git a/lib/CodeGen/MachinePipeliner.cpp b/lib/CodeGen/MachinePipeliner.cpp
index 65805d12cac..a341aac227a 100644
--- a/lib/CodeGen/MachinePipeliner.cpp
+++ b/lib/CodeGen/MachinePipeliner.cpp
@@ -9,6 +9,34 @@
 //
 // An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
 //
+// Software pipelining (SWP) is an instruction scheduling technique for loops
+// that overlap loop iterations and exploits ILP via a compiler transformation.
+//
+// Swing Modulo Scheduling is an implementation of software pipelining
+// that generates schedules that are near optimal in terms of initiation
+// interval, register requirements, and stage count. See the papers:
+//
+// "Swing Modulo Scheduling: A Lifetime-Sensitive Approach", by J. Llosa,
+// A. Gonzalez, E. Ayguade, and M. Valero. In PACT '96 Proceedings of the 1996
+// Conference on Parallel Architectures and Compilation Techiniques.
+//
+// "Lifetime-Sensitive Modulo Scheduling in a Production Environment", by J.
+// Llosa, E. Ayguade, A. Gonzalez, M. Valero, and J. Eckhardt. In IEEE
+// Transactions on Computers, Vol. 50, No. 3, 2001.
+//
+// "An Implementation of Swing Modulo Scheduling With Extensions for
+// Superblocks", by T. Lattner, Master's Thesis, University of Illinois at
+// Urbana-Chambpain, 2005.
+//
+//
+// The SMS algorithm consists of three main steps after computing the minimal
+// initiation interval (MII).
+// 1) Analyze the dependence graph and compute information about each
+//    instruction in the graph.
+// 2) Order the nodes (instructions) by priority based upon the heuristics
+//    described in the algorithm.
+// 3) Attempt to schedule the nodes in the specified order using the MII.
+//
 // This SMS implementation is a target-independent back-end pass. When enabled,
 // the pass runs just prior to the register allocation pass, while the machine
 // IR is in SSA form. If software pipelining is successful, then the original
@@ -55,11 +83,13 @@
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineOperand.h"
-#include "llvm/CodeGen/MachinePipeliner.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
 #include "llvm/CodeGen/RegisterPressure.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
 #include "llvm/CodeGen/ScheduleDAGMutation.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetOpcodes.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
@@ -142,14 +172,575 @@ static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
                                      cl::ReallyHidden, cl::init(false),
                                      cl::ZeroOrMore, cl::desc("Ignore RecMII"));
 
-namespace llvm {
-
 // A command line option to enable the CopyToPhi DAG mutation.
-cl::opt<bool> SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden,
-                                 cl::init(true), cl::ZeroOrMore,
-                                 cl::desc("Enable CopyToPhi DAG Mutation"));
+static cl::opt<bool>
+    SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden,
+                       cl::init(true), cl::ZeroOrMore,
+                       cl::desc("Enable CopyToPhi DAG Mutation"));
+
+namespace {
+
+class NodeSet;
+class SMSchedule;
+
+/// The main class in the implementation of the target independent
+/// software pipeliner pass.
+class MachinePipeliner : public MachineFunctionPass {
+public:
+  MachineFunction *MF = nullptr;
+  const MachineLoopInfo *MLI = nullptr;
+  const MachineDominatorTree *MDT = nullptr;
+  const InstrItineraryData *InstrItins;
+  const TargetInstrInfo *TII = nullptr;
+  RegisterClassInfo RegClassInfo;
+
+#ifndef NDEBUG
+  static int NumTries;
+#endif
 
-} // end namespace llvm
+  /// Cache the target analysis information about the loop.
+  struct LoopInfo {
+    MachineBasicBlock *TBB = nullptr;
+    MachineBasicBlock *FBB = nullptr;
+    SmallVector<MachineOperand, 4> BrCond;
+    MachineInstr *LoopInductionVar = nullptr;
+    MachineInstr *LoopCompare = nullptr;
+  };
+  LoopInfo LI;
+
+  static char ID;
+
+  MachinePipeliner() : MachineFunctionPass(ID) {
+    initializeMachinePipelinerPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AAResultsWrapperPass>();
+    AU.addPreserved<AAResultsWrapperPass>();
+    AU.addRequired<MachineLoopInfo>();
+    AU.addRequired<MachineDominatorTree>();
+    AU.addRequired<LiveIntervals>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+private:
+  void preprocessPhiNodes(MachineBasicBlock &B);
+  bool canPipelineLoop(MachineLoop &L);
+  bool scheduleLoop(MachineLoop &L);
+  bool swingModuloScheduler(MachineLoop &L);
+};
+
+/// This class builds the dependence graph for the instructions in a loop,
+/// and attempts to schedule the instructions using the SMS algorithm.
+class SwingSchedulerDAG : public ScheduleDAGInstrs {
+  MachinePipeliner &Pass;
+  /// The minimum initiation interval between iterations for this schedule.
+  unsigned MII = 0;
+  /// Set to true if a valid pipelined schedule is found for the loop.
+  bool Scheduled = false;
+  MachineLoop &Loop;
+  LiveIntervals &LIS;
+  const RegisterClassInfo &RegClassInfo;
+
+  /// A toplogical ordering of the SUnits, which is needed for changing
+  /// dependences and iterating over the SUnits.
+  ScheduleDAGTopologicalSort Topo;
+
+  struct NodeInfo {
+    int ASAP = 0;
+    int ALAP = 0;
+    int ZeroLatencyDepth = 0;
+    int ZeroLatencyHeight = 0;
+
+    NodeInfo() = default;
+  };
+  /// Computed properties for each node in the graph.
+  std::vector<NodeInfo> ScheduleInfo;
+
+  enum OrderKind { BottomUp = 0, TopDown = 1 };
+  /// Computed node ordering for scheduling.
+  SetVector<SUnit *> NodeOrder;
+
+  using NodeSetType = SmallVector<NodeSet, 8>;
+  using ValueMapTy = DenseMap<unsigned, unsigned>;
+  using MBBVectorTy = SmallVectorImpl<MachineBasicBlock *>;
+  using InstrMapTy = DenseMap<MachineInstr *, MachineInstr *>;
+
+  /// Instructions to change when emitting the final schedule.
+  DenseMap<SUnit *, std::pair<unsigned, int64_t>> InstrChanges;
+
+  /// We may create a new instruction, so remember it because it
+  /// must be deleted when the pass is finished.
+  SmallPtrSet<MachineInstr *, 4> NewMIs;
+
+  /// Ordered list of DAG postprocessing steps.
+  std::vector<std::unique_ptr<ScheduleDAGMutation>> Mutations;
+
+  /// Helper class to implement Johnson's circuit finding algorithm.
+  class Circuits {
+    std::vector<SUnit> &SUnits;
+    SetVector<SUnit *> Stack;
+    BitVector Blocked;
+    SmallVector<SmallPtrSet<SUnit *, 4>, 10> B;
+    SmallVector<SmallVector<int, 4>, 16> AdjK;
+    // Node to Index from ScheduleDAGTopologicalSort
+    std::vector<int> *Node2Idx;
+    unsigned NumPaths;
+    static unsigned MaxPaths;
+
+  public:
+    Circuits(std::vector<SUnit> &SUs, ScheduleDAGTopologicalSort &Topo)
+        : SUnits(SUs), Blocked(SUs.size()), B(SUs.size()), AdjK(SUs.size()) {
+      Node2Idx = new std::vector<int>(SUs.size());
+      unsigned Idx = 0;
+      for (const auto &NodeNum : Topo)
+        Node2Idx->at(NodeNum) = Idx++;
+    }
+
+    ~Circuits() { delete Node2Idx; }
+
+    /// Reset the data structures used in the circuit algorithm.
+    void reset() {
+      Stack.clear();
+      Blocked.reset();
+      B.assign(SUnits.size(), SmallPtrSet<SUnit *, 4>());
+      NumPaths = 0;
+    }
+
+    void createAdjacencyStructure(SwingSchedulerDAG *DAG);
+    bool circuit(int V, int S, NodeSetType &NodeSets, bool HasBackedge = false);
+    void unblock(int U);
+  };
+
+  struct CopyToPhiMutation : public ScheduleDAGMutation {
+    void apply(ScheduleDAGInstrs *DAG) override;
+  };
+
+public:
+  SwingSchedulerDAG(MachinePipeliner &P, MachineLoop &L, LiveIntervals &lis,
+                    const RegisterClassInfo &rci)
+      : ScheduleDAGInstrs(*P.MF, P.MLI, false), Pass(P), Loop(L), LIS(lis),
+        RegClassInfo(rci), Topo(SUnits, &ExitSU) {
+    P.MF->getSubtarget().getSMSMutations(Mutations);
+    if (SwpEnableCopyToPhi)
+      Mutations.push_back(llvm::make_unique<CopyToPhiMutation>());
+  }
+
+  void schedule() override;
+  void finishBlock() override;
+
+  /// Return true if the loop kernel has been scheduled.
+  bool hasNewSchedule() { return Scheduled; }
+
+  /// Return the earliest time an instruction may be scheduled.
+  int getASAP(SUnit *Node) { return ScheduleInfo[Node->NodeNum].ASAP; }
+
+  /// Return the latest time an instruction my be scheduled.
+  int getALAP(SUnit *Node) { return ScheduleInfo[Node->NodeNum].ALAP; }
+
+  /// The mobility function, which the number of slots in which
+  /// an instruction may be scheduled.
+  int getMOV(SUnit *Node) { return getALAP(Node) - getASAP(Node); }
+
+  /// The depth, in the dependence graph, for a node.
+  unsigned getDepth(SUnit *Node) { return Node->getDepth(); }
+
+  /// The maximum unweighted length of a path from an arbitrary node to the
+  /// given node in which each edge has latency 0
+  int getZeroLatencyDepth(SUnit *Node) {
+    return ScheduleInfo[Node->NodeNum].ZeroLatencyDepth;
+  }
+
+  /// The height, in the dependence graph, for a node.
+  unsigned getHeight(SUnit *Node) { return Node->getHeight(); }
+
+  /// The maximum unweighted length of a path from the given node to an
+  /// arbitrary node in which each edge has latency 0
+  int getZeroLatencyHeight(SUnit *Node) {
+    return ScheduleInfo[Node->NodeNum].ZeroLatencyHeight;
+  }
+
+  /// Return true if the dependence is a back-edge in the data dependence graph.
+  /// Since the DAG doesn't contain cycles, we represent a cycle in the graph
+  /// using an anti dependence from a Phi to an instruction.
+  bool isBackedge(SUnit *Source, const SDep &Dep) {
+    if (Dep.getKind() != SDep::Anti)
+      return false;
+    return Source->getInstr()->isPHI() || Dep.getSUnit()->getInstr()->isPHI();
+  }
+
+  bool isLoopCarriedDep(SUnit *Source, const SDep &Dep, bool isSucc = true);
+
+  /// The distance function, which indicates that operation V of iteration I
+  /// depends on operations U of iteration I-distance.
+  unsigned getDistance(SUnit *U, SUnit *V, const SDep &Dep) {
+    // Instructions that feed a Phi have a distance of 1. Computing larger
+    // values for arrays requires data dependence information.
+    if (V->getInstr()->isPHI() && Dep.getKind() == SDep::Anti)
+      return 1;
+    return 0;
+  }
+
+  /// Set the Minimum Initiation Interval for this schedule attempt.
+  void setMII(unsigned mii) { MII = mii; }
+
+  void applyInstrChange(MachineInstr *MI, SMSchedule &Schedule);
+
+  void fixupRegisterOverlaps(std::deque<SUnit *> &Instrs);
+
+  /// Return the new base register that was stored away for the changed
+  /// instruction.
+  unsigned getInstrBaseReg(SUnit *SU) {
+    DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
+        InstrChanges.find(SU);
+    if (It != InstrChanges.end())
+      return It->second.first;
+    return 0;
+  }
+
+  void addMutation(std::unique_ptr<ScheduleDAGMutation> Mutation) {
+    Mutations.push_back(std::move(Mutation));
+  }
+
+  static bool classof(const ScheduleDAGInstrs *DAG) { return true; }
+
+private:
+  void addLoopCarriedDependences(AliasAnalysis *AA);
+  void updatePhiDependences();
+  void changeDependences();
+  unsigned calculateResMII();
+  unsigned calculateRecMII(NodeSetType &RecNodeSets);
+  void findCircuits(NodeSetType &NodeSets);
+  void fuseRecs(NodeSetType &NodeSets);
+  void removeDuplicateNodes(NodeSetType &NodeSets);
+  void computeNodeFunctions(NodeSetType &NodeSets);
+  void registerPressureFilter(NodeSetType &NodeSets);
+  void colocateNodeSets(NodeSetType &NodeSets);
+  void checkNodeSets(NodeSetType &NodeSets);
+  void groupRemainingNodes(NodeSetType &NodeSets);
+  void addConnectedNodes(SUnit *SU, NodeSet &NewSet,
+                         SetVector<SUnit *> &NodesAdded);
+  void computeNodeOrder(NodeSetType &NodeSets);
+  void checkValidNodeOrder(const NodeSetType &Circuits) const;
+  bool schedulePipeline(SMSchedule &Schedule);
+  void generatePipelinedLoop(SMSchedule &Schedule);
+  void generateProlog(SMSchedule &Schedule, unsigned LastStage,
+                      MachineBasicBlock *KernelBB, ValueMapTy *VRMap,
+                      MBBVectorTy &PrologBBs);
+  void generateEpilog(SMSchedule &Schedule, unsigned LastStage,
+                      MachineBasicBlock *KernelBB, ValueMapTy *VRMap,
+                      MBBVectorTy &EpilogBBs, MBBVectorTy &PrologBBs);
+  void generateExistingPhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
+                            MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
+                            SMSchedule &Schedule, ValueMapTy *VRMap,
+                            InstrMapTy &InstrMap, unsigned LastStageNum,
+                            unsigned CurStageNum, bool IsLast);
+  void generatePhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
+                    MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
+                    SMSchedule &Schedule, ValueMapTy *VRMap,
+                    InstrMapTy &InstrMap, unsigned LastStageNum,
+                    unsigned CurStageNum, bool IsLast);
+  void removeDeadInstructions(MachineBasicBlock *KernelBB,
+                              MBBVectorTy &EpilogBBs);
+  void splitLifetimes(MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs,
+                      SMSchedule &Schedule);
+  void addBranches(MBBVectorTy &PrologBBs, MachineBasicBlock *KernelBB,
+                   MBBVectorTy &EpilogBBs, SMSchedule &Schedule,
+                   ValueMapTy *VRMap);
+  bool computeDelta(MachineInstr &MI, unsigned &Delta);
+  void updateMemOperands(MachineInstr &NewMI, MachineInstr &OldMI,
+                         unsigned Num);
+  MachineInstr *cloneInstr(MachineInstr *OldMI, unsigned CurStageNum,
+                           unsigned InstStageNum);
+  MachineInstr *cloneAndChangeInstr(MachineInstr *OldMI, unsigned CurStageNum,
+                                    unsigned InstStageNum,
+                                    SMSchedule &Schedule);
+  void updateInstruction(MachineInstr *NewMI, bool LastDef,
+                         unsigned CurStageNum, unsigned InstrStageNum,
+                         SMSchedule &Schedule, ValueMapTy *VRMap);
+  MachineInstr *findDefInLoop(unsigned Reg);
+  unsigned getPrevMapVal(unsigned StageNum, unsigned PhiStage, unsigned LoopVal,
+                         unsigned LoopStage, ValueMapTy *VRMap,
+                         MachineBasicBlock *BB);
+  void rewritePhiValues(MachineBasicBlock *NewBB, unsigned StageNum,
+                        SMSchedule &Schedule, ValueMapTy *VRMap,
+                        InstrMapTy &InstrMap);
+  void rewriteScheduledInstr(MachineBasicBlock *BB, SMSchedule &Schedule,
+                             InstrMapTy &InstrMap, unsigned CurStageNum,
+                             unsigned PhiNum, MachineInstr *Phi,
+                             unsigned OldReg, unsigned NewReg,
+                             unsigned PrevReg = 0);
+  bool canUseLastOffsetValue(MachineInstr *MI, unsigned &BasePos,
+                             unsigned &OffsetPos, unsigned &NewBase,
+                             int64_t &NewOffset);
+  void postprocessDAG();
+};
+
+/// A NodeSet contains a set of SUnit DAG nodes with additional information
+/// that assigns a priority to the set.
+class NodeSet {
+  SetVector<SUnit *> Nodes;
+  bool HasRecurrence = false;
+  unsigned RecMII = 0;
+  int MaxMOV = 0;
+  unsigned MaxDepth = 0;
+  unsigned Colocate = 0;
+  SUnit *ExceedPressure = nullptr;
+  unsigned Latency = 0;
+
+public:
+  using iterator = SetVector<SUnit *>::const_iterator;
+
+  NodeSet() = default;
+  NodeSet(iterator S, iterator E) : Nodes(S, E), HasRecurrence(true) {
+    Latency = 0;
+    for (unsigned i = 0, e = Nodes.size(); i < e; ++i)
+      for (const SDep &Succ : Nodes[i]->Succs)
+        if (Nodes.count(Succ.getSUnit()))
+          Latency += Succ.getLatency();
+  }
+
+  bool insert(SUnit *SU) { return Nodes.insert(SU); }
+
+  void insert(iterator S, iterator E) { Nodes.insert(S, E); }
+
+  template <typename UnaryPredicate> bool remove_if(UnaryPredicate P) {
+    return Nodes.remove_if(P);
+  }
+
+  unsigned count(SUnit *SU) const { return Nodes.count(SU); }
+
+  bool hasRecurrence() { return HasRecurrence; };
+
+  unsigned size() const { return Nodes.size(); }
+
+  bool empty() const { return Nodes.empty(); }
+
+  SUnit *getNode(unsigned i) const { return Nodes[i]; };
+
+  void setRecMII(unsigned mii) { RecMII = mii; };
+
+  void setColocate(unsigned c) { Colocate = c; };
+
+  void setExceedPressure(SUnit *SU) { ExceedPressure = SU; }
+
+  bool isExceedSU(SUnit *SU) { return ExceedPressure == SU; }
+
+  int compareRecMII(NodeSet &RHS) { return RecMII - RHS.RecMII; }
+
+  int getRecMII() { return RecMII; }
+
+  /// Summarize node functions for the entire node set.
+  void computeNodeSetInfo(SwingSchedulerDAG *SSD) {
+    for (SUnit *SU : *this) {
+      MaxMOV = std::max(MaxMOV, SSD->getMOV(SU));
+      MaxDepth = std::max(MaxDepth, SSD->getDepth(SU));
+    }
+  }
+
+  unsigned getLatency() { return Latency; }
+
+  unsigned getMaxDepth() { return MaxDepth; }
+
+  void clear() {
+    Nodes.clear();
+    RecMII = 0;
+    HasRecurrence = false;
+    MaxMOV = 0;
+    MaxDepth = 0;
+    Colocate = 0;
+    ExceedPressure = nullptr;
+  }
+
+  operator SetVector<SUnit *> &() { return Nodes; }
+
+  /// Sort the node sets by importance. First, rank them by recurrence MII,
+  /// then by mobility (least mobile done first), and finally by depth.
+  /// Each node set may contain a colocate value which is used as the first
+  /// tie breaker, if it's set.
+  bool operator>(const NodeSet &RHS) const {
+    if (RecMII == RHS.RecMII) {
+      if (Colocate != 0 && RHS.Colocate != 0 && Colocate != RHS.Colocate)
+        return Colocate < RHS.Colocate;
+      if (MaxMOV == RHS.MaxMOV)
+        return MaxDepth > RHS.MaxDepth;
+      return MaxMOV < RHS.MaxMOV;
+    }
+    return RecMII > RHS.RecMII;
+  }
+
+  bool operator==(const NodeSet &RHS) const {
+    return RecMII == RHS.RecMII && MaxMOV == RHS.MaxMOV &&
+           MaxDepth == RHS.MaxDepth;
+  }
+
+  bool operator!=(const NodeSet &RHS) const { return !operator==(RHS); }
+
+  iterator begin() { return Nodes.begin(); }
+  iterator end() { return Nodes.end(); }
+
+  void print(raw_ostream &os) const {
+    os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
+       << " depth " << MaxDepth << " col " << Colocate << "\n";
+    for (const auto &I : Nodes)
+      os << "   SU(" << I->NodeNum << ") " << *(I->getInstr());
+    os << "\n";
+  }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
+#endif
+};
+
+/// This class represents the scheduled code.  The main data structure is a
+/// map from scheduled cycle to instructions.  During scheduling, the
+/// data structure explicitly represents all stages/iterations.   When
+/// the algorithm finshes, the schedule is collapsed into a single stage,
+/// which represents instructions from different loop iterations.
+///
+/// The SMS algorithm allows negative values for cycles, so the first cycle
+/// in the schedule is the smallest cycle value.
+class SMSchedule {
+private:
+  /// Map from execution cycle to instructions.
+  DenseMap<int, std::deque<SUnit *>> ScheduledInstrs;
+
+  /// Map from instruction to execution cycle.
+  std::map<SUnit *, int> InstrToCycle;
+
+  /// Map for each register and the max difference between its uses and def.
+  /// The first element in the pair is the max difference in stages. The
+  /// second is true if the register defines a Phi value and loop value is
+  /// scheduled before the Phi.
+  std::map<unsigned, std::pair<unsigned, bool>> RegToStageDiff;
+
+  /// Keep track of the first cycle value in the schedule.  It starts
+  /// as zero, but the algorithm allows negative values.
+  int FirstCycle = 0;
+
+  /// Keep track of the last cycle value in the schedule.
+  int LastCycle = 0;
+
+  /// The initiation interval (II) for the schedule.
+  int InitiationInterval = 0;
+
+  /// Target machine information.
+  const TargetSubtargetInfo &ST;
+
+  /// Virtual register information.
+  MachineRegisterInfo &MRI;
+
+  std::unique_ptr<DFAPacketizer> Resources;
+
+public:
+  SMSchedule(MachineFunction *mf)
+      : ST(mf->getSubtarget()), MRI(mf->getRegInfo()),
+        Resources(ST.getInstrInfo()->CreateTargetScheduleState(ST)) {}
+
+  void reset() {
+    ScheduledInstrs.clear();
+    InstrToCycle.clear();
+    RegToStageDiff.clear();
+    FirstCycle = 0;
+    LastCycle = 0;
+    InitiationInterval = 0;
+  }
+
+  /// Set the initiation interval for this schedule.
+  void setInitiationInterval(int ii) { InitiationInterval = ii; }
+
+  /// Return the first cycle in the completed schedule.  This
+  /// can be a negative value.
+  int getFirstCycle() const { return FirstCycle; }
+
+  /// Return the last cycle in the finalized schedule.
+  int getFinalCycle() const { return FirstCycle + InitiationInterval - 1; }
+
+  /// Return the cycle of the earliest scheduled instruction in the dependence
+  /// chain.
+  int earliestCycleInChain(const SDep &Dep);
+
+  /// Return the cycle of the latest scheduled instruction in the dependence
+  /// chain.
+  int latestCycleInChain(const SDep &Dep);
+
+  void computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
+                    int *MinEnd, int *MaxStart, int II, SwingSchedulerDAG *DAG);
+  bool insert(SUnit *SU, int StartCycle, int EndCycle, int II);
+
+  /// Iterators for the cycle to instruction map.
+  using sched_iterator = DenseMap<int, std::deque<SUnit *>>::iterator;
+  using const_sched_iterator =
+      DenseMap<int, std::deque<SUnit *>>::const_iterator;
+
+  /// Return true if the instruction is scheduled at the specified stage.
+  bool isScheduledAtStage(SUnit *SU, unsigned StageNum) {
+    return (stageScheduled(SU) == (int)StageNum);
+  }
+
+  /// Return the stage for a scheduled instruction.  Return -1 if
+  /// the instruction has not been scheduled.
+  int stageScheduled(SUnit *SU) const {
+    std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SU);
+    if (it == InstrToCycle.end())
+      return -1;
+    return (it->second - FirstCycle) / InitiationInterval;
+  }
+
+  /// Return the cycle for a scheduled instruction. This function normalizes
+  /// the first cycle to be 0.
+  unsigned cycleScheduled(SUnit *SU) const {
+    std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SU);
+    assert(it != InstrToCycle.end() && "Instruction hasn't been scheduled.");
+    return (it->second - FirstCycle) % InitiationInterval;
+  }
+
+  /// Return the maximum stage count needed for this schedule.
+  unsigned getMaxStageCount() {
+    return (LastCycle - FirstCycle) / InitiationInterval;
+  }
+
+  /// Return the max. number of stages/iterations that can occur between a
+  /// register definition and its uses.
+  unsigned getStagesForReg(int Reg, unsigned CurStage) {
+    std::pair<unsigned, bool> Stages = RegToStageDiff[Reg];
+    if (CurStage > getMaxStageCount() && Stages.first == 0 && Stages.second)
+      return 1;
+    return Stages.first;
+  }
+
+  /// The number of stages for a Phi is a little different than other
+  /// instructions. The minimum value computed in RegToStageDiff is 1
+  /// because we assume the Phi is needed for at least 1 iteration.
+  /// This is not the case if the loop value is scheduled prior to the
+  /// Phi in the same stage.  This function returns the number of stages
+  /// or iterations needed between the Phi definition and any uses.
+  unsigned getStagesForPhi(int Reg) {
+    std::pair<unsigned, bool> Stages = RegToStageDiff[Reg];
+    if (Stages.second)
+      return Stages.first;
+    return Stages.first - 1;
+  }
+
+  /// Return the instructions that are scheduled at the specified cycle.
+  std::deque<SUnit *> &getInstructions(int cycle) {
+    return ScheduledInstrs[cycle];
+  }
+
+  bool isValidSchedule(SwingSchedulerDAG *SSD);
+  void finalizeSchedule(SwingSchedulerDAG *SSD);
+  void orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
+                       std::deque<SUnit *> &Insts);
+  bool isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi);
+  bool isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD, MachineInstr *Def,
+                             MachineOperand &MO);
+  void print(raw_ostream &os) const;
+  void dump() const;
+};
+
+} // end anonymous namespace
 
 unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
 char MachinePipeliner::ID = 0;
-- 
GitLab


From b0f74f04d32bc2ed5e890a1b2308a52929b22abd Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 23 Oct 2018 14:37:29 +0000
Subject: [PATCH 0442/1116] [InstCombine] add/move tests for select with
 inverted condition; NFC

The transform is broken in 2 ways - it doesn't correct metadata (or even drop it),
and it doesn't work with vectors with undef elements.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345033 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Transforms/InstCombine/select_meta.ll | 39 ++++++++++++++++++++++
 test/Transforms/InstCombine/xor.ll         | 10 ------
 2 files changed, 39 insertions(+), 10 deletions(-)

diff --git a/test/Transforms/InstCombine/select_meta.ll b/test/Transforms/InstCombine/select_meta.ll
index 816504b296c..250a33b4064 100644
--- a/test/Transforms/InstCombine/select_meta.ll
+++ b/test/Transforms/InstCombine/select_meta.ll
@@ -298,6 +298,45 @@ define i32 @umax2(i32 %x) {
   ret i32 %sel
 }
 
+; FIXME: The condition is inverted, and the select ops are swapped. The metadata should be swapped.
+
+define i32 @not_cond(i1 %c, i32 %tv, i32 %fv) {
+; CHECK-LABEL: @not_cond(
+; CHECK-NEXT:    [[R:%.*]] = select i1 [[C:%.*]], i32 [[FV:%.*]], i32 [[TV:%.*]], !prof ![[$MD1]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %notc = xor i1 %c, true
+  %r = select i1 %notc, i32 %tv, i32 %fv, !prof !1
+  ret i32 %r
+}
+
+; FIXME: The condition is inverted, and the select ops are swapped. The metadata should be swapped.
+
+define <2 x i32> @not_cond_vec(<2 x i1> %c, <2 x i32> %tv, <2 x i32> %fv) {
+; CHECK-LABEL: @not_cond_vec(
+; CHECK-NEXT:    [[R:%.*]] = select <2 x i1> [[C:%.*]], <2 x i32> [[FV:%.*]], <2 x i32> [[TV:%.*]], !prof ![[$MD1]]
+; CHECK-NEXT:    ret <2 x i32> [[R]]
+;
+  %notc = xor <2 x i1> %c, <i1 true, i1 true>
+  %r = select <2 x i1> %notc, <2 x i32> %tv, <2 x i32> %fv, !prof !1
+  ret <2 x i32> %r
+}
+
+; FIXME: Should match vector 'not' with undef element. After that...
+; FIXME: The condition is inverted, and the select ops are swapped. The metadata should be swapped.
+
+define <2 x i32> @not_cond_vec_undef(<2 x i1> %c, <2 x i32> %tv, <2 x i32> %fv) {
+; CHECK-LABEL: @not_cond_vec_undef(
+; CHECK-NEXT:    [[NOTC:%.*]] = xor <2 x i1> [[C:%.*]], <i1 undef, i1 true>
+; CHECK-NEXT:    [[R:%.*]] = select <2 x i1> [[NOTC]], <2 x i32> [[TV:%.*]], <2 x i32> [[FV:%.*]], !prof ![[$MD1]]
+; CHECK-NEXT:    ret <2 x i32> [[R]]
+;
+  %notc = xor <2 x i1> %c, <i1 undef, i1 true>
+  %r = select <2 x i1> %notc, <2 x i32> %tv, <2 x i32> %fv, !prof !1
+  ret <2 x i32> %r
+}
+
+
 !1 = !{!"branch_weights", i32 2, i32 10}
 !2 = !{!"branch_weights", i32 3, i32 10}
 
diff --git a/test/Transforms/InstCombine/xor.ll b/test/Transforms/InstCombine/xor.ll
index c149cef295b..b06abe2919b 100644
--- a/test/Transforms/InstCombine/xor.ll
+++ b/test/Transforms/InstCombine/xor.ll
@@ -190,16 +190,6 @@ define void @test20(i32 %A, i32 %B) {
   ret void
 }
 
-define i32 @test21(i1 %C, i32 %A, i32 %B) {
-; CHECK-LABEL: @test21(
-; CHECK-NEXT:    [[D:%.*]] = select i1 [[C:%.*]], i32 [[B:%.*]], i32 [[A:%.*]]
-; CHECK-NEXT:    ret i32 [[D]]
-;
-  %C2 = xor i1 %C, true
-  %D = select i1 %C2, i32 %A, i32 %B
-  ret i32 %D
-}
-
 define i32 @test22(i1 %X) {
 ; CHECK-LABEL: @test22(
 ; CHECK-NEXT:    [[Z:%.*]] = zext i1 [[X:%.*]] to i32
-- 
GitLab


From fa8e666b5510d4e3d7455eff5c2b31504aa2263b Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 23 Oct 2018 14:43:31 +0000
Subject: [PATCH 0443/1116] [InstCombine] swap select profile metadata when
 swapping select ops

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345034 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/InstCombine/InstCombineSelect.cpp |  1 +
 test/Transforms/InstCombine/select_meta.ll       | 10 +++++-----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 3d346dff2f8..c15999d81b3 100644
--- a/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -1993,6 +1993,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
     SI.setOperand(0, BinaryOperator::getNotArgument(CondVal));
     SI.setOperand(1, FalseVal);
     SI.setOperand(2, TrueVal);
+    SI.swapProfMetadata();
     return &SI;
   }
 
diff --git a/test/Transforms/InstCombine/select_meta.ll b/test/Transforms/InstCombine/select_meta.ll
index 250a33b4064..c9e277f1563 100644
--- a/test/Transforms/InstCombine/select_meta.ll
+++ b/test/Transforms/InstCombine/select_meta.ll
@@ -298,11 +298,11 @@ define i32 @umax2(i32 %x) {
   ret i32 %sel
 }
 
-; FIXME: The condition is inverted, and the select ops are swapped. The metadata should be swapped.
+; The condition is inverted, and the select ops are swapped. The metadata should be swapped.
 
 define i32 @not_cond(i1 %c, i32 %tv, i32 %fv) {
 ; CHECK-LABEL: @not_cond(
-; CHECK-NEXT:    [[R:%.*]] = select i1 [[C:%.*]], i32 [[FV:%.*]], i32 [[TV:%.*]], !prof ![[$MD1]]
+; CHECK-NEXT:    [[R:%.*]] = select i1 [[C:%.*]], i32 [[FV:%.*]], i32 [[TV:%.*]], !prof ![[$MD3]]
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
   %notc = xor i1 %c, true
@@ -310,11 +310,11 @@ define i32 @not_cond(i1 %c, i32 %tv, i32 %fv) {
   ret i32 %r
 }
 
-; FIXME: The condition is inverted, and the select ops are swapped. The metadata should be swapped.
+; The condition is inverted, and the select ops are swapped. The metadata should be swapped.
 
 define <2 x i32> @not_cond_vec(<2 x i1> %c, <2 x i32> %tv, <2 x i32> %fv) {
 ; CHECK-LABEL: @not_cond_vec(
-; CHECK-NEXT:    [[R:%.*]] = select <2 x i1> [[C:%.*]], <2 x i32> [[FV:%.*]], <2 x i32> [[TV:%.*]], !prof ![[$MD1]]
+; CHECK-NEXT:    [[R:%.*]] = select <2 x i1> [[C:%.*]], <2 x i32> [[FV:%.*]], <2 x i32> [[TV:%.*]], !prof ![[$MD3]]
 ; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
   %notc = xor <2 x i1> %c, <i1 true, i1 true>
@@ -323,7 +323,7 @@ define <2 x i32> @not_cond_vec(<2 x i1> %c, <2 x i32> %tv, <2 x i32> %fv) {
 }
 
 ; FIXME: Should match vector 'not' with undef element. After that...
-; FIXME: The condition is inverted, and the select ops are swapped. The metadata should be swapped.
+; The condition is inverted, and the select ops are swapped. The metadata should be swapped.
 
 define <2 x i32> @not_cond_vec_undef(<2 x i1> %c, <2 x i32> %tv, <2 x i32> %fv) {
 ; CHECK-LABEL: @not_cond_vec_undef(
-- 
GitLab


From f46dd75b538cbf3131ddc1fb06a24cdfc8296450 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 23 Oct 2018 15:05:12 +0000
Subject: [PATCH 0444/1116] [InstCombine] use 'match' to handle vectors and
 simplify code

This is another step towards completely removing the fake
binop queries for not/neg/fneg.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345036 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/InstCombine/InstCombineSelect.cpp             | 5 +++--
 test/Transforms/InstCombine/select_meta.ll                   | 5 ++---
 .../LoopVectorize/invariant-store-vectorization.ll           | 3 +--
 3 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp
index c15999d81b3..00dcacccb40 100644
--- a/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -1989,8 +1989,9 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
     }
   }
 
-  if (BinaryOperator::isNot(CondVal)) {
-    SI.setOperand(0, BinaryOperator::getNotArgument(CondVal));
+  Value *NotCond;
+  if (match(CondVal, m_Not(m_Value(NotCond)))) {
+    SI.setOperand(0, NotCond);
     SI.setOperand(1, FalseVal);
     SI.setOperand(2, TrueVal);
     SI.swapProfMetadata();
diff --git a/test/Transforms/InstCombine/select_meta.ll b/test/Transforms/InstCombine/select_meta.ll
index c9e277f1563..67dd246c040 100644
--- a/test/Transforms/InstCombine/select_meta.ll
+++ b/test/Transforms/InstCombine/select_meta.ll
@@ -322,13 +322,12 @@ define <2 x i32> @not_cond_vec(<2 x i1> %c, <2 x i32> %tv, <2 x i32> %fv) {
   ret <2 x i32> %r
 }
 
-; FIXME: Should match vector 'not' with undef element. After that...
+; Should match vector 'not' with undef element.
 ; The condition is inverted, and the select ops are swapped. The metadata should be swapped.
 
 define <2 x i32> @not_cond_vec_undef(<2 x i1> %c, <2 x i32> %tv, <2 x i32> %fv) {
 ; CHECK-LABEL: @not_cond_vec_undef(
-; CHECK-NEXT:    [[NOTC:%.*]] = xor <2 x i1> [[C:%.*]], <i1 undef, i1 true>
-; CHECK-NEXT:    [[R:%.*]] = select <2 x i1> [[NOTC]], <2 x i32> [[TV:%.*]], <2 x i32> [[FV:%.*]], !prof ![[$MD1]]
+; CHECK-NEXT:    [[R:%.*]] = select <2 x i1> [[C:%.*]], <2 x i32> [[FV:%.*]], <2 x i32> [[TV:%.*]], !prof ![[$MD3]]
 ; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
   %notc = xor <2 x i1> %c, <i1 undef, i1 true>
diff --git a/test/Transforms/LoopVectorize/invariant-store-vectorization.ll b/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
index 69e202f8889..e341576e531 100644
--- a/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
+++ b/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
@@ -293,8 +293,7 @@ for.end:                                          ; preds = %for.body
 ; CHECK-NEXT:    [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT5]], <4 x i32> undef, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i1> undef, i1 [[CMP]], i32 3
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[K]], i32 3
-; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i1> [[TMP2]], <i1 undef, i1 undef, i1 undef, i1 true>
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> [[TMP3]], <4 x i32> [[BROADCAST_SPLAT6]]
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[BROADCAST_SPLAT6]], <4 x i32> [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[PREDPHI]], i32 3
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-- 
GitLab


From 838ac3865fe0ecd6e45d2d4be43bf7ed59cfb06e Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 23 Oct 2018 15:13:09 +0000
Subject: [PATCH 0445/1116] [SLPVectorizer] Add basic support for
 mul/and/or/xor horizontal reductions

Expand arithmetic reduction to include mul/and/or/xor instructions.

This patch just fixes the SLPVectorizer - the effective reduction costs for AVX1+ are still poor (see rL344846) and will need to be improved before SLP sees this as a valid transform - but we can already see the effect on SSE2 tests.

This partially helps PR37731, but doesn't fix it all as it still falls over on the extraction/reduction order for some reason.

Differential Revision: https://reviews.llvm.org/D53473

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345037 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/SLPVectorizer.cpp    |   7 +-
 .../SLPVectorizer/X86/reduction_unrolled.ll   | 105 +++++++++---------
 2 files changed, 59 insertions(+), 53 deletions(-)

diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 5fdbf219009..3592df3ede3 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5126,9 +5126,12 @@ class HorizontalReduction {
     /// Checks if the reduction operation can be vectorized.
     bool isVectorizable() const {
       return LHS && RHS &&
-             // We currently only support adds && min/max reductions.
+             // We currently only support add/mul/logical && min/max reductions.
              ((Kind == RK_Arithmetic &&
-               (Opcode == Instruction::Add || Opcode == Instruction::FAdd)) ||
+               (Opcode == Instruction::Add || Opcode == Instruction::FAdd ||
+                Opcode == Instruction::Mul || Opcode == Instruction::FMul ||
+                Opcode == Instruction::And || Opcode == Instruction::Or ||
+                Opcode == Instruction::Xor)) ||
               ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
                (Kind == RK_Min || Kind == RK_Max)) ||
               (Opcode == Instruction::ICmp &&
diff --git a/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll b/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll
index b5a96025764..80c9044e80a 100644
--- a/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll
+++ b/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll
@@ -217,29 +217,30 @@ define i32 @test_and(i32* nocapture readonly %p) {
 ;
 ; SSE2-LABEL: @test_and(
 ; SSE2-NEXT:  entry:
-; SSE2-NEXT:    [[TMP0:%.*]] = load i32, i32* [[P:%.*]], align 4
-; SSE2-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1
-; SSE2-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
-; SSE2-NEXT:    [[MUL_18:%.*]] = and i32 [[TMP1]], [[TMP0]]
+; SSE2-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
 ; SSE2-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
-; SSE2-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
-; SSE2-NEXT:    [[MUL_29:%.*]] = and i32 [[TMP2]], [[MUL_18]]
 ; SSE2-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
-; SSE2-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
-; SSE2-NEXT:    [[MUL_310:%.*]] = and i32 [[TMP3]], [[MUL_29]]
 ; SSE2-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
-; SSE2-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4
-; SSE2-NEXT:    [[MUL_411:%.*]] = and i32 [[TMP4]], [[MUL_310]]
 ; SSE2-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
-; SSE2-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_5]], align 4
-; SSE2-NEXT:    [[MUL_512:%.*]] = and i32 [[TMP5]], [[MUL_411]]
 ; SSE2-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
-; SSE2-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_6]], align 4
-; SSE2-NEXT:    [[MUL_613:%.*]] = and i32 [[TMP6]], [[MUL_512]]
 ; SSE2-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
-; SSE2-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_7]], align 4
-; SSE2-NEXT:    [[MUL_714:%.*]] = and i32 [[TMP7]], [[MUL_613]]
-; SSE2-NEXT:    ret i32 [[MUL_714]]
+; SSE2-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
+; SSE2-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
+; SSE2-NEXT:    [[MUL_18:%.*]] = and i32 undef, undef
+; SSE2-NEXT:    [[MUL_29:%.*]] = and i32 undef, [[MUL_18]]
+; SSE2-NEXT:    [[MUL_310:%.*]] = and i32 undef, [[MUL_29]]
+; SSE2-NEXT:    [[MUL_411:%.*]] = and i32 undef, [[MUL_310]]
+; SSE2-NEXT:    [[MUL_512:%.*]] = and i32 undef, [[MUL_411]]
+; SSE2-NEXT:    [[MUL_613:%.*]] = and i32 undef, [[MUL_512]]
+; SSE2-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:    [[BIN_RDX:%.*]] = and <8 x i32> [[TMP1]], [[RDX_SHUF]]
+; SSE2-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:    [[BIN_RDX2:%.*]] = and <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; SSE2-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:    [[BIN_RDX4:%.*]] = and <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; SSE2-NEXT:    [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
+; SSE2-NEXT:    [[MUL_714:%.*]] = and i32 undef, [[MUL_613]]
+; SSE2-NEXT:    ret i32 [[TMP2]]
 ;
 entry:
   %0 = load i32, i32* %p, align 4
@@ -303,29 +304,30 @@ define i32 @test_or(i32* nocapture readonly %p) {
 ;
 ; SSE2-LABEL: @test_or(
 ; SSE2-NEXT:  entry:
-; SSE2-NEXT:    [[TMP0:%.*]] = load i32, i32* [[P:%.*]], align 4
-; SSE2-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1
-; SSE2-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
-; SSE2-NEXT:    [[MUL_18:%.*]] = or i32 [[TMP1]], [[TMP0]]
+; SSE2-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
 ; SSE2-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
-; SSE2-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
-; SSE2-NEXT:    [[MUL_29:%.*]] = or i32 [[TMP2]], [[MUL_18]]
 ; SSE2-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
-; SSE2-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
-; SSE2-NEXT:    [[MUL_310:%.*]] = or i32 [[TMP3]], [[MUL_29]]
 ; SSE2-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
-; SSE2-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4
-; SSE2-NEXT:    [[MUL_411:%.*]] = or i32 [[TMP4]], [[MUL_310]]
 ; SSE2-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
-; SSE2-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_5]], align 4
-; SSE2-NEXT:    [[MUL_512:%.*]] = or i32 [[TMP5]], [[MUL_411]]
 ; SSE2-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
-; SSE2-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_6]], align 4
-; SSE2-NEXT:    [[MUL_613:%.*]] = or i32 [[TMP6]], [[MUL_512]]
 ; SSE2-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
-; SSE2-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_7]], align 4
-; SSE2-NEXT:    [[MUL_714:%.*]] = or i32 [[TMP7]], [[MUL_613]]
-; SSE2-NEXT:    ret i32 [[MUL_714]]
+; SSE2-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
+; SSE2-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
+; SSE2-NEXT:    [[MUL_18:%.*]] = or i32 undef, undef
+; SSE2-NEXT:    [[MUL_29:%.*]] = or i32 undef, [[MUL_18]]
+; SSE2-NEXT:    [[MUL_310:%.*]] = or i32 undef, [[MUL_29]]
+; SSE2-NEXT:    [[MUL_411:%.*]] = or i32 undef, [[MUL_310]]
+; SSE2-NEXT:    [[MUL_512:%.*]] = or i32 undef, [[MUL_411]]
+; SSE2-NEXT:    [[MUL_613:%.*]] = or i32 undef, [[MUL_512]]
+; SSE2-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:    [[BIN_RDX:%.*]] = or <8 x i32> [[TMP1]], [[RDX_SHUF]]
+; SSE2-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:    [[BIN_RDX2:%.*]] = or <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; SSE2-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:    [[BIN_RDX4:%.*]] = or <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; SSE2-NEXT:    [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
+; SSE2-NEXT:    [[MUL_714:%.*]] = or i32 undef, [[MUL_613]]
+; SSE2-NEXT:    ret i32 [[TMP2]]
 ;
 entry:
   %0 = load i32, i32* %p, align 4
@@ -389,29 +391,30 @@ define i32 @test_xor(i32* nocapture readonly %p) {
 ;
 ; SSE2-LABEL: @test_xor(
 ; SSE2-NEXT:  entry:
-; SSE2-NEXT:    [[TMP0:%.*]] = load i32, i32* [[P:%.*]], align 4
-; SSE2-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1
-; SSE2-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
-; SSE2-NEXT:    [[MUL_18:%.*]] = xor i32 [[TMP1]], [[TMP0]]
+; SSE2-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
 ; SSE2-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
-; SSE2-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
-; SSE2-NEXT:    [[MUL_29:%.*]] = xor i32 [[TMP2]], [[MUL_18]]
 ; SSE2-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
-; SSE2-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
-; SSE2-NEXT:    [[MUL_310:%.*]] = xor i32 [[TMP3]], [[MUL_29]]
 ; SSE2-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
-; SSE2-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4
-; SSE2-NEXT:    [[MUL_411:%.*]] = xor i32 [[TMP4]], [[MUL_310]]
 ; SSE2-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
-; SSE2-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_5]], align 4
-; SSE2-NEXT:    [[MUL_512:%.*]] = xor i32 [[TMP5]], [[MUL_411]]
 ; SSE2-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
-; SSE2-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_6]], align 4
-; SSE2-NEXT:    [[MUL_613:%.*]] = xor i32 [[TMP6]], [[MUL_512]]
 ; SSE2-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
-; SSE2-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_7]], align 4
-; SSE2-NEXT:    [[MUL_714:%.*]] = xor i32 [[TMP7]], [[MUL_613]]
-; SSE2-NEXT:    ret i32 [[MUL_714]]
+; SSE2-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
+; SSE2-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
+; SSE2-NEXT:    [[MUL_18:%.*]] = xor i32 undef, undef
+; SSE2-NEXT:    [[MUL_29:%.*]] = xor i32 undef, [[MUL_18]]
+; SSE2-NEXT:    [[MUL_310:%.*]] = xor i32 undef, [[MUL_29]]
+; SSE2-NEXT:    [[MUL_411:%.*]] = xor i32 undef, [[MUL_310]]
+; SSE2-NEXT:    [[MUL_512:%.*]] = xor i32 undef, [[MUL_411]]
+; SSE2-NEXT:    [[MUL_613:%.*]] = xor i32 undef, [[MUL_512]]
+; SSE2-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:    [[BIN_RDX:%.*]] = xor <8 x i32> [[TMP1]], [[RDX_SHUF]]
+; SSE2-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:    [[BIN_RDX2:%.*]] = xor <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; SSE2-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:    [[BIN_RDX4:%.*]] = xor <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; SSE2-NEXT:    [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
+; SSE2-NEXT:    [[MUL_714:%.*]] = xor i32 undef, [[MUL_613]]
+; SSE2-NEXT:    ret i32 [[TMP2]]
 ;
 entry:
   %0 = load i32, i32* %p, align 4
-- 
GitLab


From e3237d202fb0a963b01d491150c6d6d24d1bffb4 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 23 Oct 2018 15:37:19 +0000
Subject: [PATCH 0446/1116] [LegalizeDAG] Share Vector/Scalar CTTZ Expansion

As suggested on D53258, this patch demonstrates sharing common CTTZ expansion code between VectorLegalizer and SelectionDAGLegalize by putting it in TargetLowering.

I intend to move CTLZ and (scalar) CTPOP over as well and then update D53258 accordingly.

Differential Revision: https://reviews.llvm.org/D53474

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345039 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/TargetLowering.h         |  7 +++
 lib/CodeGen/SelectionDAG/LegalizeDAG.cpp      | 36 ++----------
 .../SelectionDAG/LegalizeVectorOps.cpp        | 20 ++-----
 lib/CodeGen/SelectionDAG/TargetLowering.cpp   | 55 +++++++++++++++++++
 4 files changed, 71 insertions(+), 47 deletions(-)

diff --git a/include/llvm/CodeGen/TargetLowering.h b/include/llvm/CodeGen/TargetLowering.h
index 2a02ac1ecb0..bcfe0fd6e74 100644
--- a/include/llvm/CodeGen/TargetLowering.h
+++ b/include/llvm/CodeGen/TargetLowering.h
@@ -3647,6 +3647,13 @@ public:
   /// Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
   SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const;
 
+  /// Expand CTTZ/CTTZ_ZERO_UNDEF nodes. Expands vector/scalar CTTZ nodes,
+  /// vector nodes can only succeed if all operations are legal/custom.
+  /// \param N Node to expand
+  /// \param Result output after conversion
+  /// \returns True, if the expansion was successful, false otherwise
+  bool expandCTTZ(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;
+
   /// Turn load of vector type into a load of the individual elements.
   /// \param LD load to expand
   /// \returns MERGE_VALUEs of the scalar loads with their chains.
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 7b25d9f98ff..e03263a9948 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2789,35 +2789,6 @@ SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op,
     Op = DAG.getNOT(dl, Op, VT);
     return DAG.getNode(ISD::CTPOP, dl, VT, Op);
   }
-  case ISD::CTTZ_ZERO_UNDEF:
-    // This trivially expands to CTTZ.
-    return DAG.getNode(ISD::CTTZ, dl, VT, Op);
-  case ISD::CTTZ: {
-    if (TLI.isOperationLegalOrCustom(ISD::CTTZ_ZERO_UNDEF, VT)) {
-      EVT SetCCVT = getSetCCResultType(VT);
-      SDValue CTTZ = DAG.getNode(ISD::CTTZ_ZERO_UNDEF, dl, VT, Op);
-      SDValue Zero = DAG.getConstant(0, dl, VT);
-      SDValue SrcIsZero = DAG.getSetCC(dl, SetCCVT, Op, Zero, ISD::SETEQ);
-      return DAG.getNode(ISD::SELECT, dl, VT, SrcIsZero,
-                         DAG.getConstant(Len, dl, VT), CTTZ);
-    }
-
-    // for now, we use: { return popcount(~x & (x - 1)); }
-    // unless the target has ctlz but not ctpop, in which case we use:
-    // { return 32 - nlz(~x & (x-1)); }
-    // Ref: "Hacker's Delight" by Henry Warren
-    SDValue Tmp3 = DAG.getNode(ISD::AND, dl, VT,
-                               DAG.getNOT(dl, Op, VT),
-                               DAG.getNode(ISD::SUB, dl, VT, Op,
-                                           DAG.getConstant(1, dl, VT)));
-    // If ISD::CTLZ is legal and CTPOP isn't, then do that instead.
-    if (!TLI.isOperationLegal(ISD::CTPOP, VT) &&
-        TLI.isOperationLegal(ISD::CTLZ, VT))
-      return DAG.getNode(ISD::SUB, dl, VT,
-                         DAG.getConstant(Len, dl, VT),
-                         DAG.getNode(ISD::CTLZ, dl, VT, Tmp3));
-    return DAG.getNode(ISD::CTPOP, dl, VT, Tmp3);
-  }
   }
 }
 
@@ -2831,11 +2802,14 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
   case ISD::CTPOP:
   case ISD::CTLZ:
   case ISD::CTLZ_ZERO_UNDEF:
-  case ISD::CTTZ:
-  case ISD::CTTZ_ZERO_UNDEF:
     Tmp1 = ExpandBitCount(Node->getOpcode(), Node->getOperand(0), dl);
     Results.push_back(Tmp1);
     break;
+  case ISD::CTTZ:
+  case ISD::CTTZ_ZERO_UNDEF:
+    if (TLI.expandCTTZ(Node, Tmp1, DAG))
+      Results.push_back(Tmp1);
+    break;
   case ISD::BITREVERSE:
     Results.push_back(ExpandBITREVERSE(Node->getOperand(0), dl));
     break;
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 787091a7f4c..d0fb3ea2d30 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -1105,23 +1105,11 @@ SDValue VectorLegalizer::ExpandCTLZ(SDValue Op) {
 
 SDValue VectorLegalizer::ExpandCTTZ(SDValue Op) {
   EVT VT = Op.getValueType();
-  unsigned NumBitsPerElt = VT.getScalarSizeInBits();
-
-  // If the non-ZERO_UNDEF version is supported we can use that instead.
-  if (TLI.isOperationLegalOrCustom(ISD::CTTZ, VT)) {
-    SDLoc DL(Op);
-    return DAG.getNode(ISD::CTTZ, DL, VT, Op.getOperand(0));
-  }
 
-  // If we have the appropriate vector bit operations, it is better to use them
-  // than unrolling and expanding each component.
-  if (isPowerOf2_32(NumBitsPerElt) &&
-      (TLI.isOperationLegalOrCustom(ISD::CTPOP, VT) ||
-       TLI.isOperationLegalOrCustom(ISD::CTLZ, VT)) &&
-      TLI.isOperationLegalOrCustom(ISD::SUB, VT) &&
-      TLI.isOperationLegalOrCustomOrPromote(ISD::AND, VT) &&
-      TLI.isOperationLegalOrCustomOrPromote(ISD::XOR, VT))
-    return Op;
+  // Attempt to expand using TargetLowering.
+  SDValue Result;
+  if (TLI.expandCTTZ(Op.getNode(), Result, DAG))
+    return Result;
 
   // Otherwise go ahead and unroll.
   return DAG.UnrollVectorOp(Op.getNode());
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index d31d6344519..b9b0941903b 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -4142,6 +4142,61 @@ SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node,
   return SDValue();
 }
 
+bool TargetLowering::expandCTTZ(SDNode *Node, SDValue &Result,
+                                SelectionDAG &DAG) const {
+  SDLoc dl(Node);
+  EVT VT = Node->getValueType(0);
+  SDValue Op = Node->getOperand(0);
+  unsigned NumBitsPerElt = VT.getScalarSizeInBits();
+
+  // If the non-ZERO_UNDEF version is supported we can use that instead.
+  if (Node->getOpcode() == ISD::CTTZ_ZERO_UNDEF &&
+      isOperationLegalOrCustom(ISD::CTTZ, VT)) {
+    Result = DAG.getNode(ISD::CTTZ, dl, VT, Op);
+    return true;
+  }
+
+  // If the ZERO_UNDEF version is supported use that and handle the zero case.
+  if (isOperationLegalOrCustom(ISD::CTTZ_ZERO_UNDEF, VT)) {
+    EVT SetCCVT =
+        getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+    SDValue CTTZ = DAG.getNode(ISD::CTTZ_ZERO_UNDEF, dl, VT, Op);
+    SDValue Zero = DAG.getConstant(0, dl, VT);
+    SDValue SrcIsZero = DAG.getSetCC(dl, SetCCVT, Op, Zero, ISD::SETEQ);
+    Result = DAG.getNode(ISD::SELECT, dl, VT, SrcIsZero,
+                         DAG.getConstant(NumBitsPerElt, dl, VT), CTTZ);
+    return true;
+  }
+
+  // Only expand vector types if we have the appropriate vector bit operations.
+  if (VT.isVector() && (!isPowerOf2_32(NumBitsPerElt) ||
+                        (!isOperationLegalOrCustom(ISD::CTPOP, VT) &&
+                         !isOperationLegalOrCustom(ISD::CTLZ, VT)) ||
+                        !isOperationLegalOrCustom(ISD::SUB, VT) ||
+                        !isOperationLegalOrCustomOrPromote(ISD::AND, VT) ||
+                        !isOperationLegalOrCustomOrPromote(ISD::XOR, VT)))
+    return false;
+
+  // for now, we use: { return popcount(~x & (x - 1)); }
+  // unless the target has ctlz but not ctpop, in which case we use:
+  // { return 32 - nlz(~x & (x-1)); }
+  // Ref: "Hacker's Delight" by Henry Warren
+  SDValue Tmp = DAG.getNode(
+      ISD::AND, dl, VT, DAG.getNOT(dl, Op, VT),
+      DAG.getNode(ISD::SUB, dl, VT, Op, DAG.getConstant(1, dl, VT)));
+
+  // If ISD::CTLZ is legal and CTPOP isn't, then do that instead.
+  if (isOperationLegal(ISD::CTLZ, VT) && !isOperationLegal(ISD::CTPOP, VT)) {
+    Result =
+        DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(NumBitsPerElt, dl, VT),
+                    DAG.getNode(ISD::CTLZ, dl, VT, Tmp));
+    return true;
+  }
+
+  Result = DAG.getNode(ISD::CTPOP, dl, VT, Tmp);
+  return true;
+}
+
 SDValue TargetLowering::scalarizeVectorLoad(LoadSDNode *LD,
                                             SelectionDAG &DAG) const {
   SDLoc SL(LD);
-- 
GitLab


From 8681ecc0ffb1d778d2af4a1127feda90e9e37241 Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <benny.kra@googlemail.com>
Date: Tue, 23 Oct 2018 15:43:36 +0000
Subject: [PATCH 0447/1116] [LegalizeDAG] Remove unused variable

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345040 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index d0fb3ea2d30..9f18920a8a1 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -1104,8 +1104,6 @@ SDValue VectorLegalizer::ExpandCTLZ(SDValue Op) {
 }
 
 SDValue VectorLegalizer::ExpandCTTZ(SDValue Op) {
-  EVT VT = Op.getValueType();
-
   // Attempt to expand using TargetLowering.
   SDValue Result;
   if (TLI.expandCTTZ(Op.getNode(), Result, DAG))
-- 
GitLab


From 3a1d78fdddff2f7c63f737b926947da8e4c7174e Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 23 Oct 2018 15:46:10 +0000
Subject: [PATCH 0448/1116] [SelectionDAG] use 'match' to simplify code; NFC

Vector types are not possible here because this code only starts
matching from the scalar bool value of a conditional branch, but
this is another step towards completely removing the fake binop
queries for not/neg/fneg.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345041 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 71814d79098..87921ccb074 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -88,6 +88,7 @@
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/Statepoint.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/User.h"
@@ -121,6 +122,7 @@
 #include <vector>
 
 using namespace llvm;
+using namespace PatternMatch;
 
 #define DEBUG_TYPE "isel"
 
@@ -1824,7 +1826,6 @@ SelectionDAGBuilder::EmitBranchForMergedCondition(const Value *Cond,
   SwitchCases.push_back(CB);
 }
 
-/// FindMergedConditions - If Cond is an expression like
 void SelectionDAGBuilder::FindMergedConditions(const Value *Cond,
                                                MachineBasicBlock *TBB,
                                                MachineBasicBlock *FBB,
@@ -1836,13 +1837,12 @@ void SelectionDAGBuilder::FindMergedConditions(const Value *Cond,
                                                bool InvertCond) {
   // Skip over not part of the tree and remember to invert op and operands at
   // next level.
-  if (BinaryOperator::isNot(Cond) && Cond->hasOneUse()) {
-    const Value *CondOp = BinaryOperator::getNotArgument(Cond);
-    if (InBlock(CondOp, CurBB->getBasicBlock())) {
-      FindMergedConditions(CondOp, TBB, FBB, CurBB, SwitchBB, Opc, TProb, FProb,
-                           !InvertCond);
-      return;
-    }
+  Value *NotCond;
+  if (match(Cond, m_OneUse(m_Not(m_Value(NotCond)))) &&
+      InBlock(NotCond, CurBB->getBasicBlock())) {
+    FindMergedConditions(NotCond, TBB, FBB, CurBB, SwitchBB, Opc, TProb, FProb,
+                         !InvertCond);
+    return;
   }
 
   const Instruction *BOp = dyn_cast<Instruction>(Cond);
-- 
GitLab


From 1a72ab0867cc1d40a868fe189a47398ea2280ecf Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 23 Oct 2018 15:55:06 +0000
Subject: [PATCH 0449/1116] [Reassociate] replace fake binop queries with
 'match' API

We need to update this code before introducing an 'fneg' instruction in IR,
so we might as well kill off the integer neg/not queries too.

This is no-functional-change-intended for scalar code and most vector code.
For vectors, we can see that the 'match' API allows for undef elements in
constants, so we optimize those cases better.

Ideally, there would be a test for each code diff, but I don't see evidence
of that for the existing code, so I didn't try very hard to come up with new
vector tests for each code change.

Differential Revision: https://reviews.llvm.org/D53533


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345042 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/Reassociate.cpp   | 34 ++++++++++++-------------
 test/Transforms/Reassociate/inverses.ll |  5 +---
 test/Transforms/Reassociate/negation.ll |  5 ++--
 3 files changed, 19 insertions(+), 25 deletions(-)

diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp
index 78e0fe47be0..c4faab7c58c 100644
--- a/lib/Transforms/Scalar/Reassociate.cpp
+++ b/lib/Transforms/Scalar/Reassociate.cpp
@@ -205,9 +205,9 @@ unsigned ReassociatePass::getRank(Value *V) {
   for (unsigned i = 0, e = I->getNumOperands(); i != e && Rank != MaxRank; ++i)
     Rank = std::max(Rank, getRank(I->getOperand(i)));
 
-  // If this is a not or neg instruction, do not count it for rank.  This
+  // If this is a 'not' or 'neg' instruction, do not count it for rank. This
   // assures us that X and ~X will have the same rank.
-  if (!BinaryOperator::isNot(I) && !BinaryOperator::isNeg(I) &&
+  if (!match(I, m_Not(m_Value())) && !match(I, m_Neg(m_Value())) &&
       !BinaryOperator::isFNeg(I))
     ++Rank;
 
@@ -574,7 +574,7 @@ static bool LinearizeExprTree(BinaryOperator *I,
       // If this is a multiply expression, turn any internal negations into
       // multiplies by -1 so they can be reassociated.
       if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Op))
-        if ((Opcode == Instruction::Mul && BinaryOperator::isNeg(BO)) ||
+        if ((Opcode == Instruction::Mul && match(BO, m_Neg(m_Value()))) ||
             (Opcode == Instruction::FMul && BinaryOperator::isFNeg(BO))) {
           LLVM_DEBUG(dbgs()
                      << "MORPH LEAF: " << *Op << " (" << Weight << ") TO ");
@@ -855,7 +855,7 @@ static Value *NegateValue(Value *V, Instruction *BI,
   // Okay, we need to materialize a negated version of V with an instruction.
   // Scan the use lists of V to see if we have one already.
   for (User *U : V->users()) {
-    if (!BinaryOperator::isNeg(U) && !BinaryOperator::isFNeg(U))
+    if (!match(U, m_Neg(m_Value())) && !BinaryOperator::isFNeg(U))
       continue;
 
     // We found one!  Now we have to make sure that the definition dominates
@@ -900,7 +900,7 @@ static Value *NegateValue(Value *V, Instruction *BI,
 /// Return true if we should break up this subtract of X-Y into (X + -Y).
 static bool ShouldBreakUpSubtract(Instruction *Sub) {
   // If this is a negation, we can't split it up!
-  if (BinaryOperator::isNeg(Sub) || BinaryOperator::isFNeg(Sub))
+  if (match(Sub, m_Neg(m_Value())) || BinaryOperator::isFNeg(Sub))
     return false;
 
   // Don't breakup X - undef.
@@ -1114,8 +1114,8 @@ static Value *OptimizeAndOrXor(unsigned Opcode,
   for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
     // First, check for X and ~X in the operand list.
     assert(i < Ops.size());
-    if (BinaryOperator::isNot(Ops[i].Op)) {    // Cannot occur for ^.
-      Value *X = BinaryOperator::getNotArgument(Ops[i].Op);
+    Value *X;
+    if (match(Ops[i].Op, m_Not(m_Value(X)))) {    // Cannot occur for ^.
       unsigned FoundX = FindInOperandList(Ops, i, X);
       if (FoundX != i) {
         if (Opcode == Instruction::And)   // ...&X&~X = 0
@@ -1461,15 +1461,13 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I,
     }
 
     // Check for X and -X or X and ~X in the operand list.
-    if (!BinaryOperator::isNeg(TheOp) && !BinaryOperator::isFNeg(TheOp) &&
-        !BinaryOperator::isNot(TheOp))
+    Value *X;
+    if (!match(TheOp, m_Neg(m_Value(X))) && !match(TheOp, m_Not(m_Value(X))) &&
+        !BinaryOperator::isFNeg(TheOp))
       continue;
 
-    Value *X = nullptr;
-    if (BinaryOperator::isNeg(TheOp) || BinaryOperator::isFNeg(TheOp))
-      X = BinaryOperator::getNegArgument(TheOp);
-    else if (BinaryOperator::isNot(TheOp))
-      X = BinaryOperator::getNotArgument(TheOp);
+    if (BinaryOperator::isFNeg(TheOp))
+      X = BinaryOperator::getFNegArgument(TheOp);
 
     unsigned FoundX = FindInOperandList(Ops, i, X);
     if (FoundX == i)
@@ -1477,11 +1475,11 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I,
 
     // Remove X and -X from the operand list.
     if (Ops.size() == 2 &&
-        (BinaryOperator::isNeg(TheOp) || BinaryOperator::isFNeg(TheOp)))
+        (match(TheOp, m_Neg(m_Value())) || BinaryOperator::isFNeg(TheOp)))
       return Constant::getNullValue(X->getType());
 
     // Remove X and ~X from the operand list.
-    if (Ops.size() == 2 && BinaryOperator::isNot(TheOp))
+    if (Ops.size() == 2 && match(TheOp, m_Not(m_Value())))
       return Constant::getAllOnesValue(X->getType());
 
     Ops.erase(Ops.begin()+i);
@@ -1495,7 +1493,7 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I,
     e -= 2;  // Removed two elements.
 
     // if X and ~X we append -1 to the operand list.
-    if (BinaryOperator::isNot(TheOp)) {
+    if (match(TheOp, m_Not(m_Value()))) {
       Value *V = Constant::getAllOnesValue(X->getType());
       Ops.insert(Ops.end(), ValueEntry(getRank(V), V));
       e += 1;
@@ -2059,7 +2057,7 @@ void ReassociatePass::OptimizeInst(Instruction *I) {
       RedoInsts.insert(I);
       MadeChange = true;
       I = NI;
-    } else if (BinaryOperator::isNeg(I)) {
+    } else if (match(I, m_Neg(m_Value()))) {
       // Otherwise, this is a negation.  See if the operand is a multiply tree
       // and if this is not an inner node of a multiply tree.
       if (isReassociableOp(I->getOperand(1), Instruction::Mul) &&
diff --git a/test/Transforms/Reassociate/inverses.ll b/test/Transforms/Reassociate/inverses.ll
index 15c77206e72..14753b1724b 100644
--- a/test/Transforms/Reassociate/inverses.ll
+++ b/test/Transforms/Reassociate/inverses.ll
@@ -14,10 +14,7 @@ define i32 @test1(i32 %a, i32 %b) {
 
 define <2 x i32> @not_op_vec_undef(<2 x i32> %a, <2 x i32> %b) {
 ; CHECK-LABEL: @not_op_vec_undef(
-; CHECK-NEXT:    [[T2:%.*]] = and <2 x i32> [[B:%.*]], [[A:%.*]]
-; CHECK-NEXT:    [[T4:%.*]] = xor <2 x i32> [[A]], <i32 -1, i32 undef>
-; CHECK-NEXT:    [[T5:%.*]] = and <2 x i32> [[T2]], [[T4]]
-; CHECK-NEXT:    ret <2 x i32> [[T5]]
+; CHECK-NEXT:    ret <2 x i32> zeroinitializer
 ;
   %t2 = and <2 x i32> %b, %a
   %t4 = xor <2 x i32> %a, <i32 -1, i32 undef>
diff --git a/test/Transforms/Reassociate/negation.ll b/test/Transforms/Reassociate/negation.ll
index 59b7b5dca1d..f443083ff3f 100644
--- a/test/Transforms/Reassociate/negation.ll
+++ b/test/Transforms/Reassociate/negation.ll
@@ -33,9 +33,8 @@ define i32 @test2(i32 %a, i32 %b, i32 %z) {
 
 define <2 x i32> @negate_vec_undefs(<2 x i32> %a, <2 x i32> %b, <2 x i32> %z) {
 ; CHECK-LABEL: @negate_vec_undefs(
-; CHECK-NEXT:    [[TMP1:%.*]] = mul <2 x i32> [[Z:%.*]], <i32 -40, i32 -40>
-; CHECK-NEXT:    [[E:%.*]] = mul <2 x i32> [[TMP1]], [[A:%.*]]
-; CHECK-NEXT:    [[F:%.*]] = sub <2 x i32> <i32 0, i32 undef>, [[E]]
+; CHECK-NEXT:    [[E:%.*]] = mul <2 x i32> [[A:%.*]], <i32 40, i32 40>
+; CHECK-NEXT:    [[F:%.*]] = mul <2 x i32> [[E]], [[Z:%.*]]
 ; CHECK-NEXT:    ret <2 x i32> [[F]]
 ;
   %d = mul <2 x i32> %z, <i32 40, i32 40>
-- 
GitLab


From c924df94b9c9dee68d65dfacde0888748c16dc1f Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 23 Oct 2018 16:05:09 +0000
Subject: [PATCH 0450/1116] [WebAssembly] use 'match' to simplify code; NFC

Vector types are not possible here because this code explicitly
checks for a scalar type, but this is another step towards
completely removing the fake binop queries for not/neg/fneg.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345043 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/WebAssembly/WebAssemblyFastISel.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
index 0be4f228347..5611a1b4588 100644
--- a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
@@ -37,7 +37,10 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
+
 using namespace llvm;
+using namespace PatternMatch;
 
 #define DEBUG_TYPE "wasm-fastisel"
 
@@ -417,9 +420,10 @@ unsigned WebAssemblyFastISel::getRegForI1Value(const Value *V, bool &Not) {
         return getRegForValue(ICmp->getOperand(0));
       }
 
-  if (BinaryOperator::isNot(V) && V->getType()->isIntegerTy(32)) {
+  Value *NotV;
+  if (match(V, m_Not(m_Value(NotV))) && V->getType()->isIntegerTy(32)) {
     Not = true;
-    return getRegForValue(BinaryOperator::getNotArgument(V));
+    return getRegForValue(NotV);
   }
 
   Not = false;
-- 
GitLab


From 922e1373b54f7b5584ada32dd5ce38cd613f6d6c Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 23 Oct 2018 16:27:14 +0000
Subject: [PATCH 0451/1116] [CostModel][X86] Add transpose shuffle cost tests

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345045 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../CostModel/X86/shuffle-transpose.ll        | 164 ++++++++++++++++++
 1 file changed, 164 insertions(+)
 create mode 100644 test/Analysis/CostModel/X86/shuffle-transpose.ll

diff --git a/test/Analysis/CostModel/X86/shuffle-transpose.ll b/test/Analysis/CostModel/X86/shuffle-transpose.ll
new file mode 100644
index 00000000000..2a846bf0269
--- /dev/null
+++ b/test/Analysis/CostModel/X86/shuffle-transpose.ll
@@ -0,0 +1,164 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+sse2 | FileCheck %s -check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+ssse3 | FileCheck %s -check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+sse4.2 | FileCheck %s -check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx | FileCheck %s -check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx2 | FileCheck %s -check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512f,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512VBMI
+;
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mcpu=btver2 | FileCheck %s --check-prefixes=BTVER2
+
+;
+; Verify the cost model for transpose shuffles.
+;
+
+define void @test_vXf64(<2 x double> %a128, <2 x double> %b128, <4 x double> %a256, <4 x double> %b256, <8 x double> %a512, <8 x double> %b512) {
+; SSE-LABEL: 'test_vXf64'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <2 x double> %a128, <2 x double> %b128, <2 x i32> <i32 0, i32 2>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <4 x double> %a256, <4 x double> %b256, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <8 x double> %a512, <8 x double> %b512, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX-LABEL: 'test_vXf64'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <2 x double> %a128, <2 x double> %b128, <2 x i32> <i32 0, i32 2>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <4 x double> %a256, <4 x double> %b256, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V512 = shufflevector <8 x double> %a512, <8 x double> %b512, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'test_vXf64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <2 x double> %a128, <2 x double> %b128, <2 x i32> <i32 0, i32 2>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <4 x double> %a256, <4 x double> %b256, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <8 x double> %a512, <8 x double> %b512, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'test_vXf64'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <2 x double> %a128, <2 x double> %b128, <2 x i32> <i32 0, i32 2>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <4 x double> %a256, <4 x double> %b256, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V512 = shufflevector <8 x double> %a512, <8 x double> %b512, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %V128 = shufflevector <2 x double> %a128, <2 x double> %b128, <2 x i32> <i32 0, i32 2>
+  %V256 = shufflevector <4 x double> %a256, <4 x double> %b256, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  %V512 = shufflevector <8 x double> %a512, <8 x double> %b512, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  ret void
+}
+
+define void @test_vXi64(<2 x i64> %a128, <2 x i64> %b128, <4 x i64> %a256, <4 x i64> %b256, <8 x i64> %a512, <8 x i64> %b512) {
+; CHECK-LABEL: 'test_vXi64'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = shufflevector <2 x i64> %a128, <2 x i64> %b128, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256 = shufflevector <4 x i64> %a256, <4 x i64> %b256, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V512 = shufflevector <8 x i64> %a512, <8 x i64> %b512, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'test_vXi64'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = shufflevector <2 x i64> %a128, <2 x i64> %b128, <2 x i32> <i32 0, i32 2>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256 = shufflevector <4 x i64> %a256, <4 x i64> %b256, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V512 = shufflevector <8 x i64> %a512, <8 x i64> %b512, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %V128 = shufflevector <2 x i64> %a128, <2 x i64> %b128, <2 x i32> <i32 0, i32 2>
+  %V256 = shufflevector <4 x i64> %a256, <4 x i64> %b256, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  %V512 = shufflevector <8 x i64> %a512, <8 x i64> %b512, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  ret void
+}
+
+define void @test_vXf32(<2 x float> %a64, <2 x float> %b64, <4 x float> %a128, <4 x float> %b128, <8 x float> %a256, <8 x float> %b256, <16 x float> %a512, <16 x float> %b512) {
+; SSE-LABEL: 'test_vXf32'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 0, i32 2>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX-LABEL: 'test_vXf32'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 0, i32 2>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'test_vXf32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 0, i32 2>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'test_vXf32'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 0, i32 2>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 0, i32 2>
+  %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+  ret void
+}
+
+define void @test_vXi32(<2 x i32> %a64, <2 x i32> %b64, <4 x i32> %a128, <4 x i32> %b128, <8 x i32> %a256, <8 x i32> %b256, <16 x i32> %a512, <16 x i32> %b512) {
+; CHECK-LABEL: 'test_vXi32'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'test_vXi32'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 0, i32 2>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 0, i32 2>
+  %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+  ret void
+}
+
+define void @test_vXi16(<8 x i16> %a128, <8 x i16> %b128, <16 x i16> %a256, <16 x i16> %b256, <32 x i16> %a512, <32 x i16> %b512) {
+; CHECK-LABEL: 'test_vXi16'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = shufflevector <8 x i16> %a128, <8 x i16> %b128, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V256 = shufflevector <16 x i16> %a256, <16 x i16> %b256, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V512 = shufflevector <32 x i16> %a512, <32 x i16> %b512, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'test_vXi16'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = shufflevector <8 x i16> %a128, <8 x i16> %b128, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V256 = shufflevector <16 x i16> %a256, <16 x i16> %b256, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V512 = shufflevector <32 x i16> %a512, <32 x i16> %b512, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %V128 = shufflevector <8 x i16> %a128, <8 x i16> %b128, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  %V256 = shufflevector <16 x i16> %a256, <16 x i16> %b256, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+  %V512 = shufflevector <32 x i16> %a512, <32 x i16> %b512, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+  ret void
+}
+
+define void @test_vXi8(<16 x i8> %a128, <16 x i8> %b128, <32 x i8> %a256, <32 x i8> %b256, <64 x i8> %a512, <64 x i8> %b512) {
+; CHECK-LABEL: 'test_vXi8'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V128 = shufflevector <16 x i8> %a128, <16 x i8> %b128, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V256 = shufflevector <32 x i8> %a256, <32 x i8> %b256, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V512 = shufflevector <64 x i8> %a512, <64 x i8> %b512, <64 x i32> <i32 0, i32 64, i32 2, i32 66, i32 4, i32 68, i32 6, i32 70, i32 8, i32 72, i32 10, i32 74, i32 12, i32 76, i32 14, i32 78, i32 16, i32 80, i32 18, i32 82, i32 20, i32 84, i32 22, i32 86, i32 24, i32 88, i32 26, i32 90, i32 28, i32 92, i32 30, i32 94, i32 32, i32 96, i32 34, i32 98, i32 36, i32 100, i32 38, i32 102, i32 40, i32 104, i32 42, i32 106, i32 44, i32 108, i32 46, i32 110, i32 48, i32 112, i32 50, i32 114, i32 52, i32 116, i32 54, i32 118, i32 56, i32 120, i32 58, i32 122, i32 60, i32 124, i32 62, i32 126>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'test_vXi8'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V128 = shufflevector <16 x i8> %a128, <16 x i8> %b128, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V256 = shufflevector <32 x i8> %a256, <32 x i8> %b256, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V512 = shufflevector <64 x i8> %a512, <64 x i8> %b512, <64 x i32> <i32 0, i32 64, i32 2, i32 66, i32 4, i32 68, i32 6, i32 70, i32 8, i32 72, i32 10, i32 74, i32 12, i32 76, i32 14, i32 78, i32 16, i32 80, i32 18, i32 82, i32 20, i32 84, i32 22, i32 86, i32 24, i32 88, i32 26, i32 90, i32 28, i32 92, i32 30, i32 94, i32 32, i32 96, i32 34, i32 98, i32 36, i32 100, i32 38, i32 102, i32 40, i32 104, i32 42, i32 106, i32 44, i32 108, i32 46, i32 110, i32 48, i32 112, i32 50, i32 114, i32 52, i32 116, i32 54, i32 118, i32 56, i32 120, i32 58, i32 122, i32 60, i32 124, i32 62, i32 126>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %V128 = shufflevector <16 x i8> %a128, <16 x i8> %b128, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+  %V256 = shufflevector <32 x i8> %a256, <32 x i8> %b256, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+  %V512 = shufflevector <64 x i8> %a512, <64 x i8> %b512, <64 x i32> <i32 0, i32 64, i32 2, i32 66, i32 4, i32 68, i32 6, i32 70, i32 8, i32 72, i32 10, i32 74, i32 12, i32 76, i32 14, i32 78, i32 16, i32 80, i32 18, i32 82, i32 20, i32 84, i32 22, i32 86, i32 24, i32 88, i32 26, i32 90, i32 28, i32 92, i32 30, i32 94, i32 32, i32 96, i32 34, i32 98, i32 36, i32 100, i32 38, i32 102, i32 40, i32 104, i32 42, i32 106, i32 44, i32 108, i32 46, i32 110, i32 48, i32 112, i32 50, i32 114, i32 52, i32 116, i32 54, i32 118, i32 56, i32 120, i32 58, i32 122, i32 60, i32 124, i32 62, i32 126>
+  ret void
+}
+
-- 
GitLab


From 22a8730763ca1f7ba62751c2767fe4042d681642 Mon Sep 17 00:00:00 2001
From: Jordan Rupprecht <rupprecht@google.com>
Date: Tue, 23 Oct 2018 16:35:51 +0000
Subject: [PATCH 0452/1116] [DebugInfo][GlobalOpt] Fix -debugify for globalopt
 shrinking globals to booleans.

Summary:
TryToShrinkGlobalToBoolean, when possible, will split store <value> + load <value> into store <bool> + select <bool ? value : 0>. This preserves DebugLoc during that pass.

Fixes PR37959. The test case here is the simplified .ll for:

```
static int foo;
int bar() {
  foo = 5;
  return foo;
}
```

Reviewers: dblaikie, gbedwell, aprantl

Reviewed By: dblaikie

Subscribers: mehdi_amini, JDevlieghere, dexonsmith, llvm-commits

Tags: #debug-info

Differential Revision: https://reviews.llvm.org/D53531

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345046 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/IPO/GlobalOpt.cpp              | 12 +++++++---
 .../shrink-global-to-bool-check-debug.ll      | 22 +++++++++++++++++++
 2 files changed, 31 insertions(+), 3 deletions(-)
 create mode 100644 test/Transforms/GlobalOpt/shrink-global-to-bool-check-debug.ll

diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp
index 5518ef8fce9..3005aafd06b 100644
--- a/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/lib/Transforms/IPO/GlobalOpt.cpp
@@ -1710,19 +1710,25 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {
           assert(isa<LoadInst>(StoreVal) && "Not a load of NewGV!");
         }
       }
-      new StoreInst(StoreVal, NewGV, false, 0,
-                    SI->getOrdering(), SI->getSyncScopeID(), SI);
+      StoreInst *NSI =
+          new StoreInst(StoreVal, NewGV, false, 0, SI->getOrdering(),
+                        SI->getSyncScopeID(), SI);
+      NSI->setDebugLoc(SI->getDebugLoc());
     } else {
       // Change the load into a load of bool then a select.
       LoadInst *LI = cast<LoadInst>(UI);
       LoadInst *NLI = new LoadInst(NewGV, LI->getName()+".b", false, 0,
                                    LI->getOrdering(), LI->getSyncScopeID(), LI);
-      Value *NSI;
+      Instruction *NSI;
       if (IsOneZero)
         NSI = new ZExtInst(NLI, LI->getType(), "", LI);
       else
         NSI = SelectInst::Create(NLI, OtherVal, InitVal, "", LI);
       NSI->takeName(LI);
+      // Since LI is split into two instructions, NLI and NSI both inherit the
+      // same DebugLoc
+      NLI->setDebugLoc(LI->getDebugLoc());
+      NSI->setDebugLoc(LI->getDebugLoc());
       LI->replaceAllUsesWith(NSI);
     }
     UI->eraseFromParent();
diff --git a/test/Transforms/GlobalOpt/shrink-global-to-bool-check-debug.ll b/test/Transforms/GlobalOpt/shrink-global-to-bool-check-debug.ll
new file mode 100644
index 00000000000..71019128bb1
--- /dev/null
+++ b/test/Transforms/GlobalOpt/shrink-global-to-bool-check-debug.ll
@@ -0,0 +1,22 @@
+;RUN: opt -S -debugify -globalopt -f %s | FileCheck %s
+
+@foo = internal global i32 0, align 4
+
+define dso_local i32 @bar() {
+entry:
+  store i32 5, i32* @foo, align 4
+  %0 = load i32, i32* @foo, align 4
+  ret i32 %0
+}
+
+;CHECK:      @bar
+;CHECK-NEXT: entry:
+;CHECK-NEXT:   store i1 true, i1* @foo, !dbg ![[DbgLocStore:[0-9]+]]
+;CHECK-NEXT:   %.b = load i1, i1* @foo, !dbg ![[DbgLocLoadSel:[0-9]+]]
+;CHECK-NEXT:   %0 = select i1 %.b, i32 5, i32 0, !dbg ![[DbgLocLoadSel]]
+;CHECK-NEXT:   call void @llvm.dbg.value({{.*}}), !dbg ![[DbgLocLoadSel]]
+;CHECK-NEXT:   ret i32 %0, !dbg ![[DbgLocRet:[0-9]+]]
+
+;CHECK: ![[DbgLocStore]] = !DILocation(line: 1,
+;CHECK: ![[DbgLocLoadSel]] = !DILocation(line: 2,
+;CHECK: ![[DbgLocRet]] = !DILocation(line: 3,
-- 
GitLab


From 6fffb2a18e148b4d01d15a2dcfe579e50aa9ebbb Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 23 Oct 2018 16:45:26 +0000
Subject: [PATCH 0453/1116] [TTI][X86] Treat SK_Transpose shuffles as
 SK_PermuteTwoSrc - there's no difference in lowering.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345048 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86TargetTransformInfo.cpp     |   4 +
 .../CostModel/X86/shuffle-transpose.ll        | 246 +++++++++++++-----
 2 files changed, 189 insertions(+), 61 deletions(-)

diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 82e4dfe25b7..ffc5a029040 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -810,6 +810,10 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
   // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
 
+  // Treat Transpose as 2-op shuffles - there's no difference in lowering.
+  if (Kind == TTI::SK_Transpose)
+    Kind = TTI::SK_PermuteTwoSrc;
+
   // For Broadcasts we are splatting the first element from the first input
   // register, so only need to reference that input and all the output
   // registers are the same.
diff --git a/test/Analysis/CostModel/X86/shuffle-transpose.ll b/test/Analysis/CostModel/X86/shuffle-transpose.ll
index 2a846bf0269..25a887604fa 100644
--- a/test/Analysis/CostModel/X86/shuffle-transpose.ll
+++ b/test/Analysis/CostModel/X86/shuffle-transpose.ll
@@ -18,27 +18,27 @@
 
 define void @test_vXf64(<2 x double> %a128, <2 x double> %b128, <4 x double> %a256, <4 x double> %b256, <8 x double> %a512, <8 x double> %b512) {
 ; SSE-LABEL: 'test_vXf64'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <2 x double> %a128, <2 x double> %b128, <2 x i32> <i32 0, i32 2>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <4 x double> %a256, <4 x double> %b256, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <8 x double> %a512, <8 x double> %b512, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %a128, <2 x double> %b128, <2 x i32> <i32 0, i32 2>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <4 x double> %a256, <4 x double> %b256, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V512 = shufflevector <8 x double> %a512, <8 x double> %b512, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_vXf64'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <2 x double> %a128, <2 x double> %b128, <2 x i32> <i32 0, i32 2>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <4 x double> %a256, <4 x double> %b256, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V512 = shufflevector <8 x double> %a512, <8 x double> %b512, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %a128, <2 x double> %b128, <2 x i32> <i32 0, i32 2>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <4 x double> %a256, <4 x double> %b256, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512 = shufflevector <8 x double> %a512, <8 x double> %b512, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXf64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <2 x double> %a128, <2 x double> %b128, <2 x i32> <i32 0, i32 2>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <4 x double> %a256, <4 x double> %b256, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <8 x double> %a512, <8 x double> %b512, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %a128, <2 x double> %b128, <2 x i32> <i32 0, i32 2>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %a256, <4 x double> %b256, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %a512, <8 x double> %b512, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; BTVER2-LABEL: 'test_vXf64'
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <2 x double> %a128, <2 x double> %b128, <2 x i32> <i32 0, i32 2>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <4 x double> %a256, <4 x double> %b256, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V512 = shufflevector <8 x double> %a512, <8 x double> %b512, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %a128, <2 x double> %b128, <2 x i32> <i32 0, i32 2>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <4 x double> %a256, <4 x double> %b256, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512 = shufflevector <8 x double> %a512, <8 x double> %b512, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %V128 = shufflevector <2 x double> %a128, <2 x double> %b128, <2 x i32> <i32 0, i32 2>
@@ -48,16 +48,28 @@ define void @test_vXf64(<2 x double> %a128, <2 x double> %b128, <4 x double> %a2
 }
 
 define void @test_vXi64(<2 x i64> %a128, <2 x i64> %b128, <4 x i64> %a256, <4 x i64> %b256, <8 x i64> %a512, <8 x i64> %b512) {
-; CHECK-LABEL: 'test_vXi64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = shufflevector <2 x i64> %a128, <2 x i64> %b128, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256 = shufflevector <4 x i64> %a256, <4 x i64> %b256, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V512 = shufflevector <8 x i64> %a512, <8 x i64> %b512, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE-LABEL: 'test_vXi64'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %a128, <2 x i64> %b128, <2 x i32> <i32 0, i32 2>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <4 x i64> %a256, <4 x i64> %b256, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V512 = shufflevector <8 x i64> %a512, <8 x i64> %b512, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX-LABEL: 'test_vXi64'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %a128, <2 x i64> %b128, <2 x i32> <i32 0, i32 2>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <4 x i64> %a256, <4 x i64> %b256, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512 = shufflevector <8 x i64> %a512, <8 x i64> %b512, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'test_vXi64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %a128, <2 x i64> %b128, <2 x i32> <i32 0, i32 2>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %a256, <4 x i64> %b256, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %a512, <8 x i64> %b512, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; BTVER2-LABEL: 'test_vXi64'
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = shufflevector <2 x i64> %a128, <2 x i64> %b128, <2 x i32> <i32 0, i32 2>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256 = shufflevector <4 x i64> %a256, <4 x i64> %b256, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V512 = shufflevector <8 x i64> %a512, <8 x i64> %b512, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %a128, <2 x i64> %b128, <2 x i32> <i32 0, i32 2>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <4 x i64> %a256, <4 x i64> %b256, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512 = shufflevector <8 x i64> %a512, <8 x i64> %b512, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %V128 = shufflevector <2 x i64> %a128, <2 x i64> %b128, <2 x i32> <i32 0, i32 2>
@@ -69,30 +81,37 @@ define void @test_vXi64(<2 x i64> %a128, <2 x i64> %b128, <4 x i64> %a256, <4 x
 define void @test_vXf32(<2 x float> %a64, <2 x float> %b64, <4 x float> %a128, <4 x float> %b128, <8 x float> %a256, <8 x float> %b256, <16 x float> %a512, <16 x float> %b512) {
 ; SSE-LABEL: 'test_vXf32'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 0, i32 2>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-; AVX-LABEL: 'test_vXf32'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 0, i32 2>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX1-LABEL: 'test_vXf32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 0, i32 2>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'test_vXf32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 0, i32 2>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXf32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 0, i32 2>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 0, i32 2>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; BTVER2-LABEL: 'test_vXf32'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 0, i32 2>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 0, i32 2>
@@ -103,18 +122,39 @@ define void @test_vXf32(<2 x float> %a64, <2 x float> %b64, <4 x float> %a128, <
 }
 
 define void @test_vXi32(<2 x i32> %a64, <2 x i32> %b64, <4 x i32> %a128, <4 x i32> %b128, <8 x i32> %a256, <8 x i32> %b256, <16 x i32> %a512, <16 x i32> %b512) {
-; CHECK-LABEL: 'test_vXi32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE-LABEL: 'test_vXi32'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 0, i32 2>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'test_vXi32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 0, i32 2>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'test_vXi32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 0, i32 2>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'test_vXi32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 0, i32 2>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; BTVER2-LABEL: 'test_vXi32'
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 0, i32 2>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 0, i32 2>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 0, i32 2>
@@ -125,16 +165,58 @@ define void @test_vXi32(<2 x i32> %a64, <2 x i32> %b64, <4 x i32> %a128, <4 x i3
 }
 
 define void @test_vXi16(<8 x i16> %a128, <8 x i16> %b128, <16 x i16> %a256, <16 x i16> %b256, <32 x i16> %a512, <32 x i16> %b512) {
-; CHECK-LABEL: 'test_vXi16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = shufflevector <8 x i16> %a128, <8 x i16> %b128, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V256 = shufflevector <16 x i16> %a256, <16 x i16> %b256, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V512 = shufflevector <32 x i16> %a512, <32 x i16> %b512, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE2-LABEL: 'test_vXi16'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128 = shufflevector <8 x i16> %a128, <8 x i16> %b128, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V256 = shufflevector <16 x i16> %a256, <16 x i16> %b256, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V512 = shufflevector <32 x i16> %a512, <32 x i16> %b512, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSSE3-LABEL: 'test_vXi16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <8 x i16> %a128, <8 x i16> %b128, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V256 = shufflevector <16 x i16> %a256, <16 x i16> %b256, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V512 = shufflevector <32 x i16> %a512, <32 x i16> %b512, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'test_vXi16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <8 x i16> %a128, <8 x i16> %b128, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V256 = shufflevector <16 x i16> %a256, <16 x i16> %b256, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V512 = shufflevector <32 x i16> %a512, <32 x i16> %b512, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'test_vXi16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <8 x i16> %a128, <8 x i16> %b128, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V256 = shufflevector <16 x i16> %a256, <16 x i16> %b256, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V512 = shufflevector <32 x i16> %a512, <32 x i16> %b512, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'test_vXi16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <8 x i16> %a128, <8 x i16> %b128, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V256 = shufflevector <16 x i16> %a256, <16 x i16> %b256, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V512 = shufflevector <32 x i16> %a512, <32 x i16> %b512, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512F-LABEL: 'test_vXi16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <8 x i16> %a128, <8 x i16> %b128, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V256 = shufflevector <16 x i16> %a256, <16 x i16> %b256, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V512 = shufflevector <32 x i16> %a512, <32 x i16> %b512, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512BW-LABEL: 'test_vXi16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %a128, <8 x i16> %b128, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %a256, <16 x i16> %b256, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %a512, <32 x i16> %b512, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512VBMI-LABEL: 'test_vXi16'
+; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %a128, <8 x i16> %b128, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %a256, <16 x i16> %b256, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %a512, <32 x i16> %b512, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; BTVER2-LABEL: 'test_vXi16'
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = shufflevector <8 x i16> %a128, <8 x i16> %b128, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V256 = shufflevector <16 x i16> %a256, <16 x i16> %b256, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V512 = shufflevector <32 x i16> %a512, <32 x i16> %b512, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <8 x i16> %a128, <8 x i16> %b128, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V256 = shufflevector <16 x i16> %a256, <16 x i16> %b256, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V512 = shufflevector <32 x i16> %a512, <32 x i16> %b512, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %V128 = shufflevector <8 x i16> %a128, <8 x i16> %b128, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -144,16 +226,58 @@ define void @test_vXi16(<8 x i16> %a128, <8 x i16> %b128, <16 x i16> %a256, <16
 }
 
 define void @test_vXi8(<16 x i8> %a128, <16 x i8> %b128, <32 x i8> %a256, <32 x i8> %b256, <64 x i8> %a512, <64 x i8> %b512) {
-; CHECK-LABEL: 'test_vXi8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V128 = shufflevector <16 x i8> %a128, <16 x i8> %b128, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V256 = shufflevector <32 x i8> %a256, <32 x i8> %b256, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V512 = shufflevector <64 x i8> %a512, <64 x i8> %b512, <64 x i32> <i32 0, i32 64, i32 2, i32 66, i32 4, i32 68, i32 6, i32 70, i32 8, i32 72, i32 10, i32 74, i32 12, i32 76, i32 14, i32 78, i32 16, i32 80, i32 18, i32 82, i32 20, i32 84, i32 22, i32 86, i32 24, i32 88, i32 26, i32 90, i32 28, i32 92, i32 30, i32 94, i32 32, i32 96, i32 34, i32 98, i32 36, i32 100, i32 38, i32 102, i32 40, i32 104, i32 42, i32 106, i32 44, i32 108, i32 46, i32 110, i32 48, i32 112, i32 50, i32 114, i32 52, i32 116, i32 54, i32 118, i32 56, i32 120, i32 58, i32 122, i32 60, i32 124, i32 62, i32 126>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE2-LABEL: 'test_vXi8'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V128 = shufflevector <16 x i8> %a128, <16 x i8> %b128, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V256 = shufflevector <32 x i8> %a256, <32 x i8> %b256, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 364 for instruction: %V512 = shufflevector <64 x i8> %a512, <64 x i8> %b512, <64 x i32> <i32 0, i32 64, i32 2, i32 66, i32 4, i32 68, i32 6, i32 70, i32 8, i32 72, i32 10, i32 74, i32 12, i32 76, i32 14, i32 78, i32 16, i32 80, i32 18, i32 82, i32 20, i32 84, i32 22, i32 86, i32 24, i32 88, i32 26, i32 90, i32 28, i32 92, i32 30, i32 94, i32 32, i32 96, i32 34, i32 98, i32 36, i32 100, i32 38, i32 102, i32 40, i32 104, i32 42, i32 106, i32 44, i32 108, i32 46, i32 110, i32 48, i32 112, i32 50, i32 114, i32 52, i32 116, i32 54, i32 118, i32 56, i32 120, i32 58, i32 122, i32 60, i32 124, i32 62, i32 126>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSSE3-LABEL: 'test_vXi8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %a128, <16 x i8> %b128, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V256 = shufflevector <32 x i8> %a256, <32 x i8> %b256, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V512 = shufflevector <64 x i8> %a512, <64 x i8> %b512, <64 x i32> <i32 0, i32 64, i32 2, i32 66, i32 4, i32 68, i32 6, i32 70, i32 8, i32 72, i32 10, i32 74, i32 12, i32 76, i32 14, i32 78, i32 16, i32 80, i32 18, i32 82, i32 20, i32 84, i32 22, i32 86, i32 24, i32 88, i32 26, i32 90, i32 28, i32 92, i32 30, i32 94, i32 32, i32 96, i32 34, i32 98, i32 36, i32 100, i32 38, i32 102, i32 40, i32 104, i32 42, i32 106, i32 44, i32 108, i32 46, i32 110, i32 48, i32 112, i32 50, i32 114, i32 52, i32 116, i32 54, i32 118, i32 56, i32 120, i32 58, i32 122, i32 60, i32 124, i32 62, i32 126>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'test_vXi8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %a128, <16 x i8> %b128, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V256 = shufflevector <32 x i8> %a256, <32 x i8> %b256, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V512 = shufflevector <64 x i8> %a512, <64 x i8> %b512, <64 x i32> <i32 0, i32 64, i32 2, i32 66, i32 4, i32 68, i32 6, i32 70, i32 8, i32 72, i32 10, i32 74, i32 12, i32 76, i32 14, i32 78, i32 16, i32 80, i32 18, i32 82, i32 20, i32 84, i32 22, i32 86, i32 24, i32 88, i32 26, i32 90, i32 28, i32 92, i32 30, i32 94, i32 32, i32 96, i32 34, i32 98, i32 36, i32 100, i32 38, i32 102, i32 40, i32 104, i32 42, i32 106, i32 44, i32 108, i32 46, i32 110, i32 48, i32 112, i32 50, i32 114, i32 52, i32 116, i32 54, i32 118, i32 56, i32 120, i32 58, i32 122, i32 60, i32 124, i32 62, i32 126>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'test_vXi8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %a128, <16 x i8> %b128, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V256 = shufflevector <32 x i8> %a256, <32 x i8> %b256, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V512 = shufflevector <64 x i8> %a512, <64 x i8> %b512, <64 x i32> <i32 0, i32 64, i32 2, i32 66, i32 4, i32 68, i32 6, i32 70, i32 8, i32 72, i32 10, i32 74, i32 12, i32 76, i32 14, i32 78, i32 16, i32 80, i32 18, i32 82, i32 20, i32 84, i32 22, i32 86, i32 24, i32 88, i32 26, i32 90, i32 28, i32 92, i32 30, i32 94, i32 32, i32 96, i32 34, i32 98, i32 36, i32 100, i32 38, i32 102, i32 40, i32 104, i32 42, i32 106, i32 44, i32 108, i32 46, i32 110, i32 48, i32 112, i32 50, i32 114, i32 52, i32 116, i32 54, i32 118, i32 56, i32 120, i32 58, i32 122, i32 60, i32 124, i32 62, i32 126>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'test_vXi8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %a128, <16 x i8> %b128, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V256 = shufflevector <32 x i8> %a256, <32 x i8> %b256, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V512 = shufflevector <64 x i8> %a512, <64 x i8> %b512, <64 x i32> <i32 0, i32 64, i32 2, i32 66, i32 4, i32 68, i32 6, i32 70, i32 8, i32 72, i32 10, i32 74, i32 12, i32 76, i32 14, i32 78, i32 16, i32 80, i32 18, i32 82, i32 20, i32 84, i32 22, i32 86, i32 24, i32 88, i32 26, i32 90, i32 28, i32 92, i32 30, i32 94, i32 32, i32 96, i32 34, i32 98, i32 36, i32 100, i32 38, i32 102, i32 40, i32 104, i32 42, i32 106, i32 44, i32 108, i32 46, i32 110, i32 48, i32 112, i32 50, i32 114, i32 52, i32 116, i32 54, i32 118, i32 56, i32 120, i32 58, i32 122, i32 60, i32 124, i32 62, i32 126>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512F-LABEL: 'test_vXi8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %a128, <16 x i8> %b128, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V256 = shufflevector <32 x i8> %a256, <32 x i8> %b256, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V512 = shufflevector <64 x i8> %a512, <64 x i8> %b512, <64 x i32> <i32 0, i32 64, i32 2, i32 66, i32 4, i32 68, i32 6, i32 70, i32 8, i32 72, i32 10, i32 74, i32 12, i32 76, i32 14, i32 78, i32 16, i32 80, i32 18, i32 82, i32 20, i32 84, i32 22, i32 86, i32 24, i32 88, i32 26, i32 90, i32 28, i32 92, i32 30, i32 94, i32 32, i32 96, i32 34, i32 98, i32 36, i32 100, i32 38, i32 102, i32 40, i32 104, i32 42, i32 106, i32 44, i32 108, i32 46, i32 110, i32 48, i32 112, i32 50, i32 114, i32 52, i32 116, i32 54, i32 118, i32 56, i32 120, i32 58, i32 122, i32 60, i32 124, i32 62, i32 126>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512BW-LABEL: 'test_vXi8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %a128, <16 x i8> %b128, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <32 x i8> %a256, <32 x i8> %b256, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V512 = shufflevector <64 x i8> %a512, <64 x i8> %b512, <64 x i32> <i32 0, i32 64, i32 2, i32 66, i32 4, i32 68, i32 6, i32 70, i32 8, i32 72, i32 10, i32 74, i32 12, i32 76, i32 14, i32 78, i32 16, i32 80, i32 18, i32 82, i32 20, i32 84, i32 22, i32 86, i32 24, i32 88, i32 26, i32 90, i32 28, i32 92, i32 30, i32 94, i32 32, i32 96, i32 34, i32 98, i32 36, i32 100, i32 38, i32 102, i32 40, i32 104, i32 42, i32 106, i32 44, i32 108, i32 46, i32 110, i32 48, i32 112, i32 50, i32 114, i32 52, i32 116, i32 54, i32 118, i32 56, i32 120, i32 58, i32 122, i32 60, i32 124, i32 62, i32 126>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512VBMI-LABEL: 'test_vXi8'
+; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %a128, <16 x i8> %b128, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %a256, <32 x i8> %b256, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %a512, <64 x i8> %b512, <64 x i32> <i32 0, i32 64, i32 2, i32 66, i32 4, i32 68, i32 6, i32 70, i32 8, i32 72, i32 10, i32 74, i32 12, i32 76, i32 14, i32 78, i32 16, i32 80, i32 18, i32 82, i32 20, i32 84, i32 22, i32 86, i32 24, i32 88, i32 26, i32 90, i32 28, i32 92, i32 30, i32 94, i32 32, i32 96, i32 34, i32 98, i32 36, i32 100, i32 38, i32 102, i32 40, i32 104, i32 42, i32 106, i32 44, i32 108, i32 46, i32 110, i32 48, i32 112, i32 50, i32 114, i32 52, i32 116, i32 54, i32 118, i32 56, i32 120, i32 58, i32 122, i32 60, i32 124, i32 62, i32 126>
+; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; BTVER2-LABEL: 'test_vXi8'
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V128 = shufflevector <16 x i8> %a128, <16 x i8> %b128, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V256 = shufflevector <32 x i8> %a256, <32 x i8> %b256, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V512 = shufflevector <64 x i8> %a512, <64 x i8> %b512, <64 x i32> <i32 0, i32 64, i32 2, i32 66, i32 4, i32 68, i32 6, i32 70, i32 8, i32 72, i32 10, i32 74, i32 12, i32 76, i32 14, i32 78, i32 16, i32 80, i32 18, i32 82, i32 20, i32 84, i32 22, i32 86, i32 24, i32 88, i32 26, i32 90, i32 28, i32 92, i32 30, i32 94, i32 32, i32 96, i32 34, i32 98, i32 36, i32 100, i32 38, i32 102, i32 40, i32 104, i32 42, i32 106, i32 44, i32 108, i32 46, i32 110, i32 48, i32 112, i32 50, i32 114, i32 52, i32 116, i32 54, i32 118, i32 56, i32 120, i32 58, i32 122, i32 60, i32 124, i32 62, i32 126>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %a128, <16 x i8> %b128, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V256 = shufflevector <32 x i8> %a256, <32 x i8> %b256, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V512 = shufflevector <64 x i8> %a512, <64 x i8> %b512, <64 x i32> <i32 0, i32 64, i32 2, i32 66, i32 4, i32 68, i32 6, i32 70, i32 8, i32 72, i32 10, i32 74, i32 12, i32 76, i32 14, i32 78, i32 16, i32 80, i32 18, i32 82, i32 20, i32 84, i32 22, i32 86, i32 24, i32 88, i32 26, i32 90, i32 28, i32 92, i32 30, i32 94, i32 32, i32 96, i32 34, i32 98, i32 36, i32 100, i32 38, i32 102, i32 40, i32 104, i32 42, i32 106, i32 44, i32 108, i32 46, i32 110, i32 48, i32 112, i32 50, i32 114, i32 52, i32 116, i32 54, i32 118, i32 56, i32 120, i32 58, i32 122, i32 60, i32 124, i32 62, i32 126>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %V128 = shufflevector <16 x i8> %a128, <16 x i8> %b128, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
-- 
GitLab


From c04985c6062bf9365de28be5f6e20f24b55fedaf Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 23 Oct 2018 16:54:28 +0000
Subject: [PATCH 0454/1116] [InstCombine] use 'match' to simplify code

There's probably some vector-with-undef-element pattern
that shows an improvement, so this is probably not quite
'NFC'.

This is the last step towards removing the fake binop
queries for not/neg. Ie, there are no more uses of those
functions in trunk. Fneg should follow.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345050 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/InstCombine/InstCombineInternal.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h
index 3a18744e434..a4d7fe8861b 100644
--- a/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -141,7 +141,7 @@ static inline Constant *SubOne(Constant *C) {
 /// uses of V and only keep uses of ~V.
 static inline bool IsFreeToInvert(Value *V, bool WillInvertAllUses) {
   // ~(~(X)) -> X.
-  if (BinaryOperator::isNot(V))
+  if (match(V, m_Not(m_Value())))
     return true;
 
   // Constants can be considered to be not'ed values.
-- 
GitLab


From 6e614234d5d208c537a379cadb4fa02b468e339f Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 23 Oct 2018 17:06:03 +0000
Subject: [PATCH 0455/1116] [IR] remove fake binop queries for not/neg

The initial motivation is that we want to remove the
fneg API because that would silently fail if we add
an actual fneg instruction to IR. The same would be
true for the integer ops, so we might as well get rid
of these too.

We have a newer 'match' API that makes checking for
these patterns simpler. It also works with vectors
that may include undef elements in constants.

If any out-of-tree users need updating, they can model
their code changes on these commits:
rL345050
rL345043
rL345042
rL345041
rL345036
rL345030


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345052 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/IR/InstrTypes.h | 13 ++----------
 lib/IR/Instructions.cpp      | 39 ------------------------------------
 2 files changed, 2 insertions(+), 50 deletions(-)

diff --git a/include/llvm/IR/InstrTypes.h b/include/llvm/IR/InstrTypes.h
index e42bfc3afce..4487768e6c6 100644
--- a/include/llvm/IR/InstrTypes.h
+++ b/include/llvm/IR/InstrTypes.h
@@ -308,21 +308,12 @@ public:
   static BinaryOperator *CreateNot(Value *Op, const Twine &Name,
                                    BasicBlock *InsertAtEnd);
 
-  /// Check if the given Value is a NEG, FNeg, or NOT instruction.
-  ///
-  static bool isNeg(const Value *V);
+  /// Check if the given Value is an FNeg instruction.
   static bool isFNeg(const Value *V, bool IgnoreZeroSign=false);
-  static bool isNot(const Value *V);
 
-  /// Helper functions to extract the unary argument of a NEG, FNEG or NOT
-  /// operation implemented via Sub, FSub, or Xor.
-  ///
-  static const Value *getNegArgument(const Value *BinOp);
-  static       Value *getNegArgument(      Value *BinOp);
+  /// Helper functions to extract the unary argument of an FNeg.
   static const Value *getFNegArgument(const Value *BinOp);
   static       Value *getFNegArgument(      Value *BinOp);
-  static const Value *getNotArgument(const Value *BinOp);
-  static       Value *getNotArgument(      Value *BinOp);
 
   BinaryOps getOpcode() const {
     return static_cast<BinaryOps>(Instruction::getOpcode());
diff --git a/lib/IR/Instructions.cpp b/lib/IR/Instructions.cpp
index e1d1c0f2a6b..2a188b77679 100644
--- a/lib/IR/Instructions.cpp
+++ b/lib/IR/Instructions.cpp
@@ -2116,14 +2116,6 @@ static inline bool isConstantAllOnes(const Value *V) {
   return false;
 }
 
-bool BinaryOperator::isNeg(const Value *V) {
-  if (const BinaryOperator *Bop = dyn_cast<BinaryOperator>(V))
-    if (Bop->getOpcode() == Instruction::Sub)
-      if (Constant *C = dyn_cast<Constant>(Bop->getOperand(0)))
-        return C->isNegativeZeroValue();
-  return false;
-}
-
 bool BinaryOperator::isFNeg(const Value *V, bool IgnoreZeroSign) {
   if (const BinaryOperator *Bop = dyn_cast<BinaryOperator>(V))
     if (Bop->getOpcode() == Instruction::FSub)
@@ -2135,22 +2127,6 @@ bool BinaryOperator::isFNeg(const Value *V, bool IgnoreZeroSign) {
   return false;
 }
 
-bool BinaryOperator::isNot(const Value *V) {
-  if (const BinaryOperator *Bop = dyn_cast<BinaryOperator>(V))
-    return (Bop->getOpcode() == Instruction::Xor &&
-            (isConstantAllOnes(Bop->getOperand(1)) ||
-             isConstantAllOnes(Bop->getOperand(0))));
-  return false;
-}
-
-Value *BinaryOperator::getNegArgument(Value *BinOp) {
-  return cast<BinaryOperator>(BinOp)->getOperand(1);
-}
-
-const Value *BinaryOperator::getNegArgument(const Value *BinOp) {
-  return getNegArgument(const_cast<Value*>(BinOp));
-}
-
 Value *BinaryOperator::getFNegArgument(Value *BinOp) {
   return cast<BinaryOperator>(BinOp)->getOperand(1);
 }
@@ -2159,21 +2135,6 @@ const Value *BinaryOperator::getFNegArgument(const Value *BinOp) {
   return getFNegArgument(const_cast<Value*>(BinOp));
 }
 
-Value *BinaryOperator::getNotArgument(Value *BinOp) {
-  assert(isNot(BinOp) && "getNotArgument on non-'not' instruction!");
-  BinaryOperator *BO = cast<BinaryOperator>(BinOp);
-  Value *Op0 = BO->getOperand(0);
-  Value *Op1 = BO->getOperand(1);
-  if (isConstantAllOnes(Op0)) return Op1;
-
-  assert(isConstantAllOnes(Op1));
-  return Op0;
-}
-
-const Value *BinaryOperator::getNotArgument(const Value *BinOp) {
-  return getNotArgument(const_cast<Value*>(BinOp));
-}
-
 // Exchange the two operands to this instruction. This instruction is safe to
 // use on any binary instruction and does not modify the semantics of the
 // instruction. If the instruction is order-dependent (SetLT f.e.), the opcode
-- 
GitLab


From c0de197df0d1b5178d40fc730466325475a4aca7 Mon Sep 17 00:00:00 2001
From: Stefan Pintilie <stefanp@ca.ibm.com>
Date: Tue, 23 Oct 2018 17:11:36 +0000
Subject: [PATCH 0456/1116] [Power9] Add __float128 support in the backend for
 bitcast to a i128

Add support to allow bit-casting from f128 to i128 and then
extracting 64 bits from the result.

Differential Revision: https://reviews.llvm.org/D49507

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345053 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/PowerPC/PPCISelLowering.cpp | 58 ++++++++++++++++++++++++++
 lib/Target/PowerPC/PPCISelLowering.h   |  1 +
 lib/Target/PowerPC/PPCInstrVSX.td      |  9 ++++
 test/CodeGen/PowerPC/f128-bitcast.ll   | 53 +++++++++++++++++++++++
 4 files changed, 121 insertions(+)
 create mode 100644 test/CodeGen/PowerPC/f128-bitcast.ll

diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 1fd5018d05c..ca60f318278 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1070,6 +1070,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
   setTargetDAGCombine(ISD::ZERO_EXTEND);
   setTargetDAGCombine(ISD::ANY_EXTEND);
 
+  setTargetDAGCombine(ISD::TRUNCATE);
+
   if (Subtarget.useCRBits()) {
     setTargetDAGCombine(ISD::TRUNCATE);
     setTargetDAGCombine(ISD::SETCC);
@@ -9634,6 +9636,9 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
       return;
     Results.push_back(LowerFP_TO_INT(SDValue(N, 0), DAG, dl));
     return;
+  case ISD::BITCAST:
+    // Don't handle bitcast here.
+    return;
   }
 }
 
@@ -12479,6 +12484,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::ANY_EXTEND:
     return DAGCombineExtBoolTrunc(N, DCI);
   case ISD::TRUNCATE:
+    return combineTRUNCATE(N, DCI);
   case ISD::SETCC:
   case ISD::SELECT_CC:
     return DAGCombineTruncBoolExt(N, DCI);
@@ -14253,6 +14259,58 @@ SDValue PPCTargetLowering::combineADD(SDNode *N, DAGCombinerInfo &DCI) const {
   return SDValue();
 }
 
+// Detect TRUNCATE operations on bitcasts of float128 values.
+// What we are looking for here is the situtation where we extract a subset
+// of bits from a 128 bit float.
+// This can be of two forms:
+// 1) BITCAST of f128 feeding TRUNCATE
+// 2) BITCAST of f128 feeding SRL (a shift) feeding TRUNCATE
+// The reason this is required is because we do not have a legal i128 type
+// and so we want to prevent having to store the f128 and then reload part
+// of it.
+SDValue PPCTargetLowering::combineTRUNCATE(SDNode *N,
+                                           DAGCombinerInfo &DCI) const {
+  // If we are using CRBits then try that first.
+  if (Subtarget.useCRBits()) {
+    // Check if CRBits did anything and return that if it did.
+    if (SDValue CRTruncValue = DAGCombineTruncBoolExt(N, DCI))
+      return CRTruncValue;
+  }
+
+  SDLoc dl(N);
+  SDValue Op0 = N->getOperand(0);
+
+  // Looking for a truncate of i128 to i64.
+  if (Op0.getValueType() != MVT::i128 || N->getValueType(0) != MVT::i64)
+    return SDValue();
+
+  int EltToExtract = DCI.DAG.getDataLayout().isBigEndian() ? 1 : 0;
+
+  // SRL feeding TRUNCATE.
+  if (Op0.getOpcode() == ISD::SRL) {
+    ConstantSDNode *ConstNode = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
+    // The right shift has to be by 64 bits.
+    if (!ConstNode || ConstNode->getZExtValue() != 64)
+      return SDValue();
+
+    // Switch the element number to extract.
+    EltToExtract = EltToExtract ? 0 : 1;
+    // Update Op0 past the SRL.
+    Op0 = Op0.getOperand(0);
+  }
+
+  // BITCAST feeding a TRUNCATE possibly via SRL.
+  if (Op0.getOpcode() == ISD::BITCAST &&
+      Op0.getValueType() == MVT::i128 &&
+      Op0.getOperand(0).getValueType() == MVT::f128) {
+    SDValue Bitcast = DCI.DAG.getBitcast(MVT::v2i64, Op0.getOperand(0));
+    return DCI.DAG.getNode(
+        ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Bitcast,
+        DCI.DAG.getTargetConstant(EltToExtract, dl, MVT::i32));
+  }
+  return SDValue();
+}
+
 bool PPCTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
   // Only duplicate to increase tail-calls for the 64bit SysV ABIs.
   if (!Subtarget.isSVR4ABI() || !Subtarget.isPPC64())
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index 9709d6bb09e..959831cb1c0 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -1093,6 +1093,7 @@ namespace llvm {
     SDValue combineSRA(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue combineSRL(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue combineADD(SDNode *N, DAGCombinerInfo &DCI) const;
+    SDValue combineTRUNCATE(SDNode *N, DAGCombinerInfo &DCI) const;
 
     /// ConvertSETCCToSubtract - looks at SETCC that compares ints. It replaces
     /// SETCC with integer subtraction when (1) there is a legal way of doing it
diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td
index b1cfbc7b664..7a3141abc1b 100644
--- a/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/lib/Target/PowerPC/PPCInstrVSX.td
@@ -1040,6 +1040,15 @@ def : Pat<(v2f64 (bitconvert v1i128:$A)),
 def : Pat<(v1i128 (bitconvert v2f64:$A)),
           (COPY_TO_REGCLASS $A, VRRC)>;
 
+def : Pat<(v2i64 (bitconvert f128:$A)),
+          (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v4i32 (bitconvert f128:$A)),
+          (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v8i16 (bitconvert f128:$A)),
+          (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v16i8 (bitconvert f128:$A)),
+          (COPY_TO_REGCLASS $A, VRRC)>;
+
 def : Pat<(v2f64 (PPCsvec2fp v4i32:$C, 0)),
           (v2f64 (XVCVSXWDP (v2i64 (XXMRGHW $C, $C))))>;
 def : Pat<(v2f64 (PPCsvec2fp v4i32:$C, 1)),
diff --git a/test/CodeGen/PowerPC/f128-bitcast.ll b/test/CodeGen/PowerPC/f128-bitcast.ll
new file mode 100644
index 00000000000..68069e542ff
--- /dev/null
+++ b/test/CodeGen/PowerPC/f128-bitcast.ll
@@ -0,0 +1,53 @@
+; RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-unknown-unknown \
+; RUN:   -enable-ppc-quad-precision -verify-machineinstrs \
+; RUN:   -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | FileCheck %s
+; RUN: llc -mcpu=pwr9 -mtriple=powerpc64-unknown-unknown \
+; RUN:   -enable-ppc-quad-precision -verify-machineinstrs \
+; RUN:   -ppc-asm-full-reg-names \
+; RUN:   -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE
+
+; Function Attrs: norecurse nounwind readnone
+define i64 @getPart1(fp128 %in) local_unnamed_addr {
+entry:
+  %0 = bitcast fp128 %in to i128
+  %a.sroa.0.0.extract.trunc = trunc i128 %0 to i64
+  ret i64 %a.sroa.0.0.extract.trunc
+; CHECK-LABEL: getPart1
+; CHECK:       mfvsrld r3, v2
+; CHECK-NEXT:  blr
+; CHECK-BE-LABEL: getPart1
+; CHECK-BE:       mfvsrld r3, v2
+; CHECK-BE-NEXT:  blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define i64 @getPart2(fp128 %in) local_unnamed_addr {
+entry:
+  %0 = bitcast fp128 %in to i128
+  %a.sroa.0.8.extract.shift = lshr i128 %0, 64
+  %a.sroa.0.8.extract.trunc = trunc i128 %a.sroa.0.8.extract.shift to i64
+  ret i64 %a.sroa.0.8.extract.trunc
+; CHECK-LABEL: getPart2
+; CHECK:       mfvsrd r3, v2
+; CHECK-NEXT:  blr
+; CHECK-BE-LABEL: getPart2
+; CHECK-BE:       mfvsrd r3, v2
+; CHECK-BE-NEXT:  blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define i64 @checkBitcast(fp128 %in, <2 x i64> %in2, <2 x i64> *%out) local_unnamed_addr {
+entry:
+  %0 = bitcast fp128 %in to <2 x i64>
+  %1 = extractelement <2 x i64> %0, i64 0
+  %2 = add <2 x i64> %0, %in2
+  store <2 x i64> %2, <2 x i64> *%out, align 16
+  ret i64 %1
+; CHECK-LABEL: checkBitcast
+; CHECK:       mfvsrld r3, v2
+; CHECK:       blr
+; CHECK-BE-LABEL: checkBitcast
+; CHECK-BE:       mfvsrd r3, v2
+; CHECK-BE:       blr
+}
+
-- 
GitLab


From b63f24a552f78c1615193eb68b5fdb91f2ab8898 Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Tue, 23 Oct 2018 17:20:16 +0000
Subject: [PATCH 0457/1116] [PDB] Fix -Wunused-private-field in DIA

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345054 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/DebugInfo/PDB/DIA/DIAEnumFrameData.h | 6 +-----
 include/llvm/DebugInfo/PDB/DIA/DIAFrameData.h     | 4 +---
 lib/DebugInfo/PDB/DIA/DIAEnumFrameData.cpp        | 9 ++++-----
 lib/DebugInfo/PDB/DIA/DIAFrameData.cpp            | 5 ++---
 lib/DebugInfo/PDB/DIA/DIASession.cpp              | 2 +-
 5 files changed, 9 insertions(+), 17 deletions(-)

diff --git a/include/llvm/DebugInfo/PDB/DIA/DIAEnumFrameData.h b/include/llvm/DebugInfo/PDB/DIA/DIAEnumFrameData.h
index e17ba2ce59b..f3b02f07e64 100644
--- a/include/llvm/DebugInfo/PDB/DIA/DIAEnumFrameData.h
+++ b/include/llvm/DebugInfo/PDB/DIA/DIAEnumFrameData.h
@@ -17,12 +17,9 @@
 namespace llvm {
 namespace pdb {
 
-class DIASession;
-
 class DIAEnumFrameData : public IPDBEnumChildren<IPDBFrameData> {
 public:
-  explicit DIAEnumFrameData(const DIASession &PDBSession,
-                            CComPtr<IDiaEnumFrameData> DiaEnumerator);
+  explicit DIAEnumFrameData(CComPtr<IDiaEnumFrameData> DiaEnumerator);
 
   uint32_t getChildCount() const override;
   ChildTypePtr getChildAtIndex(uint32_t Index) const override;
@@ -30,7 +27,6 @@ public:
   void reset() override;
 
 private:
-  const DIASession &Session;
   CComPtr<IDiaEnumFrameData> Enumerator;
 };
 
diff --git a/include/llvm/DebugInfo/PDB/DIA/DIAFrameData.h b/include/llvm/DebugInfo/PDB/DIA/DIAFrameData.h
index 7564c3b7a5a..0ce6cfc9303 100644
--- a/include/llvm/DebugInfo/PDB/DIA/DIAFrameData.h
+++ b/include/llvm/DebugInfo/PDB/DIA/DIAFrameData.h
@@ -20,8 +20,7 @@ class DIASession;
 
 class DIAFrameData : public IPDBFrameData {
 public:
-  explicit DIAFrameData(const DIASession &PDBSession,
-                        CComPtr<IDiaFrameData> DiaFrameData);
+  explicit DIAFrameData(CComPtr<IDiaFrameData> DiaFrameData);
 
   uint32_t getAddressOffset() const override;
   uint32_t getAddressSection() const override;
@@ -31,7 +30,6 @@ public:
   uint64_t getVirtualAddress() const override;
 
 private:
-  const DIASession &Session;
   CComPtr<IDiaFrameData> FrameData;
 };
 
diff --git a/lib/DebugInfo/PDB/DIA/DIAEnumFrameData.cpp b/lib/DebugInfo/PDB/DIA/DIAEnumFrameData.cpp
index 77514483e04..f873f3525df 100644
--- a/lib/DebugInfo/PDB/DIA/DIAEnumFrameData.cpp
+++ b/lib/DebugInfo/PDB/DIA/DIAEnumFrameData.cpp
@@ -13,9 +13,8 @@
 
 using namespace llvm::pdb;
 
-DIAEnumFrameData::DIAEnumFrameData(const DIASession &PDBSession,
-                                   CComPtr<IDiaEnumFrameData> DiaEnumerator)
-    : Session(PDBSession), Enumerator(DiaEnumerator) {}
+DIAEnumFrameData::DIAEnumFrameData(CComPtr<IDiaEnumFrameData> DiaEnumerator)
+    : Enumerator(DiaEnumerator) {}
 
 uint32_t DIAEnumFrameData::getChildCount() const {
   LONG Count = 0;
@@ -28,7 +27,7 @@ DIAEnumFrameData::getChildAtIndex(uint32_t Index) const {
   if (S_OK != Enumerator->Item(Index, &Item))
     return nullptr;
 
-  return std::unique_ptr<IPDBFrameData>(new DIAFrameData(Session, Item));
+  return std::unique_ptr<IPDBFrameData>(new DIAFrameData(Item));
 }
 
 std::unique_ptr<IPDBFrameData> DIAEnumFrameData::getNext() {
@@ -37,7 +36,7 @@ std::unique_ptr<IPDBFrameData> DIAEnumFrameData::getNext() {
   if (S_OK != Enumerator->Next(1, &Item, &NumFetched))
     return nullptr;
 
-  return std::unique_ptr<IPDBFrameData>(new DIAFrameData(Session, Item));
+  return std::unique_ptr<IPDBFrameData>(new DIAFrameData(Item));
 }
 
 void DIAEnumFrameData::reset() { Enumerator->Reset(); }
diff --git a/lib/DebugInfo/PDB/DIA/DIAFrameData.cpp b/lib/DebugInfo/PDB/DIA/DIAFrameData.cpp
index b904a2ff60a..533cce7923c 100644
--- a/lib/DebugInfo/PDB/DIA/DIAFrameData.cpp
+++ b/lib/DebugInfo/PDB/DIA/DIAFrameData.cpp
@@ -13,9 +13,8 @@
 
 using namespace llvm::pdb;
 
-DIAFrameData::DIAFrameData(const DIASession &PDBSession,
-                           CComPtr<IDiaFrameData> DiaFrameData)
-    : Session(PDBSession), FrameData(DiaFrameData) {}
+DIAFrameData::DIAFrameData(CComPtr<IDiaFrameData> DiaFrameData)
+    : FrameData(DiaFrameData) {}
 
 template <typename ArgType>
 ArgType
diff --git a/lib/DebugInfo/PDB/DIA/DIASession.cpp b/lib/DebugInfo/PDB/DIA/DIASession.cpp
index b89ca9a858f..bd375e172ac 100644
--- a/lib/DebugInfo/PDB/DIA/DIASession.cpp
+++ b/lib/DebugInfo/PDB/DIA/DIASession.cpp
@@ -428,5 +428,5 @@ DIASession::getFrameData() const {
   if (!FD)
     return nullptr;
 
-  return llvm::make_unique<DIAEnumFrameData>(*this, FD);
+  return llvm::make_unique<DIAEnumFrameData>(FD);
 }
-- 
GitLab


From ca762c62ed4fd12d2b8319a4647841f85b79657b Mon Sep 17 00:00:00 2001
From: Daniel Sanders <daniel_l_sanders@apple.com>
Date: Tue, 23 Oct 2018 17:23:31 +0000
Subject: [PATCH 0458/1116] [tblgen] Allow FixedLenDecoderEmitter to use
 APInt-like objects as InsnType

Summary:
Some targets have very long encodings and uint64_t isn't sufficient. uint128_t
isn't portable so such targets need to use an object instead.

There is one catch with this at the moment, no string of bits extracted
from the encoding may exceeed 64-bits. Fields are still permitted to
exceed 64-bits so long as they aren't one contiguous string of bits. If
this proves to be a problem then we can modify the generation of
fieldFromInstruction() calls to account for it but for now I've added an
assertion for this.

InsnType must either be integral or an APInt-like object that must:
* Have a static const max_size_in_bits equal to the number of bits in the encoding.
* be default-constructible and copy-constructible
* be constructible from a uint64_t (this is the key area the interface deviates
  from APInt since this constructor does not take the bit width)
* be constructible from an APInt (this can be private)
* be convertible to uint64_t
* Support the ~, &,, ==, !=, and |= operators with other objects of the same type
* Support shift (<<, >>) with signed and unsigned integers on the RHS
* Support put (<<) to raw_ostream&

Reviewers: bogner, charukcs

Subscribers: nhaehnle, llvm-commits

Differential Revision: https://reviews.llvm.org/D52100

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345056 91177308-0d34-0410-b5e6-96231b3b80d8
---
 utils/TableGen/FixedLenDecoderEmitter.cpp | 60 ++++++++++++++++++-----
 1 file changed, 49 insertions(+), 11 deletions(-)

diff --git a/utils/TableGen/FixedLenDecoderEmitter.cpp b/utils/TableGen/FixedLenDecoderEmitter.cpp
index 76ba1c00109..361bad38302 100644
--- a/utils/TableGen/FixedLenDecoderEmitter.cpp
+++ b/utils/TableGen/FixedLenDecoderEmitter.cpp
@@ -2067,21 +2067,59 @@ static bool populateInstruction(CodeGenTarget &Target,
 // using the VS compiler. It has a bug which causes the function
 // to be optimized out in some circustances. See llvm.org/pr38292
 static void emitFieldFromInstruction(formatted_raw_ostream &OS) {
-  OS << "// Helper function for extracting fields from encoded instructions.\n"
-     << "template<typename InsnType>\n"
+  OS << "// Helper functions for extracting fields from encoded instructions.\n"
+     << "// InsnType must either be integral or an APInt-like object that "
+        "must:\n"
+     << "// * Have a static const max_size_in_bits equal to the number of bits "
+        "in the\n"
+     << "//   encoding.\n"
+     << "// * be default-constructible and copy-constructible\n"
+     << "// * be constructible from a uint64_t\n"
+     << "// * be constructible from an APInt (this can be private)\n"
+     << "// * Support getBitsSet(loBit, hiBit)\n"
+     << "// * be convertible to uint64_t\n"
+     << "// * Support the ~, &, ==, !=, and |= operators with other objects of "
+        "the same type\n"
+     << "// * Support shift (<<, >>) with signed and unsigned integers on the "
+        "RHS\n"
+     << "// * Support put (<<) to raw_ostream&\n"
      << "#if defined(_MSC_VER) && !defined(__clang__)\n"
      << "__declspec(noinline)\n"
      << "#endif\n"
-     << "static InsnType fieldFromInstruction(InsnType insn, unsigned startBit,\n"
+     << "template<typename InsnType>\n"
+     << "static InsnType fieldFromInstruction(InsnType insn, unsigned "
+        "startBit,\n"
+     << "                                     unsigned numBits, "
+        "std::true_type) {\n"
+     << "  assert(startBit + numBits <= 64 && \"Cannot support >64-bit "
+        "extractions!\");\n"
+     << "  assert(startBit + numBits <= (sizeof(InsnType) * 8) &&\n"
+     << "         \"Instruction field out of bounds!\");\n"
+     << "  InsnType fieldMask;\n"
+     << "  if (numBits == sizeof(InsnType) * 8)\n"
+     << "    fieldMask = (InsnType)(-1LL);\n"
+     << "  else\n"
+     << "    fieldMask = (((InsnType)1 << numBits) - 1) << startBit;\n"
+     << "  return (insn & fieldMask) >> startBit;\n"
+     << "}\n"
+     << "\n"
+     << "template<typename InsnType>\n"
+     << "static InsnType fieldFromInstruction(InsnType insn, unsigned "
+        "startBit,\n"
+     << "                                     unsigned numBits, "
+        "std::false_type) {\n"
+     << "  assert(startBit + numBits <= InsnType::max_size_in_bits && "
+        "\"Instruction field out of bounds!\");\n"
+     << "  InsnType fieldMask = InsnType::getBitsSet(0, numBits);\n"
+     << "  return (insn >> startBit) & fieldMask;\n"
+     << "}\n"
+     << "\n"
+     << "template<typename InsnType>\n"
+     << "static InsnType fieldFromInstruction(InsnType insn, unsigned "
+        "startBit,\n"
      << "                                     unsigned numBits) {\n"
-     << "    assert(startBit + numBits <= (sizeof(InsnType)*8) &&\n"
-     << "           \"Instruction field out of bounds!\");\n"
-     << "    InsnType fieldMask;\n"
-     << "    if (numBits == sizeof(InsnType)*8)\n"
-     << "      fieldMask = (InsnType)(-1LL);\n"
-     << "    else\n"
-     << "      fieldMask = (((InsnType)1 << numBits) - 1) << startBit;\n"
-     << "    return (insn & fieldMask) >> startBit;\n"
+     << "  return fieldFromInstruction(insn, startBit, numBits, "
+        "std::is_integral<InsnType>());\n"
      << "}\n\n";
 }
 
-- 
GitLab


From 68de396c344b5f158606532da61fd8157925340c Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Tue, 23 Oct 2018 17:24:15 +0000
Subject: [PATCH 0459/1116] [IR] Fix -Wunused-function after r345052

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345057 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/IR/Instructions.cpp | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/lib/IR/Instructions.cpp b/lib/IR/Instructions.cpp
index 2a188b77679..d92706500bc 100644
--- a/lib/IR/Instructions.cpp
+++ b/lib/IR/Instructions.cpp
@@ -2109,13 +2109,6 @@ BinaryOperator *BinaryOperator::CreateNot(Value *Op, const Twine &Name,
                             Op->getType(), Name, InsertAtEnd);
 }
 
-// isConstantAllOnes - Helper function for several functions below
-static inline bool isConstantAllOnes(const Value *V) {
-  if (const Constant *C = dyn_cast<Constant>(V))
-    return C->isAllOnesValue();
-  return false;
-}
-
 bool BinaryOperator::isFNeg(const Value *V, bool IgnoreZeroSign) {
   if (const BinaryOperator *Bop = dyn_cast<BinaryOperator>(V))
     if (Bop->getOpcode() == Instruction::FSub)
-- 
GitLab


From 5b2930967ecfb3d62bd6aadc8940ba79b576566c Mon Sep 17 00:00:00 2001
From: Daniel Sanders <daniel_l_sanders@apple.com>
Date: Tue, 23 Oct 2018 17:41:39 +0000
Subject: [PATCH 0460/1116] Fix MSVC build by correcting placement of declspec
 after r345056

Going by the MSVC toolchains at godbolt.org, declspec comes after the template<...>.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345059 91177308-0d34-0410-b5e6-96231b3b80d8
---
 utils/TableGen/FixedLenDecoderEmitter.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/TableGen/FixedLenDecoderEmitter.cpp b/utils/TableGen/FixedLenDecoderEmitter.cpp
index 361bad38302..44cf6eadcb0 100644
--- a/utils/TableGen/FixedLenDecoderEmitter.cpp
+++ b/utils/TableGen/FixedLenDecoderEmitter.cpp
@@ -2083,10 +2083,10 @@ static void emitFieldFromInstruction(formatted_raw_ostream &OS) {
      << "// * Support shift (<<, >>) with signed and unsigned integers on the "
         "RHS\n"
      << "// * Support put (<<) to raw_ostream&\n"
+     << "template<typename InsnType>\n"
      << "#if defined(_MSC_VER) && !defined(__clang__)\n"
      << "__declspec(noinline)\n"
      << "#endif\n"
-     << "template<typename InsnType>\n"
      << "static InsnType fieldFromInstruction(InsnType insn, unsigned "
         "startBit,\n"
      << "                                     unsigned numBits, "
-- 
GitLab


From 79b7ee934428cb225f0bd04908488402b95f5e87 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 23 Oct 2018 17:48:30 +0000
Subject: [PATCH 0461/1116] [LegalizeDAG] Share Vector/Scalar CTLZ Expansion

As suggested on D53258, this patch shares common CTLZ expansion code between VectorLegalizer and SelectionDAGLegalize by putting it in TargetLowering.

Extension to D53474

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345060 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/TargetLowering.h         |  7 +++
 lib/CodeGen/SelectionDAG/LegalizeDAG.cpp      | 37 ++----------
 .../SelectionDAG/LegalizeVectorOps.cpp        | 21 ++-----
 lib/CodeGen/SelectionDAG/TargetLowering.cpp   | 53 +++++++++++++++++
 test/CodeGen/X86/vec_ctbits.ll                | 58 +++++++++----------
 5 files changed, 98 insertions(+), 78 deletions(-)

diff --git a/include/llvm/CodeGen/TargetLowering.h b/include/llvm/CodeGen/TargetLowering.h
index bcfe0fd6e74..8545da55f78 100644
--- a/include/llvm/CodeGen/TargetLowering.h
+++ b/include/llvm/CodeGen/TargetLowering.h
@@ -3647,6 +3647,13 @@ public:
   /// Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
   SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const;
 
+  /// Expand CTLZ/CTLZ_ZERO_UNDEF nodes. Expands vector/scalar CTLZ nodes,
+  /// vector nodes can only succeed if all operations are legal/custom.
+  /// \param N Node to expand
+  /// \param Result output after conversion
+  /// \returns True, if the expansion was successful, false otherwise
+  bool expandCTLZ(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;
+
   /// Expand CTTZ/CTTZ_ZERO_UNDEF nodes. Expands vector/scalar CTTZ nodes,
   /// vector nodes can only succeed if all operations are legal/custom.
   /// \param N Node to expand
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index e03263a9948..c8d843e54c3 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2759,36 +2759,6 @@ SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op,
 
     return Op;
   }
-  case ISD::CTLZ_ZERO_UNDEF:
-    // This trivially expands to CTLZ.
-    return DAG.getNode(ISD::CTLZ, dl, VT, Op);
-  case ISD::CTLZ: {
-    if (TLI.isOperationLegalOrCustom(ISD::CTLZ_ZERO_UNDEF, VT)) {
-      EVT SetCCVT = getSetCCResultType(VT);
-      SDValue CTLZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, dl, VT, Op);
-      SDValue Zero = DAG.getConstant(0, dl, VT);
-      SDValue SrcIsZero = DAG.getSetCC(dl, SetCCVT, Op, Zero, ISD::SETEQ);
-      return DAG.getNode(ISD::SELECT, dl, VT, SrcIsZero,
-                         DAG.getConstant(Len, dl, VT), CTLZ);
-    }
-
-    // for now, we do this:
-    // x = x | (x >> 1);
-    // x = x | (x >> 2);
-    // ...
-    // x = x | (x >>16);
-    // x = x | (x >>32); // for 64-bit input
-    // return popcount(~x);
-    //
-    // Ref: "Hacker's Delight" by Henry Warren
-    for (unsigned i = 0; (1U << i) <= (Len / 2); ++i) {
-      SDValue Tmp3 = DAG.getConstant(1ULL << i, dl, ShVT);
-      Op = DAG.getNode(ISD::OR, dl, VT, Op,
-                       DAG.getNode(ISD::SRL, dl, VT, Op, Tmp3));
-    }
-    Op = DAG.getNOT(dl, Op, VT);
-    return DAG.getNode(ISD::CTPOP, dl, VT, Op);
-  }
   }
 }
 
@@ -2800,11 +2770,14 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
   bool NeedInvert;
   switch (Node->getOpcode()) {
   case ISD::CTPOP:
-  case ISD::CTLZ:
-  case ISD::CTLZ_ZERO_UNDEF:
     Tmp1 = ExpandBitCount(Node->getOpcode(), Node->getOperand(0), dl);
     Results.push_back(Tmp1);
     break;
+  case ISD::CTLZ:
+  case ISD::CTLZ_ZERO_UNDEF:
+    if (TLI.expandCTLZ(Node, Tmp1, DAG))
+      Results.push_back(Tmp1);
+    break;
   case ISD::CTTZ:
   case ISD::CTTZ_ZERO_UNDEF:
     if (TLI.expandCTTZ(Node, Tmp1, DAG))
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 9f18920a8a1..fdb74fef121 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -1081,23 +1081,10 @@ SDValue VectorLegalizer::ExpandFSUB(SDValue Op) {
 }
 
 SDValue VectorLegalizer::ExpandCTLZ(SDValue Op) {
-  EVT VT = Op.getValueType();
-  unsigned NumBitsPerElt = VT.getScalarSizeInBits();
-
-  // If the non-ZERO_UNDEF version is supported we can use that instead.
-  if (Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF &&
-      TLI.isOperationLegalOrCustom(ISD::CTLZ, VT)) {
-    SDLoc DL(Op);
-    return DAG.getNode(ISD::CTLZ, DL, VT, Op.getOperand(0));
-  }
-
-  // If we have the appropriate vector bit operations, it is better to use them
-  // than unrolling and expanding each component.
-  if (isPowerOf2_32(NumBitsPerElt) &&
-      TLI.isOperationLegalOrCustom(ISD::CTPOP, VT) &&
-      TLI.isOperationLegalOrCustom(ISD::SRL, VT) &&
-      TLI.isOperationLegalOrCustomOrPromote(ISD::OR, VT))
-    return Op;
+  // Attempt to expand using TargetLowering.
+  SDValue Result;
+  if (TLI.expandCTLZ(Op.getNode(), Result, DAG))
+    return Result;
 
   // Otherwise go ahead and unroll.
   return DAG.UnrollVectorOp(Op.getNode());
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index b9b0941903b..4e7094bf210 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -4142,6 +4142,59 @@ SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node,
   return SDValue();
 }
 
+bool TargetLowering::expandCTLZ(SDNode *Node, SDValue &Result,
+                                SelectionDAG &DAG) const {
+  SDLoc dl(Node);
+  EVT VT = Node->getValueType(0);
+  EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout());
+  SDValue Op = Node->getOperand(0);
+  unsigned NumBitsPerElt = VT.getScalarSizeInBits();
+
+  // If the non-ZERO_UNDEF version is supported we can use that instead.
+  if (Node->getOpcode() == ISD::CTLZ_ZERO_UNDEF &&
+      isOperationLegalOrCustom(ISD::CTLZ, VT)) {
+    Result = DAG.getNode(ISD::CTLZ, dl, VT, Op);
+    return true;
+  }
+
+  // If the ZERO_UNDEF version is supported use that and handle the zero case.
+  if (isOperationLegalOrCustom(ISD::CTLZ_ZERO_UNDEF, VT)) {
+    EVT SetCCVT =
+        getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+    SDValue CTLZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, dl, VT, Op);
+    SDValue Zero = DAG.getConstant(0, dl, VT);
+    SDValue SrcIsZero = DAG.getSetCC(dl, SetCCVT, Op, Zero, ISD::SETEQ);
+    Result = DAG.getNode(ISD::SELECT, dl, VT, SrcIsZero,
+                         DAG.getConstant(NumBitsPerElt, dl, VT), CTLZ);
+    return true;
+  }
+
+  // Only expand vector types if we have the appropriate vector bit operations.
+  if (VT.isVector() && (!isPowerOf2_32(NumBitsPerElt) ||
+                        !isOperationLegalOrCustom(ISD::CTPOP, VT) ||
+                        !isOperationLegalOrCustom(ISD::SRL, VT) ||
+                        !isOperationLegalOrCustomOrPromote(ISD::OR, VT)))
+    return false;
+
+  // for now, we do this:
+  // x = x | (x >> 1);
+  // x = x | (x >> 2);
+  // ...
+  // x = x | (x >>16);
+  // x = x | (x >>32); // for 64-bit input
+  // return popcount(~x);
+  //
+  // Ref: "Hacker's Delight" by Henry Warren
+  for (unsigned i = 0; (1U << i) <= (NumBitsPerElt / 2); ++i) {
+    SDValue Tmp = DAG.getConstant(1ULL << i, dl, ShVT);
+    Op = DAG.getNode(ISD::OR, dl, VT, Op,
+                     DAG.getNode(ISD::SRL, dl, VT, Op, Tmp));
+  }
+  Op = DAG.getNOT(dl, Op, VT);
+  Result = DAG.getNode(ISD::CTPOP, dl, VT, Op);
+  return true;
+}
+
 bool TargetLowering::expandCTTZ(SDNode *Node, SDValue &Result,
                                 SelectionDAG &DAG) const {
   SDLoc dl(Node);
diff --git a/test/CodeGen/X86/vec_ctbits.ll b/test/CodeGen/X86/vec_ctbits.ll
index 002bcebdf71..26330f940af 100644
--- a/test/CodeGen/X86/vec_ctbits.ll
+++ b/test/CodeGen/X86/vec_ctbits.ll
@@ -140,42 +140,42 @@ define <2 x i32> @promlz(<2 x i32> %a) nounwind {
 ; CHECK-LABEL: promlz:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-NEXT:    psrlq $1, %xmm1
-; CHECK-NEXT:    por %xmm0, %xmm1
-; CHECK-NEXT:    movdqa %xmm1, %xmm0
+; CHECK-NEXT:    pxor %xmm1, %xmm1
+; CHECK-NEXT:    movdqa %xmm0, %xmm2
+; CHECK-NEXT:    psrlq $1, %xmm2
+; CHECK-NEXT:    por %xmm0, %xmm2
+; CHECK-NEXT:    movdqa %xmm2, %xmm0
 ; CHECK-NEXT:    psrlq $2, %xmm0
-; CHECK-NEXT:    por %xmm1, %xmm0
-; CHECK-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-NEXT:    psrlq $4, %xmm1
-; CHECK-NEXT:    por %xmm0, %xmm1
-; CHECK-NEXT:    movdqa %xmm1, %xmm0
+; CHECK-NEXT:    por %xmm2, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, %xmm2
+; CHECK-NEXT:    psrlq $4, %xmm2
+; CHECK-NEXT:    por %xmm0, %xmm2
+; CHECK-NEXT:    movdqa %xmm2, %xmm0
 ; CHECK-NEXT:    psrlq $8, %xmm0
-; CHECK-NEXT:    por %xmm1, %xmm0
-; CHECK-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-NEXT:    psrlq $16, %xmm1
-; CHECK-NEXT:    por %xmm0, %xmm1
-; CHECK-NEXT:    movdqa %xmm1, %xmm0
+; CHECK-NEXT:    por %xmm2, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, %xmm2
+; CHECK-NEXT:    psrlq $16, %xmm2
+; CHECK-NEXT:    por %xmm0, %xmm2
+; CHECK-NEXT:    movdqa %xmm2, %xmm0
 ; CHECK-NEXT:    psrlq $32, %xmm0
-; CHECK-NEXT:    por %xmm1, %xmm0
-; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
-; CHECK-NEXT:    pxor %xmm0, %xmm1
-; CHECK-NEXT:    movdqa %xmm1, %xmm0
+; CHECK-NEXT:    por %xmm2, %xmm0
+; CHECK-NEXT:    pcmpeqd %xmm2, %xmm2
+; CHECK-NEXT:    pxor %xmm0, %xmm2
+; CHECK-NEXT:    movdqa %xmm2, %xmm0
 ; CHECK-NEXT:    psrlw $1, %xmm0
 ; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    psubb %xmm0, %xmm1
+; CHECK-NEXT:    psubb %xmm0, %xmm2
 ; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; CHECK-NEXT:    movdqa %xmm1, %xmm2
+; CHECK-NEXT:    movdqa %xmm2, %xmm3
+; CHECK-NEXT:    pand %xmm0, %xmm3
+; CHECK-NEXT:    psrlw $2, %xmm2
 ; CHECK-NEXT:    pand %xmm0, %xmm2
-; CHECK-NEXT:    psrlw $2, %xmm1
-; CHECK-NEXT:    pand %xmm0, %xmm1
-; CHECK-NEXT:    paddb %xmm2, %xmm1
-; CHECK-NEXT:    movdqa %xmm1, %xmm2
-; CHECK-NEXT:    psrlw $4, %xmm2
-; CHECK-NEXT:    paddb %xmm1, %xmm2
-; CHECK-NEXT:    pand {{.*}}(%rip), %xmm2
-; CHECK-NEXT:    pxor %xmm0, %xmm0
-; CHECK-NEXT:    psadbw %xmm2, %xmm0
+; CHECK-NEXT:    paddb %xmm3, %xmm2
+; CHECK-NEXT:    movdqa %xmm2, %xmm0
+; CHECK-NEXT:    psrlw $4, %xmm0
+; CHECK-NEXT:    paddb %xmm2, %xmm0
+; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    psadbw %xmm1, %xmm0
 ; CHECK-NEXT:    psubq {{.*}}(%rip), %xmm0
 ; CHECK-NEXT:    retq
   %c = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false)
-- 
GitLab


From 2ea19c8e668c79236626a78e640199c9bff4b119 Mon Sep 17 00:00:00 2001
From: Roman Lebedev <lebedev.ri@gmail.com>
Date: Tue, 23 Oct 2018 18:27:10 +0000
Subject: [PATCH 0462/1116] X86DAGToDAGISel::matchBitExtract(): lambdas can't
 have default arguments.

As reported by ctopper.
That is a gcc-only warning at the moment.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345065 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelDAGToDAG.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 73abdd80dc6..4b803c5a81b 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -2709,10 +2709,12 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
   // If we have BMI2's BZHI, we are ok with muti-use patterns.
   // Else, if we only have BMI1's BEXTR, we require one-use.
   const bool CanHaveExtraUses = Subtarget->hasBMI2();
-  auto checkOneUse = [CanHaveExtraUses](SDValue Op, unsigned NUses = 1) {
+  auto checkUses = [CanHaveExtraUses](SDValue Op, unsigned NUses) {
     return CanHaveExtraUses ||
            Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());
   };
+  auto checkOneUse = [checkUses](SDValue Op) { return checkUses(Op, 1); };
+  auto checkTwoUse = [checkUses](SDValue Op) { return checkUses(Op, 2); };
 
   // a) x & ((1 << nbits) + (-1))
   auto matchPatternA = [&checkOneUse, &NBits](SDValue Mask) -> bool {
@@ -2750,7 +2752,8 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
   SDValue X;
 
   // d) x << (32 - y) >> (32 - y)
-  auto matchPatternD = [&checkOneUse, Size, &X, &NBits](SDNode *Node) -> bool {
+  auto matchPatternD = [&checkOneUse, &checkTwoUse, Size, &X,
+                        &NBits](SDNode *Node) -> bool {
     if (Node->getOpcode() != ISD::SRL)
       return false;
     SDValue N0 = Node->getOperand(0);
@@ -2760,7 +2763,7 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
     SDValue N01 = N0->getOperand(1);
     // Both of the shifts must be by the exact same value.
     // There should not be any uses of the shift amount outside of the pattern.
-    if (N1 != N01 || !checkOneUse(N1, 2))
+    if (N1 != N01 || !checkTwoUse(N1))
       return false;
     // Skip over a truncate of the shift amount.
     if (N1->getOpcode() == ISD::TRUNCATE) {
-- 
GitLab


From d537407cfe068087f21357c78bf7b4168b84f557 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 23 Oct 2018 18:28:24 +0000
Subject: [PATCH 0463/1116] [LegalizeDAG] Share Vector/Scalar CTPOP Expansion

As suggested on D53258, this patch move the CTPOP expansion code from SelectionDAGLegalize to TargetLowering to allow it to be reused by the VectorLegalizer.

Proper vector support will be added by D53258.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345066 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/TargetLowering.h       |  7 +++
 lib/CodeGen/SelectionDAG/LegalizeDAG.cpp    | 60 +--------------------
 lib/CodeGen/SelectionDAG/TargetLowering.cpp | 49 +++++++++++++++++
 3 files changed, 58 insertions(+), 58 deletions(-)

diff --git a/include/llvm/CodeGen/TargetLowering.h b/include/llvm/CodeGen/TargetLowering.h
index 8545da55f78..93a08347964 100644
--- a/include/llvm/CodeGen/TargetLowering.h
+++ b/include/llvm/CodeGen/TargetLowering.h
@@ -3647,6 +3647,13 @@ public:
   /// Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
   SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const;
 
+  /// Expand CTPOP nodes. Expands vector/scalar CTPOP nodes,
+  /// vector nodes can only succeed if all operations are legal/custom.
+  /// \param N Node to expand
+  /// \param Result output after conversion
+  /// \returns True, if the expansion was successful, false otherwise
+  bool expandCTPOP(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;
+
   /// Expand CTLZ/CTLZ_ZERO_UNDEF nodes. Expands vector/scalar CTLZ nodes,
   /// vector nodes can only succeed if all operations are legal/custom.
   /// \param N Node to expand
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index c8d843e54c3..cfc4d13b383 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -176,7 +176,6 @@ private:
 
   SDValue ExpandBITREVERSE(SDValue Op, const SDLoc &dl);
   SDValue ExpandBSWAP(SDValue Op, const SDLoc &dl);
-  SDValue ExpandBitCount(unsigned Opc, SDValue Op, const SDLoc &dl);
 
   SDValue ExpandExtractFromVectorThroughStack(SDValue Op);
   SDValue ExpandInsertToVectorThroughStack(SDValue Op);
@@ -2707,61 +2706,6 @@ SDValue SelectionDAGLegalize::ExpandBSWAP(SDValue Op, const SDLoc &dl) {
   }
 }
 
-/// Expand the specified bitcount instruction into operations.
-SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op,
-                                             const SDLoc &dl) {
-  EVT VT = Op.getValueType();
-  EVT ShVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
-  unsigned Len = VT.getScalarSizeInBits();
-
-  switch (Opc) {
-  default: llvm_unreachable("Cannot expand this yet!");
-  case ISD::CTPOP: {
-    assert(VT.isInteger() && Len <= 128 && Len % 8 == 0 &&
-           "CTPOP not implemented for this type.");
-
-    // This is the "best" algorithm from
-    // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
-
-    SDValue Mask55 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)),
-                                     dl, VT);
-    SDValue Mask33 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)),
-                                     dl, VT);
-    SDValue Mask0F = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)),
-                                     dl, VT);
-    SDValue Mask01 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x01)),
-                                     dl, VT);
-
-    // v = v - ((v >> 1) & 0x55555555...)
-    Op = DAG.getNode(ISD::SUB, dl, VT, Op,
-                     DAG.getNode(ISD::AND, dl, VT,
-                                 DAG.getNode(ISD::SRL, dl, VT, Op,
-                                             DAG.getConstant(1, dl, ShVT)),
-                                 Mask55));
-    // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
-    Op = DAG.getNode(ISD::ADD, dl, VT,
-                     DAG.getNode(ISD::AND, dl, VT, Op, Mask33),
-                     DAG.getNode(ISD::AND, dl, VT,
-                                 DAG.getNode(ISD::SRL, dl, VT, Op,
-                                             DAG.getConstant(2, dl, ShVT)),
-                                 Mask33));
-    // v = (v + (v >> 4)) & 0x0F0F0F0F...
-    Op = DAG.getNode(ISD::AND, dl, VT,
-                     DAG.getNode(ISD::ADD, dl, VT, Op,
-                                 DAG.getNode(ISD::SRL, dl, VT, Op,
-                                             DAG.getConstant(4, dl, ShVT))),
-                     Mask0F);
-    // v = (v * 0x01010101...) >> (Len - 8)
-    if (Len > 8)
-      Op = DAG.getNode(ISD::SRL, dl, VT,
-                       DAG.getNode(ISD::MUL, dl, VT, Op, Mask01),
-                       DAG.getConstant(Len - 8, dl, ShVT));
-
-    return Op;
-  }
-  }
-}
-
 bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
   LLVM_DEBUG(dbgs() << "Trying to expand node\n");
   SmallVector<SDValue, 8> Results;
@@ -2770,8 +2714,8 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
   bool NeedInvert;
   switch (Node->getOpcode()) {
   case ISD::CTPOP:
-    Tmp1 = ExpandBitCount(Node->getOpcode(), Node->getOperand(0), dl);
-    Results.push_back(Tmp1);
+    if (TLI.expandCTPOP(Node, Tmp1, DAG))
+      Results.push_back(Tmp1);
     break;
   case ISD::CTLZ:
   case ISD::CTLZ_ZERO_UNDEF:
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 4e7094bf210..017db41fa9e 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -4142,6 +4142,55 @@ SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node,
   return SDValue();
 }
 
+bool TargetLowering::expandCTPOP(SDNode *Node, SDValue &Result,
+                                 SelectionDAG &DAG) const {
+  SDLoc dl(Node);
+  EVT VT = Node->getValueType(0);
+  EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout());
+  SDValue Op = Node->getOperand(0);
+  unsigned Len = VT.getScalarSizeInBits();
+  assert(VT.isInteger() && Len <= 128 && Len % 8 == 0 &&
+         "CTPOP not implemented for this type.");
+
+  // This is the "best" algorithm from
+  // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+  SDValue Mask55 =
+      DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), dl, VT);
+  SDValue Mask33 =
+      DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), dl, VT);
+  SDValue Mask0F =
+      DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), dl, VT);
+  SDValue Mask01 =
+      DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x01)), dl, VT);
+
+  // v = v - ((v >> 1) & 0x55555555...)
+  Op = DAG.getNode(ISD::SUB, dl, VT, Op,
+                   DAG.getNode(ISD::AND, dl, VT,
+                               DAG.getNode(ISD::SRL, dl, VT, Op,
+                                           DAG.getConstant(1, dl, ShVT)),
+                               Mask55));
+  // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
+  Op = DAG.getNode(ISD::ADD, dl, VT, DAG.getNode(ISD::AND, dl, VT, Op, Mask33),
+                   DAG.getNode(ISD::AND, dl, VT,
+                               DAG.getNode(ISD::SRL, dl, VT, Op,
+                                           DAG.getConstant(2, dl, ShVT)),
+                               Mask33));
+  // v = (v + (v >> 4)) & 0x0F0F0F0F...
+  Op = DAG.getNode(ISD::AND, dl, VT,
+                   DAG.getNode(ISD::ADD, dl, VT, Op,
+                               DAG.getNode(ISD::SRL, dl, VT, Op,
+                                           DAG.getConstant(4, dl, ShVT))),
+                   Mask0F);
+  // v = (v * 0x01010101...) >> (Len - 8)
+  if (Len > 8)
+    Op =
+        DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::MUL, dl, VT, Op, Mask01),
+                    DAG.getConstant(Len - 8, dl, ShVT));
+
+  Result = Op;
+  return true;
+}
+
 bool TargetLowering::expandCTLZ(SDNode *Node, SDValue &Result,
                                 SelectionDAG &DAG) const {
   SDLoc dl(Node);
-- 
GitLab


From 863ec6dadfe1a7b80f7d00473d016dfbafef2e53 Mon Sep 17 00:00:00 2001
From: Jordan Rupprecht <rupprecht@google.com>
Date: Tue, 23 Oct 2018 18:46:33 +0000
Subject: [PATCH 0464/1116] [llvm-strip] Support -s alias for --strip-all. Make
 both strip and objcopy case sensitive to support both -s (--strip-all) and -S
 (--strip-debug).

Summary:
GNU strip supports both `-s` and `-S` as aliases for `--strip-all` and `--strip-debug`, respectfully.

As part of this, it turns out that strip/objcopy were accepting case insensitive command line args. I'm not sure if there was an explicit reason for this. The only others uses of this are llvm-cvtres/llvm-mt/llvm-lib, which are all tools specific for windows support. Forcing case sensitivity allows both aliases to exist, but seems like a good idea anyway.

And as a surprise test case adjustment, the llvm-strip unit test was running with `-keep=unavailable_symbol`, despite `keep` not be a valid flag for strip. This is because there is a flag `-K` which, when case insensitivity is permitted, allows it to be interpreted as `-K` = `eep=unavailable_symbol` (e.g. to allow `-Kfoo` == `--keep-symbol=foo`).

Reviewers: jakehehrlich, jhenderson, alexshap

Reviewed By: jakehehrlich

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D53163

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345068 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/tools/llvm-objcopy/strip-all.test | 12 ++++++++----
 tools/llvm-objcopy/CopyConfig.cpp      |  4 ++--
 tools/llvm-objcopy/StripOpts.td        |  3 +++
 3 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/test/tools/llvm-objcopy/strip-all.test b/test/tools/llvm-objcopy/strip-all.test
index 8c0f7489134..5c5b6fd374f 100644
--- a/test/tools/llvm-objcopy/strip-all.test
+++ b/test/tools/llvm-objcopy/strip-all.test
@@ -39,12 +39,16 @@
 # RUN: llvm-objcopy -S %t9 %t9
 # RUN: cmp %t2 %t9
 
+# RUN: cp %t %t10
+# RUN: llvm-strip -s %t10
+# RUN: cmp %t2 %t10
+
 # Verify that a non-existent symbol table (after first call to llvm-strip)
 # can be handled correctly.
-# RUN: cp %t %t9
-# RUN: llvm-strip --strip-all -keep=unavailable_symbol %t9
-# RUN: llvm-strip --strip-all -keep=unavailable_symbol %t9
-# RUN: cmp %t2 %t9
+# RUN: cp %t %t11
+# RUN: llvm-strip --strip-all --keep-symbol=unavailable_symbol %t11
+# RUN: llvm-strip --strip-all --keep-symbol=unavailable_symbol %t11
+# RUN: cmp %t2 %t11
 
 !ELF
 FileHeader:
diff --git a/tools/llvm-objcopy/CopyConfig.cpp b/tools/llvm-objcopy/CopyConfig.cpp
index 2c3551ba026..9746110cd45 100644
--- a/tools/llvm-objcopy/CopyConfig.cpp
+++ b/tools/llvm-objcopy/CopyConfig.cpp
@@ -61,7 +61,7 @@ static const opt::OptTable::Info ObjcopyInfoTable[] = {
 
 class ObjcopyOptTable : public opt::OptTable {
 public:
-  ObjcopyOptTable() : OptTable(ObjcopyInfoTable, true) {}
+  ObjcopyOptTable() : OptTable(ObjcopyInfoTable) {}
 };
 
 enum StripID {
@@ -90,7 +90,7 @@ static const opt::OptTable::Info StripInfoTable[] = {
 
 class StripOptTable : public opt::OptTable {
 public:
-  StripOptTable() : OptTable(StripInfoTable, true) {}
+  StripOptTable() : OptTable(StripInfoTable) {}
 };
 
 enum SectionFlag {
diff --git a/tools/llvm-objcopy/StripOpts.td b/tools/llvm-objcopy/StripOpts.td
index 821dfa3b277..b155933616d 100644
--- a/tools/llvm-objcopy/StripOpts.td
+++ b/tools/llvm-objcopy/StripOpts.td
@@ -19,6 +19,9 @@ def p : Flag<[ "-" ], "p">, Alias<preserve_dates>;
 def strip_all : Flag<["-", "--"], "strip-all">,
                 HelpText<"Remove non-allocated sections other than .gnu.warning* sections">;
 
+def s : Flag<["-"], "s">,
+        Alias<strip_all>;
+
 def strip_debug : Flag<["-", "--"], "strip-debug">,
                   HelpText<"Remove debugging symbols only">;
 
-- 
GitLab


From 75e42f666e05273089389ada06bddfcd39cb42da Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 23 Oct 2018 19:07:53 +0000
Subject: [PATCH 0465/1116] [X86][SSE] Revert rL343922 combinePMULDQ
 AddToWorklist (PR39398)

We can't add the MULDQ node back to the worklist after the demanded bits change has been committed in case the node has been removed entirely. This will have to wait until we have SimplifyDemandedBitsForTargetNode.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345070 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp          |  8 +--
 test/CodeGen/X86/combine-pmuldq.ll          | 75 +++++++++++++++++++--
 test/CodeGen/X86/urem-seteq-vec-nonsplat.ll | 68 +++++++++----------
 3 files changed, 107 insertions(+), 44 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index cc867070398..5e4796ca54d 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -40362,14 +40362,10 @@ static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
   APInt DemandedMask(APInt::getLowBitsSet(64, 32));
 
   // PMULQDQ/PMULUDQ only uses lower 32 bits from each vector element.
-  if (TLI.SimplifyDemandedBits(LHS, DemandedMask, DCI)) {
-    DCI.AddToWorklist(N);
+  if (TLI.SimplifyDemandedBits(LHS, DemandedMask, DCI))
     return SDValue(N, 0);
-  }
-  if (TLI.SimplifyDemandedBits(RHS, DemandedMask, DCI)) {
-    DCI.AddToWorklist(N);
+  if (TLI.SimplifyDemandedBits(RHS, DemandedMask, DCI))
     return SDValue(N, 0);
-  }
 
   return SDValue();
 }
diff --git a/test/CodeGen/X86/combine-pmuldq.ll b/test/CodeGen/X86/combine-pmuldq.ll
index c735b204344..edc6cb01d97 100644
--- a/test/CodeGen/X86/combine-pmuldq.ll
+++ b/test/CodeGen/X86/combine-pmuldq.ll
@@ -47,10 +47,26 @@ define <2 x i64> @combine_shuffle_zero_pmuludq(<4 x i32> %a0, <4 x i32> %a1) {
 ; SSE-NEXT:    pmuludq %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: combine_shuffle_zero_pmuludq:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX2-LABEL: combine_shuffle_zero_pmuludq:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512VL-LABEL: combine_shuffle_zero_pmuludq:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX512VL-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQVL-LABEL: combine_shuffle_zero_pmuludq:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512DQVL-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX512DQVL-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; AVX512DQVL-NEXT:    retq
   %1 = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
   %2 = shufflevector <4 x i32> %a1, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
   %3 = bitcast <4 x i32> %1 to <2 x i64>
@@ -68,16 +84,22 @@ define <4 x i64> @combine_shuffle_zero_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1)
 ;
 ; AVX2-LABEL: combine_shuffle_zero_pmuludq_256:
 ; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
 ; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VL-LABEL: combine_shuffle_zero_pmuludq_256:
 ; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
 ; AVX512VL-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512DQVL-LABEL: combine_shuffle_zero_pmuludq_256:
 ; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512DQVL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
 ; AVX512DQVL-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
 ; AVX512DQVL-NEXT:    retq
   %1 = shufflevector <8 x i32> %a0, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
@@ -130,3 +152,48 @@ define <8 x i64> @combine_zext_pmuludq_256(<8 x i32> %a) {
   %2 = mul nuw nsw <8 x i64> %1, <i64 715827883, i64 715827883, i64 715827883, i64 715827883, i64 715827883, i64 715827883, i64 715827883, i64 715827883>
   ret <8 x i64> %2
 }
+
+define void @PR39398() {
+; SSE-LABEL: PR39398:
+; SSE:       # %bb.0: # %bb
+; SSE-NEXT:    .p2align 4, 0x90
+; SSE-NEXT:  .LBB5_1: # %bb10
+; SSE-NEXT:    # =>This Inner Loop Header: Depth=1
+; SSE-NEXT:    cmpl $232, %eax
+; SSE-NEXT:    jne .LBB5_1
+; SSE-NEXT:  # %bb.2: # %bb34
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: PR39398:
+; AVX:       # %bb.0: # %bb
+; AVX-NEXT:    .p2align 4, 0x90
+; AVX-NEXT:  .LBB5_1: # %bb10
+; AVX-NEXT:    # =>This Inner Loop Header: Depth=1
+; AVX-NEXT:    cmpl $232, %eax
+; AVX-NEXT:    jne .LBB5_1
+; AVX-NEXT:  # %bb.2: # %bb34
+; AVX-NEXT:    retq
+bb:
+  %tmp9 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> zeroinitializer
+  br label %bb10
+
+bb10:                                             ; preds = %bb10, %bb
+  %tmp12 = phi <4 x i32> [ <i32 9, i32 8, i32 7, i32 6>, %bb ], [ zeroinitializer, %bb10 ]
+  %tmp16 = add <4 x i32> %tmp12, <i32 -4, i32 -4, i32 -4, i32 -4>
+  %tmp18 = zext <4 x i32> %tmp12 to <4 x i64>
+  %tmp19 = zext <4 x i32> %tmp16 to <4 x i64>
+  %tmp20 = xor <4 x i64> %tmp18, <i64 -1, i64 -1, i64 -1, i64 -1>
+  %tmp21 = xor <4 x i64> %tmp19, <i64 -1, i64 -1, i64 -1, i64 -1>
+  %tmp24 = mul <4 x i64> %tmp9, %tmp20
+  %tmp25 = mul <4 x i64> %tmp9, %tmp21
+  %tmp26 = select <4 x i1> undef, <4 x i64> zeroinitializer, <4 x i64> %tmp24
+  %tmp27 = select <4 x i1> undef, <4 x i64> zeroinitializer, <4 x i64> %tmp25
+  %tmp28 = add <4 x i64> zeroinitializer, %tmp26
+  %tmp29 = add <4 x i64> zeroinitializer, %tmp27
+  %tmp33 = icmp eq i32 undef, 232
+  br i1 %tmp33, label %bb34, label %bb10
+
+bb34:                                             ; preds = %bb10
+  %tmp35 = add <4 x i64> %tmp29, %tmp28
+  ret void
+}
diff --git a/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
index 82385386c88..9f339a8a555 100644
--- a/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
+++ b/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
@@ -143,31 +143,31 @@ define <4 x i32> @test_urem_odd_div(<4 x i32> %X) nounwind readnone {
 define <4 x i32> @test_urem_even_div(<4 x i32> %X) nounwind readnone {
 ; CHECK-SSE2-LABEL: test_urem_even_div:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2863311531,3435973837,2863311531,2454267027]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,2,0]
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2863311531,3435973837,2863311531,2454267027]
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
 ; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm3
 ; CHECK-SSE2-NEXT:    psrld $1, %xmm3
 ; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm4
 ; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm1
-; CHECK-SSE2-NEXT:    psrld $2, %xmm1
-; CHECK-SSE2-NEXT:    psrld $3, %xmm2
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm2
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; CHECK-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    psrld $2, %xmm2
+; CHECK-SSE2-NEXT:    psrld $3, %xmm1
+; CHECK-SSE2-NEXT:    movdqa %xmm1, %xmm3
+; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3]
 ; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [6,10,12,14]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
 ; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm5
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3]
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm2[1,2]
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[1,2]
+; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,3,1]
+; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm2
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
 ; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
 ; CHECK-SSE2-NEXT:    psubd %xmm1, %xmm0
 ; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
@@ -377,30 +377,30 @@ define <4 x i32> @test_urem_pow2(<4 x i32> %X) nounwind readnone {
 define <4 x i32> @test_urem_one(<4 x i32> %X) nounwind readnone {
 ; CHECK-SSE2-LABEL: test_urem_one:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2863311531,0,2863311531,2454267027]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,2,0]
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2863311531,0,2863311531,2454267027]
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
 ; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm3
 ; CHECK-SSE2-NEXT:    psrld $1, %xmm3
 ; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm4
 ; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm1
-; CHECK-SSE2-NEXT:    psrld $2, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm2
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; CHECK-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    psrld $2, %xmm2
 ; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm3
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0]
-; CHECK-SSE2-NEXT:    psrld $3, %xmm2
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[2,3]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [6,1,12,14]
-; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm3
+; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[0,0]
+; CHECK-SSE2-NEXT:    psrld $3, %xmm1
+; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3]
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [6,1,12,14]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm3
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
 ; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm4
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm2[3,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm1
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-- 
GitLab


From fda29c9e709cddb7edd7ca95ee6bd5a7e7947e6d Mon Sep 17 00:00:00 2001
From: Vedant Kumar <vsk@apple.com>
Date: Tue, 23 Oct 2018 19:41:12 +0000
Subject: [PATCH 0466/1116] [HotColdSplitting] Attach MinSize to outlined code

Outlined code is cold by assumption, so it makes sense to optimize it
for minimal code size rather than performance.

After r344869 moved the splitting pass to the end of the IR pipeline,
this does not result in much of a code size reduction. This is probably
because a comparatively small number backend transforms make use of the
MinSize hint.

Running LNT on x86_64, I see that 33/1020 binaries shrink for a total of
919 bytes of TEXT reduction. I didn't measure a significant performance
impact.

Differential Revision: https://reviews.llvm.org/D53518

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345072 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/IPO/HotColdSplitting.cpp |  7 ++++++
 test/Transforms/HotColdSplit/minsize.ll | 32 +++++++++++++++++++++++++
 2 files changed, 39 insertions(+)
 create mode 100644 test/Transforms/HotColdSplit/minsize.ll

diff --git a/lib/Transforms/IPO/HotColdSplitting.cpp b/lib/Transforms/IPO/HotColdSplitting.cpp
index 1d804ccb767..d3e086e972a 100644
--- a/lib/Transforms/IPO/HotColdSplitting.cpp
+++ b/lib/Transforms/IPO/HotColdSplitting.cpp
@@ -360,6 +360,13 @@ HotColdSplitting::extractColdRegion(const SmallVectorImpl<BasicBlock *> &Region,
       CS.setCallingConv(CallingConv::Cold);
     }
     CI->setIsNoInline();
+
+    // Try to make the outlined code as small as possible on the assumption
+    // that it's cold.
+    assert(!OutF->hasFnAttribute(Attribute::OptimizeNone) &&
+           "An outlined function should never be marked optnone");
+    OutF->addFnAttr(Attribute::MinSize);
+
     LLVM_DEBUG(llvm::dbgs() << "Outlined Region: " << *OutF);
     ORE.emit([&]() {
       return OptimizationRemark(DEBUG_TYPE, "HotColdSplit",
diff --git a/test/Transforms/HotColdSplit/minsize.ll b/test/Transforms/HotColdSplit/minsize.ll
new file mode 100644
index 00000000000..f7509bf3c02
--- /dev/null
+++ b/test/Transforms/HotColdSplit/minsize.ll
@@ -0,0 +1,32 @@
+; RUN: opt -hotcoldsplit -S < %s | FileCheck %s
+
+; CHECK-LABEL: @fun
+; CHECK: codeRepl:
+; CHECK-NEXT: call void @fun_if.else
+
+define void @fun() {
+entry:
+  br i1 undef, label %if.then, label %if.else
+
+if.then:
+  ret void
+
+if.else:
+  br label %if.then4
+
+if.then4:
+  br i1 undef, label %if.then5, label %if.end
+
+if.then5:
+  br label %cleanup
+
+if.end:
+  br label %cleanup
+
+cleanup:
+  %cleanup.dest.slot.0 = phi i32 [ 1, %if.then5 ], [ 0, %if.end ]
+  unreachable
+}
+
+; CHECK: define {{.*}} @fun_if.else{{.*}}#[[outlined_func_attr:[0-9]+]]
+; CHECK: attributes #[[outlined_func_attr]] = { {{.*}}minsize
-- 
GitLab


From 39f6fac74bc4036f432dcb3294fb0cbe1b0f1e13 Mon Sep 17 00:00:00 2001
From: Michael Kruse <llvm@meinersbur.de>
Date: Tue, 23 Oct 2018 19:46:29 +0000
Subject: [PATCH 0467/1116] [test-suite/doc] Add list of programs we might add.

Add a list of benchmarks, applications and algorithms which are under
discussion to be added to the test-suite.

The initial list includes the the benchmarks mentioned at
https://llvm.org/PR34216, missing SPEC benchmarks, some image processing
algorithms and a few others. The bug tracker only allows adding to the
discussion, not removing, commenting, adding details to individual
benchmarks.

The first proposal was to add these benchmark into the test-suite
repository, but after a discussion, adding it to llvm/docs/Proposals
seem more appropriate. One advantage is that llvm.org will have a
browsable web page with these suggestions.

Suggested-by: Hal Finkel

Differential Revision: https://reviews.llvm.org/D46714

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345074 91177308-0d34-0410-b5e6-96231b3b80d8
---
 docs/Proposals/TestSuite.rst | 310 +++++++++++++++++++++++++++++++++++
 docs/index.rst               |   4 +
 2 files changed, 314 insertions(+)
 create mode 100644 docs/Proposals/TestSuite.rst

diff --git a/docs/Proposals/TestSuite.rst b/docs/Proposals/TestSuite.rst
new file mode 100644
index 00000000000..62fc137d686
--- /dev/null
+++ b/docs/Proposals/TestSuite.rst
@@ -0,0 +1,310 @@
+=====================
+Test-Suite Extentions
+=====================
+
+.. contents::
+   :depth: 1
+   :local:
+
+Abstract
+========
+
+These are ideas for additional programs, benchmarks, applications and
+algorithms that could be added to the LLVM Test-Suite.
+The test-suite could be much larger than it is now, which would help us
+detecting compiler errors (crashes, miscompiles) during development.
+
+Most probably, the reason why the programs below have not been added to
+the test-suite yet is that nobody has found time to do it. But there
+might be other issues as well, such as
+
+ * Licensing (Support can still be added as external module,
+              like for the SPEC benchmarks)
+
+ * Language (in particular, there is no official LLVM frontend
+             for FORTRAN yet)
+
+ * Parallelism (currently, all programs in test-suite use
+                one thread only)
+
+Benchmarks
+==========
+
+SPEC CPU 2017
+-------------
+https://www.spec.org/cpu2017/
+
+The following have not been included yet because they contain Fortran
+code.
+
+In case of cactuBSSN only a small portion is Fortran. The hosts's
+Fortran compiler could be used for these parts.
+
+Note that CMake's Ninja generator has difficulties with Fortran. See the
+`CMake documentation <https://cmake.org/cmake/help/v3.13/generator/Ninja.html#fortran-support>`_
+for details.
+
+ * 503.bwaves_r/603.bwaves_s
+ * 507.cactuBSSN_r
+ * 521.wrf_r/621.wrf_s
+ * 527.cam4_r/627.cam4_s
+ * 628.pop2_s
+ * 548.exchange2_r/648.exchange2_s
+ * 549.fotonik3d_r/649.fotonik3d_s
+ * 554.roms_r/654.roms_s
+
+SPEC OMP2012
+------------
+https://www.spec.org/omp2012/
+
+ * 350.md
+ * 351.bwaves
+ * 352.nab
+ * 357.bt331
+ * 358.botsalgn
+ * 359.botsspar
+ * 360.ilbdc
+ * 362.fma3d
+ * 363.swim
+ * 367.imagick
+ * 370.mgrid331
+ * 371.applu331
+ * 372.smithwa
+ * 376.kdtree
+
+OpenCV
+------
+https://opencv.org/
+
+OpenMP 4.x SIMD Benchmarks
+--------------------------
+https://github.com/flwende/simd_benchmarks
+
+PWM-benchmarking
+----------------
+https://github.com/tbepler/PWM-benchmarking
+
+SLAMBench
+---------
+https://github.com/pamela-project/slambench
+
+FireHose
+--------
+http://firehose.sandia.gov/
+
+A Benchmark for the C/C++ Standard Library
+------------------------------------------
+https://github.com/hiraditya/std-benchmark
+
+OpenBenchmarking.org CPU / Processor Suite
+------------------------------------------
+https://openbenchmarking.org/suite/pts/cpu
+
+This is a subset of the
+`Phoronix Test Suite <https://github.com/phoronix-test-suite/phoronix-test-suite/>`_
+and is itself a collection of benchmark suites
+
+Parboil Benchmarks
+------------------
+http://impact.crhc.illinois.edu/parboil/parboil.aspx
+
+MachSuite
+---------
+https://breagen.github.io/MachSuite/
+
+Rodinia
+-------
+http://lava.cs.virginia.edu/Rodinia/download_links.htm
+
+Rodinia has already been partially included in
+MultiSource/Benchmarks/Rodinia. Benchmarks still missing are:
+
+ * streamcluster
+ * particlefilter
+ * nw
+ * nn
+ * myocyte
+ * mummergpu
+ * lud
+ * leukocyte
+ * lavaMD
+ * kmeans
+ * hotspot3D
+ * heartwall
+ * cfd
+ * bfs
+ * b+tree
+
+vecmathlib tests harness
+------------------------
+https://bitbucket.org/eschnett/vecmathlib/wiki/Home
+
+PARSEC
+------
+http://parsec.cs.princeton.edu/
+
+Graph500 reference implementations
+----------------------------------
+https://github.com/graph500/graph500/tree/v2-spec
+
+NAS Parallel Benchmarks
+-----------------------
+https://www.nas.nasa.gov/publications/npb.html
+
+The official benchmark is written in Fortran, but an unofficial
+C-translation is available as well:
+https://github.com/benchmark-subsetting/NPB3.0-omp-C
+
+DARPA HPCS SSCA#2 C/OpenMP reference implementation
+---------------------------------------------------
+http://www.highproductivity.org/SSCABmks.htm
+
+This web site does not exist any more, but there seems to be a copy of
+some of the benchmarks
+https://github.com/gtcasl/hpc-benchmarks/tree/master/SSCA2v2.2
+
+Kokkos
+------
+https://github.com/kokkos/kokkos-kernels/tree/master/perf_test
+https://github.com/kokkos/kokkos/tree/master/benchmarks
+
+PolyMage
+--------
+https://github.com/bondhugula/polymage-benchmarks
+
+PolyBench
+---------
+https://sourceforge.net/projects/polybench/
+
+A modified version of Polybench 3.2 is already presented in
+SingleSource/Benchmarks/Polybench. A newer version 4.2.1 is available.
+
+High Performance Geometric Multigrid
+------------------------------------
+https://crd.lbl.gov/departments/computer-science/PAR/research/hpgmg/
+
+RAJA Performance Suite
+----------------------
+https://github.com/LLNL/RAJAPerf
+
+CORAL-2 Benchmarks
+------------------
+https://asc.llnl.gov/coral-2-benchmarks/
+
+Many of its programs have already been integreated in
+MultiSource/Benchmarks/DOE-ProxyApps-C and
+MultiSource/Benchmarks/DOE-ProxyApps-C++.
+
+ * Nekbone
+ * QMCPack
+ * LAMMPS
+ * Kripke
+ * Quicksilver
+ * PENNANT
+ * Big Data Analytic Suite
+ * Deep Learning Suite
+ * Stream
+ * Stride
+ * ML/DL micro-benchmark
+ * Pynamic
+ * ACME
+ * VPIC
+ * Laghos
+ * Parallel Integer Sort
+ * Havoq
+
+NWChem
+------
+http://www.nwchem-sw.org/index.php/Benchmarks
+
+TVM
+----
+https://github.com/dmlc/tvm/tree/master/apps/benchmark
+
+HydroBench
+----------
+https://github.com/HydroBench/Hydro
+
+ParRes
+------
+https://github.com/ParRes/Kernels/tree/master/Cxx11
+
+Applications/Libraries
+======================
+
+GnuPG
+-----
+https://gnupg.org/
+
+Blitz++
+-------
+https://sourceforge.net/projects/blitz/
+
+FFmpeg
+------
+https://ffmpeg.org/
+
+FreePOOMA
+---------
+http://www.nongnu.org/freepooma/
+
+FTensors
+--------
+http://www.wlandry.net/Projects/FTensor
+
+Generic Algorithms
+==================
+
+Image processing
+----------------
+
+Resampling
+``````````
+
+ * Bilinear
+ * Bicubic
+ * Lanczos
+
+Dither
+``````
+
+ * Threshold
+ * Random
+ * Halftone
+ * Bayer
+ * Floyd-Steinberg
+ * Jarvis
+ * Stucki
+ * Burkes
+ * Sierra
+ * Atkinson
+ * Gradient-based
+
+Feature detection
+`````````````````
+
+ * Harris
+ * Histogram of Oriented Gradients
+
+Color conversion
+````````````````
+
+ * RGB to grayscale
+ * HSL to RGB
+
+Graph
+-----
+
+Search Algorithms
+`````````````````
+
+ * Breadth-First-Search
+ * Depth-First-Search
+ * Dijkstra's algorithm
+ * A-Star
+
+Spanning Tree
+`````````````
+
+ * Kruskal's algorithm
+ * Prim's algorithm
diff --git a/docs/index.rst b/docs/index.rst
index 7edfdd24191..16d36866b5d 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -559,6 +559,7 @@ can be better.
 
    CodeOfConduct
    Proposals/GitHubMove
+   Proposals/TestSuite
    Proposals/VectorizationPlan
 
 :doc:`CodeOfConduct`
@@ -568,6 +569,9 @@ can be better.
 :doc:`Proposals/GitHubMove`
    Proposal to move from SVN/Git to GitHub.
 
+:doc:`Proposals/TestSuite`
+   Proposals for additional benchmarks/programs for llvm's test-suite.
+
 :doc:`Proposals/VectorizationPlan`
    Proposal to model the process and upgrade the infrastructure of LLVM's Loop Vectorizer.
 
-- 
GitLab


From e4c9c3925cc54cf13e6c99b68cafd15f50458d25 Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Tue, 23 Oct 2018 20:20:22 +0000
Subject: [PATCH 0468/1116] [ORC] Change how non-exported symbols are matched
 during lookup.

In the new scheme the client passes a list of (JITDylib&, bool) pairs, rather
than a list of JITDylibs. For each JITDylib the boolean indicates whether or not
to match against non-exported symbols (true means that they should be found,
false means that they should not). The MatchNonExportedInJD and MatchNonExported
parameters on lookup are removed.

The new scheme is more flexible, and easier to understand.

This patch also updates JITDylib search orders to be lists of (JITDylib&, bool)
pairs to match the new lookup scheme. Error handling is also plumbed through
the LLJIT class to allow regression tests to fail predictably when a lookup from
a lazy call-through fails.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345077 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/ExecutionEngine/JITSymbol.h      |  12 ++
 include/llvm/ExecutionEngine/Orc/Core.h       |  99 ++++++------
 include/llvm/ExecutionEngine/Orc/LLJIT.h      |   2 +-
 .../Orc/CompileOnDemandLayer.cpp              |  17 +-
 lib/ExecutionEngine/Orc/Core.cpp              | 151 ++++++++++--------
 lib/ExecutionEngine/Orc/ExecutionUtils.cpp    |   5 +-
 lib/ExecutionEngine/Orc/IndirectionUtils.cpp  |   2 +-
 lib/ExecutionEngine/Orc/LLJIT.cpp             |   8 +-
 lib/ExecutionEngine/Orc/LazyReexports.cpp     |   4 +-
 .../Orc/RTDyldObjectLinkingLayer.cpp          |   9 +-
 .../OrcLazy/Inputs/hidden-definitions.ll      |   6 +
 .../OrcLazy/hidden-visibility.ll              |  17 ++
 tools/lli/lli.cpp                             |  46 +++++-
 .../ExecutionEngine/Orc/CoreAPIsTest.cpp      |  61 +++----
 .../Orc/RTDyldObjectLinkingLayerTest.cpp      |   8 +-
 15 files changed, 277 insertions(+), 170 deletions(-)
 create mode 100644 test/ExecutionEngine/OrcLazy/Inputs/hidden-definitions.ll
 create mode 100644 test/ExecutionEngine/OrcLazy/hidden-visibility.ll

diff --git a/include/llvm/ExecutionEngine/JITSymbol.h b/include/llvm/ExecutionEngine/JITSymbol.h
index 18b972ed829..05c9590726d 100644
--- a/include/llvm/ExecutionEngine/JITSymbol.h
+++ b/include/llvm/ExecutionEngine/JITSymbol.h
@@ -40,6 +40,18 @@ class SymbolRef;
 /// Represents an address in the target process's address space.
 using JITTargetAddress = uint64_t;
 
+/// Convert a JITTargetAddress to a pointer.
+template <typename T> T jitTargetAddressToPointer(JITTargetAddress Addr) {
+  static_assert(std::is_pointer<T>::value, "T must be a pointer type");
+  uintptr_t IntPtr = static_cast<uintptr_t>(Addr);
+  assert(IntPtr == Addr && "JITTargetAddress value out of range for uintptr_t");
+  return reinterpret_cast<T>(IntPtr);
+}
+
+template <typename T> JITTargetAddress pointerToJITTargetAddress(T *Ptr) {
+  return static_cast<JITTargetAddress>(reinterpret_cast<uintptr_t>(Ptr));
+}
+
 /// Flags for symbols in the JIT.
 class JITSymbolFlags {
 public:
diff --git a/include/llvm/ExecutionEngine/Orc/Core.h b/include/llvm/ExecutionEngine/Orc/Core.h
index 2e56854340c..39d306e0bd4 100644
--- a/include/llvm/ExecutionEngine/Orc/Core.h
+++ b/include/llvm/ExecutionEngine/Orc/Core.h
@@ -54,8 +54,8 @@ using SymbolFlagsMap = DenseMap<SymbolStringPtr, JITSymbolFlags>;
 ///        symbols to be obtained for logging.
 using SymbolDependenceMap = DenseMap<JITDylib *, SymbolNameSet>;
 
-/// A list of JITDylib pointers.
-using JITDylibList = std::vector<JITDylib *>;
+/// A list of (JITDylib*, bool) pairs.
+using JITDylibSearchList = std::vector<std::pair<JITDylib *, bool>>;
 
 /// Render a SymbolStringPtr.
 raw_ostream &operator<<(raw_ostream &OS, const SymbolStringPtr &Sym);
@@ -85,8 +85,8 @@ raw_ostream &operator<<(raw_ostream &OS, const SymbolDependenceMap &Deps);
 /// Render a MaterializationUnit.
 raw_ostream &operator<<(raw_ostream &OS, const MaterializationUnit &MU);
 
-/// Render a JITDylibList.
-raw_ostream &operator<<(raw_ostream &OS, const JITDylibList &JDs);
+/// Render a JITDylibSearchList.
+raw_ostream &operator<<(raw_ostream &OS, const JITDylibSearchList &JDs);
 
 /// Callback to notify client that symbols have been resolved.
 using SymbolsResolvedCallback = std::function<void(Expected<SymbolMap>)>;
@@ -351,14 +351,15 @@ using SymbolAliasMap = DenseMap<SymbolStringPtr, SymbolAliasMapEntry>;
 class ReExportsMaterializationUnit : public MaterializationUnit {
 public:
   /// SourceJD is allowed to be nullptr, in which case the source JITDylib is
-  /// taken to be whatever JITDylib these definitions are materialized in. This
-  /// is useful for defining aliases within a JITDylib.
+  /// taken to be whatever JITDylib these definitions are materialized in (and
+  /// MatchNonExported has no effect). This is useful for defining aliases
+  /// within a JITDylib.
   ///
   /// Note: Care must be taken that no sets of aliases form a cycle, as such
   ///       a cycle will result in a deadlock when any symbol in the cycle is
   ///       resolved.
-  ReExportsMaterializationUnit(JITDylib *SourceJD, SymbolAliasMap Aliases,
-                               VModuleKey K);
+  ReExportsMaterializationUnit(JITDylib *SourceJD, bool MatchNonExported,
+                               SymbolAliasMap Aliases, VModuleKey K);
 
   StringRef getName() const override;
 
@@ -368,6 +369,7 @@ private:
   static SymbolFlagsMap extractFlags(const SymbolAliasMap &Aliases);
 
   JITDylib *SourceJD = nullptr;
+  bool MatchNonExported = false;
   SymbolAliasMap Aliases;
 };
 
@@ -385,16 +387,19 @@ private:
 inline std::unique_ptr<ReExportsMaterializationUnit>
 symbolAliases(SymbolAliasMap Aliases, VModuleKey K = VModuleKey()) {
   return llvm::make_unique<ReExportsMaterializationUnit>(
-      nullptr, std::move(Aliases), std::move(K));
+      nullptr, true, std::move(Aliases), std::move(K));
 }
 
 /// Create a materialization unit for re-exporting symbols from another JITDylib
 /// with alternative names/flags.
+/// If MatchNonExported is true then non-exported symbols from SourceJD can be
+/// re-exported. If it is false, attempts to re-export a non-exported symbol
+/// will result in a "symbol not found" error.
 inline std::unique_ptr<ReExportsMaterializationUnit>
 reexports(JITDylib &SourceJD, SymbolAliasMap Aliases,
-          VModuleKey K = VModuleKey()) {
+          bool MatchNonExported = false, VModuleKey K = VModuleKey()) {
   return llvm::make_unique<ReExportsMaterializationUnit>(
-      &SourceJD, std::move(Aliases), std::move(K));
+      &SourceJD, MatchNonExported, std::move(Aliases), std::move(K));
 }
 
 /// Build a SymbolAliasMap for the common case where you want to re-export
@@ -411,13 +416,14 @@ public:
   /// Create a reexports generator. If an Allow predicate is passed, only
   /// symbols for which the predicate returns true will be reexported. If no
   /// Allow predicate is passed, all symbols will be exported.
-  ReexportsGenerator(JITDylib &SourceJD,
+  ReexportsGenerator(JITDylib &SourceJD, bool MatchNonExported = false,
                      SymbolPredicate Allow = SymbolPredicate());
 
   SymbolNameSet operator()(JITDylib &JD, const SymbolNameSet &Names);
 
 private:
   JITDylib &SourceJD;
+  bool MatchNonExported = false;
   SymbolPredicate Allow;
 };
 
@@ -536,16 +542,18 @@ public:
   /// as the first in the search order (instead of this dylib) ensures that
   /// definitions within this dylib resolve to the lazy-compiling stubs,
   /// rather than immediately materializing the definitions in this dylib.
-  void setSearchOrder(JITDylibList NewSearchOrder,
-                      bool SearchThisJITDylibFirst = true);
+  void setSearchOrder(JITDylibSearchList NewSearchOrder,
+                      bool SearchThisJITDylibFirst = true,
+                      bool MatchNonExportedInThisDylib = true);
 
   /// Add the given JITDylib to the search order for definitions in this
   /// JITDylib.
-  void addToSearchOrder(JITDylib &JD);
+  void addToSearchOrder(JITDylib &JD, bool MatcNonExported = false);
 
   /// Replace OldJD with NewJD in the search order if OldJD is present.
   /// Otherwise this operation is a no-op.
-  void replaceInSearchOrder(JITDylib &OldJD, JITDylib &NewJD);
+  void replaceInSearchOrder(JITDylib &OldJD, JITDylib &NewJD,
+                            bool MatchNonExported = false);
 
   /// Remove the given JITDylib from the search order for this JITDylib if it is
   /// present. Otherwise this operation is a no-op.
@@ -554,7 +562,7 @@ public:
   /// Do something with the search order (run under the session lock).
   template <typename Func>
   auto withSearchOrderDo(Func &&F)
-      -> decltype(F(std::declval<const JITDylibList &>()));
+      -> decltype(F(std::declval<const JITDylibSearchList &>()));
 
   /// Define all symbols provided by the materialization unit to be part of this
   /// JITDylib.
@@ -642,12 +650,12 @@ private:
                                 const SymbolNameSet &Names);
 
   void lodgeQuery(std::shared_ptr<AsynchronousSymbolQuery> &Q,
-                  SymbolNameSet &Unresolved, JITDylib *MatchNonExportedInJD,
-                  bool MatchNonExported, MaterializationUnitList &MUs);
+                  SymbolNameSet &Unresolved, bool MatchNonExported,
+                  MaterializationUnitList &MUs);
 
   void lodgeQueryImpl(std::shared_ptr<AsynchronousSymbolQuery> &Q,
-                      SymbolNameSet &Unresolved, JITDylib *MatchNonExportedInJD,
-                      bool MatchNonExported, MaterializationUnitList &MUs);
+                      SymbolNameSet &Unresolved, bool MatchNonExported,
+                      MaterializationUnitList &MUs);
 
   LookupImplActionFlags
   lookupImpl(std::shared_ptr<AsynchronousSymbolQuery> &Q,
@@ -682,7 +690,7 @@ private:
   UnmaterializedInfosMap UnmaterializedInfos;
   MaterializingInfosMap MaterializingInfos;
   GeneratorFunction DefGenerator;
-  JITDylibList SearchOrder;
+  JITDylibSearchList SearchOrder;
 };
 
 /// An ExecutionSession represents a running JIT program.
@@ -766,6 +774,10 @@ public:
 
   /// Search the given JITDylib list for the given symbols.
   ///
+  /// SearchOrder lists the JITDylibs to search. For each dylib, the associated
+  /// boolean indicates whether the search should match against non-exported
+  /// (hidden visibility) symbols in that dylib (true means match against
+  /// non-exported symbols, false means do not match).
   ///
   /// The OnResolve callback will be called once all requested symbols are
   /// resolved, or if an error occurs prior to resolution.
@@ -782,19 +794,9 @@ public:
   /// dependenant symbols for this query (e.g. it is being made by a top level
   /// client to get an address to call) then the value NoDependenciesToRegister
   /// can be used.
-  ///
-  /// If the MatchNonExportedInJD pointer is non-null, then the lookup will find
-  /// non-exported symbols defined in the JITDylib pointed to by
-  /// MatchNonExportedInJD.
-  /// If MatchNonExported is true the lookup will find non-exported symbols in
-  /// any JITDylib (setting MatchNonExportedInJD is redundant in such cases).
-  /// If MatchNonExported is false and MatchNonExportedInJD is null,
-  /// non-exported symbols will never be found.
-  void lookup(const JITDylibList &JDs, SymbolNameSet Symbols,
+  void lookup(const JITDylibSearchList &SearchOrder, SymbolNameSet Symbols,
               SymbolsResolvedCallback OnResolve, SymbolsReadyCallback OnReady,
-              RegisterDependenciesFunction RegisterDependencies,
-              JITDylib *MatchNonExportedInJD = nullptr,
-              bool MatchNonExported = false);
+              RegisterDependenciesFunction RegisterDependencies);
 
   /// Blocking version of lookup above. Returns the resolved symbol map.
   /// If WaitUntilReady is true (the default), will not return until all
@@ -803,24 +805,29 @@ public:
   /// or an error occurs. If WaitUntilReady is false and an error occurs
   /// after resolution, the function will return a success value, but the
   /// error will be reported via reportErrors.
-  Expected<SymbolMap> lookup(const JITDylibList &JDs,
+  Expected<SymbolMap> lookup(const JITDylibSearchList &SearchOrder,
                              const SymbolNameSet &Symbols,
                              RegisterDependenciesFunction RegisterDependencies =
                                  NoDependenciesToRegister,
-                             bool WaitUntilReady = true,
-                             JITDylib *MatchNonExportedInJD = nullptr,
-                             bool MatchNonExported = false);
+                             bool WaitUntilReady = true);
+
+  /// Convenience version of blocking lookup.
+  /// Searches each of the JITDylibs in the search order in turn for the given
+  /// symbol.
+  Expected<JITEvaluatedSymbol> lookup(const JITDylibSearchList &SearchOrder,
+                                      SymbolStringPtr Symbol);
 
   /// Convenience version of blocking lookup.
-  /// Performs a single-symbol lookup.
-  Expected<JITEvaluatedSymbol> lookup(const JITDylibList &JDs,
-                                      SymbolStringPtr Symbol,
-                                      bool MatchNonExported = false);
+  /// Searches each of the JITDylibs in the search order in turn for the given
+  /// symbol. The search will not find non-exported symbols.
+  Expected<JITEvaluatedSymbol> lookup(ArrayRef<JITDylib *> SearchOrder,
+                                      SymbolStringPtr Symbol);
 
   /// Convenience version of blocking lookup.
-  /// Performs a single-symbol lookup, auto-interning the given symbol name.
-  Expected<JITEvaluatedSymbol> lookup(const JITDylibList &JDs, StringRef Symbol,
-                                      bool MatchNonExported = false);
+  /// Searches each of the JITDylibs in the search order in turn for the given
+  /// symbol. The search will not find non-exported symbols.
+  Expected<JITEvaluatedSymbol> lookup(ArrayRef<JITDylib *> SearchOrder,
+                                      StringRef Symbol);
 
   /// Materialize the given unit.
   void dispatchMaterialization(JITDylib &JD,
@@ -866,7 +873,7 @@ private:
 
 template <typename Func>
 auto JITDylib::withSearchOrderDo(Func &&F)
-    -> decltype(F(std::declval<const JITDylibList &>())) {
+    -> decltype(F(std::declval<const JITDylibSearchList &>())) {
   return ES.runSessionLocked([&]() { return F(SearchOrder); });
 }
 
diff --git a/include/llvm/ExecutionEngine/Orc/LLJIT.h b/include/llvm/ExecutionEngine/Orc/LLJIT.h
index b7ef8834706..ce3e5d519c7 100644
--- a/include/llvm/ExecutionEngine/Orc/LLJIT.h
+++ b/include/llvm/ExecutionEngine/Orc/LLJIT.h
@@ -144,7 +144,7 @@ public:
   /// LLLazyJIT with the given number of compile threads.
   static Expected<std::unique_ptr<LLLazyJIT>>
   Create(JITTargetMachineBuilder JTMB, DataLayout DL,
-         unsigned NumCompileThreads = 0);
+         JITTargetAddress ErrorAddr, unsigned NumCompileThreads = 0);
 
   /// Set an IR transform (e.g. pass manager pipeline) to run on each function
   /// when it is compiled.
diff --git a/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp b/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp
index de1fa079dde..241eb3600da 100644
--- a/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp
+++ b/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp
@@ -157,7 +157,7 @@ void CompileOnDemandLayer::emit(MaterializationResponsibility R,
     return;
   }
 
-  R.replace(reexports(PDR.getImplDylib(), std::move(NonCallables)));
+  R.replace(reexports(PDR.getImplDylib(), std::move(NonCallables), true));
   R.replace(lazyReexports(LCTMgr, PDR.getISManager(), PDR.getImplDylib(),
                           std::move(Callables)));
 }
@@ -166,10 +166,17 @@ CompileOnDemandLayer::PerDylibResources &
 CompileOnDemandLayer::getPerDylibResources(JITDylib &TargetD) {
   auto I = DylibResources.find(&TargetD);
   if (I == DylibResources.end()) {
-    auto &ImplD =
-        getExecutionSession().createJITDylib(TargetD.getName() + ".impl");
-    TargetD.withSearchOrderDo([&](const JITDylibList &TargetSearchOrder) {
-      ImplD.setSearchOrder(TargetSearchOrder, false);
+    auto &ImplD = getExecutionSession().createJITDylib(
+        TargetD.getName() + ".impl", false);
+    TargetD.withSearchOrderDo([&](const JITDylibSearchList &TargetSearchOrder) {
+      auto NewSearchOrder = TargetSearchOrder;
+      assert(!NewSearchOrder.empty() &&
+             NewSearchOrder.front().first == &TargetD &&
+             NewSearchOrder.front().second == true &&
+             "TargetD must be at the front of its own search order and match "
+             "non-exported symbol");
+      NewSearchOrder.insert(std::next(NewSearchOrder.begin()), {&ImplD, true});
+      ImplD.setSearchOrder(std::move(NewSearchOrder), false);
     });
     PerDylibResources PDR(ImplD, BuildIndirectStubsManager());
     I = DylibResources.insert(std::make_pair(&TargetD, std::move(PDR))).first;
diff --git a/lib/ExecutionEngine/Orc/Core.cpp b/lib/ExecutionEngine/Orc/Core.cpp
index df4d0028a4a..8a9740e0be0 100644
--- a/lib/ExecutionEngine/Orc/Core.cpp
+++ b/lib/ExecutionEngine/Orc/Core.cpp
@@ -205,14 +205,16 @@ raw_ostream &operator<<(raw_ostream &OS, const MaterializationUnit &MU) {
   return OS << ")";
 }
 
-raw_ostream &operator<<(raw_ostream &OS, const JITDylibList &JDs) {
+raw_ostream &operator<<(raw_ostream &OS, const JITDylibSearchList &JDs) {
   OS << "[";
   if (!JDs.empty()) {
-    assert(JDs.front() && "JITDylibList entries must not be null");
-    OS << " " << JDs.front()->getName();
-    for (auto *JD : make_range(std::next(JDs.begin()), JDs.end())) {
-      assert(JD && "JITDylibList entries must not be null");
-      OS << ", " << JD->getName();
+    assert(JDs.front().first && "JITDylibList entries must not be null");
+    OS << " (\"" << JDs.front().first->getName() << "\", "
+       << (JDs.front().second ? "true" : "false") << ")";
+    for (auto &KV : make_range(std::next(JDs.begin()), JDs.end())) {
+      assert(KV.first && "JITDylibList entries must not be null");
+      OS << ", (\"" << KV.first->getName() << "\", "
+         << (KV.second ? "true" : "false") << ")";
     }
   }
   OS << " ]";
@@ -526,9 +528,11 @@ AbsoluteSymbolsMaterializationUnit::extractFlags(const SymbolMap &Symbols) {
 }
 
 ReExportsMaterializationUnit::ReExportsMaterializationUnit(
-    JITDylib *SourceJD, SymbolAliasMap Aliases, VModuleKey K)
+    JITDylib *SourceJD, bool MatchNonExported, SymbolAliasMap Aliases,
+    VModuleKey K)
     : MaterializationUnit(extractFlags(Aliases), std::move(K)),
-      SourceJD(SourceJD), Aliases(std::move(Aliases)) {}
+      SourceJD(SourceJD), MatchNonExported(MatchNonExported),
+      Aliases(std::move(Aliases)) {}
 
 StringRef ReExportsMaterializationUnit::getName() const {
   return "<Reexports>";
@@ -556,7 +560,7 @@ void ReExportsMaterializationUnit::materialize(
 
   if (!Aliases.empty()) {
     if (SourceJD)
-      R.replace(reexports(*SourceJD, std::move(Aliases)));
+      R.replace(reexports(*SourceJD, std::move(Aliases), MatchNonExported));
     else
       R.replace(symbolAliases(std::move(Aliases)));
   }
@@ -656,8 +660,8 @@ void ReExportsMaterializationUnit::materialize(
 
     auto OnReady = [&ES](Error Err) { ES.reportError(std::move(Err)); };
 
-    ES.lookup({&SrcJD}, QuerySymbols, std::move(OnResolve), std::move(OnReady),
-              std::move(RegisterDependencies), nullptr, true);
+    ES.lookup({{&SrcJD, MatchNonExported}}, QuerySymbols, std::move(OnResolve),
+              std::move(OnReady), std::move(RegisterDependencies));
   }
 }
 
@@ -698,8 +702,10 @@ buildSimpleReexportsAliasMap(JITDylib &SourceJD, const SymbolNameSet &Symbols) {
 }
 
 ReexportsGenerator::ReexportsGenerator(JITDylib &SourceJD,
+                                       bool MatchNonExported,
                                        SymbolPredicate Allow)
-    : SourceJD(SourceJD), Allow(std::move(Allow)) {}
+    : SourceJD(SourceJD), MatchNonExported(MatchNonExported),
+      Allow(std::move(Allow)) {}
 
 SymbolNameSet ReexportsGenerator::operator()(JITDylib &JD,
                                              const SymbolNameSet &Names) {
@@ -716,7 +722,7 @@ SymbolNameSet ReexportsGenerator::operator()(JITDylib &JD,
   }
 
   if (!Added.empty())
-    cantFail(JD.define(reexports(SourceJD, AliasMap)));
+    cantFail(JD.define(reexports(SourceJD, AliasMap, MatchNonExported)));
 
   return Added;
 }
@@ -1041,30 +1047,41 @@ void JITDylib::notifyFailed(const SymbolNameSet &FailedSymbols) {
     Q->handleFailed(make_error<FailedToMaterialize>(FailedSymbols));
 }
 
-void JITDylib::setSearchOrder(JITDylibList NewSearchOrder,
-                              bool SearchThisJITDylibFirst) {
-  if (SearchThisJITDylibFirst && NewSearchOrder.front() != this)
-    NewSearchOrder.insert(NewSearchOrder.begin(), this);
+void JITDylib::setSearchOrder(JITDylibSearchList NewSearchOrder,
+                              bool SearchThisJITDylibFirst,
+                              bool MatchNonExportedInThisDylib) {
+  if (SearchThisJITDylibFirst && NewSearchOrder.front().first != this)
+    NewSearchOrder.insert(NewSearchOrder.begin(),
+                          {this, MatchNonExportedInThisDylib});
 
   ES.runSessionLocked([&]() { SearchOrder = std::move(NewSearchOrder); });
 }
 
-void JITDylib::addToSearchOrder(JITDylib &JD) {
-  ES.runSessionLocked([&]() { SearchOrder.push_back(&JD); });
+void JITDylib::addToSearchOrder(JITDylib &JD, bool MatchNonExported) {
+  ES.runSessionLocked([&]() {
+    SearchOrder.push_back({&JD, MatchNonExported});
+  });
 }
 
-void JITDylib::replaceInSearchOrder(JITDylib &OldJD, JITDylib &NewJD) {
+void JITDylib::replaceInSearchOrder(JITDylib &OldJD, JITDylib &NewJD,
+                                    bool MatchNonExported) {
   ES.runSessionLocked([&]() {
-    auto I = std::find(SearchOrder.begin(), SearchOrder.end(), &OldJD);
+    auto I = std::find_if(SearchOrder.begin(), SearchOrder.end(),
+                          [&](const JITDylibSearchList::value_type &KV) {
+                            return KV.first == &OldJD;
+                          });
 
     if (I != SearchOrder.end())
-      *I = &NewJD;
+      *I = {&NewJD, MatchNonExported};
   });
 }
 
 void JITDylib::removeFromSearchOrder(JITDylib &JD) {
   ES.runSessionLocked([&]() {
-    auto I = std::find(SearchOrder.begin(), SearchOrder.end(), &JD);
+    auto I = std::find_if(SearchOrder.begin(), SearchOrder.end(),
+                          [&](const JITDylibSearchList::value_type &KV) {
+                            return KV.first == &JD;
+                          });
     if (I != SearchOrder.end())
       SearchOrder.erase(I);
   });
@@ -1161,18 +1178,17 @@ SymbolNameSet JITDylib::lookupFlagsImpl(SymbolFlagsMap &Flags,
 }
 
 void JITDylib::lodgeQuery(std::shared_ptr<AsynchronousSymbolQuery> &Q,
-                          SymbolNameSet &Unresolved,
-                          JITDylib *MatchNonExportedInJD, bool MatchNonExported,
+                          SymbolNameSet &Unresolved, bool MatchNonExported,
                           MaterializationUnitList &MUs) {
   assert(Q && "Query can not be null");
 
-  lodgeQueryImpl(Q, Unresolved, MatchNonExportedInJD, MatchNonExported, MUs);
+  lodgeQueryImpl(Q, Unresolved, MatchNonExported, MUs);
   if (DefGenerator && !Unresolved.empty()) {
     auto NewDefs = DefGenerator(*this, Unresolved);
     if (!NewDefs.empty()) {
       for (auto &D : NewDefs)
         Unresolved.erase(D);
-      lodgeQueryImpl(Q, NewDefs, MatchNonExportedInJD, MatchNonExported, MUs);
+      lodgeQueryImpl(Q, NewDefs, MatchNonExported, MUs);
       assert(NewDefs.empty() &&
              "All fallback defs should have been found by lookupImpl");
     }
@@ -1181,7 +1197,7 @@ void JITDylib::lodgeQuery(std::shared_ptr<AsynchronousSymbolQuery> &Q,
 
 void JITDylib::lodgeQueryImpl(
     std::shared_ptr<AsynchronousSymbolQuery> &Q, SymbolNameSet &Unresolved,
-    JITDylib *MatchNonExportedInJD, bool MatchNonExported,
+    bool MatchNonExported,
     std::vector<std::unique_ptr<MaterializationUnit>> &MUs) {
 
   std::vector<SymbolStringPtr> ToRemove;
@@ -1191,12 +1207,9 @@ void JITDylib::lodgeQueryImpl(
     if (SymI == Symbols.end())
       continue;
 
-    // If this is a non-exported symbol, then check the values of
-    // MatchNonExportedInJD and MatchNonExported. Skip if we should not match
-    // against this symbol.
-    if (!SymI->second.getFlags().isExported())
-      if (!MatchNonExported && MatchNonExportedInJD != this)
-        continue;
+    // If this is a non exported symbol and we're skipping those then skip it.
+    if (!SymI->second.getFlags().isExported() && !MatchNonExported)
+      continue;
 
     // If we matched against Name in JD, mark it to be removed from the Unresolved
     // set.
@@ -1382,8 +1395,9 @@ void JITDylib::dump(raw_ostream &OS) {
        << "\" (ES: " << format("0x%016x", reinterpret_cast<uintptr_t>(&ES))
        << "):\n"
        << "Search order: [";
-    for (auto *JD : SearchOrder)
-      OS << " \"" << JD->getName() << "\"";
+    for (auto &KV : SearchOrder)
+      OS << " (\"" << KV.first->getName() << "\", "
+         << (KV.second ? "all" : "exported only") << ")";
     OS << " ]\n"
        << "Symbol table:\n";
 
@@ -1431,7 +1445,7 @@ void JITDylib::dump(raw_ostream &OS) {
 
 JITDylib::JITDylib(ExecutionSession &ES, std::string Name)
     : ES(ES), JITDylibName(std::move(Name)) {
-  SearchOrder.push_back(this);
+  SearchOrder.push_back({this, true});
 }
 
 Error JITDylib::defineImpl(MaterializationUnit &MU) {
@@ -1724,12 +1738,10 @@ Expected<SymbolMap> ExecutionSession::legacyLookup(
 #endif
 }
 
-void ExecutionSession::lookup(const JITDylibList &JDs, SymbolNameSet Symbols,
-                              SymbolsResolvedCallback OnResolve,
-                              SymbolsReadyCallback OnReady,
-                              RegisterDependenciesFunction RegisterDependencies,
-                              JITDylib *MatchNonExportedInJD,
-                              bool MatchNonExported) {
+void ExecutionSession::lookup(
+    const JITDylibSearchList &SearchOrder, SymbolNameSet Symbols,
+    SymbolsResolvedCallback OnResolve, SymbolsReadyCallback OnReady,
+    RegisterDependenciesFunction RegisterDependencies) {
 
   // lookup can be re-entered recursively if running on a single thread. Run any
   // outstanding MUs in case this query depends on them, otherwise this lookup
@@ -1745,12 +1757,14 @@ void ExecutionSession::lookup(const JITDylibList &JDs, SymbolNameSet Symbols,
   bool QueryFailed = false;
 
   runSessionLocked([&]() {
-    for (auto *JD : JDs) {
-      assert(JD && "JITDylibList entries must not be null");
-      assert(!CollectedMUsMap.count(JD) &&
+    for (auto &KV : SearchOrder) {
+      assert(KV.first && "JITDylibList entries must not be null");
+      assert(!CollectedMUsMap.count(KV.first) &&
              "JITDylibList should not contain duplicate entries");
-      JD->lodgeQuery(Q, Unresolved, MatchNonExportedInJD, MatchNonExported,
-                     CollectedMUsMap[JD]);
+
+      auto &JD = *KV.first;
+      auto MatchNonExported = KV.second;
+      JD.lodgeQuery(Q, Unresolved, MatchNonExported, CollectedMUsMap[&JD]);
     }
 
     if (Unresolved.empty()) {
@@ -1801,11 +1815,9 @@ void ExecutionSession::lookup(const JITDylibList &JDs, SymbolNameSet Symbols,
   runOutstandingMUs();
 }
 
-Expected<SymbolMap>
-ExecutionSession::lookup(const JITDylibList &JDs, const SymbolNameSet &Symbols,
-                         RegisterDependenciesFunction RegisterDependencies,
-                         bool WaitUntilReady, JITDylib *MatchNonExportedInJD,
-                         bool MatchNonExported) {
+Expected<SymbolMap> ExecutionSession::lookup(
+    const JITDylibSearchList &SearchOrder, const SymbolNameSet &Symbols,
+    RegisterDependenciesFunction RegisterDependencies, bool WaitUntilReady) {
 #if LLVM_ENABLE_THREADS
   // In the threaded case we use promises to return the results.
   std::promise<SymbolMap> PromisedResult;
@@ -1872,8 +1884,7 @@ ExecutionSession::lookup(const JITDylibList &JDs, const SymbolNameSet &Symbols,
 #endif
 
   // Perform the asynchronous lookup.
-  lookup(JDs, Symbols, OnResolve, OnReady, RegisterDependencies,
-         MatchNonExportedInJD, MatchNonExported);
+  lookup(SearchOrder, Symbols, OnResolve, OnReady, RegisterDependencies);
 
 #if LLVM_ENABLE_THREADS
   auto ResultFuture = PromisedResult.get_future();
@@ -1916,14 +1927,13 @@ ExecutionSession::lookup(const JITDylibList &JDs, const SymbolNameSet &Symbols,
 #endif
 }
 
-/// Look up a symbol by searching a list of JDs.
-Expected<JITEvaluatedSymbol> ExecutionSession::lookup(const JITDylibList &JDs,
-                                                      SymbolStringPtr Name,
-                                                      bool MatchNonExported) {
+Expected<JITEvaluatedSymbol>
+ExecutionSession::lookup(const JITDylibSearchList &SearchOrder,
+                         SymbolStringPtr Name) {
   SymbolNameSet Names({Name});
 
-  if (auto ResultMap = lookup(JDs, std::move(Names), NoDependenciesToRegister,
-                              true, nullptr, MatchNonExported)) {
+  if (auto ResultMap = lookup(SearchOrder, std::move(Names),
+                              NoDependenciesToRegister, true)) {
     assert(ResultMap->size() == 1 && "Unexpected number of results");
     assert(ResultMap->count(Name) && "Missing result for symbol");
     return std::move(ResultMap->begin()->second);
@@ -1931,10 +1941,21 @@ Expected<JITEvaluatedSymbol> ExecutionSession::lookup(const JITDylibList &JDs,
     return ResultMap.takeError();
 }
 
-Expected<JITEvaluatedSymbol> ExecutionSession::lookup(const JITDylibList &JDs,
-                                                      StringRef Name,
-                                                      bool MatchNonExported) {
-  return lookup(JDs, intern(Name), MatchNonExported);
+Expected<JITEvaluatedSymbol>
+ExecutionSession::lookup(ArrayRef<JITDylib *> SearchOrder,
+                         SymbolStringPtr Name) {
+  SymbolNameSet Names({Name});
+
+  JITDylibSearchList FullSearchOrder(SearchOrder.size());
+  for (auto *JD : SearchOrder)
+    FullSearchOrder.push_back({JD, false});
+
+  return lookup(FullSearchOrder, Name);
+}
+
+Expected<JITEvaluatedSymbol>
+ExecutionSession::lookup(ArrayRef<JITDylib *> SearchOrder, StringRef Name) {
+  return lookup(SearchOrder, intern(Name));
 }
 
 void ExecutionSession::dump(raw_ostream &OS) {
diff --git a/lib/ExecutionEngine/Orc/ExecutionUtils.cpp b/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
index 21a604f71ca..3a1984e8a50 100644
--- a/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
+++ b/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
@@ -129,9 +129,8 @@ Error CtorDtorRunner::run() {
   }
 
   auto &ES = JD.getExecutionSession();
-  if (auto CtorDtorMap =
-          ES.lookup({&JD}, std::move(Names), NoDependenciesToRegister, true,
-                    nullptr, true)) {
+  if (auto CtorDtorMap = ES.lookup({{&JD, true}}, std::move(Names),
+                                   NoDependenciesToRegister, true)) {
     for (auto &KV : CtorDtorsByPriority) {
       for (auto &Name : KV.second) {
         assert(CtorDtorMap->count(Name) && "No entry for Name");
diff --git a/lib/ExecutionEngine/Orc/IndirectionUtils.cpp b/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
index c10d15ab117..205821b0a71 100644
--- a/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
+++ b/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
@@ -101,7 +101,7 @@ JITTargetAddress JITCompileCallbackManager::executeCompileCallback(
       Name = I->second;
   }
 
-  if (auto Sym = ES.lookup({&CallbacksJD}, Name, true))
+  if (auto Sym = ES.lookup({{&CallbacksJD, true}}, Name))
     return Sym->getAddress();
   else {
     llvm::dbgs() << "Didn't find callback.\n";
diff --git a/lib/ExecutionEngine/Orc/LLJIT.cpp b/lib/ExecutionEngine/Orc/LLJIT.cpp
index ac71a5e7673..8486fe449f7 100644
--- a/lib/ExecutionEngine/Orc/LLJIT.cpp
+++ b/lib/ExecutionEngine/Orc/LLJIT.cpp
@@ -76,7 +76,7 @@ Error LLJIT::addObjectFile(JITDylib &JD, std::unique_ptr<MemoryBuffer> Obj) {
 
 Expected<JITEvaluatedSymbol> LLJIT::lookupLinkerMangled(JITDylib &JD,
                                                         StringRef Name) {
-  return ES->lookup({&JD}, ES->intern(Name));
+  return ES->lookup({{&JD, true}}, ES->intern(Name));
 }
 
 LLJIT::LLJIT(std::unique_ptr<ExecutionSession> ES,
@@ -144,13 +144,13 @@ void LLJIT::recordCtorDtors(Module &M) {
 }
 
 Expected<std::unique_ptr<LLLazyJIT>>
-  LLLazyJIT::Create(JITTargetMachineBuilder JTMB, DataLayout DL,
-                    unsigned NumCompileThreads) {
+LLLazyJIT::Create(JITTargetMachineBuilder JTMB, DataLayout DL,
+                  JITTargetAddress ErrorAddr, unsigned NumCompileThreads) {
   auto ES = llvm::make_unique<ExecutionSession>();
 
   const Triple &TT = JTMB.getTargetTriple();
 
-  auto LCTMgr = createLocalLazyCallThroughManager(TT, *ES, 0);
+  auto LCTMgr = createLocalLazyCallThroughManager(TT, *ES, ErrorAddr);
   if (!LCTMgr)
     return LCTMgr.takeError();
 
diff --git a/lib/ExecutionEngine/Orc/LazyReexports.cpp b/lib/ExecutionEngine/Orc/LazyReexports.cpp
index af4c508d7f1..ba8e2a9c52f 100644
--- a/lib/ExecutionEngine/Orc/LazyReexports.cpp
+++ b/lib/ExecutionEngine/Orc/LazyReexports.cpp
@@ -52,8 +52,8 @@ LazyCallThroughManager::callThroughToSymbol(JITTargetAddress TrampolineAddr) {
     SymbolName = I->second.second;
   }
 
-  auto LookupResult = ES.lookup({SourceJD}, {SymbolName},
-                                NoDependenciesToRegister, true, nullptr, true);
+  auto LookupResult = ES.lookup({{SourceJD, true}}, {SymbolName},
+                                NoDependenciesToRegister, true);
 
   if (!LookupResult) {
     ES.reportError(LookupResult.takeError());
diff --git a/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp b/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp
index 616251c7e00..299d76183cd 100644
--- a/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp
+++ b/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp
@@ -50,10 +50,11 @@ public:
       MR.addDependenciesForAll(Deps);
     };
 
-    MR.getTargetJITDylib().withSearchOrderDo([&](const JITDylibList &JDs) {
-      ES.lookup(JDs, InternedSymbols, OnResolvedWithUnwrap, OnReady,
-                RegisterDependencies, &MR.getTargetJITDylib());
-    });
+    JITDylibSearchList SearchOrder;
+    MR.getTargetJITDylib().withSearchOrderDo(
+        [&](const JITDylibSearchList &JDs) { SearchOrder = JDs; });
+    ES.lookup(SearchOrder, InternedSymbols, OnResolvedWithUnwrap, OnReady,
+              RegisterDependencies);
   }
 
   Expected<LookupSet> getResponsibilitySet(const LookupSet &Symbols) {
diff --git a/test/ExecutionEngine/OrcLazy/Inputs/hidden-definitions.ll b/test/ExecutionEngine/OrcLazy/Inputs/hidden-definitions.ll
new file mode 100644
index 00000000000..8d1f4b9cc5c
--- /dev/null
+++ b/test/ExecutionEngine/OrcLazy/Inputs/hidden-definitions.ll
@@ -0,0 +1,6 @@
+@bar = hidden global i32 0
+
+define hidden i32 @foo() {
+entry:
+  ret i32 0
+}
diff --git a/test/ExecutionEngine/OrcLazy/hidden-visibility.ll b/test/ExecutionEngine/OrcLazy/hidden-visibility.ll
new file mode 100644
index 00000000000..199fd644bff
--- /dev/null
+++ b/test/ExecutionEngine/OrcLazy/hidden-visibility.ll
@@ -0,0 +1,17 @@
+; RUN: lli -jit-kind=orc-lazy -extra-module %p/Inputs/hidden-definitions.ll %s
+; RUN: not lli -jit-kind=orc-lazy -jd libFoo -extra-module %p/Inputs/hidden-definitions.ll %s
+;
+; Check that hidden symbols in another module are visible when the module is
+; added to the same JITDylib, and not visible if it is added to a different
+; JITDylib.
+
+@bar = external global i32
+declare i32 @foo()
+
+define i32 @main(i32 %argc, i8** nocapture readnone %argv) {
+entry:
+  %0 = call i32() @foo()
+  %1 = load i32, i32* @bar
+  %2 = add i32 %0, %1
+  ret i32 %2
+}
diff --git a/tools/lli/lli.cpp b/tools/lli/lli.cpp
index f4585dc080d..c3c57e2cdee 100644
--- a/tools/lli/lli.cpp
+++ b/tools/lli/lli.cpp
@@ -115,6 +115,11 @@ namespace {
                "rather than individual functions"),
       cl::init(false));
 
+  cl::list<std::string>
+      JITDylibs("jd",
+                cl::desc("Specifies the JITDylib to be used for any subsequent "
+                         "-extra-module arguments."));
+
   // The MCJIT supports building for a target address space separate from
   // the JIT compilation process. Use a forked process and a copying
   // memory manager with IPC to execute using this functionality.
@@ -749,6 +754,8 @@ static orc::IRTransformLayer::TransformFunction createDebugDumper() {
   llvm_unreachable("Unknown DumpKind");
 }
 
+static void exitOnLazyCallThroughFailure() { exit(1); }
+
 int runOrcLazyJIT(const char *ProgName) {
   // Start setting up the JIT environment.
 
@@ -778,7 +785,11 @@ int runOrcLazyJIT(const char *ProgName) {
                         : None);
 
   DataLayout DL = ExitOnErr(JTMB.getDefaultDataLayoutForTarget());
-  auto J = ExitOnErr(orc::LLLazyJIT::Create(std::move(JTMB), DL, LazyJITCompileThreads));
+
+  auto J = ExitOnErr(orc::LLLazyJIT::Create(
+      std::move(JTMB), DL,
+      pointerToJITTargetAddress(exitOnLazyCallThroughFailure),
+      LazyJITCompileThreads));
 
   if (PerModuleLazy)
     J->setPartitionFunction(orc::CompileOnDemandLayer::compileWholeModule);
@@ -803,13 +814,32 @@ int runOrcLazyJIT(const char *ProgName) {
   // Add the main module.
   ExitOnErr(J->addLazyIRModule(std::move(MainModule)));
 
-  // Add any extra modules.
-  for (auto &ModulePath : ExtraModules) {
-    auto M = parseIRFile(ModulePath, Err, *TSCtx.getContext());
-    if (!M)
-      reportError(Err, ProgName);
+  // Create JITDylibs and add any extra modules.
+  {
+    // Create JITDylibs, keep a map from argument index to dylib. We will use
+    // -extra-module argument indexes to determine what dylib to use for each
+    // -extra-module.
+    std::map<unsigned, orc::JITDylib *> IdxToDylib;
+    IdxToDylib[0] = &J->getMainJITDylib();
+    for (auto JDItr = JITDylibs.begin(), JDEnd = JITDylibs.end();
+         JDItr != JDEnd; ++JDItr) {
+      IdxToDylib[JITDylibs.getPosition(JDItr - JITDylibs.begin())] =
+          &J->createJITDylib(*JDItr);
+    }
 
-    ExitOnErr(J->addLazyIRModule(orc::ThreadSafeModule(std::move(M), TSCtx)));
+    for (auto EMItr = ExtraModules.begin(), EMEnd = ExtraModules.end();
+         EMItr != EMEnd; ++EMItr) {
+      auto M = parseIRFile(*EMItr, Err, *TSCtx.getContext());
+      if (!M)
+        reportError(Err, ProgName);
+
+      auto EMIdx = ExtraModules.getPosition(EMItr - ExtraModules.begin());
+      assert(EMIdx != 0 && "ExtraModule should have index > 0");
+      auto JDItr = std::prev(IdxToDylib.lower_bound(EMIdx));
+      auto &JD = *JDItr->second;
+      ExitOnErr(
+          J->addLazyIRModule(JD, orc::ThreadSafeModule(std::move(M), TSCtx)));
+    }
   }
 
   // Add the objects.
@@ -837,6 +867,8 @@ int runOrcLazyJIT(const char *ProgName) {
     AltEntryThreads.push_back(std::thread([EntryPoint]() { EntryPoint(); }));
   }
 
+  J->getExecutionSession().dump(llvm::dbgs());
+
   // Run main.
   auto MainSym = ExitOnErr(J->lookup("main"));
   typedef int (*MainFnPtr)(int, const char *[]);
diff --git a/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp b/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp
index 1ccc4755957..1444ba74364 100644
--- a/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp
+++ b/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp
@@ -48,7 +48,8 @@ TEST_F(CoreAPIsStandardTest, BasicSuccessfulLookup) {
         FooMR = std::make_shared<MaterializationResponsibility>(std::move(R));
       })));
 
-  ES.lookup({&JD}, {Foo}, OnResolution, OnReady, NoDependenciesToRegister);
+  ES.lookup({{&JD, false}}, {Foo}, OnResolution, OnReady,
+            NoDependenciesToRegister);
 
   EXPECT_FALSE(OnResolutionRun) << "Should not have been resolved yet";
   EXPECT_FALSE(OnReadyRun) << "Should not have been marked ready yet";
@@ -101,7 +102,8 @@ TEST_F(CoreAPIsStandardTest, EmptyLookup) {
     OnReadyRun = true;
   };
 
-  ES.lookup({&JD}, {}, OnResolution, OnReady, NoDependenciesToRegister);
+  ES.lookup({{&JD, false}}, {}, OnResolution, OnReady,
+            NoDependenciesToRegister);
 
   EXPECT_TRUE(OnResolvedRun) << "OnResolved was not run for empty query";
   EXPECT_TRUE(OnReadyRun) << "OnReady was not run for empty query";
@@ -148,7 +150,7 @@ TEST_F(CoreAPIsStandardTest, RemoveSymbolsTest) {
 
   bool OnResolvedRun = false;
   bool OnReadyRun = false;
-  ES.lookup({&JD}, {Foo, Baz},
+  ES.lookup({{&JD, false}}, {Foo, Baz},
             [&](Expected<SymbolMap> Result) {
               EXPECT_TRUE(!!Result) << "OnResolved failed unexpectedly";
               consumeError(Result.takeError());
@@ -229,7 +231,8 @@ TEST_F(CoreAPIsStandardTest, LookupWithHiddenSymbols) {
   auto &JD2 = ES.createJITDylib("JD2");
   cantFail(JD2.define(absoluteSymbols({{Bar, QuxSym}})));
 
-  auto Result = cantFail(ES.lookup({&JD, &JD2}, {Foo, Bar}));
+  /// Try a blocking lookup.
+  auto Result = cantFail(ES.lookup({{&JD, false}, {&JD2, false}}, {Foo, Bar}));
 
   EXPECT_EQ(Result.size(), 2U) << "Unexpected number of results";
   EXPECT_EQ(Result.count(Foo), 1U) << "Missing result for \"Foo\"";
@@ -275,7 +278,7 @@ TEST_F(CoreAPIsStandardTest, TestBasicAliases) {
                                     {Qux, {Bar, JITSymbolFlags::Weak}}})));
   cantFail(JD.define(absoluteSymbols({{Qux, QuxSym}})));
 
-  auto Result = ES.lookup({&JD}, {Baz, Qux});
+  auto Result = ES.lookup({{&JD, false}}, {Baz, Qux});
   EXPECT_TRUE(!!Result) << "Unexpected lookup failure";
   EXPECT_EQ(Result->count(Baz), 1U) << "No result for \"baz\"";
   EXPECT_EQ(Result->count(Qux), 1U) << "No result for \"qux\"";
@@ -290,7 +293,7 @@ TEST_F(CoreAPIsStandardTest, TestChainedAliases) {
   cantFail(JD.define(symbolAliases(
       {{Baz, {Bar, BazSym.getFlags()}}, {Bar, {Foo, BarSym.getFlags()}}})));
 
-  auto Result = ES.lookup({&JD}, {Bar, Baz});
+  auto Result = ES.lookup({{&JD, false}}, {Bar, Baz});
   EXPECT_TRUE(!!Result) << "Unexpected lookup failure";
   EXPECT_EQ(Result->count(Bar), 1U) << "No result for \"bar\"";
   EXPECT_EQ(Result->count(Baz), 1U) << "No result for \"baz\"";
@@ -309,7 +312,7 @@ TEST_F(CoreAPIsStandardTest, TestBasicReExports) {
 
   cantFail(JD2.define(reexports(JD, {{Bar, {Foo, BarSym.getFlags()}}})));
 
-  auto Result = cantFail(ES.lookup({&JD2}, Bar));
+  auto Result = cantFail(ES.lookup({{&JD2, false}}, Bar));
   EXPECT_EQ(Result.getAddress(), FooSym.getAddress())
       << "Re-export Bar for symbol Foo should match FooSym's address";
 }
@@ -335,7 +338,7 @@ TEST_F(CoreAPIsStandardTest, TestThatReExportsDontUnnecessarilyMaterialize) {
   cantFail(JD2.define(reexports(
       JD, {{Baz, {Foo, BazSym.getFlags()}}, {Qux, {Bar, QuxSym.getFlags()}}})));
 
-  auto Result = cantFail(ES.lookup({&JD2}, Baz));
+  auto Result = cantFail(ES.lookup({{&JD2, false}}, Baz));
   EXPECT_EQ(Result.getAddress(), FooSym.getAddress())
       << "Re-export Baz for symbol Foo should match FooSym's address";
 
@@ -350,13 +353,13 @@ TEST_F(CoreAPIsStandardTest, TestReexportsGenerator) {
 
   auto Filter = [this](SymbolStringPtr Name) { return Name != Bar; };
 
-  JD.setGenerator(ReexportsGenerator(JD2, Filter));
+  JD.setGenerator(ReexportsGenerator(JD2, false, Filter));
 
   auto Flags = JD.lookupFlags({Foo, Bar, Baz});
   EXPECT_EQ(Flags.size(), 1U) << "Unexpected number of results";
   EXPECT_EQ(Flags[Foo], FooSym.getFlags()) << "Unexpected flags for Foo";
 
-  auto Result = cantFail(ES.lookup({&JD}, Foo));
+  auto Result = cantFail(ES.lookup({{&JD, false}}, Foo));
 
   EXPECT_EQ(Result.getAddress(), FooSym.getAddress())
       << "Incorrect reexported symbol address";
@@ -377,7 +380,7 @@ TEST_F(CoreAPIsStandardTest, TestTrivialCircularDependency) {
     FooReady = true;
   };
 
-  ES.lookup({&JD}, {Foo}, std::move(OnResolution), std::move(OnReady),
+  ES.lookup({{&JD, false}}, {Foo}, std::move(OnResolution), std::move(OnReady),
             NoDependenciesToRegister);
 
   FooR->resolve({{Foo, FooSym}});
@@ -434,8 +437,8 @@ TEST_F(CoreAPIsStandardTest, TestCircularDependenceInOneJITDylib) {
 
   // Issue a lookup for Foo. Use NoDependenciesToRegister: We're going to add
   // the dependencies manually below.
-  ES.lookup({&JD}, {Foo}, std::move(OnFooResolution), std::move(OnFooReady),
-            NoDependenciesToRegister);
+  ES.lookup({{&JD, false}}, {Foo}, std::move(OnFooResolution),
+            std::move(OnFooReady), NoDependenciesToRegister);
 
   bool BarResolved = false;
   bool BarReady = false;
@@ -449,8 +452,8 @@ TEST_F(CoreAPIsStandardTest, TestCircularDependenceInOneJITDylib) {
     BarReady = true;
   };
 
-  ES.lookup({&JD}, {Bar}, std::move(OnBarResolution), std::move(OnBarReady),
-            NoDependenciesToRegister);
+  ES.lookup({{&JD, false}}, {Bar}, std::move(OnBarResolution),
+            std::move(OnBarReady), NoDependenciesToRegister);
 
   bool BazResolved = false;
   bool BazReady = false;
@@ -465,8 +468,8 @@ TEST_F(CoreAPIsStandardTest, TestCircularDependenceInOneJITDylib) {
     BazReady = true;
   };
 
-  ES.lookup({&JD}, {Baz}, std::move(OnBazResolution), std::move(OnBazReady),
-            NoDependenciesToRegister);
+  ES.lookup({{&JD, false}}, {Baz}, std::move(OnBazResolution),
+            std::move(OnBazReady), NoDependenciesToRegister);
 
   // Add a circular dependency: Foo -> Bar, Bar -> Baz, Baz -> Foo.
   FooR->addDependenciesForAll({{&JD, SymbolNameSet({Bar})}});
@@ -588,7 +591,7 @@ TEST_F(CoreAPIsStandardTest, AddAndMaterializeLazySymbol) {
     OnReadyRun = true;
   };
 
-  ES.lookup({&JD}, Names, std::move(OnResolution), std::move(OnReady),
+  ES.lookup({{&JD, false}}, Names, std::move(OnResolution), std::move(OnReady),
             NoDependenciesToRegister);
 
   EXPECT_TRUE(FooMaterialized) << "Foo was not materialized";
@@ -637,7 +640,7 @@ TEST_F(CoreAPIsStandardTest, TestBasicWeakSymbolMaterialization) {
     OnReadyRun = true;
   };
 
-  ES.lookup({&JD}, {Bar}, std::move(OnResolution), std::move(OnReady),
+  ES.lookup({{&JD, false}}, {Bar}, std::move(OnResolution), std::move(OnReady),
             NoDependenciesToRegister);
 
   EXPECT_TRUE(OnResolvedRun) << "OnResolved not run";
@@ -666,13 +669,13 @@ TEST_F(CoreAPIsStandardTest, DefineMaterializingSymbol) {
       });
 
   cantFail(JD.define(MU));
-  cantFail(ES.lookup({&JD}, Foo));
+  cantFail(ES.lookup({{&JD, false}}, Foo));
 
   // Assert that materialization is complete by now.
   ExpectNoMoreMaterialization = true;
 
   // Look up bar to verify that no further materialization happens.
-  auto BarResult = cantFail(ES.lookup({&JD}, Bar));
+  auto BarResult = cantFail(ES.lookup({{&JD, false}}, Bar));
   EXPECT_EQ(BarResult.getAddress(), BarSym.getAddress())
       << "Expected Bar == BarSym";
 }
@@ -685,7 +688,7 @@ TEST_F(CoreAPIsStandardTest, GeneratorTest) {
     return SymbolNameSet({Bar});
   });
 
-  auto Result = cantFail(ES.lookup({&JD}, {Foo, Bar}));
+  auto Result = cantFail(ES.lookup({{&JD, false}}, {Foo, Bar}));
 
   EXPECT_EQ(Result.count(Bar), 1U) << "Expected to find fallback def for 'bar'";
   EXPECT_EQ(Result[Bar].getAddress(), BarSym.getAddress())
@@ -701,7 +704,7 @@ TEST_F(CoreAPIsStandardTest, FailResolution) {
   cantFail(JD.define(MU));
 
   SymbolNameSet Names({Foo, Bar});
-  auto Result = ES.lookup({&JD}, Names);
+  auto Result = ES.lookup({{&JD, false}}, Names);
 
   EXPECT_FALSE(!!Result) << "Expected failure";
   if (!Result) {
@@ -733,7 +736,7 @@ TEST_F(CoreAPIsStandardTest, TestLookupWithUnthreadedMaterialization) {
 
   cantFail(JD.define(MU));
 
-  auto FooLookupResult = cantFail(ES.lookup({&JD}, Foo));
+  auto FooLookupResult = cantFail(ES.lookup({{&JD, false}}, Foo));
 
   EXPECT_EQ(FooLookupResult.getAddress(), FooSym.getAddress())
       << "lookup returned an incorrect address";
@@ -754,7 +757,7 @@ TEST_F(CoreAPIsStandardTest, TestLookupWithThreadedMaterialization) {
 
   cantFail(JD.define(absoluteSymbols({{Foo, FooSym}})));
 
-  auto FooLookupResult = cantFail(ES.lookup({&JD}, Foo));
+  auto FooLookupResult = cantFail(ES.lookup({{&JD, false}}, Foo));
 
   EXPECT_EQ(FooLookupResult.getAddress(), FooSym.getAddress())
       << "lookup returned an incorrect address";
@@ -802,14 +805,14 @@ TEST_F(CoreAPIsStandardTest, TestGetRequestedSymbolsAndReplace) {
   EXPECT_FALSE(FooMaterialized) << "Foo should not be materialized yet";
   EXPECT_FALSE(BarMaterialized) << "Bar should not be materialized yet";
 
-  auto FooSymResult = cantFail(ES.lookup({&JD}, Foo));
+  auto FooSymResult = cantFail(ES.lookup({{&JD, false}}, Foo));
   EXPECT_EQ(FooSymResult.getAddress(), FooSym.getAddress())
       << "Address mismatch for Foo";
 
   EXPECT_TRUE(FooMaterialized) << "Foo should be materialized now";
   EXPECT_FALSE(BarMaterialized) << "Bar still should not be materialized";
 
-  auto BarSymResult = cantFail(ES.lookup({&JD}, Bar));
+  auto BarSymResult = cantFail(ES.lookup({{&JD, false}}, Bar));
   EXPECT_EQ(BarSymResult.getAddress(), BarSym.getAddress())
       << "Address mismatch for Bar";
   EXPECT_TRUE(BarMaterialized) << "Bar should be materialized now";
@@ -829,7 +832,7 @@ TEST_F(CoreAPIsStandardTest, TestMaterializationResponsibilityDelegation) {
 
   cantFail(JD.define(MU));
 
-  auto Result = ES.lookup({&JD}, {Foo, Bar});
+  auto Result = ES.lookup({{&JD, false}}, {Foo, Bar});
 
   EXPECT_TRUE(!!Result) << "Result should be a success value";
   EXPECT_EQ(Result->count(Foo), 1U) << "\"Foo\" entry missing";
@@ -861,7 +864,7 @@ TEST_F(CoreAPIsStandardTest, TestMaterializeWeakSymbol) {
 
   auto OnReady = [](Error Err) { cantFail(std::move(Err)); };
 
-  ES.lookup({&JD}, {Foo}, std::move(OnResolution), std::move(OnReady),
+  ES.lookup({{&JD, false}}, {Foo}, std::move(OnResolution), std::move(OnReady),
             NoDependenciesToRegister);
 
   auto MU2 = llvm::make_unique<SimpleMaterializationUnit>(
diff --git a/unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayerTest.cpp b/unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayerTest.cpp
index 1660670ae63..b6c362b8aaa 100644
--- a/unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayerTest.cpp
+++ b/unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayerTest.cpp
@@ -66,7 +66,7 @@ static bool testSetProcessAllSections(std::unique_ptr<MemoryBuffer> Obj,
 
   ObjLayer.setProcessAllSections(ProcessAllSections);
   cantFail(ObjLayer.add(JD, std::move(Obj), ES.allocateVModule()));
-  ES.lookup({&JD}, {Foo}, OnResolveDoNothing, OnReadyDoNothing,
+  ES.lookup({{&JD, false}}, {Foo}, OnResolveDoNothing, OnReadyDoNothing,
             NoDependenciesToRegister);
   return DebugSectionSeen;
 }
@@ -157,7 +157,8 @@ TEST(RTDyldObjectLinkingLayerTest, TestOverrideObjectFlags) {
   ObjLayer.setOverrideObjectFlagsWithResponsibilityFlags(true);
 
   cantFail(CompileLayer.add(JD, std::move(M), ES.allocateVModule()));
-  ES.lookup({&JD}, {Foo}, [](Expected<SymbolMap> R) { cantFail(std::move(R)); },
+  ES.lookup({{&JD, false}}, {Foo},
+            [](Expected<SymbolMap> R) { cantFail(std::move(R)); },
             [](Error Err) { cantFail(std::move(Err)); },
             NoDependenciesToRegister);
 }
@@ -219,7 +220,8 @@ TEST(RTDyldObjectLinkingLayerTest, TestAutoClaimResponsibilityForSymbols) {
   ObjLayer.setAutoClaimResponsibilityForObjectSymbols(true);
 
   cantFail(CompileLayer.add(JD, std::move(M), ES.allocateVModule()));
-  ES.lookup({&JD}, {Foo}, [](Expected<SymbolMap> R) { cantFail(std::move(R)); },
+  ES.lookup({{&JD, false}}, {Foo},
+            [](Expected<SymbolMap> R) { cantFail(std::move(R)); },
             [](Error Err) { cantFail(std::move(Err)); },
             NoDependenciesToRegister);
 }
-- 
GitLab


From 1989ce13fd9d206b0e5ab20e7d135f308748c732 Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Tue, 23 Oct 2018 20:54:43 +0000
Subject: [PATCH 0469/1116] Revert r345077 "[ORC] Change how non-exported
 symbols are matched during lookup."

Doesn't build on Windows. The call to 'lookup' is ambiguous. Clang and
MSVC agree, anyway.

http://lab.llvm.org:8011/builders/clang-x64-windows-msvc/builds/787
C:\b\slave\clang-x64-windows-msvc\build\llvm.src\unittests\ExecutionEngine\Orc\CoreAPIsTest.cpp(315): error C2668: 'llvm::orc::ExecutionSession::lookup': ambiguous call to overloaded function
C:\b\slave\clang-x64-windows-msvc\build\llvm.src\include\llvm/ExecutionEngine/Orc/Core.h(823): note: could be 'llvm::Expected<llvm::JITEvaluatedSymbol> llvm::orc::ExecutionSession::lookup(llvm::ArrayRef<llvm::orc::JITDylib *>,llvm::orc::SymbolStringPtr)'
C:\b\slave\clang-x64-windows-msvc\build\llvm.src\include\llvm/ExecutionEngine/Orc/Core.h(817): note: or       'llvm::Expected<llvm::JITEvaluatedSymbol> llvm::orc::ExecutionSession::lookup(const llvm::orc::JITDylibSearchList &,llvm::orc::SymbolStringPtr)'
C:\b\slave\clang-x64-windows-msvc\build\llvm.src\unittests\ExecutionEngine\Orc\CoreAPIsTest.cpp(315): note: while trying to match the argument list '(initializer list, llvm::orc::SymbolStringPtr)'

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345078 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/ExecutionEngine/JITSymbol.h      |  12 --
 include/llvm/ExecutionEngine/Orc/Core.h       |  99 ++++++------
 include/llvm/ExecutionEngine/Orc/LLJIT.h      |   2 +-
 .../Orc/CompileOnDemandLayer.cpp              |  17 +-
 lib/ExecutionEngine/Orc/Core.cpp              | 151 ++++++++----------
 lib/ExecutionEngine/Orc/ExecutionUtils.cpp    |   5 +-
 lib/ExecutionEngine/Orc/IndirectionUtils.cpp  |   2 +-
 lib/ExecutionEngine/Orc/LLJIT.cpp             |   8 +-
 lib/ExecutionEngine/Orc/LazyReexports.cpp     |   4 +-
 .../Orc/RTDyldObjectLinkingLayer.cpp          |   9 +-
 .../OrcLazy/Inputs/hidden-definitions.ll      |   6 -
 .../OrcLazy/hidden-visibility.ll              |  17 --
 tools/lli/lli.cpp                             |  46 +-----
 .../ExecutionEngine/Orc/CoreAPIsTest.cpp      |  61 ++++---
 .../Orc/RTDyldObjectLinkingLayerTest.cpp      |   8 +-
 15 files changed, 170 insertions(+), 277 deletions(-)
 delete mode 100644 test/ExecutionEngine/OrcLazy/Inputs/hidden-definitions.ll
 delete mode 100644 test/ExecutionEngine/OrcLazy/hidden-visibility.ll

diff --git a/include/llvm/ExecutionEngine/JITSymbol.h b/include/llvm/ExecutionEngine/JITSymbol.h
index 05c9590726d..18b972ed829 100644
--- a/include/llvm/ExecutionEngine/JITSymbol.h
+++ b/include/llvm/ExecutionEngine/JITSymbol.h
@@ -40,18 +40,6 @@ class SymbolRef;
 /// Represents an address in the target process's address space.
 using JITTargetAddress = uint64_t;
 
-/// Convert a JITTargetAddress to a pointer.
-template <typename T> T jitTargetAddressToPointer(JITTargetAddress Addr) {
-  static_assert(std::is_pointer<T>::value, "T must be a pointer type");
-  uintptr_t IntPtr = static_cast<uintptr_t>(Addr);
-  assert(IntPtr == Addr && "JITTargetAddress value out of range for uintptr_t");
-  return reinterpret_cast<T>(IntPtr);
-}
-
-template <typename T> JITTargetAddress pointerToJITTargetAddress(T *Ptr) {
-  return static_cast<JITTargetAddress>(reinterpret_cast<uintptr_t>(Ptr));
-}
-
 /// Flags for symbols in the JIT.
 class JITSymbolFlags {
 public:
diff --git a/include/llvm/ExecutionEngine/Orc/Core.h b/include/llvm/ExecutionEngine/Orc/Core.h
index 39d306e0bd4..2e56854340c 100644
--- a/include/llvm/ExecutionEngine/Orc/Core.h
+++ b/include/llvm/ExecutionEngine/Orc/Core.h
@@ -54,8 +54,8 @@ using SymbolFlagsMap = DenseMap<SymbolStringPtr, JITSymbolFlags>;
 ///        symbols to be obtained for logging.
 using SymbolDependenceMap = DenseMap<JITDylib *, SymbolNameSet>;
 
-/// A list of (JITDylib*, bool) pairs.
-using JITDylibSearchList = std::vector<std::pair<JITDylib *, bool>>;
+/// A list of JITDylib pointers.
+using JITDylibList = std::vector<JITDylib *>;
 
 /// Render a SymbolStringPtr.
 raw_ostream &operator<<(raw_ostream &OS, const SymbolStringPtr &Sym);
@@ -85,8 +85,8 @@ raw_ostream &operator<<(raw_ostream &OS, const SymbolDependenceMap &Deps);
 /// Render a MaterializationUnit.
 raw_ostream &operator<<(raw_ostream &OS, const MaterializationUnit &MU);
 
-/// Render a JITDylibSearchList.
-raw_ostream &operator<<(raw_ostream &OS, const JITDylibSearchList &JDs);
+/// Render a JITDylibList.
+raw_ostream &operator<<(raw_ostream &OS, const JITDylibList &JDs);
 
 /// Callback to notify client that symbols have been resolved.
 using SymbolsResolvedCallback = std::function<void(Expected<SymbolMap>)>;
@@ -351,15 +351,14 @@ using SymbolAliasMap = DenseMap<SymbolStringPtr, SymbolAliasMapEntry>;
 class ReExportsMaterializationUnit : public MaterializationUnit {
 public:
   /// SourceJD is allowed to be nullptr, in which case the source JITDylib is
-  /// taken to be whatever JITDylib these definitions are materialized in (and
-  /// MatchNonExported has no effect). This is useful for defining aliases
-  /// within a JITDylib.
+  /// taken to be whatever JITDylib these definitions are materialized in. This
+  /// is useful for defining aliases within a JITDylib.
   ///
   /// Note: Care must be taken that no sets of aliases form a cycle, as such
   ///       a cycle will result in a deadlock when any symbol in the cycle is
   ///       resolved.
-  ReExportsMaterializationUnit(JITDylib *SourceJD, bool MatchNonExported,
-                               SymbolAliasMap Aliases, VModuleKey K);
+  ReExportsMaterializationUnit(JITDylib *SourceJD, SymbolAliasMap Aliases,
+                               VModuleKey K);
 
   StringRef getName() const override;
 
@@ -369,7 +368,6 @@ private:
   static SymbolFlagsMap extractFlags(const SymbolAliasMap &Aliases);
 
   JITDylib *SourceJD = nullptr;
-  bool MatchNonExported = false;
   SymbolAliasMap Aliases;
 };
 
@@ -387,19 +385,16 @@ private:
 inline std::unique_ptr<ReExportsMaterializationUnit>
 symbolAliases(SymbolAliasMap Aliases, VModuleKey K = VModuleKey()) {
   return llvm::make_unique<ReExportsMaterializationUnit>(
-      nullptr, true, std::move(Aliases), std::move(K));
+      nullptr, std::move(Aliases), std::move(K));
 }
 
 /// Create a materialization unit for re-exporting symbols from another JITDylib
 /// with alternative names/flags.
-/// If MatchNonExported is true then non-exported symbols from SourceJD can be
-/// re-exported. If it is false, attempts to re-export a non-exported symbol
-/// will result in a "symbol not found" error.
 inline std::unique_ptr<ReExportsMaterializationUnit>
 reexports(JITDylib &SourceJD, SymbolAliasMap Aliases,
-          bool MatchNonExported = false, VModuleKey K = VModuleKey()) {
+          VModuleKey K = VModuleKey()) {
   return llvm::make_unique<ReExportsMaterializationUnit>(
-      &SourceJD, MatchNonExported, std::move(Aliases), std::move(K));
+      &SourceJD, std::move(Aliases), std::move(K));
 }
 
 /// Build a SymbolAliasMap for the common case where you want to re-export
@@ -416,14 +411,13 @@ public:
   /// Create a reexports generator. If an Allow predicate is passed, only
   /// symbols for which the predicate returns true will be reexported. If no
   /// Allow predicate is passed, all symbols will be exported.
-  ReexportsGenerator(JITDylib &SourceJD, bool MatchNonExported = false,
+  ReexportsGenerator(JITDylib &SourceJD,
                      SymbolPredicate Allow = SymbolPredicate());
 
   SymbolNameSet operator()(JITDylib &JD, const SymbolNameSet &Names);
 
 private:
   JITDylib &SourceJD;
-  bool MatchNonExported = false;
   SymbolPredicate Allow;
 };
 
@@ -542,18 +536,16 @@ public:
   /// as the first in the search order (instead of this dylib) ensures that
   /// definitions within this dylib resolve to the lazy-compiling stubs,
   /// rather than immediately materializing the definitions in this dylib.
-  void setSearchOrder(JITDylibSearchList NewSearchOrder,
-                      bool SearchThisJITDylibFirst = true,
-                      bool MatchNonExportedInThisDylib = true);
+  void setSearchOrder(JITDylibList NewSearchOrder,
+                      bool SearchThisJITDylibFirst = true);
 
   /// Add the given JITDylib to the search order for definitions in this
   /// JITDylib.
-  void addToSearchOrder(JITDylib &JD, bool MatcNonExported = false);
+  void addToSearchOrder(JITDylib &JD);
 
   /// Replace OldJD with NewJD in the search order if OldJD is present.
   /// Otherwise this operation is a no-op.
-  void replaceInSearchOrder(JITDylib &OldJD, JITDylib &NewJD,
-                            bool MatchNonExported = false);
+  void replaceInSearchOrder(JITDylib &OldJD, JITDylib &NewJD);
 
   /// Remove the given JITDylib from the search order for this JITDylib if it is
   /// present. Otherwise this operation is a no-op.
@@ -562,7 +554,7 @@ public:
   /// Do something with the search order (run under the session lock).
   template <typename Func>
   auto withSearchOrderDo(Func &&F)
-      -> decltype(F(std::declval<const JITDylibSearchList &>()));
+      -> decltype(F(std::declval<const JITDylibList &>()));
 
   /// Define all symbols provided by the materialization unit to be part of this
   /// JITDylib.
@@ -650,12 +642,12 @@ private:
                                 const SymbolNameSet &Names);
 
   void lodgeQuery(std::shared_ptr<AsynchronousSymbolQuery> &Q,
-                  SymbolNameSet &Unresolved, bool MatchNonExported,
-                  MaterializationUnitList &MUs);
+                  SymbolNameSet &Unresolved, JITDylib *MatchNonExportedInJD,
+                  bool MatchNonExported, MaterializationUnitList &MUs);
 
   void lodgeQueryImpl(std::shared_ptr<AsynchronousSymbolQuery> &Q,
-                      SymbolNameSet &Unresolved, bool MatchNonExported,
-                      MaterializationUnitList &MUs);
+                      SymbolNameSet &Unresolved, JITDylib *MatchNonExportedInJD,
+                      bool MatchNonExported, MaterializationUnitList &MUs);
 
   LookupImplActionFlags
   lookupImpl(std::shared_ptr<AsynchronousSymbolQuery> &Q,
@@ -690,7 +682,7 @@ private:
   UnmaterializedInfosMap UnmaterializedInfos;
   MaterializingInfosMap MaterializingInfos;
   GeneratorFunction DefGenerator;
-  JITDylibSearchList SearchOrder;
+  JITDylibList SearchOrder;
 };
 
 /// An ExecutionSession represents a running JIT program.
@@ -774,10 +766,6 @@ public:
 
   /// Search the given JITDylib list for the given symbols.
   ///
-  /// SearchOrder lists the JITDylibs to search. For each dylib, the associated
-  /// boolean indicates whether the search should match against non-exported
-  /// (hidden visibility) symbols in that dylib (true means match against
-  /// non-exported symbols, false means do not match).
   ///
   /// The OnResolve callback will be called once all requested symbols are
   /// resolved, or if an error occurs prior to resolution.
@@ -794,9 +782,19 @@ public:
   /// dependenant symbols for this query (e.g. it is being made by a top level
   /// client to get an address to call) then the value NoDependenciesToRegister
   /// can be used.
-  void lookup(const JITDylibSearchList &SearchOrder, SymbolNameSet Symbols,
+  ///
+  /// If the MatchNonExportedInJD pointer is non-null, then the lookup will find
+  /// non-exported symbols defined in the JITDylib pointed to by
+  /// MatchNonExportedInJD.
+  /// If MatchNonExported is true the lookup will find non-exported symbols in
+  /// any JITDylib (setting MatchNonExportedInJD is redundant in such cases).
+  /// If MatchNonExported is false and MatchNonExportedInJD is null,
+  /// non-exported symbols will never be found.
+  void lookup(const JITDylibList &JDs, SymbolNameSet Symbols,
               SymbolsResolvedCallback OnResolve, SymbolsReadyCallback OnReady,
-              RegisterDependenciesFunction RegisterDependencies);
+              RegisterDependenciesFunction RegisterDependencies,
+              JITDylib *MatchNonExportedInJD = nullptr,
+              bool MatchNonExported = false);
 
   /// Blocking version of lookup above. Returns the resolved symbol map.
   /// If WaitUntilReady is true (the default), will not return until all
@@ -805,29 +803,24 @@ public:
   /// or an error occurs. If WaitUntilReady is false and an error occurs
   /// after resolution, the function will return a success value, but the
   /// error will be reported via reportErrors.
-  Expected<SymbolMap> lookup(const JITDylibSearchList &SearchOrder,
+  Expected<SymbolMap> lookup(const JITDylibList &JDs,
                              const SymbolNameSet &Symbols,
                              RegisterDependenciesFunction RegisterDependencies =
                                  NoDependenciesToRegister,
-                             bool WaitUntilReady = true);
-
-  /// Convenience version of blocking lookup.
-  /// Searches each of the JITDylibs in the search order in turn for the given
-  /// symbol.
-  Expected<JITEvaluatedSymbol> lookup(const JITDylibSearchList &SearchOrder,
-                                      SymbolStringPtr Symbol);
+                             bool WaitUntilReady = true,
+                             JITDylib *MatchNonExportedInJD = nullptr,
+                             bool MatchNonExported = false);
 
   /// Convenience version of blocking lookup.
-  /// Searches each of the JITDylibs in the search order in turn for the given
-  /// symbol. The search will not find non-exported symbols.
-  Expected<JITEvaluatedSymbol> lookup(ArrayRef<JITDylib *> SearchOrder,
-                                      SymbolStringPtr Symbol);
+  /// Performs a single-symbol lookup.
+  Expected<JITEvaluatedSymbol> lookup(const JITDylibList &JDs,
+                                      SymbolStringPtr Symbol,
+                                      bool MatchNonExported = false);
 
   /// Convenience version of blocking lookup.
-  /// Searches each of the JITDylibs in the search order in turn for the given
-  /// symbol. The search will not find non-exported symbols.
-  Expected<JITEvaluatedSymbol> lookup(ArrayRef<JITDylib *> SearchOrder,
-                                      StringRef Symbol);
+  /// Performs a single-symbol lookup, auto-interning the given symbol name.
+  Expected<JITEvaluatedSymbol> lookup(const JITDylibList &JDs, StringRef Symbol,
+                                      bool MatchNonExported = false);
 
   /// Materialize the given unit.
   void dispatchMaterialization(JITDylib &JD,
@@ -873,7 +866,7 @@ private:
 
 template <typename Func>
 auto JITDylib::withSearchOrderDo(Func &&F)
-    -> decltype(F(std::declval<const JITDylibSearchList &>())) {
+    -> decltype(F(std::declval<const JITDylibList &>())) {
   return ES.runSessionLocked([&]() { return F(SearchOrder); });
 }
 
diff --git a/include/llvm/ExecutionEngine/Orc/LLJIT.h b/include/llvm/ExecutionEngine/Orc/LLJIT.h
index ce3e5d519c7..b7ef8834706 100644
--- a/include/llvm/ExecutionEngine/Orc/LLJIT.h
+++ b/include/llvm/ExecutionEngine/Orc/LLJIT.h
@@ -144,7 +144,7 @@ public:
   /// LLLazyJIT with the given number of compile threads.
   static Expected<std::unique_ptr<LLLazyJIT>>
   Create(JITTargetMachineBuilder JTMB, DataLayout DL,
-         JITTargetAddress ErrorAddr, unsigned NumCompileThreads = 0);
+         unsigned NumCompileThreads = 0);
 
   /// Set an IR transform (e.g. pass manager pipeline) to run on each function
   /// when it is compiled.
diff --git a/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp b/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp
index 241eb3600da..de1fa079dde 100644
--- a/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp
+++ b/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp
@@ -157,7 +157,7 @@ void CompileOnDemandLayer::emit(MaterializationResponsibility R,
     return;
   }
 
-  R.replace(reexports(PDR.getImplDylib(), std::move(NonCallables), true));
+  R.replace(reexports(PDR.getImplDylib(), std::move(NonCallables)));
   R.replace(lazyReexports(LCTMgr, PDR.getISManager(), PDR.getImplDylib(),
                           std::move(Callables)));
 }
@@ -166,17 +166,10 @@ CompileOnDemandLayer::PerDylibResources &
 CompileOnDemandLayer::getPerDylibResources(JITDylib &TargetD) {
   auto I = DylibResources.find(&TargetD);
   if (I == DylibResources.end()) {
-    auto &ImplD = getExecutionSession().createJITDylib(
-        TargetD.getName() + ".impl", false);
-    TargetD.withSearchOrderDo([&](const JITDylibSearchList &TargetSearchOrder) {
-      auto NewSearchOrder = TargetSearchOrder;
-      assert(!NewSearchOrder.empty() &&
-             NewSearchOrder.front().first == &TargetD &&
-             NewSearchOrder.front().second == true &&
-             "TargetD must be at the front of its own search order and match "
-             "non-exported symbol");
-      NewSearchOrder.insert(std::next(NewSearchOrder.begin()), {&ImplD, true});
-      ImplD.setSearchOrder(std::move(NewSearchOrder), false);
+    auto &ImplD =
+        getExecutionSession().createJITDylib(TargetD.getName() + ".impl");
+    TargetD.withSearchOrderDo([&](const JITDylibList &TargetSearchOrder) {
+      ImplD.setSearchOrder(TargetSearchOrder, false);
     });
     PerDylibResources PDR(ImplD, BuildIndirectStubsManager());
     I = DylibResources.insert(std::make_pair(&TargetD, std::move(PDR))).first;
diff --git a/lib/ExecutionEngine/Orc/Core.cpp b/lib/ExecutionEngine/Orc/Core.cpp
index 8a9740e0be0..df4d0028a4a 100644
--- a/lib/ExecutionEngine/Orc/Core.cpp
+++ b/lib/ExecutionEngine/Orc/Core.cpp
@@ -205,16 +205,14 @@ raw_ostream &operator<<(raw_ostream &OS, const MaterializationUnit &MU) {
   return OS << ")";
 }
 
-raw_ostream &operator<<(raw_ostream &OS, const JITDylibSearchList &JDs) {
+raw_ostream &operator<<(raw_ostream &OS, const JITDylibList &JDs) {
   OS << "[";
   if (!JDs.empty()) {
-    assert(JDs.front().first && "JITDylibList entries must not be null");
-    OS << " (\"" << JDs.front().first->getName() << "\", "
-       << (JDs.front().second ? "true" : "false") << ")";
-    for (auto &KV : make_range(std::next(JDs.begin()), JDs.end())) {
-      assert(KV.first && "JITDylibList entries must not be null");
-      OS << ", (\"" << KV.first->getName() << "\", "
-         << (KV.second ? "true" : "false") << ")";
+    assert(JDs.front() && "JITDylibList entries must not be null");
+    OS << " " << JDs.front()->getName();
+    for (auto *JD : make_range(std::next(JDs.begin()), JDs.end())) {
+      assert(JD && "JITDylibList entries must not be null");
+      OS << ", " << JD->getName();
     }
   }
   OS << " ]";
@@ -528,11 +526,9 @@ AbsoluteSymbolsMaterializationUnit::extractFlags(const SymbolMap &Symbols) {
 }
 
 ReExportsMaterializationUnit::ReExportsMaterializationUnit(
-    JITDylib *SourceJD, bool MatchNonExported, SymbolAliasMap Aliases,
-    VModuleKey K)
+    JITDylib *SourceJD, SymbolAliasMap Aliases, VModuleKey K)
     : MaterializationUnit(extractFlags(Aliases), std::move(K)),
-      SourceJD(SourceJD), MatchNonExported(MatchNonExported),
-      Aliases(std::move(Aliases)) {}
+      SourceJD(SourceJD), Aliases(std::move(Aliases)) {}
 
 StringRef ReExportsMaterializationUnit::getName() const {
   return "<Reexports>";
@@ -560,7 +556,7 @@ void ReExportsMaterializationUnit::materialize(
 
   if (!Aliases.empty()) {
     if (SourceJD)
-      R.replace(reexports(*SourceJD, std::move(Aliases), MatchNonExported));
+      R.replace(reexports(*SourceJD, std::move(Aliases)));
     else
       R.replace(symbolAliases(std::move(Aliases)));
   }
@@ -660,8 +656,8 @@ void ReExportsMaterializationUnit::materialize(
 
     auto OnReady = [&ES](Error Err) { ES.reportError(std::move(Err)); };
 
-    ES.lookup({{&SrcJD, MatchNonExported}}, QuerySymbols, std::move(OnResolve),
-              std::move(OnReady), std::move(RegisterDependencies));
+    ES.lookup({&SrcJD}, QuerySymbols, std::move(OnResolve), std::move(OnReady),
+              std::move(RegisterDependencies), nullptr, true);
   }
 }
 
@@ -702,10 +698,8 @@ buildSimpleReexportsAliasMap(JITDylib &SourceJD, const SymbolNameSet &Symbols) {
 }
 
 ReexportsGenerator::ReexportsGenerator(JITDylib &SourceJD,
-                                       bool MatchNonExported,
                                        SymbolPredicate Allow)
-    : SourceJD(SourceJD), MatchNonExported(MatchNonExported),
-      Allow(std::move(Allow)) {}
+    : SourceJD(SourceJD), Allow(std::move(Allow)) {}
 
 SymbolNameSet ReexportsGenerator::operator()(JITDylib &JD,
                                              const SymbolNameSet &Names) {
@@ -722,7 +716,7 @@ SymbolNameSet ReexportsGenerator::operator()(JITDylib &JD,
   }
 
   if (!Added.empty())
-    cantFail(JD.define(reexports(SourceJD, AliasMap, MatchNonExported)));
+    cantFail(JD.define(reexports(SourceJD, AliasMap)));
 
   return Added;
 }
@@ -1047,41 +1041,30 @@ void JITDylib::notifyFailed(const SymbolNameSet &FailedSymbols) {
     Q->handleFailed(make_error<FailedToMaterialize>(FailedSymbols));
 }
 
-void JITDylib::setSearchOrder(JITDylibSearchList NewSearchOrder,
-                              bool SearchThisJITDylibFirst,
-                              bool MatchNonExportedInThisDylib) {
-  if (SearchThisJITDylibFirst && NewSearchOrder.front().first != this)
-    NewSearchOrder.insert(NewSearchOrder.begin(),
-                          {this, MatchNonExportedInThisDylib});
+void JITDylib::setSearchOrder(JITDylibList NewSearchOrder,
+                              bool SearchThisJITDylibFirst) {
+  if (SearchThisJITDylibFirst && NewSearchOrder.front() != this)
+    NewSearchOrder.insert(NewSearchOrder.begin(), this);
 
   ES.runSessionLocked([&]() { SearchOrder = std::move(NewSearchOrder); });
 }
 
-void JITDylib::addToSearchOrder(JITDylib &JD, bool MatchNonExported) {
-  ES.runSessionLocked([&]() {
-    SearchOrder.push_back({&JD, MatchNonExported});
-  });
+void JITDylib::addToSearchOrder(JITDylib &JD) {
+  ES.runSessionLocked([&]() { SearchOrder.push_back(&JD); });
 }
 
-void JITDylib::replaceInSearchOrder(JITDylib &OldJD, JITDylib &NewJD,
-                                    bool MatchNonExported) {
+void JITDylib::replaceInSearchOrder(JITDylib &OldJD, JITDylib &NewJD) {
   ES.runSessionLocked([&]() {
-    auto I = std::find_if(SearchOrder.begin(), SearchOrder.end(),
-                          [&](const JITDylibSearchList::value_type &KV) {
-                            return KV.first == &OldJD;
-                          });
+    auto I = std::find(SearchOrder.begin(), SearchOrder.end(), &OldJD);
 
     if (I != SearchOrder.end())
-      *I = {&NewJD, MatchNonExported};
+      *I = &NewJD;
   });
 }
 
 void JITDylib::removeFromSearchOrder(JITDylib &JD) {
   ES.runSessionLocked([&]() {
-    auto I = std::find_if(SearchOrder.begin(), SearchOrder.end(),
-                          [&](const JITDylibSearchList::value_type &KV) {
-                            return KV.first == &JD;
-                          });
+    auto I = std::find(SearchOrder.begin(), SearchOrder.end(), &JD);
     if (I != SearchOrder.end())
       SearchOrder.erase(I);
   });
@@ -1178,17 +1161,18 @@ SymbolNameSet JITDylib::lookupFlagsImpl(SymbolFlagsMap &Flags,
 }
 
 void JITDylib::lodgeQuery(std::shared_ptr<AsynchronousSymbolQuery> &Q,
-                          SymbolNameSet &Unresolved, bool MatchNonExported,
+                          SymbolNameSet &Unresolved,
+                          JITDylib *MatchNonExportedInJD, bool MatchNonExported,
                           MaterializationUnitList &MUs) {
   assert(Q && "Query can not be null");
 
-  lodgeQueryImpl(Q, Unresolved, MatchNonExported, MUs);
+  lodgeQueryImpl(Q, Unresolved, MatchNonExportedInJD, MatchNonExported, MUs);
   if (DefGenerator && !Unresolved.empty()) {
     auto NewDefs = DefGenerator(*this, Unresolved);
     if (!NewDefs.empty()) {
       for (auto &D : NewDefs)
         Unresolved.erase(D);
-      lodgeQueryImpl(Q, NewDefs, MatchNonExported, MUs);
+      lodgeQueryImpl(Q, NewDefs, MatchNonExportedInJD, MatchNonExported, MUs);
       assert(NewDefs.empty() &&
              "All fallback defs should have been found by lookupImpl");
     }
@@ -1197,7 +1181,7 @@ void JITDylib::lodgeQuery(std::shared_ptr<AsynchronousSymbolQuery> &Q,
 
 void JITDylib::lodgeQueryImpl(
     std::shared_ptr<AsynchronousSymbolQuery> &Q, SymbolNameSet &Unresolved,
-    bool MatchNonExported,
+    JITDylib *MatchNonExportedInJD, bool MatchNonExported,
     std::vector<std::unique_ptr<MaterializationUnit>> &MUs) {
 
   std::vector<SymbolStringPtr> ToRemove;
@@ -1207,9 +1191,12 @@ void JITDylib::lodgeQueryImpl(
     if (SymI == Symbols.end())
       continue;
 
-    // If this is a non exported symbol and we're skipping those then skip it.
-    if (!SymI->second.getFlags().isExported() && !MatchNonExported)
-      continue;
+    // If this is a non-exported symbol, then check the values of
+    // MatchNonExportedInJD and MatchNonExported. Skip if we should not match
+    // against this symbol.
+    if (!SymI->second.getFlags().isExported())
+      if (!MatchNonExported && MatchNonExportedInJD != this)
+        continue;
 
     // If we matched against Name in JD, mark it to be removed from the Unresolved
     // set.
@@ -1395,9 +1382,8 @@ void JITDylib::dump(raw_ostream &OS) {
        << "\" (ES: " << format("0x%016x", reinterpret_cast<uintptr_t>(&ES))
        << "):\n"
        << "Search order: [";
-    for (auto &KV : SearchOrder)
-      OS << " (\"" << KV.first->getName() << "\", "
-         << (KV.second ? "all" : "exported only") << ")";
+    for (auto *JD : SearchOrder)
+      OS << " \"" << JD->getName() << "\"";
     OS << " ]\n"
        << "Symbol table:\n";
 
@@ -1445,7 +1431,7 @@ void JITDylib::dump(raw_ostream &OS) {
 
 JITDylib::JITDylib(ExecutionSession &ES, std::string Name)
     : ES(ES), JITDylibName(std::move(Name)) {
-  SearchOrder.push_back({this, true});
+  SearchOrder.push_back(this);
 }
 
 Error JITDylib::defineImpl(MaterializationUnit &MU) {
@@ -1738,10 +1724,12 @@ Expected<SymbolMap> ExecutionSession::legacyLookup(
 #endif
 }
 
-void ExecutionSession::lookup(
-    const JITDylibSearchList &SearchOrder, SymbolNameSet Symbols,
-    SymbolsResolvedCallback OnResolve, SymbolsReadyCallback OnReady,
-    RegisterDependenciesFunction RegisterDependencies) {
+void ExecutionSession::lookup(const JITDylibList &JDs, SymbolNameSet Symbols,
+                              SymbolsResolvedCallback OnResolve,
+                              SymbolsReadyCallback OnReady,
+                              RegisterDependenciesFunction RegisterDependencies,
+                              JITDylib *MatchNonExportedInJD,
+                              bool MatchNonExported) {
 
   // lookup can be re-entered recursively if running on a single thread. Run any
   // outstanding MUs in case this query depends on them, otherwise this lookup
@@ -1757,14 +1745,12 @@ void ExecutionSession::lookup(
   bool QueryFailed = false;
 
   runSessionLocked([&]() {
-    for (auto &KV : SearchOrder) {
-      assert(KV.first && "JITDylibList entries must not be null");
-      assert(!CollectedMUsMap.count(KV.first) &&
+    for (auto *JD : JDs) {
+      assert(JD && "JITDylibList entries must not be null");
+      assert(!CollectedMUsMap.count(JD) &&
              "JITDylibList should not contain duplicate entries");
-
-      auto &JD = *KV.first;
-      auto MatchNonExported = KV.second;
-      JD.lodgeQuery(Q, Unresolved, MatchNonExported, CollectedMUsMap[&JD]);
+      JD->lodgeQuery(Q, Unresolved, MatchNonExportedInJD, MatchNonExported,
+                     CollectedMUsMap[JD]);
     }
 
     if (Unresolved.empty()) {
@@ -1815,9 +1801,11 @@ void ExecutionSession::lookup(
   runOutstandingMUs();
 }
 
-Expected<SymbolMap> ExecutionSession::lookup(
-    const JITDylibSearchList &SearchOrder, const SymbolNameSet &Symbols,
-    RegisterDependenciesFunction RegisterDependencies, bool WaitUntilReady) {
+Expected<SymbolMap>
+ExecutionSession::lookup(const JITDylibList &JDs, const SymbolNameSet &Symbols,
+                         RegisterDependenciesFunction RegisterDependencies,
+                         bool WaitUntilReady, JITDylib *MatchNonExportedInJD,
+                         bool MatchNonExported) {
 #if LLVM_ENABLE_THREADS
   // In the threaded case we use promises to return the results.
   std::promise<SymbolMap> PromisedResult;
@@ -1884,7 +1872,8 @@ Expected<SymbolMap> ExecutionSession::lookup(
 #endif
 
   // Perform the asynchronous lookup.
-  lookup(SearchOrder, Symbols, OnResolve, OnReady, RegisterDependencies);
+  lookup(JDs, Symbols, OnResolve, OnReady, RegisterDependencies,
+         MatchNonExportedInJD, MatchNonExported);
 
 #if LLVM_ENABLE_THREADS
   auto ResultFuture = PromisedResult.get_future();
@@ -1927,13 +1916,14 @@ Expected<SymbolMap> ExecutionSession::lookup(
 #endif
 }
 
-Expected<JITEvaluatedSymbol>
-ExecutionSession::lookup(const JITDylibSearchList &SearchOrder,
-                         SymbolStringPtr Name) {
+/// Look up a symbol by searching a list of JDs.
+Expected<JITEvaluatedSymbol> ExecutionSession::lookup(const JITDylibList &JDs,
+                                                      SymbolStringPtr Name,
+                                                      bool MatchNonExported) {
   SymbolNameSet Names({Name});
 
-  if (auto ResultMap = lookup(SearchOrder, std::move(Names),
-                              NoDependenciesToRegister, true)) {
+  if (auto ResultMap = lookup(JDs, std::move(Names), NoDependenciesToRegister,
+                              true, nullptr, MatchNonExported)) {
     assert(ResultMap->size() == 1 && "Unexpected number of results");
     assert(ResultMap->count(Name) && "Missing result for symbol");
     return std::move(ResultMap->begin()->second);
@@ -1941,21 +1931,10 @@ ExecutionSession::lookup(const JITDylibSearchList &SearchOrder,
     return ResultMap.takeError();
 }
 
-Expected<JITEvaluatedSymbol>
-ExecutionSession::lookup(ArrayRef<JITDylib *> SearchOrder,
-                         SymbolStringPtr Name) {
-  SymbolNameSet Names({Name});
-
-  JITDylibSearchList FullSearchOrder(SearchOrder.size());
-  for (auto *JD : SearchOrder)
-    FullSearchOrder.push_back({JD, false});
-
-  return lookup(FullSearchOrder, Name);
-}
-
-Expected<JITEvaluatedSymbol>
-ExecutionSession::lookup(ArrayRef<JITDylib *> SearchOrder, StringRef Name) {
-  return lookup(SearchOrder, intern(Name));
+Expected<JITEvaluatedSymbol> ExecutionSession::lookup(const JITDylibList &JDs,
+                                                      StringRef Name,
+                                                      bool MatchNonExported) {
+  return lookup(JDs, intern(Name), MatchNonExported);
 }
 
 void ExecutionSession::dump(raw_ostream &OS) {
diff --git a/lib/ExecutionEngine/Orc/ExecutionUtils.cpp b/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
index 3a1984e8a50..21a604f71ca 100644
--- a/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
+++ b/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
@@ -129,8 +129,9 @@ Error CtorDtorRunner::run() {
   }
 
   auto &ES = JD.getExecutionSession();
-  if (auto CtorDtorMap = ES.lookup({{&JD, true}}, std::move(Names),
-                                   NoDependenciesToRegister, true)) {
+  if (auto CtorDtorMap =
+          ES.lookup({&JD}, std::move(Names), NoDependenciesToRegister, true,
+                    nullptr, true)) {
     for (auto &KV : CtorDtorsByPriority) {
       for (auto &Name : KV.second) {
         assert(CtorDtorMap->count(Name) && "No entry for Name");
diff --git a/lib/ExecutionEngine/Orc/IndirectionUtils.cpp b/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
index 205821b0a71..c10d15ab117 100644
--- a/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
+++ b/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
@@ -101,7 +101,7 @@ JITTargetAddress JITCompileCallbackManager::executeCompileCallback(
       Name = I->second;
   }
 
-  if (auto Sym = ES.lookup({{&CallbacksJD, true}}, Name))
+  if (auto Sym = ES.lookup({&CallbacksJD}, Name, true))
     return Sym->getAddress();
   else {
     llvm::dbgs() << "Didn't find callback.\n";
diff --git a/lib/ExecutionEngine/Orc/LLJIT.cpp b/lib/ExecutionEngine/Orc/LLJIT.cpp
index 8486fe449f7..ac71a5e7673 100644
--- a/lib/ExecutionEngine/Orc/LLJIT.cpp
+++ b/lib/ExecutionEngine/Orc/LLJIT.cpp
@@ -76,7 +76,7 @@ Error LLJIT::addObjectFile(JITDylib &JD, std::unique_ptr<MemoryBuffer> Obj) {
 
 Expected<JITEvaluatedSymbol> LLJIT::lookupLinkerMangled(JITDylib &JD,
                                                         StringRef Name) {
-  return ES->lookup({{&JD, true}}, ES->intern(Name));
+  return ES->lookup({&JD}, ES->intern(Name));
 }
 
 LLJIT::LLJIT(std::unique_ptr<ExecutionSession> ES,
@@ -144,13 +144,13 @@ void LLJIT::recordCtorDtors(Module &M) {
 }
 
 Expected<std::unique_ptr<LLLazyJIT>>
-LLLazyJIT::Create(JITTargetMachineBuilder JTMB, DataLayout DL,
-                  JITTargetAddress ErrorAddr, unsigned NumCompileThreads) {
+  LLLazyJIT::Create(JITTargetMachineBuilder JTMB, DataLayout DL,
+                    unsigned NumCompileThreads) {
   auto ES = llvm::make_unique<ExecutionSession>();
 
   const Triple &TT = JTMB.getTargetTriple();
 
-  auto LCTMgr = createLocalLazyCallThroughManager(TT, *ES, ErrorAddr);
+  auto LCTMgr = createLocalLazyCallThroughManager(TT, *ES, 0);
   if (!LCTMgr)
     return LCTMgr.takeError();
 
diff --git a/lib/ExecutionEngine/Orc/LazyReexports.cpp b/lib/ExecutionEngine/Orc/LazyReexports.cpp
index ba8e2a9c52f..af4c508d7f1 100644
--- a/lib/ExecutionEngine/Orc/LazyReexports.cpp
+++ b/lib/ExecutionEngine/Orc/LazyReexports.cpp
@@ -52,8 +52,8 @@ LazyCallThroughManager::callThroughToSymbol(JITTargetAddress TrampolineAddr) {
     SymbolName = I->second.second;
   }
 
-  auto LookupResult = ES.lookup({{SourceJD, true}}, {SymbolName},
-                                NoDependenciesToRegister, true);
+  auto LookupResult = ES.lookup({SourceJD}, {SymbolName},
+                                NoDependenciesToRegister, true, nullptr, true);
 
   if (!LookupResult) {
     ES.reportError(LookupResult.takeError());
diff --git a/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp b/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp
index 299d76183cd..616251c7e00 100644
--- a/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp
+++ b/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp
@@ -50,11 +50,10 @@ public:
       MR.addDependenciesForAll(Deps);
     };
 
-    JITDylibSearchList SearchOrder;
-    MR.getTargetJITDylib().withSearchOrderDo(
-        [&](const JITDylibSearchList &JDs) { SearchOrder = JDs; });
-    ES.lookup(SearchOrder, InternedSymbols, OnResolvedWithUnwrap, OnReady,
-              RegisterDependencies);
+    MR.getTargetJITDylib().withSearchOrderDo([&](const JITDylibList &JDs) {
+      ES.lookup(JDs, InternedSymbols, OnResolvedWithUnwrap, OnReady,
+                RegisterDependencies, &MR.getTargetJITDylib());
+    });
   }
 
   Expected<LookupSet> getResponsibilitySet(const LookupSet &Symbols) {
diff --git a/test/ExecutionEngine/OrcLazy/Inputs/hidden-definitions.ll b/test/ExecutionEngine/OrcLazy/Inputs/hidden-definitions.ll
deleted file mode 100644
index 8d1f4b9cc5c..00000000000
--- a/test/ExecutionEngine/OrcLazy/Inputs/hidden-definitions.ll
+++ /dev/null
@@ -1,6 +0,0 @@
-@bar = hidden global i32 0
-
-define hidden i32 @foo() {
-entry:
-  ret i32 0
-}
diff --git a/test/ExecutionEngine/OrcLazy/hidden-visibility.ll b/test/ExecutionEngine/OrcLazy/hidden-visibility.ll
deleted file mode 100644
index 199fd644bff..00000000000
--- a/test/ExecutionEngine/OrcLazy/hidden-visibility.ll
+++ /dev/null
@@ -1,17 +0,0 @@
-; RUN: lli -jit-kind=orc-lazy -extra-module %p/Inputs/hidden-definitions.ll %s
-; RUN: not lli -jit-kind=orc-lazy -jd libFoo -extra-module %p/Inputs/hidden-definitions.ll %s
-;
-; Check that hidden symbols in another module are visible when the module is
-; added to the same JITDylib, and not visible if it is added to a different
-; JITDylib.
-
-@bar = external global i32
-declare i32 @foo()
-
-define i32 @main(i32 %argc, i8** nocapture readnone %argv) {
-entry:
-  %0 = call i32() @foo()
-  %1 = load i32, i32* @bar
-  %2 = add i32 %0, %1
-  ret i32 %2
-}
diff --git a/tools/lli/lli.cpp b/tools/lli/lli.cpp
index c3c57e2cdee..f4585dc080d 100644
--- a/tools/lli/lli.cpp
+++ b/tools/lli/lli.cpp
@@ -115,11 +115,6 @@ namespace {
                "rather than individual functions"),
       cl::init(false));
 
-  cl::list<std::string>
-      JITDylibs("jd",
-                cl::desc("Specifies the JITDylib to be used for any subsequent "
-                         "-extra-module arguments."));
-
   // The MCJIT supports building for a target address space separate from
   // the JIT compilation process. Use a forked process and a copying
   // memory manager with IPC to execute using this functionality.
@@ -754,8 +749,6 @@ static orc::IRTransformLayer::TransformFunction createDebugDumper() {
   llvm_unreachable("Unknown DumpKind");
 }
 
-static void exitOnLazyCallThroughFailure() { exit(1); }
-
 int runOrcLazyJIT(const char *ProgName) {
   // Start setting up the JIT environment.
 
@@ -785,11 +778,7 @@ int runOrcLazyJIT(const char *ProgName) {
                         : None);
 
   DataLayout DL = ExitOnErr(JTMB.getDefaultDataLayoutForTarget());
-
-  auto J = ExitOnErr(orc::LLLazyJIT::Create(
-      std::move(JTMB), DL,
-      pointerToJITTargetAddress(exitOnLazyCallThroughFailure),
-      LazyJITCompileThreads));
+  auto J = ExitOnErr(orc::LLLazyJIT::Create(std::move(JTMB), DL, LazyJITCompileThreads));
 
   if (PerModuleLazy)
     J->setPartitionFunction(orc::CompileOnDemandLayer::compileWholeModule);
@@ -814,32 +803,13 @@ int runOrcLazyJIT(const char *ProgName) {
   // Add the main module.
   ExitOnErr(J->addLazyIRModule(std::move(MainModule)));
 
-  // Create JITDylibs and add any extra modules.
-  {
-    // Create JITDylibs, keep a map from argument index to dylib. We will use
-    // -extra-module argument indexes to determine what dylib to use for each
-    // -extra-module.
-    std::map<unsigned, orc::JITDylib *> IdxToDylib;
-    IdxToDylib[0] = &J->getMainJITDylib();
-    for (auto JDItr = JITDylibs.begin(), JDEnd = JITDylibs.end();
-         JDItr != JDEnd; ++JDItr) {
-      IdxToDylib[JITDylibs.getPosition(JDItr - JITDylibs.begin())] =
-          &J->createJITDylib(*JDItr);
-    }
+  // Add any extra modules.
+  for (auto &ModulePath : ExtraModules) {
+    auto M = parseIRFile(ModulePath, Err, *TSCtx.getContext());
+    if (!M)
+      reportError(Err, ProgName);
 
-    for (auto EMItr = ExtraModules.begin(), EMEnd = ExtraModules.end();
-         EMItr != EMEnd; ++EMItr) {
-      auto M = parseIRFile(*EMItr, Err, *TSCtx.getContext());
-      if (!M)
-        reportError(Err, ProgName);
-
-      auto EMIdx = ExtraModules.getPosition(EMItr - ExtraModules.begin());
-      assert(EMIdx != 0 && "ExtraModule should have index > 0");
-      auto JDItr = std::prev(IdxToDylib.lower_bound(EMIdx));
-      auto &JD = *JDItr->second;
-      ExitOnErr(
-          J->addLazyIRModule(JD, orc::ThreadSafeModule(std::move(M), TSCtx)));
-    }
+    ExitOnErr(J->addLazyIRModule(orc::ThreadSafeModule(std::move(M), TSCtx)));
   }
 
   // Add the objects.
@@ -867,8 +837,6 @@ int runOrcLazyJIT(const char *ProgName) {
     AltEntryThreads.push_back(std::thread([EntryPoint]() { EntryPoint(); }));
   }
 
-  J->getExecutionSession().dump(llvm::dbgs());
-
   // Run main.
   auto MainSym = ExitOnErr(J->lookup("main"));
   typedef int (*MainFnPtr)(int, const char *[]);
diff --git a/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp b/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp
index 1444ba74364..1ccc4755957 100644
--- a/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp
+++ b/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp
@@ -48,8 +48,7 @@ TEST_F(CoreAPIsStandardTest, BasicSuccessfulLookup) {
         FooMR = std::make_shared<MaterializationResponsibility>(std::move(R));
       })));
 
-  ES.lookup({{&JD, false}}, {Foo}, OnResolution, OnReady,
-            NoDependenciesToRegister);
+  ES.lookup({&JD}, {Foo}, OnResolution, OnReady, NoDependenciesToRegister);
 
   EXPECT_FALSE(OnResolutionRun) << "Should not have been resolved yet";
   EXPECT_FALSE(OnReadyRun) << "Should not have been marked ready yet";
@@ -102,8 +101,7 @@ TEST_F(CoreAPIsStandardTest, EmptyLookup) {
     OnReadyRun = true;
   };
 
-  ES.lookup({{&JD, false}}, {}, OnResolution, OnReady,
-            NoDependenciesToRegister);
+  ES.lookup({&JD}, {}, OnResolution, OnReady, NoDependenciesToRegister);
 
   EXPECT_TRUE(OnResolvedRun) << "OnResolved was not run for empty query";
   EXPECT_TRUE(OnReadyRun) << "OnReady was not run for empty query";
@@ -150,7 +148,7 @@ TEST_F(CoreAPIsStandardTest, RemoveSymbolsTest) {
 
   bool OnResolvedRun = false;
   bool OnReadyRun = false;
-  ES.lookup({{&JD, false}}, {Foo, Baz},
+  ES.lookup({&JD}, {Foo, Baz},
             [&](Expected<SymbolMap> Result) {
               EXPECT_TRUE(!!Result) << "OnResolved failed unexpectedly";
               consumeError(Result.takeError());
@@ -231,8 +229,7 @@ TEST_F(CoreAPIsStandardTest, LookupWithHiddenSymbols) {
   auto &JD2 = ES.createJITDylib("JD2");
   cantFail(JD2.define(absoluteSymbols({{Bar, QuxSym}})));
 
-  /// Try a blocking lookup.
-  auto Result = cantFail(ES.lookup({{&JD, false}, {&JD2, false}}, {Foo, Bar}));
+  auto Result = cantFail(ES.lookup({&JD, &JD2}, {Foo, Bar}));
 
   EXPECT_EQ(Result.size(), 2U) << "Unexpected number of results";
   EXPECT_EQ(Result.count(Foo), 1U) << "Missing result for \"Foo\"";
@@ -278,7 +275,7 @@ TEST_F(CoreAPIsStandardTest, TestBasicAliases) {
                                     {Qux, {Bar, JITSymbolFlags::Weak}}})));
   cantFail(JD.define(absoluteSymbols({{Qux, QuxSym}})));
 
-  auto Result = ES.lookup({{&JD, false}}, {Baz, Qux});
+  auto Result = ES.lookup({&JD}, {Baz, Qux});
   EXPECT_TRUE(!!Result) << "Unexpected lookup failure";
   EXPECT_EQ(Result->count(Baz), 1U) << "No result for \"baz\"";
   EXPECT_EQ(Result->count(Qux), 1U) << "No result for \"qux\"";
@@ -293,7 +290,7 @@ TEST_F(CoreAPIsStandardTest, TestChainedAliases) {
   cantFail(JD.define(symbolAliases(
       {{Baz, {Bar, BazSym.getFlags()}}, {Bar, {Foo, BarSym.getFlags()}}})));
 
-  auto Result = ES.lookup({{&JD, false}}, {Bar, Baz});
+  auto Result = ES.lookup({&JD}, {Bar, Baz});
   EXPECT_TRUE(!!Result) << "Unexpected lookup failure";
   EXPECT_EQ(Result->count(Bar), 1U) << "No result for \"bar\"";
   EXPECT_EQ(Result->count(Baz), 1U) << "No result for \"baz\"";
@@ -312,7 +309,7 @@ TEST_F(CoreAPIsStandardTest, TestBasicReExports) {
 
   cantFail(JD2.define(reexports(JD, {{Bar, {Foo, BarSym.getFlags()}}})));
 
-  auto Result = cantFail(ES.lookup({{&JD2, false}}, Bar));
+  auto Result = cantFail(ES.lookup({&JD2}, Bar));
   EXPECT_EQ(Result.getAddress(), FooSym.getAddress())
       << "Re-export Bar for symbol Foo should match FooSym's address";
 }
@@ -338,7 +335,7 @@ TEST_F(CoreAPIsStandardTest, TestThatReExportsDontUnnecessarilyMaterialize) {
   cantFail(JD2.define(reexports(
       JD, {{Baz, {Foo, BazSym.getFlags()}}, {Qux, {Bar, QuxSym.getFlags()}}})));
 
-  auto Result = cantFail(ES.lookup({{&JD2, false}}, Baz));
+  auto Result = cantFail(ES.lookup({&JD2}, Baz));
   EXPECT_EQ(Result.getAddress(), FooSym.getAddress())
       << "Re-export Baz for symbol Foo should match FooSym's address";
 
@@ -353,13 +350,13 @@ TEST_F(CoreAPIsStandardTest, TestReexportsGenerator) {
 
   auto Filter = [this](SymbolStringPtr Name) { return Name != Bar; };
 
-  JD.setGenerator(ReexportsGenerator(JD2, false, Filter));
+  JD.setGenerator(ReexportsGenerator(JD2, Filter));
 
   auto Flags = JD.lookupFlags({Foo, Bar, Baz});
   EXPECT_EQ(Flags.size(), 1U) << "Unexpected number of results";
   EXPECT_EQ(Flags[Foo], FooSym.getFlags()) << "Unexpected flags for Foo";
 
-  auto Result = cantFail(ES.lookup({{&JD, false}}, Foo));
+  auto Result = cantFail(ES.lookup({&JD}, Foo));
 
   EXPECT_EQ(Result.getAddress(), FooSym.getAddress())
       << "Incorrect reexported symbol address";
@@ -380,7 +377,7 @@ TEST_F(CoreAPIsStandardTest, TestTrivialCircularDependency) {
     FooReady = true;
   };
 
-  ES.lookup({{&JD, false}}, {Foo}, std::move(OnResolution), std::move(OnReady),
+  ES.lookup({&JD}, {Foo}, std::move(OnResolution), std::move(OnReady),
             NoDependenciesToRegister);
 
   FooR->resolve({{Foo, FooSym}});
@@ -437,8 +434,8 @@ TEST_F(CoreAPIsStandardTest, TestCircularDependenceInOneJITDylib) {
 
   // Issue a lookup for Foo. Use NoDependenciesToRegister: We're going to add
   // the dependencies manually below.
-  ES.lookup({{&JD, false}}, {Foo}, std::move(OnFooResolution),
-            std::move(OnFooReady), NoDependenciesToRegister);
+  ES.lookup({&JD}, {Foo}, std::move(OnFooResolution), std::move(OnFooReady),
+            NoDependenciesToRegister);
 
   bool BarResolved = false;
   bool BarReady = false;
@@ -452,8 +449,8 @@ TEST_F(CoreAPIsStandardTest, TestCircularDependenceInOneJITDylib) {
     BarReady = true;
   };
 
-  ES.lookup({{&JD, false}}, {Bar}, std::move(OnBarResolution),
-            std::move(OnBarReady), NoDependenciesToRegister);
+  ES.lookup({&JD}, {Bar}, std::move(OnBarResolution), std::move(OnBarReady),
+            NoDependenciesToRegister);
 
   bool BazResolved = false;
   bool BazReady = false;
@@ -468,8 +465,8 @@ TEST_F(CoreAPIsStandardTest, TestCircularDependenceInOneJITDylib) {
     BazReady = true;
   };
 
-  ES.lookup({{&JD, false}}, {Baz}, std::move(OnBazResolution),
-            std::move(OnBazReady), NoDependenciesToRegister);
+  ES.lookup({&JD}, {Baz}, std::move(OnBazResolution), std::move(OnBazReady),
+            NoDependenciesToRegister);
 
   // Add a circular dependency: Foo -> Bar, Bar -> Baz, Baz -> Foo.
   FooR->addDependenciesForAll({{&JD, SymbolNameSet({Bar})}});
@@ -591,7 +588,7 @@ TEST_F(CoreAPIsStandardTest, AddAndMaterializeLazySymbol) {
     OnReadyRun = true;
   };
 
-  ES.lookup({{&JD, false}}, Names, std::move(OnResolution), std::move(OnReady),
+  ES.lookup({&JD}, Names, std::move(OnResolution), std::move(OnReady),
             NoDependenciesToRegister);
 
   EXPECT_TRUE(FooMaterialized) << "Foo was not materialized";
@@ -640,7 +637,7 @@ TEST_F(CoreAPIsStandardTest, TestBasicWeakSymbolMaterialization) {
     OnReadyRun = true;
   };
 
-  ES.lookup({{&JD, false}}, {Bar}, std::move(OnResolution), std::move(OnReady),
+  ES.lookup({&JD}, {Bar}, std::move(OnResolution), std::move(OnReady),
             NoDependenciesToRegister);
 
   EXPECT_TRUE(OnResolvedRun) << "OnResolved not run";
@@ -669,13 +666,13 @@ TEST_F(CoreAPIsStandardTest, DefineMaterializingSymbol) {
       });
 
   cantFail(JD.define(MU));
-  cantFail(ES.lookup({{&JD, false}}, Foo));
+  cantFail(ES.lookup({&JD}, Foo));
 
   // Assert that materialization is complete by now.
   ExpectNoMoreMaterialization = true;
 
   // Look up bar to verify that no further materialization happens.
-  auto BarResult = cantFail(ES.lookup({{&JD, false}}, Bar));
+  auto BarResult = cantFail(ES.lookup({&JD}, Bar));
   EXPECT_EQ(BarResult.getAddress(), BarSym.getAddress())
       << "Expected Bar == BarSym";
 }
@@ -688,7 +685,7 @@ TEST_F(CoreAPIsStandardTest, GeneratorTest) {
     return SymbolNameSet({Bar});
   });
 
-  auto Result = cantFail(ES.lookup({{&JD, false}}, {Foo, Bar}));
+  auto Result = cantFail(ES.lookup({&JD}, {Foo, Bar}));
 
   EXPECT_EQ(Result.count(Bar), 1U) << "Expected to find fallback def for 'bar'";
   EXPECT_EQ(Result[Bar].getAddress(), BarSym.getAddress())
@@ -704,7 +701,7 @@ TEST_F(CoreAPIsStandardTest, FailResolution) {
   cantFail(JD.define(MU));
 
   SymbolNameSet Names({Foo, Bar});
-  auto Result = ES.lookup({{&JD, false}}, Names);
+  auto Result = ES.lookup({&JD}, Names);
 
   EXPECT_FALSE(!!Result) << "Expected failure";
   if (!Result) {
@@ -736,7 +733,7 @@ TEST_F(CoreAPIsStandardTest, TestLookupWithUnthreadedMaterialization) {
 
   cantFail(JD.define(MU));
 
-  auto FooLookupResult = cantFail(ES.lookup({{&JD, false}}, Foo));
+  auto FooLookupResult = cantFail(ES.lookup({&JD}, Foo));
 
   EXPECT_EQ(FooLookupResult.getAddress(), FooSym.getAddress())
       << "lookup returned an incorrect address";
@@ -757,7 +754,7 @@ TEST_F(CoreAPIsStandardTest, TestLookupWithThreadedMaterialization) {
 
   cantFail(JD.define(absoluteSymbols({{Foo, FooSym}})));
 
-  auto FooLookupResult = cantFail(ES.lookup({{&JD, false}}, Foo));
+  auto FooLookupResult = cantFail(ES.lookup({&JD}, Foo));
 
   EXPECT_EQ(FooLookupResult.getAddress(), FooSym.getAddress())
       << "lookup returned an incorrect address";
@@ -805,14 +802,14 @@ TEST_F(CoreAPIsStandardTest, TestGetRequestedSymbolsAndReplace) {
   EXPECT_FALSE(FooMaterialized) << "Foo should not be materialized yet";
   EXPECT_FALSE(BarMaterialized) << "Bar should not be materialized yet";
 
-  auto FooSymResult = cantFail(ES.lookup({{&JD, false}}, Foo));
+  auto FooSymResult = cantFail(ES.lookup({&JD}, Foo));
   EXPECT_EQ(FooSymResult.getAddress(), FooSym.getAddress())
       << "Address mismatch for Foo";
 
   EXPECT_TRUE(FooMaterialized) << "Foo should be materialized now";
   EXPECT_FALSE(BarMaterialized) << "Bar still should not be materialized";
 
-  auto BarSymResult = cantFail(ES.lookup({{&JD, false}}, Bar));
+  auto BarSymResult = cantFail(ES.lookup({&JD}, Bar));
   EXPECT_EQ(BarSymResult.getAddress(), BarSym.getAddress())
       << "Address mismatch for Bar";
   EXPECT_TRUE(BarMaterialized) << "Bar should be materialized now";
@@ -832,7 +829,7 @@ TEST_F(CoreAPIsStandardTest, TestMaterializationResponsibilityDelegation) {
 
   cantFail(JD.define(MU));
 
-  auto Result = ES.lookup({{&JD, false}}, {Foo, Bar});
+  auto Result = ES.lookup({&JD}, {Foo, Bar});
 
   EXPECT_TRUE(!!Result) << "Result should be a success value";
   EXPECT_EQ(Result->count(Foo), 1U) << "\"Foo\" entry missing";
@@ -864,7 +861,7 @@ TEST_F(CoreAPIsStandardTest, TestMaterializeWeakSymbol) {
 
   auto OnReady = [](Error Err) { cantFail(std::move(Err)); };
 
-  ES.lookup({{&JD, false}}, {Foo}, std::move(OnResolution), std::move(OnReady),
+  ES.lookup({&JD}, {Foo}, std::move(OnResolution), std::move(OnReady),
             NoDependenciesToRegister);
 
   auto MU2 = llvm::make_unique<SimpleMaterializationUnit>(
diff --git a/unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayerTest.cpp b/unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayerTest.cpp
index b6c362b8aaa..1660670ae63 100644
--- a/unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayerTest.cpp
+++ b/unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayerTest.cpp
@@ -66,7 +66,7 @@ static bool testSetProcessAllSections(std::unique_ptr<MemoryBuffer> Obj,
 
   ObjLayer.setProcessAllSections(ProcessAllSections);
   cantFail(ObjLayer.add(JD, std::move(Obj), ES.allocateVModule()));
-  ES.lookup({{&JD, false}}, {Foo}, OnResolveDoNothing, OnReadyDoNothing,
+  ES.lookup({&JD}, {Foo}, OnResolveDoNothing, OnReadyDoNothing,
             NoDependenciesToRegister);
   return DebugSectionSeen;
 }
@@ -157,8 +157,7 @@ TEST(RTDyldObjectLinkingLayerTest, TestOverrideObjectFlags) {
   ObjLayer.setOverrideObjectFlagsWithResponsibilityFlags(true);
 
   cantFail(CompileLayer.add(JD, std::move(M), ES.allocateVModule()));
-  ES.lookup({{&JD, false}}, {Foo},
-            [](Expected<SymbolMap> R) { cantFail(std::move(R)); },
+  ES.lookup({&JD}, {Foo}, [](Expected<SymbolMap> R) { cantFail(std::move(R)); },
             [](Error Err) { cantFail(std::move(Err)); },
             NoDependenciesToRegister);
 }
@@ -220,8 +219,7 @@ TEST(RTDyldObjectLinkingLayerTest, TestAutoClaimResponsibilityForSymbols) {
   ObjLayer.setAutoClaimResponsibilityForObjectSymbols(true);
 
   cantFail(CompileLayer.add(JD, std::move(M), ES.allocateVModule()));
-  ES.lookup({{&JD, false}}, {Foo},
-            [](Expected<SymbolMap> R) { cantFail(std::move(R)); },
+  ES.lookup({&JD}, {Foo}, [](Expected<SymbolMap> R) { cantFail(std::move(R)); },
             [](Error Err) { cantFail(std::move(Err)); },
             NoDependenciesToRegister);
 }
-- 
GitLab


From c2e4d07208a9a1bf5ea16af0b68756e1dbe48fbb Mon Sep 17 00:00:00 2001
From: Jordan Rupprecht <rupprecht@google.com>
Date: Tue, 23 Oct 2018 20:54:51 +0000
Subject: [PATCH 0470/1116] [llvm-objcopy] Fix use-after-move clang-tidy
 warning

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345079 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-objcopy/CopyConfig.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/llvm-objcopy/CopyConfig.cpp b/tools/llvm-objcopy/CopyConfig.cpp
index 9746110cd45..24c72de8882 100644
--- a/tools/llvm-objcopy/CopyConfig.cpp
+++ b/tools/llvm-objcopy/CopyConfig.cpp
@@ -345,8 +345,6 @@ DriverConfig parseObjcopyOptions(ArrayRef<const char *> ArgsArr) {
 
   Config.PreserveDates = InputArgs.hasArg(OBJCOPY_preserve_dates);
 
-  DriverConfig DC;
-  DC.CopyConfigs.push_back(std::move(Config));
   if (Config.DecompressDebugSections &&
       Config.CompressionType != DebugCompressionType::None) {
     error("Cannot specify --compress-debug-sections at the same time as "
@@ -356,6 +354,8 @@ DriverConfig parseObjcopyOptions(ArrayRef<const char *> ArgsArr) {
   if (Config.DecompressDebugSections && !zlib::isAvailable())
     error("LLVM was not compiled with LLVM_ENABLE_ZLIB: cannot decompress.");
 
+  DriverConfig DC;
+  DC.CopyConfigs.push_back(std::move(Config));
   return DC;
 }
 
-- 
GitLab


From 47fe3c0d7915103a5b262e2a55a281013b4167e5 Mon Sep 17 00:00:00 2001
From: Peter Collingbourne <peter@pcc.me.uk>
Date: Tue, 23 Oct 2018 21:23:18 +0000
Subject: [PATCH 0471/1116] CGP: Clear data structures at the end of a loop
 iteration instead of the beginning.

Clearing LargeOffsetGEPMap at the end fixes a bug where if a large
offset GEP is in a dead basic block, we fail an assertion when trying
to delete the block due to the asserting VH in LargeOffsetGEPMap.

Differential Revision: https://reviews.llvm.org/D53464

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345082 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/CodeGenPrepare.cpp                | 10 ++++-----
 .../Thumb2/unreachable-large-offset-gep.ll    | 22 +++++++++++++++++++
 2 files changed, 27 insertions(+), 5 deletions(-)
 create mode 100644 test/CodeGen/Thumb2/unreachable-large-offset-gep.ll

diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index fa5cc4dc969..6e73f7d773b 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -436,11 +436,6 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
   bool MadeChange = true;
   while (MadeChange) {
     MadeChange = false;
-    SeenChainsForSExt.clear();
-    ValToSExtendedUses.clear();
-    RemovedInsts.clear();
-    LargeOffsetGEPMap.clear();
-    LargeOffsetGEPID.clear();
     for (Function::iterator I = F.begin(); I != F.end(); ) {
       BasicBlock *BB = &*I++;
       bool ModifiedDTOnIteration = false;
@@ -460,6 +455,11 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
       I->deleteValue();
 
     EverMadeChange |= MadeChange;
+    SeenChainsForSExt.clear();
+    ValToSExtendedUses.clear();
+    RemovedInsts.clear();
+    LargeOffsetGEPMap.clear();
+    LargeOffsetGEPID.clear();
   }
 
   SunkAddrs.clear();
diff --git a/test/CodeGen/Thumb2/unreachable-large-offset-gep.ll b/test/CodeGen/Thumb2/unreachable-large-offset-gep.ll
new file mode 100644
index 00000000000..641787b0d7f
--- /dev/null
+++ b/test/CodeGen/Thumb2/unreachable-large-offset-gep.ll
@@ -0,0 +1,22 @@
+; RUN: llc -o - %s | FileCheck %s
+
+; CHECK: .LBB0_1:
+; CHECK: b .LBB0_1
+
+target triple = "thumbv8m-unknown-linux-android"
+
+define void @d(i32* %c) {
+entry:
+  br i1 false, label %f.exit, label %i.d
+
+i.d:
+  br label %i.d
+
+f.exit:
+  %0 = getelementptr i32, i32* %c, i32 57
+  br label %if.g
+
+if.g:
+  store i32 0, i32* %0
+  ret void
+}
-- 
GitLab


From f5961147fde0c7e1139a4c4c54ab133894fc56aa Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 23 Oct 2018 21:23:52 +0000
Subject: [PATCH 0472/1116] Fix typo in verifier error message

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345083 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/MachineVerifier.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp
index a19c2ef8002..b37c421596b 100644
--- a/lib/CodeGen/MachineVerifier.cpp
+++ b/lib/CodeGen/MachineVerifier.cpp
@@ -778,7 +778,7 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
                "isn't a terminator instruction!", MBB);
       }
       if (Cond.empty()) {
-        report("MBB exits via conditinal branch/branch but there's no "
+        report("MBB exits via conditional branch/branch but there's no "
                "condition!", MBB);
       }
     } else {
-- 
GitLab


From ed9fa1afb6dc49865d3376782fc53463e989950f Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Tue, 23 Oct 2018 21:51:44 +0000
Subject: [PATCH 0473/1116] [dwarfdump] Make incompatibility between -diff and
 -verbose explicit.

Using -diff and -verbose together doesn't work today. We should audit
where these two options interact and fix them. In the meantime we error
out when the user try to specify both.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345084 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/tools/llvm-dwarfdump/cmdline.test  |  3 +++
 tools/llvm-dwarfdump/llvm-dwarfdump.cpp | 10 +++++++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/test/tools/llvm-dwarfdump/cmdline.test b/test/tools/llvm-dwarfdump/cmdline.test
index 1314990a7b9..5b5ea618c1a 100644
--- a/test/tools/llvm-dwarfdump/cmdline.test
+++ b/test/tools/llvm-dwarfdump/cmdline.test
@@ -24,3 +24,6 @@ HELP-NOT: -reverse-iterate
 
 RUN: llvm-dwarfdump --version 2>&1 | FileCheck --check-prefix=VERSION %s
 VERSION: {{ version }}
+
+RUN: llvm-dwarfdump -diff -verbose 2>&1 | FileCheck --check-prefix=INCOMPATIBLE %s
+INCOMPATIBLE: error: incompatible arguments: specifying both -diff and -verbose is currently not supported
diff --git a/tools/llvm-dwarfdump/llvm-dwarfdump.cpp b/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
index af21a41a108..0ed86964089 100644
--- a/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
+++ b/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
@@ -226,7 +226,7 @@ static alias VerboseAlias("v", desc("Alias for -verbose."), aliasopt(Verbose),
 static void error(StringRef Prefix, std::error_code EC) {
   if (!EC)
     return;
-  errs() << Prefix << ": " << EC.message() << "\n";
+  WithColor::error() << Prefix << ": " << EC.message() << "\n";
   exit(1);
 }
 
@@ -571,6 +571,14 @@ int main(int argc, char **argv) {
     return 0;
   }
 
+  // FIXME: Audit interactions between these two options and make them
+  //        compatible.
+  if (Diff && Verbose) {
+    WithColor::error() << "incompatible arguments: specifying both -diff and "
+                          "-verbose is currently not supported";
+    return 0;
+  }
+
   std::unique_ptr<ToolOutputFile> OutputFile;
   if (!OutputFilename.empty()) {
     std::error_code EC;
-- 
GitLab


From 1bbd49ae6f129642054cc1e59ffee3b0b672b53e Mon Sep 17 00:00:00 2001
From: Zhizhou Yang <zhizhouyang@gmail.com>
Date: Tue, 23 Oct 2018 21:51:56 +0000
Subject: [PATCH 0474/1116] Print out DebugCounter info with
 -print-debug-counter

Summary:
This patch will print out {Counter, Skip, StopAfter} info of all passes which have DebugCounter set at destruction.

It can be used to monitor how many times does certain transformation happen in a pass, and also help check if -debug-counter option is set correctly.

Please refer to this [[ http://lists.llvm.org/pipermail/llvm-dev/2018-July/124722.html  | thread ]] for motivation.

Reviewers: george.burgess.iv, davide, greened

Reviewed By: greened

Subscribers: kristina, llozano, mgorny, llvm-commits, mgrang

Differential Revision: https://reviews.llvm.org/D50031

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345085 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Support/DebugCounter.h |  2 ++
 lib/Support/DebugCounter.cpp        | 25 ++++++++++++++++++++----
 test/Other/print-debug-counter.ll   | 30 +++++++++++++++++++++++++++++
 3 files changed, 53 insertions(+), 4 deletions(-)
 create mode 100644 test/Other/print-debug-counter.ll

diff --git a/include/llvm/Support/DebugCounter.h b/include/llvm/Support/DebugCounter.h
index 83bd5a06c94..6eadd5c6aef 100644
--- a/include/llvm/Support/DebugCounter.h
+++ b/include/llvm/Support/DebugCounter.h
@@ -55,6 +55,8 @@ namespace llvm {
 
 class DebugCounter {
 public:
+  ~DebugCounter();
+
   /// Returns a reference to the singleton instance.
   static DebugCounter &instance();
 
diff --git a/lib/Support/DebugCounter.cpp b/lib/Support/DebugCounter.cpp
index 9c8260dbe07..6598103658d 100644
--- a/lib/Support/DebugCounter.cpp
+++ b/lib/Support/DebugCounter.cpp
@@ -49,8 +49,18 @@ static DebugCounterList DebugCounterOption(
     cl::desc("Comma separated list of debug counter skip and count"),
     cl::CommaSeparated, cl::ZeroOrMore, cl::location(DebugCounter::instance()));
 
+static cl::opt<bool> PrintDebugCounter(
+    "print-debug-counter", cl::Hidden, cl::init(false), cl::Optional,
+    cl::desc("Print out debug counter info after all counters accumulated"));
+
 static ManagedStatic<DebugCounter> DC;
 
+// Print information when destroyed, iff command line option is specified.
+DebugCounter::~DebugCounter() {
+  if (isCountingEnabled() && PrintDebugCounter)
+    print(dbgs());
+}
+
 DebugCounter &DebugCounter::instance() { return *DC; }
 
 // This is called by the command line parser when it sees a value for the
@@ -107,11 +117,18 @@ void DebugCounter::push_back(const std::string &Val) {
 }
 
 void DebugCounter::print(raw_ostream &OS) const {
+  SmallVector<StringRef, 16> CounterNames(RegisteredCounters.begin(),
+                                          RegisteredCounters.end());
+  sort(CounterNames.begin(), CounterNames.end());
+
+  auto &Us = instance();
   OS << "Counters and values:\n";
-  for (const auto &KV : Counters)
-    OS << left_justify(RegisteredCounters[KV.first], 32) << ": {"
-       << KV.second.Count << "," << KV.second.Skip << ","
-       << KV.second.StopAfter << "}\n";
+  for (auto &CounterName : CounterNames) {
+    unsigned CounterID = getCounterId(CounterName);
+    OS << left_justify(RegisteredCounters[CounterID], 32) << ": {"
+       << Us.Counters[CounterID].Count << "," << Us.Counters[CounterID].Skip
+       << "," << Us.Counters[CounterID].StopAfter << "}\n";
+  }
 }
 
 LLVM_DUMP_METHOD void DebugCounter::dump() const {
diff --git a/test/Other/print-debug-counter.ll b/test/Other/print-debug-counter.ll
new file mode 100644
index 00000000000..ffd197197d8
--- /dev/null
+++ b/test/Other/print-debug-counter.ll
@@ -0,0 +1,30 @@
+; RUN: opt -S -debug-counter=early-cse-skip=1,early-cse-count=1 -early-cse \
+; RUN:        -debug-counter=newgvn-vn-skip=1,newgvn-vn-count=2 -newgvn \
+; RUN:        -instcombine -print-debug-counter < %s 2>&1 | FileCheck %s
+;; Test debug counter prints correct info in right order.
+; CHECK-LABEL: Counters and values:
+; CHECK:       early-cse
+; CHECK-SAME:  {4,1,1}
+; CHECK:       instcombine-visit
+; CHECK-SAME:  {12,0,-1}
+; CHECK:       newgvn-vn
+; CHECK-SAME:  {9,1,2}
+define i32 @f1(i32 %a, i32 %b) {
+bb:
+  %add1 = add i32 %a, %b
+  %add2 = add i32 %a, %b
+  %add3 = add i32 %a, %b
+  %add4 = add i32 %a, %b
+  %ret1 = add i32 %add1, %add2
+  %ret2 = add i32 %add3, %add4
+  %ret = add i32 %ret1, %ret2
+  ret i32 %ret
+}
+
+define i32 @f2(i32 %a, i32 %b) {
+bb:
+  %add1 = add i32 %a, %b
+  %add2 = add i32 %a, %b
+  %ret = add i32 %add1, %add2
+  ret i32 %ret
+}
-- 
GitLab


From 26f7b1bbdf545e3556750f9e6ef1c039c12b75e1 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Tue, 23 Oct 2018 21:58:49 +0000
Subject: [PATCH 0475/1116] [X86] Autogenerate comple checks. NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345087 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/crash-O0.ll | 50 ++++++++++++++++++++++++++++++++----
 1 file changed, 45 insertions(+), 5 deletions(-)

diff --git a/test/CodeGen/X86/crash-O0.ll b/test/CodeGen/X86/crash-O0.ll
index dab15c19c69..1a234d45cb2 100644
--- a/test/CodeGen/X86/crash-O0.ll
+++ b/test/CodeGen/X86/crash-O0.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -O0 -relocation-model=pic -disable-fp-elim < %s | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-darwin10"
@@ -9,6 +10,35 @@ target triple = "x86_64-apple-darwin10"
 ; aliased registers (AX and AL) - RegAllocFast does not like that.
 ; PR7312
 define i32 @div8() nounwind {
+; CHECK-LABEL: div8:
+; CHECK:       ## %bb.0: ## %entry
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    movq %rsp, %rbp
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    movb %al, %cl
+; CHECK-NEXT:    ## implicit-def: $rdx
+; CHECK-NEXT:    movb %dl, %sil
+; CHECK-NEXT:    movzbw %cl, %ax
+; CHECK-NEXT:    divb %sil
+; CHECK-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) ## 1-byte Spill
+; CHECK-NEXT:    movzbw %cl, %ax
+; CHECK-NEXT:    divb %sil
+; CHECK-NEXT:    shrw $8, %ax
+; CHECK-NEXT:    movb %al, %cl
+; CHECK-NEXT:    cmpb %sil, %cl
+; CHECK-NEXT:    jae LBB0_2
+; CHECK-NEXT:  ## %bb.1: ## %"39"
+; CHECK-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al ## 1-byte Reload
+; CHECK-NEXT:    movzbl %al, %ecx
+; CHECK-NEXT:    ## implicit-def: $edx
+; CHECK-NEXT:    imull %edx, %ecx
+; CHECK-NEXT:    addl %edx, %ecx
+; CHECK-NEXT:    cmpl %edx, %ecx
+; CHECK-NEXT:    je LBB0_3
+; CHECK-NEXT:  LBB0_2: ## %"40"
+; CHECK-NEXT:    ud2
+; CHECK-NEXT:  LBB0_3: ## %"41"
+; CHECK-NEXT:    ud2
 entry:
   %0 = trunc i64 undef to i8                      ; <i8> [#uses=3]
   %1 = udiv i8 0, %0                              ; <i8> [#uses=1]
@@ -38,12 +68,22 @@ entry:
 ; An instruction gets between CQO and DIV64 because the load is folded
 ; into the division but it requires a sign extension.
 ; PR21700
-; CHECK-LABEL: addressModeWith32bitIndex:
-; CHECK: cqto
-; CHECK-NEXT: movslq
-; CHECK-NEXT: idivq
-; CHECK: retq
 define i64 @addressModeWith32bitIndex(i32 %V) {
+; CHECK-LABEL: addressModeWith32bitIndex:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movq %rsp, %rbp
+; CHECK-NEXT:    .cfi_def_cfa_register %rbp
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    movl %eax, %ecx
+; CHECK-NEXT:    movq %rcx, %rax
+; CHECK-NEXT:    cqto
+; CHECK-NEXT:    movslq %edi, %rsi
+; CHECK-NEXT:    idivq (%rcx,%rsi,8)
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    retq
   %gep = getelementptr i64, i64* null, i32 %V
   %load = load i64, i64* %gep
   %sdiv = sdiv i64 0, %load
-- 
GitLab


From c8b72a0513501940175e56c0ebdd490eecc790e4 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Tue, 23 Oct 2018 22:04:33 +0000
Subject: [PATCH 0476/1116] Fix test after r345085

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345089 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Other/print-debug-counter.ll | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test/Other/print-debug-counter.ll b/test/Other/print-debug-counter.ll
index ffd197197d8..35fc1c121e2 100644
--- a/test/Other/print-debug-counter.ll
+++ b/test/Other/print-debug-counter.ll
@@ -1,3 +1,5 @@
+; REQUIRES: debug
+
 ; RUN: opt -S -debug-counter=early-cse-skip=1,early-cse-count=1 -early-cse \
 ; RUN:        -debug-counter=newgvn-vn-skip=1,newgvn-vn-count=2 -newgvn \
 ; RUN:        -instcombine -print-debug-counter < %s 2>&1 | FileCheck %s
-- 
GitLab


From eec87eef4178f37b806484a71fcbcb28d7aeb3ac Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Tue, 23 Oct 2018 22:07:34 +0000
Subject: [PATCH 0477/1116] Actually fix test from r345085 REQUIRE: asserts

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345090 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Other/print-debug-counter.ll | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/Other/print-debug-counter.ll b/test/Other/print-debug-counter.ll
index 35fc1c121e2..3647f39026d 100644
--- a/test/Other/print-debug-counter.ll
+++ b/test/Other/print-debug-counter.ll
@@ -1,4 +1,4 @@
-; REQUIRES: debug
+; REQUIRES: asserts
 
 ; RUN: opt -S -debug-counter=early-cse-skip=1,early-cse-count=1 -early-cse \
 ; RUN:        -debug-counter=newgvn-vn-skip=1,newgvn-vn-count=2 -newgvn \
-- 
GitLab


From fe08d6738febbb037a66ae547745756367a167c1 Mon Sep 17 00:00:00 2001
From: Teresa Johnson <tejohnson@google.com>
Date: Tue, 23 Oct 2018 22:57:21 +0000
Subject: [PATCH 0478/1116] [ThinLTO] Fix a crash in lazy loading of Metadata

Summary:
This is a revised version of D41474.

When the debug location is parsed in BitcodeReader::parseFunction, the
scope and inlinedAt MDNodes are obtained via MDLoader->getMDNodeFwdRefOrNull(),
which will create a forward ref if they were not yet loaded.
Specifically, if one of these MDNodes is in the module level metadata
block, and this is during ThinLTO importing, that metadata block is
lazily loaded.

Most places in that invoke getMDNodeFwdRefOrNull have a corresponding call
to resolveForwardRefsAndPlaceholders which will take care of resolving them.
E.g. places that call getMetadataFwdRefOrLoad, or at the end of parsing a
function-level metadata block, or at the end of the initial lazy load of
module level metadata in order to handle invocations of getMDNodeFwdRefOrNull
for named metadata and global object attachments. However, the calls for
the scope/inlinedAt of debug locations are not backed by any such call to
resolveForwardRefsAndPlaceholders.

To fix this, change the scope and inlinedAt parsing to instead use
getMetadataFwdRefOrLoad, which will ensure the forward refs to lazily
loaded metadata are resolved.

Fixes PR35472.

Reviewers: dexonsmith, Sunil_Srivastava, vsk

Subscribers: inglorion, eraman, steven_wu, sebpop, mehdi_amini, dmikulin, vsk, hiraditya, llvm-commits

Differential Revision: https://reviews.llvm.org/D53596

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345095 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Bitcode/Reader/BitcodeReader.cpp |   6 +-
 test/ThinLTO/X86/Inputs/pr35472.ll   |  13 +++
 test/ThinLTO/X86/pr35472.ll          | 122 +++++++++++++++++++++++++++
 3 files changed, 139 insertions(+), 2 deletions(-)
 create mode 100644 test/ThinLTO/X86/Inputs/pr35472.ll
 create mode 100644 test/ThinLTO/X86/pr35472.ll

diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp
index aa83955e646..1c4772968e7 100644
--- a/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -3520,12 +3520,14 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
 
       MDNode *Scope = nullptr, *IA = nullptr;
       if (ScopeID) {
-        Scope = MDLoader->getMDNodeFwdRefOrNull(ScopeID - 1);
+        Scope = dyn_cast_or_null<MDNode>(
+            MDLoader->getMetadataFwdRefOrLoad(ScopeID - 1));
         if (!Scope)
           return error("Invalid record");
       }
       if (IAID) {
-        IA = MDLoader->getMDNodeFwdRefOrNull(IAID - 1);
+        IA = dyn_cast_or_null<MDNode>(
+            MDLoader->getMetadataFwdRefOrLoad(IAID - 1));
         if (!IA)
           return error("Invalid record");
       }
diff --git a/test/ThinLTO/X86/Inputs/pr35472.ll b/test/ThinLTO/X86/Inputs/pr35472.ll
new file mode 100644
index 00000000000..b9c92b3e3fb
--- /dev/null
+++ b/test/ThinLTO/X86/Inputs/pr35472.ll
@@ -0,0 +1,13 @@
+; ModuleID = 'b.cpp'
+source_filename = "b.cpp"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: noinline optnone uwtable
+define void @_Z5Alphav() {
+entry:
+  call void @_Z5Bravov()
+  ret void
+}
+
+declare void @_Z5Bravov()
diff --git a/test/ThinLTO/X86/pr35472.ll b/test/ThinLTO/X86/pr35472.ll
new file mode 100644
index 00000000000..9173f92063a
--- /dev/null
+++ b/test/ThinLTO/X86/pr35472.ll
@@ -0,0 +1,122 @@
+; Test to make sure that lazily loaded debug location scope metadata is
+; handled properly. Note that we need to have the DILexicalScope !34
+; referenced from multiple function's debug locs for this to be in the
+; lazily loaded module level metadata block.
+
+; RUN: opt -module-hash -module-summary %s -o %t1.bc
+; RUN: opt -module-hash -module-summary %p/Inputs/pr35472.ll -o %t2.bc
+; RUN: llvm-lto -thinlto-action=run %t1.bc %t2.bc
+; RUN: llvm-nm %t1.bc.thinlto.o | FileCheck %s -check-prefix=ThinLTOa
+; RUN: llvm-nm %t2.bc.thinlto.o | FileCheck %s -check-prefix=ThinLTOb
+
+; ThinLTOa-DAG: T _Z5Bravov
+; ThinLTOa-DAG: W _ZN4EchoD2Ev
+; ThinLTOb-DAG: T _Z5Alphav
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.Delta = type { %struct.Charlie }
+%struct.Charlie = type { i32 }
+%struct.Echo = type { %struct.Charlie }
+
+$_ZN4EchoD2Ev = comdat any
+$_ZN5DeltaD2Ev = comdat any
+
+define void @_Z5Bravov() !dbg !7 {
+  %Hotel = alloca %struct.Delta, align 4
+  %India = alloca %struct.Echo, align 4
+  call void @llvm.dbg.declare(metadata %struct.Delta* %Hotel, metadata !10, metadata !DIExpression()), !dbg !22
+  call void @_ZN4EchoD2Ev(%struct.Echo* %India), !dbg !28
+  ret void, !dbg !28
+}
+
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
+
+define linkonce_odr void @_ZN4EchoD2Ev(%struct.Echo* %this) unnamed_addr comdat align 2 {
+  %this.addr.i = alloca %struct.Charlie*, align 8
+  call void @llvm.dbg.declare(metadata %struct.Charlie** %this.addr.i, metadata !29, metadata !DIExpression()), !dbg !32
+  %this1.i = load %struct.Charlie*, %struct.Charlie** %this.addr.i, align 8
+  %Golf.i = getelementptr inbounds %struct.Charlie, %struct.Charlie* %this1.i, i32 0, i32 0, !dbg !33
+  ret void
+}
+
+define linkonce_odr void @_ZN5DeltaD2Ev(%struct.Delta* %this) unnamed_addr comdat align 2 !dbg !36 {
+  %this.addr.i = alloca %struct.Charlie*, align 8
+  call void @llvm.dbg.declare(metadata %struct.Charlie** %this.addr.i, metadata !29, metadata !DIExpression()), !dbg !41
+  %this1.i = load %struct.Charlie*, %struct.Charlie** %this.addr.i, align 8
+  %Golf.i = getelementptr inbounds %struct.Charlie, %struct.Charlie* %this1.i, i32 0, i32 0, !dbg !48
+  ret void
+}
+
+!llvm.module.flags = !{!3, !4, !5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 6.0.0 (trunk 321056)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "a.cpp", directory: "/home/sunil/185335/302")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!7 = distinct !DISubprogram(name: "Bravo", linkageName: "_Z5Bravov", scope: !1, file: !1, line: 17, type: !8, isLocal: false, isDefinition: true, scopeLine: 17, flags: DIFlagPrototyped, isOptimized: false, unit: !0)
+!8 = !DISubroutineType(types: !9)
+!9 = !{null}
+!10 = !DILocalVariable(name: "Hotel", scope: !7, file: !1, line: 18, type: !11)
+!11 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Delta", file: !1, line: 6, size: 32, elements: !12, identifier: "_ZTS5Delta")
+!12 = !{!13}
+!13 = !DIDerivedType(tag: DW_TAG_member, name: "Foxtrot", scope: !11, file: !1, line: 7, baseType: !14, size: 32)
+!14 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Charlie", file: !1, line: 1, size: 32, elements: !15, identifier: "_ZTS7Charlie")
+!15 = !{!16, !18}
+!16 = !DIDerivedType(tag: DW_TAG_member, name: "Golf", scope: !14, file: !1, line: 3, baseType: !17, size: 32)
+!17 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!18 = !DISubprogram(name: "~Charlie", scope: !14, file: !1, line: 2, type: !19, isLocal: false, isDefinition: false, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: false)
+!19 = !DISubroutineType(types: !20)
+!20 = !{null, !21}
+!21 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !14, size: 64, flags: DIFlagArtificial | DIFlagObjectPointer)
+!22 = !DILocation(line: 18, column: 11, scope: !7)
+!24 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Echo", file: !1, line: 10, size: 32, elements: !25, identifier: "_ZTS4Echo")
+!25 = !{!26}
+!26 = !DIDerivedType(tag: DW_TAG_member, name: "Foxtrot", scope: !24, file: !1, line: 11, baseType: !14, size: 32)
+!28 = !DILocation(line: 20, column: 1, scope: !7)
+!29 = !DILocalVariable(name: "this", arg: 1, scope: !30, type: !31, flags: DIFlagArtificial | DIFlagObjectPointer)
+!30 = distinct !DISubprogram(name: "~Charlie", linkageName: "_ZN7CharlieD2Ev", scope: !14, file: !1, line: 2, type: !19, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: false, unit: !0, declaration: !18)
+!31 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !14, size: 64)
+!32 = !DILocation(line: 0, scope: !30)
+!33 = !DILocation(line: 2, column: 53, scope: !34)
+!34 = distinct !DILexicalBlock(scope: !30, file: !1, line: 2, column: 51)
+!36 = distinct !DISubprogram(name: "~Delta", linkageName: "_ZN5DeltaD2Ev", scope: !11, file: !1, line: 6, type: !37, isLocal: false, isDefinition: true, scopeLine: 6, flags: DIFlagArtificial | DIFlagPrototyped, isOptimized: false, unit: !0, declaration: !40)
+!37 = !DISubroutineType(types: !38)
+!38 = !{null, !39}
+!39 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 64, flags: DIFlagArtificial | DIFlagObjectPointer)
+!40 = !DISubprogram(name: "~Delta", scope: !11, type: !37, isLocal: false, isDefinition: false, flags: DIFlagArtificial | DIFlagPrototyped, isOptimized: false)
+!41 = !DILocation(line: 0, scope: !30, inlinedAt: !42)
+!42 = distinct !DILocation(line: 6, column: 8, scope: !43)
+!43 = distinct !DILexicalBlock(scope: !36, file: !1, line: 6, column: 8)
+!48 = !DILocation(line: 2, column: 53, scope: !34, inlinedAt: !42)
+
+;----------------------------------------------------------------------------------------------
+; Compiled from following two source files with 'clang++ -S --std=c++11 -O0 -g -flto=thin' 
+; struct Charlie {
+;     __attribute__((__always_inline__)) ~Charlie() { Golf = 0; }
+;     int Golf;
+; };
+; 
+; struct Delta {
+;     Charlie Foxtrot;
+; };
+; 
+; struct Echo {
+;     Charlie Foxtrot;
+;     __attribute__((nodebug)) ~Echo() = default;
+; };
+; 
+; extern void Bravo();
+; 
+; void Bravo() {
+;     Delta Hotel;
+;     Echo India;
+; }
+; -----------------------------
+; extern void Bravo();
+; extern void Alpha();
+; void Alpha() { Bravo(); }
+
-- 
GitLab


From 70fc21b85b45d83dd10cd3c2fd4cee317ea6a40c Mon Sep 17 00:00:00 2001
From: Teresa Johnson <tejohnson@google.com>
Date: Tue, 23 Oct 2018 22:57:40 +0000
Subject: [PATCH 0479/1116] [hot-cold-split] Only perform splitting in ThinLTO
 backend post-link

Summary:
Fix the new PM to only perform hot cold splitting once during ThinLTO,
by skipping it in the pre-link phase.

This was already fixed in the old PM by the move of the hot cold split
pass later (after the early return when PrepareForThinLTO) by r344869.

Reviewers: vsk, sebpop, hiraditya

Subscribers: mehdi_amini, inglorion, eraman, steven_wu, dexonsmith, llvm-commits

Differential Revision: https://reviews.llvm.org/D53611

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345096 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Passes/PassBuilder.cpp            | 4 +++-
 test/Other/new-pm-thinlto-defaults.ll | 4 ++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/lib/Passes/PassBuilder.cpp b/lib/Passes/PassBuilder.cpp
index 8b333b7f8a6..90561b05e62 100644
--- a/lib/Passes/PassBuilder.cpp
+++ b/lib/Passes/PassBuilder.cpp
@@ -710,7 +710,9 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
   MainCGPipeline.addPass(createCGSCCToFunctionPassAdaptor(
       buildFunctionSimplificationPipeline(Level, Phase, DebugLogging)));
 
-  if (EnableHotColdSplit)
+  // We only want to do hot cold splitting once for ThinLTO, during the
+  // post-link ThinLTO.
+  if (EnableHotColdSplit && Phase != ThinLTOPhase::PreLink)
     MPM.addPass(HotColdSplittingPass());
 
   for (auto &C : CGSCCOptimizerLateEPCallbacks)
diff --git a/test/Other/new-pm-thinlto-defaults.ll b/test/Other/new-pm-thinlto-defaults.ll
index 001e3eeeb96..c68aa1d05aa 100644
--- a/test/Other/new-pm-thinlto-defaults.ll
+++ b/test/Other/new-pm-thinlto-defaults.ll
@@ -26,6 +26,10 @@
 ; RUN: opt -disable-verify -debug-pass-manager -new-pm-debug-info-for-profiling \
 ; RUN:     -passes='thinlto-pre-link<O2>,name-anon-globals' -S  %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefixes=CHECK-DIS,CHECK-O,CHECK-O2,CHECK-PRELINK-O,CHECK-PRELINK-O2
+; Enabling the hot-cold-split pass should not affect the ThinLTO pre-link
+; RUN: opt -disable-verify -debug-pass-manager \
+; RUN:     -passes='thinlto-pre-link<O2>,name-anon-globals' -hot-cold-split -S  %s 2>&1 \
+; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-O2,CHECK-PRELINK-O,CHECK-PRELINK-O-NODIS,CHECK-PRELINK-O2
 ;
 ; Postlink pipelines:
 ; RUN: opt -disable-verify -debug-pass-manager \
-- 
GitLab


From 2a04af10be8a35388d6d80cce7539f1c76931328 Mon Sep 17 00:00:00 2001
From: Teresa Johnson <tejohnson@google.com>
Date: Tue, 23 Oct 2018 23:00:29 +0000
Subject: [PATCH 0480/1116] Revert "[ThinLTO] Fix a crash in lazy loading of
 Metadata"

This reverts commit r345095. It was accidentally committed.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345097 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Bitcode/Reader/BitcodeReader.cpp |   6 +-
 test/ThinLTO/X86/Inputs/pr35472.ll   |  13 ---
 test/ThinLTO/X86/pr35472.ll          | 122 ---------------------------
 3 files changed, 2 insertions(+), 139 deletions(-)
 delete mode 100644 test/ThinLTO/X86/Inputs/pr35472.ll
 delete mode 100644 test/ThinLTO/X86/pr35472.ll

diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp
index 1c4772968e7..aa83955e646 100644
--- a/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -3520,14 +3520,12 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
 
       MDNode *Scope = nullptr, *IA = nullptr;
       if (ScopeID) {
-        Scope = dyn_cast_or_null<MDNode>(
-            MDLoader->getMetadataFwdRefOrLoad(ScopeID - 1));
+        Scope = MDLoader->getMDNodeFwdRefOrNull(ScopeID - 1);
         if (!Scope)
           return error("Invalid record");
       }
       if (IAID) {
-        IA = dyn_cast_or_null<MDNode>(
-            MDLoader->getMetadataFwdRefOrLoad(IAID - 1));
+        IA = MDLoader->getMDNodeFwdRefOrNull(IAID - 1);
         if (!IA)
           return error("Invalid record");
       }
diff --git a/test/ThinLTO/X86/Inputs/pr35472.ll b/test/ThinLTO/X86/Inputs/pr35472.ll
deleted file mode 100644
index b9c92b3e3fb..00000000000
--- a/test/ThinLTO/X86/Inputs/pr35472.ll
+++ /dev/null
@@ -1,13 +0,0 @@
-; ModuleID = 'b.cpp'
-source_filename = "b.cpp"
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-; Function Attrs: noinline optnone uwtable
-define void @_Z5Alphav() {
-entry:
-  call void @_Z5Bravov()
-  ret void
-}
-
-declare void @_Z5Bravov()
diff --git a/test/ThinLTO/X86/pr35472.ll b/test/ThinLTO/X86/pr35472.ll
deleted file mode 100644
index 9173f92063a..00000000000
--- a/test/ThinLTO/X86/pr35472.ll
+++ /dev/null
@@ -1,122 +0,0 @@
-; Test to make sure that lazily loaded debug location scope metadata is
-; handled properly. Note that we need to have the DILexicalScope !34
-; referenced from multiple function's debug locs for this to be in the
-; lazily loaded module level metadata block.
-
-; RUN: opt -module-hash -module-summary %s -o %t1.bc
-; RUN: opt -module-hash -module-summary %p/Inputs/pr35472.ll -o %t2.bc
-; RUN: llvm-lto -thinlto-action=run %t1.bc %t2.bc
-; RUN: llvm-nm %t1.bc.thinlto.o | FileCheck %s -check-prefix=ThinLTOa
-; RUN: llvm-nm %t2.bc.thinlto.o | FileCheck %s -check-prefix=ThinLTOb
-
-; ThinLTOa-DAG: T _Z5Bravov
-; ThinLTOa-DAG: W _ZN4EchoD2Ev
-; ThinLTOb-DAG: T _Z5Alphav
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-%struct.Delta = type { %struct.Charlie }
-%struct.Charlie = type { i32 }
-%struct.Echo = type { %struct.Charlie }
-
-$_ZN4EchoD2Ev = comdat any
-$_ZN5DeltaD2Ev = comdat any
-
-define void @_Z5Bravov() !dbg !7 {
-  %Hotel = alloca %struct.Delta, align 4
-  %India = alloca %struct.Echo, align 4
-  call void @llvm.dbg.declare(metadata %struct.Delta* %Hotel, metadata !10, metadata !DIExpression()), !dbg !22
-  call void @_ZN4EchoD2Ev(%struct.Echo* %India), !dbg !28
-  ret void, !dbg !28
-}
-
-declare void @llvm.dbg.declare(metadata, metadata, metadata)
-
-define linkonce_odr void @_ZN4EchoD2Ev(%struct.Echo* %this) unnamed_addr comdat align 2 {
-  %this.addr.i = alloca %struct.Charlie*, align 8
-  call void @llvm.dbg.declare(metadata %struct.Charlie** %this.addr.i, metadata !29, metadata !DIExpression()), !dbg !32
-  %this1.i = load %struct.Charlie*, %struct.Charlie** %this.addr.i, align 8
-  %Golf.i = getelementptr inbounds %struct.Charlie, %struct.Charlie* %this1.i, i32 0, i32 0, !dbg !33
-  ret void
-}
-
-define linkonce_odr void @_ZN5DeltaD2Ev(%struct.Delta* %this) unnamed_addr comdat align 2 !dbg !36 {
-  %this.addr.i = alloca %struct.Charlie*, align 8
-  call void @llvm.dbg.declare(metadata %struct.Charlie** %this.addr.i, metadata !29, metadata !DIExpression()), !dbg !41
-  %this1.i = load %struct.Charlie*, %struct.Charlie** %this.addr.i, align 8
-  %Golf.i = getelementptr inbounds %struct.Charlie, %struct.Charlie* %this1.i, i32 0, i32 0, !dbg !48
-  ret void
-}
-
-!llvm.module.flags = !{!3, !4, !5}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 6.0.0 (trunk 321056)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
-!1 = !DIFile(filename: "a.cpp", directory: "/home/sunil/185335/302")
-!2 = !{}
-!3 = !{i32 2, !"Dwarf Version", i32 4}
-!4 = !{i32 2, !"Debug Info Version", i32 3}
-!5 = !{i32 1, !"wchar_size", i32 4}
-!7 = distinct !DISubprogram(name: "Bravo", linkageName: "_Z5Bravov", scope: !1, file: !1, line: 17, type: !8, isLocal: false, isDefinition: true, scopeLine: 17, flags: DIFlagPrototyped, isOptimized: false, unit: !0)
-!8 = !DISubroutineType(types: !9)
-!9 = !{null}
-!10 = !DILocalVariable(name: "Hotel", scope: !7, file: !1, line: 18, type: !11)
-!11 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Delta", file: !1, line: 6, size: 32, elements: !12, identifier: "_ZTS5Delta")
-!12 = !{!13}
-!13 = !DIDerivedType(tag: DW_TAG_member, name: "Foxtrot", scope: !11, file: !1, line: 7, baseType: !14, size: 32)
-!14 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Charlie", file: !1, line: 1, size: 32, elements: !15, identifier: "_ZTS7Charlie")
-!15 = !{!16, !18}
-!16 = !DIDerivedType(tag: DW_TAG_member, name: "Golf", scope: !14, file: !1, line: 3, baseType: !17, size: 32)
-!17 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
-!18 = !DISubprogram(name: "~Charlie", scope: !14, file: !1, line: 2, type: !19, isLocal: false, isDefinition: false, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: false)
-!19 = !DISubroutineType(types: !20)
-!20 = !{null, !21}
-!21 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !14, size: 64, flags: DIFlagArtificial | DIFlagObjectPointer)
-!22 = !DILocation(line: 18, column: 11, scope: !7)
-!24 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Echo", file: !1, line: 10, size: 32, elements: !25, identifier: "_ZTS4Echo")
-!25 = !{!26}
-!26 = !DIDerivedType(tag: DW_TAG_member, name: "Foxtrot", scope: !24, file: !1, line: 11, baseType: !14, size: 32)
-!28 = !DILocation(line: 20, column: 1, scope: !7)
-!29 = !DILocalVariable(name: "this", arg: 1, scope: !30, type: !31, flags: DIFlagArtificial | DIFlagObjectPointer)
-!30 = distinct !DISubprogram(name: "~Charlie", linkageName: "_ZN7CharlieD2Ev", scope: !14, file: !1, line: 2, type: !19, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: false, unit: !0, declaration: !18)
-!31 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !14, size: 64)
-!32 = !DILocation(line: 0, scope: !30)
-!33 = !DILocation(line: 2, column: 53, scope: !34)
-!34 = distinct !DILexicalBlock(scope: !30, file: !1, line: 2, column: 51)
-!36 = distinct !DISubprogram(name: "~Delta", linkageName: "_ZN5DeltaD2Ev", scope: !11, file: !1, line: 6, type: !37, isLocal: false, isDefinition: true, scopeLine: 6, flags: DIFlagArtificial | DIFlagPrototyped, isOptimized: false, unit: !0, declaration: !40)
-!37 = !DISubroutineType(types: !38)
-!38 = !{null, !39}
-!39 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 64, flags: DIFlagArtificial | DIFlagObjectPointer)
-!40 = !DISubprogram(name: "~Delta", scope: !11, type: !37, isLocal: false, isDefinition: false, flags: DIFlagArtificial | DIFlagPrototyped, isOptimized: false)
-!41 = !DILocation(line: 0, scope: !30, inlinedAt: !42)
-!42 = distinct !DILocation(line: 6, column: 8, scope: !43)
-!43 = distinct !DILexicalBlock(scope: !36, file: !1, line: 6, column: 8)
-!48 = !DILocation(line: 2, column: 53, scope: !34, inlinedAt: !42)
-
-;----------------------------------------------------------------------------------------------
-; Compiled from following two source files with 'clang++ -S --std=c++11 -O0 -g -flto=thin' 
-; struct Charlie {
-;     __attribute__((__always_inline__)) ~Charlie() { Golf = 0; }
-;     int Golf;
-; };
-; 
-; struct Delta {
-;     Charlie Foxtrot;
-; };
-; 
-; struct Echo {
-;     Charlie Foxtrot;
-;     __attribute__((nodebug)) ~Echo() = default;
-; };
-; 
-; extern void Bravo();
-; 
-; void Bravo() {
-;     Delta Hotel;
-;     Echo India;
-; }
-; -----------------------------
-; extern void Bravo();
-; extern void Alpha();
-; void Alpha() { Bravo(); }
-
-- 
GitLab


From 8b9cbda1b3249b6ec40f8495a13876159ebfef7d Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Tue, 23 Oct 2018 23:01:39 +0000
Subject: [PATCH 0481/1116] [ORC] Re-apply r345077 with fixes to remove
 ambiguity in lookup calls.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345098 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/ExecutionEngine/JITSymbol.h      |  12 ++
 include/llvm/ExecutionEngine/Orc/Core.h       |  99 ++++++------
 include/llvm/ExecutionEngine/Orc/LLJIT.h      |   2 +-
 .../Orc/CompileOnDemandLayer.cpp              |  17 +-
 lib/ExecutionEngine/Orc/Core.cpp              | 152 ++++++++++--------
 lib/ExecutionEngine/Orc/ExecutionUtils.cpp    |   4 +-
 lib/ExecutionEngine/Orc/IndirectionUtils.cpp  |   2 +-
 lib/ExecutionEngine/Orc/LLJIT.cpp             |   8 +-
 lib/ExecutionEngine/Orc/LazyReexports.cpp     |   4 +-
 .../Orc/RTDyldObjectLinkingLayer.cpp          |   9 +-
 .../OrcLazy/Inputs/hidden-definitions.ll      |   6 +
 .../OrcLazy/hidden-visibility.ll              |  17 ++
 tools/lli/lli.cpp                             |  46 +++++-
 .../ExecutionEngine/Orc/CoreAPIsTest.cpp      |  72 +++++----
 .../Orc/RTDyldObjectLinkingLayerTest.cpp      |  10 +-
 15 files changed, 289 insertions(+), 171 deletions(-)
 create mode 100644 test/ExecutionEngine/OrcLazy/Inputs/hidden-definitions.ll
 create mode 100644 test/ExecutionEngine/OrcLazy/hidden-visibility.ll

diff --git a/include/llvm/ExecutionEngine/JITSymbol.h b/include/llvm/ExecutionEngine/JITSymbol.h
index 18b972ed829..05c9590726d 100644
--- a/include/llvm/ExecutionEngine/JITSymbol.h
+++ b/include/llvm/ExecutionEngine/JITSymbol.h
@@ -40,6 +40,18 @@ class SymbolRef;
 /// Represents an address in the target process's address space.
 using JITTargetAddress = uint64_t;
 
+/// Convert a JITTargetAddress to a pointer.
+template <typename T> T jitTargetAddressToPointer(JITTargetAddress Addr) {
+  static_assert(std::is_pointer<T>::value, "T must be a pointer type");
+  uintptr_t IntPtr = static_cast<uintptr_t>(Addr);
+  assert(IntPtr == Addr && "JITTargetAddress value out of range for uintptr_t");
+  return reinterpret_cast<T>(IntPtr);
+}
+
+template <typename T> JITTargetAddress pointerToJITTargetAddress(T *Ptr) {
+  return static_cast<JITTargetAddress>(reinterpret_cast<uintptr_t>(Ptr));
+}
+
 /// Flags for symbols in the JIT.
 class JITSymbolFlags {
 public:
diff --git a/include/llvm/ExecutionEngine/Orc/Core.h b/include/llvm/ExecutionEngine/Orc/Core.h
index 2e56854340c..39d306e0bd4 100644
--- a/include/llvm/ExecutionEngine/Orc/Core.h
+++ b/include/llvm/ExecutionEngine/Orc/Core.h
@@ -54,8 +54,8 @@ using SymbolFlagsMap = DenseMap<SymbolStringPtr, JITSymbolFlags>;
 ///        symbols to be obtained for logging.
 using SymbolDependenceMap = DenseMap<JITDylib *, SymbolNameSet>;
 
-/// A list of JITDylib pointers.
-using JITDylibList = std::vector<JITDylib *>;
+/// A list of (JITDylib*, bool) pairs.
+using JITDylibSearchList = std::vector<std::pair<JITDylib *, bool>>;
 
 /// Render a SymbolStringPtr.
 raw_ostream &operator<<(raw_ostream &OS, const SymbolStringPtr &Sym);
@@ -85,8 +85,8 @@ raw_ostream &operator<<(raw_ostream &OS, const SymbolDependenceMap &Deps);
 /// Render a MaterializationUnit.
 raw_ostream &operator<<(raw_ostream &OS, const MaterializationUnit &MU);
 
-/// Render a JITDylibList.
-raw_ostream &operator<<(raw_ostream &OS, const JITDylibList &JDs);
+/// Render a JITDylibSearchList.
+raw_ostream &operator<<(raw_ostream &OS, const JITDylibSearchList &JDs);
 
 /// Callback to notify client that symbols have been resolved.
 using SymbolsResolvedCallback = std::function<void(Expected<SymbolMap>)>;
@@ -351,14 +351,15 @@ using SymbolAliasMap = DenseMap<SymbolStringPtr, SymbolAliasMapEntry>;
 class ReExportsMaterializationUnit : public MaterializationUnit {
 public:
   /// SourceJD is allowed to be nullptr, in which case the source JITDylib is
-  /// taken to be whatever JITDylib these definitions are materialized in. This
-  /// is useful for defining aliases within a JITDylib.
+  /// taken to be whatever JITDylib these definitions are materialized in (and
+  /// MatchNonExported has no effect). This is useful for defining aliases
+  /// within a JITDylib.
   ///
   /// Note: Care must be taken that no sets of aliases form a cycle, as such
   ///       a cycle will result in a deadlock when any symbol in the cycle is
   ///       resolved.
-  ReExportsMaterializationUnit(JITDylib *SourceJD, SymbolAliasMap Aliases,
-                               VModuleKey K);
+  ReExportsMaterializationUnit(JITDylib *SourceJD, bool MatchNonExported,
+                               SymbolAliasMap Aliases, VModuleKey K);
 
   StringRef getName() const override;
 
@@ -368,6 +369,7 @@ private:
   static SymbolFlagsMap extractFlags(const SymbolAliasMap &Aliases);
 
   JITDylib *SourceJD = nullptr;
+  bool MatchNonExported = false;
   SymbolAliasMap Aliases;
 };
 
@@ -385,16 +387,19 @@ private:
 inline std::unique_ptr<ReExportsMaterializationUnit>
 symbolAliases(SymbolAliasMap Aliases, VModuleKey K = VModuleKey()) {
   return llvm::make_unique<ReExportsMaterializationUnit>(
-      nullptr, std::move(Aliases), std::move(K));
+      nullptr, true, std::move(Aliases), std::move(K));
 }
 
 /// Create a materialization unit for re-exporting symbols from another JITDylib
 /// with alternative names/flags.
+/// If MatchNonExported is true then non-exported symbols from SourceJD can be
+/// re-exported. If it is false, attempts to re-export a non-exported symbol
+/// will result in a "symbol not found" error.
 inline std::unique_ptr<ReExportsMaterializationUnit>
 reexports(JITDylib &SourceJD, SymbolAliasMap Aliases,
-          VModuleKey K = VModuleKey()) {
+          bool MatchNonExported = false, VModuleKey K = VModuleKey()) {
   return llvm::make_unique<ReExportsMaterializationUnit>(
-      &SourceJD, std::move(Aliases), std::move(K));
+      &SourceJD, MatchNonExported, std::move(Aliases), std::move(K));
 }
 
 /// Build a SymbolAliasMap for the common case where you want to re-export
@@ -411,13 +416,14 @@ public:
   /// Create a reexports generator. If an Allow predicate is passed, only
   /// symbols for which the predicate returns true will be reexported. If no
   /// Allow predicate is passed, all symbols will be exported.
-  ReexportsGenerator(JITDylib &SourceJD,
+  ReexportsGenerator(JITDylib &SourceJD, bool MatchNonExported = false,
                      SymbolPredicate Allow = SymbolPredicate());
 
   SymbolNameSet operator()(JITDylib &JD, const SymbolNameSet &Names);
 
 private:
   JITDylib &SourceJD;
+  bool MatchNonExported = false;
   SymbolPredicate Allow;
 };
 
@@ -536,16 +542,18 @@ public:
   /// as the first in the search order (instead of this dylib) ensures that
   /// definitions within this dylib resolve to the lazy-compiling stubs,
   /// rather than immediately materializing the definitions in this dylib.
-  void setSearchOrder(JITDylibList NewSearchOrder,
-                      bool SearchThisJITDylibFirst = true);
+  void setSearchOrder(JITDylibSearchList NewSearchOrder,
+                      bool SearchThisJITDylibFirst = true,
+                      bool MatchNonExportedInThisDylib = true);
 
   /// Add the given JITDylib to the search order for definitions in this
   /// JITDylib.
-  void addToSearchOrder(JITDylib &JD);
+  void addToSearchOrder(JITDylib &JD, bool MatcNonExported = false);
 
   /// Replace OldJD with NewJD in the search order if OldJD is present.
   /// Otherwise this operation is a no-op.
-  void replaceInSearchOrder(JITDylib &OldJD, JITDylib &NewJD);
+  void replaceInSearchOrder(JITDylib &OldJD, JITDylib &NewJD,
+                            bool MatchNonExported = false);
 
   /// Remove the given JITDylib from the search order for this JITDylib if it is
   /// present. Otherwise this operation is a no-op.
@@ -554,7 +562,7 @@ public:
   /// Do something with the search order (run under the session lock).
   template <typename Func>
   auto withSearchOrderDo(Func &&F)
-      -> decltype(F(std::declval<const JITDylibList &>()));
+      -> decltype(F(std::declval<const JITDylibSearchList &>()));
 
   /// Define all symbols provided by the materialization unit to be part of this
   /// JITDylib.
@@ -642,12 +650,12 @@ private:
                                 const SymbolNameSet &Names);
 
   void lodgeQuery(std::shared_ptr<AsynchronousSymbolQuery> &Q,
-                  SymbolNameSet &Unresolved, JITDylib *MatchNonExportedInJD,
-                  bool MatchNonExported, MaterializationUnitList &MUs);
+                  SymbolNameSet &Unresolved, bool MatchNonExported,
+                  MaterializationUnitList &MUs);
 
   void lodgeQueryImpl(std::shared_ptr<AsynchronousSymbolQuery> &Q,
-                      SymbolNameSet &Unresolved, JITDylib *MatchNonExportedInJD,
-                      bool MatchNonExported, MaterializationUnitList &MUs);
+                      SymbolNameSet &Unresolved, bool MatchNonExported,
+                      MaterializationUnitList &MUs);
 
   LookupImplActionFlags
   lookupImpl(std::shared_ptr<AsynchronousSymbolQuery> &Q,
@@ -682,7 +690,7 @@ private:
   UnmaterializedInfosMap UnmaterializedInfos;
   MaterializingInfosMap MaterializingInfos;
   GeneratorFunction DefGenerator;
-  JITDylibList SearchOrder;
+  JITDylibSearchList SearchOrder;
 };
 
 /// An ExecutionSession represents a running JIT program.
@@ -766,6 +774,10 @@ public:
 
   /// Search the given JITDylib list for the given symbols.
   ///
+  /// SearchOrder lists the JITDylibs to search. For each dylib, the associated
+  /// boolean indicates whether the search should match against non-exported
+  /// (hidden visibility) symbols in that dylib (true means match against
+  /// non-exported symbols, false means do not match).
   ///
   /// The OnResolve callback will be called once all requested symbols are
   /// resolved, or if an error occurs prior to resolution.
@@ -782,19 +794,9 @@ public:
   /// dependenant symbols for this query (e.g. it is being made by a top level
   /// client to get an address to call) then the value NoDependenciesToRegister
   /// can be used.
-  ///
-  /// If the MatchNonExportedInJD pointer is non-null, then the lookup will find
-  /// non-exported symbols defined in the JITDylib pointed to by
-  /// MatchNonExportedInJD.
-  /// If MatchNonExported is true the lookup will find non-exported symbols in
-  /// any JITDylib (setting MatchNonExportedInJD is redundant in such cases).
-  /// If MatchNonExported is false and MatchNonExportedInJD is null,
-  /// non-exported symbols will never be found.
-  void lookup(const JITDylibList &JDs, SymbolNameSet Symbols,
+  void lookup(const JITDylibSearchList &SearchOrder, SymbolNameSet Symbols,
               SymbolsResolvedCallback OnResolve, SymbolsReadyCallback OnReady,
-              RegisterDependenciesFunction RegisterDependencies,
-              JITDylib *MatchNonExportedInJD = nullptr,
-              bool MatchNonExported = false);
+              RegisterDependenciesFunction RegisterDependencies);
 
   /// Blocking version of lookup above. Returns the resolved symbol map.
   /// If WaitUntilReady is true (the default), will not return until all
@@ -803,24 +805,29 @@ public:
   /// or an error occurs. If WaitUntilReady is false and an error occurs
   /// after resolution, the function will return a success value, but the
   /// error will be reported via reportErrors.
-  Expected<SymbolMap> lookup(const JITDylibList &JDs,
+  Expected<SymbolMap> lookup(const JITDylibSearchList &SearchOrder,
                              const SymbolNameSet &Symbols,
                              RegisterDependenciesFunction RegisterDependencies =
                                  NoDependenciesToRegister,
-                             bool WaitUntilReady = true,
-                             JITDylib *MatchNonExportedInJD = nullptr,
-                             bool MatchNonExported = false);
+                             bool WaitUntilReady = true);
+
+  /// Convenience version of blocking lookup.
+  /// Searches each of the JITDylibs in the search order in turn for the given
+  /// symbol.
+  Expected<JITEvaluatedSymbol> lookup(const JITDylibSearchList &SearchOrder,
+                                      SymbolStringPtr Symbol);
 
   /// Convenience version of blocking lookup.
-  /// Performs a single-symbol lookup.
-  Expected<JITEvaluatedSymbol> lookup(const JITDylibList &JDs,
-                                      SymbolStringPtr Symbol,
-                                      bool MatchNonExported = false);
+  /// Searches each of the JITDylibs in the search order in turn for the given
+  /// symbol. The search will not find non-exported symbols.
+  Expected<JITEvaluatedSymbol> lookup(ArrayRef<JITDylib *> SearchOrder,
+                                      SymbolStringPtr Symbol);
 
   /// Convenience version of blocking lookup.
-  /// Performs a single-symbol lookup, auto-interning the given symbol name.
-  Expected<JITEvaluatedSymbol> lookup(const JITDylibList &JDs, StringRef Symbol,
-                                      bool MatchNonExported = false);
+  /// Searches each of the JITDylibs in the search order in turn for the given
+  /// symbol. The search will not find non-exported symbols.
+  Expected<JITEvaluatedSymbol> lookup(ArrayRef<JITDylib *> SearchOrder,
+                                      StringRef Symbol);
 
   /// Materialize the given unit.
   void dispatchMaterialization(JITDylib &JD,
@@ -866,7 +873,7 @@ private:
 
 template <typename Func>
 auto JITDylib::withSearchOrderDo(Func &&F)
-    -> decltype(F(std::declval<const JITDylibList &>())) {
+    -> decltype(F(std::declval<const JITDylibSearchList &>())) {
   return ES.runSessionLocked([&]() { return F(SearchOrder); });
 }
 
diff --git a/include/llvm/ExecutionEngine/Orc/LLJIT.h b/include/llvm/ExecutionEngine/Orc/LLJIT.h
index b7ef8834706..ce3e5d519c7 100644
--- a/include/llvm/ExecutionEngine/Orc/LLJIT.h
+++ b/include/llvm/ExecutionEngine/Orc/LLJIT.h
@@ -144,7 +144,7 @@ public:
   /// LLLazyJIT with the given number of compile threads.
   static Expected<std::unique_ptr<LLLazyJIT>>
   Create(JITTargetMachineBuilder JTMB, DataLayout DL,
-         unsigned NumCompileThreads = 0);
+         JITTargetAddress ErrorAddr, unsigned NumCompileThreads = 0);
 
   /// Set an IR transform (e.g. pass manager pipeline) to run on each function
   /// when it is compiled.
diff --git a/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp b/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp
index de1fa079dde..241eb3600da 100644
--- a/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp
+++ b/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp
@@ -157,7 +157,7 @@ void CompileOnDemandLayer::emit(MaterializationResponsibility R,
     return;
   }
 
-  R.replace(reexports(PDR.getImplDylib(), std::move(NonCallables)));
+  R.replace(reexports(PDR.getImplDylib(), std::move(NonCallables), true));
   R.replace(lazyReexports(LCTMgr, PDR.getISManager(), PDR.getImplDylib(),
                           std::move(Callables)));
 }
@@ -166,10 +166,17 @@ CompileOnDemandLayer::PerDylibResources &
 CompileOnDemandLayer::getPerDylibResources(JITDylib &TargetD) {
   auto I = DylibResources.find(&TargetD);
   if (I == DylibResources.end()) {
-    auto &ImplD =
-        getExecutionSession().createJITDylib(TargetD.getName() + ".impl");
-    TargetD.withSearchOrderDo([&](const JITDylibList &TargetSearchOrder) {
-      ImplD.setSearchOrder(TargetSearchOrder, false);
+    auto &ImplD = getExecutionSession().createJITDylib(
+        TargetD.getName() + ".impl", false);
+    TargetD.withSearchOrderDo([&](const JITDylibSearchList &TargetSearchOrder) {
+      auto NewSearchOrder = TargetSearchOrder;
+      assert(!NewSearchOrder.empty() &&
+             NewSearchOrder.front().first == &TargetD &&
+             NewSearchOrder.front().second == true &&
+             "TargetD must be at the front of its own search order and match "
+             "non-exported symbol");
+      NewSearchOrder.insert(std::next(NewSearchOrder.begin()), {&ImplD, true});
+      ImplD.setSearchOrder(std::move(NewSearchOrder), false);
     });
     PerDylibResources PDR(ImplD, BuildIndirectStubsManager());
     I = DylibResources.insert(std::make_pair(&TargetD, std::move(PDR))).first;
diff --git a/lib/ExecutionEngine/Orc/Core.cpp b/lib/ExecutionEngine/Orc/Core.cpp
index df4d0028a4a..9cbb03734ed 100644
--- a/lib/ExecutionEngine/Orc/Core.cpp
+++ b/lib/ExecutionEngine/Orc/Core.cpp
@@ -205,14 +205,16 @@ raw_ostream &operator<<(raw_ostream &OS, const MaterializationUnit &MU) {
   return OS << ")";
 }
 
-raw_ostream &operator<<(raw_ostream &OS, const JITDylibList &JDs) {
+raw_ostream &operator<<(raw_ostream &OS, const JITDylibSearchList &JDs) {
   OS << "[";
   if (!JDs.empty()) {
-    assert(JDs.front() && "JITDylibList entries must not be null");
-    OS << " " << JDs.front()->getName();
-    for (auto *JD : make_range(std::next(JDs.begin()), JDs.end())) {
-      assert(JD && "JITDylibList entries must not be null");
-      OS << ", " << JD->getName();
+    assert(JDs.front().first && "JITDylibList entries must not be null");
+    OS << " (\"" << JDs.front().first->getName() << "\", "
+       << (JDs.front().second ? "true" : "false") << ")";
+    for (auto &KV : make_range(std::next(JDs.begin()), JDs.end())) {
+      assert(KV.first && "JITDylibList entries must not be null");
+      OS << ", (\"" << KV.first->getName() << "\", "
+         << (KV.second ? "true" : "false") << ")";
     }
   }
   OS << " ]";
@@ -526,9 +528,11 @@ AbsoluteSymbolsMaterializationUnit::extractFlags(const SymbolMap &Symbols) {
 }
 
 ReExportsMaterializationUnit::ReExportsMaterializationUnit(
-    JITDylib *SourceJD, SymbolAliasMap Aliases, VModuleKey K)
+    JITDylib *SourceJD, bool MatchNonExported, SymbolAliasMap Aliases,
+    VModuleKey K)
     : MaterializationUnit(extractFlags(Aliases), std::move(K)),
-      SourceJD(SourceJD), Aliases(std::move(Aliases)) {}
+      SourceJD(SourceJD), MatchNonExported(MatchNonExported),
+      Aliases(std::move(Aliases)) {}
 
 StringRef ReExportsMaterializationUnit::getName() const {
   return "<Reexports>";
@@ -556,7 +560,7 @@ void ReExportsMaterializationUnit::materialize(
 
   if (!Aliases.empty()) {
     if (SourceJD)
-      R.replace(reexports(*SourceJD, std::move(Aliases)));
+      R.replace(reexports(*SourceJD, std::move(Aliases), MatchNonExported));
     else
       R.replace(symbolAliases(std::move(Aliases)));
   }
@@ -656,8 +660,9 @@ void ReExportsMaterializationUnit::materialize(
 
     auto OnReady = [&ES](Error Err) { ES.reportError(std::move(Err)); };
 
-    ES.lookup({&SrcJD}, QuerySymbols, std::move(OnResolve), std::move(OnReady),
-              std::move(RegisterDependencies), nullptr, true);
+    ES.lookup(JITDylibSearchList({{&SrcJD, MatchNonExported}}), QuerySymbols,
+              std::move(OnResolve), std::move(OnReady),
+              std::move(RegisterDependencies));
   }
 }
 
@@ -698,8 +703,10 @@ buildSimpleReexportsAliasMap(JITDylib &SourceJD, const SymbolNameSet &Symbols) {
 }
 
 ReexportsGenerator::ReexportsGenerator(JITDylib &SourceJD,
+                                       bool MatchNonExported,
                                        SymbolPredicate Allow)
-    : SourceJD(SourceJD), Allow(std::move(Allow)) {}
+    : SourceJD(SourceJD), MatchNonExported(MatchNonExported),
+      Allow(std::move(Allow)) {}
 
 SymbolNameSet ReexportsGenerator::operator()(JITDylib &JD,
                                              const SymbolNameSet &Names) {
@@ -716,7 +723,7 @@ SymbolNameSet ReexportsGenerator::operator()(JITDylib &JD,
   }
 
   if (!Added.empty())
-    cantFail(JD.define(reexports(SourceJD, AliasMap)));
+    cantFail(JD.define(reexports(SourceJD, AliasMap, MatchNonExported)));
 
   return Added;
 }
@@ -1041,30 +1048,41 @@ void JITDylib::notifyFailed(const SymbolNameSet &FailedSymbols) {
     Q->handleFailed(make_error<FailedToMaterialize>(FailedSymbols));
 }
 
-void JITDylib::setSearchOrder(JITDylibList NewSearchOrder,
-                              bool SearchThisJITDylibFirst) {
-  if (SearchThisJITDylibFirst && NewSearchOrder.front() != this)
-    NewSearchOrder.insert(NewSearchOrder.begin(), this);
+void JITDylib::setSearchOrder(JITDylibSearchList NewSearchOrder,
+                              bool SearchThisJITDylibFirst,
+                              bool MatchNonExportedInThisDylib) {
+  if (SearchThisJITDylibFirst && NewSearchOrder.front().first != this)
+    NewSearchOrder.insert(NewSearchOrder.begin(),
+                          {this, MatchNonExportedInThisDylib});
 
   ES.runSessionLocked([&]() { SearchOrder = std::move(NewSearchOrder); });
 }
 
-void JITDylib::addToSearchOrder(JITDylib &JD) {
-  ES.runSessionLocked([&]() { SearchOrder.push_back(&JD); });
+void JITDylib::addToSearchOrder(JITDylib &JD, bool MatchNonExported) {
+  ES.runSessionLocked([&]() {
+    SearchOrder.push_back({&JD, MatchNonExported});
+  });
 }
 
-void JITDylib::replaceInSearchOrder(JITDylib &OldJD, JITDylib &NewJD) {
+void JITDylib::replaceInSearchOrder(JITDylib &OldJD, JITDylib &NewJD,
+                                    bool MatchNonExported) {
   ES.runSessionLocked([&]() {
-    auto I = std::find(SearchOrder.begin(), SearchOrder.end(), &OldJD);
+    auto I = std::find_if(SearchOrder.begin(), SearchOrder.end(),
+                          [&](const JITDylibSearchList::value_type &KV) {
+                            return KV.first == &OldJD;
+                          });
 
     if (I != SearchOrder.end())
-      *I = &NewJD;
+      *I = {&NewJD, MatchNonExported};
   });
 }
 
 void JITDylib::removeFromSearchOrder(JITDylib &JD) {
   ES.runSessionLocked([&]() {
-    auto I = std::find(SearchOrder.begin(), SearchOrder.end(), &JD);
+    auto I = std::find_if(SearchOrder.begin(), SearchOrder.end(),
+                          [&](const JITDylibSearchList::value_type &KV) {
+                            return KV.first == &JD;
+                          });
     if (I != SearchOrder.end())
       SearchOrder.erase(I);
   });
@@ -1161,18 +1179,17 @@ SymbolNameSet JITDylib::lookupFlagsImpl(SymbolFlagsMap &Flags,
 }
 
 void JITDylib::lodgeQuery(std::shared_ptr<AsynchronousSymbolQuery> &Q,
-                          SymbolNameSet &Unresolved,
-                          JITDylib *MatchNonExportedInJD, bool MatchNonExported,
+                          SymbolNameSet &Unresolved, bool MatchNonExported,
                           MaterializationUnitList &MUs) {
   assert(Q && "Query can not be null");
 
-  lodgeQueryImpl(Q, Unresolved, MatchNonExportedInJD, MatchNonExported, MUs);
+  lodgeQueryImpl(Q, Unresolved, MatchNonExported, MUs);
   if (DefGenerator && !Unresolved.empty()) {
     auto NewDefs = DefGenerator(*this, Unresolved);
     if (!NewDefs.empty()) {
       for (auto &D : NewDefs)
         Unresolved.erase(D);
-      lodgeQueryImpl(Q, NewDefs, MatchNonExportedInJD, MatchNonExported, MUs);
+      lodgeQueryImpl(Q, NewDefs, MatchNonExported, MUs);
       assert(NewDefs.empty() &&
              "All fallback defs should have been found by lookupImpl");
     }
@@ -1181,7 +1198,7 @@ void JITDylib::lodgeQuery(std::shared_ptr<AsynchronousSymbolQuery> &Q,
 
 void JITDylib::lodgeQueryImpl(
     std::shared_ptr<AsynchronousSymbolQuery> &Q, SymbolNameSet &Unresolved,
-    JITDylib *MatchNonExportedInJD, bool MatchNonExported,
+    bool MatchNonExported,
     std::vector<std::unique_ptr<MaterializationUnit>> &MUs) {
 
   std::vector<SymbolStringPtr> ToRemove;
@@ -1191,12 +1208,9 @@ void JITDylib::lodgeQueryImpl(
     if (SymI == Symbols.end())
       continue;
 
-    // If this is a non-exported symbol, then check the values of
-    // MatchNonExportedInJD and MatchNonExported. Skip if we should not match
-    // against this symbol.
-    if (!SymI->second.getFlags().isExported())
-      if (!MatchNonExported && MatchNonExportedInJD != this)
-        continue;
+    // If this is a non exported symbol and we're skipping those then skip it.
+    if (!SymI->second.getFlags().isExported() && !MatchNonExported)
+      continue;
 
     // If we matched against Name in JD, mark it to be removed from the Unresolved
     // set.
@@ -1382,8 +1396,9 @@ void JITDylib::dump(raw_ostream &OS) {
        << "\" (ES: " << format("0x%016x", reinterpret_cast<uintptr_t>(&ES))
        << "):\n"
        << "Search order: [";
-    for (auto *JD : SearchOrder)
-      OS << " \"" << JD->getName() << "\"";
+    for (auto &KV : SearchOrder)
+      OS << " (\"" << KV.first->getName() << "\", "
+         << (KV.second ? "all" : "exported only") << ")";
     OS << " ]\n"
        << "Symbol table:\n";
 
@@ -1431,7 +1446,7 @@ void JITDylib::dump(raw_ostream &OS) {
 
 JITDylib::JITDylib(ExecutionSession &ES, std::string Name)
     : ES(ES), JITDylibName(std::move(Name)) {
-  SearchOrder.push_back(this);
+  SearchOrder.push_back({this, true});
 }
 
 Error JITDylib::defineImpl(MaterializationUnit &MU) {
@@ -1724,12 +1739,10 @@ Expected<SymbolMap> ExecutionSession::legacyLookup(
 #endif
 }
 
-void ExecutionSession::lookup(const JITDylibList &JDs, SymbolNameSet Symbols,
-                              SymbolsResolvedCallback OnResolve,
-                              SymbolsReadyCallback OnReady,
-                              RegisterDependenciesFunction RegisterDependencies,
-                              JITDylib *MatchNonExportedInJD,
-                              bool MatchNonExported) {
+void ExecutionSession::lookup(
+    const JITDylibSearchList &SearchOrder, SymbolNameSet Symbols,
+    SymbolsResolvedCallback OnResolve, SymbolsReadyCallback OnReady,
+    RegisterDependenciesFunction RegisterDependencies) {
 
   // lookup can be re-entered recursively if running on a single thread. Run any
   // outstanding MUs in case this query depends on them, otherwise this lookup
@@ -1745,12 +1758,14 @@ void ExecutionSession::lookup(const JITDylibList &JDs, SymbolNameSet Symbols,
   bool QueryFailed = false;
 
   runSessionLocked([&]() {
-    for (auto *JD : JDs) {
-      assert(JD && "JITDylibList entries must not be null");
-      assert(!CollectedMUsMap.count(JD) &&
+    for (auto &KV : SearchOrder) {
+      assert(KV.first && "JITDylibList entries must not be null");
+      assert(!CollectedMUsMap.count(KV.first) &&
              "JITDylibList should not contain duplicate entries");
-      JD->lodgeQuery(Q, Unresolved, MatchNonExportedInJD, MatchNonExported,
-                     CollectedMUsMap[JD]);
+
+      auto &JD = *KV.first;
+      auto MatchNonExported = KV.second;
+      JD.lodgeQuery(Q, Unresolved, MatchNonExported, CollectedMUsMap[&JD]);
     }
 
     if (Unresolved.empty()) {
@@ -1801,11 +1816,9 @@ void ExecutionSession::lookup(const JITDylibList &JDs, SymbolNameSet Symbols,
   runOutstandingMUs();
 }
 
-Expected<SymbolMap>
-ExecutionSession::lookup(const JITDylibList &JDs, const SymbolNameSet &Symbols,
-                         RegisterDependenciesFunction RegisterDependencies,
-                         bool WaitUntilReady, JITDylib *MatchNonExportedInJD,
-                         bool MatchNonExported) {
+Expected<SymbolMap> ExecutionSession::lookup(
+    const JITDylibSearchList &SearchOrder, const SymbolNameSet &Symbols,
+    RegisterDependenciesFunction RegisterDependencies, bool WaitUntilReady) {
 #if LLVM_ENABLE_THREADS
   // In the threaded case we use promises to return the results.
   std::promise<SymbolMap> PromisedResult;
@@ -1872,8 +1885,7 @@ ExecutionSession::lookup(const JITDylibList &JDs, const SymbolNameSet &Symbols,
 #endif
 
   // Perform the asynchronous lookup.
-  lookup(JDs, Symbols, OnResolve, OnReady, RegisterDependencies,
-         MatchNonExportedInJD, MatchNonExported);
+  lookup(SearchOrder, Symbols, OnResolve, OnReady, RegisterDependencies);
 
 #if LLVM_ENABLE_THREADS
   auto ResultFuture = PromisedResult.get_future();
@@ -1916,14 +1928,13 @@ ExecutionSession::lookup(const JITDylibList &JDs, const SymbolNameSet &Symbols,
 #endif
 }
 
-/// Look up a symbol by searching a list of JDs.
-Expected<JITEvaluatedSymbol> ExecutionSession::lookup(const JITDylibList &JDs,
-                                                      SymbolStringPtr Name,
-                                                      bool MatchNonExported) {
+Expected<JITEvaluatedSymbol>
+ExecutionSession::lookup(const JITDylibSearchList &SearchOrder,
+                         SymbolStringPtr Name) {
   SymbolNameSet Names({Name});
 
-  if (auto ResultMap = lookup(JDs, std::move(Names), NoDependenciesToRegister,
-                              true, nullptr, MatchNonExported)) {
+  if (auto ResultMap = lookup(SearchOrder, std::move(Names),
+                              NoDependenciesToRegister, true)) {
     assert(ResultMap->size() == 1 && "Unexpected number of results");
     assert(ResultMap->count(Name) && "Missing result for symbol");
     return std::move(ResultMap->begin()->second);
@@ -1931,10 +1942,21 @@ Expected<JITEvaluatedSymbol> ExecutionSession::lookup(const JITDylibList &JDs,
     return ResultMap.takeError();
 }
 
-Expected<JITEvaluatedSymbol> ExecutionSession::lookup(const JITDylibList &JDs,
-                                                      StringRef Name,
-                                                      bool MatchNonExported) {
-  return lookup(JDs, intern(Name), MatchNonExported);
+Expected<JITEvaluatedSymbol>
+ExecutionSession::lookup(ArrayRef<JITDylib *> SearchOrder,
+                         SymbolStringPtr Name) {
+  SymbolNameSet Names({Name});
+
+  JITDylibSearchList FullSearchOrder(SearchOrder.size());
+  for (auto *JD : SearchOrder)
+    FullSearchOrder.push_back({JD, false});
+
+  return lookup(FullSearchOrder, Name);
+}
+
+Expected<JITEvaluatedSymbol>
+ExecutionSession::lookup(ArrayRef<JITDylib *> SearchOrder, StringRef Name) {
+  return lookup(SearchOrder, intern(Name));
 }
 
 void ExecutionSession::dump(raw_ostream &OS) {
diff --git a/lib/ExecutionEngine/Orc/ExecutionUtils.cpp b/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
index 21a604f71ca..d9ff07efbe9 100644
--- a/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
+++ b/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
@@ -130,8 +130,8 @@ Error CtorDtorRunner::run() {
 
   auto &ES = JD.getExecutionSession();
   if (auto CtorDtorMap =
-          ES.lookup({&JD}, std::move(Names), NoDependenciesToRegister, true,
-                    nullptr, true)) {
+          ES.lookup(JITDylibSearchList({{&JD, true}}), std::move(Names),
+                    NoDependenciesToRegister, true)) {
     for (auto &KV : CtorDtorsByPriority) {
       for (auto &Name : KV.second) {
         assert(CtorDtorMap->count(Name) && "No entry for Name");
diff --git a/lib/ExecutionEngine/Orc/IndirectionUtils.cpp b/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
index c10d15ab117..af7fcddd53d 100644
--- a/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
+++ b/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
@@ -101,7 +101,7 @@ JITTargetAddress JITCompileCallbackManager::executeCompileCallback(
       Name = I->second;
   }
 
-  if (auto Sym = ES.lookup({&CallbacksJD}, Name, true))
+  if (auto Sym = ES.lookup(JITDylibSearchList({{&CallbacksJD, true}}), Name))
     return Sym->getAddress();
   else {
     llvm::dbgs() << "Didn't find callback.\n";
diff --git a/lib/ExecutionEngine/Orc/LLJIT.cpp b/lib/ExecutionEngine/Orc/LLJIT.cpp
index ac71a5e7673..e2089f9106b 100644
--- a/lib/ExecutionEngine/Orc/LLJIT.cpp
+++ b/lib/ExecutionEngine/Orc/LLJIT.cpp
@@ -76,7 +76,7 @@ Error LLJIT::addObjectFile(JITDylib &JD, std::unique_ptr<MemoryBuffer> Obj) {
 
 Expected<JITEvaluatedSymbol> LLJIT::lookupLinkerMangled(JITDylib &JD,
                                                         StringRef Name) {
-  return ES->lookup({&JD}, ES->intern(Name));
+  return ES->lookup(JITDylibSearchList({{&JD, true}}), ES->intern(Name));
 }
 
 LLJIT::LLJIT(std::unique_ptr<ExecutionSession> ES,
@@ -144,13 +144,13 @@ void LLJIT::recordCtorDtors(Module &M) {
 }
 
 Expected<std::unique_ptr<LLLazyJIT>>
-  LLLazyJIT::Create(JITTargetMachineBuilder JTMB, DataLayout DL,
-                    unsigned NumCompileThreads) {
+LLLazyJIT::Create(JITTargetMachineBuilder JTMB, DataLayout DL,
+                  JITTargetAddress ErrorAddr, unsigned NumCompileThreads) {
   auto ES = llvm::make_unique<ExecutionSession>();
 
   const Triple &TT = JTMB.getTargetTriple();
 
-  auto LCTMgr = createLocalLazyCallThroughManager(TT, *ES, 0);
+  auto LCTMgr = createLocalLazyCallThroughManager(TT, *ES, ErrorAddr);
   if (!LCTMgr)
     return LCTMgr.takeError();
 
diff --git a/lib/ExecutionEngine/Orc/LazyReexports.cpp b/lib/ExecutionEngine/Orc/LazyReexports.cpp
index af4c508d7f1..55f4a7c5afc 100644
--- a/lib/ExecutionEngine/Orc/LazyReexports.cpp
+++ b/lib/ExecutionEngine/Orc/LazyReexports.cpp
@@ -52,8 +52,8 @@ LazyCallThroughManager::callThroughToSymbol(JITTargetAddress TrampolineAddr) {
     SymbolName = I->second.second;
   }
 
-  auto LookupResult = ES.lookup({SourceJD}, {SymbolName},
-                                NoDependenciesToRegister, true, nullptr, true);
+  auto LookupResult = ES.lookup(JITDylibSearchList({{SourceJD, true}}),
+                                {SymbolName}, NoDependenciesToRegister, true);
 
   if (!LookupResult) {
     ES.reportError(LookupResult.takeError());
diff --git a/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp b/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp
index 616251c7e00..299d76183cd 100644
--- a/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp
+++ b/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp
@@ -50,10 +50,11 @@ public:
       MR.addDependenciesForAll(Deps);
     };
 
-    MR.getTargetJITDylib().withSearchOrderDo([&](const JITDylibList &JDs) {
-      ES.lookup(JDs, InternedSymbols, OnResolvedWithUnwrap, OnReady,
-                RegisterDependencies, &MR.getTargetJITDylib());
-    });
+    JITDylibSearchList SearchOrder;
+    MR.getTargetJITDylib().withSearchOrderDo(
+        [&](const JITDylibSearchList &JDs) { SearchOrder = JDs; });
+    ES.lookup(SearchOrder, InternedSymbols, OnResolvedWithUnwrap, OnReady,
+              RegisterDependencies);
   }
 
   Expected<LookupSet> getResponsibilitySet(const LookupSet &Symbols) {
diff --git a/test/ExecutionEngine/OrcLazy/Inputs/hidden-definitions.ll b/test/ExecutionEngine/OrcLazy/Inputs/hidden-definitions.ll
new file mode 100644
index 00000000000..8d1f4b9cc5c
--- /dev/null
+++ b/test/ExecutionEngine/OrcLazy/Inputs/hidden-definitions.ll
@@ -0,0 +1,6 @@
+@bar = hidden global i32 0
+
+define hidden i32 @foo() {
+entry:
+  ret i32 0
+}
diff --git a/test/ExecutionEngine/OrcLazy/hidden-visibility.ll b/test/ExecutionEngine/OrcLazy/hidden-visibility.ll
new file mode 100644
index 00000000000..199fd644bff
--- /dev/null
+++ b/test/ExecutionEngine/OrcLazy/hidden-visibility.ll
@@ -0,0 +1,17 @@
+; RUN: lli -jit-kind=orc-lazy -extra-module %p/Inputs/hidden-definitions.ll %s
+; RUN: not lli -jit-kind=orc-lazy -jd libFoo -extra-module %p/Inputs/hidden-definitions.ll %s
+;
+; Check that hidden symbols in another module are visible when the module is
+; added to the same JITDylib, and not visible if it is added to a different
+; JITDylib.
+
+@bar = external global i32
+declare i32 @foo()
+
+define i32 @main(i32 %argc, i8** nocapture readnone %argv) {
+entry:
+  %0 = call i32() @foo()
+  %1 = load i32, i32* @bar
+  %2 = add i32 %0, %1
+  ret i32 %2
+}
diff --git a/tools/lli/lli.cpp b/tools/lli/lli.cpp
index f4585dc080d..c3c57e2cdee 100644
--- a/tools/lli/lli.cpp
+++ b/tools/lli/lli.cpp
@@ -115,6 +115,11 @@ namespace {
                "rather than individual functions"),
       cl::init(false));
 
+  cl::list<std::string>
+      JITDylibs("jd",
+                cl::desc("Specifies the JITDylib to be used for any subsequent "
+                         "-extra-module arguments."));
+
   // The MCJIT supports building for a target address space separate from
   // the JIT compilation process. Use a forked process and a copying
   // memory manager with IPC to execute using this functionality.
@@ -749,6 +754,8 @@ static orc::IRTransformLayer::TransformFunction createDebugDumper() {
   llvm_unreachable("Unknown DumpKind");
 }
 
+static void exitOnLazyCallThroughFailure() { exit(1); }
+
 int runOrcLazyJIT(const char *ProgName) {
   // Start setting up the JIT environment.
 
@@ -778,7 +785,11 @@ int runOrcLazyJIT(const char *ProgName) {
                         : None);
 
   DataLayout DL = ExitOnErr(JTMB.getDefaultDataLayoutForTarget());
-  auto J = ExitOnErr(orc::LLLazyJIT::Create(std::move(JTMB), DL, LazyJITCompileThreads));
+
+  auto J = ExitOnErr(orc::LLLazyJIT::Create(
+      std::move(JTMB), DL,
+      pointerToJITTargetAddress(exitOnLazyCallThroughFailure),
+      LazyJITCompileThreads));
 
   if (PerModuleLazy)
     J->setPartitionFunction(orc::CompileOnDemandLayer::compileWholeModule);
@@ -803,13 +814,32 @@ int runOrcLazyJIT(const char *ProgName) {
   // Add the main module.
   ExitOnErr(J->addLazyIRModule(std::move(MainModule)));
 
-  // Add any extra modules.
-  for (auto &ModulePath : ExtraModules) {
-    auto M = parseIRFile(ModulePath, Err, *TSCtx.getContext());
-    if (!M)
-      reportError(Err, ProgName);
+  // Create JITDylibs and add any extra modules.
+  {
+    // Create JITDylibs, keep a map from argument index to dylib. We will use
+    // -extra-module argument indexes to determine what dylib to use for each
+    // -extra-module.
+    std::map<unsigned, orc::JITDylib *> IdxToDylib;
+    IdxToDylib[0] = &J->getMainJITDylib();
+    for (auto JDItr = JITDylibs.begin(), JDEnd = JITDylibs.end();
+         JDItr != JDEnd; ++JDItr) {
+      IdxToDylib[JITDylibs.getPosition(JDItr - JITDylibs.begin())] =
+          &J->createJITDylib(*JDItr);
+    }
 
-    ExitOnErr(J->addLazyIRModule(orc::ThreadSafeModule(std::move(M), TSCtx)));
+    for (auto EMItr = ExtraModules.begin(), EMEnd = ExtraModules.end();
+         EMItr != EMEnd; ++EMItr) {
+      auto M = parseIRFile(*EMItr, Err, *TSCtx.getContext());
+      if (!M)
+        reportError(Err, ProgName);
+
+      auto EMIdx = ExtraModules.getPosition(EMItr - ExtraModules.begin());
+      assert(EMIdx != 0 && "ExtraModule should have index > 0");
+      auto JDItr = std::prev(IdxToDylib.lower_bound(EMIdx));
+      auto &JD = *JDItr->second;
+      ExitOnErr(
+          J->addLazyIRModule(JD, orc::ThreadSafeModule(std::move(M), TSCtx)));
+    }
   }
 
   // Add the objects.
@@ -837,6 +867,8 @@ int runOrcLazyJIT(const char *ProgName) {
     AltEntryThreads.push_back(std::thread([EntryPoint]() { EntryPoint(); }));
   }
 
+  J->getExecutionSession().dump(llvm::dbgs());
+
   // Run main.
   auto MainSym = ExitOnErr(J->lookup("main"));
   typedef int (*MainFnPtr)(int, const char *[]);
diff --git a/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp b/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp
index 1ccc4755957..22be76a2eb6 100644
--- a/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp
+++ b/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp
@@ -48,7 +48,8 @@ TEST_F(CoreAPIsStandardTest, BasicSuccessfulLookup) {
         FooMR = std::make_shared<MaterializationResponsibility>(std::move(R));
       })));
 
-  ES.lookup({&JD}, {Foo}, OnResolution, OnReady, NoDependenciesToRegister);
+  ES.lookup(JITDylibSearchList({{&JD, false}}), {Foo}, OnResolution, OnReady,
+            NoDependenciesToRegister);
 
   EXPECT_FALSE(OnResolutionRun) << "Should not have been resolved yet";
   EXPECT_FALSE(OnReadyRun) << "Should not have been marked ready yet";
@@ -101,7 +102,8 @@ TEST_F(CoreAPIsStandardTest, EmptyLookup) {
     OnReadyRun = true;
   };
 
-  ES.lookup({&JD}, {}, OnResolution, OnReady, NoDependenciesToRegister);
+  ES.lookup(JITDylibSearchList({{&JD, false}}), {}, OnResolution, OnReady,
+            NoDependenciesToRegister);
 
   EXPECT_TRUE(OnResolvedRun) << "OnResolved was not run for empty query";
   EXPECT_TRUE(OnReadyRun) << "OnReady was not run for empty query";
@@ -148,7 +150,7 @@ TEST_F(CoreAPIsStandardTest, RemoveSymbolsTest) {
 
   bool OnResolvedRun = false;
   bool OnReadyRun = false;
-  ES.lookup({&JD}, {Foo, Baz},
+  ES.lookup(JITDylibSearchList({{&JD, false}}), {Foo, Baz},
             [&](Expected<SymbolMap> Result) {
               EXPECT_TRUE(!!Result) << "OnResolved failed unexpectedly";
               consumeError(Result.takeError());
@@ -229,7 +231,9 @@ TEST_F(CoreAPIsStandardTest, LookupWithHiddenSymbols) {
   auto &JD2 = ES.createJITDylib("JD2");
   cantFail(JD2.define(absoluteSymbols({{Bar, QuxSym}})));
 
-  auto Result = cantFail(ES.lookup({&JD, &JD2}, {Foo, Bar}));
+  /// Try a blocking lookup.
+  auto Result = cantFail(
+      ES.lookup(JITDylibSearchList({{&JD, false}, {&JD2, false}}), {Foo, Bar}));
 
   EXPECT_EQ(Result.size(), 2U) << "Unexpected number of results";
   EXPECT_EQ(Result.count(Foo), 1U) << "Missing result for \"Foo\"";
@@ -275,7 +279,7 @@ TEST_F(CoreAPIsStandardTest, TestBasicAliases) {
                                     {Qux, {Bar, JITSymbolFlags::Weak}}})));
   cantFail(JD.define(absoluteSymbols({{Qux, QuxSym}})));
 
-  auto Result = ES.lookup({&JD}, {Baz, Qux});
+  auto Result = ES.lookup(JITDylibSearchList({{&JD, false}}), {Baz, Qux});
   EXPECT_TRUE(!!Result) << "Unexpected lookup failure";
   EXPECT_EQ(Result->count(Baz), 1U) << "No result for \"baz\"";
   EXPECT_EQ(Result->count(Qux), 1U) << "No result for \"qux\"";
@@ -290,7 +294,7 @@ TEST_F(CoreAPIsStandardTest, TestChainedAliases) {
   cantFail(JD.define(symbolAliases(
       {{Baz, {Bar, BazSym.getFlags()}}, {Bar, {Foo, BarSym.getFlags()}}})));
 
-  auto Result = ES.lookup({&JD}, {Bar, Baz});
+  auto Result = ES.lookup(JITDylibSearchList({{&JD, false}}), {Bar, Baz});
   EXPECT_TRUE(!!Result) << "Unexpected lookup failure";
   EXPECT_EQ(Result->count(Bar), 1U) << "No result for \"bar\"";
   EXPECT_EQ(Result->count(Baz), 1U) << "No result for \"baz\"";
@@ -309,7 +313,7 @@ TEST_F(CoreAPIsStandardTest, TestBasicReExports) {
 
   cantFail(JD2.define(reexports(JD, {{Bar, {Foo, BarSym.getFlags()}}})));
 
-  auto Result = cantFail(ES.lookup({&JD2}, Bar));
+  auto Result = cantFail(ES.lookup(JITDylibSearchList({{&JD2, false}}), Bar));
   EXPECT_EQ(Result.getAddress(), FooSym.getAddress())
       << "Re-export Bar for symbol Foo should match FooSym's address";
 }
@@ -335,7 +339,7 @@ TEST_F(CoreAPIsStandardTest, TestThatReExportsDontUnnecessarilyMaterialize) {
   cantFail(JD2.define(reexports(
       JD, {{Baz, {Foo, BazSym.getFlags()}}, {Qux, {Bar, QuxSym.getFlags()}}})));
 
-  auto Result = cantFail(ES.lookup({&JD2}, Baz));
+  auto Result = cantFail(ES.lookup(JITDylibSearchList({{&JD2, false}}), Baz));
   EXPECT_EQ(Result.getAddress(), FooSym.getAddress())
       << "Re-export Baz for symbol Foo should match FooSym's address";
 
@@ -350,13 +354,13 @@ TEST_F(CoreAPIsStandardTest, TestReexportsGenerator) {
 
   auto Filter = [this](SymbolStringPtr Name) { return Name != Bar; };
 
-  JD.setGenerator(ReexportsGenerator(JD2, Filter));
+  JD.setGenerator(ReexportsGenerator(JD2, false, Filter));
 
   auto Flags = JD.lookupFlags({Foo, Bar, Baz});
   EXPECT_EQ(Flags.size(), 1U) << "Unexpected number of results";
   EXPECT_EQ(Flags[Foo], FooSym.getFlags()) << "Unexpected flags for Foo";
 
-  auto Result = cantFail(ES.lookup({&JD}, Foo));
+  auto Result = cantFail(ES.lookup(JITDylibSearchList({{&JD, false}}), Foo));
 
   EXPECT_EQ(Result.getAddress(), FooSym.getAddress())
       << "Incorrect reexported symbol address";
@@ -377,8 +381,8 @@ TEST_F(CoreAPIsStandardTest, TestTrivialCircularDependency) {
     FooReady = true;
   };
 
-  ES.lookup({&JD}, {Foo}, std::move(OnResolution), std::move(OnReady),
-            NoDependenciesToRegister);
+  ES.lookup(JITDylibSearchList({{&JD, false}}), {Foo}, std::move(OnResolution),
+            std::move(OnReady), NoDependenciesToRegister);
 
   FooR->resolve({{Foo, FooSym}});
   FooR->emit();
@@ -434,7 +438,8 @@ TEST_F(CoreAPIsStandardTest, TestCircularDependenceInOneJITDylib) {
 
   // Issue a lookup for Foo. Use NoDependenciesToRegister: We're going to add
   // the dependencies manually below.
-  ES.lookup({&JD}, {Foo}, std::move(OnFooResolution), std::move(OnFooReady),
+  ES.lookup(JITDylibSearchList({{&JD, false}}), {Foo},
+            std::move(OnFooResolution), std::move(OnFooReady),
             NoDependenciesToRegister);
 
   bool BarResolved = false;
@@ -449,7 +454,8 @@ TEST_F(CoreAPIsStandardTest, TestCircularDependenceInOneJITDylib) {
     BarReady = true;
   };
 
-  ES.lookup({&JD}, {Bar}, std::move(OnBarResolution), std::move(OnBarReady),
+  ES.lookup(JITDylibSearchList({{&JD, false}}), {Bar},
+            std::move(OnBarResolution), std::move(OnBarReady),
             NoDependenciesToRegister);
 
   bool BazResolved = false;
@@ -465,7 +471,8 @@ TEST_F(CoreAPIsStandardTest, TestCircularDependenceInOneJITDylib) {
     BazReady = true;
   };
 
-  ES.lookup({&JD}, {Baz}, std::move(OnBazResolution), std::move(OnBazReady),
+  ES.lookup(JITDylibSearchList({{&JD, false}}), {Baz},
+            std::move(OnBazResolution), std::move(OnBazReady),
             NoDependenciesToRegister);
 
   // Add a circular dependency: Foo -> Bar, Bar -> Baz, Baz -> Foo.
@@ -588,8 +595,8 @@ TEST_F(CoreAPIsStandardTest, AddAndMaterializeLazySymbol) {
     OnReadyRun = true;
   };
 
-  ES.lookup({&JD}, Names, std::move(OnResolution), std::move(OnReady),
-            NoDependenciesToRegister);
+  ES.lookup(JITDylibSearchList({{&JD, false}}), Names, std::move(OnResolution),
+            std::move(OnReady), NoDependenciesToRegister);
 
   EXPECT_TRUE(FooMaterialized) << "Foo was not materialized";
   EXPECT_TRUE(BarDiscarded) << "Bar was not discarded";
@@ -637,8 +644,8 @@ TEST_F(CoreAPIsStandardTest, TestBasicWeakSymbolMaterialization) {
     OnReadyRun = true;
   };
 
-  ES.lookup({&JD}, {Bar}, std::move(OnResolution), std::move(OnReady),
-            NoDependenciesToRegister);
+  ES.lookup(JITDylibSearchList({{&JD, false}}), {Bar}, std::move(OnResolution),
+            std::move(OnReady), NoDependenciesToRegister);
 
   EXPECT_TRUE(OnResolvedRun) << "OnResolved not run";
   EXPECT_TRUE(OnReadyRun) << "OnReady not run";
@@ -666,13 +673,13 @@ TEST_F(CoreAPIsStandardTest, DefineMaterializingSymbol) {
       });
 
   cantFail(JD.define(MU));
-  cantFail(ES.lookup({&JD}, Foo));
+  cantFail(ES.lookup(JITDylibSearchList({{&JD, false}}), Foo));
 
   // Assert that materialization is complete by now.
   ExpectNoMoreMaterialization = true;
 
   // Look up bar to verify that no further materialization happens.
-  auto BarResult = cantFail(ES.lookup({&JD}, Bar));
+  auto BarResult = cantFail(ES.lookup(JITDylibSearchList({{&JD, false}}), Bar));
   EXPECT_EQ(BarResult.getAddress(), BarSym.getAddress())
       << "Expected Bar == BarSym";
 }
@@ -685,7 +692,8 @@ TEST_F(CoreAPIsStandardTest, GeneratorTest) {
     return SymbolNameSet({Bar});
   });
 
-  auto Result = cantFail(ES.lookup({&JD}, {Foo, Bar}));
+  auto Result =
+      cantFail(ES.lookup(JITDylibSearchList({{&JD, false}}), {Foo, Bar}));
 
   EXPECT_EQ(Result.count(Bar), 1U) << "Expected to find fallback def for 'bar'";
   EXPECT_EQ(Result[Bar].getAddress(), BarSym.getAddress())
@@ -701,7 +709,7 @@ TEST_F(CoreAPIsStandardTest, FailResolution) {
   cantFail(JD.define(MU));
 
   SymbolNameSet Names({Foo, Bar});
-  auto Result = ES.lookup({&JD}, Names);
+  auto Result = ES.lookup(JITDylibSearchList({{&JD, false}}), Names);
 
   EXPECT_FALSE(!!Result) << "Expected failure";
   if (!Result) {
@@ -733,7 +741,8 @@ TEST_F(CoreAPIsStandardTest, TestLookupWithUnthreadedMaterialization) {
 
   cantFail(JD.define(MU));
 
-  auto FooLookupResult = cantFail(ES.lookup({&JD}, Foo));
+  auto FooLookupResult =
+      cantFail(ES.lookup(JITDylibSearchList({{&JD, false}}), Foo));
 
   EXPECT_EQ(FooLookupResult.getAddress(), FooSym.getAddress())
       << "lookup returned an incorrect address";
@@ -754,7 +763,8 @@ TEST_F(CoreAPIsStandardTest, TestLookupWithThreadedMaterialization) {
 
   cantFail(JD.define(absoluteSymbols({{Foo, FooSym}})));
 
-  auto FooLookupResult = cantFail(ES.lookup({&JD}, Foo));
+  auto FooLookupResult =
+      cantFail(ES.lookup(JITDylibSearchList({{&JD, false}}), Foo));
 
   EXPECT_EQ(FooLookupResult.getAddress(), FooSym.getAddress())
       << "lookup returned an incorrect address";
@@ -802,14 +812,16 @@ TEST_F(CoreAPIsStandardTest, TestGetRequestedSymbolsAndReplace) {
   EXPECT_FALSE(FooMaterialized) << "Foo should not be materialized yet";
   EXPECT_FALSE(BarMaterialized) << "Bar should not be materialized yet";
 
-  auto FooSymResult = cantFail(ES.lookup({&JD}, Foo));
+  auto FooSymResult =
+      cantFail(ES.lookup(JITDylibSearchList({{&JD, false}}), Foo));
   EXPECT_EQ(FooSymResult.getAddress(), FooSym.getAddress())
       << "Address mismatch for Foo";
 
   EXPECT_TRUE(FooMaterialized) << "Foo should be materialized now";
   EXPECT_FALSE(BarMaterialized) << "Bar still should not be materialized";
 
-  auto BarSymResult = cantFail(ES.lookup({&JD}, Bar));
+  auto BarSymResult =
+      cantFail(ES.lookup(JITDylibSearchList({{&JD, false}}), Bar));
   EXPECT_EQ(BarSymResult.getAddress(), BarSym.getAddress())
       << "Address mismatch for Bar";
   EXPECT_TRUE(BarMaterialized) << "Bar should be materialized now";
@@ -829,7 +841,7 @@ TEST_F(CoreAPIsStandardTest, TestMaterializationResponsibilityDelegation) {
 
   cantFail(JD.define(MU));
 
-  auto Result = ES.lookup({&JD}, {Foo, Bar});
+  auto Result = ES.lookup(JITDylibSearchList({{&JD, false}}), {Foo, Bar});
 
   EXPECT_TRUE(!!Result) << "Result should be a success value";
   EXPECT_EQ(Result->count(Foo), 1U) << "\"Foo\" entry missing";
@@ -861,8 +873,8 @@ TEST_F(CoreAPIsStandardTest, TestMaterializeWeakSymbol) {
 
   auto OnReady = [](Error Err) { cantFail(std::move(Err)); };
 
-  ES.lookup({&JD}, {Foo}, std::move(OnResolution), std::move(OnReady),
-            NoDependenciesToRegister);
+  ES.lookup(JITDylibSearchList({{&JD, false}}), {Foo}, std::move(OnResolution),
+            std::move(OnReady), NoDependenciesToRegister);
 
   auto MU2 = llvm::make_unique<SimpleMaterializationUnit>(
       SymbolFlagsMap({{Foo, JITSymbolFlags::Exported}}),
diff --git a/unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayerTest.cpp b/unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayerTest.cpp
index 1660670ae63..6b1dbe93d5e 100644
--- a/unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayerTest.cpp
+++ b/unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayerTest.cpp
@@ -66,8 +66,8 @@ static bool testSetProcessAllSections(std::unique_ptr<MemoryBuffer> Obj,
 
   ObjLayer.setProcessAllSections(ProcessAllSections);
   cantFail(ObjLayer.add(JD, std::move(Obj), ES.allocateVModule()));
-  ES.lookup({&JD}, {Foo}, OnResolveDoNothing, OnReadyDoNothing,
-            NoDependenciesToRegister);
+  ES.lookup(JITDylibSearchList({{&JD, false}}), {Foo}, OnResolveDoNothing,
+            OnReadyDoNothing, NoDependenciesToRegister);
   return DebugSectionSeen;
 }
 
@@ -157,7 +157,8 @@ TEST(RTDyldObjectLinkingLayerTest, TestOverrideObjectFlags) {
   ObjLayer.setOverrideObjectFlagsWithResponsibilityFlags(true);
 
   cantFail(CompileLayer.add(JD, std::move(M), ES.allocateVModule()));
-  ES.lookup({&JD}, {Foo}, [](Expected<SymbolMap> R) { cantFail(std::move(R)); },
+  ES.lookup(JITDylibSearchList({{&JD, false}}), {Foo},
+            [](Expected<SymbolMap> R) { cantFail(std::move(R)); },
             [](Error Err) { cantFail(std::move(Err)); },
             NoDependenciesToRegister);
 }
@@ -219,7 +220,8 @@ TEST(RTDyldObjectLinkingLayerTest, TestAutoClaimResponsibilityForSymbols) {
   ObjLayer.setAutoClaimResponsibilityForObjectSymbols(true);
 
   cantFail(CompileLayer.add(JD, std::move(M), ES.allocateVModule()));
-  ES.lookup({&JD}, {Foo}, [](Expected<SymbolMap> R) { cantFail(std::move(R)); },
+  ES.lookup(JITDylibSearchList({{&JD, false}}), {Foo},
+            [](Expected<SymbolMap> R) { cantFail(std::move(R)); },
             [](Error Err) { cantFail(std::move(Err)); },
             NoDependenciesToRegister);
 }
-- 
GitLab


From 078522725a57a70156b5448836260ee2f3ae5ca4 Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze@braunis.de>
Date: Tue, 23 Oct 2018 23:19:23 +0000
Subject: [PATCH 0482/1116] SelectionDAG: Reuse bigger sized constants in
 memset expansion.

When implementing memset's today we often see this pattern:
$x0 = MOV 0xXYXYXYXYXYXYXYXY
store $x0, ...
$w1 = MOV 0xXYXYXYXY
store $w1, ...

We first create a 64bit constant in a 64bit register with all bytes the
same and then create a 32bit constant with all bytes the same in a 32bit
register. In many targets we could just access the lower byte of the
64bit register instead.

- Ideally this would be handled by the ConstantHoist pass but it runs
  too early when memset isn't expanded yet.
- The memset expansion code already had this optimization implemented,
  however SelectionDAG constantfolding would constantfold the
  "trunc(bigconstnat)" pattern to "smallconstant".
- This patch makes the memset expansion mark the constant as Opaque and
  stop DAGCombiner from constant folding in this situation. (Similar to
  how ConstantHoisting marks things as Opaque to avoid folding
  ADD/SUB/etc.)

Differential Revision: https://reviews.llvm.org/D53181

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345102 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/TargetLowering.h       |  8 +++++++
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp    |  4 +++-
 lib/CodeGen/SelectionDAG/SelectionDAG.cpp   | 12 ++++++++---
 lib/Target/X86/X86ISelLowering.cpp          |  4 ++++
 lib/Target/X86/X86ISelLowering.h            |  2 ++
 test/CodeGen/AArch64/arm64-memset-inline.ll | 20 ++++++-----------
 test/CodeGen/X86/pr38771.ll                 | 24 ---------------------
 7 files changed, 33 insertions(+), 41 deletions(-)
 delete mode 100644 test/CodeGen/X86/pr38771.ll

diff --git a/include/llvm/CodeGen/TargetLowering.h b/include/llvm/CodeGen/TargetLowering.h
index 93a08347964..585d07cf044 100644
--- a/include/llvm/CodeGen/TargetLowering.h
+++ b/include/llvm/CodeGen/TargetLowering.h
@@ -2058,6 +2058,14 @@ public:
     return true;
   }
 
+  /// Return true if the specified immediate is legal for the value input of a
+  /// store instruction.
+  virtual bool isLegalStoreImmediate(int64_t Value) const {
+    // Default implementation assumes that at least 0 works since it is likely
+    // that a zero register exists or a zero immediate is allowed.
+    return Value == 0;
+  }
+
   /// Return true if it's significantly cheaper to shift a vector by a uniform
   /// scalar than by an amount which will vary across each lane. On x86, for
   /// example, there is a "psllw" instruction for the former case, but no simple
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index e606cbd749c..3c7830e23c7 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15029,7 +15029,9 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
 
   // FIXME: is there such a thing as a truncating indexed store?
   if (ST->isTruncatingStore() && ST->isUnindexed() &&
-      Value.getValueType().isInteger()) {
+      Value.getValueType().isInteger() &&
+      (!isa<ConstantSDNode>(Value) ||
+       !cast<ConstantSDNode>(Value)->isOpaque())) {
     // See if we can simplify the input to this truncstore with knowledge that
     // only the low bits are being used.  For example:
     // "truncstore (or (shl x, 8), y), i8"  -> "truncstore y, i8"
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 1f0f7325c9d..1f63923d7ec 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3889,9 +3889,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
     case ISD::SIGN_EXTEND:
       return getConstant(Val.sextOrTrunc(VT.getSizeInBits()), DL, VT,
                          C->isTargetOpcode(), C->isOpaque());
+    case ISD::TRUNCATE:
+      if (C->isOpaque())
+        break;
+      LLVM_FALLTHROUGH;
     case ISD::ANY_EXTEND:
     case ISD::ZERO_EXTEND:
-    case ISD::TRUNCATE:
       return getConstant(Val.zextOrTrunc(VT.getSizeInBits()), DL, VT,
                          C->isTargetOpcode(), C->isOpaque());
     case ISD::UINT_TO_FP:
@@ -5158,8 +5161,11 @@ static SDValue getMemsetValue(SDValue Value, EVT VT, SelectionDAG &DAG,
   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Value)) {
     assert(C->getAPIntValue().getBitWidth() == 8);
     APInt Val = APInt::getSplat(NumBits, C->getAPIntValue());
-    if (VT.isInteger())
-      return DAG.getConstant(Val, dl, VT);
+    if (VT.isInteger()) {
+      bool IsOpaque = VT.getSizeInBits() > 64 ||
+          !DAG.getTargetLoweringInfo().isLegalStoreImmediate(C->getSExtValue());
+      return DAG.getConstant(Val, dl, VT, false, IsOpaque);
+    }
     return DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(VT), Val), dl,
                              VT);
   }
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 5e4796ca54d..44d0d711dd1 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -26890,6 +26890,10 @@ bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
   return isInt<32>(Imm);
 }
 
+bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
+  return isInt<32>(Imm);
+}
+
 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
   if (!VT1.isInteger() || !VT2.isInteger())
     return false;
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 15321b12ff6..eeef7579714 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -940,6 +940,8 @@ namespace llvm {
     /// the immediate into a register.
     bool isLegalAddImmediate(int64_t Imm) const override;
 
+    bool isLegalStoreImmediate(int64_t Imm) const override;
+
     /// Return the cost of the scaling factor used in the addressing
     /// mode represented by AM for this target, for a load/store
     /// of the specified type.
diff --git a/test/CodeGen/AArch64/arm64-memset-inline.ll b/test/CodeGen/AArch64/arm64-memset-inline.ll
index 8946d8db331..7a9f3b2fa97 100644
--- a/test/CodeGen/AArch64/arm64-memset-inline.ll
+++ b/test/CodeGen/AArch64/arm64-memset-inline.ll
@@ -242,14 +242,12 @@ define void @memset_8_stack() {
   ret void
 }
 
-; FIXME This could be better: x9 is a superset of w8's bit-pattern.
 define void @memset_12_stack() {
 ; CHECK-LABEL: memset_12_stack:
-; CHECK:       mov w8, #-1431655766
-; CHECK-NEXT:  mov x9, #-6148914691236517206
+; CHECK:       mov x8, #-6148914691236517206
 ; CHECK-NEXT:  mov x0, sp
+; CHECK-NEXT:  str x8, [sp]
 ; CHECK-NEXT:  str w8, [sp, #8]
-; CHECK-NEXT:  str x9, [sp]
 ; CHECK-NEXT:  bl something
   %buf = alloca [12 x i8], align 1
   %cast = bitcast [12 x i8]* %buf to i8*
@@ -272,14 +270,12 @@ define void @memset_16_stack() {
   ret void
 }
 
-; FIXME This could be better: x9 is a superset of w8's bit-pattern.
 define void @memset_20_stack() {
 ; CHECK-LABEL: memset_20_stack:
-; CHECK:       mov w8, #-1431655766
-; CHECK-NEXT:  mov x9, #-6148914691236517206
+; CHECK:       mov x8, #-6148914691236517206
 ; CHECK-NEXT:  add x0, sp, #8
+; CHECK-NEXT:  stp x8, x8, [sp, #8]
 ; CHECK-NEXT:  str w8, [sp, #24]
-; CHECK-NEXT:  stp x9, x9, [sp, #8]
 ; CHECK-NEXT:  bl something
   %buf = alloca [20 x i8], align 1
   %cast = bitcast [20 x i8]* %buf to i8*
@@ -288,15 +284,13 @@ define void @memset_20_stack() {
   ret void
 }
 
-; FIXME This could be better: x9 is a superset of w8's bit-pattern.
 define void @memset_26_stack() {
 ; CHECK-LABEL: memset_26_stack:
-; CHECK:       mov w8, #43690
-; CHECK-NEXT:  mov x9, #-6148914691236517206
+; CHECK:       mov x8, #-6148914691236517206
 ; CHECK-NEXT:  mov x0, sp
+; CHECK-NEXT:  stp x8, x8, [sp, #8]
+; CHECK-NEXT:  str x8, [sp]
 ; CHECK-NEXT:  strh w8, [sp, #24]
-; CHECK-NEXT:  stp x9, x9, [sp, #8]
-; CHECK-NEXT:  str x9, [sp]
 ; CHECK-NEXT:  bl something
   %buf = alloca [26 x i8], align 1
   %cast = bitcast [26 x i8]* %buf to i8*
diff --git a/test/CodeGen/X86/pr38771.ll b/test/CodeGen/X86/pr38771.ll
deleted file mode 100644
index 2a9ee66f7ef..00000000000
--- a/test/CodeGen/X86/pr38771.ll
+++ /dev/null
@@ -1,24 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
-
-define void @function() nounwind {
-; CHECK-LABEL: function:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movabsq $281474976710656, %rax # imm = 0x1000000000000
-; CHECK-NEXT:    notq %rax
-; CHECK-NEXT:    movl $2147483647, %ecx # imm = 0x7FFFFFFF
-; CHECK-NEXT:    shldq $65, %rax, %rcx
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    movb $64, %dl
-; CHECK-NEXT:    testb %dl, %dl
-; CHECK-NEXT:    cmoveq %rcx, %rax
-; CHECK-NEXT:    movq %rax, (%rax)
-; CHECK-NEXT:    movl $0, (%rax)
-; CHECK-NEXT:    retq
-entry:
-  %B68 = sub i96 39614081257132168796771975167, 281474976710656
-  %B49 = or i96 39614081257132168796771975167, 39614081257132168796771975167
-  %B33 = lshr i96 %B68, %B68
-  store i96 %B33, i96* undef
-  ret void
-}
-- 
GitLab


From a73d657b58931f82df97d1772f2c10d94ac7dc8b Mon Sep 17 00:00:00 2001
From: Wei Mi <wmi@google.com>
Date: Tue, 23 Oct 2018 23:29:45 +0000
Subject: [PATCH 0483/1116] [PM] keeping history when original SCC split and
 then merge into itself in the same round of SCC update.

In https://reviews.llvm.org/rL309784, inline history is added to prevent
infinite inlining across multiple run of inliner and SCC update, but the
history will only be kept when new SCC is actually generated during SCC update.

We found a case that SCC can be split and then merge into itself in the same
round of SCC update, so the same SCC will be pop out from UR.CWorklist and
then added back immediately, without any new SCC generated, that is why the
existing patch cannot catch the infinite inline case.

What the patch does is even if no new SCC is generated, if only the current
SCC appears in UR.CWorklist again, then keep the inline history.

Differential Revision: https://reviews.llvm.org/D52915


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345103 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/IPO/Inliner.cpp        |  13 ++-
 test/Transforms/Inline/cgscc-cycle.ll | 109 +++++++++++++++++++++++++-
 test/Transforms/Inline/monster_scc.ll |  46 +++--------
 3 files changed, 128 insertions(+), 40 deletions(-)

diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp
index 3275226925c..66aea45323f 100644
--- a/lib/Transforms/IPO/Inliner.cpp
+++ b/lib/Transforms/IPO/Inliner.cpp
@@ -1158,10 +1158,19 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
     // SCC splits and merges. To avoid this, we capture the originating caller
     // node and the SCC containing the call edge. This is a slight over
     // approximation of the possible inlining decisions that must be avoided,
-    // but is relatively efficient to store.
+    // but is relatively efficient to store. We use C != OldC to know when
+    // a new SCC is generated and the original SCC may be generated via merge
+    // in later iterations.
+    //
+    // It is also possible that even if no new SCC is generated
+    // (i.e., C == OldC), the original SCC could be split and then merged
+    // into the same one as itself. and the original SCC will be added into
+    // UR.CWorklist again, we want to catch such cases too.
+    //
     // FIXME: This seems like a very heavyweight way of retaining the inline
     // history, we should look for a more efficient way of tracking it.
-    if (C != OldC && llvm::any_of(InlinedCallees, [&](Function *Callee) {
+    if ((C != OldC || UR.CWorklist.count(OldC)) &&
+        llvm::any_of(InlinedCallees, [&](Function *Callee) {
           return CG.lookupSCC(*CG.lookup(*Callee)) == OldC;
         })) {
       LLVM_DEBUG(dbgs() << "Inlined an internal call edge and split an SCC, "
diff --git a/test/Transforms/Inline/cgscc-cycle.ll b/test/Transforms/Inline/cgscc-cycle.ll
index 69874c3ef2f..bc3bdc99fff 100644
--- a/test/Transforms/Inline/cgscc-cycle.ll
+++ b/test/Transforms/Inline/cgscc-cycle.ll
@@ -5,7 +5,7 @@
 ; some out-of-band way to prevent infinitely re-inlining and re-transforming the
 ; code.
 ;
-; RUN: opt < %s -passes='cgscc(inline,function(sroa,instcombine))' -S | FileCheck %s
+; RUN: opt < %s -passes='cgscc(inline,function(sroa,instcombine))' -inline-threshold=50 -S | FileCheck %s
 
 
 ; The `test1_*` collection of functions form a directly cycling pattern.
@@ -123,3 +123,110 @@ bb2:
 
   ret void
 }
+
+; Another infinite inlining case. The initial callgraph is like following:
+;
+;         test3_a <---> test3_b
+;             |         ^
+;             v         |
+;         test3_c <---> test3_d
+;
+; For all the call edges in the call graph, only test3_c and test3_d can be
+; inlined into test3_a, and no other call edge can be inlined.
+;
+; After test3_c is inlined into test3_a, the original call edge test3_a->test3_c
+; will be removed, a new call edge will be added and the call graph becomes:
+;
+;            test3_a <---> test3_b
+;                  \      ^
+;                   v    /
+;     test3_c <---> test3_d
+; But test3_a, test3_b, test3_c and test3_d still belong to the same SCC.
+;
+; Then after test3_a->test3_d is inlined, when test3_a->test3_d is converted to
+; a ref edge, the original SCC will be split into two: {test3_c, test3_d} and
+; {test3_a, test3_b}, immediately after the newly added ref edge
+; test3_a->test3_c will be converted to a call edge, and the two SCCs will be
+; merged into the original one again. During this cycle, the original SCC will
+; be added into UR.CWorklist again and this creates an infinite loop.
+
+@a = global i64 0
+@b = global i64 0
+
+define void @test3_c(i32 %i) {
+entry:
+  %cmp = icmp eq i32 %i, 5
+  br i1 %cmp, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %call = tail call i64 @random()
+  %t0 = load i64, i64* @a
+  %add = add nsw i64 %t0, %call
+  store i64 %add, i64* @a
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  tail call void @test3_d(i32 %i)
+  %t6 = load i64, i64* @a
+  %add85 = add nsw i64 %t6, 1
+  store i64 %add85, i64* @a
+  ret void
+}
+
+declare i64 @random()
+
+define void @test3_d(i32 %i) {
+entry:
+  %cmp = icmp eq i32 %i, 5
+  br i1 %cmp, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %call = tail call i64 @random()
+  %t0 = load i64, i64* @a
+  %add = add nsw i64 %t0, %call
+  store i64 %add, i64* @a
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  tail call void @test3_c(i32 %i)
+  tail call void @test3_b()
+  %t6 = load i64, i64* @a
+  %add79 = add nsw i64 %t6, 3
+  store i64 %add79, i64* @a
+  ret void
+}
+
+; Function Attrs: noinline
+define void @test3_b() #0 {
+entry:
+  tail call void @test3_a()
+  %t0 = load i64, i64* @a
+  %add = add nsw i64 %t0, 2
+  store i64 %add, i64* @a
+  ret void
+}
+
+; Check test3_c is inlined into test3_a once and only once.
+; CHECK-LABEL: @test3_a(
+; CHECK: tail call void @test3_b()
+; CHECK-NEXT: tail call void @test3_d(i32 5)
+; CHECK-NEXT: %[[LD1:.*]] = load i64, i64* @a
+; CHECK-NEXT: %[[ADD1:.*]] = add nsw i64 %[[LD1]], 1
+; CHECK-NEXT: store i64 %[[ADD1]], i64* @a
+; CHECK-NEXT: %[[LD2:.*]] = load i64, i64* @b
+; CHECK-NEXT: %[[ADD2:.*]] = add nsw i64 %[[LD2]], 5
+; CHECK-NEXT: store i64 %[[ADD2]], i64* @b
+; CHECK-NEXT: ret void
+
+; Function Attrs: noinline
+define void @test3_a() #0 {
+entry:
+  tail call void @test3_b()
+  tail call void @test3_c(i32 5)
+  %t0 = load i64, i64* @b
+  %add = add nsw i64 %t0, 5
+  store i64 %add, i64* @b
+  ret void
+}
+
+attributes #0 = { noinline }
diff --git a/test/Transforms/Inline/monster_scc.ll b/test/Transforms/Inline/monster_scc.ll
index 0f8f1f21c8b..b32a2aed331 100644
--- a/test/Transforms/Inline/monster_scc.ll
+++ b/test/Transforms/Inline/monster_scc.ll
@@ -154,11 +154,7 @@ if.end3:
 ; NEW-NOT: call
 ; NEW: call void @_Z1fILb1ELi2EEvPbS0_(
 ; NEW-NOT: call
-; NEW: call void @_Z1gi(
-; NEW-NOT: call
-; NEW: call void @_Z1fILb1ELi0EEvPbS0_(
-; NEW-NOT: call
-; NEW: call void @_Z1fILb0ELi0EEvPbS0_(
+; NEW: call void @_Z1fILb1ELi3EEvPbS0_(
 ; NEW-NOT: call
 ; NEW: call void @_Z1fILb0ELi3EEvPbS0_(
 ; NEW-NOT: call
@@ -198,19 +194,11 @@ if.end3:
 ; NEW-NOT: call
 ; NEW: call void @_Z1gi(
 ; NEW-NOT: call
-; NEW: call void @_Z1gi(
-; NEW-NOT: call
-; NEW: call void @_Z1fILb1ELi0EEvPbS0_(
-; NEW-NOT: call
-; NEW: call void @_Z1fILb0ELi0EEvPbS0_(
+; NEW: call void @_Z1fILb1ELi3EEvPbS0_(
 ; NEW-NOT: call
 ; NEW: call void @_Z1fILb0ELi3EEvPbS0_(
 ; NEW-NOT: call
-; NEW: call void @_Z1gi(
-; NEW-NOT: call
-; NEW: call void @_Z1fILb1ELi0EEvPbS0_(
-; NEW-NOT: call
-; NEW: call void @_Z1fILb0ELi0EEvPbS0_(
+; NEW: call void @_Z1fILb1ELi3EEvPbS0_(
 ; NEW-NOT: call
 ; NEW: call void @_Z1fILb0ELi3EEvPbS0_(
 ; NEW-NOT: call
@@ -260,7 +248,7 @@ if.end3:
 ; NEW-NOT: call
 ; NEW: call void @_Z1fILb1ELi4EEvPbS0_(
 ; NEW-NOT: call
-; NEW: call void @_Z1fILb0ELi0EEvPbS0_(
+; NEW: call void @_Z1fILb0ELi4EEvPbS0_(
 ; NEW-NOT: call
 define void @_Z1fILb0ELi2EEvPbS0_(i8* %B, i8* %E) {
 entry:
@@ -304,21 +292,13 @@ if.end3:
 ; NEW-NOT: call
 ; NEW: call void @_Z1gi(
 ; NEW-NOT: call
-; NEW: call void @_Z1gi(
-; NEW-NOT: call
-; NEW: call void @_Z1fILb1ELi1EEvPbS0_(
-; NEW-NOT: call
-; NEW: call void @_Z1fILb0ELi1EEvPbS0_(
-; NEW-NOT: call
-; NEW: call void @_Z1fILb0ELi0EEvPbS0_(
-; NEW-NOT: call
-; NEW: call void @_Z1gi(
+; NEW: call void @_Z1fILb1ELi4EEvPbS0_(
 ; NEW-NOT: call
-; NEW: call void @_Z1fILb1ELi1EEvPbS0_(
+; NEW: call void @_Z1fILb0ELi4EEvPbS0_(
 ; NEW-NOT: call
-; NEW: call void @_Z1fILb0ELi1EEvPbS0_(
+; NEW: call void @_Z1fILb1ELi4EEvPbS0_(
 ; NEW-NOT: call
-; NEW: call void @_Z1fILb0ELi0EEvPbS0_(
+; NEW: call void @_Z1fILb0ELi4EEvPbS0_(
 ; NEW-NOT: call
 define void @_Z1fILb1ELi2EEvPbS0_(i8* %B, i8* %E) {
 entry:
@@ -433,15 +413,7 @@ entry:
 ; NEW-NOT: call
 ; NEW: call void @_Z1fILb1ELi1EEvPbS0_(
 ; NEW-NOT: call
-; NEW: call void @_Z1fILb1ELi2EEvPbS0_(
-; NEW-NOT: call
-; NEW: call void @_Z1gi(
-; NEW-NOT: call
-; NEW: call void @_Z1fILb1ELi0EEvPbS0_(
-; NEW-NOT: call
-; NEW: call void @_Z1fILb0ELi0EEvPbS0_(
-; NEW-NOT: call
-; NEW: call void @_Z1fILb0ELi3EEvPbS0_(
+; NEW: call void @_Z1fILb0ELi1EEvPbS0_(
 ; NEW-NOT: call
 define void @_Z1fILb1ELi4EEvPbS0_(i8* %B, i8* %E) {
 entry:
-- 
GitLab


From bd26778075288e3b5929e1d222966ddb6e5e4947 Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Tue, 23 Oct 2018 23:35:43 +0000
Subject: [PATCH 0484/1116] [hurd] Make getMainExecutable get the real binary
 path

On GNU/Hurd, llvm-config is returning bogus value, such as:

$ llvm-config-6.0 --includedir
/usr/include

while it should be:
$ llvm-config-6.0 --includedir
/usr/lib/llvm-6.0/include

This is because getMainExecutable does not get the actual installation
path. On GNU/Hurd, /proc/self/exe is indeed a symlink to the path that
was used to start the program, and not the eventual binary file. Llvm's
getMainExecutable thus needs to run realpath over it to get the actual
place where llvm was installed (/usr/lib/llvm-6.0/bin/llvm-config), and
not /usr/bin/llvm-config-6.0. This will not change the result on Linux,
where /proc/self/exe already points to the eventual file.

Patch by Samuel Thibault!

While making changes here, I reformatted this block a bit to reduce
indentation and match 2 space indent style.

Differential Revision: https://reviews.llvm.org/D53557

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345104 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Support/Unix/Path.inc | 34 +++++++++++++++++++++++++++-------
 1 file changed, 27 insertions(+), 7 deletions(-)

diff --git a/lib/Support/Unix/Path.inc b/lib/Support/Unix/Path.inc
index 0f61e94145e..8f98dda52d7 100644
--- a/lib/Support/Unix/Path.inc
+++ b/lib/Support/Unix/Path.inc
@@ -179,14 +179,34 @@ std::string getMainExecutable(const char *argv0, void *MainAddr) {
   char exe_path[MAXPATHLEN];
   StringRef aPath("/proc/self/exe");
   if (sys::fs::exists(aPath)) {
-      // /proc is not always mounted under Linux (chroot for example).
-      ssize_t len = readlink(aPath.str().c_str(), exe_path, sizeof(exe_path));
-      if (len >= 0)
-          return std::string(exe_path, len);
+    // /proc is not always mounted under Linux (chroot for example).
+    ssize_t len = readlink(aPath.str().c_str(), exe_path, sizeof(exe_path));
+    if (len < 0)
+      return "";
+
+    // Null terminate the string for realpath. readlink never null
+    // terminates its output.
+    len = std::min(len, long(sizeof(exe_path) - 1));
+    exe_path[len] = '\0';
+
+    // At least on GNU/Hurd, /proc/self/exe is a symlink to the path that
+    // was used to start the program, and not the eventual binary file.
+    // We thus needs to run realpath over it to get the actual place
+    // where llvm was installed.
+#if _POSIX_VERSION >= 200112 || defined(__GLIBC__)
+    char *real_path = realpath(exe_path, NULL);
+    std::string ret = std::string(real_path);
+    free(real_path);
+    return ret;
+#else
+    char real_path[MAXPATHLEN];
+    realpath(exe_path, real_path);
+    return std::string(real_path);
+#endif
   } else {
-      // Fall back to the classical detection.
-      if (getprogpath(exe_path, argv0))
-        return exe_path;
+    // Fall back to the classical detection.
+    if (getprogpath(exe_path, argv0))
+      return exe_path;
   }
 #elif defined(HAVE_DLFCN_H) && defined(HAVE_DLADDR)
   // Use dladdr to get executable path if available.
-- 
GitLab


From 0253f7506fa808312119393a633d03cde96cbc8f Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Tue, 23 Oct 2018 23:44:44 +0000
Subject: [PATCH 0485/1116] Commit missing comment edit and use correct cast to
 fix std::min overload

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345105 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Support/Unix/Path.inc | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/lib/Support/Unix/Path.inc b/lib/Support/Unix/Path.inc
index 8f98dda52d7..02b7c2579c9 100644
--- a/lib/Support/Unix/Path.inc
+++ b/lib/Support/Unix/Path.inc
@@ -186,13 +186,13 @@ std::string getMainExecutable(const char *argv0, void *MainAddr) {
 
     // Null terminate the string for realpath. readlink never null
     // terminates its output.
-    len = std::min(len, long(sizeof(exe_path) - 1));
+    len = std::min(len, ssize_t(sizeof(exe_path) - 1));
     exe_path[len] = '\0';
 
-    // At least on GNU/Hurd, /proc/self/exe is a symlink to the path that
-    // was used to start the program, and not the eventual binary file.
-    // We thus needs to run realpath over it to get the actual place
-    // where llvm was installed.
+    // On Linux, /proc/self/exe always looks through symlinks. However, on
+    // GNU/Hurd, /proc/self/exe is a symlink to the path that was used to start
+    // the program, and not the eventual binary file. Therefore, call realpath
+    // so this behaves the same on all platforms.
 #if _POSIX_VERSION >= 200112 || defined(__GLIBC__)
     char *real_path = realpath(exe_path, NULL);
     std::string ret = std::string(real_path);
-- 
GitLab


From 9a2b84c9515b3b6ef82e4185d815519a42666df5 Mon Sep 17 00:00:00 2001
From: Saleem Abdulrasool <compnerd@compnerd.org>
Date: Wed, 24 Oct 2018 00:00:52 +0000
Subject: [PATCH 0486/1116] ARM: handle checking aliases with out-of-bounds
 GEPs

A global alias may use indices which are not considered in bounds.  In
such a case, accessing the base object will fail as it only peers
through inbounds accesses.  This pattern is used by the swift compiler
to create references to preceeding members in the type metadata.  This
would cause the code generation to fail when targeting a platform that
used ELF as the object file format.  Be conservative and fail the
read-only check if we run into an alias that we cannot peer through.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345107 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/ARM/ARMISelLowering.cpp   |  8 +++++---
 test/CodeGen/ARM/readonly-aliases.ll | 17 +++++++++++++++++
 2 files changed, 22 insertions(+), 3 deletions(-)
 create mode 100644 test/CodeGen/ARM/readonly-aliases.ll

diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 3527d049f50..2f4bc46f932 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -3171,9 +3171,11 @@ static SDValue promoteToConstantPool(const ARMTargetLowering *TLI,
 
 bool ARMTargetLowering::isReadOnly(const GlobalValue *GV) const {
   if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
-    GV = GA->getBaseObject();
-  return (isa<GlobalVariable>(GV) && cast<GlobalVariable>(GV)->isConstant()) ||
-         isa<Function>(GV);
+    if (!(GV = GA->getBaseObject()))
+      return false;
+  if (const auto *V = dyn_cast<GlobalVariable>(GV))
+    return V->isConstant();
+  return isa<Function>(GV);
 }
 
 SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op,
diff --git a/test/CodeGen/ARM/readonly-aliases.ll b/test/CodeGen/ARM/readonly-aliases.ll
new file mode 100644
index 00000000000..c90650d3a81
--- /dev/null
+++ b/test/CodeGen/ARM/readonly-aliases.ll
@@ -0,0 +1,17 @@
+; RUN: llc -mtriple thumbv7-unknown-linux-android -filetype asm -o - %s | FileCheck %s
+
+@a = protected constant <{ i32, i32 }> <{ i32 0, i32 0 }>
+@b = protected alias i32, getelementptr(i32, i32* getelementptr inbounds (<{ i32, i32 }>, <{ i32, i32 }>* @a, i32 0, i32 1), i32 -1)
+
+declare void @f(i32*)
+
+define void @g() {
+entry:
+  call void @f(i32* @b)
+  ret void
+}
+
+; CHECK-LABEL: g:
+; CHECK: movw [[REGISTER:r[0-9]+]], :lower16:b
+; CHECK: movt [[REGISTER]], :upper16:b
+
-- 
GitLab


From e10866a63f98531ff0ba1714b92b2624ed2a4f7c Mon Sep 17 00:00:00 2001
From: Sanjin Sijaric <ssijaric@codeaurora.org>
Date: Wed, 24 Oct 2018 00:03:34 +0000
Subject: [PATCH 0487/1116] [ARM64][Windows] Add unwind support to llvm-readobj

This patch adds support for dumping the unwind info from ARM64 COFF object
files.

Differential Revision: https://reviews.llvm.org/D53264


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345108 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Support/ARMWinEH.h               |  88 +++-
 test/tools/llvm-readobj/Inputs/arm64-win1.obj | Bin 0 -> 2063 bytes
 test/tools/llvm-readobj/Inputs/arm64-win2.obj | Bin 0 -> 956 bytes
 test/tools/llvm-readobj/arm64-win-error1.s    |  53 +++
 test/tools/llvm-readobj/arm64-win-error2.s    |  50 +++
 test/tools/llvm-readobj/arm64-win-error3.s    |  51 +++
 .../llvm-readobj/unwind-arm64-windows.test    |  69 +++
 tools/llvm-readobj/ARMWinEHPrinter.cpp        | 399 ++++++++++++++++--
 tools/llvm-readobj/ARMWinEHPrinter.h          |  51 ++-
 tools/llvm-readobj/COFFDumper.cpp             |   4 +-
 10 files changed, 711 insertions(+), 54 deletions(-)
 create mode 100755 test/tools/llvm-readobj/Inputs/arm64-win1.obj
 create mode 100755 test/tools/llvm-readobj/Inputs/arm64-win2.obj
 create mode 100644 test/tools/llvm-readobj/arm64-win-error1.s
 create mode 100644 test/tools/llvm-readobj/arm64-win-error2.s
 create mode 100644 test/tools/llvm-readobj/arm64-win-error3.s
 create mode 100644 test/tools/llvm-readobj/unwind-arm64-windows.test

diff --git a/include/llvm/Support/ARMWinEH.h b/include/llvm/Support/ARMWinEH.h
index 1463629f45d..4f05965ed25 100644
--- a/include/llvm/Support/ARMWinEH.h
+++ b/include/llvm/Support/ARMWinEH.h
@@ -207,6 +207,8 @@ std::pair<uint16_t, uint32_t> SavedRegisterMask(const RuntimeFunction &RF);
 
 /// ExceptionDataRecord - An entry in the table of exception data (.xdata)
 ///
+/// The format on ARM is:
+///
 ///  3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0
 ///  1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
 /// +-------+---------+-+-+-+---+-----------------------------------+
@@ -215,6 +217,16 @@ std::pair<uint16_t, uint32_t> SavedRegisterMask(const RuntimeFunction &RF);
 /// |    Reserved    |Ex. Code Words|   (Extended Epilogue Count)   |
 /// +-------+--------+--------------+-------------------------------+
 ///
+/// The format on ARM64 is:
+///
+///  3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0
+///  1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
+/// +---------+---------+-+-+---+-----------------------------------+
+/// |  C Wrd  | Epi Cnt |E|X|Ver|         Function Length           |
+/// +---------+------+--'-'-'---'---+-------------------------------+
+/// |    Reserved    |Ex. Code Words|   (Extended Epilogue Count)   |
+/// +-------+--------+--------------+-------------------------------+
+///
 /// Function Length : 18-bit field indicating the total length of the function
 ///                   in bytes divided by 2.  If a function is larger than
 ///                   512KB, then multiple pdata and xdata records must be used.
@@ -225,7 +237,7 @@ std::pair<uint16_t, uint32_t> SavedRegisterMask(const RuntimeFunction &RF);
 ///     header
 /// F : 1-bit field indicating that the record describes a function fragment
 ///     (implies that no prologue is present, and prologue processing should be
-///     skipped)
+///     skipped) (ARM only)
 /// Epilogue Count : 5-bit field that differs in meaning based on the E field.
 ///
 ///                  If E is set, then this field specifies the index of the
@@ -235,33 +247,43 @@ std::pair<uint16_t, uint32_t> SavedRegisterMask(const RuntimeFunction &RF);
 ///                  scopes.  If more than 31 scopes exist, then this field and
 ///                  the Code Words field must both be set to 0 to indicate that
 ///                  an extension word is required.
-/// Code Words : 4-bit field that species the number of 32-bit words needed to
-///              contain all the unwind codes.  If more than 15 words (63 code
-///              bytes) are required, then this field and the Epilogue Count
-///              field must both be set to 0 to indicate that an extension word
-///              is required.
+/// Code Words : 4-bit (5-bit on ARM64) field that specifies the number of
+///              32-bit words needed to contain all the unwind codes.  If more
+///              than 15 words (31 words on ARM64) are required, then this field
+///              and the Epilogue Count field must both be set to 0 to indicate
+///              that an extension word is required.
 /// Extended Epilogue Count, Extended Code Words :
 ///                          Valid only if Epilog Count and Code Words are both
 ///                          set to 0.  Provides an 8-bit extended code word
 ///                          count and 16-bits for epilogue count
 ///
+/// The epilogue scope format on ARM is:
+///
 ///  3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0
 ///  1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
 /// +----------------+------+---+---+-------------------------------+
 /// |  Ep Start Idx  | Cond |Res|       Epilogue Start Offset       |
 /// +----------------+------+---+-----------------------------------+
 ///
+/// The epilogue scope format on ARM64 is:
+///
+///  3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0
+///  1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
+/// +-------------------+-------+---+-------------------------------+
+/// |  Ep Start Idx     |  Res  |   Epilogue Start Offset           |
+/// +-------------------+-------+-----------------------------------+
+///
 /// If the E bit is unset in the header, the header is followed by a series of
 /// epilogue scopes, which are sorted by their offset.
 ///
 /// Epilogue Start Offset: 18-bit field encoding the offset of epilogue relative
 ///                        to the start of the function in bytes divided by two
 /// Res : 2-bit field reserved for future expansion (must be set to 0)
-/// Condition : 4-bit field providing the condition under which the epilogue is
-///             executed.  Unconditional epilogues should set this field to 0xe.
-///             Epilogues must be entirely conditional or unconditional, and in
-///             Thumb-2 mode.  The epilogue beings with the first instruction
-///             after the IT opcode.
+/// Condition : (ARM only) 4-bit field providing the condition under which the
+///             epilogue is executed.  Unconditional epilogues should set this
+///             field to 0xe. Epilogues must be entirely conditional or
+///             unconditional, and in Thumb-2 mode.  The epilogue begins with
+///             the first instruction after the IT opcode.
 /// Epilogue Start Index : 8-bit field indicating the byte index of the first
 ///                        unwind code describing the epilogue
 ///
@@ -293,18 +315,33 @@ struct EpilogueScope {
   const support::ulittle32_t ES;
 
   EpilogueScope(const support::ulittle32_t Data) : ES(Data) {}
+  // Same for both ARM and AArch64.
   uint32_t EpilogueStartOffset() const {
     return (ES & 0x0003ffff);
   }
-  uint8_t Res() const {
+
+  // Different implementations for ARM and AArch64.
+  uint8_t ResARM() const {
     return ((ES & 0x000c0000) >> 18);
   }
+
+  uint8_t ResAArch64() const {
+    return ((ES & 0x000f0000) >> 18);
+  }
+
+  // Condition is only applicable to ARM.
   uint8_t Condition() const {
     return ((ES & 0x00f00000) >> 20);
   }
-  uint8_t EpilogueStartIndex() const {
+
+  // Different implementations for ARM and AArch64.
+  uint8_t EpilogueStartIndexARM() const {
     return ((ES & 0xff000000) >> 24);
   }
+
+  uint8_t EpilogueStartIndexAArch64() const {
+    return ((ES & 0xffc00000) >> 22);
+  }
 };
 
 struct ExceptionDataRecord;
@@ -312,13 +349,23 @@ inline size_t HeaderWords(const ExceptionDataRecord &XR);
 
 struct ExceptionDataRecord {
   const support::ulittle32_t *Data;
+  bool isAArch64;
 
-  ExceptionDataRecord(const support::ulittle32_t *Data) : Data(Data) {}
+  ExceptionDataRecord(const support::ulittle32_t *Data, bool isAArch64) :
+    Data(Data), isAArch64(isAArch64) {}
 
   uint32_t FunctionLength() const {
     return (Data[0] & 0x0003ffff);
   }
 
+  uint32_t FunctionLengthInBytesARM() const {
+    return FunctionLength() << 1;
+  }
+
+  uint32_t FunctionLengthInBytesAArch64() const {
+    return FunctionLength() << 2;
+  }
+
   uint8_t Vers() const {
     return (Data[0] & 0x000C0000) >> 18;
   }
@@ -332,18 +379,25 @@ struct ExceptionDataRecord {
   }
 
   bool F() const {
+    assert(!isAArch64 && "Fragments are only supported on ARMv7 WinEH");
     return ((Data[0] & 0x00400000) >> 22);
   }
 
   uint8_t EpilogueCount() const {
-    if (HeaderWords(*this) == 1)
+    if (HeaderWords(*this) == 1) {
+      if (isAArch64)
+        return (Data[0] & 0x07C00000) >> 22;
       return (Data[0] & 0x0f800000) >> 23;
+    }
     return Data[1] & 0x0000ffff;
   }
 
   uint8_t CodeWords() const {
-    if (HeaderWords(*this) == 1)
+    if (HeaderWords(*this) == 1) {
+      if (isAArch64)
+        return (Data[0] & 0xf8000000) >> 27;
       return (Data[0] & 0xf0000000) >> 28;
+    }
     return (Data[1] & 0x00ff0000) >> 16;
   }
 
@@ -373,6 +427,8 @@ struct ExceptionDataRecord {
 };
 
 inline size_t HeaderWords(const ExceptionDataRecord &XR) {
+  if (XR.isAArch64)
+    return (XR.Data[0] & 0xffc0000) ? 1 : 2;
   return (XR.Data[0] & 0xff800000) ? 1 : 2;
 }
 }
diff --git a/test/tools/llvm-readobj/Inputs/arm64-win1.obj b/test/tools/llvm-readobj/Inputs/arm64-win1.obj
new file mode 100755
index 0000000000000000000000000000000000000000..025e1db6cce4a2efdc398d5149dab52ce132c250
GIT binary patch
literal 2063
zcmYdU#m3Mu@o;oHGXsM$0|e-$6s0DYl%+y=3=H}V3=AxcFdmfV5@66vNlhwER|$rS
zFk~<=Fc_n$b6^l~(n~5XhN%J3H$Vg$ZeVaYpjVPwQKFKY2a^h6WMG)h$iVQP2_nMA
zz`&p&;E<qKk&;-F2oVF33JeSk+RPvp0y_v8IOr9is^ef_VDM&UU<ih)1sMxcr=Xyq
z@8as_80r(^<LP9j<m2h&>>HwlD(LUy;^*k=8my$i!oa}rk%56BhJk@Wj)74SWT~rF
zOmT5$R$@_Ra!hJQOj2gLsa}3k7K0UojUXcv1H(54VFoUS4T%hJ+BY+~D8D#Atwcd1
zNK?VTpd>Rlvnn$$UBNj&w;(eowTR(!@Sc_bxEX%FSk9RHYCTgff8^{-{tOE~F)%Pp
z`CqJmnuCF1f-ob)!~`aXzh9Ucv_RtXxflx9vNMEau5-xEYI4lY5Oerjz?ms0$dx5m
zAm#Aah$}&^P|o3RFjt0Lk&?sT60QQdVl{`q1#S+1r*aj_m1sHqE%b8uyN#<zu2j$A
zZ;_wF-|Jk(a%Dyie~W`0{{G-9kt;WI_*)X@@K=(%RIb9x;csb_!(S`zGPz1ShreZU
z4u7M$%jK$^9R8LkIsC2Ru8^xpbND-#yHc()%i-@n?kc(a+y!#axeMhAnH>H;<0z6l
z%26!0grh{RnWI!LiK9%;k)vGBnxjIln!8f2iaS#-hC5qs0(X|2ELVcu0(TF2kUw7>
zW}nD$vt$PY0|z%V1B1u|W`>C%aab7rFV^pVwcaWB#d62of7}i~KL;OKaez7gngjy_
zg9ZZw12Y3C(SQ;J3xf&+0|SV~#$dp}z@Q42m0*Oi3>X<0)WBk)3=gayFx@)w`0?W#
ztv8r29nyFL;=<UELFoX7wHX)~m>57QMGwq!NY2kK(92BOkjTpT|3AonhqTnh5<LTh
z2@DJk5E0~T2g*gvjG&AMGZ8`~XE{*vWo86rI*1&MM9yxYl+MhEnc?hH3>_Sd{TxAh
zKx&v6>{E=uBA}3fi5P=L1i-!r`NuxR1S|pyMVJ~>um~tLVIpQ=5lARQbeV%iK%om$
zV*wViKr-19EaCtc0l5qmejX?yd<+Z>0VpCMw?v?bC^0ZFB%p}sFff1%Cy4DJlg$_y
z7z$9-fNZWn5%FSRU}!)Q31MJh=s*#PV_;yIfFc5lj~OT;MGOoK3s6LA7#J8<pop|F
zFfeRD5doRH14U#O0|UbW6p<wi3=AhwMAk7dFkFC(AQz~huw-Tg6{(<N)c5GhDh38n
zDg%WE$Q}^Qz@WeYN>vOD$01x$feRJ62^C=htBH?K&d4q<$%Y7mbgB6G8G(2@j11s(
z&&HtQ;|H!$OprxD0S{6JRfAlVvx3~s2uevH1z)T?Zn7|d#6Y~~AcBDb%x7jmF4#f7
zVP*uST8PU*w!?J&h3W#UVTe!5DJd;ZO@RtBFfj-*GBDVu7#e_+9*70a6!s~GkjwxQ
zgJc2_3zGOjEJ&gUu^@>Z#DXMp5DSvHK`cn32IVcd9Y%0FjNo<{!R;`D+hGK^!w7DN
z5zLOX(!69N2Zu;U4?l+}273_K7|acDb@YIVfCWH2mD0TO%)As8usVb&xOPMn1OS7X
BZ^r-t

literal 0
HcmV?d00001

diff --git a/test/tools/llvm-readobj/Inputs/arm64-win2.obj b/test/tools/llvm-readobj/Inputs/arm64-win2.obj
new file mode 100755
index 0000000000000000000000000000000000000000..7e506eedda6549fdd4d576972a182fcca358b453
GIT binary patch
literal 956
zcmYdU#mZ2%{!lb069a=V0|e-$6s0DYl%+y=3=H}V3=DT*92m_dz@V3snpB#u5)2c{
zU|?Y2W`yw|v;%{HlU_+`MTtso9z=wJ!GnQ;p_q|@VK-DQGXn#If`CJUUPVe`Ng_-Y
z1A_<y1H(<I8kneqfPsTv0jfF<1_p+=j0_B4p=y~J7#Kk66ciNnU0mH9Lw!PgJe{nR
zd_0|;eM6K`1^s<o{2YB<gOwCm7#J8nGB7a2FfcI4F)#{(taP=CDK5^;N-WAuj!Dgk
zNy<z&(aTTDVz6Sc5oBayVED!$%)rI4ArTaK5ZX60xhTIlKdnSTBS=%hzn~;DH?t};
zFI~YoKer$=C$)&-zca(d&(#bo{~u<*_?ekOi-Cb*N<9}t;aLWT5D@$SVfKlis~uMU
zcV?J)fI0r!17?Pa*u(`G7#KiaXJ(LrvrHHm7&sXi7?>G^7#ajXvV05-4D#0)7@s_T
z{21gT80KMMU|?bZ`I#Tga!Ah4EzrwM*^tP}`2Ro11c$WL#1cINg9!`_4G<CJgaHa`
zW=2rbfSCxPkrM?dBA6LL$pRt=BassXDDs&ZK}iC{dO!K6Ap--*Y>--z>p(OEg8~D{
zECvQ42p5!Kpdu<z5oWL&6(2t%kbnU=3>X;L7*u@x%s?U_46=cZ0Xdn0Y+z;tB{Yz{
zd8OcUW(HOelYxN&Y!CwjIE<JXkP{t91v4WkbRn(+g%(H*q!Hv35DnrpGsH45FxaP+
u<|P|BI7B*nL_|S&#$aB6tD}blR0J#l;;EG8m1pLqsDO1LM8QcOO%MRs!JtwA

literal 0
HcmV?d00001

diff --git a/test/tools/llvm-readobj/arm64-win-error1.s b/test/tools/llvm-readobj/arm64-win-error1.s
new file mode 100644
index 00000000000..ba59edf3dea
--- /dev/null
+++ b/test/tools/llvm-readobj/arm64-win-error1.s
@@ -0,0 +1,53 @@
+## Check that error handling for bad opcodes works.
+## .xdata below contains the bad opcode 0xdf in the 4th word of .xdata.
+
+// REQUIRES: aarch64-registered-target
+// RUN: llvm-mc -filetype=obj -triple aarch64-windows %s -o - \
+// RUN:   | llvm-readobj -unwind - | FileCheck %s
+
+// CHECK:     Prologue [
+// CHECK:        0xdf                ; Bad opcode!
+// CHECK:        0xd600              ; stp x19, lr, [sp, #0]
+// CHECK:        0x01                ; sub sp, #16
+// CHECK:        0xe4                ; end
+// CHECK:     ]
+
+	.text
+	.globl	"?func@@YAHXZ"
+	.p2align	3
+"?func@@YAHXZ":
+	sub     sp,sp,#0x10
+	stp     x19,lr,[sp]
+	sub     sp,sp,#0x1F0
+	mov     w19,w0
+	bl	"?func2@@YAXXZ"
+	cmp     w19,#2
+	ble     .LBB0_1
+	bl      "?func2@@YAHXZ"
+	add      sp,sp,#0x1F0
+	ldp      x19,lr,[sp]
+	add      sp,sp,#0x10
+	ret
+.LBB0_1:
+	mov      x0,sp
+	bl       "?func3@@YAHPEAH@Z"
+	add      sp,sp,#0x1F0
+	ldp      x19,lr,[sp]
+	add      sp,sp,#0x10
+	ret
+
+
+.section .pdata,"dr"
+	.long "?func@@YAHXZ"@IMGREL
+        .long "$unwind$func@@YAHXZ"@IMGREL
+
+
+.section	.xdata,"dr"
+"$unwind$func@@YAHXZ":
+        .p2align	3
+	.long		0x10800012
+	.long 		0x8
+	.long 		0xe
+	.long 		0x100d6df
+	.long 		0xe3e3e3e4
+
diff --git a/test/tools/llvm-readobj/arm64-win-error2.s b/test/tools/llvm-readobj/arm64-win-error2.s
new file mode 100644
index 00000000000..93c461de8ee
--- /dev/null
+++ b/test/tools/llvm-readobj/arm64-win-error2.s
@@ -0,0 +1,50 @@
+## Check that the sanity check for an inconsistent header works.
+## The first word contains the bad value for CodeWords, 0xf, which indicates
+## that we need 0x11110 << 2 =  120 bytes of space for the unwind codes.
+## It follows that the .xdata section is badly formed as only 8 bytes are
+## allocated for the unwind codes.
+
+// REQUIRES: aarch64-registered-target
+// RUN: llvm-mc -filetype=obj -triple aarch64-windows %s -o - \
+// RUN:   | not llvm-readobj -unwind - 2>&1 | FileCheck %s
+
+// CHECK: LLVM ERROR: Malformed unwind data
+
+	.text
+	.globl	"?func@@YAHXZ"
+	.p2align	3
+"?func@@YAHXZ":
+	sub     sp,sp,#0x10
+	stp     x19,lr,[sp]
+	sub     sp,sp,#0x1F0
+	mov     w19,w0
+	bl	"?func2@@YAXXZ"
+	cmp     w19,#2
+	ble     .LBB0_1
+	bl      "?func2@@YAHXZ"
+	add      sp,sp,#0x1F0
+	ldp      x19,lr,[sp]
+	add      sp,sp,#0x10
+	ret
+.LBB0_1:
+	mov      x0,sp
+	bl       "?func3@@YAHPEAH@Z"
+	add      sp,sp,#0x1F0
+	ldp      x19,lr,[sp]
+	add      sp,sp,#0x10
+	ret
+
+.section .pdata,"dr"
+	.long "?func@@YAHXZ"@IMGREL
+        .long "$unwind$func@@YAHXZ"@IMGREL
+
+
+.section	.xdata,"dr"
+"$unwind$func@@YAHXZ":
+        .p2align	3
+	.long		0xf0800012
+	.long 		0x8
+	.long 		0xe
+	.long 		0x100d61f
+	.long 		0xe3e3e3e4
+
diff --git a/test/tools/llvm-readobj/arm64-win-error3.s b/test/tools/llvm-readobj/arm64-win-error3.s
new file mode 100644
index 00000000000..5cbc3d7c585
--- /dev/null
+++ b/test/tools/llvm-readobj/arm64-win-error3.s
@@ -0,0 +1,51 @@
+## Check that error handling for going past the unwind data works.
+## .xdata below contains bad opcodes in the last word.  The last byte, 0xe0,
+## indicates that we have come across alloc_l, which requires 4 bytes. In this
+## case, unwind code processing will go past the allocated unwind data.
+
+// REQUIRES: aarch64-registered-target
+// RUN: llvm-mc -filetype=obj -triple aarch64-windows %s -o - \
+// RUN:   | llvm-readobj -unwind - | FileCheck %s
+
+// CHECK: Prologue [
+// CHECK:   Opcode 0xe0 goes past the unwind data
+
+	.text
+	.globl	"?func@@YAHXZ"
+	.p2align	3
+"?func@@YAHXZ":
+	sub     sp,sp,#0x10
+	stp     x19,lr,[sp]
+	sub     sp,sp,#0x1F0
+	mov     w19,w0
+	bl	"?func2@@YAXXZ"
+	cmp     w19,#2
+	ble     .LBB0_1
+	bl      "?func2@@YAHXZ"
+	add      sp,sp,#0x1F0
+	ldp      x19,lr,[sp]
+	add      sp,sp,#0x10
+	ret
+.LBB0_1:
+	mov      x0,sp
+	bl       "?func3@@YAHPEAH@Z"
+	add      sp,sp,#0x1F0
+	ldp      x19,lr,[sp]
+	add      sp,sp,#0x10
+	ret
+
+
+.section .pdata,"dr"
+	.long "?func@@YAHXZ"@IMGREL
+        .long "$unwind$func@@YAHXZ"@IMGREL
+
+
+.section	.xdata,"dr"
+"$unwind$func@@YAHXZ":
+        .p2align	3
+	.long		0x10800012
+	.long 		0x8
+	.long 		0xe
+	.long 		0x100d61f
+	.long 		0xe0000000
+
diff --git a/test/tools/llvm-readobj/unwind-arm64-windows.test b/test/tools/llvm-readobj/unwind-arm64-windows.test
new file mode 100644
index 00000000000..879afe27efb
--- /dev/null
+++ b/test/tools/llvm-readobj/unwind-arm64-windows.test
@@ -0,0 +1,69 @@
+RUN: llvm-readobj -unwind %p/Inputs/arm64-win1.obj | FileCheck %s -check-prefix=UNWIND1
+RUN: llvm-readobj -unwind %p/Inputs/arm64-win2.obj | FileCheck %s -check-prefix=UNWIND2
+
+UNWIND1:         ExceptionData {
+UNWIND1-NEXT:      FunctionLength: 340
+UNWIND1-NEXT:      Version: 0
+UNWIND1-NEXT:      ExceptionData: No
+UNWIND1-NEXT:      EpiloguePacked: Yes
+UNWIND1-NEXT:      EpilogueOffset: 15
+UNWIND1-NEXT:      ByteCodeLength: 28
+UNWIND1-NEXT:      Prologue [
+UNWIND1-NEXT:        0xe002dac8          ; sub sp, #2993280
+UNWIND1-NEXT:        0xe3                ; nop
+UNWIND1-NEXT:        0xe3                ; nop
+UNWIND1-NEXT:        0xe3                ; nop
+UNWIND1-NEXT:        0xd885              ; stp d10, d11, [sp, #40]
+UNWIND1-NEXT:        0xd803              ; stp d8, d9, [sp, #24]
+UNWIND1-NEXT:        0xd2c2              ; str x30, [sp, #16]
+UNWIND1-NEXT:        0x28                ; stp x19, x20, [sp, #-64]!
+UNWIND1-NEXT:        0xe4                ; end
+UNWIND1-NEXT:      ]
+UNWIND1-NEXT:      Epilogue [
+UNWIND1-NEXT:        0xe002dac8          ; add sp, #2993280
+UNWIND1-NEXT:        0xd885              ; ldp d10, d11, [sp, #40]
+UNWIND1-NEXT:        0xd803              ; ldp d8, d9, [sp, #24]
+UNWIND1-NEXT:        0xd2c2              ; ldr x30, [sp, #16]
+UNWIND1-NEXT:        0x28                ; ldp x19, x20, [sp], #64
+UNWIND1-NEXT:        0xe4                ; end
+UNWIND1-NEXT:      ]
+UNWIND1_NEXT:    }
+
+
+UNWIND2:         ExceptionData {
+UNWIND2-NEXT:      FunctionLength: 72
+UNWIND2-NEXT:      Version: 0
+UNWIND2-NEXT:      ExceptionData: No
+UNWIND2-NEXT:      EpiloguePacked: No
+UNWIND2-NEXT:      EpilogueScopes: 2
+UNWIND2-NEXT:      ByteCodeLength: 8
+UNWIND2-NEXT:      Prologue [
+UNWIND2-NEXT:        0x1f                ; sub sp, #496
+UNWIND2-NEXT:        0xd600              ; stp x19, lr, [sp, #0]
+UNWIND2-NEXT:        0x01                ; sub sp, #16
+UNWIND2-NEXT:        0xe4                ; end
+UNWIND2-NEXT:      ]
+UNWIND2-NEXT:      EpilogueScopes [
+UNWIND2-NEXT:        EpilogueScope {
+UNWIND2-NEXT:          StartOffset: 8
+UNWIND2-NEXT:          EpilogueStartIndex: 0
+UNWIND2-NEXT:          Opcodes [
+UNWIND2-NEXT:            0x1f                ; add sp, #496
+UNWIND2-NEXT:            0xd600              ; ldp x19, lr, [sp, #0]
+UNWIND2-NEXT:            0x01                ; add sp, #16
+UNWIND2-NEXT:            0xe4                ; end
+UNWIND2-NEXT:          ]
+UNWIND2-NEXT:        }
+UNWIND2-NEXT:        EpilogueScope {
+UNWIND2-NEXT:          StartOffset: 14
+UNWIND2-NEXT:          EpilogueStartIndex: 0
+UNWIND2-NEXT:          Opcodes [
+UNWIND2-NEXT:            0x1f                ; add sp, #496
+UNWIND2-NEXT:            0xd600              ; ldp x19, lr, [sp, #0]
+UNWIND2-NEXT:            0x01                ; add sp, #16
+UNWIND2-NEXT:            0xe4                ; end
+UNWIND2-NEXT:          ]
+UNWIND2-NEXT:        }
+UNWIND2-NEXT:      ]
+UNWIND2-NEXT:    }
+
diff --git a/tools/llvm-readobj/ARMWinEHPrinter.cpp b/tools/llvm-readobj/ARMWinEHPrinter.cpp
index a90840b22c8..56dd6c0aed4 100644
--- a/tools/llvm-readobj/ARMWinEHPrinter.cpp
+++ b/tools/llvm-readobj/ARMWinEHPrinter.cpp
@@ -118,31 +118,57 @@ const size_t Decoder::PDataEntrySize = sizeof(RuntimeFunction);
 
 // TODO name the uops more appropriately
 const Decoder::RingEntry Decoder::Ring[] = {
-  { 0x80, 0x00, &Decoder::opcode_0xxxxxxx },  // UOP_STACK_FREE (16-bit)
-  { 0xc0, 0x80, &Decoder::opcode_10Lxxxxx },  // UOP_POP (32-bit)
-  { 0xf0, 0xc0, &Decoder::opcode_1100xxxx },  // UOP_STACK_SAVE (16-bit)
-  { 0xf8, 0xd0, &Decoder::opcode_11010Lxx },  // UOP_POP (16-bit)
-  { 0xf8, 0xd8, &Decoder::opcode_11011Lxx },  // UOP_POP (32-bit)
-  { 0xf8, 0xe0, &Decoder::opcode_11100xxx },  // UOP_VPOP (32-bit)
-  { 0xfc, 0xe8, &Decoder::opcode_111010xx },  // UOP_STACK_FREE (32-bit)
-  { 0xfe, 0xec, &Decoder::opcode_1110110L },  // UOP_POP (16-bit)
-  { 0xff, 0xee, &Decoder::opcode_11101110 },  // UOP_MICROSOFT_SPECIFIC (16-bit)
+  { 0x80, 0x00, 1, &Decoder::opcode_0xxxxxxx },  // UOP_STACK_FREE (16-bit)
+  { 0xc0, 0x80, 2, &Decoder::opcode_10Lxxxxx },  // UOP_POP (32-bit)
+  { 0xf0, 0xc0, 1, &Decoder::opcode_1100xxxx },  // UOP_STACK_SAVE (16-bit)
+  { 0xf8, 0xd0, 1, &Decoder::opcode_11010Lxx },  // UOP_POP (16-bit)
+  { 0xf8, 0xd8, 1, &Decoder::opcode_11011Lxx },  // UOP_POP (32-bit)
+  { 0xf8, 0xe0, 1, &Decoder::opcode_11100xxx },  // UOP_VPOP (32-bit)
+  { 0xfc, 0xe8, 2, &Decoder::opcode_111010xx },  // UOP_STACK_FREE (32-bit)
+  { 0xfe, 0xec, 2, &Decoder::opcode_1110110L },  // UOP_POP (16-bit)
+  { 0xff, 0xee, 2, &Decoder::opcode_11101110 },  // UOP_MICROSOFT_SPECIFIC (16-bit)
                                               // UOP_PUSH_MACHINE_FRAME
                                               // UOP_PUSH_CONTEXT
                                               // UOP_PUSH_TRAP_FRAME
                                               // UOP_REDZONE_RESTORE_LR
-  { 0xff, 0xef, &Decoder::opcode_11101111 },  // UOP_LDRPC_POSTINC (32-bit)
-  { 0xff, 0xf5, &Decoder::opcode_11110101 },  // UOP_VPOP (32-bit)
-  { 0xff, 0xf6, &Decoder::opcode_11110110 },  // UOP_VPOP (32-bit)
-  { 0xff, 0xf7, &Decoder::opcode_11110111 },  // UOP_STACK_RESTORE (16-bit)
-  { 0xff, 0xf8, &Decoder::opcode_11111000 },  // UOP_STACK_RESTORE (16-bit)
-  { 0xff, 0xf9, &Decoder::opcode_11111001 },  // UOP_STACK_RESTORE (32-bit)
-  { 0xff, 0xfa, &Decoder::opcode_11111010 },  // UOP_STACK_RESTORE (32-bit)
-  { 0xff, 0xfb, &Decoder::opcode_11111011 },  // UOP_NOP (16-bit)
-  { 0xff, 0xfc, &Decoder::opcode_11111100 },  // UOP_NOP (32-bit)
-  { 0xff, 0xfd, &Decoder::opcode_11111101 },  // UOP_NOP (16-bit) / END
-  { 0xff, 0xfe, &Decoder::opcode_11111110 },  // UOP_NOP (32-bit) / END
-  { 0xff, 0xff, &Decoder::opcode_11111111 },  // UOP_END
+  { 0xff, 0xef, 2, &Decoder::opcode_11101111 },  // UOP_LDRPC_POSTINC (32-bit)
+  { 0xff, 0xf5, 2, &Decoder::opcode_11110101 },  // UOP_VPOP (32-bit)
+  { 0xff, 0xf6, 2, &Decoder::opcode_11110110 },  // UOP_VPOP (32-bit)
+  { 0xff, 0xf7, 3, &Decoder::opcode_11110111 },  // UOP_STACK_RESTORE (16-bit)
+  { 0xff, 0xf8, 4, &Decoder::opcode_11111000 },  // UOP_STACK_RESTORE (16-bit)
+  { 0xff, 0xf9, 3, &Decoder::opcode_11111001 },  // UOP_STACK_RESTORE (32-bit)
+  { 0xff, 0xfa, 4, &Decoder::opcode_11111010 },  // UOP_STACK_RESTORE (32-bit)
+  { 0xff, 0xfb, 1, &Decoder::opcode_11111011 },  // UOP_NOP (16-bit)
+  { 0xff, 0xfc, 1, &Decoder::opcode_11111100 },  // UOP_NOP (32-bit)
+  { 0xff, 0xfd, 1, &Decoder::opcode_11111101 },  // UOP_NOP (16-bit) / END
+  { 0xff, 0xfe, 1, &Decoder::opcode_11111110 },  // UOP_NOP (32-bit) / END
+  { 0xff, 0xff, 1, &Decoder::opcode_11111111 },  // UOP_END
+};
+
+
+// Unwind opcodes for ARM64.
+// https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling
+const Decoder::RingEntry Decoder::Ring64[] = {
+  { 0xe0, 0x00, 1, &Decoder::opcode_alloc_s },
+  { 0xe0, 0x20, 1, &Decoder::opcode_save_r19r20_x },
+  { 0xc0, 0x40, 1, &Decoder::opcode_save_fplr },
+  { 0xc0, 0x80, 1, &Decoder::opcode_save_fplr_x },
+  { 0xf8, 0xc0, 2, &Decoder::opcode_alloc_m },
+  { 0xfc, 0xc8, 2, &Decoder::opcode_save_regp },
+  { 0xfc, 0xcc, 2, &Decoder::opcode_save_regp_x },
+  { 0xfc, 0xd0, 2, &Decoder::opcode_save_reg },
+  { 0xfe, 0xd4, 2, &Decoder::opcode_save_reg_x },
+  { 0xfe, 0xd6, 2, &Decoder::opcode_save_lrpair },
+  { 0xfe, 0xd8, 2, &Decoder::opcode_save_fregp },
+  { 0xfe, 0xda, 2, &Decoder::opcode_save_fregp_x },
+  { 0xfe, 0xdc, 2, &Decoder::opcode_save_freg },
+  { 0xff, 0xde, 2, &Decoder::opcode_save_freg_x },
+  { 0xff, 0xe0, 4, &Decoder::opcode_alloc_l },
+  { 0xff, 0xe1, 1, &Decoder::opcode_setfp },
+  { 0xff, 0xe2, 2, &Decoder::opcode_addfp },
+  { 0xff, 0xe3, 1, &Decoder::opcode_nop },
+  { 0xff, 0xe4, 1, &Decoder::opcode_end },
+  { 0xff, 0xe5, 1, &Decoder::opcode_end_c },
 };
 
 void Decoder::printRegisters(const std::pair<uint16_t, uint32_t> &RegisterMask) {
@@ -493,18 +519,291 @@ bool Decoder::opcode_11111111(const uint8_t *OC, unsigned &Offset,
   return true;
 }
 
+// ARM64 unwind codes start here.
+bool Decoder::opcode_alloc_s(const uint8_t *OC, unsigned &Offset,
+                             unsigned Length, bool Prologue) {
+  uint32_t NumBytes = (OC[Offset] & 0x1F) << 4;
+  SW.startLine() << format("0x%02x                ; %s sp, #%u\n", OC[Offset],
+                           static_cast<const char *>(Prologue ? "sub" : "add"),
+                           NumBytes);
+  ++Offset;
+  return false;
+}
+
+bool Decoder::opcode_save_r19r20_x(const uint8_t *OC, unsigned &Offset,
+                                   unsigned Length, bool Prologue) {
+  uint32_t Off = (OC[Offset] & 0x1F) << 3;
+  if (Prologue)
+    SW.startLine() << format(
+        "0x%02x                ; stp x19, x20, [sp, #-%u]!\n", OC[Offset], Off);
+  else
+    SW.startLine() << format(
+        "0x%02x                ; ldp x19, x20, [sp], #%u\n", OC[Offset], Off);
+  ++Offset;
+  return false;
+}
+
+bool Decoder::opcode_save_fplr(const uint8_t *OC, unsigned &Offset,
+                               unsigned Length, bool Prologue) {
+  uint32_t Off = (OC[Offset] & 0x3F) << 3;
+  SW.startLine() << format(
+      "0x%02x                ; %s x29, x30, [sp, #%u]\n", OC[Offset],
+      static_cast<const char *>(Prologue ? "stp" : "ldp"), Off);
+  ++Offset;
+  return false;
+}
+
+bool Decoder::opcode_save_fplr_x(const uint8_t *OC, unsigned &Offset,
+                                 unsigned Length, bool Prologue) {
+  uint32_t Off = ((OC[Offset] & 0x3F) + 1) << 3;
+  if (Prologue)
+    SW.startLine() << format(
+        "0x%02x                ; stp x29, x30, [sp, #-%u]!\n", OC[Offset], Off);
+  else
+    SW.startLine() << format(
+        "0x%02x                ; ldp x29, x30, [sp], #%u\n", OC[Offset], Off);
+  ++Offset;
+  return false;
+}
+
+bool Decoder::opcode_alloc_m(const uint8_t *OC, unsigned &Offset,
+                             unsigned Length, bool Prologue) {
+  uint32_t NumBytes = ((OC[Offset] & 0x07) << 8);
+  NumBytes |= (OC[Offset + 1] & 0xFF);
+  NumBytes <<= 4;
+  SW.startLine() << format("0x%02x%02x              ; %s sp, #%u\n",
+                           OC[Offset], OC[Offset + 1],
+                           static_cast<const char *>(Prologue ? "sub" : "add"),
+                           NumBytes);
+  Offset += 2;
+  return false;
+}
+
+bool Decoder::opcode_save_regp(const uint8_t *OC, unsigned &Offset,
+                               unsigned Length, bool Prologue) {
+  uint32_t Reg = ((OC[Offset] & 0x03) << 8);
+  Reg |= (OC[Offset + 1] & 0xC0);
+  Reg >>= 6;
+  Reg += 19;
+  uint32_t Off = (OC[Offset + 1] & 0x3F) << 3;
+  SW.startLine() << format(
+      "0x%02x%02x              ; %s x%u, x%u, [sp, #%u]\n",
+      OC[Offset], OC[Offset + 1],
+      static_cast<const char *>(Prologue ? "stp" : "ldp"), Reg, Reg + 1, Off);
+  Offset += 2;
+  return false;
+}
+
+bool Decoder::opcode_save_regp_x(const uint8_t *OC, unsigned &Offset,
+                                 unsigned Length, bool Prologue) {
+  uint32_t Reg = ((OC[Offset] & 0x03) << 8);
+  Reg |= (OC[Offset + 1] & 0xC0);
+  Reg >>= 6;
+  Reg += 19;
+  uint32_t Off = ((OC[Offset + 1] & 0x3F) + 1) << 3;
+  if (Prologue)
+    SW.startLine() << format(
+        "0x%02x%02x              ; stp x%u, x%u, [sp, #-%u]!\n",
+        OC[Offset], OC[Offset + 1], Reg,
+        Reg + 1, Off);
+  else
+    SW.startLine() << format(
+        "0x%02x%02x              ; ldp x%u, x%u, [sp], #%u\n",
+        OC[Offset], OC[Offset + 1], Reg,
+        Reg + 1, Off);
+  Offset += 2;
+  return false;
+}
+
+bool Decoder::opcode_save_reg(const uint8_t *OC, unsigned &Offset,
+                              unsigned Length, bool Prologue) {
+  uint32_t Reg = (OC[Offset] & 0x03) << 8;
+  Reg |= (OC[Offset + 1] & 0xC0);
+  Reg >>= 6;
+  Reg += 19;
+  uint32_t Off = (OC[Offset + 1] & 0x3F) << 3;
+  SW.startLine() << format("0x%02x%02x              ; %s x%u, [sp, #%u]\n",
+                           OC[Offset], OC[Offset + 1],
+                           static_cast<const char *>(Prologue ? "str" : "ldr"),
+                           Reg, Off);
+  Offset += 2;
+  return false;
+}
+
+bool Decoder::opcode_save_reg_x(const uint8_t *OC, unsigned &Offset,
+                                unsigned Length, bool Prologue) {
+  uint32_t Reg = (OC[Offset] & 0x01) << 8;
+  Reg |= (OC[Offset + 1] & 0xE0);
+  Reg >>= 5;
+  Reg += 19;
+  uint32_t Off = ((OC[Offset + 1] & 0x1F) + 1) << 3;
+  if (Prologue)
+    SW.startLine() << format("0x%02x%02x              ; str x%u, [sp, #%u]!\n",
+                             OC[Offset], OC[Offset + 1], Reg, Off);
+  else
+    SW.startLine() << format("0x%02x%02x              ; ldr x%u, [sp], #%u\n",
+                             OC[Offset], OC[Offset + 1], Reg, Off);
+  Offset += 2;
+  return false;
+}
+
+bool Decoder::opcode_save_lrpair(const uint8_t *OC, unsigned &Offset,
+                                 unsigned Length, bool Prologue) {
+  uint32_t Reg = (OC[Offset] & 0x01) << 8;
+  Reg |= (OC[Offset + 1] & 0xC0);
+  Reg >>= 6;
+  Reg *= 2;
+  Reg += 19;
+  uint32_t Off = (OC[Offset + 1] & 0x3F) << 3;
+  SW.startLine() << format("0x%02x%02x              ; %s x%u, lr, [sp, #%u]\n",
+                           OC[Offset], OC[Offset + 1],
+                           static_cast<const char *>(Prologue ? "stp" : "ldp"),
+                           Reg, Off);
+  Offset += 2;
+  return false;
+}
+
+bool Decoder::opcode_save_fregp(const uint8_t *OC, unsigned &Offset,
+                                unsigned Length, bool Prologue) {
+  uint32_t Reg = (OC[Offset] & 0x01) << 8;
+  Reg |= (OC[Offset + 1] & 0xC0);
+  Reg >>= 6;
+  Reg += 8;
+  uint32_t Off = (OC[Offset + 1] & 0x3F) << 3;
+  SW.startLine() << format("0x%02x%02x              ; %s d%u, d%u, [sp, #%u]\n",
+                           OC[Offset], OC[Offset + 1],
+                           static_cast<const char *>(Prologue ? "stp" : "ldp"),
+                           Reg, Reg + 1, Off);
+  Offset += 2;
+  return false;
+}
+
+bool Decoder::opcode_save_fregp_x(const uint8_t *OC, unsigned &Offset,
+                                  unsigned Length, bool Prologue) {
+  uint32_t Reg = (OC[Offset] & 0x01) << 8;
+  Reg |= (OC[Offset + 1] & 0xC0);
+  Reg >>= 6;
+  Reg += 8;
+  uint32_t Off = ((OC[Offset + 1] & 0x3F) + 1) << 3;
+  if (Prologue)
+    SW.startLine() << format(
+        "0x%02x%02x              ; stp d%u, d%u, [sp, #-%u]!\n", OC[Offset],
+        OC[Offset + 1], Reg, Reg + 1, Off);
+  else
+    SW.startLine() << format(
+        "0x%02x%02x              ; ldp d%u, d%u, [sp], #%u\n", OC[Offset],
+        OC[Offset + 1], Reg, Reg + 1, Off);
+  Offset += 2;
+  return false;
+}
+
+bool Decoder::opcode_save_freg(const uint8_t *OC, unsigned &Offset,
+                               unsigned Length, bool Prologue) {
+  uint32_t Reg = (OC[Offset] & 0x01) << 8;
+  Reg |= (OC[Offset + 1] & 0xC0);
+  Reg >>= 6;
+  Reg += 8;
+  uint32_t Off = (OC[Offset + 1] & 0x3F) << 3;
+  SW.startLine() << format("0x%02x%02x                ; %s d%u, [sp, #%u]\n",
+                           OC[Offset], OC[Offset + 1],
+                           static_cast<const char *>(Prologue ? "str" : "ldr"),
+                           Reg, Off);
+  Offset += 2;
+  return false;
+}
+
+bool Decoder::opcode_save_freg_x(const uint8_t *OC, unsigned &Offset,
+                                 unsigned Length, bool Prologue) {
+  uint32_t Reg = ((OC[Offset + 1] & 0xE0) >> 5) + 8;
+  uint32_t Off = ((OC[Offset + 1] & 0x1F) + 1) << 3;
+  if (Prologue)
+    SW.startLine() << format(
+        "0x%02x%02x              ; str d%u, [sp, #-%u]!\n", OC[Offset],
+        OC[Offset + 1], Reg, Off);
+  else
+    SW.startLine() << format(
+        "0x%02x%02x              ; ldr d%u, [sp], #%u\n", OC[Offset],
+        OC[Offset + 1], Reg, Off);
+  Offset += 2;
+  return false;
+}
+
+bool Decoder::opcode_alloc_l(const uint8_t *OC, unsigned &Offset,
+                             unsigned Length, bool Prologue) {
+  unsigned Off =
+      (OC[Offset + 1] << 16) | (OC[Offset + 2] << 8) | (OC[Offset + 3] << 0);
+  Off <<= 4;
+  SW.startLine() << format(
+      "0x%02x%02x%02x%02x          ; %s sp, #%u\n", OC[Offset], OC[Offset + 1],
+      OC[Offset + 2], OC[Offset + 3],
+      static_cast<const char *>(Prologue ? "sub" : "add"), Off);
+  Offset += 4;
+  return false;
+}
+
+bool Decoder::opcode_setfp(const uint8_t *OC, unsigned &Offset, unsigned Length,
+                           bool Prologue) {
+  SW.startLine() << format("0x%02x                ; mov fp, sp\n", OC[Offset]);
+  ++Offset;
+  return false;
+}
+
+bool Decoder::opcode_addfp(const uint8_t *OC, unsigned &Offset, unsigned Length,
+                           bool Prologue) {
+  unsigned NumBytes = OC[Offset + 1] << 3;
+  SW.startLine() << format("0x%02x%02x              ; add fp, sp, #%u\n",
+                           OC[Offset], OC[Offset + 1], NumBytes);
+  Offset += 2;
+  return false;
+}
+
+bool Decoder::opcode_nop(const uint8_t *OC, unsigned &Offset, unsigned Length,
+                         bool Prologue) {
+  SW.startLine() << format("0x%02x                ; nop\n", OC[Offset]);
+  ++Offset;
+  return false;
+}
+
+bool Decoder::opcode_end(const uint8_t *OC, unsigned &Offset, unsigned Length,
+                         bool Prologue) {
+  SW.startLine() << format("0x%02x                ; end\n", OC[Offset]);
+  ++Offset;
+  return true;
+}
+
+bool Decoder::opcode_end_c(const uint8_t *OC, unsigned &Offset, unsigned Length,
+                           bool Prologue) {
+  SW.startLine() << format("0x%02x                ; end_c\n", OC[Offset]);
+  ++Offset;
+  return true;
+}
+
 void Decoder::decodeOpcodes(ArrayRef<uint8_t> Opcodes, unsigned Offset,
                             bool Prologue) {
   assert((!Prologue || Offset == 0) && "prologue should always use offset 0");
-
+  const RingEntry* DecodeRing = isAArch64 ? Ring64 : Ring;
   bool Terminated = false;
   for (unsigned OI = Offset, OE = Opcodes.size(); !Terminated && OI < OE; ) {
     for (unsigned DI = 0;; ++DI) {
-      if ((Opcodes[OI] & Ring[DI].Mask) == Ring[DI].Value) {
-        Terminated = (this->*Ring[DI].Routine)(Opcodes.data(), OI, 0, Prologue);
+      if ((isAArch64 && (DI >= array_lengthof(Ring64))) ||
+          (!isAArch64 && (DI >= array_lengthof(Ring)))) {
+        SW.startLine() << format("0x%02x                ; Bad opcode!\n",
+                                 Opcodes.data()[Offset]);
+        ++OI;
+        break;
+      }
+
+      if ((Opcodes[OI] & DecodeRing[DI].Mask) == DecodeRing[DI].Value) {
+        if (OI + DecodeRing[DI].Length > OE) {
+          SW.startLine() << format("Opcode 0x%02x goes past the unwind data\n",
+                                    Opcodes[OI]);
+          OI += DecodeRing[DI].Length;
+          break;
+        }
+        Terminated =
+            (this->*DecodeRing[DI].Routine)(Opcodes.data(), OI, 0, Prologue);
         break;
       }
-      assert(DI < array_lengthof(Ring) && "unhandled opcode");
     }
   }
 }
@@ -520,22 +819,36 @@ bool Decoder::dumpXDataRecord(const COFFObjectFile &COFF,
   uint64_t Offset = VA - SectionVA;
   const ulittle32_t *Data =
     reinterpret_cast<const ulittle32_t *>(Contents.data() + Offset);
-  const ExceptionDataRecord XData(Data);
 
+  // Sanity check to ensure that the .xdata header is present.
+  // A header is one or two words, followed by at least one word to describe
+  // the unwind codes. Applicable to both ARM and AArch64.
+  if (Contents.size() - Offset < 8)
+    report_fatal_error(".xdata must be at least 8 bytes in size");
+
+  const ExceptionDataRecord XData(Data, isAArch64);
   DictScope XRS(SW, "ExceptionData");
-  SW.printNumber("FunctionLength", XData.FunctionLength() << 1);
+  SW.printNumber("FunctionLength",
+                 isAArch64 ? XData.FunctionLengthInBytesAArch64() :
+                 XData.FunctionLengthInBytesARM());
   SW.printNumber("Version", XData.Vers());
   SW.printBoolean("ExceptionData", XData.X());
   SW.printBoolean("EpiloguePacked", XData.E());
-  SW.printBoolean("Fragment", XData.F());
+  if (!isAArch64)
+    SW.printBoolean("Fragment", XData.F());
   SW.printNumber(XData.E() ? "EpilogueOffset" : "EpilogueScopes",
                  XData.EpilogueCount());
-  SW.printNumber("ByteCodeLength",
-                 static_cast<uint64_t>(XData.CodeWords() * sizeof(uint32_t)));
+  uint64_t ByteCodeLength = XData.CodeWords() * sizeof(uint32_t);
+  SW.printNumber("ByteCodeLength", ByteCodeLength);
+
+  if ((int64_t)(Contents.size() - Offset - 4 * HeaderWords(XData) -
+                (XData.E() ? 0 : XData.EpilogueCount() * 4) -
+                (XData.X() ? 8 : 0)) < (int64_t)ByteCodeLength)
+    report_fatal_error("Malformed unwind data");
 
   if (XData.E()) {
     ArrayRef<uint8_t> UC = XData.UnwindByteCode();
-    if (!XData.F()) {
+    if (isAArch64 || !XData.F()) {
       ListScope PS(SW, "Prologue");
       decodeOpcodes(UC, 0, /*Prologue=*/true);
     }
@@ -544,16 +857,25 @@ bool Decoder::dumpXDataRecord(const COFFObjectFile &COFF,
       decodeOpcodes(UC, XData.EpilogueCount(), /*Prologue=*/false);
     }
   } else {
+    {
+      ListScope PS(SW, "Prologue");
+      decodeOpcodes(XData.UnwindByteCode(), 0, /*Prologue=*/true);
+    }
     ArrayRef<ulittle32_t> EpilogueScopes = XData.EpilogueScopes();
     ListScope ESS(SW, "EpilogueScopes");
     for (const EpilogueScope ES : EpilogueScopes) {
       DictScope ESES(SW, "EpilogueScope");
       SW.printNumber("StartOffset", ES.EpilogueStartOffset());
-      SW.printNumber("Condition", ES.Condition());
-      SW.printNumber("EpilogueStartIndex", ES.EpilogueStartIndex());
+      if (!isAArch64)
+        SW.printNumber("Condition", ES.Condition());
+      SW.printNumber("EpilogueStartIndex",
+                     isAArch64 ? ES.EpilogueStartIndexAArch64()
+                               : ES.EpilogueStartIndexARM());
 
       ListScope Opcodes(SW, "Opcodes");
-      decodeOpcodes(XData.UnwindByteCode(), ES.EpilogueStartIndex(),
+      decodeOpcodes(XData.UnwindByteCode(),
+                    isAArch64 ? ES.EpilogueStartIndexAArch64()
+                              : ES.EpilogueStartIndexARM(),
                     /*Prologue=*/false);
     }
   }
@@ -725,8 +1047,9 @@ bool Decoder::dumpPackedEntry(const object::COFFObjectFile &COFF,
   }
 
   SW.printString("Function", formatSymbol(FunctionName, FunctionAddress));
-  SW.printBoolean("Fragment",
-                  RF.Flag() == RuntimeFunctionFlag::RFF_PackedFragment);
+  if (!isAArch64)
+    SW.printBoolean("Fragment",
+                    RF.Flag() == RuntimeFunctionFlag::RFF_PackedFragment);
   SW.printNumber("FunctionLength", RF.FunctionLength());
   SW.startLine() << "ReturnType: " << RF.Ret() << '\n';
   SW.printBoolean("HomedParameters", RF.H());
@@ -749,6 +1072,10 @@ bool Decoder::dumpProcedureDataEntry(const COFFObjectFile &COFF,
   DictScope RFS(SW, "RuntimeFunction");
   if (Entry.Flag() == RuntimeFunctionFlag::RFF_Unpacked)
     return dumpUnpackedEntry(COFF, Section, Offset, Index, Entry);
+  if (isAArch64) {
+    llvm::errs() << "Packed unwind data not yet supported for ARM64\n";
+    return false;
+  }
   return dumpPackedEntry(COFF, Section, Offset, Index, Entry);
 }
 
diff --git a/tools/llvm-readobj/ARMWinEHPrinter.h b/tools/llvm-readobj/ARMWinEHPrinter.h
index 95f52170226..e271a1e6fe7 100644
--- a/tools/llvm-readobj/ARMWinEHPrinter.h
+++ b/tools/llvm-readobj/ARMWinEHPrinter.h
@@ -24,13 +24,16 @@ class Decoder {
 
   ScopedPrinter &SW;
   raw_ostream &OS;
+  bool isAArch64;
 
   struct RingEntry {
     uint8_t Mask;
     uint8_t Value;
+    uint8_t Length;
     bool (Decoder::*Routine)(const uint8_t *, unsigned &, unsigned, bool);
   };
   static const RingEntry Ring[];
+  static const RingEntry Ring64[];
 
   bool opcode_0xxxxxxx(const uint8_t *Opcodes, unsigned &Offset,
                        unsigned Length, bool Prologue);
@@ -75,6 +78,50 @@ class Decoder {
   bool opcode_11111111(const uint8_t *Opcodes, unsigned &Offset,
                        unsigned Length, bool Prologue);
 
+  // ARM64 unwind codes start here.
+  bool opcode_alloc_s(const uint8_t *Opcodes, unsigned &Offset, unsigned Length,
+                      bool Prologue);
+  bool opcode_save_r19r20_x(const uint8_t *Opcodes, unsigned &Offset,
+                            unsigned Length, bool Prologue);
+  bool opcode_save_fplr(const uint8_t *Opcodes, unsigned &Offset,
+                        unsigned Length, bool Prologue);
+  bool opcode_save_fplr_x(const uint8_t *Opcodes, unsigned &Offset,
+                          unsigned Length, bool Prologue);
+  bool opcode_alloc_m(const uint8_t *Opcodes, unsigned &Offset, unsigned Length,
+                      bool Prologue);
+  bool opcode_save_regp(const uint8_t *Opcodes, unsigned &Offset,
+                        unsigned Length, bool Prologue);
+  bool opcode_save_regp_x(const uint8_t *Opcodes, unsigned &Offset,
+                          unsigned Length, bool Prologue);
+  bool opcode_save_reg(const uint8_t *Opcodes, unsigned &Offset,
+                       unsigned Length, bool Prologue);
+  bool opcode_save_reg_x(const uint8_t *Opcodes, unsigned &Offset,
+                         unsigned Length, bool Prologue);
+  bool opcode_save_lrpair(const uint8_t *Opcodes, unsigned &Offset,
+                          unsigned Length, bool Prologue);
+  bool opcode_save_fregp(const uint8_t *Opcodes, unsigned &Offset,
+                         unsigned Length, bool Prologue);
+  bool opcode_save_fregp_x(const uint8_t *Opcodes, unsigned &Offset,
+                           unsigned Length, bool Prologue);
+  bool opcode_save_freg(const uint8_t *Opcodes, unsigned &Offset,
+                        unsigned Length, bool Prologue);
+  bool opcode_save_freg_x(const uint8_t *Opcodes, unsigned &Offset,
+                          unsigned Length, bool Prologue);
+  bool opcode_alloc_l(const uint8_t *Opcodes, unsigned &Offset, unsigned Length,
+                      bool Prologue);
+  bool opcode_setfp(const uint8_t *Opcodes, unsigned &Offset, unsigned Length,
+                    bool Prologue);
+  bool opcode_addfp(const uint8_t *Opcodes, unsigned &Offset, unsigned Length,
+                    bool Prologue);
+  bool opcode_nop(const uint8_t *Opcodes, unsigned &Offset, unsigned Length,
+                  bool Prologue);
+  bool opcode_end(const uint8_t *Opcodes, unsigned &Offset, unsigned Length,
+                  bool Prologue);
+  bool opcode_end_c(const uint8_t *Opcodes, unsigned &Offset, unsigned Length,
+                    bool Prologue);
+  bool opcode_save_next(const uint8_t *Opcodes, unsigned &Offset,
+                        unsigned Length, bool Prologue);
+
   void decodeOpcodes(ArrayRef<uint8_t> Opcodes, unsigned Offset,
                      bool Prologue);
 
@@ -107,7 +154,9 @@ class Decoder {
                          const object::SectionRef Section);
 
 public:
-  Decoder(ScopedPrinter &SW) : SW(SW), OS(SW.getOStream()) {}
+  Decoder(ScopedPrinter &SW, bool isAArch64) : SW(SW),
+                                               OS(SW.getOStream()),
+                                               isAArch64(isAArch64) {}
   std::error_code dumpProcedureData(const object::COFFObjectFile &COFF);
 };
 }
diff --git a/tools/llvm-readobj/COFFDumper.cpp b/tools/llvm-readobj/COFFDumper.cpp
index fe31c36b602..26fe1aa622f 100644
--- a/tools/llvm-readobj/COFFDumper.cpp
+++ b/tools/llvm-readobj/COFFDumper.cpp
@@ -1549,8 +1549,10 @@ void COFFDumper::printUnwindInfo() {
     Dumper.printData(Ctx);
     break;
   }
+  case COFF::IMAGE_FILE_MACHINE_ARM64:
   case COFF::IMAGE_FILE_MACHINE_ARMNT: {
-    ARM::WinEH::Decoder Decoder(W);
+    ARM::WinEH::Decoder Decoder(W, Obj->getMachine() ==
+                                       COFF::IMAGE_FILE_MACHINE_ARM64);
     Decoder.dumpProcedureData(*Obj);
     break;
   }
-- 
GitLab


From bb45f1e99f86e1a52844fc8c98e81cf70ca81622 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Wed, 24 Oct 2018 06:13:36 +0000
Subject: [PATCH 0488/1116] [X86] Correct a bad isel predicate. Though I don't
 think it can be exposed.

This B/W VPTEST instructions are only available with AVX512BW. But lowering should prevent any byte or word elements from getting to isel so this can't be exposed.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345112 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86InstrAVX512.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index b2d0ce2bcd3..5550eb0061f 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -5901,7 +5901,7 @@ multiclass avx512_vptest_wb<bits<8> opc, string OpcodeStr,
                             v16i8x_info, NAME#"B">, EVEX_V128;
   }
 
-  let Predicates = [HasAVX512, NoVLX] in {
+  let Predicates = [HasBWI, NoVLX] in {
   defm BZ256_Alt : avx512_vptest_lowering<OpNode, v64i8_info, v32i8x_info, NAME#"B">;
   defm BZ128_Alt : avx512_vptest_lowering<OpNode, v64i8_info, v16i8x_info, NAME#"B">;
   defm WZ256_Alt : avx512_vptest_lowering<OpNode, v32i16_info, v16i16x_info, NAME#"W">;
-- 
GitLab


From edb9243e2d3cbb1326499004fa0e7f28dc3e32c9 Mon Sep 17 00:00:00 2001
From: Gil Rapaport <gil.rapaport@intel.com>
Date: Wed, 24 Oct 2018 07:08:38 +0000
Subject: [PATCH 0489/1116] [LSR] Combine unfolded offset into invariant
 register

LSR reassociates constants as unfolded offsets when the constants fit as
immediate add operands, which currently prevents such constants from being
combined later with loop invariant registers.
This patch modifies GenerateCombinations() to generate a second formula which
includes the unfolded offset in the combined loop-invariant register.

Differential Revision: https://reviews.llvm.org/D51861


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345114 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/LoopStrengthReduce.cpp  | 52 ++++++++++---
 .../AArch64/small-constant.ll                 | 75 +++++--------------
 2 files changed, 60 insertions(+), 67 deletions(-)

diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 857b83da96d..702202b1a5e 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -3638,32 +3638,60 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
 void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
                                        Formula Base) {
   // This method is only interesting on a plurality of registers.
-  if (Base.BaseRegs.size() + (Base.Scale == 1) <= 1)
+  if (Base.BaseRegs.size() + (Base.Scale == 1) +
+      (Base.UnfoldedOffset != 0) <= 1)
     return;
 
   // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before
   // processing the formula.
   Base.unscale();
-  Formula F = Base;
-  F.BaseRegs.clear();
   SmallVector<const SCEV *, 4> Ops;
+  Formula NewBase = Base;
+  NewBase.BaseRegs.clear();
+  Type *CombinedIntegerType = nullptr;
   for (const SCEV *BaseReg : Base.BaseRegs) {
     if (SE.properlyDominates(BaseReg, L->getHeader()) &&
-        !SE.hasComputableLoopEvolution(BaseReg, L))
+        !SE.hasComputableLoopEvolution(BaseReg, L)) {
+      if (!CombinedIntegerType)
+        CombinedIntegerType = SE.getEffectiveSCEVType(BaseReg->getType());
       Ops.push_back(BaseReg);
+    }
     else
-      F.BaseRegs.push_back(BaseReg);
+      NewBase.BaseRegs.push_back(BaseReg);
   }
-  if (Ops.size() > 1) {
-    const SCEV *Sum = SE.getAddExpr(Ops);
+
+  // If no register is relevant, we're done.
+  if (Ops.size() == 0)
+    return;
+
+  // Utility function for generating the required variants of the combined
+  // registers.
+  auto GenerateFormula = [&](const SCEV *Sum) {
+    Formula F = NewBase;
+
     // TODO: If Sum is zero, it probably means ScalarEvolution missed an
     // opportunity to fold something. For now, just ignore such cases
     // rather than proceed with zero in a register.
-    if (!Sum->isZero()) {
-      F.BaseRegs.push_back(Sum);
-      F.canonicalize(*L);
-      (void)InsertFormula(LU, LUIdx, F);
-    }
+    if (Sum->isZero())
+      return;
+
+    F.BaseRegs.push_back(Sum);
+    F.canonicalize(*L);
+    (void)InsertFormula(LU, LUIdx, F);
+  };
+
+  // If we collected at least two registers, generate a formula combining them.
+  if (Ops.size() > 1)
+    GenerateFormula(SE.getAddExpr(Ops));
+
+  // If we have an unfolded offset, generate a formula combining it with the
+  // registers collected.
+  if (NewBase.UnfoldedOffset) {
+    assert(CombinedIntegerType && "Missing a type for the unfolded offset");
+    Ops.push_back(SE.getConstant(CombinedIntegerType, NewBase.UnfoldedOffset,
+                                 true));
+    NewBase.UnfoldedOffset = 0;
+    GenerateFormula(SE.getAddExpr(Ops));
   }
 }
 
diff --git a/test/Transforms/LoopStrengthReduce/AArch64/small-constant.ll b/test/Transforms/LoopStrengthReduce/AArch64/small-constant.ll
index 585759dd178..04ad762df99 100644
--- a/test/Transforms/LoopStrengthReduce/AArch64/small-constant.ll
+++ b/test/Transforms/LoopStrengthReduce/AArch64/small-constant.ll
@@ -2,45 +2,10 @@
 
 ; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s
 
-; LSR doesn't consider bumping a pointer by constants outside the loop when the
-; constants fit as immediate add operands. The constants are re-associated as an
-; unfolded offset rather than a register and are not combined later with
-; loop-invariant registers. For large-enough constants LSR produces better
-; solutions for these test cases, with test1 switching from:
-;
-; The chosen solution requires 2 instructions 2 regs, with addrec cost 1, plus 1 scale cost, plus 4 imm cost, plus 1 setup cost:
-;   LSR Use: Kind=ICmpZero, Offsets={0}, widest fixup type: i64
-;     -7 + reg({(7 + %start)<nsw>,+,1}<nsw><%for.body>)
-;   LSR Use: Kind=Address of float in addrspace(0), Offsets={0}, widest fixup type: float*
-;     reg(%arr) + 4*reg({(7 + %start)<nsw>,+,1}<nsw><%for.body>)
-;
-; to:
-;
-; The chosen solution requires 1 instruction 2 regs, with addrec cost 1, plus 1 scale cost, plus 1 setup cost:
-;   LSR Use: Kind=ICmpZero, Offsets={0}, widest fixup type: i64
-;     reg({%start,+,1}<nsw><%for.body>)
-;   LSR Use: Kind=Address of float in addrspace(0), Offsets={0}, widest fixup type: float*
-;     reg((88888 + %arr)) + 4*reg({%start,+,1}<nsw><%for.body>)
-;
-; and test2 switching from:
-;
-; The chosen solution requires 2 instructions 2 regs, with addrec cost 1, plus 1 base add, plus 1 scale cost:
-;   LSR Use: Kind=ICmpZero, Offsets={0}, widest fixup type: i64
-;     reg({%start,+,1}<nsw><%for.body>)
-;   LSR Use: Kind=Basic, Offsets={0}, widest fixup type: i64
-;     reg({%start,+,1}<nsw><%for.body>)
-;   LSR Use: Kind=Address of float in addrspace(0), Offsets={0}, widest fixup type: float*
-;     reg(%arr) + 4*reg({%start,+,1}<nsw><%for.body>) + imm(28)
-;
-; to:
-;
-; The chosen solution requires 1 instruction 2 regs, with addrec cost 1, plus 1 scale cost, plus 1 setup cost:
-;   LSR Use: Kind=ICmpZero, Offsets={0}, widest fixup type: i64
-;     reg({%start,+,1}<nsw><%for.body>)
-;   LSR Use: Kind=Basic, Offsets={0}, widest fixup type: i64
-;     reg({%start,+,1}<nsw><%for.body>)
-;   LSR Use: Kind=Address of float in addrspace(0), Offsets={0}, widest fixup type: float*
-;     reg((88888 + %arr)) + 4*reg({%start,+,1}<nsw><%for.body>)
+; Test LSR for giving small constants, which get re-associated as unfolded
+; offset, a chance to get combined with loop-invariant registers (same as
+; large constants which do not fit as add immediate operands). LSR
+; favors here to bump the base pointer outside the loop.
 
 ; float test(float *arr, long long start, float threshold) {
 ;   for (long long i = start; i != 0; ++i) {
@@ -56,17 +21,16 @@ define float @test1(float* nocapture readonly %arr, i64 %start, float %threshold
 ; CHECK-NEXT:    fmov s2, #-7.00000000
 ; CHECK-NEXT:    cbz x1, .LBB0_5
 ; CHECK-NEXT:  // %bb.1: // %for.body.preheader
-; CHECK-NEXT:    add x8, x1, #7 // =7
+; CHECK-NEXT:    add x8, x0, #28 // =28
 ; CHECK-NEXT:  .LBB0_2: // %for.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr s1, [x0, x8, lsl #2]
+; CHECK-NEXT:    ldr s1, [x8, x1, lsl #2]
 ; CHECK-NEXT:    fcmp s1, s0
 ; CHECK-NEXT:    b.gt .LBB0_6
 ; CHECK-NEXT:  // %bb.3: // %for.cond
 ; CHECK-NEXT:    // in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT:    add x8, x8, #1 // =1
-; CHECK-NEXT:    cmp x8, #7 // =7
-; CHECK-NEXT:    b.ne .LBB0_2
+; CHECK-NEXT:    add x1, x1, #1 // =1
+; CHECK-NEXT:    cbnz x1, .LBB0_2
 ; CHECK-NEXT:  // %bb.4:
 ; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
@@ -104,26 +68,27 @@ define float @test2(float* nocapture readonly %arr, i64 %start, float %threshold
 ; CHECK-LABEL: test2:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fmov s2, #-7.00000000
-; CHECK-NEXT:    cbz x1, .LBB1_4
-; CHECK-NEXT:  .LBB1_1: // %for.body
+; CHECK-NEXT:    cbz x1, .LBB1_5
+; CHECK-NEXT:  // %bb.1: // %for.body.preheader
+; CHECK-NEXT:    add x8, x0, #28 // =28
+; CHECK-NEXT:  .LBB1_2: // %for.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add x8, x0, x1, lsl #2
-; CHECK-NEXT:    ldr s1, [x8, #28]
+; CHECK-NEXT:    ldr s1, [x8, x1, lsl #2]
 ; CHECK-NEXT:    scvtf s3, x1
 ; CHECK-NEXT:    fadd s3, s3, s0
 ; CHECK-NEXT:    fcmp s1, s3
-; CHECK-NEXT:    b.gt .LBB1_5
-; CHECK-NEXT:  // %bb.2: // %for.cond
-; CHECK-NEXT:    // in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT:    b.gt .LBB1_6
+; CHECK-NEXT:  // %bb.3: // %for.cond
+; CHECK-NEXT:    // in Loop: Header=BB1_2 Depth=1
 ; CHECK-NEXT:    add x1, x1, #1 // =1
-; CHECK-NEXT:    cbnz x1, .LBB1_1
-; CHECK-NEXT:  // %bb.3:
+; CHECK-NEXT:    cbnz x1, .LBB1_2
+; CHECK-NEXT:  // %bb.4:
 ; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB1_4:
+; CHECK-NEXT:  .LBB1_5:
 ; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB1_5: // %cleanup4
+; CHECK-NEXT:  .LBB1_6: // %cleanup4
 ; CHECK-NEXT:    mov v0.16b, v1.16b
 ; CHECK-NEXT:    ret
 entry:
-- 
GitLab


From 63aae622d8ef517be44dd9ec11904c8242fdcc38 Mon Sep 17 00:00:00 2001
From: Dorit Nuzman <dorit.nuzman@intel.com>
Date: Wed, 24 Oct 2018 07:11:38 +0000
Subject: [PATCH 0490/1116] [LV] Don't have fold-tail under optsize invalidate
 interleave-groups when masked-interleaving is enabled

Enable interleave-groups under fold-tail scenario for Opt for size compilation;
D50480 added support for vectorizing loops of arbitrary trip-count without a
remiander, which in turn makes everything in the loop conditional, including
interleave-groups if any. It therefore invalidated all interleave-groups
because we didn't have support for vectorizing predicated interleaved-groups
at the time. In the meantime, D53011 introduced this support, so we don't
have to invalidate interleave-groups when masked-interleaved support is enabled.

Reviewers: Ayal, hsaito, dcaballe, fhahn

Reviewed By: hsaito

Differential Revision: https://reviews.llvm.org/D53559


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345115 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp    |   8 +-
 .../x86-interleaved-accesses-masked-group.ll  | 387 ++++++++++++++++++
 2 files changed, 394 insertions(+), 1 deletion(-)

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index ab0e72960dd..f0a07eddc3b 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6017,8 +6017,14 @@ LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
     return NoVectorization;
 
   // Invalidate interleave groups if all blocks of loop will be predicated.
-  if (CM.blockNeedsPredication(OrigLoop->getHeader()))
+  if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
+      !useMaskedInterleavedAccesses(*TTI)) {
+    LLVM_DEBUG(
+        dbgs()
+        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
+           "which requires masked-interleaved support.\n");
     CM.InterleaveInfo.reset();
+  }
 
   if (UserVF) {
     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
diff --git a/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
index 61a2e2ca003..a2304e447f5 100644
--- a/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
+++ b/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
@@ -132,6 +132,106 @@ for.end:
   ret void
 }
 
+
+; Accesses with gaps under Optsize scenario again, with unknown trip-count
+; this time, in order to check the behavior of folding-the-tail (folding the
+; remainder loop into the main loop using masking) together with interleaved-
+; groups.
+; When masked-interleave-group is disabled the interleave-groups will be
+; invalidated during Legality checks;
+; When masked-interleave-group is enabled the interleave-groups will be
+; invalidated during cost-model checks, because we don't have a way to support
+; interleave-groups with gaps that require an epilogue using masking.
+; So in both cases we check for no epilogue and scalarized conditional accesses.
+
+; void masked_strided1_optsize_unknown_tc(const unsigned char* restrict p,
+;                      unsigned char* restrict q,
+;                      unsigned char guard,
+;                      int n) {
+;   for(ix=0; ix < n; ++ix) {
+;     if (ix > guard) {
+;         char t = p[2*ix];
+;         q[ix] = t;
+;     }
+;   }
+; }
+
+; DISABLED_MASKED_STRIDED-LABEL: @masked_strided1_optsize_unknown_tc(
+; DISABLED_MASKED_STRIDED:       vector.body:
+; DISABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 
+; DISABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], {{.*}}
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}}
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = and <8 x i1> [[TMP0]], [[TMP2]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; DISABLED_MASKED_STRIDED:       pred.load.if:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP5]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = load i8, i8* [[TMP6]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP8:%.*]] = insertelement <8 x i8> undef, i8 [[TMP7]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; DISABLED_MASKED_STRIDED-NOT:   for.body:
+; DISABLED_MASKED_STRIDED:       for.end:
+; DISABLED_MASKED_STRIDED-NEXT:    ret void
+
+
+; ENABLED_MASKED_STRIDED-LABEL: @masked_strided1_optsize_unknown_tc(
+; ENABLED_MASKED_STRIDED:       vector.body:
+; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi 
+; ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], {{.*}}
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}}
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = and <8 x i1> [[TMP0]], [[TMP2]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.load.if:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP5]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = load i8, i8* [[TMP6]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP8:%.*]] = insertelement <8 x i8> undef, i8 [[TMP7]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; ENABLED_MASKED_STRIDED-NOT:   for.body:
+; ENABLED_MASKED_STRIDED:       for.end:
+; ENABLED_MASKED_STRIDED-NEXT:    ret void
+
+define dso_local void @masked_strided1_optsize_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard, i32 %n) local_unnamed_addr optsize {
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+  %conv = zext i8 %guard to i32
+  br label %for.body
+
+for.body:
+  %ix.010 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
+  %cmp1 = icmp ugt i32 %ix.010, %conv
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %mul = shl nuw nsw i32 %ix.010, 1
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.010
+  store i8 %0, i8* %arrayidx3, align 1
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %ix.010, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+
 ; Same, but the load/store are not predicated. The interleave-group is
 ; invalidated here as well because we have gaps and we can't create an epilog.
 ; The access is thus scalarized.
@@ -185,6 +285,86 @@ for.end:
 }
 
 
+
+; Unconditioal accesses with gaps under Optsize scenario again, with unknown
+; trip-count this time, in order to check the behavior of folding-the-tail 
+; (folding the remainder loop into the main loop using masking) together with
+; interleaved-groups.
+; The interleave-groups will be invalidated during cost-model checks, because
+; we don't have a way to support interleave-groups with gaps that require an
+; epilogue using masking (even when interleaved-masking is enabled; this
+; is not yet supported).
+; So we check for no epilogue and for scalarized conditional accesses.
+
+;   for(ix=0; ix < n; ++ix) {
+;         char t = p[2*ix];
+;         q[ix] = t;
+;   }
+
+; DISABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize_unknown_tc(
+; DISABLED_MASKED_STRIDED:       vector.body:
+; DISABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 
+; DISABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}} 
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; DISABLED_MASKED_STRIDED:       pred.load.if:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = extractelement <8 x i32> [[TMP0]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP3]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = load i8, i8* [[TMP4]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = insertelement <8 x i8> undef, i8 [[TMP5]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; DISBLED_MASKED_STRIDED-NOT:    for.body:
+; DISABLED_MASKED_STRIDED:       for.end:
+; DISABLED_MASKED_STRIDED-NEXT:    ret void
+
+
+; ENABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize_unknown_tc(
+; ENABLED_MASKED_STRIDED:       vector.body:
+; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 
+; ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}}
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.load.if:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = extractelement <8 x i32> [[TMP0]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP3]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = load i8, i8* [[TMP4]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = insertelement <8 x i8> undef, i8 [[TMP5]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; ENABLED_MASKED_STRIDED-NOT:   for.body:
+; ENABLED_MASKED_STRIDED:       for.end:
+; ENABLED_MASKED_STRIDED-NEXT:    ret void
+
+define dso_local void @unconditional_strided1_optsize_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %n) local_unnamed_addr optsize {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+  br label %for.body
+
+for.body:
+  %ix.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %mul = shl nuw nsw i32 %ix.07, 1
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, i8* %q, i32 %ix.07
+  store i8 %0, i8* %arrayidx1, align 1
+  %inc = add nuw nsw i32 %ix.07, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+
 ; Check also a scenario with full interleave-groups (no gaps) as well as both
 ; load and store groups. We check that when masked-interleave-group is disabled
 ; the predicated loads (and stores) are not vectorized as an
@@ -272,3 +452,210 @@ for.inc:
 for.end:
   ret void
 }
+
+; Full groups again, this time checking an Optsize scenario, with unknown trip-
+; count, to check the behavior of folding-the-tail (folding the remainder loop
+; into the main loop using masking) together with interleaved-groups.
+; When masked-interleave-group is disabled the interleave-groups will be
+; invalidated during Legality check, so nothing to check here.
+; When masked-interleave-group is enabled we check that there is no epilogue,
+; and that the interleave-groups are vectorized using proper masking (with
+; shuffling of the mask feeding the wide masked load/store).
+; The mask itself is an And of two masks: one that masks away the remainder
+; iterations, and one that masks away the 'else' of the 'if' statement.
+;
+; void masked_strided2_unknown_tc(const unsigned char* restrict p,
+;                     unsigned char* restrict q,
+;                     unsigned char guard,
+;                     int n) {
+; for(ix=0; ix < n; ++ix) {
+;     if (ix > guard) {
+;         char left = p[2*ix];
+;         char right = p[2*ix + 1];
+;         char max = max(left, right);
+;         q[2*ix] = max;
+;         q[2*ix+1] = 0 - max;
+;     }
+; }
+;}
+
+; ENABLED_MASKED_STRIDED-LABEL: @masked_strided2_unknown_tc(
+; ENABLED_MASKED_STRIDED:       vector.body:
+; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 
+; ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = icmp sgt <8 x i32> [[VEC_IND]], {{.*}} 
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}} 
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = and <8 x i1> [[TMP0]], [[TMP3]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+; ENABLED_MASKED_STRIDED-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP5]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> undef)
+; ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = or i32 [[TMP1]], 1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = icmp slt <8 x i8> [[STRIDED_VEC]], [[STRIDED_VEC3]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i8> [[STRIDED_VEC3]], <8 x i8> [[STRIDED_VEC]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP9:%.*]] = sub <8 x i8> zeroinitializer, [[TMP8]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 -1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[TMP10]], i32 [[TMP6]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP11]] to <16 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP8]], <8 x i8> [[TMP9]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; ENABLED_MASKED_STRIDED-NEXT:    call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[INTERLEAVED_VEC]], <16 x i8>* [[TMP12]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
+; ENABLED_MASKED_STRIDED-NEXT:    {{.*}} = add i32 [[INDEX]], 8
+; ENABLED_MASKED_STRIDED-NEXT:    {{.*}} = add <8 x i32> {{.*}}, <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP13:%.*]] = icmp eq i32 {{.*}}, {{.*}} 
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP13]], 
+; ENABLED_MASKED_STRIDED:       for.end:
+; ENABLED_MASKED_STRIDED-NEXT:    ret void
+
+define dso_local void @masked_strided2_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %guard, i32 %n) local_unnamed_addr optsize {
+entry:
+  %cmp22 = icmp sgt i32 %n, 0
+  br i1 %cmp22, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+  br label %for.body
+
+for.body:
+  %ix.023 = phi i32 [ %inc, %for.inc ], [ 0, %for.body.preheader ]
+  %cmp1 = icmp sgt i32 %ix.023, %guard
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %mul = shl nuw nsw i32 %ix.023, 1
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %add = or i32 %mul, 1
+  %arrayidx3 = getelementptr inbounds i8, i8* %p, i32 %add
+  %1 = load i8, i8* %arrayidx3, align 1
+  %cmp.i = icmp slt i8 %0, %1
+  %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0
+  %arrayidx5 = getelementptr inbounds i8, i8* %q, i32 %mul
+  store i8 %spec.select.i, i8* %arrayidx5, align 1
+  %sub = sub i8 0, %spec.select.i
+  %arrayidx9 = getelementptr inbounds i8, i8* %q, i32 %add
+  store i8 %sub, i8* %arrayidx9, align 1
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %ix.023, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+; Full groups under Optsize scenario again, with unknown trip-count, again in
+; order to check the behavior of folding-the-tail (folding the remainder loop
+; into the main loop using masking) together with interleaved-groups.
+; This time the accesses are not conditional, they become conditional only
+; due to tail folding.
+; When masked-interleave-group is disabled the interleave-groups will be
+; invalidated during cost-model checks, so we check for no epilogue and
+; scalarized conditional accesses.
+; When masked-interleave-group is enabled we check for no epilogue,
+; and interleave-groups vectorized using proper masking (with
+; shuffling of the mask feeding the wide masked load/store).
+; (Same vectorization scheme as for the previous loop with conditional accesses
+; except here the mask only masks away the remainder iterations.)
+;
+; void unconditional_masked_strided2_unknown_tc(const unsigned char* restrict p,
+;                     unsigned char* restrict q,
+;                     int n) {
+; for(ix=0; ix < n; ++ix) {
+;         char left = p[2*ix];
+;         char right = p[2*ix + 1];
+;         char max = max(left, right);
+;         q[2*ix] = max;
+;         q[2*ix+1] = 0 - max;
+; }
+;}
+
+; DISABLED_MASKED_STRIDED-LABEL: @unconditional_masked_strided2_unknown_tc(
+; DISABLED_MASKED_STRIDED:       vector.body:
+; DISABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 
+; DISABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}}
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; DISABLED_MASKED_STRIDED:       pred.load.if:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = extractelement <8 x i32> [[TMP0]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP3]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = load i8, i8* [[TMP4]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = insertelement <8 x i8> undef, i8 [[TMP5]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; DISABLED_MASKED_STRIDED-NOT:   for.body:
+; DISABLED_MASKED_STRIDED:       for.end:
+; DISABLED_MASKED_STRIDED-NEXT:    ret void
+
+
+
+; ENABLED_MASKED_STRIDED-LABEL: @unconditional_masked_strided2_unknown_tc(
+; ENABLED_MASKED_STRIDED:       vector.body:
+; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 
+; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[INDEX]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
+; ENABLED_MASKED_STRIDED-NEXT:    [[INDUCTION:%.*]] = add <8 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = shl nuw nsw i32 [[INDEX]], 1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP0]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = icmp ule <8 x i32> {{.*}}, {{.*}}
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP1]] to <16 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+; ENABLED_MASKED_STRIDED-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> undef)
+; ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = or i32 [[TMP0]], 1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = icmp slt <8 x i8> [[STRIDED_VEC]], [[STRIDED_VEC3]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i8> [[STRIDED_VEC3]], <8 x i8> [[STRIDED_VEC]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = sub <8 x i8> zeroinitializer, [[TMP6]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 -1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[TMP8]], i32 [[TMP4]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to <16 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; ENABLED_MASKED_STRIDED-NEXT:    call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[INTERLEAVED_VEC]], <16 x i8>* [[TMP10]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
+; ENABLED_MASKED_STRIDED-NEXT:    {{.*}} = add i32 [[INDEX]], 8
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP11:%.*]] = icmp eq i32 {{.*}}, {{.*}}
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP11]]
+; ENABLED_MASKED_STRIDED:       for.end:
+; ENABLED_MASKED_STRIDED-NEXT:    ret void
+
+define dso_local void @unconditional_masked_strided2_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %n) local_unnamed_addr optsize {
+entry:
+  %cmp20 = icmp sgt i32 %n, 0
+  br i1 %cmp20, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+  br label %for.body
+
+for.body:
+  %ix.021 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %mul = shl nuw nsw i32 %ix.021, 1
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %add = or i32 %mul, 1
+  %arrayidx2 = getelementptr inbounds i8, i8* %p, i32 %add
+  %1 = load i8, i8* %arrayidx2, align 1
+  %cmp.i = icmp slt i8 %0, %1
+  %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0
+  %arrayidx4 = getelementptr inbounds i8, i8* %q, i32 %mul
+  store i8 %spec.select.i, i8* %arrayidx4, align 1
+  %sub = sub i8 0, %spec.select.i
+  %arrayidx8 = getelementptr inbounds i8, i8* %q, i32 %add
+  store i8 %sub, i8* %arrayidx8, align 1
+  %inc = add nuw nsw i32 %ix.021, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
-- 
GitLab


From 8b5bda22ebfff505eca10488ff1e48348ed8b0d4 Mon Sep 17 00:00:00 2001
From: Eugene Leviant <eleviant@accesssoftek.com>
Date: Wed, 24 Oct 2018 07:48:32 +0000
Subject: [PATCH 0491/1116] [ThinLTO] Fix dot dumper for regular LTO modules

Regular LTO module identifier is (unsigned)-1. This patch emits correct
module identifier while printing edges with source summary in regular
LTO module.

Differential revision: https://reviews.llvm.org/D53583


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345118 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/IR/ModuleSummaryIndex.cpp           |  2 +-
 test/ThinLTO/X86/dot-dumper-full-lto.ll | 28 +++++++++++++++++++++++++
 2 files changed, 29 insertions(+), 1 deletion(-)
 create mode 100644 test/ThinLTO/X86/dot-dumper-full-lto.ll

diff --git a/lib/IR/ModuleSummaryIndex.cpp b/lib/IR/ModuleSummaryIndex.cpp
index 4c4466f9a90..02661915b2c 100644
--- a/lib/IR/ModuleSummaryIndex.cpp
+++ b/lib/IR/ModuleSummaryIndex.cpp
@@ -241,7 +241,7 @@ void ModuleSummaryIndex::exportToDot(raw_ostream& OS) const {
                                        "_" + std::to_string(Id);
   };
 
-  auto DrawEdge = [&](const char *Pfx, int SrcMod, GlobalValue::GUID SrcId,
+  auto DrawEdge = [&](const char *Pfx, uint64_t SrcMod, GlobalValue::GUID SrcId,
                       int DstMod, GlobalValue::GUID DstId, int TypeOrHotness) {
     // 0 corresponds to alias edge, 1 to ref edge, 2 to call with unknown
     // hotness, ...
diff --git a/test/ThinLTO/X86/dot-dumper-full-lto.ll b/test/ThinLTO/X86/dot-dumper-full-lto.ll
new file mode 100644
index 00000000000..6d4d1efa9a3
--- /dev/null
+++ b/test/ThinLTO/X86/dot-dumper-full-lto.ll
@@ -0,0 +1,28 @@
+; RUN: opt -module-summary %s -o %t1.bc
+; RUN: opt -module-summary %p/Inputs/dot-dumper.ll -o %t2.bc
+; RUN: llvm-lto2 run -save-temps %t1.bc %t2.bc -o %t3 \
+; RUN:  -r=%t1.bc,main,px \
+; RUN:  -r=%t1.bc,A, \
+; RUN:  -r=%t2.bc,foo,p \
+; RUN:  -r=%t2.bc,bar,p \
+; RUN:  -r=%t2.bc,A,p \
+; RUN:  -r=%t2.bc,B,p
+; RUN: cat %t3.index.dot | FileCheck %s
+
+; CHECK: subgraph cluster_4294967295
+; CHECK:   M4294967295_[[ID:[0-9]+]]{{.*}}main
+; CHECK: // Cross-module edges:
+; CHECK:  M4294967295_[[ID]] -> M0_{{[0-9]+}}{{.*}}// ref
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = external global i32
+
+define i32 @main() {
+  %v = load i32, i32* @A
+  ret i32 %v
+}
+
+!0 = !{i32 1, !"ThinLTO", i32 0}
+!llvm.module.flags = !{ !0 }
-- 
GitLab


From 7d5e5c27471568f7928bc81af4ff1c64469ec2e8 Mon Sep 17 00:00:00 2001
From: Tim Renouf <tpr.llvm@botech.co.uk>
Date: Wed, 24 Oct 2018 08:14:07 +0000
Subject: [PATCH 0492/1116] [AMDGPU] Defined gfx909 Raven Ridge 2

Differential Revision: https://reviews.llvm.org/D53418

Change-Id: Ie3d054f2e956c2768988c0f4c0ffd29a47294eef

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345120 91177308-0d34-0410-b5e6-96231b3b80d8
---
 docs/AMDGPUUsage.rst                             |  6 ++++++
 include/llvm/BinaryFormat/ELF.h                  |  3 ++-
 include/llvm/Support/TargetParser.h              |  3 ++-
 lib/ObjectYAML/ELFYAML.cpp                       |  1 +
 lib/Support/TargetParser.cpp                     |  4 +++-
 lib/Target/AMDGPU/AMDGPU.td                      |  7 +++++++
 lib/Target/AMDGPU/AMDGPUSubtarget.h              |  1 +
 lib/Target/AMDGPU/GCNProcessors.td               |  5 +++++
 .../AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp |  2 ++
 test/CodeGen/AMDGPU/elf-header-flags-mach.ll     |  2 ++
 test/CodeGen/AMDGPU/hsa-note-no-func.ll          |  2 ++
 test/Object/AMDGPU/elf-header-flags-mach.yaml    | 16 ++++++++++++++++
 tools/llvm-readobj/ELFDumper.cpp                 |  1 +
 13 files changed, 50 insertions(+), 3 deletions(-)

diff --git a/docs/AMDGPUUsage.rst b/docs/AMDGPUUsage.rst
index 1ddda1bae9e..2692078d28b 100644
--- a/docs/AMDGPUUsage.rst
+++ b/docs/AMDGPUUsage.rst
@@ -210,6 +210,11 @@ names from both the *Processor* and *Alternative Processor* can be used.
                                                                       .. TODO
                                                                          Add product
                                                                          names.
+     ``gfx909``                  ``amdgcn``   APU   - xnack           *TBA* (Raven Ridge 2)
+                                                      [on]
+                                                                      .. TODO
+                                                                         Add product
+                                                                         names.
      =========== =============== ============ ===== ========= ======= ==================
 
 .. _amdgpu-target-features:
@@ -589,6 +594,7 @@ The AMDGPU backend uses the following ELF header:
      ``EF_AMDGPU_MACH_AMDGCN_GFX904``  0x02e      ``gfx904``
      ``EF_AMDGPU_MACH_AMDGCN_GFX906``  0x02f      ``gfx906``
      *reserved*                        0x030      Reserved.
+     ``EF_AMDGPU_MACH_AMDGCN_GFX909``  0x031      ``gfx909``
      ================================= ========== =============================
 
 Sections
diff --git a/include/llvm/BinaryFormat/ELF.h b/include/llvm/BinaryFormat/ELF.h
index 2e778779117..26f65be9f1d 100644
--- a/include/llvm/BinaryFormat/ELF.h
+++ b/include/llvm/BinaryFormat/ELF.h
@@ -701,6 +701,7 @@ enum : unsigned {
   EF_AMDGPU_MACH_AMDGCN_GFX902 = 0x02d,
   EF_AMDGPU_MACH_AMDGCN_GFX904 = 0x02e,
   EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
+  EF_AMDGPU_MACH_AMDGCN_GFX909 = 0x031,
 
   // Reserved for AMDGCN-based processors.
   EF_AMDGPU_MACH_AMDGCN_RESERVED0 = 0x027,
@@ -708,7 +709,7 @@ enum : unsigned {
 
   // First/last AMDGCN-based processors.
   EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600,
-  EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX906,
+  EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX909,
 
   // Indicates if the xnack target feature is enabled for all code contained in
   // the object.
diff --git a/include/llvm/Support/TargetParser.h b/include/llvm/Support/TargetParser.h
index 01397e8ebb7..63241b52e1f 100644
--- a/include/llvm/Support/TargetParser.h
+++ b/include/llvm/Support/TargetParser.h
@@ -317,9 +317,10 @@ enum GPUKind : uint32_t {
   GK_GFX902 = 61,
   GK_GFX904 = 62,
   GK_GFX906 = 63,
+  GK_GFX909 = 65,
 
   GK_AMDGCN_FIRST = GK_GFX600,
-  GK_AMDGCN_LAST = GK_GFX906,
+  GK_AMDGCN_LAST = GK_GFX909,
 };
 
 /// Instruction set architecture version.
diff --git a/lib/ObjectYAML/ELFYAML.cpp b/lib/ObjectYAML/ELFYAML.cpp
index a381a63d600..2c69f115bca 100644
--- a/lib/ObjectYAML/ELFYAML.cpp
+++ b/lib/ObjectYAML/ELFYAML.cpp
@@ -402,6 +402,7 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO,
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX902, EF_AMDGPU_MACH);
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX904, EF_AMDGPU_MACH);
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX906, EF_AMDGPU_MACH);
+    BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX909, EF_AMDGPU_MACH);
     BCase(EF_AMDGPU_XNACK);
     break;
   case ELF::EM_X86_64:
diff --git a/lib/Support/TargetParser.cpp b/lib/Support/TargetParser.cpp
index f2fdc23ad85..968b559c08d 100644
--- a/lib/Support/TargetParser.cpp
+++ b/lib/Support/TargetParser.cpp
@@ -995,7 +995,7 @@ constexpr GPUInfo R600GPUs[26] = {
 
 // This table should be sorted by the value of GPUKind
 // Don't bother listing the implicitly true features
-constexpr GPUInfo AMDGCNGPUs[32] = {
+constexpr GPUInfo AMDGCNGPUs[33] = {
   // Name         Canonical    Kind        Features
   //              Name
   {{"gfx600"},    {"gfx600"},  GK_GFX600,  FEATURE_FAST_FMA_F32},
@@ -1030,6 +1030,7 @@ constexpr GPUInfo AMDGCNGPUs[32] = {
   {{"gfx902"},    {"gfx902"},  GK_GFX902,  FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32},
   {{"gfx904"},    {"gfx904"},  GK_GFX904,  FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32},
   {{"gfx906"},    {"gfx906"},  GK_GFX906,  FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32},
+  {{"gfx909"},    {"gfx909"},  GK_GFX909,  FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32},
 };
 
 const GPUInfo *getArchEntry(AMDGPU::GPUKind AK, ArrayRef<GPUInfo> Table) {
@@ -1124,6 +1125,7 @@ AMDGPU::IsaVersion AMDGPU::getIsaVersion(StringRef GPU) {
   case GK_GFX902: return {9, 0, 2};
   case GK_GFX904: return {9, 0, 4};
   case GK_GFX906: return {9, 0, 6};
+  case GK_GFX909: return {9, 0, 9};
   default:        return {0, 0, 0};
   }
 }
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index dd9c16a9435..54b6c8a7882 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -553,6 +553,13 @@ def FeatureISAVersion9_0_6 : SubtargetFeatureISAVersion <9,0,6,
    FeatureLDSBankCount32,
    FeatureDLInsts]>;
 
+def FeatureISAVersion9_0_9 : SubtargetFeatureISAVersion <9,0,9,
+  [FeatureGFX9,
+   FeatureMadMixInsts,
+   FeatureLDSBankCount32,
+   FeatureXNACK,
+   FeatureD16PreservesUnusedBits]>;
+
 //===----------------------------------------------------------------------===//
 // Debugger related subtarget features.
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index fb39dc4493c..ca055f6c957 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -263,6 +263,7 @@ public:
     ISAVersion9_0_2,
     ISAVersion9_0_4,
     ISAVersion9_0_6,
+    ISAVersion9_0_9,
   };
 
   enum TrapHandlerAbi {
diff --git a/lib/Target/AMDGPU/GCNProcessors.td b/lib/Target/AMDGPU/GCNProcessors.td
index d76acfa24f9..b8142a4e4ff 100644
--- a/lib/Target/AMDGPU/GCNProcessors.td
+++ b/lib/Target/AMDGPU/GCNProcessors.td
@@ -156,3 +156,8 @@ def : ProcessorModel<"gfx904", SIQuarterSpeedModel,
 def : ProcessorModel<"gfx906", SIQuarterSpeedModel,
   [FeatureISAVersion9_0_6]
 >;
+
+def : ProcessorModel<"gfx909", SIQuarterSpeedModel,
+  [FeatureISAVersion9_0_9]
+>;
+
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 3f6ab244c34..a7b8c11288f 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -83,6 +83,7 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) {
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX902: AK = GK_GFX902;  break;
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX904: AK = GK_GFX904;  break;
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX906: AK = GK_GFX906;  break;
+  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909: AK = GK_GFX909;  break;
   case ELF::EF_AMDGPU_MACH_NONE:          AK = GK_NONE;    break;
   }
 
@@ -129,6 +130,7 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) {
   case GK_GFX902:  return ELF::EF_AMDGPU_MACH_AMDGCN_GFX902;
   case GK_GFX904:  return ELF::EF_AMDGPU_MACH_AMDGCN_GFX904;
   case GK_GFX906:  return ELF::EF_AMDGPU_MACH_AMDGCN_GFX906;
+  case GK_GFX909:  return ELF::EF_AMDGPU_MACH_AMDGCN_GFX909;
   case GK_NONE:    return ELF::EF_AMDGPU_MACH_NONE;
   }
 
diff --git a/test/CodeGen/AMDGPU/elf-header-flags-mach.ll b/test/CodeGen/AMDGPU/elf-header-flags-mach.ll
index 9d2d3690995..5887951b4e6 100644
--- a/test/CodeGen/AMDGPU/elf-header-flags-mach.ll
+++ b/test/CodeGen/AMDGPU/elf-header-flags-mach.ll
@@ -46,6 +46,7 @@
 ; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx902 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX902 %s
 ; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx904 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX904 %s
 ; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX906 %s
+; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx909 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX909 %s
 
 ; ARCH-R600: Arch: r600
 ; ARCH-GCN:  Arch: amdgcn
@@ -85,6 +86,7 @@
 ; GFX902-NEXT:   EF_AMDGPU_XNACK              (0x100)
 ; GFX904:        EF_AMDGPU_MACH_AMDGCN_GFX904 (0x2E)
 ; GFX906:        EF_AMDGPU_MACH_AMDGCN_GFX906 (0x2F)
+; GFX909:        EF_AMDGPU_MACH_AMDGCN_GFX909 (0x31)
 ; ALL:         ]
 
 define amdgpu_kernel void @elf_header() {
diff --git a/test/CodeGen/AMDGPU/hsa-note-no-func.ll b/test/CodeGen/AMDGPU/hsa-note-no-func.ll
index dd38d1d2366..e937aaca66f 100644
--- a/test/CodeGen/AMDGPU/hsa-note-no-func.ll
+++ b/test/CodeGen/AMDGPU/hsa-note-no-func.ll
@@ -23,6 +23,7 @@
 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx902 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX902 %s
 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx904 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX904 %s
 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx906 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX906 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx909 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX909 %s
 
 ; HSA: .hsa_code_object_version 2,1
 ; HSA-SI600: .hsa_code_object_isa 6,0,0,"AMD","AMDGPU"
@@ -40,3 +41,4 @@
 ; HSA-GFX902: .hsa_code_object_isa 9,0,2,"AMD","AMDGPU"
 ; HSA-GFX904: .hsa_code_object_isa 9,0,4,"AMD","AMDGPU"
 ; HSA-GFX906: .hsa_code_object_isa 9,0,6,"AMD","AMDGPU"
+; HSA-GFX909: .hsa_code_object_isa 9,0,9,"AMD","AMDGPU"
diff --git a/test/Object/AMDGPU/elf-header-flags-mach.yaml b/test/Object/AMDGPU/elf-header-flags-mach.yaml
index c3800d2ff27..7a594843c20 100644
--- a/test/Object/AMDGPU/elf-header-flags-mach.yaml
+++ b/test/Object/AMDGPU/elf-header-flags-mach.yaml
@@ -91,6 +91,9 @@
 # RUN: yaml2obj -docnum=31 %s > %t.o.31
 # RUN: llvm-readobj -s -file-headers %t.o.31 | FileCheck --check-prefixes=ELF-ALL,ELF-GFX906 %s
 # RUN: obj2yaml %t.o.31 | FileCheck --check-prefixes=YAML-GFX906 %s
+# RUN: yaml2obj -docnum=32 %s > %t.o.32
+# RUN: llvm-readobj -s -file-headers %t.o.32 | FileCheck --check-prefixes=ELF-ALL,ELF-GFX909 %s
+# RUN: obj2yaml %t.o.32 | FileCheck --check-prefixes=YAML-GFX909 %s
 
 
 # ELF-ALL:     Flags [
@@ -125,6 +128,7 @@
 # ELF-GFX902:    EF_AMDGPU_MACH_AMDGCN_GFX902 (0x2D)
 # ELF-GFX904:    EF_AMDGPU_MACH_AMDGCN_GFX904 (0x2E)
 # ELF-GFX906:    EF_AMDGPU_MACH_AMDGCN_GFX906 (0x2F)
+# ELF-GFX909:    EF_AMDGPU_MACH_AMDGCN_GFX909 (0x31)
 # ELF-ALL:     ]
 
 # YAML-R600:    Flags: [ EF_AMDGPU_MACH_R600_R600 ]
@@ -158,6 +162,7 @@
 # YAML-GFX902:  Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX902 ]
 # YAML-GFX904:  Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX904 ]
 # YAML-GFX906:  Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX906 ]
+# YAML-GFX909:  Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX909 ]
 
 # Doc1
 --- !ELF
@@ -499,3 +504,14 @@ FileHeader:
   Machine: EM_AMDGPU
   Flags:   [ EF_AMDGPU_MACH_AMDGCN_GFX906 ]
 ...
+
+# Doc32
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  OSABI:   ELFOSABI_NONE
+  Type:    ET_REL
+  Machine: EM_AMDGPU
+  Flags:   [ EF_AMDGPU_MACH_AMDGCN_GFX909 ]
+...
diff --git a/tools/llvm-readobj/ELFDumper.cpp b/tools/llvm-readobj/ELFDumper.cpp
index 5e7eae1b272..bace24fefb9 100644
--- a/tools/llvm-readobj/ELFDumper.cpp
+++ b/tools/llvm-readobj/ELFDumper.cpp
@@ -1325,6 +1325,7 @@ static const EnumEntry<unsigned> ElfHeaderAMDGPUFlags[] = {
   LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX902),
   LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX904),
   LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX906),
+  LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX909),
   LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_XNACK)
 };
 
-- 
GitLab


From 37f7dbe18580e0e8017ec2630189336c426899e4 Mon Sep 17 00:00:00 2001
From: Gil Rapaport <gil.rapaport@intel.com>
Date: Wed, 24 Oct 2018 08:41:22 +0000
Subject: [PATCH 0493/1116] Revert r345114

Investigating fails.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345123 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/LoopStrengthReduce.cpp  | 52 +++----------
 .../AArch64/small-constant.ll                 | 75 ++++++++++++++-----
 2 files changed, 67 insertions(+), 60 deletions(-)

diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 702202b1a5e..857b83da96d 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -3638,60 +3638,32 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
 void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
                                        Formula Base) {
   // This method is only interesting on a plurality of registers.
-  if (Base.BaseRegs.size() + (Base.Scale == 1) +
-      (Base.UnfoldedOffset != 0) <= 1)
+  if (Base.BaseRegs.size() + (Base.Scale == 1) <= 1)
     return;
 
   // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before
   // processing the formula.
   Base.unscale();
+  Formula F = Base;
+  F.BaseRegs.clear();
   SmallVector<const SCEV *, 4> Ops;
-  Formula NewBase = Base;
-  NewBase.BaseRegs.clear();
-  Type *CombinedIntegerType = nullptr;
   for (const SCEV *BaseReg : Base.BaseRegs) {
     if (SE.properlyDominates(BaseReg, L->getHeader()) &&
-        !SE.hasComputableLoopEvolution(BaseReg, L)) {
-      if (!CombinedIntegerType)
-        CombinedIntegerType = SE.getEffectiveSCEVType(BaseReg->getType());
+        !SE.hasComputableLoopEvolution(BaseReg, L))
       Ops.push_back(BaseReg);
-    }
     else
-      NewBase.BaseRegs.push_back(BaseReg);
+      F.BaseRegs.push_back(BaseReg);
   }
-
-  // If no register is relevant, we're done.
-  if (Ops.size() == 0)
-    return;
-
-  // Utility function for generating the required variants of the combined
-  // registers.
-  auto GenerateFormula = [&](const SCEV *Sum) {
-    Formula F = NewBase;
-
+  if (Ops.size() > 1) {
+    const SCEV *Sum = SE.getAddExpr(Ops);
     // TODO: If Sum is zero, it probably means ScalarEvolution missed an
     // opportunity to fold something. For now, just ignore such cases
     // rather than proceed with zero in a register.
-    if (Sum->isZero())
-      return;
-
-    F.BaseRegs.push_back(Sum);
-    F.canonicalize(*L);
-    (void)InsertFormula(LU, LUIdx, F);
-  };
-
-  // If we collected at least two registers, generate a formula combining them.
-  if (Ops.size() > 1)
-    GenerateFormula(SE.getAddExpr(Ops));
-
-  // If we have an unfolded offset, generate a formula combining it with the
-  // registers collected.
-  if (NewBase.UnfoldedOffset) {
-    assert(CombinedIntegerType && "Missing a type for the unfolded offset");
-    Ops.push_back(SE.getConstant(CombinedIntegerType, NewBase.UnfoldedOffset,
-                                 true));
-    NewBase.UnfoldedOffset = 0;
-    GenerateFormula(SE.getAddExpr(Ops));
+    if (!Sum->isZero()) {
+      F.BaseRegs.push_back(Sum);
+      F.canonicalize(*L);
+      (void)InsertFormula(LU, LUIdx, F);
+    }
   }
 }
 
diff --git a/test/Transforms/LoopStrengthReduce/AArch64/small-constant.ll b/test/Transforms/LoopStrengthReduce/AArch64/small-constant.ll
index 04ad762df99..585759dd178 100644
--- a/test/Transforms/LoopStrengthReduce/AArch64/small-constant.ll
+++ b/test/Transforms/LoopStrengthReduce/AArch64/small-constant.ll
@@ -2,10 +2,45 @@
 
 ; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s
 
-; Test LSR for giving small constants, which get re-associated as unfolded
-; offset, a chance to get combined with loop-invariant registers (same as
-; large constants which do not fit as add immediate operands). LSR
-; favors here to bump the base pointer outside the loop.
+; LSR doesn't consider bumping a pointer by constants outside the loop when the
+; constants fit as immediate add operands. The constants are re-associated as an
+; unfolded offset rather than a register and are not combined later with
+; loop-invariant registers. For large-enough constants LSR produces better
+; solutions for these test cases, with test1 switching from:
+;
+; The chosen solution requires 2 instructions 2 regs, with addrec cost 1, plus 1 scale cost, plus 4 imm cost, plus 1 setup cost:
+;   LSR Use: Kind=ICmpZero, Offsets={0}, widest fixup type: i64
+;     -7 + reg({(7 + %start)<nsw>,+,1}<nsw><%for.body>)
+;   LSR Use: Kind=Address of float in addrspace(0), Offsets={0}, widest fixup type: float*
+;     reg(%arr) + 4*reg({(7 + %start)<nsw>,+,1}<nsw><%for.body>)
+;
+; to:
+;
+; The chosen solution requires 1 instruction 2 regs, with addrec cost 1, plus 1 scale cost, plus 1 setup cost:
+;   LSR Use: Kind=ICmpZero, Offsets={0}, widest fixup type: i64
+;     reg({%start,+,1}<nsw><%for.body>)
+;   LSR Use: Kind=Address of float in addrspace(0), Offsets={0}, widest fixup type: float*
+;     reg((88888 + %arr)) + 4*reg({%start,+,1}<nsw><%for.body>)
+;
+; and test2 switching from:
+;
+; The chosen solution requires 2 instructions 2 regs, with addrec cost 1, plus 1 base add, plus 1 scale cost:
+;   LSR Use: Kind=ICmpZero, Offsets={0}, widest fixup type: i64
+;     reg({%start,+,1}<nsw><%for.body>)
+;   LSR Use: Kind=Basic, Offsets={0}, widest fixup type: i64
+;     reg({%start,+,1}<nsw><%for.body>)
+;   LSR Use: Kind=Address of float in addrspace(0), Offsets={0}, widest fixup type: float*
+;     reg(%arr) + 4*reg({%start,+,1}<nsw><%for.body>) + imm(28)
+;
+; to:
+;
+; The chosen solution requires 1 instruction 2 regs, with addrec cost 1, plus 1 scale cost, plus 1 setup cost:
+;   LSR Use: Kind=ICmpZero, Offsets={0}, widest fixup type: i64
+;     reg({%start,+,1}<nsw><%for.body>)
+;   LSR Use: Kind=Basic, Offsets={0}, widest fixup type: i64
+;     reg({%start,+,1}<nsw><%for.body>)
+;   LSR Use: Kind=Address of float in addrspace(0), Offsets={0}, widest fixup type: float*
+;     reg((88888 + %arr)) + 4*reg({%start,+,1}<nsw><%for.body>)
 
 ; float test(float *arr, long long start, float threshold) {
 ;   for (long long i = start; i != 0; ++i) {
@@ -21,16 +56,17 @@ define float @test1(float* nocapture readonly %arr, i64 %start, float %threshold
 ; CHECK-NEXT:    fmov s2, #-7.00000000
 ; CHECK-NEXT:    cbz x1, .LBB0_5
 ; CHECK-NEXT:  // %bb.1: // %for.body.preheader
-; CHECK-NEXT:    add x8, x0, #28 // =28
+; CHECK-NEXT:    add x8, x1, #7 // =7
 ; CHECK-NEXT:  .LBB0_2: // %for.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr s1, [x8, x1, lsl #2]
+; CHECK-NEXT:    ldr s1, [x0, x8, lsl #2]
 ; CHECK-NEXT:    fcmp s1, s0
 ; CHECK-NEXT:    b.gt .LBB0_6
 ; CHECK-NEXT:  // %bb.3: // %for.cond
 ; CHECK-NEXT:    // in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT:    add x1, x1, #1 // =1
-; CHECK-NEXT:    cbnz x1, .LBB0_2
+; CHECK-NEXT:    add x8, x8, #1 // =1
+; CHECK-NEXT:    cmp x8, #7 // =7
+; CHECK-NEXT:    b.ne .LBB0_2
 ; CHECK-NEXT:  // %bb.4:
 ; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
@@ -68,27 +104,26 @@ define float @test2(float* nocapture readonly %arr, i64 %start, float %threshold
 ; CHECK-LABEL: test2:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fmov s2, #-7.00000000
-; CHECK-NEXT:    cbz x1, .LBB1_5
-; CHECK-NEXT:  // %bb.1: // %for.body.preheader
-; CHECK-NEXT:    add x8, x0, #28 // =28
-; CHECK-NEXT:  .LBB1_2: // %for.body
+; CHECK-NEXT:    cbz x1, .LBB1_4
+; CHECK-NEXT:  .LBB1_1: // %for.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr s1, [x8, x1, lsl #2]
+; CHECK-NEXT:    add x8, x0, x1, lsl #2
+; CHECK-NEXT:    ldr s1, [x8, #28]
 ; CHECK-NEXT:    scvtf s3, x1
 ; CHECK-NEXT:    fadd s3, s3, s0
 ; CHECK-NEXT:    fcmp s1, s3
-; CHECK-NEXT:    b.gt .LBB1_6
-; CHECK-NEXT:  // %bb.3: // %for.cond
-; CHECK-NEXT:    // in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    b.gt .LBB1_5
+; CHECK-NEXT:  // %bb.2: // %for.cond
+; CHECK-NEXT:    // in Loop: Header=BB1_1 Depth=1
 ; CHECK-NEXT:    add x1, x1, #1 // =1
-; CHECK-NEXT:    cbnz x1, .LBB1_2
-; CHECK-NEXT:  // %bb.4:
+; CHECK-NEXT:    cbnz x1, .LBB1_1
+; CHECK-NEXT:  // %bb.3:
 ; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB1_5:
+; CHECK-NEXT:  .LBB1_4:
 ; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB1_6: // %cleanup4
+; CHECK-NEXT:  .LBB1_5: // %cleanup4
 ; CHECK-NEXT:    mov v0.16b, v1.16b
 ; CHECK-NEXT:    ret
 entry:
-- 
GitLab


From 62af346c35b530d2f3981a476f5ae7e6cb117ed4 Mon Sep 17 00:00:00 2001
From: Eugene Leviant <eleviant@accesssoftek.com>
Date: Wed, 24 Oct 2018 08:59:58 +0000
Subject: [PATCH 0494/1116] [ThinLTO] Change parameter type. NFC

Change destination module type for consistency with r345118


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345124 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/IR/ModuleSummaryIndex.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/IR/ModuleSummaryIndex.cpp b/lib/IR/ModuleSummaryIndex.cpp
index 02661915b2c..d4368413584 100644
--- a/lib/IR/ModuleSummaryIndex.cpp
+++ b/lib/IR/ModuleSummaryIndex.cpp
@@ -242,7 +242,7 @@ void ModuleSummaryIndex::exportToDot(raw_ostream& OS) const {
   };
 
   auto DrawEdge = [&](const char *Pfx, uint64_t SrcMod, GlobalValue::GUID SrcId,
-                      int DstMod, GlobalValue::GUID DstId, int TypeOrHotness) {
+                      uint64_t DstMod, GlobalValue::GUID DstId, int TypeOrHotness) {
     // 0 corresponds to alias edge, 1 to ref edge, 2 to call with unknown
     // hotness, ...
     TypeOrHotness += 2;
-- 
GitLab


From 8f55d0922bba1f85b3f0fa087a317eade3249ae8 Mon Sep 17 00:00:00 2001
From: Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>
Date: Wed, 24 Oct 2018 10:56:47 +0000
Subject: [PATCH 0495/1116] [llvm-mca] [llvm-mca] Improved error handling and
 error reporting from class InstrBuilder.

A new class named InstructionError has been added to Support.h in order to
improve the error reporting from class InstrBuilder.
The llvm-mca driver is responsible for handling InstructionError objects, and
printing them out to stderr.

The goal of this patch is to remove all the remaining error handling logic from
the library code.
In particular, this allows us to:
 - Simplify the logic in InstrBuilder by removing a needless dependency from
MCInstrPrinter.
 - Centralize all the error halding logic in a new function named 'runPipeline'
(see llvm-mca.cpp).

This is also a first step towards generalizing class InstrBuilder, so that in
future, we will be able to reuse its logic to also "lower" MachineInstr to
mca::Instruction objects.

Differential Revision: https://reviews.llvm.org/D53585


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345129 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../llvm-mca/ARM/unsupported-write-variant.s  |  6 ++
 tools/llvm-mca/include/InstrBuilder.h         |  6 +-
 tools/llvm-mca/include/Support.h              | 20 +++++++
 tools/llvm-mca/lib/InstrBuilder.cpp           | 59 ++++++++-----------
 tools/llvm-mca/llvm-mca.cpp                   | 40 ++++++++++---
 5 files changed, 84 insertions(+), 47 deletions(-)
 create mode 100644 test/tools/llvm-mca/ARM/unsupported-write-variant.s

diff --git a/test/tools/llvm-mca/ARM/unsupported-write-variant.s b/test/tools/llvm-mca/ARM/unsupported-write-variant.s
new file mode 100644
index 00000000000..f4511f54ab5
--- /dev/null
+++ b/test/tools/llvm-mca/ARM/unsupported-write-variant.s
@@ -0,0 +1,6 @@
+# RUN: not llvm-mca -march=arm -mcpu=swift -all-views=false 2>&1 < %s | FileCheck %s
+
+add r3, r1, r12, lsl #2
+
+# CHECK:      error: unable to resolve scheduling class for write variant.
+# CHECK-NEXT: note: instruction:    add r3, r1, r12, lsl #2
diff --git a/tools/llvm-mca/include/InstrBuilder.h b/tools/llvm-mca/include/InstrBuilder.h
index ff7fb52044a..9fee94bbb3f 100644
--- a/tools/llvm-mca/include/InstrBuilder.h
+++ b/tools/llvm-mca/include/InstrBuilder.h
@@ -17,7 +17,6 @@
 
 #include "Instruction.h"
 #include "Support.h"
-#include "llvm/MC/MCInstPrinter.h"
 #include "llvm/MC/MCInstrAnalysis.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
@@ -41,7 +40,6 @@ class InstrBuilder {
   const llvm::MCInstrInfo &MCII;
   const llvm::MCRegisterInfo &MRI;
   const llvm::MCInstrAnalysis &MCIA;
-  llvm::MCInstPrinter &MCIP;
   llvm::SmallVector<uint64_t, 8> ProcResourceMasks;
 
   llvm::DenseMap<unsigned short, std::unique_ptr<const InstrDesc>> Descriptors;
@@ -66,8 +64,8 @@ class InstrBuilder {
 public:
   InstrBuilder(const llvm::MCSubtargetInfo &sti, const llvm::MCInstrInfo &mcii,
                const llvm::MCRegisterInfo &mri,
-               const llvm::MCInstrAnalysis &mcia, llvm::MCInstPrinter &mcip)
-      : STI(sti), MCII(mcii), MRI(mri), MCIA(mcia), MCIP(mcip),
+               const llvm::MCInstrAnalysis &mcia)
+      : STI(sti), MCII(mcii), MRI(mri), MCIA(mcia),
         ProcResourceMasks(STI.getSchedModel().getNumProcResourceKinds()) {
     computeProcResourceMasks(STI.getSchedModel(), ProcResourceMasks);
   }
diff --git a/tools/llvm-mca/include/Support.h b/tools/llvm-mca/include/Support.h
index 91c8e1b4177..9371394542d 100644
--- a/tools/llvm-mca/include/Support.h
+++ b/tools/llvm-mca/include/Support.h
@@ -18,9 +18,29 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/MC/MCSchedule.h"
+#include "llvm/Support/Error.h"
 
 namespace mca {
 
+template <typename T>
+class InstructionError : public llvm::ErrorInfo<InstructionError<T>> {
+public:
+  static char ID;
+  std::string Message;
+  const T &Inst;
+
+  InstructionError(std::string M, const T &MCI)
+      : Message(std::move(M)), Inst(MCI) {}
+
+  void log(llvm::raw_ostream &OS) const override { OS << Message; }
+
+  std::error_code convertToErrorCode() const override {
+    return llvm::inconvertibleErrorCode();
+  }
+};
+
+template <typename T> char InstructionError<T>::ID;
+
 /// This class represents the number of cycles per resource (fractions of
 /// cycles).  That quantity is managed here as a ratio, and accessed via the
 /// double cast-operator below.  The two quantities, number of cycles and
diff --git a/tools/llvm-mca/lib/InstrBuilder.cpp b/tools/llvm-mca/lib/InstrBuilder.cpp
index 1cb020a9f6d..55f1ebf6e8a 100644
--- a/tools/llvm-mca/lib/InstrBuilder.cpp
+++ b/tools/llvm-mca/lib/InstrBuilder.cpp
@@ -215,9 +215,8 @@ Error InstrBuilder::populateWrites(InstrDesc &ID, const MCInst &MCI,
   }
 
   if (CurrentDef != NumExplicitDefs) {
-    return make_error<StringError>(
-        "error: Expected more register operand definitions.",
-        inconvertibleErrorCode());
+    return make_error<InstructionError<MCInst>>(
+        "Expected more register operand definitions.", MCI);
   }
 
   CurrentDef = 0;
@@ -253,11 +252,12 @@ Error InstrBuilder::populateWrites(InstrDesc &ID, const MCInst &MCI,
     // Always assume that the optional definition is the last operand of the
     // MCInst sequence.
     const MCOperand &Op = MCI.getOperand(MCI.getNumOperands() - 1);
-    if (i == MCI.getNumOperands() || !Op.isReg())
-      return make_error<StringError>(
-          "error: expected a register operand for an optional "
-          "definition. Instruction has not be correctly analyzed.",
-          inconvertibleErrorCode());
+    if (i == MCI.getNumOperands() || !Op.isReg()) {
+      std::string Message =
+          "expected a register operand for an optional definition. Instruction "
+          "has not been correctly analyzed.";
+      return make_error<InstructionError<MCInst>>(Message, MCI);
+    }
 
     WriteDescriptor &Write = ID.Writes[TotalDefs - 1];
     Write.OpIndex = MCI.getNumOperands() - 1;
@@ -284,9 +284,8 @@ Error InstrBuilder::populateReads(InstrDesc &ID, const MCInst &MCI,
   }
 
   if (NumExplicitDefs) {
-    return make_error<StringError>(
-        "error: Expected more register operand definitions. ",
-        inconvertibleErrorCode());
+    return make_error<InstructionError<MCInst>>(
+        "Expected more register operand definitions.", MCI);
   }
 
   unsigned NumExplicitUses = MCI.getNumOperands() - i;
@@ -332,23 +331,18 @@ Error InstrBuilder::verifyInstrDesc(const InstrDesc &ID,
   if (!UsesMemory && !UsesBuffers && !UsesResources)
     return ErrorSuccess();
 
-  std::string ToString;
-  raw_string_ostream OS(ToString);
+  StringRef Message;
   if (UsesMemory) {
-    WithColor::error() << "found an inconsistent instruction that decodes "
-                       << "into zero opcodes and that consumes load/store "
-                       << "unit resources.\n";
+    Message = "found an inconsistent instruction that decodes "
+              "into zero opcodes and that consumes load/store "
+              "unit resources.";
   } else {
-    WithColor::error() << "found an inconsistent instruction that decodes"
-                       << " to zero opcodes and that consumes scheduler "
-                       << "resources.\n";
+    Message = "found an inconsistent instruction that decodes "
+              "to zero opcodes and that consumes scheduler "
+              "resources.";
   }
 
-  MCIP.printInst(&MCI, OS, "", STI);
-  OS.flush();
-  WithColor::note() << "instruction: " << ToString << '\n';
-  return make_error<StringError>("Invalid instruction definition found",
-                                 inconvertibleErrorCode());
+  return make_error<InstructionError<MCInst>>(Message, MCI);
 }
 
 Expected<const InstrDesc &>
@@ -371,24 +365,17 @@ InstrBuilder::createInstrDescImpl(const MCInst &MCI) {
       SchedClassID = STI.resolveVariantSchedClass(SchedClassID, &MCI, CPUID);
 
     if (!SchedClassID) {
-      return make_error<StringError>("unable to resolve this variant class.",
-                                     inconvertibleErrorCode());
+      return make_error<InstructionError<MCInst>>(
+          "unable to resolve scheduling class for write variant.", MCI);
     }
   }
 
   // Check if this instruction is supported. Otherwise, report an error.
   const MCSchedClassDesc &SCDesc = *SM.getSchedClassDesc(SchedClassID);
   if (SCDesc.NumMicroOps == MCSchedClassDesc::InvalidNumMicroOps) {
-    std::string ToString;
-    raw_string_ostream OS(ToString);
-    WithColor::error() << "found an unsupported instruction in the input"
-                       << " assembly sequence.\n";
-    MCIP.printInst(&MCI, OS, "", STI);
-    OS.flush();
-    WithColor::note() << "instruction: " << ToString << '\n';
-    return make_error<StringError>(
-        "Don't know how to analyze unsupported instructions",
-        inconvertibleErrorCode());
+    return make_error<InstructionError<MCInst>>(
+        "found an unsupported instruction in the input assembly sequence.",
+        MCI);
   }
 
   // Create a new empty descriptor.
diff --git a/tools/llvm-mca/llvm-mca.cpp b/tools/llvm-mca/llvm-mca.cpp
index 59b78ff1545..9ad761e6665 100644
--- a/tools/llvm-mca/llvm-mca.cpp
+++ b/tools/llvm-mca/llvm-mca.cpp
@@ -35,6 +35,7 @@
 #include "Views/TimelineView.h"
 #include "include/Context.h"
 #include "include/Pipeline.h"
+#include "include/Support.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCObjectFileInfo.h"
@@ -326,6 +327,30 @@ static void processViewOptions() {
   processOptionImpl(PrintRetireStats, Default);
 }
 
+// Returns true on success.
+static bool runPipeline(mca::Pipeline &P, MCInstPrinter &MCIP,
+                        const MCSubtargetInfo &STI) {
+  // Handle pipeline errors here.
+  if (auto Err = P.run()) {
+    if (auto NewE = handleErrors(
+            std::move(Err),
+            [&MCIP, &STI](const mca::InstructionError<MCInst> &IE) {
+              std::string InstructionStr;
+              raw_string_ostream SS(InstructionStr);
+              WithColor::error() << IE.Message << '\n';
+              MCIP.printInst(&IE.Inst, SS, "", STI);
+              SS.flush();
+              WithColor::note() << "instruction: " << InstructionStr << '\n';
+            })) {
+      // Default case.
+      WithColor::error() << toString(std::move(NewE));
+    }
+    return false;
+  }
+
+  return true;
+}
+
 int main(int argc, char **argv) {
   InitLLVM X(argc, argv);
 
@@ -462,7 +487,7 @@ int main(int argc, char **argv) {
     Width = DispatchWidth;
 
   // Create an instruction builder.
-  mca::InstrBuilder IB(*STI, *MCII, *MRI, *MCIA, *IP);
+  mca::InstrBuilder IB(*STI, *MCII, *MRI, *MCIA);
 
   // Create a context to control ownership of the pipeline hardware.
   mca::Context MCA(*MRI, *STI);
@@ -504,9 +529,10 @@ int main(int argc, char **argv) {
       }
       Printer.addView(
           llvm::make_unique<mca::ResourcePressureView>(*STI, *IP, S));
-      auto Err = P->run();
-      if (Err)
-        report_fatal_error(toString(std::move(Err)));
+
+      if (!runPipeline(*P, *IP, *STI))
+        return 1;
+
       Printer.printReport(TOF->os());
       continue;
     }
@@ -543,9 +569,9 @@ int main(int argc, char **argv) {
           *STI, *IP, S, TimelineMaxIterations, TimelineMaxCycles));
     }
 
-    auto Err = P->run();
-    if (Err)
-      report_fatal_error(toString(std::move(Err)));
+    if (!runPipeline(*P, *IP, *STI))
+      return 1;
+
     Printer.printReport(TOF->os());
 
     // Clear the InstrBuilder internal state in preparation for another round.
-- 
GitLab


From 2191c153b0abf38d61f5b1bf3ec28873df0ef6f0 Mon Sep 17 00:00:00 2001
From: Guillaume Chatelet <gchatelet@google.com>
Date: Wed, 24 Oct 2018 11:55:06 +0000
Subject: [PATCH 0496/1116] [llvm-exegesis] Implements a cache of Instruction
 objects.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345130 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-exegesis/lib/Latency.cpp           |  2 +-
 tools/llvm-exegesis/lib/LlvmState.cpp         |  1 +
 tools/llvm-exegesis/lib/LlvmState.h           |  3 +++
 tools/llvm-exegesis/lib/MCInstrDescView.cpp   | 19 +++++++++++++----
 tools/llvm-exegesis/lib/MCInstrDescView.h     | 21 +++++++++++++++++--
 tools/llvm-exegesis/llvm-exegesis.cpp         |  2 +-
 .../X86/SnippetGeneratorTest.cpp              |  6 +++---
 7 files changed, 43 insertions(+), 11 deletions(-)

diff --git a/tools/llvm-exegesis/lib/Latency.cpp b/tools/llvm-exegesis/lib/Latency.cpp
index ec92d936de3..602b379faf3 100644
--- a/tools/llvm-exegesis/lib/Latency.cpp
+++ b/tools/llvm-exegesis/lib/Latency.cpp
@@ -49,7 +49,7 @@ computeAliasingInstructions(const LLVMState &State, const Instruction &Instr,
   for (const unsigned OtherOpcode : Opcodes) {
     if (OtherOpcode == Instr.Description->getOpcode())
       continue;
-    const Instruction OtherInstr(State, OtherOpcode);
+    const Instruction &OtherInstr = State.getIC().getInstr(OtherOpcode);
     if (OtherInstr.hasMemoryOperands())
       continue;
     if (Instr.hasAliasingRegistersThrough(OtherInstr))
diff --git a/tools/llvm-exegesis/lib/LlvmState.cpp b/tools/llvm-exegesis/lib/LlvmState.cpp
index ba786cc97ce..58e9db315d5 100644
--- a/tools/llvm-exegesis/lib/LlvmState.cpp
+++ b/tools/llvm-exegesis/lib/LlvmState.cpp
@@ -38,6 +38,7 @@ LLVMState::LLVMState(const std::string &Triple, const std::string &CpuName) {
   }
   RATC.reset(new RegisterAliasingTrackerCache(
       getRegInfo(), getFunctionReservedRegs(getTargetMachine())));
+  IC.reset(new InstructionsCache(getInstrInfo(), getRATC()));
 }
 
 LLVMState::LLVMState()
diff --git a/tools/llvm-exegesis/lib/LlvmState.h b/tools/llvm-exegesis/lib/LlvmState.h
index f8ef8665f44..918738551d0 100644
--- a/tools/llvm-exegesis/lib/LlvmState.h
+++ b/tools/llvm-exegesis/lib/LlvmState.h
@@ -15,6 +15,7 @@
 #ifndef LLVM_TOOLS_LLVM_EXEGESIS_LLVMSTATE_H
 #define LLVM_TOOLS_LLVM_EXEGESIS_LLVMSTATE_H
 
+#include "MCInstrDescView.h"
 #include "RegisterAliasing.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCInst.h"
@@ -57,11 +58,13 @@ public:
     return *TargetMachine->getMCSubtargetInfo();
   }
   const RegisterAliasingTrackerCache &getRATC() const { return *RATC; }
+  const InstructionsCache &getIC() const { return *IC; }
 
 private:
   const ExegesisTarget *TheExegesisTarget;
   std::unique_ptr<const llvm::TargetMachine> TargetMachine;
   std::unique_ptr<const RegisterAliasingTrackerCache> RATC;
+  std::unique_ptr<const InstructionsCache> IC;
 };
 
 } // namespace exegesis
diff --git a/tools/llvm-exegesis/lib/MCInstrDescView.cpp b/tools/llvm-exegesis/lib/MCInstrDescView.cpp
index 2b4624b9b64..e0521af4d19 100644
--- a/tools/llvm-exegesis/lib/MCInstrDescView.cpp
+++ b/tools/llvm-exegesis/lib/MCInstrDescView.cpp
@@ -95,10 +95,10 @@ const llvm::MCOperandInfo &Operand::getExplicitOperandInfo() const {
   return *Info;
 }
 
-Instruction::Instruction(const LLVMState &State, unsigned Opcode)
-    : Description(&State.getInstrInfo().get(Opcode)),
-      Name(State.getInstrInfo().getName(Opcode)) {
-  const auto &RATC = State.getRATC();
+Instruction::Instruction(const llvm::MCInstrInfo &InstrInfo,
+                         const RegisterAliasingTrackerCache &RATC,
+                         unsigned Opcode)
+    : Description(&InstrInfo.get(Opcode)), Name(InstrInfo.getName(Opcode)) {
   unsigned OpIndex = 0;
   for (; OpIndex < Description->getNumOperands(); ++OpIndex) {
     const auto &OpInfo = Description->opInfo_begin()[OpIndex];
@@ -262,6 +262,17 @@ void Instruction::dump(const llvm::MCRegisterInfo &RegInfo,
     Stream << "- hasAliasingRegisters\n";
 }
 
+InstructionsCache::InstructionsCache(const llvm::MCInstrInfo &InstrInfo,
+                                     const RegisterAliasingTrackerCache &RATC)
+    : InstrInfo(InstrInfo), RATC(RATC) {}
+
+const Instruction &InstructionsCache::getInstr(unsigned Opcode) const {
+  auto &Found = Instructions[Opcode];
+  if (!Found)
+    Found.reset(new Instruction(InstrInfo, RATC, Opcode));
+  return *Found;
+}
+
 bool RegisterOperandAssignment::
 operator==(const RegisterOperandAssignment &Other) const {
   return std::tie(Op, Reg) == std::tie(Other.Op, Other.Reg);
diff --git a/tools/llvm-exegesis/lib/MCInstrDescView.h b/tools/llvm-exegesis/lib/MCInstrDescView.h
index 4e8278ba2b5..58efd2a4e41 100644
--- a/tools/llvm-exegesis/lib/MCInstrDescView.h
+++ b/tools/llvm-exegesis/lib/MCInstrDescView.h
@@ -20,8 +20,8 @@
 #define LLVM_TOOLS_LLVM_EXEGESIS_MCINSTRDESCVIEW_H
 
 #include <random>
+#include <unordered_map>
 
-#include "LlvmState.h"
 #include "RegisterAliasing.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/Optional.h"
@@ -94,7 +94,8 @@ struct Operand {
 // A view over an MCInstrDesc offering a convenient interface to compute
 // Register aliasing.
 struct Instruction {
-  Instruction(const LLVMState &State, unsigned Opcode);
+  Instruction(const llvm::MCInstrInfo &InstrInfo,
+              const RegisterAliasingTrackerCache &RATC, unsigned Opcode);
 
   // Returns the Operand linked to this Variable.
   // In case the Variable is tied, the primary (i.e. Def) Operand is returned.
@@ -145,6 +146,22 @@ struct Instruction {
   llvm::BitVector AllUseRegs;  // The set of all aliased use registers.
 };
 
+// Instructions are expensive to instantiate. This class provides a cache of
+// Instructions with lazy construction.
+struct InstructionsCache {
+  InstructionsCache(const llvm::MCInstrInfo &InstrInfo,
+                    const RegisterAliasingTrackerCache &RATC);
+
+  // Returns the Instruction object corresponding to this Opcode.
+  const Instruction &getInstr(unsigned Opcode) const;
+
+private:
+  const llvm::MCInstrInfo &InstrInfo;
+  const RegisterAliasingTrackerCache &RATC;
+  mutable std::unordered_map<unsigned, std::unique_ptr<Instruction>>
+      Instructions;
+};
+
 // Represents the assignment of a Register to an Operand.
 struct RegisterOperandAssignment {
   RegisterOperandAssignment(const Operand *Operand, llvm::MCPhysReg Reg)
diff --git a/tools/llvm-exegesis/llvm-exegesis.cpp b/tools/llvm-exegesis/llvm-exegesis.cpp
index ea991420039..689a1e097c6 100644
--- a/tools/llvm-exegesis/llvm-exegesis.cpp
+++ b/tools/llvm-exegesis/llvm-exegesis.cpp
@@ -147,7 +147,7 @@ getOpcodesOrDie(const llvm::MCInstrInfo &MCInstrInfo) {
 // Generates code snippets for opcode `Opcode`.
 static llvm::Expected<std::vector<BenchmarkCode>>
 generateSnippets(const LLVMState &State, unsigned Opcode) {
-  const Instruction Instr(State, Opcode);
+  const Instruction &Instr = State.getIC().getInstr(Opcode);
   const llvm::MCInstrDesc &InstrDesc = *Instr.Description;
   // Ignore instructions that we cannot run.
   if (InstrDesc.isPseudo())
diff --git a/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp b/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp
index 04517359d8a..1689defded8 100644
--- a/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp
+++ b/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp
@@ -61,7 +61,7 @@ protected:
 
   std::vector<CodeTemplate> checkAndGetCodeTemplates(unsigned Opcode) {
     randomGenerator().seed(0); // Initialize seed.
-    const Instruction Instr(State, Opcode);
+    const Instruction &Instr = State.getIC().getInstr(Opcode);
     auto CodeTemplateOrError = Generator.generateCodeTemplates(Instr);
     EXPECT_FALSE(CodeTemplateOrError.takeError()); // Valid configuration.
     return std::move(CodeTemplateOrError.get());
@@ -341,7 +341,7 @@ TEST_F(UopsSnippetGeneratorTest, MemoryUse_Movsb) {
   // - hasAliasingImplicitRegisters (execution is always serial)
   // - hasAliasingRegisters
   const unsigned Opcode = llvm::X86::MOVSB;
-  const Instruction Instr(State, Opcode);
+  const Instruction &Instr = State.getIC().getInstr(Opcode);
   auto Error = Generator.generateCodeTemplates(Instr).takeError();
   EXPECT_TRUE((bool)Error);
   llvm::consumeError(std::move(Error));
@@ -352,7 +352,7 @@ public:
   FakeSnippetGenerator(const LLVMState &State) : SnippetGenerator(State) {}
 
   Instruction createInstruction(unsigned Opcode) {
-    return Instruction(State, Opcode);
+    return State.getIC().getInstr(Opcode);
   }
 
 private:
-- 
GitLab


From 0fd37a09705babb9b4b79ed49d5d1e6373e2130e Mon Sep 17 00:00:00 2001
From: Martin Storsjo <martin@martin.st>
Date: Wed, 24 Oct 2018 12:22:12 +0000
Subject: [PATCH 0497/1116] [MinGW] Enable large file for mingw-w64

64-bit mingw doesn't define _FILE_OFFSET_BITS=64 by default.

Differential Revision: https://reviews.llvm.org/D53569

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345131 91177308-0d34-0410-b5e6-96231b3b80d8
---
 cmake/modules/HandleLLVMOptions.cmake | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cmake/modules/HandleLLVMOptions.cmake b/cmake/modules/HandleLLVMOptions.cmake
index 27875781d22..05db1b076a1 100644
--- a/cmake/modules/HandleLLVMOptions.cmake
+++ b/cmake/modules/HandleLLVMOptions.cmake
@@ -224,6 +224,10 @@ if(NOT WIN32 AND NOT CYGWIN)
   append_if(SUPPORTS_FVISIBILITY_INLINES_HIDDEN_FLAG "-fvisibility-inlines-hidden" CMAKE_CXX_FLAGS)
 endif()
 
+if(CMAKE_SIZEOF_VOID_P EQUAL 8 AND MINGW)
+  add_definitions( -D_FILE_OFFSET_BITS=64 )
+endif()
+
 if( CMAKE_SIZEOF_VOID_P EQUAL 8 AND NOT WIN32 )
   # TODO: support other platforms and toolchains.
   if( LLVM_BUILD_32_BITS )
-- 
GitLab


From 1dca977606741cfc4c2989f76c4eb7ceb3b64bec Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Wed, 24 Oct 2018 13:13:36 +0000
Subject: [PATCH 0498/1116] [X86][SSE] Update PMULDQ schedule tests to survive
 more aggressive SimplifyDemandedBits

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345136 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/avx2-schedule.ll  | 26 ++++++++------
 test/CodeGen/X86/sse41-schedule.ll | 56 +++++++++++++++++++-----------
 2 files changed, 52 insertions(+), 30 deletions(-)

diff --git a/test/CodeGen/X86/avx2-schedule.ll b/test/CodeGen/X86/avx2-schedule.ll
index 1bfe60e3104..e04eb583087 100644
--- a/test/CodeGen/X86/avx2-schedule.ll
+++ b/test/CodeGen/X86/avx2-schedule.ll
@@ -4734,46 +4734,52 @@ define <4 x i64> @test_pmovzxwq(<8 x i16> %a0, <8 x i16> *%a1) {
   ret <4 x i64> %6
 }
 
-define <4 x i64> @test_pmuldq(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
+define <4 x i64> @test_pmuldq(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> *%a3) {
 ; GENERIC-LABEL: test_pmuldq:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; GENERIC-NEXT:    vpmuldq (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
+; GENERIC-NEXT:    vpmuldq (%rdi), %ymm2, %ymm1 # sched: [12:1.00]
+; GENERIC-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pmuldq:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; HASWELL-NEXT:    vpmuldq (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
+; HASWELL-NEXT:    vpmuldq (%rdi), %ymm2, %ymm1 # sched: [12:1.00]
+; HASWELL-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: test_pmuldq:
 ; BROADWELL:       # %bb.0:
 ; BROADWELL-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; BROADWELL-NEXT:    vpmuldq (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
+; BROADWELL-NEXT:    vpmuldq (%rdi), %ymm2, %ymm1 # sched: [11:1.00]
+; BROADWELL-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: test_pmuldq:
 ; SKYLAKE:       # %bb.0:
 ; SKYLAKE-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
-; SKYLAKE-NEXT:    vpmuldq (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT:    vpmuldq (%rdi), %ymm2, %ymm1 # sched: [11:0.50]
+; SKYLAKE-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: test_pmuldq:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
-; SKX-NEXT:    vpmuldq (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    vpmuldq (%rdi), %ymm2, %ymm1 # sched: [11:0.50]
+; SKX-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_pmuldq:
 ; ZNVER1:       # %bb.0:
+; ZNVER1-NEXT:    vpmuldq (%rdi), %ymm2, %ymm2 # sched: [11:1.00]
 ; ZNVER1-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0 # sched: [4:1.00]
-; ZNVER1-NEXT:    vpmuldq (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
+; ZNVER1-NEXT:    vpor %ymm2, %ymm0, %ymm0 # sched: [1:0.25]
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   %1 = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> %a0, <8 x i32> %a1)
-  %2 = bitcast <4 x i64> %1 to <8 x i32>
-  %3 = load <8 x i32>, <8 x i32> *%a2, align 32
-  %4 = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> %2, <8 x i32> %3)
+  %2 = load <8 x i32>, <8 x i32> *%a3, align 32
+  %3 = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> %a2, <8 x i32> %2)
+  %4 = or <4 x i64> %1, %3
   ret <4 x i64> %4
 }
 declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone
diff --git a/test/CodeGen/X86/sse41-schedule.ll b/test/CodeGen/X86/sse41-schedule.ll
index 3af917491c9..313f6325319 100644
--- a/test/CodeGen/X86/sse41-schedule.ll
+++ b/test/CodeGen/X86/sse41-schedule.ll
@@ -4704,106 +4704,122 @@ define <2 x i64> @test_pmovzxwq(<8 x i16> %a0, <2 x i16> *%a1) {
   ret <2 x i64> %5
 }
 
-define <2 x i64> @test_pmuldq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
+define <2 x i64> @test_pmuldq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> *%a3) {
 ; GENERIC-LABEL: test_pmuldq:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    pmuldq %xmm1, %xmm0 # sched: [5:1.00]
-; GENERIC-NEXT:    pmuldq (%rdi), %xmm0 # sched: [11:1.00]
+; GENERIC-NEXT:    pmuldq (%rdi), %xmm2 # sched: [11:1.00]
+; GENERIC-NEXT:    por %xmm2, %xmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SLM-LABEL: test_pmuldq:
 ; SLM:       # %bb.0:
+; SLM-NEXT:    pmuldq (%rdi), %xmm2 # sched: [7:1.00]
 ; SLM-NEXT:    pmuldq %xmm1, %xmm0 # sched: [4:1.00]
-; SLM-NEXT:    pmuldq (%rdi), %xmm0 # sched: [7:1.00]
+; SLM-NEXT:    por %xmm2, %xmm0 # sched: [1:0.50]
 ; SLM-NEXT:    retq # sched: [4:1.00]
 ;
 ; SANDY-SSE-LABEL: test_pmuldq:
 ; SANDY-SSE:       # %bb.0:
 ; SANDY-SSE-NEXT:    pmuldq %xmm1, %xmm0 # sched: [5:1.00]
-; SANDY-SSE-NEXT:    pmuldq (%rdi), %xmm0 # sched: [11:1.00]
+; SANDY-SSE-NEXT:    pmuldq (%rdi), %xmm2 # sched: [11:1.00]
+; SANDY-SSE-NEXT:    por %xmm2, %xmm0 # sched: [1:0.33]
 ; SANDY-SSE-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: test_pmuldq:
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    vpmuldq (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT:    vpmuldq (%rdi), %xmm2, %xmm1 # sched: [11:1.00]
+; SANDY-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-SSE-LABEL: test_pmuldq:
 ; HASWELL-SSE:       # %bb.0:
 ; HASWELL-SSE-NEXT:    pmuldq %xmm1, %xmm0 # sched: [5:1.00]
-; HASWELL-SSE-NEXT:    pmuldq (%rdi), %xmm0 # sched: [11:1.00]
+; HASWELL-SSE-NEXT:    pmuldq (%rdi), %xmm2 # sched: [11:1.00]
+; HASWELL-SSE-NEXT:    por %xmm2, %xmm0 # sched: [1:0.33]
 ; HASWELL-SSE-NEXT:    retq # sched: [7:1.00]
 ;
 ; HASWELL-LABEL: test_pmuldq:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    vpmuldq (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; HASWELL-NEXT:    vpmuldq (%rdi), %xmm2, %xmm1 # sched: [11:1.00]
+; HASWELL-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-SSE-LABEL: test_pmuldq:
 ; BROADWELL-SSE:       # %bb.0:
 ; BROADWELL-SSE-NEXT:    pmuldq %xmm1, %xmm0 # sched: [5:1.00]
-; BROADWELL-SSE-NEXT:    pmuldq (%rdi), %xmm0 # sched: [10:1.00]
+; BROADWELL-SSE-NEXT:    pmuldq (%rdi), %xmm2 # sched: [10:1.00]
+; BROADWELL-SSE-NEXT:    por %xmm2, %xmm0 # sched: [1:0.33]
 ; BROADWELL-SSE-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: test_pmuldq:
 ; BROADWELL:       # %bb.0:
 ; BROADWELL-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; BROADWELL-NEXT:    vpmuldq (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; BROADWELL-NEXT:    vpmuldq (%rdi), %xmm2, %xmm1 # sched: [10:1.00]
+; BROADWELL-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-SSE-LABEL: test_pmuldq:
 ; SKYLAKE-SSE:       # %bb.0:
 ; SKYLAKE-SSE-NEXT:    pmuldq %xmm1, %xmm0 # sched: [4:0.50]
-; SKYLAKE-SSE-NEXT:    pmuldq (%rdi), %xmm0 # sched: [10:0.50]
+; SKYLAKE-SSE-NEXT:    pmuldq (%rdi), %xmm2 # sched: [10:0.50]
+; SKYLAKE-SSE-NEXT:    por %xmm2, %xmm0 # sched: [1:0.33]
 ; SKYLAKE-SSE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: test_pmuldq:
 ; SKYLAKE:       # %bb.0:
 ; SKYLAKE-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKYLAKE-NEXT:    vpmuldq (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT:    vpmuldq (%rdi), %xmm2, %xmm1 # sched: [10:0.50]
+; SKYLAKE-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-SSE-LABEL: test_pmuldq:
 ; SKX-SSE:       # %bb.0:
 ; SKX-SSE-NEXT:    pmuldq %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-SSE-NEXT:    pmuldq (%rdi), %xmm0 # sched: [10:0.50]
+; SKX-SSE-NEXT:    pmuldq (%rdi), %xmm2 # sched: [10:0.50]
+; SKX-SSE-NEXT:    por %xmm2, %xmm0 # sched: [1:0.33]
 ; SKX-SSE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: test_pmuldq:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vpmuldq (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    vpmuldq (%rdi), %xmm2, %xmm1 # sched: [10:0.50]
+; SKX-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pmuldq:
 ; BTVER2-SSE:       # %bb.0:
+; BTVER2-SSE-NEXT:    pmuldq (%rdi), %xmm2 # sched: [7:1.00]
 ; BTVER2-SSE-NEXT:    pmuldq %xmm1, %xmm0 # sched: [2:1.00]
-; BTVER2-SSE-NEXT:    pmuldq (%rdi), %xmm0 # sched: [7:1.00]
+; BTVER2-SSE-NEXT:    por %xmm2, %xmm0 # sched: [1:0.50]
 ; BTVER2-SSE-NEXT:    retq # sched: [4:1.00]
 ;
 ; BTVER2-LABEL: test_pmuldq:
 ; BTVER2:       # %bb.0:
+; BTVER2-NEXT:    vpmuldq (%rdi), %xmm2, %xmm2 # sched: [7:1.00]
 ; BTVER2-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; BTVER2-NEXT:    vpmuldq (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BTVER2-NEXT:    vpor %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; ZNVER1-SSE-LABEL: test_pmuldq:
 ; ZNVER1-SSE:       # %bb.0:
+; ZNVER1-SSE-NEXT:    pmuldq (%rdi), %xmm2 # sched: [11:1.00]
 ; ZNVER1-SSE-NEXT:    pmuldq %xmm1, %xmm0 # sched: [4:1.00]
-; ZNVER1-SSE-NEXT:    pmuldq (%rdi), %xmm0 # sched: [11:1.00]
+; ZNVER1-SSE-NEXT:    por %xmm2, %xmm0 # sched: [1:0.25]
 ; ZNVER1-SSE-NEXT:    retq # sched: [1:0.50]
 ;
 ; ZNVER1-LABEL: test_pmuldq:
 ; ZNVER1:       # %bb.0:
+; ZNVER1-NEXT:    vpmuldq (%rdi), %xmm2, %xmm2 # sched: [11:1.00]
 ; ZNVER1-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
-; ZNVER1-NEXT:    vpmuldq (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; ZNVER1-NEXT:    vpor %xmm2, %xmm0, %xmm0 # sched: [1:0.25]
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   %1 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %a0, <4 x i32> %a1)
-  %2 = bitcast <2 x i64> %1 to <4 x i32>
-  %3 = load <4 x i32>, <4 x i32> *%a2, align 16
-  %4 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %2, <4 x i32> %3)
+  %2 = load <4 x i32>, <4 x i32> *%a3, align 16
+  %3 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %a2, <4 x i32> %2)
+  %4 = or <2 x i64> %1, %3
   ret <2 x i64> %4
 }
 declare <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32>, <4 x i32>) nounwind readnone
-- 
GitLab


From 306868b91c00a1a2dd7e94462d6aa9b72496dfec Mon Sep 17 00:00:00 2001
From: James Henderson <jh7370@my.bristol.ac.uk>
Date: Wed, 24 Oct 2018 13:16:16 +0000
Subject: [PATCH 0499/1116] Fix llvm-strings crash for negative char values

On Windows at least, llvm-strings was crashing if it encountered bytes
that mapped to negative chars, as it was passing these into
std::isgraph and std::isblank functions, resulting in undefined
behaviour. On debug builds using MSVC, these functions verfiy that the
value passed in is representable as an unsigned char. Since the char is
promoted to an int, a value greater than 127 would turn into a negative
integer value, and fail the check. Using the llvm::isPrint function is
sufficient to solve the issue.

Reviewed by: ruiu, mstorsjo

Differential Revision: https://reviews.llvm.org/D53509


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345137 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/tools/llvm-strings/negative-char.test | 3 +++
 tools/llvm-strings/llvm-strings.cpp        | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)
 create mode 100644 test/tools/llvm-strings/negative-char.test

diff --git a/test/tools/llvm-strings/negative-char.test b/test/tools/llvm-strings/negative-char.test
new file mode 100644
index 00000000000..331dde47078
--- /dev/null
+++ b/test/tools/llvm-strings/negative-char.test
@@ -0,0 +1,3 @@
+# RUN: echo -e "z\0\x80\0a\0" | llvm-strings --bytes 1 - | FileCheck %s
+# CHECK: z{{$}}
+# CHECK-NEXT: {{^}} a
diff --git a/tools/llvm-strings/llvm-strings.cpp b/tools/llvm-strings/llvm-strings.cpp
index 8e2d213bcc7..c355caf899d 100644
--- a/tools/llvm-strings/llvm-strings.cpp
+++ b/tools/llvm-strings/llvm-strings.cpp
@@ -80,7 +80,7 @@ static void strings(raw_ostream &OS, StringRef FileName, StringRef Contents) {
   const char *B = Contents.begin();
   const char *P = nullptr, *E = nullptr, *S = nullptr;
   for (P = Contents.begin(), E = Contents.end(); P < E; ++P) {
-    if (std::isgraph(*P) || std::isblank(*P)) {
+    if (isPrint(*P) || *P == '\t') {
       if (S == nullptr)
         S = P;
     } else if (S) {
-- 
GitLab


From 99402db0aa1637d923a6d1376dbc857a43a9adac Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev@hotmail.com>
Date: Wed, 24 Oct 2018 14:04:00 +0000
Subject: [PATCH 0500/1116] [DEBUGINFO, NVPTX] Try to pack bytes data into a
 single string.

Summary:
If the target does not support `.asciz` and `.ascii` directives, the
strings are represented as bytes and each byte is placed on the new line
as a separate byte directive `.b8 <data>`. NVPTX target allows to
represent the vector of the data of the same type as a vector, where
values are separated using `,` symbol: `.b8 <data1>,<data2>,...`. This
allows to reduce the size of the final PTX file. Ptxas tool includes ptx
files into the resulting binary object, so reducing the size of the PTX
file is important.

Reviewers: tra, jlebar, echristo

Subscribers: jholewinski, llvm-commits

Differential Revision: https://reviews.llvm.org/D45822

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345142 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/MC/MCStreamer.h                  |    5 +
 lib/CodeGen/AsmPrinter/DIE.cpp                |    3 +-
 lib/MC/MCAsmStreamer.cpp                      |   12 +-
 lib/MC/MCStreamer.cpp                         |   12 +
 .../MCTargetDesc/NVPTXTargetStreamer.cpp      |   27 +
 .../NVPTX/MCTargetDesc/NVPTXTargetStreamer.h  |    4 +
 test/DebugInfo/NVPTX/cu-range-hole.ll         |   72 +-
 test/DebugInfo/NVPTX/dbg-declare-alloca.ll    |   39 +-
 test/DebugInfo/NVPTX/debug-file-loc.ll        |   19 +-
 test/DebugInfo/NVPTX/debug-info.ll            | 4906 +++--------------
 test/DebugInfo/NVPTX/debug-loc-offset.ll      |  167 +-
 11 files changed, 736 insertions(+), 4530 deletions(-)

diff --git a/include/llvm/MC/MCStreamer.h b/include/llvm/MC/MCStreamer.h
index 2e9a9d61c67..d66a89f76a7 100644
--- a/include/llvm/MC/MCStreamer.h
+++ b/include/llvm/MC/MCStreamer.h
@@ -109,6 +109,11 @@ public:
 
   virtual void emitValue(const MCExpr *Value);
 
+  /// Emit the bytes in \p Data into the output.
+  ///
+  /// This is used to emit bytes in \p Data as sequence of .byte directives.
+  virtual void emitRawBytes(StringRef Data);
+
   virtual void finish();
 };
 
diff --git a/lib/CodeGen/AsmPrinter/DIE.cpp b/lib/CodeGen/AsmPrinter/DIE.cpp
index ca3a7506789..301fd9ef81b 100644
--- a/lib/CodeGen/AsmPrinter/DIE.cpp
+++ b/lib/CodeGen/AsmPrinter/DIE.cpp
@@ -589,8 +589,7 @@ void DIEString::print(raw_ostream &O) const {
 //===----------------------------------------------------------------------===//
 void DIEInlineString::EmitValue(const AsmPrinter *AP, dwarf::Form Form) const {
   if (Form == dwarf::DW_FORM_string) {
-    for (char ch : S)
-      AP->emitInt8(ch);
+    AP->OutStreamer->EmitBytes(S);
     AP->emitInt8(0);
     return;
   }
diff --git a/lib/MC/MCAsmStreamer.cpp b/lib/MC/MCAsmStreamer.cpp
index c4744ac5d51..f75a8e077e4 100644
--- a/lib/MC/MCAsmStreamer.cpp
+++ b/lib/MC/MCAsmStreamer.cpp
@@ -858,10 +858,14 @@ void MCAsmStreamer::EmitBytes(StringRef Data) {
   // supported, emit as vector of 8bits data.
   if (Data.size() == 1 ||
       !(MAI->getAscizDirective() || MAI->getAsciiDirective())) {
-    const char *Directive = MAI->getData8bitsDirective();
-    for (const unsigned char C : Data.bytes()) {
-      OS << Directive << (unsigned)C;
-      EmitEOL();
+    if (MCTargetStreamer *TS = getTargetStreamer()) {
+      TS->emitRawBytes(Data);
+    } else {
+      const char *Directive = MAI->getData8bitsDirective();
+      for (const unsigned char C : Data.bytes()) {
+        OS << Directive << (unsigned)C;
+        EmitEOL();
+      }
     }
     return;
   }
diff --git a/lib/MC/MCStreamer.cpp b/lib/MC/MCStreamer.cpp
index bfcf6d47a78..1b704b89320 100644
--- a/lib/MC/MCStreamer.cpp
+++ b/lib/MC/MCStreamer.cpp
@@ -72,6 +72,18 @@ void MCTargetStreamer::emitValue(const MCExpr *Value) {
   Streamer.EmitRawText(OS.str());
 }
 
+void MCTargetStreamer::emitRawBytes(StringRef Data) {
+  const MCAsmInfo *MAI = Streamer.getContext().getAsmInfo();
+  const char *Directive = MAI->getData8bitsDirective();
+  for (const unsigned char C : Data.bytes()) {
+    SmallString<128> Str;
+    raw_svector_ostream OS(Str);
+
+    OS << Directive << (unsigned)C;
+    Streamer.EmitRawText(OS.str());
+  }
+}
+
 void MCTargetStreamer::emitAssignment(MCSymbol *Symbol, const MCExpr *Value) {}
 
 MCStreamer::MCStreamer(MCContext &Ctx)
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp
index aeb90eca3a0..71ca7a5ca8d 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp
@@ -92,3 +92,30 @@ void NVPTXTargetStreamer::changeSection(const MCSection *CurSection,
     OS << "//\t{\n";
   }
 }
+
+void NVPTXTargetStreamer::emitRawBytes(StringRef Data) {
+  const MCAsmInfo *MAI = Streamer.getContext().getAsmInfo();
+  const char *Directive = MAI->getData8bitsDirective();
+  unsigned NumElements = Data.size();
+  const unsigned MaxLen = 40;
+  unsigned NumChunks = 1 + ((NumElements - 1) / MaxLen);
+  // Split the very long directives into several parts if the limit is
+  // specified.
+  for (unsigned I = 0; I < NumChunks; ++I) {
+    SmallString<128> Str;
+    raw_svector_ostream OS(Str);
+
+    const char *Label = Directive;
+    for (auto It = std::next(Data.bytes_begin(), I * MaxLen),
+              End = (I == NumChunks - 1)
+                        ? Data.bytes_end()
+                        : std::next(Data.bytes_begin(), (I + 1) * MaxLen);
+         It != End; ++It) {
+      OS << Label << (unsigned)*It;
+      if (Label == Directive)
+        Label = ",";
+    }
+    Streamer.EmitRawText(OS.str());
+  }
+}
+
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.h b/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.h
index 30831ab8bbe..34391a8b9ab 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.h
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.h
@@ -39,6 +39,10 @@ public:
   void emitDwarfFileDirective(StringRef Directive) override;
   void changeSection(const MCSection *CurSection, MCSection *Section,
                      const MCExpr *SubSection, raw_ostream &OS) override;
+  /// Emit the bytes in \p Data into the output.
+  ///
+  /// This is used to emit bytes in \p Data as sequence of .byte directives.
+  void emitRawBytes(StringRef Data) override;
 };
 
 } // end namespace llvm
diff --git a/test/DebugInfo/NVPTX/cu-range-hole.ll b/test/DebugInfo/NVPTX/cu-range-hole.ll
index 01d038477c0..c8ea509396a 100644
--- a/test/DebugInfo/NVPTX/cu-range-hole.ll
+++ b/test/DebugInfo/NVPTX/cu-range-hole.ll
@@ -148,75 +148,15 @@ entry:
 ; CHECK: // .b32 .debug_abbrev                   // Offset Into Abbrev. Section
 ; CHECK: // .b8 8                                // Address Size (in bytes)
 ; CHECK: // .b8 1                                // Abbrev [1] 0xb:0xb0 DW_TAG_compile_unit
-; CHECK: // .b8 99                               // DW_AT_producer
-; CHECK: // .b8 108
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 103
-; CHECK: // .b8 32
-; CHECK: // .b8 118
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 111
-; CHECK: // .b8 110
-; CHECK: // .b8 32
-; CHECK: // .b8 51
-; CHECK: // .b8 46
-; CHECK: // .b8 53
-; CHECK: // .b8 46
-; CHECK: // .b8 48
-; CHECK: // .b8 32
-; CHECK: // .b8 40
-; CHECK: // .b8 116
-; CHECK: // .b8 114
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK: // .b8 107
-; CHECK: // .b8 32
-; CHECK: // .b8 50
-; CHECK: // .b8 48
-; CHECK: // .b8 52
-; CHECK: // .b8 49
-; CHECK: // .b8 54
-; CHECK: // .b8 52
-; CHECK: // .b8 41
-; CHECK: // .b8 32
-; CHECK: // .b8 40
-; CHECK: // .b8 108
-; CHECK: // .b8 108
-; CHECK: // .b8 118
-; CHECK: // .b8 109
-; CHECK: // .b8 47
-; CHECK: // .b8 116
-; CHECK: // .b8 114
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK: // .b8 107
-; CHECK: // .b8 32
-; CHECK: // .b8 50
-; CHECK: // .b8 48
-; CHECK: // .b8 52
-; CHECK: // .b8 49
-; CHECK: // .b8 56
-; CHECK: // .b8 51
-; CHECK: // .b8 41
+; CHECK: // .b8 99,108,97,110,103,32,118,101,114,115,105,111,110,32,51,46,53,46,48,32,40,116,114,117,110,107,32,50,48,52,49,54,52,41,32,40,108,108,118,109 // DW_AT_producer
+; CHECK: // .b8 47,116,114,117,110,107,32,50,48,52,49,56,51,41
 ; CHECK: // .b8 0
 ; CHECK: // .b8 12                               // DW_AT_language
 ; CHECK: // .b8 0
-; CHECK: // .b8 98                               // DW_AT_name
-; CHECK: // .b8 46
-; CHECK: // .b8 99
+; CHECK: // .b8 98,46,99                         // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b32 .debug_line                     // DW_AT_stmt_list
-; CHECK: // .b8 47                               // DW_AT_comp_dir
-; CHECK: // .b8 115
-; CHECK: // .b8 111
-; CHECK: // .b8 117
-; CHECK: // .b8 114
-; CHECK: // .b8 99
-; CHECK: // .b8 101
+; CHECK: // .b8 47,115,111,117,114,99,101        // DW_AT_comp_dir
 ; CHECK: // .b8 0
 ; CHECK: // .b64 Lfunc_begin0                    // DW_AT_low_pc
 ; CHECK: // .b64 Lfunc_end2                      // DW_AT_high_pc
@@ -259,9 +199,7 @@ entry:
 ; CHECK: // .b32 179                             // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 4                                // Abbrev [4] 0xb3:0x7 DW_TAG_base_type
-; CHECK: // .b8 105                              // DW_AT_name
-; CHECK: // .b8 110
-; CHECK: // .b8 116
+; CHECK: // .b8 105,110,116                      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 5                                // DW_AT_encoding
 ; CHECK: // .b8 4                                // DW_AT_byte_size
diff --git a/test/DebugInfo/NVPTX/dbg-declare-alloca.ll b/test/DebugInfo/NVPTX/dbg-declare-alloca.ll
index 8a6fddddd88..9a4beed23d7 100644
--- a/test/DebugInfo/NVPTX/dbg-declare-alloca.ll
+++ b/test/DebugInfo/NVPTX/dbg-declare-alloca.ll
@@ -129,23 +129,14 @@
 ; CHECK: // .b32 .debug_abbrev                   // Offset Into Abbrev. Section
 ; CHECK: // .b8 8                                // Address Size (in bytes)
 ; CHECK: // .b8 1                                // Abbrev [1] 0xb:0x80 DW_TAG_compile_unit
-; CHECK: // .b8 99                               // DW_AT_producer
-; CHECK: // .b8 108
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 103
+; CHECK: // .b8 99,108,97,110,103                // DW_AT_producer
 ; CHECK: // .b8 0
 ; CHECK: // .b8 12                               // DW_AT_language
 ; CHECK: // .b8 0
-; CHECK: // .b8 116                              // DW_AT_name
-; CHECK: // .b8 46
-; CHECK: // .b8 99
+; CHECK: // .b8 116,46,99                        // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b32 .debug_line                     // DW_AT_stmt_list
-; CHECK: // .b8 116                              // DW_AT_comp_dir
-; CHECK: // .b8 101
-; CHECK: // .b8 115
-; CHECK: // .b8 116
+; CHECK: // .b8 116,101,115,116                  // DW_AT_comp_dir
 ; CHECK: // .b8 0
 ; CHECK: // .b64 Lfunc_begin0                    // DW_AT_low_pc
 ; CHECK: // .b64 Lfunc_end0                      // DW_AT_high_pc
@@ -154,21 +145,7 @@
 ; CHECK: // .b64 Lfunc_end0                      // DW_AT_high_pc
 ; CHECK: // .b8 1                                // DW_AT_frame_base
 ; CHECK: // .b8 156
-; CHECK: // .b8 117                              // DW_AT_name
-; CHECK: // .b8 115
-; CHECK: // .b8 101
-; CHECK: // .b8 95
-; CHECK: // .b8 100
-; CHECK: // .b8 98
-; CHECK: // .b8 103
-; CHECK: // .b8 95
-; CHECK: // .b8 100
-; CHECK: // .b8 101
-; CHECK: // .b8 99
-; CHECK: // .b8 108
-; CHECK: // .b8 97
-; CHECK: // .b8 114
-; CHECK: // .b8 101
+; CHECK: // .b8 117,115,101,95,100,98,103,95,100,101,99,108,97,114,101 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 3                                // DW_AT_decl_line
@@ -187,9 +164,7 @@
 ; CHECK: // .b32 110                             // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 4                                // Abbrev [4] 0x6e:0x15 DW_TAG_structure_type
-; CHECK: // .b8 70                               // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 111
+; CHECK: // .b8 70,111,111                       // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_byte_size
 ; CHECK: // .b8 1                                // DW_AT_decl_file
@@ -205,9 +180,7 @@
 ; CHECK: // .b8 0
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 6                                // Abbrev [6] 0x83:0x7 DW_TAG_base_type
-; CHECK: // .b8 105                              // DW_AT_name
-; CHECK: // .b8 110
-; CHECK: // .b8 116
+; CHECK: // .b8 105,110,116                      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 5                                // DW_AT_encoding
 ; CHECK: // .b8 4                                // DW_AT_byte_size
diff --git a/test/DebugInfo/NVPTX/debug-file-loc.ll b/test/DebugInfo/NVPTX/debug-file-loc.ll
index 16753e76322..a9ea67c3388 100644
--- a/test/DebugInfo/NVPTX/debug-file-loc.ll
+++ b/test/DebugInfo/NVPTX/debug-file-loc.ll
@@ -63,25 +63,10 @@ bb:
 ; CHECK: // .b8 0                                // DW_AT_producer
 ; CHECK: // .b8 4                                // DW_AT_language
 ; CHECK: // .b8 0
-; CHECK: // .b8 98                               // DW_AT_name
-; CHECK: // .b8 97
-; CHECK: // .b8 114
-; CHECK: // .b8 46
-; CHECK: // .b8 99
-; CHECK: // .b8 117
+; CHECK: // .b8 98,97,114,46,99,117              // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b32 .debug_line                     // DW_AT_stmt_list
-; CHECK: // .b8 47                               // DW_AT_comp_dir
-; CHECK: // .b8 115
-; CHECK: // .b8 111
-; CHECK: // .b8 117
-; CHECK: // .b8 114
-; CHECK: // .b8 99
-; CHECK: // .b8 101
-; CHECK: // .b8 47
-; CHECK: // .b8 100
-; CHECK: // .b8 105
-; CHECK: // .b8 114
+; CHECK: // .b8 47,115,111,117,114,99,101,47,100,105,114                // DW_AT_comp_dir
 ; CHECK: // .b8 0
 ; CHECK: // .b64 Lfunc_begin0                    // DW_AT_low_pc
 ; CHECK: // .b64 Lfunc_end1                      // DW_AT_high_pc
diff --git a/test/DebugInfo/NVPTX/debug-info.ll b/test/DebugInfo/NVPTX/debug-info.ll
index d5dee4055f0..02e6240aa3e 100644
--- a/test/DebugInfo/NVPTX/debug-info.ll
+++ b/test/DebugInfo/NVPTX/debug-info.ll
@@ -158,8 +158,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 5                                // Abbreviation Code
 ; CHECK: // .b8 46                               // DW_TAG_subprogram
 ; CHECK: // .b8 1                                // DW_CHILDREN_yes
-; CHECK: // .b8 135                              // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 64
+; CHECK: // .b8 135,64                           // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 8                                // DW_FORM_string
 ; CHECK: // .b8 3                                // DW_AT_name
 ; CHECK: // .b8 8                                // DW_FORM_string
@@ -281,8 +280,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 12                               // DW_FORM_flag
 ; CHECK: // .b8 63                               // DW_AT_external
 ; CHECK: // .b8 12                               // DW_FORM_flag
-; CHECK: // .b8 135                              // DW_AT_noreturn
-; CHECK: // .b8 1
+; CHECK: // .b8 135,1                            // DW_AT_noreturn
 ; CHECK: // .b8 12                               // DW_FORM_flag
 ; CHECK: // .b8 0                                // EOM(1)
 ; CHECK: // .b8 0                                // EOM(2)
@@ -351,8 +349,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 12                               // DW_FORM_flag
 ; CHECK: // .b8 63                               // DW_AT_external
 ; CHECK: // .b8 12                               // DW_FORM_flag
-; CHECK: // .b8 135                              // DW_AT_noreturn
-; CHECK: // .b8 1
+; CHECK: // .b8 135,1                            // DW_AT_noreturn
 ; CHECK: // .b8 12                               // DW_FORM_flag
 ; CHECK: // .b8 0                                // EOM(1)
 ; CHECK: // .b8 0                                // EOM(2)
@@ -391,8 +388,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 25                               // Abbreviation Code
 ; CHECK: // .b8 46                               // DW_TAG_subprogram
 ; CHECK: // .b8 1                                // DW_CHILDREN_yes
-; CHECK: // .b8 135                              // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 64
+; CHECK: // .b8 135,64                           // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 8                                // DW_FORM_string
 ; CHECK: // .b8 3                                // DW_AT_name
 ; CHECK: // .b8 8                                // DW_FORM_string
@@ -411,8 +407,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 26                               // Abbreviation Code
 ; CHECK: // .b8 46                               // DW_TAG_subprogram
 ; CHECK: // .b8 1                                // DW_CHILDREN_yes
-; CHECK: // .b8 135                              // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 64
+; CHECK: // .b8 135,64                           // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 8                                // DW_FORM_string
 ; CHECK: // .b8 3                                // DW_AT_name
 ; CHECK: // .b8 8                                // DW_FORM_string
@@ -442,8 +437,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 28                               // Abbreviation Code
 ; CHECK: // .b8 46                               // DW_TAG_subprogram
 ; CHECK: // .b8 0                                // DW_CHILDREN_no
-; CHECK: // .b8 135                              // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 64
+; CHECK: // .b8 135,64                           // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 8                                // DW_FORM_string
 ; CHECK: // .b8 3                                // DW_AT_name
 ; CHECK: // .b8 8                                // DW_FORM_string
@@ -488,8 +482,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 31                               // Abbreviation Code
 ; CHECK: // .b8 46                               // DW_TAG_subprogram
 ; CHECK: // .b8 1                                // DW_CHILDREN_yes
-; CHECK: // .b8 135                              // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 64
+; CHECK: // .b8 135,64                           // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 8                                // DW_FORM_string
 ; CHECK: // .b8 3                                // DW_AT_name
 ; CHECK: // .b8 8                                // DW_FORM_string
@@ -508,8 +501,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 32                               // Abbreviation Code
 ; CHECK: // .b8 46                               // DW_TAG_subprogram
 ; CHECK: // .b8 1                                // DW_CHILDREN_yes
-; CHECK: // .b8 135                              // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 64
+; CHECK: // .b8 135,64                           // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 8                                // DW_FORM_string
 ; CHECK: // .b8 3                                // DW_AT_name
 ; CHECK: // .b8 8                                // DW_FORM_string
@@ -574,8 +566,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 37                               // Abbreviation Code
 ; CHECK: // .b8 46                               // DW_TAG_subprogram
 ; CHECK: // .b8 1                                // DW_CHILDREN_yes
-; CHECK: // .b8 135                              // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 64
+; CHECK: // .b8 135,64                           // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 8                                // DW_FORM_string
 ; CHECK: // .b8 3                                // DW_AT_name
 ; CHECK: // .b8 8                                // DW_FORM_string
@@ -594,8 +585,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 38                               // Abbreviation Code
 ; CHECK: // .b8 46                               // DW_TAG_subprogram
 ; CHECK: // .b8 1                                // DW_CHILDREN_yes
-; CHECK: // .b8 135                              // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 64
+; CHECK: // .b8 135,64                           // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 8                                // DW_FORM_string
 ; CHECK: // .b8 3                                // DW_AT_name
 ; CHECK: // .b8 8                                // DW_FORM_string
@@ -631,8 +621,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_FORM_addr
 ; CHECK: // .b8 64                               // DW_AT_frame_base
 ; CHECK: // .b8 10                               // DW_FORM_block1
-; CHECK: // .b8 135                              // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 64
+; CHECK: // .b8 135,64                           // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 8                                // DW_FORM_string
 ; CHECK: // .b8 3                                // DW_AT_name
 ; CHECK: // .b8 8                                // DW_FORM_string
@@ -695,9 +684,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 0                                // EOM(1)
 ; CHECK: // .b8 0                                // EOM(2)
 ; CHECK: // .b8 0                                // EOM(3)
-; CHECK: //	}
-; CHECK: //	.section	.debug_info
-; CHECK: //	{
+; CHECK: // }
+; CHECK: // .section .debug_info
+; CHECK: // {
 ; CHECK: // .b32 10025                           // Length of Unit
 ; CHECK: // .b8 2                                // DWARF version number
 ; CHECK: // .b8 0
@@ -707,43 +696,15 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 0                                // DW_AT_producer
 ; CHECK: // .b8 4                                // DW_AT_language
 ; CHECK: // .b8 0
-; CHECK: // .b8 100                              // DW_AT_name
-; CHECK: // .b8 101
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 103
-; CHECK: // .b8 45
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 102
-; CHECK: // .b8 111
-; CHECK: // .b8 46
-; CHECK: // .b8 99
-; CHECK: // .b8 117
+; CHECK: // .b8 100,101,98,117,103,45,105,110,102,111,46,99,117 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b32 .debug_line                     // DW_AT_stmt_list
-; CHECK: // .b8 47                               // DW_AT_comp_dir
-; CHECK: // .b8 115
-; CHECK: // .b8 111
-; CHECK: // .b8 109
-; CHECK: // .b8 101
-; CHECK: // .b8 47
-; CHECK: // .b8 100
-; CHECK: // .b8 105
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 99
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 114
-; CHECK: // .b8 121
+; CHECK: // .b8 47,115,111,109,101,47,100,105,114,101,99,116,111,114,121 // DW_AT_comp_dir
 ; CHECK: // .b8 0
 ; CHECK: // .b64 Lfunc_begin0                    // DW_AT_low_pc
 ; CHECK: // .b64 Lfunc_end0                      // DW_AT_high_pc
 ; CHECK: // .b8 2                                // Abbrev [2] 0x41:0x588 DW_TAG_namespace
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 116
-; CHECK: // .b8 100
+; CHECK: // .b8 115,116,100                      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 3                                // Abbrev [3] 0x46:0x7 DW_TAG_imported_declaration
 ; CHECK: // .b8 1                                // DW_AT_decl_file
@@ -1432,7 +1393,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 10                               // DW_AT_decl_file
 ; CHECK: // .b8 172                              // DW_AT_decl_line
 ; CHECK: // .b8 1
-; CHECK:  / .b32 6628                            // DW_AT_import
+; CHECK: // .b32 6628                            // DW_AT_import
 ; CHECK: // .b8 4                                // Abbrev [4] 0x4d0:0x8 DW_TAG_imported_declaration
 ; CHECK: // .b8 10                               // DW_AT_decl_file
 ; CHECK: // .b8 173                              // DW_AT_decl_line
@@ -1472,7 +1433,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 10                               // DW_AT_decl_file
 ; CHECK: // .b8 180                              // DW_AT_decl_line
 ; CHECK: // .b8 1
-; CHECK:  / .b32 6931                            // DW_AT_import
+; CHECK: // .b32 6931                            // DW_AT_import
 ; CHECK: // .b8 4                                // Abbrev [4] 0x510:0x8 DW_TAG_imported_declaration
 ; CHECK: // .b8 10                               // DW_AT_decl_file
 ; CHECK: // .b8 181                              // DW_AT_decl_line
@@ -1506,7 +1467,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 4                                // Abbrev [4] 0x540:0x8 DW_TAG_imported_declaration
 ; CHECK: // .b8 10                               // DW_AT_decl_file
 ; CHECK: // .b8 187                              // DW_AT_decl_line
-; CHECK:  / .b8 1
+; CHECK: // .b8 1
 ; CHECK: // .b32 7163                            // DW_AT_import
 ; CHECK: // .b8 4                                // Abbrev [4] 0x548:0x8 DW_TAG_imported_declaration
 ; CHECK: // .b8 10                               // DW_AT_decl_file
@@ -1529,7 +1490,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1
 ; CHECK: // .b32 7330                            // DW_AT_import
 ; CHECK: // .b8 4                                // Abbrev [4] 0x568:0x8 DW_TAG_imported_declaration
-; CHECK:  / .b8 10                               // DW_AT_decl_file
+; CHECK: // .b8 10                               // DW_AT_decl_file
 ; CHECK: // .b8 192                              // DW_AT_decl_line
 ; CHECK: // .b8 1
 ; CHECK: // .b32 7379                            // DW_AT_import
@@ -1554,7 +1515,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1
 ; CHECK: // .b32 7538                            // DW_AT_import
 ; CHECK: // .b8 4                                // Abbrev [4] 0x590:0x8 DW_TAG_imported_declaration
-; CHECK:  / .b8 10                               // DW_AT_decl_file
+; CHECK: // .b8 10                               // DW_AT_decl_file
 ; CHECK: // .b8 197                              // DW_AT_decl_line
 ; CHECK: // .b8 1
 ; CHECK: // .b32 7580                            // DW_AT_import
@@ -1577,7 +1538,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 10                               // DW_AT_decl_file
 ; CHECK: // .b8 201                              // DW_AT_decl_line
 ; CHECK: // .b8 1
-; CHECK:  / .b32 7704                            // DW_AT_import
+; CHECK: // .b32 7704                            // DW_AT_import
 ; CHECK: // .b8 4                                // Abbrev [4] 0x5b8:0x8 DW_TAG_imported_declaration
 ; CHECK: // .b8 10                               // DW_AT_decl_file
 ; CHECK: // .b8 202                              // DW_AT_decl_line
@@ -1590,20 +1551,11 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 7772                            // DW_AT_import
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x5c9:0x1b DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 51
-; CHECK: // .b8 97
-; CHECK: // .b8 98
-; CHECK: // .b8 115
-; CHECK: // .b8 120
+; CHECK: // .b8 95,90,76,51,97,98,115,120        // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 98
-; CHECK: // .b8 115
+; CHECK: // .b8 97,98,115                        // DW_AT_name
 ; CHECK: // .b8 0
-; CHECK:  / .b8 1                                // DW_AT_decl_file
+; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 44                               // DW_AT_decl_line
 ; CHECK: // .b32 1508                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
@@ -1611,37 +1563,14 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1508                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 7                                // Abbrev [7] 0x5e4:0x11 DW_TAG_base_type
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 110
-; CHECK: // .b8 103
-; CHECK: // .b8 32
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 110
-; CHECK: // .b8 103
-; CHECK: // .b8 32
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
+; CHECK: // .b8 108,111,110,103,32,108,111,110,103,32,105,110,116 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 5                                // DW_AT_encoding
 ; CHECK: // .b8 8                                // DW_AT_byte_size
 ; CHECK: // .b8 5                                // Abbrev [5] 0x5f5:0x1d DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK:  / .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 97
-; CHECK: // .b8 99
-; CHECK: // .b8 111
-; CHECK: // .b8 115
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,52,97,99,111,115,102    // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 99
-; CHECK: // .b8 111
-; CHECK: // .b8 115
+; CHECK: // .b8 97,99,111,115                    // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 46                               // DW_AT_decl_line
@@ -1651,32 +1580,15 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 7                                // Abbrev [7] 0x612:0x9 DW_TAG_base_type
-; CHECK: // .b8 102                              // DW_AT_name
-; CHECK: // .b8 108
-; CHECK:  / .b8 111
-; CHECK: // .b8 97
-; CHECK: // .b8 116
+; CHECK: // .b8 102,108,111,97,116               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_encoding
 ; CHECK: // .b8 4                                // DW_AT_byte_size
 ; CHECK: // .b8 5                                // Abbrev [5] 0x61b:0x1f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 97
-; CHECK: // .b8 99
-; CHECK: // .b8 111
-; CHECK: // .b8 115
-; CHECK: // .b8 104
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,97,99,111,115,104,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 97,99,111,115,104                // DW_AT_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 99
-; CHECK: // .b8 111
-; CHECK: // .b8 115
-; CHECK: // .b8 104
-; CHECK:  / .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 48                               // DW_AT_decl_line
 ; CHECK: // .b32 1554                            // DW_AT_type
@@ -1685,20 +1597,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x63a:0x1d DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 97
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,52,97,115,105,110,102   // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 110
+; CHECK: // .b8 97,115,105,110                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 50                               // DW_AT_decl_line
@@ -1708,22 +1609,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x657:0x1f DW_TAG_subprogram
-; CHECK:  / .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 97
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 104
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,97,115,105,110,104,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 104
+; CHECK: // .b8 97,115,105,110,104               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 52                               // DW_AT_decl_line
@@ -1731,22 +1619,11 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0x670:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
-; CHECK:  / .b8 0                                // End Of Children Mark
+; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x676:0x1d DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,52,97,116,97,110,102    // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 110
+; CHECK: // .b8 97,116,97,110                    // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 56                               // DW_AT_decl_line
@@ -1754,28 +1631,14 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0x68d:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
-; CHECK:  / .b8 0                                // End Of Children Mark
+; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x693:0x25 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 50
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,97,116,97,110,50,102,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 50
+; CHECK: // .b8 97,116,97,110,50                 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
-; CHECK:  / .b8 54                               // DW_AT_decl_line
+; CHECK: // .b8 54                               // DW_AT_decl_line
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0x6ad:0x5 DW_TAG_formal_parameter
@@ -1784,22 +1647,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x6b8:0x1f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 104
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,97,116,97,110,104,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK:  / .b8 110
-; CHECK: // .b8 104
+; CHECK: // .b8 97,116,97,110,104                // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 58                               // DW_AT_decl_line
@@ -1809,20 +1659,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x6d7:0x1d DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 99
-; CHECK: // .b8 98
-; CHECK: // .b8 114
-; CHECK: // .b8 116
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK:  / .b8 99                               // DW_AT_name
-; CHECK: // .b8 98
-; CHECK: // .b8 114
-; CHECK: // .b8 116
+; CHECK: // .b8 95,90,76,52,99,98,114,116,102    // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 99,98,114,116                    // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 60                               // DW_AT_decl_line
@@ -1832,20 +1671,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x6f4:0x1d DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 99
-; CHECK: // .b8 101
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK:  / .b8 99                               // DW_AT_name
-; CHECK: // .b8 101
-; CHECK: // .b8 105
-; CHECK: // .b8 108
+; CHECK: // .b8 95,90,76,52,99,101,105,108,102   // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 99,101,105,108                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 62                               // DW_AT_decl_line
@@ -1855,29 +1683,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x711:0x2b DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 56
-; CHECK: // .b8 99
-; CHECK: // .b8 111
-; CHECK: // .b8 112
-; CHECK: // .b8 121
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 103
-; CHECK: // .b8 110
-; CHECK:  / .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 99                               // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 112
-; CHECK: // .b8 121
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 103
-; CHECK: // .b8 110
+; CHECK: // .b8 95,90,76,56,99,111,112,121,115,105,103,110,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 99,111,112,121,115,105,103,110   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 64                               // DW_AT_decl_line
@@ -1889,18 +1697,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x73c:0x1b DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK:  / .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 51
-; CHECK: // .b8 99
-; CHECK: // .b8 111
-; CHECK: // .b8 115
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,51,99,111,115,102       // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 99                               // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 115
+; CHECK: // .b8 99,111,115                       // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 66                               // DW_AT_decl_line
@@ -1910,20 +1709,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x757:0x1d DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK:  / .b8 99
-; CHECK: // .b8 111
-; CHECK: // .b8 115
-; CHECK: // .b8 104
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 99                               // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 115
-; CHECK: // .b8 104
+; CHECK: // .b8 95,90,76,52,99,111,115,104,102   // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 99,111,115,104                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 68                               // DW_AT_decl_line
@@ -1933,18 +1721,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x774:0x1b DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 51
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK:  / .b8 0
-; CHECK: // .b8 101                              // DW_AT_name
-; CHECK: // .b8 114
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,51,101,114,102,102      // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 101,114,102                      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 72                               // DW_AT_decl_line
@@ -1954,20 +1733,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x78f:0x1d DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 102
-; CHECK: // .b8 99
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK:  / .b8 101                              // DW_AT_name
-; CHECK: // .b8 114
-; CHECK: // .b8 102
-; CHECK: // .b8 99
+; CHECK: // .b8 95,90,76,52,101,114,102,99,102   // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 101,114,102,99                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 70                               // DW_AT_decl_line
@@ -1977,66 +1745,33 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x7ac:0x1b DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 51
-; CHECK: // .b8 101
-; CHECK: // .b8 120
-; CHECK: // .b8 112
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,51,101,120,112,102      // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 101                              // DW_AT_name
-; CHECK: // .b8 120
-; CHECK: // .b8 112
+; CHECK: // .b8 101,120,112                      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 76                               // DW_AT_decl_line
-; CHECK:  / .b32 1554                            // DW_AT_type
+; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0x7c1:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x7c7:0x1d DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 101
-; CHECK: // .b8 120
-; CHECK: // .b8 112
-; CHECK: // .b8 50
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 101                              // DW_AT_name
-; CHECK: // .b8 120
-; CHECK: // .b8 112
-; CHECK: // .b8 50
+; CHECK: // .b8 95,90,76,52,101,120,112,50,102   // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 101,120,112,50                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 74                               // DW_AT_decl_line
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0x7de:0x5 DW_TAG_formal_parameter
-; CHECK:  / .b32 1554                            // DW_AT_type
+; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x7e4:0x1f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 101
-; CHECK: // .b8 120
-; CHECK: // .b8 112
-; CHECK: // .b8 109
-; CHECK: // .b8 49
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 101                              // DW_AT_name
-; CHECK: // .b8 120
-; CHECK: // .b8 112
-; CHECK: // .b8 109
-; CHECK: // .b8 49
+; CHECK: // .b8 95,90,76,53,101,120,112,109,49,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 101,120,112,109,49               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 78                               // DW_AT_decl_line
@@ -2044,22 +1779,11 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0x7fd:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
-; CHECK:  / .b8 0                                // End Of Children Mark
+; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x803:0x1d DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 102
-; CHECK: // .b8 97
-; CHECK: // .b8 98
-; CHECK: // .b8 115
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 102                              // DW_AT_name
-; CHECK: // .b8 97
-; CHECK: // .b8 98
-; CHECK: // .b8 115
+; CHECK: // .b8 95,90,76,52,102,97,98,115,102    // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 102,97,98,115                    // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 80                               // DW_AT_decl_line
@@ -2069,21 +1793,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x820:0x23 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 102
-; CHECK: // .b8 100
-; CHECK: // .b8 105
-; CHECK: // .b8 109
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 102                              // DW_AT_name
-; CHECK: // .b8 100
-; CHECK:  / .b8 105
-; CHECK: // .b8 109
+; CHECK: // .b8 95,90,76,52,102,100,105,109,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 102,100,105,109                  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 82                               // DW_AT_decl_line
@@ -2095,45 +1807,21 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x843:0x1f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 102
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 111
-; CHECK: // .b8 114
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 102                              // DW_AT_name
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 111
-; CHECK: // .b8 114
+; CHECK: // .b8 95,90,76,53,102,108,111,111,114,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 102,108,111,111,114              // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 84                               // DW_AT_decl_line
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
-; CHECK:  / .b8 6                                // Abbrev [6] 0x85c:0x5 DW_TAG_formal_parameter
+; CHECK: // .b8 6                                // Abbrev [6] 0x85c:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x862:0x27 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 51
-; CHECK: // .b8 102
-; CHECK: // .b8 109
-; CHECK: // .b8 97
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 102                              // DW_AT_name
-; CHECK: // .b8 109
-; CHECK: // .b8 97
+; CHECK: // .b8 95,90,76,51,102,109,97,102,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 102,109,97                       // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 86                               // DW_AT_decl_line
@@ -2146,22 +1834,10 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 6                                // Abbrev [6] 0x883:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
-; CHECK:  / .b8 5                                // Abbrev [5] 0x889:0x23 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 102
-; CHECK: // .b8 109
-; CHECK: // .b8 97
-; CHECK: // .b8 120
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 5                                // Abbrev [5] 0x889:0x23 DW_TAG_subprogram
+; CHECK: // .b8 95,90,76,52,102,109,97,120,102,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 102                              // DW_AT_name
-; CHECK: // .b8 109
-; CHECK: // .b8 97
-; CHECK: // .b8 120
+; CHECK: // .b8 102,109,97,120                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 88                               // DW_AT_decl_line
@@ -2173,21 +1849,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x8ac:0x23 DW_TAG_subprogram
-; CHECK:  / .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 102
-; CHECK: // .b8 109
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 102                              // DW_AT_name
-; CHECK: // .b8 109
-; CHECK: // .b8 105
-; CHECK: // .b8 110
+; CHECK: // .b8 95,90,76,52,102,109,105,110,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 102,109,105,110                  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 90                               // DW_AT_decl_line
@@ -2199,21 +1863,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x8cf:0x23 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 102
-; CHECK:  / .b8 109
-; CHECK: // .b8 111
-; CHECK: // .b8 100
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 102                              // DW_AT_name
-; CHECK: // .b8 109
-; CHECK: // .b8 111
-; CHECK: // .b8 100
+; CHECK: // .b8 95,90,76,52,102,109,111,100,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 102,109,111,100                  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 92                               // DW_AT_decl_line
@@ -2225,33 +1877,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x8f2:0x2a DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 49
-; CHECK: // .b8 48
-; CHECK: // .b8 102
-; CHECK: // .b8 112
-; CHECK: // .b8 99
-; CHECK: // .b8 108
-; CHECK: // .b8 97
-; CHECK: // .b8 115
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 102
-; CHECK: // .b8 121
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 102                              // DW_AT_name
-; CHECK: // .b8 112
-; CHECK: // .b8 99
-; CHECK: // .b8 108
-; CHECK:  / .b8 97
-; CHECK: // .b8 115
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 102
-; CHECK: // .b8 121
+; CHECK: // .b8 95,90,76,49,48,102,112,99,108,97,115,115,105,102,121,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 102,112,99,108,97,115,115,105,102,121 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 94                               // DW_AT_decl_line
@@ -2261,31 +1889,14 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 7                                // Abbrev [7] 0x91c:0x7 DW_TAG_base_type
-; CHECK: // .b8 105                              // DW_AT_name
-; CHECK: // .b8 110
-; CHECK: // .b8 116
+; CHECK: // .b8 105,110,116                      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 5                                // DW_AT_encoding
 ; CHECK: // .b8 4                                // DW_AT_byte_size
 ; CHECK: // .b8 5                                // Abbrev [5] 0x923:0x26 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 102
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 120
-; CHECK: // .b8 112
-; CHECK: // .b8 102
-; CHECK: // .b8 80
-; CHECK: // .b8 105
-; CHECK: // .b8 0
-; CHECK: // .b8 102                              // DW_AT_name
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 120
-; CHECK: // .b8 112
+; CHECK: // .b8 95,90,76,53,102,114,101,120,112,102,80,105 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 102,114,101,120,112              // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 96                               // DW_AT_decl_line
@@ -2293,87 +1904,41 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0x93e:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
-; CHECK:  / .b8 6                                // Abbrev [6] 0x943:0x5 DW_TAG_formal_parameter
+; CHECK: // .b8 6                                // Abbrev [6] 0x943:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 2377                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 8                                // Abbrev [8] 0x949:0x5 DW_TAG_pointer_type
 ; CHECK: // .b32 2332                            // DW_AT_type
 ; CHECK: // .b8 5                                // Abbrev [5] 0x94e:0x25 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 104
-; CHECK: // .b8 121
-; CHECK: // .b8 112
-; CHECK: // .b8 111
-; CHECK: // .b8 116
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 104                              // DW_AT_name
-; CHECK: // .b8 121
-; CHECK: // .b8 112
-; CHECK: // .b8 111
-; CHECK: // .b8 116
+; CHECK: // .b8 95,90,76,53,104,121,112,111,116,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 104,121,112,111,116              // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 98                               // DW_AT_decl_line
 ; CHECK: // .b32 1554                            // DW_AT_type
-; CHECK:  / .b8 1                                // DW_AT_declaration
+; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0x968:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 6                                // Abbrev [6] 0x96d:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x973:0x1f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 98
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,105,108,111,103,98,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 105                              // DW_AT_name
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 98
+; CHECK: // .b8 105,108,111,103,98               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 100                              // DW_AT_decl_line
 ; CHECK: // .b32 2332                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
-; CHECK:  / .b8 6                                // Abbrev [6] 0x98c:0x5 DW_TAG_formal_parameter
+; CHECK: // .b8 6                                // Abbrev [6] 0x98c:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x992:0x25 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 56
-; CHECK: // .b8 105
-; CHECK: // .b8 115
-; CHECK: // .b8 102
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 105
-; CHECK: // .b8 116
-; CHECK: // .b8 101
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,56,105,115,102,105,110,105,116,101,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 105                              // DW_AT_name
-; CHECK: // .b8 115
-; CHECK: // .b8 102
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 105
-; CHECK: // .b8 116
-; CHECK: // .b8 101
+; CHECK: // .b8 105,115,102,105,110,105,116,101  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 102                              // DW_AT_decl_line
@@ -2383,39 +1948,14 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 7                                // Abbrev [7] 0x9b7:0x8 DW_TAG_base_type
-; CHECK: // .b8 98                               // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 111
-; CHECK: // .b8 108
+; CHECK: // .b8 98,111,111,108                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_encoding
-; CHECK:  / .b8 1                                // DW_AT_byte_size
+; CHECK: // .b8 1                                // DW_AT_byte_size
 ; CHECK: // .b8 5                                // Abbrev [5] 0x9bf:0x2d DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 57
-; CHECK: // .b8 105
-; CHECK: // .b8 115
-; CHECK: // .b8 103
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,57,105,115,103,114,101,97,116,101,114,102,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 105                              // DW_AT_name
-; CHECK: // .b8 115
-; CHECK: // .b8 103
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 101
-; CHECK: // .b8 114
+; CHECK: // .b8 105,115,103,114,101,97,116,101,114 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 106                              // DW_AT_decl_line
@@ -2427,42 +1967,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x9ec:0x38 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 49
-; CHECK: // .b8 52
-; CHECK: // .b8 105
-; CHECK: // .b8 115
-; CHECK:  / .b8 103
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 113
-; CHECK: // .b8 117
-; CHECK: // .b8 97
-; CHECK: // .b8 108
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,49,52,105,115,103,114,101,97,116,101,114,101,113,117,97,108,102,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 105                              // DW_AT_name
-; CHECK: // .b8 115
-; CHECK: // .b8 103
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 113
-; CHECK: // .b8 117
-; CHECK:  / .b8 97
-; CHECK: // .b8 108
+; CHECK: // .b8 105,115,103,114,101,97,116,101,114,101,113,117,97,108 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 105                              // DW_AT_decl_line
@@ -2474,50 +1981,21 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xa24:0x1f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 105
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,105,115,105,110,102,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 105                              // DW_AT_name
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 102
+; CHECK: // .b8 105,115,105,110,102              // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
-; CHECK:  / .b8 108                              // DW_AT_decl_line
+; CHECK: // .b8 108                              // DW_AT_decl_line
 ; CHECK: // .b32 2487                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0xa3d:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xa43:0x27 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 105
-; CHECK: // .b8 115
-; CHECK: // .b8 108
-; CHECK: // .b8 101
-; CHECK: // .b8 115
-; CHECK: // .b8 115
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,54,105,115,108,101,115,115,102,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 105                              // DW_AT_name
-; CHECK: // .b8 115
-; CHECK: // .b8 108
-; CHECK: // .b8 101
-; CHECK: // .b8 115
-; CHECK: // .b8 115
+; CHECK: // .b8 105,115,108,101,115,115          // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 112                              // DW_AT_decl_line
@@ -2529,36 +2007,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xa6a:0x32 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK:  / .b8 76
-; CHECK: // .b8 49
-; CHECK: // .b8 49
-; CHECK: // .b8 105
-; CHECK: // .b8 115
-; CHECK: // .b8 108
-; CHECK: // .b8 101
-; CHECK: // .b8 115
-; CHECK: // .b8 115
-; CHECK: // .b8 101
-; CHECK: // .b8 113
-; CHECK: // .b8 117
-; CHECK: // .b8 97
-; CHECK: // .b8 108
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,49,49,105,115,108,101,115,115,101,113,117,97,108,102,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 105                              // DW_AT_name
-; CHECK: // .b8 115
-; CHECK: // .b8 108
-; CHECK: // .b8 101
-; CHECK: // .b8 115
-; CHECK: // .b8 115
-; CHECK: // .b8 101
-; CHECK: // .b8 113
-; CHECK:  / .b8 117
-; CHECK: // .b8 97
-; CHECK: // .b8 108
+; CHECK: // .b8 105,115,108,101,115,115,101,113,117,97,108 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 111                              // DW_AT_decl_line
@@ -2570,40 +2021,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xa9c:0x36 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 49
-; CHECK: // .b8 51
-; CHECK: // .b8 105
-; CHECK: // .b8 115
-; CHECK: // .b8 108
-; CHECK: // .b8 101
-; CHECK: // .b8 115
-; CHECK: // .b8 115
-; CHECK: // .b8 103
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,49,51,105,115,108,101,115,115,103,114,101,97,116,101,114,102,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 105                              // DW_AT_name
-; CHECK: // .b8 115
-; CHECK: // .b8 108
-; CHECK:  / .b8 101
-; CHECK: // .b8 115
-; CHECK: // .b8 115
-; CHECK: // .b8 103
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 101
-; CHECK: // .b8 114
+; CHECK: // .b8 105,115,108,101,115,115,103,114,101,97,116,101,114 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 114                              // DW_AT_decl_line
@@ -2615,22 +2035,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xad2:0x1f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 105
-; CHECK: // .b8 115
-; CHECK: // .b8 110
-; CHECK: // .b8 97
-; CHECK:  / .b8 110
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,105,115,110,97,110,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 105                              // DW_AT_name
-; CHECK: // .b8 115
-; CHECK: // .b8 110
-; CHECK: // .b8 97
-; CHECK: // .b8 110
+; CHECK: // .b8 105,115,110,97,110               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 116                              // DW_AT_decl_line
@@ -2640,28 +2047,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xaf1:0x25 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 56
-; CHECK: // .b8 105
-; CHECK: // .b8 115
-; CHECK: // .b8 110
-; CHECK: // .b8 111
-; CHECK:  / .b8 114
-; CHECK: // .b8 109
-; CHECK: // .b8 97
-; CHECK: // .b8 108
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,56,105,115,110,111,114,109,97,108,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 105                              // DW_AT_name
-; CHECK: // .b8 115
-; CHECK: // .b8 110
-; CHECK: // .b8 111
-; CHECK: // .b8 114
-; CHECK: // .b8 109
-; CHECK: // .b8 97
-; CHECK: // .b8 108
+; CHECK: // .b8 105,115,110,111,114,109,97,108   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 118                              // DW_AT_decl_line
@@ -2671,36 +2059,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xb16:0x32 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 49
-; CHECK:  / .b8 49
-; CHECK: // .b8 105
-; CHECK: // .b8 115
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK: // .b8 111
-; CHECK: // .b8 114
-; CHECK: // .b8 100
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 100
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,49,49,105,115,117,110,111,114,100,101,114,101,100,102,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 105                              // DW_AT_name
-; CHECK: // .b8 115
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK: // .b8 111
-; CHECK: // .b8 114
-; CHECK:  / .b8 100
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 100
+; CHECK: // .b8 105,115,117,110,111,114,100,101,114,101,100 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 120                              // DW_AT_decl_line
@@ -2712,20 +2073,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xb48:0x1d DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 108
-; CHECK: // .b8 97
-; CHECK: // .b8 98
-; CHECK: // .b8 115
-; CHECK: // .b8 108
-; CHECK:  / .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 97
-; CHECK: // .b8 98
-; CHECK: // .b8 115
+; CHECK: // .b8 95,90,76,52,108,97,98,115,108    // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,97,98,115                    // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 121                              // DW_AT_decl_line
@@ -2735,64 +2085,28 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 2917                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 7                                // Abbrev [7] 0xb65:0xc DW_TAG_base_type
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 110
-; CHECK: // .b8 103
-; CHECK: // .b8 32
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
+; CHECK: // .b8 108,111,110,103,32,105,110,116   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 5                                // DW_AT_encoding
 ; CHECK: // .b8 8                                // DW_AT_byte_size
-; CHECK:  / .b8 5                                // Abbrev [5] 0xb71:0x25 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 108
-; CHECK: // .b8 100
-; CHECK: // .b8 101
-; CHECK: // .b8 120
-; CHECK: // .b8 112
-; CHECK: // .b8 102
-; CHECK: // .b8 105
-; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 100
-; CHECK: // .b8 101
-; CHECK: // .b8 120
-; CHECK: // .b8 112
+; CHECK: // .b8 5                                // Abbrev [5] 0xb71:0x25 DW_TAG_subprogram
+; CHECK: // .b8 95,90,76,53,108,100,101,120,112,102,105 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,100,101,120,112              // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 123                              // DW_AT_decl_line
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
-; CHECK:  / .b8 6                                // Abbrev [6] 0xb8b:0x5 DW_TAG_formal_parameter
+; CHECK: // .b8 6                                // Abbrev [6] 0xb8b:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 6                                // Abbrev [6] 0xb90:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 2332                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xb96:0x21 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 108
-; CHECK: // .b8 103
-; CHECK: // .b8 97
-; CHECK: // .b8 109
-; CHECK: // .b8 109
-; CHECK: // .b8 97
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 103
-; CHECK: // .b8 97
-; CHECK: // .b8 109
-; CHECK: // .b8 109
-; CHECK:  / .b8 97
+; CHECK: // .b8 95,90,76,54,108,103,97,109,109,97,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,103,97,109,109,97            // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 125                              // DW_AT_decl_line
@@ -2802,22 +2116,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xbb7:0x1f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 108
-; CHECK: // .b8 108
-; CHECK: // .b8 97
-; CHECK: // .b8 98
-; CHECK: // .b8 115
-; CHECK: // .b8 120
-; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 108
-; CHECK: // .b8 97
-; CHECK: // .b8 98
-; CHECK:  / .b8 115
+; CHECK: // .b8 95,90,76,53,108,108,97,98,115,120 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,108,97,98,115                // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 126                              // DW_AT_decl_line
@@ -2827,24 +2128,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1508                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xbd6:0x21 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 108
-; CHECK: // .b8 108
-; CHECK: // .b8 114
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 108
-; CHECK: // .b8 114
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK:  / .b8 116
+; CHECK: // .b8 95,90,76,54,108,108,114,105,110,116,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,108,114,105,110,116          // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 128                              // DW_AT_decl_line
@@ -2854,18 +2140,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xbf7:0x1b DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 51
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,51,108,111,103,102      // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 103
+; CHECK: // .b8 108,111,103                      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 138                              // DW_AT_decl_line
@@ -2873,24 +2150,11 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0xc0c:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
-; CHECK:  / .b8 0                                // End Of Children Mark
+; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xc12:0x1f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 49
-; CHECK: // .b8 48
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 49
-; CHECK: // .b8 48
+; CHECK: // .b8 95,90,76,53,108,111,103,49,48,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,111,103,49,48                // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 130                              // DW_AT_decl_line
@@ -2898,24 +2162,11 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0xc2b:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
-; CHECK:  / .b8 0                                // End Of Children Mark
+; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xc31:0x1f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 49
-; CHECK: // .b8 112
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 49
-; CHECK: // .b8 112
+; CHECK: // .b8 95,90,76,53,108,111,103,49,112,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,111,103,49,112               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 132                              // DW_AT_decl_line
@@ -2925,20 +2176,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xc50:0x1d DW_TAG_subprogram
-; CHECK:  / .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 50
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 50
+; CHECK: // .b8 95,90,76,52,108,111,103,50,102   // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,111,103,50                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 134                              // DW_AT_decl_line
@@ -2948,20 +2188,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xc6d:0x1d DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 98
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 98
+; CHECK: // .b8 95,90,76,52,108,111,103,98,102   // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,111,103,98                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 136                              // DW_AT_decl_line
@@ -2971,22 +2200,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xc8a:0x1f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK:  / .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 108
-; CHECK: // .b8 114
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 114
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
+; CHECK: // .b8 95,90,76,53,108,114,105,110,116,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,114,105,110,116              // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 140                              // DW_AT_decl_line
@@ -2996,24 +2212,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xca9:0x21 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 108
-; CHECK: // .b8 114
-; CHECK: // .b8 111
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK:  / .b8 100
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 114
-; CHECK: // .b8 111
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK: // .b8 100
+; CHECK: // .b8 95,90,76,54,108,114,111,117,110,100,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,114,111,117,110,100          // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 142                              // DW_AT_decl_line
@@ -3023,26 +2224,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xcca:0x23 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 55
-; CHECK: // .b8 108
-; CHECK: // .b8 108
-; CHECK: // .b8 114
-; CHECK: // .b8 111
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK: // .b8 100
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 108
-; CHECK: // .b8 114
-; CHECK:  / .b8 111
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK: // .b8 100
+; CHECK: // .b8 95,90,76,55,108,108,114,111,117,110,100,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,108,114,111,117,110,100      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 143                              // DW_AT_decl_line
@@ -3052,22 +2236,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xced:0x24 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 109
-; CHECK: // .b8 111
-; CHECK: // .b8 100
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 80
-; CHECK: // .b8 102
-; CHECK:  / .b8 0
-; CHECK: // .b8 109                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 100
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,52,109,111,100,102,102,80,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 109,111,100,102                  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 145                              // DW_AT_decl_line
@@ -3081,35 +2252,19 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 8                                // Abbrev [8] 0xd11:0x5 DW_TAG_pointer_type
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 5                                // Abbrev [5] 0xd16:0x1d DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 51
-; CHECK: // .b8 110
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 80
-; CHECK: // .b8 75
-; CHECK: // .b8 99
+; CHECK: // .b8 95,90,76,51,110,97,110,80,75,99  // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 110                              // DW_AT_name
-; CHECK: // .b8 97
-; CHECK: // .b8 110
+; CHECK: // .b8 110,97,110                       // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 146                              // DW_AT_decl_line
 ; CHECK: // .b32 3379                            // DW_AT_type
-; CHECK:  / .b8 1                                // DW_AT_declaration
+; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0xd2d:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 3389                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 7                                // Abbrev [7] 0xd33:0xa DW_TAG_base_type
-; CHECK: // .b8 100                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 117
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 101
+; CHECK: // .b8 100,111,117,98,108,101           // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_encoding
 ; CHECK: // .b8 8                                // DW_AT_byte_size
@@ -3118,30 +2273,14 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 9                                // Abbrev [9] 0xd42:0x5 DW_TAG_const_type
 ; CHECK: // .b32 3399                            // DW_AT_type
 ; CHECK: // .b8 7                                // Abbrev [7] 0xd47:0x8 DW_TAG_base_type
-; CHECK: // .b8 99                               // DW_AT_name
-; CHECK: // .b8 104
-; CHECK: // .b8 97
-; CHECK: // .b8 114
+; CHECK: // .b8 99,104,97,114                    // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 8                                // DW_AT_encoding
 ; CHECK: // .b8 1                                // DW_AT_byte_size
 ; CHECK: // .b8 5                                // Abbrev [5] 0xd4f:0x1f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 110
-; CHECK:  / .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 102
-; CHECK: // .b8 80
-; CHECK: // .b8 75
-; CHECK: // .b8 99
+; CHECK: // .b8 95,90,76,52,110,97,110,102,80,75,99 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 110                              // DW_AT_name
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 102
+; CHECK: // .b8 110,97,110,102                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 147                              // DW_AT_decl_line
@@ -3151,30 +2290,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 3389                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xd6e:0x27 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 57
-; CHECK: // .b8 110
-; CHECK: // .b8 101
-; CHECK: // .b8 97
-; CHECK:  / .b8 114
-; CHECK: // .b8 98
-; CHECK: // .b8 121
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,57,110,101,97,114,98,121,105,110,116,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 110                              // DW_AT_name
-; CHECK: // .b8 101
-; CHECK: // .b8 97
-; CHECK: // .b8 114
-; CHECK: // .b8 98
-; CHECK: // .b8 121
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
+; CHECK: // .b8 110,101,97,114,98,121,105,110,116 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 149                              // DW_AT_decl_line
@@ -3183,56 +2301,24 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 6                                // Abbrev [6] 0xd8f:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
-; CHECK:  / .b8 5                                // Abbrev [5] 0xd95:0x2d DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 57
-; CHECK: // .b8 110
-; CHECK: // .b8 101
-; CHECK: // .b8 120
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 102
-; CHECK: // .b8 116
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 5                                // Abbrev [5] 0xd95:0x2d DW_TAG_subprogram
+; CHECK: // .b8 95,90,76,57,110,101,120,116,97,102,116,101,114,102,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 110                              // DW_AT_name
-; CHECK: // .b8 101
-; CHECK: // .b8 120
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 102
-; CHECK: // .b8 116
-; CHECK: // .b8 101
-; CHECK: // .b8 114
+; CHECK: // .b8 110,101,120,116,97,102,116,101,114 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 151                              // DW_AT_decl_line
 ; CHECK: // .b32 1554                            // DW_AT_type
-; CHECK:  / .b8 1                                // DW_AT_declaration
+; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0xdb7:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 6                                // Abbrev [6] 0xdbc:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xdc2:0x21 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 51
-; CHECK: // .b8 112
-; CHECK: // .b8 111
-; CHECK: // .b8 119
-; CHECK: // .b8 102
-; CHECK: // .b8 105
+; CHECK: // .b8 95,90,76,51,112,111,119,102,105  // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 112                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 119
+; CHECK: // .b8 112,111,119                      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 155                              // DW_AT_decl_line
@@ -3243,64 +2329,24 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 6                                // Abbrev [6] 0xddd:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 2332                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
-; CHECK:  / .b8 5                                // Abbrev [5] 0xde3:0x2d DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 57
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 109
-; CHECK: // .b8 97
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 100
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 114                              // DW_AT_name
-; CHECK: // .b8 101
-; CHECK: // .b8 109
-; CHECK: // .b8 97
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 100
-; CHECK: // .b8 101
-; CHECK: // .b8 114
+; CHECK: // .b8 5                                // Abbrev [5] 0xde3:0x2d DW_TAG_subprogram
+; CHECK: // .b8 95,90,76,57,114,101,109,97,105,110,100,101,114,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 114,101,109,97,105,110,100,101,114 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 157                              // DW_AT_decl_line
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
-; CHECK:  / .b8 6                                // Abbrev [6] 0xe05:0x5 DW_TAG_formal_parameter
+; CHECK: // .b8 6                                // Abbrev [6] 0xe05:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 6                                // Abbrev [6] 0xe0a:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xe10:0x2e DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 109
-; CHECK: // .b8 113
-; CHECK: // .b8 117
-; CHECK: // .b8 111
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 80
-; CHECK: // .b8 105
-; CHECK: // .b8 0
-; CHECK:  / .b8 114                              // DW_AT_name
-; CHECK: // .b8 101
-; CHECK: // .b8 109
-; CHECK: // .b8 113
-; CHECK: // .b8 117
-; CHECK: // .b8 111
+; CHECK: // .b8 95,90,76,54,114,101,109,113,117,111,102,102,80,105 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 114,101,109,113,117,111          // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 159                              // DW_AT_decl_line
@@ -3314,20 +2360,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 2377                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xe3e:0x1d DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK:  / .b8 114
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 114                              // DW_AT_name
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
+; CHECK: // .b8 95,90,76,52,114,105,110,116,102  // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 114,105,110,116                  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 161                              // DW_AT_decl_line
@@ -3337,22 +2372,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xe5b:0x1f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK:  / .b8 114
-; CHECK: // .b8 111
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK: // .b8 100
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 114                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK: // .b8 100
+; CHECK: // .b8 95,90,76,53,114,111,117,110,100,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 114,111,117,110,100              // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 163                              // DW_AT_decl_line
@@ -3362,30 +2384,12 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xe7a:0x29 DW_TAG_subprogram
-; CHECK:  / .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 55
-; CHECK: // .b8 115
-; CHECK: // .b8 99
-; CHECK: // .b8 97
-; CHECK: // .b8 108
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 110
-; CHECK: // .b8 102
-; CHECK: // .b8 108
-; CHECK: // .b8 0
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 99
-; CHECK: // .b8 97
-; CHECK: // .b8 108
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 110
-; CHECK: // .b8 0
-; CHECK: // .b8 1                                // DW_AT_decl_file
-; CHECK:  / .b8 165                              // DW_AT_decl_line
+; CHECK: // .b8 95,90,76,55,115,99,97,108,98,108,110,102,108 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 115,99,97,108,98,108,110         // DW_AT_name
+; CHECK: // .b8 0
+; CHECK: // .b8 1                                // DW_AT_decl_file
+; CHECK: // .b8 165                              // DW_AT_decl_line
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0xe98:0x5 DW_TAG_formal_parameter
@@ -3394,25 +2398,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 2917                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xea3:0x27 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 115
-; CHECK: // .b8 99
-; CHECK: // .b8 97
-; CHECK: // .b8 108
-; CHECK: // .b8 98
-; CHECK: // .b8 110
-; CHECK: // .b8 102
-; CHECK: // .b8 105
-; CHECK: // .b8 0
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 99
-; CHECK: // .b8 97
-; CHECK: // .b8 108
-; CHECK: // .b8 98
-; CHECK:  / .b8 110
+; CHECK: // .b8 95,90,76,54,115,99,97,108,98,110,102,105 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 115,99,97,108,98,110             // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 167                              // DW_AT_decl_line
@@ -3424,48 +2412,22 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 2332                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xeca:0x23 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 55
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 103
-; CHECK: // .b8 110
-; CHECK: // .b8 98
-; CHECK: // .b8 105
-; CHECK: // .b8 116
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK:  / .b8 115                              // DW_AT_name
-; CHECK: // .b8 105
-; CHECK: // .b8 103
-; CHECK: // .b8 110
-; CHECK: // .b8 98
-; CHECK: // .b8 105
-; CHECK: // .b8 116
+; CHECK: // .b8 95,90,76,55,115,105,103,110,98,105,116,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 115,105,103,110,98,105,116       // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 169                              // DW_AT_decl_line
 ; CHECK: // .b32 2487                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0xee7:0x5 DW_TAG_formal_parameter
-; CHECK:  / .b32 1554                            // DW_AT_type
-; CHECK:  / .b8 0                                // End Of Children Mark
+; CHECK: // .b32 1554                            // DW_AT_type
+; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xeed:0x1b DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 51
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 105
-; CHECK:  / .b8 110
-; CHECK:  / .b8 0
+; CHECK: // .b8 95,90,76,51,115,105,110,102      // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 115,105,110                      // DW_AT_name
+; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 171                              // DW_AT_decl_line
 ; CHECK: // .b32 1554                            // DW_AT_type
@@ -3474,43 +2436,21 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xf08:0x1d DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK:  / .b8 115
-; CHECK:  / .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 104
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 104
+; CHECK: // .b8 95,90,76,52,115,105,110,104,102  // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 115,105,110,104                  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 173                              // DW_AT_decl_line
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
-; CHECK:  / .b8 6                                // Abbrev [6] 0xf1f:0x5 DW_TAG_formal_parameter
-; CHECK:  / .b32 1554                            // DW_AT_type
+; CHECK: // .b8 6                                // Abbrev [6] 0xf1f:0x5 DW_TAG_formal_parameter
+; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xf25:0x1d DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 115
-; CHECK: // .b8 113
-; CHECK: // .b8 114
-; CHECK: // .b8 116
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 113
-; CHECK:  / .b8 114
-; CHECK:  / .b8 116
+; CHECK: // .b8 95,90,76,52,115,113,114,116,102  // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 115,113,114,116                  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 175                              // DW_AT_decl_line
@@ -3520,41 +2460,21 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xf42:0x1b DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK:  / .b8 76
-; CHECK:  / .b8 51
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,51,116,97,110,102       // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 116                              // DW_AT_name
-; CHECK: // .b8 97
-; CHECK: // .b8 110
+; CHECK: // .b8 116,97,110                       // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 177                              // DW_AT_decl_line
 ; CHECK: // .b32 1554                            // DW_AT_type
-; CHECK:  / .b8 1                                // DW_AT_declaration
-; CHECK:  / .b8 6                                // Abbrev [6] 0xf57:0x5 DW_TAG_formal_parameter
+; CHECK: // .b8 1                                // DW_AT_declaration
+; CHECK: // .b8 6                                // Abbrev [6] 0xf57:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xf5d:0x1d DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 104
-; CHECK:  / .b8 102
-; CHECK:  / .b8 0
-; CHECK: // .b8 116                              // DW_AT_name
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 104
+; CHECK: // .b8 95,90,76,52,116,97,110,104,102   // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 116,97,110,104                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 179                              // DW_AT_decl_line
@@ -3563,25 +2483,10 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 6                                // Abbrev [6] 0xf74:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
-; CHECK:  / .b8 5                                // Abbrev [5] 0xf7a:0x21 DW_TAG_subprogram
-; CHECK:  / .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 116
-; CHECK: // .b8 103
-; CHECK: // .b8 97
-; CHECK: // .b8 109
-; CHECK: // .b8 109
-; CHECK: // .b8 97
-; CHECK: // .b8 102
+; CHECK: // .b8 5                                // Abbrev [5] 0xf7a:0x21 DW_TAG_subprogram
+; CHECK: // .b8 95,90,76,54,116,103,97,109,109,97,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 116                              // DW_AT_name
-; CHECK: // .b8 103
-; CHECK:  / .b8 97
-; CHECK:  / .b8 109
-; CHECK: // .b8 109
-; CHECK: // .b8 97
+; CHECK: // .b8 116,103,97,109,109,97            // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 181                              // DW_AT_decl_line
@@ -3591,49 +2496,30 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xf9b:0x1f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK:  / .b8 90
-; CHECK:  / .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 116
-; CHECK: // .b8 114
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK: // .b8 99
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 116                              // DW_AT_name
-; CHECK: // .b8 114
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK: // .b8 99
-; CHECK: // .b8 0
-; CHECK: // .b8 1                                // DW_AT_decl_file
-; CHECK:  / .b8 183                              // DW_AT_decl_line
-; CHECK:  / .b32 1554                            // DW_AT_type
+; CHECK: // .b8 95,90,76,53,116,114,117,110,99,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 116,114,117,110,99               // DW_AT_name
+; CHECK: // .b8 0
+; CHECK: // .b8 1                                // DW_AT_decl_file
+; CHECK: // .b8 183                              // DW_AT_decl_line
+; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0xfb4:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 10                               // Abbrev [10] 0xfba:0x14 DW_TAG_subprogram
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 99
-; CHECK: // .b8 111
-; CHECK: // .b8 115
+; CHECK: // .b8 97,99,111,115                    // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 54                               // DW_AT_decl_line
 ; CHECK: // .b32 3379                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 1                                // DW_AT_external
-; CHECK:  / .b8 6                                // Abbrev [6] 0xfc8:0x5 DW_TAG_formal_parameter
-; CHECK:  / .b32 3379                            // DW_AT_type
+; CHECK: // .b8 6                                // Abbrev [6] 0xfc8:0x5 DW_TAG_formal_parameter
+; CHECK: // .b32 3379                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 10                               // Abbrev [10] 0xfce:0x14 DW_TAG_subprogram
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 110
+; CHECK: // .b8 97,115,105,110                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 56                               // DW_AT_decl_line
@@ -3641,13 +2527,10 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 6                                // Abbrev [6] 0xfdc:0x5 DW_TAG_formal_parameter
-; CHECK:  / .b32 3379                            // DW_AT_type
-; CHECK:  / .b8 0                                // End Of Children Mark
+; CHECK: // .b32 3379                            // DW_AT_type
+; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 10                               // Abbrev [10] 0xfe2:0x14 DW_TAG_subprogram
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 110
+; CHECK: // .b8 97,116,97,110                    // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 58                               // DW_AT_decl_line
@@ -3656,13 +2539,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 6                                // Abbrev [6] 0xff0:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 3379                            // DW_AT_type
-; CHECK:  / .b8 0                                // End Of Children Mark
-; CHECK:  / .b8 10                               // Abbrev [10] 0xff6:0x1a DW_TAG_subprogram
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 50
+; CHECK: // .b8 0                                // End Of Children Mark
+; CHECK: // .b8 10                               // Abbrev [10] 0xff6:0x1a DW_TAG_subprogram
+; CHECK: // .b8 97,116,97,110,50                 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 60                               // DW_AT_decl_line
@@ -3670,15 +2549,12 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1005:0x5 DW_TAG_formal_parameter
-; CHECK:  / .b32 3379                            // DW_AT_type
-; CHECK:  / .b8 6                                // Abbrev [6] 0x100a:0x5 DW_TAG_formal_parameter
+; CHECK: // .b32 3379                            // DW_AT_type
+; CHECK: // .b8 6                                // Abbrev [6] 0x100a:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 3379                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 10                               // Abbrev [10] 0x1010:0x14 DW_TAG_subprogram
-; CHECK: // .b8 99                               // DW_AT_name
-; CHECK: // .b8 101
-; CHECK: // .b8 105
-; CHECK: // .b8 108
+; CHECK: // .b8 99,101,105,108                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 178                              // DW_AT_decl_line
@@ -3687,11 +2563,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 6                                // Abbrev [6] 0x101e:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 3379                            // DW_AT_type
-; CHECK:  / .b8 0                                // End Of Children Mark
-; CHECK:  / .b8 10                               // Abbrev [10] 0x1024:0x13 DW_TAG_subprogram
-; CHECK: // .b8 99                               // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 115
+; CHECK: // .b8 0                                // End Of Children Mark
+; CHECK: // .b8 10                               // Abbrev [10] 0x1024:0x13 DW_TAG_subprogram
+; CHECK: // .b8 99,111,115                       // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 63                               // DW_AT_decl_line
@@ -3702,10 +2576,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 3379                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 10                               // Abbrev [10] 0x1037:0x14 DW_TAG_subprogram
-; CHECK:  / .b8 99                               // DW_AT_name
-; CHECK:  / .b8 111
-; CHECK: // .b8 115
-; CHECK: // .b8 104
+; CHECK: // .b8 99,111,115,104                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 72                               // DW_AT_decl_line
@@ -3716,9 +2587,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 3379                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 10                               // Abbrev [10] 0x104b:0x13 DW_TAG_subprogram
-; CHECK:  / .b8 101                              // DW_AT_name
-; CHECK:  / .b8 120
-; CHECK: // .b8 112
+; CHECK: // .b8 101,120,112                      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 100                              // DW_AT_decl_line
@@ -3729,10 +2598,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 3379                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 10                               // Abbrev [10] 0x105e:0x14 DW_TAG_subprogram
-; CHECK: // .b8 102                              // DW_AT_name
-; CHECK:  / .b8 97
-; CHECK:  / .b8 98
-; CHECK: // .b8 115
+; CHECK: // .b8 102,97,98,115                    // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 181                              // DW_AT_decl_line
@@ -3743,11 +2609,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 3379                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 10                               // Abbrev [10] 0x1072:0x15 DW_TAG_subprogram
-; CHECK:  / .b8 102                              // DW_AT_name
-; CHECK:  / .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 111
-; CHECK: // .b8 114
+; CHECK: // .b8 102,108,111,111,114              // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 184                              // DW_AT_decl_line
@@ -3757,11 +2619,8 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1081:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 3379                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
-; CHECK:  / .b8 10                               // Abbrev [10] 0x1087:0x19 DW_TAG_subprogram
-; CHECK:  / .b8 102                              // DW_AT_name
-; CHECK: // .b8 109
-; CHECK: // .b8 111
-; CHECK: // .b8 100
+; CHECK: // .b8 10                               // Abbrev [10] 0x1087:0x19 DW_TAG_subprogram
+; CHECK: // .b8 102,109,111,100                  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 187                              // DW_AT_decl_line
@@ -3773,12 +2632,8 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 6                                // Abbrev [6] 0x109a:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 3379                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
-; CHECK:  / .b8 10                               // Abbrev [10] 0x10a0:0x1a DW_TAG_subprogram
-; CHECK: // .b8 102                              // DW_AT_name
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 120
-; CHECK: // .b8 112
+; CHECK: // .b8 10                               // Abbrev [10] 0x10a0:0x1a DW_TAG_subprogram
+; CHECK: // .b8 102,114,101,120,112              // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 103                              // DW_AT_decl_line
@@ -3791,11 +2646,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 2377                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 10                               // Abbrev [10] 0x10ba:0x1a DW_TAG_subprogram
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 100
-; CHECK: // .b8 101
-; CHECK: // .b8 120
-; CHECK: // .b8 112
+; CHECK: // .b8 108,100,101,120,112              // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 106                              // DW_AT_decl_line
@@ -3808,9 +2659,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 2332                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 10                               // Abbrev [10] 0x10d4:0x13 DW_TAG_subprogram
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 103
+; CHECK: // .b8 108,111,103                      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 109                              // DW_AT_decl_line
@@ -3819,27 +2668,20 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 6                                // Abbrev [6] 0x10e1:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 3379                            // DW_AT_type
-; CHECK:  / .b8 0                                // End Of Children Mark
-; CHECK:  / .b8 10                               // Abbrev [10] 0x10e7:0x15 DW_TAG_subprogram
-; CHECK:  / .b8 108                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 49
-; CHECK: // .b8 48
+; CHECK: // .b8 0                                // End Of Children Mark
+; CHECK: // .b8 10                               // Abbrev [10] 0x10e7:0x15 DW_TAG_subprogram
+; CHECK: // .b8 108,111,103,49,48                // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 112                              // DW_AT_decl_line
 ; CHECK: // .b32 3379                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
-; CHECK:  / .b8 1                                // DW_AT_external
-; CHECK:  / .b8 6                                // Abbrev [6] 0x10f6:0x5 DW_TAG_formal_parameter
+; CHECK: // .b8 1                                // DW_AT_external
+; CHECK: // .b8 6                                // Abbrev [6] 0x10f6:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 3379                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 10                               // Abbrev [10] 0x10fc:0x19 DW_TAG_subprogram
-; CHECK: // .b8 109                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 100
-; CHECK: // .b8 102
+; CHECK: // .b8 109,111,100,102                  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 115                              // DW_AT_decl_line
@@ -3848,15 +2690,13 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 6                                // Abbrev [6] 0x110a:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 3379                            // DW_AT_type
-; CHECK:  / .b8 6                                // Abbrev [6] 0x110f:0x5 DW_TAG_formal_parameter
-; CHECK:  / .b32 4373                            // DW_AT_type
+; CHECK: // .b8 6                                // Abbrev [6] 0x110f:0x5 DW_TAG_formal_parameter
+; CHECK: // .b32 4373                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 8                                // Abbrev [8] 0x1115:0x5 DW_TAG_pointer_type
 ; CHECK: // .b32 3379                            // DW_AT_type
 ; CHECK: // .b8 10                               // Abbrev [10] 0x111a:0x18 DW_TAG_subprogram
-; CHECK: // .b8 112                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 119
+; CHECK: // .b8 112,111,119                      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 153                              // DW_AT_decl_line
@@ -3865,13 +2705,11 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1127:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 3379                            // DW_AT_type
-; CHECK:  / .b8 6                                // Abbrev [6] 0x112c:0x5 DW_TAG_formal_parameter
-; CHECK:  / .b32 3379                            // DW_AT_type
+; CHECK: // .b8 6                                // Abbrev [6] 0x112c:0x5 DW_TAG_formal_parameter
+; CHECK: // .b32 3379                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 10                               // Abbrev [10] 0x1132:0x13 DW_TAG_subprogram
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 105
-; CHECK: // .b8 110
+; CHECK: // .b8 115,105,110                      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 65                               // DW_AT_decl_line
@@ -3880,12 +2718,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 6                                // Abbrev [6] 0x113f:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 3379                            // DW_AT_type
-; CHECK:  / .b8 0                                // End Of Children Mark
-; CHECK:  / .b8 10                               // Abbrev [10] 0x1145:0x14 DW_TAG_subprogram
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 104
+; CHECK: // .b8 0                                // End Of Children Mark
+; CHECK: // .b8 10                               // Abbrev [10] 0x1145:0x14 DW_TAG_subprogram
+; CHECK: // .b8 115,105,110,104                  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 74                               // DW_AT_decl_line
@@ -3895,11 +2730,8 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1153:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 3379                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
-; CHECK:  / .b8 10                               // Abbrev [10] 0x1159:0x14 DW_TAG_subprogram
-; CHECK:  / .b8 115                              // DW_AT_name
-; CHECK: // .b8 113
-; CHECK: // .b8 114
-; CHECK: // .b8 116
+; CHECK: // .b8 10                               // Abbrev [10] 0x1159:0x14 DW_TAG_subprogram
+; CHECK: // .b8 115,113,114,116                  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 156                              // DW_AT_decl_line
@@ -3910,10 +2742,8 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 3379                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 10                               // Abbrev [10] 0x116d:0x13 DW_TAG_subprogram
-; CHECK: // .b8 116                              // DW_AT_name
-; CHECK: // .b8 97
-; CHECK:  / .b8 110
-; CHECK:  / .b8 0
+; CHECK: // .b8 116,97,110                       // DW_AT_name
+; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 67                               // DW_AT_decl_line
 ; CHECK: // .b32 3379                            // DW_AT_type
@@ -3923,10 +2753,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 3379                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 10                               // Abbrev [10] 0x1180:0x14 DW_TAG_subprogram
-; CHECK: // .b8 116                              // DW_AT_name
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 104
+; CHECK: // .b8 116,97,110,104                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 76                               // DW_AT_decl_line
@@ -3938,11 +2765,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 11                               // Abbrev [11] 0x1194:0xd DW_TAG_typedef
 ; CHECK: // .b32 4513                            // DW_AT_type
-; CHECK: // .b8 100                              // DW_AT_name
-; CHECK: // .b8 105
-; CHECK: // .b8 118
-; CHECK: // .b8 95
-; CHECK: // .b8 116
+; CHECK: // .b8 100,105,118,95,116               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 101                              // DW_AT_decl_line
@@ -3950,12 +2773,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 11                               // Abbrev [11] 0x11a3:0xe DW_TAG_typedef
 ; CHECK: // .b32 4529                            // DW_AT_type
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 100
-; CHECK: // .b8 105
-; CHECK: // .b8 118
-; CHECK: // .b8 95
-; CHECK: // .b8 116
+; CHECK: // .b8 108,100,105,118,95,116           // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 109                              // DW_AT_decl_line
@@ -3964,10 +2782,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 105                              // DW_AT_decl_line
 ; CHECK: // .b8 14                               // Abbrev [14] 0x11b5:0xf DW_TAG_member
-; CHECK: // .b8 113                              // DW_AT_name
-; CHECK: // .b8 117
-; CHECK: // .b8 111
-; CHECK: // .b8 116
+; CHECK: // .b8 113,117,111,116                  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b32 2917                            // DW_AT_type
 ; CHECK: // .b8 4                                // DW_AT_decl_file
@@ -3976,9 +2791,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 35
 ; CHECK: // .b8 0
 ; CHECK: // .b8 14                               // Abbrev [14] 0x11c4:0xe DW_TAG_member
-; CHECK: // .b8 114                              // DW_AT_name
-; CHECK: // .b8 101
-; CHECK: // .b8 109
+; CHECK: // .b8 114,101,109                      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b32 2917                            // DW_AT_type
 ; CHECK: // .b8 4                                // DW_AT_decl_file
@@ -3988,11 +2801,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 8
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 15                               // Abbrev [15] 0x11d3:0xd DW_TAG_subprogram
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 98
-; CHECK: // .b8 111
-; CHECK: // .b8 114
-; CHECK: // .b8 116
+; CHECK: // .b8 97,98,111,114,116                // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 3                                // DW_AT_decl_line
@@ -4001,9 +2810,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 1                                // DW_AT_noreturn
 ; CHECK: // .b8 16                               // Abbrev [16] 0x11e0:0x14 DW_TAG_subprogram
-; CHECK:  / .b8 97                               // DW_AT_name
-; CHECK:  / .b8 98
-; CHECK: // .b8 115
+; CHECK: // .b8 97,98,115                        // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 7                                // DW_AT_decl_line
@@ -4015,12 +2822,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 2332                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 16                               // Abbrev [16] 0x11f4:0x17 DW_TAG_subprogram
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 116
-; CHECK:  / .b8 101
-; CHECK:  / .b8 120
-; CHECK: // .b8 105
-; CHECK: // .b8 116
+; CHECK: // .b8 97,116,101,120,105,116           // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 7                                // DW_AT_decl_line
@@ -4033,12 +2835,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 8                                // Abbrev [8] 0x120b:0x5 DW_TAG_pointer_type
 ; CHECK: // .b32 4624                            // DW_AT_type
-; CHECK:  / .b8 17                               // Abbrev [17] 0x1210:0x1 DW_TAG_subroutine_type
-; CHECK:  / .b8 10                               // Abbrev [10] 0x1211:0x14 DW_TAG_subprogram
-; CHECK:  / .b8 97                               // DW_AT_name
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 102
+; CHECK: // .b8 17                               // Abbrev [17] 0x1210:0x1 DW_TAG_subroutine_type
+; CHECK: // .b8 10                               // Abbrev [10] 0x1211:0x14 DW_TAG_subprogram
+; CHECK: // .b8 97,116,111,102                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 6                                // DW_AT_decl_file
 ; CHECK: // .b8 26                               // DW_AT_decl_line
@@ -4048,11 +2847,8 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 6                                // Abbrev [6] 0x121f:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 3389                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
-; CHECK:  / .b8 16                               // Abbrev [16] 0x1225:0x15 DW_TAG_subprogram
-; CHECK:  / .b8 97                               // DW_AT_name
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 105
+; CHECK: // .b8 16                               // Abbrev [16] 0x1225:0x15 DW_TAG_subprogram
+; CHECK: // .b8 97,116,111,105                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 22                               // DW_AT_decl_line
@@ -4064,10 +2860,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 3389                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 16                               // Abbrev [16] 0x123a:0x15 DW_TAG_subprogram
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK:  / .b8 116
-; CHECK:  / .b8 111
-; CHECK: // .b8 108
+; CHECK: // .b8 97,116,111,108                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 27                               // DW_AT_decl_line
@@ -4079,13 +2872,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 3389                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 10                               // Abbrev [10] 0x124f:0x2b DW_TAG_subprogram
-; CHECK: // .b8 98                               // DW_AT_name
-; CHECK: // .b8 115
-; CHECK: // .b8 101
-; CHECK:  / .b8 97
-; CHECK:  / .b8 114
-; CHECK: // .b8 99
-; CHECK: // .b8 104
+; CHECK: // .b8 98,115,101,97,114,99,104         // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 7                                // DW_AT_decl_file
 ; CHECK: // .b8 20                               // DW_AT_decl_line
@@ -4097,8 +2884,8 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1265:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 4731                            // DW_AT_type
 ; CHECK: // .b8 6                                // Abbrev [6] 0x126a:0x5 DW_TAG_formal_parameter
-; CHECK:  / .b32 4737                            // DW_AT_type
-; CHECK:  / .b8 6                                // Abbrev [6] 0x126f:0x5 DW_TAG_formal_parameter
+; CHECK: // .b32 4737                            // DW_AT_type
+; CHECK: // .b8 6                                // Abbrev [6] 0x126f:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 4737                            // DW_AT_type
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1274:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 4772                            // DW_AT_type
@@ -4109,52 +2896,19 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 19                               // Abbrev [19] 0x1280:0x1 DW_TAG_const_type
 ; CHECK: // .b8 11                               // Abbrev [11] 0x1281:0xe DW_TAG_typedef
 ; CHECK: // .b32 4751                            // DW_AT_type
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 105
-; CHECK: // .b8 122
-; CHECK: // .b8 101
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 0
-; CHECK:  / .b8 8                                // DW_AT_decl_file
-; CHECK:  / .b8 62                               // DW_AT_decl_line
+; CHECK: // .b8 115,105,122,101,95,116           // DW_AT_name
+; CHECK: // .b8 0
+; CHECK: // .b8 8                                // DW_AT_decl_file
+; CHECK: // .b8 62                               // DW_AT_decl_line
 ; CHECK: // .b8 7                                // Abbrev [7] 0x128f:0x15 DW_TAG_base_type
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 110
-; CHECK: // .b8 103
-; CHECK: // .b8 32
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 103
-; CHECK: // .b8 110
-; CHECK: // .b8 101
-; CHECK: // .b8 100
-; CHECK:  / .b8 32
-; CHECK:  / .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
+; CHECK: // .b8 108,111,110,103,32,117,110,115,105,103,110,101,100,32,105,110,116 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 7                                // DW_AT_encoding
 ; CHECK: // .b8 8                                // DW_AT_byte_size
 ; CHECK: // .b8 20                               // Abbrev [20] 0x12a4:0x16 DW_TAG_typedef
 ; CHECK: // .b32 4794                            // DW_AT_type
-; CHECK: // .b8 95                               // DW_AT_name
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 111
-; CHECK: // .b8 109
-; CHECK: // .b8 112
-; CHECK: // .b8 97
-; CHECK: // .b8 114
-; CHECK: // .b8 95
-; CHECK: // .b8 102
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK:  / .b8 116
-; CHECK:  / .b8 0
+; CHECK: // .b8 95,95,99,111,109,112,97,114,95,102,110,95,116 // DW_AT_name
+; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 230                              // DW_AT_decl_line
 ; CHECK: // .b8 2
@@ -4168,12 +2922,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 4731                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 16                               // Abbrev [16] 0x12cf:0x1c DW_TAG_subprogram
-; CHECK: // .b8 99                               // DW_AT_name
-; CHECK: // .b8 97
-; CHECK: // .b8 108
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
+; CHECK: // .b8 99,97,108,108,111,99             // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 212                              // DW_AT_decl_line
@@ -4187,12 +2936,10 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 4737                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 16                               // Abbrev [16] 0x12eb:0x19 DW_TAG_subprogram
-; CHECK: // .b8 100                              // DW_AT_name
-; CHECK: // .b8 105
-; CHECK: // .b8 118
+; CHECK: // .b8 100,105,118                      // DW_AT_name
 ; CHECK: // .b8 0
-; CHECK:  / .b8 4                                // DW_AT_decl_file
-; CHECK:  / .b8 21                               // DW_AT_decl_line
+; CHECK: // .b8 4                                // DW_AT_decl_file
+; CHECK: // .b8 21                               // DW_AT_decl_line
 ; CHECK: // .b8 3
 ; CHECK: // .b32 4500                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
@@ -4203,25 +2950,19 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 2332                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 22                               // Abbrev [22] 0x1304:0x12 DW_TAG_subprogram
-; CHECK: // .b8 101                              // DW_AT_name
-; CHECK: // .b8 120
-; CHECK: // .b8 105
-; CHECK: // .b8 116
+; CHECK: // .b8 101,120,105,116                  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 31                               // DW_AT_decl_line
-; CHECK:  / .b8 2
-; CHECK:  / .b8 1                                // DW_AT_declaration
+; CHECK: // .b8 2
+; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 1                                // DW_AT_noreturn
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1310:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 2332                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 23                               // Abbrev [23] 0x1316:0x11 DW_TAG_subprogram
-; CHECK: // .b8 102                              // DW_AT_name
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 101
+; CHECK: // .b8 102,114,101,101                  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 227                              // DW_AT_decl_line
@@ -4231,13 +2972,8 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1321:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 4730                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
-; CHECK:  / .b8 16                               // Abbrev [16] 0x1327:0x17 DW_TAG_subprogram
-; CHECK:  / .b8 103                              // DW_AT_name
-; CHECK: // .b8 101
-; CHECK: // .b8 116
-; CHECK: // .b8 101
-; CHECK: // .b8 110
-; CHECK: // .b8 118
+; CHECK: // .b8 16                               // Abbrev [16] 0x1327:0x17 DW_TAG_subprogram
+; CHECK: // .b8 103,101,116,101,110,118          // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 52                               // DW_AT_decl_line
@@ -4246,15 +2982,12 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1338:0x5 DW_TAG_formal_parameter
-; CHECK:  / .b32 3389                            // DW_AT_type
-; CHECK:  / .b8 0                                // End Of Children Mark
+; CHECK: // .b32 3389                            // DW_AT_type
+; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 8                                // Abbrev [8] 0x133e:0x5 DW_TAG_pointer_type
 ; CHECK: // .b32 3399                            // DW_AT_type
 ; CHECK: // .b8 16                               // Abbrev [16] 0x1343:0x15 DW_TAG_subprogram
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 97
-; CHECK: // .b8 98
-; CHECK: // .b8 115
+; CHECK: // .b8 108,97,98,115                    // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 8                                // DW_AT_decl_line
@@ -4263,13 +2996,10 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1352:0x5 DW_TAG_formal_parameter
-; CHECK:  / .b32 2917                            // DW_AT_type
-; CHECK:  / .b8 0                                // End Of Children Mark
+; CHECK: // .b32 2917                            // DW_AT_type
+; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 16                               // Abbrev [16] 0x1358:0x1a DW_TAG_subprogram
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 100
-; CHECK: // .b8 105
-; CHECK: // .b8 118
+; CHECK: // .b8 108,100,105,118                  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 23                               // DW_AT_decl_line
@@ -4283,28 +3013,19 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 2917                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 16                               // Abbrev [16] 0x1372:0x17 DW_TAG_subprogram
-; CHECK: // .b8 109                              // DW_AT_name
-; CHECK: // .b8 97
-; CHECK: // .b8 108
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
+; CHECK: // .b8 109,97,108,108,111,99            // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 210                              // DW_AT_decl_line
 ; CHECK: // .b8 1
 ; CHECK: // .b32 4730                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
-; CHECK:  / .b8 1                                // DW_AT_external
-; CHECK:  / .b8 6                                // Abbrev [6] 0x1383:0x5 DW_TAG_formal_parameter
+; CHECK: // .b8 1                                // DW_AT_external
+; CHECK: // .b8 6                                // Abbrev [6] 0x1383:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 4737                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 16                               // Abbrev [16] 0x1389:0x1b DW_TAG_subprogram
-; CHECK: // .b8 109                              // DW_AT_name
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 101
-; CHECK: // .b8 110
+; CHECK: // .b8 109,98,108,101,110               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 95                               // DW_AT_decl_line
@@ -4317,15 +3038,8 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 6                                // Abbrev [6] 0x139e:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 4737                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
-; CHECK:  / .b8 16                               // Abbrev [16] 0x13a4:0x23 DW_TAG_subprogram
-; CHECK:  / .b8 109                              // DW_AT_name
-; CHECK: // .b8 98
-; CHECK: // .b8 115
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 119
-; CHECK: // .b8 99
-; CHECK: // .b8 115
+; CHECK: // .b8 16                               // Abbrev [16] 0x13a4:0x23 DW_TAG_subprogram
+; CHECK: // .b8 109,98,115,116,111,119,99,115    // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 106                              // DW_AT_decl_line
@@ -4338,29 +3052,18 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 6                                // Abbrev [6] 0x13bc:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 3389                            // DW_AT_type
 ; CHECK: // .b8 6                                // Abbrev [6] 0x13c1:0x5 DW_TAG_formal_parameter
-; CHECK:  / .b32 4737                            // DW_AT_type
-; CHECK:  / .b8 0                                // End Of Children Mark
+; CHECK: // .b32 4737                            // DW_AT_type
+; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 8                                // Abbrev [8] 0x13c7:0x5 DW_TAG_pointer_type
 ; CHECK: // .b32 5068                            // DW_AT_type
 ; CHECK: // .b8 7                                // Abbrev [7] 0x13cc:0xb DW_TAG_base_type
-; CHECK: // .b8 119                              // DW_AT_name
-; CHECK: // .b8 99
-; CHECK: // .b8 104
-; CHECK: // .b8 97
-; CHECK: // .b8 114
-; CHECK: // .b8 95
-; CHECK: // .b8 116
+; CHECK: // .b8 119,99,104,97,114,95,116         // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 5                                // DW_AT_encoding
 ; CHECK: // .b8 4                                // DW_AT_byte_size
 ; CHECK: // .b8 16                               // Abbrev [16] 0x13d7:0x21 DW_TAG_subprogram
-; CHECK: // .b8 109                              // DW_AT_name
-; CHECK: // .b8 98
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 119
-; CHECK:  / .b8 99
-; CHECK:  / .b8 0
+; CHECK: // .b8 109,98,116,111,119,99            // DW_AT_name
+; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 98                               // DW_AT_decl_line
 ; CHECK: // .b8 3
@@ -4375,11 +3078,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 4737                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 23                               // Abbrev [23] 0x13f8:0x21 DW_TAG_subprogram
-; CHECK: // .b8 113                              // DW_AT_name
-; CHECK: // .b8 115
-; CHECK: // .b8 111
-; CHECK:  / .b8 114
-; CHECK:  / .b8 116
+; CHECK: // .b8 113,115,111,114,116              // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 253                              // DW_AT_decl_line
@@ -4396,10 +3095,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 4772                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 24                               // Abbrev [24] 0x1419:0xf DW_TAG_subprogram
-; CHECK: // .b8 114                              // DW_AT_name
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 100
+; CHECK: // .b8 114,97,110,100                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 118                              // DW_AT_decl_line
@@ -4407,14 +3103,8 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 2332                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 1                                // DW_AT_external
-; CHECK:  / .b8 16                               // Abbrev [16] 0x1428:0x1d DW_TAG_subprogram
-; CHECK:  / .b8 114                              // DW_AT_name
-; CHECK: // .b8 101
-; CHECK: // .b8 97
-; CHECK: // .b8 108
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
+; CHECK: // .b8 16                               // Abbrev [16] 0x1428:0x1d DW_TAG_subprogram
+; CHECK: // .b8 114,101,97,108,108,111,99        // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 224                              // DW_AT_decl_line
@@ -4428,11 +3118,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 4737                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 23                               // Abbrev [23] 0x1445:0x12 DW_TAG_subprogram
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 114
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 100
+; CHECK: // .b8 115,114,97,110,100               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 120                              // DW_AT_decl_line
@@ -4443,28 +3129,12 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 5207                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 7                                // Abbrev [7] 0x1457:0x10 DW_TAG_base_type
-; CHECK: // .b8 117                              // DW_AT_name
-; CHECK: // .b8 110
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 103
-; CHECK: // .b8 110
-; CHECK: // .b8 101
-; CHECK: // .b8 100
-; CHECK: // .b8 32
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
+; CHECK: // .b8 117,110,115,105,103,110,101,100,32,105,110,116 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 7                                // DW_AT_encoding
 ; CHECK: // .b8 4                                // DW_AT_byte_size
 ; CHECK: // .b8 10                               // Abbrev [10] 0x1467:0x1b DW_TAG_subprogram
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 116
-; CHECK: // .b8 114
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 100
+; CHECK: // .b8 115,116,114,116,111,100          // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 164                              // DW_AT_decl_line
@@ -4479,19 +3149,14 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 8                                // Abbrev [8] 0x1482:0x5 DW_TAG_pointer_type
 ; CHECK: // .b32 4926                            // DW_AT_type
 ; CHECK: // .b8 10                               // Abbrev [10] 0x1487:0x20 DW_TAG_subprogram
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 116
-; CHECK: // .b8 114
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 108
+; CHECK: // .b8 115,116,114,116,111,108          // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 183                              // DW_AT_decl_line
 ; CHECK: // .b32 2917                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
-; CHECK:  / .b8 1                                // DW_AT_external
-; CHECK:  / .b8 6                                // Abbrev [6] 0x1497:0x5 DW_TAG_formal_parameter
+; CHECK: // .b8 1                                // DW_AT_external
+; CHECK: // .b8 6                                // Abbrev [6] 0x1497:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 3389                            // DW_AT_type
 ; CHECK: // .b8 6                                // Abbrev [6] 0x149c:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 5250                            // DW_AT_type
@@ -4499,13 +3164,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 2332                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 10                               // Abbrev [10] 0x14a7:0x21 DW_TAG_subprogram
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 116
-; CHECK: // .b8 114
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 117
-; CHECK: // .b8 108
+; CHECK: // .b8 115,116,114,116,111,117,108      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 187                              // DW_AT_decl_line
@@ -4520,12 +3179,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 2332                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 16                               // Abbrev [16] 0x14c8:0x17 DW_TAG_subprogram
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 121
-; CHECK: // .b8 115
-; CHECK: // .b8 116
-; CHECK: // .b8 101
-; CHECK: // .b8 109
+; CHECK: // .b8 115,121,115,116,101,109          // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 205                              // DW_AT_decl_line
@@ -4537,16 +3191,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 3389                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 16                               // Abbrev [16] 0x14df:0x23 DW_TAG_subprogram
-; CHECK: // .b8 119                              // DW_AT_name
-; CHECK: // .b8 99
-; CHECK: // .b8 115
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 109
-; CHECK: // .b8 98
-; CHECK:  / .b8 115
-; CHECK:  / .b8 0
-; CHECK:  / .b8 4                                // DW_AT_decl_file
+; CHECK: // .b8 119,99,115,116,111,109,98,115    // DW_AT_name
+; CHECK: // .b8 0
+; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 109                              // DW_AT_decl_line
 ; CHECK: // .b8 3
 ; CHECK: // .b32 4737                            // DW_AT_type
@@ -4561,15 +3208,10 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 8                                // Abbrev [8] 0x1502:0x5 DW_TAG_pointer_type
 ; CHECK: // .b32 5383                            // DW_AT_type
-; CHECK:  / .b8 9                                // Abbrev [9] 0x1507:0x5 DW_TAG_const_type
-; CHECK:  / .b32 5068                            // DW_AT_type
+; CHECK: // .b8 9                                // Abbrev [9] 0x1507:0x5 DW_TAG_const_type
+; CHECK: // .b32 5068                            // DW_AT_type
 ; CHECK: // .b8 16                               // Abbrev [16] 0x150c:0x1c DW_TAG_subprogram
-; CHECK: // .b8 119                              // DW_AT_name
-; CHECK: // .b8 99
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 109
-; CHECK: // .b8 98
+; CHECK: // .b8 119,99,116,111,109,98            // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 102                              // DW_AT_decl_line
@@ -4577,26 +3219,18 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 2332                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 1                                // DW_AT_external
-; CHECK:  / .b8 6                                // Abbrev [6] 0x151d:0x5 DW_TAG_formal_parameter
-; CHECK:  / .b32 4926                            // DW_AT_type
+; CHECK: // .b8 6                                // Abbrev [6] 0x151d:0x5 DW_TAG_formal_parameter
+; CHECK: // .b32 4926                            // DW_AT_type
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1522:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 5068                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 2                                // Abbrev [2] 0x1528:0x78 DW_TAG_namespace
-; CHECK: // .b8 95                               // DW_AT_name
-; CHECK: // .b8 95
-; CHECK: // .b8 103
-; CHECK: // .b8 110
-; CHECK: // .b8 117
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 120
-; CHECK: // .b8 120
+; CHECK: // .b8 95,95,103,110,117,95,99,120,120  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 3                                // Abbrev [3] 0x1533:0x7 DW_TAG_imported_declaration
 ; CHECK: // .b8 5                                // DW_AT_decl_file
-; CHECK:  / .b8 201                              // DW_AT_decl_line
-; CHECK:  / .b32 5536                            // DW_AT_import
+; CHECK: // .b8 201                              // DW_AT_decl_line
+; CHECK: // .b32 5536                            // DW_AT_import
 ; CHECK: // .b8 3                                // Abbrev [3] 0x153a:0x7 DW_TAG_imported_declaration
 ; CHECK: // .b8 5                                // DW_AT_decl_file
 ; CHECK: // .b8 207                              // DW_AT_decl_line
@@ -4612,8 +3246,8 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 3                                // Abbrev [3] 0x154f:0x7 DW_TAG_imported_declaration
 ; CHECK: // .b8 5                                // DW_AT_decl_file
 ; CHECK: // .b8 228                              // DW_AT_decl_line
-; CHECK:  / .b32 5653                            // DW_AT_import
-; CHECK:  / .b8 3                                // Abbrev [3] 0x1556:0x7 DW_TAG_imported_declaration
+; CHECK: // .b32 5653                            // DW_AT_import
+; CHECK: // .b8 3                                // Abbrev [3] 0x1556:0x7 DW_TAG_imported_declaration
 ; CHECK: // .b8 5                                // DW_AT_decl_file
 ; CHECK: // .b8 229                              // DW_AT_decl_line
 ; CHECK: // .b32 5675                            // DW_AT_import
@@ -4630,30 +3264,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 233                              // DW_AT_decl_line
 ; CHECK: // .b32 5795                            // DW_AT_import
 ; CHECK: // .b8 25                               // Abbrev [25] 0x1572:0x2d DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 78
-; CHECK: // .b8 57
-; CHECK:  / .b8 95
-; CHECK:  / .b8 95
-; CHECK: // .b8 103
-; CHECK: // .b8 110
-; CHECK: // .b8 117
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 120
-; CHECK: // .b8 120
-; CHECK: // .b8 51
-; CHECK: // .b8 100
-; CHECK: // .b8 105
-; CHECK: // .b8 118
-; CHECK: // .b8 69
-; CHECK: // .b8 120
-; CHECK: // .b8 120
+; CHECK: // .b8 95,90,78,57,95,95,103,110,117,95,99,120,120,51,100,105,118,69,120,120 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 100                              // DW_AT_name
-; CHECK: // .b8 105
-; CHECK: // .b8 118
+; CHECK: // .b8 100,105,118                      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 5                                // DW_AT_decl_file
 ; CHECK: // .b8 214                              // DW_AT_decl_line
@@ -4668,25 +3281,16 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 11                               // Abbrev [11] 0x15a0:0xf DW_TAG_typedef
 ; CHECK: // .b32 5551                            // DW_AT_type
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 108
-; CHECK: // .b8 100
-; CHECK: // .b8 105
-; CHECK: // .b8 118
-; CHECK: // .b8 95
-; CHECK: // .b8 116
+; CHECK: // .b8 108,108,100,105,118,95,116       // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 121                              // DW_AT_decl_line
 ; CHECK: // .b8 13                               // Abbrev [13] 0x15af:0x22 DW_TAG_structure_type
 ; CHECK: // .b8 16                               // DW_AT_byte_size
-; CHECK:  / .b8 4                                // DW_AT_decl_file
-; CHECK:  / .b8 117                              // DW_AT_decl_line
+; CHECK: // .b8 4                                // DW_AT_decl_file
+; CHECK: // .b8 117                              // DW_AT_decl_line
 ; CHECK: // .b8 14                               // Abbrev [14] 0x15b3:0xf DW_TAG_member
-; CHECK: // .b8 113                              // DW_AT_name
-; CHECK: // .b8 117
-; CHECK: // .b8 111
-; CHECK: // .b8 116
+; CHECK: // .b8 113,117,111,116                  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b32 1508                            // DW_AT_type
 ; CHECK: // .b8 4                                // DW_AT_decl_file
@@ -4695,23 +3299,17 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 35
 ; CHECK: // .b8 0
 ; CHECK: // .b8 14                               // Abbrev [14] 0x15c2:0xe DW_TAG_member
-; CHECK: // .b8 114                              // DW_AT_name
-; CHECK: // .b8 101
-; CHECK: // .b8 109
+; CHECK: // .b8 114,101,109                      // DW_AT_name
 ; CHECK: // .b8 0
-; CHECK:  / .b32 1508                            // DW_AT_type
-; CHECK:  / .b8 4                                // DW_AT_decl_file
+; CHECK: // .b32 1508                            // DW_AT_type
+; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 120                              // DW_AT_decl_line
 ; CHECK: // .b8 2                                // DW_AT_data_member_location
 ; CHECK: // .b8 35
 ; CHECK: // .b8 8
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 22                               // Abbrev [22] 0x15d1:0x13 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_name
-; CHECK: // .b8 69
-; CHECK: // .b8 120
-; CHECK: // .b8 105
-; CHECK: // .b8 116
+; CHECK: // .b8 95,69,120,105,116                // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 45                               // DW_AT_decl_line
@@ -4723,11 +3321,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 2332                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 16                               // Abbrev [16] 0x15e4:0x16 DW_TAG_subprogram
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 108
-; CHECK: // .b8 97
-; CHECK: // .b8 98
-; CHECK: // .b8 115
+; CHECK: // .b8 108,108,97,98,115                // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 12                               // DW_AT_decl_line
@@ -4739,11 +3333,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1508                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 16                               // Abbrev [16] 0x15fa:0x1b DW_TAG_subprogram
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 108
-; CHECK: // .b8 100
-; CHECK: // .b8 105
-; CHECK:  / .b8 118
+; CHECK: // .b8 108,108,100,105,118              // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 29                               // DW_AT_decl_line
@@ -4757,11 +3347,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1508                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 16                               // Abbrev [16] 0x1615:0x16 DW_TAG_subprogram
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 108
-; CHECK: // .b8 108
+; CHECK: // .b8 97,116,111,108,108               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 36                               // DW_AT_decl_line
@@ -4771,15 +3357,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1625:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 3389                            // DW_AT_type
-; CHECK:  / .b8 0                                // End Of Children Mark
+; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 10                               // Abbrev [10] 0x162b:0x21 DW_TAG_subprogram
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 116
-; CHECK: // .b8 114
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 108
-; CHECK: // .b8 108
+; CHECK: // .b8 115,116,114,116,111,108,108      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 209                              // DW_AT_decl_line
@@ -4794,14 +3374,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 2332                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 10                               // Abbrev [10] 0x164c:0x22 DW_TAG_subprogram
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 116
-; CHECK: // .b8 114
-; CHECK:  / .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 117
-; CHECK: // .b8 108
-; CHECK: // .b8 108
+; CHECK: // .b8 115,116,114,116,111,117,108,108  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 214                              // DW_AT_decl_line
@@ -4816,57 +3389,25 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 2332                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 7                                // Abbrev [7] 0x166e:0x1a DW_TAG_base_type
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 110
-; CHECK: // .b8 103
-; CHECK: // .b8 32
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 110
-; CHECK: // .b8 103
-; CHECK:  / .b8 32
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 103
-; CHECK: // .b8 110
-; CHECK: // .b8 101
-; CHECK: // .b8 100
-; CHECK: // .b8 32
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
+; CHECK: // .b8 108,111,110,103,32,108,111,110,103,32,117,110,115,105,103,110,101,100,32,105,110,116 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 7                                // DW_AT_encoding
 ; CHECK: // .b8 8                                // DW_AT_byte_size
 ; CHECK: // .b8 10                               // Abbrev [10] 0x1688:0x1b DW_TAG_subprogram
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 116
-; CHECK: // .b8 114
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 102
+; CHECK: // .b8 115,116,114,116,111,102          // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 172                              // DW_AT_decl_line
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 1                                // DW_AT_external
-; CHECK:  / .b8 6                                // Abbrev [6] 0x1698:0x5 DW_TAG_formal_parameter
+; CHECK: // .b8 6                                // Abbrev [6] 0x1698:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 3389                            // DW_AT_type
 ; CHECK: // .b8 6                                // Abbrev [6] 0x169d:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 5250                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 10                               // Abbrev [10] 0x16a3:0x1c DW_TAG_subprogram
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 116
-; CHECK: // .b8 114
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 108
-; CHECK: // .b8 100
+; CHECK: // .b8 115,116,114,116,111,108,100      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 175                              // DW_AT_decl_line
@@ -4879,38 +3420,15 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 5250                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 7                                // Abbrev [7] 0x16bf:0xf DW_TAG_base_type
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 110
-; CHECK:  / .b8 103
-; CHECK: // .b8 32
-; CHECK: // .b8 100
-; CHECK: // .b8 111
-; CHECK: // .b8 117
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 101
+; CHECK: // .b8 108,111,110,103,32,100,111,117,98,108,101 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_encoding
 ; CHECK: // .b8 8                                // DW_AT_byte_size
 ; CHECK: // .b8 26                               // Abbrev [26] 0x16ce:0x20 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 97
-; CHECK: // .b8 99
-; CHECK: // .b8 111
-; CHECK: // .b8 115
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,97,99,111,115,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 97,99,111,115,102                // DW_AT_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 99
-; CHECK: // .b8 111
-; CHECK: // .b8 115
-; CHECK: // .b8 102
-; CHECK:  / .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 62                               // DW_AT_decl_line
 ; CHECK: // .b8 5
@@ -4920,24 +3438,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x16ee:0x22 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 97
-; CHECK: // .b8 99
-; CHECK: // .b8 111
-; CHECK: // .b8 115
-; CHECK: // .b8 104
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,54,97,99,111,115,104,102,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 99
-; CHECK: // .b8 111
-; CHECK: // .b8 115
-; CHECK:  / .b8 104
-; CHECK: // .b8 102
+; CHECK: // .b8 97,99,111,115,104,102            // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 90                               // DW_AT_decl_line
@@ -4948,22 +3451,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1710:0x20 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 97
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,97,115,105,110,102,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK:  / .b8 110
-; CHECK: // .b8 102
+; CHECK: // .b8 97,115,105,110,102               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 57                               // DW_AT_decl_line
@@ -4974,56 +3464,25 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1730:0x22 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 97
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 104
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,54,97,115,105,110,104,102,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 104
-; CHECK: // .b8 102
+; CHECK: // .b8 97,115,105,110,104,102           // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 95                               // DW_AT_decl_line
 ; CHECK: // .b8 5
 ; CHECK: // .b32 1554                            // DW_AT_type
-; CHECK:  / .b8 1                                // DW_AT_declaration
+; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0x174c:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1752:0x28 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 50
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,54,97,116,97,110,50,102,102,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 50
-; CHECK: // .b8 102
+; CHECK: // .b8 97,116,97,110,50,102             // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
-; CHECK:  / .b8 47                               // DW_AT_decl_line
+; CHECK: // .b8 47                               // DW_AT_decl_line
 ; CHECK: // .b8 5
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
@@ -5033,23 +3492,10 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x177a:0x20 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,97,116,97,110,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 97,116,97,110,102                // DW_AT_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 102
-; CHECK:  / .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 52                               // DW_AT_decl_line
 ; CHECK: // .b8 5
@@ -5059,24 +3505,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x179a:0x22 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 104
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,54,97,116,97,110,104,102,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK:  / .b8 104
-; CHECK: // .b8 102
+; CHECK: // .b8 97,116,97,110,104,102            // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 100                              // DW_AT_decl_line
@@ -5087,22 +3518,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x17bc:0x20 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 99
-; CHECK: // .b8 98
-; CHECK: // .b8 114
-; CHECK: // .b8 116
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 99                               // DW_AT_name
-; CHECK:  / .b8 98
-; CHECK: // .b8 114
-; CHECK: // .b8 116
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,99,98,114,116,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 99,98,114,116,102                // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 150                              // DW_AT_decl_line
@@ -5113,22 +3531,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x17dc:0x20 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 99
-; CHECK: // .b8 101
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 99                               // DW_AT_name
-; CHECK:  / .b8 101
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,99,101,105,108,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 99,101,105,108,102               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 11                               // DW_AT_decl_file
 ; CHECK: // .b8 155                              // DW_AT_decl_line
@@ -5139,31 +3544,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x17fc:0x2e DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 57
-; CHECK: // .b8 99
-; CHECK: // .b8 111
-; CHECK: // .b8 112
-; CHECK: // .b8 121
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK:  / .b8 103
-; CHECK: // .b8 110
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 99                               // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 112
-; CHECK: // .b8 121
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 103
-; CHECK: // .b8 110
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,57,99,111,112,121,115,105,103,110,102,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 99,111,112,121,115,105,103,110,102 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 165                              // DW_AT_decl_line
@@ -5176,20 +3559,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x182a:0x1e DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK:  / .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 99
-; CHECK: // .b8 111
-; CHECK: // .b8 115
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 99                               // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 115
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,52,99,111,115,102,102   // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 99,111,115,102                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 219                              // DW_AT_decl_line
@@ -5200,22 +3572,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1848:0x20 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK:  / .b8 53
-; CHECK: // .b8 99
-; CHECK: // .b8 111
-; CHECK: // .b8 115
-; CHECK: // .b8 104
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 99                               // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 115
-; CHECK: // .b8 104
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,99,111,115,104,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 99,111,115,104,102               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 32                               // DW_AT_decl_line
@@ -5226,22 +3585,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1868:0x20 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK:  / .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 102
-; CHECK: // .b8 99
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 101                              // DW_AT_name
-; CHECK: // .b8 114
-; CHECK: // .b8 102
-; CHECK: // .b8 99
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,101,114,102,99,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 101,114,102,99,102               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 210                              // DW_AT_decl_line
@@ -5252,20 +3598,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1888:0x1e DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK:  / .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 101                              // DW_AT_name
-; CHECK: // .b8 114
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,52,101,114,102,102,102  // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 101,114,102,102                  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 200                              // DW_AT_decl_line
@@ -5276,22 +3611,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x18a6:0x20 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 101
-; CHECK: // .b8 120
-; CHECK: // .b8 112
-; CHECK: // .b8 50
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK:  / .b8 0
-; CHECK: // .b8 101                              // DW_AT_name
-; CHECK: // .b8 120
-; CHECK: // .b8 112
-; CHECK: // .b8 50
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,101,120,112,50,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 101,120,112,50,102               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 11                               // DW_AT_decl_file
 ; CHECK: // .b8 145                              // DW_AT_decl_line
@@ -5302,22 +3624,11 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x18c6:0x1e DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 101
-; CHECK: // .b8 120
-; CHECK: // .b8 112
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 101                              // DW_AT_name
-; CHECK: // .b8 120
-; CHECK: // .b8 112
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK:  / .b8 9                                // DW_AT_decl_file
+; CHECK: // .b8 95,90,76,52,101,120,112,102,102  // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 101,120,112,102                  // DW_AT_name
+; CHECK: // .b8 0
+; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 14                               // DW_AT_decl_line
 ; CHECK: // .b8 5
 ; CHECK: // .b32 1554                            // DW_AT_type
@@ -5326,27 +3637,12 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x18e4:0x22 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 101
-; CHECK: // .b8 120
-; CHECK: // .b8 112
-; CHECK: // .b8 109
-; CHECK: // .b8 49
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 101                              // DW_AT_name
-; CHECK: // .b8 120
-; CHECK: // .b8 112
-; CHECK: // .b8 109
-; CHECK: // .b8 49
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,54,101,120,112,109,49,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 101,120,112,109,49,102           // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
-; CHECK:  / .b8 105                              // DW_AT_decl_line
+; CHECK: // .b8 105                              // DW_AT_decl_line
 ; CHECK: // .b8 5
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
@@ -5354,49 +3650,22 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1906:0x20 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 102
-; CHECK: // .b8 97
-; CHECK: // .b8 98
-; CHECK: // .b8 115
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 102                              // DW_AT_name
-; CHECK: // .b8 97
-; CHECK: // .b8 98
-; CHECK: // .b8 115
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,102,97,98,115,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 102,97,98,115,102                // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 11                               // DW_AT_decl_file
 ; CHECK: // .b8 95                               // DW_AT_decl_line
 ; CHECK: // .b8 2
 ; CHECK: // .b32 1554                            // DW_AT_type
-; CHECK:  / .b8 1                                // DW_AT_declaration
+; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1920:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1926:0x26 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 102
-; CHECK: // .b8 100
-; CHECK: // .b8 105
-; CHECK: // .b8 109
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 102                              // DW_AT_name
-; CHECK: // .b8 100
-; CHECK: // .b8 105
-; CHECK: // .b8 109
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,102,100,105,109,102,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 102,100,105,109,102              // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 80                               // DW_AT_decl_line
@@ -5407,26 +3676,11 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1946:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
-; CHECK:  / .b8 0                                // End Of Children Mark
+; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x194c:0x22 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 102
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 111
-; CHECK: // .b8 114
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 102                              // DW_AT_name
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 111
-; CHECK: // .b8 114
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,54,102,108,111,111,114,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 102,108,111,111,114,102          // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 11                               // DW_AT_decl_file
 ; CHECK: // .b8 85                               // DW_AT_decl_line
@@ -5437,22 +3691,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x196e:0x2a DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK:  / .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 102
-; CHECK: // .b8 109
-; CHECK: // .b8 97
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 102                              // DW_AT_name
-; CHECK: // .b8 109
-; CHECK: // .b8 97
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,52,102,109,97,102,102,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 102,109,97,102                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 32                               // DW_AT_decl_line
@@ -5467,23 +3708,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1998:0x26 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK:  / .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 102
-; CHECK: // .b8 109
-; CHECK: // .b8 97
-; CHECK: // .b8 120
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 102                              // DW_AT_name
-; CHECK: // .b8 109
-; CHECK: // .b8 97
-; CHECK: // .b8 120
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,102,109,97,120,102,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 102,109,97,120,102               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 11                               // DW_AT_decl_file
 ; CHECK: // .b8 110                              // DW_AT_decl_line
@@ -5496,23 +3723,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x19be:0x26 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK:  / .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 102
-; CHECK: // .b8 109
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 102                              // DW_AT_name
-; CHECK: // .b8 109
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,102,109,105,110,102,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 102,109,105,110,102              // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 11                               // DW_AT_decl_file
 ; CHECK: // .b8 105                              // DW_AT_decl_line
@@ -5525,23 +3738,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x19e4:0x26 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK:  / .b8 102
-; CHECK: // .b8 109
-; CHECK: // .b8 111
-; CHECK: // .b8 100
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 102                              // DW_AT_name
-; CHECK: // .b8 109
-; CHECK: // .b8 111
-; CHECK: // .b8 100
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,102,109,111,100,102,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 102,109,111,100,102              // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 17                               // DW_AT_decl_line
@@ -5554,26 +3753,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1a0a:0x29 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 102
-; CHECK:  / .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 120
-; CHECK: // .b8 112
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 80
-; CHECK: // .b8 105
-; CHECK: // .b8 0
-; CHECK: // .b8 102                              // DW_AT_name
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 120
-; CHECK: // .b8 112
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,54,102,114,101,120,112,102,102,80,105 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 102,114,101,120,112,102          // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 7                                // DW_AT_decl_line
@@ -5586,25 +3768,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 2377                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1a33:0x28 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 104
-; CHECK:  / .b8 121
-; CHECK: // .b8 112
-; CHECK: // .b8 111
-; CHECK: // .b8 116
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 104                              // DW_AT_name
-; CHECK: // .b8 121
-; CHECK: // .b8 112
-; CHECK: // .b8 111
-; CHECK: // .b8 116
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,54,104,121,112,111,116,102,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 104,121,112,111,116,102          // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 110                              // DW_AT_decl_line
@@ -5617,24 +3783,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1a5b:0x22 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK:  / .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 98
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,54,105,108,111,103,98,102,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 105                              // DW_AT_name
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 98
-; CHECK: // .b8 102
+; CHECK: // .b8 105,108,111,103,98,102           // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 85                               // DW_AT_decl_line
@@ -5645,25 +3796,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1a7d:0x28 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK:  / .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 108
-; CHECK: // .b8 100
-; CHECK: // .b8 101
-; CHECK: // .b8 120
-; CHECK: // .b8 112
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 105
-; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 100
-; CHECK: // .b8 101
-; CHECK: // .b8 120
-; CHECK: // .b8 112
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,54,108,100,101,120,112,102,102,105 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,100,101,120,112,102          // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 240                              // DW_AT_decl_line
@@ -5673,59 +3808,25 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1a9a:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1a9f:0x5 DW_TAG_formal_parameter
-; CHECK:  / .b32 2332                            // DW_AT_type
+; CHECK: // .b32 2332                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1aa5:0x24 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 55
-; CHECK: // .b8 108
-; CHECK: // .b8 103
-; CHECK: // .b8 97
-; CHECK: // .b8 109
-; CHECK: // .b8 109
-; CHECK: // .b8 97
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 103
-; CHECK: // .b8 97
-; CHECK: // .b8 109
-; CHECK: // .b8 109
-; CHECK: // .b8 97
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,55,108,103,97,109,109,97,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,103,97,109,109,97,102        // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 235                              // DW_AT_decl_line
-; CHECK:  / .b8 5
+; CHECK: // .b8 5
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1ac3:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1ac9:0x24 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 55
-; CHECK: // .b8 108
-; CHECK: // .b8 108
-; CHECK: // .b8 114
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 108
-; CHECK: // .b8 114
-; CHECK: // .b8 105
-; CHECK:  / .b8 110
-; CHECK: // .b8 116
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,55,108,108,114,105,110,116,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,108,114,105,110,116,102      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 125                              // DW_AT_decl_line
@@ -5736,28 +3837,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1aed:0x26 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 56
-; CHECK: // .b8 108
-; CHECK: // .b8 108
-; CHECK: // .b8 114
-; CHECK: // .b8 111
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK: // .b8 100
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK:  / .b8 108
-; CHECK: // .b8 114
-; CHECK: // .b8 111
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK: // .b8 100
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,56,108,108,114,111,117,110,100,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,108,114,111,117,110,100,102  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 66                               // DW_AT_decl_line
@@ -5768,24 +3850,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1b13:0x22 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 49
-; CHECK: // .b8 48
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK:  / .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 49
-; CHECK: // .b8 48
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,54,108,111,103,49,48,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,111,103,49,48,102            // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 76                               // DW_AT_decl_line
@@ -5796,24 +3863,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1b35:0x22 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 49
-; CHECK: // .b8 112
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK:  / .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 49
-; CHECK: // .b8 112
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,54,108,111,103,49,112,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,111,103,49,112,102           // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 85                               // DW_AT_decl_line
@@ -5824,22 +3876,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1b57:0x20 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 50
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 50
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,108,111,103,50,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,111,103,50,102               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 5                                // DW_AT_decl_line
@@ -5848,24 +3887,11 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1b71:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
-; CHECK:  / .b8 0                                // End Of Children Mark
+; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1b77:0x20 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 98
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 98
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,108,111,103,98,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,111,103,98,102               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 90                               // DW_AT_decl_line
@@ -5876,20 +3902,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1b97:0x1e DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK:  / .b8 108                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,52,108,111,103,102,102  // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,111,103,102                  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 67                               // DW_AT_decl_line
@@ -5900,24 +3915,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1bb5:0x22 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 108
-; CHECK: // .b8 114
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK:  / .b8 114
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,54,108,114,105,110,116,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,114,105,110,116,102          // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 116                              // DW_AT_decl_line
@@ -5928,54 +3928,22 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1bd7:0x24 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 55
-; CHECK: // .b8 108
-; CHECK: // .b8 114
-; CHECK: // .b8 111
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK: // .b8 100
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 114
-; CHECK: // .b8 111
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK: // .b8 100
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,55,108,114,111,117,110,100,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,114,111,117,110,100,102      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 71                               // DW_AT_decl_line
 ; CHECK: // .b8 6
 ; CHECK: // .b32 2917                            // DW_AT_type
-; CHECK:  / .b8 1                                // DW_AT_declaration
+; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1bf5:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1bfb:0x27 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 109
-; CHECK: // .b8 111
-; CHECK: // .b8 100
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 80
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 109                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 100
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,109,111,100,102,102,102,80,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 109,111,100,102,102              // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 12                               // DW_AT_decl_line
@@ -5988,71 +3956,22 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 3345                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1c22:0x2b DW_TAG_subprogram
-; CHECK:  / .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 49
-; CHECK: // .b8 48
-; CHECK: // .b8 110
-; CHECK: // .b8 101
-; CHECK: // .b8 97
-; CHECK: // .b8 114
-; CHECK: // .b8 98
-; CHECK: // .b8 121
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,49,48,110,101,97,114,98,121,105,110,116,102,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 110                              // DW_AT_name
-; CHECK: // .b8 101
-; CHECK: // .b8 97
-; CHECK: // .b8 114
-; CHECK: // .b8 98
-; CHECK: // .b8 121
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
-; CHECK: // .b8 102
+; CHECK: // .b8 110,101,97,114,98,121,105,110,116,102 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 130                              // DW_AT_decl_line
-; CHECK:  / .b8 4
+; CHECK: // .b8 4
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1c47:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1c4d:0x31 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 49
-; CHECK: // .b8 48
-; CHECK: // .b8 110
-; CHECK: // .b8 101
-; CHECK: // .b8 120
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 102
-; CHECK: // .b8 116
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,49,48,110,101,120,116,97,102,116,101,114,102,102,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 110                              // DW_AT_name
-; CHECK: // .b8 101
-; CHECK: // .b8 120
-; CHECK:  / .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 102
-; CHECK: // .b8 116
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 102
+; CHECK: // .b8 110,101,120,116,97,102,116,101,114,102 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 194                              // DW_AT_decl_line
@@ -6065,21 +3984,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1c7e:0x24 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 112
-; CHECK: // .b8 111
-; CHECK: // .b8 119
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 112                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 119
-; CHECK:  / .b8 102
+; CHECK: // .b8 95,90,76,52,112,111,119,102,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 112,111,119,102                  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 47                               // DW_AT_decl_line
@@ -6092,34 +3999,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1ca2:0x31 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 49
-; CHECK: // .b8 48
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 109
-; CHECK: // .b8 97
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 100
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 114                              // DW_AT_name
-; CHECK: // .b8 101
-; CHECK:  / .b8 109
-; CHECK: // .b8 97
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 100
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,49,48,114,101,109,97,105,110,100,101,114,102,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 114,101,109,97,105,110,100,101,114,102 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 22                               // DW_AT_decl_line
@@ -6132,29 +4014,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1cd3:0x31 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 55
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK:  / .b8 109
-; CHECK: // .b8 113
-; CHECK: // .b8 117
-; CHECK: // .b8 111
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 80
-; CHECK: // .b8 105
-; CHECK: // .b8 0
-; CHECK: // .b8 114                              // DW_AT_name
-; CHECK: // .b8 101
-; CHECK: // .b8 109
-; CHECK: // .b8 113
-; CHECK: // .b8 117
-; CHECK: // .b8 111
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,55,114,101,109,113,117,111,102,102,102,80,105 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 114,101,109,113,117,111,102      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 27                               // DW_AT_decl_line
@@ -6164,57 +4026,29 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1cf4:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1cf9:0x5 DW_TAG_formal_parameter
-; CHECK:  / .b32 1554                            // DW_AT_type
+; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1cfe:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 2377                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1d04:0x20 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 114
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 114                              // DW_AT_name
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,114,105,110,116,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 114,105,110,116,102              // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 111                              // DW_AT_decl_line
 ; CHECK: // .b8 4
 ; CHECK: // .b32 1554                            // DW_AT_type
-; CHECK:  / .b8 1                                // DW_AT_declaration
+; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1d1e:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1d24:0x22 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 114
-; CHECK: // .b8 111
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK: // .b8 100
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 114                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK: // .b8 100
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK:  / .b8 9                                // DW_AT_decl_file
+; CHECK: // .b8 95,90,76,54,114,111,117,110,100,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 114,111,117,110,100,102          // DW_AT_name
+; CHECK: // .b8 0
+; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 61                               // DW_AT_decl_line
 ; CHECK: // .b8 6
 ; CHECK: // .b32 1554                            // DW_AT_type
@@ -6223,29 +4057,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1d46:0x2c DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 56
-; CHECK: // .b8 115
-; CHECK: // .b8 99
-; CHECK: // .b8 97
-; CHECK: // .b8 108
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 110
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 108
-; CHECK: // .b8 0
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 99
-; CHECK:  / .b8 97
-; CHECK: // .b8 108
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 110
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,56,115,99,97,108,98,108,110,102,102,108 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 115,99,97,108,98,108,110,102     // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 250                              // DW_AT_decl_line
@@ -6258,27 +4072,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 2917                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1d72:0x2a DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 55
-; CHECK: // .b8 115
-; CHECK: // .b8 99
-; CHECK: // .b8 97
-; CHECK: // .b8 108
-; CHECK: // .b8 98
-; CHECK: // .b8 110
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK:  / .b8 105
-; CHECK: // .b8 0
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 99
-; CHECK: // .b8 97
-; CHECK: // .b8 108
-; CHECK: // .b8 98
-; CHECK: // .b8 110
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,55,115,99,97,108,98,110,102,102,105 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 115,99,97,108,98,110,102         // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 245                              // DW_AT_decl_line
@@ -6291,20 +4087,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 2332                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1d9c:0x1e DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK:  / .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,52,115,105,110,102,102  // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 115,105,110,102                  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 210                              // DW_AT_decl_line
@@ -6315,22 +4100,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1dba:0x20 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 104
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 104
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,115,105,110,104,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 115,105,110,104,102              // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 37                               // DW_AT_decl_line
@@ -6341,22 +4113,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1dda:0x20 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 115
-; CHECK: // .b8 113
-; CHECK: // .b8 114
-; CHECK: // .b8 116
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 113
-; CHECK: // .b8 114
-; CHECK: // .b8 116
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,115,113,114,116,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 115,113,114,116,102              // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 11                               // DW_AT_decl_file
 ; CHECK: // .b8 139                              // DW_AT_decl_line
@@ -6367,20 +4126,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1dfa:0x1e DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 116                              // DW_AT_name
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,52,116,97,110,102,102   // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 116,97,110,102                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 252                              // DW_AT_decl_line
@@ -6391,22 +4139,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1e18:0x20 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 104
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 116                              // DW_AT_name
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 104
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,116,97,110,104,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 116,97,110,104,102               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 42                               // DW_AT_decl_line
@@ -6417,26 +4152,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1e38:0x24 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 55
-; CHECK: // .b8 116
-; CHECK: // .b8 103
-; CHECK: // .b8 97
-; CHECK: // .b8 109
-; CHECK: // .b8 109
-; CHECK: // .b8 97
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 116                              // DW_AT_name
-; CHECK: // .b8 103
-; CHECK: // .b8 97
-; CHECK: // .b8 109
-; CHECK: // .b8 109
-; CHECK: // .b8 97
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,55,116,103,97,109,109,97,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 116,103,97,109,109,97,102        // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 56                               // DW_AT_decl_line
@@ -6447,24 +4165,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1e5c:0x22 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 116
-; CHECK: // .b8 114
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK: // .b8 99
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 116                              // DW_AT_name
-; CHECK: // .b8 114
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK: // .b8 99
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,54,116,114,117,110,99,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 116,114,117,110,99,102           // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 11                               // DW_AT_decl_file
 ; CHECK: // .b8 150                              // DW_AT_decl_line
@@ -6475,181 +4178,27 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 27                               // Abbrev [27] 0x1e7e:0x22a DW_TAG_structure_type
-; CHECK:  / .b8 95                               // DW_AT_name
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
-; CHECK: // .b8 107
-; CHECK: // .b8 73
-; CHECK: // .b8 100
-; CHECK: // .b8 120
-; CHECK: // .b8 95
-; CHECK: // .b8 116
+; CHECK: // .b8 95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,98,108,111,99,107,73,100,120,95,116 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_byte_size
-; CHECK:  / .b8 13                               // DW_AT_decl_file
+; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 77                               // DW_AT_decl_line
 ; CHECK: // .b8 28                               // Abbrev [28] 0x1e9c:0x4f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 78
-; CHECK: // .b8 50
-; CHECK: // .b8 53
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
-; CHECK: // .b8 107
-; CHECK: // .b8 73
-; CHECK: // .b8 100
-; CHECK: // .b8 120
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 49
-; CHECK: // .b8 55
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 102
-; CHECK: // .b8 101
-; CHECK: // .b8 116
-; CHECK:  / .b8 99
-; CHECK: // .b8 104
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 120
-; CHECK: // .b8 69
-; CHECK: // .b8 118
+; CHECK: // .b8 95,90,78,50,53,95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,98,108,111,99,107,73,100,120,95,116,49,55,95,95,102,101,116,99,104,95 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 98,117,105,108,116,105,110,95,120,69,118
 ; CHECK: // .b8 0
-; CHECK: // .b8 95                               // DW_AT_name
-; CHECK: // .b8 95
-; CHECK: // .b8 102
-; CHECK: // .b8 101
-; CHECK: // .b8 116
-; CHECK: // .b8 99
-; CHECK: // .b8 104
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 120
+; CHECK: // .b8 95,95,102,101,116,99,104,95,98,117,105,108,116,105,110,95,120 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 78                               // DW_AT_decl_line
-; CHECK:  / .b32 5207                            // DW_AT_type
+; CHECK: // .b32 5207                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 28                               // Abbrev [28] 0x1eeb:0x4f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 78
-; CHECK: // .b8 50
-; CHECK: // .b8 53
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
-; CHECK: // .b8 107
-; CHECK: // .b8 73
-; CHECK:  / .b8 100
-; CHECK: // .b8 120
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 49
-; CHECK: // .b8 55
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 102
-; CHECK: // .b8 101
-; CHECK: // .b8 116
-; CHECK: // .b8 99
-; CHECK: // .b8 104
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 121
-; CHECK: // .b8 69
-; CHECK: // .b8 118
+; CHECK: // .b8 95,90,78,50,53,95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,98,108,111,99,107,73,100,120,95,116,49,55,95,95,102,101,116,99,104,95 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 98,117,105,108,116,105,110,95,121,69,118
 ; CHECK: // .b8 0
-; CHECK: // .b8 95                               // DW_AT_name
-; CHECK: // .b8 95
-; CHECK:  / .b8 102
-; CHECK: // .b8 101
-; CHECK: // .b8 116
-; CHECK: // .b8 99
-; CHECK: // .b8 104
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 121
+; CHECK: // .b8 95,95,102,101,116,99,104,95,98,117,105,108,116,105,110,95,121 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 79                               // DW_AT_decl_line
@@ -6657,138 +4206,21 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 28                               // Abbrev [28] 0x1f3a:0x4f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 78
-; CHECK: // .b8 50
-; CHECK: // .b8 53
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK:  / .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
-; CHECK: // .b8 107
-; CHECK: // .b8 73
-; CHECK: // .b8 100
-; CHECK: // .b8 120
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 49
-; CHECK: // .b8 55
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 102
-; CHECK: // .b8 101
-; CHECK: // .b8 116
-; CHECK: // .b8 99
-; CHECK: // .b8 104
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK:  / .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 122
-; CHECK: // .b8 69
-; CHECK: // .b8 118
+; CHECK: // .b8 95,90,78,50,53,95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,98,108,111,99,107,73,100,120,95,116,49,55,95,95,102,101,116,99,104,95 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 98,117,105,108,116,105,110,95,122,69,118
 ; CHECK: // .b8 0
-; CHECK: // .b8 95                               // DW_AT_name
-; CHECK: // .b8 95
-; CHECK: // .b8 102
-; CHECK: // .b8 101
-; CHECK: // .b8 116
-; CHECK: // .b8 99
-; CHECK: // .b8 104
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 122
-; CHECK: // .b8 0
-; CHECK:  / .b8 13                               // DW_AT_decl_file
+; CHECK: // .b8 95,95,102,101,116,99,104,95,98,117,105,108,116,105,110,95,122 // DW_AT_name
+; CHECK: // .b8 0
+; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 80                               // DW_AT_decl_line
 ; CHECK: // .b32 5207                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 25                               // Abbrev [25] 0x1f89:0x49 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 78
-; CHECK: // .b8 75
-; CHECK: // .b8 50
-; CHECK: // .b8 53
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK:  / .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
-; CHECK: // .b8 107
-; CHECK: // .b8 73
-; CHECK: // .b8 100
-; CHECK: // .b8 120
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 99
-; CHECK: // .b8 118
-; CHECK: // .b8 53
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
-; CHECK: // .b8 51
-; CHECK: // .b8 69
+; CHECK: // .b8 95,90,78,75,50,53,95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,98,108,111,99,107,73,100,120,95,116,99,118,53,117,105,110,116,51,69 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 118
 ; CHECK: // .b8 0
-; CHECK: // .b8 111                              // DW_AT_name
-; CHECK: // .b8 112
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK:  / .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 114
-; CHECK: // .b8 32
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
-; CHECK: // .b8 51
+; CHECK: // .b8 111,112,101,114,97,116,111,114,32,117,105,110,116,51 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 83                               // DW_AT_decl_line
@@ -6800,36 +4232,12 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_artificial
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 30                               // Abbrev [30] 0x1fd2:0x27 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_name
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK:  / .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
-; CHECK: // .b8 107
-; CHECK: // .b8 73
-; CHECK: // .b8 100
-; CHECK: // .b8 120
-; CHECK: // .b8 95
-; CHECK: // .b8 116
+; CHECK: // .b8 95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,98,108,111,99,107,73,100,120,95,116 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 85                               // DW_AT_decl_line
 ; CHECK: // .b8 1                                // DW_AT_declaration
-; CHECK:  / .b8 1                                // DW_AT_external
+; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 3                                // DW_AT_accessibility
 ; CHECK:                                         // DW_ACCESS_private
 ; CHECK: // .b8 29                               // Abbrev [29] 0x1ff2:0x6 DW_TAG_formal_parameter
@@ -6837,31 +4245,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_artificial
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 30                               // Abbrev [30] 0x1ff9:0x2c DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_name
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
-; CHECK: // .b8 107
-; CHECK: // .b8 73
-; CHECK: // .b8 100
-; CHECK:  / .b8 120
-; CHECK: // .b8 95
-; CHECK: // .b8 116
+; CHECK: // .b8 95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,98,108,111,99,107,73,100,120,95,116 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 85                               // DW_AT_decl_line
@@ -6876,54 +4260,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 8422                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 31                               // Abbrev [31] 0x2025:0x43 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 78
-; CHECK: // .b8 75
-; CHECK: // .b8 50
-; CHECK: // .b8 53
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK:  / .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
-; CHECK: // .b8 107
-; CHECK: // .b8 73
-; CHECK: // .b8 100
-; CHECK: // .b8 120
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 83
-; CHECK: // .b8 69
-; CHECK: // .b8 82
-; CHECK: // .b8 75
-; CHECK: // .b8 83
-; CHECK: // .b8 95
-; CHECK: // .b8 0
-; CHECK: // .b8 111                              // DW_AT_name
-; CHECK: // .b8 112
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 114
-; CHECK: // .b8 61
+; CHECK: // .b8 95,90,78,75,50,53,95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,98,108,111,99,107,73,100,120,95,116,97,83,69,82,75,83,95 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 111,112,101,114,97,116,111,114,61 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 85                               // DW_AT_decl_line
@@ -6938,51 +4277,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 8422                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 32                               // Abbrev [32] 0x2068:0x3f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 78
-; CHECK: // .b8 75
-; CHECK: // .b8 50
-; CHECK: // .b8 53
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
-; CHECK: // .b8 107
-; CHECK: // .b8 73
-; CHECK: // .b8 100
-; CHECK: // .b8 120
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 100
-; CHECK: // .b8 69
-; CHECK: // .b8 118
+; CHECK: // .b8 95,90,78,75,50,53,95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,98,108,111,99,107,73,100,120,95,116,97,100,69,118 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 111                              // DW_AT_name
-; CHECK: // .b8 112
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 114
-; CHECK: // .b8 38
+; CHECK: // .b8 111,112,101,114,97,116,111,114,38 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 85                               // DW_AT_decl_line
@@ -6997,11 +4294,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 27                               // Abbrev [27] 0x20a8:0x2f DW_TAG_structure_type
-; CHECK: // .b8 117                              // DW_AT_name
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
-; CHECK: // .b8 51
+; CHECK: // .b8 117,105,110,116,51               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 12                               // DW_AT_byte_size
 ; CHECK: // .b8 14                               // DW_AT_decl_file
@@ -7048,105 +4341,16 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 7836                            // DW_AT_specification
 ; CHECK: // .b8 1                                // DW_AT_inline
 ; CHECK: // .b8 27                               // Abbrev [27] 0x20f6:0x228 DW_TAG_structure_type
-; CHECK: // .b8 95                               // DW_AT_name
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK:  / .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
-; CHECK: // .b8 107
-; CHECK: // .b8 68
-; CHECK: // .b8 105
-; CHECK: // .b8 109
-; CHECK: // .b8 95
-; CHECK: // .b8 116
+; CHECK: // .b8 95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,98,108,111,99,107,68,105,109,95,116 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_byte_size
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 88                               // DW_AT_decl_line
 ; CHECK: // .b8 28                               // Abbrev [28] 0x2114:0x4f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 78
-; CHECK: // .b8 50
-; CHECK: // .b8 53
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK:  / .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
-; CHECK: // .b8 107
-; CHECK: // .b8 68
-; CHECK: // .b8 105
-; CHECK: // .b8 109
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 49
-; CHECK: // .b8 55
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 102
-; CHECK: // .b8 101
-; CHECK: // .b8 116
-; CHECK: // .b8 99
-; CHECK: // .b8 104
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 120
-; CHECK: // .b8 69
-; CHECK: // .b8 118
+; CHECK: // .b8 95,90,78,50,53,95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,98,108,111,99,107,68,105,109,95,116,49,55,95,95,102,101,116,99,104,95 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 98,117,105,108,116,105,110,95,120,69,118
 ; CHECK: // .b8 0
-; CHECK: // .b8 95                               // DW_AT_name
-; CHECK:  / .b8 95
-; CHECK: // .b8 102
-; CHECK: // .b8 101
-; CHECK: // .b8 116
-; CHECK: // .b8 99
-; CHECK: // .b8 104
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 120
+; CHECK: // .b8 95,95,102,101,116,99,104,95,98,117,105,108,116,105,110,95,120 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 89                               // DW_AT_decl_line
@@ -7154,151 +4358,21 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 28                               // Abbrev [28] 0x2163:0x4f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 78
-; CHECK: // .b8 50
-; CHECK: // .b8 53
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK:  / .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
-; CHECK: // .b8 107
-; CHECK: // .b8 68
-; CHECK: // .b8 105
-; CHECK: // .b8 109
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 49
-; CHECK: // .b8 55
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 102
-; CHECK: // .b8 101
-; CHECK: // .b8 116
-; CHECK: // .b8 99
-; CHECK: // .b8 104
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK:  / .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 121
-; CHECK: // .b8 69
-; CHECK: // .b8 118
+; CHECK: // .b8 95,90,78,50,53,95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,98,108,111,99,107,68,105,109,95,116,49,55,95,95,102,101,116,99,104,95 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 98,117,105,108,116,105,110,95,121,69,118
 ; CHECK: // .b8 0
-; CHECK: // .b8 95                               // DW_AT_name
-; CHECK: // .b8 95
-; CHECK: // .b8 102
-; CHECK: // .b8 101
-; CHECK: // .b8 116
-; CHECK: // .b8 99
-; CHECK: // .b8 104
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 121
-; CHECK: // .b8 0
-; CHECK:  / .b8 13                               // DW_AT_decl_file
+; CHECK: // .b8 95,95,102,101,116,99,104,95,98,117,105,108,116,105,110,95,121 // DW_AT_name
+; CHECK: // .b8 0
+; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 90                               // DW_AT_decl_line
 ; CHECK: // .b32 5207                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 28                               // Abbrev [28] 0x21b2:0x4f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 78
-; CHECK: // .b8 50
-; CHECK: // .b8 53
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
-; CHECK: // .b8 107
-; CHECK: // .b8 68
-; CHECK: // .b8 105
-; CHECK:  / .b8 109
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 49
-; CHECK: // .b8 55
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 102
-; CHECK: // .b8 101
-; CHECK: // .b8 116
-; CHECK: // .b8 99
-; CHECK: // .b8 104
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 122
-; CHECK: // .b8 69
-; CHECK: // .b8 118
+; CHECK: // .b8 95,90,78,50,53,95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,98,108,111,99,107,68,105,109,95,116,49,55,95,95,102,101,116,99,104,95 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 98,117,105,108,116,105,110,95,122,69,118
 ; CHECK: // .b8 0
-; CHECK: // .b8 95                               // DW_AT_name
-; CHECK: // .b8 95
-; CHECK: // .b8 102
-; CHECK: // .b8 101
-; CHECK: // .b8 116
-; CHECK: // .b8 99
-; CHECK: // .b8 104
-; CHECK: // .b8 95
-; CHECK:  / .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 122
+; CHECK: // .b8 95,95,102,101,116,99,104,95,98,117,105,108,116,105,110,95,122 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 91                               // DW_AT_decl_line
@@ -7306,60 +4380,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 25                               // Abbrev [25] 0x2201:0x47 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 78
-; CHECK: // .b8 75
-; CHECK: // .b8 50
-; CHECK: // .b8 53
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK:  / .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
-; CHECK: // .b8 107
-; CHECK: // .b8 68
-; CHECK: // .b8 105
-; CHECK: // .b8 109
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 99
-; CHECK: // .b8 118
-; CHECK: // .b8 52
-; CHECK: // .b8 100
-; CHECK: // .b8 105
-; CHECK:  / .b8 109
-; CHECK: // .b8 51
-; CHECK: // .b8 69
-; CHECK: // .b8 118
+; CHECK: // .b8 95,90,78,75,50,53,95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,98,108,111,99,107,68,105,109,95,116,99,118,52,100,105,109,51,69,118 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 111                              // DW_AT_name
-; CHECK: // .b8 112
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 114
-; CHECK: // .b8 32
-; CHECK: // .b8 100
-; CHECK: // .b8 105
-; CHECK: // .b8 109
-; CHECK: // .b8 51
+; CHECK: // .b8 111,112,101,114,97,116,111,114,32,100,105,109,51 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 94                               // DW_AT_decl_line
@@ -7368,34 +4391,10 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 29                               // Abbrev [29] 0x2241:0x6 DW_TAG_formal_parameter
 ; CHECK: // .b32 9166                            // DW_AT_type
-; CHECK:  / .b8 1                                // DW_AT_artificial
+; CHECK: // .b8 1                                // DW_AT_artificial
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 30                               // Abbrev [30] 0x2248:0x27 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_name
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
-; CHECK: // .b8 107
-; CHECK: // .b8 68
-; CHECK:  / .b8 105
-; CHECK: // .b8 109
-; CHECK: // .b8 95
-; CHECK: // .b8 116
+; CHECK: // .b8 95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,98,108,111,99,107,68,105,109,95,116 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 96                               // DW_AT_decl_line
@@ -7408,31 +4407,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_artificial
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 30                               // Abbrev [30] 0x226f:0x2c DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_name
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK:  / .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
-; CHECK: // .b8 107
-; CHECK: // .b8 68
-; CHECK: // .b8 105
-; CHECK: // .b8 109
-; CHECK: // .b8 95
-; CHECK: // .b8 116
+; CHECK: // .b8 95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,98,108,111,99,107,68,105,109,95,116 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 96                               // DW_AT_decl_line
@@ -7447,54 +4422,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 9181                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 31                               // Abbrev [31] 0x229b:0x43 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK:  / .b8 90
-; CHECK: // .b8 78
-; CHECK: // .b8 75
-; CHECK: // .b8 50
-; CHECK: // .b8 53
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
-; CHECK: // .b8 107
-; CHECK: // .b8 68
-; CHECK: // .b8 105
-; CHECK: // .b8 109
-; CHECK:  / .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 83
-; CHECK: // .b8 69
-; CHECK: // .b8 82
-; CHECK: // .b8 75
-; CHECK: // .b8 83
-; CHECK: // .b8 95
-; CHECK: // .b8 0
-; CHECK: // .b8 111                              // DW_AT_name
-; CHECK: // .b8 112
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 114
-; CHECK: // .b8 61
+; CHECK: // .b8 95,90,78,75,50,53,95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,98,108,111,99,107,68,105,109,95,116,97,83,69,82,75,83,95 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 111,112,101,114,97,116,111,114,61 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 96                               // DW_AT_decl_line
@@ -7509,51 +4439,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 9181                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 32                               // Abbrev [32] 0x22de:0x3f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 78
-; CHECK: // .b8 75
-; CHECK: // .b8 50
-; CHECK: // .b8 53
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
-; CHECK: // .b8 107
-; CHECK: // .b8 68
-; CHECK: // .b8 105
-; CHECK: // .b8 109
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 100
-; CHECK: // .b8 69
-; CHECK: // .b8 118
+; CHECK: // .b8 95,90,78,75,50,53,95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,98,108,111,99,107,68,105,109,95,116,97,100,69,118 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 111                              // DW_AT_name
-; CHECK: // .b8 112
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 114
-; CHECK: // .b8 38
+; CHECK: // .b8 111,112,101,114,97,116,111,114,38 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 96                               // DW_AT_decl_line
@@ -7568,10 +4456,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 35                               // Abbrev [35] 0x231e:0x9d DW_TAG_structure_type
-; CHECK: // .b8 100                              // DW_AT_name
-; CHECK: // .b8 105
-; CHECK: // .b8 109
-; CHECK: // .b8 51
+; CHECK: // .b8 100,105,109,51                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 12                               // DW_AT_byte_size
 ; CHECK: // .b8 14                               // DW_AT_decl_file
@@ -7608,10 +4493,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 35
 ; CHECK: // .b8 8
 ; CHECK: // .b8 23                               // Abbrev [23] 0x234f:0x21 DW_TAG_subprogram
-; CHECK: // .b8 100                              // DW_AT_name
-; CHECK: // .b8 105
-; CHECK: // .b8 109
-; CHECK: // .b8 51
+; CHECK: // .b8 100,105,109,51                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 14                               // DW_AT_decl_file
 ; CHECK: // .b8 165                              // DW_AT_decl_line
@@ -7629,10 +4511,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 5207                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 23                               // Abbrev [23] 0x2370:0x17 DW_TAG_subprogram
-; CHECK: // .b8 100                              // DW_AT_name
-; CHECK: // .b8 105
-; CHECK: // .b8 109
-; CHECK: // .b8 51
+; CHECK: // .b8 100,105,109,51                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 14                               // DW_AT_decl_file
 ; CHECK: // .b8 166                              // DW_AT_decl_line
@@ -7646,41 +4525,11 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 9152                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 37                               // Abbrev [37] 0x2387:0x33 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 78
-; CHECK: // .b8 52
-; CHECK: // .b8 100
-; CHECK: // .b8 105
-; CHECK: // .b8 109
-; CHECK:  / .b8 51
-; CHECK: // .b8 99
-; CHECK: // .b8 118
-; CHECK: // .b8 53
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
-; CHECK: // .b8 51
-; CHECK: // .b8 69
-; CHECK: // .b8 118
+; CHECK: // .b8 95,90,78,52,100,105,109,51,99,118,53,117,105,110,116,51,69,118 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 111                              // DW_AT_name
-; CHECK: // .b8 112
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 114
-; CHECK: // .b8 32
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
-; CHECK: // .b8 51
-; CHECK: // .b8 0
-; CHECK:  / .b8 14                               // DW_AT_decl_file
+; CHECK: // .b8 111,112,101,114,97,116,111,114,32,117,105,110,116,51 // DW_AT_name
+; CHECK: // .b8 0
+; CHECK: // .b8 14                               // DW_AT_decl_file
 ; CHECK: // .b8 167                              // DW_AT_decl_line
 ; CHECK: // .b8 1
 ; CHECK: // .b32 9152                            // DW_AT_type
@@ -7695,11 +4544,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 8990                            // DW_AT_type
 ; CHECK: // .b8 20                               // Abbrev [20] 0x23c0:0xe DW_TAG_typedef
 ; CHECK: // .b32 8360                            // DW_AT_type
-; CHECK: // .b8 117                              // DW_AT_name
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
-; CHECK: // .b8 51
+; CHECK: // .b8 117,105,110,116,51               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 14                               // DW_AT_decl_file
 ; CHECK: // .b8 127                              // DW_AT_decl_line
@@ -7718,107 +4563,16 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 8468                            // DW_AT_specification
 ; CHECK: // .b8 1                                // DW_AT_inline
 ; CHECK: // .b8 27                               // Abbrev [27] 0x23ed:0x233 DW_TAG_structure_type
-; CHECK: // .b8 95                               // DW_AT_name
-; CHECK: // .b8 95
-; CHECK:  / .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 104
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 97
-; CHECK: // .b8 100
-; CHECK: // .b8 73
-; CHECK: // .b8 100
-; CHECK: // .b8 120
-; CHECK: // .b8 95
-; CHECK: // .b8 116
+; CHECK: // .b8 95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,116,104,114,101,97,100,73,100,120,95,116 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_byte_size
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 66                               // DW_AT_decl_line
 ; CHECK: // .b8 28                               // Abbrev [28] 0x240c:0x50 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 78
-; CHECK: // .b8 50
-; CHECK: // .b8 54
-; CHECK: // .b8 95
-; CHECK:  / .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 104
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 97
-; CHECK: // .b8 100
-; CHECK: // .b8 73
-; CHECK: // .b8 100
-; CHECK: // .b8 120
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 49
-; CHECK: // .b8 55
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 102
-; CHECK:  / .b8 101
-; CHECK: // .b8 116
-; CHECK: // .b8 99
-; CHECK: // .b8 104
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 120
-; CHECK: // .b8 69
-; CHECK: // .b8 118
+; CHECK: // .b8 95,90,78,50,54,95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,116,104,114,101,97,100,73,100,120,95,116,49,55,95,95,102,101,116,99,104 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 95,98,117,105,108,116,105,110,95,120,69,118
 ; CHECK: // .b8 0
-; CHECK: // .b8 95                               // DW_AT_name
-; CHECK: // .b8 95
-; CHECK: // .b8 102
-; CHECK: // .b8 101
-; CHECK: // .b8 116
-; CHECK: // .b8 99
-; CHECK: // .b8 104
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK:  / .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 120
+; CHECK: // .b8 95,95,102,101,116,99,104,95,98,117,105,108,116,105,110,95,120 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 67                               // DW_AT_decl_line
@@ -7826,76 +4580,10 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 28                               // Abbrev [28] 0x245c:0x50 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 78
-; CHECK: // .b8 50
-; CHECK: // .b8 54
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK:  / .b8 116
-; CHECK: // .b8 104
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 97
-; CHECK: // .b8 100
-; CHECK: // .b8 73
-; CHECK: // .b8 100
-; CHECK: // .b8 120
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 49
-; CHECK: // .b8 55
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 102
-; CHECK: // .b8 101
-; CHECK: // .b8 116
-; CHECK: // .b8 99
-; CHECK: // .b8 104
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 121
-; CHECK: // .b8 69
-; CHECK: // .b8 118
+; CHECK: // .b8 95,90,78,50,54,95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,116,104,114,101,97,100,73,100,120,95,116,49,55,95,95,102,101,116,99,104 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 95,98,117,105,108,116,105,110,95,121,69,118
 ; CHECK: // .b8 0
-; CHECK:  / .b8 95                               // DW_AT_name
-; CHECK: // .b8 95
-; CHECK: // .b8 102
-; CHECK: // .b8 101
-; CHECK: // .b8 116
-; CHECK: // .b8 99
-; CHECK: // .b8 104
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 121
+; CHECK: // .b8 95,95,102,101,116,99,104,95,98,117,105,108,116,105,110,95,121 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 68                               // DW_AT_decl_line
@@ -7903,76 +4591,10 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 28                               // Abbrev [28] 0x24ac:0x50 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK:  / .b8 78
-; CHECK: // .b8 50
-; CHECK: // .b8 54
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 104
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 97
-; CHECK: // .b8 100
-; CHECK: // .b8 73
-; CHECK: // .b8 100
-; CHECK:  / .b8 120
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 49
-; CHECK: // .b8 55
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 102
-; CHECK: // .b8 101
-; CHECK: // .b8 116
-; CHECK: // .b8 99
-; CHECK: // .b8 104
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 122
-; CHECK: // .b8 69
-; CHECK: // .b8 118
+; CHECK: // .b8 95,90,78,50,54,95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,116,104,114,101,97,100,73,100,120,95,116,49,55,95,95,102,101,116,99,104 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 95,98,117,105,108,116,105,110,95,122,69,118
 ; CHECK: // .b8 0
-; CHECK: // .b8 95                               // DW_AT_name
-; CHECK:  / .b8 95
-; CHECK: // .b8 102
-; CHECK: // .b8 101
-; CHECK: // .b8 116
-; CHECK: // .b8 99
-; CHECK: // .b8 104
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 122
+; CHECK: // .b8 95,95,102,101,116,99,104,95,98,117,105,108,116,105,110,95,122 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 69                               // DW_AT_decl_line
@@ -7980,64 +4602,11 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 25                               // Abbrev [25] 0x24fc:0x4a DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK:  / .b8 90
-; CHECK: // .b8 78
-; CHECK: // .b8 75
-; CHECK: // .b8 50
-; CHECK: // .b8 54
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 104
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 97
-; CHECK: // .b8 100
-; CHECK:  / .b8 73
-; CHECK: // .b8 100
-; CHECK: // .b8 120
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 99
-; CHECK: // .b8 118
-; CHECK: // .b8 53
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
-; CHECK: // .b8 51
-; CHECK: // .b8 69
-; CHECK: // .b8 118
+; CHECK: // .b8 95,90,78,75,50,54,95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,116,104,114,101,97,100,73,100,120,95,116,99,118,53,117,105,110,116,51 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 69,118
+; CHECK: // .b8 0
+; CHECK: // .b8 111,112,101,114,97,116,111,114,32,117,105,110,116,51 // DW_AT_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 111                              // DW_AT_name
-; CHECK: // .b8 112
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 114
-; CHECK: // .b8 32
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
-; CHECK: // .b8 51
-; CHECK:  / .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 72                               // DW_AT_decl_line
 ; CHECK: // .b32 8360                            // DW_AT_type
@@ -8048,32 +4617,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_artificial
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 30                               // Abbrev [30] 0x2546:0x28 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_name
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 104
-; CHECK:  / .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 97
-; CHECK: // .b8 100
-; CHECK: // .b8 73
-; CHECK: // .b8 100
-; CHECK: // .b8 120
-; CHECK: // .b8 95
-; CHECK: // .b8 116
+; CHECK: // .b8 95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,116,104,114,101,97,100,73,100,120,95,116 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 74                               // DW_AT_decl_line
@@ -8086,32 +4630,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_artificial
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 30                               // Abbrev [30] 0x256e:0x2d DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_name
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 104
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 97
-; CHECK: // .b8 100
-; CHECK: // .b8 73
-; CHECK: // .b8 100
-; CHECK: // .b8 120
-; CHECK: // .b8 95
-; CHECK: // .b8 116
+; CHECK: // .b8 95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,116,104,114,101,97,100,73,100,120,95,116 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 74                               // DW_AT_decl_line
@@ -8126,55 +4645,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 9775                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 31                               // Abbrev [31] 0x259b:0x44 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 78
-; CHECK: // .b8 75
-; CHECK: // .b8 50
-; CHECK: // .b8 54
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 104
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 97
-; CHECK: // .b8 100
-; CHECK: // .b8 73
-; CHECK: // .b8 100
-; CHECK: // .b8 120
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 83
-; CHECK: // .b8 69
-; CHECK: // .b8 82
-; CHECK: // .b8 75
-; CHECK: // .b8 83
-; CHECK: // .b8 95
-; CHECK: // .b8 0
-; CHECK: // .b8 111                              // DW_AT_name
-; CHECK: // .b8 112
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 114
-; CHECK: // .b8 61
+; CHECK: // .b8 95,90,78,75,50,54,95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,116,104,114,101,97,100,73,100,120,95,116,97,83,69,82,75,83,95 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 111,112,101,114,97,116,111,114,61 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 74                               // DW_AT_decl_line
@@ -8189,52 +4662,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 9775                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 32                               // Abbrev [32] 0x25df:0x40 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 78
-; CHECK: // .b8 75
-; CHECK: // .b8 50
-; CHECK: // .b8 54
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 104
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 97
-; CHECK: // .b8 100
-; CHECK: // .b8 73
-; CHECK: // .b8 100
-; CHECK: // .b8 120
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 100
-; CHECK: // .b8 69
-; CHECK: // .b8 118
+; CHECK: // .b8 95,90,78,75,50,54,95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,116,104,114,101,97,100,73,100,120,95,116,97,100,69,118 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 111                              // DW_AT_name
-; CHECK: // .b8 112
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 114
-; CHECK: // .b8 38
+; CHECK: // .b8 111,112,101,114,97,116,111,114,38 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 74                               // DW_AT_decl_line
@@ -8262,20 +4692,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 9228                            // DW_AT_specification
 ; CHECK: // .b8 1                                // DW_AT_inline
 ; CHECK: // .b8 38                               // Abbrev [38] 0x263f:0x32 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 51
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 115
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 80
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 114                              // DW_AT_name
-; CHECK: // .b8 101
-; CHECK: // .b8 115
+; CHECK: // .b8 95,90,51,114,101,115,102,102,80,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 114,101,115                      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 12                               // DW_AT_decl_file
 ; CHECK: // .b8 3                                // DW_AT_decl_line
@@ -8294,9 +4713,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 3                                // DW_AT_decl_line
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 39                               // Abbrev [39] 0x2665:0xb DW_TAG_formal_parameter
-; CHECK: // .b8 114                              // DW_AT_name
-; CHECK: // .b8 101
-; CHECK: // .b8 115
+; CHECK: // .b8 114,101,115                      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 12                               // DW_AT_decl_file
 ; CHECK: // .b8 3                                // DW_AT_decl_line
@@ -8307,26 +4724,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b64 Lfunc_end0                      // DW_AT_high_pc
 ; CHECK: // .b8 1                                // DW_AT_frame_base
 ; CHECK: // .b8 156
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 53
-; CHECK: // .b8 115
-; CHECK: // .b8 97
-; CHECK: // .b8 120
-; CHECK: // .b8 112
-; CHECK: // .b8 121
-; CHECK: // .b8 105
-; CHECK: // .b8 102
-; CHECK: // .b8 80
-; CHECK: // .b8 102
-; CHECK: // .b8 83
-; CHECK: // .b8 95
-; CHECK: // .b8 0
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 97
-; CHECK: // .b8 120
-; CHECK: // .b8 112
-; CHECK: // .b8 121
+; CHECK: // .b8 95,90,53,115,97,120,112,121,105,102,80,102,83,95 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 115,97,120,112,121               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 12                               // DW_AT_decl_file
 ; CHECK: // .b8 5                                // DW_AT_decl_line
diff --git a/test/DebugInfo/NVPTX/debug-loc-offset.ll b/test/DebugInfo/NVPTX/debug-loc-offset.ll
index 53c5fd9dff1..91926517bbc 100644
--- a/test/DebugInfo/NVPTX/debug-loc-offset.ll
+++ b/test/DebugInfo/NVPTX/debug-loc-offset.ll
@@ -166,8 +166,7 @@ attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n
 ; CHECK: // .b8 1                                // DW_FORM_addr
 ; CHECK: // .b8 64                               // DW_AT_frame_base
 ; CHECK: // .b8 10                               // DW_FORM_block1
-; CHECK: // .b8 135                              // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 64
+; CHECK: // .b8 135,64                           // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 8                                // DW_FORM_string
 ; CHECK: // .b8 3                                // DW_AT_name
 ; CHECK: // .b8 8                                // DW_FORM_string
@@ -201,8 +200,7 @@ attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n
 ; CHECK: // .b8 1                                // DW_FORM_addr
 ; CHECK: // .b8 64                               // DW_AT_frame_base
 ; CHECK: // .b8 10                               // DW_FORM_block1
-; CHECK: // .b8 135                              // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 64
+; CHECK: // .b8 135,64                           // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 8                                // DW_FORM_string
 ; CHECK: // .b8 3                                // DW_AT_name
 ; CHECK: // .b8 8                                // DW_FORM_string
@@ -250,74 +248,14 @@ attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n
 ; CHECK: // .b32 .debug_abbrev                   // Offset Into Abbrev. Section
 ; CHECK: // .b8 8                                // Address Size (in bytes)
 ; CHECK: // .b8 1                                // Abbrev [1] 0xb:0x8f DW_TAG_compile_unit
-; CHECK: // .b8 99                               // DW_AT_producer
-; CHECK: // .b8 108
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 103
-; CHECK: // .b8 32
-; CHECK: // .b8 118
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 111
-; CHECK: // .b8 110
-; CHECK: // .b8 32
-; CHECK: // .b8 51
-; CHECK: // .b8 46
-; CHECK: // .b8 53
-; CHECK: // .b8 46
-; CHECK: // .b8 48
-; CHECK: // .b8 32
-; CHECK: // .b8 40
-; CHECK: // .b8 50
-; CHECK: // .b8 49
-; CHECK: // .b8 48
-; CHECK: // .b8 52
-; CHECK: // .b8 55
-; CHECK: // .b8 57
-; CHECK: // .b8 41
+; CHECK: // .b8 99,108,97,110,103,32,118,101,114,115,105,111,110,32,51,46,53,46,48,32,40,50,49,48,52,55,57,41 // DW_AT_producer
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_language
 ; CHECK: // .b8 0
-; CHECK: // .b8 100                              // DW_AT_name
-; CHECK: // .b8 101
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 103
-; CHECK: // .b8 45
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
-; CHECK: // .b8 45
-; CHECK: // .b8 111
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 115
-; CHECK: // .b8 101
-; CHECK: // .b8 116
-; CHECK: // .b8 50
-; CHECK: // .b8 46
-; CHECK: // .b8 99
-; CHECK: // .b8 99
+; CHECK: // .b8 100,101,98,117,103,45,108,111,99,45,111,102,102,115,101,116,50,46,99,99 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b32 .debug_line                     // DW_AT_stmt_list
-; CHECK: // .b8 47                               // DW_AT_comp_dir
-; CHECK: // .b8 108
-; CHECK: // .b8 108
-; CHECK: // .b8 118
-; CHECK: // .b8 109
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 109
-; CHECK: // .b8 97
-; CHECK: // .b8 107
-; CHECK: // .b8 101
-; CHECK: // .b8 95
-; CHECK: // .b8 103
-; CHECK: // .b8 99
-; CHECK: // .b8 99
+; CHECK: // .b8 47,108,108,118,109,95,99,109,97,107,101,95,103,99,99 // DW_AT_comp_dir
 ; CHECK: // .b8 0
 ; CHECK: // .b64 Lfunc_begin1                    // DW_AT_low_pc
 ; CHECK: // .b64 Lfunc_end1                      // DW_AT_high_pc
@@ -330,18 +268,9 @@ attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n
 ; CHECK: // .b64 Lfunc_end1                      // DW_AT_high_pc
 ; CHECK: // .b8 1                                // DW_AT_frame_base
 ; CHECK: // .b8 156
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 51
-; CHECK: // .b8 98
-; CHECK: // .b8 97
-; CHECK: // .b8 122
-; CHECK: // .b8 49
-; CHECK: // .b8 65
+; CHECK: // .b8 95,90,51,98,97,122,49,65         // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 98                               // DW_AT_name
-; CHECK: // .b8 97
-; CHECK: // .b8 122
+; CHECK: // .b8 98,97,122                        // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 6                                // DW_AT_decl_line
@@ -360,74 +289,14 @@ attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n
 ; CHECK: // .b32 .debug_abbrev                   // Offset Into Abbrev. Section
 ; CHECK: // .b8 8                                // Address Size (in bytes)
 ; CHECK: // .b8 1                                // Abbrev [1] 0xb:0x91 DW_TAG_compile_unit
-; CHECK: // .b8 99                               // DW_AT_producer
-; CHECK: // .b8 108
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 103
-; CHECK: // .b8 32
-; CHECK: // .b8 118
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 111
-; CHECK: // .b8 110
-; CHECK: // .b8 32
-; CHECK: // .b8 51
-; CHECK: // .b8 46
-; CHECK: // .b8 53
-; CHECK: // .b8 46
-; CHECK: // .b8 48
-; CHECK: // .b8 32
-; CHECK: // .b8 40
-; CHECK: // .b8 50
-; CHECK: // .b8 49
-; CHECK: // .b8 48
-; CHECK: // .b8 52
-; CHECK: // .b8 55
-; CHECK: // .b8 57
-; CHECK: // .b8 41
+; CHECK: // .b8 99,108,97,110,103,32,118,101,114,115,105,111,110,32,51,46,53,46,48,32,40,50,49,48,52,55,57,41 // DW_AT_producer
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_language
 ; CHECK: // .b8 0
-; CHECK: // .b8 100                              // DW_AT_name
-; CHECK: // .b8 101
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 103
-; CHECK: // .b8 45
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
-; CHECK: // .b8 45
-; CHECK: // .b8 111
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 115
-; CHECK: // .b8 101
-; CHECK: // .b8 116
-; CHECK: // .b8 49
-; CHECK: // .b8 46
-; CHECK: // .b8 99
-; CHECK: // .b8 99
+; CHECK: // .b8 100,101,98,117,103,45,108,111,99,45,111,102,102,115,101,116,49,46,99,99 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b32 .debug_line                     // DW_AT_stmt_list
-; CHECK: // .b8 47                               // DW_AT_comp_dir
-; CHECK: // .b8 108
-; CHECK: // .b8 108
-; CHECK: // .b8 118
-; CHECK: // .b8 109
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 109
-; CHECK: // .b8 97
-; CHECK: // .b8 107
-; CHECK: // .b8 101
-; CHECK: // .b8 95
-; CHECK: // .b8 103
-; CHECK: // .b8 99
-; CHECK: // .b8 99
+; CHECK: // .b8 47,108,108,118,109,95,99,109,97,107,101,95,103,99,99 // DW_AT_comp_dir
 ; CHECK: // .b8 0
 ; CHECK: // .b64 Lfunc_begin0                    // DW_AT_low_pc
 ; CHECK: // .b64 Lfunc_end0                      // DW_AT_high_pc
@@ -436,17 +305,9 @@ attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n
 ; CHECK: // .b64 Lfunc_end0                      // DW_AT_high_pc
 ; CHECK: // .b8 1                                // DW_AT_frame_base
 ; CHECK: // .b8 156
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 51
-; CHECK: // .b8 98
-; CHECK: // .b8 97
-; CHECK: // .b8 114
-; CHECK: // .b8 105
+; CHECK: // .b8 95,90,51,98,97,114,105           // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 98                               // DW_AT_name
-; CHECK: // .b8 97
-; CHECK: // .b8 114
+; CHECK: // .b8 98,97,114                        // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 1                                // DW_AT_decl_line
@@ -460,9 +321,7 @@ attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n
 ; CHECK: // .b32 148                             // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 7                                // Abbrev [7] 0x94:0x7 DW_TAG_base_type
-; CHECK: // .b8 105                              // DW_AT_name
-; CHECK: // .b8 110
-; CHECK: // .b8 116
+; CHECK: // .b8 105,110,116                      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 5                                // DW_AT_encoding
 ; CHECK: // .b8 4                                // DW_AT_byte_size
-- 
GitLab


From 31fda7550e7e17fbd8c765d205e55b5562c6469c Mon Sep 17 00:00:00 2001
From: Cameron McInally <cameron.mcinally@nyu.edu>
Date: Wed, 24 Oct 2018 14:45:18 +0000
Subject: [PATCH 0501/1116] [FPEnv] Convert more BinaryOperator::isFNeg(...) to
 m_FNeg(...)

This work is to avoid regressions when we seperate FNeg from the FSub IR instruction.

Differential Revision: https://reviews.llvm.org/D53205


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345146 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/IR/PatternMatch.h         | 29 ++++++++++++++++++++++----
 lib/Transforms/Scalar/Reassociate.cpp  | 17 +++++++--------
 test/Transforms/Reassociate/fp-expr.ll |  4 ++--
 3 files changed, 34 insertions(+), 16 deletions(-)

diff --git a/include/llvm/IR/PatternMatch.h b/include/llvm/IR/PatternMatch.h
index 7c058342265..dd30072ce57 100644
--- a/include/llvm/IR/PatternMatch.h
+++ b/include/llvm/IR/PatternMatch.h
@@ -659,11 +659,32 @@ inline BinaryOp_match<LHS, RHS, Instruction::FSub> m_FSub(const LHS &L,
   return BinaryOp_match<LHS, RHS, Instruction::FSub>(L, R);
 }
 
+template <typename Op_t> struct FNeg_match {
+  Op_t X;
+
+  FNeg_match(const Op_t &Op) : X(Op) {}
+  template <typename OpTy> bool match(OpTy *V) {
+    auto *FPMO = dyn_cast<FPMathOperator>(V);
+    if (!FPMO || FPMO->getOpcode() != Instruction::FSub)
+      return false;
+    if (FPMO->hasNoSignedZeros()) {
+      // With 'nsz', any zero goes.
+      if (!cstfp_pred_ty<is_any_zero_fp>().match(FPMO->getOperand(0)))
+        return false;
+    } else {
+      // Without 'nsz', we need fsub -0.0, X exactly.
+      if (!cstfp_pred_ty<is_neg_zero_fp>().match(FPMO->getOperand(0)))
+        return false;
+    }
+    return X.match(FPMO->getOperand(1));
+  }
+};
+
 /// Match 'fneg X' as 'fsub -0.0, X'.
-template <typename RHS>
-inline BinaryOp_match<cstfp_pred_ty<is_neg_zero_fp>, RHS, Instruction::FSub>
-m_FNeg(const RHS &X) {
-  return m_FSub(m_NegZeroFP(), X);
+template <typename OpTy>
+inline FNeg_match<OpTy>
+m_FNeg(const OpTy &X) {
+  return FNeg_match<OpTy>(X);
 }
 
 /// Match 'fneg X' as 'fsub +-0.0, X'.
diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp
index c4faab7c58c..61b6d7ca259 100644
--- a/lib/Transforms/Scalar/Reassociate.cpp
+++ b/lib/Transforms/Scalar/Reassociate.cpp
@@ -208,7 +208,7 @@ unsigned ReassociatePass::getRank(Value *V) {
   // If this is a 'not' or 'neg' instruction, do not count it for rank. This
   // assures us that X and ~X will have the same rank.
   if (!match(I, m_Not(m_Value())) && !match(I, m_Neg(m_Value())) &&
-      !BinaryOperator::isFNeg(I))
+      !match(I, m_FNeg(m_Value())))
     ++Rank;
 
   LLVM_DEBUG(dbgs() << "Calculated Rank[" << V->getName() << "] = " << Rank
@@ -575,7 +575,7 @@ static bool LinearizeExprTree(BinaryOperator *I,
       // multiplies by -1 so they can be reassociated.
       if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Op))
         if ((Opcode == Instruction::Mul && match(BO, m_Neg(m_Value()))) ||
-            (Opcode == Instruction::FMul && BinaryOperator::isFNeg(BO))) {
+            (Opcode == Instruction::FMul && match(BO, m_FNeg(m_Value())))) {
           LLVM_DEBUG(dbgs()
                      << "MORPH LEAF: " << *Op << " (" << Weight << ") TO ");
           BO = LowerNegateToMultiply(BO);
@@ -855,7 +855,7 @@ static Value *NegateValue(Value *V, Instruction *BI,
   // Okay, we need to materialize a negated version of V with an instruction.
   // Scan the use lists of V to see if we have one already.
   for (User *U : V->users()) {
-    if (!match(U, m_Neg(m_Value())) && !BinaryOperator::isFNeg(U))
+    if (!match(U, m_Neg(m_Value())) && !match(U, m_FNeg(m_Value())))
       continue;
 
     // We found one!  Now we have to make sure that the definition dominates
@@ -900,7 +900,7 @@ static Value *NegateValue(Value *V, Instruction *BI,
 /// Return true if we should break up this subtract of X-Y into (X + -Y).
 static bool ShouldBreakUpSubtract(Instruction *Sub) {
   // If this is a negation, we can't split it up!
-  if (match(Sub, m_Neg(m_Value())) || BinaryOperator::isFNeg(Sub))
+  if (match(Sub, m_Neg(m_Value())) || match(Sub, m_FNeg(m_Value()))) 
     return false;
 
   // Don't breakup X - undef.
@@ -1463,19 +1463,16 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I,
     // Check for X and -X or X and ~X in the operand list.
     Value *X;
     if (!match(TheOp, m_Neg(m_Value(X))) && !match(TheOp, m_Not(m_Value(X))) &&
-        !BinaryOperator::isFNeg(TheOp))
+        !match(TheOp, m_FNeg(m_Value(X))))
       continue;
 
-    if (BinaryOperator::isFNeg(TheOp))
-      X = BinaryOperator::getFNegArgument(TheOp);
-
     unsigned FoundX = FindInOperandList(Ops, i, X);
     if (FoundX == i)
       continue;
 
     // Remove X and -X from the operand list.
     if (Ops.size() == 2 &&
-        (match(TheOp, m_Neg(m_Value())) || BinaryOperator::isFNeg(TheOp)))
+        (match(TheOp, m_Neg(m_Value())) || match(TheOp, m_FNeg(m_Value()))))
       return Constant::getNullValue(X->getType());
 
     // Remove X and ~X from the operand list.
@@ -2081,7 +2078,7 @@ void ReassociatePass::OptimizeInst(Instruction *I) {
       RedoInsts.insert(I);
       MadeChange = true;
       I = NI;
-    } else if (BinaryOperator::isFNeg(I)) {
+    } else if (match(I, m_FNeg(m_Value()))) {
       // Otherwise, this is a negation.  See if the operand is a multiply tree
       // and if this is not an inner node of a multiply tree.
       if (isReassociableOp(I->getOperand(1), Instruction::FMul) &&
diff --git a/test/Transforms/Reassociate/fp-expr.ll b/test/Transforms/Reassociate/fp-expr.ll
index e616c52f28e..dcbf835ba54 100644
--- a/test/Transforms/Reassociate/fp-expr.ll
+++ b/test/Transforms/Reassociate/fp-expr.ll
@@ -4,8 +4,8 @@
 define void @test1() {
 ; CHECK-LABEL: @test1(
 ; CHECK-NEXT:    [[T1:%.*]] = tail call <4 x float> @blam()
-; CHECK-NEXT:    [[T1_NEG:%.*]] = fsub fast <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[T1]]
-; CHECK-NEXT:    [[T24:%.*]] = fadd fast <4 x float> [[T1_NEG]], undef
+; CHECK-NEXT:    [[T23:%.*]] = fsub fast <4 x float> undef, [[T1]]
+; CHECK-NEXT:    [[T24:%.*]] = fadd fast <4 x float> [[T23]], undef
 ; CHECK-NEXT:    tail call void @wombat(<4 x float> [[T24]])
 ; CHECK-NEXT:    ret void
 ;
-- 
GitLab


From a9c48593d9914b9b2d118eb359c2d63e26f44459 Mon Sep 17 00:00:00 2001
From: Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>
Date: Wed, 24 Oct 2018 15:06:27 +0000
Subject: [PATCH 0502/1116] [llvm-mca] Refactor class SourceMgr. NFCI

Added begin()/end() methods to allow the usage of SourceMgr in foreach loops.
With this change, method getMCInstFromIndex() (as well as a couple of other
methods) are now redundant, and can be removed from the public interface.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345147 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-mca/Views/InstructionInfoView.cpp  |  4 +-
 tools/llvm-mca/Views/ResourcePressureView.cpp | 10 ++--
 tools/llvm-mca/Views/TimelineView.cpp         | 51 ++++++++++---------
 tools/llvm-mca/include/SourceMgr.h            | 22 +++-----
 4 files changed, 43 insertions(+), 44 deletions(-)

diff --git a/tools/llvm-mca/Views/InstructionInfoView.cpp b/tools/llvm-mca/Views/InstructionInfoView.cpp
index a2e3001383a..0a97e569c47 100644
--- a/tools/llvm-mca/Views/InstructionInfoView.cpp
+++ b/tools/llvm-mca/Views/InstructionInfoView.cpp
@@ -22,7 +22,6 @@ void InstructionInfoView::printView(raw_ostream &OS) const {
   std::string Buffer;
   raw_string_ostream TempStream(Buffer);
   const MCSchedModel &SM = STI.getSchedModel();
-  unsigned Instructions = Source.size();
 
   std::string Instruction;
   raw_string_ostream InstrStream(Instruction);
@@ -32,8 +31,7 @@ void InstructionInfoView::printView(raw_ostream &OS) const {
              << "[4]: MayLoad\n[5]: MayStore\n[6]: HasSideEffects (U)\n\n";
 
   TempStream << "[1]    [2]    [3]    [4]    [5]    [6]    Instructions:\n";
-  for (unsigned I = 0, E = Instructions; I < E; ++I) {
-    const MCInst &Inst = Source.getMCInstFromIndex(I);
+  for (const MCInst &Inst : Source) {
     const MCInstrDesc &MCDesc = MCII.get(Inst.getOpcode());
 
     // Obtain the scheduling class information from the instruction.
diff --git a/tools/llvm-mca/Views/ResourcePressureView.cpp b/tools/llvm-mca/Views/ResourcePressureView.cpp
index bba1e70bc26..17c801259d9 100644
--- a/tools/llvm-mca/Views/ResourcePressureView.cpp
+++ b/tools/llvm-mca/Views/ResourcePressureView.cpp
@@ -148,13 +148,15 @@ void ResourcePressureView::printResourcePressurePerInstruction(
   std::string Instruction;
   raw_string_ostream InstrStream(Instruction);
 
-  for (unsigned I = 0, E = Source.size(); I < E; ++I) {
+  unsigned InstrIndex = 0;
+  for (const MCInst &MCI : Source) {
+    unsigned BaseEltIdx = InstrIndex * NumResourceUnits;
     for (unsigned J = 0; J < NumResourceUnits; ++J) {
-      double Usage = ResourceUsage[J + I * NumResourceUnits];
+      double Usage = ResourceUsage[J + BaseEltIdx];
       printResourcePressure(FOS, Usage / Executions, (J + 1) * 7);
     }
 
-    MCIP.printInst(&Source.getMCInstFromIndex(I), InstrStream, "", STI);
+    MCIP.printInst(&MCI, InstrStream, "", STI);
     InstrStream.flush();
     StringRef Str(Instruction);
 
@@ -167,6 +169,8 @@ void ResourcePressureView::printResourcePressurePerInstruction(
     FOS.flush();
     OS << Buffer;
     Buffer = "";
+
+    ++InstrIndex;
   }
 }
 } // namespace mca
diff --git a/tools/llvm-mca/Views/TimelineView.cpp b/tools/llvm-mca/Views/TimelineView.cpp
index 1ad7271b2a4..d802d42352d 100644
--- a/tools/llvm-mca/Views/TimelineView.cpp
+++ b/tools/llvm-mca/Views/TimelineView.cpp
@@ -177,11 +177,10 @@ void TimelineView::printAverageWaitTimes(raw_ostream &OS) const {
 
   formatted_raw_ostream FOS(OS);
   unsigned Executions = Timeline.size() / AsmSequence.size();
-  for (unsigned I = 0, E = WaitTime.size(); I < E; ++I) {
-    printWaitTimeEntry(FOS, WaitTime[I], I, Executions);
+  unsigned IID = 0;
+  for (const MCInst &Inst : AsmSequence) {
+    printWaitTimeEntry(FOS, WaitTime[IID], IID, Executions);
     // Append the instruction info at the end of the line.
-    const MCInst &Inst = AsmSequence.getMCInstFromIndex(I);
-
     MCIP.printInst(&Inst, InstrStream, "", STI);
     InstrStream.flush();
 
@@ -191,6 +190,8 @@ void TimelineView::printAverageWaitTimes(raw_ostream &OS) const {
     FOS << "   " << Str << '\n';
     FOS.flush();
     Instruction = "";
+
+    ++IID;
   }
 }
 
@@ -266,25 +267,29 @@ void TimelineView::printTimeline(raw_ostream &OS) const {
   std::string Instruction;
   raw_string_ostream InstrStream(Instruction);
 
-  for (unsigned I = 0, E = Timeline.size(); I < E; ++I) {
-    const TimelineViewEntry &Entry = Timeline[I];
-    if (Entry.CycleRetired == 0)
-      return;
-
-    unsigned Iteration = I / AsmSequence.size();
-    unsigned SourceIndex = I % AsmSequence.size();
-    printTimelineViewEntry(FOS, Entry, Iteration, SourceIndex);
-    // Append the instruction info at the end of the line.
-    const MCInst &Inst = AsmSequence.getMCInstFromIndex(I);
-    MCIP.printInst(&Inst, InstrStream, "", STI);
-    InstrStream.flush();
-
-    // Consume any tabs or spaces at the beginning of the string.
-    StringRef Str(Instruction);
-    Str = Str.ltrim();
-    FOS << "   " << Str << '\n';
-    FOS.flush();
-    Instruction = "";
+  unsigned IID = 0;
+  const unsigned Iterations = Timeline.size() / AsmSequence.size();
+  for (unsigned Iteration = 0; Iteration < Iterations; ++Iteration) {
+    for (const MCInst &Inst : AsmSequence) {
+      const TimelineViewEntry &Entry = Timeline[IID];
+      if (Entry.CycleRetired == 0)
+        return;
+
+      unsigned SourceIndex = IID % AsmSequence.size();
+      printTimelineViewEntry(FOS, Entry, Iteration, SourceIndex);
+      // Append the instruction info at the end of the line.
+      MCIP.printInst(&Inst, InstrStream, "", STI);
+      InstrStream.flush();
+
+      // Consume any tabs or spaces at the beginning of the string.
+      StringRef Str(Instruction);
+      Str = Str.ltrim();
+      FOS << "   " << Str << '\n';
+      FOS.flush();
+      Instruction = "";
+
+      ++IID;
+    }
   }
 }
 } // namespace mca
diff --git a/tools/llvm-mca/include/SourceMgr.h b/tools/llvm-mca/include/SourceMgr.h
index 89412836360..e7cd358afd4 100644
--- a/tools/llvm-mca/include/SourceMgr.h
+++ b/tools/llvm-mca/include/SourceMgr.h
@@ -27,7 +27,7 @@ typedef std::pair<unsigned, const llvm::MCInst &> SourceRef;
 class SourceMgr {
   llvm::ArrayRef<llvm::MCInst> Sequence;
   unsigned Current;
-  unsigned Iterations;
+  const unsigned Iterations;
   static const unsigned DefaultIterations = 100;
 
 public:
@@ -35,27 +35,19 @@ public:
       : Sequence(MCInstSequence), Current(0),
         Iterations(NumIterations ? NumIterations : DefaultIterations) {}
 
-  unsigned getCurrentIteration() const { return Current / Sequence.size(); }
   unsigned getNumIterations() const { return Iterations; }
   unsigned size() const { return Sequence.size(); }
-  llvm::ArrayRef<llvm::MCInst> getSequence() const { return Sequence; }
-
-  bool hasNext() const { return Current < (Iterations * size()); }
-  void updateNext() { Current++; }
+  bool hasNext() const { return Current < (Iterations * Sequence.size()); }
+  void updateNext() { ++Current; }
 
   const SourceRef peekNext() const {
     assert(hasNext() && "Already at end of sequence!");
-    unsigned Index = getCurrentInstructionIndex();
-    return SourceRef(Current, Sequence[Index]);
-  }
-
-  unsigned getCurrentInstructionIndex() const {
-    return Current % Sequence.size();
+    return SourceRef(Current, Sequence[Current % Sequence.size()]);
   }
 
-  const llvm::MCInst &getMCInstFromIndex(unsigned Index) const {
-    return Sequence[Index % size()];
-  }
+  using const_iterator = llvm::ArrayRef<llvm::MCInst>::const_iterator;
+  const_iterator begin() const { return Sequence.begin(); }
+  const_iterator end() const { return Sequence.end(); }
 
   bool isEmpty() const { return size() == 0; }
 };
-- 
GitLab


From c26bd2324dd6b4decb25679ff3c553ff0d71136d Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Wed, 24 Oct 2018 15:17:56 +0000
Subject: [PATCH 0503/1116] [InstCombine] try harder to form select from logic
 ops (2nd try)

The original patch was committed here:
rL344609
...and reverted:
rL344612
...because it did not properly check/test data types before calling
ComputeNumSignBits().

The tests that caused bot failures for the previous commit are
over-reaching front-end tests that run the entire -O optimizer
pipeline:
    Clang :: CodeGen/builtins-systemz-zvector.c
    Clang :: CodeGen/builtins-systemz-zvector2.c

I've added a negative test here to ensure coverage for that case.
The new early exit check also tests the type of the 'B' parameter,
so we don't waste time on matching if either value is unsuitable.

Original commit message:

This is part of solving PR37549:
https://bugs.llvm.org/show_bug.cgi?id=37549

The patterns shown here are a special case of something
that we already convert to select. Using ComputeNumSignBits()
catches that case (but not the more complicated motivating
patterns yet).

The backend has hooks/logic to convert back to logic ops
if that's better for the target.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345149 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstCombine/InstCombineAndOrXor.cpp       | 72 +++++++++++--------
 .../InstCombine/InstCombineInternal.h         |  3 +
 test/Transforms/InstCombine/logical-select.ll | 41 +++++++----
 test/Transforms/InstCombine/vec_sext.ll       | 18 ++---
 4 files changed, 80 insertions(+), 54 deletions(-)

diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index a6280ec95a9..7e7a515bfc8 100644
--- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -1831,14 +1831,33 @@ static bool areInverseVectorBitmasks(Constant *C1, Constant *C2) {
 /// We have an expression of the form (A & C) | (B & D). If A is a scalar or
 /// vector composed of all-zeros or all-ones values and is the bitwise 'not' of
 /// B, it can be used as the condition operand of a select instruction.
-static Value *getSelectCondition(Value *A, Value *B,
-                                 InstCombiner::BuilderTy &Builder) {
-  // If these are scalars or vectors of i1, A can be used directly.
+Value *InstCombiner::getSelectCondition(Value *A, Value *B) {
+  // Step 1: We may have peeked through bitcasts in the caller.
+  // Exit immediately if we don't have (vector) integer types.
   Type *Ty = A->getType();
-  if (match(A, m_Not(m_Specific(B))) && Ty->isIntOrIntVectorTy(1))
-    return A;
+  if (!Ty->isIntOrIntVectorTy() || !B->getType()->isIntOrIntVectorTy())
+    return nullptr;
+
+  // Step 2: We need 0 or all-1's bitmasks.
+  if (ComputeNumSignBits(A) != Ty->getScalarSizeInBits())
+    return nullptr;
 
-  // If A and B are sign-extended, look through the sexts to find the booleans.
+  // Step 3: If B is the 'not' value of A, we have our answer.
+  if (match(A, m_Not(m_Specific(B)))) {
+    // If these are scalars or vectors of i1, A can be used directly.
+    if (Ty->isIntOrIntVectorTy(1))
+      return A;
+    return Builder.CreateTrunc(A, CmpInst::makeCmpResultType(Ty));
+  }
+
+  // If both operands are constants, see if the constants are inverse bitmasks.
+  Constant *AConst, *BConst;
+  if (match(A, m_Constant(AConst)) && match(B, m_Constant(BConst)))
+    if (AConst == ConstantExpr::getNot(BConst))
+      return Builder.CreateZExtOrTrunc(A, CmpInst::makeCmpResultType(Ty));
+
+  // Look for more complex patterns. The 'not' op may be hidden behind various
+  // casts. Look through sexts and bitcasts to find the booleans.
   Value *Cond;
   Value *NotB;
   if (match(A, m_SExt(m_Value(Cond))) &&
@@ -1854,36 +1873,29 @@ static Value *getSelectCondition(Value *A, Value *B,
   if (!Ty->isVectorTy())
     return nullptr;
 
-  // If both operands are constants, see if the constants are inverse bitmasks.
-  Constant *AC, *BC;
-  if (match(A, m_Constant(AC)) && match(B, m_Constant(BC)) &&
-      areInverseVectorBitmasks(AC, BC)) {
-    return Builder.CreateZExtOrTrunc(AC, CmpInst::makeCmpResultType(Ty));
-  }
-
   // If both operands are xor'd with constants using the same sexted boolean
   // operand, see if the constants are inverse bitmasks.
-  if (match(A, (m_Xor(m_SExt(m_Value(Cond)), m_Constant(AC)))) &&
-      match(B, (m_Xor(m_SExt(m_Specific(Cond)), m_Constant(BC)))) &&
+  // TODO: Use ConstantExpr::getNot()?
+  if (match(A, (m_Xor(m_SExt(m_Value(Cond)), m_Constant(AConst)))) &&
+      match(B, (m_Xor(m_SExt(m_Specific(Cond)), m_Constant(BConst)))) &&
       Cond->getType()->isIntOrIntVectorTy(1) &&
-      areInverseVectorBitmasks(AC, BC)) {
-    AC = ConstantExpr::getTrunc(AC, CmpInst::makeCmpResultType(Ty));
-    return Builder.CreateXor(Cond, AC);
+      areInverseVectorBitmasks(AConst, BConst)) {
+    AConst = ConstantExpr::getTrunc(AConst, CmpInst::makeCmpResultType(Ty));
+    return Builder.CreateXor(Cond, AConst);
   }
   return nullptr;
 }
 
 /// We have an expression of the form (A & C) | (B & D). Try to simplify this
 /// to "A' ? C : D", where A' is a boolean or vector of booleans.
-static Value *matchSelectFromAndOr(Value *A, Value *C, Value *B, Value *D,
-                                   InstCombiner::BuilderTy &Builder) {
+Value *InstCombiner::matchSelectFromAndOr(Value *A, Value *C, Value *B,
+                                          Value *D) {
   // The potential condition of the select may be bitcasted. In that case, look
   // through its bitcast and the corresponding bitcast of the 'not' condition.
   Type *OrigType = A->getType();
   A = peekThroughBitcast(A, true);
   B = peekThroughBitcast(B, true);
-
-  if (Value *Cond = getSelectCondition(A, B, Builder)) {
+  if (Value *Cond = getSelectCondition(A, B)) {
     // ((bc Cond) & C) | ((bc ~Cond) & D) --> bc (select Cond, (bc C), (bc D))
     // The bitcasts will either all exist or all not exist. The builder will
     // not create unnecessary casts if the types already match.
@@ -2234,21 +2246,21 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
     // 'or' that it is replacing.
     if (Op0->hasOneUse() || Op1->hasOneUse()) {
       // (Cond & C) | (~Cond & D) -> Cond ? C : D, and commuted variants.
-      if (Value *V = matchSelectFromAndOr(A, C, B, D, Builder))
+      if (Value *V = matchSelectFromAndOr(A, C, B, D))
         return replaceInstUsesWith(I, V);
-      if (Value *V = matchSelectFromAndOr(A, C, D, B, Builder))
+      if (Value *V = matchSelectFromAndOr(A, C, D, B))
         return replaceInstUsesWith(I, V);
-      if (Value *V = matchSelectFromAndOr(C, A, B, D, Builder))
+      if (Value *V = matchSelectFromAndOr(C, A, B, D))
         return replaceInstUsesWith(I, V);
-      if (Value *V = matchSelectFromAndOr(C, A, D, B, Builder))
+      if (Value *V = matchSelectFromAndOr(C, A, D, B))
         return replaceInstUsesWith(I, V);
-      if (Value *V = matchSelectFromAndOr(B, D, A, C, Builder))
+      if (Value *V = matchSelectFromAndOr(B, D, A, C))
         return replaceInstUsesWith(I, V);
-      if (Value *V = matchSelectFromAndOr(B, D, C, A, Builder))
+      if (Value *V = matchSelectFromAndOr(B, D, C, A))
         return replaceInstUsesWith(I, V);
-      if (Value *V = matchSelectFromAndOr(D, B, A, C, Builder))
+      if (Value *V = matchSelectFromAndOr(D, B, A, C))
         return replaceInstUsesWith(I, V);
-      if (Value *V = matchSelectFromAndOr(D, B, C, A, Builder))
+      if (Value *V = matchSelectFromAndOr(D, B, C, A))
         return replaceInstUsesWith(I, V);
     }
   }
diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h
index a4d7fe8861b..431856c9e00 100644
--- a/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -589,6 +589,9 @@ private:
 
   Value *foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS, ICmpInst *RHS,
                                        bool JoinedByAnd, Instruction &CxtI);
+  Value *matchSelectFromAndOr(Value *A, Value *B, Value *C, Value *D);
+  Value *getSelectCondition(Value *A, Value *B);
+
 public:
   /// Inserts an instruction \p New before instruction \p Old
   ///
diff --git a/test/Transforms/InstCombine/logical-select.ll b/test/Transforms/InstCombine/logical-select.ll
index 3ee0ba169b3..dc4c04b6991 100644
--- a/test/Transforms/InstCombine/logical-select.ll
+++ b/test/Transforms/InstCombine/logical-select.ll
@@ -535,12 +535,9 @@ define <4 x i32> @vec_sel_xor_multi_use(<4 x i32> %a, <4 x i32> %b, <4 x i1> %c)
 
 define i32 @allSignBits(i32 %cond, i32 %tval, i32 %fval) {
 ; CHECK-LABEL: @allSignBits(
-; CHECK-NEXT:    [[BITMASK:%.*]] = ashr i32 [[COND:%.*]], 31
-; CHECK-NEXT:    [[NOT_BITMASK:%.*]] = xor i32 [[BITMASK]], -1
-; CHECK-NEXT:    [[A1:%.*]] = and i32 [[BITMASK]], [[TVAL:%.*]]
-; CHECK-NEXT:    [[A2:%.*]] = and i32 [[NOT_BITMASK]], [[FVAL:%.*]]
-; CHECK-NEXT:    [[SEL:%.*]] = or i32 [[A1]], [[A2]]
-; CHECK-NEXT:    ret i32 [[SEL]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp slt i32 [[COND:%.*]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 [[TVAL:%.*]], i32 [[FVAL:%.*]]
+; CHECK-NEXT:    ret i32 [[TMP2]]
 ;
   %bitmask = ashr i32 %cond, 31
   %not_bitmask = xor i32 %bitmask, -1
@@ -552,12 +549,9 @@ define i32 @allSignBits(i32 %cond, i32 %tval, i32 %fval) {
 
 define <4 x i8> @allSignBits_vec(<4 x i8> %cond, <4 x i8> %tval, <4 x i8> %fval) {
 ; CHECK-LABEL: @allSignBits_vec(
-; CHECK-NEXT:    [[BITMASK:%.*]] = ashr <4 x i8> [[COND:%.*]], <i8 7, i8 7, i8 7, i8 7>
-; CHECK-NEXT:    [[NOT_BITMASK:%.*]] = xor <4 x i8> [[BITMASK]], <i8 -1, i8 -1, i8 -1, i8 -1>
-; CHECK-NEXT:    [[A1:%.*]] = and <4 x i8> [[BITMASK]], [[TVAL:%.*]]
-; CHECK-NEXT:    [[A2:%.*]] = and <4 x i8> [[NOT_BITMASK]], [[FVAL:%.*]]
-; CHECK-NEXT:    [[SEL:%.*]] = or <4 x i8> [[A2]], [[A1]]
-; CHECK-NEXT:    ret <4 x i8> [[SEL]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt <4 x i8> [[COND:%.*]], <i8 -1, i8 -1, i8 -1, i8 -1>
+; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i8> [[FVAL:%.*]], <4 x i8> [[TVAL:%.*]]
+; CHECK-NEXT:    ret <4 x i8> [[TMP2]]
 ;
   %bitmask = ashr <4 x i8> %cond, <i8 7, i8 7, i8 7, i8 7>
   %not_bitmask = xor <4 x i8> %bitmask, <i8 -1, i8 -1, i8 -1, i8 -1>
@@ -567,3 +561,26 @@ define <4 x i8> @allSignBits_vec(<4 x i8> %cond, <4 x i8> %tval, <4 x i8> %fval)
   ret <4 x i8> %sel
 }
 
+; Negative test - make sure that bitcasts from FP do not cause a crash.
+
+define <2 x i64> @fp_bitcast(<4 x i1> %cmp, <2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @fp_bitcast(
+; CHECK-NEXT:    [[SIA:%.*]] = fptosi <2 x double> [[A:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[SIB:%.*]] = fptosi <2 x double> [[B:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[BC1:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+; CHECK-NEXT:    [[AND1:%.*]] = and <2 x i64> [[SIA]], [[BC1]]
+; CHECK-NEXT:    [[BC2:%.*]] = bitcast <2 x double> [[B]] to <2 x i64>
+; CHECK-NEXT:    [[AND2:%.*]] = and <2 x i64> [[SIB]], [[BC2]]
+; CHECK-NEXT:    [[OR:%.*]] = or <2 x i64> [[AND2]], [[AND1]]
+; CHECK-NEXT:    ret <2 x i64> [[OR]]
+;
+  %sia = fptosi <2 x double> %a to <2 x i64>
+  %sib = fptosi <2 x double> %b to <2 x i64>
+  %bc1 = bitcast <2 x double> %a to <2 x i64>
+  %and1 = and <2 x i64> %sia, %bc1
+  %bc2 = bitcast <2 x double> %b to <2 x i64>
+  %and2 = and <2 x i64> %sib, %bc2
+  %or = or <2 x i64> %and2, %and1
+  ret <2 x i64> %or
+}
+
diff --git a/test/Transforms/InstCombine/vec_sext.ll b/test/Transforms/InstCombine/vec_sext.ll
index f244d49527b..39bd4087416 100644
--- a/test/Transforms/InstCombine/vec_sext.ll
+++ b/test/Transforms/InstCombine/vec_sext.ll
@@ -4,12 +4,9 @@
 define <4 x i32> @vec_select(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: @vec_select(
 ; CHECK-NEXT:    [[SUB:%.*]] = sub nsw <4 x i32> zeroinitializer, [[A:%.*]]
-; CHECK-NEXT:    [[B_LOBIT1:%.*]] = ashr <4 x i32> [[B:%.*]], <i32 31, i32 31, i32 31, i32 31>
-; CHECK-NEXT:    [[T1:%.*]] = xor <4 x i32> [[B_LOBIT1]], <i32 -1, i32 -1, i32 -1, i32 -1>
-; CHECK-NEXT:    [[T2:%.*]] = and <4 x i32> [[T1]], [[A]]
-; CHECK-NEXT:    [[T3:%.*]] = and <4 x i32> [[B_LOBIT1]], [[SUB]]
-; CHECK-NEXT:    [[COND:%.*]] = or <4 x i32> [[T2]], [[T3]]
-; CHECK-NEXT:    ret <4 x i32> [[COND]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt <4 x i32> [[B:%.*]], <i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[A]], <4 x i32> [[SUB]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 ;
   %cmp = icmp slt <4 x i32> %b, zeroinitializer
   %sext = sext <4 x i1> %cmp to <4 x i32>
@@ -26,12 +23,9 @@ define <4 x i32> @vec_select(<4 x i32> %a, <4 x i32> %b) {
 define <4 x i32> @vec_select_alternate_sign_bit_test(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: @vec_select_alternate_sign_bit_test(
 ; CHECK-NEXT:    [[SUB:%.*]] = sub nsw <4 x i32> zeroinitializer, [[A:%.*]]
-; CHECK-NEXT:    [[B_LOBIT1:%.*]] = ashr <4 x i32> [[B:%.*]], <i32 31, i32 31, i32 31, i32 31>
-; CHECK-NEXT:    [[B_LOBIT1_NOT:%.*]] = xor <4 x i32> [[B_LOBIT1]], <i32 -1, i32 -1, i32 -1, i32 -1>
-; CHECK-NEXT:    [[T2:%.*]] = and <4 x i32> [[B_LOBIT1]], [[A]]
-; CHECK-NEXT:    [[T3:%.*]] = and <4 x i32> [[B_LOBIT1_NOT]], [[SUB]]
-; CHECK-NEXT:    [[COND:%.*]] = or <4 x i32> [[T2]], [[T3]]
-; CHECK-NEXT:    ret <4 x i32> [[COND]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt <4 x i32> [[B:%.*]], <i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[SUB]], <4 x i32> [[A]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 ;
   %cmp = icmp sgt <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
   %sext = sext <4 x i1> %cmp to <4 x i32>
-- 
GitLab


From 0ca5e5ea6173a78ccb59ea567aa12ef981dcf43a Mon Sep 17 00:00:00 2001
From: Krasimir Georgiev <krasimir@google.com>
Date: Wed, 24 Oct 2018 15:18:51 +0000
Subject: [PATCH 0504/1116] IR: Optimize FunctionType::get to perform one hash
 lookup instead of two, NFCI

Summary: This function was performing two hash lookups when a new function type was requested: first checking if it exists and second to insert it. This patch updates the function to perform a single hash lookup in this case by updating the value in the hash table in-place in case the function type was not there before.

Reviewers: bkramer

Reviewed By: bkramer

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D53471

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345151 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/IR/Type.cpp | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/lib/IR/Type.cpp b/lib/IR/Type.cpp
index 83016496ff7..4e7532cb268 100644
--- a/lib/IR/Type.cpp
+++ b/lib/IR/Type.cpp
@@ -297,20 +297,26 @@ FunctionType::FunctionType(Type *Result, ArrayRef<Type*> Params,
 FunctionType *FunctionType::get(Type *ReturnType,
                                 ArrayRef<Type*> Params, bool isVarArg) {
   LLVMContextImpl *pImpl = ReturnType->getContext().pImpl;
-  FunctionTypeKeyInfo::KeyTy Key(ReturnType, Params, isVarArg);
-  auto I = pImpl->FunctionTypes.find_as(Key);
+  const FunctionTypeKeyInfo::KeyTy Key(ReturnType, Params, isVarArg);
   FunctionType *FT;
-
-  if (I == pImpl->FunctionTypes.end()) {
+  // Since we only want to allocate a fresh function type in case none is found
+  // and we don't want to perform two lookups (one for checking if existent and
+  // one for inserting the newly allocated one), here we instead lookup based on
+  // Key and update the reference to the function type in-place to a newly
+  // allocated one if not found.
+  auto Insertion = pImpl->FunctionTypes.insert_as(nullptr, Key);
+  if (Insertion.second) {
+    // The function type was not found. Allocate one and update FunctionTypes
+    // in-place.
     FT = (FunctionType *)pImpl->TypeAllocator.Allocate(
         sizeof(FunctionType) + sizeof(Type *) * (Params.size() + 1),
         alignof(FunctionType));
     new (FT) FunctionType(ReturnType, Params, isVarArg);
-    pImpl->FunctionTypes.insert(FT);
+    *Insertion.first = FT;
   } else {
-    FT = *I;
+    // The function type was found. Just return it.
+    FT = *Insertion.first;
   }
-
   return FT;
 }
 
-- 
GitLab


From ed350b9c5926f92ab0ebca1b8fd833e70a5d9b0d Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Wed, 24 Oct 2018 16:21:23 +0000
Subject: [PATCH 0505/1116] [InstCombine] add test for select with shuffled
 condition (PR37549); NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345156 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Transforms/InstCombine/logical-select.ll | 35 +++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/test/Transforms/InstCombine/logical-select.ll b/test/Transforms/InstCombine/logical-select.ll
index dc4c04b6991..db1eae05083 100644
--- a/test/Transforms/InstCombine/logical-select.ll
+++ b/test/Transforms/InstCombine/logical-select.ll
@@ -584,3 +584,38 @@ define <2 x i64> @fp_bitcast(<4 x i1> %cmp, <2 x double> %a, <2 x double> %b) {
   ret <2 x i64> %or
 }
 
+define <4 x i32> @computesignbits_through_shuffles(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
+; CHECK-LABEL: @computesignbits_through_shuffles(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ole <4 x float> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+; CHECK-NEXT:    [[S1:%.*]] = shufflevector <4 x i32> [[SEXT]], <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+; CHECK-NEXT:    [[S2:%.*]] = shufflevector <4 x i32> [[SEXT]], <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 3, i32 3>
+; CHECK-NEXT:    [[SHUF_OR1:%.*]] = or <4 x i32> [[S1]], [[S2]]
+; CHECK-NEXT:    [[S3:%.*]] = shufflevector <4 x i32> [[SHUF_OR1]], <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+; CHECK-NEXT:    [[S4:%.*]] = shufflevector <4 x i32> [[SHUF_OR1]], <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 3, i32 3>
+; CHECK-NEXT:    [[SHUF_OR2:%.*]] = or <4 x i32> [[S3]], [[S4]]
+; CHECK-NEXT:    [[NOT_OR2:%.*]] = xor <4 x i32> [[SHUF_OR2]], <i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK-NEXT:    [[XBC:%.*]] = bitcast <4 x float> [[X]] to <4 x i32>
+; CHECK-NEXT:    [[ZBC:%.*]] = bitcast <4 x float> [[Z:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[AND1:%.*]] = and <4 x i32> [[NOT_OR2]], [[XBC]]
+; CHECK-NEXT:    [[AND2:%.*]] = and <4 x i32> [[SHUF_OR2]], [[ZBC]]
+; CHECK-NEXT:    [[SEL:%.*]] = or <4 x i32> [[AND1]], [[AND2]]
+; CHECK-NEXT:    ret <4 x i32> [[SEL]]
+;
+  %cmp = fcmp ole <4 x float> %x, %y
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %s1 = shufflevector <4 x i32> %sext, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+  %s2 = shufflevector <4 x i32> %sext, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 3, i32 3>
+  %shuf_or1 = or <4 x i32> %s1, %s2
+  %s3 = shufflevector <4 x i32> %shuf_or1, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+  %s4 = shufflevector <4 x i32> %shuf_or1, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 3, i32 3>
+  %shuf_or2 = or <4 x i32> %s3, %s4
+  %not_or2 = xor <4 x i32> %shuf_or2, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %xbc = bitcast <4 x float> %x to <4 x i32>
+  %zbc = bitcast <4 x float> %z to <4 x i32>
+  %and1 = and <4 x i32> %not_or2, %xbc
+  %and2 = and <4 x i32> %shuf_or2, %zbc
+  %sel = or <4 x i32> %and1, %and2
+  ret <4 x i32> %sel
+}
+
-- 
GitLab


From 8ae5d1204066044b52a80df8d3880fdf4e4b27d5 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Wed, 24 Oct 2018 16:35:01 +0000
Subject: [PATCH 0506/1116] [LegalizeDAG] ExpandLegalINT_TO_FP - cleanup
 UINT_TO_FP i64 -> f32 expansion.

Use SrcVT/DestVT types and correct shift type.

Part of prep work for D52965

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345158 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index cfc4d13b383..f6a6e064fa4 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2399,24 +2399,25 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned, SDValue Op0,
     // For unsigned conversions, convert them to signed conversions using the
     // algorithm from the x86_64 __floatundidf in compiler_rt.
     if (!isSigned) {
-      SDValue Fast = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Op0);
+      SDValue Fast = DAG.getNode(ISD::SINT_TO_FP, dl, DestVT, Op0);
 
       SDValue ShiftConst = DAG.getConstant(1, dl, ShiftVT);
-      SDValue Shr = DAG.getNode(ISD::SRL, dl, MVT::i64, Op0, ShiftConst);
-      SDValue AndConst = DAG.getConstant(1, dl, MVT::i64);
-      SDValue And = DAG.getNode(ISD::AND, dl, MVT::i64, Op0, AndConst);
-      SDValue Or = DAG.getNode(ISD::OR, dl, MVT::i64, And, Shr);
+      SDValue Shr = DAG.getNode(ISD::SRL, dl, SrcVT, Op0, ShiftConst);
+      SDValue AndConst = DAG.getConstant(1, dl, SrcVT);
+      SDValue And = DAG.getNode(ISD::AND, dl, SrcVT, Op0, AndConst);
+      SDValue Or = DAG.getNode(ISD::OR, dl, SrcVT, And, Shr);
 
-      SDValue SignCvt = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Or);
-      SDValue Slow = DAG.getNode(ISD::FADD, dl, MVT::f32, SignCvt, SignCvt);
+      SDValue SignCvt = DAG.getNode(ISD::SINT_TO_FP, dl, DestVT, Or);
+      SDValue Slow = DAG.getNode(ISD::FADD, dl, DestVT, SignCvt, SignCvt);
 
       // TODO: This really should be implemented using a branch rather than a
       // select.  We happen to get lucky and machinesink does the right
       // thing most of the time.  This would be a good candidate for a
-      //pseudo-op, or, even better, for whole-function isel.
-      SDValue SignBitTest = DAG.getSetCC(dl, getSetCCResultType(MVT::i64),
-        Op0, DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
-      return DAG.getSelect(dl, MVT::f32, SignBitTest, Slow, Fast);
+      // pseudo-op, or, even better, for whole-function isel.
+      SDValue SignBitTest =
+          DAG.getSetCC(dl, getSetCCResultType(SrcVT), Op0,
+                       DAG.getConstant(0, dl, SrcVT), ISD::SETLT);
+      return DAG.getSelect(dl, DestVT, SignBitTest, Slow, Fast);
     }
 
     // Otherwise, implement the fully general conversion.
-- 
GitLab


From 3f46d1d9fae6b10cf93a26a53765813baacebc78 Mon Sep 17 00:00:00 2001
From: Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>
Date: Wed, 24 Oct 2018 16:56:43 +0000
Subject: [PATCH 0507/1116] [llvm-mca] Remove dependency from InstrBuilder in
 class InstructionTables.

Also, removed the initialization of vectors used for processor resource masks.
Support function 'computeProcResourceMasks()' already calls method resize on
those vectors.
No functional change intended.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345161 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-mca/Views/SummaryView.cpp                 | 3 +--
 tools/llvm-mca/include/InstrBuilder.h                | 3 +--
 tools/llvm-mca/include/Stages/InstructionTables.h    | 9 +++++----
 tools/llvm-mca/lib/HardwareUnits/ResourceManager.cpp | 3 +--
 tools/llvm-mca/lib/Stages/InstructionTables.cpp      | 1 -
 tools/llvm-mca/llvm-mca.cpp                          | 2 +-
 6 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/tools/llvm-mca/Views/SummaryView.cpp b/tools/llvm-mca/Views/SummaryView.cpp
index 98f3410d61b..8d529ba1549 100644
--- a/tools/llvm-mca/Views/SummaryView.cpp
+++ b/tools/llvm-mca/Views/SummaryView.cpp
@@ -27,8 +27,7 @@ using namespace llvm;
 SummaryView::SummaryView(const MCSchedModel &Model, const SourceMgr &S,
                          unsigned Width)
     : SM(Model), Source(S), DispatchWidth(Width), TotalCycles(0),
-      NumMicroOps(0), ProcResourceUsage(Model.getNumProcResourceKinds(), 0),
-      ProcResourceMasks(Model.getNumProcResourceKinds(), 0) {
+      NumMicroOps(0), ProcResourceUsage(Model.getNumProcResourceKinds(), 0) {
   computeProcResourceMasks(SM, ProcResourceMasks);
 }
 
diff --git a/tools/llvm-mca/include/InstrBuilder.h b/tools/llvm-mca/include/InstrBuilder.h
index 9fee94bbb3f..31c52702058 100644
--- a/tools/llvm-mca/include/InstrBuilder.h
+++ b/tools/llvm-mca/include/InstrBuilder.h
@@ -65,8 +65,7 @@ public:
   InstrBuilder(const llvm::MCSubtargetInfo &sti, const llvm::MCInstrInfo &mcii,
                const llvm::MCRegisterInfo &mri,
                const llvm::MCInstrAnalysis &mcia)
-      : STI(sti), MCII(mcii), MRI(mri), MCIA(mcia),
-        ProcResourceMasks(STI.getSchedModel().getNumProcResourceKinds()) {
+      : STI(sti), MCII(mcii), MRI(mri), MCIA(mcia) {
     computeProcResourceMasks(STI.getSchedModel(), ProcResourceMasks);
   }
 
diff --git a/tools/llvm-mca/include/Stages/InstructionTables.h b/tools/llvm-mca/include/Stages/InstructionTables.h
index 16be004d115..de31a7949bb 100644
--- a/tools/llvm-mca/include/Stages/InstructionTables.h
+++ b/tools/llvm-mca/include/Stages/InstructionTables.h
@@ -18,8 +18,8 @@
 #define LLVM_TOOLS_LLVM_MCA_INSTRUCTIONTABLES_H
 
 #include "HardwareUnits/Scheduler.h"
-#include "InstrBuilder.h"
 #include "Stages/Stage.h"
+#include "Support.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/MC/MCSchedule.h"
 
@@ -27,12 +27,13 @@ namespace mca {
 
 class InstructionTables final : public Stage {
   const llvm::MCSchedModel &SM;
-  InstrBuilder &IB;
   llvm::SmallVector<std::pair<ResourceRef, ResourceCycles>, 4> UsedResources;
+  llvm::SmallVector<uint64_t, 8> Masks;
 
 public:
-  InstructionTables(const llvm::MCSchedModel &Model, InstrBuilder &Builder)
-      : Stage(), SM(Model), IB(Builder) {}
+  InstructionTables(const llvm::MCSchedModel &Model) : Stage(), SM(Model) {
+    computeProcResourceMasks(Model, Masks);
+  }
 
   bool hasWorkToComplete() const override { return false; }
   llvm::Error execute(InstRef &IR) override;
diff --git a/tools/llvm-mca/lib/HardwareUnits/ResourceManager.cpp b/tools/llvm-mca/lib/HardwareUnits/ResourceManager.cpp
index bb6ed309c26..e033217d52d 100644
--- a/tools/llvm-mca/lib/HardwareUnits/ResourceManager.cpp
+++ b/tools/llvm-mca/lib/HardwareUnits/ResourceManager.cpp
@@ -97,8 +97,7 @@ getStrategyFor(const ResourceState &RS) {
   return std::unique_ptr<ResourceStrategy>(nullptr);
 }
 
-ResourceManager::ResourceManager(const MCSchedModel &SM)
-    : ProcResID2Mask(SM.getNumProcResourceKinds()) {
+ResourceManager::ResourceManager(const MCSchedModel &SM) {
   computeProcResourceMasks(SM, ProcResID2Mask);
   Resources.resize(SM.getNumProcResourceKinds());
   Strategies.resize(SM.getNumProcResourceKinds());
diff --git a/tools/llvm-mca/lib/Stages/InstructionTables.cpp b/tools/llvm-mca/lib/Stages/InstructionTables.cpp
index e49eb446062..06319f857dc 100644
--- a/tools/llvm-mca/lib/Stages/InstructionTables.cpp
+++ b/tools/llvm-mca/lib/Stages/InstructionTables.cpp
@@ -22,7 +22,6 @@ namespace mca {
 using namespace llvm;
 
 Error InstructionTables::execute(InstRef &IR) {
-  ArrayRef<uint64_t> Masks = IB.getProcResourceMasks();
   const InstrDesc &Desc = IR.getInstruction()->getDesc();
   UsedResources.clear();
 
diff --git a/tools/llvm-mca/llvm-mca.cpp b/tools/llvm-mca/llvm-mca.cpp
index 9ad761e6665..9992395fb6e 100644
--- a/tools/llvm-mca/llvm-mca.cpp
+++ b/tools/llvm-mca/llvm-mca.cpp
@@ -519,7 +519,7 @@ int main(int argc, char **argv) {
       //  Create a pipeline, stages, and a printer.
       auto P = llvm::make_unique<mca::Pipeline>();
       P->appendStage(llvm::make_unique<mca::FetchStage>(IB, S));
-      P->appendStage(llvm::make_unique<mca::InstructionTables>(SM, IB));
+      P->appendStage(llvm::make_unique<mca::InstructionTables>(SM));
       mca::PipelinePrinter Printer(*P);
 
       // Create the views for this pipeline, execute, and emit a report.
-- 
GitLab


From 95f42d0b8c48eaf41ea07a7bd38691f2f47e6f5c Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Wed, 24 Oct 2018 17:01:42 +0000
Subject: [PATCH 0508/1116] [InstCombine] add test for ComputeNumSignBits with
 shuffle; NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345162 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Transforms/InstCombine/nsw.ll | 83 +++++++++++++++++++++---------
 1 file changed, 58 insertions(+), 25 deletions(-)

diff --git a/test/Transforms/InstCombine/nsw.ll b/test/Transforms/InstCombine/nsw.ll
index 0bed76717ce..ab2cbb2d865 100644
--- a/test/Transforms/InstCombine/nsw.ll
+++ b/test/Transforms/InstCombine/nsw.ll
@@ -1,83 +1,116 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -instcombine -S | FileCheck %s
 
-; CHECK-LABEL: @sub1(
-; CHECK: %y = sub i32 0, %x
-; CHECK: %z = sdiv i32 %y, 337
-; CHECK: ret i32 %z
 define i32 @sub1(i32 %x) {
+; CHECK-LABEL: @sub1(
+; CHECK-NEXT:    [[Y:%.*]] = sub i32 0, [[X:%.*]]
+; CHECK-NEXT:    [[Z:%.*]] = sdiv i32 [[Y]], 337
+; CHECK-NEXT:    ret i32 [[Z]]
+;
   %y = sub i32 0, %x
   %z = sdiv i32 %y, 337
   ret i32 %z
 }
 
-; CHECK-LABEL: @sub2(
-; CHECK: %z = sdiv i32 %x, -337
-; CHECK: ret i32 %z
 define i32 @sub2(i32 %x) {
+; CHECK-LABEL: @sub2(
+; CHECK-NEXT:    [[Z:%.*]] = sdiv i32 [[X:%.*]], -337
+; CHECK-NEXT:    ret i32 [[Z]]
+;
   %y = sub nsw i32 0, %x
   %z = sdiv i32 %y, 337
   ret i32 %z
 }
 
+define i1 @shl_icmp(i64 %X) {
 ; CHECK-LABEL: @shl_icmp(
-; CHECK: %B = icmp eq i64 %X, 0
-; CHECK: ret i1 %B
-define i1 @shl_icmp(i64 %X) nounwind {
+; CHECK-NEXT:    [[B:%.*]] = icmp eq i64 [[X:%.*]], 0
+; CHECK-NEXT:    ret i1 [[B]]
+;
   %A = shl nuw i64 %X, 2   ; X/4
   %B = icmp eq i64 %A, 0
   ret i1 %B
 }
 
+define i64 @shl1(i64 %X, i64* %P) {
 ; CHECK-LABEL: @shl1(
-; CHECK: %B = shl nuw nsw i64 %A, 8
-; CHECK: ret i64 %B
-define i64 @shl1(i64 %X, i64* %P) nounwind {
+; CHECK-NEXT:    [[A:%.*]] = and i64 [[X:%.*]], 312
+; CHECK-NEXT:    store i64 [[A]], i64* [[P:%.*]], align 4
+; CHECK-NEXT:    [[B:%.*]] = shl nuw nsw i64 [[A]], 8
+; CHECK-NEXT:    ret i64 [[B]]
+;
   %A = and i64 %X, 312
   store i64 %A, i64* %P  ; multiple uses of A.
   %B = shl i64 %A, 8
   ret i64 %B
 }
 
+define i32 @preserve1(i32 %x) {
 ; CHECK-LABEL: @preserve1(
-; CHECK: add nsw i32 %x, 5
-define i32 @preserve1(i32 %x) nounwind {
+; CHECK-NEXT:    [[ADD3:%.*]] = add nsw i32 [[X:%.*]], 5
+; CHECK-NEXT:    ret i32 [[ADD3]]
+;
   %add = add nsw i32 %x, 2
   %add3 = add nsw i32 %add, 3
   ret i32 %add3
 }
 
+define i8 @nopreserve1(i8 %x) {
 ; CHECK-LABEL: @nopreserve1(
-; CHECK: add i8 %x, -126
-define i8 @nopreserve1(i8 %x) nounwind {
+; CHECK-NEXT:    [[ADD3:%.*]] = add i8 [[X:%.*]], -126
+; CHECK-NEXT:    ret i8 [[ADD3]]
+;
   %add = add nsw i8 %x, 127
   %add3 = add nsw i8 %add, 3
   ret i8 %add3
 }
 
+define i8 @nopreserve2(i8 %x) {
 ; CHECK-LABEL: @nopreserve2(
-; CHECK: add i8 %x, 3
-define i8 @nopreserve2(i8 %x) nounwind {
+; CHECK-NEXT:    [[ADD3:%.*]] = add i8 [[X:%.*]], 3
+; CHECK-NEXT:    ret i8 [[ADD3]]
+;
   %add = add i8 %x, 1
   %add3 = add nsw i8 %add, 2
   ret i8 %add3
 }
 
+define i8 @nopreserve3(i8 %A, i8 %B) {
 ; CHECK-LABEL: @nopreserve3(
-; CHECK: add i8 %A, %B
-; CHECK: add i8
-define i8 @nopreserve3(i8 %A, i8 %B) nounwind {
+; CHECK-NEXT:    [[Y:%.*]] = add i8 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[Y]], 20
+; CHECK-NEXT:    ret i8 [[ADD]]
+;
   %x = add i8 %A, 10
   %y = add i8 %B, 10
   %add = add nsw i8 %x, %y
   ret i8 %add
 }
 
+define i8 @nopreserve4(i8 %A, i8 %B) {
 ; CHECK-LABEL: @nopreserve4(
-; CHECK: add i8 %A, %B
-; CHECK: add i8
-define i8 @nopreserve4(i8 %A, i8 %B) nounwind {
+; CHECK-NEXT:    [[Y:%.*]] = add i8 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[Y]], 20
+; CHECK-NEXT:    ret i8 [[ADD]]
+;
   %x = add nsw i8 %A, 10
   %y = add nsw i8 %B, 10
   %add = add nsw i8 %x, %y
   ret i8 %add
 }
+
+; TODO: ComputeNumSignBits()/computeKnownBits() should look through a shufflevector.
+
+define <3 x i32> @shl_nuw_nsw_shuffle_splat_vec(<2 x i8> %x) {
+; CHECK-LABEL: @shl_nuw_nsw_shuffle_splat_vec(
+; CHECK-NEXT:    [[T2:%.*]] = zext <2 x i8> [[X:%.*]] to <2 x i32>
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <2 x i32> [[T2]], <2 x i32> undef, <3 x i32> <i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[T3:%.*]] = shl <3 x i32> [[SHUF]], <i32 17, i32 17, i32 17>
+; CHECK-NEXT:    ret <3 x i32> [[T3]]
+;
+  %t2 = zext <2 x i8> %x to <2 x i32>
+  %shuf = shufflevector <2 x i32> %t2, <2 x i32> undef, <3 x i32> <i32 1, i32 0, i32 1>
+  %t3 = shl <3 x i32> %shuf, <i32 17, i32 17, i32 17>
+  ret <3 x i32> %t3
+}
+
-- 
GitLab


From e781359f6e6d92b3893312743af595f7353ddf76 Mon Sep 17 00:00:00 2001
From: Robert Lougher <rob.lougher@gmail.com>
Date: Wed, 24 Oct 2018 17:03:19 +0000
Subject: [PATCH 0509/1116] [CodeGen] skip lifetime end marker in
 isInTailCallPosition

A lifetime end intrinsic between a tail call and the return should not
prevent the call from being tail call optimized.

Differential Revision: https://reviews.llvm.org/D53519


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345163 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/Analysis.cpp                  |  4 ++++
 test/CodeGen/X86/tailcall-lifetime-end.ll | 27 +++++++++++++++++++++++
 2 files changed, 31 insertions(+)
 create mode 100644 test/CodeGen/X86/tailcall-lifetime-end.ll

diff --git a/lib/CodeGen/Analysis.cpp b/lib/CodeGen/Analysis.cpp
index aae04a573af..27dce7fd7b7 100644
--- a/lib/CodeGen/Analysis.cpp
+++ b/lib/CodeGen/Analysis.cpp
@@ -496,6 +496,10 @@ bool llvm::isInTailCallPosition(ImmutableCallSite CS, const TargetMachine &TM) {
       // Debug info intrinsics do not get in the way of tail call optimization.
       if (isa<DbgInfoIntrinsic>(BBI))
         continue;
+      // A lifetime end intrinsic should not stop tail call optimization.
+      if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(BBI))
+        if (II->getIntrinsicID() == Intrinsic::lifetime_end)
+          continue;
       if (BBI->mayHaveSideEffects() || BBI->mayReadFromMemory() ||
           !isSafeToSpeculativelyExecute(&*BBI))
         return false;
diff --git a/test/CodeGen/X86/tailcall-lifetime-end.ll b/test/CodeGen/X86/tailcall-lifetime-end.ll
new file mode 100644
index 00000000000..3aedd007d44
--- /dev/null
+++ b/test/CodeGen/X86/tailcall-lifetime-end.ll
@@ -0,0 +1,27 @@
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -o - %s | FileCheck %s
+
+; A lifetime end intrinsic should not prevent a call from being tail call
+; optimized.
+
+define void @foobar() {
+; CHECK-LABEL: foobar
+; CHECK: pushq	%rax
+; CHECK: leaq	4(%rsp), %rdi
+; CHECK: callq	foo
+; CHECK: popq	%rax
+; CHECK: jmp	bar
+entry:
+  %i = alloca i32
+  %0 = bitcast i32* %i to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0)
+  call void @foo(i32* nonnull %i)
+  tail call void @bar()
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0)
+  ret void
+}
+
+declare void @foo(i32* nocapture %p)
+declare void @bar()
+
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
-- 
GitLab


From b3b0836b9b35c7ae99ffa242a55e2d7bb97d106f Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Wed, 24 Oct 2018 17:30:29 +0000
Subject: [PATCH 0510/1116] [CostModel][X86] Enable non-uniform vector division
 by constants costs.

Non-uniform division/remainder handling was added back at D49248/D50765 - so share the 'mul+sub' costs that already exist for uniform cases.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345164 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86TargetTransformInfo.cpp |  88 ++++--
 test/Analysis/CostModel/X86/div.ll        | 324 ++++++++++++++++++----
 test/Analysis/CostModel/X86/rem.ll        | 324 ++++++++++++++++++----
 test/Analysis/CostModel/X86/vdiv-cost.ll  |  52 +++-
 4 files changed, 660 insertions(+), 128 deletions(-)

diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index ffc5a029040..29306d75454 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -290,11 +290,6 @@ int X86TTIImpl::getArithmeticInstrCost(
     { ISD::SHL,  MVT::v64i8,   2 }, // psllw + pand.
     { ISD::SRL,  MVT::v64i8,   2 }, // psrlw + pand.
     { ISD::SRA,  MVT::v64i8,   4 }, // psrlw, pand, pxor, psubb.
-
-    { ISD::SDIV, MVT::v32i16,  6 }, // vpmulhw sequence
-    { ISD::SREM, MVT::v32i16,  8 }, // vpmulhw+mul+sub sequence
-    { ISD::UDIV, MVT::v32i16,  6 }, // vpmulhuw sequence
-    { ISD::UREM, MVT::v32i16,  8 }, // vpmulhuw+mul+sub sequence
   };
 
   if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
@@ -308,11 +303,6 @@ int X86TTIImpl::getArithmeticInstrCost(
     { ISD::SRA,  MVT::v2i64,   1 },
     { ISD::SRA,  MVT::v4i64,   1 },
     { ISD::SRA,  MVT::v8i64,   1 },
-
-    { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
-    { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
-    { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
-    { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
   };
 
   if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
@@ -328,15 +318,6 @@ int X86TTIImpl::getArithmeticInstrCost(
     { ISD::SRA,  MVT::v32i8,   4 }, // psrlw, pand, pxor, psubb.
 
     { ISD::SRA,  MVT::v4i64,   4 }, // 2 x psrad + shuffle.
-
-    { ISD::SDIV, MVT::v16i16,  6 }, // vpmulhw sequence
-    { ISD::SREM, MVT::v16i16,  8 }, // vpmulhw+mul+sub sequence
-    { ISD::UDIV, MVT::v16i16,  6 }, // vpmulhuw sequence
-    { ISD::UREM, MVT::v16i16,  8 }, // vpmulhuw+mul+sub sequence
-    { ISD::SDIV, MVT::v8i32,  15 }, // vpmuldq sequence
-    { ISD::SREM, MVT::v8i32,  19 }, // vpmuldq+mul+sub sequence
-    { ISD::UDIV, MVT::v8i32,  15 }, // vpmuludq sequence
-    { ISD::UREM, MVT::v8i32,  19 }, // vpmuludq+mul+sub sequence
   };
 
   if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
@@ -354,7 +335,65 @@ int X86TTIImpl::getArithmeticInstrCost(
     { ISD::SHL,  MVT::v32i8,   4+2 }, // 2*(psllw + pand) + split.
     { ISD::SRL,  MVT::v32i8,   4+2 }, // 2*(psrlw + pand) + split.
     { ISD::SRA,  MVT::v32i8,   8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.
+  };
 
+  // XOP has faster vXi8 shifts.
+  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+      ST->hasSSE2() && !ST->hasXOP()) {
+    if (const auto *Entry =
+            CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
+      return LT.first * Entry->Cost;
+  }
+
+  static const CostTblEntry AVX512BWConstCostTable[] = {
+    { ISD::SDIV, MVT::v32i16,  6 }, // vpmulhw sequence
+    { ISD::SREM, MVT::v32i16,  8 }, // vpmulhw+mul+sub sequence
+    { ISD::UDIV, MVT::v32i16,  6 }, // vpmulhuw sequence
+    { ISD::UREM, MVT::v32i16,  8 }, // vpmulhuw+mul+sub sequence
+  };
+
+  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
+       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
+      ST->hasBWI()) {
+    if (const auto *Entry =
+            CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
+      return LT.first * Entry->Cost;
+  }
+
+  static const CostTblEntry AVX512ConstCostTable[] = {
+    { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
+    { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
+    { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
+    { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
+  };
+
+  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
+       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
+      ST->hasAVX512()) {
+    if (const auto *Entry =
+            CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
+      return LT.first * Entry->Cost;
+  }
+
+  static const CostTblEntry AVX2ConstCostTable[] = {
+    { ISD::SDIV, MVT::v16i16,  6 }, // vpmulhw sequence
+    { ISD::SREM, MVT::v16i16,  8 }, // vpmulhw+mul+sub sequence
+    { ISD::UDIV, MVT::v16i16,  6 }, // vpmulhuw sequence
+    { ISD::UREM, MVT::v16i16,  8 }, // vpmulhuw+mul+sub sequence
+    { ISD::SDIV, MVT::v8i32,  15 }, // vpmuldq sequence
+    { ISD::SREM, MVT::v8i32,  19 }, // vpmuldq+mul+sub sequence
+    { ISD::UDIV, MVT::v8i32,  15 }, // vpmuludq sequence
+    { ISD::UREM, MVT::v8i32,  19 }, // vpmuludq+mul+sub sequence
+  };
+
+  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
+       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
+      ST->hasAVX2()) {
+    if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
+      return LT.first * Entry->Cost;
+  }
+
+  static const CostTblEntry SSE2ConstCostTable[] = {
     { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
     { ISD::SREM, MVT::v16i16, 16+2 }, // 2*pmulhw+mul+sub sequence + split.
     { ISD::SDIV, MVT::v8i16,     6 }, // pmulhw sequence
@@ -373,7 +412,8 @@ int X86TTIImpl::getArithmeticInstrCost(
     { ISD::UREM, MVT::v4i32,    20 }, // pmuludq+mul+sub sequence
   };
 
-  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
+       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
       ST->hasSSE2()) {
     // pmuldq sequence.
     if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
@@ -385,12 +425,8 @@ int X86TTIImpl::getArithmeticInstrCost(
     if (ISD == ISD::SREM && LT.second == MVT::v4i32 && ST->hasSSE41())
       return LT.first * 20;
 
-    // XOP has faster vXi8 shifts.
-    if ((ISD != ISD::SHL && ISD != ISD::SRL && ISD != ISD::SRA) ||
-        !ST->hasXOP())
-      if (const auto *Entry =
-              CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
-        return LT.first * Entry->Cost;
+    if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
+      return LT.first * Entry->Cost;
   }
 
   static const CostTblEntry AVX2UniformCostTable[] = {
diff --git a/test/Analysis/CostModel/X86/div.ll b/test/Analysis/CostModel/X86/div.ll
index 63fb25dd2f6..7a53db1b26b 100644
--- a/test/Analysis/CostModel/X86/div.ll
+++ b/test/Analysis/CostModel/X86/div.ll
@@ -136,24 +136,157 @@ define i32 @udiv() {
 }
 
 define i32 @sdiv_const() {
-; CHECK-LABEL: 'sdiv_const'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = sdiv i64 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = sdiv <2 x i64> undef, <i64 6, i64 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-LABEL: 'sdiv_const'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = sdiv i64 undef, 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = sdiv <2 x i64> undef, <i64 6, i64 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'sdiv_const'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = sdiv i64 undef, 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = sdiv <2 x i64> undef, <i64 6, i64 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'sdiv_const'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = sdiv i64 undef, 7
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = sdiv <2 x i64> undef, <i64 6, i64 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'sdiv_const'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = sdiv i64 undef, 7
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = sdiv <2 x i64> undef, <i64 6, i64 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'sdiv_const'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = sdiv i64 undef, 7
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = sdiv <2 x i64> undef, <i64 6, i64 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'sdiv_const'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = sdiv i64 undef, 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = sdiv <2 x i64> undef, <i64 6, i64 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'sdiv_const'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = sdiv i64 undef, 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = sdiv <2 x i64> undef, <i64 6, i64 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SLM-LABEL: 'sdiv_const'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = sdiv i64 undef, 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = sdiv <2 x i64> undef, <i64 6, i64 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'sdiv_const'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = sdiv i64 undef, 7
@@ -161,13 +294,13 @@ define i32 @sdiv_const() {
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
@@ -198,24 +331,119 @@ define i32 @sdiv_const() {
 }
 
 define i32 @udiv_const() {
-; CHECK-LABEL: 'udiv_const'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = udiv i64 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = udiv <2 x i64> undef, <i64 6, i64 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = udiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = udiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i32 = udiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i32 = udiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i32 = udiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i16 = udiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE-LABEL: 'udiv_const'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = udiv i64 undef, 7
+; SSE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = udiv <2 x i64> undef, <i64 6, i64 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = udiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = udiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 7
+; SSE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = udiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8i32 = udiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16i32 = udiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 7
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = udiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
+; SSE-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'udiv_const'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = udiv i64 undef, 7
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = udiv <2 x i64> undef, <i64 6, i64 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = udiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = udiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 7
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = udiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8i32 = udiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16i32 = udiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 7
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = udiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'udiv_const'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = udiv i64 undef, 7
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = udiv <2 x i64> undef, <i64 6, i64 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = udiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = udiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 7
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = udiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8i32 = udiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16i32 = udiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 7
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = udiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'udiv_const'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = udiv i64 undef, 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = udiv <2 x i64> undef, <i64 6, i64 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = udiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = udiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = udiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8i32 = udiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16i32 = udiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = udiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'udiv_const'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = udiv i64 undef, 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = udiv <2 x i64> undef, <i64 6, i64 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = udiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = udiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = udiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8i32 = udiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16i32 = udiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = udiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SLM-LABEL: 'udiv_const'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = udiv i64 undef, 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = udiv <2 x i64> undef, <i64 6, i64 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = udiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = udiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = udiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8i32 = udiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16i32 = udiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = udiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'udiv_const'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = udiv i64 undef, 7
@@ -223,13 +451,13 @@ define i32 @udiv_const() {
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = udiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = udiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 7
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i32 = udiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i32 = udiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i32 = udiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = udiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8i32 = udiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16i32 = udiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 7
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i16 = udiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = udiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
diff --git a/test/Analysis/CostModel/X86/rem.ll b/test/Analysis/CostModel/X86/rem.ll
index fd7e83d74ff..e28b4d9583f 100644
--- a/test/Analysis/CostModel/X86/rem.ll
+++ b/test/Analysis/CostModel/X86/rem.ll
@@ -136,24 +136,176 @@ define i32 @urem() {
 }
 
 define i32 @srem_const() {
-; CHECK-LABEL: 'srem_const'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = srem i64 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = srem <2 x i64> undef, <i64 6, i64 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i32 = srem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i32 = srem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i32 = srem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i16 = srem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-LABEL: 'srem_const'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = srem i64 undef, 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = srem <2 x i64> undef, <i64 6, i64 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4i32 = srem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8i32 = srem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16i32 = srem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'srem_const'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = srem i64 undef, 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = srem <2 x i64> undef, <i64 6, i64 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4i32 = srem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8i32 = srem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16i32 = srem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'srem_const'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = srem i64 undef, 7
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = srem <2 x i64> undef, <i64 6, i64 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, 7
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = srem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i32 = srem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16i32 = srem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'srem_const'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = srem i64 undef, 7
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = srem <2 x i64> undef, <i64 6, i64 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, 7
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = srem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8i32 = srem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V16i32 = srem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'srem_const'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = srem i64 undef, 7
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = srem <2 x i64> undef, <i64 6, i64 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, 7
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = srem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8i32 = srem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V16i32 = srem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'srem_const'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = srem i64 undef, 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = srem <2 x i64> undef, <i64 6, i64 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = srem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8i32 = srem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16i32 = srem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'srem_const'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = srem i64 undef, 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = srem <2 x i64> undef, <i64 6, i64 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = srem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8i32 = srem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16i32 = srem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SLM-LABEL: 'srem_const'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = srem i64 undef, 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = srem <2 x i64> undef, <i64 6, i64 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = srem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i32 = srem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16i32 = srem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; GLM-LABEL: 'srem_const'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = srem i64 undef, 7
+; GLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = srem <2 x i64> undef, <i64 6, i64 7>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, 7
+; GLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = srem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i32 = srem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16i32 = srem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
+; GLM-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'srem_const'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = srem i64 undef, 7
@@ -161,13 +313,13 @@ define i32 @srem_const() {
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, 7
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i32 = srem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i32 = srem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i32 = srem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = srem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8i32 = srem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V16i32 = srem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i16 = srem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
@@ -198,24 +350,100 @@ define i32 @srem_const() {
 }
 
 define i32 @urem_const() {
-; CHECK-LABEL: 'urem_const'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = urem i64 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = urem <2 x i64> undef, <i64 6, i64 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = urem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = urem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = urem i32 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i32 = urem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i32 = urem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i32 = urem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = urem i16 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i16 = urem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i16 = urem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i16 = urem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = urem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = urem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = urem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE-LABEL: 'urem_const'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = urem i64 undef, 7
+; SSE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = urem <2 x i64> undef, <i64 6, i64 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = urem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = urem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = urem i32 undef, 7
+; SSE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = urem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i32 = urem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16i32 = urem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = urem i16 undef, 7
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = urem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = urem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = urem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 7
+; SSE-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = urem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = urem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = urem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'urem_const'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = urem i64 undef, 7
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = urem <2 x i64> undef, <i64 6, i64 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = urem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = urem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = urem i32 undef, 7
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = urem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8i32 = urem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V16i32 = urem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = urem i16 undef, 7
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = urem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16i16 = urem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32i16 = urem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 7
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = urem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = urem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = urem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'urem_const'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = urem i64 undef, 7
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = urem <2 x i64> undef, <i64 6, i64 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = urem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = urem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = urem i32 undef, 7
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = urem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8i32 = urem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V16i32 = urem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = urem i16 undef, 7
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = urem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = urem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i16 = urem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 7
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = urem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = urem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = urem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'urem_const'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = urem i64 undef, 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = urem <2 x i64> undef, <i64 6, i64 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = urem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = urem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = urem i32 undef, 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = urem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8i32 = urem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16i32 = urem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = urem i16 undef, 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = urem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = urem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i16 = urem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = urem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = urem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = urem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'urem_const'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = urem i64 undef, 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = urem <2 x i64> undef, <i64 6, i64 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = urem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = urem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = urem i32 undef, 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = urem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8i32 = urem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16i32 = urem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = urem i16 undef, 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = urem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = urem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32i16 = urem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = urem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = urem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = urem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'urem_const'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = urem i64 undef, 7
@@ -223,13 +451,13 @@ define i32 @urem_const() {
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = urem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = urem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = urem i32 undef, 7
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i32 = urem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i32 = urem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i32 = urem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = urem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8i32 = urem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V16i32 = urem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = urem i16 undef, 7
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i16 = urem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i16 = urem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i16 = urem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = urem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16i16 = urem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32i16 = urem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 7
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = urem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = urem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
diff --git a/test/Analysis/CostModel/X86/vdiv-cost.ll b/test/Analysis/CostModel/X86/vdiv-cost.ll
index 7bb935cbcec..d5b404227aa 100644
--- a/test/Analysis/CostModel/X86/vdiv-cost.ll
+++ b/test/Analysis/CostModel/X86/vdiv-cost.ll
@@ -162,18 +162,58 @@ define <8 x i32> @test9(<8 x i32> %a) {
 }
 
 define <8 x i32> @test10(<8 x i32> %a) {
-; CHECK-LABEL: 'test10'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %div = sdiv <8 x i32> %a, <i32 8, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %div
+; SSE2-LABEL: 'test10'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %div = sdiv <8 x i32> %a, <i32 8, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %div
+;
+; SSSE3-LABEL: 'test10'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %div = sdiv <8 x i32> %a, <i32 8, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %div
+;
+; SSE42-LABEL: 'test10'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %div = sdiv <8 x i32> %a, <i32 8, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %div
+;
+; AVX1-LABEL: 'test10'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %div = sdiv <8 x i32> %a, <i32 8, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %div
+;
+; AVX2-LABEL: 'test10'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %div = sdiv <8 x i32> %a, <i32 8, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %div
+;
+; AVX512-LABEL: 'test10'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %div = sdiv <8 x i32> %a, <i32 8, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %div
 ;
   %div = sdiv <8 x i32> %a, <i32 8, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7>
   ret <8 x i32> %div
 }
 
 define <16 x i32> @test11(<16 x i32> %a) {
-; CHECK-LABEL: 'test11'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %div = sdiv <16 x i32> %a, <i32 8, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %div
+; SSE2-LABEL: 'test11'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %div = sdiv <16 x i32> %a, <i32 8, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %div
+;
+; SSSE3-LABEL: 'test11'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %div = sdiv <16 x i32> %a, <i32 8, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %div
+;
+; SSE42-LABEL: 'test11'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %div = sdiv <16 x i32> %a, <i32 8, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %div
+;
+; AVX1-LABEL: 'test11'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %div = sdiv <16 x i32> %a, <i32 8, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %div
+;
+; AVX2-LABEL: 'test11'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %div = sdiv <16 x i32> %a, <i32 8, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %div
+;
+; AVX512-LABEL: 'test11'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %div = sdiv <16 x i32> %a, <i32 8, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %div
 ;
   %div = sdiv <16 x i32> %a, <i32 8, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7, i32 8, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7>
   ret <16 x i32> %div
-- 
GitLab


From a1d58c653729783e0f5e20c3c093f3789dccd8b4 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Wed, 24 Oct 2018 17:32:09 +0000
Subject: [PATCH 0511/1116] [X86] Bring back the MOV64r0 pseudo instruction

This patch brings back the MOV64r0 pseudo instruction for zeroing a 64-bit register. This replaces the SUBREG_TO_REG MOV32r0 sequence we use today. Post register allocation we will rewrite the MOV64r0 to a 32-bit xor with an implicit def of the 64-bit register similar to what we do for the various XMM/YMM/ZMM zeroing pseudos.

My main motivation is to enable the spill optimization in foldMemoryOperandImpl. As we were seeing some code that repeatedly did "xor eax, eax; store eax;" to spill several registers with a new xor for each store. With this optimization enabled we get a store of a 0 immediate instead of an xor. Though I admit the ideal solution would be one xor where there are multiple spills. I don't believe we have a test case that shows this optimization in here. I'll see if I can try to reduce one from the code were looking at.

There's definitely some other machine CSE(and maybe other passes) behavior changes exposed by this patch. So it seems like there might be some other deficiencies in SUBREG_TO_REG handling.

Differential Revision: https://reviews.llvm.org/D52757

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345165 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86FastISel.cpp                |  32 +-
 lib/Target/X86/X86ISelDAGToDAG.cpp            |  13 +-
 lib/Target/X86/X86InstrCompiler.td            |   6 +-
 lib/Target/X86/X86InstrInfo.cpp               |  23 +-
 .../X86/X86SpeculativeLoadHardening.cpp       |  10 +-
 test/CodeGen/X86/GlobalISel/constant.ll       |   2 +-
 test/CodeGen/X86/avg.ll                       | 455 +++++++++---------
 test/CodeGen/X86/crash-O0.ll                  |   8 +-
 test/CodeGen/X86/hoist-spill.ll               |   2 -
 test/CodeGen/X86/machine-cse.ll               |  15 +-
 test/CodeGen/X86/madd.ll                      |  66 +--
 test/CodeGen/X86/mmx-arith.ll                 |  11 +-
 test/CodeGen/X86/pr32284.ll                   |  19 +-
 test/CodeGen/X86/pr32340.ll                   |  25 +-
 test/CodeGen/X86/scheduler-backtracking.ll    | 212 ++++----
 test/CodeGen/X86/spill-zero-x86_64.ll         |  75 +++
 test/CodeGen/X86/swifterror.ll                |  21 +-
 17 files changed, 523 insertions(+), 472 deletions(-)
 create mode 100644 test/CodeGen/X86/spill-zero-x86_64.ll

diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index a49ad8bd59d..b87f4802473 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -1916,8 +1916,8 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) {
     { &X86::GR64RegClass, X86::RAX, X86::RDX, {
         { X86::IDIV64r, X86::CQO,     Copy,            X86::RAX, S }, // SDiv
         { X86::IDIV64r, X86::CQO,     Copy,            X86::RDX, S }, // SRem
-        { X86::DIV64r,  X86::MOV32r0, Copy,            X86::RAX, U }, // UDiv
-        { X86::DIV64r,  X86::MOV32r0, Copy,            X86::RDX, U }, // URem
+        { X86::DIV64r,  X86::MOV64r0, Copy,            X86::RAX, U }, // UDiv
+        { X86::DIV64r,  X86::MOV64r0, Copy,            X86::RDX, U }, // URem
       }
     }, // i64
   };
@@ -1964,26 +1964,22 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) {
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
               TII.get(OpEntry.OpSignExtend));
     else {
-      unsigned Zero32 = createResultReg(&X86::GR32RegClass);
+      unsigned ZeroReg = createResultReg(VT == MVT::i64 ? &X86::GR64RegClass
+                                                        : &X86::GR32RegClass);
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-              TII.get(X86::MOV32r0), Zero32);
+              TII.get(OpEntry.OpSignExtend), ZeroReg);
 
       // Copy the zero into the appropriate sub/super/identical physical
       // register. Unfortunately the operations needed are not uniform enough
       // to fit neatly into the table above.
-      if (VT == MVT::i16) {
+      if (VT == MVT::i16)
         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                 TII.get(Copy), TypeEntry.HighInReg)
-          .addReg(Zero32, 0, X86::sub_16bit);
-      } else if (VT == MVT::i32) {
+          .addReg(ZeroReg, 0, X86::sub_16bit);
+      else
         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                 TII.get(Copy), TypeEntry.HighInReg)
-            .addReg(Zero32);
-      } else if (VT == MVT::i64) {
-        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-                TII.get(TargetOpcode::SUBREG_TO_REG), TypeEntry.HighInReg)
-            .addImm(0).addReg(Zero32).addImm(X86::sub_32bit);
-      }
+            .addReg(ZeroReg);
     }
   }
   // Generate the DIV/IDIV instruction.
@@ -3708,6 +3704,9 @@ unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) {
 
   uint64_t Imm = CI->getZExtValue();
   if (Imm == 0) {
+    if (VT.SimpleTy == MVT::i64)
+      return fastEmitInst_(X86::MOV64r0, &X86::GR64RegClass);
+
     unsigned SrcReg = fastEmitInst_(X86::MOV32r0, &X86::GR32RegClass);
     switch (VT.SimpleTy) {
     default: llvm_unreachable("Unexpected value type");
@@ -3720,13 +3719,6 @@ unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) {
                                         X86::sub_16bit);
     case MVT::i32:
       return SrcReg;
-    case MVT::i64: {
-      unsigned ResultReg = createResultReg(&X86::GR64RegClass);
-      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-              TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg)
-        .addImm(0).addReg(SrcReg).addImm(X86::sub_32bit);
-      return ResultReg;
-    }
     }
   }
 
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 4b803c5a81b..83d5be34dc7 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -3569,7 +3569,10 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
           SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0);
       } else {
         // Zero out the high part, effectively zero extending the input.
-        SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0);
+        unsigned ClrOpc = NVT.SimpleTy == MVT::i64 ? X86::MOV64r0
+                                                   : X86::MOV32r0;
+        MVT ClrVT = NVT.SimpleTy == MVT::i64 ? MVT::i64 : MVT::i32;
+        SDValue ClrNode = SDValue(CurDAG->getMachineNode(ClrOpc, dl, ClrVT), 0);
         switch (NVT.SimpleTy) {
         case MVT::i16:
           ClrNode =
@@ -3580,15 +3583,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
                       0);
           break;
         case MVT::i32:
-          break;
         case MVT::i64:
-          ClrNode =
-              SDValue(CurDAG->getMachineNode(
-                          TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
-                          CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
-                          CurDAG->getTargetConstant(X86::sub_32bit, dl,
-                                                    MVT::i32)),
-                      0);
           break;
         default:
           llvm_unreachable("Unexpected division source");
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index 051832bf4bc..11d6edd55d4 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -275,16 +275,18 @@ def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins), "", []>;
 // Alias instruction mapping movr0 to xor.
 // FIXME: remove when we can teach regalloc that xor reg, reg is ok.
 let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1,
-    isPseudo = 1, AddedComplexity = 10 in
+    isPseudo = 1, AddedComplexity = 10 in {
 def MOV32r0  : I<0, Pseudo, (outs GR32:$dst), (ins), "",
                  [(set GR32:$dst, 0)]>, Sched<[WriteZero]>;
+def MOV64r0  : I<0, Pseudo, (outs GR64:$dst), (ins), "",
+                 [(set GR64:$dst, 0)]>, Sched<[WriteZero]>;
+}
 
 // Other widths can also make use of the 32-bit xor, which may have a smaller
 // encoding and avoid partial register updates.
 let AddedComplexity = 10 in {
 def : Pat<(i8 0), (EXTRACT_SUBREG (MOV32r0), sub_8bit)>;
 def : Pat<(i16 0), (EXTRACT_SUBREG (MOV32r0), sub_16bit)>;
-def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)>;
 }
 
 let Predicates = [OptForSize, Not64BitMode],
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 1eddb27847d..e62c8403693 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -683,8 +683,10 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
   if (ClobbersEFLAGS && !isSafeToClobberEFLAGS(MBB, I)) {
     // The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side
     // effects.
+    unsigned NewOpc = X86::MOV32ri;
     int Value;
     switch (Orig.getOpcode()) {
+    case X86::MOV64r0:  NewOpc = X86::MOV32ri64; Value = 0; break;
     case X86::MOV32r0:  Value = 0; break;
     case X86::MOV32r1:  Value = 1; break;
     case X86::MOV32r_1: Value = -1; break;
@@ -693,7 +695,7 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
     }
 
     const DebugLoc &DL = Orig.getDebugLoc();
-    BuildMI(MBB, I, DL, get(X86::MOV32ri))
+    BuildMI(MBB, I, DL, get(NewOpc))
         .add(Orig.getOperand(0))
         .addImm(Value);
   } else {
@@ -3750,7 +3752,9 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
       // MOV32r0 etc. are implemented with xor which clobbers condition code.
       // They are safe to move up, if the definition to EFLAGS is dead and
       // earlier instructions do not read or write EFLAGS.
-      if (!Movr0Inst && Instr.getOpcode() == X86::MOV32r0 &&
+      if (!Movr0Inst &&
+          (Instr.getOpcode() == X86::MOV32r0 ||
+           Instr.getOpcode() == X86::MOV64r0) &&
           Instr.registerDefIsDead(X86::EFLAGS, TRI)) {
         Movr0Inst = &Instr;
         continue;
@@ -4155,6 +4159,15 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   switch (MI.getOpcode()) {
   case X86::MOV32r0:
     return Expand2AddrUndef(MIB, get(X86::XOR32rr));
+  case X86::MOV64r0: {
+    const TargetRegisterInfo *TRI = &getRegisterInfo();
+    unsigned Reg = MIB->getOperand(0).getReg();
+    unsigned Reg32 = TRI->getSubReg(Reg, X86::sub_32bit);
+    MIB->getOperand(0).setReg(Reg32);
+    Expand2AddrUndef(MIB, get(X86::XOR32rr));
+    MIB.addReg(Reg, RegState::ImplicitDefine);
+    return true;
+  }
   case X86::MOV32r1:
     return expandMOV32r1(MIB, *this, /*MinusOne=*/ false);
   case X86::MOV32r_1:
@@ -4898,8 +4911,10 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
     isTwoAddrFold = true;
   } else {
     if (OpNum == 0) {
-      if (MI.getOpcode() == X86::MOV32r0) {
-        NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, InsertPt, MI);
+      if (MI.getOpcode() == X86::MOV32r0 || MI.getOpcode() == X86::MOV64r0) {
+        unsigned NewOpc = MI.getOpcode() == X86::MOV64r0 ? X86::MOV64mi32
+                                                         : X86::MOV32mi;
+        NewMI = MakeM0Inst(*this, NewOpc, MOs, InsertPt, MI);
         if (NewMI)
           return NewMI;
       }
diff --git a/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/lib/Target/X86/X86SpeculativeLoadHardening.cpp
index 14e4c455a08..20997ecc07d 100644
--- a/lib/Target/X86/X86SpeculativeLoadHardening.cpp
+++ b/lib/Target/X86/X86SpeculativeLoadHardening.cpp
@@ -487,20 +487,14 @@ bool X86SpeculativeLoadHardeningPass::runOnMachineFunction(
     // Otherwise, just build the predicate state itself by zeroing a register
     // as we don't need any initial state.
     PS->InitialReg = MRI->createVirtualRegister(PS->RC);
-    unsigned PredStateSubReg = MRI->createVirtualRegister(&X86::GR32RegClass);
-    auto ZeroI = BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV32r0),
-                         PredStateSubReg);
+    auto ZeroI = BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV64r0),
+                         PS->InitialReg);
     ++NumInstsInserted;
     MachineOperand *ZeroEFLAGSDefOp =
         ZeroI->findRegisterDefOperand(X86::EFLAGS);
     assert(ZeroEFLAGSDefOp && ZeroEFLAGSDefOp->isImplicit() &&
            "Must have an implicit def of EFLAGS!");
     ZeroEFLAGSDefOp->setIsDead(true);
-    BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::SUBREG_TO_REG),
-            PS->InitialReg)
-        .addImm(0)
-        .addReg(PredStateSubReg)
-        .addImm(X86::sub_32bit);
   }
 
   // We're going to need to trace predicate state throughout the function's
diff --git a/test/CodeGen/X86/GlobalISel/constant.ll b/test/CodeGen/X86/GlobalISel/constant.ll
index f6ebb70fcf5..2043c60f499 100644
--- a/test/CodeGen/X86/GlobalISel/constant.ll
+++ b/test/CodeGen/X86/GlobalISel/constant.ll
@@ -54,7 +54,7 @@ define i64 @const_i64_i32() {
 define void @main(i32 ** %data) {
 ; ALL-LABEL: main:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    movq $0, %rax
+; ALL-NEXT:    xorl %eax, %eax
 ; ALL-NEXT:    movq %rax, (%rdi)
 ; ALL-NEXT:    retq
   store i32* null, i32** %data, align 8
diff --git a/test/CodeGen/X86/avg.ll b/test/CodeGen/X86/avg.ll
index 84f1296d51c..c4b15070bad 100644
--- a/test/CodeGen/X86/avg.ll
+++ b/test/CodeGen/X86/avg.ll
@@ -2141,7 +2141,7 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX1-NEXT:    pushq %r13
 ; AVX1-NEXT:    pushq %r12
 ; AVX1-NEXT:    pushq %rbx
-; AVX1-NEXT:    subq $24, %rsp
+; AVX1-NEXT:    subq $16, %rsp
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
@@ -2152,12 +2152,12 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX1-NEXT:    vmovq %xmm5, %rbp
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm4, %rsi
-; AVX1-NEXT:    vmovq %xmm4, %rcx
+; AVX1-NEXT:    vpextrq $1, %xmm4, %rcx
+; AVX1-NEXT:    vmovq %xmm4, %rsi
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm4, %r8
+; AVX1-NEXT:    vpextrq $1, %xmm4, %r10
 ; AVX1-NEXT:    vmovq %xmm4, %r11
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
@@ -2166,7 +2166,7 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero
 ; AVX1-NEXT:    vpextrq $1, %xmm4, %r15
-; AVX1-NEXT:    vmovq %xmm4, %rdi
+; AVX1-NEXT:    vmovq %xmm4, %rdx
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
 ; AVX1-NEXT:    vpextrq $1, %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
@@ -2175,27 +2175,28 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
 ; AVX1-NEXT:    vpextrq $1, %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX1-NEXT:    vmovq %xmm3, %r10
+; AVX1-NEXT:    vmovq %xmm3, %r9
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm4, %rdx
-; AVX1-NEXT:    addq %rbx, %rdx
-; AVX1-NEXT:    vmovq %xmm4, %r9
-; AVX1-NEXT:    addq %rbp, %r9
+; AVX1-NEXT:    vpextrq $1, %xmm4, %r14
+; AVX1-NEXT:    addq %rbx, %r14
+; AVX1-NEXT:    vmovq %xmm4, %r8
+; AVX1-NEXT:    addq %rbp, %r8
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm3, %rax
+; AVX1-NEXT:    vpextrq $1, %xmm3, %rdi
+; AVX1-NEXT:    addq %rcx, %rdi
+; AVX1-NEXT:    vmovq %xmm3, %rax
 ; AVX1-NEXT:    addq %rsi, %rax
-; AVX1-NEXT:    movq %rax, %r14
-; AVX1-NEXT:    vmovq %xmm3, %rbp
-; AVX1-NEXT:    addq %rcx, %rbp
+; AVX1-NEXT:    movq %rax, %rsi
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm3, %rsi
-; AVX1-NEXT:    addq %r8, %rsi
+; AVX1-NEXT:    vpextrq $1, %xmm3, %rax
+; AVX1-NEXT:    addq %r10, %rax
+; AVX1-NEXT:    movq %rax, %r10
 ; AVX1-NEXT:    vmovq %xmm3, %rax
 ; AVX1-NEXT:    addq %r11, %rax
 ; AVX1-NEXT:    movq %rax, %r11
@@ -2203,17 +2204,17 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
 ; AVX1-NEXT:    vpextrq $1, %xmm2, %rax
 ; AVX1-NEXT:    addq %r13, %rax
-; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    movq %rax, %rbx
 ; AVX1-NEXT:    vmovq %xmm2, %rax
 ; AVX1-NEXT:    addq %r12, %rax
-; AVX1-NEXT:    movq %rax, %r8
+; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
 ; AVX1-NEXT:    vpextrq $1, %xmm3, %rax
 ; AVX1-NEXT:    addq %r15, %rax
-; AVX1-NEXT:    movq %rax, %rbx
+; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX1-NEXT:    vmovq %xmm3, %rax
-; AVX1-NEXT:    addq %rdi, %rax
+; AVX1-NEXT:    addq %rdx, %rax
 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
@@ -2226,41 +2227,40 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX1-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    vmovq %xmm2, %r12
-; AVX1-NEXT:    addq %r10, %r12
+; AVX1-NEXT:    vpextrq $1, %xmm2, %rbp
+; AVX1-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; AVX1-NEXT:    vmovq %xmm2, %r15
+; AVX1-NEXT:    addq %r9, %r15
 ; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm0, %r10
-; AVX1-NEXT:    addq %rax, %r10
-; AVX1-NEXT:    vmovq %xmm1, %rax
-; AVX1-NEXT:    vmovq %xmm0, %rdi
-; AVX1-NEXT:    addq %rax, %rdi
-; AVX1-NEXT:    addq $-1, %rdx
-; AVX1-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    vpextrq $1, %xmm0, %r9
+; AVX1-NEXT:    addq %rax, %r9
+; AVX1-NEXT:    vmovq %xmm1, %rcx
+; AVX1-NEXT:    vmovq %xmm0, %rdx
+; AVX1-NEXT:    addq %rcx, %rdx
+; AVX1-NEXT:    addq $-1, %r14
+; AVX1-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX1-NEXT:    movl $0, %eax
 ; AVX1-NEXT:    adcq $-1, %rax
 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    addq $-1, %r9
-; AVX1-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    addq $-1, %r8
+; AVX1-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX1-NEXT:    movl $0, %eax
 ; AVX1-NEXT:    adcq $-1, %rax
-; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    addq $-1, %r14
-; AVX1-NEXT:    movq %r14, (%rsp) # 8-byte Spill
+; AVX1-NEXT:    movq %rax, (%rsp) # 8-byte Spill
+; AVX1-NEXT:    addq $-1, %rdi
+; AVX1-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX1-NEXT:    movl $0, %eax
 ; AVX1-NEXT:    adcq $-1, %rax
 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    addq $-1, %rbp
-; AVX1-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    addq $-1, %rsi
+; AVX1-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX1-NEXT:    movl $0, %eax
 ; AVX1-NEXT:    adcq $-1, %rax
 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    addq $-1, %rsi
-; AVX1-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    addq $-1, %r10
+; AVX1-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX1-NEXT:    movl $0, %eax
 ; AVX1-NEXT:    adcq $-1, %rax
 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -2269,93 +2269,90 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX1-NEXT:    movl $0, %eax
 ; AVX1-NEXT:    adcq $-1, %rax
 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    addq $-1, %rcx
-; AVX1-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    movl $0, %ebp
-; AVX1-NEXT:    adcq $-1, %rbp
-; AVX1-NEXT:    addq $-1, %r8
-; AVX1-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    movl $0, %r15d
-; AVX1-NEXT:    adcq $-1, %r15
 ; AVX1-NEXT:    addq $-1, %rbx
 ; AVX1-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX1-NEXT:    movl $0, %eax
 ; AVX1-NEXT:    adcq $-1, %rax
 ; AVX1-NEXT:    movq %rax, %rsi
 ; AVX1-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX1-NEXT:    movl $0, %r12d
+; AVX1-NEXT:    adcq $-1, %r12
+; AVX1-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 ; AVX1-NEXT:    movl $0, %r13d
 ; AVX1-NEXT:    adcq $-1, %r13
 ; AVX1-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 ; AVX1-NEXT:    movl $0, %r14d
 ; AVX1-NEXT:    adcq $-1, %r14
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX1-NEXT:    addq $-1, %rdx
-; AVX1-NEXT:    movl $0, %r11d
-; AVX1-NEXT:    adcq $-1, %r11
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX1-NEXT:    addq $-1, %rax
+; AVX1-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 ; AVX1-NEXT:    movl $0, %ebx
 ; AVX1-NEXT:    adcq $-1, %rbx
-; AVX1-NEXT:    addq $-1, %r12
-; AVX1-NEXT:    movl $0, %r9d
-; AVX1-NEXT:    adcq $-1, %r9
-; AVX1-NEXT:    addq $-1, %r10
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX1-NEXT:    addq $-1, %rcx
+; AVX1-NEXT:    movl $0, %r11d
+; AVX1-NEXT:    adcq $-1, %r11
+; AVX1-NEXT:    addq $-1, %rbp
+; AVX1-NEXT:    movl $0, %r10d
+; AVX1-NEXT:    adcq $-1, %r10
+; AVX1-NEXT:    addq $-1, %r15
 ; AVX1-NEXT:    movl $0, %r8d
 ; AVX1-NEXT:    adcq $-1, %r8
-; AVX1-NEXT:    addq $-1, %rdi
-; AVX1-NEXT:    movl $0, %ecx
-; AVX1-NEXT:    adcq $-1, %rcx
-; AVX1-NEXT:    shldq $63, %rdi, %rcx
-; AVX1-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    shldq $63, %r10, %r8
-; AVX1-NEXT:    shldq $63, %r12, %r9
-; AVX1-NEXT:    shldq $63, %rax, %rbx
-; AVX1-NEXT:    shldq $63, %rdx, %r11
+; AVX1-NEXT:    addq $-1, %r9
+; AVX1-NEXT:    movl $0, %edi
+; AVX1-NEXT:    adcq $-1, %rdi
+; AVX1-NEXT:    addq $-1, %rdx
+; AVX1-NEXT:    movl $0, %eax
+; AVX1-NEXT:    adcq $-1, %rax
+; AVX1-NEXT:    shldq $63, %rdx, %rax
+; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    shldq $63, %r9, %rdi
+; AVX1-NEXT:    shldq $63, %r15, %r8
+; AVX1-NEXT:    shldq $63, %rbp, %r10
+; AVX1-NEXT:    shldq $63, %rcx, %r11
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX1-NEXT:    shldq $63, %rdx, %rbx
 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
 ; AVX1-NEXT:    shldq $63, %rdx, %r14
 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
 ; AVX1-NEXT:    shldq $63, %rdx, %r13
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX1-NEXT:    shldq $63, %rdx, %r12
 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; AVX1-NEXT:    shldq $63, %rax, %rsi
 ; AVX1-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %rax, %r15
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %rax, %rbp
 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; AVX1-NEXT:    shldq $63, %rax, %rsi
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %rax, %rcx
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %rax, %rdi
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; AVX1-NEXT:    movq (%rsp), %rax # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %rax, %r12
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX1-NEXT:    shldq $63, %rax, %r15
 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %rax, %r10
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX1-NEXT:    shldq $63, %rcx, %rax
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX1-NEXT:    shldq $63, %rcx, %r9
+; AVX1-NEXT:    movq (%rsp), %rcx # 8-byte Reload
 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %rdx, %rax
-; AVX1-NEXT:    vmovq %rax, %xmm8
-; AVX1-NEXT:    vmovq %r10, %xmm0
-; AVX1-NEXT:    vmovq %r12, %xmm1
-; AVX1-NEXT:    vmovq %rdi, %xmm11
-; AVX1-NEXT:    vmovq %rcx, %xmm2
+; AVX1-NEXT:    shldq $63, %rdx, %rcx
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; AVX1-NEXT:    shldq $63, %rbp, %rdx
+; AVX1-NEXT:    vmovq %rdx, %xmm8
+; AVX1-NEXT:    vmovq %rcx, %xmm0
+; AVX1-NEXT:    vmovq %r9, %xmm1
+; AVX1-NEXT:    vmovq %rax, %xmm11
+; AVX1-NEXT:    vmovq %r15, %xmm2
 ; AVX1-NEXT:    vmovq %rsi, %xmm13
-; AVX1-NEXT:    vmovq %rbp, %xmm14
-; AVX1-NEXT:    vmovq %r15, %xmm15
-; AVX1-NEXT:    vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 8-byte Folded Reload
-; AVX1-NEXT:    # xmm9 = mem[0],zero
-; AVX1-NEXT:    vmovq %r13, %xmm10
-; AVX1-NEXT:    vmovq %r14, %xmm12
+; AVX1-NEXT:    vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 8-byte Folded Reload
+; AVX1-NEXT:    # xmm14 = mem[0],zero
+; AVX1-NEXT:    vmovq %r12, %xmm15
+; AVX1-NEXT:    vmovq %r13, %xmm9
+; AVX1-NEXT:    vmovq %r14, %xmm10
+; AVX1-NEXT:    vmovq %rbx, %xmm12
 ; AVX1-NEXT:    vmovq %r11, %xmm3
-; AVX1-NEXT:    vmovq %rbx, %xmm4
-; AVX1-NEXT:    vmovq %r9, %xmm5
-; AVX1-NEXT:    vmovq %r8, %xmm6
+; AVX1-NEXT:    vmovq %r10, %xmm4
+; AVX1-NEXT:    vmovq %r8, %xmm5
+; AVX1-NEXT:    vmovq %rdi, %xmm6
 ; AVX1-NEXT:    vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 8-byte Folded Reload
 ; AVX1-NEXT:    # xmm7 = mem[0],zero
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm8 = xmm0[0],xmm8[0]
@@ -2382,7 +2379,7 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX1-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
-; AVX1-NEXT:    addq $24, %rsp
+; AVX1-NEXT:    addq $16, %rsp
 ; AVX1-NEXT:    popq %rbx
 ; AVX1-NEXT:    popq %r12
 ; AVX1-NEXT:    popq %r13
@@ -2407,15 +2404,15 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
 ; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
 ; AVX2-NEXT:    vpextrq $1, %xmm4, %rbx
-; AVX2-NEXT:    vmovq %xmm4, %rbp
+; AVX2-NEXT:    vmovq %xmm4, %rdx
 ; AVX2-NEXT:    vpextrq $1, %xmm3, %rdi
 ; AVX2-NEXT:    vmovq %xmm3, %rcx
 ; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
 ; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX2-NEXT:    vpextrq $1, %xmm3, %rdx
-; AVX2-NEXT:    vmovq %xmm3, %r9
-; AVX2-NEXT:    vpextrq $1, %xmm2, %r11
+; AVX2-NEXT:    vpextrq $1, %xmm3, %r9
+; AVX2-NEXT:    vmovq %xmm3, %r10
+; AVX2-NEXT:    vpextrq $1, %xmm2, %r13
 ; AVX2-NEXT:    vmovq %xmm2, %r12
 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
@@ -2433,26 +2430,26 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
 ; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
-; AVX2-NEXT:    vpextrq $1, %xmm4, %rax
-; AVX2-NEXT:    addq %rbx, %rax
-; AVX2-NEXT:    movq %rax, %rbx
-; AVX2-NEXT:    vmovq %xmm4, %r13
-; AVX2-NEXT:    addq %rbp, %r13
-; AVX2-NEXT:    vpextrq $1, %xmm3, %r10
-; AVX2-NEXT:    addq %rdi, %r10
+; AVX2-NEXT:    vpextrq $1, %xmm4, %rbp
+; AVX2-NEXT:    addq %rbx, %rbp
+; AVX2-NEXT:    vmovq %xmm4, %rax
+; AVX2-NEXT:    addq %rdx, %rax
+; AVX2-NEXT:    movq %rax, %r11
+; AVX2-NEXT:    vpextrq $1, %xmm3, %r8
+; AVX2-NEXT:    addq %rdi, %r8
 ; AVX2-NEXT:    vmovq %xmm3, %r14
 ; AVX2-NEXT:    addq %rcx, %r14
 ; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
 ; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX2-NEXT:    vpextrq $1, %xmm3, %rax
-; AVX2-NEXT:    addq %rdx, %rax
-; AVX2-NEXT:    movq %rax, %rcx
-; AVX2-NEXT:    vmovq %xmm3, %r8
-; AVX2-NEXT:    addq %r9, %r8
-; AVX2-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX2-NEXT:    addq %r11, %rax
-; AVX2-NEXT:    movq %rax, %r11
+; AVX2-NEXT:    addq %r9, %rax
+; AVX2-NEXT:    movq %rax, %rbx
+; AVX2-NEXT:    vmovq %xmm3, %rax
+; AVX2-NEXT:    addq %r10, %rax
+; AVX2-NEXT:    movq %rax, %r10
+; AVX2-NEXT:    vpextrq $1, %xmm2, %rcx
+; AVX2-NEXT:    addq %r13, %rcx
 ; AVX2-NEXT:    vmovq %xmm2, %rax
 ; AVX2-NEXT:    addq %r12, %rax
 ; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -2474,8 +2471,8 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT:    vpextrq $1, %xmm2, %rbp
-; AVX2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; AVX2-NEXT:    vpextrq $1, %xmm2, %r12
+; AVX2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
 ; AVX2-NEXT:    vmovq %xmm2, %r9
 ; AVX2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
 ; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
@@ -2484,36 +2481,36 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX2-NEXT:    vmovq %xmm1, %rdx
 ; AVX2-NEXT:    vmovq %xmm0, %rsi
 ; AVX2-NEXT:    addq %rdx, %rsi
-; AVX2-NEXT:    addq $-1, %rbx
-; AVX2-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    addq $-1, %rbp
+; AVX2-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX2-NEXT:    movl $0, %eax
 ; AVX2-NEXT:    adcq $-1, %rax
 ; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    addq $-1, %r13
-; AVX2-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    addq $-1, %r11
+; AVX2-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX2-NEXT:    movl $0, %eax
 ; AVX2-NEXT:    adcq $-1, %rax
 ; AVX2-NEXT:    movq %rax, (%rsp) # 8-byte Spill
-; AVX2-NEXT:    addq $-1, %r10
-; AVX2-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    addq $-1, %r8
+; AVX2-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX2-NEXT:    movl $0, %eax
 ; AVX2-NEXT:    adcq $-1, %rax
 ; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX2-NEXT:    addq $-1, %r14
 ; AVX2-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movl $0, %r13d
-; AVX2-NEXT:    adcq $-1, %r13
-; AVX2-NEXT:    addq $-1, %rcx
-; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movl $0, %ebp
+; AVX2-NEXT:    adcq $-1, %rbp
+; AVX2-NEXT:    addq $-1, %rbx
+; AVX2-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX2-NEXT:    movl $0, %eax
 ; AVX2-NEXT:    adcq $-1, %rax
 ; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    addq $-1, %r8
-; AVX2-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    addq $-1, %r10
+; AVX2-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX2-NEXT:    movl $0, %r15d
 ; AVX2-NEXT:    adcq $-1, %r15
-; AVX2-NEXT:    addq $-1, %r11
-; AVX2-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    addq $-1, %rcx
+; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX2-NEXT:    movl $0, %ebx
 ; AVX2-NEXT:    adcq $-1, %rbx
 ; AVX2-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
@@ -2528,13 +2525,13 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX2-NEXT:    adcq $-1, %rax
 ; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX2-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT:    movl $0, %r12d
-; AVX2-NEXT:    adcq $-1, %r12
+; AVX2-NEXT:    movl $0, %r13d
+; AVX2-NEXT:    adcq $-1, %r13
 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; AVX2-NEXT:    addq $-1, %rcx
 ; AVX2-NEXT:    movl $0, %r11d
 ; AVX2-NEXT:    adcq $-1, %r11
-; AVX2-NEXT:    addq $-1, %rbp
+; AVX2-NEXT:    addq $-1, %r12
 ; AVX2-NEXT:    movl $0, %r14d
 ; AVX2-NEXT:    adcq $-1, %r14
 ; AVX2-NEXT:    addq $-1, %r9
@@ -2550,10 +2547,10 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX2-NEXT:    shldq $63, %rdi, %rdx
 ; AVX2-NEXT:    shldq $63, %r9, %r10
-; AVX2-NEXT:    shldq $63, %rbp, %r14
+; AVX2-NEXT:    shldq $63, %r12, %r14
 ; AVX2-NEXT:    shldq $63, %rcx, %r11
 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT:    shldq $63, %rcx, %r12
+; AVX2-NEXT:    shldq $63, %rcx, %r13
 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; AVX2-NEXT:    shldq $63, %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
@@ -2569,10 +2566,10 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; AVX2-NEXT:    shldq $63, %rcx, %rax
 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT:    shldq $63, %rcx, %r13
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; AVX2-NEXT:    shldq $63, %rcx, %rbp
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX2-NEXT:    shldq $63, %rcx, %r12
 ; AVX2-NEXT:    movq (%rsp), %rdi # 8-byte Reload
 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; AVX2-NEXT:    shldq $63, %rcx, %rdi
@@ -2581,8 +2578,8 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX2-NEXT:    shldq $63, %rcx, %rsi
 ; AVX2-NEXT:    vmovq %rsi, %xmm8
 ; AVX2-NEXT:    vmovq %rdi, %xmm9
-; AVX2-NEXT:    vmovq %rbp, %xmm10
-; AVX2-NEXT:    vmovq %r13, %xmm11
+; AVX2-NEXT:    vmovq %r12, %xmm10
+; AVX2-NEXT:    vmovq %rbp, %xmm11
 ; AVX2-NEXT:    vmovq %rax, %xmm12
 ; AVX2-NEXT:    vmovq %r15, %xmm13
 ; AVX2-NEXT:    vmovq %rbx, %xmm14
@@ -2590,7 +2587,7 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX2-NEXT:    vmovq %r9, %xmm0
 ; AVX2-NEXT:    vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Folded Reload
 ; AVX2-NEXT:    # xmm1 = mem[0],zero
-; AVX2-NEXT:    vmovq %r12, %xmm2
+; AVX2-NEXT:    vmovq %r13, %xmm2
 ; AVX2-NEXT:    vmovq %r11, %xmm3
 ; AVX2-NEXT:    vmovq %r14, %xmm4
 ; AVX2-NEXT:    vmovq %r10, %xmm5
@@ -2647,7 +2644,7 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX512-NEXT:    pushq %r13
 ; AVX512-NEXT:    pushq %r12
 ; AVX512-NEXT:    pushq %rbx
-; AVX512-NEXT:    subq $24, %rsp
+; AVX512-NEXT:    subq $16, %rsp
 ; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
 ; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
 ; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
@@ -2660,8 +2657,8 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX512-NEXT:    vmovq %xmm3, %rsi
 ; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
 ; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512-NEXT:    vpextrq $1, %xmm3, %rdx
-; AVX512-NEXT:    vmovq %xmm3, %r8
+; AVX512-NEXT:    vpextrq $1, %xmm3, %rcx
+; AVX512-NEXT:    vmovq %xmm3, %r10
 ; AVX512-NEXT:    vpextrq $1, %xmm2, %r13
 ; AVX512-NEXT:    vmovq %xmm2, %r12
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
@@ -2669,7 +2666,7 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
 ; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
 ; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512-NEXT:    vpextrq $1, %xmm3, %r15
+; AVX512-NEXT:    vpextrq $1, %xmm3, %rdx
 ; AVX512-NEXT:    vmovq %xmm3, %r14
 ; AVX512-NEXT:    vpextrq $1, %xmm2, %r9
 ; AVX512-NEXT:    vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
@@ -2681,35 +2678,34 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
 ; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm4
-; AVX512-NEXT:    vpextrq $1, %xmm4, %rax
-; AVX512-NEXT:    addq %rbx, %rax
-; AVX512-NEXT:    movq %rax, %rbx
+; AVX512-NEXT:    vpextrq $1, %xmm4, %r11
+; AVX512-NEXT:    addq %rbx, %r11
 ; AVX512-NEXT:    vmovq %xmm4, %rax
 ; AVX512-NEXT:    addq %rbp, %rax
-; AVX512-NEXT:    movq %rax, %rbp
-; AVX512-NEXT:    vpextrq $1, %xmm3, %rax
-; AVX512-NEXT:    addq %rdi, %rax
-; AVX512-NEXT:    movq %rax, %rdi
-; AVX512-NEXT:    vmovq %xmm3, %r10
-; AVX512-NEXT:    addq %rsi, %r10
+; AVX512-NEXT:    movq %rax, %rbx
+; AVX512-NEXT:    vpextrq $1, %xmm3, %r8
+; AVX512-NEXT:    addq %rdi, %r8
+; AVX512-NEXT:    vmovq %xmm3, %r15
+; AVX512-NEXT:    addq %rsi, %r15
 ; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
 ; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512-NEXT:    vpextrq $1, %xmm3, %rcx
-; AVX512-NEXT:    addq %rdx, %rcx
+; AVX512-NEXT:    vpextrq $1, %xmm3, %rdi
+; AVX512-NEXT:    addq %rcx, %rdi
 ; AVX512-NEXT:    vmovq %xmm3, %rax
-; AVX512-NEXT:    addq %r8, %rax
-; AVX512-NEXT:    movq %rax, %r8
+; AVX512-NEXT:    addq %r10, %rax
+; AVX512-NEXT:    movq %rax, %r10
 ; AVX512-NEXT:    vpextrq $1, %xmm2, %rsi
 ; AVX512-NEXT:    addq %r13, %rsi
-; AVX512-NEXT:    vmovq %xmm2, %r11
-; AVX512-NEXT:    addq %r12, %r11
+; AVX512-NEXT:    vmovq %xmm2, %rax
+; AVX512-NEXT:    addq %r12, %rax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm1
 ; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
 ; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX512-NEXT:    vpextrq $1, %xmm3, %rax
-; AVX512-NEXT:    addq %r15, %rax
+; AVX512-NEXT:    addq %rdx, %rax
 ; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX512-NEXT:    vmovq %xmm3, %rax
 ; AVX512-NEXT:    addq %r14, %rax
@@ -2722,24 +2718,33 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
 ; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX512-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    vpextrq $1, %xmm2, %rbp
+; AVX512-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
 ; AVX512-NEXT:    vmovq %xmm2, %r14
 ; AVX512-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
 ; AVX512-NEXT:    vpextrq $1, %xmm0, %rax
 ; AVX512-NEXT:    vpextrq $1, %xmm1, %r9
 ; AVX512-NEXT:    addq %rax, %r9
-; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    vmovq %xmm0, %rcx
 ; AVX512-NEXT:    vmovq %xmm1, %rdx
-; AVX512-NEXT:    addq %rax, %rdx
+; AVX512-NEXT:    addq %rcx, %rdx
+; AVX512-NEXT:    addq $-1, %r11
+; AVX512-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movl $0, %eax
+; AVX512-NEXT:    adcq $-1, %rax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX512-NEXT:    addq $-1, %rbx
 ; AVX512-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX512-NEXT:    movl $0, %eax
 ; AVX512-NEXT:    adcq $-1, %rax
+; AVX512-NEXT:    movq %rax, (%rsp) # 8-byte Spill
+; AVX512-NEXT:    addq $-1, %r8
+; AVX512-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movl $0, %eax
+; AVX512-NEXT:    adcq $-1, %rax
 ; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    addq $-1, %rbp
-; AVX512-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    addq $-1, %r15
+; AVX512-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX512-NEXT:    movl $0, %eax
 ; AVX512-NEXT:    adcq $-1, %rax
 ; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -2747,108 +2752,94 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX512-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX512-NEXT:    movl $0, %eax
 ; AVX512-NEXT:    adcq $-1, %rax
-; AVX512-NEXT:    movq %rax, (%rsp) # 8-byte Spill
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX512-NEXT:    addq $-1, %r10
 ; AVX512-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX512-NEXT:    movl $0, %eax
 ; AVX512-NEXT:    adcq $-1, %rax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    addq $-1, %rcx
-; AVX512-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movl $0, %eax
-; AVX512-NEXT:    adcq $-1, %rax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    addq $-1, %r8
-; AVX512-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movl $0, %eax
-; AVX512-NEXT:    adcq $-1, %rax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq %rax, %rcx
 ; AVX512-NEXT:    addq $-1, %rsi
 ; AVX512-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movl $0, %r13d
-; AVX512-NEXT:    adcq $-1, %r13
-; AVX512-NEXT:    addq $-1, %r11
-; AVX512-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movl $0, %r15d
-; AVX512-NEXT:    adcq $-1, %r15
-; AVX512-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT:    movl $0, %eax
-; AVX512-NEXT:    adcq $-1, %rax
-; AVX512-NEXT:    movq %rax, %rsi
-; AVX512-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 ; AVX512-NEXT:    movl $0, %r12d
 ; AVX512-NEXT:    adcq $-1, %r12
 ; AVX512-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 ; AVX512-NEXT:    movl $0, %ebx
 ; AVX512-NEXT:    adcq $-1, %rbx
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; AVX512-NEXT:    addq $-1, %rbp
+; AVX512-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT:    movl $0, %r13d
+; AVX512-NEXT:    adcq $-1, %r13
+; AVX512-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT:    movl $0, %r15d
+; AVX512-NEXT:    adcq $-1, %r15
+; AVX512-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 ; AVX512-NEXT:    movl $0, %r11d
 ; AVX512-NEXT:    adcq $-1, %r11
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT:    addq $-1, %rax
+; AVX512-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT:    movl $0, %r8d
+; AVX512-NEXT:    adcq $-1, %r8
+; AVX512-NEXT:    addq $-1, %rbp
 ; AVX512-NEXT:    movl $0, %r10d
 ; AVX512-NEXT:    adcq $-1, %r10
 ; AVX512-NEXT:    addq $-1, %r14
-; AVX512-NEXT:    movl $0, %r8d
-; AVX512-NEXT:    adcq $-1, %r8
-; AVX512-NEXT:    addq $-1, %r9
 ; AVX512-NEXT:    movl $0, %edi
 ; AVX512-NEXT:    adcq $-1, %rdi
+; AVX512-NEXT:    addq $-1, %r9
+; AVX512-NEXT:    movl $0, %esi
+; AVX512-NEXT:    adcq $-1, %rsi
 ; AVX512-NEXT:    addq $-1, %rdx
-; AVX512-NEXT:    movl $0, %ecx
-; AVX512-NEXT:    adcq $-1, %rcx
-; AVX512-NEXT:    shldq $63, %rdx, %rcx
-; AVX512-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    shldq $63, %r9, %rdi
-; AVX512-NEXT:    shldq $63, %r14, %r8
-; AVX512-NEXT:    shldq $63, %rax, %r10
-; AVX512-NEXT:    shldq $63, %rbp, %r11
+; AVX512-NEXT:    movl $0, %eax
+; AVX512-NEXT:    adcq $-1, %rax
+; AVX512-NEXT:    shldq $63, %rdx, %rax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    shldq $63, %r9, %rsi
+; AVX512-NEXT:    shldq $63, %r14, %rdi
+; AVX512-NEXT:    shldq $63, %rbp, %r10
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX512-NEXT:    shldq $63, %rdx, %rbx
+; AVX512-NEXT:    shldq $63, %rdx, %r8
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX512-NEXT:    shldq $63, %rdx, %r12
+; AVX512-NEXT:    shldq $63, %rdx, %r11
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX512-NEXT:    shldq $63, %rdx, %rsi
-; AVX512-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    shldq $63, %rdx, %r15
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX512-NEXT:    shldq $63, %rdx, %r13
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT:    shldq $63, %rax, %r15
+; AVX512-NEXT:    shldq $63, %rax, %rbx
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT:    shldq $63, %rax, %r13
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; AVX512-NEXT:    shldq $63, %rax, %r12
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT:    shldq $63, %rax, %rsi
+; AVX512-NEXT:    shldq $63, %rax, %rcx
+; AVX512-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; AVX512-NEXT:    shldq $63, %rax, %rcx
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
 ; AVX512-NEXT:    shldq $63, %rdx, %rax
-; AVX512-NEXT:    movq (%rsp), %r14 # 8-byte Reload
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
 ; AVX512-NEXT:    shldq $63, %rdx, %r14
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; AVX512-NEXT:    movq (%rsp), %r9 # 8-byte Reload
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
 ; AVX512-NEXT:    shldq $63, %rdx, %r9
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; AVX512-NEXT:    shldq $63, %rdx, %rbp
-; AVX512-NEXT:    vmovq %rbp, %xmm8
+; AVX512-NEXT:    shldq $63, %rbp, %rdx
+; AVX512-NEXT:    vmovq %rdx, %xmm8
 ; AVX512-NEXT:    vmovq %r9, %xmm9
 ; AVX512-NEXT:    vmovq %r14, %xmm10
 ; AVX512-NEXT:    vmovq %rax, %xmm11
 ; AVX512-NEXT:    vmovq %rcx, %xmm12
-; AVX512-NEXT:    vmovq %rsi, %xmm13
-; AVX512-NEXT:    vmovq %r13, %xmm14
-; AVX512-NEXT:    vmovq %r15, %xmm15
-; AVX512-NEXT:    vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
-; AVX512-NEXT:    # xmm0 = mem[0],zero
-; AVX512-NEXT:    vmovq %r12, %xmm1
-; AVX512-NEXT:    vmovq %rbx, %xmm2
-; AVX512-NEXT:    vmovq %r11, %xmm3
+; AVX512-NEXT:    vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 8-byte Folded Reload
+; AVX512-NEXT:    # xmm13 = mem[0],zero
+; AVX512-NEXT:    vmovq %r12, %xmm14
+; AVX512-NEXT:    vmovq %rbx, %xmm15
+; AVX512-NEXT:    vmovq %r13, %xmm0
+; AVX512-NEXT:    vmovq %r15, %xmm1
+; AVX512-NEXT:    vmovq %r11, %xmm2
+; AVX512-NEXT:    vmovq %r8, %xmm3
 ; AVX512-NEXT:    vmovq %r10, %xmm4
-; AVX512-NEXT:    vmovq %r8, %xmm5
-; AVX512-NEXT:    vmovq %rdi, %xmm6
+; AVX512-NEXT:    vmovq %rdi, %xmm5
+; AVX512-NEXT:    vmovq %rsi, %xmm6
 ; AVX512-NEXT:    vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 8-byte Folded Reload
 ; AVX512-NEXT:    # xmm7 = mem[0],zero
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm8 = xmm9[0],xmm8[0]
@@ -2869,7 +2860,7 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
 ; AVX512-NEXT:    vpmovdb %zmm0, (%rax)
-; AVX512-NEXT:    addq $24, %rsp
+; AVX512-NEXT:    addq $16, %rsp
 ; AVX512-NEXT:    popq %rbx
 ; AVX512-NEXT:    popq %r12
 ; AVX512-NEXT:    popq %r13
diff --git a/test/CodeGen/X86/crash-O0.ll b/test/CodeGen/X86/crash-O0.ll
index 1a234d45cb2..deaf19daccc 100644
--- a/test/CodeGen/X86/crash-O0.ll
+++ b/test/CodeGen/X86/crash-O0.ll
@@ -77,11 +77,11 @@ define i64 @addressModeWith32bitIndex(i32 %V) {
 ; CHECK-NEXT:    movq %rsp, %rbp
 ; CHECK-NEXT:    .cfi_def_cfa_register %rbp
 ; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    movl %eax, %ecx
-; CHECK-NEXT:    movq %rcx, %rax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; CHECK-NEXT:    cqto
-; CHECK-NEXT:    movslq %edi, %rsi
-; CHECK-NEXT:    idivq (%rcx,%rsi,8)
+; CHECK-NEXT:    movslq %edi, %rcx
+; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Reload
+; CHECK-NEXT:    idivq (%rsi,%rcx,8)
 ; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    retq
   %gep = getelementptr i64, i64* null, i32 %V
diff --git a/test/CodeGen/X86/hoist-spill.ll b/test/CodeGen/X86/hoist-spill.ll
index 6a3f5ca01e8..040924a6c28 100644
--- a/test/CodeGen/X86/hoist-spill.ll
+++ b/test/CodeGen/X86/hoist-spill.ll
@@ -2,9 +2,7 @@
 
 ; Check no spills to the same stack slot after hoisting.
 ; CHECK: mov{{.}} %{{.*}}, [[SPOFFSET1:-?[0-9]*]](%rsp)
-; CHECK: mov{{.}} %{{.*}}, [[SPOFFSET2:-?[0-9]*]](%rsp)
 ; CHECK-NOT: mov{{.}} %{{.*}}, [[SPOFFSET1]](%rsp)
-; CHECK-NOT: mov{{.}} %{{.*}}, [[SPOFFSET2]](%rsp)
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/CodeGen/X86/machine-cse.ll b/test/CodeGen/X86/machine-cse.ll
index b55b43fafa5..8ce61be555f 100644
--- a/test/CodeGen/X86/machine-cse.ll
+++ b/test/CodeGen/X86/machine-cse.ll
@@ -133,25 +133,26 @@ return:
 define i8* @bsd_memchr(i8* %s, i32 %a, i32 %c, i64 %n) nounwind ssp {
 ; CHECK-LABEL: bsd_memchr:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    testq %rcx, %rcx
-; CHECK-NEXT:    je .LBB3_4
+; CHECK-NEXT:    je .LBB3_5
 ; CHECK-NEXT:  # %bb.1: # %preheader
-; CHECK-NEXT:    movq %rdi, %rax
 ; CHECK-NEXT:    movzbl %dl, %edx
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB3_2: # %do.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    cmpl %edx, %esi
-; CHECK-NEXT:    je .LBB3_5
-; CHECK-NEXT:  # %bb.3: # %do.cond
+; CHECK-NEXT:    je .LBB3_3
+; CHECK-NEXT:  # %bb.4: # %do.cond
 ; CHECK-NEXT:    # in Loop: Header=BB3_2 Depth=1
-; CHECK-NEXT:    incq %rax
+; CHECK-NEXT:    incq %rdi
 ; CHECK-NEXT:    decq %rcx
 ; CHECK-NEXT:    jne .LBB3_2
-; CHECK-NEXT:  .LBB3_4:
-; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:  .LBB3_5: # %return
 ; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB3_3:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    retq
 entry:
   %cmp = icmp eq i64 %n, 0
   br i1 %cmp, label %return, label %preheader
diff --git a/test/CodeGen/X86/madd.ll b/test/CodeGen/X86/madd.ll
index c36faecbf85..ef91981e701 100644
--- a/test/CodeGen/X86/madd.ll
+++ b/test/CodeGen/X86/madd.ll
@@ -356,7 +356,7 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea
 ; SSE2-NEXT:    xorl %ecx, %ecx
 ; SSE2-NEXT:    pxor %xmm2, %xmm2
 ; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm0
 ; SSE2-NEXT:    pxor %xmm3, %xmm3
 ; SSE2-NEXT:    .p2align 4, 0x90
 ; SSE2-NEXT:  .LBB3_1: # %vector.body
@@ -365,18 +365,18 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea
 ; SSE2-NEXT:    movdqu 16(%rdi,%rcx,2), %xmm6
 ; SSE2-NEXT:    movdqu 32(%rdi,%rcx,2), %xmm7
 ; SSE2-NEXT:    movdqu 48(%rdi,%rcx,2), %xmm9
-; SSE2-NEXT:    movdqu (%rsi,%rcx,2), %xmm0
-; SSE2-NEXT:    pmaddwd %xmm5, %xmm0
-; SSE2-NEXT:    paddd %xmm0, %xmm2
-; SSE2-NEXT:    movdqu 16(%rsi,%rcx,2), %xmm0
-; SSE2-NEXT:    pmaddwd %xmm6, %xmm0
-; SSE2-NEXT:    paddd %xmm0, %xmm4
-; SSE2-NEXT:    movdqu 32(%rsi,%rcx,2), %xmm0
-; SSE2-NEXT:    pmaddwd %xmm7, %xmm0
-; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    movdqu 48(%rsi,%rcx,2), %xmm0
-; SSE2-NEXT:    pmaddwd %xmm9, %xmm0
-; SSE2-NEXT:    paddd %xmm0, %xmm3
+; SSE2-NEXT:    movdqu (%rsi,%rcx,2), %xmm1
+; SSE2-NEXT:    pmaddwd %xmm5, %xmm1
+; SSE2-NEXT:    paddd %xmm1, %xmm2
+; SSE2-NEXT:    movdqu 16(%rsi,%rcx,2), %xmm1
+; SSE2-NEXT:    pmaddwd %xmm6, %xmm1
+; SSE2-NEXT:    paddd %xmm1, %xmm4
+; SSE2-NEXT:    movdqu 32(%rsi,%rcx,2), %xmm1
+; SSE2-NEXT:    pmaddwd %xmm7, %xmm1
+; SSE2-NEXT:    paddd %xmm1, %xmm0
+; SSE2-NEXT:    movdqu 48(%rsi,%rcx,2), %xmm1
+; SSE2-NEXT:    pmaddwd %xmm9, %xmm1
+; SSE2-NEXT:    paddd %xmm1, %xmm3
 ; SSE2-NEXT:    addq $16, %rcx
 ; SSE2-NEXT:    cmpq %rcx, %rax
 ; SSE2-NEXT:    jne .LBB3_1
@@ -385,14 +385,14 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea
 ; SSE2-NEXT:    paddd %xmm8, %xmm3
 ; SSE2-NEXT:    paddd %xmm4, %xmm3
 ; SSE2-NEXT:    paddd %xmm8, %xmm2
-; SSE2-NEXT:    paddd %xmm8, %xmm1
-; SSE2-NEXT:    paddd %xmm3, %xmm1
-; SSE2-NEXT:    paddd %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT:    paddd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT:    paddd %xmm8, %xmm0
+; SSE2-NEXT:    paddd %xmm3, %xmm0
+; SSE2-NEXT:    paddd %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    paddd %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    retq
 ;
 ; AVX1-LABEL: _Z10test_shortPsS_i_1024:
@@ -949,7 +949,7 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl
 ; SSE2-NEXT:    xorl %ecx, %ecx
 ; SSE2-NEXT:    pxor %xmm9, %xmm9
 ; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm0
 ; SSE2-NEXT:    pxor %xmm3, %xmm3
 ; SSE2-NEXT:    .p2align 4, 0x90
 ; SSE2-NEXT:  .LBB7_1: # %vector.body
@@ -963,9 +963,9 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl
 ; SSE2-NEXT:    movq {{.*#+}} xmm7 = mem[0],zero
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    psraw $8, %xmm7
-; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    psraw $8, %xmm0
+; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psraw $8, %xmm1
 ; SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    psraw $8, %xmm2
@@ -980,11 +980,11 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    psraw $8, %xmm2
 ; SSE2-NEXT:    pmaddwd %xmm7, %xmm2
-; SSE2-NEXT:    paddd %xmm2, %xmm1
+; SSE2-NEXT:    paddd %xmm2, %xmm0
 ; SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    psraw $8, %xmm2
-; SSE2-NEXT:    pmaddwd %xmm0, %xmm2
+; SSE2-NEXT:    pmaddwd %xmm1, %xmm2
 ; SSE2-NEXT:    paddd %xmm2, %xmm3
 ; SSE2-NEXT:    addq $32, %rcx
 ; SSE2-NEXT:    cmpq %rcx, %rax
@@ -994,14 +994,14 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl
 ; SSE2-NEXT:    paddd %xmm8, %xmm3
 ; SSE2-NEXT:    paddd %xmm4, %xmm3
 ; SSE2-NEXT:    paddd %xmm8, %xmm9
-; SSE2-NEXT:    paddd %xmm8, %xmm1
-; SSE2-NEXT:    paddd %xmm3, %xmm1
-; SSE2-NEXT:    paddd %xmm9, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT:    paddd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT:    paddd %xmm8, %xmm0
+; SSE2-NEXT:    paddd %xmm3, %xmm0
+; SSE2-NEXT:    paddd %xmm9, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    paddd %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    retq
 ;
 ; AVX1-LABEL: _Z9test_charPcS_i_1024:
diff --git a/test/CodeGen/X86/mmx-arith.ll b/test/CodeGen/X86/mmx-arith.ll
index 2d24cb8df35..4362a193014 100644
--- a/test/CodeGen/X86/mmx-arith.ll
+++ b/test/CodeGen/X86/mmx-arith.ll
@@ -604,12 +604,13 @@ define <1 x i64> @test3(<1 x i64>* %a, <1 x i64>* %b, i32 %count) nounwind {
 ;
 ; X64-LABEL: test3:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    xorl %r8d, %r8d
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    testl %edx, %edx
-; X64-NEXT:    je .LBB3_2
+; X64-NEXT:    je .LBB3_3
+; X64-NEXT:  # %bb.1: # %bb26.preheader
+; X64-NEXT:    xorl %r8d, %r8d
 ; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB3_1: # %bb26
+; X64-NEXT:  .LBB3_2: # %bb26
 ; X64-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X64-NEXT:    movslq %r8d, %r8
 ; X64-NEXT:    movq (%rdi,%r8,8), %rcx
@@ -617,8 +618,8 @@ define <1 x i64> @test3(<1 x i64>* %a, <1 x i64>* %b, i32 %count) nounwind {
 ; X64-NEXT:    addq %rcx, %rax
 ; X64-NEXT:    incl %r8d
 ; X64-NEXT:    cmpl %edx, %r8d
-; X64-NEXT:    jb .LBB3_1
-; X64-NEXT:  .LBB3_2: # %bb31
+; X64-NEXT:    jb .LBB3_2
+; X64-NEXT:  .LBB3_3: # %bb31
 ; X64-NEXT:    retq
 entry:
   %tmp2942 = icmp eq i32 %count, 0
diff --git a/test/CodeGen/X86/pr32284.ll b/test/CodeGen/X86/pr32284.ll
index ab6680cf45a..878c1c5af61 100644
--- a/test/CodeGen/X86/pr32284.ll
+++ b/test/CodeGen/X86/pr32284.ll
@@ -10,13 +10,12 @@ define void @foo() {
 ; X86-O0-LABEL: foo:
 ; X86-O0:       # %bb.0: # %entry
 ; X86-O0-NEXT:    xorl %eax, %eax
-; X86-O0-NEXT:    movl %eax, %ecx
-; X86-O0-NEXT:    xorl %eax, %eax
+; X86-O0-NEXT:    xorl %ecx, %ecx
 ; X86-O0-NEXT:    movzbl c, %edx
-; X86-O0-NEXT:    subl %edx, %eax
-; X86-O0-NEXT:    movslq %eax, %rsi
-; X86-O0-NEXT:    subq %rsi, %rcx
-; X86-O0-NEXT:    movb %cl, %dil
+; X86-O0-NEXT:    subl %edx, %ecx
+; X86-O0-NEXT:    movslq %ecx, %rsi
+; X86-O0-NEXT:    subq %rsi, %rax
+; X86-O0-NEXT:    movb %al, %dil
 ; X86-O0-NEXT:    cmpb $0, %dil
 ; X86-O0-NEXT:    setne %dil
 ; X86-O0-NEXT:    andb $1, %dil
@@ -26,13 +25,13 @@ define void @foo() {
 ; X86-O0-NEXT:    xorb $-1, %dil
 ; X86-O0-NEXT:    xorb $-1, %dil
 ; X86-O0-NEXT:    andb $1, %dil
-; X86-O0-NEXT:    movzbl %dil, %eax
+; X86-O0-NEXT:    movzbl %dil, %ecx
 ; X86-O0-NEXT:    movzbl c, %edx
-; X86-O0-NEXT:    cmpl %edx, %eax
+; X86-O0-NEXT:    cmpl %edx, %ecx
 ; X86-O0-NEXT:    setle %dil
 ; X86-O0-NEXT:    andb $1, %dil
-; X86-O0-NEXT:    movzbl %dil, %eax
-; X86-O0-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
+; X86-O0-NEXT:    movzbl %dil, %ecx
+; X86-O0-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp)
 ; X86-O0-NEXT:    retq
 ;
 ; X64-LABEL: foo:
diff --git a/test/CodeGen/X86/pr32340.ll b/test/CodeGen/X86/pr32340.ll
index b530bb18c93..559bd8d6b5a 100644
--- a/test/CodeGen/X86/pr32340.ll
+++ b/test/CodeGen/X86/pr32340.ll
@@ -14,22 +14,21 @@ define void @foo() {
 ; X64-LABEL: foo:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    movl %eax, %ecx
 ; X64-NEXT:    movw $0, var_825
-; X64-NEXT:    movzwl var_32, %eax
+; X64-NEXT:    movzwl var_32, %ecx
 ; X64-NEXT:    movzwl var_901, %edx
-; X64-NEXT:    movl %eax, %esi
+; X64-NEXT:    movl %ecx, %esi
 ; X64-NEXT:    xorl %edx, %esi
-; X64-NEXT:    movl %eax, %edx
+; X64-NEXT:    movl %ecx, %edx
 ; X64-NEXT:    xorl %esi, %edx
-; X64-NEXT:    addl %eax, %edx
+; X64-NEXT:    addl %ecx, %edx
 ; X64-NEXT:    movslq %edx, %rdi
 ; X64-NEXT:    movq %rdi, var_826
-; X64-NEXT:    movzwl var_32, %eax
-; X64-NEXT:    movl %eax, %edi
-; X64-NEXT:    movzwl var_901, %eax
-; X64-NEXT:    xorl $51981, %eax # imm = 0xCB0D
-; X64-NEXT:    movslq %eax, %r8
+; X64-NEXT:    movzwl var_32, %ecx
+; X64-NEXT:    movl %ecx, %edi
+; X64-NEXT:    movzwl var_901, %ecx
+; X64-NEXT:    xorl $51981, %ecx # imm = 0xCB0D
+; X64-NEXT:    movslq %ecx, %r8
 ; X64-NEXT:    movabsq $-1142377792914660288, %r9 # imm = 0xF02575732E06E440
 ; X64-NEXT:    xorq %r9, %r8
 ; X64-NEXT:    movq %rdi, %r9
@@ -41,11 +40,11 @@ define void @foo() {
 ; X64-NEXT:    orq %r8, %rdi
 ; X64-NEXT:    movw %di, %r10w
 ; X64-NEXT:    movw %r10w, var_900
-; X64-NEXT:    cmpq var_28, %rcx
+; X64-NEXT:    cmpq var_28, %rax
 ; X64-NEXT:    setne %r11b
 ; X64-NEXT:    andb $1, %r11b
-; X64-NEXT:    movzbl %r11b, %eax
-; X64-NEXT:    movw %ax, %r10w
+; X64-NEXT:    movzbl %r11b, %ecx
+; X64-NEXT:    movw %cx, %r10w
 ; X64-NEXT:    movw %r10w, var_827
 ; X64-NEXT:    retq
 entry:
diff --git a/test/CodeGen/X86/scheduler-backtracking.ll b/test/CodeGen/X86/scheduler-backtracking.ll
index 0926a9814ce..0cd35114937 100644
--- a/test/CodeGen/X86/scheduler-backtracking.ll
+++ b/test/CodeGen/X86/scheduler-backtracking.ll
@@ -19,18 +19,18 @@ define i256 @test1(i256 %a) nounwind {
 ; ILP-NEXT:    pushq %rbx
 ; ILP-NEXT:    movq %rcx, %r9
 ; ILP-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; ILP-NEXT:    xorl %eax, %eax
 ; ILP-NEXT:    addq $1, %rsi
 ; ILP-NEXT:    adcq $0, %rdx
 ; ILP-NEXT:    adcq $0, %r9
 ; ILP-NEXT:    adcq $0, %r8
 ; ILP-NEXT:    leal 1(%rsi,%rsi), %edi
 ; ILP-NEXT:    movl $1, %ebp
-; ILP-NEXT:    xorl %r14d, %r14d
+; ILP-NEXT:    xorl %eax, %eax
+; ILP-NEXT:    xorl %r11d, %r11d
 ; ILP-NEXT:    movl %edi, %ecx
-; ILP-NEXT:    shldq %cl, %rbp, %r14
-; ILP-NEXT:    movl $1, %r11d
-; ILP-NEXT:    shlq %cl, %r11
+; ILP-NEXT:    shldq %cl, %rbp, %r11
+; ILP-NEXT:    movl $1, %r14d
+; ILP-NEXT:    shlq %cl, %r14
 ; ILP-NEXT:    movb $-128, %r10b
 ; ILP-NEXT:    subb %dil, %r10b
 ; ILP-NEXT:    movq %r9, %r13
@@ -41,33 +41,33 @@ define i256 @test1(i256 %a) nounwind {
 ; ILP-NEXT:    xorl %r15d, %r15d
 ; ILP-NEXT:    movl %edi, %ecx
 ; ILP-NEXT:    shldq %cl, %r15, %r15
-; ILP-NEXT:    movq %rsi, %rbx
-; ILP-NEXT:    shrdq %cl, %rdx, %rbx
+; ILP-NEXT:    movq %rsi, %rbp
+; ILP-NEXT:    shrdq %cl, %rdx, %rbp
 ; ILP-NEXT:    shrq %cl, %rdx
 ; ILP-NEXT:    addb $-128, %cl
 ; ILP-NEXT:    shrdq %cl, %r8, %r9
 ; ILP-NEXT:    testb $64, %dil
-; ILP-NEXT:    cmovneq %r11, %r14
-; ILP-NEXT:    cmoveq %rbx, %rdx
+; ILP-NEXT:    cmovneq %r14, %r11
+; ILP-NEXT:    cmoveq %rbp, %rdx
 ; ILP-NEXT:    cmovneq %rax, %r15
-; ILP-NEXT:    cmovneq %rax, %r11
+; ILP-NEXT:    cmovneq %rax, %r14
 ; ILP-NEXT:    testb $64, %r10b
 ; ILP-NEXT:    cmovneq %rax, %r12
 ; ILP-NEXT:    cmovneq %rax, %r13
-; ILP-NEXT:    movl $1, %ebx
-; ILP-NEXT:    shlq %cl, %rbx
+; ILP-NEXT:    movl $1, %ebp
+; ILP-NEXT:    shlq %cl, %rbp
 ; ILP-NEXT:    orl %edx, %r13d
 ; ILP-NEXT:    xorl %edx, %edx
-; ILP-NEXT:    movl $1, %ebp
-; ILP-NEXT:    shldq %cl, %rbp, %rdx
+; ILP-NEXT:    movl $1, %ebx
+; ILP-NEXT:    shldq %cl, %rbx, %rdx
 ; ILP-NEXT:    shrq %cl, %r8
 ; ILP-NEXT:    testb $64, %cl
 ; ILP-NEXT:    cmoveq %r9, %r8
-; ILP-NEXT:    cmovneq %rbx, %rdx
-; ILP-NEXT:    cmovneq %rax, %rbx
+; ILP-NEXT:    cmovneq %rbp, %rdx
+; ILP-NEXT:    cmovneq %rax, %rbp
 ; ILP-NEXT:    testb %dil, %dil
-; ILP-NEXT:    cmovsq %rax, %r14
 ; ILP-NEXT:    cmovsq %rax, %r11
+; ILP-NEXT:    cmovsq %rax, %r14
 ; ILP-NEXT:    jns .LBB0_2
 ; ILP-NEXT:  # %bb.1:
 ; ILP-NEXT:    movl %r8d, %r13d
@@ -76,20 +76,20 @@ define i256 @test1(i256 %a) nounwind {
 ; ILP-NEXT:  # %bb.3:
 ; ILP-NEXT:    movl %r13d, %esi
 ; ILP-NEXT:  .LBB0_4:
-; ILP-NEXT:    cmovnsq %r12, %rbx
-; ILP-NEXT:    cmoveq %rax, %rbx
+; ILP-NEXT:    cmovnsq %r12, %rbp
+; ILP-NEXT:    cmoveq %rax, %rbp
 ; ILP-NEXT:    cmovnsq %r15, %rdx
 ; ILP-NEXT:    cmoveq %rax, %rdx
 ; ILP-NEXT:    testb $1, %sil
 ; ILP-NEXT:    cmovneq %rax, %rdx
 ; ILP-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; ILP-NEXT:    movq %rdx, 24(%rax)
-; ILP-NEXT:    cmovneq %rax, %rbx
-; ILP-NEXT:    movq %rbx, 16(%rax)
-; ILP-NEXT:    cmovneq %rax, %r14
-; ILP-NEXT:    movq %r14, 8(%rax)
+; ILP-NEXT:    cmovneq %rax, %rbp
+; ILP-NEXT:    movq %rbp, 16(%rax)
 ; ILP-NEXT:    cmovneq %rax, %r11
-; ILP-NEXT:    movq %r11, (%rax)
+; ILP-NEXT:    movq %r11, 8(%rax)
+; ILP-NEXT:    cmovneq %rax, %r14
+; ILP-NEXT:    movq %r14, (%rax)
 ; ILP-NEXT:    popq %rbx
 ; ILP-NEXT:    popq %r12
 ; ILP-NEXT:    popq %r13
@@ -100,7 +100,6 @@ define i256 @test1(i256 %a) nounwind {
 ;
 ; HYBRID-LABEL: test1:
 ; HYBRID:       # %bb.0:
-; HYBRID-NEXT:    pushq %rbp
 ; HYBRID-NEXT:    pushq %r15
 ; HYBRID-NEXT:    pushq %r14
 ; HYBRID-NEXT:    pushq %r13
@@ -112,84 +111,82 @@ define i256 @test1(i256 %a) nounwind {
 ; HYBRID-NEXT:    adcq $0, %rdx
 ; HYBRID-NEXT:    adcq $0, %r9
 ; HYBRID-NEXT:    adcq $0, %r8
-; HYBRID-NEXT:    xorl %r10d, %r10d
 ; HYBRID-NEXT:    leal 1(%rsi,%rsi), %edi
 ; HYBRID-NEXT:    xorl %r14d, %r14d
+; HYBRID-NEXT:    xorl %r15d, %r15d
 ; HYBRID-NEXT:    movl %edi, %ecx
-; HYBRID-NEXT:    shldq %cl, %r14, %r14
+; HYBRID-NEXT:    shldq %cl, %r15, %r15
 ; HYBRID-NEXT:    testb $64, %dil
-; HYBRID-NEXT:    cmovneq %r10, %r14
-; HYBRID-NEXT:    movl $1, %ebp
+; HYBRID-NEXT:    cmovneq %r14, %r15
+; HYBRID-NEXT:    movl $1, %r11d
 ; HYBRID-NEXT:    movl $1, %r12d
 ; HYBRID-NEXT:    shlq %cl, %r12
 ; HYBRID-NEXT:    testb $64, %dil
-; HYBRID-NEXT:    movq %r12, %r11
-; HYBRID-NEXT:    cmovneq %r10, %r11
+; HYBRID-NEXT:    movq %r12, %r10
+; HYBRID-NEXT:    cmovneq %r14, %r10
 ; HYBRID-NEXT:    movq %rsi, %rbx
 ; HYBRID-NEXT:    shrdq %cl, %rdx, %rbx
 ; HYBRID-NEXT:    shrq %cl, %rdx
 ; HYBRID-NEXT:    testb $64, %dil
 ; HYBRID-NEXT:    cmoveq %rbx, %rdx
-; HYBRID-NEXT:    xorl %r15d, %r15d
-; HYBRID-NEXT:    shldq %cl, %rbp, %r15
+; HYBRID-NEXT:    xorl %r13d, %r13d
+; HYBRID-NEXT:    shldq %cl, %r11, %r13
 ; HYBRID-NEXT:    testb $64, %dil
-; HYBRID-NEXT:    cmovneq %r12, %r15
+; HYBRID-NEXT:    cmovneq %r12, %r13
 ; HYBRID-NEXT:    movb $-128, %cl
 ; HYBRID-NEXT:    subb %dil, %cl
-; HYBRID-NEXT:    movq %r9, %r13
-; HYBRID-NEXT:    shlq %cl, %r13
+; HYBRID-NEXT:    movq %r9, %rbx
+; HYBRID-NEXT:    shlq %cl, %rbx
 ; HYBRID-NEXT:    movl $1, %r12d
-; HYBRID-NEXT:    shrdq %cl, %r10, %r12
+; HYBRID-NEXT:    shrdq %cl, %r14, %r12
 ; HYBRID-NEXT:    testb $64, %cl
-; HYBRID-NEXT:    cmovneq %r10, %r12
-; HYBRID-NEXT:    cmovneq %r10, %r13
-; HYBRID-NEXT:    orl %edx, %r13d
+; HYBRID-NEXT:    cmovneq %r14, %r12
+; HYBRID-NEXT:    cmovneq %r14, %rbx
+; HYBRID-NEXT:    orl %edx, %ebx
 ; HYBRID-NEXT:    movl %edi, %ecx
 ; HYBRID-NEXT:    addb $-128, %cl
 ; HYBRID-NEXT:    shrdq %cl, %r8, %r9
 ; HYBRID-NEXT:    shrq %cl, %r8
 ; HYBRID-NEXT:    xorl %edx, %edx
-; HYBRID-NEXT:    shldq %cl, %rbp, %rdx
-; HYBRID-NEXT:    shlq %cl, %rbp
+; HYBRID-NEXT:    shldq %cl, %r11, %rdx
+; HYBRID-NEXT:    shlq %cl, %r11
 ; HYBRID-NEXT:    testb $64, %cl
-; HYBRID-NEXT:    cmovneq %rbp, %rdx
+; HYBRID-NEXT:    cmovneq %r11, %rdx
 ; HYBRID-NEXT:    cmoveq %r9, %r8
-; HYBRID-NEXT:    cmovneq %r10, %rbp
+; HYBRID-NEXT:    cmovneq %r14, %r11
 ; HYBRID-NEXT:    testb %dil, %dil
 ; HYBRID-NEXT:    jns .LBB0_2
 ; HYBRID-NEXT:  # %bb.1:
-; HYBRID-NEXT:    movl %r8d, %r13d
+; HYBRID-NEXT:    movl %r8d, %ebx
 ; HYBRID-NEXT:  .LBB0_2:
 ; HYBRID-NEXT:    je .LBB0_4
 ; HYBRID-NEXT:  # %bb.3:
-; HYBRID-NEXT:    movl %r13d, %esi
+; HYBRID-NEXT:    movl %ebx, %esi
 ; HYBRID-NEXT:  .LBB0_4:
-; HYBRID-NEXT:    cmovsq %r10, %r15
-; HYBRID-NEXT:    cmovnsq %r12, %rbp
-; HYBRID-NEXT:    cmoveq %r10, %rbp
-; HYBRID-NEXT:    cmovnsq %r14, %rdx
-; HYBRID-NEXT:    cmoveq %r10, %rdx
-; HYBRID-NEXT:    cmovsq %r10, %r11
+; HYBRID-NEXT:    cmovsq %r14, %r13
+; HYBRID-NEXT:    cmovnsq %r12, %r11
+; HYBRID-NEXT:    cmoveq %r14, %r11
+; HYBRID-NEXT:    cmovnsq %r15, %rdx
+; HYBRID-NEXT:    cmoveq %r14, %rdx
+; HYBRID-NEXT:    cmovsq %r14, %r10
 ; HYBRID-NEXT:    testb $1, %sil
 ; HYBRID-NEXT:    cmovneq %rax, %rdx
 ; HYBRID-NEXT:    movq %rdx, 24(%rax)
-; HYBRID-NEXT:    cmovneq %rax, %rbp
-; HYBRID-NEXT:    movq %rbp, 16(%rax)
-; HYBRID-NEXT:    cmovneq %rax, %r15
-; HYBRID-NEXT:    movq %r15, 8(%rax)
 ; HYBRID-NEXT:    cmovneq %rax, %r11
-; HYBRID-NEXT:    movq %r11, (%rax)
+; HYBRID-NEXT:    movq %r11, 16(%rax)
+; HYBRID-NEXT:    cmovneq %rax, %r13
+; HYBRID-NEXT:    movq %r13, 8(%rax)
+; HYBRID-NEXT:    cmovneq %rax, %r10
+; HYBRID-NEXT:    movq %r10, (%rax)
 ; HYBRID-NEXT:    popq %rbx
 ; HYBRID-NEXT:    popq %r12
 ; HYBRID-NEXT:    popq %r13
 ; HYBRID-NEXT:    popq %r14
 ; HYBRID-NEXT:    popq %r15
-; HYBRID-NEXT:    popq %rbp
 ; HYBRID-NEXT:    retq
 ;
 ; BURR-LABEL: test1:
 ; BURR:       # %bb.0:
-; BURR-NEXT:    pushq %rbp
 ; BURR-NEXT:    pushq %r15
 ; BURR-NEXT:    pushq %r14
 ; BURR-NEXT:    pushq %r13
@@ -201,79 +198,78 @@ define i256 @test1(i256 %a) nounwind {
 ; BURR-NEXT:    adcq $0, %rdx
 ; BURR-NEXT:    adcq $0, %r9
 ; BURR-NEXT:    adcq $0, %r8
-; BURR-NEXT:    xorl %r10d, %r10d
 ; BURR-NEXT:    leal 1(%rsi,%rsi), %edi
 ; BURR-NEXT:    xorl %r14d, %r14d
+; BURR-NEXT:    xorl %r15d, %r15d
 ; BURR-NEXT:    movl %edi, %ecx
-; BURR-NEXT:    shldq %cl, %r14, %r14
+; BURR-NEXT:    shldq %cl, %r15, %r15
 ; BURR-NEXT:    testb $64, %dil
-; BURR-NEXT:    cmovneq %r10, %r14
-; BURR-NEXT:    movl $1, %ebp
+; BURR-NEXT:    cmovneq %r14, %r15
+; BURR-NEXT:    movl $1, %r11d
 ; BURR-NEXT:    movl $1, %r12d
 ; BURR-NEXT:    shlq %cl, %r12
 ; BURR-NEXT:    testb $64, %dil
-; BURR-NEXT:    movq %r12, %r11
-; BURR-NEXT:    cmovneq %r10, %r11
+; BURR-NEXT:    movq %r12, %r10
+; BURR-NEXT:    cmovneq %r14, %r10
 ; BURR-NEXT:    movq %rsi, %rbx
 ; BURR-NEXT:    shrdq %cl, %rdx, %rbx
 ; BURR-NEXT:    shrq %cl, %rdx
 ; BURR-NEXT:    testb $64, %dil
 ; BURR-NEXT:    cmoveq %rbx, %rdx
-; BURR-NEXT:    xorl %r15d, %r15d
-; BURR-NEXT:    shldq %cl, %rbp, %r15
+; BURR-NEXT:    xorl %r13d, %r13d
+; BURR-NEXT:    shldq %cl, %r11, %r13
 ; BURR-NEXT:    testb $64, %dil
-; BURR-NEXT:    cmovneq %r12, %r15
+; BURR-NEXT:    cmovneq %r12, %r13
 ; BURR-NEXT:    movb $-128, %cl
 ; BURR-NEXT:    subb %dil, %cl
-; BURR-NEXT:    movq %r9, %r13
-; BURR-NEXT:    shlq %cl, %r13
+; BURR-NEXT:    movq %r9, %rbx
+; BURR-NEXT:    shlq %cl, %rbx
 ; BURR-NEXT:    movl $1, %r12d
-; BURR-NEXT:    shrdq %cl, %r10, %r12
+; BURR-NEXT:    shrdq %cl, %r14, %r12
 ; BURR-NEXT:    testb $64, %cl
-; BURR-NEXT:    cmovneq %r10, %r12
-; BURR-NEXT:    cmovneq %r10, %r13
-; BURR-NEXT:    orl %edx, %r13d
+; BURR-NEXT:    cmovneq %r14, %r12
+; BURR-NEXT:    cmovneq %r14, %rbx
+; BURR-NEXT:    orl %edx, %ebx
 ; BURR-NEXT:    movl %edi, %ecx
 ; BURR-NEXT:    addb $-128, %cl
 ; BURR-NEXT:    shrdq %cl, %r8, %r9
 ; BURR-NEXT:    xorl %edx, %edx
-; BURR-NEXT:    shldq %cl, %rbp, %rdx
+; BURR-NEXT:    shldq %cl, %r11, %rdx
 ; BURR-NEXT:    shrq %cl, %r8
-; BURR-NEXT:    shlq %cl, %rbp
+; BURR-NEXT:    shlq %cl, %r11
 ; BURR-NEXT:    testb $64, %cl
-; BURR-NEXT:    cmovneq %rbp, %rdx
+; BURR-NEXT:    cmovneq %r11, %rdx
 ; BURR-NEXT:    cmoveq %r9, %r8
-; BURR-NEXT:    cmovneq %r10, %rbp
+; BURR-NEXT:    cmovneq %r14, %r11
 ; BURR-NEXT:    testb %dil, %dil
 ; BURR-NEXT:    jns .LBB0_2
 ; BURR-NEXT:  # %bb.1:
-; BURR-NEXT:    movl %r8d, %r13d
+; BURR-NEXT:    movl %r8d, %ebx
 ; BURR-NEXT:  .LBB0_2:
 ; BURR-NEXT:    je .LBB0_4
 ; BURR-NEXT:  # %bb.3:
-; BURR-NEXT:    movl %r13d, %esi
+; BURR-NEXT:    movl %ebx, %esi
 ; BURR-NEXT:  .LBB0_4:
-; BURR-NEXT:    cmovsq %r10, %r15
-; BURR-NEXT:    cmovnsq %r12, %rbp
-; BURR-NEXT:    cmoveq %r10, %rbp
-; BURR-NEXT:    cmovnsq %r14, %rdx
-; BURR-NEXT:    cmoveq %r10, %rdx
-; BURR-NEXT:    cmovsq %r10, %r11
+; BURR-NEXT:    cmovsq %r14, %r13
+; BURR-NEXT:    cmovnsq %r12, %r11
+; BURR-NEXT:    cmoveq %r14, %r11
+; BURR-NEXT:    cmovnsq %r15, %rdx
+; BURR-NEXT:    cmoveq %r14, %rdx
+; BURR-NEXT:    cmovsq %r14, %r10
 ; BURR-NEXT:    testb $1, %sil
 ; BURR-NEXT:    cmovneq %rax, %rdx
 ; BURR-NEXT:    movq %rdx, 24(%rax)
-; BURR-NEXT:    cmovneq %rax, %rbp
-; BURR-NEXT:    movq %rbp, 16(%rax)
-; BURR-NEXT:    cmovneq %rax, %r15
-; BURR-NEXT:    movq %r15, 8(%rax)
 ; BURR-NEXT:    cmovneq %rax, %r11
-; BURR-NEXT:    movq %r11, (%rax)
+; BURR-NEXT:    movq %r11, 16(%rax)
+; BURR-NEXT:    cmovneq %rax, %r13
+; BURR-NEXT:    movq %r13, 8(%rax)
+; BURR-NEXT:    cmovneq %rax, %r10
+; BURR-NEXT:    movq %r10, (%rax)
 ; BURR-NEXT:    popq %rbx
 ; BURR-NEXT:    popq %r12
 ; BURR-NEXT:    popq %r13
 ; BURR-NEXT:    popq %r14
 ; BURR-NEXT:    popq %r15
-; BURR-NEXT:    popq %rbp
 ; BURR-NEXT:    retq
 ;
 ; SRC-LABEL: test1:
@@ -300,8 +296,8 @@ define i256 @test1(i256 %a) nounwind {
 ; SRC-NEXT:    movl %r11d, %ecx
 ; SRC-NEXT:    shrdq %cl, %rdx, %rbp
 ; SRC-NEXT:    shrq %cl, %rdx
-; SRC-NEXT:    xorl %r15d, %r15d
 ; SRC-NEXT:    movl $1, %edi
+; SRC-NEXT:    xorl %r15d, %r15d
 ; SRC-NEXT:    xorl %r14d, %r14d
 ; SRC-NEXT:    shldq %cl, %rdi, %r14
 ; SRC-NEXT:    xorl %r13d, %r13d
@@ -909,15 +905,15 @@ define i64 @test4(i64 %a, i64 %b) nounwind {
 ; ILP-LABEL: test4:
 ; ILP:       # %bb.0:
 ; ILP-NEXT:    xorl %ecx, %ecx
-; ILP-NEXT:    xorl %edx, %edx
 ; ILP-NEXT:    addq $1, %rsi
-; ILP-NEXT:    setb %dl
+; ILP-NEXT:    setb %cl
 ; ILP-NEXT:    movl $2, %eax
+; ILP-NEXT:    xorl %edx, %edx
 ; ILP-NEXT:    cmpq %rdi, %rsi
-; ILP-NEXT:    sbbq $0, %rdx
-; ILP-NEXT:    movl $0, %edx
-; ILP-NEXT:    sbbq $0, %rdx
 ; ILP-NEXT:    sbbq $0, %rcx
+; ILP-NEXT:    movl $0, %ecx
+; ILP-NEXT:    sbbq $0, %rcx
+; ILP-NEXT:    sbbq $0, %rdx
 ; ILP-NEXT:    setae %cl
 ; ILP-NEXT:    movzbl %cl, %ecx
 ; ILP-NEXT:    subq %rcx, %rax
@@ -926,14 +922,14 @@ define i64 @test4(i64 %a, i64 %b) nounwind {
 ; HYBRID-LABEL: test4:
 ; HYBRID:       # %bb.0:
 ; HYBRID-NEXT:    xorl %eax, %eax
-; HYBRID-NEXT:    xorl %ecx, %ecx
 ; HYBRID-NEXT:    addq $1, %rsi
-; HYBRID-NEXT:    setb %cl
+; HYBRID-NEXT:    setb %al
+; HYBRID-NEXT:    xorl %ecx, %ecx
 ; HYBRID-NEXT:    cmpq %rdi, %rsi
-; HYBRID-NEXT:    sbbq $0, %rcx
-; HYBRID-NEXT:    movl $0, %ecx
-; HYBRID-NEXT:    sbbq $0, %rcx
 ; HYBRID-NEXT:    sbbq $0, %rax
+; HYBRID-NEXT:    movl $0, %eax
+; HYBRID-NEXT:    sbbq $0, %rax
+; HYBRID-NEXT:    sbbq $0, %rcx
 ; HYBRID-NEXT:    setae %al
 ; HYBRID-NEXT:    movzbl %al, %ecx
 ; HYBRID-NEXT:    movl $2, %eax
@@ -943,14 +939,14 @@ define i64 @test4(i64 %a, i64 %b) nounwind {
 ; BURR-LABEL: test4:
 ; BURR:       # %bb.0:
 ; BURR-NEXT:    xorl %eax, %eax
-; BURR-NEXT:    xorl %ecx, %ecx
 ; BURR-NEXT:    addq $1, %rsi
-; BURR-NEXT:    setb %cl
+; BURR-NEXT:    setb %al
+; BURR-NEXT:    xorl %ecx, %ecx
 ; BURR-NEXT:    cmpq %rdi, %rsi
-; BURR-NEXT:    sbbq $0, %rcx
-; BURR-NEXT:    movl $0, %ecx
-; BURR-NEXT:    sbbq $0, %rcx
 ; BURR-NEXT:    sbbq $0, %rax
+; BURR-NEXT:    movl $0, %eax
+; BURR-NEXT:    sbbq $0, %rax
+; BURR-NEXT:    sbbq $0, %rcx
 ; BURR-NEXT:    setae %al
 ; BURR-NEXT:    movzbl %al, %ecx
 ; BURR-NEXT:    movl $2, %eax
diff --git a/test/CodeGen/X86/spill-zero-x86_64.ll b/test/CodeGen/X86/spill-zero-x86_64.ll
new file mode 100644
index 00000000000..d90cca6eabd
--- /dev/null
+++ b/test/CodeGen/X86/spill-zero-x86_64.ll
@@ -0,0 +1,75 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+; This test checks that we use "movq $0, (%rsp)" to spill a 0 to the stack. It
+; was reduced from a larger function.
+
+; CHECK:    movq $0, (%rsp) # 8-byte Folded Spill
+
+%struct.foo = type { i8*, i32 }
+
+declare void @pluto()
+
+define void @spam() {
+bb:
+  br label %bb13
+
+bb1:                                              ; preds = %bb18
+  call void @pluto()
+  %tmp = getelementptr inbounds %struct.foo, %struct.foo* %tmp20, i64 0, i32 1
+  %tmp2 = bitcast i32* %tmp to %struct.foo**
+  store %struct.foo* null, %struct.foo** %tmp2
+  unreachable
+
+bb3:                                              ; preds = %bb18
+  call void @pluto()
+  store i8* %tmp22, i8** undef
+  unreachable
+
+bb4:                                              ; preds = %bb18
+  call void @pluto()
+  br label %bb13
+
+bb5:                                              ; preds = %bb18
+  %tmp7 = add nsw i32 %tmp23, 1
+  store i8* %tmp22, i8** undef
+  unreachable
+
+bb8:                                              ; preds = %bb18
+  store %struct.foo* %tmp14, %struct.foo** undef
+  unreachable
+
+bb9:                                              ; preds = %bb18
+  %tmp10 = load %struct.foo*, %struct.foo** undef
+  br label %bb13
+
+bb13:                                             ; preds = %bb18, %bb9, %bb4, %bb
+  %tmp14 = phi %struct.foo* [ %tmp14, %bb18 ], [ %tmp14, %bb4 ], [ null, %bb ], [ %tmp10, %bb9 ]
+  %tmp15 = phi %struct.foo* [ %tmp26, %bb18 ], [ %tmp26, %bb4 ], [ null, %bb ], [ %tmp26, %bb9 ]
+  %tmp16 = phi i32 [ %tmp23, %bb18 ], [ %tmp23, %bb4 ], [ 0, %bb ], [ %tmp23, %bb9 ]
+  br label %bb17
+
+bb17:                                             ; preds = %bb13
+  br i1 false, label %bb27, label %bb18
+
+bb18:                                             ; preds = %bb17
+  %tmp19 = load %struct.foo*, %struct.foo** undef
+  %tmp20 = getelementptr inbounds %struct.foo, %struct.foo* %tmp19, i64 0
+  %tmp21 = getelementptr inbounds %struct.foo, %struct.foo* %tmp20, i64 0, i32 0
+  %tmp22 = load i8*, i8** %tmp21
+  %tmp23 = add nsw i32 %tmp16, -1
+  %tmp24 = getelementptr inbounds %struct.foo, %struct.foo* %tmp15, i64 0, i32 1
+  %tmp25 = bitcast i32* %tmp24 to %struct.foo**
+  %tmp26 = load %struct.foo*, %struct.foo** %tmp25
+  switch i32 undef, label %bb9 [
+    i32 1, label %bb1
+    i32 2, label %bb3
+    i32 3, label %bb4
+    i32 4, label %bb5
+    i32 5, label %bb13
+    i32 6, label %bb8
+  ]
+
+bb27:                                             ; preds = %bb17
+  ret void
+}
diff --git a/test/CodeGen/X86/swifterror.ll b/test/CodeGen/X86/swifterror.ll
index cb0597f7151..a88a714f016 100644
--- a/test/CodeGen/X86/swifterror.ll
+++ b/test/CodeGen/X86/swifterror.ll
@@ -41,8 +41,7 @@ define float @caller(i8* %error_ref) {
 ; CHECK-APPLE: callq {{.*}}free
 
 ; CHECK-O0-LABEL: caller:
-; CHECK-O0: xorl
-; CHECK-O0: movl %{{.*}}, %r12d
+; CHECK-O0: xorl %r12d, %r12d
 ; CHECK-O0: callq {{.*}}foo
 ; CHECK-O0: jne
 entry:
@@ -78,8 +77,7 @@ define float @caller2(i8* %error_ref) {
 ; CHECK-APPLE: callq {{.*}}free
 
 ; CHECK-O0-LABEL: caller2:
-; CHECK-O0: xorl
-; CHECK-O0: movl %{{.*}}, %r12d
+; CHECK-O0: xorl %r12d, %r12d
 ; CHECK-O0: callq {{.*}}foo
 ; CHECK-O0: movq %r12, [[ID:%[a-z]+]]
 ; CHECK-O0: cmpq $0, %r12
@@ -254,8 +252,7 @@ define float @caller3(i8* %error_ref) {
 ; CHECK-APPLE: callq {{.*}}free
 
 ; CHECK-O0-LABEL: caller3:
-; CHECK-O0: xorl
-; CHECK-O0: movl {{.*}}, %r12d
+; CHECK-O0: xorl %r12d, %r12d
 ; CHECK-O0: movl $1, %esi
 ; CHECK-O0: movq {{.*}}, %rdi
 ; CHECK-O0: callq {{.*}}foo_sret
@@ -313,14 +310,12 @@ define float @caller_with_multiple_swifterror_values(i8* %error_ref, i8* %error_
 ; CHECK-O0-LABEL: caller_with_multiple_swifterror_values:
 
 ; The first swifterror value:
-; CHECK-O0: xorl
-; CHECK-O0: movl %{{.*}}, %r12d
+; CHECK-O0: xorl %r12d, %r12d
 ; CHECK-O0: callq {{.*}}foo
 ; CHECK-O0: jne
 
 ; The second swifterror value:
-; CHECK-O0: xorl
-; CHECK-O0: movl %{{.*}}, %r12d
+; CHECK-O0: xorl %r12d, %r12d
 ; CHECK-O0: callq {{.*}}foo
 ; CHECK-O0: jne
 entry:
@@ -715,8 +710,7 @@ declare swiftcc void @foo2(%swift_error** swifterror)
 ; Make sure we properly assign registers during fast-isel.
 ; CHECK-O0-LABEL: testAssign
 ; CHECK-O0:        pushq   %r12
-; CHECK-O0:        xorl    [[ZERO:%[a-z0-9]+]], [[ZERO]]
-; CHECK-O0:        movl    [[ZERO]], %r12d
+; CHECK-O0:        xorl    %r12d, %r12d
 ; CHECK-O0:        callq   _foo2
 ; CHECK-O0:        movq    %r12, [[SLOT:[-a-z0-9\(\)\%]*]]
 ;
@@ -792,8 +786,7 @@ a:
 
 ; CHECK-O0-LABEL: testAssign4
 ; CHECK-O0:        callq   _foo2
-; CHECK-O0:        xorl    %ecx, %ecx
-; CHECK-O0:        movl    %ecx, %eax
+; CHECK-O0:        xorl    %eax, %eax
 ; CHECK-O0:        movq    %rax, [[SLOT:[-a-z0-9\(\)\%]*]]
 ; CHECK-O0:        movq    [[SLOT]], %rax
 ; CHECK-O0:        movq    %rax, [[SLOT2:[-a-z0-9\(\)\%]*]]
-- 
GitLab


From 048958afe03ee667d41dc22e571d7132b84bcafc Mon Sep 17 00:00:00 2001
From: Michael Kruse <llvm@meinersbur.de>
Date: Wed, 24 Oct 2018 17:35:35 +0000
Subject: [PATCH 0512/1116] [docs] Add rawspeed to test-suite proposals.

rawspeed was suggested by Simon Pilgrim and Roman Lebedev in
llvm.org/PR34216 and reviews.llvm.org/D46714.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345166 91177308-0d34-0410-b5e6-96231b3b80d8
---
 docs/Proposals/TestSuite.rst | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/docs/Proposals/TestSuite.rst b/docs/Proposals/TestSuite.rst
index 62fc137d686..8c7531783d4 100644
--- a/docs/Proposals/TestSuite.rst
+++ b/docs/Proposals/TestSuite.rst
@@ -252,6 +252,13 @@ FTensors
 --------
 http://www.wlandry.net/Projects/FTensor
 
+rawspeed
+--------
+https://github.com/darktable-org/rawspeed
+
+Its test dataset is 756 MB in size, which is too large to be included
+into the test-suite repository.
+
 Generic Algorithms
 ==================
 
-- 
GitLab


From 3be6adfc14047acb272062615d5a6d1d33c148bd Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <kparzysz@codeaurora.org>
Date: Wed, 24 Oct 2018 17:55:13 +0000
Subject: [PATCH 0513/1116] [Hexagon] Flip hexagon-autohvx to be true by
 default

This will allow other generators of LLVM IR to use the auto-vectorizer
without having to change that flag.

Note: on its own, this patch will enable auto-vectorization on Hexagon
in all cases, regardless of the -fvectorize flag. There is a companion
clang patch that together with this one forms an NFC for clang users.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345169 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/Hexagon/HexagonTargetTransformInfo.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
index 79b269bccfe..5cfaa42ae5c 100644
--- a/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -28,7 +28,7 @@ using namespace llvm;
 
 #define DEBUG_TYPE "hexagontti"
 
-static cl::opt<bool> HexagonAutoHVX("hexagon-autohvx", cl::init(false),
+static cl::opt<bool> HexagonAutoHVX("hexagon-autohvx", cl::init(true),
   cl::Hidden, cl::desc("Enable loop vectorizer for HVX"));
 
 static cl::opt<bool> EmitLookupTables("hexagon-emit-lookup-tables",
-- 
GitLab


From fdd0e51f3ee7dde04b9e3d493d0c42616c8cf16a Mon Sep 17 00:00:00 2001
From: Peter Collingbourne <peter@pcc.me.uk>
Date: Wed, 24 Oct 2018 18:10:38 +0000
Subject: [PATCH 0514/1116] ARM: Use BKPT instead of TRAP to implement
 llvm.debugtrap.

The BKPT instruction is specified to cause a software breakpoint,
and at least on Linux results in a SIGTRAP. This makes it more
suitable for implementing debugtrap than TRAP (aka UDF #254), which
is specified to cause an undefined instruction exception and results
in a SIGILL on Linux.

Moreover, BKPT is not marked as a terminator, which is not only
consistent with the IR instruction but allows the analyzeBlock
function to correctly analyze a basic block containing the instruction,
which fixes an assertion failure in the machine block placement pass
previously triggered by the included test case.

Because BKPT is only supported starting with ARMv5T, we continue to
use UDF #254 when targeting v4T.

Differential Revision: https://reviews.llvm.org/D53614

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345171 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/ARM/ARMISelLowering.cpp      |  1 +
 lib/Target/ARM/ARMInstrInfo.td          |  4 ++
 lib/Target/ARM/ARMInstrThumb.td         |  3 ++
 test/CodeGen/ARM/analyze-branch-bkpt.ll | 61 +++++++++++++++++++++++++
 test/CodeGen/ARM/debugtrap.ll           |  8 +++-
 test/CodeGen/ARM/trap.ll                | 14 +++---
 6 files changed, 82 insertions(+), 9 deletions(-)
 create mode 100644 test/CodeGen/ARM/analyze-branch-bkpt.ll

diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 2f4bc46f932..8c18477005f 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -956,6 +956,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
 
   setOperationAction(ISD::TRAP, MVT::Other, Legal);
+  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
 
   // Use the default implementation.
   setOperationAction(ISD::VASTART,            MVT::Other, Custom);
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index fc8ed95ce8b..76f8414e8f0 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -221,6 +221,7 @@ def HasV4T           : Predicate<"Subtarget->hasV4TOps()">,
 def NoV4T            : Predicate<"!Subtarget->hasV4TOps()">;
 def HasV5T           : Predicate<"Subtarget->hasV5TOps()">,
                                  AssemblerPredicate<"HasV5TOps", "armv5t">;
+def NoV5T            : Predicate<"!Subtarget->hasV5TOps()">;
 def HasV5TE          : Predicate<"Subtarget->hasV5TEOps()">,
                                  AssemblerPredicate<"HasV5TEOps", "armv5te">;
 def HasV6            : Predicate<"Subtarget->hasV6Ops()">,
@@ -2200,6 +2201,9 @@ def TRAP : AXI<(outs), (ins), MiscFrm, NoItinerary,
   let Inst = 0xe7ffdefe;
 }
 
+def : Pat<(debugtrap), (BKPT 0)>, Requires<[IsARM, HasV5T]>;
+def : Pat<(debugtrap), (UDF 254)>, Requires<[IsARM, NoV5T]>;
+
 // Address computation and loads and stores in PIC mode.
 let isNotDuplicable = 1 in {
 def PICADD  : ARMPseudoInst<(outs GPR:$dst), (ins GPR:$a, pclabel:$cp, pred:$p),
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index 88aab47a79b..8b85db7e685 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -1380,6 +1380,9 @@ def tUDF : TI<(outs), (ins imm0_255:$imm8), IIC_Br, "udf\t$imm8",
   let Inst{7-0} = imm8;
 }
 
+def : Pat<(debugtrap), (tBKPT 0)>, Requires<[IsThumb, HasV5T]>;
+def : Pat<(debugtrap), (tUDF 254)>, Requires<[IsThumb, NoV5T]>;
+
 def t__brkdiv0 : TI<(outs), (ins), IIC_Br, "__brkdiv0",
                     [(int_arm_undefined 249)]>, Encoding16,
     Requires<[IsThumb, IsWindows]> {
diff --git a/test/CodeGen/ARM/analyze-branch-bkpt.ll b/test/CodeGen/ARM/analyze-branch-bkpt.ll
new file mode 100644
index 00000000000..cba89fe9987
--- /dev/null
+++ b/test/CodeGen/ARM/analyze-branch-bkpt.ll
@@ -0,0 +1,61 @@
+; RUN: llc -o - %s -mtriple thumbv4-unknown-linux-android | FileCheck --check-prefix=V4 %s
+; RUN: llc -o - %s -mtriple thumbv5-unknown-linux-android | FileCheck --check-prefix=V5 %s
+
+; V4: udf #254
+; V5: bkpt #0
+
+define i1 @a(i32 %b) !dbg !3 {
+  br i1 undef, label %c, label %d, !dbg !4
+
+d:                                                ; preds = %0
+  call void @llvm.debugtrap()
+  br label %ah, !dbg !4
+
+c:                                                ; preds = %0
+  %aj = icmp ne i20 undef, 5
+  br label %ah, !dbg !4
+
+ah:                                               ; preds = %c, %d
+  %ak = phi i1 [ false, %d ], [ %aj, %c ]
+  call void @llvm.dbg.value(metadata i1 %ak, metadata !7, metadata !DIExpression()), !dbg !9
+  switch i32 %b, label %al [
+    i32 0, label %am
+    i32 10, label %an
+  ]
+
+an:                                               ; preds = %ah
+  %ch = select i1 %ak, i32 0, i32 5
+  br label %am, !dbg !10
+
+al:                                               ; preds = %ah
+  br label %am, !dbg !9
+
+am:                                               ; preds = %al, %an, %ah
+  %1 = phi i32 [ 0, %al ], [ %ch, %an ], [ %b, %ah ]
+  unreachable
+}
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.value(metadata, metadata, metadata) #0
+
+; Function Attrs: nounwind
+declare void @llvm.debugtrap() #1
+
+attributes #0 = { nounwind readnone speculatable }
+attributes #1 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug)
+!1 = !DIFile(filename: "a", directory: "")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = distinct !DISubprogram(scope: null, isLocal: false, isDefinition: true, isOptimized: false, unit: !0)
+!4 = !DILocation(line: 0, scope: !5, inlinedAt: !6)
+!5 = distinct !DISubprogram(scope: null, isLocal: false, isDefinition: true, isOptimized: false, unit: !0)
+!6 = !DILocation(line: 0, scope: !3)
+!7 = !DILocalVariable(scope: !8)
+!8 = distinct !DISubprogram(scope: null, isLocal: false, isDefinition: true, isOptimized: false, unit: !0)
+!9 = !DILocation(line: 0, scope: !8, inlinedAt: !6)
+!10 = !DILocation(line: 0, scope: !11, inlinedAt: !6)
+!11 = !DILexicalBlock(scope: !8)
diff --git a/test/CodeGen/ARM/debugtrap.ll b/test/CodeGen/ARM/debugtrap.ll
index 5064a4ec2ca..88ca81c4f2c 100644
--- a/test/CodeGen/ARM/debugtrap.ll
+++ b/test/CodeGen/ARM/debugtrap.ll
@@ -1,7 +1,10 @@
 ; This test ensures the @llvm.debugtrap() call is not removed when generating
 ; the 'pop' instruction to restore the callee saved registers on ARM.
 
-; RUN: llc < %s -mtriple=armv7 -O0 -filetype=asm | FileCheck %s 
+; RUN: llc < %s -mtriple=armv4 -O0 -filetype=asm | FileCheck --check-prefixes=CHECK,V4 %s
+; RUN: llc < %s -mtriple=armv5 -O0 -filetype=asm | FileCheck --check-prefixes=CHECK,V5 %s
+; RUN: llc < %s -mtriple=thumbv4 -O0 -filetype=asm | FileCheck --check-prefixes=CHECK,V4 %s
+; RUN: llc < %s -mtriple=thumbv5 -O0 -filetype=asm | FileCheck --check-prefixes=CHECK,V5 %s
 
 declare void @llvm.debugtrap() nounwind
 declare void @foo() nounwind
@@ -9,8 +12,9 @@ declare void @foo() nounwind
 define void @test() nounwind {
 entry:
   ; CHECK: bl foo
+  ; V4-NEXT: udf #254
+  ; V5-NEXT: bkpt #0
   ; CHECK-NEXT: pop
-  ; CHECK-NEXT: .inst 0xe7ffdefe
   call void @foo()
   call void @llvm.debugtrap()
   ret void
diff --git a/test/CodeGen/ARM/trap.ll b/test/CodeGen/ARM/trap.ll
index 585218cf337..c45f7133feb 100644
--- a/test/CodeGen/ARM/trap.ll
+++ b/test/CodeGen/ARM/trap.ll
@@ -59,25 +59,25 @@ entry:
 define void @t2() nounwind {
 entry:
 ; DARWIN-LABEL: t2:
-; DARWIN: trap
+; DARWIN: udf #254
 
 ; FUNC-LABEL: t2:
 ; FUNC: bl __trap
 
 ; NACL-LABEL: t2:
-; NACL: .inst 0xe7fedef0
+; NACL: bkpt #0
 
 ; ARM-LABEL: t2:
-; ARM: .inst 0xe7ffdefe
+; ARM: bkpt #0
 
 ; THUMB-LABEL: t2:
-; THUMB: .inst.n 0xdefe
+; THUMB: bkpt #0
 
-; ENCODING-NACL: f0 de fe e7 trap
+; ENCODING-NACL: 70 00 20 e1 bkpt #0
 
-; ENCODING-ARM: fe de ff e7 trap
+; ENCODING-ARM: 70 00 20 e1 bkpt #0
 
-; ENCODING-THUMB: fe de trap
+; ENCODING-THUMB: 00 be bkpt #0
 
   call void @llvm.debugtrap()
   unreachable
-- 
GitLab


From 5b4f9ef85eba6aa97adcc8414d4b00c18006a336 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Wed, 24 Oct 2018 18:44:12 +0000
Subject: [PATCH 0515/1116] [CostModel][X86] Add vXi8 vector division by
 constants costs.

ISD::MULHS/ISD::MULHU lowering of vXi8 types means we expand these in TargetLowering BuildSDIV/BuildUDIV.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345175 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86TargetTransformInfo.cpp |  16 ++
 test/Analysis/CostModel/X86/div.ll        | 192 +++++++++++-----------
 test/Analysis/CostModel/X86/rem.ll        | 192 +++++++++++-----------
 test/Analysis/CostModel/X86/vdiv-cost.ll  |   2 +-
 4 files changed, 209 insertions(+), 193 deletions(-)

diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 29306d75454..b77ac5c9953 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -346,6 +346,10 @@ int X86TTIImpl::getArithmeticInstrCost(
   }
 
   static const CostTblEntry AVX512BWConstCostTable[] = {
+    { ISD::SDIV, MVT::v64i8,  14 }, // 2*ext+2*pmulhw sequence
+    { ISD::SREM, MVT::v64i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
+    { ISD::UDIV, MVT::v64i8,  14 }, // 2*ext+2*pmulhw sequence
+    { ISD::UREM, MVT::v64i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
     { ISD::SDIV, MVT::v32i16,  6 }, // vpmulhw sequence
     { ISD::SREM, MVT::v32i16,  8 }, // vpmulhw+mul+sub sequence
     { ISD::UDIV, MVT::v32i16,  6 }, // vpmulhuw sequence
@@ -376,6 +380,10 @@ int X86TTIImpl::getArithmeticInstrCost(
   }
 
   static const CostTblEntry AVX2ConstCostTable[] = {
+    { ISD::SDIV, MVT::v32i8,  14 }, // 2*ext+2*pmulhw sequence
+    { ISD::SREM, MVT::v32i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
+    { ISD::UDIV, MVT::v32i8,  14 }, // 2*ext+2*pmulhw sequence
+    { ISD::UREM, MVT::v32i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
     { ISD::SDIV, MVT::v16i16,  6 }, // vpmulhw sequence
     { ISD::SREM, MVT::v16i16,  8 }, // vpmulhw+mul+sub sequence
     { ISD::UDIV, MVT::v16i16,  6 }, // vpmulhuw sequence
@@ -394,6 +402,14 @@ int X86TTIImpl::getArithmeticInstrCost(
   }
 
   static const CostTblEntry SSE2ConstCostTable[] = {
+    { ISD::SDIV, MVT::v32i8,  28+2 }, // 4*ext+4*pmulhw sequence + split.
+    { ISD::SREM, MVT::v32i8,  32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
+    { ISD::SDIV, MVT::v16i8,    14 }, // 2*ext+2*pmulhw sequence
+    { ISD::SREM, MVT::v16i8,    16 }, // 2*ext+2*pmulhw+mul+sub sequence
+    { ISD::UDIV, MVT::v32i8,  28+2 }, // 4*ext+4*pmulhw sequence + split.
+    { ISD::UREM, MVT::v32i8,  32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
+    { ISD::UDIV, MVT::v16i8,    14 }, // 2*ext+2*pmulhw sequence
+    { ISD::UREM, MVT::v16i8,    16 }, // 2*ext+2*pmulhw+mul+sub sequence
     { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
     { ISD::SREM, MVT::v16i16, 16+2 }, // 2*pmulhw+mul+sub sequence + split.
     { ISD::SDIV, MVT::v8i16,     6 }, // pmulhw sequence
diff --git a/test/Analysis/CostModel/X86/div.ll b/test/Analysis/CostModel/X86/div.ll
index 7a53db1b26b..724f9872417 100644
--- a/test/Analysis/CostModel/X86/div.ll
+++ b/test/Analysis/CostModel/X86/div.ll
@@ -150,9 +150,9 @@ define i32 @sdiv_const() {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'sdiv_const'
@@ -169,9 +169,9 @@ define i32 @sdiv_const() {
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'sdiv_const'
@@ -188,9 +188,9 @@ define i32 @sdiv_const() {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'sdiv_const'
@@ -207,9 +207,9 @@ define i32 @sdiv_const() {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'sdiv_const'
@@ -226,9 +226,9 @@ define i32 @sdiv_const() {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'sdiv_const'
@@ -245,9 +245,9 @@ define i32 @sdiv_const() {
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'sdiv_const'
@@ -264,9 +264,9 @@ define i32 @sdiv_const() {
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'sdiv_const'
@@ -283,9 +283,9 @@ define i32 @sdiv_const() {
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
-; SLM-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'sdiv_const'
@@ -302,9 +302,9 @@ define i32 @sdiv_const() {
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = sdiv i64 undef, 7
@@ -345,9 +345,9 @@ define i32 @udiv_const() {
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
-; SSE-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'udiv_const'
@@ -364,9 +364,9 @@ define i32 @udiv_const() {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'udiv_const'
@@ -383,9 +383,9 @@ define i32 @udiv_const() {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'udiv_const'
@@ -402,9 +402,9 @@ define i32 @udiv_const() {
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'udiv_const'
@@ -421,9 +421,9 @@ define i32 @udiv_const() {
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'udiv_const'
@@ -440,9 +440,9 @@ define i32 @udiv_const() {
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
-; SLM-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'udiv_const'
@@ -459,9 +459,9 @@ define i32 @udiv_const() {
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = udiv i64 undef, 7
@@ -502,9 +502,9 @@ define i32 @sdiv_uniformconst() {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'sdiv_uniformconst'
@@ -521,9 +521,9 @@ define i32 @sdiv_uniformconst() {
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'sdiv_uniformconst'
@@ -540,9 +540,9 @@ define i32 @sdiv_uniformconst() {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'sdiv_uniformconst'
@@ -559,9 +559,9 @@ define i32 @sdiv_uniformconst() {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'sdiv_uniformconst'
@@ -578,9 +578,9 @@ define i32 @sdiv_uniformconst() {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'sdiv_uniformconst'
@@ -597,9 +597,9 @@ define i32 @sdiv_uniformconst() {
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'sdiv_uniformconst'
@@ -616,9 +616,9 @@ define i32 @sdiv_uniformconst() {
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'sdiv_uniformconst'
@@ -635,9 +635,9 @@ define i32 @sdiv_uniformconst() {
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
-; SLM-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'sdiv_uniformconst'
@@ -654,9 +654,9 @@ define i32 @sdiv_uniformconst() {
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = sdiv i64 undef, 7
@@ -697,9 +697,9 @@ define i32 @udiv_uniformconst() {
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
-; SSE-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'udiv_uniformconst'
@@ -716,9 +716,9 @@ define i32 @udiv_uniformconst() {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'udiv_uniformconst'
@@ -735,9 +735,9 @@ define i32 @udiv_uniformconst() {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'udiv_uniformconst'
@@ -754,9 +754,9 @@ define i32 @udiv_uniformconst() {
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'udiv_uniformconst'
@@ -773,9 +773,9 @@ define i32 @udiv_uniformconst() {
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'udiv_uniformconst'
@@ -792,9 +792,9 @@ define i32 @udiv_uniformconst() {
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
-; SLM-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'udiv_uniformconst'
@@ -811,9 +811,9 @@ define i32 @udiv_uniformconst() {
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = udiv i64 undef, 7
diff --git a/test/Analysis/CostModel/X86/rem.ll b/test/Analysis/CostModel/X86/rem.ll
index e28b4d9583f..62de12d57ed 100644
--- a/test/Analysis/CostModel/X86/rem.ll
+++ b/test/Analysis/CostModel/X86/rem.ll
@@ -150,9 +150,9 @@ define i32 @srem_const() {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'srem_const'
@@ -169,9 +169,9 @@ define i32 @srem_const() {
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'srem_const'
@@ -188,9 +188,9 @@ define i32 @srem_const() {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'srem_const'
@@ -207,9 +207,9 @@ define i32 @srem_const() {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'srem_const'
@@ -226,9 +226,9 @@ define i32 @srem_const() {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'srem_const'
@@ -245,9 +245,9 @@ define i32 @srem_const() {
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'srem_const'
@@ -264,9 +264,9 @@ define i32 @srem_const() {
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'srem_const'
@@ -283,9 +283,9 @@ define i32 @srem_const() {
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
-; SLM-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'srem_const'
@@ -302,9 +302,9 @@ define i32 @srem_const() {
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
-; GLM-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; GLM-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; GLM-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'srem_const'
@@ -321,9 +321,9 @@ define i32 @srem_const() {
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = srem i64 undef, 7
@@ -364,9 +364,9 @@ define i32 @urem_const() {
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = urem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = urem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 7
-; SSE-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = urem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = urem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = urem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = urem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = urem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V64i8 = urem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'urem_const'
@@ -383,9 +383,9 @@ define i32 @urem_const() {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16i16 = urem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32i16 = urem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 7
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = urem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = urem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = urem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = urem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32i8 = urem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64i8 = urem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'urem_const'
@@ -402,9 +402,9 @@ define i32 @urem_const() {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = urem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i16 = urem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 7
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = urem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = urem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = urem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = urem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i8 = urem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64i8 = urem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'urem_const'
@@ -421,9 +421,9 @@ define i32 @urem_const() {
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = urem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i16 = urem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 7
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = urem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = urem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = urem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = urem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i8 = urem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64i8 = urem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'urem_const'
@@ -440,9 +440,9 @@ define i32 @urem_const() {
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = urem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32i16 = urem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 7
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = urem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = urem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = urem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = urem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i8 = urem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64i8 = urem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'urem_const'
@@ -459,9 +459,9 @@ define i32 @urem_const() {
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16i16 = urem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32i16 = urem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 7
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = urem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = urem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = urem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = urem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32i8 = urem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64i8 = urem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = urem i64 undef, 7
@@ -502,9 +502,9 @@ define i32 @srem_uniformconst() {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = srem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = srem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'srem_uniformconst'
@@ -521,9 +521,9 @@ define i32 @srem_uniformconst() {
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = srem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = srem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'srem_uniformconst'
@@ -540,9 +540,9 @@ define i32 @srem_uniformconst() {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = srem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = srem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'srem_uniformconst'
@@ -559,9 +559,9 @@ define i32 @srem_uniformconst() {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16i16 = srem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32i16 = srem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'srem_uniformconst'
@@ -578,9 +578,9 @@ define i32 @srem_uniformconst() {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = srem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i16 = srem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'srem_uniformconst'
@@ -597,9 +597,9 @@ define i32 @srem_uniformconst() {
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = srem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i16 = srem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'srem_uniformconst'
@@ -616,9 +616,9 @@ define i32 @srem_uniformconst() {
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = srem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32i16 = srem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'srem_uniformconst'
@@ -635,9 +635,9 @@ define i32 @srem_uniformconst() {
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = srem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = srem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
-; SLM-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'srem_uniformconst'
@@ -654,9 +654,9 @@ define i32 @srem_uniformconst() {
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = srem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = srem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
-; GLM-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; GLM-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; GLM-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'srem_uniformconst'
@@ -673,9 +673,9 @@ define i32 @srem_uniformconst() {
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16i16 = srem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32i16 = srem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = srem i64 undef, 7
@@ -716,9 +716,9 @@ define i32 @urem_uniformconst() {
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = urem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = urem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 7
-; SSE-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = urem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = urem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = urem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = urem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = urem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V64i8 = urem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'urem_uniformconst'
@@ -735,9 +735,9 @@ define i32 @urem_uniformconst() {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16i16 = urem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32i16 = urem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 7
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = urem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = urem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = urem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = urem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32i8 = urem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64i8 = urem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'urem_uniformconst'
@@ -754,9 +754,9 @@ define i32 @urem_uniformconst() {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = urem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i16 = urem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 7
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = urem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = urem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = urem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = urem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i8 = urem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64i8 = urem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'urem_uniformconst'
@@ -773,9 +773,9 @@ define i32 @urem_uniformconst() {
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = urem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i16 = urem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 7
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = urem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = urem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = urem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = urem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i8 = urem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64i8 = urem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'urem_uniformconst'
@@ -792,9 +792,9 @@ define i32 @urem_uniformconst() {
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = urem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32i16 = urem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 7
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = urem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = urem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = urem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = urem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i8 = urem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64i8 = urem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'urem_uniformconst'
@@ -811,9 +811,9 @@ define i32 @urem_uniformconst() {
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16i16 = urem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32i16 = urem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 7
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = urem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = urem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = urem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = urem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32i8 = urem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64i8 = urem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = urem i64 undef, 7
diff --git a/test/Analysis/CostModel/X86/vdiv-cost.ll b/test/Analysis/CostModel/X86/vdiv-cost.ll
index d5b404227aa..d87d21c487d 100644
--- a/test/Analysis/CostModel/X86/vdiv-cost.ll
+++ b/test/Analysis/CostModel/X86/vdiv-cost.ll
@@ -100,7 +100,7 @@ define <16 x i16> @test6(<16 x i16> %a) {
 
 define <16 x i8> @test7(<16 x i8> %a) {
 ; CHECK-LABEL: 'test7'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %div = sdiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %div = sdiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %div
 ;
   %div = sdiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
-- 
GitLab


From 0c69da83b35ea1d512e10c2b4addd175e15a5c0d Mon Sep 17 00:00:00 2001
From: Teresa Johnson <tejohnson@google.com>
Date: Wed, 24 Oct 2018 18:53:47 +0000
Subject: [PATCH 0516/1116] [hot-cold-split] Name split functions with ".cold"
 suffix

Summary:
The current default of appending "_"+entry block label to the new
extracted cold function breaks demangling. Change the deliminator from
"_" to "." to enable demangling. Because the header block label will
be empty for release compile code, use "extracted" after the "." when
the label is empty.

Additionally, add a mechanism for the client to pass in an alternate
suffix applied after the ".", and have the hot cold split pass use
"cold."+Count, where the Count is currently 1 but can be used to
uniquely number multiple cold functions split out from the same function
with D53588.

Reviewers: sebpop, hiraditya

Subscribers: llvm-commits, erik.pilkington

Differential Revision: https://reviews.llvm.org/D53534

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345178 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Transforms/Utils/CodeExtractor.h    | 11 +++++++++--
 lib/Transforms/IPO/HotColdSplitting.cpp          | 16 +++++++++-------
 lib/Transforms/Utils/CodeExtractor.cpp           | 16 +++++++++++-----
 test/Transforms/BlockExtractor/extract-blocks.ll |  8 ++++----
 .../CodeExtractor/ExtractedFnEntryCount.ll       |  2 +-
 .../Transforms/CodeExtractor/PartialInlineAnd.ll |  4 ++--
 .../CodeExtractor/PartialInlineAndOr.ll          |  2 +-
 .../CodeExtractor/PartialInlineAttributes.ll     |  8 ++++----
 .../CodeExtractor/PartialInlineDebug.ll          |  8 ++++----
 .../CodeExtractor/PartialInlineEntryUpdate.ll    |  6 +++---
 .../PartialInlineInvokeProducesOutVal.ll         |  4 ++--
 .../CodeExtractor/PartialInlineLiveAcross.ll     |  4 ++--
 .../CodeExtractor/PartialInlineNoLiveOut.ll      |  4 ++--
 test/Transforms/CodeExtractor/PartialInlineOr.ll |  4 ++--
 .../CodeExtractor/PartialInlineOrAnd.ll          |  4 ++--
 .../CodeExtractor/PartialInlinePGOMultiRegion.ll |  8 ++++----
 .../CodeExtractor/PartialInlinePGORegion.ll      |  4 ++--
 .../CodeExtractor/PartialInlineVarArg.ll         |  6 +++---
 .../CodeExtractor/PartialInlineVarArgsDebug.ll   |  4 ++--
 test/Transforms/CodeExtractor/SingleCondition.ll |  2 +-
 .../CodeExtractor/X86/InheritTargetAttributes.ll |  2 +-
 test/Transforms/CodeExtractor/cost.ll            |  4 ++--
 test/Transforms/CodeExtractor/cost_meta.ll       |  2 +-
 test/Transforms/CodeExtractor/inline_eh.ll       |  4 ++--
 test/Transforms/CodeExtractor/inline_eh_1.ll     |  4 ++--
 test/Transforms/CodeExtractor/live_shrink.ll     |  4 ++--
 test/Transforms/CodeExtractor/live_shrink_gep.ll |  4 ++--
 .../CodeExtractor/live_shrink_hoist.ll           |  2 +-
 .../CodeExtractor/live_shrink_multiple.ll        |  2 +-
 .../CodeExtractor/unreachable-block.ll           |  4 ++--
 test/Transforms/HotColdSplit/minsize.ll          |  4 ++--
 test/Transforms/HotColdSplit/split-cold-2.ll     |  4 ++--
 .../HotColdSplit/split-out-dbg-val-of-arg.ll     |  2 +-
 test/tools/llvm-extract/extract-block.ll         |  2 +-
 .../llvm-extract/extract-multiple-blocks.ll      |  4 ++--
 35 files changed, 94 insertions(+), 79 deletions(-)

diff --git a/include/llvm/Transforms/Utils/CodeExtractor.h b/include/llvm/Transforms/Utils/CodeExtractor.h
index 0e5254acb0d..13bef841805 100644
--- a/include/llvm/Transforms/Utils/CodeExtractor.h
+++ b/include/llvm/Transforms/Utils/CodeExtractor.h
@@ -64,6 +64,11 @@ class Value;
     unsigned NumExitBlocks = std::numeric_limits<unsigned>::max();
     Type *RetTy;
 
+    // Suffix to use when creating extracted function (appended to the original
+    // function name + "."). If empty, the default is to use the entry block
+    // label, if non-empty, otherwise "extracted".
+    std::string Suffix;
+
   public:
     /// Create a code extractor for a sequence of blocks.
     ///
@@ -78,7 +83,8 @@ class Value;
     CodeExtractor(ArrayRef<BasicBlock *> BBs, DominatorTree *DT = nullptr,
                   bool AggregateArgs = false, BlockFrequencyInfo *BFI = nullptr,
                   BranchProbabilityInfo *BPI = nullptr,
-                  bool AllowVarArgs = false, bool AllowAlloca = false);
+                  bool AllowVarArgs = false, bool AllowAlloca = false,
+                  std::string Suffix = "");
 
     /// Create a code extractor for a loop body.
     ///
@@ -86,7 +92,8 @@ class Value;
     /// block sequence of the loop.
     CodeExtractor(DominatorTree &DT, Loop &L, bool AggregateArgs = false,
                   BlockFrequencyInfo *BFI = nullptr,
-                  BranchProbabilityInfo *BPI = nullptr);
+                  BranchProbabilityInfo *BPI = nullptr,
+                  std::string Suffix = "");
 
     /// Perform the extraction, returning the new function.
     ///
diff --git a/lib/Transforms/IPO/HotColdSplitting.cpp b/lib/Transforms/IPO/HotColdSplitting.cpp
index d3e086e972a..a63cd842241 100644
--- a/lib/Transforms/IPO/HotColdSplitting.cpp
+++ b/lib/Transforms/IPO/HotColdSplitting.cpp
@@ -265,7 +265,7 @@ private:
                                     DominatorTree *DT, PostDomTree *PDT);
   Function *extractColdRegion(const SmallVectorImpl<BasicBlock *> &Region,
                               DominatorTree *DT, BlockFrequencyInfo *BFI,
-                              OptimizationRemarkEmitter &ORE);
+                              OptimizationRemarkEmitter &ORE, unsigned Count);
   bool isOutlineCandidate(const SmallVectorImpl<BasicBlock *> &Region,
                           const BasicBlock *Exit) const {
     if (!Exit)
@@ -331,16 +331,18 @@ bool HotColdSplitting::shouldOutlineFrom(const Function &F) const {
   return true;
 }
 
-Function *
-HotColdSplitting::extractColdRegion(const SmallVectorImpl<BasicBlock *> &Region,
-                                    DominatorTree *DT, BlockFrequencyInfo *BFI,
-                                    OptimizationRemarkEmitter &ORE) {
+Function *HotColdSplitting::extractColdRegion(
+    const SmallVectorImpl<BasicBlock *> &Region, DominatorTree *DT,
+    BlockFrequencyInfo *BFI, OptimizationRemarkEmitter &ORE, unsigned Count) {
   assert(!Region.empty());
   LLVM_DEBUG(for (auto *BB : Region)
           llvm::dbgs() << "\nExtracting: " << *BB;);
 
   // TODO: Pass BFI and BPI to update profile information.
-  CodeExtractor CE(Region, DT);
+  CodeExtractor CE(Region, DT, /* AggregateArgs */ false, /* BFI */ nullptr,
+                   /* BPI */ nullptr, /* AllowVarArgs */ false,
+                   /* AllowAlloca */ false,
+                   /* Suffix */ "cold." + std::to_string(Count));
 
   SetVector<Value *> Inputs, Outputs, Sinks;
   CE.findInputsOutputs(Inputs, Outputs, Sinks);
@@ -426,7 +428,7 @@ const Function *HotColdSplitting::outlineColdBlocks(Function &F,
         ++NumColdSESEFound;
         ValidColdRegion.push_back(ExitColdRegion);
         // Candidate for outlining. FIXME: Continue outlining.
-        return extractColdRegion(ValidColdRegion, DT, BFI, ORE);
+        return extractColdRegion(ValidColdRegion, DT, BFI, ORE, /* Count */ 1);
       }
     }
   }
diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp
index 27b982578c4..328fe1fac65 100644
--- a/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/lib/Transforms/Utils/CodeExtractor.cpp
@@ -228,19 +228,21 @@ buildExtractionBlockSet(ArrayRef<BasicBlock *> BBs, DominatorTree *DT,
 CodeExtractor::CodeExtractor(ArrayRef<BasicBlock *> BBs, DominatorTree *DT,
                              bool AggregateArgs, BlockFrequencyInfo *BFI,
                              BranchProbabilityInfo *BPI, bool AllowVarArgs,
-                             bool AllowAlloca)
+                             bool AllowAlloca, std::string Suffix)
     : DT(DT), AggregateArgs(AggregateArgs || AggregateArgsOpt), BFI(BFI),
       BPI(BPI), AllowVarArgs(AllowVarArgs),
-      Blocks(buildExtractionBlockSet(BBs, DT, AllowVarArgs, AllowAlloca)) {}
+      Blocks(buildExtractionBlockSet(BBs, DT, AllowVarArgs, AllowAlloca)),
+      Suffix(Suffix) {}
 
 CodeExtractor::CodeExtractor(DominatorTree &DT, Loop &L, bool AggregateArgs,
                              BlockFrequencyInfo *BFI,
-                             BranchProbabilityInfo *BPI)
+                             BranchProbabilityInfo *BPI, std::string Suffix)
     : DT(&DT), AggregateArgs(AggregateArgs || AggregateArgsOpt), BFI(BFI),
       BPI(BPI), AllowVarArgs(false),
       Blocks(buildExtractionBlockSet(L.getBlocks(), &DT,
                                      /* AllowVarArgs */ false,
-                                     /* AllowAlloca */ false)) {}
+                                     /* AllowAlloca */ false)),
+      Suffix(Suffix) {}
 
 /// definedInRegion - Return true if the specified value is defined in the
 /// extracted region.
@@ -669,10 +671,14 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
                   FunctionType::get(RetTy, paramTy,
                                     AllowVarArgs && oldFunction->isVarArg());
 
+  std::string SuffixToUse =
+      Suffix.empty()
+          ? (header->getName().empty() ? "extracted" : header->getName().str())
+          : Suffix;
   // Create the new function
   Function *newFunction = Function::Create(
       funcType, GlobalValue::InternalLinkage, oldFunction->getAddressSpace(),
-      oldFunction->getName() + "_" + header->getName(), M);
+      oldFunction->getName() + "." + SuffixToUse, M);
   // If the old function is no-throw, so is the new one.
   if (oldFunction->doesNotThrow())
     newFunction->setDoesNotThrow();
diff --git a/test/Transforms/BlockExtractor/extract-blocks.ll b/test/Transforms/BlockExtractor/extract-blocks.ll
index 47e5fc30849..e720953a1e7 100644
--- a/test/Transforms/BlockExtractor/extract-blocks.ll
+++ b/test/Transforms/BlockExtractor/extract-blocks.ll
@@ -4,11 +4,11 @@
 ; RUN: opt -S -extract-blocks -extract-blocks-file=%t -extract-blocks-erase-funcs %s | FileCheck %s --check-prefix=CHECK-ERASE
 
 ; CHECK-NO-ERASE: @foo(
-; CHECK-NO-ERASE: @foo_bb9(
-; CHECK-NO-ERASE: @foo_bb20(
+; CHECK-NO-ERASE: @foo.bb9(
+; CHECK-NO-ERASE: @foo.bb20(
 ; CHECK-ERASE: declare i32 @foo(
-; CHECK-ERASE: @foo_bb9(
-; CHECK-ERASE: @foo_bb20(
+; CHECK-ERASE: @foo.bb9(
+; CHECK-ERASE: @foo.bb20(
 define i32 @foo(i32 %arg, i32 %arg1) {
 bb:
   %tmp5 = icmp sgt i32 %arg, 0
diff --git a/test/Transforms/CodeExtractor/ExtractedFnEntryCount.ll b/test/Transforms/CodeExtractor/ExtractedFnEntryCount.ll
index 8313cfac04e..55c44e1e832 100644
--- a/test/Transforms/CodeExtractor/ExtractedFnEntryCount.ll
+++ b/test/Transforms/CodeExtractor/ExtractedFnEntryCount.ll
@@ -22,7 +22,7 @@ entry:
   ret i32 %val
 }
 
-; CHECK: @inlinedFunc.1_if.then(i1 %cond) !prof [[COUNT1:![0-9]+]]
+; CHECK: @inlinedFunc.1.if.then(i1 %cond) !prof [[COUNT1:![0-9]+]]
 
 
 !llvm.module.flags = !{!0}
diff --git a/test/Transforms/CodeExtractor/PartialInlineAnd.ll b/test/Transforms/CodeExtractor/PartialInlineAnd.ll
index d32d834d2df..6d555b740e5 100644
--- a/test/Transforms/CodeExtractor/PartialInlineAnd.ll
+++ b/test/Transforms/CodeExtractor/PartialInlineAnd.ll
@@ -41,11 +41,11 @@ bb:
 ; CHECK-LABEL: @dummy_caller
 ; CHECK: br i1
 ; CHECK: br i1
-; CHECK: call void @bar.1_
+; CHECK: call void @bar.1.
 ; LIMIT-LABEL: @dummy_caller
 ; LIMIT: br i1
 ; LIMIT-NOT: br
-; LIMIT: call void @bar.1_
+; LIMIT: call void @bar.1.
   %tmp = tail call i32 @bar(i32 %arg)
   ret i32 %tmp
 }
diff --git a/test/Transforms/CodeExtractor/PartialInlineAndOr.ll b/test/Transforms/CodeExtractor/PartialInlineAndOr.ll
index 485e06ce102..9da9ed4437e 100644
--- a/test/Transforms/CodeExtractor/PartialInlineAndOr.ll
+++ b/test/Transforms/CodeExtractor/PartialInlineAndOr.ll
@@ -49,7 +49,7 @@ bb:
 ; CHECK: br i1
 ; CHECK: br i1
 ; CHECK: br i1
-; CHECK: call void @bar.1_
+; CHECK: call void @bar.1.
 ; LIMIT-LABEL: @dummy_caller
 ; LIMIT-NOT: br i1
 ; LIMIT: call i32 @bar
diff --git a/test/Transforms/CodeExtractor/PartialInlineAttributes.ll b/test/Transforms/CodeExtractor/PartialInlineAttributes.ll
index 40170846392..18c934bc6a1 100644
--- a/test/Transforms/CodeExtractor/PartialInlineAttributes.ll
+++ b/test/Transforms/CodeExtractor/PartialInlineAttributes.ll
@@ -55,9 +55,9 @@ if.end:
   ret i32 %add
 }
 ; CHECK-LABEL: @caller
-; CHECK: call void @callee_most.2_if.then(i32 %v
+; CHECK: call void @callee_most.2.if.then(i32 %v
 ; CHECK: call i32 @callee_noinline(i32 %v)
-; CHECK: call void @callee_writeonly.1_if.then(i32 %v
+; CHECK: call void @callee_writeonly.1.if.then(i32 %v
 define i32 @caller(i32 %v) {
 entry:
   %c1 = call i32 @callee_most(i32 %v)
@@ -66,8 +66,8 @@ entry:
   ret i32 %c3
 }
 
-; CHECK: define internal void @callee_writeonly.1_if.then(i32 %v, i32* %sub.out) { 
-; CHECK: define internal void @callee_most.2_if.then(i32 %v, i32* %sub.out)  [[FN_ATTRS:#[0-9]+]]
+; CHECK: define internal void @callee_writeonly.1.if.then(i32 %v, i32* %sub.out) { 
+; CHECK: define internal void @callee_most.2.if.then(i32 %v, i32* %sub.out)  [[FN_ATTRS:#[0-9]+]]
 
 ; attributes to preserve
 attributes #0 = {
diff --git a/test/Transforms/CodeExtractor/PartialInlineDebug.ll b/test/Transforms/CodeExtractor/PartialInlineDebug.ll
index 5d9e64dc277..c0bc66db0ee 100644
--- a/test/Transforms/CodeExtractor/PartialInlineDebug.ll
+++ b/test/Transforms/CodeExtractor/PartialInlineDebug.ll
@@ -23,7 +23,7 @@ if.end:                                           ; preds = %if.then, %entry
 
 ; CHECK-LABEL: @caller
 ; CHECK: codeRepl.i:
-; CHECK-NEXT: call void @callee.2_if.then(i32 %v, i32* %mul.loc.i), !dbg ![[DBG2:[0-9]+]]
+; CHECK-NEXT: call void @callee.2.if.then(i32 %v, i32* %mul.loc.i), !dbg ![[DBG2:[0-9]+]]
 define i32 @caller(i32 %v) !dbg !8 {
 entry:
   %call = call i32 @callee(i32 %v), !dbg !14
@@ -53,17 +53,17 @@ if.end:
 
 ; CHECK-LABEL: @caller2
 ; CHECK: codeRepl.i:
-; CHECK-NEXT: call void @callee2.1_if.then(i32 %v, i32* %sub.loc.i), !dbg ![[DBG4:[0-9]+]]
+; CHECK-NEXT: call void @callee2.1.if.then(i32 %v, i32* %sub.loc.i), !dbg ![[DBG4:[0-9]+]]
 define i32 @caller2(i32 %v) !dbg !21 {
 entry:
   %call = call i32 @callee2(i32 %v), !dbg !22
   ret i32 %call
 }
 
-; CHECK-LABEL: define internal void @callee2.1_if.then
+; CHECK-LABEL: define internal void @callee2.1.if.then
 ; CHECK: br label %if.then, !dbg ![[DBG5:[0-9]+]]
 
-; CHECK-LABEL: define internal void @callee.2_if.then
+; CHECK-LABEL: define internal void @callee.2.if.then
 ; CHECK: br label %if.then, !dbg ![[DBG6:[0-9]+]]
 
 ; CHECK: ![[DBG1]] = !DILocation(line: 10, column: 7,
diff --git a/test/Transforms/CodeExtractor/PartialInlineEntryUpdate.ll b/test/Transforms/CodeExtractor/PartialInlineEntryUpdate.ll
index 3a7a9752e50..0efc8299dab 100644
--- a/test/Transforms/CodeExtractor/PartialInlineEntryUpdate.ll
+++ b/test/Transforms/CodeExtractor/PartialInlineEntryUpdate.ll
@@ -17,9 +17,9 @@ define internal i32 @Caller1(i1 %cond, i32* align 2 %align.val) !prof !3{
 entry:
 ; CHECK-LABEL: @Caller1
 ; CHECK: br
-; CHECK: call void @Func.1_ 
+; CHECK: call void @Func.1.
 ; CHECK: br
-; CHECK: call void @Func.1_ 
+; CHECK: call void @Func.1.
   %val = call i32 @Func(i1 %cond, i32* %align.val)
   %val2 = call i32 @Func(i1 %cond, i32* %align.val)
   ret i32 %val
@@ -29,7 +29,7 @@ define internal i32 @Caller2(i1 %cond, i32* align 2 %align.val) !prof !2{
 entry:
 ; CHECK-LABEL: @Caller2
 ; CHECK: br
-; CHECK: call void @Func.1_ 
+; CHECK: call void @Func.1.
   %val = call i32 @Func(i1 %cond, i32* %align.val)
   ret i32 %val
 }
diff --git a/test/Transforms/CodeExtractor/PartialInlineInvokeProducesOutVal.ll b/test/Transforms/CodeExtractor/PartialInlineInvokeProducesOutVal.ll
index bba7ad05fac..bc6f780c5a8 100644
--- a/test/Transforms/CodeExtractor/PartialInlineInvokeProducesOutVal.ll
+++ b/test/Transforms/CodeExtractor/PartialInlineInvokeProducesOutVal.ll
@@ -26,14 +26,14 @@ bb5:                                              ; preds = %bb4, %bb1, %bb
 ; CHECK-LABEL: bb:
 ; CHECK-NEXT:  [[CALL26LOC:%.*]] = alloca i8*
 ; CHECK-LABEL: codeRepl.i:
-; CHECK-NEXT:   call void @bar.1_bb1(i8** [[CALL26LOC]])
+; CHECK-NEXT:   call void @bar.1.bb1(i8** [[CALL26LOC]])
 define i8* @dummy_caller(i32 %arg) {
 bb:
   %tmp = tail call i8* @bar(i32 %arg)
   ret i8* %tmp
 }
 
-; CHECK-LABEL: define internal void @bar.1_bb1
+; CHECK-LABEL: define internal void @bar.1.bb1
 ; CHECK-LABEL: bb1:
 ; CHECK-NEXT:    %call26 = invoke i8* @invoke_callee()
 ; CHECK-NEXT:            to label %cont unwind label %lpad
diff --git a/test/Transforms/CodeExtractor/PartialInlineLiveAcross.ll b/test/Transforms/CodeExtractor/PartialInlineLiveAcross.ll
index e8a4d1281a2..1e1a1b062d4 100644
--- a/test/Transforms/CodeExtractor/PartialInlineLiveAcross.ll
+++ b/test/Transforms/CodeExtractor/PartialInlineLiveAcross.ll
@@ -36,7 +36,7 @@ declare void @foo(...) local_unnamed_addr #1
 define i32 @dummy_caller(i32 %arg) local_unnamed_addr #0 {
 ; CHECK-LABEL: @dummy_caller
 ; CHECK: codeRepl.i:
-; CHECK:  call void @test.1_bb2()
+; CHECK:  call void @test.1.bb2()
 ; CHECK-NOT: load
 ; CHECK  br
 
@@ -45,7 +45,7 @@ bb:
   ret i32 %tmp
 }
 
-; CHECK-LABEL: define internal void @test.1_bb2()
+; CHECK-LABEL: define internal void @test.1.bb2()
 ; CHECK: .exitStub:
 ; CHECK-NOT:  store i32 %tmp7, i32* %tmp7.out
 ; CHECK: ret
diff --git a/test/Transforms/CodeExtractor/PartialInlineNoLiveOut.ll b/test/Transforms/CodeExtractor/PartialInlineNoLiveOut.ll
index a48ff4b1b8f..d41492f8ffd 100644
--- a/test/Transforms/CodeExtractor/PartialInlineNoLiveOut.ll
+++ b/test/Transforms/CodeExtractor/PartialInlineNoLiveOut.ll
@@ -39,7 +39,7 @@ declare void @foo(...) local_unnamed_addr #0
 define i32 @dummy_caller(i32 %arg) local_unnamed_addr #0 {
 ; CHECK-LABEL: @dummy_caller
 ; CHECK: codeRepl.i:
-; CHECK:  call void @test.1_bb2()
+; CHECK:  call void @test.1.bb2()
 ; CHECK-NOT: load
 ; CHECK  br
 bb:
@@ -47,7 +47,7 @@ bb:
   ret i32 %tmp
 }
 
-; CHECK-LABEL: define internal void @test.1_bb2()
+; CHECK-LABEL: define internal void @test.1.bb2()
 ; CHECK: .exitStub:
 ; CHECK-NOT:  store i32 %tmp7, i32* %tmp7.out
 ; CHECK: ret
diff --git a/test/Transforms/CodeExtractor/PartialInlineOr.ll b/test/Transforms/CodeExtractor/PartialInlineOr.ll
index 758945c7ade..cbf7a47de9b 100644
--- a/test/Transforms/CodeExtractor/PartialInlineOr.ll
+++ b/test/Transforms/CodeExtractor/PartialInlineOr.ll
@@ -41,7 +41,7 @@ bb:
 ; CHECK-LABEL: @dummy_caller
 ; CHECK: br i1
 ; CHECK: br i1
-; CHECK: call void @bar.2_
+; CHECK: call void @bar.2.
 ; LIMIT-LABEL: @dummy_caller
 ; LIMIT-NOT: br
 ; LIMIT: call i32 @bar(
@@ -84,7 +84,7 @@ bb5:                                              ; preds = %bb4, %bb1
 define i32 @dummy_caller2(i32 %arg) local_unnamed_addr #0 {
 ; CHECK: br i1
 ; CHECK: br i1
-; CHECK: call {{.*}} @bar_multi_ret.1_
+; CHECK: call {{.*}} @bar_multi_ret.1.
   %tmp = tail call i32 @bar_multi_ret(i32 %arg)
   ret i32 %tmp
 }
diff --git a/test/Transforms/CodeExtractor/PartialInlineOrAnd.ll b/test/Transforms/CodeExtractor/PartialInlineOrAnd.ll
index fb6d1c33536..09d0e2503ea 100644
--- a/test/Transforms/CodeExtractor/PartialInlineOrAnd.ll
+++ b/test/Transforms/CodeExtractor/PartialInlineOrAnd.ll
@@ -52,12 +52,12 @@ bb:
 ; CHECK: br i1
 ; CHECK: br i1
 ; CHECK: br i1
-; CHECK: call void @bar.1_
+; CHECK: call void @bar.1.
 ; LIMIT3-LABEL: @dummy_caller
 ; LIMIT3: br i1
 ; LIMIT3: br i1
 ; LIMIT3-NOT: br i1
-; LIMIT3: call void @bar.1_
+; LIMIT3: call void @bar.1.
 ; LIMIT2-LABEL: @dummy_caller
 ; LIMIT2-NOT: br i1
 ; LIMIT2: call i32 @bar(
diff --git a/test/Transforms/CodeExtractor/PartialInlinePGOMultiRegion.ll b/test/Transforms/CodeExtractor/PartialInlinePGOMultiRegion.ll
index a51bdd01df5..5d187abb68a 100644
--- a/test/Transforms/CodeExtractor/PartialInlinePGOMultiRegion.ll
+++ b/test/Transforms/CodeExtractor/PartialInlinePGOMultiRegion.ll
@@ -109,9 +109,9 @@ define signext i32 @foo(i32 signext %value, i32 signext %ub) #0 !prof !30 {
 ; CHECK-LABEL: @foo
 ; CHECK-NOT: call signext i32 @bar
 ; CHECK: codeRepl1.i:
-; CHECK: call void @bar.1_if.then
+; CHECK: call void @bar.1.if.then
 ; CHECK: codeRepl.i:
-; CHECK: call void @bar.1_if.then2
+; CHECK: call void @bar.1.if.then2
 entry:
   %value.addr = alloca i32, align 4
   %ub.addr = alloca i32, align 4
@@ -123,11 +123,11 @@ entry:
   ret i32 %call
 }
 
-; CHECK-LABEL: define internal void @bar.1_if.then2
+; CHECK-LABEL: define internal void @bar.1.if.then2
 ; CHECK: .exitStub:
 ; CHECK: ret void
 
-; CHECK-LABEL: define internal void @bar.1_if.then
+; CHECK-LABEL: define internal void @bar.1.if.then
 ; CHECK: .exitStub:
 ; CHECK: ret void
 
diff --git a/test/Transforms/CodeExtractor/PartialInlinePGORegion.ll b/test/Transforms/CodeExtractor/PartialInlinePGORegion.ll
index 27c858f3de6..4aa70624315 100644
--- a/test/Transforms/CodeExtractor/PartialInlinePGORegion.ll
+++ b/test/Transforms/CodeExtractor/PartialInlinePGORegion.ll
@@ -66,7 +66,7 @@ define signext i32 @foo(i32 signext %value, i32 signext %ub) #0 !prof !30 {
 ; CHECK-LABEL: @foo
 ; CHECK: codeRepl.i:
 ; CHECK-NOT: call signext i32 @bar
-; CHECK: call void @bar.1_if.then
+; CHECK: call void @bar.1.if.then
 entry:
   %value.addr = alloca i32, align 4
   %ub.addr = alloca i32, align 4
@@ -78,7 +78,7 @@ entry:
   ret i32 %call
 }
 
-; CHECK-LABEL: define internal void @bar.1_if.then
+; CHECK-LABEL: define internal void @bar.1.if.then
 ; CHECK: .exitStub:
 ; CHECK: ret void
 
diff --git a/test/Transforms/CodeExtractor/PartialInlineVarArg.ll b/test/Transforms/CodeExtractor/PartialInlineVarArg.ll
index bf6db27c959..8582f5e18f8 100644
--- a/test/Transforms/CodeExtractor/PartialInlineVarArg.ll
+++ b/test/Transforms/CodeExtractor/PartialInlineVarArg.ll
@@ -36,7 +36,7 @@ bb:
 }
 ; CHECK-LABEL: @caller1
 ; CHECK: codeRepl.i:
-; CHECK-NEXT:  call void (i32, i8**, i32, ...) @vararg.3_bb1(i32 %stat1.i, i8** %vargs.i, i32 %arg)
+; CHECK-NEXT:  call void (i32, i8**, i32, ...) @vararg.3.bb1(i32 %stat1.i, i8** %vargs.i, i32 %arg)
 
 define i32 @caller2(i32 %arg, float %arg2) {
 bb:
@@ -46,7 +46,7 @@ bb:
 
 ; CHECK-LABEL: @caller2
 ; CHECK: codeRepl.i:
-; CHECK-NEXT:  call void (i32, i8**, i32, ...) @vararg.3_bb1(i32 %stat1.i, i8** %vargs.i, i32 %arg, i32 10, float %arg2)
+; CHECK-NEXT:  call void (i32, i8**, i32, ...) @vararg.3.bb1(i32 %stat1.i, i8** %vargs.i, i32 %arg, i32 10, float %arg2)
 
 ; Test case to check that we do not extract a vararg function, if va_end is in
 ; a block that is not outlined.
@@ -104,4 +104,4 @@ entry:
 
 ; CHECK-LABEL: @caller_with_signext
 ; CHECK: codeRepl.i:
-; CHECK-NEXT:  call void (i32*, ...) @vararg2.1_cond.end(i32* %foo, i32 signext 8)
+; CHECK-NEXT:  call void (i32*, ...) @vararg2.1.cond.end(i32* %foo, i32 signext 8)
diff --git a/test/Transforms/CodeExtractor/PartialInlineVarArgsDebug.ll b/test/Transforms/CodeExtractor/PartialInlineVarArgsDebug.ll
index 1a3d3ee4401..02f695d3662 100644
--- a/test/Transforms/CodeExtractor/PartialInlineVarArgsDebug.ll
+++ b/test/Transforms/CodeExtractor/PartialInlineVarArgsDebug.ll
@@ -19,14 +19,14 @@ if.end:                                           ; preds = %if.then, %entry
 
 ; CHECK-LABEL: @caller
 ; CHECK: codeRepl.i:
-; CHECK-NEXT: call void (i32, i32*, ...) @callee.1_if.then(i32 %v, i32* %mul.loc.i, i32 99), !dbg ![[DBG2:[0-9]+]]
+; CHECK-NEXT: call void (i32, i32*, ...) @callee.1.if.then(i32 %v, i32* %mul.loc.i, i32 99), !dbg ![[DBG2:[0-9]+]]
 define i32 @caller(i32 %v) !dbg !8 {
 entry:
   %call = call i32 (i32, ...) @callee(i32 %v, i32 99), !dbg !14
   ret i32 %call, !dbg !15
 }
 
-; CHECK-LABEL: define internal void @callee.1_if.then
+; CHECK-LABEL: define internal void @callee.1.if.then
 ; CHECK: br label %if.then, !dbg ![[DBG3:[0-9]+]]
 
 ; CHECK: ![[DBG1]] = !DILocation(line: 10, column: 7,
diff --git a/test/Transforms/CodeExtractor/SingleCondition.ll b/test/Transforms/CodeExtractor/SingleCondition.ll
index 4110cd95b7e..334364484ee 100644
--- a/test/Transforms/CodeExtractor/SingleCondition.ll
+++ b/test/Transforms/CodeExtractor/SingleCondition.ll
@@ -16,7 +16,7 @@ define internal i32 @dummyCaller(i1 %cond, i32* align 2 %align.val) {
 entry:
 ; CHECK-LABEL: @dummyCaller
 ; CHECK: br
-; CHECK: call void @inlinedFunc.1_ 
+; CHECK: call void @inlinedFunc.1.
   %val = call i32 @inlinedFunc(i1 %cond, i32* %align.val)
   ret i32 %val
 }
diff --git a/test/Transforms/CodeExtractor/X86/InheritTargetAttributes.ll b/test/Transforms/CodeExtractor/X86/InheritTargetAttributes.ll
index 0f8a71907d8..e6a5113261e 100644
--- a/test/Transforms/CodeExtractor/X86/InheritTargetAttributes.ll
+++ b/test/Transforms/CodeExtractor/X86/InheritTargetAttributes.ll
@@ -36,5 +36,5 @@ entry:
 attributes #0 = { nounwind readnone }
 attributes #1 = { nounwind uwtable "target-cpu"="x86-64" "target-features"="+sse4.1" }
 
-; CHECK: define {{.*}} @inlinedFunc.1_if.then{{.*}} [[COUNT1:#[0-9]+]]
+; CHECK: define {{.*}} @inlinedFunc.1.if.then{{.*}} [[COUNT1:#[0-9]+]]
 ; CHECK: [[COUNT1]] = { {{.*}} "target-cpu"="x86-64" "target-features"="+sse4.1" }
diff --git a/test/Transforms/CodeExtractor/cost.ll b/test/Transforms/CodeExtractor/cost.ll
index 4ac5acee019..841b42b7c35 100644
--- a/test/Transforms/CodeExtractor/cost.ll
+++ b/test/Transforms/CodeExtractor/cost.ll
@@ -47,14 +47,14 @@ declare i32 @foo(i32* %arg)
 define i32 @dummy_caller(i32* %arg) local_unnamed_addr {
 ; CHECK-LABEL: @dummy_caller
   %tmp = call i32 @outline_region_notlikely(i32* %arg)
-; CHECK:  call void @outline_region_notlikely.2_bb1
+; CHECK:  call void @outline_region_notlikely.2.bb1
   %tmp2 = tail call i32 @outline_region_likely(i32* %arg)
 ; CHECK: %tmp2 = tail call i32 @outline_region_likely(i32* %arg)
   ret i32 %tmp
 
 }
 
-; CHECK-LABEL: define internal void @outline_region_notlikely.2_bb1(i32* %arg) {
+; CHECK-LABEL: define internal void @outline_region_notlikely.2.bb1(i32* %arg) {
 ; CHECK-NEXT: newFuncRoot:
 
 !llvm.module.flags = !{!0}
diff --git a/test/Transforms/CodeExtractor/cost_meta.ll b/test/Transforms/CodeExtractor/cost_meta.ll
index 2e4467a8d0c..ca1690a4c9f 100644
--- a/test/Transforms/CodeExtractor/cost_meta.ll
+++ b/test/Transforms/CodeExtractor/cost_meta.ll
@@ -28,7 +28,7 @@ define i32 @dummy_caller(i32* %arg) local_unnamed_addr {
  }
 
 
-; CHECK-LABEL: define internal void @outline_region_notlikely.1_bb1(i32* %arg) {
+; CHECK-LABEL: define internal void @outline_region_notlikely.1.bb1(i32* %arg) {
 ; CHECK-NEXT: newFuncRoot:
 
 declare i32 @foo(i32 * %arg)
diff --git a/test/Transforms/CodeExtractor/inline_eh.ll b/test/Transforms/CodeExtractor/inline_eh.ll
index 4e0aa7a0d72..a69e0c30bb9 100644
--- a/test/Transforms/CodeExtractor/inline_eh.ll
+++ b/test/Transforms/CodeExtractor/inline_eh.ll
@@ -42,11 +42,11 @@ entry:
 ; CHECK: entry:
 ; CHECK-NEXT: br i1
 ; CHECK: codeRepl.i:
-; CHECK-NEXT: call void @callee.1_{{.*}}()
+; CHECK-NEXT: call void @callee.1.{{.*}}()
   call void @callee(i1 %cond)
   ret void
 }
 
-; CHECK-LABEL: define {{.*}} @callee.1_{{.*}}() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+; CHECK-LABEL: define {{.*}} @callee.1.{{.*}}() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
 ; CHECK: invoke void @bar()
 ; CHECK: landingpad
diff --git a/test/Transforms/CodeExtractor/inline_eh_1.ll b/test/Transforms/CodeExtractor/inline_eh_1.ll
index 31e35839644..b01abb6c1e8 100644
--- a/test/Transforms/CodeExtractor/inline_eh_1.ll
+++ b/test/Transforms/CodeExtractor/inline_eh_1.ll
@@ -42,12 +42,12 @@ entry:
 ; CHECK: entry:
 ; CHECK-NEXT: br i1
 ; CHECK: codeRepl.i:
-; CHECK-NEXT: call void @callee.1_{{.*}}()
+; CHECK-NEXT: call void @callee.1.{{.*}}()
   call void @callee(i1 %cond)
   ret void
 }
 
-; CHECK-LABEL: define {{.*}} @callee.1_{{.*}}() personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
+; CHECK-LABEL: define {{.*}} @callee.1.{{.*}}() personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
 ; CHECK: invoke void @bar()
 ; CHECK: cleanuppad
 ; CHECK-NEXT: cleanupret
diff --git a/test/Transforms/CodeExtractor/live_shrink.ll b/test/Transforms/CodeExtractor/live_shrink.ll
index c25ed2b622c..780ab480c4b 100644
--- a/test/Transforms/CodeExtractor/live_shrink.ll
+++ b/test/Transforms/CodeExtractor/live_shrink.ll
@@ -41,13 +41,13 @@ bb:
 ; CHECK-NOT: llvm.lifetime
 ; CHECK: br i1
 ; CHECK: codeRepl.i:
-; CHECK: call void @_Z3foov.1_
+; CHECK: call void @_Z3foov.1.
 
   tail call void @_Z3foov()
   ret void
 }
 
-; CHECK-LABEL: define internal void @_Z3foov.1_
+; CHECK-LABEL: define internal void @_Z3foov.1.
 ; CHECK: newFuncRoot:
 ; CHECK-NEXT:  %tmp = alloca %class.A
 ; CHECK-NEXT:  %tmp1 = bitcast %class.A* %tmp to i8*
diff --git a/test/Transforms/CodeExtractor/live_shrink_gep.ll b/test/Transforms/CodeExtractor/live_shrink_gep.ll
index ac6aa4fbda4..aed86f84b66 100644
--- a/test/Transforms/CodeExtractor/live_shrink_gep.ll
+++ b/test/Transforms/CodeExtractor/live_shrink_gep.ll
@@ -42,12 +42,12 @@ bb:
 ; CHECK-NOT: llvm.lifetime
 ; CHECK: br i1
 ; CHECK: codeRepl.i:
-; CHECK: call void @_Z3foov.1_
+; CHECK: call void @_Z3foov.1.
   tail call void @_Z3foov()
   ret void
 }
 
-; CHECK-LABEL: define internal void @_Z3foov.1_
+; CHECK-LABEL: define internal void @_Z3foov.1.
 ; CHECK: newFuncRoot:
 ; CHECK-NEXT:  %tmp = alloca %class.A
 ; CHECK-NEXT:  %tmp1 = getelementptr
diff --git a/test/Transforms/CodeExtractor/live_shrink_hoist.ll b/test/Transforms/CodeExtractor/live_shrink_hoist.ll
index 1f57146c941..13dab8d6b83 100644
--- a/test/Transforms/CodeExtractor/live_shrink_hoist.ll
+++ b/test/Transforms/CodeExtractor/live_shrink_hoist.ll
@@ -50,7 +50,7 @@ bb:
   ret void
 }
 
-; CHECK-LABEL: define internal void @_Z3foov.1_
+; CHECK-LABEL: define internal void @_Z3foov.1.
 ; CHECK: bb9:
 ; CHECK: call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %tmp1)
 ; CHECK:  br label %.exitStub
diff --git a/test/Transforms/CodeExtractor/live_shrink_multiple.ll b/test/Transforms/CodeExtractor/live_shrink_multiple.ll
index 8d9045c7267..9350ca2ef9c 100644
--- a/test/Transforms/CodeExtractor/live_shrink_multiple.ll
+++ b/test/Transforms/CodeExtractor/live_shrink_multiple.ll
@@ -42,7 +42,7 @@ bb:
   ret void
 }
 
-; CHECK-LABEL: define internal void @_Z3foov.1_
+; CHECK-LABEL: define internal void @_Z3foov.1.
 ; CHECK: newFuncRoot:
 ; CHECK-NEXT:  alloca 
 ; CHECK-NEXT:  bitcast 
diff --git a/test/Transforms/CodeExtractor/unreachable-block.ll b/test/Transforms/CodeExtractor/unreachable-block.ll
index 09f41f6bd2f..7ce65f529a6 100644
--- a/test/Transforms/CodeExtractor/unreachable-block.ll
+++ b/test/Transforms/CodeExtractor/unreachable-block.ll
@@ -1,12 +1,12 @@
 ; RUN: opt -S -partial-inliner %s | FileCheck %s
 
 ; CHECK-LABEL: define void @dipsy(
-; CHECK-NEXT:   call void @tinkywinky.1_ontrue()
+; CHECK-NEXT:   call void @tinkywinky.1.ontrue()
 ; CHECK-NEXT:   call void @patatuccio()
 ; CHECK-NEXT:   ret void
 ; CHECK-NEXT: }
 
-; CHECK-LABEL: define internal void @tinkywinky.1_ontrue() {
+; CHECK-LABEL: define internal void @tinkywinky.1.ontrue() {
 ; CHECK-NEXT: newFuncRoot:
 ; CHECK-NEXT:   br label %ontrue
 ; CHECK: onfalse{{.*}}:
diff --git a/test/Transforms/HotColdSplit/minsize.ll b/test/Transforms/HotColdSplit/minsize.ll
index f7509bf3c02..4865fb6d024 100644
--- a/test/Transforms/HotColdSplit/minsize.ll
+++ b/test/Transforms/HotColdSplit/minsize.ll
@@ -2,7 +2,7 @@
 
 ; CHECK-LABEL: @fun
 ; CHECK: codeRepl:
-; CHECK-NEXT: call void @fun_if.else
+; CHECK-NEXT: call void @fun.cold.1
 
 define void @fun() {
 entry:
@@ -28,5 +28,5 @@ cleanup:
   unreachable
 }
 
-; CHECK: define {{.*}} @fun_if.else{{.*}}#[[outlined_func_attr:[0-9]+]]
+; CHECK: define {{.*}} @fun.cold.1{{.*}}#[[outlined_func_attr:[0-9]+]]
 ; CHECK: attributes #[[outlined_func_attr]] = { {{.*}}minsize
diff --git a/test/Transforms/HotColdSplit/split-cold-2.ll b/test/Transforms/HotColdSplit/split-cold-2.ll
index de0c7655a6e..ac7d856608c 100644
--- a/test/Transforms/HotColdSplit/split-cold-2.ll
+++ b/test/Transforms/HotColdSplit/split-cold-2.ll
@@ -4,10 +4,10 @@
 ; Make sure this compiles. This test used to fail with an invalid phi node: the
 ; two predecessors were outlined and the SSA representation was invalid.
 
-; CHECK: remark: <unknown>:0:0: fun split cold code into fun_if.else
+; CHECK: remark: <unknown>:0:0: fun split cold code into fun.cold.1
 ; CHECK-LABEL: @fun
 ; CHECK: codeRepl:
-; CHECK-NEXT: call void @fun_if.else
+; CHECK-NEXT: call void @fun.cold.1
 
 define void @fun() {
 entry:
diff --git a/test/Transforms/HotColdSplit/split-out-dbg-val-of-arg.ll b/test/Transforms/HotColdSplit/split-out-dbg-val-of-arg.ll
index dcaff122442..459ee6712bc 100644
--- a/test/Transforms/HotColdSplit/split-out-dbg-val-of-arg.ll
+++ b/test/Transforms/HotColdSplit/split-out-dbg-val-of-arg.ll
@@ -1,6 +1,6 @@
 ; RUN: opt -hotcoldsplit -S < %s | FileCheck %s
 
-; CHECK-LABEL: define {{.*}}@foo_if.end
+; CHECK-LABEL: define {{.*}}@foo.cold
 ; CHECK-NOT: llvm.dbg.value
 
 define void @foo(i32 %arg1) !dbg !6 {
diff --git a/test/tools/llvm-extract/extract-block.ll b/test/tools/llvm-extract/extract-block.ll
index c812a567523..7cf0f160337 100644
--- a/test/tools/llvm-extract/extract-block.ll
+++ b/test/tools/llvm-extract/extract-block.ll
@@ -12,7 +12,7 @@ bb:
   ret void
 }
 
-; CHECK: @foo_bb4
+; CHECK: @foo.bb4
 ; CHECK: call void @bar()
 ; CHECK: %tmp5
 define i32 @foo(i32 %arg) {
diff --git a/test/tools/llvm-extract/extract-multiple-blocks.ll b/test/tools/llvm-extract/extract-multiple-blocks.ll
index a7f270bdcd6..343edff342f 100644
--- a/test/tools/llvm-extract/extract-multiple-blocks.ll
+++ b/test/tools/llvm-extract/extract-multiple-blocks.ll
@@ -1,7 +1,7 @@
 ; RUN: llvm-extract -S -bb foo:bb4 -bb foo:bb7 %s | FileCheck %s
 
-; CHECK: @foo_bb4
-; CHECK: @foo_bb7
+; CHECK: @foo.bb4
+; CHECK: @foo.bb7
 define i32 @foo(i32 %arg) {
 bb:
   %tmp = alloca i32, align 4
-- 
GitLab


From 3cc8b4921a68f89b7d110b4aed2a7a05eccbb3ce Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Wed, 24 Oct 2018 19:00:56 +0000
Subject: [PATCH 0517/1116] [TargetLowering] Add
 SimplifyDemandedBitsForTargetNode callback

Add a SimplifyDemandedBitsForTargetNode callback to handle target nodes.

Differential Revision: https://reviews.llvm.org/D53643

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345179 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/TargetLowering.h       | 13 ++++++++++-
 lib/CodeGen/SelectionDAG/TargetLowering.cpp | 24 +++++++++++++++++++++
 2 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/include/llvm/CodeGen/TargetLowering.h b/include/llvm/CodeGen/TargetLowering.h
index 585d07cf044..4b1fae89be5 100644
--- a/include/llvm/CodeGen/TargetLowering.h
+++ b/include/llvm/CodeGen/TargetLowering.h
@@ -2916,11 +2916,22 @@ public:
   /// elements, returning true on success. Otherwise, analyze the expression and
   /// return a mask of KnownUndef and KnownZero elements for the expression
   /// (used to simplify the caller). The KnownUndef/Zero elements may only be
-  /// accurate for those bits in the DemandedMask
+  /// accurate for those bits in the DemandedMask.
   virtual bool SimplifyDemandedVectorEltsForTargetNode(
       SDValue Op, const APInt &DemandedElts, APInt &KnownUndef,
       APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth = 0) const;
 
+  /// Attempt to simplify any target nodes based on the demanded bits,
+  /// returning true on success. Otherwise, analyze the
+  /// expression and return a mask of KnownOne and KnownZero bits for the
+  /// expression (used to simplify the caller).  The KnownZero/One bits may only
+  /// be accurate for those bits in the DemandedMask.
+  virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op,
+                                                 const APInt &DemandedBits,
+                                                 KnownBits &Known,
+                                                 TargetLoweringOpt &TLO,
+                                                 unsigned Depth = 0) const;
+
   /// If \p SNaN is false, \returns true if \p Op is known to never be any
   /// NaN. If \p sNaN is true, returns if \p Op is known to never be a signaling
   /// NaN.
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 017db41fa9e..0189a11fa1d 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1335,6 +1335,13 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
     LLVM_FALLTHROUGH;
   }
   default:
+    if (Op.getOpcode() >= ISD::BUILTIN_OP_END) {
+      if (SimplifyDemandedBitsForTargetNode(Op, DemandedBits, Known, TLO,
+                                            Depth))
+        return true;
+      break;
+    }
+
     // Just use computeKnownBits to compute output bits.
     TLO.DAG.computeKnownBits(Op, Known, Depth);
     break;
@@ -1803,6 +1810,23 @@ bool TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
   return false;
 }
 
+bool TargetLowering::SimplifyDemandedBitsForTargetNode(
+    SDValue Op, const APInt &DemandedBits, KnownBits &Known,
+    TargetLoweringOpt &TLO, unsigned Depth) const {
+  assert((Op.getOpcode() >= ISD::BUILTIN_OP_END ||
+          Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
+          Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
+          Op.getOpcode() == ISD::INTRINSIC_VOID) &&
+         "Should use SimplifyDemandedBits if you don't know whether Op"
+         " is a target node!");
+  EVT VT = Op.getValueType();
+  APInt DemandedElts = VT.isVector()
+                           ? APInt::getAllOnesValue(VT.getVectorNumElements())
+                           : APInt(1, 1);
+  computeKnownBitsForTargetNode(Op, Known, DemandedElts, TLO.DAG, Depth);
+  return false;
+}
+
 bool TargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
                                                   const SelectionDAG &DAG,
                                                   bool SNaN,
-- 
GitLab


From 24c5a02353e96587fe894f62f245612020bd35f7 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Wed, 24 Oct 2018 19:11:28 +0000
Subject: [PATCH 0518/1116] [X86][SSE] Add SimplifyDemandedBitsForTargetNode
 PMULDQ/PMULUDQ handling

Add X86 SimplifyDemandedBitsForTargetNode and use it to simplify PMULDQ/PMULUDQ target nodes.

This enables us to repeatedly simplify the node's arguments after the previous approach had to be reverted due to PR39398.

Differential Revision: https://reviews.llvm.org/D53643

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345182 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp          | 32 ++++++++--
 lib/Target/X86/X86ISelLowering.h            |  6 ++
 test/CodeGen/X86/combine-pmuldq.ll          | 30 ++-------
 test/CodeGen/X86/urem-seteq-vec-nonsplat.ll | 68 ++++++++++-----------
 4 files changed, 70 insertions(+), 66 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 44d0d711dd1..d86f9d5a220 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -31870,6 +31870,30 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
   return false;
 }
 
+bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
+    SDValue Op, const APInt &OriginalDemandedBits, KnownBits &Known,
+    TargetLoweringOpt &TLO, unsigned Depth) const {
+  unsigned Opc = Op.getOpcode();
+  switch(Opc) {
+  case X86ISD::PMULDQ:
+  case X86ISD::PMULUDQ: {
+    // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
+    KnownBits KnownOp;
+    SDValue LHS = Op.getOperand(0);
+    SDValue RHS = Op.getOperand(1);
+    APInt DemandedMask = OriginalDemandedBits & APInt::getLowBitsSet(64, 32);
+    if (SimplifyDemandedBits(LHS, DemandedMask, KnownOp, TLO, Depth + 1))
+      return true;
+    if (SimplifyDemandedBits(RHS, DemandedMask, KnownOp, TLO, Depth + 1))
+      return true;
+    break;
+  }
+  }
+
+  return TargetLowering::SimplifyDemandedBitsForTargetNode(
+      Op, OriginalDemandedBits, Known, TLO, Depth);
+}
+
 /// Check if a vector extract from a target-specific shuffle of a load can be
 /// folded into a single element load.
 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
@@ -40362,13 +40386,9 @@ static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
   if (ISD::isBuildVectorAllZeros(RHS.getNode()))
     return RHS;
 
+  // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  APInt DemandedMask(APInt::getLowBitsSet(64, 32));
-
-  // PMULQDQ/PMULUDQ only uses lower 32 bits from each vector element.
-  if (TLI.SimplifyDemandedBits(LHS, DemandedMask, DCI))
-    return SDValue(N, 0);
-  if (TLI.SimplifyDemandedBits(RHS, DemandedMask, DCI))
+  if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(64), DCI))
     return SDValue(N, 0);
 
   return SDValue();
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index eeef7579714..fea7ecbdbb4 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -874,6 +874,12 @@ namespace llvm {
                                                  TargetLoweringOpt &TLO,
                                                  unsigned Depth) const override;
 
+    bool SimplifyDemandedBitsForTargetNode(SDValue Op,
+                                           const APInt &DemandedBits,
+                                           KnownBits &Known,
+                                           TargetLoweringOpt &TLO,
+                                           unsigned Depth) const override;
+
     SDValue unwrapAddress(SDValue N) const override;
 
     bool isGAPlusOffset(SDNode *N, const GlobalValue* &GA,
diff --git a/test/CodeGen/X86/combine-pmuldq.ll b/test/CodeGen/X86/combine-pmuldq.ll
index edc6cb01d97..cd58947b186 100644
--- a/test/CodeGen/X86/combine-pmuldq.ll
+++ b/test/CodeGen/X86/combine-pmuldq.ll
@@ -47,26 +47,10 @@ define <2 x i64> @combine_shuffle_zero_pmuludq(<4 x i32> %a0, <4 x i32> %a1) {
 ; SSE-NEXT:    pmuludq %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX2-LABEL: combine_shuffle_zero_pmuludq:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    retq
-;
-; AVX512VL-LABEL: combine_shuffle_zero_pmuludq:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512VL-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQVL-LABEL: combine_shuffle_zero_pmuludq:
-; AVX512DQVL:       # %bb.0:
-; AVX512DQVL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512DQVL-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512DQVL-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT:    retq
+; AVX-LABEL: combine_shuffle_zero_pmuludq:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %1 = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
   %2 = shufflevector <4 x i32> %a1, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
   %3 = bitcast <4 x i32> %1 to <2 x i64>
@@ -84,22 +68,16 @@ define <4 x i64> @combine_shuffle_zero_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1)
 ;
 ; AVX2-LABEL: combine_shuffle_zero_pmuludq_256:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
 ; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VL-LABEL: combine_shuffle_zero_pmuludq_256:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
 ; AVX512VL-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512DQVL-LABEL: combine_shuffle_zero_pmuludq_256:
 ; AVX512DQVL:       # %bb.0:
-; AVX512DQVL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512DQVL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
 ; AVX512DQVL-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
 ; AVX512DQVL-NEXT:    retq
   %1 = shufflevector <8 x i32> %a0, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
diff --git a/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
index 9f339a8a555..82385386c88 100644
--- a/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
+++ b/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
@@ -143,31 +143,31 @@ define <4 x i32> @test_urem_odd_div(<4 x i32> %X) nounwind readnone {
 define <4 x i32> @test_urem_even_div(<4 x i32> %X) nounwind readnone {
 ; CHECK-SSE2-LABEL: test_urem_even_div:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,2,0]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2863311531,3435973837,2863311531,2454267027]
-; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2863311531,3435973837,2863311531,2454267027]
+; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
 ; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm3
 ; CHECK-SSE2-NEXT:    psrld $1, %xmm3
 ; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm4
 ; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT:    movdqa %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    psrld $2, %xmm2
-; CHECK-SSE2-NEXT:    psrld $3, %xmm1
-; CHECK-SSE2-NEXT:    movdqa %xmm1, %xmm3
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; CHECK-SSE2-NEXT:    psrld $2, %xmm1
+; CHECK-SSE2-NEXT:    psrld $3, %xmm2
+; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm3
+; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3]
 ; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [6,10,12,14]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
 ; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm5
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3]
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[1,2]
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,3,1]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm2[1,2]
+; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
+; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
 ; CHECK-SSE2-NEXT:    psubd %xmm1, %xmm0
 ; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
@@ -377,30 +377,30 @@ define <4 x i32> @test_urem_pow2(<4 x i32> %X) nounwind readnone {
 define <4 x i32> @test_urem_one(<4 x i32> %X) nounwind readnone {
 ; CHECK-SSE2-LABEL: test_urem_one:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,2,0]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2863311531,0,2863311531,2454267027]
-; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2863311531,0,2863311531,2454267027]
+; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
 ; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm3
 ; CHECK-SSE2-NEXT:    psrld $1, %xmm3
 ; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm4
 ; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT:    movdqa %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    psrld $2, %xmm2
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; CHECK-SSE2-NEXT:    psrld $2, %xmm1
 ; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm3
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[0,0]
-; CHECK-SSE2-NEXT:    psrld $3, %xmm1
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [6,1,12,14]
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm3
+; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0]
+; CHECK-SSE2-NEXT:    psrld $3, %xmm2
+; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[2,3]
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [6,1,12,14]
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm3
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
 ; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm4
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm2[3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[3,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
 ; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm1
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-- 
GitLab


From c76c43b05e92d0ab7a843e8bb99354dd3826a28f Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Wed, 24 Oct 2018 19:24:44 +0000
Subject: [PATCH 0519/1116] [X86] Explicitly list all KNL features of
 inheriting from IVB. NFC

I'm not sure all the microarchitectural tuning flags that have been added to IVBFeatures are relevant for KNL. Separating will allow us to see and audit them. There might even be some simplification opportunities in the Sandy Bridge through Icelake inheritance line without KNL using the same chain.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345183 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86.td | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 6bf6aae95c0..9a12a7237e4 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -808,7 +808,29 @@ class SkylakeClientProc<string Name> : ProcModel<Name, SkylakeClientModel,
 ]>;
 def : SkylakeClientProc<"skylake">;
 
-def KNLFeatures : ProcessorFeatures<IVBFeatures.Value, [
+def KNLFeatures : ProcessorFeatures<[], [
+  FeatureX87,
+  FeatureCMOV,
+  FeatureMMX,
+  FeatureFXSR,
+  FeatureNOPL,
+  Feature64Bit,
+  FeatureCMPXCHG16B,
+  FeaturePOPCNT,
+  FeatureSlowDivide64,
+  FeaturePCLMUL,
+  FeatureXSAVE,
+  FeatureXSAVEOPT,
+  FeatureLAHFSAHF,
+  FeatureSlow3OpsLEA,
+  FeatureFastScalarFSQRT,
+  FeatureFastSHLDRotate,
+  FeatureSlowIncDec,
+  FeatureMergeToThreeWayBranch,
+  FeatureMacroFusion,
+  FeatureRDRAND,
+  FeatureF16C,
+  FeatureFSGSBase,
   FeatureAVX512,
   FeatureERI,
   FeatureCDI,
-- 
GitLab


From e65789f9913de60abab8f5f2c7c6bcc5e11ba82c Mon Sep 17 00:00:00 2001
From: Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>
Date: Wed, 24 Oct 2018 19:37:45 +0000
Subject: [PATCH 0520/1116] [llvm-mca] Simplify the logic in FetchStage. NFCI

Only method 'getNextInstruction()' needs to interact with the SourceMgr.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345185 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-mca/include/Stages/FetchStage.h |  2 +-
 tools/llvm-mca/lib/Pipeline.cpp            |  5 ++--
 tools/llvm-mca/lib/Stages/FetchStage.cpp   | 33 +++++++++-------------
 3 files changed, 18 insertions(+), 22 deletions(-)

diff --git a/tools/llvm-mca/include/Stages/FetchStage.h b/tools/llvm-mca/include/Stages/FetchStage.h
index 10a89c94469..45e30e17b4d 100644
--- a/tools/llvm-mca/include/Stages/FetchStage.h
+++ b/tools/llvm-mca/include/Stages/FetchStage.h
@@ -24,7 +24,7 @@
 namespace mca {
 
 class FetchStage final : public Stage {
-  std::unique_ptr<Instruction> CurrentInstruction;
+  InstRef CurrentInstruction;
   using InstMap = std::map<unsigned, std::unique_ptr<Instruction>>;
   InstMap Instructions;
   InstrBuilder &IB;
diff --git a/tools/llvm-mca/lib/Pipeline.cpp b/tools/llvm-mca/lib/Pipeline.cpp
index 2d9aa6b2a31..ad49522ad79 100644
--- a/tools/llvm-mca/lib/Pipeline.cpp
+++ b/tools/llvm-mca/lib/Pipeline.cpp
@@ -39,13 +39,14 @@ bool Pipeline::hasWorkToProcess() {
 Error Pipeline::run() {
   assert(!Stages.empty() && "Unexpected empty pipeline found!");
 
-  while (hasWorkToProcess()) {
+  do {
     notifyCycleBegin();
     if (Error Err = runCycle())
       return Err;
     notifyCycleEnd();
     ++Cycles;
-  }
+  } while (hasWorkToProcess());
+
   return ErrorSuccess();
 }
 
diff --git a/tools/llvm-mca/lib/Stages/FetchStage.cpp b/tools/llvm-mca/lib/Stages/FetchStage.cpp
index 8bd0bd9e3a7..e607db9c8f0 100644
--- a/tools/llvm-mca/lib/Stages/FetchStage.cpp
+++ b/tools/llvm-mca/lib/Stages/FetchStage.cpp
@@ -18,20 +18,18 @@
 namespace mca {
 
 bool FetchStage::hasWorkToComplete() const {
-  return CurrentInstruction.get() || SM.hasNext();
+  return CurrentInstruction.isValid();
 }
 
 bool FetchStage::isAvailable(const InstRef & /* unused */) const {
-  if (!CurrentInstruction)
-    return false;
-  assert(SM.hasNext() && "Unexpected internal state!");
-  const SourceRef SR = SM.peekNext();
-  InstRef IR(SR.first, CurrentInstruction.get());
-  return checkNextStage(IR);
+  if (CurrentInstruction.isValid())
+    return checkNextStage(CurrentInstruction);
+  return false;
 }
 
 llvm::Error FetchStage::getNextInstruction() {
-  assert(!CurrentInstruction && "There is already an instruction to process!");
+  assert(!CurrentInstruction.isValid() &&
+         "There is already an instruction to process!");
   if (!SM.hasNext())
     return llvm::ErrorSuccess();
   const SourceRef SR = SM.peekNext();
@@ -39,28 +37,25 @@ llvm::Error FetchStage::getNextInstruction() {
       IB.createInstruction(SR.second);
   if (!InstOrErr)
     return InstOrErr.takeError();
-  CurrentInstruction = std::move(InstOrErr.get());
+  std::unique_ptr<Instruction> Inst = std::move(InstOrErr.get());
+  CurrentInstruction = InstRef(SR.first, Inst.get());
+  Instructions[SR.first] = std::move(Inst);
+  SM.updateNext();
   return llvm::ErrorSuccess();
 }
 
 llvm::Error FetchStage::execute(InstRef & /*unused */) {
-  assert(CurrentInstruction && "There is no instruction to process!");
-  const SourceRef SR = SM.peekNext();
-  InstRef IR(SR.first, CurrentInstruction.get());
-  assert(checkNextStage(IR) && "Invalid fetch!");
-
-  Instructions[IR.getSourceIndex()] = std::move(CurrentInstruction);
-  if (llvm::Error Val = moveToTheNextStage(IR))
+  assert(CurrentInstruction.isValid() && "There is no instruction to process!");
+  if (llvm::Error Val = moveToTheNextStage(CurrentInstruction))
     return Val;
 
-  SM.updateNext();
-
   // Move the program counter.
+  CurrentInstruction.invalidate();
   return getNextInstruction();
 }
 
 llvm::Error FetchStage::cycleStart() {
-  if (!CurrentInstruction)
+  if (!CurrentInstruction.isValid())
     return getNextInstruction();
   return llvm::ErrorSuccess();
 }
-- 
GitLab


From b0a35e2f85dd9fe2538d373bf5085cf031173d42 Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.menezes@samsung.com>
Date: Wed, 24 Oct 2018 20:03:20 +0000
Subject: [PATCH 0521/1116] [AArch64] Fix overlapping instructions

Fix overlapping instruction descriptions in the machine model for Exynos M3.
Effectively, NFC.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345186 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64SchedExynosM3.td | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/lib/Target/AArch64/AArch64SchedExynosM3.td b/lib/Target/AArch64/AArch64SchedExynosM3.td
index 5e5369a5a7f..56808c291b1 100644
--- a/lib/Target/AArch64/AArch64SchedExynosM3.td
+++ b/lib/Target/AArch64/AArch64SchedExynosM3.td
@@ -26,9 +26,6 @@ def ExynosM3Model : SchedMachineModel {
   let CompleteModel         =   1; // Use the default model otherwise.
 
   list<Predicate> UnsupportedFeatures = [HasSVE];
-
-  // FIXME: Remove when all errors have been fixed.
-  let FullInstRWOverlapCheck = 0;
 }
 
 //===----------------------------------------------------------------------===//
@@ -588,7 +585,7 @@ def : InstRW<[M3WriteSA,
 // ASIMD instructions.
 def : InstRW<[M3WriteNMSC3], (instregex "^[SU]ABAL?v")>;
 def : InstRW<[M3WriteNMSC1], (instregex "^[SU]ABDL?v")>;
-def : InstRW<[M3WriteNMSC1], (instregex "^(SQ)?(ABS|NEG)v")>;
+def : InstRW<[M3WriteNMSC1], (instregex "^((SQ)?ABS|SQNEG)v")>;
 def : InstRW<[M3WriteNALU1], (instregex "^(ADD|NEG|SUB)v")>;
 def : InstRW<[M3WriteNMSC3], (instregex "^[SU]?ADDL?Pv")>;
 def : InstRW<[M3WriteNMSC3], (instregex "^[SU]H(ADD|SUB)v")>;
@@ -597,7 +594,6 @@ def : InstRW<[M3WriteNMSC3], (instregex "^R?(ADD|SUB)HN2?v")>;
 def : InstRW<[M3WriteNMSC3], (instregex "^[SU]Q(ADD|SUB)v")>;
 def : InstRW<[M3WriteNMSC3], (instregex "^(SU|US)QADDv")>;
 def : InstRW<[M3WriteNMSC3], (instregex "^[SU]RHADDv")>;
-def : InstRW<[M3WriteNMSC3], (instregex "^[SU]?ADDL?Vv")>;
 def : InstRW<[M3WriteNMSC1], (instregex "^CM(EQ|GE|GT|HI|HS|LE|LT)v")>;
 def : InstRW<[M3WriteNALU1], (instregex "^CMTSTv")>;
 def : InstRW<[M3WriteNALU1], (instregex "^(AND|BIC|EOR|MVNI|NOT|ORN|ORR)v")>;
-- 
GitLab


From 07660eab6905bb4acfa03f1f8d6091c2701d111a Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.menezes@samsung.com>
Date: Wed, 24 Oct 2018 20:03:24 +0000
Subject: [PATCH 0522/1116] [AArch64] Refactor Exynos machine model (NFC)

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345187 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64InstrInfo.cpp    |  4 +-
 lib/Target/AArch64/AArch64InstrInfo.h      |  2 +-
 lib/Target/AArch64/AArch64SchedExynosM1.td | 30 +++++++-------
 lib/Target/AArch64/AArch64SchedExynosM3.td | 48 +++++++++++-----------
 4 files changed, 42 insertions(+), 42 deletions(-)

diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index 2452d6a0298..bbd734a542c 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -696,7 +696,7 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
   // Secondly, check cases specific to sub-targets.
 
   if (Subtarget.hasExynosCheapAsMoveHandling()) {
-    if (isExynosResetFast(MI) || isExynosShiftLeftFast(MI))
+    if (isExynosResetFast(MI) || isExynosShiftExtFast(MI))
       return true;
     else
       return MI.isAsCheapAsAMove();
@@ -821,7 +821,7 @@ bool AArch64InstrInfo::isExynosResetFast(const MachineInstr &MI) const {
   }
 }
 
-bool AArch64InstrInfo::isExynosShiftLeftFast(const MachineInstr &MI) const {
+bool AArch64InstrInfo::isExynosShiftExtFast(const MachineInstr &MI) const {
   unsigned Imm, Shift;
   AArch64_AM::ShiftExtendType Ext;
 
diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h
index 11882e238b7..49ea9ad0fda 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/lib/Target/AArch64/AArch64InstrInfo.h
@@ -255,7 +255,7 @@ public:
   bool isExynosResetFast(const MachineInstr &MI) const;
   /// Returns true if the instruction has a shift left that can be executed
   /// more efficiently.
-  bool isExynosShiftLeftFast(const MachineInstr &MI) const;
+  bool isExynosShiftExtFast(const MachineInstr &MI) const;
   /// Returns true if the instruction has a shift by immediate that can be
   /// executed in one cycle less.
   bool isFalkorShiftExtFast(const MachineInstr &MI) const;
diff --git a/lib/Target/AArch64/AArch64SchedExynosM1.td b/lib/Target/AArch64/AArch64SchedExynosM1.td
index ecc68aed155..c19ec45dab4 100644
--- a/lib/Target/AArch64/AArch64SchedExynosM1.td
+++ b/lib/Target/AArch64/AArch64SchedExynosM1.td
@@ -64,9 +64,9 @@ def M1UnitNALU : ProcResGroup<[M1UnitNAL0,
 //===----------------------------------------------------------------------===//
 // Predicates.
 
-def M1BranchLinkFastPred : SchedPredicate<[{MI->getOpcode() == AArch64::BLR &&
-                                            MI->getOperand(0).getReg() != AArch64::LR}]>;
-def M1ShiftLeftFastPred  : SchedPredicate<[{TII->isExynosShiftLeftFast(*MI)}]>;
+def M1BranchLinkPred : SchedPredicate<[{MI->getOpcode() == AArch64::BLR &&
+                                        MI->getOperand(0).getReg() != AArch64::LR}]>;
+def M1ShiftExtPred   : SchedPredicate<[{TII->isExynosShiftExtFast(*MI)}]>;
 
 //===----------------------------------------------------------------------===//
 // Coarse scheduling model.
@@ -85,14 +85,14 @@ def M1WriteAC : SchedWriteRes<[M1UnitALU,
 def M1WriteAD : SchedWriteRes<[M1UnitALU,
                                M1UnitC]>   { let Latency = 2;
                                              let NumMicroOps = 2; }
-def M1WriteAX : SchedWriteVariant<[SchedVar<M1ShiftLeftFastPred, [M1WriteA1]>,
-                                   SchedVar<NoSchedPred,         [M1WriteAA]>]>;
+def M1WriteAX : SchedWriteVariant<[SchedVar<M1ShiftExtPred, [M1WriteA1]>,
+                                   SchedVar<NoSchedPred,    [M1WriteAA]>]>;
 def M1WriteC1 : SchedWriteRes<[M1UnitC]>   { let Latency = 1; }
 def M1WriteC2 : SchedWriteRes<[M1UnitC]>   { let Latency = 2; }
 
 def M1WriteB1 : SchedWriteRes<[M1UnitB]> { let Latency = 1; }
-def M1WriteBX : SchedWriteVariant<[SchedVar<M1BranchLinkFastPred, [M1WriteAB]>,
-                                   SchedVar<NoSchedPred,          [M1WriteAC]>]>;
+def M1WriteBX : SchedWriteVariant<[SchedVar<M1BranchLinkPred, [M1WriteAB]>,
+                                   SchedVar<NoSchedPred,      [M1WriteAC]>]>;
 
 def M1WriteL5 : SchedWriteRes<[M1UnitL]> { let Latency = 5; }
 def M1WriteL6 : SchedWriteRes<[M1UnitL]> { let Latency = 6; }
@@ -110,10 +110,10 @@ def M1WriteLD : SchedWriteRes<[M1UnitL,
                                            let ResourceCycles = [2, 1]; }
 def M1WriteLH : SchedWriteRes<[]>        { let Latency = 5;
                                            let NumMicroOps = 0; }
-def M1WriteLX : SchedWriteVariant<[SchedVar<M1ShiftLeftFastPred, [M1WriteL5]>,
-                                   SchedVar<NoSchedPred,         [M1WriteLC]>]>;
-def M1WriteLY : SchedWriteVariant<[SchedVar<M1ShiftLeftFastPred, [M1WriteL5]>,
-                                   SchedVar<NoSchedPred,         [M1WriteLD]>]>;
+def M1WriteLX : SchedWriteVariant<[SchedVar<M1ShiftExtPred, [M1WriteL5]>,
+                                   SchedVar<NoSchedPred,    [M1WriteLC]>]>;
+def M1WriteLY : SchedWriteVariant<[SchedVar<M1ShiftExtPred, [M1WriteL5]>,
+                                   SchedVar<NoSchedPred,    [M1WriteLD]>]>;
 
 def M1WriteS1 : SchedWriteRes<[M1UnitS]>   { let Latency = 1; }
 def M1WriteS3 : SchedWriteRes<[M1UnitS]>   { let Latency = 3; }
@@ -140,10 +140,10 @@ def M1WriteSD : SchedWriteRes<[M1UnitS,
 def M1WriteSE : SchedWriteRes<[M1UnitS,
                                M1UnitA]>   { let Latency = 2;
                                              let NumMicroOps = 2; }
-def M1WriteSX : SchedWriteVariant<[SchedVar<M1ShiftLeftFastPred, [M1WriteS1]>,
-                                   SchedVar<NoSchedPred,         [M1WriteSE]>]>;
-def M1WriteSY : SchedWriteVariant<[SchedVar<M1ShiftLeftFastPred, [M1WriteS1]>,
-                                   SchedVar<NoSchedPred,         [M1WriteSB]>]>;
+def M1WriteSX : SchedWriteVariant<[SchedVar<M1ShiftExtPred, [M1WriteS1]>,
+                                   SchedVar<NoSchedPred,    [M1WriteSE]>]>;
+def M1WriteSY : SchedWriteVariant<[SchedVar<M1ShiftExtPred, [M1WriteS1]>,
+                                   SchedVar<NoSchedPred,    [M1WriteSB]>]>;
 
 def M1ReadAdrBase : SchedReadVariant<[SchedVar<ScaledIdxPred, [ReadDefault]>,
                                       SchedVar<NoSchedPred,   [ReadDefault]>]>;
diff --git a/lib/Target/AArch64/AArch64SchedExynosM3.td b/lib/Target/AArch64/AArch64SchedExynosM3.td
index 56808c291b1..7b3ab72ccd0 100644
--- a/lib/Target/AArch64/AArch64SchedExynosM3.td
+++ b/lib/Target/AArch64/AArch64SchedExynosM3.td
@@ -106,15 +106,15 @@ def M3UnitNSHF : ProcResGroup<[M3UnitNSHF0,
 //===----------------------------------------------------------------------===//
 // Predicates.
 
-def M3BranchLinkFastPred  : SchedPredicate<[{MI->getOpcode() == AArch64::BLR &&
-                                             MI->getOperand(0).isReg() &&
-                                             MI->getOperand(0).getReg() != AArch64::LR}]>;
-def M3ResetFastPred       : SchedPredicate<[{TII->isExynosResetFast(*MI)}]>;
-def M3RotateRightFastPred : SchedPredicate<[{(MI->getOpcode() == AArch64::EXTRWrri ||
-                                              MI->getOpcode() == AArch64::EXTRXrri) &&
-                                             MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
-                                             MI->getOperand(1).getReg() == MI->getOperand(2).getReg()}]>;
-def M3ShiftLeftFastPred   : SchedPredicate<[{TII->isExynosShiftLeftFast(*MI)}]>;
+def M3BranchLinkPred : SchedPredicate<[{MI->getOpcode() == AArch64::BLR &&
+                                        MI->getOperand(0).isReg() &&
+                                        MI->getOperand(0).getReg() != AArch64::LR}]>;
+def M3ResetPred      : SchedPredicate<[{TII->isExynosResetFast(*MI)}]>;
+def M3RotatePred     : SchedPredicate<[{(MI->getOpcode() == AArch64::EXTRWrri ||
+                                         MI->getOpcode() == AArch64::EXTRXrri) &&
+                                        MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
+                                        MI->getOperand(1).getReg() == MI->getOperand(2).getReg()}]>;
+def M3ShiftExtPred   : SchedPredicate<[{TII->isExynosShiftExtFast(*MI)}]>;
 
 //===----------------------------------------------------------------------===//
 // Coarse scheduling model.
@@ -137,15 +137,15 @@ def M3WriteAD : SchedWriteRes<[M3UnitALU,
                                              let NumMicroOps = 2; }
 def M3WriteC1 : SchedWriteRes<[M3UnitC]>   { let Latency = 1; }
 def M3WriteC2 : SchedWriteRes<[M3UnitC]>   { let Latency = 2; }
-def M3WriteAX : SchedWriteVariant<[SchedVar<M3ResetFastPred,     [M3WriteZ0]>,
-                                   SchedVar<M3ShiftLeftFastPred, [M3WriteA1]>,
-                                   SchedVar<NoSchedPred,         [M3WriteAA]>]>;
-def M3WriteAY : SchedWriteVariant<[SchedVar<M3RotateRightFastPred, [M3WriteA1]>,
-                                   SchedVar<NoSchedPred,           [M3WriteAA]>]>;
+def M3WriteAX : SchedWriteVariant<[SchedVar<M3ResetPred,    [M3WriteZ0]>,
+                                   SchedVar<M3ShiftExtPred, [M3WriteA1]>,
+                                   SchedVar<NoSchedPred,    [M3WriteAA]>]>;
+def M3WriteAY : SchedWriteVariant<[SchedVar<M3RotatePred, [M3WriteA1]>,
+                                   SchedVar<NoSchedPred,  [M3WriteAA]>]>;
 
 def M3WriteB1 : SchedWriteRes<[M3UnitB]> { let Latency = 1; }
-def M3WriteBX : SchedWriteVariant<[SchedVar<M3BranchLinkFastPred, [M3WriteAB]>,
-                                   SchedVar<NoSchedPred,          [M3WriteAC]>]>;
+def M3WriteBX : SchedWriteVariant<[SchedVar<M3BranchLinkPred, [M3WriteAB]>,
+                                   SchedVar<NoSchedPred,      [M3WriteAC]>]>;
 
 def M3WriteL4 : SchedWriteRes<[M3UnitL]> { let Latency = 4; }
 def M3WriteL5 : SchedWriteRes<[M3UnitL]> { let Latency = 5; }
@@ -165,8 +165,8 @@ def M3WriteLD : SchedWriteRes<[M3UnitA,
 def M3WriteLH : SchedWriteRes<[]>        { let Latency = 5;
                                            let NumMicroOps = 0; }
 
-def M3WriteLX : SchedWriteVariant<[SchedVar<M3ShiftLeftFastPred, [M3WriteL5]>,
-                                   SchedVar<NoSchedPred,         [M3WriteLB]>]>;
+def M3WriteLX : SchedWriteVariant<[SchedVar<M3ShiftExtPred, [M3WriteL5]>,
+                                   SchedVar<NoSchedPred,    [M3WriteLB]>]>;
 
 def M3WriteS1 : SchedWriteRes<[M3UnitS]>   { let Latency = 1; }
 def M3WriteSA : SchedWriteRes<[M3UnitA,
@@ -180,10 +180,10 @@ def M3WriteSC : SchedWriteRes<[M3UnitA,
                                M3UnitS]>   { let Latency = 2;
                                              let NumMicroOps = 2; }
 
-def M3WriteSX : SchedWriteVariant<[SchedVar<M3ShiftLeftFastPred, [M3WriteS1]>,
-                                   SchedVar<NoSchedPred,         [M3WriteSB]>]>;
-def M3WriteSY : SchedWriteVariant<[SchedVar<M3ShiftLeftFastPred, [M3WriteS1]>,
-                                   SchedVar<NoSchedPred,         [M3WriteSC]>]>;
+def M3WriteSX : SchedWriteVariant<[SchedVar<M3ShiftExtPred, [M3WriteS1]>,
+                                   SchedVar<NoSchedPred,    [M3WriteSB]>]>;
+def M3WriteSY : SchedWriteVariant<[SchedVar<M3ShiftExtPred, [M3WriteS1]>,
+                                   SchedVar<NoSchedPred,    [M3WriteSC]>]>;
 
 def M3ReadAdrBase : SchedReadVariant<[SchedVar<ScaledIdxPred, [ReadDefault]>,
                                       SchedVar<NoSchedPred,   [ReadDefault]>]>;
@@ -481,8 +481,8 @@ def M3WriteAES     : SchedWriteRes<[M3UnitNCRY]>  { let Latency = 1; }
 def M3ReadAES      : SchedReadAdvance<1, [M3WriteAES]>;
 def M3ReadFMAC     : SchedReadAdvance<1, [M3WriteFMAC4,
                                           M3WriteFMAC5]>;
-def M3WriteMOVI    : SchedWriteVariant<[SchedVar<M3ResetFastPred, [M3WriteZ0]>,
-                                        SchedVar<NoSchedPred,     [M3WriteNALU1]>]>;
+def M3WriteMOVI    : SchedWriteVariant<[SchedVar<M3ResetPred, [M3WriteZ0]>,
+                                        SchedVar<NoSchedPred, [M3WriteNALU1]>]>;
 def M3ReadNMUL     : SchedReadAdvance<1, [M3WriteNMUL3]>;
 
 // Branch instructions
-- 
GitLab


From 55134c631c6cfe9ff844bcd0d419a91437cbda77 Mon Sep 17 00:00:00 2001
From: Tim Northover <tnorthover@apple.com>
Date: Wed, 24 Oct 2018 20:19:09 +0000
Subject: [PATCH 0523/1116] AArch64: add a pass to compress jump-table entries
 when possible.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345188 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/IR/IntrinsicsAArch64.td          |   6 +
 lib/Target/AArch64/AArch64.h                  |   2 +
 lib/Target/AArch64/AArch64.td                 |  13 +-
 lib/Target/AArch64/AArch64AsmPrinter.cpp      | 132 ++++++++++++++
 .../AArch64/AArch64CompressJumpTables.cpp     | 162 ++++++++++++++++++
 lib/Target/AArch64/AArch64ISelLowering.cpp    |  20 ++-
 lib/Target/AArch64/AArch64ISelLowering.h      |   1 +
 lib/Target/AArch64/AArch64InstrInfo.cpp       |   8 +
 lib/Target/AArch64/AArch64InstrInfo.td        |  24 +++
 .../AArch64/AArch64MachineFunctionInfo.h      |  15 ++
 lib/Target/AArch64/AArch64Subtarget.h         |   2 +
 lib/Target/AArch64/AArch64TargetMachine.cpp   |   8 +
 lib/Target/AArch64/CMakeLists.txt             |   1 +
 test/CodeGen/AArch64/O3-pipeline.ll           |   1 +
 test/CodeGen/AArch64/jump-table-compress.mir  | 111 ++++++++++++
 test/CodeGen/AArch64/jump-table-exynos.ll     |  67 ++++++++
 test/CodeGen/AArch64/jump-table.ll            | 156 +++++++++++++----
 test/CodeGen/AArch64/min-jump-table.ll        |   6 +-
 18 files changed, 693 insertions(+), 42 deletions(-)
 create mode 100644 lib/Target/AArch64/AArch64CompressJumpTables.cpp
 create mode 100644 test/CodeGen/AArch64/jump-table-compress.mir
 create mode 100644 test/CodeGen/AArch64/jump-table-exynos.ll

diff --git a/include/llvm/IR/IntrinsicsAArch64.td b/include/llvm/IR/IntrinsicsAArch64.td
index 688e863c1af..5f86ee7cdb4 100644
--- a/include/llvm/IR/IntrinsicsAArch64.td
+++ b/include/llvm/IR/IntrinsicsAArch64.td
@@ -44,6 +44,12 @@ def int_aarch64_dmb : GCCBuiltin<"__builtin_arm_dmb">, MSBuiltin<"__dmb">, Intri
 def int_aarch64_dsb : GCCBuiltin<"__builtin_arm_dsb">, MSBuiltin<"__dsb">, Intrinsic<[], [llvm_i32_ty]>;
 def int_aarch64_isb : GCCBuiltin<"__builtin_arm_isb">, MSBuiltin<"__isb">, Intrinsic<[], [llvm_i32_ty]>;
 
+// A space-consuming intrinsic primarily for testing block and jump table
+// placements. The first argument is the number of bytes this "instruction"
+// takes up, the second and return value are essentially chains, used to force
+// ordering during ISel.
+def int_aarch64_space : Intrinsic<[llvm_i64_ty], [llvm_i32_ty, llvm_i64_ty], []>;
+
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h
index 6472dcd5157..2f0d0bf346d 100644
--- a/lib/Target/AArch64/AArch64.h
+++ b/lib/Target/AArch64/AArch64.h
@@ -32,6 +32,7 @@ class MachineFunctionPass;
 FunctionPass *createAArch64DeadRegisterDefinitions();
 FunctionPass *createAArch64RedundantCopyEliminationPass();
 FunctionPass *createAArch64CondBrTuning();
+FunctionPass *createAArch64CompressJumpTablesPass();
 FunctionPass *createAArch64ConditionalCompares();
 FunctionPass *createAArch64AdvSIMDScalar();
 FunctionPass *createAArch64ISelDag(AArch64TargetMachine &TM,
@@ -62,6 +63,7 @@ void initializeAArch64AdvSIMDScalarPass(PassRegistry&);
 void initializeAArch64BranchTargetsPass(PassRegistry&);
 void initializeAArch64CollectLOHPass(PassRegistry&);
 void initializeAArch64CondBrTuningPass(PassRegistry &);
+void initializeAArch64CompressJumpTablesPass(PassRegistry&);
 void initializeAArch64ConditionalComparesPass(PassRegistry&);
 void initializeAArch64ConditionOptimizerPass(PassRegistry&);
 void initializeAArch64DeadRegisterDefinitionsPass(PassRegistry&);
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td
index 368898fd1e6..de78ca5b257 100644
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@@ -180,6 +180,10 @@ def FeatureDisableLatencySchedHeuristic : SubtargetFeature<
     "disable-latency-sched-heuristic", "DisableLatencySchedHeuristic", "true",
     "Disable latency scheduling heuristic">;
 
+def FeatureForce32BitJumpTables
+   : SubtargetFeature<"force-32bit-jump-tables", "Force32BitJumpTables", "true",
+                      "Force jump table entries to be 32-bits wide except at MinSize">;
+
 def FeatureRCPC : SubtargetFeature<"rcpc", "HasRCPC", "true",
                                    "Enable support for RCPC extension">;
 
@@ -411,7 +415,8 @@ def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1",
                                      FeaturePostRAScheduler,
                                      FeatureSlowMisaligned128Store,
                                      FeatureUseRSqrt,
-                                     FeatureZCZeroingFP]>;
+                                     FeatureZCZeroingFP,
+                                     FeatureForce32BitJumpTables]>;
 
 def ProcExynosM2 : SubtargetFeature<"exynosm2", "ARMProcFamily", "ExynosM1",
                                     "Samsung Exynos-M2 processors",
@@ -425,7 +430,8 @@ def ProcExynosM2 : SubtargetFeature<"exynosm2", "ARMProcFamily", "ExynosM1",
                                      FeaturePerfMon,
                                      FeaturePostRAScheduler,
                                      FeatureSlowMisaligned128Store,
-                                     FeatureZCZeroingFP]>;
+                                     FeatureZCZeroingFP,
+                                     FeatureForce32BitJumpTables]>;
 
 def ProcExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
                                     "Samsung Exynos-M3 processors",
@@ -442,7 +448,8 @@ def ProcExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
                                      FeaturePerfMon,
                                      FeaturePostRAScheduler,
                                      FeaturePredictableSelectIsExpensive,
-                                     FeatureZCZeroingFP]>;
+                                     FeatureZCZeroingFP,
+                                     FeatureForce32BitJumpTables]>;
 
 def ProcKryo    : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
                                    "Qualcomm Kryo processors", [
diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 23b6a65555a..b1375c969d9 100644
--- a/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -31,6 +31,8 @@
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/StackMaps.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
@@ -77,6 +79,12 @@ public:
     return MCInstLowering.lowerOperand(MO, MCOp);
   }
 
+  void EmitJumpTableInfo() override;
+  void emitJumpTableEntry(const MachineJumpTableInfo *MJTI,
+                          const MachineBasicBlock *MBB, unsigned JTI);
+
+  void LowerJumpTableDestSmall(MCStreamer &OutStreamer, const MachineInstr &MI);
+
   void LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
                      const MachineInstr &MI);
   void LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
@@ -433,6 +441,104 @@ void AArch64AsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
   printOperand(MI, NOps - 2, OS);
 }
 
+void AArch64AsmPrinter::EmitJumpTableInfo() {
+  const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+  if (!MJTI) return;
+
+  const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+  if (JT.empty()) return;
+
+  const TargetLoweringObjectFile &TLOF = getObjFileLowering();
+  MCSection *ReadOnlySec = TLOF.getSectionForJumpTable(MF->getFunction(), TM);
+  OutStreamer->SwitchSection(ReadOnlySec);
+
+  auto AFI = MF->getInfo<AArch64FunctionInfo>();
+  for (unsigned JTI = 0, e = JT.size(); JTI != e; ++JTI) {
+    const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
+
+    // If this jump table was deleted, ignore it.
+    if (JTBBs.empty()) continue;
+
+    unsigned Size = AFI->getJumpTableEntrySize(JTI);
+    EmitAlignment(Log2_32(Size));
+    OutStreamer->EmitLabel(GetJTISymbol(JTI));
+
+    for (auto *JTBB : JTBBs)
+      emitJumpTableEntry(MJTI, JTBB, JTI);
+  }
+}
+
+void AArch64AsmPrinter::emitJumpTableEntry(const MachineJumpTableInfo *MJTI,
+                                           const MachineBasicBlock *MBB,
+                                           unsigned JTI) {
+  const MCExpr *Value = MCSymbolRefExpr::create(MBB->getSymbol(), OutContext);
+  auto AFI = MF->getInfo<AArch64FunctionInfo>();
+  unsigned Size = AFI->getJumpTableEntrySize(JTI);
+
+  if (Size == 4) {
+    // .word LBB - LJTI
+    const TargetLowering *TLI = MF->getSubtarget().getTargetLowering();
+    const MCExpr *Base = TLI->getPICJumpTableRelocBaseExpr(MF, JTI, OutContext);
+    Value = MCBinaryExpr::createSub(Value, Base, OutContext);
+  } else {
+    // .byte (LBB - LBB) >> 2 (or .hword)
+    const MCSymbol *BaseSym = AFI->getJumpTableEntryPCRelSymbol(JTI);
+    const MCExpr *Base = MCSymbolRefExpr::create(BaseSym, OutContext);
+    Value = MCBinaryExpr::createSub(Value, Base, OutContext);
+    Value = MCBinaryExpr::createLShr(
+        Value, MCConstantExpr::create(2, OutContext), OutContext);
+  }
+
+  OutStreamer->EmitValue(Value, Size);
+}
+
+/// Small jump tables contain an unsigned byte or half, representing the offset
+/// from the lowest-addressed possible destination to the desired basic
+/// block. Since all instructions are 4-byte aligned, this is further compressed
+/// by counting in instructions rather than bytes (i.e. divided by 4). So, to
+/// materialize the correct destination we need:
+///
+///             adr xDest, .LBB0_0
+///             ldrb wScratch, [xTable, xEntry]   (with "lsl #1" for ldrh).
+///             add xDest, xDest, xScratch, lsl #2
+void AArch64AsmPrinter::LowerJumpTableDestSmall(llvm::MCStreamer &OutStreamer,
+                                                const llvm::MachineInstr &MI) {
+  unsigned DestReg = MI.getOperand(0).getReg();
+  unsigned ScratchReg = MI.getOperand(1).getReg();
+  unsigned ScratchRegW =
+      STI->getRegisterInfo()->getSubReg(ScratchReg, AArch64::sub_32);
+  unsigned TableReg = MI.getOperand(2).getReg();
+  unsigned EntryReg = MI.getOperand(3).getReg();
+  int JTIdx = MI.getOperand(4).getIndex();
+  bool IsByteEntry = MI.getOpcode() == AArch64::JumpTableDest8;
+
+  // This has to be first because the compression pass based its reachability
+  // calculations on the start of the JumpTableDest instruction.
+  auto Label =
+      MF->getInfo<AArch64FunctionInfo>()->getJumpTableEntryPCRelSymbol(JTIdx);
+  EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::ADR)
+                                  .addReg(DestReg)
+                                  .addExpr(MCSymbolRefExpr::create(
+                                      Label, MF->getContext())));
+
+  // Load the number of instruction-steps to offset from the label.
+  unsigned LdrOpcode = IsByteEntry ? AArch64::LDRBBroX : AArch64::LDRHHroX;
+  EmitToStreamer(OutStreamer, MCInstBuilder(LdrOpcode)
+                                  .addReg(ScratchRegW)
+                                  .addReg(TableReg)
+                                  .addReg(EntryReg)
+                                  .addImm(0)
+                                  .addImm(IsByteEntry ? 0 : 1));
+
+  // Multiply the steps by 4 and add to the already materialized base label
+  // address.
+  EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::ADDXrs)
+                                  .addReg(DestReg)
+                                  .addReg(DestReg)
+                                  .addReg(ScratchReg)
+                                  .addImm(2));
+}
+
 void AArch64AsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
                                       const MachineInstr &MI) {
   unsigned NumNOPBytes = StackMapOpers(&MI).getNumPatchBytes();
@@ -662,6 +768,32 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
     return;
   }
 
+  case AArch64::JumpTableDest32: {
+    // We want:
+    //     ldrsw xScratch, [xTable, xEntry, lsl #2]
+    //     add xDest, xTable, xScratch
+    unsigned DestReg = MI->getOperand(0).getReg(),
+             ScratchReg = MI->getOperand(1).getReg(),
+             TableReg = MI->getOperand(2).getReg(),
+             EntryReg = MI->getOperand(3).getReg();
+    EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::LDRSWroX)
+                                     .addReg(ScratchReg)
+                                     .addReg(TableReg)
+                                     .addReg(EntryReg)
+                                     .addImm(0)
+                                     .addImm(1));
+    EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::ADDXrs)
+                                     .addReg(DestReg)
+                                     .addReg(TableReg)
+                                     .addReg(ScratchReg)
+                                     .addImm(0));
+    return;
+  }
+  case AArch64::JumpTableDest16:
+  case AArch64::JumpTableDest8:
+    LowerJumpTableDestSmall(*OutStreamer, *MI);
+    return;
+
   case AArch64::FMOVH0:
   case AArch64::FMOVS0:
   case AArch64::FMOVD0:
diff --git a/lib/Target/AArch64/AArch64CompressJumpTables.cpp b/lib/Target/AArch64/AArch64CompressJumpTables.cpp
new file mode 100644
index 00000000000..0924a27e258
--- /dev/null
+++ b/lib/Target/AArch64/AArch64CompressJumpTables.cpp
@@ -0,0 +1,162 @@
+//==-- AArch64CompressJumpTables.cpp - Compress jump tables for AArch64 --====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+// This pass looks at the basic blocks each jump-table refers to and works out
+// whether they can be emitted in a compressed form (with 8 or 16-bit
+// entries). If so, it changes the opcode and flags them in the associated
+// AArch64FunctionInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-jump-tables"
+
+STATISTIC(NumJT8, "Number of jump-tables with 1-byte entries");
+STATISTIC(NumJT16, "Number of jump-tables with 2-byte entries");
+STATISTIC(NumJT32, "Number of jump-tables with 4-byte entries");
+
+namespace {
+class AArch64CompressJumpTables : public MachineFunctionPass {
+  const TargetInstrInfo *TII;
+  MachineFunction *MF;
+  SmallVector<int, 8> BlockInfo;
+
+  int computeBlockSize(MachineBasicBlock &MBB);
+  void scanFunction();
+
+  bool compressJumpTable(MachineInstr &MI, int Offset);
+
+public:
+  static char ID;
+  AArch64CompressJumpTables() : MachineFunctionPass(ID) {
+    initializeAArch64CompressJumpTablesPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::NoVRegs);
+  }
+  StringRef getPassName() const override {
+    return "AArch64 Compress Jump Tables";
+  }
+};
+char AArch64CompressJumpTables::ID = 0;
+}
+
+INITIALIZE_PASS(AArch64CompressJumpTables, DEBUG_TYPE,
+                "AArch64 compress jump tables pass", false, false)
+
+int AArch64CompressJumpTables::computeBlockSize(MachineBasicBlock &MBB) {
+  int Size = 0;
+  for (const MachineInstr &MI : MBB)
+    Size += TII->getInstSizeInBytes(MI);
+  return Size;
+}
+
+void AArch64CompressJumpTables::scanFunction() {
+  BlockInfo.clear();
+  BlockInfo.resize(MF->getNumBlockIDs());
+
+  int Offset = 0;
+  for (MachineBasicBlock &MBB : *MF) {
+    BlockInfo[MBB.getNumber()] = Offset;
+    Offset += computeBlockSize(MBB);
+  }
+}
+
+bool AArch64CompressJumpTables::compressJumpTable(MachineInstr &MI,
+                                                  int Offset) {
+  if (MI.getOpcode() != AArch64::JumpTableDest32)
+    return false;
+
+  int JTIdx = MI.getOperand(4).getIndex();
+  auto &JTInfo = *MF->getJumpTableInfo();
+  const MachineJumpTableEntry &JT = JTInfo.getJumpTables()[JTIdx];
+
+  // The jump-table might have been optimized away.
+  if (JT.MBBs.empty())
+    return false;
+
+  int MaxOffset = std::numeric_limits<int>::min(),
+      MinOffset = std::numeric_limits<int>::max();
+  MachineBasicBlock *MinBlock = nullptr;
+  for (auto Block : JT.MBBs) {
+    int BlockOffset = BlockInfo[Block->getNumber()];
+    assert(BlockOffset % 4 == 0 && "misaligned basic block");
+
+    MaxOffset = std::max(MaxOffset, BlockOffset);
+    if (BlockOffset <= MinOffset) {
+      MinOffset = BlockOffset;
+      MinBlock = Block;
+    }
+  }
+
+  // The ADR instruction needed to calculate the address of the first reachable
+  // basic block can address +/-1MB.
+  if (!isInt<21>(MinOffset - Offset)) {
+    ++NumJT32;
+    return false;
+  }
+
+  int Span = MaxOffset - MinOffset;
+  auto AFI = MF->getInfo<AArch64FunctionInfo>();
+  if (isUInt<8>(Span / 4)) {
+    AFI->setJumpTableEntryInfo(JTIdx, 1, MinBlock->getSymbol());
+    MI.setDesc(TII->get(AArch64::JumpTableDest8));
+    ++NumJT8;
+    return true;
+  } else if (isUInt<16>(Span / 4)) {
+    AFI->setJumpTableEntryInfo(JTIdx, 2, MinBlock->getSymbol());
+    MI.setDesc(TII->get(AArch64::JumpTableDest16));
+    ++NumJT16;
+    return true;
+  }
+
+  ++NumJT32;
+  return false;
+}
+
+bool AArch64CompressJumpTables::runOnMachineFunction(MachineFunction &MFIn) {
+  bool Changed = false;
+  MF = &MFIn;
+
+  const auto &ST = MF->getSubtarget<AArch64Subtarget>();
+  TII = ST.getInstrInfo();
+
+  if (ST.force32BitJumpTables() && !MF->getFunction().optForMinSize())
+    return false;
+
+  scanFunction();
+
+  for (MachineBasicBlock &MBB : *MF) {
+    int Offset = BlockInfo[MBB.getNumber()];
+    for (MachineInstr &MI : MBB) {
+      Changed |= compressJumpTable(MI, Offset);
+      Offset += TII->getInstSizeInBytes(MI);
+    }
+  }
+
+  return Changed;
+}
+
+FunctionPass *llvm::createAArch64CompressJumpTablesPass() {
+  return new AArch64CompressJumpTables();
+}
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index fea1531540f..c8227cd139a 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -187,7 +187,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
-  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+  setOperationAction(ISD::BR_JT, MVT::Other, Custom);
   setOperationAction(ISD::JumpTable, MVT::i64, Custom);
 
   setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
@@ -2825,6 +2825,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
     return LowerSELECT_CC(Op, DAG);
   case ISD::JumpTable:
     return LowerJumpTable(Op, DAG);
+  case ISD::BR_JT:
+    return LowerBR_JT(Op, DAG);
   case ISD::ConstantPool:
     return LowerConstantPool(Op, DAG);
   case ISD::BlockAddress:
@@ -4902,6 +4904,22 @@ SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
   return getAddr(JT, DAG);
 }
 
+SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
+                                          SelectionDAG &DAG) const {
+  // Jump table entries as PC relative offsets. No additional tweaking
+  // is necessary here. Just get the address of the jump table.
+  SDLoc DL(Op);
+  SDValue JT = Op.getOperand(1);
+  SDValue Entry = Op.getOperand(2);
+  int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
+
+  SDNode *Dest =
+      DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
+                         Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
+  return DAG.getNode(ISD::BRIND, DL, MVT::Other, Op.getOperand(0),
+                     SDValue(Dest, 0));
+}
+
 SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
                                                  SelectionDAG &DAG) const {
   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h
index 94df7e4c39d..3e89de665a7 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@@ -607,6 +607,7 @@ private:
                          SDValue TVal, SDValue FVal, const SDLoc &dl,
                          SelectionDAG &DAG) const;
   SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index bbd734a542c..e6474046534 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -108,6 +108,14 @@ unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
     // This gets lowered to an instruction sequence which takes 16 bytes
     NumBytes = 16;
     break;
+  case AArch64::JumpTableDest32:
+  case AArch64::JumpTableDest16:
+  case AArch64::JumpTableDest8:
+    NumBytes = 12;
+    break;
+  case AArch64::SPACE:
+    NumBytes = MI.getOperand(1).getImm();
+    break;
   }
 
   return NumBytes;
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index 1d9e3d0b812..24f6aaaab57 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -448,6 +448,30 @@ def : Pat<(AArch64LOADgot texternalsym:$addr),
 def : Pat<(AArch64LOADgot tconstpool:$addr),
           (LOADgot tconstpool:$addr)>;
 
+// 32-bit jump table destination is actually only 2 instructions since we can
+// use the table itself as a PC-relative base. But optimization occurs after
+// branch relaxation so be pessimistic.
+let Size = 12, Constraints = "@earlyclobber $dst,@earlyclobber $scratch" in {
+def JumpTableDest32 : Pseudo<(outs GPR64:$dst, GPR64sp:$scratch),
+                             (ins GPR64:$table, GPR64:$entry, i32imm:$jti), []>,
+                      Sched<[]>;
+def JumpTableDest16 : Pseudo<(outs GPR64:$dst, GPR64sp:$scratch),
+                             (ins GPR64:$table, GPR64:$entry, i32imm:$jti), []>,
+                      Sched<[]>;
+def JumpTableDest8 : Pseudo<(outs GPR64:$dst, GPR64sp:$scratch),
+                            (ins GPR64:$table, GPR64:$entry, i32imm:$jti), []>,
+                     Sched<[]>;
+}
+
+// Space-consuming pseudo to aid testing of placement and reachability
+// algorithms. Immediate operand is the number of bytes this "instruction"
+// occupies; register operands can be used to enforce dependency and constrain
+// the scheduler.
+let hasSideEffects = 1, mayLoad = 1, mayStore = 1 in
+def SPACE : Pseudo<(outs GPR64:$Rd), (ins i32imm:$size, GPR64:$Rn),
+                   [(set GPR64:$Rd, (int_aarch64_space imm:$size, GPR64:$Rn))]>,
+            Sched<[]>;
+
 //===----------------------------------------------------------------------===//
 // System instructions.
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index e42214d1569..63c0ba2811e 100644
--- a/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -162,6 +162,19 @@ public:
   unsigned getVarArgsFPRSize() const { return VarArgsFPRSize; }
   void setVarArgsFPRSize(unsigned Size) { VarArgsFPRSize = Size; }
 
+  unsigned getJumpTableEntrySize(int Idx) const {
+    auto It = JumpTableEntryInfo.find(Idx);
+    if (It != JumpTableEntryInfo.end())
+      return It->second.first;
+    return 4;
+  }
+  MCSymbol *getJumpTableEntryPCRelSymbol(int Idx) const {
+    return JumpTableEntryInfo.find(Idx)->second.second;
+  }
+  void setJumpTableEntryInfo(int Idx, unsigned Size, MCSymbol *PCRelSym) {
+    JumpTableEntryInfo[Idx] = std::make_pair(Size, PCRelSym);
+  }
+
   using SetOfInstructions = SmallPtrSet<const MachineInstr *, 16>;
 
   const SetOfInstructions &getLOHRelated() const { return LOHRelated; }
@@ -200,6 +213,8 @@ private:
   // Hold the lists of LOHs.
   MILOHContainer LOHContainerSet;
   SetOfInstructions LOHRelated;
+
+  DenseMap<int, std::pair<unsigned, MCSymbol *>> JumpTableEntryInfo;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h
index abe1980740e..8bf7c165408 100644
--- a/lib/Target/AArch64/AArch64Subtarget.h
+++ b/lib/Target/AArch64/AArch64Subtarget.h
@@ -142,6 +142,7 @@ protected:
   bool HasFuseLiterals = false;
   bool DisableLatencySchedHeuristic = false;
   bool UseRSqrt = false;
+  bool Force32BitJumpTables = false;
   uint8_t MaxInterleaveFactor = 2;
   uint8_t VectorInsertExtractBaseCost = 3;
   uint16_t CacheLineSize = 0;
@@ -292,6 +293,7 @@ public:
   }
 
   bool useRSqrt() const { return UseRSqrt; }
+  bool force32BitJumpTables() const { return Force32BitJumpTables; }
   unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
   unsigned getVectorInsertExtractBaseCost() const {
     return VectorInsertExtractBaseCost;
diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp
index e183288d8df..fe2eea65ffe 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -123,6 +123,10 @@ static cl::opt<bool>
     BranchRelaxation("aarch64-enable-branch-relax", cl::Hidden, cl::init(true),
                      cl::desc("Relax out of range conditional branches"));
 
+static cl::opt<bool> EnableCompressJumpTables(
+    "aarch64-enable-compress-jump-tables", cl::Hidden, cl::init(true),
+    cl::desc("Use smallest entry possible for jump tables"));
+
 // FIXME: Unify control over GlobalMerge.
 static cl::opt<cl::boolOrDefault>
     EnableGlobalMerge("aarch64-enable-global-merge", cl::Hidden,
@@ -158,6 +162,7 @@ extern "C" void LLVMInitializeAArch64Target() {
   initializeAArch64AdvSIMDScalarPass(*PR);
   initializeAArch64BranchTargetsPass(*PR);
   initializeAArch64CollectLOHPass(*PR);
+  initializeAArch64CompressJumpTablesPass(*PR);
   initializeAArch64ConditionalComparesPass(*PR);
   initializeAArch64ConditionOptimizerPass(*PR);
   initializeAArch64DeadRegisterDefinitionsPass(*PR);
@@ -546,6 +551,9 @@ void AArch64PassConfig::addPreEmitPass() {
   if (EnableBranchTargets)
     addPass(createAArch64BranchTargetsPass());
 
+  if (TM->getOptLevel() != CodeGenOpt::None && EnableCompressJumpTables)
+    addPass(createAArch64CompressJumpTablesPass());
+
   if (TM->getOptLevel() != CodeGenOpt::None && EnableCollectLOH &&
       TM->getTargetTriple().isOSBinFormatMachO())
     addPass(createAArch64CollectLOHPass());
diff --git a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt
index c57ebeb854c..58190686c79 100644
--- a/lib/Target/AArch64/CMakeLists.txt
+++ b/lib/Target/AArch64/CMakeLists.txt
@@ -34,6 +34,7 @@ add_llvm_target(AArch64CodeGen
   AArch64FastISel.cpp
   AArch64A53Fix835769.cpp
   AArch64FrameLowering.cpp
+  AArch64CompressJumpTables.cpp
   AArch64ConditionOptimizer.cpp
   AArch64RedundantCopyElimination.cpp
   AArch64ISelDAGToDAG.cpp
diff --git a/test/CodeGen/AArch64/O3-pipeline.ll b/test/CodeGen/AArch64/O3-pipeline.ll
index 33bc05f91d5..dc2316987d3 100644
--- a/test/CodeGen/AArch64/O3-pipeline.ll
+++ b/test/CodeGen/AArch64/O3-pipeline.ll
@@ -151,6 +151,7 @@
 ; CHECK-NEXT:       Branch Probability Basic Block Placement
 ; CHECK-NEXT:       Branch relaxation pass
 ; CHECK-NEXT:       AArch64 Branch Targets
+; CHECK-NEXT:       AArch64 Compress Jump Tables
 ; CHECK-NEXT:       Contiguously Lay Out Funclets
 ; CHECK-NEXT:       StackMap Liveness Analysis
 ; CHECK-NEXT:       Live DEBUG_VALUE analysis
diff --git a/test/CodeGen/AArch64/jump-table-compress.mir b/test/CodeGen/AArch64/jump-table-compress.mir
new file mode 100644
index 00000000000..b4217ea6168
--- /dev/null
+++ b/test/CodeGen/AArch64/jump-table-compress.mir
@@ -0,0 +1,111 @@
+# RUN: llc -mtriple=aarch64-linux-gnu %s -run-pass=aarch64-jump-tables -o - | FileCheck %s
+--- |
+  define i32 @test_jumptable(i32 %in) {
+    unreachable
+  }
+
+...
+---
+name:            test_jumptable
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+liveins:
+  - { reg: '$w0' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+jumpTable:
+  kind:            block-address
+  entries:
+    - id:              0
+      blocks:          [ '%bb.2', '%bb.3' ]
+    - id:              1
+      blocks:          [ '%bb.4', '%bb.5' ]
+    - id:              2
+      blocks:          [ '%bb.7' ]
+    - id:              3
+      blocks:          [ '%bb.9' ]
+    - id:              4
+      blocks:          [ '%bb.9' ]
+    - id:              5
+      blocks:          [ '%bb.11' ]
+body:             |
+  bb.0 (%ir-block.0):
+
+  bb.1 (%ir-block.0):
+    ; CHECK-LABEL: body:
+    ; CHECK-LABEL: bb.1
+    ; CHECK: JumpTableDest8
+    liveins: $x8
+    early-clobber $x10, dead early-clobber $x11 = JumpTableDest32 undef killed $x9, undef killed $x8, %jump-table.0
+    BR killed $x10
+
+  bb.2:
+    ; Last destination is 4 * 255 = 1020 bytes after first. Byte is OK.
+    dead $xzr = SPACE 1020, undef $xzr
+
+  bb.3:
+    ; CHECK-LABEL: bb.3
+    ; CHECK: JumpTableDest16
+    early-clobber $x10, dead early-clobber $x11 = JumpTableDest32 undef killed $x9, undef killed $x8, %jump-table.1
+    BR killed $x10
+
+  bb.4:
+    ; Last destination is 4 * 256 = 1024 bytes after first. Half needed.
+    dead $xzr = SPACE 1024, undef $xzr
+
+  bb.5:
+    ; CHECK-LABEL: bb.5
+    ; CHECK: JumpTableDest8
+    early-clobber $x10, dead early-clobber $x11 = JumpTableDest32 undef killed $x9, undef killed $x8, %jump-table.2
+    BR killed $x10
+
+  bb.6:
+    ; First destination is (2^20 - 4) after reference. Just reachable by ADR so can use compressed table.
+    dead $xzr = SPACE 1048556, undef $xzr
+
+  bb.7:
+    ; CHECK-LABEL: bb.7
+    ; CHECK: JumpTableDest32
+    early-clobber $x10, dead early-clobber $x11 = JumpTableDest32 undef killed $x9, undef killed $x8, %jump-table.3
+    BR killed $x10
+
+  bb.8:
+    ; First destination is 2^20 after reference. Compressed table cannot reach it.
+    dead $xzr = SPACE 1048560, undef $xzr
+
+  bb.9:
+    ; First destination is 2^20 before reference. Just within reach of ADR.
+    dead $xzr = SPACE 1048576, undef $xzr
+
+  bb.10:
+    ; CHECK-LABEL: bb.10
+    ; CHECK: JumpTableDest8
+    early-clobber $x10, dead early-clobber $x11 = JumpTableDest32 undef killed $x9, undef killed $x8, %jump-table.4
+    BR killed $x10
+
+  bb.11:
+    ; First destination is 2^20 before reference. Just within reach of ADR.
+    dead $xzr = SPACE 1048580, undef $xzr
+
+  bb.12:
+    ; CHECK-LABEL: bb.12
+    ; CHECK: JumpTableDest32
+    early-clobber $x10, dead early-clobber $x11 = JumpTableDest32 undef killed $x9, undef killed $x8, %jump-table.5
+    BR killed $x10
+...
diff --git a/test/CodeGen/AArch64/jump-table-exynos.ll b/test/CodeGen/AArch64/jump-table-exynos.ll
new file mode 100644
index 00000000000..e018410792e
--- /dev/null
+++ b/test/CodeGen/AArch64/jump-table-exynos.ll
@@ -0,0 +1,67 @@
+; RUN: llc -o - %s -mtriple=aarch64-none-linux-gnu -mattr=+force-32bit-jump-tables -aarch64-enable-atomic-cfg-tidy=0 | FileCheck %s
+; RUN: llc -o - %s -mtriple=aarch64-none-linux-gnu -mcpu=exynos-m1 -aarch64-enable-atomic-cfg-tidy=0 | FileCheck %s
+; RUN: llc -o - %s -mtriple=aarch64-none-linux-gnu -mcpu=exynos-m2 -aarch64-enable-atomic-cfg-tidy=0 | FileCheck %s
+; RUN: llc -o - %s -mtriple=aarch64-none-linux-gnu -mcpu=exynos-m3 -aarch64-enable-atomic-cfg-tidy=0 | FileCheck %s
+
+; Exynos doesn't want jump tables to be compressed for now.
+
+define i32 @test_jumptable(i32 %in)  {
+  switch i32 %in, label %def [
+    i32 0, label %lbl1
+    i32 1, label %lbl2
+    i32 2, label %lbl3
+    i32 4, label %lbl4
+  ]
+; CHECK-LABEL: test_jumptable:
+; CHECK-NOT: ldrb
+
+def:
+  ret i32 0
+
+lbl1:
+  ret i32 1
+
+lbl2:
+  ret i32 2
+
+lbl3:
+  ret i32 4
+
+lbl4:
+  ret i32 8
+
+}
+
+define i32 @test_jumptable_minsize(i32 %in) minsize {
+  switch i32 %in, label %def [
+    i32 0, label %lbl1
+    i32 1, label %lbl2
+    i32 2, label %lbl3
+    i32 4, label %lbl4
+  ]
+; CHECK-LABEL: test_jumptable_minsize:
+; CHECK:     adrp [[JTPAGE:x[0-9]+]], .LJTI1_0
+; CHECK:     add x[[JT:[0-9]+]], [[JTPAGE]], {{#?}}:lo12:.LJTI1_0
+; CHECK:     adr [[PCBASE:x[0-9]+]], [[JTBASE:.LBB[0-9]+_[0-9]+]]
+; CHECK:     ldrb w[[OFFSET:[0-9]+]], [x[[JT]], {{x[0-9]+}}]
+; CHECK:     add [[DEST:x[0-9]+]], [[PCBASE]], x[[OFFSET]], lsl #2
+; CHECK:     br [[DEST]]
+
+
+
+def:
+  ret i32 0
+
+lbl1:
+  ret i32 1
+
+lbl2:
+  ret i32 2
+
+lbl3:
+  ret i32 4
+
+lbl4:
+  ret i32 8
+
+}
diff --git a/test/CodeGen/AArch64/jump-table.ll b/test/CodeGen/AArch64/jump-table.ll
index 098b90f94b9..4e70e92beaf 100644
--- a/test/CodeGen/AArch64/jump-table.ll
+++ b/test/CodeGen/AArch64/jump-table.ll
@@ -1,7 +1,7 @@
-; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 | FileCheck %s
-; RUN: llc -code-model=large -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 | FileCheck --check-prefix=CHECK-LARGE %s
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -relocation-model=pic -aarch64-enable-atomic-cfg-tidy=0 -o - %s | FileCheck --check-prefix=CHECK-PIC %s
-; RUN: llc -code-model=tiny -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 | FileCheck --check-prefix=CHECK-TINY %s
+; RUN: llc -no-integrated-as -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 | FileCheck %s
+; RUN: llc -no-integrated-as -code-model=large -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 | FileCheck --check-prefix=CHECK-LARGE %s
+; RUN: llc -no-integrated-as -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -relocation-model=pic -aarch64-enable-atomic-cfg-tidy=0 -o - %s | FileCheck --check-prefix=CHECK-PIC %s
+; RUN: llc -no-integrated-as -code-model=tiny -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 | FileCheck --check-prefix=CHECK-TINY %s
 
 define i32 @test_jumptable(i32 %in) {
 ; CHECK: test_jumptable
@@ -12,27 +12,45 @@ define i32 @test_jumptable(i32 %in) {
     i32 2, label %lbl3
     i32 4, label %lbl4
   ]
-; CHECK: adrp [[JTPAGE:x[0-9]+]], .LJTI0_0
-; CHECK: add x[[JT:[0-9]+]], [[JTPAGE]], {{#?}}:lo12:.LJTI0_0
-; CHECK: ldr [[DEST:x[0-9]+]], [x[[JT]], {{x[0-9]+}}, lsl #3]
-; CHECK: br [[DEST]]
-
-; CHECK-LARGE: movz x[[JTADDR:[0-9]+]], #:abs_g0_nc:.LJTI0_0
-; CHECK-LARGE: movk x[[JTADDR]], #:abs_g1_nc:.LJTI0_0
-; CHECK-LARGE: movk x[[JTADDR]], #:abs_g2_nc:.LJTI0_0
-; CHECK-LARGE: movk x[[JTADDR]], #:abs_g3:.LJTI0_0
-; CHECK-LARGE: ldr [[DEST:x[0-9]+]], [x[[JTADDR]], {{x[0-9]+}}, lsl #3]
-; CHECK-LARGE: br [[DEST]]
-
-; CHECK-PIC: adrp [[JTPAGE:x[0-9]+]], .LJTI0_0
-; CHECK-PIC: add x[[JT:[0-9]+]], [[JTPAGE]], {{#?}}:lo12:.LJTI0_0
-; CHECK-PIC: ldrsw [[DEST:x[0-9]+]], [x[[JT]], {{x[0-9]+}}, lsl #2]
-; CHECK-PIC: add [[TABLE:x[0-9]+]], [[DEST]], x[[JT]]
-; CHECK-PIC: br [[TABLE]]
-
-; CHECK-TINY: adr x[[JT:[0-9]+]], .LJTI0_0
-; CHECK-TINY: ldr [[DEST:x[0-9]+]], [x[[JT]], {{x[0-9]+}}, lsl #3]
-; CHECK-TINY: br [[DEST]]
+; CHECK-LABEL: test_jumptable:
+; CHECK:     adrp [[JTPAGE:x[0-9]+]], .LJTI0_0
+; CHECK:     add x[[JT:[0-9]+]], [[JTPAGE]], {{#?}}:lo12:.LJTI0_0
+; CHECK:     adr [[PCBASE:x[0-9]+]], [[JTBASE:.LBB[0-9]+_[0-9]+]]
+; CHECK:     ldrb w[[OFFSET:[0-9]+]], [x[[JT]], {{x[0-9]+}}]
+; CHECK:     add [[DEST:x[0-9]+]], [[PCBASE]], x[[OFFSET]], lsl #2
+; CHECK:     br [[DEST]]
+
+; CHECK-LARGE:     movz x[[JTADDR:[0-9]+]], #:abs_g0_nc:.LJTI0_0
+; CHECK-LARGE:     movk x[[JTADDR]], #:abs_g1_nc:.LJTI0_0
+; CHECK-LARGE:     movk x[[JTADDR]], #:abs_g2_nc:.LJTI0_0
+; CHECK-LARGE:     movk x[[JTADDR]], #:abs_g3:.LJTI0_0
+; CHECK-LARGE:     adr [[PCBASE:x[0-9]+]], [[JTBASE:.LBB[0-9]+_[0-9]+]]
+; CHECK-LARGE:     ldrb w[[OFFSET:[0-9]+]], [x[[JTADDR]], {{x[0-9]+}}]
+; CHECK-LARGE:     add [[DEST:x[0-9]+]], [[PCBASE]], x[[OFFSET]], lsl #2
+; CHECK-LARGE:     br [[DEST]]
+
+; CHECK-PIC-LABEL: test_jumptable:
+; CHECK-PIC:     adrp [[JTPAGE:x[0-9]+]], .LJTI0_0
+; CHECK-PIC:     add x[[JT:[0-9]+]], [[JTPAGE]], {{#?}}:lo12:.LJTI0_0
+; CHECK-PIC:     adr [[PCBASE:x[0-9]+]], [[JTBASE:.LBB[0-9]+_[0-9]+]]
+; CHECK-PIC:     ldrb w[[OFFSET:[0-9]+]], [x[[JT]], {{x[0-9]+}}]
+; CHECK-PIC:     add [[DEST:x[0-9]+]], [[PCBASE]], x[[OFFSET]], lsl #2
+; CHECK-PIC:     br [[DEST]]
+
+; CHECK-IOS:     adrp [[JTPAGE:x[0-9]+]], LJTI0_0@PAGE
+; CHECK-IOS:     add x[[JT:[0-9]+]], [[JTPAGE]], LJTI0_0@PAGEOFF
+; CHECK-IOS:     adr [[PCBASE:x[0-9]+]], [[JTBASE:LBB[0-9]+_[0-9]+]]
+; CHECK-IOS:     ldrb w[[OFFSET:[0-9]+]], [x[[JT]], {{x[0-9]+}}]
+; CHECK-IOS:     add [[DEST:x[0-9]+]], [[PCBASE]], x[[OFFSET]], lsl #2
+; CHECK-IOS: br [[DEST]]
+
+; CHECK-TINY-LABEL: test_jumptable:
+; CHECK-TINY:     adr x[[JT:[0-9]+]], .LJTI0_0
+; CHECK-TINY:     adr [[PCBASE:x[0-9]+]], [[JTBASE:.LBB[0-9]+_[0-9]+]]
+; CHECK-TINY:     ldrb w[[OFFSET:[0-9]+]], [x[[JT]], {{x[0-9]+}}]
+; CHECK-TINY:     add [[DEST:x[0-9]+]], [[PCBASE]], x[[OFFSET]], lsl #2
+; CHECK-TINY:     br [[DEST]]
+
 
 def:
   ret i32 0
@@ -54,18 +72,86 @@ lbl4:
 ; CHECK: .rodata
 
 ; CHECK: .LJTI0_0:
-; CHECK-NEXT: .xword
-; CHECK-NEXT: .xword
-; CHECK-NEXT: .xword
-; CHECK-NEXT: .xword
-; CHECK-NEXT: .xword
+; CHECK-NEXT: .byte ([[JTBASE]]-[[JTBASE]])>>2
+; CHECK-NEXT: .byte (.LBB{{.*}}-[[JTBASE]])>>2
+; CHECK-NEXT: .byte (.LBB{{.*}}-[[JTBASE]])>>2
+; CHECK-NEXT: .byte (.LBB{{.*}}-[[JTBASE]])>>2
+; CHECK-NEXT: .byte (.LBB{{.*}}-[[JTBASE]])>>2
+
+define i32 @test_jumptable16(i32 %in) {
+
+  switch i32 %in, label %def [
+    i32 0, label %lbl1
+    i32 1, label %lbl2
+    i32 2, label %lbl3
+    i32 4, label %lbl4
+  ]
+; CHECK-LABEL: test_jumptable16:
+; CHECK:     adrp [[JTPAGE:x[0-9]+]], .LJTI1_0
+; CHECK:     add x[[JT:[0-9]+]], [[JTPAGE]], {{#?}}:lo12:.LJTI1_0
+; CHECK:     adr [[PCBASE:x[0-9]+]], [[JTBASE:.LBB[0-9]+_[0-9]+]]
+; CHECK:     ldrh w[[OFFSET:[0-9]+]], [x[[JT]], {{x[0-9]+}}, lsl #1]
+; CHECK:     add [[DEST:x[0-9]+]], [[PCBASE]], x[[OFFSET]], lsl #2
+; CHECK:     br [[DEST]]
+
+def:
+  ret i32 0
+
+lbl1:
+  ret i32 1
+
+lbl2:
+  call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""()
+  call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""()
+  call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""()
+  call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""()
+  call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""()
+  call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""()
+  call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""()
+  call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""()
+  call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""()
+  call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""()
+  call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""()
+  call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""()
+  call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""()
+  call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""()
+  call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""()
+  call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""()
+  ret i32 2
+
+lbl3:
+  ret i32 4
+
+lbl4:
+  ret i32 8
+
+}
+
+; CHECK:      .rodata
+; CHECK:      .p2align 1
+; CHECK: .LJTI1_0:
+; CHECK-NEXT: .hword ([[JTBASE]]-[[JTBASE]])>>2
+; CHECK-NEXT: .hword (.LBB{{.*}}-[[JTBASE]])>>2
+; CHECK-NEXT: .hword (.LBB{{.*}}-[[JTBASE]])>>2
+; CHECK-NEXT: .hword (.LBB{{.*}}-[[JTBASE]])>>2
+; CHECK-NEXT: .hword (.LBB{{.*}}-[[JTBASE]])>>2
 
 ; CHECK-PIC-NOT: .data_region
 ; CHECK-PIC-NOT: .LJTI0_0
 ; CHECK-PIC: .LJTI0_0:
-; CHECK-PIC-NEXT: .word .LBB{{.*}}-.LJTI0_0
-; CHECK-PIC-NEXT: .word .LBB{{.*}}-.LJTI0_0
-; CHECK-PIC-NEXT: .word .LBB{{.*}}-.LJTI0_0
-; CHECK-PIC-NEXT: .word .LBB{{.*}}-.LJTI0_0
-; CHECK-PIC-NEXT: .word .LBB{{.*}}-.LJTI0_0
+; CHECK-PIC-NEXT: .byte ([[JTBASE]]-[[JTBASE]])>>2
+; CHECK-PIC-NEXT: .byte (.LBB{{.*}}-[[JTBASE]])>>2
+; CHECK-PIC-NEXT: .byte (.LBB{{.*}}-[[JTBASE]])>>2
+; CHECK-PIC-NEXT: .byte (.LBB{{.*}}-[[JTBASE]])>>2
+; CHECK-PIC-NEXT: .byte (.LBB{{.*}}-[[JTBASE]])>>2
 ; CHECK-PIC-NOT: .end_data_region
+
+; CHECK-IOS: .section __TEXT,__const
+; CHECK-IOS-NOT: .data_region
+; CHECK-IOS: LJTI0_0:
+; CHECK-IOS-NEXT:     .byte ([[JTBASE]]-[[JTBASE]])>>2
+; CHECK-IOS-NEXT:     .byte (LBB{{.*}}-[[JTBASE]])>>2
+; CHECK-IOS-NEXT:     .byte (LBB{{.*}}-[[JTBASE]])>>2
+; CHECK-IOS-NEXT:     .byte (LBB{{.*}}-[[JTBASE]])>>2
+; CHECK-IOS-NEXT:     .byte (LBB{{.*}}-[[JTBASE]])>>2
+; CHECK-IOS-NOT: .end_data_region
diff --git a/test/CodeGen/AArch64/min-jump-table.ll b/test/CodeGen/AArch64/min-jump-table.ll
index b22e683ebfe..7d6d26259af 100644
--- a/test/CodeGen/AArch64/min-jump-table.ll
+++ b/test/CodeGen/AArch64/min-jump-table.ll
@@ -14,8 +14,8 @@ entry:
 ; CHECK0-NEXT: Jump Tables:
 ; CHECK0-NEXT: %jump-table.0:
 ; CHECK0-NOT: %jump-table.1:
-; CHECK4-NOT: Jump Tables:
-; CHECK8-NOT: Jump Tables:
+; CHECK4-NOT: {{^}}Jump Tables:
+; CHECK8-NOT: {{^}}Jump Tables:
 
 bb1: tail call void @ext(i32 0) br label %return
 bb2: tail call void @ext(i32 2) br label %return
@@ -38,7 +38,7 @@ entry:
 ; CHECK4-NEXT: Jump Tables:
 ; CHECK4-NEXT: %jump-table.0:
 ; CHECK4-NOT: %jump-table.1:
-; CHECK8-NOT: Jump Tables:
+; CHECK8-NOT: {{^}}Jump Tables:
 
 bb1: tail call void @ext(i32 0) br label %return
 bb2: tail call void @ext(i32 2) br label %return
-- 
GitLab


From 3435941b05c2674c84501436b89633302922aac5 Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Wed, 24 Oct 2018 20:23:57 +0000
Subject: [PATCH 0524/1116] [MC] Separate masm integer literal lexer support
 from inline asm

Summary:
This renames the IsParsingMSInlineAsm member variable of AsmLexer to
LexMasmIntegers and moves it up to MCAsmLexer. This is the only behavior
controlled by that variable. I added a public setter, so that it can be
set from outside or from the llvm-mc command line. We may need to
arrange things so that users can get this behavior from clang, but
that's future work.

I also put additional hex literal lexing functionality under this flag
to fix PR32973. It appears that this hex literal parsing wasn't intended
to be enabled in non-masm-style blocks.

Now, masm integers (0b1101 and 0ABCh) work in __asm blocks from clang,
but 0b label references work when using .intel_syntax in standalone .s
files.

However, 0b label references will *not* work from __asm blocks in clang.
They will work from GCC inline asm blocks, which it sounds like is
important for Crypto++ as mentioned in PR36144.

Essentially, we only lex masm literals for inline asm blobs that use
intel syntax. If the .intel_syntax directive is used inside a gnu-style
inline asm statement, masm literals will not be lexed, which is
compatible with gas and llvm-mc standalone .s assembly.

This fixes PR36144 and PR32973.

Reviewers: Gerolf, avt77

Subscribers: eraman, hiraditya, llvm-commits

Differential Revision: https://reviews.llvm.org/D53535

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345189 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/MC/MCParser/AsmLexer.h           |  2 --
 include/llvm/MC/MCParser/MCAsmLexer.h         |  5 +++
 .../AsmPrinter/AsmPrinterInlineAsm.cpp        |  5 +--
 lib/MC/MCParser/AsmLexer.cpp                  | 36 ++++++++++---------
 lib/MC/MCParser/AsmParser.cpp                 |  4 ++-
 lib/Target/X86/AsmParser/X86AsmParser.cpp     |  2 --
 test/MC/AArch64/macro-hex-int.s               |  8 +++++
 test/MC/X86/intel-syntax-hex.s                |  2 +-
 test/MC/X86/pr27884.s                         |  2 +-
 test/tools/llvm-mca/X86/intel-syntax.s        |  2 +-
 tools/llvm-mc/llvm-mc.cpp                     |  5 +++
 11 files changed, 47 insertions(+), 26 deletions(-)
 create mode 100644 test/MC/AArch64/macro-hex-int.s

diff --git a/include/llvm/MC/MCParser/AsmLexer.h b/include/llvm/MC/MCParser/AsmLexer.h
index 207183a69b0..2e9b8dfa3b2 100644
--- a/include/llvm/MC/MCParser/AsmLexer.h
+++ b/include/llvm/MC/MCParser/AsmLexer.h
@@ -30,7 +30,6 @@ class AsmLexer : public MCAsmLexer {
   StringRef CurBuf;
   bool IsAtStartOfLine = true;
   bool IsAtStartOfStatement = true;
-  bool IsParsingMSInlineAsm = false;
   bool IsPeeking = false;
 
 protected:
@@ -44,7 +43,6 @@ public:
   ~AsmLexer() override;
 
   void setBuffer(StringRef Buf, const char *ptr = nullptr);
-  void setParsingMSInlineAsm(bool V) { IsParsingMSInlineAsm = V; }
 
   StringRef LexUntilEndOfStatement() override;
 
diff --git a/include/llvm/MC/MCParser/MCAsmLexer.h b/include/llvm/MC/MCParser/MCAsmLexer.h
index 8ff0df2a185..ea13d1cdc09 100644
--- a/include/llvm/MC/MCParser/MCAsmLexer.h
+++ b/include/llvm/MC/MCParser/MCAsmLexer.h
@@ -50,6 +50,7 @@ protected: // Can only create subclasses.
   bool SkipSpace = true;
   bool AllowAtInIdentifier;
   bool IsAtStartOfStatement = true;
+  bool LexMasmIntegers = false;
   AsmCommentConsumer *CommentConsumer = nullptr;
 
   MCAsmLexer();
@@ -146,6 +147,10 @@ public:
   void setCommentConsumer(AsmCommentConsumer *CommentConsumer) {
     this->CommentConsumer = CommentConsumer;
   }
+
+  /// Set whether to lex masm-style binary and hex literals. They look like
+  /// 0b1101 and 0ABCh respectively.
+  void setLexMasmIntegers(bool V) { LexMasmIntegers = V; }
 };
 
 } // end namespace llvm
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
index 2920ac66290..62103e3107c 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
@@ -156,9 +156,10 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MCSubtargetInfo &STI,
   Parser->setAssemblerDialect(Dialect);
   Parser->setTargetParser(*TAP.get());
   Parser->setEnablePrintSchedInfo(EnablePrintSchedInfo);
+  // Enable lexing Masm binary and hex integer literals in intel inline
+  // assembly.
   if (Dialect == InlineAsm::AD_Intel)
-    // We need this flag to be able to parse numbers like "0bH"
-    Parser->setParsingInlineAsm(true);
+    Parser->getLexer().setLexMasmIntegers(true);
   if (MF) {
     const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
     TAP->SetFrameRegister(TRI->getFrameRegister(*MF));
diff --git a/lib/MC/MCParser/AsmLexer.cpp b/lib/MC/MCParser/AsmLexer.cpp
index 74835fd70c0..c8d48f033f6 100644
--- a/lib/MC/MCParser/AsmLexer.cpp
+++ b/lib/MC/MCParser/AsmLexer.cpp
@@ -243,22 +243,26 @@ static void SkipIgnoredIntegerSuffix(const char *&CurPtr) {
 
 // Look ahead to search for first non-hex digit, if it's [hH], then we treat the
 // integer as a hexadecimal, possibly with leading zeroes.
-static unsigned doLookAhead(const char *&CurPtr, unsigned DefaultRadix) {
-  const char *FirstHex = nullptr;
+static unsigned doHexLookAhead(const char *&CurPtr, unsigned DefaultRadix,
+                               bool LexHex) {
+  const char *FirstNonDec = nullptr;
   const char *LookAhead = CurPtr;
   while (true) {
     if (isDigit(*LookAhead)) {
       ++LookAhead;
-    } else if (isHexDigit(*LookAhead)) {
-      if (!FirstHex)
-        FirstHex = LookAhead;
-      ++LookAhead;
     } else {
-      break;
+      if (!FirstNonDec)
+        FirstNonDec = LookAhead;
+
+      // Keep going if we are looking for a 'h' suffix.
+      if (LexHex && isHexDigit(*LookAhead))
+        ++LookAhead;
+      else
+        break;
     }
   }
-  bool isHex = *LookAhead == 'h' || *LookAhead == 'H';
-  CurPtr = isHex || !FirstHex ? LookAhead : FirstHex;
+  bool isHex = LexHex && (*LookAhead == 'h' || *LookAhead == 'H');
+  CurPtr = isHex || !FirstNonDec ? LookAhead : FirstNonDec;
   if (isHex)
     return 16;
   return DefaultRadix;
@@ -281,7 +285,7 @@ static AsmToken intToken(StringRef Ref, APInt &Value)
 AsmToken AsmLexer::LexDigit() {
   // MASM-flavor binary integer: [01]+[bB]
   // MASM-flavor hexadecimal integer: [0-9][0-9a-fA-F]*[hH]
-  if (IsParsingMSInlineAsm && isdigit(CurPtr[-1])) {
+  if (LexMasmIntegers && isdigit(CurPtr[-1])) {
     const char *FirstNonBinary = (CurPtr[-1] != '0' && CurPtr[-1] != '1') ?
                                    CurPtr - 1 : nullptr;
     const char *OldCurPtr = CurPtr;
@@ -320,7 +324,7 @@ AsmToken AsmLexer::LexDigit() {
 
   // Decimal integer: [1-9][0-9]*
   if (CurPtr[-1] != '0' || CurPtr[0] == '.') {
-    unsigned Radix = doLookAhead(CurPtr, 10);
+    unsigned Radix = doHexLookAhead(CurPtr, 10, LexMasmIntegers);
     bool isHex = Radix == 16;
     // Check for floating point literals.
     if (!isHex && (*CurPtr == '.' || *CurPtr == 'e')) {
@@ -335,8 +339,8 @@ AsmToken AsmLexer::LexDigit() {
       return ReturnError(TokStart, !isHex ? "invalid decimal number" :
                            "invalid hexdecimal number");
 
-    // Consume the [bB][hH].
-    if (Radix == 2 || Radix == 16)
+    // Consume the [hH].
+    if (LexMasmIntegers && Radix == 16)
       ++CurPtr;
 
     // The darwin/x86 (and x86-64) assembler accepts and ignores type
@@ -346,7 +350,7 @@ AsmToken AsmLexer::LexDigit() {
     return intToken(Result, Value);
   }
 
-  if (!IsParsingMSInlineAsm && ((*CurPtr == 'b') || (*CurPtr == 'B'))) {
+  if (!LexMasmIntegers && ((*CurPtr == 'b') || (*CurPtr == 'B'))) {
     ++CurPtr;
     // See if we actually have "0b" as part of something like "jmp 0b\n"
     if (!isDigit(CurPtr[0])) {
@@ -395,7 +399,7 @@ AsmToken AsmLexer::LexDigit() {
       return ReturnError(TokStart, "invalid hexadecimal number");
 
     // Consume the optional [hH].
-    if (!IsParsingMSInlineAsm && (*CurPtr == 'h' || *CurPtr == 'H'))
+    if (LexMasmIntegers && (*CurPtr == 'h' || *CurPtr == 'H'))
       ++CurPtr;
 
     // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
@@ -407,7 +411,7 @@ AsmToken AsmLexer::LexDigit() {
 
   // Either octal or hexadecimal.
   APInt Value(128, 0, true);
-  unsigned Radix = doLookAhead(CurPtr, 8);
+  unsigned Radix = doHexLookAhead(CurPtr, 8, LexMasmIntegers);
   bool isHex = Radix == 16;
   StringRef Result(TokStart, CurPtr - TokStart);
   if (Result.getAsInteger(Radix, Value))
diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp
index 529f16525fe..3f7b507791e 100644
--- a/lib/MC/MCParser/AsmParser.cpp
+++ b/lib/MC/MCParser/AsmParser.cpp
@@ -229,7 +229,9 @@ public:
 
   void setParsingInlineAsm(bool V) override {
     ParsingInlineAsm = V;
-    Lexer.setParsingMSInlineAsm(V);
+    // When parsing MS inline asm, we must lex 0b1101 and 0ABCH as binary and
+    // hex integer literals.
+    Lexer.setLexMasmIntegers(V);
   }
   bool isParsingInlineAsm() override { return ParsingInlineAsm; }
 
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index e67daa5d857..4801078925c 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -3283,7 +3283,6 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) {
   if (IDVal.startswith(".code"))
     return ParseDirectiveCode(IDVal, DirectiveID.getLoc());
   else if (IDVal.startswith(".att_syntax")) {
-    getParser().setParsingInlineAsm(false);
     if (getLexer().isNot(AsmToken::EndOfStatement)) {
       if (Parser.getTok().getString() == "prefix")
         Parser.Lex();
@@ -3296,7 +3295,6 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) {
     return false;
   } else if (IDVal.startswith(".intel_syntax")) {
     getParser().setAssemblerDialect(1);
-    getParser().setParsingInlineAsm(true);
     if (getLexer().isNot(AsmToken::EndOfStatement)) {
       if (Parser.getTok().getString() == "noprefix")
         Parser.Lex();
diff --git a/test/MC/AArch64/macro-hex-int.s b/test/MC/AArch64/macro-hex-int.s
new file mode 100644
index 00000000000..0d697bce53e
--- /dev/null
+++ b/test/MC/AArch64/macro-hex-int.s
@@ -0,0 +1,8 @@
+// RUN: llvm-mc -triple aarch64-elf -filetype=obj %s -o - | llvm-objdump -d -r - | FileCheck %s
+
+.macro do_add sz
+        add     v0.\sz, v0.\sz, v0.\sz
+.endm
+
+do_add 8h
+// CHECK:  add     v0.8h, v0.8h, v0.8h
diff --git a/test/MC/X86/intel-syntax-hex.s b/test/MC/X86/intel-syntax-hex.s
index b3a19fbaa34..cb73ca9f501 100644
--- a/test/MC/X86/intel-syntax-hex.s
+++ b/test/MC/X86/intel-syntax-hex.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=intel %s | FileCheck %s
+// RUN: llvm-mc -masm-integers -triple x86_64-unknown-unknown -x86-asm-syntax=intel %s | FileCheck %s
 // rdar://12470373
 
 // Checks to make sure we parse the hexadecimal suffix properly.
diff --git a/test/MC/X86/pr27884.s b/test/MC/X86/pr27884.s
index edd4e8d34a9..d78c35c8fc0 100644
--- a/test/MC/X86/pr27884.s
+++ b/test/MC/X86/pr27884.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -triple x86_64-unknown-unknown %s
+// RUN: llvm-mc -triple x86_64-unknown-unknown %s -masm-integers=1
 
 .intel_syntax
 add rbx, 0B0h
diff --git a/test/tools/llvm-mca/X86/intel-syntax.s b/test/tools/llvm-mca/X86/intel-syntax.s
index 1aaa3902866..786d06ba0d1 100644
--- a/test/tools/llvm-mca/X86/intel-syntax.s
+++ b/test/tools/llvm-mca/X86/intel-syntax.s
@@ -5,7 +5,7 @@
 
   .intel_syntax noprefix
   mov	eax, 1
-  mov	ebx, 0ffh
+  mov	ebx, 0xff
   imul	esi, edi
   lea	eax, [rsi + rdi]
 
diff --git a/tools/llvm-mc/llvm-mc.cpp b/tools/llvm-mc/llvm-mc.cpp
index 0263c866f77..c0976502f54 100644
--- a/tools/llvm-mc/llvm-mc.cpp
+++ b/tools/llvm-mc/llvm-mc.cpp
@@ -164,6 +164,10 @@ MainFileName("main-file-name",
 static cl::opt<bool> SaveTempLabels("save-temp-labels",
                                     cl::desc("Don't discard temporary labels"));
 
+static cl::opt<bool> LexMasmIntegers(
+    "masm-integers",
+    cl::desc("Enable binary and hex masm integers (0b110 and 0ABCh)"));
+
 static cl::opt<bool> NoExecStack("no-exec-stack",
                                  cl::desc("File doesn't need an exec stack"));
 
@@ -293,6 +297,7 @@ static int AssembleInput(const char *ProgName, const Target *TheTarget,
     return SymbolResult;
   Parser->setShowParsedOperands(ShowInstOperands);
   Parser->setTargetParser(*TAP);
+  Parser->getLexer().setLexMasmIntegers(LexMasmIntegers);
 
   int Res = Parser->Run(NoInitialTextSection);
 
-- 
GitLab


From 679afabe4eff7cc278f079474fda793ecd89dc65 Mon Sep 17 00:00:00 2001
From: Matt Davis <Matthew.Davis@sony.com>
Date: Wed, 24 Oct 2018 20:27:47 +0000
Subject: [PATCH 0525/1116] [llvm-mca] Replace InstRef::isValid with operator
 bool. NFC.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345190 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-mca/include/Instruction.h                |  2 +-
 .../lib/HardwareUnits/RetireControlUnit.cpp         |  4 ++--
 tools/llvm-mca/lib/HardwareUnits/Scheduler.cpp      |  4 ++--
 tools/llvm-mca/lib/Stages/DispatchStage.cpp         |  2 +-
 tools/llvm-mca/lib/Stages/ExecuteStage.cpp          |  4 +---
 tools/llvm-mca/lib/Stages/FetchStage.cpp            | 13 +++++--------
 6 files changed, 12 insertions(+), 17 deletions(-)

diff --git a/tools/llvm-mca/include/Instruction.h b/tools/llvm-mca/include/Instruction.h
index ca84b86d70d..a1d1082a215 100644
--- a/tools/llvm-mca/include/Instruction.h
+++ b/tools/llvm-mca/include/Instruction.h
@@ -444,7 +444,7 @@ public:
   const Instruction *getInstruction() const { return Data.second; }
 
   /// Returns true if this references a valid instruction.
-  bool isValid() const { return Data.second; }
+  operator bool() const { return Data.second != nullptr; }
 
   /// Invalidate this reference.
   void invalidate() { Data.second = nullptr; }
diff --git a/tools/llvm-mca/lib/HardwareUnits/RetireControlUnit.cpp b/tools/llvm-mca/lib/HardwareUnits/RetireControlUnit.cpp
index af1b01f49dc..8f543eeb8c2 100644
--- a/tools/llvm-mca/lib/HardwareUnits/RetireControlUnit.cpp
+++ b/tools/llvm-mca/lib/HardwareUnits/RetireControlUnit.cpp
@@ -63,7 +63,7 @@ const RetireControlUnit::RUToken &RetireControlUnit::peekCurrentToken() const {
 void RetireControlUnit::consumeCurrentToken() {
   const RetireControlUnit::RUToken &Current = peekCurrentToken();
   assert(Current.NumSlots && "Reserved zero slots?");
-  assert(Current.IR.isValid() && "Invalid RUToken in the RCU queue.");
+  assert(Current.IR && "Invalid RUToken in the RCU queue.");
 
   // Update the slot index to be the next item in the circular queue.
   CurrentInstructionSlotIdx += Current.NumSlots;
@@ -73,7 +73,7 @@ void RetireControlUnit::consumeCurrentToken() {
 
 void RetireControlUnit::onInstructionExecuted(unsigned TokenID) {
   assert(Queue.size() > TokenID);
-  assert(Queue[TokenID].Executed == false && Queue[TokenID].IR.isValid());
+  assert(Queue[TokenID].Executed == false && Queue[TokenID].IR);
   Queue[TokenID].Executed = true;
 }
 
diff --git a/tools/llvm-mca/lib/HardwareUnits/Scheduler.cpp b/tools/llvm-mca/lib/HardwareUnits/Scheduler.cpp
index 8bfa761c8a1..3d91cb12c2d 100644
--- a/tools/llvm-mca/lib/HardwareUnits/Scheduler.cpp
+++ b/tools/llvm-mca/lib/HardwareUnits/Scheduler.cpp
@@ -108,7 +108,7 @@ void Scheduler::promoteToReadySet(SmallVectorImpl<InstRef> &Ready) {
   unsigned RemovedElements = 0;
   for (auto I = WaitSet.begin(), E = WaitSet.end(); I != E;) {
     InstRef &IR = *I;
-    if (!IR.isValid())
+    if (!IR)
       break;
 
     // Check if this instruction is now ready. In case, force
@@ -160,7 +160,7 @@ void Scheduler::updateIssuedSet(SmallVectorImpl<InstRef> &Executed) {
   unsigned RemovedElements = 0;
   for (auto I = IssuedSet.begin(), E = IssuedSet.end(); I != E;) {
     InstRef &IR = *I;
-    if (!IR.isValid())
+    if (!IR)
       break;
     Instruction &IS = *IR.getInstruction();
     if (!IS.isExecuted()) {
diff --git a/tools/llvm-mca/lib/Stages/DispatchStage.cpp b/tools/llvm-mca/lib/Stages/DispatchStage.cpp
index a6be2474554..653f39bf5b7 100644
--- a/tools/llvm-mca/lib/Stages/DispatchStage.cpp
+++ b/tools/llvm-mca/lib/Stages/DispatchStage.cpp
@@ -154,7 +154,7 @@ Error DispatchStage::cycleStart() {
   AvailableEntries = CarryOver >= DispatchWidth ? 0 : DispatchWidth - CarryOver;
   unsigned DispatchedOpcodes = DispatchWidth - AvailableEntries;
   CarryOver -= DispatchedOpcodes;
-  assert(CarriedOver.isValid() && "Invalid dispatched instruction");
+  assert(CarriedOver && "Invalid dispatched instruction");
 
   SmallVector<unsigned, 8> RegisterFiles(PRF.getNumRegisterFiles(), 0U);
   notifyInstructionDispatched(CarriedOver, RegisterFiles, DispatchedOpcodes);
diff --git a/tools/llvm-mca/lib/Stages/ExecuteStage.cpp b/tools/llvm-mca/lib/Stages/ExecuteStage.cpp
index fa297148167..3b45a84c338 100644
--- a/tools/llvm-mca/lib/Stages/ExecuteStage.cpp
+++ b/tools/llvm-mca/lib/Stages/ExecuteStage.cpp
@@ -73,7 +73,7 @@ Error ExecuteStage::issueInstruction(InstRef &IR) {
 
 Error ExecuteStage::issueReadyInstructions() {
   InstRef IR = HWS.select();
-  while (IR.isValid()) {
+  while (IR) {
     if (Error Err = issueInstruction(IR))
       return Err;
 
@@ -107,7 +107,6 @@ Error ExecuteStage::cycleStart() {
   return issueReadyInstructions();
 }
 
-
 #ifndef NDEBUG
 static void verifyInstructionEliminated(const InstRef &IR) {
   const Instruction &Inst = *IR.getInstruction();
@@ -121,7 +120,6 @@ static void verifyInstructionEliminated(const InstRef &IR) {
 }
 #endif
 
-
 Error ExecuteStage::handleInstructionEliminated(InstRef &IR) {
 #ifndef NDEBUG
   verifyInstructionEliminated(IR);
diff --git a/tools/llvm-mca/lib/Stages/FetchStage.cpp b/tools/llvm-mca/lib/Stages/FetchStage.cpp
index e607db9c8f0..515dc15c5b3 100644
--- a/tools/llvm-mca/lib/Stages/FetchStage.cpp
+++ b/tools/llvm-mca/lib/Stages/FetchStage.cpp
@@ -17,19 +17,16 @@
 
 namespace mca {
 
-bool FetchStage::hasWorkToComplete() const {
-  return CurrentInstruction.isValid();
-}
+bool FetchStage::hasWorkToComplete() const { return CurrentInstruction; }
 
 bool FetchStage::isAvailable(const InstRef & /* unused */) const {
-  if (CurrentInstruction.isValid())
+  if (CurrentInstruction)
     return checkNextStage(CurrentInstruction);
   return false;
 }
 
 llvm::Error FetchStage::getNextInstruction() {
-  assert(!CurrentInstruction.isValid() &&
-         "There is already an instruction to process!");
+  assert(!CurrentInstruction && "There is already an instruction to process!");
   if (!SM.hasNext())
     return llvm::ErrorSuccess();
   const SourceRef SR = SM.peekNext();
@@ -45,7 +42,7 @@ llvm::Error FetchStage::getNextInstruction() {
 }
 
 llvm::Error FetchStage::execute(InstRef & /*unused */) {
-  assert(CurrentInstruction.isValid() && "There is no instruction to process!");
+  assert(CurrentInstruction && "There is no instruction to process!");
   if (llvm::Error Val = moveToTheNextStage(CurrentInstruction))
     return Val;
 
@@ -55,7 +52,7 @@ llvm::Error FetchStage::execute(InstRef & /*unused */) {
 }
 
 llvm::Error FetchStage::cycleStart() {
-  if (!CurrentInstruction.isValid())
+  if (!CurrentInstruction)
     return getNextInstruction();
   return llvm::ErrorSuccess();
 }
-- 
GitLab


From e7543196c954f4c5965253d418913b708726e81d Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Wed, 24 Oct 2018 20:37:40 +0000
Subject: [PATCH 0526/1116] [ExecutionEngine] Remove some dead code from
 JITEventListener.h.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345195 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../llvm/ExecutionEngine/JITEventListener.h   | 22 -------------------
 1 file changed, 22 deletions(-)

diff --git a/include/llvm/ExecutionEngine/JITEventListener.h b/include/llvm/ExecutionEngine/JITEventListener.h
index 1ce772ccde9..589ca612f04 100644
--- a/include/llvm/ExecutionEngine/JITEventListener.h
+++ b/include/llvm/ExecutionEngine/JITEventListener.h
@@ -35,34 +35,12 @@ class ObjectFile;
 
 } // end namespace object
 
-/// JITEvent_EmittedFunctionDetails - Helper struct for containing information
-/// about a generated machine code function.
-struct JITEvent_EmittedFunctionDetails {
-  struct LineStart {
-    /// The address at which the current line changes.
-    uintptr_t Address;
-
-    /// The new location information.  These can be translated to DebugLocTuples
-    /// using MF->getDebugLocTuple().
-    DebugLoc Loc;
-  };
-
-  /// The machine function the struct contains information for.
-  const MachineFunction *MF;
-
-  /// The list of line boundary information, sorted by address.
-  std::vector<LineStart> LineStarts;
-};
-
 /// JITEventListener - Abstract interface for use by the JIT to notify clients
 /// about significant events during compilation. For example, to notify
 /// profilers and debuggers that need to know where functions have been emitted.
 ///
 /// The default implementation of each method does nothing.
 class JITEventListener {
-public:
-  using EmittedFunctionDetails = JITEvent_EmittedFunctionDetails;
-
 public:
   JITEventListener() = default;
   virtual ~JITEventListener() = default;
-- 
GitLab


From b8895a84ec6427b0b640b52550b0c162de1cf51e Mon Sep 17 00:00:00 2001
From: Sanjin Sijaric <ssijaric@codeaurora.org>
Date: Wed, 24 Oct 2018 21:07:38 +0000
Subject: [PATCH 0527/1116] [MIR] Add hasWinCFI field

Adding hasWinCFI field so that I can add MIR test cases to
https://reviews.llvm.org/D50166.

Differential Revision: https://reviews.llvm.org/D51201


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345196 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/MIRYamlMapping.h | 2 ++
 lib/CodeGen/MIRParser/MIRParser.cpp   | 1 +
 lib/CodeGen/MIRPrinter.cpp            | 1 +
 3 files changed, 4 insertions(+)

diff --git a/include/llvm/CodeGen/MIRYamlMapping.h b/include/llvm/CodeGen/MIRYamlMapping.h
index dc9057521e7..98ac81915dc 100644
--- a/include/llvm/CodeGen/MIRYamlMapping.h
+++ b/include/llvm/CodeGen/MIRYamlMapping.h
@@ -494,6 +494,7 @@ struct MachineFunction {
   bool FailedISel = false;
   // Register information
   bool TracksRegLiveness = false;
+  bool HasWinCFI = false;
   std::vector<VirtualRegisterDefinition> VirtualRegisters;
   std::vector<MachineFunctionLiveIn> LiveIns;
   Optional<std::vector<FlowStringValue>> CalleeSavedRegisters;
@@ -517,6 +518,7 @@ template <> struct MappingTraits<MachineFunction> {
     YamlIO.mapOptional("selected", MF.Selected, false);
     YamlIO.mapOptional("failedISel", MF.FailedISel, false);
     YamlIO.mapOptional("tracksRegLiveness", MF.TracksRegLiveness, false);
+    YamlIO.mapOptional("hasWinCFI", MF.HasWinCFI, false);
     YamlIO.mapOptional("registers", MF.VirtualRegisters,
                        std::vector<VirtualRegisterDefinition>());
     YamlIO.mapOptional("liveins", MF.LiveIns,
diff --git a/lib/CodeGen/MIRParser/MIRParser.cpp b/lib/CodeGen/MIRParser/MIRParser.cpp
index 0102f1240a8..00da92a92ec 100644
--- a/lib/CodeGen/MIRParser/MIRParser.cpp
+++ b/lib/CodeGen/MIRParser/MIRParser.cpp
@@ -355,6 +355,7 @@ MIRParserImpl::initializeMachineFunction(const yaml::MachineFunction &YamlMF,
   if (YamlMF.Alignment)
     MF.setAlignment(YamlMF.Alignment);
   MF.setExposesReturnsTwice(YamlMF.ExposesReturnsTwice);
+  MF.setHasWinCFI(YamlMF.HasWinCFI);
 
   if (YamlMF.Legalized)
     MF.getProperties().set(MachineFunctionProperties::Property::Legalized);
diff --git a/lib/CodeGen/MIRPrinter.cpp b/lib/CodeGen/MIRPrinter.cpp
index 88e2f16d3fd..80129463715 100644
--- a/lib/CodeGen/MIRPrinter.cpp
+++ b/lib/CodeGen/MIRPrinter.cpp
@@ -196,6 +196,7 @@ void MIRPrinter::print(const MachineFunction &MF) {
   YamlMF.Name = MF.getName();
   YamlMF.Alignment = MF.getAlignment();
   YamlMF.ExposesReturnsTwice = MF.exposesReturnsTwice();
+  YamlMF.HasWinCFI = MF.hasWinCFI();
 
   YamlMF.Legalized = MF.getProperties().hasProperty(
       MachineFunctionProperties::Property::Legalized);
-- 
GitLab


From 04070efcd23ad080cc51eb3b9a187c746814a80c Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Wed, 24 Oct 2018 21:09:34 +0000
Subject: [PATCH 0528/1116] [X86] Add *SP to tailcall register class to fix
 verifier error

It's possible to do a tail call to a stack argument. LLVM already
calculates the right stack offset to call through.

Fixes the sibcall* and musttail* verifier failures tracked at PR27481.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345197 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86RegisterInfo.td     |  7 +--
 test/CodeGen/X86/musttail-indirect.ll |  4 +-
 test/CodeGen/X86/musttail-thiscall.ll |  4 +-
 test/CodeGen/X86/musttail-varargs.ll  |  4 +-
 test/CodeGen/X86/sibcall-2.ll         |  4 +-
 test/CodeGen/X86/sibcall.ll           | 63 ++++++++++++++++++---------
 6 files changed, 54 insertions(+), 32 deletions(-)

diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td
index 31b939641fd..0c1b05fd3ab 100644
--- a/lib/Target/X86/X86RegisterInfo.td
+++ b/lib/Target/X86/X86RegisterInfo.td
@@ -436,11 +436,12 @@ def GR8_ABCD_H : RegisterClass<"X86", [i8], 8, (add AH, CH, DH, BH)>;
 def GR16_ABCD : RegisterClass<"X86", [i16], 16, (add AX, CX, DX, BX)>;
 def GR32_ABCD : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX, EBX)>;
 def GR64_ABCD : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RBX)>;
-def GR32_TC   : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX)>;
+def GR32_TC   : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX, ESP)>;
 def GR64_TC   : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RSI, RDI,
-                                                     R8, R9, R11, RIP)>;
+                                                     R8, R9, R11, RIP, RSP)>;
 def GR64_TCW64 : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX,
-                                                      R8, R9, R10, R11, RIP)>;
+                                                      R8, R9, R10, R11,
+                                                      RIP, RSP)>;
 
 // GR8_NOREX - GR8 registers which do not require a REX prefix.
 def GR8_NOREX : RegisterClass<"X86", [i8], 8,
diff --git a/test/CodeGen/X86/musttail-indirect.ll b/test/CodeGen/X86/musttail-indirect.ll
index 7bb71c3fb03..c142ffae69d 100644
--- a/test/CodeGen/X86/musttail-indirect.ll
+++ b/test/CodeGen/X86/musttail-indirect.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=i686-win32 | FileCheck %s
-; RUN: llc < %s -mtriple=i686-win32 -O0 | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=i686-win32 | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=i686-win32 -O0 | FileCheck %s
 
 ; IR simplified from the following C++ snippet compiled for i686-windows-msvc:
 
diff --git a/test/CodeGen/X86/musttail-thiscall.ll b/test/CodeGen/X86/musttail-thiscall.ll
index 454c66cd675..a1ddbd5d1cb 100644
--- a/test/CodeGen/X86/musttail-thiscall.ll
+++ b/test/CodeGen/X86/musttail-thiscall.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=i686-- < %s | FileCheck %s
-; RUN: llc -mtriple=i686-- -O0 < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=i686-- < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=i686-- -O0 < %s | FileCheck %s
 
 ; CHECK-LABEL: t1:
 ; CHECK: jmp {{_?}}t1_callee
diff --git a/test/CodeGen/X86/musttail-varargs.ll b/test/CodeGen/X86/musttail-varargs.ll
index 080e5e5b1e0..6a338c5c7da 100644
--- a/test/CodeGen/X86/musttail-varargs.ll
+++ b/test/CodeGen/X86/musttail-varargs.ll
@@ -83,7 +83,6 @@ define void @f_thunk(i8* %this, ...) {
 ; LINUX-NEXT:    movq %rbp, %rdx
 ; LINUX-NEXT:    movq %r13, %rcx
 ; LINUX-NEXT:    movq %r12, %r8
-; LINUX-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
 ; LINUX-NEXT:    movq %r15, %r9
 ; LINUX-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; LINUX-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
@@ -93,6 +92,7 @@ define void @f_thunk(i8* %this, ...) {
 ; LINUX-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
 ; LINUX-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
 ; LINUX-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; LINUX-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
 ; LINUX-NEXT:    addq $360, %rsp # imm = 0x168
 ; LINUX-NEXT:    .cfi_def_cfa_offset 56
 ; LINUX-NEXT:    popq %rbx
@@ -177,7 +177,6 @@ define void @f_thunk(i8* %this, ...) {
 ; LINUX-X32-NEXT:    movq %rbp, %rdx
 ; LINUX-X32-NEXT:    movq %r13, %rcx
 ; LINUX-X32-NEXT:    movq %r12, %r8
-; LINUX-X32-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload
 ; LINUX-X32-NEXT:    movq %r15, %r9
 ; LINUX-X32-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; LINUX-X32-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
@@ -187,6 +186,7 @@ define void @f_thunk(i8* %this, ...) {
 ; LINUX-X32-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm5 # 16-byte Reload
 ; LINUX-X32-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm6 # 16-byte Reload
 ; LINUX-X32-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm7 # 16-byte Reload
+; LINUX-X32-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload
 ; LINUX-X32-NEXT:    addl $344, %esp # imm = 0x158
 ; LINUX-X32-NEXT:    .cfi_def_cfa_offset 56
 ; LINUX-X32-NEXT:    popq %rbx
diff --git a/test/CodeGen/X86/sibcall-2.ll b/test/CodeGen/X86/sibcall-2.ll
index 1b9d2db47c3..6ed7b5a1505 100644
--- a/test/CodeGen/X86/sibcall-2.ll
+++ b/test/CodeGen/X86/sibcall-2.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin   -disable-fp-elim | FileCheck %s -check-prefix=32
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -disable-fp-elim | FileCheck %s -check-prefix=64
+; RUN: llc -verify-machineinstrs < %s -mtriple=i386-apple-darwin   -disable-fp-elim | FileCheck %s -check-prefix=32
+; RUN: llc -verify-machineinstrs < %s -mtriple=x86_64-apple-darwin -disable-fp-elim | FileCheck %s -check-prefix=64
 
 ; Tail call should not use ebp / rbp after it's popped. Use esp / rsp.
 
diff --git a/test/CodeGen/X86/sibcall.ll b/test/CodeGen/X86/sibcall.ll
index 784b10b3566..2b4af2e5830 100644
--- a/test/CodeGen/X86/sibcall.ll
+++ b/test/CodeGen/X86/sibcall.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-linux   -mcpu=core2 -mattr=+sse2 | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -mtriple=x86_64-linux -mcpu=core2 -mattr=+sse2 | FileCheck %s --check-prefix=X64
-; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -mcpu=core2 -mattr=+sse2  | FileCheck %s --check-prefix=X32
+; RUN: llc -verify-machineinstrs < %s -mtriple=i686-linux   -mcpu=core2 -mattr=+sse2 | FileCheck %s --check-prefix=X86
+; RUN: llc -verify-machineinstrs < %s -mtriple=x86_64-linux -mcpu=core2 -mattr=+sse2 | FileCheck %s --check-prefix=X64
+; RUN: llc -verify-machineinstrs < %s -mtriple=x86_64-linux-gnux32 -mcpu=core2 -mattr=+sse2  | FileCheck %s --check-prefix=X32
 
 define void @t1(i32 %x) nounwind ssp {
 ; X86-LABEL: t1:
@@ -101,41 +101,62 @@ define void @t5(void ()* nocapture %x) nounwind ssp {
   ret void
 }
 
+; Basically the same test as t5, except pass the function pointer on the stack
+; for x86_64.
+
+define void @t5_x64(i32, i32, i32, i32, i32, i32, void ()* nocapture %x) nounwind ssp {
+; X86-LABEL: t5_x64:
+; X86:       # %bb.0:
+; X86-NEXT:    jmpl *{{[0-9]+}}(%esp) # TAILCALL
+;
+; X64-LABEL: t5_x64:
+; X64:       # %bb.0:
+; X64-NEXT:    jmpq *{{[0-9]+}}(%rsp) # TAILCALL
+;
+; X32-LABEL: t5_x64:
+; X32:       # %bb.0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    jmpq *%rax # TAILCALL
+  tail call void %x() nounwind
+  ret void
+}
+
+
 define i32 @t6(i32 %x) nounwind ssp {
 ; X86-LABEL: t6:
 ; X86:       # %bb.0:
 ; X86-NEXT:    subl $12, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpl $9, %eax
-; X86-NEXT:    jg .LBB5_2
+; X86-NEXT:    jg .LBB6_2
 ; X86-NEXT:  # %bb.1: # %bb
 ; X86-NEXT:    decl %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll t6
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
-; X86-NEXT:  .LBB5_2: # %bb1
+; X86-NEXT:  .LBB6_2: # %bb1
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    jmp bar # TAILCALL
 ;
 ; X64-LABEL: t6:
 ; X64:       # %bb.0:
 ; X64-NEXT:    cmpl $9, %edi
-; X64-NEXT:    jg .LBB5_2
+; X64-NEXT:    jg .LBB6_2
 ; X64-NEXT:  # %bb.1: # %bb
 ; X64-NEXT:    decl %edi
 ; X64-NEXT:    jmp t6 # TAILCALL
-; X64-NEXT:  .LBB5_2: # %bb1
+; X64-NEXT:  .LBB6_2: # %bb1
 ; X64-NEXT:    jmp bar # TAILCALL
 ;
 ; X32-LABEL: t6:
 ; X32:       # %bb.0:
 ; X32-NEXT:    cmpl $9, %edi
-; X32-NEXT:    jg .LBB5_2
+; X32-NEXT:    jg .LBB6_2
 ; X32-NEXT:  # %bb.1: # %bb
 ; X32-NEXT:    decl %edi
 ; X32-NEXT:    jmp t6 # TAILCALL
-; X32-NEXT:  .LBB5_2: # %bb1
+; X32-NEXT:  .LBB6_2: # %bb1
 ; X32-NEXT:    jmp bar # TAILCALL
   %t0 = icmp slt i32 %x, 10
   br i1 %t0, label %bb, label %bb1
@@ -245,30 +266,30 @@ define i32 @t11(i32 %x, i32 %y, i32 %z.0, i32 %z.1, i32 %z.2) nounwind ssp {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    je .LBB10_1
+; X86-NEXT:    je .LBB11_1
 ; X86-NEXT:  # %bb.2: # %bb
 ; X86-NEXT:    jmp foo5 # TAILCALL
-; X86-NEXT:  .LBB10_1: # %bb6
+; X86-NEXT:  .LBB11_1: # %bb6
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: t11:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    testl %edi, %edi
-; X64-NEXT:    je .LBB10_1
+; X64-NEXT:    je .LBB11_1
 ; X64-NEXT:  # %bb.2: # %bb
 ; X64-NEXT:    jmp foo5 # TAILCALL
-; X64-NEXT:  .LBB10_1: # %bb6
+; X64-NEXT:  .LBB11_1: # %bb6
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: t11:
 ; X32:       # %bb.0: # %entry
 ; X32-NEXT:    testl %edi, %edi
-; X32-NEXT:    je .LBB10_1
+; X32-NEXT:    je .LBB11_1
 ; X32-NEXT:  # %bb.2: # %bb
 ; X32-NEXT:    jmp foo5 # TAILCALL
-; X32-NEXT:  .LBB10_1: # %bb6
+; X32-NEXT:  .LBB11_1: # %bb6
 ; X32-NEXT:    xorl %eax, %eax
 ; X32-NEXT:    retq
 entry:
@@ -292,30 +313,30 @@ define i32 @t12(i32 %x, i32 %y, %struct.t* byval align 4 %z) nounwind ssp {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    je .LBB11_1
+; X86-NEXT:    je .LBB12_1
 ; X86-NEXT:  # %bb.2: # %bb
 ; X86-NEXT:    jmp foo6 # TAILCALL
-; X86-NEXT:  .LBB11_1: # %bb2
+; X86-NEXT:  .LBB12_1: # %bb2
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: t12:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    testl %edi, %edi
-; X64-NEXT:    je .LBB11_1
+; X64-NEXT:    je .LBB12_1
 ; X64-NEXT:  # %bb.2: # %bb
 ; X64-NEXT:    jmp foo6 # TAILCALL
-; X64-NEXT:  .LBB11_1: # %bb2
+; X64-NEXT:  .LBB12_1: # %bb2
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: t12:
 ; X32:       # %bb.0: # %entry
 ; X32-NEXT:    testl %edi, %edi
-; X32-NEXT:    je .LBB11_1
+; X32-NEXT:    je .LBB12_1
 ; X32-NEXT:  # %bb.2: # %bb
 ; X32-NEXT:    jmp foo6 # TAILCALL
-; X32-NEXT:  .LBB11_1: # %bb2
+; X32-NEXT:  .LBB12_1: # %bb2
 ; X32-NEXT:    xorl %eax, %eax
 ; X32-NEXT:    retq
 entry:
-- 
GitLab


From 76616be80bbff8acf3e7b9090afecf9757ed2f63 Mon Sep 17 00:00:00 2001
From: Tim Northover <tnorthover@apple.com>
Date: Wed, 24 Oct 2018 21:36:34 +0000
Subject: [PATCH 0529/1116] [DAG] check more operands for cycles when merging
 stores.

Until now, we've only checked whether merging stores would cause a cycle via
the value argument, but the address and indexed offset arguments are also
capable of creating cycles in some situations.

The addresses are all base+offset with notionally the same base, but the base
SDNode may still be different (e.g. via an indexed load in one case, and an
ISD::ADD elsewhere). This allows cycles to creep in if one of these sources
depends on another.

The indexed offset is usually undef (representing a non-indexed store), but on
some architectures (e.g. 32-bit ARM-mode ARM) it can be an arbitrary value,
again allowing dependency cycles to creep in.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345200 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 3c7830e23c7..ef0afc71ab4 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -14316,14 +14316,14 @@ bool DAGCombiner::checkMergeStoreCandidatesForDependencies(
     //                    in candidate selection and can be
     //                    safely ignored
     //   * Value (Op 1) -> Cycles may happen (e.g. through load chains)
-    //   * Address (Op 2) -> Merged addresses may only vary by a fixed constant
-    //                      and so no cycles are possible.
-    //   * (Op 3) -> appears to always be undef. Cannot be source of cycle.
-    //
-    // Thus we need only check predecessors of the value operands.
-    auto *Op = N->getOperand(1).getNode();
-    if (Visited.insert(Op).second)
-      Worklist.push_back(Op);
+    //   * Address (Op 2) -> Merged addresses may only vary by a fixed constant,
+    //                       but aren't necessarily fromt the same base node, so
+    //                       cycles possible (e.g. via indexed store).
+    //   * (Op 3) -> Represents the pre or post-indexing offset (or undef for
+    //               non-indexed stores). Not constant on all targets (e.g. ARM)
+    //               and so can participate in a cycle.
+    for (unsigned j = 1; j < N->getNumOperands(); ++j)
+      Worklist.push_back(N->getOperand(j).getNode());
   }
   // Search through DAG. We can stop early if we find a store node.
   for (unsigned i = 0; i < NumStores; ++i)
-- 
GitLab


From 36c040be6bdfd14b9c1fe0592ea328c7fefdff35 Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.menezes@samsung.com>
Date: Wed, 24 Oct 2018 21:40:43 +0000
Subject: [PATCH 0530/1116] [AArch64] Refactor Exynos machine model

Effectively, NFC.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345201 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64InstrInfo.cpp    | 119 +++++++++++----------
 lib/Target/AArch64/AArch64InstrInfo.h      |   7 +-
 lib/Target/AArch64/AArch64SchedExynosM1.td |  17 +--
 lib/Target/AArch64/AArch64SchedExynosM3.td |  13 +--
 4 files changed, 86 insertions(+), 70 deletions(-)

diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index e6474046534..503bda08a9c 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -829,6 +829,71 @@ bool AArch64InstrInfo::isExynosResetFast(const MachineInstr &MI) const {
   }
 }
 
+bool AArch64InstrInfo::isExynosLdStExtFast(const MachineInstr &MI) const {
+  unsigned Imm;
+  AArch64_AM::ShiftExtendType Ext;
+
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+
+  // WriteLD
+  case AArch64::PRFMroW:
+  case AArch64::PRFMroX:
+
+  // WriteLDIdx
+  case AArch64::LDRBBroW:
+  case AArch64::LDRBBroX:
+  case AArch64::LDRHHroW:
+  case AArch64::LDRHHroX:
+  case AArch64::LDRSBWroW:
+  case AArch64::LDRSBWroX:
+  case AArch64::LDRSBXroW:
+  case AArch64::LDRSBXroX:
+  case AArch64::LDRSHWroW:
+  case AArch64::LDRSHWroX:
+  case AArch64::LDRSHXroW:
+  case AArch64::LDRSHXroX:
+  case AArch64::LDRSWroW:
+  case AArch64::LDRSWroX:
+  case AArch64::LDRWroW:
+  case AArch64::LDRWroX:
+  case AArch64::LDRXroW:
+  case AArch64::LDRXroX:
+
+  case AArch64::LDRBroW:
+  case AArch64::LDRBroX:
+  case AArch64::LDRDroW:
+  case AArch64::LDRDroX:
+  case AArch64::LDRHroW:
+  case AArch64::LDRHroX:
+  case AArch64::LDRSroW:
+  case AArch64::LDRSroX:
+
+  // WriteSTIdx
+  case AArch64::STRBBroW:
+  case AArch64::STRBBroX:
+  case AArch64::STRHHroW:
+  case AArch64::STRHHroX:
+  case AArch64::STRWroW:
+  case AArch64::STRWroX:
+  case AArch64::STRXroW:
+  case AArch64::STRXroX:
+
+  case AArch64::STRBroW:
+  case AArch64::STRBroX:
+  case AArch64::STRDroW:
+  case AArch64::STRDroX:
+  case AArch64::STRHroW:
+  case AArch64::STRHroX:
+  case AArch64::STRSroW:
+  case AArch64::STRSroX:
+    Imm = MI.getOperand(3).getImm();
+    Ext = AArch64_AM::getMemExtendType(Imm);
+    return (Ext == AArch64_AM::SXTX || Ext == AArch64_AM::UXTX);
+  }
+}
+
 bool AArch64InstrInfo::isExynosShiftExtFast(const MachineInstr &MI) const {
   unsigned Imm, Shift;
   AArch64_AM::ShiftExtendType Ext;
@@ -895,60 +960,6 @@ bool AArch64InstrInfo::isExynosShiftExtFast(const MachineInstr &MI) const {
     Shift = AArch64_AM::getArithShiftValue(Imm);
     Ext = AArch64_AM::getArithExtendType(Imm);
     return (Shift == 0 || (Shift <= 3 && Ext == AArch64_AM::UXTX));
-
-  case AArch64::PRFMroW:
-  case AArch64::PRFMroX:
-
-  // WriteLDIdx
-  case AArch64::LDRBBroW:
-  case AArch64::LDRBBroX:
-  case AArch64::LDRHHroW:
-  case AArch64::LDRHHroX:
-  case AArch64::LDRSBWroW:
-  case AArch64::LDRSBWroX:
-  case AArch64::LDRSBXroW:
-  case AArch64::LDRSBXroX:
-  case AArch64::LDRSHWroW:
-  case AArch64::LDRSHWroX:
-  case AArch64::LDRSHXroW:
-  case AArch64::LDRSHXroX:
-  case AArch64::LDRSWroW:
-  case AArch64::LDRSWroX:
-  case AArch64::LDRWroW:
-  case AArch64::LDRWroX:
-  case AArch64::LDRXroW:
-  case AArch64::LDRXroX:
-
-  case AArch64::LDRBroW:
-  case AArch64::LDRBroX:
-  case AArch64::LDRDroW:
-  case AArch64::LDRDroX:
-  case AArch64::LDRHroW:
-  case AArch64::LDRHroX:
-  case AArch64::LDRSroW:
-  case AArch64::LDRSroX:
-
-  // WriteSTIdx
-  case AArch64::STRBBroW:
-  case AArch64::STRBBroX:
-  case AArch64::STRHHroW:
-  case AArch64::STRHHroX:
-  case AArch64::STRWroW:
-  case AArch64::STRWroX:
-  case AArch64::STRXroW:
-  case AArch64::STRXroX:
-
-  case AArch64::STRBroW:
-  case AArch64::STRBroX:
-  case AArch64::STRDroW:
-  case AArch64::STRDroX:
-  case AArch64::STRHroW:
-  case AArch64::STRHroX:
-  case AArch64::STRSroW:
-  case AArch64::STRSroX:
-    Imm = MI.getOperand(3).getImm();
-    Ext = AArch64_AM::getMemExtendType(Imm);
-    return (Ext == AArch64_AM::SXTX || Ext == AArch64_AM::UXTX);
   }
 }
 
diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h
index 49ea9ad0fda..05721336df7 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/lib/Target/AArch64/AArch64InstrInfo.h
@@ -250,11 +250,14 @@ public:
                      MachineBasicBlock::iterator &It, MachineFunction &MF,
                      const outliner::Candidate &C) const override;
   bool shouldOutlineFromFunctionByDefault(MachineFunction &MF) const override;
-  /// Returns true if the instruction sets to an immediate value that can be
+  /// Returns true if the instruction sets a constant value that can be
   /// executed more efficiently.
   bool isExynosResetFast(const MachineInstr &MI) const;
-  /// Returns true if the instruction has a shift left that can be executed
+  /// Returns true if the load or store has an extension that can be executed
   /// more efficiently.
+  bool isExynosLdStExtFast(const MachineInstr &MI) const;
+  /// Returns true if the instruction has a constant shift left or extension
+  /// that can be executed more efficiently.
   bool isExynosShiftExtFast(const MachineInstr &MI) const;
   /// Returns true if the instruction has a shift by immediate that can be
   /// executed in one cycle less.
diff --git a/lib/Target/AArch64/AArch64SchedExynosM1.td b/lib/Target/AArch64/AArch64SchedExynosM1.td
index c19ec45dab4..d566a13dc67 100644
--- a/lib/Target/AArch64/AArch64SchedExynosM1.td
+++ b/lib/Target/AArch64/AArch64SchedExynosM1.td
@@ -66,6 +66,7 @@ def M1UnitNALU : ProcResGroup<[M1UnitNAL0,
 
 def M1BranchLinkPred : SchedPredicate<[{MI->getOpcode() == AArch64::BLR &&
                                         MI->getOperand(0).getReg() != AArch64::LR}]>;
+def M1LdStExtPred    : SchedPredicate<[{TII->isExynosLdStExtFast(*MI)}]>;
 def M1ShiftExtPred   : SchedPredicate<[{TII->isExynosShiftExtFast(*MI)}]>;
 
 //===----------------------------------------------------------------------===//
@@ -110,10 +111,10 @@ def M1WriteLD : SchedWriteRes<[M1UnitL,
                                            let ResourceCycles = [2, 1]; }
 def M1WriteLH : SchedWriteRes<[]>        { let Latency = 5;
                                            let NumMicroOps = 0; }
-def M1WriteLX : SchedWriteVariant<[SchedVar<M1ShiftExtPred, [M1WriteL5]>,
-                                   SchedVar<NoSchedPred,    [M1WriteLC]>]>;
-def M1WriteLY : SchedWriteVariant<[SchedVar<M1ShiftExtPred, [M1WriteL5]>,
-                                   SchedVar<NoSchedPred,    [M1WriteLD]>]>;
+def M1WriteLX : SchedWriteVariant<[SchedVar<M1LdStExtPred, [M1WriteL5]>,
+                                   SchedVar<NoSchedPred,   [M1WriteLC]>]>;
+def M1WriteLY : SchedWriteVariant<[SchedVar<M1LdStExtPred, [M1WriteL5]>,
+                                   SchedVar<NoSchedPred,   [M1WriteLD]>]>;
 
 def M1WriteS1 : SchedWriteRes<[M1UnitS]>   { let Latency = 1; }
 def M1WriteS3 : SchedWriteRes<[M1UnitS]>   { let Latency = 3; }
@@ -140,10 +141,10 @@ def M1WriteSD : SchedWriteRes<[M1UnitS,
 def M1WriteSE : SchedWriteRes<[M1UnitS,
                                M1UnitA]>   { let Latency = 2;
                                              let NumMicroOps = 2; }
-def M1WriteSX : SchedWriteVariant<[SchedVar<M1ShiftExtPred, [M1WriteS1]>,
-                                   SchedVar<NoSchedPred,    [M1WriteSE]>]>;
-def M1WriteSY : SchedWriteVariant<[SchedVar<M1ShiftExtPred, [M1WriteS1]>,
-                                   SchedVar<NoSchedPred,    [M1WriteSB]>]>;
+def M1WriteSX : SchedWriteVariant<[SchedVar<M1LdStExtPred, [M1WriteS1]>,
+                                   SchedVar<NoSchedPred,   [M1WriteSE]>]>;
+def M1WriteSY : SchedWriteVariant<[SchedVar<M1LdStExtPred, [M1WriteS1]>,
+                                   SchedVar<NoSchedPred,   [M1WriteSB]>]>;
 
 def M1ReadAdrBase : SchedReadVariant<[SchedVar<ScaledIdxPred, [ReadDefault]>,
                                       SchedVar<NoSchedPred,   [ReadDefault]>]>;
diff --git a/lib/Target/AArch64/AArch64SchedExynosM3.td b/lib/Target/AArch64/AArch64SchedExynosM3.td
index 7b3ab72ccd0..e61fb611ab2 100644
--- a/lib/Target/AArch64/AArch64SchedExynosM3.td
+++ b/lib/Target/AArch64/AArch64SchedExynosM3.td
@@ -114,6 +114,7 @@ def M3RotatePred     : SchedPredicate<[{(MI->getOpcode() == AArch64::EXTRWrri ||
                                          MI->getOpcode() == AArch64::EXTRXrri) &&
                                         MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
                                         MI->getOperand(1).getReg() == MI->getOperand(2).getReg()}]>;
+def M3LdStExtPred    : SchedPredicate<[{TII->isExynosLdStExtFast(*MI)}]>;
 def M3ShiftExtPred   : SchedPredicate<[{TII->isExynosShiftExtFast(*MI)}]>;
 
 //===----------------------------------------------------------------------===//
@@ -165,8 +166,8 @@ def M3WriteLD : SchedWriteRes<[M3UnitA,
 def M3WriteLH : SchedWriteRes<[]>        { let Latency = 5;
                                            let NumMicroOps = 0; }
 
-def M3WriteLX : SchedWriteVariant<[SchedVar<M3ShiftExtPred, [M3WriteL5]>,
-                                   SchedVar<NoSchedPred,    [M3WriteLB]>]>;
+def M3WriteLX : SchedWriteVariant<[SchedVar<M3LdStExtPred, [M3WriteL5]>,
+                                   SchedVar<NoSchedPred,   [M3WriteLB]>]>;
 
 def M3WriteS1 : SchedWriteRes<[M3UnitS]>   { let Latency = 1; }
 def M3WriteSA : SchedWriteRes<[M3UnitA,
@@ -180,10 +181,10 @@ def M3WriteSC : SchedWriteRes<[M3UnitA,
                                M3UnitS]>   { let Latency = 2;
                                              let NumMicroOps = 2; }
 
-def M3WriteSX : SchedWriteVariant<[SchedVar<M3ShiftExtPred, [M3WriteS1]>,
-                                   SchedVar<NoSchedPred,    [M3WriteSB]>]>;
-def M3WriteSY : SchedWriteVariant<[SchedVar<M3ShiftExtPred, [M3WriteS1]>,
-                                   SchedVar<NoSchedPred,    [M3WriteSC]>]>;
+def M3WriteSX : SchedWriteVariant<[SchedVar<M3LdStExtPred, [M3WriteS1]>,
+                                   SchedVar<NoSchedPred,   [M3WriteSB]>]>;
+def M3WriteSY : SchedWriteVariant<[SchedVar<M3LdStExtPred, [M3WriteS1]>,
+                                   SchedVar<NoSchedPred,   [M3WriteSC]>]>;
 
 def M3ReadAdrBase : SchedReadVariant<[SchedVar<ScaledIdxPred, [ReadDefault]>,
                                       SchedVar<NoSchedPred,   [ReadDefault]>]>;
-- 
GitLab


From 718a779582c7913eae1f850e2ecdf75141179809 Mon Sep 17 00:00:00 2001
From: "Joel E. Denny" <jdenny.ornl@gmail.com>
Date: Wed, 24 Oct 2018 21:46:42 +0000
Subject: [PATCH 0531/1116] [SourceMgr][FileCheck] Obey -color by extending
 WithColor

(Relands r344930, reverted in r344935, and now hopefully fixed for
Windows.)

While this change specifically targets FileCheck, it affects any tool
using the same SourceMgr facilities.

Previously, -color was documented in FileCheck's -help output, but
-color had no effect.  Now, -color obeys its documentation: it forces
colors to be used in FileCheck diagnostics even when stderr is not a
terminal.

-color is especially helpful when combined with FileCheck's -v, which
can produce a long series of diagnostics that you might wish to pipe
to a pager, such as less -R.  The WithColor extensions here will also
help to clean up color usage in FileCheck's annotated dump of input,
which is proposed in D52999.

Reviewed By: JDevlieghere, zturner

Differential Revision: https://reviews.llvm.org/D53419

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345202 91177308-0d34-0410-b5e6-96231b3b80d8
---
 docs/CommandGuide/FileCheck.rst  |   4 ++
 include/llvm/Support/WithColor.h |  63 ++++++++++++++++--
 lib/Support/SourceMgr.cpp        | 108 +++++++++++++------------------
 lib/Support/WithColor.cpp        |  63 +++++++++++++-----
 test/FileCheck/opt-color.txt     |  22 +++++++
 utils/FileCheck/FileCheck.cpp    |   5 ++
 6 files changed, 179 insertions(+), 86 deletions(-)
 create mode 100644 test/FileCheck/opt-color.txt

diff --git a/docs/CommandGuide/FileCheck.rst b/docs/CommandGuide/FileCheck.rst
index 75df8a62268..830b1e00d4e 100644
--- a/docs/CommandGuide/FileCheck.rst
+++ b/docs/CommandGuide/FileCheck.rst
@@ -116,6 +116,10 @@ OPTIONS
   as old tests are migrated to the new non-overlapping ``CHECK-DAG:``
   implementation.
 
+.. option:: --color
+
+  Use colors in output (autodetected by default).
+
 EXIT STATUS
 -----------
 
diff --git a/include/llvm/Support/WithColor.h b/include/llvm/Support/WithColor.h
index 85fc5fa0cf1..76842d1c3dc 100644
--- a/include/llvm/Support/WithColor.h
+++ b/include/llvm/Support/WithColor.h
@@ -29,23 +29,49 @@ enum class HighlightColor {
   Macro,
   Error,
   Warning,
-  Note
+  Note,
+  Remark
 };
 
 /// An RAII object that temporarily switches an output stream to a specific
 /// color.
 class WithColor {
   raw_ostream &OS;
-  /// Determine whether colors should be displayed.
-  bool colorsEnabled(raw_ostream &OS);
+  bool DisableColors;
 
 public:
   /// To be used like this: WithColor(OS, HighlightColor::String) << "text";
-  WithColor(raw_ostream &OS, HighlightColor S);
+  /// @param OS The output stream
+  /// @param S Symbolic name for syntax element to color
+  /// @param DisableColors Whether to ignore color changes regardless of -color
+  /// and support in OS
+  WithColor(raw_ostream &OS, HighlightColor S, bool DisableColors = false);
+  /// To be used like this: WithColor(OS, raw_ostream::Black) << "text";
+  /// @param OS The output stream
+  /// @param Color ANSI color to use, the special SAVEDCOLOR can be used to
+  /// change only the bold attribute, and keep colors untouched
+  /// @param Bold Bold/brighter text, default false
+  /// @param BG If true, change the background, default: change foreground
+  /// @param DisableColors Whether to ignore color changes regardless of -color
+  /// and support in OS
+  WithColor(raw_ostream &OS,
+            raw_ostream::Colors Color = raw_ostream::SAVEDCOLOR,
+            bool Bold = false, bool BG = false, bool DisableColors = false)
+      : OS(OS), DisableColors(DisableColors) {
+    changeColor(Color, Bold, BG);
+  }
   ~WithColor();
 
   raw_ostream &get() { return OS; }
   operator raw_ostream &() { return OS; }
+  template <typename T> WithColor &operator<<(T &O) {
+    OS << O;
+    return *this;
+  }
+  template <typename T> WithColor &operator<<(const T &O) {
+    OS << O;
+    return *this;
+  }
 
   /// Convenience method for printing "error: " to stderr.
   static raw_ostream &error();
@@ -53,13 +79,36 @@ public:
   static raw_ostream &warning();
   /// Convenience method for printing "note: " to stderr.
   static raw_ostream &note();
+  /// Convenience method for printing "remark: " to stderr.
+  static raw_ostream &remark();
 
   /// Convenience method for printing "error: " to the given stream.
-  static raw_ostream &error(raw_ostream &OS, StringRef Prefix = "");
+  static raw_ostream &error(raw_ostream &OS, StringRef Prefix = "",
+                            bool DisableColors = false);
   /// Convenience method for printing "warning: " to the given stream.
-  static raw_ostream &warning(raw_ostream &OS, StringRef Prefix = "");
+  static raw_ostream &warning(raw_ostream &OS, StringRef Prefix = "",
+                              bool DisableColors = false);
   /// Convenience method for printing "note: " to the given stream.
-  static raw_ostream &note(raw_ostream &OS, StringRef Prefix = "");
+  static raw_ostream &note(raw_ostream &OS, StringRef Prefix = "",
+                           bool DisableColors = false);
+  /// Convenience method for printing "remark: " to the given stream.
+  static raw_ostream &remark(raw_ostream &OS, StringRef Prefix = "",
+                             bool DisableColors = false);
+
+  /// Determine whether colors are displayed.
+  bool colorsEnabled();
+
+  /// Change the color of text that will be output from this point forward.
+  /// @param Color ANSI color to use, the special SAVEDCOLOR can be used to
+  /// change only the bold attribute, and keep colors untouched
+  /// @param Bold Bold/brighter text, default false
+  /// @param BG If true, change the background, default: change foreground
+  WithColor &changeColor(raw_ostream::Colors Color, bool Bold = false,
+                         bool BG = false);
+
+  /// Reset the colors to terminal defaults. Call this when you are done
+  /// outputting colored text, or before program exit.
+  WithColor &resetColor();
 };
 
 } // end namespace llvm
diff --git a/lib/Support/SourceMgr.cpp b/lib/Support/SourceMgr.cpp
index 582e2cf6c11..a55ad881d01 100644
--- a/lib/Support/SourceMgr.cpp
+++ b/lib/Support/SourceMgr.cpp
@@ -24,6 +24,7 @@
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/SMLoc.h"
+#include "llvm/Support/WithColor.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 #include <cassert>
@@ -370,65 +371,48 @@ static bool isNonASCII(char c) {
   return c & 0x80;
 }
 
-void SMDiagnostic::print(const char *ProgName, raw_ostream &S, bool ShowColors,
-                         bool ShowKindLabel) const {
-  // Display colors only if OS supports colors.
-  ShowColors &= S.has_colors();
+void SMDiagnostic::print(const char *ProgName, raw_ostream &OS,
+                         bool ShowColors, bool ShowKindLabel) const {
+  {
+    WithColor S(OS, raw_ostream::SAVEDCOLOR, true, false, !ShowColors);
 
-  if (ShowColors)
-    S.changeColor(raw_ostream::SAVEDCOLOR, true);
+    if (ProgName && ProgName[0])
+      S << ProgName << ": ";
 
-  if (ProgName && ProgName[0])
-    S << ProgName << ": ";
+    if (!Filename.empty()) {
+      if (Filename == "-")
+        S << "<stdin>";
+      else
+        S << Filename;
 
-  if (!Filename.empty()) {
-    if (Filename == "-")
-      S << "<stdin>";
-    else
-      S << Filename;
-
-    if (LineNo != -1) {
-      S << ':' << LineNo;
-      if (ColumnNo != -1)
-        S << ':' << (ColumnNo+1);
+      if (LineNo != -1) {
+        S << ':' << LineNo;
+        if (ColumnNo != -1)
+          S << ':' << (ColumnNo + 1);
+      }
+      S << ": ";
     }
-    S << ": ";
   }
 
   if (ShowKindLabel) {
     switch (Kind) {
     case SourceMgr::DK_Error:
-      if (ShowColors)
-        S.changeColor(raw_ostream::RED, true);
-      S << "error: ";
+      WithColor::error(OS, "", !ShowColors);
       break;
     case SourceMgr::DK_Warning:
-      if (ShowColors)
-        S.changeColor(raw_ostream::MAGENTA, true);
-      S << "warning: ";
+      WithColor::warning(OS, "", !ShowColors);
       break;
     case SourceMgr::DK_Note:
-      if (ShowColors)
-        S.changeColor(raw_ostream::BLACK, true);
-      S << "note: ";
+      WithColor::note(OS, "", !ShowColors);
       break;
     case SourceMgr::DK_Remark:
-      if (ShowColors)
-        S.changeColor(raw_ostream::BLUE, true);
-      S << "remark: ";
+      WithColor::remark(OS, "", !ShowColors);
       break;
     }
-
-    if (ShowColors) {
-      S.resetColor();
-      S.changeColor(raw_ostream::SAVEDCOLOR, true);
-    }
   }
 
-  S << Message << '\n';
-
-  if (ShowColors)
-    S.resetColor();
+  WithColor(OS, raw_ostream::SAVEDCOLOR, true, false, !ShowColors)
+      << Message << '\n';
 
   if (LineNo == -1 || ColumnNo == -1)
     return;
@@ -439,7 +423,7 @@ void SMDiagnostic::print(const char *ProgName, raw_ostream &S, bool ShowColors,
   // expanding them later, and bail out rather than show incorrect ranges and
   // misaligned fixits for any other odd characters.
   if (find_if(LineContents, isNonASCII) != LineContents.end()) {
-    printSourceLine(S, LineContents);
+    printSourceLine(OS, LineContents);
     return;
   }
   size_t NumColumns = LineContents.size();
@@ -473,29 +457,27 @@ void SMDiagnostic::print(const char *ProgName, raw_ostream &S, bool ShowColors,
   // least.
   CaretLine.erase(CaretLine.find_last_not_of(' ')+1);
 
-  printSourceLine(S, LineContents);
+  printSourceLine(OS, LineContents);
 
-  if (ShowColors)
-    S.changeColor(raw_ostream::GREEN, true);
+  {
+    WithColor S(OS, raw_ostream::GREEN, true, false, !ShowColors);
 
-  // Print out the caret line, matching tabs in the source line.
-  for (unsigned i = 0, e = CaretLine.size(), OutCol = 0; i != e; ++i) {
-    if (i >= LineContents.size() || LineContents[i] != '\t') {
-      S << CaretLine[i];
-      ++OutCol;
-      continue;
-    }
+    // Print out the caret line, matching tabs in the source line.
+    for (unsigned i = 0, e = CaretLine.size(), OutCol = 0; i != e; ++i) {
+      if (i >= LineContents.size() || LineContents[i] != '\t') {
+        S << CaretLine[i];
+        ++OutCol;
+        continue;
+      }
 
-    // Okay, we have a tab.  Insert the appropriate number of characters.
-    do {
-      S << CaretLine[i];
-      ++OutCol;
-    } while ((OutCol % TabStop) != 0);
+      // Okay, we have a tab.  Insert the appropriate number of characters.
+      do {
+        S << CaretLine[i];
+        ++OutCol;
+      } while ((OutCol % TabStop) != 0);
+    }
+    S << '\n';
   }
-  S << '\n';
-
-  if (ShowColors)
-    S.resetColor();
 
   // Print out the replacement line, matching tabs in the source line.
   if (FixItInsertionLine.empty())
@@ -503,14 +485,14 @@ void SMDiagnostic::print(const char *ProgName, raw_ostream &S, bool ShowColors,
 
   for (size_t i = 0, e = FixItInsertionLine.size(), OutCol = 0; i < e; ++i) {
     if (i >= LineContents.size() || LineContents[i] != '\t') {
-      S << FixItInsertionLine[i];
+      OS << FixItInsertionLine[i];
       ++OutCol;
       continue;
     }
 
     // Okay, we have a tab.  Insert the appropriate number of characters.
     do {
-      S << FixItInsertionLine[i];
+      OS << FixItInsertionLine[i];
       // FIXME: This is trying not to break up replacements, but then to re-sync
       // with the tabs between replacements. This will fail, though, if two
       // fix-it replacements are exactly adjacent, or if a fix-it contains a
@@ -521,5 +503,5 @@ void SMDiagnostic::print(const char *ProgName, raw_ostream &S, bool ShowColors,
       ++OutCol;
     } while (((OutCol % TabStop) != 0) && i != e);
   }
-  S << '\n';
+  OS << '\n';
 }
diff --git a/lib/Support/WithColor.cpp b/lib/Support/WithColor.cpp
index d2e13f0e86d..cf4c10956f2 100644
--- a/lib/Support/WithColor.cpp
+++ b/lib/Support/WithColor.cpp
@@ -19,15 +19,10 @@ static cl::opt<cl::boolOrDefault>
              cl::desc("Use colors in output (default=autodetect)"),
              cl::init(cl::BOU_UNSET));
 
-bool WithColor::colorsEnabled(raw_ostream &OS) {
-  if (UseColor == cl::BOU_UNSET)
-    return OS.has_colors();
-  return UseColor == cl::BOU_TRUE;
-}
-
-WithColor::WithColor(raw_ostream &OS, HighlightColor Color) : OS(OS) {
+WithColor::WithColor(raw_ostream &OS, HighlightColor Color, bool DisableColors)
+    : OS(OS), DisableColors(DisableColors) {
   // Detect color from terminal type unless the user passed the --color option.
-  if (colorsEnabled(OS)) {
+  if (colorsEnabled()) {
     switch (Color) {
     case HighlightColor::Address:
       OS.changeColor(raw_ostream::YELLOW);
@@ -56,6 +51,9 @@ WithColor::WithColor(raw_ostream &OS, HighlightColor Color) : OS(OS) {
     case HighlightColor::Note:
       OS.changeColor(raw_ostream::BLACK, true);
       break;
+    case HighlightColor::Remark:
+      OS.changeColor(raw_ostream::BLUE, true);
+      break;
     }
   }
 }
@@ -66,25 +64,58 @@ raw_ostream &WithColor::warning() { return warning(errs()); }
 
 raw_ostream &WithColor::note() { return note(errs()); }
 
-raw_ostream &WithColor::error(raw_ostream &OS, StringRef Prefix) {
+raw_ostream &WithColor::remark() { return remark(errs()); }
+
+raw_ostream &WithColor::error(raw_ostream &OS, StringRef Prefix,
+                              bool DisableColors) {
   if (!Prefix.empty())
     OS << Prefix << ": ";
-  return WithColor(OS, HighlightColor::Error).get() << "error: ";
+  return WithColor(OS, HighlightColor::Error, DisableColors).get()
+         << "error: ";
 }
 
-raw_ostream &WithColor::warning(raw_ostream &OS, StringRef Prefix) {
+raw_ostream &WithColor::warning(raw_ostream &OS, StringRef Prefix,
+                                bool DisableColors) {
   if (!Prefix.empty())
     OS << Prefix << ": ";
-  return WithColor(OS, HighlightColor::Warning).get() << "warning: ";
+  return WithColor(OS, HighlightColor::Warning, DisableColors).get()
+         << "warning: ";
 }
 
-raw_ostream &WithColor::note(raw_ostream &OS, StringRef Prefix) {
+raw_ostream &WithColor::note(raw_ostream &OS, StringRef Prefix,
+                             bool DisableColors) {
   if (!Prefix.empty())
     OS << Prefix << ": ";
-  return WithColor(OS, HighlightColor::Note).get() << "note: ";
+  return WithColor(OS, HighlightColor::Note, DisableColors).get() << "note: ";
 }
 
-WithColor::~WithColor() {
-  if (colorsEnabled(OS))
+raw_ostream &WithColor::remark(raw_ostream &OS, StringRef Prefix,
+                               bool DisableColors) {
+  if (!Prefix.empty())
+    OS << Prefix << ": ";
+  return WithColor(OS, HighlightColor::Remark, DisableColors).get()
+         << "remark: ";
+}
+
+bool WithColor::colorsEnabled() {
+  if (DisableColors)
+    return false;
+  if (UseColor == cl::BOU_UNSET)
+    return OS.has_colors();
+  return UseColor == cl::BOU_TRUE;
+}
+
+WithColor &WithColor::changeColor(raw_ostream::Colors Color, bool Bold,
+                                  bool BG) {
+  if (colorsEnabled())
+    OS.changeColor(Color, Bold, BG);
+  return *this;
+}
+
+WithColor &WithColor::resetColor() {
+  if (colorsEnabled())
     OS.resetColor();
+  return *this;
 }
+
+WithColor::~WithColor() { resetColor(); }
diff --git a/test/FileCheck/opt-color.txt b/test/FileCheck/opt-color.txt
new file mode 100644
index 00000000000..9430114bf31
--- /dev/null
+++ b/test/FileCheck/opt-color.txt
@@ -0,0 +1,22 @@
+; Create a case that produces a simple diagnostic.
+; RUN: echo foo > %t.in
+; CHECK: bar
+
+; Run without and with -color.  In the former case, FileCheck should suppress
+; color in its diagnostics because stderr is a file.
+; RUN: not FileCheck %s < %t.in 2> %t.no-color
+; RUN: not FileCheck -color %s < %t.in 2> %t.color
+
+; Check whether color was produced.
+; RUN: FileCheck -check-prefix NO-COLOR %s < %t.no-color
+; RUN: FileCheck -check-prefix COLOR %s < %t.color
+
+; Make sure our NO-COLOR and COLOR patterns are sane: they don't match the
+; opposite cases.
+; RUN: not FileCheck -check-prefix COLOR %s < %t.no-color
+; RUN: not FileCheck -check-prefix NO-COLOR %s < %t.color
+
+; I don't know of a good way to check for ANSI color codes, so just make sure
+; some new characters show up where those codes should appear.
+; NO-COLOR: : error: CHECK: expected string not found in input
+; COLOR: : {{.+}}error: {{.+}}CHECK: expected string not found in input
diff --git a/utils/FileCheck/FileCheck.cpp b/utils/FileCheck/FileCheck.cpp
index 8ce1d9cbd09..bf3c3983cfa 100644
--- a/utils/FileCheck/FileCheck.cpp
+++ b/utils/FileCheck/FileCheck.cpp
@@ -18,6 +18,7 @@
 
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/Process.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/FileCheck.h"
 using namespace llvm;
@@ -108,6 +109,10 @@ static void DumpCommandLine(int argc, char **argv) {
 }
 
 int main(int argc, char **argv) {
+  // Enable use of ANSI color codes because FileCheck is using them to
+  // highlight text.
+  llvm::sys::Process::UseANSIEscapeCodes(true);
+
   InitLLVM X(argc, argv);
   cl::ParseCommandLineOptions(argc, argv);
 
-- 
GitLab


From d51aaa8a3f6aff1459625f3f20e8c6824ac785ad Mon Sep 17 00:00:00 2001
From: Paul Robinson <paul.robinson@sony.com>
Date: Wed, 24 Oct 2018 21:51:55 +0000
Subject: [PATCH 0532/1116] Make llvm-dwarfdump -name work on type units.

Differential Revision: https://reviews.llvm.org/D53672

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345203 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/tools/llvm-dwarfdump/X86/typeunit-name.s | 100 ++++++++++++++++++
 tools/llvm-dwarfdump/llvm-dwarfdump.cpp       |   4 +-
 2 files changed, 102 insertions(+), 2 deletions(-)
 create mode 100644 test/tools/llvm-dwarfdump/X86/typeunit-name.s

diff --git a/test/tools/llvm-dwarfdump/X86/typeunit-name.s b/test/tools/llvm-dwarfdump/X86/typeunit-name.s
new file mode 100644
index 00000000000..7b60ac96ffe
--- /dev/null
+++ b/test/tools/llvm-dwarfdump/X86/typeunit-name.s
@@ -0,0 +1,100 @@
+# Demonstrate that -name works with type units.
+# RUN: llvm-mc -triple x86_64-unknown-linux %s -filetype=obj -o %t.o
+# RUN: llvm-dwarfdump -name=V4_type_unit -name=V5_split_type_unit %t.o | FileCheck %s
+#
+# The names should appear twice, once for the unit and once for the type DIE,
+# because we give them the same name.
+# CHECK: V4_type_unit
+# CHECK: V4_type_unit
+# CHECK: V5_split_type_unit
+# CHECK: V5_split_type_unit
+
+        .section .debug_str,"MS",@progbits,1
+str_TU_4:
+        .asciz "V4_type_unit"
+
+        .section .debug_str.dwo,"MS",@progbits,1
+dwo_TU_5:
+        .asciz "V5_split_type_unit"
+
+# Abbrev section for the normal type unit.
+        .section .debug_abbrev,"",@progbits
+        .byte 0x01  # Abbrev code
+        .byte 0x41  # DW_TAG_type_unit
+        .byte 0x01  # DW_CHILDREN_yes
+        .byte 0x03  # DW_AT_name
+        .byte 0x0e  # DW_FORM_strp
+        .byte 0x00  # EOM(1)
+        .byte 0x00  # EOM(2)
+        .byte 0x02  # Abbrev code
+        .byte 0x13  # DW_TAG_structure_type
+        .byte 0x00  # DW_CHILDREN_no (no members)
+        .byte 0x03  # DW_AT_name
+        .byte 0x0e  # DW_FORM_strp
+        .byte 0x00  # EOM(1)
+        .byte 0x00  # EOM(2)
+        .byte 0x00  # EOM(3)
+
+# And a .dwo copy for the .dwo section.
+        .section .debug_abbrev.dwo,"",@progbits
+        .byte 0x01  # Abbrev code
+        .byte 0x41  # DW_TAG_type_unit
+        .byte 0x01  # DW_CHILDREN_yes
+        .byte 0x03  # DW_AT_name
+        .byte 0x0e  # DW_FORM_strp
+        .byte 0x00  # EOM(1)
+        .byte 0x00  # EOM(2)
+        .byte 0x02  # Abbrev code
+        .byte 0x13  # DW_TAG_structure_type
+        .byte 0x00  # DW_CHILDREN_no (no members)
+        .byte 0x03  # DW_AT_name
+        .byte 0x0e  # DW_FORM_strp
+        .byte 0x00  # EOM(1)
+        .byte 0x00  # EOM(2)
+        .byte 0x00  # EOM(3)
+
+        .section .debug_types,"",@progbits
+
+# DWARF v4 Type unit header. Normal/split are identical so we do only one.
+TU_4_start:
+        .long  TU_4_end-TU_4_version  # Length of Unit
+TU_4_version:
+        .short 4               # DWARF version number
+        .long .debug_abbrev    # Offset Into Abbrev. Section
+        .byte 8                # Address Size (in bytes)
+        .quad 0x0011223344556677 # Type Signature
+        .long TU_4_type-TU_4_start # Type offset
+# The type-unit DIE, which has a name.
+        .byte 1
+        .long str_TU_4
+# The type DIE, which has the same name.
+TU_4_type:
+        .byte 2
+        .long str_TU_4
+        .byte 0 # NULL
+        .byte 0 # NULL
+TU_4_end:
+
+        .section .debug_types.dwo,"",@progbits
+# FIXME: DWARF v5 wants type units in .debug_info[.dwo] not .debug_types[.dwo].
+
+# DWARF v5 split type unit header.
+TU_split_5_start:
+        .long  TU_split_5_end-TU_split_5_version  # Length of Unit
+TU_split_5_version:
+        .short 5               # DWARF version number
+        .byte 6                # DWARF Unit Type
+        .byte 8                # Address Size (in bytes)
+        .long .debug_abbrev.dwo    # Offset Into Abbrev. Section
+        .quad 0x8899aabbccddeeff # Type Signature
+        .long TU_split_5_type-TU_split_5_start  # Type offset
+# The type-unit DIE, which has a name.
+        .byte 1
+        .long dwo_TU_5
+# The type DIE, which has the same name.
+TU_split_5_type:
+        .byte 2
+        .long dwo_TU_5
+        .byte 0 # NULL
+        .byte 0 # NULL
+TU_split_5_end:
diff --git a/tools/llvm-dwarfdump/llvm-dwarfdump.cpp b/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
index 0ed86964089..d9e8e36efe5 100644
--- a/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
+++ b/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
@@ -422,8 +422,8 @@ static bool dumpObjectFile(ObjectFile &Obj, DWARFContext &DICtx, Twine Filename,
     for (auto name : Name)
       Names.insert((IgnoreCase && !UseRegex) ? StringRef(name).lower() : name);
 
-    filterByName(Names, DICtx.compile_units(), OS);
-    filterByName(Names, DICtx.dwo_compile_units(), OS);
+    filterByName(Names, DICtx.normal_units(), OS);
+    filterByName(Names, DICtx.dwo_units(), OS);
     return true;
   }
 
-- 
GitLab


From a5ed0ab1fe735d5bb3a789e7b4fa84b4bbca54b6 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Wed, 24 Oct 2018 22:02:05 +0000
Subject: [PATCH 0533/1116] [InstCombine] add test for fptrunc with vector with
 undef elt; NFC

This should be fixed with D53650.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345206 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Transforms/InstCombine/fpcast.ll | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/test/Transforms/InstCombine/fpcast.ll b/test/Transforms/InstCombine/fpcast.ll
index 4cada061a8f..7ba2ca04bcd 100644
--- a/test/Transforms/InstCombine/fpcast.ll
+++ b/test/Transforms/InstCombine/fpcast.ll
@@ -29,8 +29,8 @@ define half @test3(float %a) {
   ret half %c
 }
 
-define half @test4(float %a) {
-; CHECK-LABEL: @test4(
+define half @fneg_fptrunc(float %a) {
+; CHECK-LABEL: @fneg_fptrunc(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fptrunc float [[A:%.*]] to half
 ; CHECK-NEXT:    [[C:%.*]] = fsub half 0xH8000, [[TMP1]]
 ; CHECK-NEXT:    ret half [[C]]
@@ -40,6 +40,17 @@ define half @test4(float %a) {
   ret half %c
 }
 
+define <2 x half> @fneg_fptrunc_vec_undef(<2 x float> %a) {
+; CHECK-LABEL: @fneg_fptrunc_vec_undef(
+; CHECK-NEXT:    [[B:%.*]] = fsub <2 x float> <float -0.000000e+00, float undef>, [[A:%.*]]
+; CHECK-NEXT:    [[C:%.*]] = fptrunc <2 x float> [[B]] to <2 x half>
+; CHECK-NEXT:    ret <2 x half> [[C]]
+;
+  %b = fsub <2 x float> <float -0.0, float undef>, %a
+  %c = fptrunc <2 x float> %b to <2 x half>
+  ret <2 x half> %c
+}
+
 define half @test4-fast(float %a) {
 ; CHECK-LABEL: @test4-fast(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fptrunc float [[A:%.*]] to half
-- 
GitLab


From c0bb0349d79c133514ed23b50f29a9f7ce96350e Mon Sep 17 00:00:00 2001
From: Vedant Kumar <vsk@apple.com>
Date: Wed, 24 Oct 2018 22:15:41 +0000
Subject: [PATCH 0534/1116] [HotColdSplitting] Identify larger cold regions
 using domtree queries

The current splitting algorithm works in three stages:

  1) Identify cold blocks, then
  2) Use forward/backward propagation to mark hot blocks, then
  3) Grow a SESE region of blocks *outside* of the set of hot blocks and
  start outlining.

While testing this pass on Apple internal frameworks I noticed that some
kinds of control flow (e.g. loops) are never outlined, even though they
unconditionally lead to / follow cold blocks. I noticed two other issues
related to how cold regions are identified:

  - An inconsistency can arise in the internal state of the hotness
  propagation stage, as a block may end up in both the ColdBlocks set
  and the HotBlocks set. Further inconsistencies can arise as these sets
  do not match what's in ProfileSummaryInfo.

  - It isn't necessary to limit outlining to single-exit regions.

This patch teaches the splitting algorithm to identify maximal cold
regions and outline them. A maximal cold region is defined as the set of
blocks post-dominated by a cold sink block, or dominated by that sink
block. This approach can successfully outline loops in the cold path. As
a side benefit, it maintains less internal state than the current
approach.

Due to a limitation in CodeExtractor, blocks within the maximal cold
region which aren't dominated by a single entry point (a so-called "max
ancestor") are filtered out.

Results:
  - X86 (LNT + -Os + externals): 134KB of TEXT were outlined compared to
  47KB pre-patch, or a ~3x improvement. Did not see a performance impact
  across two runs.
  - AArch64 (LNT + -Os + externals + Apple-internal benchmarks): 149KB
  of TEXT were outlined. Ditto re: performance impact.
  - Outlining results improve marginally in the internal frameworks I
  tested.

Follow-ups:
  - Outline more than once per function, outline large single basic
  blocks, & try to remove unconditional branches in outlined functions.

Differential Revision: https://reviews.llvm.org/D53627

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345209 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/IPO/HotColdSplitting.cpp       | 353 +++++++++---------
 lib/Transforms/Utils/CodeExtractor.cpp        |  40 +-
 .../{split-cold-1.ll => do-not-split.ll}      |  21 +-
 .../HotColdSplit/duplicate-phi-preds-crash.ll |  54 +++
 .../Transforms/HotColdSplit/multiple-exits.ll |  73 ++++
 .../HotColdSplit/outline-if-then-else.ll      |  64 ++++
 .../HotColdSplit/outline-while-loop.ll        |  67 ++++
 .../Transforms/Utils/CodeExtractorTest.cpp    |  21 +-
 8 files changed, 487 insertions(+), 206 deletions(-)
 rename test/Transforms/HotColdSplit/{split-cold-1.ll => do-not-split.ll} (71%)
 create mode 100644 test/Transforms/HotColdSplit/duplicate-phi-preds-crash.ll
 create mode 100644 test/Transforms/HotColdSplit/multiple-exits.ll
 create mode 100644 test/Transforms/HotColdSplit/outline-if-then-else.ll
 create mode 100644 test/Transforms/HotColdSplit/outline-while-loop.ll

diff --git a/lib/Transforms/IPO/HotColdSplitting.cpp b/lib/Transforms/IPO/HotColdSplitting.cpp
index a63cd842241..4f371a494e9 100644
--- a/lib/Transforms/IPO/HotColdSplitting.cpp
+++ b/lib/Transforms/IPO/HotColdSplitting.cpp
@@ -57,10 +57,8 @@
 
 #define DEBUG_TYPE "hotcoldsplit"
 
-STATISTIC(NumColdSESEFound,
-          "Number of cold single entry single exit (SESE) regions found.");
-STATISTIC(NumColdSESEOutlined,
-          "Number of cold single entry single exit (SESE) regions outlined.");
+STATISTIC(NumColdRegionsFound, "Number of cold regions found.");
+STATISTIC(NumColdRegionsOutlined, "Number of cold regions outlined.");
 
 using namespace llvm;
 
@@ -74,32 +72,10 @@ struct PostDomTree : PostDomTreeBase<BasicBlock> {
   PostDomTree(Function &F) { recalculate(F); }
 };
 
-typedef DenseSet<const BasicBlock *> DenseSetBB;
-typedef DenseMap<const BasicBlock *, uint64_t> DenseMapBBInt;
-
-// From: https://reviews.llvm.org/D22558
-// Exit is not part of the region.
-static bool isSingleEntrySingleExit(BasicBlock *Entry, const BasicBlock *Exit,
-                                    DominatorTree *DT, PostDomTree *PDT,
-                                    SmallVectorImpl<BasicBlock *> &Region) {
-  if (!DT->dominates(Entry, Exit))
-    return false;
-
-  if (!PDT->dominates(Exit, Entry))
-    return false;
-
-  for (auto I = df_begin(Entry), E = df_end(Entry); I != E;) {
-    if (*I == Exit) {
-      I.skipChildren();
-      continue;
-    }
-    if (!DT->dominates(Entry, *I))
-      return false;
-    Region.push_back(*I);
-    ++I;
-  }
-  return true;
-}
+/// A sequence of basic blocks.
+///
+/// A 0-sized SmallVector is slightly cheaper to move than a std::vector.
+using BlockSequence = SmallVector<BasicBlock *, 0>;
 
 // Same as blockEndsInUnreachable in CodeGen/BranchFolding.cpp. Do not modify
 // this function unless you modify the MBB version as well.
@@ -149,105 +125,155 @@ static bool unlikelyExecuted(const BasicBlock &BB) {
   return false;
 }
 
-static bool returnsOrHasSideEffects(const BasicBlock &BB) {
-  const Instruction *I = BB.getTerminator();
-  if (isa<ReturnInst>(I) || isa<IndirectBrInst>(I) || isa<InvokeInst>(I))
-    return true;
+/// Check whether it's safe to outline \p BB.
+static bool mayExtractBlock(const BasicBlock &BB) {
+  return !BB.hasAddressTaken();
+}
 
-  for (const Instruction &I : BB)
-    if (const CallInst *CI = dyn_cast<CallInst>(&I)) {
-      if (CI->hasFnAttr(Attribute::NoReturn))
-        return true;
+/// Identify the maximal region of cold blocks which includes \p SinkBB.
+///
+/// Include all blocks post-dominated by \p SinkBB, \p SinkBB itself, and all
+/// blocks dominated by \p SinkBB. Exclude all other blocks, and blocks which
+/// cannot be outlined.
+///
+/// Return an empty sequence if the cold region is too small to outline, or if
+/// the cold region has no warm predecessors.
+static BlockSequence
+findMaximalColdRegion(BasicBlock &SinkBB, DominatorTree &DT, PostDomTree &PDT) {
+  // The maximal cold region.
+  BlockSequence ColdRegion = {};
+
+  // The ancestor farthest-away from SinkBB, and also post-dominated by it.
+  BasicBlock *MaxAncestor = &SinkBB;
+  unsigned MaxAncestorHeight = 0;
+
+  // Visit SinkBB's ancestors using inverse DFS.
+  auto PredIt = ++idf_begin(&SinkBB);
+  auto PredEnd = idf_end(&SinkBB);
+  while (PredIt != PredEnd) {
+    BasicBlock &PredBB = **PredIt;
+    bool SinkPostDom = PDT.dominates(&SinkBB, &PredBB);
+
+    // If SinkBB does not post-dominate a predecessor, do not mark the
+    // predecessor (or any of its predecessors) cold.
+    if (!SinkPostDom || !mayExtractBlock(PredBB)) {
+      PredIt.skipChildren();
+      continue;
+    }
 
-      if (isa<InlineAsm>(CI->getCalledValue()))
-        return true;
+    // Keep track of the post-dominated ancestor farthest away from the sink.
+    unsigned AncestorHeight = PredIt.getPathLength();
+    if (AncestorHeight > MaxAncestorHeight) {
+      MaxAncestor = &PredBB;
+      MaxAncestorHeight = AncestorHeight;
     }
 
-  return false;
-}
+    ColdRegion.push_back(&PredBB);
+    ++PredIt;
+  }
 
-static DenseSetBB getHotBlocks(Function &F) {
+  // CodeExtractor requires that all blocks to be extracted must be dominated
+  // by the first block to be extracted.
+  //
+  // To avoid spurious or repeated outlining, require that the max ancestor
+  // has a predecessor. By construction this predecessor is not in the cold
+  // region, i.e. its existence implies we don't outline the whole function.
+  //
+  // TODO: If MaxAncestor has no predecessors, we may be able to outline the
+  // second largest cold region that has a predecessor.
+  if (pred_empty(MaxAncestor) ||
+      MaxAncestor->getSinglePredecessor() == MaxAncestor)
+    return {};
+
+  // Filter out predecessors not dominated by the max ancestor.
+  //
+  // TODO: Blocks not dominated by the max ancestor could be extracted as
+  // other cold regions. Marking outlined calls as noreturn when appropriate
+  // and outlining more than once per function could achieve most of the win.
+  auto EraseIt = remove_if(ColdRegion, [&](BasicBlock *PredBB) {
+    return PredBB != MaxAncestor && !DT.dominates(MaxAncestor, PredBB);
+  });
+  ColdRegion.erase(EraseIt, ColdRegion.end());
 
-  // Mark all cold basic blocks.
-  DenseSetBB ColdBlocks;
-  for (BasicBlock &BB : F)
-    if (unlikelyExecuted(BB)) {
-      LLVM_DEBUG(llvm::dbgs() << "\nForward propagation marks cold: " << BB);
-      ColdBlocks.insert((const BasicBlock *)&BB);
-    }
+  // Add SinkBB to the cold region.
+  ColdRegion.push_back(&SinkBB);
 
-  // Forward propagation: basic blocks are hot when they are reachable from the
-  // beginning of the function through a path that does not contain cold blocks.
-  SmallVector<const BasicBlock *, 8> WL;
-  DenseSetBB HotBlocks;
-
-  const BasicBlock *It = &F.front();
-  if (!ColdBlocks.count(It)) {
-    HotBlocks.insert(It);
-    // Breadth First Search to mark edges reachable from hot.
-    WL.push_back(It);
-    while (WL.size() > 0) {
-      It = WL.pop_back_val();
-
-      for (const BasicBlock *Succ : successors(It)) {
-        // Do not visit blocks that are cold.
-        if (!ColdBlocks.count(Succ) && !HotBlocks.count(Succ)) {
-          HotBlocks.insert(Succ);
-          WL.push_back(Succ);
-        }
-      }
-    }
+  // Ensure that the first extracted block is the max ancestor.
+  if (ColdRegion[0] != MaxAncestor) {
+    auto AncestorIt = find(ColdRegion, MaxAncestor);
+    *AncestorIt = ColdRegion[0];
+    ColdRegion[0] = MaxAncestor;
   }
 
-  assert(WL.empty() && "work list should be empty");
-
-  DenseMapBBInt NumHotSuccessors;
-  // Back propagation: when all successors of a basic block are cold, the
-  // basic block is cold as well.
-  for (BasicBlock &BBRef : F) {
-    const BasicBlock *BB = &BBRef;
-    if (HotBlocks.count(BB)) {
-      // Keep a count of hot successors for every hot block.
-      NumHotSuccessors[BB] = 0;
-      for (const BasicBlock *Succ : successors(BB))
-        if (!ColdBlocks.count(Succ))
-          NumHotSuccessors[BB] += 1;
-
-      // Add to work list the blocks with all successors cold. Those are the
-      // root nodes in the next loop, where we will move those blocks from
-      // HotBlocks to ColdBlocks and iterate over their predecessors.
-      if (NumHotSuccessors[BB] == 0)
-        WL.push_back(BB);
+  // Find all successors of SinkBB dominated by SinkBB using DFS.
+  auto SuccIt = ++df_begin(&SinkBB);
+  auto SuccEnd = df_end(&SinkBB);
+  while (SuccIt != SuccEnd) {
+    BasicBlock &SuccBB = **SuccIt;
+    bool SinkDom = DT.dominates(&SinkBB, &SuccBB);
+
+    // If SinkBB does not dominate a successor, do not mark the successor (or
+    // any of its successors) cold.
+    if (!SinkDom || !mayExtractBlock(SuccBB)) {
+      SuccIt.skipChildren();
+      continue;
     }
+
+    ColdRegion.push_back(&SuccBB);
+    ++SuccIt;
   }
 
-  while (WL.size() > 0) {
-    It = WL.pop_back_val();
-    if (ColdBlocks.count(It))
+  // TODO: Consider outlining regions with just 1 block, but more than some
+  // threshold of instructions.
+  if (ColdRegion.size() == 1)
+    return {};
+
+  return ColdRegion;
+}
+
+/// Get the largest cold region in \p F.
+static BlockSequence getLargestColdRegion(Function &F, ProfileSummaryInfo &PSI,
+                                          BlockFrequencyInfo *BFI,
+                                          DominatorTree &DT, PostDomTree &PDT) {
+  // Keep track of the largest cold region.
+  BlockSequence LargestColdRegion = {};
+
+  for (BasicBlock &BB : F) {
+    // Identify cold blocks.
+    if (!mayExtractBlock(BB))
+      continue;
+    bool Cold =
+        PSI.isColdBB(&BB, BFI) || (EnableStaticAnalyis && unlikelyExecuted(BB));
+    if (!Cold)
       continue;
 
-    // Do not back-propagate to blocks that return or have side effects.
-    if (returnsOrHasSideEffects(*It))
+    LLVM_DEBUG({
+      dbgs() << "Found cold block:\n";
+      BB.dump();
+    });
+
+    // Find a maximal cold region we can outline.
+    BlockSequence ColdRegion = findMaximalColdRegion(BB, DT, PDT);
+    if (ColdRegion.empty()) {
+      LLVM_DEBUG(dbgs() << "  Skipping (block not profitable to extract)\n");
       continue;
+    }
 
-    // Move the block from HotBlocks to ColdBlocks.
-    LLVM_DEBUG(llvm::dbgs() << "\nBack propagation marks cold: " << *It);
-    HotBlocks.erase(It);
-    ColdBlocks.insert(It);
+    ++NumColdRegionsFound;
 
-    // Iterate over the predecessors.
-    for (const BasicBlock *Pred : predecessors(It)) {
-      if (HotBlocks.count(Pred)) {
-        NumHotSuccessors[Pred] -= 1;
+    LLVM_DEBUG({
+      llvm::dbgs() << "Identified cold region with " << ColdRegion.size()
+                   << " blocks:\n";
+      for (BasicBlock *BB : ColdRegion)
+        BB->dump();
+    });
 
-        // If Pred has no more hot successors, add it to the work list.
-        if (NumHotSuccessors[Pred] == 0)
-          WL.push_back(Pred);
-      }
-    }
+    // TODO: Outline more than one region.
+    if (ColdRegion.size() > LargestColdRegion.size())
+      LargestColdRegion = std::move(ColdRegion);
   }
 
-  return HotBlocks;
+  return LargestColdRegion;
 }
 
 class HotColdSplitting {
@@ -261,23 +287,9 @@ public:
 
 private:
   bool shouldOutlineFrom(const Function &F) const;
-  const Function *outlineColdBlocks(Function &F, const DenseSetBB &ColdBlock,
-                                    DominatorTree *DT, PostDomTree *PDT);
-  Function *extractColdRegion(const SmallVectorImpl<BasicBlock *> &Region,
-                              DominatorTree *DT, BlockFrequencyInfo *BFI,
+  Function *extractColdRegion(const BlockSequence &Region, DominatorTree &DT,
+                              BlockFrequencyInfo *BFI,
                               OptimizationRemarkEmitter &ORE, unsigned Count);
-  bool isOutlineCandidate(const SmallVectorImpl<BasicBlock *> &Region,
-                          const BasicBlock *Exit) const {
-    if (!Exit)
-      return false;
-
-    // Regions with landing pads etc.
-    for (const BasicBlock *BB : Region) {
-      if (BB->isEHPad() || BB->hasAddressTaken())
-        return false;
-    }
-    return true;
-  }
   SmallPtrSet<const Function *, 2> OutlinedFunctions;
   ProfileSummaryInfo *PSI;
   function_ref<BlockFrequencyInfo *(Function &)> GetBFI;
@@ -314,6 +326,8 @@ bool HotColdSplitting::shouldOutlineFrom(const Function &F) const {
   if (F.size() <= 2)
     return false;
 
+  // TODO: Consider only skipping functions marked `optnone` or `cold`.
+
   if (F.hasAddressTaken())
     return false;
 
@@ -331,15 +345,17 @@ bool HotColdSplitting::shouldOutlineFrom(const Function &F) const {
   return true;
 }
 
-Function *HotColdSplitting::extractColdRegion(
-    const SmallVectorImpl<BasicBlock *> &Region, DominatorTree *DT,
-    BlockFrequencyInfo *BFI, OptimizationRemarkEmitter &ORE, unsigned Count) {
+Function *HotColdSplitting::extractColdRegion(const BlockSequence &Region,
+                                              DominatorTree &DT,
+                                              BlockFrequencyInfo *BFI,
+                                              OptimizationRemarkEmitter &ORE,
+                                              unsigned Count) {
   assert(!Region.empty());
   LLVM_DEBUG(for (auto *BB : Region)
           llvm::dbgs() << "\nExtracting: " << *BB;);
 
   // TODO: Pass BFI and BPI to update profile information.
-  CodeExtractor CE(Region, DT, /* AggregateArgs */ false, /* BFI */ nullptr,
+  CodeExtractor CE(Region, &DT, /* AggregateArgs */ false, /* BFI */ nullptr,
                    /* BPI */ nullptr, /* AllowVarArgs */ false,
                    /* AllowAlloca */ false,
                    /* Suffix */ "cold." + std::to_string(Count));
@@ -348,15 +364,18 @@ Function *HotColdSplitting::extractColdRegion(
   CE.findInputsOutputs(Inputs, Outputs, Sinks);
 
   // Do not extract regions that have live exit variables.
-  if (Outputs.size() > 0)
+  if (Outputs.size() > 0) {
+    LLVM_DEBUG(llvm::dbgs() << "Not outlining; live outputs\n");
     return nullptr;
+  }
 
+  // TODO: Run MergeBasicBlockIntoOnlyPred on the outlined function.
   Function *OrigF = Region[0]->getParent();
   if (Function *OutF = CE.extractCodeRegion()) {
     User *U = *OutF->user_begin();
     CallInst *CI = cast<CallInst>(U);
     CallSite CS(CI);
-    NumColdSESEOutlined++;
+    NumColdRegionsOutlined++;
     if (GetTTI(*OutF).useColdCCForColdCall(*OutF)) {
       OutF->setCallingConv(CallingConv::Cold);
       CS.setCallingConv(CallingConv::Cold);
@@ -388,69 +407,33 @@ Function *HotColdSplitting::extractColdRegion(
   return nullptr;
 }
 
-// Return the function created after outlining, nullptr otherwise.
-const Function *HotColdSplitting::outlineColdBlocks(Function &F,
-                                                    const DenseSetBB &HotBlocks,
-                                                    DominatorTree *DT,
-                                                    PostDomTree *PDT) {
-  auto BFI = GetBFI(F);
-  auto &ORE = (*GetORE)(F);
-  // Walking the dominator tree allows us to find the largest
-  // cold region.
-  BasicBlock *Begin = DT->getRootNode()->getBlock();
-
-  // Early return if the beginning of the function has been marked cold,
-  // otherwise all the function gets outlined.
-  if (PSI->isColdBB(Begin, BFI) || !HotBlocks.count(Begin))
-    return nullptr;
-
-  for (auto I = df_begin(Begin), E = df_end(Begin); I != E; ++I) {
-    BasicBlock *BB = *I;
-    if (PSI->isColdBB(BB, BFI) || !HotBlocks.count(BB)) {
-      SmallVector<BasicBlock *, 4> ValidColdRegion, Region;
-      BasicBlock *Exit = (*PDT)[BB]->getIDom()->getBlock();
-      BasicBlock *ExitColdRegion = nullptr;
-
-      // Estimated cold region between a BB and its dom-frontier.
-      while (Exit && isSingleEntrySingleExit(BB, Exit, DT, PDT, Region) &&
-             isOutlineCandidate(Region, Exit)) {
-        ExitColdRegion = Exit;
-        ValidColdRegion = Region;
-        Region.clear();
-        // Update Exit recursively to its dom-frontier.
-        Exit = (*PDT)[Exit]->getIDom()->getBlock();
-      }
-      if (ExitColdRegion) {
-        // Do not outline a region with only one block.
-        if (ValidColdRegion.size() == 1)
-          continue;
-
-        ++NumColdSESEFound;
-        ValidColdRegion.push_back(ExitColdRegion);
-        // Candidate for outlining. FIXME: Continue outlining.
-        return extractColdRegion(ValidColdRegion, DT, BFI, ORE, /* Count */ 1);
-      }
-    }
-  }
-  return nullptr;
-}
-
 bool HotColdSplitting::run(Module &M) {
+  bool Changed = false;
   for (auto &F : M) {
-    if (!shouldOutlineFrom(F))
+    if (!shouldOutlineFrom(F)) {
+      LLVM_DEBUG(llvm::dbgs() << "Not outlining in " << F.getName() << "\n");
       continue;
+    }
+
+    LLVM_DEBUG(llvm::dbgs() << "Outlining in " << F.getName() << "\n");
     DominatorTree DT(F);
     PostDomTree PDT(F);
     PDT.recalculate(F);
-    DenseSetBB HotBlocks;
-    if (EnableStaticAnalyis) // Static analysis of cold blocks.
-      HotBlocks = getHotBlocks(F);
+    BlockFrequencyInfo *BFI = GetBFI(F);
 
-    const Function *Outlined = outlineColdBlocks(F, HotBlocks, &DT, &PDT);
-    if (Outlined)
+    BlockSequence ColdRegion = getLargestColdRegion(F, *PSI, BFI, DT, PDT);
+    if (ColdRegion.empty())
+      continue;
+
+    OptimizationRemarkEmitter &ORE = (*GetORE)(F);
+    Function *Outlined =
+        extractColdRegion(ColdRegion, DT, BFI, ORE, /*Count=*/1);
+    if (Outlined) {
       OutlinedFunctions.insert(Outlined);
+      Changed = true;
+    }
   }
-  return true;
+  return Changed;
 }
 
 bool HotColdSplittingLegacyPass::runOnModule(Module &M) {
diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp
index 328fe1fac65..462dc588cd5 100644
--- a/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/lib/Transforms/Utils/CodeExtractor.cpp
@@ -1273,24 +1273,32 @@ Function *CodeExtractor::extractCodeRegion() {
   // Look at all successors of the codeReplacer block.  If any of these blocks
   // had PHI nodes in them, we need to update the "from" block to be the code
   // replacer, not the original block in the extracted region.
-  std::vector<BasicBlock *> Succs(succ_begin(codeReplacer),
-                                  succ_end(codeReplacer));
-  for (unsigned i = 0, e = Succs.size(); i != e; ++i)
-    for (BasicBlock::iterator I = Succs[i]->begin(); isa<PHINode>(I); ++I) {
-      PHINode *PN = cast<PHINode>(I);
-      std::set<BasicBlock*> ProcessedPreds;
-      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
-        if (Blocks.count(PN->getIncomingBlock(i))) {
-          if (ProcessedPreds.insert(PN->getIncomingBlock(i)).second)
-            PN->setIncomingBlock(i, codeReplacer);
-          else {
-            // There were multiple entries in the PHI for this block, now there
-            // is only one, so remove the duplicated entries.
-            PN->removeIncomingValue(i, false);
-            --i; --e;
-          }
+  for (BasicBlock *SuccBB : successors(codeReplacer)) {
+    for (PHINode &PN : SuccBB->phis()) {
+      Value *IncomingCodeReplacerVal = nullptr;
+      SmallVector<unsigned, 2> IncomingValsToRemove;
+      for (unsigned I = 0, E = PN.getNumIncomingValues(); I != E; ++I) {
+        BasicBlock *IncomingBB = PN.getIncomingBlock(I);
+
+        // Ignore incoming values from outside of the extracted region.
+        if (!Blocks.count(IncomingBB))
+          continue;
+
+        // Ensure that there is only one incoming value from codeReplacer.
+        if (!IncomingCodeReplacerVal) {
+          PN.setIncomingBlock(I, codeReplacer);
+          IncomingCodeReplacerVal = PN.getIncomingValue(I);
+        } else {
+          assert(IncomingCodeReplacerVal == PN.getIncomingValue(I) &&
+                 "PHI has two incompatbile incoming values from codeRepl");
+          IncomingValsToRemove.push_back(I);
         }
+      }
+
+      for (unsigned I : reverse(IncomingValsToRemove))
+        PN.removeIncomingValue(I, /*DeletePHIIfEmpty=*/false);
     }
+  }
 
   // Erase debug info intrinsics. Variable updates within the new function are
   // invisible to debuggers. This could be improved by defining a DISubprogram
diff --git a/test/Transforms/HotColdSplit/split-cold-1.ll b/test/Transforms/HotColdSplit/do-not-split.ll
similarity index 71%
rename from test/Transforms/HotColdSplit/split-cold-1.ll
rename to test/Transforms/HotColdSplit/do-not-split.ll
index 1a8138fe0d3..1f626581919 100644
--- a/test/Transforms/HotColdSplit/split-cold-1.ll
+++ b/test/Transforms/HotColdSplit/do-not-split.ll
@@ -1,9 +1,10 @@
 ; RUN: opt -hotcoldsplit -S < %s | FileCheck %s
 ; RUN: opt -passes=hotcoldsplit -S < %s | FileCheck %s
 
-; Check that the function is not split. Outlined function is called from a
+; Check that these functions are not split. Outlined functions are called from a
 ; basic block named codeRepl.
 
+; The cold region is too small to split.
 ; CHECK-LABEL: @foo
 ; CHECK-NOT: codeRepl
 define void @foo() {
@@ -26,11 +27,9 @@ return:                                           ; preds = %cleanup40
   ret void
 }
 
-; Check that the function is not split. We used to outline the full function.
-
+; Make sure we don't try to outline the entire function.
 ; CHECK-LABEL: @fun
 ; CHECK-NOT: codeRepl
-
 define void @fun() {
 entry:
   br i1 undef, label %if.then, label %if.end
@@ -41,3 +40,17 @@ if.then:                                          ; preds = %entry
 if.end:                                           ; preds = %entry
   ret void
 }
+
+; Don't outline infinite loops.
+; CHECK-LABEL: @infinite_loop
+; CHECK-NOT: codeRepl
+define void @infinite_loop() {
+entry:
+  br label %loop
+
+loop:
+  call void @sink()
+  br label %loop
+}
+
+declare void @sink() cold
diff --git a/test/Transforms/HotColdSplit/duplicate-phi-preds-crash.ll b/test/Transforms/HotColdSplit/duplicate-phi-preds-crash.ll
new file mode 100644
index 00000000000..17001f95468
--- /dev/null
+++ b/test/Transforms/HotColdSplit/duplicate-phi-preds-crash.ll
@@ -0,0 +1,54 @@
+; RUN: opt -S -hotcoldsplit < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+declare void @sideeffect(i64)
+
+declare i8* @realloc(i8* %ptr, i64 %size)
+
+declare void @free(i8* %ptr)
+
+declare void @sink() cold
+
+; CHECK-LABEL: define {{.*}}@realloc2(
+; CHECK: call {{.*}}@sideeffect(
+; CHECK: call {{.*}}@realloc(
+; CHECK-LABEL: codeRepl:
+; CHECK-NEXT: call {{.*}}@realloc2.cold.1(i64 %size, i8* %ptr)
+; CHECK-LABEL: cleanup:
+; CHECK-NEXT: phi i8* [ null, %if.then ], [ null, %codeRepl ], [ %call, %if.end ]
+define i8* @realloc2(i8* %ptr, i64 %size) {
+entry:
+  %0 = add i64 %size, -1
+  %1 = icmp ugt i64 %0, 184549375
+  br i1 %1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  call void @sideeffect(i64 %size)
+  br label %cleanup
+
+if.end:                                           ; preds = %entry
+  %call = call i8* @realloc(i8* %ptr, i64 %size)
+  %tobool1 = icmp eq i8* %call, null
+  br i1 %tobool1, label %if.then2, label %cleanup
+
+if.then2:                                         ; preds = %if.end
+  call void @sideeffect(i64 %size)
+  call void @sink()
+  %tobool3 = icmp eq i8* %ptr, null
+  br i1 %tobool3, label %cleanup, label %if.then4
+
+if.then4:                                         ; preds = %if.then2
+  call void @free(i8* %ptr)
+  br label %cleanup
+
+cleanup:                                          ; preds = %if.end, %if.then4, %if.then2, %if.then
+  %retval.0 = phi i8* [ null, %if.then ], [ null, %if.then2 ], [ null, %if.then4 ], [ %call, %if.end ]
+  ret i8* %retval.0
+}
+
+; CHECK-LABEL: define {{.*}}@realloc2.cold.1(
+; CHECK: call {{.*}}@sideeffect
+; CHECK: call {{.*}}@sink
+; CHECK: call {{.*}}@free
diff --git a/test/Transforms/HotColdSplit/multiple-exits.ll b/test/Transforms/HotColdSplit/multiple-exits.ll
new file mode 100644
index 00000000000..2e7cf84f72e
--- /dev/null
+++ b/test/Transforms/HotColdSplit/multiple-exits.ll
@@ -0,0 +1,73 @@
+; RUN: opt -S -hotcoldsplit < %s | FileCheck %s
+
+; Source:
+;
+; extern void sideeffect(int);
+; extern void __attribute__((cold)) sink();
+; void foo(int cond) {
+;   if (cond) { //< Start outlining here.
+;     sink();
+;     if (cond > 10)
+;       goto exit1;
+;     else
+;       goto exit2;
+;   }
+; exit1:
+;   sideeffect(1);
+;   return;
+; exit2:
+;   sideeffect(2);
+;   return;
+; }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+; CHECK-LABEL: define {{.*}}@foo(
+; CHECK: br i1 {{.*}}, label %exit1, label %codeRepl
+; CHECK-LABEL: codeRepl:
+; CHECK: [[targetBlock:%.*]] = call i1 @foo.cold.1(
+; CHECK-NEXT: br i1 [[targetBlock]], label %exit1, label %[[return:.*]]
+; CHECK-LABEL: exit1:
+; CHECK: call {{.*}}@sideeffect(i32 1)
+; CHECK: [[return]]:
+; CHECK-NEXT: ret void
+define void @foo(i32 %cond) {
+entry:
+  %tobool = icmp eq i32 %cond, 0
+  br i1 %tobool, label %exit1, label %if.then
+
+if.then:                                          ; preds = %entry
+  tail call void (...) @sink()
+  %cmp = icmp sgt i32 %cond, 10
+  br i1 %cmp, label %exit1, label %exit2
+
+exit1:                                            ; preds = %entry, %if.then
+  call void @sideeffect(i32 1)
+  br label %return
+
+exit2:                                            ; preds = %if.then
+  call void @sideeffect(i32 2)
+  br label %return
+
+return:                                           ; preds = %exit2, %exit1
+  ret void
+}
+
+; CHECK-LABEL: define {{.*}}@foo.cold.1(
+; TODO: Eliminate this unnecessary unconditional branch.
+; CHECK: br
+; CHECK: [[exit1Stub:.*]]:
+; CHECK-NEXT: ret i1 true
+; CHECK: [[returnStub:.*]]:
+; CHECK-NEXT: ret i1 false
+; CHECK: call {{.*}}@sink
+; CHECK-NEXT: [[cmp:%.*]] = icmp
+; CHECK-NEXT: br i1 [[cmp]], label %[[exit1Stub]], label %exit2
+; CHECK-LABEL: exit2:
+; CHECK-NEXT: call {{.*}}@sideeffect(i32 2)
+; CHECK-NEXT: br label %[[returnStub]]
+
+declare void @sink(...) cold
+
+declare void @sideeffect(i32)
diff --git a/test/Transforms/HotColdSplit/outline-if-then-else.ll b/test/Transforms/HotColdSplit/outline-if-then-else.ll
new file mode 100644
index 00000000000..bbde7651e28
--- /dev/null
+++ b/test/Transforms/HotColdSplit/outline-if-then-else.ll
@@ -0,0 +1,64 @@
+; RUN: opt -S -hotcoldsplit < %s | FileCheck %s
+
+; Source:
+;
+; extern void sideeffect(int);
+; extern void __attribute__((cold)) sink();
+; void foo(int cond) {
+;   if (cond) { //< Start outlining here.
+;     if (cond > 10)
+;       sideeffect(0);
+;     else
+;       sideeffect(1);
+;     sink();
+;   }
+;   sideeffect(2);
+; }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+; CHECK-LABEL: define {{.*}}@foo(
+; CHECK: br i1 {{.*}}, label %codeRepl, label %if.end2
+; CHECK-LABEL: codeRepl:
+; CHECK-NEXT: call void @foo.cold.1
+; CHECK-LABEL: if.end2:
+; CHECK: call void @sideeffect(i32 2)
+define void @foo(i32 %cond) {
+entry:
+  %cond.addr = alloca i32
+  store i32 %cond, i32* %cond.addr
+  %0 = load i32, i32* %cond.addr
+  %tobool = icmp ne i32 %0, 0
+  br i1 %tobool, label %if.then, label %if.end2
+
+if.then:                                          ; preds = %entry
+  %1 = load i32, i32* %cond.addr
+  %cmp = icmp sgt i32 %1, 10
+  br i1 %cmp, label %if.then1, label %if.else
+
+if.then1:                                         ; preds = %if.then
+  call void @sideeffect(i32 0)
+  br label %if.end
+
+if.else:                                          ; preds = %if.then
+  call void @sideeffect(i32 1)
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then1
+  call void (...) @sink()
+  ret void
+
+if.end2:                                          ; preds = %entry
+  call void @sideeffect(i32 2)
+  ret void
+}
+
+; CHECK-LABEL: define {{.*}}@foo.cold.1
+; CHECK: call {{.*}}@sideeffect
+; CHECK: call {{.*}}@sideeffect
+; CHECK: call {{.*}}@sink
+
+declare void @sideeffect(i32)
+
+declare void @sink(...) cold
diff --git a/test/Transforms/HotColdSplit/outline-while-loop.ll b/test/Transforms/HotColdSplit/outline-while-loop.ll
new file mode 100644
index 00000000000..2a132bda7f0
--- /dev/null
+++ b/test/Transforms/HotColdSplit/outline-while-loop.ll
@@ -0,0 +1,67 @@
+; RUN: opt -S -hotcoldsplit < %s | FileCheck %s
+
+; Source:
+;
+; extern void sideeffect(int);
+; extern void __attribute__((cold)) sink();
+; void foo(int cond) {
+;   if (cond) { //< Start outlining here.
+;     while (cond > 10) {
+;       --cond;
+;       sideeffect(0);
+;     }
+;     sink();
+;   }
+;   sideeffect(1);
+; }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+; CHECK-LABEL: define {{.*}}@foo(
+; CHECK: br i1 {{.*}}, label %if.end, label %codeRepl
+; CHECK-LABEL: codeRepl:
+; CHECK-NEXT: call void @foo.cold.1
+; CHECK-LABEL: if.end:
+; CHECK: call void @sideeffect(i32 1)
+define void @foo(i32 %cond) {
+entry:
+  %tobool = icmp eq i32 %cond, 0
+  br i1 %tobool, label %if.end, label %while.cond.preheader
+
+while.cond.preheader:                             ; preds = %entry
+  %cmp3 = icmp sgt i32 %cond, 10
+  br i1 %cmp3, label %while.body.preheader, label %while.end
+
+while.body.preheader:                             ; preds = %while.cond.preheader
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.preheader, %while.body
+  %cond.addr.04 = phi i32 [ %dec, %while.body ], [ %cond, %while.body.preheader ]
+  %dec = add nsw i32 %cond.addr.04, -1
+  tail call void @sideeffect(i32 0) #3
+  %cmp = icmp sgt i32 %dec, 10
+  br i1 %cmp, label %while.body, label %while.end.loopexit
+
+while.end.loopexit:                               ; preds = %while.body
+  br label %while.end
+
+while.end:                                        ; preds = %while.end.loopexit, %while.cond.preheader
+  tail call void (...) @sink()
+  ret void
+
+if.end:                                           ; preds = %entry
+  tail call void @sideeffect(i32 1)
+  ret void
+}
+
+; CHECK-LABEL: define {{.*}}@foo.cold.1
+; CHECK: phi i32
+; CHECK-NEXT: add nsw i32
+; CHECK-NEXT: call {{.*}}@sideeffect
+; CHECK-NEXT: icmp
+; CHECK-NEXT: br
+
+declare void @sideeffect(i32)
+
+declare void @sink(...) cold
diff --git a/unittests/Transforms/Utils/CodeExtractorTest.cpp b/unittests/Transforms/Utils/CodeExtractorTest.cpp
index c229be6d695..c53b3152a7d 100644
--- a/unittests/Transforms/Utils/CodeExtractorTest.cpp
+++ b/unittests/Transforms/Utils/CodeExtractorTest.cpp
@@ -21,7 +21,7 @@
 using namespace llvm;
 
 namespace {
-TEST(CodeExtractor, ExitStub) {
+TEST(CodeExtractor, DISABLED_ExitStub) {
   LLVMContext Ctx;
   SMDiagnostic Err;
   std::unique_ptr<Module> M(parseAssemblyString(R"invalid(
@@ -46,6 +46,25 @@ TEST(CodeExtractor, ExitStub) {
   )invalid",
                                                 Err, Ctx));
 
+  // CodeExtractor miscompiles this function. There appear to be some issues
+  // with the handling of outlined regions with live output values.
+  //
+  // In the original function, CE adds two reloads in the codeReplacer block:
+  //
+  //   codeRepl:                                         ; preds = %header
+  //     call void @foo_header.split(i32 %z, i32 %x, i32 %y, i32* %.loc, i32* %.loc1)
+  //     %.reload = load i32, i32* %.loc
+  //     %.reload2 = load i32, i32* %.loc1
+  //     br label %notExtracted
+  //
+  // These reloads must flow into the notExtracted block:
+  //
+  //   notExtracted:                                     ; preds = %codeRepl
+  //     %0 = phi i32 [ %.reload, %codeRepl ], [ %.reload2, %body2 ]
+  //
+  // The problem is that the PHI node in notExtracted now has an incoming
+  // value from a BasicBlock that's in a different function.
+
   Function *Func = M->getFunction("foo");
   SmallVector<BasicBlock *, 3> Candidates;
   for (auto &BB : *Func) {
-- 
GitLab


From a740192ca60d8b2bd8768839cbcc92adf4d0fe74 Mon Sep 17 00:00:00 2001
From: Thomas Lively <tlively@google.com>
Date: Wed, 24 Oct 2018 22:18:54 +0000
Subject: [PATCH 0535/1116] [SelectionDAG] DAG combiner for fminnan and fmaxnan

Summary: Depends on D52765.

Reviewers: aheejin, dschuff

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D52768

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345210 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 40 +++++++--------
 test/CodeGen/WebAssembly/simd-arith.ll   | 64 ++++++++++++++++++++++--
 2 files changed, 80 insertions(+), 24 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index ef0afc71ab4..4f5e96a8257 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -371,6 +371,8 @@ namespace {
     SDValue visitFFLOOR(SDNode *N);
     SDValue visitFMINNUM(SDNode *N);
     SDValue visitFMAXNUM(SDNode *N);
+    SDValue visitFMINNAN(SDNode *N);
+    SDValue visitFMAXNAN(SDNode *N);
     SDValue visitBRCOND(SDNode *N);
     SDValue visitBR_CC(SDNode *N);
     SDValue visitLOAD(SDNode *N);
@@ -1582,6 +1584,8 @@ SDValue DAGCombiner::visit(SDNode *N) {
   case ISD::FFLOOR:             return visitFFLOOR(N);
   case ISD::FMINNUM:            return visitFMINNUM(N);
   case ISD::FMAXNUM:            return visitFMAXNUM(N);
+  case ISD::FMINNAN:            return visitFMINNAN(N);
+  case ISD::FMAXNAN:            return visitFMAXNAN(N);
   case ISD::FCEIL:              return visitFCEIL(N);
   case ISD::FTRUNC:             return visitFTRUNC(N);
   case ISD::BRCOND:             return visitBRCOND(N);
@@ -12124,7 +12128,8 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) {
   return SDValue();
 }
 
-SDValue DAGCombiner::visitFMINNUM(SDNode *N) {
+static SDValue visitFMinMax(SelectionDAG &DAG, SDNode *N,
+                            APFloat (*Op)(const APFloat &, const APFloat &)) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   EVT VT = N->getValueType(0);
@@ -12134,36 +12139,31 @@ SDValue DAGCombiner::visitFMINNUM(SDNode *N) {
   if (N0CFP && N1CFP) {
     const APFloat &C0 = N0CFP->getValueAPF();
     const APFloat &C1 = N1CFP->getValueAPF();
-    return DAG.getConstantFP(minnum(C0, C1), SDLoc(N), VT);
+    return DAG.getConstantFP(Op(C0, C1), SDLoc(N), VT);
   }
 
   // Canonicalize to constant on RHS.
   if (isConstantFPBuildVectorOrConstantFP(N0) &&
-     !isConstantFPBuildVectorOrConstantFP(N1))
-    return DAG.getNode(ISD::FMINNUM, SDLoc(N), VT, N1, N0);
+      !isConstantFPBuildVectorOrConstantFP(N1))
+    return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0);
 
   return SDValue();
 }
 
-SDValue DAGCombiner::visitFMAXNUM(SDNode *N) {
-  SDValue N0 = N->getOperand(0);
-  SDValue N1 = N->getOperand(1);
-  EVT VT = N->getValueType(0);
-  const ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0);
-  const ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1);
+SDValue DAGCombiner::visitFMINNUM(SDNode *N) {
+  return visitFMinMax(DAG, N, minnum);
+}
 
-  if (N0CFP && N1CFP) {
-    const APFloat &C0 = N0CFP->getValueAPF();
-    const APFloat &C1 = N1CFP->getValueAPF();
-    return DAG.getConstantFP(maxnum(C0, C1), SDLoc(N), VT);
-  }
+SDValue DAGCombiner::visitFMAXNUM(SDNode *N) {
+  return visitFMinMax(DAG, N, maxnum);
+}
 
-  // Canonicalize to constant on RHS.
-  if (isConstantFPBuildVectorOrConstantFP(N0) &&
-     !isConstantFPBuildVectorOrConstantFP(N1))
-    return DAG.getNode(ISD::FMAXNUM, SDLoc(N), VT, N1, N0);
+SDValue DAGCombiner::visitFMINNAN(SDNode *N) {
+  return visitFMinMax(DAG, N, minimum);
+}
 
-  return SDValue();
+SDValue DAGCombiner::visitFMAXNAN(SDNode *N) {
+  return visitFMinMax(DAG, N, maximum);
 }
 
 SDValue DAGCombiner::visitFABS(SDNode *N) {
diff --git a/test/CodeGen/WebAssembly/simd-arith.ll b/test/CodeGen/WebAssembly/simd-arith.ll
index 689853fa7bb..573f4fff5ad 100644
--- a/test/CodeGen/WebAssembly/simd-arith.ll
+++ b/test/CodeGen/WebAssembly/simd-arith.ll
@@ -824,7 +824,7 @@ define <4 x float> @max_unordered_v4f32(<4 x float> %x) {
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: f32.const $push[[L0:[0-9]+]]=, 0x1.4p2
 ; SIMD128-NEXT: f32x4.splat $push[[L1:[0-9]+]]=, $pop[[L0]]
-; SIMD128-NEXT: f32x4.min $push[[R:[0-9]+]]=, $pop[[L1]], $0{{$}}
+; SIMD128-NEXT: f32x4.min $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @min_ordered_v4f32(<4 x float> %x) {
   %cmps = fcmp ole <4 x float> <float 5., float 5., float 5., float 5.>, %x
@@ -839,7 +839,7 @@ define <4 x float> @min_ordered_v4f32(<4 x float> %x) {
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: f32.const $push[[L0:[0-9]+]]=, 0x1.4p2
 ; SIMD128-NEXT: f32x4.splat $push[[L1:[0-9]+]]=, $pop[[L0]]
-; SIMD128-NEXT: f32x4.max $push[[R:[0-9]+]]=, $pop[[L1]], $0{{$}}
+; SIMD128-NEXT: f32x4.max $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @max_ordered_v4f32(<4 x float> %x) {
   %cmps = fcmp oge <4 x float> <float 5., float 5., float 5., float 5.>, %x
@@ -872,6 +872,34 @@ define <4 x float> @max_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
   ret <4 x float> %a
 }
 
+; CHECK-LABEL: min_const_intrinsic_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32.const $push[[L:[0-9]+]]=, 0x1.4p2{{$}}
+; SIMD128-NEXT: f32x4.splat $push[[R:[0-9]+]]=, $pop[[L]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x float> @min_const_intrinsic_v4f32() {
+  %a = call <4 x float> @llvm.minimum.v4f32(
+    <4 x float> <float 42., float 42., float 42., float 42.>,
+    <4 x float> <float 5., float 5., float 5., float 5.>
+  )
+  ret <4 x float> %a
+}
+
+; CHECK-LABEL: max_const_intrinsic_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32.const $push[[L:[0-9]+]]=, 0x1.5p5{{$}}
+; SIMD128-NEXT: f32x4.splat $push[[R:[0-9]+]]=, $pop[[L]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x float> @max_const_intrinsic_v4f32() {
+  %a = call <4 x float> @llvm.maximum.v4f32(
+    <4 x float> <float 42., float 42., float 42., float 42.>,
+    <4 x float> <float 5., float 5., float 5., float 5.>
+  )
+  ret <4 x float> %a
+}
+
 ; CHECK-LABEL: add_v4f32:
 ; NO-SIMD128-NOT: f32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -991,7 +1019,7 @@ define <2 x double> @max_unordered_v2f64(<2 x double> %x) {
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: f64.const $push[[L0:[0-9]+]]=, 0x1.4p2
 ; SIMD128-NEXT: f64x2.splat $push[[L1:[0-9]+]]=, $pop[[L0]]
-; SIMD128-NEXT: f64x2.min $push[[R:[0-9]+]]=, $pop[[L1]], $0{{$}}
+; SIMD128-NEXT: f64x2.min $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @min_ordered_v2f64(<2 x double> %x) {
   %cmps = fcmp ole <2 x double> <double 5., double 5.>, %x
@@ -1006,7 +1034,7 @@ define <2 x double> @min_ordered_v2f64(<2 x double> %x) {
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: f64.const $push[[L0:[0-9]+]]=, 0x1.4p2
 ; SIMD128-NEXT: f64x2.splat $push[[L1:[0-9]+]]=, $pop[[L0]]
-; SIMD128-NEXT: f64x2.max $push[[R:[0-9]+]]=, $pop[[L1]], $0{{$}}
+; SIMD128-NEXT: f64x2.max $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @max_ordered_v2f64(<2 x double> %x) {
   %cmps = fcmp oge <2 x double> <double 5., double 5.>, %x
@@ -1039,6 +1067,34 @@ define <2 x double> @max_intrinsic_v2f64(<2 x double> %x, <2 x double> %y) {
   ret <2 x double> %a
 }
 
+; CHECK-LABEL: min_const_intrinsic_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64.const $push[[L:[0-9]+]]=, 0x1.4p2{{$}}
+; SIMD128-NEXT: f64x2.splat $push[[R:[0-9]+]]=, $pop[[L]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x double> @min_const_intrinsic_v2f64() {
+  %a = call <2 x double> @llvm.minimum.v2f64(
+    <2 x double> <double 42., double 42.>,
+    <2 x double> <double 5., double 5.>
+  )
+  ret <2 x double> %a
+}
+
+; CHECK-LABEL: max_const_intrinsic_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64.const $push[[L:[0-9]+]]=, 0x1.5p5{{$}}
+; SIMD128-NEXT: f64x2.splat $push[[R:[0-9]+]]=, $pop[[L]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x double> @max_const_intrinsic_v2f64() {
+  %a = call <2 x double> @llvm.maximum.v2f64(
+    <2 x double> <double 42., double 42.>,
+    <2 x double> <double 5., double 5.>
+  )
+  ret <2 x double> %a
+}
+
 ; CHECK-LABEL: add_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f62x2
-- 
GitLab


From 69b255770f9de2056095394e7ff441e263578bf0 Mon Sep 17 00:00:00 2001
From: Volodymyr Sapsai <vsapsai@apple.com>
Date: Wed, 24 Oct 2018 22:40:54 +0000
Subject: [PATCH 0536/1116] [VFS] Remove 'ignore-non-existent-contents'
 attribute for YAML-based VFS.

'ignore-non-existent-contents' stopped working after r342232 in a way
that the actual attribute value isn't used and it works as if it is
always `true`.

Common use case for VFS iteration is iterating through files in umbrella
directories for modules. Ability to detect if some VFS entries point to
non-existing files is nice but non-critical. Instead of adding back
support for `'ignore-non-existent-contents': false` I am removing the
attribute, because such scenario isn't used widely enough and stricter
checks don't provide enough value to justify the maintenance.

Change is done both in LLVM and Clang, corresponding Clang commit is r345212.

rdar://problem/45176119

Reviewers: bruno

Reviewed By: bruno

Subscribers: hiraditya, dexonsmith, sammccall, cfe-commits

Differential Revision: https://reviews.llvm.org/D53228


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345213 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Support/VirtualFileSystem.h |  5 -----
 lib/Support/VirtualFileSystem.cpp        | 24 ++----------------------
 2 files changed, 2 insertions(+), 27 deletions(-)

diff --git a/include/llvm/Support/VirtualFileSystem.h b/include/llvm/Support/VirtualFileSystem.h
index f2f8ffafc50..7e1828062b3 100644
--- a/include/llvm/Support/VirtualFileSystem.h
+++ b/include/llvm/Support/VirtualFileSystem.h
@@ -490,7 +490,6 @@ class YAMLVFSWriter {
   Optional<bool> IsCaseSensitive;
   Optional<bool> IsOverlayRelative;
   Optional<bool> UseExternalNames;
-  Optional<bool> IgnoreNonExistentContents;
   std::string OverlayDir;
 
 public:
@@ -504,10 +503,6 @@ public:
 
   void setUseExternalNames(bool UseExtNames) { UseExternalNames = UseExtNames; }
 
-  void setIgnoreNonExistentContents(bool IgnoreContents) {
-    IgnoreNonExistentContents = IgnoreContents;
-  }
-
   void setOverlayDir(StringRef OverlayDirectory) {
     IsOverlayRelative = true;
     OverlayDir.assign(OverlayDirectory.str());
diff --git a/lib/Support/VirtualFileSystem.cpp b/lib/Support/VirtualFileSystem.cpp
index 23b5fbceb20..cf7fe967f01 100644
--- a/lib/Support/VirtualFileSystem.cpp
+++ b/lib/Support/VirtualFileSystem.cpp
@@ -1028,7 +1028,6 @@ public:
 ///   'case-sensitive': <boolean, default=true>
 ///   'use-external-names': <boolean, default=true>
 ///   'overlay-relative': <boolean, default=false>
-///   'ignore-non-existent-contents': <boolean, default=true>
 ///
 /// Virtual directories are represented as
 /// \verbatim
@@ -1092,14 +1091,6 @@ class RedirectingFileSystem : public vfs::FileSystem {
   /// Whether to use to use the value of 'external-contents' for the
   /// names of files.  This global value is overridable on a per-file basis.
   bool UseExternalNames = true;
-
-  /// Whether an invalid path obtained via 'external-contents' should
-  /// cause iteration on the VFS to stop. If 'true', the VFS should ignore
-  /// the entry and continue with the next. Allows YAML files to be shared
-  /// across multiple compiler invocations regardless of prior existent
-  /// paths in 'external-contents'. This global value is overridable on a
-  /// per-file basis.
-  bool IgnoreNonExistentContents = true;
   /// @}
 
   /// Virtual file paths and external files could be canonicalized without "..",
@@ -1176,8 +1167,6 @@ public:
     return ExternalContentsPrefixDir;
   }
 
-  bool ignoreNonExistentContents() const { return IgnoreNonExistentContents; }
-
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   LLVM_DUMP_METHOD void dump() const {
     for (const auto &Root : Roots)
@@ -1549,7 +1538,6 @@ public:
         KeyStatusPair("case-sensitive", false),
         KeyStatusPair("use-external-names", false),
         KeyStatusPair("overlay-relative", false),
-        KeyStatusPair("ignore-non-existent-contents", false),
         KeyStatusPair("roots", true),
     };
 
@@ -1607,9 +1595,6 @@ public:
       } else if (Key == "use-external-names") {
         if (!parseScalarBool(I.getValue(), FS->UseExternalNames))
           return false;
-      } else if (Key == "ignore-non-existent-contents") {
-        if (!parseScalarBool(I.getValue(), FS->IgnoreNonExistentContents))
-          return false;
       } else {
         llvm_unreachable("key missing from Keys");
       }
@@ -1915,7 +1900,7 @@ public:
 
   void write(ArrayRef<YAMLVFSEntry> Entries, Optional<bool> UseExternalNames,
              Optional<bool> IsCaseSensitive, Optional<bool> IsOverlayRelative,
-             Optional<bool> IgnoreNonExistentContents, StringRef OverlayDir);
+             StringRef OverlayDir);
 };
 
 } // namespace
@@ -1973,7 +1958,6 @@ void JSONWriter::write(ArrayRef<YAMLVFSEntry> Entries,
                        Optional<bool> UseExternalNames,
                        Optional<bool> IsCaseSensitive,
                        Optional<bool> IsOverlayRelative,
-                       Optional<bool> IgnoreNonExistentContents,
                        StringRef OverlayDir) {
   using namespace llvm::sys;
 
@@ -1991,9 +1975,6 @@ void JSONWriter::write(ArrayRef<YAMLVFSEntry> Entries,
     OS << "  'overlay-relative': '" << (UseOverlayRelative ? "true" : "false")
        << "',\n";
   }
-  if (IgnoreNonExistentContents.hasValue())
-    OS << "  'ignore-non-existent-contents': '"
-       << (IgnoreNonExistentContents.getValue() ? "true" : "false") << "',\n";
   OS << "  'roots': [\n";
 
   if (!Entries.empty()) {
@@ -2049,8 +2030,7 @@ void YAMLVFSWriter::write(llvm::raw_ostream &OS) {
   });
 
   JSONWriter(OS).write(Mappings, UseExternalNames, IsCaseSensitive,
-                       IsOverlayRelative, IgnoreNonExistentContents,
-                       OverlayDir);
+                       IsOverlayRelative, OverlayDir);
 }
 
 VFSFromYamlDirIterImpl::VFSFromYamlDirIterImpl(
-- 
GitLab


From b8fd9d55db1c1c23aebcc0e0819f07c0fff5a88f Mon Sep 17 00:00:00 2001
From: David Blaikie <dblaikie@gmail.com>
Date: Wed, 24 Oct 2018 22:44:54 +0000
Subject: [PATCH 0537/1116] llvm-dwarfdump: Account for skeleton addr_base when
 dumping addresses in split unit in the same file

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345215 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/DebugInfo/DWARF/DWARFUnit.cpp           | 10 ++++++++++
 test/DebugInfo/X86/split-dwarf-v5-ranges.ll |  4 ++--
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/lib/DebugInfo/DWARF/DWARFUnit.cpp b/lib/DebugInfo/DWARF/DWARFUnit.cpp
index 4cf1f938c3e..9d75dc94604 100644
--- a/lib/DebugInfo/DWARF/DWARFUnit.cpp
+++ b/lib/DebugInfo/DWARF/DWARFUnit.cpp
@@ -197,6 +197,16 @@ DWARFDataExtractor DWARFUnit::getDebugInfoExtractor() const {
 
 Optional<SectionedAddress>
 DWARFUnit::getAddrOffsetSectionItem(uint32_t Index) const {
+  if (isDWO) {
+    auto R = Context.info_section_units();
+    auto I = R.begin();
+    // Surprising if a DWO file has more than one skeleton unit in it - this
+    // probably shouldn't be valid, but if a use case is found, here's where to
+    // support it (probably have to linearly search for the matching skeleton CU
+    // here)
+    if (I != R.end() && std::next(I) == R.end())
+      return (*I)->getAddrOffsetSectionItem(Index);
+  }
   uint32_t Offset = AddrOffsetSectionBase + Index * getAddressByteSize();
   if (AddrOffsetSection->Data.size() < Offset + getAddressByteSize())
     return None;
diff --git a/test/DebugInfo/X86/split-dwarf-v5-ranges.ll b/test/DebugInfo/X86/split-dwarf-v5-ranges.ll
index 4404d5c3639..295bbc41ad3 100644
--- a/test/DebugInfo/X86/split-dwarf-v5-ranges.ll
+++ b/test/DebugInfo/X86/split-dwarf-v5-ranges.ll
@@ -4,8 +4,8 @@
 ; CHECK: .debug_info contents:
 ; CHECK: .debug_info.dwo contents:
 ; CHECK: DW_AT_ranges [DW_FORM_rnglistx] (indexed (0x0) rangelist = 0x00000004
-; CHECK:          [0x0000000000000000, 0x000000000000000b) "x"
-; CHECK:          [0x000000000000000d, 0x0000000000000012) "x")
+; CHECK:          [0x0000000000000001, 0x000000000000000c) ".text"
+; CHECK:          [0x000000000000000e, 0x0000000000000013) ".text")
 
 ; CHECK: .debug_rnglists.dwo contents:
 ; CHECK: 0x00000000: range list header: length = 0x00000015, version = 0x0005, addr_size = 0x08, seg_size = 0x00, offset_entry_count = 0x00000001
-- 
GitLab


From 86e199d8d614913930aa2eabca693a88fb522ac9 Mon Sep 17 00:00:00 2001
From: Alina Sbirlea <asbirlea@google.com>
Date: Wed, 24 Oct 2018 22:46:45 +0000
Subject: [PATCH 0538/1116] Update MemorySSA in LoopRotate.

Summary:
Teach LoopRotate to preserve MemorySSA.
Enable tests for correctness, dependency disabled by default.

Subscribers: sanjoy, jlebar, Prazek, george.burgess.iv, llvm-commits

Differential Revision: https://reviews.llvm.org/D51718

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345216 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../llvm/Transforms/Utils/LoopRotationUtils.h |   5 +-
 lib/Transforms/Scalar/LoopRotation.cpp        |  28 ++++-
 lib/Transforms/Utils/LoopRotationUtils.cpp    |  60 ++++++++--
 .../LoopRotate/2009-01-25-SingleEntryPhi.ll   |   1 +
 test/Transforms/LoopRotate/PhiRename-1.ll     |   1 +
 .../LoopRotate/PhiSelfReference-1.ll          |   1 +
 test/Transforms/LoopRotate/alloca.ll          |   1 +
 test/Transforms/LoopRotate/basic.ll           |   2 +
 test/Transforms/LoopRotate/catchret.ll        |   1 +
 test/Transforms/LoopRotate/convergent.ll      |   1 +
 test/Transforms/LoopRotate/crash.ll           |   1 +
 .../LoopRotate/dbg-value-duplicates.ll        |   1 +
 test/Transforms/LoopRotate/dbgvalue.ll        |   1 +
 test/Transforms/LoopRotate/indirectbr.ll      |   1 +
 .../Transforms/LoopRotate/loopexitinglatch.ll |   1 +
 test/Transforms/LoopRotate/multiple-exits.ll  |   1 +
 test/Transforms/LoopRotate/phi-dbgvalue.ll    |   1 +
 test/Transforms/LoopRotate/phi-duplicate.ll   |   1 +
 test/Transforms/LoopRotate/pr22337.ll         |   1 +
 test/Transforms/LoopRotate/pr33701.ll         |   1 +
 test/Transforms/LoopRotate/pr35210.ll         |  51 ++++++++
 test/Transforms/LoopRotate/pr37205.ll         |   1 +
 test/Transforms/LoopRotate/preserve-mssa.ll   | 109 ++++++++++++++++++
 test/Transforms/LoopRotate/preserve-scev.ll   |  37 +++++-
 .../LoopRotate/vect.omp.persistence.ll        |   1 +
 25 files changed, 293 insertions(+), 17 deletions(-)
 create mode 100644 test/Transforms/LoopRotate/preserve-mssa.ll

diff --git a/include/llvm/Transforms/Utils/LoopRotationUtils.h b/include/llvm/Transforms/Utils/LoopRotationUtils.h
index 231e5bbb6de..cd5bc430101 100644
--- a/include/llvm/Transforms/Utils/LoopRotationUtils.h
+++ b/include/llvm/Transforms/Utils/LoopRotationUtils.h
@@ -20,6 +20,7 @@ class AssumptionCache;
 class DominatorTree;
 class Loop;
 class LoopInfo;
+class MemorySSAUpdater;
 class ScalarEvolution;
 struct SimplifyQuery;
 class TargetTransformInfo;
@@ -32,8 +33,8 @@ class TargetTransformInfo;
 /// LoopRotation. If it is true, the profitability heuristic will be ignored.
 bool LoopRotation(Loop *L, LoopInfo *LI, const TargetTransformInfo *TTI,
                   AssumptionCache *AC, DominatorTree *DT, ScalarEvolution *SE,
-                  const SimplifyQuery &SQ, bool RotationOnly,
-                  unsigned Threshold, bool IsUtilMode);
+                  MemorySSAUpdater *MSSAU, const SimplifyQuery &SQ,
+                  bool RotationOnly, unsigned Threshold, bool IsUtilMode);
 
 } // namespace llvm
 
diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp
index eeaad39dc1d..fd22128f7fe 100644
--- a/lib/Transforms/Scalar/LoopRotation.cpp
+++ b/lib/Transforms/Scalar/LoopRotation.cpp
@@ -15,6 +15,8 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Support/Debug.h"
@@ -40,12 +42,19 @@ PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM,
   const DataLayout &DL = L.getHeader()->getModule()->getDataLayout();
   const SimplifyQuery SQ = getBestSimplifyQuery(AR, DL);
 
-  bool Changed = LoopRotation(&L, &AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE, SQ,
-                              false, Threshold, false);
+  Optional<MemorySSAUpdater> MSSAU;
+  if (AR.MSSA)
+    MSSAU = MemorySSAUpdater(AR.MSSA);
+  bool Changed = LoopRotation(&L, &AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE,
+                              MSSAU.hasValue() ? MSSAU.getPointer() : nullptr,
+                              SQ, false, Threshold, false);
 
   if (!Changed)
     return PreservedAnalyses::all();
 
+  if (AR.MSSA && VerifyMemorySSA)
+    AR.MSSA->verifyMemorySSA();
+
   return getLoopPassPreservedAnalyses();
 }
 
@@ -68,6 +77,10 @@ public:
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<AssumptionCacheTracker>();
     AU.addRequired<TargetTransformInfoWrapperPass>();
+    if (EnableMSSALoopDependency) {
+      AU.addRequired<MemorySSAWrapperPass>();
+      AU.addPreserved<MemorySSAWrapperPass>();
+    }
     getLoopAnalysisUsage(AU);
   }
 
@@ -84,8 +97,14 @@ public:
     auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
     auto *SE = SEWP ? &SEWP->getSE() : nullptr;
     const SimplifyQuery SQ = getBestSimplifyQuery(*this, F);
-    return LoopRotation(L, LI, TTI, AC, DT, SE, SQ, false, MaxHeaderSize,
-                        false);
+    Optional<MemorySSAUpdater> MSSAU;
+    if (EnableMSSALoopDependency) {
+      MemorySSA *MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
+      MSSAU = MemorySSAUpdater(MSSA);
+    }
+    return LoopRotation(L, LI, TTI, AC, DT, SE,
+                        MSSAU.hasValue() ? MSSAU.getPointer() : nullptr, SQ,
+                        false, MaxHeaderSize, false);
   }
 };
 }
@@ -96,6 +115,7 @@ INITIALIZE_PASS_BEGIN(LoopRotateLegacyPass, "loop-rotate", "Rotate Loops",
 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(LoopPass)
 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
 INITIALIZE_PASS_END(LoopRotateLegacyPass, "loop-rotate", "Rotate Loops", false,
                     false)
 
diff --git a/lib/Transforms/Utils/LoopRotationUtils.cpp b/lib/Transforms/Utils/LoopRotationUtils.cpp
index 73f67f3219d..41f14a83461 100644
--- a/lib/Transforms/Utils/LoopRotationUtils.cpp
+++ b/lib/Transforms/Utils/LoopRotationUtils.cpp
@@ -20,6 +20,8 @@
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
@@ -54,6 +56,7 @@ class LoopRotate {
   AssumptionCache *AC;
   DominatorTree *DT;
   ScalarEvolution *SE;
+  MemorySSAUpdater *MSSAU;
   const SimplifyQuery &SQ;
   bool RotationOnly;
   bool IsUtilMode;
@@ -61,10 +64,11 @@ class LoopRotate {
 public:
   LoopRotate(unsigned MaxHeaderSize, LoopInfo *LI,
              const TargetTransformInfo *TTI, AssumptionCache *AC,
-             DominatorTree *DT, ScalarEvolution *SE, const SimplifyQuery &SQ,
-             bool RotationOnly, bool IsUtilMode)
+             DominatorTree *DT, ScalarEvolution *SE, MemorySSAUpdater *MSSAU,
+             const SimplifyQuery &SQ, bool RotationOnly, bool IsUtilMode)
       : MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE),
-        SQ(SQ), RotationOnly(RotationOnly), IsUtilMode(IsUtilMode) {}
+        MSSAU(MSSAU), SQ(SQ), RotationOnly(RotationOnly),
+        IsUtilMode(IsUtilMode) {}
   bool processLoop(Loop *L);
 
 private:
@@ -269,6 +273,8 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
     SE->forgetTopmostLoop(L);
 
   LLVM_DEBUG(dbgs() << "LoopRotation: rotating "; L->dump());
+  if (MSSAU && VerifyMemorySSA)
+    MSSAU->getMemorySSA()->verifyMemorySSA();
 
   // Find new Loop header. NewHeader is a Header's one and only successor
   // that is inside loop.  Header's other successor is outside the
@@ -385,6 +391,12 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
   // remove the corresponding incoming values from the PHI nodes in OrigHeader.
   LoopEntryBranch->eraseFromParent();
 
+  // Update MemorySSA before the rewrite call below changes the 1:1
+  // instruction:cloned_instruction_or_value mapping in ValueMap.
+  if (MSSAU) {
+    ValueMap[OrigHeader] = OrigPreheader;
+    MSSAU->updateForClonedBlockIntoPred(OrigHeader, OrigPreheader, ValueMap);
+  }
 
   SmallVector<PHINode*, 2> InsertedPHIs;
   // If there were any uses of instructions in the duplicated block outside the
@@ -411,6 +423,12 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
     Updates.push_back({DominatorTree::Insert, OrigPreheader, NewHeader});
     Updates.push_back({DominatorTree::Delete, OrigPreheader, OrigHeader});
     DT->applyUpdates(Updates);
+
+    if (MSSAU) {
+      MSSAU->applyUpdates(Updates, *DT);
+      if (VerifyMemorySSA)
+        MSSAU->getMemorySSA()->verifyMemorySSA();
+    }
   }
 
   // At this point, we've finished our major CFG changes.  As part of cloning
@@ -433,7 +451,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
     // Split the edge to form a real preheader.
     BasicBlock *NewPH = SplitCriticalEdge(
         OrigPreheader, NewHeader,
-        CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA());
+        CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA());
     NewPH->setName(NewHeader->getName() + ".lr.ph");
 
     // Preserve canonical loop form, which means that 'Exit' should have only
@@ -452,7 +470,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
       SplitLatchEdge |= L->getLoopLatch() == ExitPred;
       BasicBlock *ExitSplit = SplitCriticalEdge(
           ExitPred, Exit,
-          CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA());
+          CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA());
       ExitSplit->moveBefore(Exit);
     }
     assert(SplitLatchEdge &&
@@ -467,17 +485,27 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
 
     // With our CFG finalized, update DomTree if it is available.
     if (DT) DT->deleteEdge(OrigPreheader, Exit);
+
+    // Update MSSA too, if available.
+    if (MSSAU)
+      MSSAU->removeEdge(OrigPreheader, Exit);
   }
 
   assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation");
   assert(L->getLoopLatch() && "Invalid loop latch after loop rotation");
 
+  if (MSSAU && VerifyMemorySSA)
+    MSSAU->getMemorySSA()->verifyMemorySSA();
+
   // Now that the CFG and DomTree are in a consistent state again, try to merge
   // the OrigHeader block into OrigLatch.  This will succeed if they are
   // connected by an unconditional branch.  This is just a cleanup so the
   // emitted code isn't too gross in this common case.
   DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
-  MergeBlockIntoPredecessor(OrigHeader, &DTU, LI);
+  MergeBlockIntoPredecessor(OrigHeader, &DTU, LI, MSSAU);
+
+  if (MSSAU && VerifyMemorySSA)
+    MSSAU->getMemorySSA()->verifyMemorySSA();
 
   LLVM_DEBUG(dbgs() << "LoopRotation: into "; L->dump());
 
@@ -586,9 +614,14 @@ bool LoopRotate::simplifyLoopLatch(Loop *L) {
                     << LastExit->getName() << "\n");
 
   // Hoist the instructions from Latch into LastExit.
+  Instruction *FirstLatchInst = &*(Latch->begin());
   LastExit->getInstList().splice(BI->getIterator(), Latch->getInstList(),
                                  Latch->begin(), Jmp->getIterator());
 
+  // Update MemorySSA
+  if (MSSAU)
+    MSSAU->moveAllAfterMergeBlocks(Latch, LastExit, FirstLatchInst);
+
   unsigned FallThruPath = BI->getSuccessor(0) == Latch ? 0 : 1;
   BasicBlock *Header = Jmp->getSuccessor(0);
   assert(Header == L->getHeader() && "expected a backward branch");
@@ -604,6 +637,10 @@ bool LoopRotate::simplifyLoopLatch(Loop *L) {
   if (DT)
     DT->eraseNode(Latch);
   Latch->eraseFromParent();
+
+  if (MSSAU && VerifyMemorySSA)
+    MSSAU->getMemorySSA()->verifyMemorySSA();
+
   return true;
 }
 
@@ -636,11 +673,16 @@ bool LoopRotate::processLoop(Loop *L) {
 /// The utility to convert a loop into a loop with bottom test.
 bool llvm::LoopRotation(Loop *L, LoopInfo *LI, const TargetTransformInfo *TTI,
                         AssumptionCache *AC, DominatorTree *DT,
-                        ScalarEvolution *SE, const SimplifyQuery &SQ,
-                        bool RotationOnly = true,
+                        ScalarEvolution *SE, MemorySSAUpdater *MSSAU,
+                        const SimplifyQuery &SQ, bool RotationOnly = true,
                         unsigned Threshold = unsigned(-1),
                         bool IsUtilMode = true) {
-  LoopRotate LR(Threshold, LI, TTI, AC, DT, SE, SQ, RotationOnly, IsUtilMode);
+  if (MSSAU && VerifyMemorySSA)
+    MSSAU->getMemorySSA()->verifyMemorySSA();
+  LoopRotate LR(Threshold, LI, TTI, AC, DT, SE, MSSAU, SQ, RotationOnly,
+                IsUtilMode);
+  if (MSSAU && VerifyMemorySSA)
+    MSSAU->getMemorySSA()->verifyMemorySSA();
 
   return LR.processLoop(L);
 }
diff --git a/test/Transforms/LoopRotate/2009-01-25-SingleEntryPhi.ll b/test/Transforms/LoopRotate/2009-01-25-SingleEntryPhi.ll
index 7036d2d9c3a..a09a2290e0a 100644
--- a/test/Transforms/LoopRotate/2009-01-25-SingleEntryPhi.ll
+++ b/test/Transforms/LoopRotate/2009-01-25-SingleEntryPhi.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -loop-rotate -verify-dom-info -verify-loop-info -disable-output
+; RUN: opt < %s -loop-rotate -verify-dom-info -verify-loop-info -enable-mssa-loop-dependency=true -verify-memoryssa -disable-output
 ; PR3408
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/Transforms/LoopRotate/PhiRename-1.ll b/test/Transforms/LoopRotate/PhiRename-1.ll
index 6d75888d70d..8bece445cf4 100644
--- a/test/Transforms/LoopRotate/PhiRename-1.ll
+++ b/test/Transforms/LoopRotate/PhiRename-1.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -loop-rotate -verify-dom-info -verify-loop-info -S | FileCheck %s
+; RUN: opt < %s -loop-rotate -verify-dom-info -verify-loop-info -enable-mssa-loop-dependency=true -verify-memoryssa -S | FileCheck %s
 ; CHECK-NOT: [ {{.}}tmp224
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
diff --git a/test/Transforms/LoopRotate/PhiSelfReference-1.ll b/test/Transforms/LoopRotate/PhiSelfReference-1.ll
index ed494483391..7726c53e55e 100644
--- a/test/Transforms/LoopRotate/PhiSelfReference-1.ll
+++ b/test/Transforms/LoopRotate/PhiSelfReference-1.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -loop-rotate -verify-dom-info -verify-loop-info -disable-output
+; RUN: opt < %s -loop-rotate -verify-dom-info -verify-loop-info -enable-mssa-loop-dependency=true -verify-memoryssa -disable-output
 ; ModuleID = 'PhiSelfReference-1.bc'
 
 define void @snrm2(i32 %incx) {
diff --git a/test/Transforms/LoopRotate/alloca.ll b/test/Transforms/LoopRotate/alloca.ll
index bbcfb39c372..59da33f8802 100644
--- a/test/Transforms/LoopRotate/alloca.ll
+++ b/test/Transforms/LoopRotate/alloca.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -loop-rotate -S | FileCheck %s
+; RUN: opt < %s -loop-rotate -enable-mssa-loop-dependency=true -verify-memoryssa -S | FileCheck %s
 
 ; Test alloca in -loop-rotate.
 
diff --git a/test/Transforms/LoopRotate/basic.ll b/test/Transforms/LoopRotate/basic.ll
index 299c18c871e..d01d19f7f12 100644
--- a/test/Transforms/LoopRotate/basic.ll
+++ b/test/Transforms/LoopRotate/basic.ll
@@ -1,5 +1,7 @@
 ; RUN: opt -S -loop-rotate < %s | FileCheck %s
+; RUN: opt -S -loop-rotate -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s
 ; RUN: opt -S -passes='require<targetir>,require<assumptions>,loop(rotate)' < %s | FileCheck %s
+; RUN: opt -S -passes='require<targetir>,require<assumptions>,loop(rotate)' -enable-mssa-loop-dependency=true -verify-memoryssa  < %s | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-darwin10.0.0"
diff --git a/test/Transforms/LoopRotate/catchret.ll b/test/Transforms/LoopRotate/catchret.ll
index c035e49d79c..f28af8aed60 100755
--- a/test/Transforms/LoopRotate/catchret.ll
+++ b/test/Transforms/LoopRotate/catchret.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -loop-rotate -S | FileCheck %s
+; RUN: opt < %s -loop-rotate -enable-mssa-loop-dependency=true -verify-memoryssa -S | FileCheck %s
 
 target triple = "x86_64-pc-windows-msvc"
 
diff --git a/test/Transforms/LoopRotate/convergent.ll b/test/Transforms/LoopRotate/convergent.ll
index c8b34fd75f0..37671562142 100644
--- a/test/Transforms/LoopRotate/convergent.ll
+++ b/test/Transforms/LoopRotate/convergent.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -S -loop-rotate < %s | FileCheck %s
+; RUN: opt -S -loop-rotate -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s
 
 @e = global i32 10
 
diff --git a/test/Transforms/LoopRotate/crash.ll b/test/Transforms/LoopRotate/crash.ll
index 5e2b66d6803..2a45e370e18 100644
--- a/test/Transforms/LoopRotate/crash.ll
+++ b/test/Transforms/LoopRotate/crash.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -loop-rotate -disable-output -verify-dom-info -verify-loop-info < %s
+; RUN: opt -loop-rotate -disable-output -verify-dom-info -verify-loop-info -enable-mssa-loop-dependency=true -verify-memoryssa < %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-darwin10.0.0"
diff --git a/test/Transforms/LoopRotate/dbg-value-duplicates.ll b/test/Transforms/LoopRotate/dbg-value-duplicates.ll
index 2fea06b5afe..ce7157c571f 100644
--- a/test/Transforms/LoopRotate/dbg-value-duplicates.ll
+++ b/test/Transforms/LoopRotate/dbg-value-duplicates.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -S -loop-rotate < %s | FileCheck %s
+; RUN: opt -S -loop-rotate -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s
 source_filename = "/tmp/loop.c"
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.13.0"
diff --git a/test/Transforms/LoopRotate/dbgvalue.ll b/test/Transforms/LoopRotate/dbgvalue.ll
index bc0b20d0fea..93e3c4c252c 100644
--- a/test/Transforms/LoopRotate/dbgvalue.ll
+++ b/test/Transforms/LoopRotate/dbgvalue.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -S -loop-rotate < %s | FileCheck %s
+; RUN: opt -S -loop-rotate -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s
 
 declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 declare void @llvm.dbg.value(metadata, metadata, metadata) nounwind readnone
diff --git a/test/Transforms/LoopRotate/indirectbr.ll b/test/Transforms/LoopRotate/indirectbr.ll
index 8f059d50505..a26ec375953 100644
--- a/test/Transforms/LoopRotate/indirectbr.ll
+++ b/test/Transforms/LoopRotate/indirectbr.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -S -loop-rotate -o - -verify-loop-info -verify-dom-info | FileCheck %s
+; RUN: opt < %s -S -loop-rotate -o - -verify-loop-info -verify-dom-info -enable-mssa-loop-dependency=true -verify-memoryssa | FileCheck %s
 
 ; PR5502
 define void @z80_do_opcodes() nounwind {
diff --git a/test/Transforms/LoopRotate/loopexitinglatch.ll b/test/Transforms/LoopRotate/loopexitinglatch.ll
index c05e512831e..dee29ec958e 100644
--- a/test/Transforms/LoopRotate/loopexitinglatch.ll
+++ b/test/Transforms/LoopRotate/loopexitinglatch.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -loop-rotate < %s -verify-loop-info -verify-dom-info | FileCheck %s
+; RUN: opt -S -loop-rotate < %s -verify-loop-info -verify-dom-info -enable-mssa-loop-dependency=true -verify-memoryssa | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "thumbv8m.base-arm-none-eabi"
diff --git a/test/Transforms/LoopRotate/multiple-exits.ll b/test/Transforms/LoopRotate/multiple-exits.ll
index f38c855b9c8..c6f153b8ca3 100644
--- a/test/Transforms/LoopRotate/multiple-exits.ll
+++ b/test/Transforms/LoopRotate/multiple-exits.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -S -loop-rotate < %s -verify-loop-info -verify-dom-info | FileCheck %s
+; RUN: opt -S -loop-rotate < %s -verify-loop-info -verify-dom-info -enable-mssa-loop-dependency=true -verify-memoryssa | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
diff --git a/test/Transforms/LoopRotate/phi-dbgvalue.ll b/test/Transforms/LoopRotate/phi-dbgvalue.ll
index c4b2a6a76f2..1f7e129c26e 100644
--- a/test/Transforms/LoopRotate/phi-dbgvalue.ll
+++ b/test/Transforms/LoopRotate/phi-dbgvalue.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -S -loop-rotate < %s | FileCheck %s
+; RUN: opt -S -loop-rotate -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s
 
 ;CHECK-LABEL: func
 ;CHECK-LABEL: entry
diff --git a/test/Transforms/LoopRotate/phi-duplicate.ll b/test/Transforms/LoopRotate/phi-duplicate.ll
index 46ee5961ba5..d7f69d8c9cc 100644
--- a/test/Transforms/LoopRotate/phi-duplicate.ll
+++ b/test/Transforms/LoopRotate/phi-duplicate.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -S -loop-rotate < %s | FileCheck %s
+; RUN: opt -S -loop-rotate -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-darwin10.0"
 
diff --git a/test/Transforms/LoopRotate/pr22337.ll b/test/Transforms/LoopRotate/pr22337.ll
index 03e804b775e..8195affbcd3 100644
--- a/test/Transforms/LoopRotate/pr22337.ll
+++ b/test/Transforms/LoopRotate/pr22337.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -loop-rotate -S | FileCheck %s
+; RUN: opt < %s -loop-rotate -enable-mssa-loop-dependency=true -verify-memoryssa -S | FileCheck %s
 
 @a = external global i8, align 4
 @tmp = global i8* @a
diff --git a/test/Transforms/LoopRotate/pr33701.ll b/test/Transforms/LoopRotate/pr33701.ll
index ed162b12098..8535e317676 100644
--- a/test/Transforms/LoopRotate/pr33701.ll
+++ b/test/Transforms/LoopRotate/pr33701.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -loop-rotate -verify-dom-info -verify-loop-info -disable-output
+; RUN: opt < %s -loop-rotate -verify-dom-info -verify-loop-info -enable-mssa-loop-dependency=true -verify-memoryssa -disable-output
 
 define void @func() {
 bb0:
diff --git a/test/Transforms/LoopRotate/pr35210.ll b/test/Transforms/LoopRotate/pr35210.ll
index 3033ca84732..a705642c435 100644
--- a/test/Transforms/LoopRotate/pr35210.ll
+++ b/test/Transforms/LoopRotate/pr35210.ll
@@ -1,4 +1,5 @@
 ;RUN: opt %s -passes='adce,loop(rotate),adce' -S -debug-pass-manager -debug-only=loop-rotate 2>&1 | FileCheck %s
+;RUN: opt %s -passes='adce,loop(rotate),adce' -S -debug-pass-manager -debug-only=loop-rotate -enable-mssa-loop-dependency=true -verify-memoryssa 2>&1 | FileCheck %s --check-prefix=MSSA
 ;REQUIRES: asserts
 
 ; This test is to make sure we invalidate the post dominator pass after loop rotate simplifies the loop latch.
@@ -32,6 +33,36 @@
 ; CHECK-NEXT: Running analysis: PostDominatorTreeAnalysis on f
 ; CHECK-NEXT: Finished llvm::Function pass manager run.
 
+; MSSA: Starting llvm::Function pass manager run.
+; MSSA-NEXT: Running pass: ADCEPass on f
+; MSSA-NEXT: Running analysis: PostDominatorTreeAnalysis on f
+; MSSA-NEXT: Running pass: FunctionToLoopPassAdaptor{{.*}} on f
+; MSSA-NEXT: Starting llvm::Function pass manager run.
+; MSSA-NEXT: Running pass: LoopSimplifyPass on f
+; MSSA-NEXT: Running analysis: LoopAnalysis on f
+; MSSA-NEXT: Running analysis: DominatorTreeAnalysis on f
+; MSSA-NEXT: Running analysis: AssumptionAnalysis on f
+; MSSA-NEXT: Running pass: LCSSAPass on f
+; MSSA-NEXT: Finished llvm::Function pass manager run.
+; MSSA-NEXT: Running analysis: MemorySSAAnalysis on f
+; MSSA-NEXT: Running analysis: AAManager on f
+; MSSA-NEXT: Running analysis: TargetLibraryAnalysis on f
+; MSSA-NEXT: Running analysis: ScalarEvolutionAnalysis on f
+; MSSA-NEXT: Running analysis: TargetIRAnalysis on f
+; MSSA-NEXT: Running analysis: InnerAnalysisManagerProxy{{.*}} on f
+; MSSA-NEXT: Starting Loop pass manager run.
+; MSSA-NEXT: Running analysis: PassInstrumentationAnalysis on bb
+; MSSA-NEXT: Running pass: LoopRotatePass on Loop at depth 1 containing: %bb<header><exiting>,%bb4<latch>
+; MSSA-NEXT: Folding loop latch bb4 into bb
+; MSSA-NEXT: Invalidating all non-preserved analyses for: bb
+; MSSA-NEXT: Finished Loop pass manager run.
+; MSSA-NEXT: Invalidating all non-preserved analyses for: f
+; MSSA-NEXT: Invalidating analysis: PostDominatorTreeAnalysis on f
+; MSSA-NEXT: Running pass: ADCEPass on f
+; MSSA-NEXT: Running analysis: PostDominatorTreeAnalysis on f
+; MSSA-NEXT: Finished llvm::Function pass manager run.
+
+
 ; CHECK-LABEL: define i8 @f() {
 ; CHECK-NEXT : entry:
 ; CHECK-NEXT :   br label %bb
@@ -52,6 +83,26 @@
 ; CHECK-NEXT :
 ; CHECK-NEXT : attributes #0 = { noreturn }
 
+; MSSA-LABEL: define i8 @f() {
+; MSSA-NEXT : entry:
+; MSSA-NEXT :   br label %bb
+; MSSA-NEXT :
+; MSSA-NEXT : bb:                                               ; preds = %bb, %entry
+; MSSA-NEXT :   %mode.0 = phi i8 [ 0, %entry ], [ %indvar.next, %bb ]
+; MSSA-NEXT :   %tmp5 = icmp eq i8 %mode.0, 1
+; MSSA-NEXT :   %indvar.next = add i8 %mode.0, 1
+; MSSA-NEXT :   br i1 %tmp5, label %bb5, label %bb
+; MSSA-NEXT :
+; MSSA-NEXT : bb5:                                              ; preds = %bb
+; MSSA-NEXT :   tail call void @raise_exception() #0
+; MSSA-NEXT :   unreachable
+; MSSA-NEXT : }
+; MSSA-NEXT :
+; MSSA-NEXT : ; Function Attrs: noreturn
+; MSSA-NEXT : declare void @raise_exception() #0
+; MSSA-NEXT :
+; MSSA-NEXT : attributes #0 = { noreturn }
+
 define i8 @f() {
 entry:
   br label %bb
diff --git a/test/Transforms/LoopRotate/pr37205.ll b/test/Transforms/LoopRotate/pr37205.ll
index 3ba6c04545f..20ad7568189 100644
--- a/test/Transforms/LoopRotate/pr37205.ll
+++ b/test/Transforms/LoopRotate/pr37205.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -indvars -verify -loop-rotate -loop-idiom < %s | FileCheck %s
+; RUN: opt -S -indvars -verify -loop-rotate -loop-idiom -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s
 target triple = "x86_64-unknown-linux-gnu"
 
 ; Verify that we invalidate SCEV properly.
diff --git a/test/Transforms/LoopRotate/preserve-mssa.ll b/test/Transforms/LoopRotate/preserve-mssa.ll
new file mode 100644
index 00000000000..d975f80cd9e
--- /dev/null
+++ b/test/Transforms/LoopRotate/preserve-mssa.ll
@@ -0,0 +1,109 @@
+; RUN: opt -S -loop-rotate -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s
+
+; CHECK-LABEL: @multiedge(
+define void @multiedge() {
+entry:
+  br label %retry
+
+retry:                                            ; preds = %sw.epilog, %entry
+  br i1 undef, label %cleanup, label %if.end
+
+if.end:                                           ; preds = %retry
+  switch i32 undef, label %sw.epilog [
+    i32 -3, label %cleanup
+    i32 -5, label %cleanup
+    i32 -16, label %cleanup
+    i32 -25, label %cleanup
+  ]
+
+sw.epilog:                                        ; preds = %if.end
+  br label %retry
+
+cleanup:                                          ; preds = %if.end, %if.end, %if.end, %if.end, %retry
+  ret void
+}
+
+; CHECK-LABEL: @read_line(
+define internal fastcc i32 @read_line(i8* nocapture %f) unnamed_addr {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %if.end, %entry
+  %call = call i8* @prepbuffer(i8* nonnull undef)
+  %call1 = call i8* @fgets(i8* %call, i32 8192, i8* %f)
+  br i1 undef, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.cond
+  ret i32 undef
+
+if.end:                                           ; preds = %for.cond
+  %call4 = call i64 @strlen(i8* %call)
+  br label %for.cond
+}
+
+declare dso_local i8* @prepbuffer(i8*) local_unnamed_addr
+declare dso_local i8* @fgets(i8*, i32, i8* nocapture) local_unnamed_addr
+declare dso_local i64 @strlen(i8* nocapture) local_unnamed_addr
+
+
+; CHECK-LABEL: @loop3
+define dso_local fastcc void @loop3() unnamed_addr {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  br i1 undef, label %for.body, label %for.end81
+
+for.body:                                         ; preds = %for.cond
+  %.idx122.val = load i32, i32* undef, align 8
+  call fastcc void @cont()
+  br label %for.cond
+
+for.end81:                                        ; preds = %for.cond
+  ret void
+}
+
+; CHECK-LABEL: @loop4
+define dso_local fastcc void @loop4() unnamed_addr {
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  br i1 undef, label %while.end, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  call fastcc void @cont()
+  br label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  call fastcc void @cont()
+  call fastcc void @cont()
+  ret void
+}
+
+; Function Attrs: inlinehint nounwind uwtable
+declare dso_local fastcc void @cont() unnamed_addr
+
+@glob_array = internal unnamed_addr constant [3 x i32] [i32 1, i32 0, i32 2], align 4
+; Test against failure in MemorySSAUpdater, when rotate clones instructions as Value.
+; CHECK-LABEL: @loop5
+define dso_local fastcc void @loop5() unnamed_addr {
+entry:
+  br label %for.body
+
+do.cond:                          ; preds = %for.body
+  unreachable
+
+for.body:                               ; preds = %if.end, %entry
+  %indvar = phi i64 [ %indvar.next, %if.end ], [ 0, %entry ]
+  %array = getelementptr inbounds [3 x i32], [3 x i32]* @glob_array, i64 0, i64 %indvar
+  %0 = load i32, i32* %array, align 4
+  br i1 undef, label %do.cond, label %if.end
+
+if.end:                                 ; preds = %for.body
+  store i32 undef, i32* undef, align 4
+  %indvar.next = add nuw nsw i64 %indvar, 1
+  br label %for.body
+}
+
+
diff --git a/test/Transforms/LoopRotate/preserve-scev.ll b/test/Transforms/LoopRotate/preserve-scev.ll
index 7bd22326864..2faf8ec487a 100644
--- a/test/Transforms/LoopRotate/preserve-scev.ll
+++ b/test/Transforms/LoopRotate/preserve-scev.ll
@@ -1,27 +1,48 @@
 ; RUN: opt < %s -loop-rotate -loop-reduce -verify-dom-info -verify-loop-info -disable-output
+; RUN: opt < %s -loop-rotate -loop-reduce -enable-mssa-loop-dependency=true -verify-memoryssa -verify-dom-info -verify-loop-info -disable-output
 
-define fastcc void @foo() nounwind {
+define fastcc void @foo(i32* %A, i64 %i) nounwind {
 BB:
   br label %BB1
 
 BB1:                                              ; preds = %BB19, %BB
+  %tttmp1 = getelementptr i32, i32* %A, i64 %i
+  %tttmp2 = load i32, i32* %tttmp1
+  %tttmp3 = add i32 %tttmp2, 1
+  store i32 %tttmp3, i32* %tttmp1
   br label %BB4
 
 BB2:                                              ; preds = %BB4
   %tmp = bitcast i32 undef to i32                 ; <i32> [#uses=1]
+  %tttmp7 = getelementptr i32, i32* %A, i64 %i
+  %tttmp8 = load i32, i32* %tttmp7
+  %tttmp9 = add i32 %tttmp8, 3
+  store i32 %tttmp9, i32* %tttmp7
   br label %BB4
 
-BB4:                                              ; preds = %BB3, %BB1
+BB4:                                              ; preds = %BB2, %BB1
   %tmp5 = phi i32 [ undef, %BB1 ], [ %tmp, %BB2 ] ; <i32> [#uses=1]
+  %tttmp4 = getelementptr i32, i32* %A, i64 %i
+  %tttmp5 = load i32, i32* %tttmp4
+  %tttmp6 = add i32 %tttmp5, 3
+  store i32 %tttmp6, i32* %tttmp4
   br i1 false, label %BB8, label %BB2
 
 BB8:                                              ; preds = %BB6
   %tmp7 = bitcast i32 %tmp5 to i32                ; <i32> [#uses=2]
+  %tttmp10 = getelementptr i32, i32* %A, i64 %i
+  %tttmp11 = load i32, i32* %tttmp10
+  %tttmp12 = add i32 %tttmp11, 3
+  store i32 %tttmp12, i32* %tttmp10
   br i1 false, label %BB9, label %BB13
 
 BB9:                                              ; preds = %BB12, %BB8
   %tmp10 = phi i32 [ %tmp11, %BB12 ], [ %tmp7, %BB8 ] ; <i32> [#uses=2]
   %tmp11 = add i32 %tmp10, 1                      ; <i32> [#uses=1]
+  %tttmp13 = getelementptr i32, i32* %A, i64 %i
+  %tttmp14 = load i32, i32* %tttmp13
+  %tttmp15 = add i32 %tttmp14, 3
+  store i32 %tttmp15, i32* %tttmp13
   br label %BB12
 
 BB12:                                             ; preds = %BB9
@@ -29,16 +50,28 @@ BB12:                                             ; preds = %BB9
 
 BB13:                                             ; preds = %BB15, %BB8
   %tmp14 = phi i32 [ %tmp16, %BB15 ], [ %tmp7, %BB8 ] ; <i32> [#uses=1]
+  %tttmp16 = getelementptr i32, i32* %A, i64 %i
+  %tttmp17 = load i32, i32* %tttmp16
+  %tttmp18 = add i32 %tttmp17, 3
+  store i32 %tttmp18, i32* %tttmp16
   br label %BB15
 
 BB15:                                             ; preds = %BB13
   %tmp16 = add i32 %tmp14, -1                     ; <i32> [#uses=1]
+  %tttmp19 = getelementptr i32, i32* %A, i64 %i
+  %tttmp20 = load i32, i32* %tttmp19
+  %tttmp21 = add i32 %tttmp20, 3
+  store i32 %tttmp21, i32* %tttmp19
   br i1 false, label %BB13, label %BB18
 
 BB17:                                             ; preds = %BB12
   br label %BB19
 
 BB18:                                             ; preds = %BB15
+  %tttmp22 = getelementptr i32, i32* %A, i64 %i
+  %tttmp23 = load i32, i32* %tttmp22
+  %tttmp24 = add i32 %tttmp23, 3
+  store i32 %tttmp24, i32* %tttmp22
   br label %BB19
 
 BB19:                                             ; preds = %BB18, %BB17
diff --git a/test/Transforms/LoopRotate/vect.omp.persistence.ll b/test/Transforms/LoopRotate/vect.omp.persistence.ll
index 6a1865499d3..c4c987e7b2b 100644
--- a/test/Transforms/LoopRotate/vect.omp.persistence.ll
+++ b/test/Transforms/LoopRotate/vect.omp.persistence.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -loop-rotate -S | FileCheck %s
+; RUN: opt < %s -loop-rotate -enable-mssa-loop-dependency=true -verify-memoryssa -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
-- 
GitLab


From bcfd1f3eadddae63420376692f7b328092a54e87 Mon Sep 17 00:00:00 2001
From: Alexander Shaposhnikov <shal1t712@gmail.com>
Date: Wed, 24 Oct 2018 22:49:06 +0000
Subject: [PATCH 0539/1116] [llvm-objcopy] Introduce dispatch mechanism based
 on the input

In this diff we introduce dispatch mechanism based on
the type of the input (archive, object file, raw binary)
and the format (coff, elf, macho).
We also move the ELF-specific code into the namespace llvm::objcopy::elf.

Test plan: make check-all

Differential revision: https://reviews.llvm.org/D53311


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345217 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-objcopy/Object.cpp       | 15 +++--
 tools/llvm-objcopy/Object.h         |  3 +
 tools/llvm-objcopy/llvm-objcopy.cpp | 88 +++++++++++++++++++++--------
 3 files changed, 77 insertions(+), 29 deletions(-)

diff --git a/tools/llvm-objcopy/Object.cpp b/tools/llvm-objcopy/Object.cpp
index d677579ea23..5b2138436d5 100644
--- a/tools/llvm-objcopy/Object.cpp
+++ b/tools/llvm-objcopy/Object.cpp
@@ -28,8 +28,10 @@
 #include <utility>
 #include <vector>
 
-using namespace llvm;
-using namespace llvm::objcopy;
+namespace llvm {
+namespace objcopy {
+namespace elf {
+
 using namespace object;
 using namespace ELF;
 
@@ -1165,7 +1167,9 @@ template <class ELFT> void ELFWriter<ELFT>::writeEhdr() {
   Ehdr.e_machine = Obj.Machine;
   Ehdr.e_version = Obj.Version;
   Ehdr.e_entry = Obj.Entry;
-  Ehdr.e_phnum = size(Obj.segments());
+  // We have to use the fully-qualified name llvm::size
+  // since some compilers complain on ambiguous resolution.
+  Ehdr.e_phnum = llvm::size(Obj.segments());
   Ehdr.e_phoff = (Ehdr.e_phnum != 0) ? Obj.ProgramHdrSegment.Offset : 0;
   Ehdr.e_phentsize = (Ehdr.e_phnum != 0) ? sizeof(Elf_Phdr) : 0;
   Ehdr.e_flags = Obj.Flags;
@@ -1597,9 +1601,6 @@ void BinaryWriter::finalize() {
   SecWriter = llvm::make_unique<BinarySectionWriter>(Buf);
 }
 
-namespace llvm {
-namespace objcopy {
-
 template class BinaryELFBuilder<ELF64LE>;
 template class BinaryELFBuilder<ELF64BE>;
 template class BinaryELFBuilder<ELF32LE>;
@@ -1614,5 +1615,7 @@ template class ELFWriter<ELF64LE>;
 template class ELFWriter<ELF64BE>;
 template class ELFWriter<ELF32LE>;
 template class ELFWriter<ELF32BE>;
+
+} // end namespace elf
 } // end namespace objcopy
 } // end namespace llvm
diff --git a/tools/llvm-objcopy/Object.h b/tools/llvm-objcopy/Object.h
index 1019391fa89..38ef21ffec9 100644
--- a/tools/llvm-objcopy/Object.h
+++ b/tools/llvm-objcopy/Object.h
@@ -30,6 +30,7 @@
 namespace llvm {
 enum class DebugCompressionType;
 namespace objcopy {
+namespace elf {
 
 class SectionBase;
 class Section;
@@ -765,6 +766,8 @@ public:
     return *Segments.back();
   }
 };
+
+} // end namespace elf
 } // end namespace objcopy
 } // end namespace llvm
 
diff --git a/tools/llvm-objcopy/llvm-objcopy.cpp b/tools/llvm-objcopy/llvm-objcopy.cpp
index b7dbf6c66b3..b7e2361cc01 100644
--- a/tools/llvm-objcopy/llvm-objcopy.cpp
+++ b/tools/llvm-objcopy/llvm-objcopy.cpp
@@ -53,13 +53,6 @@
 #include <system_error>
 #include <utility>
 
-using namespace llvm;
-using namespace llvm::objcopy;
-using namespace object;
-using namespace ELF;
-
-using SectionPred = std::function<bool(const SectionBase &Sec)>;
-
 namespace llvm {
 namespace objcopy {
 
@@ -92,6 +85,16 @@ LLVM_ATTRIBUTE_NORETURN void reportError(StringRef File, Error E) {
 } // end namespace objcopy
 } // end namespace llvm
 
+// TODO: move everything enclosed in the namespace llvm::objcopy::elf
+// into separate header+cpp files.
+namespace llvm {
+namespace objcopy {
+namespace elf {
+
+using namespace object;
+using namespace ELF;
+using SectionPred = std::function<bool(const SectionBase &Sec)>;
+
 static bool isDebugSection(const SectionBase &Sec) {
   return StringRef(Sec.Name).startswith(".debug") ||
          StringRef(Sec.Name).startswith(".zdebug") || Sec.Name == ".gdb_index";
@@ -513,18 +516,39 @@ static void handleArgs(const CopyConfig &Config, Object &Obj,
     Obj.addSection<GnuDebugLinkSection>(Config.AddGnuDebugLink);
 }
 
-static void executeElfObjcopyOnBinary(const CopyConfig &Config, Reader &Reader,
-                                      Buffer &Out, ElfType OutputElfType) {
+void executeObjcopyOnRawBinary(const CopyConfig &Config, MemoryBuffer &In,
+                               Buffer &Out) {
+  BinaryReader Reader(Config.BinaryArch, &In);
   std::unique_ptr<Object> Obj = Reader.create();
 
+  const ElfType OutputElfType = getOutputElfType(Config.BinaryArch);
   handleArgs(Config, *Obj, Reader, OutputElfType);
+  std::unique_ptr<Writer> Writer =
+      createWriter(Config, *Obj, Out, OutputElfType);
+  Writer->finalize();
+  Writer->write();
+}
 
+void executeObjcopyOnBinary(const CopyConfig &Config,
+                            object::ELFObjectFileBase &In, Buffer &Out) {
+  ELFReader Reader(&In);
+  std::unique_ptr<Object> Obj = Reader.create();
+  const ElfType OutputElfType = getOutputElfType(In);
+  handleArgs(Config, *Obj, Reader, OutputElfType);
   std::unique_ptr<Writer> Writer =
       createWriter(Config, *Obj, Out, OutputElfType);
   Writer->finalize();
   Writer->write();
 }
 
+} // end namespace elf
+} // end namespace objcopy
+} // end namespace llvm
+
+using namespace llvm;
+using namespace llvm::object;
+using namespace llvm::objcopy;
+
 // For regular archives this function simply calls llvm::writeArchive,
 // For thin archives it writes the archive file itself as well as its members.
 static Error deepWriteArchive(StringRef ArcName,
@@ -554,8 +578,29 @@ static Error deepWriteArchive(StringRef ArcName,
   return Error::success();
 }
 
-static void executeElfObjcopyOnArchive(const CopyConfig &Config,
-                                       const Archive &Ar) {
+/// The function executeObjcopyOnRawBinary does the dispatch based on the format
+/// of the output specified by the command line options.
+static void executeObjcopyOnRawBinary(const CopyConfig &Config,
+                                      MemoryBuffer &In, Buffer &Out) {
+  // TODO: llvm-objcopy should parse CopyConfig.OutputFormat to recognize
+  // formats other than ELF / "binary" and invoke
+  // elf::executeObjcopyOnRawBinary, macho::executeObjcopyOnRawBinary or
+  // coff::executeObjcopyOnRawBinary accordingly.
+  return elf::executeObjcopyOnRawBinary(Config, In, Out);
+}
+
+/// The function executeObjcopyOnBinary does the dispatch based on the format
+/// of the input binary (ELF, MachO or COFF).
+static void executeObjcopyOnBinary(const CopyConfig &Config, object::Binary &In,
+                                   Buffer &Out) {
+  if (auto *ELFBinary = dyn_cast<object::ELFObjectFileBase>(&In))
+    return elf::executeObjcopyOnBinary(Config, *ELFBinary, Out);
+  else
+    error("Unsupported object file format");
+}
+
+static void executeObjcopyOnArchive(const CopyConfig &Config,
+                                    const Archive &Ar) {
   std::vector<NewArchiveMember> NewArchiveMembers;
   Error Err = Error::success();
   for (const Archive::Child &Child : Ar.children(Err)) {
@@ -569,8 +614,7 @@ static void executeElfObjcopyOnArchive(const CopyConfig &Config,
       reportError(Ar.getFileName(), ChildNameOrErr.takeError());
 
     MemBuffer MB(ChildNameOrErr.get());
-    ELFReader Reader(Bin);
-    executeElfObjcopyOnBinary(Config, Reader, MB, getOutputElfType(*Bin));
+    executeObjcopyOnBinary(Config, *Bin, MB);
 
     Expected<NewArchiveMember> Member =
         NewArchiveMember::getOldMember(Child, true);
@@ -605,7 +649,10 @@ static void restoreDateOnFile(StringRef Filename,
     reportError(Filename, EC);
 }
 
-static void executeElfObjcopy(const CopyConfig &Config) {
+/// The function executeObjcopy does the higher level dispatch based on the type
+/// of input (raw binary, archive or single object file) and takes care of the
+/// format-agnostic modifications, i.e. preserving dates.
+static void executeObjcopy(const CopyConfig &Config) {
   sys::fs::file_status Stat;
   if (Config.PreserveDates)
     if (auto EC = sys::fs::status(Config.InputFilename, Stat))
@@ -615,11 +662,8 @@ static void executeElfObjcopy(const CopyConfig &Config) {
     auto BufOrErr = MemoryBuffer::getFile(Config.InputFilename);
     if (!BufOrErr)
       reportError(Config.InputFilename, BufOrErr.getError());
-
     FileBuffer FB(Config.OutputFilename);
-    BinaryReader Reader(Config.BinaryArch, BufOrErr->get());
-    executeElfObjcopyOnBinary(Config, Reader, FB,
-                              getOutputElfType(Config.BinaryArch));
+    executeObjcopyOnRawBinary(Config, *BufOrErr->get(), FB);
   } else {
     Expected<OwningBinary<llvm::object::Binary>> BinaryOrErr =
         createBinary(Config.InputFilename);
@@ -627,12 +671,10 @@ static void executeElfObjcopy(const CopyConfig &Config) {
       reportError(Config.InputFilename, BinaryOrErr.takeError());
 
     if (Archive *Ar = dyn_cast<Archive>(BinaryOrErr.get().getBinary())) {
-      executeElfObjcopyOnArchive(Config, *Ar);
+      executeObjcopyOnArchive(Config, *Ar);
     } else {
       FileBuffer FB(Config.OutputFilename);
-      Binary *Bin = BinaryOrErr.get().getBinary();
-      ELFReader Reader(Bin);
-      executeElfObjcopyOnBinary(Config, Reader, FB, getOutputElfType(*Bin));
+      executeObjcopyOnBinary(Config, *BinaryOrErr.get().getBinary(), FB);
     }
   }
 
@@ -652,5 +694,5 @@ int main(int argc, char **argv) {
   else
     DriverConfig = parseObjcopyOptions(makeArrayRef(argv + 1, argc));
   for (const CopyConfig &CopyConfig : DriverConfig.CopyConfigs)
-    executeElfObjcopy(CopyConfig);
+    executeObjcopy(CopyConfig);
 }
-- 
GitLab


From bbc2ea9b2187d3428f3829d67cc68e9f76149c31 Mon Sep 17 00:00:00 2001
From: Thomas Lively <tlively@google.com>
Date: Wed, 24 Oct 2018 22:49:55 +0000
Subject: [PATCH 0540/1116] [NFC] Rename minnan and maxnan to minimum and
 maximum

Summary:
Changes all uses of minnan/maxnan to minimum/maximum
globally. These names emphasize that the semantic difference between
these operations is more than just NaN-propagation.

Reviewers: arsenm, aheejin, dschuff, javed.absar

Subscribers: jholewinski, sdardis, wdng, sbc100, jgravelle-google, jrtc27, atanasyan, llvm-commits

Differential Revision: https://reviews.llvm.org/D53112

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345218 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/BasicTTIImpl.h           |  4 +--
 include/llvm/CodeGen/ISDOpcodes.h             |  8 +++---
 include/llvm/CodeGen/TargetLowering.h         |  4 +--
 include/llvm/Target/TargetSelectionDAG.td     |  5 ++--
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp      | 12 ++++----
 .../SelectionDAG/LegalizeFloatTypes.cpp       |  4 +--
 .../SelectionDAG/LegalizeVectorOps.cpp        |  4 +--
 .../SelectionDAG/LegalizeVectorTypes.cpp      | 16 +++++------
 lib/CodeGen/SelectionDAG/SelectionDAG.cpp     |  5 ++--
 .../SelectionDAG/SelectionDAGBuilder.cpp      | 28 +++++++++----------
 .../SelectionDAG/SelectionDAGDumper.cpp       |  5 ++--
 lib/CodeGen/TargetLoweringBase.cpp            |  4 +--
 lib/Target/AArch64/AArch64ISelLowering.cpp    | 20 ++++++-------
 lib/Target/AArch64/AArch64InstrInfo.td        | 12 ++++----
 lib/Target/ARM/ARMISelLowering.cpp            | 26 ++++++++---------
 lib/Target/ARM/ARMInstrNEON.td                | 24 ++++++++--------
 lib/Target/Mips/MipsSEISelLowering.cpp        |  4 +--
 lib/Target/NVPTX/NVPTXISelLowering.cpp        |  4 +--
 lib/Target/SystemZ/SystemZISelLowering.cpp    | 20 ++++++-------
 lib/Target/SystemZ/SystemZInstrVector.td      |  4 +--
 .../WebAssembly/WebAssemblyISelLowering.cpp   |  6 ++--
 .../WebAssembly/WebAssemblyInstrFloat.td      |  4 +--
 .../WebAssembly/WebAssemblyInstrSIMD.td       |  4 +--
 test/CodeGen/ARM/fp16-promote.ll              |  8 +++---
 test/CodeGen/SystemZ/vec-max-05.ll            |  6 ++--
 test/CodeGen/SystemZ/vec-min-05.ll            |  6 ++--
 unittests/Analysis/ValueTrackingTest.cpp      |  4 +--
 27 files changed, 124 insertions(+), 127 deletions(-)

diff --git a/include/llvm/CodeGen/BasicTTIImpl.h b/include/llvm/CodeGen/BasicTTIImpl.h
index 18c9a61d19b..3f7a1206c84 100644
--- a/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1073,12 +1073,12 @@ public:
     case Intrinsic::minnum:
       ISDs.push_back(ISD::FMINNUM);
       if (FMF.noNaNs())
-        ISDs.push_back(ISD::FMINNAN);
+        ISDs.push_back(ISD::FMINIMUM);
       break;
     case Intrinsic::maxnum:
       ISDs.push_back(ISD::FMAXNUM);
       if (FMF.noNaNs())
-        ISDs.push_back(ISD::FMAXNAN);
+        ISDs.push_back(ISD::FMAXIMUM);
       break;
     case Intrinsic::copysign:
       ISDs.push_back(ISD::FCOPYSIGN);
diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h
index 75ec0b99a12..1c0318d6a70 100644
--- a/include/llvm/CodeGen/ISDOpcodes.h
+++ b/include/llvm/CodeGen/ISDOpcodes.h
@@ -577,10 +577,10 @@ namespace ISD {
     /// signaling NaN, returns a quiet NaN.
     FMINNUM_IEEE, FMAXNUM_IEEE,
 
-    /// FMINNAN/FMAXNAN - NaN-propagating minimum/maximum that also treat -0.0
-    /// as less than 0.0. While FMINNUM/FMAXNUM follow IEEE 754-2008 semantics,
-    /// FMINNAN/FMAXNAN follow IEEE 754-2018 draft semantics.
-    FMINNAN, FMAXNAN,
+    /// FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0
+    /// as less than 0.0. While FMINNUM_IEEE/FMAXNUM_IEEE follow IEEE 754-2008
+    /// semantics, FMINIMUM/FMAXIMUM follow IEEE 754-2018 draft semantics.
+    FMINIMUM, FMAXIMUM,
 
     /// FSINCOS - Compute both fsin and fcos as a single operation.
     FSINCOS,
diff --git a/include/llvm/CodeGen/TargetLowering.h b/include/llvm/CodeGen/TargetLowering.h
index 4b1fae89be5..9fae319ac88 100644
--- a/include/llvm/CodeGen/TargetLowering.h
+++ b/include/llvm/CodeGen/TargetLowering.h
@@ -2099,8 +2099,8 @@ public:
     case ISD::ADDE:
     case ISD::FMINNUM:
     case ISD::FMAXNUM:
-    case ISD::FMINNAN:
-    case ISD::FMAXNAN:
+    case ISD::FMINIMUM:
+    case ISD::FMAXIMUM:
       return true;
     default: return false;
     }
diff --git a/include/llvm/Target/TargetSelectionDAG.td b/include/llvm/Target/TargetSelectionDAG.td
index c235c85e144..b1558b0f347 100644
--- a/include/llvm/Target/TargetSelectionDAG.td
+++ b/include/llvm/Target/TargetSelectionDAG.td
@@ -413,9 +413,8 @@ def fminnum_ieee : SDNode<"ISD::FMINNUM_IEEE", SDTFPBinOp,
                           [SDNPCommutative]>;
 def fmaxnum_ieee  : SDNode<"ISD::FMAXNUM_IEEE", SDTFPBinOp,
                            [SDNPCommutative]>;
-
-def fminnan    : SDNode<"ISD::FMINNAN"    , SDTFPBinOp>;
-def fmaxnan    : SDNode<"ISD::FMAXNAN"    , SDTFPBinOp>;
+def fminimum   : SDNode<"ISD::FMINIMUM"   , SDTFPBinOp>;
+def fmaximum   : SDNode<"ISD::FMAXIMUM"   , SDTFPBinOp>;
 def fgetsign   : SDNode<"ISD::FGETSIGN"   , SDTFPToIntOp>;
 def fcanonicalize : SDNode<"ISD::FCANONICALIZE", SDTFPUnaryOp>;
 def fneg       : SDNode<"ISD::FNEG"       , SDTFPUnaryOp>;
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 4f5e96a8257..e8584921c42 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -371,8 +371,8 @@ namespace {
     SDValue visitFFLOOR(SDNode *N);
     SDValue visitFMINNUM(SDNode *N);
     SDValue visitFMAXNUM(SDNode *N);
-    SDValue visitFMINNAN(SDNode *N);
-    SDValue visitFMAXNAN(SDNode *N);
+    SDValue visitFMINIMUM(SDNode *N);
+    SDValue visitFMAXIMUM(SDNode *N);
     SDValue visitBRCOND(SDNode *N);
     SDValue visitBR_CC(SDNode *N);
     SDValue visitLOAD(SDNode *N);
@@ -1584,8 +1584,8 @@ SDValue DAGCombiner::visit(SDNode *N) {
   case ISD::FFLOOR:             return visitFFLOOR(N);
   case ISD::FMINNUM:            return visitFMINNUM(N);
   case ISD::FMAXNUM:            return visitFMAXNUM(N);
-  case ISD::FMINNAN:            return visitFMINNAN(N);
-  case ISD::FMAXNAN:            return visitFMAXNAN(N);
+  case ISD::FMINIMUM:           return visitFMINIMUM(N);
+  case ISD::FMAXIMUM:           return visitFMAXIMUM(N);
   case ISD::FCEIL:              return visitFCEIL(N);
   case ISD::FTRUNC:             return visitFTRUNC(N);
   case ISD::BRCOND:             return visitBRCOND(N);
@@ -12158,11 +12158,11 @@ SDValue DAGCombiner::visitFMAXNUM(SDNode *N) {
   return visitFMinMax(DAG, N, maxnum);
 }
 
-SDValue DAGCombiner::visitFMINNAN(SDNode *N) {
+SDValue DAGCombiner::visitFMINIMUM(SDNode *N) {
   return visitFMinMax(DAG, N, minimum);
 }
 
-SDValue DAGCombiner::visitFMAXNAN(SDNode *N) {
+SDValue DAGCombiner::visitFMAXIMUM(SDNode *N) {
   return visitFMinMax(DAG, N, maximum);
 }
 
diff --git a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index b6cce910228..866744c397b 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -1910,8 +1910,8 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) {
     // Binary FP Operations
     case ISD::FADD:
     case ISD::FDIV:
-    case ISD::FMAXNAN:
-    case ISD::FMINNAN:
+    case ISD::FMAXIMUM:
+    case ISD::FMINIMUM:
     case ISD::FMAXNUM:
     case ISD::FMINNUM:
     case ISD::FMUL:
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index fdb74fef121..850cdcd1701 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -356,8 +356,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
   case ISD::FMAXNUM:
   case ISD::FMINNUM_IEEE:
   case ISD::FMAXNUM_IEEE:
-  case ISD::FMINNAN:
-  case ISD::FMAXNAN:
+  case ISD::FMINIMUM:
+  case ISD::FMAXIMUM:
   case ISD::FCOPYSIGN:
   case ISD::FSQRT:
   case ISD::FSIN:
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 43b4bf0c497..58446101556 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -115,8 +115,8 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::FMAXNUM:
   case ISD::FMINNUM_IEEE:
   case ISD::FMAXNUM_IEEE:
-  case ISD::FMINNAN:
-  case ISD::FMAXNAN:
+  case ISD::FMINIMUM:
+  case ISD::FMAXIMUM:
   case ISD::SMIN:
   case ISD::SMAX:
   case ISD::UMIN:
@@ -786,8 +786,8 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::FMUL:
   case ISD::FMINNUM:
   case ISD::FMAXNUM:
-  case ISD::FMINNAN:
-  case ISD::FMAXNAN:
+  case ISD::FMINIMUM:
+  case ISD::FMAXIMUM:
   case ISD::SDIV:
   case ISD::UDIV:
   case ISD::FDIV:
@@ -1804,10 +1804,10 @@ SDValue DAGTypeLegalizer::SplitVecOp_VECREDUCE(SDNode *N, unsigned OpNo) {
   case ISD::VECREDUCE_UMAX: CombineOpc = ISD::UMAX; break;
   case ISD::VECREDUCE_UMIN: CombineOpc = ISD::UMIN; break;
   case ISD::VECREDUCE_FMAX:
-    CombineOpc = NoNaN ? ISD::FMAXNUM : ISD::FMAXNAN;
+    CombineOpc = NoNaN ? ISD::FMAXNUM : ISD::FMAXIMUM;
     break;
   case ISD::VECREDUCE_FMIN:
-    CombineOpc = NoNaN ? ISD::FMINNUM : ISD::FMINNAN;
+    CombineOpc = NoNaN ? ISD::FMINNUM : ISD::FMINIMUM;
     break;
   default:
     llvm_unreachable("Unexpected reduce ISD node");
@@ -2356,8 +2356,8 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::XOR:
   case ISD::FMINNUM:
   case ISD::FMAXNUM:
-  case ISD::FMINNAN:
-  case ISD::FMAXNAN:
+  case ISD::FMINIMUM:
+  case ISD::FMAXIMUM:
   case ISD::SMIN:
   case ISD::SMAX:
   case ISD::UMIN:
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 1f63923d7ec..2d99a6aecb5 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3730,12 +3730,11 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, bool SNaN, unsigned Depth) const
            (isKnownNeverNaN(Op.getOperand(1), false, Depth + 1) &&
             isKnownNeverSNaN(Op.getOperand(0), Depth + 1));
   }
-  case ISD::FMINNAN:
-  case ISD::FMAXNAN: {
+  case ISD::FMINIMUM:
+  case ISD::FMAXIMUM: {
     // TODO: Does this quiet or return the origina NaN as-is?
     return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
            isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
-
   }
   case ISD::EXTRACT_VECTOR_ELT: {
     return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 87921ccb074..05eac30843f 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -2972,16 +2972,16 @@ void SelectionDAGBuilder::visitSelect(const User &I) {
     case SPF_FMINNUM:
       switch (SPR.NaNBehavior) {
       case SPNB_NA: llvm_unreachable("No NaN behavior for FP op?");
-      case SPNB_RETURNS_NAN:   Opc = ISD::FMINNAN; break;
+      case SPNB_RETURNS_NAN:   Opc = ISD::FMINIMUM; break;
       case SPNB_RETURNS_OTHER: Opc = ISD::FMINNUM; break;
       case SPNB_RETURNS_ANY: {
         if (TLI.isOperationLegalOrCustom(ISD::FMINNUM, VT))
           Opc = ISD::FMINNUM;
-        else if (TLI.isOperationLegalOrCustom(ISD::FMINNAN, VT))
-          Opc = ISD::FMINNAN;
+        else if (TLI.isOperationLegalOrCustom(ISD::FMINIMUM, VT))
+          Opc = ISD::FMINIMUM;
         else if (UseScalarMinMax)
           Opc = TLI.isOperationLegalOrCustom(ISD::FMINNUM, VT.getScalarType()) ?
-            ISD::FMINNUM : ISD::FMINNAN;
+            ISD::FMINNUM : ISD::FMINIMUM;
         break;
       }
       }
@@ -2989,17 +2989,17 @@ void SelectionDAGBuilder::visitSelect(const User &I) {
     case SPF_FMAXNUM:
       switch (SPR.NaNBehavior) {
       case SPNB_NA: llvm_unreachable("No NaN behavior for FP op?");
-      case SPNB_RETURNS_NAN:   Opc = ISD::FMAXNAN; break;
+      case SPNB_RETURNS_NAN:   Opc = ISD::FMAXIMUM; break;
       case SPNB_RETURNS_OTHER: Opc = ISD::FMAXNUM; break;
       case SPNB_RETURNS_ANY:
 
         if (TLI.isOperationLegalOrCustom(ISD::FMAXNUM, VT))
           Opc = ISD::FMAXNUM;
-        else if (TLI.isOperationLegalOrCustom(ISD::FMAXNAN, VT))
-          Opc = ISD::FMAXNAN;
+        else if (TLI.isOperationLegalOrCustom(ISD::FMAXIMUM, VT))
+          Opc = ISD::FMAXIMUM;
         else if (UseScalarMinMax)
           Opc = TLI.isOperationLegalOrCustom(ISD::FMAXNUM, VT.getScalarType()) ?
-            ISD::FMAXNUM : ISD::FMAXNAN;
+            ISD::FMAXNUM : ISD::FMAXIMUM;
         break;
       }
       break;
@@ -5565,8 +5565,8 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
   case Intrinsic::minnum: {
     auto VT = getValue(I.getArgOperand(0)).getValueType();
     unsigned Opc =
-        I.hasNoNaNs() && TLI.isOperationLegalOrCustom(ISD::FMINNAN, VT)
-            ? ISD::FMINNAN
+        I.hasNoNaNs() && TLI.isOperationLegalOrCustom(ISD::FMINIMUM, VT)
+            ? ISD::FMINIMUM
             : ISD::FMINNUM;
     setValue(&I, DAG.getNode(Opc, sdl, VT,
                              getValue(I.getArgOperand(0)),
@@ -5576,8 +5576,8 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
   case Intrinsic::maxnum: {
     auto VT = getValue(I.getArgOperand(0)).getValueType();
     unsigned Opc =
-        I.hasNoNaNs() && TLI.isOperationLegalOrCustom(ISD::FMAXNAN, VT)
-            ? ISD::FMAXNAN
+        I.hasNoNaNs() && TLI.isOperationLegalOrCustom(ISD::FMAXIMUM, VT)
+            ? ISD::FMAXIMUM
             : ISD::FMAXNUM;
     setValue(&I, DAG.getNode(Opc, sdl, VT,
                              getValue(I.getArgOperand(0)),
@@ -5585,13 +5585,13 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     return nullptr;
   }
   case Intrinsic::minimum:
-    setValue(&I, DAG.getNode(ISD::FMINNAN, sdl,
+    setValue(&I, DAG.getNode(ISD::FMINIMUM, sdl,
                              getValue(I.getArgOperand(0)).getValueType(),
                              getValue(I.getArgOperand(0)),
                              getValue(I.getArgOperand(1))));
     return nullptr;
   case Intrinsic::maximum:
-    setValue(&I, DAG.getNode(ISD::FMAXNAN, sdl,
+    setValue(&I, DAG.getNode(ISD::FMAXIMUM, sdl,
                              getValue(I.getArgOperand(0)).getValueType(),
                              getValue(I.getArgOperand(0)),
                              getValue(I.getArgOperand(1))));
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 1b032ce456a..1c9a49306c6 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -178,9 +178,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   case ISD::FMAXNUM:                    return "fmaxnum";
   case ISD::FMINNUM_IEEE:               return "fminnum_ieee";
   case ISD::FMAXNUM_IEEE:               return "fmaxnum_ieee";
-
-  case ISD::FMINNAN:                    return "fminnan";
-  case ISD::FMAXNAN:                    return "fmaxnan";
+  case ISD::FMINIMUM:                   return "fminimum";
+  case ISD::FMAXIMUM:                   return "fmaximum";
   case ISD::FNEG:                       return "fneg";
   case ISD::FSQRT:                      return "fsqrt";
   case ISD::STRICT_FSQRT:               return "strict_fsqrt";
diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp
index 09c5b527956..715112edc17 100644
--- a/lib/CodeGen/TargetLoweringBase.cpp
+++ b/lib/CodeGen/TargetLoweringBase.cpp
@@ -602,8 +602,8 @@ void TargetLoweringBase::initActions() {
     setOperationAction(ISD::FMAXNUM, VT, Expand);
     setOperationAction(ISD::FMINNUM_IEEE, VT, Expand);
     setOperationAction(ISD::FMAXNUM_IEEE, VT, Expand);
-    setOperationAction(ISD::FMINNAN, VT, Expand);
-    setOperationAction(ISD::FMAXNAN, VT, Expand);
+    setOperationAction(ISD::FMINIMUM, VT, Expand);
+    setOperationAction(ISD::FMAXIMUM, VT, Expand);
     setOperationAction(ISD::FMAD, VT, Expand);
     setOperationAction(ISD::SMIN, VT, Expand);
     setOperationAction(ISD::SMAX, VT, Expand);
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index c8227cd139a..a7a1b0a5feb 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -385,8 +385,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FTRUNC,      MVT::f16,  Promote);
     setOperationAction(ISD::FMINNUM,     MVT::f16,  Promote);
     setOperationAction(ISD::FMAXNUM,     MVT::f16,  Promote);
-    setOperationAction(ISD::FMINNAN,     MVT::f16,  Promote);
-    setOperationAction(ISD::FMAXNAN,     MVT::f16,  Promote);
+    setOperationAction(ISD::FMINIMUM,    MVT::f16,  Promote);
+    setOperationAction(ISD::FMAXIMUM,    MVT::f16,  Promote);
 
     // promote v4f16 to v4f32 when that is known to be safe.
     setOperationAction(ISD::FADD,        MVT::v4f16, Promote);
@@ -450,8 +450,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FROUND, Ty, Legal);
     setOperationAction(ISD::FMINNUM, Ty, Legal);
     setOperationAction(ISD::FMAXNUM, Ty, Legal);
-    setOperationAction(ISD::FMINNAN, Ty, Legal);
-    setOperationAction(ISD::FMAXNAN, Ty, Legal);
+    setOperationAction(ISD::FMINIMUM, Ty, Legal);
+    setOperationAction(ISD::FMAXIMUM, Ty, Legal);
   }
 
   if (Subtarget->hasFullFP16()) {
@@ -463,8 +463,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FROUND,  MVT::f16, Legal);
     setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
     setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
-    setOperationAction(ISD::FMINNAN, MVT::f16, Legal);
-    setOperationAction(ISD::FMAXNAN, MVT::f16, Legal);
+    setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
+    setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
   }
 
   setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
@@ -816,8 +816,8 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
   // F[MIN|MAX][NUM|NAN] are available for all FP NEON types.
   if (VT.isFloatingPoint() &&
       (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
-    for (unsigned Opcode : {ISD::FMINNAN, ISD::FMAXNAN,
-                            ISD::FMINNUM, ISD::FMAXNUM})
+    for (unsigned Opcode :
+         {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM})
       setOperationAction(Opcode, VT, Legal);
 
   if (Subtarget->isLittleEndian()) {
@@ -9867,10 +9867,10 @@ static SDValue performIntrinsicCombine(SDNode *N,
   case Intrinsic::aarch64_neon_umaxv:
     return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
   case Intrinsic::aarch64_neon_fmax:
-    return DAG.getNode(ISD::FMAXNAN, SDLoc(N), N->getValueType(0),
+    return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
                        N->getOperand(1), N->getOperand(2));
   case Intrinsic::aarch64_neon_fmin:
-    return DAG.getNode(ISD::FMINNAN, SDLoc(N), N->getValueType(0),
+    return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0),
                        N->getOperand(1), N->getOperand(2));
   case Intrinsic::aarch64_neon_fmaxnm:
     return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index 24f6aaaab57..88e5632fbe6 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -3050,18 +3050,18 @@ let SchedRW = [WriteFDiv] in {
 defm FDIV   : TwoOperandFPData<0b0001, "fdiv", fdiv>;
 }
 defm FMAXNM : TwoOperandFPData<0b0110, "fmaxnm", fmaxnum>;
-defm FMAX   : TwoOperandFPData<0b0100, "fmax", fmaxnan>;
+defm FMAX   : TwoOperandFPData<0b0100, "fmax", fmaximum>;
 defm FMINNM : TwoOperandFPData<0b0111, "fminnm", fminnum>;
-defm FMIN   : TwoOperandFPData<0b0101, "fmin", fminnan>;
+defm FMIN   : TwoOperandFPData<0b0101, "fmin", fminimum>;
 let SchedRW = [WriteFMul] in {
 defm FMUL   : TwoOperandFPData<0b0000, "fmul", fmul>;
 defm FNMUL  : TwoOperandFPDataNeg<0b1000, "fnmul", fmul>;
 }
 defm FSUB   : TwoOperandFPData<0b0011, "fsub", fsub>;
 
-def : Pat<(v1f64 (fmaxnan (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+def : Pat<(v1f64 (fmaximum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
           (FMAXDrr FPR64:$Rn, FPR64:$Rm)>;
-def : Pat<(v1f64 (fminnan (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+def : Pat<(v1f64 (fminimum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
           (FMINDrr FPR64:$Rn, FPR64:$Rm)>;
 def : Pat<(v1f64 (fmaxnum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
           (FMAXNMDrr FPR64:$Rn, FPR64:$Rm)>;
@@ -3387,11 +3387,11 @@ defm FDIV    : SIMDThreeSameVectorFP<1,0,0b111,"fdiv", fdiv>;
 defm FMAXNMP : SIMDThreeSameVectorFP<1,0,0b000,"fmaxnmp", int_aarch64_neon_fmaxnmp>;
 defm FMAXNM  : SIMDThreeSameVectorFP<0,0,0b000,"fmaxnm", fmaxnum>;
 defm FMAXP   : SIMDThreeSameVectorFP<1,0,0b110,"fmaxp", int_aarch64_neon_fmaxp>;
-defm FMAX    : SIMDThreeSameVectorFP<0,0,0b110,"fmax", fmaxnan>;
+defm FMAX    : SIMDThreeSameVectorFP<0,0,0b110,"fmax", fmaximum>;
 defm FMINNMP : SIMDThreeSameVectorFP<1,1,0b000,"fminnmp", int_aarch64_neon_fminnmp>;
 defm FMINNM  : SIMDThreeSameVectorFP<0,1,0b000,"fminnm", fminnum>;
 defm FMINP   : SIMDThreeSameVectorFP<1,1,0b110,"fminp", int_aarch64_neon_fminp>;
-defm FMIN    : SIMDThreeSameVectorFP<0,1,0b110,"fmin", fminnan>;
+defm FMIN    : SIMDThreeSameVectorFP<0,1,0b110,"fmin", fminimum>;
 
 // NOTE: The operands of the PatFrag are reordered on FMLA/FMLS because the
 // instruction expects the addend first, while the fma intrinsic puts it last.
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 8c18477005f..0f68fb0287c 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -1143,14 +1143,14 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
   if (Subtarget->hasNEON()) {
     // vmin and vmax aren't available in a scalar form, so we use
     // a NEON instruction with an undef lane instead.
-    setOperationAction(ISD::FMINNAN, MVT::f16, Legal);
-    setOperationAction(ISD::FMAXNAN, MVT::f16, Legal);
-    setOperationAction(ISD::FMINNAN, MVT::f32, Legal);
-    setOperationAction(ISD::FMAXNAN, MVT::f32, Legal);
-    setOperationAction(ISD::FMINNAN, MVT::v2f32, Legal);
-    setOperationAction(ISD::FMAXNAN, MVT::v2f32, Legal);
-    setOperationAction(ISD::FMINNAN, MVT::v4f32, Legal);
-    setOperationAction(ISD::FMAXNAN, MVT::v4f32, Legal);
+    setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
+    setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
+    setOperationAction(ISD::FMINIMUM, MVT::f32, Legal);
+    setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal);
+    setOperationAction(ISD::FMINIMUM, MVT::v2f32, Legal);
+    setOperationAction(ISD::FMAXIMUM, MVT::v2f32, Legal);
+    setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal);
+    setOperationAction(ISD::FMAXIMUM, MVT::v4f32, Legal);
 
     if (Subtarget->hasFullFP16()) {
       setOperationAction(ISD::FMINNUM, MVT::v4f16, Legal);
@@ -1158,10 +1158,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::FMINNUM, MVT::v8f16, Legal);
       setOperationAction(ISD::FMAXNUM, MVT::v8f16, Legal);
 
-      setOperationAction(ISD::FMINNAN, MVT::v4f16, Legal);
-      setOperationAction(ISD::FMAXNAN, MVT::v4f16, Legal);
-      setOperationAction(ISD::FMINNAN, MVT::v8f16, Legal);
-      setOperationAction(ISD::FMAXNAN, MVT::v8f16, Legal);
+      setOperationAction(ISD::FMINIMUM, MVT::v4f16, Legal);
+      setOperationAction(ISD::FMAXIMUM, MVT::v4f16, Legal);
+      setOperationAction(ISD::FMINIMUM, MVT::v8f16, Legal);
+      setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Legal);
     }
   }
 
@@ -3408,7 +3408,7 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
                          Op.getOperand(1), Op.getOperand(2));
     }
     unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
-      ? ISD::FMINNAN : ISD::FMAXNAN;
+      ? ISD::FMINIMUM : ISD::FMAXIMUM;
     return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
                        Op.getOperand(1), Op.getOperand(2));
   }
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index 2085507056b..96986e74415 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -5521,17 +5521,17 @@ defm VMAXu    : N3VInt_QHS<1, 0, 0b0110, 0, N3RegFrm,
                            "vmax", "u", umax, 1>;
 def  VMAXfd   : N3VDInt<0, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBIND,
                         "vmax", "f32",
-                        v2f32, v2f32, fmaxnan, 1>;
+                        v2f32, v2f32, fmaximum, 1>;
 def  VMAXfq   : N3VQInt<0, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBINQ,
                         "vmax", "f32",
-                        v4f32, v4f32, fmaxnan, 1>;
+                        v4f32, v4f32, fmaximum, 1>;
 def  VMAXhd   : N3VDInt<0, 0, 0b01, 0b1111, 0, N3RegFrm, IIC_VBIND,
                         "vmax", "f16",
-                        v4f16, v4f16, fmaxnan, 1>,
+                        v4f16, v4f16, fmaximum, 1>,
                 Requires<[HasNEON, HasFullFP16]>;
 def  VMAXhq   : N3VQInt<0, 0, 0b01, 0b1111, 0, N3RegFrm, IIC_VBINQ,
                         "vmax", "f16",
-                        v8f16, v8f16, fmaxnan, 1>,
+                        v8f16, v8f16, fmaximum, 1>,
                 Requires<[HasNEON, HasFullFP16]>;
 
 // VMAXNM
@@ -5563,17 +5563,17 @@ defm VMINu    : N3VInt_QHS<1, 0, 0b0110, 1, N3RegFrm,
                            "vmin", "u", umin, 1>;
 def  VMINfd   : N3VDInt<0, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBIND,
                         "vmin", "f32",
-                        v2f32, v2f32, fminnan, 1>;
+                        v2f32, v2f32, fminimum, 1>;
 def  VMINfq   : N3VQInt<0, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBINQ,
                         "vmin", "f32",
-                        v4f32, v4f32, fminnan, 1>;
+                        v4f32, v4f32, fminimum, 1>;
 def  VMINhd   : N3VDInt<0, 0, 0b11, 0b1111, 0, N3RegFrm, IIC_VBIND,
                         "vmin", "f16",
-                        v4f16, v4f16, fminnan, 1>,
+                        v4f16, v4f16, fminimum, 1>,
                 Requires<[HasNEON, HasFullFP16]>;
 def  VMINhq   : N3VQInt<0, 0, 0b11, 0b1111, 0, N3RegFrm, IIC_VBINQ,
                         "vmin", "f16",
-                        v8f16, v8f16, fminnan, 1>,
+                        v8f16, v8f16, fminimum, 1>,
                 Requires<[HasNEON, HasFullFP16]>;
 
 // VMINNM
@@ -7093,10 +7093,10 @@ def : N3VSMulOpPat<fmul, fsub, VFMSfd>,
       Requires<[HasVFP4, UseNEONForFP, UseFusedMAC]>;
 def : N2VSPat<fabs, VABSfd>;
 def : N2VSPat<fneg, VNEGfd>;
-def : N3VSPatFP16<fmaxnan, VMAXhd>, Requires<[HasFullFP16]>;
-def : N3VSPatFP16<fminnan, VMINhd>, Requires<[HasFullFP16]>;
-def : N3VSPat<fmaxnan, VMAXfd>, Requires<[HasNEON]>;
-def : N3VSPat<fminnan, VMINfd>, Requires<[HasNEON]>;
+def : N3VSPatFP16<fmaximum, VMAXhd>, Requires<[HasFullFP16]>;
+def : N3VSPatFP16<fminimum, VMINhd>, Requires<[HasFullFP16]>;
+def : N3VSPat<fmaximum, VMAXfd>, Requires<[HasNEON]>;
+def : N3VSPat<fminimum, VMINfd>, Requires<[HasNEON]>;
 def : NVCVTFIPat<fp_to_sint, VCVTf2sd>;
 def : NVCVTFIPat<fp_to_uint, VCVTf2ud>;
 def : NVCVTIFPat<sint_to_fp, VCVTs2fd>;
diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp
index f625a2903bd..d745ce00149 100644
--- a/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -158,8 +158,8 @@ MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM,
     setOperationAction(ISD::FTRUNC, MVT::f16, Promote);
     setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
     setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
-    setOperationAction(ISD::FMINNAN, MVT::f16, Promote);
-    setOperationAction(ISD::FMAXNAN, MVT::f16, Promote);
+    setOperationAction(ISD::FMINIMUM, MVT::f16, Promote);
+    setOperationAction(ISD::FMAXIMUM, MVT::f16, Promote);
 
     setTargetDAGCombine(ISD::AND);
     setTargetDAGCombine(ISD::OR);
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 2536623fb85..1f323b63034 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -560,8 +560,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   }
   setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
   setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
-  setOperationAction(ISD::FMINNAN, MVT::f16, Promote);
-  setOperationAction(ISD::FMAXNAN, MVT::f16, Promote);
+  setOperationAction(ISD::FMINIMUM, MVT::f16, Promote);
+  setOperationAction(ISD::FMAXIMUM, MVT::f16, Promote);
 
   // No FEXP2, FLOG2.  The PTX ex2 and log2 functions are always approximate.
   // No FPOW or FREM in PTX.
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index 7ab4024d43c..53cd21c4236 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -452,29 +452,29 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
 
     setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
-    setOperationAction(ISD::FMAXNAN, MVT::f64, Legal);
+    setOperationAction(ISD::FMAXIMUM, MVT::f64, Legal);
     setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
-    setOperationAction(ISD::FMINNAN, MVT::f64, Legal);
+    setOperationAction(ISD::FMINIMUM, MVT::f64, Legal);
 
     setOperationAction(ISD::FMAXNUM, MVT::v2f64, Legal);
-    setOperationAction(ISD::FMAXNAN, MVT::v2f64, Legal);
+    setOperationAction(ISD::FMAXIMUM, MVT::v2f64, Legal);
     setOperationAction(ISD::FMINNUM, MVT::v2f64, Legal);
-    setOperationAction(ISD::FMINNAN, MVT::v2f64, Legal);
+    setOperationAction(ISD::FMINIMUM, MVT::v2f64, Legal);
 
     setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
-    setOperationAction(ISD::FMAXNAN, MVT::f32, Legal);
+    setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal);
     setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
-    setOperationAction(ISD::FMINNAN, MVT::f32, Legal);
+    setOperationAction(ISD::FMINIMUM, MVT::f32, Legal);
 
     setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
-    setOperationAction(ISD::FMAXNAN, MVT::v4f32, Legal);
+    setOperationAction(ISD::FMAXIMUM, MVT::v4f32, Legal);
     setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
-    setOperationAction(ISD::FMINNAN, MVT::v4f32, Legal);
+    setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal);
 
     setOperationAction(ISD::FMAXNUM, MVT::f128, Legal);
-    setOperationAction(ISD::FMAXNAN, MVT::f128, Legal);
+    setOperationAction(ISD::FMAXIMUM, MVT::f128, Legal);
     setOperationAction(ISD::FMINNUM, MVT::f128, Legal);
-    setOperationAction(ISD::FMINNAN, MVT::f128, Legal);
+    setOperationAction(ISD::FMINIMUM, MVT::f128, Legal);
   }
 
   // We have fused multiply-addition for f32 and f64 but not f128.
diff --git a/lib/Target/SystemZ/SystemZInstrVector.td b/lib/Target/SystemZ/SystemZInstrVector.td
index 094d3a7de3d..8523af7e573 100644
--- a/lib/Target/SystemZ/SystemZInstrVector.td
+++ b/lib/Target/SystemZ/SystemZInstrVector.td
@@ -1031,7 +1031,7 @@ let Predicates = [FeatureVector] in {
   // Maximum.
   multiclass VectorMax<Instruction insn, TypedReg tr> {
     def : FPMinMax<insn, fmaxnum, tr, 4>;
-    def : FPMinMax<insn, fmaxnan, tr, 1>;
+    def : FPMinMax<insn, fmaximum, tr, 1>;
   }
   let Predicates = [FeatureVectorEnhancements1] in {
     def VFMAX   : TernaryVRRcFloatGeneric<"vfmax", 0xE7EF>;
@@ -1055,7 +1055,7 @@ let Predicates = [FeatureVector] in {
   // Minimum.
   multiclass VectorMin<Instruction insn, TypedReg tr> {
     def : FPMinMax<insn, fminnum, tr, 4>;
-    def : FPMinMax<insn, fminnan, tr, 1>;
+    def : FPMinMax<insn, fminimum, tr, 1>;
   }
   let Predicates = [FeatureVectorEnhancements1] in {
     def VFMIN   : TernaryVRRcFloatGeneric<"vfmin", 0xE7EE>;
diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 1da66af5560..4b20404cf61 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -105,9 +105,9 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
     for (auto Op :
          {ISD::FCEIL, ISD::FFLOOR, ISD::FTRUNC, ISD::FNEARBYINT, ISD::FRINT})
       setOperationAction(Op, T, Legal);
-    // Support minnan and maxnan, which otherwise default to expand.
-    setOperationAction(ISD::FMINNAN, T, Legal);
-    setOperationAction(ISD::FMAXNAN, T, Legal);
+    // Support minimum and maximum, which otherwise default to expand.
+    setOperationAction(ISD::FMINIMUM, T, Legal);
+    setOperationAction(ISD::FMAXIMUM, T, Legal);
     // WebAssembly currently has no builtin f16 support.
     setOperationAction(ISD::FP16_TO_FP, T, Expand);
     setOperationAction(ISD::FP_TO_FP16, T, Expand);
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrFloat.td b/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
index 364c485f409..3c02b0f01ea 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
@@ -58,8 +58,8 @@ defm NEG : UnaryFP<fneg, "neg ", 0x8c, 0x9a>;
 defm COPYSIGN : BinaryFP<fcopysign, "copysign", 0x98, 0xa6>;
 
 let isCommutable = 1 in {
-defm MIN : BinaryFP<fminnan, "min ", 0x96, 0xa4>;
-defm MAX : BinaryFP<fmaxnan, "max ", 0x97, 0xa5>;
+defm MIN : BinaryFP<fminimum, "min ", 0x96, 0xa4>;
+defm MAX : BinaryFP<fmaximum, "max ", 0x97, 0xa5>;
 } // isCommutable = 1
 
 defm CEIL : UnaryFP<fceil, "ceil", 0x8d, 0x9b>;
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 711d42a219e..08bb39748b8 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -757,10 +757,10 @@ multiclass SIMDBinaryFP<SDNode node, string name, bits<32> baseInst> {
 }
 
 // NaN-propagating minimum: min
-defm MIN : SIMDBinaryFP<fminnan, "min", 129>;
+defm MIN : SIMDBinaryFP<fminimum, "min", 129>;
 
 // NaN-propagating maximum: max
-defm MAX : SIMDBinaryFP<fmaxnan, "max", 131>;
+defm MAX : SIMDBinaryFP<fmaximum, "max", 131>;
 
 //===----------------------------------------------------------------------===//
 // Floating-point arithmetic
diff --git a/test/CodeGen/ARM/fp16-promote.ll b/test/CodeGen/ARM/fp16-promote.ll
index dae9ef2ea83..d7eaddc9e40 100644
--- a/test/CodeGen/ARM/fp16-promote.ll
+++ b/test/CodeGen/ARM/fp16-promote.ll
@@ -644,7 +644,7 @@ define void @test_maxnum(half* %p, half* %q) #0 {
   ret void
 }
 
-; CHECK-ALL-LABEL: test_minnan:
+; CHECK-ALL-LABEL: test_minimum:
 ; CHECK-FP16: vmov.f32 s0, #1.000000e+00
 ; CHECK-FP16: vcvtb.f32.f16
 ; CHECK-LIBCALL: bl __aeabi_h2f
@@ -654,7 +654,7 @@ define void @test_maxnum(half* %p, half* %q) #0 {
 ; CHECK-NOVFP: bl __aeabi_fcmpge
 ; CHECK-FP16: vcvtb.f16.f32
 ; CHECK-LIBCALL: bl __aeabi_f2h
-define void @test_minnan(half* %p) #0 {
+define void @test_minimum(half* %p) #0 {
   %a = load half, half* %p, align 2
   %c = fcmp ult half %a, 1.0
   %r = select i1 %c, half %a, half 1.0
@@ -662,7 +662,7 @@ define void @test_minnan(half* %p) #0 {
   ret void
 }
 
-; CHECK-ALL-LABEL: test_maxnan:
+; CHECK-ALL-LABEL: test_maximum:
 ; CHECK-FP16: vmov.f32 s0, #1.000000e+00
 ; CHECK-FP16: vcvtb.f32.f16
 ; CHECK-LIBCALL: bl __aeabi_h2f
@@ -672,7 +672,7 @@ define void @test_minnan(half* %p) #0 {
 ; CHECK-NOVFP: bl __aeabi_fcmple
 ; CHECK-FP16: vcvtb.f16.f32
 ; CHECK-LIBCALL: bl __aeabi_f2h
-define void @test_maxnan(half* %p) #0 {
+define void @test_maximum(half* %p) #0 {
   %a = load half, half* %p, align 2
   %c = fcmp ugt half %a, 1.0
   %r = select i1 %c, half %a, half 1.0
diff --git a/test/CodeGen/SystemZ/vec-max-05.ll b/test/CodeGen/SystemZ/vec-max-05.ll
index 591d3bf36f1..1fe0db350b7 100644
--- a/test/CodeGen/SystemZ/vec-max-05.ll
+++ b/test/CodeGen/SystemZ/vec-max-05.ll
@@ -42,7 +42,7 @@ define double @f3(double %dummy, double %val) {
   ret double %ret
 }
 
-; Test a f64 constant compare/select resulting in maxnan.
+; Test a f64 constant compare/select resulting in maximum.
 define double @f4(double %dummy, double %val) {
 ; CHECK-LABEL: f4:
 ; CHECK: lzdr [[REG:%f[0-9]+]]
@@ -92,7 +92,7 @@ define float @f13(float %dummy, float %val) {
   ret float %ret
 }
 
-; Test a f32 constant compare/select resulting in maxnan.
+; Test a f32 constant compare/select resulting in maximum.
 define float @f14(float %dummy, float %val) {
 ; CHECK-LABEL: f14:
 ; CHECK: lzer [[REG:%f[0-9]+]]
@@ -158,7 +158,7 @@ define void @f23(fp128 *%ptr, fp128 *%dst) {
   ret void
 }
 
-; Test a f128 constant compare/select resulting in maxnan.
+; Test a f128 constant compare/select resulting in maximum.
 define void @f24(fp128 *%ptr, fp128 *%dst) {
 ; CHECK-LABEL: f24:
 ; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2)
diff --git a/test/CodeGen/SystemZ/vec-min-05.ll b/test/CodeGen/SystemZ/vec-min-05.ll
index 3eef9016cd0..6417e5ed750 100644
--- a/test/CodeGen/SystemZ/vec-min-05.ll
+++ b/test/CodeGen/SystemZ/vec-min-05.ll
@@ -42,7 +42,7 @@ define double @f3(double %dummy, double %val) {
   ret double %ret
 }
 
-; Test a f64 constant compare/select resulting in minnan.
+; Test a f64 constant compare/select resulting in minimum.
 define double @f4(double %dummy, double %val) {
 ; CHECK-LABEL: f4:
 ; CHECK: lzdr [[REG:%f[0-9]+]]
@@ -92,7 +92,7 @@ define float @f13(float %dummy, float %val) {
   ret float %ret
 }
 
-; Test a f32 constant compare/select resulting in minnan.
+; Test a f32 constant compare/select resulting in minimum.
 define float @f14(float %dummy, float %val) {
 ; CHECK-LABEL: f14:
 ; CHECK: lzer [[REG:%f[0-9]+]]
@@ -158,7 +158,7 @@ define void @f23(fp128 *%ptr, fp128 *%dst) {
   ret void
 }
 
-; Test a f128 constant compare/select resulting in minnan.
+; Test a f128 constant compare/select resulting in minimum.
 define void @f24(fp128 *%ptr, fp128 *%dst) {
 ; CHECK-LABEL: f24:
 ; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2)
diff --git a/unittests/Analysis/ValueTrackingTest.cpp b/unittests/Analysis/ValueTrackingTest.cpp
index f391ca12e55..d6365176d08 100644
--- a/unittests/Analysis/ValueTrackingTest.cpp
+++ b/unittests/Analysis/ValueTrackingTest.cpp
@@ -149,7 +149,7 @@ TEST_F(MatchSelectPatternTest, FMinConstantZeroNsz) {
   expectPattern({SPF_FMINNUM, SPNB_RETURNS_OTHER, true});
 }
 
-TEST_F(MatchSelectPatternTest, VectorFMinNaN) {
+TEST_F(MatchSelectPatternTest, VectorFMinimum) {
   parseAssembly(
       "define <4 x float> @test(<4 x float> %a) {\n"
       "  %1 = fcmp ule <4 x float> %a, \n"
@@ -177,7 +177,7 @@ TEST_F(MatchSelectPatternTest, VectorFMinOtherOrdered) {
   expectPattern({SPF_FMINNUM, SPNB_RETURNS_OTHER, true});
 }
 
-TEST_F(MatchSelectPatternTest, VectorNotFMinNaN) {
+TEST_F(MatchSelectPatternTest, VectorNotFMinimum) {
   parseAssembly(
       "define <4 x float> @test(<4 x float> %a) {\n"
       "  %1 = fcmp ule <4 x float> %a, \n"
-- 
GitLab


From 59116cffe1e0992b92e8d0fd7d12c313c1d49318 Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Wed, 24 Oct 2018 22:57:28 +0000
Subject: [PATCH 0541/1116] [ELF] Fix large code model MIR verifier errors

Instead of using the MOVGOT64r pseudo, use the existing
MO_PIC_BASE_OFFSET support on symbol operands. Now I don't have to
create a "scratch register operand" for the pseudo to use, and the
register allocator can make better decisions.

Fixes some X86 verifier errors tracked in PR27481.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345219 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86InstrCompiler.td        |  5 --
 lib/Target/X86/X86InstrInfo.cpp           | 28 ++++++--
 lib/Target/X86/X86MCInstLower.cpp         | 35 ----------
 test/CodeGen/X86/code-model-elf-memset.ll | 12 ++--
 test/CodeGen/X86/code-model-elf.ll        | 84 +++++++++++------------
 test/CodeGen/X86/large-pic-string.ll      | 16 ++---
 6 files changed, 78 insertions(+), 102 deletions(-)

diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index 11d6edd55d4..2805517b747 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -37,11 +37,6 @@ let hasSideEffects = 0, isNotDuplicable = 1, Uses = [ESP, SSP],
   def MOVPC32r : Ii32<0xE8, Pseudo, (outs GR32:$reg), (ins i32imm:$label),
                       "", []>;
 
-// 64-bit large code model PIC base construction.
-let hasSideEffects = 0, mayLoad = 1, isNotDuplicable = 1, SchedRW = [WriteJump] in
-  def MOVGOT64r : PseudoI<(outs GR64:$reg),
-                          (ins GR64:$scratch, i64i32imm_pcrel:$got), []>;
-
 // ADJCALLSTACKDOWN/UP implicitly use/def ESP because they may be expanded into
 // a stack adjustment and the codegen must know that they may modify the stack
 // pointer before prolog-epilog rewriting occurs.
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index e62c8403693..db0cb63ae69 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -7488,12 +7488,28 @@ namespace {
               .addExternalSymbol("_GLOBAL_OFFSET_TABLE_")
               .addReg(0);
         } else if (TM->getCodeModel() == CodeModel::Large) {
-          // Loading the GOT in the large code model requires math with labels,
-          // so we use a pseudo instruction and expand it during MC emission.
-          unsigned Scratch = RegInfo.createVirtualRegister(&X86::GR64RegClass);
-          BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOVGOT64r), PC)
-              .addReg(Scratch, RegState::Undef | RegState::Define)
-              .addExternalSymbol("_GLOBAL_OFFSET_TABLE_");
+          // In the large code model, we are aiming for this code, though the
+          // register allocation may vary:
+          //   leaq .LN$pb(%rip), %rax
+          //   movq $_GLOBAL_OFFSET_TABLE_ - .LN$pb, %rcx
+          //   addq %rcx, %rax
+          // RAX now holds address of _GLOBAL_OFFSET_TABLE_.
+          unsigned PBReg = RegInfo.createVirtualRegister(&X86::GR64RegClass);
+          unsigned GOTReg =
+              RegInfo.createVirtualRegister(&X86::GR64RegClass);
+          BuildMI(FirstMBB, MBBI, DL, TII->get(X86::LEA64r), PBReg)
+              .addReg(X86::RIP)
+              .addImm(0)
+              .addReg(0)
+              .addSym(MF.getPICBaseSymbol())
+              .addReg(0);
+          std::prev(MBBI)->setPreInstrSymbol(MF, MF.getPICBaseSymbol());
+          BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOV64ri), GOTReg)
+              .addExternalSymbol("_GLOBAL_OFFSET_TABLE_",
+                                 X86II::MO_PIC_BASE_OFFSET);
+          BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD64rr), PC)
+              .addReg(PBReg, RegState::Kill)
+              .addReg(GOTReg, RegState::Kill);
         } else {
           llvm_unreachable("unexpected code model");
         }
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index 74fe85851cb..86495f16c3a 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -1709,41 +1709,6 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
     return;
   }
 
-  case X86::MOVGOT64r: {
-    // Materializes the GOT for the 64-bit large code model.
-    MCSymbol *DotSym = OutContext.createTempSymbol();
-    OutStreamer->EmitLabel(DotSym);
-
-    unsigned DstReg = MI->getOperand(0).getReg();
-    unsigned ScratchReg = MI->getOperand(1).getReg();
-    MCSymbol *GOTSym = MCInstLowering.GetSymbolFromOperand(MI->getOperand(2));
-
-    // .LtmpN: leaq .LtmpN(%rip), %dst
-    const MCExpr *DotExpr = MCSymbolRefExpr::create(DotSym, OutContext);
-    EmitAndCountInstruction(MCInstBuilder(X86::LEA64r)
-                                .addReg(DstReg)   // dest
-                                .addReg(X86::RIP) // base
-                                .addImm(1)        // scale
-                                .addReg(0)        // index
-                                .addExpr(DotExpr) // disp
-                                .addReg(0));      // seg
-
-    // movq $_GLOBAL_OFFSET_TABLE_ - .LtmpN, %scratch
-    const MCExpr *GOTSymExpr = MCSymbolRefExpr::create(GOTSym, OutContext);
-    const MCExpr *GOTDiffExpr =
-        MCBinaryExpr::createSub(GOTSymExpr, DotExpr, OutContext);
-    EmitAndCountInstruction(MCInstBuilder(X86::MOV64ri)
-                                .addReg(ScratchReg)     // dest
-                                .addExpr(GOTDiffExpr)); // disp
-
-    // addq %scratch, %dst
-    EmitAndCountInstruction(MCInstBuilder(X86::ADD64rr)
-                                .addReg(DstReg)       // dest
-                                .addReg(DstReg)       // dest
-                                .addReg(ScratchReg)); // src
-    return;
-  }
-
   case X86::ADD32ri: {
     // Lower the MO_GOT_ABSOLUTE_ADDRESS form of ADD32ri.
     if (MI->getOperand(2).getTargetFlags() != X86II::MO_GOT_ABSOLUTE_ADDRESS)
diff --git a/test/CodeGen/X86/code-model-elf-memset.ll b/test/CodeGen/X86/code-model-elf-memset.ll
index ba34aaeddcb..2f429f32eab 100644
--- a/test/CodeGen/X86/code-model-elf-memset.ll
+++ b/test/CodeGen/X86/code-model-elf-memset.ll
@@ -56,16 +56,16 @@ define i32 @main() #0 {
 ; LARGE-PIC:       # %bb.0: # %entry
 ; LARGE-PIC-NEXT:    subq $424, %rsp # imm = 0x1A8
 ; LARGE-PIC-NEXT:    .cfi_def_cfa_offset 432
-; LARGE-PIC-NEXT:  .Ltmp0:
-; LARGE-PIC-NEXT:    leaq {{.*}}(%rip), %rax
-; LARGE-PIC-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.Ltmp0, %rcx
-; LARGE-PIC-NEXT:    addq %rcx, %rax
+; LARGE-PIC-NEXT:  .L0$pb:
+; LARGE-PIC-NEXT:    leaq .L0$pb(%rip), %rax
+; LARGE-PIC-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.L0$pb, %rcx
+; LARGE-PIC-NEXT:    addq %rax, %rcx
 ; LARGE-PIC-NEXT:    movl $0, {{[0-9]+}}(%rsp)
 ; LARGE-PIC-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
-; LARGE-PIC-NEXT:    movabsq $memset@GOT, %rcx
+; LARGE-PIC-NEXT:    movabsq $memset@GOT, %rax
 ; LARGE-PIC-NEXT:    xorl %esi, %esi
 ; LARGE-PIC-NEXT:    movl $400, %edx # imm = 0x190
-; LARGE-PIC-NEXT:    callq *(%rax,%rcx)
+; LARGE-PIC-NEXT:    callq *(%rcx,%rax)
 ; LARGE-PIC-NEXT:    xorl %eax, %eax
 ; LARGE-PIC-NEXT:    addq $424, %rsp # imm = 0x1A8
 ; LARGE-PIC-NEXT:    .cfi_def_cfa_offset 8
diff --git a/test/CodeGen/X86/code-model-elf.ll b/test/CodeGen/X86/code-model-elf.ll
index 6d62f256179..56d3f4c102f 100644
--- a/test/CodeGen/X86/code-model-elf.ll
+++ b/test/CodeGen/X86/code-model-elf.ll
@@ -2,12 +2,12 @@
 ; Run with --no_x86_scrub_rip because we care a lot about how globals are
 ; accessed in the code model.
 
-; RUN: llc < %s -relocation-model=static -code-model=small  | FileCheck %s --check-prefix=CHECK --check-prefix=SMALL-STATIC
-; RUN: llc < %s -relocation-model=static -code-model=medium | FileCheck %s --check-prefix=CHECK --check-prefix=MEDIUM-STATIC
-; RUN: llc < %s -relocation-model=static -code-model=large  | FileCheck %s --check-prefix=CHECK --check-prefix=LARGE-STATIC
-; RUN: llc < %s -relocation-model=pic    -code-model=small  | FileCheck %s --check-prefix=CHECK --check-prefix=SMALL-PIC
-; RUN: llc < %s -relocation-model=pic    -code-model=medium | FileCheck %s --check-prefix=CHECK --check-prefix=MEDIUM-PIC
-; RUN: llc < %s -relocation-model=pic    -code-model=large  | FileCheck %s --check-prefix=CHECK --check-prefix=LARGE-PIC
+; RUN: llc -verify-machineinstrs < %s -relocation-model=static -code-model=small  | FileCheck %s --check-prefix=CHECK --check-prefix=SMALL-STATIC
+; RUN: llc -verify-machineinstrs < %s -relocation-model=static -code-model=medium | FileCheck %s --check-prefix=CHECK --check-prefix=MEDIUM-STATIC
+; RUN: llc -verify-machineinstrs < %s -relocation-model=static -code-model=large  | FileCheck %s --check-prefix=CHECK --check-prefix=LARGE-STATIC
+; RUN: llc -verify-machineinstrs < %s -relocation-model=pic    -code-model=small  | FileCheck %s --check-prefix=CHECK --check-prefix=SMALL-PIC
+; RUN: llc -verify-machineinstrs < %s -relocation-model=pic    -code-model=medium | FileCheck %s --check-prefix=CHECK --check-prefix=MEDIUM-PIC
+; RUN: llc -verify-machineinstrs < %s -relocation-model=pic    -code-model=large  | FileCheck %s --check-prefix=CHECK --check-prefix=LARGE-PIC
 
 ; Generated from this C source:
 ;
@@ -68,9 +68,9 @@ define dso_local i32* @lea_static_data() #0 {
 ;
 ; LARGE-PIC-LABEL: lea_static_data:
 ; LARGE-PIC:       # %bb.0:
-; LARGE-PIC-NEXT:  .Ltmp0:
-; LARGE-PIC-NEXT:    leaq .Ltmp0(%rip), %rcx
-; LARGE-PIC-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.Ltmp0, %rax
+; LARGE-PIC-NEXT:  .L0$pb:
+; LARGE-PIC-NEXT:    leaq .L0$pb(%rip), %rax
+; LARGE-PIC-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.L0$pb, %rcx
 ; LARGE-PIC-NEXT:    addq %rax, %rcx
 ; LARGE-PIC-NEXT:    movabsq $static_data@GOTOFF, %rax
 ; LARGE-PIC-NEXT:    addq %rcx, %rax
@@ -108,9 +108,9 @@ define dso_local i32* @lea_global_data() #0 {
 ;
 ; LARGE-PIC-LABEL: lea_global_data:
 ; LARGE-PIC:       # %bb.0:
-; LARGE-PIC-NEXT:  .Ltmp1:
-; LARGE-PIC-NEXT:    leaq .Ltmp1(%rip), %rcx
-; LARGE-PIC-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.Ltmp1, %rax
+; LARGE-PIC-NEXT:  .L1$pb:
+; LARGE-PIC-NEXT:    leaq .L1$pb(%rip), %rax
+; LARGE-PIC-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.L1$pb, %rcx
 ; LARGE-PIC-NEXT:    addq %rax, %rcx
 ; LARGE-PIC-NEXT:    movabsq $global_data@GOTOFF, %rax
 ; LARGE-PIC-NEXT:    addq %rcx, %rax
@@ -146,12 +146,12 @@ define dso_local i32* @lea_extern_data() #0 {
 ;
 ; LARGE-PIC-LABEL: lea_extern_data:
 ; LARGE-PIC:       # %bb.0:
-; LARGE-PIC-NEXT:  .Ltmp2:
-; LARGE-PIC-NEXT:    leaq .Ltmp2(%rip), %rax
-; LARGE-PIC-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.Ltmp2, %rcx
-; LARGE-PIC-NEXT:    addq %rcx, %rax
-; LARGE-PIC-NEXT:    movabsq $extern_data@GOT, %rcx
-; LARGE-PIC-NEXT:    movq (%rax,%rcx), %rax
+; LARGE-PIC-NEXT:  .L2$pb:
+; LARGE-PIC-NEXT:    leaq .L2$pb(%rip), %rax
+; LARGE-PIC-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.L2$pb, %rcx
+; LARGE-PIC-NEXT:    addq %rax, %rcx
+; LARGE-PIC-NEXT:    movabsq $extern_data@GOT, %rax
+; LARGE-PIC-NEXT:    movq (%rcx,%rax), %rax
 ; LARGE-PIC-NEXT:    retq
   ret i32* getelementptr inbounds ([10 x i32], [10 x i32]* @extern_data, i64 0, i64 0)
 }
@@ -188,12 +188,12 @@ define dso_local i32 @load_global_data() #0 {
 ;
 ; LARGE-PIC-LABEL: load_global_data:
 ; LARGE-PIC:       # %bb.0:
-; LARGE-PIC-NEXT:  .Ltmp3:
-; LARGE-PIC-NEXT:    leaq .Ltmp3(%rip), %rax
-; LARGE-PIC-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.Ltmp3, %rcx
-; LARGE-PIC-NEXT:    addq %rcx, %rax
-; LARGE-PIC-NEXT:    movabsq $global_data@GOTOFF, %rcx
-; LARGE-PIC-NEXT:    movl 8(%rax,%rcx), %eax
+; LARGE-PIC-NEXT:  .L3$pb:
+; LARGE-PIC-NEXT:    leaq .L3$pb(%rip), %rax
+; LARGE-PIC-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.L3$pb, %rcx
+; LARGE-PIC-NEXT:    addq %rax, %rcx
+; LARGE-PIC-NEXT:    movabsq $global_data@GOTOFF, %rax
+; LARGE-PIC-NEXT:    movl 8(%rcx,%rax), %eax
 ; LARGE-PIC-NEXT:    retq
   %rv = load i32, i32* getelementptr inbounds ([10 x i32], [10 x i32]* @global_data, i64 0, i64 2)
   ret i32 %rv
@@ -231,12 +231,12 @@ define dso_local i32 @load_extern_data() #0 {
 ;
 ; LARGE-PIC-LABEL: load_extern_data:
 ; LARGE-PIC:       # %bb.0:
-; LARGE-PIC-NEXT:  .Ltmp4:
-; LARGE-PIC-NEXT:    leaq .Ltmp4(%rip), %rax
-; LARGE-PIC-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.Ltmp4, %rcx
-; LARGE-PIC-NEXT:    addq %rcx, %rax
-; LARGE-PIC-NEXT:    movabsq $extern_data@GOT, %rcx
-; LARGE-PIC-NEXT:    movq (%rax,%rcx), %rax
+; LARGE-PIC-NEXT:  .L4$pb:
+; LARGE-PIC-NEXT:    leaq .L4$pb(%rip), %rax
+; LARGE-PIC-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.L4$pb, %rcx
+; LARGE-PIC-NEXT:    addq %rax, %rcx
+; LARGE-PIC-NEXT:    movabsq $extern_data@GOT, %rax
+; LARGE-PIC-NEXT:    movq (%rcx,%rax), %rax
 ; LARGE-PIC-NEXT:    movl 8(%rax), %eax
 ; LARGE-PIC-NEXT:    retq
   %rv = load i32, i32* getelementptr inbounds ([10 x i32], [10 x i32]* @extern_data, i64 0, i64 2)
@@ -287,9 +287,9 @@ define dso_local void ()* @lea_static_fn() #0 {
 ;
 ; LARGE-PIC-LABEL: lea_static_fn:
 ; LARGE-PIC:       # %bb.0:
-; LARGE-PIC-NEXT:  .Ltmp5:
-; LARGE-PIC-NEXT:    leaq .Ltmp5(%rip), %rcx
-; LARGE-PIC-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.Ltmp5, %rax
+; LARGE-PIC-NEXT:  .L7$pb:
+; LARGE-PIC-NEXT:    leaq .L7$pb(%rip), %rax
+; LARGE-PIC-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.L7$pb, %rcx
 ; LARGE-PIC-NEXT:    addq %rax, %rcx
 ; LARGE-PIC-NEXT:    movabsq $static_fn@GOTOFF, %rax
 ; LARGE-PIC-NEXT:    addq %rcx, %rax
@@ -325,9 +325,9 @@ define dso_local void ()* @lea_global_fn() #0 {
 ;
 ; LARGE-PIC-LABEL: lea_global_fn:
 ; LARGE-PIC:       # %bb.0:
-; LARGE-PIC-NEXT:  .Ltmp6:
-; LARGE-PIC-NEXT:    leaq .Ltmp6(%rip), %rcx
-; LARGE-PIC-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.Ltmp6, %rax
+; LARGE-PIC-NEXT:  .L8$pb:
+; LARGE-PIC-NEXT:    leaq .L8$pb(%rip), %rax
+; LARGE-PIC-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.L8$pb, %rcx
 ; LARGE-PIC-NEXT:    addq %rax, %rcx
 ; LARGE-PIC-NEXT:    movabsq $global_fn@GOTOFF, %rax
 ; LARGE-PIC-NEXT:    addq %rcx, %rax
@@ -363,12 +363,12 @@ define dso_local void ()* @lea_extern_fn() #0 {
 ;
 ; LARGE-PIC-LABEL: lea_extern_fn:
 ; LARGE-PIC:       # %bb.0:
-; LARGE-PIC-NEXT:  .Ltmp7:
-; LARGE-PIC-NEXT:    leaq .Ltmp7(%rip), %rax
-; LARGE-PIC-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.Ltmp7, %rcx
-; LARGE-PIC-NEXT:    addq %rcx, %rax
-; LARGE-PIC-NEXT:    movabsq $extern_fn@GOT, %rcx
-; LARGE-PIC-NEXT:    movq (%rax,%rcx), %rax
+; LARGE-PIC-NEXT:  .L9$pb:
+; LARGE-PIC-NEXT:    leaq .L9$pb(%rip), %rax
+; LARGE-PIC-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.L9$pb, %rcx
+; LARGE-PIC-NEXT:    addq %rax, %rcx
+; LARGE-PIC-NEXT:    movabsq $extern_fn@GOT, %rax
+; LARGE-PIC-NEXT:    movq (%rcx,%rax), %rax
 ; LARGE-PIC-NEXT:    retq
   ret void ()* @extern_fn
 }
diff --git a/test/CodeGen/X86/large-pic-string.ll b/test/CodeGen/X86/large-pic-string.ll
index be8a629c31c..e677ed85c66 100644
--- a/test/CodeGen/X86/large-pic-string.ll
+++ b/test/CodeGen/X86/large-pic-string.ll
@@ -1,19 +1,19 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 
-; RUN: llc < %s -code-model=large -relocation-model=pic -mtriple=x86_64--linux | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -code-model=large -relocation-model=pic -mtriple=x86_64--linux | FileCheck %s
 
 @.str = private unnamed_addr constant [2 x i8] c"a\00", align 1
 
 define void @pr38385() {
 ; CHECK-LABEL: pr38385:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:  .Ltmp0:
-; CHECK-NEXT:    leaq {{.*}}(%rip), %rax
-; CHECK-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.Ltmp0, %rcx
-; CHECK-NEXT:    addq %rcx, %rax
-; CHECK-NEXT:    movabsq $.L.str@GOTOFF, %rcx
-; CHECK-NEXT:    addl %ecx, %eax
-; CHECK-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:  .L0$pb:
+; CHECK-NEXT:    leaq .L0${{.*}}(%rip), %rax
+; CHECK-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.L0$pb, %rcx
+; CHECK-NEXT:    addq %rax, %rcx
+; CHECK-NEXT:    movabsq $.L.str@GOTOFF, %rax
+; CHECK-NEXT:    addl %eax, %ecx
+; CHECK-NEXT:    movb %cl, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    retq
   %p = alloca i8, align 1
   store i8 ptrtoint ([2 x i8]* @.str to i8), i8* %p, align 1
-- 
GitLab


From d63364426de3ac37a8ef2eb38b65965f8a07b210 Mon Sep 17 00:00:00 2001
From: Thomas Lively <tlively@google.com>
Date: Wed, 24 Oct 2018 23:14:59 +0000
Subject: [PATCH 0542/1116] Make fminimum/fmaximum SDNodes commutative and
 associative

Reviewers: aheejin, dschuff

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D53680

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345220 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Target/TargetSelectionDAG.td | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/include/llvm/Target/TargetSelectionDAG.td b/include/llvm/Target/TargetSelectionDAG.td
index b1558b0f347..dfc3ce86217 100644
--- a/include/llvm/Target/TargetSelectionDAG.td
+++ b/include/llvm/Target/TargetSelectionDAG.td
@@ -413,8 +413,10 @@ def fminnum_ieee : SDNode<"ISD::FMINNUM_IEEE", SDTFPBinOp,
                           [SDNPCommutative]>;
 def fmaxnum_ieee  : SDNode<"ISD::FMAXNUM_IEEE", SDTFPBinOp,
                            [SDNPCommutative]>;
-def fminimum   : SDNode<"ISD::FMINIMUM"   , SDTFPBinOp>;
-def fmaximum   : SDNode<"ISD::FMAXIMUM"   , SDTFPBinOp>;
+def fminimum   : SDNode<"ISD::FMINIMUM"   , SDTFPBinOp,
+                        [SDNPCommutative, SDNPAssociative]>;
+def fmaximum   : SDNode<"ISD::FMAXIMUM"   , SDTFPBinOp,
+                        [SDNPCommutative, SDNPAssociative]>;
 def fgetsign   : SDNode<"ISD::FGETSIGN"   , SDTFPToIntOp>;
 def fcanonicalize : SDNode<"ISD::FCANONICALIZE", SDTFPUnaryOp>;
 def fneg       : SDNode<"ISD::FNEG"       , SDTFPUnaryOp>;
-- 
GitLab


From 409b1027e901214eeaac22e84682818169dca482 Mon Sep 17 00:00:00 2001
From: Thomas Lively <tlively@google.com>
Date: Wed, 24 Oct 2018 23:27:40 +0000
Subject: [PATCH 0543/1116] [WebAssembly] Retain shuffle types during custom
 lowering

Summary:
Changing the node type in lowering was violating assumptions made in
the DAG combiner, so don't change the node type any more. This fixes
one of the issues reported in bug 39275.

Reviewers: aheejin, dschuff

Subscribers: sbc100, jgravelle-google, sunfish, llvm-commits, alexcrichton

Differential Revision: https://reviews.llvm.org/D53537

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345221 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../WebAssembly/WebAssemblyISelLowering.cpp     |  2 +-
 lib/Target/WebAssembly/WebAssemblyInstrSIMD.td  |  6 +++---
 .../CodeGen/WebAssembly/simd-nested-shuffles.ll | 17 +++++++++++++++++
 3 files changed, 21 insertions(+), 4 deletions(-)
 create mode 100644 test/CodeGen/WebAssembly/simd-nested-shuffles.ll

diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 4b20404cf61..49fb8404b80 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -1010,7 +1010,7 @@ WebAssemblyTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
     }
   }
 
-  return DAG.getNode(WebAssemblyISD::SHUFFLE, DL, MVT::v16i8, Ops);
+  return DAG.getNode(WebAssemblyISD::SHUFFLE, DL, Op.getValueType(), Ops);
 }
 
 SDValue WebAssemblyTargetLowering::LowerShift(SDValue Op,
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 08bb39748b8..ff6bbab705c 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -350,7 +350,7 @@ def : Pat<(v2f64 (build_vector (f64 F64:$x0), (f64 F64:$x1))),
             (v2f64 (SPLAT_v2f64 (f64 F64:$x0))), 1, F64:$x1))>;
 
 // Shuffle lanes: shuffle
-defm SHUFFLE_v16i8 :
+defm SHUFFLE :
   SIMD_I<(outs V128:$dst),
          (ins V128:$x, V128:$y,
            vec_i8imm_op:$m0, vec_i8imm_op:$m1,
@@ -384,7 +384,7 @@ defm SHUFFLE_v16i8 :
 def wasm_shuffle_t : SDTypeProfile<1, 18, []>;
 def wasm_shuffle : SDNode<"WebAssemblyISD::SHUFFLE", wasm_shuffle_t>;
 foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in {
-def : Pat<(v16i8 (wasm_shuffle (vec_t V128:$x), (vec_t V128:$y),
+def : Pat<(vec_t (wasm_shuffle (vec_t V128:$x), (vec_t V128:$y),
             (i32 LaneIdx32:$m0), (i32 LaneIdx32:$m1),
             (i32 LaneIdx32:$m2), (i32 LaneIdx32:$m3),
             (i32 LaneIdx32:$m4), (i32 LaneIdx32:$m5),
@@ -393,7 +393,7 @@ def : Pat<(v16i8 (wasm_shuffle (vec_t V128:$x), (vec_t V128:$y),
             (i32 LaneIdx32:$mA), (i32 LaneIdx32:$mB),
             (i32 LaneIdx32:$mC), (i32 LaneIdx32:$mD),
             (i32 LaneIdx32:$mE), (i32 LaneIdx32:$mF))),
-          (v16i8 (SHUFFLE_v16i8 (vec_t V128:$x), (vec_t V128:$y),
+          (vec_t (SHUFFLE (vec_t V128:$x), (vec_t V128:$y),
             (i32 LaneIdx32:$m0), (i32 LaneIdx32:$m1),
             (i32 LaneIdx32:$m2), (i32 LaneIdx32:$m3),
             (i32 LaneIdx32:$m4), (i32 LaneIdx32:$m5),
diff --git a/test/CodeGen/WebAssembly/simd-nested-shuffles.ll b/test/CodeGen/WebAssembly/simd-nested-shuffles.ll
new file mode 100644
index 00000000000..51ba5a99be6
--- /dev/null
+++ b/test/CodeGen/WebAssembly/simd-nested-shuffles.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s -mattr=+simd128 | FileCheck %s --check-prefixes CHECK
+
+; Check that shuffles maintain their type when being custom
+; lowered. Regression test for bug 39275.
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+; CHECK: v8x16.shuffle
+define <4 x i32> @foo(<4 x i32> %x) {
+  %1 = shufflevector <4 x i32> %x, <4 x i32> undef,
+    <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef,
+    <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %3 = add <4 x i32> %2, %2
+  ret <4 x i32> %3
+}
-- 
GitLab


From d5d108ee21b796bc6d3d874872bbc51391229b84 Mon Sep 17 00:00:00 2001
From: Heejin Ahn <aheejin@gmail.com>
Date: Wed, 24 Oct 2018 23:31:24 +0000
Subject: [PATCH 0544/1116] [WebAssembly] Fix immediate of rethrow when
 throwing to caller

Summary:
Currently when assigning depths 'rethrow' does not take the whole
control flow stack into accounts but only considers EH pad stacks. When
assigning depth immmediates to rethrows, in normal cases it is done
correctly but when a rethrow instruction throws up to a caller, i.e., we
convert a pseudo RETHROW_TO_CALLER instruction to a rethrow, it
mistakenly compute the whole stack depth.

Reviewers: dschuff

Subscribers: sbc100, jgravelle-google, sunfish, llvm-commits

Differential Revision: https://reviews.llvm.org/D53619

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345223 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp | 2 +-
 test/CodeGen/WebAssembly/cfg-stackify-eh.mir      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
index 93ca670bbdb..a3b3901f019 100644
--- a/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
@@ -767,7 +767,7 @@ void WebAssemblyCFGStackify::rewriteDepthImmediates(MachineFunction &MF) {
       case WebAssembly::RETHROW_TO_CALLER: {
         MachineInstr *Rethrow =
             BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(WebAssembly::RETHROW))
-                .addImm(Stack.size());
+                .addImm(EHPadStack.size());
         MI.eraseFromParent();
         I = MachineBasicBlock::reverse_iterator(Rethrow);
         break;
diff --git a/test/CodeGen/WebAssembly/cfg-stackify-eh.mir b/test/CodeGen/WebAssembly/cfg-stackify-eh.mir
index 9038f68966b..b67579087fa 100644
--- a/test/CodeGen/WebAssembly/cfg-stackify-eh.mir
+++ b/test/CodeGen/WebAssembly/cfg-stackify-eh.mir
@@ -180,7 +180,7 @@ body: |
     RETHROW_TO_CALLER implicit-def $arguments
   ; CHECK-LABEL: bb.7:
     ; CHECK-NEXT: END_TRY
-    ; CHECK: RETHROW 3
+    ; CHECK: RETHROW 0
 
   bb.8:
   ; predecessors: %bb.2, %bb.4
-- 
GitLab


From 57075a5226d7dacf61975147c474aef53cefac7b Mon Sep 17 00:00:00 2001
From: David Blaikie <dblaikie@gmail.com>
Date: Wed, 24 Oct 2018 23:36:29 +0000
Subject: [PATCH 0545/1116] DebugInfo: Reuse common addresses for rnglist base
 address selections

This makes the offsets larger (since they are further from the base
address) but those are in the .dwo - and allows removing addresses and
relocations from the .o file.

This could be built into the AddressPool more fundamentally, perhaps -
when you ask for an AddressPool entry you could say "or give me some
other entry and an offset I need to use" - though what to do about
situations where the first use of an address in a section is not the
earliest address in that section... is tricky.

At least with range addresses we can be fairly sure we've seen the
earliest address first because we see the start address for the
function.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345224 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp | 1 +
 lib/CodeGen/AsmPrinter/DwarfDebug.cpp       | 9 +++++++++
 lib/CodeGen/AsmPrinter/DwarfDebug.h         | 5 +++++
 test/DebugInfo/X86/split-dwarf-v5-ranges.ll | 6 +++---
 4 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index 1d9c1d38a24..a32cd8bc904 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -277,6 +277,7 @@ void DwarfCompileUnit::addRange(RangeSpan Range) {
       (&CURanges.back().getEnd()->getSection() !=
        &Range.getEnd()->getSection())) {
     CURanges.push_back(Range);
+    DD->addSectionLabel(Range.getStart());
     return;
   }
 
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 5f91674d9f0..7f9ef3eba90 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -2171,6 +2171,7 @@ static void emitRangeList(DwarfDebug &DD, AsmPrinter *Asm,
       // the lowest address/range in this object.
       Base = P.second.front()->getStart();
       if (DwarfVersion >= 5) {
+        Base = DD.getSectionLabel(&Base->getSection());
         Asm->OutStreamer->AddComment("DW_RLE_base_addressx");
         Asm->OutStreamer->EmitIntValue(dwarf::DW_RLE_base_addressx, 1);
         Asm->OutStreamer->AddComment("  base address index");
@@ -2623,3 +2624,11 @@ void DwarfDebug::addAccelType(const DICompileUnit &CU, StringRef Name,
 uint16_t DwarfDebug::getDwarfVersion() const {
   return Asm->OutStreamer->getContext().getDwarfVersion();
 }
+
+void DwarfDebug::addSectionLabel(const MCSymbol *Sym) {
+  SectionLabels.insert(std::make_pair(&Sym->getSection(), Sym));
+}
+
+const MCSymbol *DwarfDebug::getSectionLabel(const MCSection *S) {
+  return SectionLabels.find(S)->second;
+}
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h
index b98d9267455..c73d442af2f 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -327,6 +327,8 @@ class DwarfDebug : public DebugHandlerBase {
   /// used to keep track of which types we have emitted type units for.
   DenseMap<const MDNode *, uint64_t> TypeSignatures;
 
+  DenseMap<const MCSection *, const MCSymbol *> SectionLabels;
+
   SmallVector<
       std::pair<std::unique_ptr<DwarfTypeUnit>, const DICompositeType *>, 1>
       TypeUnitsUnderConstruction;
@@ -721,6 +723,9 @@ public:
   bool tuneForLLDB() const { return DebuggerTuning == DebuggerKind::LLDB; }
   bool tuneForSCE() const { return DebuggerTuning == DebuggerKind::SCE; }
   /// @}
+
+  void addSectionLabel(const MCSymbol *Sym);
+  const MCSymbol *getSectionLabel(const MCSection *S);
 };
 
 } // end namespace llvm
diff --git a/test/DebugInfo/X86/split-dwarf-v5-ranges.ll b/test/DebugInfo/X86/split-dwarf-v5-ranges.ll
index 295bbc41ad3..74e94643b9c 100644
--- a/test/DebugInfo/X86/split-dwarf-v5-ranges.ll
+++ b/test/DebugInfo/X86/split-dwarf-v5-ranges.ll
@@ -13,9 +13,9 @@
 ; CHECK: 0x00000004 => 0x00000010
 ; CHECK: ]
 ; CHECK: ranges:
-; CHECK: 0x00000010: [DW_RLE_base_addressx]:  0x0000000000000002
-; CHECK: 0x00000012: [DW_RLE_offset_pair  ]:  0x0000000000000000, 0x000000000000000b => [0x0000000000000001, 0x000000000000000c)
-; CHECK: 0x00000015: [DW_RLE_offset_pair  ]:  0x000000000000000d, 0x0000000000000012 => [0x000000000000000e, 0x0000000000000013)
+; CHECK: 0x00000010: [DW_RLE_base_addressx]:  0x0000000000000000
+; CHECK: 0x00000012: [DW_RLE_offset_pair  ]:  0x0000000000000001, 0x000000000000000c => [0x0000000000000001, 0x000000000000000c)
+; CHECK: 0x00000015: [DW_RLE_offset_pair  ]:  0x000000000000000e, 0x0000000000000013 => [0x000000000000000e, 0x0000000000000013)
 ; CHECK: 0x00000018: [DW_RLE_end_of_list  ]
 
 ; Function Attrs: noinline optnone uwtable
-- 
GitLab


From 3d8e85cd8636a2a7de31a59876b1c72dcff52cf2 Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Wed, 24 Oct 2018 23:52:22 +0000
Subject: [PATCH 0546/1116] [X86] Fix pipeline tests when enabling MIR
 verification, NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345226 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/O0-pipeline.ll | 5 ++++-
 test/CodeGen/X86/O3-pipeline.ll | 5 ++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/test/CodeGen/X86/O0-pipeline.ll b/test/CodeGen/X86/O0-pipeline.ll
index 05c0f358e7b..d9a093b8c59 100644
--- a/test/CodeGen/X86/O0-pipeline.ll
+++ b/test/CodeGen/X86/O0-pipeline.ll
@@ -1,4 +1,7 @@
-; RUN: llc -mtriple=x86_64-- -O0 -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck %s
+; When EXPENSIVE_CHECKS are enabled, the machine verifier appears between each
+; pass. Ignore it with 'grep -v'.
+; RUN: llc -mtriple=x86_64-- -O0 -debug-pass=Structure < %s -o /dev/null 2>&1 \
+; RUN:   | grep -v 'Verify generated machine code' | FileCheck %s
 
 ; REQUIRES: asserts
 
diff --git a/test/CodeGen/X86/O3-pipeline.ll b/test/CodeGen/X86/O3-pipeline.ll
index 93e184c4371..9828d1eeab1 100644
--- a/test/CodeGen/X86/O3-pipeline.ll
+++ b/test/CodeGen/X86/O3-pipeline.ll
@@ -1,4 +1,7 @@
-; RUN: llc -mtriple=x86_64-- -O3 -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck %s
+; When EXPENSIVE_CHECKS are enabled, the machine verifier appears between each
+; pass. Ignore it with 'grep -v'.
+; RUN: llc -mtriple=x86_64-- -O3 -debug-pass=Structure < %s -o /dev/null 2>&1 \
+; RUN:   | grep -v 'Verify generated machine code' | FileCheck %s
 
 ; REQUIRES: asserts
 
-- 
GitLab


From 7b105bc3a6c4795ad428e55713aedfa7edb27911 Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Wed, 24 Oct 2018 23:52:33 +0000
Subject: [PATCH 0547/1116] [X86] Adjust MIR test case to pacify machine
 verifier

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345227 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/PR37310.mir | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/CodeGen/X86/PR37310.mir b/test/CodeGen/X86/PR37310.mir
index a3e17b55c4a..37f400de2e7 100644
--- a/test/CodeGen/X86/PR37310.mir
+++ b/test/CodeGen/X86/PR37310.mir
@@ -79,7 +79,7 @@ registers:
   - { id: 4, class: gr64, preferred-register: '' }
   - { id: 5, class: gr32, preferred-register: '' }
 liveins:         
-  - { reg: '$edi', virtual-reg: '%0' }
+  - { reg: '$edi' }
 frameInfo:       
   isFrameAddressTaken: false
   isReturnAddressTaken: false
-- 
GitLab


From 7c10f9a44d521a0bad17b3899402a54e83040947 Mon Sep 17 00:00:00 2001
From: Thomas Lively <tlively@google.com>
Date: Thu, 25 Oct 2018 01:46:07 +0000
Subject: [PATCH 0548/1116] [WebAssembly] Set LoadExt and TruncStore actions
 for SIMD types

Summary: Fixes part of the problem reported in bug 39275.

Reviewers: aheejin, dschuff

Subscribers: sbc100, jgravelle-google, sunfish, llvm-commits, alexcrichton

Differential Revision: https://reviews.llvm.org/D53542

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345230 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../WebAssembly/WebAssemblyISelLowering.cpp   | 13 ++++
 .../WebAssembly/simd-ext-load-trunc-store.ll  | 60 +++++++++++++++++++
 2 files changed, 73 insertions(+)
 create mode 100644 test/CodeGen/WebAssembly/simd-ext-load-trunc-store.ll

diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 49fb8404b80..0bd2ebdc4bb 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -174,11 +174,24 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
   //  - Floating-point extending loads.
   //  - Floating-point truncating stores.
   //  - i1 extending loads.
+  //  - extending/truncating SIMD loads/stores
   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
   for (auto T : MVT::integer_valuetypes())
     for (auto Ext : {ISD::EXTLOAD, ISD::ZEXTLOAD, ISD::SEXTLOAD})
       setLoadExtAction(Ext, T, MVT::i1, Promote);
+  if (Subtarget->hasSIMD128()) {
+    for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v4f32,
+                   MVT::v2f64}) {
+      for (auto MemT : MVT::vector_valuetypes()) {
+        if (MVT(T) != MemT) {
+          setTruncStoreAction(T, MemT, Expand);
+          for (auto Ext : {ISD::EXTLOAD, ISD::ZEXTLOAD, ISD::SEXTLOAD})
+            setLoadExtAction(Ext, T, MemT, Expand);
+        }
+      }
+    }
+  }
 
   // Trap lowers to wasm unreachable
   setOperationAction(ISD::TRAP, MVT::Other, Legal);
diff --git a/test/CodeGen/WebAssembly/simd-ext-load-trunc-store.ll b/test/CodeGen/WebAssembly/simd-ext-load-trunc-store.ll
new file mode 100644
index 00000000000..f128483cb9a
--- /dev/null
+++ b/test/CodeGen/WebAssembly/simd-ext-load-trunc-store.ll
@@ -0,0 +1,60 @@
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -wasm-enable-unimplemented-simd -mattr=+simd128 | FileCheck %s
+
+; Check that store in memory with smaller lanes are loaded and stored
+; as expected. This is a regression test for part of bug 39275.
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+; CHECK-LABEL: load_ext_2xi32:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: i64.load32_u $push[[L0:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: i64x2.splat $push[[L1:[0-9]+]]=, $pop[[L0]]{{$}}
+; CHECK-NEXT: i64.load32_u $push[[L2:[0-9]+]]=, 4($0){{$}}
+; CHECK-NEXT: i64x2.replace_lane $push[[R:[0-9]+]]=, $pop[[L1]], 1, $pop[[L2]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <2 x i32> @load_ext_2xi32(<2 x i32>* %p) {
+  %1 = load <2 x i32>, <2 x i32>* %p, align 4
+  ret <2 x i32> %1
+}
+
+; CHECK-LABEL: load_zext_2xi32:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: i64.load32_u $push[[L0:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: i64x2.splat $push[[L1:[0-9]+]]=, $pop[[L0]]{{$}}
+; CHECK-NEXT: i64.load32_u $push[[L2:[0-9]+]]=, 4($0){{$}}
+; CHECK-NEXT: i64x2.replace_lane $push[[R:[0-9]+]]=, $pop[[L1]], 1, $pop[[L2]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @load_zext_2xi32(<2 x i32>* %p) {
+  %1 = load <2 x i32>, <2 x i32>* %p, align 4
+  %2 = zext <2 x i32> %1 to <2 x i64>
+  ret <2 x i64> %2
+}
+
+; CHECK-LABEL: load_sext_2xi32:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: i64.load32_s $push[[L0:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: i64x2.splat $push[[L1:[0-9]+]]=, $pop[[L0]]{{$}}
+; CHECK-NEXT: i64.load32_s $push[[L2:[0-9]+]]=, 4($0){{$}}
+; CHECK-NEXT: i64x2.replace_lane $push[[R:[0-9]+]]=, $pop[[L1]], 1, $pop[[L2]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @load_sext_2xi32(<2 x i32>* %p) {
+  %1 = load <2 x i32>, <2 x i32>* %p, align 4
+  %2 = sext <2 x i32> %1 to <2 x i64>
+  ret <2 x i64> %2
+}
+
+; CHECK-LABEL: store_trunc_2xi32:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: i64x2.extract_lane $push[[L0:[0-9]+]]=, $1, 1
+; CHECK-NEXT: i64.store32 4($0), $pop[[L0]]
+; CHECK-NEXT: i64x2.extract_lane $push[[L1:[0-9]+]]=, $1, 0
+; CHECK-NEXT: i64.store32 0($0), $pop[[L1]]
+; CHECK-NEXT: return
+define void @store_trunc_2xi32(<2 x i32>* %p, <2 x i32> %x) {
+  store <2 x i32> %x, <2 x i32>* %p, align 4
+  ret void
+}
-- 
GitLab


From 21435b345c58381e525292b30fd3abe3cbdf7d45 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Thu, 25 Oct 2018 05:00:20 +0000
Subject: [PATCH 0549/1116] [X86] Fix typo in comment. NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345236 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86InterleavedAccess.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/Target/X86/X86InterleavedAccess.cpp b/lib/Target/X86/X86InterleavedAccess.cpp
index 6c7fb9c339a..28940754a20 100644
--- a/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/lib/Target/X86/X86InterleavedAccess.cpp
@@ -463,7 +463,7 @@ static void setGroupSize(MVT VT, SmallVectorImpl<uint32_t> &SizeInfo) {
 //  {DiffToJump,...,VF/2-1,VF,...,DiffToJump+VF-1}.
 //  Imm variable sets the offset amount. The result of the
 //  function is stored inside ShuffleMask vector and it built as described in
-//  the begin of the description. AlignDirection is a boolean that indecat the
+//  the begin of the description. AlignDirection is a boolean that indicates the
 //  direction of the alignment. (false - align to the "right" side while true -
 //  align to the "left" side)
 static void DecodePALIGNRMask(MVT VT, unsigned Imm,
-- 
GitLab


From 5c27f3e8877d39ee57f0d565dbd86731b2709f7d Mon Sep 17 00:00:00 2001
From: Simon Atanasyan <simon@atanasyan.com>
Date: Thu, 25 Oct 2018 05:39:27 +0000
Subject: [PATCH 0550/1116] [llvm-readobj] Print ELF header flags names in GNU
 output

GNU readelf tool prints hex value of the ELF header flags field and the
flags names. This change adds the same functionality to llvm-readobj.
Now llvm-readobj can print MIPS and RISCV flags.

New GNUStyle::printFlags() method is a copy of ScopedPrinter::printFlags()
routine. Probably we can escape code duplication and / or simplify the
printFlags() method. But it's a task for separate commit.

Differential revision: https://reviews.llvm.org/D52027

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345238 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/tools/llvm-readobj/gnu-file-headers.test |  23 +++
 tools/llvm-readobj/ELFDumper.cpp              | 133 +++++++++++-------
 2 files changed, 108 insertions(+), 48 deletions(-)

diff --git a/test/tools/llvm-readobj/gnu-file-headers.test b/test/tools/llvm-readobj/gnu-file-headers.test
index 4b74d0948a3..e246a3d717b 100644
--- a/test/tools/llvm-readobj/gnu-file-headers.test
+++ b/test/tools/llvm-readobj/gnu-file-headers.test
@@ -2,6 +2,8 @@ RUN: llvm-readobj -h %p/Inputs/trivial.obj.elf-i386 --elf-output-style=GNU \
 RUN:   | FileCheck %s -check-prefix ELF32
 RUN: llvm-readobj -h %p/Inputs/trivial.obj.elf-x86-64 --elf-output-style=GNU \
 RUN:   | FileCheck %s -check-prefix ELF64
+RUN: llvm-readobj -h %p/Inputs/trivial.obj.elf-mipsel --elf-output-style=GNU \
+RUN:   | FileCheck %s -check-prefix MIPSEL
 
 ELF32:      ELF Header:
 ELF32-NEXT:  Magic:   7f 45 4c 46 01 01 01 03 00 00 00 00 00 00 00 00
@@ -44,3 +46,24 @@ ELF64-NEXT:  Number of program headers:          0
 ELF64-NEXT:  Size of section headers:           64 (bytes)
 ELF64-NEXT:  Number of section headers:         10
 ELF64-NEXT:  Section header string table index: 7
+
+MIPSEL:     ELF Header:
+MIPSEL-NEXT:  Magic:   7f 45 4c 46 01 01 01 03 00 00 00 00 00 00 00 00
+MIPSEL-NEXT:  Class:                             ELF32
+MIPSEL-NEXT:  Data:                              2's complement, little endian
+MIPSEL-NEXT:  Version:                           1 (current)
+MIPSEL-NEXT:  OS/ABI:                            UNIX - GNU
+MIPSEL-NEXT:  ABI Version:                       0x0
+MIPSEL-NEXT:  Type:                              REL (Relocatable file)
+MIPSEL-NEXT:  Machine:                           MIPS R3000
+MIPSEL-NEXT:  Version:                           0x1
+MIPSEL-NEXT:  Entry point address:               0x0
+MIPSEL-NEXT:  Start of program headers:          0 (bytes into file)
+MIPSEL-NEXT:  Start of section headers:          172 (bytes into file)
+MIPSEL-NEXT:  Flags:                             0x50001000, o32, mips32
+MIPSEL-NEXT:  Size of this header:               52 (bytes)
+MIPSEL-NEXT:  Size of program headers:           0 (bytes)
+MIPSEL-NEXT:  Number of program headers:         0
+MIPSEL-NEXT:  Size of section headers:           40 (bytes)
+MIPSEL-NEXT:  Number of section headers:         9
+MIPSEL-NEXT:  Section header string table index: 6
diff --git a/tools/llvm-readobj/ELFDumper.cpp b/tools/llvm-readobj/ELFDumper.cpp
index bace24fefb9..5e8a35f13a1 100644
--- a/tools/llvm-readobj/ELFDumper.cpp
+++ b/tools/llvm-readobj/ELFDumper.cpp
@@ -390,6 +390,33 @@ private:
     return to_hexString(Value, false);
   }
 
+  template <typename T, typename TEnum>
+  std::string printFlags(T Value, ArrayRef<EnumEntry<TEnum>> EnumValues,
+                         TEnum EnumMask1 = {}, TEnum EnumMask2 = {},
+                         TEnum EnumMask3 = {}) {
+    std::string Str;
+    for (const auto &Flag : EnumValues) {
+      if (Flag.Value == 0)
+        continue;
+
+      TEnum EnumMask{};
+      if (Flag.Value & EnumMask1)
+        EnumMask = EnumMask1;
+      else if (Flag.Value & EnumMask2)
+        EnumMask = EnumMask2;
+      else if (Flag.Value & EnumMask3)
+        EnumMask = EnumMask3;
+      bool IsEnum = (Flag.Value & EnumMask) != 0;
+      if ((!IsEnum && (Value & Flag.Value) == Flag.Value) ||
+          (IsEnum && (Value & EnumMask) == Flag.Value)) {
+        if (!Str.empty())
+          Str += ", ";
+        Str += Flag.AltName;
+      }
+    }
+    return Str;
+  }
+
   formatted_raw_ostream &printField(struct Field F) {
     if (F.Column != 0)
       OS.PadToColumn(F.Column);
@@ -1247,49 +1274,49 @@ static const EnumEntry<unsigned> ElfSegmentFlags[] = {
 };
 
 static const EnumEntry<unsigned> ElfHeaderMipsFlags[] = {
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_NOREORDER),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_PIC),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_CPIC),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_ABI2),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_32BITMODE),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_FP64),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_NAN2008),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_ABI_O32),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_ABI_O64),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_ABI_EABI32),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_ABI_EABI64),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_MACH_3900),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_MACH_4010),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_MACH_4100),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_MACH_4650),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_MACH_4120),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_MACH_4111),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_MACH_SB1),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_MACH_OCTEON),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_MACH_XLR),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_MACH_OCTEON2),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_MACH_OCTEON3),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_MACH_5400),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_MACH_5900),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_MACH_5500),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_MACH_9000),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_MACH_LS2E),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_MACH_LS2F),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_MACH_LS3A),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_MICROMIPS),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_ARCH_ASE_M16),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_ARCH_ASE_MDMX),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_ARCH_1),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_ARCH_2),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_ARCH_3),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_ARCH_4),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_ARCH_5),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_ARCH_32),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_ARCH_64),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_ARCH_32R2),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_ARCH_64R2),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_ARCH_32R6),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_ARCH_64R6)
+  ENUM_ENT(EF_MIPS_NOREORDER, "noreorder"),
+  ENUM_ENT(EF_MIPS_PIC, "pic"),
+  ENUM_ENT(EF_MIPS_CPIC, "cpic"),
+  ENUM_ENT(EF_MIPS_ABI2, "abi2"),
+  ENUM_ENT(EF_MIPS_32BITMODE, "32bitmode"),
+  ENUM_ENT(EF_MIPS_FP64, "fp64"),
+  ENUM_ENT(EF_MIPS_NAN2008, "nan2008"),
+  ENUM_ENT(EF_MIPS_ABI_O32, "o32"),
+  ENUM_ENT(EF_MIPS_ABI_O64, "o64"),
+  ENUM_ENT(EF_MIPS_ABI_EABI32, "eabi32"),
+  ENUM_ENT(EF_MIPS_ABI_EABI64, "eabi64"),
+  ENUM_ENT(EF_MIPS_MACH_3900, "3900"),
+  ENUM_ENT(EF_MIPS_MACH_4010, "4010"),
+  ENUM_ENT(EF_MIPS_MACH_4100, "4100"),
+  ENUM_ENT(EF_MIPS_MACH_4650, "4650"),
+  ENUM_ENT(EF_MIPS_MACH_4120, "4120"),
+  ENUM_ENT(EF_MIPS_MACH_4111, "4111"),
+  ENUM_ENT(EF_MIPS_MACH_SB1, "sb1"),
+  ENUM_ENT(EF_MIPS_MACH_OCTEON, "octeon"),
+  ENUM_ENT(EF_MIPS_MACH_XLR, "xlr"),
+  ENUM_ENT(EF_MIPS_MACH_OCTEON2, "octeon2"),
+  ENUM_ENT(EF_MIPS_MACH_OCTEON3, "octeon3"),
+  ENUM_ENT(EF_MIPS_MACH_5400, "5400"),
+  ENUM_ENT(EF_MIPS_MACH_5900, "5900"),
+  ENUM_ENT(EF_MIPS_MACH_5500, "5500"),
+  ENUM_ENT(EF_MIPS_MACH_9000, "9000"),
+  ENUM_ENT(EF_MIPS_MACH_LS2E, "loongson-2e"),
+  ENUM_ENT(EF_MIPS_MACH_LS2F, "loongson-2f"),
+  ENUM_ENT(EF_MIPS_MACH_LS3A, "loongson-3a"),
+  ENUM_ENT(EF_MIPS_MICROMIPS, "micromips"),
+  ENUM_ENT(EF_MIPS_ARCH_ASE_M16, "mips16"),
+  ENUM_ENT(EF_MIPS_ARCH_ASE_MDMX, "mdmx"),
+  ENUM_ENT(EF_MIPS_ARCH_1, "mips1"),
+  ENUM_ENT(EF_MIPS_ARCH_2, "mips2"),
+  ENUM_ENT(EF_MIPS_ARCH_3, "mips3"),
+  ENUM_ENT(EF_MIPS_ARCH_4, "mips4"),
+  ENUM_ENT(EF_MIPS_ARCH_5, "mips5"),
+  ENUM_ENT(EF_MIPS_ARCH_32, "mips32"),
+  ENUM_ENT(EF_MIPS_ARCH_64, "mips64"),
+  ENUM_ENT(EF_MIPS_ARCH_32R2, "mips32r2"),
+  ENUM_ENT(EF_MIPS_ARCH_64R2, "mips64r2"),
+  ENUM_ENT(EF_MIPS_ARCH_32R6, "mips32r6"),
+  ENUM_ENT(EF_MIPS_ARCH_64R6, "mips64r6")
 };
 
 static const EnumEntry<unsigned> ElfHeaderAMDGPUFlags[] = {
@@ -1330,11 +1357,11 @@ static const EnumEntry<unsigned> ElfHeaderAMDGPUFlags[] = {
 };
 
 static const EnumEntry<unsigned> ElfHeaderRISCVFlags[] = {
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_RISCV_RVC),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_RISCV_FLOAT_ABI_SINGLE),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_RISCV_FLOAT_ABI_DOUBLE),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_RISCV_FLOAT_ABI_QUAD),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_RISCV_RVE)
+  ENUM_ENT(EF_RISCV_RVC, "RVC"),
+  ENUM_ENT(EF_RISCV_FLOAT_ABI_SINGLE, "single-float ABI"),
+  ENUM_ENT(EF_RISCV_FLOAT_ABI_DOUBLE, "double-float ABI"),
+  ENUM_ENT(EF_RISCV_FLOAT_ABI_QUAD, "quad-float ABI"),
+  ENUM_ENT(EF_RISCV_RVE, "RVE")
 };
 
 static const EnumEntry<unsigned> ElfSymOtherFlags[] = {
@@ -2518,7 +2545,17 @@ template <class ELFT> void GNUStyle<ELFT>::printFileHeaders(const ELFO *Obj) {
   printFields(OS, "Start of program headers:", Str);
   Str = to_string(e->e_shoff) + " (bytes into file)";
   printFields(OS, "Start of section headers:", Str);
+  std::string ElfFlags;
+  if (e->e_machine == EM_MIPS)
+    ElfFlags =
+        printFlags(e->e_flags, makeArrayRef(ElfHeaderMipsFlags),
+                   unsigned(ELF::EF_MIPS_ARCH), unsigned(ELF::EF_MIPS_ABI),
+                   unsigned(ELF::EF_MIPS_MACH));
+  else if (e->e_machine == EM_RISCV)
+    ElfFlags = printFlags(e->e_flags, makeArrayRef(ElfHeaderRISCVFlags));
   Str = "0x" + to_hexString(e->e_flags);
+  if (!ElfFlags.empty())
+    Str = Str + ", " + ElfFlags;
   printFields(OS, "Flags:", Str);
   Str = to_string(e->e_ehsize) + " (bytes)";
   printFields(OS, "Size of this header:", Str);
-- 
GitLab


From 42e23ebc6e40be5e6edf47143145c5c34ec23fe0 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Thu, 25 Oct 2018 07:00:09 +0000
Subject: [PATCH 0551/1116] [X86] Don't use the OriginalDemandedBits to
 calculate the DemandedMask for PMULUDQ/PMULDQ inputs.

Multiply a is complex operation so just because some bit of the output isn't used doesn't mean that bit of the input isn't used.

We might able to bound it, but it will require some more thought.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345241 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index d86f9d5a220..f9f8fb4a419 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -31881,7 +31881,8 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
     KnownBits KnownOp;
     SDValue LHS = Op.getOperand(0);
     SDValue RHS = Op.getOperand(1);
-    APInt DemandedMask = OriginalDemandedBits & APInt::getLowBitsSet(64, 32);
+    // FIXME: Can we bound this better?
+    APInt DemandedMask = APInt::getLowBitsSet(64, 32);
     if (SimplifyDemandedBits(LHS, DemandedMask, KnownOp, TLO, Depth + 1))
       return true;
     if (SimplifyDemandedBits(RHS, DemandedMask, KnownOp, TLO, Depth + 1))
-- 
GitLab


From f4fb61b34ba74e17727676877ab3d3cb9efc9f22 Mon Sep 17 00:00:00 2001
From: Clement Courbet <courbet@google.com>
Date: Thu, 25 Oct 2018 07:44:01 +0000
Subject: [PATCH 0552/1116] [MCSched] Bind PFM Counters to the CPUs instead of
 the SchedModel.

Summary:
The pfm counters are now in the ExegesisTarget rather than the
MCSchedModel (PR39165).

This also compresses the pfm counter tables (PR37068).

Reviewers: RKSimon, gchatelet

Subscribers: mgrang, llvm-commits

Differential Revision: https://reviews.llvm.org/D52932

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345243 91177308-0d34-0410-b5e6-96231b3b80d8
---
 docs/CommandGuide/llvm-exegesis.rst        |   4 +
 docs/CommandGuide/tblgen.rst               |   4 +
 include/llvm/MC/MCSchedule.h               |  16 --
 include/llvm/Target/Target.td              |   5 +
 include/llvm/Target/TargetPfmCounters.td   |  46 +++++
 include/llvm/Target/TargetSchedule.td      |  28 ---
 lib/Target/X86/CMakeLists.txt              |   1 +
 lib/Target/X86/X86PfmCounters.td           | 137 +++++++------
 tools/llvm-exegesis/lib/AArch64/Target.cpp |   4 +
 tools/llvm-exegesis/lib/Latency.cpp        |   9 +-
 tools/llvm-exegesis/lib/LlvmState.cpp      |   7 +-
 tools/llvm-exegesis/lib/LlvmState.h        |   8 +-
 tools/llvm-exegesis/lib/Target.cpp         |  27 +++
 tools/llvm-exegesis/lib/Target.h           |  39 ++++
 tools/llvm-exegesis/lib/Uops.cpp           |  20 +-
 tools/llvm-exegesis/lib/X86/Target.cpp     |   6 +
 tools/llvm-exegesis/llvm-exegesis.cpp      |  11 +-
 utils/TableGen/CMakeLists.txt              |   1 +
 utils/TableGen/CodeGenSchedule.cpp         |  31 +--
 utils/TableGen/CodeGenSchedule.h           |  12 +-
 utils/TableGen/CodeGenTarget.cpp           |   1 -
 utils/TableGen/ExegesisEmitter.cpp         | 212 +++++++++++++++++++++
 utils/TableGen/SubtargetEmitter.cpp        |  86 +--------
 utils/TableGen/TableGen.cpp                |   8 +-
 utils/TableGen/TableGenBackends.h          |   1 +
 25 files changed, 477 insertions(+), 247 deletions(-)
 create mode 100644 include/llvm/Target/TargetPfmCounters.td
 create mode 100644 utils/TableGen/ExegesisEmitter.cpp

diff --git a/docs/CommandGuide/llvm-exegesis.rst b/docs/CommandGuide/llvm-exegesis.rst
index bf21563722f..f27db9e57ed 100644
--- a/docs/CommandGuide/llvm-exegesis.rst
+++ b/docs/CommandGuide/llvm-exegesis.rst
@@ -224,6 +224,10 @@ OPTIONS
 
  If set, ignore instructions that do not have a sched class (class idx = 0).
 
+ .. option:: -mcpu=<cpu name>
+
+  If set, measure the cpu characteristics using the counters for this CPU. This
+  is useful when creating new sched models (the host CPU is unknown to LLVM).
 
 EXIT STATUS
 -----------
diff --git a/docs/CommandGuide/tblgen.rst b/docs/CommandGuide/tblgen.rst
index 55b54294846..3105e0c8076 100644
--- a/docs/CommandGuide/tblgen.rst
+++ b/docs/CommandGuide/tblgen.rst
@@ -130,6 +130,10 @@ OPTIONS
 
  Generate enhanced disassembly info.
 
+.. option:: -gen-exegesis
+
+ Generate llvm-exegesis tables.
+
 .. option:: -version
 
  Show the version number of this program.
diff --git a/include/llvm/MC/MCSchedule.h b/include/llvm/MC/MCSchedule.h
index 8990c2e3c0d..41305296b00 100644
--- a/include/llvm/MC/MCSchedule.h
+++ b/include/llvm/MC/MCSchedule.h
@@ -183,22 +183,6 @@ struct MCExtraProcessorInfo {
   unsigned NumRegisterFiles;
   const MCRegisterCostEntry *RegisterCostTable;
   unsigned NumRegisterCostEntries;
-
-  struct PfmCountersInfo {
-    // An optional name of a performance counter that can be used to measure
-    // cycles.
-    const char *CycleCounter;
-
-    // An optional name of a performance counter that can be used to measure
-    // uops.
-    const char *UopsCounter;
-
-    // For each MCProcResourceDesc defined by the processor, an optional list of
-    // names of performance counters that can be used to measure the resource
-    // utilization.
-    const char **IssueCounters;
-  };
-  PfmCountersInfo PfmCounters;
 };
 
 /// Machine model for scheduling, bundling, and heuristics.
diff --git a/include/llvm/Target/Target.td b/include/llvm/Target/Target.td
index 538605a57ab..c2c56b0aca1 100644
--- a/include/llvm/Target/Target.td
+++ b/include/llvm/Target/Target.td
@@ -1555,3 +1555,8 @@ include "llvm/Target/GlobalISel/Target.td"
 // Pull in the common support for the Global ISel DAG-based selector generation.
 //
 include "llvm/Target/GlobalISel/SelectionDAGCompat.td"
+
+//===----------------------------------------------------------------------===//
+// Pull in the common support for Pfm Counters generation.
+//
+include "llvm/Target/TargetPfmCounters.td"
diff --git a/include/llvm/Target/TargetPfmCounters.td b/include/llvm/Target/TargetPfmCounters.td
new file mode 100644
index 00000000000..0a55a558f30
--- /dev/null
+++ b/include/llvm/Target/TargetPfmCounters.td
@@ -0,0 +1,46 @@
+//===- TargetPfmCounters.td - Target Pfm Counters -*- tablegen ----------*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the target-independent interfaces for performance counters.
+
+// Definition of a hardware counters from libpfm identifiers.
+class PfmCounter<string counter> {
+  // The name of the counter that measures events.
+  // The name can be "some_counter + some_other_counter", in which case the
+  // measured value is the sum of events on these counters.
+  string Counter = counter;
+}
+
+// Issue counters can be tied to a ProcResource
+class PfmIssueCounter<string resource_name, string counter>
+    : PfmCounter<counter> {
+  // The name of the ProcResource on which uops are issued. This is used by
+  // llvm-exegesis to compare measurements with values in the SchedModels.
+  // If the CPU has a sched model, this should correspond to the name of a
+  // ProcResource.
+  string ResourceName = resource_name;
+}
+
+def NoPfmCounter : PfmCounter <""> {}
+
+// Set of PfmCounters for measuring sched model characteristics.
+class ProcPfmCounters {
+  // Processors can define how to measure cycles by defining a CycleCounter.
+  PfmCounter CycleCounter = NoPfmCounter;
+  // Processors can define how to measure uops by defining a UopsCounter.
+  PfmCounter UopsCounter = NoPfmCounter;
+  // Processors can define how to measure issued uops by defining IssueCounters.
+  list<PfmIssueCounter> IssueCounters = [];
+}
+
+// A binding of a set of counters to a CPU.
+class PfmCountersBinding<string cpu_name, ProcPfmCounters counters> {
+  string CpuName = cpu_name;
+  ProcPfmCounters Counters = counters;
+}
diff --git a/include/llvm/Target/TargetSchedule.td b/include/llvm/Target/TargetSchedule.td
index 77b1927f932..141e0669388 100644
--- a/include/llvm/Target/TargetSchedule.td
+++ b/include/llvm/Target/TargetSchedule.td
@@ -557,31 +557,3 @@ class RetireControlUnit<int bufferSize, int retirePerCycle> {
   int MaxRetirePerCycle = retirePerCycle;
   SchedMachineModel SchedModel = ?;
 }
-
-// Allow the definition of hardware counters.
-class PfmCounter {
-  SchedMachineModel SchedModel = ?;
-}
-
-// Each processor can define how to measure cycles by defining a
-// PfmCycleCounter.
-class PfmCycleCounter<string counter> : PfmCounter {
-  string Counter = counter;
-}
-
-// Each ProcResourceUnits can define how to measure issued uops by defining
-// a PfmIssueCounter.
-class PfmIssueCounter<ProcResourceUnits resource, list<string> counters>
-    : PfmCounter{
-  // The resource units on which uops are issued.
-  ProcResourceUnits Resource = resource;
-  // The list of counters that measure issue events.
-  list<string> Counters = counters;
-}
-
-// Each processor can define how to measure NumMicroOps by defining a
-// PfmUopsCounter.
-class PfmUopsCounter<string counter> : PfmCounter {
-  string Counter = counter;
-}
-
diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt
index 4495bc20618..5ded1f971a0 100644
--- a/lib/Target/X86/CMakeLists.txt
+++ b/lib/Target/X86/CMakeLists.txt
@@ -13,6 +13,7 @@ tablegen(LLVM X86GenInstrInfo.inc -gen-instr-info)
 tablegen(LLVM X86GenRegisterBank.inc -gen-register-bank)
 tablegen(LLVM X86GenRegisterInfo.inc -gen-register-info)
 tablegen(LLVM X86GenSubtargetInfo.inc -gen-subtarget)
+tablegen(LLVM X86GenExegesis.inc -gen-exegesis)
 
 if (X86_GEN_FOLD_TABLES)
   tablegen(LLVM X86GenFoldTables.inc -gen-x86-fold-tables)
diff --git a/lib/Target/X86/X86PfmCounters.td b/lib/Target/X86/X86PfmCounters.td
index 684cadd4962..9e0f0c4f64a 100644
--- a/lib/Target/X86/X86PfmCounters.td
+++ b/lib/Target/X86/X86PfmCounters.td
@@ -11,73 +11,92 @@
 //
 //===----------------------------------------------------------------------===//
 
-let SchedModel = SandyBridgeModel in {
-def SBCycleCounter : PfmCycleCounter<"unhalted_core_cycles">;
-def SBPort0Counter : PfmIssueCounter<SBPort0, ["uops_dispatched_port:port_0"]>;
-def SBPort1Counter : PfmIssueCounter<SBPort1, ["uops_dispatched_port:port_1"]>;
-def SBPort23Counter : PfmIssueCounter<SBPort23,
-                                      ["uops_dispatched_port:port_2",
-                                       "uops_dispatched_port:port_3"]>;
-def SBPort4Counter : PfmIssueCounter<SBPort4, ["uops_dispatched_port:port_4"]>;
-def SBPort5Counter : PfmIssueCounter<SBPort5, ["uops_dispatched_port:port_5"]>;
-def SBUopsCounter  : PfmUopsCounter<"uops_issued:any">;
+def UnhaltedCoreCyclesPfmCounter : PfmCounter<"unhalted_core_cycles">;
+def UopsIssuedPfmCounter : PfmCounter<"uops_issued:any">;
+
+def SandyBridgePfmCounters : ProcPfmCounters {
+  let CycleCounter = UnhaltedCoreCyclesPfmCounter;
+  let UopsCounter = UopsIssuedPfmCounter;
+  let IssueCounters = [
+    PfmIssueCounter<"SBPort0",  "uops_dispatched_port:port_0">,
+    PfmIssueCounter<"SBPort1",  "uops_dispatched_port:port_1">,
+    PfmIssueCounter<"SBPort23", "uops_dispatched_port:port_2 + uops_dispatched_port:port_3">,
+    PfmIssueCounter<"SBPort4",  "uops_dispatched_port:port_4">,
+    PfmIssueCounter<"SBPort5",  "uops_dispatched_port:port_5">
+  ];
 }
+def : PfmCountersBinding<"sandybridge", SandyBridgePfmCounters>;
 
-let SchedModel = HaswellModel in {
-def HWCycleCounter : PfmCycleCounter<"unhalted_core_cycles">;
-def HWPort0Counter : PfmIssueCounter<HWPort0, ["uops_dispatched_port:port_0"]>;
-def HWPort1Counter : PfmIssueCounter<HWPort1, ["uops_dispatched_port:port_1"]>;
-def HWPort2Counter : PfmIssueCounter<HWPort2, ["uops_dispatched_port:port_2"]>;
-def HWPort3Counter : PfmIssueCounter<HWPort3, ["uops_dispatched_port:port_3"]>;
-def HWPort4Counter : PfmIssueCounter<HWPort4, ["uops_dispatched_port:port_4"]>;
-def HWPort5Counter : PfmIssueCounter<HWPort5, ["uops_dispatched_port:port_5"]>;
-def HWPort6Counter : PfmIssueCounter<HWPort6, ["uops_dispatched_port:port_6"]>;
-def HWPort7Counter : PfmIssueCounter<HWPort7, ["uops_dispatched_port:port_7"]>;
-def HWUopsCounter  : PfmUopsCounter<"uops_issued:any">;
+def HaswellPfmCounters : ProcPfmCounters {
+  let CycleCounter = UnhaltedCoreCyclesPfmCounter;
+  let UopsCounter = UopsIssuedPfmCounter;
+  let IssueCounters = [
+    PfmIssueCounter<"HWPort0", "uops_dispatched_port:port_0">,
+    PfmIssueCounter<"HWPort1", "uops_dispatched_port:port_1">,
+    PfmIssueCounter<"HWPort2", "uops_dispatched_port:port_2">,
+    PfmIssueCounter<"HWPort3", "uops_dispatched_port:port_3">,
+    PfmIssueCounter<"HWPort4", "uops_dispatched_port:port_4">,
+    PfmIssueCounter<"HWPort5", "uops_dispatched_port:port_5">,
+    PfmIssueCounter<"HWPort6", "uops_dispatched_port:port_6">,
+    PfmIssueCounter<"HWPort7", "uops_dispatched_port:port_7">
+  ];
 }
+def : PfmCountersBinding<"haswell", HaswellPfmCounters>;
 
-let SchedModel = BroadwellModel in {
-def BWCycleCounter : PfmCycleCounter<"unhalted_core_cycles">;
-def BWPort0Counter : PfmIssueCounter<BWPort0, ["uops_executed_port:port_0"]>;
-def BWPort1Counter : PfmIssueCounter<BWPort1, ["uops_executed_port:port_1"]>;
-def BWPort2Counter : PfmIssueCounter<BWPort2, ["uops_executed_port:port_2"]>;
-def BWPort3Counter : PfmIssueCounter<BWPort3, ["uops_executed_port:port_3"]>;
-def BWPort4Counter : PfmIssueCounter<BWPort4, ["uops_executed_port:port_4"]>;
-def BWPort5Counter : PfmIssueCounter<BWPort5, ["uops_executed_port:port_5"]>;
-def BWPort6Counter : PfmIssueCounter<BWPort6, ["uops_executed_port:port_6"]>;
-def BWPort7Counter : PfmIssueCounter<BWPort7, ["uops_executed_port:port_7"]>;
-def BWUopsCounter  : PfmUopsCounter<"uops_issued:any">;
+def BroadwellPfmCounters : ProcPfmCounters {
+  let CycleCounter = UnhaltedCoreCyclesPfmCounter;
+  let UopsCounter = UopsIssuedPfmCounter;
+  let IssueCounters = [
+    PfmIssueCounter<"BWPort0", "uops_executed_port:port_0">,
+    PfmIssueCounter<"BWPort1", "uops_executed_port:port_1">,
+    PfmIssueCounter<"BWPort2", "uops_executed_port:port_2">,
+    PfmIssueCounter<"BWPort3", "uops_executed_port:port_3">,
+    PfmIssueCounter<"BWPort4", "uops_executed_port:port_4">,
+    PfmIssueCounter<"BWPort5", "uops_executed_port:port_5">,
+    PfmIssueCounter<"BWPort6", "uops_executed_port:port_6">,
+    PfmIssueCounter<"BWPort7", "uops_executed_port:port_7">
+  ];
 }
+def : PfmCountersBinding<"broadwell", BroadwellPfmCounters>;
 
-let SchedModel = SkylakeClientModel in {
-def SKLCycleCounter : PfmCycleCounter<"unhalted_core_cycles">;
-def SKLPort0Counter : PfmIssueCounter<SKLPort0, ["uops_dispatched_port:port_0"]>;
-def SKLPort1Counter : PfmIssueCounter<SKLPort1, ["uops_dispatched_port:port_1"]>;
-def SKLPort2Counter : PfmIssueCounter<SKLPort2, ["uops_dispatched_port:port_2"]>;
-def SKLPort3Counter : PfmIssueCounter<SKLPort3, ["uops_dispatched_port:port_3"]>;
-def SKLPort4Counter : PfmIssueCounter<SKLPort4, ["uops_dispatched_port:port_4"]>;
-def SKLPort5Counter : PfmIssueCounter<SKLPort5, ["uops_dispatched_port:port_5"]>;
-def SKLPort6Counter : PfmIssueCounter<SKLPort6, ["uops_dispatched_port:port_6"]>;
-def SKLPort7Counter : PfmIssueCounter<SKLPort7, ["uops_dispatched_port:port_7"]>;
-def SKLUopsCounter  : PfmUopsCounter<"uops_issued:any">;
+def SkylakeClientPfmCounters : ProcPfmCounters {
+  let CycleCounter = UnhaltedCoreCyclesPfmCounter;
+  let UopsCounter = UopsIssuedPfmCounter;
+  let IssueCounters = [
+    PfmIssueCounter<"SKLPort0", "uops_dispatched_port:port_0">,
+    PfmIssueCounter<"SKLPort1", "uops_dispatched_port:port_1">,
+    PfmIssueCounter<"SKLPort2", "uops_dispatched_port:port_2">,
+    PfmIssueCounter<"SKLPort3", "uops_dispatched_port:port_3">,
+    PfmIssueCounter<"SKLPort4", "uops_dispatched_port:port_4">,
+    PfmIssueCounter<"SKLPort5", "uops_dispatched_port:port_5">,
+    PfmIssueCounter<"SKLPort6", "uops_dispatched_port:port_6">,
+    PfmIssueCounter<"SKLPort7", "uops_dispatched_port:port_7">
+  ];
 }
+def : PfmCountersBinding<"skylake", SkylakeClientPfmCounters>;
 
-let SchedModel = SkylakeServerModel in {
-def SKXCycleCounter : PfmCycleCounter<"unhalted_core_cycles">;
-def SKXPort0Counter : PfmIssueCounter<SKXPort0, ["uops_dispatched_port:port_0"]>;
-def SKXPort1Counter : PfmIssueCounter<SKXPort1, ["uops_dispatched_port:port_1"]>;
-def SKXPort2Counter : PfmIssueCounter<SKXPort2, ["uops_dispatched_port:port_2"]>;
-def SKXPort3Counter : PfmIssueCounter<SKXPort3, ["uops_dispatched_port:port_3"]>;
-def SKXPort4Counter : PfmIssueCounter<SKXPort4, ["uops_dispatched_port:port_4"]>;
-def SKXPort5Counter : PfmIssueCounter<SKXPort5, ["uops_dispatched_port:port_5"]>;
-def SKXPort6Counter : PfmIssueCounter<SKXPort6, ["uops_dispatched_port:port_6"]>;
-def SKXPort7Counter : PfmIssueCounter<SKXPort7, ["uops_dispatched_port:port_7"]>;
-def SKXUopsCounter  : PfmUopsCounter<"uops_issued:any">;
+def SkylakeServerPfmCounters : ProcPfmCounters {
+  let CycleCounter = UnhaltedCoreCyclesPfmCounter;
+  let UopsCounter = UopsIssuedPfmCounter;
+  let IssueCounters = [
+    PfmIssueCounter<"SKXPort0", "uops_dispatched_port:port_0">,
+    PfmIssueCounter<"SKXPort1", "uops_dispatched_port:port_1">,
+    PfmIssueCounter<"SKXPort2", "uops_dispatched_port:port_2">,
+    PfmIssueCounter<"SKXPort3", "uops_dispatched_port:port_3">,
+    PfmIssueCounter<"SKXPort4", "uops_dispatched_port:port_4">,
+    PfmIssueCounter<"SKXPort5", "uops_dispatched_port:port_5">,
+    PfmIssueCounter<"SKXPort6", "uops_dispatched_port:port_6">,
+    PfmIssueCounter<"SKXPort7", "uops_dispatched_port:port_7">
+  ];
 }
+def : PfmCountersBinding<"skylake-avx512", SkylakeServerPfmCounters>;
 
-let SchedModel = BtVer2Model in {
-def JCycleCounter : PfmCycleCounter<"cpu_clk_unhalted">;
-def JUopsCounter  : PfmUopsCounter<"retired_uops">;
-def JFPU0Counter  : PfmIssueCounter<JFPU0, ["dispatched_fpu:pipe0"]>;
-def JFPU1Counter  : PfmIssueCounter<JFPU1, ["dispatched_fpu:pipe1"]>;
+def BtVer2PfmCounters : ProcPfmCounters {
+  let CycleCounter = PfmCounter<"cpu_clk_unhalted">;
+  let UopsCounter = PfmCounter<"retired_uops">;
+  let IssueCounters = [
+    PfmIssueCounter<"JFPU0", "dispatched_fpu:pipe0">,
+    PfmIssueCounter<"JFPU1", "dispatched_fpu:pipe1">
+  ];
 }
+def : PfmCountersBinding<"btver2", BtVer2PfmCounters>;
diff --git a/tools/llvm-exegesis/lib/AArch64/Target.cpp b/tools/llvm-exegesis/lib/AArch64/Target.cpp
index be8f0b41ede..0197420f433 100644
--- a/tools/llvm-exegesis/lib/AArch64/Target.cpp
+++ b/tools/llvm-exegesis/lib/AArch64/Target.cpp
@@ -53,6 +53,10 @@ static llvm::MCInst loadImmediate(unsigned Reg, unsigned RegBitWidth,
 } // namespace
 
 class ExegesisAArch64Target : public ExegesisTarget {
+public:
+  ExegesisAArch64Target() : ExegesisTarget({}) {}
+
+private:
   std::vector<llvm::MCInst> setRegTo(const llvm::MCSubtargetInfo &STI,
                                      unsigned Reg,
                                      const llvm::APInt &Value) const override {
diff --git a/tools/llvm-exegesis/lib/Latency.cpp b/tools/llvm-exegesis/lib/Latency.cpp
index 602b379faf3..3d18e37f4c3 100644
--- a/tools/llvm-exegesis/lib/Latency.cpp
+++ b/tools/llvm-exegesis/lib/Latency.cpp
@@ -12,6 +12,8 @@
 #include "Assembler.h"
 #include "BenchmarkRunner.h"
 #include "MCInstrDescView.h"
+#include "PerfHelper.h"
+#include "Target.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstBuilder.h"
@@ -165,12 +167,7 @@ LatencySnippetGenerator::generateCodeTemplates(const Instruction &Instr) const {
 }
 
 const char *LatencyBenchmarkRunner::getCounterName() const {
-  if (!State.getSubtargetInfo().getSchedModel().hasExtraProcessorInfo())
-    llvm::report_fatal_error("sched model is missing extra processor info!");
-  const char *CounterName = State.getSubtargetInfo()
-                                .getSchedModel()
-                                .getExtraProcessorInfo()
-                                .PfmCounters.CycleCounter;
+  const char *CounterName = State.getPfmCounters().CycleCounter;
   if (!CounterName)
     llvm::report_fatal_error("sched model does not define a cycle counter");
   return CounterName;
diff --git a/tools/llvm-exegesis/lib/LlvmState.cpp b/tools/llvm-exegesis/lib/LlvmState.cpp
index 58e9db315d5..b5580c83cf5 100644
--- a/tools/llvm-exegesis/lib/LlvmState.cpp
+++ b/tools/llvm-exegesis/lib/LlvmState.cpp
@@ -36,14 +36,17 @@ LLVMState::LLVMState(const std::string &Triple, const std::string &CpuName) {
     llvm::errs() << "no exegesis target for " << Triple << ", using default\n";
     TheExegesisTarget = &ExegesisTarget::getDefault();
   }
+  PfmCounters = &TheExegesisTarget->getPfmCounters(CpuName);
+
   RATC.reset(new RegisterAliasingTrackerCache(
       getRegInfo(), getFunctionReservedRegs(getTargetMachine())));
   IC.reset(new InstructionsCache(getInstrInfo(), getRATC()));
 }
 
-LLVMState::LLVMState()
+LLVMState::LLVMState(const std::string &CpuName)
     : LLVMState(llvm::sys::getProcessTriple(),
-                llvm::sys::getHostCPUName().str()) {}
+                CpuName.empty() ? llvm::sys::getHostCPUName().str() : CpuName) {
+}
 
 std::unique_ptr<llvm::LLVMTargetMachine>
 LLVMState::createTargetMachine() const {
diff --git a/tools/llvm-exegesis/lib/LlvmState.h b/tools/llvm-exegesis/lib/LlvmState.h
index 918738551d0..be1e7979a17 100644
--- a/tools/llvm-exegesis/lib/LlvmState.h
+++ b/tools/llvm-exegesis/lib/LlvmState.h
@@ -30,12 +30,14 @@ namespace llvm {
 namespace exegesis {
 
 class ExegesisTarget;
+class PfmCountersInfo;
 
 // An object to initialize LLVM and prepare objects needed to run the
 // measurements.
 class LLVMState {
 public:
-  LLVMState();
+  // Uses the host triple. If CpuName is empty, uses the host CPU.
+  LLVMState(const std::string &CpuName);
 
   LLVMState(const std::string &Triple,
             const std::string &CpuName); // For tests.
@@ -57,14 +59,18 @@ public:
   const llvm::MCSubtargetInfo &getSubtargetInfo() const {
     return *TargetMachine->getMCSubtargetInfo();
   }
+
   const RegisterAliasingTrackerCache &getRATC() const { return *RATC; }
   const InstructionsCache &getIC() const { return *IC; }
 
+  const PfmCountersInfo &getPfmCounters() const { return *PfmCounters; }
+
 private:
   const ExegesisTarget *TheExegesisTarget;
   std::unique_ptr<const llvm::TargetMachine> TargetMachine;
   std::unique_ptr<const RegisterAliasingTrackerCache> RATC;
   std::unique_ptr<const InstructionsCache> IC;
+  const PfmCountersInfo *PfmCounters;
 };
 
 } // namespace exegesis
diff --git a/tools/llvm-exegesis/lib/Target.cpp b/tools/llvm-exegesis/lib/Target.cpp
index b7828a13da0..588c40e8c7f 100644
--- a/tools/llvm-exegesis/lib/Target.cpp
+++ b/tools/llvm-exegesis/lib/Target.cpp
@@ -85,10 +85,37 @@ ExegesisTarget::createUopsBenchmarkRunner(const LLVMState &State) const {
   return llvm::make_unique<UopsBenchmarkRunner>(State);
 }
 
+static_assert(std::is_pod<PfmCountersInfo>::value,
+              "We shouldn't have dynamic initialization here");
+const PfmCountersInfo PfmCountersInfo::Default = {nullptr, nullptr, nullptr};
+
+const PfmCountersInfo &
+ExegesisTarget::getPfmCounters(llvm::StringRef CpuName) const {
+  assert(std::is_sorted(
+             CpuPfmCounters.begin(), CpuPfmCounters.end(),
+             [](const CpuAndPfmCounters &LHS, const CpuAndPfmCounters &RHS) {
+               return strcmp(LHS.CpuName, RHS.CpuName) < 0;
+             }) &&
+         "CpuPfmCounters table is not sorted");
+
+  // Find entry
+  auto Found =
+      std::lower_bound(CpuPfmCounters.begin(), CpuPfmCounters.end(), CpuName);
+  if (Found == CpuPfmCounters.end() ||
+      llvm::StringRef(Found->CpuName) != CpuName) {
+    return PfmCountersInfo::Default;
+  }
+  assert(Found->PCI && "Missing counters");
+  return *Found->PCI;
+}
+
 namespace {
 
 // Default implementation.
 class ExegesisDefaultTarget : public ExegesisTarget {
+public:
+  ExegesisDefaultTarget() : ExegesisTarget({}) {}
+
 private:
   std::vector<llvm::MCInst> setRegTo(const llvm::MCSubtargetInfo &STI,
                                      unsigned Reg,
diff --git a/tools/llvm-exegesis/lib/Target.h b/tools/llvm-exegesis/lib/Target.h
index 2e94727d78d..a6ec36bebb3 100644
--- a/tools/llvm-exegesis/lib/Target.h
+++ b/tools/llvm-exegesis/lib/Target.h
@@ -31,8 +31,42 @@
 namespace llvm {
 namespace exegesis {
 
+struct PfmCountersInfo {
+  // An optional name of a performance counter that can be used to measure
+  // cycles.
+  const char *const CycleCounter;
+
+  // An optional name of a performance counter that can be used to measure
+  // uops.
+  const char *const UopsCounter;
+
+  // An IssueCounter specifies how to measure uops issued to specific proc
+  // resources.
+  struct IssueCounter {
+    const char *const Counter;
+    // The name of the ProcResource that this counter measures.
+    const char *const ProcResName;
+  };
+  // An optional list of IssueCounters.
+  const IssueCounter *const IssueCounters;
+  const unsigned NumIssueCounters;
+
+  static const PfmCountersInfo Default;
+};
+
+struct CpuAndPfmCounters {
+  const char *const CpuName;
+  const PfmCountersInfo *const PCI;
+  bool operator<(llvm::StringRef S) const {
+    return llvm::StringRef(CpuName) < S;
+  }
+};
+
 class ExegesisTarget {
 public:
+  explicit ExegesisTarget(llvm::ArrayRef<CpuAndPfmCounters> CpuPfmCounters)
+      : CpuPfmCounters(CpuPfmCounters) {}
+
   // Targets can use this to add target-specific passes in assembleToStream();
   virtual void addTargetSpecificPasses(llvm::PassManagerBase &PM) const {}
 
@@ -83,6 +117,10 @@ public:
 
   virtual ~ExegesisTarget();
 
+  // Returns the Pfm counters for the given CPU (or the default if no pfm
+  // counters are defined for this CPU).
+  const PfmCountersInfo &getPfmCounters(llvm::StringRef CpuName) const;
+
 private:
   virtual bool matchesArch(llvm::Triple::ArchType Arch) const = 0;
 
@@ -98,6 +136,7 @@ private:
       const LLVMState &State) const;
 
   const ExegesisTarget *Next = nullptr;
+  const llvm::ArrayRef<CpuAndPfmCounters> CpuPfmCounters;
 };
 
 } // namespace exegesis
diff --git a/tools/llvm-exegesis/lib/Uops.cpp b/tools/llvm-exegesis/lib/Uops.cpp
index 5aa726218c7..9768f4533f7 100644
--- a/tools/llvm-exegesis/lib/Uops.cpp
+++ b/tools/llvm-exegesis/lib/Uops.cpp
@@ -223,24 +223,22 @@ UopsSnippetGenerator::generateCodeTemplates(const Instruction &Instr) const {
 
 llvm::Expected<std::vector<BenchmarkMeasure>>
 UopsBenchmarkRunner::runMeasurements(const FunctionExecutor &Executor) const {
-  const auto &SchedModel = State.getSubtargetInfo().getSchedModel();
-
   std::vector<BenchmarkMeasure> Result;
-  const auto &PfmCounters = SchedModel.getExtraProcessorInfo().PfmCounters;
+  const PfmCountersInfo &PCI = State.getPfmCounters();
   // Uops per port.
-  for (unsigned ProcResIdx = 1;
-       ProcResIdx < SchedModel.getNumProcResourceKinds(); ++ProcResIdx) {
-    const char *const Counters = PfmCounters.IssueCounters[ProcResIdx];
-    if (!Counters)
+  for (const auto *IssueCounter = PCI.IssueCounters,
+                  *IssueCounterEnd = PCI.IssueCounters + PCI.NumIssueCounters;
+       IssueCounter != IssueCounterEnd; ++IssueCounter) {
+    if (!IssueCounter->Counter)
       continue;
-    auto ExpectedCounterValue = Executor.runAndMeasure(Counters);
+    auto ExpectedCounterValue = Executor.runAndMeasure(IssueCounter->Counter);
     if (!ExpectedCounterValue)
       return ExpectedCounterValue.takeError();
-    Result.push_back(BenchmarkMeasure::Create(
-        SchedModel.getProcResource(ProcResIdx)->Name, *ExpectedCounterValue));
+    Result.push_back(BenchmarkMeasure::Create(IssueCounter->ProcResName,
+                                              *ExpectedCounterValue));
   }
   // NumMicroOps.
-  if (const char *const UopsCounter = PfmCounters.UopsCounter) {
+  if (const char *const UopsCounter = PCI.UopsCounter) {
     auto ExpectedCounterValue = Executor.runAndMeasure(UopsCounter);
     if (!ExpectedCounterValue)
       return ExpectedCounterValue.takeError();
diff --git a/tools/llvm-exegesis/lib/X86/Target.cpp b/tools/llvm-exegesis/lib/X86/Target.cpp
index 69804849e62..6ae228e1124 100644
--- a/tools/llvm-exegesis/lib/X86/Target.cpp
+++ b/tools/llvm-exegesis/lib/X86/Target.cpp
@@ -329,7 +329,13 @@ private:
   std::vector<llvm::MCInst> Instructions;
 };
 
+#include "X86GenExegesis.inc"
+
 class ExegesisX86Target : public ExegesisTarget {
+public:
+  ExegesisX86Target() : ExegesisTarget(X86CpuPfmCounters) {}
+
+private:
   void addTargetSpecificPasses(llvm::PassManagerBase &PM) const override {
     // Lowers FP pseudo-instructions, e.g. ABS_Fp32 -> ABS_F.
     PM.add(llvm::createX86FloatingPointStackifierPass());
diff --git a/tools/llvm-exegesis/llvm-exegesis.cpp b/tools/llvm-exegesis/llvm-exegesis.cpp
index 689a1e097c6..a28e68ec006 100644
--- a/tools/llvm-exegesis/llvm-exegesis.cpp
+++ b/tools/llvm-exegesis/llvm-exegesis.cpp
@@ -94,6 +94,13 @@ static cl::opt<std::string>
     AnalysisInconsistenciesOutputFile("analysis-inconsistencies-output-file",
                                       cl::desc(""), cl::init("-"));
 
+static cl::opt<std::string>
+    CpuName("mcpu",
+            cl::desc(
+                "cpu name to use for pfm counters, leave empty to autodetect"),
+            cl::init(""));
+
+
 static ExitOnError ExitOnErr;
 
 #ifdef LLVM_EXEGESIS_INITIALIZE_NATIVE_TARGET
@@ -321,7 +328,7 @@ void benchmarkMain() {
   LLVM_EXEGESIS_INITIALIZE_NATIVE_TARGET();
 #endif
 
-  const LLVMState State;
+  const LLVMState State(CpuName);
   const auto Opcodes = getOpcodesOrDie(State.getInstrInfo());
 
   std::vector<BenchmarkCode> Configurations;
@@ -399,7 +406,7 @@ static void analysisMain() {
   llvm::InitializeNativeTargetAsmPrinter();
   llvm::InitializeNativeTargetDisassembler();
   // Read benchmarks.
-  const LLVMState State;
+  const LLVMState State("");
   const std::vector<InstructionBenchmark> Points =
       ExitOnErr(InstructionBenchmark::readYamls(State, BenchmarkFile));
   llvm::outs() << "Parsed " << Points.size() << " benchmark points\n";
diff --git a/utils/TableGen/CMakeLists.txt b/utils/TableGen/CMakeLists.txt
index 0428249f917..c88365a2b8c 100644
--- a/utils/TableGen/CMakeLists.txt
+++ b/utils/TableGen/CMakeLists.txt
@@ -21,6 +21,7 @@ add_tablegen(llvm-tblgen LLVM
   DAGISelMatcher.cpp
   DFAPacketizerEmitter.cpp
   DisassemblerEmitter.cpp
+  ExegesisEmitter.cpp
   FastISelEmitter.cpp
   FixedLenDecoderEmitter.cpp
   GlobalISelEmitter.cpp
diff --git a/utils/TableGen/CodeGenSchedule.cpp b/utils/TableGen/CodeGenSchedule.cpp
index e94ed760fc4..a9a36a87ef3 100644
--- a/utils/TableGen/CodeGenSchedule.cpp
+++ b/utils/TableGen/CodeGenSchedule.cpp
@@ -350,7 +350,7 @@ processSTIPredicate(STIPredicateFunction &Fn,
         unsigned OpcodeIdx = Opcode2Index[Opcode];
         if (OpcodeMasks[OpcodeIdx].first[ProcIndex]) {
           std::string Message =
-              "Opcode " + Opcode->getName().str() + 
+              "Opcode " + Opcode->getName().str() +
               " used by multiple InstructionEquivalenceClass definitions.";
           PrintFatalError(EC->getLoc(), Message);
         }
@@ -487,9 +487,6 @@ void CodeGenSchedModels::collectOptionalProcessorInfo() {
   // Collect processor RetireControlUnit descriptors if available.
   collectRetireControlUnits();
 
-  // Find pfm counter definitions for each processor.
-  collectPfmCounters();
-
   checkCompleteness();
 }
 
@@ -1789,32 +1786,6 @@ void CodeGenSchedModels::collectRegisterFiles() {
   }
 }
 
-// Collect all the RegisterFile definitions available in this target.
-void CodeGenSchedModels::collectPfmCounters() {
-  for (Record *Def : Records.getAllDerivedDefinitions("PfmIssueCounter")) {
-    CodeGenProcModel &PM = getProcModel(Def->getValueAsDef("SchedModel"));
-    PM.PfmIssueCounterDefs.emplace_back(Def);
-  }
-  for (Record *Def : Records.getAllDerivedDefinitions("PfmCycleCounter")) {
-    CodeGenProcModel &PM = getProcModel(Def->getValueAsDef("SchedModel"));
-    if (PM.PfmCycleCounterDef) {
-      PrintFatalError(Def->getLoc(),
-                      "multiple cycle counters for " +
-                          Def->getValueAsDef("SchedModel")->getName());
-    }
-    PM.PfmCycleCounterDef = Def;
-  }
-  for (Record *Def : Records.getAllDerivedDefinitions("PfmUopsCounter")) {
-    CodeGenProcModel &PM = getProcModel(Def->getValueAsDef("SchedModel"));
-    if (PM.PfmUopsCounterDef) {
-      PrintFatalError(Def->getLoc(),
-                      "multiple uops counters for " +
-                          Def->getValueAsDef("SchedModel")->getName());
-    }
-    PM.PfmUopsCounterDef = Def;
-  }
-}
-
 // Collect and sort WriteRes, ReadAdvance, and ProcResources.
 void CodeGenSchedModels::collectProcResources() {
   ProcResourceDefs = Records.getAllDerivedDefinitions("ProcResourceUnits");
diff --git a/utils/TableGen/CodeGenSchedule.h b/utils/TableGen/CodeGenSchedule.h
index 39443bb35e9..9bde5f4e759 100644
--- a/utils/TableGen/CodeGenSchedule.h
+++ b/utils/TableGen/CodeGenSchedule.h
@@ -246,11 +246,6 @@ struct CodeGenProcModel {
   // Optional Retire Control Unit definition.
   Record *RetireControlUnit;
 
-  // List of PfmCounters.
-  RecVec PfmIssueCounterDefs;
-  Record *PfmCycleCounterDef = nullptr;
-  Record *PfmUopsCounterDef = nullptr;
-
   CodeGenProcModel(unsigned Idx, std::string Name, Record *MDef,
                    Record *IDef) :
     Index(Idx), ModelName(std::move(Name)), ModelDef(MDef), ItinsDef(IDef),
@@ -265,10 +260,7 @@ struct CodeGenProcModel {
   }
 
   bool hasExtraProcessorInfo() const {
-    return RetireControlUnit || !RegisterFiles.empty() ||
-        !PfmIssueCounterDefs.empty() ||
-        PfmCycleCounterDef != nullptr ||
-        PfmUopsCounterDef != nullptr;
+    return RetireControlUnit || !RegisterFiles.empty();
   }
 
   unsigned getProcResourceIdx(Record *PRDef) const;
@@ -593,8 +585,6 @@ private:
 
   void collectRegisterFiles();
 
-  void collectPfmCounters();
-
   void collectOptionalProcessorInfo();
 
   std::string createSchedClassName(Record *ItinClassDef,
diff --git a/utils/TableGen/CodeGenTarget.cpp b/utils/TableGen/CodeGenTarget.cpp
index 2766fcca161..305d2d19ff4 100644
--- a/utils/TableGen/CodeGenTarget.cpp
+++ b/utils/TableGen/CodeGenTarget.cpp
@@ -711,4 +711,3 @@ CodeGenIntrinsic::CodeGenIntrinsic(Record *R) {
   // Sort the argument attributes for later benefit.
   llvm::sort(ArgumentAttributes);
 }
-
diff --git a/utils/TableGen/ExegesisEmitter.cpp b/utils/TableGen/ExegesisEmitter.cpp
new file mode 100644
index 00000000000..083d7439451
--- /dev/null
+++ b/utils/TableGen/ExegesisEmitter.cpp
@@ -0,0 +1,212 @@
+//===- ExegesisEmitter.cpp - Generate exegesis target data ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This tablegen backend emits llvm-exegesis information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/TableGen/Error.h"
+#include "llvm/TableGen/Record.h"
+#include "llvm/TableGen/TableGenBackend.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <map>
+#include <string>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "exegesis-emitter"
+
+namespace {
+
+class ExegesisEmitter {
+public:
+  ExegesisEmitter(RecordKeeper &RK);
+
+  void run(raw_ostream &OS) const;
+
+private:
+  unsigned getPfmCounterId(llvm::StringRef Name) const {
+    const auto It = PfmCounterNameTable.find(Name);
+    if (It == PfmCounterNameTable.end())
+      PrintFatalError("no pfm counter id for " + Name);
+    return It->second;
+  }
+
+  // Collects all the ProcPfmCounters definitions available in this target.
+  void emitPfmCounters(raw_ostream &OS) const;
+
+  void emitPfmCountersInfo(const Record &Def,
+                           unsigned &IssueCountersTableOffset,
+                           raw_ostream &OS) const;
+
+  void emitPfmCountersLookupTable(raw_ostream &OS) const;
+
+  RecordKeeper &Records;
+  std::string Target;
+
+  // Table of counter name -> counter index.
+  const std::map<llvm::StringRef, unsigned> PfmCounterNameTable;
+};
+
+static std::map<llvm::StringRef, unsigned>
+collectPfmCounters(const RecordKeeper &Records) {
+  std::map<llvm::StringRef, unsigned> PfmCounterNameTable;
+  const auto AddPfmCounterName = [&PfmCounterNameTable](
+                                     const Record *PfmCounterDef) {
+    const llvm::StringRef Counter = PfmCounterDef->getValueAsString("Counter");
+    if (!Counter.empty())
+      PfmCounterNameTable.emplace(Counter, 0);
+  };
+  for (Record *Def : Records.getAllDerivedDefinitions("ProcPfmCounters")) {
+    // Check that ResourceNames are unique.
+    llvm::SmallSet<llvm::StringRef, 16> Seen;
+    for (const Record *IssueCounter :
+         Def->getValueAsListOfDefs("IssueCounters")) {
+      const llvm::StringRef ResourceName =
+          IssueCounter->getValueAsString("ResourceName");
+      if (ResourceName.empty())
+        PrintFatalError(IssueCounter->getLoc(), "invalid empty ResourceName");
+      if (!Seen.insert(ResourceName).second)
+        PrintFatalError(IssueCounter->getLoc(),
+                        "duplicate ResourceName " + ResourceName);
+      AddPfmCounterName(IssueCounter);
+    }
+    AddPfmCounterName(Def->getValueAsDef("CycleCounter"));
+    AddPfmCounterName(Def->getValueAsDef("UopsCounter"));
+  }
+  unsigned Index = 0;
+  for (auto &NameAndIndex : PfmCounterNameTable)
+    NameAndIndex.second = Index++;
+  return PfmCounterNameTable;
+}
+
+ExegesisEmitter::ExegesisEmitter(RecordKeeper &RK)
+    : Records(RK), PfmCounterNameTable(collectPfmCounters(RK)) {
+  std::vector<Record *> Targets = Records.getAllDerivedDefinitions("Target");
+  if (Targets.size() == 0)
+    PrintFatalError("ERROR: No 'Target' subclasses defined!");
+  if (Targets.size() != 1)
+    PrintFatalError("ERROR: Multiple subclasses of Target defined!");
+  Target = Targets[0]->getName();
+}
+
+void ExegesisEmitter::emitPfmCountersInfo(const Record &Def,
+                                          unsigned &IssueCountersTableOffset,
+                                          raw_ostream &OS) const {
+  const auto CycleCounter =
+      Def.getValueAsDef("CycleCounter")->getValueAsString("Counter");
+  const auto UopsCounter =
+      Def.getValueAsDef("UopsCounter")->getValueAsString("Counter");
+  const size_t NumIssueCounters =
+      Def.getValueAsListOfDefs("IssueCounters").size();
+
+  // This is the default, do not emit.
+  if (CycleCounter.empty() && UopsCounter.empty() && NumIssueCounters == 0)
+    return;
+
+  OS << "\nstatic const PfmCountersInfo " << Target << Def.getName()
+     << " = {\n";
+
+  // Cycle Counter.
+  if (CycleCounter.empty())
+    OS << "  nullptr,  // No cycle counter.\n";
+  else
+    OS << "  " << Target << "PfmCounterNames[" << getPfmCounterId(CycleCounter)
+       << "],  // Cycle counter\n";
+
+  // Uops Counter.
+  if (UopsCounter.empty())
+    OS << "  nullptr,  // No uops counter.\n";
+  else
+    OS << "  " << Target << "PfmCounterNames[" << getPfmCounterId(UopsCounter)
+       << "],  // Uops counter\n";
+
+  // Issue Counters
+  if (NumIssueCounters == 0)
+    OS << "  nullptr,  // No issue counters.\n  0\n";
+  else
+    OS << "  " << Target << "PfmIssueCounters + " << IssueCountersTableOffset
+       << ", " << NumIssueCounters << " // Issue counters.\n";
+
+  OS << "};\n";
+  IssueCountersTableOffset += NumIssueCounters;
+}
+
+void ExegesisEmitter::emitPfmCounters(raw_ostream &OS) const {
+  // Emit the counter name table.
+  OS << "\nstatic const char* " << Target << "PfmCounterNames[] = {\n";
+  for (const auto &NameAndIndex : PfmCounterNameTable)
+    OS << "  \"" << NameAndIndex.first << "\", // " << NameAndIndex.second
+       << "\n";
+  OS << "};\n\n";
+
+  // Emit the IssueCounters table.
+  const auto PfmCounterDefs =
+      Records.getAllDerivedDefinitions("ProcPfmCounters");
+  OS << "static const PfmCountersInfo::IssueCounter " << Target
+     << "PfmIssueCounters[] = {\n";
+  for (const Record *Def : PfmCounterDefs) {
+    for (const Record *ICDef : Def->getValueAsListOfDefs("IssueCounters"))
+      OS << "  { " << Target << "PfmCounterNames["
+         << getPfmCounterId(ICDef->getValueAsString("Counter")) << "], \""
+         << ICDef->getValueAsString("ResourceName") << "\"},\n";
+  }
+
+  OS << "};\n";
+
+  // Now generate the PfmCountersInfo.
+  unsigned IssueCountersTableOffset = 0;
+  for (const Record *Def : PfmCounterDefs)
+    emitPfmCountersInfo(*Def, IssueCountersTableOffset, OS);
+
+  OS << "\n";
+}
+
+void ExegesisEmitter::emitPfmCountersLookupTable(raw_ostream &OS) const {
+  std::vector<Record *> Bindings =
+      Records.getAllDerivedDefinitions("PfmCountersBinding");
+  llvm::sort(Bindings, [](const Record *L, const Record *R) {
+    return L->getValueAsString("CpuName") < R->getValueAsString("CpuName");
+  });
+
+  OS << "// Sorted (by CpuName) array of pfm counters.\n"
+     << "static const CpuAndPfmCounters " << Target << "CpuPfmCounters[] = {\n";
+  for (Record *Binding : Bindings) {
+    // Emit as { "cpu", procinit },
+    OS << "  { \""                                                        //
+       << Binding->getValueAsString("CpuName") << "\","                   //
+       << " &" << Target << Binding->getValueAsDef("Counters")->getName() //
+       << " },\n";
+  }
+  OS << "};\n\n";
+}
+
+void ExegesisEmitter::run(raw_ostream &OS) const {
+  emitSourceFileHeader("Exegesis Tables", OS);
+  emitPfmCounters(OS);
+  emitPfmCountersLookupTable(OS);
+}
+
+} // end anonymous namespace
+
+namespace llvm {
+
+void EmitExegesis(RecordKeeper &RK, raw_ostream &OS) {
+  ExegesisEmitter(RK).run(OS);
+}
+
+} // end namespace llvm
diff --git a/utils/TableGen/SubtargetEmitter.cpp b/utils/TableGen/SubtargetEmitter.cpp
index d1ea968590f..4ff52b3e44e 100644
--- a/utils/TableGen/SubtargetEmitter.cpp
+++ b/utils/TableGen/SubtargetEmitter.cpp
@@ -697,80 +697,12 @@ SubtargetEmitter::EmitRegisterFileTables(const CodeGenProcModel &ProcModel,
   return CostTblIndex;
 }
 
-static bool EmitPfmIssueCountersTable(const CodeGenProcModel &ProcModel,
-                                      raw_ostream &OS) {
-  unsigned NumCounterDefs = 1 + ProcModel.ProcResourceDefs.size();
-  std::vector<const Record *> CounterDefs(NumCounterDefs);
-  bool HasCounters = false;
-  for (const Record *CounterDef : ProcModel.PfmIssueCounterDefs) {
-    const Record *&CD = CounterDefs[ProcModel.getProcResourceIdx(
-        CounterDef->getValueAsDef("Resource"))];
-    if (CD) {
-      PrintFatalError(CounterDef->getLoc(),
-                      "multiple issue counters for " +
-                          CounterDef->getValueAsDef("Resource")->getName());
-    }
-    CD = CounterDef;
-    HasCounters = true;
-  }
-  if (!HasCounters) {
-    return false;
-  }
-  OS << "\nstatic const char* " << ProcModel.ModelName
-     << "PfmIssueCounters[] = {\n";
-  for (unsigned i = 0; i != NumCounterDefs; ++i) {
-    const Record *CounterDef = CounterDefs[i];
-    if (CounterDef) {
-      const auto PfmCounters = CounterDef->getValueAsListOfStrings("Counters");
-      if (PfmCounters.empty())
-        PrintFatalError(CounterDef->getLoc(), "empty counter list");
-      OS << "  \"" << PfmCounters[0];
-      for (unsigned p = 1, e = PfmCounters.size(); p != e; ++p)
-        OS << ",\" \"" << PfmCounters[p];
-      OS << "\",  // #" << i << " = ";
-      OS << CounterDef->getValueAsDef("Resource")->getName() << "\n";
-    } else {
-      OS << "  nullptr, // #" << i << "\n";
-    }
-  }
-  OS << "};\n";
-  return true;
-}
-
-static void EmitPfmCounters(const CodeGenProcModel &ProcModel,
-                            const bool HasPfmIssueCounters, raw_ostream &OS) {
-  OS << "  {\n";
-  // Emit the cycle counter.
-  if (ProcModel.PfmCycleCounterDef)
-    OS << "    \"" << ProcModel.PfmCycleCounterDef->getValueAsString("Counter")
-       << "\",  // Cycle counter.\n";
-  else
-    OS << "    nullptr,  // No cycle counter.\n";
-
-  // Emit the uops counter.
-  if (ProcModel.PfmUopsCounterDef)
-    OS << "    \"" << ProcModel.PfmUopsCounterDef->getValueAsString("Counter")
-       << "\",  // Uops counter.\n";
-  else
-    OS << "    nullptr,  // No uops counter.\n";
-
-  // Emit a reference to issue counters table.
-  if (HasPfmIssueCounters)
-    OS << "    " << ProcModel.ModelName << "PfmIssueCounters\n";
-  else
-    OS << "    nullptr  // No issue counters.\n";
-  OS << "  }\n";
-}
-
 void SubtargetEmitter::EmitExtraProcessorInfo(const CodeGenProcModel &ProcModel,
                                               raw_ostream &OS) {
   // Generate a table of register file descriptors (one entry per each user
   // defined register file), and a table of register costs.
   unsigned NumCostEntries = EmitRegisterFileTables(ProcModel, OS);
 
-  // Generate a table of ProcRes counter names.
-  const bool HasPfmIssueCounters = EmitPfmIssueCountersTable(ProcModel, OS);
-
   // Now generate a table for the extra processor info.
   OS << "\nstatic const llvm::MCExtraProcessorInfo " << ProcModel.ModelName
      << "ExtraInfo = {\n  ";
@@ -783,8 +715,6 @@ void SubtargetEmitter::EmitExtraProcessorInfo(const CodeGenProcModel &ProcModel,
   EmitRegisterFileInfo(ProcModel, ProcModel.RegisterFiles.size(),
                        NumCostEntries, OS);
 
-  EmitPfmCounters(ProcModel, HasPfmIssueCounters, OS);
-
   OS << "};\n";
 }
 
@@ -1410,7 +1340,7 @@ void SubtargetEmitter::EmitProcessorModels(raw_ostream &OS) {
 }
 
 //
-// EmitProcessorLookup - generate cpu name to itinerary lookup table.
+// EmitProcessorLookup - generate cpu name to sched model lookup tables.
 //
 void SubtargetEmitter::EmitProcessorLookup(raw_ostream &OS) {
   // Gather and sort processor information
@@ -1418,12 +1348,11 @@ void SubtargetEmitter::EmitProcessorLookup(raw_ostream &OS) {
                           Records.getAllDerivedDefinitions("Processor");
   llvm::sort(ProcessorList, LessRecordFieldName());
 
-  // Begin processor table
+  // Begin processor->sched model table
   OS << "\n";
-  OS << "// Sorted (by key) array of itineraries for CPU subtype.\n"
-     << "extern const llvm::SubtargetInfoKV "
-     << Target << "ProcSchedKV[] = {\n";
-
+  OS << "// Sorted (by key) array of sched model for CPU subtype.\n"
+     << "extern const llvm::SubtargetInfoKV " << Target
+     << "ProcSchedKV[] = {\n";
   // For each processor
   for (Record *Processor : ProcessorList) {
     StringRef Name = Processor->getValueAsString("Name");
@@ -1433,8 +1362,7 @@ void SubtargetEmitter::EmitProcessorLookup(raw_ostream &OS) {
     // Emit as { "cpu", procinit },
     OS << "  { \"" << Name << "\", (const void *)&" << ProcModelName << " },\n";
   }
-
-  // End processor table
+  // End processor->sched model table
   OS << "};\n";
 }
 
@@ -1675,7 +1603,7 @@ void SubtargetEmitter::EmitSchedModelHelpers(const std::string &ClassName,
 
   // Emit target predicates.
   emitSchedModelHelpersImpl(OS);
-  
+
   OS << "} // " << ClassName << "::resolveSchedClass\n\n";
 
   OS << "unsigned " << ClassName
diff --git a/utils/TableGen/TableGen.cpp b/utils/TableGen/TableGen.cpp
index b78260625cb..d5b6a3c1264 100644
--- a/utils/TableGen/TableGen.cpp
+++ b/utils/TableGen/TableGen.cpp
@@ -53,6 +53,7 @@ enum ActionType {
   GenX86EVEX2VEXTables,
   GenX86FoldTables,
   GenRegisterBank,
+  GenExegesis,
 };
 
 namespace {
@@ -117,7 +118,9 @@ namespace {
                     clEnumValN(GenX86FoldTables, "gen-x86-fold-tables",
                                "Generate X86 fold tables"),
                     clEnumValN(GenRegisterBank, "gen-register-bank",
-                               "Generate registers bank descriptions")));
+                               "Generate registers bank descriptions"),
+                    clEnumValN(GenExegesis, "gen-exegesis",
+                               "Generate llvm-exegesis tables")));
 
   cl::OptionCategory PrintEnumsCat("Options for -print-enums");
   cl::opt<std::string>
@@ -231,6 +234,9 @@ bool LLVMTableGenMain(raw_ostream &OS, RecordKeeper &Records) {
   case GenX86FoldTables:
     EmitX86FoldTables(Records, OS);
     break;
+  case GenExegesis:
+    EmitExegesis(Records, OS);
+    break;
   }
 
   return false;
diff --git a/utils/TableGen/TableGenBackends.h b/utils/TableGen/TableGenBackends.h
index 1329a6d833f..f4f2909f8e8 100644
--- a/utils/TableGen/TableGenBackends.h
+++ b/utils/TableGen/TableGenBackends.h
@@ -89,6 +89,7 @@ void EmitGlobalISel(RecordKeeper &RK, raw_ostream &OS);
 void EmitX86EVEX2VEXTables(RecordKeeper &RK, raw_ostream &OS);
 void EmitX86FoldTables(RecordKeeper &RK, raw_ostream &OS);
 void EmitRegisterBank(RecordKeeper &RK, raw_ostream &OS);
+void EmitExegesis(RecordKeeper &RK, raw_ostream &OS);
 
 } // End llvm namespace
 
-- 
GitLab


From 65f864674d97dc54bc88504d5b6c462c2381c8af Mon Sep 17 00:00:00 2001
From: Clement Courbet <courbet@google.com>
Date: Thu, 25 Oct 2018 08:06:35 +0000
Subject: [PATCH 0553/1116] [llvm-exegesis] Fix warning in r345243.

warning C4099: 'llvm::exegesis::PfmCountersInfo': type name first seen using 'class' now seen using 'struct'

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345244 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-exegesis/lib/LlvmState.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/llvm-exegesis/lib/LlvmState.h b/tools/llvm-exegesis/lib/LlvmState.h
index be1e7979a17..159a8a51c5c 100644
--- a/tools/llvm-exegesis/lib/LlvmState.h
+++ b/tools/llvm-exegesis/lib/LlvmState.h
@@ -30,7 +30,7 @@ namespace llvm {
 namespace exegesis {
 
 class ExegesisTarget;
-class PfmCountersInfo;
+struct PfmCountersInfo;
 
 // An object to initialize LLVM and prepare objects needed to run the
 // measurements.
-- 
GitLab


From a06ebcb65fa79daedc483e8b74bb51682781645d Mon Sep 17 00:00:00 2001
From: Clement Courbet <courbet@google.com>
Date: Thu, 25 Oct 2018 08:08:58 +0000
Subject: [PATCH 0554/1116] [llvm-exegesis] Fix VC build of r345243.

"const members cannot be default initialized unless their type has a user defined default constructor"

Make members non-const.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345245 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-exegesis/lib/Target.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/tools/llvm-exegesis/lib/Target.h b/tools/llvm-exegesis/lib/Target.h
index a6ec36bebb3..7f349026ece 100644
--- a/tools/llvm-exegesis/lib/Target.h
+++ b/tools/llvm-exegesis/lib/Target.h
@@ -34,29 +34,29 @@ namespace exegesis {
 struct PfmCountersInfo {
   // An optional name of a performance counter that can be used to measure
   // cycles.
-  const char *const CycleCounter;
+  const char * CycleCounter;
 
   // An optional name of a performance counter that can be used to measure
   // uops.
-  const char *const UopsCounter;
+  const char * UopsCounter;
 
   // An IssueCounter specifies how to measure uops issued to specific proc
   // resources.
   struct IssueCounter {
-    const char *const Counter;
+    const char * Counter;
     // The name of the ProcResource that this counter measures.
-    const char *const ProcResName;
+    const char * ProcResName;
   };
   // An optional list of IssueCounters.
-  const IssueCounter *const IssueCounters;
-  const unsigned NumIssueCounters;
+  const IssueCounter * IssueCounters;
+  unsigned NumIssueCounters;
 
   static const PfmCountersInfo Default;
 };
 
 struct CpuAndPfmCounters {
-  const char *const CpuName;
-  const PfmCountersInfo *const PCI;
+  const char * CpuName;
+  const PfmCountersInfo * PCI;
   bool operator<(llvm::StringRef S) const {
     return llvm::StringRef(CpuName) < S;
   }
-- 
GitLab


From 4a4eaf68ea0988f57f663173b64d9e536eb96b2b Mon Sep 17 00:00:00 2001
From: Clement Courbet <courbet@google.com>
Date: Thu, 25 Oct 2018 08:11:35 +0000
Subject: [PATCH 0555/1116] [llvm-exegesis] Add missing initializer.

This is a better fix than rL345245.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345246 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-exegesis/lib/Target.cpp |  2 +-
 tools/llvm-exegesis/lib/Target.h   | 16 ++++++++--------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/tools/llvm-exegesis/lib/Target.cpp b/tools/llvm-exegesis/lib/Target.cpp
index 588c40e8c7f..06557770418 100644
--- a/tools/llvm-exegesis/lib/Target.cpp
+++ b/tools/llvm-exegesis/lib/Target.cpp
@@ -87,7 +87,7 @@ ExegesisTarget::createUopsBenchmarkRunner(const LLVMState &State) const {
 
 static_assert(std::is_pod<PfmCountersInfo>::value,
               "We shouldn't have dynamic initialization here");
-const PfmCountersInfo PfmCountersInfo::Default = {nullptr, nullptr, nullptr};
+const PfmCountersInfo PfmCountersInfo::Default = {nullptr, nullptr, nullptr, 0u};
 
 const PfmCountersInfo &
 ExegesisTarget::getPfmCounters(llvm::StringRef CpuName) const {
diff --git a/tools/llvm-exegesis/lib/Target.h b/tools/llvm-exegesis/lib/Target.h
index 7f349026ece..a6ec36bebb3 100644
--- a/tools/llvm-exegesis/lib/Target.h
+++ b/tools/llvm-exegesis/lib/Target.h
@@ -34,29 +34,29 @@ namespace exegesis {
 struct PfmCountersInfo {
   // An optional name of a performance counter that can be used to measure
   // cycles.
-  const char * CycleCounter;
+  const char *const CycleCounter;
 
   // An optional name of a performance counter that can be used to measure
   // uops.
-  const char * UopsCounter;
+  const char *const UopsCounter;
 
   // An IssueCounter specifies how to measure uops issued to specific proc
   // resources.
   struct IssueCounter {
-    const char * Counter;
+    const char *const Counter;
     // The name of the ProcResource that this counter measures.
-    const char * ProcResName;
+    const char *const ProcResName;
   };
   // An optional list of IssueCounters.
-  const IssueCounter * IssueCounters;
-  unsigned NumIssueCounters;
+  const IssueCounter *const IssueCounters;
+  const unsigned NumIssueCounters;
 
   static const PfmCountersInfo Default;
 };
 
 struct CpuAndPfmCounters {
-  const char * CpuName;
-  const PfmCountersInfo * PCI;
+  const char *const CpuName;
+  const PfmCountersInfo *const PCI;
   bool operator<(llvm::StringRef S) const {
     return llvm::StringRef(CpuName) < S;
   }
-- 
GitLab


From e658a0023002b26cb0a7a475fcaf21ae82eb685f Mon Sep 17 00:00:00 2001
From: Gabor Buella <gabor.buella@intel.com>
Date: Thu, 25 Oct 2018 08:32:29 +0000
Subject: [PATCH 0556/1116] Add -instcombine-code-sinking option

Reviewers: craig.topper, andrew.w.kaylor, efriedma

Reviewed By: craig.topper, andrew.w.kaylor, efriedma

Differential Revision: https://reviews.llvm.org/D52709


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345248 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstCombine/InstructionCombining.cpp      |  6 +++++-
 .../InstCombine/no_sink_instruction.ll        | 19 +++++++++++++++++++
 2 files changed, 24 insertions(+), 1 deletion(-)
 create mode 100644 test/Transforms/InstCombine/no_sink_instruction.ll

diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
index 34a5e1955b6..8506cf9baee 100644
--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -119,6 +119,10 @@ STATISTIC(NumReassoc  , "Number of reassociations");
 DEBUG_COUNTER(VisitCounter, "instcombine-visit",
               "Controls which instructions are visited");
 
+static cl::opt<bool>
+EnableCodeSinking("instcombine-code-sinking", cl::desc("Enable code sinking"),
+                                              cl::init(true));
+
 static cl::opt<bool>
 EnableExpensiveCombines("expensive-combines",
                         cl::desc("Enable expensive instruction combines"));
@@ -3103,7 +3107,7 @@ bool InstCombiner::run() {
     }
 
     // See if we can trivially sink this instruction to a successor basic block.
-    if (I->hasOneUse()) {
+    if (EnableCodeSinking && I->hasOneUse()) {
       BasicBlock *BB = I->getParent();
       Instruction *UserInst = cast<Instruction>(*I->user_begin());
       BasicBlock *UserParent;
diff --git a/test/Transforms/InstCombine/no_sink_instruction.ll b/test/Transforms/InstCombine/no_sink_instruction.ll
new file mode 100644
index 00000000000..caeba16fa2e
--- /dev/null
+++ b/test/Transforms/InstCombine/no_sink_instruction.ll
@@ -0,0 +1,19 @@
+; RUN: opt -instcombine -instcombine-code-sinking=0 -S < %s | FileCheck %s
+
+define i32 @test(i1 %C, i32 %A, i32 %B) {
+; CHECK-LABEL: @test(
+; CHECK: sdiv i32
+; CHECK-NEXT: add i32
+entry:
+        %tmp.2 = sdiv i32 %A, %B                ; <i32> [#uses=1]
+        %tmp.9 = add i32 %B, %A         ; <i32> [#uses=1]
+        br i1 %C, label %then, label %endif
+
+then:           ; preds = %entry
+; CHECK: ret i32
+        ret i32 %tmp.9
+
+endif:          ; preds = %entry
+; CHECK: ret i32
+        ret i32 %tmp.2
+}
-- 
GitLab


From 406bb74d5097456f0225851436616b875ddfac02 Mon Sep 17 00:00:00 2001
From: Carlos Alberto Enciso <carlos.alberto.enciso@gmail.com>
Date: Thu, 25 Oct 2018 09:58:59 +0000
Subject: [PATCH 0557/1116] [DebugInfo][Dexter] Unreachable line stepped onto
 after SimplifyCFG.

When SimplifyCFG changes the PHI node into a select instruction, the debug line records becomes ambiguous. It causes the debugger to display unreachable source lines.

Differential Revision: https://reviews.llvm.org/D53287

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345250 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Transforms/Utils/Local.h |   9 ++
 lib/Transforms/Utils/Local.cpp        |  41 ++++++++
 lib/Transforms/Utils/SimplifyCFG.cpp  |  22 +----
 test/CodeGen/X86/pr38762.ll           | 101 ++++++++++++++++++++
 test/CodeGen/X86/pr38763.ll           |  20 ++--
 test/CodeGen/X86/pr39243.ll           | 132 ++++++++++++++++++++++++++
 6 files changed, 296 insertions(+), 29 deletions(-)
 create mode 100644 test/CodeGen/X86/pr38762.ll
 create mode 100644 test/CodeGen/X86/pr39243.ll

diff --git a/include/llvm/Transforms/Utils/Local.h b/include/llvm/Transforms/Utils/Local.h
index f7da69644da..86a32bb6300 100644
--- a/include/llvm/Transforms/Utils/Local.h
+++ b/include/llvm/Transforms/Utils/Local.h
@@ -446,6 +446,15 @@ void copyRangeMetadata(const DataLayout &DL, const LoadInst &OldLI, MDNode *N,
 /// Remove the debug intrinsic instructions for the given instruction.
 void dropDebugUsers(Instruction &I);
 
+/// Hoist all of the instructions in the \p IfBlock to the dominant block
+/// \p DomBlock, by moving its instructions to the insertion point \p InsertPt.
+///
+/// The moved instructions receive the insertion point debug location values
+/// (DILocations) and their debug intrinsic instructions (dbg.values) are
+/// removed.
+void hoistAllInstructionsInto(BasicBlock *DomBlock, Instruction *InsertPt,
+                              BasicBlock *BB);
+
 //===----------------------------------------------------------------------===//
 //  Intrinsic pattern matching
 //
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index 1153f3cbd15..82fb765842d 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -2529,6 +2529,47 @@ void llvm::dropDebugUsers(Instruction &I) {
     DII->eraseFromParent();
 }
 
+void llvm::hoistAllInstructionsInto(BasicBlock *DomBlock, Instruction *InsertPt,
+                                    BasicBlock *BB) {
+  // Since we are moving the instructions out of its basic block, we do not
+  // retain their original debug locations (DILocations) and debug intrinsic
+  // instructions (dbg.values).
+  //
+  // Doing so would degrade the debugging experience and adversely affect the
+  // accuracy of profiling information.
+  //
+  // Currently, when hoisting the instructions, we take the following actions:
+  // - Remove their dbg.values.
+  // - Set their debug locations to the values from the insertion point.
+  //
+  // As per PR39141 (comment #8), the more fundamental reason why the dbg.values
+  // need to be deleted, is because there will not be any instructions with a
+  // DILocation in either branch left after performing the transformation. We
+  // can only insert a dbg.value after the two branches are joined again.
+  //
+  // See PR38762, PR39243 for more details.
+  //
+  // TODO: Extend llvm.dbg.value to take more than one SSA Value (PR39141) to
+  // encode predicated DIExpressions that yield different results on different
+  // code paths.
+  for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE;) {
+    Instruction *I = &*II;
+    I->dropUnknownNonDebugMetadata();
+    if (I->isUsedByMetadata())
+      dropDebugUsers(*I);
+    if (isa<DbgVariableIntrinsic>(I)) {
+      // Remove DbgInfo Intrinsics.
+      II = I->eraseFromParent();
+      continue;
+    }
+    I->setDebugLoc(InsertPt->getDebugLoc());
+    ++II;
+  }
+  DomBlock->getInstList().splice(InsertPt->getIterator(), BB->getInstList(),
+                                 BB->begin(),
+                                 BB->getTerminator()->getIterator());
+}
+
 namespace {
 
 /// A potential constituent of a bitreverse or bswap expression. See
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index 8dad6176c51..dd0d441a4da 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -2375,24 +2375,10 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
 
   // Move all 'aggressive' instructions, which are defined in the
   // conditional parts of the if's up to the dominating block.
-  if (IfBlock1) {
-    for (auto &I : *IfBlock1) {
-      I.dropUnknownNonDebugMetadata();
-      dropDebugUsers(I);
-    }
-    DomBlock->getInstList().splice(InsertPt->getIterator(),
-                                   IfBlock1->getInstList(), IfBlock1->begin(),
-                                   IfBlock1->getTerminator()->getIterator());
-  }
-  if (IfBlock2) {
-    for (auto &I : *IfBlock2) {
-      I.dropUnknownNonDebugMetadata();
-      dropDebugUsers(I);
-    }
-    DomBlock->getInstList().splice(InsertPt->getIterator(),
-                                   IfBlock2->getInstList(), IfBlock2->begin(),
-                                   IfBlock2->getTerminator()->getIterator());
-  }
+  if (IfBlock1)
+    hoistAllInstructionsInto(DomBlock, InsertPt, IfBlock1);
+  if (IfBlock2)
+    hoistAllInstructionsInto(DomBlock, InsertPt, IfBlock2);
 
   while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) {
     // Change the PHI node into a select instruction.
diff --git a/test/CodeGen/X86/pr38762.ll b/test/CodeGen/X86/pr38762.ll
new file mode 100644
index 00000000000..dc4d535511c
--- /dev/null
+++ b/test/CodeGen/X86/pr38762.ll
@@ -0,0 +1,101 @@
+; RUN: opt < %s -S -simplifycfg | FileCheck %s
+
+; Note: This patch is a complement to pr38763.
+;
+; When SimplifyCFG changes the PHI node into a select instruction, the debug
+; information becomes ambiguous. It causes the debugger to display unreached
+; lines and invalid variable values.
+;
+; When in the debugger, on the line "if (read1 > 3)", and we step from the
+; 'if' condition, onto the addition, then back to the 'if' again, which is
+; misleading because that addition doesn't really "happen" (it's speculated).
+
+; IR generated with:
+; clang -S -g -gno-column-info -O2 -emit-llvm pr38762.cpp -o pr38762.ll -mllvm -opt-bisect-limit=10
+
+; // pr38762.cpp
+; int main() {
+;   volatile int foo = 0;
+;   int read1 = foo;
+;   int brains = foo;
+; 
+;   if (read1 > 3) {
+;     brains *= 2;
+;     brains += 1;
+;   }
+; 
+;   return brains;
+; }
+
+; Change the debug locations associated with the PHI nodes being promoted, to
+; the debug locations from the insertion point in the dominant block.
+
+; CHECK-LABEL: entry
+; CHECK:  %cmp = icmp sgt i32 %foo.0., 3, !dbg !14
+; CHECK:  %mul = shl nsw i32 %foo.0.5, 1, !dbg !16
+; CHECK-NOT:  call void @llvm.dbg.value(metadata i32 %mul, metadata !15, metadata !DIExpression()), !dbg !25
+; CHECK:  %add = or i32 %mul, 1, !dbg !16
+; CHECK-NOT:  call void @llvm.dbg.value(metadata i32 %add, metadata !15, metadata !DIExpression()), !dbg !25
+; CHECK:  %brains.0 = select i1 %cmp, i32 %add, i32 %foo.0.5, !dbg !16
+
+; ModuleID = 'pr38762.cpp'
+source_filename = "pr38762.cpp"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
+
+; Function Attrs: norecurse nounwind uwtable
+define dso_local i32 @main() local_unnamed_addr #0 !dbg !7 {
+entry:
+  %foo = alloca i32, align 4
+  %foo.0..sroa_cast = bitcast i32* %foo to i8*
+  store volatile i32 0, i32* %foo, align 4
+  %foo.0. = load volatile i32, i32* %foo, align 4
+  %foo.0.5 = load volatile i32, i32* %foo, align 4
+  call void @llvm.dbg.value(metadata i32 %foo.0.5, metadata !15, metadata !DIExpression()), !dbg !25
+  %cmp = icmp sgt i32 %foo.0., 3, !dbg !26
+  br i1 %cmp, label %if.then, label %if.end, !dbg !28
+
+if.then:                                          ; preds = %entry
+  %mul = shl nsw i32 %foo.0.5, 1, !dbg !29
+  call void @llvm.dbg.value(metadata i32 %mul, metadata !15, metadata !DIExpression()), !dbg !25
+  %add = or i32 %mul, 1, !dbg !31
+  call void @llvm.dbg.value(metadata i32 %add, metadata !15, metadata !DIExpression()), !dbg !25
+  br label %if.end, !dbg !32
+
+if.end:                                           ; preds = %if.then, %entry
+  %brains.0 = phi i32 [ %add, %if.then ], [ %foo.0.5, %entry ], !dbg !33
+  call void @llvm.dbg.value(metadata i32 %brains.0, metadata !15, metadata !DIExpression()), !dbg !25
+  ret i32 %brains.0, !dbg !35
+}
+
+declare void @llvm.dbg.value(metadata, metadata, metadata) #2
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 8.0.0 (trunk 343753)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None)
+!1 = !DIFile(filename: "pr38762.cpp", directory: ".")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{!"clang version 8.0.0 (trunk 343753)"}
+!7 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !11)
+!8 = !DISubroutineType(types: !9)
+!9 = !{!10}
+!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!11 = !{!15}
+!13 = !DIDerivedType(tag: DW_TAG_volatile_type, baseType: !10)
+!15 = !DILocalVariable(name: "brains", scope: !7, file: !1, line: 4, type: !10)
+!25 = !DILocation(line: 4, scope: !7)
+!26 = !DILocation(line: 6, scope: !27)
+!27 = distinct !DILexicalBlock(scope: !7, file: !1, line: 6)
+!28 = !DILocation(line: 6, scope: !7)
+!29 = !DILocation(line: 7, scope: !30)
+!30 = distinct !DILexicalBlock(scope: !27, file: !1, line: 6)
+!31 = !DILocation(line: 8, scope: !30)
+!32 = !DILocation(line: 9, scope: !30)
+!33 = !DILocation(line: 0, scope: !7)
+!34 = !DILocation(line: 12, scope: !7)
+!35 = !DILocation(line: 11, scope: !7)
diff --git a/test/CodeGen/X86/pr38763.ll b/test/CodeGen/X86/pr38763.ll
index b36e1efd927..ee0872748d6 100644
--- a/test/CodeGen/X86/pr38763.ll
+++ b/test/CodeGen/X86/pr38763.ll
@@ -30,13 +30,13 @@
 ; branches, as they becomes ambiguous.
 
 ; CHECK-LABEL: entry
-; CHECK:  %cmp = icmp eq i32 %foo.0., 4
-; CHECK:  %add = add nsw i32 %foo.0.4, 2, !dbg !18
+; CHECK:  %cmp = icmp eq i32 %foo.0., 4, !dbg !14
+; CHECK:  %add = add nsw i32 %foo.0.4, 2, !dbg !16
 ; CHECK-NOT: @llvm.dbg.value(metadata i32 %add
-; CHECK:  %sub = add nsw i32 %foo.0.4, -2, !dbg !21
+; CHECK:  %sub = add nsw i32 %foo.0.4, -2, !dbg !16
 ; CHECK-NOT: @llvm.dbg.value(metadata i32 %sub
 ; CHECK:  %result.0 = select i1 %cmp, i32 %add, i32 %sub
-; CHECK:  call void @llvm.dbg.value(metadata i32 %result.0, metadata !12, metadata !DIExpression()), !dbg !17
+; CHECK:  call void @llvm.dbg.value(metadata i32 %result.0, metadata !12, metadata !DIExpression()), !dbg !13
 
 ; ModuleID = 'pr38763.cpp'
 source_filename = "pr38763.cpp"
@@ -48,12 +48,12 @@ define dso_local i32 @main() local_unnamed_addr #0 !dbg !7 {
 entry:
   %foo = alloca i32, align 4
   %foo.0..sroa_cast = bitcast i32* %foo to i8*
-  store volatile i32 4, i32* %foo, align 4, !tbaa !19
+  store volatile i32 4, i32* %foo, align 4
   %foo.0. = load volatile i32, i32* %foo, align 4
   %foo.0.4 = load volatile i32, i32* %foo, align 4
   call void @llvm.dbg.value(metadata i32 0, metadata !16, metadata !DIExpression()), !dbg !27
-  %cmp = icmp eq i32 %foo.0., 4
-  br i1 %cmp, label %if.then, label %if.else
+  %cmp = icmp eq i32 %foo.0., 4, !dbg !28
+  br i1 %cmp, label %if.then, label %if.else, !dbg !30
 
 if.then:                                          ; preds = %entry
   %add = add nsw i32 %foo.0.4, 2, !dbg !31
@@ -91,12 +91,10 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) #2
 !10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
 !11 = !{!16}
 !16 = !DILocalVariable(name: "result", scope: !7, file: !1, line: 6, type: !10)
-!19 = !{!20, !20, i64 0}
-!20 = !{!"int", !21, i64 0}
-!21 = !{!"omnipotent char", !22, i64 0}
-!22 = !{!"Simple C++ TBAA"}
 !27 = !DILocation(line: 6, column: 7, scope: !7)
+!28 = !DILocation(line: 7, column: 12, scope: !29)
 !29 = distinct !DILexicalBlock(scope: !7, file: !1, line: 7, column: 7)
+!30 = !DILocation(line: 7, column: 7, scope: !7)
 !31 = !DILocation(line: 8, column: 20, scope: !32)
 !32 = distinct !DILexicalBlock(scope: !29, file: !1, line: 7, column: 18)
 !34 = !DILocation(line: 10, column: 20, scope: !35)
diff --git a/test/CodeGen/X86/pr39243.ll b/test/CodeGen/X86/pr39243.ll
new file mode 100644
index 00000000000..a901e291eca
--- /dev/null
+++ b/test/CodeGen/X86/pr39243.ll
@@ -0,0 +1,132 @@
+; RUN: opt < %s -S -simplifycfg | FileCheck %s
+
+; Note: This patch fixes the regression introduced by pr38762.
+;
+; When SimplifyCFG changes the PHI node into a select instruction, the debug
+; information becomes ambiguous. It causes the debugger to display unreached
+; lines and invalid variable values.
+;
+; When the function 'hoistAllInstructionsInto' hoist a basic block:
+; - Remove their dbg.values.
+; - Set their debug locations to the values from the insertion point.
+;
+; But, if one of the instructions being hoisted is a debug intrinsic from an
+; inlined function, assigning it the debug location from the insertion point
+; will create a mismatch between the intrinsic's subprogram and the location's
+; subprogram, causing the assertion "Expected inlined-at fields to agree" in
+; SelectionDAG".
+
+; IR generated with:
+; clang -S -g -gno-column-info -O2 -emit-llvm pr39243.cpp -o pr39243.ll -mllvm -opt-bisect-limit=103
+
+; // pr39243.cpp
+; union onion {
+;   double dd;
+;   int ii[2];
+; };
+;
+; int alpha;
+; int bravo();
+;
+; int charlie() {
+;   int delta = 0;
+;   return bravo();
+; }
+;
+; int echo(onion foxtrot) {
+;   alpha = foxtrot.ii[0];
+;   if (alpha) {
+;     int golf = bravo();
+;     return -golf;
+;   }
+;   alpha = foxtrot.ii[1];
+;   return -charlie();
+; }
+
+; Change the debug locations associated with the PHI nodes being promoted, to
+; the debug locations from the insertion point in the dominant block.
+
+; CHECK-LABEL: entry
+; CHECK:  %foxtrot.sroa.0.0.extract.trunc = trunc i64 %foxtrot.coerce to i32
+; CHECK:  %tobool = icmp eq i32 %foxtrot.sroa.0.0.extract.trunc, 0
+; CHECK:  %foxtrot.sroa.2.0.extract.shift = lshr i64 %foxtrot.coerce, 32
+; CHECK-NOT:  call void @llvm.dbg.value(metadata i32 %foxtrot.sroa.2.0.extract.trunc, metadata !30, metadata !DIExpression(DW_OP_LLVM_fragment, 32, 32)), !dbg !34
+; CHECK:  %foxtrot.sroa.2.0.extract.trunc = trunc i64 %foxtrot.sroa.2.0.extract.shift to i32
+; CHECK:  store i32 %foxtrot.sroa.2.0.extract.trunc, i32* @alpha, align 4, !dbg !25
+; CHECK-NOT:  call void @llvm.dbg.value(metadata i32 0, metadata !15, metadata !DIExpression()), !dbg !43
+
+; ModuleID = 'pr39243.cpp'
+source_filename = "pr39243.cpp"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
+
+@alpha = dso_local local_unnamed_addr global i32 0, align 4, !dbg !0
+
+define dso_local i32 @_Z7charliev() local_unnamed_addr #0 {
+entry:
+  %call = tail call i32 @_Z5bravov()
+  ret i32 %call
+}
+
+declare dso_local i32 @_Z5bravov() local_unnamed_addr #1
+
+define dso_local i32 @_Z4echo5onion(i64 %foxtrot.coerce) local_unnamed_addr #0 !dbg !18 {
+entry:
+  %foxtrot.sroa.0.0.extract.trunc = trunc i64 %foxtrot.coerce to i32
+  store i32 %foxtrot.sroa.0.0.extract.trunc, i32* @alpha, align 4
+  %tobool = icmp eq i32 %foxtrot.sroa.0.0.extract.trunc, 0
+  br i1 %tobool, label %if.end, label %return
+
+if.end:                                           ; preds = %entry
+  %foxtrot.sroa.2.0.extract.shift = lshr i64 %foxtrot.coerce, 32
+  %foxtrot.sroa.2.0.extract.trunc = trunc i64 %foxtrot.sroa.2.0.extract.shift to i32
+  call void @llvm.dbg.value(metadata i32 %foxtrot.sroa.2.0.extract.trunc, metadata !30, metadata !DIExpression(DW_OP_LLVM_fragment, 32, 32)), !dbg !34
+  store i32 %foxtrot.sroa.2.0.extract.trunc, i32* @alpha, align 4, !dbg !42
+  call void @llvm.dbg.value(metadata i32 0, metadata !15, metadata !DIExpression()), !dbg !43
+  br label %return
+
+return:                                           ; preds = %entry, %if.end
+  %call.i = tail call i32 @_Z5bravov()
+  %retval.0 = sub nsw i32 0, %call.i
+  ret i32 %retval.0
+}
+
+declare void @llvm.dbg.value(metadata, metadata, metadata) #2
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!7, !8, !9}
+!llvm.ident = !{!10}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "alpha", scope: !2, file: !3, line: 6, type: !6, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !3, producer: "clang version 8.0.0 (trunk 344502)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5, nameTableKind: None)
+!3 = !DIFile(filename: "pr39243.cpp", directory: ".")
+!4 = !{}
+!5 = !{!0}
+!6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!7 = !{i32 2, !"Dwarf Version", i32 4}
+!8 = !{i32 2, !"Debug Info Version", i32 3}
+!9 = !{i32 1, !"wchar_size", i32 4}
+!10 = !{!"clang version 8.0.0 (trunk 344502)"}
+!11 = distinct !DISubprogram(name: "charlie", linkageName: "_Z7charliev", scope: !3, file: !3, line: 9, type: !12, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: true, unit: !2, retainedNodes: !14)
+!12 = !DISubroutineType(types: !13)
+!13 = !{!6}
+!14 = !{!15}
+!15 = !DILocalVariable(name: "delta", scope: !11, file: !3, line: 10, type: !6)
+!18 = distinct !DISubprogram(name: "echo", linkageName: "_Z4echo5onion", scope: !3, file: !3, line: 14, type: !19, isLocal: false, isDefinition: true, scopeLine: 14, flags: DIFlagPrototyped, isOptimized: true, unit: !2, retainedNodes: !29)
+!19 = !DISubroutineType(types: !20)
+!20 = !{!6, !21}
+!21 = distinct !DICompositeType(tag: DW_TAG_union_type, name: "onion", file: !3, line: 1, size: 64, flags: DIFlagTypePassByValue | DIFlagTrivial, elements: !22, identifier: "_ZTS5onion")
+!22 = !{!23, !25}
+!23 = !DIDerivedType(tag: DW_TAG_member, name: "dd", scope: !21, file: !3, line: 2, baseType: !24, size: 64)
+!24 = !DIBasicType(name: "double", size: 64, encoding: DW_ATE_float)
+!25 = !DIDerivedType(tag: DW_TAG_member, name: "ii", scope: !21, file: !3, line: 3, baseType: !26, size: 64)
+!26 = !DICompositeType(tag: DW_TAG_array_type, baseType: !6, size: 64, elements: !27)
+!27 = !{!28}
+!28 = !DISubrange(count: 2)
+!29 = !{!30}
+!30 = !DILocalVariable(name: "foxtrot", arg: 1, scope: !18, file: !3, line: 14, type: !21)
+!34 = !DILocation(line: 14, scope: !18)
+!42 = !DILocation(line: 20, scope: !18)
+!43 = !DILocation(line: 10, scope: !11, inlinedAt: !44)
+!44 = distinct !DILocation(line: 21, scope: !18)
-- 
GitLab


From 5d0d6ef59112cffd897a3b9f88fc3bb54a8e3cd2 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 25 Oct 2018 10:45:38 +0000
Subject: [PATCH 0558/1116] Fix MSVC llvm-exegesis build. NFCI.

MSVC is a bit funny about is_pod.....

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345252 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-exegesis/lib/Target.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/tools/llvm-exegesis/lib/Target.h b/tools/llvm-exegesis/lib/Target.h
index a6ec36bebb3..b0f0e996173 100644
--- a/tools/llvm-exegesis/lib/Target.h
+++ b/tools/llvm-exegesis/lib/Target.h
@@ -34,29 +34,29 @@ namespace exegesis {
 struct PfmCountersInfo {
   // An optional name of a performance counter that can be used to measure
   // cycles.
-  const char *const CycleCounter;
+  const char *CycleCounter;
 
   // An optional name of a performance counter that can be used to measure
   // uops.
-  const char *const UopsCounter;
+  const char *UopsCounter;
 
   // An IssueCounter specifies how to measure uops issued to specific proc
   // resources.
   struct IssueCounter {
-    const char *const Counter;
+    const char *Counter;
     // The name of the ProcResource that this counter measures.
-    const char *const ProcResName;
+    const char *ProcResName;
   };
   // An optional list of IssueCounters.
-  const IssueCounter *const IssueCounters;
-  const unsigned NumIssueCounters;
+  const IssueCounter *IssueCounters;
+  unsigned NumIssueCounters;
 
   static const PfmCountersInfo Default;
 };
 
 struct CpuAndPfmCounters {
-  const char *const CpuName;
-  const PfmCountersInfo *const PCI;
+  const char *CpuName;
+  const PfmCountersInfo *PCI;
   bool operator<(llvm::StringRef S) const {
     return llvm::StringRef(CpuName) < S;
   }
-- 
GitLab


From 474181f1d601a10bc1d68b4acfb063ea09079b0d Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 25 Oct 2018 10:52:36 +0000
Subject: [PATCH 0559/1116] [TTI] Add generic SK_Broadcast shuffle costs

I noticed while fixing PR39368 that we don't have generic shuffle costs for broadcast style shuffles.

This patch adds SK_BROADCAST handling, but exposes ARM/AARCH64 lack of handling of this type, which I've added a fix for at the same time.

Differential Revision: https://reviews.llvm.org/D53570

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345253 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/BasicTTIImpl.h           | 19 ++++++++++++++
 .../AArch64/AArch64TargetTransformInfo.cpp    | 15 +++++++++--
 lib/Target/ARM/ARMTargetTransformInfo.cpp     | 25 ++++++++++++++++---
 3 files changed, 54 insertions(+), 5 deletions(-)

diff --git a/include/llvm/CodeGen/BasicTTIImpl.h b/include/llvm/CodeGen/BasicTTIImpl.h
index 3f7a1206c84..0cd38617123 100644
--- a/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/include/llvm/CodeGen/BasicTTIImpl.h
@@ -80,6 +80,23 @@ private:
   using BaseT = TargetTransformInfoImplCRTPBase<T>;
   using TTI = TargetTransformInfo;
 
+  /// Estimate a cost of Broadcast as an extract and sequence of insert
+  /// operations.
+  unsigned getBroadcastShuffleOverhead(Type *Ty) {
+    assert(Ty->isVectorTy() && "Can only shuffle vectors");
+    unsigned Cost = 0;
+    // Broadcast cost is equal to the cost of extracting the zero'th element
+    // plus the cost of inserting it into every element of the result vector.
+    Cost += static_cast<T *>(this)->getVectorInstrCost(
+        Instruction::ExtractElement, Ty, 0);
+
+    for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
+      Cost += static_cast<T *>(this)->getVectorInstrCost(
+          Instruction::InsertElement, Ty, i);
+    }
+    return Cost;
+  }
+
   /// Estimate a cost of shuffle as a sequence of extract and insert
   /// operations.
   unsigned getPermuteShuffleOverhead(Type *Ty) {
@@ -554,6 +571,8 @@ public:
   unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                           Type *SubTp) {
     switch (Kind) {
+    case TTI::SK_Broadcast:
+      return getBroadcastShuffleOverhead(Tp);
     case TTI::SK_Select:
     case TTI::SK_Reverse:
     case TTI::SK_Transpose:
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index a16de89cf10..77c83970f68 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -946,9 +946,20 @@ int AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
 
 int AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                                    Type *SubTp) {
-  if (Kind == TTI::SK_Transpose || Kind == TTI::SK_Select ||
-      Kind == TTI::SK_PermuteSingleSrc) {
+  if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
+      Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc) {
     static const CostTblEntry ShuffleTbl[] = {
+      // Broadcast shuffle kinds can be performed with 'dup'.
+      { TTI::SK_Broadcast, MVT::v8i8,  1 },
+      { TTI::SK_Broadcast, MVT::v16i8, 1 },
+      { TTI::SK_Broadcast, MVT::v4i16, 1 },
+      { TTI::SK_Broadcast, MVT::v8i16, 1 },
+      { TTI::SK_Broadcast, MVT::v2i32, 1 },
+      { TTI::SK_Broadcast, MVT::v4i32, 1 },
+      { TTI::SK_Broadcast, MVT::v2i64, 1 },
+      { TTI::SK_Broadcast, MVT::v2f32, 1 },
+      { TTI::SK_Broadcast, MVT::v4f32, 1 },
+      { TTI::SK_Broadcast, MVT::v2f64, 1 },
       // Transpose shuffle kinds can be performed with 'trn1/trn2' and
       // 'zip1/zip2' instructions.
       { TTI::SK_Transpose, MVT::v8i8,  1 },
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 39a72f0edeb..a07c1e83a3f 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -400,10 +400,29 @@ int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
 
 int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                                Type *SubTp) {
-  // We only handle costs of reverse and select shuffles for now.
-  if (Kind != TTI::SK_Reverse && Kind != TTI::SK_Select)
-    return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+  if (Kind == TTI::SK_Broadcast) {
+    static const CostTblEntry NEONDupTbl[] = {
+        // VDUP handles these cases.
+        {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v8i8,  1},
+
+        {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
 
+    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+
+    if (const auto *Entry = CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE,
+                                            LT.second))
+      return LT.first * Entry->Cost;
+
+    return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+  }
   if (Kind == TTI::SK_Reverse) {
     static const CostTblEntry NEONShuffleTbl[] = {
         // Reverse shuffle cost one instruction if we are shuffling within a
-- 
GitLab


From 8377922c2828230a16b7fc88b47477a780189b0d Mon Sep 17 00:00:00 2001
From: George Rimar <grimar@accesssoftek.com>
Date: Thu, 25 Oct 2018 10:56:44 +0000
Subject: [PATCH 0560/1116] [llvm-dwarfdump] - Fix incorrect parsing of the
 DW_LLE_startx_length

As was already mentioned in comments for D53364, DWARF 5
spec says about DW_LLE_startx_length:

"This is a form of bounded location description that has two unsigned ULEB operands.
The first value is an address index (into the .debug_addr section) that indicates the beginning of the address range
over which the location is valid. The second value is the length of the range. ")

Currently, the length is always parsed as U32.
Patch change the behavior to parse DW_LLE_startx_length as ULEB128 for DWARF 5
and keeps it as U32 for DWARF4+(pre-DWARF5) for compatibility.

Differential revision: https://reviews.llvm.org/D53564

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345254 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h  |  6 ++---
 include/llvm/DebugInfo/DWARF/DWARFListTable.h |  1 +
 lib/DebugInfo/DWARF/DWARFContext.cpp          |  7 +++--
 lib/DebugInfo/DWARF/DWARFDebugLoc.cpp         | 14 +++++++---
 lib/DebugInfo/DWARF/DWARFDie.cpp              | 12 +++++++--
 .../X86/debug_loclists_startx_length.s        | 27 +++++++++++++++++++
 6 files changed, 56 insertions(+), 11 deletions(-)
 create mode 100644 test/tools/llvm-dwarfdump/X86/debug_loclists_startx_length.s

diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h b/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h
index ad44c2c83fb..da2098e1540 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h
@@ -100,15 +100,15 @@ private:
   bool IsLittleEndian;
 
 public:
-  void parse(DataExtractor data);
+  void parse(DataExtractor data, unsigned Version);
   void dump(raw_ostream &OS, uint64_t BaseAddr, const MCRegisterInfo *RegInfo,
             Optional<uint64_t> Offset) const;
 
   /// Return the location list at the given offset or nullptr.
   LocationList const *getLocationListAtOffset(uint64_t Offset) const;
 
-  static Optional<LocationList> parseOneLocationList(DataExtractor Data,
-                                                     uint32_t *Offset);
+  static Optional<LocationList>
+  parseOneLocationList(DataExtractor Data, unsigned *Offset, unsigned Version);
 };
 
 } // end namespace llvm
diff --git a/include/llvm/DebugInfo/DWARF/DWARFListTable.h b/include/llvm/DebugInfo/DWARF/DWARFListTable.h
index 8c15d9d58d4..9b987314f20 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFListTable.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFListTable.h
@@ -99,6 +99,7 @@ public:
   uint32_t getHeaderOffset() const { return HeaderOffset; }
   uint8_t getAddrSize() const { return HeaderData.AddrSize; }
   uint32_t getLength() const { return HeaderData.Length; }
+  uint16_t getVersion() const { return HeaderData.Version; }
   StringRef getSectionName() const { return SectionName; }
   StringRef getListTypeString() const { return ListTypeString; }
   dwarf::DwarfFormat getFormat() const { return Format; }
diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp
index 1f3753809a2..a29c9c2f160 100644
--- a/lib/DebugInfo/DWARF/DWARFContext.cpp
+++ b/lib/DebugInfo/DWARF/DWARFContext.cpp
@@ -309,7 +309,7 @@ static void dumpLoclistsSection(raw_ostream &OS, DIDumpOptions DumpOpts,
   DataExtractor LocData(Data.getData().drop_front(Offset),
                         Data.isLittleEndian(), Header.getAddrSize());
 
-  Loclists.parse(LocData);
+  Loclists.parse(LocData, Header.getVersion());
   Loclists.dump(OS, 0, MRI, DumpOffset);
 }
 
@@ -732,7 +732,10 @@ const DWARFDebugLoclists *DWARFContext::getDebugLocDWO() {
   // FIXME: We don't need AddressSize for split DWARF since relocatable
   // addresses cannot appear there. At the moment DWARFExpression requires it.
   DataExtractor LocData(DObj->getLocDWOSection().Data, isLittleEndian(), 4);
-  LocDWO->parse(LocData);
+  // Use version 4. DWO does not support the DWARF v5 .debug_loclists yet and
+  // that means we are parsing the new style .debug_loc (pre-standatized version
+  // of the .debug_loclists).
+  LocDWO->parse(LocData, 4 /* Version */);
   return LocDWO.get();
 }
 
diff --git a/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp b/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
index b4bdaaac0c2..044a0243360 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
@@ -145,7 +145,8 @@ void DWARFDebugLoc::parse(const DWARFDataExtractor &data) {
 }
 
 Optional<DWARFDebugLoclists::LocationList>
-DWARFDebugLoclists::parseOneLocationList(DataExtractor Data, unsigned *Offset) {
+DWARFDebugLoclists::parseOneLocationList(DataExtractor Data, unsigned *Offset,
+                                         unsigned Version) {
   LocationList LL;
   LL.Offset = *Offset;
 
@@ -158,7 +159,12 @@ DWARFDebugLoclists::parseOneLocationList(DataExtractor Data, unsigned *Offset) {
     switch (Kind) {
     case dwarf::DW_LLE_startx_length:
       E.Value0 = Data.getULEB128(Offset);
-      E.Value1 = Data.getU32(Offset);
+      // Pre-DWARF 5 has different interpretation of the length field. We have
+      // to support both pre- and standartized styles for the compatibility.
+      if (Version < 5)
+        E.Value1 = Data.getU32(Offset);
+      else
+        E.Value1 = Data.getULEB128(Offset);
       break;
     case dwarf::DW_LLE_start_length:
       E.Value0 = Data.getAddress(Offset);
@@ -189,13 +195,13 @@ DWARFDebugLoclists::parseOneLocationList(DataExtractor Data, unsigned *Offset) {
   return LL;
 }
 
-void DWARFDebugLoclists::parse(DataExtractor data) {
+void DWARFDebugLoclists::parse(DataExtractor data, unsigned Version) {
   IsLittleEndian = data.isLittleEndian();
   AddressSize = data.getAddressSize();
 
   uint32_t Offset = 0;
   while (data.isValidOffset(Offset)) {
-    if (auto LL = parseOneLocationList(data, &Offset))
+    if (auto LL = parseOneLocationList(data, &Offset, Version))
       Locations.push_back(std::move(*LL));
     else
       return;
diff --git a/lib/DebugInfo/DWARF/DWARFDie.cpp b/lib/DebugInfo/DWARF/DWARFDie.cpp
index 76430b41f18..31c4cd5e472 100644
--- a/lib/DebugInfo/DWARF/DWARFDie.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDie.cpp
@@ -118,12 +118,20 @@ static void dumpLocation(raw_ostream &OS, DWARFFormValue &FormValue,
       return;
     }
 
+    bool UseLocLists = !U->isDWOUnit();
     StringRef LoclistsSectionData =
-        U->isDWOUnit() ? U->getLocSectionData() : Obj.getLoclistsSection().Data;
+        UseLocLists ? Obj.getLoclistsSection().Data : U->getLocSectionData();
+
     if (!LoclistsSectionData.empty()) {
       DataExtractor Data(LoclistsSectionData, Ctx.isLittleEndian(),
                          Obj.getAddressSize());
-      auto LL = DWARFDebugLoclists::parseOneLocationList(Data, &Offset);
+
+      // Old-style location list were used in DWARF v4 (.debug_loc.dwo section).
+      // Modern locations list (.debug_loclists) are used starting from v5.
+      // Ideally we should take the version from the .debug_loclists section
+      // header, but using CU's version for simplicity.
+      auto LL = DWARFDebugLoclists::parseOneLocationList(
+          Data, &Offset, UseLocLists ? U->getVersion() : 4);
 
       uint64_t BaseAddr = 0;
       if (Optional<SectionedAddress> BA = U->getBaseAddress())
diff --git a/test/tools/llvm-dwarfdump/X86/debug_loclists_startx_length.s b/test/tools/llvm-dwarfdump/X86/debug_loclists_startx_length.s
new file mode 100644
index 00000000000..07c68ab2618
--- /dev/null
+++ b/test/tools/llvm-dwarfdump/X86/debug_loclists_startx_length.s
@@ -0,0 +1,27 @@
+# RUN: llvm-mc %s -filetype obj -triple x86_64-pc-linux -o %t.o
+# RUN: llvm-dwarfdump -v %t.o | FileCheck %s
+
+# DW_LLE_startx_length has different `length` encoding in pre-DWARF 5
+# and final DWARF 5 versions. This test checks we are able to parse
+# the final version which uses ULEB128 and not the U32.
+
+# CHECK:         .debug_loclists contents:
+# CHECK-NEXT:    0x00000000: locations list header: length = 0x0000000f, version = 0x0005, addr_size = 0x08, seg_size = 0x00, offset_entry_count = 0x00000000
+# CHECK-NEXT:    0x00000000:
+# CHECK-NEXT:    Addr idx 1 (w/ length 16): DW_OP_reg5 RDI
+
+.section .debug_loclists,"",@progbits
+ .long  .Ldebug_loclist_table_end0-.Ldebug_loclist_table_start0
+.Ldebug_loclist_table_start0:
+ .short 5         # Version.
+ .byte 8          # Address size.
+ .byte 0          # Segmen selector size.
+ .long 0          # Offset entry count.
+ 
+ .byte 3          # DW_LLE_startx_length
+ .byte 0x01       # Index
+ .uleb128 0x10    # Length
+ .short 1         # Loc expr size
+ .byte 85         # DW_OP_reg5
+ .byte 0          # DW_LLE_end_of_list
+.Ldebug_loclist_table_end0:
-- 
GitLab


From f86533efab9e99ad8dca05f1b8ffb52b9a119341 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 25 Oct 2018 11:15:57 +0000
Subject: [PATCH 0561/1116] [TargetLowering] Improve vXi64 UINT_TO_FP vXf64
 support (P38226)

As suggested on D52965, this patch moves the i64 to f64 UINT_TO_FP expansion code from LegalizeDAG into TargetLowering and makes it available to LegalizeVectorOps as well.

Not only does this help perform X86 lowering as a true vectorization instead of (partially vectorized) scalar conversions, it avoids the HADDPD op from the scalar code which can be slow on most targets.

The AVX512F does have the vcvtusi2sdq scalar operation but we don't unroll to use it as it seems to only help for the v2f64 case - otherwise the unrolling cost will certainly be too high. My feeling is that we should leave it to the vectorizers - and if it generates the vector UINT_TO_FP we should use it.

Differential Revision: https://reviews.llvm.org/D53649


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345256 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/TargetLowering.h         |   6 +
 lib/CodeGen/SelectionDAG/LegalizeDAG.cpp      |  31 +-
 .../SelectionDAG/LegalizeVectorOps.cpp        |   5 +
 lib/CodeGen/SelectionDAG/TargetLowering.cpp   |  42 ++
 test/CodeGen/X86/avx512-cvt.ll                | 200 ++-----
 test/CodeGen/X86/ftrunc.ll                    | 157 +++---
 test/CodeGen/X86/vec_int_to_fp.ll             | 513 +++++++++---------
 7 files changed, 431 insertions(+), 523 deletions(-)

diff --git a/include/llvm/CodeGen/TargetLowering.h b/include/llvm/CodeGen/TargetLowering.h
index 9fae319ac88..4dfc72ea52a 100644
--- a/include/llvm/CodeGen/TargetLowering.h
+++ b/include/llvm/CodeGen/TargetLowering.h
@@ -3663,6 +3663,12 @@ public:
   /// \returns True, if the expansion was successful, false otherwise
   bool expandFP_TO_SINT(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;
 
+  /// Expand UINT(i64) to double(f64) conversion
+  /// \param N Node to expand
+  /// \param Result output after conversion
+  /// \returns True, if the expansion was successful, false otherwise
+  bool expandUINT_TO_FP(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;
+
   /// Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
   SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const;
 
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index f6a6e064fa4..413a53d2e6e 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2369,30 +2369,6 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned, SDValue Op0,
   assert(!isSigned && "Legalize cannot Expand SINT_TO_FP for i64 yet");
   // Code below here assumes !isSigned without checking again.
 
-  // Implementation of unsigned i64 to f64 following the algorithm in
-  // __floatundidf in compiler_rt. This implementation has the advantage
-  // of performing rounding correctly, both in the default rounding mode
-  // and in all alternate rounding modes.
-  // TODO: Generalize this for use with other types.
-  if (SrcVT == MVT::i64 && DestVT == MVT::f64) {
-    LLVM_DEBUG(dbgs() << "Converting unsigned i64 to f64\n");
-    SDValue TwoP52 = DAG.getConstant(UINT64_C(0x4330000000000000), dl, SrcVT);
-    SDValue TwoP84PlusTwoP52 = DAG.getConstantFP(
-        BitsToDouble(UINT64_C(0x4530000000100000)), dl, DestVT);
-    SDValue TwoP84 = DAG.getConstant(UINT64_C(0x4530000000000000), dl, SrcVT);
-    SDValue LoMask = DAG.getConstant(UINT64_C(0x00000000FFFFFFFF), dl, SrcVT);
-    SDValue HiShift = DAG.getConstant(32, dl, ShiftVT);
-
-    SDValue Lo = DAG.getNode(ISD::AND, dl, SrcVT, Op0, LoMask);
-    SDValue Hi = DAG.getNode(ISD::SRL, dl, SrcVT, Op0, HiShift);
-    SDValue LoOr = DAG.getNode(ISD::OR, dl, SrcVT, Lo, TwoP52);
-    SDValue HiOr = DAG.getNode(ISD::OR, dl, SrcVT, Hi, TwoP84);
-    SDValue LoFlt = DAG.getNode(ISD::BITCAST, dl, DestVT, LoOr);
-    SDValue HiFlt = DAG.getNode(ISD::BITCAST, dl, DestVT, HiOr);
-    SDValue HiSub = DAG.getNode(ISD::FSUB, dl, DestVT, HiFlt, TwoP84PlusTwoP52);
-    return DAG.getNode(ISD::FADD, dl, DestVT, LoFlt, HiSub);
-  }
-
   // TODO: Generalize this for use with other types.
   if (SrcVT == MVT::i64 && DestVT == MVT::f32) {
     LLVM_DEBUG(dbgs() << "Converting unsigned i64 to f32\n");
@@ -2921,8 +2897,13 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     Results.push_back(Tmp1);
     break;
   }
-  case ISD::SINT_TO_FP:
   case ISD::UINT_TO_FP:
+    if (TLI.expandUINT_TO_FP(Node, Tmp1, DAG)) {
+      Results.push_back(Tmp1);
+      break;
+    }
+    LLVM_FALLTHROUGH
+  case ISD::SINT_TO_FP:
     Tmp1 = ExpandLegalINT_TO_FP(Node->getOpcode() == ISD::SINT_TO_FP,
                                 Node->getOperand(0), Node->getValueType(0), dl);
     Results.push_back(Tmp1);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 850cdcd1701..6554d5a27b2 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -1022,6 +1022,11 @@ SDValue VectorLegalizer::ExpandUINT_TO_FLOAT(SDValue Op) {
   EVT VT = Op.getOperand(0).getValueType();
   SDLoc DL(Op);
 
+  // Attempt to expand using TargetLowering.
+  SDValue Result;
+  if (TLI.expandUINT_TO_FP(Op.getNode(), Result, DAG))
+    return Result;
+
   // Make sure that the SINT_TO_FP and SRL instructions are available.
   if (TLI.getOperationAction(ISD::SINT_TO_FP, VT) == TargetLowering::Expand ||
       TLI.getOperationAction(ISD::SRL,        VT) == TargetLowering::Expand)
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 0189a11fa1d..1a29cb7ebf7 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -4137,6 +4137,48 @@ bool TargetLowering::expandFP_TO_SINT(SDNode *Node, SDValue &Result,
   return true;
 }
 
+bool TargetLowering::expandUINT_TO_FP(SDNode *Node, SDValue &Result,
+                                      SelectionDAG &DAG) const {
+  SDValue Src = Node->getOperand(0);
+  EVT SrcVT = Src.getValueType();
+  EVT DstVT = Node->getValueType(0);
+
+  if (SrcVT.getScalarType() != MVT::i64 || DstVT.getScalarType() != MVT::f64)
+    return false;
+
+  // Only expand vector types if we have the appropriate vector bit operations.
+  if (SrcVT.isVector() && (!isOperationLegalOrCustom(ISD::SRL, SrcVT) ||
+                           !isOperationLegalOrCustom(ISD::FADD, DstVT) ||
+                           !isOperationLegalOrCustom(ISD::FSUB, DstVT) ||
+                           !isOperationLegalOrCustomOrPromote(ISD::OR, SrcVT) ||
+                           !isOperationLegalOrCustomOrPromote(ISD::AND, SrcVT)))
+    return false;
+
+  SDLoc dl(SDValue(Node, 0));
+  EVT ShiftVT = getShiftAmountTy(SrcVT, DAG.getDataLayout());
+
+  // Implementation of unsigned i64 to f64 following the algorithm in
+  // __floatundidf in compiler_rt. This implementation has the advantage
+  // of performing rounding correctly, both in the default rounding mode
+  // and in all alternate rounding modes.
+  SDValue TwoP52 = DAG.getConstant(UINT64_C(0x4330000000000000), dl, SrcVT);
+  SDValue TwoP84PlusTwoP52 =
+      DAG.getConstantFP(BitsToDouble(UINT64_C(0x4530000000100000)), dl, DstVT);
+  SDValue TwoP84 = DAG.getConstant(UINT64_C(0x4530000000000000), dl, SrcVT);
+  SDValue LoMask = DAG.getConstant(UINT64_C(0x00000000FFFFFFFF), dl, SrcVT);
+  SDValue HiShift = DAG.getConstant(32, dl, ShiftVT);
+
+  SDValue Lo = DAG.getNode(ISD::AND, dl, SrcVT, Src, LoMask);
+  SDValue Hi = DAG.getNode(ISD::SRL, dl, SrcVT, Src, HiShift);
+  SDValue LoOr = DAG.getNode(ISD::OR, dl, SrcVT, Lo, TwoP52);
+  SDValue HiOr = DAG.getNode(ISD::OR, dl, SrcVT, Hi, TwoP84);
+  SDValue LoFlt = DAG.getBitcast(DstVT, LoOr);
+  SDValue HiFlt = DAG.getBitcast(DstVT, HiOr);
+  SDValue HiSub = DAG.getNode(ISD::FSUB, dl, DstVT, HiFlt, TwoP84PlusTwoP52);
+  Result = DAG.getNode(ISD::FADD, dl, DstVT, LoFlt, HiSub);
+  return true;
+}
+
 SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node,
                                               SelectionDAG &DAG) const {
   SDLoc dl(Node);
diff --git a/test/CodeGen/X86/avx512-cvt.ll b/test/CodeGen/X86/avx512-cvt.ll
index 4c089ac379c..e99cdaf1ce9 100644
--- a/test/CodeGen/X86/avx512-cvt.ll
+++ b/test/CodeGen/X86/avx512-cvt.ll
@@ -484,32 +484,12 @@ define <4 x float> @ulto4f32(<4 x i64> %a) {
 define <8 x double> @ulto8f64(<8 x i64> %a) {
 ; NODQ-LABEL: ulto8f64:
 ; NODQ:       # %bb.0:
-; NODQ-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
-; NODQ-NEXT:    vpextrq $1, %xmm1, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm2, %xmm2
-; NODQ-NEXT:    vmovq %xmm1, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm3, %xmm1
-; NODQ-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; NODQ-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
-; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm3, %xmm3
-; NODQ-NEXT:    vmovq %xmm2, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm4, %xmm2
-; NODQ-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; NODQ-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm4, %xmm3
-; NODQ-NEXT:    vmovq %xmm2, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm4, %xmm2
-; NODQ-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm4, %xmm3
-; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm4, %xmm0
-; NODQ-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; NODQ-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; NODQ-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; NODQ-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1
+; NODQ-NEXT:    vporq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; NODQ-NEXT:    vpsrlq $32, %zmm0, %zmm0
+; NODQ-NEXT:    vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; NODQ-NEXT:    vsubpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; NODQ-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
 ; NODQ-NEXT:    retq
 ;
 ; VLDQ-LABEL: ulto8f64:
@@ -524,32 +504,12 @@ define <8 x double> @ulto8f64(<8 x i64> %a) {
 ;
 ; KNL_WIDEN-LABEL: ulto8f64:
 ; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm1, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm2, %xmm2
-; KNL_WIDEN-NEXT:    vmovq %xmm1, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm3, %xmm1
-; KNL_WIDEN-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; KNL_WIDEN-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm2, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm3, %xmm3
-; KNL_WIDEN-NEXT:    vmovq %xmm2, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm4, %xmm2
-; KNL_WIDEN-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; KNL_WIDEN-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; KNL_WIDEN-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm2, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm4, %xmm3
-; KNL_WIDEN-NEXT:    vmovq %xmm2, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm4, %xmm2
-; KNL_WIDEN-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm4, %xmm3
-; KNL_WIDEN-NEXT:    vmovq %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm4, %xmm0
-; KNL_WIDEN-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; KNL_WIDEN-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; KNL_WIDEN-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; KNL_WIDEN-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1
+; KNL_WIDEN-NEXT:    vporq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; KNL_WIDEN-NEXT:    vpsrlq $32, %zmm0, %zmm0
+; KNL_WIDEN-NEXT:    vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; KNL_WIDEN-NEXT:    vsubpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; KNL_WIDEN-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
 ; KNL_WIDEN-NEXT:    retq
   %b = uitofp <8 x i64> %a to <8 x double>
   ret <8 x double> %b
@@ -558,58 +518,22 @@ define <8 x double> @ulto8f64(<8 x i64> %a) {
 define <16 x double> @ulto16f64(<16 x i64> %a) {
 ; NODQ-LABEL: ulto16f64:
 ; NODQ:       # %bb.0:
-; NODQ-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
-; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm3, %xmm3
-; NODQ-NEXT:    vmovq %xmm2, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm4, %xmm2
-; NODQ-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; NODQ-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
-; NODQ-NEXT:    vpextrq $1, %xmm3, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm4, %xmm4
-; NODQ-NEXT:    vmovq %xmm3, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm3
-; NODQ-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; NODQ-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; NODQ-NEXT:    vpextrq $1, %xmm3, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm4
-; NODQ-NEXT:    vmovq %xmm3, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm3
-; NODQ-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm4
-; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm0
-; NODQ-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; NODQ-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; NODQ-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; NODQ-NEXT:    vextracti32x4 $3, %zmm1, %xmm2
-; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm3
-; NODQ-NEXT:    vmovq %xmm2, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm2
-; NODQ-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; NODQ-NEXT:    vextracti32x4 $2, %zmm1, %xmm3
-; NODQ-NEXT:    vpextrq $1, %xmm3, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm4
-; NODQ-NEXT:    vmovq %xmm3, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm3
-; NODQ-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; NODQ-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; NODQ-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; NODQ-NEXT:    vpextrq $1, %xmm3, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm4
-; NODQ-NEXT:    vmovq %xmm3, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm3
-; NODQ-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; NODQ-NEXT:    vpextrq $1, %xmm1, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm4
-; NODQ-NEXT:    vmovq %xmm1, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm1
-; NODQ-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
-; NODQ-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; NODQ-NEXT:    vinsertf64x4 $1, %ymm2, %zmm1, %zmm1
+; NODQ-NEXT:    vpbroadcastq {{.*#+}} zmm2 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295]
+; NODQ-NEXT:    vpandq %zmm2, %zmm0, %zmm3
+; NODQ-NEXT:    vpbroadcastq {{.*#+}} zmm4 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
+; NODQ-NEXT:    vporq %zmm4, %zmm3, %zmm3
+; NODQ-NEXT:    vpsrlq $32, %zmm0, %zmm0
+; NODQ-NEXT:    vpbroadcastq {{.*#+}} zmm5 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
+; NODQ-NEXT:    vporq %zmm5, %zmm0, %zmm0
+; NODQ-NEXT:    vbroadcastsd {{.*#+}} zmm6 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
+; NODQ-NEXT:    vsubpd %zmm6, %zmm0, %zmm0
+; NODQ-NEXT:    vaddpd %zmm0, %zmm3, %zmm0
+; NODQ-NEXT:    vpandq %zmm2, %zmm1, %zmm2
+; NODQ-NEXT:    vporq %zmm4, %zmm2, %zmm2
+; NODQ-NEXT:    vpsrlq $32, %zmm1, %zmm1
+; NODQ-NEXT:    vporq %zmm5, %zmm1, %zmm1
+; NODQ-NEXT:    vsubpd %zmm6, %zmm1, %zmm1
+; NODQ-NEXT:    vaddpd %zmm1, %zmm2, %zmm1
 ; NODQ-NEXT:    retq
 ;
 ; VLDQ-LABEL: ulto16f64:
@@ -626,58 +550,22 @@ define <16 x double> @ulto16f64(<16 x i64> %a) {
 ;
 ; KNL_WIDEN-LABEL: ulto16f64:
 ; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm2, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm3, %xmm3
-; KNL_WIDEN-NEXT:    vmovq %xmm2, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm4, %xmm2
-; KNL_WIDEN-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; KNL_WIDEN-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm3, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm4, %xmm4
-; KNL_WIDEN-NEXT:    vmovq %xmm3, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm3
-; KNL_WIDEN-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; KNL_WIDEN-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; KNL_WIDEN-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm3, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm4
-; KNL_WIDEN-NEXT:    vmovq %xmm3, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm3
-; KNL_WIDEN-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm4
-; KNL_WIDEN-NEXT:    vmovq %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm0
-; KNL_WIDEN-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; KNL_WIDEN-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; KNL_WIDEN-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; KNL_WIDEN-NEXT:    vextracti32x4 $3, %zmm1, %xmm2
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm2, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm3
-; KNL_WIDEN-NEXT:    vmovq %xmm2, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm2
-; KNL_WIDEN-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; KNL_WIDEN-NEXT:    vextracti32x4 $2, %zmm1, %xmm3
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm3, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm4
-; KNL_WIDEN-NEXT:    vmovq %xmm3, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm3
-; KNL_WIDEN-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; KNL_WIDEN-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; KNL_WIDEN-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm3, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm4
-; KNL_WIDEN-NEXT:    vmovq %xmm3, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm3
-; KNL_WIDEN-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm1, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm4
-; KNL_WIDEN-NEXT:    vmovq %xmm1, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm1
-; KNL_WIDEN-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
-; KNL_WIDEN-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; KNL_WIDEN-NEXT:    vinsertf64x4 $1, %ymm2, %zmm1, %zmm1
+; KNL_WIDEN-NEXT:    vpbroadcastq {{.*#+}} zmm2 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295]
+; KNL_WIDEN-NEXT:    vpandq %zmm2, %zmm0, %zmm3
+; KNL_WIDEN-NEXT:    vpbroadcastq {{.*#+}} zmm4 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
+; KNL_WIDEN-NEXT:    vporq %zmm4, %zmm3, %zmm3
+; KNL_WIDEN-NEXT:    vpsrlq $32, %zmm0, %zmm0
+; KNL_WIDEN-NEXT:    vpbroadcastq {{.*#+}} zmm5 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
+; KNL_WIDEN-NEXT:    vporq %zmm5, %zmm0, %zmm0
+; KNL_WIDEN-NEXT:    vbroadcastsd {{.*#+}} zmm6 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
+; KNL_WIDEN-NEXT:    vsubpd %zmm6, %zmm0, %zmm0
+; KNL_WIDEN-NEXT:    vaddpd %zmm0, %zmm3, %zmm0
+; KNL_WIDEN-NEXT:    vpandq %zmm2, %zmm1, %zmm2
+; KNL_WIDEN-NEXT:    vporq %zmm4, %zmm2, %zmm2
+; KNL_WIDEN-NEXT:    vpsrlq $32, %zmm1, %zmm1
+; KNL_WIDEN-NEXT:    vporq %zmm5, %zmm1, %zmm1
+; KNL_WIDEN-NEXT:    vsubpd %zmm6, %zmm1, %zmm1
+; KNL_WIDEN-NEXT:    vaddpd %zmm1, %zmm2, %zmm1
 ; KNL_WIDEN-NEXT:    retq
   %b = uitofp <16 x i64> %a to <16 x double>
   ret <16 x double> %b
diff --git a/test/CodeGen/X86/ftrunc.ll b/test/CodeGen/X86/ftrunc.ll
index 01112f48bf4..ff40f619853 100644
--- a/test/CodeGen/X86/ftrunc.ll
+++ b/test/CodeGen/X86/ftrunc.ll
@@ -106,39 +106,34 @@ define <4 x float> @trunc_unsigned_v4f32(<4 x float> %x) #0 {
 define <2 x double> @trunc_unsigned_v2f64(<2 x double> %x) #0 {
 ; SSE2-LABEL: trunc_unsigned_v2f64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movapd %xmm0, %xmm1
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 ; SSE2-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; SSE2-NEXT:    movapd %xmm1, %xmm3
-; SSE2-NEXT:    subsd %xmm2, %xmm3
-; SSE2-NEXT:    cvttsd2si %xmm3, %rax
-; SSE2-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; SSE2-NEXT:    xorq %rcx, %rax
-; SSE2-NEXT:    cvttsd2si %xmm1, %rdx
-; SSE2-NEXT:    ucomisd %xmm2, %xmm1
-; SSE2-NEXT:    cmovaeq %rax, %rdx
 ; SSE2-NEXT:    movapd %xmm0, %xmm1
 ; SSE2-NEXT:    subsd %xmm2, %xmm1
 ; SSE2-NEXT:    cvttsd2si %xmm1, %rax
+; SSE2-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; SSE2-NEXT:    xorq %rcx, %rax
+; SSE2-NEXT:    cvttsd2si %xmm0, %rdx
+; SSE2-NEXT:    ucomisd %xmm2, %xmm0
+; SSE2-NEXT:    cmovaeq %rax, %rdx
+; SSE2-NEXT:    movq %rdx, %xmm1
+; SSE2-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT:    movapd %xmm0, %xmm3
+; SSE2-NEXT:    subsd %xmm2, %xmm3
+; SSE2-NEXT:    cvttsd2si %xmm3, %rax
 ; SSE2-NEXT:    xorq %rcx, %rax
 ; SSE2-NEXT:    cvttsd2si %xmm0, %rcx
 ; SSE2-NEXT:    ucomisd %xmm2, %xmm0
 ; SSE2-NEXT:    cmovaeq %rax, %rcx
-; SSE2-NEXT:    movq %rcx, %xmm1
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT:    movapd {{.*#+}} xmm3 = [4503599627370496,1.9342813113834067E+25]
-; SSE2-NEXT:    subpd %xmm3, %xmm1
+; SSE2-NEXT:    movq %rcx, %xmm0
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [4294967295,4294967295]
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    por {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    psrlq $32, %xmm1
+; SSE2-NEXT:    por {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    subpd {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    addpd %xmm0, %xmm1
 ; SSE2-NEXT:    movapd %xmm1, %xmm0
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE2-NEXT:    addpd %xmm1, %xmm0
-; SSE2-NEXT:    movq %rdx, %xmm1
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT:    subpd %xmm3, %xmm1
-; SSE2-NEXT:    movapd %xmm1, %xmm2
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
-; SSE2-NEXT:    addpd %xmm1, %xmm2
-; SSE2-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: trunc_unsigned_v2f64:
@@ -158,68 +153,62 @@ define <2 x double> @trunc_unsigned_v2f64(<2 x double> %x) #0 {
 define <4 x double> @trunc_unsigned_v4f64(<4 x double> %x) #0 {
 ; SSE2-LABEL: trunc_unsigned_v4f64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movapd %xmm1, %xmm3
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
-; SSE2-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; SSE2-NEXT:    movapd %xmm3, %xmm4
-; SSE2-NEXT:    subsd %xmm2, %xmm4
-; SSE2-NEXT:    cvttsd2si %xmm4, %rcx
-; SSE2-NEXT:    movabsq $-9223372036854775808, %rdx # imm = 0x8000000000000000
-; SSE2-NEXT:    xorq %rdx, %rcx
-; SSE2-NEXT:    cvttsd2si %xmm3, %rax
-; SSE2-NEXT:    ucomisd %xmm2, %xmm3
-; SSE2-NEXT:    cmovaeq %rcx, %rax
-; SSE2-NEXT:    movapd %xmm1, %xmm3
-; SSE2-NEXT:    subsd %xmm2, %xmm3
-; SSE2-NEXT:    cvttsd2si %xmm3, %rsi
-; SSE2-NEXT:    xorq %rdx, %rsi
+; SSE2-NEXT:    movapd %xmm1, %xmm2
+; SSE2-NEXT:    movsd {{.*#+}} xmm3 = mem[0],zero
+; SSE2-NEXT:    subsd %xmm3, %xmm1
 ; SSE2-NEXT:    cvttsd2si %xmm1, %rcx
-; SSE2-NEXT:    ucomisd %xmm2, %xmm1
-; SSE2-NEXT:    cmovaeq %rsi, %rcx
-; SSE2-NEXT:    movapd %xmm0, %xmm1
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT:    movapd %xmm1, %xmm3
-; SSE2-NEXT:    subsd %xmm2, %xmm3
-; SSE2-NEXT:    cvttsd2si %xmm3, %rsi
-; SSE2-NEXT:    xorq %rdx, %rsi
-; SSE2-NEXT:    cvttsd2si %xmm1, %rdi
-; SSE2-NEXT:    ucomisd %xmm2, %xmm1
-; SSE2-NEXT:    cmovaeq %rsi, %rdi
-; SSE2-NEXT:    movapd %xmm0, %xmm1
-; SSE2-NEXT:    subsd %xmm2, %xmm1
-; SSE2-NEXT:    cvttsd2si %xmm1, %rsi
-; SSE2-NEXT:    xorq %rdx, %rsi
-; SSE2-NEXT:    cvttsd2si %xmm0, %rdx
-; SSE2-NEXT:    ucomisd %xmm2, %xmm0
-; SSE2-NEXT:    cmovaeq %rsi, %rdx
+; SSE2-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; SSE2-NEXT:    xorq %rax, %rcx
+; SSE2-NEXT:    cvttsd2si %xmm2, %rdx
+; SSE2-NEXT:    ucomisd %xmm3, %xmm2
+; SSE2-NEXT:    cmovaeq %rcx, %rdx
 ; SSE2-NEXT:    movq %rdx, %xmm1
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT:    movapd {{.*#+}} xmm3 = [4503599627370496,1.9342813113834067E+25]
-; SSE2-NEXT:    subpd %xmm3, %xmm1
-; SSE2-NEXT:    movapd %xmm1, %xmm0
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE2-NEXT:    addpd %xmm1, %xmm0
-; SSE2-NEXT:    movq %rdi, %xmm1
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT:    subpd %xmm3, %xmm1
-; SSE2-NEXT:    movapd %xmm1, %xmm4
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
-; SSE2-NEXT:    addpd %xmm1, %xmm4
-; SSE2-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; SSE2-NEXT:    movq %rcx, %xmm4
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE2-NEXT:    subpd %xmm3, %xmm4
-; SSE2-NEXT:    movapd %xmm4, %xmm1
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
-; SSE2-NEXT:    addpd %xmm4, %xmm1
-; SSE2-NEXT:    movq %rax, %xmm4
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE2-NEXT:    subpd %xmm3, %xmm4
-; SSE2-NEXT:    movapd %xmm4, %xmm2
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1]
-; SSE2-NEXT:    addpd %xmm4, %xmm2
-; SSE2-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
+; SSE2-NEXT:    movapd %xmm2, %xmm4
+; SSE2-NEXT:    subsd %xmm3, %xmm4
+; SSE2-NEXT:    cvttsd2si %xmm4, %rcx
+; SSE2-NEXT:    xorq %rax, %rcx
+; SSE2-NEXT:    cvttsd2si %xmm2, %rdx
+; SSE2-NEXT:    ucomisd %xmm3, %xmm2
+; SSE2-NEXT:    cmovaeq %rcx, %rdx
+; SSE2-NEXT:    movq %rdx, %xmm2
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT:    movapd %xmm0, %xmm2
+; SSE2-NEXT:    subsd %xmm3, %xmm2
+; SSE2-NEXT:    cvttsd2si %xmm2, %rcx
+; SSE2-NEXT:    xorq %rax, %rcx
+; SSE2-NEXT:    cvttsd2si %xmm0, %rdx
+; SSE2-NEXT:    ucomisd %xmm3, %xmm0
+; SSE2-NEXT:    cmovaeq %rcx, %rdx
+; SSE2-NEXT:    movq %rdx, %xmm2
+; SSE2-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT:    movapd %xmm0, %xmm4
+; SSE2-NEXT:    subsd %xmm3, %xmm4
+; SSE2-NEXT:    cvttsd2si %xmm4, %rcx
+; SSE2-NEXT:    xorq %rax, %rcx
+; SSE2-NEXT:    cvttsd2si %xmm0, %rax
+; SSE2-NEXT:    ucomisd %xmm3, %xmm0
+; SSE2-NEXT:    cmovaeq %rcx, %rax
+; SSE2-NEXT:    movq %rax, %xmm0
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [4294967295,4294967295]
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm0, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
+; SSE2-NEXT:    por %xmm4, %xmm3
+; SSE2-NEXT:    psrlq $32, %xmm2
+; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
+; SSE2-NEXT:    por %xmm5, %xmm2
+; SSE2-NEXT:    movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
+; SSE2-NEXT:    subpd %xmm6, %xmm2
+; SSE2-NEXT:    addpd %xmm3, %xmm2
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    psrlq $32, %xmm1
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    subpd %xmm6, %xmm1
+; SSE2-NEXT:    addpd %xmm0, %xmm1
+; SSE2-NEXT:    movapd %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: trunc_unsigned_v4f64:
diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll
index 2b8ceeba7f3..14cce63ca96 100644
--- a/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/test/CodeGen/X86/vec_int_to_fp.ll
@@ -497,63 +497,67 @@ define <4 x double> @sitofp_16i8_to_4f64(<16 x i8> %a) {
 define <2 x double> @uitofp_2i64_to_2f64(<2 x i64> %a) {
 ; SSE2-LABEL: uitofp_2i64_to_2f64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT:    movapd {{.*#+}} xmm4 = [4503599627370496,1.9342813113834067E+25]
-; SSE2-NEXT:    subpd %xmm4, %xmm0
-; SSE2-NEXT:    movapd %xmm0, %xmm1
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT:    addpd %xmm0, %xmm1
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE2-NEXT:    subpd %xmm4, %xmm3
-; SSE2-NEXT:    movapd %xmm3, %xmm0
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
-; SSE2-NEXT:    addpd %xmm3, %xmm0
-; SSE2-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [4294967295,4294967295]
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    por {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    psrlq $32, %xmm0
+; SSE2-NEXT:    por {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    subpd {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    addpd %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: uitofp_2i64_to_2f64:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE41-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE41-NEXT:    movapd {{.*#+}} xmm3 = [4503599627370496,1.9342813113834067E+25]
-; SSE41-NEXT:    subpd %xmm3, %xmm0
-; SSE41-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE41-NEXT:    subpd %xmm3, %xmm2
-; SSE41-NEXT:    haddpd %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT:    por {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    psrlq $32, %xmm0
+; SSE41-NEXT:    por {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    subpd {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    addpd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; VEX-LABEL: uitofp_2i64_to_2f64:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vmovapd {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
-; VEX-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; VEX-NEXT:    vmovapd {{.*#+}} xmm3 = [4503599627370496,1.9342813113834067E+25]
-; VEX-NEXT:    vsubpd %xmm3, %xmm2, %xmm2
-; VEX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; VEX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; VEX-NEXT:    vsubpd %xmm3, %xmm0, %xmm0
-; VEX-NEXT:    vhaddpd %xmm0, %xmm2, %xmm0
-; VEX-NEXT:    retq
+; AVX1-LABEL: uitofp_2i64_to_2f64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vsubpd {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: uitofp_2i64_to_2f64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT:    vpor {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; AVX2-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vsubpd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: uitofp_2i64_to_2f64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2sdq %rax, %xmm1, %xmm1
-; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2sdq %rax, %xmm2, %xmm0
-; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX512F-NEXT:    vpor {{.*}}(%rip), %xmm1, %xmm1
+; AVX512F-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; AVX512F-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT:    vsubpd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: uitofp_2i64_to_2f64:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2sdq %rax, %xmm1, %xmm1
-; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2sdq %rax, %xmm2, %xmm0
-; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm1
+; AVX512VL-NEXT:    vpor {{.*}}(%rip), %xmm1, %xmm1
+; AVX512VL-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    vsubpd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: uitofp_2i64_to_2f64:
@@ -837,104 +841,96 @@ define <2 x double> @uitofp_16i8_to_2f64(<16 x i8> %a) {
 define <4 x double> @uitofp_4i64_to_4f64(<4 x i64> %a) {
 ; SSE2-LABEL: uitofp_4i64_to_4f64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE2-NEXT:    movapd {{.*#+}} xmm5 = [4503599627370496,1.9342813113834067E+25]
-; SSE2-NEXT:    subpd %xmm5, %xmm2
-; SSE2-NEXT:    movapd %xmm2, %xmm0
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
-; SSE2-NEXT:    addpd %xmm2, %xmm0
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; SSE2-NEXT:    subpd %xmm5, %xmm4
-; SSE2-NEXT:    movapd %xmm4, %xmm2
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1]
-; SSE2-NEXT:    addpd %xmm4, %xmm2
-; SSE2-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE2-NEXT:    subpd %xmm5, %xmm1
-; SSE2-NEXT:    movapd %xmm1, %xmm2
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
-; SSE2-NEXT:    addpd %xmm1, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; SSE2-NEXT:    subpd %xmm5, %xmm4
-; SSE2-NEXT:    movapd %xmm4, %xmm1
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
-; SSE2-NEXT:    addpd %xmm4, %xmm1
-; SSE2-NEXT:    unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; SSE2-NEXT:    movapd %xmm2, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [4294967295,4294967295]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
+; SSE2-NEXT:    por %xmm4, %xmm3
+; SSE2-NEXT:    psrlq $32, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
+; SSE2-NEXT:    por %xmm5, %xmm0
+; SSE2-NEXT:    movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
+; SSE2-NEXT:    subpd %xmm6, %xmm0
+; SSE2-NEXT:    addpd %xmm3, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm4, %xmm2
+; SSE2-NEXT:    psrlq $32, %xmm1
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    subpd %xmm6, %xmm1
+; SSE2-NEXT:    addpd %xmm2, %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: uitofp_4i64_to_4f64:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; SSE41-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE41-NEXT:    movapd {{.*#+}} xmm4 = [4503599627370496,1.9342813113834067E+25]
-; SSE41-NEXT:    subpd %xmm4, %xmm0
-; SSE41-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE41-NEXT:    subpd %xmm4, %xmm3
-; SSE41-NEXT:    haddpd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
-; SSE41-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE41-NEXT:    subpd %xmm4, %xmm1
-; SSE41-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE41-NEXT:    subpd %xmm4, %xmm3
-; SSE41-NEXT:    haddpd %xmm3, %xmm1
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
+; SSE41-NEXT:    por %xmm4, %xmm3
+; SSE41-NEXT:    psrlq $32, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
+; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
+; SSE41-NEXT:    subpd %xmm6, %xmm0
+; SSE41-NEXT:    addpd %xmm3, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE41-NEXT:    por %xmm4, %xmm2
+; SSE41-NEXT:    psrlq $32, %xmm1
+; SSE41-NEXT:    por %xmm5, %xmm1
+; SSE41-NEXT:    subpd %xmm6, %xmm1
+; SSE41-NEXT:    addpd %xmm2, %xmm1
 ; SSE41-NEXT:    retq
 ;
-; VEX-LABEL: uitofp_4i64_to_4f64:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; VEX-NEXT:    vmovapd {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
-; VEX-NEXT:    vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; VEX-NEXT:    vmovapd {{.*#+}} xmm4 = [4503599627370496,1.9342813113834067E+25]
-; VEX-NEXT:    vsubpd %xmm4, %xmm3, %xmm3
-; VEX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; VEX-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; VEX-NEXT:    vsubpd %xmm4, %xmm1, %xmm1
-; VEX-NEXT:    vhaddpd %xmm1, %xmm3, %xmm1
-; VEX-NEXT:    vunpcklps {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; VEX-NEXT:    vsubpd %xmm4, %xmm3, %xmm3
-; VEX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; VEX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; VEX-NEXT:    vsubpd %xmm4, %xmm0, %xmm0
-; VEX-NEXT:    vhaddpd %xmm0, %xmm3, %xmm0
-; VEX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; VEX-NEXT:    retq
+; AVX1-LABEL: uitofp_4i64_to_4f64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX1-NEXT:    vorps {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT:    vorpd {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    vsubpd {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: uitofp_4i64_to_4f64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
+; AVX2-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
+; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
+; AVX2-NEXT:    vsubpd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: uitofp_4i64_to_4f64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512F-NEXT:    vcvtusi2sdq %rax, %xmm2, %xmm2
-; AVX512F-NEXT:    vmovq %xmm1, %rax
-; AVX512F-NEXT:    vcvtusi2sdq %rax, %xmm3, %xmm1
-; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2sdq %rax, %xmm3, %xmm2
-; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2sdq %rax, %xmm3, %xmm0
-; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX512F-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
+; AVX512F-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vpsrlq $32, %ymm0, %ymm0
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
+; AVX512F-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
+; AVX512F-NEXT:    vsubpd %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: uitofp_4i64_to_4f64:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512VL-NEXT:    vcvtusi2sdq %rax, %xmm2, %xmm2
-; AVX512VL-NEXT:    vmovq %xmm1, %rax
-; AVX512VL-NEXT:    vcvtusi2sdq %rax, %xmm3, %xmm1
-; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2sdq %rax, %xmm3, %xmm2
-; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2sdq %rax, %xmm3, %xmm0
-; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpandq {{.*}}(%rip){1to4}, %ymm0, %ymm1
+; AVX512VL-NEXT:    vporq {{.*}}(%rip){1to4}, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpsrlq $32, %ymm0, %ymm0
+; AVX512VL-NEXT:    vporq {{.*}}(%rip){1to4}, %ymm0, %ymm0
+; AVX512VL-NEXT:    vsubpd {{.*}}(%rip){1to4}, %ymm0, %ymm0
+; AVX512VL-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: uitofp_4i64_to_4f64:
@@ -3446,67 +3442,73 @@ define <4 x double> @sitofp_load_4i8_to_4f64(<4 x i8> *%a) {
 define <2 x double> @uitofp_load_2i64_to_2f64(<2 x i64> *%a) {
 ; SSE2-LABEL: uitofp_load_2i64_to_2f64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa (%rdi), %xmm1
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT:    movapd {{.*#+}} xmm4 = [4503599627370496,1.9342813113834067E+25]
-; SSE2-NEXT:    subpd %xmm4, %xmm1
-; SSE2-NEXT:    movapd %xmm1, %xmm0
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [4294967295,4294967295]
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    por {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    psrlq $32, %xmm0
+; SSE2-NEXT:    por {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    subpd {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    addpd %xmm1, %xmm0
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE2-NEXT:    subpd %xmm4, %xmm3
-; SSE2-NEXT:    movapd %xmm3, %xmm1
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
-; SSE2-NEXT:    addpd %xmm3, %xmm1
-; SSE2-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: uitofp_load_2i64_to_2f64:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa (%rdi), %xmm0
-; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE41-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE41-NEXT:    movapd {{.*#+}} xmm3 = [4503599627370496,1.9342813113834067E+25]
-; SSE41-NEXT:    subpd %xmm3, %xmm0
-; SSE41-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE41-NEXT:    subpd %xmm3, %xmm2
-; SSE41-NEXT:    haddpd %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT:    por {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    psrlq $32, %xmm0
+; SSE41-NEXT:    por {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    subpd {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    addpd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; VEX-LABEL: uitofp_load_2i64_to_2f64:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vmovapd (%rdi), %xmm0
-; VEX-NEXT:    vmovapd {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
-; VEX-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; VEX-NEXT:    vmovapd {{.*#+}} xmm3 = [4503599627370496,1.9342813113834067E+25]
-; VEX-NEXT:    vsubpd %xmm3, %xmm2, %xmm2
-; VEX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; VEX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; VEX-NEXT:    vsubpd %xmm3, %xmm0, %xmm0
-; VEX-NEXT:    vhaddpd %xmm0, %xmm2, %xmm0
-; VEX-NEXT:    retq
+; AVX1-LABEL: uitofp_load_2i64_to_2f64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vsubpd {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: uitofp_load_2i64_to_2f64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT:    vpor {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; AVX2-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vsubpd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: uitofp_load_2i64_to_2f64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2sdq %rax, %xmm1, %xmm1
-; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2sdq %rax, %xmm2, %xmm0
-; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX512F-NEXT:    vpor {{.*}}(%rip), %xmm1, %xmm1
+; AVX512F-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; AVX512F-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT:    vsubpd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: uitofp_load_2i64_to_2f64:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2sdq %rax, %xmm1, %xmm1
-; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2sdq %rax, %xmm2, %xmm0
-; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm1
+; AVX512VL-NEXT:    vpor {{.*}}(%rip), %xmm1, %xmm1
+; AVX512VL-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    vsubpd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: uitofp_load_2i64_to_2f64:
@@ -3652,109 +3654,104 @@ define <2 x double> @uitofp_load_2i8_to_2f64(<2 x i8> *%a) {
 define <4 x double> @uitofp_load_4i64_to_4f64(<4 x i64> *%a) {
 ; SSE2-LABEL: uitofp_load_4i64_to_4f64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa (%rdi), %xmm1
-; SSE2-NEXT:    movdqa 16(%rdi), %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE2-NEXT:    movapd {{.*#+}} xmm5 = [4503599627370496,1.9342813113834067E+25]
-; SSE2-NEXT:    subpd %xmm5, %xmm1
-; SSE2-NEXT:    movapd %xmm1, %xmm0
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE2-NEXT:    addpd %xmm1, %xmm0
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; SSE2-NEXT:    subpd %xmm5, %xmm4
-; SSE2-NEXT:    movapd %xmm4, %xmm1
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
-; SSE2-NEXT:    addpd %xmm4, %xmm1
-; SSE2-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE2-NEXT:    subpd %xmm5, %xmm2
-; SSE2-NEXT:    movapd %xmm2, %xmm1
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [4294967295,4294967295]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
+; SSE2-NEXT:    por %xmm4, %xmm3
+; SSE2-NEXT:    psrlq $32, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
+; SSE2-NEXT:    por %xmm5, %xmm0
+; SSE2-NEXT:    movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
+; SSE2-NEXT:    subpd %xmm6, %xmm0
+; SSE2-NEXT:    addpd %xmm3, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm4, %xmm2
+; SSE2-NEXT:    psrlq $32, %xmm1
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    subpd %xmm6, %xmm1
 ; SSE2-NEXT:    addpd %xmm2, %xmm1
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; SSE2-NEXT:    subpd %xmm5, %xmm4
-; SSE2-NEXT:    movapd %xmm4, %xmm2
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1]
-; SSE2-NEXT:    addpd %xmm4, %xmm2
-; SSE2-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: uitofp_load_4i64_to_4f64:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa (%rdi), %xmm0
 ; SSE41-NEXT:    movdqa 16(%rdi), %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; SSE41-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE41-NEXT:    movapd {{.*#+}} xmm4 = [4503599627370496,1.9342813113834067E+25]
-; SSE41-NEXT:    subpd %xmm4, %xmm0
-; SSE41-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE41-NEXT:    subpd %xmm4, %xmm3
-; SSE41-NEXT:    haddpd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
-; SSE41-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE41-NEXT:    subpd %xmm4, %xmm1
-; SSE41-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE41-NEXT:    subpd %xmm4, %xmm3
-; SSE41-NEXT:    haddpd %xmm3, %xmm1
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
+; SSE41-NEXT:    por %xmm4, %xmm3
+; SSE41-NEXT:    psrlq $32, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
+; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
+; SSE41-NEXT:    subpd %xmm6, %xmm0
+; SSE41-NEXT:    addpd %xmm3, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE41-NEXT:    por %xmm4, %xmm2
+; SSE41-NEXT:    psrlq $32, %xmm1
+; SSE41-NEXT:    por %xmm5, %xmm1
+; SSE41-NEXT:    subpd %xmm6, %xmm1
+; SSE41-NEXT:    addpd %xmm2, %xmm1
 ; SSE41-NEXT:    retq
 ;
-; VEX-LABEL: uitofp_load_4i64_to_4f64:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vmovapd (%rdi), %ymm0
-; VEX-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; VEX-NEXT:    vmovapd {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
-; VEX-NEXT:    vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; VEX-NEXT:    vmovapd {{.*#+}} xmm4 = [4503599627370496,1.9342813113834067E+25]
-; VEX-NEXT:    vsubpd %xmm4, %xmm3, %xmm3
-; VEX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; VEX-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; VEX-NEXT:    vsubpd %xmm4, %xmm1, %xmm1
-; VEX-NEXT:    vhaddpd %xmm1, %xmm3, %xmm1
-; VEX-NEXT:    vunpcklps {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; VEX-NEXT:    vsubpd %xmm4, %xmm3, %xmm3
-; VEX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; VEX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; VEX-NEXT:    vsubpd %xmm4, %xmm0, %xmm0
-; VEX-NEXT:    vhaddpd %xmm0, %xmm3, %xmm0
-; VEX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; VEX-NEXT:    retq
+; AVX1-LABEL: uitofp_load_4i64_to_4f64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovaps (%rdi), %ymm0
+; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX1-NEXT:    vorps {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT:    vorpd {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    vsubpd {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: uitofp_load_4i64_to_4f64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
+; AVX2-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
+; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
+; AVX2-NEXT:    vsubpd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: uitofp_load_4i64_to_4f64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512F-NEXT:    vcvtusi2sdq %rax, %xmm2, %xmm2
-; AVX512F-NEXT:    vmovq %xmm1, %rax
-; AVX512F-NEXT:    vcvtusi2sdq %rax, %xmm3, %xmm1
-; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2sdq %rax, %xmm3, %xmm2
-; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2sdq %rax, %xmm3, %xmm0
-; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX512F-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
+; AVX512F-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vpsrlq $32, %ymm0, %ymm0
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
+; AVX512F-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
+; AVX512F-NEXT:    vsubpd %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: uitofp_load_4i64_to_4f64:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512VL-NEXT:    vcvtusi2sdq %rax, %xmm2, %xmm2
-; AVX512VL-NEXT:    vmovq %xmm1, %rax
-; AVX512VL-NEXT:    vcvtusi2sdq %rax, %xmm3, %xmm1
-; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2sdq %rax, %xmm3, %xmm2
-; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2sdq %rax, %xmm3, %xmm0
-; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpandq {{.*}}(%rip){1to4}, %ymm0, %ymm1
+; AVX512VL-NEXT:    vporq {{.*}}(%rip){1to4}, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpsrlq $32, %ymm0, %ymm0
+; AVX512VL-NEXT:    vporq {{.*}}(%rip){1to4}, %ymm0, %ymm0
+; AVX512VL-NEXT:    vsubpd {{.*}}(%rip){1to4}, %ymm0, %ymm0
+; AVX512VL-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: uitofp_load_4i64_to_4f64:
-- 
GitLab


From 8893acb1d7ddedb61140d4c9fab2826739fe869c Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 25 Oct 2018 11:38:17 +0000
Subject: [PATCH 0562/1116] Missing semicolon.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345257 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 413a53d2e6e..30e9413dc0a 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2902,7 +2902,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
       Results.push_back(Tmp1);
       break;
     }
-    LLVM_FALLTHROUGH
+    LLVM_FALLTHROUGH;
   case ISD::SINT_TO_FP:
     Tmp1 = ExpandLegalINT_TO_FP(Node->getOpcode() == ISD::SINT_TO_FP,
                                 Node->getOperand(0), Node->getValueType(0), dl);
-- 
GitLab


From aa7c2d802d931d71693b3c06f685a0cf49a4e9b8 Mon Sep 17 00:00:00 2001
From: Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>
Date: Thu, 25 Oct 2018 11:51:34 +0000
Subject: [PATCH 0563/1116] [llvm-mca] Removed a couple of redundant method
 declarations, and simplified code in ResourcePressureView. NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345259 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-mca/Views/ResourcePressureView.cpp | 13 ++++++++-----
 tools/llvm-mca/Views/ResourcePressureView.h   | 19 +++++--------------
 .../Views/RetireControlUnitStatistics.h       |  2 --
 tools/llvm-mca/Views/SchedulerStatistics.h    |  2 --
 tools/llvm-mca/Views/SummaryView.h            |  1 -
 tools/llvm-mca/include/InstrBuilder.h         | 16 ++--------------
 tools/llvm-mca/lib/InstrBuilder.cpp           |  8 ++++++++
 7 files changed, 23 insertions(+), 38 deletions(-)

diff --git a/tools/llvm-mca/Views/ResourcePressureView.cpp b/tools/llvm-mca/Views/ResourcePressureView.cpp
index 17c801259d9..e71825b07c7 100644
--- a/tools/llvm-mca/Views/ResourcePressureView.cpp
+++ b/tools/llvm-mca/Views/ResourcePressureView.cpp
@@ -20,7 +20,10 @@ namespace mca {
 
 using namespace llvm;
 
-void ResourcePressureView::initialize() {
+ResourcePressureView::ResourcePressureView(const llvm::MCSubtargetInfo &sti,
+                                           llvm::MCInstPrinter &Printer,
+                                           const SourceMgr &Sequence)
+    : STI(sti), MCIP(Printer), Source(Sequence) {
   // Populate the map of resource descriptors.
   unsigned R2VIndex = 0;
   const MCSchedModel &SM = STI.getSchedModel();
@@ -92,8 +95,7 @@ static void printResourcePressure(formatted_raw_ostream &OS, double Pressure,
   OS.PadToColumn(Col);
 }
 
-void ResourcePressureView::printResourcePressurePerIteration(
-    raw_ostream &OS, unsigned Executions) const {
+void ResourcePressureView::printResourcePressurePerIter(raw_ostream &OS) const {
   std::string Buffer;
   raw_string_ostream TempStream(Buffer);
   formatted_raw_ostream FOS(TempStream);
@@ -126,6 +128,7 @@ void ResourcePressureView::printResourcePressurePerIteration(
   FOS << '\n';
   FOS.flush();
 
+  const unsigned Executions = Source.getNumIterations();
   for (unsigned I = 0, E = NumResourceUnits; I < E; ++I) {
     double Usage = ResourceUsage[I + Source.size() * E];
     printResourcePressure(FOS, Usage / Executions, (I + 1) * 7);
@@ -135,8 +138,7 @@ void ResourcePressureView::printResourcePressurePerIteration(
   OS << Buffer;
 }
 
-void ResourcePressureView::printResourcePressurePerInstruction(
-    raw_ostream &OS, unsigned Executions) const {
+void ResourcePressureView::printResourcePressurePerInst(raw_ostream &OS) const {
   std::string Buffer;
   raw_string_ostream TempStream(Buffer);
   formatted_raw_ostream FOS(TempStream);
@@ -149,6 +151,7 @@ void ResourcePressureView::printResourcePressurePerInstruction(
   raw_string_ostream InstrStream(Instruction);
 
   unsigned InstrIndex = 0;
+  const unsigned Executions = Source.getNumIterations();
   for (const MCInst &MCI : Source) {
     unsigned BaseEltIdx = InstrIndex * NumResourceUnits;
     for (unsigned J = 0; J < NumResourceUnits; ++J) {
diff --git a/tools/llvm-mca/Views/ResourcePressureView.h b/tools/llvm-mca/Views/ResourcePressureView.h
index ad9c29a55e5..d413bcd80fd 100644
--- a/tools/llvm-mca/Views/ResourcePressureView.h
+++ b/tools/llvm-mca/Views/ResourcePressureView.h
@@ -82,26 +82,17 @@ class ResourcePressureView : public View {
   std::vector<ResourceCycles> ResourceUsage;
   unsigned NumResourceUnits;
 
-  const llvm::MCInst &GetMCInstFromIndex(unsigned Index) const;
-  void printResourcePressurePerIteration(llvm::raw_ostream &OS,
-                                         unsigned Executions) const;
-  void printResourcePressurePerInstruction(llvm::raw_ostream &OS,
-                                           unsigned Executions) const;
-  void initialize();
+  void printResourcePressurePerIter(llvm::raw_ostream &OS) const;
+  void printResourcePressurePerInst(llvm::raw_ostream &OS) const;
 
 public:
   ResourcePressureView(const llvm::MCSubtargetInfo &sti,
-                       llvm::MCInstPrinter &Printer, const SourceMgr &SM)
-      : STI(sti), MCIP(Printer), Source(SM) {
-    initialize();
-  }
+                       llvm::MCInstPrinter &Printer, const SourceMgr &SM);
 
   void onEvent(const HWInstructionEvent &Event) override;
-
   void printView(llvm::raw_ostream &OS) const override {
-    unsigned Executions = Source.getNumIterations();
-    printResourcePressurePerIteration(OS, Executions);
-    printResourcePressurePerInstruction(OS, Executions);
+    printResourcePressurePerIter(OS);
+    printResourcePressurePerInst(OS);
   }
 };
 } // namespace mca
diff --git a/tools/llvm-mca/Views/RetireControlUnitStatistics.h b/tools/llvm-mca/Views/RetireControlUnitStatistics.h
index 0531e389c90..e9be542a786 100644
--- a/tools/llvm-mca/Views/RetireControlUnitStatistics.h
+++ b/tools/llvm-mca/Views/RetireControlUnitStatistics.h
@@ -48,9 +48,7 @@ public:
   RetireControlUnitStatistics() : NumRetired(0), NumCycles(0) {}
 
   void onEvent(const HWInstructionEvent &Event) override;
-
   void onCycleBegin() override { NumCycles++; }
-
   void onCycleEnd() override { updateHistograms(); }
 
   void printView(llvm::raw_ostream &OS) const override;
diff --git a/tools/llvm-mca/Views/SchedulerStatistics.h b/tools/llvm-mca/Views/SchedulerStatistics.h
index de70db26ed4..3515546f083 100644
--- a/tools/llvm-mca/Views/SchedulerStatistics.h
+++ b/tools/llvm-mca/Views/SchedulerStatistics.h
@@ -70,9 +70,7 @@ public:
         Usage(STI.getSchedModel().NumProcResourceKinds, {0, 0, 0}) {}
 
   void onEvent(const HWInstructionEvent &Event) override;
-
   void onCycleBegin() override { NumCycles++; }
-
   void onCycleEnd() override { updateHistograms(); }
 
   // Increases the number of used scheduler queue slots of every buffered
diff --git a/tools/llvm-mca/Views/SummaryView.h b/tools/llvm-mca/Views/SummaryView.h
index 13875976d39..3d4585e1d5a 100644
--- a/tools/llvm-mca/Views/SummaryView.h
+++ b/tools/llvm-mca/Views/SummaryView.h
@@ -66,7 +66,6 @@ public:
               unsigned Width);
 
   void onCycleEnd() override { ++TotalCycles; }
-
   void onEvent(const HWInstructionEvent &Event) override;
 
   void printView(llvm::raw_ostream &OS) const override;
diff --git a/tools/llvm-mca/include/InstrBuilder.h b/tools/llvm-mca/include/InstrBuilder.h
index 31c52702058..0fd97cb1ed5 100644
--- a/tools/llvm-mca/include/InstrBuilder.h
+++ b/tools/llvm-mca/include/InstrBuilder.h
@@ -62,20 +62,8 @@ class InstrBuilder {
                               const llvm::MCInst &MCI) const;
 
 public:
-  InstrBuilder(const llvm::MCSubtargetInfo &sti, const llvm::MCInstrInfo &mcii,
-               const llvm::MCRegisterInfo &mri,
-               const llvm::MCInstrAnalysis &mcia)
-      : STI(sti), MCII(mcii), MRI(mri), MCIA(mcia) {
-    computeProcResourceMasks(STI.getSchedModel(), ProcResourceMasks);
-  }
-
-  // Returns an array of processor resource masks.
-  // Masks are computed by function mca::computeProcResourceMasks. see
-  // Support.h for a description of how masks are computed and how masks can be
-  // used to solve set membership problems.
-  llvm::ArrayRef<uint64_t> getProcResourceMasks() const {
-    return ProcResourceMasks;
-  }
+  InstrBuilder(const llvm::MCSubtargetInfo &STI, const llvm::MCInstrInfo &MCII,
+               const llvm::MCRegisterInfo &RI, const llvm::MCInstrAnalysis &IA);
 
   void clear() { VariantDescriptors.shrink_and_clear(); }
 
diff --git a/tools/llvm-mca/lib/InstrBuilder.cpp b/tools/llvm-mca/lib/InstrBuilder.cpp
index 55f1ebf6e8a..3768c2e7088 100644
--- a/tools/llvm-mca/lib/InstrBuilder.cpp
+++ b/tools/llvm-mca/lib/InstrBuilder.cpp
@@ -26,6 +26,14 @@ namespace mca {
 
 using namespace llvm;
 
+InstrBuilder::InstrBuilder(const llvm::MCSubtargetInfo &sti,
+                           const llvm::MCInstrInfo &mcii,
+                           const llvm::MCRegisterInfo &mri,
+                           const llvm::MCInstrAnalysis &mcia)
+    : STI(sti), MCII(mcii), MRI(mri), MCIA(mcia) {
+  computeProcResourceMasks(STI.getSchedModel(), ProcResourceMasks);
+}
+
 static void initializeUsedResources(InstrDesc &ID,
                                     const MCSchedClassDesc &SCDesc,
                                     const MCSubtargetInfo &STI,
-- 
GitLab


From 6f98ad093134e6304d4db65564911697784132f1 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 25 Oct 2018 12:42:10 +0000
Subject: [PATCH 0564/1116] [CostModel][X86] Add realistic i64 uitofp f64
 scalar costs

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345261 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86TargetTransformInfo.cpp   |   5 +
 test/Analysis/CostModel/X86/uitofp.ll       |   6 +-
 test/Transforms/SLPVectorizer/X86/uitofp.ll | 116 ++++++++++++--------
 3 files changed, 81 insertions(+), 46 deletions(-)

diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index b77ac5c9953..2da069e44ed 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1291,6 +1291,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64, 12 },
     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i64, 26 },
 
+    { ISD::UINT_TO_FP,  MVT::f64,    MVT::i64,    1 },
+
     { ISD::FP_TO_UINT,  MVT::v2i32,  MVT::v2f32,  1 },
     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  1 },
     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f64,  1 },
@@ -1444,6 +1446,7 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  3 },
     { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32, 6 },
 
+    { ISD::UINT_TO_FP,  MVT::f64,    MVT::i64,    4 },
   };
 
   static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
@@ -1470,6 +1473,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
 
     { ISD::FP_TO_SINT,  MVT::v2i32,  MVT::v2f64,  3 },
 
+    { ISD::UINT_TO_FP,  MVT::f64,    MVT::i64,    6 },
+
     { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i8,   1 },
     { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i8,   6 },
     { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i8,   2 },
diff --git a/test/Analysis/CostModel/X86/uitofp.ll b/test/Analysis/CostModel/X86/uitofp.ll
index 9b8bd082923..621975509f2 100644
--- a/test/Analysis/CostModel/X86/uitofp.ll
+++ b/test/Analysis/CostModel/X86/uitofp.ll
@@ -120,14 +120,14 @@ define i32 @uitofp_i32_double() {
 
 define i32 @uitofp_i64_double() {
 ; SSE-LABEL: 'uitofp_i64_double'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f64 = uitofp i64 undef to double
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_i64_f64 = uitofp i64 undef to double
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cvt_v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX-LABEL: 'uitofp_i64_double'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f64 = uitofp i64 undef to double
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_i64_f64 = uitofp i64 undef to double
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cvt_v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
@@ -148,7 +148,7 @@ define i32 @uitofp_i64_double() {
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'uitofp_i64_double'
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f64 = uitofp i64 undef to double
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_i64_f64 = uitofp i64 undef to double
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cvt_v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
diff --git a/test/Transforms/SLPVectorizer/X86/uitofp.ll b/test/Transforms/SLPVectorizer/X86/uitofp.ll
index ff63fe35bdd..3356f560f35 100644
--- a/test/Transforms/SLPVectorizer/X86/uitofp.ll
+++ b/test/Transforms/SLPVectorizer/X86/uitofp.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=XOP
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX2
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -69,20 +69,32 @@ define void @uitofp_4i64_4f64() #0 {
 ; SSE-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
 ; SSE-NEXT:    ret void
 ;
-; AVX256-LABEL: @uitofp_4i64_4f64(
-; AVX256-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
-; AVX256-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
-; AVX256-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
-; AVX256-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
-; AVX256-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to double
-; AVX256-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to double
-; AVX256-NEXT:    [[CVT2:%.*]] = uitofp i64 [[LD2]] to double
-; AVX256-NEXT:    [[CVT3:%.*]] = uitofp i64 [[LD3]] to double
-; AVX256-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; AVX256-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; AVX256-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
-; AVX256-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
-; AVX256-NEXT:    ret void
+; AVX1-LABEL: @uitofp_4i64_4f64(
+; AVX1-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX1-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX1-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; AVX1-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; AVX1-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to double
+; AVX1-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to double
+; AVX1-NEXT:    [[CVT2:%.*]] = uitofp i64 [[LD2]] to double
+; AVX1-NEXT:    [[CVT3:%.*]] = uitofp i64 [[LD3]] to double
+; AVX1-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; AVX1-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; AVX1-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; AVX1-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; AVX1-NEXT:    ret void
+;
+; XOP-LABEL: @uitofp_4i64_4f64(
+; XOP-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
+; XOP-NEXT:    [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x double>
+; XOP-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; XOP-NEXT:    ret void
+;
+; AVX2-LABEL: @uitofp_4i64_4f64(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
+; AVX2-NEXT:    [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x double>
+; AVX2-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; AVX2-NEXT:    ret void
 ;
 ; AVX512-LABEL: @uitofp_4i64_4f64(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
@@ -133,32 +145,50 @@ define void @uitofp_8i64_8f64() #0 {
 ; SSE-NEXT:    store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
 ; SSE-NEXT:    ret void
 ;
-; AVX256-LABEL: @uitofp_8i64_8f64(
-; AVX256-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
-; AVX256-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
-; AVX256-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
-; AVX256-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
-; AVX256-NEXT:    [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
-; AVX256-NEXT:    [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
-; AVX256-NEXT:    [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
-; AVX256-NEXT:    [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
-; AVX256-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to double
-; AVX256-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to double
-; AVX256-NEXT:    [[CVT2:%.*]] = uitofp i64 [[LD2]] to double
-; AVX256-NEXT:    [[CVT3:%.*]] = uitofp i64 [[LD3]] to double
-; AVX256-NEXT:    [[CVT4:%.*]] = uitofp i64 [[LD4]] to double
-; AVX256-NEXT:    [[CVT5:%.*]] = uitofp i64 [[LD5]] to double
-; AVX256-NEXT:    [[CVT6:%.*]] = uitofp i64 [[LD6]] to double
-; AVX256-NEXT:    [[CVT7:%.*]] = uitofp i64 [[LD7]] to double
-; AVX256-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; AVX256-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; AVX256-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
-; AVX256-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
-; AVX256-NEXT:    store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
-; AVX256-NEXT:    store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
-; AVX256-NEXT:    store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
-; AVX256-NEXT:    store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
-; AVX256-NEXT:    ret void
+; AVX1-LABEL: @uitofp_8i64_8f64(
+; AVX1-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX1-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX1-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; AVX1-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; AVX1-NEXT:    [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
+; AVX1-NEXT:    [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
+; AVX1-NEXT:    [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
+; AVX1-NEXT:    [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
+; AVX1-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to double
+; AVX1-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to double
+; AVX1-NEXT:    [[CVT2:%.*]] = uitofp i64 [[LD2]] to double
+; AVX1-NEXT:    [[CVT3:%.*]] = uitofp i64 [[LD3]] to double
+; AVX1-NEXT:    [[CVT4:%.*]] = uitofp i64 [[LD4]] to double
+; AVX1-NEXT:    [[CVT5:%.*]] = uitofp i64 [[LD5]] to double
+; AVX1-NEXT:    [[CVT6:%.*]] = uitofp i64 [[LD6]] to double
+; AVX1-NEXT:    [[CVT7:%.*]] = uitofp i64 [[LD7]] to double
+; AVX1-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; AVX1-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; AVX1-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; AVX1-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; AVX1-NEXT:    store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
+; AVX1-NEXT:    store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+; AVX1-NEXT:    store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
+; AVX1-NEXT:    store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+; AVX1-NEXT:    ret void
+;
+; XOP-LABEL: @uitofp_8i64_8f64(
+; XOP-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
+; XOP-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32
+; XOP-NEXT:    [[TMP3:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x double>
+; XOP-NEXT:    [[TMP4:%.*]] = uitofp <4 x i64> [[TMP2]] to <4 x double>
+; XOP-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; XOP-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
+; XOP-NEXT:    ret void
+;
+; AVX2-LABEL: @uitofp_8i64_8f64(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
+; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32
+; AVX2-NEXT:    [[TMP3:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x double>
+; AVX2-NEXT:    [[TMP4:%.*]] = uitofp <4 x i64> [[TMP2]] to <4 x double>
+; AVX2-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; AVX2-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
+; AVX2-NEXT:    ret void
 ;
 ; AVX512-LABEL: @uitofp_8i64_8f64(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
-- 
GitLab


From a9f0942cd976774c52ffc0f075381965fb903fa7 Mon Sep 17 00:00:00 2001
From: Alex Bradbury <asb@lowrisc.org>
Date: Thu, 25 Oct 2018 12:45:20 +0000
Subject: [PATCH 0565/1116] [RISCV] Use PatFrags for variable shift patterns

This follows SystemZ and I think is cleaner vs the multiclass.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345262 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/RISCV/RISCVInstrInfo.td | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/lib/Target/RISCV/RISCVInstrInfo.td b/lib/Target/RISCV/RISCVInstrInfo.td
index 50012569a74..631a1f7deca 100644
--- a/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/lib/Target/RISCV/RISCVInstrInfo.td
@@ -206,7 +206,7 @@ def ixlenimm : Operand<XLenVT> {
 def simm32     : ImmLeaf<XLenVT, [{return isInt<32>(Imm);}]>;
 def simm32hi20 : ImmLeaf<XLenVT, [{return isShiftedInt<20, 12>(Imm);}]>;
 // A mask value that won't affect significant shift bits.
-def immshiftxlen : ImmLeaf<XLenVT, [{
+def immbottomxlenset : ImmLeaf<XLenVT, [{
   if (Subtarget->is64Bit())
     return countTrailingOnes<uint64_t>(Imm) >= 6;
   return countTrailingOnes<uint64_t>(Imm) >= 5;
@@ -660,15 +660,14 @@ def : PatGprUimmLog2XLen<sra, SRAI>;
 // typically introduced when the legalizer promotes the shift amount and
 // zero-extends it). For RISC-V, the mask is unnecessary as shifts in the base
 // ISA only read the least significant 5 bits (RV32I) or 6 bits (RV64I).
-multiclass VarShiftXLenPat<PatFrag ShiftOp, RVInst Inst> {
-  def : Pat<(ShiftOp GPR:$rs1, GPR:$rs2), (Inst GPR:$rs1, GPR:$rs2)>;
-  def : Pat<(ShiftOp GPR:$rs1, (and GPR:$rs2, immshiftxlen)),
-            (Inst GPR:$rs1, GPR:$rs2)>;
-}
-
-defm : VarShiftXLenPat<shl, SLL>;
-defm : VarShiftXLenPat<srl, SRL>;
-defm : VarShiftXLenPat<sra, SRA>;
+class shiftop<SDPatternOperator operator>
+    : PatFrags<(ops node:$val, node:$count),
+               [(operator node:$val, node:$count),
+                (operator node:$val, (and node:$count, immbottomxlenset))]>;
+
+def : PatGprGpr<shiftop<shl>, SLL>;
+def : PatGprGpr<shiftop<srl>, SRL>;
+def : PatGprGpr<shiftop<sra>, SRA>;
 
 /// FrameIndex calculations
 
-- 
GitLab


From 8a10b6b077a6cb9f95bd0d9781bc5936d30735cf Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 25 Oct 2018 13:06:20 +0000
Subject: [PATCH 0566/1116] [CostModel][X86] Add realistic vXi64 uitofp vXf64
 costs

Match codegen improvements from D53649/rL345256

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345263 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86TargetTransformInfo.cpp     |  13 +-
 test/Analysis/CostModel/X86/uitofp.ll         |  28 +--
 .../X86/uint64_to_fp64-cost-model.ll          |   5 +-
 test/Transforms/SLPVectorizer/X86/uitofp.ll   | 177 ++++--------------
 4 files changed, 62 insertions(+), 161 deletions(-)

diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 2da069e44ed..8d8bc0b35cb 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1264,8 +1264,6 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i16, 2 },
     { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i32, 1 },
     { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  1 },
-    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i64, 26 },
-    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i64, 26 },
 
     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i1,   4 },
     { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i1,  3 },
@@ -1287,9 +1285,10 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  1 },
     { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i32, 1 },
     { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  5 },
+    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i64, 26 },
     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  5 },
-    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64, 12 },
-    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i64, 26 },
+    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  5 },
+    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i64,  5 },
 
     { ISD::UINT_TO_FP,  MVT::f64,    MVT::i64,    1 },
 
@@ -1387,13 +1386,13 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i32, 6 },
     { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i32, 6 },
     { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i32, 9 },
+    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i64, 5 },
+    { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i64, 6 },
     // The generic code to compute the scalar overhead is currently broken.
     // Workaround this limitation by estimating the scalarization overhead
     // here. We have roughly 10 instructions per scalar element.
     // Multiply that by the vector width.
     // FIXME: remove that when PR19268 is fixed.
-    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i64, 10 },
-    { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i64, 20 },
     { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i64, 13 },
     { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i64, 13 },
 
@@ -1468,7 +1467,7 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 },
-    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 6 },
     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
 
     { ISD::FP_TO_SINT,  MVT::v2i32,  MVT::v2f64,  3 },
diff --git a/test/Analysis/CostModel/X86/uitofp.ll b/test/Analysis/CostModel/X86/uitofp.ll
index 621975509f2..c76ac14a685 100644
--- a/test/Analysis/CostModel/X86/uitofp.ll
+++ b/test/Analysis/CostModel/X86/uitofp.ll
@@ -13,7 +13,7 @@
 define i32 @uitofp_i8_double() {
 ; SSE-LABEL: 'uitofp_i8_double'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f64 = uitofp i8 undef to double
-; SSE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cvt_v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %cvt_v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %cvt_v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
@@ -49,7 +49,7 @@ define i32 @uitofp_i8_double() {
 define i32 @uitofp_i16_double() {
 ; SSE-LABEL: 'uitofp_i16_double'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f64 = uitofp i16 undef to double
-; SSE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cvt_v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %cvt_v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %cvt_v8i16_v8f64 = uitofp <8 x i16> undef to <8 x double>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
@@ -85,7 +85,7 @@ define i32 @uitofp_i16_double() {
 define i32 @uitofp_i32_double() {
 ; SSE-LABEL: 'uitofp_i32_double'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f64 = uitofp i32 undef to double
-; SSE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cvt_v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %cvt_v4i32_v4f64 = uitofp <4 x i32> undef to <4 x double>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %cvt_v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
@@ -121,23 +121,23 @@ define i32 @uitofp_i32_double() {
 define i32 @uitofp_i64_double() {
 ; SSE-LABEL: 'uitofp_i64_double'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_i64_f64 = uitofp i64 undef to double
-; SSE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cvt_v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX-LABEL: 'uitofp_i64_double'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_i64_f64 = uitofp i64 undef to double
-; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cvt_v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cvt_v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'uitofp_i64_double'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f64 = uitofp i64 undef to double
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cvt_v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'uitofp_i64_double'
@@ -149,9 +149,9 @@ define i32 @uitofp_i64_double() {
 ;
 ; BTVER2-LABEL: 'uitofp_i64_double'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_i64_f64 = uitofp i64 undef to double
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cvt_v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cvt_v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %cvt_i64_f64 = uitofp i64 undef to double
diff --git a/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll b/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll
index 387eec4d5ed..e08ef002d0e 100644
--- a/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll
+++ b/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll
@@ -5,8 +5,9 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 target triple = "x86_64-apple-macosx10.8.0"
 
 
-; CHECK: cost of 10 for VF 2 For instruction:   %conv = uitofp i64 %tmp to double
-; CHECK: cost of 20 for VF 4 For instruction:   %conv = uitofp i64 %tmp to double
+; CHECK: cost of 4 for VF 1 For instruction:   %conv = uitofp i64 %tmp to double
+; CHECK: cost of 5 for VF 2 For instruction:   %conv = uitofp i64 %tmp to double
+; CHECK: cost of 6 for VF 4 For instruction:   %conv = uitofp i64 %tmp to double
 define void @uint64_to_double_cost(i64* noalias nocapture %a, double* noalias nocapture readonly %b) nounwind {
 entry:
   br label %for.body
diff --git a/test/Transforms/SLPVectorizer/X86/uitofp.ll b/test/Transforms/SLPVectorizer/X86/uitofp.ll
index 3356f560f35..65218409423 100644
--- a/test/Transforms/SLPVectorizer/X86/uitofp.ll
+++ b/test/Transforms/SLPVectorizer/X86/uitofp.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX1
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=XOP
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -20,29 +20,11 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ;
 
 define void @uitofp_2i64_2f64() #0 {
-; SSE-LABEL: @uitofp_2i64_2f64(
-; SSE-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
-; SSE-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
-; SSE-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to double
-; SSE-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to double
-; SSE-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; SSE-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; SSE-NEXT:    ret void
-;
-; AVX256-LABEL: @uitofp_2i64_2f64(
-; AVX256-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
-; AVX256-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
-; AVX256-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to double
-; AVX256-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to double
-; AVX256-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; AVX256-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; AVX256-NEXT:    ret void
-;
-; AVX512-LABEL: @uitofp_2i64_2f64(
-; AVX512-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
-; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x double>
-; AVX512-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
-; AVX512-NEXT:    ret void
+; CHECK-LABEL: @uitofp_2i64_2f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
+; CHECK-NEXT:    [[TMP2:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x double>
+; CHECK-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; CHECK-NEXT:    ret void
 ;
   %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
   %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
@@ -55,52 +37,19 @@ define void @uitofp_2i64_2f64() #0 {
 
 define void @uitofp_4i64_4f64() #0 {
 ; SSE-LABEL: @uitofp_4i64_4f64(
-; SSE-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
-; SSE-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
-; SSE-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
-; SSE-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
-; SSE-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to double
-; SSE-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to double
-; SSE-NEXT:    [[CVT2:%.*]] = uitofp i64 [[LD2]] to double
-; SSE-NEXT:    [[CVT3:%.*]] = uitofp i64 [[LD3]] to double
-; SSE-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; SSE-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; SSE-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
-; SSE-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2) to <2 x i64>*), align 16
+; SSE-NEXT:    [[TMP3:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x double>
+; SSE-NEXT:    [[TMP4:%.*]] = uitofp <2 x i64> [[TMP2]] to <2 x double>
+; SSE-NEXT:    store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; SSE-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
 ; SSE-NEXT:    ret void
 ;
-; AVX1-LABEL: @uitofp_4i64_4f64(
-; AVX1-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
-; AVX1-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
-; AVX1-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
-; AVX1-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
-; AVX1-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to double
-; AVX1-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to double
-; AVX1-NEXT:    [[CVT2:%.*]] = uitofp i64 [[LD2]] to double
-; AVX1-NEXT:    [[CVT3:%.*]] = uitofp i64 [[LD3]] to double
-; AVX1-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; AVX1-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; AVX1-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
-; AVX1-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
-; AVX1-NEXT:    ret void
-;
-; XOP-LABEL: @uitofp_4i64_4f64(
-; XOP-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
-; XOP-NEXT:    [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x double>
-; XOP-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
-; XOP-NEXT:    ret void
-;
-; AVX2-LABEL: @uitofp_4i64_4f64(
-; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
-; AVX2-NEXT:    [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x double>
-; AVX2-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
-; AVX2-NEXT:    ret void
-;
-; AVX512-LABEL: @uitofp_4i64_4f64(
-; AVX512-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
-; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x double>
-; AVX512-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
-; AVX512-NEXT:    ret void
+; AVX-LABEL: @uitofp_4i64_4f64(
+; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
+; AVX-NEXT:    [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x double>
+; AVX-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; AVX-NEXT:    ret void
 ;
   %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
   %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
@@ -119,76 +68,28 @@ define void @uitofp_4i64_4f64() #0 {
 
 define void @uitofp_8i64_8f64() #0 {
 ; SSE-LABEL: @uitofp_8i64_8f64(
-; SSE-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
-; SSE-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
-; SSE-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
-; SSE-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
-; SSE-NEXT:    [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
-; SSE-NEXT:    [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
-; SSE-NEXT:    [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
-; SSE-NEXT:    [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
-; SSE-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to double
-; SSE-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to double
-; SSE-NEXT:    [[CVT2:%.*]] = uitofp i64 [[LD2]] to double
-; SSE-NEXT:    [[CVT3:%.*]] = uitofp i64 [[LD3]] to double
-; SSE-NEXT:    [[CVT4:%.*]] = uitofp i64 [[LD4]] to double
-; SSE-NEXT:    [[CVT5:%.*]] = uitofp i64 [[LD5]] to double
-; SSE-NEXT:    [[CVT6:%.*]] = uitofp i64 [[LD6]] to double
-; SSE-NEXT:    [[CVT7:%.*]] = uitofp i64 [[LD7]] to double
-; SSE-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; SSE-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; SSE-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
-; SSE-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
-; SSE-NEXT:    store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
-; SSE-NEXT:    store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
-; SSE-NEXT:    store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
-; SSE-NEXT:    store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2) to <2 x i64>*), align 16
+; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <2 x i64>*), align 32
+; SSE-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6) to <2 x i64>*), align 16
+; SSE-NEXT:    [[TMP5:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x double>
+; SSE-NEXT:    [[TMP6:%.*]] = uitofp <2 x i64> [[TMP2]] to <2 x double>
+; SSE-NEXT:    [[TMP7:%.*]] = uitofp <2 x i64> [[TMP3]] to <2 x double>
+; SSE-NEXT:    [[TMP8:%.*]] = uitofp <2 x i64> [[TMP4]] to <2 x double>
+; SSE-NEXT:    store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; SSE-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
+; SSE-NEXT:    store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32
+; SSE-NEXT:    store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16
 ; SSE-NEXT:    ret void
 ;
-; AVX1-LABEL: @uitofp_8i64_8f64(
-; AVX1-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
-; AVX1-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
-; AVX1-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
-; AVX1-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
-; AVX1-NEXT:    [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
-; AVX1-NEXT:    [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
-; AVX1-NEXT:    [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
-; AVX1-NEXT:    [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
-; AVX1-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to double
-; AVX1-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to double
-; AVX1-NEXT:    [[CVT2:%.*]] = uitofp i64 [[LD2]] to double
-; AVX1-NEXT:    [[CVT3:%.*]] = uitofp i64 [[LD3]] to double
-; AVX1-NEXT:    [[CVT4:%.*]] = uitofp i64 [[LD4]] to double
-; AVX1-NEXT:    [[CVT5:%.*]] = uitofp i64 [[LD5]] to double
-; AVX1-NEXT:    [[CVT6:%.*]] = uitofp i64 [[LD6]] to double
-; AVX1-NEXT:    [[CVT7:%.*]] = uitofp i64 [[LD7]] to double
-; AVX1-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; AVX1-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; AVX1-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
-; AVX1-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
-; AVX1-NEXT:    store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
-; AVX1-NEXT:    store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
-; AVX1-NEXT:    store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
-; AVX1-NEXT:    store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
-; AVX1-NEXT:    ret void
-;
-; XOP-LABEL: @uitofp_8i64_8f64(
-; XOP-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
-; XOP-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32
-; XOP-NEXT:    [[TMP3:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x double>
-; XOP-NEXT:    [[TMP4:%.*]] = uitofp <4 x i64> [[TMP2]] to <4 x double>
-; XOP-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
-; XOP-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
-; XOP-NEXT:    ret void
-;
-; AVX2-LABEL: @uitofp_8i64_8f64(
-; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
-; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32
-; AVX2-NEXT:    [[TMP3:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x double>
-; AVX2-NEXT:    [[TMP4:%.*]] = uitofp <4 x i64> [[TMP2]] to <4 x double>
-; AVX2-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
-; AVX2-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
-; AVX2-NEXT:    ret void
+; AVX256-LABEL: @uitofp_8i64_8f64(
+; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
+; AVX256-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32
+; AVX256-NEXT:    [[TMP3:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x double>
+; AVX256-NEXT:    [[TMP4:%.*]] = uitofp <4 x i64> [[TMP2]] to <4 x double>
+; AVX256-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; AVX256-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
+; AVX256-NEXT:    ret void
 ;
 ; AVX512-LABEL: @uitofp_8i64_8f64(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
-- 
GitLab


From 5b13734c4d8c37a54708fc9d9ad28712cb1d5da9 Mon Sep 17 00:00:00 2001
From: Krasimir Georgiev <krasimir@google.com>
Date: Thu, 25 Oct 2018 13:38:07 +0000
Subject: [PATCH 0567/1116] IR: Optimize StructType::get to perform one hash
 lookup instead of two, NFCI

Summary:
This function was performing two hash lookups when a new struct type was requested: first checking if it exists and second to insert it. This patch updates the function to perform a single hash lookup in this case by updating the value in the hash table in-place in case the struct type was not there before.

Similar to r345151.

Reviewers: bkramer

Reviewed By: bkramer

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D53689

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345264 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/IR/Type.cpp | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/lib/IR/Type.cpp b/lib/IR/Type.cpp
index 4e7532cb268..0fb079c5ab7 100644
--- a/lib/IR/Type.cpp
+++ b/lib/IR/Type.cpp
@@ -342,18 +342,25 @@ bool FunctionType::isValidArgumentType(Type *ArgTy) {
 StructType *StructType::get(LLVMContext &Context, ArrayRef<Type*> ETypes,
                             bool isPacked) {
   LLVMContextImpl *pImpl = Context.pImpl;
-  AnonStructTypeKeyInfo::KeyTy Key(ETypes, isPacked);
-  auto I = pImpl->AnonStructTypes.find_as(Key);
-  StructType *ST;
+  const AnonStructTypeKeyInfo::KeyTy Key(ETypes, isPacked);
 
-  if (I == pImpl->AnonStructTypes.end()) {
-    // Value not found.  Create a new type!
+  StructType *ST;
+  // Since we only want to allocate a fresh struct type in case none is found
+  // and we don't want to perform two lookups (one for checking if existent and
+  // one for inserting the newly allocated one), here we instead lookup based on
+  // Key and update the reference to the struct type in-place to a newly
+  // allocated one if not found.
+  auto Insertion = pImpl->AnonStructTypes.insert_as(nullptr, Key);
+  if (Insertion.second) {
+    // The struct type was not found. Allocate one and update AnonStructTypes
+    // in-place.
     ST = new (Context.pImpl->TypeAllocator) StructType(Context);
     ST->setSubclassData(SCDB_IsLiteral);  // Literal struct.
     ST->setBody(ETypes, isPacked);
-    Context.pImpl->AnonStructTypes.insert(ST);
+    *Insertion.first = ST;
   } else {
-    ST = *I;
+    // The struct type was found. Just return it.
+    ST = *Insertion.first;
   }
 
   return ST;
-- 
GitLab


From 497bf4892d238fa411f9cca16d676901bd4079dd Mon Sep 17 00:00:00 2001
From: Amara Emerson <aemerson@apple.com>
Date: Thu, 25 Oct 2018 14:04:54 +0000
Subject: [PATCH 0568/1116] [GlobalISel] Use the target preferred type for
 G_EXTRACT_VECTOR_ELT index.

Allows for better imported pattern re-use.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345265 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../Target/GlobalISel/SelectionDAGCompat.td     |  1 +
 lib/CodeGen/GlobalISel/IRTranslator.cpp         | 17 ++++++++++++++++-
 lib/CodeGen/GlobalISel/LegalizerHelper.cpp      |  6 ++++++
 lib/Target/AArch64/AArch64LegalizerInfo.cpp     | 11 +++++++++++
 .../AArch64/GlobalISel/arm64-fallback.ll        |  2 +-
 .../AArch64/GlobalISel/arm64-irtranslator.ll    | 13 ++++++++++++-
 .../GlobalISel/legalizer-info-validation.mir    |  2 +-
 7 files changed, 48 insertions(+), 4 deletions(-)

diff --git a/include/llvm/Target/GlobalISel/SelectionDAGCompat.td b/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
index a3d310cfe1c..af26375802a 100644
--- a/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
+++ b/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
@@ -88,6 +88,7 @@ def : GINodeEquiv<G_CTTZ, cttz>;
 def : GINodeEquiv<G_CTLZ_ZERO_UNDEF, ctlz_zero_undef>;
 def : GINodeEquiv<G_CTTZ_ZERO_UNDEF, cttz_zero_undef>;
 def : GINodeEquiv<G_CTPOP, ctpop>;
+def : GINodeEquiv<G_EXTRACT_VECTOR_ELT, vector_extract>;
 
 // Broadly speaking G_LOAD is equivalent to ISD::LOAD but there are some
 // complications that tablegen must take care of. For example, Predicates such
diff --git a/lib/CodeGen/GlobalISel/IRTranslator.cpp b/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 709965ba151..ab7d3a87975 100644
--- a/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -1330,7 +1330,22 @@ bool IRTranslator::translateExtractElement(const User &U,
   }
   unsigned Res = getOrCreateVReg(U);
   unsigned Val = getOrCreateVReg(*U.getOperand(0));
-  unsigned Idx = getOrCreateVReg(*U.getOperand(1));
+  const auto &TLI = *MF->getSubtarget().getTargetLowering();
+  unsigned PreferredVecIdxWidth = TLI.getVectorIdxTy(*DL).getSizeInBits();
+  unsigned Idx = 0;
+  if (auto *CI = dyn_cast<ConstantInt>(U.getOperand(1))) {
+    if (CI->getBitWidth() != PreferredVecIdxWidth) {
+      APInt NewIdx = CI->getValue().sextOrTrunc(PreferredVecIdxWidth);
+      auto *NewIdxCI = ConstantInt::get(CI->getContext(), NewIdx);
+      Idx = getOrCreateVReg(*NewIdxCI);
+    }
+  }
+  if (!Idx)
+    Idx = getOrCreateVReg(*U.getOperand(1));
+  if (MRI->getType(Idx).getSizeInBits() != PreferredVecIdxWidth) {
+    const LLT &VecIdxTy = LLT::scalar(PreferredVecIdxWidth);
+    Idx = MIRBuilder.buildSExtOrTrunc(VecIdxTy, Idx)->getOperand(0).getReg();
+  }
   MIRBuilder.buildExtractVectorElement(Res, Val, Idx);
   return true;
 }
diff --git a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index ff2e61c03b4..c9ed97aa390 100644
--- a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -878,6 +878,12 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
     MIRBuilder.recordInsertion(&MI);
     return Legalized;
   }
+  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
+    if (TypeIdx != 2)
+      return UnableToLegalize;
+    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
+    MIRBuilder.recordInsertion(&MI);
+    return Legalized;
   }
 }
 
diff --git a/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/lib/Target/AArch64/AArch64LegalizerInfo.cpp
index 327c758a7f8..b3c2fbf2b15 100644
--- a/lib/Target/AArch64/AArch64LegalizerInfo.cpp
+++ b/lib/Target/AArch64/AArch64LegalizerInfo.cpp
@@ -385,6 +385,17 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
                          });
   }
 
+  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
+      .unsupportedIf([=](const LegalityQuery &Query) {
+        const LLT &EltTy = Query.Types[1].getElementType();
+        return Query.Types[0] != EltTy;
+      })
+      .minScalar(2, s64)
+      .legalIf([=](const LegalityQuery &Query) {
+        const LLT &VecTy = Query.Types[1];
+        return VecTy == v4s32 || VecTy == v2s64;
+      });
+
   computeTables();
   verify(*ST.getInstrInfo());
 }
diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll b/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
index e1defd71958..da3aa3c1009 100644
--- a/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
+++ b/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
@@ -141,7 +141,7 @@ define fp128 @test_quad_dump() {
   ret fp128 0xL00000000000000004000000000000000
 }
 
-; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: %2:_(p0) = G_EXTRACT_VECTOR_ELT %0:_(<2 x p0>), %3:_(s32) (in function: vector_of_pointers_extractelement)
+; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: %2:_(p0) = G_EXTRACT_VECTOR_ELT %0:_(<2 x p0>), %3:_(s64) (in function: vector_of_pointers_extractelement)
 ; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for vector_of_pointers_extractelement
 ; FALLBACK-WITH-REPORT-OUT-LABEL: vector_of_pointers_extractelement:
 @var = global <2 x i16*> zeroinitializer
diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll b/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
index a021eeda353..2997c5350eb 100644
--- a/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
+++ b/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
@@ -1530,12 +1530,23 @@ define i32 @test_extractelement(<2 x i32> %vec, i32 %idx) {
 ; CHECK-LABEL: name: test_extractelement
 ; CHECK: [[VEC:%[0-9]+]]:_(<2 x s32>) = COPY $d0
 ; CHECK: [[IDX:%[0-9]+]]:_(s32) = COPY $w0
-; CHECK: [[RES:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[VEC]](<2 x s32>), [[IDX]](s32)
+; CHECK: [[IDXEXT:%[0-9]+]]:_(s64) = G_SEXT [[IDX]]
+; CHECK: [[RES:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[VEC]](<2 x s32>), [[IDXEXT]](s64)
 ; CHECK: $w0 = COPY [[RES]](s32)
   %res = extractelement <2 x i32> %vec, i32 %idx
   ret i32 %res
 }
 
+define i32 @test_extractelement_const_idx(<2 x i32> %vec) {
+; CHECK-LABEL: name: test_extractelement
+; CHECK: [[VEC:%[0-9]+]]:_(<2 x s32>) = COPY $d0
+; CHECK: [[IDX:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+; CHECK: [[RES:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[VEC]](<2 x s32>), [[IDX]](s64)
+; CHECK: $w0 = COPY [[RES]](s32)
+  %res = extractelement <2 x i32> %vec, i32 1
+  ret i32 %res
+}
+
 define i32 @test_singleelementvector(i32 %elt){
 ; CHECK-LABEL: name: test_singleelementvector
 ; CHECK: [[ELT:%[0-9]+]]:_(s32) = COPY $w0
diff --git a/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
index f776ca6df31..ca059cf1544 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -295,7 +295,7 @@
 # DEBUG:      .. type index coverage check SKIPPED: no rules defined
 #
 # DEBUG-NEXT: G_EXTRACT_VECTOR_ELT (opcode {{[0-9]+}}): 3 type indices
-# DEBUG:      .. type index coverage check SKIPPED: no rules defined
+# DEBUG:      .. type index coverage check SKIPPED: user-defined predicate detected
 #
 # DEBUG-NEXT: G_SHUFFLE_VECTOR (opcode {{[0-9]+}}): 3 type indices
 # DEBUG:      .. type index coverage check SKIPPED: no rules defined
-- 
GitLab


From c00e256ef2cba1db0713b9d2753b259298fc5b1f Mon Sep 17 00:00:00 2001
From: Francis Visoiu Mistrih <francisvm@yahoo.com>
Date: Thu, 25 Oct 2018 14:11:07 +0000
Subject: [PATCH 0569/1116] [X86] Fix llc invocation on MIR test case

The current state of the llc invocation is:

* Running all the passes from dwarfehprepare to stack coloring
(included)
* It runs it from the LLVM IR included in the file
* It *ADDS* the generated MI from ISel to the MI in the MIR file
* The machine verifier doesn't like it.

Differential Revision: https://reviews.llvm.org/D53698

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345266 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/PR37310.mir | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/CodeGen/X86/PR37310.mir b/test/CodeGen/X86/PR37310.mir
index 37f400de2e7..6f09a8987eb 100644
--- a/test/CodeGen/X86/PR37310.mir
+++ b/test/CodeGen/X86/PR37310.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -start-before dwarfehprepare -no-stack-coloring=false -stop-after stack-coloring -o - %s
+# RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -no-stack-coloring=false -run-pass stack-coloring -o - %s
 
 # Test to insure that the liveness analysis in the StackColoring
 # pass gracefully handles statically unreachable blocks. See PR 37310.
-- 
GitLab


From 1b6f74f7adcbf0a84a45ff45152f23d3ed41ba9c Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev@hotmail.com>
Date: Thu, 25 Oct 2018 14:27:27 +0000
Subject: [PATCH 0570/1116] [DEBUG_INFO][NVPTX]Fix processing of DBG_VALUES.

Summary:
If the instruction in the eliminateFrameIndex function is a DBG_VALUE
instruction, it requires special processing. The frame register is set
to VRFrame and the offset is based on the object offset.
The code is similar to the code used in
lib/CodeGen/PrologEpilogInserter.cpp.

Reviewers: tra

Subscribers: jholewinski, llvm-commits

Differential Revision: https://reviews.llvm.org/D53657

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345269 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp    | 19 +++++
 test/DebugInfo/NVPTX/dbg-value-const-byref.ll | 81 +++++++++++++++++++
 2 files changed, 100 insertions(+)
 create mode 100644 test/DebugInfo/NVPTX/dbg-value-const-byref.ll

diff --git a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
index 5bb4fc3edd0..2ca0ccf2dfa 100644
--- a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
+++ b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
@@ -20,6 +20,7 @@
 #include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
@@ -60,6 +61,24 @@ bool NVPTXPrologEpilogPass::runOnMachineFunction(MachineFunction &MF) {
       for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
         if (!MI.getOperand(i).isFI())
           continue;
+
+        // Frame indices in debug values are encoded in a target independent
+        // way with simply the frame index and offset rather than any
+        // target-specific addressing mode.
+        if (MI.isDebugValue()) {
+          assert(i == 0 && "Frame indices can only appear as the first "
+                           "operand of a DBG_VALUE machine instruction");
+          unsigned Reg;
+          int64_t Offset =
+              TFI.getFrameIndexReference(MF, MI.getOperand(0).getIndex(), Reg);
+          MI.getOperand(0).ChangeToRegister(Reg, /*isDef=*/false);
+          MI.getOperand(0).setIsDebug();
+          auto *DIExpr = DIExpression::prepend(MI.getDebugExpression(),
+                                               DIExpression::NoDeref, Offset);
+          MI.getOperand(3).setMetadata(DIExpr);
+          continue;
+        }
+
         TRI.eliminateFrameIndex(MI, 0, i, nullptr);
         Modified = true;
       }
diff --git a/test/DebugInfo/NVPTX/dbg-value-const-byref.ll b/test/DebugInfo/NVPTX/dbg-value-const-byref.ll
new file mode 100644
index 00000000000..b120a406454
--- /dev/null
+++ b/test/DebugInfo/NVPTX/dbg-value-const-byref.ll
@@ -0,0 +1,81 @@
+; RUN: llc -mtriple=nvptx64-nvidia-cuda < %s | FileCheck %s
+; Generated with -O1 from:
+; int f1();
+; void f2(int*);
+; int f3(int);
+;
+; int foo() {
+;   int i = 3;
+;   f3(i);
+;   i = 7;
+;   i = f1();
+;   f2(&i);
+;   return 0;
+; }
+;
+; Test that we generate valid debug info for optimized code,
+; particularly variables that are described as constants and passed
+; by reference.
+;
+; CHECK: DEBUG_VALUE: foo:i <- [DW_OP_deref] $vrdepot
+; CHECK: DEBUG_VALUE: foo:i <- 3
+; CHECK: DEBUG_VALUE: foo:i <- 7
+; CHECK: DEBUG_VALUE: foo:i <- %
+
+; Function Attrs: nounwind ssp uwtable
+define i32 @foo() #0 !dbg !4 {
+entry:
+  %i = alloca i32, align 4
+  call void @llvm.dbg.value(metadata i32 3, metadata !10, metadata !DIExpression()), !dbg !15
+  %call = call i32 @f3(i32 3) #3, !dbg !16
+  call void @llvm.dbg.value(metadata i32 7, metadata !10, metadata !DIExpression()), !dbg !18
+  %call1 = call i32 (...) @f1() #3, !dbg !19
+  call void @llvm.dbg.value(metadata i32 %call1, metadata !10, metadata !DIExpression()), !dbg !19
+  store i32 %call1, i32* %i, align 4, !dbg !19, !tbaa !20
+  call void @llvm.dbg.value(metadata i32* %i, metadata !10, metadata !DIExpression(DW_OP_deref)), !dbg !24
+  call void @f2(i32* %i) #3, !dbg !24
+  ret i32 0, !dbg !25
+}
+
+declare i32 @f3(i32)
+
+declare i32 @f1(...)
+
+declare void @f2(i32*)
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, metadata, metadata) #2
+
+attributes #0 = { nounwind ssp uwtable }
+attributes #2 = { nounwind readnone }
+attributes #3 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!11, !12}
+!llvm.ident = !{!13}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.5.0 ", isOptimized: true, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!1 = !DIFile(filename: "dbg-value-const-byref.c", directory: "")
+!2 = !{}
+!4 = distinct !DISubprogram(name: "foo", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, unit: !0, scopeLine: 5, file: !1, scope: !5, type: !6, retainedNodes: !9)
+!5 = !DIFile(filename: "dbg-value-const-byref.c", directory: "")
+!6 = !DISubroutineType(types: !7)
+!7 = !{!8}
+!8 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!9 = !{!10}
+!10 = !DILocalVariable(name: "i", line: 6, scope: !4, file: !5, type: !8)
+!11 = !{i32 2, !"Dwarf Version", i32 2}
+!12 = !{i32 1, !"Debug Info Version", i32 3}
+!13 = !{!"clang version 3.5.0 "}
+!14 = !{i32 3}
+!15 = !DILocation(line: 6, scope: !4)
+!16 = !DILocation(line: 7, scope: !4)
+!17 = !{i32 7}
+!18 = !DILocation(line: 8, scope: !4)
+!19 = !DILocation(line: 9, scope: !4)
+!20 = !{!21, !21, i64 0}
+!21 = !{!"int", !22, i64 0}
+!22 = !{!"omnipotent char", !23, i64 0}
+!23 = !{!"Simple C/C++ TBAA"}
+!24 = !DILocation(line: 10, scope: !4)
+!25 = !DILocation(line: 11, scope: !4)
-- 
GitLab


From b79b03ee2a83e70fe4f18a5019f4838ad7e69675 Mon Sep 17 00:00:00 2001
From: John Brawn <john.brawn@arm.com>
Date: Thu, 25 Oct 2018 14:56:48 +0000
Subject: [PATCH 0571/1116] [AArch64] Do 64-bit vector move of 0 and -1 by
 extracting from the 128-bit move

Currently a vector move of 0 or -1 will use different instructions depending on
the size of the vector. Using a single instruction (the 128-bit one) for both
gives more opportunity for Machine CSE to eliminate instructions.

Differential Revision: https://reviews.llvm.org/D53579


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345270 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64InstrInfo.td        | 22 +++++++++--------
 test/CodeGen/AArch64/aarch64-be-bv.ll         |  2 +-
 .../AArch64/aarch64-smax-constantfold.ll      |  2 +-
 .../arm64-neon-compare-instructions.ll        |  6 ++---
 test/CodeGen/AArch64/arm64-neon-copy.ll       |  4 ++--
 test/CodeGen/AArch64/arm64-vector-ext.ll      |  2 +-
 test/CodeGen/AArch64/arm64-vshuffle.ll        |  2 +-
 .../AArch64/arm64-zero-cycle-zeroing.ll       |  8 +++----
 test/CodeGen/AArch64/bitcast.ll               |  4 ++--
 test/CodeGen/AArch64/fast-isel-cmp-vec.ll     |  4 ++--
 test/CodeGen/AArch64/fold-constants.ll        |  2 +-
 test/CodeGen/AArch64/machine_cse.ll           | 24 +++++++++++++++++++
 .../AArch64/neon-compare-instructions.ll      |  6 ++---
 test/CodeGen/AArch64/selectiondag-order.ll    |  6 ++---
 14 files changed, 60 insertions(+), 34 deletions(-)

diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index 88e5632fbe6..76ea2acef6a 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -4920,16 +4920,6 @@ def MOVID      : SIMDModifiedImmScalarNoShift<0, 1, 0b1110, "movi",
 def : Pat<(f64 (AArch64movi_edit imm0_255:$shift)),
           (MOVID imm0_255:$shift)>;
 
-def : Pat<(v1i64 immAllZerosV), (MOVID (i32 0))>;
-def : Pat<(v2i32 immAllZerosV), (MOVID (i32 0))>;
-def : Pat<(v4i16 immAllZerosV), (MOVID (i32 0))>;
-def : Pat<(v8i8  immAllZerosV), (MOVID (i32 0))>;
-
-def : Pat<(v1i64 immAllOnesV), (MOVID (i32 255))>;
-def : Pat<(v2i32 immAllOnesV), (MOVID (i32 255))>;
-def : Pat<(v4i16 immAllOnesV), (MOVID (i32 255))>;
-def : Pat<(v8i8  immAllOnesV), (MOVID (i32 255))>;
-
 // EDIT byte mask: 2d
 
 // The movi_edit node has the immediate value already encoded, so we use
@@ -4950,6 +4940,18 @@ def : Pat<(v4i32 immAllOnesV), (MOVIv2d_ns (i32 255))>;
 def : Pat<(v8i16 immAllOnesV), (MOVIv2d_ns (i32 255))>;
 def : Pat<(v16i8 immAllOnesV), (MOVIv2d_ns (i32 255))>;
 
+// Set 64-bit vectors to all 0/1 by extracting from a 128-bit register as the
+// extract is free and this gives better MachineCSE results.
+def : Pat<(v1i64 immAllZerosV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub)>;
+def : Pat<(v2i32 immAllZerosV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub)>;
+def : Pat<(v4i16 immAllZerosV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub)>;
+def : Pat<(v8i8  immAllZerosV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub)>;
+
+def : Pat<(v1i64 immAllOnesV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 255)), dsub)>;
+def : Pat<(v2i32 immAllOnesV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 255)), dsub)>;
+def : Pat<(v4i16 immAllOnesV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 255)), dsub)>;
+def : Pat<(v8i8  immAllOnesV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 255)), dsub)>;
+
 // EDIT per word & halfword: 2s, 4h, 4s, & 8h
 let isReMaterializable = 1, isAsCheapAsAMove = 1 in
 defm MOVI      : SIMDModifiedImmVectorShift<0, 0b10, 0b00, "movi">;
diff --git a/test/CodeGen/AArch64/aarch64-be-bv.ll b/test/CodeGen/AArch64/aarch64-be-bv.ll
index 54b7c8ff414..0e1797fa179 100644
--- a/test/CodeGen/AArch64/aarch64-be-bv.ll
+++ b/test/CodeGen/AArch64/aarch64-be-bv.ll
@@ -746,7 +746,7 @@ define void @modimm_t10_call() {
   ; CHECK-NEXT:    rev64   v{{[0-9]+}}.4h, v[[REG1]].4h
   ; CHECK-NEXT:    bl      f_v4i16
   call i16 @f_v4i16(<4 x i16> <i16 -1, i16 0, i16 -1, i16 0>)
-  ; CHECK:         movi    d[[REG1:[0-9]+]], #0xffffffffffffffff
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2d, #0xffffffffffffffff
   ; CHECK-NEXT:    rev64   v{{[0-9]+}}.2s, v[[REG1]].2s
   ; CHECK-NEXT:    bl      f_v2i32
   call i32 @f_v2i32(<2 x i32> <i32 -1, i32 -1>)
diff --git a/test/CodeGen/AArch64/aarch64-smax-constantfold.ll b/test/CodeGen/AArch64/aarch64-smax-constantfold.ll
index 0e5b59f9512..32cd3c68333 100644
--- a/test/CodeGen/AArch64/aarch64-smax-constantfold.ll
+++ b/test/CodeGen/AArch64/aarch64-smax-constantfold.ll
@@ -6,7 +6,7 @@ declare <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16>, <4 x i16>)
 ; CHECK-LABEL: test
 define <4 x i16> @test() {
 entry:
-; CHECK: movi	d{{[0-9]+}}, #0000000000000000
+; CHECK: movi	v{{[0-9]+}}.2d, #0000000000000000
   %0 = tail call <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> zeroinitializer)
   ret <4 x i16> %0
 }
diff --git a/test/CodeGen/AArch64/arm64-neon-compare-instructions.ll b/test/CodeGen/AArch64/arm64-neon-compare-instructions.ll
index 7cc5a43d53c..bb3c36adee5 100644
--- a/test/CodeGen/AArch64/arm64-neon-compare-instructions.ll
+++ b/test/CodeGen/AArch64/arm64-neon-compare-instructions.ll
@@ -975,7 +975,7 @@ define <2 x i64> @cmhiz2xi64(<2 x i64> %A) {
 define <8 x i8> @cmlsz8xi8(<8 x i8> %A) {
 ; Using registers other than v0, v1 are possible, but would be odd.
 ; LS implemented as HS, so check reversed operands.
-;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
 ;CHECK-NEXT: cmhs {{v[0-9]+}}.8b, v[[ZERO]].8b, v0.8b
 	%tmp3 = icmp ule <8 x i8> %A, zeroinitializer;
    %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
@@ -995,7 +995,7 @@ define <16 x i8> @cmlsz16xi8(<16 x i8> %A) {
 define <4 x i16> @cmlsz4xi16(<4 x i16> %A) {
 ; Using registers other than v0, v1 are possible, but would be odd.
 ; LS implemented as HS, so check reversed operands.
-;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
 ;CHECK-NEXT: cmhs {{v[0-9]+}}.4h, v[[ZERO]].4h, v0.4h
 	%tmp3 = icmp ule <4 x i16> %A, zeroinitializer;
    %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
@@ -1015,7 +1015,7 @@ define <8 x i16> @cmlsz8xi16(<8 x i16> %A) {
 define <2 x i32> @cmlsz2xi32(<2 x i32> %A) {
 ; Using registers other than v0, v1 are possible, but would be odd.
 ; LS implemented as HS, so check reversed operands.
-;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
 ;CHECK-NEXT: cmhs {{v[0-9]+}}.2s, v[[ZERO]].2s, v0.2s
 	%tmp3 = icmp ule <2 x i32> %A, zeroinitializer;
    %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
diff --git a/test/CodeGen/AArch64/arm64-neon-copy.ll b/test/CodeGen/AArch64/arm64-neon-copy.ll
index 2a9e545165e..0b6132b1be6 100644
--- a/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -1401,7 +1401,7 @@ entry:
 
 define <4 x i16> @concat_vector_v4i16_const() {
 ; CHECK-LABEL: concat_vector_v4i16_const:
-; CHECK: movi {{d[0-9]+}}, #0
+; CHECK: movi {{v[0-9]+}}.2d, #0
  %r = shufflevector <1 x i16> zeroinitializer, <1 x i16> undef, <4 x i32> zeroinitializer
  ret <4 x i16> %r
 }
@@ -1422,7 +1422,7 @@ define <4 x i32> @concat_vector_v4i32_const() {
 
 define <8 x i8> @concat_vector_v8i8_const() {
 ; CHECK-LABEL: concat_vector_v8i8_const:
-; CHECK: movi {{d[0-9]+}}, #0
+; CHECK: movi {{v[0-9]+}}.2d, #0
  %r = shufflevector <1 x i8> zeroinitializer, <1 x i8> undef, <8 x i32> zeroinitializer
  ret <8 x i8> %r
 }
diff --git a/test/CodeGen/AArch64/arm64-vector-ext.ll b/test/CodeGen/AArch64/arm64-vector-ext.ll
index 68892eeacf3..8debd21ee6e 100644
--- a/test/CodeGen/AArch64/arm64-vector-ext.ll
+++ b/test/CodeGen/AArch64/arm64-vector-ext.ll
@@ -19,7 +19,7 @@ define void @func30(%T0_30 %v0, %T1_30* %p1) {
 ; sensible instead.
 define <1 x i32> @autogen_SD7918() {
 ; CHECK-LABEL: autogen_SD7918
-; CHECK: movi d0, #0000000000000000
+; CHECK: movi.2d v0, #0000000000000000
 ; CHECK-NEXT: ret
   %I29 = insertelement <1 x i1> zeroinitializer, i1 false, i32 0
   %ZE = zext <1 x i1> %I29 to <1 x i32>
diff --git a/test/CodeGen/AArch64/arm64-vshuffle.ll b/test/CodeGen/AArch64/arm64-vshuffle.ll
index b4f57675ace..fdd7cad7853 100644
--- a/test/CodeGen/AArch64/arm64-vshuffle.ll
+++ b/test/CodeGen/AArch64/arm64-vshuffle.ll
@@ -2,7 +2,7 @@
 
 
 ; CHECK: test1
-; CHECK: movi d[[REG0:[0-9]+]], #0000000000000000
+; CHECK: movi.16b v[[REG0:[0-9]+]], #0
 define <8 x i1> @test1() {
 entry:
   %Shuff = shufflevector <8 x i1> <i1 0, i1 1, i1 2, i1 3, i1 4, i1 5, i1 6,
diff --git a/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll b/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll
index 0335d0a6a07..784b4c486fe 100644
--- a/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll
+++ b/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll
@@ -162,28 +162,28 @@ entry:
 define <8 x i8> @tv8i8() {
 entry:
 ; ALL-LABEL: tv8i8:
-; ALL: movi d0, #0
+; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
   ret <8 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
 }
 
 define <4 x i16> @tv4i16() {
 entry:
 ; ALL-LABEL: tv4i16:
-; ALL: movi d0, #0
+; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
   ret <4 x i16> <i16 0, i16 0, i16 0, i16 0>
 }
 
 define <2 x i32> @tv2i32() {
 entry:
 ; ALL-LABEL: tv2i32:
-; ALL: movi d0, #0
+; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
   ret <2 x i32> <i32 0, i32 0>
 }
 
 define <2 x float> @tv2f32() {
 entry:
 ; ALL-LABEL: tv2f32:
-; ALL: movi d0, #0
+; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
   ret <2 x float> <float 0.0, float 0.0>
 }
 
diff --git a/test/CodeGen/AArch64/bitcast.ll b/test/CodeGen/AArch64/bitcast.ll
index e88ea9ec021..d60bd4ab3fc 100644
--- a/test/CodeGen/AArch64/bitcast.ll
+++ b/test/CodeGen/AArch64/bitcast.ll
@@ -4,7 +4,7 @@
 
 define <4 x i16> @foo1(<2 x i32> %a) {
 ; CHECK-LABEL: foo1:
-; CHECK:       movi	d0, #0000000000000000
+; CHECK:       movi	v0.2d, #0000000000000000
 ; CHECK-NEXT:  ret
 
   %1 = shufflevector <2 x i32> <i32 58712, i32 undef>, <2 x i32> %a, <2 x i32> <i32 0, i32 2>
@@ -16,7 +16,7 @@ define <4 x i16> @foo1(<2 x i32> %a) {
 
 define <4 x i16> @foo2(<2 x i32> %a) {
 ; CHECK-LABEL: foo2:
-; CHECK:       movi	d0, #0000000000000000
+; CHECK:       movi	v0.2d, #0000000000000000
 ; CHECK-NEXT:  ret
 
   %1 = shufflevector <2 x i32> <i32 712, i32 undef>, <2 x i32> %a, <2 x i32> <i32 0, i32 2>
diff --git a/test/CodeGen/AArch64/fast-isel-cmp-vec.ll b/test/CodeGen/AArch64/fast-isel-cmp-vec.ll
index d5b64c5363e..42112065943 100644
--- a/test/CodeGen/AArch64/fast-isel-cmp-vec.ll
+++ b/test/CodeGen/AArch64/fast-isel-cmp-vec.ll
@@ -24,7 +24,7 @@ bb2:
 define <2 x i32> @icmp_constfold_v2i32(<2 x i32> %a) {
 ; CHECK-LABEL: icmp_constfold_v2i32:
 ; CHECK:      ; %bb.0:
-; CHECK-NEXT:  movi d[[CMP:[0-9]+]], #0xffffffffffffffff
+; CHECK-NEXT:  movi.2d v[[CMP:[0-9]+]], #0xffffffffffffffff
 ; CHECK-NEXT: ; %bb.1:
 ; CHECK-NEXT:  movi.2s [[MASK:v[0-9]+]], #1
 ; CHECK-NEXT:  and.8b v0, v[[CMP]], [[MASK]]
@@ -56,7 +56,7 @@ bb2:
 define <4 x i32> @icmp_constfold_v4i32(<4 x i32> %a) {
 ; CHECK-LABEL: icmp_constfold_v4i32:
 ; CHECK:      ; %bb.0:
-; CHECK-NEXT:  movi d[[CMP:[0-9]+]], #0xffffffffffffffff
+; CHECK-NEXT:  movi.2d v[[CMP:[0-9]+]], #0xffffffffffffffff
 ; CHECK-NEXT: ; %bb.1:
 ; CHECK-NEXT:  movi.4h [[MASK:v[0-9]+]], #1
 ; CHECK-NEXT:  and.8b [[ZEXT:v[0-9]+]], v[[CMP]], [[MASK]]
diff --git a/test/CodeGen/AArch64/fold-constants.ll b/test/CodeGen/AArch64/fold-constants.ll
index 719d3f46950..ab13eb631d4 100644
--- a/test/CodeGen/AArch64/fold-constants.ll
+++ b/test/CodeGen/AArch64/fold-constants.ll
@@ -2,7 +2,7 @@
 
 define i64 @dotests_616() {
 ; CHECK-LABEL: dotests_616
-; CHECK:       movi d0, #0000000000000000
+; CHECK:       movi v0.2d, #0000000000000000
 ; CHECK-NEXT:  fmov x0, d0
 ; CHECK-NEXT:  ret
 entry:
diff --git a/test/CodeGen/AArch64/machine_cse.ll b/test/CodeGen/AArch64/machine_cse.ll
index e9fa68041d9..51252a2a842 100644
--- a/test/CodeGen/AArch64/machine_cse.ll
+++ b/test/CodeGen/AArch64/machine_cse.ll
@@ -47,3 +47,27 @@ return:
   store i32 %a, i32 *%arg
   ret void
 }
+
+define void @combine_vector_zeros(<8 x i8>* %p, <16 x i8>* %q) {
+; CHECK-LABEL: combine_vector_zeros:
+; CHECK: movi v[[REG:[0-9]+]].2d, #0
+; CHECK-NOT: movi
+; CHECK: str d[[REG]], [x0]
+; CHECK: str q[[REG]], [x1]
+entry:
+  store <8 x i8> zeroinitializer, <8 x i8>* %p
+  store <16 x i8> zeroinitializer, <16 x i8>* %q
+  ret void
+}
+
+define void @combine_vector_ones(<2 x i32>* %p, <4 x i32>* %q) {
+; CHECK-LABEL: combine_vector_ones:
+; CHECK: movi v[[REG:[0-9]+]].2d, #0xffffffffffffffff
+; CHECK-NOT: movi
+; CHECK: str d[[REG]], [x0]
+; CHECK: str q[[REG]], [x1]
+entry:
+  store <2 x i32> <i32 -1, i32 -1>, <2 x i32>* %p
+  store <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32>* %q
+  ret void
+}
diff --git a/test/CodeGen/AArch64/neon-compare-instructions.ll b/test/CodeGen/AArch64/neon-compare-instructions.ll
index 8bb7cc8c143..9d7d0abbf6c 100644
--- a/test/CodeGen/AArch64/neon-compare-instructions.ll
+++ b/test/CodeGen/AArch64/neon-compare-instructions.ll
@@ -1223,7 +1223,7 @@ define <8 x i8> @cmlsz8xi8(<8 x i8> %A) {
 ; CHECK-LABEL: cmlsz8xi8:
 ; Using registers other than v0, v1 are possible, but would be odd.
 ; LS implemented as HS, so check reversed operands.
-; CHECK: movi {{v1.8b|d1}}, #{{0x0|0}}
+; CHECK: movi {{v1.8b|v1.2d}}, #{{0x0|0}}
 ; CHECK-NEXT: cmhs {{v[0-9]+}}.8b, v1.8b, v0.8b
 	%tmp3 = icmp ule <8 x i8> %A, zeroinitializer;
    %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
@@ -1245,7 +1245,7 @@ define <4 x i16> @cmlsz4xi16(<4 x i16> %A) {
 ; CHECK-LABEL: cmlsz4xi16:
 ; Using registers other than v0, v1 are possible, but would be odd.
 ; LS implemented as HS, so check reversed operands.
-; CHECK: movi {{v1.8b|d1}}, #{{0x0|0}}
+; CHECK: movi {{v1.8b|v1.2d}}, #{{0x0|0}}
 ; CHECK-NEXT: cmhs {{v[0-9]+}}.4h, v1.4h, v0.4h
 	%tmp3 = icmp ule <4 x i16> %A, zeroinitializer;
    %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
@@ -1267,7 +1267,7 @@ define <2 x i32> @cmlsz2xi32(<2 x i32> %A) {
 ; CHECK-LABEL: cmlsz2xi32:
 ; Using registers other than v0, v1 are possible, but would be odd.
 ; LS implemented as HS, so check reversed operands.
-; CHECK: movi {{v1.8b|d1}}, #{{0x0|0}}
+; CHECK: movi {{v1.8b|v1.2d}}, #{{0x0|0}}
 ; CHECK-NEXT: cmhs {{v[0-9]+}}.2s, v1.2s, v0.2s
 	%tmp3 = icmp ule <2 x i32> %A, zeroinitializer;
    %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
diff --git a/test/CodeGen/AArch64/selectiondag-order.ll b/test/CodeGen/AArch64/selectiondag-order.ll
index 9427906160f..fb40653723f 100644
--- a/test/CodeGen/AArch64/selectiondag-order.ll
+++ b/test/CodeGen/AArch64/selectiondag-order.ll
@@ -21,7 +21,7 @@ end:                                        ; preds = %body
 }
 
 ; AARCH64-CHECK: simulate:
-; AARCH64-CHECK: movi d9, #0000000000000000
+; AARCH64-CHECK: movi v0.2d, #0000000000000000
 ; AARCH64-CHECK: bl lrand48
 ; AARCH64-CHECK: mov x19, x0
 ; AARCH64-CHECK: BB0_1:
@@ -47,7 +47,7 @@ end:                                        ; preds = %body
 }
 
 ; AARCH64-CHECK: simulateWithDebugIntrinsic
-; AARCH64-CHECK: movi d9, #0000000000000000
+; AARCH64-CHECK: movi v0.2d, #0000000000000000
 ; AARCH64-CHECK: bl lrand48
 ; AARCH64-CHECK: mov x19, x0
 ; AARCH64-CHECK: BB1_1:
@@ -73,7 +73,7 @@ end:                                        ; preds = %body
 }
 
 ; AARCH64-CHECK: simulateWithDbgDeclare:
-; AARCH64-CHECK: movi d9, #0000000000000000
+; AARCH64-CHECK: movi v0.2d, #0000000000000000
 ; AARCH64-CHECK: bl lrand48
 ; AARCH64-CHECK: mov x19, x0
 ; AARCH64-CHECK: BB2_1:
-- 
GitLab


From 4f4e519ae6641413d5c63a6e11e24fc853a10291 Mon Sep 17 00:00:00 2001
From: John Brawn <john.brawn@arm.com>
Date: Thu, 25 Oct 2018 15:00:10 +0000
Subject: [PATCH 0572/1116] [AArch64] Refactor definition of EXT patterns to
 use a multiclass

Using a multiclass reduces duplication, and makes it easier to add new patterns
later. This refactoring does add some new patterns, but as far as I can tell
there's no IR that will end up triggering them so this is effectively NFC.

Differential Revision: https://reviews.llvm.org/D53580


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345271 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64InstrInfo.td | 56 +++++++++-----------------
 1 file changed, 18 insertions(+), 38 deletions(-)

diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index 76ea2acef6a..323e74a0519 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -4185,44 +4185,24 @@ def : Pat<(concat_vectors (v2i32 V64:$Rd),
 
 defm EXT : SIMDBitwiseExtract<"ext">;
 
-def : Pat<(v4i16 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
-          (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
-def : Pat<(v8i16 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
-          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
-def : Pat<(v2i32 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
-          (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
-def : Pat<(v2f32 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
-          (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
-def : Pat<(v4i32 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
-          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
-def : Pat<(v4f32 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
-          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
-def : Pat<(v2i64 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
-          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
-def : Pat<(v2f64 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
-          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
-def : Pat<(v4f16 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
-          (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
-def : Pat<(v8f16 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
-          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
-
-// We use EXT to handle extract_subvector to copy the upper 64-bits of a
-// 128-bit vector.
-def : Pat<(v8i8  (extract_subvector V128:$Rn, (i64 8))),
-          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
-def : Pat<(v4i16 (extract_subvector V128:$Rn, (i64 4))),
-          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
-def : Pat<(v2i32 (extract_subvector V128:$Rn, (i64 2))),
-          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
-def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 1))),
-          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
-def : Pat<(v4f16 (extract_subvector V128:$Rn, (i64 4))),
-          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
-def : Pat<(v2f32 (extract_subvector V128:$Rn, (i64 2))),
-          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
-def : Pat<(v1f64 (extract_subvector V128:$Rn, (i64 1))),
-          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
-
+multiclass ExtPat<ValueType VT64, ValueType VT128, int N> {
+  def : Pat<(VT64 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
+            (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
+  def : Pat<(VT128 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
+            (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
+  // We use EXT to handle extract_subvector to copy the upper 64-bits of a
+  // 128-bit vector.
+  def : Pat<(VT64 (extract_subvector V128:$Rn, (i64 N))),
+            (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+}
+
+defm : ExtPat<v8i8, v16i8, 8>;
+defm : ExtPat<v4i16, v8i16, 4>;
+defm : ExtPat<v4f16, v8f16, 4>;
+defm : ExtPat<v2i32, v4i32, 2>;
+defm : ExtPat<v2f32, v4f32, 2>;
+defm : ExtPat<v1i64, v2i64, 1>;
+defm : ExtPat<v1f64, v2f64, 1>;
 
 //----------------------------------------------------------------------------
 // AdvSIMD zip vector
-- 
GitLab


From 7efb6dd83cf90da0c8512d1167ef004abf607965 Mon Sep 17 00:00:00 2001
From: Sam Parker <sam.parker@arm.com>
Date: Thu, 25 Oct 2018 15:08:29 +0000
Subject: [PATCH 0573/1116] [ARM] Use Cortex-A57 sched model for Cortex-A72

This mirrors what we already do for AArch64 as the cores are similar.
As discussed in the review, enabling the machine scheduler causes
more variations in performance changes so it is not enabled for now.
This patch improves LNT scores by a geomean of 1.57% at -O3.

Differential Revision: https://reviews.llvm.org/D53562


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345272 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/ARM/ARM.td                            | 2 +-
 test/Transforms/LoopUnroll/ARM/loop-unrolling.ll | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index fc23495ebf3..b71a09828bc 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -1043,7 +1043,7 @@ def : ProcessorModel<"cortex-a57",  CortexA57Model,     [ARMv8a, ProcA57,
                                                          FeatureAvoidPartialCPSR,
                                                          FeatureCheapPredicableCPSR]>;
 
-def : ProcNoItin<"cortex-a72",                          [ARMv8a, ProcA72,
+def : ProcessorModel<"cortex-a72",  CortexA57Model,     [ARMv8a, ProcA72,
                                                          FeatureHWDivThumb,
                                                          FeatureHWDivARM,
                                                          FeatureCrypto,
diff --git a/test/Transforms/LoopUnroll/ARM/loop-unrolling.ll b/test/Transforms/LoopUnroll/ARM/loop-unrolling.ll
index c159a88e723..bb5277bedc0 100644
--- a/test/Transforms/LoopUnroll/ARM/loop-unrolling.ll
+++ b/test/Transforms/LoopUnroll/ARM/loop-unrolling.ll
@@ -1,5 +1,6 @@
 ; RUN: opt -mtriple=armv7 -mcpu=cortex-a57 -loop-unroll -S %s -o - | FileCheck %s --check-prefix=CHECK-UNROLL-A
 ; RUN: opt -mtriple=thumbv7 -mcpu=cortex-a57 -loop-unroll -S %s -o - | FileCheck %s --check-prefix=CHECK-UNROLL-A
+; RUN: opt -mtriple=thumbv7 -mcpu=cortex-a72 -loop-unroll -S %s -o - | FileCheck %s --check-prefix=CHECK-UNROLL-A
 ; RUN: opt -mtriple=thumbv8m -mcpu=cortex-m23 -loop-unroll -S %s -o - | FileCheck %s --check-prefix=CHECK-UNROLL-T1
 ; RUN: opt -mtriple=thumbv8m.main -mcpu=cortex-m33 -loop-unroll -S %s -o - | FileCheck %s --check-prefix=CHECK-UNROLL-T2
 ; RUN: opt -mtriple=thumbv7em -mcpu=cortex-m7 -loop-unroll -S %s -o - | FileCheck %s --check-prefix=CHECK-UNROLL-T2
-- 
GitLab


From 7dddaa81329d622fa6814c398a22146e546cc176 Mon Sep 17 00:00:00 2001
From: John Brawn <john.brawn@arm.com>
Date: Thu, 25 Oct 2018 15:31:51 +0000
Subject: [PATCH 0574/1116] [AArch64] Add EXT patterns for 64-bit EXT of a
 subvector of a 128-bit vector

If we have a 64-bit EXT where one of the operands is a subvector of a 128-bit
vector then in some cases we can eliminate an extract_subvector by converting
to a 128-bit EXT of the 128-bit vector.

Differential Revision: https://reviews.llvm.org/D53582


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345275 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64InstrInfo.td   |  19 ++
 test/CodeGen/AArch64/ext-narrow-index.ll | 345 +++++++++++++++++++++++
 2 files changed, 364 insertions(+)
 create mode 100644 test/CodeGen/AArch64/ext-narrow-index.ll

diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index 323e74a0519..9e9e1429371 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -4185,6 +4185,9 @@ def : Pat<(concat_vectors (v2i32 V64:$Rd),
 
 defm EXT : SIMDBitwiseExtract<"ext">;
 
+def AdjustExtImm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(8 + N->getZExtValue(), SDLoc(N), MVT::i32);
+}]>;
 multiclass ExtPat<ValueType VT64, ValueType VT128, int N> {
   def : Pat<(VT64 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
             (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
@@ -4194,6 +4197,22 @@ multiclass ExtPat<ValueType VT64, ValueType VT128, int N> {
   // 128-bit vector.
   def : Pat<(VT64 (extract_subvector V128:$Rn, (i64 N))),
             (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+  // A 64-bit EXT of two halves of the same 128-bit register can be done as a
+  // single 128-bit EXT.
+  def : Pat<(VT64 (AArch64ext (extract_subvector V128:$Rn, (i64 0)),
+                              (extract_subvector V128:$Rn, (i64 N)),
+                              (i32 imm:$imm))),
+            (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, imm:$imm), dsub)>;
+  // A 64-bit EXT of the high half of a 128-bit register can be done using a
+  // 128-bit EXT of the whole register with an adjustment to the immediate. The
+  // top half of the other operand will be unset, but that doesn't matter as it
+  // will not be used.
+  def : Pat<(VT64 (AArch64ext (extract_subvector V128:$Rn, (i64 N)),
+                              V64:$Rm,
+                              (i32 imm:$imm))),
+            (EXTRACT_SUBREG (EXTv16i8 V128:$Rn,
+                                      (SUBREG_TO_REG (i32 0), V64:$Rm, dsub),
+                                      (AdjustExtImm imm:$imm)), dsub)>;
 }
 
 defm : ExtPat<v8i8, v16i8, 8>;
diff --git a/test/CodeGen/AArch64/ext-narrow-index.ll b/test/CodeGen/AArch64/ext-narrow-index.ll
new file mode 100644
index 00000000000..f7f143ff49e
--- /dev/null
+++ b/test/CodeGen/AArch64/ext-narrow-index.ll
@@ -0,0 +1,345 @@
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s
+
+; Tests of shufflevector where the index operand is half the width of the vector
+; operands. We should get one ext instruction and not two.
+
+; i8 tests
+define <8 x i8> @i8_off0(<16 x i8> %arg1, <16 x i8> %arg2) {
+; CHECK-LABEL: i8_off0:
+; CHECK-NOT: mov
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <16 x i8> %arg1, <16 x i8> %arg2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i8> %shuffle
+}
+
+define <8 x i8> @i8_off1(<16 x i8> %arg1, <16 x i8> %arg2) {
+; CHECK-LABEL: i8_off1:
+; CHECK-NOT: mov
+; CHECK: ext v0.16b, v0.16b, v0.16b, #1
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <16 x i8> %arg1, <16 x i8> %arg2, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+  ret <8 x i8> %shuffle
+}
+
+define <8 x i8> @i8_off8(<16 x i8> %arg1, <16 x i8> %arg2) {
+; CHECK-LABEL: i8_off8:
+; CHECK-NOT: mov
+; CHECK: ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <16 x i8> %arg1, <16 x i8> %arg2, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <8 x i8> %shuffle
+}
+
+define <8 x i8> @i8_off15(<16 x i8> %arg1, <16 x i8> %arg2) {
+; CHECK-LABEL: i8_off15:
+; CHECK: ext v0.16b, v0.16b, v1.16b, #15
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <16 x i8> %arg1, <16 x i8> %arg2, <8 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22>
+  ret <8 x i8> %shuffle
+}
+
+define <8 x i8> @i8_off22(<16 x i8> %arg1, <16 x i8> %arg2) {
+; CHECK-LABEL: i8_off22:
+; CHECK: ext v0.16b, v1.16b, v1.16b, #6
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <16 x i8> %arg1, <16 x i8> %arg2, <8 x i32> <i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29>
+  ret <8 x i8> %shuffle
+}
+
+; i16 tests
+define <4 x i16> @i16_off0(<8 x i16> %arg1, <8 x i16> %arg2) {
+; CHECK-LABEL: i16_off0:
+; CHECK-NOT: mov
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <8 x i16> %arg1, <8 x i16> %arg2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i16> %shuffle
+}
+
+define <4 x i16> @i16_off1(<8 x i16> %arg1, <8 x i16> %arg2) {
+; CHECK-LABEL: i16_off1:
+; CHECK-NOT: mov
+; CHECK: ext v0.16b, v0.16b, v0.16b, #2
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <8 x i16> %arg1, <8 x i16> %arg2, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+  ret <4 x i16> %shuffle
+}
+
+define <4 x i16> @i16_off7(<8 x i16> %arg1, <8 x i16> %arg2) {
+; CHECK-LABEL: i16_off7:
+; CHECK: ext v0.16b, v0.16b, v1.16b, #14
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <8 x i16> %arg1, <8 x i16> %arg2, <4 x i32> <i32 7, i32 8, i32 9, i32 10>
+  ret <4 x i16> %shuffle
+}
+
+define <4 x i16> @i16_off8(<8 x i16> %arg1, <8 x i16> %arg2) {
+; CHECK-LABEL: i16_off8:
+; CHECK: mov v0.16b, v1.16b
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <8 x i16> %arg1, <8 x i16> %arg2, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  ret <4 x i16> %shuffle
+}
+
+; i32 tests
+define <2 x i32> @i32_off0(<4 x i32> %arg1, <4 x i32> %arg2) {
+; CHECK-LABEL: i32_off0:
+; CHECK-NOT: mov
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <4 x i32> %arg1, <4 x i32> %arg2, <2 x i32> <i32 0, i32 1>
+  ret <2 x i32> %shuffle
+}
+
+define <2 x i32> @i32_off1(<4 x i32> %arg1, <4 x i32> %arg2) {
+; CHECK-LABEL: i32_off1:
+; CHECK-NOT: mov
+; CHECK: ext v0.16b, v0.16b, v0.16b, #4
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <4 x i32> %arg1, <4 x i32> %arg2, <2 x i32> <i32 1, i32 2>
+  ret <2 x i32> %shuffle
+}
+
+define <2 x i32> @i32_off3(<4 x i32> %arg1, <4 x i32> %arg2) {
+; CHECK-LABEL: i32_off3:
+; CHECK: ext v0.16b, v0.16b, v1.16b, #12
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <4 x i32> %arg1, <4 x i32> %arg2, <2 x i32> <i32 3, i32 4>
+  ret <2 x i32> %shuffle
+}
+
+define <2 x i32> @i32_off4(<4 x i32> %arg1, <4 x i32> %arg2) {
+; CHECK-LABEL: i32_off4:
+; CHECK: mov v0.16b, v1.16b
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <4 x i32> %arg1, <4 x i32> %arg2, <2 x i32> <i32 4, i32 5>
+  ret <2 x i32> %shuffle
+}
+
+; i64 tests
+define <1 x i64> @i64_off0(<2 x i64> %arg1, <2 x i64> %arg2) {
+; CHECK-LABEL: i64_off0:
+; CHECK-NOT: mov
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <2 x i64> %arg1, <2 x i64> %arg2, <1 x i32> <i32 0>
+  ret <1 x i64> %shuffle
+}
+
+define <1 x i64> @i64_off1(<2 x i64> %arg1, <2 x i64> %arg2) {
+; CHECK-LABEL: i64_off1:
+; CHECK-NOT: mov
+; CHECK: ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <2 x i64> %arg1, <2 x i64> %arg2, <1 x i32> <i32 1>
+  ret <1 x i64> %shuffle
+}
+
+define <1 x i64> @i64_off2(<2 x i64> %arg1, <2 x i64> %arg2) {
+; CHECK-LABEL: i64_off2:
+; CHECK: mov v0.16b, v1.16b
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <2 x i64> %arg1, <2 x i64> %arg2, <1 x i32> <i32 2>
+  ret <1 x i64> %shuffle
+}
+
+; i8 tests with second operand zero
+define <8 x i8> @i8_zero_off0(<16 x i8> %arg1) {
+; CHECK-LABEL: i8_zero_off0:
+; CHECK-NOT: mov
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <16 x i8> %arg1, <16 x i8> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i8> %shuffle
+}
+
+define <8 x i8> @i8_zero_off1(<16 x i8> %arg1) {
+; CHECK-LABEL: i8_zero_off1:
+; CHECK-NOT: mov
+; CHECK: ext v0.16b, v0.16b, v0.16b, #1
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <16 x i8> %arg1, <16 x i8> zeroinitializer, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+  ret <8 x i8> %shuffle
+}
+
+define <8 x i8> @i8_zero_off8(<16 x i8> %arg1) {
+; CHECK-LABEL: i8_zero_off8:
+; CHECK-NOT: mov
+; CHECK: ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <16 x i8> %arg1, <16 x i8> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <8 x i8> %shuffle
+}
+
+define <8 x i8> @i8_zero_off15(<16 x i8> %arg1) {
+; CHECK-LABEL: i8_zero_off15:
+; CHECK: movi [[REG:v[0-9]+]].2d, #0
+; CHECK: ext v0.16b, v0.16b, [[REG]].16b, #15
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <16 x i8> %arg1, <16 x i8> zeroinitializer, <8 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22>
+  ret <8 x i8> %shuffle
+}
+
+define <8 x i8> @i8_zero_off22(<16 x i8> %arg1) {
+; CHECK-LABEL: i8_zero_off22:
+; CHECK: movi v0.2d, #0
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <16 x i8> %arg1, <16 x i8> zeroinitializer, <8 x i32> <i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29>
+  ret <8 x i8> %shuffle
+}
+
+; i16 tests with second operand zero
+define <4 x i16> @i16_zero_off0(<8 x i16> %arg1) {
+; CHECK-LABEL: i16_zero_off0:
+; CHECK-NOT: mov
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <8 x i16> %arg1, <8 x i16> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i16> %shuffle
+}
+
+define <4 x i16> @i16_zero_off1(<8 x i16> %arg1) {
+; CHECK-LABEL: i16_zero_off1:
+; CHECK-NOT: mov
+; CHECK: ext v0.16b, v0.16b, v0.16b, #2
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <8 x i16> %arg1, <8 x i16> zeroinitializer, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+  ret <4 x i16> %shuffle
+}
+
+define <4 x i16> @i16_zero_off7(<8 x i16> %arg1) {
+; CHECK-LABEL: i16_zero_off7:
+; CHECK: movi [[REG:v[0-9]+]].2d, #0
+; CHECK: ext v0.16b, v0.16b, [[REG]].16b, #14
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <8 x i16> %arg1, <8 x i16> zeroinitializer, <4 x i32> <i32 7, i32 8, i32 9, i32 10>
+  ret <4 x i16> %shuffle
+}
+
+define <4 x i16> @i16_zero_off8(<8 x i16> %arg1) {
+; CHECK-LABEL: i16_zero_off8:
+; CHECK: movi v0.2d, #0
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <8 x i16> %arg1, <8 x i16> zeroinitializer, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  ret <4 x i16> %shuffle
+}
+
+; i32 tests with second operand zero
+define <2 x i32> @i32_zero_off0(<4 x i32> %arg1) {
+; CHECK-LABEL: i32_zero_off0:
+; CHECK-NOT: mov
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <4 x i32> %arg1, <4 x i32> zeroinitializer, <2 x i32> <i32 0, i32 1>
+  ret <2 x i32> %shuffle
+}
+
+define <2 x i32> @i32_zero_off1(<4 x i32> %arg1) {
+; CHECK-LABEL: i32_zero_off1:
+; CHECK-NOT: mov
+; CHECK: ext v0.16b, v0.16b, v0.16b, #4
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <4 x i32> %arg1, <4 x i32> zeroinitializer, <2 x i32> <i32 1, i32 2>
+  ret <2 x i32> %shuffle
+}
+
+define <2 x i32> @i32_zero_off3(<4 x i32> %arg1) {
+; CHECK-LABEL: i32_zero_off3:
+; CHECK: movi [[REG:v[0-9]+]].2d, #0
+; CHECK: ext v0.16b, v0.16b, [[REG]].16b, #12
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <4 x i32> %arg1, <4 x i32> zeroinitializer, <2 x i32> <i32 3, i32 4>
+  ret <2 x i32> %shuffle
+}
+
+define <2 x i32> @i32_zero_off4(<4 x i32> %arg1) {
+; CHECK-LABEL: i32_zero_off4:
+; CHECK: movi v0.2d, #0
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <4 x i32> %arg1, <4 x i32> zeroinitializer, <2 x i32> <i32 4, i32 5>
+  ret <2 x i32> %shuffle
+}
+
+; i64 tests with second operand zero
+define <1 x i64> @i64_zero_off0(<2 x i64> %arg1) {
+; CHECK-LABEL: i64_zero_off0:
+; CHECK-NOT: mov
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <2 x i64> %arg1, <2 x i64> zeroinitializer, <1 x i32> <i32 0>
+  ret <1 x i64> %shuffle
+}
+
+define <1 x i64> @i64_zero_off1(<2 x i64> %arg1) {
+; CHECK-LABEL: i64_zero_off1:
+; CHECK-NOT: mov
+; CHECK: ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <2 x i64> %arg1, <2 x i64> zeroinitializer, <1 x i32> <i32 1>
+  ret <1 x i64> %shuffle
+}
+
+define <1 x i64> @i64_zero_off2(<2 x i64> %arg1) {
+; CHECK-LABEL: i64_zero_off2:
+; CHECK: fmov d0, xzr
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <2 x i64> %arg1, <2 x i64> zeroinitializer, <1 x i32> <i32 2>
+  ret <1 x i64> %shuffle
+}
-- 
GitLab


From c8f0858dfc52c8058cae0231b9a1913b833f3e54 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 25 Oct 2018 15:33:47 +0000
Subject: [PATCH 0575/1116] [ARM] Regenerate vdup tests

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345276 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/ARM/vdup.ll | 352 ++++++++++++++++++++++++++++++---------
 1 file changed, 271 insertions(+), 81 deletions(-)

diff --git a/test/CodeGen/ARM/vdup.ll b/test/CodeGen/ARM/vdup.ll
index b7693c79763..c16a2a9e3c0 100644
--- a/test/CodeGen/ARM/vdup.ll
+++ b/test/CodeGen/ARM/vdup.ll
@@ -1,9 +1,12 @@
-; RUN: llc -mtriple=arm-eabi -float-abi=soft -mattr=+neon -verify-machineinstrs %s -o - \
-; RUN:	| FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=arm-eabi -float-abi=soft -mattr=+neon -verify-machineinstrs | FileCheck %s
 
 define <8 x i8> @v_dup8(i8 %A) nounwind {
-;CHECK-LABEL: v_dup8:
-;CHECK: vdup.8
+; CHECK-LABEL: v_dup8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vdup.8 d16, r0
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = insertelement <8 x i8> zeroinitializer, i8 %A, i32 0
 	%tmp2 = insertelement <8 x i8> %tmp1, i8 %A, i32 1
 	%tmp3 = insertelement <8 x i8> %tmp2, i8 %A, i32 2
@@ -16,8 +19,11 @@ define <8 x i8> @v_dup8(i8 %A) nounwind {
 }
 
 define <4 x i16> @v_dup16(i16 %A) nounwind {
-;CHECK-LABEL: v_dup16:
-;CHECK: vdup.16
+; CHECK-LABEL: v_dup16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vdup.16 d16, r0
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = insertelement <4 x i16> zeroinitializer, i16 %A, i32 0
 	%tmp2 = insertelement <4 x i16> %tmp1, i16 %A, i32 1
 	%tmp3 = insertelement <4 x i16> %tmp2, i16 %A, i32 2
@@ -26,24 +32,34 @@ define <4 x i16> @v_dup16(i16 %A) nounwind {
 }
 
 define <2 x i32> @v_dup32(i32 %A) nounwind {
-;CHECK-LABEL: v_dup32:
-;CHECK: vdup.32
+; CHECK-LABEL: v_dup32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vdup.32 d16, r0
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = insertelement <2 x i32> zeroinitializer, i32 %A, i32 0
 	%tmp2 = insertelement <2 x i32> %tmp1, i32 %A, i32 1
 	ret <2 x i32> %tmp2
 }
 
 define <2 x float> @v_dupfloat(float %A) nounwind {
-;CHECK-LABEL: v_dupfloat:
-;CHECK: vdup.32
+; CHECK-LABEL: v_dupfloat:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vdup.32 d16, r0
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = insertelement <2 x float> zeroinitializer, float %A, i32 0
 	%tmp2 = insertelement <2 x float> %tmp1, float %A, i32 1
 	ret <2 x float> %tmp2
 }
 
 define <16 x i8> @v_dupQ8(i8 %A) nounwind {
-;CHECK-LABEL: v_dupQ8:
-;CHECK: vdup.8
+; CHECK-LABEL: v_dupQ8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vdup.8 q8, r0
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = insertelement <16 x i8> zeroinitializer, i8 %A, i32 0
 	%tmp2 = insertelement <16 x i8> %tmp1, i8 %A, i32 1
 	%tmp3 = insertelement <16 x i8> %tmp2, i8 %A, i32 2
@@ -64,8 +80,12 @@ define <16 x i8> @v_dupQ8(i8 %A) nounwind {
 }
 
 define <8 x i16> @v_dupQ16(i16 %A) nounwind {
-;CHECK-LABEL: v_dupQ16:
-;CHECK: vdup.16
+; CHECK-LABEL: v_dupQ16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vdup.16 q8, r0
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = insertelement <8 x i16> zeroinitializer, i16 %A, i32 0
 	%tmp2 = insertelement <8 x i16> %tmp1, i16 %A, i32 1
 	%tmp3 = insertelement <8 x i16> %tmp2, i16 %A, i32 2
@@ -78,8 +98,12 @@ define <8 x i16> @v_dupQ16(i16 %A) nounwind {
 }
 
 define <4 x i32> @v_dupQ32(i32 %A) nounwind {
-;CHECK-LABEL: v_dupQ32:
-;CHECK: vdup.32
+; CHECK-LABEL: v_dupQ32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vdup.32 q8, r0
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %A, i32 0
 	%tmp2 = insertelement <4 x i32> %tmp1, i32 %A, i32 1
 	%tmp3 = insertelement <4 x i32> %tmp2, i32 %A, i32 2
@@ -88,8 +112,12 @@ define <4 x i32> @v_dupQ32(i32 %A) nounwind {
 }
 
 define <4 x float> @v_dupQfloat(float %A) nounwind {
-;CHECK-LABEL: v_dupQfloat:
-;CHECK: vdup.32
+; CHECK-LABEL: v_dupQfloat:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vdup.32 q8, r0
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = insertelement <4 x float> zeroinitializer, float %A, i32 0
 	%tmp2 = insertelement <4 x float> %tmp1, float %A, i32 1
 	%tmp3 = insertelement <4 x float> %tmp2, float %A, i32 2
@@ -100,163 +128,248 @@ define <4 x float> @v_dupQfloat(float %A) nounwind {
 ; Check to make sure it works with shuffles, too.
 
 define <8 x i8> @v_shuffledup8(i8 %A) nounwind {
-;CHECK-LABEL: v_shuffledup8:
-;CHECK: vdup.8
+; CHECK-LABEL: v_shuffledup8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vdup.8 d16, r0
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = insertelement <8 x i8> undef, i8 %A, i32 0
 	%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
 	ret <8 x i8> %tmp2
 }
 
 define <4 x i16> @v_shuffledup16(i16 %A) nounwind {
-;CHECK-LABEL: v_shuffledup16:
-;CHECK: vdup.16
+; CHECK-LABEL: v_shuffledup16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vdup.16 d16, r0
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = insertelement <4 x i16> undef, i16 %A, i32 0
 	%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
 	ret <4 x i16> %tmp2
 }
 
 define <2 x i32> @v_shuffledup32(i32 %A) nounwind {
-;CHECK-LABEL: v_shuffledup32:
-;CHECK: vdup.32
+; CHECK-LABEL: v_shuffledup32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vdup.32 d16, r0
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = insertelement <2 x i32> undef, i32 %A, i32 0
 	%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
 	ret <2 x i32> %tmp2
 }
 
 define <2 x float> @v_shuffledupfloat(float %A) nounwind {
-;CHECK-LABEL: v_shuffledupfloat:
-;CHECK: vdup.32
+; CHECK-LABEL: v_shuffledupfloat:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vdup.32 d16, r0
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = insertelement <2 x float> undef, float %A, i32 0
 	%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer
 	ret <2 x float> %tmp2
 }
 
 define <16 x i8> @v_shuffledupQ8(i8 %A) nounwind {
-;CHECK-LABEL: v_shuffledupQ8:
-;CHECK: vdup.8
+; CHECK-LABEL: v_shuffledupQ8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vdup.8 q8, r0
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = insertelement <16 x i8> undef, i8 %A, i32 0
 	%tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> zeroinitializer
 	ret <16 x i8> %tmp2
 }
 
 define <8 x i16> @v_shuffledupQ16(i16 %A) nounwind {
-;CHECK-LABEL: v_shuffledupQ16:
-;CHECK: vdup.16
+; CHECK-LABEL: v_shuffledupQ16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vdup.16 q8, r0
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = insertelement <8 x i16> undef, i16 %A, i32 0
 	%tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer
 	ret <8 x i16> %tmp2
 }
 
 define <4 x i32> @v_shuffledupQ32(i32 %A) nounwind {
-;CHECK-LABEL: v_shuffledupQ32:
-;CHECK: vdup.32
+; CHECK-LABEL: v_shuffledupQ32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vdup.32 q8, r0
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = insertelement <4 x i32> undef, i32 %A, i32 0
 	%tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer
 	ret <4 x i32> %tmp2
 }
 
 define <4 x float> @v_shuffledupQfloat(float %A) nounwind {
-;CHECK-LABEL: v_shuffledupQfloat:
-;CHECK: vdup.32
+; CHECK-LABEL: v_shuffledupQfloat:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vdup.32 q8, r0
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = insertelement <4 x float> undef, float %A, i32 0
 	%tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer
 	ret <4 x float> %tmp2
 }
 
 define <8 x i8> @vduplane8(<8 x i8>* %A) nounwind {
-;CHECK-LABEL: vduplane8:
-;CHECK: vdup.8
+; CHECK-LABEL: vduplane8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vdup.8 d16, d16[1]
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <8 x i8>, <8 x i8>* %A
 	%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
 	ret <8 x i8> %tmp2
 }
 
 define <4 x i16> @vduplane16(<4 x i16>* %A) nounwind {
-;CHECK-LABEL: vduplane16:
-;CHECK: vdup.16
+; CHECK-LABEL: vduplane16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vdup.16 d16, d16[1]
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <4 x i16>, <4 x i16>* %A
 	%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
 	ret <4 x i16> %tmp2
 }
 
 define <2 x i32> @vduplane32(<2 x i32>* %A) nounwind {
-;CHECK-LABEL: vduplane32:
-;CHECK: vdup.32
+; CHECK-LABEL: vduplane32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vdup.32 d16, d16[1]
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <2 x i32>, <2 x i32>* %A
 	%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> < i32 1, i32 1 >
 	ret <2 x i32> %tmp2
 }
 
 define <2 x float> @vduplanefloat(<2 x float>* %A) nounwind {
-;CHECK-LABEL: vduplanefloat:
-;CHECK: vdup.32
+; CHECK-LABEL: vduplanefloat:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vdup.32 d16, d16[1]
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <2 x float>, <2 x float>* %A
 	%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> < i32 1, i32 1 >
 	ret <2 x float> %tmp2
 }
 
 define <16 x i8> @vduplaneQ8(<8 x i8>* %A) nounwind {
-;CHECK-LABEL: vduplaneQ8:
-;CHECK: vdup.8
+; CHECK-LABEL: vduplaneQ8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vdup.8 q8, d16[1]
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <8 x i8>, <8 x i8>* %A
 	%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <16 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
 	ret <16 x i8> %tmp2
 }
 
 define <8 x i16> @vduplaneQ16(<4 x i16>* %A) nounwind {
-;CHECK-LABEL: vduplaneQ16:
-;CHECK: vdup.16
+; CHECK-LABEL: vduplaneQ16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vdup.16 q8, d16[1]
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <4 x i16>, <4 x i16>* %A
 	%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
 	ret <8 x i16> %tmp2
 }
 
 define <4 x i32> @vduplaneQ32(<2 x i32>* %A) nounwind {
-;CHECK-LABEL: vduplaneQ32:
-;CHECK: vdup.32
+; CHECK-LABEL: vduplaneQ32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vdup.32 q8, d16[1]
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <2 x i32>, <2 x i32>* %A
 	%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
 	ret <4 x i32> %tmp2
 }
 
 define <4 x float> @vduplaneQfloat(<2 x float>* %A) nounwind {
-;CHECK-LABEL: vduplaneQfloat:
-;CHECK: vdup.32
+; CHECK-LABEL: vduplaneQfloat:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vdup.32 q8, d16[1]
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <2 x float>, <2 x float>* %A
 	%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
 	ret <4 x float> %tmp2
 }
 
 define <2 x i64> @foo(<2 x i64> %arg0_int64x1_t) nounwind readnone {
+; CHECK-LABEL: foo:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    mov r0, r2
+; CHECK-NEXT:    mov r1, r3
+; CHECK-NEXT:    mov pc, lr
 entry:
   %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
   ret <2 x i64> %0
 }
 
 define <2 x i64> @bar(<2 x i64> %arg0_int64x1_t) nounwind readnone {
+; CHECK-LABEL: bar:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    mov r2, r0
+; CHECK-NEXT:    mov r3, r1
+; CHECK-NEXT:    mov pc, lr
 entry:
   %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
   ret <2 x i64> %0
 }
 
 define <2 x double> @baz(<2 x double> %arg0_int64x1_t) nounwind readnone {
+; CHECK-LABEL: baz:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    mov r0, r2
+; CHECK-NEXT:    mov r1, r3
+; CHECK-NEXT:    mov pc, lr
 entry:
   %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 1, i32 1>
   ret <2 x double> %0
 }
 
 define <2 x double> @qux(<2 x double> %arg0_int64x1_t) nounwind readnone {
+; CHECK-LABEL: qux:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    mov r2, r0
+; CHECK-NEXT:    mov r3, r1
+; CHECK-NEXT:    mov pc, lr
 entry:
   %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 0, i32 0>
   ret <2 x double> %0
 }
 
 ; Radar 7373643
-;CHECK-LABEL: redundantVdup:
-;CHECK: vmov.i8
-;CHECK-NOT: vdup.8
-;CHECK: vstr
 define void @redundantVdup(<8 x i8>* %ptr) nounwind {
+; CHECK-LABEL: redundantVdup:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i8 d16, #0x80
+; CHECK-NEXT:    vstr d16, [r0]
+; CHECK-NEXT:    mov pc, lr
   %1 = insertelement <8 x i8> undef, i8 -128, i32 0
   %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
   store <8 x i8> %2, <8 x i8>* %ptr, align 8
@@ -264,8 +377,13 @@ define void @redundantVdup(<8 x i8>* %ptr) nounwind {
 }
 
 define <4 x i32> @tdupi(i32 %x, i32 %y) {
-;CHECK-LABEL: tdupi:
-;CHECK: vdup.32
+; CHECK-LABEL: tdupi:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vdup.32 q8, r0
+; CHECK-NEXT:    vmov.32 d17[1], r1
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
   %1 = insertelement <4 x i32> undef, i32 %x, i32 0
   %2 = insertelement <4 x i32> %1, i32 %x, i32 1
   %3 = insertelement <4 x i32> %2, i32 %x, i32 2
@@ -274,8 +392,13 @@ define <4 x i32> @tdupi(i32 %x, i32 %y) {
 }
 
 define <4 x float> @tdupf(float %x, float %y) {
-;CHECK-LABEL: tdupf:
-;CHECK: vdup.32
+; CHECK-LABEL: tdupf:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vdup.32 q0, r0
+; CHECK-NEXT:    vmov s3, r1
+; CHECK-NEXT:    vmov r0, r1, d0
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    mov pc, lr
   %1 = insertelement <4 x float> undef, float %x, i32 0
   %2 = insertelement <4 x float> %1, float %x, i32 1
   %3 = insertelement <4 x float> %2, float %x, i32 2
@@ -286,9 +409,15 @@ define <4 x float> @tdupf(float %x, float %y) {
 ; This test checks that when splatting an element from a vector into another,
 ; the value isn't moved out to GPRs first.
 define <4 x i32> @tduplane(<4 x i32> %invec) {
-;CHECK-LABEL: tduplane:
-;CHECK-NOT: vmov {{.*}}, d16[1]
-;CHECK: vdup.32 {{.*}}, d16[1]
+; CHECK-LABEL: tduplane:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    mov r0, #255
+; CHECK-NEXT:    vdup.32 q8, d16[1]
+; CHECK-NEXT:    vmov.32 d17[1], r0
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
   %in = extractelement <4 x i32> %invec, i32 1
   %1 = insertelement <4 x i32> undef, i32 %in, i32 0
   %2 = insertelement <4 x i32> %1, i32 %in, i32 1
@@ -298,8 +427,13 @@ define <4 x i32> @tduplane(<4 x i32> %invec) {
 }
 
 define <2 x float> @check_f32(<4 x float> %v) nounwind {
-;CHECK-LABEL: check_f32:
-;CHECK: vdup.32 {{.*}}, d{{..}}[1]
+; CHECK-LABEL: check_f32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vdup.32 d16, d17[1]
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
   %x = extractelement <4 x float> %v, i32 3
   %1 = insertelement  <2 x float> undef, float %x, i32 0
   %2 = insertelement  <2 x float> %1, float %x, i32 1
@@ -307,8 +441,13 @@ define <2 x float> @check_f32(<4 x float> %v) nounwind {
 }
 
 define <2 x i32> @check_i32(<4 x i32> %v) nounwind {
-;CHECK-LABEL: check_i32:
-;CHECK: vdup.32 {{.*}}, d{{..}}[1]
+; CHECK-LABEL: check_i32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vdup.32 d16, d17[1]
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
   %x = extractelement <4 x i32> %v, i32 3
   %1 = insertelement  <2 x i32> undef, i32 %x, i32 0
   %2 = insertelement  <2 x i32> %1, i32 %x, i32 1
@@ -316,8 +455,13 @@ define <2 x i32> @check_i32(<4 x i32> %v) nounwind {
 }
 
 define <4 x i16> @check_i16(<8 x i16> %v) nounwind {
-;CHECK-LABEL: check_i16:
-;CHECK: vdup.16 {{.*}}, d{{..}}[3]
+; CHECK-LABEL: check_i16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vdup.16 d16, d16[3]
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
   %x = extractelement <8 x i16> %v, i32 3
   %1 = insertelement  <4 x i16> undef, i16 %x, i32 0
   %2 = insertelement  <4 x i16> %1, i16 %x, i32 1
@@ -325,8 +469,13 @@ define <4 x i16> @check_i16(<8 x i16> %v) nounwind {
 }
 
 define <8 x i8> @check_i8(<16 x i8> %v) nounwind {
-;CHECK-LABEL: check_i8:
-;CHECK: vdup.8 {{.*}}, d{{..}}[3]
+; CHECK-LABEL: check_i8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vdup.8 d16, d16[3]
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
   %x = extractelement <16 x i8> %v, i32 3
   %1 = insertelement  <8  x i8> undef, i8 %x, i32 0
   %2 = insertelement  <8  x i8> %1, i8 %x, i32 1
@@ -336,8 +485,16 @@ define <8 x i8> @check_i8(<16 x i8> %v) nounwind {
 ; Check that an SPR splat produces a vdup.
 
 define <2 x float> @check_spr_splat2(<2 x float> %p, i16 %q) {
-;CHECK-LABEL: check_spr_splat2:
-;CHECK: vdup.32 d
+; CHECK-LABEL: check_spr_splat2:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    lsl r2, r2, #16
+; CHECK-NEXT:    vmov d17, r0, r1
+; CHECK-NEXT:    asr r2, r2, #16
+; CHECK-NEXT:    vdup.32 d16, r2
+; CHECK-NEXT:    vcvt.f32.s32 d16, d16
+; CHECK-NEXT:    vsub.f32 d16, d16, d17
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
   %conv = sitofp i16 %q to float
   %splat.splatinsert = insertelement <2 x float> undef, float %conv, i32 0
   %splat.splat = shufflevector <2 x float> %splat.splatinsert, <2 x float> undef, <2 x i32> zeroinitializer
@@ -346,8 +503,18 @@ define <2 x float> @check_spr_splat2(<2 x float> %p, i16 %q) {
 }
 
 define <4 x float> @check_spr_splat4(<4 x float> %p, i16 %q) {
-;CHECK-LABEL: check_spr_splat4:
-;CHECK: vld1.16
+; CHECK-LABEL: check_spr_splat4:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    mov r12, sp
+; CHECK-NEXT:    vmov d19, r2, r3
+; CHECK-NEXT:    vld1.16 {d16[]}, [r12:16]
+; CHECK-NEXT:    vmov d18, r0, r1
+; CHECK-NEXT:    vmovl.s16 q8, d16
+; CHECK-NEXT:    vcvt.f32.s32 q8, q8
+; CHECK-NEXT:    vsub.f32 q8, q8, q9
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
   %conv = sitofp i16 %q to float
   %splat.splatinsert = insertelement <4 x float> undef, float %conv, i32 0
   %splat.splat = shufflevector <4 x float> %splat.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
@@ -356,8 +523,18 @@ define <4 x float> @check_spr_splat4(<4 x float> %p, i16 %q) {
 }
 ; Same codegen as above test; scalar is splatted using vld1, so shuffle index is irrelevant.
 define <4 x float> @check_spr_splat4_lane1(<4 x float> %p, i16 %q) {
-;CHECK-LABEL: check_spr_splat4_lane1:
-;CHECK: vld1.16
+; CHECK-LABEL: check_spr_splat4_lane1:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    mov r12, sp
+; CHECK-NEXT:    vmov d19, r2, r3
+; CHECK-NEXT:    vld1.16 {d16[]}, [r12:16]
+; CHECK-NEXT:    vmov d18, r0, r1
+; CHECK-NEXT:    vmovl.s16 q8, d16
+; CHECK-NEXT:    vcvt.f32.s32 q8, q8
+; CHECK-NEXT:    vsub.f32 q8, q8, q9
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
   %conv = sitofp i16 %q to float
   %splat.splatinsert = insertelement <4 x float> undef, float %conv, i32 1
   %splat.splat = shufflevector <4 x float> %splat.splatinsert, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -370,12 +547,25 @@ define <4 x float> @check_spr_splat4_lane1(<4 x float> %p, i16 %q) {
 
 define <8 x i8> @check_i8_varidx(<16 x i8> %v, i32 %idx) {
 ; CHECK-LABEL: check_i8_varidx:
-; CHECK: mov r[[FP:[0-9]+]], sp
-; CHECK: ldr r[[IDX:[0-9]+]], [r[[FP]], #4]
-; CHECK: mov r[[SPCOPY:[0-9]+]], sp
-; CHECK: and r[[MASKED_IDX:[0-9]+]], r[[IDX]], #15
-; CHECK: vst1.64 {d{{.*}}, d{{.*}}}, [r[[SPCOPY]]:128], r[[MASKED_IDX]]
-; CHECK: vld1.8 {d{{.*}}[]}, [r[[SPCOPY]]]
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    .save {r11}
+; CHECK-NEXT:    push {r11}
+; CHECK-NEXT:    .setfp r11, sp
+; CHECK-NEXT:    mov r11, sp
+; CHECK-NEXT:    .pad #28
+; CHECK-NEXT:    sub sp, sp, #28
+; CHECK-NEXT:    bic sp, sp, #15
+; CHECK-NEXT:    ldr r12, [r11, #4]
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    mov r1, sp
+; CHECK-NEXT:    and r0, r12, #15
+; CHECK-NEXT:    vst1.64 {d16, d17}, [r1:128], r0
+; CHECK-NEXT:    vld1.8 {d16[]}, [r1]
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov sp, r11
+; CHECK-NEXT:    pop {r11}
+; CHECK-NEXT:    mov pc, lr
   %x = extractelement <16 x i8> %v, i32 %idx
   %1 = insertelement  <8 x i8> undef, i8 %x, i32 0
   %2 = insertelement  <8 x i8> %1, i8 %x, i32 1
-- 
GitLab


From c915aaf6e9a3cd43f773a05ba0e1d3d9e007b1eb Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.menezes@samsung.com>
Date: Thu, 25 Oct 2018 16:45:46 +0000
Subject: [PATCH 0576/1116] [AArch64] Refactor Exynos feature sets (NFC)

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345279 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64.td | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td
index de78ca5b257..e3f69c7509f 100644
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@@ -408,15 +408,13 @@ def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1",
                                      FeatureCRC,
                                      FeatureCrypto,
                                      FeatureExynosCheapAsMoveHandling,
-                                     FeatureFPARMv8,
+                                     FeatureForce32BitJumpTables,
                                      FeatureFuseAES,
-                                     FeatureNEON,
                                      FeaturePerfMon,
                                      FeaturePostRAScheduler,
                                      FeatureSlowMisaligned128Store,
                                      FeatureUseRSqrt,
-                                     FeatureZCZeroingFP,
-                                     FeatureForce32BitJumpTables]>;
+                                     FeatureZCZeroingFP]>;
 
 def ProcExynosM2 : SubtargetFeature<"exynosm2", "ARMProcFamily", "ExynosM1",
                                     "Samsung Exynos-M2 processors",
@@ -424,32 +422,28 @@ def ProcExynosM2 : SubtargetFeature<"exynosm2", "ARMProcFamily", "ExynosM1",
                                      FeatureCRC,
                                      FeatureCrypto,
                                      FeatureExynosCheapAsMoveHandling,
-                                     FeatureFPARMv8,
+                                     FeatureForce32BitJumpTables,
                                      FeatureFuseAES,
-                                     FeatureNEON,
                                      FeaturePerfMon,
                                      FeaturePostRAScheduler,
                                      FeatureSlowMisaligned128Store,
-                                     FeatureZCZeroingFP,
-                                     FeatureForce32BitJumpTables]>;
+                                     FeatureZCZeroingFP]>;
 
 def ProcExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
                                     "Samsung Exynos-M3 processors",
                                     [FeatureCRC,
                                      FeatureCrypto,
                                      FeatureExynosCheapAsMoveHandling,
-                                     FeatureFPARMv8,
+                                     FeatureForce32BitJumpTables,
                                      FeatureFuseAddress,
                                      FeatureFuseAES,
                                      FeatureFuseCCSelect,
                                      FeatureFuseLiterals,
                                      FeatureLSLFast,
-                                     FeatureNEON,
                                      FeaturePerfMon,
                                      FeaturePostRAScheduler,
                                      FeaturePredictableSelectIsExpensive,
-                                     FeatureZCZeroingFP,
-                                     FeatureForce32BitJumpTables]>;
+                                     FeatureZCZeroingFP]>;
 
 def ProcKryo    : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
                                    "Qualcomm Kryo processors", [
-- 
GitLab


From a64e4ae25fdfa79a55645db00a89f82bb192d518 Mon Sep 17 00:00:00 2001
From: Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>
Date: Thu, 25 Oct 2018 17:03:51 +0000
Subject: [PATCH 0577/1116] [llvm-mca] Introduce a new base class for
 mca::Instruction, and change how read/write information is stored.

This patch introduces a new base class for Instruction named InstructionBase.
Class InstructionBase is responsible for tracking data dependencies with the
help of ReadState and WriteState objects.  Class Instruction now derives from
InstructionBase, and adds extra information related to the `InstrStage` as well
as the `RCUTokenID`.

ReadState and WriteState objects are no longer unique pointers. This avoids
extra heap allocation and pointer checks that weren't really needed.  Now, those
objects are simply stored into SmallVectors.  We use a SmallVector instead of a
std::vector because we expect most instructions to only have a very small number
of reads and writes.  By using a simple SmallVector we also avoid extra heap
allocations most of the time.
In a debug build, this improves the performance of llvm-mca by roughly 10% (I
still have to verify the impact in performance on a release build).


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345280 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-mca/include/Instruction.h        | 106 +++++++++++---------
 tools/llvm-mca/lib/InstrBuilder.cpp         |  12 +--
 tools/llvm-mca/lib/Instruction.cpp          |  24 ++---
 tools/llvm-mca/lib/Stages/DispatchStage.cpp |  17 ++--
 tools/llvm-mca/lib/Stages/RetireStage.cpp   |   4 +-
 5 files changed, 84 insertions(+), 79 deletions(-)

diff --git a/tools/llvm-mca/include/Instruction.h b/tools/llvm-mca/include/Instruction.h
index a1d1082a215..9d1c91ad441 100644
--- a/tools/llvm-mca/include/Instruction.h
+++ b/tools/llvm-mca/include/Instruction.h
@@ -16,7 +16,9 @@
 #ifndef LLVM_TOOLS_LLVM_MCA_INSTRUCTION_H
 #define LLVM_TOOLS_LLVM_MCA_INSTRUCTION_H
 
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/MathExtras.h"
 
 #ifndef NDEBUG
@@ -134,8 +136,6 @@ public:
       : WD(Desc), CyclesLeft(UNKNOWN_CYCLES), RegisterID(RegID),
         ClearsSuperRegs(clearsSuperRegs), WritesZero(writesZero),
         IsEliminated(false), DependentWrite(nullptr), NumWriteUsers(0U) {}
-  WriteState(const WriteState &Other) = delete;
-  WriteState &operator=(const WriteState &Other) = delete;
 
   int getCyclesLeft() const { return CyclesLeft; }
   unsigned getWriteResourceID() const { return WD.SClassOrWriteResourceID; }
@@ -205,8 +205,6 @@ public:
       : RD(Desc), RegisterID(RegID), DependentWrites(0),
         CyclesLeft(UNKNOWN_CYCLES), TotalCycles(0), IsReady(true),
         IndependentFromDef(false) {}
-  ReadState(const ReadState &Other) = delete;
-  ReadState &operator=(const ReadState &Other) = delete;
 
   const ReadDescriptor &getDescriptor() const { return RD; }
   unsigned getSchedClass() const { return RD.SchedClassID; }
@@ -313,13 +311,59 @@ struct InstrDesc {
   InstrDesc &operator=(const InstrDesc &Other) = delete;
 };
 
+/// Base class for instructions consumed by the simulation pipeline.
+///
+/// This class tracks data dependencies as well as generic properties
+/// of the instruction.
+class InstructionBase {
+  const InstrDesc &Desc;
+
+  // This field is set for instructions that are candidates for move
+  // elimination. For more information about move elimination, see the
+  // definition of RegisterMappingTracker in RegisterFile.h
+  bool IsOptimizableMove;
+
+  // Output dependencies.
+  // One entry per each implicit and explicit register definition.
+  llvm::SmallVector<WriteState, 4> Defs;
+
+  // Input dependencies.
+  // One entry per each implicit and explicit register use.
+  llvm::SmallVector<ReadState, 4> Uses;
+
+public:
+  InstructionBase(const InstrDesc &D) : Desc(D), IsOptimizableMove(false) {}
+
+  llvm::SmallVectorImpl<WriteState> &getDefs() { return Defs; }
+  const llvm::ArrayRef<WriteState> getDefs() const { return Defs; }
+  llvm::SmallVectorImpl<ReadState> &getUses() { return Uses; }
+  const llvm::ArrayRef<ReadState> getUses() const { return Uses; }
+  const InstrDesc &getDesc() const { return Desc; }
+
+  unsigned getLatency() const { return Desc.MaxLatency; }
+
+  bool hasDependentUsers() const {
+    return llvm::any_of(
+        Defs, [](const WriteState &Def) { return Def.getNumUsers() > 0; });
+  }
+
+  unsigned getNumUsers() const {
+    unsigned NumUsers = 0;
+    for (const WriteState &Def : Defs)
+      NumUsers += Def.getNumUsers();
+    return NumUsers;
+  }
+
+  // Returns true if this instruction is a candidate for move elimination.
+  bool isOptimizableMove() const { return IsOptimizableMove; }
+  void setOptimizableMove() { IsOptimizableMove = true; }
+};
+
 /// An instruction propagated through the simulated instruction pipeline.
 ///
 /// This class is used to monitor changes to the internal state of instructions
 /// that are sent to the various components of the simulated hardware pipeline.
-class Instruction {
-  const InstrDesc &Desc;
-
+class Instruction : public InstructionBase {
   enum InstrStage {
     IS_INVALID,   // Instruction in an invalid state.
     IS_AVAILABLE, // Instruction dispatched but operands are not ready.
@@ -339,51 +383,16 @@ class Instruction {
   // Retire Unit token ID for this instruction.
   unsigned RCUTokenID;
 
-  // This field is set for instructions that are candidates for move
-  // elimination. For more information about move elimination, see the
-  // definition of RegisterMappingTracker in RegisterFile.h
-  bool IsOptimizableMove;
-
-  using UniqueDef = std::unique_ptr<WriteState>;
-  using UniqueUse = std::unique_ptr<ReadState>;
-  using VecDefs = std::vector<UniqueDef>;
-  using VecUses = std::vector<UniqueUse>;
-
-  // Output dependencies.
-  // One entry per each implicit and explicit register definition.
-  VecDefs Defs;
-
-  // Input dependencies.
-  // One entry per each implicit and explicit register use.
-  VecUses Uses;
-
 public:
   Instruction(const InstrDesc &D)
-      : Desc(D), Stage(IS_INVALID), CyclesLeft(UNKNOWN_CYCLES), RCUTokenID(0),
-        IsOptimizableMove(false) {}
+      : InstructionBase(D), Stage(IS_INVALID), CyclesLeft(UNKNOWN_CYCLES),
+        RCUTokenID(0) {}
   Instruction(const Instruction &Other) = delete;
   Instruction &operator=(const Instruction &Other) = delete;
 
-  VecDefs &getDefs() { return Defs; }
-  const VecDefs &getDefs() const { return Defs; }
-  VecUses &getUses() { return Uses; }
-  const VecUses &getUses() const { return Uses; }
-  const InstrDesc &getDesc() const { return Desc; }
   unsigned getRCUTokenID() const { return RCUTokenID; }
   int getCyclesLeft() const { return CyclesLeft; }
 
-  bool hasDependentUsers() const {
-    return llvm::any_of(
-        Defs, [](const UniqueDef &Def) { return Def->getNumUsers() > 0; });
-  }
-
-  unsigned getNumUsers() const {
-    unsigned NumUsers = 0;
-    for (const UniqueDef &Def : Defs)
-      NumUsers += Def->getNumUsers();
-    return NumUsers;
-  }
-
   // Transition to the dispatch stage, and assign a RCUToken to this
   // instruction. The RCUToken is used to track the completion of every
   // register write performed by this instruction.
@@ -407,13 +416,10 @@ public:
   bool isExecuted() const { return Stage == IS_EXECUTED; }
   bool isRetired() const { return Stage == IS_RETIRED; }
 
-  // Returns true if this instruction is a candidate for move elimination.
-  bool isOptimizableMove() const { return IsOptimizableMove; }
-  void setOptimizableMove() { IsOptimizableMove = true; }
   bool isEliminated() const {
-    return isReady() && Defs.size() &&
-           llvm::all_of(Defs,
-                        [](const UniqueDef &D) { return D->isEliminated(); });
+    return isReady() && getDefs().size() &&
+           llvm::all_of(getDefs(),
+                        [](const WriteState &W) { return W.isEliminated(); });
   }
 
   // Forces a transition from state IS_AVAILABLE to state IS_EXECUTED.
diff --git a/tools/llvm-mca/lib/InstrBuilder.cpp b/tools/llvm-mca/lib/InstrBuilder.cpp
index 3768c2e7088..3704eaf6a50 100644
--- a/tools/llvm-mca/lib/InstrBuilder.cpp
+++ b/tools/llvm-mca/lib/InstrBuilder.cpp
@@ -482,14 +482,15 @@ InstrBuilder::createInstruction(const MCInst &MCI) {
 
     // Okay, this is a register operand. Create a ReadState for it.
     assert(RegID > 0 && "Invalid register ID found!");
-    auto RS = llvm::make_unique<ReadState>(RD, RegID);
+    NewIS->getUses().emplace_back(RD, RegID);
+    ReadState &RS = NewIS->getUses().back();
 
     if (IsDepBreaking) {
       // A mask of all zeroes means: explicit input operands are not
       // independent.
       if (Mask.isNullValue()) {
         if (!RD.isImplicitRead())
-          RS->setIndependentFromDef();
+          RS.setIndependentFromDef();
       } else {
         // Check if this register operand is independent according to `Mask`.
         // Note that Mask may not have enough bits to describe all explicit and
@@ -499,11 +500,10 @@ InstrBuilder::createInstruction(const MCInst &MCI) {
         if (Mask.getBitWidth() > RD.UseIndex) {
           // Okay. This map describe register use `RD.UseIndex`.
           if (Mask[RD.UseIndex])
-            RS->setIndependentFromDef();
+            RS.setIndependentFromDef();
         }
       }
     }
-    NewIS->getUses().emplace_back(std::move(RS));
   }
 
   // Early exit if there are no writes.
@@ -530,9 +530,9 @@ InstrBuilder::createInstruction(const MCInst &MCI) {
     }
 
     assert(RegID && "Expected a valid register ID!");
-    NewIS->getDefs().emplace_back(llvm::make_unique<WriteState>(
+    NewIS->getDefs().emplace_back(
         WD, RegID, /* ClearsSuperRegs */ WriteMask[WriteIndex],
-        /* WritesZero */ IsZeroIdiom));
+        /* WritesZero */ IsZeroIdiom);
     ++WriteIndex;
   }
 
diff --git a/tools/llvm-mca/lib/Instruction.cpp b/tools/llvm-mca/lib/Instruction.cpp
index 511e7b20703..12b6e185ced 100644
--- a/tools/llvm-mca/lib/Instruction.cpp
+++ b/tools/llvm-mca/lib/Instruction.cpp
@@ -120,10 +120,10 @@ void Instruction::execute() {
   Stage = IS_EXECUTING;
 
   // Set the cycles left before the write-back stage.
-  CyclesLeft = Desc.MaxLatency;
+  CyclesLeft = getLatency();
 
-  for (UniqueDef &Def : Defs)
-    Def->onInstructionIssued();
+  for (WriteState &WS : getDefs())
+    WS.onInstructionIssued();
 
   // Transition to the "executed" stage if this is a zero-latency instruction.
   if (!CyclesLeft)
@@ -139,21 +139,21 @@ void Instruction::forceExecuted() {
 void Instruction::update() {
   assert(isDispatched() && "Unexpected instruction stage found!");
 
-  if (!all_of(Uses, [](const UniqueUse &Use) { return Use->isReady(); }))
+  if (!all_of(getUses(), [](const ReadState &Use) { return Use.isReady(); }))
     return;
 
   // A partial register write cannot complete before a dependent write.
-  auto IsDefReady = [&](const UniqueDef &Def) {
-    if (const WriteState *Write = Def->getDependentWrite()) {
+  auto IsDefReady = [&](const WriteState &Def) {
+    if (const WriteState *Write = Def.getDependentWrite()) {
       int WriteLatency = Write->getCyclesLeft();
       if (WriteLatency == UNKNOWN_CYCLES)
         return false;
-      return static_cast<unsigned>(WriteLatency) < Desc.MaxLatency;
+      return static_cast<unsigned>(WriteLatency) < getLatency();
     }
     return true;
   };
 
-  if (all_of(Defs, IsDefReady))
+  if (all_of(getDefs(), IsDefReady))
     Stage = IS_READY;
 }
 
@@ -162,8 +162,8 @@ void Instruction::cycleEvent() {
     return;
 
   if (isDispatched()) {
-    for (UniqueUse &Use : Uses)
-      Use->cycleEvent();
+    for (ReadState &Use : getUses())
+      Use.cycleEvent();
 
     update();
     return;
@@ -171,8 +171,8 @@ void Instruction::cycleEvent() {
 
   assert(isExecuting() && "Instruction not in-flight?");
   assert(CyclesLeft && "Instruction already executed?");
-  for (UniqueDef &Def : Defs)
-    Def->cycleEvent();
+  for (WriteState &Def : getDefs())
+    Def.cycleEvent();
   CyclesLeft--;
   if (!CyclesLeft)
     Stage = IS_EXECUTED;
diff --git a/tools/llvm-mca/lib/Stages/DispatchStage.cpp b/tools/llvm-mca/lib/Stages/DispatchStage.cpp
index 653f39bf5b7..0246151c64c 100644
--- a/tools/llvm-mca/lib/Stages/DispatchStage.cpp
+++ b/tools/llvm-mca/lib/Stages/DispatchStage.cpp
@@ -37,9 +37,8 @@ void DispatchStage::notifyInstructionDispatched(const InstRef &IR,
 
 bool DispatchStage::checkPRF(const InstRef &IR) const {
   SmallVector<unsigned, 4> RegDefs;
-  for (const std::unique_ptr<WriteState> &RegDef :
-       IR.getInstruction()->getDefs())
-    RegDefs.emplace_back(RegDef->getRegisterID());
+  for (const WriteState &RegDef : IR.getInstruction()->getDefs())
+    RegDefs.emplace_back(RegDef.getRegisterID());
 
   const unsigned RegisterMask = PRF.isAvailable(RegDefs);
   // A mask with all zeroes means: register files are available.
@@ -105,7 +104,7 @@ Error DispatchStage::dispatch(InstRef IR) {
   if (IS.isOptimizableMove()) {
     assert(IS.getDefs().size() == 1 && "Expected a single input!");
     assert(IS.getUses().size() == 1 && "Expected a single output!");
-    IsEliminated = PRF.tryEliminateMove(*IS.getDefs()[0], *IS.getUses()[0]);
+    IsEliminated = PRF.tryEliminateMove(IS.getDefs()[0], IS.getUses()[0]);
   }
 
   // A dependency-breaking instruction doesn't have to wait on the register
@@ -118,9 +117,9 @@ Error DispatchStage::dispatch(InstRef IR) {
   // We also don't update data dependencies for instructions that have been
   // eliminated at register renaming stage.
   if (!IsEliminated) {
-    for (std::unique_ptr<ReadState> &RS : IS.getUses()) {
-      if (!RS->isIndependentFromDef())
-        updateRAWDependencies(*RS, STI);
+    for (ReadState &RS : IS.getUses()) {
+      if (!RS.isIndependentFromDef())
+        updateRAWDependencies(RS, STI);
     }
   }
 
@@ -128,8 +127,8 @@ Error DispatchStage::dispatch(InstRef IR) {
   // at register renaming stage. That means, no physical register is allocated
   // to the instruction.
   SmallVector<unsigned, 4> RegisterFiles(PRF.getNumRegisterFiles());
-  for (std::unique_ptr<WriteState> &WS : IS.getDefs())
-    PRF.addRegisterWrite(WriteRef(IR.getSourceIndex(), WS.get()),
+  for (WriteState &WS : IS.getDefs())
+    PRF.addRegisterWrite(WriteRef(IR.getSourceIndex(), &WS),
                          RegisterFiles);
 
   // Reserve slots in the RCU, and notify the instruction that it has been
diff --git a/tools/llvm-mca/lib/Stages/RetireStage.cpp b/tools/llvm-mca/lib/Stages/RetireStage.cpp
index 3c923e4bb05..8297c9c9ea5 100644
--- a/tools/llvm-mca/lib/Stages/RetireStage.cpp
+++ b/tools/llvm-mca/lib/Stages/RetireStage.cpp
@@ -52,8 +52,8 @@ void RetireStage::notifyInstructionRetired(const InstRef &IR) const {
   llvm::SmallVector<unsigned, 4> FreedRegs(PRF.getNumRegisterFiles());
   const Instruction &Inst = *IR.getInstruction();
 
-  for (const std::unique_ptr<WriteState> &WS : Inst.getDefs())
-    PRF.removeRegisterWrite(*WS.get(), FreedRegs);
+  for (const WriteState &WS : Inst.getDefs())
+    PRF.removeRegisterWrite(WS, FreedRegs);
   notifyEvent<HWInstructionEvent>(HWInstructionRetiredEvent(IR, FreedRegs));
 }
 
-- 
GitLab


From a46fc2a08457f8ca892210d7969287dab87364f7 Mon Sep 17 00:00:00 2001
From: Volkan Keles <vkeles@apple.com>
Date: Thu, 25 Oct 2018 17:23:25 +0000
Subject: [PATCH 0578/1116] [AArch64][GlobalISel] Fix the LegalityPredicate for
 lowerIf for G_LOAD/G_STORE

Summary:
Currently, Legalizer is trying to lower G_LOAD with a vector type
that has more than two elements due to the incorrect LegalityPredicate.

This patch fixes the issue by removing the multiplication by 8
as `MemDesc.Size` already contains the size in bits.

Reviewers: dsanders, aemerson

Reviewed By: dsanders

Subscribers: rovka, javed.absar, kristof.beyls, llvm-commits

Differential Revision: https://reviews.llvm.org/D53679

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345282 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64LegalizerInfo.cpp   |  4 ++--
 .../GlobalISel/legalize-load-v4s32.mir        | 21 +++++++++++++++++++
 2 files changed, 23 insertions(+), 2 deletions(-)
 create mode 100644 test/CodeGen/AArch64/GlobalISel/legalize-load-v4s32.mir

diff --git a/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/lib/Target/AArch64/AArch64LegalizerInfo.cpp
index b3c2fbf2b15..f0648f0b071 100644
--- a/lib/Target/AArch64/AArch64LegalizerInfo.cpp
+++ b/lib/Target/AArch64/AArch64LegalizerInfo.cpp
@@ -167,7 +167,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
       .unsupportedIfMemSizeNotPow2()
       // Lower any any-extending loads left into G_ANYEXT and G_LOAD
       .lowerIf([=](const LegalityQuery &Query) {
-        return Query.Types[0].getSizeInBits() != Query.MMODescrs[0].Size * 8;
+        return Query.Types[0].getSizeInBits() != Query.MMODescrs[0].Size;
       })
       .clampNumElements(0, v2s32, v2s32);
 
@@ -185,7 +185,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
       .unsupportedIfMemSizeNotPow2()
       .lowerIf([=](const LegalityQuery &Query) {
         return Query.Types[0].isScalar() &&
-               Query.Types[0].getSizeInBits() != Query.MMODescrs[0].Size * 8;
+               Query.Types[0].getSizeInBits() != Query.MMODescrs[0].Size;
       })
       .clampNumElements(0, v2s32, v2s32);
 
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-load-v4s32.mir b/test/CodeGen/AArch64/GlobalISel/legalize-load-v4s32.mir
new file mode 100644
index 00000000000..8493bd8292c
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-load-v4s32.mir
@@ -0,0 +1,21 @@
+# RUN: not llc -march=aarch64 -o - -run-pass=legalizer -debug-only=legalizer 2>&1 %s | FileCheck %s
+# REQUIRES: asserts
+
+# CHECK: Legalize Machine IR for: load_v4s32
+# CHECK-NEXT: %{{[0-9]+}}:_(<4 x s32>) = G_LOAD %{{[0-9]+}}:_(p0)
+# CHECK-NOT: Lower
+# CHECK: unable to legalize instruction
+---
+name:            load_v4s32
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.1:
+    liveins: $x0
+
+    %0:_(p0) = COPY $x0
+    %1:_(<4 x s32>) = G_LOAD %0(p0) :: (load 16, align 4)
+    %2:_(s32), %3:_(s32), %4:_(s32), %5:_(s32) = G_UNMERGE_VALUES %1(<4 x s32>)
+    $w0 = COPY %5(s32)
+
+...
-- 
GitLab


From 681afad76c736aac54ef20f9bee47b300d925e55 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Thu, 25 Oct 2018 17:28:57 +0000
Subject: [PATCH 0579/1116] [X86] Remove some uarch tuning flags from KNL that
 look to have been inherited from SNB/IVB incorrectly

KNL is based on a modified Silvermont core so I don't think these features apply. I think the LEA flag is probably also wrong, but I'm less sure as I barely understand the 3 LEA flags we have currently.

Differential Revision: https://reviews.llvm.org/D53671

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345285 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86.td             | 4 ----
 test/CodeGen/X86/avx512-select.ll | 4 ++--
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 9a12a7237e4..100e192f181 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -823,11 +823,7 @@ def KNLFeatures : ProcessorFeatures<[], [
   FeatureXSAVEOPT,
   FeatureLAHFSAHF,
   FeatureSlow3OpsLEA,
-  FeatureFastScalarFSQRT,
-  FeatureFastSHLDRotate,
   FeatureSlowIncDec,
-  FeatureMergeToThreeWayBranch,
-  FeatureMacroFusion,
   FeatureRDRAND,
   FeatureF16C,
   FeatureFSGSBase,
diff --git a/test/CodeGen/X86/avx512-select.ll b/test/CodeGen/X86/avx512-select.ll
index 2ad2251bc1a..008a3b44ce0 100644
--- a/test/CodeGen/X86/avx512-select.ll
+++ b/test/CodeGen/X86/avx512-select.ll
@@ -16,8 +16,8 @@ define <16 x i32> @select00(i32 %a, <16 x i32> %b) nounwind {
 ;
 ; X64-LABEL: select00:
 ; X64:       # %bb.0:
-; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X64-NEXT:    cmpl $255, %edi
+; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X64-NEXT:    je .LBB0_2
 ; X64-NEXT:  # %bb.1:
 ; X64-NEXT:    vmovdqa64 %zmm0, %zmm1
@@ -44,8 +44,8 @@ define <8 x i64> @select01(i32 %a, <8 x i64> %b) nounwind {
 ;
 ; X64-LABEL: select01:
 ; X64:       # %bb.0:
-; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X64-NEXT:    cmpl $255, %edi
+; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X64-NEXT:    je .LBB1_2
 ; X64-NEXT:  # %bb.1:
 ; X64-NEXT:    vmovdqa64 %zmm0, %zmm1
-- 
GitLab


From e662a68784ea276e97f40d2dd9342938cf12694e Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Thu, 25 Oct 2018 17:29:00 +0000
Subject: [PATCH 0580/1116] [X86] Remove ProcIntelKNL and replace with a
 SlowPMADDWD flag to use in the one place it was checked.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345286 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86.td              | 11 ++++++-----
 lib/Target/X86/X86ISelLowering.cpp |  2 +-
 lib/Target/X86/X86Subtarget.h      |  7 +++++--
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 100e192f181..94da74225b1 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -98,6 +98,9 @@ def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true",
                                        "SHLD instruction is slow">;
 def FeatureSlowPMULLD : SubtargetFeature<"slow-pmulld", "IsPMULLDSlow", "true",
                                         "PMULLD instruction is slow">;
+def FeatureSlowPMADDWD : SubtargetFeature<"slow-pmaddwd", "IsPMADDWDSlow",
+                                          "true",
+                                          "PMADDWD is slower than PMULLD">;
 // FIXME: This should not apply to CPUs that do not have SSE.
 def FeatureSlowUAMem16 : SubtargetFeature<"slow-unaligned-mem-16",
                                 "IsUAMem16Slow", "true",
@@ -460,8 +463,6 @@ def ProcIntelGLP  : SubtargetFeature<"glp", "X86ProcFamily", "IntelGLP",
                     "Intel Goldmont Plus processors">;
 def ProcIntelTRM  : SubtargetFeature<"tremont", "X86ProcFamily", "IntelTRM",
                     "Intel Tremont processors">;
-def ProcIntelKNL  : SubtargetFeature<"knl", "X86ProcFamily",
-                    "IntelKNL", "Intel Knights Landing processors">;
 
 class Proc<string Name, list<SubtargetFeature> Features>
  : ProcessorModel<Name, GenericModel, Features>;
@@ -845,19 +846,19 @@ def KNLFeatures : ProcessorFeatures<[], [
 // FIXME: define KNL model
 class KnightsLandingProc<string Name> : ProcModel<Name, HaswellModel,
                                                   KNLFeatures.Value, [
-  ProcIntelKNL,
   FeatureSlowTwoMemOps,
   FeatureFastPartialYMMorZMMWrite,
-  FeatureHasFastGather
+  FeatureHasFastGather,
+  FeatureSlowPMADDWD
 ]>;
 def : KnightsLandingProc<"knl">;
 
 class KnightsMillProc<string Name> : ProcModel<Name, HaswellModel,
                                                KNLFeatures.Value, [
-  ProcIntelKNL,
   FeatureSlowTwoMemOps,
   FeatureFastPartialYMMorZMMWrite,
   FeatureHasFastGather,
+  FeatureSlowPMADDWD,
   FeatureVPOPCNTDQ
 ]>;
 def : KnightsMillProc<"knm">; // TODO Add AVX5124FMAPS/AVX5124VNNIW features
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index f9f8fb4a419..1664a312aef 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -34361,7 +34361,7 @@ static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
   if (!Subtarget.hasSSE2())
     return SDValue();
 
-  if (Subtarget.getProcFamily() == X86Subtarget::IntelKNL)
+  if (Subtarget.isPMADDWDSlow())
     return SDValue();
 
   EVT VT = N->getValueType(0);
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index 47b51376e5e..b1103f823e7 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -60,8 +60,7 @@ public:
     IntelSLM,
     IntelGLM,
     IntelGLP,
-    IntelTRM,
-    IntelKNL,
+    IntelTRM
   };
 
 protected:
@@ -224,6 +223,9 @@ protected:
   //  PMULUDQ.
   bool IsPMULLDSlow = false;
 
+  /// True if the PMADDWD instruction is slow compared to PMULLD.
+  bool IsPMADDWDSlow = false;
+
   /// True if unaligned memory accesses of 16-bytes are slow.
   bool IsUAMem16Slow = false;
 
@@ -613,6 +615,7 @@ public:
   bool hasPTWRITE() const { return HasPTWRITE; }
   bool isSHLDSlow() const { return IsSHLDSlow; }
   bool isPMULLDSlow() const { return IsPMULLDSlow; }
+  bool isPMADDWDSlow() const { return IsPMADDWDSlow; }
   bool isUnalignedMem16Slow() const { return IsUAMem16Slow; }
   bool isUnalignedMem32Slow() const { return IsUAMem32Slow; }
   int getGatherOverhead() const { return GatherOverhead; }
-- 
GitLab


From af0086ca57f9221406e3a47e73f362f9725101a6 Mon Sep 17 00:00:00 2001
From: Volkan Keles <vkeles@apple.com>
Date: Thu, 25 Oct 2018 17:37:07 +0000
Subject: [PATCH 0581/1116] [GISel] LegalizerInfo: Rename MemDesc::Size to
 SizeInBits to make the value clearer

Requested in D53679.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345288 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/GlobalISel/LegalizerInfo.h | 2 +-
 lib/CodeGen/GlobalISel/LegalityPredicates.cpp   | 4 ++--
 lib/CodeGen/GlobalISel/LegalizerInfo.cpp        | 2 +-
 lib/Target/AArch64/AArch64LegalizerInfo.cpp     | 4 ++--
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
index a8c26082f22..e0ea5755387 100644
--- a/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
+++ b/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
@@ -121,7 +121,7 @@ struct LegalityQuery {
   ArrayRef<LLT> Types;
 
   struct MemDesc {
-    uint64_t Size;
+    uint64_t SizeInBits;
     AtomicOrdering Ordering;
   };
 
diff --git a/lib/CodeGen/GlobalISel/LegalityPredicates.cpp b/lib/CodeGen/GlobalISel/LegalityPredicates.cpp
index 344f573a67f..94eab9ae00c 100644
--- a/lib/CodeGen/GlobalISel/LegalityPredicates.cpp
+++ b/lib/CodeGen/GlobalISel/LegalityPredicates.cpp
@@ -45,7 +45,7 @@ LegalityPredicate LegalityPredicates::typePairAndMemSizeInSet(
   SmallVector<TypePairAndMemSize, 4> TypesAndMemSize = TypesAndMemSizeInit;
   return [=](const LegalityQuery &Query) {
     TypePairAndMemSize Match = {Query.Types[TypeIdx0], Query.Types[TypeIdx1],
-                                Query.MMODescrs[MMOIdx].Size};
+                                Query.MMODescrs[MMOIdx].SizeInBits};
     return std::find(TypesAndMemSize.begin(), TypesAndMemSize.end(), Match) !=
            TypesAndMemSize.end();
   };
@@ -82,7 +82,7 @@ LegalityPredicate LegalityPredicates::sizeNotPow2(unsigned TypeIdx) {
 
 LegalityPredicate LegalityPredicates::memSizeInBytesNotPow2(unsigned MMOIdx) {
   return [=](const LegalityQuery &Query) {
-    return !isPowerOf2_32(Query.MMODescrs[MMOIdx].Size /* In Bytes */);
+    return !isPowerOf2_32(Query.MMODescrs[MMOIdx].SizeInBits / 8);
   };
 }
 
diff --git a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
index 1bfede097bd..b6ed2654bd0 100644
--- a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
+++ b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
@@ -51,7 +51,7 @@ raw_ostream &LegalityQuery::print(raw_ostream &OS) const {
 
   OS << Opcode << ", MMOs={";
   for (const auto &MMODescr : MMODescrs) {
-    OS << MMODescr.Size << ", ";
+    OS << MMODescr.SizeInBits << ", ";
   }
   OS << "}";
 
diff --git a/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/lib/Target/AArch64/AArch64LegalizerInfo.cpp
index f0648f0b071..474516ff2cc 100644
--- a/lib/Target/AArch64/AArch64LegalizerInfo.cpp
+++ b/lib/Target/AArch64/AArch64LegalizerInfo.cpp
@@ -167,7 +167,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
       .unsupportedIfMemSizeNotPow2()
       // Lower any any-extending loads left into G_ANYEXT and G_LOAD
       .lowerIf([=](const LegalityQuery &Query) {
-        return Query.Types[0].getSizeInBits() != Query.MMODescrs[0].Size;
+        return Query.Types[0].getSizeInBits() != Query.MMODescrs[0].SizeInBits;
       })
       .clampNumElements(0, v2s32, v2s32);
 
@@ -185,7 +185,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
       .unsupportedIfMemSizeNotPow2()
       .lowerIf([=](const LegalityQuery &Query) {
         return Query.Types[0].isScalar() &&
-               Query.Types[0].getSizeInBits() != Query.MMODescrs[0].Size;
+               Query.Types[0].getSizeInBits() != Query.MMODescrs[0].SizeInBits;
       })
       .clampNumElements(0, v2s32, v2s32);
 
-- 
GitLab


From 4532ef4dfdf0126dacf037e38a304b3a5217356a Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 25 Oct 2018 17:43:36 +0000
Subject: [PATCH 0582/1116] [LegalizeDAG] Remove dead SINT_TO_FP legalization
 code

As noticed on D52965, the SINT_TO_FP i64 to f32 legalization code has been dead for years - protected by an assert.

Differential Revision: https://reviews.llvm.org/D53703

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345290 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 73 ++++++------------------
 1 file changed, 19 insertions(+), 54 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 30e9413dc0a..e506f7b76b1 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2374,60 +2374,25 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned, SDValue Op0,
     LLVM_DEBUG(dbgs() << "Converting unsigned i64 to f32\n");
     // For unsigned conversions, convert them to signed conversions using the
     // algorithm from the x86_64 __floatundidf in compiler_rt.
-    if (!isSigned) {
-      SDValue Fast = DAG.getNode(ISD::SINT_TO_FP, dl, DestVT, Op0);
-
-      SDValue ShiftConst = DAG.getConstant(1, dl, ShiftVT);
-      SDValue Shr = DAG.getNode(ISD::SRL, dl, SrcVT, Op0, ShiftConst);
-      SDValue AndConst = DAG.getConstant(1, dl, SrcVT);
-      SDValue And = DAG.getNode(ISD::AND, dl, SrcVT, Op0, AndConst);
-      SDValue Or = DAG.getNode(ISD::OR, dl, SrcVT, And, Shr);
-
-      SDValue SignCvt = DAG.getNode(ISD::SINT_TO_FP, dl, DestVT, Or);
-      SDValue Slow = DAG.getNode(ISD::FADD, dl, DestVT, SignCvt, SignCvt);
-
-      // TODO: This really should be implemented using a branch rather than a
-      // select.  We happen to get lucky and machinesink does the right
-      // thing most of the time.  This would be a good candidate for a
-      // pseudo-op, or, even better, for whole-function isel.
-      SDValue SignBitTest =
-          DAG.getSetCC(dl, getSetCCResultType(SrcVT), Op0,
-                       DAG.getConstant(0, dl, SrcVT), ISD::SETLT);
-      return DAG.getSelect(dl, DestVT, SignBitTest, Slow, Fast);
-    }
-
-    // Otherwise, implement the fully general conversion.
-
-    SDValue And = DAG.getNode(ISD::AND, dl, MVT::i64, Op0,
-         DAG.getConstant(UINT64_C(0xfffffffffffff800), dl, MVT::i64));
-    SDValue Or = DAG.getNode(ISD::OR, dl, MVT::i64, And,
-         DAG.getConstant(UINT64_C(0x800), dl, MVT::i64));
-    SDValue And2 = DAG.getNode(ISD::AND, dl, MVT::i64, Op0,
-         DAG.getConstant(UINT64_C(0x7ff), dl, MVT::i64));
-    SDValue Ne = DAG.getSetCC(dl, getSetCCResultType(MVT::i64), And2,
-                              DAG.getConstant(UINT64_C(0), dl, MVT::i64),
-                              ISD::SETNE);
-    SDValue Sel = DAG.getSelect(dl, MVT::i64, Ne, Or, Op0);
-    SDValue Ge = DAG.getSetCC(dl, getSetCCResultType(MVT::i64), Op0,
-                              DAG.getConstant(UINT64_C(0x0020000000000000), dl,
-                                              MVT::i64),
-                              ISD::SETUGE);
-    SDValue Sel2 = DAG.getSelect(dl, MVT::i64, Ge, Sel, Op0);
-    EVT SHVT = TLI.getShiftAmountTy(Sel2.getValueType(), DAG.getDataLayout());
-
-    SDValue Sh = DAG.getNode(ISD::SRL, dl, MVT::i64, Sel2,
-                             DAG.getConstant(32, dl, SHVT));
-    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Sh);
-    SDValue Fcvt = DAG.getNode(ISD::UINT_TO_FP, dl, MVT::f64, Trunc);
-    SDValue TwoP32 =
-      DAG.getConstantFP(BitsToDouble(UINT64_C(0x41f0000000000000)), dl,
-                        MVT::f64);
-    SDValue Fmul = DAG.getNode(ISD::FMUL, dl, MVT::f64, TwoP32, Fcvt);
-    SDValue Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Sel2);
-    SDValue Fcvt2 = DAG.getNode(ISD::UINT_TO_FP, dl, MVT::f64, Lo);
-    SDValue Fadd = DAG.getNode(ISD::FADD, dl, MVT::f64, Fmul, Fcvt2);
-    return DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, Fadd,
-                       DAG.getIntPtrConstant(0, dl));
+    SDValue Fast = DAG.getNode(ISD::SINT_TO_FP, dl, DestVT, Op0);
+
+    SDValue ShiftConst = DAG.getConstant(1, dl, ShiftVT);
+    SDValue Shr = DAG.getNode(ISD::SRL, dl, SrcVT, Op0, ShiftConst);
+    SDValue AndConst = DAG.getConstant(1, dl, SrcVT);
+    SDValue And = DAG.getNode(ISD::AND, dl, SrcVT, Op0, AndConst);
+    SDValue Or = DAG.getNode(ISD::OR, dl, SrcVT, And, Shr);
+
+    SDValue SignCvt = DAG.getNode(ISD::SINT_TO_FP, dl, DestVT, Or);
+    SDValue Slow = DAG.getNode(ISD::FADD, dl, DestVT, SignCvt, SignCvt);
+
+    // TODO: This really should be implemented using a branch rather than a
+    // select.  We happen to get lucky and machinesink does the right
+    // thing most of the time.  This would be a good candidate for a
+    // pseudo-op, or, even better, for whole-function isel.
+    SDValue SignBitTest =
+        DAG.getSetCC(dl, getSetCCResultType(SrcVT), Op0,
+                     DAG.getConstant(0, dl, SrcVT), ISD::SETLT);
+    return DAG.getSelect(dl, DestVT, SignBitTest, Slow, Fast);
   }
 
   SDValue Tmp1 = DAG.getNode(ISD::SINT_TO_FP, dl, DestVT, Op0);
-- 
GitLab


From 32b8b6b1e18bd940dc921b30fe0d532b028f607f Mon Sep 17 00:00:00 2001
From: Volkan Keles <vkeles@apple.com>
Date: Thu, 25 Oct 2018 17:52:19 +0000
Subject: [PATCH 0583/1116] [GlobalISel] LegalizerHelper: Fix the incorrect
 alignment when splitting loads/stores in narrowScalar

Reviewers: dsanders, bogner, jpaquette, aemerson, ab, paquette

Reviewed By: dsanders

Subscribers: rovka, kristof.beyls, javed.absar, llvm-commits

Differential Revision: https://reviews.llvm.org/D53664

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345292 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/GlobalISel/LegalizerHelper.cpp    | 12 +++----
 .../legalize-load-store-s128-unaligned.mir    | 31 +++++++++++++++++++
 2 files changed, 37 insertions(+), 6 deletions(-)
 create mode 100644 test/CodeGen/AArch64/GlobalISel/legalize-load-store-s128-unaligned.mir

diff --git a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index c9ed97aa390..516f5ce4343 100644
--- a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -467,12 +467,12 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
       unsigned DstReg = MRI.createGenericVirtualRegister(NarrowTy);
       unsigned SrcReg = 0;
       unsigned Adjustment = i * NarrowSize / 8;
+      unsigned Alignment = MinAlign(MMO.getAlignment(), Adjustment);
 
       MachineMemOperand *SplitMMO = MIRBuilder.getMF().getMachineMemOperand(
           MMO.getPointerInfo().getWithOffset(Adjustment), MMO.getFlags(),
-          NarrowSize / 8, i == 0 ? MMO.getAlignment() : NarrowSize / 8,
-          MMO.getAAInfo(), MMO.getRanges(), MMO.getSyncScopeID(),
-          MMO.getOrdering(), MMO.getFailureOrdering());
+          NarrowSize / 8, Alignment, MMO.getAAInfo(), MMO.getRanges(),
+          MMO.getSyncScopeID(), MMO.getOrdering(), MMO.getFailureOrdering());
 
       MIRBuilder.materializeGEP(SrcReg, MI.getOperand(1).getReg(), OffsetTy,
                                 Adjustment);
@@ -509,12 +509,12 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
     for (int i = 0; i < NumParts; ++i) {
       unsigned DstReg = 0;
       unsigned Adjustment = i * NarrowSize / 8;
+      unsigned Alignment = MinAlign(MMO.getAlignment(), Adjustment);
 
       MachineMemOperand *SplitMMO = MIRBuilder.getMF().getMachineMemOperand(
           MMO.getPointerInfo().getWithOffset(Adjustment), MMO.getFlags(),
-          NarrowSize / 8, i == 0 ? MMO.getAlignment() : NarrowSize / 8,
-          MMO.getAAInfo(), MMO.getRanges(), MMO.getSyncScopeID(),
-          MMO.getOrdering(), MMO.getFailureOrdering());
+          NarrowSize / 8, Alignment, MMO.getAAInfo(), MMO.getRanges(),
+          MMO.getSyncScopeID(), MMO.getOrdering(), MMO.getFailureOrdering());
 
       MIRBuilder.materializeGEP(DstReg, MI.getOperand(1).getReg(), OffsetTy,
                                 Adjustment);
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-load-store-s128-unaligned.mir b/test/CodeGen/AArch64/GlobalISel/legalize-load-store-s128-unaligned.mir
new file mode 100644
index 00000000000..33a6c23eb36
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-load-store-s128-unaligned.mir
@@ -0,0 +1,31 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=aarch64 -o - -run-pass=legalizer %s | FileCheck %s
+---
+name:            loadstore128_align4
+exposesReturnsTwice: false
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.1:
+    liveins: $x0, $x1
+
+    ; CHECK-LABEL: name: loadstore128_align4
+    ; CHECK: liveins: $x0, $x1
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $x1
+    ; CHECK: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load 8, align 4)
+    ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64)
+    ; CHECK: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[GEP]](p0) :: (load 8, align 4)
+    ; CHECK: G_STORE [[LOAD]](s64), [[COPY1]](p0) :: (store 8, align 4)
+    ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; CHECK: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[COPY1]], [[C1]](s64)
+    ; CHECK: G_STORE [[LOAD1]](s64), [[GEP1]](p0) :: (store 8, align 4)
+    ; CHECK: RET_ReallyLR
+    %0:_(p0) = COPY $x0
+    %1:_(p0) = COPY $x1
+    %2:_(s128) = G_LOAD %0(p0) :: (load 16, align 4)
+    G_STORE %2(s128), %1(p0) :: (store 16, align 4)
+    RET_ReallyLR
+
+...
-- 
GitLab


From f6da6905251e36f990a3689f769153fdc3130b4d Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Thu, 25 Oct 2018 18:06:25 +0000
Subject: [PATCH 0584/1116] [X86] Add KNL command lines to movmsk-cmp.ll.

Some of this code looks pretty bad and we should probably still be using movmskb more with avx512f.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345293 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/movmsk-cmp.ll | 1050 ++++++++++++++++++++++++++++++++
 1 file changed, 1050 insertions(+)

diff --git a/test/CodeGen/X86/movmsk-cmp.ll b/test/CodeGen/X86/movmsk-cmp.ll
index 9f55ca31b1b..4b33b04a8b2 100644
--- a/test/CodeGen/X86/movmsk-cmp.ll
+++ b/test/CodeGen/X86/movmsk-cmp.ll
@@ -2,6 +2,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s  --check-prefix=AVX --check-prefix=AVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s  --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s  --check-prefix=KNL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s  --check-prefix=SKX
 
 define i1 @allones_v16i8_sign(<16 x i8> %arg) {
@@ -19,6 +20,17 @@ define i1 @allones_v16i8_sign(<16 x i8> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allones_v16i8_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kortestw %k0, %k0
+; KNL-NEXT:    setb %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v16i8_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovb2m %xmm0, %k0
@@ -46,6 +58,17 @@ define i1 @allzeros_v16i8_sign(<16 x i8> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v16i8_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kortestw %k0, %k0
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v16i8_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovb2m %xmm0, %k0
@@ -92,6 +115,24 @@ define i1 @allones_v32i8_sign(<32 x i8> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v32i8_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    shll $16, %ecx
+; KNL-NEXT:    orl %eax, %ecx
+; KNL-NEXT:    cmpl $-1, %ecx
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v32i8_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovb2m %ymm0, %k0
@@ -137,6 +178,23 @@ define i1 @allzeros_v32i8_sign(<32 x i8> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v32i8_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    shll $16, %ecx
+; KNL-NEXT:    orl %eax, %ecx
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v32i8_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovb2m %ymm0, %k0
@@ -202,6 +260,36 @@ define i1 @allones_v64i8_sign(<64 x i8> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v64i8_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; KNL-NEXT:    vpcmpgtb %ymm0, %ymm2, %ymm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm3
+; KNL-NEXT:    vptestmd %zmm3, %zmm3, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    shll $16, %ecx
+; KNL-NEXT:    orl %eax, %ecx
+; KNL-NEXT:    vpcmpgtb %ymm1, %ymm2, %ymm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %edx
+; KNL-NEXT:    shll $16, %edx
+; KNL-NEXT:    orl %eax, %edx
+; KNL-NEXT:    shlq $32, %rdx
+; KNL-NEXT:    orq %rcx, %rdx
+; KNL-NEXT:    cmpq $-1, %rdx
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v64i8_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovb2m %zmm0, %k0
@@ -264,6 +352,35 @@ define i1 @allzeros_v64i8_sign(<64 x i8> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v64i8_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; KNL-NEXT:    vpcmpgtb %ymm0, %ymm2, %ymm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm3
+; KNL-NEXT:    vptestmd %zmm3, %zmm3, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    shll $16, %ecx
+; KNL-NEXT:    orl %eax, %ecx
+; KNL-NEXT:    vpcmpgtb %ymm1, %ymm2, %ymm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %edx
+; KNL-NEXT:    shll $16, %edx
+; KNL-NEXT:    orl %eax, %edx
+; KNL-NEXT:    shlq $32, %rdx
+; KNL-NEXT:    orq %rcx, %rdx
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v64i8_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovb2m %zmm0, %k0
@@ -298,6 +415,18 @@ define i1 @allones_v8i16_sign(<8 x i16> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allones_v8i16_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
+; KNL-NEXT:    vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    cmpb $-1, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v8i16_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovw2m %xmm0, %k0
@@ -331,6 +460,18 @@ define i1 @allzeros_v8i16_sign(<8 x i16> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v8i16_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
+; KNL-NEXT:    vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    testb %al, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v8i16_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovw2m %xmm0, %k0
@@ -381,6 +522,17 @@ define i1 @allones_v16i16_sign(<16 x i16> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v16i16_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
+; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kortestw %k0, %k0
+; KNL-NEXT:    setb %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v16i16_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovw2m %ymm0, %k0
@@ -432,6 +584,17 @@ define i1 @allzeros_v16i16_sign(<16 x i16> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v16i16_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
+; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kortestw %k0, %k0
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v16i16_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovw2m %ymm0, %k0
@@ -499,6 +662,24 @@ define i1 @allones_v32i16_sign(<32 x i16> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v32i16_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; KNL-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm0
+; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpcmpgtw %ymm1, %ymm2, %ymm0
+; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    shll $16, %ecx
+; KNL-NEXT:    orl %eax, %ecx
+; KNL-NEXT:    cmpl $-1, %ecx
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v32i16_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovw2m %zmm0, %k0
@@ -564,6 +745,23 @@ define i1 @allzeros_v32i16_sign(<32 x i16> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v32i16_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; KNL-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm0
+; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpcmpgtw %ymm1, %ymm2, %ymm0
+; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    shll $16, %ecx
+; KNL-NEXT:    orl %eax, %ecx
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v32i16_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovw2m %zmm0, %k0
@@ -592,6 +790,18 @@ define i1 @allones_v4i32_sign(<4 x i32> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allones_v4i32_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtd %zmm0, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    andb $15, %al
+; KNL-NEXT:    cmpb $15, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v4i32_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovd2m %xmm0, %k0
@@ -621,6 +831,17 @@ define i1 @allzeros_v4i32_sign(<4 x i32> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v4i32_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtd %zmm0, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    testb $15, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v4i32_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovd2m %xmm0, %k0
@@ -656,6 +877,17 @@ define i1 @allones_v8i32_sign(<8 x i32> %arg) {
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allones_v8i32_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtd %zmm0, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    cmpb $-1, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v8i32_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovd2m %ymm0, %k0
@@ -691,6 +923,17 @@ define i1 @allzeros_v8i32_sign(<8 x i32> %arg) {
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v8i32_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtd %zmm0, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    testb %al, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v8i32_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovd2m %ymm0, %k0
@@ -756,6 +999,15 @@ define i1 @allones_v16i32_sign(<16 x i32> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v16i32_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtd %zmm0, %zmm1, %k0
+; KNL-NEXT:    kortestw %k0, %k0
+; KNL-NEXT:    setb %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v16i32_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovd2m %zmm0, %k0
@@ -821,6 +1073,15 @@ define i1 @allzeros_v16i32_sign(<16 x i32> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v16i32_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtd %zmm0, %zmm1, %k0
+; KNL-NEXT:    kortestw %k0, %k0
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v16i32_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovd2m %zmm0, %k0
@@ -870,6 +1131,18 @@ define i1 @allones_v4i64_sign(<4 x i64> %arg) {
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allones_v4i64_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtq %zmm0, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    andb $15, %al
+; KNL-NEXT:    cmpb $15, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v4i64_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovq2m %ymm0, %k0
@@ -921,6 +1194,17 @@ define i1 @allzeros_v4i64_sign(<4 x i64> %arg) {
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v4i64_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtq %zmm0, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    testb $15, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v4i64_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovq2m %ymm0, %k0
@@ -1015,6 +1299,16 @@ define i1 @allones_v8i64_sign(<8 x i64> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v8i64_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtq %zmm0, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    cmpb $-1, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v8i64_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovq2m %zmm0, %k0
@@ -1108,6 +1402,16 @@ define i1 @allzeros_v8i64_sign(<8 x i64> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v8i64_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtq %zmm0, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    testb %al, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v8i64_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovq2m %zmm0, %k0
@@ -1138,6 +1442,18 @@ define i1 @allones_v16i8_and1(<16 x i8> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allones_v16i8_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; KNL-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kortestw %k0, %k0
+; KNL-NEXT:    setb %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v16i8_and1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmb {{.*}}(%rip), %xmm0, %k0
@@ -1168,6 +1484,18 @@ define i1 @allzeros_v16i8_and1(<16 x i8> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v16i8_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; KNL-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kortestw %k0, %k0
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v16i8_and1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmb {{.*}}(%rip), %xmm0, %k0
@@ -1222,6 +1550,25 @@ define i1 @allones_v32i8_and1(<32 x i8> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v32i8_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; KNL-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    shll $16, %ecx
+; KNL-NEXT:    orl %eax, %ecx
+; KNL-NEXT:    cmpl $-1, %ecx
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v32i8_and1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmb {{.*}}(%rip), %ymm0, %k0
@@ -1275,6 +1622,24 @@ define i1 @allzeros_v32i8_and1(<32 x i8> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v32i8_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; KNL-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    shll $16, %ecx
+; KNL-NEXT:    orl %eax, %ecx
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v32i8_and1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmb {{.*}}(%rip), %ymm0, %k0
@@ -1355,6 +1720,38 @@ define i1 @allones_v64i8_and1(<64 x i8> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v64i8_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; KNL-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; KNL-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; KNL-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm3
+; KNL-NEXT:    vptestmd %zmm3, %zmm3, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    shll $16, %ecx
+; KNL-NEXT:    orl %eax, %ecx
+; KNL-NEXT:    vpcmpeqb %ymm2, %ymm1, %ymm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %edx
+; KNL-NEXT:    shll $16, %edx
+; KNL-NEXT:    orl %eax, %edx
+; KNL-NEXT:    shlq $32, %rdx
+; KNL-NEXT:    orq %rcx, %rdx
+; KNL-NEXT:    cmpq $-1, %rdx
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v64i8_and1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmb {{.*}}(%rip), %zmm0, %k0
@@ -1432,6 +1829,37 @@ define i1 @allzeros_v64i8_and1(<64 x i8> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v64i8_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; KNL-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; KNL-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; KNL-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm3
+; KNL-NEXT:    vptestmd %zmm3, %zmm3, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    shll $16, %ecx
+; KNL-NEXT:    orl %eax, %ecx
+; KNL-NEXT:    vpcmpeqb %ymm2, %ymm1, %ymm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %edx
+; KNL-NEXT:    shll $16, %edx
+; KNL-NEXT:    orl %eax, %edx
+; KNL-NEXT:    shlq $32, %rdx
+; KNL-NEXT:    orq %rcx, %rdx
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v64i8_and1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmb {{.*}}(%rip), %zmm0, %k0
@@ -1469,6 +1897,19 @@ define i1 @allones_v8i16_and1(<8 x i16> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allones_v8i16_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
+; KNL-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    cmpb $-1, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v8i16_and1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmw {{.*}}(%rip), %xmm0, %k0
@@ -1505,6 +1946,19 @@ define i1 @allzeros_v8i16_and1(<8 x i16> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v8i16_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
+; KNL-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    testb %al, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v8i16_and1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmw {{.*}}(%rip), %xmm0, %k0
@@ -1562,6 +2016,18 @@ define i1 @allones_v16i16_and1(<16 x i16> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v16i16_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; KNL-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kortestw %k0, %k0
+; KNL-NEXT:    setb %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v16i16_and1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmw {{.*}}(%rip), %ymm0, %k0
@@ -1641,6 +2107,26 @@ define i1 @allones_v32i16_and1(<32 x i16> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v32i16_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; KNL-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; KNL-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; KNL-NEXT:    vpcmpeqw %ymm2, %ymm0, %ymm0
+; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpcmpeqw %ymm2, %ymm1, %ymm0
+; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    shll $16, %ecx
+; KNL-NEXT:    orl %eax, %ecx
+; KNL-NEXT:    cmpl $-1, %ecx
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v32i16_and1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmw {{.*}}(%rip), %zmm0, %k0
@@ -1718,6 +2204,25 @@ define i1 @allzeros_v32i16_and1(<32 x i16> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v32i16_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; KNL-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; KNL-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; KNL-NEXT:    vpcmpeqw %ymm2, %ymm0, %ymm0
+; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpcmpeqw %ymm2, %ymm1, %ymm0
+; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    shll $16, %ecx
+; KNL-NEXT:    orl %eax, %ecx
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v32i16_and1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmw {{.*}}(%rip), %zmm0, %k0
@@ -1776,6 +2281,18 @@ define i1 @allzeros_v16i16_and1(<16 x i16> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v16i16_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; KNL-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kortestw %k0, %k0
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v16i16_and1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmw {{.*}}(%rip), %ymm0, %k0
@@ -1807,6 +2324,18 @@ define i1 @allones_v4i32_and1(<4 x i32> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allones_v4i32_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; KNL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
+; KNL-NEXT:    vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    andb $15, %al
+; KNL-NEXT:    cmpb $15, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v4i32_and1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
@@ -1840,6 +2369,17 @@ define i1 @allzeros_v4i32_and1(<4 x i32> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v4i32_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; KNL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
+; KNL-NEXT:    vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    testb $15, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v4i32_and1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
@@ -1891,6 +2431,17 @@ define i1 @allones_v8i32_and1(<8 x i32> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v8i32_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; KNL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
+; KNL-NEXT:    vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    cmpb $-1, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v8i32_and1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
@@ -1942,6 +2493,17 @@ define i1 @allzeros_v8i32_and1(<8 x i32> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v8i32_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; KNL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
+; KNL-NEXT:    vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    testb %al, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v8i32_and1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
@@ -2020,6 +2582,15 @@ define i1 @allones_v16i32_and1(<16 x i32> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v16i32_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; KNL-NEXT:    vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT:    kortestw %k0, %k0
+; KNL-NEXT:    setb %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v16i32_and1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
@@ -2098,6 +2669,15 @@ define i1 @allzeros_v16i32_and1(<16 x i32> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v16i32_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; KNL-NEXT:    vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT:    kortestw %k0, %k0
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v16i32_and1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
@@ -2130,6 +2710,18 @@ define i1 @allones_v2i64_and1(<2 x i64> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allones_v2i64_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; KNL-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,1]
+; KNL-NEXT:    vptestmq %zmm1, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    andb $3, %al
+; KNL-NEXT:    cmpb $3, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v2i64_and1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmq {{.*}}(%rip), %xmm0, %k0
@@ -2162,6 +2754,17 @@ define i1 @allzeros_v2i64_and1(<2 x i64> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v2i64_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; KNL-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,1]
+; KNL-NEXT:    vptestmq %zmm1, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    testb $3, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v2i64_and1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmq {{.*}}(%rip), %xmm0, %k0
@@ -2215,6 +2818,18 @@ define i1 @allones_v4i64_and1(<4 x i64> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v4i64_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; KNL-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1]
+; KNL-NEXT:    vptestmq %zmm1, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    andb $15, %al
+; KNL-NEXT:    cmpb $15, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v4i64_and1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmq {{.*}}(%rip){1to4}, %ymm0, %k0
@@ -2270,6 +2885,17 @@ define i1 @allzeros_v4i64_and1(<4 x i64> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v4i64_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; KNL-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1]
+; KNL-NEXT:    vptestmq %zmm1, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    testb $15, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v4i64_and1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmq {{.*}}(%rip){1to4}, %ymm0, %k0
@@ -2355,6 +2981,15 @@ define i1 @allones_v8i64_and1(<8 x i64> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v8i64_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vptestmq {{.*}}(%rip){1to8}, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    cmpb $-1, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v8i64_and1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmq {{.*}}(%rip){1to8}, %zmm0, %k0
@@ -2439,6 +3074,15 @@ define i1 @allzeros_v8i64_and1(<8 x i64> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v8i64_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vptestmq {{.*}}(%rip){1to8}, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    testb %al, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v8i64_and1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmq {{.*}}(%rip){1to8}, %zmm0, %k0
@@ -2470,6 +3114,18 @@ define i1 @allones_v16i8_and4(<16 x i8> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allones_v16i8_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; KNL-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kortestw %k0, %k0
+; KNL-NEXT:    setb %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v16i8_and4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmb {{.*}}(%rip), %xmm0, %k0
@@ -2500,6 +3156,18 @@ define i1 @allzeros_v16i8_and4(<16 x i8> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v16i8_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; KNL-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kortestw %k0, %k0
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v16i8_and4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmb {{.*}}(%rip), %xmm0, %k0
@@ -2554,6 +3222,25 @@ define i1 @allones_v32i8_and4(<32 x i8> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v32i8_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; KNL-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    shll $16, %ecx
+; KNL-NEXT:    orl %eax, %ecx
+; KNL-NEXT:    cmpl $-1, %ecx
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v32i8_and4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmb {{.*}}(%rip), %ymm0, %k0
@@ -2607,6 +3294,24 @@ define i1 @allzeros_v32i8_and4(<32 x i8> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v32i8_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; KNL-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    shll $16, %ecx
+; KNL-NEXT:    orl %eax, %ecx
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v32i8_and4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmb {{.*}}(%rip), %ymm0, %k0
@@ -2687,6 +3392,38 @@ define i1 @allones_v64i8_and4(<64 x i8> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v64i8_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; KNL-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; KNL-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; KNL-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm3
+; KNL-NEXT:    vptestmd %zmm3, %zmm3, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    shll $16, %ecx
+; KNL-NEXT:    orl %eax, %ecx
+; KNL-NEXT:    vpcmpeqb %ymm2, %ymm1, %ymm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %edx
+; KNL-NEXT:    shll $16, %edx
+; KNL-NEXT:    orl %eax, %edx
+; KNL-NEXT:    shlq $32, %rdx
+; KNL-NEXT:    orq %rcx, %rdx
+; KNL-NEXT:    cmpq $-1, %rdx
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v64i8_and4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmb {{.*}}(%rip), %zmm0, %k0
@@ -2764,6 +3501,37 @@ define i1 @allzeros_v64i8_and4(<64 x i8> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v64i8_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; KNL-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; KNL-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; KNL-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm3
+; KNL-NEXT:    vptestmd %zmm3, %zmm3, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    shll $16, %ecx
+; KNL-NEXT:    orl %eax, %ecx
+; KNL-NEXT:    vpcmpeqb %ymm2, %ymm1, %ymm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %edx
+; KNL-NEXT:    shll $16, %edx
+; KNL-NEXT:    orl %eax, %edx
+; KNL-NEXT:    shlq $32, %rdx
+; KNL-NEXT:    orq %rcx, %rdx
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v64i8_and4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmb {{.*}}(%rip), %zmm0, %k0
@@ -2801,6 +3569,19 @@ define i1 @allones_v8i16_and4(<8 x i16> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allones_v8i16_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4]
+; KNL-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    cmpb $-1, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v8i16_and4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmw {{.*}}(%rip), %xmm0, %k0
@@ -2837,6 +3618,19 @@ define i1 @allzeros_v8i16_and4(<8 x i16> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v8i16_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4]
+; KNL-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    testb %al, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v8i16_and4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmw {{.*}}(%rip), %xmm0, %k0
@@ -2894,6 +3688,18 @@ define i1 @allones_v16i16_and4(<16 x i16> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v16i16_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; KNL-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kortestw %k0, %k0
+; KNL-NEXT:    setb %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v16i16_and4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmw {{.*}}(%rip), %ymm0, %k0
@@ -2973,6 +3779,26 @@ define i1 @allones_v32i16_and4(<32 x i16> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v32i16_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; KNL-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; KNL-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; KNL-NEXT:    vpcmpeqw %ymm2, %ymm0, %ymm0
+; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpcmpeqw %ymm2, %ymm1, %ymm0
+; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    shll $16, %ecx
+; KNL-NEXT:    orl %eax, %ecx
+; KNL-NEXT:    cmpl $-1, %ecx
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v32i16_and4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmw {{.*}}(%rip), %zmm0, %k0
@@ -3050,6 +3876,25 @@ define i1 @allzeros_v32i16_and4(<32 x i16> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v32i16_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; KNL-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; KNL-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; KNL-NEXT:    vpcmpeqw %ymm2, %ymm0, %ymm0
+; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpcmpeqw %ymm2, %ymm1, %ymm0
+; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    shll $16, %ecx
+; KNL-NEXT:    orl %eax, %ecx
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v32i16_and4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmw {{.*}}(%rip), %zmm0, %k0
@@ -3108,6 +3953,18 @@ define i1 @allzeros_v16i16_and4(<16 x i16> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v16i16_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; KNL-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kortestw %k0, %k0
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v16i16_and4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmw {{.*}}(%rip), %ymm0, %k0
@@ -3139,6 +3996,18 @@ define i1 @allones_v4i32_and4(<4 x i32> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allones_v4i32_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; KNL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4,4,4,4]
+; KNL-NEXT:    vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    andb $15, %al
+; KNL-NEXT:    cmpb $15, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v4i32_and4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4,4,4,4]
@@ -3172,6 +4041,17 @@ define i1 @allzeros_v4i32_and4(<4 x i32> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v4i32_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; KNL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4,4,4,4]
+; KNL-NEXT:    vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    testb $15, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v4i32_and4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4,4,4,4]
@@ -3223,6 +4103,17 @@ define i1 @allones_v8i32_and4(<8 x i32> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v8i32_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; KNL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4]
+; KNL-NEXT:    vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    cmpb $-1, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v8i32_and4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4]
@@ -3274,6 +4165,17 @@ define i1 @allzeros_v8i32_and4(<8 x i32> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v8i32_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; KNL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4]
+; KNL-NEXT:    vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    testb %al, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v8i32_and4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4]
@@ -3352,6 +4254,15 @@ define i1 @allones_v16i32_and4(<16 x i32> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v16i32_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; KNL-NEXT:    vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT:    kortestw %k0, %k0
+; KNL-NEXT:    setb %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v16i32_and4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
@@ -3430,6 +4341,15 @@ define i1 @allzeros_v16i32_and4(<16 x i32> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v16i32_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; KNL-NEXT:    vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT:    kortestw %k0, %k0
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v16i32_and4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
@@ -3462,6 +4382,18 @@ define i1 @allones_v2i64_and4(<2 x i64> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allones_v2i64_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; KNL-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,4]
+; KNL-NEXT:    vptestmq %zmm1, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    andb $3, %al
+; KNL-NEXT:    cmpb $3, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v2i64_and4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmq {{.*}}(%rip), %xmm0, %k0
@@ -3494,6 +4426,17 @@ define i1 @allzeros_v2i64_and4(<2 x i64> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v2i64_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; KNL-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,4]
+; KNL-NEXT:    vptestmq %zmm1, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    testb $3, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v2i64_and4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmq {{.*}}(%rip), %xmm0, %k0
@@ -3547,6 +4490,18 @@ define i1 @allones_v4i64_and4(<4 x i64> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v4i64_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; KNL-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [4,4,4,4]
+; KNL-NEXT:    vptestmq %zmm1, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    andb $15, %al
+; KNL-NEXT:    cmpb $15, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v4i64_and4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmq {{.*}}(%rip){1to4}, %ymm0, %k0
@@ -3602,6 +4557,17 @@ define i1 @allzeros_v4i64_and4(<4 x i64> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v4i64_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; KNL-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [4,4,4,4]
+; KNL-NEXT:    vptestmq %zmm1, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    testb $15, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v4i64_and4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmq {{.*}}(%rip){1to4}, %ymm0, %k0
@@ -3687,6 +4653,15 @@ define i1 @allones_v8i64_and4(<8 x i64> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v8i64_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vptestmq {{.*}}(%rip){1to8}, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    cmpb $-1, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v8i64_and4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmq {{.*}}(%rip){1to8}, %zmm0, %k0
@@ -3771,6 +4746,15 @@ define i1 @allzeros_v8i64_and4(<8 x i64> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v8i64_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vptestmq {{.*}}(%rip){1to8}, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    testb %al, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v8i64_and4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmq {{.*}}(%rip){1to8}, %zmm0, %k0
@@ -3799,6 +4783,16 @@ define i32 @movmskpd(<2 x double> %x) {
 ; AVX-NEXT:    vmovmskpd %xmm0, %eax
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: movmskpd:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtq %zmm0, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    andl $3, %eax
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: movmskpd:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovq2m %xmm0, %k0
@@ -3823,6 +4817,16 @@ define i32 @movmskps(<4 x float> %x) {
 ; AVX-NEXT:    vmovmskps %xmm0, %eax
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: movmskps:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtd %zmm0, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    andl $15, %eax
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: movmskps:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovd2m %xmm0, %k0
@@ -3868,6 +4872,16 @@ define i32 @movmskpd256(<4 x double> %x) {
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: movmskpd256:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtq %zmm0, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    andl $15, %eax
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: movmskpd256:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovq2m %ymm0, %k0
@@ -3901,6 +4915,16 @@ define i32 @movmskps256(<8 x float> %x) {
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: movmskps256:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtd %zmm0, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    movzbl %al, %eax
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: movmskps256:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovd2m %ymm0, %k0
@@ -3925,6 +4949,16 @@ define i32 @movmskb(<16 x i8> %x) {
 ; AVX-NEXT:    vpmovmskb %xmm0, %eax
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: movmskb:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: movmskb:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovb2m %xmm0, %k0
@@ -3964,6 +4998,22 @@ define i32 @movmskb256(<32 x i8> %x) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: movmskb256:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    shll $16, %eax
+; KNL-NEXT:    orl %ecx, %eax
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: movmskb256:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovb2m %ymm0, %k0
-- 
GitLab


From 88b2c1ee841fc10cf7a2dfce594f006ca50bf7bb Mon Sep 17 00:00:00 2001
From: Cameron McInally <cameron.mcinally@nyu.edu>
Date: Thu, 25 Oct 2018 18:09:33 +0000
Subject: [PATCH 0585/1116] [FPEnv] Last BinaryOperator::isFNeg(...) to
 m_FNeg(...) changes

Replacing BinaryOperator::isFNeg(...) to avoid regressions when we
separate FNeg from the FSub IR instruction.

Differential Revision: https://reviews.llvm.org/D53650


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345295 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/FastISel.cpp           | 13 ++++++++-----
 lib/Transforms/InstCombine/InstCombineCasts.cpp |  5 +++--
 test/Transforms/InstCombine/fpcast.ll           |  4 ++--
 3 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp
index 542cc10371e..035844294f4 100644
--- a/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -89,6 +89,7 @@
 #include "llvm/IR/Mangler.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/User.h"
 #include "llvm/IR/Value.h"
@@ -110,6 +111,7 @@
 #include <utility>
 
 using namespace llvm;
+using namespace PatternMatch;
 
 #define DEBUG_TYPE "isel"
 
@@ -1692,7 +1694,10 @@ void FastISel::finishCondBranch(const BasicBlock *BranchBB,
 
 /// Emit an FNeg operation.
 bool FastISel::selectFNeg(const User *I) {
-  unsigned OpReg = getRegForValue(BinaryOperator::getFNegArgument(I));
+  Value *X;
+  if (!match(I, m_FNeg(m_Value(X))))
+    return false;
+  unsigned OpReg = getRegForValue(X);
   if (!OpReg)
     return false;
   bool OpRegIsKill = hasTrivialKill(I);
@@ -1782,11 +1787,9 @@ bool FastISel::selectOperator(const User *I, unsigned Opcode) {
     return selectBinaryOp(I, ISD::FADD);
   case Instruction::Sub:
     return selectBinaryOp(I, ISD::SUB);
-  case Instruction::FSub:
+  case Instruction::FSub: 
     // FNeg is currently represented in LLVM IR as a special case of FSub.
-    if (BinaryOperator::isFNeg(I))
-      return selectFNeg(I);
-    return selectBinaryOp(I, ISD::FSUB);
+    return selectFNeg(I) || selectBinaryOp(I, ISD::FSUB);
   case Instruction::Mul:
     return selectBinaryOp(I, ISD::MUL);
   case Instruction::FMul:
diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 74f1e695ff6..9fa27d89911 100644
--- a/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -1612,8 +1612,9 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &FPT) {
     }
 
     // (fptrunc (fneg x)) -> (fneg (fptrunc x))
-    if (BinaryOperator::isFNeg(OpI)) {
-      Value *InnerTrunc = Builder.CreateFPTrunc(OpI->getOperand(1), Ty);
+    Value *X;
+    if (match(OpI, m_FNeg(m_Value(X)))) {
+      Value *InnerTrunc = Builder.CreateFPTrunc(X, Ty);
       return BinaryOperator::CreateFNegFMF(InnerTrunc, OpI);
     }
   }
diff --git a/test/Transforms/InstCombine/fpcast.ll b/test/Transforms/InstCombine/fpcast.ll
index 7ba2ca04bcd..bfc1de4ff6d 100644
--- a/test/Transforms/InstCombine/fpcast.ll
+++ b/test/Transforms/InstCombine/fpcast.ll
@@ -42,8 +42,8 @@ define half @fneg_fptrunc(float %a) {
 
 define <2 x half> @fneg_fptrunc_vec_undef(<2 x float> %a) {
 ; CHECK-LABEL: @fneg_fptrunc_vec_undef(
-; CHECK-NEXT:    [[B:%.*]] = fsub <2 x float> <float -0.000000e+00, float undef>, [[A:%.*]]
-; CHECK-NEXT:    [[C:%.*]] = fptrunc <2 x float> [[B]] to <2 x half>
+; CHECK-NEXT:    [[TMP1:%.*]] = fptrunc <2 x float> [[A:%.*]] to <2 x half>
+; CHECK-NEXT:    [[C:%.*]] = fsub <2 x half> <half 0xH8000, half 0xH8000>, [[TMP1]]
 ; CHECK-NEXT:    ret <2 x half> [[C]]
 ;
   %b = fsub <2 x float> <float -0.0, float undef>, %a
-- 
GitLab


From ee0e48f6cbbe9c0d052d86cdb542de7cf391b7e4 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Thu, 25 Oct 2018 18:23:48 +0000
Subject: [PATCH 0586/1116] [X86] Add some non-AVX512VL command lines to the
 *vl-vec-test-testn.ll tests.

This will expose some regressions in the WIP and/or/xor promotion removal patch.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345297 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/avx512bwvl-vec-test-testn.ll | 383 ++++++---
 test/CodeGen/X86/avx512vl-vec-test-testn.ll   | 736 ++++++++++++------
 2 files changed, 762 insertions(+), 357 deletions(-)

diff --git a/test/CodeGen/X86/avx512bwvl-vec-test-testn.ll b/test/CodeGen/X86/avx512bwvl-vec-test-testn.ll
index 70cadc78c18..ecb76b3f9a7 100644
--- a/test/CodeGen/X86/avx512bwvl-vec-test-testn.ll
+++ b/test/CodeGen/X86/avx512bwvl-vec-test-testn.ll
@@ -1,14 +1,25 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512BWVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512BW
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i16 @TEST_mm_test_epi8_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
-; CHECK-LABEL: TEST_mm_test_epi8_mask:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vptestmb %xmm0, %xmm1, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
-; CHECK-NEXT:    retq
+; AVX512BWVL-LABEL: TEST_mm_test_epi8_mask:
+; AVX512BWVL:       # %bb.0: # %entry
+; AVX512BWVL-NEXT:    vptestmb %xmm0, %xmm1, %k0
+; AVX512BWVL-NEXT:    kmovd %k0, %eax
+; AVX512BWVL-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512BW-LABEL: TEST_mm_test_epi8_mask:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vptestmb %zmm0, %zmm1, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
 entry:
   %and.i.i = and <2 x i64> %__B, %__A
   %0 = bitcast <2 x i64> %and.i.i to <16 x i8>
@@ -19,13 +30,24 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i16 @TEST_mm_mask_test_epi8_mask(i16 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
-; CHECK-LABEL: TEST_mm_mask_test_epi8_mask:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vptestmb %xmm0, %xmm1, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    andl %edi, %eax
-; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
-; CHECK-NEXT:    retq
+; AVX512BWVL-LABEL: TEST_mm_mask_test_epi8_mask:
+; AVX512BWVL:       # %bb.0: # %entry
+; AVX512BWVL-NEXT:    vptestmb %xmm0, %xmm1, %k0
+; AVX512BWVL-NEXT:    kmovd %k0, %eax
+; AVX512BWVL-NEXT:    andl %edi, %eax
+; AVX512BWVL-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512BW-LABEL: TEST_mm_mask_test_epi8_mask:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vptestmb %zmm0, %zmm1, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    andl %edi, %eax
+; AVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
 entry:
   %and.i.i = and <2 x i64> %__B, %__A
   %0 = bitcast <2 x i64> %and.i.i to <16 x i8>
@@ -38,12 +60,22 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @TEST_mm_test_epi16_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
-; CHECK-LABEL: TEST_mm_test_epi16_mask:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vptestmw %xmm0, %xmm1, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    # kill: def $al killed $al killed $eax
-; CHECK-NEXT:    retq
+; AVX512BWVL-LABEL: TEST_mm_test_epi16_mask:
+; AVX512BWVL:       # %bb.0: # %entry
+; AVX512BWVL-NEXT:    vptestmw %xmm0, %xmm1, %k0
+; AVX512BWVL-NEXT:    kmovd %k0, %eax
+; AVX512BWVL-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512BW-LABEL: TEST_mm_test_epi16_mask:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vptestmw %zmm0, %zmm1, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
 entry:
   %and.i.i = and <2 x i64> %__B, %__A
   %0 = bitcast <2 x i64> %and.i.i to <8 x i16>
@@ -54,13 +86,24 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @TEST_mm_mask_test_epi16_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
-; CHECK-LABEL: TEST_mm_mask_test_epi16_mask:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vptestmw %xmm0, %xmm1, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    andb %dil, %al
-; CHECK-NEXT:    # kill: def $al killed $al killed $eax
-; CHECK-NEXT:    retq
+; AVX512BWVL-LABEL: TEST_mm_mask_test_epi16_mask:
+; AVX512BWVL:       # %bb.0: # %entry
+; AVX512BWVL-NEXT:    vptestmw %xmm0, %xmm1, %k0
+; AVX512BWVL-NEXT:    kmovd %k0, %eax
+; AVX512BWVL-NEXT:    andb %dil, %al
+; AVX512BWVL-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512BW-LABEL: TEST_mm_mask_test_epi16_mask:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vptestmw %zmm0, %zmm1, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    andb %dil, %al
+; AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
 entry:
   %and.i.i = and <2 x i64> %__B, %__A
   %0 = bitcast <2 x i64> %and.i.i to <8 x i16>
@@ -73,12 +116,22 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i16 @TEST_mm_testn_epi8_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
-; CHECK-LABEL: TEST_mm_testn_epi8_mask:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vptestnmb %xmm0, %xmm1, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
-; CHECK-NEXT:    retq
+; AVX512BWVL-LABEL: TEST_mm_testn_epi8_mask:
+; AVX512BWVL:       # %bb.0: # %entry
+; AVX512BWVL-NEXT:    vptestnmb %xmm0, %xmm1, %k0
+; AVX512BWVL-NEXT:    kmovd %k0, %eax
+; AVX512BWVL-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512BW-LABEL: TEST_mm_testn_epi8_mask:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vptestnmb %zmm0, %zmm1, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
 entry:
   %and.i.i = and <2 x i64> %__B, %__A
   %0 = bitcast <2 x i64> %and.i.i to <16 x i8>
@@ -89,13 +142,24 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i16 @TEST_mm_mask_testn_epi8_mask(i16 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
-; CHECK-LABEL: TEST_mm_mask_testn_epi8_mask:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vptestnmb %xmm0, %xmm1, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    andl %edi, %eax
-; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
-; CHECK-NEXT:    retq
+; AVX512BWVL-LABEL: TEST_mm_mask_testn_epi8_mask:
+; AVX512BWVL:       # %bb.0: # %entry
+; AVX512BWVL-NEXT:    vptestnmb %xmm0, %xmm1, %k0
+; AVX512BWVL-NEXT:    kmovd %k0, %eax
+; AVX512BWVL-NEXT:    andl %edi, %eax
+; AVX512BWVL-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512BW-LABEL: TEST_mm_mask_testn_epi8_mask:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vptestnmb %zmm0, %zmm1, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    andl %edi, %eax
+; AVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
 entry:
   %and.i.i = and <2 x i64> %__B, %__A
   %0 = bitcast <2 x i64> %and.i.i to <16 x i8>
@@ -108,12 +172,22 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @TEST_mm_testn_epi16_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
-; CHECK-LABEL: TEST_mm_testn_epi16_mask:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vptestnmw %xmm0, %xmm1, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    # kill: def $al killed $al killed $eax
-; CHECK-NEXT:    retq
+; AVX512BWVL-LABEL: TEST_mm_testn_epi16_mask:
+; AVX512BWVL:       # %bb.0: # %entry
+; AVX512BWVL-NEXT:    vptestnmw %xmm0, %xmm1, %k0
+; AVX512BWVL-NEXT:    kmovd %k0, %eax
+; AVX512BWVL-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512BW-LABEL: TEST_mm_testn_epi16_mask:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vptestnmw %zmm0, %zmm1, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
 entry:
   %and.i.i = and <2 x i64> %__B, %__A
   %0 = bitcast <2 x i64> %and.i.i to <8 x i16>
@@ -124,13 +198,24 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @TEST_mm_mask_testn_epi16_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
-; CHECK-LABEL: TEST_mm_mask_testn_epi16_mask:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vptestnmw %xmm0, %xmm1, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    andb %dil, %al
-; CHECK-NEXT:    # kill: def $al killed $al killed $eax
-; CHECK-NEXT:    retq
+; AVX512BWVL-LABEL: TEST_mm_mask_testn_epi16_mask:
+; AVX512BWVL:       # %bb.0: # %entry
+; AVX512BWVL-NEXT:    vptestnmw %xmm0, %xmm1, %k0
+; AVX512BWVL-NEXT:    kmovd %k0, %eax
+; AVX512BWVL-NEXT:    andb %dil, %al
+; AVX512BWVL-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512BW-LABEL: TEST_mm_mask_testn_epi16_mask:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vptestnmw %zmm0, %zmm1, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    andb %dil, %al
+; AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
 entry:
   %and.i.i = and <2 x i64> %__B, %__A
   %0 = bitcast <2 x i64> %and.i.i to <8 x i16>
@@ -143,12 +228,21 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define i32 @TEST_mm256_test_epi8_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
-; CHECK-LABEL: TEST_mm256_test_epi8_mask:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vptestmb %ymm0, %ymm1, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; AVX512BWVL-LABEL: TEST_mm256_test_epi8_mask:
+; AVX512BWVL:       # %bb.0: # %entry
+; AVX512BWVL-NEXT:    vptestmb %ymm0, %ymm1, %k0
+; AVX512BWVL-NEXT:    kmovd %k0, %eax
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512BW-LABEL: TEST_mm256_test_epi8_mask:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vptestmb %zmm0, %zmm1, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
 entry:
   %and.i.i = and <4 x i64> %__B, %__A
   %0 = bitcast <4 x i64> %and.i.i to <32 x i8>
@@ -159,13 +253,23 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define i32 @TEST_mm256_mask_test_epi8_mask(i32 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
-; CHECK-LABEL: TEST_mm256_mask_test_epi8_mask:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vptestmb %ymm0, %ymm1, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    andl %edi, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; AVX512BWVL-LABEL: TEST_mm256_mask_test_epi8_mask:
+; AVX512BWVL:       # %bb.0: # %entry
+; AVX512BWVL-NEXT:    vptestmb %ymm0, %ymm1, %k0
+; AVX512BWVL-NEXT:    kmovd %k0, %eax
+; AVX512BWVL-NEXT:    andl %edi, %eax
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512BW-LABEL: TEST_mm256_mask_test_epi8_mask:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vptestmb %zmm0, %zmm1, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    andl %edi, %eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
 entry:
   %and.i.i = and <4 x i64> %__B, %__A
   %0 = bitcast <4 x i64> %and.i.i to <32 x i8>
@@ -178,13 +282,23 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i16 @TEST_mm256_test_epi16_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
-; CHECK-LABEL: TEST_mm256_test_epi16_mask:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vptestmw %ymm0, %ymm1, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; AVX512BWVL-LABEL: TEST_mm256_test_epi16_mask:
+; AVX512BWVL:       # %bb.0: # %entry
+; AVX512BWVL-NEXT:    vptestmw %ymm0, %ymm1, %k0
+; AVX512BWVL-NEXT:    kmovd %k0, %eax
+; AVX512BWVL-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512BW-LABEL: TEST_mm256_test_epi16_mask:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vptestmw %zmm0, %zmm1, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
 entry:
   %and.i.i = and <4 x i64> %__B, %__A
   %0 = bitcast <4 x i64> %and.i.i to <16 x i16>
@@ -195,14 +309,25 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i16 @TEST_mm256_mask_test_epi16_mask(i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
-; CHECK-LABEL: TEST_mm256_mask_test_epi16_mask:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vptestmw %ymm0, %ymm1, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    andl %edi, %eax
-; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; AVX512BWVL-LABEL: TEST_mm256_mask_test_epi16_mask:
+; AVX512BWVL:       # %bb.0: # %entry
+; AVX512BWVL-NEXT:    vptestmw %ymm0, %ymm1, %k0
+; AVX512BWVL-NEXT:    kmovd %k0, %eax
+; AVX512BWVL-NEXT:    andl %edi, %eax
+; AVX512BWVL-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512BW-LABEL: TEST_mm256_mask_test_epi16_mask:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vptestmw %zmm0, %zmm1, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    andl %edi, %eax
+; AVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
 entry:
   %and.i.i = and <4 x i64> %__B, %__A
   %0 = bitcast <4 x i64> %and.i.i to <16 x i16>
@@ -215,12 +340,21 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define i32 @TEST_mm256_testn_epi8_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
-; CHECK-LABEL: TEST_mm256_testn_epi8_mask:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vptestnmb %ymm0, %ymm1, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; AVX512BWVL-LABEL: TEST_mm256_testn_epi8_mask:
+; AVX512BWVL:       # %bb.0: # %entry
+; AVX512BWVL-NEXT:    vptestnmb %ymm0, %ymm1, %k0
+; AVX512BWVL-NEXT:    kmovd %k0, %eax
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512BW-LABEL: TEST_mm256_testn_epi8_mask:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vptestnmb %zmm0, %zmm1, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
 entry:
   %and.i.i = and <4 x i64> %__B, %__A
   %0 = bitcast <4 x i64> %and.i.i to <32 x i8>
@@ -231,13 +365,23 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define i32 @TEST_mm256_mask_testn_epi8_mask(i32 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
-; CHECK-LABEL: TEST_mm256_mask_testn_epi8_mask:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vptestnmb %ymm0, %ymm1, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    andl %edi, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; AVX512BWVL-LABEL: TEST_mm256_mask_testn_epi8_mask:
+; AVX512BWVL:       # %bb.0: # %entry
+; AVX512BWVL-NEXT:    vptestnmb %ymm0, %ymm1, %k0
+; AVX512BWVL-NEXT:    kmovd %k0, %eax
+; AVX512BWVL-NEXT:    andl %edi, %eax
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512BW-LABEL: TEST_mm256_mask_testn_epi8_mask:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vptestnmb %zmm0, %zmm1, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    andl %edi, %eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
 entry:
   %and.i.i = and <4 x i64> %__B, %__A
   %0 = bitcast <4 x i64> %and.i.i to <32 x i8>
@@ -250,13 +394,23 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i16 @TEST_mm256_testn_epi16_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
-; CHECK-LABEL: TEST_mm256_testn_epi16_mask:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vptestnmw %ymm0, %ymm1, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; AVX512BWVL-LABEL: TEST_mm256_testn_epi16_mask:
+; AVX512BWVL:       # %bb.0: # %entry
+; AVX512BWVL-NEXT:    vptestnmw %ymm0, %ymm1, %k0
+; AVX512BWVL-NEXT:    kmovd %k0, %eax
+; AVX512BWVL-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512BW-LABEL: TEST_mm256_testn_epi16_mask:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vptestnmw %zmm0, %zmm1, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
 entry:
   %and.i.i = and <4 x i64> %__B, %__A
   %0 = bitcast <4 x i64> %and.i.i to <16 x i16>
@@ -267,14 +421,25 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i16 @TEST_mm256_mask_testn_epi16_mask(i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
-; CHECK-LABEL: TEST_mm256_mask_testn_epi16_mask:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vptestnmw %ymm0, %ymm1, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    andl %edi, %eax
-; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; AVX512BWVL-LABEL: TEST_mm256_mask_testn_epi16_mask:
+; AVX512BWVL:       # %bb.0: # %entry
+; AVX512BWVL-NEXT:    vptestnmw %ymm0, %ymm1, %k0
+; AVX512BWVL-NEXT:    kmovd %k0, %eax
+; AVX512BWVL-NEXT:    andl %edi, %eax
+; AVX512BWVL-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512BW-LABEL: TEST_mm256_mask_testn_epi16_mask:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vptestnmw %zmm0, %zmm1, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    andl %edi, %eax
+; AVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
 entry:
   %and.i.i = and <4 x i64> %__B, %__A
   %0 = bitcast <4 x i64> %and.i.i to <16 x i16>
diff --git a/test/CodeGen/X86/avx512vl-vec-test-testn.ll b/test/CodeGen/X86/avx512vl-vec-test-testn.ll
index c662226fde9..ae74be241d5 100644
--- a/test/CodeGen/X86/avx512vl-vec-test-testn.ll
+++ b/test/CodeGen/X86/avx512vl-vec-test-testn.ll
@@ -1,22 +1,29 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=X86_64
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=I386
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512VL --check-prefix=AVX512VL-X64
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512VL --check-prefix=AVX512VL-X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F --check-prefix=AVX512F-X64
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F --check-prefix=AVX512F-X86
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @TEST_mm_test_epi64_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
-; X86_64-LABEL: TEST_mm_test_epi64_mask:
-; X86_64:       # %bb.0: # %entry
-; X86_64-NEXT:    vptestmq %xmm0, %xmm1, %k0
-; X86_64-NEXT:    kmovw %k0, %eax
-; X86_64-NEXT:    # kill: def $al killed $al killed $eax
-; X86_64-NEXT:    retq
-;
-; I386-LABEL: TEST_mm_test_epi64_mask:
-; I386:       # %bb.0: # %entry
-; I386-NEXT:    vptestmq %xmm0, %xmm1, %k0
-; I386-NEXT:    kmovw %k0, %eax
-; I386-NEXT:    # kill: def $al killed $al killed $eax
-; I386-NEXT:    retl
+; AVX512VL-LABEL: TEST_mm_test_epi64_mask:
+; AVX512VL:       # %bb.0: # %entry
+; AVX512VL-NEXT:    vptestmq %xmm0, %xmm1, %k0
+; AVX512VL-NEXT:    kmovw %k0, %eax
+; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-NEXT:    ret{{[l|q]}}
+;
+; AVX512F-LABEL: TEST_mm_test_epi64_mask:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT:    vptestmq %zmm0, %zmm1, %k0
+; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $14, %k0, %k0
+; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    ret{{[l|q]}}
 entry:
   %and.i.i = and <2 x i64> %__B, %__A
   %0 = icmp ne <2 x i64> %and.i.i, zeroinitializer
@@ -27,19 +34,24 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @TEST_mm_test_epi32_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
-; X86_64-LABEL: TEST_mm_test_epi32_mask:
-; X86_64:       # %bb.0: # %entry
-; X86_64-NEXT:    vptestmd %xmm0, %xmm1, %k0
-; X86_64-NEXT:    kmovw %k0, %eax
-; X86_64-NEXT:    # kill: def $al killed $al killed $eax
-; X86_64-NEXT:    retq
-;
-; I386-LABEL: TEST_mm_test_epi32_mask:
-; I386:       # %bb.0: # %entry
-; I386-NEXT:    vptestmd %xmm0, %xmm1, %k0
-; I386-NEXT:    kmovw %k0, %eax
-; I386-NEXT:    # kill: def $al killed $al killed $eax
-; I386-NEXT:    retl
+; AVX512VL-LABEL: TEST_mm_test_epi32_mask:
+; AVX512VL:       # %bb.0: # %entry
+; AVX512VL-NEXT:    vptestmd %xmm0, %xmm1, %k0
+; AVX512VL-NEXT:    kmovw %k0, %eax
+; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-NEXT:    ret{{[l|q]}}
+;
+; AVX512F-LABEL: TEST_mm_test_epi32_mask:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm1, %k0
+; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    ret{{[l|q]}}
 entry:
   %and.i.i = and <2 x i64> %__B, %__A
   %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
@@ -51,21 +63,25 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @TEST_mm256_test_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
-; X86_64-LABEL: TEST_mm256_test_epi64_mask:
-; X86_64:       # %bb.0: # %entry
-; X86_64-NEXT:    vptestmq %ymm0, %ymm1, %k0
-; X86_64-NEXT:    kmovw %k0, %eax
-; X86_64-NEXT:    # kill: def $al killed $al killed $eax
-; X86_64-NEXT:    vzeroupper
-; X86_64-NEXT:    retq
-;
-; I386-LABEL: TEST_mm256_test_epi64_mask:
-; I386:       # %bb.0: # %entry
-; I386-NEXT:    vptestmq %ymm0, %ymm1, %k0
-; I386-NEXT:    kmovw %k0, %eax
-; I386-NEXT:    # kill: def $al killed $al killed $eax
-; I386-NEXT:    vzeroupper
-; I386-NEXT:    retl
+; AVX512VL-LABEL: TEST_mm256_test_epi64_mask:
+; AVX512VL:       # %bb.0: # %entry
+; AVX512VL-NEXT:    vptestmq %ymm0, %ymm1, %k0
+; AVX512VL-NEXT:    kmovw %k0, %eax
+; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    ret{{[l|q]}}
+;
+; AVX512F-LABEL: TEST_mm256_test_epi64_mask:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vptestmq %zmm0, %zmm1, %k0
+; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    ret{{[l|q]}}
 entry:
   %and.i.i = and <4 x i64> %__B, %__A
   %0 = icmp ne <4 x i64> %and.i.i, zeroinitializer
@@ -76,21 +92,23 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @TEST_mm256_test_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
-; X86_64-LABEL: TEST_mm256_test_epi32_mask:
-; X86_64:       # %bb.0: # %entry
-; X86_64-NEXT:    vptestmd %ymm0, %ymm1, %k0
-; X86_64-NEXT:    kmovw %k0, %eax
-; X86_64-NEXT:    # kill: def $al killed $al killed $eax
-; X86_64-NEXT:    vzeroupper
-; X86_64-NEXT:    retq
-;
-; I386-LABEL: TEST_mm256_test_epi32_mask:
-; I386:       # %bb.0: # %entry
-; I386-NEXT:    vptestmd %ymm0, %ymm1, %k0
-; I386-NEXT:    kmovw %k0, %eax
-; I386-NEXT:    # kill: def $al killed $al killed $eax
-; I386-NEXT:    vzeroupper
-; I386-NEXT:    retl
+; AVX512VL-LABEL: TEST_mm256_test_epi32_mask:
+; AVX512VL:       # %bb.0: # %entry
+; AVX512VL-NEXT:    vptestmd %ymm0, %ymm1, %k0
+; AVX512VL-NEXT:    kmovw %k0, %eax
+; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    ret{{[l|q]}}
+;
+; AVX512F-LABEL: TEST_mm256_test_epi32_mask:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm1, %k0
+; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    ret{{[l|q]}}
 entry:
   %and.i.i = and <4 x i64> %__B, %__A
   %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
@@ -101,22 +119,49 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @TEST_mm_mask_test_epi64_mask(i8 %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
-; X86_64-LABEL: TEST_mm_mask_test_epi64_mask:
-; X86_64:       # %bb.0: # %entry
-; X86_64-NEXT:    kmovw %edi, %k1
-; X86_64-NEXT:    vptestmq %xmm0, %xmm1, %k0 {%k1}
-; X86_64-NEXT:    kmovw %k0, %eax
-; X86_64-NEXT:    # kill: def $al killed $al killed $eax
-; X86_64-NEXT:    retq
-;
-; I386-LABEL: TEST_mm_mask_test_epi64_mask:
-; I386:       # %bb.0: # %entry
-; I386-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; I386-NEXT:    kmovw %eax, %k1
-; I386-NEXT:    vptestmq %xmm0, %xmm1, %k0 {%k1}
-; I386-NEXT:    kmovw %k0, %eax
-; I386-NEXT:    # kill: def $al killed $al killed $eax
-; I386-NEXT:    retl
+; AVX512VL-X64-LABEL: TEST_mm_mask_test_epi64_mask:
+; AVX512VL-X64:       # %bb.0: # %entry
+; AVX512VL-X64-NEXT:    kmovw %edi, %k1
+; AVX512VL-X64-NEXT:    vptestmq %xmm0, %xmm1, %k0 {%k1}
+; AVX512VL-X64-NEXT:    kmovw %k0, %eax
+; AVX512VL-X64-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-X64-NEXT:    retq
+;
+; AVX512VL-X86-LABEL: TEST_mm_mask_test_epi64_mask:
+; AVX512VL-X86:       # %bb.0: # %entry
+; AVX512VL-X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; AVX512VL-X86-NEXT:    kmovw %eax, %k1
+; AVX512VL-X86-NEXT:    vptestmq %xmm0, %xmm1, %k0 {%k1}
+; AVX512VL-X86-NEXT:    kmovw %k0, %eax
+; AVX512VL-X86-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-X86-NEXT:    retl
+;
+; AVX512F-X64-LABEL: TEST_mm_mask_test_epi64_mask:
+; AVX512F-X64:       # %bb.0: # %entry
+; AVX512F-X64-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512F-X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-X64-NEXT:    kmovw %edi, %k1
+; AVX512F-X64-NEXT:    vptestmq %zmm0, %zmm1, %k0 {%k1}
+; AVX512F-X64-NEXT:    kshiftlw $14, %k0, %k0
+; AVX512F-X64-NEXT:    kshiftrw $14, %k0, %k0
+; AVX512F-X64-NEXT:    kmovw %k0, %eax
+; AVX512F-X64-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-X64-NEXT:    vzeroupper
+; AVX512F-X64-NEXT:    retq
+;
+; AVX512F-X86-LABEL: TEST_mm_mask_test_epi64_mask:
+; AVX512F-X86:       # %bb.0: # %entry
+; AVX512F-X86-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512F-X86-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; AVX512F-X86-NEXT:    kmovw %eax, %k1
+; AVX512F-X86-NEXT:    vptestmq %zmm0, %zmm1, %k0 {%k1}
+; AVX512F-X86-NEXT:    kshiftlw $14, %k0, %k0
+; AVX512F-X86-NEXT:    kshiftrw $14, %k0, %k0
+; AVX512F-X86-NEXT:    kmovw %k0, %eax
+; AVX512F-X86-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-X86-NEXT:    vzeroupper
+; AVX512F-X86-NEXT:    retl
 entry:
   %and.i.i = and <2 x i64> %__B, %__A
   %0 = icmp ne <2 x i64> %and.i.i, zeroinitializer
@@ -130,22 +175,49 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @TEST_mm_mask_test_epi32_mask(i8 %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
-; X86_64-LABEL: TEST_mm_mask_test_epi32_mask:
-; X86_64:       # %bb.0: # %entry
-; X86_64-NEXT:    kmovw %edi, %k1
-; X86_64-NEXT:    vptestmd %xmm0, %xmm1, %k0 {%k1}
-; X86_64-NEXT:    kmovw %k0, %eax
-; X86_64-NEXT:    # kill: def $al killed $al killed $eax
-; X86_64-NEXT:    retq
-;
-; I386-LABEL: TEST_mm_mask_test_epi32_mask:
-; I386:       # %bb.0: # %entry
-; I386-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; I386-NEXT:    kmovw %eax, %k1
-; I386-NEXT:    vptestmd %xmm0, %xmm1, %k0 {%k1}
-; I386-NEXT:    kmovw %k0, %eax
-; I386-NEXT:    # kill: def $al killed $al killed $eax
-; I386-NEXT:    retl
+; AVX512VL-X64-LABEL: TEST_mm_mask_test_epi32_mask:
+; AVX512VL-X64:       # %bb.0: # %entry
+; AVX512VL-X64-NEXT:    kmovw %edi, %k1
+; AVX512VL-X64-NEXT:    vptestmd %xmm0, %xmm1, %k0 {%k1}
+; AVX512VL-X64-NEXT:    kmovw %k0, %eax
+; AVX512VL-X64-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-X64-NEXT:    retq
+;
+; AVX512VL-X86-LABEL: TEST_mm_mask_test_epi32_mask:
+; AVX512VL-X86:       # %bb.0: # %entry
+; AVX512VL-X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; AVX512VL-X86-NEXT:    kmovw %eax, %k1
+; AVX512VL-X86-NEXT:    vptestmd %xmm0, %xmm1, %k0 {%k1}
+; AVX512VL-X86-NEXT:    kmovw %k0, %eax
+; AVX512VL-X86-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-X86-NEXT:    retl
+;
+; AVX512F-X64-LABEL: TEST_mm_mask_test_epi32_mask:
+; AVX512F-X64:       # %bb.0: # %entry
+; AVX512F-X64-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512F-X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-X64-NEXT:    kmovw %edi, %k1
+; AVX512F-X64-NEXT:    vptestmd %zmm0, %zmm1, %k0 {%k1}
+; AVX512F-X64-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512F-X64-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512F-X64-NEXT:    kmovw %k0, %eax
+; AVX512F-X64-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-X64-NEXT:    vzeroupper
+; AVX512F-X64-NEXT:    retq
+;
+; AVX512F-X86-LABEL: TEST_mm_mask_test_epi32_mask:
+; AVX512F-X86:       # %bb.0: # %entry
+; AVX512F-X86-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512F-X86-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; AVX512F-X86-NEXT:    kmovw %eax, %k1
+; AVX512F-X86-NEXT:    vptestmd %zmm0, %zmm1, %k0 {%k1}
+; AVX512F-X86-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512F-X86-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512F-X86-NEXT:    kmovw %k0, %eax
+; AVX512F-X86-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-X86-NEXT:    vzeroupper
+; AVX512F-X86-NEXT:    retl
 entry:
   %and.i.i = and <2 x i64> %__B, %__A
   %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
@@ -161,24 +233,51 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @TEST_mm256_mask_test_epi64_mask(i8 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
-; X86_64-LABEL: TEST_mm256_mask_test_epi64_mask:
-; X86_64:       # %bb.0: # %entry
-; X86_64-NEXT:    kmovw %edi, %k1
-; X86_64-NEXT:    vptestmq %ymm0, %ymm1, %k0 {%k1}
-; X86_64-NEXT:    kmovw %k0, %eax
-; X86_64-NEXT:    # kill: def $al killed $al killed $eax
-; X86_64-NEXT:    vzeroupper
-; X86_64-NEXT:    retq
-;
-; I386-LABEL: TEST_mm256_mask_test_epi64_mask:
-; I386:       # %bb.0: # %entry
-; I386-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; I386-NEXT:    kmovw %eax, %k1
-; I386-NEXT:    vptestmq %ymm0, %ymm1, %k0 {%k1}
-; I386-NEXT:    kmovw %k0, %eax
-; I386-NEXT:    # kill: def $al killed $al killed $eax
-; I386-NEXT:    vzeroupper
-; I386-NEXT:    retl
+; AVX512VL-X64-LABEL: TEST_mm256_mask_test_epi64_mask:
+; AVX512VL-X64:       # %bb.0: # %entry
+; AVX512VL-X64-NEXT:    kmovw %edi, %k1
+; AVX512VL-X64-NEXT:    vptestmq %ymm0, %ymm1, %k0 {%k1}
+; AVX512VL-X64-NEXT:    kmovw %k0, %eax
+; AVX512VL-X64-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-X64-NEXT:    vzeroupper
+; AVX512VL-X64-NEXT:    retq
+;
+; AVX512VL-X86-LABEL: TEST_mm256_mask_test_epi64_mask:
+; AVX512VL-X86:       # %bb.0: # %entry
+; AVX512VL-X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; AVX512VL-X86-NEXT:    kmovw %eax, %k1
+; AVX512VL-X86-NEXT:    vptestmq %ymm0, %ymm1, %k0 {%k1}
+; AVX512VL-X86-NEXT:    kmovw %k0, %eax
+; AVX512VL-X86-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-X86-NEXT:    vzeroupper
+; AVX512VL-X86-NEXT:    retl
+;
+; AVX512F-X64-LABEL: TEST_mm256_mask_test_epi64_mask:
+; AVX512F-X64:       # %bb.0: # %entry
+; AVX512F-X64-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512F-X64-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-X64-NEXT:    kmovw %edi, %k1
+; AVX512F-X64-NEXT:    vptestmq %zmm0, %zmm1, %k0 {%k1}
+; AVX512F-X64-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512F-X64-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512F-X64-NEXT:    kmovw %k0, %eax
+; AVX512F-X64-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-X64-NEXT:    vzeroupper
+; AVX512F-X64-NEXT:    retq
+;
+; AVX512F-X86-LABEL: TEST_mm256_mask_test_epi64_mask:
+; AVX512F-X86:       # %bb.0: # %entry
+; AVX512F-X86-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512F-X86-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; AVX512F-X86-NEXT:    kmovw %eax, %k1
+; AVX512F-X86-NEXT:    vptestmq %zmm0, %zmm1, %k0 {%k1}
+; AVX512F-X86-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512F-X86-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512F-X86-NEXT:    kmovw %k0, %eax
+; AVX512F-X86-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-X86-NEXT:    vzeroupper
+; AVX512F-X86-NEXT:    retl
 entry:
   %and.i.i = and <4 x i64> %__B, %__A
   %0 = icmp ne <4 x i64> %and.i.i, zeroinitializer
@@ -192,23 +291,45 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @TEST_mm256_mask_test_epi32_mask(i8 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
-; X86_64-LABEL: TEST_mm256_mask_test_epi32_mask:
-; X86_64:       # %bb.0: # %entry
-; X86_64-NEXT:    vptestmd %ymm0, %ymm1, %k0
-; X86_64-NEXT:    kmovw %k0, %eax
-; X86_64-NEXT:    andb %dil, %al
-; X86_64-NEXT:    # kill: def $al killed $al killed $eax
-; X86_64-NEXT:    vzeroupper
-; X86_64-NEXT:    retq
-;
-; I386-LABEL: TEST_mm256_mask_test_epi32_mask:
-; I386:       # %bb.0: # %entry
-; I386-NEXT:    vptestmd %ymm0, %ymm1, %k0
-; I386-NEXT:    kmovw %k0, %eax
-; I386-NEXT:    andb {{[0-9]+}}(%esp), %al
-; I386-NEXT:    # kill: def $al killed $al killed $eax
-; I386-NEXT:    vzeroupper
-; I386-NEXT:    retl
+; AVX512VL-X64-LABEL: TEST_mm256_mask_test_epi32_mask:
+; AVX512VL-X64:       # %bb.0: # %entry
+; AVX512VL-X64-NEXT:    vptestmd %ymm0, %ymm1, %k0
+; AVX512VL-X64-NEXT:    kmovw %k0, %eax
+; AVX512VL-X64-NEXT:    andb %dil, %al
+; AVX512VL-X64-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-X64-NEXT:    vzeroupper
+; AVX512VL-X64-NEXT:    retq
+;
+; AVX512VL-X86-LABEL: TEST_mm256_mask_test_epi32_mask:
+; AVX512VL-X86:       # %bb.0: # %entry
+; AVX512VL-X86-NEXT:    vptestmd %ymm0, %ymm1, %k0
+; AVX512VL-X86-NEXT:    kmovw %k0, %eax
+; AVX512VL-X86-NEXT:    andb {{[0-9]+}}(%esp), %al
+; AVX512VL-X86-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-X86-NEXT:    vzeroupper
+; AVX512VL-X86-NEXT:    retl
+;
+; AVX512F-X64-LABEL: TEST_mm256_mask_test_epi32_mask:
+; AVX512F-X64:       # %bb.0: # %entry
+; AVX512F-X64-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512F-X64-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-X64-NEXT:    vptestmd %zmm0, %zmm1, %k0
+; AVX512F-X64-NEXT:    kmovw %k0, %eax
+; AVX512F-X64-NEXT:    andb %dil, %al
+; AVX512F-X64-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-X64-NEXT:    vzeroupper
+; AVX512F-X64-NEXT:    retq
+;
+; AVX512F-X86-LABEL: TEST_mm256_mask_test_epi32_mask:
+; AVX512F-X86:       # %bb.0: # %entry
+; AVX512F-X86-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512F-X86-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-X86-NEXT:    vptestmd %zmm0, %zmm1, %k0
+; AVX512F-X86-NEXT:    kmovw %k0, %eax
+; AVX512F-X86-NEXT:    andb {{[0-9]+}}(%esp), %al
+; AVX512F-X86-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-X86-NEXT:    vzeroupper
+; AVX512F-X86-NEXT:    retl
 entry:
   %and.i.i = and <4 x i64> %__B, %__A
   %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
@@ -221,19 +342,24 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @TEST_mm_testn_epi64_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
-; X86_64-LABEL: TEST_mm_testn_epi64_mask:
-; X86_64:       # %bb.0: # %entry
-; X86_64-NEXT:    vptestnmq %xmm0, %xmm1, %k0
-; X86_64-NEXT:    kmovw %k0, %eax
-; X86_64-NEXT:    # kill: def $al killed $al killed $eax
-; X86_64-NEXT:    retq
-;
-; I386-LABEL: TEST_mm_testn_epi64_mask:
-; I386:       # %bb.0: # %entry
-; I386-NEXT:    vptestnmq %xmm0, %xmm1, %k0
-; I386-NEXT:    kmovw %k0, %eax
-; I386-NEXT:    # kill: def $al killed $al killed $eax
-; I386-NEXT:    retl
+; AVX512VL-LABEL: TEST_mm_testn_epi64_mask:
+; AVX512VL:       # %bb.0: # %entry
+; AVX512VL-NEXT:    vptestnmq %xmm0, %xmm1, %k0
+; AVX512VL-NEXT:    kmovw %k0, %eax
+; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-NEXT:    ret{{[l|q]}}
+;
+; AVX512F-LABEL: TEST_mm_testn_epi64_mask:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT:    vptestnmq %zmm0, %zmm1, %k0
+; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $14, %k0, %k0
+; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    ret{{[l|q]}}
 entry:
   %and.i.i = and <2 x i64> %__B, %__A
   %0 = icmp eq <2 x i64> %and.i.i, zeroinitializer
@@ -244,19 +370,24 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @TEST_mm_testn_epi32_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
-; X86_64-LABEL: TEST_mm_testn_epi32_mask:
-; X86_64:       # %bb.0: # %entry
-; X86_64-NEXT:    vptestnmd %xmm0, %xmm1, %k0
-; X86_64-NEXT:    kmovw %k0, %eax
-; X86_64-NEXT:    # kill: def $al killed $al killed $eax
-; X86_64-NEXT:    retq
-;
-; I386-LABEL: TEST_mm_testn_epi32_mask:
-; I386:       # %bb.0: # %entry
-; I386-NEXT:    vptestnmd %xmm0, %xmm1, %k0
-; I386-NEXT:    kmovw %k0, %eax
-; I386-NEXT:    # kill: def $al killed $al killed $eax
-; I386-NEXT:    retl
+; AVX512VL-LABEL: TEST_mm_testn_epi32_mask:
+; AVX512VL:       # %bb.0: # %entry
+; AVX512VL-NEXT:    vptestnmd %xmm0, %xmm1, %k0
+; AVX512VL-NEXT:    kmovw %k0, %eax
+; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-NEXT:    ret{{[l|q]}}
+;
+; AVX512F-LABEL: TEST_mm_testn_epi32_mask:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT:    vptestnmd %zmm0, %zmm1, %k0
+; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    ret{{[l|q]}}
 entry:
   %and.i.i = and <2 x i64> %__B, %__A
   %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
@@ -268,21 +399,25 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @TEST_mm256_testn_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
-; X86_64-LABEL: TEST_mm256_testn_epi64_mask:
-; X86_64:       # %bb.0: # %entry
-; X86_64-NEXT:    vptestnmq %ymm0, %ymm1, %k0
-; X86_64-NEXT:    kmovw %k0, %eax
-; X86_64-NEXT:    # kill: def $al killed $al killed $eax
-; X86_64-NEXT:    vzeroupper
-; X86_64-NEXT:    retq
-;
-; I386-LABEL: TEST_mm256_testn_epi64_mask:
-; I386:       # %bb.0: # %entry
-; I386-NEXT:    vptestnmq %ymm0, %ymm1, %k0
-; I386-NEXT:    kmovw %k0, %eax
-; I386-NEXT:    # kill: def $al killed $al killed $eax
-; I386-NEXT:    vzeroupper
-; I386-NEXT:    retl
+; AVX512VL-LABEL: TEST_mm256_testn_epi64_mask:
+; AVX512VL:       # %bb.0: # %entry
+; AVX512VL-NEXT:    vptestnmq %ymm0, %ymm1, %k0
+; AVX512VL-NEXT:    kmovw %k0, %eax
+; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    ret{{[l|q]}}
+;
+; AVX512F-LABEL: TEST_mm256_testn_epi64_mask:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vptestnmq %zmm0, %zmm1, %k0
+; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    ret{{[l|q]}}
 entry:
   %and.i.i = and <4 x i64> %__B, %__A
   %0 = icmp eq <4 x i64> %and.i.i, zeroinitializer
@@ -293,21 +428,23 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @TEST_mm256_testn_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
-; X86_64-LABEL: TEST_mm256_testn_epi32_mask:
-; X86_64:       # %bb.0: # %entry
-; X86_64-NEXT:    vptestnmd %ymm0, %ymm1, %k0
-; X86_64-NEXT:    kmovw %k0, %eax
-; X86_64-NEXT:    # kill: def $al killed $al killed $eax
-; X86_64-NEXT:    vzeroupper
-; X86_64-NEXT:    retq
-;
-; I386-LABEL: TEST_mm256_testn_epi32_mask:
-; I386:       # %bb.0: # %entry
-; I386-NEXT:    vptestnmd %ymm0, %ymm1, %k0
-; I386-NEXT:    kmovw %k0, %eax
-; I386-NEXT:    # kill: def $al killed $al killed $eax
-; I386-NEXT:    vzeroupper
-; I386-NEXT:    retl
+; AVX512VL-LABEL: TEST_mm256_testn_epi32_mask:
+; AVX512VL:       # %bb.0: # %entry
+; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm1, %k0
+; AVX512VL-NEXT:    kmovw %k0, %eax
+; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    ret{{[l|q]}}
+;
+; AVX512F-LABEL: TEST_mm256_testn_epi32_mask:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vptestnmd %zmm0, %zmm1, %k0
+; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    ret{{[l|q]}}
 entry:
   %and.i.i = and <4 x i64> %__B, %__A
   %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
@@ -318,22 +455,49 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @TEST_mm_mask_testn_epi64_mask(i8 %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
-; X86_64-LABEL: TEST_mm_mask_testn_epi64_mask:
-; X86_64:       # %bb.0: # %entry
-; X86_64-NEXT:    kmovw %edi, %k1
-; X86_64-NEXT:    vptestnmq %xmm0, %xmm1, %k0 {%k1}
-; X86_64-NEXT:    kmovw %k0, %eax
-; X86_64-NEXT:    # kill: def $al killed $al killed $eax
-; X86_64-NEXT:    retq
-;
-; I386-LABEL: TEST_mm_mask_testn_epi64_mask:
-; I386:       # %bb.0: # %entry
-; I386-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; I386-NEXT:    kmovw %eax, %k1
-; I386-NEXT:    vptestnmq %xmm0, %xmm1, %k0 {%k1}
-; I386-NEXT:    kmovw %k0, %eax
-; I386-NEXT:    # kill: def $al killed $al killed $eax
-; I386-NEXT:    retl
+; AVX512VL-X64-LABEL: TEST_mm_mask_testn_epi64_mask:
+; AVX512VL-X64:       # %bb.0: # %entry
+; AVX512VL-X64-NEXT:    kmovw %edi, %k1
+; AVX512VL-X64-NEXT:    vptestnmq %xmm0, %xmm1, %k0 {%k1}
+; AVX512VL-X64-NEXT:    kmovw %k0, %eax
+; AVX512VL-X64-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-X64-NEXT:    retq
+;
+; AVX512VL-X86-LABEL: TEST_mm_mask_testn_epi64_mask:
+; AVX512VL-X86:       # %bb.0: # %entry
+; AVX512VL-X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; AVX512VL-X86-NEXT:    kmovw %eax, %k1
+; AVX512VL-X86-NEXT:    vptestnmq %xmm0, %xmm1, %k0 {%k1}
+; AVX512VL-X86-NEXT:    kmovw %k0, %eax
+; AVX512VL-X86-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-X86-NEXT:    retl
+;
+; AVX512F-X64-LABEL: TEST_mm_mask_testn_epi64_mask:
+; AVX512F-X64:       # %bb.0: # %entry
+; AVX512F-X64-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512F-X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-X64-NEXT:    kmovw %edi, %k1
+; AVX512F-X64-NEXT:    vptestnmq %zmm0, %zmm1, %k0 {%k1}
+; AVX512F-X64-NEXT:    kshiftlw $14, %k0, %k0
+; AVX512F-X64-NEXT:    kshiftrw $14, %k0, %k0
+; AVX512F-X64-NEXT:    kmovw %k0, %eax
+; AVX512F-X64-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-X64-NEXT:    vzeroupper
+; AVX512F-X64-NEXT:    retq
+;
+; AVX512F-X86-LABEL: TEST_mm_mask_testn_epi64_mask:
+; AVX512F-X86:       # %bb.0: # %entry
+; AVX512F-X86-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512F-X86-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; AVX512F-X86-NEXT:    kmovw %eax, %k1
+; AVX512F-X86-NEXT:    vptestnmq %zmm0, %zmm1, %k0 {%k1}
+; AVX512F-X86-NEXT:    kshiftlw $14, %k0, %k0
+; AVX512F-X86-NEXT:    kshiftrw $14, %k0, %k0
+; AVX512F-X86-NEXT:    kmovw %k0, %eax
+; AVX512F-X86-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-X86-NEXT:    vzeroupper
+; AVX512F-X86-NEXT:    retl
 entry:
   %and.i.i = and <2 x i64> %__B, %__A
   %0 = icmp eq <2 x i64> %and.i.i, zeroinitializer
@@ -347,22 +511,49 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @TEST_mm_mask_testn_epi32_mask(i8 %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
-; X86_64-LABEL: TEST_mm_mask_testn_epi32_mask:
-; X86_64:       # %bb.0: # %entry
-; X86_64-NEXT:    kmovw %edi, %k1
-; X86_64-NEXT:    vptestnmd %xmm0, %xmm1, %k0 {%k1}
-; X86_64-NEXT:    kmovw %k0, %eax
-; X86_64-NEXT:    # kill: def $al killed $al killed $eax
-; X86_64-NEXT:    retq
-;
-; I386-LABEL: TEST_mm_mask_testn_epi32_mask:
-; I386:       # %bb.0: # %entry
-; I386-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; I386-NEXT:    kmovw %eax, %k1
-; I386-NEXT:    vptestnmd %xmm0, %xmm1, %k0 {%k1}
-; I386-NEXT:    kmovw %k0, %eax
-; I386-NEXT:    # kill: def $al killed $al killed $eax
-; I386-NEXT:    retl
+; AVX512VL-X64-LABEL: TEST_mm_mask_testn_epi32_mask:
+; AVX512VL-X64:       # %bb.0: # %entry
+; AVX512VL-X64-NEXT:    kmovw %edi, %k1
+; AVX512VL-X64-NEXT:    vptestnmd %xmm0, %xmm1, %k0 {%k1}
+; AVX512VL-X64-NEXT:    kmovw %k0, %eax
+; AVX512VL-X64-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-X64-NEXT:    retq
+;
+; AVX512VL-X86-LABEL: TEST_mm_mask_testn_epi32_mask:
+; AVX512VL-X86:       # %bb.0: # %entry
+; AVX512VL-X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; AVX512VL-X86-NEXT:    kmovw %eax, %k1
+; AVX512VL-X86-NEXT:    vptestnmd %xmm0, %xmm1, %k0 {%k1}
+; AVX512VL-X86-NEXT:    kmovw %k0, %eax
+; AVX512VL-X86-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-X86-NEXT:    retl
+;
+; AVX512F-X64-LABEL: TEST_mm_mask_testn_epi32_mask:
+; AVX512F-X64:       # %bb.0: # %entry
+; AVX512F-X64-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512F-X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-X64-NEXT:    kmovw %edi, %k1
+; AVX512F-X64-NEXT:    vptestnmd %zmm0, %zmm1, %k0 {%k1}
+; AVX512F-X64-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512F-X64-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512F-X64-NEXT:    kmovw %k0, %eax
+; AVX512F-X64-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-X64-NEXT:    vzeroupper
+; AVX512F-X64-NEXT:    retq
+;
+; AVX512F-X86-LABEL: TEST_mm_mask_testn_epi32_mask:
+; AVX512F-X86:       # %bb.0: # %entry
+; AVX512F-X86-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512F-X86-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; AVX512F-X86-NEXT:    kmovw %eax, %k1
+; AVX512F-X86-NEXT:    vptestnmd %zmm0, %zmm1, %k0 {%k1}
+; AVX512F-X86-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512F-X86-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512F-X86-NEXT:    kmovw %k0, %eax
+; AVX512F-X86-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-X86-NEXT:    vzeroupper
+; AVX512F-X86-NEXT:    retl
 entry:
   %and.i.i = and <2 x i64> %__B, %__A
   %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
@@ -378,24 +569,51 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @TEST_mm256_mask_testn_epi64_mask(i8 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
-; X86_64-LABEL: TEST_mm256_mask_testn_epi64_mask:
-; X86_64:       # %bb.0: # %entry
-; X86_64-NEXT:    kmovw %edi, %k1
-; X86_64-NEXT:    vptestnmq %ymm0, %ymm1, %k0 {%k1}
-; X86_64-NEXT:    kmovw %k0, %eax
-; X86_64-NEXT:    # kill: def $al killed $al killed $eax
-; X86_64-NEXT:    vzeroupper
-; X86_64-NEXT:    retq
-;
-; I386-LABEL: TEST_mm256_mask_testn_epi64_mask:
-; I386:       # %bb.0: # %entry
-; I386-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; I386-NEXT:    kmovw %eax, %k1
-; I386-NEXT:    vptestnmq %ymm0, %ymm1, %k0 {%k1}
-; I386-NEXT:    kmovw %k0, %eax
-; I386-NEXT:    # kill: def $al killed $al killed $eax
-; I386-NEXT:    vzeroupper
-; I386-NEXT:    retl
+; AVX512VL-X64-LABEL: TEST_mm256_mask_testn_epi64_mask:
+; AVX512VL-X64:       # %bb.0: # %entry
+; AVX512VL-X64-NEXT:    kmovw %edi, %k1
+; AVX512VL-X64-NEXT:    vptestnmq %ymm0, %ymm1, %k0 {%k1}
+; AVX512VL-X64-NEXT:    kmovw %k0, %eax
+; AVX512VL-X64-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-X64-NEXT:    vzeroupper
+; AVX512VL-X64-NEXT:    retq
+;
+; AVX512VL-X86-LABEL: TEST_mm256_mask_testn_epi64_mask:
+; AVX512VL-X86:       # %bb.0: # %entry
+; AVX512VL-X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; AVX512VL-X86-NEXT:    kmovw %eax, %k1
+; AVX512VL-X86-NEXT:    vptestnmq %ymm0, %ymm1, %k0 {%k1}
+; AVX512VL-X86-NEXT:    kmovw %k0, %eax
+; AVX512VL-X86-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-X86-NEXT:    vzeroupper
+; AVX512VL-X86-NEXT:    retl
+;
+; AVX512F-X64-LABEL: TEST_mm256_mask_testn_epi64_mask:
+; AVX512F-X64:       # %bb.0: # %entry
+; AVX512F-X64-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512F-X64-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-X64-NEXT:    kmovw %edi, %k1
+; AVX512F-X64-NEXT:    vptestnmq %zmm0, %zmm1, %k0 {%k1}
+; AVX512F-X64-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512F-X64-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512F-X64-NEXT:    kmovw %k0, %eax
+; AVX512F-X64-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-X64-NEXT:    vzeroupper
+; AVX512F-X64-NEXT:    retq
+;
+; AVX512F-X86-LABEL: TEST_mm256_mask_testn_epi64_mask:
+; AVX512F-X86:       # %bb.0: # %entry
+; AVX512F-X86-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512F-X86-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; AVX512F-X86-NEXT:    kmovw %eax, %k1
+; AVX512F-X86-NEXT:    vptestnmq %zmm0, %zmm1, %k0 {%k1}
+; AVX512F-X86-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512F-X86-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512F-X86-NEXT:    kmovw %k0, %eax
+; AVX512F-X86-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-X86-NEXT:    vzeroupper
+; AVX512F-X86-NEXT:    retl
 entry:
   %and.i.i = and <4 x i64> %__B, %__A
   %0 = icmp eq <4 x i64> %and.i.i, zeroinitializer
@@ -409,23 +627,45 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @TEST_mm256_mask_testn_epi32_mask(i8 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
-; X86_64-LABEL: TEST_mm256_mask_testn_epi32_mask:
-; X86_64:       # %bb.0: # %entry
-; X86_64-NEXT:    vptestnmd %ymm0, %ymm1, %k0
-; X86_64-NEXT:    kmovw %k0, %eax
-; X86_64-NEXT:    andb %dil, %al
-; X86_64-NEXT:    # kill: def $al killed $al killed $eax
-; X86_64-NEXT:    vzeroupper
-; X86_64-NEXT:    retq
-;
-; I386-LABEL: TEST_mm256_mask_testn_epi32_mask:
-; I386:       # %bb.0: # %entry
-; I386-NEXT:    vptestnmd %ymm0, %ymm1, %k0
-; I386-NEXT:    kmovw %k0, %eax
-; I386-NEXT:    andb {{[0-9]+}}(%esp), %al
-; I386-NEXT:    # kill: def $al killed $al killed $eax
-; I386-NEXT:    vzeroupper
-; I386-NEXT:    retl
+; AVX512VL-X64-LABEL: TEST_mm256_mask_testn_epi32_mask:
+; AVX512VL-X64:       # %bb.0: # %entry
+; AVX512VL-X64-NEXT:    vptestnmd %ymm0, %ymm1, %k0
+; AVX512VL-X64-NEXT:    kmovw %k0, %eax
+; AVX512VL-X64-NEXT:    andb %dil, %al
+; AVX512VL-X64-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-X64-NEXT:    vzeroupper
+; AVX512VL-X64-NEXT:    retq
+;
+; AVX512VL-X86-LABEL: TEST_mm256_mask_testn_epi32_mask:
+; AVX512VL-X86:       # %bb.0: # %entry
+; AVX512VL-X86-NEXT:    vptestnmd %ymm0, %ymm1, %k0
+; AVX512VL-X86-NEXT:    kmovw %k0, %eax
+; AVX512VL-X86-NEXT:    andb {{[0-9]+}}(%esp), %al
+; AVX512VL-X86-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-X86-NEXT:    vzeroupper
+; AVX512VL-X86-NEXT:    retl
+;
+; AVX512F-X64-LABEL: TEST_mm256_mask_testn_epi32_mask:
+; AVX512F-X64:       # %bb.0: # %entry
+; AVX512F-X64-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512F-X64-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-X64-NEXT:    vptestnmd %zmm0, %zmm1, %k0
+; AVX512F-X64-NEXT:    kmovw %k0, %eax
+; AVX512F-X64-NEXT:    andb %dil, %al
+; AVX512F-X64-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-X64-NEXT:    vzeroupper
+; AVX512F-X64-NEXT:    retq
+;
+; AVX512F-X86-LABEL: TEST_mm256_mask_testn_epi32_mask:
+; AVX512F-X86:       # %bb.0: # %entry
+; AVX512F-X86-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512F-X86-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-X86-NEXT:    vptestnmd %zmm0, %zmm1, %k0
+; AVX512F-X86-NEXT:    kmovw %k0, %eax
+; AVX512F-X86-NEXT:    andb {{[0-9]+}}(%esp), %al
+; AVX512F-X86-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-X86-NEXT:    vzeroupper
+; AVX512F-X86-NEXT:    retl
 entry:
   %and.i.i = and <4 x i64> %__B, %__A
   %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
-- 
GitLab


From 6c8035cd8970ca5815d7bb034f8e59df1562ec8e Mon Sep 17 00:00:00 2001
From: Thomas Lively <tlively@google.com>
Date: Thu, 25 Oct 2018 19:06:13 +0000
Subject: [PATCH 0587/1116] [WebAssembly] Use target-independent saturating add

Reviewers: aheejin, dschuff

Subscribers: sbc100, jgravelle-google, sunfish, llvm-commits

Differential Revision: https://reviews.llvm.org/D53721

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345299 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/IR/IntrinsicsWebAssembly.td      |  8 -------
 .../WebAssembly/WebAssemblyISelLowering.cpp   |  6 +++++
 .../WebAssembly/WebAssemblyInstrSIMD.td       |  4 ++--
 test/CodeGen/WebAssembly/simd-intrinsics.ll   | 24 +++++++------------
 4 files changed, 16 insertions(+), 26 deletions(-)

diff --git a/include/llvm/IR/IntrinsicsWebAssembly.td b/include/llvm/IR/IntrinsicsWebAssembly.td
index adf7cb0ba0e..897d3525b4c 100644
--- a/include/llvm/IR/IntrinsicsWebAssembly.td
+++ b/include/llvm/IR/IntrinsicsWebAssembly.td
@@ -102,14 +102,6 @@ def int_wasm_atomic_notify:
 // SIMD intrinsics
 //===----------------------------------------------------------------------===//
 
-def int_wasm_add_saturate_signed :
-  Intrinsic<[llvm_anyvector_ty],
-            [LLVMMatchType<0>, LLVMMatchType<0>],
-            [IntrNoMem, IntrSpeculatable]>;
-def int_wasm_add_saturate_unsigned :
-  Intrinsic<[llvm_anyvector_ty],
-            [LLVMMatchType<0>, LLVMMatchType<0>],
-            [IntrNoMem, IntrSpeculatable]>;
 def int_wasm_sub_saturate_signed :
   Intrinsic<[llvm_anyvector_ty],
             [LLVMMatchType<0>, LLVMMatchType<0>],
diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 0bd2ebdc4bb..c056e1af588 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -115,6 +115,12 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
     setTruncStoreAction(T, MVT::f16, Expand);
   }
 
+  // Support saturating add for i8x16 and i16x8
+  if (Subtarget->hasSIMD128())
+    for (auto T : {MVT::v16i8, MVT::v8i16})
+      for (auto Op : {ISD::SADDSAT, ISD::UADDSAT})
+        setOperationAction(Op, T, Legal);
+
   for (auto T : {MVT::i32, MVT::i64}) {
     // Expand unavailable integer operations.
     for (auto Op :
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index ff6bbab705c..caad638e9e3 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -469,9 +469,9 @@ multiclass SIMDBinarySat<SDNode node, string name, bits<32> baseInst> {
 // Saturating integer addition: add_saturate_s / add_saturate_u
 let isCommutable = 1 in {
 defm ADD_SAT_S :
-  SIMDBinarySat<int_wasm_add_saturate_signed, "add_saturate_s", 40>;
+  SIMDBinarySat<saddsat, "add_saturate_s", 40>;
 defm ADD_SAT_U :
-  SIMDBinarySat<int_wasm_add_saturate_unsigned, "add_saturate_u", 41>;
+  SIMDBinarySat<uaddsat, "add_saturate_u", 41>;
 } // isCommutable = 1
 
 // Saturating integer subtraction: sub_saturate_s / sub_saturate_u
diff --git a/test/CodeGen/WebAssembly/simd-intrinsics.ll b/test/CodeGen/WebAssembly/simd-intrinsics.ll
index ab32929ceb8..1cf990d11d4 100644
--- a/test/CodeGen/WebAssembly/simd-intrinsics.ll
+++ b/test/CodeGen/WebAssembly/simd-intrinsics.ll
@@ -16,11 +16,9 @@ target triple = "wasm32-unknown-unknown"
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i8x16.add_saturate_s $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
-declare <16 x i8> @llvm.wasm.add.saturate.signed.v16i8(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8>, <16 x i8>)
 define <16 x i8> @add_sat_s_v16i8(<16 x i8> %x, <16 x i8> %y) {
-  %a = call <16 x i8> @llvm.wasm.add.saturate.signed.v16i8(
-    <16 x i8> %x, <16 x i8> %y
-  )
+  %a = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %x, <16 x i8> %y)
   ret <16 x i8> %a
 }
 
@@ -29,11 +27,9 @@ define <16 x i8> @add_sat_s_v16i8(<16 x i8> %x, <16 x i8> %y) {
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i8x16.add_saturate_u $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
-declare <16 x i8> @llvm.wasm.add.saturate.unsigned.v16i8(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8>, <16 x i8>)
 define <16 x i8> @add_sat_u_v16i8(<16 x i8> %x, <16 x i8> %y) {
-  %a = call <16 x i8> @llvm.wasm.add.saturate.unsigned.v16i8(
-    <16 x i8> %x, <16 x i8> %y
-  )
+  %a = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %x, <16 x i8> %y)
   ret <16 x i8> %a
 }
 
@@ -106,11 +102,9 @@ define <16 x i8> @bitselect_v16i8(<16 x i8> %c, <16 x i8> %v1, <16 x i8> %v2) {
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i16x8.add_saturate_s $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
-declare <8 x i16> @llvm.wasm.add.saturate.signed.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16>, <8 x i16>)
 define <8 x i16> @add_sat_s_v8i16(<8 x i16> %x, <8 x i16> %y) {
-  %a = call <8 x i16> @llvm.wasm.add.saturate.signed.v8i16(
-    <8 x i16> %x, <8 x i16> %y
-  )
+  %a = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %x, <8 x i16> %y)
   ret <8 x i16> %a
 }
 
@@ -119,11 +113,9 @@ define <8 x i16> @add_sat_s_v8i16(<8 x i16> %x, <8 x i16> %y) {
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i16x8.add_saturate_u $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
-declare <8 x i16> @llvm.wasm.add.saturate.unsigned.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16>, <8 x i16>)
 define <8 x i16> @add_sat_u_v8i16(<8 x i16> %x, <8 x i16> %y) {
-  %a = call <8 x i16> @llvm.wasm.add.saturate.unsigned.v8i16(
-    <8 x i16> %x, <8 x i16> %y
-  )
+  %a = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %x, <8 x i16> %y)
   ret <8 x i16> %a
 }
 
-- 
GitLab


From 5b2a5eeabc6db9a079372b20f69730e02fa97bc8 Mon Sep 17 00:00:00 2001
From: Volkan Keles <vkeles@apple.com>
Date: Thu, 25 Oct 2018 20:01:19 +0000
Subject: [PATCH 0588/1116] [AArch64][GlobalISel] Simplify a legalizer test.
 NFC.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345307 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../GlobalISel/legalize-load-store.mir        | 207 ++++++++----------
 1 file changed, 91 insertions(+), 116 deletions(-)

diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir b/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir
index 9a563037159..7a41cb0cd79 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir
@@ -1,130 +1,105 @@
-# RUN: llc -O0 -run-pass=legalizer %s -o - | FileCheck %s
-
---- |
-  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-  target triple = "aarch64--"
-  define void @test_load(i8* %addr) {
-  entry:
-    ret void
-  }
-  define void @test_store(i8* %addr) {
-  entry:
-    ret void
-  }
-...
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=aarch64 -run-pass=legalizer %s -o - | FileCheck %s
 
 ---
 name:            test_load
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-  - { id: 6, class: _ }
-  - { id: 7, class: _ }
-  - { id: 8, class: _ }
 body: |
   bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3
-    ; CHECK-LABEL: name: test_load
-    %0(p0) = COPY $x0
-
-    %1(s1) = G_LOAD %0 :: (load 1 from %ir.addr)
-    %9:_(s32) = G_ANYEXT %1
-    $w0 = COPY %9
-
-    ; CHECK: %2:_(s8) = G_LOAD %0(p0) :: (load 1 from %ir.addr)
-    %2(s8) = G_LOAD %0 :: (load 1 from %ir.addr)
-    %10:_(s32) = G_ANYEXT %2
-    $w0 = COPY %10
-
-    ; CHECK: %3:_(s16) = G_LOAD %0(p0) :: (load 2 from %ir.addr)
-    %3(s16) = G_LOAD %0 :: (load 2 from %ir.addr)
-    %11:_(s32) = G_ANYEXT %3
-    $w0 = COPY %11
+    liveins: $x0
 
-    ; CHECK: %4:_(s32) = G_LOAD %0(p0) :: (load 4 from %ir.addr)
-    %4(s32) = G_LOAD %0 :: (load 4 from %ir.addr)
-    $w0 = COPY %4
-
-    ; CHECK: %5:_(s64) = G_LOAD %0(p0) :: (load 8 from %ir.addr)
-    %5(s64) = G_LOAD %0 :: (load 8 from %ir.addr)
-    $x0 = COPY %5
-
-    %6(p0) = G_LOAD %0(p0) :: (load 8 from %ir.addr)
-    %12:_(s64) = G_PTRTOINT %6
-    $x0 = COPY %12
-
-    ; CHECK: %7:_(<2 x s32>) = G_LOAD %0(p0) :: (load 8 from %ir.addr)
-    %7(<2 x s32>) = G_LOAD %0(p0) :: (load 8 from %ir.addr)
-    %13:_(s64) = G_BITCAST %7
-    $x0 = COPY %13
-
-    ; CHECK: [[LOAD0:%[0-9]+]]:_(s64) = G_LOAD %0(p0) :: (load 8 from %ir.addr, align 16)
-    ; CHECK: [[OFFSET1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; CHECK: [[GEP1:%[0-9]+]]:_(p0) = G_GEP %0, [[OFFSET1]](s64)
-    ; CHECK: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[GEP1]](p0) :: (load 8 from %ir.addr + 8)
-    ; CHECK: %8:_(s128) = G_MERGE_VALUES [[LOAD0]](s64), [[LOAD1]](s64)
-    %8(s128) = G_LOAD %0(p0) :: (load 16 from %ir.addr)
-    %14:_(s64) = G_TRUNC %8
-    $x0 = COPY %14
+    ; CHECK-LABEL: name: test_load
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[COPY]](p0) :: (load 1)
+    ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s8)
+    ; CHECK: $w0 = COPY [[ANYEXT]](s32)
+    ; CHECK: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[COPY]](p0) :: (load 1)
+    ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD1]](s8)
+    ; CHECK: $w0 = COPY [[ANYEXT1]](s32)
+    ; CHECK: [[LOAD2:%[0-9]+]]:_(s16) = G_LOAD [[COPY]](p0) :: (load 2)
+    ; CHECK: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD2]](s16)
+    ; CHECK: $w0 = COPY [[ANYEXT2]](s32)
+    ; CHECK: $w0 = COPY [[ANYEXT1]](s32)
+    ; CHECK: [[LOAD3:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load 8)
+    ; CHECK: $x0 = COPY [[LOAD3]](s64)
+    ; CHECK: [[LOAD4:%[0-9]+]]:_(p0) = G_LOAD [[COPY]](p0) :: (load 8)
+    ; CHECK: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[LOAD4]](p0)
+    ; CHECK: $x0 = COPY [[PTRTOINT]](s64)
+    ; CHECK: [[LOAD5:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load 8)
+    ; CHECK: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST [[LOAD5]](<2 x s32>)
+    ; CHECK: $x0 = COPY [[BITCAST]](s64)
+    ; CHECK: [[LOAD6:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load 8, align 16)
+    ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64)
+    ; CHECK: [[LOAD7:%[0-9]+]]:_(s64) = G_LOAD [[GEP]](p0) :: (load 8)
+    ; CHECK: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[LOAD6]](s64), [[LOAD7]](s64)
+    ; CHECK: [[TRUNC:%[0-9]+]]:_(s64) = G_TRUNC [[MV]](s128)
+    ; CHECK: $x0 = COPY [[TRUNC]](s64)
+    %0:_(p0) = COPY $x0
+    %1:_(s1) = G_LOAD %0(p0) :: (load 1)
+    %2:_(s32) = G_ANYEXT %1(s1)
+    $w0 = COPY %2(s32)
+    %3:_(s8) = G_LOAD %0(p0) :: (load 1)
+    %4:_(s32) = G_ANYEXT %3(s8)
+    $w0 = COPY %4(s32)
+    %5:_(s16) = G_LOAD %0(p0) :: (load 2)
+    %6:_(s32) = G_ANYEXT %5(s16)
+    $w0 = COPY %6(s32)
+    %7:_(s32) = G_LOAD %0(p0) :: (load 4)
+    $w0 = COPY %4(s32)
+    %8:_(s64) = G_LOAD %0(p0) :: (load 8)
+    $x0 = COPY %8(s64)
+    %9:_(p0) = G_LOAD %0(p0) :: (load 8)
+    %10:_(s64) = G_PTRTOINT %9(p0)
+    $x0 = COPY %10(s64)
+    %11:_(<2 x s32>) = G_LOAD %0(p0) :: (load 8)
+    %12:_(s64) = G_BITCAST %11(<2 x s32>)
+    $x0 = COPY %12(s64)
+    %13:_(s128) = G_LOAD %0(p0) :: (load 16)
+    %14:_(s64) = G_TRUNC %13(s128)
+    $x0 = COPY %14(s64)
 ...
 
 ---
 name:            test_store
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-  - { id: 6, class: _ }
-  - { id: 7, class: _ }
 body: |
   bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3
-    ; CHECK-LABEL: name: test_store
-
-    %0(p0) = COPY $x0
-    %1(s32) = COPY $w1
-
-    ; CHECK: [[C1:%.*]]:_(s32) = G_CONSTANT i32 1
-    ; CHECK: [[B:%.*]]:_(s32) = COPY %1(s32)
-    ; CHECK: [[COPY_C1:%.*]]:_(s32) = COPY [[C1]]
-    ; CHECK: [[AND:%.*]]:_(s32) = G_AND [[B]], [[COPY_C1]]
-    ; CHECK: [[BIT8:%.*]]:_(s8) = G_TRUNC [[AND]]
-
-
-    ; CHECK: G_STORE [[BIT8]](s8), %0(p0) :: (store 1 into %ir.addr)
-    %2(s1) = G_TRUNC %1
-    G_STORE %2, %0 :: (store 1 into %ir.addr)
-
-    ; CHECK: G_STORE %3(s8), %0(p0) :: (store 1 into %ir.addr)
-    %3(s8) = G_TRUNC %1
-    G_STORE %3, %0 :: (store 1 into %ir.addr)
+    liveins: $x0, $w1
 
-    ; CHECK: G_STORE %4(s16), %0(p0) :: (store 2 into %ir.addr)
-    %4(s16) = G_TRUNC %1
-    G_STORE %4, %0 :: (store 2 into %ir.addr)
-
-    ; CHECK: G_STORE %1(s32), %0(p0) :: (store 4 into %ir.addr)
-    G_STORE %1, %0 :: (store 4 into %ir.addr)
-
-    ; CHECK: G_STORE %5(s64), %0(p0) :: (store 8 into %ir.addr)
-    %5(s64) = G_PTRTOINT %0(p0)
-    G_STORE %5, %0 :: (store 8 into %ir.addr)
-
-    ; CHECK: G_STORE %0(p0), %0(p0) :: (store 8 into %ir.addr)
-    G_STORE %0(p0), %0(p0) :: (store 8 into %ir.addr)
-
-    ; CHECK: G_STORE %5(s64), %0(p0) :: (store 8 into %ir.addr, align 16)
-    ; CHECK: [[OFFSET1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; CHECK: [[GEP1:%[0-9]+]]:_(p0) = G_GEP %0, [[OFFSET1]](s64)
-    ; CHECK: G_STORE %6(s64), [[GEP1]](p0) :: (store 8 into %ir.addr + 8)
-    %6(s64) = G_PTRTOINT %0(p0)
-    %7(s128) = G_MERGE_VALUES %5, %6
-    G_STORE %7, %0 :: (store 16 into %ir.addr)
+    ; CHECK-LABEL: name: test_store
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
+    ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[COPY3]]
+    ; CHECK: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[AND]](s32)
+    ; CHECK: G_STORE [[TRUNC]](s8), [[COPY]](p0) :: (store 1)
+    ; CHECK: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[COPY1]](s32)
+    ; CHECK: G_STORE [[TRUNC1]](s8), [[COPY]](p0) :: (store 1)
+    ; CHECK: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+    ; CHECK: G_STORE [[TRUNC2]](s16), [[COPY]](p0) :: (store 2)
+    ; CHECK: G_STORE [[COPY1]](s32), [[COPY]](p0) :: (store 4)
+    ; CHECK: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY]](p0)
+    ; CHECK: G_STORE [[PTRTOINT]](s64), [[COPY]](p0) :: (store 8)
+    ; CHECK: G_STORE [[COPY]](p0), [[COPY]](p0) :: (store 8)
+    ; CHECK: [[PTRTOINT1:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY]](p0)
+    ; CHECK: G_STORE [[PTRTOINT1]](s64), [[COPY]](p0) :: (store 8, align 16)
+    ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C1]](s64)
+    ; CHECK: G_STORE [[PTRTOINT1]](s64), [[GEP]](p0) :: (store 8)
+    %0:_(p0) = COPY $x0
+    %1:_(s32) = COPY $w1
+    %2:_(s1) = G_TRUNC %1(s32)
+    G_STORE %2(s1), %0(p0) :: (store 1)
+    %3:_(s8) = G_TRUNC %1(s32)
+    G_STORE %3(s8), %0(p0) :: (store 1)
+    %4:_(s16) = G_TRUNC %1(s32)
+    G_STORE %4(s16), %0(p0) :: (store 2)
+    G_STORE %1(s32), %0(p0) :: (store 4)
+    %5:_(s64) = G_PTRTOINT %0(p0)
+    G_STORE %5(s64), %0(p0) :: (store 8)
+    G_STORE %0(p0), %0(p0) :: (store 8)
+    %6:_(s64) = G_PTRTOINT %0(p0)
+    %7:_(s128) = G_MERGE_VALUES %6(s64), %6
+    G_STORE %7(s128), %0(p0) :: (store 16)
 ...
-- 
GitLab


From 02558799ed8412c6c9a85153821ae97a4f4ea07e Mon Sep 17 00:00:00 2001
From: David Greene <greened@obbligato.org>
Date: Thu, 25 Oct 2018 21:10:39 +0000
Subject: [PATCH 0589/1116] [AArch64] Create proper memoperand for multi-vector
 stores

Include all of the store's source vector operands when creating the
MachineMemOperand. Previously, we were missing the first operand,
making the store size seem smaller than it really is.

Differential Revision: https://reviews.llvm.org/D52816


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345315 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64ISelLowering.cpp    |   2 +-
 .../AArch64/multi-vector-store-size.ll        | 164 ++++++++++++++++++
 2 files changed, 165 insertions(+), 1 deletion(-)
 create mode 100644 test/CodeGen/AArch64/multi-vector-store-size.ll

diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index a7a1b0a5feb..2a42d2db75d 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7972,7 +7972,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.opc = ISD::INTRINSIC_VOID;
     // Conservatively set memVT to the entire set of vectors stored.
     unsigned NumElts = 0;
-    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
+    for (unsigned ArgI = 0, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
       Type *ArgTy = I.getArgOperand(ArgI)->getType();
       if (!ArgTy->isVectorTy())
         break;
diff --git a/test/CodeGen/AArch64/multi-vector-store-size.ll b/test/CodeGen/AArch64/multi-vector-store-size.ll
new file mode 100644
index 00000000000..9627556168a
--- /dev/null
+++ b/test/CodeGen/AArch64/multi-vector-store-size.ll
@@ -0,0 +1,164 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -stop-after=isel < %s | FileCheck %s
+
+declare void @llvm.aarch64.neon.st2.v4f32.p0f32(<4 x float>, <4 x float>, float*)
+declare void @llvm.aarch64.neon.st3.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, float*)
+declare void @llvm.aarch64.neon.st4.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, float*)
+
+declare void @llvm.aarch64.neon.st1x2.v4f32.p0f32(<4 x float>, <4 x float>, float*)
+declare void @llvm.aarch64.neon.st1x3.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, float*)
+declare void @llvm.aarch64.neon.st1x4.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, float*)
+
+declare void @llvm.aarch64.neon.st2lane.v4f32.p0f32(<4 x float>, <4 x float>, i64, float*)
+declare void @llvm.aarch64.neon.st3lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, i64, float*)
+declare void @llvm.aarch64.neon.st4lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, i64, float*)
+
+define void @addstx(float* %res, <4 x float>* %a,  <4 x float>* %b, <4 x float>* %c, <4 x float>* %d) {
+  %al = load <4 x float>, <4 x float>* %a
+  %bl = load <4 x float>, <4 x float>* %b
+  %cl = load <4 x float>, <4 x float>* %c
+  %dl = load <4 x float>, <4 x float>* %d
+
+  %ar = fadd <4 x float> %al, %bl
+  %br = fadd <4 x float> %bl, %cl
+  %cr = fadd <4 x float> %cl, %dl
+  %dr = fadd <4 x float> %dl, %al
+
+; The sizes below are conservative.  AArch64TargetLowering
+; conservatively assumes the entire vector is stored.
+  tail call void @llvm.aarch64.neon.st2.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, float* %res)
+; CHECK: ST2Twov4s {{.*}} :: (store 32 {{.*}})
+  tail call void @llvm.aarch64.neon.st3.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, float* %res)
+; CHECK: ST3Threev4s {{.*}} :: (store 48 {{.*}})
+  tail call void @llvm.aarch64.neon.st4.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, <4 x float> %dr, float* %res)
+; CHECK: ST4Fourv4s {{.*}} :: (store 64 {{.*}})
+
+  ret void
+}
+
+define void @addst1x(float* %res, <4 x float>* %a,  <4 x float>* %b, <4 x float>* %c, <4 x float>* %d) {
+  %al = load <4 x float>, <4 x float>* %a
+  %bl = load <4 x float>, <4 x float>* %b
+  %cl = load <4 x float>, <4 x float>* %c
+  %dl = load <4 x float>, <4 x float>* %d
+
+  %ar = fadd <4 x float> %al, %bl
+  %br = fadd <4 x float> %bl, %cl
+  %cr = fadd <4 x float> %cl, %dl
+  %dr = fadd <4 x float> %dl, %al
+
+; The sizes below are conservative.  AArch64TargetLowering
+; conservatively assumes the entire vector is stored.
+  tail call void @llvm.aarch64.neon.st1x2.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, float* %res)
+; CHECK: ST1Twov4s {{.*}} :: (store 32 {{.*}})
+  tail call void @llvm.aarch64.neon.st1x3.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, float* %res)
+; CHECK: ST1Threev4s {{.*}} :: (store 48 {{.*}})
+  tail call void @llvm.aarch64.neon.st1x4.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, <4 x float> %dr, float* %res)
+; CHECK: ST1Fourv4s {{.*}} :: (store 64 {{.*}})
+
+  ret void
+}
+
+define void @addstxlane(float* %res, <4 x float>* %a,  <4 x float>* %b, <4 x float>* %c, <4 x float>* %d) {
+  %al = load <4 x float>, <4 x float>* %a
+  %bl = load <4 x float>, <4 x float>* %b
+  %cl = load <4 x float>, <4 x float>* %c
+  %dl = load <4 x float>, <4 x float>* %d
+
+  %ar = fadd <4 x float> %al, %bl
+  %br = fadd <4 x float> %bl, %cl
+  %cr = fadd <4 x float> %cl, %dl
+  %dr = fadd <4 x float> %dl, %al
+
+; The sizes below are conservative.  AArch64TargetLowering
+; conservatively assumes the entire vector is stored.
+  tail call void @llvm.aarch64.neon.st2lane.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, i64 1, float* %res)
+; CHECK: ST2i32 {{.*}} :: (store 32 {{.*}})
+  tail call void @llvm.aarch64.neon.st3lane.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, i64 1, float* %res)
+; CHECK: ST3i32 {{.*}} :: (store 48 {{.*}})
+  tail call void @llvm.aarch64.neon.st4lane.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, <4 x float> %dr, i64 1, float* %res)
+; CHECK: ST4i32 {{.*}} :: (store 64 {{.*}})
+
+  ret void
+}
+; RUN: llc -mtriple=aarch64-linux-gnu -stop-after=isel < %s | FileCheck %s
+
+declare void @llvm.aarch64.neon.st2.v4f32.p0f32(<4 x float>, <4 x float>, float*)
+declare void @llvm.aarch64.neon.st3.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, float*)
+declare void @llvm.aarch64.neon.st4.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, float*)
+
+declare void @llvm.aarch64.neon.st1x2.v4f32.p0f32(<4 x float>, <4 x float>, float*)
+declare void @llvm.aarch64.neon.st1x3.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, float*)
+declare void @llvm.aarch64.neon.st1x4.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, float*)
+
+declare void @llvm.aarch64.neon.st2lane.v4f32.p0f32(<4 x float>, <4 x float>, i64, float*)
+declare void @llvm.aarch64.neon.st3lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, i64, float*)
+declare void @llvm.aarch64.neon.st4lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, i64, float*)
+
+define void @addstx(float* %res, <4 x float>* %a,  <4 x float>* %b, <4 x float>* %c, <4 x float>* %d) {
+  %al = load <4 x float>, <4 x float>* %a
+  %bl = load <4 x float>, <4 x float>* %b
+  %cl = load <4 x float>, <4 x float>* %c
+  %dl = load <4 x float>, <4 x float>* %d
+
+  %ar = fadd <4 x float> %al, %bl
+  %br = fadd <4 x float> %bl, %cl
+  %cr = fadd <4 x float> %cl, %dl
+  %dr = fadd <4 x float> %dl, %al
+
+; The sizes below are conservative.  AArch64TargetLowering
+; conservatively assumes the entiew vector is stored.
+  tail call void @llvm.aarch64.neon.st2.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, float* %res)
+; CHECK: ST2Twov4s {{.*}} :: (store 32 {{.*}})
+  tail call void @llvm.aarch64.neon.st3.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, float* %res)
+; CHECK: ST3Threev4s {{.*}} :: (store 48 {{.*}})
+  tail call void @llvm.aarch64.neon.st4.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, <4 x float> %dr, float* %res)
+; CHECK: ST4Fourv4s {{.*}} :: (store 64 {{.*}})
+
+  ret void
+}
+
+define void @addst1x(float* %res, <4 x float>* %a,  <4 x float>* %b, <4 x float>* %c, <4 x float>* %d) {
+  %al = load <4 x float>, <4 x float>* %a
+  %bl = load <4 x float>, <4 x float>* %b
+  %cl = load <4 x float>, <4 x float>* %c
+  %dl = load <4 x float>, <4 x float>* %d
+
+  %ar = fadd <4 x float> %al, %bl
+  %br = fadd <4 x float> %bl, %cl
+  %cr = fadd <4 x float> %cl, %dl
+  %dr = fadd <4 x float> %dl, %al
+
+; The sizes below are conservative.  AArch64TargetLowering
+; conservatively assumes the entiew vector is stored.
+  tail call void @llvm.aarch64.neon.st1x2.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, float* %res)
+; CHECK: ST1Twov4s {{.*}} :: (store 32 {{.*}})
+  tail call void @llvm.aarch64.neon.st1x3.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, float* %res)
+; CHECK: ST1Threev4s {{.*}} :: (store 48 {{.*}})
+  tail call void @llvm.aarch64.neon.st1x4.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, <4 x float> %dr, float* %res)
+; CHECK: ST1Fourv4s {{.*}} :: (store 64 {{.*}})
+
+  ret void
+}
+
+define void @addstxlane(float* %res, <4 x float>* %a,  <4 x float>* %b, <4 x float>* %c, <4 x float>* %d) {
+  %al = load <4 x float>, <4 x float>* %a
+  %bl = load <4 x float>, <4 x float>* %b
+  %cl = load <4 x float>, <4 x float>* %c
+  %dl = load <4 x float>, <4 x float>* %d
+
+  %ar = fadd <4 x float> %al, %bl
+  %br = fadd <4 x float> %bl, %cl
+  %cr = fadd <4 x float> %cl, %dl
+  %dr = fadd <4 x float> %dl, %al
+
+; The sizes below are conservative.  AArch64TargetLowering
+; conservatively assumes the entiew vector is stored.
+  tail call void @llvm.aarch64.neon.st2lane.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, i64 1, float* %res)
+; CHECK: ST2i32 {{.*}} :: (store 32 {{.*}})
+  tail call void @llvm.aarch64.neon.st3lane.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, i64 1, float* %res)
+; CHECK: ST3i32 {{.*}} :: (store 48 {{.*}})
+  tail call void @llvm.aarch64.neon.st4lane.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, <4 x float> %dr, i64 1, float* %res)
+; CHECK: ST4i32 {{.*}} :: (store 64 {{.*}})
+
+  ret void
+}
-- 
GitLab


From 4619b40840bace6251f8af97c794aeb6144adbb1 Mon Sep 17 00:00:00 2001
From: Francis Visoiu Mistrih <francisvm@yahoo.com>
Date: Thu, 25 Oct 2018 21:12:15 +0000
Subject: [PATCH 0590/1116] [CodeGen] Remove operands from FENTRY_CALL

FENTRY_CALL is actually not taking any input / output operands. The
machine verifier complains now because the target description says that:

* It needs 1 unknown output
* It needs 1 or more variable inputs

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345316 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Target/Target.td        | 4 ++--
 test/CodeGen/X86/fentry-insertion.ll | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/llvm/Target/Target.td b/include/llvm/Target/Target.td
index c2c56b0aca1..96641dda700 100644
--- a/include/llvm/Target/Target.td
+++ b/include/llvm/Target/Target.td
@@ -1164,8 +1164,8 @@ def PATCHABLE_TYPED_EVENT_CALL : StandardPseudoInstruction {
   let hasSideEffects = 1;
 }
 def FENTRY_CALL : StandardPseudoInstruction {
-  let OutOperandList = (outs unknown:$dst);
-  let InOperandList = (ins variable_ops);
+  let OutOperandList = (outs);
+  let InOperandList = (ins);
   let AsmString = "# FEntry call";
   let usesCustomInserter = 1;
   let mayLoad = 1;
diff --git a/test/CodeGen/X86/fentry-insertion.ll b/test/CodeGen/X86/fentry-insertion.ll
index c5fb3b254b2..56e32742c59 100644
--- a/test/CodeGen/X86/fentry-insertion.ll
+++ b/test/CodeGen/X86/fentry-insertion.ll
@@ -1,4 +1,4 @@
-; RUN: llc %s -o - | FileCheck %s
+; RUN: llc %s -o - -verify-machineinstrs | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-- 
GitLab


From b23940c35851fb9511472fd9f086034723c4d533 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Thu, 25 Oct 2018 21:16:06 +0000
Subject: [PATCH 0591/1116] [X86] Change X86 backend to look for
 'min-legal-vector-width' attribute instead of 'required-vector-width' when
 determining whether 512-bit vectors should be legal.

The required-vector-width attribute was only used for backend testing and has never been generated by clang.

I believe clang is now generating min-legal-vector-width for vector uses in user code.

With this I believe passing -mprefer-vector-width=256 to clang should prevent use of zmm registers in the generated assembly unless the user used a 512-bit intrinsic in their source code.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345317 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86TargetMachine.cpp           |  9 ++--
 ...tor-width.ll => min-legal-vector-width.ll} | 48 +++++++++----------
 2 files changed, 29 insertions(+), 28 deletions(-)
 rename test/CodeGen/X86/{required-vector-width.ll => min-legal-vector-width.ll} (95%)

diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 812b8b28ebd..3583a9cfb8d 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -295,13 +295,14 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const {
     }
   }
 
-  // Extract required-vector-width attribute.
+  // Extract min-legal-vector-width attribute.
   unsigned RequiredVectorWidth = UINT32_MAX;
-  if (F.hasFnAttribute("required-vector-width")) {
-    StringRef Val = F.getFnAttribute("required-vector-width").getValueAsString();
+  if (F.hasFnAttribute("min-legal-vector-width")) {
+    StringRef Val =
+        F.getFnAttribute("min-legal-vector-width").getValueAsString();
     unsigned Width;
     if (!Val.getAsInteger(0, Width)) {
-      Key += ",required-vector-width=";
+      Key += ",min-legal-vector-width=";
       Key += Val;
       RequiredVectorWidth = Width;
     }
diff --git a/test/CodeGen/X86/required-vector-width.ll b/test/CodeGen/X86/min-legal-vector-width.ll
similarity index 95%
rename from test/CodeGen/X86/required-vector-width.ll
rename to test/CodeGen/X86/min-legal-vector-width.ll
index 6693e3c67a5..5e5d74defe4 100644
--- a/test/CodeGen/X86/required-vector-width.ll
+++ b/test/CodeGen/X86/min-legal-vector-width.ll
@@ -3,7 +3,7 @@
 
 ; This file primarily contains tests for specific places in X86ISelLowering.cpp that needed be made aware of the legalizer not allowing 512-bit vectors due to prefer-256-bit even though AVX512 is enabled.
 
-define void @add256(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) "required-vector-width"="256" {
+define void @add256(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) "min-legal-vector-width"="256" {
 ; CHECK-LABEL: add256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
@@ -21,7 +21,7 @@ define void @add256(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) "required-ve
   ret void
 }
 
-define void @add512(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) "required-vector-width"="512" {
+define void @add512(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) "min-legal-vector-width"="512" {
 ; CHECK-LABEL: add512:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
@@ -36,7 +36,7 @@ define void @add512(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) "required-ve
   ret void
 }
 
-define void @avg_v64i8_256(<64 x i8>* %a, <64 x i8>* %b) "required-vector-width"="256" {
+define void @avg_v64i8_256(<64 x i8>* %a, <64 x i8>* %b) "min-legal-vector-width"="256" {
 ; CHECK-LABEL: avg_v64i8_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa (%rsi), %ymm0
@@ -60,7 +60,7 @@ define void @avg_v64i8_256(<64 x i8>* %a, <64 x i8>* %b) "required-vector-width"
 }
 
 
-define void @avg_v64i8_512(<64 x i8>* %a, <64 x i8>* %b) "required-vector-width"="512" {
+define void @avg_v64i8_512(<64 x i8>* %a, <64 x i8>* %b) "min-legal-vector-width"="512" {
 ; CHECK-LABEL: avg_v64i8_512:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm0
@@ -80,7 +80,7 @@ define void @avg_v64i8_512(<64 x i8>* %a, <64 x i8>* %b) "required-vector-width"
   ret void
 }
 
-define void @pmaddwd_32_256(<32 x i16>* %APtr, <32 x i16>* %BPtr, <16 x i32>* %CPtr) "required-vector-width"="256" {
+define void @pmaddwd_32_256(<32 x i16>* %APtr, <32 x i16>* %BPtr, <16 x i32>* %CPtr) "min-legal-vector-width"="256" {
 ; CHECK-LABEL: pmaddwd_32_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
@@ -103,7 +103,7 @@ define void @pmaddwd_32_256(<32 x i16>* %APtr, <32 x i16>* %BPtr, <16 x i32>* %C
    ret void
 }
 
-define void @pmaddwd_32_512(<32 x i16>* %APtr, <32 x i16>* %BPtr, <16 x i32>* %CPtr) "required-vector-width"="512" {
+define void @pmaddwd_32_512(<32 x i16>* %APtr, <32 x i16>* %BPtr, <16 x i32>* %CPtr) "min-legal-vector-width"="512" {
 ; CHECK-LABEL: pmaddwd_32_512:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
@@ -123,7 +123,7 @@ define void @pmaddwd_32_512(<32 x i16>* %APtr, <32 x i16>* %BPtr, <16 x i32>* %C
    ret void
 }
 
-define void @psubus_64i8_max_256(<64 x i8>* %xptr, <64 x i8>* %yptr, <64 x i8>* %zptr) "required-vector-width"="256" {
+define void @psubus_64i8_max_256(<64 x i8>* %xptr, <64 x i8>* %yptr, <64 x i8>* %zptr) "min-legal-vector-width"="256" {
 ; CHECK-LABEL: psubus_64i8_max_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
@@ -143,7 +143,7 @@ define void @psubus_64i8_max_256(<64 x i8>* %xptr, <64 x i8>* %yptr, <64 x i8>*
   ret void
 }
 
-define void @psubus_64i8_max_512(<64 x i8>* %xptr, <64 x i8>* %yptr, <64 x i8>* %zptr) "required-vector-width"="512" {
+define void @psubus_64i8_max_512(<64 x i8>* %xptr, <64 x i8>* %yptr, <64 x i8>* %zptr) "min-legal-vector-width"="512" {
 ; CHECK-LABEL: psubus_64i8_max_512:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
@@ -160,7 +160,7 @@ define void @psubus_64i8_max_512(<64 x i8>* %xptr, <64 x i8>* %yptr, <64 x i8>*
   ret void
 }
 
-define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly, i32) "required-vector-width"="256" {
+define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly, i32) "min-legal-vector-width"="256" {
 ; CHECK-LABEL: _Z9test_charPcS_i_256:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movl %edx, %eax
@@ -231,7 +231,7 @@ middle.block:
   ret i32 %13
 }
 
-define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly, i32) "required-vector-width"="512" {
+define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly, i32) "min-legal-vector-width"="512" {
 ; CHECK-LABEL: _Z9test_charPcS_i_512:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movl %edx, %eax
@@ -300,7 +300,7 @@ middle.block:
 @a = global [1024 x i8] zeroinitializer, align 16
 @b = global [1024 x i8] zeroinitializer, align 16
 
-define i32 @sad_16i8_256() "required-vector-width"="256" {
+define i32 @sad_16i8_256() "min-legal-vector-width"="256" {
 ; CHECK-LABEL: sad_16i8_256:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
@@ -362,7 +362,7 @@ middle.block:
   ret i32 %12
 }
 
-define i32 @sad_16i8_512() "required-vector-width"="512" {
+define i32 @sad_16i8_512() "min-legal-vector-width"="512" {
 ; CHECK-LABEL: sad_16i8_512:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
@@ -424,7 +424,7 @@ middle.block:
   ret i32 %12
 }
 
-define void @sbto16f32_256(<16 x i16> %a, <16 x float>* %res) "required-vector-width"="256" {
+define void @sbto16f32_256(<16 x i16> %a, <16 x float>* %res) "min-legal-vector-width"="256" {
 ; CHECK-LABEL: sbto16f32_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpmovw2m %ymm0, %k0
@@ -443,7 +443,7 @@ define void @sbto16f32_256(<16 x i16> %a, <16 x float>* %res) "required-vector-w
   ret void
 }
 
-define void @sbto16f32_512(<16 x i16> %a, <16 x float>* %res) "required-vector-width"="512" {
+define void @sbto16f32_512(<16 x i16> %a, <16 x float>* %res) "min-legal-vector-width"="512" {
 ; CHECK-LABEL: sbto16f32_512:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpmovw2m %ymm0, %k0
@@ -458,7 +458,7 @@ define void @sbto16f32_512(<16 x i16> %a, <16 x float>* %res) "required-vector-w
   ret void
 }
 
-define void @sbto16f64_256(<16 x i16> %a, <16 x double>* %res)  "required-vector-width"="256" {
+define void @sbto16f64_256(<16 x i16> %a, <16 x double>* %res)  "min-legal-vector-width"="256" {
 ; CHECK-LABEL: sbto16f64_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpmovw2m %ymm0, %k0
@@ -483,7 +483,7 @@ define void @sbto16f64_256(<16 x i16> %a, <16 x double>* %res)  "required-vector
   ret void
 }
 
-define void @sbto16f64_512(<16 x i16> %a, <16 x double>* %res)  "required-vector-width"="512" {
+define void @sbto16f64_512(<16 x i16> %a, <16 x double>* %res)  "min-legal-vector-width"="512" {
 ; CHECK-LABEL: sbto16f64_512:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpmovw2m %ymm0, %k0
@@ -501,7 +501,7 @@ define void @sbto16f64_512(<16 x i16> %a, <16 x double>* %res)  "required-vector
   ret void
 }
 
-define void @ubto16f32_256(<16 x i16> %a, <16 x float>* %res) "required-vector-width"="256" {
+define void @ubto16f32_256(<16 x i16> %a, <16 x float>* %res) "min-legal-vector-width"="256" {
 ; CHECK-LABEL: ubto16f32_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpmovw2m %ymm0, %k0
@@ -522,7 +522,7 @@ define void @ubto16f32_256(<16 x i16> %a, <16 x float>* %res) "required-vector-w
   ret void
 }
 
-define void @ubto16f32_512(<16 x i16> %a, <16 x float>* %res) "required-vector-width"="512" {
+define void @ubto16f32_512(<16 x i16> %a, <16 x float>* %res) "min-legal-vector-width"="512" {
 ; CHECK-LABEL: ubto16f32_512:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpmovw2m %ymm0, %k0
@@ -538,7 +538,7 @@ define void @ubto16f32_512(<16 x i16> %a, <16 x float>* %res) "required-vector-w
   ret void
 }
 
-define void @ubto16f64_256(<16 x i16> %a, <16 x double>* %res) "required-vector-width"="256" {
+define void @ubto16f64_256(<16 x i16> %a, <16 x double>* %res) "min-legal-vector-width"="256" {
 ; CHECK-LABEL: ubto16f64_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpmovw2m %ymm0, %k0
@@ -565,7 +565,7 @@ define void @ubto16f64_256(<16 x i16> %a, <16 x double>* %res) "required-vector-
   ret void
 }
 
-define void @ubto16f64_512(<16 x i16> %a, <16 x double>* %res) "required-vector-width"="512" {
+define void @ubto16f64_512(<16 x i16> %a, <16 x double>* %res) "min-legal-vector-width"="512" {
 ; CHECK-LABEL: ubto16f64_512:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpmovw2m %ymm0, %k0
@@ -584,7 +584,7 @@ define void @ubto16f64_512(<16 x i16> %a, <16 x double>* %res) "required-vector-
   ret void
 }
 
-define <16 x i16> @test_16f32toub_256(<16 x float>* %ptr, <16 x i16> %passthru) "required-vector-width"="256" {
+define <16 x i16> @test_16f32toub_256(<16 x float>* %ptr, <16 x i16> %passthru) "min-legal-vector-width"="256" {
 ; CHECK-LABEL: test_16f32toub_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vcvttps2dq (%rdi), %ymm1
@@ -602,7 +602,7 @@ define <16 x i16> @test_16f32toub_256(<16 x float>* %ptr, <16 x i16> %passthru)
   ret <16 x i16> %select
 }
 
-define <16 x i16> @test_16f32toub_512(<16 x float>* %ptr, <16 x i16> %passthru) "required-vector-width"="512" {
+define <16 x i16> @test_16f32toub_512(<16 x float>* %ptr, <16 x i16> %passthru) "min-legal-vector-width"="512" {
 ; CHECK-LABEL: test_16f32toub_512:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vcvttps2dq (%rdi), %zmm1
@@ -616,7 +616,7 @@ define <16 x i16> @test_16f32toub_512(<16 x float>* %ptr, <16 x i16> %passthru)
   ret <16 x i16> %select
 }
 
-define <16 x i16> @test_16f32tosb_256(<16 x float>* %ptr, <16 x i16> %passthru) "required-vector-width"="256" {
+define <16 x i16> @test_16f32tosb_256(<16 x float>* %ptr, <16 x i16> %passthru) "min-legal-vector-width"="256" {
 ; CHECK-LABEL: test_16f32tosb_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vcvttps2dq (%rdi), %ymm1
@@ -634,7 +634,7 @@ define <16 x i16> @test_16f32tosb_256(<16 x float>* %ptr, <16 x i16> %passthru)
   ret <16 x i16> %select
 }
 
-define <16 x i16> @test_16f32tosb_512(<16 x float>* %ptr, <16 x i16> %passthru) "required-vector-width"="512" {
+define <16 x i16> @test_16f32tosb_512(<16 x float>* %ptr, <16 x i16> %passthru) "min-legal-vector-width"="512" {
 ; CHECK-LABEL: test_16f32tosb_512:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vcvttps2dq (%rdi), %zmm1
-- 
GitLab


From e87a916f684a369fd0f7c8eaa80dca92dd5b0ad1 Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni <sgundapa@codeaurora.org>
Date: Thu, 25 Oct 2018 21:25:30 +0000
Subject: [PATCH 0592/1116] [Pipeliner] Remove the unneeded include
 header(NFC).

Differential Revision: https://reviews.llvm.org/D53451


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345318 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/MachinePipeliner.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/lib/CodeGen/MachinePipeliner.cpp b/lib/CodeGen/MachinePipeliner.cpp
index a341aac227a..e3166f60f3d 100644
--- a/lib/CodeGen/MachinePipeliner.cpp
+++ b/lib/CodeGen/MachinePipeliner.cpp
@@ -102,7 +102,6 @@
 #include "llvm/MC/MCInstrItineraries.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
-- 
GitLab


From 4b57585e55dcdf71e11adb0fd71a26a2828808a7 Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni <sgundapa@codeaurora.org>
Date: Thu, 25 Oct 2018 21:27:08 +0000
Subject: [PATCH 0593/1116] [Pipeliner] Ignore Artificial dependences while
 computing recurrences.

The artificial dependencies are not real dependencies. In some cases, they
form circuits with bigger MII. However, they are used to schedule instructions
better.

Differential Revision: https://reviews.llvm.org/D53450


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345319 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/MachinePipeliner.cpp         |   6 +-
 test/CodeGen/Hexagon/swp-art-deps-rec.ll | 107 +++++++++++++++++++++++
 2 files changed, 110 insertions(+), 3 deletions(-)
 create mode 100644 test/CodeGen/Hexagon/swp-art-deps-rec.ll

diff --git a/lib/CodeGen/MachinePipeliner.cpp b/lib/CodeGen/MachinePipeliner.cpp
index e3166f60f3d..bb5fc664c5f 100644
--- a/lib/CodeGen/MachinePipeliner.cpp
+++ b/lib/CodeGen/MachinePipeliner.cpp
@@ -1530,9 +1530,9 @@ void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
         }
         OutputDeps[N] = BackEdge;
       }
-      // Do not process a boundary node and a back-edge is processed only
-      // if it goes to a Phi.
-      if (SI.getSUnit()->isBoundaryNode() ||
+      // Do not process a boundary node, an artificial node.
+      // A back-edge is processed only if it goes to a Phi.
+      if (SI.getSUnit()->isBoundaryNode() || SI.isArtificial() ||
           (SI.getKind() == SDep::Anti && !SI.getSUnit()->getInstr()->isPHI()))
         continue;
       int N = SI.getSUnit()->NodeNum;
diff --git a/test/CodeGen/Hexagon/swp-art-deps-rec.ll b/test/CodeGen/Hexagon/swp-art-deps-rec.ll
new file mode 100644
index 00000000000..941a8b8a3f9
--- /dev/null
+++ b/test/CodeGen/Hexagon/swp-art-deps-rec.ll
@@ -0,0 +1,107 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv65 -O3 -debug-only=pipeliner \
+; RUN: < %s 2>&1 | FileCheck %s
+
+; Test that the artificial dependences are ignored while computing the
+; circuits.
+
+; The recurrence should be 1 here. If we do not ignore artificial deps,
+; it will be greater.
+; CHECK: rec=1,
+
+define void @foo(i32 %size) #0 {
+entry:
+  %add = add nsw i32 0, 4
+  %shr = ashr i32 %size, 1
+  br i1 undef, label %L57.us, label %L57.us.ur
+
+L57.us:
+  %R9.0470.us = phi i32 [ %sub40.us.3, %L57.us ], [ undef, %entry ]
+  %sub40.us.3 = add i32 %R9.0470.us, -64
+  br i1 undef, label %L57.us, label %for.cond22.for.end_crit_edge.us.ur-lcssa
+
+for.cond22.for.end_crit_edge.us.ur-lcssa:
+  %inc.us.3.lcssa = phi i32 [ undef, %L57.us ]
+  %sub40.us.3.lcssa = phi i32 [ %sub40.us.3, %L57.us ]
+  %0 = icmp eq i32 %inc.us.3.lcssa, %shr
+  br i1 %0, label %for.cond22.for.end_crit_edge.us, label %L57.us.ur
+
+L57.us.ur:
+  %R15_14.0478.us.ur = phi i64 [ %1, %L57.us.ur ], [ 0, %entry ], [ undef, %for.cond22.for.end_crit_edge.us.ur-lcssa ]
+  %R13_12.0477.us.ur = phi i64 [ %14, %L57.us.ur ], [ 0, %entry ], [ undef, %for.cond22.for.end_crit_edge.us.ur-lcssa ]
+  %R11_10.0476.us.ur = phi i64 [ %8, %L57.us.ur ], [ 0, %entry ], [ undef, %for.cond22.for.end_crit_edge.us.ur-lcssa ]
+  %R7_6.0475.us.ur = phi i64 [ %7, %L57.us.ur ], [ 0, %entry ], [ undef, %for.cond22.for.end_crit_edge.us.ur-lcssa ]
+  %R5_4.2474.us.ur = phi i64 [ %16, %L57.us.ur ], [ undef, %entry ], [ undef, %for.cond22.for.end_crit_edge.us.ur-lcssa ]
+  %R3_2.0473.us.ur = phi i64 [ %9, %L57.us.ur ], [ 0, %entry ], [ undef, %for.cond22.for.end_crit_edge.us.ur-lcssa ]
+  %R1_0.0472.us.ur = phi i64 [ %15, %L57.us.ur ], [ undef, %entry ], [ undef, %for.cond22.for.end_crit_edge.us.ur-lcssa ]
+  %kk.0471.us.ur = phi i32 [ %inc.us.ur, %L57.us.ur ], [ 0, %entry ], [ %inc.us.3.lcssa, %for.cond22.for.end_crit_edge.us.ur-lcssa ]
+  %R9.0470.us.ur = phi i32 [ %sub40.us.ur, %L57.us.ur ], [ undef, %entry ], [ %sub40.us.3.lcssa, %for.cond22.for.end_crit_edge.us.ur-lcssa ]
+  %R8.0469.us.ur = phi i32 [ %sub34.us.ur, %L57.us.ur ], [ undef, %entry ], [ undef, %for.cond22.for.end_crit_edge.us.ur-lcssa ]
+  %1 = tail call i64 @llvm.hexagon.M2.vdmacs.s0(i64 %R15_14.0478.us.ur, i64 %R1_0.0472.us.ur, i64 %R3_2.0473.us.ur)
+  %2 = tail call i64 @llvm.hexagon.S2.shuffeh(i64 %R5_4.2474.us.ur, i64 %R7_6.0475.us.ur)
+  %3 = inttoptr i32 %R9.0470.us.ur to i16*
+  %4 = load i16, i16* %3, align 2
+  %conv27.us.ur = sext i16 %4 to i32
+  %sub28.us.ur = add i32 %R9.0470.us.ur, -8
+  %5 = inttoptr i32 %R8.0469.us.ur to i16*
+  %6 = load i16, i16* %5, align 2
+  %conv30.us.ur = sext i16 %6 to i32
+  %sub31.us.ur = add i32 %R8.0469.us.ur, -8
+  %7 = tail call i64 @llvm.hexagon.A2.combinew(i32 %conv27.us.ur, i32 %conv30.us.ur)
+  %8 = tail call i64 @llvm.hexagon.M2.vdmacs.s0(i64 %R11_10.0476.us.ur, i64 %R1_0.0472.us.ur, i64 %2)
+  %9 = tail call i64 @llvm.hexagon.S2.shuffeh(i64 %7, i64 %R5_4.2474.us.ur)
+  %10 = inttoptr i32 %sub31.us.ur to i16*
+  %11 = load i16, i16* %10, align 2
+  %conv33.us.ur = sext i16 %11 to i32
+  %sub34.us.ur = add i32 %R8.0469.us.ur, -16
+  %conv35.us.ur = trunc i64 %9 to i32
+  %12 = inttoptr i32 %sub28.us.ur to i16*
+  %13 = load i16, i16* %12, align 2
+  %conv39.us.ur = sext i16 %13 to i32
+  %sub40.us.ur = add i32 %R9.0470.us.ur, -16
+  %14 = tail call i64 @llvm.hexagon.M2.vdmacs.s0(i64 %R13_12.0477.us.ur, i64 %R1_0.0472.us.ur, i64 %9)
+  %15 = tail call i64 @llvm.hexagon.A2.combinew(i32 %conv35.us.ur, i32 undef)
+  %16 = tail call i64 @llvm.hexagon.A2.combinew(i32 %conv39.us.ur, i32 %conv33.us.ur)
+  %inc.us.ur = add nsw i32 %kk.0471.us.ur, 1
+  %exitcond535.ur = icmp eq i32 %inc.us.ur, %shr
+  br i1 %exitcond535.ur, label %for.cond22.for.end_crit_edge.us.ur-lcssa572, label %L57.us.ur
+
+for.cond22.for.end_crit_edge.us.ur-lcssa572:
+  %.lcssa730 = phi i64 [ %14, %L57.us.ur ]
+  %.lcssa729 = phi i64 [ %8, %L57.us.ur ]
+  %.lcssa728 = phi i64 [ %1, %L57.us.ur ]
+  %extract.t652 = trunc i64 %.lcssa730 to i32
+  %extract661 = lshr i64 %.lcssa729, 32
+  %extract.t662 = trunc i64 %extract661 to i32
+  %extract.t664 = trunc i64 %.lcssa728 to i32
+  br label %for.cond22.for.end_crit_edge.us
+
+for.cond22.for.end_crit_edge.us:
+  %.lcssa551.off0 = phi i32 [ undef, %for.cond22.for.end_crit_edge.us.ur-lcssa ], [ %extract.t652, %for.cond22.for.end_crit_edge.us.ur-lcssa572 ]
+  %.lcssa550.off32 = phi i32 [ undef, %for.cond22.for.end_crit_edge.us.ur-lcssa ], [ %extract.t662, %for.cond22.for.end_crit_edge.us.ur-lcssa572 ]
+  %.lcssa549.off0 = phi i32 [ undef, %for.cond22.for.end_crit_edge.us.ur-lcssa ], [ %extract.t664, %for.cond22.for.end_crit_edge.us.ur-lcssa572 ]
+  %17 = inttoptr i32 %add to i32*
+  store i32 %.lcssa549.off0, i32* %17, align 4
+  %add.ptr61.us = getelementptr inbounds i8, i8* null, i32 32
+  %18 = bitcast i8* %add.ptr61.us to i32*
+  store i32 %.lcssa551.off0, i32* %18, align 4
+  %19 = bitcast i8* undef to i32*
+  store i32 %.lcssa550.off32, i32* %19, align 4
+  call void @llvm.trap()
+  unreachable
+}
+
+; Function Attrs: nounwind readnone
+declare i64 @llvm.hexagon.A2.combinew(i32, i32) #1
+
+; Function Attrs: nounwind readnone
+declare i64 @llvm.hexagon.M2.vdmacs.s0(i64, i64, i64) #1
+
+; Function Attrs: nounwind readnone
+declare i64 @llvm.hexagon.S2.shuffeh(i64, i64) #1
+
+; Function Attrs: noreturn nounwind
+declare void @llvm.trap() #2
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { noreturn nounwind }
-- 
GitLab


From 441900b083e3f315dac3a05b4cfa52b7e41c3d12 Mon Sep 17 00:00:00 2001
From: David Blaikie <dblaikie@gmail.com>
Date: Thu, 25 Oct 2018 21:35:59 +0000
Subject: [PATCH 0594/1116] llvm-dwarfdump: loclists: Don't expect an (albeit
 empty) expression for LLE_base_address

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345320 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/DebugInfo/DWARF/DWARFDebugLoc.cpp            | 14 ++++++++------
 test/DebugInfo/X86/dwarfdump-debug-loclists.test |  3 +--
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp b/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
index 044a0243360..9146b457a5d 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
@@ -183,12 +183,14 @@ DWARFDebugLoclists::parseOneLocationList(DataExtractor Data, unsigned *Offset,
       return None;
     }
 
-    unsigned Bytes = Data.getU16(Offset);
-    // A single location description describing the location of the object...
-    StringRef str = Data.getData().substr(*Offset, Bytes);
-    *Offset += Bytes;
-    E.Loc.resize(str.size());
-    std::copy(str.begin(), str.end(), E.Loc.begin());
+    if (Kind != dwarf::DW_LLE_base_address) {
+      unsigned Bytes = Data.getU16(Offset);
+      // A single location description describing the location of the object...
+      StringRef str = Data.getData().substr(*Offset, Bytes);
+      *Offset += Bytes;
+      E.Loc.resize(str.size());
+      std::copy(str.begin(), str.end(), E.Loc.begin());
+    }
 
     LL.Entries.push_back(std::move(E));
   }
diff --git a/test/DebugInfo/X86/dwarfdump-debug-loclists.test b/test/DebugInfo/X86/dwarfdump-debug-loclists.test
index e5f7fb0c1c1..669607fe557 100644
--- a/test/DebugInfo/X86/dwarfdump-debug-loclists.test
+++ b/test/DebugInfo/X86/dwarfdump-debug-loclists.test
@@ -9,7 +9,7 @@
 # CHECK-NEXT:    [0x0000000000000700, 0x0000000000000710): DW_OP_breg5 RDI+0
 
 # CHECK:      .debug_loclists contents:
-# CHECK-NEXT: 0x00000000: locations list header: length = 0x00000031, version = 0x0005, addr_size = 0x08, seg_size = 0x00, offset_entry_count = 0x00000000
+# CHECK-NEXT: 0x00000000: locations list header: length = 0x0000002f, version = 0x0005, addr_size = 0x08, seg_size = 0x00, offset_entry_count = 0x00000000
 # CHECK-NEXT: 0x00000000:
 # CHECK-NEXT:   [0x0000000000000000, 0x0000000000000010): DW_OP_breg5 RDI+0
 # CHECK-NEXT:   [0x0000000000000530, 0x0000000000000540): DW_OP_breg6 RBP-8, DW_OP_deref
@@ -43,7 +43,6 @@
   
   .byte  6                       # DW_LLE_base_address
   .quad  0x500                   # Some address
-  .short  0                      # Loc expr size = 0.
   
   .byte  4                       # DW_LLE_offset_pair
   .uleb128 0x30                  #   starting offset
-- 
GitLab


From ff3c72bc7c5cb096cc570f3b7d21f9b50501d239 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulsson@linux.vnet.ibm.com>
Date: Thu, 25 Oct 2018 21:47:22 +0000
Subject: [PATCH 0595/1116] [SystemZ]  Improve handling and cost estimates of
 vector integer div/rem

Enable the DAG optimization that converts vector div/rem with constants into
multiply+shifts sequences by expanding them early. This is needed since
ISD::SMUL_LOHI is 'Custom' lowered on SystemZ, and will therefore not be
available to BuildSDIV after legalization.

Better cost values for these instructions based on how they will be
implemented (a constant divisor is cheaper).

Review: Ulrich Weigand
https://reviews.llvm.org/D53196

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345321 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/SystemZ/SystemZISelLowering.cpp    |  25 ++
 lib/Target/SystemZ/SystemZISelLowering.h      |   1 +
 .../SystemZ/SystemZTargetTransformInfo.cpp    |  82 ++--
 test/Analysis/CostModel/SystemZ/div-pow2.ll   | 154 -------
 .../CostModel/SystemZ/divrem-const.ll         | 291 +++++++++++++
 .../Analysis/CostModel/SystemZ/divrem-pow2.ll | 383 ++++++++++++++++++
 test/Analysis/CostModel/SystemZ/divrem-reg.ll | 286 +++++++++++++
 test/Analysis/CostModel/SystemZ/int-arith.ll  | 187 ---------
 .../SystemZ/memop-folding-int-arith.ll        |  28 +-
 9 files changed, 1043 insertions(+), 394 deletions(-)
 delete mode 100644 test/Analysis/CostModel/SystemZ/div-pow2.ll
 create mode 100644 test/Analysis/CostModel/SystemZ/divrem-const.ll
 create mode 100644 test/Analysis/CostModel/SystemZ/divrem-pow2.ll
 create mode 100644 test/Analysis/CostModel/SystemZ/divrem-reg.ll

diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index 53cd21c4236..d86737e2192 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -527,6 +527,10 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
   setTargetDAGCombine(ISD::FP_ROUND);
   setTargetDAGCombine(ISD::BSWAP);
+  setTargetDAGCombine(ISD::SDIV);
+  setTargetDAGCombine(ISD::UDIV);
+  setTargetDAGCombine(ISD::SREM);
+  setTargetDAGCombine(ISD::UREM);
 
   // Handle intrinsics.
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
@@ -5664,6 +5668,23 @@ SDValue SystemZTargetLowering::combineGET_CCMASK(
   return Select->getOperand(4);
 }
 
+SDValue SystemZTargetLowering::combineIntDIVREM(
+    SDNode *N, DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+  // In the case where the divisor is a vector of constants a cheaper
+  // sequence of instructions can replace the divide. BuildSDIV is called to
+  // do this during DAG combining, but it only succeeds when it can build a
+  // multiplication node. The only option for SystemZ is ISD::SMUL_LOHI, and
+  // since it is not Legal but Custom it can only happen before
+  // legalization. Therefore we must scalarize this early before Combine
+  // 1. For widened vectors, this is already the result of type legalization.
+  if (VT.isVector() && isTypeLegal(VT) &&
+      DAG.isConstantIntBuildVectorOrConstantInt(N->getOperand(1)))
+    return DAG.UnrollVectorOp(N);
+  return SDValue();
+}
+
 SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
   switch(N->getOpcode()) {
@@ -5681,6 +5702,10 @@ SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
   case SystemZISD::BR_CCMASK:   return combineBR_CCMASK(N, DCI);
   case SystemZISD::SELECT_CCMASK: return combineSELECT_CCMASK(N, DCI);
   case SystemZISD::GET_CCMASK:  return combineGET_CCMASK(N, DCI);
+  case ISD::SDIV:
+  case ISD::UDIV:
+  case ISD::SREM:
+  case ISD::UREM:               return combineIntDIVREM(N, DCI);
   }
 
   return SDValue();
diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h
index 267e31a8521..4b6be9bff0a 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/lib/Target/SystemZ/SystemZISelLowering.h
@@ -605,6 +605,7 @@ private:
   SDValue combineBR_CCMASK(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue combineSELECT_CCMASK(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue combineGET_CCMASK(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue combineIntDIVREM(SDNode *N, DAGCombinerInfo &DCI) const;
 
   // If the last instruction before MBBI in MBB was some form of COMPARE,
   // try to replace it with a COMPARE AND BRANCH just before MBBI.
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 1eaeb9699bf..f52c9ca6e49 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -362,27 +362,33 @@ int SystemZTTIImpl::getArithmeticInstrCost(
 
   unsigned ScalarBits = Ty->getScalarSizeInBits();
 
-  // Div with a constant which is a power of 2 will be converted by
-  // DAGCombiner to use shifts. With vector shift-element instructions, a
-  // vector sdiv costs about as much as a scalar one.
-  const unsigned SDivCostEstimate = 4;
-  bool SDivPow2 = false;
-  bool UDivPow2 = false;
-  if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv) &&
-      Args.size() == 2) {
-    const ConstantInt *CI = nullptr;
+  // There are thre cases of division and remainder: Dividing with a register
+  // needs a divide instruction. A divisor which is a power of two constant
+  // can be implemented with a sequence of shifts. Any other constant needs a
+  // multiply and shifts.
+  const unsigned DivInstrCost = 20;
+  const unsigned DivMulSeqCost = 10;
+  const unsigned SDivPow2Cost = 4;
+
+  bool SignedDivRem =
+      Opcode == Instruction::SDiv || Opcode == Instruction::SRem;
+  bool UnsignedDivRem =
+      Opcode == Instruction::UDiv || Opcode == Instruction::URem;
+
+  // Check for a constant divisor.
+  bool DivRemConst = false;
+  bool DivRemConstPow2 = false;
+  if ((SignedDivRem || UnsignedDivRem) && Args.size() == 2) {
     if (const Constant *C = dyn_cast<Constant>(Args[1])) {
-      if (C->getType()->isVectorTy())
-        CI = dyn_cast_or_null<const ConstantInt>(C->getSplatValue());
+      const ConstantInt *CVal =
+          (C->getType()->isVectorTy()
+               ? dyn_cast_or_null<const ConstantInt>(C->getSplatValue())
+               : dyn_cast<const ConstantInt>(C));
+      if (CVal != nullptr &&
+          (CVal->getValue().isPowerOf2() || (-CVal->getValue()).isPowerOf2()))
+        DivRemConstPow2 = true;
       else
-        CI = dyn_cast<const ConstantInt>(C);
-    }
-    if (CI != nullptr &&
-        (CI->getValue().isPowerOf2() || (-CI->getValue()).isPowerOf2())) {
-      if (Opcode == Instruction::SDiv)
-        SDivPow2 = true;
-      else
-        UDivPow2 = true;
+        DivRemConst = true;
     }
   }
 
@@ -394,18 +400,19 @@ int SystemZTTIImpl::getArithmeticInstrCost(
     // These vector operations are custom handled, but are still supported
     // with one instruction per vector, regardless of element size.
     if (Opcode == Instruction::Shl || Opcode == Instruction::LShr ||
-        Opcode == Instruction::AShr || UDivPow2) {
+        Opcode == Instruction::AShr) {
       return NumVectors;
     }
 
-    if (SDivPow2)
-      return (NumVectors * SDivCostEstimate);
-
-    // Temporary hack: disable high vectorization factors with integer
-    // division/remainder, which will get scalarized and handled with GR128
-    // registers. The mischeduler is not clever enough to avoid spilling yet.
-    if ((Opcode == Instruction::UDiv || Opcode == Instruction::SDiv ||
-         Opcode == Instruction::URem || Opcode == Instruction::SRem) && VF > 4)
+    if (DivRemConstPow2)
+      return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1));
+    if (DivRemConst)
+      return VF * DivMulSeqCost + getScalarizationOverhead(Ty, Args);
+    if ((SignedDivRem || UnsignedDivRem) && VF > 4)
+      // Temporary hack: disable high vectorization factors with integer
+      // division/remainder, which will get scalarized and handled with
+      // GR128 registers. The mischeduler is not clever enough to avoid
+      // spilling yet.
       return 1000;
 
     // These FP operations are supported with a single vector instruction for
@@ -471,19 +478,16 @@ int SystemZTTIImpl::getArithmeticInstrCost(
       return 7; // 2 * ipm sequences ; xor ; shift ; compare
     }
 
-    if (UDivPow2)
-      return 1;
-    if (SDivPow2)
-      return SDivCostEstimate;
-
-    // An extra extension for narrow types is needed.
-    if ((Opcode == Instruction::SDiv || Opcode == Instruction::SRem))
+    if (DivRemConstPow2)
+      return (SignedDivRem ? SDivPow2Cost : 1);
+    if (DivRemConst)
+      return DivMulSeqCost;
+    if (SignedDivRem)
       // sext of op(s) for narrow types
-      return (ScalarBits < 32 ? 4 : (ScalarBits == 32 ? 2 : 1));
-
-    if (Opcode == Instruction::UDiv || Opcode == Instruction::URem)
+      return DivInstrCost + (ScalarBits < 32 ? 3 : (ScalarBits == 32 ? 1 : 0));
+    if (UnsignedDivRem)
       // Clearing of low 64 bit reg + sext of op(s) for narrow types + dl[g]r
-      return (ScalarBits < 32 ? 4 : 2);
+      return DivInstrCost + (ScalarBits < 32 ? 3 : 1);
   }
 
   // Fallback to the default implementation.
diff --git a/test/Analysis/CostModel/SystemZ/div-pow2.ll b/test/Analysis/CostModel/SystemZ/div-pow2.ll
deleted file mode 100644
index 9ef2dd71e8f..00000000000
--- a/test/Analysis/CostModel/SystemZ/div-pow2.ll
+++ /dev/null
@@ -1,154 +0,0 @@
-; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
-
-; Scalar sdiv
-
-define i64 @fun0(i64 %a) {
-  %r = sdiv i64 %a, 2
-  ret i64 %r
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv i64 %a, 2
-}
-
-define i64 @fun1(i64 %a) {
-  %r = sdiv i64 %a, -4
-  ret i64 %r
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv i64 %a, -4
-}
-
-define i32 @fun2(i32 %a) {
-  %r = sdiv i32 %a, 8
-  ret i32 %r
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv i32 %a, 8
-}
-
-define i32 @fun3(i32 %a) {
-  %r = sdiv i32 %a, -16
-  ret i32 %r
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv i32 %a, -16
-}
-
-define i16 @fun4(i16 %a) {
-  %r = sdiv i16 %a, 32
-  ret i16 %r
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv i16 %a, 32
-}
-
-define i16 @fun5(i16 %a) {
-  %r = sdiv i16 %a, -64
-  ret i16 %r
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv i16 %a, -64
-}
-
-define i8 @fun6(i8 %a) {
-  %r = sdiv i8 %a, 64
-  ret i8 %r
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv i8 %a, 64
-}
-
-define i8 @fun7(i8 %a) {
-  %r = sdiv i8 %a, -128
-  ret i8 %r
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv i8 %a, -128
-}
-
-
-; Vector sdiv
-
-define <2 x i64> @fun8(<2 x i64> %a) {
-  %r = sdiv <2 x i64> %a, <i64 2, i64 2>
-  ret <2 x i64> %r
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <2 x i64> %a, <i64 2, i64 2>
-}
-
-define <2 x i64> @fun9(<2 x i64> %a) {
-  %r = sdiv <2 x i64> %a, <i64 -4, i64 -4>
-  ret <2 x i64> %r
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <2 x i64> %a, <i64 -4, i64 -4>
-}
-
-define <4 x i32> @fun10(<4 x i32> %a) {
-  %r = sdiv <4 x i32> %a, <i32 8, i32 8, i32 8, i32 8>
-  ret <4 x i32> %r
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <4 x i32> %a, <i32 8, i32 8, i32 8, i32 8>
-}
-
-define <4 x i32> @fun11(<4 x i32> %a) {
-  %r = sdiv <4 x i32> %a, <i32 -16, i32 -16, i32 -16, i32 -16>
-  ret <4 x i32> %r
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <4 x i32> %a, <i32 -16
-}
-
-define <8 x i16> @fun12(<8 x i16> %a) {
-  %r = sdiv <8 x i16> %a, <i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32>
-  ret <8 x i16> %r
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <8 x i16> %a, <i16 32
-}
-
-define <8 x i16> @fun13(<8 x i16> %a) {
-  %r = sdiv <8 x i16> %a, <i16 -64, i16 -64, i16 -64, i16 -64, i16 -64, i16 -64, i16 -64, i16 -64>
-  ret <8 x i16> %r
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <8 x i16> %a, <i16 -64
-}
-
-define <16 x i8> @fun14(<16 x i8> %a) {
-  %r = sdiv <16 x i8> %a, <i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64>
-  ret <16 x i8> %r
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <16 x i8> %a, <i8 64
-}
-
-define <16 x i8> @fun15(<16 x i8> %a) {
-  %r = sdiv <16 x i8> %a, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
-  ret <16 x i8> %r
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <16 x i8> %a, <i8 -128
-}
-
-; Scalar udiv
-
-define i64 @fun16(i64 %a) {
-  %r = udiv i64 %a, 2
-  ret i64 %r
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv i64 %a, 2
-}
-
-define i32 @fun17(i32 %a) {
-  %r = udiv i32 %a, 8
-  ret i32 %r
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv i32 %a, 8
-}
-
-define i16 @fun18(i16 %a) {
-  %r = udiv i16 %a, 32
-  ret i16 %r
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv i16 %a, 32
-}
-
-define i8 @fun19(i8 %a) {
-  %r = udiv i8 %a, 128
-  ret i8 %r
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv i8 %a, -128
-}
-
-; Vector udiv
-
-define <2 x i64> @fun20(<2 x i64> %a) {
-  %r = udiv <2 x i64> %a, <i64 2, i64 2>
-  ret <2 x i64> %r
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv <2 x i64> %a, <i64 2
-}
-
-define <4 x i32> @fun21(<4 x i32> %a) {
-  %r = udiv <4 x i32> %a, <i32 8, i32 8, i32 8, i32 8>
-  ret <4 x i32> %r
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv <4 x i32> %a, <i32 8
-}
-
-define <8 x i16> @fun22(<8 x i16> %a) {
-  %r = udiv <8 x i16> %a, <i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32>
-  ret <8 x i16> %r
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv <8 x i16> %a, <i16 32
-}
-
-define <16 x i8> @fun23(<16 x i8> %a) {
-  %r = udiv <16 x i8> %a, <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>
-  ret <16 x i8> %r
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv <16 x i8> %a, <i8 -128
-}
diff --git a/test/Analysis/CostModel/SystemZ/divrem-const.ll b/test/Analysis/CostModel/SystemZ/divrem-const.ll
new file mode 100644
index 00000000000..0889d0f96af
--- /dev/null
+++ b/test/Analysis/CostModel/SystemZ/divrem-const.ll
@@ -0,0 +1,291 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 \
+; RUN:  | FileCheck %s -check-prefix=COST
+
+; Check that all divide/remainder instructions are implemented by cheaper instructions.
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 -o - | FileCheck %s
+; CHECK-NOT: dsg
+; CHECK-NOT: dl
+
+; Check costs of divisions/remainders by a vector of constants that is *not*
+; a power of two. A sequence containing a multiply and shifts will replace
+; the divide instruction.
+
+; Scalar sdiv
+
+define i64 @fun0(i64 %a) {
+  %r = sdiv i64 %a, 20
+  ret i64 %r
+; COST: Cost Model: Found an estimated cost of 10 for instruction:   %r = sdiv i64 %a, 20
+}
+
+define i32 @fun1(i32 %a) {
+  %r = sdiv i32 %a, 20
+  ret i32 %r
+; COST: Cost Model: Found an estimated cost of 10 for instruction:   %r = sdiv i32 %a, 20
+}
+
+define i16 @fun2(i16 %a) {
+  %r = sdiv i16 %a, 20
+  ret i16 %r
+; COST: Cost Model: Found an estimated cost of 10 for instruction:   %r = sdiv i16 %a, 20
+}
+
+define i8 @fun3(i8 %a) {
+  %r = sdiv i8 %a, 20
+  ret i8 %r
+; COST: Cost Model: Found an estimated cost of 10 for instruction:   %r = sdiv i8 %a, 20
+}
+
+; Vector sdiv
+
+define <2 x i64> @fun4(<2 x i64> %a) {
+  %r = sdiv <2 x i64> %a, <i64 20, i64 21>
+  ret <2 x i64> %r
+; COST: Cost Model: Found an estimated cost of 24 for instruction:   %r = sdiv <2 x i64>
+}
+
+define <4 x i32> @fun5(<4 x i32> %a) {
+  %r = sdiv <4 x i32> %a, <i32 20, i32 20, i32 20, i32 20>
+  ret <4 x i32> %r
+; COST: Cost Model: Found an estimated cost of 49 for instruction:   %r = sdiv <4 x i32>
+}
+
+define <2 x i32> @fun6(<2 x i32> %a) {
+  %r = sdiv <2 x i32> %a, <i32 20, i32 21>
+  ret <2 x i32> %r
+; COST: Cost Model: Found an estimated cost of 25 for instruction:   %r = sdiv <2 x i32>
+}
+
+define <8 x i16> @fun7(<8 x i16> %a) {
+  %r = sdiv <8 x i16> %a, <i16 20, i16 20, i16 20, i16 20, i16 20, i16 20, i16 20, i16 20>
+  ret <8 x i16> %r
+; COST: Cost Model: Found an estimated cost of 97 for instruction:   %r = sdiv <8 x i16>
+}
+
+define <4 x i16> @fun8(<4 x i16> %a) {
+  %r = sdiv <4 x i16> %a, <i16 20, i16 20, i16 20, i16 21>
+  ret <4 x i16> %r
+; COST: Cost Model: Found an estimated cost of 49 for instruction:   %r = sdiv <4 x i16>
+}
+
+define <16 x i8> @fun9(<16 x i8> %a) {
+  %r = sdiv <16 x i8> %a, <i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20>
+  ret <16 x i8> %r
+; COST: Cost Model: Found an estimated cost of 193 for instruction:   %r = sdiv <16 x i8>
+}
+
+define <8 x i8> @fun10(<8 x i8> %a) {
+  %r = sdiv <8 x i8> %a, <i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 21>
+  ret <8 x i8> %r
+; COST: Cost Model: Found an estimated cost of 97 for instruction:   %r = sdiv <8 x i8>
+}
+
+; Scalar udiv
+
+define i64 @fun11(i64 %a) {
+  %r = udiv i64 %a, 20
+  ret i64 %r
+; COST: Cost Model: Found an estimated cost of 10 for instruction:   %r = udiv i64 %a, 20
+}
+
+define i32 @fun12(i32 %a) {
+  %r = udiv i32 %a, 20
+  ret i32 %r
+; COST: Cost Model: Found an estimated cost of 10 for instruction:   %r = udiv i32 %a, 20
+}
+
+define i16 @fun13(i16 %a) {
+  %r = udiv i16 %a, 20
+  ret i16 %r
+; COST: Cost Model: Found an estimated cost of 10 for instruction:   %r = udiv i16 %a, 20
+}
+
+define i8 @fun14(i8 %a) {
+  %r = udiv i8 %a, 20
+  ret i8 %r
+; COST: Cost Model: Found an estimated cost of 10 for instruction:   %r = udiv i8
+}
+
+; Vector udiv
+
+define <2 x i64> @fun15(<2 x i64> %a) {
+  %r = udiv <2 x i64> %a, <i64 20, i64 20>
+  ret <2 x i64> %r
+; COST: Cost Model: Found an estimated cost of 24 for instruction:   %r = udiv <2 x i64>
+}
+
+define <4 x i32> @fun16(<4 x i32> %a) {
+  %r = udiv <4 x i32> %a, <i32 20, i32 20, i32 20, i32 21>
+  ret <4 x i32> %r
+; COST: Cost Model: Found an estimated cost of 49 for instruction:   %r = udiv <4 x i32>
+}
+
+define <2 x i32> @fun17(<2 x i32> %a) {
+  %r = udiv <2 x i32> %a, <i32 20, i32 20>
+  ret <2 x i32> %r
+; COST: Cost Model: Found an estimated cost of 25 for instruction:   %r = udiv <2 x i32>
+}
+
+define <8 x i16> @fun18(<8 x i16> %a) {
+  %r = udiv <8 x i16> %a, <i16 20, i16 20, i16 20, i16 20, i16 20, i16 20, i16 20, i16 21>
+  ret <8 x i16> %r
+; COST: Cost Model: Found an estimated cost of 97 for instruction:   %r = udiv <8 x i16>
+}
+
+define <4 x i16> @fun19(<4 x i16> %a) {
+  %r = udiv <4 x i16> %a, <i16 20, i16 20, i16 20, i16 20>
+  ret <4 x i16> %r
+; COST: Cost Model: Found an estimated cost of 49 for instruction:   %r = udiv <4 x i16>
+}
+
+define <16 x i8> @fun20(<16 x i8> %a) {
+  %r = udiv <16 x i8> %a, <i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 21>
+  ret <16 x i8> %r
+; COST: Cost Model: Found an estimated cost of 193 for instruction:   %r = udiv <16 x i8>
+}
+
+define <8 x i8> @fun21(<8 x i8> %a) {
+  %r = udiv <8 x i8> %a, <i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20>
+  ret <8 x i8> %r
+; COST: Cost Model: Found an estimated cost of 97 for instruction:   %r = udiv <8 x i8>
+}
+
+; Scalar srem
+
+define i64 @fun22(i64 %a) {
+  %r = srem i64 %a, 20
+  ret i64 %r
+; COST: Cost Model: Found an estimated cost of 10 for instruction:   %r = srem i64
+}
+
+define i32 @fun23(i32 %a) {
+  %r = srem i32 %a, 20
+  ret i32 %r
+; COST: Cost Model: Found an estimated cost of 10 for instruction:   %r = srem i32
+}
+
+define i16 @fun24(i16 %a) {
+  %r = srem i16 %a, 20
+  ret i16 %r
+; COST: Cost Model: Found an estimated cost of 10 for instruction:   %r = srem i16
+}
+
+define i8 @fun25(i8 %a) {
+  %r = srem i8 %a, 20
+  ret i8 %r
+; COST: Cost Model: Found an estimated cost of 10 for instruction:   %r = srem i8
+}
+
+; Vector srem
+
+define <2 x i64> @fun26(<2 x i64> %a) {
+  %r = srem <2 x i64> %a, <i64 20, i64 21>
+  ret <2 x i64> %r
+; COST: Cost Model: Found an estimated cost of 24 for instruction:   %r = srem <2 x i64>
+}
+
+define <4 x i32> @fun27(<4 x i32> %a) {
+  %r = srem <4 x i32> %a, <i32 20, i32 20, i32 20, i32 20>
+  ret <4 x i32> %r
+; COST: Cost Model: Found an estimated cost of 49 for instruction:   %r = srem <4 x i32>
+}
+
+define <2 x i32> @fun28(<2 x i32> %a) {
+  %r = srem <2 x i32> %a, <i32 20, i32 21>
+  ret <2 x i32> %r
+; COST: Cost Model: Found an estimated cost of 25 for instruction:   %r = srem <2 x i32>
+}
+
+define <8 x i16> @fun29(<8 x i16> %a) {
+  %r = srem <8 x i16> %a, <i16 20, i16 20, i16 20, i16 20, i16 20, i16 20, i16 20, i16 20>
+  ret <8 x i16> %r
+; COST: Cost Model: Found an estimated cost of 97 for instruction:   %r = srem <8 x i16>
+}
+
+define <4 x i16> @fun30(<4 x i16> %a) {
+  %r = srem <4 x i16> %a, <i16 20, i16 20, i16 20, i16 21>
+  ret <4 x i16> %r
+; COST: Cost Model: Found an estimated cost of 49 for instruction:   %r = srem <4 x i16>
+}
+
+define <16 x i8> @fun31(<16 x i8> %a) {
+  %r = srem <16 x i8> %a, <i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20>
+  ret <16 x i8> %r
+; COST: Cost Model: Found an estimated cost of 193 for instruction:   %r = srem <16 x i8>
+}
+
+define <8 x i8> @fun32(<8 x i8> %a) {
+  %r = srem <8 x i8> %a, <i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 21>
+  ret <8 x i8> %r
+; COST: Cost Model: Found an estimated cost of 97 for instruction:   %r = srem <8 x i8>
+}
+
+; Scalar urem
+
+define i64 @fun33(i64 %a) {
+  %r = urem i64 %a, 20
+  ret i64 %r
+; COST: Cost Model: Found an estimated cost of 10 for instruction:   %r = urem i64
+}
+
+define i32 @fun34(i32 %a) {
+  %r = urem i32 %a, 20
+  ret i32 %r
+; COST: Cost Model: Found an estimated cost of 10 for instruction:   %r = urem i32
+}
+
+define i16 @fun35(i16 %a) {
+  %r = urem i16 %a, 20
+  ret i16 %r
+; COST: Cost Model: Found an estimated cost of 10 for instruction:   %r = urem i16
+}
+
+define i8 @fun36(i8 %a) {
+  %r = urem i8 %a, 20
+  ret i8 %r
+; COST: Cost Model: Found an estimated cost of 10 for instruction:   %r = urem i8
+}
+
+; Vector urem
+
+define <2 x i64> @fun37(<2 x i64> %a) {
+  %r = urem <2 x i64> %a, <i64 20, i64 20>
+  ret <2 x i64> %r
+; COST: Cost Model: Found an estimated cost of 24 for instruction:   %r = urem <2 x i64>
+}
+
+define <4 x i32> @fun38(<4 x i32> %a) {
+  %r = urem <4 x i32> %a, <i32 20, i32 20, i32 20, i32 21>
+  ret <4 x i32> %r
+; COST: Cost Model: Found an estimated cost of 49 for instruction:   %r = urem <4 x i32>
+}
+
+define <2 x i32> @fun39(<2 x i32> %a) {
+  %r = urem <2 x i32> %a, <i32 20, i32 20>
+  ret <2 x i32> %r
+; COST: Cost Model: Found an estimated cost of 25 for instruction:   %r = urem <2 x i32>
+}
+
+define <8 x i16> @fun40(<8 x i16> %a) {
+  %r = urem <8 x i16> %a, <i16 20, i16 20, i16 20, i16 20, i16 20, i16 20, i16 20, i16 21>
+  ret <8 x i16> %r
+; COST: Cost Model: Found an estimated cost of 97 for instruction:   %r = urem <8 x i16>
+}
+
+define <4 x i16> @fun41(<4 x i16> %a) {
+  %r = urem <4 x i16> %a, <i16 20, i16 20, i16 20, i16 20>
+  ret <4 x i16> %r
+; COST: Cost Model: Found an estimated cost of 49 for instruction:   %r = urem <4 x i16>
+}
+
+define <16 x i8> @fun42(<16 x i8> %a) {
+  %r = urem <16 x i8> %a, <i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 21>
+  ret <16 x i8> %r
+; COST: Cost Model: Found an estimated cost of 193 for instruction:   %r = urem <16 x i8>
+}
+
+define <8 x i8> @fun43(<8 x i8> %a) {
+  %r = urem <8 x i8> %a, <i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20>
+  ret <8 x i8> %r
+; COST: Cost Model: Found an estimated cost of 97 for instruction:   %r = urem <8 x i8>
+}
diff --git a/test/Analysis/CostModel/SystemZ/divrem-pow2.ll b/test/Analysis/CostModel/SystemZ/divrem-pow2.ll
new file mode 100644
index 00000000000..ad67ef9405f
--- /dev/null
+++ b/test/Analysis/CostModel/SystemZ/divrem-pow2.ll
@@ -0,0 +1,383 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 \
+; RUN:  | FileCheck %s -check-prefix=COST
+
+; Check that all divide/remainder instructions are implemented by cheaper instructions.
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 -o - | FileCheck %s
+; CHECK-NOT: dsg
+; CHECK-NOT: dl
+
+; Scalar sdiv
+
+define i64 @fun0(i64 %a) {
+  %r = sdiv i64 %a, 2
+  ret i64 %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv i64 %a, 2
+}
+
+define i64 @fun1(i64 %a) {
+  %r = sdiv i64 %a, -4
+  ret i64 %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv i64 %a, -4
+}
+
+define i32 @fun2(i32 %a) {
+  %r = sdiv i32 %a, 8
+  ret i32 %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv i32 %a, 8
+}
+
+define i32 @fun3(i32 %a) {
+  %r = sdiv i32 %a, -16
+  ret i32 %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv i32 %a, -16
+}
+
+define i16 @fun4(i16 %a) {
+  %r = sdiv i16 %a, 32
+  ret i16 %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv i16 %a, 32
+}
+
+define i16 @fun5(i16 %a) {
+  %r = sdiv i16 %a, -64
+  ret i16 %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv i16 %a, -64
+}
+
+define i8 @fun6(i8 %a) {
+  %r = sdiv i8 %a, 64
+  ret i8 %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv i8 %a, 64
+}
+
+define i8 @fun7(i8 %a) {
+  %r = sdiv i8 %a, -128
+  ret i8 %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv i8 %a, -128
+}
+
+; Vector sdiv
+
+define <2 x i64> @fun8(<2 x i64> %a) {
+  %r = sdiv <2 x i64> %a, <i64 2, i64 2>
+  ret <2 x i64> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <2 x i64> %a, <i64 2, i64 2>
+}
+
+define <2 x i64> @fun9(<2 x i64> %a) {
+  %r = sdiv <2 x i64> %a, <i64 -4, i64 -4>
+  ret <2 x i64> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <2 x i64> %a, <i64 -4, i64 -4>
+}
+
+define <4 x i32> @fun10(<4 x i32> %a) {
+  %r = sdiv <4 x i32> %a, <i32 8, i32 8, i32 8, i32 8>
+  ret <4 x i32> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <4 x i32> %a, <i32 8, i32 8, i32 8, i32 8>
+}
+
+define <4 x i32> @fun11(<4 x i32> %a) {
+  %r = sdiv <4 x i32> %a, <i32 -16, i32 -16, i32 -16, i32 -16>
+  ret <4 x i32> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <4 x i32> %a, <i32 -16
+}
+
+define <2 x i32> @fun12(<2 x i32> %a) {
+  %r = sdiv <2 x i32> %a, <i32 -16, i32 -16>
+  ret <2 x i32> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <2 x i32> %a, <i32 -16
+}
+
+define <8 x i16> @fun13(<8 x i16> %a) {
+  %r = sdiv <8 x i16> %a, <i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32>
+  ret <8 x i16> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <8 x i16> %a, <i16 32
+}
+
+define <8 x i16> @fun14(<8 x i16> %a) {
+  %r = sdiv <8 x i16> %a, <i16 -64, i16 -64, i16 -64, i16 -64, i16 -64, i16 -64, i16 -64, i16 -64>
+  ret <8 x i16> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <8 x i16> %a, <i16 -64
+}
+
+define <4 x i16> @fun15(<4 x i16> %a) {
+  %r = sdiv <4 x i16> %a, <i16 32, i16 32, i16 32, i16 32>
+  ret <4 x i16> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <4 x i16> %a, <i16 32
+}
+
+define <16 x i8> @fun16(<16 x i8> %a) {
+  %r = sdiv <16 x i8> %a, <i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64>
+  ret <16 x i8> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <16 x i8> %a, <i8 64
+}
+
+define <16 x i8> @fun17(<16 x i8> %a) {
+  %r = sdiv <16 x i8> %a, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
+  ret <16 x i8> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <16 x i8> %a, <i8 -128
+}
+
+define <8 x i8> @fun18(<8 x i8> %a) {
+  %r = sdiv <8 x i8> %a, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
+  ret <8 x i8> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <8 x i8> %a, <i8 -128
+}
+
+; Scalar udiv
+
+define i64 @fun19(i64 %a) {
+  %r = udiv i64 %a, 2
+  ret i64 %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv i64 %a, 2
+}
+
+define i32 @fun20(i32 %a) {
+  %r = udiv i32 %a, 8
+  ret i32 %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv i32 %a, 8
+}
+
+define i16 @fun21(i16 %a) {
+  %r = udiv i16 %a, 32
+  ret i16 %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv i16 %a, 32
+}
+
+define i8 @fun22(i8 %a) {
+  %r = udiv i8 %a, 128
+  ret i8 %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv i8 %a, -128
+}
+
+; Vector udiv
+
+define <2 x i64> @fun23(<2 x i64> %a) {
+  %r = udiv <2 x i64> %a, <i64 2, i64 2>
+  ret <2 x i64> %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv <2 x i64> %a, <i64 2
+}
+
+define <4 x i32> @fun24(<4 x i32> %a) {
+  %r = udiv <4 x i32> %a, <i32 8, i32 8, i32 8, i32 8>
+  ret <4 x i32> %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv <4 x i32> %a, <i32 8
+}
+
+define <2 x i32> @fun25(<2 x i32> %a) {
+  %r = udiv <2 x i32> %a, <i32 8, i32 8>
+  ret <2 x i32> %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv <2 x i32> %a, <i32 8
+}
+
+define <8 x i16> @fun26(<8 x i16> %a) {
+  %r = udiv <8 x i16> %a, <i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32>
+  ret <8 x i16> %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv <8 x i16> %a, <i16 32
+}
+
+define <4 x i16> @fun27(<4 x i16> %a) {
+  %r = udiv <4 x i16> %a, <i16 32, i16 32, i16 32, i16 32>
+  ret <4 x i16> %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv <4 x i16> %a, <i16 32
+}
+
+define <16 x i8> @fun28(<16 x i8> %a) {
+  %r = udiv <16 x i8> %a, <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>
+  ret <16 x i8> %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv <16 x i8> %a, <i8 -128
+}
+
+define <8 x i8> @fun29(<8 x i8> %a) {
+  %r = udiv <8 x i8> %a, <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>
+  ret <8 x i8> %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv <8 x i8> %a, <i8 -128
+}
+
+; Scalar srem
+
+define i64 @fun30(i64 %a) {
+  %r = srem i64 %a, 2
+  ret i64 %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem i64 %a, 2
+}
+
+define i64 @fun31(i64 %a) {
+  %r = srem i64 %a, -4
+  ret i64 %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem i64 %a, -4
+}
+
+define i32 @fun32(i32 %a) {
+  %r = srem i32 %a, 8
+  ret i32 %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem i32 %a, 8
+}
+
+define i32 @fun33(i32 %a) {
+  %r = srem i32 %a, -16
+  ret i32 %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem i32 %a, -16
+}
+
+define i16 @fun34(i16 %a) {
+  %r = srem i16 %a, 32
+  ret i16 %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem i16 %a, 32
+}
+
+define i16 @fun35(i16 %a) {
+  %r = srem i16 %a, -64
+  ret i16 %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem i16 %a, -64
+}
+
+define i8 @fun36(i8 %a) {
+  %r = srem i8 %a, 64
+  ret i8 %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem i8 %a, 64
+}
+
+define i8 @fun37(i8 %a) {
+  %r = srem i8 %a, -128
+  ret i8 %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem i8 %a, -128
+}
+
+; Vector srem
+
+define <2 x i64> @fun38(<2 x i64> %a) {
+  %r = srem <2 x i64> %a, <i64 2, i64 2>
+  ret <2 x i64> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem <2 x i64> %a, <i64 2, i64 2>
+}
+
+define <2 x i64> @fun39(<2 x i64> %a) {
+  %r = srem <2 x i64> %a, <i64 -4, i64 -4>
+  ret <2 x i64> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem <2 x i64> %a, <i64 -4, i64 -4>
+}
+
+define <4 x i32> @fun40(<4 x i32> %a) {
+  %r = srem <4 x i32> %a, <i32 8, i32 8, i32 8, i32 8>
+  ret <4 x i32> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem <4 x i32> %a, <i32 8, i32 8, i32 8, i32 8>
+}
+
+define <4 x i32> @fun41(<4 x i32> %a) {
+  %r = srem <4 x i32> %a, <i32 -16, i32 -16, i32 -16, i32 -16>
+  ret <4 x i32> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem <4 x i32> %a, <i32 -16
+}
+
+define <2 x i32> @fun42(<2 x i32> %a) {
+  %r = srem <2 x i32> %a, <i32 -16, i32 -16>
+  ret <2 x i32> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem <2 x i32> %a, <i32 -16
+}
+
+define <8 x i16> @fun43(<8 x i16> %a) {
+  %r = srem <8 x i16> %a, <i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32>
+  ret <8 x i16> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem <8 x i16> %a, <i16 32
+}
+
+define <8 x i16> @fun44(<8 x i16> %a) {
+  %r = srem <8 x i16> %a, <i16 -64, i16 -64, i16 -64, i16 -64, i16 -64, i16 -64, i16 -64, i16 -64>
+  ret <8 x i16> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem <8 x i16> %a, <i16 -64
+}
+
+define <4 x i16> @fun45(<4 x i16> %a) {
+  %r = srem <4 x i16> %a, <i16 32, i16 32, i16 32, i16 32>
+  ret <4 x i16> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem <4 x i16> %a, <i16 32
+}
+
+define <16 x i8> @fun46(<16 x i8> %a) {
+  %r = srem <16 x i8> %a, <i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64>
+  ret <16 x i8> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem <16 x i8> %a, <i8 64
+}
+
+define <16 x i8> @fun47(<16 x i8> %a) {
+  %r = srem <16 x i8> %a, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
+  ret <16 x i8> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem <16 x i8> %a, <i8 -128
+}
+
+define <8 x i8> @fun48(<8 x i8> %a) {
+  %r = srem <8 x i8> %a, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
+  ret <8 x i8> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem <8 x i8> %a, <i8 -128
+}
+
+; Scalar urem
+
+define i64 @fun49(i64 %a) {
+  %r = urem i64 %a, 2
+  ret i64 %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = urem i64 %a, 2
+}
+
+define i32 @fun50(i32 %a) {
+  %r = urem i32 %a, 8
+  ret i32 %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = urem i32 %a, 8
+}
+
+define i16 @fun51(i16 %a) {
+  %r = urem i16 %a, 32
+  ret i16 %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = urem i16 %a, 32
+}
+
+define i8 @fun52(i8 %a) {
+  %r = urem i8 %a, 128
+  ret i8 %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = urem i8 %a, -128
+}
+
+; Vector urem
+
+define <2 x i64> @fun53(<2 x i64> %a) {
+  %r = urem <2 x i64> %a, <i64 2, i64 2>
+  ret <2 x i64> %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = urem <2 x i64> %a, <i64 2
+}
+
+define <4 x i32> @fun54(<4 x i32> %a) {
+  %r = urem <4 x i32> %a, <i32 8, i32 8, i32 8, i32 8>
+  ret <4 x i32> %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = urem <4 x i32> %a, <i32 8
+}
+
+define <2 x i32> @fun55(<2 x i32> %a) {
+  %r = urem <2 x i32> %a, <i32 8, i32 8>
+  ret <2 x i32> %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = urem <2 x i32> %a, <i32 8
+}
+
+define <8 x i16> @fun56(<8 x i16> %a) {
+  %r = urem <8 x i16> %a, <i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32>
+  ret <8 x i16> %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = urem <8 x i16> %a, <i16 32
+}
+
+define <4 x i16> @fun57(<4 x i16> %a) {
+  %r = urem <4 x i16> %a, <i16 32, i16 32, i16 32, i16 32>
+  ret <4 x i16> %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = urem <4 x i16> %a, <i16 32
+}
+
+define <16 x i8> @fun58(<16 x i8> %a) {
+  %r = urem <16 x i8> %a, <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>
+  ret <16 x i8> %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = urem <16 x i8> %a, <i8 -128
+}
+
+define <8 x i8> @fun59(<8 x i8> %a) {
+  %r = urem <8 x i8> %a, <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>
+  ret <8 x i8> %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = urem <8 x i8> %a, <i8 -128
+}
diff --git a/test/Analysis/CostModel/SystemZ/divrem-reg.ll b/test/Analysis/CostModel/SystemZ/divrem-reg.ll
new file mode 100644
index 00000000000..0cb1293cf3b
--- /dev/null
+++ b/test/Analysis/CostModel/SystemZ/divrem-reg.ll
@@ -0,0 +1,286 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+
+; Check costs of divisions by register
+;
+; Note: Vectorization of division/remainder is temporarily disabled for high
+; vectorization factors by returning 1000.
+
+; Scalar sdiv
+
+define i64 @fun0(i64 %a, i64 %b) {
+  %r = sdiv i64 %a, %b
+  ret i64 %r
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %r = sdiv i64
+}
+
+define i32 @fun1(i32 %a, i32 %b) {
+  %r = sdiv i32 %a, %b
+  ret i32 %r
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %r = sdiv i32 %a, %b
+}
+
+define i16 @fun2(i16 %a, i16 %b) {
+  %r = sdiv i16 %a, %b
+  ret i16 %r
+; CHECK: Cost Model: Found an estimated cost of 23 for instruction:   %r = sdiv i16 %a, %b
+}
+
+define i8 @fun3(i8 %a, i8 %b) {
+  %r = sdiv i8 %a, %b
+  ret i8 %r
+; CHECK: Cost Model: Found an estimated cost of 23 for instruction:   %r = sdiv i8 %a, %b
+}
+
+; Vector sdiv
+
+define <2 x i64> @fun4(<2 x i64> %a, <2 x i64> %b) {
+  %r = sdiv <2 x i64> %a, %b
+  ret <2 x i64> %r
+; CHECK: Cost Model: Found an estimated cost of 47 for instruction:   %r = sdiv <2 x i64>
+}
+
+define <4 x i32> @fun5(<4 x i32> %a, <4 x i32> %b) {
+  %r = sdiv <4 x i32> %a, %b
+  ret <4 x i32> %r
+; CHECK: Cost Model: Found an estimated cost of 98 for instruction:   %r = sdiv <4 x i32>
+}
+
+define <2 x i32> @fun6(<2 x i32> %a, <2 x i32> %b) {
+  %r = sdiv <2 x i32> %a, %b
+  ret <2 x i32> %r
+; CHECK: Cost Model: Found an estimated cost of 50 for instruction:   %r = sdiv <2 x i32>
+}
+
+define <8 x i16> @fun7(<8 x i16> %a, <8 x i16> %b) {
+  %r = sdiv <8 x i16> %a, %b
+  ret <8 x i16> %r
+; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %r = sdiv <8 x i16>
+}
+
+define <4 x i16> @fun8(<4 x i16> %a, <4 x i16> %b) {
+  %r = sdiv <4 x i16> %a, %b
+  ret <4 x i16> %r
+; CHECK: Cost Model: Found an estimated cost of 106 for instruction:   %r = sdiv <4 x i16>
+}
+
+define <16 x i8> @fun9(<16 x i8> %a, <16 x i8> %b) {
+  %r = sdiv <16 x i8> %a, %b
+  ret <16 x i8> %r
+; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %r = sdiv <16 x i8>
+}
+
+define <8 x i8> @fun10(<8 x i8> %a, <8 x i8> %b) {
+  %r = sdiv <8 x i8> %a, %b
+  ret <8 x i8> %r
+; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %r = sdiv <8 x i8>
+}
+
+; Scalar udiv
+
+define i64 @fun11(i64 %a, i64 %b) {
+  %r = udiv i64 %a, %b
+  ret i64 %r
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %r = udiv i64 %a, %b
+}
+
+define i32 @fun12(i32 %a, i32 %b) {
+  %r = udiv i32 %a, %b
+  ret i32 %r
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %r = udiv i32
+}
+
+define i16 @fun13(i16 %a, i16 %b) {
+  %r = udiv i16 %a, %b
+  ret i16 %r
+; CHECK: Cost Model: Found an estimated cost of 23 for instruction:   %r = udiv i16
+}
+
+define i8 @fun14(i8 %a, i8 %b) {
+  %r = udiv i8 %a, %b
+  ret i8 %r
+; CHECK: Cost Model: Found an estimated cost of 23 for instruction:   %r = udiv i8
+}
+
+; Vector udiv
+
+define <2 x i64> @fun15(<2 x i64> %a, <2 x i64> %b) {
+  %r = udiv <2 x i64> %a, %b
+  ret <2 x i64> %r
+; CHECK: Cost Model: Found an estimated cost of 49 for instruction:   %r = udiv <2 x i64>
+}
+
+define <4 x i32> @fun16(<4 x i32> %a, <4 x i32> %b) {
+  %r = udiv <4 x i32> %a, %b
+  ret <4 x i32> %r
+; CHECK: Cost Model: Found an estimated cost of 98 for instruction:   %r = udiv <4 x i32>
+}
+
+define <2 x i32> @fun17(<2 x i32> %a, <2 x i32> %b) {
+  %r = udiv <2 x i32> %a, %b
+  ret <2 x i32> %r
+; CHECK: Cost Model: Found an estimated cost of 50 for instruction:   %r = udiv <2 x i32>
+}
+
+define <8 x i16> @fun18(<8 x i16> %a, <8 x i16> %b) {
+  %r = udiv <8 x i16> %a, %b
+  ret <8 x i16> %r
+; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %r = udiv <8 x i16>
+}
+
+define <4 x i16> @fun19(<4 x i16> %a, <4 x i16> %b) {
+  %r = udiv <4 x i16> %a, %b
+  ret <4 x i16> %r
+; CHECK: Cost Model: Found an estimated cost of 106 for instruction:   %r = udiv <4 x i16>
+}
+
+define <16 x i8> @fun20(<16 x i8> %a, <16 x i8> %b) {
+  %r = udiv <16 x i8> %a, %b
+  ret <16 x i8> %r
+; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %r = udiv <16 x i8>
+}
+
+define <8 x i8> @fun21(<8 x i8> %a, <8 x i8> %b) {
+  %r = udiv <8 x i8> %a, %b
+  ret <8 x i8> %r
+; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %r = udiv <8 x i8>
+}
+
+; Scalar srem
+
+define i64 @fun22(i64 %a, i64 %b) {
+  %r = srem i64 %a, %b
+  ret i64 %r
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %r = srem i64
+}
+
+define i32 @fun23(i32 %a, i32 %b) {
+  %r = srem i32 %a, %b
+  ret i32 %r
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %r = srem i32
+}
+
+define i16 @fun24(i16 %a, i16 %b) {
+  %r = srem i16 %a, %b
+  ret i16 %r
+; CHECK: Cost Model: Found an estimated cost of 23 for instruction:   %r = srem i16
+}
+
+define i8 @fun25(i8 %a, i8 %b) {
+  %r = srem i8 %a, %b
+  ret i8 %r
+; CHECK: Cost Model: Found an estimated cost of 23 for instruction:   %r = srem i8
+}
+
+; Vector srem
+
+define <2 x i64> @fun26(<2 x i64> %a, <2 x i64> %b) {
+  %r = srem <2 x i64> %a, %b
+  ret <2 x i64> %r
+; CHECK: Cost Model: Found an estimated cost of 47 for instruction:   %r = srem <2 x i64>
+}
+
+define <4 x i32> @fun27(<4 x i32> %a, <4 x i32> %b) {
+  %r = srem <4 x i32> %a, %b
+  ret <4 x i32> %r
+; CHECK: Cost Model: Found an estimated cost of 98 for instruction:   %r = srem <4 x i32>
+}
+
+define <2 x i32> @fun28(<2 x i32> %a, <2 x i32> %b) {
+  %r = srem <2 x i32> %a, %b
+  ret <2 x i32> %r
+; CHECK: Cost Model: Found an estimated cost of 50 for instruction:   %r = srem <2 x i32>
+}
+
+define <8 x i16> @fun29(<8 x i16> %a, <8 x i16> %b) {
+  %r = srem <8 x i16> %a, %b
+  ret <8 x i16> %r
+; CHECK: ost Model: Found an estimated cost of 1000 for instruction:   %r = srem <8 x i16>
+}
+
+define <4 x i16> @fun30(<4 x i16> %a, <4 x i16> %b) {
+  %r = srem <4 x i16> %a, %b
+  ret <4 x i16> %r
+; CHECK: Cost Model: Found an estimated cost of 106 for instruction:   %r = srem <4 x i16>
+}
+
+define <16 x i8> @fun31(<16 x i8> %a, <16 x i8> %b) {
+  %r = srem <16 x i8> %a, %b
+  ret <16 x i8> %r
+; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %r = srem <16 x i8>
+}
+
+define <8 x i8> @fun32(<8 x i8> %a, <8 x i8> %b) {
+  %r = srem <8 x i8> %a, %b
+  ret <8 x i8> %r
+; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %r = srem <8 x i8>
+}
+
+; Scalar urem
+
+define i64 @fun33(i64 %a, i64 %b) {
+  %r = urem i64 %a, %b
+  ret i64 %r
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %r = urem i64
+}
+
+define i32 @fun34(i32 %a, i32 %b) {
+  %r = urem i32 %a, %b
+  ret i32 %r
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %r = urem i32
+}
+
+define i16 @fun35(i16 %a, i16 %b) {
+  %r = urem i16 %a, %b
+  ret i16 %r
+; CHECK: Cost Model: Found an estimated cost of 23 for instruction:   %r = urem i16
+}
+
+define i8 @fun36(i8 %a, i8 %b) {
+  %r = urem i8 %a, %b
+  ret i8 %r
+; CHECK: Cost Model: Found an estimated cost of 23 for instruction:   %r = urem i8
+}
+
+; Vector urem
+
+define <2 x i64> @fun37(<2 x i64> %a, <2 x i64> %b) {
+  %r = urem <2 x i64> %a, %b
+  ret <2 x i64> %r
+; CHECK: Cost Model: Found an estimated cost of 49 for instruction:   %r = urem <2 x i64>
+}
+
+define <4 x i32> @fun38(<4 x i32> %a, <4 x i32> %b) {
+  %r = urem <4 x i32> %a, %b
+  ret <4 x i32> %r
+; CHECK: Cost Model: Found an estimated cost of 98 for instruction:   %r = urem <4 x i32>
+}
+
+define <2 x i32> @fun39(<2 x i32> %a, <2 x i32> %b) {
+  %r = urem <2 x i32> %a, %b
+  ret <2 x i32> %r
+; CHECK: Cost Model: Found an estimated cost of 50 for instruction:   %r = urem <2 x i32>
+}
+
+define <8 x i16> @fun40(<8 x i16> %a, <8 x i16> %b) {
+  %r = urem <8 x i16> %a, %b
+  ret <8 x i16> %r
+; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %r = urem <8 x i16>
+}
+
+define <4 x i16> @fun41(<4 x i16> %a, <4 x i16> %b) {
+  %r = urem <4 x i16> %a, %b
+  ret <4 x i16> %r
+; CHECK: Cost Model: Found an estimated cost of 106 for instruction:   %r = urem <4 x i16>
+}
+
+define <16 x i8> @fun42(<16 x i8> %a, <16 x i8> %b) {
+  %r = urem <16 x i8> %a, %b
+  ret <16 x i8> %r
+; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %r = urem <16 x i8>
+}
+
+define <8 x i8> @fun43(<8 x i8> %a, <8 x i8> %b) {
+  %r = urem <8 x i8> %a, %b
+  ret <8 x i8> %r
+; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %r = urem <8 x i8>
+}
diff --git a/test/Analysis/CostModel/SystemZ/int-arith.ll b/test/Analysis/CostModel/SystemZ/int-arith.ll
index 3ecf4342b94..f9a55dfe742 100644
--- a/test/Analysis/CostModel/SystemZ/int-arith.ll
+++ b/test/Analysis/CostModel/SystemZ/int-arith.ll
@@ -2,9 +2,6 @@
 ;
 ; Note: The scalarized vector instructions costs are not including any
 ; extracts, due to the undef operands.
-;
-; Note: Vectorization of division/remainder is temporarily disabled for high
-; vectorization factors by returning 1000.
 
 define void @add() {
   %res0 = add i8 undef, undef
@@ -143,187 +140,3 @@ define void @mul() {
 
   ret void;
 }
-
-define void @sdiv() {
-  %res0 = sdiv i8 undef, undef
-  %res1 = sdiv i16 undef, undef
-  %res2 = sdiv i32 undef, undef
-  %res3 = sdiv i64 undef, undef
-  %res4 = sdiv <2 x i8> undef, undef
-  %res5 = sdiv <2 x i16> undef, undef
-  %res6 = sdiv <2 x i32> undef, undef
-  %res7 = sdiv <2 x i64> undef, undef
-  %res8 = sdiv <4 x i8> undef, undef
-  %res9 = sdiv <4 x i16> undef, undef
-  %res10 = sdiv <4 x i32> undef, undef
-  %res11 = sdiv <4 x i64> undef, undef
-  %res12 = sdiv <8 x i8> undef, undef
-  %res13 = sdiv <8 x i16> undef, undef
-  %res14 = sdiv <8 x i32> undef, undef
-  %res15 = sdiv <8 x i64> undef, undef
-  %res16 = sdiv <16 x i8> undef, undef
-  %res17 = sdiv <16 x i16> undef, undef
-  %res18 = sdiv <16 x i32> undef, undef
-  %res19 = sdiv <16 x i64> undef, undef
-
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res0 = sdiv i8 undef, undef
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res1 = sdiv i16 undef, undef
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res2 = sdiv i32 undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res3 = sdiv i64 undef, undef
-; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %res4 = sdiv <2 x i8> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %res5 = sdiv <2 x i16> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %res6 = sdiv <2 x i32> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 3 for instruction:   %res7 = sdiv <2 x i64> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %res8 = sdiv <4 x i8> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %res9 = sdiv <4 x i16> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %res10 = sdiv <4 x i32> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %res11 = sdiv <4 x i64> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res12 = sdiv <8 x i8> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res13 = sdiv <8 x i16> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res14 = sdiv <8 x i32> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res15 = sdiv <8 x i64> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res16 = sdiv <16 x i8> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res17 = sdiv <16 x i16> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res18 = sdiv <16 x i32> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res19 = sdiv <16 x i64> undef, undef
-
-  ret void;
-}
-
-define void @srem() {
-  %res0 = srem i8 undef, undef
-  %res1 = srem i16 undef, undef
-  %res2 = srem i32 undef, undef
-  %res3 = srem i64 undef, undef
-  %res4 = srem <2 x i8> undef, undef
-  %res5 = srem <2 x i16> undef, undef
-  %res6 = srem <2 x i32> undef, undef
-  %res7 = srem <2 x i64> undef, undef
-  %res8 = srem <4 x i8> undef, undef
-  %res9 = srem <4 x i16> undef, undef
-  %res10 = srem <4 x i32> undef, undef
-  %res11 = srem <4 x i64> undef, undef
-  %res12 = srem <8 x i8> undef, undef
-  %res13 = srem <8 x i16> undef, undef
-  %res14 = srem <8 x i32> undef, undef
-  %res15 = srem <8 x i64> undef, undef
-  %res16 = srem <16 x i8> undef, undef
-  %res17 = srem <16 x i16> undef, undef
-  %res18 = srem <16 x i32> undef, undef
-  %res19 = srem <16 x i64> undef, undef
-
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res0 = srem i8 undef, undef
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res1 = srem i16 undef, undef
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res2 = srem i32 undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res3 = srem i64 undef, undef
-; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %res4 = srem <2 x i8> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %res5 = srem <2 x i16> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %res6 = srem <2 x i32> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 3 for instruction:   %res7 = srem <2 x i64> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %res8 = srem <4 x i8> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %res9 = srem <4 x i16> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %res10 = srem <4 x i32> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %res11 = srem <4 x i64> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res12 = srem <8 x i8> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res13 = srem <8 x i16> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res14 = srem <8 x i32> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res15 = srem <8 x i64> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res16 = srem <16 x i8> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res17 = srem <16 x i16> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res18 = srem <16 x i32> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res19 = srem <16 x i64> undef, undef
-
-  ret void;
-}
-
-define void @udiv() {
-  %res0 = udiv i8 undef, undef
-  %res1 = udiv i16 undef, undef
-  %res2 = udiv i32 undef, undef
-  %res3 = udiv i64 undef, undef
-  %res4 = udiv <2 x i8> undef, undef
-  %res5 = udiv <2 x i16> undef, undef
-  %res6 = udiv <2 x i32> undef, undef
-  %res7 = udiv <2 x i64> undef, undef
-  %res8 = udiv <4 x i8> undef, undef
-  %res9 = udiv <4 x i16> undef, undef
-  %res10 = udiv <4 x i32> undef, undef
-  %res11 = udiv <4 x i64> undef, undef
-  %res12 = udiv <8 x i8> undef, undef
-  %res13 = udiv <8 x i16> undef, undef
-  %res14 = udiv <8 x i32> undef, undef
-  %res15 = udiv <8 x i64> undef, undef
-  %res16 = udiv <16 x i8> undef, undef
-  %res17 = udiv <16 x i16> undef, undef
-  %res18 = udiv <16 x i32> undef, undef
-  %res19 = udiv <16 x i64> undef, undef
-
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res0 = udiv i8 undef, undef
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res1 = udiv i16 undef, undef
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res2 = udiv i32 undef, undef
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res3 = udiv i64 undef, undef
-; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %res4 = udiv <2 x i8> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %res5 = udiv <2 x i16> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %res6 = udiv <2 x i32> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 5 for instruction:   %res7 = udiv <2 x i64> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %res8 = udiv <4 x i8> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %res9 = udiv <4 x i16> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %res10 = udiv <4 x i32> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %res11 = udiv <4 x i64> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res12 = udiv <8 x i8> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res13 = udiv <8 x i16> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res14 = udiv <8 x i32> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res15 = udiv <8 x i64> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res16 = udiv <16 x i8> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res17 = udiv <16 x i16> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res18 = udiv <16 x i32> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res19 = udiv <16 x i64> undef, undef
-
-  ret void;
-}
-
-define void @urem() {
-  %res0 = urem i8 undef, undef
-  %res1 = urem i16 undef, undef
-  %res2 = urem i32 undef, undef
-  %res3 = urem i64 undef, undef
-  %res4 = urem <2 x i8> undef, undef
-  %res5 = urem <2 x i16> undef, undef
-  %res6 = urem <2 x i32> undef, undef
-  %res7 = urem <2 x i64> undef, undef
-  %res8 = urem <4 x i8> undef, undef
-  %res9 = urem <4 x i16> undef, undef
-  %res10 = urem <4 x i32> undef, undef
-  %res11 = urem <4 x i64> undef, undef
-  %res12 = urem <8 x i8> undef, undef
-  %res13 = urem <8 x i16> undef, undef
-  %res14 = urem <8 x i32> undef, undef
-  %res15 = urem <8 x i64> undef, undef
-  %res16 = urem <16 x i8> undef, undef
-  %res17 = urem <16 x i16> undef, undef
-  %res18 = urem <16 x i32> undef, undef
-  %res19 = urem <16 x i64> undef, undef
-
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res0 = urem i8 undef, undef
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res1 = urem i16 undef, undef
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res2 = urem i32 undef, undef
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res3 = urem i64 undef, undef
-; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %res4 = urem <2 x i8> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %res5 = urem <2 x i16> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %res6 = urem <2 x i32> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 5 for instruction:   %res7 = urem <2 x i64> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %res8 = urem <4 x i8> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %res9 = urem <4 x i16> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %res10 = urem <4 x i32> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %res11 = urem <4 x i64> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res12 = urem <8 x i8> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res13 = urem <8 x i16> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res14 = urem <8 x i32> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res15 = urem <8 x i64> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res16 = urem <16 x i8> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res17 = urem <16 x i16> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res18 = urem <16 x i32> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res19 = urem <16 x i64> undef, undef
-
-  ret void;
-}
diff --git a/test/Analysis/CostModel/SystemZ/memop-folding-int-arith.ll b/test/Analysis/CostModel/SystemZ/memop-folding-int-arith.ll
index 1b6a50d303f..1ca16bab538 100644
--- a/test/Analysis/CostModel/SystemZ/memop-folding-int-arith.ll
+++ b/test/Analysis/CostModel/SystemZ/memop-folding-int-arith.ll
@@ -90,16 +90,16 @@ define void @mul() {
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = mul i64 %li64_0, %li64_1
 }
 
-define void @sdiv() {
+define void @sdiv(i32 %arg32, i64 %arg64) {
   %li32 = load i32, i32* undef
-  sdiv i32 %li32, undef
+  sdiv i32 %li32, %arg32
 
   %li32_0 = load i32, i32* undef
   %li32_1 = load i32, i32* undef
   sdiv i32 %li32_0, %li32_1
 
   %li64 = load i64, i64* undef
-  sdiv i64 %li64, undef
+  sdiv i64 %li64, %arg64
 
   %li64_0 = load i64, i64* undef
   %li64_1 = load i64, i64* undef
@@ -107,27 +107,27 @@ define void @sdiv() {
 
   ret void;
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %1 = sdiv i32 %li32, undef
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:  %1 = sdiv i32 %li32, %arg32
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_0 = load i32, i32* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_1 = load i32, i32* undef
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %2 = sdiv i32 %li32_0, %li32_1
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:  %2 = sdiv i32 %li32_0, %li32_1
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64 = load i64, i64* undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %3 = sdiv i64 %li64, undef
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:  %3 = sdiv i64 %li64, %arg64
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_0 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_1 = load i64, i64* undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = sdiv i64 %li64_0, %li64_1
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:  %4 = sdiv i64 %li64_0, %li64_1
 }
 
-define void @udiv() {
+define void @udiv(i32 %arg32, i64 %arg64) {
   %li32 = load i32, i32* undef
-  udiv i32 %li32, undef
+  udiv i32 %li32, %arg32
 
   %li32_0 = load i32, i32* undef
   %li32_1 = load i32, i32* undef
   udiv i32 %li32_0, %li32_1
 
   %li64 = load i64, i64* undef
-  udiv i64 %li64, undef
+  udiv i64 %li64, %arg64
 
   %li64_0 = load i64, i64* undef
   %li64_1 = load i64, i64* undef
@@ -135,15 +135,15 @@ define void @udiv() {
 
   ret void;
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %1 = udiv i32 %li32, undef
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:  %1 = udiv i32 %li32, %arg32
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_0 = load i32, i32* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_1 = load i32, i32* undef
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %2 = udiv i32 %li32_0, %li32_1
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:  %2 = udiv i32 %li32_0, %li32_1
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64 = load i64, i64* undef
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %3 = udiv i64 %li64, undef
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:  %3 = udiv i64 %li64, %arg64
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_0 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_1 = load i64, i64* undef
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %4 = udiv i64 %li64_0, %li64_1
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:  %4 = udiv i64 %li64_0, %li64_1
 }
 
 define void @and() {
-- 
GitLab


From bf9eac7026cae322483e26c7c7ef76d916ef8ac3 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Thu, 25 Oct 2018 22:23:27 +0000
Subject: [PATCH 0596/1116] [x86] add tests for missed load folding; NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345325 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/vector-shuffle-128-v16.ll | 92 ++++++++++++++++++++++
 test/CodeGen/X86/vector-shuffle-256-v32.ll | 57 ++++++++++++++
 2 files changed, 149 insertions(+)

diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll
index 65335572229..d2410050b49 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -579,6 +579,98 @@ define <16 x i8> @shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15(
   ret <16 x i8> %shuffle
 }
 
+; PR27780 - https://bugs.llvm.org/show_bug.cgi?id=27780
+
+define <16 x i8> @load_fold_pblendvb(<16 x i8>* %px, <16 x i8> %y) {
+; SSE2-LABEL: load_fold_pblendvb:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
+; SSE2-NEXT:    andps %xmm1, %xmm0
+; SSE2-NEXT:    andnps (%rdi), %xmm1
+; SSE2-NEXT:    orps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_fold_pblendvb:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa (%rdi), %xmm1
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,xmm0[3],zero,zero,zero,xmm0[7,8,9],zero,xmm0[11],zero,zero,zero,xmm0[15]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,xmm1[2],zero,xmm1[4,5,6],zero,zero,zero,xmm1[10],zero,xmm1[12,13,14],zero
+; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_fold_pblendvb:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    movdqa (%rdi), %xmm2
+; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
+; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1OR2-LABEL: load_fold_pblendvb:
+; AVX1OR2:       # %bb.0:
+; AVX1OR2-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX1OR2-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
+; AVX1OR2-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1OR2-NEXT:    retq
+;
+; AVX512VL-LABEL: load_fold_pblendvb:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    movw $29812, %ax # imm = 0x7474
+; AVX512VL-NEXT:    kmovd %eax, %k1
+; AVX512VL-NEXT:    vmovdqu8 (%rdi), %xmm0 {%k1}
+; AVX512VL-NEXT:    retq
+  %x = load <16 x i8>, <16 x i8>* %px, align 16
+  %select = shufflevector <16 x i8> %x, <16 x i8> %y, <16 x i32> <i32 16, i32 17, i32 2, i32 19, i32 4, i32 5, i32 6, i32 23, i32 24, i32 25, i32 10, i32 27, i32 12, i32 13, i32 14, i32 31>
+  ret <16 x i8> %select
+}
+
+define <16 x i8> @load_fold_pblendvb_commute(<16 x i8>* %px, <16 x i8> %y) {
+; SSE2-LABEL: load_fold_pblendvb_commute:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
+; SSE2-NEXT:    movaps %xmm1, %xmm2
+; SSE2-NEXT:    andnps %xmm0, %xmm2
+; SSE2-NEXT:    andps (%rdi), %xmm1
+; SSE2-NEXT:    orps %xmm2, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_fold_pblendvb_commute:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa (%rdi), %xmm1
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,xmm0[2],zero,xmm0[4,5,6],zero,zero,zero,xmm0[10],zero,xmm0[12,13,14],zero
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1],zero,xmm1[3],zero,zero,zero,xmm1[7,8,9],zero,xmm1[11],zero,zero,zero,xmm1[15]
+; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_fold_pblendvb_commute:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
+; SSE41-NEXT:    pblendvb %xmm0, (%rdi), %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1OR2-LABEL: load_fold_pblendvb_commute:
+; AVX1OR2:       # %bb.0:
+; AVX1OR2-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
+; AVX1OR2-NEXT:    vpblendvb %xmm1, (%rdi), %xmm0, %xmm0
+; AVX1OR2-NEXT:    retq
+;
+; AVX512VL-LABEL: load_fold_pblendvb_commute:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX512VL-NEXT:    movw $29812, %ax # imm = 0x7474
+; AVX512VL-NEXT:    kmovd %eax, %k1
+; AVX512VL-NEXT:    vmovdqu8 %xmm0, %xmm1 {%k1}
+; AVX512VL-NEXT:    vmovdqa %xmm1, %xmm0
+; AVX512VL-NEXT:    retq
+  %x = load <16 x i8>, <16 x i8>* %px, align 16
+  %select = shufflevector <16 x i8> %y, <16 x i8> %x, <16 x i32> <i32 16, i32 17, i32 2, i32 19, i32 4, i32 5, i32 6, i32 23, i32 24, i32 25, i32 10, i32 27, i32 12, i32 13, i32 14, i32 31>
+  ret <16 x i8> %select
+}
+
 define <16 x i8> @trunc_v4i32_shuffle(<16 x i8> %a) {
 ; SSE2-LABEL: trunc_v4i32_shuffle:
 ; SSE2:       # %bb.0:
diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll
index a391387923a..8189be0311c 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -1643,6 +1643,63 @@ define <32 x i8> @shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_
   ret <32 x i8> %shuffle
 }
 
+; PR27780 - https://bugs.llvm.org/show_bug.cgi?id=27780
+
+define <32 x i8> @load_fold_pblendvb(<32 x i8>* %px, <32 x i8> %y) {
+; AVX1-LABEL: load_fold_pblendvb:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [-5.4861292804117373E+303,-5.4861292804117373E+303,-5.4861292804117373E+303,-5.4861292804117373E+303]
+; AVX1-NEXT:    vandnps (%rdi), %ymm1, %ymm2
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_fold_pblendvb:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512VL-LABEL: load_fold_pblendvb:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    movl $1953789044, %eax # imm = 0x74747474
+; AVX512VL-NEXT:    kmovd %eax, %k1
+; AVX512VL-NEXT:    vmovdqu8 (%rdi), %ymm0 {%k1}
+; AVX512VL-NEXT:    retq
+  %x = load <32 x i8>, <32 x i8>* %px, align 32
+  %select = shufflevector <32 x i8> %x, <32 x i8> %y, <32 x i32> <i32 32, i32 33, i32 2, i32 35, i32 4, i32 5, i32 6, i32 39, i32 40, i32 41, i32 10, i32 43, i32 12, i32 13, i32 14, i32 47, i32 48, i32 49, i32 18, i32 51, i32 20, i32 21, i32 22, i32 55, i32 56, i32 57, i32 26, i32 59, i32 28, i32 29, i32 30, i32 63>
+  ret <32 x i8> %select
+}
+
+define <32 x i8> @load_fold_pblendvb_commute(<32 x i8>* %px, <32 x i8> %y) {
+; AVX1-LABEL: load_fold_pblendvb_commute:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [-5.4861292804117373E+303,-5.4861292804117373E+303,-5.4861292804117373E+303,-5.4861292804117373E+303]
+; AVX1-NEXT:    vandnps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vandps (%rdi), %ymm1, %ymm1
+; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_fold_pblendvb_commute:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
+; AVX2-NEXT:    vpblendvb %ymm1, (%rdi), %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512VL-LABEL: load_fold_pblendvb_commute:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512VL-NEXT:    movl $1953789044, %eax # imm = 0x74747474
+; AVX512VL-NEXT:    kmovd %eax, %k1
+; AVX512VL-NEXT:    vmovdqu8 %ymm0, %ymm1 {%k1}
+; AVX512VL-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512VL-NEXT:    retq
+  %x = load <32 x i8>, <32 x i8>* %px, align 32
+  %select = shufflevector <32 x i8> %y, <32 x i8> %x, <32 x i32> <i32 32, i32 33, i32 2, i32 35, i32 4, i32 5, i32 6, i32 39, i32 40, i32 41, i32 10, i32 43, i32 12, i32 13, i32 14, i32 47, i32 48, i32 49, i32 18, i32 51, i32 20, i32 21, i32 22, i32 55, i32 56, i32 57, i32 26, i32 59, i32 28, i32 29, i32 30, i32 63>
+  ret <32 x i8> %select
+}
+
 define <32 x i8> @shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31(<32 x i8> %a) {
 ; AVX1OR2-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31:
 ; AVX1OR2:       # %bb.0:
-- 
GitLab


From f9dc2283f4fd0efb1d3ad7c62d242a315c3a2237 Mon Sep 17 00:00:00 2001
From: David Blaikie <dblaikie@gmail.com>
Date: Thu, 25 Oct 2018 22:26:25 +0000
Subject: [PATCH 0597/1116] DebugInfo: Explain why DW_LLE_(GNU_)startx_length
 is used

This isn't the most object-size efficient encoding, but it's the only
one GDB supports for the pre-standard fission format. I've written fixes
for this twice now... - so perhaps this comment will help me remember
why neither of these have been committed and why I shouldn't try to
write a third fix another year from now...

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345326 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 7f9ef3eba90..4a9ed6d03c6 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -1962,10 +1962,13 @@ void DwarfDebug::emitDebugLocDWO() {
   for (const auto &List : DebugLocs.getLists()) {
     Asm->OutStreamer->EmitLabel(List.Label);
     for (const auto &Entry : DebugLocs.getEntries(List)) {
-      // Just always use start_length for now - at least that's one address
-      // rather than two. We could get fancier and try to, say, reuse an
-      // address we know we've emitted elsewhere (the start of the function?
-      // The start of the CU or CU subrange that encloses this range?)
+      // GDB only supports startx_length in pre-standard split-DWARF.
+      // (in v5 standard loclists, it currently* /only/ supports base_address +
+      // offset_pair, so the implementations can't really share much since they
+      // need to use different representations)
+      // * as of October 2018, at least
+      // Ideally/in v5, this could use SectionLabels to reuse existing addresses
+      // in the address pool to minimize object size/relocations.
       Asm->emitInt8(dwarf::DW_LLE_startx_length);
       unsigned idx = AddrPool.getIndex(Entry.BeginSym);
       Asm->EmitULEB128(idx);
-- 
GitLab


From bc1846b491bf861cfa569260b24206b0322e7452 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulsson@linux.vnet.ibm.com>
Date: Thu, 25 Oct 2018 22:28:25 +0000
Subject: [PATCH 0598/1116] [SystemZ] Improve getMemoryOpCost() to find
 foldable loads that are converted.

The SystemZ backend can do arithmetic of memory by loading and then extending
one of the operands. Similarly, a load + truncate can be folded into an
operand.

This patch improves the SystemZ TTI cost function to recognize this.

Review: Ulrich Weigand
https://reviews.llvm.org/D52692

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345327 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../SystemZ/SystemZTargetTransformInfo.cpp    | 130 ++++++---
 .../SystemZ/SystemZTargetTransformInfo.h      |   1 +
 .../SystemZ/memop-folding-int-arith.ll        | 258 +++++++++++++++++-
 3 files changed, 347 insertions(+), 42 deletions(-)

diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index f52c9ca6e49..670a8d393f8 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -849,54 +849,102 @@ getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
   return BaseT::getVectorInstrCost(Opcode, Val, Index);
 }
 
+// Check if a load may be folded as a memory operand in its user.
+bool SystemZTTIImpl::
+isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue) {
+  if (!Ld->hasOneUse())
+    return false;
+  FoldedValue = Ld;
+  const Instruction *UserI = cast<Instruction>(*Ld->user_begin());
+  unsigned LoadedBits = getScalarSizeInBits(Ld->getType());
+  unsigned TruncBits = 0;
+  unsigned SExtBits = 0;
+  unsigned ZExtBits = 0;
+  if (UserI->hasOneUse()) {
+    unsigned UserBits = UserI->getType()->getScalarSizeInBits();
+    if (isa<TruncInst>(UserI))
+      TruncBits = UserBits;
+    else if (isa<SExtInst>(UserI))
+      SExtBits = UserBits;
+    else if (isa<ZExtInst>(UserI))
+      ZExtBits = UserBits;
+  }
+  if (TruncBits || SExtBits || ZExtBits) {
+    FoldedValue = UserI;
+    UserI = cast<Instruction>(*UserI->user_begin());
+    // Load (single use) -> trunc/extend (single use) -> UserI
+  }
+  switch (UserI->getOpcode()) {
+  case Instruction::Add: // SE: 16->32, 16/32->64, z14:16->64. ZE: 32->64
+  case Instruction::Sub:
+    if (LoadedBits == 32 && ZExtBits == 64)
+      return true;
+    LLVM_FALLTHROUGH;
+  case Instruction::Mul: // SE: 16->32, 32->64, z14:16->64
+    if (LoadedBits == 16 &&
+        (SExtBits == 32 ||
+         (SExtBits == 64 && ST->hasMiscellaneousExtensions2())))
+      return true;
+    LLVM_FALLTHROUGH;
+  case Instruction::SDiv:// SE: 32->64
+    if (LoadedBits == 32 && SExtBits == 64)
+      return true;
+    LLVM_FALLTHROUGH;
+  case Instruction::UDiv:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::ICmp:
+    // This also makes sense for float operations, but disabled for now due
+    // to regressions.
+    // case Instruction::FCmp:
+    // case Instruction::FAdd:
+    // case Instruction::FSub:
+    // case Instruction::FMul:
+    // case Instruction::FDiv:
+
+    // All possible extensions of memory checked above.
+    if (SExtBits || ZExtBits)
+      return false;
+
+    unsigned LoadOrTruncBits = (TruncBits ? TruncBits : LoadedBits);
+    return (LoadOrTruncBits == 32 || LoadOrTruncBits == 64);
+    break;
+  }
+  return false;
+}
+
 int SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                     unsigned Alignment, unsigned AddressSpace,
                                     const Instruction *I) {
   assert(!Src->isVoidTy() && "Invalid type");
 
-  if (!Src->isVectorTy() && Opcode == Instruction::Load &&
-      I != nullptr && I->hasOneUse()) {
-      const Instruction *UserI = cast<Instruction>(*I->user_begin());
-      unsigned Bits = getScalarSizeInBits(Src);
-      bool FoldsLoad = false;
-      switch (UserI->getOpcode()) {
-      case Instruction::ICmp:
-      case Instruction::Add:
-      case Instruction::Sub:
-      case Instruction::Mul:
-      case Instruction::SDiv:
-      case Instruction::UDiv:
-      case Instruction::And:
-      case Instruction::Or:
-      case Instruction::Xor:
-      // This also makes sense for float operations, but disabled for now due
-      // to regressions.
-      // case Instruction::FCmp:
-      // case Instruction::FAdd:
-      // case Instruction::FSub:
-      // case Instruction::FMul:
-      // case Instruction::FDiv:
-        FoldsLoad = (Bits == 32 || Bits == 64);
-        break;
-      }
-
-      if (FoldsLoad) {
-        assert (UserI->getNumOperands() == 2 &&
-                "Expected to only handle binops.");
-
-        // UserI can't fold two loads, so in that case return 0 cost only
-        // half of the time.
-        for (unsigned i = 0; i < 2; ++i) {
-          if (UserI->getOperand(i) == I)
-            continue;
-          if (LoadInst *LI = dyn_cast<LoadInst>(UserI->getOperand(i))) {
-            if (LI->hasOneUse())
-              return i == 0;
-          }
+  if (!Src->isVectorTy() && Opcode == Instruction::Load && I != nullptr) {
+    // Store the load or its truncated or extended value in FoldedValue.
+    const Instruction *FoldedValue = nullptr;
+    if (isFoldableLoad(cast<LoadInst>(I), FoldedValue)) {
+      const Instruction *UserI = cast<Instruction>(*FoldedValue->user_begin());
+      assert (UserI->getNumOperands() == 2 && "Expected a binop.");
+
+      // UserI can't fold two loads, so in that case return 0 cost only
+      // half of the time.
+      for (unsigned i = 0; i < 2; ++i) {
+        if (UserI->getOperand(i) == FoldedValue)
+          continue;
+
+        if (Instruction *OtherOp = dyn_cast<Instruction>(UserI->getOperand(i))){
+          LoadInst *OtherLoad = dyn_cast<LoadInst>(OtherOp);
+          if (!OtherLoad &&
+              (isa<TruncInst>(OtherOp) || isa<SExtInst>(OtherOp) ||
+               isa<ZExtInst>(OtherOp)))
+            OtherLoad = dyn_cast<LoadInst>(OtherOp->getOperand(0));
+          if (OtherLoad && isFoldableLoad(OtherLoad, FoldedValue/*dummy*/))
+            return i == 0; // Both operands foldable.
         }
-
-        return 0;
       }
+
+      return 0; // Only I is foldable in user.
+    }
   }
 
   unsigned NumOps =
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index 92b2b9bdcb8..347a8a632f0 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -85,6 +85,7 @@ public:
   int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
                          const Instruction *I = nullptr);
   int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
+  bool isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue);
   int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                       unsigned AddressSpace, const Instruction *I = nullptr);
 
diff --git a/test/Analysis/CostModel/SystemZ/memop-folding-int-arith.ll b/test/Analysis/CostModel/SystemZ/memop-folding-int-arith.ll
index 1ca16bab538..8198386832e 100644
--- a/test/Analysis/CostModel/SystemZ/memop-folding-int-arith.ll
+++ b/test/Analysis/CostModel/SystemZ/memop-folding-int-arith.ll
@@ -1,4 +1,7 @@
-; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 \
+; RUN:  | FileCheck %s -check-prefixes=CHECK,Z13
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z14 \
+; RUN:  | FileCheck %s -check-prefixes=CHECK,Z14
 ;
 ; Test that loads into operations that can fold one memory operand get zero
 ; cost. In the case that both operands are loaded, one load should get a cost
@@ -19,6 +22,35 @@ define void @add() {
   %li64_1 = load i64, i64* undef
   add i64 %li64_0, %li64_1
 
+  ; Truncated load
+  %li64_2 = load i64, i64* undef
+  %tr = trunc i64 %li64_2 to i32
+  add i32 %tr, undef
+
+  ; Sign-extended loads
+  %li16_0 = load i16, i16* undef
+  %sext_0 = sext i16 %li16_0 to i32
+  add i32 %sext_0, undef
+
+  %li16_1 = load i16, i16* undef
+  %sext_1 = sext i16 %li16_1 to i64
+  add i64 %sext_1, undef
+
+  %li32_2 = load i32, i32* undef
+  %sext_2 = sext i32 %li32_2 to i64
+  add i64 %sext_2, undef
+
+  ; Zero-extended loads
+  %li32_3 = load i32, i32* undef
+  %zext_0 = zext i32 %li32_3 to i64
+  add i64 %zext_0, undef
+
+  ; Loads with multiple uses are *not* folded
+  %li16_3 = load i16, i16* undef
+  %sext_3 = sext i16 %li16_3 to i32
+  %sext_4 = sext i16 %li16_3 to i32
+  add i32 %sext_3, undef
+
   ret void;
 
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
@@ -31,6 +63,26 @@ define void @add() {
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_0 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_1 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = add i64 %li64_0, %li64_1
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_2 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr = trunc i64 %li64_2 to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %5 = add i32 %tr, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li16_0 = load i16, i16* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_0 = sext i16 %li16_0 to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %6 = add i32 %sext_0, undef
+; Z13:   Cost Model: Found an estimated cost of 1 for instruction:   %li16_1 = load i16, i16* undef
+; Z14:   Cost Model: Found an estimated cost of 0 for instruction:   %li16_1 = load i16, i16* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_1 = sext i16 %li16_1 to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %7 = add i64 %sext_1, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_2 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_2 = sext i32 %li32_2 to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %8 = add i64 %sext_2, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_3 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %zext_0 = zext i32 %li32_3 to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %9 = add i64 %zext_0, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li16_3 = load i16, i16* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_3 = sext i16 %li16_3 to i32
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_4 = sext i16 %li16_3 to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %10 = add i32 %sext_3, undef
 }
 
 define void @sub() {
@@ -48,6 +100,35 @@ define void @sub() {
   %li64_1 = load i64, i64* undef
   sub i64 %li64_0, %li64_1
 
+  ; Truncated load
+  %li64_2 = load i64, i64* undef
+  %tr = trunc i64 %li64_2 to i32
+  sub i32 %tr, undef
+
+  ; Sign-extended loads
+  %li16_0 = load i16, i16* undef
+  %sext_0 = sext i16 %li16_0 to i32
+  sub i32 %sext_0, undef
+
+  %li16_1 = load i16, i16* undef
+  %sext_1 = sext i16 %li16_1 to i64
+  sub i64 %sext_1, undef
+
+  %li32_2 = load i32, i32* undef
+  %sext_2 = sext i32 %li32_2 to i64
+  sub i64 %sext_2, undef
+
+  ; Zero-extended loads
+  %li32_3 = load i32, i32* undef
+  %zext_0 = zext i32 %li32_3 to i64
+  sub i64 %zext_0, undef
+
+  ; Loads with multiple uses are *not* folded
+  %li16_3 = load i16, i16* undef
+  %sext_3 = sext i16 %li16_3 to i32
+  %sext_4 = sext i16 %li16_3 to i32
+  sub i32 %sext_3, undef
+
   ret void;
 
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
@@ -60,6 +141,26 @@ define void @sub() {
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_0 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_1 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = sub i64 %li64_0, %li64_1
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_2 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr = trunc i64 %li64_2 to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %5 = sub i32 %tr, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li16_0 = load i16, i16* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_0 = sext i16 %li16_0 to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %6 = sub i32 %sext_0, undef
+; Z13:   Cost Model: Found an estimated cost of 1 for instruction:   %li16_1 = load i16, i16* undef
+; Z14:   Cost Model: Found an estimated cost of 0 for instruction:   %li16_1 = load i16, i16* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_1 = sext i16 %li16_1 to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %7 = sub i64 %sext_1, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_2 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_2 = sext i32 %li32_2 to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %8 = sub i64 %sext_2, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_3 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %zext_0 = zext i32 %li32_3 to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %9 = sub i64 %zext_0, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li16_3 = load i16, i16* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_3 = sext i16 %li16_3 to i32
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_4 = sext i16 %li16_3 to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %10 = sub i32 %sext_3, undef
 }
 
 define void @mul() {
@@ -77,6 +178,35 @@ define void @mul() {
   %li64_1 = load i64, i64* undef
   mul i64 %li64_0, %li64_1
 
+  ; Truncated load
+  %li64_2 = load i64, i64* undef
+  %tr = trunc i64 %li64_2 to i32
+  mul i32 %tr, undef
+
+  ; Sign-extended loads
+  %li16_0 = load i16, i16* undef
+  %sext_0 = sext i16 %li16_0 to i32
+  mul i32 %sext_0, undef
+
+  %li16_1 = load i16, i16* undef
+  %sext_1 = sext i16 %li16_1 to i64
+  mul i64 %sext_1, undef
+
+  %li32_2 = load i32, i32* undef
+  %sext_2 = sext i32 %li32_2 to i64
+  mul i64 %sext_2, undef
+
+  ; Zero-extended loads are *not* folded
+  %li16_2 = load i16, i16* undef
+  %zext_0 = zext i16 %li16_2 to i32
+  mul i32 %zext_0, undef
+
+  ; Loads with multiple uses are *not* folded
+  %li16_3 = load i16, i16* undef
+  %sext_3 = sext i16 %li16_3 to i32
+  %sext_4 = sext i16 %li16_3 to i32
+  mul i32 %sext_3, undef
+
   ret void;
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %1 = mul i32 %li32, undef
@@ -88,6 +218,26 @@ define void @mul() {
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_0 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_1 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = mul i64 %li64_0, %li64_1
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_2 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr = trunc i64 %li64_2 to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %5 = mul i32 %tr, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li16_0 = load i16, i16* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_0 = sext i16 %li16_0 to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %6 = mul i32 %sext_0, undef
+; Z13:   Cost Model: Found an estimated cost of 1 for instruction:   %li16_1 = load i16, i16* undef
+; Z14:   Cost Model: Found an estimated cost of 0 for instruction:   %li16_1 = load i16, i16* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_1 = sext i16 %li16_1 to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %7 = mul i64 %sext_1, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_2 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_2 = sext i32 %li32_2 to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %8 = mul i64 %sext_2, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li16_2 = load i16, i16* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %zext_0 = zext i16 %li16_2 to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %9 = mul i32 %zext_0, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li16_3 = load i16, i16* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_3 = sext i16 %li16_3 to i32
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_4 = sext i16 %li16_3 to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %10 = mul i32 %sext_3, undef
 }
 
 define void @sdiv(i32 %arg32, i64 %arg64) {
@@ -105,6 +255,22 @@ define void @sdiv(i32 %arg32, i64 %arg64) {
   %li64_1 = load i64, i64* undef
   sdiv i64 %li64_0, %li64_1
 
+  ; Truncated load
+  %li64_2 = load i64, i64* undef
+  %tr = trunc i64 %li64_2 to i32
+  sdiv i32 %tr, undef
+
+  ; Sign-extended loads
+  %li32_2 = load i32, i32* undef
+  %sext_0 = sext i32 %li32_2 to i64
+  sdiv i64 %sext_0, undef
+
+  ; Loads with multiple uses are *not* folded
+  %li32_3 = load i32, i32* undef
+  %sext_1 = sext i32 %li32_3 to i64
+  %sext_2 = sext i32 %li32_3 to i64
+  sdiv i64 %sext_1, undef
+
   ret void;
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
 ; CHECK: Cost Model: Found an estimated cost of 21 for instruction:  %1 = sdiv i32 %li32, %arg32
@@ -116,6 +282,16 @@ define void @sdiv(i32 %arg32, i64 %arg64) {
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_0 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_1 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 20 for instruction:  %4 = sdiv i64 %li64_0, %li64_1
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_2 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr = trunc i64 %li64_2 to i32
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %5 = sdiv i32 %tr, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_2 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_0 = sext i32 %li32_2 to i64
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %6 = sdiv i64 %sext_0, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_3 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_1 = sext i32 %li32_3 to i64
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_2 = sext i32 %li32_3 to i64
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %7 = sdiv i64 %sext_1, undef
 }
 
 define void @udiv(i32 %arg32, i64 %arg64) {
@@ -133,6 +309,16 @@ define void @udiv(i32 %arg32, i64 %arg64) {
   %li64_1 = load i64, i64* undef
   udiv i64 %li64_0, %li64_1
 
+  ; Truncated load
+  %li64_2 = load i64, i64* undef
+  %tr_0 = trunc i64 %li64_2 to i32
+  udiv i32 %tr_0, undef
+
+  ; Loads with multiple uses are *not* folded
+  %li64_3 = load i64, i64* undef
+  %tr_1 = trunc i64 %li64_3 to i32
+  udiv i64 %li64_3, undef
+
   ret void;
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
 ; CHECK: Cost Model: Found an estimated cost of 21 for instruction:  %1 = udiv i32 %li32, %arg32
@@ -144,6 +330,12 @@ define void @udiv(i32 %arg32, i64 %arg64) {
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_0 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_1 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 21 for instruction:  %4 = udiv i64 %li64_0, %li64_1
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_2 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr_0 = trunc i64 %li64_2 to i32
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %5 = udiv i32 %tr_0, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_3 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr_1 = trunc i64 %li64_3 to i32
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %6 = udiv i64 %li64_3, undef
 }
 
 define void @and() {
@@ -161,6 +353,16 @@ define void @and() {
   %li64_1 = load i64, i64* undef
   and i64 %li64_0, %li64_1
 
+  ; Truncated load
+  %li64_2 = load i64, i64* undef
+  %tr_0 = trunc i64 %li64_2 to i32
+  and i32 %tr_0, undef
+
+  ; Loads with multiple uses are *not* folded
+  %li64_3 = load i64, i64* undef
+  %tr_1 = trunc i64 %li64_3 to i32
+  and i64 %li64_3, undef
+
   ret void;
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %1 = and i32 %li32, undef
@@ -172,6 +374,12 @@ define void @and() {
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_0 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_1 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = and i64 %li64_0, %li64_1
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_2 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr_0 = trunc i64 %li64_2 to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %5 = and i32 %tr_0, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_3 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr_1 = trunc i64 %li64_3 to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %6 = and i64 %li64_3, undef
 }
 
 define void @or() {
@@ -189,6 +397,16 @@ define void @or() {
   %li64_1 = load i64, i64* undef
   or i64 %li64_0, %li64_1
 
+  ; Truncated load
+  %li64_2 = load i64, i64* undef
+  %tr_0 = trunc i64 %li64_2 to i32
+  or i32 %tr_0, undef
+
+  ; Loads with multiple uses are *not* folded
+  %li64_3 = load i64, i64* undef
+  %tr_1 = trunc i64 %li64_3 to i32
+  or i64 %li64_3, undef
+
   ret void;
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %1 = or i32 %li32, undef
@@ -200,6 +418,12 @@ define void @or() {
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_0 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_1 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = or i64 %li64_0, %li64_1
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_2 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr_0 = trunc i64 %li64_2 to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %5 = or i32 %tr_0, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_3 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr_1 = trunc i64 %li64_3 to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %6 = or i64 %li64_3, undef
 }
 
 define void @xor() {
@@ -217,6 +441,16 @@ define void @xor() {
   %li64_1 = load i64, i64* undef
   xor i64 %li64_0, %li64_1
 
+  ; Truncated load
+  %li64_2 = load i64, i64* undef
+  %tr_0 = trunc i64 %li64_2 to i32
+  xor i32 %tr_0, undef
+
+  ; Loads with multiple uses are *not* folded
+  %li64_3 = load i64, i64* undef
+  %tr_1 = trunc i64 %li64_3 to i32
+  xor i64 %li64_3, undef
+
   ret void;
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %1 = xor i32 %li32, undef
@@ -228,6 +462,12 @@ define void @xor() {
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_0 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_1 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = xor i64 %li64_0, %li64_1
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_2 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr_0 = trunc i64 %li64_2 to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %5 = xor i32 %tr_0, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_3 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr_1 = trunc i64 %li64_3 to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %6 = xor i64 %li64_3, undef
 }
 
 define void @icmp() {
@@ -245,6 +485,16 @@ define void @icmp() {
   %li64_1 = load i64, i64* undef
   icmp eq i64 %li64_0, %li64_1
 
+  ; Truncated load
+  %li64_2 = load i64, i64* undef
+  %tr_0 = trunc i64 %li64_2 to i32
+  icmp eq i32 %tr_0, undef
+
+  ; Loads with multiple uses are *not* folded
+  %li64_3 = load i64, i64* undef
+  %tr_1 = trunc i64 %li64_3 to i32
+  icmp eq i64 %li64_3, undef
+
   ret void;
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %1 = icmp eq i32 %li32, undef
@@ -256,4 +506,10 @@ define void @icmp() {
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_0 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_1 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = icmp eq i64 %li64_0, %li64_1
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_2 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr_0 = trunc i64 %li64_2 to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %5 = icmp eq i32 %tr_0, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_3 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr_1 = trunc i64 %li64_3 to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %6 = icmp eq i64 %li64_3, undef
 }
-- 
GitLab


From aa15a2efde15732f23b0bd57dbba000c777b862c Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulsson@linux.vnet.ibm.com>
Date: Thu, 25 Oct 2018 22:53:27 +0000
Subject: [PATCH 0599/1116] [SystemZ]  NFC reformatting in
 SystemZTargetTransformInfo.cpp

Some lines more than 80 characters long reformatted.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345331 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/SystemZ/SystemZTargetTransformInfo.cpp | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 670a8d393f8..3bc87ef0225 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -393,7 +393,8 @@ int SystemZTTIImpl::getArithmeticInstrCost(
   }
 
   if (Ty->isVectorTy()) {
-    assert (ST->hasVector() && "getArithmeticInstrCost() called with vector type.");
+    assert(ST->hasVector() &&
+           "getArithmeticInstrCost() called with vector type.");
     unsigned VF = Ty->getVectorNumElements();
     unsigned NumVectors = getNumVectorRegs(Ty);
 
@@ -428,7 +429,8 @@ int SystemZTTIImpl::getArithmeticInstrCost(
           return NumVectors;
         // Return the cost of multiple scalar invocation plus the cost of
         // inserting and extracting the values.
-        unsigned ScalarCost = getArithmeticInstrCost(Opcode, Ty->getScalarType());
+        unsigned ScalarCost =
+            getArithmeticInstrCost(Opcode, Ty->getScalarType());
         unsigned Cost = (VF * ScalarCost) + getScalarizationOverhead(Ty, Args);
         // FIXME: VF 2 for these FP operations are currently just as
         // expensive as for VF 4.
@@ -759,8 +761,8 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
   return BaseT::getCastInstrCost(Opcode, Dst, Src, I);
 }
 
-int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
-                                       const Instruction *I) {
+int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+                                       Type *CondTy, const Instruction *I) {
   if (ValTy->isVectorTy()) {
     assert (ST->hasVector() && "getCmpSelInstrCost() called with vector type.");
     unsigned VF = ValTy->getVectorNumElements();
@@ -821,7 +823,7 @@ int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondT
     }
     case Instruction::Select:
       if (ValTy->isFloatingPointTy())
-        return 4; // No load on condition for FP, so this costs a conditional jump.
+        return 4; // No load on condition for FP - costs a conditional jump.
       return 1; // Load On Condition.
     }
   }
-- 
GitLab


From 35a25c7fdd0723e0aee2722ba7bb5d6217c16e63 Mon Sep 17 00:00:00 2001
From: Heejin Ahn <aheejin@gmail.com>
Date: Thu, 25 Oct 2018 23:35:13 +0000
Subject: [PATCH 0600/1116] [WebAssembly] Error out when block/loop markers
 mismatch

Summary:
Currently InstPrinter ignores if there are mismatches between block/loop
and end markers by skipping the case if ControlFlowStack is empty. I
guess it is better to explicitly error out in this case, because this
signals invalid input.

Reviewers: aardappel

Subscribers: dschuff, sbc100, jgravelle-google, sunfish, llvm-commits

Differential Revision: https://reviews.llvm.org/D53620

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345333 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstPrinter/WebAssemblyInstPrinter.cpp     | 14 +++++++-------
 test/CodeGen/WebAssembly/block-mismatch.mir    | 18 ++++++++++++++++++
 2 files changed, 25 insertions(+), 7 deletions(-)
 create mode 100644 test/CodeGen/WebAssembly/block-mismatch.mir

diff --git a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
index 08c1155fed7..e26cf1d8cf4 100644
--- a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
+++ b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
@@ -85,16 +85,16 @@ void WebAssemblyInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
       break;
     case WebAssembly::END_LOOP:
     case WebAssembly::END_LOOP_S:
-      // Have to guard against an empty stack, in case of mismatched pairs
-      // in assembly parsing.
-      if (!ControlFlowStack.empty())
-        ControlFlowStack.pop_back();
+      if (ControlFlowStack.empty())
+        report_fatal_error("End marker mismatch!");
+      ControlFlowStack.pop_back();
       break;
     case WebAssembly::END_BLOCK:
     case WebAssembly::END_BLOCK_S:
-      if (!ControlFlowStack.empty())
-        printAnnotation(
-            OS, "label" + utostr(ControlFlowStack.pop_back_val().first) + ':');
+      if (ControlFlowStack.empty())
+        report_fatal_error("END marker mismatch!");
+      printAnnotation(
+          OS, "label" + utostr(ControlFlowStack.pop_back_val().first) + ':');
       break;
     }
 
diff --git a/test/CodeGen/WebAssembly/block-mismatch.mir b/test/CodeGen/WebAssembly/block-mismatch.mir
new file mode 100644
index 00000000000..0da58e9ddb8
--- /dev/null
+++ b/test/CodeGen/WebAssembly/block-mismatch.mir
@@ -0,0 +1,18 @@
+# RUN: not llc -mtriple=wasm32-unknown-unknown -start-after xray-instrumentation %s -o /dev/null 2>&1 | FileCheck %s
+
+# This tests if there are block/loop marker mismatches, the program crashes.
+
+---
+name: block_mismatch
+liveins:
+  - { reg: '$arguments', reg: '$value_stack' }
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $arguments, $value_stack
+    BLOCK 64, implicit-def $value_stack, implicit $value_stack
+    END_BLOCK implicit-def $value_stack, implicit $value_stack
+    ; CHECK: LLVM ERROR: END marker mismatch!
+    END_BLOCK implicit-def $value_stack, implicit $value_stack
+    RETURN_VOID implicit-def dead $arguments
+...
-- 
GitLab


From dcebff85b487d72d718a82bc8d052ae78cbaf14d Mon Sep 17 00:00:00 2001
From: Heejin Ahn <aheejin@gmail.com>
Date: Thu, 25 Oct 2018 23:35:14 +0000
Subject: [PATCH 0601/1116] Address comments

- Add llvm-mc test case (and delete the old one)
- Change report_fatal_error to assertions

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345334 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstPrinter/WebAssemblyInstPrinter.cpp     |  6 ++----
 test/CodeGen/WebAssembly/block-mismatch.mir    | 18 ------------------
 test/MC/WebAssembly/block-mismatch.s           | 13 +++++++++++++
 3 files changed, 15 insertions(+), 22 deletions(-)
 delete mode 100644 test/CodeGen/WebAssembly/block-mismatch.mir
 create mode 100644 test/MC/WebAssembly/block-mismatch.s

diff --git a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
index e26cf1d8cf4..33b224adc6e 100644
--- a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
+++ b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
@@ -85,14 +85,12 @@ void WebAssemblyInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
       break;
     case WebAssembly::END_LOOP:
     case WebAssembly::END_LOOP_S:
-      if (ControlFlowStack.empty())
-        report_fatal_error("End marker mismatch!");
+      assert(!ControlFlowStack.empty() && "End marker mismatch!");
       ControlFlowStack.pop_back();
       break;
     case WebAssembly::END_BLOCK:
     case WebAssembly::END_BLOCK_S:
-      if (ControlFlowStack.empty())
-        report_fatal_error("END marker mismatch!");
+      assert(!ControlFlowStack.empty() && "End marker mismatch!");
       printAnnotation(
           OS, "label" + utostr(ControlFlowStack.pop_back_val().first) + ':');
       break;
diff --git a/test/CodeGen/WebAssembly/block-mismatch.mir b/test/CodeGen/WebAssembly/block-mismatch.mir
deleted file mode 100644
index 0da58e9ddb8..00000000000
--- a/test/CodeGen/WebAssembly/block-mismatch.mir
+++ /dev/null
@@ -1,18 +0,0 @@
-# RUN: not llc -mtriple=wasm32-unknown-unknown -start-after xray-instrumentation %s -o /dev/null 2>&1 | FileCheck %s
-
-# This tests if there are block/loop marker mismatches, the program crashes.
-
----
-name: block_mismatch
-liveins:
-  - { reg: '$arguments', reg: '$value_stack' }
-tracksRegLiveness: true
-body: |
-  bb.0:
-    liveins: $arguments, $value_stack
-    BLOCK 64, implicit-def $value_stack, implicit $value_stack
-    END_BLOCK implicit-def $value_stack, implicit $value_stack
-    ; CHECK: LLVM ERROR: END marker mismatch!
-    END_BLOCK implicit-def $value_stack, implicit $value_stack
-    RETURN_VOID implicit-def dead $arguments
-...
diff --git a/test/MC/WebAssembly/block-mismatch.s b/test/MC/WebAssembly/block-mismatch.s
new file mode 100644
index 00000000000..586407a054a
--- /dev/null
+++ b/test/MC/WebAssembly/block-mismatch.s
@@ -0,0 +1,13 @@
+# RUN: not llvm-mc -triple=wasm32-unknown-unknown %s -o - 2>&1 | FileCheck %s
+
+# This tests if there are block/loop marker mismatches, the program crashes.
+  .text
+  .type  test0,@function
+test0:
+  block
+  end_block
+  # CHECK: End marker mismatch!
+  end_block
+  end_function
+.Lfunc_end1:
+  .size  test1, .Lfunc_end1-test1
-- 
GitLab


From 8b707d722259c17bc6c55cb399e520ce211c3632 Mon Sep 17 00:00:00 2001
From: Heejin Ahn <aheejin@gmail.com>
Date: Thu, 25 Oct 2018 23:35:15 +0000
Subject: [PATCH 0602/1116] Tidy up test case

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345335 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/MC/WebAssembly/block-mismatch.s | 2 --
 1 file changed, 2 deletions(-)

diff --git a/test/MC/WebAssembly/block-mismatch.s b/test/MC/WebAssembly/block-mismatch.s
index 586407a054a..507ab3fa0e2 100644
--- a/test/MC/WebAssembly/block-mismatch.s
+++ b/test/MC/WebAssembly/block-mismatch.s
@@ -9,5 +9,3 @@ test0:
   # CHECK: End marker mismatch!
   end_block
   end_function
-.Lfunc_end1:
-  .size  test1, .Lfunc_end1-test1
-- 
GitLab


From 0c69a17dcceca25d3e7787b17ddf3121701189ca Mon Sep 17 00:00:00 2001
From: Heejin Ahn <aheejin@gmail.com>
Date: Thu, 25 Oct 2018 23:35:15 +0000
Subject: [PATCH 0603/1116] Delete test case. Assertions can't be tested.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345336 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/MC/WebAssembly/block-mismatch.s | 11 -----------
 1 file changed, 11 deletions(-)
 delete mode 100644 test/MC/WebAssembly/block-mismatch.s

diff --git a/test/MC/WebAssembly/block-mismatch.s b/test/MC/WebAssembly/block-mismatch.s
deleted file mode 100644
index 507ab3fa0e2..00000000000
--- a/test/MC/WebAssembly/block-mismatch.s
+++ /dev/null
@@ -1,11 +0,0 @@
-# RUN: not llvm-mc -triple=wasm32-unknown-unknown %s -o - 2>&1 | FileCheck %s
-
-# This tests if there are block/loop marker mismatches, the program crashes.
-  .text
-  .type  test0,@function
-test0:
-  block
-  end_block
-  # CHECK: End marker mismatch!
-  end_block
-  end_function
-- 
GitLab


From 50bf7df5bb75e610358a9b6a823bbff310d17569 Mon Sep 17 00:00:00 2001
From: Bryan Chan <bryan.chan@huawei.com>
Date: Thu, 25 Oct 2018 23:36:41 +0000
Subject: [PATCH 0604/1116] [AArch64] Implement FP16FML intrinsics

Add LLVM intrinsics for the ARMv8.2-A FP16FML vector-form instructions. Add a
DAG pattern to define the indexed-form intrinsics in terms of the vector-form
ones, similarly to how the Dot Product intrinsics were implemented.

Based on a patch by Gao Yiling.

Differential Revision: https://reviews.llvm.org/D53632


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345337 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/IR/IntrinsicsAArch64.td      | 11 ++++
 lib/Target/AArch64/AArch64InstrFormats.td | 44 +++++++++++---
 lib/Target/AArch64/AArch64InstrInfo.td    | 26 +++-----
 test/CodeGen/AArch64/neon-fp16fml.ll      | 74 +++++++++++++++++++++++
 4 files changed, 130 insertions(+), 25 deletions(-)
 create mode 100644 test/CodeGen/AArch64/neon-fp16fml.ll

diff --git a/include/llvm/IR/IntrinsicsAArch64.td b/include/llvm/IR/IntrinsicsAArch64.td
index 5f86ee7cdb4..ff25750fe39 100644
--- a/include/llvm/IR/IntrinsicsAArch64.td
+++ b/include/llvm/IR/IntrinsicsAArch64.td
@@ -160,6 +160,11 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
     : Intrinsic<[llvm_anyvector_ty],
                 [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],
                 [IntrNoMem]>;
+
+  class AdvSIMD_FP16FML_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty],
+                [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],
+                [IntrNoMem]>;
 }
 
 // Arithmetic ops
@@ -430,6 +435,12 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {
   // v8.2-A Dot Product
   def int_aarch64_neon_udot : AdvSIMD_Dot_Intrinsic;
   def int_aarch64_neon_sdot : AdvSIMD_Dot_Intrinsic;
+
+  // v8.2-A FP16 Fused Multiply-Add Long
+  def int_aarch64_neon_fmlal : AdvSIMD_FP16FML_Intrinsic;
+  def int_aarch64_neon_fmlsl : AdvSIMD_FP16FML_Intrinsic;
+  def int_aarch64_neon_fmlal2 : AdvSIMD_FP16FML_Intrinsic;
+  def int_aarch64_neon_fmlsl2 : AdvSIMD_FP16FML_Intrinsic;
 }
 
 let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td
index 3ebbb446c12..d1e189362f0 100644
--- a/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/lib/Target/AArch64/AArch64InstrFormats.td
@@ -4941,14 +4941,27 @@ class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<3> size, bits<5> opcode,
   let Inst{4-0}   = Rd;
 }
 
-let Predicates = [HasNEON, HasFP16FML] in
+// ARMv8.2 Fused Multiply Add Long Instructions (Vector)
 class BaseSIMDThreeSameMult<bit Q, bit U, bit b13, bits<3> size, string asm, string kind1,
-                                 string kind2> :
-        BaseSIMDThreeSameVector<Q, U, size, 0b11101, V128, asm, kind1, [] > {
+                                 string kind2, RegisterOperand RegType,
+                                 ValueType AccumType, ValueType InputType,
+                                 SDPatternOperator OpNode> :
+        BaseSIMDThreeSameVectorTied<Q, U, size, 0b11101, RegType, asm, kind1,
+		[(set (AccumType RegType:$dst),
+              (OpNode (AccumType RegType:$Rd),
+                      (InputType RegType:$Rn),
+                      (InputType RegType:$Rm)))]> {
   let AsmString = !strconcat(asm, "{\t$Rd" # kind1 # ", $Rn" # kind2 # ", $Rm" # kind2 # "}");
   let Inst{13} = b13;
 }
 
+multiclass SIMDThreeSameMult<bit U, bit b13, bits<3> size, string asm, SDPatternOperator OpNode> {
+  def v4f16 : BaseSIMDThreeSameMult<0, U, b13, size, asm, ".2s", ".2h", V64,
+                                         v2f32, v4f16, OpNode>;
+  def v8f16 : BaseSIMDThreeSameMult<1, U, b13, size, asm, ".4s", ".4h", V128,
+                                         v4f32, v8f16, OpNode>;
+}
+
 class BaseSIMDThreeSameVectorDot<bit Q, bit U, string asm, string kind1,
                                  string kind2, RegisterOperand RegType,
                                  ValueType AccumType, ValueType InputType,
@@ -7433,14 +7446,20 @@ class BaseSIMDThreeSameVectorDotIndex<bit Q, bit U, string asm, string dst_kind,
   let Inst{11}    = idx{1};  // H
 }
 
-let Predicates = [HasNEON, HasFP16FML] in
+// ARMv8.2 Fused Multiply Add Long Instructions (Indexed)
 class BaseSIMDThreeSameMultIndex<bit Q, bit U, bits<4> opc, string asm,
                                  string dst_kind, string lhs_kind,
-                                 string rhs_kind> :
-        BaseSIMDIndexedTied<Q, U, 0, 0b10, opc, V128, V128, V128,
-                            VectorIndexH, asm, "", dst_kind, lhs_kind,
-                            rhs_kind, []> {
-  //idx = H:L:M
+                                 string rhs_kind, RegisterOperand RegType,
+                                 ValueType AccumType, ValueType InputType,
+                                 SDPatternOperator OpNode> :
+        BaseSIMDIndexedTied<Q, U, 0, 0b10, opc, RegType, RegType, V128,
+                            VectorIndexH, asm, "", dst_kind, lhs_kind, rhs_kind,
+          [(set (AccumType RegType:$dst),
+                (AccumType (OpNode (AccumType RegType:$Rd),
+                                   (InputType RegType:$Rn),
+                                   (InputType (AArch64duplane16 (v8f16 V128:$Rm),
+                                                VectorIndexH:$idx)))))]> {
+  // idx = H:L:M
   bits<3> idx;
   let Inst{11} = idx{2}; // H
   let Inst{21} = idx{1}; // L
@@ -7455,6 +7474,13 @@ multiclass SIMDThreeSameVectorDotIndex<bit U, string asm,
                                               v4i32, v16i8, OpNode>;
 }
 
+multiclass SIMDThreeSameMultIndex<bit U, bits<4> opc, string asm, SDPatternOperator OpNode> {
+  def v4f16 : BaseSIMDThreeSameMultIndex<0, U, opc, asm, ".2s", ".2h", ".h", V64,
+                                         v2f32, v4f16, OpNode>;
+  def v8f16 : BaseSIMDThreeSameMultIndex<1, U, opc, asm, ".4s", ".4h", ".h", V128,
+                                         v4f32, v8f16, OpNode>;
+}
+
 multiclass SIMDFPIndexed<bit U, bits<4> opc, string asm,
                          SDPatternOperator OpNode> {
   let Predicates = [HasNEON, HasFullFP16] in {
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index 9e9e1429371..59adec3ce38 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -3463,22 +3463,16 @@ defm SQRDMLSH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10001,"sqrdmlsh",
                                                     int_aarch64_neon_sqsub>;
 
 // FP16FML
-def FMLAL_2S   : BaseSIMDThreeSameMult<0, 0, 1, 0b001, "fmlal", ".2s", ".2h">;
-def FMLSL_2S   : BaseSIMDThreeSameMult<0, 0, 1, 0b101, "fmlsl", ".2s", ".2h">;
-def FMLAL_4S   : BaseSIMDThreeSameMult<1, 0, 1, 0b001, "fmlal", ".4s", ".4h">;
-def FMLSL_4S   : BaseSIMDThreeSameMult<1, 0, 1, 0b101, "fmlsl", ".4s", ".4h">;
-def FMLAL2_2S  : BaseSIMDThreeSameMult<0, 1, 0, 0b001, "fmlal2", ".2s", ".2h">;
-def FMLSL2_2S  : BaseSIMDThreeSameMult<0, 1, 0, 0b101, "fmlsl2", ".2s", ".2h">;
-def FMLAL2_4S  : BaseSIMDThreeSameMult<1, 1, 0, 0b001, "fmlal2", ".4s", ".4h">;
-def FMLSL2_4S  : BaseSIMDThreeSameMult<1, 1, 0, 0b101, "fmlsl2", ".4s", ".4h">;
-def FMLALI_2s  : BaseSIMDThreeSameMultIndex<0, 0, 0b0000, "fmlal", ".2s", ".2h", ".h">;
-def FMLSLI_2s  : BaseSIMDThreeSameMultIndex<0, 0, 0b0100, "fmlsl", ".2s", ".2h", ".h">;
-def FMLALI_4s  : BaseSIMDThreeSameMultIndex<1, 0, 0b0000, "fmlal", ".4s", ".4h", ".h">;
-def FMLSLI_4s  : BaseSIMDThreeSameMultIndex<1, 0, 0b0100, "fmlsl", ".4s", ".4h", ".h">;
-def FMLALI2_2s : BaseSIMDThreeSameMultIndex<0, 1, 0b1000, "fmlal2", ".2s", ".2h", ".h">;
-def FMLSLI2_2s : BaseSIMDThreeSameMultIndex<0, 1, 0b1100, "fmlsl2", ".2s", ".2h", ".h">;
-def FMLALI2_4s : BaseSIMDThreeSameMultIndex<1, 1, 0b1000, "fmlal2", ".4s", ".4h", ".h">;
-def FMLSLI2_4s : BaseSIMDThreeSameMultIndex<1, 1, 0b1100, "fmlsl2", ".4s", ".4h", ".h">;
+let Predicates = [HasNEON, HasFP16FML] in {
+defm FMLAL      : SIMDThreeSameMult<0, 1, 0b001, "fmlal", int_aarch64_neon_fmlal>;
+defm FMLSL      : SIMDThreeSameMult<0, 1, 0b101, "fmlsl", int_aarch64_neon_fmlsl>;
+defm FMLAL2     : SIMDThreeSameMult<1, 0, 0b001, "fmlal2", int_aarch64_neon_fmlal2>;
+defm FMLSL2     : SIMDThreeSameMult<1, 0, 0b101, "fmlsl2", int_aarch64_neon_fmlsl2>;
+defm FMLALlane  : SIMDThreeSameMultIndex<0, 0b0000, "fmlal", int_aarch64_neon_fmlal>;
+defm FMLSLlane  : SIMDThreeSameMultIndex<0, 0b0100, "fmlsl", int_aarch64_neon_fmlsl>;
+defm FMLAL2lane : SIMDThreeSameMultIndex<1, 0b1000, "fmlal2", int_aarch64_neon_fmlal2>;
+defm FMLSL2lane : SIMDThreeSameMultIndex<1, 0b1100, "fmlsl2", int_aarch64_neon_fmlsl2>;
+}
 
 defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>;
 defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic",
diff --git a/test/CodeGen/AArch64/neon-fp16fml.ll b/test/CodeGen/AArch64/neon-fp16fml.ll
new file mode 100644
index 00000000000..dcae645ea54
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-fp16fml.ll
@@ -0,0 +1,74 @@
+; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+fp16fml < %s | FileCheck %s
+
+declare <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float>, <4 x half>, <4 x half>)
+declare <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float>, <4 x half>, <4 x half>)
+declare <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float>, <4 x half>, <4 x half>)
+declare <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float>, <4 x half>, <4 x half>)
+declare <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float>, <8 x half>, <8 x half>)
+declare <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float>, <8 x half>, <8 x half>)
+declare <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float>, <8 x half>, <8 x half>)
+declare <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float>, <8 x half>, <8 x half>)
+
+define <2 x float> @test_vfmlal_low_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c) #0 {
+entry:
+; CHECK-LABEL: test_vfmlal_low_u32:
+; CHECK: fmlal   v0.2s, v1.2h, v2.2h
+  %vfmlal_low2.i = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> %c) #2
+  ret <2 x float> %vfmlal_low2.i
+}
+
+define <2 x float> @test_vfmlsl_low_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c) #0 {
+entry:
+; CHECK-LABEL: test_vfmlsl_low_u32:
+; CHECK: fmlsl   v0.2s, v1.2h, v2.2h
+  %vfmlsl_low2.i = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> %c) #2
+  ret <2 x float> %vfmlsl_low2.i
+}
+
+define <2 x float> @test_vfmlal_high_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c) #0 {
+entry:
+; CHECK-LABEL: test_vfmlal_high_u32:
+; CHECK: fmlal2   v0.2s, v1.2h, v2.2h
+  %vfmlal_high2.i = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> %c) #2
+  ret <2 x float> %vfmlal_high2.i
+}
+
+define <2 x float> @test_vfmlsl_high_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c) #0 {
+entry:
+; CHECK-LABEL: test_vfmlsl_high_u32:
+; CHECK: fmlsl2   v0.2s, v1.2h, v2.2h
+  %vfmlsl_high2.i = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> %c) #2
+  ret <2 x float> %vfmlsl_high2.i
+}
+
+define <4 x float> @test_vfmlalq_low_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c) #0 {
+entry:
+; CHECK-LABEL: test_vfmlalq_low_u32:
+; CHECK: fmlal   v0.4s, v1.4h, v2.4h
+  %vfmlalq_low4.i = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> %c) #2
+  ret <4 x float> %vfmlalq_low4.i
+}
+
+define <4 x float> @test_vfmlslq_low_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c) #0 {
+entry:
+; CHECK-LABEL: test_vfmlslq_low_u32:
+; CHECK: fmlsl   v0.4s, v1.4h, v2.4h
+  %vfmlslq_low4.i = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> %c) #2
+  ret <4 x float> %vfmlslq_low4.i
+}
+
+define <4 x float> @test_vfmlalq_high_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c) #0 {
+entry:
+; CHECK-LABEL: test_vfmlalq_high_u32:
+; CHECK: fmlal2   v0.4s, v1.4h, v2.4h
+  %vfmlalq_high4.i = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> %c) #2
+  ret <4 x float> %vfmlalq_high4.i
+}
+
+define <4 x float> @test_vfmlslq_high_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c) #0 {
+entry:
+; CHECK-LABEL: test_vfmlslq_high_u32:
+; CHECK: fmlsl2   v0.4s, v1.4h, v2.4h
+  %vfmlslq_high4.i = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> %c) #2
+  ret <4 x float> %vfmlslq_high4.i
+}
-- 
GitLab


From b829dc3549918925ee32765b7258b4aad0b5b0d0 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulsson@linux.vnet.ibm.com>
Date: Thu, 25 Oct 2018 23:39:07 +0000
Subject: [PATCH 0605/1116] Fix in MachineOperand::printIRValueReference().

Handle the case where getCurrentFunction() returns nullptr by passing -1 to
printIRSlotNumber(). This will result in <badref> being printed instead of an
assertion failure.

Review: Francis Visoiu Mistrih
https://reviews.llvm.org/D53333

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345342 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/MachineOperand.cpp     |  3 ++-
 test/CodeGen/SystemZ/isel-debug.ll | 19 +++++++++++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)
 create mode 100644 test/CodeGen/SystemZ/isel-debug.ll

diff --git a/lib/CodeGen/MachineOperand.cpp b/lib/CodeGen/MachineOperand.cpp
index a116d8fe877..97d32a5d66a 100644
--- a/lib/CodeGen/MachineOperand.cpp
+++ b/lib/CodeGen/MachineOperand.cpp
@@ -461,7 +461,8 @@ static void printIRValueReference(raw_ostream &OS, const Value &V,
     printLLVMNameWithoutPrefix(OS, V.getName());
     return;
   }
-  MachineOperand::printIRSlotNumber(OS, MST.getLocalSlot(&V));
+  int Slot = MST.getCurrentFunction() ? MST.getLocalSlot(&V) : -1;
+  MachineOperand::printIRSlotNumber(OS, Slot);
 }
 
 static void printSyncScope(raw_ostream &OS, const LLVMContext &Context,
diff --git a/test/CodeGen/SystemZ/isel-debug.ll b/test/CodeGen/SystemZ/isel-debug.ll
new file mode 100644
index 00000000000..cf5370cbec6
--- /dev/null
+++ b/test/CodeGen/SystemZ/isel-debug.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -debug-only=systemz-isel -o - 2>&1 | \
+; RUN:   FileCheck %s
+
+; REQUIRES: asserts
+;
+; Check that some debug output is printed without problems.
+; CHECK: SystemZAddressingMode
+; CHECK: Base
+; CHECK: Index
+; CHECK: Disp
+
+define void @fun(i64* %ptr) {
+entry:
+  %0 = bitcast i64* %ptr to i32**
+  %1 = load i32*, i32** %0, align 8
+  %xpv_pv = getelementptr inbounds i32, i32* %1
+  store i32 0, i32* %xpv_pv
+  ret void
+}
-- 
GitLab


From 81e90c1e1e811f54319b56510b8bbf535f8d4ead Mon Sep 17 00:00:00 2001
From: Heejin Ahn <aheejin@gmail.com>
Date: Thu, 25 Oct 2018 23:45:48 +0000
Subject: [PATCH 0606/1116] [WebAssembly] Support EH instructions in
 InstPrinter

Summary: This adds support for exception handling instructions to InstPrinter.

Reviewers: dschuff, aardappel

Subscribers: sbc100, jgravelle-google, sunfish, llvm-commits

Differential Revision: https://reviews.llvm.org/D53634

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345343 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstPrinter/WebAssemblyInstPrinter.cpp    | 66 +++++++++++--
 .../InstPrinter/WebAssemblyInstPrinter.h      |  9 +-
 test/CodeGen/WebAssembly/annotations.mir      | 94 +++++++++++++++++++
 3 files changed, 160 insertions(+), 9 deletions(-)
 create mode 100644 test/CodeGen/WebAssembly/annotations.mir

diff --git a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
index 33b224adc6e..6b97e14364f 100644
--- a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
+++ b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
@@ -35,7 +35,7 @@ using namespace llvm;
 WebAssemblyInstPrinter::WebAssemblyInstPrinter(const MCAsmInfo &MAI,
                                                const MCInstrInfo &MII,
                                                const MCRegisterInfo &MRI)
-    : MCInstPrinter(MAI, MII, MRI), ControlFlowCounter(0) {}
+    : MCInstPrinter(MAI, MII, MRI) {}
 
 void WebAssemblyInstPrinter::printRegName(raw_ostream &OS,
                                           unsigned RegNo) const {
@@ -70,30 +70,65 @@ void WebAssemblyInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
   if (CommentStream) {
     // Observe any effects on the control flow stack, for use in annotating
     // control flow label references.
-    switch (MI->getOpcode()) {
+    unsigned Opc = MI->getOpcode();
+    switch (Opc) {
     default:
       break;
+
     case WebAssembly::LOOP:
-    case WebAssembly::LOOP_S: {
+    case WebAssembly::LOOP_S:
       printAnnotation(OS, "label" + utostr(ControlFlowCounter) + ':');
       ControlFlowStack.push_back(std::make_pair(ControlFlowCounter++, true));
       break;
-    }
+
     case WebAssembly::BLOCK:
     case WebAssembly::BLOCK_S:
       ControlFlowStack.push_back(std::make_pair(ControlFlowCounter++, false));
       break;
+
+    case WebAssembly::TRY:
+    case WebAssembly::TRY_S:
+      ControlFlowStack.push_back(std::make_pair(ControlFlowCounter++, false));
+      EHPadStack.push_back(EHPadStackCounter++);
+      LastSeenEHInst = TRY;
+      break;
+
     case WebAssembly::END_LOOP:
     case WebAssembly::END_LOOP_S:
       assert(!ControlFlowStack.empty() && "End marker mismatch!");
       ControlFlowStack.pop_back();
       break;
+
     case WebAssembly::END_BLOCK:
     case WebAssembly::END_BLOCK_S:
       assert(!ControlFlowStack.empty() && "End marker mismatch!");
       printAnnotation(
           OS, "label" + utostr(ControlFlowStack.pop_back_val().first) + ':');
       break;
+
+    case WebAssembly::END_TRY:
+    case WebAssembly::END_TRY_S:
+      assert(!ControlFlowStack.empty() && "End marker mismatch!");
+      printAnnotation(
+          OS, "label" + utostr(ControlFlowStack.pop_back_val().first) + ':');
+      LastSeenEHInst = END_TRY;
+      break;
+
+    case WebAssembly::CATCH_I32:
+    case WebAssembly::CATCH_I32_S:
+    case WebAssembly::CATCH_I64:
+    case WebAssembly::CATCH_I64_S:
+    case WebAssembly::CATCH_ALL:
+    case WebAssembly::CATCH_ALL_S:
+      assert(LastSeenEHInst != END_TRY);
+      // There can be multiple catch instructions for one try instruction, so we
+      // only print 'catch' label when the last seen EH instruction was 'try'.
+      if (LastSeenEHInst == TRY) {
+        assert(!EHPadStack.empty() && "try-catch mismatch!");
+        printAnnotation(OS, "catch" + utostr(EHPadStack.pop_back_val()) + ':');
+      }
+      LastSeenEHInst = CATCH;
+      break;
     }
 
     // Annotate any control flow label references.
@@ -108,9 +143,26 @@ void WebAssemblyInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
       uint64_t Depth = MI->getOperand(i).getImm();
       if (!Printed.insert(Depth).second)
         continue;
-      const auto &Pair = ControlFlowStack.rbegin()[Depth];
-      printAnnotation(OS, utostr(Depth) + ": " + (Pair.second ? "up" : "down") +
-                              " to label" + utostr(Pair.first));
+
+      if (Opc == WebAssembly::RETHROW || Opc == WebAssembly::RETHROW_S) {
+        assert(Depth <= EHPadStack.size() && "Invalid depth argument!");
+        if (Depth == EHPadStack.size()) {
+          // This can happen when rethrow instruction breaks out of all nests
+          // and throws up to the current function's caller.
+          printAnnotation(OS, utostr(Depth) + ": " + "to caller");
+        } else {
+          uint64_t CatchNo = EHPadStack.rbegin()[Depth];
+          printAnnotation(OS, utostr(Depth) + ": " + "down to catch" +
+                                  utostr(CatchNo));
+        }
+
+      } else {
+        assert(Depth < ControlFlowStack.size() && "Invalid depth argument!");
+        const auto &Pair = ControlFlowStack.rbegin()[Depth];
+        printAnnotation(OS, utostr(Depth) + ": " +
+                                (Pair.second ? "up" : "down") + " to label" +
+                                utostr(Pair.first));
+      }
     }
   }
 }
diff --git a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h
index 18023328b38..ded64f9a6e9 100644
--- a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h
+++ b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h
@@ -25,8 +25,13 @@ namespace llvm {
 class MCSubtargetInfo;
 
 class WebAssemblyInstPrinter final : public MCInstPrinter {
-  uint64_t ControlFlowCounter;
-  SmallVector<std::pair<uint64_t, bool>, 0> ControlFlowStack;
+  uint64_t ControlFlowCounter = 0;
+  uint64_t EHPadStackCounter = 0;
+  SmallVector<std::pair<uint64_t, bool>, 4> ControlFlowStack;
+  SmallVector<uint64_t, 4> EHPadStack;
+
+  enum EHInstKind { TRY, CATCH, END_TRY };
+  EHInstKind LastSeenEHInst = END_TRY;
 
 public:
   WebAssemblyInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
diff --git a/test/CodeGen/WebAssembly/annotations.mir b/test/CodeGen/WebAssembly/annotations.mir
new file mode 100644
index 00000000000..1ae2db82484
--- /dev/null
+++ b/test/CodeGen/WebAssembly/annotations.mir
@@ -0,0 +1,94 @@
+# RUN: llc -mtriple=wasm32-unknown-unknown -start-after xray-instrumentation -wasm-keep-registers %s -o - | FileCheck %s
+
+---
+# Tests if block/loop/try/catch/end instructions are correctly printed with
+# their annotations.
+
+# CHECK: test0:
+# CHECK:   block
+# CHECK:   try
+# CHECK:   br        0               # 0: down to label1
+# CHECK:   catch_all                 # catch0:
+# CHECK:   block
+# CHECK:   br_if     0, 1            # 0: down to label2
+# CHECK:   loop                      # label3:
+# CHECK:   br_if     0, 1            # 0: up to label3
+# CHECK:   end_loop
+# CHECK:   end_block                 # label2:
+# CHECK:   try
+# CHECK:   rethrow   0               # 0: down to catch1
+# CHECK:   catch_all                 # catch1:
+# CHECK:   block
+# CHECK:   try
+# CHECK:   br        0               # 0: down to label6
+# CHECK:   catch_all                 # catch2:
+# CHECK:   unreachable
+# CHECK:   end_try                   # label6:
+# CHECK:   end_block                 # label5:
+# CHECK:   rethrow   0               # 0: to caller
+# CHECK:   end_try                   # label4:
+# CHECK:   end_try                   # label1:
+# CHECK:   end_block                 # label0:
+
+name: test0
+liveins:
+  - { reg: '$arguments', reg: '$value_stack' }
+body: |
+  bb.0:
+    successors: %bb.7, %bb.1
+    BLOCK 64, implicit-def $value_stack, implicit $value_stack
+    TRY 64, implicit-def $value_stack, implicit $value_stack
+    BR 0, implicit-def $arguments
+
+  bb.1 (landing-pad):
+  ; predecessors: %bb.0
+    successors: %bb.2, %bb.3
+
+    CATCH_ALL implicit-def $arguments
+    BLOCK 64, implicit-def $value_stack, implicit $value_stack
+    BR_IF 0, 1, implicit-def $arguments, implicit-def $value_stack, implicit $value_stack
+
+  bb.2:
+  ; predecessors: %bb.1, %bb.2
+    successors: %bb.2, %bb.3
+
+    LOOP 64, implicit-def $value_stack, implicit $value_stack
+    BR_IF 0, 1, implicit-def $arguments
+
+  bb.3:
+  ; predecessors: %bb.1, %bb.2
+    successors: %bb.4
+
+    END_LOOP implicit-def $value_stack, implicit $value_stack
+    END_BLOCK implicit-def $value_stack, implicit $value_stack
+    TRY 64, implicit-def $value_stack, implicit $value_stack
+    RETHROW 0, implicit-def $arguments
+
+  bb.4 (landing-pad):
+  ; predecessors: %bb.3
+    successors: %bb.6, %bb.5
+
+    CATCH_ALL implicit-def $arguments
+    BLOCK 64, implicit-def $value_stack, implicit $value_stack
+    TRY 64, implicit-def $value_stack, implicit $value_stack
+    BR 0, implicit-def $arguments
+
+  bb.5 (landing-pad):
+  ; predecessors: %bb.4
+    CATCH_ALL implicit-def $arguments
+    UNREACHABLE implicit-def dead $arguments
+
+  bb.6:
+  ; predecessors: %bb.4
+    END_TRY implicit-def $value_stack, implicit $value_stack
+    END_BLOCK implicit-def $value_stack, implicit $value_stack
+    RETHROW 0, implicit-def $arguments
+
+  bb.7:
+  ; predecessors: %bb.0
+    END_TRY implicit-def $value_stack, implicit $value_stack
+    END_TRY implicit-def $value_stack, implicit $value_stack
+    END_BLOCK implicit-def $value_stack, implicit $value_stack
+    FALLTHROUGH_RETURN_VOID implicit-def dead $arguments
+    END_FUNCTION implicit-def $value_stack, implicit $value_stack
+...
-- 
GitLab


From 397841e1d34c168b09747b1823e11dc44af0293e Mon Sep 17 00:00:00 2001
From: Heejin Ahn <aheejin@gmail.com>
Date: Thu, 25 Oct 2018 23:55:10 +0000
Subject: [PATCH 0607/1116] Reland "[WebAssembly] LSDA info generation"

Summary:
This adds support for LSDA (exception table) generation for wasm EH.
Wasm EH mostly follows the structure of Itanium-style exception tables,
with one exception: a call site table entry in wasm EH corresponds to
not a call site but a landing pad.

In wasm EH, the VM is responsible for stack unwinding. After an
exception occurs and the stack is unwound, the control flow is
transferred to wasm 'catch' instruction by the VM, after which the
personality function is called from the compiler-generated code. (Refer
to WasmEHPrepare pass for more information on this part.)

This patch:
- Changes wasm.landingpad.index intrinsic to take a token argument, to
make this 1:1 match with a catchpad instruction
- Stores landingpad index info and catch type info MachineFunction in
before instruction selection
- Lowers wasm.lsda intrinsic to an MCSymbol pointing to the start of an
exception table
- Adds WasmException class with overridden methods for table generation
- Adds support for LSDA section in Wasm object writer

Reviewers: dschuff, sbc100, rnk

Subscribers: mgorny, jgravelle-google, sunfish, llvm-commits

Differential Revision: https://reviews.llvm.org/D52748

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345345 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/MachineFunction.h        |  22 +-
 include/llvm/IR/IntrinsicsWebAssembly.td      |   3 +-
 lib/CodeGen/AsmPrinter/AsmPrinter.cpp         |   3 +-
 lib/CodeGen/AsmPrinter/CMakeLists.txt         |   1 +
 lib/CodeGen/AsmPrinter/EHStreamer.cpp         |  10 +-
 lib/CodeGen/AsmPrinter/EHStreamer.h           |  11 +-
 lib/CodeGen/AsmPrinter/WasmException.cpp      |  81 ++++++
 lib/CodeGen/AsmPrinter/WasmException.h        |  42 +++
 lib/CodeGen/MachineFunction.cpp               |  45 ++--
 .../SelectionDAG/SelectionDAGBuilder.cpp      |  10 +-
 lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp |  83 ++++--
 lib/CodeGen/TargetLoweringObjectFileImpl.cpp  |   4 +
 lib/CodeGen/WasmEHPrepare.cpp                 |   2 +-
 lib/MC/MCObjectFileInfo.cpp                   |   6 +
 lib/MC/WasmObjectWriter.cpp                   |   4 +-
 .../WebAssembly/WebAssemblyISelLowering.cpp   |  15 +-
 .../WebAssembly/WebAssemblyInstrInfo.td       |   2 +
 .../WebAssembly/WebAssemblyMCInstLower.cpp    |   7 +
 test/CodeGen/WebAssembly/eh-lsda.ll           | 239 ++++++++++++++++++
 test/CodeGen/WebAssembly/wasmehprepare.ll     |   6 +-
 20 files changed, 529 insertions(+), 67 deletions(-)
 create mode 100644 lib/CodeGen/AsmPrinter/WasmException.cpp
 create mode 100644 lib/CodeGen/AsmPrinter/WasmException.h
 create mode 100644 test/CodeGen/WebAssembly/eh-lsda.ll

diff --git a/include/llvm/CodeGen/MachineFunction.h b/include/llvm/CodeGen/MachineFunction.h
index 7471b314846..bc81e485a80 100644
--- a/include/llvm/CodeGen/MachineFunction.h
+++ b/include/llvm/CodeGen/MachineFunction.h
@@ -316,6 +316,9 @@ class MachineFunction {
   /// Map a landing pad's EH symbol to the call site indexes.
   DenseMap<MCSymbol*, SmallVector<unsigned, 4>> LPadToCallSiteMap;
 
+  /// Map a landing pad to its index.
+  DenseMap<const MachineBasicBlock *, unsigned> WasmLPadToIndexMap;
+
   /// Map of invoke call site index values to associated begin EH_LABEL.
   DenseMap<MCSymbol*, unsigned> CallSiteMap;
 
@@ -810,7 +813,8 @@ public:
   LandingPadInfo &getOrCreateLandingPadInfo(MachineBasicBlock *LandingPad);
 
   /// Remap landing pad labels and remove any deleted landing pads.
-  void tidyLandingPads(DenseMap<MCSymbol*, uintptr_t> *LPMap = nullptr);
+  void tidyLandingPads(DenseMap<MCSymbol *, uintptr_t> *LPMap = nullptr,
+                       bool TidyIfNoBeginLabels = true);
 
   /// Return a reference to the landing pad info for the current function.
   const std::vector<LandingPadInfo> &getLandingPads() const {
@@ -853,6 +857,22 @@ public:
   /// Map the landing pad's EH symbol to the call site indexes.
   void setCallSiteLandingPad(MCSymbol *Sym, ArrayRef<unsigned> Sites);
 
+  /// Map the landing pad to its index. Used for Wasm exception handling.
+  void setWasmLandingPadIndex(const MachineBasicBlock *LPad, unsigned Index) {
+    WasmLPadToIndexMap[LPad] = Index;
+  }
+
+  /// Returns true if the landing pad has an associate index in wasm EH.
+  bool hasWasmLandingPadIndex(const MachineBasicBlock *LPad) const {
+    return WasmLPadToIndexMap.count(LPad);
+  }
+
+  /// Get the index in wasm EH for a given landing pad.
+  unsigned getWasmLandingPadIndex(const MachineBasicBlock *LPad) const {
+    assert(hasWasmLandingPadIndex(LPad));
+    return WasmLPadToIndexMap.lookup(LPad);
+  }
+
   /// Get the call site indexes for a landing pad EH symbol.
   SmallVectorImpl<unsigned> &getCallSiteLandingPad(MCSymbol *Sym) {
     assert(hasCallSiteLandingPad(Sym) &&
diff --git a/include/llvm/IR/IntrinsicsWebAssembly.td b/include/llvm/IR/IntrinsicsWebAssembly.td
index 897d3525b4c..ff5964c3aab 100644
--- a/include/llvm/IR/IntrinsicsWebAssembly.td
+++ b/include/llvm/IR/IntrinsicsWebAssembly.td
@@ -71,7 +71,8 @@ def int_wasm_catch : Intrinsic<[llvm_ptr_ty], [llvm_i32_ty],
 // WebAssembly EH must maintain the landingpads in the order assigned to them
 // by WasmEHPrepare pass to generate landingpad table in EHStreamer. This is
 // used in order to give them the indices in WasmEHPrepare.
-def int_wasm_landingpad_index: Intrinsic<[], [llvm_i32_ty], [IntrNoMem]>;
+def int_wasm_landingpad_index: Intrinsic<[], [llvm_token_ty, llvm_i32_ty],
+                                         [IntrNoMem]>;
 
 // Returns LSDA address of the current function.
 def int_wasm_lsda : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>;
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 63c5b262edc..526f7ce3083 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -16,6 +16,7 @@
 #include "CodeViewDebug.h"
 #include "DwarfDebug.h"
 #include "DwarfException.h"
+#include "WasmException.h"
 #include "WinCFGuard.h"
 #include "WinException.h"
 #include "llvm/ADT/APFloat.h"
@@ -356,7 +357,7 @@ bool AsmPrinter::doInitialization(Module &M) {
     }
     break;
   case ExceptionHandling::Wasm:
-    // TODO to prevent warning
+    ES = new WasmException(this);
     break;
   }
   if (ES)
diff --git a/lib/CodeGen/AsmPrinter/CMakeLists.txt b/lib/CodeGen/AsmPrinter/CMakeLists.txt
index 6cba4a0d4b8..3fb088ab6f0 100644
--- a/lib/CodeGen/AsmPrinter/CMakeLists.txt
+++ b/lib/CodeGen/AsmPrinter/CMakeLists.txt
@@ -23,6 +23,7 @@ add_llvm_library(LLVMAsmPrinter
   WinCFGuard.cpp
   WinException.cpp
   CodeViewDebug.cpp
+  WasmException.cpp
 
   DEPENDS
   intrinsics_gen
diff --git a/lib/CodeGen/AsmPrinter/EHStreamer.cpp b/lib/CodeGen/AsmPrinter/EHStreamer.cpp
index be04b9a6e8c..7599121de2b 100644
--- a/lib/CodeGen/AsmPrinter/EHStreamer.cpp
+++ b/lib/CodeGen/AsmPrinter/EHStreamer.cpp
@@ -345,7 +345,9 @@ computeCallSiteTable(SmallVectorImpl<CallSiteEntry> &CallSites,
 ///     unwound and handling continues.
 ///  3. Type ID table contains references to all the C++ typeinfo for all
 ///     catches in the function.  This tables is reverse indexed base 1.
-void EHStreamer::emitExceptionTable() {
+///
+/// Returns the starting symbol of an exception table.
+MCSymbol *EHStreamer::emitExceptionTable() {
   const MachineFunction *MF = Asm->MF;
   const std::vector<const GlobalValue *> &TypeInfos = MF->getTypeInfos();
   const std::vector<unsigned> &FilterIds = MF->getFilterIds();
@@ -375,6 +377,7 @@ void EHStreamer::emitExceptionTable() {
   computeCallSiteTable(CallSites, LandingPads, FirstActions);
 
   bool IsSJLJ = Asm->MAI->getExceptionHandlingType() == ExceptionHandling::SjLj;
+  bool IsWasm = Asm->MAI->getExceptionHandlingType() == ExceptionHandling::Wasm;
   unsigned CallSiteEncoding =
       IsSJLJ ? dwarf::DW_EH_PE_udata4 : dwarf::DW_EH_PE_uleb128;
   bool HaveTTData = !TypeInfos.empty() || !FilterIds.empty();
@@ -457,8 +460,8 @@ void EHStreamer::emitExceptionTable() {
   Asm->EmitLabelDifferenceAsULEB128(CstEndLabel, CstBeginLabel);
   Asm->OutStreamer->EmitLabel(CstBeginLabel);
 
-  // SjLj Exception handling
-  if (IsSJLJ) {
+  // SjLj / Wasm Exception handling
+  if (IsSJLJ || IsWasm) {
     unsigned idx = 0;
     for (SmallVectorImpl<CallSiteEntry>::const_iterator
          I = CallSites.begin(), E = CallSites.end(); I != E; ++I, ++idx) {
@@ -604,6 +607,7 @@ void EHStreamer::emitExceptionTable() {
   }
 
   Asm->EmitAlignment(2);
+  return GCCETSym;
 }
 
 void EHStreamer::emitTypeInfos(unsigned TTypeEncoding, MCSymbol *TTBaseLabel) {
diff --git a/lib/CodeGen/AsmPrinter/EHStreamer.h b/lib/CodeGen/AsmPrinter/EHStreamer.h
index b89421a1e06..e3a6f8e9d58 100644
--- a/lib/CodeGen/AsmPrinter/EHStreamer.h
+++ b/lib/CodeGen/AsmPrinter/EHStreamer.h
@@ -85,9 +85,10 @@ protected:
   /// zero for the landing pad and the action.  Calls marked 'nounwind' have
   /// no entry and must not be contained in the try-range of any entry - they
   /// form gaps in the table.  Entries must be ordered by try-range address.
-  void computeCallSiteTable(SmallVectorImpl<CallSiteEntry> &CallSites,
-                            const SmallVectorImpl<const LandingPadInfo *> &LandingPads,
-                            const SmallVectorImpl<unsigned> &FirstActions);
+  virtual void computeCallSiteTable(
+      SmallVectorImpl<CallSiteEntry> &CallSites,
+      const SmallVectorImpl<const LandingPadInfo *> &LandingPads,
+      const SmallVectorImpl<unsigned> &FirstActions);
 
   /// Emit landing pads and actions.
   ///
@@ -108,7 +109,9 @@ protected:
   ///     found the frame is unwound and handling continues.
   ///  3. Type id table contains references to all the C++ typeinfo for all
   ///     catches in the function.  This tables is reversed indexed base 1.
-  void emitExceptionTable();
+  ///
+  /// Returns the starting symbol of an exception table.
+  MCSymbol *emitExceptionTable();
 
   virtual void emitTypeInfos(unsigned TTypeEncoding, MCSymbol *TTBaseLabel);
 
diff --git a/lib/CodeGen/AsmPrinter/WasmException.cpp b/lib/CodeGen/AsmPrinter/WasmException.cpp
new file mode 100644
index 00000000000..46745d08c9f
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/WasmException.cpp
@@ -0,0 +1,81 @@
+//===-- CodeGen/AsmPrinter/WasmException.cpp - Wasm Exception Impl --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing WebAssembly exception info into asm
+// files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "WasmException.h"
+#include "llvm/MC/MCStreamer.h"
+using namespace llvm;
+
+void WasmException::markFunctionEnd() {
+  // Get rid of any dead landing pads.
+  if (!Asm->MF->getLandingPads().empty()) {
+    auto *NonConstMF = const_cast<MachineFunction *>(Asm->MF);
+    // Wasm does not set BeginLabel and EndLabel information for landing pads,
+    // so we should set the second argument false.
+    NonConstMF->tidyLandingPads(nullptr, /* TidyIfNoBeginLabels */ false);
+  }
+}
+
+void WasmException::endFunction(const MachineFunction *MF) {
+  bool ShouldEmitExceptionTable = false;
+  for (const LandingPadInfo &Info : MF->getLandingPads()) {
+    if (MF->hasWasmLandingPadIndex(Info.LandingPadBlock)) {
+      ShouldEmitExceptionTable = true;
+      break;
+    }
+  }
+  if (!ShouldEmitExceptionTable)
+    return;
+  MCSymbol *LSDALabel = emitExceptionTable();
+  assert(LSDALabel && ".GCC_exception_table has not been emitted!");
+
+  // Wasm requires every data section symbol to have a .size set. So we emit an
+  // end marker and set the size as the difference between the start end the end
+  // marker.
+  MCSymbol *LSDAEndLabel = Asm->createTempSymbol("GCC_except_table_end");
+  Asm->OutStreamer->EmitLabel(LSDAEndLabel);
+  MCContext &OutContext = Asm->OutStreamer->getContext();
+  const MCExpr *SizeExp = MCBinaryExpr::createSub(
+      MCSymbolRefExpr::create(LSDAEndLabel, OutContext),
+      MCSymbolRefExpr::create(LSDALabel, OutContext), OutContext);
+  Asm->OutStreamer->emitELFSize(LSDALabel, SizeExp);
+}
+
+// Compute the call-site table for wasm EH. Even though we use the same function
+// name to share the common routines, a call site entry in the table corresponds
+// to not a call site for possibly-throwing functions but a landing pad. In wasm
+// EH the VM is responsible for stack unwinding. After an exception occurs and
+// the stack is unwound, the control flow is transferred to wasm 'catch'
+// instruction by the VM, after which the personality function is called from
+// the compiler-generated code. Refer to WasmEHPrepare pass for more
+// information.
+void WasmException::computeCallSiteTable(
+    SmallVectorImpl<CallSiteEntry> &CallSites,
+    const SmallVectorImpl<const LandingPadInfo *> &LandingPads,
+    const SmallVectorImpl<unsigned> &FirstActions) {
+  MachineFunction &MF = *Asm->MF;
+  for (unsigned I = 0, N = LandingPads.size(); I < N; ++I) {
+    const LandingPadInfo *Info = LandingPads[I];
+    MachineBasicBlock *LPad = Info->LandingPadBlock;
+    // We don't emit LSDA for single catch (...).
+    if (!MF.hasWasmLandingPadIndex(LPad))
+      continue;
+    // Wasm EH must maintain the EH pads in the order assigned to them by the
+    // WasmEHPrepare pass.
+    unsigned LPadIndex = MF.getWasmLandingPadIndex(LPad);
+    CallSiteEntry Site = {nullptr, nullptr, Info, FirstActions[I]};
+    if (CallSites.size() < LPadIndex + 1)
+      CallSites.resize(LPadIndex + 1);
+    CallSites[LPadIndex] = Site;
+  }
+}
diff --git a/lib/CodeGen/AsmPrinter/WasmException.h b/lib/CodeGen/AsmPrinter/WasmException.h
new file mode 100644
index 00000000000..09a9a25ce8d
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/WasmException.h
@@ -0,0 +1,42 @@
+//===-- WasmException.h - Wasm Exception Framework -------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing WebAssembly exception info into asm
+// files.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_WASMEXCEPTION_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_WASMEXCEPTION_H
+
+#include "EHStreamer.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+
+namespace llvm {
+
+class LLVM_LIBRARY_VISIBILITY WasmException : public EHStreamer {
+public:
+  WasmException(AsmPrinter *A) : EHStreamer(A) {}
+
+  void endModule() override {}
+  void beginFunction(const MachineFunction *MF) override {}
+  virtual void markFunctionEnd() override;
+  void endFunction(const MachineFunction *MF) override;
+
+protected:
+  // Compute the call site table for wasm EH.
+  void computeCallSiteTable(
+      SmallVectorImpl<CallSiteEntry> &CallSites,
+      const SmallVectorImpl<const LandingPadInfo *> &LandingPads,
+      const SmallVectorImpl<unsigned> &FirstActions) override;
+};
+
+} // End of namespace llvm
+
+#endif
diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp
index 431484f078b..9e4963c4bdb 100644
--- a/lib/CodeGen/MachineFunction.cpp
+++ b/lib/CodeGen/MachineFunction.cpp
@@ -661,8 +661,11 @@ MCSymbol *MachineFunction::addLandingPad(MachineBasicBlock *LandingPad) {
       }
     }
 
-  } else if (isa<CatchPadInst>(FirstI)) {
-    // TODO
+  } else if (const auto *CPI = dyn_cast<CatchPadInst>(FirstI)) {
+    for (unsigned I = CPI->getNumArgOperands(); I != 0; --I) {
+      Value *TypeInfo = CPI->getArgOperand(I - 1)->stripPointerCasts();
+      addCatchTypeInfo(LandingPad, dyn_cast<GlobalValue>(TypeInfo));
+    }
 
   } else {
     assert(isa<CleanupPadInst>(FirstI) && "Invalid landingpad!");
@@ -687,7 +690,8 @@ void MachineFunction::addFilterTypeInfo(MachineBasicBlock *LandingPad,
   LP.TypeIds.push_back(getFilterIDFor(IdsInFilter));
 }
 
-void MachineFunction::tidyLandingPads(DenseMap<MCSymbol*, uintptr_t> *LPMap) {
+void MachineFunction::tidyLandingPads(DenseMap<MCSymbol *, uintptr_t> *LPMap,
+                                      bool TidyIfNoBeginLabels) {
   for (unsigned i = 0; i != LandingPads.size(); ) {
     LandingPadInfo &LandingPad = LandingPads[i];
     if (LandingPad.LandingPadLabel &&
@@ -702,24 +706,25 @@ void MachineFunction::tidyLandingPads(DenseMap<MCSymbol*, uintptr_t> *LPMap) {
       continue;
     }
 
-    for (unsigned j = 0, e = LandingPads[i].BeginLabels.size(); j != e; ++j) {
-      MCSymbol *BeginLabel = LandingPad.BeginLabels[j];
-      MCSymbol *EndLabel = LandingPad.EndLabels[j];
-      if ((BeginLabel->isDefined() ||
-           (LPMap && (*LPMap)[BeginLabel] != 0)) &&
-          (EndLabel->isDefined() ||
-           (LPMap && (*LPMap)[EndLabel] != 0))) continue;
-
-      LandingPad.BeginLabels.erase(LandingPad.BeginLabels.begin() + j);
-      LandingPad.EndLabels.erase(LandingPad.EndLabels.begin() + j);
-      --j;
-      --e;
-    }
+    if (TidyIfNoBeginLabels) {
+      for (unsigned j = 0, e = LandingPads[i].BeginLabels.size(); j != e; ++j) {
+        MCSymbol *BeginLabel = LandingPad.BeginLabels[j];
+        MCSymbol *EndLabel = LandingPad.EndLabels[j];
+        if ((BeginLabel->isDefined() || (LPMap && (*LPMap)[BeginLabel] != 0)) &&
+            (EndLabel->isDefined() || (LPMap && (*LPMap)[EndLabel] != 0)))
+          continue;
+
+        LandingPad.BeginLabels.erase(LandingPad.BeginLabels.begin() + j);
+        LandingPad.EndLabels.erase(LandingPad.EndLabels.begin() + j);
+        --j;
+        --e;
+      }
 
-    // Remove landing pads with no try-ranges.
-    if (LandingPads[i].BeginLabels.empty()) {
-      LandingPads.erase(LandingPads.begin() + i);
-      continue;
+      // Remove landing pads with no try-ranges.
+      if (LandingPads[i].BeginLabels.empty()) {
+        LandingPads.erase(LandingPads.begin() + i);
+        continue;
+      }
     }
 
     // If there is no landing pad, ensure that the list of typeids is empty.
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 05eac30843f..3434f24db91 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6294,12 +6294,12 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     return nullptr;
   }
 
-  case Intrinsic::wasm_landingpad_index: {
-    // TODO store landing pad index in a map, which will be used when generating
-    // LSDA information
+  case Intrinsic::wasm_landingpad_index:
+    // Information this intrinsic contained has been transferred to
+    // MachineFunction in SelectionDAGISel::PrepareEHLandingPad. We can safely
+    // delete it now.
     return nullptr;
   }
-  }
 }
 
 void SelectionDAGBuilder::visitConstrainedFPIntrinsic(
@@ -6456,7 +6456,7 @@ SelectionDAGBuilder::lowerInvokable(TargetLowering::CallLoweringInfo &CLI,
       WinEHFuncInfo *EHInfo = DAG.getMachineFunction().getWinEHFuncInfo();
       EHInfo->addIPToStateRange(cast<InvokeInst>(CLI.CS.getInstruction()),
                                 BeginLabel, EndLabel);
-    } else {
+    } else if (!isScopedEHPersonality(Pers)) {
       MF.addInvoke(FuncInfo.MBBMap[EHPadBB], BeginLabel, EndLabel);
     }
   }
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 2b4a590f19f..90bcaa653c3 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -27,6 +27,7 @@
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/BranchProbabilityInfo.h"
 #include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/EHPersonalities.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
@@ -1128,6 +1129,36 @@ static bool hasExceptionPointerOrCodeUser(const CatchPadInst *CPI) {
   return false;
 }
 
+// wasm.landingpad.index intrinsic is for associating a landing pad index number
+// with a catchpad instruction. Retrieve the landing pad index in the intrinsic
+// and store the mapping in the function.
+static void mapWasmLandingPadIndex(MachineBasicBlock *MBB,
+                                   const CatchPadInst *CPI) {
+  MachineFunction *MF = MBB->getParent();
+  // In case of single catch (...), we don't emit LSDA, so we don't need
+  // this information.
+  bool IsSingleCatchAllClause =
+      CPI->getNumArgOperands() == 1 &&
+      cast<Constant>(CPI->getArgOperand(0))->isNullValue();
+  if (!IsSingleCatchAllClause) {
+    // Create a mapping from landing pad label to landing pad index.
+    bool IntrFound = false;
+    for (const User *U : CPI->users()) {
+      if (const auto *Call = dyn_cast<IntrinsicInst>(U)) {
+        Intrinsic::ID IID = Call->getIntrinsicID();
+        if (IID == Intrinsic::wasm_landingpad_index) {
+          Value *IndexArg = Call->getArgOperand(1);
+          int Index = cast<ConstantInt>(IndexArg)->getZExtValue();
+          MF->setWasmLandingPadIndex(MBB, Index);
+          IntrFound = true;
+          break;
+        }
+      }
+    }
+    assert(IntrFound && "wasm.landingpad.index intrinsic not found!");
+  }
+}
+
 /// PrepareEHLandingPad - Emit an EH_LABEL, set up live-in registers, and
 /// do other setup for EH landing-pad blocks.
 bool SelectionDAGISel::PrepareEHLandingPad() {
@@ -1137,44 +1168,48 @@ bool SelectionDAGISel::PrepareEHLandingPad() {
   const TargetRegisterClass *PtrRC =
       TLI->getRegClassFor(TLI->getPointerTy(CurDAG->getDataLayout()));
 
+  auto Pers = classifyEHPersonality(PersonalityFn);
+
   // Catchpads have one live-in register, which typically holds the exception
   // pointer or code.
-  if (const auto *CPI = dyn_cast<CatchPadInst>(LLVMBB->getFirstNonPHI())) {
-    if (hasExceptionPointerOrCodeUser(CPI)) {
-      // Get or create the virtual register to hold the pointer or code.  Mark
-      // the live in physreg and copy into the vreg.
-      MCPhysReg EHPhysReg = TLI->getExceptionPointerRegister(PersonalityFn);
-      assert(EHPhysReg && "target lacks exception pointer register");
-      MBB->addLiveIn(EHPhysReg);
-      unsigned VReg = FuncInfo->getCatchPadExceptionPointerVReg(CPI, PtrRC);
-      BuildMI(*MBB, FuncInfo->InsertPt, SDB->getCurDebugLoc(),
-              TII->get(TargetOpcode::COPY), VReg)
-          .addReg(EHPhysReg, RegState::Kill);
+  if (isFuncletEHPersonality(Pers)) {
+    if (const auto *CPI = dyn_cast<CatchPadInst>(LLVMBB->getFirstNonPHI())) {
+      if (hasExceptionPointerOrCodeUser(CPI)) {
+        // Get or create the virtual register to hold the pointer or code.  Mark
+        // the live in physreg and copy into the vreg.
+        MCPhysReg EHPhysReg = TLI->getExceptionPointerRegister(PersonalityFn);
+        assert(EHPhysReg && "target lacks exception pointer register");
+        MBB->addLiveIn(EHPhysReg);
+        unsigned VReg = FuncInfo->getCatchPadExceptionPointerVReg(CPI, PtrRC);
+        BuildMI(*MBB, FuncInfo->InsertPt, SDB->getCurDebugLoc(),
+                TII->get(TargetOpcode::COPY), VReg)
+            .addReg(EHPhysReg, RegState::Kill);
+      }
     }
     return true;
   }
 
-  if (!LLVMBB->isLandingPad())
-    return true;
-
   // Add a label to mark the beginning of the landing pad.  Deletion of the
   // landing pad can thus be detected via the MachineModuleInfo.
   MCSymbol *Label = MF->addLandingPad(MBB);
 
-  // Assign the call site to the landing pad's begin label.
-  MF->setCallSiteLandingPad(Label, SDB->LPadToCallSiteMap[MBB]);
-
   const MCInstrDesc &II = TII->get(TargetOpcode::EH_LABEL);
   BuildMI(*MBB, FuncInfo->InsertPt, SDB->getCurDebugLoc(), II)
     .addSym(Label);
 
-  // Mark exception register as live in.
-  if (unsigned Reg = TLI->getExceptionPointerRegister(PersonalityFn))
-    FuncInfo->ExceptionPointerVirtReg = MBB->addLiveIn(Reg, PtrRC);
-
-  // Mark exception selector register as live in.
-  if (unsigned Reg = TLI->getExceptionSelectorRegister(PersonalityFn))
-    FuncInfo->ExceptionSelectorVirtReg = MBB->addLiveIn(Reg, PtrRC);
+  if (Pers == EHPersonality::Wasm_CXX) {
+    if (const auto *CPI = dyn_cast<CatchPadInst>(LLVMBB->getFirstNonPHI()))
+      mapWasmLandingPadIndex(MBB, CPI);
+  } else {
+    // Assign the call site to the landing pad's begin label.
+    MF->setCallSiteLandingPad(Label, SDB->LPadToCallSiteMap[MBB]);
+    // Mark exception register as live in.
+    if (unsigned Reg = TLI->getExceptionPointerRegister(PersonalityFn))
+      FuncInfo->ExceptionPointerVirtReg = MBB->addLiveIn(Reg, PtrRC);
+    // Mark exception selector register as live in.
+    if (unsigned Reg = TLI->getExceptionSelectorRegister(PersonalityFn))
+      FuncInfo->ExceptionSelectorVirtReg = MBB->addLiveIn(Reg, PtrRC);
+  }
 
   return true;
 }
diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index b046cd81d6c..341ab927861 100644
--- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -1748,6 +1748,10 @@ const MCExpr *TargetLoweringObjectFileWasm::lowerRelativeReference(
 void TargetLoweringObjectFileWasm::InitializeWasm() {
   StaticCtorSection =
       getContext().getWasmSection(".init_array", SectionKind::getData());
+
+  // We don't use PersonalityEncoding and LSDAEncoding because we don't emit
+  // .cfi directives. We use TTypeEncoding to encode typeinfo global variables.
+  TTypeEncoding = dwarf::DW_EH_PE_absptr;
 }
 
 MCSection *TargetLoweringObjectFileWasm::getStaticCtorSection(
diff --git a/lib/CodeGen/WasmEHPrepare.cpp b/lib/CodeGen/WasmEHPrepare.cpp
index 83d04da5dd0..6f02a05f561 100644
--- a/lib/CodeGen/WasmEHPrepare.cpp
+++ b/lib/CodeGen/WasmEHPrepare.cpp
@@ -300,7 +300,7 @@ void WasmEHPrepare::prepareEHPad(BasicBlock *BB, unsigned Index) {
   // This is to create a map of <landingpad EH label, landingpad index> in
   // SelectionDAGISel, which is to be used in EHStreamer to emit LSDA tables.
   // Pseudocode: wasm.landingpad.index(Index);
-  IRB.CreateCall(LPadIndexF, IRB.getInt32(Index));
+  IRB.CreateCall(LPadIndexF, {FPI, IRB.getInt32(Index)});
 
   // Pseudocode: __wasm_lpad_context.lpad_index = index;
   IRB.CreateStore(IRB.getInt32(Index), LPadIndexField);
diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp
index 465b62ce8c9..54d8d4e5f71 100644
--- a/lib/MC/MCObjectFileInfo.cpp
+++ b/lib/MC/MCObjectFileInfo.cpp
@@ -746,6 +746,12 @@ void MCObjectFileInfo::initWasmMCObjectFileInfo(const Triple &T) {
   DwarfPubNamesSection = Ctx->getWasmSection(".debug_pubnames", SectionKind::getMetadata());
   DwarfPubTypesSection = Ctx->getWasmSection(".debug_pubtypes", SectionKind::getMetadata());
 
+  // Wasm use data section for LSDA.
+  // TODO Consider putting each function's exception table in a separate
+  // section, as in -function-sections, to facilitate lld's --gc-section.
+  LSDASection = Ctx->getWasmSection(".rodata.gcc_except_table",
+                                    SectionKind::getReadOnlyWithRel());
+
   // TODO: Define more sections.
 }
 
diff --git a/lib/MC/WasmObjectWriter.cpp b/lib/MC/WasmObjectWriter.cpp
index cbbe161ae82..f9318ad5801 100644
--- a/lib/MC/WasmObjectWriter.cpp
+++ b/lib/MC/WasmObjectWriter.cpp
@@ -635,10 +635,12 @@ static void addData(SmallVectorImpl<char> &DataBytes,
         llvm_unreachable("The fill should be an assembler constant");
       DataBytes.insert(DataBytes.end(), Fill->getValueSize() * NumValues,
                        Fill->getValue());
+    } else if (auto *LEB = dyn_cast<MCLEBFragment>(&Frag)) {
+      const SmallVectorImpl<char> &Contents = LEB->getContents();
+      DataBytes.insert(DataBytes.end(), Contents.begin(), Contents.end());
     } else {
       const auto &DataFrag = cast<MCDataFragment>(Frag);
       const SmallVectorImpl<char> &Contents = DataFrag.getContents();
-
       DataBytes.insert(DataBytes.end(), Contents.begin(), Contents.end());
     }
   }
diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index c056e1af588..06414c27318 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -21,6 +21,7 @@
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/IR/DiagnosticInfo.h"
@@ -998,9 +999,17 @@ WebAssemblyTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   default:
     return {}; // Don't custom lower most intrinsics.
 
-  case Intrinsic::wasm_lsda:
-    // TODO For now, just return 0 not to crash
-    return DAG.getConstant(0, DL, Op.getValueType());
+  case Intrinsic::wasm_lsda: {
+    MachineFunction &MF = DAG.getMachineFunction();
+    EVT VT = Op.getValueType();
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+    auto &Context = MF.getMMI().getContext();
+    MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
+                                            Twine(MF.getFunctionNumber()));
+    return DAG.getNode(WebAssemblyISD::Wrapper, DL, VT,
+                       DAG.getMCSymbol(S, PtrVT));
+  }
   }
 }
 
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
index c5b41983245..8fff924265f 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -282,6 +282,8 @@ def : Pat<(i32 (WebAssemblywrapper tglobaladdr:$addr)),
           (CONST_I32 tglobaladdr:$addr)>;
 def : Pat<(i32 (WebAssemblywrapper texternalsym:$addr)),
           (CONST_I32 texternalsym:$addr)>;
+def : Pat<(i32 (WebAssemblywrapper mcsym:$sym)), (CONST_I32 mcsym:$sym)>;
+def : Pat<(i64 (WebAssemblywrapper mcsym:$sym)), (CONST_I64 mcsym:$sym)>;
 
 //===----------------------------------------------------------------------===//
 // Additional sets of instructions.
diff --git a/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
index ebd374762ae..1dad7b8a289 100644
--- a/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
@@ -230,6 +230,13 @@ void WebAssemblyMCInstLower::Lower(const MachineInstr *MI,
           (MO.getTargetFlags() & WebAssemblyII::MO_SYMBOL_FUNCTION) != 0,
           (MO.getTargetFlags() & WebAssemblyII::MO_SYMBOL_GLOBAL) != 0);
       break;
+    case MachineOperand::MO_MCSymbol:
+      // This is currently used only for LSDA symbols (GCC_except_table),
+      // because global addresses or other external symbols are handled above.
+      assert(MO.getTargetFlags() == 0 &&
+             "WebAssembly does not use target flags on MCSymbol");
+      MCOp = LowerSymbolOperand(MO.getMCSymbol(), /*Offset=*/0, false, false);
+      break;
     }
 
     OutMI.addOperand(MCOp);
diff --git a/test/CodeGen/WebAssembly/eh-lsda.ll b/test/CodeGen/WebAssembly/eh-lsda.ll
new file mode 100644
index 00000000000..fd550938c42
--- /dev/null
+++ b/test/CodeGen/WebAssembly/eh-lsda.ll
@@ -0,0 +1,239 @@
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-keep-registers -exception-model=wasm -mattr=+exception-handling | FileCheck -allow-deprecated-dag-overlap %s
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+@_ZTIi = external constant i8*
+@_ZTIf = external constant i8*
+@_ZTId = external constant i8*
+
+; Single catch (...) does not need an exception table.
+;
+; try {
+;   may_throw();
+; } catch (...) {
+; }
+; CHECK-LABEL: test0:
+; CHECK-NOT: GCC_except_table
+define void @test0() personality i8* bitcast (i32 (...)* @__gxx_wasm_personality_v0 to i8*) {
+entry:
+  invoke void @may_throw()
+          to label %try.cont unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %entry
+  %0 = catchswitch within none [label %catch.start] unwind to caller
+
+catch.start:                                      ; preds = %catch.dispatch
+  %1 = catchpad within %0 [i8* null]
+  %2 = call i8* @llvm.wasm.get.exception(token %1)
+  %3 = call i32 @llvm.wasm.get.ehselector(token %1)
+  %4 = call i8* @__cxa_begin_catch(i8* %2) [ "funclet"(token %1) ]
+  call void @__cxa_end_catch() [ "funclet"(token %1) ]
+  catchret from %1 to label %try.cont
+
+try.cont:                                         ; preds = %entry, %catch.start
+  ret void
+}
+
+; Exception table generation + shared action test.
+;
+; try {
+;   may_throw();
+; } catch (int) {
+; } catch (float) {
+; } catch (double) {
+; } catch (...) {
+; }
+;
+; try {
+;   may_throw();
+; } catch (double) {
+; } catch (...) {
+; }
+;
+; try {
+;   may_throw();
+; } catch (int) {
+; } catch (float) {
+; }
+;
+; There are three landing pads. The second landing pad should share action table
+; entries with the first landing pad because they end with the same sequence
+; (double -> ...). But the third landing table cannot share action table entries
+; with others, so it should create its own entries.
+; CHECK-LABEL: test1:
+; CHECK: .section  .rodata.gcc_except_table,"",@
+; CHECK-NEXT:   .p2align  2
+; CHECK-NEXT: GCC_except_table[[START:[0-9]+]]:
+; CHECK-NEXT: .Lexception0:
+; CHECK-NEXT:   .int8  255                     # @LPStart Encoding = omit
+; CHECK-NEXT:   .int8  0                       # @TType Encoding = absptr
+; CHECK-NEXT:   .uleb128 .Lttbase0-.Lttbaseref0
+; CHECK-NEXT: .Lttbaseref0:
+; CHECK-NEXT:   .int8  1                       # Call site Encoding = uleb128
+; CHECK-NEXT:   .uleb128 .Lcst_end0-.Lcst_begin0
+; CHECK-NEXT: .Lcst_begin0:
+; CHECK-NEXT:   .int8  0                       # >> Call Site 0 <<
+; CHECK-NEXT:                                  #   On exception at call site 0
+; CHECK-NEXT:   .int8  7                       #   Action: 4
+; CHECK-NEXT:   .int8  1                       # >> Call Site 1 <<
+; CHECK-NEXT:                                  #   On exception at call site 1
+; CHECK-NEXT:   .int8  3                       #   Action: 2
+; CHECK-NEXT:   .int8  2                       # >> Call Site 2 <<
+; CHECK-NEXT:                                  #   On exception at call site 2
+; CHECK-NEXT:   .int8  11                      #   Action: 6
+; CHECK-NEXT: .Lcst_end0:
+; CHECK-NEXT:   .int8  1                       # >> Action Record 1 <<
+; CHECK-NEXT:                                  #   Catch TypeInfo 1
+; CHECK-NEXT:   .int8  0                       #   No further actions
+; CHECK-NEXT:   .int8  2                       # >> Action Record 2 <<
+; CHECK-NEXT:                                  #   Catch TypeInfo 2
+; CHECK-NEXT:   .int8  125                     #   Continue to action 1
+; CHECK-NEXT:   .int8  3                       # >> Action Record 3 <<
+; CHECK-NEXT:                                  #   Catch TypeInfo 3
+; CHECK-NEXT:   .int8  125                     #   Continue to action 2
+; CHECK-NEXT:   .int8  4                       # >> Action Record 4 <<
+; CHECK-NEXT:                                  #   Catch TypeInfo 4
+; CHECK-NEXT:   .int8  125                     #   Continue to action 3
+; CHECK-NEXT:   .int8  3                       # >> Action Record 5 <<
+; CHECK-NEXT:                                  #   Catch TypeInfo 3
+; CHECK-NEXT:   .int8  0                       #   No further actions
+; CHECK-NEXT:   .int8  4                       # >> Action Record 6 <<
+; CHECK-NEXT:                                  #   Catch TypeInfo 4
+; CHECK-NEXT:   .int8  125                     #   Continue to action 5
+; CHECK-NEXT:   .p2align  2
+; CHECK-NEXT:                                  # >> Catch TypeInfos <<
+; CHECK-NEXT:   .int32  _ZTIi                  # TypeInfo 4
+; CHECK-NEXT:   .int32  _ZTIf                  # TypeInfo 3
+; CHECK-NEXT:   .int32  _ZTId                  # TypeInfo 2
+; CHECK-NEXT:   .int32  0                      # TypeInfo 1
+; CHECK-NEXT: .Lttbase0:
+; CHECK-NEXT:   .p2align  2
+; CHECK-NEXT: .LGCC_except_table_end[[END:[0-9]+]]:
+; CHECK-NEXT:   .size  GCC_except_table[[START]], .LGCC_except_table_end[[END]]-GCC_except_table[[START]]
+define void @test1() personality i8* bitcast (i32 (...)* @__gxx_wasm_personality_v0 to i8*) {
+entry:
+  invoke void @may_throw()
+          to label %try.cont unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %entry
+  %0 = catchswitch within none [label %catch.start] unwind to caller
+
+catch.start:                                      ; preds = %catch.dispatch
+  %1 = catchpad within %0 [i8* bitcast (i8** @_ZTIi to i8*), i8* bitcast (i8** @_ZTIf to i8*), i8* bitcast (i8** @_ZTId to i8*), i8* null]
+  %2 = call i8* @llvm.wasm.get.exception(token %1)
+  %3 = call i32 @llvm.wasm.get.ehselector(token %1)
+  %4 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*))
+  %matches = icmp eq i32 %3, %4
+  br i1 %matches, label %catch10, label %catch.fallthrough
+
+catch10:                                          ; preds = %catch.start
+  %5 = call i8* @__cxa_begin_catch(i8* %2) [ "funclet"(token %1) ]
+  %6 = bitcast i8* %5 to i32*
+  %7 = load i32, i32* %6, align 4
+  call void @__cxa_end_catch() [ "funclet"(token %1) ]
+  catchret from %1 to label %try.cont
+
+try.cont:                                         ; preds = %entry, %catch, %catch4, %catch7, %catch10
+  invoke void @may_throw()
+          to label %try.cont23 unwind label %catch.dispatch14
+
+catch.dispatch14:                                 ; preds = %try.cont
+  %8 = catchswitch within none [label %catch.start15] unwind to caller
+
+catch.start15:                                    ; preds = %catch.dispatch14
+  %9 = catchpad within %8 [i8* bitcast (i8** @_ZTId to i8*), i8* null]
+  %10 = call i8* @llvm.wasm.get.exception(token %9)
+  %11 = call i32 @llvm.wasm.get.ehselector(token %9)
+  %12 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTId to i8*))
+  %matches16 = icmp eq i32 %11, %12
+  %13 = call i8* @__cxa_begin_catch(i8* %10) [ "funclet"(token %9) ]
+  br i1 %matches16, label %catch20, label %catch17
+
+catch20:                                          ; preds = %catch.start15
+  %14 = bitcast i8* %13 to double*
+  %15 = load double, double* %14, align 8
+  call void @__cxa_end_catch() [ "funclet"(token %9) ]
+  catchret from %9 to label %try.cont23
+
+try.cont23:                                       ; preds = %try.cont, %catch17, %catch20
+  invoke void @may_throw()
+          to label %try.cont36 unwind label %catch.dispatch25
+
+catch.dispatch25:                                 ; preds = %try.cont23
+  %16 = catchswitch within none [label %catch.start26] unwind to caller
+
+catch.start26:                                    ; preds = %catch.dispatch25
+  %17 = catchpad within %16 [i8* bitcast (i8** @_ZTIi to i8*), i8* bitcast (i8** @_ZTIf to i8*)]
+  %18 = call i8* @llvm.wasm.get.exception(token %17)
+  %19 = call i32 @llvm.wasm.get.ehselector(token %17)
+  %20 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*))
+  %matches27 = icmp eq i32 %19, %20
+  br i1 %matches27, label %catch33, label %catch.fallthrough28
+
+catch33:                                          ; preds = %catch.start26
+  %21 = call i8* @__cxa_begin_catch(i8* %18) [ "funclet"(token %17) ]
+  %22 = bitcast i8* %21 to i32*
+  %23 = load i32, i32* %22, align 4
+  call void @__cxa_end_catch() [ "funclet"(token %17) ]
+  catchret from %17 to label %try.cont36
+
+catch.fallthrough28:                              ; preds = %catch.start26
+  %24 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIf to i8*))
+  %matches29 = icmp eq i32 %19, %24
+  br i1 %matches29, label %catch30, label %rethrow
+
+catch30:                                          ; preds = %catch.fallthrough28
+  %25 = call i8* @__cxa_begin_catch(i8* %18) [ "funclet"(token %17) ]
+  %26 = bitcast i8* %25 to float*
+  %27 = load float, float* %26, align 4
+  call void @__cxa_end_catch() [ "funclet"(token %17) ]
+  catchret from %17 to label %try.cont36
+
+rethrow:                                          ; preds = %catch.fallthrough28
+  call void @__cxa_rethrow() [ "funclet"(token %17) ]
+  unreachable
+
+try.cont36:                                       ; preds = %try.cont23, %catch30, %catch33
+  ret void
+
+catch17:                                          ; preds = %catch.start15
+  call void @__cxa_end_catch() [ "funclet"(token %9) ]
+  catchret from %9 to label %try.cont23
+
+catch.fallthrough:                                ; preds = %catch.start
+  %28 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIf to i8*))
+  %matches1 = icmp eq i32 %3, %28
+  br i1 %matches1, label %catch7, label %catch.fallthrough2
+
+catch7:                                           ; preds = %catch.fallthrough
+  %29 = call i8* @__cxa_begin_catch(i8* %2) [ "funclet"(token %1) ]
+  %30 = bitcast i8* %29 to float*
+  %31 = load float, float* %30, align 4
+  call void @__cxa_end_catch() [ "funclet"(token %1) ]
+  catchret from %1 to label %try.cont
+
+catch.fallthrough2:                               ; preds = %catch.fallthrough
+  %32 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTId to i8*))
+  %matches3 = icmp eq i32 %3, %32
+  %33 = call i8* @__cxa_begin_catch(i8* %2) [ "funclet"(token %1) ]
+  br i1 %matches3, label %catch4, label %catch
+
+catch4:                                           ; preds = %catch.fallthrough2
+  %34 = bitcast i8* %33 to double*
+  %35 = load double, double* %34, align 8
+  call void @__cxa_end_catch() [ "funclet"(token %1) ]
+  catchret from %1 to label %try.cont
+
+catch:                                            ; preds = %catch.fallthrough2
+  call void @__cxa_end_catch() [ "funclet"(token %1) ]
+  catchret from %1 to label %try.cont
+}
+
+declare void @may_throw()
+declare i32 @llvm.eh.typeid.for(i8*)
+declare i8* @llvm.wasm.get.exception(token)
+declare i32 @llvm.wasm.get.ehselector(token)
+declare void @__cxa_rethrow()
+declare i8* @__cxa_begin_catch(i8*)
+declare void @__cxa_end_catch()
+declare i32 @__gxx_wasm_personality_v0(...)
diff --git a/test/CodeGen/WebAssembly/wasmehprepare.ll b/test/CodeGen/WebAssembly/wasmehprepare.ll
index e6005e34057..67e198eb058 100644
--- a/test/CodeGen/WebAssembly/wasmehprepare.ll
+++ b/test/CodeGen/WebAssembly/wasmehprepare.ll
@@ -30,7 +30,7 @@ catch.start:                                      ; preds = %catch.dispatch
 ; CHECK: catch.start:
 ; CHECK-NEXT:   %[[CATCHPAD:.*]] = catchpad
 ; CHECK-NEXT:   %[[EXN:.*]] = call i8* @llvm.wasm.catch(i32 0)
-; CHECK-NEXT:   call void @llvm.wasm.landingpad.index(i32 0)
+; CHECK-NEXT:   call void @llvm.wasm.landingpad.index(token %[[CATCHPAD]], i32 0)
 ; CHECK-NEXT:   store i32 0, i32* getelementptr inbounds ({ i32, i8*, i32 }, { i32, i8*, i32 }* @__wasm_lpad_context, i32 0, i32 0)
 ; CHECK-NEXT:   %[[LSDA:.*]] = call i8* @llvm.wasm.lsda()
 ; CHECK-NEXT:   store i8* %[[LSDA]], i8** getelementptr inbounds ({ i32, i8*, i32 }, { i32, i8*, i32 }* @__wasm_lpad_context, i32 0, i32 1)
@@ -98,7 +98,7 @@ catch.start3:                                     ; preds = %catch.dispatch2
   %matches = icmp eq i32 %8, %9
   br i1 %matches, label %catch4, label %rethrow
 ; CHECK: catch.start3:
-; CHECK:   call void @llvm.wasm.landingpad.index(i32 0)
+; CHECK:   call void @llvm.wasm.landingpad.index(token %{{.+}}, i32 0)
 
 catch4:                                           ; preds = %catch.start3
   %10 = call i8* @__cxa_begin_catch(i8* %7) [ "funclet"(token %6) ]
@@ -311,7 +311,7 @@ declare void @__cxa_rethrow()
 declare void @__clang_call_terminate(i8*)
 
 ; CHECK-DAG: declare i8* @llvm.wasm.catch(i32)
-; CHECK-DAG: declare void @llvm.wasm.landingpad.index(i32)
+; CHECK-DAG: declare void @llvm.wasm.landingpad.index(token, i32)
 ; CHECK-DAG: declare i8* @llvm.wasm.lsda()
 ; CHECK-DAG: declare i32 @_Unwind_CallPersonality(i8*)
 
-- 
GitLab


From 9067288adc53f2e9f654e30022acce85459787f6 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulsson@linux.vnet.ibm.com>
Date: Fri, 26 Oct 2018 00:02:33 +0000
Subject: [PATCH 0608/1116] [SystemZ]  Pass the DAG pointer from
 SystemZAddressingMode::dump().

In order to print the IR slot number for the memory operand, the DAG pointer
must be passed to SDNode::dump().

The isel-debug.ll test updated to also check for the IR Value reference being
printed correctly.

Review: Ulrich Weigand
https://reviews.llvm.org/D53333

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345347 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/SystemZ/SystemZISelDAGToDAG.cpp | 8 ++++----
 test/CodeGen/SystemZ/isel-debug.ll         | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
index 5edfdf645e5..c8474b15b18 100644
--- a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
+++ b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -71,19 +71,19 @@ struct SystemZAddressingMode {
   // True if the address can (and must) include ADJDYNALLOC.
   bool isDynAlloc() { return Form == FormBDXDynAlloc; }
 
-  void dump() {
+  void dump(const llvm::SelectionDAG *DAG) {
     errs() << "SystemZAddressingMode " << this << '\n';
 
     errs() << " Base ";
     if (Base.getNode())
-      Base.getNode()->dump();
+      Base.getNode()->dump(DAG);
     else
       errs() << "null\n";
 
     if (hasIndexField()) {
       errs() << " Index ";
       if (Index.getNode())
-        Index.getNode()->dump();
+        Index.getNode()->dump(DAG);
       else
         errs() << "null\n";
     }
@@ -589,7 +589,7 @@ bool SystemZDAGToDAGISel::selectAddress(SDValue Addr,
   if (AM.isDynAlloc() && !AM.IncludesDynAlloc)
     return false;
 
-  LLVM_DEBUG(AM.dump());
+  LLVM_DEBUG(AM.dump(CurDAG));
   return true;
 }
 
diff --git a/test/CodeGen/SystemZ/isel-debug.ll b/test/CodeGen/SystemZ/isel-debug.ll
index cf5370cbec6..0e48210e9b6 100644
--- a/test/CodeGen/SystemZ/isel-debug.ll
+++ b/test/CodeGen/SystemZ/isel-debug.ll
@@ -5,7 +5,7 @@
 ;
 ; Check that some debug output is printed without problems.
 ; CHECK: SystemZAddressingMode
-; CHECK: Base
+; CHECK: Base t5: i64,ch = load<(load 8 from %ir.0)>
 ; CHECK: Index
 ; CHECK: Disp
 
-- 
GitLab


From f3aabb0947b171f4b0958d39d76adf713ea2ca0a Mon Sep 17 00:00:00 2001
From: Zachary Turner <zturner@google.com>
Date: Fri, 26 Oct 2018 00:17:31 +0000
Subject: [PATCH 0609/1116] Dump public symbol records in pdb2yaml mode

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345348 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-pdbutil/PdbYaml.cpp         |  6 ++++
 tools/llvm-pdbutil/PdbYaml.h           |  6 ++++
 tools/llvm-pdbutil/YAMLOutputStyle.cpp | 42 ++++++++++++++++++++++++++
 tools/llvm-pdbutil/YAMLOutputStyle.h   |  1 +
 tools/llvm-pdbutil/llvm-pdbutil.cpp    |  5 +++
 tools/llvm-pdbutil/llvm-pdbutil.h      |  1 +
 6 files changed, 61 insertions(+)

diff --git a/tools/llvm-pdbutil/PdbYaml.cpp b/tools/llvm-pdbutil/PdbYaml.cpp
index eb39708a27e..3ea33360831 100644
--- a/tools/llvm-pdbutil/PdbYaml.cpp
+++ b/tools/llvm-pdbutil/PdbYaml.cpp
@@ -110,6 +110,7 @@ void MappingTraits<PdbObject>::mapping(IO &IO, PdbObject &Obj) {
   IO.mapOptional("DbiStream", Obj.DbiStream);
   IO.mapOptional("TpiStream", Obj.TpiStream);
   IO.mapOptional("IpiStream", Obj.IpiStream);
+  IO.mapOptional("PublicsStream", Obj.PublicsStream);
 }
 
 void MappingTraits<MSFHeaders>::mapping(IO &IO, MSFHeaders &Obj) {
@@ -163,6 +164,11 @@ void MappingTraits<PdbTpiStream>::mapping(IO &IO,
   IO.mapRequired("Records", Obj.Records);
 }
 
+void MappingTraits<PdbPublicsStream>::mapping(
+    IO &IO, pdb::yaml::PdbPublicsStream &Obj) {
+  IO.mapRequired("Records", Obj.PubSyms);
+}
+
 void MappingTraits<NamedStreamMapping>::mapping(IO &IO,
                                                 NamedStreamMapping &Obj) {
   IO.mapRequired("Name", Obj.StreamName);
diff --git a/tools/llvm-pdbutil/PdbYaml.h b/tools/llvm-pdbutil/PdbYaml.h
index 91e054490a5..97ba87266cc 100644
--- a/tools/llvm-pdbutil/PdbYaml.h
+++ b/tools/llvm-pdbutil/PdbYaml.h
@@ -92,6 +92,10 @@ struct PdbTpiStream {
   std::vector<CodeViewYAML::LeafRecord> Records;
 };
 
+struct PdbPublicsStream {
+  std::vector<CodeViewYAML::SymbolRecord> PubSyms;
+};
+
 struct PdbObject {
   explicit PdbObject(BumpPtrAllocator &Allocator) : Allocator(Allocator) {}
 
@@ -102,6 +106,7 @@ struct PdbObject {
   Optional<PdbDbiStream> DbiStream;
   Optional<PdbTpiStream> TpiStream;
   Optional<PdbTpiStream> IpiStream;
+  Optional<PdbPublicsStream> PublicsStream;
 
   Optional<std::vector<StringRef>> StringTable;
 
@@ -118,6 +123,7 @@ LLVM_YAML_DECLARE_MAPPING_TRAITS(pdb::yaml::StreamBlockList)
 LLVM_YAML_DECLARE_MAPPING_TRAITS(pdb::yaml::PdbInfoStream)
 LLVM_YAML_DECLARE_MAPPING_TRAITS(pdb::yaml::PdbDbiStream)
 LLVM_YAML_DECLARE_MAPPING_TRAITS(pdb::yaml::PdbTpiStream)
+LLVM_YAML_DECLARE_MAPPING_TRAITS(pdb::yaml::PdbPublicsStream)
 LLVM_YAML_DECLARE_MAPPING_TRAITS(pdb::yaml::NamedStreamMapping)
 LLVM_YAML_DECLARE_MAPPING_TRAITS(pdb::yaml::PdbModiStream)
 LLVM_YAML_DECLARE_MAPPING_TRAITS(pdb::yaml::PdbDbiModuleInfo)
diff --git a/tools/llvm-pdbutil/YAMLOutputStyle.cpp b/tools/llvm-pdbutil/YAMLOutputStyle.cpp
index 521e27fc089..62b5c428d41 100644
--- a/tools/llvm-pdbutil/YAMLOutputStyle.cpp
+++ b/tools/llvm-pdbutil/YAMLOutputStyle.cpp
@@ -18,10 +18,13 @@
 #include "llvm/DebugInfo/CodeView/StringsAndChecksums.h"
 #include "llvm/DebugInfo/MSF/MappedBlockStream.h"
 #include "llvm/DebugInfo/PDB/Native/DbiStream.h"
+#include "llvm/DebugInfo/PDB/Native/GlobalsStream.h"
 #include "llvm/DebugInfo/PDB/Native/InfoStream.h"
 #include "llvm/DebugInfo/PDB/Native/ModuleDebugStream.h"
 #include "llvm/DebugInfo/PDB/Native/PDBFile.h"
+#include "llvm/DebugInfo/PDB/Native/PublicsStream.h"
 #include "llvm/DebugInfo/PDB/Native/RawConstants.h"
+#include "llvm/DebugInfo/PDB/Native/SymbolStream.h"
 #include "llvm/DebugInfo/PDB/Native/TpiStream.h"
 
 using namespace llvm;
@@ -68,6 +71,9 @@ Error YAMLOutputStyle::dump() {
   if (auto EC = dumpIpiStream())
     return EC;
 
+  if (auto EC = dumpPublics())
+    return EC;
+
   flush();
   return Error::success();
 }
@@ -326,6 +332,42 @@ Error YAMLOutputStyle::dumpIpiStream() {
   return Error::success();
 }
 
+Error YAMLOutputStyle::dumpPublics() {
+  if (!opts::pdb2yaml::PublicsStream)
+    return Error::success();
+
+  Obj.PublicsStream.emplace();
+  auto ExpectedPublics = File.getPDBPublicsStream();
+  if (!ExpectedPublics) {
+    llvm::consumeError(ExpectedPublics.takeError());
+    return Error::success();
+  }
+
+  PublicsStream &Publics = *ExpectedPublics;
+  const GSIHashTable &PublicsTable = Publics.getPublicsTable();
+
+  auto ExpectedSyms = File.getPDBSymbolStream();
+  if (!ExpectedSyms) {
+    llvm::consumeError(ExpectedSyms.takeError());
+    return Error::success();
+  }
+
+  BinaryStreamRef SymStream =
+      ExpectedSyms->getSymbolArray().getUnderlyingStream();
+  for (uint32_t PubSymOff : PublicsTable) {
+    Expected<CVSymbol> Sym = readSymbolFromStream(SymStream, PubSymOff);
+    if (!Sym)
+      return Sym.takeError();
+    auto ES = CodeViewYAML::SymbolRecord::fromCodeViewSymbol(*Sym);
+    if (!ES)
+      return ES.takeError();
+
+    Obj.PublicsStream->PubSyms.push_back(*ES);
+  }
+
+  return Error::success();
+}
+
 void YAMLOutputStyle::flush() {
   Out << Obj;
   outs().flush();
diff --git a/tools/llvm-pdbutil/YAMLOutputStyle.h b/tools/llvm-pdbutil/YAMLOutputStyle.h
index 3690e3529d4..a5ad3355d2a 100644
--- a/tools/llvm-pdbutil/YAMLOutputStyle.h
+++ b/tools/llvm-pdbutil/YAMLOutputStyle.h
@@ -35,6 +35,7 @@ private:
   Error dumpDbiStream();
   Error dumpTpiStream();
   Error dumpIpiStream();
+  Error dumpPublics();
 
   void flush();
 
diff --git a/tools/llvm-pdbutil/llvm-pdbutil.cpp b/tools/llvm-pdbutil/llvm-pdbutil.cpp
index 34618f6b762..215bfbeb206 100644
--- a/tools/llvm-pdbutil/llvm-pdbutil.cpp
+++ b/tools/llvm-pdbutil/llvm-pdbutil.cpp
@@ -663,6 +663,10 @@ cl::opt<bool> IpiStream("ipi-stream",
                         cl::desc("Dump the IPI Stream (Stream 5)"),
                         cl::sub(PdbToYamlSubcommand), cl::init(false));
 
+cl::opt<bool> PublicsStream("publics-stream",
+                            cl::desc("Dump the Publics Stream"),
+                            cl::sub(PdbToYamlSubcommand), cl::init(false));
+
 // MODULE & FILE OPTIONS
 cl::opt<bool> DumpModules("modules", cl::desc("dump compiland information"),
                           cl::cat(FileOptions), cl::sub(PdbToYamlSubcommand));
@@ -1495,6 +1499,7 @@ int main(int Argc, const char **Argv) {
       opts::pdb2yaml::DbiStream = true;
       opts::pdb2yaml::TpiStream = true;
       opts::pdb2yaml::IpiStream = true;
+      opts::pdb2yaml::PublicsStream = true;
       opts::pdb2yaml::DumpModules = true;
       opts::pdb2yaml::DumpModuleFiles = true;
       opts::pdb2yaml::DumpModuleSyms = true;
diff --git a/tools/llvm-pdbutil/llvm-pdbutil.h b/tools/llvm-pdbutil/llvm-pdbutil.h
index 1584dce52c5..a57cc51d7fd 100644
--- a/tools/llvm-pdbutil/llvm-pdbutil.h
+++ b/tools/llvm-pdbutil/llvm-pdbutil.h
@@ -192,6 +192,7 @@ extern llvm::cl::opt<bool> PdbStream;
 extern llvm::cl::opt<bool> DbiStream;
 extern llvm::cl::opt<bool> TpiStream;
 extern llvm::cl::opt<bool> IpiStream;
+extern llvm::cl::opt<bool> PublicsStream;
 extern llvm::cl::list<std::string> InputFilename;
 extern llvm::cl::opt<bool> DumpModules;
 extern llvm::cl::opt<bool> DumpModuleFiles;
-- 
GitLab


From 23962bb2765b60e0aee02123dcd6faeaab0e0c5c Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulsson@linux.vnet.ibm.com>
Date: Fri, 26 Oct 2018 00:36:00 +0000
Subject: [PATCH 0610/1116] [SystemZ] Implement SystemZOperand::print()

SystemZAsmParser can now handle -debug by printing the operands neatly to the
output stream. Before this patch this lead to an llvm_unreachable().

It seems that now '-mllvm -debug' does not cause any crashes anywhere (at
least not on SPEC).

Review: Ulrich Weigand
https://reviews.llvm.org/D53328

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345349 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../SystemZ/AsmParser/SystemZAsmParser.cpp    | 67 ++++++++++++++-
 test/MC/SystemZ/asm-match.s                   | 81 +++++++++++++++++++
 2 files changed, 147 insertions(+), 1 deletion(-)
 create mode 100644 test/MC/SystemZ/asm-match.s

diff --git a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
index bde067d6c12..6f4f543caad 100644
--- a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
+++ b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
@@ -7,6 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "InstPrinter/SystemZInstPrinter.h"
 #include "MCTargetDesc/SystemZMCTargetDesc.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
@@ -243,6 +244,11 @@ public:
     return Kind == KindImmTLS;
   }
 
+  const ImmTLSOp getImmTLS() const {
+    assert(Kind == KindImmTLS && "Not a TLS immediate");
+    return ImmTLS;
+  }
+
   // Memory operands.
   bool isMem() const override {
     return Kind == KindMem;
@@ -270,6 +276,11 @@ public:
     return isMemDisp12(BDLMem, RegKind) && inRange(Mem.Length.Imm, 1, 0x100);
   }
 
+  const MemOp& getMem() const {
+    assert(Kind == KindMem && "Not a Mem operand");
+    return Mem;
+  }
+
   // Override MCParsedAsmOperand.
   SMLoc getStartLoc() const override { return StartLoc; }
   SMLoc getEndLoc() const override { return EndLoc; }
@@ -623,8 +634,62 @@ static struct InsnMatchEntry InsnMatchTable[] = {
     { MCK_U48Imm, MCK_BDAddr64Disp12, MCK_BDAddr64Disp12, MCK_AnyReg } }
 };
 
+static void printMCExpr(const MCExpr *E, raw_ostream &OS) {
+  if (!E)
+    return;
+  if (auto *CE = dyn_cast<MCConstantExpr>(E))
+    OS << *CE;
+  else if (auto *UE = dyn_cast<MCUnaryExpr>(E))
+    OS << *UE;
+  else if (auto *BE = dyn_cast<MCBinaryExpr>(E))
+    OS << *BE;
+  else if (auto *SRE = dyn_cast<MCSymbolRefExpr>(E))
+    OS << *SRE;
+  else
+    OS << *E;
+}
+
 void SystemZOperand::print(raw_ostream &OS) const {
-  llvm_unreachable("Not implemented");
+  switch (Kind) {
+    break;
+  case KindToken:
+    OS << "Token:" << getToken();
+    break;
+  case KindReg:
+    OS << "Reg:" << SystemZInstPrinter::getRegisterName(getReg());
+    break;
+  case KindImm:
+    OS << "Imm:";
+    printMCExpr(getImm(), OS);
+    break;
+  case KindImmTLS:
+    OS << "ImmTLS:";
+    printMCExpr(getImmTLS().Imm, OS);
+    if (getImmTLS().Sym) {
+      OS << ", ";
+      printMCExpr(getImmTLS().Sym, OS);
+    }
+    break;
+  case KindMem: {
+    const MemOp &Op = getMem();
+    OS << "Mem:" << *cast<MCConstantExpr>(Op.Disp);
+    if (Op.Base) {
+      OS << "(";
+      if (Op.MemKind == BDLMem)
+        OS << *cast<MCConstantExpr>(Op.Length.Imm) << ",";
+      else if (Op.MemKind == BDRMem)
+        OS << SystemZInstPrinter::getRegisterName(Op.Length.Reg) << ",";
+      if (Op.Index)
+        OS << SystemZInstPrinter::getRegisterName(Op.Index) << ",";
+      OS << SystemZInstPrinter::getRegisterName(Op.Base);
+      OS << ")";
+    }
+    break;
+  }
+  default:
+  case KindInvalid:
+    break;
+  }
 }
 
 // Parse one register of the form %<prefix><number>.
diff --git a/test/MC/SystemZ/asm-match.s b/test/MC/SystemZ/asm-match.s
new file mode 100644
index 00000000000..843d3ae6cac
--- /dev/null
+++ b/test/MC/SystemZ/asm-match.s
@@ -0,0 +1,81 @@
+// REQUIRES: asserts
+// RUN: llvm-mc -triple s390x-linux-gnu -debug-only=asm-matcher %s 2>&1 | FileCheck %s
+//
+// Check that debug output prints the operands correctly.
+
+// CHECK: AsmMatcher: found 1 encodings with mnemonic 'sllg'
+// CHECK: Trying to match opcode SLLG
+// CHECK: Matching formal operand class MCK_GR64 against actual operand at index 1 (Reg:r3): match success using generic matcher
+// CHECK: Matching formal operand class MCK_GR64 against actual operand at index 2 (Reg:r0): match success using generic matcher
+// CHECK: Matching formal operand class MCK_BDAddr32Disp20 against actual operand at index 3 (Mem:3): match success using generic matcher
+// CHECK: Matching formal operand class InvalidMatchClass against actual operand at index 4: actual operand index out of range Opcode result: complete match, selecting this opcode
+// CHECK: AsmMatcher: found 1 encodings with mnemonic 'llill'
+// CHECK: Trying to match opcode LLILL
+// CHECK: Matching formal operand class MCK_GR64 against actual operand at index 1 (Reg:r0): match success using generic matcher
+// CHECK: Matching formal operand class MCK_U16Imm against actual operand at index 2 (Imm:0): match success using generic matcher
+// CHECK: Matching formal operand class InvalidMatchClass against actual operand at index 3: actual operand index out of range Opcode result: complete match, selecting this opcode
+// CHECK: AsmMatcher: found 1 encodings with mnemonic 'lgr'
+// CHECK: Trying to match opcode LGR
+// CHECK: Matching formal operand class MCK_GR64 against actual operand at index 1 (Reg:r1): match success using generic matcher
+// CHECK: Matching formal operand class MCK_GR64 against actual operand at index 2 (Reg:r0): match success using generic matcher
+// CHECK: Matching formal operand class InvalidMatchClass against actual operand at index 3: actual operand index out of range Opcode result: complete match, selecting this opcode
+// CHECK: AsmMatcher: found 1 encodings with mnemonic 'lg'
+// CHECK: Trying to match opcode LG
+// CHECK: Matching formal operand class MCK_GR64 against actual operand at index 1 (Reg:r1): match success using generic matcher
+// CHECK: Matching formal operand class MCK_BDXAddr64Disp20 against actual operand at index 2 (Mem:16(r2)): match success using generic matcher
+// CHECK: Matching formal operand class InvalidMatchClass against actual operand at index 3: actual operand index out of range Opcode result: complete match, selecting this opcode
+// CHECK: AsmMatcher: found 1 encodings with mnemonic 'lg'
+// CHECK: Trying to match opcode LG
+// CHECK: Matching formal operand class MCK_GR64 against actual operand at index 1 (Reg:r1): match success using generic matcher
+// CHECK: Matching formal operand class MCK_BDXAddr64Disp20 against actual operand at index 2 (Mem:16(r2,r3)): match success using generic matcher
+// CHECK: Matching formal operand class InvalidMatchClass against actual operand at index 3: actual operand index out of range Opcode result: complete match, selecting this opcode
+// CHECK: AsmMatcher: found 1 encodings with mnemonic 'stmg'
+// CHECK: Trying to match opcode STMG
+// CHECK: Matching formal operand class MCK_GR64 against actual operand at index 1 (Reg:r13): match success using generic matcher
+// CHECK: Matching formal operand class MCK_GR64 against actual operand at index 2 (Reg:r15): match success using generic matcher
+// CHECK: Matching formal operand class MCK_BDAddr64Disp20 against actual operand at index 3 (Mem:104(r15)): match success using generic matcher
+// CHECK: Matching formal operand class InvalidMatchClass against actual operand at index 4: actual operand index out of range Opcode result: complete match, selecting this opcode
+// CHECK: AsmMatcher: found 1 encodings with mnemonic 'mvc'
+// CHECK: Trying to match opcode MVC
+// CHECK: Matching formal operand class MCK_BDLAddr64Disp12Len8 against actual operand at index 1 (Mem:184(8,r15)): match success using generic matcher
+// CHECK: Matching formal operand class MCK_BDAddr64Disp12 against actual operand at index 2 (Mem:8(r2)): match success using generic matcher
+// CHECK: Matching formal operand class InvalidMatchClass against actual operand at index 3: actual operand index out of range Opcode result: complete match, selecting this opcode
+// CHECK: AsmMatcher: found 1 encodings with mnemonic 'mvck'
+// CHECK: Trying to match opcode MVCK
+// CHECK: Matching formal operand class MCK_BDRAddr64Disp12 against actual operand at index 1 (Mem:0(r0,r1)): match success using generic matcher
+// CHECK: Matching formal operand class MCK_BDAddr64Disp12 against actual operand at index 2 (Mem:4095(r15)): match success using generic matcher
+// CHECK: Matching formal operand class MCK_GR64 against actual operand at index 3 (Reg:r2): match success using generic matcher
+// CHECK: Matching formal operand class InvalidMatchClass against actual operand at index 4: actual operand index out of range Opcode result: complete match, selecting this opcode
+// CHECK: AsmMatcher: found 1 encodings with mnemonic 'j'
+// CHECK: Trying to match opcode J
+// CHECK: Matching formal operand class MCK_PCRel16 against actual operand at index 1 (Imm:.Ltmp0+2): match success using generic matcher
+// CHECK: Matching formal operand class InvalidMatchClass against actual operand at index 2: actual operand index out of range Opcode result: complete match, selecting this opcode
+// CHECK: AsmMatcher: found 1 encodings with mnemonic 'brasl'
+// CHECK: Trying to match opcode BRASL
+// CHECK: Matching formal operand class MCK_GR64 against actual operand at index 1 (Reg:r14): match success using generic matcher
+// CHECK: Matching formal operand class MCK_PCRelTLS32 against actual operand at index 2 (ImmTLS:fun): match success using generic matcher
+// CHECK: Matching formal operand class InvalidMatchClass against actual operand at index 3: actual operand index out of range Opcode result: complete match, selecting this opcode
+// CHECK: .text
+// CHECK: sllg	%r3, %r0, 3
+// CHECK: llill	%r0, 0
+// CHECK: lgr	%r1, %r0
+// CHECK: lg	%r1, 16(%r2)
+// CHECK: lg	%r1, 16(%r2,%r3)
+// CHECK: stmg	%r13, %r15, 104(%r15)
+// CHECK: mvc	184(8,%r15), 8(%r2)
+// CHECK: mvck	0(%r0,%r1), 4095(%r15), %r2
+// CHECK: .Ltmp0:
+// CHECK: j	.Ltmp0+2
+// CHECK: brasl	%r14, fun
+	
+        sllg    %r3, %r0, 3
+        llill	%r0, 0
+        lgr	%r1, %r0
+        lg      %r1, 16(%r2)
+        lg      %r1, 16(%r2,%r3)
+        stmg    %r13, %r15, 104(%r15)
+        mvc     184(8,%r15), 8(%r2)
+        mvck    0(%r0,%r1), 4095(%r15), %r2
+.Ltmp0:
+        j	.Ltmp0+2
+        brasl   %r14, fun
-- 
GitLab


From 8b17cc87ef70db6c823318e35cb5a72f25f1e2aa Mon Sep 17 00:00:00 2001
From: Chijun Sima <simachijun@gmail.com>
Date: Fri, 26 Oct 2018 01:28:36 +0000
Subject: [PATCH 0611/1116] Teach the DominatorTree fallback to recalculation
 when applying updates to speedup JT (PR37929)

Summary:
This patch makes the dominatortree recalculate when applying updates with the size of the update vector larger than a threshold. Directly applying updates is usually slower than recalculating the whole domtree in this case. This patch fixes an issue which causes JT running slowly on some inputs.

In bug 37929, the dominator tree is trying to apply 19,000+ updates several times, which takes several minutes.

After this patch, the time used by DT.applyUpdates:

| Input | Before (s) | After (s) | Speedup |
| the 2nd Reproducer in 37929 | 297 | 0.15 | 1980x |
| clang-5.0.0.0.bc | 9.7 | 4.3 | 2.26x |
| clang-5.0.0.4.bc | 11.6 | 2.6 | 4.46x |

Reviewers: kuhar, brzycki, trentxintong, davide, dmgreen, grosser

Reviewed By: kuhar, brzycki

Subscribers: kristina, llvm-commits

Differential Revision: https://reviews.llvm.org/D53245

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345353 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Support/GenericDomTreeConstruction.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/include/llvm/Support/GenericDomTreeConstruction.h b/include/llvm/Support/GenericDomTreeConstruction.h
index 344484b285c..971e8305a11 100644
--- a/include/llvm/Support/GenericDomTreeConstruction.h
+++ b/include/llvm/Support/GenericDomTreeConstruction.h
@@ -1191,6 +1191,20 @@ struct SemiNCAInfo {
     });
     LLVM_DEBUG(dbgs() << "\n");
 
+    // Recalculate the DominatorTree when the number of updates
+    // exceeds a threshold, which usually makes direct updating slower than
+    // recalculation. We select this threshold proportional to the
+    // size of the DominatorTree. The constant is selected
+    // by choosing the one with an acceptable performance on some real-world
+    // inputs.
+
+    // Make unittests of the incremental algorithm work
+    if (DT.DomTreeNodes.size() <= 100) {
+      if (NumLegalized > DT.DomTreeNodes.size())
+        CalculateFromScratch(DT, &BUI);
+    } else if (NumLegalized > DT.DomTreeNodes.size() / 40)
+      CalculateFromScratch(DT, &BUI);
+
     // If the DominatorTree was recalculated at some point, stop the batch
     // updates. Full recalculations ignore batch updates and look at the actual
     // CFG.
-- 
GitLab


From 24e43efe87f3fc6cd46a2ae4ffaa1eb2d55d4129 Mon Sep 17 00:00:00 2001
From: Li Jia He <hljhehlj@cn.ibm.com>
Date: Fri, 26 Oct 2018 01:58:23 +0000
Subject: [PATCH 0612/1116] add myself to the CREDITS.TXT

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345355 91177308-0d34-0410-b5e6-96231b3b80d8
---
 CREDITS.TXT | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CREDITS.TXT b/CREDITS.TXT
index 7108051d67a..79ce040cfa2 100644
--- a/CREDITS.TXT
+++ b/CREDITS.TXT
@@ -502,3 +502,7 @@ D: Advanced SIMD (NEON) support in the ARM backend.
 N: QingShan Zhang
 E: qshanz@cn.ibm.com
 D: PowerPC Backend Developer
+
+N: Li Jia He
+E: hljhehlj@cn.ibm.com
+D: PowerPC Backend Developer
-- 
GitLab


From 7a69d009b10ef16fc5fb34f4b0026e07fa8a6930 Mon Sep 17 00:00:00 2001
From: Vlad Tsyrklevich <vlad@tsyrklevich.net>
Date: Fri, 26 Oct 2018 02:00:14 +0000
Subject: [PATCH 0613/1116] Revert "[AArch64] Create proper memoperand for
 multi-vector stores"

This reverts commit r345315, it was causing test failures on
sanitizer-x86_64-linux-fast.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345356 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64ISelLowering.cpp    |   2 +-
 .../AArch64/multi-vector-store-size.ll        | 164 ------------------
 2 files changed, 1 insertion(+), 165 deletions(-)
 delete mode 100644 test/CodeGen/AArch64/multi-vector-store-size.ll

diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2a42d2db75d..a7a1b0a5feb 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7972,7 +7972,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.opc = ISD::INTRINSIC_VOID;
     // Conservatively set memVT to the entire set of vectors stored.
     unsigned NumElts = 0;
-    for (unsigned ArgI = 0, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
+    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
       Type *ArgTy = I.getArgOperand(ArgI)->getType();
       if (!ArgTy->isVectorTy())
         break;
diff --git a/test/CodeGen/AArch64/multi-vector-store-size.ll b/test/CodeGen/AArch64/multi-vector-store-size.ll
deleted file mode 100644
index 9627556168a..00000000000
--- a/test/CodeGen/AArch64/multi-vector-store-size.ll
+++ /dev/null
@@ -1,164 +0,0 @@
-; RUN: llc -mtriple=aarch64-linux-gnu -stop-after=isel < %s | FileCheck %s
-
-declare void @llvm.aarch64.neon.st2.v4f32.p0f32(<4 x float>, <4 x float>, float*)
-declare void @llvm.aarch64.neon.st3.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, float*)
-declare void @llvm.aarch64.neon.st4.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, float*)
-
-declare void @llvm.aarch64.neon.st1x2.v4f32.p0f32(<4 x float>, <4 x float>, float*)
-declare void @llvm.aarch64.neon.st1x3.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, float*)
-declare void @llvm.aarch64.neon.st1x4.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, float*)
-
-declare void @llvm.aarch64.neon.st2lane.v4f32.p0f32(<4 x float>, <4 x float>, i64, float*)
-declare void @llvm.aarch64.neon.st3lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, i64, float*)
-declare void @llvm.aarch64.neon.st4lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, i64, float*)
-
-define void @addstx(float* %res, <4 x float>* %a,  <4 x float>* %b, <4 x float>* %c, <4 x float>* %d) {
-  %al = load <4 x float>, <4 x float>* %a
-  %bl = load <4 x float>, <4 x float>* %b
-  %cl = load <4 x float>, <4 x float>* %c
-  %dl = load <4 x float>, <4 x float>* %d
-
-  %ar = fadd <4 x float> %al, %bl
-  %br = fadd <4 x float> %bl, %cl
-  %cr = fadd <4 x float> %cl, %dl
-  %dr = fadd <4 x float> %dl, %al
-
-; The sizes below are conservative.  AArch64TargetLowering
-; conservatively assumes the entire vector is stored.
-  tail call void @llvm.aarch64.neon.st2.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, float* %res)
-; CHECK: ST2Twov4s {{.*}} :: (store 32 {{.*}})
-  tail call void @llvm.aarch64.neon.st3.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, float* %res)
-; CHECK: ST3Threev4s {{.*}} :: (store 48 {{.*}})
-  tail call void @llvm.aarch64.neon.st4.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, <4 x float> %dr, float* %res)
-; CHECK: ST4Fourv4s {{.*}} :: (store 64 {{.*}})
-
-  ret void
-}
-
-define void @addst1x(float* %res, <4 x float>* %a,  <4 x float>* %b, <4 x float>* %c, <4 x float>* %d) {
-  %al = load <4 x float>, <4 x float>* %a
-  %bl = load <4 x float>, <4 x float>* %b
-  %cl = load <4 x float>, <4 x float>* %c
-  %dl = load <4 x float>, <4 x float>* %d
-
-  %ar = fadd <4 x float> %al, %bl
-  %br = fadd <4 x float> %bl, %cl
-  %cr = fadd <4 x float> %cl, %dl
-  %dr = fadd <4 x float> %dl, %al
-
-; The sizes below are conservative.  AArch64TargetLowering
-; conservatively assumes the entire vector is stored.
-  tail call void @llvm.aarch64.neon.st1x2.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, float* %res)
-; CHECK: ST1Twov4s {{.*}} :: (store 32 {{.*}})
-  tail call void @llvm.aarch64.neon.st1x3.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, float* %res)
-; CHECK: ST1Threev4s {{.*}} :: (store 48 {{.*}})
-  tail call void @llvm.aarch64.neon.st1x4.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, <4 x float> %dr, float* %res)
-; CHECK: ST1Fourv4s {{.*}} :: (store 64 {{.*}})
-
-  ret void
-}
-
-define void @addstxlane(float* %res, <4 x float>* %a,  <4 x float>* %b, <4 x float>* %c, <4 x float>* %d) {
-  %al = load <4 x float>, <4 x float>* %a
-  %bl = load <4 x float>, <4 x float>* %b
-  %cl = load <4 x float>, <4 x float>* %c
-  %dl = load <4 x float>, <4 x float>* %d
-
-  %ar = fadd <4 x float> %al, %bl
-  %br = fadd <4 x float> %bl, %cl
-  %cr = fadd <4 x float> %cl, %dl
-  %dr = fadd <4 x float> %dl, %al
-
-; The sizes below are conservative.  AArch64TargetLowering
-; conservatively assumes the entire vector is stored.
-  tail call void @llvm.aarch64.neon.st2lane.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, i64 1, float* %res)
-; CHECK: ST2i32 {{.*}} :: (store 32 {{.*}})
-  tail call void @llvm.aarch64.neon.st3lane.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, i64 1, float* %res)
-; CHECK: ST3i32 {{.*}} :: (store 48 {{.*}})
-  tail call void @llvm.aarch64.neon.st4lane.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, <4 x float> %dr, i64 1, float* %res)
-; CHECK: ST4i32 {{.*}} :: (store 64 {{.*}})
-
-  ret void
-}
-; RUN: llc -mtriple=aarch64-linux-gnu -stop-after=isel < %s | FileCheck %s
-
-declare void @llvm.aarch64.neon.st2.v4f32.p0f32(<4 x float>, <4 x float>, float*)
-declare void @llvm.aarch64.neon.st3.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, float*)
-declare void @llvm.aarch64.neon.st4.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, float*)
-
-declare void @llvm.aarch64.neon.st1x2.v4f32.p0f32(<4 x float>, <4 x float>, float*)
-declare void @llvm.aarch64.neon.st1x3.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, float*)
-declare void @llvm.aarch64.neon.st1x4.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, float*)
-
-declare void @llvm.aarch64.neon.st2lane.v4f32.p0f32(<4 x float>, <4 x float>, i64, float*)
-declare void @llvm.aarch64.neon.st3lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, i64, float*)
-declare void @llvm.aarch64.neon.st4lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, i64, float*)
-
-define void @addstx(float* %res, <4 x float>* %a,  <4 x float>* %b, <4 x float>* %c, <4 x float>* %d) {
-  %al = load <4 x float>, <4 x float>* %a
-  %bl = load <4 x float>, <4 x float>* %b
-  %cl = load <4 x float>, <4 x float>* %c
-  %dl = load <4 x float>, <4 x float>* %d
-
-  %ar = fadd <4 x float> %al, %bl
-  %br = fadd <4 x float> %bl, %cl
-  %cr = fadd <4 x float> %cl, %dl
-  %dr = fadd <4 x float> %dl, %al
-
-; The sizes below are conservative.  AArch64TargetLowering
-; conservatively assumes the entiew vector is stored.
-  tail call void @llvm.aarch64.neon.st2.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, float* %res)
-; CHECK: ST2Twov4s {{.*}} :: (store 32 {{.*}})
-  tail call void @llvm.aarch64.neon.st3.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, float* %res)
-; CHECK: ST3Threev4s {{.*}} :: (store 48 {{.*}})
-  tail call void @llvm.aarch64.neon.st4.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, <4 x float> %dr, float* %res)
-; CHECK: ST4Fourv4s {{.*}} :: (store 64 {{.*}})
-
-  ret void
-}
-
-define void @addst1x(float* %res, <4 x float>* %a,  <4 x float>* %b, <4 x float>* %c, <4 x float>* %d) {
-  %al = load <4 x float>, <4 x float>* %a
-  %bl = load <4 x float>, <4 x float>* %b
-  %cl = load <4 x float>, <4 x float>* %c
-  %dl = load <4 x float>, <4 x float>* %d
-
-  %ar = fadd <4 x float> %al, %bl
-  %br = fadd <4 x float> %bl, %cl
-  %cr = fadd <4 x float> %cl, %dl
-  %dr = fadd <4 x float> %dl, %al
-
-; The sizes below are conservative.  AArch64TargetLowering
-; conservatively assumes the entiew vector is stored.
-  tail call void @llvm.aarch64.neon.st1x2.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, float* %res)
-; CHECK: ST1Twov4s {{.*}} :: (store 32 {{.*}})
-  tail call void @llvm.aarch64.neon.st1x3.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, float* %res)
-; CHECK: ST1Threev4s {{.*}} :: (store 48 {{.*}})
-  tail call void @llvm.aarch64.neon.st1x4.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, <4 x float> %dr, float* %res)
-; CHECK: ST1Fourv4s {{.*}} :: (store 64 {{.*}})
-
-  ret void
-}
-
-define void @addstxlane(float* %res, <4 x float>* %a,  <4 x float>* %b, <4 x float>* %c, <4 x float>* %d) {
-  %al = load <4 x float>, <4 x float>* %a
-  %bl = load <4 x float>, <4 x float>* %b
-  %cl = load <4 x float>, <4 x float>* %c
-  %dl = load <4 x float>, <4 x float>* %d
-
-  %ar = fadd <4 x float> %al, %bl
-  %br = fadd <4 x float> %bl, %cl
-  %cr = fadd <4 x float> %cl, %dl
-  %dr = fadd <4 x float> %dl, %al
-
-; The sizes below are conservative.  AArch64TargetLowering
-; conservatively assumes the entiew vector is stored.
-  tail call void @llvm.aarch64.neon.st2lane.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, i64 1, float* %res)
-; CHECK: ST2i32 {{.*}} :: (store 32 {{.*}})
-  tail call void @llvm.aarch64.neon.st3lane.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, i64 1, float* %res)
-; CHECK: ST3i32 {{.*}} :: (store 48 {{.*}})
-  tail call void @llvm.aarch64.neon.st4lane.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, <4 x float> %dr, i64 1, float* %res)
-; CHECK: ST4i32 {{.*}} :: (store 64 {{.*}})
-
-  ret void
-}
-- 
GitLab


From 44ed3ae63d64853f02372e3432887d5a5116e12e Mon Sep 17 00:00:00 2001
From: Li Jia He <hljhehlj@cn.ibm.com>
Date: Fri, 26 Oct 2018 02:34:57 +0000
Subject: [PATCH 0614/1116] [PowerPC][NFC] Add tests for some missed
 optimization opportunities in combineSetCC

For both operands are bool, short, int, long, long long, add the following optimization test case.
1. 0-x == y --> x+y ==0
2. 0-x != y --> x+y != 0

Review: nemanjai
Differential Revision: https://reviews.llvm.org/D53358


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345357 91177308-0d34-0410-b5e6-96231b3b80d8
---
 llvm/test/CodeGen/PowerPC/combine-setcc.ll | 436 +++++++++++++++++++++
 1 file changed, 436 insertions(+)
 create mode 100644 llvm/test/CodeGen/PowerPC/combine-setcc.ll

diff --git a/llvm/test/CodeGen/PowerPC/combine-setcc.ll b/llvm/test/CodeGen/PowerPC/combine-setcc.ll
new file mode 100644
index 00000000000..054b153ed77
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/combine-setcc.ll
@@ -0,0 +1,436 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-unknown \
+; RUN:  -ppc-asm-full-reg-names < %s  | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-unknown \
+; RUN:  -ppc-asm-full-reg-names < %s  | FileCheck %s
+
+define zeroext i1 @eq1(i1 zeroext %x, i1 zeroext %y) {
+; CHECK-LABEL: eq1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    blr
+  %sub = sext i1 %x to i32
+  %conv3 = zext i1 %y to i32
+  %cmp = icmp eq i32 %sub, %conv3
+  ret i1 %cmp
+}
+
+define zeroext i8 @eq2(i8 zeroext %x, i8 zeroext %y) {
+; CHECK-LABEL: eq2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    blr
+  %conv = zext i8 %x to i32
+  %sub = sub nsw i32 0, %conv
+  %conv1 = zext i8 %y to i32
+  %cmp = icmp eq i32 %sub, %conv1
+  %conv3 = zext i1 %cmp to i8
+  ret i8 %conv3
+}
+
+define signext i16 @eq3(i16 signext %x, i16 signext %y) {
+; CHECK-LABEL: eq3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    blr
+  %conv = sext i16 %x to i32
+  %sub = sub nsw i32 0, %conv
+  %conv1 = sext i16 %y to i32
+  %cmp = icmp eq i32 %sub, %conv1
+  %conv3 = zext i1 %cmp to i16
+  ret i16 %conv3
+}
+
+define zeroext i16 @eq4(i16 zeroext %x, i16 zeroext %y) {
+; CHECK-LABEL: eq4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    blr
+  %conv = zext i16 %x to i32
+  %sub = sub nsw i32 0, %conv
+  %conv1 = zext i16 %y to i32
+  %cmp = icmp eq i32 %sub, %conv1
+  %conv3 = zext i1 %cmp to i16
+  ret i16 %conv3
+}
+
+define signext i32 @eq5(i32 signext %x, i32 signext %y) {
+; CHECK-LABEL: eq5:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    blr
+  %sub = sub nsw i32 0, %x
+  %cmp = icmp eq i32 %sub, %y
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define zeroext i32 @eq6(i32 zeroext %x, i32 zeroext %y) {
+; CHECK-LABEL: eq6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    blr
+  %sub = sub i32 0, %x
+  %cmp = icmp eq i32 %sub, %y
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i64 @eq7(i64 %x, i64 %y) {
+; CHECK-LABEL: eq7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    cntlzd r3, r3
+; CHECK-NEXT:    rldicl r3, r3, 58, 63
+; CHECK-NEXT:    blr
+  %sub = sub nsw i64 0, %x
+  %cmp = icmp eq i64 %sub, %y
+  %zext = zext i1 %cmp to i64
+  ret i64 %zext
+}
+
+define zeroext i1 @eq8(i1 zeroext %x, i1 zeroext %y) {
+; CHECK-LABEL: eq8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r4, r3
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    blr
+  %conv = zext i1 %y to i32
+  %sub = sext i1 %x to i32
+  %cmp = icmp eq i32 %conv, %sub
+  ret i1 %cmp
+}
+
+define zeroext i8 @eq9(i8 zeroext %x, i8 zeroext %y) {
+; CHECK-LABEL: eq9:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r4, r4
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    blr
+  %conv = zext i8 %x to i32
+  %conv1 = zext i8 %y to i32
+  %sub = sub nsw i32 0, %conv1
+  %cmp = icmp eq i32 %conv, %sub
+  %conv3 = zext i1 %cmp to i8
+  ret i8 %conv3
+}
+
+define signext i16 @eq10(i16 signext %x, i16 signext %y) {
+; CHECK-LABEL: eq10:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r4, r4
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    blr
+  %conv = sext i16 %x to i32
+  %conv1 = sext i16 %y to i32
+  %sub = sub nsw i32 0, %conv1
+  %cmp = icmp eq i32 %conv, %sub
+  %conv3 = zext i1 %cmp to i16
+  ret i16 %conv3
+}
+
+define zeroext i16 @eq11(i16 zeroext %x, i16 zeroext %y) {
+; CHECK-LABEL: eq11:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r4, r4
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    blr
+  %conv = zext i16 %x to i32
+  %conv1 = zext i16 %y to i32
+  %sub = sub nsw i32 0, %conv1
+  %cmp = icmp eq i32 %conv, %sub
+  %conv3 = zext i1 %cmp to i16
+  ret i16 %conv3
+}
+
+define signext i32 @eq12(i32 signext %x, i32 signext %y) {
+; CHECK-LABEL: eq12:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r4, r4
+; CHECK-NEXT:    xor r3, r4, r3
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    blr
+  %sub = sub nsw i32 0, %y
+  %cmp = icmp eq i32 %sub, %x
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define zeroext i32 @eq13(i32 zeroext %x, i32 zeroext %y) {
+; CHECK-LABEL: eq13:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r4, r4
+; CHECK-NEXT:    xor r3, r4, r3
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    blr
+  %sub = sub i32 0, %y
+  %cmp = icmp eq i32 %sub, %x
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i64 @eq14(i64 %x, i64 %y) {
+; CHECK-LABEL: eq14:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r4, r4
+; CHECK-NEXT:    xor r3, r4, r3
+; CHECK-NEXT:    cntlzd r3, r3
+; CHECK-NEXT:    rldicl r3, r3, 58, 63
+; CHECK-NEXT:    blr
+  %sub = sub nsw i64 0, %y
+  %cmp = icmp eq i64 %sub, %x
+  %conv1 = zext i1 %cmp to i64
+  ret i64 %conv1
+}
+
+define zeroext i1 @neq1(i1 zeroext %x, i1 zeroext %y) {
+; CHECK-LABEL: neq1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    blr
+  %sub = sext i1 %x to i32
+  %conv3 = zext i1 %y to i32
+  %cmp = icmp ne i32 %sub, %conv3
+  ret i1 %cmp
+}
+
+define zeroext i8 @neq2(i8 zeroext %x, i8 zeroext %y) {
+; CHECK-LABEL: neq2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    blr
+  %conv = zext i8 %x to i32
+  %sub = sub nsw i32 0, %conv
+  %conv1 = zext i8 %y to i32
+  %cmp = icmp ne i32 %sub, %conv1
+  %conv3 = zext i1 %cmp to i8
+  ret i8 %conv3
+}
+
+define signext i16 @neq3(i16 signext %x, i16 signext %y) {
+; CHECK-LABEL: neq3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    blr
+  %conv = sext i16 %x to i32
+  %sub = sub nsw i32 0, %conv
+  %conv1 = sext i16 %y to i32
+  %cmp = icmp ne i32 %sub, %conv1
+  %conv3 = zext i1 %cmp to i16
+  ret i16 %conv3
+}
+
+define zeroext i16 @neq4(i16 zeroext %x, i16 zeroext %y) {
+; CHECK-LABEL: neq4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    blr
+  %conv = zext i16 %x to i32
+  %sub = sub nsw i32 0, %conv
+  %conv1 = zext i16 %y to i32
+  %cmp = icmp ne i32 %sub, %conv1
+  %conv3 = zext i1 %cmp to i16
+  ret i16 %conv3
+}
+
+define signext i32 @neq5(i32 signext %x, i32 signext %y) {
+; CHECK-LABEL: neq5:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    blr
+  %sub = sub nsw i32 0, %x
+  %cmp = icmp ne i32 %sub, %y
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define zeroext i32 @neq6(i32 zeroext %x, i32 zeroext %y) {
+; CHECK-LABEL: neq6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    blr
+  %sub = sub i32 0, %x
+  %cmp = icmp ne i32 %sub, %y
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i64 @neq7(i64 %x, i64 %y) {
+; CHECK-LABEL: neq7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    addic r4, r3, -1
+; CHECK-NEXT:    subfe r3, r4, r3
+; CHECK-NEXT:    blr
+  %sub = sub nsw i64 0, %x
+  %cmp = icmp ne i64 %sub, %y
+  %zext = zext i1 %cmp to i64
+  ret i64 %zext
+}
+
+define zeroext i1 @neq8(i1 zeroext %x, i1 zeroext %y) {
+; CHECK-LABEL: neq8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r4, r3
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    blr
+  %conv = zext i1 %y to i32
+  %sub = sext i1 %x to i32
+  %cmp = icmp ne i32 %conv, %sub
+  ret i1 %cmp
+}
+
+define zeroext i8 @neq9(i8 zeroext %x, i8 zeroext %y) {
+; CHECK-LABEL: neq9:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r4, r3
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    blr
+  %conv = zext i8 %y to i32
+  %conv1 = zext i8 %x to i32
+  %sub = sub nsw i32 0, %conv1
+  %cmp = icmp ne i32 %conv, %sub
+  %conv3 = zext i1 %cmp to i8
+  ret i8 %conv3
+}
+
+define signext i16 @neq10(i16 signext %x, i16 signext %y) {
+; CHECK-LABEL: neq10:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r4, r3
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    blr
+  %conv = sext i16 %y to i32
+  %conv1 = sext i16 %x to i32
+  %sub = sub nsw i32 0, %conv1
+  %cmp = icmp ne i32 %conv, %sub
+  %conv3 = zext i1 %cmp to i16
+  ret i16 %conv3
+}
+
+define zeroext i16 @neq11(i16 zeroext %x, i16 zeroext %y) {
+; CHECK-LABEL: neq11:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r4, r3
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    blr
+entry:
+  %conv = zext i16 %y to i32
+  %conv1 = zext i16 %x to i32
+  %sub = sub nsw i32 0, %conv1
+  %cmp = icmp ne i32 %conv, %sub
+  %conv3 = zext i1 %cmp to i16
+  ret i16 %conv3
+}
+
+define signext i32 @neq12(i32 signext %x, i32 signext %y) {
+; CHECK-LABEL: neq12:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    blr
+entry:
+  %sub = sub nsw i32 0, %x
+  %cmp = icmp ne i32 %sub, %y
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define zeroext i32 @neq13(i32 zeroext %x, i32 zeroext %y) {
+; CHECK-LABEL: neq13:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    blr
+entry:
+  %sub = sub i32 0, %x
+  %cmp = icmp ne i32 %sub, %y
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i64 @neq14(i64 %x, i64 %y) {
+; CHECK-LABEL: neq14:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r4, r3
+; CHECK-NEXT:    addic r4, r3, -1
+; CHECK-NEXT:    subfe r3, r4, r3
+; CHECK-NEXT:    blr
+  %sub = sub nsw i64 0, %x
+  %cmp = icmp ne i64 %y, %sub
+  %zext = zext i1 %cmp to i64
+  ret i64 %zext
+}
-- 
GitLab


From 775f2aaa830e42223991a15ba2688891651cb415 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Fri, 26 Oct 2018 03:04:54 +0000
Subject: [PATCH 0615/1116] Add dependency from SystemZAsmParser to
 SystemZAsmPrinter after rL345349

This fixes -DBUILD_SHARED_LIBS=on build. The dependency is similar to that of X86's.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345358 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/SystemZ/AsmParser/LLVMBuild.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/Target/SystemZ/AsmParser/LLVMBuild.txt b/lib/Target/SystemZ/AsmParser/LLVMBuild.txt
index 602898ea341..76aa5a4aa9d 100644
--- a/lib/Target/SystemZ/AsmParser/LLVMBuild.txt
+++ b/lib/Target/SystemZ/AsmParser/LLVMBuild.txt
@@ -19,5 +19,5 @@
 type = Library
 name = SystemZAsmParser
 parent = SystemZ
-required_libraries = MC MCParser Support SystemZDesc SystemZInfo
+required_libraries = MC MCParser Support SystemZDesc SystemZInfo SystemZAsmPrinter
 add_to_library_groups = SystemZ
-- 
GitLab


From 5fca60507547fc445be04e1b29c4ba2ea533eb07 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Fri, 26 Oct 2018 03:15:56 +0000
Subject: [PATCH 0616/1116] [Pipeliner] Mark swp-art-deps-rec.ll as REQUIRES:
 asserts after rL345319

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345359 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/Hexagon/swp-art-deps-rec.ll | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test/CodeGen/Hexagon/swp-art-deps-rec.ll b/test/CodeGen/Hexagon/swp-art-deps-rec.ll
index 941a8b8a3f9..5272faf8f9b 100644
--- a/test/CodeGen/Hexagon/swp-art-deps-rec.ll
+++ b/test/CodeGen/Hexagon/swp-art-deps-rec.ll
@@ -1,3 +1,5 @@
+; REQUIRES: asserts
+
 ; RUN: llc -march=hexagon -mcpu=hexagonv65 -O3 -debug-only=pipeliner \
 ; RUN: < %s 2>&1 | FileCheck %s
 
-- 
GitLab


From 699414a49357812bd428e9ccd37af398ca299c90 Mon Sep 17 00:00:00 2001
From: Nemanja Ivanovic <nemanja.i.ibm@gmail.com>
Date: Fri, 26 Oct 2018 03:19:13 +0000
Subject: [PATCH 0617/1116] [PowerPC] Keep vector int to fp conversions in
 vector domain

At present a v2i16 -> v2f64 convert is implemented by extracts to scalar,
scalar converts, and merge back into a vector. Use vector converts instead,
with the int data permuted into the proper position and extended if necessary.

Patch by RolandF.

Differential revision: https://reviews.llvm.org/D53346


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345361 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/PowerPC/PPCISelLowering.cpp |  68 +++++++++
 lib/Target/PowerPC/PPCISelLowering.h   |   3 +
 test/CodeGen/PowerPC/vec-itofp.ll      | 192 +++++++++++++++++++++++++
 3 files changed, 263 insertions(+)
 create mode 100644 test/CodeGen/PowerPC/vec-itofp.ll

diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index ca60f318278..860181c57bd 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -792,6 +792,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
       setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
       setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);
 
+      setOperationAction(ISD::UINT_TO_FP, MVT::v2i16, Custom);
+      setOperationAction(ISD::SINT_TO_FP, MVT::v2i16, Custom);
+
       setOperationAction(ISD::FNEG, MVT::v4f32, Legal);
       setOperationAction(ISD::FNEG, MVT::v2f64, Legal);
       setOperationAction(ISD::FABS, MVT::v4f32, Legal);
@@ -7265,10 +7268,75 @@ SDValue PPCTargetLowering::LowerINT_TO_FPDirectMove(SDValue Op,
   return FP;
 }
 
+static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl) {
+
+  EVT VecVT = Vec.getValueType();
+  assert(VecVT.isVector() && "Expected a vector type.");
+  assert(VecVT.getSizeInBits() < 128 && "Vector is already full width.");
+
+  EVT EltVT = VecVT.getVectorElementType();
+  unsigned WideNumElts = 128 / EltVT.getSizeInBits();
+  EVT WideVT = EVT::getVectorVT(*DAG.getContext(), EltVT, WideNumElts);
+
+  unsigned NumConcat = WideNumElts / VecVT.getVectorNumElements();
+  SmallVector<SDValue, 16> Ops(NumConcat);
+  Ops[0] = Vec;
+  SDValue UndefVec = DAG.getUNDEF(VecVT);
+  for (unsigned i = 1; i < NumConcat; ++i)
+    Ops[i] = UndefVec;
+
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Ops);
+}
+
+SDValue PPCTargetLowering::LowerINT_TO_FPVector(SDValue Op,
+                                                SelectionDAG &DAG,
+                                                const SDLoc &dl) const {
+
+  unsigned Opc = Op.getOpcode();
+  assert((Opc == ISD::UINT_TO_FP || Opc == ISD::SINT_TO_FP) &&
+         "Unexpected conversion type");
+  assert(Op.getValueType() == MVT::v2f64 && "Supports v2f64 only.");
+
+  // CPU's prior to P9 don't have a way to sign-extend in vectors.
+  bool SignedConv = Opc == ISD::SINT_TO_FP;
+  if (SignedConv && !Subtarget.hasP9Altivec())
+    return SDValue();
+
+  SDValue Wide = widenVec(DAG, Op.getOperand(0), dl);
+  EVT WideVT = Wide.getValueType();
+  unsigned WideNumElts = WideVT.getVectorNumElements();
+
+  SmallVector<int, 16> ShuffV;
+  for (unsigned i = 0; i < WideNumElts; ++i)
+    ShuffV.push_back(i + WideNumElts);
+
+  if (Subtarget.isLittleEndian()) {
+    ShuffV[0] = 0;
+    ShuffV[WideNumElts / 2] = 1;
+  }
+  else {
+    ShuffV[WideNumElts / 2 - 1] = 0;
+    ShuffV[WideNumElts - 1] = 1;
+  }
+
+  SDValue ShuffleSrc2 = SignedConv ? DAG.getUNDEF(WideVT) :
+                                     DAG.getConstant(0, dl, WideVT);
+  SDValue Arrange = DAG.getVectorShuffle(WideVT, dl, Wide, ShuffleSrc2, ShuffV);
+  unsigned ExtendOp = SignedConv ? (unsigned) PPCISD::SExtVElems :
+                                   (unsigned) ISD::BITCAST;
+  SDValue Extend = DAG.getNode(ExtendOp, dl, MVT::v2i64, Arrange);
+
+  return DAG.getNode(Opc, dl, Op.getValueType(), Extend);
+}
+
 SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
                                           SelectionDAG &DAG) const {
   SDLoc dl(Op);
 
+  if (Op.getValueType() == MVT::v2f64 &&
+      Op.getOperand(0).getValueType() == MVT::v2i16)
+    return LowerINT_TO_FPVector(Op, DAG, dl);
+
   // Conversions to f128 are legal.
   if (EnableQuadPrecision && (Op.getValueType() == MVT::f128))
     return Op;
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index 959831cb1c0..081e7a92bf2 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -927,6 +927,9 @@ namespace llvm {
     SDValue LowerINT_TO_FPDirectMove(SDValue Op, SelectionDAG &DAG,
                                      const SDLoc &dl) const;
 
+    SDValue LowerINT_TO_FPVector(SDValue Op, SelectionDAG &DAG,
+                                 const SDLoc &dl) const;
+
     SDValue getFramePointerFrameIndex(SelectionDAG & DAG) const;
     SDValue getReturnAddrFrameIndex(SelectionDAG & DAG) const;
 
diff --git a/test/CodeGen/PowerPC/vec-itofp.ll b/test/CodeGen/PowerPC/vec-itofp.ll
new file mode 100644
index 00000000000..852b7c822ad
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec-itofp.ll
@@ -0,0 +1,192 @@
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -mcpu=pwr8 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-P8
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-P9
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:     -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-BE
+
+define void @test8(<8 x double>* nocapture %Sink, <8 x i16>* nocapture readonly %SrcPtr) {
+entry:
+  %0 = load <8 x i16>, <8 x i16>* %SrcPtr, align 16
+  %1 = uitofp <8 x i16> %0 to <8 x double>
+  store <8 x double> %1, <8 x double>* %Sink, align 16
+  ret void
+; CHECK-P9-LABEL: @test8
+; CHECK-P9: vperm
+; CHECK-P9: vperm
+; CHECK-P9: vperm
+; CHECK-P9: vperm
+; CHECK-P9: xvcvuxddp
+; CHECK-P9: xvcvuxddp
+; CHECK-P9: xvcvuxddp
+; CHECK-P9: xvcvuxddp
+; CHECK-P8-LABEL: @test8
+; CHECK-P8: vperm
+; CHECK-P8: vperm
+; CHECK-P8: vperm
+; CHECK-P8: vperm
+; CHECK-P8: xvcvuxddp
+; CHECK-P8: xvcvuxddp
+; CHECK-P8: xvcvuxddp
+; CHECK-P8: xvcvuxddp
+}
+
+define void @test4(<4 x double>* nocapture %Sink, <4 x i16>* nocapture readonly %SrcPtr) {
+entry:
+  %0 = load <4 x i16>, <4 x i16>* %SrcPtr, align 16
+  %1 = uitofp <4 x i16> %0 to <4 x double>
+  store <4 x double> %1, <4 x double>* %Sink, align 16
+  ret void
+; CHECK-P9-LABEL: @test4
+; CHECK-P9: vperm
+; CHECK-P9: vperm
+; CHECK-P9: xvcvuxddp
+; CHECK-P9: xvcvuxddp
+; CHECK-P8-LABEL: @test4
+; CHECK-P8: vperm
+; CHECK-P8: vperm
+; CHECK-P8: xvcvuxddp
+; CHECK-P8: xvcvuxddp
+}
+
+define void @test2(<2 x double>* nocapture %Sink, <2 x i16>* nocapture readonly %SrcPtr) {
+entry:
+  %0 = load <2 x i16>, <2 x i16>* %SrcPtr, align 16
+  %1 = uitofp <2 x i16> %0 to <2 x double>
+  store <2 x double> %1, <2 x double>* %Sink, align 16
+  ret void
+; CHECK-P9-LABEL: .LCPI2_0:
+; CHECK-P9-NEXT: .byte 31
+; CHECK-P9-NEXT: .byte 30
+; CHECK-P9-NEXT: .byte 13
+; CHECK-P9-NEXT: .byte 12
+; CHECK-P9-NEXT: .byte 11
+; CHECK-P9-NEXT: .byte 10
+; CHECK-P9-NEXT: .byte 9
+; CHECK-P9-NEXT: .byte 8
+; CHECK-P9-NEXT: .byte 29
+; CHECK-P9-NEXT: .byte 28
+; CHECK-P9-NEXT: .byte 5
+; CHECK-P9-NEXT: .byte 4
+; CHECK-P9-NEXT: .byte 3
+; CHECK-P9-NEXT: .byte 2
+; CHECK-P9-NEXT: .byte 1
+; CHECK-P9-NEXT: .byte 0
+; CHECK-P9: addi [[REG1:r[0-9]+]], {{r[0-9]+}}, .LCPI2_0@toc@l
+; CHECK-P9: lxvx [[REG2:v[0-9]+]], 0, [[REG1]]
+; CHECK-P9: vperm [[REG3:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, [[REG2]]
+; CHECK-P9: xvcvuxddp {{vs[0-9]+}}, [[REG3]]
+; CHECK-P8-LABEL: @test2
+; CHECK-P8: vperm [[REG1:v[0-9]+]]
+; CHECK-P8: xvcvuxddp {{vs[0-9]+}}, [[REG1]]
+; CHECK-BE-LABEL: .LCPI2_0:
+; CHECK-BE-NEXT: .byte 16
+; CHECK-BE-NEXT: .byte 17
+; CHECK-BE-NEXT: .byte 18
+; CHECK-BE-NEXT: .byte 19
+; CHECK-BE-NEXT: .byte 20
+; CHECK-BE-NEXT: .byte 21
+; CHECK-BE-NEXT: .byte 0
+; CHECK-BE-NEXT: .byte 1
+; CHECK-BE-NEXT: .byte 24
+; CHECK-BE-NEXT: .byte 25
+; CHECK-BE-NEXT: .byte 26
+; CHECK-BE-NEXT: .byte 27
+; CHECK-BE-NEXT: .byte 28
+; CHECK-BE-NEXT: .byte 29
+; CHECK-BE-NEXT: .byte 2
+; CHECK-BE-NEXT: .byte 3
+; CHECK-BE: addi [[REG1:r[0-9]+]], {{r[0-9]+}}, .LCPI2_0@toc@l
+; CHECK-BE: lxvx [[REG2:v[0-9]+]], 0, [[REG1]]
+; CHECK-BE: vperm [[REG3:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, [[REG2]]
+; CHECK-BE: xvcvuxddp {{vs[0-9]+}}, [[REG3]]
+}
+
+define void @stest8(<8 x double>* nocapture %Sink, <8 x i16>* nocapture readonly %SrcPtr) {
+entry:
+  %0 = load <8 x i16>, <8 x i16>* %SrcPtr, align 16
+  %1 = sitofp <8 x i16> %0 to <8 x double>
+  store <8 x double> %1, <8 x double>* %Sink, align 16
+  ret void
+; CHECK-P9-LABEL: @stest8
+; CHECK-P9: vperm
+; CHECK-P9: vperm
+; CHECK-P9: vperm
+; CHECK-P9: vperm
+; CHECK-P9: vextsh2d
+; CHECK-P9: vextsh2d
+; CHECK-P9: vextsh2d
+; CHECK-P9: vextsh2d
+; CHECK-P9: xvcvsxddp
+; CHECK-P9: xvcvsxddp
+; CHECK-P9: xvcvsxddp
+; CHECK-P9: xvcvsxddp
+}
+
+define void @stest4(<4 x double>* nocapture %Sink, <4 x i16>* nocapture readonly %SrcPtr) {
+entry:
+  %0 = load <4 x i16>, <4 x i16>* %SrcPtr, align 16
+  %1 = sitofp <4 x i16> %0 to <4 x double>
+  store <4 x double> %1, <4 x double>* %Sink, align 16
+  ret void
+; CHECK-P9-LABEL: @stest4
+; CHECK-P9: vperm
+; CHECK-P9: vperm
+; CHECK-P9: vextsh2d
+; CHECK-P9: vextsh2d
+; CHECK-P9: xvcvsxddp
+; CHECK-P9: xvcvsxddp
+}
+
+define void @stest2(<2 x double>* nocapture %Sink, <2 x i16>* nocapture readonly %SrcPtr) {
+entry:
+  %0 = load <2 x i16>, <2 x i16>* %SrcPtr, align 16
+  %1 = sitofp <2 x i16> %0 to <2 x double>
+  store <2 x double> %1, <2 x double>* %Sink, align 16
+  ret void
+; CHECK-P9-LABEL: .LCPI5_0:
+; CHECK-P9-NEXT: .byte 31
+; CHECK-P9-NEXT: .byte 30
+; CHECK-P9-NEXT: .byte 31
+; CHECK-P9-NEXT: .byte 31
+; CHECK-P9-NEXT: .byte 31
+; CHECK-P9-NEXT: .byte 31
+; CHECK-P9-NEXT: .byte 31
+; CHECK-P9-NEXT: .byte 31
+; CHECK-P9-NEXT: .byte 29
+; CHECK-P9-NEXT: .byte 28
+; CHECK-P9-NEXT: .byte 31
+; CHECK-P9-NEXT: .byte 31
+; CHECK-P9-NEXT: .byte 31
+; CHECK-P9-NEXT: .byte 31
+; CHECK-P9-NEXT: .byte 31
+; CHECK-P9-NEXT: .byte 31
+; CHECK-P9: vperm [[REG1:v[0-9]+]]
+; CHECK-P9: vextsh2d [[REG2:v[0-9]+]], [[REG1]]
+; CHECK-P9: xvcvsxddp {{vs[0-9]+}}, [[REG2]]
+; CHECK-BE-LABEL: .LCPI5_0:
+; CHECK-BE-NEXT: .byte 0
+; CHECK-BE-NEXT: .byte 0
+; CHECK-BE-NEXT: .byte 0
+; CHECK-BE-NEXT: .byte 0
+; CHECK-BE-NEXT: .byte 0
+; CHECK-BE-NEXT: .byte 0
+; CHECK-BE-NEXT: .byte 0
+; CHECK-BE-NEXT: .byte 1
+; CHECK-BE-NEXT: .byte 0
+; CHECK-BE-NEXT: .byte 0
+; CHECK-BE-NEXT: .byte 0
+; CHECK-BE-NEXT: .byte 0
+; CHECK-BE-NEXT: .byte 0
+; CHECK-BE-NEXT: .byte 0
+; CHECK-BE-NEXT: .byte 2
+; CHECK-BE-NEXT: .byte 3
+; CHECK-BE: addi [[REG1:r[0-9]+]], {{r[0-9]+}}, .LCPI5_0@toc@l
+; CHECK-BE: lxvx [[REG2:v[0-9]+]], 0, [[REG1]]
+; CHECK-BE: vperm [[REG3:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, [[REG2]]
+; CHECK-BE: vextsh2d [[REG4:v[0-9]+]], [[REG3]]
+; CHECK-BE: xvcvsxddp {{vs[0-9]+}}, [[REG4]]
+}
-- 
GitLab


From 3dfd21e18627f59a7b74945fe8f53019b8488399 Mon Sep 17 00:00:00 2001
From: Nemanja Ivanovic <nemanja.i.ibm@gmail.com>
Date: Fri, 26 Oct 2018 03:30:28 +0000
Subject: [PATCH 0618/1116] [NFC] Fix the regular expression for BE PPC in
 update_llc_test_checks.py

Currently, the regular expression that matches the lines of assembly for PPC LE
(ELFv2) does not work for the assembly for BE (ELFv1). This patch fixes it.

Differential revision: https://reviews.llvm.org/D53059


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345363 91177308-0d34-0410-b5e6-96231b3b80d8
---
 utils/UpdateTestChecks/asm.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/utils/UpdateTestChecks/asm.py b/utils/UpdateTestChecks/asm.py
index 726a653d151..923efd5bbef 100644
--- a/utils/UpdateTestChecks/asm.py
+++ b/utils/UpdateTestChecks/asm.py
@@ -52,6 +52,7 @@ ASM_FUNCTION_MIPS_RE = re.compile(
 
 ASM_FUNCTION_PPC_RE = re.compile(
     r'^_?(?P<func>[^:]+):[ \t]*#+[ \t]*@(?P=func)\n'
+    r'.*?'
     r'\.Lfunc_begin[0-9]+:\n'
     r'(?:[ \t]+.cfi_startproc\n)?'
     r'(?:\.Lfunc_[gl]ep[0-9]+:\n(?:[ \t]+.*?\n)*)*'
-- 
GitLab


From bd16cc646c6cedc42a9cc7c32f1c52275d63da59 Mon Sep 17 00:00:00 2001
From: Li Jia He <hljhehlj@cn.ibm.com>
Date: Fri, 26 Oct 2018 04:54:56 +0000
Subject: [PATCH 0619/1116] This reverts commit  r345357, It is wrong to create
 a new directory and put the test file into it. I am sorry for this.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345364 91177308-0d34-0410-b5e6-96231b3b80d8
---
 llvm/test/CodeGen/PowerPC/combine-setcc.ll | 436 ---------------------
 1 file changed, 436 deletions(-)
 delete mode 100644 llvm/test/CodeGen/PowerPC/combine-setcc.ll

diff --git a/llvm/test/CodeGen/PowerPC/combine-setcc.ll b/llvm/test/CodeGen/PowerPC/combine-setcc.ll
deleted file mode 100644
index 054b153ed77..00000000000
--- a/llvm/test/CodeGen/PowerPC/combine-setcc.ll
+++ /dev/null
@@ -1,436 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-unknown \
-; RUN:  -ppc-asm-full-reg-names < %s  | FileCheck %s
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-unknown \
-; RUN:  -ppc-asm-full-reg-names < %s  | FileCheck %s
-
-define zeroext i1 @eq1(i1 zeroext %x, i1 zeroext %y) {
-; CHECK-LABEL: eq1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    blr
-  %sub = sext i1 %x to i32
-  %conv3 = zext i1 %y to i32
-  %cmp = icmp eq i32 %sub, %conv3
-  ret i1 %cmp
-}
-
-define zeroext i8 @eq2(i8 zeroext %x, i8 zeroext %y) {
-; CHECK-LABEL: eq2:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    blr
-  %conv = zext i8 %x to i32
-  %sub = sub nsw i32 0, %conv
-  %conv1 = zext i8 %y to i32
-  %cmp = icmp eq i32 %sub, %conv1
-  %conv3 = zext i1 %cmp to i8
-  ret i8 %conv3
-}
-
-define signext i16 @eq3(i16 signext %x, i16 signext %y) {
-; CHECK-LABEL: eq3:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    blr
-  %conv = sext i16 %x to i32
-  %sub = sub nsw i32 0, %conv
-  %conv1 = sext i16 %y to i32
-  %cmp = icmp eq i32 %sub, %conv1
-  %conv3 = zext i1 %cmp to i16
-  ret i16 %conv3
-}
-
-define zeroext i16 @eq4(i16 zeroext %x, i16 zeroext %y) {
-; CHECK-LABEL: eq4:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    blr
-  %conv = zext i16 %x to i32
-  %sub = sub nsw i32 0, %conv
-  %conv1 = zext i16 %y to i32
-  %cmp = icmp eq i32 %sub, %conv1
-  %conv3 = zext i1 %cmp to i16
-  ret i16 %conv3
-}
-
-define signext i32 @eq5(i32 signext %x, i32 signext %y) {
-; CHECK-LABEL: eq5:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    blr
-  %sub = sub nsw i32 0, %x
-  %cmp = icmp eq i32 %sub, %y
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
-}
-
-define zeroext i32 @eq6(i32 zeroext %x, i32 zeroext %y) {
-; CHECK-LABEL: eq6:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    blr
-  %sub = sub i32 0, %x
-  %cmp = icmp eq i32 %sub, %y
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
-}
-
-define i64 @eq7(i64 %x, i64 %y) {
-; CHECK-LABEL: eq7:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    cntlzd r3, r3
-; CHECK-NEXT:    rldicl r3, r3, 58, 63
-; CHECK-NEXT:    blr
-  %sub = sub nsw i64 0, %x
-  %cmp = icmp eq i64 %sub, %y
-  %zext = zext i1 %cmp to i64
-  ret i64 %zext
-}
-
-define zeroext i1 @eq8(i1 zeroext %x, i1 zeroext %y) {
-; CHECK-LABEL: eq8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r4, r3
-; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    blr
-  %conv = zext i1 %y to i32
-  %sub = sext i1 %x to i32
-  %cmp = icmp eq i32 %conv, %sub
-  ret i1 %cmp
-}
-
-define zeroext i8 @eq9(i8 zeroext %x, i8 zeroext %y) {
-; CHECK-LABEL: eq9:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r4, r4
-; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    blr
-  %conv = zext i8 %x to i32
-  %conv1 = zext i8 %y to i32
-  %sub = sub nsw i32 0, %conv1
-  %cmp = icmp eq i32 %conv, %sub
-  %conv3 = zext i1 %cmp to i8
-  ret i8 %conv3
-}
-
-define signext i16 @eq10(i16 signext %x, i16 signext %y) {
-; CHECK-LABEL: eq10:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r4, r4
-; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    blr
-  %conv = sext i16 %x to i32
-  %conv1 = sext i16 %y to i32
-  %sub = sub nsw i32 0, %conv1
-  %cmp = icmp eq i32 %conv, %sub
-  %conv3 = zext i1 %cmp to i16
-  ret i16 %conv3
-}
-
-define zeroext i16 @eq11(i16 zeroext %x, i16 zeroext %y) {
-; CHECK-LABEL: eq11:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r4, r4
-; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    blr
-  %conv = zext i16 %x to i32
-  %conv1 = zext i16 %y to i32
-  %sub = sub nsw i32 0, %conv1
-  %cmp = icmp eq i32 %conv, %sub
-  %conv3 = zext i1 %cmp to i16
-  ret i16 %conv3
-}
-
-define signext i32 @eq12(i32 signext %x, i32 signext %y) {
-; CHECK-LABEL: eq12:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r4, r4
-; CHECK-NEXT:    xor r3, r4, r3
-; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    blr
-  %sub = sub nsw i32 0, %y
-  %cmp = icmp eq i32 %sub, %x
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
-}
-
-define zeroext i32 @eq13(i32 zeroext %x, i32 zeroext %y) {
-; CHECK-LABEL: eq13:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r4, r4
-; CHECK-NEXT:    xor r3, r4, r3
-; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    blr
-  %sub = sub i32 0, %y
-  %cmp = icmp eq i32 %sub, %x
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
-}
-
-define i64 @eq14(i64 %x, i64 %y) {
-; CHECK-LABEL: eq14:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r4, r4
-; CHECK-NEXT:    xor r3, r4, r3
-; CHECK-NEXT:    cntlzd r3, r3
-; CHECK-NEXT:    rldicl r3, r3, 58, 63
-; CHECK-NEXT:    blr
-  %sub = sub nsw i64 0, %y
-  %cmp = icmp eq i64 %sub, %x
-  %conv1 = zext i1 %cmp to i64
-  ret i64 %conv1
-}
-
-define zeroext i1 @neq1(i1 zeroext %x, i1 zeroext %y) {
-; CHECK-LABEL: neq1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    xori r3, r3, 1
-; CHECK-NEXT:    blr
-  %sub = sext i1 %x to i32
-  %conv3 = zext i1 %y to i32
-  %cmp = icmp ne i32 %sub, %conv3
-  ret i1 %cmp
-}
-
-define zeroext i8 @neq2(i8 zeroext %x, i8 zeroext %y) {
-; CHECK-LABEL: neq2:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    xori r3, r3, 1
-; CHECK-NEXT:    blr
-  %conv = zext i8 %x to i32
-  %sub = sub nsw i32 0, %conv
-  %conv1 = zext i8 %y to i32
-  %cmp = icmp ne i32 %sub, %conv1
-  %conv3 = zext i1 %cmp to i8
-  ret i8 %conv3
-}
-
-define signext i16 @neq3(i16 signext %x, i16 signext %y) {
-; CHECK-LABEL: neq3:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    xori r3, r3, 1
-; CHECK-NEXT:    blr
-  %conv = sext i16 %x to i32
-  %sub = sub nsw i32 0, %conv
-  %conv1 = sext i16 %y to i32
-  %cmp = icmp ne i32 %sub, %conv1
-  %conv3 = zext i1 %cmp to i16
-  ret i16 %conv3
-}
-
-define zeroext i16 @neq4(i16 zeroext %x, i16 zeroext %y) {
-; CHECK-LABEL: neq4:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    xori r3, r3, 1
-; CHECK-NEXT:    blr
-  %conv = zext i16 %x to i32
-  %sub = sub nsw i32 0, %conv
-  %conv1 = zext i16 %y to i32
-  %cmp = icmp ne i32 %sub, %conv1
-  %conv3 = zext i1 %cmp to i16
-  ret i16 %conv3
-}
-
-define signext i32 @neq5(i32 signext %x, i32 signext %y) {
-; CHECK-LABEL: neq5:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    xori r3, r3, 1
-; CHECK-NEXT:    blr
-  %sub = sub nsw i32 0, %x
-  %cmp = icmp ne i32 %sub, %y
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
-}
-
-define zeroext i32 @neq6(i32 zeroext %x, i32 zeroext %y) {
-; CHECK-LABEL: neq6:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    xori r3, r3, 1
-; CHECK-NEXT:    blr
-  %sub = sub i32 0, %x
-  %cmp = icmp ne i32 %sub, %y
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
-}
-
-define i64 @neq7(i64 %x, i64 %y) {
-; CHECK-LABEL: neq7:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    addic r4, r3, -1
-; CHECK-NEXT:    subfe r3, r4, r3
-; CHECK-NEXT:    blr
-  %sub = sub nsw i64 0, %x
-  %cmp = icmp ne i64 %sub, %y
-  %zext = zext i1 %cmp to i64
-  ret i64 %zext
-}
-
-define zeroext i1 @neq8(i1 zeroext %x, i1 zeroext %y) {
-; CHECK-LABEL: neq8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r4, r3
-; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    xori r3, r3, 1
-; CHECK-NEXT:    blr
-  %conv = zext i1 %y to i32
-  %sub = sext i1 %x to i32
-  %cmp = icmp ne i32 %conv, %sub
-  ret i1 %cmp
-}
-
-define zeroext i8 @neq9(i8 zeroext %x, i8 zeroext %y) {
-; CHECK-LABEL: neq9:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r4, r3
-; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    xori r3, r3, 1
-; CHECK-NEXT:    blr
-  %conv = zext i8 %y to i32
-  %conv1 = zext i8 %x to i32
-  %sub = sub nsw i32 0, %conv1
-  %cmp = icmp ne i32 %conv, %sub
-  %conv3 = zext i1 %cmp to i8
-  ret i8 %conv3
-}
-
-define signext i16 @neq10(i16 signext %x, i16 signext %y) {
-; CHECK-LABEL: neq10:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r4, r3
-; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    xori r3, r3, 1
-; CHECK-NEXT:    blr
-  %conv = sext i16 %y to i32
-  %conv1 = sext i16 %x to i32
-  %sub = sub nsw i32 0, %conv1
-  %cmp = icmp ne i32 %conv, %sub
-  %conv3 = zext i1 %cmp to i16
-  ret i16 %conv3
-}
-
-define zeroext i16 @neq11(i16 zeroext %x, i16 zeroext %y) {
-; CHECK-LABEL: neq11:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r4, r3
-; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    xori r3, r3, 1
-; CHECK-NEXT:    blr
-entry:
-  %conv = zext i16 %y to i32
-  %conv1 = zext i16 %x to i32
-  %sub = sub nsw i32 0, %conv1
-  %cmp = icmp ne i32 %conv, %sub
-  %conv3 = zext i1 %cmp to i16
-  ret i16 %conv3
-}
-
-define signext i32 @neq12(i32 signext %x, i32 signext %y) {
-; CHECK-LABEL: neq12:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    xori r3, r3, 1
-; CHECK-NEXT:    blr
-entry:
-  %sub = sub nsw i32 0, %x
-  %cmp = icmp ne i32 %sub, %y
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
-}
-
-define zeroext i32 @neq13(i32 zeroext %x, i32 zeroext %y) {
-; CHECK-LABEL: neq13:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r3, r4
-; CHECK-NEXT:    cntlzw r3, r3
-; CHECK-NEXT:    srwi r3, r3, 5
-; CHECK-NEXT:    xori r3, r3, 1
-; CHECK-NEXT:    blr
-entry:
-  %sub = sub i32 0, %x
-  %cmp = icmp ne i32 %sub, %y
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
-}
-
-define i64 @neq14(i64 %x, i64 %y) {
-; CHECK-LABEL: neq14:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r4, r3
-; CHECK-NEXT:    addic r4, r3, -1
-; CHECK-NEXT:    subfe r3, r4, r3
-; CHECK-NEXT:    blr
-  %sub = sub nsw i64 0, %x
-  %cmp = icmp ne i64 %y, %sub
-  %zext = zext i1 %cmp to i64
-  ret i64 %zext
-}
-- 
GitLab


From 61aa414e834418a257857e4755fce1ecba847f43 Mon Sep 17 00:00:00 2001
From: Li Jia He <hljhehlj@cn.ibm.com>
Date: Fri, 26 Oct 2018 05:02:10 +0000
Subject: [PATCH 0620/1116] [PowerPC][NFC] Add tests for some missed
 optimization opportunities in combineSetCC

For both operands are bool, short, int, long, long long, add the following optimization test case.
1. 0-x == y --> x+y ==0
2. 0-x != y --> x+y != 0

Review: nemanjai
Differential Revision: https://reviews.llvm.org/D53358


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345365 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/PowerPC/combine-setcc.ll | 436 ++++++++++++++++++++++++++
 1 file changed, 436 insertions(+)
 create mode 100644 test/CodeGen/PowerPC/combine-setcc.ll

diff --git a/test/CodeGen/PowerPC/combine-setcc.ll b/test/CodeGen/PowerPC/combine-setcc.ll
new file mode 100644
index 00000000000..054b153ed77
--- /dev/null
+++ b/test/CodeGen/PowerPC/combine-setcc.ll
@@ -0,0 +1,436 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-unknown \
+; RUN:  -ppc-asm-full-reg-names < %s  | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-unknown \
+; RUN:  -ppc-asm-full-reg-names < %s  | FileCheck %s
+
+define zeroext i1 @eq1(i1 zeroext %x, i1 zeroext %y) {
+; CHECK-LABEL: eq1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    blr
+  %sub = sext i1 %x to i32
+  %conv3 = zext i1 %y to i32
+  %cmp = icmp eq i32 %sub, %conv3
+  ret i1 %cmp
+}
+
+define zeroext i8 @eq2(i8 zeroext %x, i8 zeroext %y) {
+; CHECK-LABEL: eq2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    blr
+  %conv = zext i8 %x to i32
+  %sub = sub nsw i32 0, %conv
+  %conv1 = zext i8 %y to i32
+  %cmp = icmp eq i32 %sub, %conv1
+  %conv3 = zext i1 %cmp to i8
+  ret i8 %conv3
+}
+
+define signext i16 @eq3(i16 signext %x, i16 signext %y) {
+; CHECK-LABEL: eq3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    blr
+  %conv = sext i16 %x to i32
+  %sub = sub nsw i32 0, %conv
+  %conv1 = sext i16 %y to i32
+  %cmp = icmp eq i32 %sub, %conv1
+  %conv3 = zext i1 %cmp to i16
+  ret i16 %conv3
+}
+
+define zeroext i16 @eq4(i16 zeroext %x, i16 zeroext %y) {
+; CHECK-LABEL: eq4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    blr
+  %conv = zext i16 %x to i32
+  %sub = sub nsw i32 0, %conv
+  %conv1 = zext i16 %y to i32
+  %cmp = icmp eq i32 %sub, %conv1
+  %conv3 = zext i1 %cmp to i16
+  ret i16 %conv3
+}
+
+define signext i32 @eq5(i32 signext %x, i32 signext %y) {
+; CHECK-LABEL: eq5:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    blr
+  %sub = sub nsw i32 0, %x
+  %cmp = icmp eq i32 %sub, %y
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define zeroext i32 @eq6(i32 zeroext %x, i32 zeroext %y) {
+; CHECK-LABEL: eq6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    blr
+  %sub = sub i32 0, %x
+  %cmp = icmp eq i32 %sub, %y
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i64 @eq7(i64 %x, i64 %y) {
+; CHECK-LABEL: eq7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    cntlzd r3, r3
+; CHECK-NEXT:    rldicl r3, r3, 58, 63
+; CHECK-NEXT:    blr
+  %sub = sub nsw i64 0, %x
+  %cmp = icmp eq i64 %sub, %y
+  %zext = zext i1 %cmp to i64
+  ret i64 %zext
+}
+
+define zeroext i1 @eq8(i1 zeroext %x, i1 zeroext %y) {
+; CHECK-LABEL: eq8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r4, r3
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    blr
+  %conv = zext i1 %y to i32
+  %sub = sext i1 %x to i32
+  %cmp = icmp eq i32 %conv, %sub
+  ret i1 %cmp
+}
+
+define zeroext i8 @eq9(i8 zeroext %x, i8 zeroext %y) {
+; CHECK-LABEL: eq9:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r4, r4
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    blr
+  %conv = zext i8 %x to i32
+  %conv1 = zext i8 %y to i32
+  %sub = sub nsw i32 0, %conv1
+  %cmp = icmp eq i32 %conv, %sub
+  %conv3 = zext i1 %cmp to i8
+  ret i8 %conv3
+}
+
+define signext i16 @eq10(i16 signext %x, i16 signext %y) {
+; CHECK-LABEL: eq10:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r4, r4
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    blr
+  %conv = sext i16 %x to i32
+  %conv1 = sext i16 %y to i32
+  %sub = sub nsw i32 0, %conv1
+  %cmp = icmp eq i32 %conv, %sub
+  %conv3 = zext i1 %cmp to i16
+  ret i16 %conv3
+}
+
+define zeroext i16 @eq11(i16 zeroext %x, i16 zeroext %y) {
+; CHECK-LABEL: eq11:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r4, r4
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    blr
+  %conv = zext i16 %x to i32
+  %conv1 = zext i16 %y to i32
+  %sub = sub nsw i32 0, %conv1
+  %cmp = icmp eq i32 %conv, %sub
+  %conv3 = zext i1 %cmp to i16
+  ret i16 %conv3
+}
+
+define signext i32 @eq12(i32 signext %x, i32 signext %y) {
+; CHECK-LABEL: eq12:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r4, r4
+; CHECK-NEXT:    xor r3, r4, r3
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    blr
+  %sub = sub nsw i32 0, %y
+  %cmp = icmp eq i32 %sub, %x
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define zeroext i32 @eq13(i32 zeroext %x, i32 zeroext %y) {
+; CHECK-LABEL: eq13:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r4, r4
+; CHECK-NEXT:    xor r3, r4, r3
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    blr
+  %sub = sub i32 0, %y
+  %cmp = icmp eq i32 %sub, %x
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i64 @eq14(i64 %x, i64 %y) {
+; CHECK-LABEL: eq14:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r4, r4
+; CHECK-NEXT:    xor r3, r4, r3
+; CHECK-NEXT:    cntlzd r3, r3
+; CHECK-NEXT:    rldicl r3, r3, 58, 63
+; CHECK-NEXT:    blr
+  %sub = sub nsw i64 0, %y
+  %cmp = icmp eq i64 %sub, %x
+  %conv1 = zext i1 %cmp to i64
+  ret i64 %conv1
+}
+
+define zeroext i1 @neq1(i1 zeroext %x, i1 zeroext %y) {
+; CHECK-LABEL: neq1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    blr
+  %sub = sext i1 %x to i32
+  %conv3 = zext i1 %y to i32
+  %cmp = icmp ne i32 %sub, %conv3
+  ret i1 %cmp
+}
+
+define zeroext i8 @neq2(i8 zeroext %x, i8 zeroext %y) {
+; CHECK-LABEL: neq2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    blr
+  %conv = zext i8 %x to i32
+  %sub = sub nsw i32 0, %conv
+  %conv1 = zext i8 %y to i32
+  %cmp = icmp ne i32 %sub, %conv1
+  %conv3 = zext i1 %cmp to i8
+  ret i8 %conv3
+}
+
+define signext i16 @neq3(i16 signext %x, i16 signext %y) {
+; CHECK-LABEL: neq3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    blr
+  %conv = sext i16 %x to i32
+  %sub = sub nsw i32 0, %conv
+  %conv1 = sext i16 %y to i32
+  %cmp = icmp ne i32 %sub, %conv1
+  %conv3 = zext i1 %cmp to i16
+  ret i16 %conv3
+}
+
+define zeroext i16 @neq4(i16 zeroext %x, i16 zeroext %y) {
+; CHECK-LABEL: neq4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    blr
+  %conv = zext i16 %x to i32
+  %sub = sub nsw i32 0, %conv
+  %conv1 = zext i16 %y to i32
+  %cmp = icmp ne i32 %sub, %conv1
+  %conv3 = zext i1 %cmp to i16
+  ret i16 %conv3
+}
+
+define signext i32 @neq5(i32 signext %x, i32 signext %y) {
+; CHECK-LABEL: neq5:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    blr
+  %sub = sub nsw i32 0, %x
+  %cmp = icmp ne i32 %sub, %y
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define zeroext i32 @neq6(i32 zeroext %x, i32 zeroext %y) {
+; CHECK-LABEL: neq6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    blr
+  %sub = sub i32 0, %x
+  %cmp = icmp ne i32 %sub, %y
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i64 @neq7(i64 %x, i64 %y) {
+; CHECK-LABEL: neq7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    addic r4, r3, -1
+; CHECK-NEXT:    subfe r3, r4, r3
+; CHECK-NEXT:    blr
+  %sub = sub nsw i64 0, %x
+  %cmp = icmp ne i64 %sub, %y
+  %zext = zext i1 %cmp to i64
+  ret i64 %zext
+}
+
+define zeroext i1 @neq8(i1 zeroext %x, i1 zeroext %y) {
+; CHECK-LABEL: neq8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r4, r3
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    blr
+  %conv = zext i1 %y to i32
+  %sub = sext i1 %x to i32
+  %cmp = icmp ne i32 %conv, %sub
+  ret i1 %cmp
+}
+
+define zeroext i8 @neq9(i8 zeroext %x, i8 zeroext %y) {
+; CHECK-LABEL: neq9:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r4, r3
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    blr
+  %conv = zext i8 %y to i32
+  %conv1 = zext i8 %x to i32
+  %sub = sub nsw i32 0, %conv1
+  %cmp = icmp ne i32 %conv, %sub
+  %conv3 = zext i1 %cmp to i8
+  ret i8 %conv3
+}
+
+define signext i16 @neq10(i16 signext %x, i16 signext %y) {
+; CHECK-LABEL: neq10:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r4, r3
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    blr
+  %conv = sext i16 %y to i32
+  %conv1 = sext i16 %x to i32
+  %sub = sub nsw i32 0, %conv1
+  %cmp = icmp ne i32 %conv, %sub
+  %conv3 = zext i1 %cmp to i16
+  ret i16 %conv3
+}
+
+define zeroext i16 @neq11(i16 zeroext %x, i16 zeroext %y) {
+; CHECK-LABEL: neq11:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r4, r3
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    blr
+entry:
+  %conv = zext i16 %y to i32
+  %conv1 = zext i16 %x to i32
+  %sub = sub nsw i32 0, %conv1
+  %cmp = icmp ne i32 %conv, %sub
+  %conv3 = zext i1 %cmp to i16
+  ret i16 %conv3
+}
+
+define signext i32 @neq12(i32 signext %x, i32 signext %y) {
+; CHECK-LABEL: neq12:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    blr
+entry:
+  %sub = sub nsw i32 0, %x
+  %cmp = icmp ne i32 %sub, %y
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define zeroext i32 @neq13(i32 zeroext %x, i32 zeroext %y) {
+; CHECK-LABEL: neq13:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    blr
+entry:
+  %sub = sub i32 0, %x
+  %cmp = icmp ne i32 %sub, %y
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i64 @neq14(i64 %x, i64 %y) {
+; CHECK-LABEL: neq14:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg r3, r3
+; CHECK-NEXT:    xor r3, r4, r3
+; CHECK-NEXT:    addic r4, r3, -1
+; CHECK-NEXT:    subfe r3, r4, r3
+; CHECK-NEXT:    blr
+  %sub = sub nsw i64 0, %x
+  %cmp = icmp ne i64 %y, %sub
+  %zext = zext i1 %cmp to i64
+  ret i64 %zext
+}
-- 
GitLab


From ad84a8b9be854db641a18d9559fe00b4b821b881 Mon Sep 17 00:00:00 2001
From: Li Jia He <hljhehlj@cn.ibm.com>
Date: Fri, 26 Oct 2018 06:48:53 +0000
Subject: [PATCH 0621/1116] [PowerPC] Fix some missed optimization
 opportunities in combineSetCC

For both operands are bool, short, int, long, long long, add the following optimization.
1. 0-x == y --> x+y ==0
2. 0-x != y --> x+y != 0

Review: nemanjai
Differential Revision: https://reviews.llvm.org/D53360


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345366 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/PowerPC/PPCISelLowering.cpp | 34 +++++++++++
 lib/Target/PowerPC/PPCISelLowering.h   |  1 +
 test/CodeGen/PowerPC/combine-setcc.ll  | 84 +++++++++-----------------
 3 files changed, 63 insertions(+), 56 deletions(-)

diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 860181c57bd..a135667beaa 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -11823,6 +11823,37 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
       ShiftCst);
 }
 
+SDValue PPCTargetLowering::combineSetCC(SDNode *N,
+                                        DAGCombinerInfo &DCI) const {
+  assert(N->getOpcode() == ISD::SETCC &&
+         "Should be called with a SETCC node");
+
+  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  if (CC == ISD::SETNE || CC == ISD::SETEQ) {
+    SDValue LHS = N->getOperand(0);
+    SDValue RHS = N->getOperand(1);
+
+    // If there is a '0 - y' pattern, canonicalize the pattern to the RHS.
+    if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
+        LHS.hasOneUse())
+      std::swap(LHS, RHS);
+
+    // x == 0-y --> x+y == 0
+    // x != 0-y --> x+y != 0
+    if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
+        RHS.hasOneUse()) {
+      SDLoc DL(N);
+      SelectionDAG &DAG = DCI.DAG;
+      EVT VT = N->getValueType(0);
+      EVT OpVT = LHS.getValueType();
+      SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
+      return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
+    }
+  }
+
+  return DAGCombineTruncBoolExt(N, DCI);
+}
+
 // Is this an extending load from an f32 to an f64?
 static bool isFPExtLoad(SDValue Op) {
   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode()))
@@ -12554,6 +12585,9 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::TRUNCATE:
     return combineTRUNCATE(N, DCI);
   case ISD::SETCC:
+    if (SDValue CSCC = combineSetCC(N, DCI))
+      return CSCC;
+    LLVM_FALLTHROUGH;
   case ISD::SELECT_CC:
     return DAGCombineTruncBoolExt(N, DCI);
   case ISD::SINT_TO_FP:
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index 081e7a92bf2..d597e9348a1 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -1097,6 +1097,7 @@ namespace llvm {
     SDValue combineSRL(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue combineADD(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue combineTRUNCATE(SDNode *N, DAGCombinerInfo &DCI) const;
+    SDValue combineSetCC(SDNode *N, DAGCombinerInfo &DCI) const;
 
     /// ConvertSETCCToSubtract - looks at SETCC that compares ints. It replaces
     /// SETCC with integer subtraction when (1) there is a legal way of doing it
diff --git a/test/CodeGen/PowerPC/combine-setcc.ll b/test/CodeGen/PowerPC/combine-setcc.ll
index 054b153ed77..a86de8296f8 100644
--- a/test/CodeGen/PowerPC/combine-setcc.ll
+++ b/test/CodeGen/PowerPC/combine-setcc.ll
@@ -7,8 +7,7 @@
 define zeroext i1 @eq1(i1 zeroext %x, i1 zeroext %y) {
 ; CHECK-LABEL: eq1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    add r3, r4, r3
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    blr
@@ -21,8 +20,7 @@ define zeroext i1 @eq1(i1 zeroext %x, i1 zeroext %y) {
 define zeroext i8 @eq2(i8 zeroext %x, i8 zeroext %y) {
 ; CHECK-LABEL: eq2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    add r3, r4, r3
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    blr
@@ -37,8 +35,7 @@ define zeroext i8 @eq2(i8 zeroext %x, i8 zeroext %y) {
 define signext i16 @eq3(i16 signext %x, i16 signext %y) {
 ; CHECK-LABEL: eq3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    add r3, r4, r3
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    blr
@@ -53,8 +50,7 @@ define signext i16 @eq3(i16 signext %x, i16 signext %y) {
 define zeroext i16 @eq4(i16 zeroext %x, i16 zeroext %y) {
 ; CHECK-LABEL: eq4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    add r3, r4, r3
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    blr
@@ -69,8 +65,7 @@ define zeroext i16 @eq4(i16 zeroext %x, i16 zeroext %y) {
 define signext i32 @eq5(i32 signext %x, i32 signext %y) {
 ; CHECK-LABEL: eq5:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    add r3, r4, r3
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    blr
@@ -83,8 +78,7 @@ define signext i32 @eq5(i32 signext %x, i32 signext %y) {
 define zeroext i32 @eq6(i32 zeroext %x, i32 zeroext %y) {
 ; CHECK-LABEL: eq6:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    add r3, r4, r3
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    blr
@@ -97,8 +91,7 @@ define zeroext i32 @eq6(i32 zeroext %x, i32 zeroext %y) {
 define i64 @eq7(i64 %x, i64 %y) {
 ; CHECK-LABEL: eq7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    add r3, r4, r3
 ; CHECK-NEXT:    cntlzd r3, r3
 ; CHECK-NEXT:    rldicl r3, r3, 58, 63
 ; CHECK-NEXT:    blr
@@ -111,8 +104,7 @@ define i64 @eq7(i64 %x, i64 %y) {
 define zeroext i1 @eq8(i1 zeroext %x, i1 zeroext %y) {
 ; CHECK-LABEL: eq8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r4, r3
+; CHECK-NEXT:    add r3, r4, r3
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    blr
@@ -125,8 +117,7 @@ define zeroext i1 @eq8(i1 zeroext %x, i1 zeroext %y) {
 define zeroext i8 @eq9(i8 zeroext %x, i8 zeroext %y) {
 ; CHECK-LABEL: eq9:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r4, r4
-; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    add r3, r3, r4
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    blr
@@ -141,8 +132,7 @@ define zeroext i8 @eq9(i8 zeroext %x, i8 zeroext %y) {
 define signext i16 @eq10(i16 signext %x, i16 signext %y) {
 ; CHECK-LABEL: eq10:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r4, r4
-; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    add r3, r3, r4
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    blr
@@ -157,8 +147,7 @@ define signext i16 @eq10(i16 signext %x, i16 signext %y) {
 define zeroext i16 @eq11(i16 zeroext %x, i16 zeroext %y) {
 ; CHECK-LABEL: eq11:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r4, r4
-; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    add r3, r3, r4
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    blr
@@ -173,8 +162,7 @@ define zeroext i16 @eq11(i16 zeroext %x, i16 zeroext %y) {
 define signext i32 @eq12(i32 signext %x, i32 signext %y) {
 ; CHECK-LABEL: eq12:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r4, r4
-; CHECK-NEXT:    xor r3, r4, r3
+; CHECK-NEXT:    add r3, r3, r4
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    blr
@@ -187,8 +175,7 @@ define signext i32 @eq12(i32 signext %x, i32 signext %y) {
 define zeroext i32 @eq13(i32 zeroext %x, i32 zeroext %y) {
 ; CHECK-LABEL: eq13:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r4, r4
-; CHECK-NEXT:    xor r3, r4, r3
+; CHECK-NEXT:    add r3, r3, r4
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    blr
@@ -201,8 +188,7 @@ define zeroext i32 @eq13(i32 zeroext %x, i32 zeroext %y) {
 define i64 @eq14(i64 %x, i64 %y) {
 ; CHECK-LABEL: eq14:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r4, r4
-; CHECK-NEXT:    xor r3, r4, r3
+; CHECK-NEXT:    add r3, r3, r4
 ; CHECK-NEXT:    cntlzd r3, r3
 ; CHECK-NEXT:    rldicl r3, r3, 58, 63
 ; CHECK-NEXT:    blr
@@ -215,8 +201,7 @@ define i64 @eq14(i64 %x, i64 %y) {
 define zeroext i1 @neq1(i1 zeroext %x, i1 zeroext %y) {
 ; CHECK-LABEL: neq1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    add r3, r4, r3
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    xori r3, r3, 1
@@ -230,8 +215,7 @@ define zeroext i1 @neq1(i1 zeroext %x, i1 zeroext %y) {
 define zeroext i8 @neq2(i8 zeroext %x, i8 zeroext %y) {
 ; CHECK-LABEL: neq2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    add r3, r4, r3
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    xori r3, r3, 1
@@ -247,8 +231,7 @@ define zeroext i8 @neq2(i8 zeroext %x, i8 zeroext %y) {
 define signext i16 @neq3(i16 signext %x, i16 signext %y) {
 ; CHECK-LABEL: neq3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    add r3, r4, r3
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    xori r3, r3, 1
@@ -264,8 +247,7 @@ define signext i16 @neq3(i16 signext %x, i16 signext %y) {
 define zeroext i16 @neq4(i16 zeroext %x, i16 zeroext %y) {
 ; CHECK-LABEL: neq4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    add r3, r4, r3
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    xori r3, r3, 1
@@ -281,8 +263,7 @@ define zeroext i16 @neq4(i16 zeroext %x, i16 zeroext %y) {
 define signext i32 @neq5(i32 signext %x, i32 signext %y) {
 ; CHECK-LABEL: neq5:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    add r3, r4, r3
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    xori r3, r3, 1
@@ -296,8 +277,7 @@ define signext i32 @neq5(i32 signext %x, i32 signext %y) {
 define zeroext i32 @neq6(i32 zeroext %x, i32 zeroext %y) {
 ; CHECK-LABEL: neq6:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    add r3, r4, r3
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    xori r3, r3, 1
@@ -311,8 +291,7 @@ define zeroext i32 @neq6(i32 zeroext %x, i32 zeroext %y) {
 define i64 @neq7(i64 %x, i64 %y) {
 ; CHECK-LABEL: neq7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    add r3, r4, r3
 ; CHECK-NEXT:    addic r4, r3, -1
 ; CHECK-NEXT:    subfe r3, r4, r3
 ; CHECK-NEXT:    blr
@@ -325,8 +304,7 @@ define i64 @neq7(i64 %x, i64 %y) {
 define zeroext i1 @neq8(i1 zeroext %x, i1 zeroext %y) {
 ; CHECK-LABEL: neq8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r4, r3
+; CHECK-NEXT:    add r3, r4, r3
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    xori r3, r3, 1
@@ -340,8 +318,7 @@ define zeroext i1 @neq8(i1 zeroext %x, i1 zeroext %y) {
 define zeroext i8 @neq9(i8 zeroext %x, i8 zeroext %y) {
 ; CHECK-LABEL: neq9:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r4, r3
+; CHECK-NEXT:    add r3, r4, r3
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    xori r3, r3, 1
@@ -357,8 +334,7 @@ define zeroext i8 @neq9(i8 zeroext %x, i8 zeroext %y) {
 define signext i16 @neq10(i16 signext %x, i16 signext %y) {
 ; CHECK-LABEL: neq10:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r4, r3
+; CHECK-NEXT:    add r3, r4, r3
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    xori r3, r3, 1
@@ -374,8 +350,7 @@ define signext i16 @neq10(i16 signext %x, i16 signext %y) {
 define zeroext i16 @neq11(i16 zeroext %x, i16 zeroext %y) {
 ; CHECK-LABEL: neq11:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r4, r3
+; CHECK-NEXT:    add r3, r4, r3
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    xori r3, r3, 1
@@ -392,8 +367,7 @@ entry:
 define signext i32 @neq12(i32 signext %x, i32 signext %y) {
 ; CHECK-LABEL: neq12:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    add r3, r4, r3
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    xori r3, r3, 1
@@ -408,8 +382,7 @@ entry:
 define zeroext i32 @neq13(i32 zeroext %x, i32 zeroext %y) {
 ; CHECK-LABEL: neq13:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r3, r4
+; CHECK-NEXT:    add r3, r4, r3
 ; CHECK-NEXT:    cntlzw r3, r3
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    xori r3, r3, 1
@@ -424,8 +397,7 @@ entry:
 define i64 @neq14(i64 %x, i64 %y) {
 ; CHECK-LABEL: neq14:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    neg r3, r3
-; CHECK-NEXT:    xor r3, r4, r3
+; CHECK-NEXT:    add r3, r4, r3
 ; CHECK-NEXT:    addic r4, r3, -1
 ; CHECK-NEXT:    subfe r3, r4, r3
 ; CHECK-NEXT:    blr
-- 
GitLab


From cb46794300b2e6358e780871c723c22c260ccbee Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Fri, 26 Oct 2018 06:56:51 +0000
Subject: [PATCH 0622/1116] [llvm-nm] Simplify. NFC

Change a \t to spaces
Change some zero-filling memcpy to aggregate initialization
Delete redundant ArchiveName.clear() after declaration

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345367 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-nm/llvm-nm.cpp | 21 +++++++--------------
 1 file changed, 7 insertions(+), 14 deletions(-)

diff --git a/tools/llvm-nm/llvm-nm.cpp b/tools/llvm-nm/llvm-nm.cpp
index 7e257d8ce89..21f3a2bade5 100644
--- a/tools/llvm-nm/llvm-nm.cpp
+++ b/tools/llvm-nm/llvm-nm.cpp
@@ -1181,8 +1181,7 @@ dumpSymbolNamesFromObject(SymbolicFile &Obj, bool printName,
       // see if this symbol is a symbol from that section and if not skip it.
       if (Nsect && Nsect != getNsectInMachO(*MachO, Sym))
         continue;
-      NMSymbol S;
-      memset(&S, '\0', sizeof(S));
+      NMSymbol S = {};
       S.Size = 0;
       S.Address = 0;
       if (PrintSize) {
@@ -1276,8 +1275,7 @@ dumpSymbolNamesFromObject(SymbolicFile &Obj, bool printName,
           }
         }
         if (!found) {
-          NMSymbol S;
-          memset(&S, '\0', sizeof(NMSymbol));
+          NMSymbol S = {};
           S.Address = Entry.address() + BaseSegmentAddress;
           S.Size = 0;
           S.TypeChar = '\0';
@@ -1367,8 +1365,7 @@ dumpSymbolNamesFromObject(SymbolicFile &Obj, bool printName,
 
             // Now create the undefined symbol using the referened dynamic
             // library.
-            NMSymbol U;
-            memset(&U, '\0', sizeof(NMSymbol));
+            NMSymbol U = {};
             U.Address = 0;
             U.Size = 0;
             U.TypeChar = 'U';
@@ -1434,8 +1431,7 @@ dumpSymbolNamesFromObject(SymbolicFile &Obj, bool printName,
         }
         if (!found) {
           LastSymbolName = Entry.symbolName();
-          NMSymbol B;
-          memset(&B, '\0', sizeof(NMSymbol));
+          NMSymbol B = {};
           B.Address = 0;
           B.Size = 0;
           B.TypeChar = 'U';
@@ -1494,8 +1490,7 @@ dumpSymbolNamesFromObject(SymbolicFile &Obj, bool printName,
         }
         if (!found) {
           LastSymbolName = Entry.symbolName();
-          NMSymbol L;
-          memset(&L, '\0', sizeof(NMSymbol));
+          NMSymbol L = {};
           L.Name = Entry.symbolName();
           L.Address = 0;
           L.Size = 0;
@@ -1633,9 +1628,8 @@ dumpSymbolNamesFromObject(SymbolicFile &Obj, bool printName,
         }
         // See this address is not already in the symbol table fake up an
         // nlist for it.
-	if (!found) {
-          NMSymbol F;
-          memset(&F, '\0', sizeof(NMSymbol));
+        if (!found) {
+          NMSymbol F = {};
           F.Name = "<redacted function X>";
           F.Address = FoundFns[f] + BaseSegmentAddress;
           F.Size = 0;
@@ -1902,7 +1896,6 @@ static void dumpSymbolNamesFromFile(std::string &Filename) {
         if (HostArchName == I->getArchFlagName()) {
           Expected<std::unique_ptr<ObjectFile>> ObjOrErr = I->getAsObjectFile();
           std::string ArchiveName;
-          ArchiveName.clear();
           if (ObjOrErr) {
             ObjectFile &Obj = *ObjOrErr.get();
             dumpSymbolNamesFromObject(Obj, false);
-- 
GitLab


From 9da5c176f07d8b8f2db0abcb32689ee152cadc07 Mon Sep 17 00:00:00 2001
From: Kristina Brooks <kristina@nym.hush.com>
Date: Fri, 26 Oct 2018 06:57:02 +0000
Subject: [PATCH 0623/1116] [NFC] Add periods to CREDITS.txt (testing git-llvm)

NFC commit to test git-llvm bridge for current GitHub monorepo.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345368 91177308-0d34-0410-b5e6-96231b3b80d8
---
 CREDITS.TXT | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CREDITS.TXT b/CREDITS.TXT
index 79ce040cfa2..cde8a441cac 100644
--- a/CREDITS.TXT
+++ b/CREDITS.TXT
@@ -492,8 +492,8 @@ D: Thread Local Storage implementation
 N: Bill Wendling
 I: wendling
 E: isanbard@gmail.com
-D: Release manager, IR Linker, LTO
-D: Bunches of stuff
+D: Release manager, IR Linker, LTO.
+D: Bunches of stuff.
 
 N: Bob Wilson
 E: bob.wilson@acm.org
-- 
GitLab


From 6a2012a85f7ebdf5e33d721e8c9582d495a082cb Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Fri, 26 Oct 2018 06:59:08 +0000
Subject: [PATCH 0624/1116] [SystemZ] Fix -Wcovered-switch-default as coding
 standard regulates

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345369 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
index 6f4f543caad..91959b4151b 100644
--- a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
+++ b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
@@ -686,7 +686,6 @@ void SystemZOperand::print(raw_ostream &OS) const {
     }
     break;
   }
-  default:
   case KindInvalid:
     break;
   }
-- 
GitLab


From 8af523842a7bfcafe68c556c681630231bf5f8c3 Mon Sep 17 00:00:00 2001
From: Max Kazantsev <max.kazantsev@azul.com>
Date: Fri, 26 Oct 2018 09:52:58 +0000
Subject: [PATCH 0625/1116] [SimpleLoopUnswitch] Make all checks before actual
 non-trivial unswitch

We should be able to make all relevant checks before we actually start the non-trivial
unswitching, so that we could guarantee that once we have started the transform,
it will always succeed.

Reviewed By: chandlerc
Differential Revision: https://reviews.llvm.org/D53747


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345375 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/SimpleLoopUnswitch.cpp | 38 ++++++++++----------
 1 file changed, 20 insertions(+), 18 deletions(-)

diff --git a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 96db249584e..e8f67a689f4 100644
--- a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -1792,10 +1792,10 @@ void visitDomSubTree(DominatorTree &DT, BasicBlock *BB, CallableT Callable) {
   } while (!DomWorklist.empty());
 }
 
-static bool unswitchNontrivialInvariants(
+static void unswitchNontrivialInvariants(
     Loop &L, Instruction &TI, ArrayRef<Value *> Invariants,
-    DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC,
-    function_ref<void(bool, ArrayRef<Loop *>)> UnswitchCB,
+    SmallVectorImpl<BasicBlock *> &ExitBlocks, DominatorTree &DT, LoopInfo &LI,
+    AssumptionCache &AC, function_ref<void(bool, ArrayRef<Loop *>)> UnswitchCB,
     ScalarEvolution *SE) {
   auto *ParentBB = TI.getParent();
   BranchInst *BI = dyn_cast<BranchInst>(&TI);
@@ -1851,17 +1851,6 @@ static bool unswitchNontrivialInvariants(
   // whatever reason).
   assert(LI.getLoopFor(ParentBB) == &L && "Branch in an inner loop!");
 
-  SmallVector<BasicBlock *, 4> ExitBlocks;
-  L.getUniqueExitBlocks(ExitBlocks);
-
-  // We cannot unswitch if exit blocks contain a cleanuppad instruction as we
-  // don't know how to split those exit blocks.
-  // FIXME: We should teach SplitBlock to handle this and remove this
-  // restriction.
-  for (auto *ExitBB : ExitBlocks)
-    if (isa<CleanupPadInst>(ExitBB->getFirstNonPHI()))
-      return false;
-
   // Compute the parent loop now before we start hacking on things.
   Loop *ParentL = L.getParentLoop();
 
@@ -2145,7 +2134,6 @@ static bool unswitchNontrivialInvariants(
   UnswitchCB(IsStillLoop, SibLoops);
 
   ++NumBranches;
-  return true;
 }
 
 /// Recursively compute the cost of a dominator subtree based on the per-block
@@ -2241,6 +2229,19 @@ unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI,
   if (containsIrreducibleCFG<const BasicBlock *>(RPOT, LI))
     return false;
 
+  SmallVector<BasicBlock *, 4> ExitBlocks;
+  L.getUniqueExitBlocks(ExitBlocks);
+
+  // We cannot unswitch if exit blocks contain a cleanuppad instruction as we
+  // don't know how to split those exit blocks.
+  // FIXME: We should teach SplitBlock to handle this and remove this
+  // restriction.
+  for (auto *ExitBB : ExitBlocks)
+    if (isa<CleanupPadInst>(ExitBB->getFirstNonPHI())) {
+      dbgs() << "Cannot unswitch because of cleanuppad in exit block\n";
+      return false;
+    }
+
   LLVM_DEBUG(
       dbgs() << "Considering " << UnswitchCandidates.size()
              << " non-trivial loop invariant conditions for unswitching.\n");
@@ -2374,11 +2375,12 @@ unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI,
     return false;
   }
 
-  LLVM_DEBUG(dbgs() << "  Trying to unswitch non-trivial (cost = "
+  LLVM_DEBUG(dbgs() << "  Unswitching non-trivial (cost = "
                     << BestUnswitchCost << ") terminator: " << *BestUnswitchTI
                     << "\n");
-  return unswitchNontrivialInvariants(
-      L, *BestUnswitchTI, BestUnswitchInvariants, DT, LI, AC, UnswitchCB, SE);
+  unswitchNontrivialInvariants(L, *BestUnswitchTI, BestUnswitchInvariants,
+                               ExitBlocks, DT, LI, AC, UnswitchCB, SE);
+  return true;
 }
 
 /// Unswitch control flow predicated on loop invariant conditions.
-- 
GitLab


From 390074be6513afe12679f8ca6d59e0d15675adc6 Mon Sep 17 00:00:00 2001
From: Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>
Date: Fri, 26 Oct 2018 10:48:04 +0000
Subject: [PATCH 0626/1116] [llvm-mca] Removed dependency on mca::SourcMgr in
 some Views. NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345376 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-mca/Views/InstructionInfoView.h    |  9 ++++---
 tools/llvm-mca/Views/ResourcePressureView.cpp | 16 ++++++++----
 tools/llvm-mca/Views/ResourcePressureView.h   | 10 +++++---
 tools/llvm-mca/Views/SummaryView.cpp          | 10 +++++---
 tools/llvm-mca/Views/SummaryView.h            |  6 ++---
 tools/llvm-mca/Views/TimelineView.cpp         | 25 +++++++++----------
 tools/llvm-mca/Views/TimelineView.h           |  7 +++---
 tools/llvm-mca/include/SourceMgr.h            |  3 +--
 tools/llvm-mca/llvm-mca.cpp                   | 19 ++++++++------
 9 files changed, 61 insertions(+), 44 deletions(-)

diff --git a/tools/llvm-mca/Views/InstructionInfoView.h b/tools/llvm-mca/Views/InstructionInfoView.h
index 435c058d824..f7bbe6147d7 100644
--- a/tools/llvm-mca/Views/InstructionInfoView.h
+++ b/tools/llvm-mca/Views/InstructionInfoView.h
@@ -35,8 +35,9 @@
 #ifndef LLVM_TOOLS_LLVM_MCA_INSTRUCTIONINFOVIEW_H
 #define LLVM_TOOLS_LLVM_MCA_INSTRUCTIONINFOVIEW_H
 
-#include "SourceMgr.h"
 #include "Views/View.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstPrinter.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
@@ -50,13 +51,13 @@ namespace mca {
 class InstructionInfoView : public View {
   const llvm::MCSubtargetInfo &STI;
   const llvm::MCInstrInfo &MCII;
-  const SourceMgr &Source;
+  llvm::ArrayRef<llvm::MCInst> Source;
   llvm::MCInstPrinter &MCIP;
 
 public:
   InstructionInfoView(const llvm::MCSubtargetInfo &sti,
-                      const llvm::MCInstrInfo &mcii, const SourceMgr &S,
-                      llvm::MCInstPrinter &IP)
+                      const llvm::MCInstrInfo &mcii,
+                      llvm::ArrayRef<llvm::MCInst> S, llvm::MCInstPrinter &IP)
       : STI(sti), MCII(mcii), Source(S), MCIP(IP) {}
 
   void printView(llvm::raw_ostream &OS) const override;
diff --git a/tools/llvm-mca/Views/ResourcePressureView.cpp b/tools/llvm-mca/Views/ResourcePressureView.cpp
index e71825b07c7..e7943252206 100644
--- a/tools/llvm-mca/Views/ResourcePressureView.cpp
+++ b/tools/llvm-mca/Views/ResourcePressureView.cpp
@@ -21,9 +21,9 @@ namespace mca {
 using namespace llvm;
 
 ResourcePressureView::ResourcePressureView(const llvm::MCSubtargetInfo &sti,
-                                           llvm::MCInstPrinter &Printer,
-                                           const SourceMgr &Sequence)
-    : STI(sti), MCIP(Printer), Source(Sequence) {
+                                           MCInstPrinter &Printer,
+                                           ArrayRef<MCInst> S)
+    : STI(sti), MCIP(Printer), Source(S), LastInstructionIdx(0) {
   // Populate the map of resource descriptors.
   unsigned R2VIndex = 0;
   const MCSchedModel &SM = STI.getSchedModel();
@@ -44,9 +44,15 @@ ResourcePressureView::ResourcePressureView(const llvm::MCSubtargetInfo &sti,
 }
 
 void ResourcePressureView::onEvent(const HWInstructionEvent &Event) {
+  if (Event.Type == HWInstructionEvent::Dispatched) {
+    LastInstructionIdx = Event.IR.getSourceIndex();
+    return;
+  }
+
   // We're only interested in Issue events.
   if (Event.Type != HWInstructionEvent::Issued)
     return;
+
   const auto &IssueEvent = static_cast<const HWInstructionIssuedEvent &>(Event);
   const unsigned SourceIdx = Event.IR.getSourceIndex() % Source.size();
   for (const std::pair<ResourceRef, ResourceCycles> &Use :
@@ -128,7 +134,7 @@ void ResourcePressureView::printResourcePressurePerIter(raw_ostream &OS) const {
   FOS << '\n';
   FOS.flush();
 
-  const unsigned Executions = Source.getNumIterations();
+  const unsigned Executions = LastInstructionIdx / Source.size() + 1;
   for (unsigned I = 0, E = NumResourceUnits; I < E; ++I) {
     double Usage = ResourceUsage[I + Source.size() * E];
     printResourcePressure(FOS, Usage / Executions, (I + 1) * 7);
@@ -151,7 +157,7 @@ void ResourcePressureView::printResourcePressurePerInst(raw_ostream &OS) const {
   raw_string_ostream InstrStream(Instruction);
 
   unsigned InstrIndex = 0;
-  const unsigned Executions = Source.getNumIterations();
+  const unsigned Executions = LastInstructionIdx / Source.size() + 1;
   for (const MCInst &MCI : Source) {
     unsigned BaseEltIdx = InstrIndex * NumResourceUnits;
     for (unsigned J = 0; J < NumResourceUnits; ++J) {
diff --git a/tools/llvm-mca/Views/ResourcePressureView.h b/tools/llvm-mca/Views/ResourcePressureView.h
index d413bcd80fd..5ee86df424b 100644
--- a/tools/llvm-mca/Views/ResourcePressureView.h
+++ b/tools/llvm-mca/Views/ResourcePressureView.h
@@ -58,12 +58,12 @@
 #ifndef LLVM_TOOLS_LLVM_MCA_RESOURCEPRESSUREVIEW_H
 #define LLVM_TOOLS_LLVM_MCA_RESOURCEPRESSUREVIEW_H
 
-#include "SourceMgr.h"
 #include "Views/View.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstPrinter.h"
 #include "llvm/MC/MCSubtargetInfo.h"
-#include <map>
 
 namespace mca {
 
@@ -72,7 +72,8 @@ namespace mca {
 class ResourcePressureView : public View {
   const llvm::MCSubtargetInfo &STI;
   llvm::MCInstPrinter &MCIP;
-  const SourceMgr &Source;
+  llvm::ArrayRef<llvm::MCInst> Source;
+  unsigned LastInstructionIdx;
 
   // Map to quickly obtain the ResourceUsage column index from a processor
   // resource ID.
@@ -87,7 +88,8 @@ class ResourcePressureView : public View {
 
 public:
   ResourcePressureView(const llvm::MCSubtargetInfo &sti,
-                       llvm::MCInstPrinter &Printer, const SourceMgr &SM);
+                       llvm::MCInstPrinter &Printer,
+                       llvm::ArrayRef<llvm::MCInst> S);
 
   void onEvent(const HWInstructionEvent &Event) override;
   void printView(llvm::raw_ostream &OS) const override {
diff --git a/tools/llvm-mca/Views/SummaryView.cpp b/tools/llvm-mca/Views/SummaryView.cpp
index 8d529ba1549..a509818e6d7 100644
--- a/tools/llvm-mca/Views/SummaryView.cpp
+++ b/tools/llvm-mca/Views/SummaryView.cpp
@@ -24,14 +24,18 @@ namespace mca {
 
 using namespace llvm;
 
-SummaryView::SummaryView(const MCSchedModel &Model, const SourceMgr &S,
+SummaryView::SummaryView(const MCSchedModel &Model, ArrayRef<MCInst> S,
                          unsigned Width)
     : SM(Model), Source(S), DispatchWidth(Width), TotalCycles(0),
-      NumMicroOps(0), ProcResourceUsage(Model.getNumProcResourceKinds(), 0) {
+      LastInstructionIdx(0), NumMicroOps(0),
+      ProcResourceUsage(Model.getNumProcResourceKinds(), 0) {
   computeProcResourceMasks(SM, ProcResourceMasks);
 }
 
 void SummaryView::onEvent(const HWInstructionEvent &Event) {
+  if (Event.Type == HWInstructionEvent::Dispatched)
+    LastInstructionIdx = Event.IR.getSourceIndex();
+
   // We are only interested in the "instruction retired" events generated by
   // the retire stage for instructions that are part of iteration #0.
   if (Event.Type != HWInstructionEvent::Retired ||
@@ -57,8 +61,8 @@ void SummaryView::onEvent(const HWInstructionEvent &Event) {
 }
 
 void SummaryView::printView(raw_ostream &OS) const {
-  unsigned Iterations = Source.getNumIterations();
   unsigned Instructions = Source.size();
+  unsigned Iterations = (LastInstructionIdx / Instructions) + 1;
   unsigned TotalInstructions = Instructions * Iterations;
   unsigned TotalUOps = NumMicroOps * Iterations;
   double IPC = (double)TotalInstructions / TotalCycles;
diff --git a/tools/llvm-mca/Views/SummaryView.h b/tools/llvm-mca/Views/SummaryView.h
index 3d4585e1d5a..8c330f28f39 100644
--- a/tools/llvm-mca/Views/SummaryView.h
+++ b/tools/llvm-mca/Views/SummaryView.h
@@ -29,7 +29,6 @@
 #ifndef LLVM_TOOLS_LLVM_MCA_SUMMARYVIEW_H
 #define LLVM_TOOLS_LLVM_MCA_SUMMARYVIEW_H
 
-#include "SourceMgr.h"
 #include "Views/View.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/MC/MCSchedule.h"
@@ -40,8 +39,9 @@ namespace mca {
 /// A view that collects and prints a few performance numbers.
 class SummaryView : public View {
   const llvm::MCSchedModel &SM;
-  const SourceMgr &Source;
+  llvm::ArrayRef<llvm::MCInst> Source;
   const unsigned DispatchWidth;
+  unsigned LastInstructionIdx;
   unsigned TotalCycles;
   // The total number of micro opcodes contributed by a block of instructions.
   unsigned NumMicroOps;
@@ -62,7 +62,7 @@ class SummaryView : public View {
   double getBlockRThroughput() const;
 
 public:
-  SummaryView(const llvm::MCSchedModel &Model, const SourceMgr &S,
+  SummaryView(const llvm::MCSchedModel &Model, llvm::ArrayRef<llvm::MCInst> S,
               unsigned Width);
 
   void onCycleEnd() override { ++TotalCycles; }
diff --git a/tools/llvm-mca/Views/TimelineView.cpp b/tools/llvm-mca/Views/TimelineView.cpp
index d802d42352d..de347b54bd9 100644
--- a/tools/llvm-mca/Views/TimelineView.cpp
+++ b/tools/llvm-mca/Views/TimelineView.cpp
@@ -19,15 +19,14 @@ using namespace llvm;
 namespace mca {
 
 TimelineView::TimelineView(const MCSubtargetInfo &sti, MCInstPrinter &Printer,
-                           const SourceMgr &S, unsigned MaxIterations,
+                           llvm::ArrayRef<llvm::MCInst> S, unsigned Iterations,
                            unsigned Cycles)
-    : STI(sti), MCIP(Printer), AsmSequence(S), CurrentCycle(0),
+    : STI(sti), MCIP(Printer), Source(S), CurrentCycle(0),
       MaxCycle(Cycles == 0 ? 80 : Cycles), LastCycle(0), WaitTime(S.size()),
       UsedBuffer(S.size()) {
-  unsigned NumInstructions = AsmSequence.size();
-  if (!MaxIterations)
-    MaxIterations = DEFAULT_ITERATIONS;
-  NumInstructions *= std::min(MaxIterations, AsmSequence.getNumIterations());
+  unsigned NumInstructions = Source.size();
+  assert(Iterations && "Invalid number of iterations specified!");
+  NumInstructions *= Iterations;
   Timeline.resize(NumInstructions);
   TimelineViewEntry InvalidTVEntry = {-1, 0, 0, 0, 0};
   std::fill(Timeline.begin(), Timeline.end(), InvalidTVEntry);
@@ -42,7 +41,7 @@ TimelineView::TimelineView(const MCSubtargetInfo &sti, MCInstPrinter &Printer,
 
 void TimelineView::onReservedBuffers(const InstRef &IR,
                                      ArrayRef<unsigned> Buffers) {
-  if (IR.getSourceIndex() >= AsmSequence.size())
+  if (IR.getSourceIndex() >= Source.size())
     return;
 
   const MCSchedModel &SM = STI.getSchedModel();
@@ -72,7 +71,7 @@ void TimelineView::onEvent(const HWInstructionEvent &Event) {
     // Update the WaitTime entry which corresponds to this Index.
     assert(TVEntry.CycleDispatched >= 0 && "Invalid TVEntry found!");
     unsigned CycleDispatched = static_cast<unsigned>(TVEntry.CycleDispatched);
-    WaitTimeEntry &WTEntry = WaitTime[Index % AsmSequence.size()];
+    WaitTimeEntry &WTEntry = WaitTime[Index % Source.size()];
     WTEntry.CyclesSpentInSchedulerQueue +=
         TVEntry.CycleIssued - CycleDispatched;
     assert(CycleDispatched <= TVEntry.CycleReady &&
@@ -176,9 +175,9 @@ void TimelineView::printAverageWaitTimes(raw_ostream &OS) const {
   raw_string_ostream InstrStream(Instruction);
 
   formatted_raw_ostream FOS(OS);
-  unsigned Executions = Timeline.size() / AsmSequence.size();
+  unsigned Executions = Timeline.size() / Source.size();
   unsigned IID = 0;
-  for (const MCInst &Inst : AsmSequence) {
+  for (const MCInst &Inst : Source) {
     printWaitTimeEntry(FOS, WaitTime[IID], IID, Executions);
     // Append the instruction info at the end of the line.
     MCIP.printInst(&Inst, InstrStream, "", STI);
@@ -268,14 +267,14 @@ void TimelineView::printTimeline(raw_ostream &OS) const {
   raw_string_ostream InstrStream(Instruction);
 
   unsigned IID = 0;
-  const unsigned Iterations = Timeline.size() / AsmSequence.size();
+  const unsigned Iterations = Timeline.size() / Source.size();
   for (unsigned Iteration = 0; Iteration < Iterations; ++Iteration) {
-    for (const MCInst &Inst : AsmSequence) {
+    for (const MCInst &Inst : Source) {
       const TimelineViewEntry &Entry = Timeline[IID];
       if (Entry.CycleRetired == 0)
         return;
 
-      unsigned SourceIndex = IID % AsmSequence.size();
+      unsigned SourceIndex = IID % Source.size();
       printTimelineViewEntry(FOS, Entry, Iteration, SourceIndex);
       // Append the instruction info at the end of the line.
       MCIP.printInst(&Inst, InstrStream, "", STI);
diff --git a/tools/llvm-mca/Views/TimelineView.h b/tools/llvm-mca/Views/TimelineView.h
index 361e37ac625..244d254b7f5 100644
--- a/tools/llvm-mca/Views/TimelineView.h
+++ b/tools/llvm-mca/Views/TimelineView.h
@@ -100,8 +100,9 @@
 #ifndef LLVM_TOOLS_LLVM_MCA_TIMELINEVIEW_H
 #define LLVM_TOOLS_LLVM_MCA_TIMELINEVIEW_H
 
-#include "SourceMgr.h"
 #include "Views/View.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstPrinter.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/Support/FormattedStream.h"
@@ -119,7 +120,7 @@ namespace mca {
 class TimelineView : public View {
   const llvm::MCSubtargetInfo &STI;
   llvm::MCInstPrinter &MCIP;
-  const SourceMgr &AsmSequence;
+  llvm::ArrayRef<llvm::MCInst> Source;
 
   unsigned CurrentCycle;
   unsigned MaxCycle;
@@ -166,7 +167,7 @@ class TimelineView : public View {
 
 public:
   TimelineView(const llvm::MCSubtargetInfo &sti, llvm::MCInstPrinter &Printer,
-               const SourceMgr &Sequence, unsigned MaxIterations,
+               llvm::ArrayRef<llvm::MCInst> S, unsigned Iterations,
                unsigned Cycles);
 
   // Event handlers.
diff --git a/tools/llvm-mca/include/SourceMgr.h b/tools/llvm-mca/include/SourceMgr.h
index e7cd358afd4..12713588246 100644
--- a/tools/llvm-mca/include/SourceMgr.h
+++ b/tools/llvm-mca/include/SourceMgr.h
@@ -48,9 +48,8 @@ public:
   using const_iterator = llvm::ArrayRef<llvm::MCInst>::const_iterator;
   const_iterator begin() const { return Sequence.begin(); }
   const_iterator end() const { return Sequence.end(); }
-
-  bool isEmpty() const { return size() == 0; }
 };
+
 } // namespace mca
 
 #endif
diff --git a/tools/llvm-mca/llvm-mca.cpp b/tools/llvm-mca/llvm-mca.cpp
index 9992395fb6e..b89e4bd9551 100644
--- a/tools/llvm-mca/llvm-mca.cpp
+++ b/tools/llvm-mca/llvm-mca.cpp
@@ -497,6 +497,7 @@ int main(int argc, char **argv) {
 
   // Number each region in the sequence.
   unsigned RegionIdx = 0;
+
   for (const std::unique_ptr<mca::CodeRegion> &Region : Regions) {
     // Skip empty code regions.
     if (Region->empty())
@@ -512,6 +513,7 @@ int main(int argc, char **argv) {
       TOF->os() << "\n\n";
     }
 
+    ArrayRef<MCInst> Insts = Region->getInstructions();
     mca::SourceMgr S(Region->getInstructions(),
                      PrintInstructionTables ? 1 : Iterations);
 
@@ -524,11 +526,11 @@ int main(int argc, char **argv) {
 
       // Create the views for this pipeline, execute, and emit a report.
       if (PrintInstructionInfoView) {
-        Printer.addView(
-            llvm::make_unique<mca::InstructionInfoView>(*STI, *MCII, S, *IP));
+        Printer.addView(llvm::make_unique<mca::InstructionInfoView>(
+            *STI, *MCII, Insts, *IP));
       }
       Printer.addView(
-          llvm::make_unique<mca::ResourcePressureView>(*STI, *IP, S));
+          llvm::make_unique<mca::ResourcePressureView>(*STI, *IP, Insts));
 
       if (!runPipeline(*P, *IP, *STI))
         return 1;
@@ -542,11 +544,11 @@ int main(int argc, char **argv) {
     mca::PipelinePrinter Printer(*P);
 
     if (PrintSummaryView)
-      Printer.addView(llvm::make_unique<mca::SummaryView>(SM, S, Width));
+      Printer.addView(llvm::make_unique<mca::SummaryView>(SM, Insts, Width));
 
     if (PrintInstructionInfoView)
       Printer.addView(
-          llvm::make_unique<mca::InstructionInfoView>(*STI, *MCII, S, *IP));
+          llvm::make_unique<mca::InstructionInfoView>(*STI, *MCII, Insts, *IP));
 
     if (PrintDispatchStats)
       Printer.addView(llvm::make_unique<mca::DispatchStatistics>());
@@ -562,11 +564,14 @@ int main(int argc, char **argv) {
 
     if (PrintResourcePressureView)
       Printer.addView(
-          llvm::make_unique<mca::ResourcePressureView>(*STI, *IP, S));
+          llvm::make_unique<mca::ResourcePressureView>(*STI, *IP, Insts));
 
     if (PrintTimelineView) {
+      unsigned TimelineIterations =
+          TimelineMaxIterations ? TimelineMaxIterations : 10;
       Printer.addView(llvm::make_unique<mca::TimelineView>(
-          *STI, *IP, S, TimelineMaxIterations, TimelineMaxCycles));
+          *STI, *IP, Insts, std::min(TimelineIterations, S.getNumIterations()),
+          TimelineMaxCycles));
     }
 
     if (!runPipeline(*P, *IP, *STI))
-- 
GitLab


From 500b851fc5b88268b9f09f8505dbd17405742f12 Mon Sep 17 00:00:00 2001
From: George Rimar <grimar@accesssoftek.com>
Date: Fri, 26 Oct 2018 11:25:12 +0000
Subject: [PATCH 0627/1116] [Codegen] - Implement basic .debug_loclists section
 emission (DWARF5).

.debug_loclists is the DWARF 5 version of the .debug_loc.
With that patch, it will be emitted when DWARF 5 is used.

Differential revision: https://reviews.llvm.org/D53365

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345377 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/MC/MCObjectFileInfo.h    |   3 +
 lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 184 +++++++++++++++++++-------
 lib/CodeGen/AsmPrinter/DwarfFile.h    |   9 +-
 lib/CodeGen/AsmPrinter/DwarfUnit.cpp  |   9 ++
 lib/CodeGen/AsmPrinter/DwarfUnit.h    |   3 +
 lib/MC/MCObjectFileInfo.cpp           |   5 +
 test/CodeGen/X86/debug-loclists.ll    | 142 ++++++++++++++++++++
 7 files changed, 302 insertions(+), 53 deletions(-)
 create mode 100644 test/CodeGen/X86/debug-loclists.ll

diff --git a/include/llvm/MC/MCObjectFileInfo.h b/include/llvm/MC/MCObjectFileInfo.h
index 8cf9e1cc55a..729aa23ef33 100644
--- a/include/llvm/MC/MCObjectFileInfo.h
+++ b/include/llvm/MC/MCObjectFileInfo.h
@@ -117,6 +117,8 @@ protected:
   MCSection *DwarfAddrSection;
   /// The DWARF v5 range list section.
   MCSection *DwarfRnglistsSection;
+  /// The DWARF v5 locations list section.
+  MCSection *DwarfLoclistsSection;
 
   /// The DWARF v5 range list section for fission.
   MCSection *DwarfRnglistsDWOSection;
@@ -258,6 +260,7 @@ public:
   MCSection *getDwarfARangesSection() const { return DwarfARangesSection; }
   MCSection *getDwarfRangesSection() const { return DwarfRangesSection; }
   MCSection *getDwarfRnglistsSection() const { return DwarfRnglistsSection; }
+  MCSection *getDwarfLoclistsSection() const { return DwarfLoclistsSection; }
   MCSection *getDwarfMacinfoSection() const { return DwarfMacinfoSection; }
 
   MCSection *getDwarfDebugNamesSection() const {
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 4a9ed6d03c6..3a1e54812a1 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -727,11 +727,16 @@ void DwarfDebug::beginModule() {
     (useSplitDwarf() ? SkeletonHolder : InfoHolder)
         .setStringOffsetsStartSym(Asm->createTempSymbol("str_offsets_base"));
 
-  // Create the symbol that designates the start of the DWARF v5 range list
-  // table. It is located past the header and before the offsets table.
+
+  // Create the symbols that designates the start of the DWARF v5 range list
+  // and locations list tables. They are located past the table headers.
   if (getDwarfVersion() >= 5) {
-    (useSplitDwarf() ? SkeletonHolder : InfoHolder)
-        .setRnglistsTableBaseSym(Asm->createTempSymbol("rnglists_table_base"));
+    DwarfFile &Holder = useSplitDwarf() ? SkeletonHolder : InfoHolder;
+    Holder.setRnglistsTableBaseSym(
+        Asm->createTempSymbol("rnglists_table_base"));
+    Holder.setLoclistsTableBaseSym(
+        Asm->createTempSymbol("loclists_table_base"));
+
     if (useSplitDwarf())
       InfoHolder.setRnglistsTableBaseSym(
           Asm->createTempSymbol("rnglists_dwo_table_base"));
@@ -889,8 +894,13 @@ void DwarfDebug::finalizeModuleInfo() {
       U.attachRangesOrLowHighPC(U.getUnitDie(), TheCU.takeRanges());
     }
 
-    if (getDwarfVersion() >= 5 && U.hasRangeLists())
-      U.addRnglistsBase();
+    if (getDwarfVersion() >= 5) {
+      if (U.hasRangeLists())
+        U.addRnglistsBase();
+
+      if (!DebugLocs.getLists().empty() && !useSplitDwarf())
+        U.addLoclistsBase();
+    }
 
     auto *CUNode = cast<DICompileUnit>(P.first);
     // If compile Unit has macros, emit "DW_AT_macro_info" attribute.
@@ -1925,25 +1935,119 @@ void DwarfDebug::emitDebugLocEntryLocation(const DebugLocStream::Entry &Entry) {
   emitDebugLocEntry(Streamer, Entry);
 }
 
-// Emit locations into the debug loc section.
+// Emit the common part of the DWARF 5 range/locations list tables header.
+static void emitListsTableHeaderStart(AsmPrinter *Asm, const DwarfFile &Holder,
+                                      MCSymbol *TableStart,
+                                      MCSymbol *TableEnd) {
+  // Build the table header, which starts with the length field.
+  Asm->OutStreamer->AddComment("Length");
+  Asm->EmitLabelDifference(TableEnd, TableStart, 4);
+  Asm->OutStreamer->EmitLabel(TableStart);
+  // Version number (DWARF v5 and later).
+  Asm->OutStreamer->AddComment("Version");
+  Asm->emitInt16(Asm->OutStreamer->getContext().getDwarfVersion());
+  // Address size.
+  Asm->OutStreamer->AddComment("Address size");
+  Asm->emitInt8(Asm->MAI->getCodePointerSize());
+  // Segment selector size.
+  Asm->OutStreamer->AddComment("Segment selector size");
+  Asm->emitInt8(0);
+}
+
+// Emit the header of a DWARF 5 range list table list table. Returns the symbol
+// that designates the end of the table for the caller to emit when the table is
+// complete.
+static MCSymbol *emitRnglistsTableHeader(AsmPrinter *Asm,
+                                         const DwarfFile &Holder) {
+  MCSymbol *TableStart = Asm->createTempSymbol("debug_rnglist_table_start");
+  MCSymbol *TableEnd = Asm->createTempSymbol("debug_rnglist_table_end");
+  emitListsTableHeaderStart(Asm, Holder, TableStart, TableEnd);
+
+  Asm->OutStreamer->AddComment("Offset entry count");
+  Asm->emitInt32(Holder.getRangeLists().size());
+  Asm->OutStreamer->EmitLabel(Holder.getRnglistsTableBaseSym());
+
+  for (const RangeSpanList &List : Holder.getRangeLists())
+    Asm->EmitLabelDifference(List.getSym(), Holder.getRnglistsTableBaseSym(),
+                             4);
+
+  return TableEnd;
+}
+
+// Emit the header of a DWARF 5 locations list table. Returns the symbol that
+// designates the end of the table for the caller to emit when the table is
+// complete.
+static MCSymbol *emitLoclistsTableHeader(AsmPrinter *Asm,
+                                         const DwarfFile &Holder) {
+  MCSymbol *TableStart = Asm->createTempSymbol("debug_loclist_table_start");
+  MCSymbol *TableEnd = Asm->createTempSymbol("debug_loclist_table_end");
+  emitListsTableHeaderStart(Asm, Holder, TableStart, TableEnd);
+
+  // FIXME: Generate the offsets table and use DW_FORM_loclistx with the
+  // DW_AT_loclists_base attribute. Until then set the number of offsets to 0.
+  Asm->OutStreamer->AddComment("Offset entry count");
+  Asm->emitInt32(0);
+  Asm->OutStreamer->EmitLabel(Holder.getLoclistsTableBaseSym());
+
+  return TableEnd;
+}
+
+// Emit locations into the .debug_loc/.debug_rnglists section.
 void DwarfDebug::emitDebugLoc() {
   if (DebugLocs.getLists().empty())
     return;
 
-  // Start the dwarf loc section.
-  Asm->OutStreamer->SwitchSection(
-      Asm->getObjFileLowering().getDwarfLocSection());
+  bool IsLocLists = getDwarfVersion() >= 5;
+  MCSymbol *TableEnd = nullptr;
+  if (IsLocLists) {
+    Asm->OutStreamer->SwitchSection(
+        Asm->getObjFileLowering().getDwarfLoclistsSection());
+    TableEnd = emitLoclistsTableHeader(Asm, useSplitDwarf() ? SkeletonHolder
+                                                            : InfoHolder);
+  } else {
+    Asm->OutStreamer->SwitchSection(
+        Asm->getObjFileLowering().getDwarfLocSection());
+  }
+
   unsigned char Size = Asm->MAI->getCodePointerSize();
   for (const auto &List : DebugLocs.getLists()) {
     Asm->OutStreamer->EmitLabel(List.Label);
+
     const DwarfCompileUnit *CU = List.CU;
+    const MCSymbol *Base = CU->getBaseAddress();
     for (const auto &Entry : DebugLocs.getEntries(List)) {
-      // Set up the range. This range is relative to the entry point of the
-      // compile unit. This is a hard coded 0 for low_pc when we're emitting
-      // ranges, or the DW_AT_low_pc on the compile unit otherwise.
-      if (auto *Base = CU->getBaseAddress()) {
-        Asm->EmitLabelDifference(Entry.BeginSym, Base, Size);
-        Asm->EmitLabelDifference(Entry.EndSym, Base, Size);
+      if (Base) {
+        // Set up the range. This range is relative to the entry point of the
+        // compile unit. This is a hard coded 0 for low_pc when we're emitting
+        // ranges, or the DW_AT_low_pc on the compile unit otherwise.
+        if (IsLocLists) {
+          Asm->OutStreamer->AddComment("DW_LLE_offset_pair");
+          Asm->OutStreamer->EmitIntValue(dwarf::DW_LLE_offset_pair, 1);
+          Asm->OutStreamer->AddComment("  starting offset");
+          Asm->EmitLabelDifferenceAsULEB128(Entry.BeginSym, Base);
+          Asm->OutStreamer->AddComment("  ending offset");
+          Asm->EmitLabelDifferenceAsULEB128(Entry.EndSym, Base);
+        } else {
+          Asm->EmitLabelDifference(Entry.BeginSym, Base, Size);
+          Asm->EmitLabelDifference(Entry.EndSym, Base, Size);
+        }
+
+        emitDebugLocEntryLocation(Entry);
+        continue;
+      }
+
+      // We have no base address.
+      if (IsLocLists) {
+        // TODO: Use DW_LLE_base_addressx + DW_LLE_offset_pair, or
+        // DW_LLE_startx_length in case if there is only a single range.
+        // That should reduce the size of the debug data emited.
+        // For now just use the DW_LLE_startx_length for all cases.
+        Asm->OutStreamer->AddComment("DW_LLE_startx_length");
+        Asm->emitInt8(dwarf::DW_LLE_startx_length);
+        Asm->OutStreamer->AddComment("  start idx");
+        Asm->EmitULEB128(AddrPool.getIndex(Entry.BeginSym));
+        Asm->OutStreamer->AddComment("  length");
+        Asm->EmitLabelDifferenceAsULEB128(Entry.EndSym, Entry.BeginSym);
       } else {
         Asm->OutStreamer->EmitSymbolValue(Entry.BeginSym, Size);
         Asm->OutStreamer->EmitSymbolValue(Entry.EndSym, Size);
@@ -1951,9 +2055,20 @@ void DwarfDebug::emitDebugLoc() {
 
       emitDebugLocEntryLocation(Entry);
     }
-    Asm->OutStreamer->EmitIntValue(0, Size);
-    Asm->OutStreamer->EmitIntValue(0, Size);
+
+    if (IsLocLists) {
+      // .debug_loclists section ends with DW_LLE_end_of_list.
+      Asm->OutStreamer->AddComment("DW_LLE_end_of_list");
+      Asm->OutStreamer->EmitIntValue(dwarf::DW_LLE_end_of_list, 1);
+    } else {
+      // Terminate the .debug_loc list with two 0 values.
+      Asm->OutStreamer->EmitIntValue(0, Size);
+      Asm->OutStreamer->EmitIntValue(0, Size);
+    }
   }
+
+  if (TableEnd)
+    Asm->OutStreamer->EmitLabel(TableEnd);
 }
 
 void DwarfDebug::emitDebugLocDWO() {
@@ -2232,39 +2347,6 @@ static void emitRangeList(DwarfDebug &DD, AsmPrinter *Asm,
   }
 }
 
-// Emit the header of a DWARF 5 range list table. Returns the symbol that
-// designates the end of the table for the caller to emit when the table is
-// complete.
-static MCSymbol *emitRnglistsTableHeader(AsmPrinter *Asm,
-                                         const DwarfFile &Holder) {
-  // The length is described by a starting label right after the length field
-  // and an end label.
-  MCSymbol *TableStart = Asm->createTempSymbol("debug_rnglist_table_start");
-  MCSymbol *TableEnd = Asm->createTempSymbol("debug_rnglist_table_end");
-  // Build the range table header, which starts with the length field.
-  Asm->OutStreamer->AddComment("Length");
-  Asm->EmitLabelDifference(TableEnd, TableStart, 4);
-  Asm->OutStreamer->EmitLabel(TableStart);
-  // Version number (DWARF v5 and later).
-  Asm->OutStreamer->AddComment("Version");
-  Asm->emitInt16(Asm->OutStreamer->getContext().getDwarfVersion());
-  Asm->OutStreamer->AddComment("Address size");
-  Asm->emitInt8(Asm->MAI->getCodePointerSize());
-  Asm->OutStreamer->AddComment("Segment selector size");
-  Asm->emitInt8(0);
-
-  MCSymbol *RnglistsTableBaseSym = Holder.getRnglistsTableBaseSym();
-
-  // FIXME: Generate the offsets table and use DW_FORM_rnglistx with the
-  // DW_AT_ranges attribute. Until then set the number of offsets to 0.
-  Asm->OutStreamer->AddComment("Offset entry count");
-  Asm->emitInt32(Holder.getRangeLists().size());
-  Asm->OutStreamer->EmitLabel(RnglistsTableBaseSym);
-  for (const RangeSpanList &List : Holder.getRangeLists())
-    Asm->EmitLabelDifference(List.getSym(), RnglistsTableBaseSym, 4);
-  return TableEnd;
-}
-
 void emitDebugRangesImpl(DwarfDebug &DD, AsmPrinter *Asm,
                          const DwarfFile &Holder, MCSymbol *TableEnd) {
   for (const RangeSpanList &List : Holder.getRangeLists())
diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.h b/lib/CodeGen/AsmPrinter/DwarfFile.h
index c764c6c5afb..1e5c99e26eb 100644
--- a/lib/CodeGen/AsmPrinter/DwarfFile.h
+++ b/lib/CodeGen/AsmPrinter/DwarfFile.h
@@ -89,6 +89,10 @@ class DwarfFile {
   /// The table is shared by all units.
   MCSymbol *RnglistsTableBaseSym = nullptr;
 
+  /// DWARF v5: The symbol that designates the base of the locations list table.
+  /// The table is shared by all units.
+  MCSymbol *LoclistsTableBaseSym = nullptr;
+
   /// The variables of a lexical scope.
   struct ScopeVars {
     /// We need to sort Args by ArgNo and check for duplicates. This could also
@@ -161,13 +165,14 @@ public:
   DwarfStringPool &getStringPool() { return StrPool; }
 
   MCSymbol *getStringOffsetsStartSym() const { return StringOffsetsStartSym; }
-
   void setStringOffsetsStartSym(MCSymbol *Sym) { StringOffsetsStartSym = Sym; }
 
   MCSymbol *getRnglistsTableBaseSym() const { return RnglistsTableBaseSym; }
-
   void setRnglistsTableBaseSym(MCSymbol *Sym) { RnglistsTableBaseSym = Sym; }
 
+  MCSymbol *getLoclistsTableBaseSym() const { return LoclistsTableBaseSym; }
+  void setLoclistsTableBaseSym(MCSymbol *Sym) { LoclistsTableBaseSym = Sym; }
+
   /// \returns false if the variable was merged with a previous one.
   bool addScopeVariable(LexicalScope *LS, DbgVariable *Var);
 
diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index 42aa0c933ef..2053395808f 100644
--- a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -1656,6 +1656,15 @@ void DwarfUnit::addRnglistsBase() {
                   TLOF.getDwarfRnglistsSection()->getBeginSymbol());
 }
 
+void DwarfUnit::addLoclistsBase() {
+  assert(DD->getDwarfVersion() >= 5 &&
+         "DW_AT_loclists_base requires DWARF version 5 or later");
+  const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
+  addSectionLabel(getUnitDie(), dwarf::DW_AT_loclists_base,
+                  DU->getLoclistsTableBaseSym(),
+                  TLOF.getDwarfLoclistsSection()->getBeginSymbol());
+}
+
 void DwarfUnit::addAddrTableBase() {
   const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
   MCSymbol *Label = DD->getAddressPool().getLabel();
diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.h b/lib/CodeGen/AsmPrinter/DwarfUnit.h
index 1a36ea9ec55..860d1653184 100644
--- a/lib/CodeGen/AsmPrinter/DwarfUnit.h
+++ b/lib/CodeGen/AsmPrinter/DwarfUnit.h
@@ -272,6 +272,9 @@ public:
   /// Add the DW_AT_rnglists_base attribute to the unit DIE.
   void addRnglistsBase();
 
+  /// Add the DW_AT_loclists_base attribute to the unit DIE.
+  void addLoclistsBase();
+
   /// Add the DW_AT_addr_base attribute to the unit DIE.
   void addAddrTableBase();
 
diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp
index 54d8d4e5f71..ab8e0f31db9 100644
--- a/lib/MC/MCObjectFileInfo.cpp
+++ b/lib/MC/MCObjectFileInfo.cpp
@@ -260,6 +260,10 @@ void MCObjectFileInfo::initMachOMCObjectFileInfo(const Triple &T) {
   DwarfLocSection =
       Ctx->getMachOSection("__DWARF", "__debug_loc", MachO::S_ATTR_DEBUG,
                            SectionKind::getMetadata(), "section_debug_loc");
+  DwarfLoclistsSection =
+      Ctx->getMachOSection("__DWARF", "__debug_loclists", MachO::S_ATTR_DEBUG,
+                           SectionKind::getMetadata(), "section_debug_loc");
+
   DwarfARangesSection =
       Ctx->getMachOSection("__DWARF", "__debug_aranges", MachO::S_ATTR_DEBUG,
                            SectionKind::getMetadata());
@@ -435,6 +439,7 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(const Triple &T, bool Large) {
       Ctx->getELFSection(".debug_str_offsets", DebugSecType, 0);
   DwarfAddrSection = Ctx->getELFSection(".debug_addr", DebugSecType, 0);
   DwarfRnglistsSection = Ctx->getELFSection(".debug_rnglists", DebugSecType, 0);
+  DwarfLoclistsSection = Ctx->getELFSection(".debug_loclists", DebugSecType, 0);
 
   // Fission Sections
   DwarfInfoDWOSection =
diff --git a/test/CodeGen/X86/debug-loclists.ll b/test/CodeGen/X86/debug-loclists.ll
new file mode 100644
index 00000000000..874cdc196e4
--- /dev/null
+++ b/test/CodeGen/X86/debug-loclists.ll
@@ -0,0 +1,142 @@
+; RUN: llc -mtriple=x86_64-pc-linux -filetype=obj -o %t < %s
+; RUN: llvm-dwarfdump -v %t | FileCheck %s
+
+; CHECK:      0x00000033: DW_TAG_formal_parameter [3]
+; CHECK-NEXT:               DW_AT_location [DW_FORM_sec_offset]   (0x0000000c
+; CHECK-NEXT:                  [0x0000000000000000, 0x0000000000000004): DW_OP_breg5 RDI+0
+; CHECK-NEXT:                  [0x0000000000000004, 0x0000000000000012): DW_OP_breg3 RBX+0)
+; CHECK-NEXT:               DW_AT_name [DW_FORM_strx1]    ( indexed (0000000e) string = "a")
+; CHECK-NEXT:               DW_AT_decl_file [DW_FORM_data1]       ("/home/folder{{\\|\/}}test.cc")
+; CHECK-NEXT:               DW_AT_decl_line [DW_FORM_data1]       (6)
+; CHECK-NEXT:               DW_AT_type [DW_FORM_ref4]     (cu + 0x0040 => {0x00000040} "A")
+
+; CHECK:      .debug_loclists contents:
+; CHECK-NEXT: 0x00000000: locations list header: length = 0x00000017, version = 0x0005, addr_size = 0x08, seg_size = 0x00, offset_entry_count = 0x00000000
+; CHECK-NEXT: 0x00000000:
+; CHECK-NEXT:  [0x0000000000000000, 0x0000000000000004): DW_OP_breg5 RDI+0
+; CHECK-NEXT:  [0x0000000000000004, 0x0000000000000012): DW_OP_breg3 RBX+0
+
+; There is no way to use llvm-dwarfdump atm (2018, october) to verify the DW_LLE_* codes emited,
+; because dumper is not yet implements that. Use asm code to do this check instead.
+;
+; RUN: llc -mtriple=x86_64-pc-linux -filetype=asm < %s -o - | FileCheck %s --check-prefix=ASM
+; ASM:      .section .debug_loclists,"",@progbits
+; ASM-NEXT: .long .Ldebug_loclist_table_end0-.Ldebug_loclist_table_start0 # Length
+; ASM-NEXT: .Ldebug_loclist_table_start0:
+; ASM-NEXT:  .short 5                              # Version
+; ASM-NEXT:  .byte 8                               # Address size
+; ASM-NEXT:  .byte 0                               # Segment selector size
+; ASM-NEXT:  .long 0                               # Offset entry count
+; ASM-NEXT: .Lloclists_table_base0:                
+; ASM-NEXT: .Ldebug_loc0:
+; ASM-NEXT:  .byte 4                               # DW_LLE_offset_pair
+; ASM-NEXT:  .uleb128 .Lfunc_begin0-.Lfunc_begin0  # starting offset
+; ASM-NEXT:  .uleb128 .Ltmp0-.Lfunc_begin0         # ending offset
+; ASM-NEXT:  .short 2                              # Loc expr size
+; ASM-NEXT:  .byte 117                             # DW_OP_breg5
+; ASM-NEXT:  .byte 0                               # 0
+; ASM-NEXT:  .byte 4                               # DW_LLE_offset_pair
+; ASM-NEXT:  .uleb128 .Ltmp0-.Lfunc_begin0         # starting offset
+; ASM-NEXT:  .uleb128 .Ltmp1-.Lfunc_begin0         # ending offset
+; ASM-NEXT:  .short 2                              # Loc expr size
+; ASM-NEXT:  .byte 115                             # DW_OP_breg3
+; ASM-NEXT:  .byte 0                               # 0
+; ASM-NEXT:  .byte 0                               # DW_LLE_end_of_list
+; ASM-NEXT: .Ldebug_loclist_table_end0:
+
+; ModuleID = 'test.cc'
+source_filename = "test.cc"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.A = type { i32 (...)** }
+
+@_ZTV1A = dso_local unnamed_addr constant { [4 x i8*] } { [4 x i8*] [i8* null, i8* bitcast ({ i8*, i8* }* @_ZTI1A to i8*), i8* bitcast (void (%struct.A*)* @_ZN1A3fooEv to i8*), i8* bitcast (void (%struct.A*)* @_ZN1A3barEv to i8*)] }, align 8
+@_ZTVN10__cxxabiv117__class_type_infoE = external dso_local global i8*
+@_ZTS1A = dso_local constant [3 x i8] c"1A\00", align 1
+@_ZTI1A = dso_local constant { i8*, i8* } { i8* bitcast (i8** getelementptr inbounds (i8*, i8** @_ZTVN10__cxxabiv117__class_type_infoE, i64 2) to i8*), i8* getelementptr inbounds ([3 x i8], [3 x i8]* @_ZTS1A, i32 0, i32 0) }, align 8
+
+; Function Attrs: noinline optnone uwtable
+define dso_local void @_Z3baz1A(%struct.A* %a) #0 !dbg !7 {
+entry:
+  call void @llvm.dbg.declare(metadata %struct.A* %a, metadata !23, metadata !DIExpression()), !dbg !24
+  call void @_ZN1A3fooEv(%struct.A* %a), !dbg !25
+  call void @_ZN1A3barEv(%struct.A* %a), !dbg !26
+  ret void, !dbg !27
+}
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+; Function Attrs: noinline nounwind optnone uwtable
+define dso_local void @_ZN1A3fooEv(%struct.A* %this) unnamed_addr #2 align 2 !dbg !28 {
+entry:
+  %this.addr = alloca %struct.A*, align 8
+  store %struct.A* %this, %struct.A** %this.addr, align 8
+  call void @llvm.dbg.declare(metadata %struct.A** %this.addr, metadata !29, metadata !DIExpression()), !dbg !31
+  %this1 = load %struct.A*, %struct.A** %this.addr, align 8
+  ret void, !dbg !32
+}
+
+; Function Attrs: noinline nounwind optnone uwtable
+define dso_local void @_ZN1A3barEv(%struct.A* %this) unnamed_addr #2 align 2 !dbg !33 {
+entry:
+  %this.addr = alloca %struct.A*, align 8
+  store %struct.A* %this, %struct.A** %this.addr, align 8
+  call void @llvm.dbg.declare(metadata %struct.A** %this.addr, metadata !34, metadata !DIExpression()), !dbg !35
+  %this1 = load %struct.A*, %struct.A** %this.addr, align 8
+  ret void, !dbg !36
+}
+
+; Function Attrs: noinline norecurse nounwind optnone uwtable
+define dso_local i32 @main() #3 !dbg !37 {
+entry:
+  %retval = alloca i32, align 4
+  store i32 0, i32* %retval, align 4
+  ret i32 0, !dbg !38
+}
+
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 8.0.0 (trunk 344035)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None)
+!1 = !DIFile(filename: "test.cc", directory: "/home/folder", checksumkind: CSK_MD5, checksum: "e0f357ad6dcb791a774a0dae55baf5e7")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 5}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{!"clang version 8.0.0 (trunk 344035)"}
+!7 = distinct !DISubprogram(name: "baz", linkageName: "_Z3baz1A", scope: !1, file: !1, line: 6, type: !8, isLocal: false, isDefinition: true, scopeLine: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
+!8 = !DISubroutineType(types: !9)
+!9 = !{null, !10}
+!10 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "A", file: !1, line: 1, size: 64, flags: DIFlagTypePassByReference, elements: !11, vtableHolder: !10, identifier: "_ZTS1A")
+!11 = !{!12, !18, !22}
+!12 = !DIDerivedType(tag: DW_TAG_member, name: "_vptr$A", scope: !1, file: !1, baseType: !13, size: 64, flags: DIFlagArtificial)
+!13 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !14, size: 64)
+!14 = !DIDerivedType(tag: DW_TAG_pointer_type, name: "__vtbl_ptr_type", baseType: !15, size: 64)
+!15 = !DISubroutineType(types: !16)
+!16 = !{!17}
+!17 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!18 = !DISubprogram(name: "foo", linkageName: "_ZN1A3fooEv", scope: !10, file: !1, line: 2, type: !19, isLocal: false, isDefinition: false, scopeLine: 2, containingType: !10, virtuality: DW_VIRTUALITY_virtual, virtualIndex: 0, flags: DIFlagPrototyped, isOptimized: false)
+!19 = !DISubroutineType(types: !20)
+!20 = !{null, !21}
+!21 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 64, flags: DIFlagArtificial | DIFlagObjectPointer)
+!22 = !DISubprogram(name: "bar", linkageName: "_ZN1A3barEv", scope: !10, file: !1, line: 3, type: !19, isLocal: false, isDefinition: false, scopeLine: 3, containingType: !10, virtuality: DW_VIRTUALITY_virtual, virtualIndex: 1, flags: DIFlagPrototyped, isOptimized: false)
+!23 = !DILocalVariable(name: "a", arg: 1, scope: !7, file: !1, line: 6, type: !10)
+!24 = !DILocation(line: 6, column: 19, scope: !7)
+!25 = !DILocation(line: 7, column: 6, scope: !7)
+!26 = !DILocation(line: 8, column: 6, scope: !7)
+!27 = !DILocation(line: 9, column: 1, scope: !7)
+!28 = distinct !DISubprogram(name: "foo", linkageName: "_ZN1A3fooEv", scope: !10, file: !1, line: 12, type: !19, isLocal: false, isDefinition: true, scopeLine: 12, flags: DIFlagPrototyped, isOptimized: false, unit: !0, declaration: !18, retainedNodes: !2)
+!29 = !DILocalVariable(name: "this", arg: 1, scope: !28, type: !30, flags: DIFlagArtificial | DIFlagObjectPointer)
+!30 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 64)
+!31 = !DILocation(line: 0, scope: !28)
+!32 = !DILocation(line: 12, column: 16, scope: !28)
+!33 = distinct !DISubprogram(name: "bar", linkageName: "_ZN1A3barEv", scope: !10, file: !1, line: 13, type: !19, isLocal: false, isDefinition: true, scopeLine: 13, flags: DIFlagPrototyped, isOptimized: false, unit: !0, declaration: !22, retainedNodes: !2)
+!34 = !DILocalVariable(name: "this", arg: 1, scope: !33, type: !30, flags: DIFlagArtificial | DIFlagObjectPointer)
+!35 = !DILocation(line: 0, scope: !33)
+!36 = !DILocation(line: 13, column: 16, scope: !33)
+!37 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 15, type: !15, isLocal: false, isDefinition: true, scopeLine: 15, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
+!38 = !DILocation(line: 16, column: 3, scope: !37)
-- 
GitLab


From d362f0fbbba60765a35260d349608f382ffaa0ed Mon Sep 17 00:00:00 2001
From: Sam McCall <sam.mccall@gmail.com>
Date: Fri, 26 Oct 2018 12:19:48 +0000
Subject: [PATCH 0628/1116] [llvm-mca] Fix -wreorder and -Wunused-private-field
 after r345376. NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345378 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-mca/Views/SummaryView.cpp | 4 ++--
 tools/llvm-mca/Views/TimelineView.h  | 2 --
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/tools/llvm-mca/Views/SummaryView.cpp b/tools/llvm-mca/Views/SummaryView.cpp
index a509818e6d7..2007746b81f 100644
--- a/tools/llvm-mca/Views/SummaryView.cpp
+++ b/tools/llvm-mca/Views/SummaryView.cpp
@@ -26,8 +26,8 @@ using namespace llvm;
 
 SummaryView::SummaryView(const MCSchedModel &Model, ArrayRef<MCInst> S,
                          unsigned Width)
-    : SM(Model), Source(S), DispatchWidth(Width), TotalCycles(0),
-      LastInstructionIdx(0), NumMicroOps(0),
+    : SM(Model), Source(S), DispatchWidth(Width), LastInstructionIdx(0),
+      TotalCycles(0), NumMicroOps(0),
       ProcResourceUsage(Model.getNumProcResourceKinds(), 0) {
   computeProcResourceMasks(SM, ProcResourceMasks);
 }
diff --git a/tools/llvm-mca/Views/TimelineView.h b/tools/llvm-mca/Views/TimelineView.h
index 244d254b7f5..9b39a98c74a 100644
--- a/tools/llvm-mca/Views/TimelineView.h
+++ b/tools/llvm-mca/Views/TimelineView.h
@@ -153,8 +153,6 @@ class TimelineView : public View {
                           const WaitTimeEntry &E, unsigned Index,
                           unsigned Executions) const;
 
-  const unsigned DEFAULT_ITERATIONS = 10;
-
   // Display characters for the TimelineView report output.
   struct DisplayChar {
     static const char Dispatched = 'D';
-- 
GitLab


From 6080e680f1f9a7fee12bead3444f919072cea11f Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 26 Oct 2018 12:33:56 +0000
Subject: [PATCH 0629/1116] Regenerate test

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345379 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/i256-add.ll | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/CodeGen/X86/i256-add.ll b/test/CodeGen/X86/i256-add.ll
index 85a885a4315..23973bca7d5 100644
--- a/test/CodeGen/X86/i256-add.ll
+++ b/test/CodeGen/X86/i256-add.ll
@@ -12,7 +12,7 @@ define void @add(i256* %p, i256* %q) nounwind {
 ; X32-NEXT:    subl $8, %esp
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl 28(%eax), %ecx
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl 24(%eax), %ecx
 ; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X32-NEXT:    movl 20(%eax), %esi
@@ -30,7 +30,7 @@ define void @add(i256* %p, i256* %q) nounwind {
 ; X32-NEXT:    adcl %esi, 20(%eax)
 ; X32-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X32-NEXT:    adcl %ecx, 24(%eax)
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    adcl %ecx, 28(%eax)
 ; X32-NEXT:    addl $8, %esp
 ; X32-NEXT:    popl %esi
@@ -66,7 +66,7 @@ define void @sub(i256* %p, i256* %q) nounwind {
 ; X32-NEXT:    subl $8, %esp
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl 28(%eax), %ecx
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl 24(%eax), %ecx
 ; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X32-NEXT:    movl 20(%eax), %esi
@@ -84,7 +84,7 @@ define void @sub(i256* %p, i256* %q) nounwind {
 ; X32-NEXT:    sbbl %esi, 20(%eax)
 ; X32-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X32-NEXT:    sbbl %ecx, 24(%eax)
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    sbbl %ecx, 28(%eax)
 ; X32-NEXT:    addl $8, %esp
 ; X32-NEXT:    popl %esi
-- 
GitLab


From 92794ee479fb45ae867c2dc006051b87b03f0c4d Mon Sep 17 00:00:00 2001
From: Scott Linder <scott@scottlinder.com>
Date: Fri, 26 Oct 2018 13:18:36 +0000
Subject: [PATCH 0630/1116] [AMDGPU] Add a pass to promote bitcast calls

AMDGPU currently only supports direct calls, but at lower optimisation levels it
fails to lower statically direct calls which appear indirect due to a bitcast.

Add a pass to visit all CallSites and use CallPromotionUtils to "devirtualize"
calls.

Differential Revision: https://reviews.llvm.org/D52741


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345382 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/AMDGPU.h                    |   4 +
 .../AMDGPU/AMDGPUFixFunctionBitcasts.cpp      |  63 ++++++++
 lib/Target/AMDGPU/AMDGPUTargetMachine.cpp     |   6 +
 lib/Target/AMDGPU/CMakeLists.txt              |   1 +
 test/CodeGen/AMDGPU/call-constexpr.ll         | 140 ++++++++++++++++++
 .../AMDGPU/promote-alloca-bitcast-function.ll |  14 +-
 test/CodeGen/AMDGPU/unsupported-calls.ll      |   2 +-
 7 files changed, 222 insertions(+), 8 deletions(-)
 create mode 100644 lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp
 create mode 100644 test/CodeGen/AMDGPU/call-constexpr.ll

diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
index 5e8a402fb6e..457ec9f9a95 100644
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@@ -77,6 +77,10 @@ ModulePass *createAMDGPULowerIntrinsicsPass();
 void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
 extern char &AMDGPULowerIntrinsicsID;
 
+ModulePass *createAMDGPUFixFunctionBitcastsPass();
+void initializeAMDGPUFixFunctionBitcastsPass(PassRegistry &);
+extern char &AMDGPUFixFunctionBitcastsID;
+
 FunctionPass *createAMDGPULowerKernelArgumentsPass();
 void initializeAMDGPULowerKernelArgumentsPass(PassRegistry &);
 extern char &AMDGPULowerKernelArgumentsID;
diff --git a/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp b/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp
new file mode 100644
index 00000000000..6e2a981d339
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp
@@ -0,0 +1,63 @@
+//===-- AMDGPUFixFunctionBitcasts.cpp - Fix function bitcasts -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Promote indirect (bitcast) calls to direct calls when they are statically
+/// known to be direct. Required when InstCombine is not run (e.g. at OptNone)
+/// because AMDGPU does not support indirect calls.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/Transforms/Utils/CallPromotionUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-fix-function-bitcasts"
+
+namespace {
+class AMDGPUFixFunctionBitcasts final
+    : public ModulePass,
+      public InstVisitor<AMDGPUFixFunctionBitcasts> {
+
+  bool runOnModule(Module &M) override;
+
+  bool Modified;
+
+public:
+  void visitCallSite(CallSite CS) {
+    if (CS.getCalledFunction())
+      return;
+    auto Callee = dyn_cast<Function>(CS.getCalledValue()->stripPointerCasts());
+    if (Callee && isLegalToPromote(CS, Callee)) {
+      promoteCall(CS, Callee);
+      Modified = true;
+    }
+  }
+
+  static char ID;
+  AMDGPUFixFunctionBitcasts() : ModulePass(ID) {}
+};
+} // End anonymous namespace
+
+char AMDGPUFixFunctionBitcasts::ID = 0;
+char &llvm::AMDGPUFixFunctionBitcastsID = AMDGPUFixFunctionBitcasts::ID;
+INITIALIZE_PASS(AMDGPUFixFunctionBitcasts, DEBUG_TYPE,
+                "Fix function bitcasts for AMDGPU", false, false)
+
+ModulePass *llvm::createAMDGPUFixFunctionBitcastsPass() {
+  return new AMDGPUFixFunctionBitcasts();
+}
+
+bool AMDGPUFixFunctionBitcasts::runOnModule(Module &M) {
+  Modified = false;
+  visit(M);
+  return Modified;
+}
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index ef54100a9c4..6d39c254c73 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -166,6 +166,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
   initializeSIShrinkInstructionsPass(*PR);
   initializeSIOptimizeExecMaskingPreRAPass(*PR);
   initializeSILoadStoreOptimizerPass(*PR);
+  initializeAMDGPUFixFunctionBitcastsPass(*PR);
   initializeAMDGPUAlwaysInlinePass(*PR);
   initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
   initializeAMDGPUAnnotateUniformValuesPass(*PR);
@@ -611,6 +612,11 @@ void AMDGPUPassConfig::addIRPasses() {
   disablePass(&PatchableFunctionID);
 
   addPass(createAtomicExpandPass());
+
+  // This must occur before inlining, as the inliner will not look through
+  // bitcast calls.
+  addPass(createAMDGPUFixFunctionBitcastsPass());
+
   addPass(createAMDGPULowerIntrinsicsPass());
 
   // Function calls are not supported, so make sure we inline everything.
diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt
index 5af27cd1d8c..3c87dc18827 100644
--- a/lib/Target/AMDGPU/CMakeLists.txt
+++ b/lib/Target/AMDGPU/CMakeLists.txt
@@ -40,6 +40,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUAtomicOptimizer.cpp
   AMDGPUCallLowering.cpp
   AMDGPUCodeGenPrepare.cpp
+  AMDGPUFixFunctionBitcasts.cpp
   AMDGPUFrameLowering.cpp
   AMDGPUHSAMetadataStreamer.cpp
   AMDGPUInstrInfo.cpp
diff --git a/test/CodeGen/AMDGPU/call-constexpr.ll b/test/CodeGen/AMDGPU/call-constexpr.ll
new file mode 100644
index 00000000000..e0a39680bdf
--- /dev/null
+++ b/test/CodeGen/AMDGPU/call-constexpr.ll
@@ -0,0 +1,140 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-fix-function-bitcasts < %s | FileCheck -check-prefix=OPT %s
+
+; GCN-LABEL: {{^}}test_bitcast_return_type_noinline:
+; GCN: s_getpc_b64
+; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, ret_i32_noinline@rel32@lo+4
+; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, ret_i32_noinline@rel32@hi+4
+; GCN: s_swappc_b64
+; OPT-LABEL: @test_bitcast_return_type_noinline(
+; OPT: %val = call i32 @ret_i32_noinline()
+; OPT: bitcast i32 %val to float
+define amdgpu_kernel void @test_bitcast_return_type_noinline() #0 {
+  %val = call float bitcast (i32()* @ret_i32_noinline to float()*)()
+  %op = fadd float %val, 1.0
+  store volatile float %op, float addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_bitcast_return_type_alwaysinline:
+; GCN-NOT: s_getpc_b64
+; GCN-NOT: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, ret_i32_alwaysinline@rel32@lo+4
+; GCN-NOT: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, ret_i32_alwaysinline@rel32@hi+4
+; GCN-NOT: s_swappc_b64
+; OPT-LABEL: @test_bitcast_return_type_alwaysinline(
+; OPT: %val = call i32 @ret_i32_alwaysinline()
+; OPT: bitcast i32 %val to float
+define amdgpu_kernel void @test_bitcast_return_type_alwaysinline() #0 {
+  %val = call float bitcast (i32()* @ret_i32_alwaysinline to float()*)()
+  %op = fadd float %val, 1.0
+  store volatile float %op, float addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_bitcast_argument_type:
+; GCN: s_getpc_b64
+; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@lo+4
+; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@hi+4
+; GCN: s_swappc_b64
+; OPT-LABEL: @test_bitcast_argument_type(
+; OPT: %1 = bitcast float 2.000000e+00 to i32
+; OPT: %val = call i32 @ident_i32(i32 %1)
+; OPT-NOT: bitcast i32 %val to float
+define amdgpu_kernel void @test_bitcast_argument_type() #0 {
+  %val = call i32 bitcast (i32(i32)* @ident_i32 to i32(float)*)(float 2.0)
+  %op = add i32 %val, 1
+  store volatile i32 %op, i32 addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_bitcast_argument_and_return_types:
+; GCN: s_getpc_b64
+; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@lo+4
+; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@hi+4
+; GCN: s_swappc_b64
+; OPT-LABEL: @test_bitcast_argument_and_return_types(
+; OPT: %1 = bitcast float 2.000000e+00 to i32
+; OPT: %val = call i32 @ident_i32(i32 %1)
+; OPT: bitcast i32 %val to float
+define amdgpu_kernel void @test_bitcast_argument_and_return_types() #0 {
+  %val = call float bitcast (i32(i32)* @ident_i32 to float(float)*)(float 2.0)
+  %op = fadd float %val, 1.0
+  store volatile float %op, float addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}use_workitem_id_x:
+; GCN: s_waitcnt
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; GCN-NEXT: s_setpc_b64
+define i32 @use_workitem_id_x(i32 %arg0) #0 {
+  %id = call i32 @llvm.amdgcn.workitem.id.x()
+  %op = add i32 %id, %arg0
+  ret i32 %op
+}
+
+; GCN-LABEL: {{^}}test_bitcast_use_workitem_id_x:
+; GCN: v_mov_b32_e32 v1, v0
+; GCN: s_getpc_b64
+; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, use_workitem_id_x@rel32@lo+4
+; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, use_workitem_id_x@rel32@hi+4
+; GCN: v_mov_b32_e32 v0, 9
+; GCN: s_swappc_b64
+; GCN: v_add_f32_e32
+; OPT-LABEL: @use_workitem_id_x(
+; OPT: %val = call i32 @use_workitem_id_x(i32 9)
+; OPT: bitcast i32 %val to float
+define amdgpu_kernel void @test_bitcast_use_workitem_id_x() #0 {
+  %val = call float bitcast (i32(i32)* @use_workitem_id_x to float(i32)*)(i32 9)
+  %op = fadd float %val, 1.0
+  store volatile float %op, float addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_invoke:
+; GCN: s_getpc_b64
+; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@lo+4
+; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@hi+4
+; GCN: s_swappc_b64
+; OPT-LABEL: @test_invoke(
+; OPT: %1 = bitcast float 2.000000e+00 to i32
+; OPT: %val = invoke i32 @ident_i32(i32 %1)
+; OPT-NEXT: to label %continue unwind label %broken
+; OPT-LABEL: continue.split:
+; OPT: bitcast i32 %val to float
+@_ZTIi = external global i8*
+declare i32 @__gxx_personality_v0(...)
+define amdgpu_kernel void @test_invoke() #0 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+  %val = invoke float bitcast (i32(i32)* @ident_i32 to float(float)*)(float 2.0)
+          to label %continue unwind label %broken
+
+broken:
+  landingpad { i8*, i32 } catch i8** @_ZTIi
+  ret void
+
+continue:
+  %op = fadd float %val, 1.0
+  store volatile float %op, float addrspace(1)* undef
+  ret void
+}
+
+; Callees appears last in source file to test that we still lower their
+; arguments before we lower any calls to them.
+
+define i32 @ret_i32_noinline() #0 {
+  ret i32 4
+}
+
+define i32 @ret_i32_alwaysinline() #1 {
+  ret i32 4
+}
+
+define i32 @ident_i32(i32 %i) #0 {
+  ret i32 %i
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #2
+
+attributes #0 = { nounwind noinline }
+attributes #1 = { alwaysinline nounwind }
+attributes #2 = { nounwind readnone speculatable }
diff --git a/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll b/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll
index 19e89ce97a9..5d8863f4337 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll
@@ -1,8 +1,4 @@
-; RUN: not llc -march=amdgcn < %s 2>&1 | FileCheck %s
-
-; FIXME: Error is misleading because it's not an indirect call.
-
-; CHECK: error: <unknown>:0:0: in function crash_call_constexpr_cast void (): unsupported indirect call to function foo
+; RUN: opt -data-layout=A5 -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck %s
 
 ; Make sure that AMDGPUPromoteAlloca doesn't crash if the called
 ; function is a constantexpr cast of a function.
@@ -10,14 +6,18 @@
 declare void @foo(float addrspace(5)*) #0
 declare void @foo.varargs(...) #0
 
-; XCHECK: in function crash_call_constexpr_cast{{.*}}: unsupported call to function foo
+; CHECK-LABEL: @crash_call_constexpr_cast(
+; CHECK: alloca
+; CHECK: call void
 define amdgpu_kernel void @crash_call_constexpr_cast() #0 {
   %alloca = alloca i32, addrspace(5)
   call void bitcast (void (float addrspace(5)*)* @foo to void (i32 addrspace(5)*)*)(i32 addrspace(5)* %alloca) #0
   ret void
 }
 
-; XCHECK: in function crash_call_constexpr_cast{{.*}}: unsupported call to function foo.varargs
+; CHECK-LABEL: @crash_call_constexpr_cast_varargs(
+; CHECK: alloca
+; CHECK: call void
 define amdgpu_kernel void @crash_call_constexpr_cast_varargs() #0 {
   %alloca = alloca i32, addrspace(5)
   call void bitcast (void (...)* @foo.varargs to void (i32 addrspace(5)*)*)(i32 addrspace(5)* %alloca) #0
diff --git a/test/CodeGen/AMDGPU/unsupported-calls.ll b/test/CodeGen/AMDGPU/unsupported-calls.ll
index 2b6e15b79a4..303a0d6a114 100644
--- a/test/CodeGen/AMDGPU/unsupported-calls.ll
+++ b/test/CodeGen/AMDGPU/unsupported-calls.ll
@@ -53,7 +53,7 @@ define void @test_call_varargs() {
 
 declare i32 @extern_variadic(...)
 
-; GCN: in function test_tail_call_bitcast_extern_variadic{{.*}}: unsupported indirect call to function extern_variadic
+; GCN: in function test_tail_call_bitcast_extern_variadic{{.*}}: unsupported call to variadic function extern_variadic
 ; R600: in function test_tail_call_bitcast_extern_variadic{{.*}}: unsupported call to function extern_variadic
 define i32 @test_tail_call_bitcast_extern_variadic(<4 x float> %arg0, <4 x float> %arg1, i32 %arg2) {
   %add = fadd <4 x float> %arg0, %arg1
-- 
GitLab


From 0cb7fad74cf71229ac3e92d27222c395f97da8cb Mon Sep 17 00:00:00 2001
From: Owen Reynolds <gbreynoo@gmail.com>
Date: Fri, 26 Oct 2018 13:34:38 +0000
Subject: [PATCH 0631/1116] [llvm-ar] Access ADDLIB in llvm-ar via command line

ADDLIB is called to add the contents of an archive to another archive.
Previously this was only accessible through the use of an MRI script.

With the use of a new "L" modifier, archive files can treated in the
manner above when using quick append.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345383 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/tools/llvm-ar/Inputs/add-lib1.yaml |   30 +
 test/tools/llvm-ar/Inputs/add-lib2.yaml |   30 +
 test/tools/llvm-ar/Inputs/add-lib3.yaml |   30 +
 test/tools/llvm-ar/add-library.test     |   43 +
 tools/llvm-ar/llvm-ar.cpp               | 1997 ++++++++++++-----------
 5 files changed, 1151 insertions(+), 979 deletions(-)
 create mode 100644 test/tools/llvm-ar/Inputs/add-lib1.yaml
 create mode 100644 test/tools/llvm-ar/Inputs/add-lib2.yaml
 create mode 100644 test/tools/llvm-ar/Inputs/add-lib3.yaml
 create mode 100644 test/tools/llvm-ar/add-library.test

diff --git a/test/tools/llvm-ar/Inputs/add-lib1.yaml b/test/tools/llvm-ar/Inputs/add-lib1.yaml
new file mode 100644
index 00000000000..7ae9fd95a62
--- /dev/null
+++ b/test/tools/llvm-ar/Inputs/add-lib1.yaml
@@ -0,0 +1,30 @@
+--- !ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_REL
+  Machine:         EM_X86_64
+Sections:
+  - Name:            .text
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    AddressAlign:    0x0000000000000004
+    Content:         ''
+  - Name:            .comment
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_MERGE, SHF_STRINGS ]
+    AddressAlign:    0x0000000000000001
+    Content:         00636C616E672076657273696F6E20332E392E3020287472756E6B203237333632342920286C6C766D2F7472756E6B203237333633362900
+  - Name:            .note.GNU-stack
+    Type:            SHT_PROGBITS
+    AddressAlign:    0x0000000000000001
+    Content:         ''
+Symbols:
+  Global:
+    - Name:     lib1
+      Index:    SHN_ABS
+      Value:    0x1234
+  Local:
+    - Name:            '-'
+      Type:            STT_FILE
+...
diff --git a/test/tools/llvm-ar/Inputs/add-lib2.yaml b/test/tools/llvm-ar/Inputs/add-lib2.yaml
new file mode 100644
index 00000000000..8d224b95a4d
--- /dev/null
+++ b/test/tools/llvm-ar/Inputs/add-lib2.yaml
@@ -0,0 +1,30 @@
+--- !ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_REL
+  Machine:         EM_X86_64
+Sections:
+  - Name:            .text
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    AddressAlign:    0x0000000000000004
+    Content:         ''
+  - Name:            .comment
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_MERGE, SHF_STRINGS ]
+    AddressAlign:    0x0000000000000001
+    Content:         00636C616E672076657273696F6E20332E392E3020287472756E6B203237333632342920286C6C766D2F7472756E6B203237333633362900
+  - Name:            .note.GNU-stack
+    Type:            SHT_PROGBITS
+    AddressAlign:    0x0000000000000001
+    Content:         ''
+Symbols:
+  Global:
+    - Name:     lib2
+      Index:    SHN_ABS
+      Value:    0x1234
+  Local:
+    - Name:            '-'
+      Type:            STT_FILE
+...
diff --git a/test/tools/llvm-ar/Inputs/add-lib3.yaml b/test/tools/llvm-ar/Inputs/add-lib3.yaml
new file mode 100644
index 00000000000..0f1cfe7d806
--- /dev/null
+++ b/test/tools/llvm-ar/Inputs/add-lib3.yaml
@@ -0,0 +1,30 @@
+--- !ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_REL
+  Machine:         EM_X86_64
+Sections:
+  - Name:            .text
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    AddressAlign:    0x0000000000000004
+    Content:         ''
+  - Name:            .comment
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_MERGE, SHF_STRINGS ]
+    AddressAlign:    0x0000000000000001
+    Content:         00636C616E672076657273696F6E20332E392E3020287472756E6B203237333632342920286C6C766D2F7472756E6B203237333633362900
+  - Name:            .note.GNU-stack
+    Type:            SHT_PROGBITS
+    AddressAlign:    0x0000000000000001
+    Content:         ''
+Symbols:
+  Global:
+    - Name:     lib3
+      Index:    SHN_ABS
+      Value:    0x1234
+  Local:
+    - Name:            '-'
+      Type:            STT_FILE
+...
diff --git a/test/tools/llvm-ar/add-library.test b/test/tools/llvm-ar/add-library.test
new file mode 100644
index 00000000000..bd44a7e9a4a
--- /dev/null
+++ b/test/tools/llvm-ar/add-library.test
@@ -0,0 +1,43 @@
+RUN: yaml2obj %S/Inputs/add-lib1.yaml -o %t-add-lib1.o
+RUN: yaml2obj %S/Inputs/add-lib2.yaml -o %t-add-lib2.o
+RUN: yaml2obj %S/Inputs/add-lib2.yaml -o %t-add-lib3.o
+
+RUN: rm -f %t.ar
+RUN: llvm-ar crs %t.ar %t-add-lib1.o
+RUN: llvm-ar cqs %t.ar %t-add-lib2.o
+
+RUN: llvm-ar tv %t.ar | FileCheck %s --check-prefix=CHECK-NAMES-NO-ADDLIB
+CHECK-NAMES-NO-ADDLIB: add-library.test.tmp-add-lib1.o
+CHECK-NAMES-NO-ADDLIB: add-library.test.tmp-add-lib2.o
+
+RUN: llvm-nm %t.ar | FileCheck %s --check-prefix=CHECK-SYMBOLS-NO-ADDLIB
+CHECK-SYMBOLS-NO-ADDLIB: add-lib1
+CHECK-SYMBOLS-NO-ADDLIB: add-lib2
+
+RUN: rm -f %t1.ar
+RUN: llvm-ar crs %t1.ar %t-add-lib3.o
+RUN: llvm-ar cqLs %t1.ar %t.ar
+
+RUN: llvm-ar tv %t1.ar | FileCheck %s --check-prefix=CHECK-NAMES-ADDLIB
+CHECK-NAMES-ADDLIB: add-library.test.tmp-add-lib3.o
+CHECK-NAMES-ADDLIB: add-library.test.tmp-add-lib1.o
+CHECK-NAMES-ADDLIB: add-library.test.tmp-add-lib2.o
+
+RUN: llvm-nm %t1.ar | FileCheck %s --check-prefix=CHECK-SYMBOLS-ADDLIB
+CHECK-SYMBOLS-ADDLIB: add-lib3
+CHECK-SYMBOLS-ADDLIB: add-lib1
+CHECK-SYMBOLS-ADDLIB: add-lib2
+
+RUN: llvm-ar cqLs %t1.ar %t-add-lib1.o
+
+RUN: llvm-ar tv %t1.ar | FileCheck %s --check-prefix=CHECK-NAMES-DUPLICATE
+CHECK-NAMES-DUPLICATE: add-library.test.tmp-add-lib3.o
+CHECK-NAMES-DUPLICATE: add-library.test.tmp-add-lib1.o
+CHECK-NAMES-DUPLICATE: add-library.test.tmp-add-lib2.o
+CHECK-NAMES-DUPLICATE: add-library.test.tmp-add-lib1.o
+
+RUN: llvm-nm %t1.ar | FileCheck %s --check-prefix=CHECK-SYMBOLS-DUPLICATE
+CHECK-SYMBOLS-DUPLICATE: add-lib3
+CHECK-SYMBOLS-DUPLICATE: add-lib1
+CHECK-SYMBOLS-DUPLICATE: add-lib2
+CHECK-SYMBOLS-DUPLICATE: add-lib1
diff --git a/tools/llvm-ar/llvm-ar.cpp b/tools/llvm-ar/llvm-ar.cpp
index 454b3971d28..2c6dc8fad92 100644
--- a/tools/llvm-ar/llvm-ar.cpp
+++ b/tools/llvm-ar/llvm-ar.cpp
@@ -1,979 +1,1018 @@
-//===-- llvm-ar.cpp - LLVM archive librarian utility ----------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// Builds up (relatively) standard unix archive files (.a) containing LLVM
-// bitcode or other files.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/StringSwitch.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/Object/Archive.h"
-#include "llvm/Object/ArchiveWriter.h"
-#include "llvm/Object/MachO.h"
-#include "llvm/Object/ObjectFile.h"
-#include "llvm/Support/Chrono.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Errc.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/InitLLVM.h"
-#include "llvm/Support/LineIterator.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/Path.h"
-#include "llvm/Support/Process.h"
-#include "llvm/Support/StringSaver.h"
-#include "llvm/Support/TargetSelect.h"
-#include "llvm/Support/ToolOutputFile.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/ToolDrivers/llvm-dlltool/DlltoolDriver.h"
-#include "llvm/ToolDrivers/llvm-lib/LibDriver.h"
-
-#if !defined(_MSC_VER) && !defined(__MINGW32__)
-#include <unistd.h>
-#else
-#include <io.h>
-#endif
-
-using namespace llvm;
-
-// The name this program was invoked as.
-static StringRef ToolName;
-
-// The basename of this program.
-static StringRef Stem;
-
-const char RanlibHelp[] = R"(
-OVERVIEW: LLVM Ranlib (llvm-ranlib)
-
-  This program generates an index to speed access to archives
-
-USAGE: llvm-ranlib <archive-file>
-
-OPTIONS:
-  -help                             - Display available options
-  -version                          - Display the version of this program
-)";
-
-const char ArHelp[] = R"(
-OVERVIEW: LLVM Archiver
-
-USAGE: llvm-ar [options] [-]<operation>[modifiers] [relpos] <archive> [files]
-       llvm-ar -M [<mri-script]
-
-OPTIONS:
-  --format              - Archive format to create
-    =default            -   default
-    =gnu                -   gnu
-    =darwin             -   darwin
-    =bsd                -   bsd
-  --plugin=<string>     - Ignored for compatibility
-  --help                - Display available options
-  --version             - Display the version of this program
-
-OPERATIONS:
-  d - delete [files] from the archive
-  m - move [files] in the archive
-  p - print [files] found in the archive
-  q - quick append [files] to the archive
-  r - replace or insert [files] into the archive
-  s - act as ranlib
-  t - display contents of archive
-  x - extract [files] from the archive
-
-MODIFIERS:
-  [a] - put [files] after [relpos]
-  [b] - put [files] before [relpos] (same as [i])
-  [c] - do not warn if archive had to be created
-  [D] - use zero for timestamps and uids/gids (default)
-  [i] - put [files] before [relpos] (same as [b])
-  [l] - ignored for compatibility
-  [o] - preserve original dates
-  [s] - create an archive index (cf. ranlib)
-  [S] - do not build a symbol table
-  [T] - create a thin archive
-  [u] - update only [files] newer than archive contents
-  [U] - use actual timestamps and uids/gids
-  [v] - be verbose about actions taken
-)";
-
-void printHelpMessage() {
-  if (Stem.contains_lower("ranlib"))
-    outs() << RanlibHelp;
-  else if (Stem.contains_lower("ar"))
-    outs() << ArHelp;
-}
-
-// Show the error message and exit.
-LLVM_ATTRIBUTE_NORETURN static void fail(Twine Error) {
-  errs() << ToolName << ": " << Error << ".\n";
-  printHelpMessage();
-  exit(1);
-}
-
-static void failIfError(std::error_code EC, Twine Context = "") {
-  if (!EC)
-    return;
-
-  std::string ContextStr = Context.str();
-  if (ContextStr == "")
-    fail(EC.message());
-  fail(Context + ": " + EC.message());
-}
-
-static void failIfError(Error E, Twine Context = "") {
-  if (!E)
-    return;
-
-  handleAllErrors(std::move(E), [&](const llvm::ErrorInfoBase &EIB) {
-    std::string ContextStr = Context.str();
-    if (ContextStr == "")
-      fail(EIB.message());
-    fail(Context + ": " + EIB.message());
-  });
-}
-
-static SmallVector<const char *, 256> PositionalArgs;
-
-static bool MRI;
-
-namespace {
-enum Format { Default, GNU, BSD, DARWIN, Unknown };
-}
-
-static Format FormatType = Default;
-
-static std::string Options;
-
-// This enumeration delineates the kinds of operations on an archive
-// that are permitted.
-enum ArchiveOperation {
-  Print,            ///< Print the contents of the archive
-  Delete,           ///< Delete the specified members
-  Move,             ///< Move members to end or as given by {a,b,i} modifiers
-  QuickAppend,      ///< Quickly append to end of archive
-  ReplaceOrInsert,  ///< Replace or Insert members
-  DisplayTable,     ///< Display the table of contents
-  Extract,          ///< Extract files back to file system
-  CreateSymTab      ///< Create a symbol table in an existing archive
-};
-
-// Modifiers to follow operation to vary behavior
-static bool AddAfter = false;      ///< 'a' modifier
-static bool AddBefore = false;     ///< 'b' modifier
-static bool Create = false;        ///< 'c' modifier
-static bool OriginalDates = false; ///< 'o' modifier
-static bool OnlyUpdate = false;    ///< 'u' modifier
-static bool Verbose = false;       ///< 'v' modifier
-static bool Symtab = true;         ///< 's' modifier
-static bool Deterministic = true;  ///< 'D' and 'U' modifiers
-static bool Thin = false;          ///< 'T' modifier
-
-// Relative Positional Argument (for insert/move). This variable holds
-// the name of the archive member to which the 'a', 'b' or 'i' modifier
-// refers. Only one of 'a', 'b' or 'i' can be specified so we only need
-// one variable.
-static std::string RelPos;
-
-// This variable holds the name of the archive file as given on the
-// command line.
-static std::string ArchiveName;
-
-// This variable holds the list of member files to proecess, as given
-// on the command line.
-static std::vector<StringRef> Members;
-
-// Extract the member filename from the command line for the [relpos] argument
-// associated with a, b, and i modifiers
-static void getRelPos() {
-  if (PositionalArgs.size() == 0)
-    fail("Expected [relpos] for a, b, or i modifier");
-  RelPos = PositionalArgs[0];
-  PositionalArgs.erase(PositionalArgs.begin());
-}
-
-// Get the archive file name from the command line
-static void getArchive() {
-  if (PositionalArgs.size() == 0)
-    fail("An archive name must be specified");
-  ArchiveName = PositionalArgs[0];
-  PositionalArgs.erase(PositionalArgs.begin());
-}
-
-// Copy over remaining items in PositionalArgs to our Members vector
-static void getMembers() {
-  for (auto &Arg : PositionalArgs)
-    Members.push_back(Arg);
-}
-
-static void runMRIScript();
-
-// Parse the command line options as presented and return the operation
-// specified. Process all modifiers and check to make sure that constraints on
-// modifier/operation pairs have not been violated.
-static ArchiveOperation parseCommandLine() {
-  if (MRI) {
-    if (!PositionalArgs.empty() || !Options.empty())
-      fail("Cannot mix -M and other options");
-    runMRIScript();
-  }
-
-  // Keep track of number of operations. We can only specify one
-  // per execution.
-  unsigned NumOperations = 0;
-
-  // Keep track of the number of positional modifiers (a,b,i). Only
-  // one can be specified.
-  unsigned NumPositional = 0;
-
-  // Keep track of which operation was requested
-  ArchiveOperation Operation;
-
-  bool MaybeJustCreateSymTab = false;
-
-  for(unsigned i=0; i<Options.size(); ++i) {
-    switch(Options[i]) {
-    case 'd': ++NumOperations; Operation = Delete; break;
-    case 'm': ++NumOperations; Operation = Move ; break;
-    case 'p': ++NumOperations; Operation = Print; break;
-    case 'q': ++NumOperations; Operation = QuickAppend; break;
-    case 'r': ++NumOperations; Operation = ReplaceOrInsert; break;
-    case 't': ++NumOperations; Operation = DisplayTable; break;
-    case 'x': ++NumOperations; Operation = Extract; break;
-    case 'c': Create = true; break;
-    case 'l': /* accepted but unused */ break;
-    case 'o': OriginalDates = true; break;
-    case 's':
-      Symtab = true;
-      MaybeJustCreateSymTab = true;
-      break;
-    case 'S':
-      Symtab = false;
-      break;
-    case 'u': OnlyUpdate = true; break;
-    case 'v': Verbose = true; break;
-    case 'a':
-      getRelPos();
-      AddAfter = true;
-      NumPositional++;
-      break;
-    case 'b':
-      getRelPos();
-      AddBefore = true;
-      NumPositional++;
-      break;
-    case 'i':
-      getRelPos();
-      AddBefore = true;
-      NumPositional++;
-      break;
-    case 'D':
-      Deterministic = true;
-      break;
-    case 'U':
-      Deterministic = false;
-      break;
-    case 'T':
-      Thin = true;
-      break;
-    default:
-      fail(std::string("unknown option ") + Options[i]);
-    }
-  }
-
-  // At this point, the next thing on the command line must be
-  // the archive name.
-  getArchive();
-
-  // Everything on the command line at this point is a member.
-  getMembers();
-
- if (NumOperations == 0 && MaybeJustCreateSymTab) {
-    NumOperations = 1;
-    Operation = CreateSymTab;
-    if (!Members.empty())
-      fail("The s operation takes only an archive as argument");
-  }
-
-  // Perform various checks on the operation/modifier specification
-  // to make sure we are dealing with a legal request.
-  if (NumOperations == 0)
-    fail("You must specify at least one of the operations");
-  if (NumOperations > 1)
-    fail("Only one operation may be specified");
-  if (NumPositional > 1)
-    fail("You may only specify one of a, b, and i modifiers");
-  if (AddAfter || AddBefore) {
-    if (Operation != Move && Operation != ReplaceOrInsert)
-      fail("The 'a', 'b' and 'i' modifiers can only be specified with "
-           "the 'm' or 'r' operations");
-  }
-  if (OriginalDates && Operation != Extract)
-    fail("The 'o' modifier is only applicable to the 'x' operation");
-  if (OnlyUpdate && Operation != ReplaceOrInsert)
-    fail("The 'u' modifier is only applicable to the 'r' operation");
-
-  // Return the parsed operation to the caller
-  return Operation;
-}
-
-// Implements the 'p' operation. This function traverses the archive
-// looking for members that match the path list.
-static void doPrint(StringRef Name, const object::Archive::Child &C) {
-  if (Verbose)
-    outs() << "Printing " << Name << "\n";
-
-  Expected<StringRef> DataOrErr = C.getBuffer();
-  failIfError(DataOrErr.takeError());
-  StringRef Data = *DataOrErr;
-  outs().write(Data.data(), Data.size());
-}
-
-// Utility function for printing out the file mode when the 't' operation is in
-// verbose mode.
-static void printMode(unsigned mode) {
-  outs() << ((mode & 004) ? "r" : "-");
-  outs() << ((mode & 002) ? "w" : "-");
-  outs() << ((mode & 001) ? "x" : "-");
-}
-
-// Implement the 't' operation. This function prints out just
-// the file names of each of the members. However, if verbose mode is requested
-// ('v' modifier) then the file type, permission mode, user, group, size, and
-// modification time are also printed.
-static void doDisplayTable(StringRef Name, const object::Archive::Child &C) {
-  if (Verbose) {
-    Expected<sys::fs::perms> ModeOrErr = C.getAccessMode();
-    failIfError(ModeOrErr.takeError());
-    sys::fs::perms Mode = ModeOrErr.get();
-    printMode((Mode >> 6) & 007);
-    printMode((Mode >> 3) & 007);
-    printMode(Mode & 007);
-    Expected<unsigned> UIDOrErr = C.getUID();
-    failIfError(UIDOrErr.takeError());
-    outs() << ' ' << UIDOrErr.get();
-    Expected<unsigned> GIDOrErr = C.getGID();
-    failIfError(GIDOrErr.takeError());
-    outs() << '/' << GIDOrErr.get();
-    Expected<uint64_t> Size = C.getSize();
-    failIfError(Size.takeError());
-    outs() << ' ' << format("%6llu", Size.get());
-    auto ModTimeOrErr = C.getLastModified();
-    failIfError(ModTimeOrErr.takeError());
-    // Note: formatv() only handles the default TimePoint<>, which is in
-    // nanoseconds.
-    // TODO: fix format_provider<TimePoint<>> to allow other units.
-    sys::TimePoint<> ModTimeInNs = ModTimeOrErr.get();
-    outs() << ' ' << formatv("{0:%b %e %H:%M %Y}", ModTimeInNs);
-    outs() << ' ';
-  }
-
-  if (C.getParent()->isThin()) {
-    outs() << sys::path::parent_path(ArchiveName);
-    outs() << '/';
-  }
-  outs() << Name << "\n";
-}
-
-// Implement the 'x' operation. This function extracts files back to the file
-// system.
-static void doExtract(StringRef Name, const object::Archive::Child &C) {
-  // Retain the original mode.
-  Expected<sys::fs::perms> ModeOrErr = C.getAccessMode();
-  failIfError(ModeOrErr.takeError());
-  sys::fs::perms Mode = ModeOrErr.get();
-
-  int FD;
-  failIfError(sys::fs::openFileForWrite(sys::path::filename(Name), FD,
-                                        sys::fs::CD_CreateAlways,
-                                        sys::fs::F_None, Mode),
-              Name);
-
-  {
-    raw_fd_ostream file(FD, false);
-
-    // Get the data and its length
-    Expected<StringRef> BufOrErr = C.getBuffer();
-    failIfError(BufOrErr.takeError());
-    StringRef Data = BufOrErr.get();
-
-    // Write the data.
-    file.write(Data.data(), Data.size());
-  }
-
-  // If we're supposed to retain the original modification times, etc. do so
-  // now.
-  if (OriginalDates) {
-    auto ModTimeOrErr = C.getLastModified();
-    failIfError(ModTimeOrErr.takeError());
-    failIfError(
-        sys::fs::setLastAccessAndModificationTime(FD, ModTimeOrErr.get()));
-  }
-
-  if (close(FD))
-    fail("Could not close the file");
-}
-
-static bool shouldCreateArchive(ArchiveOperation Op) {
-  switch (Op) {
-  case Print:
-  case Delete:
-  case Move:
-  case DisplayTable:
-  case Extract:
-  case CreateSymTab:
-    return false;
-
-  case QuickAppend:
-  case ReplaceOrInsert:
-    return true;
-  }
-
-  llvm_unreachable("Missing entry in covered switch.");
-}
-
-static void performReadOperation(ArchiveOperation Operation,
-                                 object::Archive *OldArchive) {
-  if (Operation == Extract && OldArchive->isThin())
-    fail("extracting from a thin archive is not supported");
-
-  bool Filter = !Members.empty();
-  {
-    Error Err = Error::success();
-    for (auto &C : OldArchive->children(Err)) {
-      Expected<StringRef> NameOrErr = C.getName();
-      failIfError(NameOrErr.takeError());
-      StringRef Name = NameOrErr.get();
-
-      if (Filter) {
-        auto I = find(Members, Name);
-        if (I == Members.end())
-          continue;
-        Members.erase(I);
-      }
-
-      switch (Operation) {
-      default:
-        llvm_unreachable("Not a read operation");
-      case Print:
-        doPrint(Name, C);
-        break;
-      case DisplayTable:
-        doDisplayTable(Name, C);
-        break;
-      case Extract:
-        doExtract(Name, C);
-        break;
-      }
-    }
-    failIfError(std::move(Err));
-  }
-
-  if (Members.empty())
-    return;
-  for (StringRef Name : Members)
-    errs() << Name << " was not found\n";
-  exit(1);
-}
-
-static void addMember(std::vector<NewArchiveMember> &Members,
-                      StringRef FileName, int Pos = -1) {
-  Expected<NewArchiveMember> NMOrErr =
-      NewArchiveMember::getFile(FileName, Deterministic);
-  failIfError(NMOrErr.takeError(), FileName);
-
-  // Use the basename of the object path for the member name.
-  NMOrErr->MemberName = sys::path::filename(NMOrErr->MemberName);
-
-  if (Pos == -1)
-    Members.push_back(std::move(*NMOrErr));
-  else
-    Members[Pos] = std::move(*NMOrErr);
-}
-
-static void addMember(std::vector<NewArchiveMember> &Members,
-                      const object::Archive::Child &M, int Pos = -1) {
-  if (Thin && !M.getParent()->isThin())
-    fail("Cannot convert a regular archive to a thin one");
-  Expected<NewArchiveMember> NMOrErr =
-      NewArchiveMember::getOldMember(M, Deterministic);
-  failIfError(NMOrErr.takeError());
-  if (Pos == -1)
-    Members.push_back(std::move(*NMOrErr));
-  else
-    Members[Pos] = std::move(*NMOrErr);
-}
-
-enum InsertAction {
-  IA_AddOldMember,
-  IA_AddNewMember,
-  IA_Delete,
-  IA_MoveOldMember,
-  IA_MoveNewMember
-};
-
-static InsertAction computeInsertAction(ArchiveOperation Operation,
-                                        const object::Archive::Child &Member,
-                                        StringRef Name,
-                                        std::vector<StringRef>::iterator &Pos) {
-  if (Operation == QuickAppend || Members.empty())
-    return IA_AddOldMember;
-
-  auto MI = find_if(Members, [Name](StringRef Path) {
-    return Name == sys::path::filename(Path);
-  });
-
-  if (MI == Members.end())
-    return IA_AddOldMember;
-
-  Pos = MI;
-
-  if (Operation == Delete)
-    return IA_Delete;
-
-  if (Operation == Move)
-    return IA_MoveOldMember;
-
-  if (Operation == ReplaceOrInsert) {
-    StringRef PosName = sys::path::filename(RelPos);
-    if (!OnlyUpdate) {
-      if (PosName.empty())
-        return IA_AddNewMember;
-      return IA_MoveNewMember;
-    }
-
-    // We could try to optimize this to a fstat, but it is not a common
-    // operation.
-    sys::fs::file_status Status;
-    failIfError(sys::fs::status(*MI, Status), *MI);
-    auto ModTimeOrErr = Member.getLastModified();
-    failIfError(ModTimeOrErr.takeError());
-    if (Status.getLastModificationTime() < ModTimeOrErr.get()) {
-      if (PosName.empty())
-        return IA_AddOldMember;
-      return IA_MoveOldMember;
-    }
-
-    if (PosName.empty())
-      return IA_AddNewMember;
-    return IA_MoveNewMember;
-  }
-  llvm_unreachable("No such operation");
-}
-
-// We have to walk this twice and computing it is not trivial, so creating an
-// explicit std::vector is actually fairly efficient.
-static std::vector<NewArchiveMember>
-computeNewArchiveMembers(ArchiveOperation Operation,
-                         object::Archive *OldArchive) {
-  std::vector<NewArchiveMember> Ret;
-  std::vector<NewArchiveMember> Moved;
-  int InsertPos = -1;
-  StringRef PosName = sys::path::filename(RelPos);
-  if (OldArchive) {
-    Error Err = Error::success();
-    for (auto &Child : OldArchive->children(Err)) {
-      int Pos = Ret.size();
-      Expected<StringRef> NameOrErr = Child.getName();
-      failIfError(NameOrErr.takeError());
-      StringRef Name = NameOrErr.get();
-      if (Name == PosName) {
-        assert(AddAfter || AddBefore);
-        if (AddBefore)
-          InsertPos = Pos;
-        else
-          InsertPos = Pos + 1;
-      }
-
-      std::vector<StringRef>::iterator MemberI = Members.end();
-      InsertAction Action =
-          computeInsertAction(Operation, Child, Name, MemberI);
-      switch (Action) {
-      case IA_AddOldMember:
-        addMember(Ret, Child);
-        break;
-      case IA_AddNewMember:
-        addMember(Ret, *MemberI);
-        break;
-      case IA_Delete:
-        break;
-      case IA_MoveOldMember:
-        addMember(Moved, Child);
-        break;
-      case IA_MoveNewMember:
-        addMember(Moved, *MemberI);
-        break;
-      }
-      if (MemberI != Members.end())
-        Members.erase(MemberI);
-    }
-    failIfError(std::move(Err));
-  }
-
-  if (Operation == Delete)
-    return Ret;
-
-  if (!RelPos.empty() && InsertPos == -1)
-    fail("Insertion point not found");
-
-  if (RelPos.empty())
-    InsertPos = Ret.size();
-
-  assert(unsigned(InsertPos) <= Ret.size());
-  int Pos = InsertPos;
-  for (auto &M : Moved) {
-    Ret.insert(Ret.begin() + Pos, std::move(M));
-    ++Pos;
-  }
-
-  for (unsigned I = 0; I != Members.size(); ++I)
-    Ret.insert(Ret.begin() + InsertPos, NewArchiveMember());
-  Pos = InsertPos;
-  for (auto &Member : Members) {
-    addMember(Ret, Member, Pos);
-    ++Pos;
-  }
-
-  return Ret;
-}
-
-static object::Archive::Kind getDefaultForHost() {
-  return Triple(sys::getProcessTriple()).isOSDarwin()
-             ? object::Archive::K_DARWIN
-             : object::Archive::K_GNU;
-}
-
-static object::Archive::Kind getKindFromMember(const NewArchiveMember &Member) {
-  Expected<std::unique_ptr<object::ObjectFile>> OptionalObject =
-      object::ObjectFile::createObjectFile(Member.Buf->getMemBufferRef());
-
-  if (OptionalObject)
-    return isa<object::MachOObjectFile>(**OptionalObject)
-               ? object::Archive::K_DARWIN
-               : object::Archive::K_GNU;
-
-  // squelch the error in case we had a non-object file
-  consumeError(OptionalObject.takeError());
-  return getDefaultForHost();
-}
-
-static void
-performWriteOperation(ArchiveOperation Operation,
-                      object::Archive *OldArchive,
-                      std::unique_ptr<MemoryBuffer> OldArchiveBuf,
-                      std::vector<NewArchiveMember> *NewMembersP) {
-  std::vector<NewArchiveMember> NewMembers;
-  if (!NewMembersP)
-    NewMembers = computeNewArchiveMembers(Operation, OldArchive);
-
-  object::Archive::Kind Kind;
-  switch (FormatType) {
-  case Default:
-    if (Thin)
-      Kind = object::Archive::K_GNU;
-    else if (OldArchive)
-      Kind = OldArchive->kind();
-    else if (NewMembersP)
-      Kind = NewMembersP->size() ? getKindFromMember(NewMembersP->front())
-                                 : getDefaultForHost();
-    else
-      Kind = NewMembers.size() ? getKindFromMember(NewMembers.front())
-                               : getDefaultForHost();
-    break;
-  case GNU:
-    Kind = object::Archive::K_GNU;
-    break;
-  case BSD:
-    if (Thin)
-      fail("Only the gnu format has a thin mode");
-    Kind = object::Archive::K_BSD;
-    break;
-  case DARWIN:
-    if (Thin)
-      fail("Only the gnu format has a thin mode");
-    Kind = object::Archive::K_DARWIN;
-    break;
-  case Unknown:
-    llvm_unreachable("");
-  }
-
-  Error E =
-      writeArchive(ArchiveName, NewMembersP ? *NewMembersP : NewMembers, Symtab,
-                   Kind, Deterministic, Thin, std::move(OldArchiveBuf));
-  failIfError(std::move(E), ArchiveName);
-}
-
-static void createSymbolTable(object::Archive *OldArchive) {
-  // When an archive is created or modified, if the s option is given, the
-  // resulting archive will have a current symbol table. If the S option
-  // is given, it will have no symbol table.
-  // In summary, we only need to update the symbol table if we have none.
-  // This is actually very common because of broken build systems that think
-  // they have to run ranlib.
-  if (OldArchive->hasSymbolTable())
-    return;
-
-  performWriteOperation(CreateSymTab, OldArchive, nullptr, nullptr);
-}
-
-static void performOperation(ArchiveOperation Operation,
-                             object::Archive *OldArchive,
-                             std::unique_ptr<MemoryBuffer> OldArchiveBuf,
-                             std::vector<NewArchiveMember> *NewMembers) {
-  switch (Operation) {
-  case Print:
-  case DisplayTable:
-  case Extract:
-    performReadOperation(Operation, OldArchive);
-    return;
-
-  case Delete:
-  case Move:
-  case QuickAppend:
-  case ReplaceOrInsert:
-    performWriteOperation(Operation, OldArchive, std::move(OldArchiveBuf),
-                          NewMembers);
-    return;
-  case CreateSymTab:
-    createSymbolTable(OldArchive);
-    return;
-  }
-  llvm_unreachable("Unknown operation.");
-}
-
-static int performOperation(ArchiveOperation Operation,
-                            std::vector<NewArchiveMember> *NewMembers) {
-  // Create or open the archive object.
-  ErrorOr<std::unique_ptr<MemoryBuffer>> Buf =
-      MemoryBuffer::getFile(ArchiveName, -1, false);
-  std::error_code EC = Buf.getError();
-  if (EC && EC != errc::no_such_file_or_directory)
-    fail("error opening '" + ArchiveName + "': " + EC.message() + "!");
-
-  if (!EC) {
-    Error Err = Error::success();
-    object::Archive Archive(Buf.get()->getMemBufferRef(), Err);
-    EC = errorToErrorCode(std::move(Err));
-    failIfError(EC,
-                "error loading '" + ArchiveName + "': " + EC.message() + "!");
-    performOperation(Operation, &Archive, std::move(Buf.get()), NewMembers);
-    return 0;
-  }
-
-  assert(EC == errc::no_such_file_or_directory);
-
-  if (!shouldCreateArchive(Operation)) {
-    failIfError(EC, Twine("error loading '") + ArchiveName + "'");
-  } else {
-    if (!Create) {
-      // Produce a warning if we should and we're creating the archive
-      errs() << ToolName << ": creating " << ArchiveName << "\n";
-    }
-  }
-
-  performOperation(Operation, nullptr, nullptr, NewMembers);
-  return 0;
-}
-
-static void runMRIScript() {
-  enum class MRICommand { AddLib, AddMod, Create, Delete, Save, End, Invalid };
-
-  ErrorOr<std::unique_ptr<MemoryBuffer>> Buf = MemoryBuffer::getSTDIN();
-  failIfError(Buf.getError());
-  const MemoryBuffer &Ref = *Buf.get();
-  bool Saved = false;
-  std::vector<NewArchiveMember> NewMembers;
-  std::vector<std::unique_ptr<MemoryBuffer>> ArchiveBuffers;
-  std::vector<std::unique_ptr<object::Archive>> Archives;
-
-  for (line_iterator I(Ref, /*SkipBlanks*/ false), E; I != E; ++I) {
-    StringRef Line = *I;
-    Line = Line.split(';').first;
-    Line = Line.split('*').first;
-    Line = Line.trim();
-    if (Line.empty())
-      continue;
-    StringRef CommandStr, Rest;
-    std::tie(CommandStr, Rest) = Line.split(' ');
-    Rest = Rest.trim();
-    if (!Rest.empty() && Rest.front() == '"' && Rest.back() == '"')
-      Rest = Rest.drop_front().drop_back();
-    auto Command = StringSwitch<MRICommand>(CommandStr.lower())
-                       .Case("addlib", MRICommand::AddLib)
-                       .Case("addmod", MRICommand::AddMod)
-                       .Case("create", MRICommand::Create)
-                       .Case("delete", MRICommand::Delete)
-                       .Case("save", MRICommand::Save)
-                       .Case("end", MRICommand::End)
-                       .Default(MRICommand::Invalid);
-
-    switch (Command) {
-    case MRICommand::AddLib: {
-      auto BufOrErr = MemoryBuffer::getFile(Rest, -1, false);
-      failIfError(BufOrErr.getError(), "Could not open library");
-      ArchiveBuffers.push_back(std::move(*BufOrErr));
-      auto LibOrErr =
-          object::Archive::create(ArchiveBuffers.back()->getMemBufferRef());
-      failIfError(errorToErrorCode(LibOrErr.takeError()),
-                  "Could not parse library");
-      Archives.push_back(std::move(*LibOrErr));
-      object::Archive &Lib = *Archives.back();
-      {
-        Error Err = Error::success();
-        for (auto &Member : Lib.children(Err))
-          addMember(NewMembers, Member);
-        failIfError(std::move(Err));
-      }
-      break;
-    }
-    case MRICommand::AddMod:
-      addMember(NewMembers, Rest);
-      break;
-    case MRICommand::Create:
-      Create = true;
-      if (!ArchiveName.empty())
-        fail("Editing multiple archives not supported");
-      if (Saved)
-        fail("File already saved");
-      ArchiveName = Rest;
-      break;
-    case MRICommand::Delete: {
-      StringRef Name = sys::path::filename(Rest);
-      llvm::erase_if(NewMembers,
-                     [=](NewArchiveMember &M) { return M.MemberName == Name; });
-      break;
-    }
-    case MRICommand::Save:
-      Saved = true;
-      break;
-    case MRICommand::End:
-      break;
-    case MRICommand::Invalid:
-      fail("Unknown command: " + CommandStr);
-    }
-  }
-
-  // Nothing to do if not saved.
-  if (Saved)
-    performOperation(ReplaceOrInsert, &NewMembers);
-  exit(0);
-}
-
-static bool handleGenericOption(StringRef arg) {
-  if (arg == "-help" || arg == "--help") {
-    printHelpMessage();
-    return true;
-  }
-  if (arg == "-version" || arg == "--version") {
-    cl::PrintVersionMessage();
-    return true;
-  }
-  return false;
-}
-
-static int ar_main(int argc, char **argv) {
-  SmallVector<const char *, 0> Argv(argv, argv + argc);
-  BumpPtrAllocator Alloc;
-  StringSaver Saver(Alloc);
-  cl::ExpandResponseFiles(Saver, cl::TokenizeGNUCommandLine, Argv);
-  for(size_t i = 1; i < Argv.size(); ++i) {
-    StringRef Arg = Argv[i];
-    const char *match;
-    auto MatchFlagWithArg = [&](const char *expected) {
-      size_t len = strlen(expected);
-      if (Arg == expected) {
-        if (++i >= Argv.size())
-          fail(std::string(expected) + " requires an argument");
-        match = Argv[i];
-        return true;
-      }
-      if (Arg.startswith(expected) && Arg.size() > len &&
-                 Arg[len] == '=') {
-        match = Arg.data() + len + 1;
-        return true;
-      }
-      return false;
-    };
-    if (handleGenericOption(Argv[i]))
-      return 0;
-    if (Arg == "--") {
-      for(; i < Argv.size(); ++i)
-        PositionalArgs.push_back(Argv[i]);
-      break;
-    }
-    if (Arg[0] == '-') {
-      if (Arg.startswith("--"))
-        Arg = Argv[i] + 2;
-      else
-        Arg = Argv[i] + 1;
-      if (Arg == "M") {
-        MRI = true;
-      } else if (MatchFlagWithArg("format")) {
-        FormatType = StringSwitch<Format>(match)
-            .Case("default", Default)
-            .Case("gnu", GNU)
-            .Case("darwin", DARWIN)
-            .Case("bsd", BSD)
-            .Default(Unknown);
-        if (FormatType == Unknown)
-          fail(std::string("Invalid format ") + match);
-      } else if (MatchFlagWithArg("plugin")) {
-        // Ignored.
-      } else {
-        Options += Argv[i] + 1;
-      }
-    } else if (Options.empty()) {
-      Options += Argv[i];
-    } else {
-      PositionalArgs.push_back(Argv[i]);
-    }
-  }
-  ArchiveOperation Operation = parseCommandLine();
-  return performOperation(Operation, nullptr);
-}
-
-static int ranlib_main(int argc, char **argv) {
-  bool ArchiveSpecified = false;
-  for(int i = 1; i < argc; ++i) {
-    if (handleGenericOption(argv[i])) {
-      return 0;
-    } else {
-      if (ArchiveSpecified)
-        fail("Exactly one archive should be specified");
-      ArchiveSpecified = true;
-      ArchiveName = argv[i];
-    }
-  }
-  return performOperation(CreateSymTab, nullptr);
-}
-
-int main(int argc, char **argv) {
-  InitLLVM X(argc, argv);
-  ToolName = argv[0];
-
-  llvm::InitializeAllTargetInfos();
-  llvm::InitializeAllTargetMCs();
-  llvm::InitializeAllAsmParsers();
-
-  Stem = sys::path::stem(ToolName);
-  if (Stem.contains_lower("dlltool"))
-    return dlltoolDriverMain(makeArrayRef(argv, argc));
-
-  if (Stem.contains_lower("ranlib"))
-    return ranlib_main(argc, argv);
-
-  if (Stem.contains_lower("lib"))
-    return libDriverMain(makeArrayRef(argv, argc));
-
-  if (Stem.contains_lower("ar"))
-    return ar_main(argc, argv);
-  fail("Not ranlib, ar, lib or dlltool!");
-}
+//===-- llvm-ar.cpp - LLVM archive librarian utility ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Builds up (relatively) standard unix archive files (.a) containing LLVM
+// bitcode or other files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Object/Archive.h"
+#include "llvm/Object/ArchiveWriter.h"
+#include "llvm/Object/MachO.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/Chrono.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/LineIterator.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Process.h"
+#include "llvm/Support/StringSaver.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/ToolOutputFile.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ToolDrivers/llvm-dlltool/DlltoolDriver.h"
+#include "llvm/ToolDrivers/llvm-lib/LibDriver.h"
+
+#if !defined(_MSC_VER) && !defined(__MINGW32__)
+#include <unistd.h>
+#else
+#include <io.h>
+#endif
+
+using namespace llvm;
+
+// The name this program was invoked as.
+static StringRef ToolName;
+
+// The basename of this program.
+static StringRef Stem;
+
+const char RanlibHelp[] = R"(
+OVERVIEW: LLVM Ranlib (llvm-ranlib)
+
+  This program generates an index to speed access to archives
+
+USAGE: llvm-ranlib <archive-file>
+
+OPTIONS:
+  -help                             - Display available options
+  -version                          - Display the version of this program
+)";
+
+const char ArHelp[] = R"(
+OVERVIEW: LLVM Archiver
+
+USAGE: llvm-ar [options] [-]<operation>[modifiers] [relpos] <archive> [files]
+       llvm-ar -M [<mri-script]
+
+OPTIONS:
+  --format              - Archive format to create
+    =default            -   default
+    =gnu                -   gnu
+    =darwin             -   darwin
+    =bsd                -   bsd
+  --plugin=<string>     - Ignored for compatibility
+  --help                - Display available options
+  --version             - Display the version of this program
+
+OPERATIONS:
+  d - delete [files] from the archive
+  m - move [files] in the archive
+  p - print [files] found in the archive
+  q - quick append [files] to the archive
+  r - replace or insert [files] into the archive
+  s - act as ranlib
+  t - display contents of archive
+  x - extract [files] from the archive
+
+MODIFIERS:
+  [a] - put [files] after [relpos]
+  [b] - put [files] before [relpos] (same as [i])
+  [c] - do not warn if archive had to be created
+  [D] - use zero for timestamps and uids/gids (default)
+  [i] - put [files] before [relpos] (same as [b])
+  [l] - ignored for compatibility
+  [L] - add archive's contents
+  [o] - preserve original dates
+  [s] - create an archive index (cf. ranlib)
+  [S] - do not build a symbol table
+  [T] - create a thin archive
+  [u] - update only [files] newer than archive contents
+  [U] - use actual timestamps and uids/gids
+  [v] - be verbose about actions taken
+)";
+
+void printHelpMessage() {
+  if (Stem.contains_lower("ranlib"))
+    outs() << RanlibHelp;
+  else if (Stem.contains_lower("ar"))
+    outs() << ArHelp;
+}
+
+// Show the error message and exit.
+LLVM_ATTRIBUTE_NORETURN static void fail(Twine Error) {
+  errs() << ToolName << ": " << Error << ".\n";
+  printHelpMessage();
+  exit(1);
+}
+
+static void failIfError(std::error_code EC, Twine Context = "") {
+  if (!EC)
+    return;
+
+  std::string ContextStr = Context.str();
+  if (ContextStr == "")
+    fail(EC.message());
+  fail(Context + ": " + EC.message());
+}
+
+static void failIfError(Error E, Twine Context = "") {
+  if (!E)
+    return;
+
+  handleAllErrors(std::move(E), [&](const llvm::ErrorInfoBase &EIB) {
+    std::string ContextStr = Context.str();
+    if (ContextStr == "")
+      fail(EIB.message());
+    fail(Context + ": " + EIB.message());
+  });
+}
+
+static SmallVector<const char *, 256> PositionalArgs;
+
+static bool MRI;
+
+namespace {
+enum Format { Default, GNU, BSD, DARWIN, Unknown };
+}
+
+static Format FormatType = Default;
+
+static std::string Options;
+
+// This enumeration delineates the kinds of operations on an archive
+// that are permitted.
+enum ArchiveOperation {
+  Print,            ///< Print the contents of the archive
+  Delete,           ///< Delete the specified members
+  Move,             ///< Move members to end or as given by {a,b,i} modifiers
+  QuickAppend,      ///< Quickly append to end of archive
+  ReplaceOrInsert,  ///< Replace or Insert members
+  DisplayTable,     ///< Display the table of contents
+  Extract,          ///< Extract files back to file system
+  CreateSymTab      ///< Create a symbol table in an existing archive
+};
+
+// Modifiers to follow operation to vary behavior
+static bool AddAfter = false;      ///< 'a' modifier
+static bool AddBefore = false;     ///< 'b' modifier
+static bool Create = false;        ///< 'c' modifier
+static bool OriginalDates = false; ///< 'o' modifier
+static bool OnlyUpdate = false;    ///< 'u' modifier
+static bool Verbose = false;       ///< 'v' modifier
+static bool Symtab = true;         ///< 's' modifier
+static bool Deterministic = true;  ///< 'D' and 'U' modifiers
+static bool Thin = false;          ///< 'T' modifier
+static bool AddLibrary = false;    ///< 'L' modifier
+
+// Relative Positional Argument (for insert/move). This variable holds
+// the name of the archive member to which the 'a', 'b' or 'i' modifier
+// refers. Only one of 'a', 'b' or 'i' can be specified so we only need
+// one variable.
+static std::string RelPos;
+
+// This variable holds the name of the archive file as given on the
+// command line.
+static std::string ArchiveName;
+
+// This variable holds the list of member files to proecess, as given
+// on the command line.
+static std::vector<StringRef> Members;
+
+// Extract the member filename from the command line for the [relpos] argument
+// associated with a, b, and i modifiers
+static void getRelPos() {
+  if (PositionalArgs.size() == 0)
+    fail("Expected [relpos] for a, b, or i modifier");
+  RelPos = PositionalArgs[0];
+  PositionalArgs.erase(PositionalArgs.begin());
+}
+
+// Get the archive file name from the command line
+static void getArchive() {
+  if (PositionalArgs.size() == 0)
+    fail("An archive name must be specified");
+  ArchiveName = PositionalArgs[0];
+  PositionalArgs.erase(PositionalArgs.begin());
+}
+
+// Copy over remaining items in PositionalArgs to our Members vector
+static void getMembers() {
+  for (auto &Arg : PositionalArgs)
+    Members.push_back(Arg);
+}
+
+std::vector<std::unique_ptr<MemoryBuffer>> ArchiveBuffers;
+std::vector<std::unique_ptr<object::Archive>> Archives;
+
+static object::Archive &readLibrary(const Twine &Library) {
+  auto BufOrErr = MemoryBuffer::getFile(Library, -1, false);
+  failIfError(BufOrErr.getError(), "Could not open library");
+  ArchiveBuffers.push_back(std::move(*BufOrErr));
+  auto LibOrErr =
+      object::Archive::create(ArchiveBuffers.back()->getMemBufferRef());
+  failIfError(errorToErrorCode(LibOrErr.takeError()),
+              "Could not parse library");
+  Archives.push_back(std::move(*LibOrErr));
+  return *Archives.back();
+}
+
+static void runMRIScript();
+
+// Parse the command line options as presented and return the operation
+// specified. Process all modifiers and check to make sure that constraints on
+// modifier/operation pairs have not been violated.
+static ArchiveOperation parseCommandLine() {
+  if (MRI) {
+    if (!PositionalArgs.empty() || !Options.empty())
+      fail("Cannot mix -M and other options");
+    runMRIScript();
+  }
+
+  // Keep track of number of operations. We can only specify one
+  // per execution.
+  unsigned NumOperations = 0;
+
+  // Keep track of the number of positional modifiers (a,b,i). Only
+  // one can be specified.
+  unsigned NumPositional = 0;
+
+  // Keep track of which operation was requested
+  ArchiveOperation Operation;
+
+  bool MaybeJustCreateSymTab = false;
+
+  for(unsigned i=0; i<Options.size(); ++i) {
+    switch(Options[i]) {
+    case 'd': ++NumOperations; Operation = Delete; break;
+    case 'm': ++NumOperations; Operation = Move ; break;
+    case 'p': ++NumOperations; Operation = Print; break;
+    case 'q': ++NumOperations; Operation = QuickAppend; break;
+    case 'r': ++NumOperations; Operation = ReplaceOrInsert; break;
+    case 't': ++NumOperations; Operation = DisplayTable; break;
+    case 'x': ++NumOperations; Operation = Extract; break;
+    case 'c': Create = true; break;
+    case 'l': /* accepted but unused */ break;
+    case 'o': OriginalDates = true; break;
+    case 's':
+      Symtab = true;
+      MaybeJustCreateSymTab = true;
+      break;
+    case 'S':
+      Symtab = false;
+      break;
+    case 'u': OnlyUpdate = true; break;
+    case 'v': Verbose = true; break;
+    case 'a':
+      getRelPos();
+      AddAfter = true;
+      NumPositional++;
+      break;
+    case 'b':
+      getRelPos();
+      AddBefore = true;
+      NumPositional++;
+      break;
+    case 'i':
+      getRelPos();
+      AddBefore = true;
+      NumPositional++;
+      break;
+    case 'D':
+      Deterministic = true;
+      break;
+    case 'U':
+      Deterministic = false;
+      break;
+    case 'T':
+      Thin = true;
+      break;
+    case 'L':
+      AddLibrary = true;
+      break;
+    default:
+      fail(std::string("unknown option ") + Options[i]);
+    }
+  }
+
+  // At this point, the next thing on the command line must be
+  // the archive name.
+  getArchive();
+
+  // Everything on the command line at this point is a member.
+  getMembers();
+
+ if (NumOperations == 0 && MaybeJustCreateSymTab) {
+    NumOperations = 1;
+    Operation = CreateSymTab;
+    if (!Members.empty())
+      fail("The s operation takes only an archive as argument");
+  }
+
+  // Perform various checks on the operation/modifier specification
+  // to make sure we are dealing with a legal request.
+  if (NumOperations == 0)
+    fail("You must specify at least one of the operations");
+  if (NumOperations > 1)
+    fail("Only one operation may be specified");
+  if (NumPositional > 1)
+    fail("You may only specify one of a, b, and i modifiers");
+  if (AddAfter || AddBefore) {
+    if (Operation != Move && Operation != ReplaceOrInsert)
+      fail("The 'a', 'b' and 'i' modifiers can only be specified with "
+           "the 'm' or 'r' operations");
+  }
+  if (OriginalDates && Operation != Extract)
+    fail("The 'o' modifier is only applicable to the 'x' operation");
+  if (OnlyUpdate && Operation != ReplaceOrInsert)
+    fail("The 'u' modifier is only applicable to the 'r' operation");
+  if (AddLibrary && Operation != QuickAppend)
+    fail("The 'L' modifier is only applicable to the 'q' operation");
+
+  // Return the parsed operation to the caller
+  return Operation;
+}
+
+// Implements the 'p' operation. This function traverses the archive
+// looking for members that match the path list.
+static void doPrint(StringRef Name, const object::Archive::Child &C) {
+  if (Verbose)
+    outs() << "Printing " << Name << "\n";
+
+  Expected<StringRef> DataOrErr = C.getBuffer();
+  failIfError(DataOrErr.takeError());
+  StringRef Data = *DataOrErr;
+  outs().write(Data.data(), Data.size());
+}
+
+// Utility function for printing out the file mode when the 't' operation is in
+// verbose mode.
+static void printMode(unsigned mode) {
+  outs() << ((mode & 004) ? "r" : "-");
+  outs() << ((mode & 002) ? "w" : "-");
+  outs() << ((mode & 001) ? "x" : "-");
+}
+
+// Implement the 't' operation. This function prints out just
+// the file names of each of the members. However, if verbose mode is requested
+// ('v' modifier) then the file type, permission mode, user, group, size, and
+// modification time are also printed.
+static void doDisplayTable(StringRef Name, const object::Archive::Child &C) {
+  if (Verbose) {
+    Expected<sys::fs::perms> ModeOrErr = C.getAccessMode();
+    failIfError(ModeOrErr.takeError());
+    sys::fs::perms Mode = ModeOrErr.get();
+    printMode((Mode >> 6) & 007);
+    printMode((Mode >> 3) & 007);
+    printMode(Mode & 007);
+    Expected<unsigned> UIDOrErr = C.getUID();
+    failIfError(UIDOrErr.takeError());
+    outs() << ' ' << UIDOrErr.get();
+    Expected<unsigned> GIDOrErr = C.getGID();
+    failIfError(GIDOrErr.takeError());
+    outs() << '/' << GIDOrErr.get();
+    Expected<uint64_t> Size = C.getSize();
+    failIfError(Size.takeError());
+    outs() << ' ' << format("%6llu", Size.get());
+    auto ModTimeOrErr = C.getLastModified();
+    failIfError(ModTimeOrErr.takeError());
+    // Note: formatv() only handles the default TimePoint<>, which is in
+    // nanoseconds.
+    // TODO: fix format_provider<TimePoint<>> to allow other units.
+    sys::TimePoint<> ModTimeInNs = ModTimeOrErr.get();
+    outs() << ' ' << formatv("{0:%b %e %H:%M %Y}", ModTimeInNs);
+    outs() << ' ';
+  }
+
+  if (C.getParent()->isThin()) {
+    outs() << sys::path::parent_path(ArchiveName);
+    outs() << '/';
+  }
+  outs() << Name << "\n";
+}
+
+// Implement the 'x' operation. This function extracts files back to the file
+// system.
+static void doExtract(StringRef Name, const object::Archive::Child &C) {
+  // Retain the original mode.
+  Expected<sys::fs::perms> ModeOrErr = C.getAccessMode();
+  failIfError(ModeOrErr.takeError());
+  sys::fs::perms Mode = ModeOrErr.get();
+
+  int FD;
+  failIfError(sys::fs::openFileForWrite(sys::path::filename(Name), FD,
+                                        sys::fs::CD_CreateAlways,
+                                        sys::fs::F_None, Mode),
+              Name);
+
+  {
+    raw_fd_ostream file(FD, false);
+
+    // Get the data and its length
+    Expected<StringRef> BufOrErr = C.getBuffer();
+    failIfError(BufOrErr.takeError());
+    StringRef Data = BufOrErr.get();
+
+    // Write the data.
+    file.write(Data.data(), Data.size());
+  }
+
+  // If we're supposed to retain the original modification times, etc. do so
+  // now.
+  if (OriginalDates) {
+    auto ModTimeOrErr = C.getLastModified();
+    failIfError(ModTimeOrErr.takeError());
+    failIfError(
+        sys::fs::setLastAccessAndModificationTime(FD, ModTimeOrErr.get()));
+  }
+
+  if (close(FD))
+    fail("Could not close the file");
+}
+
+static bool shouldCreateArchive(ArchiveOperation Op) {
+  switch (Op) {
+  case Print:
+  case Delete:
+  case Move:
+  case DisplayTable:
+  case Extract:
+  case CreateSymTab:
+    return false;
+
+  case QuickAppend:
+  case ReplaceOrInsert:
+    return true;
+  }
+
+  llvm_unreachable("Missing entry in covered switch.");
+}
+
+static void performReadOperation(ArchiveOperation Operation,
+                                 object::Archive *OldArchive) {
+  if (Operation == Extract && OldArchive->isThin())
+    fail("extracting from a thin archive is not supported");
+
+  bool Filter = !Members.empty();
+  {
+    Error Err = Error::success();
+    for (auto &C : OldArchive->children(Err)) {
+      Expected<StringRef> NameOrErr = C.getName();
+      failIfError(NameOrErr.takeError());
+      StringRef Name = NameOrErr.get();
+
+      if (Filter) {
+        auto I = find(Members, Name);
+        if (I == Members.end())
+          continue;
+        Members.erase(I);
+      }
+
+      switch (Operation) {
+      default:
+        llvm_unreachable("Not a read operation");
+      case Print:
+        doPrint(Name, C);
+        break;
+      case DisplayTable:
+        doDisplayTable(Name, C);
+        break;
+      case Extract:
+        doExtract(Name, C);
+        break;
+      }
+    }
+    failIfError(std::move(Err));
+  }
+
+  if (Members.empty())
+    return;
+  for (StringRef Name : Members)
+    errs() << Name << " was not found\n";
+  exit(1);
+}
+
+static void addMember(std::vector<NewArchiveMember> &Members,
+                      StringRef FileName, int Pos = -1) {
+  Expected<NewArchiveMember> NMOrErr =
+      NewArchiveMember::getFile(FileName, Deterministic);
+  failIfError(NMOrErr.takeError(), FileName);
+
+  // Use the basename of the object path for the member name.
+  NMOrErr->MemberName = sys::path::filename(NMOrErr->MemberName);
+
+  if (Pos == -1)
+    Members.push_back(std::move(*NMOrErr));
+  else
+    Members[Pos] = std::move(*NMOrErr);
+}
+
+static void addMember(std::vector<NewArchiveMember> &Members,
+                      const object::Archive::Child &M, int Pos = -1) {
+  if (Thin && !M.getParent()->isThin())
+    fail("Cannot convert a regular archive to a thin one");
+  Expected<NewArchiveMember> NMOrErr =
+      NewArchiveMember::getOldMember(M, Deterministic);
+  failIfError(NMOrErr.takeError());
+  if (Pos == -1)
+    Members.push_back(std::move(*NMOrErr));
+  else
+    Members[Pos] = std::move(*NMOrErr);
+}
+
+static void addLibMember(std::vector<NewArchiveMember> &Members,
+                         StringRef FileName) {
+  Expected<NewArchiveMember> NMOrErr =
+      NewArchiveMember::getFile(FileName, Deterministic);
+  failIfError(NMOrErr.takeError(), FileName);
+  if (identify_magic(NMOrErr->Buf->getBuffer()) == file_magic::archive) {
+    object::Archive &Lib = readLibrary(FileName);
+    Error Err = Error::success();
+
+    for (auto &Child : Lib.children(Err))
+      addMember(Members, Child);
+
+    failIfError(std::move(Err));
+  } else {
+    // Use the basename of the object path for the member name.
+    NMOrErr->MemberName = sys::path::filename(NMOrErr->MemberName);
+    Members.push_back(std::move(*NMOrErr));
+  }
+}
+
+enum InsertAction {
+  IA_AddOldMember,
+  IA_AddNewMember,
+  IA_Delete,
+  IA_MoveOldMember,
+  IA_MoveNewMember
+};
+
+static InsertAction computeInsertAction(ArchiveOperation Operation,
+                                        const object::Archive::Child &Member,
+                                        StringRef Name,
+                                        std::vector<StringRef>::iterator &Pos) {
+  if (Operation == QuickAppend || Members.empty())
+    return IA_AddOldMember;
+
+  auto MI = find_if(Members, [Name](StringRef Path) {
+    return Name == sys::path::filename(Path);
+  });
+
+  if (MI == Members.end())
+    return IA_AddOldMember;
+
+  Pos = MI;
+
+  if (Operation == Delete)
+    return IA_Delete;
+
+  if (Operation == Move)
+    return IA_MoveOldMember;
+
+  if (Operation == ReplaceOrInsert) {
+    StringRef PosName = sys::path::filename(RelPos);
+    if (!OnlyUpdate) {
+      if (PosName.empty())
+        return IA_AddNewMember;
+      return IA_MoveNewMember;
+    }
+
+    // We could try to optimize this to a fstat, but it is not a common
+    // operation.
+    sys::fs::file_status Status;
+    failIfError(sys::fs::status(*MI, Status), *MI);
+    auto ModTimeOrErr = Member.getLastModified();
+    failIfError(ModTimeOrErr.takeError());
+    if (Status.getLastModificationTime() < ModTimeOrErr.get()) {
+      if (PosName.empty())
+        return IA_AddOldMember;
+      return IA_MoveOldMember;
+    }
+
+    if (PosName.empty())
+      return IA_AddNewMember;
+    return IA_MoveNewMember;
+  }
+  llvm_unreachable("No such operation");
+}
+
+// We have to walk this twice and computing it is not trivial, so creating an
+// explicit std::vector is actually fairly efficient.
+static std::vector<NewArchiveMember>
+computeNewArchiveMembers(ArchiveOperation Operation,
+                         object::Archive *OldArchive) {
+  std::vector<NewArchiveMember> Ret;
+  std::vector<NewArchiveMember> Moved;
+  int InsertPos = -1;
+  StringRef PosName = sys::path::filename(RelPos);
+  if (OldArchive) {
+    Error Err = Error::success();
+    for (auto &Child : OldArchive->children(Err)) {
+      int Pos = Ret.size();
+      Expected<StringRef> NameOrErr = Child.getName();
+      failIfError(NameOrErr.takeError());
+      StringRef Name = NameOrErr.get();
+      if (Name == PosName) {
+        assert(AddAfter || AddBefore);
+        if (AddBefore)
+          InsertPos = Pos;
+        else
+          InsertPos = Pos + 1;
+      }
+
+      std::vector<StringRef>::iterator MemberI = Members.end();
+      InsertAction Action =
+          computeInsertAction(Operation, Child, Name, MemberI);
+      switch (Action) {
+      case IA_AddOldMember:
+        addMember(Ret, Child);
+        break;
+      case IA_AddNewMember:
+        addMember(Ret, *MemberI);
+        break;
+      case IA_Delete:
+        break;
+      case IA_MoveOldMember:
+        addMember(Moved, Child);
+        break;
+      case IA_MoveNewMember:
+        addMember(Moved, *MemberI);
+        break;
+      }
+      if (MemberI != Members.end())
+        Members.erase(MemberI);
+    }
+    failIfError(std::move(Err));
+  }
+
+  if (Operation == Delete)
+    return Ret;
+
+  if (!RelPos.empty() && InsertPos == -1)
+    fail("Insertion point not found");
+
+  if (RelPos.empty())
+    InsertPos = Ret.size();
+
+  assert(unsigned(InsertPos) <= Ret.size());
+  int Pos = InsertPos;
+  for (auto &M : Moved) {
+    Ret.insert(Ret.begin() + Pos, std::move(M));
+    ++Pos;
+  }
+
+  if (AddLibrary) {
+    assert(Operation == QuickAppend);
+    for (auto &Member : Members)
+      addLibMember(Ret, Member);
+    return Ret;
+  }
+
+  for (unsigned I = 0; I != Members.size(); ++I)
+    Ret.insert(Ret.begin() + InsertPos, NewArchiveMember());
+  Pos = InsertPos;
+  for (auto &Member : Members) {
+    addMember(Ret, Member, Pos);
+    ++Pos;
+  }
+
+  return Ret;
+}
+
+static object::Archive::Kind getDefaultForHost() {
+  return Triple(sys::getProcessTriple()).isOSDarwin()
+             ? object::Archive::K_DARWIN
+             : object::Archive::K_GNU;
+}
+
+static object::Archive::Kind getKindFromMember(const NewArchiveMember &Member) {
+  Expected<std::unique_ptr<object::ObjectFile>> OptionalObject =
+      object::ObjectFile::createObjectFile(Member.Buf->getMemBufferRef());
+
+  if (OptionalObject)
+    return isa<object::MachOObjectFile>(**OptionalObject)
+               ? object::Archive::K_DARWIN
+               : object::Archive::K_GNU;
+
+  // squelch the error in case we had a non-object file
+  consumeError(OptionalObject.takeError());
+  return getDefaultForHost();
+}
+
+static void
+performWriteOperation(ArchiveOperation Operation,
+                      object::Archive *OldArchive,
+                      std::unique_ptr<MemoryBuffer> OldArchiveBuf,
+                      std::vector<NewArchiveMember> *NewMembersP) {
+  std::vector<NewArchiveMember> NewMembers;
+  if (!NewMembersP)
+    NewMembers = computeNewArchiveMembers(Operation, OldArchive);
+
+  object::Archive::Kind Kind;
+  switch (FormatType) {
+  case Default:
+    if (Thin)
+      Kind = object::Archive::K_GNU;
+    else if (OldArchive)
+      Kind = OldArchive->kind();
+    else if (NewMembersP)
+      Kind = NewMembersP->size() ? getKindFromMember(NewMembersP->front())
+                                 : getDefaultForHost();
+    else
+      Kind = NewMembers.size() ? getKindFromMember(NewMembers.front())
+                               : getDefaultForHost();
+    break;
+  case GNU:
+    Kind = object::Archive::K_GNU;
+    break;
+  case BSD:
+    if (Thin)
+      fail("Only the gnu format has a thin mode");
+    Kind = object::Archive::K_BSD;
+    break;
+  case DARWIN:
+    if (Thin)
+      fail("Only the gnu format has a thin mode");
+    Kind = object::Archive::K_DARWIN;
+    break;
+  case Unknown:
+    llvm_unreachable("");
+  }
+
+  Error E =
+      writeArchive(ArchiveName, NewMembersP ? *NewMembersP : NewMembers, Symtab,
+                   Kind, Deterministic, Thin, std::move(OldArchiveBuf));
+  failIfError(std::move(E), ArchiveName);
+}
+
+static void createSymbolTable(object::Archive *OldArchive) {
+  // When an archive is created or modified, if the s option is given, the
+  // resulting archive will have a current symbol table. If the S option
+  // is given, it will have no symbol table.
+  // In summary, we only need to update the symbol table if we have none.
+  // This is actually very common because of broken build systems that think
+  // they have to run ranlib.
+  if (OldArchive->hasSymbolTable())
+    return;
+
+  performWriteOperation(CreateSymTab, OldArchive, nullptr, nullptr);
+}
+
+static void performOperation(ArchiveOperation Operation,
+                             object::Archive *OldArchive,
+                             std::unique_ptr<MemoryBuffer> OldArchiveBuf,
+                             std::vector<NewArchiveMember> *NewMembers) {
+  switch (Operation) {
+  case Print:
+  case DisplayTable:
+  case Extract:
+    performReadOperation(Operation, OldArchive);
+    return;
+
+  case Delete:
+  case Move:
+  case QuickAppend:
+  case ReplaceOrInsert:
+    performWriteOperation(Operation, OldArchive, std::move(OldArchiveBuf),
+                          NewMembers);
+    return;
+  case CreateSymTab:
+    createSymbolTable(OldArchive);
+    return;
+  }
+  llvm_unreachable("Unknown operation.");
+}
+
+static int performOperation(ArchiveOperation Operation,
+                            std::vector<NewArchiveMember> *NewMembers) {
+  // Create or open the archive object.
+  ErrorOr<std::unique_ptr<MemoryBuffer>> Buf =
+      MemoryBuffer::getFile(ArchiveName, -1, false);
+  std::error_code EC = Buf.getError();
+  if (EC && EC != errc::no_such_file_or_directory)
+    fail("error opening '" + ArchiveName + "': " + EC.message() + "!");
+
+  if (!EC) {
+    Error Err = Error::success();
+    object::Archive Archive(Buf.get()->getMemBufferRef(), Err);
+    EC = errorToErrorCode(std::move(Err));
+    failIfError(EC,
+                "error loading '" + ArchiveName + "': " + EC.message() + "!");
+    performOperation(Operation, &Archive, std::move(Buf.get()), NewMembers);
+    return 0;
+  }
+
+  assert(EC == errc::no_such_file_or_directory);
+
+  if (!shouldCreateArchive(Operation)) {
+    failIfError(EC, Twine("error loading '") + ArchiveName + "'");
+  } else {
+    if (!Create) {
+      // Produce a warning if we should and we're creating the archive
+      errs() << ToolName << ": creating " << ArchiveName << "\n";
+    }
+  }
+
+  performOperation(Operation, nullptr, nullptr, NewMembers);
+  return 0;
+}
+
+static void runMRIScript() {
+  enum class MRICommand { AddLib, AddMod, Create, Delete, Save, End, Invalid };
+
+  ErrorOr<std::unique_ptr<MemoryBuffer>> Buf = MemoryBuffer::getSTDIN();
+  failIfError(Buf.getError());
+  const MemoryBuffer &Ref = *Buf.get();
+  bool Saved = false;
+  std::vector<NewArchiveMember> NewMembers;
+
+  for (line_iterator I(Ref, /*SkipBlanks*/ false), E; I != E; ++I) {
+    StringRef Line = *I;
+    Line = Line.split(';').first;
+    Line = Line.split('*').first;
+    Line = Line.trim();
+    if (Line.empty())
+      continue;
+    StringRef CommandStr, Rest;
+    std::tie(CommandStr, Rest) = Line.split(' ');
+    Rest = Rest.trim();
+    if (!Rest.empty() && Rest.front() == '"' && Rest.back() == '"')
+      Rest = Rest.drop_front().drop_back();
+    auto Command = StringSwitch<MRICommand>(CommandStr.lower())
+                       .Case("addlib", MRICommand::AddLib)
+                       .Case("addmod", MRICommand::AddMod)
+                       .Case("create", MRICommand::Create)
+                       .Case("delete", MRICommand::Delete)
+                       .Case("save", MRICommand::Save)
+                       .Case("end", MRICommand::End)
+                       .Default(MRICommand::Invalid);
+
+    switch (Command) {
+    case MRICommand::AddLib: {
+      object::Archive &Lib = readLibrary(Rest);
+      {
+        Error Err = Error::success();
+        for (auto &Member : Lib.children(Err))
+          addMember(NewMembers, Member);
+        failIfError(std::move(Err));
+      }
+      break;
+    }
+    case MRICommand::AddMod:
+      addMember(NewMembers, Rest);
+      break;
+    case MRICommand::Create:
+      Create = true;
+      if (!ArchiveName.empty())
+        fail("Editing multiple archives not supported");
+      if (Saved)
+        fail("File already saved");
+      ArchiveName = Rest;
+      break;
+    case MRICommand::Delete: {
+      StringRef Name = sys::path::filename(Rest);
+      llvm::erase_if(NewMembers,
+                     [=](NewArchiveMember &M) { return M.MemberName == Name; });
+      break;
+    }
+    case MRICommand::Save:
+      Saved = true;
+      break;
+    case MRICommand::End:
+      break;
+    case MRICommand::Invalid:
+      fail("Unknown command: " + CommandStr);
+    }
+  }
+
+  // Nothing to do if not saved.
+  if (Saved)
+    performOperation(ReplaceOrInsert, &NewMembers);
+  exit(0);
+}
+
+static bool handleGenericOption(StringRef arg) {
+  if (arg == "-help" || arg == "--help") {
+    printHelpMessage();
+    return true;
+  }
+  if (arg == "-version" || arg == "--version") {
+    cl::PrintVersionMessage();
+    return true;
+  }
+  return false;
+}
+
+static int ar_main(int argc, char **argv) {
+  SmallVector<const char *, 0> Argv(argv, argv + argc);
+  BumpPtrAllocator Alloc;
+  StringSaver Saver(Alloc);
+  cl::ExpandResponseFiles(Saver, cl::TokenizeGNUCommandLine, Argv);
+  for(size_t i = 1; i < Argv.size(); ++i) {
+    StringRef Arg = Argv[i];
+    const char *match;
+    auto MatchFlagWithArg = [&](const char *expected) {
+      size_t len = strlen(expected);
+      if (Arg == expected) {
+        if (++i >= Argv.size())
+          fail(std::string(expected) + " requires an argument");
+        match = Argv[i];
+        return true;
+      }
+      if (Arg.startswith(expected) && Arg.size() > len &&
+                 Arg[len] == '=') {
+        match = Arg.data() + len + 1;
+        return true;
+      }
+      return false;
+    };
+    if (handleGenericOption(Argv[i]))
+      return 0;
+    if (Arg == "--") {
+      for(; i < Argv.size(); ++i)
+        PositionalArgs.push_back(Argv[i]);
+      break;
+    }
+    if (Arg[0] == '-') {
+      if (Arg.startswith("--"))
+        Arg = Argv[i] + 2;
+      else
+        Arg = Argv[i] + 1;
+      if (Arg == "M") {
+        MRI = true;
+      } else if (MatchFlagWithArg("format")) {
+        FormatType = StringSwitch<Format>(match)
+            .Case("default", Default)
+            .Case("gnu", GNU)
+            .Case("darwin", DARWIN)
+            .Case("bsd", BSD)
+            .Default(Unknown);
+        if (FormatType == Unknown)
+          fail(std::string("Invalid format ") + match);
+      } else if (MatchFlagWithArg("plugin")) {
+        // Ignored.
+      } else {
+        Options += Argv[i] + 1;
+      }
+    } else if (Options.empty()) {
+      Options += Argv[i];
+    } else {
+      PositionalArgs.push_back(Argv[i]);
+    }
+  }
+  ArchiveOperation Operation = parseCommandLine();
+  return performOperation(Operation, nullptr);
+}
+
+static int ranlib_main(int argc, char **argv) {
+  bool ArchiveSpecified = false;
+  for(int i = 1; i < argc; ++i) {
+    if (handleGenericOption(argv[i])) {
+      return 0;
+    } else {
+      if (ArchiveSpecified)
+        fail("Exactly one archive should be specified");
+      ArchiveSpecified = true;
+      ArchiveName = argv[i];
+    }
+  }
+  return performOperation(CreateSymTab, nullptr);
+}
+
+int main(int argc, char **argv) {
+  InitLLVM X(argc, argv);
+  ToolName = argv[0];
+
+  llvm::InitializeAllTargetInfos();
+  llvm::InitializeAllTargetMCs();
+  llvm::InitializeAllAsmParsers();
+
+  Stem = sys::path::stem(ToolName);
+  if (Stem.contains_lower("dlltool"))
+    return dlltoolDriverMain(makeArrayRef(argv, argc));
+
+  if (Stem.contains_lower("ranlib"))
+    return ranlib_main(argc, argv);
+
+  if (Stem.contains_lower("lib"))
+    return libDriverMain(makeArrayRef(argv, argc));
+
+  if (Stem.contains_lower("ar"))
+    return ar_main(argc, argv);
+  fail("Not ranlib, ar, lib or dlltool!");
+}
-- 
GitLab


From f01b04819ecb40ad1fbd06bf75b1a0ca1c4f3ca3 Mon Sep 17 00:00:00 2001
From: Francis Visoiu Mistrih <francisvm@yahoo.com>
Date: Fri, 26 Oct 2018 13:37:25 +0000
Subject: [PATCH 0632/1116] [CodeGen] Remove out operands from PATCHABLE_OP

The current model requires 1 out operand, but it is not used nor created.

This fixed an x86 machine verifier issue.

Part of PR27481.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345384 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Target/Target.td          | 2 +-
 test/CodeGen/X86/patchable-prologue.ll | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/llvm/Target/Target.td b/include/llvm/Target/Target.td
index 96641dda700..abb1bb431f6 100644
--- a/include/llvm/Target/Target.td
+++ b/include/llvm/Target/Target.td
@@ -1104,7 +1104,7 @@ def FAULTING_OP : StandardPseudoInstruction {
   let isBranch = 1;
 }
 def PATCHABLE_OP : StandardPseudoInstruction {
-  let OutOperandList = (outs unknown:$dst);
+  let OutOperandList = (outs);
   let InOperandList = (ins variable_ops);
   let usesCustomInserter = 1;
   let mayLoad = 1;
diff --git a/test/CodeGen/X86/patchable-prologue.ll b/test/CodeGen/X86/patchable-prologue.ll
index c8daff33181..3779bc39531 100644
--- a/test/CodeGen/X86/patchable-prologue.ll
+++ b/test/CodeGen/X86/patchable-prologue.ll
@@ -1,5 +1,5 @@
-; RUN: llc -filetype=obj -o - -mtriple=x86_64-apple-macosx < %s | llvm-objdump -triple x86_64-apple-macosx -disassemble - | FileCheck %s
-; RUN: llc -mtriple=x86_64-apple-macosx < %s | FileCheck %s --check-prefix=CHECK-ALIGN
+; RUN: llc -verify-machineinstrs -filetype=obj -o - -mtriple=x86_64-apple-macosx < %s | llvm-objdump -triple x86_64-apple-macosx -disassemble - | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-apple-macosx < %s | FileCheck %s --check-prefix=CHECK-ALIGN
 
 declare void @callee(i64*)
 
-- 
GitLab


From 1911e927c06a16b98c5f95d3285ceca0fc8e5f6d Mon Sep 17 00:00:00 2001
From: Sjoerd Meijer <sjoerd.meijer@arm.com>
Date: Fri, 26 Oct 2018 14:19:57 +0000
Subject: [PATCH 0633/1116] [ARM] Fix ARMCodeGenPrepare test cases

While working on FileCheck producing better diagnostics in D53710, I noticed
that our test case is broken in a few different ways. The test was running, but
results were not checked as prefix CHECK-COMMON wasn't defined (which is what
FileCheck should warn about). Also, the output was different in 2 cases because
of recent changes in ARMCodeGenPrepare.

Differential Revision: https://reviews.llvm.org/D53746


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345386 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/ARM/arm-cgp-calls.ll | 62 +++++++++++++++----------------
 1 file changed, 30 insertions(+), 32 deletions(-)

diff --git a/test/CodeGen/ARM/arm-cgp-calls.ll b/test/CodeGen/ARM/arm-cgp-calls.ll
index b9cff6e307a..5972980b8d6 100644
--- a/test/CodeGen/ARM/arm-cgp-calls.ll
+++ b/test/CodeGen/ARM/arm-cgp-calls.ll
@@ -2,8 +2,8 @@
 ; RUN: llc -mtriple=armv8 -arm-disable-cgp=false %s -o - | FileCheck %s
 
 ; Check that the pass doesn't try to promote the immediate parameters.
-; CHECK-COMMON-LABEL: call_with_imms
-; CHECK-COMMON-NOT:   uxt
+; CHECK-LABEL: call_with_imms
+; CHECK-NOT:   uxt
 define i8 @call_with_imms(i8* %arg) {
   %call = tail call arm_aapcs_vfpcc zeroext i8 @dummy2(i8* nonnull %arg, i8 zeroext 0, i8 zeroext 0)
   %cmp = icmp eq i8 %call, 0
@@ -12,23 +12,23 @@ define i8 @call_with_imms(i8* %arg) {
 }
 
 ; Test that the call result is still extended.
-; CHECK-COMMON-LABEL: test_call:
-; CHECK-COMMON: bl
-; CHECK-COMMONNEXT: sxtb r1, r0
+; CHECK-LABEL: test_call:
+; CHECK: bl
+; CHECK-NEXT: sxtb r1, r0
 define i16 @test_call(i8 zeroext %arg) {
   %call = call i8 @dummy_i8(i8 %arg)
   %cmp = icmp ult i8 %call, 128
   %conv = zext i1 %cmp to i16
-  ret i16 %conv 
+  ret i16 %conv
 }
 
 ; Test that the transformation bails when it finds that i16 is larger than i8.
 ; TODO: We should be able to remove the uxtb in these cases.
 ; CHECK-LABEL: promote_i8_sink_i16_1
-; CHECK-COMMON: bl dummy_i8
-; CHECK-COMMON: adds r0, #1
-; CHECK-COMMON: uxtb r0, r0
-; CHECK-COMMON: cmp r0
+; CHECK: bl dummy_i8
+; CHECK: add{{.*}} r0, #1
+; CHECK: uxtb r0, r0
+; CHECK: cmp r0
 define i16 @promote_i8_sink_i16_1(i8 zeroext %arg0, i16 zeroext %arg1, i16 zeroext %arg2) {
   %call = tail call zeroext i8 @dummy_i8(i8 %arg0)
   %add = add nuw i8 %call, 1
@@ -39,11 +39,11 @@ define i16 @promote_i8_sink_i16_1(i8 zeroext %arg0, i16 zeroext %arg1, i16 zeroe
   ret i16 %res
 }
 
-; CHECK-COMMON-LABEL: promote_i8_sink_i16_2
-; CHECK-COMMON: bl dummy_i8
-; CHECK-COMMON: adds r0, #1
-; CHECK-COMMON-NOT: uxt
-; CHECK-COMMON: cmp r0
+; CHECK-LABEL: promote_i8_sink_i16_2
+; CHECK: bl dummy_i8
+; CHECK: add{{.*}} r0, #1
+; CHECK-NOT: uxt
+; CHECK: cmp r0
 define i16 @promote_i8_sink_i16_2(i8 zeroext %arg0, i8 zeroext %arg1, i16 zeroext %arg2) {
   %call = tail call zeroext i8 @dummy_i8(i8 %arg0)
   %add = add nuw i8 %call, 1
@@ -57,9 +57,9 @@ define i16 @promote_i8_sink_i16_2(i8 zeroext %arg0, i8 zeroext %arg1, i16 zeroex
 @uc = global i8 42, align 1
 @LL = global i64 0, align 8
 
-; CHECK-COMMON-LABEL: zext_i64
-; CHECK-COMMON: ldrb
-; CHECK-COMMON: strd
+; CHECK-LABEL: zext_i64
+; CHECK: ldrb
+; CHECK: strd
 define void @zext_i64() {
 entry:
   %0 = load i8, i8* @uc, align 1
@@ -74,8 +74,8 @@ entry:
 @a = global i16* null, align 4
 @b = global i32 0, align 4
 
-; CHECK-COMMON-LABEL: constexpr
-; CHECK-COMMON: uxth
+; CHECK-LABEL: constexpr
+; CHECK: uxth
 define i32 @constexpr() {
 entry:
   store i32 ptrtoint (i32* @b to i32), i32* @b, align 4
@@ -89,12 +89,11 @@ entry:
   ret i32 undef
 }
 
-; The call to safe_lshift_func takes two parameters, but they're the same value just one is zext.
-; The transform won't happen because of the zext.
-; CHECK-COMMON-LABEL: call_zext_i8_i32
-; CHECK-COMMON-NOT: uxt
-; CHECK-COMMON: cmp
-; CHECK-COMMON: uxtb
+; The call to safe_lshift_func takes two parameters, but they're the same value
+; just one is zext. We do support zext now, so the transformation should
+; trigger and we don't want see uxtb here.
+; CHECK-LABEL: call_zext_i8_i32
+; CHECK-NOT: uxt
 define fastcc i32 @call_zext_i8_i32(i32 %p_45, i8 zeroext %p_46) {
 for.cond8.preheader:
   %call217 = call fastcc zeroext i8 @safe_mul_func_uint8_t_u_u(i8 zeroext undef)
@@ -119,9 +118,9 @@ for.end411:                                       ; preds = %for.cond8.preheader
 @g_82 = hidden local_unnamed_addr global i32 0, align 4
 
 ; Test that the transform bails on finding %conv4, a trunc
-; CHECK-COMMON-LABEL: call_return_pointer
-; CHECK-COMMON: sxth
-; CHECK-COMMON-NOT: uxt
+; CHECK-LABEL: call_return_pointer
+; CHECK: sxth
+; CHECK: uxt
 define hidden i32 @call_return_pointer(i8 zeroext %p_13) local_unnamed_addr #0 {
 entry:
   %conv1 = zext i8 %p_13 to i16
@@ -147,9 +146,8 @@ if.then:                                          ; preds = %for.cond
 
 ; Transform will bail because of the zext
 ; Check that d.sroa.0.0.be is promoted passed directly into the tail call.
-; CHECK-COMMON-LABEL: check_zext_phi_call_arg
-; CHECK-COMMON: uxt
-; CHECK-COMMON: uxt
+; CHECK-LABEL: check_zext_phi_call_arg
+; CHECK: uxt
 define i32 @check_zext_phi_call_arg() {
 entry:
   br label %for.cond
-- 
GitLab


From 79fad5df81b56600575274d4d99b5b9d1986866a Mon Sep 17 00:00:00 2001
From: Max Kazantsev <max.kazantsev@azul.com>
Date: Fri, 26 Oct 2018 14:20:11 +0000
Subject: [PATCH 0634/1116] [SimpleLoopUnswitch] Unswitch by experimental.guard
 intrinsics

This patch adds support of `llvm.experimental.guard` intrinsics to non-trivial
simple loop unswitching. These intrinsics represent implicit control flow which
has pretty much the same semantics as usual conditional branches. The
algorithm of dealing with them is following:

- Consider guards as unswitching candidates;
- If a guard is considered the best candidate, turn it into a branch;
- Apply normal unswitching algorithm on this branch.

The patch has no compile time effect on code that does not contain any guards.

Differential Revision: https://reviews.llvm.org/D53744
Reviewed By: chandlerc


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345387 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/SimpleLoopUnswitch.cpp | 109 ++++++++-
 test/Transforms/SimpleLoopUnswitch/guards.ll | 238 +++++++++++++++++++
 2 files changed, 345 insertions(+), 2 deletions(-)
 create mode 100644 test/Transforms/SimpleLoopUnswitch/guards.ll

diff --git a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index e8f67a689f4..8b6935fa039 100644
--- a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -19,6 +19,7 @@
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/GuardUtils.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopAnalysisManager.h"
 #include "llvm/Analysis/LoopInfo.h"
@@ -59,6 +60,7 @@ using namespace llvm;
 
 STATISTIC(NumBranches, "Number of branches unswitched");
 STATISTIC(NumSwitches, "Number of switches unswitched");
+STATISTIC(NumGuards, "Number of guards turned into branches for unswitching");
 STATISTIC(NumTrivial, "Number of unswitches that are trivial");
 
 static cl::opt<bool> EnableNonTrivialUnswitch(
@@ -70,6 +72,11 @@ static cl::opt<int>
     UnswitchThreshold("unswitch-threshold", cl::init(50), cl::Hidden,
                       cl::desc("The cost threshold for unswitching a loop."));
 
+static cl::opt<bool> UnswitchGuards(
+    "simple-loop-unswitch-guards", cl::init(true), cl::Hidden,
+    cl::desc("If enabled, simple loop unswitching will also consider "
+             "llvm.experimental.guard intrinsics as unswitch candidates."));
+
 /// Collect all of the loop invariant input values transitively used by the
 /// homogeneous instruction graph from a given root.
 ///
@@ -2169,6 +2176,77 @@ computeDomSubtreeCost(DomTreeNode &N,
   return Cost;
 }
 
+/// Turns a llvm.experimental.guard intrinsic into implicit control flow branch,
+/// making the following replacement:
+///
+///   <code before guard>
+///   call void (i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ]
+///   <code after guard>
+///
+/// into
+///
+///   <code before guard>
+///   br i1 %cond, label %guarded, label %deopt
+///
+/// guarded:
+///   <code after guard>
+///
+/// deopt:
+///   call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ]
+///   unreachable
+///
+/// It also makes all relevant DT and LI updates, so that all structures are in
+/// valid state after this transform.
+static BranchInst *
+turnGuardIntoBranch(IntrinsicInst *GI, Loop &L,
+                    SmallVectorImpl<BasicBlock *> &ExitBlocks,
+                    DominatorTree &DT, LoopInfo &LI) {
+  SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
+  LLVM_DEBUG(dbgs() << "Turning " << *GI << " into a branch.\n");
+  BasicBlock *CheckBB = GI->getParent();
+
+  // Remove all CheckBB's successors from DomTree. A block can be seen among
+  // successors more than once, but for DomTree it should be added only once.
+  SmallPtrSet<BasicBlock *, 4> Successors;
+  for (auto *Succ : successors(CheckBB))
+    if (Successors.insert(Succ).second)
+      DTUpdates.push_back({DominatorTree::Delete, CheckBB, Succ});
+
+  Instruction *DeoptBlockTerm =
+      SplitBlockAndInsertIfThen(GI->getArgOperand(0), GI, true);
+  BranchInst *CheckBI = cast<BranchInst>(CheckBB->getTerminator());
+  // SplitBlockAndInsertIfThen inserts control flow that branches to
+  // DeoptBlockTerm if the condition is true.  We want the opposite.
+  CheckBI->swapSuccessors();
+
+  BasicBlock *GuardedBlock = CheckBI->getSuccessor(0);
+  GuardedBlock->setName("guarded");
+  CheckBI->getSuccessor(1)->setName("deopt");
+
+  // We now have a new exit block.
+  ExitBlocks.push_back(CheckBI->getSuccessor(1));
+
+  GI->moveBefore(DeoptBlockTerm);
+  GI->setArgOperand(0, ConstantInt::getFalse(GI->getContext()));
+
+  // Add new successors of CheckBB into DomTree.
+  for (auto *Succ : successors(CheckBB))
+    DTUpdates.push_back({DominatorTree::Insert, CheckBB, Succ});
+
+  // Now the blocks that used to be CheckBB's successors are GuardedBlock's
+  // successors.
+  for (auto *Succ : Successors)
+    DTUpdates.push_back({DominatorTree::Insert, GuardedBlock, Succ});
+
+  // Make proper changes to DT.
+  DT.applyUpdates(DTUpdates);
+  // Inform LI of a new loop block.
+  L.addBasicBlockToLoop(GuardedBlock, LI);
+
+  ++NumGuards;
+  return CheckBI;
+}
+
 static bool
 unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI,
                       AssumptionCache &AC, TargetTransformInfo &TTI,
@@ -2178,10 +2256,29 @@ unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI,
   // loop which would be handled when visiting that inner loop).
   SmallVector<std::pair<Instruction *, TinyPtrVector<Value *>>, 4>
       UnswitchCandidates;
+
+  // Whether or not we should also collect guards in the loop.
+  bool CollectGuards = false;
+  if (UnswitchGuards) {
+    auto *GuardDecl = L.getHeader()->getParent()->getParent()->getFunction(
+        Intrinsic::getName(Intrinsic::experimental_guard));
+    if (GuardDecl && !GuardDecl->use_empty())
+      CollectGuards = true;
+  }
+
   for (auto *BB : L.blocks()) {
     if (LI.getLoopFor(BB) != &L)
       continue;
 
+    if (CollectGuards)
+      for (auto &I : *BB)
+        if (isGuard(&I)) {
+          auto *Cond = cast<IntrinsicInst>(&I)->getArgOperand(0);
+          // TODO: Support AND, OR conditions and partial unswitching.
+          if (!isa<Constant>(Cond) && L.isLoopInvariant(Cond))
+            UnswitchCandidates.push_back({&I, {Cond}});
+        }
+
     if (auto *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
       // We can only consider fully loop-invariant switch conditions as we need
       // to completely eliminate the switch after unswitching.
@@ -2346,9 +2443,12 @@ unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI,
     // Now scale the cost by the number of unique successors minus one. We
     // subtract one because there is already at least one copy of the entire
     // loop. This is computing the new cost of unswitching a condition.
-    assert(Visited.size() > 1 &&
+    // Note that guards always have 2 unique successors that are implicit and
+    // will be materialized if we decide to unswitch it.
+    int SuccessorsCount = isGuard(&TI) ? 2 : Visited.size();
+    assert(SuccessorsCount > 1 &&
            "Cannot unswitch a condition without multiple distinct successors!");
-    return Cost * (Visited.size() - 1);
+    return Cost * (SuccessorsCount - 1);
   };
   Instruction *BestUnswitchTI = nullptr;
   int BestUnswitchCost;
@@ -2375,6 +2475,11 @@ unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI,
     return false;
   }
 
+  // If the best candidate is a guard, turn it into a branch.
+  if (isGuard(BestUnswitchTI))
+    BestUnswitchTI = turnGuardIntoBranch(cast<IntrinsicInst>(BestUnswitchTI), L,
+                                         ExitBlocks, DT, LI);
+
   LLVM_DEBUG(dbgs() << "  Unswitching non-trivial (cost = "
                     << BestUnswitchCost << ") terminator: " << *BestUnswitchTI
                     << "\n");
diff --git a/test/Transforms/SimpleLoopUnswitch/guards.ll b/test/Transforms/SimpleLoopUnswitch/guards.ll
new file mode 100644
index 00000000000..95661c425e1
--- /dev/null
+++ b/test/Transforms/SimpleLoopUnswitch/guards.ll
@@ -0,0 +1,238 @@
+; RUN: opt -passes='loop(unswitch),verify<loops>' -enable-nontrivial-unswitch -simple-loop-unswitch-guards -S < %s | FileCheck %s
+; RUN: opt -simple-loop-unswitch -enable-nontrivial-unswitch -simple-loop-unswitch-guards -S < %s | FileCheck %s
+
+declare void @llvm.experimental.guard(i1, ...)
+
+define void @test_simple_case(i1 %cond, i32 %N) {
+; CHECK-LABEL: @test_simple_case(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]]
+; CHECK:       entry.split.us:
+; CHECK-NEXT:    br label [[LOOP_US:%.*]]
+; CHECK:       loop.us:
+; CHECK-NEXT:    [[IV_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], [[GUARDED_US:%.*]] ]
+; CHECK-NEXT:    br label [[GUARDED_US]]
+; CHECK:       guarded.us:
+; CHECK-NEXT:    [[IV_NEXT_US]] = add i32 [[IV_US]], 1
+; CHECK-NEXT:    [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[EXIT_SPLIT_US:%.*]]
+; CHECK:       deopt:
+; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ]
+; CHECK-NEXT:    unreachable
+;
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  call void (i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ]
+  %iv.next = add i32 %iv, 1
+  %loop.cond = icmp slt i32 %iv.next, %N
+  br i1 %loop.cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @test_two_guards(i1 %cond1, i1 %cond2, i32 %N) {
+; CHECK-LABEL: @test_two_guards(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[COND1:%.*]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]]
+; CHECK:       entry.split.us:
+; CHECK-NEXT:    br i1 [[COND2:%.*]], label [[ENTRY_SPLIT_US_SPLIT_US:%.*]], label [[ENTRY_SPLIT_US_SPLIT:%.*]]
+; CHECK:       entry.split.us.split.us:
+; CHECK-NEXT:    br label [[LOOP_US_US:%.*]]
+; CHECK:       loop.us.us:
+; CHECK-NEXT:    [[IV_US_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US_SPLIT_US]] ], [ [[IV_NEXT_US_US:%.*]], [[GUARDED_US2:%.*]] ]
+; CHECK-NEXT:    br label [[GUARDED_US_US:%.*]]
+; CHECK:       guarded.us.us:
+; CHECK-NEXT:    br label [[GUARDED_US2]]
+; CHECK:       guarded.us2:
+; CHECK-NEXT:    [[IV_NEXT_US_US]] = add i32 [[IV_US_US]], 1
+; CHECK-NEXT:    [[LOOP_COND_US_US:%.*]] = icmp slt i32 [[IV_NEXT_US_US]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[LOOP_COND_US_US]], label [[LOOP_US_US]], label [[EXIT_SPLIT_US_SPLIT_US:%.*]]
+; CHECK:       deopt1:
+; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ]
+; CHECK-NEXT:    unreachable
+; CHECK:       deopt:
+; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ]
+; CHECK-NEXT:    unreachable
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  call void (i1, ...) @llvm.experimental.guard(i1 %cond1) [ "deopt"() ]
+  call void (i1, ...) @llvm.experimental.guard(i1 %cond2) [ "deopt"() ]
+  %iv.next = add i32 %iv, 1
+  %loop.cond = icmp slt i32 %iv.next, %N
+  br i1 %loop.cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @test_conditional_guards(i1 %cond, i32 %N) {
+; CHECK-LABEL: @test_conditional_guards(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]]
+; CHECK:       entry.split.us:
+; CHECK-NEXT:    br label [[LOOP_US:%.*]]
+; CHECK:       loop.us:
+; CHECK-NEXT:    [[IV_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], [[BACKEDGE_US:%.*]] ]
+; CHECK-NEXT:    [[CONDITION_US:%.*]] = icmp eq i32 [[IV_US]], 123
+; CHECK-NEXT:    br i1 [[CONDITION_US]], label [[GUARD_US:%.*]], label [[BACKEDGE_US]]
+; CHECK:       guard.us:
+; CHECK-NEXT:    br label [[GUARDED_US:%.*]]
+; CHECK:       backedge.us:
+; CHECK-NEXT:    [[IV_NEXT_US]] = add i32 [[IV_US]], 1
+; CHECK-NEXT:    [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[EXIT_SPLIT_US:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    [[CONDITION:%.*]] = icmp eq i32 [[IV]], 123
+; CHECK-NEXT:    br i1 [[CONDITION]], label [[GUARD:%.*]], label [[BACKEDGE]]
+; CHECK:       guard:
+; CHECK-NEXT:    br label [[DEOPT:%.*]]
+; CHECK:       deopt:
+; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ]
+; CHECK-NEXT:    unreachable
+; CHECK:       backedge:
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[LOOP_COND]], label %loop, label [[EXIT_SPLIT:%.*]]
+;
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+  %condition = icmp eq i32 %iv, 123
+  br i1 %condition, label %guard, label %backedge
+
+guard:
+  call void (i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ]
+  br label %backedge
+
+backedge:
+  %iv.next = add i32 %iv, 1
+  %loop.cond = icmp slt i32 %iv.next, %N
+  br i1 %loop.cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @test_nested_loop(i1 %cond, i32 %N) {
+; CHECK-LABEL: @test_nested_loop(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[ENTRY_SPLIT:%.*]], label [[OUTER_LOOP_SPLIT:%.*]]
+; CHECK:       entry.split:
+; CHECK-NEXT:    br label [[OUTER_LOOP:%.*]]
+; CHECK:       outer_loop:
+; CHECK-NEXT:    br label [[OUTER_LOOP_SPLIT_US:%.*]]
+; CHECK:       outer_loop.split.us:
+; CHECK-NEXT:    br label [[LOOP_US:%.*]]
+; CHECK:       loop.us:
+; CHECK-NEXT:    [[IV_US:%.*]] = phi i32 [ 0, [[OUTER_LOOP_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], [[GUARDED_US:%.*]] ]
+; CHECK-NEXT:    br label [[GUARDED_US]]
+; CHECK:       guarded.us:
+; CHECK-NEXT:    [[IV_NEXT_US]] = add i32 [[IV_US]], 1
+; CHECK-NEXT:    [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[OUTER_BACKEDGE_SPLIT_US:%.*]]
+; CHECK:       outer_backedge.split.us:
+; CHECK-NEXT:    br label [[OUTER_BACKEDGE:%.*]]
+; CHECK:       deopt:
+; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ]
+; CHECK-NEXT:    unreachable
+; CHECK:       outer_backedge:
+; CHECK-NEXT:    br i1 false, label [[OUTER_LOOP]], label [[EXIT:%.*]]
+;
+
+entry:
+  br label %outer_loop
+
+outer_loop:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %outer_loop ], [ %iv.next, %loop ]
+  call void (i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ]
+  %iv.next = add i32 %iv, 1
+  %loop.cond = icmp slt i32 %iv.next, %N
+  br i1 %loop.cond, label %loop, label %outer_backedge
+
+outer_backedge:
+  br i1 undef, label %outer_loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @test_sibling_loops(i1 %cond1, i1 %cond2, i32 %N) {
+; CHECK-LABEL: @test_sibling_loops(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[COND1:%.*]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]]
+; CHECK:         [[IV1_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US]] ], [ [[IV1_NEXT_US:%.*]], [[GUARDED_US:%.*]] ]
+; CHECK-NEXT:    br label [[GUARDED_US]]
+; CHECK:         call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ]
+; CHECK-NEXT:    unreachable
+; CHECK:         [[IV2_US:%.*]] = phi i32 [ 0, [[BETWEEN:%.*]] ], [ [[IV1_NEXT_US2:%.*]], [[GUARDED_US2:%.*]] ]
+; CHECK-NEXT:    br label [[GUARDED_US2]]
+; CHECK:         call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ]
+; CHECK-NEXT:    unreachable
+;
+
+entry:
+  br label %loop1
+
+loop1:
+  %iv1 = phi i32 [ 0, %entry ], [ %iv1.next, %loop1 ]
+  call void (i1, ...) @llvm.experimental.guard(i1 %cond1) [ "deopt"() ]
+  %iv1.next = add i32 %iv1, 1
+  %loop1.cond = icmp slt i32 %iv1.next, %N
+  br i1 %loop1.cond, label %loop1, label %between
+
+between:
+  br label %loop2
+
+loop2:
+  %iv2 = phi i32 [ 0, %between ], [ %iv2.next, %loop2 ]
+  call void (i1, ...) @llvm.experimental.guard(i1 %cond2) [ "deopt"() ]
+  %iv2.next = add i32 %iv2, 1
+  %loop2.cond = icmp slt i32 %iv2.next, %N
+  br i1 %loop2.cond, label %loop2, label %exit
+
+exit:
+  ret void
+}
+
+; Check that we don't do anything because of cleanuppad.
+; CHECK-LABEL: @test_cleanuppad(
+; CHECK:       call void (i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ]
+; CHECK-NOT:   call void (i1, ...) @llvm.experimental.guard(
+define void @test_cleanuppad(i1 %cond, i32 %N) personality i32 (...)* @__CxxFrameHandler3 {
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  call void (i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ]
+  %iv.next = add i32 %iv, 1
+  invoke void @may_throw(i32 %iv) to label %loop unwind label %exit
+
+exit:
+  %cp = cleanuppad within none []
+  cleanupret from %cp unwind to caller
+
+}
+
+declare void @may_throw(i32 %i)
+declare i32 @__CxxFrameHandler3(...)
-- 
GitLab


From 49cd7dd878dc1358afd119b35b1999c52a301111 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 26 Oct 2018 14:39:28 +0000
Subject: [PATCH 0635/1116] [X86] Use existing pulled out VT variables. NFCI.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345388 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 1664a312aef..0426c801e79 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -6530,8 +6530,8 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
     MVT SrcVT = Src.getSimpleValueType();
     if (NumSizeInBits != SrcVT.getSizeInBits())
       break;
-    DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), VT.getScalarSizeInBits(),
-                         VT.getVectorNumElements(), Mask);
+    DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
+                         Mask);
     Ops.push_back(Src);
     return true;
   }
-- 
GitLab


From 8d07a140f255db9ac8230eac36578c65692e015c Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Fri, 26 Oct 2018 14:58:13 +0000
Subject: [PATCH 0636/1116] [x86] commute blendvb with constant condition op to
 allow load folding

This is a narrow fix for 1 of the problems mentioned in PR27780:
https://bugs.llvm.org/show_bug.cgi?id=27780

I looked at more general solutions, but it's a mess. We canonicalize shuffle masks
based on the number of elements accessed from each operand, and that's not optional.
If you remove that, we'll crash because we fail to match isel patterns. So I'm
waiting until we're sure that we have blendvb with constant condition and then
commuting based on the load potential. Other cases like blend-with-immediate are
already handled elsewhere, so this is probably not a common problem anyway.

I didn't use "MayFoldLoad" because that checks for one-use and in these cases, we've
screwed that up by creating a temporary PSHUFB using these operands that we're counting
on to be killed later. Undoing that didn't look like a simple task because it's
intertwined with determining if we actually use both operands of the shuffle or not.a

Differential Revision: https://reviews.llvm.org/D53737


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345390 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp         |  9 +++++++++
 test/CodeGen/X86/vector-shuffle-128-v16.ll | 12 +++++-------
 test/CodeGen/X86/vector-shuffle-256-v32.ll |  5 ++---
 3 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 0426c801e79..dd37010cbd8 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -10068,6 +10068,15 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
     // type.
     MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
 
+    // x86 allows load folding with blendvb from the 2nd source operand. But
+    // we are still using LLVM select here (see comment below), so that's V1.
+    // If V2 can be load-folded and V1 cannot be load-folded, then commute to
+    // allow that load-folding possibility.
+    if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
+      ShuffleVectorSDNode::commuteMask(Mask);
+      std::swap(V1, V2);
+    }
+
     // Compute the VSELECT mask. Note that VSELECT is really confusing in the
     // mix of LLVM's code generator and the x86 backend. We tell the code
     // generator that boolean values in the elements of an x86 vector register
diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll
index d2410050b49..bf34c0332dd 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -601,17 +601,15 @@ define <16 x i8> @load_fold_pblendvb(<16 x i8>* %px, <16 x i8> %y) {
 ; SSE41-LABEL: load_fold_pblendvb:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    movdqa (%rdi), %xmm2
-; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
-; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
+; SSE41-NEXT:    pblendvb %xmm0, (%rdi), %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1OR2-LABEL: load_fold_pblendvb:
 ; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vmovdqa (%rdi), %xmm1
-; AVX1OR2-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
-; AVX1OR2-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1OR2-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
+; AVX1OR2-NEXT:    vpblendvb %xmm1, (%rdi), %xmm0, %xmm0
 ; AVX1OR2-NEXT:    retq
 ;
 ; AVX512VL-LABEL: load_fold_pblendvb:
diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll
index 8189be0311c..c4759ab54f5 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -1656,9 +1656,8 @@ define <32 x i8> @load_fold_pblendvb(<32 x i8>* %px, <32 x i8> %y) {
 ;
 ; AVX2-LABEL: load_fold_pblendvb:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
-; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
+; AVX2-NEXT:    vpblendvb %ymm1, (%rdi), %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VL-LABEL: load_fold_pblendvb:
-- 
GitLab


From 35e2b773d7ace3c8900b21feeec5b5804804d261 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 26 Oct 2018 15:19:02 +0000
Subject: [PATCH 0637/1116] [X86][SSE] Move 2-input limit up from
 getFauxShuffleMask to resolveTargetShuffleInputs

Makes no difference to actual shuffle decoding yet, but merges all the existing limits in one place for when proper support is fixed.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345395 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index dd37010cbd8..bfd8c89599b 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -6325,9 +6325,6 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
     if (!resolveTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG) ||
         !resolveTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG))
       return false;
-    // TODO - Add support for more than 2 inputs.
-    if ((SrcInputs0.size() + SrcInputs1.size()) > 2)
-      return false;
     int MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
     SmallVector<int, 64> Mask0, Mask1;
     scaleShuffleMask<int>(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
@@ -6387,8 +6384,7 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
         Mask[i + InsertIdx] = (NumElts * (1 + InputIdx)) + ExtractIdx + M;
       }
     }
-    // TODO - Add support for more than 1 subinput.
-    return Ops.size() <= 2;
+    return true;
   }
   case ISD::SCALAR_TO_VECTOR: {
     // Match against a scalar_to_vector of an extract from a vector,
@@ -6581,7 +6577,7 @@ static bool resolveTargetShuffleInputs(SDValue Op,
       return false;
 
   resolveTargetShuffleInputsAndMask(Inputs, Mask);
-  return true;
+  return Inputs.size() <= 2;
 }
 
 /// Returns the scalar element that will make up the ith
-- 
GitLab


From 1ba9cfd74f152b17bed0d61e89753d077a491324 Mon Sep 17 00:00:00 2001
From: Francis Visoiu Mistrih <francisvm@yahoo.com>
Date: Fri, 26 Oct 2018 16:00:29 +0000
Subject: [PATCH 0638/1116] [MIR] Simplify and move MIR test

Also fixes a Machine Verifier issue.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345396 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/Generic/zero-probability.mir | 39 -----------------------
 test/CodeGen/MIR/X86/zero-probability.mir | 13 ++++++++
 2 files changed, 13 insertions(+), 39 deletions(-)
 delete mode 100644 test/CodeGen/Generic/zero-probability.mir
 create mode 100644 test/CodeGen/MIR/X86/zero-probability.mir

diff --git a/test/CodeGen/Generic/zero-probability.mir b/test/CodeGen/Generic/zero-probability.mir
deleted file mode 100644
index 6a9ab67cb26..00000000000
--- a/test/CodeGen/Generic/zero-probability.mir
+++ /dev/null
@@ -1,39 +0,0 @@
-# RUN: llc -o /dev/null %s 
-# REQUIRES: asserts
-# Makes sure that having a probability of 0x00000000 to branch to a successor
-# doesn't hit an APInt assert in the MIParser.
-
---- |
-  define i32 @main() local_unnamed_addr #0 {
-  entry:
-    ret i32 0
-  
-  other:
-    ret i32 0
-  }
-  
-  attributes #0 = { nounwind }
-  
-  !llvm.module.flags = !{!0, !1}
-  !llvm.ident = !{!2}
-  
-  !0 = !{i32 1, !"wchar_size", i32 4}
-  !1 = !{i32 7, !"PIC Level", i32 2}
-  !2 = !{!"clang version 6.0.0"}
-  !3 = !{!"branch_weights", i32 0, i32 -1}
-
-...
----
-name:            main
-alignment:       2
-exposesReturnsTwice: false
-legalized:       false
-regBankSelected: false
-selected:        false
-tracksRegLiveness: true   
-body:             |
-  bb.0.entry:
-    successors: %bb.1.other(0x00000000)
-  bb.1.other:
-
-...
diff --git a/test/CodeGen/MIR/X86/zero-probability.mir b/test/CodeGen/MIR/X86/zero-probability.mir
new file mode 100644
index 00000000000..c6863dfbbda
--- /dev/null
+++ b/test/CodeGen/MIR/X86/zero-probability.mir
@@ -0,0 +1,13 @@
+# RUN: llc -run-pass=none -o /dev/null %s
+# REQUIRES: asserts
+# Makes sure that having a probability of 0x00000000 to branch to a successor
+# doesn't hit an APInt assert in the MIParser.
+
+---
+name:            main
+body:             |
+  bb.0:
+    successors: %bb.1(0x00000000)
+  bb.1:
+
+...
-- 
GitLab


From 78b827929dbba04cf608d491f0b79ab95046f860 Mon Sep 17 00:00:00 2001
From: Vlad Tsyrklevich <vlad@tsyrklevich.net>
Date: Fri, 26 Oct 2018 16:07:50 +0000
Subject: [PATCH 0639/1116] Revert "UBSan blacklist workaround for bot
 timeouts"

This reverts commit r335525. This workaround is no longer necessary
because PR37929 has been fixed.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345397 91177308-0d34-0410-b5e6-96231b3b80d8
---
 utils/sanitizers/ubsan_blacklist.txt | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/utils/sanitizers/ubsan_blacklist.txt b/utils/sanitizers/ubsan_blacklist.txt
index 69230a3e465..b5bbfddceef 100644
--- a/utils/sanitizers/ubsan_blacklist.txt
+++ b/utils/sanitizers/ubsan_blacklist.txt
@@ -10,8 +10,3 @@ src:*bits/stl_tree.h
 # data() on an empty vector: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=59829
 src:*bits/stl_iterator.h
 src:*bits/stl_vector.h
-
-# These auto-generated functions compile down to ~50k basic blocks with inlining
-# and UBSan enabled, causing long builds that lead to bot timeouts.
-# https://bugs.llvm.org/show_bug.cgi?id=37929
-fun:*AArch64*InstPrinter*printAliasInstr*
-- 
GitLab


From 79135142e40354f64abfc3de065b0499aa3f8349 Mon Sep 17 00:00:00 2001
From: Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>
Date: Fri, 26 Oct 2018 16:22:26 +0000
Subject: [PATCH 0640/1116] [tblgen] Improve comments in
 TargetInstrPredicate.td. NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345399 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Target/TargetInstrPredicate.td | 57 +++++++++++----------
 1 file changed, 29 insertions(+), 28 deletions(-)

diff --git a/include/llvm/Target/TargetInstrPredicate.td b/include/llvm/Target/TargetInstrPredicate.td
index f70af259603..d25309a45ba 100644
--- a/include/llvm/Target/TargetInstrPredicate.td
+++ b/include/llvm/Target/TargetInstrPredicate.td
@@ -7,29 +7,39 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file defines MCInstPredicate classes and its subclasses.
+// This file defines class MCInstPredicate and its subclasses.
 //
-// MCInstPredicate is used to describe constraints on the opcode/operand(s) of
-// an instruction. Each MCInstPredicate class has a well-known semantic, and it
-// is used by a PredicateExpander to generate code for MachineInstr and/or
-// MCInst.
-//
-// MCInstPredicate definitions can be used to construct MCSchedPredicate
-// definitions. An MCSchedPredicate can be used in place of a SchedPredicate
-// when defining SchedReadVariant and SchedWriteVariant used by a processor
-// scheduling model.
+// MCInstPredicate definitions are used by target scheduling models to describe
+// constraints on instructions.
 //
-// Here is an example of MCInstPredicate definition:
+// Here is an example of an MCInstPredicate definition in tablegen:
 //
 // def MCInstPredicateExample : CheckAll<[
 //    CheckOpcode<[BLR]>,
 //    CheckIsRegOperand<0>,
 //    CheckNot<CheckRegOperand<0, LR>>]>;
 //
-// Predicate `MCInstPredicateExample` checks that the machine instruction in
-// input is a BLR, and that operand at index 0 is register `LR`.
+// The syntax for MCInstPredicate is declarative, and predicate definitions can
+// be composed together in order to generate more complex constraints.
+//
+// The `CheckAll` from the example defines a composition of three different
+// predicates.  Definition `MCInstPredicateExample` identifies instructions
+// whose opcode is BLR, and whose first operand is a register different from
+// register `LR`.
+//
+// Every MCInstPredicate class has a well-known semantic in tablegen. For
+// example, `CheckOpcode` is a special type of predicate used to describe a
+// constraint on the value of an instruction opcode.
 //
-// That predicate could be used to rewrite the following definition (from
+// MCInstPredicate definitions are typically used by scheduling models to
+// construct MCSchedPredicate definitions (see the definition of class
+// MCSchedPredicate in llvm/Target/TargetSchedule.td).
+// In particular, an MCSchedPredicate can be used instead of a SchedPredicate
+// when defining the set of SchedReadVariant and SchedWriteVariant of a
+// processor scheduling model.
+//
+// The `MCInstPredicateExample` definition above is equivalent (and therefore
+// could replace) the following definition from the ExynosM3 model (see
 // AArch64SchedExynosM3.td):
 //
 // def M3BranchLinkFastPred  : SchedPredicate<[{
@@ -37,22 +47,13 @@
 //    MI->getOperand(0).isReg() &&
 //    MI->getOperand(0).getReg() != AArch64::LR}]>;
 //
-// MCInstPredicate definitions are used to construct MCSchedPredicate (see the
-// definition of class MCSchedPredicate in llvm/Target/TargetSchedule.td).  An
-// MCSchedPredicate can be used by a `SchedVar` to associate a predicate with a
-// list of SchedReadWrites. Note that `SchedVar` are used to create SchedVariant
-// definitions.
-//
-// Each MCInstPredicate class has a well known semantic. For example,
-// `CheckOpcode` is only used to check the instruction opcode value.
-//
-// MCInstPredicate classes allow the definition of predicates in a declarative
-// way.  These predicates don't require a custom block of C++, and can be used
-// to define conditions on instructions without being bound to a particular
+// The main advantage of using MCInstPredicate instead of SchedPredicate is
+// portability: users don't need to specify predicates in C++. As a consequence
+// of this, MCInstPredicate definitions are not bound to a particular
 // representation (i.e. MachineInstr vs MCInst).
 //
-// It also means that tablegen backends must know how to parse and expand them
-// into code that works on MCInst (or MachineInst).
+// Tablegen backends know how to expand MCInstPredicate definitions into actual
+// C++ code that works on MachineInstr (and/or MCInst).
 //
 // Instances of class PredicateExpander (see utils/Tablegen/PredicateExpander.h)
 // know how to expand a predicate. For each MCInstPredicate class, there must be
-- 
GitLab


From d0ae3cb27e5abdbd98f2bb4384013a54cb658697 Mon Sep 17 00:00:00 2001
From: Wolfgang Pieb <Wolfgang.Pieb@sony.com>
Date: Fri, 26 Oct 2018 17:14:46 +0000
Subject: [PATCH 0641/1116] [DWARF][NFC] cleanup (mostly leftovers from the
 implementation of string offsets tables) Majority of the patch by David
 Blaikie.

Differential Revision: https://reviews.llvm.org/D53741


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345404 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/DebugInfo/DWARF/DWARFUnit.h | 10 ++--
 lib/DebugInfo/DWARF/DWARFUnit.cpp        | 74 ++++++++++--------------
 2 files changed, 35 insertions(+), 49 deletions(-)

diff --git a/include/llvm/DebugInfo/DWARF/DWARFUnit.h b/include/llvm/DebugInfo/DWARF/DWARFUnit.h
index ae0e8cc8db1..c3252157b0b 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFUnit.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFUnit.h
@@ -205,7 +205,7 @@ class DWARFUnit {
   const DWARFSection *AddrOffsetSection;
   uint32_t AddrOffsetSectionBase = 0;
   bool isLittleEndian;
-  bool isDWO;
+  bool IsDWO;
   const DWARFUnitVector &UnitVector;
 
   /// Start, length, and DWARF format of the unit's contribution to the string
@@ -246,16 +246,14 @@ protected:
   /// length and form. The given offset is expected to be derived from the unit
   /// DIE's DW_AT_str_offsets_base attribute.
   Optional<StrOffsetsContributionDescriptor>
-  determineStringOffsetsTableContribution(DWARFDataExtractor &DA,
-                                          uint64_t Offset);
+  determineStringOffsetsTableContribution(DWARFDataExtractor &DA);
 
   /// Find the unit's contribution to the string offsets table and determine its
   /// length and form. The given offset is expected to be 0 in a dwo file or,
   /// in a dwp file, the start of the unit's contribution to the string offsets
   /// table section (as determined by the index table).
   Optional<StrOffsetsContributionDescriptor>
-  determineStringOffsetsTableContributionDWO(DWARFDataExtractor &DA,
-                                             uint64_t Offset);
+  determineStringOffsetsTableContributionDWO(DWARFDataExtractor &DA);
 
 public:
   DWARFUnit(DWARFContext &Context, const DWARFSection &Section,
@@ -267,7 +265,7 @@ public:
 
   virtual ~DWARFUnit();
 
-  bool isDWOUnit() const { return isDWO; }
+  bool isDWOUnit() const { return IsDWO; }
   DWARFContext& getContext() const { return Context; }
   const DWARFSection &getInfoSection() const { return InfoSection; }
   const DWARFSection *getLocSection() const { return LocSection; }
diff --git a/lib/DebugInfo/DWARF/DWARFUnit.cpp b/lib/DebugInfo/DWARF/DWARFUnit.cpp
index 9d75dc94604..d475c44c393 100644
--- a/lib/DebugInfo/DWARF/DWARFUnit.cpp
+++ b/lib/DebugInfo/DWARF/DWARFUnit.cpp
@@ -175,7 +175,7 @@ DWARFUnit::DWARFUnit(DWARFContext &DC, const DWARFSection &Section,
     : Context(DC), InfoSection(Section), Header(Header), Abbrev(DA),
       RangeSection(RS), LocSection(LocSection), LineSection(LS),
       StringSection(SS), StringOffsetSection(SOS), AddrOffsetSection(AOS),
-      isLittleEndian(LE), isDWO(IsDWO), UnitVector(UnitVector) {
+      isLittleEndian(LE), IsDWO(IsDWO), UnitVector(UnitVector) {
   clear();
   // For split DWARF we only need to keep track of the location list section's
   // data (no relocations), and if we are reading a package file, we need to
@@ -197,7 +197,7 @@ DWARFDataExtractor DWARFUnit::getDebugInfoExtractor() const {
 
 Optional<SectionedAddress>
 DWARFUnit::getAddrOffsetSectionItem(uint32_t Index) const {
-  if (isDWO) {
+  if (IsDWO) {
     auto R = Context.info_section_units();
     auto I = R.begin();
     // Surprising if a DWO file has more than one skeleton unit in it - this
@@ -409,7 +409,7 @@ size_t DWARFUnit::extractDIEsIfNeeded(bool CUDieOnly) {
     DWARFDie UnitDie = getUnitDIE();
     if (Optional<uint64_t> DWOId = toUnsigned(UnitDie.find(DW_AT_GNU_dwo_id)))
       Header.setDWOId(*DWOId);
-    if (!isDWO) {
+    if (!IsDWO) {
       assert(AddrOffsetSectionBase == 0);
       assert(RangeSectionBase == 0);
       AddrOffsetSectionBase = toSectionOffset(UnitDie.find(DW_AT_addr_base), 0);
@@ -426,27 +426,19 @@ size_t DWARFUnit::extractDIEsIfNeeded(bool CUDieOnly) {
     // offsets table starting at offset 0 of the debug_str_offsets.dwo section.
     // In both cases we need to determine the format of the contribution,
     // which may differ from the unit's format.
-    uint64_t StringOffsetsContributionBase =
-        isDWO ? 0 : toSectionOffset(UnitDie.find(DW_AT_str_offsets_base), 0);
-    auto IndexEntry = Header.getIndexEntry();
-    if (IndexEntry)
-      if (const auto *C = IndexEntry->getOffset(DW_SECT_STR_OFFSETS))
-        StringOffsetsContributionBase += C->Offset;
-
     DWARFDataExtractor DA(Context.getDWARFObj(), StringOffsetSection,
                           isLittleEndian, 0);
-    if (isDWO)
+    if (IsDWO)
       StringOffsetsTableContribution =
-          determineStringOffsetsTableContributionDWO(
-              DA, StringOffsetsContributionBase);
+          determineStringOffsetsTableContributionDWO(DA);
     else if (getVersion() >= 5)
-      StringOffsetsTableContribution = determineStringOffsetsTableContribution(
-          DA, StringOffsetsContributionBase);
+      StringOffsetsTableContribution =
+          determineStringOffsetsTableContribution(DA);
 
     // DWARF v5 uses the .debug_rnglists and .debug_rnglists.dwo sections to
     // describe address ranges.
     if (getVersion() >= 5) {
-      if (isDWO)
+      if (IsDWO)
         setRangesSection(&Context.getDWARFObj().getRnglistsDWOSection(), 0);
       else
         setRangesSection(&Context.getDWARFObj().getRnglistsSection(),
@@ -466,20 +458,20 @@ size_t DWARFUnit::extractDIEsIfNeeded(bool CUDieOnly) {
 
         // In a split dwarf unit, there is no DW_AT_rnglists_base attribute.
         // Adjust RangeSectionBase to point past the table header.
-        if (isDWO && RngListTable)
+        if (IsDWO && RngListTable)
           RangeSectionBase = RngListTable->getHeaderSize();
       }
     }
 
     // Don't fall back to DW_AT_GNU_ranges_base: it should be ignored for
     // skeleton CU DIE, so that DWARF users not aware of it are not broken.
-  }
+    }
 
   return DieArray.size();
 }
 
 bool DWARFUnit::parseDWO() {
-  if (isDWO)
+  if (IsDWO)
     return false;
   if (DWO.get())
     return false;
@@ -794,7 +786,7 @@ StrOffsetsContributionDescriptor::validateContributionSize(
   if (ValidationSize >= Size)
     if (DA.isValidOffsetForDataOfSize((uint32_t)Base, ValidationSize))
       return *this;
-  return Optional<StrOffsetsContributionDescriptor>();
+  return None;
 }
 
 // Look for a DWARF64-formatted contribution to the string offsets table
@@ -802,18 +794,17 @@ StrOffsetsContributionDescriptor::validateContributionSize(
 static Optional<StrOffsetsContributionDescriptor>
 parseDWARF64StringOffsetsTableHeader(DWARFDataExtractor &DA, uint32_t Offset) {
   if (!DA.isValidOffsetForDataOfSize(Offset, 16))
-    return Optional<StrOffsetsContributionDescriptor>();
+    return None;
 
   if (DA.getU32(&Offset) != 0xffffffff)
-    return Optional<StrOffsetsContributionDescriptor>();
+    return None;
 
   uint64_t Size = DA.getU64(&Offset);
   uint8_t Version = DA.getU16(&Offset);
   (void)DA.getU16(&Offset); // padding
   // The encoded length includes the 2-byte version field and the 2-byte
   // padding, so we need to subtract them out when we populate the descriptor.
-  return StrOffsetsContributionDescriptor(Offset, Size - 4, Version, DWARF64);
-  //return Optional<StrOffsetsContributionDescriptor>(Descriptor);
+  return {{Offset, Size - 4, Version, DWARF64}};
 }
 
 // Look for a DWARF32-formatted contribution to the string offsets table
@@ -821,22 +812,20 @@ parseDWARF64StringOffsetsTableHeader(DWARFDataExtractor &DA, uint32_t Offset) {
 static Optional<StrOffsetsContributionDescriptor>
 parseDWARF32StringOffsetsTableHeader(DWARFDataExtractor &DA, uint32_t Offset) {
   if (!DA.isValidOffsetForDataOfSize(Offset, 8))
-    return Optional<StrOffsetsContributionDescriptor>();
+    return None;
   uint32_t ContributionSize = DA.getU32(&Offset);
   if (ContributionSize >= 0xfffffff0)
-    return Optional<StrOffsetsContributionDescriptor>();
+    return None;
   uint8_t Version = DA.getU16(&Offset);
   (void)DA.getU16(&Offset); // padding
   // The encoded length includes the 2-byte version field and the 2-byte
   // padding, so we need to subtract them out when we populate the descriptor.
-  return StrOffsetsContributionDescriptor(Offset, ContributionSize - 4, Version,
-                                          DWARF32);
-  //return Optional<StrOffsetsContributionDescriptor>(Descriptor);
+  return {{Offset, ContributionSize - 4, Version, DWARF32}};
 }
 
 Optional<StrOffsetsContributionDescriptor>
-DWARFUnit::determineStringOffsetsTableContribution(DWARFDataExtractor &DA,
-                                                   uint64_t Offset) {
+DWARFUnit::determineStringOffsetsTableContribution(DWARFDataExtractor &DA) {
+  auto Offset = toSectionOffset(getUnitDIE().find(DW_AT_str_offsets_base), 0);
   Optional<StrOffsetsContributionDescriptor> Descriptor;
   // Attempt to find a DWARF64 contribution 16 bytes before the base.
   if (Offset >= 16)
@@ -849,8 +838,13 @@ DWARFUnit::determineStringOffsetsTableContribution(DWARFDataExtractor &DA,
 }
 
 Optional<StrOffsetsContributionDescriptor>
-DWARFUnit::determineStringOffsetsTableContributionDWO(DWARFDataExtractor &DA,
-                                                      uint64_t Offset) {
+DWARFUnit::determineStringOffsetsTableContributionDWO(DWARFDataExtractor & DA) {
+  uint64_t Offset = 0;
+  auto IndexEntry = Header.getIndexEntry();
+  const auto *C =
+      IndexEntry ? IndexEntry->getOffset(DW_SECT_STR_OFFSETS) : nullptr;
+  if (C)
+    Offset = C->Offset;
   if (getVersion() >= 5) {
     // Look for a valid contribution at the given offset.
     auto Descriptor =
@@ -862,15 +856,9 @@ DWARFUnit::determineStringOffsetsTableContributionDWO(DWARFDataExtractor &DA,
   // Prior to DWARF v5, we derive the contribution size from the
   // index table (in a package file). In a .dwo file it is simply
   // the length of the string offsets section.
-  uint64_t Size = 0;
-  auto IndexEntry = Header.getIndexEntry();
   if (!IndexEntry)
-    Size = StringOffsetSection.Data.size();
-  else if (const auto *C = IndexEntry->getOffset(DW_SECT_STR_OFFSETS))
-    Size = C->Length;
-  // Return a descriptor with the given offset as base, version 4 and
-  // DWARF32 format.
-  //return Optional<StrOffsetsContributionDescriptor>(
-      //StrOffsetsContributionDescriptor(Offset, Size, 4, DWARF32));
-  return StrOffsetsContributionDescriptor(Offset, Size, 4, DWARF32);
+    return {{0, StringOffsetSection.Data.size(), 4, DWARF32}};
+  if (C)
+    return {{C->Offset, C->Length, 4, DWARF32}};
+  return None;
 }
-- 
GitLab


From 571f8d7f8e9987c32982c584fd8a9f36fa7f1ad7 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Fri, 26 Oct 2018 17:15:52 +0000
Subject: [PATCH 0642/1116] [llvm-ar] Add a dependency to BinaryFormat after
 rL345383

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345405 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-ar/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/llvm-ar/CMakeLists.txt b/tools/llvm-ar/CMakeLists.txt
index 2970a59beee..191c684d524 100644
--- a/tools/llvm-ar/CMakeLists.txt
+++ b/tools/llvm-ar/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(LLVM_LINK_COMPONENTS
   ${LLVM_TARGETS_TO_BUILD}
+  BinaryFormat
   Core
   DlltoolDriver
   LibDriver
-- 
GitLab


From 39e6c05285c6a7a1e35d9325fdb4778226ab933e Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Fri, 26 Oct 2018 17:21:19 +0000
Subject: [PATCH 0643/1116] [X86] Add -LABEL to some FileCheck checks. NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345407 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Analysis/CostModel/X86/testshiftashr.ll | 160 +++++++++----------
 test/Analysis/CostModel/X86/testshiftlshr.ll | 160 +++++++++----------
 test/Analysis/CostModel/X86/testshiftshl.ll  | 160 +++++++++----------
 3 files changed, 240 insertions(+), 240 deletions(-)

diff --git a/test/Analysis/CostModel/X86/testshiftashr.ll b/test/Analysis/CostModel/X86/testshiftashr.ll
index 13f2bd2019d..864ea2e5559 100644
--- a/test/Analysis/CostModel/X86/testshiftashr.ll
+++ b/test/Analysis/CostModel/X86/testshiftashr.ll
@@ -4,9 +4,9 @@
 %shifttype = type <2 x i16>
 define %shifttype @shift2i16(%shifttype %a, %shifttype %b) {
 entry:
-  ; SSE2: shift2i16
+  ; SSE2-LABEL: shift2i16
   ; SSE2: cost of 12 {{.*}} ashr
-  ; SSE2-CODEGEN: shift2i16
+  ; SSE2-CODEGEN-LABEL: shift2i16
   ; SSE2-CODEGEN: psrlq
 
   %0 = ashr %shifttype %a , %b
@@ -16,9 +16,9 @@ entry:
 %shifttype4i16 = type <4 x i16>
 define %shifttype4i16 @shift4i16(%shifttype4i16 %a, %shifttype4i16 %b) {
 entry:
-  ; SSE2: shift4i16
+  ; SSE2-LABEL: shift4i16
   ; SSE2: cost of 16 {{.*}} ashr
-  ; SSE2-CODEGEN: shift4i16
+  ; SSE2-CODEGEN-LABEL: shift4i16
   ; SSE2-CODEGEN: psrad
 
   %0 = ashr %shifttype4i16 %a , %b
@@ -28,9 +28,9 @@ entry:
 %shifttype8i16 = type <8 x i16>
 define %shifttype8i16 @shift8i16(%shifttype8i16 %a, %shifttype8i16 %b) {
 entry:
-  ; SSE2: shift8i16
+  ; SSE2-LABEL: shift8i16
   ; SSE2: cost of 32 {{.*}} ashr
-  ; SSE2-CODEGEN: shift8i16
+  ; SSE2-CODEGEN-LABEL: shift8i16
   ; SSE2-CODEGEN: psraw
 
   %0 = ashr %shifttype8i16 %a , %b
@@ -40,9 +40,9 @@ entry:
 %shifttype16i16 = type <16 x i16>
 define %shifttype16i16 @shift16i16(%shifttype16i16 %a, %shifttype16i16 %b) {
 entry:
-  ; SSE2: shift16i16
+  ; SSE2-LABEL: shift16i16
   ; SSE2: cost of 64 {{.*}} ashr
-  ; SSE2-CODEGEN: shift16i16
+  ; SSE2-CODEGEN-LABEL: shift16i16
   ; SSE2-CODEGEN: psraw
 
   %0 = ashr %shifttype16i16 %a , %b
@@ -52,9 +52,9 @@ entry:
 %shifttype32i16 = type <32 x i16>
 define %shifttype32i16 @shift32i16(%shifttype32i16 %a, %shifttype32i16 %b) {
 entry:
-  ; SSE2: shift32i16
+  ; SSE2-LABEL: shift32i16
   ; SSE2: cost of 128 {{.*}} ashr
-  ; SSE2-CODEGEN: shift32i16
+  ; SSE2-CODEGEN-LABEL: shift32i16
   ; SSE2-CODEGEN: psraw
 
   %0 = ashr %shifttype32i16 %a , %b
@@ -64,9 +64,9 @@ entry:
 %shifttype2i32 = type <2 x i32>
 define %shifttype2i32 @shift2i32(%shifttype2i32 %a, %shifttype2i32 %b) {
 entry:
-  ; SSE2: shift2i32
+  ; SSE2-LABEL: shift2i32
   ; SSE2: cost of 12 {{.*}} ashr
-  ; SSE2-CODEGEN: shift2i32
+  ; SSE2-CODEGEN-LABEL: shift2i32
   ; SSE2-CODEGEN: psrlq
 
   %0 = ashr %shifttype2i32 %a , %b
@@ -76,9 +76,9 @@ entry:
 %shifttype4i32 = type <4 x i32>
 define %shifttype4i32 @shift4i32(%shifttype4i32 %a, %shifttype4i32 %b) {
 entry:
-  ; SSE2: shift4i32
+  ; SSE2-LABEL: shift4i32
   ; SSE2: cost of 16 {{.*}} ashr
-  ; SSE2-CODEGEN: shift4i32
+  ; SSE2-CODEGEN-LABEL: shift4i32
   ; SSE2-CODEGEN: psrad
 
   %0 = ashr %shifttype4i32 %a , %b
@@ -88,9 +88,9 @@ entry:
 %shifttype8i32 = type <8 x i32>
 define %shifttype8i32 @shift8i32(%shifttype8i32 %a, %shifttype8i32 %b) {
 entry:
-  ; SSE2: shift8i32
+  ; SSE2-LABEL: shift8i32
   ; SSE2: cost of 32 {{.*}} ashr
-  ; SSE2-CODEGEN: shift8i32
+  ; SSE2-CODEGEN-LABEL: shift8i32
   ; SSE2-CODEGEN: psrad
 
   %0 = ashr %shifttype8i32 %a , %b
@@ -100,9 +100,9 @@ entry:
 %shifttype16i32 = type <16 x i32>
 define %shifttype16i32 @shift16i32(%shifttype16i32 %a, %shifttype16i32 %b) {
 entry:
-  ; SSE2: shift16i32
+  ; SSE2-LABEL: shift16i32
   ; SSE2: cost of 64 {{.*}} ashr
-  ; SSE2-CODEGEN: shift16i32
+  ; SSE2-CODEGEN-LABEL: shift16i32
   ; SSE2-CODEGEN: psrad
 
   %0 = ashr %shifttype16i32 %a , %b
@@ -112,9 +112,9 @@ entry:
 %shifttype32i32 = type <32 x i32>
 define %shifttype32i32 @shift32i32(%shifttype32i32 %a, %shifttype32i32 %b) {
 entry:
-  ; SSE2: shift32i32
+  ; SSE2-LABEL: shift32i32
   ; SSE2: cost of 128 {{.*}} ashr
-  ; SSE2-CODEGEN: shift32i32
+  ; SSE2-CODEGEN-LABEL: shift32i32
   ; SSE2-CODEGEN: psrad
 
   %0 = ashr %shifttype32i32 %a , %b
@@ -124,9 +124,9 @@ entry:
 %shifttype2i64 = type <2 x i64>
 define %shifttype2i64 @shift2i64(%shifttype2i64 %a, %shifttype2i64 %b) {
 entry:
-  ; SSE2: shift2i64
+  ; SSE2-LABEL: shift2i64
   ; SSE2: cost of 12 {{.*}} ashr
-  ; SSE2-CODEGEN: shift2i64
+  ; SSE2-CODEGEN-LABEL: shift2i64
   ; SSE2-CODEGEN: psrlq
 
   %0 = ashr %shifttype2i64 %a , %b
@@ -136,9 +136,9 @@ entry:
 %shifttype4i64 = type <4 x i64>
 define %shifttype4i64 @shift4i64(%shifttype4i64 %a, %shifttype4i64 %b) {
 entry:
-  ; SSE2: shift4i64
+  ; SSE2-LABEL: shift4i64
   ; SSE2: cost of 24 {{.*}} ashr
-  ; SSE2-CODEGEN: shift4i64
+  ; SSE2-CODEGEN-LABEL: shift4i64
   ; SSE2-CODEGEN: psrlq
 
   %0 = ashr %shifttype4i64 %a , %b
@@ -148,9 +148,9 @@ entry:
 %shifttype8i64 = type <8 x i64>
 define %shifttype8i64 @shift8i64(%shifttype8i64 %a, %shifttype8i64 %b) {
 entry:
-  ; SSE2: shift8i64
+  ; SSE2-LABEL: shift8i64
   ; SSE2: cost of 48 {{.*}} ashr
-  ; SSE2-CODEGEN: shift8i64
+  ; SSE2-CODEGEN-LABEL: shift8i64
   ; SSE2-CODEGEN: psrlq
 
   %0 = ashr %shifttype8i64 %a , %b
@@ -160,9 +160,9 @@ entry:
 %shifttype16i64 = type <16 x i64>
 define %shifttype16i64 @shift16i64(%shifttype16i64 %a, %shifttype16i64 %b) {
 entry:
-  ; SSE2: shift16i64
+  ; SSE2-LABEL: shift16i64
   ; SSE2: cost of 96 {{.*}} ashr
-  ; SSE2-CODEGEN: shift16i64
+  ; SSE2-CODEGEN-LABEL: shift16i64
   ; SSE2-CODEGEN: psrlq
 
   %0 = ashr %shifttype16i64 %a , %b
@@ -172,9 +172,9 @@ entry:
 %shifttype32i64 = type <32 x i64>
 define %shifttype32i64 @shift32i64(%shifttype32i64 %a, %shifttype32i64 %b) {
 entry:
-  ; SSE2: shift32i64
+  ; SSE2-LABEL: shift32i64
   ; SSE2: cost of 192 {{.*}} ashr
-  ; SSE2-CODEGEN: shift32i64
+  ; SSE2-CODEGEN-LABEL: shift32i64
   ; SSE2-CODEGEN: psrlq
 
   %0 = ashr %shifttype32i64 %a , %b
@@ -184,9 +184,9 @@ entry:
 %shifttype2i8 = type <2 x i8>
 define %shifttype2i8 @shift2i8(%shifttype2i8 %a, %shifttype2i8 %b) {
 entry:
-  ; SSE2: shift2i8
+  ; SSE2-LABEL: shift2i8
   ; SSE2: cost of 12 {{.*}} ashr
-  ; SSE2-CODEGEN: shift2i8
+  ; SSE2-CODEGEN-LABEL: shift2i8
   ; SSE2-CODEGEN: psrlq
 
   %0 = ashr %shifttype2i8 %a , %b
@@ -196,9 +196,9 @@ entry:
 %shifttype4i8 = type <4 x i8>
 define %shifttype4i8 @shift4i8(%shifttype4i8 %a, %shifttype4i8 %b) {
 entry:
-  ; SSE2: shift4i8
+  ; SSE2-LABEL: shift4i8
   ; SSE2: cost of 16 {{.*}} ashr
-  ; SSE2-CODEGEN: shift4i8
+  ; SSE2-CODEGEN-LABEL: shift4i8
   ; SSE2-CODEGEN: psrad
 
   %0 = ashr %shifttype4i8 %a , %b
@@ -208,9 +208,9 @@ entry:
 %shifttype8i8 = type <8 x i8>
 define %shifttype8i8 @shift8i8(%shifttype8i8 %a, %shifttype8i8 %b) {
 entry:
-  ; SSE2: shift8i8
+  ; SSE2-LABEL: shift8i8
   ; SSE2: cost of 32 {{.*}} ashr
-  ; SSE2-CODEGEN: shift8i8
+  ; SSE2-CODEGEN-LABEL: shift8i8
   ; SSE2-CODEGEN: psraw
 
   %0 = ashr %shifttype8i8 %a , %b
@@ -220,9 +220,9 @@ entry:
 %shifttype16i8 = type <16 x i8>
 define %shifttype16i8 @shift16i8(%shifttype16i8 %a, %shifttype16i8 %b) {
 entry:
-  ; SSE2: shift16i8
+  ; SSE2-LABEL: shift16i8
   ; SSE2: cost of 54 {{.*}} ashr
-  ; SSE2-CODEGEN: shift16i8
+  ; SSE2-CODEGEN-LABEL: shift16i8
   ; SSE2-CODEGEN: psraw
 
   %0 = ashr %shifttype16i8 %a , %b
@@ -232,9 +232,9 @@ entry:
 %shifttype32i8 = type <32 x i8>
 define %shifttype32i8 @shift32i8(%shifttype32i8 %a, %shifttype32i8 %b) {
 entry:
-  ; SSE2: shift32i8
+  ; SSE2-LABEL: shift32i8
   ; SSE2: cost of 108 {{.*}} ashr
-  ; SSE2-CODEGEN: shift32i8
+  ; SSE2-CODEGEN-LABEL: shift32i8
   ; SSE2-CODEGEN: psraw
 
   %0 = ashr %shifttype32i8 %a , %b
@@ -246,9 +246,9 @@ entry:
 %shifttypec = type <2 x i16>
 define %shifttypec @shift2i16const(%shifttypec %a, %shifttypec %b) {
 entry:
-  ; SSE2: shift2i16const
+  ; SSE2-LABEL: shift2i16const
   ; SSE2: cost of 4 {{.*}} ashr
-  ; SSE2-CODEGEN: shift2i16const
+  ; SSE2-CODEGEN-LABEL: shift2i16const
   ; SSE2-CODEGEN: psrad $3
 
   %0 = ashr %shifttypec %a , <i16 3, i16 3>
@@ -258,9 +258,9 @@ entry:
 %shifttypec4i16 = type <4 x i16>
 define %shifttypec4i16 @shift4i16const(%shifttypec4i16 %a, %shifttypec4i16 %b) {
 entry:
-  ; SSE2: shift4i16const
+  ; SSE2-LABEL: shift4i16const
   ; SSE2: cost of 1 {{.*}} ashr
-  ; SSE2-CODEGEN: shift4i16const
+  ; SSE2-CODEGEN-LABEL: shift4i16const
   ; SSE2-CODEGEN: psrad $3
 
   %0 = ashr %shifttypec4i16 %a , <i16 3, i16 3, i16 3, i16 3>
@@ -270,9 +270,9 @@ entry:
 %shifttypec8i16 = type <8 x i16>
 define %shifttypec8i16 @shift8i16const(%shifttypec8i16 %a, %shifttypec8i16 %b) {
 entry:
-  ; SSE2: shift8i16const
+  ; SSE2-LABEL: shift8i16const
   ; SSE2: cost of 1 {{.*}} ashr
-  ; SSE2-CODEGEN: shift8i16const
+  ; SSE2-CODEGEN-LABEL: shift8i16const
   ; SSE2-CODEGEN: psraw $3
 
   %0 = ashr %shifttypec8i16 %a , <i16 3, i16 3, i16 3, i16 3,
@@ -284,9 +284,9 @@ entry:
 define %shifttypec16i16 @shift16i16const(%shifttypec16i16 %a,
                                          %shifttypec16i16 %b) {
 entry:
-  ; SSE2: shift16i16const
+  ; SSE2-LABEL: shift16i16const
   ; SSE2: cost of 2 {{.*}} ashr
-  ; SSE2-CODEGEN: shift16i16const
+  ; SSE2-CODEGEN-LABEL: shift16i16const
   ; SSE2-CODEGEN: psraw $3
 
   %0 = ashr %shifttypec16i16 %a , <i16 3, i16 3, i16 3, i16 3,
@@ -300,9 +300,9 @@ entry:
 define %shifttypec32i16 @shift32i16const(%shifttypec32i16 %a,
                                         %shifttypec32i16 %b) {
 entry:
-  ; SSE2: shift32i16const
+  ; SSE2-LABEL: shift32i16const
   ; SSE2: cost of 4 {{.*}} ashr
-  ; SSE2-CODEGEN: shift32i16const
+  ; SSE2-CODEGEN-LABEL: shift32i16const
   ; SSE2-CODEGEN: psraw $3
 
   %0 = ashr %shifttypec32i16 %a , <i16 3, i16 3, i16 3, i16 3,
@@ -319,9 +319,9 @@ entry:
 %shifttypec2i32 = type <2 x i32>
 define %shifttypec2i32 @shift2i32c(%shifttypec2i32 %a, %shifttypec2i32 %b) {
 entry:
-  ; SSE2: shift2i32c
+  ; SSE2-LABEL: shift2i32c
   ; SSE2: cost of 4 {{.*}} ashr
-  ; SSE2-CODEGEN: shift2i32c
+  ; SSE2-CODEGEN-LABEL: shift2i32c
   ; SSE2-CODEGEN: psrad $3
 
   %0 = ashr %shifttypec2i32 %a , <i32 3, i32 3>
@@ -331,9 +331,9 @@ entry:
 %shifttypec4i32 = type <4 x i32>
 define %shifttypec4i32 @shift4i32c(%shifttypec4i32 %a, %shifttypec4i32 %b) {
 entry:
-  ; SSE2: shift4i32c
+  ; SSE2-LABEL: shift4i32c
   ; SSE2: cost of 1 {{.*}} ashr
-  ; SSE2-CODEGEN: shift4i32c
+  ; SSE2-CODEGEN-LABEL: shift4i32c
   ; SSE2-CODEGEN: psrad $3
 
   %0 = ashr %shifttypec4i32 %a , <i32 3, i32 3, i32 3, i32 3>
@@ -343,9 +343,9 @@ entry:
 %shifttypec8i32 = type <8 x i32>
 define %shifttypec8i32 @shift8i32c(%shifttypec8i32 %a, %shifttypec8i32 %b) {
 entry:
-  ; SSE2: shift8i32c
+  ; SSE2-LABEL: shift8i32c
   ; SSE2: cost of 2 {{.*}} ashr
-  ; SSE2-CODEGEN: shift8i32c
+  ; SSE2-CODEGEN-LABEL: shift8i32c
   ; SSE2-CODEGEN: psrad $3
 
   %0 = ashr %shifttypec8i32 %a , <i32 3, i32 3, i32 3, i32 3,
@@ -356,9 +356,9 @@ entry:
 %shifttypec16i32 = type <16 x i32>
 define %shifttypec16i32 @shift16i32c(%shifttypec16i32 %a, %shifttypec16i32 %b) {
 entry:
-  ; SSE2: shift16i32c
+  ; SSE2-LABEL: shift16i32c
   ; SSE2: cost of 4 {{.*}} ashr
-  ; SSE2-CODEGEN: shift16i32c
+  ; SSE2-CODEGEN-LABEL: shift16i32c
   ; SSE2-CODEGEN: psrad $3
 
   %0 = ashr %shifttypec16i32 %a , <i32 3, i32 3, i32 3, i32 3,
@@ -371,10 +371,10 @@ entry:
 %shifttypec32i32 = type <32 x i32>
 define %shifttypec32i32 @shift32i32c(%shifttypec32i32 %a, %shifttypec32i32 %b) {
 entry:
-  ; SSE2: shift32i32c
+  ; SSE2-LABEL: shift32i32c
   ; getTypeConversion fails here and promotes this to a i64.
   ; SSE2: cost of 8 {{.*}} ashr
-  ; SSE2-CODEGEN: shift32i32c
+  ; SSE2-CODEGEN-LABEL: shift32i32c
   ; SSE2-CODEGEN: psrad $3
   %0 = ashr %shifttypec32i32 %a , <i32 3, i32 3, i32 3, i32 3,
                                    i32 3, i32 3, i32 3, i32 3,
@@ -390,9 +390,9 @@ entry:
 %shifttypec2i64 = type <2 x i64>
 define %shifttypec2i64 @shift2i64c(%shifttypec2i64 %a, %shifttypec2i64 %b) {
 entry:
-  ; SSE2: shift2i64c
+  ; SSE2-LABEL: shift2i64c
   ; SSE2: cost of 4 {{.*}} ashr
-  ; SSE2-CODEGEN: shift2i64c
+  ; SSE2-CODEGEN-LABEL: shift2i64c
   ; SSE2-CODEGEN: psrad $3
 
   %0 = ashr %shifttypec2i64 %a , <i64 3, i64 3>
@@ -402,9 +402,9 @@ entry:
 %shifttypec4i64 = type <4 x i64>
 define %shifttypec4i64 @shift4i64c(%shifttypec4i64 %a, %shifttypec4i64 %b) {
 entry:
-  ; SSE2: shift4i64c
+  ; SSE2-LABEL: shift4i64c
   ; SSE2: cost of 8 {{.*}} ashr
-  ; SSE2-CODEGEN: shift4i64c
+  ; SSE2-CODEGEN-LABEL: shift4i64c
   ; SSE2-CODEGEN: psrad $3
 
   %0 = ashr %shifttypec4i64 %a , <i64 3, i64 3, i64 3, i64 3>
@@ -414,9 +414,9 @@ entry:
 %shifttypec8i64 = type <8 x i64>
 define %shifttypec8i64 @shift8i64c(%shifttypec8i64 %a, %shifttypec8i64 %b) {
 entry:
-  ; SSE2: shift8i64c
+  ; SSE2-LABEL: shift8i64c
   ; SSE2: cost of 16 {{.*}} ashr
-  ; SSE2-CODEGEN: shift8i64c
+  ; SSE2-CODEGEN-LABEL: shift8i64c
   ; SSE2-CODEGEN: psrad $3
 
  %0 = ashr %shifttypec8i64 %a , <i64 3, i64 3, i64 3, i64 3,
@@ -427,9 +427,9 @@ entry:
 %shifttypec16i64 = type <16 x i64>
 define %shifttypec16i64 @shift16i64c(%shifttypec16i64 %a, %shifttypec16i64 %b) {
 entry:
-  ; SSE2: shift16i64c
+  ; SSE2-LABEL: shift16i64c
   ; SSE2: cost of 32 {{.*}} ashr
-  ; SSE2-CODEGEN: shift16i64c
+  ; SSE2-CODEGEN-LABEL: shift16i64c
   ; SSE2-CODEGEN: psrad $3
 
   %0 = ashr %shifttypec16i64 %a , <i64 3, i64 3, i64 3, i64 3,
@@ -442,9 +442,9 @@ entry:
 %shifttypec32i64 = type <32 x i64>
 define %shifttypec32i64 @shift32i64c(%shifttypec32i64 %a, %shifttypec32i64 %b) {
 entry:
-  ; SSE2: shift32i64c
+  ; SSE2-LABEL: shift32i64c
   ; SSE2: cost of 64 {{.*}} ashr
-  ; SSE2-CODEGEN: shift32i64c
+  ; SSE2-CODEGEN-LABEL: shift32i64c
   ; SSE2-CODEGEN: psrad $3
 
   %0 = ashr %shifttypec32i64 %a ,<i64 3, i64 3, i64 3, i64 3,
@@ -461,9 +461,9 @@ entry:
 %shifttypec2i8 = type <2 x i8>
 define %shifttypec2i8 @shift2i8c(%shifttypec2i8 %a, %shifttypec2i8 %b) {
 entry:
-  ; SSE2: shift2i8c
+  ; SSE2-LABEL: shift2i8c
   ; SSE2: cost of 4 {{.*}} ashr
-  ; SSE2-CODEGEN: shift2i8c
+  ; SSE2-CODEGEN-LABEL: shift2i8c
   ; SSE2-CODEGEN: psrad $3
 
   %0 = ashr %shifttypec2i8 %a , <i8 3, i8 3>
@@ -473,9 +473,9 @@ entry:
 %shifttypec4i8 = type <4 x i8>
 define %shifttypec4i8 @shift4i8c(%shifttypec4i8 %a, %shifttypec4i8 %b) {
 entry:
-  ; SSE2: shift4i8c
+  ; SSE2-LABEL: shift4i8c
   ; SSE2: cost of 1 {{.*}} ashr
-  ; SSE2-CODEGEN: shift4i8c
+  ; SSE2-CODEGEN-LABEL: shift4i8c
   ; SSE2-CODEGEN: psrad $3
 
   %0 = ashr %shifttypec4i8 %a , <i8 3, i8 3, i8 3, i8 3>
@@ -485,9 +485,9 @@ entry:
 %shifttypec8i8 = type <8 x i8>
 define %shifttypec8i8 @shift8i8c(%shifttypec8i8 %a, %shifttypec8i8 %b) {
 entry:
-  ; SSE2: shift8i8c
+  ; SSE2-LABEL: shift8i8c
   ; SSE2: cost of 1 {{.*}} ashr
-  ; SSE2-CODEGEN: shift8i8c
+  ; SSE2-CODEGEN-LABEL: shift8i8c
   ; SSE2-CODEGEN: psraw $3
 
   %0 = ashr %shifttypec8i8 %a , <i8 3, i8 3, i8 3, i8 3,
@@ -498,9 +498,9 @@ entry:
 %shifttypec16i8 = type <16 x i8>
 define %shifttypec16i8 @shift16i8c(%shifttypec16i8 %a, %shifttypec16i8 %b) {
 entry:
-  ; SSE2: shift16i8c
+  ; SSE2-LABEL: shift16i8c
   ; SSE2: cost of 4 {{.*}} ashr
-  ; SSE2-CODEGEN: shift16i8c
+  ; SSE2-CODEGEN-LABEL: shift16i8c
   ; SSE2-CODEGEN: psrlw $3
 
   %0 = ashr %shifttypec16i8 %a , <i8 3, i8 3, i8 3, i8 3,
@@ -513,9 +513,9 @@ entry:
 %shifttypec32i8 = type <32 x i8>
 define %shifttypec32i8 @shift32i8c(%shifttypec32i8 %a, %shifttypec32i8 %b) {
 entry:
-  ; SSE2: shift32i8c
+  ; SSE2-LABEL: shift32i8c
   ; SSE2: cost of 8 {{.*}} ashr
-  ; SSE2-CODEGEN: shift32i8c
+  ; SSE2-CODEGEN-LABEL: shift32i8c
   ; SSE2-CODEGEN: psrlw $3
 
   %0 = ashr %shifttypec32i8 %a , <i8 3, i8 3, i8 3, i8 3,
diff --git a/test/Analysis/CostModel/X86/testshiftlshr.ll b/test/Analysis/CostModel/X86/testshiftlshr.ll
index e5fff9b5e4d..3e30614e185 100644
--- a/test/Analysis/CostModel/X86/testshiftlshr.ll
+++ b/test/Analysis/CostModel/X86/testshiftlshr.ll
@@ -4,9 +4,9 @@
 %shifttype = type <2 x i16>
 define %shifttype @shift2i16(%shifttype %a, %shifttype %b) {
 entry:
-  ; SSE2: shift2i16
+  ; SSE2-LABEL: shift2i16
   ; SSE2: cost of 4 {{.*}} lshr
-  ; SSE2-CODEGEN: shift2i16
+  ; SSE2-CODEGEN-LABEL: shift2i16
   ; SSE2-CODEGEN: psrlq
 
   %0 = lshr %shifttype %a , %b
@@ -16,9 +16,9 @@ entry:
 %shifttype4i16 = type <4 x i16>
 define %shifttype4i16 @shift4i16(%shifttype4i16 %a, %shifttype4i16 %b) {
 entry:
-  ; SSE2: shift4i16
+  ; SSE2-LABEL: shift4i16
   ; SSE2: cost of 16 {{.*}} lshr
-  ; SSE2-CODEGEN: shift4i16
+  ; SSE2-CODEGEN-LABEL: shift4i16
   ; SSE2-CODEGEN: psrld
 
   %0 = lshr %shifttype4i16 %a , %b
@@ -28,9 +28,9 @@ entry:
 %shifttype8i16 = type <8 x i16>
 define %shifttype8i16 @shift8i16(%shifttype8i16 %a, %shifttype8i16 %b) {
 entry:
-  ; SSE2: shift8i16
+  ; SSE2-LABEL: shift8i16
   ; SSE2: cost of 32 {{.*}} lshr
-  ; SSE2-CODEGEN: shift8i16
+  ; SSE2-CODEGEN-LABEL: shift8i16
   ; SSE2-CODEGEN: psrlw
 
   %0 = lshr %shifttype8i16 %a , %b
@@ -40,9 +40,9 @@ entry:
 %shifttype16i16 = type <16 x i16>
 define %shifttype16i16 @shift16i16(%shifttype16i16 %a, %shifttype16i16 %b) {
 entry:
-  ; SSE2: shift16i16
+  ; SSE2-LABEL: shift16i16
   ; SSE2: cost of 64 {{.*}} lshr
-  ; SSE2-CODEGEN: shift16i16
+  ; SSE2-CODEGEN-LABEL: shift16i16
   ; SSE2-CODEGEN: psrlw
 
   %0 = lshr %shifttype16i16 %a , %b
@@ -52,9 +52,9 @@ entry:
 %shifttype32i16 = type <32 x i16>
 define %shifttype32i16 @shift32i16(%shifttype32i16 %a, %shifttype32i16 %b) {
 entry:
-  ; SSE2: shift32i16
+  ; SSE2-LABEL: shift32i16
   ; SSE2: cost of 128 {{.*}} lshr
-  ; SSE2-CODEGEN: shift32i16
+  ; SSE2-CODEGEN-LABEL: shift32i16
   ; SSE2-CODEGEN: psrlw
 
   %0 = lshr %shifttype32i16 %a , %b
@@ -64,9 +64,9 @@ entry:
 %shifttype2i32 = type <2 x i32>
 define %shifttype2i32 @shift2i32(%shifttype2i32 %a, %shifttype2i32 %b) {
 entry:
-  ; SSE2: shift2i32
+  ; SSE2-LABEL: shift2i32
   ; SSE2: cost of 4 {{.*}} lshr
-  ; SSE2-CODEGEN: shift2i32
+  ; SSE2-CODEGEN-LABEL: shift2i32
   ; SSE2-CODEGEN: psrlq
 
   %0 = lshr %shifttype2i32 %a , %b
@@ -76,9 +76,9 @@ entry:
 %shifttype4i32 = type <4 x i32>
 define %shifttype4i32 @shift4i32(%shifttype4i32 %a, %shifttype4i32 %b) {
 entry:
-  ; SSE2: shift4i32
+  ; SSE2-LABEL: shift4i32
   ; SSE2: cost of 16 {{.*}} lshr
-  ; SSE2-CODEGEN: shift4i32
+  ; SSE2-CODEGEN-LABEL: shift4i32
   ; SSE2-CODEGEN: psrld
 
   %0 = lshr %shifttype4i32 %a , %b
@@ -88,9 +88,9 @@ entry:
 %shifttype8i32 = type <8 x i32>
 define %shifttype8i32 @shift8i32(%shifttype8i32 %a, %shifttype8i32 %b) {
 entry:
-  ; SSE2: shift8i32
+  ; SSE2-LABEL: shift8i32
   ; SSE2: cost of 32 {{.*}} lshr
-  ; SSE2-CODEGEN: shift8i32
+  ; SSE2-CODEGEN-LABEL: shift8i32
   ; SSE2-CODEGEN: psrld
 
   %0 = lshr %shifttype8i32 %a , %b
@@ -100,9 +100,9 @@ entry:
 %shifttype16i32 = type <16 x i32>
 define %shifttype16i32 @shift16i32(%shifttype16i32 %a, %shifttype16i32 %b) {
 entry:
-  ; SSE2: shift16i32
+  ; SSE2-LABEL: shift16i32
   ; SSE2: cost of 64 {{.*}} lshr
-  ; SSE2-CODEGEN: shift16i32
+  ; SSE2-CODEGEN-LABEL: shift16i32
   ; SSE2-CODEGEN: psrld
 
   %0 = lshr %shifttype16i32 %a , %b
@@ -112,9 +112,9 @@ entry:
 %shifttype32i32 = type <32 x i32>
 define %shifttype32i32 @shift32i32(%shifttype32i32 %a, %shifttype32i32 %b) {
 entry:
-  ; SSE2: shift32i32
+  ; SSE2-LABEL: shift32i32
   ; SSE2: cost of 128 {{.*}} lshr
-  ; SSE2-CODEGEN: shift32i32
+  ; SSE2-CODEGEN-LABEL: shift32i32
   ; SSE2-CODEGEN: psrld
 
   %0 = lshr %shifttype32i32 %a , %b
@@ -124,9 +124,9 @@ entry:
 %shifttype2i64 = type <2 x i64>
 define %shifttype2i64 @shift2i64(%shifttype2i64 %a, %shifttype2i64 %b) {
 entry:
-  ; SSE2: shift2i64
+  ; SSE2-LABEL: shift2i64
   ; SSE2: cost of 4 {{.*}} lshr
-  ; SSE2-CODEGEN: shift2i64
+  ; SSE2-CODEGEN-LABEL: shift2i64
   ; SSE2-CODEGEN: psrlq
 
   %0 = lshr %shifttype2i64 %a , %b
@@ -136,9 +136,9 @@ entry:
 %shifttype4i64 = type <4 x i64>
 define %shifttype4i64 @shift4i64(%shifttype4i64 %a, %shifttype4i64 %b) {
 entry:
-  ; SSE2: shift4i64
+  ; SSE2-LABEL: shift4i64
   ; SSE2: cost of 8 {{.*}} lshr
-  ; SSE2-CODEGEN: shift4i64
+  ; SSE2-CODEGEN-LABEL: shift4i64
   ; SSE2-CODEGEN: psrlq
 
   %0 = lshr %shifttype4i64 %a , %b
@@ -148,9 +148,9 @@ entry:
 %shifttype8i64 = type <8 x i64>
 define %shifttype8i64 @shift8i64(%shifttype8i64 %a, %shifttype8i64 %b) {
 entry:
-  ; SSE2: shift8i64
+  ; SSE2-LABEL: shift8i64
   ; SSE2: cost of 16 {{.*}} lshr
-  ; SSE2-CODEGEN: shift8i64
+  ; SSE2-CODEGEN-LABEL: shift8i64
   ; SSE2-CODEGEN: psrlq
 
   %0 = lshr %shifttype8i64 %a , %b
@@ -160,9 +160,9 @@ entry:
 %shifttype16i64 = type <16 x i64>
 define %shifttype16i64 @shift16i64(%shifttype16i64 %a, %shifttype16i64 %b) {
 entry:
-  ; SSE2: shift16i64
+  ; SSE2-LABEL: shift16i64
   ; SSE2: cost of 32 {{.*}} lshr
-  ; SSE2-CODEGEN: shift16i64
+  ; SSE2-CODEGEN-LABEL: shift16i64
   ; SSE2-CODEGEN: psrlq
 
   %0 = lshr %shifttype16i64 %a , %b
@@ -172,9 +172,9 @@ entry:
 %shifttype32i64 = type <32 x i64>
 define %shifttype32i64 @shift32i64(%shifttype32i64 %a, %shifttype32i64 %b) {
 entry:
-  ; SSE2: shift32i64
+  ; SSE2-LABEL: shift32i64
   ; SSE2: cost of 64 {{.*}} lshr
-  ; SSE2-CODEGEN: shift32i64
+  ; SSE2-CODEGEN-LABEL: shift32i64
   ; SSE2-CODEGEN: psrlq
 
   %0 = lshr %shifttype32i64 %a , %b
@@ -184,9 +184,9 @@ entry:
 %shifttype2i8 = type <2 x i8>
 define %shifttype2i8 @shift2i8(%shifttype2i8 %a, %shifttype2i8 %b) {
 entry:
-  ; SSE2: shift2i8
+  ; SSE2-LABEL: shift2i8
   ; SSE2: cost of 4 {{.*}} lshr
-  ; SSE2-CODEGEN: shift2i8
+  ; SSE2-CODEGEN-LABEL: shift2i8
   ; SSE2-CODEGEN: psrlq
 
   %0 = lshr %shifttype2i8 %a , %b
@@ -196,9 +196,9 @@ entry:
 %shifttype4i8 = type <4 x i8>
 define %shifttype4i8 @shift4i8(%shifttype4i8 %a, %shifttype4i8 %b) {
 entry:
-  ; SSE2: shift4i8
+  ; SSE2-LABEL: shift4i8
   ; SSE2: cost of 16 {{.*}} lshr
-  ; SSE2-CODEGEN: shift4i8
+  ; SSE2-CODEGEN-LABEL: shift4i8
   ; SSE2-CODEGEN: psrld
 
   %0 = lshr %shifttype4i8 %a , %b
@@ -208,9 +208,9 @@ entry:
 %shifttype8i8 = type <8 x i8>
 define %shifttype8i8 @shift8i8(%shifttype8i8 %a, %shifttype8i8 %b) {
 entry:
-  ; SSE2: shift8i8
+  ; SSE2-LABEL: shift8i8
   ; SSE2: cost of 32 {{.*}} lshr
-  ; SSE2-CODEGEN: shift8i8
+  ; SSE2-CODEGEN-LABEL: shift8i8
   ; SSE2-CODEGEN: psrlw
 
   %0 = lshr %shifttype8i8 %a , %b
@@ -220,9 +220,9 @@ entry:
 %shifttype16i8 = type <16 x i8>
 define %shifttype16i8 @shift16i8(%shifttype16i8 %a, %shifttype16i8 %b) {
 entry:
-  ; SSE2: shift16i8
+  ; SSE2-LABEL: shift16i8
   ; SSE2: cost of 26 {{.*}} lshr
-  ; SSE2-CODEGEN: shift16i8
+  ; SSE2-CODEGEN-LABEL: shift16i8
   ; SSE2-CODEGEN: psrlw
 
   %0 = lshr %shifttype16i8 %a , %b
@@ -232,9 +232,9 @@ entry:
 %shifttype32i8 = type <32 x i8>
 define %shifttype32i8 @shift32i8(%shifttype32i8 %a, %shifttype32i8 %b) {
 entry:
-  ; SSE2: shift32i8
+  ; SSE2-LABEL: shift32i8
   ; SSE2: cost of 52 {{.*}} lshr
-  ; SSE2-CODEGEN: shift32i8
+  ; SSE2-CODEGEN-LABEL: shift32i8
   ; SSE2-CODEGEN: psrlw
 
   %0 = lshr %shifttype32i8 %a , %b
@@ -246,9 +246,9 @@ entry:
 %shifttypec = type <2 x i16>
 define %shifttypec @shift2i16const(%shifttypec %a, %shifttypec %b) {
 entry:
-  ; SSE2: shift2i16const
+  ; SSE2-LABEL: shift2i16const
   ; SSE2: cost of 1 {{.*}} lshr
-  ; SSE2-CODEGEN: shift2i16const
+  ; SSE2-CODEGEN-LABEL: shift2i16const
   ; SSE2-CODEGEN: psrlq $3
 
   %0 = lshr %shifttypec %a , <i16 3, i16 3>
@@ -258,9 +258,9 @@ entry:
 %shifttypec4i16 = type <4 x i16>
 define %shifttypec4i16 @shift4i16const(%shifttypec4i16 %a, %shifttypec4i16 %b) {
 entry:
-  ; SSE2: shift4i16const
+  ; SSE2-LABEL: shift4i16const
   ; SSE2: cost of 1 {{.*}} lshr
-  ; SSE2-CODEGEN: shift4i16const
+  ; SSE2-CODEGEN-LABEL: shift4i16const
   ; SSE2-CODEGEN: psrld $3
 
   %0 = lshr %shifttypec4i16 %a , <i16 3, i16 3, i16 3, i16 3>
@@ -270,9 +270,9 @@ entry:
 %shifttypec8i16 = type <8 x i16>
 define %shifttypec8i16 @shift8i16const(%shifttypec8i16 %a, %shifttypec8i16 %b) {
 entry:
-  ; SSE2: shift8i16const
+  ; SSE2-LABEL: shift8i16const
   ; SSE2: cost of 1 {{.*}} lshr
-  ; SSE2-CODEGEN: shift8i16const
+  ; SSE2-CODEGEN-LABEL: shift8i16const
   ; SSE2-CODEGEN: psrlw $3
 
   %0 = lshr %shifttypec8i16 %a , <i16 3, i16 3, i16 3, i16 3,
@@ -284,9 +284,9 @@ entry:
 define %shifttypec16i16 @shift16i16const(%shifttypec16i16 %a,
                                          %shifttypec16i16 %b) {
 entry:
-  ; SSE2: shift16i16const
+  ; SSE2-LABEL: shift16i16const
   ; SSE2: cost of 2 {{.*}} lshr
-  ; SSE2-CODEGEN: shift16i16const
+  ; SSE2-CODEGEN-LABEL: shift16i16const
   ; SSE2-CODEGEN: psrlw $3
 
   %0 = lshr %shifttypec16i16 %a , <i16 3, i16 3, i16 3, i16 3,
@@ -300,9 +300,9 @@ entry:
 define %shifttypec32i16 @shift32i16const(%shifttypec32i16 %a,
                                         %shifttypec32i16 %b) {
 entry:
-  ; SSE2: shift32i16const
+  ; SSE2-LABEL: shift32i16const
   ; SSE2: cost of 4 {{.*}} lshr
-  ; SSE2-CODEGEN: shift32i16const
+  ; SSE2-CODEGEN-LABEL: shift32i16const
   ; SSE2-CODEGEN: psrlw $3
 
   %0 = lshr %shifttypec32i16 %a , <i16 3, i16 3, i16 3, i16 3,
@@ -319,9 +319,9 @@ entry:
 %shifttypec2i32 = type <2 x i32>
 define %shifttypec2i32 @shift2i32c(%shifttypec2i32 %a, %shifttypec2i32 %b) {
 entry:
-  ; SSE2: shift2i32c
+  ; SSE2-LABEL: shift2i32c
   ; SSE2: cost of 1 {{.*}} lshr
-  ; SSE2-CODEGEN: shift2i32c
+  ; SSE2-CODEGEN-LABEL: shift2i32c
   ; SSE2-CODEGEN: psrlq $3
 
   %0 = lshr %shifttypec2i32 %a , <i32 3, i32 3>
@@ -331,9 +331,9 @@ entry:
 %shifttypec4i32 = type <4 x i32>
 define %shifttypec4i32 @shift4i32c(%shifttypec4i32 %a, %shifttypec4i32 %b) {
 entry:
-  ; SSE2: shift4i32c
+  ; SSE2-LABEL: shift4i32c
   ; SSE2: cost of 1 {{.*}} lshr
-  ; SSE2-CODEGEN: shift4i32c
+  ; SSE2-CODEGEN-LABEL: shift4i32c
   ; SSE2-CODEGEN: psrld $3
 
   %0 = lshr %shifttypec4i32 %a , <i32 3, i32 3, i32 3, i32 3>
@@ -343,9 +343,9 @@ entry:
 %shifttypec8i32 = type <8 x i32>
 define %shifttypec8i32 @shift8i32c(%shifttypec8i32 %a, %shifttypec8i32 %b) {
 entry:
-  ; SSE2: shift8i32c
+  ; SSE2-LABEL: shift8i32c
   ; SSE2: cost of 2 {{.*}} lshr
-  ; SSE2-CODEGEN: shift8i32c
+  ; SSE2-CODEGEN-LABEL: shift8i32c
   ; SSE2-CODEGEN: psrld $3
 
   %0 = lshr %shifttypec8i32 %a , <i32 3, i32 3, i32 3, i32 3,
@@ -356,9 +356,9 @@ entry:
 %shifttypec16i32 = type <16 x i32>
 define %shifttypec16i32 @shift16i32c(%shifttypec16i32 %a, %shifttypec16i32 %b) {
 entry:
-  ; SSE2: shift16i32c
+  ; SSE2-LABEL: shift16i32c
   ; SSE2: cost of 4 {{.*}} lshr
-  ; SSE2-CODEGEN: shift16i32c
+  ; SSE2-CODEGEN-LABEL: shift16i32c
   ; SSE2-CODEGEN: psrld $3
 
   %0 = lshr %shifttypec16i32 %a , <i32 3, i32 3, i32 3, i32 3,
@@ -371,9 +371,9 @@ entry:
 %shifttypec32i32 = type <32 x i32>
 define %shifttypec32i32 @shift32i32c(%shifttypec32i32 %a, %shifttypec32i32 %b) {
 entry:
-  ; SSE2: shift32i32c
+  ; SSE2-LABEL: shift32i32c
   ; SSE2: cost of 8 {{.*}} lshr
-  ; SSE2-CODEGEN: shift32i32c
+  ; SSE2-CODEGEN-LABEL: shift32i32c
   ; SSE2-CODEGEN: psrld $3
   %0 = lshr %shifttypec32i32 %a , <i32 3, i32 3, i32 3, i32 3,
                                    i32 3, i32 3, i32 3, i32 3,
@@ -389,9 +389,9 @@ entry:
 %shifttypec2i64 = type <2 x i64>
 define %shifttypec2i64 @shift2i64c(%shifttypec2i64 %a, %shifttypec2i64 %b) {
 entry:
-  ; SSE2: shift2i64c
+  ; SSE2-LABEL: shift2i64c
   ; SSE2: cost of 1 {{.*}} lshr
-  ; SSE2-CODEGEN: shift2i64c
+  ; SSE2-CODEGEN-LABEL: shift2i64c
   ; SSE2-CODEGEN: psrlq $3
 
   %0 = lshr %shifttypec2i64 %a , <i64 3, i64 3>
@@ -401,9 +401,9 @@ entry:
 %shifttypec4i64 = type <4 x i64>
 define %shifttypec4i64 @shift4i64c(%shifttypec4i64 %a, %shifttypec4i64 %b) {
 entry:
-  ; SSE2: shift4i64c
+  ; SSE2-LABEL: shift4i64c
   ; SSE2: cost of 2 {{.*}} lshr
-  ; SSE2-CODEGEN: shift4i64c
+  ; SSE2-CODEGEN-LABEL: shift4i64c
   ; SSE2-CODEGEN: psrlq $3
 
   %0 = lshr %shifttypec4i64 %a , <i64 3, i64 3, i64 3, i64 3>
@@ -413,9 +413,9 @@ entry:
 %shifttypec8i64 = type <8 x i64>
 define %shifttypec8i64 @shift8i64c(%shifttypec8i64 %a, %shifttypec8i64 %b) {
 entry:
-  ; SSE2: shift8i64c
+  ; SSE2-LABEL: shift8i64c
   ; SSE2: cost of 4 {{.*}} lshr
-  ; SSE2-CODEGEN: shift8i64c
+  ; SSE2-CODEGEN-LABEL: shift8i64c
   ; SSE2-CODEGEN: psrlq $3
 
  %0 = lshr %shifttypec8i64 %a , <i64 3, i64 3, i64 3, i64 3,
@@ -426,9 +426,9 @@ entry:
 %shifttypec16i64 = type <16 x i64>
 define %shifttypec16i64 @shift16i64c(%shifttypec16i64 %a, %shifttypec16i64 %b) {
 entry:
-  ; SSE2: shift16i64c
+  ; SSE2-LABEL: shift16i64c
   ; SSE2: cost of 8 {{.*}} lshr
-  ; SSE2-CODEGEN: shift16i64c
+  ; SSE2-CODEGEN-LABEL: shift16i64c
   ; SSE2-CODEGEN: psrlq $3
 
   %0 = lshr %shifttypec16i64 %a , <i64 3, i64 3, i64 3, i64 3,
@@ -441,9 +441,9 @@ entry:
 %shifttypec32i64 = type <32 x i64>
 define %shifttypec32i64 @shift32i64c(%shifttypec32i64 %a, %shifttypec32i64 %b) {
 entry:
-  ; SSE2: shift32i64c
+  ; SSE2-LABEL: shift32i64c
   ; SSE2: cost of 16 {{.*}} lshr
-  ; SSE2-CODEGEN: shift32i64c
+  ; SSE2-CODEGEN-LABEL: shift32i64c
   ; SSE2-CODEGEN: psrlq $3
 
   %0 = lshr %shifttypec32i64 %a ,<i64 3, i64 3, i64 3, i64 3,
@@ -460,9 +460,9 @@ entry:
 %shifttypec2i8 = type <2 x i8>
 define %shifttypec2i8 @shift2i8c(%shifttypec2i8 %a, %shifttypec2i8 %b) {
 entry:
-  ; SSE2: shift2i8c
+  ; SSE2-LABEL: shift2i8c
   ; SSE2: cost of 1 {{.*}} lshr
-  ; SSE2-CODEGEN: shift2i8c
+  ; SSE2-CODEGEN-LABEL: shift2i8c
   ; SSE2-CODEGEN: psrlq $3
 
   %0 = lshr %shifttypec2i8 %a , <i8 3, i8 3>
@@ -472,9 +472,9 @@ entry:
 %shifttypec4i8 = type <4 x i8>
 define %shifttypec4i8 @shift4i8c(%shifttypec4i8 %a, %shifttypec4i8 %b) {
 entry:
-  ; SSE2: shift4i8c
+  ; SSE2-LABEL: shift4i8c
   ; SSE2: cost of 1 {{.*}} lshr
-  ; SSE2-CODEGEN: shift4i8c
+  ; SSE2-CODEGEN-LABEL: shift4i8c
   ; SSE2-CODEGEN: psrld $3
 
   %0 = lshr %shifttypec4i8 %a , <i8 3, i8 3, i8 3, i8 3>
@@ -484,9 +484,9 @@ entry:
 %shifttypec8i8 = type <8 x i8>
 define %shifttypec8i8 @shift8i8c(%shifttypec8i8 %a, %shifttypec8i8 %b) {
 entry:
-  ; SSE2: shift8i8c
+  ; SSE2-LABEL: shift8i8c
   ; SSE2: cost of 1 {{.*}} lshr
-  ; SSE2-CODEGEN: shift8i8c
+  ; SSE2-CODEGEN-LABEL: shift8i8c
   ; SSE2-CODEGEN: psrlw $3
 
   %0 = lshr %shifttypec8i8 %a , <i8 3, i8 3, i8 3, i8 3,
@@ -497,9 +497,9 @@ entry:
 %shifttypec16i8 = type <16 x i8>
 define %shifttypec16i8 @shift16i8c(%shifttypec16i8 %a, %shifttypec16i8 %b) {
 entry:
-  ; SSE2: shift16i8c
+  ; SSE2-LABEL: shift16i8c
   ; SSE2: cost of 2 {{.*}} lshr
-  ; SSE2-CODEGEN: shift16i8c
+  ; SSE2-CODEGEN-LABEL: shift16i8c
   ; SSE2-CODEGEN: psrlw $3
 
   %0 = lshr %shifttypec16i8 %a , <i8 3, i8 3, i8 3, i8 3,
@@ -512,9 +512,9 @@ entry:
 %shifttypec32i8 = type <32 x i8>
 define %shifttypec32i8 @shift32i8c(%shifttypec32i8 %a, %shifttypec32i8 %b) {
 entry:
-  ; SSE2: shift32i8c
+  ; SSE2-LABEL: shift32i8c
   ; SSE2: cost of 4 {{.*}} lshr
-  ; SSE2-CODEGEN: shift32i8c
+  ; SSE2-CODEGEN-LABEL: shift32i8c
   ; SSE2-CODEGEN: psrlw $3
 
   %0 = lshr %shifttypec32i8 %a , <i8 3, i8 3, i8 3, i8 3,
diff --git a/test/Analysis/CostModel/X86/testshiftshl.ll b/test/Analysis/CostModel/X86/testshiftshl.ll
index 5f48b46684d..7db82b9fa5c 100644
--- a/test/Analysis/CostModel/X86/testshiftshl.ll
+++ b/test/Analysis/CostModel/X86/testshiftshl.ll
@@ -4,9 +4,9 @@
 %shifttype = type <2 x i16>
 define %shifttype @shift2i16(%shifttype %a, %shifttype %b) {
 entry:
-  ; SSE2: shift2i16
+  ; SSE2-LABEL: shift2i16
   ; SSE2: cost of 4 {{.*}} shl
-  ; SSE2-CODEGEN: shift2i16
+  ; SSE2-CODEGEN-LABEL: shift2i16
   ; SSE2-CODEGEN: psllq
 
   %0 = shl %shifttype %a , %b
@@ -16,9 +16,9 @@ entry:
 %shifttype4i16 = type <4 x i16>
 define %shifttype4i16 @shift4i16(%shifttype4i16 %a, %shifttype4i16 %b) {
 entry:
-  ; SSE2: shift4i16
+  ; SSE2-LABEL: shift4i16
   ; SSE2: cost of 10 {{.*}} shl
-  ; SSE2-CODEGEN: shift4i16
+  ; SSE2-CODEGEN-LABEL: shift4i16
   ; SSE2-CODEGEN: pmuludq
 
   %0 = shl %shifttype4i16 %a , %b
@@ -28,9 +28,9 @@ entry:
 %shifttype8i16 = type <8 x i16>
 define %shifttype8i16 @shift8i16(%shifttype8i16 %a, %shifttype8i16 %b) {
 entry:
-  ; SSE2: shift8i16
+  ; SSE2-LABEL: shift8i16
   ; SSE2: cost of 32 {{.*}} shl
-  ; SSE2-CODEGEN: shift8i16
+  ; SSE2-CODEGEN-LABEL: shift8i16
   ; SSE2-CODEGEN: pmullw
 
   %0 = shl %shifttype8i16 %a , %b
@@ -40,9 +40,9 @@ entry:
 %shifttype16i16 = type <16 x i16>
 define %shifttype16i16 @shift16i16(%shifttype16i16 %a, %shifttype16i16 %b) {
 entry:
-  ; SSE2: shift16i16
+  ; SSE2-LABEL: shift16i16
   ; SSE2: cost of 64 {{.*}} shl
-  ; SSE2-CODEGEN: shift16i16
+  ; SSE2-CODEGEN-LABEL: shift16i16
   ; SSE2-CODEGEN: pmullw
 
   %0 = shl %shifttype16i16 %a , %b
@@ -52,9 +52,9 @@ entry:
 %shifttype32i16 = type <32 x i16>
 define %shifttype32i16 @shift32i16(%shifttype32i16 %a, %shifttype32i16 %b) {
 entry:
-  ; SSE2: shift32i16
+  ; SSE2-LABEL: shift32i16
   ; SSE2: cost of 128 {{.*}} shl
-  ; SSE2-CODEGEN: shift32i16
+  ; SSE2-CODEGEN-LABEL: shift32i16
   ; SSE2-CODEGEN: pmullw
 
   %0 = shl %shifttype32i16 %a , %b
@@ -64,9 +64,9 @@ entry:
 %shifttype2i32 = type <2 x i32>
 define %shifttype2i32 @shift2i32(%shifttype2i32 %a, %shifttype2i32 %b) {
 entry:
-  ; SSE2: shift2i32
+  ; SSE2-LABEL: shift2i32
   ; SSE2: cost of 4 {{.*}} shl
-  ; SSE2-CODEGEN: shift2i32
+  ; SSE2-CODEGEN-LABEL: shift2i32
   ; SSE2-CODEGEN: psllq
 
   %0 = shl %shifttype2i32 %a , %b
@@ -76,9 +76,9 @@ entry:
 %shifttype4i32 = type <4 x i32>
 define %shifttype4i32 @shift4i32(%shifttype4i32 %a, %shifttype4i32 %b) {
 entry:
-  ; SSE2: shift4i32
+  ; SSE2-LABEL: shift4i32
   ; SSE2: cost of 10 {{.*}} shl
-  ; SSE2-CODEGEN: shift4i32
+  ; SSE2-CODEGEN-LABEL: shift4i32
   ; SSE2-CODEGEN: pmuludq
 
   %0 = shl %shifttype4i32 %a , %b
@@ -88,9 +88,9 @@ entry:
 %shifttype8i32 = type <8 x i32>
 define %shifttype8i32 @shift8i32(%shifttype8i32 %a, %shifttype8i32 %b) {
 entry:
-  ; SSE2: shift8i32
+  ; SSE2-LABEL: shift8i32
   ; SSE2: cost of 20 {{.*}} shl
-  ; SSE2-CODEGEN: shift8i32
+  ; SSE2-CODEGEN-LABEL: shift8i32
   ; SSE2-CODEGEN: pmuludq
 
   %0 = shl %shifttype8i32 %a , %b
@@ -100,9 +100,9 @@ entry:
 %shifttype16i32 = type <16 x i32>
 define %shifttype16i32 @shift16i32(%shifttype16i32 %a, %shifttype16i32 %b) {
 entry:
-  ; SSE2: shift16i32
+  ; SSE2-LABEL: shift16i32
   ; SSE2: cost of 40 {{.*}} shl
-  ; SSE2-CODEGEN: shift16i32
+  ; SSE2-CODEGEN-LABEL: shift16i32
   ; SSE2-CODEGEN: pmuludq
 
   %0 = shl %shifttype16i32 %a , %b
@@ -112,9 +112,9 @@ entry:
 %shifttype32i32 = type <32 x i32>
 define %shifttype32i32 @shift32i32(%shifttype32i32 %a, %shifttype32i32 %b) {
 entry:
-  ; SSE2: shift32i32
+  ; SSE2-LABEL: shift32i32
   ; SSE2: cost of 80 {{.*}} shl
-  ; SSE2-CODEGEN: shift32i32
+  ; SSE2-CODEGEN-LABEL: shift32i32
   ; SSE2-CODEGEN: pmuludq
 
   %0 = shl %shifttype32i32 %a , %b
@@ -124,9 +124,9 @@ entry:
 %shifttype2i64 = type <2 x i64>
 define %shifttype2i64 @shift2i64(%shifttype2i64 %a, %shifttype2i64 %b) {
 entry:
-  ; SSE2: shift2i64
+  ; SSE2-LABEL: shift2i64
   ; SSE2: cost of 4 {{.*}} shl
-  ; SSE2-CODEGEN: shift2i64
+  ; SSE2-CODEGEN-LABEL: shift2i64
   ; SSE2-CODEGEN: psllq
 
   %0 = shl %shifttype2i64 %a , %b
@@ -136,9 +136,9 @@ entry:
 %shifttype4i64 = type <4 x i64>
 define %shifttype4i64 @shift4i64(%shifttype4i64 %a, %shifttype4i64 %b) {
 entry:
-  ; SSE2: shift4i64
+  ; SSE2-LABEL: shift4i64
   ; SSE2: cost of 8 {{.*}} shl
-  ; SSE2-CODEGEN: shift4i64
+  ; SSE2-CODEGEN-LABEL: shift4i64
   ; SSE2-CODEGEN: psllq
 
   %0 = shl %shifttype4i64 %a , %b
@@ -148,9 +148,9 @@ entry:
 %shifttype8i64 = type <8 x i64>
 define %shifttype8i64 @shift8i64(%shifttype8i64 %a, %shifttype8i64 %b) {
 entry:
-  ; SSE2: shift8i64
+  ; SSE2-LABEL: shift8i64
   ; SSE2: cost of 16 {{.*}} shl
-  ; SSE2-CODEGEN: shift8i64
+  ; SSE2-CODEGEN-LABEL: shift8i64
   ; SSE2-CODEGEN: psllq
 
   %0 = shl %shifttype8i64 %a , %b
@@ -160,9 +160,9 @@ entry:
 %shifttype16i64 = type <16 x i64>
 define %shifttype16i64 @shift16i64(%shifttype16i64 %a, %shifttype16i64 %b) {
 entry:
-  ; SSE2: shift16i64
+  ; SSE2-LABEL: shift16i64
   ; SSE2: cost of 32 {{.*}} shl
-  ; SSE2-CODEGEN: shift16i64
+  ; SSE2-CODEGEN-LABEL: shift16i64
   ; SSE2-CODEGEN: psllq
 
   %0 = shl %shifttype16i64 %a , %b
@@ -172,9 +172,9 @@ entry:
 %shifttype32i64 = type <32 x i64>
 define %shifttype32i64 @shift32i64(%shifttype32i64 %a, %shifttype32i64 %b) {
 entry:
-  ; SSE2: shift32i64
+  ; SSE2-LABEL: shift32i64
   ; SSE2: cost of 64 {{.*}} shl
-  ; SSE2-CODEGEN: shift32i64
+  ; SSE2-CODEGEN-LABEL: shift32i64
   ; SSE2-CODEGEN: psllq
 
   %0 = shl %shifttype32i64 %a , %b
@@ -184,9 +184,9 @@ entry:
 %shifttype2i8 = type <2 x i8>
 define %shifttype2i8 @shift2i8(%shifttype2i8 %a, %shifttype2i8 %b) {
 entry:
-  ; SSE2: shift2i8
+  ; SSE2-LABEL: shift2i8
   ; SSE2: cost of 4 {{.*}} shl
-  ; SSE2-CODEGEN: shift2i8
+  ; SSE2-CODEGEN-LABEL: shift2i8
   ; SSE2-CODEGEN: psllq
 
   %0 = shl %shifttype2i8 %a , %b
@@ -196,9 +196,9 @@ entry:
 %shifttype4i8 = type <4 x i8>
 define %shifttype4i8 @shift4i8(%shifttype4i8 %a, %shifttype4i8 %b) {
 entry:
-  ; SSE2: shift4i8
+  ; SSE2-LABEL: shift4i8
   ; SSE2: cost of 10 {{.*}} shl
-  ; SSE2-CODEGEN: shift4i8
+  ; SSE2-CODEGEN-LABEL: shift4i8
   ; SSE2-CODEGEN: pmuludq
 
   %0 = shl %shifttype4i8 %a , %b
@@ -208,9 +208,9 @@ entry:
 %shifttype8i8 = type <8 x i8>
 define %shifttype8i8 @shift8i8(%shifttype8i8 %a, %shifttype8i8 %b) {
 entry:
-  ; SSE2: shift8i8
+  ; SSE2-LABEL: shift8i8
   ; SSE2: cost of 32 {{.*}} shl
-  ; SSE2-CODEGEN: shift8i8
+  ; SSE2-CODEGEN-LABEL: shift8i8
   ; SSE2-CODEGEN: pmullw
 
   %0 = shl %shifttype8i8 %a , %b
@@ -220,9 +220,9 @@ entry:
 %shifttype16i8 = type <16 x i8>
 define %shifttype16i8 @shift16i8(%shifttype16i8 %a, %shifttype16i8 %b) {
 entry:
-  ; SSE2: shift16i8
+  ; SSE2-LABEL: shift16i8
   ; SSE2: cost of 26 {{.*}} shl
-  ; SSE2-CODEGEN: shift16i8
+  ; SSE2-CODEGEN-LABEL: shift16i8
   ; SSE2-CODEGEN: psllw
 
   %0 = shl %shifttype16i8 %a , %b
@@ -232,9 +232,9 @@ entry:
 %shifttype32i8 = type <32 x i8>
 define %shifttype32i8 @shift32i8(%shifttype32i8 %a, %shifttype32i8 %b) {
 entry:
-  ; SSE2: shift32i8
+  ; SSE2-LABEL: shift32i8
   ; SSE2: cost of 52 {{.*}} shl
-  ; SSE2-CODEGEN: shift32i8
+  ; SSE2-CODEGEN-LABEL: shift32i8
   ; SSE2-CODEGEN: psllw
 
   %0 = shl %shifttype32i8 %a , %b
@@ -246,9 +246,9 @@ entry:
 %shifttypec = type <2 x i16>
 define %shifttypec @shift2i16const(%shifttypec %a, %shifttypec %b) {
 entry:
-  ; SSE2: shift2i16const
+  ; SSE2-LABEL: shift2i16const
   ; SSE2: cost of 1 {{.*}} shl
-  ; SSE2-CODEGEN: shift2i16const
+  ; SSE2-CODEGEN-LABEL: shift2i16const
   ; SSE2-CODEGEN: psllq $3
 
   %0 = shl %shifttypec %a , <i16 3, i16 3>
@@ -258,9 +258,9 @@ entry:
 %shifttypec4i16 = type <4 x i16>
 define %shifttypec4i16 @shift4i16const(%shifttypec4i16 %a, %shifttypec4i16 %b) {
 entry:
-  ; SSE2: shift4i16const
+  ; SSE2-LABEL: shift4i16const
   ; SSE2: cost of 1 {{.*}} shl
-  ; SSE2-CODEGEN: shift4i16const
+  ; SSE2-CODEGEN-LABEL: shift4i16const
   ; SSE2-CODEGEN: pslld $3
 
   %0 = shl %shifttypec4i16 %a , <i16 3, i16 3, i16 3, i16 3>
@@ -270,9 +270,9 @@ entry:
 %shifttypec8i16 = type <8 x i16>
 define %shifttypec8i16 @shift8i16const(%shifttypec8i16 %a, %shifttypec8i16 %b) {
 entry:
-  ; SSE2: shift8i16const
+  ; SSE2-LABEL: shift8i16const
   ; SSE2: cost of 1 {{.*}} shl
-  ; SSE2-CODEGEN: shift8i16const
+  ; SSE2-CODEGEN-LABEL: shift8i16const
   ; SSE2-CODEGEN: psllw $3
 
   %0 = shl %shifttypec8i16 %a , <i16 3, i16 3, i16 3, i16 3,
@@ -284,9 +284,9 @@ entry:
 define %shifttypec16i16 @shift16i16const(%shifttypec16i16 %a,
                                          %shifttypec16i16 %b) {
 entry:
-  ; SSE2: shift16i16const
+  ; SSE2-LABEL: shift16i16const
   ; SSE2: cost of 2 {{.*}} shl
-  ; SSE2-CODEGEN: shift16i16const
+  ; SSE2-CODEGEN-LABEL: shift16i16const
   ; SSE2-CODEGEN: psllw $3
 
   %0 = shl %shifttypec16i16 %a , <i16 3, i16 3, i16 3, i16 3,
@@ -300,9 +300,9 @@ entry:
 define %shifttypec32i16 @shift32i16const(%shifttypec32i16 %a,
                                         %shifttypec32i16 %b) {
 entry:
-  ; SSE2: shift32i16const
+  ; SSE2-LABEL: shift32i16const
   ; SSE2: cost of 4 {{.*}} shl
-  ; SSE2-CODEGEN: shift32i16const
+  ; SSE2-CODEGEN-LABEL: shift32i16const
   ; SSE2-CODEGEN: psllw $3
 
   %0 = shl %shifttypec32i16 %a , <i16 3, i16 3, i16 3, i16 3,
@@ -319,9 +319,9 @@ entry:
 %shifttypec2i32 = type <2 x i32>
 define %shifttypec2i32 @shift2i32c(%shifttypec2i32 %a, %shifttypec2i32 %b) {
 entry:
-  ; SSE2: shift2i32c
+  ; SSE2-LABEL: shift2i32c
   ; SSE2: cost of 1 {{.*}} shl
-  ; SSE2-CODEGEN: shift2i32c
+  ; SSE2-CODEGEN-LABEL: shift2i32c
   ; SSE2-CODEGEN: psllq $3
 
   %0 = shl %shifttypec2i32 %a , <i32 3, i32 3>
@@ -331,9 +331,9 @@ entry:
 %shifttypec4i32 = type <4 x i32>
 define %shifttypec4i32 @shift4i32c(%shifttypec4i32 %a, %shifttypec4i32 %b) {
 entry:
-  ; SSE2: shift4i32c
+  ; SSE2-LABEL: shift4i32c
   ; SSE2: cost of 1 {{.*}} shl
-  ; SSE2-CODEGEN: shift4i32c
+  ; SSE2-CODEGEN-LABEL: shift4i32c
   ; SSE2-CODEGEN: pslld $3
 
   %0 = shl %shifttypec4i32 %a , <i32 3, i32 3, i32 3, i32 3>
@@ -343,9 +343,9 @@ entry:
 %shifttypec8i32 = type <8 x i32>
 define %shifttypec8i32 @shift8i32c(%shifttypec8i32 %a, %shifttypec8i32 %b) {
 entry:
-  ; SSE2: shift8i32c
+  ; SSE2-LABEL: shift8i32c
   ; SSE2: cost of 2 {{.*}} shl
-  ; SSE2-CODEGEN: shift8i32c
+  ; SSE2-CODEGEN-LABEL: shift8i32c
   ; SSE2-CODEGEN: pslld $3
 
   %0 = shl %shifttypec8i32 %a , <i32 3, i32 3, i32 3, i32 3,
@@ -356,9 +356,9 @@ entry:
 %shifttypec16i32 = type <16 x i32>
 define %shifttypec16i32 @shift16i32c(%shifttypec16i32 %a, %shifttypec16i32 %b) {
 entry:
-  ; SSE2: shift16i32c
+  ; SSE2-LABEL: shift16i32c
   ; SSE2: cost of 4 {{.*}} shl
-  ; SSE2-CODEGEN: shift16i32c
+  ; SSE2-CODEGEN-LABEL: shift16i32c
   ; SSE2-CODEGEN: pslld $3
 
   %0 = shl %shifttypec16i32 %a , <i32 3, i32 3, i32 3, i32 3,
@@ -371,9 +371,9 @@ entry:
 %shifttypec32i32 = type <32 x i32>
 define %shifttypec32i32 @shift32i32c(%shifttypec32i32 %a, %shifttypec32i32 %b) {
 entry:
-  ; SSE2: shift32i32c
+  ; SSE2-LABEL: shift32i32c
   ; SSE2: cost of 8 {{.*}} shl
-  ; SSE2-CODEGEN: shift32i32c
+  ; SSE2-CODEGEN-LABEL: shift32i32c
   ; SSE2-CODEGEN: pslld $3
   %0 = shl %shifttypec32i32 %a , <i32 3, i32 3, i32 3, i32 3,
                                    i32 3, i32 3, i32 3, i32 3,
@@ -389,9 +389,9 @@ entry:
 %shifttypec2i64 = type <2 x i64>
 define %shifttypec2i64 @shift2i64c(%shifttypec2i64 %a, %shifttypec2i64 %b) {
 entry:
-  ; SSE2: shift2i64c
+  ; SSE2-LABEL: shift2i64c
   ; SSE2: cost of 1 {{.*}} shl
-  ; SSE2-CODEGEN: shift2i64c
+  ; SSE2-CODEGEN-LABEL: shift2i64c
   ; SSE2-CODEGEN: psllq $3
 
   %0 = shl %shifttypec2i64 %a , <i64 3, i64 3>
@@ -401,9 +401,9 @@ entry:
 %shifttypec4i64 = type <4 x i64>
 define %shifttypec4i64 @shift4i64c(%shifttypec4i64 %a, %shifttypec4i64 %b) {
 entry:
-  ; SSE2: shift4i64c
+  ; SSE2-LABEL: shift4i64c
   ; SSE2: cost of 2 {{.*}} shl
-  ; SSE2-CODEGEN: shift4i64c
+  ; SSE2-CODEGEN-LABEL: shift4i64c
   ; SSE2-CODEGEN: psllq $3
 
   %0 = shl %shifttypec4i64 %a , <i64 3, i64 3, i64 3, i64 3>
@@ -413,9 +413,9 @@ entry:
 %shifttypec8i64 = type <8 x i64>
 define %shifttypec8i64 @shift8i64c(%shifttypec8i64 %a, %shifttypec8i64 %b) {
 entry:
-  ; SSE2: shift8i64c
+  ; SSE2-LABEL: shift8i64c
   ; SSE2: cost of 4 {{.*}} shl
-  ; SSE2-CODEGEN: shift8i64c
+  ; SSE2-CODEGEN-LABEL: shift8i64c
   ; SSE2-CODEGEN: psllq $3
 
  %0 = shl %shifttypec8i64 %a , <i64 3, i64 3, i64 3, i64 3,
@@ -426,9 +426,9 @@ entry:
 %shifttypec16i64 = type <16 x i64>
 define %shifttypec16i64 @shift16i64c(%shifttypec16i64 %a, %shifttypec16i64 %b) {
 entry:
-  ; SSE2: shift16i64c
+  ; SSE2-LABEL: shift16i64c
   ; SSE2: cost of 8 {{.*}} shl
-  ; SSE2-CODEGEN: shift16i64c
+  ; SSE2-CODEGEN-LABEL: shift16i64c
   ; SSE2-CODEGEN: psllq $3
 
   %0 = shl %shifttypec16i64 %a , <i64 3, i64 3, i64 3, i64 3,
@@ -441,9 +441,9 @@ entry:
 %shifttypec32i64 = type <32 x i64>
 define %shifttypec32i64 @shift32i64c(%shifttypec32i64 %a, %shifttypec32i64 %b) {
 entry:
-  ; SSE2: shift32i64c
+  ; SSE2-LABEL: shift32i64c
   ; SSE2: cost of 16 {{.*}} shl
-  ; SSE2-CODEGEN: shift32i64c
+  ; SSE2-CODEGEN-LABEL: shift32i64c
   ; SSE2-CODEGEN: psllq $3
 
   %0 = shl %shifttypec32i64 %a ,<i64 3, i64 3, i64 3, i64 3,
@@ -460,9 +460,9 @@ entry:
 %shifttypec2i8 = type <2 x i8>
 define %shifttypec2i8 @shift2i8c(%shifttypec2i8 %a, %shifttypec2i8 %b) {
 entry:
-  ; SSE2: shift2i8c
+  ; SSE2-LABEL: shift2i8c
   ; SSE2: cost of 1 {{.*}} shl
-  ; SSE2-CODEGEN: shift2i8c
+  ; SSE2-CODEGEN-LABEL: shift2i8c
   ; SSE2-CODEGEN: psllq $3
 
   %0 = shl %shifttypec2i8 %a , <i8 3, i8 3>
@@ -472,9 +472,9 @@ entry:
 %shifttypec4i8 = type <4 x i8>
 define %shifttypec4i8 @shift4i8c(%shifttypec4i8 %a, %shifttypec4i8 %b) {
 entry:
-  ; SSE2: shift4i8c
+  ; SSE2-LABEL: shift4i8c
   ; SSE2: cost of 1 {{.*}} shl
-  ; SSE2-CODEGEN: shift4i8c
+  ; SSE2-CODEGEN-LABEL: shift4i8c
   ; SSE2-CODEGEN: pslld $3
 
   %0 = shl %shifttypec4i8 %a , <i8 3, i8 3, i8 3, i8 3>
@@ -484,9 +484,9 @@ entry:
 %shifttypec8i8 = type <8 x i8>
 define %shifttypec8i8 @shift8i8c(%shifttypec8i8 %a, %shifttypec8i8 %b) {
 entry:
-  ; SSE2: shift8i8c
+  ; SSE2-LABEL: shift8i8c
   ; SSE2: cost of 1 {{.*}} shl
-  ; SSE2-CODEGEN: shift8i8c
+  ; SSE2-CODEGEN-LABEL: shift8i8c
   ; SSE2-CODEGEN: psllw $3
 
   %0 = shl %shifttypec8i8 %a , <i8 3, i8 3, i8 3, i8 3,
@@ -497,9 +497,9 @@ entry:
 %shifttypec16i8 = type <16 x i8>
 define %shifttypec16i8 @shift16i8c(%shifttypec16i8 %a, %shifttypec16i8 %b) {
 entry:
-  ; SSE2: shift16i8c
+  ; SSE2-LABEL: shift16i8c
   ; SSE2: cost of 2 {{.*}} shl
-  ; SSE2-CODEGEN: shift16i8c
+  ; SSE2-CODEGEN-LABEL: shift16i8c
   ; SSE2-CODEGEN: psllw $3
 
   %0 = shl %shifttypec16i8 %a , <i8 3, i8 3, i8 3, i8 3,
@@ -512,9 +512,9 @@ entry:
 %shifttypec32i8 = type <32 x i8>
 define %shifttypec32i8 @shift32i8c(%shifttypec32i8 %a, %shifttypec32i8 %b) {
 entry:
-  ; SSE2: shift32i8c
+  ; SSE2-LABEL: shift32i8c
   ; SSE2: cost of 4 {{.*}} shl
-  ; SSE2-CODEGEN: shift32i8c
+  ; SSE2-CODEGEN-LABEL: shift32i8c
   ; SSE2-CODEGEN: psllw $3
 
   %0 = shl %shifttypec32i8 %a , <i8 3, i8 3, i8 3, i8 3,
-- 
GitLab


From e6fdc842f78352a916d688f3bf2f28b1424a53fd Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Fri, 26 Oct 2018 17:21:26 +0000
Subject: [PATCH 0644/1116] [X86] Stop promoting vector and/or/xor/andn to
 vXi64.

These promotions add additional bitcasts to the SelectionDAG that can pessimize computeKnownBits/computeNumSignBits. It also seems to interfere with broadcast formation.

This patch removes the promotion and adds isel patterns instead.

The increased table size is more than I would like, but hopefully we can find some canonicalizations or other tricks to start pruning out patterns going forward.

Differential Revision: https://reviews.llvm.org/D53268

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345408 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp            |  63 +-
 lib/Target/X86/X86InstrAVX512.td              | 679 ++++++++++++------
 lib/Target/X86/X86InstrFragmentsSIMD.td       |   1 +
 lib/Target/X86/X86InstrSSE.td                 | 111 +++
 lib/Target/X86/X86InstrXOP.td                 |  43 ++
 test/CodeGen/X86/avx512-arith.ll              |   6 +-
 .../X86/avx512-intrinsics-fast-isel.ll        |   8 +-
 test/CodeGen/X86/avx512-intrinsics-upgrade.ll |   6 +-
 test/CodeGen/X86/avx512-logic.ll              |  10 +-
 test/CodeGen/X86/avx512-mask-op.ll            |  14 +-
 test/CodeGen/X86/avx512-schedule.ll           |  16 +-
 test/CodeGen/X86/avx512-select.ll             |   4 +-
 .../X86/bitcast-int-to-vector-bool-sext.ll    |   2 +-
 .../X86/bitcast-int-to-vector-bool-zext.ll    |   2 +-
 test/CodeGen/X86/bitcast-setcc-128.ll         |   1 -
 .../X86/broadcast-elm-cross-splat-vec.ll      |  80 ++-
 test/CodeGen/X86/movmsk-cmp.ll                |  48 +-
 test/CodeGen/X86/psubus.ll                    |  54 +-
 test/CodeGen/X86/sat-add.ll                   |   7 +-
 ...-masked-merge-vector-variablemask-const.ll |   8 +-
 test/CodeGen/X86/vec-copysign-avx512.ll       |   2 +-
 test/CodeGen/X86/vector-bitreverse.ll         |  12 +-
 test/CodeGen/X86/vector-lzcnt-512.ll          |  40 +-
 test/CodeGen/X86/vector-reduce-and.ll         |  18 +-
 test/CodeGen/X86/vector-reduce-or.ll          |  18 +-
 test/CodeGen/X86/vector-reduce-xor.ll         |  18 +-
 test/CodeGen/X86/vector-rotate-512.ll         |   6 +-
 test/CodeGen/X86/vector-trunc-math.ll         |   6 +-
 test/CodeGen/X86/vector-tzcnt-512.ll          |  20 +-
 29 files changed, 844 insertions(+), 459 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index bfd8c89599b..2ebaec778e3 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -9882,11 +9882,7 @@ static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
 
   SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
   V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
-  // We have to cast V2 around.
-  MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
-  V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
-                                      DAG.getBitcast(MaskVT, V1Mask),
-                                      DAG.getBitcast(MaskVT, V2)));
+  V2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2);
   return DAG.getNode(ISD::OR, DL, VT, V1, V2);
 }
 
@@ -35055,13 +35051,13 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
 static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
   assert(N->getOpcode() == ISD::AND);
 
-  EVT VT = N->getValueType(0);
-  if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
+  MVT VT = N->getSimpleValueType(0);
+  if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
     return SDValue();
 
   SDValue X, Y;
-  SDValue N0 = N->getOperand(0);
-  SDValue N1 = N->getOperand(1);
+  SDValue N0 = peekThroughBitcasts(N->getOperand(0));
+  SDValue N1 = peekThroughBitcasts(N->getOperand(1));
   if (N0.getOpcode() == ISD::XOR &&
       ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) {
     X = N0.getOperand(0);
@@ -35073,6 +35069,8 @@ static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
   } else
     return SDValue();
 
+  X = DAG.getBitcast(VT, X);
+  Y = DAG.getBitcast(VT, Y);
   return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
 }
 
@@ -35402,27 +35400,6 @@ static SDValue combineParity(SDNode *N, SelectionDAG &DAG,
   return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0), Setnp);
 }
 
-// This promotes vectors and/or/xor to a vXi64 type. We used to do this during
-// op legalization, but DAG combine yields better results.
-// TODO: This is largely just to reduce the number of isel patterns. Maybe we
-// can just add all the patterns or do C++ based selection in X86ISelDAGToDAG?
-static SDValue promoteVecLogicOp(SDNode *N, SelectionDAG &DAG) {
-  MVT VT = N->getSimpleValueType(0);
-
-  if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
-    return SDValue();
-
-  // Already correct type.
-  if (VT.getVectorElementType() == MVT::i64)
-    return SDValue();
-
-  MVT NewVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
-  SDValue Op0 = DAG.getBitcast(NewVT, N->getOperand(0));
-  SDValue Op1 = DAG.getBitcast(NewVT, N->getOperand(1));
-  return DAG.getBitcast(VT, DAG.getNode(N->getOpcode(), SDLoc(N), NewVT,
-                                        Op0, Op1));
-}
-
 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
@@ -35457,9 +35434,6 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
-  if (SDValue V = promoteVecLogicOp(N, DAG))
-    return V;
-
   if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
     return R;
 
@@ -35647,7 +35621,7 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
   if (!Subtarget.hasSSE41())
     return SDValue();
 
-  MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
+  MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
 
   X = DAG.getBitcast(BlendVT, X);
   Y = DAG.getBitcast(BlendVT, Y);
@@ -35782,9 +35756,6 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
-  if (SDValue V = promoteVecLogicOp(N, DAG))
-    return V;
-
   if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
     return R;
 
@@ -37760,7 +37731,9 @@ static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
   if ((VT.isVector() || VT == MVT::f128) && Subtarget.hasSSE2()) {
     SDLoc dl(N);
 
-    MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
+    unsigned IntBits = std::min(VT.getScalarSizeInBits(), 64U);
+    MVT IntSVT = MVT::getIntegerVT(IntBits);
+    MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
 
     SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
     SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
@@ -37813,9 +37786,6 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
-  if (SDValue V = promoteVecLogicOp(N, DAG))
-    return V;
-
   if (SDValue SetCC = foldXor1SetCC(N, DAG))
     return SetCC;
 
@@ -38043,15 +38013,22 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
 static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI,
                             const X86Subtarget &Subtarget) {
+  MVT VT = N->getSimpleValueType(0);
+
   // ANDNP(0, x) -> x
   if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
     return N->getOperand(1);
 
   // ANDNP(x, 0) -> 0
   if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
-    return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N));
+    return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
 
-  EVT VT = N->getValueType(0);
+  // Turn ANDNP back to AND if input is inverted.
+  if (VT.isVector() && N->getOperand(0).getOpcode() == ISD::XOR &&
+      ISD::isBuildVectorAllOnes(N->getOperand(0).getOperand(1).getNode())) {
+    return DAG.getNode(ISD::AND, SDLoc(N), VT,
+                       N->getOperand(0).getOperand(0), N->getOperand(1));
+  }
 
   // Attempt to recursively combine a bitmask ANDNP with shuffles.
   if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 5550eb0061f..ec314f329fd 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -68,13 +68,6 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
   // Load patterns
   PatFrag LdFrag = !cast<PatFrag>("load" # VTName);
 
-  PatFrag i64LdFrag = !cast<PatFrag>("load" #
-                                     !if (!eq (TypeVariantName, "i"),
-                                          !if (!eq (Size, 128), "v2i64",
-                                          !if (!eq (Size, 256), "v4i64",
-                                          !if (!eq (Size, 512), "v8i64",
-                                               VTName))), VTName));
-
   PatFrag AlignedLdFrag = !cast<PatFrag>("alignedload" # VTName);
 
   PatFrag ScalarLdFrag = !cast<PatFrag>("load" # EltVT);
@@ -102,10 +95,6 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
 
   RegisterClass FRC = !if (!eq (EltTypeName, "f32"), FR32X, FR64X);
 
-  // A vector tye of the same width with element type i64. This is used to
-  // create patterns for logic ops.
-  ValueType i64VT = !cast<ValueType>("v" # !srl(Size, 6) # "i64");
-
   // A vector type of the same width with element type i32.  This is used to
   // create the canonical constant zero node ImmAllZerosV.
   ValueType i32VT = !cast<ValueType>("v" # !srl(Size, 5) # "i32");
@@ -5094,152 +5083,147 @@ let Predicates = [HasAVX512, NoVLX] in {
 // AVX-512  Logical Instructions
 //===----------------------------------------------------------------------===//
 
-// OpNodeMsk is the OpNode to use when element size is important. OpNode will
-// be set to null_frag for 32-bit elements.
-multiclass avx512_logic_rm<bits<8> opc, string OpcodeStr,
-                           SDPatternOperator OpNode,
-                           SDNode OpNodeMsk, X86FoldableSchedWrite sched,
-                           X86VectorVTInfo _, bit IsCommutable = 0> {
-  let hasSideEffects = 0 in
-  defm rr : AVX512_maskable_logic<opc, MRMSrcReg, _, (outs _.RC:$dst),
-                    (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
-                    "$src2, $src1", "$src1, $src2",
-                    (_.i64VT (OpNode (bitconvert (_.VT _.RC:$src1)),
-                                     (bitconvert (_.VT _.RC:$src2)))),
-                    (_.VT (bitconvert (_.i64VT (OpNodeMsk _.RC:$src1,
-                                                          _.RC:$src2)))),
-                    IsCommutable>, AVX512BIBase, EVEX_4V,
-                    Sched<[sched]>;
-
-  let hasSideEffects = 0, mayLoad = 1 in
-  defm rm : AVX512_maskable_logic<opc, MRMSrcMem, _, (outs _.RC:$dst),
-                  (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
-                  "$src2, $src1", "$src1, $src2",
-                  (_.i64VT (OpNode (bitconvert (_.VT _.RC:$src1)),
-                                   (bitconvert (_.LdFrag addr:$src2)))),
-                  (_.VT (bitconvert (_.i64VT (OpNodeMsk _.RC:$src1,
-                                     (_.i64LdFrag addr:$src2)))))>,
-                  AVX512BIBase, EVEX_4V,
-                  Sched<[sched.Folded, sched.ReadAfterFold]>;
-}
-
-// OpNodeMsk is the OpNode to use where element size is important. So use
-// for all of the broadcast patterns.
-multiclass avx512_logic_rmb<bits<8> opc, string OpcodeStr,
-                            SDPatternOperator OpNode,
-                            SDNode OpNodeMsk, X86FoldableSchedWrite sched, X86VectorVTInfo _,
-                            bit IsCommutable = 0> :
-           avx512_logic_rm<opc, OpcodeStr, OpNode, OpNodeMsk, sched, _,
-                           IsCommutable> {
-  defm rmb : AVX512_maskable_logic<opc, MRMSrcMem, _, (outs _.RC:$dst),
-                  (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
-                  "${src2}"##_.BroadcastStr##", $src1",
-                  "$src1, ${src2}"##_.BroadcastStr,
-                  (_.i64VT (OpNodeMsk _.RC:$src1,
-                                   (bitconvert
-                                    (_.VT (X86VBroadcast
-                                            (_.ScalarLdFrag addr:$src2)))))),
-                  (_.VT (bitconvert (_.i64VT (OpNodeMsk _.RC:$src1,
-                                     (bitconvert
-                                      (_.VT (X86VBroadcast
-                                             (_.ScalarLdFrag addr:$src2))))))))>,
-                  AVX512BIBase, EVEX_4V, EVEX_B,
-                  Sched<[sched.Folded, sched.ReadAfterFold]>;
-}
-
-multiclass avx512_logic_rmb_vl<bits<8> opc, string OpcodeStr,
-                               SDPatternOperator OpNode,
-                               SDNode OpNodeMsk, X86SchedWriteWidths sched,
-                               AVX512VLVectorVTInfo VTInfo,
-                               bit IsCommutable = 0> {
-  let Predicates = [HasAVX512] in
-    defm Z : avx512_logic_rmb<opc, OpcodeStr, OpNode, OpNodeMsk, sched.ZMM,
-                              VTInfo.info512, IsCommutable>, EVEX_V512;
-
-  let Predicates = [HasAVX512, HasVLX] in {
-    defm Z256 : avx512_logic_rmb<opc, OpcodeStr, OpNode, OpNodeMsk, sched.YMM,
-                                 VTInfo.info256, IsCommutable>, EVEX_V256;
-    defm Z128 : avx512_logic_rmb<opc, OpcodeStr, OpNode, OpNodeMsk, sched.XMM,
-                                 VTInfo.info128, IsCommutable>, EVEX_V128;
-  }
-}
-
-multiclass avx512_logic_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr,
-                                 SDNode OpNode, X86SchedWriteWidths sched,
-                                 bit IsCommutable = 0> {
-  defm Q : avx512_logic_rmb_vl<opc_q, OpcodeStr#"q", OpNode, OpNode, sched,
-                               avx512vl_i64_info, IsCommutable>,
-                               VEX_W, EVEX_CD8<64, CD8VF>;
-  defm D : avx512_logic_rmb_vl<opc_d, OpcodeStr#"d", null_frag, OpNode, sched,
-                               avx512vl_i32_info, IsCommutable>,
-                               EVEX_CD8<32, CD8VF>;
-}
-
-defm VPAND : avx512_logic_rm_vl_dq<0xDB, 0xDB, "vpand", and,
-                                   SchedWriteVecLogic, 1>;
-defm VPOR : avx512_logic_rm_vl_dq<0xEB, 0xEB, "vpor", or,
-                                  SchedWriteVecLogic, 1>;
-defm VPXOR : avx512_logic_rm_vl_dq<0xEF, 0xEF, "vpxor", xor,
-                                   SchedWriteVecLogic, 1>;
-defm VPANDN : avx512_logic_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp,
-                                    SchedWriteVecLogic>;
+defm VPAND : avx512_binop_rm_vl_dq<0xDB, 0xDB, "vpand", and,
+                                   SchedWriteVecLogic, HasAVX512, 1>;
+defm VPOR : avx512_binop_rm_vl_dq<0xEB, 0xEB, "vpor", or,
+                                  SchedWriteVecLogic, HasAVX512, 1>;
+defm VPXOR : avx512_binop_rm_vl_dq<0xEF, 0xEF, "vpxor", xor,
+                                   SchedWriteVecLogic, HasAVX512, 1>;
+defm VPANDN : avx512_binop_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp,
+                                    SchedWriteVecLogic, HasAVX512>;
 
 let Predicates = [HasVLX] in {
   def : Pat<(v16i8 (and VR128X:$src1, VR128X:$src2)),
             (VPANDQZ128rr VR128X:$src1, VR128X:$src2)>;
   def : Pat<(v8i16 (and VR128X:$src1, VR128X:$src2)),
             (VPANDQZ128rr VR128X:$src1, VR128X:$src2)>;
-  def : Pat<(v4i32 (and VR128X:$src1, VR128X:$src2)),
-            (VPANDQZ128rr VR128X:$src1, VR128X:$src2)>;
 
   def : Pat<(v16i8 (or VR128X:$src1, VR128X:$src2)),
             (VPORQZ128rr VR128X:$src1, VR128X:$src2)>;
   def : Pat<(v8i16 (or VR128X:$src1, VR128X:$src2)),
             (VPORQZ128rr VR128X:$src1, VR128X:$src2)>;
-  def : Pat<(v4i32 (or VR128X:$src1, VR128X:$src2)),
-            (VPORQZ128rr VR128X:$src1, VR128X:$src2)>;
 
   def : Pat<(v16i8 (xor VR128X:$src1, VR128X:$src2)),
             (VPXORQZ128rr VR128X:$src1, VR128X:$src2)>;
   def : Pat<(v8i16 (xor VR128X:$src1, VR128X:$src2)),
             (VPXORQZ128rr VR128X:$src1, VR128X:$src2)>;
-  def : Pat<(v4i32 (xor VR128X:$src1, VR128X:$src2)),
-            (VPXORQZ128rr VR128X:$src1, VR128X:$src2)>;
 
   def : Pat<(v16i8 (X86andnp VR128X:$src1, VR128X:$src2)),
             (VPANDNQZ128rr VR128X:$src1, VR128X:$src2)>;
   def : Pat<(v8i16 (X86andnp VR128X:$src1, VR128X:$src2)),
             (VPANDNQZ128rr VR128X:$src1, VR128X:$src2)>;
-  def : Pat<(v4i32 (X86andnp VR128X:$src1, VR128X:$src2)),
-            (VPANDNQZ128rr VR128X:$src1, VR128X:$src2)>;
+
+  def : Pat<(and VR128X:$src1, (loadv16i8 addr:$src2)),
+            (VPANDQZ128rm VR128X:$src1, addr:$src2)>;
+  def : Pat<(and VR128X:$src1, (loadv8i16 addr:$src2)),
+            (VPANDQZ128rm VR128X:$src1, addr:$src2)>;
+
+  def : Pat<(or VR128X:$src1, (loadv16i8 addr:$src2)),
+            (VPORQZ128rm VR128X:$src1, addr:$src2)>;
+  def : Pat<(or VR128X:$src1, (loadv8i16 addr:$src2)),
+            (VPORQZ128rm VR128X:$src1, addr:$src2)>;
+
+  def : Pat<(xor VR128X:$src1, (loadv16i8 addr:$src2)),
+            (VPXORQZ128rm VR128X:$src1, addr:$src2)>;
+  def : Pat<(xor VR128X:$src1, (loadv8i16 addr:$src2)),
+            (VPXORQZ128rm VR128X:$src1, addr:$src2)>;
+
+  def : Pat<(X86andnp VR128X:$src1, (loadv16i8 addr:$src2)),
+            (VPANDNQZ128rm VR128X:$src1, addr:$src2)>;
+  def : Pat<(X86andnp VR128X:$src1, (loadv8i16 addr:$src2)),
+            (VPANDNQZ128rm VR128X:$src1, addr:$src2)>;
+
+  def : Pat<(and VR128X:$src1,
+                 (bc_v4i32 (v4f32 (X86VBroadcast (loadf32 addr:$src2))))),
+            (VPANDDZ128rmb VR128X:$src1, addr:$src2)>;
+  def : Pat<(or VR128X:$src1,
+                (bc_v4i32 (v4f32 (X86VBroadcast (loadf32 addr:$src2))))),
+            (VPORDZ128rmb VR128X:$src1, addr:$src2)>;
+  def : Pat<(xor VR128X:$src1,
+                 (bc_v4i32 (v4f32 (X86VBroadcast (loadf32 addr:$src2))))),
+            (VPXORDZ128rmb VR128X:$src1, addr:$src2)>;
+  def : Pat<(X86andnp VR128X:$src1,
+                      (bc_v4i32 (v4f32 (X86VBroadcast (loadf32 addr:$src2))))),
+            (VPANDNDZ128rmb VR128X:$src1, addr:$src2)>;
+
+  def : Pat<(and VR128X:$src1,
+                 (bc_v2i64 (v2f64 (X86VBroadcast (loadf64 addr:$src2))))),
+            (VPANDQZ128rmb VR128X:$src1, addr:$src2)>;
+  def : Pat<(or VR128X:$src1,
+                (bc_v2i64 (v2f64 (X86VBroadcast (loadf64 addr:$src2))))),
+            (VPORQZ128rmb VR128X:$src1, addr:$src2)>;
+  def : Pat<(xor VR128X:$src1,
+                 (bc_v2i64 (v2f64 (X86VBroadcast (loadf64 addr:$src2))))),
+            (VPXORQZ128rmb VR128X:$src1, addr:$src2)>;
+  def : Pat<(X86andnp VR128X:$src1,
+                      (bc_v2i64 (v2f64 (X86VBroadcast (loadf64 addr:$src2))))),
+            (VPANDNQZ128rmb VR128X:$src1, addr:$src2)>;
 
   def : Pat<(v32i8 (and VR256X:$src1, VR256X:$src2)),
             (VPANDQZ256rr VR256X:$src1, VR256X:$src2)>;
   def : Pat<(v16i16 (and VR256X:$src1, VR256X:$src2)),
             (VPANDQZ256rr VR256X:$src1, VR256X:$src2)>;
-  def : Pat<(v8i32 (and VR256X:$src1, VR256X:$src2)),
-            (VPANDQZ256rr VR256X:$src1, VR256X:$src2)>;
 
   def : Pat<(v32i8 (or VR256X:$src1, VR256X:$src2)),
             (VPORQZ256rr VR256X:$src1, VR256X:$src2)>;
   def : Pat<(v16i16 (or VR256X:$src1, VR256X:$src2)),
             (VPORQZ256rr VR256X:$src1, VR256X:$src2)>;
-  def : Pat<(v8i32 (or VR256X:$src1, VR256X:$src2)),
-            (VPORQZ256rr VR256X:$src1, VR256X:$src2)>;
 
   def : Pat<(v32i8 (xor VR256X:$src1, VR256X:$src2)),
             (VPXORQZ256rr VR256X:$src1, VR256X:$src2)>;
   def : Pat<(v16i16 (xor VR256X:$src1, VR256X:$src2)),
             (VPXORQZ256rr VR256X:$src1, VR256X:$src2)>;
-  def : Pat<(v8i32 (xor VR256X:$src1, VR256X:$src2)),
-            (VPXORQZ256rr VR256X:$src1, VR256X:$src2)>;
 
   def : Pat<(v32i8 (X86andnp VR256X:$src1, VR256X:$src2)),
             (VPANDNQZ256rr VR256X:$src1, VR256X:$src2)>;
   def : Pat<(v16i16 (X86andnp VR256X:$src1, VR256X:$src2)),
             (VPANDNQZ256rr VR256X:$src1, VR256X:$src2)>;
-  def : Pat<(v8i32 (X86andnp VR256X:$src1, VR256X:$src2)),
-            (VPANDNQZ256rr VR256X:$src1, VR256X:$src2)>;
+
+  def : Pat<(and VR256X:$src1, (loadv32i8 addr:$src2)),
+            (VPANDQZ256rm VR256X:$src1, addr:$src2)>;
+  def : Pat<(and VR256X:$src1, (loadv16i16 addr:$src2)),
+            (VPANDQZ256rm VR256X:$src1, addr:$src2)>;
+
+  def : Pat<(or VR256X:$src1, (loadv32i8 addr:$src2)),
+            (VPORQZ256rm VR256X:$src1, addr:$src2)>;
+  def : Pat<(or VR256X:$src1, (loadv16i16 addr:$src2)),
+            (VPORQZ256rm VR256X:$src1, addr:$src2)>;
+
+  def : Pat<(xor VR256X:$src1, (loadv32i8 addr:$src2)),
+            (VPXORQZ256rm VR256X:$src1, addr:$src2)>;
+  def : Pat<(xor VR256X:$src1, (loadv16i16 addr:$src2)),
+            (VPXORQZ256rm VR256X:$src1, addr:$src2)>;
+
+  def : Pat<(X86andnp VR256X:$src1, (loadv32i8 addr:$src2)),
+            (VPANDNQZ256rm VR256X:$src1, addr:$src2)>;
+  def : Pat<(X86andnp VR256X:$src1, (loadv16i16 addr:$src2)),
+            (VPANDNQZ256rm VR256X:$src1, addr:$src2)>;
+
+  def : Pat<(and VR256X:$src1,
+                 (bc_v8i32 (v8f32 (X86VBroadcast (loadf32 addr:$src2))))),
+            (VPANDDZ256rmb VR256X:$src1, addr:$src2)>;
+  def : Pat<(or VR256X:$src1,
+                (bc_v8i32 (v8f32 (X86VBroadcast (loadf32 addr:$src2))))),
+            (VPORDZ256rmb VR256X:$src1, addr:$src2)>;
+  def : Pat<(xor VR256X:$src1,
+                 (bc_v8i32 (v8f32 (X86VBroadcast (loadf32 addr:$src2))))),
+            (VPXORDZ256rmb VR256X:$src1, addr:$src2)>;
+  def : Pat<(X86andnp VR256X:$src1,
+                      (bc_v8i32 (v8f32 (X86VBroadcast (loadf32 addr:$src2))))),
+            (VPANDNDZ256rmb VR256X:$src1, addr:$src2)>;
+
+  def : Pat<(and VR256X:$src1,
+                 (bc_v4i64 (v4f64 (X86VBroadcast (loadf64 addr:$src2))))),
+            (VPANDQZ256rmb VR256X:$src1, addr:$src2)>;
+  def : Pat<(or VR256X:$src1,
+                (bc_v4i64 (v4f64 (X86VBroadcast (loadf64 addr:$src2))))),
+            (VPORQZ256rmb VR256X:$src1, addr:$src2)>;
+  def : Pat<(xor VR256X:$src1,
+                 (bc_v4i64 (v4f64 (X86VBroadcast (loadf64 addr:$src2))))),
+            (VPXORQZ256rmb VR256X:$src1, addr:$src2)>;
+  def : Pat<(X86andnp VR256X:$src1,
+                      (bc_v4i64 (v4f64 (X86VBroadcast (loadf64 addr:$src2))))),
+            (VPANDNQZ256rmb VR256X:$src1, addr:$src2)>;
 }
 
 let Predicates = [HasAVX512] in {
@@ -5247,31 +5231,209 @@ let Predicates = [HasAVX512] in {
             (VPANDQZrr VR512:$src1, VR512:$src2)>;
   def : Pat<(v32i16 (and VR512:$src1, VR512:$src2)),
             (VPANDQZrr VR512:$src1, VR512:$src2)>;
-  def : Pat<(v16i32 (and VR512:$src1, VR512:$src2)),
-            (VPANDQZrr VR512:$src1, VR512:$src2)>;
 
   def : Pat<(v64i8 (or VR512:$src1, VR512:$src2)),
             (VPORQZrr VR512:$src1, VR512:$src2)>;
   def : Pat<(v32i16 (or VR512:$src1, VR512:$src2)),
             (VPORQZrr VR512:$src1, VR512:$src2)>;
-  def : Pat<(v16i32 (or VR512:$src1, VR512:$src2)),
-            (VPORQZrr VR512:$src1, VR512:$src2)>;
 
   def : Pat<(v64i8 (xor VR512:$src1, VR512:$src2)),
             (VPXORQZrr VR512:$src1, VR512:$src2)>;
   def : Pat<(v32i16 (xor VR512:$src1, VR512:$src2)),
             (VPXORQZrr VR512:$src1, VR512:$src2)>;
-  def : Pat<(v16i32 (xor VR512:$src1, VR512:$src2)),
-            (VPXORQZrr VR512:$src1, VR512:$src2)>;
 
   def : Pat<(v64i8 (X86andnp VR512:$src1, VR512:$src2)),
             (VPANDNQZrr VR512:$src1, VR512:$src2)>;
   def : Pat<(v32i16 (X86andnp VR512:$src1, VR512:$src2)),
             (VPANDNQZrr VR512:$src1, VR512:$src2)>;
-  def : Pat<(v16i32 (X86andnp VR512:$src1, VR512:$src2)),
-            (VPANDNQZrr VR512:$src1, VR512:$src2)>;
+
+  def : Pat<(and VR512:$src1, (loadv64i8 addr:$src2)),
+            (VPANDQZrm VR512:$src1, addr:$src2)>;
+  def : Pat<(and VR512:$src1, (loadv32i16 addr:$src2)),
+            (VPANDQZrm VR512:$src1, addr:$src2)>;
+
+  def : Pat<(or VR512:$src1, (loadv64i8 addr:$src2)),
+            (VPORQZrm VR512:$src1, addr:$src2)>;
+  def : Pat<(or VR512:$src1, (loadv32i16 addr:$src2)),
+            (VPORQZrm VR512:$src1, addr:$src2)>;
+
+  def : Pat<(xor VR512:$src1, (loadv64i8 addr:$src2)),
+            (VPXORQZrm VR512:$src1, addr:$src2)>;
+  def : Pat<(xor VR512:$src1, (loadv32i16 addr:$src2)),
+            (VPXORQZrm VR512:$src1, addr:$src2)>;
+
+  def : Pat<(X86andnp VR512:$src1, (loadv64i8 addr:$src2)),
+            (VPANDNQZrm VR512:$src1, addr:$src2)>;
+  def : Pat<(X86andnp VR512:$src1, (loadv32i16 addr:$src2)),
+            (VPANDNQZrm VR512:$src1, addr:$src2)>;
+
+  def : Pat<(and VR512:$src1,
+                 (bc_v16i32 (v16f32 (X86VBroadcast (loadf32 addr:$src2))))),
+            (VPANDDZrmb VR512:$src1, addr:$src2)>;
+  def : Pat<(or VR512:$src1,
+                (bc_v16i32 (v16f32 (X86VBroadcast (loadf32 addr:$src2))))),
+            (VPORDZrmb VR512:$src1, addr:$src2)>;
+  def : Pat<(xor VR512:$src1,
+                 (bc_v16i32 (v16f32 (X86VBroadcast (loadf32 addr:$src2))))),
+            (VPXORDZrmb VR512:$src1, addr:$src2)>;
+  def : Pat<(X86andnp VR512:$src1,
+                      (bc_v16i32 (v16f32 (X86VBroadcast (loadf32 addr:$src2))))),
+            (VPANDNDZrmb VR512:$src1, addr:$src2)>;
+
+  def : Pat<(and VR512:$src1,
+                 (bc_v8i64 (v8f64 (X86VBroadcast (loadf64 addr:$src2))))),
+            (VPANDQZrmb VR512:$src1, addr:$src2)>;
+  def : Pat<(or VR512:$src1,
+                (bc_v8i64 (v8f64 (X86VBroadcast (loadf64 addr:$src2))))),
+            (VPORQZrmb VR512:$src1, addr:$src2)>;
+  def : Pat<(xor VR512:$src1,
+                 (bc_v8i64 (v8f64 (X86VBroadcast (loadf64 addr:$src2))))),
+            (VPXORQZrmb VR512:$src1, addr:$src2)>;
+  def : Pat<(X86andnp VR512:$src1,
+                      (bc_v8i64 (v8f64 (X86VBroadcast (loadf64 addr:$src2))))),
+            (VPANDNQZrmb VR512:$src1, addr:$src2)>;
+}
+
+// Patterns to catch vselect with different type than logic op.
+multiclass avx512_logical_lowering<string InstrStr, SDNode OpNode,
+                                    X86VectorVTInfo _,
+                                    X86VectorVTInfo IntInfo> {
+  // Masked register-register logical operations.
+  def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                   (bitconvert (IntInfo.VT (OpNode _.RC:$src1, _.RC:$src2))),
+                   _.RC:$src0)),
+            (!cast<Instruction>(InstrStr#rrk) _.RC:$src0, _.KRCWM:$mask,
+             _.RC:$src1, _.RC:$src2)>;
+
+  def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                   (bitconvert (IntInfo.VT (OpNode _.RC:$src1, _.RC:$src2))),
+                   _.ImmAllZerosV)),
+            (!cast<Instruction>(InstrStr#rrkz) _.KRCWM:$mask, _.RC:$src1,
+             _.RC:$src2)>;
+
+  // Masked register-memory logical operations.
+  def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                   (bitconvert (IntInfo.VT (OpNode _.RC:$src1,
+                                            (load addr:$src2)))),
+                   _.RC:$src0)),
+            (!cast<Instruction>(InstrStr#rmk) _.RC:$src0, _.KRCWM:$mask,
+             _.RC:$src1, addr:$src2)>;
+  def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                   (bitconvert (IntInfo.VT (OpNode _.RC:$src1,
+                                            (load addr:$src2)))),
+                   _.ImmAllZerosV)),
+            (!cast<Instruction>(InstrStr#rmkz) _.KRCWM:$mask, _.RC:$src1,
+             addr:$src2)>;
 }
 
+multiclass avx512_logical_lowering_bcast<string InstrStr, SDNode OpNode,
+                                         X86VectorVTInfo _,
+                                         X86VectorVTInfo IntInfo> {
+  // Register-broadcast logical operations.
+  def : Pat<(IntInfo.VT (OpNode _.RC:$src1,
+                         (bitconvert (_.VT (X86VBroadcast
+                                            (_.ScalarLdFrag addr:$src2)))))),
+            (!cast<Instruction>(InstrStr#rmb) _.RC:$src1, addr:$src2)>;
+  def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                   (bitconvert
+                    (IntInfo.VT (OpNode _.RC:$src1,
+                                 (bitconvert (_.VT
+                                              (X86VBroadcast
+                                               (_.ScalarLdFrag addr:$src2))))))),
+                   _.RC:$src0)),
+            (!cast<Instruction>(InstrStr#rmbk) _.RC:$src0, _.KRCWM:$mask,
+             _.RC:$src1, addr:$src2)>;
+  def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                   (bitconvert
+                    (IntInfo.VT (OpNode _.RC:$src1,
+                                 (bitconvert (_.VT
+                                              (X86VBroadcast
+                                               (_.ScalarLdFrag addr:$src2))))))),
+                   _.ImmAllZerosV)),
+            (!cast<Instruction>(InstrStr#rmbkz)  _.KRCWM:$mask,
+             _.RC:$src1, addr:$src2)>;
+}
+
+multiclass avx512_logical_lowering_sizes<string InstrStr, SDNode OpNode,
+                                         AVX512VLVectorVTInfo SelectInfo,
+                                         AVX512VLVectorVTInfo IntInfo> {
+let Predicates = [HasVLX] in {
+  defm : avx512_logical_lowering<InstrStr#"Z128", OpNode, SelectInfo.info128,
+                                 IntInfo.info128>;
+  defm : avx512_logical_lowering<InstrStr#"Z256", OpNode, SelectInfo.info256,
+                                 IntInfo.info256>;
+}
+let Predicates = [HasAVX512] in {
+  defm : avx512_logical_lowering<InstrStr#"Z", OpNode, SelectInfo.info512,
+                                 IntInfo.info512>;
+}
+}
+
+multiclass avx512_logical_lowering_sizes_bcast<string InstrStr, SDNode OpNode,
+                                               AVX512VLVectorVTInfo SelectInfo,
+                                               AVX512VLVectorVTInfo IntInfo> {
+let Predicates = [HasVLX] in {
+  defm : avx512_logical_lowering_bcast<InstrStr#"Z128", OpNode,
+                                       SelectInfo.info128, IntInfo.info128>;
+  defm : avx512_logical_lowering_bcast<InstrStr#"Z256", OpNode,
+                                       SelectInfo.info256, IntInfo.info256>;
+}
+let Predicates = [HasAVX512] in {
+  defm : avx512_logical_lowering_bcast<InstrStr#"Z", OpNode,
+                                       SelectInfo.info512, IntInfo.info512>;
+}
+}
+
+multiclass avx512_logical_lowering_types<string InstrStr, SDNode OpNode> {
+  // i64 vselect with i32/i16/i8 logic op
+  defm : avx512_logical_lowering_sizes<InstrStr#"Q", OpNode, avx512vl_i64_info,
+                                       avx512vl_i32_info>;
+  defm : avx512_logical_lowering_sizes<InstrStr#"Q", OpNode, avx512vl_i64_info,
+                                       avx512vl_i16_info>;
+  defm : avx512_logical_lowering_sizes<InstrStr#"Q", OpNode, avx512vl_i64_info,
+                                       avx512vl_i8_info>;
+
+  // i32 vselect with i64/i16/i8 logic op
+  defm : avx512_logical_lowering_sizes<InstrStr#"D", OpNode, avx512vl_i32_info,
+                                       avx512vl_i64_info>;
+  defm : avx512_logical_lowering_sizes<InstrStr#"D", OpNode, avx512vl_i32_info,
+                                       avx512vl_i16_info>;
+  defm : avx512_logical_lowering_sizes<InstrStr#"D", OpNode, avx512vl_i32_info,
+                                       avx512vl_i8_info>;
+
+  // f32 vselect with i64/i32/i16/i8 logic op
+  defm : avx512_logical_lowering_sizes<InstrStr#"D", OpNode, avx512vl_f32_info,
+                                       avx512vl_i64_info>;
+  defm : avx512_logical_lowering_sizes<InstrStr#"D", OpNode, avx512vl_f32_info,
+                                       avx512vl_i32_info>;
+  defm : avx512_logical_lowering_sizes<InstrStr#"D", OpNode, avx512vl_f32_info,
+                                       avx512vl_i16_info>;
+  defm : avx512_logical_lowering_sizes<InstrStr#"D", OpNode, avx512vl_f32_info,
+                                       avx512vl_i8_info>;
+
+  // f64 vselect with i64/i32/i16/i8 logic op
+  defm : avx512_logical_lowering_sizes<InstrStr#"Q", OpNode, avx512vl_f64_info,
+                                       avx512vl_i64_info>;
+  defm : avx512_logical_lowering_sizes<InstrStr#"Q", OpNode, avx512vl_f64_info,
+                                       avx512vl_i32_info>;
+  defm : avx512_logical_lowering_sizes<InstrStr#"Q", OpNode, avx512vl_f64_info,
+                                       avx512vl_i16_info>;
+  defm : avx512_logical_lowering_sizes<InstrStr#"Q", OpNode, avx512vl_f64_info,
+                                       avx512vl_i8_info>;
+
+  defm : avx512_logical_lowering_sizes_bcast<InstrStr#"D", OpNode,
+                                             avx512vl_f32_info,
+                                             avx512vl_i32_info>;
+  defm : avx512_logical_lowering_sizes_bcast<InstrStr#"Q", OpNode,
+                                             avx512vl_f64_info,
+                                             avx512vl_i64_info>;
+}
+
+defm : avx512_logical_lowering_types<"VPAND", and>;
+defm : avx512_logical_lowering_types<"VPOR",  or>;
+defm : avx512_logical_lowering_types<"VPXOR", xor>;
+defm : avx512_logical_lowering_types<"VPANDN", X86andnp>;
+
 //===----------------------------------------------------------------------===//
 // AVX-512  FP arithmetic
 //===----------------------------------------------------------------------===//
@@ -5575,73 +5737,6 @@ defm VOR   : avx512_fp_binop_p<0x56, "vor", null_frag, HasDQI,
 defm VXOR  : avx512_fp_binop_p<0x57, "vxor", null_frag, HasDQI,
                                SchedWriteFLogicSizes, 1>;
 
-// Patterns catch floating point selects with bitcasted integer logic ops.
-multiclass avx512_fp_logical_lowering<string InstrStr, SDNode OpNode,
-                                      X86VectorVTInfo _, Predicate prd> {
-let Predicates = [prd] in {
-  // Masked register-register logical operations.
-  def : Pat<(_.VT (vselect _.KRCWM:$mask,
-                   (bitconvert (_.i64VT (OpNode _.RC:$src1, _.RC:$src2))),
-                   _.RC:$src0)),
-            (!cast<Instruction>(InstrStr#rrk) _.RC:$src0, _.KRCWM:$mask,
-             _.RC:$src1, _.RC:$src2)>;
-  def : Pat<(_.VT (vselect _.KRCWM:$mask,
-                   (bitconvert (_.i64VT (OpNode _.RC:$src1, _.RC:$src2))),
-                   _.ImmAllZerosV)),
-            (!cast<Instruction>(InstrStr#rrkz) _.KRCWM:$mask, _.RC:$src1,
-             _.RC:$src2)>;
-  // Masked register-memory logical operations.
-  def : Pat<(_.VT (vselect _.KRCWM:$mask,
-                   (bitconvert (_.i64VT (OpNode _.RC:$src1,
-                                         (load addr:$src2)))),
-                   _.RC:$src0)),
-            (!cast<Instruction>(InstrStr#rmk) _.RC:$src0, _.KRCWM:$mask,
-             _.RC:$src1, addr:$src2)>;
-  def : Pat<(_.VT (vselect _.KRCWM:$mask,
-                   (bitconvert (_.i64VT (OpNode _.RC:$src1, (load addr:$src2)))),
-                   _.ImmAllZerosV)),
-            (!cast<Instruction>(InstrStr#rmkz) _.KRCWM:$mask, _.RC:$src1,
-             addr:$src2)>;
-  // Register-broadcast logical operations.
-  def : Pat<(_.i64VT (OpNode _.RC:$src1,
-                      (bitconvert (_.VT (X86VBroadcast
-                                         (_.ScalarLdFrag addr:$src2)))))),
-            (!cast<Instruction>(InstrStr#rmb) _.RC:$src1, addr:$src2)>;
-  def : Pat<(_.VT (vselect _.KRCWM:$mask,
-                   (bitconvert
-                    (_.i64VT (OpNode _.RC:$src1,
-                              (bitconvert (_.VT
-                                           (X86VBroadcast
-                                            (_.ScalarLdFrag addr:$src2))))))),
-                   _.RC:$src0)),
-            (!cast<Instruction>(InstrStr#rmbk) _.RC:$src0, _.KRCWM:$mask,
-             _.RC:$src1, addr:$src2)>;
-  def : Pat<(_.VT (vselect _.KRCWM:$mask,
-                   (bitconvert
-                    (_.i64VT (OpNode _.RC:$src1,
-                              (bitconvert (_.VT
-                                           (X86VBroadcast
-                                            (_.ScalarLdFrag addr:$src2))))))),
-                   _.ImmAllZerosV)),
-            (!cast<Instruction>(InstrStr#rmbkz)  _.KRCWM:$mask,
-             _.RC:$src1, addr:$src2)>;
-}
-}
-
-multiclass avx512_fp_logical_lowering_sizes<string InstrStr, SDNode OpNode> {
-  defm : avx512_fp_logical_lowering<InstrStr#DZ128, OpNode, v4f32x_info, HasVLX>;
-  defm : avx512_fp_logical_lowering<InstrStr#QZ128, OpNode, v2f64x_info, HasVLX>;
-  defm : avx512_fp_logical_lowering<InstrStr#DZ256, OpNode, v8f32x_info, HasVLX>;
-  defm : avx512_fp_logical_lowering<InstrStr#QZ256, OpNode, v4f64x_info, HasVLX>;
-  defm : avx512_fp_logical_lowering<InstrStr#DZ, OpNode, v16f32_info, HasAVX512>;
-  defm : avx512_fp_logical_lowering<InstrStr#QZ, OpNode, v8f64_info, HasAVX512>;
-}
-
-defm : avx512_fp_logical_lowering_sizes<"VPAND", and>;
-defm : avx512_fp_logical_lowering_sizes<"VPOR", or>;
-defm : avx512_fp_logical_lowering_sizes<"VPXOR", xor>;
-defm : avx512_fp_logical_lowering_sizes<"VPANDN", X86andnp>;
-
 let Predicates = [HasVLX,HasDQI] in {
   // Use packed logical operations for scalar ops.
   def : Pat<(f64 (X86fand FR64X:$src1, FR64X:$src2)),
@@ -5771,15 +5866,12 @@ multiclass avx512_vptest<bits<8> opc, string OpcodeStr, PatFrag OpNode,
   defm rr : AVX512_maskable_cmp<opc, MRMSrcReg, _, (outs _.KRC:$dst),
                    (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
                       "$src2, $src1", "$src1, $src2",
-                   (OpNode (bitconvert (_.i64VT (and _.RC:$src1, _.RC:$src2))),
-                           _.ImmAllZerosV)>,
+                   (OpNode (and _.RC:$src1, _.RC:$src2), _.ImmAllZerosV)>,
                    EVEX_4V, Sched<[sched]>;
   defm rm : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst),
                    (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
                        "$src2, $src1", "$src1, $src2",
-                   (OpNode (bitconvert
-                            (_.i64VT (and _.RC:$src1,
-                                          (_.i64LdFrag addr:$src2)))),
+                   (OpNode (and _.RC:$src1, (_.LdFrag addr:$src2)),
                            _.ImmAllZerosV)>,
                    EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
                    Sched<[sched.Folded, sched.ReadAfterFold]>;
@@ -5813,7 +5905,7 @@ multiclass avx512_vptest_mb<bits<8> opc, string OpcodeStr, PatFrag OpNode,
 // Use 512bit version to implement 128/256 bit in case NoVLX.
 multiclass avx512_vptest_lowering<PatFrag OpNode, X86VectorVTInfo ExtendInfo,
                                   X86VectorVTInfo _, string Name> {
-  def : Pat<(_.KVT (OpNode (bitconvert (_.i64VT (and _.RC:$src1, _.RC:$src2))),
+  def : Pat<(_.KVT (OpNode (and _.RC:$src1, _.RC:$src2),
                            _.ImmAllZerosV)),
             (_.KVT (COPY_TO_REGCLASS
                      (!cast<Instruction>(Name # "Zrr")
@@ -5824,7 +5916,7 @@ multiclass avx512_vptest_lowering<PatFrag OpNode, X86VectorVTInfo ExtendInfo,
                    _.KRC))>;
 
   def : Pat<(_.KVT (and _.KRC:$mask,
-                        (OpNode (bitconvert (_.i64VT (and _.RC:$src1, _.RC:$src2))),
+                        (OpNode (and _.RC:$src1, _.RC:$src2),
                                 _.ImmAllZerosV))),
             (COPY_TO_REGCLASS
              (!cast<Instruction>(Name # "Zrrk")
@@ -5927,6 +6019,125 @@ defm VPTESTM   : avx512_vptest_all_forms<0x26, 0x27, "vptestm", X86pcmpnem,
 defm VPTESTNM  : avx512_vptest_all_forms<0x26, 0x27, "vptestnm", X86pcmpeqm,
                                          SchedWriteVecLogic>, T8XS;
 
+
+multiclass avx512_vptest_lowering_pats<string InstrStr, PatFrag OpNode,
+                                       X86VectorVTInfo _,
+                                       X86VectorVTInfo AndInfo> {
+  def : Pat<(_.KVT (OpNode (bitconvert
+                            (AndInfo.VT (and _.RC:$src1, _.RC:$src2))),
+                           _.ImmAllZerosV)),
+            (!cast<Instruction>(InstrStr # "rr") _.RC:$src1, _.RC:$src2)>;
+
+  def : Pat<(_.KVT (and _.KRC:$mask,
+                    (OpNode (bitconvert
+                             (AndInfo.VT (and _.RC:$src1, _.RC:$src2))),
+                            _.ImmAllZerosV))),
+            (!cast<Instruction>(InstrStr # "rrk") _.KRC:$mask, _.RC:$src1,
+                                                  _.RC:$src2)>;
+
+  def : Pat<(_.KVT (OpNode (bitconvert
+                            (AndInfo.VT (and _.RC:$src1,
+                                             (AndInfo.LdFrag addr:$src2)))),
+                           _.ImmAllZerosV)),
+            (!cast<Instruction>(InstrStr # "rm") _.RC:$src1, addr:$src2)>;
+
+  def : Pat<(_.KVT (and _.KRC:$mask,
+                    (OpNode (bitconvert
+                             (AndInfo.VT (and _.RC:$src1,
+                                              (AndInfo.LdFrag addr:$src2)))),
+                            _.ImmAllZerosV))),
+            (!cast<Instruction>(InstrStr # "rmk") _.KRC:$mask, _.RC:$src1,
+                                                  addr:$src2)>;
+}
+
+// Patterns to use 512-bit instructions when 128/256 are not available.
+multiclass avx512_vptest_lowering_wide_pats<string InstrStr, PatFrag OpNode,
+                                            X86VectorVTInfo _,
+                                            X86VectorVTInfo AndInfo,
+                                            X86VectorVTInfo ExtendInfo> {
+  def : Pat<(_.KVT (OpNode (bitconvert
+                            (AndInfo.VT (and _.RC:$src1, _.RC:$src2))),
+                           _.ImmAllZerosV)),
+            (_.KVT (COPY_TO_REGCLASS
+                     (!cast<Instruction>(InstrStr#"rr")
+                       (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
+                                      _.RC:$src1, _.SubRegIdx),
+                       (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
+                                      _.RC:$src2, _.SubRegIdx)),
+                   _.KRC))>;
+
+  def : Pat<(_.KVT (and _.KRC:$mask,
+                    (OpNode (bitconvert
+                             (AndInfo.VT (and _.RC:$src1, _.RC:$src2))),
+                            _.ImmAllZerosV))),
+            (COPY_TO_REGCLASS
+             (!cast<Instruction>(InstrStr#"rrk")
+              (COPY_TO_REGCLASS _.KRC:$mask, ExtendInfo.KRC),
+              (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
+                             _.RC:$src1, _.SubRegIdx),
+              (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
+                             _.RC:$src2, _.SubRegIdx)),
+             _.KRC)>;
+}
+
+multiclass avx512_vptest_lowering_sizes<string InstrStr, PatFrag OpNode,
+                                        Predicate prd,
+                                        AVX512VLVectorVTInfo CmpInfo,
+                                        AVX512VLVectorVTInfo AndInfo> {
+let Predicates = [prd, HasVLX] in {
+  defm : avx512_vptest_lowering_pats<InstrStr#"Z128", OpNode,
+                                     CmpInfo.info128, AndInfo.info128>;
+  defm : avx512_vptest_lowering_pats<InstrStr#"Z256", OpNode,
+                                     CmpInfo.info256, AndInfo.info256>;
+}
+let Predicates = [prd] in {
+  defm : avx512_vptest_lowering_pats<InstrStr#"Z", OpNode,
+                                     CmpInfo.info512, AndInfo.info512>;
+}
+
+let Predicates = [prd, NoVLX] in {
+  defm : avx512_vptest_lowering_wide_pats<InstrStr#"Z", OpNode,
+                                          CmpInfo.info128, AndInfo.info128,
+                                          CmpInfo.info512>;
+  defm : avx512_vptest_lowering_wide_pats<InstrStr#"Z", OpNode,
+                                          CmpInfo.info256, AndInfo.info256,
+                                          CmpInfo.info512>;
+}
+}
+
+multiclass avx512_vptest_lowering_types<string InstrStr, PatFrag OpNode> {
+  defm : avx512_vptest_lowering_sizes<InstrStr # "B", OpNode, HasBWI,
+                                      avx512vl_i8_info, avx512vl_i16_info>;
+  defm : avx512_vptest_lowering_sizes<InstrStr # "B", OpNode, HasBWI,
+                                      avx512vl_i8_info, avx512vl_i32_info>;
+  defm : avx512_vptest_lowering_sizes<InstrStr # "B", OpNode, HasBWI,
+                                      avx512vl_i8_info, avx512vl_i64_info>;
+
+  defm : avx512_vptest_lowering_sizes<InstrStr # "W", OpNode, HasBWI,
+                                      avx512vl_i16_info, avx512vl_i8_info>;
+  defm : avx512_vptest_lowering_sizes<InstrStr # "W", OpNode, HasBWI,
+                                      avx512vl_i16_info, avx512vl_i32_info>;
+  defm : avx512_vptest_lowering_sizes<InstrStr # "W", OpNode, HasBWI,
+                                      avx512vl_i16_info, avx512vl_i64_info>;
+
+  defm : avx512_vptest_lowering_sizes<InstrStr # "D", OpNode, HasAVX512,
+                                      avx512vl_i32_info, avx512vl_i8_info>;
+  defm : avx512_vptest_lowering_sizes<InstrStr # "D", OpNode, HasAVX512,
+                                      avx512vl_i32_info, avx512vl_i16_info>;
+  defm : avx512_vptest_lowering_sizes<InstrStr # "D", OpNode, HasAVX512,
+                                      avx512vl_i32_info, avx512vl_i64_info>;
+
+  defm : avx512_vptest_lowering_sizes<InstrStr # "Q", OpNode, HasAVX512,
+                                      avx512vl_i64_info, avx512vl_i8_info>;
+  defm : avx512_vptest_lowering_sizes<InstrStr # "Q", OpNode, HasAVX512,
+                                      avx512vl_i64_info, avx512vl_i16_info>;
+  defm : avx512_vptest_lowering_sizes<InstrStr # "Q", OpNode, HasAVX512,
+                                      avx512vl_i64_info, avx512vl_i32_info>;
+}
+
+defm : avx512_vptest_lowering_types<"VPTESTM", X86pcmpnem>;
+defm : avx512_vptest_lowering_types<"VPTESTNM", X86pcmpeqm>;
+
 //===----------------------------------------------------------------------===//
 // AVX-512  Shift instructions
 //===----------------------------------------------------------------------===//
@@ -11443,19 +11654,68 @@ defm VPTERNLOGQ : avx512_common_ternlog<"vpternlogq", SchedWriteVecALU,
 // TODO: We should maybe have a more generalized algorithm for folding to
 // vpternlog.
 let Predicates = [HasAVX512] in {
-  def : Pat<(v8i64 (xor VR512:$src, (bc_v8i64 (v16i32 immAllOnesV)))),
+  def : Pat<(xor VR512:$src, (bc_v64i8 (v16i32 immAllOnesV))),
+            (VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>;
+  def : Pat<(xor VR512:$src, (bc_v32i16 (v16i32 immAllOnesV))),
+            (VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>;
+  def : Pat<(xor VR512:$src, (bc_v16i32 (v16i32 immAllOnesV))),
+            (VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>;
+  def : Pat<(xor VR512:$src, (bc_v8i64 (v16i32 immAllOnesV))),
             (VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>;
 }
 
 let Predicates = [HasAVX512, NoVLX] in {
-  def : Pat<(v2i64 (xor VR128X:$src, (bc_v2i64 (v4i32 immAllOnesV)))),
+  def : Pat<(xor VR128X:$src, (bc_v16i8 (v4i32 immAllOnesV))),
+            (EXTRACT_SUBREG
+             (VPTERNLOGQZrri
+              (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+              (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+              (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+              (i8 15)), sub_xmm)>;
+  def : Pat<(xor VR128X:$src, (bc_v8i16 (v4i32 immAllOnesV))),
+            (EXTRACT_SUBREG
+             (VPTERNLOGQZrri
+              (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+              (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+              (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+              (i8 15)), sub_xmm)>;
+  def : Pat<(xor VR128X:$src, (bc_v4i32 (v4i32 immAllOnesV))),
+            (EXTRACT_SUBREG
+             (VPTERNLOGQZrri
+              (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+              (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+              (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+              (i8 15)), sub_xmm)>;
+  def : Pat<(xor VR128X:$src, (bc_v2i64 (v4i32 immAllOnesV))),
             (EXTRACT_SUBREG
              (VPTERNLOGQZrri
               (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
               (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
               (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
               (i8 15)), sub_xmm)>;
-  def : Pat<(v4i64 (xor VR256X:$src, (bc_v4i64 (v8i32 immAllOnesV)))),
+
+  def : Pat<(xor VR256X:$src, (bc_v32i8 (v8i32 immAllOnesV))),
+            (EXTRACT_SUBREG
+             (VPTERNLOGQZrri
+              (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+              (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+              (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+              (i8 15)), sub_ymm)>;
+  def : Pat<(xor VR256X:$src, (bc_v16i16 (v8i32 immAllOnesV))),
+            (EXTRACT_SUBREG
+             (VPTERNLOGQZrri
+              (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+              (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+              (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+              (i8 15)), sub_ymm)>;
+  def : Pat<(xor VR256X:$src, (bc_v8i32 (v8i32 immAllOnesV))),
+            (EXTRACT_SUBREG
+             (VPTERNLOGQZrri
+              (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+              (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+              (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+              (i8 15)), sub_ymm)>;
+  def : Pat<(xor VR256X:$src, (bc_v4i64 (v8i32 immAllOnesV))),
             (EXTRACT_SUBREG
              (VPTERNLOGQZrri
               (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
@@ -11465,9 +11725,22 @@ let Predicates = [HasAVX512, NoVLX] in {
 }
 
 let Predicates = [HasVLX] in {
-  def : Pat<(v2i64 (xor VR128X:$src, (bc_v2i64 (v4i32 immAllOnesV)))),
+  def : Pat<(xor VR128X:$src, (bc_v16i8 (v4i32 immAllOnesV))),
+            (VPTERNLOGQZ128rri VR128X:$src, VR128X:$src, VR128X:$src, (i8 15))>;
+  def : Pat<(xor VR128X:$src, (bc_v8i16 (v4i32 immAllOnesV))),
             (VPTERNLOGQZ128rri VR128X:$src, VR128X:$src, VR128X:$src, (i8 15))>;
-  def : Pat<(v4i64 (xor VR256X:$src, (bc_v4i64 (v8i32 immAllOnesV)))),
+  def : Pat<(xor VR128X:$src, (bc_v4i32 (v4i32 immAllOnesV))),
+            (VPTERNLOGQZ128rri VR128X:$src, VR128X:$src, VR128X:$src, (i8 15))>;
+  def : Pat<(xor VR128X:$src, (bc_v2i64 (v4i32 immAllOnesV))),
+            (VPTERNLOGQZ128rri VR128X:$src, VR128X:$src, VR128X:$src, (i8 15))>;
+
+  def : Pat<(xor VR256X:$src, (bc_v32i8 (v8i32 immAllOnesV))),
+            (VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>;
+  def : Pat<(xor VR256X:$src, (bc_v16i16 (v8i32 immAllOnesV))),
+            (VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>;
+  def : Pat<(xor VR256X:$src, (bc_v8i32 (v8i32 immAllOnesV))),
+            (VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>;
+  def : Pat<(xor VR256X:$src, (bc_v4i64 (v8i32 immAllOnesV))),
             (VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>;
 }
 
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 7e31527a877..7bc8d0aa530 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -855,6 +855,7 @@ def bc_v4f64 : PatFrag<(ops node:$in), (v4f64 (bitconvert node:$in))>;
 
 // 512-bit bitconvert pattern fragments
 def bc_v64i8 : PatFrag<(ops node:$in), (v64i8 (bitconvert node:$in))>;
+def bc_v32i16 : PatFrag<(ops node:$in), (v32i16 (bitconvert node:$in))>;
 def bc_v16i32 : PatFrag<(ops node:$in), (v16i32 (bitconvert node:$in))>;
 def bc_v8i64 : PatFrag<(ops node:$in), (v8i64 (bitconvert node:$in))>;
 def bc_v8f64 : PatFrag<(ops node:$in), (v8f64 (bitconvert node:$in))>;
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 8f97ce37068..85e4fd38563 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -2417,6 +2417,34 @@ let Predicates = [HasAVX2, NoVLX] in {
             (VPANDNYrr VR256:$src1, VR256:$src2)>;
   def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
             (VPANDNYrr VR256:$src1, VR256:$src2)>;
+
+  def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
+            (VPANDYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
+            (VPANDYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
+            (VPANDYrm VR256:$src1, addr:$src2)>;
+
+  def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
+            (VPORYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
+            (VPORYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
+            (VPORYrm VR256:$src1, addr:$src2)>;
+
+  def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
+            (VPXORYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
+            (VPXORYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
+            (VPXORYrm VR256:$src1, addr:$src2)>;
+
+  def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
+            (VPANDNYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
+            (VPANDNYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
+            (VPANDNYrm VR256:$src1, addr:$src2)>;
 }
 
 // If only AVX1 is supported, we need to handle integer operations with
@@ -2458,12 +2486,39 @@ let Predicates = [HasAVX1Only] in {
   def : Pat<(v4i64 (X86andnp VR256:$src1, VR256:$src2)),
             (VANDNPSYrr VR256:$src1, VR256:$src2)>;
 
+  def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
+            (VANDPSYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
+            (VANDPSYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
+            (VANDPSYrm VR256:$src1, addr:$src2)>;
   def : Pat<(and VR256:$src1, (loadv4i64 addr:$src2)),
             (VANDPSYrm VR256:$src1, addr:$src2)>;
+
+  def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
+            (VORPSYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
+            (VORPSYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
+            (VORPSYrm VR256:$src1, addr:$src2)>;
   def : Pat<(or VR256:$src1, (loadv4i64 addr:$src2)),
             (VORPSYrm VR256:$src1, addr:$src2)>;
+
+  def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
+            (VXORPSYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
+            (VXORPSYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
+            (VXORPSYrm VR256:$src1, addr:$src2)>;
   def : Pat<(xor VR256:$src1, (loadv4i64 addr:$src2)),
             (VXORPSYrm VR256:$src1, addr:$src2)>;
+
+  def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
+            (VANDNPSYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
+            (VANDNPSYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
+            (VANDNPSYrm VR256:$src1, addr:$src2)>;
   def : Pat<(X86andnp VR256:$src1, (loadv4i64 addr:$src2)),
             (VANDNPSYrm VR256:$src1, addr:$src2)>;
 }
@@ -2589,6 +2644,34 @@ let Predicates = [HasAVX, NoVLX] in {
             (VPANDNrr VR128:$src1, VR128:$src2)>;
   def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
             (VPANDNrr VR128:$src1, VR128:$src2)>;
+
+  def : Pat<(and VR128:$src1, (loadv16i8 addr:$src2)),
+            (VPANDrm VR128:$src1, addr:$src2)>;
+  def : Pat<(and VR128:$src1, (loadv8i16 addr:$src2)),
+            (VPANDrm VR128:$src1, addr:$src2)>;
+  def : Pat<(and VR128:$src1, (loadv4i32 addr:$src2)),
+            (VPANDrm VR128:$src1, addr:$src2)>;
+
+  def : Pat<(or VR128:$src1, (loadv16i8 addr:$src2)),
+            (VPORrm VR128:$src1, addr:$src2)>;
+  def : Pat<(or VR128:$src1, (loadv8i16 addr:$src2)),
+            (VPORrm VR128:$src1, addr:$src2)>;
+  def : Pat<(or VR128:$src1, (loadv4i32 addr:$src2)),
+            (VPORrm VR128:$src1, addr:$src2)>;
+
+  def : Pat<(xor VR128:$src1, (loadv16i8 addr:$src2)),
+            (VPXORrm VR128:$src1, addr:$src2)>;
+  def : Pat<(xor VR128:$src1, (loadv8i16 addr:$src2)),
+            (VPXORrm VR128:$src1, addr:$src2)>;
+  def : Pat<(xor VR128:$src1, (loadv4i32 addr:$src2)),
+            (VPXORrm VR128:$src1, addr:$src2)>;
+
+  def : Pat<(X86andnp VR128:$src1, (loadv16i8 addr:$src2)),
+            (VPANDNrm VR128:$src1, addr:$src2)>;
+  def : Pat<(X86andnp VR128:$src1, (loadv8i16 addr:$src2)),
+            (VPANDNrm VR128:$src1, addr:$src2)>;
+  def : Pat<(X86andnp VR128:$src1, (loadv4i32 addr:$src2)),
+            (VPANDNrm VR128:$src1, addr:$src2)>;
 }
 
 let Predicates = [UseSSE2] in {
@@ -2619,6 +2702,34 @@ let Predicates = [UseSSE2] in {
             (PANDNrr VR128:$src1, VR128:$src2)>;
   def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
             (PANDNrr VR128:$src1, VR128:$src2)>;
+
+  def : Pat<(and VR128:$src1, (memopv16i8 addr:$src2)),
+            (PANDrm VR128:$src1, addr:$src2)>;
+  def : Pat<(and VR128:$src1, (memopv8i16 addr:$src2)),
+            (PANDrm VR128:$src1, addr:$src2)>;
+  def : Pat<(and VR128:$src1, (memopv4i32 addr:$src2)),
+            (PANDrm VR128:$src1, addr:$src2)>;
+
+  def : Pat<(or VR128:$src1, (memopv16i8 addr:$src2)),
+            (PORrm VR128:$src1, addr:$src2)>;
+  def : Pat<(or VR128:$src1, (memopv8i16 addr:$src2)),
+            (PORrm VR128:$src1, addr:$src2)>;
+  def : Pat<(or VR128:$src1, (memopv4i32 addr:$src2)),
+            (PORrm VR128:$src1, addr:$src2)>;
+
+  def : Pat<(xor VR128:$src1, (memopv16i8 addr:$src2)),
+            (PXORrm VR128:$src1, addr:$src2)>;
+  def : Pat<(xor VR128:$src1, (memopv8i16 addr:$src2)),
+            (PXORrm VR128:$src1, addr:$src2)>;
+  def : Pat<(xor VR128:$src1, (memopv4i32 addr:$src2)),
+            (PXORrm VR128:$src1, addr:$src2)>;
+
+  def : Pat<(X86andnp VR128:$src1, (memopv16i8 addr:$src2)),
+            (PANDNrm VR128:$src1, addr:$src2)>;
+  def : Pat<(X86andnp VR128:$src1, (memopv8i16 addr:$src2)),
+            (PANDNrm VR128:$src1, addr:$src2)>;
+  def : Pat<(X86andnp VR128:$src1, (memopv4i32 addr:$src2)),
+            (PANDNrm VR128:$src1, addr:$src2)>;
 }
 
 // Patterns for packed operations when we don't have integer type available.
diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td
index 39f50c10ae1..9d810a675e3 100644
--- a/lib/Target/X86/X86InstrXOP.td
+++ b/lib/Target/X86/X86InstrXOP.td
@@ -350,6 +350,7 @@ multiclass xop4op_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
             [(set RC:$dst, (VT (or (and RC:$src3, RC:$src1),
                                    (X86andnp RC:$src3, RC:$src2))))]>, XOP_4V,
             Sched<[sched]>;
+  // FIXME: This pattern can't match.
   def rrm : IXOPi8Reg<opc, MRMSrcMemOp4, (outs RC:$dst),
             (ins RC:$src1, RC:$src2, x86memop:$src3),
             !strconcat(OpcodeStr,
@@ -385,6 +386,48 @@ let ExeDomain = SSEPackedInt in {
                             SchedWriteShuffle.YMM>, VEX_L;
 }
 
+let Predicates = [HasXOP] in {
+  def : Pat<(v16i8 (or (and VR128:$src3, VR128:$src1),
+                   (X86andnp VR128:$src3, VR128:$src2))),
+            (VPCMOVrrr VR128:$src1, VR128:$src2, VR128:$src3)>;
+  def : Pat<(v8i16 (or (and VR128:$src3, VR128:$src1),
+                   (X86andnp VR128:$src3, VR128:$src2))),
+            (VPCMOVrrr VR128:$src1, VR128:$src2, VR128:$src3)>;
+  def : Pat<(v4i32 (or (and VR128:$src3, VR128:$src1),
+                   (X86andnp VR128:$src3, VR128:$src2))),
+            (VPCMOVrrr VR128:$src1, VR128:$src2, VR128:$src3)>;
+
+  def : Pat<(or (and VR128:$src3, VR128:$src1),
+                (X86andnp VR128:$src3, (bc_v16i8 (loadv2i64 addr:$src2)))),
+            (VPCMOVrmr VR128:$src1, addr:$src2, VR128:$src3)>;
+  def : Pat<(or (and VR128:$src3, VR128:$src1),
+                (X86andnp VR128:$src3, (bc_v8i16 (loadv2i64 addr:$src2)))),
+            (VPCMOVrmr VR128:$src1, addr:$src2, VR128:$src3)>;
+  def : Pat<(or (and VR128:$src3, VR128:$src1),
+                (X86andnp VR128:$src3, (bc_v4i32 (loadv2i64 addr:$src2)))),
+            (VPCMOVrmr VR128:$src1, addr:$src2, VR128:$src3)>;
+
+  def : Pat<(v32i8 (or (and VR256:$src3, VR256:$src1),
+                   (X86andnp VR256:$src3, VR256:$src2))),
+            (VPCMOVYrrr VR256:$src1, VR256:$src2, VR256:$src3)>;
+  def : Pat<(v16i16 (or (and VR256:$src3, VR256:$src1),
+                    (X86andnp VR256:$src3, VR256:$src2))),
+            (VPCMOVYrrr VR256:$src1, VR256:$src2, VR256:$src3)>;
+  def : Pat<(v8i32 (or (and VR256:$src3, VR256:$src1),
+                   (X86andnp VR256:$src3, VR256:$src2))),
+            (VPCMOVYrrr VR256:$src1, VR256:$src2, VR256:$src3)>;
+
+  def : Pat<(or (and VR256:$src3, VR256:$src1),
+                (X86andnp VR256:$src3, (bc_v32i8 (loadv4i64 addr:$src2)))),
+            (VPCMOVYrmr VR256:$src1, addr:$src2, VR256:$src3)>;
+  def : Pat<(or (and VR256:$src3, VR256:$src1),
+                (X86andnp VR256:$src3, (bc_v16i16 (loadv4i64 addr:$src2)))),
+            (VPCMOVYrmr VR256:$src1, addr:$src2, VR256:$src3)>;
+  def : Pat<(or (and VR256:$src3, VR256:$src1),
+                (X86andnp VR256:$src3, (bc_v8i32 (loadv4i64 addr:$src2)))),
+            (VPCMOVYrmr VR256:$src1, addr:$src2, VR256:$src3)>;
+}
+
 multiclass xop_vpermil2<bits<8> Opc, string OpcodeStr, RegisterClass RC,
                         X86MemOperand intmemop, X86MemOperand fpmemop,
                         ValueType VT, PatFrag FPLdFrag, PatFrag IntLdFrag,
diff --git a/test/CodeGen/X86/avx512-arith.ll b/test/CodeGen/X86/avx512-arith.ll
index d836e9e439b..6bc69213d42 100644
--- a/test/CodeGen/X86/avx512-arith.ll
+++ b/test/CodeGen/X86/avx512-arith.ll
@@ -601,17 +601,17 @@ define <8 x i64> @orq_broadcast(<8 x i64> %a) nounwind {
 define <16 x i32> @andd512fold(<16 x i32> %y, <16 x i32>* %x) {
 ; AVX512F-LABEL: andd512fold:
 ; AVX512F:       # %bb.0: # %entry
-; AVX512F-NEXT:    vpandq (%rdi), %zmm0, %zmm0
+; AVX512F-NEXT:    vpandd (%rdi), %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: andd512fold:
 ; AVX512VL:       # %bb.0: # %entry
-; AVX512VL-NEXT:    vpandq (%rdi), %zmm0, %zmm0
+; AVX512VL-NEXT:    vpandd (%rdi), %zmm0, %zmm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: andd512fold:
 ; AVX512BW:       # %bb.0: # %entry
-; AVX512BW-NEXT:    vpandq (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandd (%rdi), %zmm0, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: andd512fold:
diff --git a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
index aa89ee7c390..d888ea4ac44 100644
--- a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -3614,8 +3614,8 @@ define <16 x float> @test_mm512_fnmsub_round_ps(<16 x float> %__A, <16 x float>
 ; CHECK-LABEL: test_mm512_fnmsub_round_ps:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0]
-; CHECK-NEXT:    vpxorq %zmm3, %zmm0, %zmm4
-; CHECK-NEXT:    vpxorq %zmm3, %zmm2, %zmm0
+; CHECK-NEXT:    vpxord %zmm3, %zmm0, %zmm4
+; CHECK-NEXT:    vpxord %zmm3, %zmm2, %zmm0
 ; CHECK-NEXT:    vfmadd231ps {rn-sae}, %zmm4, %zmm1, %zmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
 entry:
@@ -3837,8 +3837,8 @@ define <16 x float> @test_mm512_fnmsub_ps(<16 x float> %__A, <16 x float> %__B,
 ; CHECK-LABEL: test_mm512_fnmsub_ps:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0]
-; CHECK-NEXT:    vpxorq %zmm3, %zmm0, %zmm4
-; CHECK-NEXT:    vpxorq %zmm3, %zmm2, %zmm0
+; CHECK-NEXT:    vpxord %zmm3, %zmm0, %zmm4
+; CHECK-NEXT:    vpxord %zmm3, %zmm2, %zmm0
 ; CHECK-NEXT:    vfmadd231ps {{.*#+}} zmm0 = (zmm1 * zmm4) + zmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
 entry:
diff --git a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
index a70ff9bc1b1..26e8636df8f 100644
--- a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -1658,7 +1658,7 @@ define void @test_storent_ps_512(<16 x float> %data, i8* %ptr) {
 define <16 x i32> @test_xor_epi32(<16 x i32> %a, <16 x i32> %b) {
 ; CHECK-LABEL: test_xor_epi32:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxorq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xef,0xc1]
+; CHECK-NEXT:    vpxord %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xef,0xc1]
 ; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
   ret < 16 x i32> %res
@@ -1687,7 +1687,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32>, <16 x i32>, <16
 define <16 x i32> @test_or_epi32(<16 x i32> %a, <16 x i32> %b) {
 ; CHECK-LABEL: test_or_epi32:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vporq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xeb,0xc1]
+; CHECK-NEXT:    vpord %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xeb,0xc1]
 ; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
   ret < 16 x i32> %res
@@ -1716,7 +1716,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32>, <16 x i32>, <16 x
 define <16 x i32> @test_and_epi32(<16 x i32> %a, <16 x i32> %b) {
 ; CHECK-LABEL: test_and_epi32:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpandq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xdb,0xc1]
+; CHECK-NEXT:    vpandd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xdb,0xc1]
 ; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
   ret < 16 x i32> %res
diff --git a/test/CodeGen/X86/avx512-logic.ll b/test/CodeGen/X86/avx512-logic.ll
index bb1e8550ba2..65d9d67b2ca 100644
--- a/test/CodeGen/X86/avx512-logic.ll
+++ b/test/CodeGen/X86/avx512-logic.ll
@@ -7,7 +7,7 @@ define <16 x i32> @vpandd(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnon
 ; ALL-LABEL: vpandd:
 ; ALL:       ## %bb.0: ## %entry
 ; ALL-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; ALL-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; ALL-NEXT:    vpandd %zmm1, %zmm0, %zmm0
 ; ALL-NEXT:    retq
 entry:
   ; Force the execution domain with an add.
@@ -21,7 +21,7 @@ define <16 x i32> @vpandnd(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readno
 ; ALL-LABEL: vpandnd:
 ; ALL:       ## %bb.0: ## %entry
 ; ALL-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; ALL-NEXT:    vpandnq %zmm0, %zmm1, %zmm0
+; ALL-NEXT:    vpandnd %zmm0, %zmm1, %zmm0
 ; ALL-NEXT:    retq
 entry:
   ; Force the execution domain with an add.
@@ -37,7 +37,7 @@ define <16 x i32> @vpord(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnone
 ; ALL-LABEL: vpord:
 ; ALL:       ## %bb.0: ## %entry
 ; ALL-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; ALL-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; ALL-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; ALL-NEXT:    retq
 entry:
   ; Force the execution domain with an add.
@@ -51,7 +51,7 @@ define <16 x i32> @vpxord(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnon
 ; ALL-LABEL: vpxord:
 ; ALL:       ## %bb.0: ## %entry
 ; ALL-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; ALL-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; ALL-NEXT:    vpxord %zmm1, %zmm0, %zmm0
 ; ALL-NEXT:    retq
 entry:
   ; Force the execution domain with an add.
@@ -132,7 +132,7 @@ define <8 x i64> @orq_broadcast(<8 x i64> %a) nounwind {
 define <16 x i32> @andd512fold(<16 x i32> %y, <16 x i32>* %x) {
 ; KNL-LABEL: andd512fold:
 ; KNL:       ## %bb.0: ## %entry
-; KNL-NEXT:    vpandq (%rdi), %zmm0, %zmm0
+; KNL-NEXT:    vpandd (%rdi), %zmm0, %zmm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: andd512fold:
diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll
index 1449f5cf7b4..13ffb9f65bf 100644
--- a/test/CodeGen/X86/avx512-mask-op.ll
+++ b/test/CodeGen/X86/avx512-mask-op.ll
@@ -3177,7 +3177,7 @@ define void @ktest_signed(<16 x i32> %x, <16 x i32> %y) {
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    pushq %rax
 ; KNL-NEXT:    .cfi_def_cfa_offset 16
-; KNL-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; KNL-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; KNL-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    testw %ax, %ax
@@ -3196,7 +3196,7 @@ define void @ktest_signed(<16 x i32> %x, <16 x i32> %y) {
 ; SKX:       ## %bb.0:
 ; SKX-NEXT:    pushq %rax
 ; SKX-NEXT:    .cfi_def_cfa_offset 16
-; SKX-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; SKX-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; SKX-NEXT:    kmovd %k0, %eax
 ; SKX-NEXT:    testw %ax, %ax
@@ -3215,7 +3215,7 @@ define void @ktest_signed(<16 x i32> %x, <16 x i32> %y) {
 ; AVX512BW:       ## %bb.0:
 ; AVX512BW-NEXT:    pushq %rax
 ; AVX512BW-NEXT:    .cfi_def_cfa_offset 16
-; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; AVX512BW-NEXT:    kmovd %k0, %eax
 ; AVX512BW-NEXT:    testw %ax, %ax
@@ -3234,7 +3234,7 @@ define void @ktest_signed(<16 x i32> %x, <16 x i32> %y) {
 ; AVX512DQ:       ## %bb.0:
 ; AVX512DQ-NEXT:    pushq %rax
 ; AVX512DQ-NEXT:    .cfi_def_cfa_offset 16
-; AVX512DQ-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; AVX512DQ-NEXT:    kmovw %k0, %eax
 ; AVX512DQ-NEXT:    testw %ax, %ax
@@ -3253,7 +3253,7 @@ define void @ktest_signed(<16 x i32> %x, <16 x i32> %y) {
 ; X86:       ## %bb.0:
 ; X86-NEXT:    subl $12, %esp
 ; X86-NEXT:    .cfi_def_cfa_offset 16
-; X86-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; X86-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; X86-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; X86-NEXT:    kmovd %k0, %eax
 ; X86-NEXT:    testw %ax, %ax
@@ -3287,7 +3287,7 @@ define void @ktest_allones(<16 x i32> %x, <16 x i32> %y) {
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; CHECK-NEXT:    kortestw %k0, %k0
 ; CHECK-NEXT:    jb LBB65_2
@@ -3303,7 +3303,7 @@ define void @ktest_allones(<16 x i32> %x, <16 x i32> %y) {
 ; X86:       ## %bb.0:
 ; X86-NEXT:    subl $12, %esp
 ; X86-NEXT:    .cfi_def_cfa_offset 16
-; X86-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; X86-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; X86-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; X86-NEXT:    kortestw %k0, %k0
 ; X86-NEXT:    jb LBB65_2
diff --git a/test/CodeGen/X86/avx512-schedule.ll b/test/CodeGen/X86/avx512-schedule.ll
index e0237ff0d83..7ea2aef6ba4 100755
--- a/test/CodeGen/X86/avx512-schedule.ll
+++ b/test/CodeGen/X86/avx512-schedule.ll
@@ -5029,13 +5029,13 @@ define <16 x i32> @vpandd(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnon
 ; GENERIC-LABEL: vpandd:
 ; GENERIC:       # %bb.0: # %entry
 ; GENERIC-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50]
-; GENERIC-NEXT:    vpandq %zmm1, %zmm0, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT:    vpandd %zmm1, %zmm0, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: vpandd:
 ; SKX:       # %bb.0: # %entry
 ; SKX-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50]
-; SKX-NEXT:    vpandq %zmm1, %zmm0, %zmm0 # sched: [1:0.50]
+; SKX-NEXT:    vpandd %zmm1, %zmm0, %zmm0 # sched: [1:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 entry:
   ; Force the execution domain with an add.
@@ -5049,13 +5049,13 @@ define <16 x i32> @vpandnd(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readno
 ; GENERIC-LABEL: vpandnd:
 ; GENERIC:       # %bb.0: # %entry
 ; GENERIC-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50]
-; GENERIC-NEXT:    vpandnq %zmm0, %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT:    vpandnd %zmm0, %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: vpandnd:
 ; SKX:       # %bb.0: # %entry
 ; SKX-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50]
-; SKX-NEXT:    vpandnq %zmm0, %zmm1, %zmm0 # sched: [1:0.50]
+; SKX-NEXT:    vpandnd %zmm0, %zmm1, %zmm0 # sched: [1:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 entry:
   ; Force the execution domain with an add.
@@ -5071,13 +5071,13 @@ define <16 x i32> @vpord(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnone
 ; GENERIC-LABEL: vpord:
 ; GENERIC:       # %bb.0: # %entry
 ; GENERIC-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50]
-; GENERIC-NEXT:    vporq %zmm1, %zmm0, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT:    vpord %zmm1, %zmm0, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: vpord:
 ; SKX:       # %bb.0: # %entry
 ; SKX-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50]
-; SKX-NEXT:    vporq %zmm1, %zmm0, %zmm0 # sched: [1:0.50]
+; SKX-NEXT:    vpord %zmm1, %zmm0, %zmm0 # sched: [1:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 entry:
   ; Force the execution domain with an add.
@@ -5091,13 +5091,13 @@ define <16 x i32> @vpxord(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnon
 ; GENERIC-LABEL: vpxord:
 ; GENERIC:       # %bb.0: # %entry
 ; GENERIC-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50]
-; GENERIC-NEXT:    vpxorq %zmm1, %zmm0, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT:    vpxord %zmm1, %zmm0, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: vpxord:
 ; SKX:       # %bb.0: # %entry
 ; SKX-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50]
-; SKX-NEXT:    vpxorq %zmm1, %zmm0, %zmm0 # sched: [1:0.50]
+; SKX-NEXT:    vpxord %zmm1, %zmm0, %zmm0 # sched: [1:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 entry:
   ; Force the execution domain with an add.
diff --git a/test/CodeGen/X86/avx512-select.ll b/test/CodeGen/X86/avx512-select.ll
index 008a3b44ce0..90e533c09b7 100644
--- a/test/CodeGen/X86/avx512-select.ll
+++ b/test/CodeGen/X86/avx512-select.ll
@@ -11,7 +11,7 @@ define <16 x i32> @select00(i32 %a, <16 x i32> %b) nounwind {
 ; X86-NEXT:  # %bb.1:
 ; X86-NEXT:    vmovdqa64 %zmm0, %zmm1
 ; X86-NEXT:  .LBB0_2:
-; X86-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; X86-NEXT:    vpxord %zmm1, %zmm0, %zmm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: select00:
@@ -22,7 +22,7 @@ define <16 x i32> @select00(i32 %a, <16 x i32> %b) nounwind {
 ; X64-NEXT:  # %bb.1:
 ; X64-NEXT:    vmovdqa64 %zmm0, %zmm1
 ; X64-NEXT:  .LBB0_2:
-; X64-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; X64-NEXT:    vpxord %zmm1, %zmm0, %zmm0
 ; X64-NEXT:    retq
   %cmpres = icmp eq i32 %a, 255
   %selres = select i1 %cmpres, <16 x i32> zeroinitializer, <16 x i32> %b
diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
index 2964c905946..b1a63ffedf3 100644
--- a/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
+++ b/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
@@ -657,7 +657,7 @@ define <64 x i8> @ext_i64_64i8(i64 %a0) {
 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm1[2,2,3,3,4,5,6,7]
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; AVX1-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [-1.7939930131212661E-307,-1.7939930131212661E-307,-1.7939930131212661E-307,-1.7939930131212661E-307]
 ; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
index 139fabd25c9..c524021866d 100644
--- a/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
+++ b/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
@@ -845,7 +845,7 @@ define <64 x i8> @ext_i64_64i8(i64 %a0) {
 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm1[2,2,3,3,4,5,6,7]
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; AVX1-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [-1.7939930131212661E-307,-1.7939930131212661E-307,-1.7939930131212661E-307,-1.7939930131212661E-307]
 ; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
diff --git a/test/CodeGen/X86/bitcast-setcc-128.ll b/test/CodeGen/X86/bitcast-setcc-128.ll
index c43b19a7179..3f78d0c9c5c 100644
--- a/test/CodeGen/X86/bitcast-setcc-128.ll
+++ b/test/CodeGen/X86/bitcast-setcc-128.ll
@@ -708,7 +708,6 @@ define i64 @v16i8_widened_with_ones(<16 x i8> %a, <16 x i8> %b) {
 ; AVX2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vinserti128 $1, {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT:    vpsllw $7, %ymm0, %ymm0
-; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT:    vpmovmskb %ymm0, %ecx
 ; AVX2-NEXT:    movabsq $-4294967296, %rax # imm = 0xFFFFFFFF00000000
 ; AVX2-NEXT:    orq %rcx, %rax
diff --git a/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll b/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
index bb79efcbad4..19d3b47a659 100644
--- a/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
+++ b/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
@@ -359,7 +359,8 @@ define <64 x i8> @f64i8_i32(<64 x i8> %a) {
 ; AVX-LABEL: f64i8_i32:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT:    vbroadcastss {{.*#+}} xmm3 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37]
+; AVX-NEXT:    vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
 ; AVX-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
 ; AVX-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -367,7 +368,7 @@ define <64 x i8> @f64i8_i32(<64 x i8> %a) {
 ; AVX-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
 ; AVX-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm2
 ; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX-NEXT:    retl
@@ -391,7 +392,8 @@ define <64 x i8> @f64i8_i32(<64 x i8> %a) {
 ; AVX-64-LABEL: f64i8_i32:
 ; AVX-64:       # %bb.0:
 ; AVX-64-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX-64-NEXT:    vbroadcastss {{.*#+}} xmm3 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37]
+; AVX-64-NEXT:    vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; AVX-64-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
 ; AVX-64-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
 ; AVX-64-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -399,7 +401,7 @@ define <64 x i8> @f64i8_i32(<64 x i8> %a) {
 ; AVX-64-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
 ; AVX-64-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-64-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX-64-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm2
 ; AVX-64-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX-64-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX-64-NEXT:    retq
@@ -425,12 +427,13 @@ define <64 x i8> @f64i8_i32(<64 x i8> %a) {
 }
 
 
+; FIXME the load should be folded with the MOVDDUP with AVX1. PR39454
 define <64 x i8> @f64xi8_i64(<64 x i8> %a) {
 ; AVX-LABEL: f64xi8_i64:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT:    vmovddup {{.*#+}} xmm3 = [7.9499288951273625E-275,7.9499288951273625E-275]
-; AVX-NEXT:    # xmm3 = mem[0,0]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
+; AVX-NEXT:    vmovddup {{.*#+}} xmm3 = xmm3[0,0]
 ; AVX-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
 ; AVX-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -438,7 +441,7 @@ define <64 x i8> @f64xi8_i64(<64 x i8> %a) {
 ; AVX-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
 ; AVX-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm2
 ; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX-NEXT:    retl
@@ -462,8 +465,8 @@ define <64 x i8> @f64xi8_i64(<64 x i8> %a) {
 ; AVX-64-LABEL: f64xi8_i64:
 ; AVX-64:       # %bb.0:
 ; AVX-64-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX-64-NEXT:    vmovddup {{.*#+}} xmm3 = [7.9499288951273625E-275,7.9499288951273625E-275]
-; AVX-64-NEXT:    # xmm3 = mem[0,0]
+; AVX-64-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
+; AVX-64-NEXT:    vmovddup {{.*#+}} xmm3 = xmm3[0,0]
 ; AVX-64-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
 ; AVX-64-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -471,7 +474,7 @@ define <64 x i8> @f64xi8_i64(<64 x i8> %a) {
 ; AVX-64-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
 ; AVX-64-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-64-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX-64-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm2
 ; AVX-64-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX-64-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX-64-NEXT:    retq
@@ -496,7 +499,6 @@ define <64 x i8> @f64xi8_i64(<64 x i8> %a) {
   ret <64 x i8> %res2
 }
 
-
 define <64 x i8> @f64xi8_i128(<64 x i8> %a) {
 ; AVX-LABEL: f64xi8_i128:
 ; AVX:       # %bb.0:
@@ -509,7 +511,7 @@ define <64 x i8> @f64xi8_i128(<64 x i8> %a) {
 ; AVX-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
 ; AVX-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm2
 ; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX-NEXT:    retl
@@ -543,7 +545,7 @@ define <64 x i8> @f64xi8_i128(<64 x i8> %a) {
 ; AVX-64-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
 ; AVX-64-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-64-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX-64-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm2
 ; AVX-64-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX-64-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX-64-NEXT:    retq
@@ -844,7 +846,8 @@ define <32 x i16> @f32xi16_i32(<32 x i16> %a) {
 ; AVX-LABEL: f32xi16_i32:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT:    vbroadcastss {{.*#+}} xmm3 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41]
+; AVX-NEXT:    vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
 ; AVX-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
 ; AVX-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -852,7 +855,7 @@ define <32 x i16> @f32xi16_i32(<32 x i16> %a) {
 ; AVX-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
 ; AVX-NEXT:    vpaddw %xmm3, %xmm0, %xmm0
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm2
 ; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX-NEXT:    retl
@@ -876,7 +879,8 @@ define <32 x i16> @f32xi16_i32(<32 x i16> %a) {
 ; AVX-64-LABEL: f32xi16_i32:
 ; AVX-64:       # %bb.0:
 ; AVX-64-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX-64-NEXT:    vbroadcastss {{.*#+}} xmm3 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41]
+; AVX-64-NEXT:    vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; AVX-64-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
 ; AVX-64-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
 ; AVX-64-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -884,7 +888,7 @@ define <32 x i16> @f32xi16_i32(<32 x i16> %a) {
 ; AVX-64-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
 ; AVX-64-NEXT:    vpaddw %xmm3, %xmm0, %xmm0
 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-64-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-64-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm2
 ; AVX-64-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX-64-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX-64-NEXT:    retq
@@ -910,12 +914,13 @@ define <32 x i16> @f32xi16_i32(<32 x i16> %a) {
 }
 
 
+; FIXME the load should be folded with the MOVDDUP with AVX1. PR39454
 define <32 x i16> @f32xi16_i64(<32 x i16> %a) {
 ; AVX-LABEL: f32xi16_i64:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT:    vmovddup {{.*#+}} xmm3 = [4.1720559249406128E-309,4.1720559249406128E-309]
-; AVX-NEXT:    # xmm3 = mem[0,0]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
+; AVX-NEXT:    vmovddup {{.*#+}} xmm3 = xmm3[0,0]
 ; AVX-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
 ; AVX-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -923,7 +928,7 @@ define <32 x i16> @f32xi16_i64(<32 x i16> %a) {
 ; AVX-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
 ; AVX-NEXT:    vpaddw %xmm3, %xmm0, %xmm0
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm2
 ; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX-NEXT:    retl
@@ -947,8 +952,8 @@ define <32 x i16> @f32xi16_i64(<32 x i16> %a) {
 ; AVX-64-LABEL: f32xi16_i64:
 ; AVX-64:       # %bb.0:
 ; AVX-64-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX-64-NEXT:    vmovddup {{.*#+}} xmm3 = [4.1720559249406128E-309,4.1720559249406128E-309]
-; AVX-64-NEXT:    # xmm3 = mem[0,0]
+; AVX-64-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
+; AVX-64-NEXT:    vmovddup {{.*#+}} xmm3 = xmm3[0,0]
 ; AVX-64-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
 ; AVX-64-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -956,7 +961,7 @@ define <32 x i16> @f32xi16_i64(<32 x i16> %a) {
 ; AVX-64-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
 ; AVX-64-NEXT:    vpaddw %xmm3, %xmm0, %xmm0
 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-64-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX-64-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm2
 ; AVX-64-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX-64-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX-64-NEXT:    retq
@@ -994,7 +999,7 @@ define <32 x i16> @f32xi16_i128(<32 x i16> %a) {
 ; AVX-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
 ; AVX-NEXT:    vpaddw %xmm3, %xmm0, %xmm0
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm2
 ; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX-NEXT:    retl
@@ -1028,7 +1033,7 @@ define <32 x i16> @f32xi16_i128(<32 x i16> %a) {
 ; AVX-64-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
 ; AVX-64-NEXT:    vpaddw %xmm3, %xmm0, %xmm0
 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-64-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX-64-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm2
 ; AVX-64-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX-64-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX-64-NEXT:    retq
@@ -1252,12 +1257,13 @@ define <8 x i32> @f8xi32_i128(<8 x i32> %a) {
 }
 
 
+; FIXME the load should be folded with the MOVDDUP with AVX1. PR39454
 define <16 x i32> @f16xi32_i64(<16 x i32> %a) {
 ; AVX-LABEL: f16xi32_i64:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT:    vmovddup {{.*#+}} xmm3 = [2.1219957909652723E-314,2.1219957909652723E-314]
-; AVX-NEXT:    # xmm3 = mem[0,0]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
+; AVX-NEXT:    vmovddup {{.*#+}} xmm3 = xmm3[0,0]
 ; AVX-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
 ; AVX-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -1265,7 +1271,7 @@ define <16 x i32> @f16xi32_i64(<16 x i32> %a) {
 ; AVX-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
 ; AVX-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1]
+; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm2
 ; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX-NEXT:    retl
@@ -1283,14 +1289,14 @@ define <16 x i32> @f16xi32_i64(<16 x i32> %a) {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314]
 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    retl
 ;
 ; AVX-64-LABEL: f16xi32_i64:
 ; AVX-64:       # %bb.0:
 ; AVX-64-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX-64-NEXT:    vmovddup {{.*#+}} xmm3 = [2.1219957909652723E-314,2.1219957909652723E-314]
-; AVX-64-NEXT:    # xmm3 = mem[0,0]
+; AVX-64-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
+; AVX-64-NEXT:    vmovddup {{.*#+}} xmm3 = xmm3[0,0]
 ; AVX-64-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
 ; AVX-64-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -1298,7 +1304,7 @@ define <16 x i32> @f16xi32_i64(<16 x i32> %a) {
 ; AVX-64-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
 ; AVX-64-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-64-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1]
+; AVX-64-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm2
 ; AVX-64-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX-64-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX-64-NEXT:    retq
@@ -1316,7 +1322,7 @@ define <16 x i32> @f16xi32_i64(<16 x i32> %a) {
 ; AVX512F-64:       # %bb.0:
 ; AVX512F-64-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [4294967296,4294967296,4294967296,4294967296,4294967296,4294967296,4294967296,4294967296]
 ; AVX512F-64-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
-; AVX512F-64-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT:    vpandd %zmm1, %zmm0, %zmm0
 ; AVX512F-64-NEXT:    retq
   %res1 = add <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, %a
   %res2 = and <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, %res1
@@ -1336,7 +1342,7 @@ define <16 x i32> @f16xi32_i128(<16 x i32> %a) {
 ; AVX-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
 ; AVX-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm2
 ; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX-NEXT:    retl
@@ -1356,7 +1362,7 @@ define <16 x i32> @f16xi32_i128(<16 x i32> %a) {
 ; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    retl
 ;
 ; AVX-64-LABEL: f16xi32_i128:
@@ -1370,7 +1376,7 @@ define <16 x i32> @f16xi32_i128(<16 x i32> %a) {
 ; AVX-64-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
 ; AVX-64-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-64-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3]
+; AVX-64-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm2
 ; AVX-64-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX-64-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX-64-NEXT:    retq
@@ -1390,7 +1396,7 @@ define <16 x i32> @f16xi32_i128(<16 x i32> %a) {
 ; AVX512F-64-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-64-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-64-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
-; AVX512F-64-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT:    vpandd %zmm1, %zmm0, %zmm0
 ; AVX512F-64-NEXT:    retq
   %res1 = add <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>, %a
   %res2 = and <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>, %res1
diff --git a/test/CodeGen/X86/movmsk-cmp.ll b/test/CodeGen/X86/movmsk-cmp.ll
index 4b33b04a8b2..452676e19db 100644
--- a/test/CodeGen/X86/movmsk-cmp.ll
+++ b/test/CodeGen/X86/movmsk-cmp.ll
@@ -2338,8 +2338,7 @@ define i1 @allones_v4i32_and1(<4 x i32> %arg) {
 ;
 ; SKX-LABEL: allones_v4i32_and1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
-; SKX-NEXT:    vptestmd %xmm1, %xmm0, %k0
+; SKX-NEXT:    vptestmd {{.*}}(%rip){1to4}, %xmm0, %k0
 ; SKX-NEXT:    kmovd %k0, %eax
 ; SKX-NEXT:    andb $15, %al
 ; SKX-NEXT:    cmpb $15, %al
@@ -2382,8 +2381,7 @@ define i1 @allzeros_v4i32_and1(<4 x i32> %arg) {
 ;
 ; SKX-LABEL: allzeros_v4i32_and1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
-; SKX-NEXT:    vptestmd %xmm1, %xmm0, %k0
+; SKX-NEXT:    vptestmd {{.*}}(%rip){1to4}, %xmm0, %k0
 ; SKX-NEXT:    kmovd %k0, %eax
 ; SKX-NEXT:    testb $15, %al
 ; SKX-NEXT:    sete %al
@@ -2444,8 +2442,7 @@ define i1 @allones_v8i32_and1(<8 x i32> %arg) {
 ;
 ; SKX-LABEL: allones_v8i32_and1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
-; SKX-NEXT:    vptestmd %ymm1, %ymm0, %k0
+; SKX-NEXT:    vptestmd {{.*}}(%rip){1to8}, %ymm0, %k0
 ; SKX-NEXT:    kortestb %k0, %k0
 ; SKX-NEXT:    setb %al
 ; SKX-NEXT:    vzeroupper
@@ -2506,8 +2503,7 @@ define i1 @allzeros_v8i32_and1(<8 x i32> %arg) {
 ;
 ; SKX-LABEL: allzeros_v8i32_and1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
-; SKX-NEXT:    vptestmd %ymm1, %ymm0, %k0
+; SKX-NEXT:    vptestmd {{.*}}(%rip){1to8}, %ymm0, %k0
 ; SKX-NEXT:    kortestb %k0, %k0
 ; SKX-NEXT:    sete %al
 ; SKX-NEXT:    vzeroupper
@@ -2584,8 +2580,7 @@ define i1 @allones_v16i32_and1(<16 x i32> %arg) {
 ;
 ; KNL-LABEL: allones_v16i32_and1:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; KNL-NEXT:    vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT:    vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0
 ; KNL-NEXT:    kortestw %k0, %k0
 ; KNL-NEXT:    setb %al
 ; KNL-NEXT:    vzeroupper
@@ -2593,8 +2588,7 @@ define i1 @allones_v16i32_and1(<16 x i32> %arg) {
 ;
 ; SKX-LABEL: allones_v16i32_and1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; SKX-NEXT:    vptestmd %zmm1, %zmm0, %k0
+; SKX-NEXT:    vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0
 ; SKX-NEXT:    kortestw %k0, %k0
 ; SKX-NEXT:    setb %al
 ; SKX-NEXT:    vzeroupper
@@ -2671,8 +2665,7 @@ define i1 @allzeros_v16i32_and1(<16 x i32> %arg) {
 ;
 ; KNL-LABEL: allzeros_v16i32_and1:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; KNL-NEXT:    vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT:    vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0
 ; KNL-NEXT:    kortestw %k0, %k0
 ; KNL-NEXT:    sete %al
 ; KNL-NEXT:    vzeroupper
@@ -2680,8 +2673,7 @@ define i1 @allzeros_v16i32_and1(<16 x i32> %arg) {
 ;
 ; SKX-LABEL: allzeros_v16i32_and1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; SKX-NEXT:    vptestmd %zmm1, %zmm0, %k0
+; SKX-NEXT:    vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0
 ; SKX-NEXT:    kortestw %k0, %k0
 ; SKX-NEXT:    sete %al
 ; SKX-NEXT:    vzeroupper
@@ -4010,8 +4002,7 @@ define i1 @allones_v4i32_and4(<4 x i32> %arg) {
 ;
 ; SKX-LABEL: allones_v4i32_and4:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4,4,4,4]
-; SKX-NEXT:    vptestmd %xmm1, %xmm0, %k0
+; SKX-NEXT:    vptestmd {{.*}}(%rip){1to4}, %xmm0, %k0
 ; SKX-NEXT:    kmovd %k0, %eax
 ; SKX-NEXT:    andb $15, %al
 ; SKX-NEXT:    cmpb $15, %al
@@ -4054,8 +4045,7 @@ define i1 @allzeros_v4i32_and4(<4 x i32> %arg) {
 ;
 ; SKX-LABEL: allzeros_v4i32_and4:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4,4,4,4]
-; SKX-NEXT:    vptestmd %xmm1, %xmm0, %k0
+; SKX-NEXT:    vptestmd {{.*}}(%rip){1to4}, %xmm0, %k0
 ; SKX-NEXT:    kmovd %k0, %eax
 ; SKX-NEXT:    testb $15, %al
 ; SKX-NEXT:    sete %al
@@ -4116,8 +4106,7 @@ define i1 @allones_v8i32_and4(<8 x i32> %arg) {
 ;
 ; SKX-LABEL: allones_v8i32_and4:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4]
-; SKX-NEXT:    vptestmd %ymm1, %ymm0, %k0
+; SKX-NEXT:    vptestmd {{.*}}(%rip){1to8}, %ymm0, %k0
 ; SKX-NEXT:    kortestb %k0, %k0
 ; SKX-NEXT:    setb %al
 ; SKX-NEXT:    vzeroupper
@@ -4178,8 +4167,7 @@ define i1 @allzeros_v8i32_and4(<8 x i32> %arg) {
 ;
 ; SKX-LABEL: allzeros_v8i32_and4:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4]
-; SKX-NEXT:    vptestmd %ymm1, %ymm0, %k0
+; SKX-NEXT:    vptestmd {{.*}}(%rip){1to8}, %ymm0, %k0
 ; SKX-NEXT:    kortestb %k0, %k0
 ; SKX-NEXT:    sete %al
 ; SKX-NEXT:    vzeroupper
@@ -4256,8 +4244,7 @@ define i1 @allones_v16i32_and4(<16 x i32> %arg) {
 ;
 ; KNL-LABEL: allones_v16i32_and4:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
-; KNL-NEXT:    vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT:    vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0
 ; KNL-NEXT:    kortestw %k0, %k0
 ; KNL-NEXT:    setb %al
 ; KNL-NEXT:    vzeroupper
@@ -4265,8 +4252,7 @@ define i1 @allones_v16i32_and4(<16 x i32> %arg) {
 ;
 ; SKX-LABEL: allones_v16i32_and4:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
-; SKX-NEXT:    vptestmd %zmm1, %zmm0, %k0
+; SKX-NEXT:    vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0
 ; SKX-NEXT:    kortestw %k0, %k0
 ; SKX-NEXT:    setb %al
 ; SKX-NEXT:    vzeroupper
@@ -4343,8 +4329,7 @@ define i1 @allzeros_v16i32_and4(<16 x i32> %arg) {
 ;
 ; KNL-LABEL: allzeros_v16i32_and4:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
-; KNL-NEXT:    vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT:    vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0
 ; KNL-NEXT:    kortestw %k0, %k0
 ; KNL-NEXT:    sete %al
 ; KNL-NEXT:    vzeroupper
@@ -4352,8 +4337,7 @@ define i1 @allzeros_v16i32_and4(<16 x i32> %arg) {
 ;
 ; SKX-LABEL: allzeros_v16i32_and4:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
-; SKX-NEXT:    vptestmd %zmm1, %zmm0, %k0
+; SKX-NEXT:    vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0
 ; SKX-NEXT:    kortestw %k0, %k0
 ; SKX-NEXT:    sete %al
 ; SKX-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/psubus.ll b/test/CodeGen/X86/psubus.ll
index a6bdfe9780c..6f4a3812ffa 100644
--- a/test/CodeGen/X86/psubus.ll
+++ b/test/CodeGen/X86/psubus.ll
@@ -531,18 +531,16 @@ define <8 x i16> @test13(<8 x i16> %x, <8 x i32> %y) nounwind {
 ; SSE41-NEXT:    pcmpeqd %xmm4, %xmm0
 ; SSE41-NEXT:    pcmpeqd %xmm5, %xmm5
 ; SSE41-NEXT:    pxor %xmm5, %xmm0
-; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSE41-NEXT:    pshufb %xmm6, %xmm0
-; SSE41-NEXT:    movdqa %xmm3, %xmm7
-; SSE41-NEXT:    pmaxud %xmm2, %xmm7
-; SSE41-NEXT:    pcmpeqd %xmm3, %xmm7
-; SSE41-NEXT:    pxor %xmm5, %xmm7
-; SSE41-NEXT:    pshufb %xmm6, %xmm7
-; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0]
+; SSE41-NEXT:    movdqa %xmm3, %xmm6
+; SSE41-NEXT:    pmaxud %xmm2, %xmm6
+; SSE41-NEXT:    pcmpeqd %xmm3, %xmm6
+; SSE41-NEXT:    pxor %xmm5, %xmm6
+; SSE41-NEXT:    packssdw %xmm6, %xmm0
 ; SSE41-NEXT:    psubd %xmm2, %xmm3
 ; SSE41-NEXT:    psubd %xmm1, %xmm4
-; SSE41-NEXT:    pshufb %xmm6, %xmm4
-; SSE41-NEXT:    pshufb %xmm6, %xmm3
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE41-NEXT:    pshufb %xmm1, %xmm4
+; SSE41-NEXT:    pshufb %xmm1, %xmm3
 ; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
 ; SSE41-NEXT:    pandn %xmm4, %xmm0
 ; SSE41-NEXT:    retq
@@ -916,18 +914,16 @@ define <8 x i16> @test15(<8 x i16> %x, <8 x i32> %y) nounwind {
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm4
 ; SSE41-NEXT:    pcmpeqd %xmm5, %xmm5
 ; SSE41-NEXT:    pxor %xmm5, %xmm4
-; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSE41-NEXT:    pshufb %xmm6, %xmm4
-; SSE41-NEXT:    movdqa %xmm3, %xmm7
-; SSE41-NEXT:    pminud %xmm2, %xmm7
-; SSE41-NEXT:    pcmpeqd %xmm3, %xmm7
-; SSE41-NEXT:    pxor %xmm5, %xmm7
-; SSE41-NEXT:    pshufb %xmm6, %xmm7
-; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0]
+; SSE41-NEXT:    movdqa %xmm3, %xmm6
+; SSE41-NEXT:    pminud %xmm2, %xmm6
+; SSE41-NEXT:    pcmpeqd %xmm3, %xmm6
+; SSE41-NEXT:    pxor %xmm5, %xmm6
+; SSE41-NEXT:    packssdw %xmm6, %xmm4
 ; SSE41-NEXT:    psubd %xmm2, %xmm3
 ; SSE41-NEXT:    psubd %xmm1, %xmm0
-; SSE41-NEXT:    pshufb %xmm6, %xmm0
-; SSE41-NEXT:    pshufb %xmm6, %xmm3
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE41-NEXT:    pshufb %xmm1, %xmm0
+; SSE41-NEXT:    pshufb %xmm1, %xmm3
 ; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
 ; SSE41-NEXT:    pand %xmm4, %xmm0
 ; SSE41-NEXT:    retq
@@ -1052,18 +1048,16 @@ define <8 x i16> @test16(<8 x i16> %x, <8 x i32> %y) nounwind {
 ; SSE41-NEXT:    pcmpeqd %xmm1, %xmm4
 ; SSE41-NEXT:    pcmpeqd %xmm5, %xmm5
 ; SSE41-NEXT:    pxor %xmm5, %xmm4
-; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSE41-NEXT:    pshufb %xmm6, %xmm4
-; SSE41-NEXT:    movdqa %xmm2, %xmm7
-; SSE41-NEXT:    pmaxud %xmm3, %xmm7
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm7
-; SSE41-NEXT:    pxor %xmm5, %xmm7
-; SSE41-NEXT:    pshufb %xmm6, %xmm7
-; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0]
+; SSE41-NEXT:    movdqa %xmm2, %xmm6
+; SSE41-NEXT:    pmaxud %xmm3, %xmm6
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm6
+; SSE41-NEXT:    pxor %xmm5, %xmm6
+; SSE41-NEXT:    packssdw %xmm6, %xmm4
 ; SSE41-NEXT:    psubd %xmm2, %xmm3
 ; SSE41-NEXT:    psubd %xmm1, %xmm0
-; SSE41-NEXT:    pshufb %xmm6, %xmm0
-; SSE41-NEXT:    pshufb %xmm6, %xmm3
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE41-NEXT:    pshufb %xmm1, %xmm0
+; SSE41-NEXT:    pshufb %xmm1, %xmm3
 ; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
 ; SSE41-NEXT:    pand %xmm4, %xmm0
 ; SSE41-NEXT:    retq
diff --git a/test/CodeGen/X86/sat-add.ll b/test/CodeGen/X86/sat-add.ll
index ec160c94f5e..f0989e8b081 100644
--- a/test/CodeGen/X86/sat-add.ll
+++ b/test/CodeGen/X86/sat-add.ll
@@ -746,15 +746,16 @@ define <4 x i32> @unsigned_sat_variable_v4i32_using_min(<4 x i32> %x, <4 x i32>
 ; SSE2-LABEL: unsigned_sat_variable_v4i32_using_min:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT:    pxor %xmm1, %xmm2
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
 ; SSE2-NEXT:    pxor %xmm0, %xmm3
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483647,2147483647,2147483647,2147483647]
 ; SSE2-NEXT:    pxor %xmm1, %xmm4
 ; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
 ; SSE2-NEXT:    pand %xmm4, %xmm0
-; SSE2-NEXT:    pandn %xmm2, %xmm4
-; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm4
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm4, %xmm2
+; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    paddd %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll b/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll
index f109d69621c..7cb0d3ff58f 100644
--- a/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll
+++ b/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll
@@ -132,9 +132,9 @@ define <4 x i32> @in_constant_varx_mone_invmask(<4 x i32> *%px, <4 x i32> *%py,
 ;
 ; CHECK-SSE2-LABEL: in_constant_varx_mone_invmask:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa (%rdx), %xmm0
+; CHECK-SSE2-NEXT:    movdqa (%rdi), %xmm0
 ; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    movdqa (%rdi), %xmm2
+; CHECK-SSE2-NEXT:    movdqa (%rdx), %xmm2
 ; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm2
 ; CHECK-SSE2-NEXT:    pandn %xmm2, %xmm0
 ; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm0
@@ -142,9 +142,9 @@ define <4 x i32> @in_constant_varx_mone_invmask(<4 x i32> *%px, <4 x i32> *%py,
 ;
 ; CHECK-XOP-LABEL: in_constant_varx_mone_invmask:
 ; CHECK-XOP:       # %bb.0:
-; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm0
+; CHECK-XOP-NEXT:    vmovdqa (%rdi), %xmm0
 ; CHECK-XOP-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; CHECK-XOP-NEXT:    vpxor (%rdi), %xmm1, %xmm2
+; CHECK-XOP-NEXT:    vpxor (%rdx), %xmm1, %xmm2
 ; CHECK-XOP-NEXT:    vpandn %xmm2, %xmm0, %xmm0
 ; CHECK-XOP-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; CHECK-XOP-NEXT:    retq
diff --git a/test/CodeGen/X86/vec-copysign-avx512.ll b/test/CodeGen/X86/vec-copysign-avx512.ll
index 6fb0033e750..b08b15ce004 100644
--- a/test/CodeGen/X86/vec-copysign-avx512.ll
+++ b/test/CodeGen/X86/vec-copysign-avx512.ll
@@ -43,7 +43,7 @@ define <16 x float> @v16f32(<16 x float> %a, <16 x float> %b) nounwind {
 ; AVX512VL:       ## %bb.0:
 ; AVX512VL-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm1, %zmm1
 ; AVX512VL-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; AVX512VL-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VLDQ-LABEL: v16f32:
diff --git a/test/CodeGen/X86/vector-bitreverse.ll b/test/CodeGen/X86/vector-bitreverse.ll
index fa4c8abe6d2..b249eed2fc7 100644
--- a/test/CodeGen/X86/vector-bitreverse.ll
+++ b/test/CodeGen/X86/vector-bitreverse.ll
@@ -2046,27 +2046,27 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
 ; AVX512F-NEXT:    vpsrld $24, %zmm0, %zmm1
 ; AVX512F-NEXT:    vpsrld $8, %zmm0, %zmm2
 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512F-NEXT:    vporq %zmm1, %zmm2, %zmm1
+; AVX512F-NEXT:    vpord %zmm1, %zmm2, %zmm1
 ; AVX512F-NEXT:    vpslld $24, %zmm0, %zmm2
 ; AVX512F-NEXT:    vpslld $8, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT:    vporq %zmm0, %zmm2, %zmm0
+; AVX512F-NEXT:    vpord %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vpord %zmm0, %zmm2, %zmm0
 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1
 ; AVX512F-NEXT:    vpslld $4, %zmm1, %zmm1
 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpsrld $4, %zmm0, %zmm0
-; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1
 ; AVX512F-NEXT:    vpslld $2, %zmm1, %zmm1
 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpsrld $2, %zmm0, %zmm0
-; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1
 ; AVX512F-NEXT:    vpslld $1, %zmm1, %zmm1
 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpsrld $1, %zmm0, %zmm0
-; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: test_bitreverse_v16i32:
diff --git a/test/CodeGen/X86/vector-lzcnt-512.ll b/test/CodeGen/X86/vector-lzcnt-512.ll
index 10db0aeb25e..71a9ba19396 100644
--- a/test/CodeGen/X86/vector-lzcnt-512.ll
+++ b/test/CodeGen/X86/vector-lzcnt-512.ll
@@ -172,15 +172,15 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
 ; AVX512BW-LABEL: testv16i32:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsrld $1, %zmm0, %zmm1
-; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpsrld $2, %zmm0, %zmm1
-; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpsrld $4, %zmm0, %zmm1
-; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpsrld $8, %zmm0, %zmm1
-; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpsrld $16, %zmm0, %zmm1
-; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandnq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -201,15 +201,15 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
 ; AVX512DQ-LABEL: testv16i32:
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vpsrld $1, %zmm0, %zmm1
-; AVX512DQ-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpsrld $2, %zmm0, %zmm1
-; AVX512DQ-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpsrld $4, %zmm0, %zmm1
-; AVX512DQ-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpsrld $8, %zmm0, %zmm1
-; AVX512DQ-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpsrld $16, %zmm0, %zmm1
-; AVX512DQ-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -257,15 +257,15 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
 ; AVX512BW-LABEL: testv16i32u:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsrld $1, %zmm0, %zmm1
-; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpsrld $2, %zmm0, %zmm1
-; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpsrld $4, %zmm0, %zmm1
-; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpsrld $8, %zmm0, %zmm1
-; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpsrld $16, %zmm0, %zmm1
-; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandnq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -286,15 +286,15 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
 ; AVX512DQ-LABEL: testv16i32u:
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vpsrld $1, %zmm0, %zmm1
-; AVX512DQ-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpsrld $2, %zmm0, %zmm1
-; AVX512DQ-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpsrld $4, %zmm0, %zmm1
-; AVX512DQ-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpsrld $8, %zmm0, %zmm1
-; AVX512DQ-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpsrld $16, %zmm0, %zmm1
-; AVX512DQ-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
diff --git a/test/CodeGen/X86/vector-reduce-and.ll b/test/CodeGen/X86/vector-reduce-and.ll
index 89ae9510cdc..560bceb2d05 100644
--- a/test/CodeGen/X86/vector-reduce-and.ll
+++ b/test/CodeGen/X86/vector-reduce-and.ll
@@ -309,13 +309,13 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX512-LABEL: test_v16i32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -372,15 +372,15 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ;
 ; AVX512-LABEL: test_v32i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
diff --git a/test/CodeGen/X86/vector-reduce-or.ll b/test/CodeGen/X86/vector-reduce-or.ll
index 04ec6cfc970..169394040bf 100644
--- a/test/CodeGen/X86/vector-reduce-or.ll
+++ b/test/CodeGen/X86/vector-reduce-or.ll
@@ -309,13 +309,13 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX512-LABEL: test_v16i32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -372,15 +372,15 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ;
 ; AVX512-LABEL: test_v32i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
diff --git a/test/CodeGen/X86/vector-reduce-xor.ll b/test/CodeGen/X86/vector-reduce-xor.ll
index cb69ee80ee4..d1bf3e99c2c 100644
--- a/test/CodeGen/X86/vector-reduce-xor.ll
+++ b/test/CodeGen/X86/vector-reduce-xor.ll
@@ -309,13 +309,13 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX512-LABEL: test_v16i32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpxord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpxord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpxord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpxord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -372,15 +372,15 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ;
 ; AVX512-LABEL: test_v32i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpxord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpxord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpxord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpxord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpxord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
diff --git a/test/CodeGen/X86/vector-rotate-512.ll b/test/CodeGen/X86/vector-rotate-512.ll
index eb51c1029de..30de8d7c908 100644
--- a/test/CodeGen/X86/vector-rotate-512.ll
+++ b/test/CodeGen/X86/vector-rotate-512.ll
@@ -876,7 +876,7 @@ define <16 x i32> @splatconstant_rotate_mask_v16i32(<16 x i32> %a) nounwind {
 ; AVX512-LABEL: splatconstant_rotate_mask_v16i32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vprold $4, %zmm0, %zmm0
-; AVX512-NEXT:    vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT:    vpandd {{.*}}(%rip), %zmm0, %zmm0
 ; AVX512-NEXT:    retq
   %shl = shl <16 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
   %lshr = lshr <16 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
@@ -980,10 +980,8 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind {
 ; AVX512BW-LABEL: splatconstant_rotate_mask_v64i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm1
-; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm0, %zmm0
-; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
 ; AVX512BW-NEXT:    vporq %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    retq
@@ -991,10 +989,8 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind {
 ; AVX512VLBW-LABEL: splatconstant_rotate_mask_v64i8:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $4, %zmm0, %zmm1
-; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
 ; AVX512VLBW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm0, %zmm0
-; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm0, %zmm0
 ; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
 ; AVX512VLBW-NEXT:    vporq %zmm0, %zmm1, %zmm0
 ; AVX512VLBW-NEXT:    retq
diff --git a/test/CodeGen/X86/vector-trunc-math.ll b/test/CodeGen/X86/vector-trunc-math.ll
index e552f5f4036..7dc850391bc 100644
--- a/test/CodeGen/X86/vector-trunc-math.ll
+++ b/test/CodeGen/X86/vector-trunc-math.ll
@@ -3505,7 +3505,7 @@ define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
 ;
 ; AVX512-LABEL: trunc_and_v16i32_v16i8:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -4309,7 +4309,7 @@ define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
 ;
 ; AVX512-LABEL: trunc_xor_v16i32_v16i8:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpxord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -5113,7 +5113,7 @@ define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind
 ;
 ; AVX512-LABEL: trunc_or_v16i32_v16i8:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
diff --git a/test/CodeGen/X86/vector-tzcnt-512.ll b/test/CodeGen/X86/vector-tzcnt-512.ll
index 1de03463e19..501d7e96835 100644
--- a/test/CodeGen/X86/vector-tzcnt-512.ll
+++ b/test/CodeGen/X86/vector-tzcnt-512.ll
@@ -128,7 +128,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
 ; AVX512CD:       # %bb.0:
 ; AVX512CD-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
 ; AVX512CD-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
-; AVX512CD-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
+; AVX512CD-NEXT:    vpandnd %zmm1, %zmm0, %zmm0
 ; AVX512CD-NEXT:    vplzcntd %zmm0, %zmm0
 ; AVX512CD-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
 ; AVX512CD-NEXT:    vpsubd %zmm0, %zmm1, %zmm0
@@ -138,7 +138,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
 ; AVX512CDBW:       # %bb.0:
 ; AVX512CDBW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
 ; AVX512CDBW-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
-; AVX512CDBW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT:    vpandnd %zmm1, %zmm0, %zmm0
 ; AVX512CDBW-NEXT:    vplzcntd %zmm0, %zmm0
 ; AVX512CDBW-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
 ; AVX512CDBW-NEXT:    vpsubd %zmm0, %zmm1, %zmm0
@@ -148,7 +148,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
-; AVX512BW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandnd %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -169,7 +169,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
 ; AVX512VPOPCNTDQ-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
-; AVX512VPOPCNTDQ-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT:    vpandnd %zmm1, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    retq
 ;
@@ -177,7 +177,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
 ; BITALG:       # %bb.0:
 ; BITALG-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
 ; BITALG-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
-; BITALG-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
+; BITALG-NEXT:    vpandnd %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT:    vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
@@ -195,7 +195,7 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
 ; AVX512CD:       # %bb.0:
 ; AVX512CD-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
 ; AVX512CD-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
-; AVX512CD-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
+; AVX512CD-NEXT:    vpandnd %zmm1, %zmm0, %zmm0
 ; AVX512CD-NEXT:    vplzcntd %zmm0, %zmm0
 ; AVX512CD-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
 ; AVX512CD-NEXT:    vpsubd %zmm0, %zmm1, %zmm0
@@ -205,7 +205,7 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
 ; AVX512CDBW:       # %bb.0:
 ; AVX512CDBW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
 ; AVX512CDBW-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
-; AVX512CDBW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT:    vpandnd %zmm1, %zmm0, %zmm0
 ; AVX512CDBW-NEXT:    vplzcntd %zmm0, %zmm0
 ; AVX512CDBW-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
 ; AVX512CDBW-NEXT:    vpsubd %zmm0, %zmm1, %zmm0
@@ -215,7 +215,7 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
-; AVX512BW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandnd %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -236,7 +236,7 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
 ; AVX512VPOPCNTDQ-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
-; AVX512VPOPCNTDQ-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT:    vpandnd %zmm1, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    retq
 ;
@@ -244,7 +244,7 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
 ; BITALG:       # %bb.0:
 ; BITALG-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
 ; BITALG-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
-; BITALG-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
+; BITALG-NEXT:    vpandnd %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT:    vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
-- 
GitLab


From a8aac85576d3fe8143b0481ab0c973546b3f39e4 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Fri, 26 Oct 2018 17:38:27 +0000
Subject: [PATCH 0645/1116] [llvm-ar] Strip trailing \r and format

Reviewers: mstorsjo, rupprecht, gbreynoo

Reviewed By: rupprecht

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D53769

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345410 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-ar/llvm-ar.cpp | 2064 +++++++++++++++++++------------------
 1 file changed, 1046 insertions(+), 1018 deletions(-)

diff --git a/tools/llvm-ar/llvm-ar.cpp b/tools/llvm-ar/llvm-ar.cpp
index 2c6dc8fad92..5ab8ae13d3e 100644
--- a/tools/llvm-ar/llvm-ar.cpp
+++ b/tools/llvm-ar/llvm-ar.cpp
@@ -1,1018 +1,1046 @@
-//===-- llvm-ar.cpp - LLVM archive librarian utility ----------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// Builds up (relatively) standard unix archive files (.a) containing LLVM
-// bitcode or other files.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/StringSwitch.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/Object/Archive.h"
-#include "llvm/Object/ArchiveWriter.h"
-#include "llvm/Object/MachO.h"
-#include "llvm/Object/ObjectFile.h"
-#include "llvm/Support/Chrono.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Errc.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/InitLLVM.h"
-#include "llvm/Support/LineIterator.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/Path.h"
-#include "llvm/Support/Process.h"
-#include "llvm/Support/StringSaver.h"
-#include "llvm/Support/TargetSelect.h"
-#include "llvm/Support/ToolOutputFile.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/ToolDrivers/llvm-dlltool/DlltoolDriver.h"
-#include "llvm/ToolDrivers/llvm-lib/LibDriver.h"
-
-#if !defined(_MSC_VER) && !defined(__MINGW32__)
-#include <unistd.h>
-#else
-#include <io.h>
-#endif
-
-using namespace llvm;
-
-// The name this program was invoked as.
-static StringRef ToolName;
-
-// The basename of this program.
-static StringRef Stem;
-
-const char RanlibHelp[] = R"(
-OVERVIEW: LLVM Ranlib (llvm-ranlib)
-
-  This program generates an index to speed access to archives
-
-USAGE: llvm-ranlib <archive-file>
-
-OPTIONS:
-  -help                             - Display available options
-  -version                          - Display the version of this program
-)";
-
-const char ArHelp[] = R"(
-OVERVIEW: LLVM Archiver
-
-USAGE: llvm-ar [options] [-]<operation>[modifiers] [relpos] <archive> [files]
-       llvm-ar -M [<mri-script]
-
-OPTIONS:
-  --format              - Archive format to create
-    =default            -   default
-    =gnu                -   gnu
-    =darwin             -   darwin
-    =bsd                -   bsd
-  --plugin=<string>     - Ignored for compatibility
-  --help                - Display available options
-  --version             - Display the version of this program
-
-OPERATIONS:
-  d - delete [files] from the archive
-  m - move [files] in the archive
-  p - print [files] found in the archive
-  q - quick append [files] to the archive
-  r - replace or insert [files] into the archive
-  s - act as ranlib
-  t - display contents of archive
-  x - extract [files] from the archive
-
-MODIFIERS:
-  [a] - put [files] after [relpos]
-  [b] - put [files] before [relpos] (same as [i])
-  [c] - do not warn if archive had to be created
-  [D] - use zero for timestamps and uids/gids (default)
-  [i] - put [files] before [relpos] (same as [b])
-  [l] - ignored for compatibility
-  [L] - add archive's contents
-  [o] - preserve original dates
-  [s] - create an archive index (cf. ranlib)
-  [S] - do not build a symbol table
-  [T] - create a thin archive
-  [u] - update only [files] newer than archive contents
-  [U] - use actual timestamps and uids/gids
-  [v] - be verbose about actions taken
-)";
-
-void printHelpMessage() {
-  if (Stem.contains_lower("ranlib"))
-    outs() << RanlibHelp;
-  else if (Stem.contains_lower("ar"))
-    outs() << ArHelp;
-}
-
-// Show the error message and exit.
-LLVM_ATTRIBUTE_NORETURN static void fail(Twine Error) {
-  errs() << ToolName << ": " << Error << ".\n";
-  printHelpMessage();
-  exit(1);
-}
-
-static void failIfError(std::error_code EC, Twine Context = "") {
-  if (!EC)
-    return;
-
-  std::string ContextStr = Context.str();
-  if (ContextStr == "")
-    fail(EC.message());
-  fail(Context + ": " + EC.message());
-}
-
-static void failIfError(Error E, Twine Context = "") {
-  if (!E)
-    return;
-
-  handleAllErrors(std::move(E), [&](const llvm::ErrorInfoBase &EIB) {
-    std::string ContextStr = Context.str();
-    if (ContextStr == "")
-      fail(EIB.message());
-    fail(Context + ": " + EIB.message());
-  });
-}
-
-static SmallVector<const char *, 256> PositionalArgs;
-
-static bool MRI;
-
-namespace {
-enum Format { Default, GNU, BSD, DARWIN, Unknown };
-}
-
-static Format FormatType = Default;
-
-static std::string Options;
-
-// This enumeration delineates the kinds of operations on an archive
-// that are permitted.
-enum ArchiveOperation {
-  Print,            ///< Print the contents of the archive
-  Delete,           ///< Delete the specified members
-  Move,             ///< Move members to end or as given by {a,b,i} modifiers
-  QuickAppend,      ///< Quickly append to end of archive
-  ReplaceOrInsert,  ///< Replace or Insert members
-  DisplayTable,     ///< Display the table of contents
-  Extract,          ///< Extract files back to file system
-  CreateSymTab      ///< Create a symbol table in an existing archive
-};
-
-// Modifiers to follow operation to vary behavior
-static bool AddAfter = false;      ///< 'a' modifier
-static bool AddBefore = false;     ///< 'b' modifier
-static bool Create = false;        ///< 'c' modifier
-static bool OriginalDates = false; ///< 'o' modifier
-static bool OnlyUpdate = false;    ///< 'u' modifier
-static bool Verbose = false;       ///< 'v' modifier
-static bool Symtab = true;         ///< 's' modifier
-static bool Deterministic = true;  ///< 'D' and 'U' modifiers
-static bool Thin = false;          ///< 'T' modifier
-static bool AddLibrary = false;    ///< 'L' modifier
-
-// Relative Positional Argument (for insert/move). This variable holds
-// the name of the archive member to which the 'a', 'b' or 'i' modifier
-// refers. Only one of 'a', 'b' or 'i' can be specified so we only need
-// one variable.
-static std::string RelPos;
-
-// This variable holds the name of the archive file as given on the
-// command line.
-static std::string ArchiveName;
-
-// This variable holds the list of member files to proecess, as given
-// on the command line.
-static std::vector<StringRef> Members;
-
-// Extract the member filename from the command line for the [relpos] argument
-// associated with a, b, and i modifiers
-static void getRelPos() {
-  if (PositionalArgs.size() == 0)
-    fail("Expected [relpos] for a, b, or i modifier");
-  RelPos = PositionalArgs[0];
-  PositionalArgs.erase(PositionalArgs.begin());
-}
-
-// Get the archive file name from the command line
-static void getArchive() {
-  if (PositionalArgs.size() == 0)
-    fail("An archive name must be specified");
-  ArchiveName = PositionalArgs[0];
-  PositionalArgs.erase(PositionalArgs.begin());
-}
-
-// Copy over remaining items in PositionalArgs to our Members vector
-static void getMembers() {
-  for (auto &Arg : PositionalArgs)
-    Members.push_back(Arg);
-}
-
-std::vector<std::unique_ptr<MemoryBuffer>> ArchiveBuffers;
-std::vector<std::unique_ptr<object::Archive>> Archives;
-
-static object::Archive &readLibrary(const Twine &Library) {
-  auto BufOrErr = MemoryBuffer::getFile(Library, -1, false);
-  failIfError(BufOrErr.getError(), "Could not open library");
-  ArchiveBuffers.push_back(std::move(*BufOrErr));
-  auto LibOrErr =
-      object::Archive::create(ArchiveBuffers.back()->getMemBufferRef());
-  failIfError(errorToErrorCode(LibOrErr.takeError()),
-              "Could not parse library");
-  Archives.push_back(std::move(*LibOrErr));
-  return *Archives.back();
-}
-
-static void runMRIScript();
-
-// Parse the command line options as presented and return the operation
-// specified. Process all modifiers and check to make sure that constraints on
-// modifier/operation pairs have not been violated.
-static ArchiveOperation parseCommandLine() {
-  if (MRI) {
-    if (!PositionalArgs.empty() || !Options.empty())
-      fail("Cannot mix -M and other options");
-    runMRIScript();
-  }
-
-  // Keep track of number of operations. We can only specify one
-  // per execution.
-  unsigned NumOperations = 0;
-
-  // Keep track of the number of positional modifiers (a,b,i). Only
-  // one can be specified.
-  unsigned NumPositional = 0;
-
-  // Keep track of which operation was requested
-  ArchiveOperation Operation;
-
-  bool MaybeJustCreateSymTab = false;
-
-  for(unsigned i=0; i<Options.size(); ++i) {
-    switch(Options[i]) {
-    case 'd': ++NumOperations; Operation = Delete; break;
-    case 'm': ++NumOperations; Operation = Move ; break;
-    case 'p': ++NumOperations; Operation = Print; break;
-    case 'q': ++NumOperations; Operation = QuickAppend; break;
-    case 'r': ++NumOperations; Operation = ReplaceOrInsert; break;
-    case 't': ++NumOperations; Operation = DisplayTable; break;
-    case 'x': ++NumOperations; Operation = Extract; break;
-    case 'c': Create = true; break;
-    case 'l': /* accepted but unused */ break;
-    case 'o': OriginalDates = true; break;
-    case 's':
-      Symtab = true;
-      MaybeJustCreateSymTab = true;
-      break;
-    case 'S':
-      Symtab = false;
-      break;
-    case 'u': OnlyUpdate = true; break;
-    case 'v': Verbose = true; break;
-    case 'a':
-      getRelPos();
-      AddAfter = true;
-      NumPositional++;
-      break;
-    case 'b':
-      getRelPos();
-      AddBefore = true;
-      NumPositional++;
-      break;
-    case 'i':
-      getRelPos();
-      AddBefore = true;
-      NumPositional++;
-      break;
-    case 'D':
-      Deterministic = true;
-      break;
-    case 'U':
-      Deterministic = false;
-      break;
-    case 'T':
-      Thin = true;
-      break;
-    case 'L':
-      AddLibrary = true;
-      break;
-    default:
-      fail(std::string("unknown option ") + Options[i]);
-    }
-  }
-
-  // At this point, the next thing on the command line must be
-  // the archive name.
-  getArchive();
-
-  // Everything on the command line at this point is a member.
-  getMembers();
-
- if (NumOperations == 0 && MaybeJustCreateSymTab) {
-    NumOperations = 1;
-    Operation = CreateSymTab;
-    if (!Members.empty())
-      fail("The s operation takes only an archive as argument");
-  }
-
-  // Perform various checks on the operation/modifier specification
-  // to make sure we are dealing with a legal request.
-  if (NumOperations == 0)
-    fail("You must specify at least one of the operations");
-  if (NumOperations > 1)
-    fail("Only one operation may be specified");
-  if (NumPositional > 1)
-    fail("You may only specify one of a, b, and i modifiers");
-  if (AddAfter || AddBefore) {
-    if (Operation != Move && Operation != ReplaceOrInsert)
-      fail("The 'a', 'b' and 'i' modifiers can only be specified with "
-           "the 'm' or 'r' operations");
-  }
-  if (OriginalDates && Operation != Extract)
-    fail("The 'o' modifier is only applicable to the 'x' operation");
-  if (OnlyUpdate && Operation != ReplaceOrInsert)
-    fail("The 'u' modifier is only applicable to the 'r' operation");
-  if (AddLibrary && Operation != QuickAppend)
-    fail("The 'L' modifier is only applicable to the 'q' operation");
-
-  // Return the parsed operation to the caller
-  return Operation;
-}
-
-// Implements the 'p' operation. This function traverses the archive
-// looking for members that match the path list.
-static void doPrint(StringRef Name, const object::Archive::Child &C) {
-  if (Verbose)
-    outs() << "Printing " << Name << "\n";
-
-  Expected<StringRef> DataOrErr = C.getBuffer();
-  failIfError(DataOrErr.takeError());
-  StringRef Data = *DataOrErr;
-  outs().write(Data.data(), Data.size());
-}
-
-// Utility function for printing out the file mode when the 't' operation is in
-// verbose mode.
-static void printMode(unsigned mode) {
-  outs() << ((mode & 004) ? "r" : "-");
-  outs() << ((mode & 002) ? "w" : "-");
-  outs() << ((mode & 001) ? "x" : "-");
-}
-
-// Implement the 't' operation. This function prints out just
-// the file names of each of the members. However, if verbose mode is requested
-// ('v' modifier) then the file type, permission mode, user, group, size, and
-// modification time are also printed.
-static void doDisplayTable(StringRef Name, const object::Archive::Child &C) {
-  if (Verbose) {
-    Expected<sys::fs::perms> ModeOrErr = C.getAccessMode();
-    failIfError(ModeOrErr.takeError());
-    sys::fs::perms Mode = ModeOrErr.get();
-    printMode((Mode >> 6) & 007);
-    printMode((Mode >> 3) & 007);
-    printMode(Mode & 007);
-    Expected<unsigned> UIDOrErr = C.getUID();
-    failIfError(UIDOrErr.takeError());
-    outs() << ' ' << UIDOrErr.get();
-    Expected<unsigned> GIDOrErr = C.getGID();
-    failIfError(GIDOrErr.takeError());
-    outs() << '/' << GIDOrErr.get();
-    Expected<uint64_t> Size = C.getSize();
-    failIfError(Size.takeError());
-    outs() << ' ' << format("%6llu", Size.get());
-    auto ModTimeOrErr = C.getLastModified();
-    failIfError(ModTimeOrErr.takeError());
-    // Note: formatv() only handles the default TimePoint<>, which is in
-    // nanoseconds.
-    // TODO: fix format_provider<TimePoint<>> to allow other units.
-    sys::TimePoint<> ModTimeInNs = ModTimeOrErr.get();
-    outs() << ' ' << formatv("{0:%b %e %H:%M %Y}", ModTimeInNs);
-    outs() << ' ';
-  }
-
-  if (C.getParent()->isThin()) {
-    outs() << sys::path::parent_path(ArchiveName);
-    outs() << '/';
-  }
-  outs() << Name << "\n";
-}
-
-// Implement the 'x' operation. This function extracts files back to the file
-// system.
-static void doExtract(StringRef Name, const object::Archive::Child &C) {
-  // Retain the original mode.
-  Expected<sys::fs::perms> ModeOrErr = C.getAccessMode();
-  failIfError(ModeOrErr.takeError());
-  sys::fs::perms Mode = ModeOrErr.get();
-
-  int FD;
-  failIfError(sys::fs::openFileForWrite(sys::path::filename(Name), FD,
-                                        sys::fs::CD_CreateAlways,
-                                        sys::fs::F_None, Mode),
-              Name);
-
-  {
-    raw_fd_ostream file(FD, false);
-
-    // Get the data and its length
-    Expected<StringRef> BufOrErr = C.getBuffer();
-    failIfError(BufOrErr.takeError());
-    StringRef Data = BufOrErr.get();
-
-    // Write the data.
-    file.write(Data.data(), Data.size());
-  }
-
-  // If we're supposed to retain the original modification times, etc. do so
-  // now.
-  if (OriginalDates) {
-    auto ModTimeOrErr = C.getLastModified();
-    failIfError(ModTimeOrErr.takeError());
-    failIfError(
-        sys::fs::setLastAccessAndModificationTime(FD, ModTimeOrErr.get()));
-  }
-
-  if (close(FD))
-    fail("Could not close the file");
-}
-
-static bool shouldCreateArchive(ArchiveOperation Op) {
-  switch (Op) {
-  case Print:
-  case Delete:
-  case Move:
-  case DisplayTable:
-  case Extract:
-  case CreateSymTab:
-    return false;
-
-  case QuickAppend:
-  case ReplaceOrInsert:
-    return true;
-  }
-
-  llvm_unreachable("Missing entry in covered switch.");
-}
-
-static void performReadOperation(ArchiveOperation Operation,
-                                 object::Archive *OldArchive) {
-  if (Operation == Extract && OldArchive->isThin())
-    fail("extracting from a thin archive is not supported");
-
-  bool Filter = !Members.empty();
-  {
-    Error Err = Error::success();
-    for (auto &C : OldArchive->children(Err)) {
-      Expected<StringRef> NameOrErr = C.getName();
-      failIfError(NameOrErr.takeError());
-      StringRef Name = NameOrErr.get();
-
-      if (Filter) {
-        auto I = find(Members, Name);
-        if (I == Members.end())
-          continue;
-        Members.erase(I);
-      }
-
-      switch (Operation) {
-      default:
-        llvm_unreachable("Not a read operation");
-      case Print:
-        doPrint(Name, C);
-        break;
-      case DisplayTable:
-        doDisplayTable(Name, C);
-        break;
-      case Extract:
-        doExtract(Name, C);
-        break;
-      }
-    }
-    failIfError(std::move(Err));
-  }
-
-  if (Members.empty())
-    return;
-  for (StringRef Name : Members)
-    errs() << Name << " was not found\n";
-  exit(1);
-}
-
-static void addMember(std::vector<NewArchiveMember> &Members,
-                      StringRef FileName, int Pos = -1) {
-  Expected<NewArchiveMember> NMOrErr =
-      NewArchiveMember::getFile(FileName, Deterministic);
-  failIfError(NMOrErr.takeError(), FileName);
-
-  // Use the basename of the object path for the member name.
-  NMOrErr->MemberName = sys::path::filename(NMOrErr->MemberName);
-
-  if (Pos == -1)
-    Members.push_back(std::move(*NMOrErr));
-  else
-    Members[Pos] = std::move(*NMOrErr);
-}
-
-static void addMember(std::vector<NewArchiveMember> &Members,
-                      const object::Archive::Child &M, int Pos = -1) {
-  if (Thin && !M.getParent()->isThin())
-    fail("Cannot convert a regular archive to a thin one");
-  Expected<NewArchiveMember> NMOrErr =
-      NewArchiveMember::getOldMember(M, Deterministic);
-  failIfError(NMOrErr.takeError());
-  if (Pos == -1)
-    Members.push_back(std::move(*NMOrErr));
-  else
-    Members[Pos] = std::move(*NMOrErr);
-}
-
-static void addLibMember(std::vector<NewArchiveMember> &Members,
-                         StringRef FileName) {
-  Expected<NewArchiveMember> NMOrErr =
-      NewArchiveMember::getFile(FileName, Deterministic);
-  failIfError(NMOrErr.takeError(), FileName);
-  if (identify_magic(NMOrErr->Buf->getBuffer()) == file_magic::archive) {
-    object::Archive &Lib = readLibrary(FileName);
-    Error Err = Error::success();
-
-    for (auto &Child : Lib.children(Err))
-      addMember(Members, Child);
-
-    failIfError(std::move(Err));
-  } else {
-    // Use the basename of the object path for the member name.
-    NMOrErr->MemberName = sys::path::filename(NMOrErr->MemberName);
-    Members.push_back(std::move(*NMOrErr));
-  }
-}
-
-enum InsertAction {
-  IA_AddOldMember,
-  IA_AddNewMember,
-  IA_Delete,
-  IA_MoveOldMember,
-  IA_MoveNewMember
-};
-
-static InsertAction computeInsertAction(ArchiveOperation Operation,
-                                        const object::Archive::Child &Member,
-                                        StringRef Name,
-                                        std::vector<StringRef>::iterator &Pos) {
-  if (Operation == QuickAppend || Members.empty())
-    return IA_AddOldMember;
-
-  auto MI = find_if(Members, [Name](StringRef Path) {
-    return Name == sys::path::filename(Path);
-  });
-
-  if (MI == Members.end())
-    return IA_AddOldMember;
-
-  Pos = MI;
-
-  if (Operation == Delete)
-    return IA_Delete;
-
-  if (Operation == Move)
-    return IA_MoveOldMember;
-
-  if (Operation == ReplaceOrInsert) {
-    StringRef PosName = sys::path::filename(RelPos);
-    if (!OnlyUpdate) {
-      if (PosName.empty())
-        return IA_AddNewMember;
-      return IA_MoveNewMember;
-    }
-
-    // We could try to optimize this to a fstat, but it is not a common
-    // operation.
-    sys::fs::file_status Status;
-    failIfError(sys::fs::status(*MI, Status), *MI);
-    auto ModTimeOrErr = Member.getLastModified();
-    failIfError(ModTimeOrErr.takeError());
-    if (Status.getLastModificationTime() < ModTimeOrErr.get()) {
-      if (PosName.empty())
-        return IA_AddOldMember;
-      return IA_MoveOldMember;
-    }
-
-    if (PosName.empty())
-      return IA_AddNewMember;
-    return IA_MoveNewMember;
-  }
-  llvm_unreachable("No such operation");
-}
-
-// We have to walk this twice and computing it is not trivial, so creating an
-// explicit std::vector is actually fairly efficient.
-static std::vector<NewArchiveMember>
-computeNewArchiveMembers(ArchiveOperation Operation,
-                         object::Archive *OldArchive) {
-  std::vector<NewArchiveMember> Ret;
-  std::vector<NewArchiveMember> Moved;
-  int InsertPos = -1;
-  StringRef PosName = sys::path::filename(RelPos);
-  if (OldArchive) {
-    Error Err = Error::success();
-    for (auto &Child : OldArchive->children(Err)) {
-      int Pos = Ret.size();
-      Expected<StringRef> NameOrErr = Child.getName();
-      failIfError(NameOrErr.takeError());
-      StringRef Name = NameOrErr.get();
-      if (Name == PosName) {
-        assert(AddAfter || AddBefore);
-        if (AddBefore)
-          InsertPos = Pos;
-        else
-          InsertPos = Pos + 1;
-      }
-
-      std::vector<StringRef>::iterator MemberI = Members.end();
-      InsertAction Action =
-          computeInsertAction(Operation, Child, Name, MemberI);
-      switch (Action) {
-      case IA_AddOldMember:
-        addMember(Ret, Child);
-        break;
-      case IA_AddNewMember:
-        addMember(Ret, *MemberI);
-        break;
-      case IA_Delete:
-        break;
-      case IA_MoveOldMember:
-        addMember(Moved, Child);
-        break;
-      case IA_MoveNewMember:
-        addMember(Moved, *MemberI);
-        break;
-      }
-      if (MemberI != Members.end())
-        Members.erase(MemberI);
-    }
-    failIfError(std::move(Err));
-  }
-
-  if (Operation == Delete)
-    return Ret;
-
-  if (!RelPos.empty() && InsertPos == -1)
-    fail("Insertion point not found");
-
-  if (RelPos.empty())
-    InsertPos = Ret.size();
-
-  assert(unsigned(InsertPos) <= Ret.size());
-  int Pos = InsertPos;
-  for (auto &M : Moved) {
-    Ret.insert(Ret.begin() + Pos, std::move(M));
-    ++Pos;
-  }
-
-  if (AddLibrary) {
-    assert(Operation == QuickAppend);
-    for (auto &Member : Members)
-      addLibMember(Ret, Member);
-    return Ret;
-  }
-
-  for (unsigned I = 0; I != Members.size(); ++I)
-    Ret.insert(Ret.begin() + InsertPos, NewArchiveMember());
-  Pos = InsertPos;
-  for (auto &Member : Members) {
-    addMember(Ret, Member, Pos);
-    ++Pos;
-  }
-
-  return Ret;
-}
-
-static object::Archive::Kind getDefaultForHost() {
-  return Triple(sys::getProcessTriple()).isOSDarwin()
-             ? object::Archive::K_DARWIN
-             : object::Archive::K_GNU;
-}
-
-static object::Archive::Kind getKindFromMember(const NewArchiveMember &Member) {
-  Expected<std::unique_ptr<object::ObjectFile>> OptionalObject =
-      object::ObjectFile::createObjectFile(Member.Buf->getMemBufferRef());
-
-  if (OptionalObject)
-    return isa<object::MachOObjectFile>(**OptionalObject)
-               ? object::Archive::K_DARWIN
-               : object::Archive::K_GNU;
-
-  // squelch the error in case we had a non-object file
-  consumeError(OptionalObject.takeError());
-  return getDefaultForHost();
-}
-
-static void
-performWriteOperation(ArchiveOperation Operation,
-                      object::Archive *OldArchive,
-                      std::unique_ptr<MemoryBuffer> OldArchiveBuf,
-                      std::vector<NewArchiveMember> *NewMembersP) {
-  std::vector<NewArchiveMember> NewMembers;
-  if (!NewMembersP)
-    NewMembers = computeNewArchiveMembers(Operation, OldArchive);
-
-  object::Archive::Kind Kind;
-  switch (FormatType) {
-  case Default:
-    if (Thin)
-      Kind = object::Archive::K_GNU;
-    else if (OldArchive)
-      Kind = OldArchive->kind();
-    else if (NewMembersP)
-      Kind = NewMembersP->size() ? getKindFromMember(NewMembersP->front())
-                                 : getDefaultForHost();
-    else
-      Kind = NewMembers.size() ? getKindFromMember(NewMembers.front())
-                               : getDefaultForHost();
-    break;
-  case GNU:
-    Kind = object::Archive::K_GNU;
-    break;
-  case BSD:
-    if (Thin)
-      fail("Only the gnu format has a thin mode");
-    Kind = object::Archive::K_BSD;
-    break;
-  case DARWIN:
-    if (Thin)
-      fail("Only the gnu format has a thin mode");
-    Kind = object::Archive::K_DARWIN;
-    break;
-  case Unknown:
-    llvm_unreachable("");
-  }
-
-  Error E =
-      writeArchive(ArchiveName, NewMembersP ? *NewMembersP : NewMembers, Symtab,
-                   Kind, Deterministic, Thin, std::move(OldArchiveBuf));
-  failIfError(std::move(E), ArchiveName);
-}
-
-static void createSymbolTable(object::Archive *OldArchive) {
-  // When an archive is created or modified, if the s option is given, the
-  // resulting archive will have a current symbol table. If the S option
-  // is given, it will have no symbol table.
-  // In summary, we only need to update the symbol table if we have none.
-  // This is actually very common because of broken build systems that think
-  // they have to run ranlib.
-  if (OldArchive->hasSymbolTable())
-    return;
-
-  performWriteOperation(CreateSymTab, OldArchive, nullptr, nullptr);
-}
-
-static void performOperation(ArchiveOperation Operation,
-                             object::Archive *OldArchive,
-                             std::unique_ptr<MemoryBuffer> OldArchiveBuf,
-                             std::vector<NewArchiveMember> *NewMembers) {
-  switch (Operation) {
-  case Print:
-  case DisplayTable:
-  case Extract:
-    performReadOperation(Operation, OldArchive);
-    return;
-
-  case Delete:
-  case Move:
-  case QuickAppend:
-  case ReplaceOrInsert:
-    performWriteOperation(Operation, OldArchive, std::move(OldArchiveBuf),
-                          NewMembers);
-    return;
-  case CreateSymTab:
-    createSymbolTable(OldArchive);
-    return;
-  }
-  llvm_unreachable("Unknown operation.");
-}
-
-static int performOperation(ArchiveOperation Operation,
-                            std::vector<NewArchiveMember> *NewMembers) {
-  // Create or open the archive object.
-  ErrorOr<std::unique_ptr<MemoryBuffer>> Buf =
-      MemoryBuffer::getFile(ArchiveName, -1, false);
-  std::error_code EC = Buf.getError();
-  if (EC && EC != errc::no_such_file_or_directory)
-    fail("error opening '" + ArchiveName + "': " + EC.message() + "!");
-
-  if (!EC) {
-    Error Err = Error::success();
-    object::Archive Archive(Buf.get()->getMemBufferRef(), Err);
-    EC = errorToErrorCode(std::move(Err));
-    failIfError(EC,
-                "error loading '" + ArchiveName + "': " + EC.message() + "!");
-    performOperation(Operation, &Archive, std::move(Buf.get()), NewMembers);
-    return 0;
-  }
-
-  assert(EC == errc::no_such_file_or_directory);
-
-  if (!shouldCreateArchive(Operation)) {
-    failIfError(EC, Twine("error loading '") + ArchiveName + "'");
-  } else {
-    if (!Create) {
-      // Produce a warning if we should and we're creating the archive
-      errs() << ToolName << ": creating " << ArchiveName << "\n";
-    }
-  }
-
-  performOperation(Operation, nullptr, nullptr, NewMembers);
-  return 0;
-}
-
-static void runMRIScript() {
-  enum class MRICommand { AddLib, AddMod, Create, Delete, Save, End, Invalid };
-
-  ErrorOr<std::unique_ptr<MemoryBuffer>> Buf = MemoryBuffer::getSTDIN();
-  failIfError(Buf.getError());
-  const MemoryBuffer &Ref = *Buf.get();
-  bool Saved = false;
-  std::vector<NewArchiveMember> NewMembers;
-
-  for (line_iterator I(Ref, /*SkipBlanks*/ false), E; I != E; ++I) {
-    StringRef Line = *I;
-    Line = Line.split(';').first;
-    Line = Line.split('*').first;
-    Line = Line.trim();
-    if (Line.empty())
-      continue;
-    StringRef CommandStr, Rest;
-    std::tie(CommandStr, Rest) = Line.split(' ');
-    Rest = Rest.trim();
-    if (!Rest.empty() && Rest.front() == '"' && Rest.back() == '"')
-      Rest = Rest.drop_front().drop_back();
-    auto Command = StringSwitch<MRICommand>(CommandStr.lower())
-                       .Case("addlib", MRICommand::AddLib)
-                       .Case("addmod", MRICommand::AddMod)
-                       .Case("create", MRICommand::Create)
-                       .Case("delete", MRICommand::Delete)
-                       .Case("save", MRICommand::Save)
-                       .Case("end", MRICommand::End)
-                       .Default(MRICommand::Invalid);
-
-    switch (Command) {
-    case MRICommand::AddLib: {
-      object::Archive &Lib = readLibrary(Rest);
-      {
-        Error Err = Error::success();
-        for (auto &Member : Lib.children(Err))
-          addMember(NewMembers, Member);
-        failIfError(std::move(Err));
-      }
-      break;
-    }
-    case MRICommand::AddMod:
-      addMember(NewMembers, Rest);
-      break;
-    case MRICommand::Create:
-      Create = true;
-      if (!ArchiveName.empty())
-        fail("Editing multiple archives not supported");
-      if (Saved)
-        fail("File already saved");
-      ArchiveName = Rest;
-      break;
-    case MRICommand::Delete: {
-      StringRef Name = sys::path::filename(Rest);
-      llvm::erase_if(NewMembers,
-                     [=](NewArchiveMember &M) { return M.MemberName == Name; });
-      break;
-    }
-    case MRICommand::Save:
-      Saved = true;
-      break;
-    case MRICommand::End:
-      break;
-    case MRICommand::Invalid:
-      fail("Unknown command: " + CommandStr);
-    }
-  }
-
-  // Nothing to do if not saved.
-  if (Saved)
-    performOperation(ReplaceOrInsert, &NewMembers);
-  exit(0);
-}
-
-static bool handleGenericOption(StringRef arg) {
-  if (arg == "-help" || arg == "--help") {
-    printHelpMessage();
-    return true;
-  }
-  if (arg == "-version" || arg == "--version") {
-    cl::PrintVersionMessage();
-    return true;
-  }
-  return false;
-}
-
-static int ar_main(int argc, char **argv) {
-  SmallVector<const char *, 0> Argv(argv, argv + argc);
-  BumpPtrAllocator Alloc;
-  StringSaver Saver(Alloc);
-  cl::ExpandResponseFiles(Saver, cl::TokenizeGNUCommandLine, Argv);
-  for(size_t i = 1; i < Argv.size(); ++i) {
-    StringRef Arg = Argv[i];
-    const char *match;
-    auto MatchFlagWithArg = [&](const char *expected) {
-      size_t len = strlen(expected);
-      if (Arg == expected) {
-        if (++i >= Argv.size())
-          fail(std::string(expected) + " requires an argument");
-        match = Argv[i];
-        return true;
-      }
-      if (Arg.startswith(expected) && Arg.size() > len &&
-                 Arg[len] == '=') {
-        match = Arg.data() + len + 1;
-        return true;
-      }
-      return false;
-    };
-    if (handleGenericOption(Argv[i]))
-      return 0;
-    if (Arg == "--") {
-      for(; i < Argv.size(); ++i)
-        PositionalArgs.push_back(Argv[i]);
-      break;
-    }
-    if (Arg[0] == '-') {
-      if (Arg.startswith("--"))
-        Arg = Argv[i] + 2;
-      else
-        Arg = Argv[i] + 1;
-      if (Arg == "M") {
-        MRI = true;
-      } else if (MatchFlagWithArg("format")) {
-        FormatType = StringSwitch<Format>(match)
-            .Case("default", Default)
-            .Case("gnu", GNU)
-            .Case("darwin", DARWIN)
-            .Case("bsd", BSD)
-            .Default(Unknown);
-        if (FormatType == Unknown)
-          fail(std::string("Invalid format ") + match);
-      } else if (MatchFlagWithArg("plugin")) {
-        // Ignored.
-      } else {
-        Options += Argv[i] + 1;
-      }
-    } else if (Options.empty()) {
-      Options += Argv[i];
-    } else {
-      PositionalArgs.push_back(Argv[i]);
-    }
-  }
-  ArchiveOperation Operation = parseCommandLine();
-  return performOperation(Operation, nullptr);
-}
-
-static int ranlib_main(int argc, char **argv) {
-  bool ArchiveSpecified = false;
-  for(int i = 1; i < argc; ++i) {
-    if (handleGenericOption(argv[i])) {
-      return 0;
-    } else {
-      if (ArchiveSpecified)
-        fail("Exactly one archive should be specified");
-      ArchiveSpecified = true;
-      ArchiveName = argv[i];
-    }
-  }
-  return performOperation(CreateSymTab, nullptr);
-}
-
-int main(int argc, char **argv) {
-  InitLLVM X(argc, argv);
-  ToolName = argv[0];
-
-  llvm::InitializeAllTargetInfos();
-  llvm::InitializeAllTargetMCs();
-  llvm::InitializeAllAsmParsers();
-
-  Stem = sys::path::stem(ToolName);
-  if (Stem.contains_lower("dlltool"))
-    return dlltoolDriverMain(makeArrayRef(argv, argc));
-
-  if (Stem.contains_lower("ranlib"))
-    return ranlib_main(argc, argv);
-
-  if (Stem.contains_lower("lib"))
-    return libDriverMain(makeArrayRef(argv, argc));
-
-  if (Stem.contains_lower("ar"))
-    return ar_main(argc, argv);
-  fail("Not ranlib, ar, lib or dlltool!");
-}
+//===-- llvm-ar.cpp - LLVM archive librarian utility ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Builds up (relatively) standard unix archive files (.a) containing LLVM
+// bitcode or other files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Object/Archive.h"
+#include "llvm/Object/ArchiveWriter.h"
+#include "llvm/Object/MachO.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/Chrono.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/LineIterator.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Process.h"
+#include "llvm/Support/StringSaver.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/ToolOutputFile.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ToolDrivers/llvm-dlltool/DlltoolDriver.h"
+#include "llvm/ToolDrivers/llvm-lib/LibDriver.h"
+
+#if !defined(_MSC_VER) && !defined(__MINGW32__)
+#include <unistd.h>
+#else
+#include <io.h>
+#endif
+
+using namespace llvm;
+
+// The name this program was invoked as.
+static StringRef ToolName;
+
+// The basename of this program.
+static StringRef Stem;
+
+const char RanlibHelp[] = R"(
+OVERVIEW: LLVM Ranlib (llvm-ranlib)
+
+  This program generates an index to speed access to archives
+
+USAGE: llvm-ranlib <archive-file>
+
+OPTIONS:
+  -help                             - Display available options
+  -version                          - Display the version of this program
+)";
+
+const char ArHelp[] = R"(
+OVERVIEW: LLVM Archiver
+
+USAGE: llvm-ar [options] [-]<operation>[modifiers] [relpos] <archive> [files]
+       llvm-ar -M [<mri-script]
+
+OPTIONS:
+  --format              - Archive format to create
+    =default            -   default
+    =gnu                -   gnu
+    =darwin             -   darwin
+    =bsd                -   bsd
+  --plugin=<string>     - Ignored for compatibility
+  --help                - Display available options
+  --version             - Display the version of this program
+
+OPERATIONS:
+  d - delete [files] from the archive
+  m - move [files] in the archive
+  p - print [files] found in the archive
+  q - quick append [files] to the archive
+  r - replace or insert [files] into the archive
+  s - act as ranlib
+  t - display contents of archive
+  x - extract [files] from the archive
+
+MODIFIERS:
+  [a] - put [files] after [relpos]
+  [b] - put [files] before [relpos] (same as [i])
+  [c] - do not warn if archive had to be created
+  [D] - use zero for timestamps and uids/gids (default)
+  [i] - put [files] before [relpos] (same as [b])
+  [l] - ignored for compatibility
+  [L] - add archive's contents
+  [o] - preserve original dates
+  [s] - create an archive index (cf. ranlib)
+  [S] - do not build a symbol table
+  [T] - create a thin archive
+  [u] - update only [files] newer than archive contents
+  [U] - use actual timestamps and uids/gids
+  [v] - be verbose about actions taken
+)";
+
+void printHelpMessage() {
+  if (Stem.contains_lower("ranlib"))
+    outs() << RanlibHelp;
+  else if (Stem.contains_lower("ar"))
+    outs() << ArHelp;
+}
+
+// Show the error message and exit.
+LLVM_ATTRIBUTE_NORETURN static void fail(Twine Error) {
+  errs() << ToolName << ": " << Error << ".\n";
+  printHelpMessage();
+  exit(1);
+}
+
+static void failIfError(std::error_code EC, Twine Context = "") {
+  if (!EC)
+    return;
+
+  std::string ContextStr = Context.str();
+  if (ContextStr == "")
+    fail(EC.message());
+  fail(Context + ": " + EC.message());
+}
+
+static void failIfError(Error E, Twine Context = "") {
+  if (!E)
+    return;
+
+  handleAllErrors(std::move(E), [&](const llvm::ErrorInfoBase &EIB) {
+    std::string ContextStr = Context.str();
+    if (ContextStr == "")
+      fail(EIB.message());
+    fail(Context + ": " + EIB.message());
+  });
+}
+
+static SmallVector<const char *, 256> PositionalArgs;
+
+static bool MRI;
+
+namespace {
+enum Format { Default, GNU, BSD, DARWIN, Unknown };
+}
+
+static Format FormatType = Default;
+
+static std::string Options;
+
+// This enumeration delineates the kinds of operations on an archive
+// that are permitted.
+enum ArchiveOperation {
+  Print,           ///< Print the contents of the archive
+  Delete,          ///< Delete the specified members
+  Move,            ///< Move members to end or as given by {a,b,i} modifiers
+  QuickAppend,     ///< Quickly append to end of archive
+  ReplaceOrInsert, ///< Replace or Insert members
+  DisplayTable,    ///< Display the table of contents
+  Extract,         ///< Extract files back to file system
+  CreateSymTab     ///< Create a symbol table in an existing archive
+};
+
+// Modifiers to follow operation to vary behavior
+static bool AddAfter = false;      ///< 'a' modifier
+static bool AddBefore = false;     ///< 'b' modifier
+static bool Create = false;        ///< 'c' modifier
+static bool OriginalDates = false; ///< 'o' modifier
+static bool OnlyUpdate = false;    ///< 'u' modifier
+static bool Verbose = false;       ///< 'v' modifier
+static bool Symtab = true;         ///< 's' modifier
+static bool Deterministic = true;  ///< 'D' and 'U' modifiers
+static bool Thin = false;          ///< 'T' modifier
+static bool AddLibrary = false;    ///< 'L' modifier
+
+// Relative Positional Argument (for insert/move). This variable holds
+// the name of the archive member to which the 'a', 'b' or 'i' modifier
+// refers. Only one of 'a', 'b' or 'i' can be specified so we only need
+// one variable.
+static std::string RelPos;
+
+// This variable holds the name of the archive file as given on the
+// command line.
+static std::string ArchiveName;
+
+// This variable holds the list of member files to proecess, as given
+// on the command line.
+static std::vector<StringRef> Members;
+
+// Extract the member filename from the command line for the [relpos] argument
+// associated with a, b, and i modifiers
+static void getRelPos() {
+  if (PositionalArgs.size() == 0)
+    fail("Expected [relpos] for a, b, or i modifier");
+  RelPos = PositionalArgs[0];
+  PositionalArgs.erase(PositionalArgs.begin());
+}
+
+// Get the archive file name from the command line
+static void getArchive() {
+  if (PositionalArgs.size() == 0)
+    fail("An archive name must be specified");
+  ArchiveName = PositionalArgs[0];
+  PositionalArgs.erase(PositionalArgs.begin());
+}
+
+// Copy over remaining items in PositionalArgs to our Members vector
+static void getMembers() {
+  for (auto &Arg : PositionalArgs)
+    Members.push_back(Arg);
+}
+
+std::vector<std::unique_ptr<MemoryBuffer>> ArchiveBuffers;
+std::vector<std::unique_ptr<object::Archive>> Archives;
+
+static object::Archive &readLibrary(const Twine &Library) {
+  auto BufOrErr = MemoryBuffer::getFile(Library, -1, false);
+  failIfError(BufOrErr.getError(), "Could not open library");
+  ArchiveBuffers.push_back(std::move(*BufOrErr));
+  auto LibOrErr =
+      object::Archive::create(ArchiveBuffers.back()->getMemBufferRef());
+  failIfError(errorToErrorCode(LibOrErr.takeError()),
+              "Could not parse library");
+  Archives.push_back(std::move(*LibOrErr));
+  return *Archives.back();
+}
+
+static void runMRIScript();
+
+// Parse the command line options as presented and return the operation
+// specified. Process all modifiers and check to make sure that constraints on
+// modifier/operation pairs have not been violated.
+static ArchiveOperation parseCommandLine() {
+  if (MRI) {
+    if (!PositionalArgs.empty() || !Options.empty())
+      fail("Cannot mix -M and other options");
+    runMRIScript();
+  }
+
+  // Keep track of number of operations. We can only specify one
+  // per execution.
+  unsigned NumOperations = 0;
+
+  // Keep track of the number of positional modifiers (a,b,i). Only
+  // one can be specified.
+  unsigned NumPositional = 0;
+
+  // Keep track of which operation was requested
+  ArchiveOperation Operation;
+
+  bool MaybeJustCreateSymTab = false;
+
+  for (unsigned i = 0; i < Options.size(); ++i) {
+    switch (Options[i]) {
+    case 'd':
+      ++NumOperations;
+      Operation = Delete;
+      break;
+    case 'm':
+      ++NumOperations;
+      Operation = Move;
+      break;
+    case 'p':
+      ++NumOperations;
+      Operation = Print;
+      break;
+    case 'q':
+      ++NumOperations;
+      Operation = QuickAppend;
+      break;
+    case 'r':
+      ++NumOperations;
+      Operation = ReplaceOrInsert;
+      break;
+    case 't':
+      ++NumOperations;
+      Operation = DisplayTable;
+      break;
+    case 'x':
+      ++NumOperations;
+      Operation = Extract;
+      break;
+    case 'c':
+      Create = true;
+      break;
+    case 'l': /* accepted but unused */
+      break;
+    case 'o':
+      OriginalDates = true;
+      break;
+    case 's':
+      Symtab = true;
+      MaybeJustCreateSymTab = true;
+      break;
+    case 'S':
+      Symtab = false;
+      break;
+    case 'u':
+      OnlyUpdate = true;
+      break;
+    case 'v':
+      Verbose = true;
+      break;
+    case 'a':
+      getRelPos();
+      AddAfter = true;
+      NumPositional++;
+      break;
+    case 'b':
+      getRelPos();
+      AddBefore = true;
+      NumPositional++;
+      break;
+    case 'i':
+      getRelPos();
+      AddBefore = true;
+      NumPositional++;
+      break;
+    case 'D':
+      Deterministic = true;
+      break;
+    case 'U':
+      Deterministic = false;
+      break;
+    case 'T':
+      Thin = true;
+      break;
+    case 'L':
+      AddLibrary = true;
+      break;
+    default:
+      fail(std::string("unknown option ") + Options[i]);
+    }
+  }
+
+  // At this point, the next thing on the command line must be
+  // the archive name.
+  getArchive();
+
+  // Everything on the command line at this point is a member.
+  getMembers();
+
+  if (NumOperations == 0 && MaybeJustCreateSymTab) {
+    NumOperations = 1;
+    Operation = CreateSymTab;
+    if (!Members.empty())
+      fail("The s operation takes only an archive as argument");
+  }
+
+  // Perform various checks on the operation/modifier specification
+  // to make sure we are dealing with a legal request.
+  if (NumOperations == 0)
+    fail("You must specify at least one of the operations");
+  if (NumOperations > 1)
+    fail("Only one operation may be specified");
+  if (NumPositional > 1)
+    fail("You may only specify one of a, b, and i modifiers");
+  if (AddAfter || AddBefore) {
+    if (Operation != Move && Operation != ReplaceOrInsert)
+      fail("The 'a', 'b' and 'i' modifiers can only be specified with "
+           "the 'm' or 'r' operations");
+  }
+  if (OriginalDates && Operation != Extract)
+    fail("The 'o' modifier is only applicable to the 'x' operation");
+  if (OnlyUpdate && Operation != ReplaceOrInsert)
+    fail("The 'u' modifier is only applicable to the 'r' operation");
+  if (AddLibrary && Operation != QuickAppend)
+    fail("The 'L' modifier is only applicable to the 'q' operation");
+
+  // Return the parsed operation to the caller
+  return Operation;
+}
+
+// Implements the 'p' operation. This function traverses the archive
+// looking for members that match the path list.
+static void doPrint(StringRef Name, const object::Archive::Child &C) {
+  if (Verbose)
+    outs() << "Printing " << Name << "\n";
+
+  Expected<StringRef> DataOrErr = C.getBuffer();
+  failIfError(DataOrErr.takeError());
+  StringRef Data = *DataOrErr;
+  outs().write(Data.data(), Data.size());
+}
+
+// Utility function for printing out the file mode when the 't' operation is in
+// verbose mode.
+static void printMode(unsigned mode) {
+  outs() << ((mode & 004) ? "r" : "-");
+  outs() << ((mode & 002) ? "w" : "-");
+  outs() << ((mode & 001) ? "x" : "-");
+}
+
+// Implement the 't' operation. This function prints out just
+// the file names of each of the members. However, if verbose mode is requested
+// ('v' modifier) then the file type, permission mode, user, group, size, and
+// modification time are also printed.
+static void doDisplayTable(StringRef Name, const object::Archive::Child &C) {
+  if (Verbose) {
+    Expected<sys::fs::perms> ModeOrErr = C.getAccessMode();
+    failIfError(ModeOrErr.takeError());
+    sys::fs::perms Mode = ModeOrErr.get();
+    printMode((Mode >> 6) & 007);
+    printMode((Mode >> 3) & 007);
+    printMode(Mode & 007);
+    Expected<unsigned> UIDOrErr = C.getUID();
+    failIfError(UIDOrErr.takeError());
+    outs() << ' ' << UIDOrErr.get();
+    Expected<unsigned> GIDOrErr = C.getGID();
+    failIfError(GIDOrErr.takeError());
+    outs() << '/' << GIDOrErr.get();
+    Expected<uint64_t> Size = C.getSize();
+    failIfError(Size.takeError());
+    outs() << ' ' << format("%6llu", Size.get());
+    auto ModTimeOrErr = C.getLastModified();
+    failIfError(ModTimeOrErr.takeError());
+    // Note: formatv() only handles the default TimePoint<>, which is in
+    // nanoseconds.
+    // TODO: fix format_provider<TimePoint<>> to allow other units.
+    sys::TimePoint<> ModTimeInNs = ModTimeOrErr.get();
+    outs() << ' ' << formatv("{0:%b %e %H:%M %Y}", ModTimeInNs);
+    outs() << ' ';
+  }
+
+  if (C.getParent()->isThin()) {
+    outs() << sys::path::parent_path(ArchiveName);
+    outs() << '/';
+  }
+  outs() << Name << "\n";
+}
+
+// Implement the 'x' operation. This function extracts files back to the file
+// system.
+static void doExtract(StringRef Name, const object::Archive::Child &C) {
+  // Retain the original mode.
+  Expected<sys::fs::perms> ModeOrErr = C.getAccessMode();
+  failIfError(ModeOrErr.takeError());
+  sys::fs::perms Mode = ModeOrErr.get();
+
+  int FD;
+  failIfError(sys::fs::openFileForWrite(sys::path::filename(Name), FD,
+                                        sys::fs::CD_CreateAlways,
+                                        sys::fs::F_None, Mode),
+              Name);
+
+  {
+    raw_fd_ostream file(FD, false);
+
+    // Get the data and its length
+    Expected<StringRef> BufOrErr = C.getBuffer();
+    failIfError(BufOrErr.takeError());
+    StringRef Data = BufOrErr.get();
+
+    // Write the data.
+    file.write(Data.data(), Data.size());
+  }
+
+  // If we're supposed to retain the original modification times, etc. do so
+  // now.
+  if (OriginalDates) {
+    auto ModTimeOrErr = C.getLastModified();
+    failIfError(ModTimeOrErr.takeError());
+    failIfError(
+        sys::fs::setLastAccessAndModificationTime(FD, ModTimeOrErr.get()));
+  }
+
+  if (close(FD))
+    fail("Could not close the file");
+}
+
+static bool shouldCreateArchive(ArchiveOperation Op) {
+  switch (Op) {
+  case Print:
+  case Delete:
+  case Move:
+  case DisplayTable:
+  case Extract:
+  case CreateSymTab:
+    return false;
+
+  case QuickAppend:
+  case ReplaceOrInsert:
+    return true;
+  }
+
+  llvm_unreachable("Missing entry in covered switch.");
+}
+
+static void performReadOperation(ArchiveOperation Operation,
+                                 object::Archive *OldArchive) {
+  if (Operation == Extract && OldArchive->isThin())
+    fail("extracting from a thin archive is not supported");
+
+  bool Filter = !Members.empty();
+  {
+    Error Err = Error::success();
+    for (auto &C : OldArchive->children(Err)) {
+      Expected<StringRef> NameOrErr = C.getName();
+      failIfError(NameOrErr.takeError());
+      StringRef Name = NameOrErr.get();
+
+      if (Filter) {
+        auto I = find(Members, Name);
+        if (I == Members.end())
+          continue;
+        Members.erase(I);
+      }
+
+      switch (Operation) {
+      default:
+        llvm_unreachable("Not a read operation");
+      case Print:
+        doPrint(Name, C);
+        break;
+      case DisplayTable:
+        doDisplayTable(Name, C);
+        break;
+      case Extract:
+        doExtract(Name, C);
+        break;
+      }
+    }
+    failIfError(std::move(Err));
+  }
+
+  if (Members.empty())
+    return;
+  for (StringRef Name : Members)
+    errs() << Name << " was not found\n";
+  exit(1);
+}
+
+static void addMember(std::vector<NewArchiveMember> &Members,
+                      StringRef FileName, int Pos = -1) {
+  Expected<NewArchiveMember> NMOrErr =
+      NewArchiveMember::getFile(FileName, Deterministic);
+  failIfError(NMOrErr.takeError(), FileName);
+
+  // Use the basename of the object path for the member name.
+  NMOrErr->MemberName = sys::path::filename(NMOrErr->MemberName);
+
+  if (Pos == -1)
+    Members.push_back(std::move(*NMOrErr));
+  else
+    Members[Pos] = std::move(*NMOrErr);
+}
+
+static void addMember(std::vector<NewArchiveMember> &Members,
+                      const object::Archive::Child &M, int Pos = -1) {
+  if (Thin && !M.getParent()->isThin())
+    fail("Cannot convert a regular archive to a thin one");
+  Expected<NewArchiveMember> NMOrErr =
+      NewArchiveMember::getOldMember(M, Deterministic);
+  failIfError(NMOrErr.takeError());
+  if (Pos == -1)
+    Members.push_back(std::move(*NMOrErr));
+  else
+    Members[Pos] = std::move(*NMOrErr);
+}
+
+static void addLibMember(std::vector<NewArchiveMember> &Members,
+                         StringRef FileName) {
+  Expected<NewArchiveMember> NMOrErr =
+      NewArchiveMember::getFile(FileName, Deterministic);
+  failIfError(NMOrErr.takeError(), FileName);
+  if (identify_magic(NMOrErr->Buf->getBuffer()) == file_magic::archive) {
+    object::Archive &Lib = readLibrary(FileName);
+    Error Err = Error::success();
+
+    for (auto &Child : Lib.children(Err))
+      addMember(Members, Child);
+
+    failIfError(std::move(Err));
+  } else {
+    // Use the basename of the object path for the member name.
+    NMOrErr->MemberName = sys::path::filename(NMOrErr->MemberName);
+    Members.push_back(std::move(*NMOrErr));
+  }
+}
+
+enum InsertAction {
+  IA_AddOldMember,
+  IA_AddNewMember,
+  IA_Delete,
+  IA_MoveOldMember,
+  IA_MoveNewMember
+};
+
+static InsertAction computeInsertAction(ArchiveOperation Operation,
+                                        const object::Archive::Child &Member,
+                                        StringRef Name,
+                                        std::vector<StringRef>::iterator &Pos) {
+  if (Operation == QuickAppend || Members.empty())
+    return IA_AddOldMember;
+
+  auto MI = find_if(Members, [Name](StringRef Path) {
+    return Name == sys::path::filename(Path);
+  });
+
+  if (MI == Members.end())
+    return IA_AddOldMember;
+
+  Pos = MI;
+
+  if (Operation == Delete)
+    return IA_Delete;
+
+  if (Operation == Move)
+    return IA_MoveOldMember;
+
+  if (Operation == ReplaceOrInsert) {
+    StringRef PosName = sys::path::filename(RelPos);
+    if (!OnlyUpdate) {
+      if (PosName.empty())
+        return IA_AddNewMember;
+      return IA_MoveNewMember;
+    }
+
+    // We could try to optimize this to a fstat, but it is not a common
+    // operation.
+    sys::fs::file_status Status;
+    failIfError(sys::fs::status(*MI, Status), *MI);
+    auto ModTimeOrErr = Member.getLastModified();
+    failIfError(ModTimeOrErr.takeError());
+    if (Status.getLastModificationTime() < ModTimeOrErr.get()) {
+      if (PosName.empty())
+        return IA_AddOldMember;
+      return IA_MoveOldMember;
+    }
+
+    if (PosName.empty())
+      return IA_AddNewMember;
+    return IA_MoveNewMember;
+  }
+  llvm_unreachable("No such operation");
+}
+
+// We have to walk this twice and computing it is not trivial, so creating an
+// explicit std::vector is actually fairly efficient.
+static std::vector<NewArchiveMember>
+computeNewArchiveMembers(ArchiveOperation Operation,
+                         object::Archive *OldArchive) {
+  std::vector<NewArchiveMember> Ret;
+  std::vector<NewArchiveMember> Moved;
+  int InsertPos = -1;
+  StringRef PosName = sys::path::filename(RelPos);
+  if (OldArchive) {
+    Error Err = Error::success();
+    for (auto &Child : OldArchive->children(Err)) {
+      int Pos = Ret.size();
+      Expected<StringRef> NameOrErr = Child.getName();
+      failIfError(NameOrErr.takeError());
+      StringRef Name = NameOrErr.get();
+      if (Name == PosName) {
+        assert(AddAfter || AddBefore);
+        if (AddBefore)
+          InsertPos = Pos;
+        else
+          InsertPos = Pos + 1;
+      }
+
+      std::vector<StringRef>::iterator MemberI = Members.end();
+      InsertAction Action =
+          computeInsertAction(Operation, Child, Name, MemberI);
+      switch (Action) {
+      case IA_AddOldMember:
+        addMember(Ret, Child);
+        break;
+      case IA_AddNewMember:
+        addMember(Ret, *MemberI);
+        break;
+      case IA_Delete:
+        break;
+      case IA_MoveOldMember:
+        addMember(Moved, Child);
+        break;
+      case IA_MoveNewMember:
+        addMember(Moved, *MemberI);
+        break;
+      }
+      if (MemberI != Members.end())
+        Members.erase(MemberI);
+    }
+    failIfError(std::move(Err));
+  }
+
+  if (Operation == Delete)
+    return Ret;
+
+  if (!RelPos.empty() && InsertPos == -1)
+    fail("Insertion point not found");
+
+  if (RelPos.empty())
+    InsertPos = Ret.size();
+
+  assert(unsigned(InsertPos) <= Ret.size());
+  int Pos = InsertPos;
+  for (auto &M : Moved) {
+    Ret.insert(Ret.begin() + Pos, std::move(M));
+    ++Pos;
+  }
+
+  if (AddLibrary) {
+    assert(Operation == QuickAppend);
+    for (auto &Member : Members)
+      addLibMember(Ret, Member);
+    return Ret;
+  }
+
+  for (unsigned I = 0; I != Members.size(); ++I)
+    Ret.insert(Ret.begin() + InsertPos, NewArchiveMember());
+  Pos = InsertPos;
+  for (auto &Member : Members) {
+    addMember(Ret, Member, Pos);
+    ++Pos;
+  }
+
+  return Ret;
+}
+
+static object::Archive::Kind getDefaultForHost() {
+  return Triple(sys::getProcessTriple()).isOSDarwin()
+             ? object::Archive::K_DARWIN
+             : object::Archive::K_GNU;
+}
+
+static object::Archive::Kind getKindFromMember(const NewArchiveMember &Member) {
+  Expected<std::unique_ptr<object::ObjectFile>> OptionalObject =
+      object::ObjectFile::createObjectFile(Member.Buf->getMemBufferRef());
+
+  if (OptionalObject)
+    return isa<object::MachOObjectFile>(**OptionalObject)
+               ? object::Archive::K_DARWIN
+               : object::Archive::K_GNU;
+
+  // squelch the error in case we had a non-object file
+  consumeError(OptionalObject.takeError());
+  return getDefaultForHost();
+}
+
+static void performWriteOperation(ArchiveOperation Operation,
+                                  object::Archive *OldArchive,
+                                  std::unique_ptr<MemoryBuffer> OldArchiveBuf,
+                                  std::vector<NewArchiveMember> *NewMembersP) {
+  std::vector<NewArchiveMember> NewMembers;
+  if (!NewMembersP)
+    NewMembers = computeNewArchiveMembers(Operation, OldArchive);
+
+  object::Archive::Kind Kind;
+  switch (FormatType) {
+  case Default:
+    if (Thin)
+      Kind = object::Archive::K_GNU;
+    else if (OldArchive)
+      Kind = OldArchive->kind();
+    else if (NewMembersP)
+      Kind = NewMembersP->size() ? getKindFromMember(NewMembersP->front())
+                                 : getDefaultForHost();
+    else
+      Kind = NewMembers.size() ? getKindFromMember(NewMembers.front())
+                               : getDefaultForHost();
+    break;
+  case GNU:
+    Kind = object::Archive::K_GNU;
+    break;
+  case BSD:
+    if (Thin)
+      fail("Only the gnu format has a thin mode");
+    Kind = object::Archive::K_BSD;
+    break;
+  case DARWIN:
+    if (Thin)
+      fail("Only the gnu format has a thin mode");
+    Kind = object::Archive::K_DARWIN;
+    break;
+  case Unknown:
+    llvm_unreachable("");
+  }
+
+  Error E =
+      writeArchive(ArchiveName, NewMembersP ? *NewMembersP : NewMembers, Symtab,
+                   Kind, Deterministic, Thin, std::move(OldArchiveBuf));
+  failIfError(std::move(E), ArchiveName);
+}
+
+static void createSymbolTable(object::Archive *OldArchive) {
+  // When an archive is created or modified, if the s option is given, the
+  // resulting archive will have a current symbol table. If the S option
+  // is given, it will have no symbol table.
+  // In summary, we only need to update the symbol table if we have none.
+  // This is actually very common because of broken build systems that think
+  // they have to run ranlib.
+  if (OldArchive->hasSymbolTable())
+    return;
+
+  performWriteOperation(CreateSymTab, OldArchive, nullptr, nullptr);
+}
+
+static void performOperation(ArchiveOperation Operation,
+                             object::Archive *OldArchive,
+                             std::unique_ptr<MemoryBuffer> OldArchiveBuf,
+                             std::vector<NewArchiveMember> *NewMembers) {
+  switch (Operation) {
+  case Print:
+  case DisplayTable:
+  case Extract:
+    performReadOperation(Operation, OldArchive);
+    return;
+
+  case Delete:
+  case Move:
+  case QuickAppend:
+  case ReplaceOrInsert:
+    performWriteOperation(Operation, OldArchive, std::move(OldArchiveBuf),
+                          NewMembers);
+    return;
+  case CreateSymTab:
+    createSymbolTable(OldArchive);
+    return;
+  }
+  llvm_unreachable("Unknown operation.");
+}
+
+static int performOperation(ArchiveOperation Operation,
+                            std::vector<NewArchiveMember> *NewMembers) {
+  // Create or open the archive object.
+  ErrorOr<std::unique_ptr<MemoryBuffer>> Buf =
+      MemoryBuffer::getFile(ArchiveName, -1, false);
+  std::error_code EC = Buf.getError();
+  if (EC && EC != errc::no_such_file_or_directory)
+    fail("error opening '" + ArchiveName + "': " + EC.message() + "!");
+
+  if (!EC) {
+    Error Err = Error::success();
+    object::Archive Archive(Buf.get()->getMemBufferRef(), Err);
+    EC = errorToErrorCode(std::move(Err));
+    failIfError(EC,
+                "error loading '" + ArchiveName + "': " + EC.message() + "!");
+    performOperation(Operation, &Archive, std::move(Buf.get()), NewMembers);
+    return 0;
+  }
+
+  assert(EC == errc::no_such_file_or_directory);
+
+  if (!shouldCreateArchive(Operation)) {
+    failIfError(EC, Twine("error loading '") + ArchiveName + "'");
+  } else {
+    if (!Create) {
+      // Produce a warning if we should and we're creating the archive
+      errs() << ToolName << ": creating " << ArchiveName << "\n";
+    }
+  }
+
+  performOperation(Operation, nullptr, nullptr, NewMembers);
+  return 0;
+}
+
+static void runMRIScript() {
+  enum class MRICommand { AddLib, AddMod, Create, Delete, Save, End, Invalid };
+
+  ErrorOr<std::unique_ptr<MemoryBuffer>> Buf = MemoryBuffer::getSTDIN();
+  failIfError(Buf.getError());
+  const MemoryBuffer &Ref = *Buf.get();
+  bool Saved = false;
+  std::vector<NewArchiveMember> NewMembers;
+
+  for (line_iterator I(Ref, /*SkipBlanks*/ false), E; I != E; ++I) {
+    StringRef Line = *I;
+    Line = Line.split(';').first;
+    Line = Line.split('*').first;
+    Line = Line.trim();
+    if (Line.empty())
+      continue;
+    StringRef CommandStr, Rest;
+    std::tie(CommandStr, Rest) = Line.split(' ');
+    Rest = Rest.trim();
+    if (!Rest.empty() && Rest.front() == '"' && Rest.back() == '"')
+      Rest = Rest.drop_front().drop_back();
+    auto Command = StringSwitch<MRICommand>(CommandStr.lower())
+                       .Case("addlib", MRICommand::AddLib)
+                       .Case("addmod", MRICommand::AddMod)
+                       .Case("create", MRICommand::Create)
+                       .Case("delete", MRICommand::Delete)
+                       .Case("save", MRICommand::Save)
+                       .Case("end", MRICommand::End)
+                       .Default(MRICommand::Invalid);
+
+    switch (Command) {
+    case MRICommand::AddLib: {
+      object::Archive &Lib = readLibrary(Rest);
+      {
+        Error Err = Error::success();
+        for (auto &Member : Lib.children(Err))
+          addMember(NewMembers, Member);
+        failIfError(std::move(Err));
+      }
+      break;
+    }
+    case MRICommand::AddMod:
+      addMember(NewMembers, Rest);
+      break;
+    case MRICommand::Create:
+      Create = true;
+      if (!ArchiveName.empty())
+        fail("Editing multiple archives not supported");
+      if (Saved)
+        fail("File already saved");
+      ArchiveName = Rest;
+      break;
+    case MRICommand::Delete: {
+      StringRef Name = sys::path::filename(Rest);
+      llvm::erase_if(NewMembers,
+                     [=](NewArchiveMember &M) { return M.MemberName == Name; });
+      break;
+    }
+    case MRICommand::Save:
+      Saved = true;
+      break;
+    case MRICommand::End:
+      break;
+    case MRICommand::Invalid:
+      fail("Unknown command: " + CommandStr);
+    }
+  }
+
+  // Nothing to do if not saved.
+  if (Saved)
+    performOperation(ReplaceOrInsert, &NewMembers);
+  exit(0);
+}
+
+static bool handleGenericOption(StringRef arg) {
+  if (arg == "-help" || arg == "--help") {
+    printHelpMessage();
+    return true;
+  }
+  if (arg == "-version" || arg == "--version") {
+    cl::PrintVersionMessage();
+    return true;
+  }
+  return false;
+}
+
+static int ar_main(int argc, char **argv) {
+  SmallVector<const char *, 0> Argv(argv, argv + argc);
+  BumpPtrAllocator Alloc;
+  StringSaver Saver(Alloc);
+  cl::ExpandResponseFiles(Saver, cl::TokenizeGNUCommandLine, Argv);
+  for (size_t i = 1; i < Argv.size(); ++i) {
+    StringRef Arg = Argv[i];
+    const char *match;
+    auto MatchFlagWithArg = [&](const char *expected) {
+      size_t len = strlen(expected);
+      if (Arg == expected) {
+        if (++i >= Argv.size())
+          fail(std::string(expected) + " requires an argument");
+        match = Argv[i];
+        return true;
+      }
+      if (Arg.startswith(expected) && Arg.size() > len && Arg[len] == '=') {
+        match = Arg.data() + len + 1;
+        return true;
+      }
+      return false;
+    };
+    if (handleGenericOption(Argv[i]))
+      return 0;
+    if (Arg == "--") {
+      for (; i < Argv.size(); ++i)
+        PositionalArgs.push_back(Argv[i]);
+      break;
+    }
+    if (Arg[0] == '-') {
+      if (Arg.startswith("--"))
+        Arg = Argv[i] + 2;
+      else
+        Arg = Argv[i] + 1;
+      if (Arg == "M") {
+        MRI = true;
+      } else if (MatchFlagWithArg("format")) {
+        FormatType = StringSwitch<Format>(match)
+                         .Case("default", Default)
+                         .Case("gnu", GNU)
+                         .Case("darwin", DARWIN)
+                         .Case("bsd", BSD)
+                         .Default(Unknown);
+        if (FormatType == Unknown)
+          fail(std::string("Invalid format ") + match);
+      } else if (MatchFlagWithArg("plugin")) {
+        // Ignored.
+      } else {
+        Options += Argv[i] + 1;
+      }
+    } else if (Options.empty()) {
+      Options += Argv[i];
+    } else {
+      PositionalArgs.push_back(Argv[i]);
+    }
+  }
+  ArchiveOperation Operation = parseCommandLine();
+  return performOperation(Operation, nullptr);
+}
+
+static int ranlib_main(int argc, char **argv) {
+  bool ArchiveSpecified = false;
+  for (int i = 1; i < argc; ++i) {
+    if (handleGenericOption(argv[i])) {
+      return 0;
+    } else {
+      if (ArchiveSpecified)
+        fail("Exactly one archive should be specified");
+      ArchiveSpecified = true;
+      ArchiveName = argv[i];
+    }
+  }
+  return performOperation(CreateSymTab, nullptr);
+}
+
+int main(int argc, char **argv) {
+  InitLLVM X(argc, argv);
+  ToolName = argv[0];
+
+  llvm::InitializeAllTargetInfos();
+  llvm::InitializeAllTargetMCs();
+  llvm::InitializeAllAsmParsers();
+
+  Stem = sys::path::stem(ToolName);
+  if (Stem.contains_lower("dlltool"))
+    return dlltoolDriverMain(makeArrayRef(argv, argc));
+
+  if (Stem.contains_lower("ranlib"))
+    return ranlib_main(argc, argv);
+
+  if (Stem.contains_lower("lib"))
+    return libDriverMain(makeArrayRef(argv, argc));
+
+  if (Stem.contains_lower("ar"))
+    return ar_main(argc, argv);
+  fail("Not ranlib, ar, lib or dlltool!");
+}
-- 
GitLab


From 11f7e93c1559c60c19b9a1bb35e4f74d9f68a1da Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Fri, 26 Oct 2018 17:48:50 +0000
Subject: [PATCH 0646/1116] [ADT] Use explicit constructors for DenseMapPair to
 work around compiler issues.

Inheriting constructors from std::pair caused clang-3.8 to treat some DenseMap
initializer_list constructor calls as ambiguous, which broke several bots. This
commit explicitly defines DenseMapPair's constructos to work around the issue.

https://reviews.llvm.org/D53726

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345411 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/ADT/DenseMap.h | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/include/llvm/ADT/DenseMap.h b/include/llvm/ADT/DenseMap.h
index ac1e5c632d3..1f50502fff9 100644
--- a/include/llvm/ADT/DenseMap.h
+++ b/include/llvm/ADT/DenseMap.h
@@ -40,7 +40,32 @@ namespace detail {
 template <typename KeyT, typename ValueT>
 struct DenseMapPair : public std::pair<KeyT, ValueT> {
 
-  using std::pair<KeyT, ValueT>::pair;
+  // FIXME: Switch to inheriting constructors when we drop support for older
+  //        clang versions.
+  // NOTE: This default constructor is declared with '{}' rather than
+  //       '= default' to work around a separate bug in clang-3.8. This can
+  //       also go when we switch to inheriting constructors.
+  DenseMapPair() {}
+
+  DenseMapPair(const KeyT &Key, const ValueT &Value)
+      : std::pair<KeyT, ValueT>(Key, Value) {}
+
+  DenseMapPair(KeyT &&Key, ValueT &&Value)
+      : std::pair<KeyT, ValueT>(std::move(Key), std::move(Value)) {}
+
+  template <typename AltKeyT, typename AltValueT>
+  DenseMapPair(AltKeyT &&AltKey, AltValueT &&AltValue,
+               typename std::enable_if<
+                   std::is_convertible<AltKeyT, KeyT>::value &&
+                   std::is_convertible<AltValueT, ValueT>::value>::type * = 0)
+      : std::pair<KeyT, ValueT>(std::forward<AltKeyT>(AltKey),
+                                std::forward<AltValueT>(AltValue)) {}
+
+  template <typename AltPairT>
+  DenseMapPair(AltPairT &&AltPair,
+               typename std::enable_if<std::is_convertible<
+                   AltPairT, std::pair<KeyT, ValueT>>::value>::type * = 0)
+      : std::pair<KeyT, ValueT>(std::forward<AltPairT>(AltPair)) {}
 
   KeyT &getFirst() { return std::pair<KeyT, ValueT>::first; }
   const KeyT &getFirst() const { return std::pair<KeyT, ValueT>::first; }
-- 
GitLab


From d87fb985a5dba12e80ebcb95f2a56ca20099c2a2 Mon Sep 17 00:00:00 2001
From: Christy Lee <christycylee@gmail.com>
Date: Fri, 26 Oct 2018 18:02:06 +0000
Subject: [PATCH 0647/1116] Pointer types were treated as zero-size by
 MergeICmps

Summary:
The visitICmp analysis function would record compares of pointer types, as size 0. This causes the resulting memcmp() call to have the wrong total size.
Found with "self-build" of clang/LLVM on Windows.

Reviewers: christylee, trentxintong, courbet

Reviewed By: courbet

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D53536

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345413 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/MergeICmps.cpp          |  3 +-
 .../MergeICmps/X86/int64-and-ptr.ll           | 39 +++++++++++++++++++
 2 files changed, 41 insertions(+), 1 deletion(-)
 create mode 100644 test/Transforms/MergeICmps/X86/int64-and-ptr.ll

diff --git a/lib/Transforms/Scalar/MergeICmps.cpp b/lib/Transforms/Scalar/MergeICmps.cpp
index f68662b488c..3633485d5d5 100644
--- a/lib/Transforms/Scalar/MergeICmps.cpp
+++ b/lib/Transforms/Scalar/MergeICmps.cpp
@@ -283,8 +283,9 @@ BCECmpBlock visitICmp(const ICmpInst *const CmpI,
     if (!Lhs.Base()) return {};
     auto Rhs = visitICmpLoadOperand(CmpI->getOperand(1));
     if (!Rhs.Base()) return {};
+    const auto &DL = CmpI->getModule()->getDataLayout();
     return BCECmpBlock(std::move(Lhs), std::move(Rhs),
-                       CmpI->getOperand(0)->getType()->getScalarSizeInBits());
+                       DL.getTypeSizeInBits(CmpI->getOperand(0)->getType()));
   }
   return {};
 }
diff --git a/test/Transforms/MergeICmps/X86/int64-and-ptr.ll b/test/Transforms/MergeICmps/X86/int64-and-ptr.ll
new file mode 100644
index 00000000000..78924aea9be
--- /dev/null
+++ b/test/Transforms/MergeICmps/X86/int64-and-ptr.ll
@@ -0,0 +1,39 @@
+; RUN: opt < %s -mtriple=x86_64-unknown-unknown -mergeicmps -S | FileCheck %s --check-prefix=X86
+
+; 8-byte int and 8-byte pointer should merge into a 16-byte memcpy.
+; X86: memcmp(i8* {{.*}}, i8* {{.*}}, i64 16)
+
+%struct.outer = type { i64, %struct.inner* }
+%struct.inner = type { i32, i32, i32 }
+
+; Function Attrs: nounwind uwtable
+define dso_local i1 @"?foo@@YAHAEAUouter@@0@Z"(%struct.outer* align 8 dereferenceable(16) %o1, %struct.outer* align 8 dereferenceable(116) %o2) local_unnamed_addr #0 {
+entry:
+  %p1 = getelementptr inbounds %struct.outer, %struct.outer* %o1, i64 0, i32 0
+  %0 = load i64, i64* %p1, align 8
+  %p11 = getelementptr inbounds %struct.outer, %struct.outer* %o2, i64 0, i32 0
+  %1 = load i64, i64* %p11, align 8
+  %cmp = icmp eq i64 %0, %1
+  br i1 %cmp, label %if.then, label %if.end5
+
+if.then:                                          ; preds = %entry
+  %p2 = getelementptr inbounds %struct.outer, %struct.outer* %o1, i64 0, i32 1
+  %2 = load %struct.inner*, %struct.inner** %p2, align 8
+  %p22 = getelementptr inbounds %struct.outer, %struct.outer* %o2, i64 0, i32 1
+  %3 = load %struct.inner*, %struct.inner** %p22, align 8
+  %cmp3 = icmp eq %struct.inner* %2, %3
+  br label %if.end5
+
+if.end5:                                          ; preds = %if.then, %entry
+  %rez.0 = phi i1 [ %cmp3, %if.then ], [ false, %entry ]
+  ret i1 %rez.0
+}
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
+
+attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { nounwind }
-- 
GitLab


From aff6cf491087ba32e338c9af076c9b7739c978a0 Mon Sep 17 00:00:00 2001
From: Lei Huang <lei@ca.ibm.com>
Date: Fri, 26 Oct 2018 18:09:36 +0000
Subject: [PATCH 0648/1116] [PowerPC] Improve BUILD_VECTOR of 4 i32s

Currently, for this node:
  vector int test(int a, int b, int c, int d) {
    return (vector int) { a, b, c, d };
  }

we get this on Power9:
  mtvsrdd 34, 5, 3
  mtvsrdd 35, 6, 4
  vmrgow 2, 3, 2

and this on Power8:
  mtvsrwz 0, 3
  mtvsrwz 1, 5
  mtvsrwz 2, 4
  mtvsrwz 3, 6
  xxmrghd 34, 1, 0
  xxmrghd 35, 3, 2
  vmrgow 2, 3, 2

This can be improved to this on LE Power9:
  rldimi 3, 4, 32, 0
  rldimi 5, 6, 32, 0
  mtvsrdd 34, 5, 3

and this on LE Power8
  rldimi 3, 4, 32, 0
  rldimi 5, 6, 32, 0
  mtvsrd 34, 3
  mtvsrd 35, 5
  xxpermdi 34, 35, 34, 0

This patch updates the TD pattern to generate the optimized sequence for both
Power8 and Power9 on LE and BE.

Differential Revision: https://reviews.llvm.org/D53494

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345414 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/PowerPC/PPCInstrVSX.td          |  32 ++--
 test/CodeGen/PowerPC/build-vector-tests.ll | 188 +++++++++------------
 2 files changed, 100 insertions(+), 120 deletions(-)

diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td
index 7a3141abc1b..6a4586002b2 100644
--- a/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/lib/Target/PowerPC/PPCInstrVSX.td
@@ -3873,10 +3873,11 @@ let AddedComplexity = 400 in {
                         (COPY_TO_REGCLASS (MTVSRD $A), VSRC),
                         (COPY_TO_REGCLASS (MTVSRD $B), VSRC), 0))>;
     def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)),
-              (VMRGOW (XXPERMDI (COPY_TO_REGCLASS (MTVSRWZ $A), VSRC),
-                                   (COPY_TO_REGCLASS (MTVSRWZ $C), VSRC), 0),
-                      (XXPERMDI (COPY_TO_REGCLASS (MTVSRWZ $B), VSRC),
-                                   (COPY_TO_REGCLASS (MTVSRWZ $D), VSRC), 0))>;
+              (XXPERMDI
+                (COPY_TO_REGCLASS
+                  (MTVSRD (RLDIMI AnyExts.B, AnyExts.A, 32, 0)), VSRC),
+                (COPY_TO_REGCLASS
+                  (MTVSRD (RLDIMI AnyExts.D, AnyExts.C, 32, 0)), VSRC), 0)>;
     def : Pat<(v4i32 (build_vector i32:$A, i32:$A, i32:$A, i32:$A)),
               (XXSPLTW (COPY_TO_REGCLASS (MTVSRWZ $A), VSRC), 1)>;
   }
@@ -3888,10 +3889,11 @@ let AddedComplexity = 400 in {
                         (COPY_TO_REGCLASS (MTVSRD $B), VSRC),
                         (COPY_TO_REGCLASS (MTVSRD $A), VSRC), 0))>;
     def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)),
-              (VMRGOW (XXPERMDI (COPY_TO_REGCLASS (MTVSRWZ $D), VSRC),
-                                   (COPY_TO_REGCLASS (MTVSRWZ $B), VSRC), 0),
-                      (XXPERMDI (COPY_TO_REGCLASS (MTVSRWZ $C), VSRC),
-                                   (COPY_TO_REGCLASS (MTVSRWZ $A), VSRC), 0))>;
+              (XXPERMDI
+                (COPY_TO_REGCLASS
+                  (MTVSRD (RLDIMI AnyExts.C, AnyExts.D, 32, 0)), VSRC),
+                (COPY_TO_REGCLASS
+                  (MTVSRD (RLDIMI AnyExts.A, AnyExts.B, 32, 0)), VSRC), 0)>;
     def : Pat<(v4i32 (build_vector i32:$A, i32:$A, i32:$A, i32:$A)),
               (XXSPLTW (COPY_TO_REGCLASS (MTVSRWZ $A), VSRC), 1)>;
   }
@@ -3944,10 +3946,9 @@ let AddedComplexity = 400 in {
     def : Pat<(v2i64 (build_vector i64:$rB, i64:$rA)),
               (v2i64 (MTVSRDD $rB, $rA))>;
     def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)),
-              (VMRGOW
-                (v4i32 (COPY_TO_REGCLASS (MTVSRDD AnyExts.A, AnyExts.C), VSRC)),
-                (v4i32
-                  (COPY_TO_REGCLASS (MTVSRDD AnyExts.B, AnyExts.D), VSRC)))>;
+              (MTVSRDD
+                (RLDIMI AnyExts.B, AnyExts.A, 32, 0),
+                (RLDIMI AnyExts.D, AnyExts.C, 32, 0))>;
   }
 
   let Predicates = [IsISA3_0, HasDirectMove, IsLittleEndian] in {
@@ -3957,10 +3958,9 @@ let AddedComplexity = 400 in {
     def : Pat<(v2i64 (build_vector i64:$rA, i64:$rB)),
               (v2i64 (MTVSRDD $rB, $rA))>;
     def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)),
-              (VMRGOW
-                (v4i32 (COPY_TO_REGCLASS (MTVSRDD AnyExts.D, AnyExts.B), VSRC)),
-                (v4i32
-                  (COPY_TO_REGCLASS (MTVSRDD AnyExts.C, AnyExts.A), VSRC)))>;
+              (MTVSRDD
+                (RLDIMI AnyExts.C, AnyExts.D, 32, 0),
+                (RLDIMI AnyExts.A, AnyExts.B, 32, 0))>;
   }
   // P9 Altivec instructions that can be used to build vectors.
   // Adding them to PPCInstrVSX.td rather than PPCAltivecVSX.td to compete
diff --git a/test/CodeGen/PowerPC/build-vector-tests.ll b/test/CodeGen/PowerPC/build-vector-tests.ll
index d192bafca23..6f65b189b75 100644
--- a/test/CodeGen/PowerPC/build-vector-tests.ll
+++ b/test/CodeGen/PowerPC/build-vector-tests.ll
@@ -838,28 +838,26 @@ entry:
 ; P9LE-LABEL: fromRegsi
 ; P8BE-LABEL: fromRegsi
 ; P8LE-LABEL: fromRegsi
-; P9BE-DAG: mtvsrdd [[REG1:v[0-9]+]], r3, r5
-; P9BE-DAG: mtvsrdd [[REG2:v[0-9]+]], r4, r6
-; P9BE: vmrgow v2, [[REG1]], [[REG2]]
+; P9BE-DAG: rldimi r6, r5, 32, 0
+; P9BE-DAG: rldimi r4, r3, 32, 0
+; P9BE: mtvsrdd v2, r4, r6
 ; P9BE: blr
-; P9LE-DAG: mtvsrdd [[REG1:v[0-9]+]], r5, r3
-; P9LE-DAG: mtvsrdd [[REG2:v[0-9]+]], r6, r4
-; P9LE: vmrgow v2, [[REG2]], [[REG1]]
+; P9LE-DAG: rldimi r3, r4, 32, 0
+; P9LE-DAG: rldimi r5, r6, 32, 0
+; P9LE: mtvsrdd v2, r5, r3
 ; P9LE: blr
-; P8BE-DAG: mtvsrwz {{[vf]}}[[REG1:[0-9]+]], r3
-; P8BE-DAG: mtvsrwz {{[vf]}}[[REG2:[0-9]+]], r4
-; P8BE-DAG: mtvsrwz {{[vf]}}[[REG3:[0-9]+]], r5
-; P8BE-DAG: mtvsrwz {{[vf]}}[[REG4:[0-9]+]], r6
-; P8BE-DAG: xxmrghd [[REG5:v[0-9]+]], {{[v][s]*}}[[REG1]], {{[v][s]*}}[[REG3]]
-; P8BE-DAG: xxmrghd [[REG6:v[0-9]+]], {{[v][s]*}}[[REG2]], {{[v][s]*}}[[REG4]]
-; P8BE: vmrgow v2, [[REG5]], [[REG6]]
-; P8LE-DAG: mtvsrwz {{[vf]}}[[REG1:[0-9]+]], r3
-; P8LE-DAG: mtvsrwz {{[vf]}}[[REG2:[0-9]+]], r4
-; P8LE-DAG: mtvsrwz {{[vf]}}[[REG3:[0-9]+]], r5
-; P8LE-DAG: mtvsrwz {{[vf]}}[[REG4:[0-9]+]], r6
-; P8LE: xxmrghd [[REG5:v[0-9]+]], {{[v][s]*}}[[REG3]], {{[v][s]*}}[[REG1]]
-; P8LE: xxmrghd [[REG6:v[0-9]+]], {{[v][s]*}}[[REG4]], {{[v][s]*}}[[REG2]]
-; P8LE: vmrgow v2, [[REG6]], [[REG5]]
+; P8BE-DAG: rldimi r6, r5, 32, 0
+; P8BE-DAG: rldimi r4, r3, 32, 0
+; P8BE-DAG: mtvsrd f[[REG1:[0-9]+]], r6
+; P8BE-DAG: mtvsrd f[[REG2:[0-9]+]], r4
+; P8BE-DAG: xxmrghd v2, vs[[REG2]], vs[[REG1]]
+; P8BE: blr
+; P8LE-DAG: rldimi r3, r4, 32, 0
+; P8LE-DAG: rldimi r5, r6, 32, 0
+; P8LE-DAG: mtvsrd f[[REG1:[0-9]+]], r3
+; P8LE-DAG: mtvsrd f[[REG2:[0-9]+]], r5
+; P8LE-DAG: xxmrghd v2, vs[[REG2]], vs[[REG1]]
+; P8LE: blr
 }
 
 ; Function Attrs: norecurse nounwind readnone
@@ -1065,38 +1063,34 @@ entry:
 ; P9BE: lwz
 ; P9BE: lwz
 ; P9BE: lwz
+; P9BE: rldimi
+; P9BE: rldimi
 ; P9BE: mtvsrdd
-; P9BE: mtvsrdd
-; P9BE: vmrgow
 ; P9LE: lwz
 ; P9LE: lwz
 ; P9LE: lwz
 ; P9LE: lwz
+; P9LE: rldimi
+; P9LE: rldimi
 ; P9LE: mtvsrdd
-; P9LE: mtvsrdd
-; P9LE: vmrgow
 ; P8BE: lwz
 ; P8BE: lwz
 ; P8BE: lwz
 ; P8BE: lwz
-; P8BE: mtvsrwz
-; P8BE: mtvsrwz
-; P8BE: mtvsrwz
-; P8BE: mtvsrwz
-; P8BE: xxmrghd
+; P8BE: rldimi
+; P8BE: rldimi
+; P8BE: mtvsrd
+; P8BE: mtvsrd
 ; P8BE: xxmrghd
-; P8BE: vmrgow
 ; P8LE: lwz
 ; P8LE: lwz
 ; P8LE: lwz
 ; P8LE: lwz
-; P8LE: mtvsrwz
-; P8LE: mtvsrwz
-; P8LE: mtvsrwz
-; P8LE: mtvsrwz
-; P8LE: xxmrghd
+; P8LE: rldimi
+; P8LE: rldimi
+; P8LE: mtvsrd
+; P8LE: mtvsrd
 ; P8LE: xxmrghd
-; P8LE: vmrgow
 }
 
 ; Function Attrs: norecurse nounwind readonly
@@ -1132,41 +1126,37 @@ entry:
 ; P9BE: lwz
 ; P9BE: lwz
 ; P9BE: lwz
+; P9BE: rldimi
+; P9BE: rldimi
 ; P9BE: mtvsrdd
-; P9BE: mtvsrdd
-; P9BE: vmrgow
 ; P9LE: sldi r4, r4, 2
 ; P9LE: lwz
 ; P9LE: lwz
 ; P9LE: lwz
 ; P9LE: lwz
+; P9LE: rldimi
+; P9LE: rldimi
 ; P9LE: mtvsrdd
-; P9LE: mtvsrdd
-; P9LE: vmrgow
 ; P8BE: sldi r4, r4, 2
 ; P8BE: lwz
 ; P8BE: lwz
 ; P8BE: lwz
 ; P8BE: lwz
-; P8BE: mtvsrwz
-; P8BE: mtvsrwz
-; P8BE: mtvsrwz
-; P8BE: mtvsrwz
-; P8BE: xxmrghd
+; P8BE: rldimi
+; P8BE: rldimi
+; P8BE: mtvsrd
+; P8BE: mtvsrd
 ; P8BE: xxmrghd
-; P8BE: vmrgow
 ; P8LE: sldi r4, r4, 2
 ; P8LE: lwz
 ; P8LE: lwz
 ; P8LE: lwz
 ; P8LE: lwz
-; P8LE: mtvsrwz
-; P8LE: mtvsrwz
-; P8LE: mtvsrwz
-; P8LE: mtvsrwz
-; P8LE: xxmrghd
+; P8LE: rldimi
+; P8LE: rldimi
+; P8LE: mtvsrd
+; P8LE: mtvsrd
 ; P8LE: xxmrghd
-; P8LE: vmrgow
 }
 
 ; Function Attrs: norecurse nounwind readnone
@@ -1978,28 +1968,26 @@ entry:
 ; P9LE-LABEL: fromRegsui
 ; P8BE-LABEL: fromRegsui
 ; P8LE-LABEL: fromRegsui
-; P9BE-DAG: mtvsrdd [[REG1:v[0-9]+]], r3, r5
-; P9BE-DAG: mtvsrdd [[REG2:v[0-9]+]], r4, r6
-; P9BE: vmrgow v2, [[REG1]], [[REG2]]
+; P9BE-DAG: rldimi r6, r5, 32, 0
+; P9BE-DAG: rldimi r4, r3, 32, 0
+; P9BE: mtvsrdd v2, r4, r6
 ; P9BE: blr
-; P9LE-DAG: mtvsrdd [[REG1:v[0-9]+]], r5, r3
-; P9LE-DAG: mtvsrdd [[REG2:v[0-9]+]], r6, r4
-; P9LE: vmrgow v2, [[REG2]], [[REG1]]
+; P9LE-DAG: rldimi r3, r4, 32, 0
+; P9LE-DAG: rldimi r5, r6, 32, 0
+; P9LE: mtvsrdd v2, r5, r3
 ; P9LE: blr
-; P8BE-DAG: mtvsrwz {{[vf]}}[[REG1:[0-9]+]], r3
-; P8BE-DAG: mtvsrwz {{[vf]}}[[REG2:[0-9]+]], r4
-; P8BE-DAG: mtvsrwz {{[vf]}}[[REG3:[0-9]+]], r5
-; P8BE-DAG: mtvsrwz {{[vf]}}[[REG4:[0-9]+]], r6
-; P8BE-DAG: xxmrghd [[REG5:v[0-9]+]], {{[v][s]*}}[[REG1]], {{[v][s]*}}[[REG3]]
-; P8BE-DAG: xxmrghd [[REG6:v[0-9]+]], {{[v][s]*}}[[REG2]], {{[v][s]*}}[[REG4]]
-; P8BE: vmrgow v2, [[REG5]], [[REG6]]
-; P8LE-DAG: mtvsrwz {{[vf]}}[[REG1:[0-9]+]], r3
-; P8LE-DAG: mtvsrwz {{[vf]}}[[REG2:[0-9]+]], r4
-; P8LE-DAG: mtvsrwz {{[vf]}}[[REG3:[0-9]+]], r5
-; P8LE-DAG: mtvsrwz {{[vf]}}[[REG4:[0-9]+]], r6
-; P8LE: xxmrghd [[REG5:v[0-9]+]], {{[v][s]*}}[[REG3]], {{[v][s]*}}[[REG1]]
-; P8LE: xxmrghd [[REG6:v[0-9]+]], {{[v][s]*}}[[REG4]], {{[v][s]*}}[[REG2]]
-; P8LE: vmrgow v2, [[REG6]], [[REG5]]
+; P8BE-DAG: rldimi r6, r5, 32, 0
+; P8BE-DAG: rldimi r4, r3, 32, 0
+; P8BE-DAG: mtvsrd f[[REG1:[0-9]+]], r6
+; P8BE-DAG: mtvsrd f[[REG2:[0-9]+]], r4
+; P8BE-DAG: xxmrghd v2, vs[[REG2]], vs[[REG1]]
+; P8BE: blr
+; P8LE-DAG: rldimi r3, r4, 32, 0
+; P8LE-DAG: rldimi r5, r6, 32, 0
+; P8LE-DAG: mtvsrd f[[REG1:[0-9]+]], r3
+; P8LE-DAG: mtvsrd f[[REG2:[0-9]+]], r5
+; P8LE-DAG: xxmrghd v2, vs[[REG2]], vs[[REG1]]
+; P8LE: blr
 }
 
 ; Function Attrs: norecurse nounwind readnone
@@ -2207,38 +2195,34 @@ entry:
 ; P9BE: lwz
 ; P9BE: lwz
 ; P9BE: lwz
+; P9BE: rldimi
+; P9BE: rldimi
 ; P9BE: mtvsrdd
-; P9BE: mtvsrdd
-; P9BE: vmrgow
 ; P9LE: lwz
 ; P9LE: lwz
 ; P9LE: lwz
 ; P9LE: lwz
+; P9LE: rldimi
+; P9LE: rldimi
 ; P9LE: mtvsrdd
-; P9LE: mtvsrdd
-; P9LE: vmrgow
 ; P8BE: lwz
 ; P8BE: lwz
 ; P8BE: lwz
 ; P8BE: lwz
-; P8BE: mtvsrwz
-; P8BE: mtvsrwz
-; P8BE: mtvsrwz
-; P8BE: mtvsrwz
-; P8BE: xxmrghd
+; P8BE: rldimi
+; P8BE: rldimi
+; P8BE: mtvsrd
+; P8BE: mtvsrd
 ; P8BE: xxmrghd
-; P8BE: vmrgow
 ; P8LE: lwz
 ; P8LE: lwz
 ; P8LE: lwz
 ; P8LE: lwz
-; P8LE: mtvsrwz
-; P8LE: mtvsrwz
-; P8LE: mtvsrwz
-; P8LE: mtvsrwz
-; P8LE: xxmrghd
+; P8LE: rldimi
+; P8LE: rldimi
+; P8LE: mtvsrd
+; P8LE: mtvsrd
 ; P8LE: xxmrghd
-; P8LE: vmrgow
 }
 
 ; Function Attrs: norecurse nounwind readonly
@@ -2274,41 +2258,37 @@ entry:
 ; P9BE: lwz
 ; P9BE: lwz
 ; P9BE: lwz
+; P9BE: rldimi
+; P9BE: rldimi
 ; P9BE: mtvsrdd
-; P9BE: mtvsrdd
-; P9BE: vmrgow
 ; P9LE: sldi r4, r4, 2
 ; P9LE: lwz
 ; P9LE: lwz
 ; P9LE: lwz
 ; P9LE: lwz
+; P9LE: rldimi
+; P9LE: rldimi
 ; P9LE: mtvsrdd
-; P9LE: mtvsrdd
-; P9LE: vmrgow
 ; P8BE: sldi r4, r4, 2
 ; P8BE: lwz
 ; P8BE: lwz
 ; P8BE: lwz
 ; P8BE: lwz
-; P8BE: mtvsrwz
-; P8BE: mtvsrwz
-; P8BE: mtvsrwz
-; P8BE: mtvsrwz
-; P8BE: xxmrghd
+; P8BE: rldimi
+; P8BE: rldimi
+; P8BE: mtvsrd
+; P8BE: mtvsrd
 ; P8BE: xxmrghd
-; P8BE: vmrgow
 ; P8LE: sldi r4, r4, 2
 ; P8LE: lwz
 ; P8LE: lwz
 ; P8LE: lwz
 ; P8LE: lwz
-; P8LE: mtvsrwz
-; P8LE: mtvsrwz
-; P8LE: mtvsrwz
-; P8LE: mtvsrwz
-; P8LE: xxmrghd
+; P8LE: rldimi
+; P8LE: rldimi
+; P8LE: mtvsrd
+; P8LE: mtvsrd
 ; P8LE: xxmrghd
-; P8LE: vmrgow
 }
 
 ; Function Attrs: norecurse nounwind readnone
-- 
GitLab


From d8ceaaa597ec44a75e9a45aacca7516bd04857ab Mon Sep 17 00:00:00 2001
From: Yi Kong <yikong@google.com>
Date: Fri, 26 Oct 2018 18:25:27 +0000
Subject: [PATCH 0649/1116] [XRay] Use std::errc::invalid_argument instead of
 std::errc::bad_message

This change should appease the mingw32 builds.

Similar to r293725.

Differential Revision: https://reviews.llvm.org/D53742


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345416 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/XRay/RecordInitializer.cpp | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/lib/XRay/RecordInitializer.cpp b/lib/XRay/RecordInitializer.cpp
index 7f9fd4c9627..fe76f7d79fb 100644
--- a/lib/XRay/RecordInitializer.cpp
+++ b/lib/XRay/RecordInitializer.cpp
@@ -20,7 +20,7 @@ Error RecordInitializer::visit(BufferExtents &R) {
   auto PreReadOffset = OffsetPtr;
   R.Size = E.getU64(&OffsetPtr);
   if (PreReadOffset == OffsetPtr)
-    return createStringError(std::make_error_code(std::errc::bad_message),
+    return createStringError(std::make_error_code(std::errc::invalid_argument),
                              "Cannot read buffer extent at offset %d.",
                              OffsetPtr);
 
@@ -39,14 +39,14 @@ Error RecordInitializer::visit(WallclockRecord &R) {
   R.Seconds = E.getU64(&OffsetPtr);
   if (OffsetPtr == PreReadOffset)
     return createStringError(
-        std::make_error_code(std::errc::bad_message),
+        std::make_error_code(std::errc::invalid_argument),
         "Cannot read wall clock 'seconds' field at offset %d.", OffsetPtr);
 
   PreReadOffset = OffsetPtr;
   R.Nanos = E.getU32(&OffsetPtr);
   if (OffsetPtr == PreReadOffset)
     return createStringError(
-        std::make_error_code(std::errc::bad_message),
+        std::make_error_code(std::errc::invalid_argument),
         "Cannot read wall clock 'nanos' field at offset %d.", OffsetPtr);
 
   // Align to metadata record size boundary.
@@ -65,13 +65,13 @@ Error RecordInitializer::visit(NewCPUIDRecord &R) {
   auto PreReadOffset = OffsetPtr;
   R.CPUId = E.getU16(&OffsetPtr);
   if (OffsetPtr == PreReadOffset)
-    return createStringError(std::make_error_code(std::errc::bad_message),
+    return createStringError(std::make_error_code(std::errc::invalid_argument),
                              "Cannot read CPU id at offset %d.", OffsetPtr);
 
   PreReadOffset = OffsetPtr;
   R.TSC = E.getU64(&OffsetPtr);
   if (OffsetPtr == PreReadOffset)
-    return createStringError(std::make_error_code(std::errc::bad_message),
+    return createStringError(std::make_error_code(std::errc::invalid_argument),
                              "Cannot read CPU TSC at offset %d.", OffsetPtr);
 
   OffsetPtr += MetadataRecord::kMetadataBodySize - (OffsetPtr - BeginOffset);
@@ -88,7 +88,7 @@ Error RecordInitializer::visit(TSCWrapRecord &R) {
   auto PreReadOffset = OffsetPtr;
   R.BaseTSC = E.getU64(&OffsetPtr);
   if (PreReadOffset == OffsetPtr)
-    return createStringError(std::make_error_code(std::errc::bad_message),
+    return createStringError(std::make_error_code(std::errc::invalid_argument),
                              "Cannot read TSC wrap record at offset %d.",
                              OffsetPtr);
 
@@ -108,14 +108,14 @@ Error RecordInitializer::visit(CustomEventRecord &R) {
   R.Size = E.getSigned(&OffsetPtr, sizeof(int32_t));
   if (PreReadOffset == OffsetPtr)
     return createStringError(
-        std::make_error_code(std::errc::bad_message),
+        std::make_error_code(std::errc::invalid_argument),
         "Cannot read a custom event record size field offset %d.", OffsetPtr);
 
   PreReadOffset = OffsetPtr;
   R.TSC = E.getU64(&OffsetPtr);
   if (PreReadOffset == OffsetPtr)
     return createStringError(
-        std::make_error_code(std::errc::bad_message),
+        std::make_error_code(std::errc::invalid_argument),
         "Cannot read a custom event TSC field at offset %d.", OffsetPtr);
 
   OffsetPtr += MetadataRecord::kMetadataBodySize - (OffsetPtr - BeginOffset);
@@ -131,7 +131,7 @@ Error RecordInitializer::visit(CustomEventRecord &R) {
   Buffer.resize(R.Size);
   if (E.getU8(&OffsetPtr, Buffer.data(), R.Size) != Buffer.data())
     return createStringError(
-        std::make_error_code(std::errc::bad_message),
+        std::make_error_code(std::errc::invalid_argument),
         "Failed reading data into buffer of size %d at offset %d.", R.Size,
         OffsetPtr);
   R.Data.assign(Buffer.begin(), Buffer.end());
@@ -148,7 +148,7 @@ Error RecordInitializer::visit(CallArgRecord &R) {
   auto PreReadOffset = OffsetPtr;
   R.Arg = E.getU64(&OffsetPtr);
   if (PreReadOffset == OffsetPtr)
-    return createStringError(std::make_error_code(std::errc::bad_message),
+    return createStringError(std::make_error_code(std::errc::invalid_argument),
                              "Cannot read a call arg record at offset %d.",
                              OffsetPtr);
 
@@ -166,7 +166,7 @@ Error RecordInitializer::visit(PIDRecord &R) {
   auto PreReadOffset = OffsetPtr;
   R.PID = E.getSigned(&OffsetPtr, 4);
   if (PreReadOffset == OffsetPtr)
-    return createStringError(std::make_error_code(std::errc::bad_message),
+    return createStringError(std::make_error_code(std::errc::invalid_argument),
                              "Cannot read a process ID record at offset %d.",
                              OffsetPtr);
 
@@ -184,7 +184,7 @@ Error RecordInitializer::visit(NewBufferRecord &R) {
   auto PreReadOffset = OffsetPtr;
   R.TID = E.getSigned(&OffsetPtr, sizeof(int32_t));
   if (PreReadOffset == OffsetPtr)
-    return createStringError(std::make_error_code(std::errc::bad_message),
+    return createStringError(std::make_error_code(std::errc::invalid_argument),
                              "Cannot read a new buffer record at offset %d.",
                              OffsetPtr);
 
@@ -234,7 +234,7 @@ Error RecordInitializer::visit(FunctionRecord &R) {
     R.Kind = static_cast<RecordTypes>(FunctionType);
     break;
   default:
-    return createStringError(std::make_error_code(std::errc::bad_message),
+    return createStringError(std::make_error_code(std::errc::invalid_argument),
                              "Unknown function record type '%d' at offset %d.",
                              FunctionType, BeginOffset);
   }
@@ -243,7 +243,7 @@ Error RecordInitializer::visit(FunctionRecord &R) {
   PreReadOffset = OffsetPtr;
   R.Delta = E.getU32(&OffsetPtr);
   if (OffsetPtr == PreReadOffset)
-    return createStringError(std::make_error_code(std::errc::bad_message),
+    return createStringError(std::make_error_code(std::errc::invalid_argument),
                              "Failed reading TSC delta from offset %d.",
                              OffsetPtr);
   assert(FunctionRecord::kFunctionRecordSize == (OffsetPtr - BeginOffset));
-- 
GitLab


From d1f07078e8db8d6d86edec4aef5eb4c2b65a9739 Mon Sep 17 00:00:00 2001
From: Eli Friedman <efriedma@codeaurora.org>
Date: Fri, 26 Oct 2018 19:32:24 +0000
Subject: [PATCH 0650/1116] [ARM] Make InstrEmitter mark CPSR defs dead for
 Thumb1.

The "dead" markings allow existing target-independent optimizations,
like MachineSink, to trigger more frequently. The CPSR defs would have
eventually been marked dead by LiveVariables, so this only affects
optimizations before regalloc.

The ARMBaseInstrInfo.cpp change is fixing a bug which is only visible
with this change: the transform adds a use to an otherwise dead def
of CPSR. This is covered by existing regression tests.

thumb2-tbh.ll breaks for Thumb1 due to MachineLICM changing the
generated code; I'll fix it in D53452.

Differential Revision: https://reviews.llvm.org/D53453


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345420 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/InstrEmitter.cpp     |   2 +-
 lib/Target/ARM/ARMBaseInstrInfo.cpp           |   2 +
 test/CodeGen/ARM/arm-and-tst-peephole.ll      |   3 +-
 test/CodeGen/ARM/cmn.ll                       |  26 +-
 test/CodeGen/ARM/intrinsics-overflow.ll       |  19 +-
 test/CodeGen/ARM/select-imm.ll                |  10 +-
 test/CodeGen/Thumb/branchless-cmp.ll          |  15 +-
 .../Thumb/consthoist-few-dependents.ll        |   1 -
 test/CodeGen/Thumb/select.ll                  |   4 +-
 .../Thumb/umulo-128-legalisation-lowering.ll  | 356 ++++++++++--------
 test/CodeGen/Thumb2/thumb2-tbh.ll             |   9 +-
 11 files changed, 235 insertions(+), 212 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 9f57df87fb2..fc9c227e4df 100644
--- a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -959,7 +959,7 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
   }
 
   // Finally mark unused registers as dead.
-  if (!UsedRegs.empty() || II.getImplicitDefs())
+  if (!UsedRegs.empty() || II.getImplicitDefs() || II.hasOptionalDef())
     MIB->setPhysRegsDeadExcept(UsedRegs, *TRI);
 
   // Run post-isel target hook to adjust this instruction if needed.
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 0d1908ada7f..c9d78df4b37 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -2963,6 +2963,8 @@ bool ARMBaseInstrInfo::optimizeCompareInstr(
   for (unsigned i = 0, e = OperandsToUpdate.size(); i < e; i++)
     OperandsToUpdate[i].first->setImm(OperandsToUpdate[i].second);
 
+  MI->clearRegisterDeads(ARM::CPSR);
+
   return true;
 }
 
diff --git a/test/CodeGen/ARM/arm-and-tst-peephole.ll b/test/CodeGen/ARM/arm-and-tst-peephole.ll
index c6ca6a624b1..8e38f18c069 100644
--- a/test/CodeGen/ARM/arm-and-tst-peephole.ll
+++ b/test/CodeGen/ARM/arm-and-tst-peephole.ll
@@ -153,11 +153,10 @@ define i32 @test_tst_assessment(i32 %a, i32 %b) {
 ; THUMB-NEXT:    movs r2, r0
 ; THUMB-NEXT:    movs r0, #1
 ; THUMB-NEXT:    ands r0, r2
-; THUMB-NEXT:    subs r2, r0, #1
 ; THUMB-NEXT:    lsls r1, r1, #31
 ; THUMB-NEXT:    beq .LBB2_2
 ; THUMB-NEXT:  @ %bb.1:
-; THUMB-NEXT:    movs r0, r2
+; THUMB-NEXT:    subs r0, r0, #1
 ; THUMB-NEXT:  .LBB2_2:
 ; THUMB-NEXT:    bx lr
 ;
diff --git a/test/CodeGen/ARM/cmn.ll b/test/CodeGen/ARM/cmn.ll
index 9321527a9e2..fbcee5196b6 100644
--- a/test/CodeGen/ARM/cmn.ll
+++ b/test/CodeGen/ARM/cmn.ll
@@ -15,16 +15,15 @@ define i32 @compare_i_gt(i32 %a) {
 ;
 ; T1-LABEL: compare_i_gt:
 ; T1:       @ %bb.0: @ %entry
-; T1-NEXT:    mov r1, r0
-; T1-NEXT:    movs r0, #77
-; T1-NEXT:    mvns r3, r0
-; T1-NEXT:    movs r0, #42
-; T1-NEXT:    movs r2, #24
-; T1-NEXT:    cmp r1, r3
+; T1-NEXT:    movs r1, #77
+; T1-NEXT:    mvns r1, r1
+; T1-NEXT:    cmp r0, r1
 ; T1-NEXT:    bgt .LBB0_2
 ; T1-NEXT:  @ %bb.1: @ %entry
-; T1-NEXT:    mov r0, r2
-; T1-NEXT:  .LBB0_2: @ %entry
+; T1-NEXT:    movs r0, #24
+; T1-NEXT:    bx lr
+; T1-NEXT:  .LBB0_2:
+; T1-NEXT:    movs r0, #42
 ; T1-NEXT:    bx lr
 entry:
   %cmp = icmp sgt i32 %a, -78
@@ -44,14 +43,13 @@ define i32 @compare_r_eq(i32 %a, i32 %b) {
 ;
 ; T1-LABEL: compare_r_eq:
 ; T1:       @ %bb.0: @ %entry
-; T1-NEXT:    mov r2, r0
-; T1-NEXT:    movs r0, #42
-; T1-NEXT:    movs r3, #24
-; T1-NEXT:    cmn r2, r1
+; T1-NEXT:    cmn r0, r1
 ; T1-NEXT:    beq .LBB1_2
 ; T1-NEXT:  @ %bb.1: @ %entry
-; T1-NEXT:    mov r0, r3
-; T1-NEXT:  .LBB1_2: @ %entry
+; T1-NEXT:    movs r0, #24
+; T1-NEXT:    bx lr
+; T1-NEXT:  .LBB1_2:
+; T1-NEXT:    movs r0, #42
 ; T1-NEXT:    bx lr
 entry:
   %sub = sub nsw i32 0, %b
diff --git a/test/CodeGen/ARM/intrinsics-overflow.ll b/test/CodeGen/ARM/intrinsics-overflow.ll
index 835be7e949d..d4c20dfacce 100644
--- a/test/CodeGen/ARM/intrinsics-overflow.ll
+++ b/test/CodeGen/ARM/intrinsics-overflow.ll
@@ -38,14 +38,9 @@ define i32 @sadd_overflow(i32 %a, i32 %b) #0 {
   ; ARM: movvc r[[R0]], #0
   ; ARM: mov pc, lr
 
-  ; THUMBV6: mov  r[[R2:[0-9]+]], r[[R0:[0-9]+]]
-  ; THUMBV6: adds r[[R3:[0-9]+]], r[[R0]], r[[R1:[0-9]+]]
-  ; THUMBV6: movs r[[R0]], #0
-  ; THUMBV6: movs r[[R1]], #1
-  ; THUMBV6: cmp  r[[R3]], r[[R2]]
-  ; THUMBV6: bvc  .L[[LABEL:.*]]
-  ; THUMBV6: mov  r[[R0]], r[[R1]]
-  ; THUMBV6: .L[[LABEL]]:
+  ; THUMBV6: adds    r1, r0, r1
+  ; THUMBV6: cmp     r1, r0
+  ; THUMBV6: bvc     .LBB1_2
 
   ; THUMBV7: adds  r[[R2:[0-9]+]], r[[R0]], r[[R1:[0-9]+]]
   ; THUMBV7: mov.w r[[R0:[0-9]+]], #1
@@ -94,12 +89,8 @@ define i32 @ssub_overflow(i32 %a, i32 %b) #0 {
   ; ARM: cmp r[[R0]], r[[R1]]
   ; ARM: movvc r[[R2]], #0
 
-  ; THUMBV6: movs    r[[R0]], #0
-  ; THUMBV6: movs    r[[R3:[0-9]+]], #1
-  ; THUMBV6: cmp     r[[R2]], r[[R1:[0-9]+]]
-  ; THUMBV6: bvc     .L[[LABEL:.*]]
-  ; THUMBV6: mov     r[[R0]], r[[R3]]
-  ; THUMBV6: .L[[LABEL]]:
+  ; THUMBV6: cmp     r0, r1
+  ; THUMBV6: bvc     .LBB3_2
 
   ; THUMBV7: movs  r[[R2:[0-9]+]], #1
   ; THUMBV7: cmp   r[[R0:[0-9]+]], r[[R1:[0-9]+]]
diff --git a/test/CodeGen/ARM/select-imm.ll b/test/CodeGen/ARM/select-imm.ll
index c0cebf833a0..04f6d252e27 100644
--- a/test/CodeGen/ARM/select-imm.ll
+++ b/test/CodeGen/ARM/select-imm.ll
@@ -24,12 +24,8 @@ entry:
 ; ARMT2: movwgt [[R]], #123
 
 ; THUMB1-LABEL: t1:
-; THUMB1: mov     r1, r0
-; THUMB1: movs    r2, #255
-; THUMB1: adds    r2, #102
-; THUMB1: movs    r0, #123
-; THUMB1: cmp     r1, #1
-; THUMB1: bgt
+; THUMB1: cmp     r0, #1
+; THUMB1: bgt     .LBB0_2
 
 ; THUMB2-LABEL: t1:
 ; THUMB2: movw [[R:r[0-1]]], #357
@@ -144,7 +140,7 @@ entry:
 
 ; THUMB1-LABEL: t6:
 ; THUMB1: cmp r{{[0-9]+}}, #0
-; THUMB1: bne
+; THUMB1: beq
 
 ; THUMB2-LABEL: t6:
 ; THUMB2-NOT: mov
diff --git a/test/CodeGen/Thumb/branchless-cmp.ll b/test/CodeGen/Thumb/branchless-cmp.ll
index 6c6c905c5d3..8435529d681 100644
--- a/test/CodeGen/Thumb/branchless-cmp.ll
+++ b/test/CodeGen/Thumb/branchless-cmp.ll
@@ -85,14 +85,15 @@ entry:
   %cond = select i1 %cmp, i32 0, i32 4
   ret i32 %cond
 ; CHECK-LABEL: test4a:
-; CHECK-NOT: b{{(ne)|(eq)}}
-; CHECK:       mov     r2, r0
+; CHECK: bb.0:
+; CHECK-NEXT:  cmp     r0, r1
+; CHECK-NEXT:  bne     .LBB6_2
+; CHECK-NEXT: bb.1:
+; CHECK-NEXT:  movs    r0, #4
+; CHECK-NEXT:  bx      lr
+; CHECK-NEXT: .LBB6_2:
 ; CHECK-NEXT:  movs    r0, #0
-; CHECK-NEXT:  movs    r3, #4
-; CHECK-NEXT:  cmp     r2, r1
-; CHECK-NEXT:  bne     .[[BRANCH:[A-Z0-9_]+]]
-; CHECK:       mov     r0, r3
-; CHECK:       .[[BRANCH]]:
+; CHECK-NEXT:  bx      lr
 }
 
 define i32 @test4b(i32 %a, i32 %b) {
diff --git a/test/CodeGen/Thumb/consthoist-few-dependents.ll b/test/CodeGen/Thumb/consthoist-few-dependents.ll
index 4141cf38a93..72f085afdff 100644
--- a/test/CodeGen/Thumb/consthoist-few-dependents.ll
+++ b/test/CodeGen/Thumb/consthoist-few-dependents.ll
@@ -23,7 +23,6 @@ target triple = "thumbv6m-none-unknown-musleabi"
 
 ; LLC-LABEL: avalon
 ; LLC-DAG: movs r{{[0-9]+}}, #0
-; LLC-DAG: movs r{{[0-9]+}}, #0
 ; LLC-DAG: movs r{{[0-9]+}}, #1
 ; LLC-NOT: add
 
diff --git a/test/CodeGen/Thumb/select.ll b/test/CodeGen/Thumb/select.ll
index 75dbeab5ad0..41ace62de53 100644
--- a/test/CodeGen/Thumb/select.ll
+++ b/test/CodeGen/Thumb/select.ll
@@ -73,10 +73,10 @@ define double @f7(double %a, double %b) {
     ret double %tmp1
 }
 ; CHECK-LABEL: f7:
-; CHECK: blt
+; CHECK: {{blt|bge}}
 ; CHECK: {{blt|bge}}
 ; CHECK: __ltdf2
 ; CHECK-EABI-LABEL: f7:
 ; CHECK-EABI: __aeabi_dcmplt
-; CHECK-EABI: bne
+; CHECK-EABI: {{bne|beq}}
 ; CHECK-EABI: {{bne|beq}}
diff --git a/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll b/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll
index f57f46f68cf..5445cd8e743 100644
--- a/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll
+++ b/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll
@@ -3,168 +3,200 @@
 
 define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; THUMBV6-LABEL: muloti_test:
-; THUMBV6: push    {r4, r5, r6, r7, lr}
-; THUMBV6: sub     sp, #84
-; THUMBV6-NEXT: mov     r6, r3
-; THUMBV6-NEXT: mov     r7, r2
-; THUMBV6-NEXT: mov     r4, r0
-; THUMBV6-NEXT: movs    r5, #0
-; THUMBV6-NEXT: mov     r0, sp
-; THUMBV6-NEXT: str     r5, [r0, #12]
-; THUMBV6-NEXT: str     r5, [r0, #8]
-; THUMBV6-NEXT: ldr     r1, [sp, #116]
-; THUMBV6-NEXT: str     r1, [sp, #68]           @ 4-byte Spill
-; THUMBV6-NEXT: str     r1, [r0, #4]
-; THUMBV6-NEXT: ldr     r1, [sp, #112]
-; THUMBV6-NEXT: str     r1, [sp, #32]           @ 4-byte Spill
-; THUMBV6-NEXT: str     r1, [r0]
-; THUMBV6-NEXT: mov     r0, r2
-; THUMBV6-NEXT: mov     r1, r3
-; THUMBV6-NEXT: mov     r2, r5
-; THUMBV6-NEXT: mov     r3, r5
-; THUMBV6-NEXT: bl      __multi3
-; THUMBV6-NEXT: str     r2, [sp, #40]           @ 4-byte Spill
-; THUMBV6-NEXT: str     r3, [sp, #44]           @ 4-byte Spill
-; THUMBV6-NEXT: str     r4, [sp, #72]           @ 4-byte Spill
-; THUMBV6-NEXT: stm     r4!, {r0, r1}
-; THUMBV6-NEXT: ldr     r4, [sp, #120]
-; THUMBV6-NEXT: str     r6, [sp, #60]           @ 4-byte Spill
-; THUMBV6-NEXT: mov     r0, r6
-; THUMBV6-NEXT: mov     r1, r5
-; THUMBV6-NEXT: mov     r2, r4
-; THUMBV6-NEXT: mov     r3, r5
-; THUMBV6-NEXT: bl      __aeabi_lmul
-; THUMBV6-NEXT: mov     r6, r0
-; THUMBV6-NEXT: str     r1, [sp, #52]           @ 4-byte Spill
-; THUMBV6-NEXT: ldr     r0, [sp, #124]
-; THUMBV6-NEXT: str     r0, [sp, #80]           @ 4-byte Spill
-; THUMBV6-NEXT: mov     r1, r5
-; THUMBV6-NEXT: mov     r2, r7
-; THUMBV6-NEXT: mov     r3, r5
-; THUMBV6-NEXT: bl      __aeabi_lmul
-; THUMBV6-NEXT: str     r1, [sp, #28]           @ 4-byte Spill
-; THUMBV6-NEXT: adds    r6, r0, r6
-; THUMBV6-NEXT: str     r4, [sp, #64]           @ 4-byte Spill
-; THUMBV6-NEXT: mov     r0, r4
-; THUMBV6-NEXT: mov     r1, r5
-; THUMBV6-NEXT: mov     r2, r7
-; THUMBV6-NEXT: mov     r3, r5
-; THUMBV6-NEXT: bl      __aeabi_lmul
-; THUMBV6-NEXT: str     r0, [sp, #24]           @ 4-byte Spill
-; THUMBV6-NEXT: adds    r0, r1, r6
-; THUMBV6-NEXT: str     r0, [sp, #20]           @ 4-byte Spill
-; THUMBV6-NEXT: mov     r0, r5
-; THUMBV6-NEXT: adcs    r0, r5
-; THUMBV6-NEXT: str     r0, [sp, #48]           @ 4-byte Spill
-; THUMBV6-NEXT: ldr     r7, [sp, #104]
-; THUMBV6-NEXT: ldr     r0, [sp, #68]           @ 4-byte Reload
-; THUMBV6-NEXT: mov     r1, r5
-; THUMBV6-NEXT: mov     r2, r7
-; THUMBV6-NEXT: mov     r3, r5
-; THUMBV6-NEXT: bl      __aeabi_lmul
-; THUMBV6-NEXT: mov     r6, r0
-; THUMBV6-NEXT: str     r1, [sp, #56]           @ 4-byte Spill
-; THUMBV6-NEXT: ldr     r0, [sp, #108]
-; THUMBV6-NEXT: str     r0, [sp, #76]           @ 4-byte Spill
-; THUMBV6-NEXT: mov     r1, r5
-; THUMBV6-NEXT: ldr     r4, [sp, #32]           @ 4-byte Reload
-; THUMBV6-NEXT: mov     r2, r4
-; THUMBV6-NEXT: mov     r3, r5
-; THUMBV6-NEXT: bl      __aeabi_lmul
-; THUMBV6-NEXT: str     r1, [sp, #36]           @ 4-byte Spill
-; THUMBV6-NEXT: adds    r6, r0, r6
-; THUMBV6-NEXT: mov     r0, r7
-; THUMBV6-NEXT: mov     r1, r5
-; THUMBV6-NEXT: mov     r2, r4
-; THUMBV6-NEXT: mov     r3, r5
-; THUMBV6-NEXT: bl      __aeabi_lmul
-; THUMBV6-NEXT: adds    r2, r1, r6
-; THUMBV6-NEXT: mov     r1, r5
-; THUMBV6-NEXT: adcs    r1, r5
-; THUMBV6-NEXT: ldr     r3, [sp, #24]           @ 4-byte Reload
-; THUMBV6-NEXT: adds    r0, r0, r3
-; THUMBV6-NEXT: ldr     r3, [sp, #20]           @ 4-byte Reload
-; THUMBV6-NEXT: adcs    r2, r3
-; THUMBV6-NEXT: ldr     r3, [sp, #40]           @ 4-byte Reload
-; THUMBV6-NEXT: adds    r0, r3, r0
-; THUMBV6-NEXT: ldr     r3, [sp, #72]           @ 4-byte Reload
-; THUMBV6-NEXT: str     r0, [r3, #8]
-; THUMBV6-NEXT: ldr     r0, [sp, #44]           @ 4-byte Reload
-; THUMBV6-NEXT: adcs    r2, r0
-; THUMBV6-NEXT: str     r2, [r3, #12]
-; THUMBV6-NEXT: ldr     r2, [sp, #28]           @ 4-byte Reload
-; THUMBV6-NEXT: adcs    r5, r5
-; THUMBV6-NEXT: movs    r0, #1
-; THUMBV6-NEXT: cmp     r2, #0
-; THUMBV6-NEXT: mov     r3, r0
-; THUMBV6-NEXT: bne     .LBB0_2
-; THUMBV6: mov     r3, r2
-; THUMBV6: ldr     r2, [sp, #60]           @ 4-byte Reload
-; THUMBV6-NEXT: cmp     r2, #0
-; THUMBV6-NEXT: mov     r4, r0
-; THUMBV6-NEXT: bne     .LBB0_4
-; THUMBV6: mov     r4, r2
-; THUMBV6: ldr     r2, [sp, #80]           @ 4-byte Reload
-; THUMBV6-NEXT: cmp     r2, #0
-; THUMBV6-NEXT: mov     r2, r0
-; THUMBV6-NEXT: bne     .LBB0_6
-; THUMBV6: ldr     r2, [sp, #80]           @ 4-byte Reload
-; THUMBV6: ands    r2, r4
-; THUMBV6-NEXT: orrs    r2, r3
-; THUMBV6-NEXT: ldr     r4, [sp, #52]           @ 4-byte Reload
-; THUMBV6-NEXT: cmp     r4, #0
-; THUMBV6-NEXT: mov     r3, r0
-; THUMBV6-NEXT: bne     .LBB0_8
-; THUMBV6: mov     r3, r4
-; THUMBV6: orrs    r2, r3
-; THUMBV6-NEXT: ldr     r3, [sp, #48]           @ 4-byte Reload
-; THUMBV6-NEXT: orrs    r2, r3
-; THUMBV6-NEXT: ldr     r3, [sp, #36]           @ 4-byte Reload
-; THUMBV6-NEXT: cmp     r3, #0
-; THUMBV6-NEXT: mov     r4, r0
-; THUMBV6-NEXT: bne     .LBB0_10
-; THUMBV6: mov     r4, r3
-; THUMBV6: ldr     r3, [sp, #68]           @ 4-byte Reload
-; THUMBV6-NEXT: cmp     r3, #0
-; THUMBV6-NEXT: mov     r6, r0
-; THUMBV6-NEXT: bne     .LBB0_12
-; THUMBV6: mov     r6, r3
-; THUMBV6: ldr     r3, [sp, #76]           @ 4-byte Reload
-; THUMBV6-NEXT: cmp     r3, #0
-; THUMBV6-NEXT: mov     r3, r0
-; THUMBV6-NEXT: bne     .LBB0_14
-; THUMBV6: ldr     r3, [sp, #76]           @ 4-byte Reload
-; THUMBV6: ands    r3, r6
-; THUMBV6-NEXT: orrs    r3, r4
-; THUMBV6-NEXT: ldr     r6, [sp, #56]           @ 4-byte Reload
-; THUMBV6-NEXT: cmp     r6, #0
-; THUMBV6-NEXT: mov     r4, r0
-; THUMBV6-NEXT: bne     .LBB0_16
-; THUMBV6: mov     r4, r6
-; THUMBV6: orrs    r3, r4
-; THUMBV6-NEXT: orrs    r3, r1
-; THUMBV6-NEXT: ldr     r4, [sp, #64]           @ 4-byte Reload
-; THUMBV6-NEXT: ldr     r1, [sp, #80]           @ 4-byte Reload
-; THUMBV6-NEXT: orrs    r4, r1
-; THUMBV6-NEXT: cmp     r4, #0
-; THUMBV6-NEXT: mov     r1, r0
-; THUMBV6-NEXT: bne     .LBB0_18
-; THUMBV6: mov     r1, r4
-; THUMBV6: ldr     r4, [sp, #76]           @ 4-byte Reload
-; THUMBV6-NEXT: orrs    r7, r4
-; THUMBV6-NEXT: cmp     r7, #0
-; THUMBV6-NEXT: mov     r4, r0
-; THUMBV6-NEXT: bne     .LBB0_20
-; THUMBV6: mov     r4, r7
-; THUMBV6: ands    r4, r1
-; THUMBV6-NEXT: orrs    r4, r3
-; THUMBV6-NEXT: orrs    r4, r2
-; THUMBV6-NEXT: orrs    r4, r5
-; THUMBV6-NEXT: ands    r4, r0
-; THUMBV6-NEXT: ldr     r0, [sp, #72]           @ 4-byte Reload
-; THUMBV6-NEXT: strb    r4, [r0, #16]
-; THUMBV6-NEXT: add     sp, #84
-; THUMBV6-NEXT: pop     {r4, r5, r6, r7, pc}
+; THUMBV6:       @ %bb.0: @ %start
+; THUMBV6-NEXT:    .save {r4, r5, r6, r7, lr}
+; THUMBV6-NEXT:    push {r4, r5, r6, r7, lr}
+; THUMBV6-NEXT:    .pad #84
+; THUMBV6-NEXT:    sub sp, #84
+; THUMBV6-NEXT:    mov r6, r3
+; THUMBV6-NEXT:    mov r7, r2
+; THUMBV6-NEXT:    mov r4, r0
+; THUMBV6-NEXT:    movs r5, #0
+; THUMBV6-NEXT:    mov r0, sp
+; THUMBV6-NEXT:    str r5, [r0, #12]
+; THUMBV6-NEXT:    str r5, [r0, #8]
+; THUMBV6-NEXT:    ldr r1, [sp, #116]
+; THUMBV6-NEXT:    str r1, [sp, #72] @ 4-byte Spill
+; THUMBV6-NEXT:    str r1, [r0, #4]
+; THUMBV6-NEXT:    ldr r1, [sp, #112]
+; THUMBV6-NEXT:    str r1, [sp, #44] @ 4-byte Spill
+; THUMBV6-NEXT:    str r1, [r0]
+; THUMBV6-NEXT:    mov r0, r2
+; THUMBV6-NEXT:    mov r1, r3
+; THUMBV6-NEXT:    mov r2, r5
+; THUMBV6-NEXT:    mov r3, r5
+; THUMBV6-NEXT:    bl __multi3
+; THUMBV6-NEXT:    str r2, [sp, #36] @ 4-byte Spill
+; THUMBV6-NEXT:    str r3, [sp, #40] @ 4-byte Spill
+; THUMBV6-NEXT:    str r4, [sp, #76] @ 4-byte Spill
+; THUMBV6-NEXT:    stm r4!, {r0, r1}
+; THUMBV6-NEXT:    ldr r4, [sp, #120]
+; THUMBV6-NEXT:    str r6, [sp, #56] @ 4-byte Spill
+; THUMBV6-NEXT:    mov r0, r6
+; THUMBV6-NEXT:    mov r1, r5
+; THUMBV6-NEXT:    mov r2, r4
+; THUMBV6-NEXT:    mov r3, r5
+; THUMBV6-NEXT:    bl __aeabi_lmul
+; THUMBV6-NEXT:    mov r6, r0
+; THUMBV6-NEXT:    str r1, [sp, #48] @ 4-byte Spill
+; THUMBV6-NEXT:    ldr r0, [sp, #124]
+; THUMBV6-NEXT:    str r0, [sp, #80] @ 4-byte Spill
+; THUMBV6-NEXT:    mov r1, r5
+; THUMBV6-NEXT:    mov r2, r7
+; THUMBV6-NEXT:    mov r3, r5
+; THUMBV6-NEXT:    bl __aeabi_lmul
+; THUMBV6-NEXT:    str r1, [sp, #28] @ 4-byte Spill
+; THUMBV6-NEXT:    adds r6, r0, r6
+; THUMBV6-NEXT:    str r4, [sp, #68] @ 4-byte Spill
+; THUMBV6-NEXT:    mov r0, r4
+; THUMBV6-NEXT:    mov r1, r5
+; THUMBV6-NEXT:    mov r2, r7
+; THUMBV6-NEXT:    mov r3, r5
+; THUMBV6-NEXT:    bl __aeabi_lmul
+; THUMBV6-NEXT:    str r0, [sp, #20] @ 4-byte Spill
+; THUMBV6-NEXT:    adds r0, r1, r6
+; THUMBV6-NEXT:    str r0, [sp, #16] @ 4-byte Spill
+; THUMBV6-NEXT:    mov r0, r5
+; THUMBV6-NEXT:    adcs r0, r5
+; THUMBV6-NEXT:    str r0, [sp, #64] @ 4-byte Spill
+; THUMBV6-NEXT:    ldr r7, [sp, #104]
+; THUMBV6-NEXT:    ldr r0, [sp, #72] @ 4-byte Reload
+; THUMBV6-NEXT:    mov r1, r5
+; THUMBV6-NEXT:    mov r2, r7
+; THUMBV6-NEXT:    mov r3, r5
+; THUMBV6-NEXT:    bl __aeabi_lmul
+; THUMBV6-NEXT:    mov r6, r0
+; THUMBV6-NEXT:    str r1, [sp, #52] @ 4-byte Spill
+; THUMBV6-NEXT:    ldr r0, [sp, #108]
+; THUMBV6-NEXT:    str r0, [sp, #60] @ 4-byte Spill
+; THUMBV6-NEXT:    mov r1, r5
+; THUMBV6-NEXT:    ldr r4, [sp, #44] @ 4-byte Reload
+; THUMBV6-NEXT:    mov r2, r4
+; THUMBV6-NEXT:    mov r3, r5
+; THUMBV6-NEXT:    bl __aeabi_lmul
+; THUMBV6-NEXT:    str r1, [sp, #32] @ 4-byte Spill
+; THUMBV6-NEXT:    adds r6, r0, r6
+; THUMBV6-NEXT:    str r7, [sp, #24] @ 4-byte Spill
+; THUMBV6-NEXT:    mov r0, r7
+; THUMBV6-NEXT:    mov r1, r5
+; THUMBV6-NEXT:    mov r2, r4
+; THUMBV6-NEXT:    mov r3, r5
+; THUMBV6-NEXT:    bl __aeabi_lmul
+; THUMBV6-NEXT:    adds r1, r1, r6
+; THUMBV6-NEXT:    mov r2, r5
+; THUMBV6-NEXT:    adcs r2, r5
+; THUMBV6-NEXT:    str r2, [sp, #44] @ 4-byte Spill
+; THUMBV6-NEXT:    ldr r2, [sp, #20] @ 4-byte Reload
+; THUMBV6-NEXT:    adds r0, r0, r2
+; THUMBV6-NEXT:    ldr r2, [sp, #16] @ 4-byte Reload
+; THUMBV6-NEXT:    adcs r1, r2
+; THUMBV6-NEXT:    ldr r2, [sp, #36] @ 4-byte Reload
+; THUMBV6-NEXT:    adds r0, r2, r0
+; THUMBV6-NEXT:    ldr r2, [sp, #76] @ 4-byte Reload
+; THUMBV6-NEXT:    str r0, [r2, #8]
+; THUMBV6-NEXT:    ldr r0, [sp, #40] @ 4-byte Reload
+; THUMBV6-NEXT:    adcs r1, r0
+; THUMBV6-NEXT:    str r1, [r2, #12]
+; THUMBV6-NEXT:    ldr r1, [sp, #28] @ 4-byte Reload
+; THUMBV6-NEXT:    adcs r5, r5
+; THUMBV6-NEXT:    movs r0, #1
+; THUMBV6-NEXT:    cmp r1, #0
+; THUMBV6-NEXT:    mov r2, r0
+; THUMBV6-NEXT:    bne .LBB0_2
+; THUMBV6-NEXT:  @ %bb.1: @ %start
+; THUMBV6-NEXT:    mov r2, r1
+; THUMBV6-NEXT:  .LBB0_2: @ %start
+; THUMBV6-NEXT:    str r2, [sp, #40] @ 4-byte Spill
+; THUMBV6-NEXT:    ldr r1, [sp, #56] @ 4-byte Reload
+; THUMBV6-NEXT:    cmp r1, #0
+; THUMBV6-NEXT:    mov r4, r0
+; THUMBV6-NEXT:    bne .LBB0_4
+; THUMBV6-NEXT:  @ %bb.3: @ %start
+; THUMBV6-NEXT:    mov r4, r1
+; THUMBV6-NEXT:  .LBB0_4: @ %start
+; THUMBV6-NEXT:    ldr r1, [sp, #80] @ 4-byte Reload
+; THUMBV6-NEXT:    cmp r1, #0
+; THUMBV6-NEXT:    mov r2, r0
+; THUMBV6-NEXT:    ldr r3, [sp, #48] @ 4-byte Reload
+; THUMBV6-NEXT:    ldr r6, [sp, #32] @ 4-byte Reload
+; THUMBV6-NEXT:    bne .LBB0_6
+; THUMBV6-NEXT:  @ %bb.5: @ %start
+; THUMBV6-NEXT:    ldr r2, [sp, #80] @ 4-byte Reload
+; THUMBV6-NEXT:  .LBB0_6: @ %start
+; THUMBV6-NEXT:    cmp r3, #0
+; THUMBV6-NEXT:    mov r7, r0
+; THUMBV6-NEXT:    ldr r1, [sp, #72] @ 4-byte Reload
+; THUMBV6-NEXT:    bne .LBB0_8
+; THUMBV6-NEXT:  @ %bb.7: @ %start
+; THUMBV6-NEXT:    mov r7, r3
+; THUMBV6-NEXT:  .LBB0_8: @ %start
+; THUMBV6-NEXT:    cmp r6, #0
+; THUMBV6-NEXT:    mov r3, r0
+; THUMBV6-NEXT:    bne .LBB0_10
+; THUMBV6-NEXT:  @ %bb.9: @ %start
+; THUMBV6-NEXT:    mov r3, r6
+; THUMBV6-NEXT:  .LBB0_10: @ %start
+; THUMBV6-NEXT:    cmp r1, #0
+; THUMBV6-NEXT:    mov r6, r1
+; THUMBV6-NEXT:    mov r1, r0
+; THUMBV6-NEXT:    bne .LBB0_12
+; THUMBV6-NEXT:  @ %bb.11: @ %start
+; THUMBV6-NEXT:    mov r1, r6
+; THUMBV6-NEXT:  .LBB0_12: @ %start
+; THUMBV6-NEXT:    str r7, [sp, #72] @ 4-byte Spill
+; THUMBV6-NEXT:    ands r2, r4
+; THUMBV6-NEXT:    ldr r6, [sp, #60] @ 4-byte Reload
+; THUMBV6-NEXT:    cmp r6, #0
+; THUMBV6-NEXT:    mov r4, r0
+; THUMBV6-NEXT:    bne .LBB0_14
+; THUMBV6-NEXT:  @ %bb.13: @ %start
+; THUMBV6-NEXT:    mov r4, r6
+; THUMBV6-NEXT:  .LBB0_14: @ %start
+; THUMBV6-NEXT:    ldr r7, [sp, #40] @ 4-byte Reload
+; THUMBV6-NEXT:    orrs r2, r7
+; THUMBV6-NEXT:    ands r4, r1
+; THUMBV6-NEXT:    orrs r4, r3
+; THUMBV6-NEXT:    ldr r3, [sp, #52] @ 4-byte Reload
+; THUMBV6-NEXT:    cmp r3, #0
+; THUMBV6-NEXT:    mov r1, r0
+; THUMBV6-NEXT:    bne .LBB0_16
+; THUMBV6-NEXT:  @ %bb.15: @ %start
+; THUMBV6-NEXT:    mov r1, r3
+; THUMBV6-NEXT:  .LBB0_16: @ %start
+; THUMBV6-NEXT:    ldr r3, [sp, #72] @ 4-byte Reload
+; THUMBV6-NEXT:    orrs r2, r3
+; THUMBV6-NEXT:    orrs r4, r1
+; THUMBV6-NEXT:    ldr r1, [sp, #68] @ 4-byte Reload
+; THUMBV6-NEXT:    ldr r3, [sp, #80] @ 4-byte Reload
+; THUMBV6-NEXT:    orrs r1, r3
+; THUMBV6-NEXT:    cmp r1, #0
+; THUMBV6-NEXT:    mov r3, r0
+; THUMBV6-NEXT:    bne .LBB0_18
+; THUMBV6-NEXT:  @ %bb.17: @ %start
+; THUMBV6-NEXT:    mov r3, r1
+; THUMBV6-NEXT:  .LBB0_18: @ %start
+; THUMBV6-NEXT:    ldr r1, [sp, #64] @ 4-byte Reload
+; THUMBV6-NEXT:    orrs r2, r1
+; THUMBV6-NEXT:    ldr r1, [sp, #44] @ 4-byte Reload
+; THUMBV6-NEXT:    orrs r4, r1
+; THUMBV6-NEXT:    ldr r1, [sp, #24] @ 4-byte Reload
+; THUMBV6-NEXT:    orrs r1, r6
+; THUMBV6-NEXT:    mov r6, r1
+; THUMBV6-NEXT:    cmp r1, #0
+; THUMBV6-NEXT:    mov r1, r0
+; THUMBV6-NEXT:    bne .LBB0_20
+; THUMBV6-NEXT:  @ %bb.19: @ %start
+; THUMBV6-NEXT:    mov r1, r6
+; THUMBV6-NEXT:  .LBB0_20: @ %start
+; THUMBV6-NEXT:    ands r1, r3
+; THUMBV6-NEXT:    orrs r1, r4
+; THUMBV6-NEXT:    orrs r1, r2
+; THUMBV6-NEXT:    orrs r1, r5
+; THUMBV6-NEXT:    ands r1, r0
+; THUMBV6-NEXT:    ldr r0, [sp, #76] @ 4-byte Reload
+; THUMBV6-NEXT:    strb r1, [r0, #16]
+; THUMBV6-NEXT:    add sp, #84
+; THUMBV6-NEXT:    pop {r4, r5, r6, r7, pc}
 start:
   %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) #2
   %1 = extractvalue { i128, i1 } %0, 0
diff --git a/test/CodeGen/Thumb2/thumb2-tbh.ll b/test/CodeGen/Thumb2/thumb2-tbh.ll
index c67efa09b90..fd8070a8b58 100644
--- a/test/CodeGen/Thumb2/thumb2-tbh.ll
+++ b/test/CodeGen/Thumb2/thumb2-tbh.ll
@@ -1,6 +1,11 @@
 ; RUN: llc < %s -mtriple=thumbv7-apple-darwin -relocation-model=pic | FileCheck %s --check-prefix=CHECK --check-prefix=T2
-; RUN: llc < %s -mtriple=thumbv6m-apple-darwin -relocation-model=pic | FileCheck %s --check-prefix=CHECK --check-prefix=T1
-; RUN: llc < %s -mtriple=thumbv6m-apple-darwin -relocation-model=static | FileCheck %s --check-prefix=CHECK --check-prefix=T1
+; RUN: llc < %s -mtriple=thumbv6m-apple-darwin -relocation-model=pic | FileCheck %s --check-prefix=T1DISABLED
+; FIXME: llc < %s -mtriple=thumbv6m-apple-darwin -relocation-model=pic | FileCheck %s --check-prefix=CHECK --check-prefix=T1
+; FIXME: llc < %s -mtriple=thumbv6m-apple-darwin -relocation-model=static | FileCheck %s --check-prefix=CHECK --check-prefix=T1
+
+; FIXME: Thumb1 tests temporarily disabled; MachineLICM is now hoisting the
+; subs, so the jump table can't be formed.
+; T1DISABLED: .data_region jt32
 
 ; Thumb2 target should reorder the bb's in order to use tbb / tbh.
 
-- 
GitLab


From 9bd251e6a004db33b170d13955a78d140163d651 Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Fri, 26 Oct 2018 20:26:36 +0000
Subject: [PATCH 0651/1116] [Spectre] Fix MIR verifier errors in retpoline
 thunks

Summary:
The main challenge here is that X86InstrInfo::AnalyzeBranch doesn't
understand the way we're using a CALL instruction as a branch, so we
can't list the CallTarget MBB as a successor of the entry block. If we
don't list it as a successor, then the AsmPrinter doesn't print a label
for the MBB.

Fix the issue by inserting our own label at the beginning of the call
target block. We can rely on the AsmPrinter to always emit it, even
though the block appears to be unreachable, but address-taken.

Fixes PR38391.

Reviewers: thegameg, chandlerc, echristo

Subscribers: hiraditya, llvm-commits

Differential Revision: https://reviews.llvm.org/D53653

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345426 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86RetpolineThunks.cpp  | 23 +++++++++++++++++------
 test/CodeGen/X86/retpoline-external.ll |  8 ++++----
 test/CodeGen/X86/retpoline-regparm.ll  |  2 +-
 test/CodeGen/X86/retpoline.ll          | 23 ++++++++++++++---------
 4 files changed, 36 insertions(+), 20 deletions(-)

diff --git a/lib/Target/X86/X86RetpolineThunks.cpp b/lib/Target/X86/X86RetpolineThunks.cpp
index f62e89eb1ba..08994cccb21 100644
--- a/lib/Target/X86/X86RetpolineThunks.cpp
+++ b/lib/Target/X86/X86RetpolineThunks.cpp
@@ -74,7 +74,7 @@ private:
 
   void createThunkFunction(Module &M, StringRef Name);
   void insertRegReturnAddrClobber(MachineBasicBlock &MBB, unsigned Reg);
-  void populateThunk(MachineFunction &MF, Optional<unsigned> Reg = None);
+  void populateThunk(MachineFunction &MF, unsigned Reg);
 };
 
 } // end anonymous namespace
@@ -236,25 +236,33 @@ void X86RetpolineThunks::insertRegReturnAddrClobber(MachineBasicBlock &MBB,
 }
 
 void X86RetpolineThunks::populateThunk(MachineFunction &MF,
-                                       Optional<unsigned> Reg) {
+                                       unsigned Reg) {
   // Set MF properties. We never use vregs...
   MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs);
 
+  // Grab the entry MBB and erase any other blocks. O0 codegen appears to
+  // generate two bbs for the entry block.
   MachineBasicBlock *Entry = &MF.front();
   Entry->clear();
+  while (MF.size() > 1)
+    MF.erase(std::next(MF.begin()));
 
   MachineBasicBlock *CaptureSpec = MF.CreateMachineBasicBlock(Entry->getBasicBlock());
   MachineBasicBlock *CallTarget = MF.CreateMachineBasicBlock(Entry->getBasicBlock());
+  MCSymbol *TargetSym = MF.getContext().createTempSymbol();
   MF.push_back(CaptureSpec);
   MF.push_back(CallTarget);
 
   const unsigned CallOpc = Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32;
   const unsigned RetOpc = Is64Bit ? X86::RETQ : X86::RETL;
 
-  BuildMI(Entry, DebugLoc(), TII->get(CallOpc)).addMBB(CallTarget);
-  Entry->addSuccessor(CallTarget);
+  Entry->addLiveIn(Reg);
+  BuildMI(Entry, DebugLoc(), TII->get(CallOpc)).addSym(TargetSym);
+
+  // The MIR verifier thinks that the CALL in the entry block will fall through
+  // to CaptureSpec, so mark it as the successor. Technically, CaptureTarget is
+  // the successor, but the MIR verifier doesn't know how to cope with that.
   Entry->addSuccessor(CaptureSpec);
-  CallTarget->setHasAddressTaken();
 
   // In the capture loop for speculation, we want to stop the processor from
   // speculating as fast as possible. On Intel processors, the PAUSE instruction
@@ -270,7 +278,10 @@ void X86RetpolineThunks::populateThunk(MachineFunction &MF,
   CaptureSpec->setHasAddressTaken();
   CaptureSpec->addSuccessor(CaptureSpec);
 
+  CallTarget->addLiveIn(Reg);
+  CallTarget->setHasAddressTaken();
   CallTarget->setAlignment(4);
-  insertRegReturnAddrClobber(*CallTarget, *Reg);
+  insertRegReturnAddrClobber(*CallTarget, Reg);
+  CallTarget->back().setPreInstrSymbol(MF, TargetSym);
   BuildMI(CallTarget, DebugLoc(), TII->get(RetOpc));
 }
diff --git a/test/CodeGen/X86/retpoline-external.ll b/test/CodeGen/X86/retpoline-external.ll
index 308a1a3181b..849660cdedb 100644
--- a/test/CodeGen/X86/retpoline-external.ll
+++ b/test/CodeGen/X86/retpoline-external.ll
@@ -1,8 +1,8 @@
-; RUN: llc -mtriple=x86_64-unknown < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X64
-; RUN: llc -mtriple=x86_64-unknown -O0 < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X64FAST
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X64
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown -O0 < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X64FAST
 
-; RUN: llc -mtriple=i686-unknown < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X86
-; RUN: llc -mtriple=i686-unknown -O0 < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X86FAST
+; RUN: llc -verify-machineinstrs -mtriple=i686-unknown < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X86
+; RUN: llc -verify-machineinstrs -mtriple=i686-unknown -O0 < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X86FAST
 
 declare void @bar(i32)
 
diff --git a/test/CodeGen/X86/retpoline-regparm.ll b/test/CodeGen/X86/retpoline-regparm.ll
index 472cf0b1f0d..668047c3891 100644
--- a/test/CodeGen/X86/retpoline-regparm.ll
+++ b/test/CodeGen/X86/retpoline-regparm.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=i686-linux < %s | FileCheck --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" %s
+; RUN: llc -verify-machineinstrs -mtriple=i686-linux < %s | FileCheck --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" %s
 
 ; Test 32-bit retpoline when -mregparm=3 is used. This case is interesting
 ; because there are no available scratch registers.  The Linux kernel builds
diff --git a/test/CodeGen/X86/retpoline.ll b/test/CodeGen/X86/retpoline.ll
index 2625435ab8c..9a1673e8a56 100644
--- a/test/CodeGen/X86/retpoline.ll
+++ b/test/CodeGen/X86/retpoline.ll
@@ -1,8 +1,8 @@
-; RUN: llc -mtriple=x86_64-unknown < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X64
-; RUN: llc -mtriple=x86_64-unknown -O0 < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X64FAST
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X64
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown -O0 < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X64FAST
 
-; RUN: llc -mtriple=i686-unknown < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X86
-; RUN: llc -mtriple=i686-unknown -O0 < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X86FAST
+; RUN: llc -verify-machineinstrs -mtriple=i686-unknown < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X86
+; RUN: llc -verify-machineinstrs -mtriple=i686-unknown -O0 < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X86FAST
 
 declare void @bar(i32)
 
@@ -428,8 +428,9 @@ latch:
 ; X64-NEXT:          lfence
 ; X64-NEXT:          jmp     [[CAPTURE_SPEC]]
 ; X64-NEXT:          .p2align        4, 0x90
-; X64-NEXT:  [[CALL_TARGET]]:                        # Block address taken
+; X64-NEXT:  {{.*}}                                  # Block address taken
 ; X64-NEXT:                                          # %entry
+; X64-NEXT:  [[CALL_TARGET]]:
 ; X64-NEXT:          movq    %r11, (%rsp)
 ; X64-NEXT:          retq
 ;
@@ -446,8 +447,9 @@ latch:
 ; X86-NEXT:          lfence
 ; X86-NEXT:          jmp     [[CAPTURE_SPEC]]
 ; X86-NEXT:          .p2align        4, 0x90
-; X86-NEXT:  [[CALL_TARGET]]:                        # Block address taken
+; X86-NEXT:  {{.*}}                                  # Block address taken
 ; X86-NEXT:                                          # %entry
+; X86-NEXT:  [[CALL_TARGET]]:
 ; X86-NEXT:          movl    %eax, (%esp)
 ; X86-NEXT:          retl
 ;
@@ -464,8 +466,9 @@ latch:
 ; X86-NEXT:          lfence
 ; X86-NEXT:          jmp     [[CAPTURE_SPEC]]
 ; X86-NEXT:          .p2align        4, 0x90
-; X86-NEXT:  [[CALL_TARGET]]:                        # Block address taken
+; X86-NEXT:  {{.*}}                                  # Block address taken
 ; X86-NEXT:                                          # %entry
+; X86-NEXT:  [[CALL_TARGET]]:
 ; X86-NEXT:          movl    %ecx, (%esp)
 ; X86-NEXT:          retl
 ;
@@ -482,8 +485,9 @@ latch:
 ; X86-NEXT:          lfence
 ; X86-NEXT:          jmp     [[CAPTURE_SPEC]]
 ; X86-NEXT:          .p2align        4, 0x90
-; X86-NEXT:  [[CALL_TARGET]]:                        # Block address taken
+; X86-NEXT:  {{.*}}                                  # Block address taken
 ; X86-NEXT:                                          # %entry
+; X86-NEXT:  [[CALL_TARGET]]:
 ; X86-NEXT:          movl    %edx, (%esp)
 ; X86-NEXT:          retl
 ;
@@ -500,8 +504,9 @@ latch:
 ; X86-NEXT:          lfence
 ; X86-NEXT:          jmp     [[CAPTURE_SPEC]]
 ; X86-NEXT:          .p2align        4, 0x90
-; X86-NEXT:  [[CALL_TARGET]]:                        # Block address taken
+; X86-NEXT:  {{.*}}                                  # Block address taken
 ; X86-NEXT:                                          # %entry
+; X86-NEXT:  [[CALL_TARGET]]:
 ; X86-NEXT:          movl    %edi, (%esp)
 ; X86-NEXT:          retl
 
-- 
GitLab


From 151762554372980bc5606884dd937e3609c09dde Mon Sep 17 00:00:00 2001
From: George Burgess IV <george.burgess.iv@gmail.com>
Date: Fri, 26 Oct 2018 20:56:03 +0000
Subject: [PATCH 0652/1116] Add docs+a script for building clang/LLVM with PGO
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Depending on who you ask, PGO grants a 15%-25% improvement in build
times when using clang. Sadly, hooking everything up properly to
generate a profile and apply it to clang isn't always straightforward.
This script (and the accompanying docs) aim to make this process easier;
ideally, a single invocation of the given script.

In terms of testing, I've got a cronjob on my Debian box that's meant to
run this a few times per week, and I tried manually running it on a puny
Gentoo box I have (four whole Atom cores!). Nothing obviously broke.
¯\_(ツ)_/¯

I don't know if we have a Python style guide, so I just shoved this
through yapf with all the defaults on.

Finally, though the focus is clang at the moment, the hope is that this
is easily applicable to other LLVM-y tools with minimal effort (e.g.
lld, opt, ...). Hence, this lives in llvm/utils and tries to be somewhat
ambiguous about naming.

Differential Revision: https://reviews.llvm.org/D53598


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345427 91177308-0d34-0410-b5e6-96231b3b80d8
---
 docs/HowToBuildWithPGO.rst          | 163 ++++++++++
 docs/index.rst                      |   4 +
 utils/collect_and_build_with_pgo.py | 487 ++++++++++++++++++++++++++++
 3 files changed, 654 insertions(+)
 create mode 100644 docs/HowToBuildWithPGO.rst
 create mode 100755 utils/collect_and_build_with_pgo.py

diff --git a/docs/HowToBuildWithPGO.rst b/docs/HowToBuildWithPGO.rst
new file mode 100644
index 00000000000..ba93bc64a29
--- /dev/null
+++ b/docs/HowToBuildWithPGO.rst
@@ -0,0 +1,163 @@
+=============================================================
+How To Build Clang and LLVM with Profile-Guided Optimizations
+=============================================================
+
+Introduction
+============
+
+PGO (Profile-Guided Optimization) allows your compiler to better optimize code
+for how it actually runs. Users report that applying this to Clang and LLVM can
+decrease overall compile time by 20%.
+
+This guide walks you through how to build Clang with PGO, though it also applies
+to other subprojects, such as LLD.
+
+
+Using the script
+================
+
+We have a script at ``utils/collect_and_build_with_pgo.py``. This script is
+tested on a few Linux flavors, and requires a checkout of LLVM, Clang, and
+compiler-rt. Despite the the name, it performs four clean builds of Clang, so it
+can take a while to run to completion. Please see the script's ``--help`` for
+more information on how to run it, and the different options available to you.
+If you want to get the most out of PGO for a particular use-case (e.g. compiling
+a specific large piece of software), please do read the section below on
+'benchmark' selection.
+
+Please note that this script is only tested on a few Linux distros. Patches to
+add support for other platforms, as always, are highly appreciated. :)
+
+This script also supports a ``--dry-run`` option, which causes it to print
+important commands instead of running them.
+
+
+Selecting 'benchmarks'
+======================
+
+PGO does best when the profiles gathered represent how the user plans to use the
+compiler. Notably, highly accurate profiles of llc building x86_64 code aren't
+incredibly helpful if you're going to be targeting ARM.
+
+By default, the script above does two things to get solid coverage. It:
+
+- runs all of Clang and LLVM's lit tests, and
+- uses the instrumented Clang to build Clang, LLVM, and all of the other
+  LLVM subprojects available to it.
+
+Together, these should give you:
+
+- solid coverage of building C++,
+- good coverage of building C,
+- great coverage of running optimizations,
+- great coverage of the backend for your host's architecture, and
+- some coverage of other architectures (if other arches are supported backends).
+
+Altogether, this should cover a diverse set of uses for Clang and LLVM. If you
+have very specific needs (e.g. your compiler is meant to compile a large browser
+for four different platforms, or similar), you may want to do something else.
+This is configurable in the script itself.
+
+
+Building Clang with PGO
+=======================
+
+If you prefer to not use the script, this briefly goes over how to build
+Clang/LLVM with PGO.
+
+First, you should have at least LLVM, Clang, and compiler-rt checked out
+locally.
+
+Next, at a high level, you're going to need to do the following:
+
+1. Build a standard Release Clang and the relevant libclang_rt.profile library
+2. Build Clang using the Clang you built above, but with instrumentation
+3. Use the instrumented Clang to generate profiles, which consists of two steps:
+
+  - Running the instrumented Clang/LLVM/lld/etc. on tasks that represent how
+    users will use said tools.
+  - Using a tool to convert the "raw" profiles generated above into a single,
+    final PGO profile.
+
+4. Build a final release Clang (along with whatever other binaries you need)
+   using the profile collected from your benchmark
+
+In more detailed steps:
+
+1. Configure a Clang build as you normally would. It's highly recommended that
+   you use the Release configuration for this, since it will be used to build
+   another Clang. Because you need Clang and supporting libraries, you'll want
+   to build the ``all`` target (e.g. ``ninja all`` or ``make -j4 all``).
+
+2. Configure a Clang build as above, but add the following CMake args:
+
+   - ``-DLLVM_BUILD_INSTRUMENTED=IR`` -- This causes us to build everything
+     with instrumentation.
+   - ``-DLLVM_BUILD_RUNTIME=No`` -- A few projects have bad interactions when
+     built with profiling, and aren't necessary to build. This flag turns them
+     off.
+   - ``-DCMAKE_C_COMPILER=/path/to/stage1/clang`` - Use the Clang we built in
+     step 1.
+   - ``-DCMAKE_CXX_COMPILER=/path/to/stage1/clang++`` - Same as above.
+
+ In this build directory, you simply need to build the ``clang`` target (and
+ whatever supporting tooling your benchmark requires).
+
+3. As mentioned above, this has two steps: gathering profile data, and then
+   massaging it into a useful form:
+
+   a. Build your benchmark using the Clang generated in step 2. The 'standard'
+      benchmark recommended is to run ``check-clang`` and ``check-llvm`` in your
+      instrumented Clang's build directory, and to do a full build of Clang/LLVM
+      using your instrumented Clang. So, create yet another build directory,
+      with the following CMake arguments:
+
+      - ``-DCMAKE_C_COMPILER=/path/to/stage2/clang`` - Use the Clang we built in
+        step 2.
+      - ``-DCMAKE_CXX_COMPILER=/path/to/stage2/clang++`` - Same as above.
+
+      If your users are fans of debug info, you may want to consider using
+      ``-DCMAKE_BUILD_TYPE=RelWithDebInfo`` instead of
+      ``-DCMAKE_BUILD_TYPE=Release``. This will grant better coverage of
+      debug info pieces of clang, but will take longer to complete and will
+      result in a much larger build directory.
+
+      It's recommended to build the ``all`` target with your instrumented Clang,
+      since more coverage is often better.
+
+  b. You should now have a few ``*.profdata`` files in
+     ``path/to/stage2/profiles/``. You need to merge these using
+     ``llvm-profdata`` (even if you only have one! The profile merge transforms
+     profraw into actual profile data, as well). This can be done with
+     ``/path/to/stage1/llvm-profdata -merge
+     -output=/path/to/output/profdata.prof path/to/stage2/profiles/*.profdata``.
+
+4. Now, build your final, PGO-optimized Clang. To do this, you'll want to pass
+   the following additional arguments to CMake.
+
+   - ``-DLLVM_PROFDATA_FILE=/path/to/output/profdata.prof`` - Use the PGO
+     profile from the previous step.
+   - ``-DCMAKE_C_COMPILER=/path/to/stage1/clang`` - Use the Clang we built in
+     step 1.
+   - ``-DCMAKE_CXX_COMPILER=/path/to/stage1/clang++`` - Same as above.
+
+   From here, you can build whatever targets you need.
+
+   .. note::
+     You may see warnings about a mismatched profile in the build output. These
+     are generally harmless. To silence them, you can add
+     ``-DCMAKE_C_FLAGS='-Wno-backend-plugin'
+     -DCMAKE_CXX_FLAGS='-Wno-backend-plugin'`` to your CMake invocation.
+
+
+Congrats! You now have a Clang built with profile-guided optimizations, and you
+can delete all but the final build directory if you'd like.
+
+If this worked well for you and you plan on doing it often, there's a slight
+optimization that can be made: LLVM and Clang have a tool called tblgen that's
+built and run during the build process. While it's potentially nice to build
+this for coverage as part of step 3, none of your other builds should benefit
+from building it. You can pass the CMake options
+``-DCLANG_TABLEGEN=/path/to/stage1/bin/clang-tblgen
+-DLLVM_TABLEGEN=/path/to/stage1/bin/llvm-tblgen`` to steps 2 and onward to avoid
+these useless rebuilds.
diff --git a/docs/index.rst b/docs/index.rst
index 16d36866b5d..de9218e6f4c 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -68,6 +68,7 @@ representation.
    CMakePrimer
    AdvancedBuilds
    HowToBuildOnARM
+   HowToBuildWithPGO
    HowToCrossCompileBuiltinsOnArm
    HowToCrossCompileLLVM
    CommandGuide/index
@@ -107,6 +108,9 @@ representation.
 :doc:`HowToBuildOnARM`
    Notes on building and testing LLVM/Clang on ARM.
 
+:doc:`HowToBuildWithPGO`
+    Notes on building LLVM/Clang with PGO.
+
 :doc:`HowToCrossCompileBuiltinsOnArm`
    Notes on cross-building and testing the compiler-rt builtins for Arm.
 
diff --git a/utils/collect_and_build_with_pgo.py b/utils/collect_and_build_with_pgo.py
new file mode 100755
index 00000000000..0b3943647bc
--- /dev/null
+++ b/utils/collect_and_build_with_pgo.py
@@ -0,0 +1,487 @@
+#!/usr/bin/env python3
+"""
+This script:
+- Builds clang with user-defined flags
+- Uses that clang to build an instrumented clang, which can be used to collect
+  PGO samples
+- Builds a user-defined set of sources (default: clang) to act as a
+  "benchmark" to generate a PGO profile
+- Builds clang once more with the PGO profile generated above
+
+This is a total of four clean builds of clang (by default). This may take a
+while. :)
+"""
+
+import argparse
+import collections
+import multiprocessing
+import os
+import shlex
+import shutil
+import subprocess
+import sys
+
+### User configuration
+
+
+# If you want to use a different 'benchmark' than building clang, make this
+# function do what you want. out_dir is the build directory for clang, so all
+# of the clang binaries will live under "${out_dir}/bin/". Using clang in
+# ${out_dir} will magically have the profiles go to the right place.
+#
+# You may assume that out_dir is a freshly-built directory that you can reach
+# in to build more things, if you'd like.
+def _run_benchmark(env, out_dir, include_debug_info):
+    """The 'benchmark' we run to generate profile data."""
+    target_dir = env.output_subdir('instrumentation_run')
+
+    # `check-llvm` and `check-clang` are cheap ways to increase coverage. The
+    # former lets us touch on the non-x86 backends a bit if configured, and the
+    # latter gives us more C to chew on (and will send us through diagnostic
+    # paths a fair amount, though the `if (stuff_is_broken) { diag() ... }`
+    # branches should still heavily be weighted in the not-taken direction,
+    # since we built all of LLVM/etc).
+    _build_things_in(env, target_dir, what=['check-llvm', 'check-clang'])
+
+    # Building tblgen gets us coverage; don't skip it. (out_dir may also not
+    # have them anyway, but that's less of an issue)
+    cmake = _get_cmake_invocation_for_bootstrap_from(
+        env, out_dir, skip_tablegens=False)
+
+    if include_debug_info:
+        cmake.add_flag('CMAKE_BUILD_TYPE', 'RelWithDebInfo')
+
+    _run_fresh_cmake(env, cmake, target_dir)
+
+    # Just build all the things. The more data we have, the better.
+    _build_things_in(env, target_dir, what=['all'])
+
+
+### Script
+
+
+class CmakeInvocation:
+    _cflags = ['CMAKE_C_FLAGS', 'CMAKE_CXX_FLAGS']
+    _ldflags = [
+        'CMAKE_EXE_LINKER_FLAGS',
+        'CMAKE_MODULE_LINKER_FLAGS',
+        'CMAKE_SHARED_LINKER_FLAGS',
+    ]
+
+    def __init__(self, cmake, maker, cmake_dir):
+        self._prefix = [cmake, '-G', maker, cmake_dir]
+
+        # Map of str -> (list|str).
+        self._flags = {}
+        for flag in CmakeInvocation._cflags + CmakeInvocation._ldflags:
+            self._flags[flag] = []
+
+    def add_new_flag(self, key, value):
+        self.add_flag(key, value, allow_overwrites=False)
+
+    def add_flag(self, key, value, allow_overwrites=True):
+        if key not in self._flags:
+            self._flags[key] = value
+            return
+
+        existing_value = self._flags[key]
+        if isinstance(existing_value, list):
+            existing_value.append(value)
+            return
+
+        if not allow_overwrites:
+            raise ValueError('Invalid overwrite of %s requested' % key)
+
+        self._flags[key] = value
+
+    def add_cflags(self, flags):
+        # No, I didn't intend to append ['-', 'O', '2'] to my flags, thanks :)
+        assert not isinstance(flags, str)
+        for f in CmakeInvocation._cflags:
+            self._flags[f].extend(flags)
+
+    def add_ldflags(self, flags):
+        assert not isinstance(flags, str)
+        for f in CmakeInvocation._ldflags:
+            self._flags[f].extend(flags)
+
+    def to_args(self):
+        args = self._prefix.copy()
+        for key, value in sorted(self._flags.items()):
+            if isinstance(value, list):
+                # We preload all of the list-y values (cflags, ...). If we've
+                # nothing to add, don't.
+                if not value:
+                    continue
+                value = ' '.join(value)
+
+            arg = '-D' + key
+            if value != '':
+                arg += '=' + value
+            args.append(arg)
+        return args
+
+
+class Env:
+    def __init__(self, llvm_dir, use_make, output_dir, default_cmake_args,
+                 dry_run):
+        self.llvm_dir = llvm_dir
+        self.use_make = use_make
+        self.output_dir = output_dir
+        self.default_cmake_args = default_cmake_args.copy()
+        self.dry_run = dry_run
+
+    def get_default_cmake_args_kv(self):
+        return self.default_cmake_args.items()
+
+    def get_cmake_maker(self):
+        return 'Ninja' if not self.use_make else 'Unix Makefiles'
+
+    def get_make_command(self):
+        if self.use_make:
+            return ['make', '-j{}'.format(multiprocessing.cpu_count())]
+        return ['ninja']
+
+    def output_subdir(self, name):
+        return os.path.join(self.output_dir, name)
+
+    def has_llvm_subproject(self, name):
+        if name == 'compiler-rt':
+            subdir = 'projects/compiler-rt'
+        elif name == 'clang':
+            subdir = 'tools/clang'
+        else:
+            raise ValueError('Unknown subproject: %s' % name)
+
+        return os.path.isdir(os.path.join(self.llvm_dir, subdir))
+
+    # Note that we don't allow capturing stdout/stderr. This works quite nicely
+    # with dry_run.
+    def run_command(self,
+                    cmd,
+                    cwd=None,
+                    check=False,
+                    silent_unless_error=False):
+        cmd_str = ' '.join(shlex.quote(s) for s in cmd)
+        print(
+            'Running `%s` in %s' % (cmd_str, shlex.quote(cwd or os.getcwd())))
+
+        if self.dry_run:
+            return
+
+        if silent_unless_error:
+            stdout, stderr = subprocess.PIPE, subprocess.STDOUT
+        else:
+            stdout, stderr = None, None
+
+        # Don't use subprocess.run because it's >= py3.5 only, and it's not too
+        # much extra effort to get what it gives us anyway.
+        popen = subprocess.Popen(
+            cmd,
+            stdin=subprocess.DEVNULL,
+            stdout=stdout,
+            stderr=stderr,
+            cwd=cwd)
+        stdout, _ = popen.communicate()
+        return_code = popen.wait(timeout=0)
+
+        if not return_code:
+            return
+
+        if silent_unless_error:
+            print(stdout.decode('utf-8', 'ignore'))
+
+        if check:
+            raise subprocess.CalledProcessError(
+                returncode=return_code, cmd=cmd, output=stdout, stderr=None)
+
+
+def _get_default_cmake_invocation(env):
+    inv = CmakeInvocation(
+        cmake='cmake', maker=env.get_cmake_maker(), cmake_dir=env.llvm_dir)
+    for key, value in env.get_default_cmake_args_kv():
+        inv.add_new_flag(key, value)
+    return inv
+
+
+def _get_cmake_invocation_for_bootstrap_from(env, out_dir,
+                                             skip_tablegens=True):
+    clang = os.path.join(out_dir, 'bin', 'clang')
+    cmake = _get_default_cmake_invocation(env)
+    cmake.add_new_flag('CMAKE_C_COMPILER', clang)
+    cmake.add_new_flag('CMAKE_CXX_COMPILER', clang + '++')
+
+    # We often get no value out of building new tblgens; the previous build
+    # should have them. It's still correct to build them, just slower.
+    def add_tablegen(key, binary):
+        path = os.path.join(out_dir, 'bin', binary)
+
+        # Check that this exists, since the user's allowed to specify their own
+        # stage1 directory (which is generally where we'll source everything
+        # from). Dry runs should hope for the best from our user, as well.
+        if env.dry_run or os.path.exists(path):
+            cmake.add_new_flag(key, path)
+
+    if skip_tablegens:
+        add_tablegen('LLVM_TABLEGEN', 'llvm-tblgen')
+        add_tablegen('CLANG_TABLEGEN', 'clang-tblgen')
+
+    return cmake
+
+
+def _build_things_in(env, target_dir, what):
+    cmd = env.get_make_command() + what
+    env.run_command(cmd, cwd=target_dir, check=True)
+
+
+def _run_fresh_cmake(env, cmake, target_dir):
+    if not env.dry_run:
+        try:
+            shutil.rmtree(target_dir)
+        except FileNotFoundError:
+            pass
+
+        os.makedirs(target_dir, mode=0o755)
+
+    cmake_args = cmake.to_args()
+    env.run_command(
+        cmake_args, cwd=target_dir, check=True, silent_unless_error=True)
+
+
+def _build_stage1_clang(env):
+    target_dir = env.output_subdir('stage1')
+    cmake = _get_default_cmake_invocation(env)
+    _run_fresh_cmake(env, cmake, target_dir)
+
+    # FIXME: The full build here is somewhat unfortunate. It's primarily
+    # because I don't know what to call libclang_rt.profile for arches that
+    # aren't x86_64 (and even then, it's in a subdir that contains clang's
+    # current version). It would be nice to figure out what target I can
+    # request to magically have libclang_rt.profile built for ${host}
+    _build_things_in(env, target_dir, what=['all'])
+    return target_dir
+
+
+def _generate_instrumented_clang_profile(env, stage1_dir, profile_dir,
+                                         output_file):
+    llvm_profdata = os.path.join(stage1_dir, 'bin', 'llvm-profdata')
+    if env.dry_run:
+        profiles = [os.path.join(profile_dir, '*.profraw')]
+    else:
+        profiles = [
+            os.path.join(profile_dir, f) for f in os.listdir(profile_dir)
+            if f.endswith('.profraw')
+        ]
+    cmd = [llvm_profdata, 'merge', '-output=' + output_file] + profiles
+    env.run_command(cmd, check=True)
+
+
+def _build_instrumented_clang(env, stage1_dir):
+    assert os.path.isabs(stage1_dir)
+
+    target_dir = os.path.join(env.output_dir, 'instrumented')
+    cmake = _get_cmake_invocation_for_bootstrap_from(env, stage1_dir)
+    cmake.add_new_flag('LLVM_BUILD_INSTRUMENTED', 'IR')
+
+    # libcxx's configure step messes with our link order: we'll link
+    # libclang_rt.profile after libgcc, and the former requires atexit from the
+    # latter. So, configure checks fail.
+    #
+    # Since we don't need libcxx or compiler-rt anyway, just disable them.
+    cmake.add_new_flag('LLVM_BUILD_RUNTIME', 'No')
+
+    _run_fresh_cmake(env, cmake, target_dir)
+    _build_things_in(env, target_dir, what=['clang', 'lld'])
+
+    profiles_dir = os.path.join(target_dir, 'profiles')
+    return target_dir, profiles_dir
+
+
+def _build_optimized_clang(env, stage1_dir, profdata_file):
+    if not env.dry_run and not os.path.exists(profdata_file):
+        raise ValueError('Looks like the profdata file at %s doesn\'t exist' %
+                         profdata_file)
+
+    target_dir = os.path.join(env.output_dir, 'optimized')
+    cmake = _get_cmake_invocation_for_bootstrap_from(env, stage1_dir)
+    cmake.add_new_flag('LLVM_PROFDATA_FILE', os.path.abspath(profdata_file))
+
+    # We'll get complaints about hash mismatches in `main` in tools/etc. Ignore
+    # it.
+    cmake.add_cflags(['-Wno-backend-plugin'])
+    _run_fresh_cmake(env, cmake, target_dir)
+    _build_things_in(env, target_dir, what=['clang'])
+    return target_dir
+
+
+Args = collections.namedtuple('Args', [
+    'do_optimized_build',
+    'include_debug_info',
+    'profile_location',
+    'stage1_dir',
+])
+
+
+def _parse_args():
+    parser = argparse.ArgumentParser(
+        description='Builds LLVM and Clang with instrumentation, collects '
+        'instrumentation profiles for them, and (optionally) builds things'
+        'with these PGO profiles. By default, it\'s assumed that you\'re '
+        'running this from your LLVM root, and all build artifacts will be '
+        'saved to $PWD/out.')
+    parser.add_argument(
+        '--cmake-extra-arg',
+        action='append',
+        default=[],
+        help='an extra arg to pass to all cmake invocations. Note that this '
+        'is interpreted as a -D argument, e.g. --cmake-extra-arg FOO=BAR will '
+        'be passed as -DFOO=BAR. This may be specified multiple times.')
+    parser.add_argument(
+        '--dry-run',
+        action='store_true',
+        help='print commands instead of running them')
+    parser.add_argument(
+        '--llvm-dir',
+        default='.',
+        help='directory containing an LLVM checkout (default: $PWD)')
+    parser.add_argument(
+        '--no-optimized-build',
+        action='store_true',
+        help='disable the final, PGO-optimized build')
+    parser.add_argument(
+        '--out-dir',
+        help='directory to write artifacts to (default: $llvm_dir/out)')
+    parser.add_argument(
+        '--profile-output',
+        help='where to output the profile (default is $out/pgo_profile.prof)')
+    parser.add_argument(
+        '--stage1-dir',
+        help='instead of having an initial build of everything, use the given '
+        'directory. It is expected that this directory will have clang, '
+        'llvm-profdata, and the appropriate libclang_rt.profile already built')
+    parser.add_argument(
+        '--use-debug-info-in-benchmark',
+        action='store_true',
+        help='use a regular build instead of RelWithDebInfo in the benchmark. '
+        'This increases benchmark execution time and disk space requirements, '
+        'but gives more coverage over debuginfo bits in LLVM and clang.')
+    parser.add_argument(
+        '--use-make',
+        action='store_true',
+        default=shutil.which('ninja') is None,
+        help='use Makefiles instead of ninja')
+
+    args = parser.parse_args()
+
+    llvm_dir = os.path.abspath(args.llvm_dir)
+    if args.out_dir is None:
+        output_dir = os.path.join(llvm_dir, 'out')
+    else:
+        output_dir = os.path.abspath(args.out_dir)
+
+    extra_args = {'CMAKE_BUILD_TYPE': 'Release'}
+    for arg in args.cmake_extra_arg:
+        if arg.startswith('-D'):
+            arg = arg[2:]
+        elif arg.startswith('-'):
+            raise ValueError('Unknown not- -D arg encountered; you may need '
+                             'to tweak the source...')
+        split = arg.split('=', 1)
+        if len(split) == 1:
+            key, val = split[0], ''
+        else:
+            key, val = split
+        extra_args[key] = val
+
+    env = Env(
+        default_cmake_args=extra_args,
+        dry_run=args.dry_run,
+        llvm_dir=llvm_dir,
+        output_dir=output_dir,
+        use_make=args.use_make,
+    )
+
+    if args.profile_output is not None:
+        profile_location = args.profile_output
+    else:
+        profile_location = os.path.join(env.output_dir, 'pgo_profile.prof')
+
+    result_args = Args(
+        do_optimized_build=not args.no_optimized_build,
+        include_debug_info=args.use_debug_info_in_benchmark,
+        profile_location=profile_location,
+        stage1_dir=args.stage1_dir,
+    )
+
+    return env, result_args
+
+
+def _looks_like_llvm_dir(directory):
+    """Arbitrary set of heuristics to determine if `directory` is an llvm dir.
+
+    Errs on the side of false-positives."""
+
+    contents = set(os.listdir(directory))
+    expected_contents = [
+        'CODE_OWNERS.TXT',
+        'cmake',
+        'docs',
+        'include',
+        'utils',
+    ]
+
+    if not all(c in contents for c in expected_contents):
+        return False
+
+    try:
+        include_listing = os.listdir(os.path.join(directory, 'include'))
+    except NotADirectoryError:
+        return False
+
+    return 'llvm' in include_listing
+
+
+def _die(*args, **kwargs):
+    kwargs['file'] = sys.stderr
+    print(*args, **kwargs)
+    sys.exit(1)
+
+
+def _main():
+    env, args = _parse_args()
+
+    if not _looks_like_llvm_dir(env.llvm_dir):
+        _die('Looks like %s isn\'t an LLVM directory; please see --help' %
+             env.llvm_dir)
+    if not env.has_llvm_subproject('clang'):
+        _die('Need a clang checkout at tools/clang')
+    if not env.has_llvm_subproject('compiler-rt'):
+        _die('Need a compiler-rt checkout at projects/compiler-rt')
+
+    def status(*args):
+        print(*args, file=sys.stderr)
+
+    if args.stage1_dir is None:
+        status('*** Building stage1 clang...')
+        stage1_out = _build_stage1_clang(env)
+    else:
+        stage1_out = args.stage1_dir
+
+    status('*** Building instrumented clang...')
+    instrumented_out, profile_dir = _build_instrumented_clang(env, stage1_out)
+    status('*** Running profdata benchmarks...')
+    _run_benchmark(env, instrumented_out, args.include_debug_info)
+    status('*** Generating profile...')
+    _generate_instrumented_clang_profile(env, stage1_out, profile_dir,
+                                         args.profile_location)
+
+    print('Final profile:', args.profile_location)
+    if args.do_optimized_build:
+        status('*** Building PGO-optimized binaries...')
+        optimized_out = _build_optimized_clang(env, stage1_out,
+                                               args.profile_location)
+        print('Final build directory:', optimized_out)
+
+
+if __name__ == '__main__':
+    _main()
-- 
GitLab


From 556cad18dba3a3b7ed84195e3597423b3fd17080 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Fri, 26 Oct 2018 20:59:55 +0000
Subject: [PATCH 0653/1116] [LegalizeTypes] Stop
 DAGTypeLegalizer::getSETCCWidenedResultTy from creating illegal setccs. Add
 checks for valid setccs

The DAGTypeLegalizer::getSETCCWidenedResultTy was widening the MaskVT, but the code in convertMask called after getSETCCWidenedResultTy had no idea this widening had occurred. So none of the operands were widened when convertMask created new setccs with the widened VT.

This patch removes the widening and adds some asserts to getNode to validate the types of setccs to prevent issues like this in the future.

Differential Revision: https://reviews.llvm.org/D53743

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345428 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/LegalizeTypes.h         |  3 ---
 lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 16 +++-------------
 lib/CodeGen/SelectionDAG/SelectionDAG.cpp        |  8 ++++++++
 3 files changed, 11 insertions(+), 16 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index f31b115bc2d..605c63c72d4 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -849,9 +849,6 @@ private:
   /// MaskVT to ToMaskVT if needed with vector extension or truncation.
   SDValue convertMask(SDValue InMask, EVT MaskVT, EVT ToMaskVT);
 
-  /// Get the target mask VT, and widen if needed.
-  EVT getSETCCWidenedResultTy(SDValue SetCC);
-
   //===--------------------------------------------------------------------===//
   // Generic Splitting: LegalizeTypesGeneric.cpp
   //===--------------------------------------------------------------------===//
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 58446101556..1027f31d084 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -3373,16 +3373,6 @@ SDValue DAGTypeLegalizer::convertMask(SDValue InMask, EVT MaskVT,
   return Mask;
 }
 
-// Get the target mask VT, and widen if needed.
-EVT DAGTypeLegalizer::getSETCCWidenedResultTy(SDValue SetCC) {
-  assert(SetCC->getOpcode() == ISD::SETCC);
-  LLVMContext &Ctx = *DAG.getContext();
-  EVT MaskVT = getSetCCResultType(SetCC->getOperand(0).getValueType());
-  if (getTypeAction(MaskVT) == TargetLowering::TypeWidenVector)
-    MaskVT = TLI.getTypeToTransformTo(Ctx, MaskVT);
-  return MaskVT;
-}
-
 // This method tries to handle VSELECT and its mask by legalizing operands
 // (which may require widening) and if needed adjusting the mask vector type
 // to match that of the VSELECT. Without it, many cases end up with
@@ -3450,7 +3440,7 @@ SDValue DAGTypeLegalizer::WidenVSELECTAndMask(SDNode *N) {
 
   SDValue Mask;
   if (Cond->getOpcode() == ISD::SETCC) {
-    EVT MaskVT = getSETCCWidenedResultTy(Cond);
+    EVT MaskVT = getSetCCResultType(Cond.getOperand(0).getValueType());
     Mask = convertMask(Cond, MaskVT, ToMaskVT);
   } else if (isLogicalMaskOp(Cond->getOpcode()) &&
              Cond->getOperand(0).getOpcode() == ISD::SETCC &&
@@ -3458,8 +3448,8 @@ SDValue DAGTypeLegalizer::WidenVSELECTAndMask(SDNode *N) {
     // Cond is (AND/OR/XOR (SETCC, SETCC))
     SDValue SETCC0 = Cond->getOperand(0);
     SDValue SETCC1 = Cond->getOperand(1);
-    EVT VT0 = getSETCCWidenedResultTy(SETCC0);
-    EVT VT1 = getSETCCWidenedResultTy(SETCC1);
+    EVT VT0 = getSetCCResultType(SETCC0.getOperand(0).getValueType());
+    EVT VT1 = getSetCCResultType(SETCC1.getOperand(0).getValueType());
     unsigned ScalarBits0 = VT0.getScalarSizeInBits();
     unsigned ScalarBits1 = VT1.getScalarSizeInBits();
     unsigned ScalarBits_ToMask = ToMaskVT.getScalarSizeInBits();
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 2d99a6aecb5..23abbf10f53 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5027,6 +5027,14 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
     break;
   }
   case ISD::SETCC: {
+    assert(VT.isInteger() && "SETCC result type must be an integer!");
+    assert(N1.getValueType() == N2.getValueType() &&
+           "SETCC operands must have the same type!");
+    assert(VT.isVector() == N1.getValueType().isVector() &&
+           "SETCC type should be vector iff the operand type is vector!");
+    assert((!VT.isVector() ||
+            VT.getVectorNumElements() == N1.getValueType().getVectorNumElements()) &&
+           "SETCC vector element counts must match!");
     // Use FoldSetCC to simplify SETCC's.
     if (SDValue V = FoldSetCC(VT, N1, N2, cast<CondCodeSDNode>(N3)->get(), DL))
       return V;
-- 
GitLab


From 52931b4520cfe51b3c690eec57d4e2fd1fc92d4e Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Fri, 26 Oct 2018 21:05:14 +0000
Subject: [PATCH 0654/1116] [ValueTracking] peek through shuffles in
 ComputeNumSignBits (PR37549)

The motivating case is from PR37549:
https://bugs.llvm.org/show_bug.cgi?id=37549

The analysis improvement allows us to form a vector 'select' out of
bitwise logic (the use of ComputeNumSignBits was added at rL345149).

The smaller test shows another InstCombine improvement - we use
ComputeNumSignBits to add 'nsw' to shift-left. But the negative
test shows an example where we must not add 'nsw' - when the shuffle
mask contains undef elements.

Differential Revision: https://reviews.llvm.org/D53659


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345429 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Analysis/ValueTracking.cpp                | 21 +++++++++++++++++++
 test/Transforms/InstCombine/logical-select.ll | 11 ++++------
 test/Transforms/InstCombine/nsw.ll            | 20 ++++++++++++++++--
 3 files changed, 43 insertions(+), 9 deletions(-)

diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp
index b7ff81f9d54..3cef373f324 100644
--- a/lib/Analysis/ValueTracking.cpp
+++ b/lib/Analysis/ValueTracking.cpp
@@ -2510,6 +2510,27 @@ static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth,
     // valid for all elements of the vector (for example if vector is sign
     // extended, shifted, etc).
     return ComputeNumSignBits(U->getOperand(0), Depth + 1, Q);
+
+  case Instruction::ShuffleVector:
+    // If the shuffle mask contains any undefined elements, that element of the
+    // result is undefined. Propagating information from a source operand may
+    // not be correct in that case, so just bail out.
+    if (cast<ShuffleVectorInst>(U)->getMask()->containsUndefElement())
+      break;
+
+    assert((!isa<UndefValue>(U->getOperand(0)) ||
+            !isa<UndefValue>(U->getOperand(1)))
+           && "Should have simplified shuffle with 2 undef inputs");
+
+    // Look through shuffle of 1 source vector.
+    if (isa<UndefValue>(U->getOperand(0)))
+      return ComputeNumSignBits(U->getOperand(1), Depth + 1, Q);
+    if (isa<UndefValue>(U->getOperand(1)))
+      return ComputeNumSignBits(U->getOperand(0), Depth + 1, Q);
+
+    // TODO: We can look through shuffles of 2 sources by computing the minimum
+    // sign bits for each operand (similar to what we do for binops).
+    break;
   }
 
   // Finally, if we can prove that the top bits of the result are 0's or 1's,
diff --git a/test/Transforms/InstCombine/logical-select.ll b/test/Transforms/InstCombine/logical-select.ll
index db1eae05083..b4260af75b4 100644
--- a/test/Transforms/InstCombine/logical-select.ll
+++ b/test/Transforms/InstCombine/logical-select.ll
@@ -594,13 +594,10 @@ define <4 x i32> @computesignbits_through_shuffles(<4 x float> %x, <4 x float> %
 ; CHECK-NEXT:    [[S3:%.*]] = shufflevector <4 x i32> [[SHUF_OR1]], <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
 ; CHECK-NEXT:    [[S4:%.*]] = shufflevector <4 x i32> [[SHUF_OR1]], <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 3, i32 3>
 ; CHECK-NEXT:    [[SHUF_OR2:%.*]] = or <4 x i32> [[S3]], [[S4]]
-; CHECK-NEXT:    [[NOT_OR2:%.*]] = xor <4 x i32> [[SHUF_OR2]], <i32 -1, i32 -1, i32 -1, i32 -1>
-; CHECK-NEXT:    [[XBC:%.*]] = bitcast <4 x float> [[X]] to <4 x i32>
-; CHECK-NEXT:    [[ZBC:%.*]] = bitcast <4 x float> [[Z:%.*]] to <4 x i32>
-; CHECK-NEXT:    [[AND1:%.*]] = and <4 x i32> [[NOT_OR2]], [[XBC]]
-; CHECK-NEXT:    [[AND2:%.*]] = and <4 x i32> [[SHUF_OR2]], [[ZBC]]
-; CHECK-NEXT:    [[SEL:%.*]] = or <4 x i32> [[AND1]], [[AND2]]
-; CHECK-NEXT:    ret <4 x i32> [[SEL]]
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <4 x i32> [[SHUF_OR2]] to <4 x i1>
+; CHECK-NEXT:    [[DOTV:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[Z:%.*]], <4 x float> [[X]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[DOTV]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 ;
   %cmp = fcmp ole <4 x float> %x, %y
   %sext = sext <4 x i1> %cmp to <4 x i32>
diff --git a/test/Transforms/InstCombine/nsw.ll b/test/Transforms/InstCombine/nsw.ll
index ab2cbb2d865..8cb6421268f 100644
--- a/test/Transforms/InstCombine/nsw.ll
+++ b/test/Transforms/InstCombine/nsw.ll
@@ -99,13 +99,13 @@ define i8 @nopreserve4(i8 %A, i8 %B) {
   ret i8 %add
 }
 
-; TODO: ComputeNumSignBits()/computeKnownBits() should look through a shufflevector.
+; TODO: computeKnownBits() should look through a shufflevector.
 
 define <3 x i32> @shl_nuw_nsw_shuffle_splat_vec(<2 x i8> %x) {
 ; CHECK-LABEL: @shl_nuw_nsw_shuffle_splat_vec(
 ; CHECK-NEXT:    [[T2:%.*]] = zext <2 x i8> [[X:%.*]] to <2 x i32>
 ; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <2 x i32> [[T2]], <2 x i32> undef, <3 x i32> <i32 1, i32 0, i32 1>
-; CHECK-NEXT:    [[T3:%.*]] = shl <3 x i32> [[SHUF]], <i32 17, i32 17, i32 17>
+; CHECK-NEXT:    [[T3:%.*]] = shl nsw <3 x i32> [[SHUF]], <i32 17, i32 17, i32 17>
 ; CHECK-NEXT:    ret <3 x i32> [[T3]]
 ;
   %t2 = zext <2 x i8> %x to <2 x i32>
@@ -114,3 +114,19 @@ define <3 x i32> @shl_nuw_nsw_shuffle_splat_vec(<2 x i8> %x) {
   ret <3 x i32> %t3
 }
 
+; Negative test - if the shuffle mask contains an undef, we bail out to
+; avoid propagating information that may not be used consistently by callers.
+
+define <3 x i32> @shl_nuw_nsw_shuffle_undef_elt_splat_vec(<2 x i8> %x) {
+; CHECK-LABEL: @shl_nuw_nsw_shuffle_undef_elt_splat_vec(
+; CHECK-NEXT:    [[T2:%.*]] = zext <2 x i8> [[X:%.*]] to <2 x i32>
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <2 x i32> [[T2]], <2 x i32> undef, <3 x i32> <i32 1, i32 undef, i32 0>
+; CHECK-NEXT:    [[T3:%.*]] = shl <3 x i32> [[SHUF]], <i32 17, i32 17, i32 17>
+; CHECK-NEXT:    ret <3 x i32> [[T3]]
+;
+  %t2 = zext <2 x i8> %x to <2 x i32>
+  %shuf = shufflevector <2 x i32> %t2, <2 x i32> undef, <3 x i32> <i32 1, i32 undef, i32 0>
+  %t3 = shl <3 x i32> %shuf, <i32 17, i32 17, i32 17>
+  ret <3 x i32> %t3
+}
+
-- 
GitLab


From 36943624b277e5c64b76dd42c89a132db7e7d16c Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Fri, 26 Oct 2018 21:32:04 +0000
Subject: [PATCH 0655/1116] [DAGCombiner] rearrange code in
 narrowExtractedVectorBinOp(); NFC

We can extend this code to handle many more cases
if an extract is cheap, so prepping for that change.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345430 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 46 ++++++++++++------------
 1 file changed, 24 insertions(+), 22 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index e8584921c42..318e398211c 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16681,30 +16681,14 @@ static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) {
   // some of these bailouts with other transforms.
 
   // The extract index must be a constant, so we can map it to a concat operand.
-  auto *ExtractIndex = dyn_cast<ConstantSDNode>(Extract->getOperand(1));
-  if (!ExtractIndex)
-    return SDValue();
-
-  // Only handle the case where we are doubling and then halving. A larger ratio
-  // may require more than two narrow binops to replace the wide binop.
-  EVT VT = Extract->getValueType(0);
-  unsigned NumElems = VT.getVectorNumElements();
-  assert((ExtractIndex->getZExtValue() % NumElems) == 0 &&
-         "Extract index is not a multiple of the vector length.");
-  if (Extract->getOperand(0).getValueSizeInBits() != VT.getSizeInBits() * 2)
+  auto *ExtractIndexC = dyn_cast<ConstantSDNode>(Extract->getOperand(1));
+  if (!ExtractIndexC)
     return SDValue();
 
   // We are looking for an optionally bitcasted wide vector binary operator
   // feeding an extract subvector.
   SDValue BinOp = peekThroughBitcasts(Extract->getOperand(0));
-
-  // TODO: The motivating case for this transform is an x86 AVX1 target. That
-  // target has temptingly almost legal versions of bitwise logic ops in 256-bit
-  // flavors, but no other 256-bit integer support. This could be extended to
-  // handle any binop, but that may require fixing/adding other folds to avoid
-  // codegen regressions.
-  unsigned BOpcode = BinOp.getOpcode();
-  if (BOpcode != ISD::AND && BOpcode != ISD::OR && BOpcode != ISD::XOR)
+  if (!ISD::isBinaryOp(BinOp.getNode()))
     return SDValue();
 
   // The binop must be a vector type, so we can chop it in half.
@@ -16713,18 +16697,36 @@ static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) {
     return SDValue();
 
   // Bail out if the target does not support a narrower version of the binop.
+  unsigned BOpcode = BinOp.getOpcode();
   EVT NarrowBVT = EVT::getVectorVT(*DAG.getContext(), WideBVT.getScalarType(),
                                    WideBVT.getVectorNumElements() / 2);
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   if (!TLI.isOperationLegalOrCustomOrPromote(BOpcode, NarrowBVT))
     return SDValue();
 
-  SDValue LHS = peekThroughBitcasts(BinOp.getOperand(0));
-  SDValue RHS = peekThroughBitcasts(BinOp.getOperand(1));
+  // Only handle the case where we are doubling and then halving. A larger ratio
+  // may require more than two narrow binops to replace the wide binop.
+  EVT VT = Extract->getValueType(0);
+  unsigned NumElems = VT.getVectorNumElements();
+  unsigned ExtractIndex = ExtractIndexC->getZExtValue();
+  assert(ExtractIndex % NumElems == 0 &&
+         "Extract index is not a multiple of the vector length.");
+  if (Extract->getOperand(0).getValueSizeInBits() != VT.getSizeInBits() * 2)
+    return SDValue();
+
+  // TODO: The motivating case for this transform is an x86 AVX1 target. That
+  // target has temptingly almost legal versions of bitwise logic ops in 256-bit
+  // flavors, but no other 256-bit integer support. This could be extended to
+  // handle any binop, but that may require fixing/adding other folds to avoid
+  // codegen regressions.
+  if (BOpcode != ISD::AND && BOpcode != ISD::OR && BOpcode != ISD::XOR)
+    return SDValue();
 
   // We need at least one concatenation operation of a binop operand to make
   // this transform worthwhile. The concat must double the input vector sizes.
   // TODO: Should we also handle INSERT_SUBVECTOR patterns?
+  SDValue LHS = peekThroughBitcasts(BinOp.getOperand(0));
+  SDValue RHS = peekThroughBitcasts(BinOp.getOperand(1));
   bool ConcatL =
       LHS.getOpcode() == ISD::CONCAT_VECTORS && LHS.getNumOperands() == 2;
   bool ConcatR =
@@ -16735,7 +16737,7 @@ static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) {
   // If one of the binop operands was not the result of a concat, we must
   // extract a half-sized operand for our new narrow binop. We can't just reuse
   // the original extract index operand because we may have bitcasted.
-  unsigned ConcatOpNum = ExtractIndex->getZExtValue() / NumElems;
+  unsigned ConcatOpNum = ExtractIndex / NumElems;
   unsigned ExtBOIdx = ConcatOpNum * NarrowBVT.getVectorNumElements();
   EVT ExtBOIdxVT = Extract->getOperand(1).getValueType();
   SDLoc DL(Extract);
-- 
GitLab


From 2d7ab83128c3609622fcc790dbe28185c2bd355a Mon Sep 17 00:00:00 2001
From: Volodymyr Sapsai <vsapsai@apple.com>
Date: Fri, 26 Oct 2018 22:14:33 +0000
Subject: [PATCH 0656/1116] [VFS] Add property 'fallthrough' that controls
 fallback to real file system.

Default property value 'true' preserves current behavior. Value 'false' can be
used to create VFS "root", file system that gives better control over which
files compiler can use during compilation as there are no unpredictable
accesses to real file system.

Non-fallthrough use case changes how we treat multiple VFS overlay
files. Instead of all of them being at the same level just above a real
file system, now they are nested and subsequent overlays can refer to
files in previous overlays.

rdar://problem/39465552

Reviewers: bruno, benlangmuir

Reviewed By: bruno

Subscribers: dexonsmith, cfe-commits, hiraditya

Differential Revision: https://reviews.llvm.org/D50539


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345431 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Support/VirtualFileSystem.cpp           | 114 +++++++++++++++++---
 unittests/Support/VirtualFileSystemTest.cpp |  86 +++++++++++++++
 2 files changed, 185 insertions(+), 15 deletions(-)

diff --git a/lib/Support/VirtualFileSystem.cpp b/lib/Support/VirtualFileSystem.cpp
index cf7fe967f01..c9920197fba 100644
--- a/lib/Support/VirtualFileSystem.cpp
+++ b/lib/Support/VirtualFileSystem.cpp
@@ -993,16 +993,44 @@ public:
   static bool classof(const Entry *E) { return E->getKind() == EK_File; }
 };
 
+// FIXME: reuse implementation common with OverlayFSDirIterImpl as these
+// iterators are conceptually similar.
 class VFSFromYamlDirIterImpl : public llvm::vfs::detail::DirIterImpl {
   std::string Dir;
   RedirectingDirectoryEntry::iterator Current, End;
 
-  std::error_code incrementImpl();
+  // To handle 'fallthrough' mode we need to iterate at first through
+  // RedirectingDirectoryEntry and then through ExternalFS. These operations are
+  // done sequentially, we just need to keep a track of what kind of iteration
+  // we are currently performing.
+
+  /// Flag telling if we should iterate through ExternalFS or stop at the last
+  /// RedirectingDirectoryEntry::iterator.
+  bool IterateExternalFS;
+  /// Flag telling if we have switched to iterating through ExternalFS.
+  bool IsExternalFSCurrent = false;
+  FileSystem &ExternalFS;
+  directory_iterator ExternalDirIter;
+  llvm::StringSet<> SeenNames;
+
+  /// To combine multiple iterations, different methods are responsible for
+  /// different iteration steps.
+  /// @{
+
+  /// Responsible for dispatching between RedirectingDirectoryEntry iteration
+  /// and ExternalFS iteration.
+  std::error_code incrementImpl(bool IsFirstTime);
+  /// Responsible for RedirectingDirectoryEntry iteration.
+  std::error_code incrementContent(bool IsFirstTime);
+  /// Responsible for ExternalFS iteration.
+  std::error_code incrementExternal();
+  /// @}
 
 public:
   VFSFromYamlDirIterImpl(const Twine &Path,
                          RedirectingDirectoryEntry::iterator Begin,
                          RedirectingDirectoryEntry::iterator End,
+                         bool IterateExternalFS, FileSystem &ExternalFS,
                          std::error_code &EC);
 
   std::error_code increment() override;
@@ -1028,6 +1056,7 @@ public:
 ///   'case-sensitive': <boolean, default=true>
 ///   'use-external-names': <boolean, default=true>
 ///   'overlay-relative': <boolean, default=false>
+///   'fallthrough': <boolean, default=true>
 ///
 /// Virtual directories are represented as
 /// \verbatim
@@ -1091,6 +1120,10 @@ class RedirectingFileSystem : public vfs::FileSystem {
   /// Whether to use to use the value of 'external-contents' for the
   /// names of files.  This global value is overridable on a per-file basis.
   bool UseExternalNames = true;
+
+  /// Whether to attempt a file lookup in external file system after it wasn't
+  /// found in VFS.
+  bool IsFallthrough = true;
   /// @}
 
   /// Virtual file paths and external files could be canonicalized without "..",
@@ -1141,6 +1174,8 @@ public:
     ErrorOr<Entry *> E = lookupPath(Dir);
     if (!E) {
       EC = E.getError();
+      if (IsFallthrough && EC == errc::no_such_file_or_directory)
+        return ExternalFS->dir_begin(Dir, EC);
       return {};
     }
     ErrorOr<Status> S = status(Dir, *E);
@@ -1156,7 +1191,8 @@ public:
 
     auto *D = cast<RedirectingDirectoryEntry>(*E);
     return directory_iterator(std::make_shared<VFSFromYamlDirIterImpl>(
-        Dir, D->contents_begin(), D->contents_end(), EC));
+        Dir, D->contents_begin(), D->contents_end(),
+        /*IterateExternalFS=*/IsFallthrough, *ExternalFS, EC));
   }
 
   void setExternalContentsPrefixDir(StringRef PrefixDir) {
@@ -1538,6 +1574,7 @@ public:
         KeyStatusPair("case-sensitive", false),
         KeyStatusPair("use-external-names", false),
         KeyStatusPair("overlay-relative", false),
+        KeyStatusPair("fallthrough", false),
         KeyStatusPair("roots", true),
     };
 
@@ -1595,6 +1632,9 @@ public:
       } else if (Key == "use-external-names") {
         if (!parseScalarBool(I.getValue(), FS->UseExternalNames))
           return false;
+      } else if (Key == "fallthrough") {
+        if (!parseScalarBool(I.getValue(), FS->IsFallthrough))
+          return false;
       } else {
         llvm_unreachable("key missing from Keys");
       }
@@ -1760,8 +1800,13 @@ ErrorOr<Status> RedirectingFileSystem::status(const Twine &Path, Entry *E) {
 
 ErrorOr<Status> RedirectingFileSystem::status(const Twine &Path) {
   ErrorOr<Entry *> Result = lookupPath(Path);
-  if (!Result)
+  if (!Result) {
+    if (IsFallthrough &&
+        Result.getError() == llvm::errc::no_such_file_or_directory) {
+      return ExternalFS->status(Path);
+    }
     return Result.getError();
+  }
   return status(Path, *Result);
 }
 
@@ -1793,8 +1838,13 @@ public:
 ErrorOr<std::unique_ptr<File>>
 RedirectingFileSystem::openFileForRead(const Twine &Path) {
   ErrorOr<Entry *> E = lookupPath(Path);
-  if (!E)
+  if (!E) {
+    if (IsFallthrough &&
+        E.getError() == llvm::errc::no_such_file_or_directory) {
+      return ExternalFS->openFileForRead(Path);
+    }
     return E.getError();
+  }
 
   auto *F = dyn_cast<RedirectingFileEntry>(*E);
   if (!F) // FIXME: errc::not_a_file?
@@ -2035,18 +2085,42 @@ void YAMLVFSWriter::write(llvm::raw_ostream &OS) {
 
 VFSFromYamlDirIterImpl::VFSFromYamlDirIterImpl(
     const Twine &_Path, RedirectingDirectoryEntry::iterator Begin,
-    RedirectingDirectoryEntry::iterator End, std::error_code &EC)
-    : Dir(_Path.str()), Current(Begin), End(End) {
-  EC = incrementImpl();
+    RedirectingDirectoryEntry::iterator End, bool IterateExternalFS,
+    FileSystem &ExternalFS, std::error_code &EC)
+    : Dir(_Path.str()), Current(Begin), End(End),
+      IterateExternalFS(IterateExternalFS), ExternalFS(ExternalFS) {
+  EC = incrementImpl(/*IsFirstTime=*/true);
 }
 
 std::error_code VFSFromYamlDirIterImpl::increment() {
-  assert(Current != End && "cannot iterate past end");
-  ++Current;
-  return incrementImpl();
+  return incrementImpl(/*IsFirstTime=*/false);
+}
+
+std::error_code VFSFromYamlDirIterImpl::incrementExternal() {
+  assert(!(IsExternalFSCurrent && ExternalDirIter == directory_iterator()) &&
+         "incrementing past end");
+  std::error_code EC;
+  if (IsExternalFSCurrent) {
+    ExternalDirIter.increment(EC);
+  } else if (IterateExternalFS) {
+    ExternalDirIter = ExternalFS.dir_begin(Dir, EC);
+    IsExternalFSCurrent = true;
+    if (EC && EC != errc::no_such_file_or_directory)
+      return EC;
+    EC = {};
+  }
+  if (EC || ExternalDirIter == directory_iterator()) {
+    CurrentEntry = directory_entry();
+  } else {
+    CurrentEntry = *ExternalDirIter;
+  }
+  return EC;
 }
 
-std::error_code VFSFromYamlDirIterImpl::incrementImpl() {
+std::error_code VFSFromYamlDirIterImpl::incrementContent(bool IsFirstTime) {
+  assert(IsFirstTime || Current != End && "cannot iterate past end");
+  if (!IsFirstTime)
+    ++Current;
   while (Current != End) {
     SmallString<128> PathStr(Dir);
     llvm::sys::path::append(PathStr, (*Current)->getName());
@@ -2060,12 +2134,22 @@ std::error_code VFSFromYamlDirIterImpl::incrementImpl() {
       break;
     }
     CurrentEntry = directory_entry(PathStr.str(), Type);
-    break;
+    return {};
   }
+  return incrementExternal();
+}
 
-  if (Current == End)
-    CurrentEntry = directory_entry();
-  return {};
+std::error_code VFSFromYamlDirIterImpl::incrementImpl(bool IsFirstTime) {
+  while (true) {
+    std::error_code EC = IsExternalFSCurrent ? incrementExternal()
+                                             : incrementContent(IsFirstTime);
+    if (EC || CurrentEntry.path().empty())
+      return EC;
+    StringRef Name = llvm::sys::path::filename(CurrentEntry.path());
+    if (SeenNames.insert(Name).second)
+      return EC; // name not seen before
+  }
+  llvm_unreachable("returned above");
 }
 
 vfs::recursive_directory_iterator::recursive_directory_iterator(
diff --git a/unittests/Support/VirtualFileSystemTest.cpp b/unittests/Support/VirtualFileSystemTest.cpp
index 58d928516f9..992704c18fa 100644
--- a/unittests/Support/VirtualFileSystemTest.cpp
+++ b/unittests/Support/VirtualFileSystemTest.cpp
@@ -1599,3 +1599,89 @@ TEST_F(VFSFromYAMLTest, RelativePaths) {
 
   EXPECT_EQ(3, NumDiagnostics);
 }
+
+TEST_F(VFSFromYAMLTest, NonFallthroughDirectoryIteration) {
+  IntrusiveRefCntPtr<DummyFileSystem> Lower(new DummyFileSystem());
+  Lower->addDirectory("//root/");
+  Lower->addRegularFile("//root/a");
+  Lower->addRegularFile("//root/b");
+  IntrusiveRefCntPtr<vfs::FileSystem> FS = getFromYAMLString(
+      "{ 'use-external-names': false,\n"
+      "  'fallthrough': false,\n"
+      "  'roots': [\n"
+      "{\n"
+      "  'type': 'directory',\n"
+      "  'name': '//root/',\n"
+      "  'contents': [ {\n"
+      "                  'type': 'file',\n"
+      "                  'name': 'c',\n"
+      "                  'external-contents': '//root/a'\n"
+      "                }\n"
+      "              ]\n"
+      "}\n"
+      "]\n"
+      "}",
+      Lower);
+  ASSERT_TRUE(FS.get() != nullptr);
+
+  std::error_code EC;
+  checkContents(FS->dir_begin("//root/", EC),
+                {"//root/c"});
+}
+
+TEST_F(VFSFromYAMLTest, DirectoryIterationWithDuplicates) {
+  IntrusiveRefCntPtr<DummyFileSystem> Lower(new DummyFileSystem());
+  Lower->addDirectory("//root/");
+  Lower->addRegularFile("//root/a");
+  Lower->addRegularFile("//root/b");
+  IntrusiveRefCntPtr<vfs::FileSystem> FS = getFromYAMLString(
+      "{ 'use-external-names': false,\n"
+      "  'roots': [\n"
+      "{\n"
+      "  'type': 'directory',\n"
+      "  'name': '//root/',\n"
+      "  'contents': [ {\n"
+      "                  'type': 'file',\n"
+      "                  'name': 'a',\n"
+      "                  'external-contents': '//root/a'\n"
+      "                }\n"
+      "              ]\n"
+      "}\n"
+      "]\n"
+      "}",
+	  Lower);
+  ASSERT_TRUE(FS.get() != nullptr);
+
+  std::error_code EC;
+  checkContents(FS->dir_begin("//root/", EC),
+                {"//root/a", "//root/b"});
+}
+
+TEST_F(VFSFromYAMLTest, DirectoryIterationErrorInVFSLayer) {
+  IntrusiveRefCntPtr<DummyFileSystem> Lower(new DummyFileSystem());
+  Lower->addDirectory("//root/");
+  Lower->addDirectory("//root/foo");
+  Lower->addRegularFile("//root/foo/a");
+  Lower->addRegularFile("//root/foo/b");
+  IntrusiveRefCntPtr<vfs::FileSystem> FS = getFromYAMLString(
+      "{ 'use-external-names': false,\n"
+      "  'roots': [\n"
+      "{\n"
+      "  'type': 'directory',\n"
+      "  'name': '//root/',\n"
+      "  'contents': [ {\n"
+      "                  'type': 'file',\n"
+      "                  'name': 'bar/a',\n"
+      "                  'external-contents': '//root/foo/a'\n"
+      "                }\n"
+      "              ]\n"
+      "}\n"
+      "]\n"
+      "}",
+      Lower);
+  ASSERT_TRUE(FS.get() != nullptr);
+
+  std::error_code EC;
+  checkContents(FS->dir_begin("//root/foo", EC),
+                {"//root/foo/a", "//root/foo/b"});
+}
-- 
GitLab


From b49d1c1b41a5a918f0cf99b57a9c464698595397 Mon Sep 17 00:00:00 2001
From: Leonard Chan <leonardchan@google.com>
Date: Fri, 26 Oct 2018 22:51:51 +0000
Subject: [PATCH 0657/1116] Revert "[PassManager/Sanitizer] Enable usage of
 ported AddressSanitizer passes with -fsanitize=address"

This reverts commit 8d6af840396f2da2e4ed6aab669214ae25443204 and commit
b78d19c287b6e4a9abc9fb0545de9a3106d38d3d which causes slower build times
by initializing the AddressSanitizer on every function run.

The corresponding revisions are https://reviews.llvm.org/D52814 and
https://reviews.llvm.org/D52739.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345433 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/InitializePasses.h               |   4 +-
 .../Instrumentation/AddressSanitizerPass.h    |  41 ----
 lib/Passes/PassBuilder.cpp                    |   3 +-
 lib/Passes/PassRegistry.def                   |   2 -
 .../Instrumentation/AddressSanitizer.cpp      | 180 ++++++------------
 .../Instrumentation/Instrumentation.cpp       |   4 +-
 .../Instrumentation/AddressSanitizer/basic.ll |   2 -
 7 files changed, 68 insertions(+), 168 deletions(-)
 delete mode 100644 include/llvm/Transforms/Instrumentation/AddressSanitizerPass.h

diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h
index 42bfc55b1aa..1a9c6f82bfd 100644
--- a/include/llvm/InitializePasses.h
+++ b/include/llvm/InitializePasses.h
@@ -65,8 +65,8 @@ void initializeAAEvalLegacyPassPass(PassRegistry&);
 void initializeAAResultsWrapperPassPass(PassRegistry&);
 void initializeADCELegacyPassPass(PassRegistry&);
 void initializeAddDiscriminatorsLegacyPassPass(PassRegistry&);
-void initializeAddressSanitizerModuleLegacyPassPass(PassRegistry &);
-void initializeAddressSanitizerLegacyPassPass(PassRegistry &);
+void initializeAddressSanitizerModulePass(PassRegistry&);
+void initializeAddressSanitizerPass(PassRegistry&);
 void initializeAggressiveInstCombinerLegacyPassPass(PassRegistry&);
 void initializeAliasSetPrinterPass(PassRegistry&);
 void initializeAlignmentFromAssumptionsPass(PassRegistry&);
diff --git a/include/llvm/Transforms/Instrumentation/AddressSanitizerPass.h b/include/llvm/Transforms/Instrumentation/AddressSanitizerPass.h
deleted file mode 100644
index 021e1bd4c24..00000000000
--- a/include/llvm/Transforms/Instrumentation/AddressSanitizerPass.h
+++ /dev/null
@@ -1,41 +0,0 @@
-//===--------- Definition of the AddressSanitizer class ---------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file declares the AddressSanitizer class which is a port of the legacy
-// AddressSanitizer pass to use the new PassManager infrastructure.
-//
-//===----------------------------------------------------------------------===//
-#ifndef LLVM_TRANSFORMS_INSTRUMENTATION_ADDRESSSANITIZERPASS_H
-#define LLVM_TRANSFORMS_INSTRUMENTATION_ADDRESSSANITIZERPASS_H
-
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/PassManager.h"
-
-namespace llvm {
-
-/// Public interface to the address sanitizer pass for instrumenting code to
-/// check for various memory bugs.
-class AddressSanitizerPass : public PassInfoMixin<AddressSanitizerPass> {
-public:
-  explicit AddressSanitizerPass(bool CompileKernel = false,
-                                bool Recover = false,
-                                bool UseAfterScope = false);
-  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
-  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
-
-private:
-  bool CompileKernel;
-  bool Recover;
-  bool UseAfterScope;
-};
-
-} // namespace llvm
-
-#endif
diff --git a/lib/Passes/PassBuilder.cpp b/lib/Passes/PassBuilder.cpp
index 90561b05e62..c23c8c8d47a 100644
--- a/lib/Passes/PassBuilder.cpp
+++ b/lib/Passes/PassBuilder.cpp
@@ -62,6 +62,7 @@
 #include "llvm/Support/Regex.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h"
+#include "llvm/Transforms/Instrumentation/CGProfile.h"
 #include "llvm/Transforms/IPO/AlwaysInliner.h"
 #include "llvm/Transforms/IPO/ArgumentPromotion.h"
 #include "llvm/Transforms/IPO/CalledValuePropagation.h"
@@ -87,9 +88,7 @@
 #include "llvm/Transforms/IPO/SyntheticCountsPropagation.h"
 #include "llvm/Transforms/IPO/WholeProgramDevirt.h"
 #include "llvm/Transforms/InstCombine/InstCombine.h"
-#include "llvm/Transforms/Instrumentation/AddressSanitizerPass.h"
 #include "llvm/Transforms/Instrumentation/BoundsChecking.h"
-#include "llvm/Transforms/Instrumentation/CGProfile.h"
 #include "llvm/Transforms/Instrumentation/ControlHeightReduction.h"
 #include "llvm/Transforms/Instrumentation/GCOVProfiler.h"
 #include "llvm/Transforms/Instrumentation/InstrProfiling.h"
diff --git a/lib/Passes/PassRegistry.def b/lib/Passes/PassRegistry.def
index ad03942fb9a..8de4541a772 100644
--- a/lib/Passes/PassRegistry.def
+++ b/lib/Passes/PassRegistry.def
@@ -40,7 +40,6 @@ MODULE_ALIAS_ANALYSIS("globals-aa", GlobalsAA())
 #define MODULE_PASS(NAME, CREATE_PASS)
 #endif
 MODULE_PASS("always-inline", AlwaysInlinerPass())
-MODULE_PASS("asan", AddressSanitizerPass(false, false, true))
 MODULE_PASS("called-value-propagation", CalledValuePropagationPass())
 MODULE_PASS("cg-profile", CGProfilePass())
 MODULE_PASS("constmerge", ConstantMergePass())
@@ -148,7 +147,6 @@ FUNCTION_PASS("adce", ADCEPass())
 FUNCTION_PASS("add-discriminators", AddDiscriminatorsPass())
 FUNCTION_PASS("aggressive-instcombine", AggressiveInstCombinePass())
 FUNCTION_PASS("alignment-from-assumptions", AlignmentFromAssumptionsPass())
-FUNCTION_PASS("asan", AddressSanitizerPass(false, false, false))
 FUNCTION_PASS("bdce", BDCEPass())
 FUNCTION_PASS("bounds-checking", BoundsCheckingPass())
 FUNCTION_PASS("break-crit-edges", BreakCriticalEdgesPass())
diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index dcbaf7a62f2..42b8179f800 100644
--- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -25,6 +25,7 @@
 #include "llvm/ADT/Twine.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/BinaryFormat/MachO.h"
 #include "llvm/IR/Argument.h"
@@ -69,10 +70,8 @@
 #include "llvm/Support/ScopedPrinter.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Instrumentation.h"
-#include "llvm/Transforms/Instrumentation/AddressSanitizerPass.h"
 #include "llvm/Transforms/Utils/ASanStackFrameLayout.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
 #include "llvm/Transforms/Utils/PromoteMemToReg.h"
 #include <algorithm>
@@ -598,22 +597,26 @@ static size_t RedzoneSizeForScale(int MappingScale) {
 namespace {
 
 /// AddressSanitizer: instrument the code in module to find memory bugs.
-struct AddressSanitizer {
-  explicit AddressSanitizer(Module &M, DominatorTree *DT,
-                            bool CompileKernel = false, bool Recover = false,
+struct AddressSanitizer : public FunctionPass {
+  // Pass identification, replacement for typeid
+  static char ID;
+
+  explicit AddressSanitizer(bool CompileKernel = false, bool Recover = false,
                             bool UseAfterScope = false)
-      : UseAfterScope(UseAfterScope || ClUseAfterScope), DT(DT) {
+      : FunctionPass(ID), UseAfterScope(UseAfterScope || ClUseAfterScope) {
     this->Recover = ClRecover.getNumOccurrences() > 0 ? ClRecover : Recover;
     this->CompileKernel = ClEnableKasan.getNumOccurrences() > 0 ?
         ClEnableKasan : CompileKernel;
+    initializeAddressSanitizerPass(*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override {
+    return "AddressSanitizerFunctionPass";
+  }
 
-    // Initialize the private fields. No one has accessed them before.
-    GlobalsMD.init(M);
-    C = &(M.getContext());
-    LongSize = M.getDataLayout().getPointerSizeInBits();
-    IntptrTy = Type::getIntNTy(*C, LongSize);
-    TargetTriple = Triple(M.getTargetTriple());
-    Mapping = getShadowMapping(TargetTriple, LongSize, CompileKernel);
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
   }
 
   uint64_t getAllocaSizeInBytes(const AllocaInst &AI) const {
@@ -658,12 +661,12 @@ struct AddressSanitizer {
                                  Value *SizeArgument, uint32_t Exp);
   void instrumentMemIntrinsic(MemIntrinsic *MI);
   Value *memToShadow(Value *Shadow, IRBuilder<> &IRB);
+  bool runOnFunction(Function &F) override;
   bool maybeInsertAsanInitAtFunctionEntry(Function &F);
   void maybeInsertDynamicShadowAtFunctionEntry(Function &F);
   void markEscapedLocalAllocas(Function &F);
-
-  /// Return true if the function changed.
-  bool instrument(Function &F, const TargetLibraryInfo *TLI);
+  bool doInitialization(Module &M) override;
+  bool doFinalization(Module &M) override;
 
   DominatorTree &getDominatorTree() const { return *DT; }
 
@@ -721,12 +724,16 @@ private:
   DenseMap<const AllocaInst *, bool> ProcessedAllocas;
 };
 
-class AddressSanitizerModule {
+class AddressSanitizerModule : public ModulePass {
 public:
+  // Pass identification, replacement for typeid
+  static char ID;
+
   explicit AddressSanitizerModule(bool CompileKernel = false,
                                   bool Recover = false,
                                   bool UseGlobalsGC = true)
-      : UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC),
+      : ModulePass(ID),
+        UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC),
         // Not a typo: ClWithComdat is almost completely pointless without
         // ClUseGlobalsGC (because then it only works on modules without
         // globals, which are rare); it is a prerequisite for ClUseGlobalsGC;
@@ -735,12 +742,14 @@ public:
         // ClWithComdat and ClUseGlobalsGC unless the frontend says it's ok to
         // do globals-gc.
         UseCtorComdat(UseGlobalsGC && ClWithComdat) {
-    this->Recover = ClRecover.getNumOccurrences() > 0 ? ClRecover : Recover;
-    this->CompileKernel =
-        ClEnableKasan.getNumOccurrences() > 0 ? ClEnableKasan : CompileKernel;
-  }
+          this->Recover = ClRecover.getNumOccurrences() > 0 ?
+              ClRecover : Recover;
+          this->CompileKernel = ClEnableKasan.getNumOccurrences() > 0 ?
+              ClEnableKasan : CompileKernel;
+	}
 
-  bool instrument(Module &M);
+  bool runOnModule(Module &M) override;
+  StringRef getPassName() const override { return "AddressSanitizerModule"; }
 
 private:
   void initializeCallbacks(Module &M);
@@ -1048,102 +1057,18 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
                      Instruction *ThenTerm, Value *ValueIfFalse);
 };
 
-class AddressSanitizerLegacyPass : public FunctionPass {
-public:
-  static char ID;
-
-  explicit AddressSanitizerLegacyPass(bool CompileKernel = false,
-                                      bool Recover = false,
-                                      bool UseAfterScope = false)
-      : FunctionPass(ID), CompileKernel(CompileKernel), Recover(Recover),
-        UseAfterScope(UseAfterScope) {
-    initializeAddressSanitizerLegacyPassPass(*PassRegistry::getPassRegistry());
-  }
-
-  StringRef getPassName() const override {
-    return "AddressSanitizerFunctionPass";
-  }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<DominatorTreeWrapperPass>();
-    AU.addRequired<TargetLibraryInfoWrapperPass>();
-  }
-
-  bool runOnFunction(Function &F) override {
-    DominatorTree *DTree =
-        &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-    const TargetLibraryInfo *TLI =
-        &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
-    AddressSanitizer Sanitizer(*F.getParent(), DTree, CompileKernel, Recover,
-                               UseAfterScope);
-    return Sanitizer.instrument(F, TLI);
-  }
-
-private:
-  bool CompileKernel;
-  bool Recover;
-  bool UseAfterScope;
-};
-
-class AddressSanitizerModuleLegacyPass : public ModulePass {
-public:
-  static char ID;
-
-  explicit AddressSanitizerModuleLegacyPass(bool CompileKernel = false,
-                                            bool Recover = false,
-                                            bool UseAfterScope = true)
-      : ModulePass(ID), CompileKernel(CompileKernel), Recover(Recover),
-        UseAfterScope(UseAfterScope) {}
-
-  StringRef getPassName() const override { return "AddressSanitizerModule"; }
-
-  bool runOnModule(Module &M) override {
-    AddressSanitizerModule Sanitizer(CompileKernel, Recover, UseAfterScope);
-    return Sanitizer.instrument(M);
-  }
-
-private:
-  bool CompileKernel;
-  bool Recover;
-  bool UseAfterScope;
-};
-
 } // end anonymous namespace
 
-AddressSanitizerPass::AddressSanitizerPass(bool CompileKernel, bool Recover,
-                                           bool UseAfterScope)
-    : CompileKernel(CompileKernel), Recover(Recover),
-      UseAfterScope(UseAfterScope) {}
-
-PreservedAnalyses AddressSanitizerPass::run(Function &F,
-                                            AnalysisManager<Function> &AM) {
-  DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
-  const TargetLibraryInfo *TLI = &AM.getResult<TargetLibraryAnalysis>(F);
-  AddressSanitizer Sanitizer(*F.getParent(), DT, CompileKernel, Recover,
-                             UseAfterScope);
-  if (Sanitizer.instrument(F, TLI))
-    return PreservedAnalyses::none();
-  return PreservedAnalyses::all();
-}
-
-PreservedAnalyses AddressSanitizerPass::run(Module &M,
-                                            AnalysisManager<Module> &AM) {
-  AddressSanitizerModule Sanitizer(CompileKernel, Recover, UseAfterScope);
-  if (Sanitizer.instrument(M))
-    return PreservedAnalyses::none();
-  return PreservedAnalyses::all();
-}
-
-char AddressSanitizerLegacyPass::ID = 0;
+char AddressSanitizer::ID = 0;
 
 INITIALIZE_PASS_BEGIN(
-    AddressSanitizerLegacyPass, "asan",
+    AddressSanitizer, "asan",
     "AddressSanitizer: detects use-after-free and out-of-bounds bugs.", false,
     false)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_END(
-    AddressSanitizerLegacyPass, "asan",
+    AddressSanitizer, "asan",
     "AddressSanitizer: detects use-after-free and out-of-bounds bugs.", false,
     false)
 
@@ -1151,13 +1076,13 @@ FunctionPass *llvm::createAddressSanitizerFunctionPass(bool CompileKernel,
                                                        bool Recover,
                                                        bool UseAfterScope) {
   assert(!CompileKernel || Recover);
-  return new AddressSanitizerLegacyPass(CompileKernel, Recover, UseAfterScope);
+  return new AddressSanitizer(CompileKernel, Recover, UseAfterScope);
 }
 
-char AddressSanitizerModuleLegacyPass::ID = 0;
+char AddressSanitizerModule::ID = 0;
 
 INITIALIZE_PASS(
-    AddressSanitizerModuleLegacyPass, "asan-module",
+    AddressSanitizerModule, "asan-module",
     "AddressSanitizer: detects use-after-free and out-of-bounds bugs."
     "ModulePass",
     false, false)
@@ -1166,8 +1091,7 @@ ModulePass *llvm::createAddressSanitizerModulePass(bool CompileKernel,
                                                    bool Recover,
                                                    bool UseGlobalsGC) {
   assert(!CompileKernel || Recover);
-  return new AddressSanitizerModuleLegacyPass(CompileKernel, Recover,
-                                              UseGlobalsGC);
+  return new AddressSanitizerModule(CompileKernel, Recover, UseGlobalsGC);
 }
 
 static size_t TypeSizeToSizeIndex(uint32_t TypeSize) {
@@ -2331,7 +2255,7 @@ int AddressSanitizerModule::GetAsanVersion(const Module &M) const {
   return Version;
 }
 
-bool AddressSanitizerModule::instrument(Module &M) {
+bool AddressSanitizerModule::runOnModule(Module &M) {
   C = &(M.getContext());
   int LongSize = M.getDataLayout().getPointerSizeInBits();
   IntptrTy = Type::getIntNTy(*C, LongSize);
@@ -2450,6 +2374,25 @@ void AddressSanitizer::initializeCallbacks(Module &M) {
                                            ArrayType::get(IRB.getInt8Ty(), 0));
 }
 
+// virtual
+bool AddressSanitizer::doInitialization(Module &M) {
+  // Initialize the private fields. No one has accessed them before.
+  GlobalsMD.init(M);
+
+  C = &(M.getContext());
+  LongSize = M.getDataLayout().getPointerSizeInBits();
+  IntptrTy = Type::getIntNTy(*C, LongSize);
+  TargetTriple = Triple(M.getTargetTriple());
+
+  Mapping = getShadowMapping(TargetTriple, LongSize, CompileKernel);
+  return true;
+}
+
+bool AddressSanitizer::doFinalization(Module &M) {
+  GlobalsMD.reset();
+  return false;
+}
+
 bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) {
   // For each NSObject descendant having a +load method, this method is invoked
   // by the ObjC runtime before any of the static constructors is called.
@@ -2523,7 +2466,7 @@ void AddressSanitizer::markEscapedLocalAllocas(Function &F) {
   }
 }
 
-bool AddressSanitizer::instrument(Function &F, const TargetLibraryInfo *TLI) {
+bool AddressSanitizer::runOnFunction(Function &F) {
   if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage) return false;
   if (!ClDebugFunc.empty() && ClDebugFunc == F.getName()) return false;
   if (F.getName().startswith("__asan_")) return false;
@@ -2542,6 +2485,7 @@ bool AddressSanitizer::instrument(Function &F, const TargetLibraryInfo *TLI) {
   LLVM_DEBUG(dbgs() << "ASAN instrumenting:\n" << F << "\n");
 
   initializeCallbacks(*F.getParent());
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
 
   FunctionStateRAII CleanupObj(this);
 
@@ -2562,6 +2506,8 @@ bool AddressSanitizer::instrument(Function &F, const TargetLibraryInfo *TLI) {
   bool IsWrite;
   unsigned Alignment;
   uint64_t TypeSize;
+  const TargetLibraryInfo *TLI =
+      &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
 
   // Fill the set of memory operations to instrument.
   for (auto &BB : F) {
diff --git a/lib/Transforms/Instrumentation/Instrumentation.cpp b/lib/Transforms/Instrumentation/Instrumentation.cpp
index 55b449ffca1..16976ef90ce 100644
--- a/lib/Transforms/Instrumentation/Instrumentation.cpp
+++ b/lib/Transforms/Instrumentation/Instrumentation.cpp
@@ -88,8 +88,8 @@ Comdat *llvm::GetOrCreateFunctionComdat(Function &F,
 /// initializeInstrumentation - Initialize all passes in the TransformUtils
 /// library.
 void llvm::initializeInstrumentation(PassRegistry &Registry) {
-  initializeAddressSanitizerLegacyPassPass(Registry);
-  initializeAddressSanitizerModuleLegacyPassPass(Registry);
+  initializeAddressSanitizerPass(Registry);
+  initializeAddressSanitizerModulePass(Registry);
   initializeBoundsCheckingLegacyPassPass(Registry);
   initializeControlHeightReductionLegacyPassPass(Registry);
   initializeGCOVProfilerLegacyPassPass(Registry);
diff --git a/test/Instrumentation/AddressSanitizer/basic.ll b/test/Instrumentation/AddressSanitizer/basic.ll
index be80a89392c..099965348eb 100644
--- a/test/Instrumentation/AddressSanitizer/basic.ll
+++ b/test/Instrumentation/AddressSanitizer/basic.ll
@@ -1,9 +1,7 @@
 ; Test basic address sanitizer instrumentation.
 ;
 ; RUN: opt < %s -asan -asan-module -S | FileCheck --check-prefixes=CHECK,CHECK-S3 %s
-; RUN: opt < %s -passes='function(asan),module(asan)' -S | FileCheck --check-prefixes=CHECK,CHECK-S3 %s
 ; RUN: opt < %s -asan -asan-module -asan-mapping-scale=5 -S | FileCheck --check-prefixes=CHECK,CHECK-S5 %s
-; RUN: opt < %s -passes='function(asan),module(asan)' -asan-mapping-scale=5 -S | FileCheck --check-prefixes=CHECK,CHECK-S5 %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-unknown-linux-gnu"
-- 
GitLab


From 0cbc52e6d48994222bb8277964174feacda3ff8b Mon Sep 17 00:00:00 2001
From: Ryan Prichard <rprichard@google.com>
Date: Fri, 26 Oct 2018 23:01:54 +0000
Subject: [PATCH 0658/1116] [llvm-readobj] Fix bugs with unrecognized types in
 switch statements

Summary:
Add missing breaks. Several functions used nested switch statements,
where the outer switch branches based on the architecture, and the inner
switch handles architecture-specific types. If the type isn't
architecture-specific, break out to the generic types rather than fall
through.

getElfPtType: For GNU-style output, llvm-readobj prints
"<unknown>: 0xnnnnnnnn" for an unrecognized segment type, unless the
architecture is EM_ARM, EM_MIPS, or EM_MIPS_RS3_LE, in which case it
prints "". This behavior appears accidental, so instead, always print
the "<unknown>: 0xnnnnnnnn" string.

Reviewers: pcc, grimar

Reviewed By: grimar

Subscribers: sdardis, javed.absar, arichardson, kristof.beyls, atanasyan, llvm-commits

Differential Revision: https://reviews.llvm.org/D53730

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345436 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-readobj/ELFDumper.cpp | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/tools/llvm-readobj/ELFDumper.cpp b/tools/llvm-readobj/ELFDumper.cpp
index 5e8a35f13a1..a1cf0aef1b4 100644
--- a/tools/llvm-readobj/ELFDumper.cpp
+++ b/tools/llvm-readobj/ELFDumper.cpp
@@ -1194,6 +1194,7 @@ static const char *getElfSegmentType(unsigned Arch, unsigned Type) {
     switch (Type) {
     LLVM_READOBJ_ENUM_CASE(ELF, PT_ARM_EXIDX);
     }
+    break;
   case ELF::EM_MIPS:
   case ELF::EM_MIPS_RS3_LE:
     switch (Type) {
@@ -1202,6 +1203,7 @@ static const char *getElfSegmentType(unsigned Arch, unsigned Type) {
     LLVM_READOBJ_ENUM_CASE(ELF, PT_MIPS_OPTIONS);
     LLVM_READOBJ_ENUM_CASE(ELF, PT_MIPS_ABIFLAGS);
     }
+    break;
   }
 
   switch (Type) {
@@ -1248,7 +1250,7 @@ static std::string getElfPtType(unsigned Arch, unsigned Type) {
     case ELF::EM_ARM:
       if (Type == ELF::PT_ARM_EXIDX)
         return "EXIDX";
-      return "";
+      break;
     case ELF::EM_MIPS:
     case ELF::EM_MIPS_RS3_LE:
       switch (Type) {
@@ -1261,7 +1263,7 @@ static std::string getElfPtType(unsigned Arch, unsigned Type) {
       case PT_MIPS_ABIFLAGS:
         return "ABIFLAGS";
       }
-      return "";
+      break;
     }
   }
   return std::string("<unknown>: ") + to_string(format_hex(Type, 1));
@@ -1638,29 +1640,32 @@ static const char *getTypeString(unsigned Arch, uint64_t Type) {
   case EM_HEXAGON:
     switch (Type) {
 #define HEXAGON_DYNAMIC_TAG(name, value)                                       \
-  case DT_##name:                                                              \
-    return #name;
+    case DT_##name:                                                            \
+      return #name;
 #include "llvm/BinaryFormat/DynamicTags.def"
 #undef HEXAGON_DYNAMIC_TAG
     }
+    break;
 
   case EM_MIPS:
     switch (Type) {
 #define MIPS_DYNAMIC_TAG(name, value)                                          \
-  case DT_##name:                                                              \
-    return #name;
+    case DT_##name:                                                            \
+      return #name;
 #include "llvm/BinaryFormat/DynamicTags.def"
 #undef MIPS_DYNAMIC_TAG
     }
+    break;
 
-    case EM_PPC64:
-      switch(Type) {
+  case EM_PPC64:
+    switch(Type) {
 #define PPC64_DYNAMIC_TAG(name, value)                                         \
     case DT_##name:                                                            \
       return #name;
 #include "llvm/BinaryFormat/DynamicTags.def"
 #undef PPC64_DYNAMIC_TAG
     }
+    break;
   }
 #undef DYNAMIC_TAG
   switch (Type) {
@@ -2829,11 +2834,13 @@ std::string getSectionTypeString(unsigned Arch, unsigned Type) {
     case SHT_ARM_OVERLAYSECTION:
       return "ARM_OVERLAYSECTION";
     }
+    break;
   case EM_X86_64:
     switch (Type) {
     case SHT_X86_64_UNWIND:
       return "X86_64_UNWIND";
     }
+    break;
   case EM_MIPS:
   case EM_MIPS_RS3_LE:
     switch (Type) {
@@ -2846,6 +2853,7 @@ std::string getSectionTypeString(unsigned Arch, unsigned Type) {
     case SHT_MIPS_DWARF:
       return "SHT_MIPS_DWARF";
     }
+    break;
   }
   switch (Type) {
   case SHT_NULL:
-- 
GitLab


From 00233291cf25baa54e5d058610e8a985deaac9e4 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Fri, 26 Oct 2018 23:06:28 +0000
Subject: [PATCH 0659/1116] [x86] adjust tests to preserve behavior; NFC

I'm planning a binop optimization that would subvert the
domain forcing ops in these tests, so turning them into
zexts.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345437 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/stack-folding-int-avx2.ll    | 12 +++++-----
 test/CodeGen/X86/stack-folding-int-avx512.ll  | 24 +++++++++----------
 .../CodeGen/X86/stack-folding-int-avx512vl.ll | 12 +++++-----
 3 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/test/CodeGen/X86/stack-folding-int-avx2.ll b/test/CodeGen/X86/stack-folding-int-avx2.ll
index 061a8c971f9..9335acb90c0 100644
--- a/test/CodeGen/X86/stack-folding-int-avx2.ll
+++ b/test/CodeGen/X86/stack-folding-int-avx2.ll
@@ -38,14 +38,14 @@ define <8 x float> @stack_fold_broadcastss_ymm(<4 x float> %a0) {
   ret <8 x float> %3
 }
 
-define <4 x i32> @stack_fold_extracti128(<8 x i32> %a0, <8 x i32> %a1) {
+define <4 x i32> @stack_fold_extracti128(<8 x i16> %a0, <8 x i32> %a1) {
   ;CHECK-LABEL: stack_fold_extracti128
   ;CHECK:       vextracti128 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
-  ; add forces execution domain
-  %1 = add <8 x i32> %a0, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-  %2 = shufflevector <8 x i32> %1, <8 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  ret <4 x i32> %2
+  ; zext forces execution domain
+  %t1 = zext <8 x i16> %a0 to <8 x i32>
+  %t2 = shufflevector <8 x i32> %t1, <8 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %t3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  ret <4 x i32> %t2
 }
 
 define <8 x i32> @stack_fold_inserti128(<4 x i32> %a0, <4 x i32> %a1) {
diff --git a/test/CodeGen/X86/stack-folding-int-avx512.ll b/test/CodeGen/X86/stack-folding-int-avx512.ll
index 9e6abf6cf5d..01ae7ff6d43 100644
--- a/test/CodeGen/X86/stack-folding-int-avx512.ll
+++ b/test/CodeGen/X86/stack-folding-int-avx512.ll
@@ -154,41 +154,41 @@ define <32 x i16> @stack_fold_pavgw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %m
   ret <32 x i16> %9
 }
 
-define <4 x i32> @stack_fold_extracti32x4(<16 x i32> %a0, <16 x i32> %a1) {
+define <4 x i32> @stack_fold_extracti32x4(<16 x i16> %a0, <16 x i32> %a1) {
   ;CHECK-LABEL: stack_fold_extracti32x4
   ;CHECK:       vextracti32x4 $3, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
-  ; add forces execution domain
-  %1 = add <16 x i32> %a0, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  ; zext forces execution domain
+  %1 = zext <16 x i16> %a0 to <16 x i32>
   %2 = shufflevector <16 x i32> %1, <16 x i32> %a1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
   %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   ret <4 x i32> %2
 }
 
-define <2 x i64> @stack_fold_extracti64x2(<8 x i64> %a0, <8 x i64> %a1) {
+define <2 x i64> @stack_fold_extracti64x2(<8 x i32> %a0, <8 x i64> %a1) {
   ;CHECK-LABEL: stack_fold_extracti64x2
   ;CHECK:       vextracti32x4 $3, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
-  ; add forces execution domain
-  %1 = add <8 x i64> %a0, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  ; zext forces execution domain
+  %1 = zext <8 x i32> %a0 to <8 x i64>
   %2 = shufflevector <8 x i64> %1, <8 x i64> %a1, <2 x i32> <i32 6, i32 7>
   %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   ret <2 x i64> %2
 }
 
-define <8 x i32> @stack_fold_extracti32x8(<16 x i32> %a0, <16 x i32> %a1) {
+define <8 x i32> @stack_fold_extracti32x8(<16 x i16> %a0, <16 x i32> %a1) {
   ;CHECK-LABEL: stack_fold_extracti32x8
   ;CHECK:       vextracti64x4 $1, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 32-byte Folded Spill
-  ; add forces execution domain
-  %1 = add <16 x i32> %a0, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  ; zext forces execution domain
+  %1 = zext <16 x i16> %a0 to <16 x i32>
   %2 = shufflevector <16 x i32> %1, <16 x i32> %a1, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   ret <8 x i32> %2
 }
 
-define <4 x i64> @stack_fold_extracti64x4(<8 x i64> %a0, <8 x i64> %a1) {
+define <4 x i64> @stack_fold_extracti64x4(<8 x i32> %a0, <8 x i64> %a1) {
   ;CHECK-LABEL: stack_fold_extracti64x4
   ;CHECK:       vextracti64x4 $1, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 32-byte Folded Spill
-  ; add forces execution domain
-  %1 = add <8 x i64> %a0, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  ; zext forces execution domain
+  %1 = zext <8 x i32> %a0 to <8 x i64>
   %2 = shufflevector <8 x i64> %1, <8 x i64> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   ret <4 x i64> %2
diff --git a/test/CodeGen/X86/stack-folding-int-avx512vl.ll b/test/CodeGen/X86/stack-folding-int-avx512vl.ll
index 76542f4761b..8d8676f0f9e 100644
--- a/test/CodeGen/X86/stack-folding-int-avx512vl.ll
+++ b/test/CodeGen/X86/stack-folding-int-avx512vl.ll
@@ -133,21 +133,21 @@ define <4 x i64> @stack_fold_vpconflictq_ymm(<4 x i64> %a0) {
 }
 declare <4 x i64> @llvm.x86.avx512.mask.conflict.q.256(<4 x i64>, <4 x i64>, i8) nounwind readnone
 
-define <4 x i32> @stack_fold_extracti32x4(<8 x i32> %a0, <8 x i32> %a1) {
+define <4 x i32> @stack_fold_extracti32x4(<8 x i16> %a0, <8 x i32> %a1) {
   ;CHECK-LABEL: stack_fold_extracti32x4
   ;CHECK:       vextracti128 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
-  ; add forces execution domain
-  %1 = add <8 x i32> %a0, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  ; zext forces execution domain
+  %1 = zext <8 x i16> %a0 to <8 x i32>
   %2 = shufflevector <8 x i32> %1, <8 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   ret <4 x i32> %2
 }
 
-define <2 x i64> @stack_fold_extracti64x2(<4 x i64> %a0, <4 x i64> %a1) {
+define <2 x i64> @stack_fold_extracti64x2(<4 x i32> %a0, <4 x i64> %a1) {
   ;CHECK-LABEL: stack_fold_extracti64x2
   ;CHECK:       vextracti128 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
-  ; add forces execution domain
-  %1 = add <4 x i64> %a0, <i64 1, i64 1, i64 1, i64 1>
+  ; zext forces execution domain
+  %1 = zext <4 x i32> %a0 to <4 x i64>
   %2 = shufflevector <4 x i64> %1, <4 x i64> %a1, <2 x i32> <i32 2, i32 3>
   %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   ret <2 x i64> %2
-- 
GitLab


From 3875fd1877c36e47283fbee64868cf56866bf190 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Fri, 26 Oct 2018 23:50:23 +0000
Subject: [PATCH 0660/1116] Further split cpus test

On GreenDragon, CodeGen/X86/cpus-no-x86_64.ll was still timing out even
after breaking up the original test. I further split off the intel and
AMD cpus which hopefully resolves this.

http://green.lab.llvm.org/green/job/clang-stage2-cmake-RgSan/

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345438 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/cpus-amd-no-x86_64.ll   | 17 ++++++++++++++++
 test/CodeGen/X86/cpus-intel-no-x86_64.ll | 24 +++++++++++++++++++++++
 test/CodeGen/X86/cpus-no-x86_64.ll       | 25 ------------------------
 3 files changed, 41 insertions(+), 25 deletions(-)
 create mode 100644 test/CodeGen/X86/cpus-amd-no-x86_64.ll
 create mode 100644 test/CodeGen/X86/cpus-intel-no-x86_64.ll

diff --git a/test/CodeGen/X86/cpus-amd-no-x86_64.ll b/test/CodeGen/X86/cpus-amd-no-x86_64.ll
new file mode 100644
index 00000000000..0dadc599abd
--- /dev/null
+++ b/test/CodeGen/X86/cpus-amd-no-x86_64.ll
@@ -0,0 +1,17 @@
+; Check that we reject 64-bit mode on 32-bit only CPUs.
+; CHECK-NO-ERROR-NOT: not a recognized processor for this target
+; CHECK-ERROR64: LLVM ERROR: 64-bit code requested on a subtarget that doesn't support it!
+
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=k6 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=k6-2 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=k6-3 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=athlon 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=athlon-tbird 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=athlon-4 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=athlon-xp 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=athlon-mp 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=geode 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+
+define void @foo() {
+  ret void
+}
diff --git a/test/CodeGen/X86/cpus-intel-no-x86_64.ll b/test/CodeGen/X86/cpus-intel-no-x86_64.ll
new file mode 100644
index 00000000000..d28ac9a83fd
--- /dev/null
+++ b/test/CodeGen/X86/cpus-intel-no-x86_64.ll
@@ -0,0 +1,24 @@
+; Check that we reject 64-bit mode on 32-bit only CPUs.
+; CHECK-NO-ERROR-NOT: not a recognized processor for this target
+; CHECK-ERROR64: LLVM ERROR: 64-bit code requested on a subtarget that doesn't support it!
+
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=i386 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=i486 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=i586 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=pentium 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=pentium-mmx 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=i686 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=pentiumpro 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=pentium2 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=pentium3 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=pentium3m 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=pentium-m 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=pentium4 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=pentium4m 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=yonah 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=prescott 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=lakemont 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+
+define void @foo() {
+  ret void
+}
diff --git a/test/CodeGen/X86/cpus-no-x86_64.ll b/test/CodeGen/X86/cpus-no-x86_64.ll
index de873c81205..e2e00038671 100644
--- a/test/CodeGen/X86/cpus-no-x86_64.ll
+++ b/test/CodeGen/X86/cpus-no-x86_64.ll
@@ -2,31 +2,6 @@
 ; CHECK-NO-ERROR-NOT: not a recognized processor for this target
 ; CHECK-ERROR64: LLVM ERROR: 64-bit code requested on a subtarget that doesn't support it!
 
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=i386 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=i486 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=i586 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=pentium 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=pentium-mmx 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=i686 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=pentiumpro 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=pentium2 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=pentium3 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=pentium3m 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=pentium-m 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=pentium4 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=pentium4m 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=yonah 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=prescott 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=lakemont 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=k6 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=k6-2 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=k6-3 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=athlon 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=athlon-tbird 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=athlon-4 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=athlon-xp 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=athlon-mp 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=geode 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
 ; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=winchip-c6 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
 ; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=winchip2 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
 ; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=c3 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-- 
GitLab


From 3e9dcf4c0bf864c545dc732b13888cda81162eb8 Mon Sep 17 00:00:00 2001
From: Saleem Abdulrasool <compnerd@compnerd.org>
Date: Sat, 27 Oct 2018 00:49:33 +0000
Subject: [PATCH 0661/1116] DebugInfo: reduce DIE range verification on object
 files

Relocatable content may have overlapping ranges until the sections are
finalized.  This reduces the amount of verification that is done on an object
file so that invalid errors are not raised.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345441 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/DebugInfo/DWARF/DWARFVerifier.h  |  7 ++-
 lib/DebugInfo/DWARF/DWARFVerifier.cpp         | 57 ++++++++++++++-----
 .../llvm-dwarfdump/X86/debug-verify-object.s  | 57 +++++++++++++++++++
 3 files changed, 106 insertions(+), 15 deletions(-)
 create mode 100644 test/tools/llvm-dwarfdump/X86/debug-verify-object.s

diff --git a/include/llvm/DebugInfo/DWARF/DWARFVerifier.h b/include/llvm/DebugInfo/DWARF/DWARFVerifier.h
index 3ad65cf51b1..e47fbea5646 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFVerifier.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFVerifier.h
@@ -97,6 +97,9 @@ private:
   /// lies between to valid DIEs.
   std::map<uint64_t, std::set<uint32_t>> ReferenceToDIEOffsets;
   uint32_t NumDebugLineErrors = 0;
+  // Used to relax some checks that do not currently work portably
+  bool IsObjectFile;
+  bool IsMachOObject;
 
   raw_ostream &error() const;
   raw_ostream &warn() const;
@@ -286,8 +289,8 @@ private:
 
 public:
   DWARFVerifier(raw_ostream &S, DWARFContext &D,
-                DIDumpOptions DumpOpts = DIDumpOptions::getForSingleDIE())
-      : OS(S), DCtx(D), DumpOpts(std::move(DumpOpts)) {}
+                DIDumpOptions DumpOpts = DIDumpOptions::getForSingleDIE());
+
   /// Verify the information in any of the following sections, if available:
   /// .debug_abbrev, debug_abbrev.dwo
   ///
diff --git a/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/lib/DebugInfo/DWARF/DWARFVerifier.cpp
index d30600accd0..1f089a7030d 100644
--- a/lib/DebugInfo/DWARF/DWARFVerifier.cpp
+++ b/lib/DebugInfo/DWARF/DWARFVerifier.cpp
@@ -394,20 +394,42 @@ unsigned DWARFVerifier::verifyDieRanges(const DWARFDie &Die,
   // Build RI for this DIE and check that ranges within this DIE do not
   // overlap.
   DieRangeInfo RI(Die);
-  for (auto Range : Ranges) {
-    if (!Range.valid()) {
-      ++NumErrors;
-      error() << "Invalid address range " << Range << "\n";
-      continue;
-    }
 
-    // Verify that ranges don't intersect.
-    const auto IntersectingRange = RI.insert(Range);
-    if (IntersectingRange != RI.Ranges.end()) {
-      ++NumErrors;
-      error() << "DIE has overlapping address ranges: " << Range << " and "
-              << *IntersectingRange << "\n";
-      break;
+  // TODO support object files better
+  //
+  // Some object file formats (i.e. non-MachO) support COMDAT.  ELF in
+  // particular does so by placing each function into a section.  The DWARF data
+  // for the function at that point uses a section relative DW_FORM_addrp for
+  // the DW_AT_low_pc and a DW_FORM_data4 for the offset as the DW_AT_high_pc.
+  // In such a case, when the Die is the CU, the ranges will overlap, and we
+  // will flag valid conflicting ranges as invalid.
+  //
+  // For such targets, we should read the ranges from the CU and partition them
+  // by the section id.  The ranges within a particular section should be
+  // disjoint, although the ranges across sections may overlap.  We would map
+  // the child die to the entity that it references and the section with which
+  // it is associated.  The child would then be checked against the range
+  // information for the associated section.
+  //
+  // For now, simply elide the range verification for the CU DIEs if we are
+  // processing an object file.
+
+  if (!IsObjectFile || IsMachOObject || Die.getTag() == DW_TAG_subprogram) {
+    for (auto Range : Ranges) {
+      if (!Range.valid()) {
+        ++NumErrors;
+        error() << "Invalid address range " << Range << "\n";
+        continue;
+      }
+
+      // Verify that ranges don't intersect.
+      const auto IntersectingRange = RI.insert(Range);
+      if (IntersectingRange != RI.Ranges.end()) {
+        ++NumErrors;
+        error() << "DIE has overlapping address ranges: " << Range << " and "
+                << *IntersectingRange << "\n";
+        break;
+      }
     }
   }
 
@@ -745,6 +767,15 @@ void DWARFVerifier::verifyDebugLineRows() {
   }
 }
 
+DWARFVerifier::DWARFVerifier(raw_ostream &S, DWARFContext &D,
+                             DIDumpOptions DumpOpts)
+    : OS(S), DCtx(D), DumpOpts(std::move(DumpOpts)) {
+  if (const auto *F = DCtx.getDWARFObj().getFile()) {
+    IsObjectFile = F->isRelocatableObject();
+    IsMachOObject = F->isMachO();
+  }
+}
+
 bool DWARFVerifier::handleDebugLine() {
   NumDebugLineErrors = 0;
   OS << "Verifying .debug_line...\n";
diff --git a/test/tools/llvm-dwarfdump/X86/debug-verify-object.s b/test/tools/llvm-dwarfdump/X86/debug-verify-object.s
new file mode 100644
index 00000000000..57570c5b276
--- /dev/null
+++ b/test/tools/llvm-dwarfdump/X86/debug-verify-object.s
@@ -0,0 +1,57 @@
+# RUN: llvm-mc -filetype obj -o - %s | llvm-dwarfdump --verify -
+
+	.text
+
+	.section	.text.f,"ax",@progbits
+	.globl	f
+	.type	f,@function
+f:
+.Lfunc_begin0:
+	pushq	$32
+	popq	%rax
+	retq
+.Lfunc_end0:
+	.size	f, .Lfunc_end0-f
+
+	.section	.text.g,"ax",@progbits
+	.globl	g
+	.type	g,@function
+g:
+.Lfunc_begin1:
+	pushq   $64
+	popq    %rax
+	retq
+.Lfunc_end1:
+	.size	g, .Lfunc_end1-g
+
+	.section	.debug_abbrev,"",@progbits
+	.byte	1                       # Abbreviation Code
+	.byte	17                      # DW_TAG_compile_unit
+	.byte	0                       # DW_CHILDREN_no
+	.byte	17                      # DW_AT_low_pc
+	.byte	1                       # DW_FORM_addr
+	.byte	85                      # DW_AT_ranges
+	.byte	23                      # DW_FORM_sec_offset
+	.byte	0                       # EOM(1)
+	.byte	0                       # EOM(2)
+	.byte	0                       # EOM(3)
+
+	.section	.debug_info,"",@progbits
+.Lcu_begin0:
+	.long	20                      # Length of Unit
+	.short	4                       # DWARF version number
+	.long	.debug_abbrev           # Offset Into Abbrev. Section
+	.byte	8                       # Address Size (in bytes)
+	.byte	1                       # Abbrev [1] 0xb:0x1f DW_TAG_compile_unit
+	.quad	0                       # DW_AT_low_pc
+	.long	.Ldebug_ranges0         # DW_AT_ranges
+
+	.section        .debug_ranges,"",@progbits
+.Ldebug_ranges0:
+	.quad	.Lfunc_begin0
+	.quad	.Lfunc_end0
+	.quad	.Lfunc_begin1
+	.quad	.Lfunc_end1
+	.quad	0
+	.quad	0
+
-- 
GitLab


From 29640ad38b7832c0125917bba9ea2cab43b1793f Mon Sep 17 00:00:00 2001
From: Brendon Cahoon <bcahoon@codeaurora.org>
Date: Sat, 27 Oct 2018 00:50:29 +0000
Subject: [PATCH 0662/1116] [Hexagon] Add missing assignment to Itinerary in
 Call_nr

The class definition for Call_nr has the itinerary as a
parameter, but the value is never assigned to the Itinerary
field for the instruction. This means the compiler is unable
to schedule and packetize the instruction correctly because
these instrution will not have any resource descritions.
I don't have a specific test case, but the ps_call_nr.ll
test failed with a proposed patch.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345442 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/Hexagon/HexagonPseudo.td | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/Target/Hexagon/HexagonPseudo.td b/lib/Target/Hexagon/HexagonPseudo.td
index fd7466349ec..6935e3b7beb 100644
--- a/lib/Target/Hexagon/HexagonPseudo.td
+++ b/lib/Target/Hexagon/HexagonPseudo.td
@@ -208,6 +208,7 @@ class Call_nr<bits<5> nbits, bit isPred, bit isFalse, dag iops,
     let isPredicable = 0;  // !if(isPred, 0, 1);
     let isPredicated = 0;  // isPred;
     let isPredicatedFalse = isFalse;
+    let Itinerary = itin;
 }
 
 def PS_call_nr : Call_nr<24, 0, 0, (ins s32_0Imm:$Ii), J2_call.Itinerary>;
-- 
GitLab


From f20a65307c9af83083d78cf5922b73b5d901c7dd Mon Sep 17 00:00:00 2001
From: Saleem Abdulrasool <compnerd@compnerd.org>
Date: Sat, 27 Oct 2018 02:27:38 +0000
Subject: [PATCH 0663/1116] test: add missing -triple

Ensure that the test builds for x86_64 as it is an assembly test.  This
should repair the buildbots.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345444 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/tools/llvm-dwarfdump/X86/debug-verify-object.s | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/tools/llvm-dwarfdump/X86/debug-verify-object.s b/test/tools/llvm-dwarfdump/X86/debug-verify-object.s
index 57570c5b276..be79c95c0b1 100644
--- a/test/tools/llvm-dwarfdump/X86/debug-verify-object.s
+++ b/test/tools/llvm-dwarfdump/X86/debug-verify-object.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc -filetype obj -o - %s | llvm-dwarfdump --verify -
+# RUN: llvm-mc -triple x86_64-unknown-linux-gnu -filetype obj -o - %s | llvm-dwarfdump --verify -
 
 	.text
 
-- 
GitLab


From 6cf89e3f8481a3eb947ce04f4baf018a8fc93959 Mon Sep 17 00:00:00 2001
From: Alina Sbirlea <asbirlea@google.com>
Date: Sat, 27 Oct 2018 04:51:12 +0000
Subject: [PATCH 0664/1116] Revert r345169 [along with its llvm counterpart
 r345170] as it makes Halide builds timeout.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345447 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/Hexagon/HexagonTargetTransformInfo.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
index 5cfaa42ae5c..79b269bccfe 100644
--- a/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -28,7 +28,7 @@ using namespace llvm;
 
 #define DEBUG_TYPE "hexagontti"
 
-static cl::opt<bool> HexagonAutoHVX("hexagon-autohvx", cl::init(true),
+static cl::opt<bool> HexagonAutoHVX("hexagon-autohvx", cl::init(false),
   cl::Hidden, cl::desc("Enable loop vectorizer for HVX"));
 
 static cl::opt<bool> EmitLookupTables("hexagon-emit-lookup-tables",
-- 
GitLab


From 797a40a2bbcc17235f798745e928465a7d0a2a6a Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Sat, 27 Oct 2018 05:35:20 +0000
Subject: [PATCH 0665/1116] [X86] Add some isel patterns for
 scalar_to_vector/extract_vector_element that use the avx512 extended register
 classes when they are available.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345448 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86InstrVecCompiler.td        |  44 +++++--
 test/CodeGen/X86/pr34653.ll                  | 129 +++++++------------
 test/CodeGen/X86/sse-intrinsics-fast-isel.ll |  16 +--
 3 files changed, 87 insertions(+), 102 deletions(-)

diff --git a/lib/Target/X86/X86InstrVecCompiler.td b/lib/Target/X86/X86InstrVecCompiler.td
index 0d226a3367a..0aeed51dde5 100644
--- a/lib/Target/X86/X86InstrVecCompiler.td
+++ b/lib/Target/X86/X86InstrVecCompiler.td
@@ -16,19 +16,39 @@
 //  Non-instruction patterns
 //===----------------------------------------------------------------------===//
 
-// A vector extract of the first f32/f64 position is a subregister copy
-def : Pat<(f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
-          (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>;
-def : Pat<(f64 (extractelt (v2f64 VR128:$src), (iPTR 0))),
-          (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64)>;
-
-// Implicitly promote a 32-bit scalar to a vector.
-def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
-          (COPY_TO_REGCLASS FR32:$src, VR128)>;
-// Implicitly promote a 64-bit scalar to a vector.
-def : Pat<(v2f64 (scalar_to_vector FR64:$src)),
-          (COPY_TO_REGCLASS FR64:$src, VR128)>;
+let Predicates = [NoAVX512] in {
+  // A vector extract of the first f32/f64 position is a subregister copy
+  def : Pat<(f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
+            (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>;
+  def : Pat<(f64 (extractelt (v2f64 VR128:$src), (iPTR 0))),
+            (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64)>;
+}
+
+let Predicates = [HasAVX512] in {
+  // A vector extract of the first f32/f64 position is a subregister copy
+  def : Pat<(f32 (extractelt (v4f32 VR128X:$src), (iPTR 0))),
+            (COPY_TO_REGCLASS (v4f32 VR128X:$src), FR32X)>;
+  def : Pat<(f64 (extractelt (v2f64 VR128X:$src), (iPTR 0))),
+            (COPY_TO_REGCLASS (v2f64 VR128X:$src), FR64X)>;
+}
 
+let Predicates = [NoVLX] in {
+  // Implicitly promote a 32-bit scalar to a vector.
+  def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
+            (COPY_TO_REGCLASS FR32:$src, VR128)>;
+  // Implicitly promote a 64-bit scalar to a vector.
+  def : Pat<(v2f64 (scalar_to_vector FR64:$src)),
+            (COPY_TO_REGCLASS FR64:$src, VR128)>;
+}
+
+let Predicates = [HasVLX] in {
+  // Implicitly promote a 32-bit scalar to a vector.
+  def : Pat<(v4f32 (scalar_to_vector FR32X:$src)),
+            (COPY_TO_REGCLASS FR32X:$src, VR128X)>;
+  // Implicitly promote a 64-bit scalar to a vector.
+  def : Pat<(v2f64 (scalar_to_vector FR64X:$src)),
+            (COPY_TO_REGCLASS FR64X:$src, VR128X)>;
+}
 
 //===----------------------------------------------------------------------===//
 // Subvector tricks
diff --git a/test/CodeGen/X86/pr34653.ll b/test/CodeGen/X86/pr34653.ll
index 858e0f46c3a..54d2e714635 100644
--- a/test/CodeGen/X86/pr34653.ll
+++ b/test/CodeGen/X86/pr34653.ll
@@ -12,7 +12,7 @@ define void @pr34653() {
 ; CHECK-NEXT:    movq %rsp, %rbp
 ; CHECK-NEXT:    .cfi_def_cfa_register %rbp
 ; CHECK-NEXT:    andq $-512, %rsp # imm = 0xFE00
-; CHECK-NEXT:    subq $2048, %rsp # imm = 0x800
+; CHECK-NEXT:    subq $1536, %rsp # imm = 0x600
 ; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
 ; CHECK-NEXT:    callq test
 ; CHECK-NEXT:    vmovupd {{[0-9]+}}(%rsp), %xmm0
@@ -32,53 +32,48 @@ define void @pr34653() {
 ; CHECK-NEXT:    vextractf32x4 $2, %zmm10, %xmm13
 ; CHECK-NEXT:    vmovaps %xmm13, %xmm14
 ; CHECK-NEXT:    vmovaps %xmm10, %xmm15
-; CHECK-NEXT:    vmovaps %xmm15, %xmm2
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vextractf32x4 $3, %zmm9, %xmm0
-; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovaps %zmm15, %zmm16
+; CHECK-NEXT:    vextractf32x4 $3, %zmm9, %xmm2
+; CHECK-NEXT:    vmovaps %zmm2, %zmm17
 ; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vextractf32x4 $2, %zmm9, %xmm0
+; CHECK-NEXT:    vmovaps %zmm0, %zmm18
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovaps %xmm9, %xmm0
+; CHECK-NEXT:    vmovaps %zmm0, %zmm19
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vextractf32x4 $3, %zmm8, %xmm0
+; CHECK-NEXT:    vmovaps %zmm0, %zmm20
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vextractf32x4 $2, %zmm8, %xmm0
+; CHECK-NEXT:    vmovaps %zmm0, %zmm21
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovaps %xmm8, %xmm0
+; CHECK-NEXT:    vmovaps %zmm0, %zmm22
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vextractf32x4 $3, %zmm7, %xmm0
+; CHECK-NEXT:    vmovaps %zmm0, %zmm23
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vextractf32x4 $2, %zmm7, %xmm0
+; CHECK-NEXT:    vmovaps %zmm0, %zmm24
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovaps %xmm7, %xmm0
-; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovaps %zmm0, %zmm25
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm11 = xmm11[1,0]
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm13 = xmm13[1,0]
 ; CHECK-NEXT:    # kill: def $ymm10 killed $ymm10 killed $zmm10
 ; CHECK-NEXT:    vextractf128 $1, %ymm10, %xmm10
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovaps %xmm10, %xmm0
+; CHECK-NEXT:    vmovaps %zmm10, %zmm26
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm15 = xmm15[1,0]
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; CHECK-NEXT:    # kill: def $ymm9 killed $ymm9 killed $zmm9
 ; CHECK-NEXT:    vextractf128 $1, %ymm9, %xmm9
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovaps %xmm9, %xmm0
+; CHECK-NEXT:    vmovaps %zmm9, %zmm27
 ; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -90,8 +85,7 @@ define void @pr34653() {
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; CHECK-NEXT:    # kill: def $ymm8 killed $ymm8 killed $zmm8
 ; CHECK-NEXT:    vextractf128 $1, %ymm8, %xmm8
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovaps %xmm8, %xmm0
+; CHECK-NEXT:    vmovaps %zmm8, %zmm28
 ; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -103,8 +97,7 @@ define void @pr34653() {
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; CHECK-NEXT:    # kill: def $ymm7 killed $ymm7 killed $zmm7
 ; CHECK-NEXT:    vextractf128 $1, %ymm7, %xmm7
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovaps %xmm7, %xmm0
+; CHECK-NEXT:    vmovaps %zmm7, %zmm29
 ; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -112,54 +105,10 @@ define void @pr34653() {
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm9 = xmm9[1,0]
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm8 = xmm8[1,0]
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm7 = xmm7[1,0]
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero
+; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm30 # 8-byte Reload
+; CHECK-NEXT:    # xmm30 = mem[0],zero
+; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 8-byte Reload
+; CHECK-NEXT:    # xmm31 = mem[0],zero
 ; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
@@ -182,21 +131,37 @@ define void @pr34653() {
 ; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
 ; CHECK-NEXT:    vmovsd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovsd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovsd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovsd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovsd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovsd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovsd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm7, (%rsp) # 8-byte Spill
 ; CHECK-NEXT:    movq %rbp, %rsp
 ; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    .cfi_def_cfa %rsp, 8
diff --git a/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
index 76623a2be22..2441a4cf40a 100644
--- a/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
@@ -2055,16 +2055,16 @@ define <4 x float> @test_mm_set_ps(float %a0, float %a1, float %a2, float %a3) n
 ; X86-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
 ; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x0c]
 ; X86-AVX512-NEXT:    # xmm1 = mem[0],zero,zero,zero
-; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x54,0x24,0x08]
-; X86-AVX512-NEXT:    # xmm2 = mem[0],zero,zero,zero
-; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x5c,0x24,0x04]
-; X86-AVX512-NEXT:    # xmm3 = mem[0],zero,zero,zero
 ; X86-AVX512-NEXT:    vinsertps $16, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x10]
 ; X86-AVX512-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; X86-AVX512-NEXT:    vinsertps $32, %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc2,0x20]
-; X86-AVX512-NEXT:    # xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
-; X86-AVX512-NEXT:    vinsertps $48, %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc3,0x30]
-; X86-AVX512-NEXT:    # xmm0 = xmm0[0,1,2],xmm3[0]
+; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x08]
+; X86-AVX512-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; X86-AVX512-NEXT:    vinsertps $32, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x20]
+; X86-AVX512-NEXT:    # xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
+; X86-AVX512-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; X86-AVX512-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
+; X86-AVX512-NEXT:    # xmm0 = xmm0[0,1,2],xmm1[0]
 ; X86-AVX512-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-SSE-LABEL: test_mm_set_ps:
-- 
GitLab


From 18412a68d2c58ad41afd8f98fd8266c34de08373 Mon Sep 17 00:00:00 2001
From: Sanjin Sijaric <ssijaric@codeaurora.org>
Date: Sat, 27 Oct 2018 06:13:06 +0000
Subject: [PATCH 0666/1116] [ARM64][Windows] MCLayer support for exception
 handling

Add ARM64 unwind codes to MCLayer, as well SEH directives that will be emitted
by the frame lowering patch to follow.  We only emit unwind codes into object
object files for now.

Differential Revision: https://reviews.llvm.org/D50166


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345450 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/MC/MCStreamer.h                  |  21 +-
 include/llvm/MC/MCWin64EH.h                   |   8 +
 include/llvm/MC/MCWinEH.h                     |   9 +-
 include/llvm/Support/Win64EH.h                |  19 +-
 lib/CodeGen/AsmPrinter/WinException.cpp       |  26 +-
 lib/CodeGen/AsmPrinter/WinException.h         |   5 +-
 lib/MC/MCAsmStreamer.cpp                      |   5 +
 lib/MC/MCStreamer.cpp                         |  11 +
 lib/MC/MCWin64EH.cpp                          | 347 ++++++++++++++++++
 lib/Target/AArch64/AArch64AsmPrinter.cpp      |  97 +++++
 lib/Target/AArch64/AArch64InstrInfo.td        |  22 ++
 .../MCTargetDesc/AArch64ELFStreamer.cpp       |  24 +-
 .../AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp |   1 +
 .../MCTargetDesc/AArch64TargetStreamer.cpp    |  15 +
 .../MCTargetDesc/AArch64TargetStreamer.h      |  69 ++++
 .../MCTargetDesc/AArch64WinCOFFStreamer.cpp   | 155 ++++++++
 .../MCTargetDesc/AArch64WinCOFFStreamer.h     |  14 -
 test/CodeGen/AArch64/wineh1.mir               | 120 ++++++
 test/CodeGen/AArch64/wineh2.mir               | 185 ++++++++++
 test/CodeGen/AArch64/wineh3.mir               | 171 +++++++++
 test/CodeGen/AArch64/wineh4.mir               | 228 ++++++++++++
 test/CodeGen/AArch64/wineh5.mir               | 224 +++++++++++
 test/CodeGen/AArch64/wineh6.mir               | 138 +++++++
 test/CodeGen/AArch64/wineh7.mir               | 134 +++++++
 test/CodeGen/AArch64/wineh_shrinkwrap.mir     | 146 ++++++++
 25 files changed, 2139 insertions(+), 55 deletions(-)
 create mode 100644 test/CodeGen/AArch64/wineh1.mir
 create mode 100644 test/CodeGen/AArch64/wineh2.mir
 create mode 100644 test/CodeGen/AArch64/wineh3.mir
 create mode 100644 test/CodeGen/AArch64/wineh4.mir
 create mode 100644 test/CodeGen/AArch64/wineh5.mir
 create mode 100644 test/CodeGen/AArch64/wineh6.mir
 create mode 100644 test/CodeGen/AArch64/wineh7.mir
 create mode 100644 test/CodeGen/AArch64/wineh_shrinkwrap.mir

diff --git a/include/llvm/MC/MCStreamer.h b/include/llvm/MC/MCStreamer.h
index d66a89f76a7..edf0a72d9c1 100644
--- a/include/llvm/MC/MCStreamer.h
+++ b/include/llvm/MC/MCStreamer.h
@@ -198,10 +198,6 @@ class MCStreamer {
 
   WinEH::FrameInfo *CurrentWinFrameInfo;
 
-  /// Retreive the current frame info if one is available and it is not yet
-  /// closed. Otherwise, issue an error and return null.
-  WinEH::FrameInfo *EnsureValidWinFrameInfo(SMLoc Loc);
-
   /// Tracks an index to represent the order a symbol was emitted in.
   /// Zero means we did not emit that symbol.
   DenseMap<const MCSymbol *, unsigned> SymbolOrdering;
@@ -224,10 +220,6 @@ protected:
   virtual void EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame);
   virtual void EmitCFIEndProcImpl(MCDwarfFrameInfo &CurFrame);
 
-  /// When emitting an object file, create and emit a real label. When emitting
-  /// textual assembly, this should do nothing to avoid polluting our output.
-  virtual MCSymbol *EmitCFILabel();
-
   WinEH::FrameInfo *getCurrentWinFrameInfo() {
     return CurrentWinFrameInfo;
   }
@@ -266,6 +258,14 @@ public:
     return TargetStreamer.get();
   }
 
+  /// When emitting an object file, create and emit a real label. When emitting
+  /// textual assembly, this should do nothing to avoid polluting our output.
+  virtual MCSymbol *EmitCFILabel();
+
+  /// Retreive the current frame info if one is available and it is not yet
+  /// closed. Otherwise, issue an error and return null.
+  WinEH::FrameInfo *EnsureValidWinFrameInfo(SMLoc Loc);
+
   unsigned getNumFrameInfos() { return DwarfFrameInfos.size(); }
   ArrayRef<MCDwarfFrameInfo> getDwarfFrameInfos() const {
     return DwarfFrameInfos;
@@ -899,6 +899,11 @@ public:
 
   virtual void EmitWinCFIStartProc(const MCSymbol *Symbol, SMLoc Loc = SMLoc());
   virtual void EmitWinCFIEndProc(SMLoc Loc = SMLoc());
+  /// This is used on platforms, such as Windows on ARM64, that require function
+  /// or funclet sizes to be emitted in .xdata before the End marker is emitted
+  /// for the frame.  We cannot use the End marker, as it is not set at the
+  /// point of emitting .xdata, in order to indicate that the frame is active.
+  virtual void EmitWinCFIFuncletOrFuncEnd(SMLoc Loc = SMLoc());
   virtual void EmitWinCFIStartChained(SMLoc Loc = SMLoc());
   virtual void EmitWinCFIEndChained(SMLoc Loc = SMLoc());
   virtual void EmitWinCFIPushReg(unsigned Register, SMLoc Loc = SMLoc());
diff --git a/include/llvm/MC/MCWin64EH.h b/include/llvm/MC/MCWin64EH.h
index 83ea738de8c..1a9f6f403d7 100644
--- a/include/llvm/MC/MCWin64EH.h
+++ b/include/llvm/MC/MCWin64EH.h
@@ -56,6 +56,14 @@ public:
   void Emit(MCStreamer &Streamer) const override;
   void EmitUnwindInfo(MCStreamer &Streamer, WinEH::FrameInfo *FI) const override;
 };
+
+class ARM64UnwindEmitter : public WinEH::UnwindEmitter {
+public:
+  void Emit(MCStreamer &Streamer) const override;
+  void EmitUnwindInfo(MCStreamer &Streamer,
+                      WinEH::FrameInfo *FI) const override;
+};
+
 }
 } // end namespace llvm
 
diff --git a/include/llvm/MC/MCWinEH.h b/include/llvm/MC/MCWinEH.h
index 4ca52a6654e..98ef0367a11 100644
--- a/include/llvm/MC/MCWinEH.h
+++ b/include/llvm/MC/MCWinEH.h
@@ -10,6 +10,7 @@
 #ifndef LLVM_MC_MCWINEH_H
 #define LLVM_MC_MCWINEH_H
 
+#include "llvm/ADT/MapVector.h"
 #include <vector>
 
 namespace llvm {
@@ -20,9 +21,9 @@ class MCSymbol;
 namespace WinEH {
 struct Instruction {
   const MCSymbol *Label;
-  const unsigned Offset;
-  const unsigned Register;
-  const unsigned Operation;
+  unsigned Offset;
+  unsigned Register;
+  unsigned Operation;
 
   Instruction(unsigned Op, MCSymbol *L, unsigned Reg, unsigned Off)
     : Label(L), Offset(Off), Register(Reg), Operation(Op) {}
@@ -31,6 +32,7 @@ struct Instruction {
 struct FrameInfo {
   const MCSymbol *Begin = nullptr;
   const MCSymbol *End = nullptr;
+  const MCSymbol *FuncletOrFuncEnd = nullptr;
   const MCSymbol *ExceptionHandler = nullptr;
   const MCSymbol *Function = nullptr;
   const MCSymbol *PrologEnd = nullptr;
@@ -43,6 +45,7 @@ struct FrameInfo {
   int LastFrameInst = -1;
   const FrameInfo *ChainedParent = nullptr;
   std::vector<Instruction> Instructions;
+  MapVector<MCSymbol*, std::vector<Instruction>> EpilogMap;
 
   FrameInfo() = default;
   FrameInfo(const MCSymbol *Function, const MCSymbol *BeginFuncEHLabel)
diff --git a/include/llvm/Support/Win64EH.h b/include/llvm/Support/Win64EH.h
index 928eb906de0..e27bf1b3a1a 100644
--- a/include/llvm/Support/Win64EH.h
+++ b/include/llvm/Support/Win64EH.h
@@ -33,7 +33,24 @@ enum UnwindOpcodes {
   UOP_SaveNonVolBig,
   UOP_SaveXMM128 = 8,
   UOP_SaveXMM128Big,
-  UOP_PushMachFrame
+  UOP_PushMachFrame,
+  // The following set of unwind opcodes is for ARM64.  They are documented at
+  // https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling
+  UOP_AllocMedium,
+  UOP_SaveFPLRX,
+  UOP_SaveFPLR,
+  UOP_SaveReg,
+  UOP_SaveRegX,
+  UOP_SaveRegP,
+  UOP_SaveRegPX,
+  UOP_SaveFReg,
+  UOP_SaveFRegX,
+  UOP_SaveFRegP,
+  UOP_SaveFRegPX,
+  UOP_SetFP,
+  UOP_AddFP,
+  UOP_Nop,
+  UOP_End
 };
 
 /// UnwindCode - This union describes a single operation in a function prolog,
diff --git a/lib/CodeGen/AsmPrinter/WinException.cpp b/lib/CodeGen/AsmPrinter/WinException.cpp
index eff73a58d8d..2a97a2fde43 100644
--- a/lib/CodeGen/AsmPrinter/WinException.cpp
+++ b/lib/CodeGen/AsmPrinter/WinException.cpp
@@ -42,6 +42,7 @@ WinException::WinException(AsmPrinter *A) : EHStreamer(A) {
   // MSVC's EH tables are always composed of 32-bit words.  All known 64-bit
   // platforms use an imagerel32 relocation to refer to symbols.
   useImageRel32 = (A->getDataLayout().getPointerSizeInBits() == 64);
+  isAArch64 = Asm->TM.getTargetTriple().isAArch64();
 }
 
 WinException::~WinException() {}
@@ -242,6 +243,17 @@ void WinException::endFunclet() {
     if (F.hasPersonalityFn())
       Per = classifyEHPersonality(F.getPersonalityFn()->stripPointerCasts());
 
+    // On funclet exit, we emit a fake "function" end marker, so that the call
+    // to EmitWinEHHandlerData below can calculate the size of the funclet or
+    // function.
+    if (isAArch64) {
+      Asm->OutStreamer->SwitchSection(CurrentFuncletTextSection);
+      Asm->OutStreamer->EmitWinCFIFuncletOrFuncEnd();
+      MCSection *XData = Asm->OutStreamer->getAssociatedXDataSection(
+          Asm->OutStreamer->getCurrentSectionOnly());
+      Asm->OutStreamer->SwitchSection(XData);
+    }
+
     // Emit an UNWIND_INFO struct describing the prologue.
     Asm->OutStreamer->EmitWinEHHandlerData();
 
@@ -286,7 +298,10 @@ const MCExpr *WinException::create32bitRef(const GlobalValue *GV) {
   return create32bitRef(Asm->getSymbol(GV));
 }
 
-const MCExpr *WinException::getLabelPlusOne(const MCSymbol *Label) {
+const MCExpr *WinException::getLabel(const MCSymbol *Label) {
+  if (isAArch64)
+    return MCSymbolRefExpr::create(Label, MCSymbolRefExpr::VK_COFF_IMGREL32,
+                                   Asm->OutContext);
   return MCBinaryExpr::createAdd(create32bitRef(Label),
                                  MCConstantExpr::create(1, Asm->OutContext),
                                  Asm->OutContext);
@@ -588,7 +603,6 @@ void WinException::emitSEHActionsForRange(const WinEHFuncInfo &FuncInfo,
                                           const MCSymbol *EndLabel, int State) {
   auto &OS = *Asm->OutStreamer;
   MCContext &Ctx = Asm->OutContext;
-
   bool VerboseAsm = OS.isVerboseAsm();
   auto AddComment = [&](const Twine &Comment) {
     if (VerboseAsm)
@@ -613,9 +627,9 @@ void WinException::emitSEHActionsForRange(const WinEHFuncInfo &FuncInfo,
     }
 
     AddComment("LabelStart");
-    OS.EmitValue(getLabelPlusOne(BeginLabel), 4);
+    OS.EmitValue(getLabel(BeginLabel), 4);
     AddComment("LabelEnd");
-    OS.EmitValue(getLabelPlusOne(EndLabel), 4);
+    OS.EmitValue(getLabel(EndLabel), 4);
     AddComment(UME.IsFinally ? "FinallyFunclet" : UME.Filter ? "FilterFunction"
                                                              : "CatchAll");
     OS.EmitValue(FilterOrFinally, 4);
@@ -799,7 +813,7 @@ void WinException::emitCXXFrameHandler3Table(const MachineFunction *MF) {
       //   TypeDescriptor *Type;
       //   int32_t         CatchObjOffset;
       //   void          (*Handler)();
-      //   int32_t         ParentFrameOffset; // x64 only
+      //   int32_t         ParentFrameOffset; // x64 and AArch64 only
       // };
       OS.EmitLabel(HandlerMapXData);
       for (const WinEHHandlerType &HT : TBME.HandlerArray) {
@@ -901,7 +915,7 @@ void WinException::computeIP2StateTable(
         ChangeLabel = StateChange.PreviousEndLabel;
       // Emit an entry indicating that PCs after 'Label' have this EH state.
       IPToStateTable.push_back(
-          std::make_pair(getLabelPlusOne(ChangeLabel), StateChange.NewState));
+          std::make_pair(getLabel(ChangeLabel), StateChange.NewState));
       // FIXME: assert that NewState is between CatchLow and CatchHigh.
     }
   }
diff --git a/lib/CodeGen/AsmPrinter/WinException.h b/lib/CodeGen/AsmPrinter/WinException.h
index eed3c4453ff..728cde3b250 100644
--- a/lib/CodeGen/AsmPrinter/WinException.h
+++ b/lib/CodeGen/AsmPrinter/WinException.h
@@ -38,6 +38,9 @@ class LLVM_LIBRARY_VISIBILITY WinException : public EHStreamer {
   /// True if this is a 64-bit target and we should use image relative offsets.
   bool useImageRel32 = false;
 
+  /// True if we are generating exception handling on Windows for ARM64.
+  bool isAArch64 = false;
+
   /// Pointer to the current funclet entry BB.
   const MachineBasicBlock *CurrentFuncletEntry = nullptr;
 
@@ -72,7 +75,7 @@ class LLVM_LIBRARY_VISIBILITY WinException : public EHStreamer {
 
   const MCExpr *create32bitRef(const MCSymbol *Value);
   const MCExpr *create32bitRef(const GlobalValue *GV);
-  const MCExpr *getLabelPlusOne(const MCSymbol *Label);
+  const MCExpr *getLabel(const MCSymbol *Label);
   const MCExpr *getOffset(const MCSymbol *OffsetOf, const MCSymbol *OffsetFrom);
   const MCExpr *getOffsetPlusOne(const MCSymbol *OffsetOf,
                                  const MCSymbol *OffsetFrom);
diff --git a/lib/MC/MCAsmStreamer.cpp b/lib/MC/MCAsmStreamer.cpp
index f75a8e077e4..463e9066616 100644
--- a/lib/MC/MCAsmStreamer.cpp
+++ b/lib/MC/MCAsmStreamer.cpp
@@ -289,6 +289,7 @@ public:
 
   void EmitWinCFIStartProc(const MCSymbol *Symbol, SMLoc Loc) override;
   void EmitWinCFIEndProc(SMLoc Loc) override;
+  void EmitWinCFIFuncletOrFuncEnd(SMLoc Loc) override;
   void EmitWinCFIStartChained(SMLoc Loc) override;
   void EmitWinCFIEndChained(SMLoc Loc) override;
   void EmitWinCFIPushReg(unsigned Register, SMLoc Loc) override;
@@ -1589,6 +1590,10 @@ void MCAsmStreamer::EmitWinCFIEndProc(SMLoc Loc) {
   EmitEOL();
 }
 
+// TODO: Implement
+void MCAsmStreamer::EmitWinCFIFuncletOrFuncEnd(SMLoc Loc) {
+}
+
 void MCAsmStreamer::EmitWinCFIStartChained(SMLoc Loc) {
   MCStreamer::EmitWinCFIStartChained(Loc);
 
diff --git a/lib/MC/MCStreamer.cpp b/lib/MC/MCStreamer.cpp
index 1b704b89320..3722c0ad3c8 100644
--- a/lib/MC/MCStreamer.cpp
+++ b/lib/MC/MCStreamer.cpp
@@ -627,6 +627,17 @@ void MCStreamer::EmitWinCFIEndProc(SMLoc Loc) {
   CurFrame->End = Label;
 }
 
+void MCStreamer::EmitWinCFIFuncletOrFuncEnd(SMLoc Loc) {
+  WinEH::FrameInfo *CurFrame = EnsureValidWinFrameInfo(Loc);
+  if (!CurFrame)
+    return;
+  if (CurFrame->ChainedParent)
+    getContext().reportError(Loc, "Not all chained regions terminated!");
+
+  MCSymbol *Label = EmitCFILabel();
+  CurFrame->FuncletOrFuncEnd = Label;
+}
+
 void MCStreamer::EmitWinCFIStartChained(SMLoc Loc) {
   WinEH::FrameInfo *CurFrame = EnsureValidWinFrameInfo(Loc);
   if (!CurFrame)
diff --git a/lib/MC/MCWin64EH.cpp b/lib/MC/MCWin64EH.cpp
index 1407f25e6f2..0c8d58e5972 100644
--- a/lib/MC/MCWin64EH.cpp
+++ b/lib/MC/MCWin64EH.cpp
@@ -11,6 +11,9 @@
 #include "llvm/ADT/Twine.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCSectionCOFF.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/Win64EH.h"
@@ -23,6 +26,8 @@ static uint8_t CountOfUnwindCodes(std::vector<WinEH::Instruction> &Insns) {
   uint8_t Count = 0;
   for (const auto &I : Insns) {
     switch (static_cast<Win64EH::UnwindOpcodes>(I.Operation)) {
+    default:
+      llvm_unreachable("Unsupported unwind code");
     case Win64EH::UOP_PushNonVol:
     case Win64EH::UOP_AllocSmall:
     case Win64EH::UOP_SetFPReg:
@@ -60,6 +65,8 @@ static void EmitUnwindCode(MCStreamer &streamer, const MCSymbol *begin,
   uint16_t w;
   b2 = (inst.Operation & 0x0F);
   switch (static_cast<Win64EH::UnwindOpcodes>(inst.Operation)) {
+  default:
+    llvm_unreachable("Unsupported unwind code");
   case Win64EH::UOP_PushNonVol:
     EmitAbsDifference(streamer, inst.Label, begin);
     b2 |= (inst.Register & 0x0F) << 4;
@@ -242,3 +249,343 @@ void llvm::Win64EH::UnwindEmitter::EmitUnwindInfo(
   ::EmitUnwindInfo(Streamer, info);
 }
 
+static int64_t GetAbsDifference(MCStreamer &Streamer, const MCSymbol *LHS,
+                                const MCSymbol *RHS) {
+  MCContext &Context = Streamer.getContext();
+  const MCExpr *Diff =
+      MCBinaryExpr::createSub(MCSymbolRefExpr::create(LHS, Context),
+                              MCSymbolRefExpr::create(RHS, Context), Context);
+  MCObjectStreamer *OS = (MCObjectStreamer *)(&Streamer);
+  int64_t value;
+  Diff->evaluateAsAbsolute(value, OS->getAssembler());
+  return value;
+}
+
+static uint32_t
+ARM64CountOfUnwindCodes(const std::vector<WinEH::Instruction> &Insns) {
+  uint32_t Count = 0;
+  for (const auto &I : Insns) {
+    switch (static_cast<Win64EH::UnwindOpcodes>(I.Operation)) {
+    default:
+      llvm_unreachable("Unsupported ARM64 unwind code");
+    case Win64EH::UOP_AllocSmall:
+      Count += 1;
+      break;
+    case Win64EH::UOP_AllocMedium:
+      Count += 2;
+      break;
+    case Win64EH::UOP_AllocLarge:
+      Count += 4;
+      break;
+    case Win64EH::UOP_SaveFPLRX:
+      Count += 1;
+      break;
+    case Win64EH::UOP_SaveFPLR:
+      Count += 1;
+      break;
+    case Win64EH::UOP_SaveReg:
+      Count += 2;
+      break;
+    case Win64EH::UOP_SaveRegP:
+      Count += 2;
+      break;
+    case Win64EH::UOP_SaveRegPX:
+      Count += 2;
+      break;
+    case Win64EH::UOP_SaveRegX:
+      Count += 2;
+      break;
+    case Win64EH::UOP_SaveFReg:
+      Count += 2;
+      break;
+    case Win64EH::UOP_SaveFRegP:
+      Count += 2;
+      break;
+    case Win64EH::UOP_SaveFRegX:
+      Count += 2;
+      break;
+    case Win64EH::UOP_SaveFRegPX:
+      Count += 2;
+      break;
+    case Win64EH::UOP_SetFP:
+      Count += 1;
+      break;
+    case Win64EH::UOP_AddFP:
+      Count += 2;
+      break;
+    case Win64EH::UOP_Nop:
+      Count += 1;
+      break;
+    case Win64EH::UOP_End:
+      Count += 1;
+      break;
+    }
+  }
+  return Count;
+}
+
+// Unwind opcode encodings and restrictions are documented at
+// https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling
+static void ARM64EmitUnwindCode(MCStreamer &streamer, const MCSymbol *begin,
+                                WinEH::Instruction &inst) {
+  uint8_t b, reg;
+  switch (static_cast<Win64EH::UnwindOpcodes>(inst.Operation)) {
+  default:
+    llvm_unreachable("Unsupported ARM64 unwind code");
+  case Win64EH::UOP_AllocSmall:
+    b = (inst.Offset >> 4) & 0x1F;
+    streamer.EmitIntValue(b, 1);
+    break;
+  case Win64EH::UOP_AllocMedium: {
+    uint16_t hw = (inst.Offset >> 4) & 0x7FF;
+    b = 0xC0;
+    b |= (hw >> 8);
+    streamer.EmitIntValue(b, 1);
+    b = hw & 0xFF;
+    streamer.EmitIntValue(b, 1);
+    break;
+  }
+  case Win64EH::UOP_AllocLarge: {
+    uint32_t w;
+    b = 0xE0;
+    streamer.EmitIntValue(b, 1);
+    w = inst.Offset >> 4;
+    b = (w & 0x00FF0000) >> 16;
+    streamer.EmitIntValue(b, 1);
+    b = (w & 0x0000FF00) >> 8;
+    streamer.EmitIntValue(b, 1);
+    b = w & 0x000000FF;
+    streamer.EmitIntValue(b, 1);
+    break;
+  }
+  case Win64EH::UOP_SetFP:
+    b = 0xE1;
+    streamer.EmitIntValue(b, 1);
+    break;
+  case Win64EH::UOP_AddFP:
+    b = 0xE2;
+    streamer.EmitIntValue(b, 1);
+    b = (inst.Offset >> 3);
+    streamer.EmitIntValue(b, 1);
+    break;
+  case Win64EH::UOP_Nop:
+    b = 0xE3;
+    streamer.EmitIntValue(b, 1);
+    break;
+  case Win64EH::UOP_SaveFPLRX:
+    b = 0x80;
+    b |= ((inst.Offset - 1) >> 3) & 0x3F;
+    streamer.EmitIntValue(b, 1);
+    break;
+  case Win64EH::UOP_SaveFPLR:
+    b = 0x40;
+    b |= (inst.Offset >> 3) & 0x3F;
+    streamer.EmitIntValue(b, 1);
+    break;
+  case Win64EH::UOP_SaveReg:
+    assert(inst.Register >= 19 && "Saved reg must be >= 19");
+    reg = inst.Register - 19;
+    b = 0xD0 | ((reg & 0xC) >> 2);
+    streamer.EmitIntValue(b, 1);
+    b = ((reg & 0x3) << 6) | (inst.Offset >> 3);
+    streamer.EmitIntValue(b, 1);
+    break;
+  case Win64EH::UOP_SaveRegX:
+    assert(inst.Register >= 19 && "Saved reg must be >= 19");
+    reg = inst.Register - 19;
+    b = 0xD4 | ((reg & 0x8) >> 3);
+    streamer.EmitIntValue(b, 1);
+    b = ((reg & 0x7) << 5) | ((inst.Offset >> 3) - 1);
+    streamer.EmitIntValue(b, 1);
+    break;
+  case Win64EH::UOP_SaveRegP:
+    assert(inst.Register >= 19 && "Saved registers must be >= 19");
+    reg = inst.Register - 19;
+    b = 0xC8 | ((reg & 0xC) >> 2);
+    streamer.EmitIntValue(b, 1);
+    b = ((reg & 0x3) << 6) | (inst.Offset >> 3);
+    streamer.EmitIntValue(b, 1);
+    break;
+  case Win64EH::UOP_SaveRegPX:
+    assert(inst.Register >= 19 && "Saved registers must be >= 19");
+    reg = inst.Register - 19;
+    b = 0xCC | ((reg & 0xC) >> 2);
+    streamer.EmitIntValue(b, 1);
+    b = ((reg & 0x3) << 6) | ((inst.Offset >> 3) - 1);
+    streamer.EmitIntValue(b, 1);
+    break;
+  case Win64EH::UOP_SaveFReg:
+    assert(inst.Register >= 8 && "Saved dreg must be >= 8");
+    reg = inst.Register - 8;
+    b = 0xDC | ((reg & 0x4) >> 2);
+    streamer.EmitIntValue(b, 1);
+    b = ((reg & 0x3) << 6) | (inst.Offset >> 3);
+    streamer.EmitIntValue(b, 1);
+    break;
+  case Win64EH::UOP_SaveFRegX:
+    assert(inst.Register >= 8 && "Saved dreg must be >= 8");
+    reg = inst.Register - 8;
+    b = 0xDE;
+    streamer.EmitIntValue(b, 1);
+    b = ((reg & 0x7) << 5) | ((inst.Offset >> 3) - 1);
+    streamer.EmitIntValue(b, 1);
+    break;
+  case Win64EH::UOP_SaveFRegP:
+    assert(inst.Register >= 8 && "Saved dregs must be >= 8");
+    reg = inst.Register - 8;
+    b = 0xD8 | ((reg & 0x4) >> 2);
+    streamer.EmitIntValue(b, 1);
+    b = ((reg & 0x3) << 6) | (inst.Offset >> 3);
+    streamer.EmitIntValue(b, 1);
+    break;
+  case Win64EH::UOP_SaveFRegPX:
+    assert(inst.Register >= 8 && "Saved dregs must be >= 8");
+    reg = inst.Register - 8;
+    b = 0xDA | ((reg & 0x4) >> 2);
+    streamer.EmitIntValue(b, 1);
+    b = ((reg & 0x3) << 6) | ((inst.Offset >> 3) - 1);
+    streamer.EmitIntValue(b, 1);
+    break;
+  case Win64EH::UOP_End:
+    b = 0xE4;
+    streamer.EmitIntValue(b, 1);
+    break;
+  }
+}
+
+// Populate the .xdata section.  The format of .xdata on ARM64 is documented at
+// https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling
+static void ARM64EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) {
+  // If this UNWIND_INFO already has a symbol, it's already been emitted.
+  if (info->Symbol)
+    return;
+
+  MCContext &context = streamer.getContext();
+  MCSymbol *Label = context.createTempSymbol();
+
+  streamer.EmitValueToAlignment(4);
+  streamer.EmitLabel(Label);
+  info->Symbol = Label;
+
+  uint32_t FuncLength = 0x0;
+  FuncLength = (uint32_t)GetAbsDifference(streamer, info->FuncletOrFuncEnd,
+                                          info->Begin);
+  if (FuncLength)
+    FuncLength /= 4;
+  uint32_t PrologCodeBytes = ARM64CountOfUnwindCodes(info->Instructions);
+  uint32_t TotalCodeBytes = PrologCodeBytes;
+
+  // Process epilogs.
+  MapVector<MCSymbol *, uint32_t> EpilogInfo;
+  for (auto &I : info->EpilogMap) {
+    MCSymbol *EpilogStart = I.first;
+    auto &EpilogInstrs = I.second;
+    uint32_t CodeBytes = ARM64CountOfUnwindCodes(EpilogInstrs);
+    EpilogInfo[EpilogStart] = TotalCodeBytes;
+    TotalCodeBytes += CodeBytes;
+  }
+
+  // Code Words, Epilog count, E, X, Vers, Function Length
+  uint32_t row1 = 0x0;
+  uint8_t CodeWords = TotalCodeBytes / 4;
+  uint8_t CodeWordsMod = TotalCodeBytes % 4;
+  if (CodeWordsMod)
+    CodeWords++;
+  uint32_t EpilogCount = info->EpilogMap.size();
+  bool ExtensionWord = EpilogCount > 31 || TotalCodeBytes > 124;
+  if (!ExtensionWord) {
+    row1 |= (EpilogCount & 0x1F) << 22;
+    row1 |= (CodeWords & 0x1F) << 27;
+  }
+  // E is always 0 right now, TODO: packed epilog setup
+  if (info->HandlesExceptions) // X
+    row1 |= 1 << 20;
+  row1 |= FuncLength & 0x3FFFF;
+  streamer.EmitIntValue(row1, 4);
+
+  // Extended Code Words, Extended Epilog Count
+  if (ExtensionWord) {
+    uint32_t row2 = 0x0;
+    row2 |= (CodeWords & 0xFF) << 16;
+    row2 |= (EpilogCount & 0xFFFF);
+    streamer.EmitIntValue(row2, 4);
+  }
+
+  // Epilog Start Index, Epilog Start Offset
+  for (auto &I : EpilogInfo) {
+    MCSymbol *EpilogStart = I.first;
+    uint32_t EpilogIndex = I.second;
+    uint32_t EpilogOffset =
+        (uint32_t)GetAbsDifference(streamer, EpilogStart, info->Begin);
+    if (EpilogOffset)
+      EpilogOffset /= 4;
+    uint32_t row3 = EpilogOffset;
+    row3 |= (EpilogIndex & 0x3FF) << 22;
+    streamer.EmitIntValue(row3, 4);
+  }
+
+  // Emit prolog unwind instructions (in reverse order).
+  uint8_t numInst = info->Instructions.size();
+  for (uint8_t c = 0; c < numInst; ++c) {
+    WinEH::Instruction inst = info->Instructions.back();
+    info->Instructions.pop_back();
+    ARM64EmitUnwindCode(streamer, info->Begin, inst);
+  }
+
+  // Emit epilog unwind instructions
+  for (auto &I : info->EpilogMap) {
+    auto &EpilogInstrs = I.second;
+    for (uint32_t i = 0; i < EpilogInstrs.size(); i++) {
+      WinEH::Instruction inst = EpilogInstrs[i];
+      ARM64EmitUnwindCode(streamer, info->Begin, inst);
+    }
+  }
+
+  int32_t BytesMod = CodeWords * 4 - TotalCodeBytes;
+  assert(BytesMod >= 0);
+  for (int i = 0; i < BytesMod; i++)
+    streamer.EmitIntValue(0xE3, 1);
+
+  if (info->HandlesExceptions)
+    streamer.EmitValue(
+        MCSymbolRefExpr::create(info->ExceptionHandler,
+                                MCSymbolRefExpr::VK_COFF_IMGREL32, context),
+        4);
+}
+
+static void ARM64EmitRuntimeFunction(MCStreamer &streamer,
+                                     const WinEH::FrameInfo *info) {
+  MCContext &context = streamer.getContext();
+
+  streamer.EmitValueToAlignment(4);
+  EmitSymbolRefWithOfs(streamer, info->Function, info->Begin);
+  streamer.EmitValue(MCSymbolRefExpr::create(info->Symbol,
+                                             MCSymbolRefExpr::VK_COFF_IMGREL32,
+                                             context),
+                     4);
+}
+
+void llvm::Win64EH::ARM64UnwindEmitter::Emit(MCStreamer &Streamer) const {
+  // Emit the unwind info structs first.
+  for (const auto &CFI : Streamer.getWinFrameInfos()) {
+    MCSection *XData = Streamer.getAssociatedXDataSection(CFI->TextSection);
+    Streamer.SwitchSection(XData);
+    ARM64EmitUnwindInfo(Streamer, CFI.get());
+  }
+
+  // Now emit RUNTIME_FUNCTION entries.
+  for (const auto &CFI : Streamer.getWinFrameInfos()) {
+    MCSection *PData = Streamer.getAssociatedPDataSection(CFI->TextSection);
+    Streamer.SwitchSection(PData);
+    ARM64EmitRuntimeFunction(Streamer, CFI.get());
+  }
+}
+
+void llvm::Win64EH::ARM64UnwindEmitter::EmitUnwindInfo(
+    MCStreamer &Streamer, WinEH::FrameInfo *info) const {
+  // Switch sections (the static function above is meant to be called from
+  // here and from Emit().
+  MCSection *XData = Streamer.getAssociatedXDataSection(info->TextSection);
+  Streamer.SwitchSection(XData);
+  ARM64EmitUnwindInfo(Streamer, info);
+}
diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp
index b1375c969d9..1ff0392c0f2 100644
--- a/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -21,6 +21,7 @@
 #include "InstPrinter/AArch64InstPrinter.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
 #include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "MCTargetDesc/AArch64TargetStreamer.h"
 #include "Utils/AArch64BaseInfo.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
@@ -665,6 +666,8 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
     OutStreamer->EmitLabel(LOHLabel);
   }
 
+  AArch64TargetStreamer *TS =
+    static_cast<AArch64TargetStreamer *>(OutStreamer->getTargetStreamer());
   // Do any manual lowerings.
   switch (MI->getOpcode()) {
   default:
@@ -817,6 +820,100 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
   case TargetOpcode::PATCHABLE_TAIL_CALL:
     LowerPATCHABLE_TAIL_CALL(*MI);
     return;
+
+  case AArch64::SEH_StackAlloc:
+    TS->EmitARM64WinCFIAllocStack(MI->getOperand(0).getImm());
+    return;
+
+  case AArch64::SEH_SaveFPLR:
+    TS->EmitARM64WinCFISaveFPLR(MI->getOperand(0).getImm());
+    return;
+
+  case AArch64::SEH_SaveFPLR_X:
+    assert(MI->getOperand(0).getImm() < 0 &&
+           "Pre increment SEH opcode must have a negative offset");
+    TS->EmitARM64WinCFISaveFPLRX(-MI->getOperand(0).getImm());
+    return;
+
+  case AArch64::SEH_SaveReg:
+    TS->EmitARM64WinCFISaveReg(MI->getOperand(0).getImm(),
+                               MI->getOperand(1).getImm());
+    return;
+
+  case AArch64::SEH_SaveReg_X:
+    assert(MI->getOperand(1).getImm() < 0 &&
+           "Pre increment SEH opcode must have a negative offset");
+    TS->EmitARM64WinCFISaveRegX(MI->getOperand(0).getImm(),
+		                -MI->getOperand(1).getImm());
+    return;
+
+  case AArch64::SEH_SaveRegP:
+    assert((MI->getOperand(1).getImm() - MI->getOperand(0).getImm() == 1) &&
+            "Non-consecutive registers not allowed for save_regp");
+    TS->EmitARM64WinCFISaveRegP(MI->getOperand(0).getImm(),
+                                MI->getOperand(2).getImm());
+    return;
+
+  case AArch64::SEH_SaveRegP_X:
+    assert((MI->getOperand(1).getImm() - MI->getOperand(0).getImm() == 1) &&
+            "Non-consecutive registers not allowed for save_regp_x");
+    assert(MI->getOperand(2).getImm() < 0 &&
+           "Pre increment SEH opcode must have a negative offset");
+    TS->EmitARM64WinCFISaveRegPX(MI->getOperand(0).getImm(),
+                                 -MI->getOperand(2).getImm());
+    return;
+
+  case AArch64::SEH_SaveFReg:
+    TS->EmitARM64WinCFISaveFReg(MI->getOperand(0).getImm(),
+                                MI->getOperand(1).getImm());
+    return;
+
+  case AArch64::SEH_SaveFReg_X:
+    assert(MI->getOperand(1).getImm() < 0 &&
+           "Pre increment SEH opcode must have a negative offset");
+    TS->EmitARM64WinCFISaveFRegX(MI->getOperand(0).getImm(),
+                                 -MI->getOperand(1).getImm());
+    return;
+
+  case AArch64::SEH_SaveFRegP:
+    assert((MI->getOperand(1).getImm() - MI->getOperand(0).getImm() == 1) &&
+            "Non-consecutive registers not allowed for save_regp");
+    TS->EmitARM64WinCFISaveFRegP(MI->getOperand(0).getImm(),
+                                 MI->getOperand(2).getImm());
+    return;
+
+  case AArch64::SEH_SaveFRegP_X:
+    assert((MI->getOperand(1).getImm() - MI->getOperand(0).getImm() == 1) &&
+            "Non-consecutive registers not allowed for save_regp_x");
+    assert(MI->getOperand(2).getImm() < 0 &&
+           "Pre increment SEH opcode must have a negative offset");
+    TS->EmitARM64WinCFISaveFRegPX(MI->getOperand(0).getImm(),
+                                  -MI->getOperand(2).getImm());
+    return;
+
+  case AArch64::SEH_SetFP:
+    TS->EmitARM64WinCFISetFP();
+    return;
+
+  case AArch64::SEH_AddFP:
+    TS->EmitARM64WinCFIAddFP(MI->getOperand(0).getImm());
+    return;
+
+  case AArch64::SEH_Nop:
+    TS->EmitARM64WinCFINop();
+    return;
+
+  case AArch64::SEH_PrologEnd:
+    TS->EmitARM64WinCFIPrologEnd();
+    return;
+
+  case AArch64::SEH_EpilogStart:
+    TS->EmitARM64WinCFIEpilogStart();
+    return;
+
+  case AArch64::SEH_EpilogEnd:
+    TS->EmitARM64WinCFIEpilogEnd();
+    return;
   }
 
   // Finally, do the automated lowerings for everything else.
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index 59adec3ce38..77461eccf3e 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -3138,6 +3138,28 @@ def F128CSEL : Pseudo<(outs FPR128:$Rd),
   let hasNoSchedulingInfo = 1;
 }
 
+//===----------------------------------------------------------------------===//
+// Instructions used for emitting unwind opcodes on ARM64 Windows.
+//===----------------------------------------------------------------------===//
+let isPseudo = 1 in {
+  def SEH_StackAlloc : Pseudo<(outs), (ins i32imm:$size), []>, Sched<[]>;
+  def SEH_SaveFPLR : Pseudo<(outs), (ins i32imm:$offs), []>, Sched<[]>;
+  def SEH_SaveFPLR_X : Pseudo<(outs), (ins i32imm:$offs), []>, Sched<[]>;
+  def SEH_SaveReg : Pseudo<(outs), (ins i32imm:$reg, i32imm:$offs), []>, Sched<[]>;
+  def SEH_SaveReg_X : Pseudo<(outs), (ins i32imm:$reg, i32imm:$offs), []>, Sched<[]>;
+  def SEH_SaveRegP : Pseudo<(outs), (ins i32imm:$reg0, i32imm:$reg1, i32imm:$offs), []>, Sched<[]>;
+  def SEH_SaveRegP_X : Pseudo<(outs), (ins i32imm:$reg0, i32imm:$reg1, i32imm:$offs), []>, Sched<[]>;
+  def SEH_SaveFReg : Pseudo<(outs), (ins i32imm:$reg, i32imm:$offs), []>, Sched<[]>;
+  def SEH_SaveFReg_X :  Pseudo<(outs), (ins i32imm:$reg, i32imm:$offs), []>, Sched<[]>;
+  def SEH_SaveFRegP : Pseudo<(outs), (ins i32imm:$reg0, i32imm:$reg1, i32imm:$offs), []>, Sched<[]>;
+  def SEH_SaveFRegP_X : Pseudo<(outs), (ins i32imm:$reg0, i32imm:$reg1, i32imm:$offs), []>, Sched<[]>;
+  def SEH_SetFP : Pseudo<(outs), (ins), []>, Sched<[]>;
+  def SEH_AddFP : Pseudo<(outs), (ins i32imm:$offs), []>, Sched<[]>;
+  def SEH_Nop : Pseudo<(outs), (ins), []>, Sched<[]>;
+  def SEH_PrologEnd : Pseudo<(outs), (ins), []>, Sched<[]>;
+  def SEH_EpilogStart : Pseudo<(outs), (ins), []>, Sched<[]>;
+  def SEH_EpilogEnd : Pseudo<(outs), (ins), []>, Sched<[]>;
+}
 
 //===----------------------------------------------------------------------===//
 // Floating point immediate move.
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index a09ac6b94c1..7ca191c86ad 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -60,16 +60,6 @@ void AArch64TargetAsmStreamer::emitInst(uint32_t Inst) {
   OS << "\t.inst\t0x" << Twine::utohexstr(Inst) << "\n";
 }
 
-class AArch64TargetELFStreamer : public AArch64TargetStreamer {
-private:
-  AArch64ELFStreamer &getStreamer();
-
-  void emitInst(uint32_t Inst) override;
-
-public:
-  AArch64TargetELFStreamer(MCStreamer &S) : AArch64TargetStreamer(S) {}
-};
-
 /// Extend the generic ELFStreamer class so that it can emit mapping symbols at
 /// the appropriate points in the object files. These symbols are defined in the
 /// AArch64 ELF ABI:
@@ -197,6 +187,8 @@ private:
 
 } // end anonymous namespace
 
+namespace llvm {
+
 AArch64ELFStreamer &AArch64TargetELFStreamer::getStreamer() {
   return static_cast<AArch64ELFStreamer &>(Streamer);
 }
@@ -205,8 +197,6 @@ void AArch64TargetELFStreamer::emitInst(uint32_t Inst) {
   getStreamer().emitInst(Inst);
 }
 
-namespace llvm {
-
 MCTargetStreamer *createAArch64AsmTargetStreamer(MCStreamer &S,
                                                  formatted_raw_ostream &OS,
                                                  MCInstPrinter *InstPrint,
@@ -226,14 +216,4 @@ MCELFStreamer *createAArch64ELFStreamer(MCContext &Context,
   return S;
 }
 
-MCTargetStreamer *
-createAArch64ObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
-  const Triple &TT = STI.getTargetTriple();
-  if (TT.isOSBinFormatELF())
-    return new AArch64TargetELFStreamer(S);
-  if (TT.isOSBinFormatCOFF())
-    return new AArch64TargetWinCOFFStreamer(S);
-  return nullptr;
-}
-
 } // end namespace llvm
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index ebb49121c1b..0e486b93923 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -115,6 +115,7 @@ AArch64MCAsmInfoMicrosoftCOFF::AArch64MCAsmInfoMicrosoftCOFF() {
 
   CommentString = ";";
   ExceptionsType = ExceptionHandling::WinEH;
+  WinEHEncodingType = WinEH::EncodingType::Itanium;
 }
 
 AArch64MCAsmInfoGNUCOFF::AArch64MCAsmInfoGNUCOFF() {
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
index dee964df263..a6b8d963bef 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
@@ -13,6 +13,7 @@
 
 #include "AArch64TargetStreamer.h"
 #include "llvm/MC/ConstantPools.h"
+#include "llvm/MC/MCSubtargetInfo.h"
 
 using namespace llvm;
 
@@ -52,3 +53,17 @@ void AArch64TargetStreamer::emitInst(uint32_t Inst) {
 
   getStreamer().EmitBytes(StringRef(Buffer, 4));
 }
+
+namespace llvm {
+
+MCTargetStreamer *
+createAArch64ObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
+  const Triple &TT = STI.getTargetTriple();
+  if (TT.isOSBinFormatELF())
+    return new AArch64TargetELFStreamer(S);
+  if (TT.isOSBinFormatCOFF())
+    return new AArch64TargetWinCOFFStreamer(S);
+  return nullptr;
+}
+
+} // end namespace llvm
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h b/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
index 51432830f79..73fb9baea3e 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
@@ -12,6 +12,10 @@
 
 #include "llvm/MC/MCStreamer.h"
 
+namespace {
+class AArch64ELFStreamer;
+}
+
 namespace llvm {
 
 class AArch64TargetStreamer : public MCTargetStreamer {
@@ -33,10 +37,75 @@ public:
   /// Callback used to implement the .inst directive.
   virtual void emitInst(uint32_t Inst);
 
+  virtual void EmitARM64WinCFIAllocStack(unsigned Size) {}
+  virtual void EmitARM64WinCFISaveFPLR(int Offset) {}
+  virtual void EmitARM64WinCFISaveFPLRX(int Offset) {}
+  virtual void EmitARM64WinCFISaveReg(unsigned Reg, int Offset) {}
+  virtual void EmitARM64WinCFISaveRegX(unsigned Reg, int Offset) {}
+  virtual void EmitARM64WinCFISaveRegP(unsigned Reg, int Offset) {}
+  virtual void EmitARM64WinCFISaveRegPX(unsigned Reg, int Offset) {}
+  virtual void EmitARM64WinCFISaveFReg(unsigned Reg, int Offset) {}
+  virtual void EmitARM64WinCFISaveFRegX(unsigned Reg, int Offset) {}
+  virtual void EmitARM64WinCFISaveFRegP(unsigned Reg, int Offset) {}
+  virtual void EmitARM64WinCFISaveFRegPX(unsigned Reg, int Offset) {}
+  virtual void EmitARM64WinCFISetFP() {}
+  virtual void EmitARM64WinCFIAddFP(unsigned Size) {}
+  virtual void EmitARM64WinCFINop() {}
+  virtual void EmitARM64WinCFIPrologEnd() {}
+  virtual void EmitARM64WinCFIEpilogStart() {}
+  virtual void EmitARM64WinCFIEpilogEnd() {}
+
 private:
   std::unique_ptr<AssemblerConstantPools> ConstantPools;
 };
 
+class AArch64TargetELFStreamer : public AArch64TargetStreamer {
+private:
+  AArch64ELFStreamer &getStreamer();
+
+  void emitInst(uint32_t Inst) override;
+
+public:
+  AArch64TargetELFStreamer(MCStreamer &S) : AArch64TargetStreamer(S) {}
+};
+
+class AArch64TargetWinCOFFStreamer : public llvm::AArch64TargetStreamer {
+private:
+  // True if we are processing SEH directives in an epilogue.
+  bool InEpilogCFI = false;
+
+  // Symbol of the current epilog for which we are processing SEH directives.
+  MCSymbol *CurrentEpilog = nullptr;
+public:
+  AArch64TargetWinCOFFStreamer(llvm::MCStreamer &S)
+    : AArch64TargetStreamer(S) {}
+
+  // The unwind codes on ARM64 Windows are documented at
+  // https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling
+  void EmitARM64WinCFIAllocStack(unsigned Size) override;
+  void EmitARM64WinCFISaveFPLR(int Offset) override;
+  void EmitARM64WinCFISaveFPLRX(int Offset) override;
+  void EmitARM64WinCFISaveReg(unsigned Reg, int Offset) override;
+  void EmitARM64WinCFISaveRegX(unsigned Reg, int Offset) override;
+  void EmitARM64WinCFISaveRegP(unsigned Reg, int Offset) override;
+  void EmitARM64WinCFISaveRegPX(unsigned Reg, int Offset) override;
+  void EmitARM64WinCFISaveFReg(unsigned Reg, int Offset) override;
+  void EmitARM64WinCFISaveFRegX(unsigned Reg, int Offset) override;
+  void EmitARM64WinCFISaveFRegP(unsigned Reg, int Offset) override;
+  void EmitARM64WinCFISaveFRegPX(unsigned Reg, int Offset) override;
+  void EmitARM64WinCFISetFP() override;
+  void EmitARM64WinCFIAddFP(unsigned Size) override;
+  void EmitARM64WinCFINop() override;
+  void EmitARM64WinCFIPrologEnd() override;
+  void EmitARM64WinCFIEpilogStart() override;
+  void EmitARM64WinCFIEpilogEnd() override;
+private:
+  void EmitARM64WinUnwindCode(unsigned UnwindCode, int Reg, int Offset);
+};
+
+MCTargetStreamer *
+createAArch64ObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI);
+
 } // end namespace llvm
 
 #endif
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
index 9871dc553be..7a65c7a63f1 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
@@ -11,12 +11,16 @@
 #include "llvm/MC/MCAsmBackend.h"
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCWin64EH.h"
+#include "llvm/MC/MCWinCOFFStreamer.h"
 
 using namespace llvm;
 
 namespace {
 
 class AArch64WinCOFFStreamer : public MCWinCOFFStreamer {
+  Win64EH::ARM64UnwindEmitter EHStreamer;
+
 public:
   friend class AArch64TargetWinCOFFStreamer;
 
@@ -25,17 +29,168 @@ public:
                          std::unique_ptr<MCObjectWriter> OW)
       : MCWinCOFFStreamer(C, std::move(AB), std::move(CE), std::move(OW)) {}
 
+  void EmitWinEHHandlerData(SMLoc Loc) override;
+  void EmitWindowsUnwindTables() override;
   void FinishImpl() override;
 };
 
+void AArch64WinCOFFStreamer::EmitWinEHHandlerData(SMLoc Loc) {
+  MCStreamer::EmitWinEHHandlerData(Loc);
+
+  // We have to emit the unwind info now, because this directive
+  // actually switches to the .xdata section!
+  EHStreamer.EmitUnwindInfo(*this, getCurrentWinFrameInfo());
+}
+
+void AArch64WinCOFFStreamer::EmitWindowsUnwindTables() {
+  if (!getNumWinFrameInfos())
+    return;
+  EHStreamer.Emit(*this);
+}
+
 void AArch64WinCOFFStreamer::FinishImpl() {
   EmitFrames(nullptr);
+  EmitWindowsUnwindTables();
 
   MCWinCOFFStreamer::FinishImpl();
 }
 } // end anonymous namespace
 
 namespace llvm {
+
+// Helper function to common out unwind code setup for those codes that can
+// belong to both prolog and epilog.
+// There are three types of Windows ARM64 SEH codes.  They can
+// 1) take no operands: SEH_Nop, SEH_PrologEnd, SEH_EpilogStart, SEH_EpilogEnd
+// 2) take an offset: SEH_StackAlloc, SEH_SaveFPLR, SEH_SaveFPLR_X
+// 3) take a register and an offset/size: all others
+void AArch64TargetWinCOFFStreamer::EmitARM64WinUnwindCode(unsigned UnwindCode,
+                                                          int Reg,
+                                                          int Offset) {
+  auto &S = getStreamer();
+  WinEH::FrameInfo *CurFrame = S.EnsureValidWinFrameInfo(SMLoc());
+  if (!CurFrame)
+    return;
+  MCSymbol *Label = S.EmitCFILabel();
+  auto Inst = WinEH::Instruction(UnwindCode, Label, Reg, Offset);
+  if (InEpilogCFI)
+    CurFrame->EpilogMap[CurrentEpilog].push_back(Inst);
+  else
+    CurFrame->Instructions.push_back(Inst);
+}
+
+void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIAllocStack(unsigned Size) {
+  unsigned Op = Win64EH::UOP_AllocSmall;
+  if (Size >= 16384)
+    Op = Win64EH::UOP_AllocLarge;
+  else if (Size >= 512)
+    Op = Win64EH::UOP_AllocMedium;
+  EmitARM64WinUnwindCode(Op, -1, Size);
+}
+
+void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveFPLR(int Offset) {
+  EmitARM64WinUnwindCode(Win64EH::UOP_SaveFPLR, -1, Offset);
+}
+
+void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveFPLRX(int Offset) {
+  EmitARM64WinUnwindCode(Win64EH::UOP_SaveFPLRX, -1, Offset);
+}
+
+void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveReg(unsigned Reg,
+                                                          int Offset) {
+  assert(Offset >= 0 && Offset <= 504 &&
+        "Offset for save reg should be >= 0 && <= 504");
+  EmitARM64WinUnwindCode(Win64EH::UOP_SaveReg, Reg, Offset);
+}
+
+void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveRegX(unsigned Reg,
+                                                           int Offset) {
+  EmitARM64WinUnwindCode(Win64EH::UOP_SaveRegX, Reg, Offset);
+}
+
+void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveRegP(unsigned Reg,
+                                                           int Offset) {
+  EmitARM64WinUnwindCode(Win64EH::UOP_SaveRegP, Reg, Offset);
+}
+
+void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveRegPX(unsigned Reg,
+                                                            int Offset) {
+  EmitARM64WinUnwindCode(Win64EH::UOP_SaveRegPX, Reg, Offset);
+}
+
+void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveFReg(unsigned Reg,
+                                                           int Offset) {
+  assert(Offset >= 0 && Offset <= 504 &&
+        "Offset for save reg should be >= 0 && <= 504");
+  EmitARM64WinUnwindCode(Win64EH::UOP_SaveFReg, Reg, Offset);
+}
+
+void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveFRegX(unsigned Reg,
+                                                            int Offset) {
+  EmitARM64WinUnwindCode(Win64EH::UOP_SaveFRegX, Reg, Offset);
+}
+
+void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveFRegP(unsigned Reg,
+                                                            int Offset) {
+  EmitARM64WinUnwindCode(Win64EH::UOP_SaveFRegP, Reg, Offset);
+}
+
+void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveFRegPX(unsigned Reg,
+                                                             int Offset) {
+  EmitARM64WinUnwindCode(Win64EH::UOP_SaveFRegPX, Reg, Offset);
+}
+
+void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISetFP() {
+  EmitARM64WinUnwindCode(Win64EH::UOP_SetFP, -1, 0);
+}
+
+void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIAddFP(unsigned Offset) {
+  assert(Offset <= 2040 && "UOP_AddFP must have offset <= 2040");
+  EmitARM64WinUnwindCode(Win64EH::UOP_AddFP, -1, Offset);
+}
+
+void AArch64TargetWinCOFFStreamer::EmitARM64WinCFINop() {
+  EmitARM64WinUnwindCode(Win64EH::UOP_Nop, -1, 0);
+}
+
+// The functions below handle opcodes that can end up in either a prolog or
+// an epilog, but not both.
+void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIPrologEnd() {
+  auto &S = getStreamer();
+  WinEH::FrameInfo *CurFrame = S.EnsureValidWinFrameInfo(SMLoc());
+  if (!CurFrame)
+    return;
+
+  MCSymbol *Label = S.EmitCFILabel();
+  CurFrame->PrologEnd = Label;
+  WinEH::Instruction Inst = WinEH::Instruction(Win64EH::UOP_End, Label, -1, 0);
+  auto it = CurFrame->Instructions.begin();
+  CurFrame->Instructions.insert(it, Inst);
+}
+
+void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIEpilogStart() {
+  auto &S = getStreamer();
+  WinEH::FrameInfo *CurFrame = S.EnsureValidWinFrameInfo(SMLoc());
+  if (!CurFrame)
+    return;
+
+  InEpilogCFI = true;
+  CurrentEpilog = S.EmitCFILabel();
+}
+
+void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIEpilogEnd() {
+  auto &S = getStreamer();
+  WinEH::FrameInfo *CurFrame = S.EnsureValidWinFrameInfo(SMLoc());
+  if (!CurFrame)
+    return;
+
+  InEpilogCFI = false;
+  MCSymbol *Label = S.EmitCFILabel();
+  WinEH::Instruction Inst = WinEH::Instruction(Win64EH::UOP_End, Label, -1, 0);
+  CurFrame->EpilogMap[CurrentEpilog].push_back(Inst);
+  CurrentEpilog = nullptr;
+}
+
 MCWinCOFFStreamer *createAArch64WinCOFFStreamer(
     MCContext &Context, std::unique_ptr<MCAsmBackend> MAB,
     std::unique_ptr<MCObjectWriter> OW, std::unique_ptr<MCCodeEmitter> Emitter,
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h
index c0542216358..ed265a876ab 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h
@@ -17,20 +17,6 @@
 #include "AArch64TargetStreamer.h"
 #include "llvm/MC/MCWinCOFFStreamer.h"
 
-namespace {
-class AArch64WinCOFFStreamer;
-
-class AArch64TargetWinCOFFStreamer : public llvm::AArch64TargetStreamer {
-private:
-  AArch64WinCOFFStreamer &getStreamer();
-
-public:
-  AArch64TargetWinCOFFStreamer(llvm::MCStreamer &S)
-    : AArch64TargetStreamer(S) {}
-};
-
-} // end anonymous namespace
-
 namespace llvm {
 
 MCWinCOFFStreamer *createAArch64WinCOFFStreamer(
diff --git a/test/CodeGen/AArch64/wineh1.mir b/test/CodeGen/AArch64/wineh1.mir
new file mode 100644
index 00000000000..6df9c638e65
--- /dev/null
+++ b/test/CodeGen/AArch64/wineh1.mir
@@ -0,0 +1,120 @@
+# RUN: llc -o - %s -mtriple=aarch64-windows -start-after=prologepilog -filetype=obj -disable-post-ra \
+# RUN:   | llvm-readobj -unwind | FileCheck %s
+# This test case checks the basic validity of the .xdata section.  It's
+# documented at:
+# https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling
+
+# We expect to see the following in the .xdata section:
+
+# CHECK: 	 ExceptionData {
+# CHECK-NEXT:      FunctionLength: 92
+# CHECK-NEXT:      Version: 0
+# CHECK-NEXT:      ExceptionData: No
+# CHECK-NEXT:      EpiloguePacked: No
+# CHECK-NEXT:      EpilogueScopes: 1
+# CHECK-NEXT:      ByteCodeLength: 28
+# CHECK-NEXT:      Prologue [
+# CHECK-NEXT:        0xc808              ; stp x19, x20, [sp, #64]
+# CHECK-NEXT:        0xd0c7              ; str x22, [sp, #56]
+# CHECK-NEXT:        0xd086              ; str x21, [sp, #48]
+# CHECK-NEXT:        0xc904              ; stp x23, x24, [sp, #32]
+# CHECK-NEXT:        0xc982              ; stp x25, x26, [sp, #16]
+# CHECK-NEXT:        0xce09              ; stp x27, x28, [sp, #-80]!
+# CHECK-NEXT:        0xe4                ; end
+# CHECK-NEXT:      ]
+# CHECK-NEXT:      EpilogueScopes [
+# CHECK-NEXT:        EpilogueScope {
+# CHECK-NEXT:          StartOffset: 15
+# CHECK-NEXT:          EpilogueStartIndex: 13
+# CHECK-NEXT:          Opcodes [
+# CHECK-NEXT:            0xc808              ; ldp x19, x20, [sp, #64]
+# CHECK-NEXT:            0xd086              ; ldr x21, [sp, #48]
+# CHECK-NEXT:            0xe3                ; nop
+# CHECK-NEXT:            0xd0c7              ; ldr x22, [sp, #56]
+# CHECK-NEXT:            0xc904              ; ldp x23, x24, [sp, #32]
+# CHECK-NEXT:            0xc982              ; ldp x25, x26, [sp, #16]
+# CHECK-NEXT:            0xce09              ; ldp x27, x28, [sp], #80
+# CHECK-NEXT:            0xe4                ; end
+# CHECK-NEXT:          ]
+# CHECK-NEXT:        }
+# CHECK-NEXT:      ]
+# CHECK-NEXT:    }
+...
+---
+name:            test
+alignment:       2
+tracksRegLiveness: true
+hasWinCFI: true
+liveins:
+  - { reg: '$w0' }
+frameInfo:
+  stackSize:       80
+  maxAlignment:    8
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: true
+stack:
+  - { id: 0, type: spill-slot, offset: -8, size: 8, alignment: 8, stack-id: 0,
+      callee-saved-register: '$x19' }
+  - { id: 1, type: spill-slot, offset: -16, size: 8, alignment: 8, stack-id: 0,
+      callee-saved-register: '$x20' }
+  - { id: 2, type: spill-slot, offset: -24, size: 8, alignment: 8, stack-id: 0,
+      callee-saved-register: '$x21' }
+  - { id: 3, type: spill-slot, offset: -32, size: 8, alignment: 8, stack-id: 0,
+      callee-saved-register: '$x22' }
+  - { id: 4, type: spill-slot, offset: -40, size: 8, alignment: 8, stack-id: 0,
+      callee-saved-register: '$x23' }
+  - { id: 5, type: spill-slot, offset: -48, size: 8, alignment: 8, stack-id: 0,
+      callee-saved-register: '$x24' }
+  - { id: 6, type: spill-slot, offset: -56, size: 8, alignment: 8, stack-id: 0,
+      callee-saved-register: '$x25' }
+  - { id: 7, type: spill-slot, offset: -64, size: 8, alignment: 8, stack-id: 0,
+      callee-saved-register: '$x26' }
+  - { id: 8, type: spill-slot, offset: -72, size: 8, alignment: 8, stack-id: 0,
+      callee-saved-register: '$x27' }
+  - { id: 9, type: spill-slot, offset: -80, size: 8, alignment: 8, stack-id: 0,
+      callee-saved-register: '$x28' }
+body:             |
+  bb.0.entry:
+    liveins: $x0, $x1, $x27, $x28, $x25, $x26, $x23, $x24, $x21, $x22, $x19, $x20
+    early-clobber $sp = frame-setup STPXpre killed $x27, killed $x28, $sp, -10 :: (store 8 into %stack.8), (store 8 into %stack.9)
+    frame-setup SEH_SaveRegP_X 27, 28, -80
+    frame-setup STPXi killed $x25, killed $x26, $sp, 2 :: (store 8 into %stack.6), (store 8 into %stack.7)
+    frame-setup SEH_SaveRegP 25, 26, 16
+    frame-setup STPXi killed $x23, killed $x24, $sp, 4 :: (store 8 into %stack.4), (store 8 into %stack.5)
+    frame-setup SEH_SaveRegP 23, 24, 32
+    frame-setup STRXui killed $x21, $sp, 6 :: (store 8 into %stack.2)
+    frame-setup SEH_SaveReg 21, 48
+    frame-setup STRXui killed $x22, $sp, 7 :: (store 8 into %stack.3)
+    frame-setup SEH_SaveReg 22, 56
+    frame-setup STPXi killed $x19, killed $x20, $sp, 8 :: (store 8 into %stack.0), (store 8 into %stack.1)
+    frame-setup SEH_SaveRegP 19, 20, 64
+    frame-setup SEH_PrologEnd
+    $x19 = ADDXrr $x0, killed $x1
+    $x20 = ADDXrr $x19, killed $x0
+    $x21 = ADDXrr $x20, killed $x19
+    $x22 = ADDXrr $x21, killed $x20
+    $x23 = ADDXrr $x22, killed $x21
+    $x24 = ADDXrr $x23, killed $x22
+    $x25 = ADDXrr $x24, killed $x23
+    $x26 = ADDXrr $x25, killed $x24
+    $x27 = ADDXrr $x26, killed $x25
+    $x28 = ADDXrr $x27, killed $x26
+    frame-destroy SEH_EpilogStart
+    $x19, $x20 = frame-destroy LDPXi $sp, 8 :: (load 8 from %stack.0), (load 8 from %stack.1)
+    frame-destroy SEH_SaveRegP 19, 20, 64
+    $x21 = frame-destroy LDRXui $sp, 6 :: (load 8 from %stack.2)
+    frame-destroy SEH_SaveReg 21, 48
+    $x0 = COPY $x28
+    frame-destroy SEH_Nop
+    $x21 = frame-destroy LDRXui $sp, 6 :: (load 8 from %stack.2)
+    frame-destroy SEH_SaveReg 22, 56
+    $x23, $x24 = frame-destroy LDPXi $sp, 4 :: (load 8 from %stack.4), (load 8 from %stack.5)
+    frame-destroy SEH_SaveRegP 23, 24, 32
+    $x25, $x26 = frame-destroy LDPXi $sp, 2 :: (load 8 from %stack.6), (load 8 from %stack.7)
+    frame-destroy SEH_SaveRegP 25, 26, 16
+    early-clobber $sp, $x27, $x28 = frame-destroy LDPXpost $sp, 10 :: (load 8 from %stack.8), (load 8 from %stack.9)
+    frame-destroy SEH_SaveRegP_X 27, 28, -80
+    frame-destroy SEH_EpilogEnd
+    RET_ReallyLR implicit $x0
+
+...
diff --git a/test/CodeGen/AArch64/wineh2.mir b/test/CodeGen/AArch64/wineh2.mir
new file mode 100644
index 00000000000..29b20963444
--- /dev/null
+++ b/test/CodeGen/AArch64/wineh2.mir
@@ -0,0 +1,185 @@
+# RUN: llc -o - %s -mtriple=aarch64-windows -start-after=prologepilog \
+# RUN:   -disable-post-ra -filetype=obj | llvm-readobj -unwind | FileCheck %s
+# Test that the pre/post increment save of a flating point register is correct.
+
+# CHECK:        ExceptionData {
+# CHECK-NEXT:      FunctionLength: 136
+# CHECK-NEXT:      Version: 0
+# CHECK-NEXT:      ExceptionData: No
+# CHECK-NEXT:      EpiloguePacked: No
+# CHECK-NEXT:      EpilogueScopes: 1
+# CHECK-NEXT:      ByteCodeLength: 40
+# CHECK-NEXT:      Prologue [
+# CHECK-NEXT:        0xc80e              ; stp x19, x20, [sp, #112]
+# CHECK-NEXT:        0xc88c              ; stp x21, x22, [sp, #96]
+# CHECK-NEXT:        0xc90a              ; stp x23, x24, [sp, #80]
+# CHECK-NEXT:        0xc988              ; stp x25, x26, [sp, #64]
+# CHECK-NEXT:        0xca06              ; stp x27, x28, [sp, #48]
+# CHECK-NEXT:        0xdc45              ; str d9, [sp, #40]
+# CHECK-NEXT:        0xdc04              ; str d8, [sp, #32]
+# CHECK-NEXT:        0xd882              ; stp d10, d11, [sp, #16]
+# CHECK-NEXT:        0xde8f              ; str d12, [sp, #-128]!
+# CHECK-NEXT:        0xe4                ; end
+# CHECK-NEXT:      ]
+# CHECK-NEXT:      EpilogueScopes [
+# CHECK-NEXT:        EpilogueScope {
+# CHECK-NEXT:          StartOffset: 25
+# CHECK-NEXT:          EpilogueStartIndex: 19
+# CHECK-NEXT:          Opcodes [
+# CHECK-NEXT:            0xc80e              ; ldp x19, x20, [sp, #112]
+# CHECK-NEXT:            0xc88c              ; ldp x21, x22, [sp, #96]
+# CHECK-NEXT:            0xc90a              ; ldp x23, x24, [sp, #80]
+# CHECK-NEXT:            0xc988              ; ldp x25, x26, [sp, #64]
+# CHECK-NEXT:            0xca06              ; ldp x27, x28, [sp, #48]
+# CHECK-NEXT:            0xdc04              ; ldr d8, [sp, #32]
+# CHECK-NEXT:            0xdc45              ; ldr d9, [sp, #40]
+# CHECK-NEXT:            0xd882              ; ldp d10, d11, [sp, #16]
+# CHECK-NEXT:            0xde8f              ; ldr d12, [sp], #128
+# CHECK-NEXT:            0xe4                ; end
+# CHECK-NEXT:          ]
+# CHECK-NEXT:        }
+# CHECK-NEXT:      ]
+# CHECK-NEXT:    }
+...
+---
+name:            test
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       true
+registers:
+liveins:
+  - { reg: '$w0', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       128
+  offsetAdjustment: 0
+  maxAlignment:    16
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: true
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+  - { id: 0, name: '', type: spill-slot, offset: -8, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x19', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 1, name: '', type: spill-slot, offset: -16, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x20', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 2, name: '', type: spill-slot, offset: -24, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x21', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 3, name: '', type: spill-slot, offset: -32, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x22', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 4, name: '', type: spill-slot, offset: -40, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x23', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 5, name: '', type: spill-slot, offset: -48, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x24', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 6, name: '', type: spill-slot, offset: -56, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x25', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 7, name: '', type: spill-slot, offset: -64, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x26', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 8, name: '', type: spill-slot, offset: -72, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x27', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 9, name: '', type: spill-slot, offset: -80, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x28', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 10, name: '', type: spill-slot, offset: -88, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$d8', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 11, name: '', type: spill-slot, offset: -96, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$d9', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 12, name: '', type: spill-slot, offset: -104, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$d10', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 13, name: '', type: spill-slot, offset: -112, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$d11', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 14, name: '', type: spill-slot, offset: -128, size: 8, alignment: 16,
+      stack-id: 0, callee-saved-register: '$d12', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+constants:
+body:             |
+  bb.0.entry:
+    liveins: $x0, $x1, $d0, $d1, $d8, $d9, $d10, $d11, $d12, $x19, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28
+
+    early-clobber $sp = frame-setup STRDpre killed $d12, $sp, -128 :: (store 8 into %stack.14)
+    frame-setup SEH_SaveFReg_X 12, -128
+    frame-setup STPDi killed $d10, killed $d11, $sp, 2 :: (store 8 into %stack.12), (store 8 into %stack.13)
+    frame-setup SEH_SaveFRegP 10, 11, 16
+    frame-setup STRDui killed $d8, $sp, 4 :: (store 8 into %stack.10)
+    frame-setup SEH_SaveFReg 8, 32
+    frame-setup STRDui killed $d9, $sp, 5 :: (store 8 into %stack.11)
+    frame-setup SEH_SaveFReg 9, 40
+    frame-setup STPXi killed $x27, killed $x28, $sp, 6 :: (store 8 into %stack.8), (store 8 into %stack.9)
+    frame-setup SEH_SaveRegP 27, 28, 48
+    frame-setup STPXi killed $x25, killed $x26, $sp, 8 :: (store 8 into %stack.6), (store 8 into %stack.7)
+    frame-setup SEH_SaveRegP 25, 26, 64
+    frame-setup STPXi killed $x23, killed $x24, $sp, 10 :: (store 8 into %stack.4), (store 8 into %stack.5)
+    frame-setup SEH_SaveRegP 23, 24, 80
+    frame-setup STPXi killed $x21, killed $x22, $sp, 12 :: (store 8 into %stack.2), (store 8 into %stack.3)
+    frame-setup SEH_SaveRegP 21, 22, 96
+    frame-setup STPXi killed $x19, killed $x20, $sp, 14 :: (store 8 into %stack.0), (store 8 into %stack.1)
+    frame-setup SEH_SaveRegP 19, 20, 112
+    frame-setup SEH_PrologEnd
+    $x19 = ADDXrr $x0, killed $x1
+    $d8 = FADDDrr killed $d0, $d1
+    $d9 = FADDDrr $d8, $d1
+    $d10 = FADDDrr $d9, $d8
+    $d11 = FADDDrr killed $d9, $d10
+    $d12 = FADDDrr killed $d10, killed $d11
+    $x20 = ADDXrr $x19, killed $x0
+    $x21 = ADDXrr $x20, killed $x19
+    $x22 = ADDXrr $x21, killed $x20
+    $x23 = ADDXrr $x22, killed $x21
+    $x24 = ADDXrr $x23, killed $x22
+    $x25 = ADDXrr $x24, killed $x23
+    $x26 = ADDXrr $x25, killed $x24
+    $x27 = ADDXrr $x26, killed $x25
+    $x28 = ADDXrr $x27, killed $x26
+    $x0 = COPY $d12
+    $x0 = ADDXrr $x0, killed $x28
+    frame-destroy SEH_EpilogStart
+    $x19, $x20 = frame-destroy LDPXi $sp, 14 :: (load 8 from %stack.0), (load 8 from %stack.1)
+    frame-destroy SEH_SaveRegP 19, 20, 112
+    $x21, $x22 = frame-destroy LDPXi $sp, 12 :: (load 8 from %stack.2), (load 8 from %stack.3)
+    frame-destroy SEH_SaveRegP 21, 22, 96
+    $x23, $x24 = frame-destroy LDPXi $sp, 10 :: (load 8 from %stack.4), (load 8 from %stack.5)
+    frame-destroy SEH_SaveRegP 23, 24, 80
+    $x25, $x26 = frame-destroy LDPXi $sp, 8 :: (load 8 from %stack.6), (load 8 from %stack.7)
+    frame-destroy SEH_SaveRegP 25, 26, 64
+    $x27, $x28 = frame-destroy LDPXi $sp, 6 :: (load 8 from %stack.8), (load 8 from %stack.9)
+    frame-destroy SEH_SaveRegP 27, 28, 48
+    $d8 = frame-destroy LDRDui $sp, 4 :: (load 8 from %stack.10)
+    frame-destroy SEH_SaveFReg 8, 32
+    $d9 = frame-destroy LDRDui $sp, 5 :: (load 8 from %stack.11)
+    frame-destroy SEH_SaveFReg 9, 40
+    $d10, $d11 = frame-destroy LDPDi $sp, 2 :: (load 8 from %stack.12), (load 8 from %stack.13)
+    frame-destroy SEH_SaveFRegP 10, 11, 16
+    early-clobber $sp, $d12 = frame-destroy LDRDpost $sp, 128 :: (load 8 from %stack.14)
+    frame-destroy SEH_SaveFReg_X 12, -128
+    frame-destroy SEH_EpilogEnd
+    RET_ReallyLR implicit $x0
+
+...
diff --git a/test/CodeGen/AArch64/wineh3.mir b/test/CodeGen/AArch64/wineh3.mir
new file mode 100644
index 00000000000..6d54430cfc5
--- /dev/null
+++ b/test/CodeGen/AArch64/wineh3.mir
@@ -0,0 +1,171 @@
+# RUN: llc -o - %s -mtriple=aarch64-windows -start-after=prologepilog \
+# RUN:   -disable-post-ra -filetype=obj | llvm-readobj -unwind | FileCheck %s
+# Test that the register pairing of both general purpose and floating point
+# registers is correctly saved in the .xdata section, as well as the pre/post
+# increment of floating point register pairs.
+
+# CHECK:        ExceptionData {
+# CHECK-NEXT:      FunctionLength: 124
+# CHECK-NEXT:      Version: 0
+# CHECK-NEXT:      ExceptionData: No
+# CHECK-NEXT:      EpiloguePacked: No
+# CHECK-NEXT:      EpilogueScopes: 1
+# CHECK-NEXT:      ByteCodeLength: 32
+# CHECK-NEXT:      Prologue [
+# CHECK-NEXT:        0xc80c              ; stp x19, x20, [sp, #96]
+# CHECK-NEXT:        0xc88a              ; stp x21, x22, [sp, #80]
+# CHECK-NEXT:        0xc908              ; stp x23, x24, [sp, #64]
+# CHECK-NEXT:        0xc986              ; stp x25, x26, [sp, #48]
+# CHECK-NEXT:        0xca04              ; stp x27, x28, [sp, #32]
+# CHECK-NEXT:        0xd802              ; stp d8, d9, [sp, #16]
+# CHECK-NEXT:        0xda8d              ; stp d10, d11, [sp, #-112]!
+# CHECK-NEXT:        0xe4                ; end
+# CHECK-NEXT:      ]
+# CHECK-NEXT:      EpilogueScopes [
+# CHECK-NEXT:        EpilogueScope {
+# CHECK-NEXT:          StartOffset: 23
+# CHECK-NEXT:          EpilogueStartIndex: 15
+# CHECK-NEXT:          Opcodes [
+# CHECK-NEXT:            0xc80c              ; ldp x19, x20, [sp, #96]
+# CHECK-NEXT:            0xc88a              ; ldp x21, x22, [sp, #80]
+# CHECK-NEXT:            0xc908              ; ldp x23, x24, [sp, #64]
+# CHECK-NEXT:            0xc986              ; ldp x25, x26, [sp, #48]
+# CHECK-NEXT:            0xca04              ; ldp x27, x28, [sp, #32]
+# CHECK-NEXT:            0xd802              ; ldp d8, d9, [sp, #16]
+# CHECK-NEXT:            0xda8d              ; ldp d10, d11, [sp], #112
+# CHECK-NEXT:            0xe4                ; end
+# CHECK-NEXT:          ]
+# CHECK-NEXT:        }
+# CHECK-NEXT:      ]
+# CHECK-NEXT:    }
+...
+---
+name:            test
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       true
+registers:
+liveins:
+  - { reg: '$w0', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       112
+  offsetAdjustment: 0
+  maxAlignment:    8
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: true
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+  - { id: 0, name: '', type: spill-slot, offset: -8, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x19', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 1, name: '', type: spill-slot, offset: -16, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x20', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 2, name: '', type: spill-slot, offset: -24, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x21', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 3, name: '', type: spill-slot, offset: -32, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x22', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 4, name: '', type: spill-slot, offset: -40, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x23', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 5, name: '', type: spill-slot, offset: -48, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x24', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 6, name: '', type: spill-slot, offset: -56, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x25', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 7, name: '', type: spill-slot, offset: -64, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x26', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 8, name: '', type: spill-slot, offset: -72, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x27', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 9, name: '', type: spill-slot, offset: -80, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x28', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 10, name: '', type: spill-slot, offset: -88, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$d8', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 11, name: '', type: spill-slot, offset: -96, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$d9', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 12, name: '', type: spill-slot, offset: -104, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$d10', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 13, name: '', type: spill-slot, offset: -112, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$d11', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+constants:
+body:             |
+  bb.0.entry:
+    liveins: $x0, $x1, $d0, $d1, $d10, $d11, $d8, $d9, $x27, $x28, $x25, $x26, $x23, $x24, $x21, $x22, $x19, $x20
+
+    early-clobber $sp = frame-setup STPDpre killed $d10, killed $d11, $sp, -14 :: (store 8 into %stack.12), (store 8 into %stack.13)
+    frame-setup SEH_SaveFRegP_X 10, 11, -112
+    frame-setup STPDi killed $d8, killed $d9, $sp, 2 :: (store 8 into %stack.10), (store 8 into %stack.11)
+    frame-setup SEH_SaveFRegP 8, 9, 16
+    frame-setup STPXi killed $x27, killed $x28, $sp, 4 :: (store 8 into %stack.8), (store 8 into %stack.9)
+    frame-setup SEH_SaveRegP 27, 28, 32
+    frame-setup STPXi killed $x25, killed $x26, $sp, 6 :: (store 8 into %stack.6), (store 8 into %stack.7)
+    frame-setup SEH_SaveRegP 25, 26, 48
+    frame-setup STPXi killed $x23, killed $x24, $sp, 8 :: (store 8 into %stack.4), (store 8 into %stack.5)
+    frame-setup SEH_SaveRegP 23, 24, 64
+    frame-setup STPXi killed $x21, killed $x22, $sp, 10 :: (store 8 into %stack.2), (store 8 into %stack.3)
+    frame-setup SEH_SaveRegP 21, 22, 80
+    frame-setup STPXi killed $x19, killed $x20, $sp, 12 :: (store 8 into %stack.0), (store 8 into %stack.1)
+    frame-setup SEH_SaveRegP 19, 20, 96
+    frame-setup SEH_PrologEnd
+    $x19 = ADDXrr $x0, killed $x1
+    $d8 = FADDDrr killed $d0, $d1
+    $d9 = FADDDrr $d8, $d1
+    $d10 = FADDDrr $d9, $d8
+    $d11 = FADDDrr killed $d9, $d10
+    $x20 = ADDXrr $x19, killed $x0
+    $x21 = ADDXrr $x20, killed $x19
+    $x22 = ADDXrr $x21, killed $x20
+    $x23 = ADDXrr $x22, killed $x21
+    $x24 = ADDXrr $x23, killed $x22
+    $x25 = ADDXrr $x24, killed $x23
+    $x26 = ADDXrr $x25, killed $x24
+    $x27 = ADDXrr $x26, killed $x25
+    $x28 = ADDXrr $x27, killed $x26
+    $x0 = COPY $d11
+    $x0 = ADDXrr $x0, killed $x28
+    frame-destroy SEH_EpilogStart
+    $x19, $x20 = frame-destroy LDPXi $sp, 12 :: (load 8 from %stack.0), (load 8 from %stack.1)
+    frame-destroy SEH_SaveRegP 19, 20, 96
+    $x21, $x22 = frame-destroy LDPXi $sp, 10 :: (load 8 from %stack.2), (load 8 from %stack.3)
+    frame-destroy SEH_SaveRegP 21, 22, 80
+    $x23, $x24 = frame-destroy LDPXi $sp, 8 :: (load 8 from %stack.4), (load 8 from %stack.5)
+    frame-destroy SEH_SaveRegP 23, 24, 64
+    $x25, $x26 = frame-destroy LDPXi $sp, 6 :: (load 8 from %stack.6), (load 8 from %stack.7)
+    frame-destroy SEH_SaveRegP 25, 26, 48
+    $x27, $x28 = frame-destroy LDPXi $sp, 4 :: (load 8 from %stack.8), (load 8 from %stack.9)
+    frame-destroy SEH_SaveRegP 27, 28, 32
+    $d8, $d9 = frame-destroy LDPDi $sp, 2 :: (load 8 from %stack.10), (load 8 from %stack.11)
+    frame-destroy SEH_SaveFRegP 8, 9, 16
+    early-clobber $sp, $d10, $d11 = frame-destroy LDPDpost $sp, 14 :: (load 8 from %stack.12), (load 8 from %stack.13)
+    frame-destroy SEH_SaveFRegP_X 10, 11, -112
+    frame-destroy SEH_EpilogEnd
+    RET_ReallyLR implicit $x0
+
+...
diff --git a/test/CodeGen/AArch64/wineh4.mir b/test/CodeGen/AArch64/wineh4.mir
new file mode 100644
index 00000000000..39a0d7ec694
--- /dev/null
+++ b/test/CodeGen/AArch64/wineh4.mir
@@ -0,0 +1,228 @@
+# RUN: llc -o - %s -mtriple=aarch64-windows -start-after=prologepilog \
+# RUN:   -disable-branch-fold -disable-post-ra -filetype=obj \
+# RUN: | llvm-readobj -unwind | FileCheck %s
+# Check that multiple epilgoues are correctly placed in .xdata.
+
+# CHECK:        ExceptionData {
+# CHECK-NEXT:      FunctionLength: 164
+# CHECK-NEXT:      Version: 0
+# CHECK-NEXT:      ExceptionData: No
+# CHECK-NEXT:      EpiloguePacked: No
+# CHECK-NEXT:      EpilogueScopes: 2
+# CHECK-NEXT:      ByteCodeLength: 48
+# CHECK-NEXT:      Prologue [
+# CHECK-NEXT:        0xc80c              ; stp x19, x20, [sp, #96]
+# CHECK-NEXT:        0xc88a              ; stp x21, x22, [sp, #80]
+# CHECK-NEXT:        0xc908              ; stp x23, x24, [sp, #64]
+# CHECK-NEXT:        0xc986              ; stp x25, x26, [sp, #48]
+# CHECK-NEXT:        0xca04              ; stp x27, x28, [sp, #32]
+# CHECK-NEXT:        0xd802              ; stp d8, d9, [sp, #16]
+# CHECK-NEXT:        0xda8d              ; stp d10, d11, [sp, #-112]!
+# CHECK-NEXT:        0xe4                ; end
+# CHECK-NEXT:      ]
+# CHECK-NEXT:      EpilogueScopes [
+# CHECK-NEXT:        EpilogueScope {
+# CHECK-NEXT:          StartOffset: 16
+# CHECK-NEXT:          EpilogueStartIndex: 15
+# CHECK-NEXT:          Opcodes [
+# CHECK-NEXT:            0xc80c              ; ldp x19, x20, [sp, #96]
+# CHECK-NEXT:            0xc88a              ; ldp x21, x22, [sp, #80]
+# CHECK-NEXT:            0xc908              ; ldp x23, x24, [sp, #64]
+# CHECK-NEXT:            0xc986              ; ldp x25, x26, [sp, #48]
+# CHECK-NEXT:            0xca04              ; ldp x27, x28, [sp, #32]
+# CHECK-NEXT:            0xd802              ; ldp d8, d9, [sp, #16]
+# CHECK-NEXT:            0xda8d              ; ldp d10, d11, [sp], #112
+# CHECK-NEXT:            0xe4                ; end
+# CHECK-NEXT:          ]
+# CHECK-NEXT:        }
+# CHECK-NEXT:        EpilogueScope {
+# CHECK-NEXT:          StartOffset: 33
+# CHECK-NEXT:          EpilogueStartIndex: 30
+# CHECK-NEXT:          Opcodes [
+# CHECK-NEXT:            0xc80c              ; ldp x19, x20, [sp, #96]
+# CHECK-NEXT:            0xc88a              ; ldp x21, x22, [sp, #80]
+# CHECK-NEXT:            0xc908              ; ldp x23, x24, [sp, #64]
+# CHECK-NEXT:            0xc986              ; ldp x25, x26, [sp, #48]
+# CHECK-NEXT:            0xca04              ; ldp x27, x28, [sp, #32]
+# CHECK-NEXT:            0xd802              ; ldp d8, d9, [sp, #16]
+# CHECK-NEXT:            0xda8d              ; ldp d10, d11, [sp], #112
+# CHECK-NEXT:            0xe4                ; end
+# CHECK-NEXT:          ]
+# CHECK-NEXT:        }
+# CHECK-NEXT:      ]
+# CHECK-NEXT:    }
+...
+---
+name:            test
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       true
+registers:
+liveins:
+  - { reg: '$w0', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       112
+  offsetAdjustment: 0
+  maxAlignment:    8
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: true
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+  - { id: 0, name: '', type: spill-slot, offset: -8, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x19', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 1, name: '', type: spill-slot, offset: -16, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x20', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 2, name: '', type: spill-slot, offset: -24, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x21', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 3, name: '', type: spill-slot, offset: -32, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x22', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 4, name: '', type: spill-slot, offset: -40, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x23', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 5, name: '', type: spill-slot, offset: -48, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x24', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 6, name: '', type: spill-slot, offset: -56, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x25', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 7, name: '', type: spill-slot, offset: -64, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x26', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 8, name: '', type: spill-slot, offset: -72, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x27', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 9, name: '', type: spill-slot, offset: -80, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x28', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 10, name: '', type: spill-slot, offset: -88, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$d8', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 11, name: '', type: spill-slot, offset: -96, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$d9', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 12, name: '', type: spill-slot, offset: -104, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$d10', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 13, name: '', type: spill-slot, offset: -112, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$d11', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+constants:
+body:             |
+  bb.0.entry:
+    successors: %bb.2(0x40000000), %bb.1(0x40000000)
+    liveins: $x0, $x1, $d0, $d1, $d10, $d11, $d8, $d9, $x27, $x28, $x25, $x26, $x23, $x24, $x21, $x22, $x19, $x20
+
+    early-clobber $sp = frame-setup STPDpre killed $d10, killed $d11, $sp, -14 :: (store 8 into %stack.12), (store 8 into %stack.13)
+    frame-setup SEH_SaveFRegP_X 10, 11, -112
+    frame-setup STPDi killed $d8, killed $d9, $sp, 2 :: (store 8 into %stack.10), (store 8 into %stack.11)
+    frame-setup SEH_SaveFRegP 8, 9, 16
+    frame-setup STPXi killed $x27, killed $x28, $sp, 4 :: (store 8 into %stack.8), (store 8 into %stack.9)
+    frame-setup SEH_SaveRegP 27, 28, 32
+    frame-setup STPXi killed $x25, killed $x26, $sp, 6 :: (store 8 into %stack.6), (store 8 into %stack.7)
+    frame-setup SEH_SaveRegP 25, 26, 48
+    frame-setup STPXi killed $x23, killed $x24, $sp, 8 :: (store 8 into %stack.4), (store 8 into %stack.5)
+    frame-setup SEH_SaveRegP 23, 24, 64
+    frame-setup STPXi killed $x21, killed $x22, $sp, 10 :: (store 8 into %stack.2), (store 8 into %stack.3)
+    frame-setup SEH_SaveRegP 21, 22, 80
+    frame-setup STPXi killed $x19, killed $x20, $sp, 12 :: (store 8 into %stack.0), (store 8 into %stack.1)
+    frame-setup SEH_SaveRegP 19, 20, 96
+    frame-setup SEH_PrologEnd
+    frame-setup CFI_INSTRUCTION def_cfa_offset 112
+    frame-setup CFI_INSTRUCTION offset $w19, -8
+    frame-setup CFI_INSTRUCTION offset $w20, -16
+    frame-setup CFI_INSTRUCTION offset $w21, -24
+    frame-setup CFI_INSTRUCTION offset $w22, -32
+    frame-setup CFI_INSTRUCTION offset $w23, -40
+    frame-setup CFI_INSTRUCTION offset $w24, -48
+    frame-setup CFI_INSTRUCTION offset $w25, -56
+    frame-setup CFI_INSTRUCTION offset $w26, -64
+    frame-setup CFI_INSTRUCTION offset $w27, -72
+    frame-setup CFI_INSTRUCTION offset $w28, -80
+    frame-setup CFI_INSTRUCTION offset $b8, -88
+    frame-setup CFI_INSTRUCTION offset $b9, -96
+    frame-setup CFI_INSTRUCTION offset $b10, -104
+    frame-setup CFI_INSTRUCTION offset $b11, -112
+    $x19 = ADDXrr $x0, killed $x1
+    $d8 = FADDDrr killed $d0, $d1
+    $d9 = FADDDrr $d8, $d1
+    $d10 = FADDDrr $d9, $d8
+    $d11 = FADDDrr killed $d9, $d10
+    $x20 = SUBSXrr $x19, killed $x0, implicit-def $nzcv
+    Bcc 1, %bb.2, implicit killed $nzcv
+    B %bb.1
+
+  bb.1:
+    liveins: $x19, $x20
+
+    $x21 = ADDXrr $x20, killed $x19
+    $x22 = ADDXrr $x21, killed $x20
+    $x23 = ADDXrr $x22, killed $x21
+    $x24 = ADDXrr $x23, killed $x22
+    $x25 = ADDXrr $x24, killed $x23
+    $x26 = ADDXrr $x25, killed $x24
+    $x27 = ADDXrr $x26, killed $x25
+    $x28 = ADDXrr $x27, killed $x26
+    $x0 = COPY $x28
+    frame-destroy SEH_EpilogStart
+    $x19, $x20 = frame-destroy LDPXi $sp, 12 :: (load 8 from %stack.0), (load 8 from %stack.1)
+    frame-destroy SEH_SaveRegP 19, 20, 96
+    $x21, $x22 = frame-destroy LDPXi $sp, 10 :: (load 8 from %stack.2), (load 8 from %stack.3)
+    frame-destroy SEH_SaveRegP 21, 22, 80
+    $x23, $x24 = frame-destroy LDPXi $sp, 8 :: (load 8 from %stack.4), (load 8 from %stack.5)
+    frame-destroy SEH_SaveRegP 23, 24, 64
+    $x25, $x26 = frame-destroy LDPXi $sp, 6 :: (load 8 from %stack.6), (load 8 from %stack.7)
+    frame-destroy SEH_SaveRegP 25, 26, 48
+    $x27, $x28 = frame-destroy LDPXi $sp, 4 :: (load 8 from %stack.8), (load 8 from %stack.9)
+    frame-destroy SEH_SaveRegP 27, 28, 32
+    $d8, $d9 = frame-destroy LDPDi $sp, 2 :: (load 8 from %stack.10), (load 8 from %stack.11)
+    frame-destroy SEH_SaveFRegP 8, 9, 16
+    early-clobber $sp, $d10, $d11 = frame-destroy LDPDpost $sp, 14 :: (load 8 from %stack.12), (load 8 from %stack.13)
+    frame-destroy SEH_SaveFRegP_X 10, 11, -112
+    frame-destroy SEH_EpilogEnd
+    RET_ReallyLR implicit $x0
+
+  bb.2:
+    liveins: $x28, $d11
+
+    $x0 = COPY $d11
+    $x0 = ADDXrr $x0, killed $x28
+    frame-destroy SEH_EpilogStart
+    $x19, $x20 = frame-destroy LDPXi $sp, 12 :: (load 8 from %stack.0), (load 8 from %stack.1)
+    frame-destroy SEH_SaveRegP 19, 20, 96
+    $x21, $x22 = frame-destroy LDPXi $sp, 10 :: (load 8 from %stack.2), (load 8 from %stack.3)
+    frame-destroy SEH_SaveRegP 21, 22, 80
+    $x23, $x24 = frame-destroy LDPXi $sp, 8 :: (load 8 from %stack.4), (load 8 from %stack.5)
+    frame-destroy SEH_SaveRegP 23, 24, 64
+    $x25, $x26 = frame-destroy LDPXi $sp, 6 :: (load 8 from %stack.6), (load 8 from %stack.7)
+    frame-destroy SEH_SaveRegP 25, 26, 48
+    $x27, $x28 = frame-destroy LDPXi $sp, 4 :: (load 8 from %stack.8), (load 8 from %stack.9)
+    frame-destroy SEH_SaveRegP 27, 28, 32
+    $d8, $d9 = frame-destroy LDPDi $sp, 2 :: (load 8 from %stack.10), (load 8 from %stack.11)
+    frame-destroy SEH_SaveFRegP 8, 9, 16
+    early-clobber $sp, $d10, $d11 = frame-destroy LDPDpost $sp, 14 :: (load 8 from %stack.12), (load 8 from %stack.13)
+    frame-destroy SEH_SaveFRegP_X 10, 11, -112
+    frame-destroy SEH_EpilogEnd
+    RET_ReallyLR implicit $x0
+
+...
diff --git a/test/CodeGen/AArch64/wineh5.mir b/test/CodeGen/AArch64/wineh5.mir
new file mode 100644
index 00000000000..f1fa6d4d47b
--- /dev/null
+++ b/test/CodeGen/AArch64/wineh5.mir
@@ -0,0 +1,224 @@
+# RUN: llc -o - %s -mtriple=aarch64-windows -start-after=prologepilog \
+# RUN:   -disable-post-ra -filetype=obj | llvm-readobj -unwind | FileCheck %s
+
+# Check that that the large stack allocation is correctly represented in .xdata.
+
+# CHECK:        ExceptionData {
+# CHECK-NEXT:     FunctionLength: 156
+# CHECK-NEXT:     Version: 0
+# CHECK-NEXT:     ExceptionData: No
+# CHECK-NEXT:     EpiloguePacked: No
+# CHECK-NEXT:     EpilogueScopes: 1
+# CHECK-NEXT:     ByteCodeLength: 20
+# CHECK-NEXT:     Prologue [
+# CHECK-NEXT:       0xe002dac9          ; sub sp, #2993296
+# CHECK-NEXT:       0xe3                ; nop
+# CHECK-NEXT:       0xe3                ; nop
+# CHECK-NEXT:       0x42                ; stp x29, x30, [sp, #16]
+# CHECK-NEXT:       0xd53f              ; str x28, [sp, #256]!
+# CHECK-NEXT:       0xe4                ; end
+# CHECK-NEXT:     ]
+# CHECK-NEXT:     EpilogueScopes [
+# CHECK-NEXT:       EpilogueScope {
+# CHECK-NEXT:         StartOffset: 34
+# CHECK-NEXT:         EpilogueStartIndex: 10
+# CHECK-NEXT:         Opcodes [
+# CHECK-NEXT:           0xe002da00          ; add sp, #2990080
+# CHECK-NEXT:           0xc0c9              ; add sp, #3216
+# CHECK-NEXT:           0x42                ; ldp x29, x30, [sp, #16]
+# CHECK-NEXT:           0xd53f              ; ldr x28, [sp], #256
+# CHECK-NEXT:           0xe4                ; end
+# CHECK-NEXT:         ]
+# CHECK-NEXT:       }
+# CHECK-NEXT:     ]
+# CHECK-NEXT:   }
+
+
+--- |
+  target datalayout = "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128"
+  target triple = "aarch64-unknown-windows-msvc19.11.0"
+
+  ; Function Attrs: noinline optnone
+  define dso_local i32 @"?func@@YAHH@Z"(i32 %i) #0 {
+  entry:
+    %retval = alloca i32, align 4
+    %i.addr = alloca i32, align 4
+    %A = alloca [748193 x i32], align 4
+    %a = alloca i32, align 4
+    %B = alloca [123 x i32], align 4
+    store i32 %i, i32* %i.addr, align 4
+    %0 = load i32, i32* %i.addr, align 4
+    %add = add nsw i32 %0, 2
+    store i32 %add, i32* %a, align 4
+    %call = call i32 @"?func2@@YAHXZ"()
+    %1 = load i32, i32* %i.addr, align 4
+    %cmp = icmp sgt i32 %1, 2
+    br i1 %cmp, label %if.then, label %if.else
+
+  if.then:                                          ; preds = %entry
+    %call1 = call i32 @"?func2@@YAHXZ"()
+    store i32 %call1, i32* %retval, align 4
+    br label %return
+
+  if.else:                                          ; preds = %entry
+    %arraydecay = getelementptr inbounds [123 x i32], [123 x i32]* %B, i32 0, i32 0
+    %call2 = call i32 @"?func3@@YAHPEAH@Z"(i32* %arraydecay)
+    store i32 %call2, i32* %retval, align 4
+    br label %return
+
+  return:                                           ; preds = %if.else, %if.then
+    %2 = load i32, i32* %retval, align 4
+    ret i32 %2
+  }
+
+  declare dso_local i32 @"?func2@@YAHXZ"() #1
+
+  declare dso_local i32 @"?func3@@YAHPEAH@Z"(i32*) #1
+
+  ; Function Attrs: nounwind
+  declare void @llvm.stackprotector(i8*, i8**) #2
+
+  attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #2 = { nounwind }
+
+  !llvm.module.flags = !{!0}
+
+  !0 = !{i32 1, !"wchar_size", i32 2}
+
+...
+---
+name:            '?func@@YAHH@Z'
+alignment:       2
+exposesReturnsTwice: false
+legalized:       true
+regBankSelected: true
+selected:        true
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       true
+registers:
+liveins:
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       2993328
+  offsetAdjustment: 0
+  maxAlignment:    16
+  adjustsStack:    true
+  hasCalls:        true
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  2993276
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+  - { id: 0, name: retval, type: default, offset: -36, size: 4, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -4, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 1, name: i.addr, type: default, offset: -40, size: 4, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -8, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 2, name: A, type: default, offset: -2992812, size: 2992772, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -2992780, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 3, name: a, type: default, offset: -2992816, size: 4, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -2992784, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 4, name: B, type: default, offset: -2993308, size: 492, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -2993276, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 5, name: '', type: spill-slot, offset: -2993320, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 6, name: '', type: spill-slot, offset: -2993324, size: 4, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 7, name: '', type: spill-slot, offset: -8, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$fp', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 8, name: '', type: spill-slot, offset: -16, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$lr', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 9, name: '', type: spill-slot, offset: -32, size: 8, alignment: 16,
+      stack-id: 0, callee-saved-register: '$x28', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+constants:
+body:             |
+  bb.1.entry:
+    successors: %bb.2(0x40000000), %bb.3(0x40000000)
+    liveins: $w0, $x28, $fp, $lr
+
+    early-clobber $sp = frame-setup STRXpre killed $x28, $sp, -32 :: (store 8 into %stack.9)
+    frame-setup SEH_SaveReg_X 28, -256
+    frame-setup STPXi killed $fp, killed $lr, $sp, 2 :: (store 8 into %stack.7), (store 8 into %stack.8)
+    frame-setup SEH_SaveFPLR 16
+    $x15 = frame-setup MOVi64imm 187081
+    frame-setup SEH_Nop
+    frame-setup BL &__chkstk, implicit-def $lr, implicit $sp, implicit $x15
+    frame-setup SEH_Nop
+    $sp = frame-setup SUBXrx64 killed $sp, killed $x15, 28
+    frame-setup SEH_StackAlloc 2993296
+    frame-setup SEH_PrologEnd
+    $x8 = ADDXri $sp, 730, 12
+    $x8 = ADDXri $x8, 3208, 0
+    renamable $w9 = MOVi32imm 2
+    STRWui killed renamable $w0, renamable $x8, 0 :: (store 4 into %ir.i.addr)
+    renamable $w0 = LDRWui renamable $x8, 0 :: (load 4 from %ir.i.addr)
+    renamable $w0 = ADDWri killed renamable $w0, 2, 0
+    STRWui killed renamable $w0, $sp, 128 :: (store 4 into %ir.a)
+    STRXui killed $x8, $sp, 1 :: (store 8 into %stack.5)
+    STRWui killed $w9, $sp, 1 :: (store 4 into %stack.6)
+    BL @"?func2@@YAHXZ", csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit-def $w0
+    $x8 = LDRXui $sp, 1 :: (load 8 from %stack.5)
+    renamable $w9 = LDRWui killed renamable $x8, 0 :: (load 4 from %ir.i.addr)
+    $w10 = LDRWui $sp, 1 :: (load 4 from %stack.6)
+    $wzr = SUBSWrr killed renamable $w9, killed renamable $w10, implicit-def $nzcv
+    renamable $w9 = CSINCWr $wzr, $wzr, 13, implicit $nzcv
+    TBNZW killed renamable $w9, 0, %bb.2
+    B %bb.3
+
+  bb.2.if.then:
+    successors: %bb.4(0x80000000)
+
+    BL @"?func2@@YAHXZ", csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit-def $w0
+    $x8 = LDRXui $sp, 1 :: (load 8 from %stack.5)
+    STRWui killed renamable $w0, killed renamable $x8, 1 :: (store 4 into %ir.retval)
+    B %bb.4
+
+  bb.3.if.else:
+    successors: %bb.4(0x80000000)
+
+    $x8 = ADDXri $sp, 20, 0
+    $x0 = COPY killed renamable $x8
+    BL @"?func3@@YAHPEAH@Z", csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit killed $x0, implicit-def $w0
+    $x8 = LDRXui $sp, 1 :: (load 8 from %stack.5)
+    STRWui killed renamable $w0, killed renamable $x8, 1 :: (store 4 into %ir.retval)
+
+  bb.4.return:
+    $x8 = LDRXui $sp, 1 :: (load 8 from %stack.5)
+    renamable $w0 = LDRWui killed renamable $x8, 1 :: (load 4 from %ir.retval)
+    frame-destroy SEH_EpilogStart
+    $sp = frame-destroy ADDXri $sp, 730, 12
+    frame-destroy SEH_StackAlloc 2990080
+    $sp = frame-destroy ADDXri $sp, 3216, 0
+    frame-destroy SEH_StackAlloc 3216
+    $fp, $lr = frame-destroy LDPXi $sp, 2 :: (load 8 from %stack.7), (load 8 from %stack.8)
+    frame-destroy SEH_SaveFPLR 16
+    early-clobber $sp, $x28 = frame-destroy LDRXpost $sp, 32 :: (load 8 from %stack.9)
+    frame-destroy SEH_SaveReg_X 28, -256
+    frame-destroy SEH_EpilogEnd
+    RET_ReallyLR implicit killed $w0
+
+...
diff --git a/test/CodeGen/AArch64/wineh6.mir b/test/CodeGen/AArch64/wineh6.mir
new file mode 100644
index 00000000000..08db6656980
--- /dev/null
+++ b/test/CodeGen/AArch64/wineh6.mir
@@ -0,0 +1,138 @@
+# RUN: llc -o - %s -mtriple=aarch64-windows -start-after=prologepilog \
+# RUN:   -disable-post-ra -filetype=obj | llvm-readobj -unwind | FileCheck %s
+# Check save_fplr_x, set_fp, alloc_s
+
+# CHECK: 	ExceptionData {
+# CHECK-NEXT:      FunctionLength: 92
+# CHECK-NEXT:      Version: 0
+# CHECK-NEXT:      ExceptionData: No
+# CHECK-NEXT:      EpiloguePacked: No
+# CHECK-NEXT:      EpilogueScopes: 1
+# CHECK-NEXT:      ByteCodeLength: 8
+# CHECK-NEXT:      Prologue [
+# CHECK-NEXT:        0x02                ; sub sp, #32
+# CHECK-NEXT:        0xe1                ; mov fp, sp
+# CHECK-NEXT:        0x81                ; stp x29, x30, [sp, #-16]!
+# CHECK-NEXT:        0xe4                ; end
+# CHECK-NEXT:      ]
+# CHECK-NEXT:      EpilogueScopes [
+# CHECK-NEXT:        EpilogueScope {
+# CHECK-NEXT:          StartOffset: 20
+# CHECK-NEXT:          EpilogueStartIndex: 4
+# CHECK-NEXT:          Opcodes [
+# CHECK-NEXT:            0xe1                ; mov fp, sp
+# CHECK-NEXT:            0x81                ; ldp x29, x30, [sp], #16
+# CHECK-NEXT:            0xe4                ; end
+# CHECK-NEXT:          ]
+# CHECK-NEXT:        }
+# CHECK-NEXT:      ]
+# CHECK-NEXT:    }
+...
+---
+name:            '?func@@YAHHHHH@Z'
+alignment:       3
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI: true
+registers:
+liveins:
+  - { reg: '$w0', virtual-reg: '' }
+  - { reg: '$w1', virtual-reg: '' }
+  - { reg: '$w2', virtual-reg: '' }
+  - { reg: '$w3', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       48
+  offsetAdjustment: 0
+  maxAlignment:    8
+  adjustsStack:    true
+  hasCalls:        true
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  24
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+  - { id: 0, name: '', type: default, offset: -20, size: 4, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -4, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 1, name: '', type: default, offset: -24, size: 4, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -8, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 2, name: '', type: default, offset: -28, size: 4, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -12, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 3, name: '', type: default, offset: -32, size: 4, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -16, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 4, name: '', type: default, offset: -40, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -24, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 5, name: '', type: variable-sized, offset: -40,
+      alignment: 1, stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -24, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 6, name: '', type: spill-slot, offset: -48, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 7, name: '', type: spill-slot, offset: -8, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$fp', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 8, name: '', type: spill-slot, offset: -16, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$lr', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+constants:
+body:             |
+  bb.0.entry:
+    liveins: $w0, $w1, $w2, $w3, $lr
+
+    early-clobber $sp = frame-setup STPXpre killed $fp, killed $lr, $sp, -2 :: (store 8 into %stack.7), (store 8 into %stack.8)
+    frame-setup SEH_SaveFPLR_X -16
+    $fp = frame-setup ADDXri $sp, 0, 0
+    frame-setup SEH_SetFP
+    $sp = frame-setup SUBXri $sp, 32, 0
+    frame-setup SEH_StackAlloc 32
+    frame-setup SEH_PrologEnd
+    STURWi killed renamable $w3, $fp, -4
+    STURWi killed renamable $w2, $fp, -8
+    STURWi killed renamable $w1, $fp, -12
+    STURWi killed renamable $w0, $fp, -16
+    renamable $x8 = LDURSWi $fp, -16
+    renamable $x8 = ADDXri killed renamable $x8, 15, 0
+    renamable $x8 = UBFMXri killed renamable $x8, 4, 63
+    $x15 = COPY renamable $x8
+    STURXi killed $x8, $fp, -32 :: (store 8 into %stack.6)
+    BL &__chkstk, csr_aarch64_stackprobe_windows, implicit-def dead $lr, implicit $sp, implicit killed $x15
+    renamable $x8 = COPY $sp
+    $x15 = LDURXi $fp, -32 :: (load 8 from %stack.6)
+    renamable $x8 = SUBSXrs killed renamable $x8, killed renamable $x15, 4, implicit-def dead $nzcv
+    $sp = COPY renamable $x8
+    STURXi killed renamable $x8, $fp, -24
+    renamable $x0 = LDURXi $fp, -24
+    renamable $w1 = COPY $wzr
+    $w0 = COPY killed renamable $w1
+    frame-destroy SEH_EpilogStart
+    $sp = frame-destroy ADDXri $fp, 0, 0
+    frame-destroy SEH_SetFP
+    early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2
+    frame-destroy SEH_SaveFPLR_X -16
+    frame-destroy SEH_EpilogEnd
+    RET_ReallyLR implicit killed $w0
+
+...
diff --git a/test/CodeGen/AArch64/wineh7.mir b/test/CodeGen/AArch64/wineh7.mir
new file mode 100644
index 00000000000..60094539297
--- /dev/null
+++ b/test/CodeGen/AArch64/wineh7.mir
@@ -0,0 +1,134 @@
+# RUN: llc -o - %s -mtriple=aarch64-windows -start-after=prologepilog \
+# RUN:   -filetype=obj -disable-post-ra | llvm-readobj -unwind | FileCheck %s
+# Check AddFP
+
+# CHECK:	 ExceptionData {
+# CHECK-NEXT:      FunctionLength: 72
+# CHECK-NEXT:      Version: 0
+# CHECK-NEXT:      ExceptionData: No
+# CHECK-NEXT:      EpiloguePacked: No
+# CHECK-NEXT:      EpilogueScopes: 1
+# CHECK-NEXT:      ByteCodeLength: 16
+# CHECK-NEXT:      Prologue [
+# CHECK-NEXT:        0xe204              ; add fp, sp, #32
+# CHECK-NEXT:        0x44                ; stp x29, x30, [sp, #32]
+# CHECK-NEXT:        0xc802              ; stp x19, x20, [sp, #16]
+# CHECK-NEXT:        0xcc85              ; stp x21, x22, [sp, #-48]!
+# CHECK-NEXT:        0xe4                ; end
+# CHECK-NEXT:      ]
+# CHECK-NEXT:      EpilogueScopes [
+# CHECK-NEXT:        EpilogueScope {
+# CHECK-NEXT:          StartOffset: 13
+# CHECK-NEXT:          EpilogueStartIndex: 8
+# CHECK-NEXT:          Opcodes [
+# CHECK-NEXT:            0xe204              ; add fp, sp, #32
+# CHECK-NEXT:            0x44                ; ldp x29, x30, [sp, #32]
+# CHECK-NEXT:            0xc802              ; ldp x19, x20, [sp, #16]
+# CHECK-NEXT:            0xcc85              ; ldp x21, x22, [sp], #48
+# CHECK-NEXT:            0xe4                ; end
+# CHECK-NEXT:          ]
+# CHECK-NEXT:        }
+# CHECK-NEXT:      ]
+# CHECK-NEXT:    }
+# CHECK-NEXT:  }
+
+...
+---
+name:            '?func@@YAHHHHH@Z'
+alignment:       3
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI: true
+registers:
+liveins:
+  - { reg: '$w0', virtual-reg: '' }
+  - { reg: '$w1', virtual-reg: '' }
+  - { reg: '$w2', virtual-reg: '' }
+  - { reg: '$w3', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       48
+  offsetAdjustment: 0
+  maxAlignment:    8
+  adjustsStack:    true
+  hasCalls:        true
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+  - { id: 0, name: '', type: variable-sized, offset: -48,
+      alignment: 1, stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: 0, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 1, name: '', type: spill-slot, offset: -8, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$fp', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 2, name: '', type: spill-slot, offset: -16, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$lr', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 3, name: '', type: spill-slot, offset: -24, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x19', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 4, name: '', type: spill-slot, offset: -32, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x20', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 5, name: '', type: spill-slot, offset: -40, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x21', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 6, name: '', type: spill-slot, offset: -48, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x22', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+constants:
+body:             |
+  bb.0.entry:
+    liveins: $w0, $w1, $w2, $w3, $x21, $x22, $x19, $x20, $lr
+
+    early-clobber $sp = frame-setup STPXpre killed $x21, killed $x22, $sp, -6 :: (store 8 into %stack.5), (store 8 into %stack.6)
+    frame-setup SEH_SaveRegP_X 21, 22, -48
+    frame-setup STPXi killed $x19, killed $x20, $sp, 2 :: (store 8 into %stack.3), (store 8 into %stack.4)
+    frame-setup SEH_SaveRegP 19, 20, 16
+    frame-setup STPXi killed $fp, killed $lr, $sp, 4 :: (store 8 into %stack.1), (store 8 into %stack.2)
+    frame-setup SEH_SaveFPLR 32
+    $fp = frame-setup ADDXri $sp, 32, 0
+    frame-setup SEH_AddFP 32
+    frame-setup SEH_PrologEnd
+    renamable $w19 = COPY $w3
+    renamable $w0 = KILL $w0, implicit-def $x0
+    renamable $w20 = COPY $w2
+    renamable $w21 = COPY $w1
+    renamable $x8 = SBFMXri killed renamable $x0, 0, 31
+    renamable $x9 = ADDXri killed renamable $x8, 15, 0
+    renamable $x15 = UBFMXri killed renamable $x9, 4, 63
+    renamable $x8 = COPY $sp
+    renamable $x22 = SUBXrs killed renamable $x8, killed renamable $x15, 4
+    $sp = COPY renamable $x22
+    $x0 = COPY renamable $x22
+    renamable $w8 = LDRWroW killed renamable $x22, killed renamable $w21, 1, 1
+    renamable $w9 = ADDWrr killed renamable $w19, killed renamable $w20
+    renamable $w0 = ADDWrr killed renamable $w9, killed renamable $w8
+    frame-destroy SEH_EpilogStart
+    $sp = frame-destroy SUBXri $fp, 32, 0
+    frame-destroy SEH_AddFP 32
+    $fp, $lr = frame-destroy LDPXi $sp, 4 :: (load 8 from %stack.1), (load 8 from %stack.2)
+    frame-destroy SEH_SaveFPLR 32
+    $x19, $x20 = frame-destroy LDPXi $sp, 2 :: (load 8 from %stack.3), (load 8 from %stack.4)
+    frame-destroy SEH_SaveRegP 19, 20, 16
+    early-clobber $sp, $x21, $x22 = frame-destroy LDPXpost $sp, 6 :: (load 8 from %stack.5), (load 8 from %stack.6)
+    frame-destroy SEH_SaveRegP_X 21, 22, -48
+    frame-destroy SEH_EpilogEnd
+    RET_ReallyLR implicit $w0
+
+...
diff --git a/test/CodeGen/AArch64/wineh_shrinkwrap.mir b/test/CodeGen/AArch64/wineh_shrinkwrap.mir
new file mode 100644
index 00000000000..97204722bc1
--- /dev/null
+++ b/test/CodeGen/AArch64/wineh_shrinkwrap.mir
@@ -0,0 +1,146 @@
+# RUN: llc -O2 -o - %s -mtriple=aarch64-windows -start-before=shrink-wrap \
+# RUN:   -stop-after=prologepilog | FileCheck %s --check-prefix=WIN64
+# RUN: llc -O2 -o - %s -mtriple=aarch64-linux -start-before=shrink-wrap \
+# RUN:   -stop-after=prologepilog | FileCheck %s --check-prefix=LINUX
+
+# This tests checks that shrink wrapping bails out on Windows AMR64 due to the
+# use of Windows CFI.  We don't currently support fragments for WIndows EH on
+# ARM64.
+# The same test gets shrink wrapped on Linux ARM64.
+
+# WIN64-LABEL: bb.0.entry:
+# WIN64: early-clobber $sp = frame-setup STRXpre killed $x28, $sp, -32
+# WIN64-LABEL: bb.1:
+# WIN64-LABEL: bb.2.if.then:
+
+# LINUX-LABEL: bb.0.entry:
+# LINUX-LABEL: bb.1:
+# LINUX-LABEL: bb.2.if.then:
+# LINUX: early-clobber $sp = frame-setup STRXpre killed $x28, $sp, -32
+--- |
+  ; ModuleID = 'shrink.cpp'
+  target datalayout = "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128"
+  target triple = "aarch64-unknown-windows-msvc19.11.0"
+
+  define dso_local i32 @"?func@@YAHHH@Z"(i32 %a, i32 %b) local_unnamed_addr #0 {
+  entry:
+    %A = alloca [1000 x i32], align 4
+    %cmp = icmp sgt i32 %a, 1
+    br i1 %cmp, label %if.then, label %return
+
+  if.then:                                          ; preds = %entry
+    %0 = bitcast [1000 x i32]* %A to i8*
+    call void @llvm.lifetime.start.p0i8(i64 4000, i8* nonnull %0) #3
+    %arraydecay2 = bitcast [1000 x i32]* %A to i32*
+    call void @"?init@@YAXPEAH@Z"(i32* nonnull %arraydecay2)
+    %arrayidx = getelementptr inbounds [1000 x i32], [1000 x i32]* %A, i64 0, i64 100
+    %1 = load i32, i32* %arrayidx, align 4, !tbaa !2
+    %add = add i32 %b, 1
+    %add1 = add i32 %add, %1
+    call void @llvm.lifetime.end.p0i8(i64 4000, i8* nonnull %0) #3
+    br label %return
+
+  return:                                           ; preds = %entry, %if.then
+    %retval.0 = phi i32 [ %add1, %if.then ], [ 0, %entry ]
+    ret i32 %retval.0
+  }
+
+  ; Function Attrs: argmemonly nounwind
+  declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+
+  declare dso_local void @"?init@@YAXPEAH@Z"(i32*) local_unnamed_addr #2
+
+  ; Function Attrs: argmemonly nounwind
+  declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
+
+  ; Function Attrs: nounwind
+  declare void @llvm.stackprotector(i8*, i8**) #3
+
+  attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #1 = { argmemonly nounwind }
+  attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #3 = { nounwind }
+
+  !llvm.module.flags = !{!0}
+  !llvm.ident = !{!1}
+
+  !0 = !{i32 1, !"wchar_size", i32 2}
+  !1 = !{!"clang version 8.0.0"}
+  !2 = !{!3, !3, i64 0}
+  !3 = !{!"int", !4, i64 0}
+  !4 = !{!"omnipotent char", !5, i64 0}
+  !5 = !{!"Simple C++ TBAA"}
+
+...
+---
+name:            '?func@@YAHHH@Z'
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+registers:
+liveins:
+  - { reg: '$w0', virtual-reg: '' }
+  - { reg: '$w1', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    4
+  adjustsStack:    true
+  hasCalls:        true
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  4000
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+  - { id: 0, name: A, type: default, offset: 0, size: 4000, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -4000, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+constants:
+body:             |
+  bb.0.entry:
+    successors: %bb.2(0x40000000), %bb.1(0x40000000)
+    liveins: $w0, $w1
+
+    dead $wzr = SUBSWri killed renamable $w0, 2, 0, implicit-def $nzcv
+    Bcc 10, %bb.2, implicit killed $nzcv
+
+  bb.1:
+    successors: %bb.3(0x80000000)
+
+    renamable $w0 = COPY $wzr
+    B %bb.3
+
+  bb.2.if.then:
+    successors: %bb.3(0x80000000)
+    liveins: $w1
+
+    renamable $w19 = COPY $w1
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    $x0 = ADDXri %stack.0.A, 0, 0
+    BL @"?init@@YAXPEAH@Z", csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    renamable $w8 = LDRWui %stack.0.A, 100 :: (dereferenceable load 4 from %ir.arrayidx, !tbaa !2)
+    renamable $w8 = ADDWrr killed renamable $w19, killed renamable $w8
+    renamable $w0 = ADDWri killed renamable $w8, 1, 0
+
+  bb.3.return:
+    liveins: $w0
+
+    RET_ReallyLR implicit $w0
+
+...
-- 
GitLab


From 643e94644ecbf40b2d03b01488487e313cf23367 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sat, 27 Oct 2018 07:10:48 +0000
Subject: [PATCH 0667/1116] Revert rL345395: [X86][SSE] Move 2-input limit up
 from getFauxShuffleMask to resolveTargetShuffleInputs Makes no difference to
 actual shuffle decoding yet, but merges all the existing limits in one place
 for when proper support is fixed. ........ Its been reported that this is
 causing out of trunk failures.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345451 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 2ebaec778e3..6d589eef5a9 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -6325,6 +6325,9 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
     if (!resolveTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG) ||
         !resolveTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG))
       return false;
+    // TODO - Add support for more than 2 inputs.
+    if ((SrcInputs0.size() + SrcInputs1.size()) > 2)
+      return false;
     int MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
     SmallVector<int, 64> Mask0, Mask1;
     scaleShuffleMask<int>(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
@@ -6384,7 +6387,8 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
         Mask[i + InsertIdx] = (NumElts * (1 + InputIdx)) + ExtractIdx + M;
       }
     }
-    return true;
+    // TODO - Add support for more than 1 subinput.
+    return Ops.size() <= 2;
   }
   case ISD::SCALAR_TO_VECTOR: {
     // Match against a scalar_to_vector of an extract from a vector,
@@ -6577,7 +6581,7 @@ static bool resolveTargetShuffleInputs(SDValue Op,
       return false;
 
   resolveTargetShuffleInputsAndMask(Inputs, Mask);
-  return Inputs.size() <= 2;
+  return true;
 }
 
 /// Returns the scalar element that will make up the ith
-- 
GitLab


From 57aa4fc38b0878c6328ab565a128bb79805e29bd Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sat, 27 Oct 2018 12:15:58 +0000
Subject: [PATCH 0668/1116] [TargetLowering] Move LegalizeDAG FP_TO_UINT
 handling to TargetLowering::expandFP_TO_UINT. NFCI.

First step towards fixing PR17686 and adding vector support.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345452 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/TargetLowering.h       |  6 ++++
 lib/CodeGen/SelectionDAG/LegalizeDAG.cpp    | 25 ++---------------
 lib/CodeGen/SelectionDAG/TargetLowering.cpp | 31 +++++++++++++++++++++
 3 files changed, 40 insertions(+), 22 deletions(-)

diff --git a/include/llvm/CodeGen/TargetLowering.h b/include/llvm/CodeGen/TargetLowering.h
index 4dfc72ea52a..2475a0f3686 100644
--- a/include/llvm/CodeGen/TargetLowering.h
+++ b/include/llvm/CodeGen/TargetLowering.h
@@ -3663,6 +3663,12 @@ public:
   /// \returns True, if the expansion was successful, false otherwise
   bool expandFP_TO_SINT(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;
 
+  /// Expand float to UINT conversion
+  /// \param N Node to expand
+  /// \param Result output after conversion
+  /// \returns True, if the expansion was successful, false otherwise
+  bool expandFP_TO_UINT(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;
+
   /// Expand UINT(i64) to double(f64) conversion
   /// \param N Node to expand
   /// \param Result output after conversion
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index e506f7b76b1..dcc47454399 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2877,29 +2877,10 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     if (TLI.expandFP_TO_SINT(Node, Tmp1, DAG))
       Results.push_back(Tmp1);
     break;
-  case ISD::FP_TO_UINT: {
-    SDValue True, False;
-    EVT VT =  Node->getOperand(0).getValueType();
-    EVT NVT = Node->getValueType(0);
-    APFloat apf(DAG.EVTToAPFloatSemantics(VT),
-                APInt::getNullValue(VT.getSizeInBits()));
-    APInt x = APInt::getSignMask(NVT.getSizeInBits());
-    (void)apf.convertFromAPInt(x, false, APFloat::rmNearestTiesToEven);
-    Tmp1 = DAG.getConstantFP(apf, dl, VT);
-    Tmp2 = DAG.getSetCC(dl, getSetCCResultType(VT),
-                        Node->getOperand(0),
-                        Tmp1, ISD::SETLT);
-    True = DAG.getNode(ISD::FP_TO_SINT, dl, NVT, Node->getOperand(0));
-    // TODO: Should any fast-math-flags be set for the FSUB?
-    False = DAG.getNode(ISD::FP_TO_SINT, dl, NVT,
-                        DAG.getNode(ISD::FSUB, dl, VT,
-                                    Node->getOperand(0), Tmp1));
-    False = DAG.getNode(ISD::XOR, dl, NVT, False,
-                        DAG.getConstant(x, dl, NVT));
-    Tmp1 = DAG.getSelect(dl, NVT, Tmp2, True, False);
-    Results.push_back(Tmp1);
+  case ISD::FP_TO_UINT:
+    if (TLI.expandFP_TO_UINT(Node, Tmp1, DAG))
+      Results.push_back(Tmp1);
     break;
-  }
   case ISD::VAARG:
     Results.push_back(DAG.expandVAArg(Node));
     Results.push_back(Results[0].getValue(1));
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 1a29cb7ebf7..d6e7590b8fc 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -4137,6 +4137,37 @@ bool TargetLowering::expandFP_TO_SINT(SDNode *Node, SDValue &Result,
   return true;
 }
 
+bool TargetLowering::expandFP_TO_UINT(SDNode *Node, SDValue &Result,
+                                      SelectionDAG &DAG) const {
+  SDLoc dl(SDValue(Node, 0));
+  SDValue Src = Node->getOperand(0);
+
+  EVT SrcVT = Src.getValueType();
+  EVT DstVT = Node->getValueType(0);
+  EVT SetCCVT =
+      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT);
+
+  // Expand based on maximum range of FP_TO_SINT:
+  // True = fp_to_sint(Src)
+  // False = 0x8000000000000000 + fp_to_sint(Src - 0x8000000000000000)
+  // Result = select (Src < 0x8000000000000000), True, False
+  APFloat apf(DAG.EVTToAPFloatSemantics(SrcVT),
+              APInt::getNullValue(SrcVT.getScalarSizeInBits()));
+  APInt x = APInt::getSignMask(DstVT.getScalarSizeInBits());
+  (void)apf.convertFromAPInt(x, false, APFloat::rmNearestTiesToEven);
+
+  SDValue Tmp1 = DAG.getConstantFP(apf, dl, SrcVT);
+  SDValue Tmp2 = DAG.getSetCC(dl, SetCCVT, Src, Tmp1, ISD::SETLT);
+  SDValue True = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, Src);
+  // TODO: Should any fast-math-flags be set for the FSUB?
+  SDValue False = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT,
+                              DAG.getNode(ISD::FSUB, dl, SrcVT, Src, Tmp1));
+  False =
+      DAG.getNode(ISD::XOR, dl, DstVT, False, DAG.getConstant(x, dl, DstVT));
+  Result = DAG.getSelect(dl, DstVT, Tmp2, True, False);
+  return true;
+}
+
 bool TargetLowering::expandUINT_TO_FP(SDNode *Node, SDValue &Result,
                                       SelectionDAG &DAG) const {
   SDValue Src = Node->getOperand(0);
-- 
GitLab


From 5bdc0dc5915a6cbbaf23bda2959169afcc9951ef Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sat, 27 Oct 2018 15:00:38 +0000
Subject: [PATCH 0669/1116] Regenerate FP_TO_INT tests.

Precursor to fix for PR17686

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345453 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/Mips/2008-07-07-Float2Int.ll |   26 +-
 test/CodeGen/Mips/msa/f16-llvm-ir.ll      | 3471 +++++++++++++++++----
 test/CodeGen/SystemZ/fp-conv-10.ll        |   57 +-
 test/CodeGen/SystemZ/fp-conv-12.ll        |   57 +-
 4 files changed, 3023 insertions(+), 588 deletions(-)

diff --git a/test/CodeGen/Mips/2008-07-07-Float2Int.ll b/test/CodeGen/Mips/2008-07-07-Float2Int.ll
index 4c552361d9d..1b2ac19cba0 100644
--- a/test/CodeGen/Mips/2008-07-07-Float2Int.ll
+++ b/test/CodeGen/Mips/2008-07-07-Float2Int.ll
@@ -1,17 +1,33 @@
-; RUN: llc -march=mips < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=mips-- | FileCheck %s
 
 define i32 @fptoint(float %a) nounwind {
+; CHECK-LABEL: fptoint:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    trunc.w.s $f0, $f12
+; CHECK-NEXT:    jr $ra
+; CHECK-NEXT:    mfc1 $2, $f0
 entry:
-; CHECK: trunc.w.s 
   fptosi float %a to i32		; <i32>:0 [#uses=1]
   ret i32 %0
 }
 
 define i32 @fptouint(float %a) nounwind {
+; CHECK-LABEL: fptouint:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lui $1, %hi($CPI1_0)
+; CHECK-NEXT:    lwc1 $f0, %lo($CPI1_0)($1)
+; CHECK-NEXT:    sub.s $f1, $f12, $f0
+; CHECK-NEXT:    trunc.w.s $f1, $f1
+; CHECK-NEXT:    mfc1 $1, $f1
+; CHECK-NEXT:    lui $2, 32768
+; CHECK-NEXT:    xor $2, $1, $2
+; CHECK-NEXT:    trunc.w.s $f1, $f12
+; CHECK-NEXT:    mfc1 $1, $f1
+; CHECK-NEXT:    c.olt.s $f12, $f0
+; CHECK-NEXT:    jr $ra
+; CHECK-NEXT:    movt $2, $1, $fcc0
 entry:
-; CHECK: fptouint
-; CHECK: trunc.w.s 
-; CHECK: trunc.w.s 
   fptoui float %a to i32		; <i32>:0 [#uses=1]
   ret i32 %0
 }
diff --git a/test/CodeGen/Mips/msa/f16-llvm-ir.ll b/test/CodeGen/Mips/msa/f16-llvm-ir.ll
index b3ed8bdd3b9..4618c96d879 100644
--- a/test/CodeGen/Mips/msa/f16-llvm-ir.ll
+++ b/test/CodeGen/Mips/msa/f16-llvm-ir.ll
@@ -1,20 +1,21 @@
-; RUN: llc -relocation-model=pic -march=mipsel -mcpu=mips32r5 \
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -relocation-model=pic -mtriple=mipsel-- -mcpu=mips32r5 \
 ; RUN:     -mattr=+fp64,+msa -verify-machineinstrs < %s | FileCheck %s \
 ; RUN:     --check-prefixes=ALL,MIPS32,MIPSR5,MIPS32-O32,MIPS32R5-O32
-; RUN: llc -relocation-model=pic -march=mips64el -mcpu=mips64r5 \
+; RUN: llc -relocation-model=pic -mtriple=mips64el-- -mcpu=mips64r5 \
 ; RUN:     -mattr=+fp64,+msa -verify-machineinstrs -target-abi n32 < %s | FileCheck %s \
 ; RUN:     --check-prefixes=ALL,MIPS64,MIPSR5,MIPS64-N32,MIPS64R5-N32
-; RUN: llc -relocation-model=pic -march=mips64el -mcpu=mips64r5 \
+; RUN: llc -relocation-model=pic -mtriple=mips64el-- -mcpu=mips64r5 \
 ; RUN:     -mattr=+fp64,+msa -verify-machineinstrs -target-abi n64 < %s | FileCheck %s \
 ; RUN:     --check-prefixes=ALL,MIPS64,MIPSR5,MIPS64-N64,MIPS64R5-N64
 
-; RUN: llc -relocation-model=pic -march=mipsel -mcpu=mips32r6 \
+; RUN: llc -relocation-model=pic -mtriple=mipsel-- -mcpu=mips32r6 \
 ; RUN:     -mattr=+fp64,+msa -verify-machineinstrs < %s | FileCheck %s \
 ; RUN:     --check-prefixes=ALL,MIPS32,MIPSR6,MIPSR6-O32
-; RUN: llc -relocation-model=pic -march=mips64el -mcpu=mips64r6 \
+; RUN: llc -relocation-model=pic -mtriple=mips64el-- -mcpu=mips64r6 \
 ; RUN:     -mattr=+fp64,+msa -verify-machineinstrs -target-abi n32 < %s | FileCheck %s \
 ; RUN:     --check-prefixes=ALL,MIPS64,MIPSR6,MIPS64-N32,MIPSR6-N32
-; RUN: llc -relocation-model=pic -march=mips64el -mcpu=mips64r6 \
+; RUN: llc -relocation-model=pic -mtriple=mips64el-- -mcpu=mips64r6 \
 ; RUN:     -mattr=+fp64,+msa -verify-machineinstrs -target-abi n64 < %s | FileCheck %s \
 ; RUN:     --check-prefixes=ALL,MIPS64,MIPSR6,MIPS64-N64,MIPSR6-N64
 
@@ -26,16 +27,73 @@
 declare float @k2(half *)
 
 define void @f3(i16 %b) {
+; MIPS32-LABEL: f3:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addiu $sp, $sp, -32
+; MIPS32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS32-NEXT:    sw $ra, 28($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 24($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    .cfi_offset 16, -8
+; MIPS32-NEXT:    addu $16, $2, $25
+; MIPS32-NEXT:    sh $4, 22($sp)
+; MIPS32-NEXT:    addiu $4, $sp, 22
+; MIPS32-NEXT:    lw $25, %call16(k2)($16)
+; MIPS32-NEXT:    jalr $25
+; MIPS32-NEXT:    move $gp, $16
+; MIPS32-NEXT:    lw $1, %got(k)($16)
+; MIPS32-NEXT:    swc1 $f0, 0($1)
+; MIPS32-NEXT:    lw $16, 24($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 28($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 32
+;
+; MIPS64-N32-LABEL: f3:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    addiu $sp, $sp, -32
+; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N32-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    .cfi_offset 31, -8
+; MIPS64-N32-NEXT:    .cfi_offset 28, -16
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(f3)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $gp, $1, %lo(%neg(%gp_rel(f3)))
+; MIPS64-N32-NEXT:    sh $4, 14($sp)
+; MIPS64-N32-NEXT:    lw $25, %call16(k2)($gp)
+; MIPS64-N32-NEXT:    jalr $25
+; MIPS64-N32-NEXT:    addiu $4, $sp, 14
+; MIPS64-N32-NEXT:    lw $1, %got_disp(k)($gp)
+; MIPS64-N32-NEXT:    swc1 $f0, 0($1)
+; MIPS64-N32-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    addiu $sp, $sp, 32
+;
+; MIPS64-N64-LABEL: f3:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N64-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    .cfi_offset 31, -8
+; MIPS64-N64-NEXT:    .cfi_offset 28, -16
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(f3)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(f3)))
+; MIPS64-N64-NEXT:    sh $4, 14($sp)
+; MIPS64-N64-NEXT:    ld $25, %call16(k2)($gp)
+; MIPS64-N64-NEXT:    jalr $25
+; MIPS64-N64-NEXT:    daddiu $4, $sp, 14
+; MIPS64-N64-NEXT:    ld $1, %got_disp(k)($gp)
+; MIPS64-N64-NEXT:    swc1 $f0, 0($1)
+; MIPS64-N64-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, 32
 entry:
-; ALL-LABEL: f3:
-
-; ALL: sh $4, [[O0:[0-9]+]]($sp)
-; ALL-DAG: jalr $25
-; MIPS32-DAG: addiu $4, $sp, [[O0]]
-; MIPS64-N32: addiu $4, $sp, [[O0]]
-; MIPS64-N64: daddiu $4, $sp, [[O0]]
-; ALL: swc1 $f0
-
   %0 = alloca half
   %1 = bitcast i16 %b to half
   store half %1, half * %0
@@ -45,16 +103,59 @@ entry:
 }
 
 define void  @f(i16 %b) {
-; ALL-LABEL: f:
-
-; ALL: sh $4, [[O0:[0-9]+]]($sp)
-; ALL: lh $[[R0:[0-9]+]], [[O0]]($sp)
-; ALL: fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL: fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL: copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL: mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; ALL: swc1 $f[[F0]]
-
+; MIPS32-LABEL: f:
+; MIPS32:       # %bb.0:
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addiu $sp, $sp, -8
+; MIPS32-NEXT:    .cfi_def_cfa_offset 8
+; MIPS32-NEXT:    addu $1, $2, $25
+; MIPS32-NEXT:    sh $4, 4($sp)
+; MIPS32-NEXT:    lh $2, 4($sp)
+; MIPS32-NEXT:    fill.h $w0, $2
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS32-NEXT:    mtc1 $2, $f0
+; MIPS32-NEXT:    lw $1, %got(k)($1)
+; MIPS32-NEXT:    swc1 $f0, 0($1)
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 8
+;
+; MIPS64-N32-LABEL: f:
+; MIPS64-N32:       # %bb.0:
+; MIPS64-N32-NEXT:    addiu $sp, $sp, -16
+; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 16
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(f)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(f)))
+; MIPS64-N32-NEXT:    sh $4, 12($sp)
+; MIPS64-N32-NEXT:    lh $2, 12($sp)
+; MIPS64-N32-NEXT:    fill.h $w0, $2
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64-N32-NEXT:    mtc1 $2, $f0
+; MIPS64-N32-NEXT:    lw $1, %got_disp(k)($1)
+; MIPS64-N32-NEXT:    swc1 $f0, 0($1)
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    addiu $sp, $sp, 16
+;
+; MIPS64-N64-LABEL: f:
+; MIPS64-N64:       # %bb.0:
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, -16
+; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 16
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(f)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(f)))
+; MIPS64-N64-NEXT:    sh $4, 12($sp)
+; MIPS64-N64-NEXT:    lh $2, 12($sp)
+; MIPS64-N64-NEXT:    fill.h $w0, $2
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64-N64-NEXT:    mtc1 $2, $f0
+; MIPS64-N64-NEXT:    ld $1, %got_disp(k)($1)
+; MIPS64-N64-NEXT:    swc1 $f0, 0($1)
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, 16
   %1 = bitcast i16 %b to half
   %2 = fpext half %1 to float
   store float %2, float * @k
@@ -72,180 +173,488 @@ define void  @f(i16 %b) {
 ; MIPSR5. Additionally, fp64 mode / FR=1 is required to use MSA.
 
 define void @fadd_f64() {
+; MIPS32-LABEL: fadd_f64:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addu $1, $2, $25
+; MIPS32-NEXT:    lw $1, %got(h)($1)
+; MIPS32-NEXT:    lh $2, 0($1)
+; MIPS32-NEXT:    fill.h $w0, $2
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    fexupr.d $w0, $w0
+; MIPS32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS32-NEXT:    mtc1 $2, $f1
+; MIPS32-NEXT:    copy_s.w $2, $w0[1]
+; MIPS32-NEXT:    mthc1 $2, $f1
+; MIPS32-NEXT:    add.d $f0, $f1, $f1
+; MIPS32-NEXT:    mfc1 $2, $f0
+; MIPS32-NEXT:    fill.w $w1, $2
+; MIPS32-NEXT:    mfhc1 $2, $f0
+; MIPS32-NEXT:    insert.w $w1[1], $2
+; MIPS32-NEXT:    insert.w $w1[3], $2
+; MIPS32-NEXT:    fexdo.w $w0, $w1, $w1
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    sh $2, 0($1)
+;
+; MIPS64-N32-LABEL: fadd_f64:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fadd_f64)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(fadd_f64)))
+; MIPS64-N32-NEXT:    lw $1, %got_disp(h)($1)
+; MIPS64-N32-NEXT:    lh $2, 0($1)
+; MIPS64-N32-NEXT:    fill.h $w0, $2
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    fexupr.d $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.d $2, $w0[0]
+; MIPS64-N32-NEXT:    dmtc1 $2, $f0
+; MIPS64-N32-NEXT:    add.d $f0, $f0, $f0
+; MIPS64-N32-NEXT:    dmfc1 $2, $f0
+; MIPS64-N32-NEXT:    fill.d $w0, $2
+; MIPS64-N32-NEXT:    fexdo.w $w0, $w0, $w0
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    sh $2, 0($1)
+;
+; MIPS64-N64-LABEL: fadd_f64:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fadd_f64)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(fadd_f64)))
+; MIPS64-N64-NEXT:    ld $1, %got_disp(h)($1)
+; MIPS64-N64-NEXT:    lh $2, 0($1)
+; MIPS64-N64-NEXT:    fill.h $w0, $2
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    fexupr.d $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.d $2, $w0[0]
+; MIPS64-N64-NEXT:    dmtc1 $2, $f0
+; MIPS64-N64-NEXT:    add.d $f0, $f0, $f0
+; MIPS64-N64-NEXT:    dmfc1 $2, $f0
+; MIPS64-N64-NEXT:    fill.d $w0, $2
+; MIPS64-N64-NEXT:    fexdo.w $w0, $w0, $w0
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    sh $2, 0($1)
 entry:
-; ALL-LABEL: fadd_f64:
   %0 = load half, half * @h, align 2
   %1 = fpext half %0 to double
-; ALL:    lh $[[R0:[0-9]+]]
-; ALL:    fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:    fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:    fexupr.d $w[[W2:[0-9]+]], $w[[W1]]
-; MIPS32: copy_s.w $[[R1:[0-9]+]], $w[[W2]][0]
-; MIPS32: mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; MIPS32: copy_s.w $[[R2:[0-9]+]], $w[[W2]][1]
-; MIPS32: mthc1 $[[R2]], $f[[F0]]
-; MIPS64: copy_s.d $[[R2:[0-9]+]], $w[[W2]][0]
-; MIPS64: dmtc1 $[[R2]], $f[[F0:[0-9]+]]
-
   %2 = load half, half * @h, align 2
   %3 = fpext half %2 to double
   %add = fadd double %1, %3
-
-; ALL: add.d $f[[F1:[0-9]+]], $f[[F0]], $f[[F0]]
-
   %4 = fptrunc double %add to half
-
-; MIPS32: mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; MIPS32: fill.w $w[[W2:[0-9]+]], $[[R2]]
-; MIPS32: mfhc1 $[[R3:[0-9]+]], $f[[F1]]
-; MIPS32: insert.w $w[[W2]][1], $[[R3]]
-; MIPS32: insert.w $w[[W2]][3], $[[R3]]
-
-; MIPS64: dmfc1 $[[R2:[0-9]+]], $f[[F1]]
-; MIPS64: fill.d $w[[W2:[0-9]+]], $[[R2]]
-
-; ALL:    fexdo.w $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:    fexdo.h $w[[W4:[0-9]+]], $w[[W3]], $w[[W3]]
-; ALL:    copy_u.h $[[R3:[0-9]+]], $w[[W4]][0]
-; ALL:    sh $[[R3]]
    store half %4, half * @h, align 2
   ret void
 }
 
 define i32 @ffptoui() {
+; MIPS32-O32-LABEL: ffptoui:
+; MIPS32-O32:       # %bb.0: # %entry
+; MIPS32-O32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-O32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-O32-NEXT:    addu $1, $2, $25
+; MIPS32-O32-NEXT:    lw $2, %got(h)($1)
+; MIPS32-O32-NEXT:    lw $3, %got($CPI3_0)($1)
+; MIPS32-O32-NEXT:    lwc1 $f0, %lo($CPI3_0)($3)
+; MIPS32-O32-NEXT:    lh $2, 0($2)
+; MIPS32-O32-NEXT:    fill.h $w1, $2
+; MIPS32-O32-NEXT:    fexupr.w $w1, $w1
+; MIPS32-O32-NEXT:    copy_s.w $2, $w1[0]
+; MIPS32-O32-NEXT:    mtc1 $2, $f2
+; MIPS32-O32-NEXT:    sub.s $f0, $f2, $f0
+; MIPS32-O32-NEXT:    mfc1 $2, $f0
+; MIPS32-O32-NEXT:    fill.w $w0, $2
+; MIPS32-O32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-O32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-O32-NEXT:    fexupr.d $w0, $w0
+; MIPS32-O32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS32-O32-NEXT:    mtc1 $2, $f3
+; MIPS32-O32-NEXT:    copy_s.w $2, $w0[1]
+; MIPS32-O32-NEXT:    mthc1 $2, $f3
+; MIPS32-O32-NEXT:    trunc.w.d $f0, $f3
+; MIPS32-O32-NEXT:    mfc1 $2, $f0
+; MIPS32-O32-NEXT:    fexupr.d $w0, $w1
+; MIPS32-O32-NEXT:    copy_s.w $3, $w0[0]
+; MIPS32-O32-NEXT:    mtc1 $3, $f1
+; MIPS32-O32-NEXT:    copy_s.w $3, $w0[1]
+; MIPS32-O32-NEXT:    mthc1 $3, $f1
+; MIPS32-O32-NEXT:    trunc.w.d $f0, $f1
+; MIPS32-O32-NEXT:    mfc1 $3, $f0
+; MIPS32-O32-NEXT:    lw $1, %got($CPI3_1)($1)
+; MIPS32-O32-NEXT:    addiu $1, $1, %lo($CPI3_1)
+; MIPS32-O32-NEXT:    lui $4, 32768
+; MIPS32-O32-NEXT:    xor $2, $2, $4
+; MIPS32-O32-NEXT:    lh $1, 0($1)
+; MIPS32-O32-NEXT:    fill.h $w0, $1
+; MIPS32-O32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-O32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS32-O32-NEXT:    mtc1 $1, $f0
+; MIPS32-O32-NEXT:    c.olt.s $f2, $f0
+; MIPS32-O32-NEXT:    jr $ra
+; MIPS32-O32-NEXT:    movt $2, $3, $fcc0
+;
+; MIPS64R5-N32-LABEL: ffptoui:
+; MIPS64R5-N32:       # %bb.0: # %entry
+; MIPS64R5-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(ffptoui)))
+; MIPS64R5-N32-NEXT:    addu $1, $1, $25
+; MIPS64R5-N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(ffptoui)))
+; MIPS64R5-N32-NEXT:    lw $2, %got_disp(h)($1)
+; MIPS64R5-N32-NEXT:    lw $3, %got_page(.LCPI3_0)($1)
+; MIPS64R5-N32-NEXT:    lwc1 $f0, %got_ofst(.LCPI3_0)($3)
+; MIPS64R5-N32-NEXT:    lh $2, 0($2)
+; MIPS64R5-N32-NEXT:    fill.h $w1, $2
+; MIPS64R5-N32-NEXT:    fexupr.w $w1, $w1
+; MIPS64R5-N32-NEXT:    copy_s.w $2, $w1[0]
+; MIPS64R5-N32-NEXT:    mtc1 $2, $f2
+; MIPS64R5-N32-NEXT:    sub.s $f0, $f2, $f0
+; MIPS64R5-N32-NEXT:    mfc1 $2, $f0
+; MIPS64R5-N32-NEXT:    fill.w $w0, $2
+; MIPS64R5-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64R5-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64R5-N32-NEXT:    fexupr.d $w0, $w0
+; MIPS64R5-N32-NEXT:    copy_s.d $2, $w0[0]
+; MIPS64R5-N32-NEXT:    dmtc1 $2, $f0
+; MIPS64R5-N32-NEXT:    trunc.w.d $f0, $f0
+; MIPS64R5-N32-NEXT:    mfc1 $2, $f0
+; MIPS64R5-N32-NEXT:    fexupr.d $w0, $w1
+; MIPS64R5-N32-NEXT:    copy_s.d $3, $w0[0]
+; MIPS64R5-N32-NEXT:    dmtc1 $3, $f0
+; MIPS64R5-N32-NEXT:    trunc.w.d $f0, $f0
+; MIPS64R5-N32-NEXT:    mfc1 $3, $f0
+; MIPS64R5-N32-NEXT:    lw $1, %got_page(.LCPI3_1)($1)
+; MIPS64R5-N32-NEXT:    addiu $1, $1, %got_ofst(.LCPI3_1)
+; MIPS64R5-N32-NEXT:    lui $4, 32768
+; MIPS64R5-N32-NEXT:    xor $2, $2, $4
+; MIPS64R5-N32-NEXT:    lh $1, 0($1)
+; MIPS64R5-N32-NEXT:    fill.h $w0, $1
+; MIPS64R5-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64R5-N32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64R5-N32-NEXT:    mtc1 $1, $f0
+; MIPS64R5-N32-NEXT:    c.olt.s $f2, $f0
+; MIPS64R5-N32-NEXT:    jr $ra
+; MIPS64R5-N32-NEXT:    movt $2, $3, $fcc0
+;
+; MIPS64R5-N64-LABEL: ffptoui:
+; MIPS64R5-N64:       # %bb.0: # %entry
+; MIPS64R5-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(ffptoui)))
+; MIPS64R5-N64-NEXT:    daddu $1, $1, $25
+; MIPS64R5-N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(ffptoui)))
+; MIPS64R5-N64-NEXT:    ld $2, %got_disp(h)($1)
+; MIPS64R5-N64-NEXT:    ld $3, %got_page(.LCPI3_0)($1)
+; MIPS64R5-N64-NEXT:    lwc1 $f0, %got_ofst(.LCPI3_0)($3)
+; MIPS64R5-N64-NEXT:    lh $2, 0($2)
+; MIPS64R5-N64-NEXT:    fill.h $w1, $2
+; MIPS64R5-N64-NEXT:    fexupr.w $w1, $w1
+; MIPS64R5-N64-NEXT:    copy_s.w $2, $w1[0]
+; MIPS64R5-N64-NEXT:    mtc1 $2, $f2
+; MIPS64R5-N64-NEXT:    sub.s $f0, $f2, $f0
+; MIPS64R5-N64-NEXT:    mfc1 $2, $f0
+; MIPS64R5-N64-NEXT:    fill.w $w0, $2
+; MIPS64R5-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64R5-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64R5-N64-NEXT:    fexupr.d $w0, $w0
+; MIPS64R5-N64-NEXT:    copy_s.d $2, $w0[0]
+; MIPS64R5-N64-NEXT:    dmtc1 $2, $f0
+; MIPS64R5-N64-NEXT:    trunc.w.d $f0, $f0
+; MIPS64R5-N64-NEXT:    mfc1 $2, $f0
+; MIPS64R5-N64-NEXT:    fexupr.d $w0, $w1
+; MIPS64R5-N64-NEXT:    copy_s.d $3, $w0[0]
+; MIPS64R5-N64-NEXT:    dmtc1 $3, $f0
+; MIPS64R5-N64-NEXT:    trunc.w.d $f0, $f0
+; MIPS64R5-N64-NEXT:    mfc1 $3, $f0
+; MIPS64R5-N64-NEXT:    ld $1, %got_page(.LCPI3_1)($1)
+; MIPS64R5-N64-NEXT:    daddiu $1, $1, %got_ofst(.LCPI3_1)
+; MIPS64R5-N64-NEXT:    lui $4, 32768
+; MIPS64R5-N64-NEXT:    xor $2, $2, $4
+; MIPS64R5-N64-NEXT:    lh $1, 0($1)
+; MIPS64R5-N64-NEXT:    fill.h $w0, $1
+; MIPS64R5-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64R5-N64-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64R5-N64-NEXT:    mtc1 $1, $f0
+; MIPS64R5-N64-NEXT:    c.olt.s $f2, $f0
+; MIPS64R5-N64-NEXT:    jr $ra
+; MIPS64R5-N64-NEXT:    movt $2, $3, $fcc0
+;
+; MIPSR6-O32-LABEL: ffptoui:
+; MIPSR6-O32:       # %bb.0: # %entry
+; MIPSR6-O32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPSR6-O32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPSR6-O32-NEXT:    addu $1, $2, $25
+; MIPSR6-O32-NEXT:    lw $2, %got(h)($1)
+; MIPSR6-O32-NEXT:    lw $1, %got($CPI3_0)($1)
+; MIPSR6-O32-NEXT:    lwc1 $f0, %lo($CPI3_0)($1)
+; MIPSR6-O32-NEXT:    lh $1, 0($2)
+; MIPSR6-O32-NEXT:    fill.h $w1, $1
+; MIPSR6-O32-NEXT:    fexupr.w $w1, $w1
+; MIPSR6-O32-NEXT:    copy_s.w $1, $w1[0]
+; MIPSR6-O32-NEXT:    mtc1 $1, $f2
+; MIPSR6-O32-NEXT:    cmp.lt.s $f3, $f2, $f0
+; MIPSR6-O32-NEXT:    sub.s $f0, $f2, $f0
+; MIPSR6-O32-NEXT:    mfc1 $1, $f0
+; MIPSR6-O32-NEXT:    fill.w $w0, $1
+; MIPSR6-O32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPSR6-O32-NEXT:    fexupr.w $w0, $w0
+; MIPSR6-O32-NEXT:    fexupr.d $w0, $w0
+; MIPSR6-O32-NEXT:    copy_s.w $1, $w0[0]
+; MIPSR6-O32-NEXT:    mtc1 $1, $f2
+; MIPSR6-O32-NEXT:    copy_s.w $1, $w0[1]
+; MIPSR6-O32-NEXT:    mthc1 $1, $f2
+; MIPSR6-O32-NEXT:    trunc.w.d $f0, $f2
+; MIPSR6-O32-NEXT:    mfc1 $1, $f0
+; MIPSR6-O32-NEXT:    fexupr.d $w0, $w1
+; MIPSR6-O32-NEXT:    copy_s.w $2, $w0[0]
+; MIPSR6-O32-NEXT:    mtc1 $2, $f1
+; MIPSR6-O32-NEXT:    copy_s.w $2, $w0[1]
+; MIPSR6-O32-NEXT:    mthc1 $2, $f1
+; MIPSR6-O32-NEXT:    trunc.w.d $f0, $f1
+; MIPSR6-O32-NEXT:    mfc1 $2, $f0
+; MIPSR6-O32-NEXT:    lui $3, 32768
+; MIPSR6-O32-NEXT:    xor $1, $1, $3
+; MIPSR6-O32-NEXT:    mfc1 $3, $f3
+; MIPSR6-O32-NEXT:    seleqz $1, $1, $3
+; MIPSR6-O32-NEXT:    selnez $2, $2, $3
+; MIPSR6-O32-NEXT:    jr $ra
+; MIPSR6-O32-NEXT:    or $2, $2, $1
+;
+; MIPSR6-N32-LABEL: ffptoui:
+; MIPSR6-N32:       # %bb.0: # %entry
+; MIPSR6-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(ffptoui)))
+; MIPSR6-N32-NEXT:    addu $1, $1, $25
+; MIPSR6-N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(ffptoui)))
+; MIPSR6-N32-NEXT:    lw $2, %got_disp(h)($1)
+; MIPSR6-N32-NEXT:    lw $1, %got_page(.LCPI3_0)($1)
+; MIPSR6-N32-NEXT:    lwc1 $f0, %got_ofst(.LCPI3_0)($1)
+; MIPSR6-N32-NEXT:    lh $1, 0($2)
+; MIPSR6-N32-NEXT:    fill.h $w1, $1
+; MIPSR6-N32-NEXT:    fexupr.w $w1, $w1
+; MIPSR6-N32-NEXT:    copy_s.w $1, $w1[0]
+; MIPSR6-N32-NEXT:    mtc1 $1, $f2
+; MIPSR6-N32-NEXT:    cmp.lt.s $f3, $f2, $f0
+; MIPSR6-N32-NEXT:    sub.s $f0, $f2, $f0
+; MIPSR6-N32-NEXT:    mfc1 $1, $f0
+; MIPSR6-N32-NEXT:    fill.w $w0, $1
+; MIPSR6-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPSR6-N32-NEXT:    fexupr.w $w0, $w0
+; MIPSR6-N32-NEXT:    fexupr.d $w0, $w0
+; MIPSR6-N32-NEXT:    copy_s.d $1, $w0[0]
+; MIPSR6-N32-NEXT:    dmtc1 $1, $f0
+; MIPSR6-N32-NEXT:    trunc.w.d $f0, $f0
+; MIPSR6-N32-NEXT:    mfc1 $1, $f0
+; MIPSR6-N32-NEXT:    fexupr.d $w0, $w1
+; MIPSR6-N32-NEXT:    copy_s.d $2, $w0[0]
+; MIPSR6-N32-NEXT:    dmtc1 $2, $f0
+; MIPSR6-N32-NEXT:    trunc.w.d $f0, $f0
+; MIPSR6-N32-NEXT:    mfc1 $2, $f0
+; MIPSR6-N32-NEXT:    lui $3, 32768
+; MIPSR6-N32-NEXT:    xor $1, $1, $3
+; MIPSR6-N32-NEXT:    mfc1 $3, $f3
+; MIPSR6-N32-NEXT:    seleqz $1, $1, $3
+; MIPSR6-N32-NEXT:    selnez $2, $2, $3
+; MIPSR6-N32-NEXT:    jr $ra
+; MIPSR6-N32-NEXT:    or $2, $2, $1
+;
+; MIPSR6-N64-LABEL: ffptoui:
+; MIPSR6-N64:       # %bb.0: # %entry
+; MIPSR6-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(ffptoui)))
+; MIPSR6-N64-NEXT:    daddu $1, $1, $25
+; MIPSR6-N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(ffptoui)))
+; MIPSR6-N64-NEXT:    ld $2, %got_disp(h)($1)
+; MIPSR6-N64-NEXT:    ld $1, %got_page(.LCPI3_0)($1)
+; MIPSR6-N64-NEXT:    lwc1 $f0, %got_ofst(.LCPI3_0)($1)
+; MIPSR6-N64-NEXT:    lh $1, 0($2)
+; MIPSR6-N64-NEXT:    fill.h $w1, $1
+; MIPSR6-N64-NEXT:    fexupr.w $w1, $w1
+; MIPSR6-N64-NEXT:    copy_s.w $1, $w1[0]
+; MIPSR6-N64-NEXT:    mtc1 $1, $f2
+; MIPSR6-N64-NEXT:    cmp.lt.s $f3, $f2, $f0
+; MIPSR6-N64-NEXT:    sub.s $f0, $f2, $f0
+; MIPSR6-N64-NEXT:    mfc1 $1, $f0
+; MIPSR6-N64-NEXT:    fill.w $w0, $1
+; MIPSR6-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPSR6-N64-NEXT:    fexupr.w $w0, $w0
+; MIPSR6-N64-NEXT:    fexupr.d $w0, $w0
+; MIPSR6-N64-NEXT:    copy_s.d $1, $w0[0]
+; MIPSR6-N64-NEXT:    dmtc1 $1, $f0
+; MIPSR6-N64-NEXT:    trunc.w.d $f0, $f0
+; MIPSR6-N64-NEXT:    mfc1 $1, $f0
+; MIPSR6-N64-NEXT:    fexupr.d $w0, $w1
+; MIPSR6-N64-NEXT:    copy_s.d $2, $w0[0]
+; MIPSR6-N64-NEXT:    dmtc1 $2, $f0
+; MIPSR6-N64-NEXT:    trunc.w.d $f0, $f0
+; MIPSR6-N64-NEXT:    mfc1 $2, $f0
+; MIPSR6-N64-NEXT:    lui $3, 32768
+; MIPSR6-N64-NEXT:    xor $1, $1, $3
+; MIPSR6-N64-NEXT:    mfc1 $3, $f3
+; MIPSR6-N64-NEXT:    seleqz $1, $1, $3
+; MIPSR6-N64-NEXT:    selnez $2, $2, $3
+; MIPSR6-N64-NEXT:    jr $ra
+; MIPSR6-N64-NEXT:    or $2, $2, $1
 entry:
-; ALL-LABEL: ffptoui:
   %0 = load half, half * @h, align 2
   %1 = fptoui half %0 to i32
 
-; MIPS32:       lwc1 $f[[FC:[0-9]+]], %lo($CPI{{[0-9]+}}_{{[0-9]+}})
-; MIPS64-N32:   lwc1 $f[[FC:[0-9]+]], %got_ofst(.LCPI{{[0-9]+}}_{{[0-9]+}})
-; MIPS64-N64:   lwc1 $f[[FC:[0-9]+]], %got_ofst(.LCPI{{[0-9]+}}_{{[0-9]+}})
-
-; ALL:          lh $[[R0:[0-9]+]]
-; ALL:          fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:          fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:          copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL:          mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; MIPSR6:       cmp.lt.s  $f[[F1:[0-9]+]], $f[[F0]], $f[[FC]]
-; ALL:          sub.s $f[[F2:[0-9]+]], $f[[F0]], $f[[FC]]
-; ALL:          mfc1 $[[R2:[0-9]]], $f[[F2]]
-; ALL:          fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL:          fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:          fexupr.w $w[[W4:[0-9]+]], $w[[W3]]
-; ALL:          fexupr.d $w[[W5:[0-9]+]], $w[[W4]]
-
-; MIPS32:       copy_s.w $[[R3:[0-9]+]], $w[[W5]][0]
-; MIPS32:       mtc1 $[[R3]], $f[[F3:[0-9]+]]
-; MIPS32:       copy_s.w $[[R4:[0-9]+]], $w[[W5]][1]
-; MIPS32:       mthc1 $[[R3]], $f[[F3]]
-
-; MIPS64:       copy_s.d $[[R2:[0-9]+]], $w[[W2]][0]
-; MIPS64:       dmtc1 $[[R2]], $f[[F3:[0-9]+]]
-
-; ALL:          trunc.w.d $f[[F4:[0-9]+]], $f[[F3]]
-; ALL:          mfc1 $[[R4:[0-9]+]], $f[[F4]]
-; ALL:          fexupr.d $w[[W6:[0-9]+]], $w[[W1]]
-
-; MIPS32:       copy_s.w $[[R5:[0-9]+]], $w[[W6]][0]
-; MIPS32:       mtc1 $[[R5]], $f[[F5:[0-9]+]]
-; MIPS32:       copy_s.w $[[R6:[0-9]+]], $w[[W6]][1]
-; MIPS32:       mthc1 $[[R6]], $f[[F5]]
-
-; MIPS64:       copy_s.d $[[R2:[0-9]+]], $w[[W2]][0]
-; MIPS64:       dmtc1 $[[R2]], $f[[F5:[0-9]+]]
-
-; ALL:          trunc.w.d $f[[F6:[0-9]]], $f[[F5]]
-; ALL:          mfc1 $[[R7:[0-9]]], $f[[F6]]
-
-; MIPS32R5-O32: lw $[[R13:[0-9]+]], %got($CPI{{[0-9]+}}_{{[0-9]+}})
-; MIPS32R5-O32: addiu $[[R14:[0-9]+]], $[[R13]], %lo($CPI{{[0-9]+}}_{{[0-9]+}})
-
-; MIPS64R5-N32: lw $[[R13:[0-9]+]], %got_page(.LCPI{{[0-9]+}}_{{[0-9]+}})
-; MIPS64R5-N32: addiu $[[R14:[0-9]+]], $[[R13]], %got_ofst(.LCPI{{[0-9]+}}_{{[0-9]+}})
-
-; MIPS64R5-N64: ld $[[R13:[0-9]+]], %got_page(.LCPI{{[0-9]+}}_{{[0-9]+}})
-; MIPS64R5-N64: daddiu $[[R14:[0-9]+]], $[[R13]], %got_ofst(.LCPI{{[0-9]+}}_{{[0-9]+}})
-
-; ALL:          lui $[[R8:[0-9]+]], 32768
-; ALL:          xor $[[R9:[0-9]+]], $[[R4]], $[[R8]]
-
-; MIPSR5:       lh $[[R15:[0-9]+]], 0($[[R14]])
-; MIPSR5:       fill.h $w[[W7:[0-9]+]], $[[R15]]
-; MIPSR5:       fexupr.w $w[[W8:[0-9]+]], $w[[W7]]
-; MIPSR5:       copy_s.w $[[R16:[0-9]+]], $w[[W8]][0]
-; MIPSR5:       mtc1 $[[R16]], $f[[F7:[0-9]+]]
-; MIPSR5:       c.olt.s $f[[F0]], $f[[F7]]
-; MIPSR5:       movt $[[R9]], $[[R7]], $fcc0
-
-; MIPSR6:       mfc1 $[[R10:[0-9]+]], $f[[F1]]
-; MIPSR6:       seleqz $[[R11:[0-9]]], $[[R9]], $[[R10]]
-; MIPSR6:       selnez $[[R12:[0-9]]], $[[R7]], $[[R10]]
-; MIPSR6:       or $2, $[[R12]], $[[R11]]
+
+
+
+
+
+
+
 
   ret i32 %1
 }
 
 define i32 @ffptosi() {
+; MIPS32-LABEL: ffptosi:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addu $1, $2, $25
+; MIPS32-NEXT:    lw $1, %got(h)($1)
+; MIPS32-NEXT:    lh $1, 0($1)
+; MIPS32-NEXT:    fill.h $w0, $1
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    fexupr.d $w0, $w0
+; MIPS32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS32-NEXT:    mtc1 $1, $f1
+; MIPS32-NEXT:    copy_s.w $1, $w0[1]
+; MIPS32-NEXT:    mthc1 $1, $f1
+; MIPS32-NEXT:    trunc.w.d $f0, $f1
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    mfc1 $2, $f0
+;
+; MIPS64-N32-LABEL: ffptosi:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(ffptosi)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(ffptosi)))
+; MIPS64-N32-NEXT:    lw $1, %got_disp(h)($1)
+; MIPS64-N32-NEXT:    lh $1, 0($1)
+; MIPS64-N32-NEXT:    fill.h $w0, $1
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    fexupr.d $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.d $1, $w0[0]
+; MIPS64-N32-NEXT:    dmtc1 $1, $f0
+; MIPS64-N32-NEXT:    trunc.w.d $f0, $f0
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    mfc1 $2, $f0
+;
+; MIPS64-N64-LABEL: ffptosi:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(ffptosi)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(ffptosi)))
+; MIPS64-N64-NEXT:    ld $1, %got_disp(h)($1)
+; MIPS64-N64-NEXT:    lh $1, 0($1)
+; MIPS64-N64-NEXT:    fill.h $w0, $1
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    fexupr.d $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.d $1, $w0[0]
+; MIPS64-N64-NEXT:    dmtc1 $1, $f0
+; MIPS64-N64-NEXT:    trunc.w.d $f0, $f0
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    mfc1 $2, $f0
 entry:
-; ALL-LABEL: ffptosi:
   %0 = load half, half * @h, align 2
   %1 = fptosi half %0 to i32
   ret i32 %1
 
-; ALL:    lh $[[R0:[0-9]+]]
-; ALL:    fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:    fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:    fexupr.d $w[[W2:[0-9]+]], $w[[W1]]
 
-; MIPS32: copy_s.w $[[R2:[0-9]+]], $w[[W2]][0]
-; MIPS32: mtc1 $[[R2]], $f[[F0:[0-9]+]]
-; MIPS32: copy_s.w $[[R3:[0-9]+]], $w[[W2]][1]
-; MIPS32: mthc1 $[[R3]], $f[[F0]]
 
-; MIPS64: copy_s.d $[[R2:[0-9]+]], $w[[W2]][0]
-; MIPS64: dmtc1 $[[R2]], $f[[F0:[0-9]+]]
 
-; ALL:    trunc.w.d $f[[F1:[0-9]+]], $f[[F0]]
-; ALL:    mfc1 $2, $f[[F1]]
 }
 
 define void @uitofp(i32 %a) {
+; MIPS32-LABEL: uitofp:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addiu $sp, $sp, -8
+; MIPS32-NEXT:    .cfi_def_cfa_offset 8
+; MIPS32-NEXT:    addu $1, $2, $25
+; MIPS32-NEXT:    lui $2, 17200
+; MIPS32-NEXT:    sw $2, 4($sp)
+; MIPS32-NEXT:    sw $4, 0($sp)
+; MIPS32-NEXT:    lw $2, %got($CPI5_0)($1)
+; MIPS32-NEXT:    ldc1 $f0, %lo($CPI5_0)($2)
+; MIPS32-NEXT:    ldc1 $f1, 0($sp)
+; MIPS32-NEXT:    sub.d $f0, $f1, $f0
+; MIPS32-NEXT:    mfc1 $2, $f0
+; MIPS32-NEXT:    fill.w $w1, $2
+; MIPS32-NEXT:    mfhc1 $2, $f0
+; MIPS32-NEXT:    insert.w $w1[1], $2
+; MIPS32-NEXT:    insert.w $w1[3], $2
+; MIPS32-NEXT:    fexdo.w $w0, $w1, $w1
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    lw $1, %got(h)($1)
+; MIPS32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS32-NEXT:    sh $2, 0($1)
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 8
+;
+; MIPS64-N32-LABEL: uitofp:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    addiu $sp, $sp, -16
+; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 16
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(uitofp)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(uitofp)))
+; MIPS64-N32-NEXT:    lui $2, 17200
+; MIPS64-N32-NEXT:    sw $2, 12($sp)
+; MIPS64-N32-NEXT:    sll $2, $4, 0
+; MIPS64-N32-NEXT:    sw $2, 8($sp)
+; MIPS64-N32-NEXT:    lw $2, %got_page(.LCPI5_0)($1)
+; MIPS64-N32-NEXT:    ldc1 $f0, %got_ofst(.LCPI5_0)($2)
+; MIPS64-N32-NEXT:    ldc1 $f1, 8($sp)
+; MIPS64-N32-NEXT:    sub.d $f0, $f1, $f0
+; MIPS64-N32-NEXT:    dmfc1 $2, $f0
+; MIPS64-N32-NEXT:    fill.d $w0, $2
+; MIPS64-N32-NEXT:    fexdo.w $w0, $w0, $w0
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    lw $1, %got_disp(h)($1)
+; MIPS64-N32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64-N32-NEXT:    sh $2, 0($1)
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    addiu $sp, $sp, 16
+;
+; MIPS64-N64-LABEL: uitofp:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, -16
+; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 16
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(uitofp)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(uitofp)))
+; MIPS64-N64-NEXT:    lui $2, 17200
+; MIPS64-N64-NEXT:    sw $2, 12($sp)
+; MIPS64-N64-NEXT:    sll $2, $4, 0
+; MIPS64-N64-NEXT:    sw $2, 8($sp)
+; MIPS64-N64-NEXT:    ld $2, %got_page(.LCPI5_0)($1)
+; MIPS64-N64-NEXT:    ldc1 $f0, %got_ofst(.LCPI5_0)($2)
+; MIPS64-N64-NEXT:    ldc1 $f1, 8($sp)
+; MIPS64-N64-NEXT:    sub.d $f0, $f1, $f0
+; MIPS64-N64-NEXT:    dmfc1 $2, $f0
+; MIPS64-N64-NEXT:    fill.d $w0, $2
+; MIPS64-N64-NEXT:    fexdo.w $w0, $w0, $w0
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    ld $1, %got_disp(h)($1)
+; MIPS64-N64-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64-N64-NEXT:    sh $2, 0($1)
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, 16
 entry:
-; ALL-LABEL: uitofp:
 
-; MIPS32-O32: ldc1 $f[[F0:[0-9]+]], %lo($CPI{{[0-9]+}}_{{[0-9]+}})
-; MIPS32-O32: ldc1 $f[[F1:[0-9]+]], 0($sp)
 
-; MIPS64-N32: ldc1 $f[[F0:[0-9]+]], %got_ofst(.LCPI{{[0-9]+}}_{{[0-9]+}})
-; MIPS64-N32: ldc1 $f[[F1:[0-9]+]], 8($sp)
 
-; MIPS64-N64: ldc1 $f[[F0:[0-9]+]], %got_ofst(.LCPI{{[0-9]+}}_{{[0-9]+}})
-; MIPS64-N64: ldc1 $f[[F1:[0-9]+]], 8($sp)
 
-; MIPSR5:     sub.d $f[[F2:[0-9]+]], $f[[F1]], $f[[F0]]
-; MIPSR6-O32: sub.d $f[[F2:[0-9]+]], $f[[F0]], $f[[F1]]
-; MIPSR6-N32: sub.d $f[[F2:[0-9]+]], $f[[F1]], $f[[F0]]
-; MIPSR6-N64: sub.d $f[[F2:[0-9]+]], $f[[F1]], $f[[F0]]
 
-; MIPS32:     mfc1 $[[R0:[0-9]+]], $f[[F2]]
-; MIPS32:     fill.w $w[[W0:[0-9]+]], $[[R0]]
-; MIPS32:     mfhc1 $[[R1:[0-9]+]], $f[[F2]]
-; MIPS32:     insert.w $w[[W0]][1], $[[R1]]
-; MIPS32:     insert.w $w[[W0]][3], $[[R1]]
 
-; MIPS64-N64-DAG: ld $[[R3:[0-9]+]], %got_disp(h)
-; MIPS64-N32-DAG: lw $[[R3:[0-9]+]], %got_disp(h)
-; MIPS64-DAG:     dmfc1 $[[R1:[0-9]+]], $f[[F2]]
-; MIPS64-DAG:     fill.d $w[[W0:[0-9]+]], $[[R1]]
 
-; ALL-DAG:        fexdo.w $w[[W1:[0-9]+]], $w[[W0]], $w[[W0]]
-; ALL-DAG:        fexdo.h $w[[W2:[0-9]+]], $w[[W1]], $w[[W1]]
 
-; MIPS32-DAG:     lw $[[R3:[0-9]+]], %got(h)
 
-; ALL:        copy_u.h $[[R2:[0-9]+]], $w[[W2]]
-; ALL:        sh $[[R2]], 0($[[R3]])
   %0 = uitofp i32 %a to half
   store half %0, half * @h, align 2
   ret void
@@ -256,30 +665,74 @@ entry:
 ; We don't check f16 -> f64 expansion occurs, as we expand f16 to f32.
 
 define void @fadd() {
+; MIPS32-LABEL: fadd:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addu $1, $2, $25
+; MIPS32-NEXT:    lw $1, %got(g)($1)
+; MIPS32-NEXT:    lh $2, 0($1)
+; MIPS32-NEXT:    fill.h $w0, $2
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS32-NEXT:    mtc1 $2, $f0
+; MIPS32-NEXT:    add.s $f0, $f0, $f0
+; MIPS32-NEXT:    mfc1 $2, $f0
+; MIPS32-NEXT:    fill.w $w0, $2
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    sh $2, 0($1)
+;
+; MIPS64-N32-LABEL: fadd:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fadd)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(fadd)))
+; MIPS64-N32-NEXT:    lw $1, %got_disp(g)($1)
+; MIPS64-N32-NEXT:    lh $2, 0($1)
+; MIPS64-N32-NEXT:    fill.h $w0, $2
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64-N32-NEXT:    mtc1 $2, $f0
+; MIPS64-N32-NEXT:    add.s $f0, $f0, $f0
+; MIPS64-N32-NEXT:    mfc1 $2, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $2
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    sh $2, 0($1)
+;
+; MIPS64-N64-LABEL: fadd:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fadd)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(fadd)))
+; MIPS64-N64-NEXT:    ld $1, %got_disp(g)($1)
+; MIPS64-N64-NEXT:    lh $2, 0($1)
+; MIPS64-N64-NEXT:    fill.h $w0, $2
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64-N64-NEXT:    mtc1 $2, $f0
+; MIPS64-N64-NEXT:    add.s $f0, $f0, $f0
+; MIPS64-N64-NEXT:    mfc1 $2, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $2
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    sh $2, 0($1)
 entry:
-; ALL-LABEL: fadd:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL: lh $[[R0:[0-9]+]]
-; ALL: fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL: fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL: copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL: mtc1 $[[R1]], $f[[F0:[0-9]+]]
 
   %2 = load i16, i16* @g, align 2
   %3 = call float @llvm.convert.from.fp16.f32(i16 %2)
   %add = fadd float %1, %3
 
-; ALL: add.s $f[[F1:[0-9]+]], $f[[F0]], $f[[F0]]
 
  %4 = call i16 @llvm.convert.to.fp16.f32(float %add)
 
-; ALL: mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL: fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL: fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL: copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
-; ALL: sh $[[R3]]
    store i16 %4, i16* @g, align 2
   ret void
 }
@@ -292,126 +745,338 @@ declare i16 @llvm.convert.to.fp16.f32(float)
 
 ; Function Attrs: nounwind
 define void @fsub() {
+; MIPS32-LABEL: fsub:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addu $1, $2, $25
+; MIPS32-NEXT:    lw $1, %got(g)($1)
+; MIPS32-NEXT:    lh $2, 0($1)
+; MIPS32-NEXT:    fill.h $w0, $2
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS32-NEXT:    mtc1 $2, $f0
+; MIPS32-NEXT:    sub.s $f0, $f0, $f0
+; MIPS32-NEXT:    mfc1 $2, $f0
+; MIPS32-NEXT:    fill.w $w0, $2
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    sh $2, 0($1)
+;
+; MIPS64-N32-LABEL: fsub:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fsub)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(fsub)))
+; MIPS64-N32-NEXT:    lw $1, %got_disp(g)($1)
+; MIPS64-N32-NEXT:    lh $2, 0($1)
+; MIPS64-N32-NEXT:    fill.h $w0, $2
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64-N32-NEXT:    mtc1 $2, $f0
+; MIPS64-N32-NEXT:    sub.s $f0, $f0, $f0
+; MIPS64-N32-NEXT:    mfc1 $2, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $2
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    sh $2, 0($1)
+;
+; MIPS64-N64-LABEL: fsub:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fsub)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(fsub)))
+; MIPS64-N64-NEXT:    ld $1, %got_disp(g)($1)
+; MIPS64-N64-NEXT:    lh $2, 0($1)
+; MIPS64-N64-NEXT:    fill.h $w0, $2
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64-N64-NEXT:    mtc1 $2, $f0
+; MIPS64-N64-NEXT:    sub.s $f0, $f0, $f0
+; MIPS64-N64-NEXT:    mfc1 $2, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $2
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    sh $2, 0($1)
 entry:
-; ALL-LABEL: fsub:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL: lh $[[R0:[0-9]+]]
-; ALL: fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL: fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL: copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL: mtc1 $[[R1]], $f[[F0:[0-9]+]]
 
   %2 = load i16, i16* @g, align 2
   %3 = call float @llvm.convert.from.fp16.f32(i16 %2)
   %sub = fsub float %1, %3
 
-; ALL: sub.s $f[[F1:[0-9]+]], $f[[F0]], $f[[F0]]
 
   %4 = call i16 @llvm.convert.to.fp16.f32(float %sub)
 
-; ALL: mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL: fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL: fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL: copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %4, i16* @g, align 2
-; ALL: sh $[[R3]]
   ret void
 }
 
 define void @fmult() {
+; MIPS32-LABEL: fmult:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addu $1, $2, $25
+; MIPS32-NEXT:    lw $1, %got(g)($1)
+; MIPS32-NEXT:    lh $2, 0($1)
+; MIPS32-NEXT:    fill.h $w0, $2
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS32-NEXT:    mtc1 $2, $f0
+; MIPS32-NEXT:    mul.s $f0, $f0, $f0
+; MIPS32-NEXT:    mfc1 $2, $f0
+; MIPS32-NEXT:    fill.w $w0, $2
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    sh $2, 0($1)
+;
+; MIPS64-N32-LABEL: fmult:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fmult)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(fmult)))
+; MIPS64-N32-NEXT:    lw $1, %got_disp(g)($1)
+; MIPS64-N32-NEXT:    lh $2, 0($1)
+; MIPS64-N32-NEXT:    fill.h $w0, $2
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64-N32-NEXT:    mtc1 $2, $f0
+; MIPS64-N32-NEXT:    mul.s $f0, $f0, $f0
+; MIPS64-N32-NEXT:    mfc1 $2, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $2
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    sh $2, 0($1)
+;
+; MIPS64-N64-LABEL: fmult:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fmult)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(fmult)))
+; MIPS64-N64-NEXT:    ld $1, %got_disp(g)($1)
+; MIPS64-N64-NEXT:    lh $2, 0($1)
+; MIPS64-N64-NEXT:    fill.h $w0, $2
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64-N64-NEXT:    mtc1 $2, $f0
+; MIPS64-N64-NEXT:    mul.s $f0, $f0, $f0
+; MIPS64-N64-NEXT:    mfc1 $2, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $2
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    sh $2, 0($1)
 entry:
-; ALL-LABEL: fmult:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL: lh $[[R0:[0-9]+]]
-; ALL: fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL: fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL: copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL: mtc1 $[[R1]], $f[[F0:[0-9]+]]
 
   %2 = load i16, i16* @g, align 2
   %3 = call float @llvm.convert.from.fp16.f32(i16 %2)
   %mul = fmul float %1, %3
 
-; ALL: mul.s $f[[F1:[0-9]+]], $f[[F0]], $f[[F0]]
 
   %4 = call i16 @llvm.convert.to.fp16.f32(float %mul)
 
-; ALL: mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL: fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL: fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL: copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %4, i16* @g, align 2
 
-; ALL: sh $[[R3]]
   ret void
 }
 
 define void @fdiv() {
+; MIPS32-LABEL: fdiv:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addu $1, $2, $25
+; MIPS32-NEXT:    lw $1, %got(g)($1)
+; MIPS32-NEXT:    lh $2, 0($1)
+; MIPS32-NEXT:    fill.h $w0, $2
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS32-NEXT:    mtc1 $2, $f0
+; MIPS32-NEXT:    div.s $f0, $f0, $f0
+; MIPS32-NEXT:    mfc1 $2, $f0
+; MIPS32-NEXT:    fill.w $w0, $2
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    sh $2, 0($1)
+;
+; MIPS64-N32-LABEL: fdiv:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fdiv)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(fdiv)))
+; MIPS64-N32-NEXT:    lw $1, %got_disp(g)($1)
+; MIPS64-N32-NEXT:    lh $2, 0($1)
+; MIPS64-N32-NEXT:    fill.h $w0, $2
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64-N32-NEXT:    mtc1 $2, $f0
+; MIPS64-N32-NEXT:    div.s $f0, $f0, $f0
+; MIPS64-N32-NEXT:    mfc1 $2, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $2
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    sh $2, 0($1)
+;
+; MIPS64-N64-LABEL: fdiv:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fdiv)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(fdiv)))
+; MIPS64-N64-NEXT:    ld $1, %got_disp(g)($1)
+; MIPS64-N64-NEXT:    lh $2, 0($1)
+; MIPS64-N64-NEXT:    fill.h $w0, $2
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64-N64-NEXT:    mtc1 $2, $f0
+; MIPS64-N64-NEXT:    div.s $f0, $f0, $f0
+; MIPS64-N64-NEXT:    mfc1 $2, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $2
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    sh $2, 0($1)
 entry:
-; ALL-LABEL: fdiv:
 
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL: lh $[[R0:[0-9]+]]
-; ALL: fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL: fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL: copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL: mtc1 $[[R1]], $f[[F0:[0-9]+]]
 
   %2 = load i16, i16* @g, align 2
   %3 = call float @llvm.convert.from.fp16.f32(i16 %2)
   %div = fdiv float %1, %3
 
-; ALL: div.s $f[[F1:[0-9]+]], $f[[F0]], $f[[F0]]
 
   %4 = call i16 @llvm.convert.to.fp16.f32(float %div)
 
-; ALL: mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL: fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL: fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL: copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
   store i16 %4, i16* @g, align 2
-; ALL: sh $[[R3]]
   ret void
 }
 
 define void @frem() {
+; MIPS32-LABEL: frem:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    .cfi_offset 16, -8
+; MIPS32-NEXT:    addu $gp, $2, $25
+; MIPS32-NEXT:    lw $16, %got(g)($gp)
+; MIPS32-NEXT:    lh $1, 0($16)
+; MIPS32-NEXT:    fill.h $w0, $1
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS32-NEXT:    mtc1 $1, $f12
+; MIPS32-NEXT:    lw $25, %call16(fmodf)($gp)
+; MIPS32-NEXT:    jalr $25
+; MIPS32-NEXT:    mov.s $f14, $f12
+; MIPS32-NEXT:    mfc1 $1, $f0
+; MIPS32-NEXT:    fill.w $w0, $1
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS32-NEXT:    sh $1, 0($16)
+; MIPS32-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 24
+;
+; MIPS64-N32-LABEL: frem:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    addiu $sp, $sp, -32
+; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N32-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    .cfi_offset 31, -8
+; MIPS64-N32-NEXT:    .cfi_offset 28, -16
+; MIPS64-N32-NEXT:    .cfi_offset 16, -24
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(frem)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $gp, $1, %lo(%neg(%gp_rel(frem)))
+; MIPS64-N32-NEXT:    lw $16, %got_disp(g)($gp)
+; MIPS64-N32-NEXT:    lh $1, 0($16)
+; MIPS64-N32-NEXT:    fill.h $w0, $1
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N32-NEXT:    mtc1 $1, $f12
+; MIPS64-N32-NEXT:    lw $25, %call16(fmodf)($gp)
+; MIPS64-N32-NEXT:    jalr $25
+; MIPS64-N32-NEXT:    mov.s $f13, $f12
+; MIPS64-N32-NEXT:    mfc1 $1, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $1
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N32-NEXT:    sh $1, 0($16)
+; MIPS64-N32-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    addiu $sp, $sp, 32
+;
+; MIPS64-N64-LABEL: frem:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N64-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    .cfi_offset 31, -8
+; MIPS64-N64-NEXT:    .cfi_offset 28, -16
+; MIPS64-N64-NEXT:    .cfi_offset 16, -24
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(frem)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(frem)))
+; MIPS64-N64-NEXT:    ld $16, %got_disp(g)($gp)
+; MIPS64-N64-NEXT:    lh $1, 0($16)
+; MIPS64-N64-NEXT:    fill.h $w0, $1
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N64-NEXT:    mtc1 $1, $f12
+; MIPS64-N64-NEXT:    ld $25, %call16(fmodf)($gp)
+; MIPS64-N64-NEXT:    jalr $25
+; MIPS64-N64-NEXT:    mov.s $f13, $f12
+; MIPS64-N64-NEXT:    mfc1 $1, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $1
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N64-NEXT:    sh $1, 0($16)
+; MIPS64-N64-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, 32
 entry:
-; ALL-LABEL: frem:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL:        lh $[[R0:[0-9]+]]
-; ALL:        fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:        fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:        copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL:        mtc1 $[[R1]], $f[[F0:[0-9]+]]
 
   %2 = load i16, i16* @g, align 2
   %3 = call float @llvm.convert.from.fp16.f32(i16 %2)
   %rem = frem float %1, %3
 
-; MIPS32:     lw $25, %call16(fmodf)($gp)
-; MIPS64-N32: lw $25, %call16(fmodf)($gp)
-; MIPS64-N64: ld $25, %call16(fmodf)($gp)
-; ALL:        jalr $25
 
   %4 = call i16 @llvm.convert.to.fp16.f32(float %rem)
 
-; ALL:        mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL:        fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL:        fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:        copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %4, i16* @g, align 2
-; ALL:        sh $[[R3]]
 
   ret void
 }
@@ -419,31 +1084,127 @@ entry:
 @i1 = external global i16, align 1
 
 define void @fcmp() {
+; MIPS32-O32-LABEL: fcmp:
+; MIPS32-O32:       # %bb.0: # %entry
+; MIPS32-O32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-O32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-O32-NEXT:    addu $1, $2, $25
+; MIPS32-O32-NEXT:    lw $2, %got(g)($1)
+; MIPS32-O32-NEXT:    lh $2, 0($2)
+; MIPS32-O32-NEXT:    fill.h $w0, $2
+; MIPS32-O32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-O32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS32-O32-NEXT:    mtc1 $2, $f0
+; MIPS32-O32-NEXT:    addiu $2, $zero, 1
+; MIPS32-O32-NEXT:    c.un.s $f0, $f0
+; MIPS32-O32-NEXT:    movt $2, $zero, $fcc0
+; MIPS32-O32-NEXT:    lw $1, %got(i1)($1)
+; MIPS32-O32-NEXT:    jr $ra
+; MIPS32-O32-NEXT:    sh $2, 0($1)
+;
+; MIPS64R5-N32-LABEL: fcmp:
+; MIPS64R5-N32:       # %bb.0: # %entry
+; MIPS64R5-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fcmp)))
+; MIPS64R5-N32-NEXT:    addu $1, $1, $25
+; MIPS64R5-N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(fcmp)))
+; MIPS64R5-N32-NEXT:    lw $2, %got_disp(g)($1)
+; MIPS64R5-N32-NEXT:    lh $2, 0($2)
+; MIPS64R5-N32-NEXT:    fill.h $w0, $2
+; MIPS64R5-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64R5-N32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64R5-N32-NEXT:    mtc1 $2, $f0
+; MIPS64R5-N32-NEXT:    addiu $2, $zero, 1
+; MIPS64R5-N32-NEXT:    c.un.s $f0, $f0
+; MIPS64R5-N32-NEXT:    movt $2, $zero, $fcc0
+; MIPS64R5-N32-NEXT:    lw $1, %got_disp(i1)($1)
+; MIPS64R5-N32-NEXT:    jr $ra
+; MIPS64R5-N32-NEXT:    sh $2, 0($1)
+;
+; MIPS64R5-N64-LABEL: fcmp:
+; MIPS64R5-N64:       # %bb.0: # %entry
+; MIPS64R5-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fcmp)))
+; MIPS64R5-N64-NEXT:    daddu $1, $1, $25
+; MIPS64R5-N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(fcmp)))
+; MIPS64R5-N64-NEXT:    ld $2, %got_disp(g)($1)
+; MIPS64R5-N64-NEXT:    lh $2, 0($2)
+; MIPS64R5-N64-NEXT:    fill.h $w0, $2
+; MIPS64R5-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64R5-N64-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64R5-N64-NEXT:    mtc1 $2, $f0
+; MIPS64R5-N64-NEXT:    addiu $2, $zero, 1
+; MIPS64R5-N64-NEXT:    c.un.s $f0, $f0
+; MIPS64R5-N64-NEXT:    movt $2, $zero, $fcc0
+; MIPS64R5-N64-NEXT:    ld $1, %got_disp(i1)($1)
+; MIPS64R5-N64-NEXT:    jr $ra
+; MIPS64R5-N64-NEXT:    sh $2, 0($1)
+;
+; MIPSR6-O32-LABEL: fcmp:
+; MIPSR6-O32:       # %bb.0: # %entry
+; MIPSR6-O32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPSR6-O32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPSR6-O32-NEXT:    addu $1, $2, $25
+; MIPSR6-O32-NEXT:    lw $2, %got(g)($1)
+; MIPSR6-O32-NEXT:    lh $2, 0($2)
+; MIPSR6-O32-NEXT:    fill.h $w0, $2
+; MIPSR6-O32-NEXT:    fexupr.w $w0, $w0
+; MIPSR6-O32-NEXT:    copy_s.w $2, $w0[0]
+; MIPSR6-O32-NEXT:    mtc1 $2, $f0
+; MIPSR6-O32-NEXT:    cmp.un.s $f0, $f0, $f0
+; MIPSR6-O32-NEXT:    mfc1 $2, $f0
+; MIPSR6-O32-NEXT:    not $2, $2
+; MIPSR6-O32-NEXT:    andi $2, $2, 1
+; MIPSR6-O32-NEXT:    lw $1, %got(i1)($1)
+; MIPSR6-O32-NEXT:    jr $ra
+; MIPSR6-O32-NEXT:    sh $2, 0($1)
+;
+; MIPSR6-N32-LABEL: fcmp:
+; MIPSR6-N32:       # %bb.0: # %entry
+; MIPSR6-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fcmp)))
+; MIPSR6-N32-NEXT:    addu $1, $1, $25
+; MIPSR6-N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(fcmp)))
+; MIPSR6-N32-NEXT:    lw $2, %got_disp(g)($1)
+; MIPSR6-N32-NEXT:    lh $2, 0($2)
+; MIPSR6-N32-NEXT:    fill.h $w0, $2
+; MIPSR6-N32-NEXT:    fexupr.w $w0, $w0
+; MIPSR6-N32-NEXT:    copy_s.w $2, $w0[0]
+; MIPSR6-N32-NEXT:    mtc1 $2, $f0
+; MIPSR6-N32-NEXT:    cmp.un.s $f0, $f0, $f0
+; MIPSR6-N32-NEXT:    mfc1 $2, $f0
+; MIPSR6-N32-NEXT:    not $2, $2
+; MIPSR6-N32-NEXT:    andi $2, $2, 1
+; MIPSR6-N32-NEXT:    lw $1, %got_disp(i1)($1)
+; MIPSR6-N32-NEXT:    jr $ra
+; MIPSR6-N32-NEXT:    sh $2, 0($1)
+;
+; MIPSR6-N64-LABEL: fcmp:
+; MIPSR6-N64:       # %bb.0: # %entry
+; MIPSR6-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fcmp)))
+; MIPSR6-N64-NEXT:    daddu $1, $1, $25
+; MIPSR6-N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(fcmp)))
+; MIPSR6-N64-NEXT:    ld $2, %got_disp(g)($1)
+; MIPSR6-N64-NEXT:    lh $2, 0($2)
+; MIPSR6-N64-NEXT:    fill.h $w0, $2
+; MIPSR6-N64-NEXT:    fexupr.w $w0, $w0
+; MIPSR6-N64-NEXT:    copy_s.w $2, $w0[0]
+; MIPSR6-N64-NEXT:    mtc1 $2, $f0
+; MIPSR6-N64-NEXT:    cmp.un.s $f0, $f0, $f0
+; MIPSR6-N64-NEXT:    mfc1 $2, $f0
+; MIPSR6-N64-NEXT:    not $2, $2
+; MIPSR6-N64-NEXT:    andi $2, $2, 1
+; MIPSR6-N64-NEXT:    ld $1, %got_disp(i1)($1)
+; MIPSR6-N64-NEXT:    jr $ra
+; MIPSR6-N64-NEXT:    sh $2, 0($1)
 entry:
-; ALL-LABEL: fcmp:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
-; ALL:        lh $[[R0:[0-9]+]]
-; ALL:        fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:        fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:        copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL:        mtc1 $[[R1]], $f[[F0:[0-9]+]]
 
   %2 = load i16, i16* @g, align 2
   %3 = call float @llvm.convert.from.fp16.f32(i16 %2)
   %fcmp = fcmp oeq float %1, %3
 
-; MIPSR5: addiu $[[R2:[0-9]+]], $zero, 1
-; MIPSR5: c.un.s $f[[F0]], $f[[F0]]
-; MIPSR5: movt $[[R2]], $zero, $fcc0
-; MIPSR6: cmp.un.s $f[[F1:[0-9]+]], $f[[F0]], $f[[F0]]
-; MIPSR6: mfc1 $[[R3:[0-9]]], $f[[F1]]
-; MIPSR6: not $[[R4:[0-9]+]], $[[R3]]
-; MIPSR6: andi $[[R2:[0-9]+]], $[[R4]], 1
 
   %4 = zext i1 %fcmp to i16
   store i16 %4, i16* @i1, align 2
-; ALL:        sh $[[R2]]
 
   ret void
 }
@@ -451,125 +1212,406 @@ entry:
 declare float @llvm.powi.f32(float, i32)
 
 define void @fpowi() {
+; MIPS32-LABEL: fpowi:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addu $1, $2, $25
+; MIPS32-NEXT:    lw $1, %got(g)($1)
+; MIPS32-NEXT:    lh $2, 0($1)
+; MIPS32-NEXT:    fill.h $w0, $2
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS32-NEXT:    mtc1 $2, $f0
+; MIPS32-NEXT:    mul.s $f0, $f0, $f0
+; MIPS32-NEXT:    mfc1 $2, $f0
+; MIPS32-NEXT:    fill.w $w0, $2
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    sh $2, 0($1)
+;
+; MIPS64-N32-LABEL: fpowi:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fpowi)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(fpowi)))
+; MIPS64-N32-NEXT:    lw $1, %got_disp(g)($1)
+; MIPS64-N32-NEXT:    lh $2, 0($1)
+; MIPS64-N32-NEXT:    fill.h $w0, $2
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64-N32-NEXT:    mtc1 $2, $f0
+; MIPS64-N32-NEXT:    mul.s $f0, $f0, $f0
+; MIPS64-N32-NEXT:    mfc1 $2, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $2
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    sh $2, 0($1)
+;
+; MIPS64-N64-LABEL: fpowi:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fpowi)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(fpowi)))
+; MIPS64-N64-NEXT:    ld $1, %got_disp(g)($1)
+; MIPS64-N64-NEXT:    lh $2, 0($1)
+; MIPS64-N64-NEXT:    fill.h $w0, $2
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64-N64-NEXT:    mtc1 $2, $f0
+; MIPS64-N64-NEXT:    mul.s $f0, $f0, $f0
+; MIPS64-N64-NEXT:    mfc1 $2, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $2
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    sh $2, 0($1)
 entry:
-; ALL-LABEL: fpowi:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL: lh $[[R0:[0-9]+]]
-; ALL: fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL: fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL: copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL: mtc1 $[[R1]], $f[[F0:[0-9]+]]
 
   %powi = call float @llvm.powi.f32(float %1, i32 2)
 
-; ALL: mul.s $f[[F1:[0-9]+]], $f[[F0]], $f[[F0]]
 
   %2 = call i16 @llvm.convert.to.fp16.f32(float %powi)
 
-; ALL: mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL: fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL: fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL: copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %2, i16* @g, align 2
-; ALL: sh $[[R3]]
   ret void
 }
 
 define void @fpowi_var(i32 %var) {
+; MIPS32-LABEL: fpowi_var:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    .cfi_offset 16, -8
+; MIPS32-NEXT:    addu $gp, $2, $25
+; MIPS32-NEXT:    lw $16, %got(g)($gp)
+; MIPS32-NEXT:    lh $1, 0($16)
+; MIPS32-NEXT:    fill.h $w0, $1
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS32-NEXT:    mtc1 $1, $f12
+; MIPS32-NEXT:    lw $25, %call16(__powisf2)($gp)
+; MIPS32-NEXT:    jalr $25
+; MIPS32-NEXT:    move $5, $4
+; MIPS32-NEXT:    mfc1 $1, $f0
+; MIPS32-NEXT:    fill.w $w0, $1
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS32-NEXT:    sh $1, 0($16)
+; MIPS32-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 24
+;
+; MIPS64-N32-LABEL: fpowi_var:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    addiu $sp, $sp, -32
+; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N32-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    .cfi_offset 31, -8
+; MIPS64-N32-NEXT:    .cfi_offset 28, -16
+; MIPS64-N32-NEXT:    .cfi_offset 16, -24
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fpowi_var)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $gp, $1, %lo(%neg(%gp_rel(fpowi_var)))
+; MIPS64-N32-NEXT:    sll $5, $4, 0
+; MIPS64-N32-NEXT:    lw $16, %got_disp(g)($gp)
+; MIPS64-N32-NEXT:    lh $1, 0($16)
+; MIPS64-N32-NEXT:    fill.h $w0, $1
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N32-NEXT:    lw $25, %call16(__powisf2)($gp)
+; MIPS64-N32-NEXT:    jalr $25
+; MIPS64-N32-NEXT:    mtc1 $1, $f12
+; MIPS64-N32-NEXT:    mfc1 $1, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $1
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N32-NEXT:    sh $1, 0($16)
+; MIPS64-N32-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    addiu $sp, $sp, 32
+;
+; MIPS64-N64-LABEL: fpowi_var:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N64-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    .cfi_offset 31, -8
+; MIPS64-N64-NEXT:    .cfi_offset 28, -16
+; MIPS64-N64-NEXT:    .cfi_offset 16, -24
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fpowi_var)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(fpowi_var)))
+; MIPS64-N64-NEXT:    sll $5, $4, 0
+; MIPS64-N64-NEXT:    ld $16, %got_disp(g)($gp)
+; MIPS64-N64-NEXT:    lh $1, 0($16)
+; MIPS64-N64-NEXT:    fill.h $w0, $1
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N64-NEXT:    ld $25, %call16(__powisf2)($gp)
+; MIPS64-N64-NEXT:    jalr $25
+; MIPS64-N64-NEXT:    mtc1 $1, $f12
+; MIPS64-N64-NEXT:    mfc1 $1, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $1
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N64-NEXT:    sh $1, 0($16)
+; MIPS64-N64-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, 32
 entry:
-; ALL-LABEL: fpowi_var:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL:            lh $[[R0:[0-9]+]]
-; ALL:            fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:            fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:            copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
 
   %powi = call float @llvm.powi.f32(float %1, i32 %var)
 
-; ALL-DAG: mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; MIPS32-DAG:     lw $25, %call16(__powisf2)($gp)
-; MIPS64-N32-DAG: lw $25, %call16(__powisf2)($gp)
-; MIPS64-N64-DAG: ld $25, %call16(__powisf2)($gp)
-; ALL-DAG:        jalr $25
 
   %2 = call i16 @llvm.convert.to.fp16.f32(float %powi)
 
-; ALL:            mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL:            fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL:            fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:            copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %2, i16* @g, align 2
-; ALL:            sh $[[R3]]
   ret void
 }
 
 declare float @llvm.pow.f32(float %Val, float %power)
 
 define void @fpow(float %var) {
+; MIPS32-LABEL: fpow:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    .cfi_offset 16, -8
+; MIPS32-NEXT:    addu $gp, $2, $25
+; MIPS32-NEXT:    mov.s $f14, $f12
+; MIPS32-NEXT:    lw $16, %got(g)($gp)
+; MIPS32-NEXT:    lh $1, 0($16)
+; MIPS32-NEXT:    fill.h $w0, $1
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS32-NEXT:    lw $25, %call16(powf)($gp)
+; MIPS32-NEXT:    jalr $25
+; MIPS32-NEXT:    mtc1 $1, $f12
+; MIPS32-NEXT:    mfc1 $1, $f0
+; MIPS32-NEXT:    fill.w $w0, $1
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS32-NEXT:    sh $1, 0($16)
+; MIPS32-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 24
+;
+; MIPS64-N32-LABEL: fpow:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    addiu $sp, $sp, -32
+; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N32-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    .cfi_offset 31, -8
+; MIPS64-N32-NEXT:    .cfi_offset 28, -16
+; MIPS64-N32-NEXT:    .cfi_offset 16, -24
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fpow)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $gp, $1, %lo(%neg(%gp_rel(fpow)))
+; MIPS64-N32-NEXT:    mov.s $f13, $f12
+; MIPS64-N32-NEXT:    lw $16, %got_disp(g)($gp)
+; MIPS64-N32-NEXT:    lh $1, 0($16)
+; MIPS64-N32-NEXT:    fill.h $w0, $1
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N32-NEXT:    lw $25, %call16(powf)($gp)
+; MIPS64-N32-NEXT:    jalr $25
+; MIPS64-N32-NEXT:    mtc1 $1, $f12
+; MIPS64-N32-NEXT:    mfc1 $1, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $1
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N32-NEXT:    sh $1, 0($16)
+; MIPS64-N32-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    addiu $sp, $sp, 32
+;
+; MIPS64-N64-LABEL: fpow:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N64-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    .cfi_offset 31, -8
+; MIPS64-N64-NEXT:    .cfi_offset 28, -16
+; MIPS64-N64-NEXT:    .cfi_offset 16, -24
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fpow)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(fpow)))
+; MIPS64-N64-NEXT:    mov.s $f13, $f12
+; MIPS64-N64-NEXT:    ld $16, %got_disp(g)($gp)
+; MIPS64-N64-NEXT:    lh $1, 0($16)
+; MIPS64-N64-NEXT:    fill.h $w0, $1
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N64-NEXT:    ld $25, %call16(powf)($gp)
+; MIPS64-N64-NEXT:    jalr $25
+; MIPS64-N64-NEXT:    mtc1 $1, $f12
+; MIPS64-N64-NEXT:    mfc1 $1, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $1
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N64-NEXT:    sh $1, 0($16)
+; MIPS64-N64-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, 32
 entry:
-; ALL-LABEL: fpow:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL:            lh $[[R0:[0-9]+]]
-; ALL:            fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:            fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:            copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
 
   %powi = call float @llvm.pow.f32(float %1, float %var)
 
-; ALL-DAG:        mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; MIPS32-DAG:     lw $25, %call16(powf)($gp)
-; MIPS64-N32-DAG: lw $25, %call16(powf)($gp)
-; MIPS64-N64-DAG: ld $25, %call16(powf)($gp)
-; ALL-DAG:        jalr $25
 
   %2 = call i16 @llvm.convert.to.fp16.f32(float %powi)
 
-; ALL:            mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL:            fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL:            fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:            copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %2, i16* @g, align 2
-; ALL:            sh $[[R3]]
   ret void
 }
 
 declare float @llvm.log2.f32(float %Val)
 
 define void @flog2() {
+; MIPS32-LABEL: flog2:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    .cfi_offset 16, -8
+; MIPS32-NEXT:    addu $gp, $2, $25
+; MIPS32-NEXT:    lw $16, %got(g)($gp)
+; MIPS32-NEXT:    lh $1, 0($16)
+; MIPS32-NEXT:    fill.h $w0, $1
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS32-NEXT:    lw $25, %call16(log2f)($gp)
+; MIPS32-NEXT:    jalr $25
+; MIPS32-NEXT:    mtc1 $1, $f12
+; MIPS32-NEXT:    mfc1 $1, $f0
+; MIPS32-NEXT:    fill.w $w0, $1
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS32-NEXT:    sh $1, 0($16)
+; MIPS32-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 24
+;
+; MIPS64-N32-LABEL: flog2:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    addiu $sp, $sp, -32
+; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N32-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    .cfi_offset 31, -8
+; MIPS64-N32-NEXT:    .cfi_offset 28, -16
+; MIPS64-N32-NEXT:    .cfi_offset 16, -24
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(flog2)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $gp, $1, %lo(%neg(%gp_rel(flog2)))
+; MIPS64-N32-NEXT:    lw $16, %got_disp(g)($gp)
+; MIPS64-N32-NEXT:    lh $1, 0($16)
+; MIPS64-N32-NEXT:    fill.h $w0, $1
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N32-NEXT:    lw $25, %call16(log2f)($gp)
+; MIPS64-N32-NEXT:    jalr $25
+; MIPS64-N32-NEXT:    mtc1 $1, $f12
+; MIPS64-N32-NEXT:    mfc1 $1, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $1
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N32-NEXT:    sh $1, 0($16)
+; MIPS64-N32-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    addiu $sp, $sp, 32
+;
+; MIPS64-N64-LABEL: flog2:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N64-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    .cfi_offset 31, -8
+; MIPS64-N64-NEXT:    .cfi_offset 28, -16
+; MIPS64-N64-NEXT:    .cfi_offset 16, -24
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(flog2)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(flog2)))
+; MIPS64-N64-NEXT:    ld $16, %got_disp(g)($gp)
+; MIPS64-N64-NEXT:    lh $1, 0($16)
+; MIPS64-N64-NEXT:    fill.h $w0, $1
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N64-NEXT:    ld $25, %call16(log2f)($gp)
+; MIPS64-N64-NEXT:    jalr $25
+; MIPS64-N64-NEXT:    mtc1 $1, $f12
+; MIPS64-N64-NEXT:    mfc1 $1, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $1
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N64-NEXT:    sh $1, 0($16)
+; MIPS64-N64-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, 32
 entry:
-; ALL-LABEL: flog2:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL:            lh $[[R0:[0-9]+]]
-; ALL:            fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:            fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:            copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL-DAG:        mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; MIPS32-DAG:     lw $25, %call16(log2f)($gp)
-; MIPS64-N32-DAG: lw $25, %call16(log2f)($gp)
-; MIPS64-N64-DAG: ld $25, %call16(log2f)($gp)
-; ALL-DAG:        jalr $25
 
   %log2 = call float @llvm.log2.f32(float %1)
   %2 = call i16 @llvm.convert.to.fp16.f32(float %log2)
 
-; ALL:            mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL:            fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL:            fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:            copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %2, i16* @g, align 2
-; ALL:            sh $[[R3]]
 
   ret void
 }
@@ -577,31 +1619,108 @@ entry:
 declare float @llvm.log10.f32(float %Val)
 
 define void @flog10() {
+; MIPS32-LABEL: flog10:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    .cfi_offset 16, -8
+; MIPS32-NEXT:    addu $gp, $2, $25
+; MIPS32-NEXT:    lw $16, %got(g)($gp)
+; MIPS32-NEXT:    lh $1, 0($16)
+; MIPS32-NEXT:    fill.h $w0, $1
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS32-NEXT:    lw $25, %call16(log10f)($gp)
+; MIPS32-NEXT:    jalr $25
+; MIPS32-NEXT:    mtc1 $1, $f12
+; MIPS32-NEXT:    mfc1 $1, $f0
+; MIPS32-NEXT:    fill.w $w0, $1
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS32-NEXT:    sh $1, 0($16)
+; MIPS32-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 24
+;
+; MIPS64-N32-LABEL: flog10:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    addiu $sp, $sp, -32
+; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N32-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    .cfi_offset 31, -8
+; MIPS64-N32-NEXT:    .cfi_offset 28, -16
+; MIPS64-N32-NEXT:    .cfi_offset 16, -24
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(flog10)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $gp, $1, %lo(%neg(%gp_rel(flog10)))
+; MIPS64-N32-NEXT:    lw $16, %got_disp(g)($gp)
+; MIPS64-N32-NEXT:    lh $1, 0($16)
+; MIPS64-N32-NEXT:    fill.h $w0, $1
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N32-NEXT:    lw $25, %call16(log10f)($gp)
+; MIPS64-N32-NEXT:    jalr $25
+; MIPS64-N32-NEXT:    mtc1 $1, $f12
+; MIPS64-N32-NEXT:    mfc1 $1, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $1
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N32-NEXT:    sh $1, 0($16)
+; MIPS64-N32-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    addiu $sp, $sp, 32
+;
+; MIPS64-N64-LABEL: flog10:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N64-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    .cfi_offset 31, -8
+; MIPS64-N64-NEXT:    .cfi_offset 28, -16
+; MIPS64-N64-NEXT:    .cfi_offset 16, -24
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(flog10)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(flog10)))
+; MIPS64-N64-NEXT:    ld $16, %got_disp(g)($gp)
+; MIPS64-N64-NEXT:    lh $1, 0($16)
+; MIPS64-N64-NEXT:    fill.h $w0, $1
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N64-NEXT:    ld $25, %call16(log10f)($gp)
+; MIPS64-N64-NEXT:    jalr $25
+; MIPS64-N64-NEXT:    mtc1 $1, $f12
+; MIPS64-N64-NEXT:    mfc1 $1, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $1
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N64-NEXT:    sh $1, 0($16)
+; MIPS64-N64-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, 32
 entry:
-; ALL-LABEL: flog10:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL:            lh $[[R0:[0-9]+]]
-; ALL:            fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:            fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:            copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL-DAG:        mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; MIPS32-DAG:     lw $25, %call16(log10f)($gp)
-; MIPS64-N32-DAG: lw $25, %call16(log10f)($gp)
-; MIPS64-N64-DAG: ld $25, %call16(log10f)($gp)
-; ALL-DAG:        jalr $25
 
   %log10 = call float @llvm.log10.f32(float %1)
   %2 = call i16 @llvm.convert.to.fp16.f32(float %log10)
 
-; ALL:            mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL:            fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL:            fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:            copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %2, i16* @g, align 2
-; ALL:            sh $[[R3]]
 
   ret void
 }
@@ -609,28 +1728,72 @@ entry:
 declare float @llvm.sqrt.f32(float %Val)
 
 define void @fsqrt() {
+; MIPS32-LABEL: fsqrt:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addu $1, $2, $25
+; MIPS32-NEXT:    lw $1, %got(g)($1)
+; MIPS32-NEXT:    lh $2, 0($1)
+; MIPS32-NEXT:    fill.h $w0, $2
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS32-NEXT:    mtc1 $2, $f0
+; MIPS32-NEXT:    sqrt.s $f0, $f0
+; MIPS32-NEXT:    mfc1 $2, $f0
+; MIPS32-NEXT:    fill.w $w0, $2
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    sh $2, 0($1)
+;
+; MIPS64-N32-LABEL: fsqrt:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fsqrt)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(fsqrt)))
+; MIPS64-N32-NEXT:    lw $1, %got_disp(g)($1)
+; MIPS64-N32-NEXT:    lh $2, 0($1)
+; MIPS64-N32-NEXT:    fill.h $w0, $2
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64-N32-NEXT:    mtc1 $2, $f0
+; MIPS64-N32-NEXT:    sqrt.s $f0, $f0
+; MIPS64-N32-NEXT:    mfc1 $2, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $2
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    sh $2, 0($1)
+;
+; MIPS64-N64-LABEL: fsqrt:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fsqrt)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(fsqrt)))
+; MIPS64-N64-NEXT:    ld $1, %got_disp(g)($1)
+; MIPS64-N64-NEXT:    lh $2, 0($1)
+; MIPS64-N64-NEXT:    fill.h $w0, $2
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64-N64-NEXT:    mtc1 $2, $f0
+; MIPS64-N64-NEXT:    sqrt.s $f0, $f0
+; MIPS64-N64-NEXT:    mfc1 $2, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $2
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    sh $2, 0($1)
 entry:
-; ALL-LABEL: fsqrt:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL: lh $[[R0:[0-9]+]]
-; ALL: fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL: fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL: copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL: mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; ALL: sqrt.s $f[[F1:[0-9]+]], $f[[F0]]
 
   %sqrt = call float @llvm.sqrt.f32(float %1)
   %2 = call i16 @llvm.convert.to.fp16.f32(float %sqrt)
 
-; ALL: mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL: fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL: fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL: copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %2, i16* @g, align 2
-; ALL: sh $[[R3]]
 
   ret void
 }
@@ -638,31 +1801,108 @@ entry:
 declare float @llvm.sin.f32(float %Val)
 
 define void @fsin() {
+; MIPS32-LABEL: fsin:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    .cfi_offset 16, -8
+; MIPS32-NEXT:    addu $gp, $2, $25
+; MIPS32-NEXT:    lw $16, %got(g)($gp)
+; MIPS32-NEXT:    lh $1, 0($16)
+; MIPS32-NEXT:    fill.h $w0, $1
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS32-NEXT:    lw $25, %call16(sinf)($gp)
+; MIPS32-NEXT:    jalr $25
+; MIPS32-NEXT:    mtc1 $1, $f12
+; MIPS32-NEXT:    mfc1 $1, $f0
+; MIPS32-NEXT:    fill.w $w0, $1
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS32-NEXT:    sh $1, 0($16)
+; MIPS32-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 24
+;
+; MIPS64-N32-LABEL: fsin:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    addiu $sp, $sp, -32
+; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N32-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    .cfi_offset 31, -8
+; MIPS64-N32-NEXT:    .cfi_offset 28, -16
+; MIPS64-N32-NEXT:    .cfi_offset 16, -24
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fsin)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $gp, $1, %lo(%neg(%gp_rel(fsin)))
+; MIPS64-N32-NEXT:    lw $16, %got_disp(g)($gp)
+; MIPS64-N32-NEXT:    lh $1, 0($16)
+; MIPS64-N32-NEXT:    fill.h $w0, $1
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N32-NEXT:    lw $25, %call16(sinf)($gp)
+; MIPS64-N32-NEXT:    jalr $25
+; MIPS64-N32-NEXT:    mtc1 $1, $f12
+; MIPS64-N32-NEXT:    mfc1 $1, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $1
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N32-NEXT:    sh $1, 0($16)
+; MIPS64-N32-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    addiu $sp, $sp, 32
+;
+; MIPS64-N64-LABEL: fsin:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N64-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    .cfi_offset 31, -8
+; MIPS64-N64-NEXT:    .cfi_offset 28, -16
+; MIPS64-N64-NEXT:    .cfi_offset 16, -24
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fsin)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(fsin)))
+; MIPS64-N64-NEXT:    ld $16, %got_disp(g)($gp)
+; MIPS64-N64-NEXT:    lh $1, 0($16)
+; MIPS64-N64-NEXT:    fill.h $w0, $1
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N64-NEXT:    ld $25, %call16(sinf)($gp)
+; MIPS64-N64-NEXT:    jalr $25
+; MIPS64-N64-NEXT:    mtc1 $1, $f12
+; MIPS64-N64-NEXT:    mfc1 $1, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $1
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N64-NEXT:    sh $1, 0($16)
+; MIPS64-N64-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, 32
 entry:
-; ALL-LABEL: fsin:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL:            lh $[[R0:[0-9]+]]
-; ALL:            fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:            fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:            copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL-DAG:        mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; MIPS32-DAG:     lw $25, %call16(sinf)($gp)
-; MIPS64-N32-DAG: lw $25, %call16(sinf)($gp)
-; MIPS64-N64-DAG: ld $25, %call16(sinf)($gp)
-; ALL-DAG:        jalr $25
 
   %sin = call float @llvm.sin.f32(float %1)
   %2 = call i16 @llvm.convert.to.fp16.f32(float %sin)
 
-; ALL:            mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL:            fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL:            fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:            copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %2, i16* @g, align 2
-; ALL:            sh $[[R3]]
 
   ret void
 }
@@ -670,31 +1910,108 @@ entry:
 declare float @llvm.cos.f32(float %Val)
 
 define void @fcos() {
+; MIPS32-LABEL: fcos:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    .cfi_offset 16, -8
+; MIPS32-NEXT:    addu $gp, $2, $25
+; MIPS32-NEXT:    lw $16, %got(g)($gp)
+; MIPS32-NEXT:    lh $1, 0($16)
+; MIPS32-NEXT:    fill.h $w0, $1
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS32-NEXT:    lw $25, %call16(cosf)($gp)
+; MIPS32-NEXT:    jalr $25
+; MIPS32-NEXT:    mtc1 $1, $f12
+; MIPS32-NEXT:    mfc1 $1, $f0
+; MIPS32-NEXT:    fill.w $w0, $1
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS32-NEXT:    sh $1, 0($16)
+; MIPS32-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 24
+;
+; MIPS64-N32-LABEL: fcos:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    addiu $sp, $sp, -32
+; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N32-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    .cfi_offset 31, -8
+; MIPS64-N32-NEXT:    .cfi_offset 28, -16
+; MIPS64-N32-NEXT:    .cfi_offset 16, -24
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fcos)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $gp, $1, %lo(%neg(%gp_rel(fcos)))
+; MIPS64-N32-NEXT:    lw $16, %got_disp(g)($gp)
+; MIPS64-N32-NEXT:    lh $1, 0($16)
+; MIPS64-N32-NEXT:    fill.h $w0, $1
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N32-NEXT:    lw $25, %call16(cosf)($gp)
+; MIPS64-N32-NEXT:    jalr $25
+; MIPS64-N32-NEXT:    mtc1 $1, $f12
+; MIPS64-N32-NEXT:    mfc1 $1, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $1
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N32-NEXT:    sh $1, 0($16)
+; MIPS64-N32-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    addiu $sp, $sp, 32
+;
+; MIPS64-N64-LABEL: fcos:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N64-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    .cfi_offset 31, -8
+; MIPS64-N64-NEXT:    .cfi_offset 28, -16
+; MIPS64-N64-NEXT:    .cfi_offset 16, -24
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fcos)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(fcos)))
+; MIPS64-N64-NEXT:    ld $16, %got_disp(g)($gp)
+; MIPS64-N64-NEXT:    lh $1, 0($16)
+; MIPS64-N64-NEXT:    fill.h $w0, $1
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N64-NEXT:    ld $25, %call16(cosf)($gp)
+; MIPS64-N64-NEXT:    jalr $25
+; MIPS64-N64-NEXT:    mtc1 $1, $f12
+; MIPS64-N64-NEXT:    mfc1 $1, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $1
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N64-NEXT:    sh $1, 0($16)
+; MIPS64-N64-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, 32
 entry:
-; ALL-LABEL: fcos:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL:            lh $[[R0:[0-9]+]]
-; ALL:            fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:            fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:            copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL-DAG:        mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; MIPS32-DAG:     lw $25, %call16(cosf)($gp)
-; MIPS64-N32-DAG: lw $25, %call16(cosf)($gp)
-; MIPS64-N64-DAG: ld $25, %call16(cosf)($gp)
-; ALL-DAG:        jalr $25
 
   %cos = call float @llvm.cos.f32(float %1)
   %2 = call i16 @llvm.convert.to.fp16.f32(float %cos)
 
-; ALL:            mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL:            fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL:            fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:            copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %2, i16* @g, align 2
-; ALL:            sh $[[R3]]
 
   ret void
 }
@@ -702,30 +2019,107 @@ entry:
 declare float @llvm.exp.f32(float %Val)
 
 define void @fexp() {
+; MIPS32-LABEL: fexp:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    .cfi_offset 16, -8
+; MIPS32-NEXT:    addu $gp, $2, $25
+; MIPS32-NEXT:    lw $16, %got(g)($gp)
+; MIPS32-NEXT:    lh $1, 0($16)
+; MIPS32-NEXT:    fill.h $w0, $1
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS32-NEXT:    lw $25, %call16(expf)($gp)
+; MIPS32-NEXT:    jalr $25
+; MIPS32-NEXT:    mtc1 $1, $f12
+; MIPS32-NEXT:    mfc1 $1, $f0
+; MIPS32-NEXT:    fill.w $w0, $1
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS32-NEXT:    sh $1, 0($16)
+; MIPS32-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 24
+;
+; MIPS64-N32-LABEL: fexp:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    addiu $sp, $sp, -32
+; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N32-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    .cfi_offset 31, -8
+; MIPS64-N32-NEXT:    .cfi_offset 28, -16
+; MIPS64-N32-NEXT:    .cfi_offset 16, -24
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fexp)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $gp, $1, %lo(%neg(%gp_rel(fexp)))
+; MIPS64-N32-NEXT:    lw $16, %got_disp(g)($gp)
+; MIPS64-N32-NEXT:    lh $1, 0($16)
+; MIPS64-N32-NEXT:    fill.h $w0, $1
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N32-NEXT:    lw $25, %call16(expf)($gp)
+; MIPS64-N32-NEXT:    jalr $25
+; MIPS64-N32-NEXT:    mtc1 $1, $f12
+; MIPS64-N32-NEXT:    mfc1 $1, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $1
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N32-NEXT:    sh $1, 0($16)
+; MIPS64-N32-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    addiu $sp, $sp, 32
+;
+; MIPS64-N64-LABEL: fexp:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N64-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    .cfi_offset 31, -8
+; MIPS64-N64-NEXT:    .cfi_offset 28, -16
+; MIPS64-N64-NEXT:    .cfi_offset 16, -24
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fexp)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(fexp)))
+; MIPS64-N64-NEXT:    ld $16, %got_disp(g)($gp)
+; MIPS64-N64-NEXT:    lh $1, 0($16)
+; MIPS64-N64-NEXT:    fill.h $w0, $1
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N64-NEXT:    ld $25, %call16(expf)($gp)
+; MIPS64-N64-NEXT:    jalr $25
+; MIPS64-N64-NEXT:    mtc1 $1, $f12
+; MIPS64-N64-NEXT:    mfc1 $1, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $1
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N64-NEXT:    sh $1, 0($16)
+; MIPS64-N64-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, 32
 entry:
-; ALL-LABEL: fexp:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
-; ALL:            lh $[[R0:[0-9]+]]
-; ALL:            fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:            fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:            copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL-DAG:        mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; MIPS32-DAG:     lw $25, %call16(expf)($gp)
-; MIPS64-N32-DAG: lw $25, %call16(expf)($gp)
-; MIPS64-N64-DAG: ld $25, %call16(expf)($gp)
-; ALL-DAG:        jalr $25
 
   %exp = call float @llvm.exp.f32(float %1)
   %2 = call i16 @llvm.convert.to.fp16.f32(float %exp)
 
-; ALL:            mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL:            fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL:            fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:            copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %2, i16* @g, align 2
-; ALL:            sh $[[R3]]
 
   ret void
 }
@@ -733,31 +2127,108 @@ entry:
 declare float @llvm.exp2.f32(float %Val)
 
 define void @fexp2() {
+; MIPS32-LABEL: fexp2:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    .cfi_offset 16, -8
+; MIPS32-NEXT:    addu $gp, $2, $25
+; MIPS32-NEXT:    lw $16, %got(g)($gp)
+; MIPS32-NEXT:    lh $1, 0($16)
+; MIPS32-NEXT:    fill.h $w0, $1
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS32-NEXT:    lw $25, %call16(exp2f)($gp)
+; MIPS32-NEXT:    jalr $25
+; MIPS32-NEXT:    mtc1 $1, $f12
+; MIPS32-NEXT:    mfc1 $1, $f0
+; MIPS32-NEXT:    fill.w $w0, $1
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS32-NEXT:    sh $1, 0($16)
+; MIPS32-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 24
+;
+; MIPS64-N32-LABEL: fexp2:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    addiu $sp, $sp, -32
+; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N32-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    .cfi_offset 31, -8
+; MIPS64-N32-NEXT:    .cfi_offset 28, -16
+; MIPS64-N32-NEXT:    .cfi_offset 16, -24
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fexp2)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $gp, $1, %lo(%neg(%gp_rel(fexp2)))
+; MIPS64-N32-NEXT:    lw $16, %got_disp(g)($gp)
+; MIPS64-N32-NEXT:    lh $1, 0($16)
+; MIPS64-N32-NEXT:    fill.h $w0, $1
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N32-NEXT:    lw $25, %call16(exp2f)($gp)
+; MIPS64-N32-NEXT:    jalr $25
+; MIPS64-N32-NEXT:    mtc1 $1, $f12
+; MIPS64-N32-NEXT:    mfc1 $1, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $1
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N32-NEXT:    sh $1, 0($16)
+; MIPS64-N32-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    addiu $sp, $sp, 32
+;
+; MIPS64-N64-LABEL: fexp2:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N64-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    .cfi_offset 31, -8
+; MIPS64-N64-NEXT:    .cfi_offset 28, -16
+; MIPS64-N64-NEXT:    .cfi_offset 16, -24
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fexp2)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(fexp2)))
+; MIPS64-N64-NEXT:    ld $16, %got_disp(g)($gp)
+; MIPS64-N64-NEXT:    lh $1, 0($16)
+; MIPS64-N64-NEXT:    fill.h $w0, $1
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N64-NEXT:    ld $25, %call16(exp2f)($gp)
+; MIPS64-N64-NEXT:    jalr $25
+; MIPS64-N64-NEXT:    mtc1 $1, $f12
+; MIPS64-N64-NEXT:    mfc1 $1, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $1
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N64-NEXT:    sh $1, 0($16)
+; MIPS64-N64-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, 32
 entry:
-; ALL-LABEL: fexp2:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL:            lh $[[R0:[0-9]+]]
-; ALL:            fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:            fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:            copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL-DAG:        mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; MIPS32-DAG:     lw $25, %call16(exp2f)($gp)
-; MIPS64-N32-DAG: lw $25, %call16(exp2f)($gp)
-; MIPS64-N64-DAG: ld $25, %call16(exp2f)($gp)
-; ALL-DAG:        jalr $25
 
   %exp2 = call float @llvm.exp2.f32(float %1)
   %2 = call i16 @llvm.convert.to.fp16.f32(float %exp2)
 
-; ALL:            mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL:            fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL:            fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:            copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %2, i16* @g, align 2
-; ALL:            sh $[[R3]]
 
   ret void
 }
@@ -765,31 +2236,115 @@ entry:
 declare float @llvm.fma.f32(float, float, float)
 
 define void @ffma(float %b, float %c) {
+; MIPS32-LABEL: ffma:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    .cfi_offset 16, -8
+; MIPS32-NEXT:    addu $gp, $2, $25
+; MIPS32-NEXT:    mov.s $f0, $f12
+; MIPS32-NEXT:    mfc1 $6, $f14
+; MIPS32-NEXT:    lw $16, %got(g)($gp)
+; MIPS32-NEXT:    lh $1, 0($16)
+; MIPS32-NEXT:    fill.h $w1, $1
+; MIPS32-NEXT:    fexupr.w $w1, $w1
+; MIPS32-NEXT:    copy_s.w $1, $w1[0]
+; MIPS32-NEXT:    mtc1 $1, $f12
+; MIPS32-NEXT:    lw $25, %call16(fmaf)($gp)
+; MIPS32-NEXT:    jalr $25
+; MIPS32-NEXT:    mov.s $f14, $f0
+; MIPS32-NEXT:    mfc1 $1, $f0
+; MIPS32-NEXT:    fill.w $w0, $1
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS32-NEXT:    sh $1, 0($16)
+; MIPS32-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 24
+;
+; MIPS64-N32-LABEL: ffma:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    addiu $sp, $sp, -32
+; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N32-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    .cfi_offset 31, -8
+; MIPS64-N32-NEXT:    .cfi_offset 28, -16
+; MIPS64-N32-NEXT:    .cfi_offset 16, -24
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(ffma)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $gp, $1, %lo(%neg(%gp_rel(ffma)))
+; MIPS64-N32-NEXT:    mov.s $f14, $f13
+; MIPS64-N32-NEXT:    mov.s $f13, $f12
+; MIPS64-N32-NEXT:    lw $16, %got_disp(g)($gp)
+; MIPS64-N32-NEXT:    lh $1, 0($16)
+; MIPS64-N32-NEXT:    fill.h $w0, $1
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N32-NEXT:    lw $25, %call16(fmaf)($gp)
+; MIPS64-N32-NEXT:    jalr $25
+; MIPS64-N32-NEXT:    mtc1 $1, $f12
+; MIPS64-N32-NEXT:    mfc1 $1, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $1
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N32-NEXT:    sh $1, 0($16)
+; MIPS64-N32-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    addiu $sp, $sp, 32
+;
+; MIPS64-N64-LABEL: ffma:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N64-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    .cfi_offset 31, -8
+; MIPS64-N64-NEXT:    .cfi_offset 28, -16
+; MIPS64-N64-NEXT:    .cfi_offset 16, -24
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(ffma)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(ffma)))
+; MIPS64-N64-NEXT:    mov.s $f14, $f13
+; MIPS64-N64-NEXT:    mov.s $f13, $f12
+; MIPS64-N64-NEXT:    ld $16, %got_disp(g)($gp)
+; MIPS64-N64-NEXT:    lh $1, 0($16)
+; MIPS64-N64-NEXT:    fill.h $w0, $1
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N64-NEXT:    ld $25, %call16(fmaf)($gp)
+; MIPS64-N64-NEXT:    jalr $25
+; MIPS64-N64-NEXT:    mtc1 $1, $f12
+; MIPS64-N64-NEXT:    mfc1 $1, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $1
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N64-NEXT:    sh $1, 0($16)
+; MIPS64-N64-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, 32
 entry:
-; ALL-LABEL: ffma:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL:            lh $[[R0:[0-9]+]]
-; ALL:            fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:            fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:            copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL-DAG:        mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; MIPS32-DAG:     lw $25, %call16(fmaf)($gp)
-; MIPS64-N32-DAG: lw $25, %call16(fmaf)($gp)
-; MIPS64-N64-DAG: ld $25, %call16(fmaf)($gp)
-; ALL-DAG:        jalr $25
 
   %fma = call float @llvm.fma.f32(float %1, float %b, float %c)
   %2 = call i16 @llvm.convert.to.fp16.f32(float %fma)
 
-; ALL:            mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL:            fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL:            fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:            copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %2, i16* @g, align 2
-; ALL:            sh $[[R3]]
 
   ret void
 }
@@ -800,34 +2355,134 @@ entry:
 declare float @llvm.fmuladd.f32(float, float, float)
 
 define void @ffmuladd(float %b, float %c) {
+; MIPS32-O32-LABEL: ffmuladd:
+; MIPS32-O32:       # %bb.0: # %entry
+; MIPS32-O32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-O32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-O32-NEXT:    addu $1, $2, $25
+; MIPS32-O32-NEXT:    lw $1, %got(g)($1)
+; MIPS32-O32-NEXT:    lh $2, 0($1)
+; MIPS32-O32-NEXT:    fill.h $w0, $2
+; MIPS32-O32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-O32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS32-O32-NEXT:    mtc1 $2, $f0
+; MIPS32-O32-NEXT:    madd.s $f0, $f14, $f0, $f12
+; MIPS32-O32-NEXT:    mfc1 $2, $f0
+; MIPS32-O32-NEXT:    fill.w $w0, $2
+; MIPS32-O32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-O32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS32-O32-NEXT:    jr $ra
+; MIPS32-O32-NEXT:    sh $2, 0($1)
+;
+; MIPS64R5-N32-LABEL: ffmuladd:
+; MIPS64R5-N32:       # %bb.0: # %entry
+; MIPS64R5-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(ffmuladd)))
+; MIPS64R5-N32-NEXT:    addu $1, $1, $25
+; MIPS64R5-N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(ffmuladd)))
+; MIPS64R5-N32-NEXT:    lw $1, %got_disp(g)($1)
+; MIPS64R5-N32-NEXT:    lh $2, 0($1)
+; MIPS64R5-N32-NEXT:    fill.h $w0, $2
+; MIPS64R5-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64R5-N32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64R5-N32-NEXT:    mtc1 $2, $f0
+; MIPS64R5-N32-NEXT:    madd.s $f0, $f13, $f0, $f12
+; MIPS64R5-N32-NEXT:    mfc1 $2, $f0
+; MIPS64R5-N32-NEXT:    fill.w $w0, $2
+; MIPS64R5-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64R5-N32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64R5-N32-NEXT:    jr $ra
+; MIPS64R5-N32-NEXT:    sh $2, 0($1)
+;
+; MIPS64R5-N64-LABEL: ffmuladd:
+; MIPS64R5-N64:       # %bb.0: # %entry
+; MIPS64R5-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(ffmuladd)))
+; MIPS64R5-N64-NEXT:    daddu $1, $1, $25
+; MIPS64R5-N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(ffmuladd)))
+; MIPS64R5-N64-NEXT:    ld $1, %got_disp(g)($1)
+; MIPS64R5-N64-NEXT:    lh $2, 0($1)
+; MIPS64R5-N64-NEXT:    fill.h $w0, $2
+; MIPS64R5-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64R5-N64-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64R5-N64-NEXT:    mtc1 $2, $f0
+; MIPS64R5-N64-NEXT:    madd.s $f0, $f13, $f0, $f12
+; MIPS64R5-N64-NEXT:    mfc1 $2, $f0
+; MIPS64R5-N64-NEXT:    fill.w $w0, $2
+; MIPS64R5-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64R5-N64-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64R5-N64-NEXT:    jr $ra
+; MIPS64R5-N64-NEXT:    sh $2, 0($1)
+;
+; MIPSR6-O32-LABEL: ffmuladd:
+; MIPSR6-O32:       # %bb.0: # %entry
+; MIPSR6-O32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPSR6-O32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPSR6-O32-NEXT:    addu $1, $2, $25
+; MIPSR6-O32-NEXT:    lw $1, %got(g)($1)
+; MIPSR6-O32-NEXT:    lh $2, 0($1)
+; MIPSR6-O32-NEXT:    fill.h $w0, $2
+; MIPSR6-O32-NEXT:    fexupr.w $w0, $w0
+; MIPSR6-O32-NEXT:    copy_s.w $2, $w0[0]
+; MIPSR6-O32-NEXT:    mtc1 $2, $f0
+; MIPSR6-O32-NEXT:    mul.s $f0, $f0, $f12
+; MIPSR6-O32-NEXT:    add.s $f0, $f0, $f14
+; MIPSR6-O32-NEXT:    mfc1 $2, $f0
+; MIPSR6-O32-NEXT:    fill.w $w0, $2
+; MIPSR6-O32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPSR6-O32-NEXT:    copy_u.h $2, $w0[0]
+; MIPSR6-O32-NEXT:    jr $ra
+; MIPSR6-O32-NEXT:    sh $2, 0($1)
+;
+; MIPSR6-N32-LABEL: ffmuladd:
+; MIPSR6-N32:       # %bb.0: # %entry
+; MIPSR6-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(ffmuladd)))
+; MIPSR6-N32-NEXT:    addu $1, $1, $25
+; MIPSR6-N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(ffmuladd)))
+; MIPSR6-N32-NEXT:    lw $1, %got_disp(g)($1)
+; MIPSR6-N32-NEXT:    lh $2, 0($1)
+; MIPSR6-N32-NEXT:    fill.h $w0, $2
+; MIPSR6-N32-NEXT:    fexupr.w $w0, $w0
+; MIPSR6-N32-NEXT:    copy_s.w $2, $w0[0]
+; MIPSR6-N32-NEXT:    mtc1 $2, $f0
+; MIPSR6-N32-NEXT:    mul.s $f0, $f0, $f12
+; MIPSR6-N32-NEXT:    add.s $f0, $f0, $f13
+; MIPSR6-N32-NEXT:    mfc1 $2, $f0
+; MIPSR6-N32-NEXT:    fill.w $w0, $2
+; MIPSR6-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPSR6-N32-NEXT:    copy_u.h $2, $w0[0]
+; MIPSR6-N32-NEXT:    jr $ra
+; MIPSR6-N32-NEXT:    sh $2, 0($1)
+;
+; MIPSR6-N64-LABEL: ffmuladd:
+; MIPSR6-N64:       # %bb.0: # %entry
+; MIPSR6-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(ffmuladd)))
+; MIPSR6-N64-NEXT:    daddu $1, $1, $25
+; MIPSR6-N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(ffmuladd)))
+; MIPSR6-N64-NEXT:    ld $1, %got_disp(g)($1)
+; MIPSR6-N64-NEXT:    lh $2, 0($1)
+; MIPSR6-N64-NEXT:    fill.h $w0, $2
+; MIPSR6-N64-NEXT:    fexupr.w $w0, $w0
+; MIPSR6-N64-NEXT:    copy_s.w $2, $w0[0]
+; MIPSR6-N64-NEXT:    mtc1 $2, $f0
+; MIPSR6-N64-NEXT:    mul.s $f0, $f0, $f12
+; MIPSR6-N64-NEXT:    add.s $f0, $f0, $f13
+; MIPSR6-N64-NEXT:    mfc1 $2, $f0
+; MIPSR6-N64-NEXT:    fill.w $w0, $2
+; MIPSR6-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPSR6-N64-NEXT:    copy_u.h $2, $w0[0]
+; MIPSR6-N64-NEXT:    jr $ra
+; MIPSR6-N64-NEXT:    sh $2, 0($1)
 entry:
-; ALL-LABEL: ffmuladd:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL:            lh $[[R0:[0-9]+]]
-; ALL:            fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:            fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:            copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL:            mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; MIPS32-O32:     madd.s $f[[F1:[0-9]]], $f14, $f[[F0]], $f12
 ; MIPS32-N32:     madd.s $f[[F1:[0-9]]], $f13, $f[[F0]], $f12
 ; MIPS32-N64:     madd.s $f[[F1:[0-9]]], $f13, $f[[F0]], $f12
-; MIPSR6:         mul.s $f[[F2:[0-9]+]], $f[[F0]], $f12
-; MIPSR6-O32:     add.s $f[[F1:[0-9]+]], $f[[F2]], $f14
-; MIPSR6-N32:     add.s $f[[F1:[0-9]+]], $f[[F2]], $f13
-; MIPSR6-N64:     add.s $f[[F1:[0-9]+]], $f[[F2]], $f13
 
   %fmuladd = call float @llvm.fmuladd.f32(float %1, float %b, float %c)
   %2 = call i16 @llvm.convert.to.fp16.f32(float %fmuladd)
 
-; ALL:            mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL:            fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL:            fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:            copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %2, i16* @g, align 2
-; ALL:            sh $[[R3]]
 
   ret void
 }
@@ -835,60 +2490,184 @@ entry:
 declare float @llvm.fabs.f32(float %Val)
 
 define void @ffabs() {
+; MIPS32-LABEL: ffabs:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addu $1, $2, $25
+; MIPS32-NEXT:    lw $1, %got(g)($1)
+; MIPS32-NEXT:    lh $2, 0($1)
+; MIPS32-NEXT:    fill.h $w0, $2
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS32-NEXT:    mtc1 $2, $f0
+; MIPS32-NEXT:    abs.s $f0, $f0
+; MIPS32-NEXT:    mfc1 $2, $f0
+; MIPS32-NEXT:    fill.w $w0, $2
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    sh $2, 0($1)
+;
+; MIPS64-N32-LABEL: ffabs:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(ffabs)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(ffabs)))
+; MIPS64-N32-NEXT:    lw $1, %got_disp(g)($1)
+; MIPS64-N32-NEXT:    lh $2, 0($1)
+; MIPS64-N32-NEXT:    fill.h $w0, $2
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64-N32-NEXT:    mtc1 $2, $f0
+; MIPS64-N32-NEXT:    abs.s $f0, $f0
+; MIPS64-N32-NEXT:    mfc1 $2, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $2
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    sh $2, 0($1)
+;
+; MIPS64-N64-LABEL: ffabs:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(ffabs)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(ffabs)))
+; MIPS64-N64-NEXT:    ld $1, %got_disp(g)($1)
+; MIPS64-N64-NEXT:    lh $2, 0($1)
+; MIPS64-N64-NEXT:    fill.h $w0, $2
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64-N64-NEXT:    mtc1 $2, $f0
+; MIPS64-N64-NEXT:    abs.s $f0, $f0
+; MIPS64-N64-NEXT:    mfc1 $2, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $2
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    sh $2, 0($1)
 entry:
-; ALL-LABEL: ffabs:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL:            lh $[[R0:[0-9]+]]
-; ALL:            fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:            fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:            copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL:            mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; ALL:            abs.s $f[[F1:[0-9]+]], $f[[F0]]
 
   %fabs = call float @llvm.fabs.f32(float %1)
   %2 = call i16 @llvm.convert.to.fp16.f32(float %fabs)
 
-; ALL:            mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL:            fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL:            fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:            copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %2, i16* @g, align 2
 
-; ALL:            sh $[[R3]]
   ret void
 }
 
 declare float @llvm.minnum.f32(float %Val, float %b)
 
 define void @fminnum(float %b) {
+; MIPS32-LABEL: fminnum:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    .cfi_offset 16, -8
+; MIPS32-NEXT:    addu $gp, $2, $25
+; MIPS32-NEXT:    mov.s $f14, $f12
+; MIPS32-NEXT:    lw $16, %got(g)($gp)
+; MIPS32-NEXT:    lh $1, 0($16)
+; MIPS32-NEXT:    fill.h $w0, $1
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS32-NEXT:    lw $25, %call16(fminf)($gp)
+; MIPS32-NEXT:    jalr $25
+; MIPS32-NEXT:    mtc1 $1, $f12
+; MIPS32-NEXT:    mfc1 $1, $f0
+; MIPS32-NEXT:    fill.w $w0, $1
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS32-NEXT:    sh $1, 0($16)
+; MIPS32-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 24
+;
+; MIPS64-N32-LABEL: fminnum:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    addiu $sp, $sp, -32
+; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N32-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    .cfi_offset 31, -8
+; MIPS64-N32-NEXT:    .cfi_offset 28, -16
+; MIPS64-N32-NEXT:    .cfi_offset 16, -24
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fminnum)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $gp, $1, %lo(%neg(%gp_rel(fminnum)))
+; MIPS64-N32-NEXT:    mov.s $f13, $f12
+; MIPS64-N32-NEXT:    lw $16, %got_disp(g)($gp)
+; MIPS64-N32-NEXT:    lh $1, 0($16)
+; MIPS64-N32-NEXT:    fill.h $w0, $1
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N32-NEXT:    lw $25, %call16(fminf)($gp)
+; MIPS64-N32-NEXT:    jalr $25
+; MIPS64-N32-NEXT:    mtc1 $1, $f12
+; MIPS64-N32-NEXT:    mfc1 $1, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $1
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N32-NEXT:    sh $1, 0($16)
+; MIPS64-N32-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    addiu $sp, $sp, 32
+;
+; MIPS64-N64-LABEL: fminnum:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N64-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    .cfi_offset 31, -8
+; MIPS64-N64-NEXT:    .cfi_offset 28, -16
+; MIPS64-N64-NEXT:    .cfi_offset 16, -24
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fminnum)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(fminnum)))
+; MIPS64-N64-NEXT:    mov.s $f13, $f12
+; MIPS64-N64-NEXT:    ld $16, %got_disp(g)($gp)
+; MIPS64-N64-NEXT:    lh $1, 0($16)
+; MIPS64-N64-NEXT:    fill.h $w0, $1
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N64-NEXT:    ld $25, %call16(fminf)($gp)
+; MIPS64-N64-NEXT:    jalr $25
+; MIPS64-N64-NEXT:    mtc1 $1, $f12
+; MIPS64-N64-NEXT:    mfc1 $1, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $1
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N64-NEXT:    sh $1, 0($16)
+; MIPS64-N64-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, 32
 entry:
-; ALL-LABEL: fminnum:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL:            lh $[[R0:[0-9]+]]
-; ALL:            fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:            fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:            copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL-DAG:        mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; MIPS32-DAG:     lw $25, %call16(fminf)($gp)
-; MIPS64-N32-DAG: lw $25, %call16(fminf)($gp)
-; MIPS64-N64-DAG: ld $25, %call16(fminf)($gp)
-; ALL-DAG:        jalr $25
 
   %minnum = call float @llvm.minnum.f32(float %1, float %b)
   %2 = call i16 @llvm.convert.to.fp16.f32(float %minnum)
 
-; ALL:            mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL:            fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL:            fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:            copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %2, i16* @g, align 2
-; ALL:            sh $[[R3]]
 
   ret void
 }
@@ -896,31 +2675,111 @@ entry:
 declare float @llvm.maxnum.f32(float %Val, float %b)
 
 define void @fmaxnum(float %b) {
+; MIPS32-LABEL: fmaxnum:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    .cfi_offset 16, -8
+; MIPS32-NEXT:    addu $gp, $2, $25
+; MIPS32-NEXT:    mov.s $f14, $f12
+; MIPS32-NEXT:    lw $16, %got(g)($gp)
+; MIPS32-NEXT:    lh $1, 0($16)
+; MIPS32-NEXT:    fill.h $w0, $1
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS32-NEXT:    lw $25, %call16(fmaxf)($gp)
+; MIPS32-NEXT:    jalr $25
+; MIPS32-NEXT:    mtc1 $1, $f12
+; MIPS32-NEXT:    mfc1 $1, $f0
+; MIPS32-NEXT:    fill.w $w0, $1
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS32-NEXT:    sh $1, 0($16)
+; MIPS32-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 24
+;
+; MIPS64-N32-LABEL: fmaxnum:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    addiu $sp, $sp, -32
+; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N32-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    .cfi_offset 31, -8
+; MIPS64-N32-NEXT:    .cfi_offset 28, -16
+; MIPS64-N32-NEXT:    .cfi_offset 16, -24
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fmaxnum)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $gp, $1, %lo(%neg(%gp_rel(fmaxnum)))
+; MIPS64-N32-NEXT:    mov.s $f13, $f12
+; MIPS64-N32-NEXT:    lw $16, %got_disp(g)($gp)
+; MIPS64-N32-NEXT:    lh $1, 0($16)
+; MIPS64-N32-NEXT:    fill.h $w0, $1
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N32-NEXT:    lw $25, %call16(fmaxf)($gp)
+; MIPS64-N32-NEXT:    jalr $25
+; MIPS64-N32-NEXT:    mtc1 $1, $f12
+; MIPS64-N32-NEXT:    mfc1 $1, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $1
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N32-NEXT:    sh $1, 0($16)
+; MIPS64-N32-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    addiu $sp, $sp, 32
+;
+; MIPS64-N64-LABEL: fmaxnum:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N64-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    .cfi_offset 31, -8
+; MIPS64-N64-NEXT:    .cfi_offset 28, -16
+; MIPS64-N64-NEXT:    .cfi_offset 16, -24
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fmaxnum)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(fmaxnum)))
+; MIPS64-N64-NEXT:    mov.s $f13, $f12
+; MIPS64-N64-NEXT:    ld $16, %got_disp(g)($gp)
+; MIPS64-N64-NEXT:    lh $1, 0($16)
+; MIPS64-N64-NEXT:    fill.h $w0, $1
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N64-NEXT:    ld $25, %call16(fmaxf)($gp)
+; MIPS64-N64-NEXT:    jalr $25
+; MIPS64-N64-NEXT:    mtc1 $1, $f12
+; MIPS64-N64-NEXT:    mfc1 $1, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $1
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N64-NEXT:    sh $1, 0($16)
+; MIPS64-N64-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, 32
 entry:
-; ALL-LABEL: fmaxnum:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL:            lh $[[R0:[0-9]+]]
-; ALL:            fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:            fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:            copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL-DAG:        mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; MIPS32-DAG:     lw $25, %call16(fmaxf)($gp)
-; MIPS64-N32-DAG: lw $25, %call16(fmaxf)($gp)
-; MIPS64-N64-DAG: ld $25, %call16(fmaxf)($gp)
-; ALL-DAG:        jalr $25
 
   %maxnum = call float @llvm.maxnum.f32(float %1, float %b)
   %2 = call i16 @llvm.convert.to.fp16.f32(float %maxnum)
 
-; ALL:            mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL:            fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL:            fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:            copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %2, i16* @g, align 2
-; ALL:             sh $[[R3]]
 
   ret void
 }
@@ -930,28 +2789,72 @@ entry:
 declare float @llvm.copysign.f32(float %Val, float %b)
 
 define void @fcopysign(float %b) {
+; MIPS32-LABEL: fcopysign:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addu $1, $2, $25
+; MIPS32-NEXT:    lw $1, %got(g)($1)
+; MIPS32-NEXT:    lh $2, 0($1)
+; MIPS32-NEXT:    fill.h $w0, $2
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS32-NEXT:    mfc1 $3, $f12
+; MIPS32-NEXT:    ext $3, $3, 31, 1
+; MIPS32-NEXT:    ins $2, $3, 31, 1
+; MIPS32-NEXT:    fill.w $w0, $2
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    sh $2, 0($1)
+;
+; MIPS64-N32-LABEL: fcopysign:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fcopysign)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(fcopysign)))
+; MIPS64-N32-NEXT:    lw $1, %got_disp(g)($1)
+; MIPS64-N32-NEXT:    lh $2, 0($1)
+; MIPS64-N32-NEXT:    fill.h $w0, $2
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64-N32-NEXT:    mfc1 $3, $f12
+; MIPS64-N32-NEXT:    ext $3, $3, 31, 1
+; MIPS64-N32-NEXT:    ins $2, $3, 31, 1
+; MIPS64-N32-NEXT:    fill.w $w0, $2
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    sh $2, 0($1)
+;
+; MIPS64-N64-LABEL: fcopysign:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fcopysign)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(fcopysign)))
+; MIPS64-N64-NEXT:    ld $1, %got_disp(g)($1)
+; MIPS64-N64-NEXT:    lh $2, 0($1)
+; MIPS64-N64-NEXT:    fill.h $w0, $2
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64-N64-NEXT:    mfc1 $3, $f12
+; MIPS64-N64-NEXT:    ext $3, $3, 31, 1
+; MIPS64-N64-NEXT:    ins $2, $3, 31, 1
+; MIPS64-N64-NEXT:    fill.w $w0, $2
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    sh $2, 0($1)
 entry:
-; ALL-LABEL: fcopysign:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL:            lh $[[R0:[0-9]+]]
-; ALL:            fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:            fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:            copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
 
   %copysign = call float @llvm.copysign.f32(float %1, float %b)
   %2 = call i16 @llvm.convert.to.fp16.f32(float %copysign)
 
-; ALL:            mfc1 $[[R2:[0-9]+]], $f12
-; ALL:            ext $[[R3:[0-9]+]], $3, 31, 1
-; ALL:            ins $[[R1]], $[[R3]], 31, 1
-; ALL:            fill.w $w[[W2:[0-9]+]], $[[R1]]
-; ALL:            fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:            copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %2, i16* @g, align 2
-; ALL:            sh $[[R3]]
 
   ret void
 }
@@ -959,31 +2862,108 @@ entry:
 declare float @llvm.floor.f32(float %Val)
 
 define void @ffloor() {
+; MIPS32-LABEL: ffloor:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    .cfi_offset 16, -8
+; MIPS32-NEXT:    addu $gp, $2, $25
+; MIPS32-NEXT:    lw $16, %got(g)($gp)
+; MIPS32-NEXT:    lh $1, 0($16)
+; MIPS32-NEXT:    fill.h $w0, $1
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS32-NEXT:    lw $25, %call16(floorf)($gp)
+; MIPS32-NEXT:    jalr $25
+; MIPS32-NEXT:    mtc1 $1, $f12
+; MIPS32-NEXT:    mfc1 $1, $f0
+; MIPS32-NEXT:    fill.w $w0, $1
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS32-NEXT:    sh $1, 0($16)
+; MIPS32-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 24
+;
+; MIPS64-N32-LABEL: ffloor:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    addiu $sp, $sp, -32
+; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N32-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    .cfi_offset 31, -8
+; MIPS64-N32-NEXT:    .cfi_offset 28, -16
+; MIPS64-N32-NEXT:    .cfi_offset 16, -24
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(ffloor)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $gp, $1, %lo(%neg(%gp_rel(ffloor)))
+; MIPS64-N32-NEXT:    lw $16, %got_disp(g)($gp)
+; MIPS64-N32-NEXT:    lh $1, 0($16)
+; MIPS64-N32-NEXT:    fill.h $w0, $1
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N32-NEXT:    lw $25, %call16(floorf)($gp)
+; MIPS64-N32-NEXT:    jalr $25
+; MIPS64-N32-NEXT:    mtc1 $1, $f12
+; MIPS64-N32-NEXT:    mfc1 $1, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $1
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N32-NEXT:    sh $1, 0($16)
+; MIPS64-N32-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    addiu $sp, $sp, 32
+;
+; MIPS64-N64-LABEL: ffloor:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N64-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    .cfi_offset 31, -8
+; MIPS64-N64-NEXT:    .cfi_offset 28, -16
+; MIPS64-N64-NEXT:    .cfi_offset 16, -24
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(ffloor)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(ffloor)))
+; MIPS64-N64-NEXT:    ld $16, %got_disp(g)($gp)
+; MIPS64-N64-NEXT:    lh $1, 0($16)
+; MIPS64-N64-NEXT:    fill.h $w0, $1
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N64-NEXT:    ld $25, %call16(floorf)($gp)
+; MIPS64-N64-NEXT:    jalr $25
+; MIPS64-N64-NEXT:    mtc1 $1, $f12
+; MIPS64-N64-NEXT:    mfc1 $1, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $1
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N64-NEXT:    sh $1, 0($16)
+; MIPS64-N64-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, 32
 entry:
-; ALL-LABEL: ffloor:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL:            lh $[[R0:[0-9]+]]
-; ALL:            fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:            fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:            copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL-DAG:        mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; MIPS32-DAG:     lw $25, %call16(floorf)($gp)
-; MIPS64-N32-DAG: lw $25, %call16(floorf)($gp)
-; MIPS64-N64-DAG: ld $25, %call16(floorf)($gp)
-; ALL-DAG:        jalr $25
 
   %floor = call float @llvm.floor.f32(float %1)
   %2 = call i16 @llvm.convert.to.fp16.f32(float %floor)
 
-; ALL:            mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL:            fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL:            fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:            copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %2, i16* @g, align 2
-; ALL:            sh $[[R3]]
 
   ret void
 }
@@ -991,31 +2971,108 @@ entry:
 declare float @llvm.ceil.f32(float %Val)
 
 define void @fceil() {
+; MIPS32-LABEL: fceil:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    .cfi_offset 16, -8
+; MIPS32-NEXT:    addu $gp, $2, $25
+; MIPS32-NEXT:    lw $16, %got(g)($gp)
+; MIPS32-NEXT:    lh $1, 0($16)
+; MIPS32-NEXT:    fill.h $w0, $1
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS32-NEXT:    lw $25, %call16(ceilf)($gp)
+; MIPS32-NEXT:    jalr $25
+; MIPS32-NEXT:    mtc1 $1, $f12
+; MIPS32-NEXT:    mfc1 $1, $f0
+; MIPS32-NEXT:    fill.w $w0, $1
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS32-NEXT:    sh $1, 0($16)
+; MIPS32-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 24
+;
+; MIPS64-N32-LABEL: fceil:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    addiu $sp, $sp, -32
+; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N32-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    .cfi_offset 31, -8
+; MIPS64-N32-NEXT:    .cfi_offset 28, -16
+; MIPS64-N32-NEXT:    .cfi_offset 16, -24
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fceil)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $gp, $1, %lo(%neg(%gp_rel(fceil)))
+; MIPS64-N32-NEXT:    lw $16, %got_disp(g)($gp)
+; MIPS64-N32-NEXT:    lh $1, 0($16)
+; MIPS64-N32-NEXT:    fill.h $w0, $1
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N32-NEXT:    lw $25, %call16(ceilf)($gp)
+; MIPS64-N32-NEXT:    jalr $25
+; MIPS64-N32-NEXT:    mtc1 $1, $f12
+; MIPS64-N32-NEXT:    mfc1 $1, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $1
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N32-NEXT:    sh $1, 0($16)
+; MIPS64-N32-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    addiu $sp, $sp, 32
+;
+; MIPS64-N64-LABEL: fceil:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N64-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    .cfi_offset 31, -8
+; MIPS64-N64-NEXT:    .cfi_offset 28, -16
+; MIPS64-N64-NEXT:    .cfi_offset 16, -24
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fceil)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(fceil)))
+; MIPS64-N64-NEXT:    ld $16, %got_disp(g)($gp)
+; MIPS64-N64-NEXT:    lh $1, 0($16)
+; MIPS64-N64-NEXT:    fill.h $w0, $1
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N64-NEXT:    ld $25, %call16(ceilf)($gp)
+; MIPS64-N64-NEXT:    jalr $25
+; MIPS64-N64-NEXT:    mtc1 $1, $f12
+; MIPS64-N64-NEXT:    mfc1 $1, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $1
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N64-NEXT:    sh $1, 0($16)
+; MIPS64-N64-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, 32
 entry:
-; ALL-LABEL: fceil:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL:            lh $[[R0:[0-9]+]]
-; ALL:            fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:            fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:            copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL-DAG:        mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; MIPS32-DAG:     lw $25, %call16(ceilf)($gp)
-; MIPS64-N32-DAG: lw $25, %call16(ceilf)($gp)
-; MIPS64-N64-DAG: ld $25, %call16(ceilf)($gp)
-; ALL-DAG:        jalr $25
 
   %ceil = call float @llvm.ceil.f32(float %1)
   %2 = call i16 @llvm.convert.to.fp16.f32(float %ceil)
 
-; ALL:            mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL:            fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL:            fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:            copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %2, i16* @g, align 2
-; ALL:            sh $[[R3]]
 
   ret void
 }
@@ -1023,31 +3080,108 @@ entry:
 declare float @llvm.trunc.f32(float %Val)
 
 define void @ftrunc() {
+; MIPS32-LABEL: ftrunc:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    .cfi_offset 16, -8
+; MIPS32-NEXT:    addu $gp, $2, $25
+; MIPS32-NEXT:    lw $16, %got(g)($gp)
+; MIPS32-NEXT:    lh $1, 0($16)
+; MIPS32-NEXT:    fill.h $w0, $1
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS32-NEXT:    lw $25, %call16(truncf)($gp)
+; MIPS32-NEXT:    jalr $25
+; MIPS32-NEXT:    mtc1 $1, $f12
+; MIPS32-NEXT:    mfc1 $1, $f0
+; MIPS32-NEXT:    fill.w $w0, $1
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS32-NEXT:    sh $1, 0($16)
+; MIPS32-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 24
+;
+; MIPS64-N32-LABEL: ftrunc:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    addiu $sp, $sp, -32
+; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N32-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    .cfi_offset 31, -8
+; MIPS64-N32-NEXT:    .cfi_offset 28, -16
+; MIPS64-N32-NEXT:    .cfi_offset 16, -24
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(ftrunc)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $gp, $1, %lo(%neg(%gp_rel(ftrunc)))
+; MIPS64-N32-NEXT:    lw $16, %got_disp(g)($gp)
+; MIPS64-N32-NEXT:    lh $1, 0($16)
+; MIPS64-N32-NEXT:    fill.h $w0, $1
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N32-NEXT:    lw $25, %call16(truncf)($gp)
+; MIPS64-N32-NEXT:    jalr $25
+; MIPS64-N32-NEXT:    mtc1 $1, $f12
+; MIPS64-N32-NEXT:    mfc1 $1, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $1
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N32-NEXT:    sh $1, 0($16)
+; MIPS64-N32-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    addiu $sp, $sp, 32
+;
+; MIPS64-N64-LABEL: ftrunc:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N64-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    .cfi_offset 31, -8
+; MIPS64-N64-NEXT:    .cfi_offset 28, -16
+; MIPS64-N64-NEXT:    .cfi_offset 16, -24
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(ftrunc)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(ftrunc)))
+; MIPS64-N64-NEXT:    ld $16, %got_disp(g)($gp)
+; MIPS64-N64-NEXT:    lh $1, 0($16)
+; MIPS64-N64-NEXT:    fill.h $w0, $1
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N64-NEXT:    ld $25, %call16(truncf)($gp)
+; MIPS64-N64-NEXT:    jalr $25
+; MIPS64-N64-NEXT:    mtc1 $1, $f12
+; MIPS64-N64-NEXT:    mfc1 $1, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $1
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N64-NEXT:    sh $1, 0($16)
+; MIPS64-N64-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, 32
 entry:
-; ALL-LABEL: ftrunc:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL:            lh $[[R0:[0-9]+]]
-; ALL:            fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:            fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:            copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL-DAG:        mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; MIPS32-DAG:     lw $25, %call16(truncf)($gp)
-; MIPS64-N32-DAG: lw $25, %call16(truncf)($gp)
-; MIPS64-N64-DAG: ld $25, %call16(truncf)($gp)
-; ALL-DAG:        jalr $25
 
   %trunc = call float @llvm.trunc.f32(float %1)
   %2 = call i16 @llvm.convert.to.fp16.f32(float %trunc)
 
-; ALL:            mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL:            fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL:            fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:            copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %2, i16* @g, align 2
-; ALL:            sh $[[R3]]
 
   ret void
 }
@@ -1055,61 +3189,215 @@ entry:
 declare float @llvm.rint.f32(float %Val)
 
 define void @frint() {
+; MIPS32-LABEL: frint:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    .cfi_offset 16, -8
+; MIPS32-NEXT:    addu $gp, $2, $25
+; MIPS32-NEXT:    lw $16, %got(g)($gp)
+; MIPS32-NEXT:    lh $1, 0($16)
+; MIPS32-NEXT:    fill.h $w0, $1
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS32-NEXT:    lw $25, %call16(rintf)($gp)
+; MIPS32-NEXT:    jalr $25
+; MIPS32-NEXT:    mtc1 $1, $f12
+; MIPS32-NEXT:    mfc1 $1, $f0
+; MIPS32-NEXT:    fill.w $w0, $1
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS32-NEXT:    sh $1, 0($16)
+; MIPS32-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 24
+;
+; MIPS64-N32-LABEL: frint:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    addiu $sp, $sp, -32
+; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N32-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    .cfi_offset 31, -8
+; MIPS64-N32-NEXT:    .cfi_offset 28, -16
+; MIPS64-N32-NEXT:    .cfi_offset 16, -24
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(frint)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $gp, $1, %lo(%neg(%gp_rel(frint)))
+; MIPS64-N32-NEXT:    lw $16, %got_disp(g)($gp)
+; MIPS64-N32-NEXT:    lh $1, 0($16)
+; MIPS64-N32-NEXT:    fill.h $w0, $1
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N32-NEXT:    lw $25, %call16(rintf)($gp)
+; MIPS64-N32-NEXT:    jalr $25
+; MIPS64-N32-NEXT:    mtc1 $1, $f12
+; MIPS64-N32-NEXT:    mfc1 $1, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $1
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N32-NEXT:    sh $1, 0($16)
+; MIPS64-N32-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    addiu $sp, $sp, 32
+;
+; MIPS64-N64-LABEL: frint:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N64-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    .cfi_offset 31, -8
+; MIPS64-N64-NEXT:    .cfi_offset 28, -16
+; MIPS64-N64-NEXT:    .cfi_offset 16, -24
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(frint)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(frint)))
+; MIPS64-N64-NEXT:    ld $16, %got_disp(g)($gp)
+; MIPS64-N64-NEXT:    lh $1, 0($16)
+; MIPS64-N64-NEXT:    fill.h $w0, $1
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N64-NEXT:    ld $25, %call16(rintf)($gp)
+; MIPS64-N64-NEXT:    jalr $25
+; MIPS64-N64-NEXT:    mtc1 $1, $f12
+; MIPS64-N64-NEXT:    mfc1 $1, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $1
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N64-NEXT:    sh $1, 0($16)
+; MIPS64-N64-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, 32
 entry:
-; ALL-LABEL: frint:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL:            lh $[[R0:[0-9]+]]
-; ALL:            fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:            fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:            copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL-DAG:        mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; MIPS32-DAG:     lw $25, %call16(rintf)($gp)
-; MIPS64-N32-DAG: lw $25, %call16(rintf)($gp)
-; MIPS64-N64-DAG: ld $25, %call16(rintf)($gp)
-; ALL-DAG:        jalr $25
   %rint = call float @llvm.rint.f32(float %1)
   %2 = call i16 @llvm.convert.to.fp16.f32(float %rint)
 
-; ALL:            mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL:            fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL:            fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:            copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
   store i16 %2, i16* @g, align 2
 
-; ALL:            sh $[[R3]]
   ret void
 }
 
 declare float @llvm.nearbyint.f32(float %Val)
 
 define void @fnearbyint() {
+; MIPS32-LABEL: fnearbyint:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    .cfi_offset 16, -8
+; MIPS32-NEXT:    addu $gp, $2, $25
+; MIPS32-NEXT:    lw $16, %got(g)($gp)
+; MIPS32-NEXT:    lh $1, 0($16)
+; MIPS32-NEXT:    fill.h $w0, $1
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS32-NEXT:    lw $25, %call16(nearbyintf)($gp)
+; MIPS32-NEXT:    jalr $25
+; MIPS32-NEXT:    mtc1 $1, $f12
+; MIPS32-NEXT:    mfc1 $1, $f0
+; MIPS32-NEXT:    fill.w $w0, $1
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS32-NEXT:    sh $1, 0($16)
+; MIPS32-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 24
+;
+; MIPS64-N32-LABEL: fnearbyint:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    addiu $sp, $sp, -32
+; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N32-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    .cfi_offset 31, -8
+; MIPS64-N32-NEXT:    .cfi_offset 28, -16
+; MIPS64-N32-NEXT:    .cfi_offset 16, -24
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fnearbyint)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $gp, $1, %lo(%neg(%gp_rel(fnearbyint)))
+; MIPS64-N32-NEXT:    lw $16, %got_disp(g)($gp)
+; MIPS64-N32-NEXT:    lh $1, 0($16)
+; MIPS64-N32-NEXT:    fill.h $w0, $1
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N32-NEXT:    lw $25, %call16(nearbyintf)($gp)
+; MIPS64-N32-NEXT:    jalr $25
+; MIPS64-N32-NEXT:    mtc1 $1, $f12
+; MIPS64-N32-NEXT:    mfc1 $1, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $1
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N32-NEXT:    sh $1, 0($16)
+; MIPS64-N32-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    addiu $sp, $sp, 32
+;
+; MIPS64-N64-LABEL: fnearbyint:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N64-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    .cfi_offset 31, -8
+; MIPS64-N64-NEXT:    .cfi_offset 28, -16
+; MIPS64-N64-NEXT:    .cfi_offset 16, -24
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fnearbyint)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(fnearbyint)))
+; MIPS64-N64-NEXT:    ld $16, %got_disp(g)($gp)
+; MIPS64-N64-NEXT:    lh $1, 0($16)
+; MIPS64-N64-NEXT:    fill.h $w0, $1
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N64-NEXT:    ld $25, %call16(nearbyintf)($gp)
+; MIPS64-N64-NEXT:    jalr $25
+; MIPS64-N64-NEXT:    mtc1 $1, $f12
+; MIPS64-N64-NEXT:    mfc1 $1, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $1
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N64-NEXT:    sh $1, 0($16)
+; MIPS64-N64-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, 32
 entry:
-; ALL-LABEL: fnearbyint:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL:            lh $[[R0:[0-9]+]]
-; ALL:            fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:            fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:            copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL-DAG:        mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; MIPS32-DAG:     lw $25, %call16(nearbyintf)($gp)
-; MIPS64-N32-DAG: lw $25, %call16(nearbyintf)($gp)
-; MIPS64-N64-DAG: ld $25, %call16(nearbyintf)($gp)
-; ALL-DAG:        jalr $25
 
   %nearbyint = call float @llvm.nearbyint.f32(float %1)
   %2 = call i16 @llvm.convert.to.fp16.f32(float %nearbyint)
 
-; ALL:            mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL:            fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL:            fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:            copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %2, i16* @g, align 2
-; ALL:            sh $[[R3]]
 
   ret void
 }
@@ -1117,31 +3405,108 @@ entry:
 declare float @llvm.round.f32(float %Val)
 
 define void @fround() {
+; MIPS32-LABEL: fround:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    .cfi_offset 16, -8
+; MIPS32-NEXT:    addu $gp, $2, $25
+; MIPS32-NEXT:    lw $16, %got(g)($gp)
+; MIPS32-NEXT:    lh $1, 0($16)
+; MIPS32-NEXT:    fill.h $w0, $1
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS32-NEXT:    lw $25, %call16(roundf)($gp)
+; MIPS32-NEXT:    jalr $25
+; MIPS32-NEXT:    mtc1 $1, $f12
+; MIPS32-NEXT:    mfc1 $1, $f0
+; MIPS32-NEXT:    fill.w $w0, $1
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS32-NEXT:    sh $1, 0($16)
+; MIPS32-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 24
+;
+; MIPS64-N32-LABEL: fround:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    addiu $sp, $sp, -32
+; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N32-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    .cfi_offset 31, -8
+; MIPS64-N32-NEXT:    .cfi_offset 28, -16
+; MIPS64-N32-NEXT:    .cfi_offset 16, -24
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fround)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $gp, $1, %lo(%neg(%gp_rel(fround)))
+; MIPS64-N32-NEXT:    lw $16, %got_disp(g)($gp)
+; MIPS64-N32-NEXT:    lh $1, 0($16)
+; MIPS64-N32-NEXT:    fill.h $w0, $1
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N32-NEXT:    lw $25, %call16(roundf)($gp)
+; MIPS64-N32-NEXT:    jalr $25
+; MIPS64-N32-NEXT:    mtc1 $1, $f12
+; MIPS64-N32-NEXT:    mfc1 $1, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $1
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N32-NEXT:    sh $1, 0($16)
+; MIPS64-N32-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    addiu $sp, $sp, 32
+;
+; MIPS64-N64-LABEL: fround:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N64-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    .cfi_offset 31, -8
+; MIPS64-N64-NEXT:    .cfi_offset 28, -16
+; MIPS64-N64-NEXT:    .cfi_offset 16, -24
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fround)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(fround)))
+; MIPS64-N64-NEXT:    ld $16, %got_disp(g)($gp)
+; MIPS64-N64-NEXT:    lh $1, 0($16)
+; MIPS64-N64-NEXT:    fill.h $w0, $1
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N64-NEXT:    ld $25, %call16(roundf)($gp)
+; MIPS64-N64-NEXT:    jalr $25
+; MIPS64-N64-NEXT:    mtc1 $1, $f12
+; MIPS64-N64-NEXT:    mfc1 $1, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $1
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N64-NEXT:    sh $1, 0($16)
+; MIPS64-N64-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, 32
 entry:
-; ALL-LABEL: fround:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL:            lh $[[R0:[0-9]+]]
-; ALL:            fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:            fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:            copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL-DAG:        mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; MIPS32-DAG:     lw $25, %call16(roundf)($gp)
-; MIPS64-N32-DAG: lw $25, %call16(roundf)($gp)
-; MIPS64-N64-DAG: ld $25, %call16(roundf)($gp)
-; ALL-DAG:        jalr $25
 
   %round = call float @llvm.round.f32(float %1)
   %2 = call i16 @llvm.convert.to.fp16.f32(float %round)
 
-; ALL:            mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL:            fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL:            fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:            copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %2, i16* @g, align 2
-; ALL:            sh $[[R3]]
 
   ret void
 }
diff --git a/test/CodeGen/SystemZ/fp-conv-10.ll b/test/CodeGen/SystemZ/fp-conv-10.ll
index dc5178985d9..f897743ef11 100644
--- a/test/CodeGen/SystemZ/fp-conv-10.ll
+++ b/test/CodeGen/SystemZ/fp-conv-10.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; Test conversion of floating-point values to unsigned i32s (z10 only).
 ;
 ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
@@ -10,11 +11,19 @@
 ; Test f32->i32.
 define i32 @f1(float %f) {
 ; CHECK-LABEL: f1:
-; CHECK: cebr
-; CHECK: sebr
-; CHECK: cfebr
-; CHECK: xilf
-; CHECK: br %r14
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    larl %r1, .LCPI0_0
+; CHECK-NEXT:    le %f1, 0(%r1)
+; CHECK-NEXT:    cebr %f0, %f1
+; CHECK-NEXT:    jnl .LBB0_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    cfebr %r2, 5, %f0
+; CHECK-NEXT:    br %r14
+; CHECK-NEXT:  .LBB0_2:
+; CHECK-NEXT:    sebr %f0, %f1
+; CHECK-NEXT:    cfebr %r2, 5, %f0
+; CHECK-NEXT:    xilf %r2, 2147483648
+; CHECK-NEXT:    br %r14
   %conv = fptoui float %f to i32
   ret i32 %conv
 }
@@ -22,11 +31,19 @@ define i32 @f1(float %f) {
 ; Test f64->i32.
 define i32 @f2(double %f) {
 ; CHECK-LABEL: f2:
-; CHECK: cdbr
-; CHECK: sdbr
-; CHECK: cfdbr
-; CHECK: xilf
-; CHECK: br %r14
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    larl %r1, .LCPI1_0
+; CHECK-NEXT:    ldeb %f1, 0(%r1)
+; CHECK-NEXT:    cdbr %f0, %f1
+; CHECK-NEXT:    jnl .LBB1_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    cfdbr %r2, 5, %f0
+; CHECK-NEXT:    br %r14
+; CHECK-NEXT:  .LBB1_2:
+; CHECK-NEXT:    sdbr %f0, %f1
+; CHECK-NEXT:    cfdbr %r2, 5, %f0
+; CHECK-NEXT:    xilf %r2, 2147483648
+; CHECK-NEXT:    br %r14
   %conv = fptoui double %f to i32
   ret i32 %conv
 }
@@ -34,11 +51,21 @@ define i32 @f2(double %f) {
 ; Test f128->i32.
 define i32 @f3(fp128 *%src) {
 ; CHECK-LABEL: f3:
-; CHECK: cxbr
-; CHECK: sxbr
-; CHECK: cfxbr
-; CHECK: xilf
-; CHECK: br %r14
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld %f0, 0(%r2)
+; CHECK-NEXT:    ld %f2, 8(%r2)
+; CHECK-NEXT:    larl %r1, .LCPI2_0
+; CHECK-NEXT:    lxeb %f1, 0(%r1)
+; CHECK-NEXT:    cxbr %f0, %f1
+; CHECK-NEXT:    jnl .LBB2_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    cfxbr %r2, 5, %f0
+; CHECK-NEXT:    br %r14
+; CHECK-NEXT:  .LBB2_2:
+; CHECK-NEXT:    sxbr %f0, %f1
+; CHECK-NEXT:    cfxbr %r2, 5, %f0
+; CHECK-NEXT:    xilf %r2, 2147483648
+; CHECK-NEXT:    br %r14
   %f = load fp128, fp128 *%src
   %conv = fptoui fp128 %f to i32
   ret i32 %conv
diff --git a/test/CodeGen/SystemZ/fp-conv-12.ll b/test/CodeGen/SystemZ/fp-conv-12.ll
index d37a443c482..91c377fa3e2 100644
--- a/test/CodeGen/SystemZ/fp-conv-12.ll
+++ b/test/CodeGen/SystemZ/fp-conv-12.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; Test conversion of floating-point values to unsigned i64s (z10 only).
 ;
 ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
@@ -9,11 +10,19 @@
 ; Test f32->i64.
 define i64 @f1(float %f) {
 ; CHECK-LABEL: f1:
-; CHECK: cebr
-; CHECK: sebr
-; CHECK: cgebr
-; CHECK: xihf
-; CHECK: br %r14
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    larl %r1, .LCPI0_0
+; CHECK-NEXT:    le %f1, 0(%r1)
+; CHECK-NEXT:    cebr %f0, %f1
+; CHECK-NEXT:    jnl .LBB0_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    cgebr %r2, 5, %f0
+; CHECK-NEXT:    br %r14
+; CHECK-NEXT:  .LBB0_2:
+; CHECK-NEXT:    sebr %f0, %f1
+; CHECK-NEXT:    cgebr %r2, 5, %f0
+; CHECK-NEXT:    xihf %r2, 2147483648
+; CHECK-NEXT:    br %r14
   %conv = fptoui float %f to i64
   ret i64 %conv
 }
@@ -21,11 +30,19 @@ define i64 @f1(float %f) {
 ; Test f64->i64.
 define i64 @f2(double %f) {
 ; CHECK-LABEL: f2:
-; CHECK: cdbr
-; CHECK: sdbr
-; CHECK: cgdbr
-; CHECK: xihf
-; CHECK: br %r14
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    larl %r1, .LCPI1_0
+; CHECK-NEXT:    ldeb %f1, 0(%r1)
+; CHECK-NEXT:    cdbr %f0, %f1
+; CHECK-NEXT:    jnl .LBB1_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    cgdbr %r2, 5, %f0
+; CHECK-NEXT:    br %r14
+; CHECK-NEXT:  .LBB1_2:
+; CHECK-NEXT:    sdbr %f0, %f1
+; CHECK-NEXT:    cgdbr %r2, 5, %f0
+; CHECK-NEXT:    xihf %r2, 2147483648
+; CHECK-NEXT:    br %r14
   %conv = fptoui double %f to i64
   ret i64 %conv
 }
@@ -33,11 +50,21 @@ define i64 @f2(double %f) {
 ; Test f128->i64.
 define i64 @f3(fp128 *%src) {
 ; CHECK-LABEL: f3:
-; CHECK: cxbr
-; CHECK: sxbr
-; CHECK: cgxbr
-; CHECK: xihf
-; CHECK: br %r14
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld %f0, 0(%r2)
+; CHECK-NEXT:    ld %f2, 8(%r2)
+; CHECK-NEXT:    larl %r1, .LCPI2_0
+; CHECK-NEXT:    lxeb %f1, 0(%r1)
+; CHECK-NEXT:    cxbr %f0, %f1
+; CHECK-NEXT:    jnl .LBB2_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    cgxbr %r2, 5, %f0
+; CHECK-NEXT:    br %r14
+; CHECK-NEXT:  .LBB2_2:
+; CHECK-NEXT:    sxbr %f0, %f1
+; CHECK-NEXT:    cgxbr %r2, 5, %f0
+; CHECK-NEXT:    xihf %r2, 2147483648
+; CHECK-NEXT:    br %r14
   %f = load fp128, fp128 *%src
   %conv = fptoui fp128 %f to i64
   ret i64 %conv
-- 
GitLab


From 8177736107820082b75e2dd9bae47c055c0fdfd1 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sat, 27 Oct 2018 15:14:42 +0000
Subject: [PATCH 0670/1116] Fix -Wdocumentation warning. NFCI.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345454 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/SimpleLoopUnswitch.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 8b6935fa039..81fba5d15ee 100644
--- a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -2179,17 +2179,17 @@ computeDomSubtreeCost(DomTreeNode &N,
 /// Turns a llvm.experimental.guard intrinsic into implicit control flow branch,
 /// making the following replacement:
 ///
-///   <code before guard>
+///   --code before guard--
 ///   call void (i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ]
-///   <code after guard>
+///   --code after guard--
 ///
 /// into
 ///
-///   <code before guard>
+///   --code before guard--
 ///   br i1 %cond, label %guarded, label %deopt
 ///
 /// guarded:
-///   <code after guard>
+///   --code after guard--
 ///
 /// deopt:
 ///   call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ]
-- 
GitLab


From 59de0e0629d4d78f285adcb6c6ba33e62ae5a9f5 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Sat, 27 Oct 2018 16:46:10 +0000
Subject: [PATCH 0671/1116] [x86] make test immune to improved extraction in
 D53784; NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345455 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/avx2-schedule.ll | 54 +++++++++++++------------------
 1 file changed, 23 insertions(+), 31 deletions(-)

diff --git a/test/CodeGen/X86/avx2-schedule.ll b/test/CodeGen/X86/avx2-schedule.ll
index e04eb583087..4cec0508f33 100644
--- a/test/CodeGen/X86/avx2-schedule.ll
+++ b/test/CodeGen/X86/avx2-schedule.ll
@@ -171,66 +171,58 @@ define <8 x float> @test_broadcastss_ymm(<4 x float> %a0) {
   ret <8 x float> %2
 }
 
-define <4 x i32> @test_extracti128(<8 x i32> %a0, <8 x i32> %a1, <4 x i32> *%a2) {
+define <4 x i32> @test_extracti128(<8 x i16> %a0, <4 x i32> *%a1) {
 ; GENERIC-LABEL: test_extracti128:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpaddd %ymm1, %ymm0, %ymm2 # sched: [1:0.50]
-; GENERIC-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    vextracti128 $1, %ymm0, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    vextracti128 $1, %ymm2, (%rdi) # sched: [1:1.00]
+; GENERIC-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:1.00]
+; GENERIC-NEXT:    vextracti128 $1, %ymm1, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vextracti128 $1, %ymm1, (%rdi) # sched: [1:1.00]
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_extracti128:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    vpaddd %ymm1, %ymm0, %ymm2 # sched: [1:0.50]
-; HASWELL-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vextracti128 $1, %ymm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vextracti128 $1, %ymm2, (%rdi) # sched: [1:1.00]
+; HASWELL-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00]
+; HASWELL-NEXT:    vextracti128 $1, %ymm1, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT:    vextracti128 $1, %ymm1, (%rdi) # sched: [1:1.00]
 ; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: test_extracti128:
 ; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    vpaddd %ymm1, %ymm0, %ymm2 # sched: [1:0.50]
-; BROADWELL-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; BROADWELL-NEXT:    vextracti128 $1, %ymm0, %xmm0 # sched: [3:1.00]
-; BROADWELL-NEXT:    vextracti128 $1, %ymm2, (%rdi) # sched: [1:1.00]
+; BROADWELL-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00]
+; BROADWELL-NEXT:    vextracti128 $1, %ymm1, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT:    vextracti128 $1, %ymm1, (%rdi) # sched: [1:1.00]
 ; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: test_extracti128:
 ; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    vpaddd %ymm1, %ymm0, %ymm2 # sched: [1:0.33]
-; SKYLAKE-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SKYLAKE-NEXT:    vextracti128 $1, %ymm0, %xmm0 # sched: [3:1.00]
-; SKYLAKE-NEXT:    vextracti128 $1, %ymm2, (%rdi) # sched: [1:1.00]
+; SKYLAKE-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00]
+; SKYLAKE-NEXT:    vextracti128 $1, %ymm1, %xmm0 # sched: [3:1.00]
+; SKYLAKE-NEXT:    vextracti128 $1, %ymm1, (%rdi) # sched: [1:1.00]
 ; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: test_extracti128:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpaddd %ymm1, %ymm0, %ymm2 # sched: [1:0.33]
-; SKX-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    vextracti128 $1, %ymm0, %xmm0 # sched: [3:1.00]
-; SKX-NEXT:    vextracti128 $1, %ymm2, (%rdi) # sched: [1:1.00]
+; SKX-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00]
+; SKX-NEXT:    vextracti128 $1, %ymm1, %xmm0 # sched: [3:1.00]
+; SKX-NEXT:    vextracti128 $1, %ymm1, (%rdi) # sched: [1:1.00]
 ; SKX-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_extracti128:
 ; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    vpaddd %ymm1, %ymm0, %ymm2 # sched: [1:0.25]
-; ZNVER1-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
-; ZNVER1-NEXT:    vextracti128 $1, %ymm0, %xmm0 # sched: [2:0.25]
-; ZNVER1-NEXT:    vextracti128 $1, %ymm2, (%rdi) # sched: [1:0.50]
+; ZNVER1-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:0.50]
+; ZNVER1-NEXT:    vextracti128 $1, %ymm1, %xmm0 # sched: [2:0.25]
+; ZNVER1-NEXT:    vextracti128 $1, %ymm1, (%rdi) # sched: [1:0.50]
 ; ZNVER1-NEXT:    vzeroupper # sched: [100:0.25]
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = add <8 x i32> %a0, %a1
-  %2 = sub <8 x i32> %a0, %a1
-  %3 = shufflevector <8 x i32> %1, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %4 = shufflevector <8 x i32> %2, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  store <4 x i32> %3, <4 x i32> *%a2
-  ret <4 x i32> %4
+  %z = zext <8 x i16> %a0 to <8 x i32>
+  %ext = shufflevector <8 x i32> %z, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  store <4 x i32> %ext, <4 x i32> *%a1
+  ret <4 x i32> %ext
 }
 
 define <2 x double> @test_gatherdpd(<2 x double> %a0, i8* %a1, <4 x i32> %a2, <2 x double> %a3) {
-- 
GitLab


From 196c06052966d9bd3e88f1d9e501f7925eaf81c3 Mon Sep 17 00:00:00 2001
From: Florian Hahn <florian.hahn@arm.com>
Date: Sat, 27 Oct 2018 16:53:45 +0000
Subject: [PATCH 0672/1116] [Local] Keep K's range if K does not move when
 combining metadata.

As K has to dominate I, IIUC I's range metadata must be a subset of
K's. After Eli's recent clarification to the LangRef, loading a value
outside of the range is undefined behavior.
Therefore if I's range contains elements outside of K's range and we would load
one such value, K would cause undefined behavior.

In cases like hoisting/sinking, we still want the most generic range
over all code paths to/from the hoist/sink point. As suggested in the
patches related to D47339, I will refactor the handling of those
scenarios and try to decouple it from this function as follow up, once
we switched to a similar handling of metadata in most of
combineMetadata.

I updated some tests checking mostly the merging of metadata to keep the
metadata of to dominating load. The most interesting one is probably test8 in
test/Transforms/JumpThreading/thread-loads.ll. It contained a comment
about the alias metadata preventing us to eliminate the branch, but it
seem like the actual problem currently is that we merge the ranges of
both loads and cannot eliminate the icmp afterwards. With this patch, we
manage to eliminate the icmp, as the range of the first load excludes 8.

Reviewers: efriedma, nlopes, davide

Reviewed By: efriedma

Differential Revision: https://reviews.llvm.org/D51629


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345456 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Utils/Local.cpp                | 10 ++++++-
 test/Transforms/GVN/range.ll                  | 27 +++++++++----------
 .../InstCombine/load-combine-metadata.ll      |  2 +-
 test/Transforms/JumpThreading/thread-loads.ll | 12 ++++++---
 test/Transforms/NewGVN/range.ll               | 27 +++++++++----------
 5 files changed, 44 insertions(+), 34 deletions(-)

diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index 82fb765842d..0dcd7371210 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -2315,7 +2315,15 @@ void llvm::combineMetadata(Instruction *K, const Instruction *J,
         K->setMetadata(Kind, MDNode::intersect(JMD, KMD));
         break;
       case LLVMContext::MD_range:
-        K->setMetadata(Kind, MDNode::getMostGenericRange(JMD, KMD));
+
+        // If K does move, use most generic range. Otherwise keep the range of
+        // K.
+        if (DoesKMove)
+          // FIXME: If K does move, we should drop the range info and nonnull.
+          //        Currently this function is used with DoesKMove in passes
+          //        doing hoisting/sinking and the current behavior of using the
+          //        most generic range is correct in those cases.
+          K->setMetadata(Kind, MDNode::getMostGenericRange(JMD, KMD));
         break;
       case LLVMContext::MD_fpmath:
         K->setMetadata(Kind, MDNode::getMostGenericFPMath(JMD, KMD));
diff --git a/test/Transforms/GVN/range.ll b/test/Transforms/GVN/range.ll
index 39acc0c3515..fd5fa56b617 100644
--- a/test/Transforms/GVN/range.ll
+++ b/test/Transforms/GVN/range.ll
@@ -2,7 +2,7 @@
 
 define i32 @test1(i32* %p) {
 ; CHECK-LABEL: @test1(i32* %p)
-; CHECK: %a = load i32, i32* %p, !range !0
+; CHECK: %a = load i32, i32* %p, !range ![[RANGE0:[0-9]+]]
 ; CHECK: %c = add i32 %a, %a
   %a = load i32, i32* %p, !range !0
   %b = load i32, i32* %p, !range !0
@@ -12,8 +12,7 @@ define i32 @test1(i32* %p) {
 
 define i32 @test2(i32* %p) {
 ; CHECK-LABEL: @test2(i32* %p)
-; CHECK: %a = load i32, i32* %p
-; CHECK-NOT: range
+; CHECK: %a = load i32, i32* %p, !range ![[RANGE0]]
 ; CHECK: %c = add i32 %a, %a
   %a = load i32, i32* %p, !range !0
   %b = load i32, i32* %p
@@ -23,7 +22,7 @@ define i32 @test2(i32* %p) {
 
 define i32 @test3(i32* %p) {
 ; CHECK-LABEL: @test3(i32* %p)
-; CHECK: %a = load i32, i32* %p, !range ![[DISJOINT_RANGE:[0-9]+]]
+; CHECK: %a = load i32, i32* %p, !range ![[RANGE0]]
 ; CHECK: %c = add i32 %a, %a
   %a = load i32, i32* %p, !range !0
   %b = load i32, i32* %p, !range !1
@@ -33,7 +32,7 @@ define i32 @test3(i32* %p) {
 
 define i32 @test4(i32* %p) {
 ; CHECK-LABEL: @test4(i32* %p)
-; CHECK: %a = load i32, i32* %p, !range ![[MERGED_RANGE:[0-9]+]]
+; CHECK: %a = load i32, i32* %p, !range ![[RANGE0]]
 ; CHECK: %c = add i32 %a, %a
   %a = load i32, i32* %p, !range !0
   %b = load i32, i32* %p, !range !2
@@ -43,7 +42,7 @@ define i32 @test4(i32* %p) {
 
 define i32 @test5(i32* %p) {
 ; CHECK-LABEL: @test5(i32* %p)
-; CHECK: %a = load i32, i32* %p, !range ![[MERGED_SIGNED_RANGE:[0-9]+]]
+; CHECK: %a = load i32, i32* %p, !range ![[RANGE3:[0-9]+]]
 ; CHECK: %c = add i32 %a, %a
   %a = load i32, i32* %p, !range !3
   %b = load i32, i32* %p, !range !4
@@ -53,7 +52,7 @@ define i32 @test5(i32* %p) {
 
 define i32 @test6(i32* %p) {
 ; CHECK-LABEL: @test6(i32* %p)
-; CHECK: %a = load i32, i32* %p, !range ![[MERGED_TEST6:[0-9]+]]
+; CHECK: %a = load i32, i32* %p, !range ![[RANGE5:[0-9]+]]
 ; CHECK: %c = add i32 %a, %a
   %a = load i32, i32* %p, !range !5
   %b = load i32, i32* %p, !range !6
@@ -63,7 +62,7 @@ define i32 @test6(i32* %p) {
 
 define i32 @test7(i32* %p) {
 ; CHECK-LABEL: @test7(i32* %p)
-; CHECK: %a = load i32, i32* %p, !range ![[MERGED_TEST7:[0-9]+]]
+; CHECK: %a = load i32, i32* %p, !range ![[RANGE7:[0-9]+]]
 ; CHECK: %c = add i32 %a, %a
   %a = load i32, i32* %p, !range !7
   %b = load i32, i32* %p, !range !8
@@ -73,7 +72,7 @@ define i32 @test7(i32* %p) {
 
 define i32 @test8(i32* %p) {
 ; CHECK-LABEL: @test8(i32* %p)
-; CHECK: %a = load i32, i32* %p
+; CHECK: %a = load i32, i32* %p, !range ![[RANGE9:[0-9]+]]
 ; CHECK-NOT: range
 ; CHECK: %c = add i32 %a, %a
   %a = load i32, i32* %p, !range !9
@@ -82,11 +81,11 @@ define i32 @test8(i32* %p) {
   ret i32 %c
 }
 
-; CHECK: ![[DISJOINT_RANGE]] = !{i32 0, i32 2, i32 3, i32 5}
-; CHECK: ![[MERGED_RANGE]] = !{i32 0, i32 5}
-; CHECK: ![[MERGED_SIGNED_RANGE]] = !{i32 -5, i32 -2, i32 1, i32 5}
-; CHECK: ![[MERGED_TEST6]] = !{i32 10, i32 1}
-; CHECK: ![[MERGED_TEST7]] = !{i32 3, i32 4, i32 5, i32 2}
+; CHECK: ![[RANGE0]] = !{i32 0, i32 2}
+; CHECK: ![[RANGE3]] = !{i32 -5, i32 -2}
+; CHECK: ![[RANGE5]] = !{i32 10, i32 1}
+; CHECK: ![[RANGE7]] = !{i32 1, i32 2, i32 3, i32 4}
+; CHECK: ![[RANGE9]] = !{i32 1, i32 5}
 
 !0 = !{i32 0, i32 2}
 !1 = !{i32 3, i32 5}
diff --git a/test/Transforms/InstCombine/load-combine-metadata.ll b/test/Transforms/InstCombine/load-combine-metadata.ll
index b7f42e7a0e7..536f1bb75f6 100644
--- a/test/Transforms/InstCombine/load-combine-metadata.ll
+++ b/test/Transforms/InstCombine/load-combine-metadata.ll
@@ -17,7 +17,7 @@ define void @test_load_load_combine_metadata(i32*, i32*, i32*) {
   ret void
 }
 
-; CHECK: ![[RANGE]] = !{i32 0, i32 5, i32 7, i32 9}
+; CHECK: ![[RANGE]] = !{i32 0, i32 5}
 !0 = !{ i32 0, i32 5 }
 !1 = !{ i32 7, i32 9 }
 !2 = !{!2}
diff --git a/test/Transforms/JumpThreading/thread-loads.ll b/test/Transforms/JumpThreading/thread-loads.ll
index 3606e796cdd..1156f39d4a2 100644
--- a/test/Transforms/JumpThreading/thread-loads.ll
+++ b/test/Transforms/JumpThreading/thread-loads.ll
@@ -246,13 +246,15 @@ bb3:
   ret i32 %res.0
 }
 
-; Make sure we merge the aliasing metadata. (If we don't, we have a load
-; with the wrong metadata, so the branch gets incorrectly eliminated.)
+; Make sure we merge the aliasing metadata. We keep the range metadata for the
+; first load, as it dominates the second load. Hence we can eliminate the
+; branch.
 define void @test8(i32*, i32*, i32*) {
 ; CHECK-LABEL: @test8(
-; CHECK: %a = load i32, i32* %0, !range !4
+; CHECK: %a = load i32, i32* %0, !range ![[RANGE4:[0-9]+]]
 ; CHECK-NEXT: store i32 %a
-; CHECK: br i1 %c
+; CHECK-NEXT: %xxx = tail call i32 (...) @f1()
+; CHECK-NEXT: ret void
   %a = load i32, i32* %0, !tbaa !0, !range !4, !alias.scope !9, !noalias !10
   %b = load i32, i32* %0, !range !5
   store i32 %a, i32* %1
@@ -525,6 +527,8 @@ right_x:
   ret i32 10
 }
 
+; CHECK: ![[RANGE4]] = !{i32 0, i32 1}
+
 !0 = !{!3, !3, i64 0}
 !1 = !{!"omnipotent char", !2}
 !2 = !{!"Simple C/C++ TBAA"}
diff --git a/test/Transforms/NewGVN/range.ll b/test/Transforms/NewGVN/range.ll
index 55efa5955b1..29b911cb5f6 100644
--- a/test/Transforms/NewGVN/range.ll
+++ b/test/Transforms/NewGVN/range.ll
@@ -2,7 +2,7 @@
 
 define i32 @test1(i32* %p) {
 ; CHECK-LABEL: @test1(i32* %p)
-; CHECK: %a = load i32, i32* %p, !range !0
+; CHECK: %a = load i32, i32* %p, !range ![[RANGE0:[0-9]+]]
 ; CHECK: %c = add i32 %a, %a
   %a = load i32, i32* %p, !range !0
   %b = load i32, i32* %p, !range !0
@@ -12,8 +12,7 @@ define i32 @test1(i32* %p) {
 
 define i32 @test2(i32* %p) {
 ; CHECK-LABEL: @test2(i32* %p)
-; CHECK: %a = load i32, i32* %p
-; CHECK-NOT: range
+; CHECK: %a = load i32, i32* %p, !range ![[RANGE0]]
 ; CHECK: %c = add i32 %a, %a
   %a = load i32, i32* %p, !range !0
   %b = load i32, i32* %p
@@ -23,7 +22,7 @@ define i32 @test2(i32* %p) {
 
 define i32 @test3(i32* %p) {
 ; CHECK-LABEL: @test3(i32* %p)
-; CHECK: %a = load i32, i32* %p, !range ![[DISJOINT_RANGE:[0-9]+]]
+; CHECK: %a = load i32, i32* %p, !range ![[RANGE0]]
 ; CHECK: %c = add i32 %a, %a
   %a = load i32, i32* %p, !range !0
   %b = load i32, i32* %p, !range !1
@@ -33,7 +32,7 @@ define i32 @test3(i32* %p) {
 
 define i32 @test4(i32* %p) {
 ; CHECK-LABEL: @test4(i32* %p)
-; CHECK: %a = load i32, i32* %p, !range ![[MERGED_RANGE:[0-9]+]]
+; CHECK: %a = load i32, i32* %p, !range ![[RANGE0]]
 ; CHECK: %c = add i32 %a, %a
   %a = load i32, i32* %p, !range !0
   %b = load i32, i32* %p, !range !2
@@ -43,7 +42,7 @@ define i32 @test4(i32* %p) {
 
 define i32 @test5(i32* %p) {
 ; CHECK-LABEL: @test5(i32* %p)
-; CHECK: %a = load i32, i32* %p, !range ![[MERGED_SIGNED_RANGE:[0-9]+]]
+; CHECK: %a = load i32, i32* %p, !range ![[RANGE3:[0-9]+]]
 ; CHECK: %c = add i32 %a, %a
   %a = load i32, i32* %p, !range !3
   %b = load i32, i32* %p, !range !4
@@ -53,7 +52,7 @@ define i32 @test5(i32* %p) {
 
 define i32 @test6(i32* %p) {
 ; CHECK-LABEL: @test6(i32* %p)
-; CHECK: %a = load i32, i32* %p, !range ![[MERGED_TEST6:[0-9]+]]
+; CHECK: %a = load i32, i32* %p, !range ![[RANGE5:[0-9]+]]
 ; CHECK: %c = add i32 %a, %a
   %a = load i32, i32* %p, !range !5
   %b = load i32, i32* %p, !range !6
@@ -63,7 +62,7 @@ define i32 @test6(i32* %p) {
 
 define i32 @test7(i32* %p) {
 ; CHECK-LABEL: @test7(i32* %p)
-; CHECK: %a = load i32, i32* %p, !range ![[MERGED_TEST7:[0-9]+]]
+; CHECK: %a = load i32, i32* %p, !range ![[RANGE7:[0-9]+]]
 ; CHECK: %c = add i32 %a, %a
   %a = load i32, i32* %p, !range !7
   %b = load i32, i32* %p, !range !8
@@ -73,7 +72,7 @@ define i32 @test7(i32* %p) {
 
 define i32 @test8(i32* %p) {
 ; CHECK-LABEL: @test8(i32* %p)
-; CHECK: %a = load i32, i32* %p
+; CHECK: %a = load i32, i32* %p, !range ![[RANGE9:[0-9]+]]
 ; CHECK-NOT: range
 ; CHECK: %c = add i32 %a, %a
   %a = load i32, i32* %p, !range !9
@@ -82,11 +81,11 @@ define i32 @test8(i32* %p) {
   ret i32 %c
 }
 
-; CHECK: ![[DISJOINT_RANGE]] = !{i32 0, i32 2, i32 3, i32 5}
-; CHECK: ![[MERGED_RANGE]] = !{i32 0, i32 5}
-; CHECK: ![[MERGED_SIGNED_RANGE]] = !{i32 -5, i32 -2, i32 1, i32 5}
-; CHECK: ![[MERGED_TEST6]] = !{i32 10, i32 1}
-; CHECK: ![[MERGED_TEST7]] = !{i32 3, i32 4, i32 5, i32 2}
+; CHECK: ![[RANGE0]] = !{i32 0, i32 2}
+; CHECK: ![[RANGE3]] = !{i32 -5, i32 -2}
+; CHECK: ![[RANGE5]] = !{i32 10, i32 1}
+; CHECK: ![[RANGE7]] = !{i32 1, i32 2, i32 3, i32 4}
+; CHECK: ![[RANGE9]] = !{i32 1, i32 5}
 
 !0 = !{i32 0, i32 2}
 !1 = !{i32 3, i32 5}
-- 
GitLab


From 836c763dadbd9478fa35b1a291a38bf17aa206ba Mon Sep 17 00:00:00 2001
From: Vlad Tsyrklevich <vlad@tsyrklevich.net>
Date: Sat, 27 Oct 2018 17:39:13 +0000
Subject: [PATCH 0673/1116] Revert "DebugInfo: reduce DIE range verification on
 object files"

This reverts commits r345441 and r345444, they were causing msan
buildbot failures.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345457 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/DebugInfo/DWARF/DWARFVerifier.h  |  7 +--
 lib/DebugInfo/DWARF/DWARFVerifier.cpp         | 57 +++++--------------
 .../llvm-dwarfdump/X86/debug-verify-object.s  | 57 -------------------
 3 files changed, 15 insertions(+), 106 deletions(-)
 delete mode 100644 test/tools/llvm-dwarfdump/X86/debug-verify-object.s

diff --git a/include/llvm/DebugInfo/DWARF/DWARFVerifier.h b/include/llvm/DebugInfo/DWARF/DWARFVerifier.h
index e47fbea5646..3ad65cf51b1 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFVerifier.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFVerifier.h
@@ -97,9 +97,6 @@ private:
   /// lies between to valid DIEs.
   std::map<uint64_t, std::set<uint32_t>> ReferenceToDIEOffsets;
   uint32_t NumDebugLineErrors = 0;
-  // Used to relax some checks that do not currently work portably
-  bool IsObjectFile;
-  bool IsMachOObject;
 
   raw_ostream &error() const;
   raw_ostream &warn() const;
@@ -289,8 +286,8 @@ private:
 
 public:
   DWARFVerifier(raw_ostream &S, DWARFContext &D,
-                DIDumpOptions DumpOpts = DIDumpOptions::getForSingleDIE());
-
+                DIDumpOptions DumpOpts = DIDumpOptions::getForSingleDIE())
+      : OS(S), DCtx(D), DumpOpts(std::move(DumpOpts)) {}
   /// Verify the information in any of the following sections, if available:
   /// .debug_abbrev, debug_abbrev.dwo
   ///
diff --git a/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/lib/DebugInfo/DWARF/DWARFVerifier.cpp
index 1f089a7030d..d30600accd0 100644
--- a/lib/DebugInfo/DWARF/DWARFVerifier.cpp
+++ b/lib/DebugInfo/DWARF/DWARFVerifier.cpp
@@ -394,42 +394,20 @@ unsigned DWARFVerifier::verifyDieRanges(const DWARFDie &Die,
   // Build RI for this DIE and check that ranges within this DIE do not
   // overlap.
   DieRangeInfo RI(Die);
+  for (auto Range : Ranges) {
+    if (!Range.valid()) {
+      ++NumErrors;
+      error() << "Invalid address range " << Range << "\n";
+      continue;
+    }
 
-  // TODO support object files better
-  //
-  // Some object file formats (i.e. non-MachO) support COMDAT.  ELF in
-  // particular does so by placing each function into a section.  The DWARF data
-  // for the function at that point uses a section relative DW_FORM_addrp for
-  // the DW_AT_low_pc and a DW_FORM_data4 for the offset as the DW_AT_high_pc.
-  // In such a case, when the Die is the CU, the ranges will overlap, and we
-  // will flag valid conflicting ranges as invalid.
-  //
-  // For such targets, we should read the ranges from the CU and partition them
-  // by the section id.  The ranges within a particular section should be
-  // disjoint, although the ranges across sections may overlap.  We would map
-  // the child die to the entity that it references and the section with which
-  // it is associated.  The child would then be checked against the range
-  // information for the associated section.
-  //
-  // For now, simply elide the range verification for the CU DIEs if we are
-  // processing an object file.
-
-  if (!IsObjectFile || IsMachOObject || Die.getTag() == DW_TAG_subprogram) {
-    for (auto Range : Ranges) {
-      if (!Range.valid()) {
-        ++NumErrors;
-        error() << "Invalid address range " << Range << "\n";
-        continue;
-      }
-
-      // Verify that ranges don't intersect.
-      const auto IntersectingRange = RI.insert(Range);
-      if (IntersectingRange != RI.Ranges.end()) {
-        ++NumErrors;
-        error() << "DIE has overlapping address ranges: " << Range << " and "
-                << *IntersectingRange << "\n";
-        break;
-      }
+    // Verify that ranges don't intersect.
+    const auto IntersectingRange = RI.insert(Range);
+    if (IntersectingRange != RI.Ranges.end()) {
+      ++NumErrors;
+      error() << "DIE has overlapping address ranges: " << Range << " and "
+              << *IntersectingRange << "\n";
+      break;
     }
   }
 
@@ -767,15 +745,6 @@ void DWARFVerifier::verifyDebugLineRows() {
   }
 }
 
-DWARFVerifier::DWARFVerifier(raw_ostream &S, DWARFContext &D,
-                             DIDumpOptions DumpOpts)
-    : OS(S), DCtx(D), DumpOpts(std::move(DumpOpts)) {
-  if (const auto *F = DCtx.getDWARFObj().getFile()) {
-    IsObjectFile = F->isRelocatableObject();
-    IsMachOObject = F->isMachO();
-  }
-}
-
 bool DWARFVerifier::handleDebugLine() {
   NumDebugLineErrors = 0;
   OS << "Verifying .debug_line...\n";
diff --git a/test/tools/llvm-dwarfdump/X86/debug-verify-object.s b/test/tools/llvm-dwarfdump/X86/debug-verify-object.s
deleted file mode 100644
index be79c95c0b1..00000000000
--- a/test/tools/llvm-dwarfdump/X86/debug-verify-object.s
+++ /dev/null
@@ -1,57 +0,0 @@
-# RUN: llvm-mc -triple x86_64-unknown-linux-gnu -filetype obj -o - %s | llvm-dwarfdump --verify -
-
-	.text
-
-	.section	.text.f,"ax",@progbits
-	.globl	f
-	.type	f,@function
-f:
-.Lfunc_begin0:
-	pushq	$32
-	popq	%rax
-	retq
-.Lfunc_end0:
-	.size	f, .Lfunc_end0-f
-
-	.section	.text.g,"ax",@progbits
-	.globl	g
-	.type	g,@function
-g:
-.Lfunc_begin1:
-	pushq   $64
-	popq    %rax
-	retq
-.Lfunc_end1:
-	.size	g, .Lfunc_end1-g
-
-	.section	.debug_abbrev,"",@progbits
-	.byte	1                       # Abbreviation Code
-	.byte	17                      # DW_TAG_compile_unit
-	.byte	0                       # DW_CHILDREN_no
-	.byte	17                      # DW_AT_low_pc
-	.byte	1                       # DW_FORM_addr
-	.byte	85                      # DW_AT_ranges
-	.byte	23                      # DW_FORM_sec_offset
-	.byte	0                       # EOM(1)
-	.byte	0                       # EOM(2)
-	.byte	0                       # EOM(3)
-
-	.section	.debug_info,"",@progbits
-.Lcu_begin0:
-	.long	20                      # Length of Unit
-	.short	4                       # DWARF version number
-	.long	.debug_abbrev           # Offset Into Abbrev. Section
-	.byte	8                       # Address Size (in bytes)
-	.byte	1                       # Abbrev [1] 0xb:0x1f DW_TAG_compile_unit
-	.quad	0                       # DW_AT_low_pc
-	.long	.Ldebug_ranges0         # DW_AT_ranges
-
-	.section        .debug_ranges,"",@progbits
-.Ldebug_ranges0:
-	.quad	.Lfunc_begin0
-	.quad	.Lfunc_end0
-	.quad	.Lfunc_begin1
-	.quad	.Lfunc_end1
-	.quad	0
-	.quad	0
-
-- 
GitLab


From 549f667007381e273ce4c1f25fab65529d06a065 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sat, 27 Oct 2018 18:37:59 +0000
Subject: [PATCH 0674/1116] [X86][SSE] LowerVSELECT - pull out repeated
 getOperand(). NFCI.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345458 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 6d589eef5a9..c2ca88911d2 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -15680,11 +15680,15 @@ static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
 }
 
 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Cond = Op.getOperand(0);
+  SDValue LHS = Op.getOperand(1);
+  SDValue RHS = Op.getOperand(2);
+
   // A vselect where all conditions and data are constants can be optimized into
   // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
-  if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
-      ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
-      ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
+  if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
+      ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
+      ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
     return SDValue();
 
   // Try to lower this to a blend-style vector shuffle. This can handle all
@@ -15694,7 +15698,7 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
 
   // If this VSELECT has a vector if i1 as a mask, it will be directly matched
   // with patterns on the mask registers on AVX-512.
-  if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1)
+  if (Cond.getScalarValueSizeInBits() == 1)
     return Op;
 
   // Variable blends are only legal from SSE4.1 onward.
@@ -15708,11 +15712,9 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
   // into an i1 condition so that we can use the mask-based 512-bit blend
   // instructions.
   if (VT.getSizeInBits() == 512) {
-    SDValue Cond = Op.getOperand(0);
     // The vNi1 condition case should be handled above as it can be trivially
     // lowered.
-    assert(Cond.getValueType().getScalarSizeInBits() ==
-               VT.getScalarSizeInBits() &&
+    assert(Cond.getScalarValueSizeInBits() == VT.getScalarSizeInBits() &&
            "Should have a size-matched integer condition!");
     // Build a mask by testing the condition against zero.
     MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
@@ -15720,7 +15722,7 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
                                 getZeroVector(VT, Subtarget, DAG, dl),
                                 ISD::SETNE);
     // Now return a new VSELECT using the mask.
-    return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2));
+    return DAG.getSelect(dl, VT, Mask, LHS, RHS);
   }
 
   // Only some types will be legal on some subtargets. If we can emit a legal
@@ -15742,9 +15744,9 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
   case MVT::v16i16: {
     // Bitcast everything to the vXi8 type and use a vXi8 vselect.
     MVT CastVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
-    SDValue Cond = DAG.getBitcast(CastVT, Op->getOperand(0));
-    SDValue LHS = DAG.getBitcast(CastVT, Op->getOperand(1));
-    SDValue RHS = DAG.getBitcast(CastVT, Op->getOperand(2));
+    Cond = DAG.getBitcast(CastVT, Cond);
+    LHS = DAG.getBitcast(CastVT, LHS);
+    RHS = DAG.getBitcast(CastVT, RHS);
     SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
     return DAG.getBitcast(VT, Select);
   }
-- 
GitLab


From 97adbba88e84b661b3ddeb45b04a00886e4c118e Mon Sep 17 00:00:00 2001
From: George Burgess IV <george.burgess.iv@gmail.com>
Date: Sat, 27 Oct 2018 20:02:06 +0000
Subject: [PATCH 0675/1116] [utils] Run tests in the proper directory.

The intent here was to run check-llvm/check-clang in the instrumented
clang's build directory, not the maybe-not-yet-created uninstrumented
clang's. Oops. :)


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345461 91177308-0d34-0410-b5e6-96231b3b80d8
---
 utils/collect_and_build_with_pgo.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/collect_and_build_with_pgo.py b/utils/collect_and_build_with_pgo.py
index 0b3943647bc..144eed3cc22 100755
--- a/utils/collect_and_build_with_pgo.py
+++ b/utils/collect_and_build_with_pgo.py
@@ -41,7 +41,7 @@ def _run_benchmark(env, out_dir, include_debug_info):
     # paths a fair amount, though the `if (stuff_is_broken) { diag() ... }`
     # branches should still heavily be weighted in the not-taken direction,
     # since we built all of LLVM/etc).
-    _build_things_in(env, target_dir, what=['check-llvm', 'check-clang'])
+    _build_things_in(env, out_dir, what=['check-llvm', 'check-clang'])
 
     # Building tblgen gets us coverage; don't skip it. (out_dir may also not
     # have them anyway, but that's less of an issue)
-- 
GitLab


From 69daf48aab5793c30b257a2e96092f77060ac48f Mon Sep 17 00:00:00 2001
From: Roman Lebedev <lebedev.ri@gmail.com>
Date: Sat, 27 Oct 2018 20:36:11 +0000
Subject: [PATCH 0676/1116] [NFC][X86] Baseline tests for AMD BdVer2
 (Piledriver) Scheduler model

Adding the baseline tests in a preparatory NFC commit,
so that the actual commit shows the *diff*.

Yes, i'm aware that a few of these codegen-based sched tests
are testing wrong instructions, i will fix that afterwards.

For https://reviews.llvm.org/D52779

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345462 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/aes-schedule.ll              |   78 +
 test/CodeGen/X86/avx-schedule.ll              |  607 ++++
 test/CodeGen/X86/avx-vzeroupper.ll            |  108 +-
 test/CodeGen/X86/bmi-schedule.ll              |   93 +
 test/CodeGen/X86/cmov-schedule.ll             |  199 ++
 test/CodeGen/X86/f16c-schedule.ll             |   28 +
 test/CodeGen/X86/fma-schedule.ll              |  397 +++
 test/CodeGen/X86/fma4-schedule.ll             |  846 ++++--
 test/CodeGen/X86/lea32-schedule.ll            |   73 +
 test/CodeGen/X86/lea64-schedule.ll            |   56 +
 test/CodeGen/X86/lwp-schedule.ll              |  216 +-
 test/CodeGen/X86/lzcnt-schedule.ll            |   23 +
 test/CodeGen/X86/mmx-schedule.ll              |  622 +++++
 test/CodeGen/X86/popcnt-schedule.ll           |   23 +
 test/CodeGen/X86/recip-fastmath.ll            |  103 +
 test/CodeGen/X86/recip-fastmath2.ll           |  161 ++
 test/CodeGen/X86/schedule-x86-64-shld.ll      |  227 +-
 test/CodeGen/X86/schedule-x86_32.ll           |  228 ++
 test/CodeGen/X86/schedule-x86_64.ll           | 1671 +++++++++++
 test/CodeGen/X86/small-byval-memcpy.ll        |   38 +-
 test/CodeGen/X86/sse-schedule.ll              |  688 +++++
 test/CodeGen/X86/sse2-schedule.ll             | 1667 +++++++++++
 test/CodeGen/X86/sse3-schedule.ll             |  156 +-
 test/CodeGen/X86/sse41-schedule.ll            |  651 +++++
 test/CodeGen/X86/sse42-schedule.ll            |  172 ++
 test/CodeGen/X86/sse4a-schedule.ll            |   31 +
 test/CodeGen/X86/ssse3-schedule.ll            |  201 ++
 test/CodeGen/X86/tbm-schedule.ll              |  526 +++-
 test/CodeGen/X86/x87-schedule.ll              |  562 ++++
 test/CodeGen/X86/xop-schedule.ll              | 1558 ++++++++---
 test/tools/llvm-mca/X86/BdVer2/add-sequence.s |   95 +
 .../X86/BdVer2/clear-super-register-1.s       |   63 +
 .../X86/BdVer2/clear-super-register-2.s       |  137 +
 .../X86/BdVer2/dependency-breaking-cmp.s      |   72 +
 .../X86/BdVer2/dependency-breaking-pcmpeq.s   |   87 +
 .../X86/BdVer2/dependency-breaking-pcmpgt.s   |   87 +
 .../X86/BdVer2/dependency-breaking-sbb-1.s    |   73 +
 .../X86/BdVer2/dependency-breaking-sbb-2.s    |   80 +
 .../X86/BdVer2/dependent-pmuld-paddd.s        |   95 +
 test/tools/llvm-mca/X86/BdVer2/dot-product.s  |   74 +
 .../X86/BdVer2/hadd-read-after-ld-1.s         |   44 +
 .../X86/BdVer2/hadd-read-after-ld-2.s         |   44 +
 .../X86/BdVer2/instruction-info-view.s        |   36 +
 .../llvm-mca/X86/BdVer2/load-store-alias.s    |   93 +
 .../llvm-mca/X86/BdVer2/memcpy-like-test.s    |   93 +
 test/tools/llvm-mca/X86/BdVer2/one-idioms.s   |  142 +
 .../X86/BdVer2/partial-reg-update-2.s         |   47 +
 .../X86/BdVer2/partial-reg-update-3.s         |   78 +
 .../X86/BdVer2/partial-reg-update-4.s         |   79 +
 .../X86/BdVer2/partial-reg-update-5.s         |   61 +
 .../X86/BdVer2/partial-reg-update-6.s         |   80 +
 .../llvm-mca/X86/BdVer2/partial-reg-update.s  |   47 +
 test/tools/llvm-mca/X86/BdVer2/pipes-fpu.s    |   99 +
 test/tools/llvm-mca/X86/BdVer2/pr37790.s      |   43 +
 test/tools/llvm-mca/X86/BdVer2/rank.s         |  109 +
 .../llvm-mca/X86/BdVer2/rcu-statistics.s      |   61 +
 .../llvm-mca/X86/BdVer2/read-advance-1.s      |   48 +
 .../llvm-mca/X86/BdVer2/read-advance-2.s      |   47 +
 .../llvm-mca/X86/BdVer2/read-advance-3.s      |   47 +
 .../X86/BdVer2/reg-move-elimination-1.s       |   80 +
 .../X86/BdVer2/reg-move-elimination-2.s       |  121 +
 .../X86/BdVer2/reg-move-elimination-3.s       |  106 +
 .../X86/BdVer2/reg-move-elimination-4.s       |   92 +
 .../X86/BdVer2/reg-move-elimination-5.s       |   92 +
 .../llvm-mca/X86/BdVer2/register-files-1.s    |   77 +
 .../llvm-mca/X86/BdVer2/register-files-2.s    |   77 +
 .../llvm-mca/X86/BdVer2/register-files-3.s    |   76 +
 .../llvm-mca/X86/BdVer2/register-files-4.s    |   60 +
 .../llvm-mca/X86/BdVer2/register-files-5.s    |  143 +
 .../llvm-mca/X86/BdVer2/resources-3dnow.s     |  208 ++
 .../tools/llvm-mca/X86/BdVer2/resources-adx.s |   55 +
 .../tools/llvm-mca/X86/BdVer2/resources-aes.s |   71 +
 .../llvm-mca/X86/BdVer2/resources-avx1.s      | 2431 +++++++++++++++++
 .../llvm-mca/X86/BdVer2/resources-bmi1.s      |  113 +
 .../X86/BdVer2/resources-clflushopt.s         |   33 +
 .../llvm-mca/X86/BdVer2/resources-cmov.s      |  323 +++
 .../llvm-mca/X86/BdVer2/resources-cmpxchg.s   |   36 +
 .../llvm-mca/X86/BdVer2/resources-f16c.s      |   57 +
 .../tools/llvm-mca/X86/BdVer2/resources-fma.s |  701 +++++
 .../llvm-mca/X86/BdVer2/resources-fma4.s      |  349 +++
 .../tools/llvm-mca/X86/BdVer2/resources-lea.s |  437 +++
 .../llvm-mca/X86/BdVer2/resources-lzcnt.s     |   50 +
 .../tools/llvm-mca/X86/BdVer2/resources-mmx.s |  393 +++
 .../llvm-mca/X86/BdVer2/resources-movbe.s     |   50 +
 .../llvm-mca/X86/BdVer2/resources-pclmul.s    |   36 +
 .../llvm-mca/X86/BdVer2/resources-popcnt.s    |   50 +
 .../llvm-mca/X86/BdVer2/resources-prefetchw.s |   36 +
 .../llvm-mca/X86/BdVer2/resources-sse1.s      |  461 ++++
 .../llvm-mca/X86/BdVer2/resources-sse2.s      |  949 +++++++
 .../llvm-mca/X86/BdVer2/resources-sse3.s      |   96 +
 .../llvm-mca/X86/BdVer2/resources-sse41.s     |  366 +++
 .../llvm-mca/X86/BdVer2/resources-sse42.s     |   99 +
 .../llvm-mca/X86/BdVer2/resources-sse4a.s     |   50 +
 .../llvm-mca/X86/BdVer2/resources-ssse3.s     |  253 ++
 .../tools/llvm-mca/X86/BdVer2/resources-tbm.s |  169 ++
 .../llvm-mca/X86/BdVer2/resources-x86_32.s    |   78 +
 .../llvm-mca/X86/BdVer2/resources-x86_64.s    | 2372 ++++++++++++++++
 .../tools/llvm-mca/X86/BdVer2/resources-x87.s |  521 ++++
 .../tools/llvm-mca/X86/BdVer2/resources-xop.s |  534 ++++
 .../X86/BdVer2/scheduler-queue-usage.s        |   60 +
 test/tools/llvm-mca/X86/BdVer2/simple-test.s  |   43 +
 .../X86/BdVer2/vbroadcast-operand-latency.s   |   67 +
 .../X86/BdVer2/vec-logic-read-after-ld-1.s    |   43 +
 .../X86/BdVer2/vec-logic-read-after-ld-2.s    |   44 +
 .../X86/BdVer2/xop-super-registers-1.s        |   89 +
 .../X86/BdVer2/xop-super-registers-2.s        |   89 +
 .../llvm-mca/X86/BdVer2/zero-idioms-avx-256.s |  365 +++
 test/tools/llvm-mca/X86/BdVer2/zero-idioms.s  |  427 +++
 test/tools/llvm-mca/X86/bextr-read-after-ld.s |   16 +
 test/tools/llvm-mca/X86/cpus.s                |    6 +
 test/tools/llvm-mca/X86/read-after-ld-1.s     |   16 +
 .../llvm-mca/X86/register-file-statistics.s   |    1 +
 .../llvm-mca/X86/scheduler-queue-usage.s      |   10 +
 .../tools/llvm-mca/X86/sqrt-rsqrt-rcp-memop.s |    1 +
 .../X86/variable-blend-read-after-ld-1.s      |   18 +
 .../X86/variable-blend-read-after-ld-2.s      |   18 +
 116 files changed, 26834 insertions(+), 950 deletions(-)
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/add-sequence.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/clear-super-register-1.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/clear-super-register-2.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/dependency-breaking-cmp.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/dependency-breaking-pcmpeq.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/dependency-breaking-pcmpgt.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/dependency-breaking-sbb-1.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/dependency-breaking-sbb-2.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/dependent-pmuld-paddd.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/dot-product.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/hadd-read-after-ld-1.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/hadd-read-after-ld-2.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/instruction-info-view.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/load-store-alias.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/memcpy-like-test.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/one-idioms.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/partial-reg-update-2.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/partial-reg-update-3.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/partial-reg-update-4.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/partial-reg-update-5.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/partial-reg-update-6.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/partial-reg-update.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/pipes-fpu.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/pr37790.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/rank.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/rcu-statistics.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/read-advance-1.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/read-advance-2.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/read-advance-3.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-1.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-2.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-3.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-4.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-5.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/register-files-1.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/register-files-2.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/register-files-3.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/register-files-4.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/register-files-5.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/resources-3dnow.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/resources-adx.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/resources-aes.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/resources-avx1.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/resources-bmi1.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/resources-clflushopt.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/resources-cmov.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/resources-cmpxchg.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/resources-f16c.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/resources-fma.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/resources-fma4.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/resources-lea.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/resources-lzcnt.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/resources-mmx.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/resources-movbe.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/resources-pclmul.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/resources-popcnt.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/resources-prefetchw.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/resources-sse1.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/resources-sse2.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/resources-sse3.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/resources-sse41.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/resources-sse42.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/resources-sse4a.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/resources-ssse3.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/resources-tbm.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/resources-x86_32.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/resources-x86_64.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/resources-x87.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/resources-xop.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/scheduler-queue-usage.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/simple-test.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/vbroadcast-operand-latency.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/vec-logic-read-after-ld-1.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/vec-logic-read-after-ld-2.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/xop-super-registers-1.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/xop-super-registers-2.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/zero-idioms-avx-256.s
 create mode 100644 test/tools/llvm-mca/X86/BdVer2/zero-idioms.s

diff --git a/test/CodeGen/X86/aes-schedule.ll b/test/CodeGen/X86/aes-schedule.ll
index 344b2aa6a42..2328279c79b 100644
--- a/test/CodeGen/X86/aes-schedule.ll
+++ b/test/CodeGen/X86/aes-schedule.ll
@@ -14,6 +14,8 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake -mattr=+aes,-avx2 | FileCheck %s --check-prefixes=CHECK,SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=+aes,-avx  | FileCheck %s --check-prefixes=CHECK,SKX-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=+aes,-avx2 | FileCheck %s --check-prefixes=CHECK,SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+aes,-avx  | FileCheck %s --check-prefixes=CHECK,BDVER2-SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+avx -mattr=+aes,-avx2 | FileCheck %s --check-prefixes=CHECK,BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=+aes,-avx  | FileCheck %s --check-prefixes=CHECK,BTVER2-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=+aes,-avx2 | FileCheck %s --check-prefixes=CHECK,BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 -mattr=+aes,-avx  | FileCheck %s --check-prefixes=CHECK,ZNVER1-SSE
@@ -92,6 +94,18 @@ define <2 x i64> @test_aesdec(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; SKX-NEXT:    vaesdec (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_aesdec:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    aesdec %xmm1, %xmm0 # sched: [7:1.00]
+; BDVER2-SSE-NEXT:    aesdec (%rdi), %xmm0 # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_aesdec:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vaesdec %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER2-NEXT:    vaesdec (%rdi), %xmm0, %xmm0 # sched: [13:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_aesdec:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    aesdec %xmm1, %xmm0 # sched: [3:1.00]
@@ -195,6 +209,18 @@ define <2 x i64> @test_aesdeclast(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2)
 ; SKX-NEXT:    vaesdeclast (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_aesdeclast:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    aesdeclast %xmm1, %xmm0 # sched: [7:1.00]
+; BDVER2-SSE-NEXT:    aesdeclast (%rdi), %xmm0 # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_aesdeclast:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vaesdeclast %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER2-NEXT:    vaesdeclast (%rdi), %xmm0, %xmm0 # sched: [13:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_aesdeclast:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    aesdeclast %xmm1, %xmm0 # sched: [3:1.00]
@@ -298,6 +324,18 @@ define <2 x i64> @test_aesenc(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; SKX-NEXT:    vaesenc (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_aesenc:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    aesenc %xmm1, %xmm0 # sched: [7:1.00]
+; BDVER2-SSE-NEXT:    aesenc (%rdi), %xmm0 # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_aesenc:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vaesenc %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER2-NEXT:    vaesenc (%rdi), %xmm0, %xmm0 # sched: [13:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_aesenc:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    aesenc %xmm1, %xmm0 # sched: [3:1.00]
@@ -401,6 +439,18 @@ define <2 x i64> @test_aesenclast(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2)
 ; SKX-NEXT:    vaesenclast (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_aesenclast:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    aesenclast %xmm1, %xmm0 # sched: [7:1.00]
+; BDVER2-SSE-NEXT:    aesenclast (%rdi), %xmm0 # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_aesenclast:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vaesenclast %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER2-NEXT:    vaesenclast (%rdi), %xmm0, %xmm0 # sched: [13:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_aesenclast:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    aesenclast %xmm1, %xmm0 # sched: [3:1.00]
@@ -517,6 +567,20 @@ define <2 x i64> @test_aesimc(<2 x i64> %a0, <2 x i64> *%a1) {
 ; SKX-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_aesimc:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    aesimc %xmm0, %xmm1 # sched: [12:2.00]
+; BDVER2-SSE-NEXT:    aesimc (%rdi), %xmm0 # sched: [18:2.00]
+; BDVER2-SSE-NEXT:    por %xmm1, %xmm0 # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_aesimc:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vaesimc %xmm0, %xmm0 # sched: [12:2.00]
+; BDVER2-NEXT:    vaesimc (%rdi), %xmm1 # sched: [18:2.00]
+; BDVER2-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_aesimc:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    aesimc %xmm0, %xmm1 # sched: [2:1.00]
@@ -637,6 +701,20 @@ define <2 x i64> @test_aeskeygenassist(<2 x i64> %a0, <2 x i64> *%a1) {
 ; SKX-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_aeskeygenassist:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    aeskeygenassist $7, %xmm0, %xmm1 # sched: [8:3.67]
+; BDVER2-SSE-NEXT:    aeskeygenassist $7, (%rdi), %xmm0 # sched: [8:3.33]
+; BDVER2-SSE-NEXT:    por %xmm1, %xmm0 # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_aeskeygenassist:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vaeskeygenassist $7, %xmm0, %xmm0 # sched: [8:3.67]
+; BDVER2-NEXT:    vaeskeygenassist $7, (%rdi), %xmm1 # sched: [8:3.33]
+; BDVER2-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_aeskeygenassist:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    aeskeygenassist $7, %xmm0, %xmm1 # sched: [2:1.00]
diff --git a/test/CodeGen/X86/avx-schedule.ll b/test/CodeGen/X86/avx-schedule.ll
index ea9626979ae..4902044c766 100644
--- a/test/CodeGen/X86/avx-schedule.ll
+++ b/test/CodeGen/X86/avx-schedule.ll
@@ -6,6 +6,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell -mattr=-avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake -mattr=-avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+avx -mattr=-avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 -mattr=-avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
 
@@ -46,6 +47,12 @@ define <4 x double> @test_addpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; SKX-NEXT:    vaddpd (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_addpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vaddpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_addpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
@@ -100,6 +107,12 @@ define <8 x float> @test_addps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
 ; SKX-NEXT:    vaddps (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_addps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vaddps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_addps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
@@ -154,6 +167,12 @@ define <4 x double> @test_addsubpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
 ; SKX-NEXT:    vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_addsubpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_addsubpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
@@ -209,6 +228,12 @@ define <8 x float> @test_addsubps(<8 x float> %a0, <8 x float> %a1, <8 x float>
 ; SKX-NEXT:    vaddsubps (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_addsubps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vaddsubps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_addsubps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
@@ -270,6 +295,13 @@ define <4 x double> @test_andnotpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
 ; SKX-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_andnotpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; BDVER2-NEXT:    vandnpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BDVER2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_andnotpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
@@ -339,6 +371,13 @@ define <8 x float> @test_andnotps(<8 x float> %a0, <8 x float> %a1, <8 x float>
 ; SKX-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_andnotps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vandnps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; BDVER2-NEXT:    vandnps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BDVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_andnotps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vandnps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
@@ -408,6 +447,13 @@ define <4 x double> @test_andpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; SKX-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_andpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vandpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; BDVER2-NEXT:    vandpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BDVER2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_andpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vandpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
@@ -475,6 +521,13 @@ define <8 x float> @test_andps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
 ; SKX-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_andps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vandps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; BDVER2-NEXT:    vandps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BDVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_andps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vandps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
@@ -542,6 +595,13 @@ define <4 x double> @test_blendpd(<4 x double> %a0, <4 x double> %a1, <4 x doubl
 ; SKX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_blendpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.50]
+; BDVER2-NEXT:    vblendpd {{.*#+}} ymm1 = ymm0[0,1],mem[2,3] sched: [8:0.50]
+; BDVER2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_blendpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:1.00]
@@ -605,6 +665,13 @@ define <8 x float> @test_blendps(<8 x float> %a0, <8 x float> %a1, <8 x float> *
 ; SKX-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_blendps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:0.50]
+; BDVER2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],mem[2],ymm1[3],mem[4,5,6],ymm1[7] sched: [8:0.50]
+; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_blendps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:1.00]
@@ -662,6 +729,12 @@ define <4 x double> @test_blendvpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
 ; SKX-NEXT:    vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [9:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_blendvpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_blendvpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [3:3.00]
@@ -717,6 +790,12 @@ define <8 x float> @test_blendvps(<8 x float> %a0, <8 x float> %a1, <8 x float>
 ; SKX-NEXT:    vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [9:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_blendvps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_blendvps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [3:3.00]
@@ -766,6 +845,11 @@ define <8 x float> @test_broadcastf128(<4 x float> *%a0) {
 ; SKX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_broadcastf128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [7:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_broadcastf128:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [6:2.00]
@@ -811,6 +895,11 @@ define <4 x double> @test_broadcastsd_ymm(double *%a0) {
 ; SKX-NEXT:    vbroadcastsd (%rdi), %ymm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_broadcastsd_ymm:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vbroadcastsd (%rdi), %ymm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_broadcastsd_ymm:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vbroadcastsd (%rdi), %ymm0 # sched: [6:2.00]
@@ -857,6 +946,11 @@ define <4 x float> @test_broadcastss(float *%a0) {
 ; SKX-NEXT:    vbroadcastss (%rdi), %xmm0 # sched: [6:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_broadcastss:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vbroadcastss (%rdi), %xmm0 # sched: [6:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_broadcastss:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vbroadcastss (%rdi), %xmm0 # sched: [6:1.00]
@@ -903,6 +997,11 @@ define <8 x float> @test_broadcastss_ymm(float *%a0) {
 ; SKX-NEXT:    vbroadcastss (%rdi), %ymm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_broadcastss_ymm:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vbroadcastss (%rdi), %ymm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_broadcastss_ymm:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vbroadcastss (%rdi), %ymm0 # sched: [6:2.00]
@@ -961,6 +1060,13 @@ define <4 x double> @test_cmppd(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; SKX-NEXT:    vorpd %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cmppd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
+; BDVER2-NEXT:    vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER2-NEXT:    vorpd %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_cmppd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm1 # sched: [2:2.00]
@@ -1027,6 +1133,13 @@ define <8 x float> @test_cmpps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
 ; SKX-NEXT:    vorps %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cmpps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcmpeqps %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
+; BDVER2-NEXT:    vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER2-NEXT:    vorps %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_cmpps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vcmpeqps %ymm1, %ymm0, %ymm1 # sched: [2:2.00]
@@ -1093,6 +1206,13 @@ define <4 x double> @test_cvtdq2pd(<4 x i32> %a0, <4 x i32> *%a1) {
 ; SKX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cvtdq2pd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtdq2pd %xmm0, %ymm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vcvtdq2pd (%rdi), %ymm1 # sched: [10:1.00]
+; BDVER2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_cvtdq2pd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vcvtdq2pd (%rdi), %ymm1 # sched: [8:2.00]
@@ -1158,6 +1278,13 @@ define <8 x float> @test_cvtdq2ps(<8 x i32> %a0, <8 x i32> *%a1) {
 ; SKX-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cvtdq2ps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vcvtdq2ps (%rdi), %ymm1 # sched: [10:1.00]
+; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_cvtdq2ps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vcvtdq2ps (%rdi), %ymm1 # sched: [8:2.00]
@@ -1221,6 +1348,13 @@ define <8 x i32> @test_cvtpd2dq(<4 x double> %a0, <4 x double> *%a1) {
 ; SKX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cvtpd2dq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtpd2dq %ymm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vcvtpd2dqy (%rdi), %xmm1 # sched: [11:1.00]
+; BDVER2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_cvtpd2dq:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vcvtpd2dqy (%rdi), %xmm1 # sched: [11:2.00]
@@ -1285,6 +1419,13 @@ define <8 x i32> @test_cvttpd2dq(<4 x double> %a0, <4 x double> *%a1) {
 ; SKX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cvttpd2dq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvttpd2dq %ymm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vcvttpd2dqy (%rdi), %xmm1 # sched: [11:1.00]
+; BDVER2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_cvttpd2dq:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vcvttpd2dqy (%rdi), %xmm1 # sched: [11:2.00]
@@ -1348,6 +1489,13 @@ define <8 x float> @test_cvtpd2ps(<4 x double> %a0, <4 x double> *%a1) {
 ; SKX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cvtpd2ps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtpd2ps %ymm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vcvtpd2psy (%rdi), %xmm1 # sched: [11:1.00]
+; BDVER2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_cvtpd2ps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vcvtpd2psy (%rdi), %xmm1 # sched: [11:2.00]
@@ -1411,6 +1559,13 @@ define <8 x i32> @test_cvtps2dq(<8 x float> %a0, <8 x float> *%a1) {
 ; SKX-NEXT:    vorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cvtps2dq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtps2dq %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vcvtps2dq (%rdi), %ymm1 # sched: [10:1.00]
+; BDVER2-NEXT:    vorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_cvtps2dq:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vcvtps2dq (%rdi), %ymm1 # sched: [8:2.00]
@@ -1475,6 +1630,13 @@ define <8 x i32> @test_cvttps2dq(<8 x float> %a0, <8 x float> *%a1) {
 ; SKX-NEXT:    vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cvttps2dq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvttps2dq %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vcvttps2dq (%rdi), %ymm1 # sched: [10:1.00]
+; BDVER2-NEXT:    vorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_cvttps2dq:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vcvttps2dq (%rdi), %ymm1 # sched: [8:2.00]
@@ -1532,6 +1694,12 @@ define <4 x double> @test_divpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; SKX-NEXT:    vdivpd (%rdi), %ymm0, %ymm0 # sched: [21:8.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_divpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vdivpd %ymm1, %ymm0, %ymm0 # sched: [45:44.00]
+; BDVER2-NEXT:    vdivpd (%rdi), %ymm0, %ymm0 # sched: [52:44.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_divpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vdivpd %ymm1, %ymm0, %ymm0 # sched: [38:38.00]
@@ -1586,6 +1754,12 @@ define <8 x float> @test_divps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
 ; SKX-NEXT:    vdivps (%rdi), %ymm0, %ymm0 # sched: [18:5.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_divps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vdivps %ymm1, %ymm0, %ymm0 # sched: [29:28.00]
+; BDVER2-NEXT:    vdivps (%rdi), %ymm0, %ymm0 # sched: [36:28.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_divps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vdivps %ymm1, %ymm0, %ymm0 # sched: [38:38.00]
@@ -1640,6 +1814,12 @@ define <8 x float> @test_dpps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2
 ; SKX-NEXT:    vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [20:1.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_dpps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [12:2.00]
+; BDVER2-NEXT:    vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [19:2.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_dpps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [12:6.00]
@@ -1701,6 +1881,13 @@ define <4 x float> @test_extractf128(<8 x float> %a0, <8 x float> %a1, <4 x floa
 ; SKX-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_extractf128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vextractf128 $1, %ymm0, %xmm0 # sched: [1:1.00]
+; BDVER2-NEXT:    vextractf128 $1, %ymm1, (%rdi) # sched: [1:1.00]
+; BDVER2-NEXT:    vzeroupper # sched: [100:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_extractf128:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vextractf128 $1, %ymm0, %xmm0 # sched: [1:0.50]
@@ -1756,6 +1943,12 @@ define <4 x double> @test_haddpd(<4 x double> %a0, <4 x double> %a1, <4 x double
 ; SKX-NEXT:    vhaddpd (%rdi), %ymm0, %ymm0 # sched: [13:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_haddpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vhaddpd (%rdi), %ymm0, %ymm0 # sched: [12:2.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_haddpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
@@ -1811,6 +2004,12 @@ define <8 x float> @test_haddps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%
 ; SKX-NEXT:    vhaddps (%rdi), %ymm0, %ymm0 # sched: [13:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_haddps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vhaddps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vhaddps (%rdi), %ymm0, %ymm0 # sched: [12:2.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_haddps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
@@ -1866,6 +2065,12 @@ define <4 x double> @test_hsubpd(<4 x double> %a0, <4 x double> %a1, <4 x double
 ; SKX-NEXT:    vhsubpd (%rdi), %ymm0, %ymm0 # sched: [13:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_hsubpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vhsubpd (%rdi), %ymm0, %ymm0 # sched: [12:2.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_hsubpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
@@ -1921,6 +2126,12 @@ define <8 x float> @test_hsubps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%
 ; SKX-NEXT:    vhsubps (%rdi), %ymm0, %ymm0 # sched: [13:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_hsubps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vhsubps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vhsubps (%rdi), %ymm0, %ymm0 # sched: [12:2.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_hsubps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
@@ -1982,6 +2193,13 @@ define <8 x float> @test_insertf128(<8 x float> %a0, <4 x float> %a1, <4 x float
 ; SKX-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_insertf128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [1:1.00]
+; BDVER2-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BDVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_insertf128:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [1:1.00]
@@ -2035,6 +2253,11 @@ define <32 x i8> @test_lddqu(i8* %a0) {
 ; SKX-NEXT:    vlddqu (%rdi), %ymm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lddqu:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vlddqu (%rdi), %ymm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_lddqu:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vlddqu (%rdi), %ymm0 # sched: [5:1.00]
@@ -2092,6 +2315,13 @@ define <2 x double> @test_maskmovpd(i8* %a0, <2 x i64> %a1, <2 x double> %a2) {
 ; SKX-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_maskmovpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [8:1.00]
+; BDVER2-NEXT:    vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [5:1.00]
+; BDVER2-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_maskmovpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [6:1.00]
@@ -2155,6 +2385,13 @@ define <4 x double> @test_maskmovpd_ymm(i8* %a0, <4 x i64> %a1, <4 x double> %a2
 ; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_maskmovpd_ymm:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [9:1.00]
+; BDVER2-NEXT:    vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [5:1.00]
+; BDVER2-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_maskmovpd_ymm:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [6:2.00]
@@ -2218,6 +2455,13 @@ define <4 x float> @test_maskmovps(i8* %a0, <4 x i32> %a1, <4 x float> %a2) {
 ; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_maskmovps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [8:1.00]
+; BDVER2-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [5:1.00]
+; BDVER2-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_maskmovps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [6:1.00]
@@ -2281,6 +2525,13 @@ define <8 x float> @test_maskmovps_ymm(i8* %a0, <8 x i32> %a1, <8 x float> %a2)
 ; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_maskmovps_ymm:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [9:1.00]
+; BDVER2-NEXT:    vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [5:1.00]
+; BDVER2-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_maskmovps_ymm:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [6:2.00]
@@ -2338,6 +2589,12 @@ define <4 x double> @test_maxpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; SKX-NEXT:    vmaxpd (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_maxpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmaxpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vmaxpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_maxpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmaxpd %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
@@ -2393,6 +2650,12 @@ define <8 x float> @test_maxps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
 ; SKX-NEXT:    vmaxps (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_maxps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmaxps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vmaxps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_maxps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmaxps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
@@ -2448,6 +2711,12 @@ define <4 x double> @test_minpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; SKX-NEXT:    vminpd (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_minpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vminpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vminpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_minpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vminpd %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
@@ -2503,6 +2772,12 @@ define <8 x float> @test_minps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
 ; SKX-NEXT:    vminps (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_minps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vminps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vminps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_minps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vminps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
@@ -2564,6 +2839,13 @@ define <4 x double> @test_movapd(<4 x double> *%a0, <4 x double> *%a1) {
 ; SKX-NEXT:    vmovapd %ymm0, (%rsi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_movapd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovapd (%rdi), %ymm0 # sched: [7:0.50]
+; BDVER2-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vmovapd %ymm0, (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_movapd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovapd (%rdi), %ymm0 # sched: [5:1.00]
@@ -2626,6 +2908,13 @@ define <8 x float> @test_movaps(<8 x float> *%a0, <8 x float> *%a1) {
 ; SKX-NEXT:    vmovaps %ymm0, (%rsi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_movaps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovaps (%rdi), %ymm0 # sched: [7:0.50]
+; BDVER2-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vmovaps %ymm0, (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_movaps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovaps (%rdi), %ymm0 # sched: [5:1.00]
@@ -2688,6 +2977,13 @@ define <4 x double> @test_movddup(<4 x double> %a0, <4 x double> *%a1) {
 ; SKX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_movddup:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:1.00]
+; BDVER2-NEXT:    vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [7:0.50]
+; BDVER2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_movddup:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [6:2.00]
@@ -2745,6 +3041,12 @@ define i32 @test_movmskpd(<4 x double> %a0) {
 ; SKX-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_movmskpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovmskpd %ymm0, %eax # sched: [2:1.00]
+; BDVER2-NEXT:    vzeroupper # sched: [100:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_movmskpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovmskpd %ymm0, %eax # sched: [3:1.00]
@@ -2797,6 +3099,12 @@ define i32 @test_movmskps(<8 x float> %a0) {
 ; SKX-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_movmskps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovmskps %ymm0, %eax # sched: [2:1.00]
+; BDVER2-NEXT:    vzeroupper # sched: [100:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_movmskps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovmskps %ymm0, %eax # sched: [3:1.00]
@@ -2861,6 +3169,14 @@ define void @test_movntdq(<4 x i64> %a0, <4 x i64> *%a1) {
 ; SKX-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_movntdq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vmovntdq %ymm0, (%rdi) # sched: [1:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    vzeroupper # sched: [100:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_movntdq:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -2916,6 +3232,12 @@ define <4 x double> @test_movntpd(<4 x double> %a0, <4 x double> *%a1) {
 ; SKX-NEXT:    vmovntpd %ymm0, (%rdi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_movntpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vmovntpd %ymm0, (%rdi) # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_movntpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:2.00]
@@ -2969,6 +3291,12 @@ define <8 x float> @test_movntps(<8 x float> %a0, <8 x float> *%a1) {
 ; SKX-NEXT:    vmovntps %ymm0, (%rdi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_movntps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vmovntps %ymm0, (%rdi) # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_movntps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [3:2.00]
@@ -3028,6 +3356,13 @@ define <8 x float> @test_movshdup(<8 x float> %a0, <8 x float> *%a1) {
 ; SKX-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_movshdup:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:1.00]
+; BDVER2-NEXT:    vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [7:0.50]
+; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_movshdup:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [6:2.00]
@@ -3091,6 +3426,13 @@ define <8 x float> @test_movsldup(<8 x float> %a0, <8 x float> *%a1) {
 ; SKX-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_movsldup:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [1:1.00]
+; BDVER2-NEXT:    vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [7:0.50]
+; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_movsldup:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [6:2.00]
@@ -3156,6 +3498,13 @@ define <4 x double> @test_movupd(<4 x double> *%a0, <4 x double> *%a1) {
 ; SKX-NEXT:    vmovupd %ymm0, (%rsi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_movupd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovupd (%rdi), %ymm0 # sched: [7:0.50]
+; BDVER2-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vmovupd %ymm0, (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_movupd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovupd (%rdi), %ymm0 # sched: [5:1.00]
@@ -3220,6 +3569,13 @@ define <8 x float> @test_movups(<8 x float> *%a0, <8 x float> *%a1) {
 ; SKX-NEXT:    vmovups %ymm0, (%rsi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_movups:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovups (%rdi), %ymm0 # sched: [7:0.50]
+; BDVER2-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vmovups %ymm0, (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_movups:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovups (%rdi), %ymm0 # sched: [5:1.00]
@@ -3276,6 +3632,12 @@ define <4 x double> @test_mulpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; SKX-NEXT:    vmulpd (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_mulpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmulpd %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vmulpd (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_mulpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmulpd %ymm1, %ymm0, %ymm0 # sched: [4:4.00]
@@ -3330,6 +3692,12 @@ define <8 x float> @test_mulps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
 ; SKX-NEXT:    vmulps (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_mulps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vmulps (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_mulps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
@@ -3390,6 +3758,13 @@ define <4 x double> @orpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2)
 ; SKX-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: orpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; BDVER2-NEXT:    vorpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BDVER2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: orpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
@@ -3457,6 +3832,13 @@ define <8 x float> @test_orps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2
 ; SKX-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_orps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; BDVER2-NEXT:    vorps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BDVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_orps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
@@ -3524,6 +3906,13 @@ define <4 x double> @test_perm2f128(<4 x double> %a0, <4 x double> %a1, <4 x dou
 ; SKX-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_perm2f128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; BDVER2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
+; BDVER2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_perm2f128:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
@@ -3587,6 +3976,13 @@ define <2 x double> @test_permilpd(<2 x double> %a0, <2 x double> *%a1) {
 ; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_permilpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0] sched: [1:1.00]
+; BDVER2-NEXT:    vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [7:1.00]
+; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_permilpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [6:1.00]
@@ -3650,6 +4046,13 @@ define <4 x double> @test_permilpd_ymm(<4 x double> %a0, <4 x double> *%a1) {
 ; SKX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_permilpd_ymm:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [1:1.00]
+; BDVER2-NEXT:    vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [8:1.00]
+; BDVER2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_permilpd_ymm:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [6:2.00]
@@ -3713,6 +4116,13 @@ define <4 x float> @test_permilps(<4 x float> %a0, <4 x float> *%a1) {
 ; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_permilps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] sched: [1:1.00]
+; BDVER2-NEXT:    vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [7:1.00]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_permilps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [6:1.00]
@@ -3776,6 +4186,13 @@ define <8 x float> @test_permilps_ymm(<8 x float> %a0, <8 x float> *%a1) {
 ; SKX-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_permilps_ymm:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:1.00]
+; BDVER2-NEXT:    vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [8:1.00]
+; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_permilps_ymm:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [6:2.00]
@@ -3833,6 +4250,12 @@ define <2 x double> @test_permilvarpd(<2 x double> %a0, <2 x i64> %a1, <2 x i64>
 ; SKX-NEXT:    vpermilpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_permilvarpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; BDVER2-NEXT:    vpermilpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_permilvarpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0 # sched: [2:2.00]
@@ -3888,6 +4311,12 @@ define <4 x double> @test_permilvarpd_ymm(<4 x double> %a0, <4 x i64> %a1, <4 x
 ; SKX-NEXT:    vpermilpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_permilvarpd_ymm:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; BDVER2-NEXT:    vpermilpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_permilvarpd_ymm:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0 # sched: [3:3.00]
@@ -3943,6 +4372,12 @@ define <4 x float> @test_permilvarps(<4 x float> %a0, <4 x i32> %a1, <4 x i32> *
 ; SKX-NEXT:    vpermilps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_permilvarps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpermilps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; BDVER2-NEXT:    vpermilps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_permilvarps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vpermilps %xmm1, %xmm0, %xmm0 # sched: [2:2.00]
@@ -3998,6 +4433,12 @@ define <8 x float> @test_permilvarps_ymm(<8 x float> %a0, <8 x i32> %a1, <8 x i3
 ; SKX-NEXT:    vpermilps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_permilvarps_ymm:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpermilps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; BDVER2-NEXT:    vpermilps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_permilvarps_ymm:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vpermilps %ymm1, %ymm0, %ymm0 # sched: [3:3.00]
@@ -4059,6 +4500,13 @@ define <8 x float> @test_rcpps(<8 x float> %a0, <8 x float> *%a1) {
 ; SKX-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_rcpps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpps (%rdi), %ymm1 # sched: [14:2.00]
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [7:2.00]
+; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_rcpps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vrcpps (%rdi), %ymm1 # sched: [7:2.00]
@@ -4123,6 +4571,13 @@ define <4 x double> @test_roundpd(<4 x double> %a0, <4 x double> *%a1) {
 ; SKX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_roundpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vroundpd $7, %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vroundpd $7, (%rdi), %ymm1 # sched: [10:1.00]
+; BDVER2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_roundpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vroundpd $7, (%rdi), %ymm1 # sched: [8:2.00]
@@ -4187,6 +4642,13 @@ define <8 x float> @test_roundps(<8 x float> %a0, <8 x float> *%a1) {
 ; SKX-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_roundps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vroundps $7, %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vroundps $7, (%rdi), %ymm1 # sched: [10:1.00]
+; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_roundps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vroundps $7, (%rdi), %ymm1 # sched: [8:2.00]
@@ -4251,6 +4713,13 @@ define <8 x float> @test_rsqrtps(<8 x float> %a0, <8 x float> *%a1) {
 ; SKX-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_rsqrtps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrsqrtps (%rdi), %ymm1 # sched: [14:2.00]
+; BDVER2-NEXT:    vrsqrtps %ymm0, %ymm0 # sched: [7:2.00]
+; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_rsqrtps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vrsqrtps (%rdi), %ymm1 # sched: [7:2.00]
@@ -4315,6 +4784,13 @@ define <4 x double> @test_shufpd(<4 x double> %a0, <4 x double> %a1, <4 x double
 ; SKX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_shufpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:1.00]
+; BDVER2-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [8:1.00]
+; BDVER2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_shufpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:1.00]
@@ -4378,6 +4854,13 @@ define <8 x float> @test_shufps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%
 ; SKX-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_shufps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] sched: [1:1.00]
+; BDVER2-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,3],mem[0,0],ymm1[4,7],mem[4,4] sched: [8:1.00]
+; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_shufps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] sched: [1:1.00]
@@ -4441,6 +4924,13 @@ define <4 x double> @test_sqrtpd(<4 x double> %a0, <4 x double> *%a1) {
 ; SKX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_sqrtpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vsqrtpd (%rdi), %ymm1 # sched: [52:44.00]
+; BDVER2-NEXT:    vsqrtpd %ymm0, %ymm0 # sched: [45:44.00]
+; BDVER2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_sqrtpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vsqrtpd (%rdi), %ymm1 # sched: [59:54.00]
@@ -4505,6 +4995,13 @@ define <8 x float> @test_sqrtps(<8 x float> %a0, <8 x float> *%a1) {
 ; SKX-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_sqrtps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vsqrtps (%rdi), %ymm1 # sched: [36:28.00]
+; BDVER2-NEXT:    vsqrtps %ymm0, %ymm0 # sched: [29:28.00]
+; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_sqrtps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vsqrtps (%rdi), %ymm1 # sched: [47:42.00]
@@ -4563,6 +5060,12 @@ define <4 x double> @test_subpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; SKX-NEXT:    vsubpd (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_subpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vsubpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_subpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
@@ -4617,6 +5120,12 @@ define <8 x float> @test_subps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
 ; SKX-NEXT:    vsubps (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_subps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vsubps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_subps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
@@ -4689,6 +5198,15 @@ define i32 @test_testpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
 ; SKX-NEXT:    adcl $0, %eax # sched: [1:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_testpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    xorl %eax, %eax # sched: [0:0.25]
+; BDVER2-NEXT:    vtestpd %xmm1, %xmm0 # sched: [1:1.00]
+; BDVER2-NEXT:    setb %al # sched: [1:0.50]
+; BDVER2-NEXT:    vtestpd (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER2-NEXT:    adcl $0, %eax # sched: [2:0.67]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_testpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    xorl %eax, %eax # sched: [0:0.50]
@@ -4775,6 +5293,16 @@ define i32 @test_testpd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a
 ; SKX-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_testpd_ymm:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    xorl %eax, %eax # sched: [0:0.25]
+; BDVER2-NEXT:    vtestpd %ymm1, %ymm0 # sched: [1:1.00]
+; BDVER2-NEXT:    setb %al # sched: [1:0.50]
+; BDVER2-NEXT:    vtestpd (%rdi), %ymm0 # sched: [8:1.00]
+; BDVER2-NEXT:    adcl $0, %eax # sched: [2:0.67]
+; BDVER2-NEXT:    vzeroupper # sched: [100:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_testpd_ymm:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    xorl %eax, %eax # sched: [0:0.50]
@@ -4856,6 +5384,15 @@ define i32 @test_testps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
 ; SKX-NEXT:    adcl $0, %eax # sched: [1:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_testps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    xorl %eax, %eax # sched: [0:0.25]
+; BDVER2-NEXT:    vtestps %xmm1, %xmm0 # sched: [1:1.00]
+; BDVER2-NEXT:    setb %al # sched: [1:0.50]
+; BDVER2-NEXT:    vtestps (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER2-NEXT:    adcl $0, %eax # sched: [2:0.67]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_testps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    xorl %eax, %eax # sched: [0:0.50]
@@ -4942,6 +5479,16 @@ define i32 @test_testps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2)
 ; SKX-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_testps_ymm:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    xorl %eax, %eax # sched: [0:0.25]
+; BDVER2-NEXT:    vtestps %ymm1, %ymm0 # sched: [1:1.00]
+; BDVER2-NEXT:    setb %al # sched: [1:0.50]
+; BDVER2-NEXT:    vtestps (%rdi), %ymm0 # sched: [8:1.00]
+; BDVER2-NEXT:    adcl $0, %eax # sched: [2:0.67]
+; BDVER2-NEXT:    vzeroupper # sched: [100:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_testps_ymm:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    xorl %eax, %eax # sched: [0:0.50]
@@ -5011,6 +5558,13 @@ define <4 x double> @test_unpckhpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
 ; SKX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_unpckhpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
+; BDVER2-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [8:1.00]
+; BDVER2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_unpckhpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
@@ -5068,6 +5622,12 @@ define <8 x float> @test_unpckhps(<8 x float> %a0, <8 x float> %a1, <8 x float>
 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_unpckhps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
+; BDVER2-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_unpckhps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
@@ -5128,6 +5688,13 @@ define <4 x double> @test_unpcklpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
 ; SKX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_unpcklpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
+; BDVER2-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [8:1.00]
+; BDVER2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_unpcklpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
@@ -5185,6 +5752,12 @@ define <8 x float> @test_unpcklps(<8 x float> %a0, <8 x float> %a1, <8 x float>
 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_unpcklps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
+; BDVER2-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_unpcklps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
@@ -5245,6 +5818,13 @@ define <4 x double> @test_xorpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; SKX-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_xorpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; BDVER2-NEXT:    vxorpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BDVER2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_xorpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
@@ -5312,6 +5892,13 @@ define <8 x float> @test_xorps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
 ; SKX-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_xorps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vxorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; BDVER2-NEXT:    vxorps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BDVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_xorps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vxorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
@@ -5367,6 +5954,11 @@ define void @test_zeroall() {
 ; SKX-NEXT:    vzeroall # sched: [12:5.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_zeroall:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vzeroall # sched: [9:2.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_zeroall:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vzeroall # sched: [90:36.50]
@@ -5412,6 +6004,11 @@ define void @test_zeroupper() {
 ; SKX-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_zeroupper:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vzeroupper # sched: [100:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_zeroupper:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vzeroupper # sched: [46:18.50]
@@ -5486,6 +6083,16 @@ define void @test_avx256_zero_idioms() {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_avx256_zero_idioms:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vxorps %ymm0, %ymm0, %ymm0 # sched: [1:1.00]
+; BDVER2-NEXT:    vxorpd %ymm1, %ymm1, %ymm1 # sched: [1:1.00]
+; BDVER2-NEXT:    vandnps %ymm2, %ymm2, %ymm2 # sched: [1:1.00]
+; BDVER2-NEXT:    vandnpd %ymm3, %ymm3, %ymm3 # sched: [1:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_avx256_zero_idioms:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
diff --git a/test/CodeGen/X86/avx-vzeroupper.ll b/test/CodeGen/X86/avx-vzeroupper.ll
index e5bff60109e..824a3ffba6b 100644
--- a/test/CodeGen/X86/avx-vzeroupper.ll
+++ b/test/CodeGen/X86/avx-vzeroupper.ll
@@ -2,6 +2,7 @@
 ; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=VZ --check-prefix=AVX
 ; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=VZ --check-prefix=AVX512
 ; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-partial-ymm-or-zmm-write | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=FAST-ymm-zmm
+; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=BDVER2
 ; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=BTVER2
 
 declare i32 @foo()
@@ -56,6 +57,20 @@ define <8 x float> @test01(<4 x float> %a, <4 x float> %b, <8 x float> %c) nounw
 ; FAST-ymm-zmm-NEXT:    addq $56, %rsp
 ; FAST-ymm-zmm-NEXT:    retq
 ;
+; BDVER2-LABEL: test01:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    subq $56, %rsp
+; BDVER2-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
+; BDVER2-NEXT:    vmovaps {{.*}}(%rip), %xmm0
+; BDVER2-NEXT:    vzeroupper
+; BDVER2-NEXT:    callq do_sse
+; BDVER2-NEXT:    vmovaps %xmm0, {{.*}}(%rip)
+; BDVER2-NEXT:    callq do_sse
+; BDVER2-NEXT:    vmovaps %xmm0, {{.*}}(%rip)
+; BDVER2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
+; BDVER2-NEXT:    addq $56, %rsp
+; BDVER2-NEXT:    retq
+;
 ; BTVER2-LABEL: test01:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    subq $56, %rsp
@@ -86,11 +101,24 @@ define <4 x float> @test02(<8 x float> %a, <8 x float> %b) nounwind {
 ; VZ-NEXT:    vzeroupper
 ; VZ-NEXT:    jmp do_sse # TAILCALL
 ;
-; NO-VZ-LABEL: test02:
-; NO-VZ:       # %bb.0:
-; NO-VZ-NEXT:    vaddps %ymm1, %ymm0, %ymm0
-; NO-VZ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; NO-VZ-NEXT:    jmp do_sse # TAILCALL
+; FAST-ymm-zmm-LABEL: test02:
+; FAST-ymm-zmm:       # %bb.0:
+; FAST-ymm-zmm-NEXT:    vaddps %ymm1, %ymm0, %ymm0
+; FAST-ymm-zmm-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; FAST-ymm-zmm-NEXT:    jmp do_sse # TAILCALL
+;
+; BDVER2-LABEL: test02:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
+; BDVER2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; BDVER2-NEXT:    vzeroupper
+; BDVER2-NEXT:    jmp do_sse # TAILCALL
+;
+; BTVER2-LABEL: test02:
+; BTVER2:       # %bb.0:
+; BTVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
+; BTVER2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; BTVER2-NEXT:    jmp do_sse # TAILCALL
   %add.i = fadd <8 x float> %a, %b
   %add.low = call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %add.i, i8 0)
   %call3 = tail call <4 x float> @do_sse(<4 x float> %add.low) nounwind
@@ -162,6 +190,37 @@ define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind {
 ; FAST-ymm-zmm-NEXT:    popq %rbx
 ; FAST-ymm-zmm-NEXT:    retq
 ;
+; BDVER2-LABEL: test03:
+; BDVER2:       # %bb.0: # %entry
+; BDVER2-NEXT:    pushq %rbx
+; BDVER2-NEXT:    subq $16, %rsp
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; BDVER2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; BDVER2-NEXT:    .p2align 4, 0x90
+; BDVER2-NEXT:  .LBB3_1: # %while.cond
+; BDVER2-NEXT:    # =>This Inner Loop Header: Depth=1
+; BDVER2-NEXT:    callq foo
+; BDVER2-NEXT:    testl %eax, %eax
+; BDVER2-NEXT:    jne .LBB3_1
+; BDVER2-NEXT:  # %bb.2: # %for.body.preheader
+; BDVER2-NEXT:    movl $4, %ebx
+; BDVER2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; BDVER2-NEXT:    .p2align 4, 0x90
+; BDVER2-NEXT:  .LBB3_3: # %for.body
+; BDVER2-NEXT:    # =>This Inner Loop Header: Depth=1
+; BDVER2-NEXT:    callq do_sse
+; BDVER2-NEXT:    callq do_sse
+; BDVER2-NEXT:    vmovaps {{.*}}(%rip), %ymm0
+; BDVER2-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; BDVER2-NEXT:    vzeroupper
+; BDVER2-NEXT:    callq do_sse
+; BDVER2-NEXT:    addl $-1, %ebx
+; BDVER2-NEXT:    jne .LBB3_3
+; BDVER2-NEXT:  # %bb.4: # %for.end
+; BDVER2-NEXT:    addq $16, %rsp
+; BDVER2-NEXT:    popq %rbx
+; BDVER2-NEXT:    retq
+;
 ; BTVER2-LABEL: test03:
 ; BTVER2:       # %bb.0: # %entry
 ; BTVER2-NEXT:    pushq %rbx
@@ -230,15 +289,36 @@ define <4 x float> @test04(<4 x float> %a, <4 x float> %b) nounwind {
 ; VZ-NEXT:    vzeroupper
 ; VZ-NEXT:    retq
 ;
-; NO-VZ-LABEL: test04:
-; NO-VZ:       # %bb.0:
-; NO-VZ-NEXT:    pushq %rax
-; NO-VZ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; NO-VZ-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; NO-VZ-NEXT:    callq do_avx
-; NO-VZ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; NO-VZ-NEXT:    popq %rax
-; NO-VZ-NEXT:    retq
+; FAST-ymm-zmm-LABEL: test04:
+; FAST-ymm-zmm:       # %bb.0:
+; FAST-ymm-zmm-NEXT:    pushq %rax
+; FAST-ymm-zmm-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; FAST-ymm-zmm-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; FAST-ymm-zmm-NEXT:    callq do_avx
+; FAST-ymm-zmm-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; FAST-ymm-zmm-NEXT:    popq %rax
+; FAST-ymm-zmm-NEXT:    retq
+;
+; BDVER2-LABEL: test04:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pushq %rax
+; BDVER2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; BDVER2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; BDVER2-NEXT:    callq do_avx
+; BDVER2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; BDVER2-NEXT:    popq %rax
+; BDVER2-NEXT:    vzeroupper
+; BDVER2-NEXT:    retq
+;
+; BTVER2-LABEL: test04:
+; BTVER2:       # %bb.0:
+; BTVER2-NEXT:    pushq %rax
+; BTVER2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; BTVER2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; BTVER2-NEXT:    callq do_avx
+; BTVER2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; BTVER2-NEXT:    popq %rax
+; BTVER2-NEXT:    retq
   %shuf = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %call = call <8 x float> @do_avx(<8 x float> %shuf) nounwind
   %shuf2 = shufflevector <8 x float> %call, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/test/CodeGen/X86/bmi-schedule.ll b/test/CodeGen/X86/bmi-schedule.ll
index bd0ba7e72c8..174efd2cfe9 100644
--- a/test/CodeGen/X86/bmi-schedule.ll
+++ b/test/CodeGen/X86/bmi-schedule.ll
@@ -4,6 +4,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl     | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+bmi  | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2  | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1  | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
 
@@ -36,6 +37,13 @@ define i32 @test_andn_i32(i32 %a0, i32 %a1, i32 *%a2) {
 ; SKYLAKE-NEXT:    addl %ecx, %eax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_andn_i32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    andnl %esi, %edi, %ecx # sched: [1:0.33]
+; BDVER2-NEXT:    andnl (%rdx), %edi, %eax # sched: [6:0.50]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_andn_i32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    andnl (%rdx), %edi, %eax # sched: [4:1.00]
@@ -86,6 +94,13 @@ define i64 @test_andn_i64(i64 %a0, i64 %a1, i64 *%a2) {
 ; SKYLAKE-NEXT:    addq %rcx, %rax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_andn_i64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    andnq %rsi, %rdi, %rcx # sched: [1:0.33]
+; BDVER2-NEXT:    andnq (%rdx), %rdi, %rax # sched: [6:0.50]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_andn_i64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    andnq (%rdx), %rdi, %rax # sched: [4:1.00]
@@ -136,6 +151,13 @@ define i32 @test_bextr_i32(i32 %a0, i32 %a1, i32 *%a2) {
 ; SKYLAKE-NEXT:    addl %ecx, %eax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_bextr_i32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    bextrl %edi, (%rdx), %ecx # sched: [7:1.00]
+; BDVER2-NEXT:    bextrl %edi, %esi, %eax # sched: [2:1.00]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_bextr_i32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    bextrl %edi, (%rdx), %ecx # sched: [4:1.00]
@@ -186,6 +208,13 @@ define i64 @test_bextr_i64(i64 %a0, i64 %a1, i64 *%a2) {
 ; SKYLAKE-NEXT:    addq %rcx, %rax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_bextr_i64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    bextrq %rdi, (%rdx), %rcx # sched: [7:1.00]
+; BDVER2-NEXT:    bextrq %rdi, %rsi, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_bextr_i64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    bextrq %rdi, (%rdx), %rcx # sched: [4:1.00]
@@ -236,6 +265,13 @@ define i32 @test_blsi_i32(i32 %a0, i32 *%a1) {
 ; SKYLAKE-NEXT:    addl %ecx, %eax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_blsi_i32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    blsil (%rsi), %ecx # sched: [6:0.50]
+; BDVER2-NEXT:    blsil %edi, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_blsi_i32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    blsil (%rsi), %ecx # sched: [5:1.00]
@@ -287,6 +323,13 @@ define i64 @test_blsi_i64(i64 %a0, i64 *%a1) {
 ; SKYLAKE-NEXT:    addq %rcx, %rax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_blsi_i64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    blsiq (%rsi), %rcx # sched: [6:0.50]
+; BDVER2-NEXT:    blsiq %rdi, %rax # sched: [1:0.33]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_blsi_i64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    blsiq (%rsi), %rcx # sched: [5:1.00]
@@ -338,6 +381,13 @@ define i32 @test_blsmsk_i32(i32 %a0, i32 *%a1) {
 ; SKYLAKE-NEXT:    addl %ecx, %eax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_blsmsk_i32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    blsmskl (%rsi), %ecx # sched: [6:0.50]
+; BDVER2-NEXT:    blsmskl %edi, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_blsmsk_i32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    blsmskl (%rsi), %ecx # sched: [5:1.00]
@@ -389,6 +439,13 @@ define i64 @test_blsmsk_i64(i64 %a0, i64 *%a1) {
 ; SKYLAKE-NEXT:    addq %rcx, %rax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_blsmsk_i64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    blsmskq (%rsi), %rcx # sched: [6:0.50]
+; BDVER2-NEXT:    blsmskq %rdi, %rax # sched: [1:0.33]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_blsmsk_i64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    blsmskq (%rsi), %rcx # sched: [5:1.00]
@@ -440,6 +497,13 @@ define i32 @test_blsr_i32(i32 %a0, i32 *%a1) {
 ; SKYLAKE-NEXT:    addl %ecx, %eax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_blsr_i32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    blsrl (%rsi), %ecx # sched: [6:0.50]
+; BDVER2-NEXT:    blsrl %edi, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_blsr_i32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    blsrl (%rsi), %ecx # sched: [5:1.00]
@@ -491,6 +555,13 @@ define i64 @test_blsr_i64(i64 %a0, i64 *%a1) {
 ; SKYLAKE-NEXT:    addq %rcx, %rax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_blsr_i64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    blsrq (%rsi), %rcx # sched: [6:0.50]
+; BDVER2-NEXT:    blsrq %rdi, %rax # sched: [1:0.33]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_blsr_i64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    blsrq (%rsi), %rcx # sched: [5:1.00]
@@ -546,6 +617,14 @@ define i16 @test_cttz_i16(i16 zeroext %a0, i16 *%a1) {
 ; SKYLAKE-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cttz_i16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    tzcntw (%rsi), %cx # sched: [8:1.00]
+; BDVER2-NEXT:    tzcntw %di, %ax # sched: [3:1.00]
+; BDVER2-NEXT:    orl %ecx, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    # kill: def $ax killed $ax killed $eax
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_cttz_i16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    tzcntw (%rsi), %cx # sched: [5:1.00]
@@ -598,6 +677,13 @@ define i32 @test_cttz_i32(i32 %a0, i32 *%a1) {
 ; SKYLAKE-NEXT:    orl %ecx, %eax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cttz_i32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    tzcntl (%rsi), %ecx # sched: [8:1.00]
+; BDVER2-NEXT:    tzcntl %edi, %eax # sched: [3:1.00]
+; BDVER2-NEXT:    orl %ecx, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_cttz_i32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    tzcntl (%rsi), %ecx # sched: [5:1.00]
@@ -648,6 +734,13 @@ define i64 @test_cttz_i64(i64 %a0, i64 *%a1) {
 ; SKYLAKE-NEXT:    orq %rcx, %rax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cttz_i64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    tzcntq (%rsi), %rcx # sched: [8:1.00]
+; BDVER2-NEXT:    tzcntq %rdi, %rax # sched: [3:1.00]
+; BDVER2-NEXT:    orq %rcx, %rax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_cttz_i64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    tzcntq (%rsi), %rcx # sched: [5:1.00]
diff --git a/test/CodeGen/X86/cmov-schedule.ll b/test/CodeGen/X86/cmov-schedule.ll
index 8993c30d1f8..93c771e305a 100644
--- a/test/CodeGen/X86/cmov-schedule.ll
+++ b/test/CodeGen/X86/cmov-schedule.ll
@@ -8,6 +8,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
 
@@ -540,6 +541,72 @@ define void @test_cmov_16(i16 %a0, i16 %a1, i16 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cmov_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    cmovow %si, %di # sched: [2:0.67]
+; BDVER2-NEXT:    cmovnow %si, %di # sched: [2:0.67]
+; BDVER2-NEXT:    cmovbw %si, %di # sched: [2:0.67]
+; BDVER2-NEXT:    cmovbw %si, %di # sched: [2:0.67]
+; BDVER2-NEXT:    cmovbw %si, %di # sched: [2:0.67]
+; BDVER2-NEXT:    cmovaew %si, %di # sched: [2:0.67]
+; BDVER2-NEXT:    cmovaew %si, %di # sched: [2:0.67]
+; BDVER2-NEXT:    cmovaew %si, %di # sched: [2:0.67]
+; BDVER2-NEXT:    cmovew %si, %di # sched: [2:0.67]
+; BDVER2-NEXT:    cmovew %si, %di # sched: [2:0.67]
+; BDVER2-NEXT:    cmovnew %si, %di # sched: [2:0.67]
+; BDVER2-NEXT:    cmovnew %si, %di # sched: [2:0.67]
+; BDVER2-NEXT:    cmovbew %si, %di # sched: [3:1.00]
+; BDVER2-NEXT:    cmovbew %si, %di # sched: [3:1.00]
+; BDVER2-NEXT:    cmovaw %si, %di # sched: [3:1.00]
+; BDVER2-NEXT:    cmovaw %si, %di # sched: [3:1.00]
+; BDVER2-NEXT:    cmovsw %si, %di # sched: [2:0.67]
+; BDVER2-NEXT:    cmovnsw %si, %di # sched: [2:0.67]
+; BDVER2-NEXT:    cmovpw %si, %di # sched: [2:0.67]
+; BDVER2-NEXT:    cmovpw %si, %di # sched: [2:0.67]
+; BDVER2-NEXT:    cmovnpw %si, %di # sched: [2:0.67]
+; BDVER2-NEXT:    cmovnpw %si, %di # sched: [2:0.67]
+; BDVER2-NEXT:    cmovlw %si, %di # sched: [2:0.67]
+; BDVER2-NEXT:    cmovlw %si, %di # sched: [2:0.67]
+; BDVER2-NEXT:    cmovgew %si, %di # sched: [2:0.67]
+; BDVER2-NEXT:    cmovgew %si, %di # sched: [2:0.67]
+; BDVER2-NEXT:    cmovlew %si, %di # sched: [2:0.67]
+; BDVER2-NEXT:    cmovlew %si, %di # sched: [2:0.67]
+; BDVER2-NEXT:    cmovgw %si, %di # sched: [2:0.67]
+; BDVER2-NEXT:    cmovgw %si, %di # sched: [2:0.67]
+; BDVER2-NEXT:    cmovow (%rdx), %di # sched: [7:0.67]
+; BDVER2-NEXT:    cmovnow (%rdx), %di # sched: [7:0.67]
+; BDVER2-NEXT:    cmovbw (%rdx), %di # sched: [7:0.67]
+; BDVER2-NEXT:    cmovbw (%rdx), %di # sched: [7:0.67]
+; BDVER2-NEXT:    cmovbw (%rdx), %di # sched: [7:0.67]
+; BDVER2-NEXT:    cmovaew (%rdx), %di # sched: [7:0.67]
+; BDVER2-NEXT:    cmovaew (%rdx), %di # sched: [7:0.67]
+; BDVER2-NEXT:    cmovaew (%rdx), %di # sched: [7:0.67]
+; BDVER2-NEXT:    cmovew (%rdx), %di # sched: [7:0.67]
+; BDVER2-NEXT:    cmovew (%rdx), %di # sched: [7:0.67]
+; BDVER2-NEXT:    cmovnew (%rdx), %di # sched: [7:0.67]
+; BDVER2-NEXT:    cmovnew (%rdx), %di # sched: [7:0.67]
+; BDVER2-NEXT:    cmovbew (%rdx), %di # sched: [8:1.00]
+; BDVER2-NEXT:    cmovbew (%rdx), %di # sched: [8:1.00]
+; BDVER2-NEXT:    cmovaw (%rdx), %di # sched: [8:1.00]
+; BDVER2-NEXT:    cmovaw (%rdx), %di # sched: [8:1.00]
+; BDVER2-NEXT:    cmovsw (%rdx), %di # sched: [7:0.67]
+; BDVER2-NEXT:    cmovnsw (%rdx), %di # sched: [7:0.67]
+; BDVER2-NEXT:    cmovpw (%rdx), %di # sched: [7:0.67]
+; BDVER2-NEXT:    cmovpw (%rdx), %di # sched: [7:0.67]
+; BDVER2-NEXT:    cmovnpw (%rdx), %di # sched: [7:0.67]
+; BDVER2-NEXT:    cmovnpw (%rdx), %di # sched: [7:0.67]
+; BDVER2-NEXT:    cmovlw (%rdx), %di # sched: [7:0.67]
+; BDVER2-NEXT:    cmovlw (%rdx), %di # sched: [7:0.67]
+; BDVER2-NEXT:    cmovgew (%rdx), %di # sched: [7:0.67]
+; BDVER2-NEXT:    cmovgew (%rdx), %di # sched: [7:0.67]
+; BDVER2-NEXT:    cmovlew (%rdx), %di # sched: [7:0.67]
+; BDVER2-NEXT:    cmovlew (%rdx), %di # sched: [7:0.67]
+; BDVER2-NEXT:    cmovgw (%rdx), %di # sched: [7:0.67]
+; BDVER2-NEXT:    cmovgw (%rdx), %di # sched: [7:0.67]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_cmov_16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -1204,6 +1271,72 @@ define void @test_cmov_32(i32 %a0, i32 %a1, i32 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cmov_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    cmovol %esi, %edi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovnol %esi, %edi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovbl %esi, %edi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovbl %esi, %edi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovbl %esi, %edi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovael %esi, %edi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovael %esi, %edi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovael %esi, %edi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovel %esi, %edi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovel %esi, %edi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovnel %esi, %edi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovnel %esi, %edi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovbel %esi, %edi # sched: [3:1.00]
+; BDVER2-NEXT:    cmovbel %esi, %edi # sched: [3:1.00]
+; BDVER2-NEXT:    cmoval %esi, %edi # sched: [3:1.00]
+; BDVER2-NEXT:    cmoval %esi, %edi # sched: [3:1.00]
+; BDVER2-NEXT:    cmovsl %esi, %edi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovnsl %esi, %edi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovpl %esi, %edi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovpl %esi, %edi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovnpl %esi, %edi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovnpl %esi, %edi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovll %esi, %edi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovll %esi, %edi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovgel %esi, %edi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovgel %esi, %edi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovlel %esi, %edi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovlel %esi, %edi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovgl %esi, %edi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovgl %esi, %edi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovol (%rdx), %edi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovnol (%rdx), %edi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovbl (%rdx), %edi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovbl (%rdx), %edi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovbl (%rdx), %edi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovael (%rdx), %edi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovael (%rdx), %edi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovael (%rdx), %edi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovel (%rdx), %edi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovel (%rdx), %edi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovnel (%rdx), %edi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovnel (%rdx), %edi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovbel (%rdx), %edi # sched: [8:1.00]
+; BDVER2-NEXT:    cmovbel (%rdx), %edi # sched: [8:1.00]
+; BDVER2-NEXT:    cmoval (%rdx), %edi # sched: [8:1.00]
+; BDVER2-NEXT:    cmoval (%rdx), %edi # sched: [8:1.00]
+; BDVER2-NEXT:    cmovsl (%rdx), %edi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovnsl (%rdx), %edi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovpl (%rdx), %edi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovpl (%rdx), %edi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovnpl (%rdx), %edi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovnpl (%rdx), %edi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovll (%rdx), %edi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovll (%rdx), %edi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovgel (%rdx), %edi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovgel (%rdx), %edi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovlel (%rdx), %edi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovlel (%rdx), %edi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovgl (%rdx), %edi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovgl (%rdx), %edi # sched: [7:0.67]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_cmov_32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -1868,6 +2001,72 @@ define void @test_cmov_64(i64 %a0, i64 %a1, i64 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cmov_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    cmovoq %rsi, %rdi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovnoq %rsi, %rdi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovbq %rsi, %rdi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovbq %rsi, %rdi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovbq %rsi, %rdi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovaeq %rsi, %rdi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovaeq %rsi, %rdi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovaeq %rsi, %rdi # sched: [2:0.67]
+; BDVER2-NEXT:    cmoveq %rsi, %rdi # sched: [2:0.67]
+; BDVER2-NEXT:    cmoveq %rsi, %rdi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovneq %rsi, %rdi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovneq %rsi, %rdi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovbeq %rsi, %rdi # sched: [3:1.00]
+; BDVER2-NEXT:    cmovbeq %rsi, %rdi # sched: [3:1.00]
+; BDVER2-NEXT:    cmovaq %rsi, %rdi # sched: [3:1.00]
+; BDVER2-NEXT:    cmovaq %rsi, %rdi # sched: [3:1.00]
+; BDVER2-NEXT:    cmovsq %rsi, %rdi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovnsq %rsi, %rdi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovpq %rsi, %rdi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovpq %rsi, %rdi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovnpq %rsi, %rdi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovnpq %rsi, %rdi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovlq %rsi, %rdi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovlq %rsi, %rdi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovgeq %rsi, %rdi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovgeq %rsi, %rdi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovleq %rsi, %rdi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovleq %rsi, %rdi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovgq %rsi, %rdi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovgq %rsi, %rdi # sched: [2:0.67]
+; BDVER2-NEXT:    cmovoq (%rdx), %rdi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovnoq (%rdx), %rdi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovbq (%rdx), %rdi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovbq (%rdx), %rdi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovbq (%rdx), %rdi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovaeq (%rdx), %rdi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovaeq (%rdx), %rdi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovaeq (%rdx), %rdi # sched: [7:0.67]
+; BDVER2-NEXT:    cmoveq (%rdx), %rdi # sched: [7:0.67]
+; BDVER2-NEXT:    cmoveq (%rdx), %rdi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovneq (%rdx), %rdi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovneq (%rdx), %rdi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovbeq (%rdx), %rdi # sched: [8:1.00]
+; BDVER2-NEXT:    cmovbeq (%rdx), %rdi # sched: [8:1.00]
+; BDVER2-NEXT:    cmovaq (%rdx), %rdi # sched: [8:1.00]
+; BDVER2-NEXT:    cmovaq (%rdx), %rdi # sched: [8:1.00]
+; BDVER2-NEXT:    cmovsq (%rdx), %rdi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovnsq (%rdx), %rdi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovpq (%rdx), %rdi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovpq (%rdx), %rdi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovnpq (%rdx), %rdi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovnpq (%rdx), %rdi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovlq (%rdx), %rdi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovlq (%rdx), %rdi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovgeq (%rdx), %rdi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovgeq (%rdx), %rdi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovleq (%rdx), %rdi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovleq (%rdx), %rdi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovgq (%rdx), %rdi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovgq (%rdx), %rdi # sched: [7:0.67]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_cmov_64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
diff --git a/test/CodeGen/X86/f16c-schedule.ll b/test/CodeGen/X86/f16c-schedule.ll
index a2155de7831..db183e1ef5f 100644
--- a/test/CodeGen/X86/f16c-schedule.ll
+++ b/test/CodeGen/X86/f16c-schedule.ll
@@ -4,6 +4,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+f16c | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
 
@@ -43,6 +44,13 @@ define <4 x float> @test_vcvtph2ps_128(<8 x i16> %a0, <8 x i16> *%a1) {
 ; SKYLAKE-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_vcvtph2ps_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtph2ps (%rdi), %xmm1 # sched: [8:1.00]
+; BDVER2-NEXT:    vcvtph2ps %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_vcvtph2ps_128:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vcvtph2ps (%rdi), %xmm1 # sched: [8:1.00]
@@ -100,6 +108,13 @@ define <8 x float> @test_vcvtph2ps_256(<8 x i16> %a0, <8 x i16> *%a1) {
 ; SKYLAKE-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_vcvtph2ps_256:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtph2ps (%rdi), %ymm1 # sched: [8:1.00]
+; BDVER2-NEXT:    vcvtph2ps %xmm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_vcvtph2ps_256:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vcvtph2ps (%rdi), %ymm1 # sched: [8:2.00]
@@ -152,6 +167,12 @@ define <8 x i16> @test_vcvtps2ph_128(<4 x float> %a0, <4 x float> %a1, <4 x i16>
 ; SKYLAKE-NEXT:    vcvtps2ph $0, %xmm1, (%rdi) # sched: [6:1.00]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_vcvtps2ph_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtps2ph $0, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vcvtps2ph $0, %xmm1, (%rdi) # sched: [4:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_vcvtps2ph_128:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vcvtps2ph $0, %xmm0, %xmm0 # sched: [3:1.00]
@@ -207,6 +228,13 @@ define <8 x i16> @test_vcvtps2ph_256(<8 x float> %a0, <8 x float> %a1, <8 x i16>
 ; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_vcvtps2ph_256:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtps2ph $0, %ymm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vcvtps2ph $0, %ymm1, (%rdi) # sched: [4:1.00]
+; BDVER2-NEXT:    vzeroupper # sched: [100:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_vcvtps2ph_256:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vcvtps2ph $0, %ymm0, %xmm0 # sched: [6:2.00]
diff --git a/test/CodeGen/X86/fma-schedule.ll b/test/CodeGen/X86/fma-schedule.ll
index 819b9c7f27d..82ea0ce7a4a 100644
--- a/test/CodeGen/X86/fma-schedule.ll
+++ b/test/CodeGen/X86/fma-schedule.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+fma | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+fma | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
@@ -24,6 +25,18 @@ define void @test_vfmaddpd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfmaddpd_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfmadd132pd {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfmadd231pd {{.*#+}} xmm0 = (xmm1 * xmm2) + xmm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfmadd132pd {{.*#+}} xmm0 = (xmm0 * mem) + xmm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + mem sched: [10:0.50]
+; BDVER2-NEXT:    vfmadd231pd {{.*#+}} xmm0 = (xmm1 * mem) + xmm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; HASWELL-LABEL: test_vfmaddpd_128:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -113,6 +126,19 @@ define void @test_vfmaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfmaddpd_256:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfmadd132pd {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfmadd231pd {{.*#+}} ymm0 = (ymm1 * ymm2) + ymm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfmadd132pd {{.*#+}} ymm0 = (ymm0 * mem) + ymm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + mem sched: [10:0.50]
+; BDVER2-NEXT:    vfmadd231pd {{.*#+}} ymm0 = (ymm1 * mem) + ymm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    vzeroupper # sched: [100:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; HASWELL-LABEL: test_vfmaddpd_256:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -206,6 +232,18 @@ define void @test_vfmaddps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfmaddps_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfmadd231ps {{.*#+}} xmm0 = (xmm1 * xmm2) + xmm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * mem) + xmm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + mem sched: [10:0.50]
+; BDVER2-NEXT:    vfmadd231ps {{.*#+}} xmm0 = (xmm1 * mem) + xmm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; HASWELL-LABEL: test_vfmaddps_128:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -295,6 +333,19 @@ define void @test_vfmaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfmaddps_256:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfmadd231ps {{.*#+}} ymm0 = (ymm1 * ymm2) + ymm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * mem) + ymm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + mem sched: [10:0.50]
+; BDVER2-NEXT:    vfmadd231ps {{.*#+}} ymm0 = (ymm1 * mem) + ymm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    vzeroupper # sched: [100:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; HASWELL-LABEL: test_vfmaddps_256:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -388,6 +439,18 @@ define void @test_vfmaddsd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfmaddsd_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfmadd132sd {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfmadd231sd {{.*#+}} xmm0 = (xmm1 * xmm2) + xmm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfmadd132sd {{.*#+}} xmm0 = (xmm0 * mem) + xmm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + mem sched: [10:0.50]
+; BDVER2-NEXT:    vfmadd231sd {{.*#+}} xmm0 = (xmm1 * mem) + xmm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; HASWELL-LABEL: test_vfmaddsd_128:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -476,6 +539,18 @@ define void @test_vfmaddss_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfmaddss_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfmadd231ss {{.*#+}} xmm0 = (xmm1 * xmm2) + xmm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * mem) + xmm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + mem sched: [10:0.50]
+; BDVER2-NEXT:    vfmadd231ss {{.*#+}} xmm0 = (xmm1 * mem) + xmm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; HASWELL-LABEL: test_vfmaddss_128:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -568,6 +643,18 @@ define void @test_vfmaddsubpd_128(<2 x double> %a0, <2 x double> %a1, <2 x doubl
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfmaddsubpd_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfmaddsub132pd {{.*#+}} xmm0 = (xmm0 * xmm2) +/- xmm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddsub231pd {{.*#+}} xmm0 = (xmm1 * xmm2) +/- xmm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddsub132pd {{.*#+}} xmm0 = (xmm0 * mem) +/- xmm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfmaddsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) +/- mem sched: [10:0.50]
+; BDVER2-NEXT:    vfmaddsub231pd {{.*#+}} xmm0 = (xmm1 * mem) +/- xmm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; HASWELL-LABEL: test_vfmaddsubpd_128:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -657,6 +744,19 @@ define void @test_vfmaddsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x doubl
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfmaddsubpd_256:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfmaddsub132pd {{.*#+}} ymm0 = (ymm0 * ymm2) +/- ymm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddsub231pd {{.*#+}} ymm0 = (ymm1 * ymm2) +/- ymm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddsub132pd {{.*#+}} ymm0 = (ymm0 * mem) +/- ymm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfmaddsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) +/- mem sched: [10:0.50]
+; BDVER2-NEXT:    vfmaddsub231pd {{.*#+}} ymm0 = (ymm1 * mem) +/- ymm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    vzeroupper # sched: [100:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; HASWELL-LABEL: test_vfmaddsubpd_256:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -750,6 +850,18 @@ define void @test_vfmaddsubps_128(<4 x float> %a0, <4 x float> %a1, <4 x float>
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfmaddsubps_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfmaddsub132ps {{.*#+}} xmm0 = (xmm0 * xmm2) +/- xmm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddsub231ps {{.*#+}} xmm0 = (xmm1 * xmm2) +/- xmm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddsub132ps {{.*#+}} xmm0 = (xmm0 * mem) +/- xmm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfmaddsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) +/- mem sched: [10:0.50]
+; BDVER2-NEXT:    vfmaddsub231ps {{.*#+}} xmm0 = (xmm1 * mem) +/- xmm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; HASWELL-LABEL: test_vfmaddsubps_128:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -839,6 +951,19 @@ define void @test_vfmaddsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float>
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfmaddsubps_256:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfmaddsub132ps {{.*#+}} ymm0 = (ymm0 * ymm2) +/- ymm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddsub231ps {{.*#+}} ymm0 = (ymm1 * ymm2) +/- ymm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddsub132ps {{.*#+}} ymm0 = (ymm0 * mem) +/- ymm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfmaddsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) +/- mem sched: [10:0.50]
+; BDVER2-NEXT:    vfmaddsub231ps {{.*#+}} ymm0 = (ymm1 * mem) +/- ymm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    vzeroupper # sched: [100:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; HASWELL-LABEL: test_vfmaddsubps_256:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -936,6 +1061,18 @@ define void @test_vfmsubaddpd_128(<2 x double> %a0, <2 x double> %a1, <2 x doubl
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfmsubaddpd_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfmsubadd132pd {{.*#+}} xmm0 = (xmm0 * xmm2) -/+ xmm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsubadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsubadd231pd {{.*#+}} xmm0 = (xmm1 * xmm2) -/+ xmm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsubadd132pd {{.*#+}} xmm0 = (xmm0 * mem) -/+ xmm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfmsubadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ mem sched: [10:0.50]
+; BDVER2-NEXT:    vfmsubadd231pd {{.*#+}} xmm0 = (xmm1 * mem) -/+ xmm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; HASWELL-LABEL: test_vfmsubaddpd_128:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -1025,6 +1162,19 @@ define void @test_vfmsubaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x doubl
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfmsubaddpd_256:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfmsubadd132pd {{.*#+}} ymm0 = (ymm0 * ymm2) -/+ ymm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsubadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsubadd231pd {{.*#+}} ymm0 = (ymm1 * ymm2) -/+ ymm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsubadd132pd {{.*#+}} ymm0 = (ymm0 * mem) -/+ ymm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfmsubadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ mem sched: [10:0.50]
+; BDVER2-NEXT:    vfmsubadd231pd {{.*#+}} ymm0 = (ymm1 * mem) -/+ ymm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    vzeroupper # sched: [100:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; HASWELL-LABEL: test_vfmsubaddpd_256:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -1118,6 +1268,18 @@ define void @test_vfmsubaddps_128(<4 x float> %a0, <4 x float> %a1, <4 x float>
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfmsubaddps_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfmsubadd132ps {{.*#+}} xmm0 = (xmm0 * xmm2) -/+ xmm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsubadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsubadd231ps {{.*#+}} xmm0 = (xmm1 * xmm2) -/+ xmm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsubadd132ps {{.*#+}} xmm0 = (xmm0 * mem) -/+ xmm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfmsubadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ mem sched: [10:0.50]
+; BDVER2-NEXT:    vfmsubadd231ps {{.*#+}} xmm0 = (xmm1 * mem) -/+ xmm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; HASWELL-LABEL: test_vfmsubaddps_128:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -1207,6 +1369,19 @@ define void @test_vfmsubaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float>
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfmsubaddps_256:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfmsubadd132ps {{.*#+}} ymm0 = (ymm0 * ymm2) -/+ ymm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsubadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsubadd231ps {{.*#+}} ymm0 = (ymm1 * ymm2) -/+ ymm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsubadd132ps {{.*#+}} ymm0 = (ymm0 * mem) -/+ ymm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfmsubadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ mem sched: [10:0.50]
+; BDVER2-NEXT:    vfmsubadd231ps {{.*#+}} ymm0 = (ymm1 * mem) -/+ ymm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    vzeroupper # sched: [100:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; HASWELL-LABEL: test_vfmsubaddps_256:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -1304,6 +1479,18 @@ define void @test_vfmsubpd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfmsubpd_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfmsub132pd {{.*#+}} xmm0 = (xmm0 * xmm2) - xmm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsub231pd {{.*#+}} xmm0 = (xmm1 * xmm2) - xmm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsub132pd {{.*#+}} xmm0 = (xmm0 * mem) - xmm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfmsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) - mem sched: [10:0.50]
+; BDVER2-NEXT:    vfmsub231pd {{.*#+}} xmm0 = (xmm1 * mem) - xmm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; HASWELL-LABEL: test_vfmsubpd_128:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -1393,6 +1580,19 @@ define void @test_vfmsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfmsubpd_256:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfmsub132pd {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsub231pd {{.*#+}} ymm0 = (ymm1 * ymm2) - ymm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsub132pd {{.*#+}} ymm0 = (ymm0 * mem) - ymm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfmsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) - mem sched: [10:0.50]
+; BDVER2-NEXT:    vfmsub231pd {{.*#+}} ymm0 = (ymm1 * mem) - ymm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    vzeroupper # sched: [100:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; HASWELL-LABEL: test_vfmsubpd_256:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -1486,6 +1686,18 @@ define void @test_vfmsubps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfmsubps_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfmsub132ps {{.*#+}} xmm0 = (xmm0 * xmm2) - xmm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsub231ps {{.*#+}} xmm0 = (xmm1 * xmm2) - xmm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsub132ps {{.*#+}} xmm0 = (xmm0 * mem) - xmm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - mem sched: [10:0.50]
+; BDVER2-NEXT:    vfmsub231ps {{.*#+}} xmm0 = (xmm1 * mem) - xmm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; HASWELL-LABEL: test_vfmsubps_128:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -1575,6 +1787,19 @@ define void @test_vfmsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfmsubps_256:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfmsub132ps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsub231ps {{.*#+}} ymm0 = (ymm1 * ymm2) - ymm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsub132ps {{.*#+}} ymm0 = (ymm0 * mem) - ymm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - mem sched: [10:0.50]
+; BDVER2-NEXT:    vfmsub231ps {{.*#+}} ymm0 = (ymm1 * mem) - ymm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    vzeroupper # sched: [100:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; HASWELL-LABEL: test_vfmsubps_256:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -1668,6 +1893,18 @@ define void @test_vfmsubsd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfmsubsd_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfmsub132sd {{.*#+}} xmm0 = (xmm0 * xmm2) - xmm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsub231sd {{.*#+}} xmm0 = (xmm1 * xmm2) - xmm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsub132sd {{.*#+}} xmm0 = (xmm0 * mem) - xmm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - mem sched: [10:0.50]
+; BDVER2-NEXT:    vfmsub231sd {{.*#+}} xmm0 = (xmm1 * mem) - xmm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; HASWELL-LABEL: test_vfmsubsd_128:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -1756,6 +1993,18 @@ define void @test_vfmsubss_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfmsubss_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfmsub132ss {{.*#+}} xmm0 = (xmm0 * xmm2) - xmm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsub231ss {{.*#+}} xmm0 = (xmm1 * xmm2) - xmm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsub132ss {{.*#+}} xmm0 = (xmm0 * mem) - xmm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - mem sched: [10:0.50]
+; BDVER2-NEXT:    vfmsub231ss {{.*#+}} xmm0 = (xmm1 * mem) - xmm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; HASWELL-LABEL: test_vfmsubss_128:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -1848,6 +2097,18 @@ define void @test_vfnmaddpd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfnmaddpd_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfnmadd132pd {{.*#+}} xmm0 = -(xmm0 * xmm2) + xmm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmadd213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmadd231pd {{.*#+}} xmm0 = -(xmm1 * xmm2) + xmm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmadd132pd {{.*#+}} xmm0 = -(xmm0 * mem) + xmm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfnmadd213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
+; BDVER2-NEXT:    vfnmadd231pd {{.*#+}} xmm0 = -(xmm1 * mem) + xmm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; HASWELL-LABEL: test_vfnmaddpd_128:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -1937,6 +2198,19 @@ define void @test_vfnmaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfnmaddpd_256:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfnmadd132pd {{.*#+}} ymm0 = -(ymm0 * ymm2) + ymm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmadd213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmadd231pd {{.*#+}} ymm0 = -(ymm1 * ymm2) + ymm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmadd132pd {{.*#+}} ymm0 = -(ymm0 * mem) + ymm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfnmadd213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem sched: [10:0.50]
+; BDVER2-NEXT:    vfnmadd231pd {{.*#+}} ymm0 = -(ymm1 * mem) + ymm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    vzeroupper # sched: [100:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; HASWELL-LABEL: test_vfnmaddpd_256:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -2030,6 +2304,18 @@ define void @test_vfnmaddps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfnmaddps_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm2) + xmm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmadd231ps {{.*#+}} xmm0 = -(xmm1 * xmm2) + xmm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * mem) + xmm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
+; BDVER2-NEXT:    vfnmadd231ps {{.*#+}} xmm0 = -(xmm1 * mem) + xmm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; HASWELL-LABEL: test_vfnmaddps_128:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -2119,6 +2405,19 @@ define void @test_vfnmaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfnmaddps_256:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm2) + ymm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmadd231ps {{.*#+}} ymm0 = -(ymm1 * ymm2) + ymm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * mem) + ymm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem sched: [10:0.50]
+; BDVER2-NEXT:    vfnmadd231ps {{.*#+}} ymm0 = -(ymm1 * mem) + ymm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    vzeroupper # sched: [100:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; HASWELL-LABEL: test_vfnmaddps_256:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -2212,6 +2511,18 @@ define void @test_vfnmaddsd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfnmaddsd_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfnmadd132sd {{.*#+}} xmm0 = -(xmm0 * xmm2) + xmm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmadd231sd {{.*#+}} xmm0 = -(xmm1 * xmm2) + xmm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmadd132sd {{.*#+}} xmm0 = -(xmm0 * mem) + xmm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
+; BDVER2-NEXT:    vfnmadd231sd {{.*#+}} xmm0 = -(xmm1 * mem) + xmm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; HASWELL-LABEL: test_vfnmaddsd_128:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -2300,6 +2611,18 @@ define void @test_vfnmaddss_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfnmaddss_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * xmm2) + xmm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmadd231ss {{.*#+}} xmm0 = -(xmm1 * xmm2) + xmm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * mem) + xmm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
+; BDVER2-NEXT:    vfnmadd231ss {{.*#+}} xmm0 = -(xmm1 * mem) + xmm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; HASWELL-LABEL: test_vfnmaddss_128:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -2392,6 +2715,18 @@ define void @test_vfnmsubpd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfnmsubpd_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfnmsub132pd {{.*#+}} xmm0 = -(xmm0 * xmm2) - xmm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmsub213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmsub231pd {{.*#+}} xmm0 = -(xmm1 * xmm2) - xmm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmsub132pd {{.*#+}} xmm0 = -(xmm0 * mem) - xmm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfnmsub213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) - mem sched: [10:0.50]
+; BDVER2-NEXT:    vfnmsub231pd {{.*#+}} xmm0 = -(xmm1 * mem) - xmm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; HASWELL-LABEL: test_vfnmsubpd_128:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -2481,6 +2816,19 @@ define void @test_vfnmsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfnmsubpd_256:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfnmsub132pd {{.*#+}} ymm0 = -(ymm0 * ymm2) - ymm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmsub213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) - ymm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmsub231pd {{.*#+}} ymm0 = -(ymm1 * ymm2) - ymm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmsub132pd {{.*#+}} ymm0 = -(ymm0 * mem) - ymm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfnmsub213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) - mem sched: [10:0.50]
+; BDVER2-NEXT:    vfnmsub231pd {{.*#+}} ymm0 = -(ymm1 * mem) - ymm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    vzeroupper # sched: [100:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; HASWELL-LABEL: test_vfnmsubpd_256:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -2574,6 +2922,18 @@ define void @test_vfnmsubps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfnmsubps_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfnmsub132ps {{.*#+}} xmm0 = -(xmm0 * xmm2) - xmm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmsub231ps {{.*#+}} xmm0 = -(xmm1 * xmm2) - xmm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmsub132ps {{.*#+}} xmm0 = -(xmm0 * mem) - xmm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - mem sched: [10:0.50]
+; BDVER2-NEXT:    vfnmsub231ps {{.*#+}} xmm0 = -(xmm1 * mem) - xmm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; HASWELL-LABEL: test_vfnmsubps_128:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -2663,6 +3023,19 @@ define void @test_vfnmsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfnmsubps_256:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfnmsub132ps {{.*#+}} ymm0 = -(ymm0 * ymm2) - ymm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmsub213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) - ymm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmsub231ps {{.*#+}} ymm0 = -(ymm1 * ymm2) - ymm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmsub132ps {{.*#+}} ymm0 = -(ymm0 * mem) - ymm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfnmsub213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) - mem sched: [10:0.50]
+; BDVER2-NEXT:    vfnmsub231ps {{.*#+}} ymm0 = -(ymm1 * mem) - ymm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    vzeroupper # sched: [100:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; HASWELL-LABEL: test_vfnmsubps_256:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -2756,6 +3129,18 @@ define void @test_vfnmsubsd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfnmsubsd_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfnmsub132sd {{.*#+}} xmm0 = -(xmm0 * xmm2) - xmm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmsub231sd {{.*#+}} xmm0 = -(xmm1 * xmm2) - xmm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmsub132sd {{.*#+}} xmm0 = -(xmm0 * mem) - xmm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - mem sched: [10:0.50]
+; BDVER2-NEXT:    vfnmsub231sd {{.*#+}} xmm0 = -(xmm1 * mem) - xmm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; HASWELL-LABEL: test_vfnmsubsd_128:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -2844,6 +3229,18 @@ define void @test_vfnmsubss_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfnmsubss_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfnmsub132ss {{.*#+}} xmm0 = -(xmm0 * xmm2) - xmm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmsub231ss {{.*#+}} xmm0 = -(xmm1 * xmm2) - xmm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmsub132ss {{.*#+}} xmm0 = -(xmm0 * mem) - xmm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - mem sched: [10:0.50]
+; BDVER2-NEXT:    vfnmsub231ss {{.*#+}} xmm0 = -(xmm1 * mem) - xmm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; HASWELL-LABEL: test_vfnmsubss_128:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
diff --git a/test/CodeGen/X86/fma4-schedule.ll b/test/CodeGen/X86/fma4-schedule.ll
index 65d5273bec7..f2e2caf14ba 100644
--- a/test/CodeGen/X86/fma4-schedule.ll
+++ b/test/CodeGen/X86/fma4-schedule.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+fma4 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER --check-prefix=BDVER1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=-fma | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER --check-prefix=BDVER1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver3 -mattr=-fma | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER --check-prefix=BDVER1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver4 -mattr=-fma | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER --check-prefix=BDVER1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+fma4 | FileCheck %s --check-prefixes=CHECK,GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+fma4 | FileCheck %s --check-prefixes=CHECK,BDVER,BDVER12,BDVER1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+fma4 -mattr=-fma  | FileCheck %s --check-prefixes=CHECK,BDVER,BDVER12,BDVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver3 -mattr=-fma  | FileCheck %s --check-prefixes=CHECK,BDVER,BDVER34,BDVER3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver4 -mattr=-fma  | FileCheck %s --check-prefixes=CHECK,BDVER,BDVER34,BDVER4
 
 ;
 ; VFMADD
@@ -19,14 +19,23 @@ define void @test_vfmaddpd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfmaddpd_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfmaddpd %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmaddpd (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmaddpd %xmm1, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfmaddpd_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfmaddpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfmaddpd (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfmaddpd %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER34-LABEL: test_vfmaddpd_128:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfmaddpd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmaddpd (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmaddpd %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    retq
   tail call void asm "vfmaddpd $2, $1, $0, $0 \0A\09 vfmaddpd $3, $1, $0, $0 \0A\09 vfmaddpd $1, $3, $0, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
   ret void
 }
@@ -42,15 +51,25 @@ define void @test_vfmaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfmaddpd_256:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfmaddpd (%rdi), %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfmaddpd %ymm1, (%rdi), %ymm0, %ymm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    vzeroupper
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfmaddpd_256:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfmaddpd (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfmaddpd %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    vzeroupper # sched: [100:0.33]
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER34-LABEL: test_vfmaddpd_256:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfmaddpd (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfmaddpd %ymm1, (%rdi), %ymm0, %ymm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    vzeroupper
+; BDVER34-NEXT:    retq
   tail call void asm "vfmaddpd $2, $1, $0, $0 \0A\09 vfmaddpd $3, $1, $0, $0 \0A\09 vfmaddpd $1, $3, $0, $0", "x,x,x,*m"(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) nounwind
   ret void
 }
@@ -65,14 +84,23 @@ define void @test_vfmaddps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfmaddps_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfmaddps %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmaddps (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmaddps %xmm1, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfmaddps_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfmaddps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfmaddps (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfmaddps %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER34-LABEL: test_vfmaddps_128:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfmaddps %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmaddps (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmaddps %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    retq
   tail call void asm "vfmaddps $2, $1, $0, $0 \0A\09 vfmaddps $3, $1, $0, $0 \0A\09 vfmaddps $1, $3, $0, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
   ret void
 }
@@ -88,15 +116,25 @@ define void @test_vfmaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfmaddps_256:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfmaddps %ymm2, %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfmaddps (%rdi), %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfmaddps %ymm1, (%rdi), %ymm0, %ymm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    vzeroupper
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfmaddps_256:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfmaddps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfmaddps (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfmaddps %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    vzeroupper # sched: [100:0.33]
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER34-LABEL: test_vfmaddps_256:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfmaddps %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfmaddps (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfmaddps %ymm1, (%rdi), %ymm0, %ymm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    vzeroupper
+; BDVER34-NEXT:    retq
   tail call void asm "vfmaddps $2, $1, $0, $0 \0A\09 vfmaddps $3, $1, $0, $0 \0A\09 vfmaddps $1, $3, $0, $0", "x,x,x,*m"(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) nounwind
   ret void
 }
@@ -111,14 +149,23 @@ define void @test_vfmaddsd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfmaddsd_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfmaddsd %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmaddsd (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmaddsd %xmm1, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfmaddsd_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfmaddsd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfmaddsd (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfmaddsd %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER34-LABEL: test_vfmaddsd_128:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfmaddsd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmaddsd (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmaddsd %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    retq
   tail call void asm "vfmaddsd $2, $1, $0, $0 \0A\09 vfmaddsd $3, $1, $0, $0 \0A\09 vfmaddsd $1, $3, $0, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
   ret void
 }
@@ -133,14 +180,23 @@ define void @test_vfmaddss_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfmaddss_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfmaddss %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmaddss (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmaddss %xmm1, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfmaddss_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfmaddss %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfmaddss (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfmaddss %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER34-LABEL: test_vfmaddss_128:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfmaddss %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmaddss (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmaddss %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    retq
   tail call void asm "vfmaddss $2, $1, $0, $0 \0A\09 vfmaddss $3, $1, $0, $0 \0A\09 vfmaddss $1, $3, $0, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
   ret void
 }
@@ -159,14 +215,23 @@ define void @test_vfmaddsubpd_128(<2 x double> %a0, <2 x double> %a1, <2 x doubl
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfmaddsubpd_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfmaddsubpd %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmaddsubpd (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmaddsubpd %xmm1, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfmaddsubpd_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfmaddsubpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfmaddsubpd (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfmaddsubpd %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER34-LABEL: test_vfmaddsubpd_128:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfmaddsubpd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmaddsubpd (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmaddsubpd %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    retq
   tail call void asm "vfmaddsubpd $2, $1, $0, $0 \0A\09 vfmaddsubpd $3, $1, $0, $0 \0A\09 vfmaddsubpd $1, $3, $0, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
   ret void
 }
@@ -182,15 +247,25 @@ define void @test_vfmaddsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x doubl
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfmaddsubpd_256:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfmaddsubpd (%rdi), %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfmaddsubpd %ymm1, (%rdi), %ymm0, %ymm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    vzeroupper
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfmaddsubpd_256:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfmaddsubpd (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfmaddsubpd %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    vzeroupper # sched: [100:0.33]
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER34-LABEL: test_vfmaddsubpd_256:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfmaddsubpd (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfmaddsubpd %ymm1, (%rdi), %ymm0, %ymm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    vzeroupper
+; BDVER34-NEXT:    retq
   tail call void asm "vfmaddsubpd $2, $1, $0, $0 \0A\09 vfmaddsubpd $3, $1, $0, $0 \0A\09 vfmaddsubpd $1, $3, $0, $0", "x,x,x,*m"(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) nounwind
   ret void
 }
@@ -205,14 +280,23 @@ define void @test_vfmaddsubps_128(<4 x float> %a0, <4 x float> %a1, <4 x float>
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfmaddsubps_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmaddsubps (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmaddsubps %xmm1, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfmaddsubps_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfmaddsubps (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfmaddsubps %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER34-LABEL: test_vfmaddsubps_128:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmaddsubps (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmaddsubps %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    retq
   tail call void asm "vfmaddsubps $2, $1, $0, $0 \0A\09 vfmaddsubps $3, $1, $0, $0 \0A\09 vfmaddsubps $1, $3, $0, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
   ret void
 }
@@ -228,15 +312,25 @@ define void @test_vfmaddsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float>
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfmaddsubps_256:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfmaddsubps (%rdi), %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfmaddsubps %ymm1, (%rdi), %ymm0, %ymm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    vzeroupper
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfmaddsubps_256:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfmaddsubps (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfmaddsubps %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    vzeroupper # sched: [100:0.33]
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER34-LABEL: test_vfmaddsubps_256:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfmaddsubps (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfmaddsubps %ymm1, (%rdi), %ymm0, %ymm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    vzeroupper
+; BDVER34-NEXT:    retq
   tail call void asm "vfmaddsubps $2, $1, $0, $0 \0A\09 vfmaddsubps $3, $1, $0, $0 \0A\09 vfmaddsubps $1, $3, $0, $0", "x,x,x,*m"(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) nounwind
   ret void
 }
@@ -255,14 +349,23 @@ define void @test_vfmsubaddpd_128(<2 x double> %a0, <2 x double> %a1, <2 x doubl
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfmsubaddpd_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfmsubaddpd %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmsubaddpd (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmsubaddpd %xmm1, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfmsubaddpd_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfmsubaddpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfmsubaddpd (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfmsubaddpd %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER34-LABEL: test_vfmsubaddpd_128:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfmsubaddpd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmsubaddpd (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmsubaddpd %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    retq
   tail call void asm "vfmsubaddpd $2, $1, $0, $0 \0A\09 vfmsubaddpd $3, $1, $0, $0 \0A\09 vfmsubaddpd $1, $3, $0, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
   ret void
 }
@@ -278,15 +381,25 @@ define void @test_vfmsubaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x doubl
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfmsubaddpd_256:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfmsubaddpd %ymm2, %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfmsubaddpd (%rdi), %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfmsubaddpd %ymm1, (%rdi), %ymm0, %ymm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    vzeroupper
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfmsubaddpd_256:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfmsubaddpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfmsubaddpd (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfmsubaddpd %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    vzeroupper # sched: [100:0.33]
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER34-LABEL: test_vfmsubaddpd_256:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfmsubaddpd %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfmsubaddpd (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfmsubaddpd %ymm1, (%rdi), %ymm0, %ymm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    vzeroupper
+; BDVER34-NEXT:    retq
   tail call void asm "vfmsubaddpd $2, $1, $0, $0 \0A\09 vfmsubaddpd $3, $1, $0, $0 \0A\09 vfmsubaddpd $1, $3, $0, $0", "x,x,x,*m"(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) nounwind
   ret void
 }
@@ -301,14 +414,23 @@ define void @test_vfmsubaddps_128(<4 x float> %a0, <4 x float> %a1, <4 x float>
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfmsubaddps_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfmsubaddps %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmsubaddps (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmsubaddps %xmm1, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfmsubaddps_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfmsubaddps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfmsubaddps (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfmsubaddps %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER34-LABEL: test_vfmsubaddps_128:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfmsubaddps %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmsubaddps (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmsubaddps %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    retq
   tail call void asm "vfmsubaddps $2, $1, $0, $0 \0A\09 vfmsubaddps $3, $1, $0, $0 \0A\09 vfmsubaddps $1, $3, $0, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
   ret void
 }
@@ -324,15 +446,25 @@ define void @test_vfmsubaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float>
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfmsubaddps_256:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfmsubaddps %ymm2, %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfmsubaddps (%rdi), %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfmsubaddps %ymm1, (%rdi), %ymm0, %ymm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    vzeroupper
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfmsubaddps_256:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfmsubaddps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfmsubaddps (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfmsubaddps %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    vzeroupper # sched: [100:0.33]
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER34-LABEL: test_vfmsubaddps_256:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfmsubaddps %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfmsubaddps (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfmsubaddps %ymm1, (%rdi), %ymm0, %ymm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    vzeroupper
+; BDVER34-NEXT:    retq
   tail call void asm "vfmsubaddps $2, $1, $0, $0 \0A\09 vfmsubaddps $3, $1, $0, $0 \0A\09 vfmsubaddps $1, $3, $0, $0", "x,x,x,*m"(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) nounwind
   ret void
 }
@@ -351,14 +483,23 @@ define void @test_vfmsubpd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfmsubpd_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfmsubpd %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmsubpd (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmsubpd %xmm1, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfmsubpd_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfmsubpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfmsubpd (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfmsubpd %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER34-LABEL: test_vfmsubpd_128:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfmsubpd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmsubpd (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmsubpd %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    retq
   tail call void asm "vfmsubpd $2, $1, $0, $0 \0A\09 vfmsubpd $3, $1, $0, $0 \0A\09 vfmsubpd $1, $3, $0, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
   ret void
 }
@@ -374,15 +515,25 @@ define void @test_vfmsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfmsubpd_256:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfmsubpd %ymm2, %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfmsubpd (%rdi), %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfmsubpd %ymm1, (%rdi), %ymm0, %ymm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    vzeroupper
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfmsubpd_256:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfmsubpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfmsubpd (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfmsubpd %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    vzeroupper # sched: [100:0.33]
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER34-LABEL: test_vfmsubpd_256:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfmsubpd %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfmsubpd (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfmsubpd %ymm1, (%rdi), %ymm0, %ymm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    vzeroupper
+; BDVER34-NEXT:    retq
   tail call void asm "vfmsubpd $2, $1, $0, $0 \0A\09 vfmsubpd $3, $1, $0, $0 \0A\09 vfmsubpd $1, $3, $0, $0", "x,x,x,*m"(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) nounwind
   ret void
 }
@@ -397,14 +548,23 @@ define void @test_vfmsubps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfmsubps_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfmsubps %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmsubps (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmsubps %xmm1, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfmsubps_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfmsubps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfmsubps (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfmsubps %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER34-LABEL: test_vfmsubps_128:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfmsubps %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmsubps (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmsubps %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    retq
   tail call void asm "vfmsubps $2, $1, $0, $0 \0A\09 vfmsubps $3, $1, $0, $0 \0A\09 vfmsubps $1, $3, $0, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
   ret void
 }
@@ -420,15 +580,25 @@ define void @test_vfmsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfmsubps_256:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfmsubps %ymm2, %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfmsubps (%rdi), %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfmsubps %ymm1, (%rdi), %ymm0, %ymm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    vzeroupper
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfmsubps_256:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfmsubps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfmsubps (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfmsubps %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    vzeroupper # sched: [100:0.33]
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER34-LABEL: test_vfmsubps_256:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfmsubps %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfmsubps (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfmsubps %ymm1, (%rdi), %ymm0, %ymm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    vzeroupper
+; BDVER34-NEXT:    retq
   tail call void asm "vfmsubps $2, $1, $0, $0 \0A\09 vfmsubps $3, $1, $0, $0 \0A\09 vfmsubps $1, $3, $0, $0", "x,x,x,*m"(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) nounwind
   ret void
 }
@@ -443,14 +613,23 @@ define void @test_vfmsubsd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfmsubsd_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmsubsd (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmsubsd %xmm1, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfmsubsd_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfmsubsd (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfmsubsd %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER34-LABEL: test_vfmsubsd_128:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmsubsd (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmsubsd %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    retq
   tail call void asm "vfmsubsd $2, $1, $0, $0 \0A\09 vfmsubsd $3, $1, $0, $0 \0A\09 vfmsubsd $1, $3, $0, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
   ret void
 }
@@ -465,14 +644,23 @@ define void @test_vfmsubss_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfmsubss_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfmsubss %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmsubss (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmsubss %xmm1, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfmsubss_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfmsubss %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfmsubss (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfmsubss %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER34-LABEL: test_vfmsubss_128:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfmsubss %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmsubss (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmsubss %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    retq
   tail call void asm "vfmsubss $2, $1, $0, $0 \0A\09 vfmsubss $3, $1, $0, $0 \0A\09 vfmsubss $1, $3, $0, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
   ret void
 }
@@ -491,14 +679,23 @@ define void @test_vfnmaddpd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfnmaddpd_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfnmaddpd %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfnmaddpd (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfnmaddpd %xmm1, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfnmaddpd_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfnmaddpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfnmaddpd (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfnmaddpd %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER34-LABEL: test_vfnmaddpd_128:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfnmaddpd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfnmaddpd (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfnmaddpd %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    retq
   tail call void asm "vfnmaddpd $2, $1, $0, $0 \0A\09 vfnmaddpd $3, $1, $0, $0 \0A\09 vfnmaddpd $1, $3, $0, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
   ret void
 }
@@ -514,15 +711,25 @@ define void @test_vfnmaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfnmaddpd_256:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfnmaddpd %ymm2, %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfnmaddpd (%rdi), %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfnmaddpd %ymm1, (%rdi), %ymm0, %ymm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    vzeroupper
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfnmaddpd_256:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfnmaddpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfnmaddpd (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfnmaddpd %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    vzeroupper # sched: [100:0.33]
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER34-LABEL: test_vfnmaddpd_256:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfnmaddpd %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfnmaddpd (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfnmaddpd %ymm1, (%rdi), %ymm0, %ymm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    vzeroupper
+; BDVER34-NEXT:    retq
   tail call void asm "vfnmaddpd $2, $1, $0, $0 \0A\09 vfnmaddpd $3, $1, $0, $0 \0A\09 vfnmaddpd $1, $3, $0, $0", "x,x,x,*m"(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) nounwind
   ret void
 }
@@ -537,14 +744,23 @@ define void @test_vfnmaddps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfnmaddps_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfnmaddps (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfnmaddps %xmm1, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfnmaddps_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfnmaddps (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfnmaddps %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER34-LABEL: test_vfnmaddps_128:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfnmaddps (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfnmaddps %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    retq
   tail call void asm "vfnmaddps $2, $1, $0, $0 \0A\09 vfnmaddps $3, $1, $0, $0 \0A\09 vfnmaddps $1, $3, $0, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
   ret void
 }
@@ -560,15 +776,25 @@ define void @test_vfnmaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfnmaddps_256:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfnmaddps (%rdi), %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfnmaddps %ymm1, (%rdi), %ymm0, %ymm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    vzeroupper
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfnmaddps_256:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfnmaddps (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfnmaddps %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    vzeroupper # sched: [100:0.33]
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER34-LABEL: test_vfnmaddps_256:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfnmaddps (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfnmaddps %ymm1, (%rdi), %ymm0, %ymm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    vzeroupper
+; BDVER34-NEXT:    retq
   tail call void asm "vfnmaddps $2, $1, $0, $0 \0A\09 vfnmaddps $3, $1, $0, $0 \0A\09 vfnmaddps $1, $3, $0, $0", "x,x,x,*m"(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) nounwind
   ret void
 }
@@ -583,14 +809,23 @@ define void @test_vfnmaddsd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfnmaddsd_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfnmaddsd (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfnmaddsd %xmm1, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfnmaddsd_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfnmaddsd (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfnmaddsd %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER34-LABEL: test_vfnmaddsd_128:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfnmaddsd (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfnmaddsd %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    retq
   tail call void asm "vfnmaddsd $2, $1, $0, $0 \0A\09 vfnmaddsd $3, $1, $0, $0 \0A\09 vfnmaddsd $1, $3, $0, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
   ret void
 }
@@ -605,14 +840,23 @@ define void @test_vfnmaddss_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfnmaddss_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfnmaddss (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfnmaddss %xmm1, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfnmaddss_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfnmaddss (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfnmaddss %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER34-LABEL: test_vfnmaddss_128:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfnmaddss (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfnmaddss %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    retq
   tail call void asm "vfnmaddss $2, $1, $0, $0 \0A\09 vfnmaddss $3, $1, $0, $0 \0A\09 vfnmaddss $1, $3, $0, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
   ret void
 }
@@ -631,14 +875,23 @@ define void @test_vfnmsubpd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfnmsubpd_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfnmsubpd %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfnmsubpd (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfnmsubpd %xmm1, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfnmsubpd_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfnmsubpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfnmsubpd (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfnmsubpd %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER34-LABEL: test_vfnmsubpd_128:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfnmsubpd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfnmsubpd (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfnmsubpd %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    retq
   tail call void asm "vfnmsubpd $2, $1, $0, $0 \0A\09 vfnmsubpd $3, $1, $0, $0 \0A\09 vfnmsubpd $1, $3, $0, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
   ret void
 }
@@ -654,15 +907,25 @@ define void @test_vfnmsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfnmsubpd_256:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfnmsubpd %ymm2, %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfnmsubpd (%rdi), %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfnmsubpd %ymm1, (%rdi), %ymm0, %ymm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    vzeroupper
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfnmsubpd_256:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfnmsubpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfnmsubpd (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfnmsubpd %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    vzeroupper # sched: [100:0.33]
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER34-LABEL: test_vfnmsubpd_256:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfnmsubpd %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfnmsubpd (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfnmsubpd %ymm1, (%rdi), %ymm0, %ymm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    vzeroupper
+; BDVER34-NEXT:    retq
   tail call void asm "vfnmsubpd $2, $1, $0, $0 \0A\09 vfnmsubpd $3, $1, $0, $0 \0A\09 vfnmsubpd $1, $3, $0, $0", "x,x,x,*m"(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) nounwind
   ret void
 }
@@ -677,14 +940,23 @@ define void @test_vfnmsubps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfnmsubps_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfnmsubps (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfnmsubps %xmm1, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfnmsubps_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfnmsubps (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfnmsubps %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER34-LABEL: test_vfnmsubps_128:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfnmsubps (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfnmsubps %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    retq
   tail call void asm "vfnmsubps $2, $1, $0, $0 \0A\09 vfnmsubps $3, $1, $0, $0 \0A\09 vfnmsubps $1, $3, $0, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
   ret void
 }
@@ -700,15 +972,25 @@ define void @test_vfnmsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfnmsubps_256:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfnmsubps %ymm2, %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfnmsubps (%rdi), %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfnmsubps %ymm1, (%rdi), %ymm0, %ymm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    vzeroupper
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfnmsubps_256:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfnmsubps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfnmsubps (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfnmsubps %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    vzeroupper # sched: [100:0.33]
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER34-LABEL: test_vfnmsubps_256:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfnmsubps %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfnmsubps (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfnmsubps %ymm1, (%rdi), %ymm0, %ymm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    vzeroupper
+; BDVER34-NEXT:    retq
   tail call void asm "vfnmsubps $2, $1, $0, $0 \0A\09 vfnmsubps $3, $1, $0, $0 \0A\09 vfnmsubps $1, $3, $0, $0", "x,x,x,*m"(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) nounwind
   ret void
 }
@@ -723,14 +1005,23 @@ define void @test_vfnmsubsd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfnmsubsd_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfnmsubsd %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfnmsubsd (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfnmsubsd %xmm1, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfnmsubsd_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfnmsubsd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfnmsubsd (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfnmsubsd %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER34-LABEL: test_vfnmsubsd_128:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfnmsubsd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfnmsubsd (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfnmsubsd %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    retq
   tail call void asm "vfnmsubsd $2, $1, $0, $0 \0A\09 vfnmsubsd $3, $1, $0, $0 \0A\09 vfnmsubsd $1, $3, $0, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
   ret void
 }
@@ -745,14 +1036,23 @@ define void @test_vfnmsubss_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfnmsubss_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfnmsubss (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfnmsubss %xmm1, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfnmsubss_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfnmsubss (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfnmsubss %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER34-LABEL: test_vfnmsubss_128:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfnmsubss (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfnmsubss %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    retq
   tail call void asm "vfnmsubss $2, $1, $0, $0 \0A\09 vfnmsubss $3, $1, $0, $0 \0A\09 vfnmsubss $1, $3, $0, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
   ret void
 }
diff --git a/test/CodeGen/X86/lea32-schedule.ll b/test/CodeGen/X86/lea32-schedule.ll
index a9608f0bd8c..ab509f57463 100644
--- a/test/CodeGen/X86/lea32-schedule.ll
+++ b/test/CodeGen/X86/lea32-schedule.ll
@@ -8,6 +8,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell   | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake     | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl         | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=-slow-3ops-lea      | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2      | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1      | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
 
@@ -60,6 +61,12 @@ define i32 @test_lea_offset(i32) {
 ; SKYLAKE-NEXT:    leal -24(%rdi), %eax # sched: [1:0.50]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_offset:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    # kill: def $edi killed $edi def $rdi
+; BDVER2-NEXT:    leal -24(%rdi), %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_lea_offset:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    # kill: def $edi killed $edi def $rdi
@@ -124,6 +131,12 @@ define i32 @test_lea_offset_big(i32) {
 ; SKYLAKE-NEXT:    leal 1024(%rdi), %eax # sched: [1:0.50]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_offset_big:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    # kill: def $edi killed $edi def $rdi
+; BDVER2-NEXT:    leal 1024(%rdi), %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_lea_offset_big:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    # kill: def $edi killed $edi def $rdi
@@ -196,6 +209,13 @@ define i32 @test_lea_add(i32, i32) {
 ; SKYLAKE-NEXT:    leal (%rdi,%rsi), %eax # sched: [1:0.50]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_add:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    # kill: def $esi killed $esi def $rsi
+; BDVER2-NEXT:    # kill: def $edi killed $edi def $rdi
+; BDVER2-NEXT:    leal (%rdi,%rsi), %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_lea_add:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    # kill: def $esi killed $esi def $rsi
@@ -274,6 +294,13 @@ define i32 @test_lea_add_offset(i32, i32) {
 ; SKYLAKE-NEXT:    addl $16, %eax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_add_offset:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    # kill: def $esi killed $esi def $rsi
+; BDVER2-NEXT:    # kill: def $edi killed $edi def $rdi
+; BDVER2-NEXT:    leal 16(%rdi,%rsi), %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_lea_add_offset:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    # kill: def $esi killed $esi def $rsi
@@ -358,6 +385,13 @@ define i32 @test_lea_add_offset_big(i32, i32) {
 ; SKYLAKE-NEXT:    # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_add_offset_big:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    # kill: def $esi killed $esi def $rsi
+; BDVER2-NEXT:    # kill: def $edi killed $edi def $rdi
+; BDVER2-NEXT:    leal -4096(%rdi,%rsi), %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_lea_add_offset_big:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    # kill: def $esi killed $esi def $rsi
@@ -425,6 +459,12 @@ define i32 @test_lea_mul(i32) {
 ; SKYLAKE-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_mul:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    # kill: def $edi killed $edi def $rdi
+; BDVER2-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_lea_mul:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    # kill: def $edi killed $edi def $rdi
@@ -494,6 +534,12 @@ define i32 @test_lea_mul_offset(i32) {
 ; SKYLAKE-NEXT:    addl $-32, %eax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_mul_offset:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    # kill: def $edi killed $edi def $rdi
+; BDVER2-NEXT:    leal -32(%rdi,%rdi,2), %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_lea_mul_offset:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    # kill: def $edi killed $edi def $rdi
@@ -569,6 +615,12 @@ define i32 @test_lea_mul_offset_big(i32) {
 ; SKYLAKE-NEXT:    # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_mul_offset_big:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    # kill: def $edi killed $edi def $rdi
+; BDVER2-NEXT:    leal 10000(%rdi,%rdi,8), %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_lea_mul_offset_big:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    # kill: def $edi killed $edi def $rdi
@@ -641,6 +693,13 @@ define i32 @test_lea_add_scale(i32, i32) {
 ; SKYLAKE-NEXT:    leal (%rdi,%rsi,2), %eax # sched: [1:0.50]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_add_scale:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    # kill: def $esi killed $esi def $rsi
+; BDVER2-NEXT:    # kill: def $edi killed $edi def $rdi
+; BDVER2-NEXT:    leal (%rdi,%rsi,2), %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_lea_add_scale:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    # kill: def $esi killed $esi def $rsi
@@ -720,6 +779,13 @@ define i32 @test_lea_add_scale_offset(i32, i32) {
 ; SKYLAKE-NEXT:    addl $96, %eax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_add_scale_offset:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    # kill: def $esi killed $esi def $rsi
+; BDVER2-NEXT:    # kill: def $edi killed $edi def $rdi
+; BDVER2-NEXT:    leal 96(%rdi,%rsi,4), %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_lea_add_scale_offset:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    # kill: def $esi killed $esi def $rsi
@@ -805,6 +871,13 @@ define i32 @test_lea_add_scale_offset_big(i32, i32) {
 ; SKYLAKE-NEXT:    # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_add_scale_offset_big:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    # kill: def $esi killed $esi def $rsi
+; BDVER2-NEXT:    # kill: def $edi killed $edi def $rdi
+; BDVER2-NEXT:    leal -1200(%rdi,%rsi,8), %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_lea_add_scale_offset_big:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    # kill: def $esi killed $esi def $rsi
diff --git a/test/CodeGen/X86/lea64-schedule.ll b/test/CodeGen/X86/lea64-schedule.ll
index df9df9b21ef..82269aaeadd 100644
--- a/test/CodeGen/X86/lea64-schedule.ll
+++ b/test/CodeGen/X86/lea64-schedule.ll
@@ -8,6 +8,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell   | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake     | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl         | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=-slow-3ops-lea      | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2      | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1      | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
 
@@ -53,6 +54,11 @@ define i64 @test_lea_offset(i64) {
 ; SKYLAKE-NEXT:    leaq -24(%rdi), %rax # sched: [1:0.50]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_offset:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    leaq -24(%rdi), %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_lea_offset:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    leaq -24(%rdi), %rax # sched: [1:0.50]
@@ -108,6 +114,11 @@ define i64 @test_lea_offset_big(i64) {
 ; SKYLAKE-NEXT:    leaq 1024(%rdi), %rax # sched: [1:0.50]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_offset_big:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    leaq 1024(%rdi), %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_lea_offset_big:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    leaq 1024(%rdi), %rax # sched: [1:0.50]
@@ -164,6 +175,11 @@ define i64 @test_lea_add(i64, i64) {
 ; SKYLAKE-NEXT:    leaq (%rdi,%rsi), %rax # sched: [1:0.50]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_add:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    leaq (%rdi,%rsi), %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_lea_add:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    leaq (%rdi,%rsi), %rax # sched: [1:0.50]
@@ -224,6 +240,11 @@ define i64 @test_lea_add_offset(i64, i64) {
 ; SKYLAKE-NEXT:    addq $16, %rax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_add_offset:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    leaq 16(%rdi,%rsi), %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_lea_add_offset:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    leaq 16(%rdi,%rsi), %rax # sched: [2:1.00]
@@ -290,6 +311,11 @@ define i64 @test_lea_add_offset_big(i64, i64) {
 ; SKYLAKE-NEXT:    # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_add_offset_big:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    leaq -4096(%rdi,%rsi), %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_lea_add_offset_big:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    leaq -4096(%rdi,%rsi), %rax # sched: [2:1.00]
@@ -346,6 +372,11 @@ define i64 @test_lea_mul(i64) {
 ; SKYLAKE-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_mul:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_lea_mul:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [2:1.00]
@@ -406,6 +437,11 @@ define i64 @test_lea_mul_offset(i64) {
 ; SKYLAKE-NEXT:    addq $-32, %rax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_mul_offset:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    leaq -32(%rdi,%rdi,2), %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_lea_mul_offset:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    leaq -32(%rdi,%rdi,2), %rax # sched: [2:1.00]
@@ -472,6 +508,11 @@ define i64 @test_lea_mul_offset_big(i64) {
 ; SKYLAKE-NEXT:    # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_mul_offset_big:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    leaq 10000(%rdi,%rdi,8), %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_lea_mul_offset_big:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    leaq 10000(%rdi,%rdi,8), %rax # sched: [2:1.00]
@@ -528,6 +569,11 @@ define i64 @test_lea_add_scale(i64, i64) {
 ; SKYLAKE-NEXT:    leaq (%rdi,%rsi,2), %rax # sched: [1:0.50]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_add_scale:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    leaq (%rdi,%rsi,2), %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_lea_add_scale:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    leaq (%rdi,%rsi,2), %rax # sched: [2:1.00]
@@ -589,6 +635,11 @@ define i64 @test_lea_add_scale_offset(i64, i64) {
 ; SKYLAKE-NEXT:    addq $96, %rax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_add_scale_offset:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    leaq 96(%rdi,%rsi,4), %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_lea_add_scale_offset:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    leaq 96(%rdi,%rsi,4), %rax # sched: [2:1.00]
@@ -656,6 +707,11 @@ define i64 @test_lea_add_scale_offset_big(i64, i64) {
 ; SKYLAKE-NEXT:    # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_add_scale_offset_big:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    leaq -1200(%rdi,%rsi,8), %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_lea_add_scale_offset_big:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    leaq -1200(%rdi,%rsi,8), %rax # sched: [2:1.00]
diff --git a/test/CodeGen/X86/lwp-schedule.ll b/test/CodeGen/X86/lwp-schedule.ll
index 9e517ac62da..11699e7d37f 100644
--- a/test/CodeGen/X86/lwp-schedule.ll
+++ b/test/CodeGen/X86/lwp-schedule.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule -mcpu=x86-64 -mattr=+lwp | FileCheck %s --check-prefix=GENERIC
-; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule -mcpu=bdver1 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER1
-; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule -mcpu=bdver2 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER2
+; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule -mcpu=x86-64 -mattr=+lwp | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER12 --check-prefix=BDVER1
+; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule -mcpu=x86-64 -mattr=+lwp | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER12 --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule -mcpu=bdver3 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER3
 ; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule -mcpu=bdver4 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER4
 
@@ -11,10 +11,20 @@ define void @test_llwpcb(i8 *%a0) nounwind {
 ; GENERIC-NEXT:    llwpcb %rdi # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_llwpcb:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    llwpcb %rdi
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_llwpcb:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    llwpcb %rdi # sched: [100:0.33]
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_llwpcb:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    llwpcb %rdi
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_llwpcb:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    llwpcb %rdi
+; BDVER4-NEXT:    retq
   tail call void @llvm.x86.llwpcb(i8 *%a0)
   ret void
 }
@@ -25,10 +35,20 @@ define i8* @test_slwpcb(i8 *%a0) nounwind {
 ; GENERIC-NEXT:    slwpcb %rax # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_slwpcb:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    slwpcb %rax
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_slwpcb:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    slwpcb %rax # sched: [100:0.33]
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_slwpcb:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    slwpcb %rax
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_slwpcb:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    slwpcb %rax
+; BDVER4-NEXT:    retq
   %1 = tail call i8* @llvm.x86.slwpcb()
   ret i8 *%1
 }
@@ -42,12 +62,27 @@ define i8 @test_lwpins32_rri(i32 %a0, i32 %a1) nounwind {
 ; GENERIC-NEXT:    setb %al # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_lwpins32_rri:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    addl %esi, %esi
-; BDVER-NEXT:    lwpins $-1985229329, %esi, %edi # imm = 0x89ABCDEF
-; BDVER-NEXT:    setb %al
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_lwpins32_rri:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    addl %esi, %esi # sched: [1:0.33]
+; BDVER12-NEXT:    lwpins $-1985229329, %esi, %edi # imm = 0x89ABCDEF
+; BDVER12-NEXT:    # sched: [100:0.33]
+; BDVER12-NEXT:    setb %al # sched: [1:0.50]
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_lwpins32_rri:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    addl %esi, %esi
+; BDVER3-NEXT:    lwpins $-1985229329, %esi, %edi # imm = 0x89ABCDEF
+; BDVER3-NEXT:    setb %al
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_lwpins32_rri:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    addl %esi, %esi
+; BDVER4-NEXT:    lwpins $-1985229329, %esi, %edi # imm = 0x89ABCDEF
+; BDVER4-NEXT:    setb %al
+; BDVER4-NEXT:    retq
   %1 = add i32 %a1, %a1
   %2 = tail call i8 @llvm.x86.lwpins32(i32 %a0, i32 %1, i32 2309737967)
   ret i8 %2
@@ -61,11 +96,24 @@ define i8 @test_lwpins32_rmi(i32 %a0, i32 *%p1) nounwind {
 ; GENERIC-NEXT:    setb %al # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_lwpins32_rmi:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    lwpins $1985229328, (%rsi), %edi # imm = 0x76543210
-; BDVER-NEXT:    setb %al
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_lwpins32_rmi:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    lwpins $1985229328, (%rsi), %edi # imm = 0x76543210
+; BDVER12-NEXT:    # sched: [100:0.33]
+; BDVER12-NEXT:    setb %al # sched: [1:0.50]
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_lwpins32_rmi:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    lwpins $1985229328, (%rsi), %edi # imm = 0x76543210
+; BDVER3-NEXT:    setb %al
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_lwpins32_rmi:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    lwpins $1985229328, (%rsi), %edi # imm = 0x76543210
+; BDVER4-NEXT:    setb %al
+; BDVER4-NEXT:    retq
   %a1 = load i32, i32 *%p1
   %1 = tail call i8 @llvm.x86.lwpins32(i32 %a0, i32 %a1, i32 1985229328)
   ret i8 %1
@@ -79,11 +127,24 @@ define i8 @test_lwpins64_rri(i64 %a0, i32 %a1) nounwind {
 ; GENERIC-NEXT:    setb %al # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_lwpins64_rri:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    lwpins $-1985229329, %esi, %rdi # imm = 0x89ABCDEF
-; BDVER-NEXT:    setb %al
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_lwpins64_rri:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    lwpins $-1985229329, %esi, %rdi # imm = 0x89ABCDEF
+; BDVER12-NEXT:    # sched: [100:0.33]
+; BDVER12-NEXT:    setb %al # sched: [1:0.50]
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_lwpins64_rri:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    lwpins $-1985229329, %esi, %rdi # imm = 0x89ABCDEF
+; BDVER3-NEXT:    setb %al
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_lwpins64_rri:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    lwpins $-1985229329, %esi, %rdi # imm = 0x89ABCDEF
+; BDVER4-NEXT:    setb %al
+; BDVER4-NEXT:    retq
   %1 = tail call i8 @llvm.x86.lwpins64(i64 %a0, i32 %a1, i32 2309737967)
   ret i8 %1
 }
@@ -96,11 +157,24 @@ define i8 @test_lwpins64_rmi(i64 %a0, i32 *%p1) nounwind {
 ; GENERIC-NEXT:    setb %al # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_lwpins64_rmi:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    lwpins $1985229328, (%rsi), %rdi # imm = 0x76543210
-; BDVER-NEXT:    setb %al
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_lwpins64_rmi:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    lwpins $1985229328, (%rsi), %rdi # imm = 0x76543210
+; BDVER12-NEXT:    # sched: [100:0.33]
+; BDVER12-NEXT:    setb %al # sched: [1:0.50]
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_lwpins64_rmi:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    lwpins $1985229328, (%rsi), %rdi # imm = 0x76543210
+; BDVER3-NEXT:    setb %al
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_lwpins64_rmi:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    lwpins $1985229328, (%rsi), %rdi # imm = 0x76543210
+; BDVER4-NEXT:    setb %al
+; BDVER4-NEXT:    retq
   %a1 = load i32, i32 *%p1
   %1 = tail call i8 @llvm.x86.lwpins64(i64 %a0, i32 %a1, i32 1985229328)
   ret i8 %1
@@ -114,11 +188,24 @@ define void @test_lwpval32_rri(i32 %a0, i32 %a1) nounwind {
 ; GENERIC-NEXT:    # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_lwpval32_rri:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    addl %esi, %esi
-; BDVER-NEXT:    lwpval $-19088744, %esi, %edi # imm = 0xFEDCBA98
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_lwpval32_rri:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    addl %esi, %esi # sched: [1:0.33]
+; BDVER12-NEXT:    lwpval $-19088744, %esi, %edi # imm = 0xFEDCBA98
+; BDVER12-NEXT:    # sched: [100:0.33]
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_lwpval32_rri:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    addl %esi, %esi
+; BDVER3-NEXT:    lwpval $-19088744, %esi, %edi # imm = 0xFEDCBA98
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_lwpval32_rri:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    addl %esi, %esi
+; BDVER4-NEXT:    lwpval $-19088744, %esi, %edi # imm = 0xFEDCBA98
+; BDVER4-NEXT:    retq
   %1 = add i32 %a1, %a1
   tail call void @llvm.x86.lwpval32(i32 %a0, i32 %1, i32 4275878552)
   ret void
@@ -131,10 +218,21 @@ define void @test_lwpval32_rmi(i32 %a0, i32 *%p1) nounwind {
 ; GENERIC-NEXT:    # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_lwpval32_rmi:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    lwpval $305419896, (%rsi), %edi # imm = 0x12345678
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_lwpval32_rmi:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    lwpval $305419896, (%rsi), %edi # imm = 0x12345678
+; BDVER12-NEXT:    # sched: [100:0.33]
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_lwpval32_rmi:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    lwpval $305419896, (%rsi), %edi # imm = 0x12345678
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_lwpval32_rmi:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    lwpval $305419896, (%rsi), %edi # imm = 0x12345678
+; BDVER4-NEXT:    retq
   %a1 = load i32, i32 *%p1
   tail call void @llvm.x86.lwpval32(i32 %a0, i32 %a1, i32 305419896)
   ret void
@@ -147,10 +245,21 @@ define void @test_lwpval64_rri(i64 %a0, i32 %a1) nounwind {
 ; GENERIC-NEXT:    # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_lwpval64_rri:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    lwpval $-19088744, %esi, %rdi # imm = 0xFEDCBA98
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_lwpval64_rri:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    lwpval $-19088744, %esi, %rdi # imm = 0xFEDCBA98
+; BDVER12-NEXT:    # sched: [100:0.33]
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_lwpval64_rri:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    lwpval $-19088744, %esi, %rdi # imm = 0xFEDCBA98
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_lwpval64_rri:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    lwpval $-19088744, %esi, %rdi # imm = 0xFEDCBA98
+; BDVER4-NEXT:    retq
   tail call void @llvm.x86.lwpval64(i64 %a0, i32 %a1, i32 4275878552)
   ret void
 }
@@ -162,10 +271,21 @@ define void @test_lwpval64_rmi(i64 %a0, i32 *%p1) nounwind {
 ; GENERIC-NEXT:    # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_lwpval64_rmi:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    lwpval $305419896, (%rsi), %rdi # imm = 0x12345678
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_lwpval64_rmi:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    lwpval $305419896, (%rsi), %rdi # imm = 0x12345678
+; BDVER12-NEXT:    # sched: [100:0.33]
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_lwpval64_rmi:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    lwpval $305419896, (%rsi), %rdi # imm = 0x12345678
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_lwpval64_rmi:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    lwpval $305419896, (%rsi), %rdi # imm = 0x12345678
+; BDVER4-NEXT:    retq
   %a1 = load i32, i32 *%p1
   tail call void @llvm.x86.lwpval64(i64 %a0, i32 %a1, i32 305419896)
   ret void
diff --git a/test/CodeGen/X86/lzcnt-schedule.ll b/test/CodeGen/X86/lzcnt-schedule.ll
index 001bb0be397..15622ad3426 100644
--- a/test/CodeGen/X86/lzcnt-schedule.ll
+++ b/test/CodeGen/X86/lzcnt-schedule.ll
@@ -4,6 +4,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake   | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl       | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+lzcnt    | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2    | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1    | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
 
@@ -40,6 +41,14 @@ define i16 @test_ctlz_i16(i16 zeroext %a0, i16 *%a1) {
 ; SKYLAKE-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_ctlz_i16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    lzcntw (%rsi), %cx # sched: [8:1.00]
+; BDVER2-NEXT:    lzcntw %di, %ax # sched: [3:1.00]
+; BDVER2-NEXT:    orl %ecx, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    # kill: def $ax killed $ax killed $eax
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_ctlz_i16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    lzcntw (%rsi), %cx # sched: [4:1.00]
@@ -92,6 +101,13 @@ define i32 @test_ctlz_i32(i32 %a0, i32 *%a1) {
 ; SKYLAKE-NEXT:    orl %ecx, %eax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_ctlz_i32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    lzcntl (%rsi), %ecx # sched: [8:1.00]
+; BDVER2-NEXT:    lzcntl %edi, %eax # sched: [3:1.00]
+; BDVER2-NEXT:    orl %ecx, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_ctlz_i32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    lzcntl (%rsi), %ecx # sched: [4:1.00]
@@ -142,6 +158,13 @@ define i64 @test_ctlz_i64(i64 %a0, i64 *%a1) {
 ; SKYLAKE-NEXT:    orq %rcx, %rax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_ctlz_i64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    lzcntq (%rsi), %rcx # sched: [8:1.00]
+; BDVER2-NEXT:    lzcntq %rdi, %rax # sched: [3:1.00]
+; BDVER2-NEXT:    orq %rcx, %rax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_ctlz_i64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    lzcntq (%rsi), %rcx # sched: [4:1.00]
diff --git a/test/CodeGen/X86/mmx-schedule.ll b/test/CodeGen/X86/mmx-schedule.ll
index f4e047cd686..0fec25a8fa6 100644
--- a/test/CodeGen/X86/mmx-schedule.ll
+++ b/test/CodeGen/X86/mmx-schedule.ll
@@ -8,6 +8,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+avx -mattr=+ssse3 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
 
@@ -76,6 +77,14 @@ define i64 @test_cvtpd2pi(<2 x double> %a0, <2 x double>* %a1) optsize {
 ; SKX-NEXT:    movq %mm1, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cvtpd2pi:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    cvtpd2pi (%rdi), %mm0 # sched: [10:1.00]
+; BDVER2-NEXT:    cvtpd2pi %xmm0, %mm1 # sched: [4:1.00]
+; BDVER2-NEXT:    por %mm1, %mm0 # sched: [1:0.33]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_cvtpd2pi:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    cvtpd2pi (%rdi), %mm1 # sched: [8:1.00]
@@ -157,6 +166,13 @@ define <2 x double> @test_cvtpi2pd(x86_mmx %a0, x86_mmx* %a1) optsize {
 ; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cvtpi2pd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    cvtpi2pd %mm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    cvtpi2pd (%rdi), %xmm1 # sched: [10:1.00]
+; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_cvtpi2pd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    cvtpi2pd (%rdi), %xmm1 # sched: [8:1.00]
@@ -235,6 +251,13 @@ define <4 x float> @test_cvtpi2ps(x86_mmx %a0, x86_mmx* %a1, <4 x float> %a2, <4
 ; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cvtpi2ps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    cvtpi2ps %mm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    cvtpi2ps (%rdi), %xmm1 # sched: [9:1.00]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_cvtpi2ps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    cvtpi2ps (%rdi), %xmm1 # sched: [8:1.00]
@@ -321,6 +344,14 @@ define i64 @test_cvtps2pi(<4 x float> %a0, <4 x float>* %a1) optsize {
 ; SKX-NEXT:    movq %mm1, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cvtps2pi:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    cvtps2pi %xmm0, %mm0 # sched: [3:1.00]
+; BDVER2-NEXT:    cvtps2pi (%rdi), %mm1 # sched: [9:1.00]
+; BDVER2-NEXT:    por %mm0, %mm1 # sched: [1:0.33]
+; BDVER2-NEXT:    movq %mm1, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_cvtps2pi:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    cvtps2pi (%rdi), %mm1 # sched: [8:1.00]
@@ -410,6 +441,14 @@ define i64 @test_cvttpd2pi(<2 x double> %a0, <2 x double>* %a1) optsize {
 ; SKX-NEXT:    movq %mm1, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cvttpd2pi:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    cvttpd2pi (%rdi), %mm0 # sched: [10:1.00]
+; BDVER2-NEXT:    cvttpd2pi %xmm0, %mm1 # sched: [4:1.00]
+; BDVER2-NEXT:    por %mm1, %mm0 # sched: [1:0.33]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_cvttpd2pi:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    cvttpd2pi (%rdi), %mm1 # sched: [8:1.00]
@@ -499,6 +538,14 @@ define i64 @test_cvttps2pi(<4 x float> %a0, <4 x float>* %a1) optsize {
 ; SKX-NEXT:    movq %mm1, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cvttps2pi:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    cvttps2pi %xmm0, %mm0 # sched: [3:1.00]
+; BDVER2-NEXT:    cvttps2pi (%rdi), %mm1 # sched: [9:1.00]
+; BDVER2-NEXT:    por %mm0, %mm1 # sched: [1:0.33]
+; BDVER2-NEXT:    movq %mm1, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_cvttps2pi:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    cvttps2pi (%rdi), %mm1 # sched: [8:1.00]
@@ -564,6 +611,11 @@ define void @test_emms() optsize {
 ; SKX-NEXT:    emms # sched: [10:4.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_emms:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    emms # sched: [31:10.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_emms:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    emms # sched: [2:0.50]
@@ -619,6 +671,11 @@ define void @test_maskmovq(x86_mmx %a0, x86_mmx %a1, i8* %a2) optsize {
 ; SKX-NEXT:    maskmovq %mm1, %mm0 # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_maskmovq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    maskmovq %mm1, %mm0 # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_maskmovq:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    maskmovq %mm1, %mm0 # sched: [1:0.50]
@@ -722,6 +779,17 @@ define i32 @test_movd(x86_mmx %a0, i32 %a1, i32 *%a2) {
 ; SKX-NEXT:    movl %ecx, (%rsi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_movd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movd %edi, %mm1 # sched: [1:1.00]
+; BDVER2-NEXT:    movd (%rsi), %mm2 # sched: [5:0.50]
+; BDVER2-NEXT:    paddd %mm1, %mm2 # sched: [3:1.00]
+; BDVER2-NEXT:    paddd %mm2, %mm0 # sched: [3:1.00]
+; BDVER2-NEXT:    movd %mm2, %ecx # sched: [2:1.00]
+; BDVER2-NEXT:    movd %mm0, %eax # sched: [2:1.00]
+; BDVER2-NEXT:    movl %ecx, (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_movd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movd %edi, %mm1 # sched: [8:0.50]
@@ -815,6 +883,13 @@ define i64 @test_movdq2q(<2 x i64> %a0) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_movdq2q:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movdq2q %xmm0, %mm0 # sched: [2:1.00]
+; BDVER2-NEXT:    paddd %mm0, %mm0 # sched: [3:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_movdq2q:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movdq2q %xmm0, %mm0 # sched: [1:0.50]
@@ -876,6 +951,11 @@ define void @test_movntq(x86_mmx* %a0, x86_mmx %a1) optsize {
 ; SKX-NEXT:    movntq %mm0, (%rdi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_movntq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movntq %mm0, (%rdi) # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_movntq:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movntq %mm0, (%rdi) # sched: [2:1.00]
@@ -949,6 +1029,13 @@ define void @test_movq(i64 *%a0) {
 ; SKX-NEXT:    movq %mm0, (%rdi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_movq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movq (%rdi), %mm0 # sched: [5:0.50]
+; BDVER2-NEXT:    paddd %mm0, %mm0 # sched: [3:1.00]
+; BDVER2-NEXT:    movq %mm0, (%rdi) # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_movq:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movq (%rdi), %mm0 # sched: [5:1.00]
@@ -1011,6 +1098,11 @@ define <2 x i64> @test_movq2dq(x86_mmx %a0) optsize {
 ; SKX-NEXT:    movq2dq %mm0, %xmm0 # sched: [2:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_movq2dq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movq2dq %mm0, %xmm0 # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_movq2dq:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movq2dq %mm0, %xmm0 # sched: [1:0.50]
@@ -1082,6 +1174,13 @@ define i64 @test_pabsb(x86_mmx *%a0) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pabsb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pabsb (%rdi), %mm0 # sched: [6:0.50]
+; BDVER2-NEXT:    pabsb %mm0, %mm0 # sched: [1:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_pabsb:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pabsb (%rdi), %mm0 # sched: [6:1.00]
@@ -1160,6 +1259,13 @@ define i64 @test_pabsd(x86_mmx *%a0) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pabsd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pabsd (%rdi), %mm0 # sched: [6:0.50]
+; BDVER2-NEXT:    pabsd %mm0, %mm0 # sched: [1:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_pabsd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pabsd (%rdi), %mm0 # sched: [6:1.00]
@@ -1238,6 +1344,13 @@ define i64 @test_pabsw(x86_mmx *%a0) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pabsw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pabsw (%rdi), %mm0 # sched: [6:0.50]
+; BDVER2-NEXT:    pabsw %mm0, %mm0 # sched: [1:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_pabsw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pabsw (%rdi), %mm0 # sched: [6:1.00]
@@ -1316,6 +1429,13 @@ define i64 @test_packssdw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_packssdw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    packssdw %mm1, %mm0 # sched: [1:1.00]
+; BDVER2-NEXT:    packssdw (%rdi), %mm0 # sched: [6:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_packssdw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    packssdw %mm1, %mm0 # sched: [1:0.50]
@@ -1394,6 +1514,13 @@ define i64 @test_packsswb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_packsswb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    packsswb %mm1, %mm0 # sched: [1:1.00]
+; BDVER2-NEXT:    packsswb (%rdi), %mm0 # sched: [6:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_packsswb:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    packsswb %mm1, %mm0 # sched: [1:0.50]
@@ -1472,6 +1599,13 @@ define i64 @test_packuswb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_packuswb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    packuswb %mm1, %mm0 # sched: [1:1.00]
+; BDVER2-NEXT:    packuswb (%rdi), %mm0 # sched: [6:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_packuswb:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    packuswb %mm1, %mm0 # sched: [1:0.50]
@@ -1550,6 +1684,13 @@ define i64 @test_paddb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_paddb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    paddb %mm1, %mm0 # sched: [3:1.00]
+; BDVER2-NEXT:    paddb (%rdi), %mm0 # sched: [8:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_paddb:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    paddb %mm1, %mm0 # sched: [1:0.50]
@@ -1628,6 +1769,13 @@ define i64 @test_paddd(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_paddd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    paddd %mm1, %mm0 # sched: [3:1.00]
+; BDVER2-NEXT:    paddd (%rdi), %mm0 # sched: [8:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_paddd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    paddd %mm1, %mm0 # sched: [1:0.50]
@@ -1706,6 +1854,13 @@ define i64 @test_paddq(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_paddq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    paddq %mm1, %mm0 # sched: [1:0.50]
+; BDVER2-NEXT:    paddq (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_paddq:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    paddq %mm1, %mm0 # sched: [1:0.50]
@@ -1784,6 +1939,13 @@ define i64 @test_paddsb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_paddsb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    paddsb %mm1, %mm0 # sched: [3:1.00]
+; BDVER2-NEXT:    paddsb (%rdi), %mm0 # sched: [8:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_paddsb:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    paddsb %mm1, %mm0 # sched: [1:0.50]
@@ -1862,6 +2024,13 @@ define i64 @test_paddsw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_paddsw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    paddsw %mm1, %mm0 # sched: [3:1.00]
+; BDVER2-NEXT:    paddsw (%rdi), %mm0 # sched: [8:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_paddsw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    paddsw %mm1, %mm0 # sched: [1:0.50]
@@ -1940,6 +2109,13 @@ define i64 @test_paddusb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_paddusb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    paddusb %mm1, %mm0 # sched: [3:1.00]
+; BDVER2-NEXT:    paddusb (%rdi), %mm0 # sched: [8:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_paddusb:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    paddusb %mm1, %mm0 # sched: [1:0.50]
@@ -2018,6 +2194,13 @@ define i64 @test_paddusw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_paddusw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    paddusw %mm1, %mm0 # sched: [3:1.00]
+; BDVER2-NEXT:    paddusw (%rdi), %mm0 # sched: [8:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_paddusw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    paddusw %mm1, %mm0 # sched: [1:0.50]
@@ -2096,6 +2279,13 @@ define i64 @test_paddw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_paddw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    paddw %mm1, %mm0 # sched: [3:1.00]
+; BDVER2-NEXT:    paddw (%rdi), %mm0 # sched: [8:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_paddw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    paddw %mm1, %mm0 # sched: [1:0.50]
@@ -2174,6 +2364,13 @@ define i64 @test_palignr(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_palignr:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    palignr $1, %mm1, %mm0 # sched: [1:0.50]
+; BDVER2-NEXT:    palignr $1, (%rdi), %mm0 # sched: [6:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_palignr:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    palignr $1, %mm1, %mm0 # sched: [1:0.50]
@@ -2252,6 +2449,13 @@ define i64 @test_pand(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pand:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pand %mm1, %mm0 # sched: [1:0.33]
+; BDVER2-NEXT:    pand (%rdi), %mm0 # sched: [6:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_pand:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pand %mm1, %mm0 # sched: [1:0.50]
@@ -2330,6 +2534,13 @@ define i64 @test_pandn(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pandn:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pandn %mm1, %mm0 # sched: [1:0.33]
+; BDVER2-NEXT:    pandn (%rdi), %mm0 # sched: [6:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_pandn:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pandn %mm1, %mm0 # sched: [1:0.50]
@@ -2408,6 +2619,13 @@ define i64 @test_pavgb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pavgb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pavgb %mm1, %mm0 # sched: [3:1.00]
+; BDVER2-NEXT:    pavgb (%rdi), %mm0 # sched: [8:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_pavgb:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pavgb %mm1, %mm0 # sched: [1:0.50]
@@ -2486,6 +2704,13 @@ define i64 @test_pavgw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pavgw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pavgw %mm1, %mm0 # sched: [3:1.00]
+; BDVER2-NEXT:    pavgw (%rdi), %mm0 # sched: [8:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_pavgw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pavgw %mm1, %mm0 # sched: [1:0.50]
@@ -2564,6 +2789,13 @@ define i64 @test_pcmpeqb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pcmpeqb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pcmpeqb %mm1, %mm0 # sched: [3:1.00]
+; BDVER2-NEXT:    pcmpeqb (%rdi), %mm0 # sched: [8:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_pcmpeqb:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pcmpeqb %mm1, %mm0 # sched: [1:0.50]
@@ -2642,6 +2874,13 @@ define i64 @test_pcmpeqd(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pcmpeqd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pcmpeqd %mm1, %mm0 # sched: [3:1.00]
+; BDVER2-NEXT:    pcmpeqd (%rdi), %mm0 # sched: [8:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_pcmpeqd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pcmpeqd %mm1, %mm0 # sched: [1:0.50]
@@ -2720,6 +2959,13 @@ define i64 @test_pcmpeqw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pcmpeqw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pcmpeqw %mm1, %mm0 # sched: [3:1.00]
+; BDVER2-NEXT:    pcmpeqw (%rdi), %mm0 # sched: [8:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_pcmpeqw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pcmpeqw %mm1, %mm0 # sched: [1:0.50]
@@ -2798,6 +3044,13 @@ define i64 @test_pcmpgtb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pcmpgtb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pcmpgtb %mm1, %mm0 # sched: [3:1.00]
+; BDVER2-NEXT:    pcmpgtb (%rdi), %mm0 # sched: [8:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_pcmpgtb:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pcmpgtb %mm1, %mm0 # sched: [1:0.50]
@@ -2876,6 +3129,13 @@ define i64 @test_pcmpgtd(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pcmpgtd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pcmpgtd %mm1, %mm0 # sched: [3:1.00]
+; BDVER2-NEXT:    pcmpgtd (%rdi), %mm0 # sched: [8:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_pcmpgtd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pcmpgtd %mm1, %mm0 # sched: [1:0.50]
@@ -2954,6 +3214,13 @@ define i64 @test_pcmpgtw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pcmpgtw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pcmpgtw %mm1, %mm0 # sched: [3:1.00]
+; BDVER2-NEXT:    pcmpgtw (%rdi), %mm0 # sched: [8:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_pcmpgtw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pcmpgtw %mm1, %mm0 # sched: [1:0.50]
@@ -3016,6 +3283,11 @@ define i32 @test_pextrw(x86_mmx %a0) optsize {
 ; SKX-NEXT:    pextrw $0, %mm0, %eax # sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pextrw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pextrw $0, %mm0, %eax # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_pextrw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pextrw $0, %mm0, %eax # sched: [3:1.00]
@@ -3087,6 +3359,13 @@ define i64 @test_phaddd(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_phaddd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    phaddd %mm1, %mm0 # sched: [3:1.50]
+; BDVER2-NEXT:    phaddd (%rdi), %mm0 # sched: [8:1.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_phaddd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    phaddd %mm1, %mm0 # sched: [1:0.50]
@@ -3165,6 +3444,13 @@ define i64 @test_phaddsw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_phaddsw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    phaddsw %mm1, %mm0 # sched: [3:1.50]
+; BDVER2-NEXT:    phaddsw (%rdi), %mm0 # sched: [8:1.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_phaddsw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    phaddsw %mm1, %mm0 # sched: [1:0.50]
@@ -3243,6 +3529,13 @@ define i64 @test_phaddw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_phaddw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    phaddw %mm1, %mm0 # sched: [3:1.50]
+; BDVER2-NEXT:    phaddw (%rdi), %mm0 # sched: [8:1.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_phaddw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    phaddw %mm1, %mm0 # sched: [1:0.50]
@@ -3321,6 +3614,13 @@ define i64 @test_phsubd(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_phsubd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    phsubd %mm1, %mm0 # sched: [3:1.50]
+; BDVER2-NEXT:    phsubd (%rdi), %mm0 # sched: [8:1.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_phsubd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    phsubd %mm1, %mm0 # sched: [1:0.50]
@@ -3399,6 +3699,13 @@ define i64 @test_phsubsw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_phsubsw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    phsubsw %mm1, %mm0 # sched: [3:1.50]
+; BDVER2-NEXT:    phsubsw (%rdi), %mm0 # sched: [8:1.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_phsubsw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    phsubsw %mm1, %mm0 # sched: [1:0.50]
@@ -3477,6 +3784,13 @@ define i64 @test_phsubw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_phsubw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    phsubw %mm1, %mm0 # sched: [3:1.50]
+; BDVER2-NEXT:    phsubw (%rdi), %mm0 # sched: [8:1.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_phsubw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    phsubw %mm1, %mm0 # sched: [1:0.50]
@@ -3563,6 +3877,14 @@ define i64 @test_pinsrw(x86_mmx %a0, i32 %a1, i16* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pinsrw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pinsrw $0, %edi, %mm0 # sched: [2:1.00]
+; BDVER2-NEXT:    movswl (%rsi), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    pinsrw $1, %eax, %mm0 # sched: [2:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_pinsrw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pinsrw $0, %edi, %mm0 # sched: [7:0.50]
@@ -3644,6 +3966,13 @@ define i64 @test_pmaddwd(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pmaddwd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pmaddwd %mm1, %mm0 # sched: [5:1.00]
+; BDVER2-NEXT:    pmaddwd (%rdi), %mm0 # sched: [10:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_pmaddwd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pmaddwd %mm1, %mm0 # sched: [2:1.00]
@@ -3722,6 +4051,13 @@ define i64 @test_pmaddubsw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pmaddubsw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pmaddubsw %mm1, %mm0 # sched: [5:1.00]
+; BDVER2-NEXT:    pmaddubsw (%rdi), %mm0 # sched: [10:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_pmaddubsw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pmaddubsw %mm1, %mm0 # sched: [2:1.00]
@@ -3800,6 +4136,13 @@ define i64 @test_pmaxsw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pmaxsw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pmaxsw %mm1, %mm0 # sched: [3:1.00]
+; BDVER2-NEXT:    pmaxsw (%rdi), %mm0 # sched: [8:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_pmaxsw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pmaxsw %mm1, %mm0 # sched: [1:0.50]
@@ -3878,6 +4221,13 @@ define i64 @test_pmaxub(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pmaxub:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pmaxub %mm1, %mm0 # sched: [3:1.00]
+; BDVER2-NEXT:    pmaxub (%rdi), %mm0 # sched: [8:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_pmaxub:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pmaxub %mm1, %mm0 # sched: [1:0.50]
@@ -3956,6 +4306,13 @@ define i64 @test_pminsw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pminsw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pminsw %mm1, %mm0 # sched: [3:1.00]
+; BDVER2-NEXT:    pminsw (%rdi), %mm0 # sched: [8:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_pminsw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pminsw %mm1, %mm0 # sched: [1:0.50]
@@ -4034,6 +4391,13 @@ define i64 @test_pminub(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pminub:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pminub %mm1, %mm0 # sched: [3:1.00]
+; BDVER2-NEXT:    pminub (%rdi), %mm0 # sched: [8:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_pminub:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pminub %mm1, %mm0 # sched: [1:0.50]
@@ -4096,6 +4460,11 @@ define i32 @test_pmovmskb(x86_mmx %a0) optsize {
 ; SKX-NEXT:    pmovmskb %mm0, %eax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pmovmskb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pmovmskb %mm0, %eax # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_pmovmskb:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pmovmskb %mm0, %eax # sched: [3:1.00]
@@ -4167,6 +4536,13 @@ define i64 @test_pmulhrsw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pmulhrsw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pmulhrsw %mm1, %mm0 # sched: [5:1.00]
+; BDVER2-NEXT:    pmulhrsw (%rdi), %mm0 # sched: [10:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_pmulhrsw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pmulhrsw %mm1, %mm0 # sched: [2:1.00]
@@ -4245,6 +4621,13 @@ define i64 @test_pmulhw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pmulhw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pmulhw %mm1, %mm0 # sched: [5:1.00]
+; BDVER2-NEXT:    pmulhw (%rdi), %mm0 # sched: [10:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_pmulhw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pmulhw %mm1, %mm0 # sched: [2:1.00]
@@ -4323,6 +4706,13 @@ define i64 @test_pmulhuw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pmulhuw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pmulhuw %mm1, %mm0 # sched: [5:1.00]
+; BDVER2-NEXT:    pmulhuw (%rdi), %mm0 # sched: [10:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_pmulhuw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pmulhuw %mm1, %mm0 # sched: [2:1.00]
@@ -4401,6 +4791,13 @@ define i64 @test_pmullw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pmullw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pmullw %mm1, %mm0 # sched: [5:1.00]
+; BDVER2-NEXT:    pmullw (%rdi), %mm0 # sched: [10:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_pmullw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pmullw %mm1, %mm0 # sched: [2:1.00]
@@ -4479,6 +4876,13 @@ define i64 @test_pmuludq(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pmuludq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pmuludq %mm1, %mm0 # sched: [5:1.00]
+; BDVER2-NEXT:    pmuludq (%rdi), %mm0 # sched: [10:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_pmuludq:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pmuludq %mm1, %mm0 # sched: [2:1.00]
@@ -4557,6 +4961,13 @@ define i64 @test_por(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_por:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    por %mm1, %mm0 # sched: [1:0.33]
+; BDVER2-NEXT:    por (%rdi), %mm0 # sched: [6:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_por:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    por %mm1, %mm0 # sched: [1:0.50]
@@ -4635,6 +5046,13 @@ define i64 @test_psadbw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_psadbw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    psadbw %mm1, %mm0 # sched: [5:1.00]
+; BDVER2-NEXT:    psadbw (%rdi), %mm0 # sched: [10:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_psadbw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    psadbw %mm1, %mm0 # sched: [2:0.50]
@@ -4713,6 +5131,13 @@ define i64 @test_pshufb(x86_mmx %a0, x86_mmx %a1, x86_mmx *%a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pshufb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pshufb %mm1, %mm0 # sched: [1:0.50]
+; BDVER2-NEXT:    pshufb (%rdi), %mm0 # sched: [6:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_pshufb:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pshufb %mm1, %mm0 # sched: [2:0.50]
@@ -4791,6 +5216,13 @@ define i64 @test_pshufw(x86_mmx *%a0) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pshufw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pshufw $0, (%rdi), %mm0 # mm0 = mem[0,0,0,0] sched: [6:1.00]
+; BDVER2-NEXT:    pshufw $0, %mm0, %mm0 # mm0 = mm0[0,0,0,0] sched: [1:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_pshufw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pshufw $0, (%rdi), %mm0 # mm0 = mem[0,0,0,0] sched: [6:1.00]
@@ -4869,6 +5301,13 @@ define i64 @test_psignb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_psignb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    psignb %mm1, %mm0 # sched: [1:0.50]
+; BDVER2-NEXT:    psignb (%rdi), %mm0 # sched: [6:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_psignb:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    psignb %mm1, %mm0 # sched: [1:0.50]
@@ -4947,6 +5386,13 @@ define i64 @test_psignd(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_psignd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    psignd %mm1, %mm0 # sched: [1:0.50]
+; BDVER2-NEXT:    psignd (%rdi), %mm0 # sched: [6:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_psignd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    psignd %mm1, %mm0 # sched: [1:0.50]
@@ -5025,6 +5471,13 @@ define i64 @test_psignw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_psignw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    psignw %mm1, %mm0 # sched: [1:0.50]
+; BDVER2-NEXT:    psignw (%rdi), %mm0 # sched: [6:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_psignw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    psignw %mm1, %mm0 # sched: [1:0.50]
@@ -5111,6 +5564,14 @@ define i64 @test_pslld(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pslld:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pslld %mm1, %mm0 # sched: [1:1.00]
+; BDVER2-NEXT:    pslld (%rdi), %mm0 # sched: [6:1.00]
+; BDVER2-NEXT:    pslld $7, %mm0 # sched: [1:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_pslld:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pslld %mm1, %mm0 # sched: [1:0.50]
@@ -5201,6 +5662,14 @@ define i64 @test_psllq(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_psllq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    psllq %mm1, %mm0 # sched: [1:1.00]
+; BDVER2-NEXT:    psllq (%rdi), %mm0 # sched: [6:1.00]
+; BDVER2-NEXT:    psllq $7, %mm0 # sched: [1:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_psllq:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    psllq %mm1, %mm0 # sched: [1:0.50]
@@ -5291,6 +5760,14 @@ define i64 @test_psllw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_psllw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    psllw %mm1, %mm0 # sched: [1:1.00]
+; BDVER2-NEXT:    psllw (%rdi), %mm0 # sched: [6:1.00]
+; BDVER2-NEXT:    psllw $7, %mm0 # sched: [1:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_psllw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    psllw %mm1, %mm0 # sched: [1:0.50]
@@ -5381,6 +5858,14 @@ define i64 @test_psrad(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_psrad:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    psrad %mm1, %mm0 # sched: [1:1.00]
+; BDVER2-NEXT:    psrad (%rdi), %mm0 # sched: [6:1.00]
+; BDVER2-NEXT:    psrad $7, %mm0 # sched: [1:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_psrad:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    psrad %mm1, %mm0 # sched: [1:0.50]
@@ -5471,6 +5956,14 @@ define i64 @test_psraw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_psraw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    psraw %mm1, %mm0 # sched: [1:1.00]
+; BDVER2-NEXT:    psraw (%rdi), %mm0 # sched: [6:1.00]
+; BDVER2-NEXT:    psraw $7, %mm0 # sched: [1:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_psraw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    psraw %mm1, %mm0 # sched: [1:0.50]
@@ -5561,6 +6054,14 @@ define i64 @test_psrld(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_psrld:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    psrld %mm1, %mm0 # sched: [1:1.00]
+; BDVER2-NEXT:    psrld (%rdi), %mm0 # sched: [6:1.00]
+; BDVER2-NEXT:    psrld $7, %mm0 # sched: [1:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_psrld:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    psrld %mm1, %mm0 # sched: [1:0.50]
@@ -5651,6 +6152,14 @@ define i64 @test_psrlq(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_psrlq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    psrlq %mm1, %mm0 # sched: [1:1.00]
+; BDVER2-NEXT:    psrlq (%rdi), %mm0 # sched: [6:1.00]
+; BDVER2-NEXT:    psrlq $7, %mm0 # sched: [1:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_psrlq:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    psrlq %mm1, %mm0 # sched: [1:0.50]
@@ -5741,6 +6250,14 @@ define i64 @test_psrlw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_psrlw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    psrlw %mm1, %mm0 # sched: [1:1.00]
+; BDVER2-NEXT:    psrlw (%rdi), %mm0 # sched: [6:1.00]
+; BDVER2-NEXT:    psrlw $7, %mm0 # sched: [1:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_psrlw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    psrlw %mm1, %mm0 # sched: [1:0.50]
@@ -5823,6 +6340,13 @@ define i64 @test_psubb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_psubb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    psubb %mm1, %mm0 # sched: [3:1.00]
+; BDVER2-NEXT:    psubb (%rdi), %mm0 # sched: [8:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_psubb:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    psubb %mm1, %mm0 # sched: [1:0.50]
@@ -5901,6 +6425,13 @@ define i64 @test_psubd(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_psubd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    psubd %mm1, %mm0 # sched: [3:1.00]
+; BDVER2-NEXT:    psubd (%rdi), %mm0 # sched: [8:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_psubd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    psubd %mm1, %mm0 # sched: [1:0.50]
@@ -5979,6 +6510,13 @@ define i64 @test_psubq(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_psubq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    psubq %mm1, %mm0 # sched: [3:1.00]
+; BDVER2-NEXT:    psubq (%rdi), %mm0 # sched: [8:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_psubq:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    psubq %mm1, %mm0 # sched: [1:0.50]
@@ -6057,6 +6595,13 @@ define i64 @test_psubsb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_psubsb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    psubsb %mm1, %mm0 # sched: [3:1.00]
+; BDVER2-NEXT:    psubsb (%rdi), %mm0 # sched: [8:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_psubsb:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    psubsb %mm1, %mm0 # sched: [1:0.50]
@@ -6135,6 +6680,13 @@ define i64 @test_psubsw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_psubsw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    psubsw %mm1, %mm0 # sched: [3:1.00]
+; BDVER2-NEXT:    psubsw (%rdi), %mm0 # sched: [8:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_psubsw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    psubsw %mm1, %mm0 # sched: [1:0.50]
@@ -6213,6 +6765,13 @@ define i64 @test_psubusb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_psubusb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    psubusb %mm1, %mm0 # sched: [3:1.00]
+; BDVER2-NEXT:    psubusb (%rdi), %mm0 # sched: [8:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_psubusb:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    psubusb %mm1, %mm0 # sched: [1:0.50]
@@ -6291,6 +6850,13 @@ define i64 @test_psubusw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_psubusw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    psubusw %mm1, %mm0 # sched: [3:1.00]
+; BDVER2-NEXT:    psubusw (%rdi), %mm0 # sched: [8:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_psubusw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    psubusw %mm1, %mm0 # sched: [1:0.50]
@@ -6369,6 +6935,13 @@ define i64 @test_psubw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_psubw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    psubw %mm1, %mm0 # sched: [3:1.00]
+; BDVER2-NEXT:    psubw (%rdi), %mm0 # sched: [8:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_psubw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    psubw %mm1, %mm0 # sched: [1:0.50]
@@ -6447,6 +7020,13 @@ define i64 @test_punpckhbw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_punpckhbw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    punpckhbw %mm1, %mm0 # mm0 = mm0[4],mm1[4],mm0[5],mm1[5],mm0[6],mm1[6],mm0[7],mm1[7] sched: [1:1.00]
+; BDVER2-NEXT:    punpckhbw (%rdi), %mm0 # mm0 = mm0[4],mem[4],mm0[5],mem[5],mm0[6],mem[6],mm0[7],mem[7] sched: [6:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_punpckhbw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    punpckhbw %mm1, %mm0 # mm0 = mm0[4],mm1[4],mm0[5],mm1[5],mm0[6],mm1[6],mm0[7],mm1[7] sched: [1:0.50]
@@ -6525,6 +7105,13 @@ define i64 @test_punpckhdq(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_punpckhdq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    punpckhdq %mm1, %mm0 # mm0 = mm0[1],mm1[1] sched: [1:1.00]
+; BDVER2-NEXT:    punpckhdq (%rdi), %mm0 # mm0 = mm0[1],mem[1] sched: [6:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_punpckhdq:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    punpckhdq %mm1, %mm0 # mm0 = mm0[1],mm1[1] sched: [1:0.50]
@@ -6603,6 +7190,13 @@ define i64 @test_punpckhwd(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_punpckhwd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    punpckhwd %mm1, %mm0 # mm0 = mm0[2],mm1[2],mm0[3],mm1[3] sched: [1:1.00]
+; BDVER2-NEXT:    punpckhwd (%rdi), %mm0 # mm0 = mm0[2],mem[2],mm0[3],mem[3] sched: [6:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_punpckhwd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    punpckhwd %mm1, %mm0 # mm0 = mm0[2],mm1[2],mm0[3],mm1[3] sched: [1:0.50]
@@ -6681,6 +7275,13 @@ define i64 @test_punpcklbw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_punpcklbw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    punpcklbw %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1],mm0[2],mm1[2],mm0[3],mm1[3] sched: [1:1.00]
+; BDVER2-NEXT:    punpcklbw (%rdi), %mm0 # mm0 = mm0[0],mem[0],mm0[1],mem[1],mm0[2],mem[2],mm0[3],mem[3] sched: [6:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_punpcklbw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    punpcklbw %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1],mm0[2],mm1[2],mm0[3],mm1[3] sched: [1:0.50]
@@ -6759,6 +7360,13 @@ define i64 @test_punpckldq(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_punpckldq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    punpckldq %mm1, %mm0 # mm0 = mm0[0],mm1[0] sched: [1:1.00]
+; BDVER2-NEXT:    punpckldq (%rdi), %mm0 # mm0 = mm0[0],mem[0] sched: [6:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_punpckldq:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    punpckldq %mm1, %mm0 # mm0 = mm0[0],mm1[0] sched: [1:0.50]
@@ -6837,6 +7445,13 @@ define i64 @test_punpcklwd(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_punpcklwd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1] sched: [1:1.00]
+; BDVER2-NEXT:    punpcklwd (%rdi), %mm0 # mm0 = mm0[0],mem[0],mm0[1],mem[1] sched: [6:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_punpcklwd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1] sched: [1:0.50]
@@ -6915,6 +7530,13 @@ define i64 @test_pxor(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pxor:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pxor %mm1, %mm0 # sched: [1:0.33]
+; BDVER2-NEXT:    pxor (%rdi), %mm0 # sched: [6:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_pxor:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pxor %mm1, %mm0 # sched: [1:0.50]
diff --git a/test/CodeGen/X86/popcnt-schedule.ll b/test/CodeGen/X86/popcnt-schedule.ll
index 4f590bd96bb..d84d489a068 100644
--- a/test/CodeGen/X86/popcnt-schedule.ll
+++ b/test/CodeGen/X86/popcnt-schedule.ll
@@ -8,6 +8,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell   | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake     | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl         | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+popcnt      | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2      | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1      | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
 
@@ -60,6 +61,14 @@ define i16 @test_ctpop_i16(i16 zeroext %a0, i16 *%a1) {
 ; SKYLAKE-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_ctpop_i16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    popcntw (%rsi), %cx # sched: [9:1.00]
+; BDVER2-NEXT:    popcntw %di, %ax # sched: [3:1.00]
+; BDVER2-NEXT:    orl %ecx, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    # kill: def $ax killed $ax killed $eax
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_ctpop_i16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    popcntw (%rsi), %cx # sched: [4:1.00]
@@ -126,6 +135,13 @@ define i32 @test_ctpop_i32(i32 %a0, i32 *%a1) {
 ; SKYLAKE-NEXT:    orl %ecx, %eax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_ctpop_i32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    popcntl (%rsi), %ecx # sched: [9:1.00]
+; BDVER2-NEXT:    popcntl %edi, %eax # sched: [3:1.00]
+; BDVER2-NEXT:    orl %ecx, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_ctpop_i32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    popcntl (%rsi), %ecx # sched: [4:1.00]
@@ -190,6 +206,13 @@ define i64 @test_ctpop_i64(i64 %a0, i64 *%a1) {
 ; SKYLAKE-NEXT:    orq %rcx, %rax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_ctpop_i64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    popcntq (%rsi), %rcx # sched: [9:1.00]
+; BDVER2-NEXT:    popcntq %rdi, %rax # sched: [3:1.00]
+; BDVER2-NEXT:    orq %rcx, %rax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_ctpop_i64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    popcntq (%rsi), %rcx # sched: [4:1.00]
diff --git a/test/CodeGen/X86/recip-fastmath.ll b/test/CodeGen/X86/recip-fastmath.ll
index 47484865693..167abccc339 100644
--- a/test/CodeGen/X86/recip-fastmath.ll
+++ b/test/CodeGen/X86/recip-fastmath.ll
@@ -2,6 +2,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2     | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE-RECIP
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx      | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX-RECIP
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=FMA-RECIP
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+fma4 -mattr=+avx -print-schedule      | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 -print-schedule      | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=SANDY
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -print-schedule     | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL
@@ -37,6 +38,12 @@ define float @f32_no_estimate(float %x) #0 {
 ; FMA-RECIP-NEXT:    vdivss %xmm0, %xmm1, %xmm0
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: f32_no_estimate:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50]
+; BDVER2-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # sched: [14:14.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: f32_no_estimate:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:1.00]
@@ -105,6 +112,13 @@ define float @f32_one_step(float %x) #1 {
 ; FMA-RECIP-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: f32_one_step:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
+; BDVER2-NEXT:    vfnmaddss {{.*}}(%rip), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER2-NEXT:    vfmaddss %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: f32_one_step:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:1.00]
@@ -202,6 +216,16 @@ define float @f32_two_step(float %x) #2 {
 ; FMA-RECIP-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: f32_two_step:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
+; BDVER2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
+; BDVER2-NEXT:    vfnmaddss %xmm2, %xmm1, %xmm0, %xmm3 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddss %xmm1, %xmm3, %xmm1, %xmm1 # sched: [5:0.50]
+; BDVER2-NEXT:    vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddss %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: f32_two_step:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [5:1.00]
@@ -300,6 +324,12 @@ define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 {
 ; FMA-RECIP-NEXT:    vdivps %xmm0, %xmm1, %xmm0
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v4f32_no_estimate:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovaps {{.*#+}} xmm1 = [1,1,1,1] sched: [6:0.50]
+; BDVER2-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # sched: [14:14.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: v4f32_no_estimate:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovaps {{.*#+}} xmm1 = [1,1,1,1] sched: [5:1.00]
@@ -368,6 +398,13 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v4f32_one_step:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
+; BDVER2-NEXT:    vfnmaddps {{.*}}(%rip), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER2-NEXT:    vfmaddps %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: v4f32_one_step:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1,1,1,1] sched: [5:1.00]
@@ -467,6 +504,16 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v4f32_two_step:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
+; BDVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; BDVER2-NEXT:    vfnmaddps %xmm2, %xmm1, %xmm0, %xmm3 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %xmm1, %xmm3, %xmm1, %xmm1 # sched: [5:0.50]
+; BDVER2-NEXT:    vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: v4f32_two_step:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovaps {{.*#+}} xmm3 = [1,1,1,1] sched: [5:1.00]
@@ -568,6 +615,12 @@ define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
 ; FMA-RECIP-NEXT:    vdivps %ymm0, %ymm1, %ymm0
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v8f32_no_estimate:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; BDVER2-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # sched: [29:28.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: v8f32_no_estimate:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovaps {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
@@ -643,6 +696,13 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v8f32_one_step:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
+; BDVER2-NEXT:    vfnmaddps {{.*}}(%rip), %ymm1, %ymm0, %ymm0 # sched: [10:0.50]
+; BDVER2-NEXT:    vfmaddps %ymm1, %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: v8f32_one_step:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
@@ -755,6 +815,16 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v8f32_two_step:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; BDVER2-NEXT:    vfnmaddps %ymm2, %ymm1, %ymm0, %ymm3 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %ymm1, %ymm3, %ymm1, %ymm1 # sched: [5:0.50]
+; BDVER2-NEXT:    vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %ymm1, %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: v8f32_two_step:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
@@ -864,6 +934,13 @@ define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 {
 ; FMA-RECIP-NEXT:    vdivps %ymm1, %ymm2, %ymm1
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v16f32_no_estimate:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; BDVER2-NEXT:    vdivps %ymm0, %ymm2, %ymm0 # sched: [29:28.00]
+; BDVER2-NEXT:    vdivps %ymm1, %ymm2, %ymm1 # sched: [29:28.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: v16f32_no_estimate:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
@@ -966,6 +1043,17 @@ define <16 x float> @v16f32_one_step(<16 x float> %x) #1 {
 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm2) + ymm2
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v16f32_one_step:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm2 # sched: [7:2.00]
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; BDVER2-NEXT:    vrcpps %ymm1, %ymm4 # sched: [7:2.00]
+; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm0, %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %ymm2, %ymm0, %ymm2, %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm4, %ymm1, %ymm1 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %ymm4, %ymm1, %ymm4, %ymm1 # sched: [5:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: v16f32_one_step:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
@@ -1136,6 +1224,21 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 {
 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm4) + ymm4
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v16f32_two_step:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm2 # sched: [7:2.00]
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm0, %ymm4 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %ymm2, %ymm4, %ymm2, %ymm2 # sched: [5:0.50]
+; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm0, %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %ymm2, %ymm0, %ymm2, %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vrcpps %ymm1, %ymm2 # sched: [7:2.00]
+; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm1, %ymm4 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %ymm2, %ymm4, %ymm2, %ymm2 # sched: [5:0.50]
+; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm1, %ymm1 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %ymm2, %ymm1, %ymm2, %ymm1 # sched: [5:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: v16f32_two_step:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovaps {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
diff --git a/test/CodeGen/X86/recip-fastmath2.ll b/test/CodeGen/X86/recip-fastmath2.ll
index fdd441581dc..f669d5dc3f5 100644
--- a/test/CodeGen/X86/recip-fastmath2.ll
+++ b/test/CodeGen/X86/recip-fastmath2.ll
@@ -2,6 +2,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -print-schedule      | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE-RECIP
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -print-schedule       | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX-RECIP
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -print-schedule  | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=FMA-RECIP
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+fma4 -mattr=+avx -print-schedule      | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 -print-schedule      | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=SANDY
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -print-schedule     | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL
@@ -30,6 +31,12 @@ define float @f32_no_step_2(float %x) #3 {
 ; FMA-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: f32_no_step_2:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: f32_no_step_2:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # sched: [2:1.00]
@@ -101,6 +108,14 @@ define float @f32_one_step_2(float %x) #1 {
 ; FMA-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: f32_one_step_2:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
+; BDVER2-NEXT:    vfnmaddss {{.*}}(%rip), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER2-NEXT:    vfmaddss %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: f32_one_step_2:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:1.00]
@@ -196,6 +211,15 @@ define float @f32_one_step_2_divs(float %x) #1 {
 ; FMA-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: f32_one_step_2_divs:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
+; BDVER2-NEXT:    vfnmaddss {{.*}}(%rip), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER2-NEXT:    vfmaddss %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:1.00]
+; BDVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: f32_one_step_2_divs:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:1.00]
@@ -309,6 +333,17 @@ define float @f32_two_step_2(float %x) #2 {
 ; FMA-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: f32_two_step_2:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
+; BDVER2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
+; BDVER2-NEXT:    vfnmaddss %xmm2, %xmm1, %xmm0, %xmm3 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddss %xmm1, %xmm3, %xmm1, %xmm1 # sched: [5:0.50]
+; BDVER2-NEXT:    vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddss %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: f32_two_step_2:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [5:1.00]
@@ -425,6 +460,14 @@ define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
 ; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v4f32_one_step2:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
+; BDVER2-NEXT:    vfnmaddps {{.*}}(%rip), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER2-NEXT:    vfmaddps %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: v4f32_one_step2:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1,1,1,1] sched: [5:1.00]
@@ -522,6 +565,15 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
 ; FMA-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v4f32_one_step_2_divs:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
+; BDVER2-NEXT:    vfnmaddps {{.*}}(%rip), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER2-NEXT:    vfmaddps %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:1.00]
+; BDVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: v4f32_one_step_2_divs:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1,1,1,1] sched: [5:1.00]
@@ -637,6 +689,17 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
 ; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v4f32_two_step2:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
+; BDVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; BDVER2-NEXT:    vfnmaddps %xmm2, %xmm1, %xmm0, %xmm3 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %xmm1, %xmm3, %xmm1, %xmm1 # sched: [5:0.50]
+; BDVER2-NEXT:    vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: v4f32_two_step2:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovaps {{.*#+}} xmm3 = [1,1,1,1] sched: [5:1.00]
@@ -761,6 +824,14 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
 ; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v8f32_one_step2:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
+; BDVER2-NEXT:    vfnmaddps {{.*}}(%rip), %ymm1, %ymm0, %ymm0 # sched: [10:0.50]
+; BDVER2-NEXT:    vfmaddps %ymm1, %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: v8f32_one_step2:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
@@ -867,6 +938,15 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
 ; FMA-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v8f32_one_step_2_divs:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
+; BDVER2-NEXT:    vfnmaddps {{.*}}(%rip), %ymm1, %ymm0, %ymm0 # sched: [10:0.50]
+; BDVER2-NEXT:    vfmaddps %ymm1, %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [12:1.00]
+; BDVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: v8f32_one_step_2_divs:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
@@ -996,6 +1076,17 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
 ; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v8f32_two_step2:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; BDVER2-NEXT:    vfnmaddps %ymm2, %ymm1, %ymm0, %ymm3 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %ymm1, %ymm3, %ymm1, %ymm1 # sched: [5:0.50]
+; BDVER2-NEXT:    vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %ymm1, %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: v8f32_two_step2:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
@@ -1097,6 +1188,11 @@ define <8 x float> @v8f32_no_step(<8 x float> %x) #3 {
 ; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm0
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v8f32_no_step:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [7:2.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: v8f32_no_step:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [2:2.00]
@@ -1151,6 +1247,12 @@ define <8 x float> @v8f32_no_step2(<8 x float> %x) #3 {
 ; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v8f32_no_step2:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [7:2.00]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: v8f32_no_step2:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [2:2.00]
@@ -1257,6 +1359,19 @@ define <16 x float> @v16f32_one_step2(<16 x float> %x) #1 {
 ; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v16f32_one_step2:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpps %ymm1, %ymm2 # sched: [7:2.00]
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm4 # sched: [7:2.00]
+; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm1, %ymm1 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %ymm2, %ymm1, %ymm2, %ymm1 # sched: [5:0.50]
+; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm4, %ymm0, %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %ymm4, %ymm0, %ymm4, %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: v16f32_one_step2:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
@@ -1415,6 +1530,21 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 {
 ; FMA-RECIP-NEXT:    vmulps %ymm1, %ymm2, %ymm1
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v16f32_one_step_2_divs:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm2 # sched: [7:2.00]
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm0, %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %ymm2, %ymm0, %ymm2, %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vrcpps %ymm1, %ymm2 # sched: [7:2.00]
+; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm1, %ymm1 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %ymm2, %ymm1, %ymm2, %ymm1 # sched: [5:0.50]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm2 # sched: [12:1.00]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm3 # sched: [12:1.00]
+; BDVER2-NEXT:    vmulps %ymm0, %ymm3, %ymm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: v16f32_one_step_2_divs:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
@@ -1613,6 +1743,23 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
 ; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v16f32_two_step2:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpps %ymm1, %ymm2 # sched: [7:2.00]
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm1, %ymm4 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %ymm2, %ymm4, %ymm2, %ymm2 # sched: [5:0.50]
+; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm1, %ymm1 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %ymm2, %ymm1, %ymm2, %ymm1 # sched: [5:0.50]
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm2 # sched: [7:2.00]
+; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm0, %ymm4 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %ymm2, %ymm4, %ymm2, %ymm2 # sched: [5:0.50]
+; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm0, %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %ymm2, %ymm0, %ymm2, %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: v16f32_two_step2:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovaps {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
@@ -1755,6 +1902,12 @@ define <16 x float> @v16f32_no_step(<16 x float> %x) #3 {
 ; FMA-RECIP-NEXT:    vrcpps %ymm1, %ymm1
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v16f32_no_step:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [7:2.00]
+; BDVER2-NEXT:    vrcpps %ymm1, %ymm1 # sched: [7:2.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: v16f32_no_step:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [2:2.00]
@@ -1821,6 +1974,14 @@ define <16 x float> @v16f32_no_step2(<16 x float> %x) #3 {
 ; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v16f32_no_step2:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpps %ymm1, %ymm1 # sched: [7:2.00]
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [7:2.00]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: v16f32_no_step2:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vrcpps %ymm1, %ymm1 # sched: [2:2.00]
diff --git a/test/CodeGen/X86/schedule-x86-64-shld.ll b/test/CodeGen/X86/schedule-x86-64-shld.ll
index 46388d7b4fd..a2e280126b4 100644
--- a/test/CodeGen/X86/schedule-x86-64-shld.ll
+++ b/test/CodeGen/X86/schedule-x86-64-shld.ll
@@ -1,7 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+avx -mattr=+slow-shld | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER12 --check-prefix=BDVER1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+avx -mattr=+slow-shld | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER12 --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER1
 
 
 ; uint64_t lshift10(uint64_t a, uint64_t b)
@@ -16,17 +17,17 @@ define i64 @lshift10_optsize(i64 %a, i64 %b) nounwind readnone optsize {
 ; GENERIC-NEXT:    shldq $10, %rsi, %rax # sched: [2:0.67]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER12-LABEL: lshift10_optsize:
+; BDVER12:       # %bb.0: # %entry
+; BDVER12-NEXT:    movq %rdi, %rax # sched: [1:0.33]
+; BDVER12-NEXT:    shldq $10, %rsi, %rax # sched: [2:0.67]
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: lshift10_optsize:
 ; BTVER2:       # %bb.0: # %entry
 ; BTVER2-NEXT:    movq %rdi, %rax # sched: [1:0.50]
 ; BTVER2-NEXT:    shldq $10, %rsi, %rax # sched: [3:3.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; BDVER1-LABEL: lshift10_optsize:
-; BDVER1:       # %bb.0: # %entry
-; BDVER1-NEXT:    movq %rdi, %rax
-; BDVER1-NEXT:    shldq $10, %rsi, %rax
-; BDVER1-NEXT:    retq
 entry:
   %shl = shl i64 %a, 10
   %shr = lshr i64 %b, 54
@@ -41,19 +42,19 @@ define i64 @lshift10(i64 %a, i64 %b) nounwind readnone {
 ; GENERIC-NEXT:    shldq $10, %rsi, %rax # sched: [2:0.67]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER12-LABEL: lshift10:
+; BDVER12:       # %bb.0: # %entry
+; BDVER12-NEXT:    shlq $10, %rdi # sched: [1:0.50]
+; BDVER12-NEXT:    shrq $54, %rsi # sched: [1:0.50]
+; BDVER12-NEXT:    leaq (%rsi,%rdi), %rax # sched: [1:0.50]
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: lshift10:
 ; BTVER2:       # %bb.0: # %entry
 ; BTVER2-NEXT:    shlq $10, %rdi # sched: [1:0.50]
 ; BTVER2-NEXT:    shrq $54, %rsi # sched: [1:0.50]
 ; BTVER2-NEXT:    leaq (%rsi,%rdi), %rax # sched: [1:0.50]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; BDVER1-LABEL: lshift10:
-; BDVER1:       # %bb.0: # %entry
-; BDVER1-NEXT:    shlq $10, %rdi
-; BDVER1-NEXT:    shrq $54, %rsi
-; BDVER1-NEXT:    leaq (%rsi,%rdi), %rax
-; BDVER1-NEXT:    retq
 entry:
   %shl = shl i64 %a, 10
   %shr = lshr i64 %b, 54
@@ -74,17 +75,17 @@ define i64 @rshift10_optsize(i64 %a, i64 %b) nounwind readnone optsize {
 ; GENERIC-NEXT:    shrdq $62, %rsi, %rax # sched: [2:0.67]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER12-LABEL: rshift10_optsize:
+; BDVER12:       # %bb.0: # %entry
+; BDVER12-NEXT:    movq %rdi, %rax # sched: [1:0.33]
+; BDVER12-NEXT:    shrdq $62, %rsi, %rax # sched: [2:0.67]
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: rshift10_optsize:
 ; BTVER2:       # %bb.0: # %entry
 ; BTVER2-NEXT:    movq %rdi, %rax # sched: [1:0.50]
 ; BTVER2-NEXT:    shrdq $62, %rsi, %rax # sched: [3:3.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; BDVER1-LABEL: rshift10_optsize:
-; BDVER1:       # %bb.0: # %entry
-; BDVER1-NEXT:    movq %rdi, %rax
-; BDVER1-NEXT:    shrdq $62, %rsi, %rax
-; BDVER1-NEXT:    retq
 entry:
   %shl = lshr i64 %a, 62
   %shr = shl i64 %b, 2
@@ -100,17 +101,17 @@ define i64 @rshift10(i64 %a, i64 %b) nounwind readnone {
 ; GENERIC-NEXT:    shrdq $62, %rsi, %rax # sched: [2:0.67]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER12-LABEL: rshift10:
+; BDVER12:       # %bb.0: # %entry
+; BDVER12-NEXT:    shrq $62, %rdi # sched: [1:0.50]
+; BDVER12-NEXT:    leaq (%rdi,%rsi,4), %rax # sched: [1:0.50]
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: rshift10:
 ; BTVER2:       # %bb.0: # %entry
 ; BTVER2-NEXT:    shrq $62, %rdi # sched: [1:0.50]
 ; BTVER2-NEXT:    leaq (%rdi,%rsi,4), %rax # sched: [2:1.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; BDVER1-LABEL: rshift10:
-; BDVER1:       # %bb.0: # %entry
-; BDVER1-NEXT:    shrq $62, %rdi
-; BDVER1-NEXT:    leaq (%rdi,%rsi,4), %rax
-; BDVER1-NEXT:    retq
 entry:
   %shl = lshr i64 %a, 62
   %shr = shl i64 %b, 2
@@ -132,6 +133,14 @@ define i64 @lshift_cl_optsize(i64 %a, i64 %b, i64 %c) nounwind readnone optsize
 ; GENERIC-NEXT:    shldq %cl, %rsi, %rax # sched: [4:1.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER12-LABEL: lshift_cl_optsize:
+; BDVER12:       # %bb.0: # %entry
+; BDVER12-NEXT:    movq %rdx, %rcx # sched: [1:0.33]
+; BDVER12-NEXT:    movq %rdi, %rax # sched: [1:0.33]
+; BDVER12-NEXT:    # kill: def $cl killed $cl killed $rcx
+; BDVER12-NEXT:    shldq %cl, %rsi, %rax # sched: [4:1.50]
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: lshift_cl_optsize:
 ; BTVER2:       # %bb.0: # %entry
 ; BTVER2-NEXT:    movq %rdx, %rcx # sched: [1:0.50]
@@ -139,14 +148,6 @@ define i64 @lshift_cl_optsize(i64 %a, i64 %b, i64 %c) nounwind readnone optsize
 ; BTVER2-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; BTVER2-NEXT:    shldq %cl, %rsi, %rax # sched: [4:4.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; BDVER1-LABEL: lshift_cl_optsize:
-; BDVER1:       # %bb.0: # %entry
-; BDVER1-NEXT:    movq %rdx, %rcx
-; BDVER1-NEXT:    movq %rdi, %rax
-; BDVER1-NEXT:    # kill: def $cl killed $cl killed $rcx
-; BDVER1-NEXT:    shldq %cl, %rsi, %rax
-; BDVER1-NEXT:    retq
 entry:
   %shl = shl i64 %a, %c
   %sub = sub nsw i64 64, %c
@@ -164,6 +165,17 @@ define i64 @lshift_cl(i64 %a, i64 %b, i64 %c) nounwind readnone {
 ; GENERIC-NEXT:    shldq %cl, %rsi, %rax # sched: [4:1.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER12-LABEL: lshift_cl:
+; BDVER12:       # %bb.0: # %entry
+; BDVER12-NEXT:    movq %rdx, %rcx # sched: [1:0.33]
+; BDVER12-NEXT:    movq %rsi, %rax # sched: [1:0.33]
+; BDVER12-NEXT:    shlq %cl, %rdi # sched: [3:1.50]
+; BDVER12-NEXT:    negl %ecx # sched: [1:0.33]
+; BDVER12-NEXT:    # kill: def $cl killed $cl killed $rcx
+; BDVER12-NEXT:    shrq %cl, %rax # sched: [3:1.50]
+; BDVER12-NEXT:    orq %rdi, %rax # sched: [1:0.33]
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: lshift_cl:
 ; BTVER2:       # %bb.0: # %entry
 ; BTVER2-NEXT:    movq %rdx, %rcx # sched: [1:0.50]
@@ -174,17 +186,6 @@ define i64 @lshift_cl(i64 %a, i64 %b, i64 %c) nounwind readnone {
 ; BTVER2-NEXT:    shrq %cl, %rax # sched: [1:0.50]
 ; BTVER2-NEXT:    orq %rdi, %rax # sched: [1:0.50]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; BDVER1-LABEL: lshift_cl:
-; BDVER1:       # %bb.0: # %entry
-; BDVER1-NEXT:    movq %rdx, %rcx
-; BDVER1-NEXT:    movq %rsi, %rax
-; BDVER1-NEXT:    shlq %cl, %rdi
-; BDVER1-NEXT:    negl %ecx
-; BDVER1-NEXT:    # kill: def $cl killed $cl killed $rcx
-; BDVER1-NEXT:    shrq %cl, %rax
-; BDVER1-NEXT:    orq %rdi, %rax
-; BDVER1-NEXT:    retq
 entry:
   %shl = shl i64 %a, %c
   %sub = sub nsw i64 64, %c
@@ -208,6 +209,14 @@ define i64 @rshift_cl_optsize(i64 %a, i64 %b, i64 %c) nounwind readnone optsize
 ; GENERIC-NEXT:    shrdq %cl, %rsi, %rax # sched: [4:1.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER12-LABEL: rshift_cl_optsize:
+; BDVER12:       # %bb.0: # %entry
+; BDVER12-NEXT:    movq %rdx, %rcx # sched: [1:0.33]
+; BDVER12-NEXT:    movq %rdi, %rax # sched: [1:0.33]
+; BDVER12-NEXT:    # kill: def $cl killed $cl killed $rcx
+; BDVER12-NEXT:    shrdq %cl, %rsi, %rax # sched: [4:1.50]
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: rshift_cl_optsize:
 ; BTVER2:       # %bb.0: # %entry
 ; BTVER2-NEXT:    movq %rdx, %rcx # sched: [1:0.50]
@@ -215,14 +224,6 @@ define i64 @rshift_cl_optsize(i64 %a, i64 %b, i64 %c) nounwind readnone optsize
 ; BTVER2-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; BTVER2-NEXT:    shrdq %cl, %rsi, %rax # sched: [4:4.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; BDVER1-LABEL: rshift_cl_optsize:
-; BDVER1:       # %bb.0: # %entry
-; BDVER1-NEXT:    movq %rdx, %rcx
-; BDVER1-NEXT:    movq %rdi, %rax
-; BDVER1-NEXT:    # kill: def $cl killed $cl killed $rcx
-; BDVER1-NEXT:    shrdq %cl, %rsi, %rax
-; BDVER1-NEXT:    retq
 entry:
   %shr = lshr i64 %a, %c
   %sub = sub nsw i64 64, %c
@@ -240,6 +241,17 @@ define i64 @rshift_cl(i64 %a, i64 %b, i64 %c) nounwind readnone {
 ; GENERIC-NEXT:    shrdq %cl, %rsi, %rax # sched: [4:1.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER12-LABEL: rshift_cl:
+; BDVER12:       # %bb.0: # %entry
+; BDVER12-NEXT:    movq %rdx, %rcx # sched: [1:0.33]
+; BDVER12-NEXT:    movq %rsi, %rax # sched: [1:0.33]
+; BDVER12-NEXT:    shrq %cl, %rdi # sched: [3:1.50]
+; BDVER12-NEXT:    negl %ecx # sched: [1:0.33]
+; BDVER12-NEXT:    # kill: def $cl killed $cl killed $rcx
+; BDVER12-NEXT:    shlq %cl, %rax # sched: [3:1.50]
+; BDVER12-NEXT:    orq %rdi, %rax # sched: [1:0.33]
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: rshift_cl:
 ; BTVER2:       # %bb.0: # %entry
 ; BTVER2-NEXT:    movq %rdx, %rcx # sched: [1:0.50]
@@ -250,17 +262,6 @@ define i64 @rshift_cl(i64 %a, i64 %b, i64 %c) nounwind readnone {
 ; BTVER2-NEXT:    shlq %cl, %rax # sched: [1:0.50]
 ; BTVER2-NEXT:    orq %rdi, %rax # sched: [1:0.50]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; BDVER1-LABEL: rshift_cl:
-; BDVER1:       # %bb.0: # %entry
-; BDVER1-NEXT:    movq %rdx, %rcx
-; BDVER1-NEXT:    movq %rsi, %rax
-; BDVER1-NEXT:    shrq %cl, %rdi
-; BDVER1-NEXT:    negl %ecx
-; BDVER1-NEXT:    # kill: def $cl killed $cl killed $rcx
-; BDVER1-NEXT:    shlq %cl, %rax
-; BDVER1-NEXT:    orq %rdi, %rax
-; BDVER1-NEXT:    retq
 entry:
   %shr = lshr i64 %a, %c
   %sub = sub nsw i64 64, %c
@@ -284,19 +285,19 @@ define void @lshift_mem_cl_optsize(i64 %a, i64 %c) nounwind readnone optsize {
 ; GENERIC-NEXT:    shldq %cl, %rdi, {{.*}}(%rip) # sched: [10:1.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER12-LABEL: lshift_mem_cl_optsize:
+; BDVER12:       # %bb.0: # %entry
+; BDVER12-NEXT:    movq %rsi, %rcx # sched: [1:0.33]
+; BDVER12-NEXT:    # kill: def $cl killed $cl killed $rcx
+; BDVER12-NEXT:    shldq %cl, %rdi, {{.*}}(%rip) # sched: [10:1.50]
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: lshift_mem_cl_optsize:
 ; BTVER2:       # %bb.0: # %entry
 ; BTVER2-NEXT:    movq %rsi, %rcx # sched: [1:0.50]
 ; BTVER2-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; BTVER2-NEXT:    shldq %cl, %rdi, {{.*}}(%rip) # sched: [9:11.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; BDVER1-LABEL: lshift_mem_cl_optsize:
-; BDVER1:       # %bb.0: # %entry
-; BDVER1-NEXT:    movq %rsi, %rcx
-; BDVER1-NEXT:    # kill: def $cl killed $cl killed $rcx
-; BDVER1-NEXT:    shldq %cl, %rdi, {{.*}}(%rip)
-; BDVER1-NEXT:    retq
 entry:
   %b = load i64, i64* @x
   %shl = shl i64 %b, %c
@@ -315,6 +316,18 @@ define void @lshift_mem_cl(i64 %a, i64 %c) nounwind readnone {
 ; GENERIC-NEXT:    shldq %cl, %rdi, {{.*}}(%rip) # sched: [10:1.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER12-LABEL: lshift_mem_cl:
+; BDVER12:       # %bb.0: # %entry
+; BDVER12-NEXT:    movq %rsi, %rcx # sched: [1:0.33]
+; BDVER12-NEXT:    movq {{.*}}(%rip), %rax # sched: [5:0.50]
+; BDVER12-NEXT:    shlq %cl, %rax # sched: [3:1.50]
+; BDVER12-NEXT:    negl %ecx # sched: [1:0.33]
+; BDVER12-NEXT:    # kill: def $cl killed $cl killed $rcx
+; BDVER12-NEXT:    shrq %cl, %rdi # sched: [3:1.50]
+; BDVER12-NEXT:    orq %rax, %rdi # sched: [1:0.33]
+; BDVER12-NEXT:    movq %rdi, {{.*}}(%rip) # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: lshift_mem_cl:
 ; BTVER2:       # %bb.0: # %entry
 ; BTVER2-NEXT:    movq {{.*}}(%rip), %rax # sched: [5:1.00]
@@ -326,18 +339,6 @@ define void @lshift_mem_cl(i64 %a, i64 %c) nounwind readnone {
 ; BTVER2-NEXT:    orq %rax, %rdi # sched: [1:0.50]
 ; BTVER2-NEXT:    movq %rdi, {{.*}}(%rip) # sched: [1:1.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; BDVER1-LABEL: lshift_mem_cl:
-; BDVER1:       # %bb.0: # %entry
-; BDVER1-NEXT:    movq %rsi, %rcx
-; BDVER1-NEXT:    movq {{.*}}(%rip), %rax
-; BDVER1-NEXT:    shlq %cl, %rax
-; BDVER1-NEXT:    negl %ecx
-; BDVER1-NEXT:    # kill: def $cl killed $cl killed $rcx
-; BDVER1-NEXT:    shrq %cl, %rdi
-; BDVER1-NEXT:    orq %rax, %rdi
-; BDVER1-NEXT:    movq %rdi, {{.*}}(%rip)
-; BDVER1-NEXT:    retq
 entry:
   %b = load i64, i64* @x
   %shl = shl i64 %b, %c
@@ -354,6 +355,15 @@ define void @lshift_mem(i64 %a) nounwind readnone {
 ; GENERIC-NEXT:    shldq $10, %rdi, {{.*}}(%rip) # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER12-LABEL: lshift_mem:
+; BDVER12:       # %bb.0: # %entry
+; BDVER12-NEXT:    movq {{.*}}(%rip), %rax # sched: [5:0.50]
+; BDVER12-NEXT:    shlq $10, %rax # sched: [1:0.50]
+; BDVER12-NEXT:    shrq $54, %rdi # sched: [1:0.50]
+; BDVER12-NEXT:    orq %rax, %rdi # sched: [1:0.33]
+; BDVER12-NEXT:    movq %rdi, {{.*}}(%rip) # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: lshift_mem:
 ; BTVER2:       # %bb.0: # %entry
 ; BTVER2-NEXT:    movq {{.*}}(%rip), %rax # sched: [5:1.00]
@@ -362,15 +372,6 @@ define void @lshift_mem(i64 %a) nounwind readnone {
 ; BTVER2-NEXT:    orq %rax, %rdi # sched: [1:0.50]
 ; BTVER2-NEXT:    movq %rdi, {{.*}}(%rip) # sched: [1:1.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; BDVER1-LABEL: lshift_mem:
-; BDVER1:       # %bb.0: # %entry
-; BDVER1-NEXT:    movq {{.*}}(%rip), %rax
-; BDVER1-NEXT:    shlq $10, %rax
-; BDVER1-NEXT:    shrq $54, %rdi
-; BDVER1-NEXT:    orq %rax, %rdi
-; BDVER1-NEXT:    movq %rdi, {{.*}}(%rip)
-; BDVER1-NEXT:    retq
 entry:
   %b = load i64, i64* @x
   %shl = shl i64 %b, 10
@@ -386,15 +387,15 @@ define void @lshift_mem_optsize(i64 %a) nounwind readnone optsize {
 ; GENERIC-NEXT:    shldq $10, %rdi, {{.*}}(%rip) # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER12-LABEL: lshift_mem_optsize:
+; BDVER12:       # %bb.0: # %entry
+; BDVER12-NEXT:    shldq $10, %rdi, {{.*}}(%rip) # sched: [8:1.00]
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: lshift_mem_optsize:
 ; BTVER2:       # %bb.0: # %entry
 ; BTVER2-NEXT:    shldq $10, %rdi, {{.*}}(%rip) # sched: [9:11.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; BDVER1-LABEL: lshift_mem_optsize:
-; BDVER1:       # %bb.0: # %entry
-; BDVER1-NEXT:    shldq $10, %rdi, {{.*}}(%rip)
-; BDVER1-NEXT:    retq
 entry:
   %b = load i64, i64* @x
   %shl = shl i64 %b, 10
@@ -412,6 +413,15 @@ define void @lshift_mem_b(i64 %b) nounwind readnone {
 ; GENERIC-NEXT:    movq %rax, {{.*}}(%rip) # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER12-LABEL: lshift_mem_b:
+; BDVER12:       # %bb.0: # %entry
+; BDVER12-NEXT:    movq {{.*}}(%rip), %rax # sched: [5:0.50]
+; BDVER12-NEXT:    shlq $10, %rdi # sched: [1:0.50]
+; BDVER12-NEXT:    shrq $54, %rax # sched: [1:0.50]
+; BDVER12-NEXT:    orq %rdi, %rax # sched: [1:0.33]
+; BDVER12-NEXT:    movq %rax, {{.*}}(%rip) # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: lshift_mem_b:
 ; BTVER2:       # %bb.0: # %entry
 ; BTVER2-NEXT:    movq {{.*}}(%rip), %rax # sched: [5:1.00]
@@ -420,15 +430,6 @@ define void @lshift_mem_b(i64 %b) nounwind readnone {
 ; BTVER2-NEXT:    orq %rdi, %rax # sched: [1:0.50]
 ; BTVER2-NEXT:    movq %rax, {{.*}}(%rip) # sched: [1:1.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; BDVER1-LABEL: lshift_mem_b:
-; BDVER1:       # %bb.0: # %entry
-; BDVER1-NEXT:    movq {{.*}}(%rip), %rax
-; BDVER1-NEXT:    shlq $10, %rdi
-; BDVER1-NEXT:    shrq $54, %rax
-; BDVER1-NEXT:    orq %rdi, %rax
-; BDVER1-NEXT:    movq %rax, {{.*}}(%rip)
-; BDVER1-NEXT:    retq
 entry:
   %a = load i64, i64* @x
   %shl = shl i64 %b, 10
@@ -446,19 +447,19 @@ define void @lshift_mem_b_optsize(i64 %b) nounwind readnone optsize {
 ; GENERIC-NEXT:    movq %rax, {{.*}}(%rip) # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER12-LABEL: lshift_mem_b_optsize:
+; BDVER12:       # %bb.0: # %entry
+; BDVER12-NEXT:    movq {{.*}}(%rip), %rax # sched: [5:0.50]
+; BDVER12-NEXT:    shrdq $54, %rdi, %rax # sched: [2:0.67]
+; BDVER12-NEXT:    movq %rax, {{.*}}(%rip) # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: lshift_mem_b_optsize:
 ; BTVER2:       # %bb.0: # %entry
 ; BTVER2-NEXT:    movq {{.*}}(%rip), %rax # sched: [5:1.00]
 ; BTVER2-NEXT:    shrdq $54, %rdi, %rax # sched: [3:3.00]
 ; BTVER2-NEXT:    movq %rax, {{.*}}(%rip) # sched: [1:1.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; BDVER1-LABEL: lshift_mem_b_optsize:
-; BDVER1:       # %bb.0: # %entry
-; BDVER1-NEXT:    movq {{.*}}(%rip), %rax
-; BDVER1-NEXT:    shrdq $54, %rdi, %rax
-; BDVER1-NEXT:    movq %rax, {{.*}}(%rip)
-; BDVER1-NEXT:    retq
 entry:
   %a = load i64, i64* @x
   %shl = shl i64 %b, 10
diff --git a/test/CodeGen/X86/schedule-x86_32.ll b/test/CodeGen/X86/schedule-x86_32.ll
index 873d6a679b0..6aff5a34a41 100644
--- a/test/CodeGen/X86/schedule-x86_32.ll
+++ b/test/CodeGen/X86/schedule-x86_32.ll
@@ -8,6 +8,7 @@
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=x86-64 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
 
@@ -76,6 +77,14 @@ define i8 @test_aaa(i8 %a0) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_aaa:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    aaa # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_aaa:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:1.00]
@@ -168,6 +177,15 @@ define void @test_aad(i16 %a0) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_aad:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    aad # sched: [100:0.33]
+; BDVER2-NEXT:    aad $16 # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_aad:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [4:1.00]
@@ -262,6 +280,15 @@ define void @test_aam(i8 %a0) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_aam:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    aam # sched: [100:0.33]
+; BDVER2-NEXT:    aam $16 # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_aam:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:1.00]
@@ -348,6 +375,14 @@ define i8 @test_aas(i8 %a0) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_aas:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    aas # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_aas:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:1.00]
@@ -440,6 +475,15 @@ define void @test_arpl(i16 %a0, i16 *%a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_arpl:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    arpl %ax, (%ecx) # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_arpl:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [4:1.00]
@@ -598,6 +642,23 @@ define void @test_bound(i16 %a0, i16 *%a1, i32 %a2, i32 *%a3) optsize {
 ; SKX-NEXT:    .cfi_def_cfa_offset 4
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_bound:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pushl %esi # sched: [5:1.00]
+; BDVER2-NEXT:    .cfi_def_cfa_offset 8
+; BDVER2-NEXT:    .cfi_offset %esi, -8
+; BDVER2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %esi # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    bound %ax, (%esi) # sched: [100:0.33]
+; BDVER2-NEXT:    bound %ecx, (%edx) # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    popl %esi # sched: [6:0.50]
+; BDVER2-NEXT:    .cfi_def_cfa_offset 4
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_bound:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pushl %esi # sched: [1:1.00]
@@ -702,6 +763,14 @@ define i8 @test_daa(i8 %a0) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_daa:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    daa # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_daa:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:1.00]
@@ -786,6 +855,14 @@ define i8 @test_das(i8 %a0) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_das:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    das # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_das:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:1.00]
@@ -886,6 +963,16 @@ define void @test_dec16(i16 %a0, i16* %a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_dec16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    decw %ax # sched: [1:0.33]
+; BDVER2-NEXT:    decw (%ecx) # sched: [7:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_dec16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [4:1.00]
@@ -989,6 +1076,16 @@ define void @test_dec32(i32 %a0, i32* %a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_dec32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    decl %eax # sched: [1:0.33]
+; BDVER2-NEXT:    decl (%ecx) # sched: [7:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_dec32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -1093,6 +1190,16 @@ define void @test_inc16(i16 %a0, i16* %a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_inc16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    incw %ax # sched: [1:0.33]
+; BDVER2-NEXT:    incw (%ecx) # sched: [7:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_inc16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [4:1.00]
@@ -1196,6 +1303,16 @@ define void @test_inc32(i32 %a0, i32* %a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_inc32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    incl %eax # sched: [1:0.33]
+; BDVER2-NEXT:    incl (%ecx) # sched: [7:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_inc32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -1276,6 +1393,13 @@ define void @test_into() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_into:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    into # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_into:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -1368,6 +1492,15 @@ define void @test_jcxz_jecxz() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_jcxz_jecxz:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:  JXTGT:
+; BDVER2-NEXT:    jcxz JXTGT # sched: [2:1.00]
+; BDVER2-NEXT:    jecxz JXTGT # sched: [2:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_jcxz_jecxz:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -1448,6 +1581,13 @@ define void @test_leave() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_leave:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    leave # sched: [7:0.67]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_leave:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -1604,6 +1744,23 @@ define void @test_pop_push() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_pop_push:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    popl %ds # sched: [100:0.33]
+; BDVER2-NEXT:    popl %es # sched: [100:0.33]
+; BDVER2-NEXT:    popl %ss # sched: [100:0.33]
+; BDVER2-NEXT:    popl %fs # sched: [100:0.33]
+; BDVER2-NEXT:    popl %gs # sched: [100:0.33]
+; BDVER2-NEXT:    pushl %cs # sched: [100:0.33]
+; BDVER2-NEXT:    pushl %ds # sched: [100:0.33]
+; BDVER2-NEXT:    pushl %es # sched: [100:0.33]
+; BDVER2-NEXT:    pushl %ss # sched: [100:0.33]
+; BDVER2-NEXT:    pushl %fs # sched: [100:0.33]
+; BDVER2-NEXT:    pushl %gs # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_pop_push:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -1760,6 +1917,21 @@ define i16 @test_pop_push_16(i16 %a0, i16 *%a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_pop_push_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    popw %ax # sched: [6:0.50]
+; BDVER2-NEXT:    popw (%ecx) # sched: [6:0.50]
+; BDVER2-NEXT:    pushw %ax # sched: [5:1.00]
+; BDVER2-NEXT:    pushw (%ecx) # sched: [5:1.00]
+; BDVER2-NEXT:    pushw $4095 # imm = 0xFFF
+; BDVER2-NEXT:    # sched: [1:1.00]
+; BDVER2-NEXT:    pushw $7 # sched: [1:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_pop_push_16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [4:1.00]
@@ -1912,6 +2084,21 @@ define i32 @test_pop_push_32(i32 %a0, i32 *%a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_pop_push_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    popl %eax # sched: [6:0.50]
+; BDVER2-NEXT:    popl (%ecx) # sched: [6:0.50]
+; BDVER2-NEXT:    pushl %eax # sched: [5:1.00]
+; BDVER2-NEXT:    pushl (%ecx) # sched: [5:1.00]
+; BDVER2-NEXT:    pushl $4095 # imm = 0xFFF
+; BDVER2-NEXT:    # sched: [1:1.00]
+; BDVER2-NEXT:    pushl $7 # sched: [1:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_pop_push_32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -2026,6 +2213,16 @@ define void @test_popa_popf_pusha_pushf() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_popa_popf_pusha_pushf:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    popal # sched: [5:0.50]
+; BDVER2-NEXT:    popfl # sched: [5:0.50]
+; BDVER2-NEXT:    pushal # sched: [1:1.00]
+; BDVER2-NEXT:    pushfl # sched: [1:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_popa_popf_pusha_pushf:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -2144,6 +2341,18 @@ define void @test_ret() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_ret:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl $4095 # imm = 0xFFF
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    lretl # sched: [6:1.00]
+; BDVER2-NEXT:    lretl $4095 # imm = 0xFFF
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_ret:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -2228,6 +2437,13 @@ define i8 @test_salc() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_salc:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    salc # sched: [1:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_salc:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -2345,6 +2561,18 @@ define void @test_xchg_32(i32 %a0, i32 %a1, i32 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_xchg_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    xchgl %eax, %eax # sched: [2:1.00]
+; BDVER2-NEXT:    xchgl %ecx, %eax # sched: [2:1.00]
+; BDVER2-NEXT:    xchgl %eax, (%edx) # sched: [6:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_xchg_32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
diff --git a/test/CodeGen/X86/schedule-x86_64.ll b/test/CodeGen/X86/schedule-x86_64.ll
index e903ff51053..4cd50969ca9 100644
--- a/test/CodeGen/X86/schedule-x86_64.ll
+++ b/test/CodeGen/X86/schedule-x86_64.ll
@@ -8,6 +8,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
 
@@ -108,6 +109,18 @@ define void @test_adc_8(i8 %a0, i8* %a1, i8 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_adc_8:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    adcb $7, %al # sched: [2:0.67]
+; BDVER2-NEXT:    adcb $7, %dil # sched: [2:0.67]
+; BDVER2-NEXT:    adcb $7, (%rsi) # sched: [9:1.00]
+; BDVER2-NEXT:    adcb %dl, %dil # sched: [2:0.67]
+; BDVER2-NEXT:    adcb %dil, (%rsi) # sched: [9:1.00]
+; BDVER2-NEXT:    adcb (%rsi), %dil # sched: [7:0.67]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_adc_8:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -271,6 +284,23 @@ define void @test_adc_16(i16 %a0, i16* %a1, i16 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_adc_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    adcw $511, %ax # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [2:0.67]
+; BDVER2-NEXT:    adcw $511, %di # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [2:0.67]
+; BDVER2-NEXT:    adcw $511, (%rsi) # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [9:1.00]
+; BDVER2-NEXT:    adcw $7, %di # sched: [2:0.67]
+; BDVER2-NEXT:    adcw $7, (%rsi) # sched: [9:1.00]
+; BDVER2-NEXT:    adcw %dx, %di # sched: [2:0.67]
+; BDVER2-NEXT:    adcw %di, (%rsi) # sched: [9:1.00]
+; BDVER2-NEXT:    adcw (%rsi), %di # sched: [7:0.67]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_adc_16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -444,6 +474,23 @@ define void @test_adc_32(i32 %a0, i32* %a1, i32 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_adc_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    adcl $665536, %eax # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [2:0.67]
+; BDVER2-NEXT:    adcl $665536, %edi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [2:0.67]
+; BDVER2-NEXT:    adcl $665536, (%rsi) # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [9:1.00]
+; BDVER2-NEXT:    adcl $7, %edi # sched: [2:0.67]
+; BDVER2-NEXT:    adcl $7, (%rsi) # sched: [9:1.00]
+; BDVER2-NEXT:    adcl %edx, %edi # sched: [2:0.67]
+; BDVER2-NEXT:    adcl %edi, (%rsi) # sched: [9:1.00]
+; BDVER2-NEXT:    adcl (%rsi), %edi # sched: [7:0.67]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_adc_32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -617,6 +664,23 @@ define void @test_adc_64(i64 %a0, i64* %a1, i64 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_adc_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    adcq $665536, %rax # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [2:0.67]
+; BDVER2-NEXT:    adcq $665536, %rdi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [2:0.67]
+; BDVER2-NEXT:    adcq $665536, (%rsi) # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [9:1.00]
+; BDVER2-NEXT:    adcq $7, %rdi # sched: [2:0.67]
+; BDVER2-NEXT:    adcq $7, (%rsi) # sched: [9:1.00]
+; BDVER2-NEXT:    adcq %rdx, %rdi # sched: [2:0.67]
+; BDVER2-NEXT:    adcq %rdi, (%rsi) # sched: [9:1.00]
+; BDVER2-NEXT:    adcq (%rsi), %rdi # sched: [7:0.67]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_adc_64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -751,6 +815,18 @@ define void @test_add_8(i8 %a0, i8* %a1, i8 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_add_8:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    addb $7, %al # sched: [1:0.33]
+; BDVER2-NEXT:    addb $7, %dil # sched: [1:0.33]
+; BDVER2-NEXT:    addb $7, (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    addb %dl, %dil # sched: [1:0.33]
+; BDVER2-NEXT:    addb %dil, (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    addb (%rsi), %dil # sched: [6:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_add_8:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -914,6 +990,23 @@ define void @test_add_16(i16 %a0, i16* %a1, i16 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_add_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    addw $511, %ax # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    addw $511, %di # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    addw $511, (%rsi) # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [7:1.00]
+; BDVER2-NEXT:    addw $7, %di # sched: [1:0.33]
+; BDVER2-NEXT:    addw $7, (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    addw %dx, %di # sched: [1:0.33]
+; BDVER2-NEXT:    addw %di, (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    addw (%rsi), %di # sched: [6:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_add_16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -1087,6 +1180,23 @@ define void @test_add_32(i32 %a0, i32* %a1, i32 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_add_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    addl $665536, %eax # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    addl $665536, %edi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    addl $665536, (%rsi) # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [7:1.00]
+; BDVER2-NEXT:    addl $7, %edi # sched: [1:0.33]
+; BDVER2-NEXT:    addl $7, (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    addl %edx, %edi # sched: [1:0.33]
+; BDVER2-NEXT:    addl %edi, (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    addl (%rsi), %edi # sched: [6:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_add_32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -1260,6 +1370,23 @@ define void @test_add_64(i64 %a0, i64* %a1, i64 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_add_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    addq $665536, %rax # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    addq $665536, %rdi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    addq $665536, (%rsi) # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [7:1.00]
+; BDVER2-NEXT:    addq $7, %rdi # sched: [1:0.33]
+; BDVER2-NEXT:    addq $7, (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    addq %rdx, %rdi # sched: [1:0.33]
+; BDVER2-NEXT:    addq %rdi, (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    addq (%rsi), %rdi # sched: [6:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_add_64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -1394,6 +1521,18 @@ define void @test_and_8(i8 %a0, i8* %a1, i8 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_and_8:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    andb $7, %al # sched: [1:0.33]
+; BDVER2-NEXT:    andb $7, %dil # sched: [1:0.33]
+; BDVER2-NEXT:    andb $7, (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    andb %dl, %dil # sched: [1:0.33]
+; BDVER2-NEXT:    andb %dil, (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    andb (%rsi), %dil # sched: [6:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_and_8:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -1557,6 +1696,23 @@ define void @test_and_16(i16 %a0, i16* %a1, i16 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_and_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    andw $511, %ax # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    andw $511, %di # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    andw $511, (%rsi) # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [7:1.00]
+; BDVER2-NEXT:    andw $7, %di # sched: [1:0.33]
+; BDVER2-NEXT:    andw $7, (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    andw %dx, %di # sched: [1:0.33]
+; BDVER2-NEXT:    andw %di, (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    andw (%rsi), %di # sched: [6:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_and_16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -1730,6 +1886,23 @@ define void @test_and_32(i32 %a0, i32* %a1, i32 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_and_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    andl $665536, %eax # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    andl $665536, %edi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    andl $665536, (%rsi) # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [7:1.00]
+; BDVER2-NEXT:    andl $7, %edi # sched: [1:0.33]
+; BDVER2-NEXT:    andl $7, (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    andl %edx, %edi # sched: [1:0.33]
+; BDVER2-NEXT:    andl %edi, (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    andl (%rsi), %edi # sched: [6:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_and_32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -1903,6 +2076,23 @@ define void @test_and_64(i64 %a0, i64* %a1, i64 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_and_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    andq $665536, %rax # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    andq $665536, %rdi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    andq $665536, (%rsi) # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [7:1.00]
+; BDVER2-NEXT:    andq $7, %rdi # sched: [1:0.33]
+; BDVER2-NEXT:    andq $7, (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    andq %rdx, %rdi # sched: [1:0.33]
+; BDVER2-NEXT:    andq %rdi, (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    andq (%rsi), %rdi # sched: [6:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_and_64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -2021,6 +2211,16 @@ define i16 @test_bsf16(i16 %a0, i16* %a1) optsize {
 ; SKX-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_bsf16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    bsfw %di, %ax # sched: [3:1.00]
+; BDVER2-NEXT:    bsfw (%rsi), %cx # sched: [8:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    orl %ecx, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    # kill: def $ax killed $ax killed $eax
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_bsf16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -2119,6 +2319,15 @@ define i32 @test_bsf32(i32 %a0, i32* %a1) optsize {
 ; SKX-NEXT:    orl %ecx, %eax # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_bsf32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    bsfl %edi, %eax # sched: [3:1.00]
+; BDVER2-NEXT:    bsfl (%rsi), %ecx # sched: [8:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    orl %ecx, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_bsf32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -2215,6 +2424,15 @@ define i64 @test_bsf64(i64 %a0, i64* %a1) optsize {
 ; SKX-NEXT:    orq %rcx, %rax # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_bsf64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    bsfq %rdi, %rax # sched: [3:1.00]
+; BDVER2-NEXT:    bsfq (%rsi), %rcx # sched: [8:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    orq %rcx, %rax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_bsf64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -2320,6 +2538,16 @@ define i16 @test_bsr16(i16 %a0, i16* %a1) optsize {
 ; SKX-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_bsr16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    bsrw %di, %ax # sched: [3:1.00]
+; BDVER2-NEXT:    bsrw (%rsi), %cx # sched: [8:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    orl %ecx, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    # kill: def $ax killed $ax killed $eax
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_bsr16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -2418,6 +2646,15 @@ define i32 @test_bsr32(i32 %a0, i32* %a1) optsize {
 ; SKX-NEXT:    orl %ecx, %eax # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_bsr32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    bsrl %edi, %eax # sched: [3:1.00]
+; BDVER2-NEXT:    bsrl (%rsi), %ecx # sched: [8:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    orl %ecx, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_bsr32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -2514,6 +2751,15 @@ define i64 @test_bsr64(i64 %a0, i64* %a1) optsize {
 ; SKX-NEXT:    orq %rcx, %rax # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_bsr64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    bsrq %rdi, %rax # sched: [3:1.00]
+; BDVER2-NEXT:    bsrq (%rsi), %rcx # sched: [8:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    orq %rcx, %rax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_bsr64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -2587,6 +2833,12 @@ define i32 @test_bswap32(i32 %a0) optsize {
 ; SKX-NEXT:    bswapl %eax # sched: [1:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_bswap32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl %edi, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    bswapl %eax # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_bswap32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl %edi, %eax # sched: [1:0.50]
@@ -2650,6 +2902,12 @@ define i64 @test_bswap64(i64 %a0) optsize {
 ; SKX-NEXT:    bswapq %rax # sched: [2:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_bswap64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movq %rdi, %rax # sched: [1:0.33]
+; BDVER2-NEXT:    bswapq %rax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_bswap64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movq %rdi, %rax # sched: [1:0.50]
@@ -2842,6 +3100,28 @@ define void @test_bt_btc_btr_bts_16(i16 %a0, i16 %a1, i16 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_bt_btc_btr_bts_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    btw %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    btcw %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    btrw %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    btsw %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    btw %si, (%rdx) # sched: [9:1.00]
+; BDVER2-NEXT:    btcw %si, (%rdx) # sched: [9:1.00]
+; BDVER2-NEXT:    btrw %si, (%rdx) # sched: [9:1.00]
+; BDVER2-NEXT:    btsw %si, (%rdx) # sched: [9:1.00]
+; BDVER2-NEXT:    btw $7, %di # sched: [1:0.50]
+; BDVER2-NEXT:    btcw $7, %di # sched: [1:0.50]
+; BDVER2-NEXT:    btrw $7, %di # sched: [1:0.50]
+; BDVER2-NEXT:    btsw $7, %di # sched: [1:0.50]
+; BDVER2-NEXT:    btw $7, (%rdx) # sched: [6:0.50]
+; BDVER2-NEXT:    btcw $7, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    btrw $7, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    btsw $7, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_bt_btc_btr_bts_16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -3065,6 +3345,28 @@ define void @test_bt_btc_btr_bts_32(i32 %a0, i32 %a1, i32 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_bt_btc_btr_bts_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    btl %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    btcl %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    btrl %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    btsl %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    btl %esi, (%rdx) # sched: [9:1.00]
+; BDVER2-NEXT:    btcl %esi, (%rdx) # sched: [9:1.00]
+; BDVER2-NEXT:    btrl %esi, (%rdx) # sched: [9:1.00]
+; BDVER2-NEXT:    btsl %esi, (%rdx) # sched: [9:1.00]
+; BDVER2-NEXT:    btl $7, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    btcl $7, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    btrl $7, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    btsl $7, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    btl $7, (%rdx) # sched: [6:0.50]
+; BDVER2-NEXT:    btcl $7, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    btrl $7, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    btsl $7, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_bt_btc_btr_bts_32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -3288,6 +3590,28 @@ define void @test_bt_btc_btr_bts_64(i64 %a0, i64 %a1, i64 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_bt_btc_btr_bts_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    btq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    btcq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    btrq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    btsq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    btq %rsi, (%rdx) # sched: [9:1.00]
+; BDVER2-NEXT:    btcq %rsi, (%rdx) # sched: [9:1.00]
+; BDVER2-NEXT:    btrq %rsi, (%rdx) # sched: [9:1.00]
+; BDVER2-NEXT:    btsq %rsi, (%rdx) # sched: [9:1.00]
+; BDVER2-NEXT:    btq $7, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    btcq $7, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    btrq $7, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    btsq $7, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    btq $7, (%rdx) # sched: [6:0.50]
+; BDVER2-NEXT:    btcq $7, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    btrq $7, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    btsq $7, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_bt_btc_btr_bts_64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -3434,6 +3758,18 @@ define void @test_cbw_cdq_cdqe_cqo_cwd_cwde() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cbw_cdq_cdqe_cqo_cwd_cwde:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    cbtw # sched: [1:0.33]
+; BDVER2-NEXT:    cltd # sched: [1:0.50]
+; BDVER2-NEXT:    cltq # sched: [1:0.33]
+; BDVER2-NEXT:    cqto # sched: [1:0.50]
+; BDVER2-NEXT:    cwtd # sched: [2:1.00]
+; BDVER2-NEXT:    cwtl # sched: [1:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_cbw_cdq_cdqe_cqo_cwd_cwde:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -3534,6 +3870,15 @@ define void @test_clc_cld_cmc() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_clc_cld_cmc:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    clc # sched: [1:0.25]
+; BDVER2-NEXT:    cld # sched: [1:0.33]
+; BDVER2-NEXT:    cmc # sched: [1:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_clc_cld_cmc:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -3652,6 +3997,18 @@ define void @test_cmp_8(i8 %a0, i8* %a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cmp_8:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    cmpb $7, %al # sched: [1:0.33]
+; BDVER2-NEXT:    cmpb $7, %dil # sched: [1:0.33]
+; BDVER2-NEXT:    cmpb $7, (%rsi) # sched: [6:0.50]
+; BDVER2-NEXT:    cmpb %dil, %dil # sched: [1:0.33]
+; BDVER2-NEXT:    cmpb %dil, (%rsi) # sched: [6:0.50]
+; BDVER2-NEXT:    cmpb (%rsi), %dil # sched: [6:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_cmp_8:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -3815,6 +4172,23 @@ define void @test_cmp_16(i16 %a0, i16* %a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cmp_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    cmpw $511, %ax # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    cmpw $511, %di # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    cmpw $511, (%rsi) # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [6:0.50]
+; BDVER2-NEXT:    cmpw $7, %di # sched: [1:0.33]
+; BDVER2-NEXT:    cmpw $7, (%rsi) # sched: [6:0.50]
+; BDVER2-NEXT:    cmpw %di, %di # sched: [1:0.33]
+; BDVER2-NEXT:    cmpw %di, (%rsi) # sched: [6:0.50]
+; BDVER2-NEXT:    cmpw (%rsi), %di # sched: [6:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_cmp_16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -3988,6 +4362,23 @@ define void @test_cmp_32(i32 %a0, i32* %a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cmp_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    cmpl $665536, %eax # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    cmpl $665536, %edi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    cmpl $665536, (%rsi) # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [6:0.50]
+; BDVER2-NEXT:    cmpl $7, %edi # sched: [1:0.33]
+; BDVER2-NEXT:    cmpl $7, (%rsi) # sched: [6:0.50]
+; BDVER2-NEXT:    cmpl %edi, %edi # sched: [1:0.33]
+; BDVER2-NEXT:    cmpl %edi, (%rsi) # sched: [6:0.50]
+; BDVER2-NEXT:    cmpl (%rsi), %edi # sched: [6:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_cmp_32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -4161,6 +4552,23 @@ define void @test_cmp_64(i64 %a0, i64* %a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cmp_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    cmpq $665536, %rax # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    cmpq $665536, %rdi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    cmpq $665536, (%rsi) # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [6:0.50]
+; BDVER2-NEXT:    cmpq $7, %rdi # sched: [1:0.33]
+; BDVER2-NEXT:    cmpq $7, (%rsi) # sched: [6:0.50]
+; BDVER2-NEXT:    cmpq %rdi, %rdi # sched: [1:0.33]
+; BDVER2-NEXT:    cmpq %rdi, (%rsi) # sched: [6:0.50]
+; BDVER2-NEXT:    cmpq (%rsi), %rdi # sched: [6:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_cmp_64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -4279,6 +4687,16 @@ define void @test_cmps() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cmps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    cmpsb %es:(%rdi), (%rsi) # sched: [8:1.00]
+; BDVER2-NEXT:    cmpsw %es:(%rdi), (%rsi) # sched: [8:1.00]
+; BDVER2-NEXT:    cmpsl %es:(%rdi), (%rsi) # sched: [8:1.00]
+; BDVER2-NEXT:    cmpsq %es:(%rdi), (%rsi) # sched: [8:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_cmps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -4367,6 +4785,14 @@ define void @test_cmpxchg_8(i8 %a0, i8 %a1, i8 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cmpxchg_8:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    cmpxchgb %dil, %sil # sched: [5:1.33]
+; BDVER2-NEXT:    cmpxchgb %dil, (%rdx) # sched: [8:2.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_cmpxchg_8:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -4450,6 +4876,14 @@ define void @test_cmpxchg_16(i16 %a0, i16 %a1, i16 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cmpxchg_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    cmpxchgw %di, %si # sched: [5:1.33]
+; BDVER2-NEXT:    cmpxchgw %di, (%rdx) # sched: [8:2.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_cmpxchg_16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -4533,6 +4967,14 @@ define void @test_cmpxchg_32(i32 %a0, i32 %a1, i32 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cmpxchg_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    cmpxchgl %edi, %esi # sched: [5:1.33]
+; BDVER2-NEXT:    cmpxchgl %edi, (%rdx) # sched: [8:2.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_cmpxchg_32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -4616,6 +5058,14 @@ define void @test_cmpxchg_64(i64 %a0, i64 %a1, i64 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cmpxchg_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    cmpxchgq %rdi, %rsi # sched: [5:1.33]
+; BDVER2-NEXT:    cmpxchgq %rdi, (%rdx) # sched: [8:2.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_cmpxchg_64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -4699,6 +5149,14 @@ define void @test_cmpxchg8b_cmpxchg16b(i8 *%a0) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cmpxchg8b_cmpxchg16b:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    cmpxchg8b (%rdi) # sched: [6:1.00]
+; BDVER2-NEXT:    cmpxchg16b (%rdi) # sched: [6:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_cmpxchg8b_cmpxchg16b:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -4775,6 +5233,13 @@ define void @test_cpuid() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cpuid:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    cpuid # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_cpuid:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -4857,6 +5322,14 @@ define void @test_dec8(i8 %a0, i8* %a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_dec8:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    decb %dil # sched: [1:0.33]
+; BDVER2-NEXT:    decb (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_dec8:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -4940,6 +5413,14 @@ define void @test_dec16(i16 %a0, i16* %a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_dec16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    decw %di # sched: [1:0.33]
+; BDVER2-NEXT:    decw (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_dec16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -5023,6 +5504,14 @@ define void @test_dec32(i32 %a0, i32* %a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_dec32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    decl %edi # sched: [1:0.33]
+; BDVER2-NEXT:    decl (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_dec32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -5106,6 +5595,14 @@ define void @test_dec64(i64 %a0, i64* %a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_dec64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    decq %rdi # sched: [1:0.33]
+; BDVER2-NEXT:    decq (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_dec64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -5254,6 +5751,22 @@ define void @test_div(i8 %a0, i16 %a1, i32 %a2, i64 %a3, i8 *%p0, i16 *%p1, i32
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_div:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
+; BDVER2-NEXT:    movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    divb %dil # sched: [25:10.00]
+; BDVER2-NEXT:    divb (%r8) # sched: [30:10.00]
+; BDVER2-NEXT:    divw %si # sched: [25:10.00]
+; BDVER2-NEXT:    divw (%r9) # sched: [30:10.00]
+; BDVER2-NEXT:    divl %edx # sched: [25:10.00]
+; BDVER2-NEXT:    divl (%rax) # sched: [30:10.00]
+; BDVER2-NEXT:    divq %rcx # sched: [25:10.00]
+; BDVER2-NEXT:    divq (%r10) # sched: [30:10.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_div:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movq {{[0-9]+}}(%rsp), %r10 # sched: [5:1.00]
@@ -5354,6 +5867,14 @@ define void @test_enter() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_enter:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    enter $7, $4095 # imm = 0xFFF
+; BDVER2-NEXT:    # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_enter:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -5502,6 +6023,22 @@ define void @test_idiv(i8 %a0, i16 %a1, i32 %a2, i64 %a3, i8 *%p0, i16 *%p1, i32
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_idiv:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
+; BDVER2-NEXT:    movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    idivb %dil # sched: [25:10.00]
+; BDVER2-NEXT:    idivb (%r8) # sched: [30:10.00]
+; BDVER2-NEXT:    idivw %si # sched: [25:10.00]
+; BDVER2-NEXT:    idivw (%r9) # sched: [30:10.00]
+; BDVER2-NEXT:    idivl %edx # sched: [25:10.00]
+; BDVER2-NEXT:    idivl (%rax) # sched: [30:10.00]
+; BDVER2-NEXT:    idivq %rcx # sched: [25:10.00]
+; BDVER2-NEXT:    idivq (%r10) # sched: [30:10.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_idiv:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movq {{[0-9]+}}(%rsp), %r10 # sched: [5:1.00]
@@ -5602,6 +6139,14 @@ define void @test_imul_8(i8 %a0, i8* %a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_imul_8:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    imulb %dil # sched: [3:1.00]
+; BDVER2-NEXT:    imulb (%rsi) # sched: [8:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_imul_8:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -5749,6 +6294,22 @@ define void @test_imul_16(i16 %a0, i16* %a1, i16 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_imul_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    imulw %di # sched: [4:1.33]
+; BDVER2-NEXT:    imulw (%rsi) # sched: [9:1.33]
+; BDVER2-NEXT:    imulw %dx, %di # sched: [3:1.00]
+; BDVER2-NEXT:    imulw (%rsi), %di # sched: [8:1.00]
+; BDVER2-NEXT:    imulw $511, %di, %di # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [4:1.00]
+; BDVER2-NEXT:    imulw $511, (%rsi), %di # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [8:1.00]
+; BDVER2-NEXT:    imulw $7, %di, %di # sched: [4:1.00]
+; BDVER2-NEXT:    imulw $7, (%rsi), %di # sched: [8:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_imul_16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -5912,6 +6473,22 @@ define void @test_imul_32(i32 %a0, i32* %a1, i32 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_imul_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    imull %edi # sched: [4:1.00]
+; BDVER2-NEXT:    imull (%rsi) # sched: [9:1.00]
+; BDVER2-NEXT:    imull %edx, %edi # sched: [3:1.00]
+; BDVER2-NEXT:    imull (%rsi), %edi # sched: [8:1.00]
+; BDVER2-NEXT:    imull $665536, %edi, %edi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [3:1.00]
+; BDVER2-NEXT:    imull $665536, (%rsi), %edi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [8:1.00]
+; BDVER2-NEXT:    imull $7, %edi, %edi # sched: [3:1.00]
+; BDVER2-NEXT:    imull $7, (%rsi), %edi # sched: [8:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_imul_32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -6075,6 +6652,22 @@ define void @test_imul_64(i64 %a0, i64* %a1, i64 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_imul_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    imulq %rdi # sched: [4:1.00]
+; BDVER2-NEXT:    imulq (%rsi) # sched: [9:1.00]
+; BDVER2-NEXT:    imulq %rdx, %rdi # sched: [3:1.00]
+; BDVER2-NEXT:    imulq (%rsi), %rdi # sched: [8:1.00]
+; BDVER2-NEXT:    imulq $665536, %rdi, %rdi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [3:1.00]
+; BDVER2-NEXT:    imulq $665536, (%rsi), %rdi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [8:1.00]
+; BDVER2-NEXT:    imulq $7, %rdi, %rdi # sched: [3:1.00]
+; BDVER2-NEXT:    imulq $7, (%rsi), %rdi # sched: [8:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_imul_64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -6207,6 +6800,18 @@ define void @test_in() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_in:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    inb $7, %al # sched: [100:0.33]
+; BDVER2-NEXT:    inw $7, %ax # sched: [100:0.33]
+; BDVER2-NEXT:    inl $7, %eax # sched: [100:0.33]
+; BDVER2-NEXT:    inb %dx, %al # sched: [100:0.33]
+; BDVER2-NEXT:    inw %dx, %ax # sched: [100:0.33]
+; BDVER2-NEXT:    inl %dx, %eax # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_in:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -6299,6 +6904,14 @@ define void @test_inc8(i8 %a0, i8* %a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_inc8:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    incb %dil # sched: [1:0.33]
+; BDVER2-NEXT:    incb (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_inc8:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -6382,6 +6995,14 @@ define void @test_inc16(i16 %a0, i16* %a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_inc16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    incw %di # sched: [1:0.33]
+; BDVER2-NEXT:    incw (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_inc16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -6465,6 +7086,14 @@ define void @test_inc32(i32 %a0, i32* %a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_inc32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    incl %edi # sched: [1:0.33]
+; BDVER2-NEXT:    incl (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_inc32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -6548,6 +7177,14 @@ define void @test_inc64(i64 %a0, i64* %a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_inc64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    incq %rdi # sched: [1:0.33]
+; BDVER2-NEXT:    incq (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_inc64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -6640,6 +7277,15 @@ define void @test_ins() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_ins:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    insb %dx, %es:(%rdi) # sched: [100:0.33]
+; BDVER2-NEXT:    insw %dx, %es:(%rdi) # sched: [100:0.33]
+; BDVER2-NEXT:    insl %dx, %es:(%rdi) # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_ins:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -6718,6 +7364,13 @@ define void @test_int() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_int:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    int $7 # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_int:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -6800,6 +7453,14 @@ define void @test_invlpg_invlpga(i8 *%a0) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_invlpg_invlpga:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    invlpg (%rdi) # sched: [100:0.33]
+; BDVER2-NEXT:    invlpga %rax, %ecx # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_invlpg_invlpga:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -7116,6 +7777,43 @@ define void @test_jcc() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_jcc:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:  JCCTGT:
+; BDVER2-NEXT:    jo JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jno JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jb JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jb JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jb JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jae JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jae JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jae JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    je JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    je JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jne JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jne JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jbe JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jbe JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    ja JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    ja JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    js JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jns JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jp JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jp JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jnp JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jnp JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jl JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jl JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jge JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jge JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jle JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jle JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jg JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jg JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_jcc:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -7266,6 +7964,15 @@ define void @test_jecxz_jrcxz() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_jecxz_jrcxz:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:  JXTGT:
+; BDVER2-NEXT:    jecxz JXTGT # sched: [2:1.00]
+; BDVER2-NEXT:    jrcxz JXTGT # sched: [2:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_jecxz_jrcxz:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -7354,6 +8061,14 @@ define void @test_lahf_sahf() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lahf_sahf:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    lahf # sched: [1:0.50]
+; BDVER2-NEXT:    sahf # sched: [1:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_lahf_sahf:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -7438,6 +8153,13 @@ define void @test_leave() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_leave:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    leave # sched: [7:0.67]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_leave:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -7536,6 +8258,16 @@ define void @test_lods() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lods:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    lodsb (%rsi), %al # sched: [7:0.67]
+; BDVER2-NEXT:    lodsw (%rsi), %ax # sched: [7:0.67]
+; BDVER2-NEXT:    lodsl (%rsi), %eax # sched: [6:0.50]
+; BDVER2-NEXT:    lodsq (%rsi), %rax # sched: [6:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_lods:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -7640,6 +8372,16 @@ define void @test_loop() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_loop:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:  LTGT:
+; BDVER2-NEXT:    loop LTGT # sched: [1:1.00]
+; BDVER2-NEXT:    loope LTGT # sched: [1:1.00]
+; BDVER2-NEXT:    loopne LTGT # sched: [1:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_loop:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -7730,6 +8472,14 @@ define void @test_movnti(i32 %a0, i32 *%a1, i64 %a2, i64 *%a3) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_movnti:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    movntil %edi, (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    movntiq %rdx, (%rcx) # sched: [1:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_movnti:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -7830,6 +8580,16 @@ define void @test_movs() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_movs:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    movsb (%rsi), %es:(%rdi) # sched: [8:1.00]
+; BDVER2-NEXT:    movsw (%rsi), %es:(%rdi) # sched: [8:1.00]
+; BDVER2-NEXT:    movsl (%rsi), %es:(%rdi) # sched: [8:1.00]
+; BDVER2-NEXT:    movsq (%rsi), %es:(%rdi) # sched: [8:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_movs:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -7929,6 +8689,15 @@ define i64 @test_movslq(i32 %a0, i32 *%a1) optsize {
 ; SKX-NEXT:    orq %rcx, %rax # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_movslq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    movslq %edi, %rax # sched: [1:0.33]
+; BDVER2-NEXT:    movslq (%rsi), %rcx # sched: [5:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    orq %rcx, %rax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_movslq:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -8082,6 +8851,22 @@ define void @test_mul(i8 %a0, i16 %a1, i32 %a2, i64 %a3, i8 *%p0, i16 *%p1, i32
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_mul:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
+; BDVER2-NEXT:    movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    mulb %dil # sched: [3:1.00]
+; BDVER2-NEXT:    mulb (%r8) # sched: [8:1.00]
+; BDVER2-NEXT:    mulw %si # sched: [4:1.33]
+; BDVER2-NEXT:    mulw (%r9) # sched: [9:1.33]
+; BDVER2-NEXT:    mull %edx # sched: [4:1.00]
+; BDVER2-NEXT:    mull (%rax) # sched: [9:1.00]
+; BDVER2-NEXT:    mulq %rcx # sched: [4:1.00]
+; BDVER2-NEXT:    mulq (%r10) # sched: [9:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_mul:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movq {{[0-9]+}}(%rsp), %r10 # sched: [5:1.00]
@@ -8246,6 +9031,22 @@ define void @test_neg(i8 %a0, i16 %a1, i32 %a2, i64 %a3, i8 *%p0, i16 *%p1, i32
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_neg:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
+; BDVER2-NEXT:    movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    negb %dil # sched: [1:0.33]
+; BDVER2-NEXT:    negb (%r8) # sched: [7:1.00]
+; BDVER2-NEXT:    negw %si # sched: [1:0.33]
+; BDVER2-NEXT:    negw (%r9) # sched: [7:1.00]
+; BDVER2-NEXT:    negl %edx # sched: [1:0.33]
+; BDVER2-NEXT:    negl (%rax) # sched: [7:1.00]
+; BDVER2-NEXT:    negq %rcx # sched: [1:0.33]
+; BDVER2-NEXT:    negq (%r10) # sched: [7:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_neg:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movq {{[0-9]+}}(%rsp), %r10 # sched: [5:1.00]
@@ -8386,6 +9187,19 @@ define void @test_nop(i16 %a0, i32 %a1, i64 %a2, i16 *%p0, i32 *%p1, i64 *%p2) o
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_nop:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    nop # sched: [1:0.25]
+; BDVER2-NEXT:    nopw %di # sched: [1:0.25]
+; BDVER2-NEXT:    nopw (%rcx) # sched: [1:0.25]
+; BDVER2-NEXT:    nopl %esi # sched: [1:0.25]
+; BDVER2-NEXT:    nopl (%r8) # sched: [1:0.25]
+; BDVER2-NEXT:    nopq %rdx # sched: [1:0.25]
+; BDVER2-NEXT:    nopq (%r9) # sched: [1:0.25]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_nop:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -8544,6 +9358,22 @@ define void @test_not(i8 %a0, i16 %a1, i32 %a2, i64 %a3, i8 *%p0, i16 *%p1, i32
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_not:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
+; BDVER2-NEXT:    movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    notb %dil # sched: [1:0.33]
+; BDVER2-NEXT:    notb (%r8) # sched: [7:1.00]
+; BDVER2-NEXT:    notw %si # sched: [1:0.33]
+; BDVER2-NEXT:    notw (%r9) # sched: [7:1.00]
+; BDVER2-NEXT:    notl %edx # sched: [1:0.33]
+; BDVER2-NEXT:    notl (%rax) # sched: [7:1.00]
+; BDVER2-NEXT:    notq %rcx # sched: [1:0.33]
+; BDVER2-NEXT:    notq (%r10) # sched: [7:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_not:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movq {{[0-9]+}}(%rsp), %r10 # sched: [5:1.00]
@@ -8676,6 +9506,18 @@ define void @test_or_8(i8 %a0, i8* %a1, i8 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_or_8:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    orb $7, %al # sched: [1:0.33]
+; BDVER2-NEXT:    orb $7, %dil # sched: [1:0.33]
+; BDVER2-NEXT:    orb $7, (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    orb %dl, %dil # sched: [1:0.33]
+; BDVER2-NEXT:    orb %dil, (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    orb (%rsi), %dil # sched: [6:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_or_8:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -8839,6 +9681,23 @@ define void @test_or_16(i16 %a0, i16* %a1, i16 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_or_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    orw $511, %ax # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    orw $511, %di # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    orw $511, (%rsi) # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [7:1.00]
+; BDVER2-NEXT:    orw $7, %di # sched: [1:0.33]
+; BDVER2-NEXT:    orw $7, (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    orw %dx, %di # sched: [1:0.33]
+; BDVER2-NEXT:    orw %di, (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    orw (%rsi), %di # sched: [6:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_or_16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -9012,6 +9871,23 @@ define void @test_or_32(i32 %a0, i32* %a1, i32 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_or_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    orl $665536, %eax # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    orl $665536, %edi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    orl $665536, (%rsi) # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [7:1.00]
+; BDVER2-NEXT:    orl $7, %edi # sched: [1:0.33]
+; BDVER2-NEXT:    orl $7, (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    orl %edx, %edi # sched: [1:0.33]
+; BDVER2-NEXT:    orl %edi, (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    orl (%rsi), %edi # sched: [6:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_or_32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -9185,6 +10061,23 @@ define void @test_or_64(i64 %a0, i64* %a1, i64 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_or_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    orq $665536, %rax # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    orq $665536, %rdi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    orq $665536, (%rsi) # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [7:1.00]
+; BDVER2-NEXT:    orq $7, %rdi # sched: [1:0.33]
+; BDVER2-NEXT:    orq $7, (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    orq %rdx, %rdi # sched: [1:0.33]
+; BDVER2-NEXT:    orq %rdi, (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    orq (%rsi), %rdi # sched: [6:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_or_64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -9319,6 +10212,18 @@ define void @test_out() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_out:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    outb %al, $7 # sched: [100:0.33]
+; BDVER2-NEXT:    outw %ax, $7 # sched: [100:0.33]
+; BDVER2-NEXT:    outl %eax, $7 # sched: [100:0.33]
+; BDVER2-NEXT:    outb %al, %dx # sched: [100:0.33]
+; BDVER2-NEXT:    outw %ax, %dx # sched: [100:0.33]
+; BDVER2-NEXT:    outl %eax, %dx # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_out:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -9419,6 +10324,15 @@ define void @test_outs() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_outs:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    outsb (%rsi), %dx # sched: [100:0.33]
+; BDVER2-NEXT:    outsw (%rsi), %dx # sched: [100:0.33]
+; BDVER2-NEXT:    outsl (%rsi), %dx # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_outs:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -9497,6 +10411,13 @@ define void @test_pause() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pause:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    pause # sched: [4:1.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_pause:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -9595,6 +10516,16 @@ define void @test_pop_push() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pop_push:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    popq %fs # sched: [100:0.33]
+; BDVER2-NEXT:    popq %gs # sched: [100:0.33]
+; BDVER2-NEXT:    pushq %fs # sched: [3:1.00]
+; BDVER2-NEXT:    pushq %gs # sched: [5:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_pop_push:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -9722,6 +10653,19 @@ define i16 @test_pop_push_16(i16 %a0, i16 *%a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pop_push_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    popw %ax # sched: [6:0.50]
+; BDVER2-NEXT:    popw (%rsi) # sched: [6:0.50]
+; BDVER2-NEXT:    pushw %di # sched: [5:1.00]
+; BDVER2-NEXT:    pushw (%rsi) # sched: [5:1.00]
+; BDVER2-NEXT:    pushw $4095 # imm = 0xFFF
+; BDVER2-NEXT:    # sched: [1:1.00]
+; BDVER2-NEXT:    pushw $7 # sched: [1:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_pop_push_16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -9855,6 +10799,19 @@ define i64 @test_pop_push_64(i64 %a0, i64 *%a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pop_push_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    popq %rax # sched: [6:0.50]
+; BDVER2-NEXT:    popq (%rsi) # sched: [6:0.50]
+; BDVER2-NEXT:    pushq %rdi # sched: [5:1.00]
+; BDVER2-NEXT:    pushq (%rsi) # sched: [5:1.00]
+; BDVER2-NEXT:    pushq $4095 # imm = 0xFFF
+; BDVER2-NEXT:    # sched: [1:1.00]
+; BDVER2-NEXT:    pushq $7 # sched: [5:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_pop_push_64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -9949,6 +10906,14 @@ define void @test_popf_pushf() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_popf_pushf:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    popfq # sched: [5:0.50]
+; BDVER2-NEXT:    pushfq # sched: [5:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_popf_pushf:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -10113,6 +11078,24 @@ define void @test_rcl_rcr_8(i8 %a0, i8 %a1, i8 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_rcl_rcr_8:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    rclb %dil # sched: [2:1.50]
+; BDVER2-NEXT:    rcrb %dil # sched: [2:1.50]
+; BDVER2-NEXT:    rclb (%rdx) # sched: [11:3.50]
+; BDVER2-NEXT:    rcrb (%rdx) # sched: [11:3.50]
+; BDVER2-NEXT:    rclb $7, %dil # sched: [5:4.00]
+; BDVER2-NEXT:    rcrb $7, %dil # sched: [5:4.00]
+; BDVER2-NEXT:    rclb $7, (%rdx) # sched: [11:3.50]
+; BDVER2-NEXT:    rcrb $7, (%rdx) # sched: [11:3.50]
+; BDVER2-NEXT:    rclb %cl, %dil # sched: [5:4.00]
+; BDVER2-NEXT:    rcrb %cl, %dil # sched: [5:4.00]
+; BDVER2-NEXT:    rclb %cl, (%rdx) # sched: [11:3.50]
+; BDVER2-NEXT:    rcrb %cl, (%rdx) # sched: [11:3.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_rcl_rcr_8:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -10296,6 +11279,24 @@ define void @test_rcl_rcr_16(i16 %a0, i16 %a1, i16 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_rcl_rcr_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    rclw %di # sched: [2:1.50]
+; BDVER2-NEXT:    rcrw %di # sched: [2:1.50]
+; BDVER2-NEXT:    rclw (%rdx) # sched: [11:3.50]
+; BDVER2-NEXT:    rcrw (%rdx) # sched: [11:3.50]
+; BDVER2-NEXT:    rclw $7, %di # sched: [5:4.00]
+; BDVER2-NEXT:    rcrw $7, %di # sched: [5:4.00]
+; BDVER2-NEXT:    rclw $7, (%rdx) # sched: [11:3.50]
+; BDVER2-NEXT:    rcrw $7, (%rdx) # sched: [11:3.50]
+; BDVER2-NEXT:    rclw %cl, %di # sched: [5:4.00]
+; BDVER2-NEXT:    rcrw %cl, %di # sched: [5:4.00]
+; BDVER2-NEXT:    rclw %cl, (%rdx) # sched: [11:3.50]
+; BDVER2-NEXT:    rcrw %cl, (%rdx) # sched: [11:3.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_rcl_rcr_16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -10479,6 +11480,24 @@ define void @test_rcl_rcr_32(i32 %a0, i32 %a1, i32 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_rcl_rcr_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    rcll %edi # sched: [2:1.50]
+; BDVER2-NEXT:    rcrl %edi # sched: [2:1.50]
+; BDVER2-NEXT:    rcll (%rdx) # sched: [11:3.50]
+; BDVER2-NEXT:    rcrl (%rdx) # sched: [11:3.50]
+; BDVER2-NEXT:    rcll $7, %edi # sched: [5:4.00]
+; BDVER2-NEXT:    rcrl $7, %edi # sched: [5:4.00]
+; BDVER2-NEXT:    rcll $7, (%rdx) # sched: [11:3.50]
+; BDVER2-NEXT:    rcrl $7, (%rdx) # sched: [11:3.50]
+; BDVER2-NEXT:    rcll %cl, %edi # sched: [5:4.00]
+; BDVER2-NEXT:    rcrl %cl, %edi # sched: [5:4.00]
+; BDVER2-NEXT:    rcll %cl, (%rdx) # sched: [11:3.50]
+; BDVER2-NEXT:    rcrl %cl, (%rdx) # sched: [11:3.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_rcl_rcr_32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -10662,6 +11681,24 @@ define void @test_rcl_rcr_64(i64 %a0, i64 %a1, i64 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_rcl_rcr_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    rclq %rdi # sched: [2:1.50]
+; BDVER2-NEXT:    rcrq %rdi # sched: [2:1.50]
+; BDVER2-NEXT:    rclq (%rdx) # sched: [11:3.50]
+; BDVER2-NEXT:    rcrq (%rdx) # sched: [11:3.50]
+; BDVER2-NEXT:    rclq $7, %rdi # sched: [5:4.00]
+; BDVER2-NEXT:    rcrq $7, %rdi # sched: [5:4.00]
+; BDVER2-NEXT:    rclq $7, (%rdx) # sched: [11:3.50]
+; BDVER2-NEXT:    rcrq $7, (%rdx) # sched: [11:3.50]
+; BDVER2-NEXT:    rclq %cl, %rdi # sched: [5:4.00]
+; BDVER2-NEXT:    rcrq %cl, %rdi # sched: [5:4.00]
+; BDVER2-NEXT:    rclq %cl, (%rdx) # sched: [11:3.50]
+; BDVER2-NEXT:    rcrq %cl, (%rdx) # sched: [11:3.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_rcl_rcr_64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -10766,6 +11803,14 @@ define void @test_rdmsr_wrmsr() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_rdmsr_wrmsr:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    rdmsr # sched: [100:0.33]
+; BDVER2-NEXT:    wrmsr # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_rdmsr_wrmsr:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -10842,6 +11887,13 @@ define void @test_rdpmc() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_rdpmc:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    rdpmc # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_rdpmc:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -10924,6 +11976,14 @@ define void @test_rdtsc_rdtscp() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_rdtsc_rdtscp:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    rdtsc # sched: [100:0.33]
+; BDVER2-NEXT:    rdtscp # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_rdtsc_rdtscp:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -11040,6 +12100,18 @@ define void @test_ret() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_ret:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq $4095 # imm = 0xFFF
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    lretl # sched: [6:1.00]
+; BDVER2-NEXT:    lretl $4095 # imm = 0xFFF
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_ret:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -11212,6 +12284,24 @@ define void @test_rol_ror_8(i8 %a0, i8 %a1, i8 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_rol_ror_8:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    rolb %dil # sched: [2:1.00]
+; BDVER2-NEXT:    rorb %dil # sched: [2:1.00]
+; BDVER2-NEXT:    rolb (%rdx) # sched: [8:1.00]
+; BDVER2-NEXT:    rorb (%rdx) # sched: [8:1.00]
+; BDVER2-NEXT:    rolb $7, %dil # sched: [2:1.00]
+; BDVER2-NEXT:    rorb $7, %dil # sched: [2:1.00]
+; BDVER2-NEXT:    rolb $7, (%rdx) # sched: [8:1.00]
+; BDVER2-NEXT:    rorb $7, (%rdx) # sched: [8:1.00]
+; BDVER2-NEXT:    rolb %cl, %dil # sched: [3:1.50]
+; BDVER2-NEXT:    rorb %cl, %dil # sched: [3:1.50]
+; BDVER2-NEXT:    rolb %cl, (%rdx) # sched: [9:1.50]
+; BDVER2-NEXT:    rorb %cl, (%rdx) # sched: [9:1.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_rol_ror_8:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -11395,6 +12485,24 @@ define void @test_rol_ror_16(i16 %a0, i16 %a1, i16 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_rol_ror_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    rolw %di # sched: [2:1.00]
+; BDVER2-NEXT:    rorw %di # sched: [2:1.00]
+; BDVER2-NEXT:    rolw (%rdx) # sched: [8:1.00]
+; BDVER2-NEXT:    rorw (%rdx) # sched: [8:1.00]
+; BDVER2-NEXT:    rolw $7, %di # sched: [2:1.00]
+; BDVER2-NEXT:    rorw $7, %di # sched: [2:1.00]
+; BDVER2-NEXT:    rolw $7, (%rdx) # sched: [8:1.00]
+; BDVER2-NEXT:    rorw $7, (%rdx) # sched: [8:1.00]
+; BDVER2-NEXT:    rolw %cl, %di # sched: [3:1.50]
+; BDVER2-NEXT:    rorw %cl, %di # sched: [3:1.50]
+; BDVER2-NEXT:    rolw %cl, (%rdx) # sched: [9:1.50]
+; BDVER2-NEXT:    rorw %cl, (%rdx) # sched: [9:1.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_rol_ror_16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -11578,6 +12686,24 @@ define void @test_rol_ror_32(i32 %a0, i32 %a1, i32 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_rol_ror_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    roll %edi # sched: [2:1.00]
+; BDVER2-NEXT:    rorl %edi # sched: [2:1.00]
+; BDVER2-NEXT:    roll (%rdx) # sched: [8:1.00]
+; BDVER2-NEXT:    rorl (%rdx) # sched: [8:1.00]
+; BDVER2-NEXT:    roll $7, %edi # sched: [2:1.00]
+; BDVER2-NEXT:    rorl $7, %edi # sched: [2:1.00]
+; BDVER2-NEXT:    roll $7, (%rdx) # sched: [8:1.00]
+; BDVER2-NEXT:    rorl $7, (%rdx) # sched: [8:1.00]
+; BDVER2-NEXT:    roll %cl, %edi # sched: [3:1.50]
+; BDVER2-NEXT:    rorl %cl, %edi # sched: [3:1.50]
+; BDVER2-NEXT:    roll %cl, (%rdx) # sched: [9:1.50]
+; BDVER2-NEXT:    rorl %cl, (%rdx) # sched: [9:1.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_rol_ror_32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -11761,6 +12887,24 @@ define void @test_rol_ror_64(i64 %a0, i64 %a1, i64 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_rol_ror_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    rolq %rdi # sched: [2:1.00]
+; BDVER2-NEXT:    rorq %rdi # sched: [2:1.00]
+; BDVER2-NEXT:    rolq (%rdx) # sched: [8:1.00]
+; BDVER2-NEXT:    rorq (%rdx) # sched: [8:1.00]
+; BDVER2-NEXT:    rolq $7, %rdi # sched: [2:1.00]
+; BDVER2-NEXT:    rorq $7, %rdi # sched: [2:1.00]
+; BDVER2-NEXT:    rolq $7, (%rdx) # sched: [8:1.00]
+; BDVER2-NEXT:    rorq $7, (%rdx) # sched: [8:1.00]
+; BDVER2-NEXT:    rolq %cl, %rdi # sched: [3:1.50]
+; BDVER2-NEXT:    rorq %cl, %rdi # sched: [3:1.50]
+; BDVER2-NEXT:    rolq %cl, (%rdx) # sched: [9:1.50]
+; BDVER2-NEXT:    rorq %cl, (%rdx) # sched: [9:1.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_rol_ror_64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -11993,6 +13137,30 @@ define void @test_sar_shl_shr_8(i8 %a0, i8 %a1, i8 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_sar_shl_shr_8:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    sarb %dil # sched: [1:0.50]
+; BDVER2-NEXT:    shlb %dil # sched: [1:0.50]
+; BDVER2-NEXT:    shrb %dil # sched: [1:0.50]
+; BDVER2-NEXT:    sarb (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    shlb (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    shrb (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    sarb $7, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    shlb $7, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    shrb $7, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    sarb $7, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    shlb $7, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    shrb $7, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    sarb %cl, %dil # sched: [3:1.50]
+; BDVER2-NEXT:    shlb %cl, %dil # sched: [3:1.50]
+; BDVER2-NEXT:    shrb %cl, %dil # sched: [3:1.50]
+; BDVER2-NEXT:    sarb %cl, (%rdx) # sched: [9:1.50]
+; BDVER2-NEXT:    shlb %cl, (%rdx) # sched: [9:1.50]
+; BDVER2-NEXT:    shrb %cl, (%rdx) # sched: [9:1.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_sar_shl_shr_8:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -12236,6 +13404,30 @@ define void @test_sar_shl_shr_16(i16 %a0, i16 %a1, i16 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_sar_shl_shr_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    sarw %di # sched: [1:0.50]
+; BDVER2-NEXT:    shlw %di # sched: [1:0.50]
+; BDVER2-NEXT:    shrw %di # sched: [1:0.50]
+; BDVER2-NEXT:    sarw (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    shlw (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    shrw (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    sarw $7, %di # sched: [1:0.50]
+; BDVER2-NEXT:    shlw $7, %di # sched: [1:0.50]
+; BDVER2-NEXT:    shrw $7, %di # sched: [1:0.50]
+; BDVER2-NEXT:    sarw $7, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    shlw $7, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    shrw $7, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    sarw %cl, %di # sched: [3:1.50]
+; BDVER2-NEXT:    shlw %cl, %di # sched: [3:1.50]
+; BDVER2-NEXT:    shrw %cl, %di # sched: [3:1.50]
+; BDVER2-NEXT:    sarw %cl, (%rdx) # sched: [9:1.50]
+; BDVER2-NEXT:    shlw %cl, (%rdx) # sched: [9:1.50]
+; BDVER2-NEXT:    shrw %cl, (%rdx) # sched: [9:1.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_sar_shl_shr_16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -12479,6 +13671,30 @@ define void @test_sar_shl_shr_32(i32 %a0, i32 %a1, i32 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_sar_shl_shr_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    sarl %edi # sched: [1:0.50]
+; BDVER2-NEXT:    shll %edi # sched: [1:0.50]
+; BDVER2-NEXT:    shrl %edi # sched: [1:0.50]
+; BDVER2-NEXT:    sarl (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    shll (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    shrl (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    sarl $7, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    shll $7, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    shrl $7, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    sarl $7, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    shll $7, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    shrl $7, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    sarl %cl, %edi # sched: [3:1.50]
+; BDVER2-NEXT:    shll %cl, %edi # sched: [3:1.50]
+; BDVER2-NEXT:    shrl %cl, %edi # sched: [3:1.50]
+; BDVER2-NEXT:    sarl %cl, (%rdx) # sched: [9:1.50]
+; BDVER2-NEXT:    shll %cl, (%rdx) # sched: [9:1.50]
+; BDVER2-NEXT:    shrl %cl, (%rdx) # sched: [9:1.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_sar_shl_shr_32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -12722,6 +13938,30 @@ define void @test_sar_shl_shr_64(i64 %a0, i64 %a1, i64 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_sar_shl_shr_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    sarq %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    shlq %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    shrq %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    sarq (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    shlq (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    shrq (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    sarq $7, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    shlq $7, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    shrq $7, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    sarq $7, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    shlq $7, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    shrq $7, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    sarq %cl, %rdi # sched: [3:1.50]
+; BDVER2-NEXT:    shlq %cl, %rdi # sched: [3:1.50]
+; BDVER2-NEXT:    shrq %cl, %rdi # sched: [3:1.50]
+; BDVER2-NEXT:    sarq %cl, (%rdx) # sched: [9:1.50]
+; BDVER2-NEXT:    shlq %cl, (%rdx) # sched: [9:1.50]
+; BDVER2-NEXT:    shrq %cl, (%rdx) # sched: [9:1.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_sar_shl_shr_64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -12870,6 +14110,18 @@ define void @test_sbb_8(i8 %a0, i8* %a1, i8 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_sbb_8:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    sbbb $7, %al # sched: [2:0.67]
+; BDVER2-NEXT:    sbbb $7, %dil # sched: [2:0.67]
+; BDVER2-NEXT:    sbbb $7, (%rsi) # sched: [9:1.00]
+; BDVER2-NEXT:    sbbb %dl, %dil # sched: [2:0.67]
+; BDVER2-NEXT:    sbbb %dil, (%rsi) # sched: [9:1.00]
+; BDVER2-NEXT:    sbbb (%rsi), %dil # sched: [7:0.67]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_sbb_8:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -13033,6 +14285,23 @@ define void @test_sbb_16(i16 %a0, i16* %a1, i16 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_sbb_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    sbbw $511, %ax # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [2:0.67]
+; BDVER2-NEXT:    sbbw $511, %di # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [2:0.67]
+; BDVER2-NEXT:    sbbw $511, (%rsi) # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [9:1.00]
+; BDVER2-NEXT:    sbbw $7, %di # sched: [2:0.67]
+; BDVER2-NEXT:    sbbw $7, (%rsi) # sched: [9:1.00]
+; BDVER2-NEXT:    sbbw %dx, %di # sched: [2:0.67]
+; BDVER2-NEXT:    sbbw %di, (%rsi) # sched: [9:1.00]
+; BDVER2-NEXT:    sbbw (%rsi), %di # sched: [7:0.67]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_sbb_16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -13206,6 +14475,23 @@ define void @test_sbb_32(i32 %a0, i32* %a1, i32 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_sbb_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    sbbl $665536, %eax # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [2:0.67]
+; BDVER2-NEXT:    sbbl $665536, %edi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [2:0.67]
+; BDVER2-NEXT:    sbbl $665536, (%rsi) # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [9:1.00]
+; BDVER2-NEXT:    sbbl $7, %edi # sched: [2:0.67]
+; BDVER2-NEXT:    sbbl $7, (%rsi) # sched: [9:1.00]
+; BDVER2-NEXT:    sbbl %edx, %edi # sched: [2:0.67]
+; BDVER2-NEXT:    sbbl %edi, (%rsi) # sched: [9:1.00]
+; BDVER2-NEXT:    sbbl (%rsi), %edi # sched: [7:0.67]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_sbb_32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -13379,6 +14665,23 @@ define void @test_sbb_64(i64 %a0, i64* %a1, i64 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_sbb_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    sbbq $665536, %rax # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [2:0.67]
+; BDVER2-NEXT:    sbbq $665536, %rdi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [2:0.67]
+; BDVER2-NEXT:    sbbq $665536, (%rsi) # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [9:1.00]
+; BDVER2-NEXT:    sbbq $7, %rdi # sched: [2:0.67]
+; BDVER2-NEXT:    sbbq $7, (%rsi) # sched: [9:1.00]
+; BDVER2-NEXT:    sbbq %rdx, %rdi # sched: [2:0.67]
+; BDVER2-NEXT:    sbbq %rdi, (%rsi) # sched: [9:1.00]
+; BDVER2-NEXT:    sbbq (%rsi), %rdi # sched: [7:0.67]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_sbb_64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -13497,6 +14800,16 @@ define void @test_scas() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_scas:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    scasb %es:(%rdi), %al # sched: [2:0.67]
+; BDVER2-NEXT:    scasw %es:(%rdi), %ax # sched: [2:0.67]
+; BDVER2-NEXT:    scasl %es:(%rdi), %eax # sched: [2:0.67]
+; BDVER2-NEXT:    scasq %es:(%rdi), %rax # sched: [2:0.67]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_scas:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -13825,6 +15138,44 @@ define void @test_setcc(i8 %a0, i8 *%a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_setcc:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    seto %dil # sched: [1:0.50]
+; BDVER2-NEXT:    setno %dil # sched: [1:0.50]
+; BDVER2-NEXT:    setb %dil # sched: [1:0.50]
+; BDVER2-NEXT:    setae %dil # sched: [1:0.50]
+; BDVER2-NEXT:    sete %dil # sched: [1:0.50]
+; BDVER2-NEXT:    setne %dil # sched: [1:0.50]
+; BDVER2-NEXT:    setbe %dil # sched: [2:1.00]
+; BDVER2-NEXT:    seta %dil # sched: [2:1.00]
+; BDVER2-NEXT:    sets %dil # sched: [1:0.50]
+; BDVER2-NEXT:    setns %dil # sched: [1:0.50]
+; BDVER2-NEXT:    setp %dil # sched: [1:0.50]
+; BDVER2-NEXT:    setnp %dil # sched: [1:0.50]
+; BDVER2-NEXT:    setl %dil # sched: [1:0.50]
+; BDVER2-NEXT:    setge %dil # sched: [1:0.50]
+; BDVER2-NEXT:    setle %dil # sched: [1:0.50]
+; BDVER2-NEXT:    setg %dil # sched: [1:0.50]
+; BDVER2-NEXT:    seto (%rsi) # sched: [2:1.00]
+; BDVER2-NEXT:    setno (%rsi) # sched: [2:1.00]
+; BDVER2-NEXT:    setb (%rsi) # sched: [2:1.00]
+; BDVER2-NEXT:    setae (%rsi) # sched: [2:1.00]
+; BDVER2-NEXT:    sete (%rsi) # sched: [2:1.00]
+; BDVER2-NEXT:    setne (%rsi) # sched: [2:1.00]
+; BDVER2-NEXT:    setbe (%rsi) # sched: [3:1.00]
+; BDVER2-NEXT:    seta (%rsi) # sched: [3:1.00]
+; BDVER2-NEXT:    sets (%rsi) # sched: [2:1.00]
+; BDVER2-NEXT:    setns (%rsi) # sched: [2:1.00]
+; BDVER2-NEXT:    setp (%rsi) # sched: [2:1.00]
+; BDVER2-NEXT:    setnp (%rsi) # sched: [2:1.00]
+; BDVER2-NEXT:    setl (%rsi) # sched: [2:1.00]
+; BDVER2-NEXT:    setge (%rsi) # sched: [2:1.00]
+; BDVER2-NEXT:    setle (%rsi) # sched: [2:1.00]
+; BDVER2-NEXT:    setg (%rsi) # sched: [2:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_setcc:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -14019,6 +15370,20 @@ define void @test_shld_shrd_16(i16 %a0, i16 %a1, i16 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_shld_shrd_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    shldw %cl, %si, %di # sched: [4:1.50]
+; BDVER2-NEXT:    shrdw %cl, %si, %di # sched: [4:1.50]
+; BDVER2-NEXT:    shldw %cl, %si, (%rdx) # sched: [10:1.50]
+; BDVER2-NEXT:    shrdw %cl, %si, (%rdx) # sched: [10:1.50]
+; BDVER2-NEXT:    shldw $7, %si, %di # sched: [2:0.67]
+; BDVER2-NEXT:    shrdw $7, %si, %di # sched: [2:0.67]
+; BDVER2-NEXT:    shldw $7, %si, (%rdx) # sched: [8:1.00]
+; BDVER2-NEXT:    shrdw $7, %si, (%rdx) # sched: [8:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_shld_shrd_16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -14162,6 +15527,20 @@ define void @test_shld_shrd_32(i32 %a0, i32 %a1, i32 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_shld_shrd_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    shldl %cl, %esi, %edi # sched: [4:1.50]
+; BDVER2-NEXT:    shrdl %cl, %esi, %edi # sched: [4:1.50]
+; BDVER2-NEXT:    shldl %cl, %esi, (%rdx) # sched: [10:1.50]
+; BDVER2-NEXT:    shrdl %cl, %esi, (%rdx) # sched: [10:1.50]
+; BDVER2-NEXT:    shldl $7, %esi, %edi # sched: [2:0.67]
+; BDVER2-NEXT:    shrdl $7, %esi, %edi # sched: [2:0.67]
+; BDVER2-NEXT:    shldl $7, %esi, (%rdx) # sched: [8:1.00]
+; BDVER2-NEXT:    shrdl $7, %esi, (%rdx) # sched: [8:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_shld_shrd_32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -14305,6 +15684,20 @@ define void @test_shld_shrd_64(i64 %a0, i64 %a1, i64 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_shld_shrd_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    shldq %cl, %rsi, %rdi # sched: [4:1.50]
+; BDVER2-NEXT:    shrdq %cl, %rsi, %rdi # sched: [4:1.50]
+; BDVER2-NEXT:    shldq %cl, %rsi, (%rdx) # sched: [10:1.50]
+; BDVER2-NEXT:    shrdq %cl, %rsi, (%rdx) # sched: [10:1.50]
+; BDVER2-NEXT:    shldq $7, %rsi, %rdi # sched: [2:0.67]
+; BDVER2-NEXT:    shrdq $7, %rsi, %rdi # sched: [2:0.67]
+; BDVER2-NEXT:    shldq $7, %rsi, (%rdx) # sched: [8:1.00]
+; BDVER2-NEXT:    shrdq $7, %rsi, (%rdx) # sched: [8:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_shld_shrd_64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -14405,6 +15798,14 @@ define void @test_stc_std() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_stc_std:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    stc # sched: [1:0.33]
+; BDVER2-NEXT:    std # sched: [1:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_stc_std:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -14508,6 +15909,16 @@ define void @test_stos() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_stos:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    stosb %al, %es:(%rdi) # sched: [5:1.00]
+; BDVER2-NEXT:    stosw %ax, %es:(%rdi) # sched: [5:1.00]
+; BDVER2-NEXT:    stosl %eax, %es:(%rdi) # sched: [5:1.00]
+; BDVER2-NEXT:    stosq %rax, %es:(%rdi) # sched: [5:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_stos:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -14630,6 +16041,18 @@ define void @test_sub_8(i8 %a0, i8* %a1, i8 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_sub_8:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    subb $7, %al # sched: [1:0.33]
+; BDVER2-NEXT:    subb $7, %dil # sched: [1:0.33]
+; BDVER2-NEXT:    subb $7, (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    subb %dl, %dil # sched: [1:0.33]
+; BDVER2-NEXT:    subb %dil, (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    subb (%rsi), %dil # sched: [6:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_sub_8:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -14793,6 +16216,23 @@ define void @test_sub_16(i16 %a0, i16* %a1, i16 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_sub_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    subw $511, %ax # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    subw $511, %di # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    subw $511, (%rsi) # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [7:1.00]
+; BDVER2-NEXT:    subw $7, %di # sched: [1:0.33]
+; BDVER2-NEXT:    subw $7, (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    subw %dx, %di # sched: [1:0.33]
+; BDVER2-NEXT:    subw %di, (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    subw (%rsi), %di # sched: [6:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_sub_16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -14966,6 +16406,23 @@ define void @test_sub_32(i32 %a0, i32* %a1, i32 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_sub_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    subl $665536, %eax # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    subl $665536, %edi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    subl $665536, (%rsi) # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [7:1.00]
+; BDVER2-NEXT:    subl $7, %edi # sched: [1:0.33]
+; BDVER2-NEXT:    subl $7, (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    subl %edx, %edi # sched: [1:0.33]
+; BDVER2-NEXT:    subl %edi, (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    subl (%rsi), %edi # sched: [6:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_sub_32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -15139,6 +16596,23 @@ define void @test_sub_64(i64 %a0, i64* %a1, i64 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_sub_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    subq $665536, %rax # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    subq $665536, %rdi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    subq $665536, (%rsi) # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [7:1.00]
+; BDVER2-NEXT:    subq $7, %rdi # sched: [1:0.33]
+; BDVER2-NEXT:    subq $7, (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    subq %rdx, %rdi # sched: [1:0.33]
+; BDVER2-NEXT:    subq %rdi, (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    subq (%rsi), %rdi # sched: [6:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_sub_64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -15271,6 +16745,17 @@ define void @test_test_8(i8 %a0, i8* %a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_test_8:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    testb $7, %al # sched: [1:0.33]
+; BDVER2-NEXT:    testb $7, %dil # sched: [1:0.33]
+; BDVER2-NEXT:    testb $7, (%rsi) # sched: [6:0.50]
+; BDVER2-NEXT:    testb %dil, %dil # sched: [1:0.33]
+; BDVER2-NEXT:    testb %dil, (%rsi) # sched: [6:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_test_8:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -15408,6 +16893,20 @@ define void @test_test_16(i16 %a0, i16* %a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_test_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    testw $511, %ax # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    testw $511, %di # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    testw $511, (%rsi) # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [6:0.50]
+; BDVER2-NEXT:    testw %di, %di # sched: [1:0.33]
+; BDVER2-NEXT:    testw %di, (%rsi) # sched: [6:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_test_16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -15551,6 +17050,20 @@ define void @test_test_32(i32 %a0, i32* %a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_test_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    testl $665536, %eax # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    testl $665536, %edi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    testl $665536, (%rsi) # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [6:0.50]
+; BDVER2-NEXT:    testl %edi, %edi # sched: [1:0.33]
+; BDVER2-NEXT:    testl %edi, (%rsi) # sched: [6:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_test_32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -15694,6 +17207,20 @@ define void @test_test_64(i64 %a0, i64* %a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_test_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    testq $665536, %rax # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    testq $665536, %rdi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    testq $665536, (%rsi) # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [6:0.50]
+; BDVER2-NEXT:    testq %rdi, %rdi # sched: [1:0.33]
+; BDVER2-NEXT:    testq %rdi, (%rsi) # sched: [6:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_test_64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -15783,6 +17310,13 @@ define void @test_ud2() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_ud2:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    ud2 # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_ud2:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -15873,6 +17407,14 @@ define void @test_xadd_8(i8 %a0, i8 %a1, i8 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_xadd_8:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    xaddb %dil, %sil # sched: [2:1.00]
+; BDVER2-NEXT:    xaddb %dil, (%rdx) # sched: [8:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_xadd_8:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -15956,6 +17498,14 @@ define void @test_xadd_16(i16 %a0, i16 %a1, i16 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_xadd_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    xaddw %di, %si # sched: [2:1.00]
+; BDVER2-NEXT:    xaddw %di, (%rdx) # sched: [8:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_xadd_16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -16039,6 +17589,14 @@ define void @test_xadd_32(i32 %a0, i32 %a1, i32 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_xadd_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    xaddl %edi, %esi # sched: [2:1.00]
+; BDVER2-NEXT:    xaddl %edi, (%rdx) # sched: [8:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_xadd_32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -16122,6 +17680,14 @@ define void @test_xadd_64(i64 %a0, i64 %a1, i64 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_xadd_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    xaddq %rdi, %rsi # sched: [2:1.00]
+; BDVER2-NEXT:    xaddq %rdi, (%rdx) # sched: [8:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_xadd_64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -16206,6 +17772,14 @@ define void @test_xchg_8(i8 %a0, i8 %a1, i8 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_xchg_8:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    xchgb %sil, %dil # sched: [2:1.00]
+; BDVER2-NEXT:    xchgb %dil, (%rdx) # sched: [6:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_xchg_8:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -16297,6 +17871,15 @@ define void @test_xchg_16(i16 %a0, i16 %a1, i16 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_xchg_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    xchgw %di, %ax # sched: [2:1.00]
+; BDVER2-NEXT:    xchgw %si, %di # sched: [2:1.00]
+; BDVER2-NEXT:    xchgw %di, (%rdx) # sched: [6:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_xchg_16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -16390,6 +17973,15 @@ define void @test_xchg_32(i32 %a0, i32 %a1, i32 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_xchg_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    xchgl %edi, %eax # sched: [2:1.00]
+; BDVER2-NEXT:    xchgl %esi, %edi # sched: [2:1.00]
+; BDVER2-NEXT:    xchgl %edi, (%rdx) # sched: [6:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_xchg_32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -16483,6 +18075,15 @@ define void @test_xchg_64(i64 %a0, i64 %a1, i64 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_xchg_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    xchgq %rdi, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    xchgq %rsi, %rdi # sched: [2:1.00]
+; BDVER2-NEXT:    xchgq %rdi, (%rdx) # sched: [6:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_xchg_64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -16561,6 +18162,13 @@ define void @test_xlat() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_xlat:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    xlatb # sched: [5:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_xlat:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -16675,6 +18283,18 @@ define void @test_xor_8(i8 %a0, i8* %a1, i8 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_xor_8:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    xorb $7, %al # sched: [1:0.33]
+; BDVER2-NEXT:    xorb $7, %dil # sched: [1:0.33]
+; BDVER2-NEXT:    xorb $7, (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    xorb %dl, %dil # sched: [1:0.33]
+; BDVER2-NEXT:    xorb %dil, (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    xorb (%rsi), %dil # sched: [6:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_xor_8:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -16838,6 +18458,23 @@ define void @test_xor_16(i16 %a0, i16* %a1, i16 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_xor_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    xorw $511, %ax # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    xorw $511, %di # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    xorw $511, (%rsi) # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [7:1.00]
+; BDVER2-NEXT:    xorw $7, %di # sched: [1:0.33]
+; BDVER2-NEXT:    xorw $7, (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    xorw %dx, %di # sched: [1:0.33]
+; BDVER2-NEXT:    xorw %di, (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    xorw (%rsi), %di # sched: [6:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_xor_16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -17011,6 +18648,23 @@ define void @test_xor_32(i32 %a0, i32* %a1, i32 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_xor_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    xorl $665536, %eax # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    xorl $665536, %edi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    xorl $665536, (%rsi) # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [7:1.00]
+; BDVER2-NEXT:    xorl $7, %edi # sched: [1:0.33]
+; BDVER2-NEXT:    xorl $7, (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    xorl %edx, %edi # sched: [1:0.33]
+; BDVER2-NEXT:    xorl %edi, (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    xorl (%rsi), %edi # sched: [6:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_xor_32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -17184,6 +18838,23 @@ define void @test_xor_64(i64 %a0, i64* %a1, i64 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_xor_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    xorq $665536, %rax # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    xorq $665536, %rdi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    xorq $665536, (%rsi) # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [7:1.00]
+; BDVER2-NEXT:    xorq $7, %rdi # sched: [1:0.33]
+; BDVER2-NEXT:    xorq $7, (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    xorq %rdx, %rdi # sched: [1:0.33]
+; BDVER2-NEXT:    xorq %rdi, (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    xorq (%rsi), %rdi # sched: [6:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_xor_64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
diff --git a/test/CodeGen/X86/small-byval-memcpy.ll b/test/CodeGen/X86/small-byval-memcpy.ll
index c5c9a3d8416..b8c38d9f396 100644
--- a/test/CodeGen/X86/small-byval-memcpy.ll
+++ b/test/CodeGen/X86/small-byval-memcpy.ll
@@ -1,25 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | FileCheck %s --check-prefix=CORE2
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=nehalem | FileCheck %s --check-prefix=NEHALEM
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=x86-64 | FileCheck %s --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2
 
 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1)
 
 define void @copy16bytes(i8* nocapture %a, i8* nocapture readonly %b) {
+; CORE2-LABEL: copy16bytes:
+; CORE2:       ## %bb.0:
+; CORE2-NEXT:    movq (%rsi), %rax
+; CORE2-NEXT:    movq 8(%rsi), %rcx
+; CORE2-NEXT:    movq %rcx, 8(%rdi)
+; CORE2-NEXT:    movq %rax, (%rdi)
+; CORE2-NEXT:    retq
+;
+; NEHALEM-LABEL: copy16bytes:
+; NEHALEM:       ## %bb.0:
+; NEHALEM-NEXT:    movups (%rsi), %xmm0
+; NEHALEM-NEXT:    movups %xmm0, (%rdi)
+; NEHALEM-NEXT:    retq
+;
+; BDVER2-LABEL: copy16bytes:
+; BDVER2:       ## %bb.0:
+; BDVER2-NEXT:    movups (%rsi), %xmm0
+; BDVER2-NEXT:    movups %xmm0, (%rdi)
+; BDVER2-NEXT:    retq
+;
+; BTVER2-LABEL: copy16bytes:
+; BTVER2:       ## %bb.0:
+; BTVER2-NEXT:    vmovups (%rsi), %xmm0
+; BTVER2-NEXT:    vmovups %xmm0, (%rdi)
+; BTVER2-NEXT:    retq
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %b, i64 16, i1 false)
   ret void
 
   ; CHECK-LABEL: copy16bytes
-  ; CORE2: movq
-  ; CORE2-NEXT: movq
-  ; CORE2-NEXT: movq
-  ; CORE2-NEXT: movq
-  ; CORE2-NEXT: retq
 
-  ; NEHALEM: movups
-  ; NEHALEM-NEXT: movups
-  ; NEHALEM-NEXT: retq
 
-  ; BTVER2: movups
-  ; BTVER2-NEXT: movups
-  ; BTVER2-NEXT: retq
 }
diff --git a/test/CodeGen/X86/sse-schedule.ll b/test/CodeGen/X86/sse-schedule.ll
index cd1fdfbc6aa..d36546e8799 100644
--- a/test/CodeGen/X86/sse-schedule.ll
+++ b/test/CodeGen/X86/sse-schedule.ll
@@ -14,6 +14,8 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-sse3 | FileCheck %s --check-prefixes=CHECK,SKX-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=-sse3 | FileCheck %s --check-prefixes=CHECK,BDVER2-SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+avx -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-sse3 | FileCheck %s --check-prefixes=CHECK,BTVER2-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 -mattr=-sse3 | FileCheck %s --check-prefixes=CHECK,ZNVER1-SSE
@@ -100,6 +102,18 @@ define <4 x float> @test_addps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
 ; SKX-NEXT:    vaddps (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_addps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    addps (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_addps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vaddps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_addps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
@@ -208,6 +222,18 @@ define float @test_addss(float %a0, float %a1, float *%a2) {
 ; SKX-NEXT:    vaddss (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_addss:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    addss %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    addss (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_addss:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vaddss (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_addss:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    addss %xmm1, %xmm0 # sched: [3:1.00]
@@ -320,6 +346,18 @@ define <4 x float> @test_andps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
 ; SKX-NEXT:    vandps (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_andps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    andps %xmm1, %xmm0 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    andps (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_andps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vandps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; BDVER2-NEXT:    vandps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_andps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    andps %xmm1, %xmm0 # sched: [1:0.50]
@@ -436,6 +474,18 @@ define <4 x float> @test_andnotps(<4 x float> %a0, <4 x float> %a1, <4 x float>
 ; SKX-NEXT:    vandnps (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_andnotps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    andnps %xmm1, %xmm0 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    andnps (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_andnotps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vandnps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; BDVER2-NEXT:    vandnps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_andnotps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    andnps %xmm1, %xmm0 # sched: [1:0.50]
@@ -563,6 +613,20 @@ define <4 x float> @test_cmpps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
 ; SKX-NEXT:    vorps %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cmpps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cmpeqps %xmm0, %xmm1 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    cmpeqps (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    orps %xmm1, %xmm0 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_cmpps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
+; BDVER2-NEXT:    vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    vorps %xmm0, %xmm1, %xmm0 # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cmpps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cmpeqps %xmm0, %xmm1 # sched: [2:1.00]
@@ -679,6 +743,18 @@ define float @test_cmpss(float %a0, float %a1, float *%a2) {
 ; SKX-NEXT:    vcmpeqss (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cmpss:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cmpeqss %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    cmpeqss (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_cmpss:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcmpeqss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vcmpeqss (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cmpss:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cmpeqss %xmm1, %xmm0 # sched: [2:1.00]
@@ -896,6 +972,34 @@ define i32 @test_comiss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
 ; SKX-NEXT:    movzbl %dl, %eax # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_comiss:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    comiss %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    setnp %al # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    sete %cl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    andb %al, %cl # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    comiss (%rdi), %xmm0 # sched: [8:1.00]
+; BDVER2-SSE-NEXT:    setnp %al # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    sete %dl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    andb %al, %dl # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    orb %cl, %dl # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    movzbl %dl, %eax # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_comiss:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcomiss %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    setnp %al # sched: [1:0.50]
+; BDVER2-NEXT:    sete %cl # sched: [1:0.50]
+; BDVER2-NEXT:    andb %al, %cl # sched: [1:0.33]
+; BDVER2-NEXT:    vcomiss (%rdi), %xmm0 # sched: [8:1.00]
+; BDVER2-NEXT:    setnp %al # sched: [1:0.50]
+; BDVER2-NEXT:    sete %dl # sched: [1:0.50]
+; BDVER2-NEXT:    andb %al, %dl # sched: [1:0.33]
+; BDVER2-NEXT:    orb %cl, %dl # sched: [1:0.33]
+; BDVER2-NEXT:    movzbl %dl, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_comiss:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    comiss %xmm1, %xmm0 # sched: [3:1.00]
@@ -1051,6 +1155,20 @@ define float @test_cvtsi2ss(i32 %a0, i32 *%a1) {
 ; SKX-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvtsi2ss:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvtsi2ssl %edi, %xmm1 # sched: [5:2.00]
+; BDVER2-SSE-NEXT:    cvtsi2ssl (%rsi), %xmm0 # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    addss %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_cvtsi2ss:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtsi2ssl %edi, %xmm0, %xmm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [10:1.00]
+; BDVER2-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvtsi2ss:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvtsi2ssl (%rsi), %xmm0 # sched: [14:1.00]
@@ -1177,6 +1295,20 @@ define float @test_cvtsi2ssq(i64 %a0, i64 *%a1) {
 ; SKX-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvtsi2ssq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvtsi2ssq %rdi, %xmm1 # sched: [5:2.00]
+; BDVER2-SSE-NEXT:    cvtsi2ssq (%rsi), %xmm0 # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    addss %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_cvtsi2ssq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [10:1.00]
+; BDVER2-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvtsi2ssq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvtsi2ssq (%rsi), %xmm0 # sched: [14:1.00]
@@ -1303,6 +1435,20 @@ define i32 @test_cvtss2si(float %a0, float *%a1) {
 ; SKX-NEXT:    addl %ecx, %eax # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvtss2si:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvtss2si %xmm0, %ecx # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    cvtss2si (%rdi), %eax # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    addl %ecx, %eax # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_cvtss2si:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtss2si %xmm0, %ecx # sched: [5:1.00]
+; BDVER2-NEXT:    vcvtss2si (%rdi), %eax # sched: [10:1.00]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvtss2si:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvtss2si (%rdi), %eax # sched: [12:1.00]
@@ -1432,6 +1578,20 @@ define i64 @test_cvtss2siq(float %a0, float *%a1) {
 ; SKX-NEXT:    addq %rcx, %rax # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvtss2siq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvtss2si %xmm0, %rcx # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    cvtss2si (%rdi), %rax # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    addq %rcx, %rax # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_cvtss2siq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtss2si %xmm0, %rcx # sched: [5:1.00]
+; BDVER2-NEXT:    vcvtss2si (%rdi), %rax # sched: [10:1.00]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvtss2siq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvtss2si (%rdi), %rax # sched: [12:1.00]
@@ -1561,6 +1721,20 @@ define i32 @test_cvttss2si(float %a0, float *%a1) {
 ; SKX-NEXT:    addl %ecx, %eax # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvttss2si:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvttss2si %xmm0, %ecx # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    cvttss2si (%rdi), %eax # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    addl %ecx, %eax # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_cvttss2si:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvttss2si %xmm0, %ecx # sched: [5:1.00]
+; BDVER2-NEXT:    vcvttss2si (%rdi), %eax # sched: [10:1.00]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvttss2si:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvttss2si (%rdi), %eax # sched: [12:1.00]
@@ -1687,6 +1861,20 @@ define i64 @test_cvttss2siq(float %a0, float *%a1) {
 ; SKX-NEXT:    addq %rcx, %rax # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvttss2siq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvttss2si %xmm0, %rcx # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    cvttss2si (%rdi), %rax # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    addq %rcx, %rax # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_cvttss2siq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvttss2si %xmm0, %rcx # sched: [5:1.00]
+; BDVER2-NEXT:    vcvttss2si (%rdi), %rax # sched: [10:1.00]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvttss2siq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvttss2si (%rdi), %rax # sched: [12:1.00]
@@ -1800,6 +1988,18 @@ define <4 x float> @test_divps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
 ; SKX-NEXT:    vdivps (%rdi), %xmm0, %xmm0 # sched: [17:5.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_divps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    divps %xmm1, %xmm0 # sched: [14:14.00]
+; BDVER2-SSE-NEXT:    divps (%rdi), %xmm0 # sched: [20:14.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_divps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vdivps %xmm1, %xmm0, %xmm0 # sched: [14:14.00]
+; BDVER2-NEXT:    vdivps (%rdi), %xmm0, %xmm0 # sched: [20:14.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_divps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    divps %xmm1, %xmm0 # sched: [19:19.00]
@@ -1908,6 +2108,18 @@ define float @test_divss(float %a0, float %a1, float *%a2) {
 ; SKX-NEXT:    vdivss (%rdi), %xmm0, %xmm0 # sched: [16:3.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_divss:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    divss %xmm1, %xmm0 # sched: [14:14.00]
+; BDVER2-SSE-NEXT:    divss (%rdi), %xmm0 # sched: [20:14.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_divss:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vdivss %xmm1, %xmm0, %xmm0 # sched: [14:14.00]
+; BDVER2-NEXT:    vdivss (%rdi), %xmm0, %xmm0 # sched: [20:14.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_divss:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    divss %xmm1, %xmm0 # sched: [19:19.00]
@@ -2016,6 +2228,18 @@ define void @test_ldmxcsr(i32 %a0) {
 ; SKX-NEXT:    vldmxcsr -{{[0-9]+}}(%rsp) # sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_ldmxcsr:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    ldmxcsr -{{[0-9]+}}(%rsp) # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_ldmxcsr:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
+; BDVER2-NEXT:    vldmxcsr -{{[0-9]+}}(%rsp) # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_ldmxcsr:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
@@ -2126,6 +2350,18 @@ define <4 x float> @test_maxps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
 ; SKX-NEXT:    vmaxps (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_maxps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    maxps %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    maxps (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_maxps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmaxps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vmaxps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_maxps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    maxps %xmm1, %xmm0 # sched: [2:1.00]
@@ -2235,6 +2471,18 @@ define <4 x float> @test_maxss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
 ; SKX-NEXT:    vmaxss (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_maxss:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    maxss %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    maxss (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_maxss:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmaxss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vmaxss (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_maxss:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    maxss %xmm1, %xmm0 # sched: [2:1.00]
@@ -2344,6 +2592,18 @@ define <4 x float> @test_minps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
 ; SKX-NEXT:    vminps (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_minps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    minps %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    minps (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_minps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vminps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vminps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_minps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    minps %xmm1, %xmm0 # sched: [2:1.00]
@@ -2453,6 +2713,18 @@ define <4 x float> @test_minss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
 ; SKX-NEXT:    vminss (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_minss:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    minss %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    minss (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_minss:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vminss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vminss (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_minss:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    minss %xmm1, %xmm0 # sched: [2:1.00]
@@ -2575,6 +2847,20 @@ define void @test_movaps(<4 x float> *%a0, <4 x float> *%a1) {
 ; SKX-NEXT:    vmovaps %xmm0, (%rsi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movaps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movaps (%rdi), %xmm0 # sched: [6:0.50]
+; BDVER2-SSE-NEXT:    addps %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    movaps %xmm0, (%rsi) # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_movaps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovaps (%rdi), %xmm0 # sched: [6:0.50]
+; BDVER2-NEXT:    vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vmovaps %xmm0, (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movaps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movaps (%rdi), %xmm0 # sched: [5:1.00]
@@ -2682,6 +2968,16 @@ define <4 x float> @test_movhlps(<4 x float> %a0, <4 x float> %a1) {
 ; SKX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movhlps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_movhlps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movhlps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:0.50]
@@ -2813,6 +3109,22 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2)
 ; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movhps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    movhps %xmm0, (%rdi) # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movaps %xmm1, %xmm0 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_movhps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vmovhpd %xmm0, (%rdi) # sched: [1:1.00]
+; BDVER2-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movhps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
@@ -2935,6 +3247,18 @@ define <4 x float> @test_movlhps(<4 x float> %a0, <4 x float> %a1) {
 ; SKX-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movlhps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_movlhps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
+; BDVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movlhps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50]
@@ -3069,6 +3393,22 @@ define <4 x float> @test_movlps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2)
 ; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movlps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [7:1.00]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    movlps %xmm0, (%rdi) # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movaps %xmm1, %xmm0 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_movlps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [7:1.00]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vmovlps %xmm0, (%rdi) # sched: [1:1.00]
+; BDVER2-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movlps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [6:1.00]
@@ -3178,6 +3518,16 @@ define i32 @test_movmskps(<4 x float> %a0) {
 ; SKX-NEXT:    vmovmskps %xmm0, %eax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movmskps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movmskps %xmm0, %eax # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_movmskps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovmskps %xmm0, %eax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movmskps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movmskps %xmm0, %eax # sched: [3:1.00]
@@ -3274,6 +3624,16 @@ define void @test_movntps(<4 x float> %a0, <4 x float> *%a1) {
 ; SKX-NEXT:    vmovntps %xmm0, (%rdi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movntps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movntps %xmm0, (%rdi) # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_movntps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovntps %xmm0, (%rdi) # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movntps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movntps %xmm0, (%rdi) # sched: [3:1.00]
@@ -3389,6 +3749,20 @@ define void @test_movss_mem(float* %a0, float* %a1) {
 ; SKX-NEXT:    vmovss %xmm0, (%rsi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movss_mem:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [6:0.50]
+; BDVER2-SSE-NEXT:    addss %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    movss %xmm0, (%rsi) # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_movss_mem:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [6:0.50]
+; BDVER2-NEXT:    vaddss %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vmovss %xmm0, (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movss_mem:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:1.00]
@@ -3494,6 +3868,16 @@ define <4 x float> @test_movss_reg(<4 x float> %a0, <4 x float> %a1) {
 ; SKX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movss_reg:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_movss_reg:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movss_reg:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:0.50]
@@ -3609,6 +3993,20 @@ define void @test_movups(<4 x float> *%a0, <4 x float> *%a1) {
 ; SKX-NEXT:    vmovups %xmm0, (%rsi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movups:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movups (%rdi), %xmm0 # sched: [6:0.50]
+; BDVER2-SSE-NEXT:    addps %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    movups %xmm0, (%rsi) # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_movups:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovups (%rdi), %xmm0 # sched: [6:0.50]
+; BDVER2-NEXT:    vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vmovups %xmm0, (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movups:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movups (%rdi), %xmm0 # sched: [5:1.00]
@@ -3721,6 +4119,18 @@ define <4 x float> @test_mulps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
 ; SKX-NEXT:    vmulps (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_mulps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    mulps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    mulps (%rdi), %xmm0 # sched: [11:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_mulps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vmulps (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_mulps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    mulps %xmm1, %xmm0 # sched: [2:1.00]
@@ -3829,6 +4239,18 @@ define float @test_mulss(float %a0, float %a1, float *%a2) {
 ; SKX-NEXT:    vmulss (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_mulss:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    mulss %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    mulss (%rdi), %xmm0 # sched: [11:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_mulss:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vmulss (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_mulss:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    mulss %xmm1, %xmm0 # sched: [2:1.00]
@@ -3941,6 +4363,18 @@ define <4 x float> @test_orps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2
 ; SKX-NEXT:    vorps (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_orps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    orps %xmm1, %xmm0 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    orps (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_orps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vorps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; BDVER2-NEXT:    vorps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_orps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    orps %xmm1, %xmm0 # sched: [1:0.50]
@@ -4105,6 +4539,26 @@ define void @test_prefetch(i8* %a0) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_prefetch:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    #APP
+; BDVER2-SSE-NEXT:    prefetchnta (%rdi) # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    prefetcht0 (%rdi) # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    prefetcht1 (%rdi) # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    prefetcht2 (%rdi) # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    #NO_APP
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_prefetch:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    prefetchnta (%rdi) # sched: [5:0.50]
+; BDVER2-NEXT:    prefetcht0 (%rdi) # sched: [5:0.50]
+; BDVER2-NEXT:    prefetcht1 (%rdi) # sched: [5:0.50]
+; BDVER2-NEXT:    prefetcht2 (%rdi) # sched: [5:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_prefetch:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    #APP
@@ -4242,6 +4696,20 @@ define <4 x float> @test_rcpps(<4 x float> %a0, <4 x float> *%a1) {
 ; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_rcpps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    rcpps %xmm0, %xmm1 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    rcpps (%rdi), %xmm0 # sched: [11:1.00]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_rcpps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpps %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vrcpps (%rdi), %xmm1 # sched: [11:1.00]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_rcpps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    rcpps %xmm0, %xmm1 # sched: [2:1.00]
@@ -4384,6 +4852,22 @@ define <4 x float> @test_rcpss(float %a0, float *%a1) {
 ; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_rcpss:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    rcpss %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50]
+; BDVER2-SSE-NEXT:    rcpss %xmm1, %xmm1 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_rcpss:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50]
+; BDVER2-NEXT:    vrcpss %xmm1, %xmm1, %xmm1 # sched: [5:1.00]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_rcpss:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:1.00]
@@ -4519,6 +5003,20 @@ define <4 x float> @test_rsqrtps(<4 x float> %a0, <4 x float> *%a1) {
 ; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_rsqrtps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    rsqrtps %xmm0, %xmm1 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    rsqrtps (%rdi), %xmm0 # sched: [11:1.00]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_rsqrtps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrsqrtps %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vrsqrtps (%rdi), %xmm1 # sched: [11:1.00]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_rsqrtps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    rsqrtps %xmm0, %xmm1 # sched: [2:1.00]
@@ -4661,6 +5159,22 @@ define <4 x float> @test_rsqrtss(float %a0, float *%a1) {
 ; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_rsqrtss:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    rsqrtss %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50]
+; BDVER2-SSE-NEXT:    rsqrtss %xmm1, %xmm1 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_rsqrtss:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50]
+; BDVER2-NEXT:    vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [5:1.00]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_rsqrtss:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:1.00]
@@ -4774,6 +5288,16 @@ define void @test_sfence() {
 ; SKX-NEXT:    sfence # sched: [2:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_sfence:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    sfence # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_sfence:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    sfence # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_sfence:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    sfence # sched: [1:1.00]
@@ -4890,6 +5414,20 @@ define <4 x float> @test_shufps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%
 ; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_shufps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [1:1.00]
+; BDVER2-SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],mem[0,0] sched: [7:1.00]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_shufps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [1:1.00]
+; BDVER2-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,3],mem[0,0] sched: [7:1.00]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_shufps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [1:0.50]
@@ -5017,6 +5555,20 @@ define <4 x float> @test_sqrtps(<4 x float> %a0, <4 x float> *%a1) {
 ; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_sqrtps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    sqrtps %xmm0, %xmm1 # sched: [14:14.00]
+; BDVER2-SSE-NEXT:    sqrtps (%rdi), %xmm0 # sched: [20:14.00]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_sqrtps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vsqrtps %xmm0, %xmm0 # sched: [14:14.00]
+; BDVER2-NEXT:    vsqrtps (%rdi), %xmm1 # sched: [20:14.00]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_sqrtps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    sqrtps %xmm0, %xmm1 # sched: [21:21.00]
@@ -5159,6 +5711,22 @@ define <4 x float> @test_sqrtss(<4 x float> %a0, <4 x float> *%a1) {
 ; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_sqrtss:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    sqrtss %xmm0, %xmm0 # sched: [14:14.00]
+; BDVER2-SSE-NEXT:    movaps (%rdi), %xmm1 # sched: [6:0.50]
+; BDVER2-SSE-NEXT:    sqrtss %xmm1, %xmm1 # sched: [14:14.00]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_sqrtss:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 # sched: [14:14.00]
+; BDVER2-NEXT:    vmovaps (%rdi), %xmm1 # sched: [6:0.50]
+; BDVER2-NEXT:    vsqrtss %xmm1, %xmm1, %xmm1 # sched: [14:14.00]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_sqrtss:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movaps (%rdi), %xmm1 # sched: [5:1.00]
@@ -5277,6 +5845,18 @@ define i32 @test_stmxcsr() {
 ; SKX-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # sched: [5:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_stmxcsr:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    stmxcsr -{{[0-9]+}}(%rsp) # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_stmxcsr:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vstmxcsr -{{[0-9]+}}(%rsp) # sched: [5:1.00]
+; BDVER2-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_stmxcsr:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    stmxcsr -{{[0-9]+}}(%rsp) # sched: [1:1.00]
@@ -5387,6 +5967,18 @@ define <4 x float> @test_subps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
 ; SKX-NEXT:    vsubps (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_subps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    subps %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    subps (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_subps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vsubps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_subps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    subps %xmm1, %xmm0 # sched: [3:1.00]
@@ -5495,6 +6087,18 @@ define float @test_subss(float %a0, float %a1, float *%a2) {
 ; SKX-NEXT:    vsubss (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_subss:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    subss %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    subss (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_subss:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vsubss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vsubss (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_subss:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    subss %xmm1, %xmm0 # sched: [3:1.00]
@@ -5707,6 +6311,34 @@ define i32 @test_ucomiss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
 ; SKX-NEXT:    movzbl %dl, %eax # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_ucomiss:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    ucomiss %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    setnp %al # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    sete %cl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    andb %al, %cl # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    ucomiss (%rdi), %xmm0 # sched: [8:1.00]
+; BDVER2-SSE-NEXT:    setnp %al # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    sete %dl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    andb %al, %dl # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    orb %cl, %dl # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    movzbl %dl, %eax # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_ucomiss:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vucomiss %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    setnp %al # sched: [1:0.50]
+; BDVER2-NEXT:    sete %cl # sched: [1:0.50]
+; BDVER2-NEXT:    andb %al, %cl # sched: [1:0.33]
+; BDVER2-NEXT:    vucomiss (%rdi), %xmm0 # sched: [8:1.00]
+; BDVER2-NEXT:    setnp %al # sched: [1:0.50]
+; BDVER2-NEXT:    sete %dl # sched: [1:0.50]
+; BDVER2-NEXT:    andb %al, %dl # sched: [1:0.33]
+; BDVER2-NEXT:    orb %cl, %dl # sched: [1:0.33]
+; BDVER2-NEXT:    movzbl %dl, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_ucomiss:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    ucomiss %xmm1, %xmm0 # sched: [3:1.00]
@@ -5862,6 +6494,20 @@ define <4 x float> @test_unpckhps(<4 x float> %a0, <4 x float> %a1, <4 x float>
 ; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_unpckhps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
+; BDVER2-SSE-NEXT:    unpckhps {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [7:1.00]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_unpckhps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
+; BDVER2-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [7:1.00]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_unpckhps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50]
@@ -5988,6 +6634,20 @@ define <4 x float> @test_unpcklps(<4 x float> %a0, <4 x float> %a1, <4 x float>
 ; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_unpcklps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
+; BDVER2-SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [7:1.00]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_unpcklps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
+; BDVER2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [7:1.00]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_unpcklps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:0.50]
@@ -6105,6 +6765,18 @@ define <4 x float> @test_xorps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
 ; SKX-NEXT:    vxorps (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_xorps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    xorps %xmm1, %xmm0 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    xorps (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_xorps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vxorps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; BDVER2-NEXT:    vxorps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_xorps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    xorps %xmm1, %xmm0 # sched: [1:0.50]
@@ -6249,6 +6921,22 @@ define <4 x float> @test_fnop() nounwind {
 ; SKX-NEXT:    vxorps %xmm0, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_fnop:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    #APP
+; BDVER2-SSE-NEXT:    nop # sched: [1:0.25]
+; BDVER2-SSE-NEXT:    #NO_APP
+; BDVER2-SSE-NEXT:    xorps %xmm0, %xmm0 # sched: [0:0.25]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_fnop:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    nop # sched: [1:0.25]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    vxorps %xmm0, %xmm0, %xmm0 # sched: [0:0.25]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_fnop:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    xorps %xmm0, %xmm0 # sched: [0:0.50]
diff --git a/test/CodeGen/X86/sse2-schedule.ll b/test/CodeGen/X86/sse2-schedule.ll
index f66ccedc052..b6079121206 100644
--- a/test/CodeGen/X86/sse2-schedule.ll
+++ b/test/CodeGen/X86/sse2-schedule.ll
@@ -14,6 +14,8 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-sse3 | FileCheck %s --check-prefixes=CHECK,SKX-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=-sse3 | FileCheck %s --check-prefixes=CHECK,BDVER2-SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+avx,+xop -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-sse3 | FileCheck %s --check-prefixes=CHECK,BTVER2-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 -mattr=-sse3 | FileCheck %s --check-prefixes=CHECK,ZNVER1-SSE
@@ -98,6 +100,18 @@ define <2 x double> @test_addpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; SKX-NEXT:    vaddpd (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_addpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    addpd (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_addpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vaddpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_addpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [3:1.00]
@@ -206,6 +220,18 @@ define double @test_addsd(double %a0, double %a1, double *%a2) {
 ; SKX-NEXT:    vaddsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_addsd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    addsd %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    addsd (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_addsd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vaddsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_addsd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    addsd %xmm1, %xmm0 # sched: [3:1.00]
@@ -327,6 +353,20 @@ define <2 x double> @test_andpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; SKX-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_andpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    andpd %xmm1, %xmm0 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    andpd (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_andpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vandpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; BDVER2-NEXT:    vandpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER2-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_andpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    andpd %xmm1, %xmm0 # sched: [1:0.50]
@@ -457,6 +497,20 @@ define <2 x double> @test_andnotpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
 ; SKX-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_andnotpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    andnpd %xmm1, %xmm0 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    andnpd (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_andnotpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vandnpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; BDVER2-NEXT:    vandnpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER2-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_andnotpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    andnpd %xmm1, %xmm0 # sched: [1:0.50]
@@ -569,6 +623,16 @@ define void @test_clflush(i8* %p){
 ; SKX-NEXT:    clflush (%rdi) # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_clflush:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    clflush (%rdi) # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_clflush:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    clflush (%rdi) # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_clflush:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    clflush (%rdi) # sched: [5:1.00]
@@ -685,6 +749,20 @@ define <2 x double> @test_cmppd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; SKX-NEXT:    vorpd %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cmppd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cmpeqpd %xmm0, %xmm1 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    cmpeqpd (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    orpd %xmm1, %xmm0 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_cmppd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
+; BDVER2-NEXT:    vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    vorpd %xmm0, %xmm1, %xmm0 # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cmppd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cmpeqpd %xmm0, %xmm1 # sched: [2:1.00]
@@ -800,6 +878,18 @@ define double @test_cmpsd(double %a0, double %a1, double *%a2) {
 ; SKX-NEXT:    vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cmpsd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cmpeqsd %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    cmpeqsd (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_cmpsd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcmpeqsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cmpsd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cmpeqsd %xmm1, %xmm0 # sched: [2:1.00]
@@ -1017,6 +1107,34 @@ define i32 @test_comisd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
 ; SKX-NEXT:    movzbl %dl, %eax # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_comisd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    comisd %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    setnp %al # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    sete %cl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    andb %al, %cl # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    comisd (%rdi), %xmm0 # sched: [8:1.00]
+; BDVER2-SSE-NEXT:    setnp %al # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    sete %dl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    andb %al, %dl # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    orb %cl, %dl # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    movzbl %dl, %eax # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_comisd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcomisd %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    setnp %al # sched: [1:0.50]
+; BDVER2-NEXT:    sete %cl # sched: [1:0.50]
+; BDVER2-NEXT:    andb %al, %cl # sched: [1:0.33]
+; BDVER2-NEXT:    vcomisd (%rdi), %xmm0 # sched: [8:1.00]
+; BDVER2-NEXT:    setnp %al # sched: [1:0.50]
+; BDVER2-NEXT:    sete %dl # sched: [1:0.50]
+; BDVER2-NEXT:    andb %al, %dl # sched: [1:0.33]
+; BDVER2-NEXT:    orb %cl, %dl # sched: [1:0.33]
+; BDVER2-NEXT:    movzbl %dl, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_comisd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    comisd %xmm1, %xmm0 # sched: [3:1.00]
@@ -1174,6 +1292,20 @@ define <2 x double> @test_cvtdq2pd(<4 x i32> %a0, <4 x i32> *%a1) {
 ; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvtdq2pd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvtdq2pd %xmm0, %xmm1 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    cvtdq2pd (%rdi), %xmm0 # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_cvtdq2pd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtdq2pd %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vcvtdq2pd (%rdi), %xmm1 # sched: [10:1.00]
+; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvtdq2pd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvtdq2pd %xmm0, %xmm1 # sched: [3:1.00]
@@ -1303,6 +1435,20 @@ define <4 x float> @test_cvtdq2ps(<4 x i32> %a0, <4 x i32> *%a1) {
 ; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvtdq2ps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvtdq2ps %xmm0, %xmm1 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    cvtdq2ps (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_cvtdq2ps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vcvtdq2ps (%rdi), %xmm1 # sched: [9:1.00]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvtdq2ps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvtdq2ps %xmm0, %xmm1 # sched: [3:1.00]
@@ -1431,6 +1577,20 @@ define <4 x i32> @test_cvtpd2dq(<2 x double> %a0, <2 x double> *%a1) {
 ; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvtpd2dq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvtpd2dq %xmm0, %xmm1 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    cvtpd2dq (%rdi), %xmm0 # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_cvtpd2dq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtpd2dq %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vcvtpd2dqx (%rdi), %xmm1 # sched: [10:1.00]
+; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvtpd2dq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvtpd2dq %xmm0, %xmm1 # sched: [3:1.00]
@@ -1560,6 +1720,20 @@ define <4 x float> @test_cvtpd2ps(<2 x double> %a0, <2 x double> *%a1) {
 ; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvtpd2ps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvtpd2ps %xmm0, %xmm1 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    cvtpd2ps (%rdi), %xmm0 # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_cvtpd2ps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtpd2ps %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vcvtpd2psx (%rdi), %xmm1 # sched: [10:1.00]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvtpd2ps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvtpd2ps %xmm0, %xmm1 # sched: [3:1.00]
@@ -1688,6 +1862,20 @@ define <4 x i32> @test_cvtps2dq(<4 x float> %a0, <4 x float> *%a1) {
 ; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvtps2dq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvtps2dq %xmm0, %xmm1 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    cvtps2dq (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_cvtps2dq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtps2dq %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vcvtps2dq (%rdi), %xmm1 # sched: [9:1.00]
+; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvtps2dq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvtps2dq %xmm0, %xmm1 # sched: [3:1.00]
@@ -1816,6 +2004,20 @@ define <2 x double> @test_cvtps2pd(<4 x float> %a0, <4 x float> *%a1) {
 ; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvtps2pd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvtps2pd %xmm0, %xmm1 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    cvtps2pd (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_cvtps2pd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtps2pd %xmm0, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vcvtps2pd (%rdi), %xmm1 # sched: [7:1.00]
+; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvtps2pd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvtps2pd %xmm0, %xmm1 # sched: [2:1.00]
@@ -1944,6 +2146,20 @@ define i32 @test_cvtsd2si(double %a0, double *%a1) {
 ; SKX-NEXT:    addl %ecx, %eax # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvtsd2si:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvtsd2si %xmm0, %ecx # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    cvtsd2si (%rdi), %eax # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    addl %ecx, %eax # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_cvtsd2si:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtsd2si %xmm0, %ecx # sched: [5:1.00]
+; BDVER2-NEXT:    vcvtsd2si (%rdi), %eax # sched: [10:1.00]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvtsd2si:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvtsd2si (%rdi), %eax # sched: [12:1.00]
@@ -2073,6 +2289,20 @@ define i64 @test_cvtsd2siq(double %a0, double *%a1) {
 ; SKX-NEXT:    addq %rcx, %rax # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvtsd2siq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvtsd2si %xmm0, %rcx # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    cvtsd2si (%rdi), %rax # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    addq %rcx, %rax # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_cvtsd2siq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtsd2si %xmm0, %rcx # sched: [5:1.00]
+; BDVER2-NEXT:    vcvtsd2si (%rdi), %rax # sched: [10:1.00]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvtsd2siq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvtsd2si (%rdi), %rax # sched: [12:1.00]
@@ -2216,6 +2446,22 @@ define float @test_cvtsd2ss(double %a0, double *%a1) {
 ; SKX-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvtsd2ss:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvtsd2ss %xmm0, %xmm1 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero sched: [6:0.50]
+; BDVER2-SSE-NEXT:    cvtsd2ss %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    addss %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_cvtsd2ss:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [6:0.50]
+; BDVER2-NEXT:    vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [4:1.00]
+; BDVER2-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvtsd2ss:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvtsd2ss %xmm0, %xmm1 # sched: [7:2.00]
@@ -2346,6 +2592,20 @@ define double @test_cvtsi2sd(i32 %a0, i32 *%a1) {
 ; SKX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvtsi2sd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvtsi2sdl %edi, %xmm1 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    cvtsi2sdl (%rsi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    addsd %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_cvtsi2sd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtsi2sdl %edi, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [9:1.00]
+; BDVER2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvtsi2sd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvtsi2sdl (%rsi), %xmm0 # sched: [14:1.00]
@@ -2472,6 +2732,20 @@ define double @test_cvtsi2sdq(i64 %a0, i64 *%a1) {
 ; SKX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvtsi2sdq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvtsi2sdq %rdi, %xmm1 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    cvtsi2sdq (%rsi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    addsd %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_cvtsi2sdq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtsi2sdq %rdi, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [9:1.00]
+; BDVER2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvtsi2sdq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvtsi2sdq (%rsi), %xmm0 # sched: [14:1.00]
@@ -2614,6 +2888,22 @@ define double @test_cvtss2sd(float %a0, float *%a1) {
 ; SKX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvtss2sd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvtss2sd %xmm0, %xmm1 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [6:0.50]
+; BDVER2-SSE-NEXT:    cvtss2sd %xmm0, %xmm0 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    addsd %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_cvtss2sd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [1:1.00]
+; BDVER2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50]
+; BDVER2-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [1:1.00]
+; BDVER2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvtss2sd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvtss2sd %xmm0, %xmm1 # sched: [7:2.00]
@@ -2746,6 +3036,20 @@ define <4 x i32> @test_cvttpd2dq(<2 x double> %a0, <2 x double> *%a1) {
 ; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvttpd2dq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvttpd2dq %xmm0, %xmm1 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    cvttpd2dq (%rdi), %xmm0 # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_cvttpd2dq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvttpd2dq %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vcvttpd2dqx (%rdi), %xmm1 # sched: [10:1.00]
+; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvttpd2dq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvttpd2dq %xmm0, %xmm1 # sched: [3:1.00]
@@ -2875,6 +3179,20 @@ define <4 x i32> @test_cvttps2dq(<4 x float> %a0, <4 x float> *%a1) {
 ; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvttps2dq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvttps2dq %xmm0, %xmm1 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    cvttps2dq (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_cvttps2dq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvttps2dq %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vcvttps2dq (%rdi), %xmm1 # sched: [9:1.00]
+; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvttps2dq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvttps2dq %xmm0, %xmm1 # sched: [3:1.00]
@@ -3001,6 +3319,20 @@ define i32 @test_cvttsd2si(double %a0, double *%a1) {
 ; SKX-NEXT:    addl %ecx, %eax # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvttsd2si:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvttsd2si %xmm0, %ecx # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    cvttsd2si (%rdi), %eax # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    addl %ecx, %eax # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_cvttsd2si:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvttsd2si %xmm0, %ecx # sched: [5:1.00]
+; BDVER2-NEXT:    vcvttsd2si (%rdi), %eax # sched: [10:1.00]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvttsd2si:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvttsd2si (%rdi), %eax # sched: [12:1.00]
@@ -3127,6 +3459,20 @@ define i64 @test_cvttsd2siq(double %a0, double *%a1) {
 ; SKX-NEXT:    addq %rcx, %rax # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvttsd2siq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvttsd2si %xmm0, %rcx # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    cvttsd2si (%rdi), %rax # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    addq %rcx, %rax # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_cvttsd2siq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvttsd2si %xmm0, %rcx # sched: [5:1.00]
+; BDVER2-NEXT:    vcvttsd2si (%rdi), %rax # sched: [10:1.00]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvttsd2siq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvttsd2si (%rdi), %rax # sched: [12:1.00]
@@ -3240,6 +3586,18 @@ define <2 x double> @test_divpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; SKX-NEXT:    vdivpd (%rdi), %xmm0, %xmm0 # sched: [20:4.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_divpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    divpd %xmm1, %xmm0 # sched: [22:22.00]
+; BDVER2-SSE-NEXT:    divpd (%rdi), %xmm0 # sched: [28:22.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_divpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vdivpd %xmm1, %xmm0, %xmm0 # sched: [22:22.00]
+; BDVER2-NEXT:    vdivpd (%rdi), %xmm0, %xmm0 # sched: [28:22.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_divpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    divpd %xmm1, %xmm0 # sched: [19:19.00]
@@ -3348,6 +3706,18 @@ define double @test_divsd(double %a0, double %a1, double *%a2) {
 ; SKX-NEXT:    vdivsd (%rdi), %xmm0, %xmm0 # sched: [19:4.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_divsd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    divsd %xmm1, %xmm0 # sched: [22:22.00]
+; BDVER2-SSE-NEXT:    divsd (%rdi), %xmm0 # sched: [28:22.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_divsd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vdivsd %xmm1, %xmm0, %xmm0 # sched: [22:22.00]
+; BDVER2-NEXT:    vdivsd (%rdi), %xmm0, %xmm0 # sched: [28:22.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_divsd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    divsd %xmm1, %xmm0 # sched: [19:19.00]
@@ -3449,6 +3819,16 @@ define void @test_lfence() {
 ; SKX-NEXT:    lfence # sched: [2:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_lfence:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    lfence # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_lfence:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    lfence # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_lfence:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    lfence # sched: [1:1.00]
@@ -3545,6 +3925,16 @@ define void @test_mfence() {
 ; SKX-NEXT:    mfence # sched: [3:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_mfence:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    mfence # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_mfence:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    mfence # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_mfence:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    mfence # sched: [1:1.00]
@@ -3639,6 +4029,16 @@ define void @test_maskmovdqu(<16 x i8> %a0, <16 x i8> %a1, i8* %a2) {
 ; SKX-NEXT:    vmaskmovdqu %xmm1, %xmm0 # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_maskmovdqu:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    maskmovdqu %xmm1, %xmm0 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_maskmovdqu:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmaskmovdqu %xmm1, %xmm0 # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_maskmovdqu:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    maskmovdqu %xmm1, %xmm0 # sched: [1:1.00]
@@ -3742,6 +4142,18 @@ define <2 x double> @test_maxpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; SKX-NEXT:    vmaxpd (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_maxpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    maxpd %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    maxpd (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_maxpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vmaxpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_maxpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    maxpd %xmm1, %xmm0 # sched: [2:1.00]
@@ -3851,6 +4263,18 @@ define <2 x double> @test_maxsd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; SKX-NEXT:    vmaxsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_maxsd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    maxsd %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    maxsd (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_maxsd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vmaxsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_maxsd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    maxsd %xmm1, %xmm0 # sched: [2:1.00]
@@ -3960,6 +4384,18 @@ define <2 x double> @test_minpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; SKX-NEXT:    vminpd (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_minpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    minpd %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    minpd (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_minpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vminpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vminpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_minpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    minpd %xmm1, %xmm0 # sched: [2:1.00]
@@ -4069,6 +4505,18 @@ define <2 x double> @test_minsd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; SKX-NEXT:    vminsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_minsd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    minsd %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    minsd (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_minsd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vminsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vminsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_minsd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    minsd %xmm1, %xmm0 # sched: [2:1.00]
@@ -4191,6 +4639,20 @@ define void @test_movapd(<2 x double> *%a0, <2 x double> *%a1) {
 ; SKX-NEXT:    vmovapd %xmm0, (%rsi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movapd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movapd (%rdi), %xmm0 # sched: [6:0.50]
+; BDVER2-SSE-NEXT:    addpd %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    movapd %xmm0, (%rsi) # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_movapd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovapd (%rdi), %xmm0 # sched: [6:0.50]
+; BDVER2-NEXT:    vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vmovapd %xmm0, (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movapd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movapd (%rdi), %xmm0 # sched: [5:1.00]
@@ -4316,6 +4778,20 @@ define void @test_movdqa(<2 x i64> *%a0, <2 x i64> *%a1) {
 ; SKX-NEXT:    vmovdqa %xmm0, (%rsi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movdqa:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movdqa (%rdi), %xmm0 # sched: [6:0.50]
+; BDVER2-SSE-NEXT:    paddq %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    movdqa %xmm0, (%rsi) # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_movdqa:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [6:0.50]
+; BDVER2-NEXT:    vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vmovdqa %xmm0, (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movdqa:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movdqa (%rdi), %xmm0 # sched: [5:1.00]
@@ -4441,6 +4917,20 @@ define void @test_movdqu(<2 x i64> *%a0, <2 x i64> *%a1) {
 ; SKX-NEXT:    vmovdqu %xmm0, (%rsi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movdqu:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movdqu (%rdi), %xmm0 # sched: [6:0.50]
+; BDVER2-SSE-NEXT:    paddq %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    movdqu %xmm0, (%rsi) # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_movdqu:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovdqu (%rdi), %xmm0 # sched: [6:0.50]
+; BDVER2-NEXT:    vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vmovdqu %xmm0, (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movdqu:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movdqu (%rdi), %xmm0 # sched: [5:1.00]
@@ -4605,6 +5095,26 @@ define i32 @test_movd(<4 x i32> %a0, i32 %a1, i32 *%a2) {
 ; SKX-NEXT:    vmovd %xmm1, (%rsi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movd %edi, %xmm1 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
+; BDVER2-SSE-NEXT:    paddd %xmm0, %xmm1 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    paddd %xmm0, %xmm2 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    movd %xmm2, %eax # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    movd %xmm1, (%rsi) # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_movd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovd %edi, %xmm1 # sched: [1:1.00]
+; BDVER2-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
+; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
+; BDVER2-NEXT:    vpaddd %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vmovd %xmm0, %eax # sched: [2:1.00]
+; BDVER2-NEXT:    vmovd %xmm1, (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:1.00]
@@ -4786,6 +5296,26 @@ define i64 @test_movd_64(<2 x i64> %a0, i64 %a1, i64 *%a2) {
 ; SKX-NEXT:    vmovq %xmm1, (%rsi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movd_64:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movq %rdi, %xmm1 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero sched: [6:0.50]
+; BDVER2-SSE-NEXT:    paddq %xmm0, %xmm1 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    paddq %xmm0, %xmm2 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    movq %xmm2, %rax # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    movq %xmm1, (%rsi) # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_movd_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovq %rdi, %xmm1 # sched: [1:1.00]
+; BDVER2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero sched: [6:0.50]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
+; BDVER2-NEXT:    vpaddq %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vmovq %xmm0, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    vmovq %xmm1, (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movd_64:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero sched: [5:1.00]
@@ -4942,6 +5472,22 @@ define <2 x double> @test_movhpd(<2 x double> %a0, <2 x double> %a1, x86_mmx *%a
 ; SKX-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movhpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    movhpd %xmm0, (%rdi) # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movapd %xmm1, %xmm0 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_movhpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
+; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vmovhpd %xmm0, (%rdi) # sched: [1:1.00]
+; BDVER2-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movhpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
@@ -5088,6 +5634,22 @@ define <2 x double> @test_movlpd(<2 x double> %a0, <2 x double> %a1, x86_mmx *%a
 ; SKX-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movlpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [7:1.00]
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    movlpd %xmm0, (%rdi) # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movapd %xmm1, %xmm0 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_movlpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [7:1.00]
+; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vmovlpd %xmm0, (%rdi) # sched: [1:1.00]
+; BDVER2-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movlpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [6:1.00]
@@ -5196,6 +5758,16 @@ define i32 @test_movmskpd(<2 x double> %a0) {
 ; SKX-NEXT:    vmovmskpd %xmm0, %eax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movmskpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movmskpd %xmm0, %eax # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_movmskpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovmskpd %xmm0, %eax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movmskpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movmskpd %xmm0, %eax # sched: [3:1.00]
@@ -5301,6 +5873,18 @@ define void @test_movntdqa(<2 x i64> %a0, <2 x i64> *%a1) {
 ; SKX-NEXT:    vmovntdq %xmm0, (%rdi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movntdqa:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    paddq %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    movntdq %xmm0, (%rdi) # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_movntdqa:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vmovntdq %xmm0, (%rdi) # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movntdqa:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    paddq %xmm0, %xmm0 # sched: [1:0.50]
@@ -5408,6 +5992,18 @@ define void @test_movntpd(<2 x double> %a0, <2 x double> *%a1) {
 ; SKX-NEXT:    vmovntpd %xmm0, (%rdi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movntpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    addpd %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    movntpd %xmm0, (%rdi) # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_movntpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vmovntpd %xmm0, (%rdi) # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movntpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    addpd %xmm0, %xmm0 # sched: [3:1.00]
@@ -5528,6 +6124,20 @@ define <2 x i64> @test_movq_mem(<2 x i64> %a0, i64 *%a1) {
 ; SKX-NEXT:    vmovq %xmm0, (%rdi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movq_mem:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero sched: [6:0.50]
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    movq %xmm0, (%rdi) # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_movq_mem:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero sched: [6:0.50]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vmovq %xmm0, (%rdi) # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movq_mem:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero sched: [5:1.00]
@@ -5644,6 +6254,18 @@ define <2 x i64> @test_movq_reg(<2 x i64> %a0, <2 x i64> %a1) {
 ; SKX-NEXT:    vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movq_reg:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.33]
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_movq_reg:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.33]
+; BDVER2-NEXT:    vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movq_reg:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.50]
@@ -5764,6 +6386,20 @@ define void @test_movsd_mem(double* %a0, double* %a1) {
 ; SKX-NEXT:    vmovsd %xmm0, (%rsi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movsd_mem:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero sched: [6:0.50]
+; BDVER2-SSE-NEXT:    addsd %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    movsd %xmm0, (%rsi) # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_movsd_mem:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [6:0.50]
+; BDVER2-NEXT:    vaddsd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vmovsd %xmm0, (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movsd_mem:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero sched: [5:1.00]
@@ -5875,6 +6511,17 @@ define <2 x double> @test_movsd_reg(<2 x double> %a0, <2 x double> %a1) {
 ; SKX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movsd_reg:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movaps %xmm1, %xmm0 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_movsd_reg:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movsd_reg:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] sched: [1:0.50]
@@ -5992,6 +6639,20 @@ define void @test_movupd(<2 x double> *%a0, <2 x double> *%a1) {
 ; SKX-NEXT:    vmovupd %xmm0, (%rsi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movupd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movupd (%rdi), %xmm0 # sched: [6:0.50]
+; BDVER2-SSE-NEXT:    addpd %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    movupd %xmm0, (%rsi) # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_movupd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovupd (%rdi), %xmm0 # sched: [6:0.50]
+; BDVER2-NEXT:    vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vmovupd %xmm0, (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movupd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movupd (%rdi), %xmm0 # sched: [5:1.00]
@@ -6104,6 +6765,18 @@ define <2 x double> @test_mulpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; SKX-NEXT:    vmulpd (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_mulpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    mulpd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    mulpd (%rdi), %xmm0 # sched: [11:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_mulpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmulpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vmulpd (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_mulpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    mulpd %xmm1, %xmm0 # sched: [4:2.00]
@@ -6212,6 +6885,18 @@ define double @test_mulsd(double %a0, double %a1, double *%a2) {
 ; SKX-NEXT:    vmulsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_mulsd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    mulsd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    mulsd (%rdi), %xmm0 # sched: [11:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_mulsd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmulsd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vmulsd (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_mulsd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    mulsd %xmm1, %xmm0 # sched: [4:2.00]
@@ -6333,6 +7018,20 @@ define <2 x double> @test_orpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; SKX-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_orpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    orpd %xmm1, %xmm0 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    orpd (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_orpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vorpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; BDVER2-NEXT:    vorpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER2-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_orpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    orpd %xmm1, %xmm0 # sched: [1:0.50]
@@ -6454,6 +7153,18 @@ define <8 x i16> @test_packssdw(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SKX-NEXT:    vpackssdw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_packssdw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    packssdw %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    packssdw (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_packssdw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpackssdw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_packssdw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    packssdw %xmm1, %xmm0 # sched: [1:0.50]
@@ -6568,6 +7279,18 @@ define <16 x i8> @test_packsswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpacksswb (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_packsswb:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    packsswb %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    packsswb (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_packsswb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpacksswb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_packsswb:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    packsswb %xmm1, %xmm0 # sched: [1:0.50]
@@ -6682,6 +7405,18 @@ define <16 x i8> @test_packuswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpackuswb (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_packuswb:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    packuswb %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    packuswb (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_packuswb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpackuswb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_packuswb:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    packuswb %xmm1, %xmm0 # sched: [1:0.50]
@@ -6796,6 +7531,18 @@ define <16 x i8> @test_paddb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vpaddb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_paddb:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    paddb %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    paddb (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_paddb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpaddb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_paddb:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    paddb %xmm1, %xmm0 # sched: [1:0.50]
@@ -6908,6 +7655,18 @@ define <4 x i32> @test_paddd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SKX-NEXT:    vpaddd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_paddd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    paddd (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_paddd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpaddd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_paddd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [1:0.50]
@@ -7016,6 +7775,18 @@ define <2 x i64> @test_paddq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; SKX-NEXT:    vpaddq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_paddq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    paddq (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_paddq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpaddq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_paddq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [1:0.50]
@@ -7128,6 +7899,18 @@ define <16 x i8> @test_paddsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vpaddsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_paddsb:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    paddsb %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    paddsb (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_paddsb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpaddsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpaddsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_paddsb:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    paddsb %xmm1, %xmm0 # sched: [1:0.50]
@@ -7241,6 +8024,18 @@ define <8 x i16> @test_paddsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpaddsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_paddsw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    paddsw %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    paddsw (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_paddsw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpaddsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_paddsw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    paddsw %xmm1, %xmm0 # sched: [1:0.50]
@@ -7354,6 +8149,18 @@ define <16 x i8> @test_paddusb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vpaddusb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_paddusb:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    paddusb %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    paddusb (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_paddusb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpaddusb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_paddusb:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    paddusb %xmm1, %xmm0 # sched: [1:0.50]
@@ -7467,6 +8274,18 @@ define <8 x i16> @test_paddusw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpaddusw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_paddusw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    paddusw %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    paddusw (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_paddusw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpaddusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpaddusw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_paddusw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    paddusw %xmm1, %xmm0 # sched: [1:0.50]
@@ -7580,6 +8399,18 @@ define <8 x i16> @test_paddw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpaddw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_paddw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    paddw %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    paddw (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_paddw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpaddw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_paddw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    paddw %xmm1, %xmm0 # sched: [1:0.50]
@@ -7701,6 +8532,20 @@ define <2 x i64> @test_pand(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pand:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pand %xmm1, %xmm0 # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    pand (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pand:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; BDVER2-NEXT:    vpand (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pand:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pand %xmm1, %xmm0 # sched: [1:0.50]
@@ -7843,6 +8688,22 @@ define <2 x i64> @test_pandn(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pandn:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pandn %xmm1, %xmm0 # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    movdqa %xmm0, %xmm1 # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    pandn (%rdi), %xmm1 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddq %xmm0, %xmm1 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    movdqa %xmm1, %xmm0 # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pandn:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpandn %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; BDVER2-NEXT:    vpandn (%rdi), %xmm0, %xmm1 # sched: [7:0.50]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pandn:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pandn %xmm1, %xmm0 # sched: [1:0.50]
@@ -7966,6 +8827,18 @@ define <16 x i8> @test_pavgb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vpavgb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pavgb:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pavgb %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pavgb (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pavgb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpavgb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpavgb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pavgb:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pavgb %xmm1, %xmm0 # sched: [1:0.50]
@@ -8088,6 +8961,18 @@ define <8 x i16> @test_pavgw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpavgw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pavgw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pavgw %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pavgw (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pavgw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpavgw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpavgw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pavgw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pavgw %xmm1, %xmm0 # sched: [1:0.50]
@@ -8221,6 +9106,20 @@ define <16 x i8> @test_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pcmpeqb:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pcmpeqb %xmm0, %xmm1 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pcmpeqb (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    por %xmm1, %xmm0 # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pcmpeqb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpcomeqb %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
+; BDVER2-NEXT:    vpcomeqb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pcmpeqb:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pcmpeqb %xmm0, %xmm1 # sched: [1:0.50]
@@ -8350,6 +9249,20 @@ define <4 x i32> @test_pcmpeqd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SKX-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pcmpeqd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pcmpeqd %xmm0, %xmm1 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pcmpeqd (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    por %xmm1, %xmm0 # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pcmpeqd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpcomeqd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
+; BDVER2-NEXT:    vpcomeqd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pcmpeqd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pcmpeqd %xmm0, %xmm1 # sched: [1:0.50]
@@ -8479,6 +9392,20 @@ define <8 x i16> @test_pcmpeqw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pcmpeqw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pcmpeqw %xmm0, %xmm1 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pcmpeqw (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    por %xmm1, %xmm0 # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pcmpeqw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpcomeqw %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
+; BDVER2-NEXT:    vpcomeqw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pcmpeqw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pcmpeqw %xmm0, %xmm1 # sched: [1:0.50]
@@ -8614,6 +9541,21 @@ define <16 x i8> @test_pcmpgtb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pcmpgtb:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movdqa %xmm0, %xmm2 # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    pcmpgtb %xmm1, %xmm2 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pcmpgtb (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    por %xmm2, %xmm0 # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pcmpgtb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpcomgtb %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
+; BDVER2-NEXT:    vpcomgtb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pcmpgtb:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movdqa %xmm0, %xmm2 # sched: [1:0.50]
@@ -8751,6 +9693,21 @@ define <4 x i32> @test_pcmpgtd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SKX-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pcmpgtd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movdqa %xmm0, %xmm2 # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    pcmpgtd %xmm1, %xmm2 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pcmpeqd (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    por %xmm2, %xmm0 # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pcmpgtd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpcomgtd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
+; BDVER2-NEXT:    vpcomeqd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pcmpgtd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movdqa %xmm0, %xmm2 # sched: [1:0.50]
@@ -8888,6 +9845,21 @@ define <8 x i16> @test_pcmpgtw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pcmpgtw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movdqa %xmm0, %xmm2 # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    pcmpgtw %xmm1, %xmm2 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pcmpgtw (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    por %xmm2, %xmm0 # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pcmpgtw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpcomgtw %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
+; BDVER2-NEXT:    vpcomgtw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pcmpgtw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movdqa %xmm0, %xmm2 # sched: [1:0.50]
@@ -9004,6 +9976,18 @@ define i16 @test_pextrw(<8 x i16> %a0) {
 ; SKX-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pextrw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pextrw $6, %xmm0, %eax # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pextrw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpextrw $6, %xmm0, %eax # sched: [3:1.00]
+; BDVER2-NEXT:    # kill: def $ax killed $ax killed $eax
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pextrw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pextrw $6, %xmm0, %eax # sched: [3:1.00]
@@ -9114,6 +10098,18 @@ define <8 x i16> @test_pinsrw(<8 x i16> %a0, i16 %a1, i16 *%a2) {
 ; SKX-NEXT:    vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pinsrw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pinsrw $1, %edi, %xmm0 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    pinsrw $3, (%rsi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pinsrw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pinsrw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pinsrw $1, %edi, %xmm0 # sched: [7:0.50]
@@ -9222,6 +10218,18 @@ define <4 x i32> @test_pmaddwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmaddwd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmaddwd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    pmaddwd (%rdi), %xmm0 # sched: [11:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pmaddwd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmaddwd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmaddwd %xmm1, %xmm0 # sched: [2:1.00]
@@ -9336,6 +10344,18 @@ define <8 x i16> @test_pmaxsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmaxsw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmaxsw %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pmaxsw (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pmaxsw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmaxsw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmaxsw %xmm1, %xmm0 # sched: [1:0.50]
@@ -9449,6 +10469,18 @@ define <16 x i8> @test_pmaxub(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vpmaxub (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmaxub:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmaxub %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pmaxub (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pmaxub:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpmaxub (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmaxub:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmaxub %xmm1, %xmm0 # sched: [1:0.50]
@@ -9562,6 +10594,18 @@ define <8 x i16> @test_pminsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpminsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pminsw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pminsw %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pminsw (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pminsw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpminsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpminsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pminsw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pminsw %xmm1, %xmm0 # sched: [1:0.50]
@@ -9675,6 +10719,18 @@ define <16 x i8> @test_pminub(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vpminub (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pminub:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pminub %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pminub (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pminub:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpminub %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpminub (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pminub:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pminub %xmm1, %xmm0 # sched: [1:0.50]
@@ -9773,6 +10829,16 @@ define i32 @test_pmovmskb(<16 x i8> %a0) {
 ; SKX-NEXT:    vpmovmskb %xmm0, %eax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmovmskb:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmovmskb %xmm0, %eax # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pmovmskb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmovmskb %xmm0, %eax # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmovmskb:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmovmskb %xmm0, %eax # sched: [3:1.00]
@@ -9876,6 +10942,18 @@ define <8 x i16> @test_pmulhuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmulhuw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmulhuw %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    pmulhuw (%rdi), %xmm0 # sched: [11:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pmulhuw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmulhuw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmulhuw %xmm1, %xmm0 # sched: [2:1.00]
@@ -9985,6 +11063,18 @@ define <8 x i16> @test_pmulhw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpmulhw (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmulhw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmulhw %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    pmulhw (%rdi), %xmm0 # sched: [11:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pmulhw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vpmulhw (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmulhw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmulhw %xmm1, %xmm0 # sched: [2:1.00]
@@ -10094,6 +11184,18 @@ define <8 x i16> @test_pmullw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpmullw (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmullw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmullw %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    pmullw (%rdi), %xmm0 # sched: [11:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pmullw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vpmullw (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmullw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmullw %xmm1, %xmm0 # sched: [2:1.00]
@@ -10202,6 +11304,18 @@ define <2 x i64> @test_pmuludq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SKX-NEXT:    vpmuludq (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmuludq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmuludq %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    pmuludq (%rdi), %xmm0 # sched: [11:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pmuludq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vpmuludq (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmuludq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmuludq %xmm1, %xmm0 # sched: [2:1.00]
@@ -10325,6 +11439,20 @@ define <2 x i64> @test_por(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_por:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    por %xmm1, %xmm0 # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    por (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_por:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; BDVER2-NEXT:    vpor (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_por:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    por %xmm1, %xmm0 # sched: [1:0.50]
@@ -10438,6 +11566,18 @@ define <2 x i64> @test_psadbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vpsadbw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_psadbw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    psadbw %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    psadbw (%rdi), %xmm0 # sched: [11:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_psadbw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vpsadbw (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_psadbw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    psadbw %xmm1, %xmm0 # sched: [2:0.50]
@@ -10564,6 +11704,20 @@ define <4 x i32> @test_pshufd(<4 x i32> %a0, <4 x i32> *%a1) {
 ; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pshufd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[3,2,1,0] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pshufd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [1:0.50]
+; BDVER2-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [7:0.50]
+; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pshufd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] sched: [1:0.50]
@@ -10693,6 +11847,20 @@ define <8 x i16> @test_pshufhw(<8 x i16> %a0, <8 x i16> *%a1) {
 ; SKX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pshufhw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,5,4,7,6] sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,6,5,4] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddw %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pshufhw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:0.50]
+; BDVER2-NEXT:    vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [7:0.50]
+; BDVER2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pshufhw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,5,4,7,6] sched: [1:0.50]
@@ -10822,6 +11990,20 @@ define <8 x i16> @test_pshuflw(<8 x i16> %a0, <8 x i16> *%a1) {
 ; SKX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pshuflw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[1,0,3,2,4,5,6,7] sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = mem[3,2,1,0,4,5,6,7] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddw %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pshuflw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [1:0.50]
+; BDVER2-NEXT:    vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [7:0.50]
+; BDVER2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pshuflw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[1,0,3,2,4,5,6,7] sched: [1:0.50]
@@ -10948,6 +12130,20 @@ define <4 x i32> @test_pslld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SKX-NEXT:    vpslld $2, %xmm0, %xmm0 # sched: [1:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pslld:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pslld %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    pslld (%rdi), %xmm0 # sched: [8:1.00]
+; BDVER2-SSE-NEXT:    pslld $2, %xmm0 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pslld:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpslld %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vpslld (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BDVER2-NEXT:    vpslld $2, %xmm0, %xmm0 # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pslld:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pslld %xmm1, %xmm0 # sched: [1:0.50]
@@ -11056,6 +12252,16 @@ define <4 x i32> @test_pslldq(<4 x i32> %a0) {
 ; SKX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pslldq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pslldq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pslldq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:0.50]
@@ -11171,6 +12377,20 @@ define <2 x i64> @test_psllq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; SKX-NEXT:    vpsllq $2, %xmm0, %xmm0 # sched: [1:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_psllq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    psllq %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    psllq (%rdi), %xmm0 # sched: [8:1.00]
+; BDVER2-SSE-NEXT:    psllq $2, %xmm0 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_psllq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpsllq %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vpsllq (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BDVER2-NEXT:    vpsllq $2, %xmm0, %xmm0 # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_psllq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    psllq %xmm1, %xmm0 # sched: [1:0.50]
@@ -11299,6 +12519,20 @@ define <8 x i16> @test_psllw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpsllw $2, %xmm0, %xmm0 # sched: [1:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_psllw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    psllw %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    psllw (%rdi), %xmm0 # sched: [8:1.00]
+; BDVER2-SSE-NEXT:    psllw $2, %xmm0 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_psllw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpsllw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vpsllw (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BDVER2-NEXT:    vpsllw $2, %xmm0, %xmm0 # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_psllw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    psllw %xmm1, %xmm0 # sched: [1:0.50]
@@ -11427,6 +12661,20 @@ define <4 x i32> @test_psrad(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SKX-NEXT:    vpsrad $2, %xmm0, %xmm0 # sched: [1:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_psrad:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    psrad %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    psrad (%rdi), %xmm0 # sched: [8:1.00]
+; BDVER2-SSE-NEXT:    psrad $2, %xmm0 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_psrad:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpsrad %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vpsrad (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BDVER2-NEXT:    vpsrad $2, %xmm0, %xmm0 # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_psrad:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    psrad %xmm1, %xmm0 # sched: [1:0.50]
@@ -11555,6 +12803,20 @@ define <8 x i16> @test_psraw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpsraw $2, %xmm0, %xmm0 # sched: [1:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_psraw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    psraw %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    psraw (%rdi), %xmm0 # sched: [8:1.00]
+; BDVER2-SSE-NEXT:    psraw $2, %xmm0 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_psraw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpsraw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vpsraw (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BDVER2-NEXT:    vpsraw $2, %xmm0, %xmm0 # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_psraw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    psraw %xmm1, %xmm0 # sched: [1:0.50]
@@ -11683,6 +12945,20 @@ define <4 x i32> @test_psrld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SKX-NEXT:    vpsrld $2, %xmm0, %xmm0 # sched: [1:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_psrld:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    psrld %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    psrld (%rdi), %xmm0 # sched: [8:1.00]
+; BDVER2-SSE-NEXT:    psrld $2, %xmm0 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_psrld:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpsrld %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vpsrld (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BDVER2-NEXT:    vpsrld $2, %xmm0, %xmm0 # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_psrld:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    psrld %xmm1, %xmm0 # sched: [1:0.50]
@@ -11791,6 +13067,16 @@ define <4 x i32> @test_psrldq(<4 x i32> %a0) {
 ; SKX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_psrldq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_psrldq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_psrldq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:0.50]
@@ -11906,6 +13192,20 @@ define <2 x i64> @test_psrlq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; SKX-NEXT:    vpsrlq $2, %xmm0, %xmm0 # sched: [1:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_psrlq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    psrlq %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    psrlq (%rdi), %xmm0 # sched: [8:1.00]
+; BDVER2-SSE-NEXT:    psrlq $2, %xmm0 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_psrlq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vpsrlq (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BDVER2-NEXT:    vpsrlq $2, %xmm0, %xmm0 # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_psrlq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    psrlq %xmm1, %xmm0 # sched: [1:0.50]
@@ -12034,6 +13334,20 @@ define <8 x i16> @test_psrlw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpsrlw $2, %xmm0, %xmm0 # sched: [1:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_psrlw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    psrlw %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    psrlw (%rdi), %xmm0 # sched: [8:1.00]
+; BDVER2-SSE-NEXT:    psrlw $2, %xmm0 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_psrlw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vpsrlw (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BDVER2-NEXT:    vpsrlw $2, %xmm0, %xmm0 # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_psrlw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    psrlw %xmm1, %xmm0 # sched: [1:0.50]
@@ -12153,6 +13467,18 @@ define <16 x i8> @test_psubb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vpsubb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_psubb:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    psubb %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    psubb (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_psubb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpsubb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_psubb:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    psubb %xmm1, %xmm0 # sched: [1:0.50]
@@ -12265,6 +13591,18 @@ define <4 x i32> @test_psubd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SKX-NEXT:    vpsubd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_psubd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    psubd %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    psubd (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_psubd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpsubd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_psubd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    psubd %xmm1, %xmm0 # sched: [1:0.50]
@@ -12373,6 +13711,18 @@ define <2 x i64> @test_psubq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; SKX-NEXT:    vpsubq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_psubq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    psubq %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    psubq (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_psubq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpsubq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpsubq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_psubq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    psubq %xmm1, %xmm0 # sched: [1:0.50]
@@ -12485,6 +13835,18 @@ define <16 x i8> @test_psubsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vpsubsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_psubsb:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    psubsb %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    psubsb (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_psubsb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpsubsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpsubsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_psubsb:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    psubsb %xmm1, %xmm0 # sched: [1:0.50]
@@ -12598,6 +13960,18 @@ define <8 x i16> @test_psubsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpsubsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_psubsw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    psubsw %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    psubsw (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_psubsw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpsubsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_psubsw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    psubsw %xmm1, %xmm0 # sched: [1:0.50]
@@ -12711,6 +14085,18 @@ define <16 x i8> @test_psubusb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vpsubusb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_psubusb:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    psubusb %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    psubusb (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_psubusb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpsubusb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_psubusb:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    psubusb %xmm1, %xmm0 # sched: [1:0.50]
@@ -12824,6 +14210,18 @@ define <8 x i16> @test_psubusw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpsubusw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_psubusw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    psubusw %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    psubusw (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_psubusw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpsubusw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_psubusw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    psubusw %xmm1, %xmm0 # sched: [1:0.50]
@@ -12937,6 +14335,18 @@ define <8 x i16> @test_psubw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpsubw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_psubw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    psubw %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    psubw (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_psubw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpsubw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_psubw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    psubw %xmm1, %xmm0 # sched: [1:0.50]
@@ -13049,6 +14459,18 @@ define <16 x i8> @test_punpckhbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_punpckhbw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:0.50]
+; BDVER2-SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_punpckhbw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:0.50]
+; BDVER2-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_punpckhbw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:0.50]
@@ -13172,6 +14594,20 @@ define <4 x i32> @test_punpckhdq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_punpckhdq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50]
+; BDVER2-SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_punpckhdq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50]
+; BDVER2-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [7:0.50]
+; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_punpckhdq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50]
@@ -13298,6 +14734,20 @@ define <2 x i64> @test_punpckhqdq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2)
 ; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_punpckhqdq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:0.50]
+; BDVER2-SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_punpckhqdq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:0.50]
+; BDVER2-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [7:0.50]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_punpckhqdq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:0.50]
@@ -13415,6 +14865,18 @@ define <8 x i16> @test_punpckhwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_punpckhwd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50]
+; BDVER2-SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_punpckhwd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50]
+; BDVER2-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_punpckhwd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50]
@@ -13527,6 +14989,18 @@ define <16 x i8> @test_punpcklbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_punpcklbw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50]
+; BDVER2-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_punpcklbw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50]
+; BDVER2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_punpcklbw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50]
@@ -13650,6 +15124,20 @@ define <4 x i32> @test_punpckldq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_punpckldq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:0.50]
+; BDVER2-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_punpckldq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:0.50]
+; BDVER2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [7:0.50]
+; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_punpckldq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:0.50]
@@ -13776,6 +15264,20 @@ define <2 x i64> @test_punpcklqdq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2)
 ; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_punpcklqdq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50]
+; BDVER2-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_punpcklqdq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50]
+; BDVER2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:0.50]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_punpcklqdq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50]
@@ -13893,6 +15395,18 @@ define <8 x i16> @test_punpcklwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_punpcklwd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50]
+; BDVER2-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_punpcklwd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50]
+; BDVER2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_punpcklwd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50]
@@ -14014,6 +15528,20 @@ define <2 x i64> @test_pxor(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pxor:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pxor %xmm1, %xmm0 # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    pxor (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pxor:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpxor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; BDVER2-NEXT:    vpxor (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pxor:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pxor %xmm1, %xmm0 # sched: [1:0.50]
@@ -14140,6 +15668,20 @@ define <2 x double> @test_shufpd(<2 x double> %a0, <2 x double> %a1, <2 x double
 ; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_shufpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [1:1.00]
+; BDVER2-SSE-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [7:1.00]
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_shufpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [1:1.00]
+; BDVER2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [7:1.00]
+; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_shufpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [1:0.50]
@@ -14267,6 +15809,20 @@ define <2 x double> @test_sqrtpd(<2 x double> %a0, <2 x double> *%a1) {
 ; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_sqrtpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    sqrtpd %xmm0, %xmm1 # sched: [21:21.00]
+; BDVER2-SSE-NEXT:    sqrtpd (%rdi), %xmm0 # sched: [27:21.00]
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_sqrtpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vsqrtpd %xmm0, %xmm0 # sched: [21:21.00]
+; BDVER2-NEXT:    vsqrtpd (%rdi), %xmm1 # sched: [27:21.00]
+; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_sqrtpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    sqrtpd %xmm0, %xmm1 # sched: [27:27.00]
@@ -14409,6 +15965,22 @@ define <2 x double> @test_sqrtsd(<2 x double> %a0, <2 x double> *%a1) {
 ; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_sqrtsd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    sqrtsd %xmm0, %xmm0 # sched: [21:21.00]
+; BDVER2-SSE-NEXT:    movapd (%rdi), %xmm1 # sched: [6:0.50]
+; BDVER2-SSE-NEXT:    sqrtsd %xmm1, %xmm1 # sched: [21:21.00]
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_sqrtsd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [21:21.00]
+; BDVER2-NEXT:    vmovapd (%rdi), %xmm1 # sched: [6:0.50]
+; BDVER2-NEXT:    vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [21:21.00]
+; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_sqrtsd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movapd (%rdi), %xmm1 # sched: [5:1.00]
@@ -14527,6 +16099,18 @@ define <2 x double> @test_subpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; SKX-NEXT:    vsubpd (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_subpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    subpd %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    subpd (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_subpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vsubpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_subpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    subpd %xmm1, %xmm0 # sched: [3:1.00]
@@ -14635,6 +16219,18 @@ define double @test_subsd(double %a0, double %a1, double *%a2) {
 ; SKX-NEXT:    vsubsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_subsd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    subsd %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    subsd (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_subsd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vsubsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vsubsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_subsd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    subsd %xmm1, %xmm0 # sched: [3:1.00]
@@ -14847,6 +16443,34 @@ define i32 @test_ucomisd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2)
 ; SKX-NEXT:    movzbl %dl, %eax # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_ucomisd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    ucomisd %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    setnp %al # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    sete %cl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    andb %al, %cl # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    ucomisd (%rdi), %xmm0 # sched: [8:1.00]
+; BDVER2-SSE-NEXT:    setnp %al # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    sete %dl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    andb %al, %dl # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    orb %cl, %dl # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    movzbl %dl, %eax # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_ucomisd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vucomisd %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    setnp %al # sched: [1:0.50]
+; BDVER2-NEXT:    sete %cl # sched: [1:0.50]
+; BDVER2-NEXT:    andb %al, %cl # sched: [1:0.33]
+; BDVER2-NEXT:    vucomisd (%rdi), %xmm0 # sched: [8:1.00]
+; BDVER2-NEXT:    setnp %al # sched: [1:0.50]
+; BDVER2-NEXT:    sete %dl # sched: [1:0.50]
+; BDVER2-NEXT:    andb %al, %dl # sched: [1:0.33]
+; BDVER2-NEXT:    orb %cl, %dl # sched: [1:0.33]
+; BDVER2-NEXT:    movzbl %dl, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_ucomisd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    ucomisd %xmm1, %xmm0 # sched: [3:1.00]
@@ -15002,6 +16626,20 @@ define <2 x double> @test_unpckhpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
 ; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_unpckhpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
+; BDVER2-SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [7:1.00]
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_unpckhpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
+; BDVER2-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [7:1.00]
+; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_unpckhpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:0.50]
@@ -15136,6 +16774,21 @@ define <2 x double> @test_unpcklpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
 ; SKX-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_unpcklpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movapd %xmm0, %xmm2 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] sched: [1:1.00]
+; BDVER2-SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [7:1.00]
+; BDVER2-SSE-NEXT:    addpd %xmm2, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_unpcklpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0] sched: [1:1.00]
+; BDVER2-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [7:1.00]
+; BDVER2-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_unpcklpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movapd %xmm0, %xmm2 # sched: [1:0.50]
@@ -15264,6 +16917,20 @@ define <2 x double> @test_xorpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; SKX-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_xorpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    xorpd %xmm1, %xmm0 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    xorpd (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_xorpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vxorpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; BDVER2-NEXT:    vxorpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER2-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_xorpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    xorpd %xmm1, %xmm0 # sched: [1:0.50]
diff --git a/test/CodeGen/X86/sse3-schedule.ll b/test/CodeGen/X86/sse3-schedule.ll
index bb37f21e4f6..c9583a14292 100644
--- a/test/CodeGen/X86/sse3-schedule.ll
+++ b/test/CodeGen/X86/sse3-schedule.ll
@@ -14,7 +14,9 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake -mattr=-avx2  | FileCheck %s --check-prefixes=CHECK,SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-ssse3 | FileCheck %s --check-prefixes=CHECK,SKX-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-avx2  | FileCheck %s --check-prefixes=CHECK,SKX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-ssse3 | FileCheck %s --check-prefixes=CHECK,BTVER2-SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+sse3 -mattr=-ssse3 | FileCheck %s --check-prefixes=CHECK,BDVER2-SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+avx -mattr=+sse3 -mattr=-avx2  | FileCheck %s --check-prefixes=CHECK,BDVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=+sse3 -mattr=-ssse3 | FileCheck %s --check-prefixes=CHECK,BTVER2-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-avx2  | FileCheck %s --check-prefixes=CHECK,BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 -mattr=-ssse3 | FileCheck %s --check-prefixes=CHECK,ZNVER1-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 -mattr=-avx2  | FileCheck %s --check-prefixes=CHECK,ZNVER1
@@ -98,6 +100,18 @@ define <2 x double> @test_addsubpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
 ; SKX-NEXT:    vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_addsubpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    addsubpd %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    addsubpd (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_addsubpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vaddsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_addsubpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    addsubpd %xmm1, %xmm0 # sched: [3:1.00]
@@ -207,6 +221,18 @@ define <4 x float> @test_addsubps(<4 x float> %a0, <4 x float> %a1, <4 x float>
 ; SKX-NEXT:    vaddsubps (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_addsubps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    addsubps %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    addsubps (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_addsubps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vaddsubps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_addsubps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    addsubps %xmm1, %xmm0 # sched: [3:1.00]
@@ -316,6 +342,18 @@ define <2 x double> @test_haddpd(<2 x double> %a0, <2 x double> %a1, <2 x double
 ; SKX-NEXT:    vhaddpd (%rdi), %xmm0, %xmm0 # sched: [12:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_haddpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    haddpd %xmm1, %xmm0 # sched: [5:2.00]
+; BDVER2-SSE-NEXT:    haddpd (%rdi), %xmm0 # sched: [11:2.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_haddpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vhaddpd (%rdi), %xmm0, %xmm0 # sched: [11:2.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_haddpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    haddpd %xmm1, %xmm0 # sched: [3:1.00]
@@ -425,6 +463,18 @@ define <4 x float> @test_haddps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%
 ; SKX-NEXT:    vhaddps (%rdi), %xmm0, %xmm0 # sched: [12:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_haddps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    haddps %xmm1, %xmm0 # sched: [5:2.00]
+; BDVER2-SSE-NEXT:    haddps (%rdi), %xmm0 # sched: [11:2.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_haddps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vhaddps %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vhaddps (%rdi), %xmm0, %xmm0 # sched: [11:2.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_haddps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    haddps %xmm1, %xmm0 # sched: [3:1.00]
@@ -534,6 +584,18 @@ define <2 x double> @test_hsubpd(<2 x double> %a0, <2 x double> %a1, <2 x double
 ; SKX-NEXT:    vhsubpd (%rdi), %xmm0, %xmm0 # sched: [12:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_hsubpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    hsubpd %xmm1, %xmm0 # sched: [5:2.00]
+; BDVER2-SSE-NEXT:    hsubpd (%rdi), %xmm0 # sched: [11:2.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_hsubpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vhsubpd %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vhsubpd (%rdi), %xmm0, %xmm0 # sched: [11:2.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_hsubpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    hsubpd %xmm1, %xmm0 # sched: [3:1.00]
@@ -643,6 +705,18 @@ define <4 x float> @test_hsubps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%
 ; SKX-NEXT:    vhsubps (%rdi), %xmm0, %xmm0 # sched: [12:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_hsubps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    hsubps %xmm1, %xmm0 # sched: [5:2.00]
+; BDVER2-SSE-NEXT:    hsubps (%rdi), %xmm0 # sched: [11:2.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_hsubps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vhsubps %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vhsubps (%rdi), %xmm0, %xmm0 # sched: [11:2.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_hsubps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    hsubps %xmm1, %xmm0 # sched: [3:1.00]
@@ -741,6 +815,16 @@ define <16 x i8> @test_lddqu(i8* %a0) {
 ; SKX-NEXT:    vlddqu (%rdi), %xmm0 # sched: [6:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_lddqu:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    lddqu (%rdi), %xmm0 # sched: [6:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_lddqu:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vlddqu (%rdi), %xmm0 # sched: [6:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_lddqu:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    lddqu (%rdi), %xmm0 # sched: [5:1.00]
@@ -857,6 +941,20 @@ define void @test_monitor(i8* %a0, i32 %a1, i32 %a2) {
 ; SKX-NEXT:    monitor # sched: [100:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_monitor:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movl %esi, %ecx # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    leaq (%rdi), %rax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    monitor # sched: [100:0.33]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_monitor:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl %esi, %ecx # sched: [1:0.33]
+; BDVER2-NEXT:    leaq (%rdi), %rax # sched: [1:0.50]
+; BDVER2-NEXT:    monitor # sched: [100:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_monitor:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movl %esi, %ecx # sched: [1:0.50]
@@ -982,6 +1080,20 @@ define <2 x double> @test_movddup(<2 x double> %a0, <2 x double> *%a1) {
 ; SKX-NEXT:    vsubpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movddup:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movddup {{.*#+}} xmm1 = xmm0[0,0] sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0] sched: [6:0.50]
+; BDVER2-SSE-NEXT:    subpd %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_movddup:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:1.00]
+; BDVER2-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [6:0.50]
+; BDVER2-NEXT:    vsubpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movddup:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movddup {{.*#+}} xmm1 = xmm0[0,0] sched: [1:0.50]
@@ -1109,6 +1221,20 @@ define <4 x float> @test_movshdup(<4 x float> %a0, <4 x float> *%a1) {
 ; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movshdup:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movshdup {{.*#+}} xmm0 = mem[1,1,3,3] sched: [6:0.50]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_movshdup:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [1:1.00]
+; BDVER2-NEXT:    vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [6:0.50]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movshdup:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] sched: [1:0.50]
@@ -1236,6 +1362,20 @@ define <4 x float> @test_movsldup(<4 x float> %a0, <4 x float> *%a1) {
 ; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movsldup:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2] sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movsldup {{.*#+}} xmm0 = mem[0,0,2,2] sched: [6:0.50]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_movsldup:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [1:1.00]
+; BDVER2-NEXT:    vmovsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [6:0.50]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movsldup:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2] sched: [1:0.50]
@@ -1362,6 +1502,20 @@ define void @test_mwait(i32 %a0, i32 %a1) {
 ; SKX-NEXT:    mwait # sched: [20:2.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_mwait:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movl %esi, %eax # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    movl %edi, %ecx # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    mwait # sched: [100:0.33]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_mwait:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl %esi, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    movl %edi, %ecx # sched: [1:0.33]
+; BDVER2-NEXT:    mwait # sched: [100:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_mwait:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movl %esi, %eax # sched: [1:0.50]
diff --git a/test/CodeGen/X86/sse41-schedule.ll b/test/CodeGen/X86/sse41-schedule.ll
index 313f6325319..ace3d16ea71 100644
--- a/test/CodeGen/X86/sse41-schedule.ll
+++ b/test/CodeGen/X86/sse41-schedule.ll
@@ -13,6 +13,8 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake -mattr=-avx2   | FileCheck %s --check-prefixes=CHECK,SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-sse4.2 | FileCheck %s --check-prefixes=CHECK,SKX-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-avx2   | FileCheck %s --check-prefixes=CHECK,SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+sse4.1 -mattr=-sse4.2 | FileCheck %s --check-prefixes=CHECK,BDVER2-SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+avx,+xop -mattr=+sse4.1 -mattr=-avx2   | FileCheck %s --check-prefixes=CHECK,BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-sse4.2 | FileCheck %s --check-prefixes=CHECK,BTVER2-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-avx2   | FileCheck %s --check-prefixes=CHECK,BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 -mattr=-sse4.2 | FileCheck %s --check-prefixes=CHECK,ZNVER1-SSE
@@ -103,6 +105,20 @@ define <2 x double> @test_blendpd(<2 x double> %a0, <2 x double> %a1, <2 x doubl
 ; SKX-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_blendpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    blendpd {{.*#+}} xmm1 = xmm0[0],xmm1[1] sched: [1:0.50]
+; BDVER2-SSE-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_blendpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vblendpd {{.*#+}} xmm1 = xmm0[0],xmm1[1] sched: [1:0.50]
+; BDVER2-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [7:0.50]
+; BDVER2-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_blendpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    blendpd {{.*#+}} xmm1 = xmm0[0],xmm1[1] sched: [1:0.50]
@@ -222,6 +238,20 @@ define <4 x float> @test_blendps(<4 x float> %a0, <4 x float> %a1, <4 x float> *
 ; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_blendps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.50]
+; BDVER2-SSE-NEXT:    blendps {{.*#+}} xmm1 = xmm1[0],mem[1],xmm1[2,3] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_blendps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.50]
+; BDVER2-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],mem[1],xmm1[2,3] sched: [7:0.50]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_blendps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.50]
@@ -350,6 +380,21 @@ define <2 x double> @test_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
 ; SKX-NEXT:    vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_blendvpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movapd %xmm0, %xmm3 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movaps %xmm2, %xmm0 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    blendvpd %xmm0, %xmm1, %xmm3 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    blendvpd %xmm0, (%rdi), %xmm3 # sched: [8:1.00]
+; BDVER2-SSE-NEXT:    movapd %xmm3, %xmm0 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_blendvpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_blendvpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movapd %xmm0, %xmm3 # sched: [1:0.50]
@@ -480,6 +525,21 @@ define <4 x float> @test_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float>
 ; SKX-NEXT:    vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_blendvps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movaps %xmm0, %xmm3 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movaps %xmm2, %xmm0 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    blendvps %xmm0, %xmm1, %xmm3 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    blendvps %xmm0, (%rdi), %xmm3 # sched: [8:1.00]
+; BDVER2-SSE-NEXT:    movaps %xmm3, %xmm0 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_blendvps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_blendvps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movaps %xmm0, %xmm3 # sched: [1:0.50]
@@ -589,6 +649,18 @@ define <2 x double> @test_dppd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; SKX-NEXT:    vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [15:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_dppd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    dppd $7, %xmm1, %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    dppd $7, (%rdi), %xmm0 # sched: [15:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_dppd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [15:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_dppd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    dppd $7, %xmm1, %xmm0 # sched: [9:3.00]
@@ -692,6 +764,18 @@ define <4 x float> @test_dpps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2
 ; SKX-NEXT:    vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [19:1.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_dpps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    dpps $7, %xmm1, %xmm0 # sched: [12:2.00]
+; BDVER2-SSE-NEXT:    dpps $7, (%rdi), %xmm0 # sched: [18:2.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_dpps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [12:2.00]
+; BDVER2-NEXT:    vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [18:2.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_dpps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    dpps $7, %xmm1, %xmm0 # sched: [11:3.00]
@@ -795,6 +879,18 @@ define i32 @test_extractps(<4 x float> %a0, i32 *%a1) {
 ; SKX-NEXT:    vextractps $1, %xmm0, (%rdi) # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_extractps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    extractps $3, %xmm0, %eax # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    extractps $1, %xmm0, (%rdi) # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_extractps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vextractps $3, %xmm0, %eax # sched: [3:1.00]
+; BDVER2-NEXT:    vextractps $1, %xmm0, (%rdi) # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_extractps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    extractps $3, %xmm0, %eax # sched: [3:1.00]
@@ -899,6 +995,18 @@ define <4 x float> @test_insertps(<4 x float> %a0, <4 x float> %a1, float *%a2)
 ; SKX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_insertps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:1.00]
+; BDVER2-SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [7:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_insertps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:1.00]
+; BDVER2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [7:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_insertps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:0.50]
@@ -990,6 +1098,16 @@ define <2 x i64> @test_movntdqa(i8* %a0) {
 ; SKX-NEXT:    vmovntdqa (%rdi), %xmm0 # sched: [6:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movntdqa:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movntdqa (%rdi), %xmm0 # sched: [6:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_movntdqa:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovntdqa (%rdi), %xmm0 # sched: [6:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movntdqa:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movntdqa (%rdi), %xmm0 # sched: [5:1.00]
@@ -1087,6 +1205,18 @@ define <8 x i16> @test_mpsadbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [10:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_mpsadbw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    mpsadbw $7, %xmm1, %xmm0 # sched: [7:1.00]
+; BDVER2-SSE-NEXT:    mpsadbw $7, (%rdi), %xmm0 # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_mpsadbw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER2-NEXT:    vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [13:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_mpsadbw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    mpsadbw $7, %xmm1, %xmm0 # sched: [3:2.00]
@@ -1191,6 +1321,18 @@ define <8 x i16> @test_packusdw(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SKX-NEXT:    vpackusdw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_packusdw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    packusdw %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    packusdw (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_packusdw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpackusdw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_packusdw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    packusdw %xmm1, %xmm0 # sched: [1:0.50]
@@ -1316,6 +1458,21 @@ define <16 x i8> @test_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2, <16
 ; SKX-NEXT:    vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pblendvb:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movdqa %xmm0, %xmm3 # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    movaps %xmm2, %xmm0 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    pblendvb %xmm0, %xmm1, %xmm3 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    pblendvb %xmm0, (%rdi), %xmm3 # sched: [8:1.00]
+; BDVER2-SSE-NEXT:    movdqa %xmm3, %xmm0 # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pblendvb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pblendvb:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movdqa %xmm0, %xmm3 # sched: [1:0.50]
@@ -1437,6 +1594,20 @@ define <8 x i16> @test_pblendw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pblendw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],mem[2,3],xmm1[4,5,6],mem[7] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddw %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pblendw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:0.50]
+; BDVER2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],mem[2,3],xmm1[4,5,6],mem[7] sched: [7:0.50]
+; BDVER2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pblendw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:0.50]
@@ -1544,6 +1715,18 @@ define <2 x i64> @test_pcmpeqq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; SKX-NEXT:    vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pcmpeqq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pcmpeqq %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pcmpeqq (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pcmpeqq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpcomeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpcomeqq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pcmpeqq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pcmpeqq %xmm1, %xmm0 # sched: [1:0.50]
@@ -1648,6 +1831,18 @@ define i32 @test_pextrb(<16 x i8> %a0, i8 *%a1) {
 ; SKX-NEXT:    vpextrb $1, %xmm0, (%rdi) # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pextrb:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pextrb $3, %xmm0, %eax # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    pextrb $1, %xmm0, (%rdi) # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pextrb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpextrb $3, %xmm0, %eax # sched: [3:1.00]
+; BDVER2-NEXT:    vpextrb $1, %xmm0, (%rdi) # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pextrb:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pextrb $3, %xmm0, %eax # sched: [3:1.00]
@@ -1763,6 +1958,20 @@ define i32 @test_pextrd(<4 x i32> %a0, i32 *%a1) {
 ; SKX-NEXT:    vpextrd $1, %xmm0, (%rdi) # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pextrd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    paddd %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pextrd $3, %xmm0, %eax # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    pextrd $1, %xmm0, (%rdi) # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pextrd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpaddd %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpextrd $3, %xmm0, %eax # sched: [3:1.00]
+; BDVER2-NEXT:    vpextrd $1, %xmm0, (%rdi) # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pextrd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    paddd %xmm0, %xmm0 # sched: [1:0.50]
@@ -1870,6 +2079,18 @@ define i64 @test_pextrq(<2 x i64> %a0, <2 x i64> %a1, i64 *%a2) {
 ; SKX-NEXT:    vpextrq $1, %xmm0, (%rdi) # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pextrq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pextrq $1, %xmm0, %rax # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    pextrq $1, %xmm0, (%rdi) # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pextrq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpextrq $1, %xmm0, %rax # sched: [3:1.00]
+; BDVER2-NEXT:    vpextrq $1, %xmm0, (%rdi) # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pextrq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pextrq $1, %xmm0, %rax # sched: [3:1.00]
@@ -1972,6 +2193,18 @@ define i32 @test_pextrw(<8 x i16> %a0, i16 *%a1) {
 ; SKX-NEXT:    vpextrw $1, %xmm0, (%rdi) # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pextrw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pextrw $3, %xmm0, %eax # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    pextrw $1, %xmm0, (%rdi) # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pextrw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpextrw $3, %xmm0, %eax # sched: [3:1.00]
+; BDVER2-NEXT:    vpextrw $1, %xmm0, (%rdi) # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pextrw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pextrw $3, %xmm0, %eax # sched: [3:1.00]
@@ -2075,6 +2308,18 @@ define <8 x i16> @test_phminposuw(<8 x i16> *%a0) {
 ; SKX-NEXT:    vphminposuw %xmm0, %xmm0 # sched: [4:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_phminposuw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    phminposuw (%rdi), %xmm0 # sched: [11:1.00]
+; BDVER2-SSE-NEXT:    phminposuw %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_phminposuw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vphminposuw (%rdi), %xmm0 # sched: [11:1.00]
+; BDVER2-NEXT:    vphminposuw %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_phminposuw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    phminposuw (%rdi), %xmm0 # sched: [7:1.00]
@@ -2178,6 +2423,18 @@ define <16 x i8> @test_pinsrb(<16 x i8> %a0, i8 %a1, i8 *%a2) {
 ; SKX-NEXT:    vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pinsrb:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pinsrb $1, %edi, %xmm0 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    pinsrb $3, (%rsi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pinsrb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pinsrb:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pinsrb $1, %edi, %xmm0 # sched: [7:0.50]
@@ -2280,6 +2537,18 @@ define <4 x i32> @test_pinsrd(<4 x i32> %a0, i32 %a1, i32 *%a2) {
 ; SKX-NEXT:    vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pinsrd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pinsrd $1, %edi, %xmm0 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    pinsrd $3, (%rsi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pinsrd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pinsrd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pinsrd $1, %edi, %xmm0 # sched: [7:0.50]
@@ -2394,6 +2663,20 @@ define <2 x i64> @test_pinsrq(<2 x i64> %a0, <2 x i64> %a1, i64 %a2, i64 *%a3) {
 ; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pinsrq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pinsrq $1, %rdi, %xmm0 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    pinsrq $1, (%rsi), %xmm1 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pinsrq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [7:0.50]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pinsrq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pinsrq $1, %rdi, %xmm0 # sched: [7:0.50]
@@ -2501,6 +2784,18 @@ define <16 x i8> @test_pmaxsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmaxsb:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmaxsb %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pmaxsb (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pmaxsb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmaxsb:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmaxsb %xmm1, %xmm0 # sched: [1:0.50]
@@ -2604,6 +2899,18 @@ define <4 x i32> @test_pmaxsd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SKX-NEXT:    vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmaxsd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmaxsd %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pmaxsd (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pmaxsd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmaxsd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmaxsd %xmm1, %xmm0 # sched: [1:0.50]
@@ -2707,6 +3014,18 @@ define <4 x i32> @test_pmaxud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SKX-NEXT:    vpmaxud (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmaxud:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmaxud %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pmaxud (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pmaxud:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpmaxud (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmaxud:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmaxud %xmm1, %xmm0 # sched: [1:0.50]
@@ -2810,6 +3129,18 @@ define <8 x i16> @test_pmaxuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmaxuw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmaxuw %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pmaxuw (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pmaxuw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmaxuw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmaxuw %xmm1, %xmm0 # sched: [1:0.50]
@@ -2913,6 +3244,18 @@ define <16 x i8> @test_pminsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vpminsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pminsb:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pminsb %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pminsb (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pminsb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpminsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpminsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pminsb:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pminsb %xmm1, %xmm0 # sched: [1:0.50]
@@ -3016,6 +3359,18 @@ define <4 x i32> @test_pminsd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SKX-NEXT:    vpminsd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pminsd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pminsd %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pminsd (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pminsd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpminsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpminsd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pminsd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pminsd %xmm1, %xmm0 # sched: [1:0.50]
@@ -3119,6 +3474,18 @@ define <4 x i32> @test_pminud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SKX-NEXT:    vpminud (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pminud:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pminud %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pminud (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pminud:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpminud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpminud (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pminud:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pminud %xmm1, %xmm0 # sched: [1:0.50]
@@ -3222,6 +3589,18 @@ define <8 x i16> @test_pminuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpminuw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pminuw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pminuw %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pminuw (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pminuw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpminuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpminuw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pminuw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pminuw %xmm1, %xmm0 # sched: [1:0.50]
@@ -3338,6 +3717,20 @@ define <8 x i16> @test_pmovsxbw(<16 x i8> %a0, <8 x i8> *%a1) {
 ; SKX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmovsxbw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmovsxbw %xmm0, %xmm1 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pmovsxbw (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddw %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pmovsxbw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmovsxbw %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpmovsxbw (%rdi), %xmm1 # sched: [7:0.50]
+; BDVER2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmovsxbw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmovsxbw %xmm0, %xmm1 # sched: [1:0.50]
@@ -3459,6 +3852,20 @@ define <4 x i32> @test_pmovsxbd(<16 x i8> %a0, <4 x i8> *%a1) {
 ; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmovsxbd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmovsxbd %xmm0, %xmm1 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pmovsxbd (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pmovsxbd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmovsxbd %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpmovsxbd (%rdi), %xmm1 # sched: [7:0.50]
+; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmovsxbd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmovsxbd %xmm0, %xmm1 # sched: [1:0.50]
@@ -3580,6 +3987,20 @@ define <2 x i64> @test_pmovsxbq(<16 x i8> %a0, <2 x i8> *%a1) {
 ; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmovsxbq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmovsxbq %xmm0, %xmm1 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pmovsxbq (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pmovsxbq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmovsxbq %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpmovsxbq (%rdi), %xmm1 # sched: [7:0.50]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmovsxbq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmovsxbq %xmm0, %xmm1 # sched: [1:0.50]
@@ -3701,6 +4122,20 @@ define <2 x i64> @test_pmovsxdq(<4 x i32> %a0, <2 x i32> *%a1) {
 ; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmovsxdq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmovsxdq %xmm0, %xmm1 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pmovsxdq (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pmovsxdq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmovsxdq %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpmovsxdq (%rdi), %xmm1 # sched: [7:0.50]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmovsxdq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmovsxdq %xmm0, %xmm1 # sched: [1:0.50]
@@ -3822,6 +4257,20 @@ define <4 x i32> @test_pmovsxwd(<8 x i16> %a0, <4 x i16> *%a1) {
 ; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmovsxwd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmovsxwd %xmm0, %xmm1 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pmovsxwd (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pmovsxwd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmovsxwd %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpmovsxwd (%rdi), %xmm1 # sched: [7:0.50]
+; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmovsxwd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmovsxwd %xmm0, %xmm1 # sched: [1:0.50]
@@ -3943,6 +4392,20 @@ define <2 x i64> @test_pmovsxwq(<8 x i16> %a0, <2 x i16> *%a1) {
 ; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmovsxwq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmovsxwq %xmm0, %xmm1 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pmovsxwq (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pmovsxwq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmovsxwq %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpmovsxwq (%rdi), %xmm1 # sched: [7:0.50]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmovsxwq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmovsxwq %xmm0, %xmm1 # sched: [1:0.50]
@@ -4064,6 +4527,20 @@ define <8 x i16> @test_pmovzxbw(<16 x i8> %a0, <8 x i8> *%a1) {
 ; SKX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmovzxbw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddw %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pmovzxbw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:0.50]
+; BDVER2-NEXT:    vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [7:0.50]
+; BDVER2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmovzxbw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:0.50]
@@ -4185,6 +4662,20 @@ define <4 x i32> @test_pmovzxbd(<16 x i8> %a0, <4 x i8> *%a1) {
 ; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmovzxbd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pmovzxbd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:0.50]
+; BDVER2-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [7:0.50]
+; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmovzxbd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:0.50]
@@ -4306,6 +4797,20 @@ define <2 x i64> @test_pmovzxbq(<16 x i8> %a0, <2 x i8> *%a1) {
 ; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmovzxbq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pmovzxbq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:0.50]
+; BDVER2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [7:0.50]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmovzxbq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:0.50]
@@ -4427,6 +4932,20 @@ define <2 x i64> @test_pmovzxdq(<4 x i32> %a0, <2 x i32> *%a1) {
 ; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmovzxdq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pmovzxdq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:0.50]
+; BDVER2-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [7:0.50]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmovzxdq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero sched: [1:0.50]
@@ -4548,6 +5067,20 @@ define <4 x i32> @test_pmovzxwd(<8 x i16> %a0, <4 x i16> *%a1) {
 ; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmovzxwd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pmovzxwd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:0.50]
+; BDVER2-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [7:0.50]
+; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmovzxwd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:0.50]
@@ -4669,6 +5202,20 @@ define <2 x i64> @test_pmovzxwq(<8 x i16> %a0, <2 x i16> *%a1) {
 ; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmovzxwq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pmovzxwq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:0.50]
+; BDVER2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [7:0.50]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmovzxwq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:0.50]
@@ -4789,6 +5336,20 @@ define <2 x i64> @test_pmuldq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x
 ; SKX-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmuldq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmuldq %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    pmuldq (%rdi), %xmm2 # sched: [11:1.00]
+; BDVER2-SSE-NEXT:    por %xmm2, %xmm0 # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pmuldq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vpmuldq (%rdi), %xmm2, %xmm1 # sched: [11:1.00]
+; BDVER2-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmuldq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmuldq (%rdi), %xmm2 # sched: [7:1.00]
@@ -4897,6 +5458,18 @@ define <4 x i32> @test_pmulld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SKX-NEXT:    vpmulld (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmulld:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmulld %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    pmulld (%rdi), %xmm0 # sched: [11:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pmulld:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vpmulld (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmulld:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmulld %xmm1, %xmm0 # sched: [4:2.00]
@@ -5047,6 +5620,26 @@ define i32 @test_ptest(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; SKX-NEXT:    movzbl %cl, %eax # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_ptest:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    ptest %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    setb %al # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    ptest (%rdi), %xmm0 # sched: [8:1.00]
+; BDVER2-SSE-NEXT:    setb %cl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    andb %al, %cl # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    movzbl %cl, %eax # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_ptest:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vptest %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    setb %al # sched: [1:0.50]
+; BDVER2-NEXT:    vptest (%rdi), %xmm0 # sched: [8:1.00]
+; BDVER2-NEXT:    setb %cl # sched: [1:0.50]
+; BDVER2-NEXT:    andb %al, %cl # sched: [1:0.33]
+; BDVER2-NEXT:    movzbl %cl, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_ptest:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    ptest %xmm1, %xmm0 # sched: [3:1.00]
@@ -5181,6 +5774,20 @@ define <2 x double> @test_roundpd(<2 x double> %a0, <2 x double> *%a1) {
 ; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_roundpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    roundpd $7, %xmm0, %xmm1 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    roundpd $7, (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_roundpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vroundpd $7, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vroundpd $7, (%rdi), %xmm1 # sched: [9:1.00]
+; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_roundpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    roundpd $7, %xmm0, %xmm1 # sched: [3:1.00]
@@ -5303,6 +5910,20 @@ define <4 x float> @test_roundps(<4 x float> %a0, <4 x float> *%a1) {
 ; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_roundps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    roundps $7, %xmm0, %xmm1 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    roundps $7, (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_roundps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vroundps $7, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vroundps $7, (%rdi), %xmm1 # sched: [9:1.00]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_roundps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    roundps $7, %xmm0, %xmm1 # sched: [3:1.00]
@@ -5430,6 +6051,21 @@ define <2 x double> @test_roundsd(<2 x double> %a0, <2 x double> %a1, <2 x doubl
 ; SKX-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_roundsd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movapd %xmm0, %xmm2 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    roundsd $7, %xmm1, %xmm2 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    roundsd $7, (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    addpd %xmm2, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_roundsd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vroundsd $7, %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
+; BDVER2-NEXT:    vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_roundsd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movapd %xmm0, %xmm2 # sched: [1:0.50]
@@ -5559,6 +6195,21 @@ define <4 x float> @test_roundss(<4 x float> %a0, <4 x float> %a1, <4 x float> *
 ; SKX-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_roundss:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movaps %xmm0, %xmm2 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    roundss $7, %xmm1, %xmm2 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    roundss $7, (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    addps %xmm2, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_roundss:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vroundss $7, %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
+; BDVER2-NEXT:    vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_roundss:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movaps %xmm0, %xmm2 # sched: [1:0.50]
diff --git a/test/CodeGen/X86/sse42-schedule.ll b/test/CodeGen/X86/sse42-schedule.ll
index 7bb4ac6a995..088494b86dc 100644
--- a/test/CodeGen/X86/sse42-schedule.ll
+++ b/test/CodeGen/X86/sse42-schedule.ll
@@ -13,6 +13,8 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-avx  | FileCheck %s --check-prefixes=CHECK,SKX-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+sse4.2,+pclmul -mattr=-avx  | FileCheck %s --check-prefixes=CHECK,BDVER2-SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+avx,+xop -mattr=+sse4.2,+pclmul -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-avx  | FileCheck %s --check-prefixes=CHECK,BTVER2-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 -mattr=-avx  | FileCheck %s --check-prefixes=CHECK,ZNVER1-SSE
@@ -103,6 +105,20 @@ define i32 @crc32_32_8(i32 %a0, i8 %a1, i8 *%a2) {
 ; SKX-NEXT:    crc32b (%rdx), %eax # sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: crc32_32_8:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movl %edi, %eax # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    crc32b %sil, %eax # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    crc32b (%rdx), %eax # sched: [8:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: crc32_32_8:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl %edi, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    crc32b %sil, %eax # sched: [3:1.00]
+; BDVER2-NEXT:    crc32b (%rdx), %eax # sched: [8:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: crc32_32_8:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movl %edi, %eax # sched: [1:0.50]
@@ -222,6 +238,20 @@ define i32 @crc32_32_16(i32 %a0, i16 %a1, i16 *%a2) {
 ; SKX-NEXT:    crc32w (%rdx), %eax # sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: crc32_32_16:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movl %edi, %eax # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    crc32w %si, %eax # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    crc32w (%rdx), %eax # sched: [8:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: crc32_32_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl %edi, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    crc32w %si, %eax # sched: [3:1.00]
+; BDVER2-NEXT:    crc32w (%rdx), %eax # sched: [8:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: crc32_32_16:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movl %edi, %eax # sched: [1:0.50]
@@ -341,6 +371,20 @@ define i32 @crc32_32_32(i32 %a0, i32 %a1, i32 *%a2) {
 ; SKX-NEXT:    crc32l (%rdx), %eax # sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: crc32_32_32:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movl %edi, %eax # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    crc32l %esi, %eax # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    crc32l (%rdx), %eax # sched: [8:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: crc32_32_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl %edi, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    crc32l %esi, %eax # sched: [3:1.00]
+; BDVER2-NEXT:    crc32l (%rdx), %eax # sched: [8:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: crc32_32_32:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movl %edi, %eax # sched: [1:0.50]
@@ -460,6 +504,20 @@ define i64 @crc32_64_8(i64 %a0, i8 %a1, i8 *%a2) nounwind {
 ; SKX-NEXT:    crc32b (%rdx), %eax # sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: crc32_64_8:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movq %rdi, %rax # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    crc32b %sil, %eax # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    crc32b (%rdx), %eax # sched: [8:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: crc32_64_8:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movq %rdi, %rax # sched: [1:0.33]
+; BDVER2-NEXT:    crc32b %sil, %eax # sched: [3:1.00]
+; BDVER2-NEXT:    crc32b (%rdx), %eax # sched: [8:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: crc32_64_8:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movq %rdi, %rax # sched: [1:0.50]
@@ -579,6 +637,20 @@ define i64 @crc32_64_64(i64 %a0, i64 %a1, i64 *%a2) {
 ; SKX-NEXT:    crc32q (%rdx), %rax # sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: crc32_64_64:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movq %rdi, %rax # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    crc32q %rsi, %rax # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    crc32q (%rdx), %rax # sched: [8:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: crc32_64_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movq %rdi, %rax # sched: [1:0.33]
+; BDVER2-NEXT:    crc32q %rsi, %rax # sched: [3:1.00]
+; BDVER2-NEXT:    crc32q (%rdx), %rax # sched: [8:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: crc32_64_64:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movq %rdi, %rax # sched: [1:0.50]
@@ -770,6 +842,32 @@ define i32 @test_pcmpestri(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    leal (%rcx,%rsi), %eax # sched: [1:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pcmpestri:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movl $7, %eax # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    movl $7, %edx # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    pcmpestri $7, %xmm1, %xmm0 # sched: [4:2.67]
+; BDVER2-SSE-NEXT:    movl %ecx, %esi # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    movl $7, %eax # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    movl $7, %edx # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    pcmpestri $7, (%rdi), %xmm0 # sched: [4:2.33]
+; BDVER2-SSE-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; BDVER2-SSE-NEXT:    leal (%rcx,%rsi), %eax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pcmpestri:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl $7, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    movl $7, %edx # sched: [1:0.33]
+; BDVER2-NEXT:    vpcmpestri $7, %xmm1, %xmm0 # sched: [4:2.67]
+; BDVER2-NEXT:    movl %ecx, %esi # sched: [1:0.33]
+; BDVER2-NEXT:    movl $7, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    movl $7, %edx # sched: [1:0.33]
+; BDVER2-NEXT:    vpcmpestri $7, (%rdi), %xmm0 # sched: [4:2.33]
+; BDVER2-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; BDVER2-NEXT:    leal (%rcx,%rsi), %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pcmpestri:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movl $7, %eax # sched: [1:0.50]
@@ -950,6 +1048,26 @@ define <16 x i8> @test_pcmpestrm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vpcmpestrm $7, (%rdi), %xmm0 # sched: [25:4.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pcmpestrm:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movl $7, %eax # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    movl $7, %edx # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    pcmpestrm $7, %xmm1, %xmm0 # sched: [11:2.67]
+; BDVER2-SSE-NEXT:    movl $7, %eax # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    movl $7, %edx # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    pcmpestrm $7, (%rdi), %xmm0 # sched: [11:2.33]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pcmpestrm:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl $7, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    movl $7, %edx # sched: [1:0.33]
+; BDVER2-NEXT:    vpcmpestrm $7, %xmm1, %xmm0 # sched: [11:2.67]
+; BDVER2-NEXT:    movl $7, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    movl $7, %edx # sched: [1:0.33]
+; BDVER2-NEXT:    vpcmpestrm $7, (%rdi), %xmm0 # sched: [11:2.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pcmpestrm:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movl $7, %eax # sched: [1:0.50]
@@ -1105,6 +1223,24 @@ define i32 @test_pcmpistri(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    leal (%rcx,%rax), %eax # sched: [1:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pcmpistri:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pcmpistri $7, %xmm1, %xmm0 # sched: [11:3.00]
+; BDVER2-SSE-NEXT:    movl %ecx, %eax # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    pcmpistri $7, (%rdi), %xmm0 # sched: [17:3.00]
+; BDVER2-SSE-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; BDVER2-SSE-NEXT:    leal (%rcx,%rax), %eax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pcmpistri:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpcmpistri $7, %xmm1, %xmm0 # sched: [11:3.00]
+; BDVER2-NEXT:    movl %ecx, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    vpcmpistri $7, (%rdi), %xmm0 # sched: [17:3.00]
+; BDVER2-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; BDVER2-NEXT:    leal (%rcx,%rax), %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pcmpistri:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pcmpistri $7, %xmm1, %xmm0 # sched: [7:2.00]
@@ -1221,6 +1357,18 @@ define <16 x i8> @test_pcmpistrm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vpcmpistrm $7, (%rdi), %xmm0 # sched: [16:3.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pcmpistrm:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pcmpistrm $7, %xmm1, %xmm0 # sched: [11:3.00]
+; BDVER2-SSE-NEXT:    pcmpistrm $7, (%rdi), %xmm0 # sched: [17:3.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pcmpistrm:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpcmpistrm $7, %xmm1, %xmm0 # sched: [11:3.00]
+; BDVER2-NEXT:    vpcmpistrm $7, (%rdi), %xmm0 # sched: [17:3.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pcmpistrm:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pcmpistrm $7, %xmm1, %xmm0 # sched: [8:2.00]
@@ -1324,6 +1472,18 @@ define <2 x i64> @test_pcmpgtq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; SKX-NEXT:    vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pcmpgtq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pcmpgtq %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    pcmpgtq (%rdi), %xmm0 # sched: [11:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pcmpgtq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpcomgtq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpcomgtq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pcmpgtq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pcmpgtq %xmm1, %xmm0 # sched: [1:0.50]
@@ -1428,6 +1588,18 @@ define <2 x i64> @test_pclmulqdq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; SKX-NEXT:    vpclmulqdq $0, (%rdi), %xmm0, %xmm0 # sched: [12:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pclmulqdq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pclmulqdq $0, %xmm1, %xmm0 # sched: [14:6.00]
+; BDVER2-SSE-NEXT:    pclmulqdq $0, (%rdi), %xmm0 # sched: [14:5.67]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pclmulqdq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpclmulqdq $0, %xmm1, %xmm0, %xmm0 # sched: [14:6.00]
+; BDVER2-NEXT:    vpclmulqdq $0, (%rdi), %xmm0, %xmm0 # sched: [14:5.67]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pclmulqdq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pclmulqdq $0, %xmm1, %xmm0 # sched: [2:1.00]
diff --git a/test/CodeGen/X86/sse4a-schedule.ll b/test/CodeGen/X86/sse4a-schedule.ll
index 681953a8358..29ad2688b48 100644
--- a/test/CodeGen/X86/sse4a-schedule.ll
+++ b/test/CodeGen/X86/sse4a-schedule.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+sse4a | FileCheck %s --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+sse4a | FileCheck %s --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=ZNVER1
 
@@ -9,6 +10,11 @@ define <2 x i64> @test_extrq(<2 x i64> %a0, <16 x i8> %a1) {
 ; GENERIC-NEXT:    extrq %xmm1, %xmm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_extrq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    extrq %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_extrq:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    extrq %xmm1, %xmm0 # sched: [1:0.50]
@@ -29,6 +35,11 @@ define <2 x i64> @test_extrqi(<2 x i64> %a0) {
 ; GENERIC-NEXT:    extrq $2, $3, %xmm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_extrqi:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    extrq $2, $3, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_extrqi:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    extrq $2, $3, %xmm0 # sched: [1:0.50]
@@ -49,6 +60,11 @@ define <2 x i64> @test_insertq(<2 x i64> %a0, <2 x i64> %a1) {
 ; GENERIC-NEXT:    insertq %xmm1, %xmm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_insertq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    insertq %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_insertq:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    insertq %xmm1, %xmm0 # sched: [2:2.00]
@@ -69,6 +85,11 @@ define <2 x i64> @test_insertqi(<2 x i64> %a0, <2 x i64> %a1) {
 ; GENERIC-NEXT:    insertq $6, $5, %xmm1, %xmm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_insertqi:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    insertq $6, $5, %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_insertqi:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    insertq $6, $5, %xmm1, %xmm0 # sched: [2:2.00]
@@ -89,6 +110,11 @@ define void @test_movntsd(i8* %p, <2 x double> %a) {
 ; GENERIC-NEXT:    movntsd %xmm0, (%rdi) # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_movntsd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movntsd %xmm0, (%rdi) # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_movntsd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movntsd %xmm0, (%rdi) # sched: [3:1.00]
@@ -109,6 +135,11 @@ define void @test_movntss(i8* %p, <4 x float> %a) {
 ; GENERIC-NEXT:    movntss %xmm0, (%rdi) # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_movntss:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movntss %xmm0, (%rdi) # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-LABEL: test_movntss:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movntss %xmm0, (%rdi) # sched: [3:1.00]
diff --git a/test/CodeGen/X86/ssse3-schedule.ll b/test/CodeGen/X86/ssse3-schedule.ll
index b10b1bb5c74..9f1f7d69624 100644
--- a/test/CodeGen/X86/ssse3-schedule.ll
+++ b/test/CodeGen/X86/ssse3-schedule.ll
@@ -14,6 +14,8 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake -mattr=-avx2   | FileCheck %s --check-prefixes=CHECK,SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-sse4.1 | FileCheck %s --check-prefixes=CHECK,SKX-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-avx2   | FileCheck %s --check-prefixes=CHECK,SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+ssse3 -mattr=-sse4.1 | FileCheck %s --check-prefixes=CHECK,BDVER2-SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+avx -mattr=+ssse3 -mattr=-avx2   | FileCheck %s --check-prefixes=CHECK,BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-sse4.1 | FileCheck %s --check-prefixes=CHECK,BTVER2-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-avx2   | FileCheck %s --check-prefixes=CHECK,BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 -mattr=-sse4.1 | FileCheck %s --check-prefixes=CHECK,ZNVER1-SSE
@@ -113,6 +115,20 @@ define <16 x i8> @test_pabsb(<16 x i8> %a0, <16 x i8> *%a1) {
 ; SKX-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pabsb:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pabsb %xmm0, %xmm1 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pabsb (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    por %xmm1, %xmm0 # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pabsb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpabsb %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpabsb (%rdi), %xmm1 # sched: [7:0.50]
+; BDVER2-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pabsb:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pabsb %xmm0, %xmm1 # sched: [1:0.50]
@@ -242,6 +258,20 @@ define <4 x i32> @test_pabsd(<4 x i32> %a0, <4 x i32> *%a1) {
 ; SKX-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pabsd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pabsd %xmm0, %xmm1 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pabsd (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    por %xmm1, %xmm0 # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pabsd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpabsd %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpabsd (%rdi), %xmm1 # sched: [7:0.50]
+; BDVER2-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pabsd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pabsd %xmm0, %xmm1 # sched: [1:0.50]
@@ -371,6 +401,20 @@ define <8 x i16> @test_pabsw(<8 x i16> %a0, <8 x i16> *%a1) {
 ; SKX-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pabsw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pabsw %xmm0, %xmm1 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pabsw (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    por %xmm1, %xmm0 # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pabsw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpabsw %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpabsw (%rdi), %xmm1 # sched: [7:0.50]
+; BDVER2-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pabsw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pabsw %xmm0, %xmm1 # sched: [1:0.50]
@@ -495,6 +539,19 @@ define <8 x i16> @test_palignr(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_palignr:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:0.50]
+; BDVER2-SSE-NEXT:    palignr {{.*#+}} xmm1 = mem[14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    movdqa %xmm1, %xmm0 # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_palignr:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:0.50]
+; BDVER2-NEXT:    vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_palignr:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:0.50]
@@ -605,6 +662,18 @@ define <4 x i32> @test_phaddd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SKX-NEXT:    vphaddd (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_phaddd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    phaddd %xmm1, %xmm0 # sched: [3:1.50]
+; BDVER2-SSE-NEXT:    phaddd (%rdi), %xmm0 # sched: [9:1.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_phaddd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vphaddd %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
+; BDVER2-NEXT:    vphaddd (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_phaddd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    phaddd %xmm1, %xmm0 # sched: [1:0.50]
@@ -714,6 +783,18 @@ define <8 x i16> @test_phaddsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vphaddsw (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_phaddsw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    phaddsw %xmm1, %xmm0 # sched: [3:1.50]
+; BDVER2-SSE-NEXT:    phaddsw (%rdi), %xmm0 # sched: [9:1.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_phaddsw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vphaddsw %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
+; BDVER2-NEXT:    vphaddsw (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_phaddsw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    phaddsw %xmm1, %xmm0 # sched: [1:0.50]
@@ -823,6 +904,18 @@ define <8 x i16> @test_phaddw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vphaddw (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_phaddw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    phaddw %xmm1, %xmm0 # sched: [3:1.50]
+; BDVER2-SSE-NEXT:    phaddw (%rdi), %xmm0 # sched: [9:1.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_phaddw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vphaddw %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
+; BDVER2-NEXT:    vphaddw (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_phaddw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    phaddw %xmm1, %xmm0 # sched: [1:0.50]
@@ -932,6 +1025,18 @@ define <4 x i32> @test_phsubd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SKX-NEXT:    vphsubd (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_phsubd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    phsubd %xmm1, %xmm0 # sched: [3:1.50]
+; BDVER2-SSE-NEXT:    phsubd (%rdi), %xmm0 # sched: [9:1.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_phsubd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vphsubd %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
+; BDVER2-NEXT:    vphsubd (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_phsubd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    phsubd %xmm1, %xmm0 # sched: [1:0.50]
@@ -1041,6 +1146,18 @@ define <8 x i16> @test_phsubsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vphsubsw (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_phsubsw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    phsubsw %xmm1, %xmm0 # sched: [3:1.50]
+; BDVER2-SSE-NEXT:    phsubsw (%rdi), %xmm0 # sched: [9:1.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_phsubsw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vphsubsw %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
+; BDVER2-NEXT:    vphsubsw (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_phsubsw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    phsubsw %xmm1, %xmm0 # sched: [1:0.50]
@@ -1150,6 +1267,18 @@ define <8 x i16> @test_phsubw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vphsubw (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_phsubw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    phsubw %xmm1, %xmm0 # sched: [3:1.50]
+; BDVER2-SSE-NEXT:    phsubw (%rdi), %xmm0 # sched: [9:1.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_phsubw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vphsubw %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
+; BDVER2-NEXT:    vphsubw (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_phsubw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    phsubw %xmm1, %xmm0 # sched: [1:0.50]
@@ -1259,6 +1388,18 @@ define <8 x i16> @test_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmaddubsw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmaddubsw %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    pmaddubsw (%rdi), %xmm0 # sched: [11:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pmaddubsw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmaddubsw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmaddubsw %xmm1, %xmm0 # sched: [2:1.00]
@@ -1369,6 +1510,18 @@ define <8 x i16> @test_pmulhrsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpmulhrsw (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmulhrsw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmulhrsw %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    pmulhrsw (%rdi), %xmm0 # sched: [11:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pmulhrsw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vpmulhrsw (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmulhrsw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmulhrsw %xmm1, %xmm0 # sched: [2:1.00]
@@ -1478,6 +1631,18 @@ define <16 x i8> @test_pshufb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vpshufb (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pshufb:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pshufb %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pshufb (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_pshufb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpshufb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpshufb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pshufb:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pshufb %xmm1, %xmm0 # sched: [2:2.00]
@@ -1591,6 +1756,18 @@ define <16 x i8> @test_psignb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vpsignb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_psignb:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    psignb %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    psignb (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_psignb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpsignb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpsignb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_psignb:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    psignb %xmm1, %xmm0 # sched: [1:0.50]
@@ -1704,6 +1881,18 @@ define <4 x i32> @test_psignd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SKX-NEXT:    vpsignd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_psignd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    psignd %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    psignd (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_psignd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpsignd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpsignd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_psignd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    psignd %xmm1, %xmm0 # sched: [1:0.50]
@@ -1817,6 +2006,18 @@ define <8 x i16> @test_psignw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpsignw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_psignw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    psignw %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    psignw (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER2-LABEL: test_psignw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpsignw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpsignw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
 ; BTVER2-SSE-LABEL: test_psignw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    psignw %xmm1, %xmm0 # sched: [1:0.50]
diff --git a/test/CodeGen/X86/tbm-schedule.ll b/test/CodeGen/X86/tbm-schedule.ll
index 94bedaa04ae..5c73c4b49dc 100644
--- a/test/CodeGen/X86/tbm-schedule.ll
+++ b/test/CodeGen/X86/tbm-schedule.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+tbm | FileCheck %s --check-prefix=GENERIC
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+tbm | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver3 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER3
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver4 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER4
 
@@ -14,12 +14,28 @@ define i32 @test_x86_tbm_bextri_u32(i32 %a0, i32* nocapture %p1) nounwind {
 ; GENERIC-NEXT:    addl %ecx, %eax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_x86_tbm_bextri_u32:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    bextrl $3076, %edi, %ecx # imm = 0xC04
-; BDVER-NEXT:    bextrl $3076, (%rsi), %eax # imm = 0xC04
-; BDVER-NEXT:    addl %ecx, %eax
-; BDVER-NEXT:    retq
+; BDVER2-LABEL: test_x86_tbm_bextri_u32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    bextrl $3076, %edi, %ecx # imm = 0xC04
+; BDVER2-NEXT:    # sched: [2:1.00]
+; BDVER2-NEXT:    bextrl $3076, (%rsi), %eax # imm = 0xC04
+; BDVER2-NEXT:    # sched: [7:1.00]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_x86_tbm_bextri_u32:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    bextrl $3076, %edi, %ecx # imm = 0xC04
+; BDVER3-NEXT:    bextrl $3076, (%rsi), %eax # imm = 0xC04
+; BDVER3-NEXT:    addl %ecx, %eax
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_x86_tbm_bextri_u32:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    bextrl $3076, %edi, %ecx # imm = 0xC04
+; BDVER4-NEXT:    bextrl $3076, (%rsi), %eax # imm = 0xC04
+; BDVER4-NEXT:    addl %ecx, %eax
+; BDVER4-NEXT:    retq
   %a1 = load i32, i32* %p1
   %r0 = lshr i32 %a0, 4
   %m0 = lshr i32 %a1, 4
@@ -39,12 +55,28 @@ define i64 @test_x86_tbm_bextri_u64(i64 %a0, i64* nocapture %p1) nounwind {
 ; GENERIC-NEXT:    addq %rcx, %rax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_x86_tbm_bextri_u64:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    bextrl $3076, %edi, %ecx # imm = 0xC04
-; BDVER-NEXT:    bextrl $3076, (%rsi), %eax # imm = 0xC04
-; BDVER-NEXT:    addq %rcx, %rax
-; BDVER-NEXT:    retq
+; BDVER2-LABEL: test_x86_tbm_bextri_u64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    bextrl $3076, %edi, %ecx # imm = 0xC04
+; BDVER2-NEXT:    # sched: [2:1.00]
+; BDVER2-NEXT:    bextrl $3076, (%rsi), %eax # imm = 0xC04
+; BDVER2-NEXT:    # sched: [7:1.00]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_x86_tbm_bextri_u64:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    bextrl $3076, %edi, %ecx # imm = 0xC04
+; BDVER3-NEXT:    bextrl $3076, (%rsi), %eax # imm = 0xC04
+; BDVER3-NEXT:    addq %rcx, %rax
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_x86_tbm_bextri_u64:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    bextrl $3076, %edi, %ecx # imm = 0xC04
+; BDVER4-NEXT:    bextrl $3076, (%rsi), %eax # imm = 0xC04
+; BDVER4-NEXT:    addq %rcx, %rax
+; BDVER4-NEXT:    retq
   %a1 = load i64, i64* %p1
   %r0 = lshr i64 %a0, 4
   %m0 = lshr i64 %a1, 4
@@ -62,12 +94,26 @@ define i32 @test_x86_tbm_blcfill_u32(i32 %a0, i32* nocapture %p1) nounwind {
 ; GENERIC-NEXT:    addl %ecx, %eax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_x86_tbm_blcfill_u32:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    blcfilll %edi, %ecx
-; BDVER-NEXT:    blcfilll (%rsi), %eax
-; BDVER-NEXT:    addl %ecx, %eax
-; BDVER-NEXT:    retq
+; BDVER2-LABEL: test_x86_tbm_blcfill_u32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    blcfilll %edi, %ecx # sched: [1:0.33]
+; BDVER2-NEXT:    blcfilll (%rsi), %eax # sched: [6:0.50]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_x86_tbm_blcfill_u32:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    blcfilll %edi, %ecx
+; BDVER3-NEXT:    blcfilll (%rsi), %eax
+; BDVER3-NEXT:    addl %ecx, %eax
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_x86_tbm_blcfill_u32:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    blcfilll %edi, %ecx
+; BDVER4-NEXT:    blcfilll (%rsi), %eax
+; BDVER4-NEXT:    addl %ecx, %eax
+; BDVER4-NEXT:    retq
   %a1 = load i32, i32* %p1
   %r0 = add i32 %a0, 1
   %m0 = add i32 %a1, 1
@@ -85,12 +131,26 @@ define i64 @test_x86_tbm_blcfill_u64(i64 %a0, i64* nocapture %p1) nounwind {
 ; GENERIC-NEXT:    addq %rcx, %rax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_x86_tbm_blcfill_u64:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    blcfillq %rdi, %rcx
-; BDVER-NEXT:    blcfillq (%rsi), %rax
-; BDVER-NEXT:    addq %rcx, %rax
-; BDVER-NEXT:    retq
+; BDVER2-LABEL: test_x86_tbm_blcfill_u64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    blcfillq %rdi, %rcx # sched: [1:0.33]
+; BDVER2-NEXT:    blcfillq (%rsi), %rax # sched: [6:0.50]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_x86_tbm_blcfill_u64:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    blcfillq %rdi, %rcx
+; BDVER3-NEXT:    blcfillq (%rsi), %rax
+; BDVER3-NEXT:    addq %rcx, %rax
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_x86_tbm_blcfill_u64:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    blcfillq %rdi, %rcx
+; BDVER4-NEXT:    blcfillq (%rsi), %rax
+; BDVER4-NEXT:    addq %rcx, %rax
+; BDVER4-NEXT:    retq
   %a1 = load i64, i64* %p1
   %r0 = add i64 %a0, 1
   %m0 = add i64 %a1, 1
@@ -108,12 +168,26 @@ define i32 @test_x86_tbm_blci_u32(i32 %a0, i32* nocapture %p1) nounwind {
 ; GENERIC-NEXT:    addl %ecx, %eax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_x86_tbm_blci_u32:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    blcil %edi, %ecx
-; BDVER-NEXT:    blcil (%rsi), %eax
-; BDVER-NEXT:    addl %ecx, %eax
-; BDVER-NEXT:    retq
+; BDVER2-LABEL: test_x86_tbm_blci_u32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    blcil %edi, %ecx # sched: [1:0.33]
+; BDVER2-NEXT:    blcil (%rsi), %eax # sched: [6:0.50]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_x86_tbm_blci_u32:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    blcil %edi, %ecx
+; BDVER3-NEXT:    blcil (%rsi), %eax
+; BDVER3-NEXT:    addl %ecx, %eax
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_x86_tbm_blci_u32:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    blcil %edi, %ecx
+; BDVER4-NEXT:    blcil (%rsi), %eax
+; BDVER4-NEXT:    addl %ecx, %eax
+; BDVER4-NEXT:    retq
   %a1 = load i32, i32* %p1
   %r0 = add i32 1, %a0
   %m0 = add i32 1, %a1
@@ -133,12 +207,26 @@ define i64 @test_x86_tbm_blci_u64(i64 %a0, i64* nocapture %p1) nounwind {
 ; GENERIC-NEXT:    addq %rcx, %rax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_x86_tbm_blci_u64:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    blciq %rdi, %rcx
-; BDVER-NEXT:    blciq (%rsi), %rax
-; BDVER-NEXT:    addq %rcx, %rax
-; BDVER-NEXT:    retq
+; BDVER2-LABEL: test_x86_tbm_blci_u64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    blciq %rdi, %rcx # sched: [1:0.33]
+; BDVER2-NEXT:    blciq (%rsi), %rax # sched: [6:0.50]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_x86_tbm_blci_u64:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    blciq %rdi, %rcx
+; BDVER3-NEXT:    blciq (%rsi), %rax
+; BDVER3-NEXT:    addq %rcx, %rax
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_x86_tbm_blci_u64:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    blciq %rdi, %rcx
+; BDVER4-NEXT:    blciq (%rsi), %rax
+; BDVER4-NEXT:    addq %rcx, %rax
+; BDVER4-NEXT:    retq
   %a1 = load i64, i64* %p1
   %r0 = add i64 1, %a0
   %m0 = add i64 1, %a1
@@ -158,12 +246,26 @@ define i32 @test_x86_tbm_blcic_u32(i32 %a0, i32* nocapture %p1) nounwind {
 ; GENERIC-NEXT:    addl %ecx, %eax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_x86_tbm_blcic_u32:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    blcicl %edi, %ecx
-; BDVER-NEXT:    blcicl (%rsi), %eax
-; BDVER-NEXT:    addl %ecx, %eax
-; BDVER-NEXT:    retq
+; BDVER2-LABEL: test_x86_tbm_blcic_u32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    blcicl %edi, %ecx # sched: [1:0.33]
+; BDVER2-NEXT:    blcicl (%rsi), %eax # sched: [6:0.50]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_x86_tbm_blcic_u32:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    blcicl %edi, %ecx
+; BDVER3-NEXT:    blcicl (%rsi), %eax
+; BDVER3-NEXT:    addl %ecx, %eax
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_x86_tbm_blcic_u32:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    blcicl %edi, %ecx
+; BDVER4-NEXT:    blcicl (%rsi), %eax
+; BDVER4-NEXT:    addl %ecx, %eax
+; BDVER4-NEXT:    retq
   %a1 = load i32, i32* %p1
   %r0 = xor i32 %a0, -1
   %m0 = xor i32 %a1, -1
@@ -183,12 +285,26 @@ define i64 @test_x86_tbm_blcic_u64(i64 %a0, i64* nocapture %p1) nounwind {
 ; GENERIC-NEXT:    addq %rcx, %rax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_x86_tbm_blcic_u64:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    blcicq %rdi, %rcx
-; BDVER-NEXT:    blcicq (%rsi), %rax
-; BDVER-NEXT:    addq %rcx, %rax
-; BDVER-NEXT:    retq
+; BDVER2-LABEL: test_x86_tbm_blcic_u64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    blcicq %rdi, %rcx # sched: [1:0.33]
+; BDVER2-NEXT:    blcicq (%rsi), %rax # sched: [6:0.50]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_x86_tbm_blcic_u64:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    blcicq %rdi, %rcx
+; BDVER3-NEXT:    blcicq (%rsi), %rax
+; BDVER3-NEXT:    addq %rcx, %rax
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_x86_tbm_blcic_u64:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    blcicq %rdi, %rcx
+; BDVER4-NEXT:    blcicq (%rsi), %rax
+; BDVER4-NEXT:    addq %rcx, %rax
+; BDVER4-NEXT:    retq
   %a1 = load i64, i64* %p1
   %r0 = xor i64 %a0, -1
   %m0 = xor i64 %a1, -1
@@ -208,12 +324,26 @@ define i32 @test_x86_tbm_blcmsk_u32(i32 %a0, i32* nocapture %p1) nounwind {
 ; GENERIC-NEXT:    addl %ecx, %eax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_x86_tbm_blcmsk_u32:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    blcmskl %edi, %ecx
-; BDVER-NEXT:    blcmskl (%rsi), %eax
-; BDVER-NEXT:    addl %ecx, %eax
-; BDVER-NEXT:    retq
+; BDVER2-LABEL: test_x86_tbm_blcmsk_u32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    blcmskl %edi, %ecx # sched: [1:0.33]
+; BDVER2-NEXT:    blcmskl (%rsi), %eax # sched: [6:0.50]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_x86_tbm_blcmsk_u32:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    blcmskl %edi, %ecx
+; BDVER3-NEXT:    blcmskl (%rsi), %eax
+; BDVER3-NEXT:    addl %ecx, %eax
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_x86_tbm_blcmsk_u32:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    blcmskl %edi, %ecx
+; BDVER4-NEXT:    blcmskl (%rsi), %eax
+; BDVER4-NEXT:    addl %ecx, %eax
+; BDVER4-NEXT:    retq
   %a1 = load i32, i32* %p1
   %r0 = add i32 %a0, 1
   %m0 = add i32 %a1, 1
@@ -231,12 +361,26 @@ define i64 @test_x86_tbm_blcmsk_u64(i64 %a0, i64* nocapture %p1) nounwind {
 ; GENERIC-NEXT:    addq %rcx, %rax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_x86_tbm_blcmsk_u64:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    blcmskq %rdi, %rcx
-; BDVER-NEXT:    blcmskq (%rsi), %rax
-; BDVER-NEXT:    addq %rcx, %rax
-; BDVER-NEXT:    retq
+; BDVER2-LABEL: test_x86_tbm_blcmsk_u64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    blcmskq %rdi, %rcx # sched: [1:0.33]
+; BDVER2-NEXT:    blcmskq (%rsi), %rax # sched: [6:0.50]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_x86_tbm_blcmsk_u64:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    blcmskq %rdi, %rcx
+; BDVER3-NEXT:    blcmskq (%rsi), %rax
+; BDVER3-NEXT:    addq %rcx, %rax
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_x86_tbm_blcmsk_u64:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    blcmskq %rdi, %rcx
+; BDVER4-NEXT:    blcmskq (%rsi), %rax
+; BDVER4-NEXT:    addq %rcx, %rax
+; BDVER4-NEXT:    retq
   %a1 = load i64, i64* %p1
   %r0 = add i64 %a0, 1
   %m0 = add i64 %a1, 1
@@ -254,12 +398,26 @@ define i32 @test_x86_tbm_blcs_u32(i32 %a0, i32* nocapture %p1) nounwind {
 ; GENERIC-NEXT:    addl %ecx, %eax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_x86_tbm_blcs_u32:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    blcsl %edi, %ecx
-; BDVER-NEXT:    blcsl (%rsi), %eax
-; BDVER-NEXT:    addl %ecx, %eax
-; BDVER-NEXT:    retq
+; BDVER2-LABEL: test_x86_tbm_blcs_u32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    blcsl %edi, %ecx # sched: [1:0.33]
+; BDVER2-NEXT:    blcsl (%rsi), %eax # sched: [6:0.50]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_x86_tbm_blcs_u32:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    blcsl %edi, %ecx
+; BDVER3-NEXT:    blcsl (%rsi), %eax
+; BDVER3-NEXT:    addl %ecx, %eax
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_x86_tbm_blcs_u32:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    blcsl %edi, %ecx
+; BDVER4-NEXT:    blcsl (%rsi), %eax
+; BDVER4-NEXT:    addl %ecx, %eax
+; BDVER4-NEXT:    retq
   %a1 = load i32, i32* %p1
   %r0 = add i32 %a0, 1
   %m0 = add i32 %a1, 1
@@ -277,12 +435,26 @@ define i64 @test_x86_tbm_blcs_u64(i64 %a0, i64* nocapture %p1) nounwind {
 ; GENERIC-NEXT:    addq %rcx, %rax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_x86_tbm_blcs_u64:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    blcsq %rdi, %rcx
-; BDVER-NEXT:    blcsq (%rsi), %rax
-; BDVER-NEXT:    addq %rcx, %rax
-; BDVER-NEXT:    retq
+; BDVER2-LABEL: test_x86_tbm_blcs_u64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    blcsq %rdi, %rcx # sched: [1:0.33]
+; BDVER2-NEXT:    blcsq (%rsi), %rax # sched: [6:0.50]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_x86_tbm_blcs_u64:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    blcsq %rdi, %rcx
+; BDVER3-NEXT:    blcsq (%rsi), %rax
+; BDVER3-NEXT:    addq %rcx, %rax
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_x86_tbm_blcs_u64:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    blcsq %rdi, %rcx
+; BDVER4-NEXT:    blcsq (%rsi), %rax
+; BDVER4-NEXT:    addq %rcx, %rax
+; BDVER4-NEXT:    retq
   %a1 = load i64, i64* %p1
   %r0 = add i64 %a0, 1
   %m0 = add i64 %a1, 1
@@ -300,12 +472,26 @@ define i32 @test_x86_tbm_blsfill_u32(i32 %a0, i32* nocapture %p1) nounwind {
 ; GENERIC-NEXT:    addl %ecx, %eax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_x86_tbm_blsfill_u32:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    blsfilll %edi, %ecx
-; BDVER-NEXT:    blsfilll (%rsi), %eax
-; BDVER-NEXT:    addl %ecx, %eax
-; BDVER-NEXT:    retq
+; BDVER2-LABEL: test_x86_tbm_blsfill_u32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    blsfilll %edi, %ecx # sched: [1:0.33]
+; BDVER2-NEXT:    blsfilll (%rsi), %eax # sched: [6:0.50]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_x86_tbm_blsfill_u32:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    blsfilll %edi, %ecx
+; BDVER3-NEXT:    blsfilll (%rsi), %eax
+; BDVER3-NEXT:    addl %ecx, %eax
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_x86_tbm_blsfill_u32:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    blsfilll %edi, %ecx
+; BDVER4-NEXT:    blsfilll (%rsi), %eax
+; BDVER4-NEXT:    addl %ecx, %eax
+; BDVER4-NEXT:    retq
   %a1 = load i32, i32* %p1
   %r0 = add i32 %a0, -1
   %m0 = add i32 %a1, -1
@@ -323,12 +509,26 @@ define i64 @test_x86_tbm_blsfill_u64(i64 %a0, i64* nocapture %p1) nounwind {
 ; GENERIC-NEXT:    addq %rcx, %rax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_x86_tbm_blsfill_u64:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    blsfillq %rdi, %rcx
-; BDVER-NEXT:    blsfillq (%rsi), %rax
-; BDVER-NEXT:    addq %rcx, %rax
-; BDVER-NEXT:    retq
+; BDVER2-LABEL: test_x86_tbm_blsfill_u64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    blsfillq %rdi, %rcx # sched: [1:0.33]
+; BDVER2-NEXT:    blsfillq (%rsi), %rax # sched: [6:0.50]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_x86_tbm_blsfill_u64:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    blsfillq %rdi, %rcx
+; BDVER3-NEXT:    blsfillq (%rsi), %rax
+; BDVER3-NEXT:    addq %rcx, %rax
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_x86_tbm_blsfill_u64:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    blsfillq %rdi, %rcx
+; BDVER4-NEXT:    blsfillq (%rsi), %rax
+; BDVER4-NEXT:    addq %rcx, %rax
+; BDVER4-NEXT:    retq
   %a1 = load i64, i64* %p1
   %r0 = add i64 %a0, -1
   %m0 = add i64 %a1, -1
@@ -346,12 +546,26 @@ define i32 @test_x86_tbm_blsic_u32(i32 %a0, i32* nocapture %p1) nounwind {
 ; GENERIC-NEXT:    addl %ecx, %eax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_x86_tbm_blsic_u32:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    blsicl %edi, %ecx
-; BDVER-NEXT:    blsicl (%rsi), %eax
-; BDVER-NEXT:    addl %ecx, %eax
-; BDVER-NEXT:    retq
+; BDVER2-LABEL: test_x86_tbm_blsic_u32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    blsicl %edi, %ecx # sched: [1:0.33]
+; BDVER2-NEXT:    blsicl (%rsi), %eax # sched: [6:0.50]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_x86_tbm_blsic_u32:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    blsicl %edi, %ecx
+; BDVER3-NEXT:    blsicl (%rsi), %eax
+; BDVER3-NEXT:    addl %ecx, %eax
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_x86_tbm_blsic_u32:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    blsicl %edi, %ecx
+; BDVER4-NEXT:    blsicl (%rsi), %eax
+; BDVER4-NEXT:    addl %ecx, %eax
+; BDVER4-NEXT:    retq
   %a1 = load i32, i32* %p1
   %r0 = xor i32 %a0, -1
   %m0 = xor i32 %a1, -1
@@ -371,12 +585,26 @@ define i64 @test_x86_tbm_blsic_u64(i64 %a0, i64* nocapture %p1) nounwind {
 ; GENERIC-NEXT:    addq %rcx, %rax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_x86_tbm_blsic_u64:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    blsicq %rdi, %rcx
-; BDVER-NEXT:    blsicq (%rsi), %rax
-; BDVER-NEXT:    addq %rcx, %rax
-; BDVER-NEXT:    retq
+; BDVER2-LABEL: test_x86_tbm_blsic_u64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    blsicq %rdi, %rcx # sched: [1:0.33]
+; BDVER2-NEXT:    blsicq (%rsi), %rax # sched: [6:0.50]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_x86_tbm_blsic_u64:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    blsicq %rdi, %rcx
+; BDVER3-NEXT:    blsicq (%rsi), %rax
+; BDVER3-NEXT:    addq %rcx, %rax
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_x86_tbm_blsic_u64:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    blsicq %rdi, %rcx
+; BDVER4-NEXT:    blsicq (%rsi), %rax
+; BDVER4-NEXT:    addq %rcx, %rax
+; BDVER4-NEXT:    retq
   %a1 = load i64, i64* %p1
   %r0 = xor i64 %a0, -1
   %m0 = xor i64 %a1, -1
@@ -396,12 +624,26 @@ define i32 @test_x86_tbm_t1mskc_u32(i32 %a0, i32* nocapture %p1) nounwind {
 ; GENERIC-NEXT:    addl %ecx, %eax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_x86_tbm_t1mskc_u32:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    t1mskcl %edi, %ecx
-; BDVER-NEXT:    t1mskcl (%rsi), %eax
-; BDVER-NEXT:    addl %ecx, %eax
-; BDVER-NEXT:    retq
+; BDVER2-LABEL: test_x86_tbm_t1mskc_u32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    t1mskcl %edi, %ecx # sched: [1:0.33]
+; BDVER2-NEXT:    t1mskcl (%rsi), %eax # sched: [6:0.50]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_x86_tbm_t1mskc_u32:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    t1mskcl %edi, %ecx
+; BDVER3-NEXT:    t1mskcl (%rsi), %eax
+; BDVER3-NEXT:    addl %ecx, %eax
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_x86_tbm_t1mskc_u32:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    t1mskcl %edi, %ecx
+; BDVER4-NEXT:    t1mskcl (%rsi), %eax
+; BDVER4-NEXT:    addl %ecx, %eax
+; BDVER4-NEXT:    retq
   %a1 = load i32, i32* %p1
   %r0 = xor i32 %a0, -1
   %m0 = xor i32 %a1, -1
@@ -421,12 +663,26 @@ define i64 @test_x86_tbm_t1mskc_u64(i64 %a0, i64* nocapture %p1) nounwind {
 ; GENERIC-NEXT:    addq %rcx, %rax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_x86_tbm_t1mskc_u64:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    t1mskcq %rdi, %rcx
-; BDVER-NEXT:    t1mskcq (%rsi), %rax
-; BDVER-NEXT:    addq %rcx, %rax
-; BDVER-NEXT:    retq
+; BDVER2-LABEL: test_x86_tbm_t1mskc_u64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    t1mskcq %rdi, %rcx # sched: [1:0.33]
+; BDVER2-NEXT:    t1mskcq (%rsi), %rax # sched: [6:0.50]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_x86_tbm_t1mskc_u64:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    t1mskcq %rdi, %rcx
+; BDVER3-NEXT:    t1mskcq (%rsi), %rax
+; BDVER3-NEXT:    addq %rcx, %rax
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_x86_tbm_t1mskc_u64:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    t1mskcq %rdi, %rcx
+; BDVER4-NEXT:    t1mskcq (%rsi), %rax
+; BDVER4-NEXT:    addq %rcx, %rax
+; BDVER4-NEXT:    retq
   %a1 = load i64, i64* %p1
   %r0 = xor i64 %a0, -1
   %m0 = xor i64 %a1, -1
@@ -446,12 +702,26 @@ define i32 @test_x86_tbm_tzmsk_u32(i32 %a0, i32* nocapture %p1) nounwind {
 ; GENERIC-NEXT:    addl %ecx, %eax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_x86_tbm_tzmsk_u32:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    tzmskl %edi, %ecx
-; BDVER-NEXT:    tzmskl (%rsi), %eax
-; BDVER-NEXT:    addl %ecx, %eax
-; BDVER-NEXT:    retq
+; BDVER2-LABEL: test_x86_tbm_tzmsk_u32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    tzmskl %edi, %ecx # sched: [1:0.33]
+; BDVER2-NEXT:    tzmskl (%rsi), %eax # sched: [6:0.50]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_x86_tbm_tzmsk_u32:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    tzmskl %edi, %ecx
+; BDVER3-NEXT:    tzmskl (%rsi), %eax
+; BDVER3-NEXT:    addl %ecx, %eax
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_x86_tbm_tzmsk_u32:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    tzmskl %edi, %ecx
+; BDVER4-NEXT:    tzmskl (%rsi), %eax
+; BDVER4-NEXT:    addl %ecx, %eax
+; BDVER4-NEXT:    retq
   %a1 = load i32, i32* %p1
   %r0 = xor i32 %a0, -1
   %m0 = xor i32 %a1, -1
@@ -471,12 +741,26 @@ define i64 @test_x86_tbm_tzmsk_u64(i64 %a0, i64* nocapture %p1) nounwind {
 ; GENERIC-NEXT:    addq %rcx, %rax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_x86_tbm_tzmsk_u64:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    tzmskq %rdi, %rcx
-; BDVER-NEXT:    tzmskq (%rsi), %rax
-; BDVER-NEXT:    addq %rcx, %rax
-; BDVER-NEXT:    retq
+; BDVER2-LABEL: test_x86_tbm_tzmsk_u64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    tzmskq %rdi, %rcx # sched: [1:0.33]
+; BDVER2-NEXT:    tzmskq (%rsi), %rax # sched: [6:0.50]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.33]
+; BDVER2-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_x86_tbm_tzmsk_u64:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    tzmskq %rdi, %rcx
+; BDVER3-NEXT:    tzmskq (%rsi), %rax
+; BDVER3-NEXT:    addq %rcx, %rax
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_x86_tbm_tzmsk_u64:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    tzmskq %rdi, %rcx
+; BDVER4-NEXT:    tzmskq (%rsi), %rax
+; BDVER4-NEXT:    addq %rcx, %rax
+; BDVER4-NEXT:    retq
   %a1 = load i64, i64* %p1
   %r0 = xor i64 %a0, -1
   %m0 = xor i64 %a1, -1
diff --git a/test/CodeGen/X86/x87-schedule.ll b/test/CodeGen/X86/x87-schedule.ll
index f4f91d82c52..8a338f20748 100644
--- a/test/CodeGen/X86/x87-schedule.ll
+++ b/test/CodeGen/X86/x87-schedule.ll
@@ -8,6 +8,7 @@
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=x86-64 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
 
@@ -68,6 +69,13 @@ define void @test_f2xm1() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_f2xm1:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    f2xm1 # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_f2xm1:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -142,6 +150,13 @@ define void @test_fabs() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fabs:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fabs # sched: [1:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fabs:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -256,6 +271,18 @@ define void @test_fadd(float *%a0, double *%a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fadd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fadd %st(0), %st(1) # sched: [3:1.00]
+; BDVER2-NEXT:    fadd %st(2) # sched: [3:1.00]
+; BDVER2-NEXT:    fadds (%ecx) # sched: [10:1.00]
+; BDVER2-NEXT:    faddl (%eax) # sched: [10:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fadd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -380,6 +407,18 @@ define void @test_faddp_fiadd(i16 *%a0, i32 *%a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_faddp_fiadd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    faddp %st(1) # sched: [3:1.00]
+; BDVER2-NEXT:    faddp %st(2) # sched: [3:1.00]
+; BDVER2-NEXT:    fiadds (%ecx) # sched: [13:2.00]
+; BDVER2-NEXT:    fiaddl (%eax) # sched: [13:2.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_faddp_fiadd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -480,6 +519,15 @@ define void @test_fbld_fbstp(i8* %a0) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fbld_fbstp:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fbld (%eax) # sched: [100:0.33]
+; BDVER2-NEXT:    fbstp (%eax) # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fbld_fbstp:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -558,6 +606,13 @@ define void @test_fchs() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fchs:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fchs # sched: [1:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fchs:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -640,6 +695,14 @@ define void @test_fclex() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fclex:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    wait # sched: [100:0.33]
+; BDVER2-NEXT:    fnclex # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fclex:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -716,6 +779,13 @@ define void @test_fnclex() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fnclex:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fnclex # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fnclex:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -846,6 +916,20 @@ define void @test_fcmov() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fcmov:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fcmovb %st(1), %st(0) # sched: [3:2.00]
+; BDVER2-NEXT:    fcmovbe %st(1), %st(0) # sched: [3:2.00]
+; BDVER2-NEXT:    fcmove %st(1), %st(0) # sched: [3:2.00]
+; BDVER2-NEXT:    fcmovnb %st(1), %st(0) # sched: [3:2.00]
+; BDVER2-NEXT:    fcmovnbe %st(1), %st(0) # sched: [3:2.00]
+; BDVER2-NEXT:    fcmovne %st(1), %st(0) # sched: [3:2.00]
+; BDVER2-NEXT:    fcmovnu %st(1), %st(0) # sched: [3:2.00]
+; BDVER2-NEXT:    fcmovu %st(1), %st(0) # sched: [3:2.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fcmov:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -974,6 +1058,18 @@ define void @test_fcom(float *%a0, double *%a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fcom:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fcom %st(1) # sched: [1:1.00]
+; BDVER2-NEXT:    fcom %st(3) # sched: [1:1.00]
+; BDVER2-NEXT:    fcoms (%ecx) # sched: [8:1.00]
+; BDVER2-NEXT:    fcoml (%eax) # sched: [8:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fcom:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -1106,6 +1202,19 @@ define void @test_fcomp_fcompp(float *%a0, double *%a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fcomp_fcompp:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fcomp %st(1) # sched: [1:1.00]
+; BDVER2-NEXT:    fcomp %st(3) # sched: [1:1.00]
+; BDVER2-NEXT:    fcomps (%ecx) # sched: [8:1.00]
+; BDVER2-NEXT:    fcompl (%eax) # sched: [8:1.00]
+; BDVER2-NEXT:    fcompp # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fcomp_fcompp:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -1200,6 +1309,14 @@ define void @test_fcomi_fcomip() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fcomi_fcomip:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fcomi %st(3) # sched: [3:1.00]
+; BDVER2-NEXT:    fcompi %st(3) # sched: [3:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fcomi_fcomip:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -1276,6 +1393,13 @@ define void @test_fcos() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fcos:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fcos # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fcos:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -1350,6 +1474,13 @@ define void @test_fdecstp() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fdecstp:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fdecstp # sched: [1:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fdecstp:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -1464,6 +1595,18 @@ define void @test_fdiv(float *%a0, double *%a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fdiv:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fdiv %st(0), %st(1) # sched: [14:14.00]
+; BDVER2-NEXT:    fdiv %st(2) # sched: [14:14.00]
+; BDVER2-NEXT:    fdivs (%ecx) # sched: [31:1.00]
+; BDVER2-NEXT:    fdivl (%eax) # sched: [31:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fdiv:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -1588,6 +1731,18 @@ define void @test_fdivp_fidiv(i16 *%a0, i32 *%a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fdivp_fidiv:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fdivp %st(1) # sched: [14:14.00]
+; BDVER2-NEXT:    fdivp %st(2) # sched: [14:14.00]
+; BDVER2-NEXT:    fidivs (%ecx) # sched: [34:1.00]
+; BDVER2-NEXT:    fidivl (%eax) # sched: [34:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fdivp_fidiv:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -1712,6 +1867,18 @@ define void @test_fdivr(float *%a0, double *%a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fdivr:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fdivr %st(0), %st(1) # sched: [14:14.00]
+; BDVER2-NEXT:    fdivr %st(2) # sched: [14:14.00]
+; BDVER2-NEXT:    fdivrs (%ecx) # sched: [31:1.00]
+; BDVER2-NEXT:    fdivrl (%eax) # sched: [31:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fdivr:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -1836,6 +2003,18 @@ define void @test_fdivrp_fidivr(i16 *%a0, i32 *%a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fdivrp_fidivr:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fdivrp %st(1) # sched: [14:14.00]
+; BDVER2-NEXT:    fdivrp %st(2) # sched: [14:14.00]
+; BDVER2-NEXT:    fidivrs (%ecx) # sched: [34:1.00]
+; BDVER2-NEXT:    fidivrl (%eax) # sched: [34:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fdivrp_fidivr:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -1920,6 +2099,13 @@ define void @test_ffree() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_ffree:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    ffree %st(0) # sched: [1:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_ffree:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -2034,6 +2220,18 @@ define void @test_ficom(i16 *%a0, i32 *%a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_ficom:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    ficoms (%ecx) # sched: [11:2.00]
+; BDVER2-NEXT:    ficoml (%eax) # sched: [11:2.00]
+; BDVER2-NEXT:    ficomps (%ecx) # sched: [11:2.00]
+; BDVER2-NEXT:    ficompl (%eax) # sched: [11:2.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_ficom:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -2158,6 +2356,18 @@ define void @test_fild(i16 *%a0, i32 *%a1, i64 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fild:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    filds (%edx) # sched: [10:1.00]
+; BDVER2-NEXT:    fildl (%ecx) # sched: [10:1.00]
+; BDVER2-NEXT:    fildll (%eax) # sched: [10:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fild:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -2242,6 +2452,13 @@ define void @test_fincstp() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fincstp:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fincstp # sched: [1:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fincstp:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -2324,6 +2541,14 @@ define void @test_finit() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_finit:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    wait # sched: [100:0.33]
+; BDVER2-NEXT:    fninit # sched: [5:1.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_finit:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -2400,6 +2625,13 @@ define void @test_fninit() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fninit:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fninit # sched: [5:1.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fninit:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -2554,6 +2786,23 @@ define void @test_fist_fistp_fisttp(i16* %a0, i32* %a1, i64 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fist_fistp_fisttp:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fists (%edx) # sched: [9:1.00]
+; BDVER2-NEXT:    fistl (%ecx) # sched: [9:1.00]
+; BDVER2-NEXT:    fistps (%edx) # sched: [9:1.00]
+; BDVER2-NEXT:    fistpl (%ecx) # sched: [9:1.00]
+; BDVER2-NEXT:    fistpll (%eax) # sched: [9:1.00]
+; BDVER2-NEXT:    fisttps (%edx) # sched: [5:1.00]
+; BDVER2-NEXT:    fisttpl (%ecx) # sched: [5:1.00]
+; BDVER2-NEXT:    fisttpll (%eax) # sched: [5:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fist_fistp_fisttp:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -2696,6 +2945,19 @@ define void @test_fld(i16* %a0, i32* %a1, i64 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fld:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fld %st(0) # sched: [1:1.00]
+; BDVER2-NEXT:    flds (%edx) # sched: [9:1.00]
+; BDVER2-NEXT:    fldl (%ecx) # sched: [9:1.00]
+; BDVER2-NEXT:    fldt (%eax) # sched: [9:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fld:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -2798,6 +3060,15 @@ define void @test_fldcw_fldenv(i8* %a0) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fldcw_fldenv:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fldcw (%eax) # sched: [8:2.00]
+; BDVER2-NEXT:    fldenv (%eax) # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fldcw_fldenv:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -2924,6 +3195,19 @@ define void @test_fld1_fldl2e_fldl2t_fldlg2_fldln2_fldpi_fldz() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fld1_fldl2e_fldl2t_fldlg2_fldln2_fldpi_fldz:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fld1 # sched: [1:1.00]
+; BDVER2-NEXT:    fldl2e # sched: [1:1.00]
+; BDVER2-NEXT:    fldl2t # sched: [1:1.00]
+; BDVER2-NEXT:    fldlg2 # sched: [1:1.00]
+; BDVER2-NEXT:    fldln2 # sched: [1:1.00]
+; BDVER2-NEXT:    fldpi # sched: [1:1.00]
+; BDVER2-NEXT:    fldz # sched: [1:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fld1_fldl2e_fldl2t_fldlg2_fldln2_fldpi_fldz:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -3050,6 +3334,18 @@ define void @test_fmul(float *%a0, double *%a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fmul:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fmul %st(0), %st(1) # sched: [5:1.00]
+; BDVER2-NEXT:    fmul %st(2) # sched: [5:1.00]
+; BDVER2-NEXT:    fmuls (%ecx) # sched: [12:1.00]
+; BDVER2-NEXT:    fmull (%eax) # sched: [12:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fmul:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -3174,6 +3470,18 @@ define void @test_fmulp_fimul(i16 *%a0, i32 *%a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fmulp_fimul:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fmulp %st(1) # sched: [5:1.00]
+; BDVER2-NEXT:    fmulp %st(2) # sched: [5:1.00]
+; BDVER2-NEXT:    fimuls (%ecx) # sched: [15:1.00]
+; BDVER2-NEXT:    fimull (%eax) # sched: [15:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fmulp_fimul:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -3258,6 +3566,13 @@ define void @test_fnop() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fnop:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fnop # sched: [1:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fnop:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -3332,6 +3647,13 @@ define void @test_fpatan() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fpatan:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fpatan # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fpatan:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -3414,6 +3736,14 @@ define void @test_fprem_fprem1() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fprem_fprem1:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fprem # sched: [100:0.33]
+; BDVER2-NEXT:    fprem1 # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fprem_fprem1:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -3490,6 +3820,13 @@ define void @test_fptan() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fptan:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fptan # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fptan:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -3564,6 +3901,13 @@ define void @test_frndint() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_frndint:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    frndint # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_frndint:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -3646,6 +3990,14 @@ define void @test_frstor(i8* %a0) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_frstor:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    frstor (%eax) # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_frstor:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -3738,6 +4090,15 @@ define void @test_fsave(i8* %a0) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fsave:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    wait # sched: [100:0.33]
+; BDVER2-NEXT:    fnsave (%eax) # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fsave:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -3824,6 +4185,14 @@ define void @test_fnsave(i8* %a0) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fnsave:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fnsave (%eax) # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fnsave:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -3900,6 +4269,13 @@ define void @test_fscale() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fscale:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fscale # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fscale:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -3974,6 +4350,13 @@ define void @test_fsin() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fsin:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fsin # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fsin:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -4048,6 +4431,13 @@ define void @test_fsincos() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fsincos:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fsincos # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fsincos:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -4122,6 +4512,13 @@ define void @test_fsqrt() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fsqrt:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fsqrt # sched: [24:24.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fsqrt:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -4268,6 +4665,22 @@ define void @test_fst_fstp(i16* %a0, i32* %a1, i64 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fst_fstp:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fst %st(0) # sched: [1:1.00]
+; BDVER2-NEXT:    fsts (%edx) # sched: [6:1.00]
+; BDVER2-NEXT:    fstl (%ecx) # sched: [6:1.00]
+; BDVER2-NEXT:    fstp %st(0) # sched: [1:1.00]
+; BDVER2-NEXT:    fstpl (%edx) # sched: [6:1.00]
+; BDVER2-NEXT:    fstpl (%ecx) # sched: [6:1.00]
+; BDVER2-NEXT:    fstpt (%eax) # sched: [6:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fst_fstp:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -4408,6 +4821,19 @@ define void @test_fstcw_fstenv_fstsw(i8* %a0) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fstcw_fstenv_fstsw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    wait # sched: [100:0.33]
+; BDVER2-NEXT:    fnstcw (%eax) # sched: [7:1.00]
+; BDVER2-NEXT:    wait # sched: [100:0.33]
+; BDVER2-NEXT:    fnstenv (%eax) # sched: [100:0.33]
+; BDVER2-NEXT:    wait # sched: [100:0.33]
+; BDVER2-NEXT:    fnstsw (%eax) # sched: [7:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fstcw_fstenv_fstsw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -4518,6 +4944,16 @@ define void @test_fnstcw_fnstenv_fnstsw(i8* %a0) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fnstcw_fnstenv_fnstsw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fnstcw (%eax) # sched: [7:1.00]
+; BDVER2-NEXT:    fnstenv (%eax) # sched: [100:0.33]
+; BDVER2-NEXT:    fnstsw (%eax) # sched: [7:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fnstcw_fnstenv_fnstsw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -4638,6 +5074,18 @@ define void @test_fsub(float *%a0, double *%a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fsub:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fsub %st(0), %st(1) # sched: [3:1.00]
+; BDVER2-NEXT:    fsub %st(2) # sched: [3:1.00]
+; BDVER2-NEXT:    fsubs (%ecx) # sched: [10:1.00]
+; BDVER2-NEXT:    fsubl (%eax) # sched: [10:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fsub:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -4762,6 +5210,18 @@ define void @test_fsubp_fisub(i16 *%a0, i32 *%a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fsubp_fisub:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fsubp %st(1) # sched: [3:1.00]
+; BDVER2-NEXT:    fsubp %st(2) # sched: [3:1.00]
+; BDVER2-NEXT:    fisubs (%ecx) # sched: [13:2.00]
+; BDVER2-NEXT:    fisubl (%eax) # sched: [13:2.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fsubp_fisub:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -4886,6 +5346,18 @@ define void @test_fsubr(float *%a0, double *%a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fsubr:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fsubr %st(0), %st(1) # sched: [3:1.00]
+; BDVER2-NEXT:    fsubr %st(2) # sched: [3:1.00]
+; BDVER2-NEXT:    fsubrs (%ecx) # sched: [10:1.00]
+; BDVER2-NEXT:    fsubrl (%eax) # sched: [10:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fsubr:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -5010,6 +5482,18 @@ define void @test_fsubrp_fisubr(i16 *%a0, i32 *%a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fsubrp_fisubr:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fsubrp %st(1) # sched: [3:1.00]
+; BDVER2-NEXT:    fsubrp %st(2) # sched: [3:1.00]
+; BDVER2-NEXT:    fisubrs (%ecx) # sched: [13:2.00]
+; BDVER2-NEXT:    fisubrl (%eax) # sched: [13:2.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fsubrp_fisubr:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -5094,6 +5578,13 @@ define void @test_ftst() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_ftst:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    ftst # sched: [3:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_ftst:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -5200,6 +5691,17 @@ define void @test_fucom_fucomp_fucompp() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fucom_fucomp_fucompp:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fucom %st(1) # sched: [1:1.00]
+; BDVER2-NEXT:    fucom %st(3) # sched: [1:1.00]
+; BDVER2-NEXT:    fucomp %st(1) # sched: [1:1.00]
+; BDVER2-NEXT:    fucomp %st(3) # sched: [1:1.00]
+; BDVER2-NEXT:    fucompp # sched: [3:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fucom_fucomp_fucompp:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -5290,6 +5792,14 @@ define void @test_fucomi_fucomip() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fucomi_fucomip:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fucomi %st(3) # sched: [3:1.00]
+; BDVER2-NEXT:    fucompi %st(3) # sched: [3:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fucomi_fucomip:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -5366,6 +5876,13 @@ define void @test_fwait() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fwait:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    wait # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fwait:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -5440,6 +5957,13 @@ define void @test_fxam() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fxam:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fxam # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fxam:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -5522,6 +6046,14 @@ define void @test_fxch() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fxch:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fxch %st(1) # sched: [1:0.33]
+; BDVER2-NEXT:    fxch %st(3) # sched: [1:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fxch:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -5614,6 +6146,15 @@ define void @test_fxrstor_fxsave(i8* %a0) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fxrstor_fxsave:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fxrstor (%eax) # sched: [5:2.00]
+; BDVER2-NEXT:    fxsave (%eax) # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fxrstor_fxsave:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -5692,6 +6233,13 @@ define void @test_fxtract() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fxtract:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fxtract # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fxtract:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -5766,6 +6314,13 @@ define void @test_fyl2x() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fyl2x:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fyl2x # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fyl2x:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -5840,6 +6395,13 @@ define void @test_fyl2xp1() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fyl2xp1:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fyl2xp1 # sched: [100:0.33]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [6:1.00]
+;
 ; BTVER2-LABEL: test_fyl2xp1:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
diff --git a/test/CodeGen/X86/xop-schedule.ll b/test/CodeGen/X86/xop-schedule.ll
index 9a314e2327b..ffa3152f926 100644
--- a/test/CodeGen/X86/xop-schedule.ll
+++ b/test/CodeGen/X86/xop-schedule.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+xop | FileCheck %s --check-prefix=GENERIC
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver1 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+xop | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER12 --check-prefix=BDVER1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+xop | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER12 --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver3 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER3
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver4 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER4
 
@@ -17,16 +17,38 @@ define void @test_vfrczpd(<2 x double> %a0, <4 x double> %a1, <2 x double> *%a2,
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfrczpd:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfrczpd %xmm0, %xmm0
-; BDVER-NEXT:    vfrczpd %ymm1, %ymm1
-; BDVER-NEXT:    vfrczpd (%rdi), %xmm0
-; BDVER-NEXT:    vfrczpd (%rsi), %ymm1
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    vzeroupper
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfrczpd:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfrczpd %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER12-NEXT:    vfrczpd %ymm1, %ymm1 # sched: [3:1.00]
+; BDVER12-NEXT:    vfrczpd (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER12-NEXT:    vfrczpd (%rsi), %ymm1 # sched: [10:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    vzeroupper # sched: [100:0.33]
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_vfrczpd:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vfrczpd %xmm0, %xmm0
+; BDVER3-NEXT:    vfrczpd %ymm1, %ymm1
+; BDVER3-NEXT:    vfrczpd (%rdi), %xmm0
+; BDVER3-NEXT:    vfrczpd (%rsi), %ymm1
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    vzeroupper
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vfrczpd:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vfrczpd %xmm0, %xmm0
+; BDVER4-NEXT:    vfrczpd %ymm1, %ymm1
+; BDVER4-NEXT:    vfrczpd (%rdi), %xmm0
+; BDVER4-NEXT:    vfrczpd (%rsi), %ymm1
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    vzeroupper
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vfrczpd $0, $0 \0a\09 vfrczpd $1, $1 \0a\09 vfrczpd $2, $0 \0a\09 vfrczpd $3, $1", "x,x,*m,*m"(<2 x double> %a0, <4 x double> %a1, <2 x double> *%a2, <4 x double> *%a3)
   ret void
 }
@@ -43,16 +65,38 @@ define void @test_vfrczps(<4 x float> %a0, <4 x double> %a1, <4 x float> *%a2, <
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfrczps:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfrczps %xmm0, %xmm0
-; BDVER-NEXT:    vfrczps %ymm1, %ymm1
-; BDVER-NEXT:    vfrczps (%rdi), %xmm0
-; BDVER-NEXT:    vfrczps (%rsi), %ymm1
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    vzeroupper
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfrczps:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfrczps %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER12-NEXT:    vfrczps %ymm1, %ymm1 # sched: [3:1.00]
+; BDVER12-NEXT:    vfrczps (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER12-NEXT:    vfrczps (%rsi), %ymm1 # sched: [10:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    vzeroupper # sched: [100:0.33]
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_vfrczps:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vfrczps %xmm0, %xmm0
+; BDVER3-NEXT:    vfrczps %ymm1, %ymm1
+; BDVER3-NEXT:    vfrczps (%rdi), %xmm0
+; BDVER3-NEXT:    vfrczps (%rsi), %ymm1
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    vzeroupper
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vfrczps:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vfrczps %xmm0, %xmm0
+; BDVER4-NEXT:    vfrczps %ymm1, %ymm1
+; BDVER4-NEXT:    vfrczps (%rdi), %xmm0
+; BDVER4-NEXT:    vfrczps (%rsi), %ymm1
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    vzeroupper
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vfrczps $0, $0 \0a\09 vfrczps $1, $1 \0a\09 vfrczps $2, $0 \0a\09 vfrczps $3, $1", "x,x,*m,*m"(<4 x float> %a0, <4 x double> %a1, <4 x float> *%a2, <4 x double> *%a3)
   ret void
 }
@@ -66,13 +110,29 @@ define void @test_vfrczsd(<2 x double> %a0, <2 x double> *%a1) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfrczsd:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfrczsd %xmm0, %xmm0
-; BDVER-NEXT:    vfrczsd (%rdi), %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfrczsd:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfrczsd %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER12-NEXT:    vfrczsd (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_vfrczsd:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vfrczsd %xmm0, %xmm0
+; BDVER3-NEXT:    vfrczsd (%rdi), %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vfrczsd:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vfrczsd %xmm0, %xmm0
+; BDVER4-NEXT:    vfrczsd (%rdi), %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vfrczsd $0, $0 \0a\09 vfrczsd $1, $0", "x,*m"(<2 x double> %a0, <2 x double> *%a1)
   ret void
 }
@@ -86,13 +146,29 @@ define void @test_vfrczss(<4 x float> %a0, <4 x double> *%a1) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfrczss:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfrczss %xmm0, %xmm0
-; BDVER-NEXT:    vfrczss (%rdi), %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfrczss:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfrczss %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER12-NEXT:    vfrczss (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_vfrczss:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vfrczss %xmm0, %xmm0
+; BDVER3-NEXT:    vfrczss (%rdi), %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vfrczss:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vfrczss %xmm0, %xmm0
+; BDVER4-NEXT:    vfrczss (%rdi), %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vfrczss $0, $0 \0a\09 vfrczss $1, $0", "x,*m"(<4 x float> %a0, <4 x double> *%a1)
   ret void
 }
@@ -107,14 +183,32 @@ define void @test_vpcmov_128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpcmov_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpcmov (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpcmov %xmm2, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpcmov_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER12-NEXT:    vpcmov (%rdi), %xmm1, %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    vpcmov %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_vpcmov_128:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpcmov (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpcmov %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpcmov_128:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpcmov (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpcmov %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpcmov $2, $1, $0, $0 \0a\09 vpcmov $3, $1, $0, $0 \0a\09 vpcmov $2, $3, $0, $0", "x,x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3)
   ret void
 }
@@ -130,15 +224,35 @@ define void @test_vpcmov_256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2, <4 x i
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpcmov_256:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpcmov %ymm2, %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vpcmov (%rdi), %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vpcmov %ymm2, (%rdi), %ymm0, %ymm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    vzeroupper
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpcmov_256:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpcmov %ymm2, %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; BDVER12-NEXT:    vpcmov (%rdi), %ymm1, %ymm0, %ymm0 # sched: [8:1.00]
+; BDVER12-NEXT:    vpcmov %ymm2, (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    vzeroupper # sched: [100:0.33]
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_vpcmov_256:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER3-NEXT:    vpcmov (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER3-NEXT:    vpcmov %ymm2, (%rdi), %ymm0, %ymm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    vzeroupper
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpcmov_256:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER4-NEXT:    vpcmov (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER4-NEXT:    vpcmov %ymm2, (%rdi), %ymm0, %ymm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    vzeroupper
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpcmov $2, $1, $0, $0 \0a\09 vpcmov $3, $1, $0, $0 \0a\09 vpcmov $2, $3, $0, $0", "x,x,x,*m"(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2, <4 x i64> *%a3)
   ret void
 }
@@ -158,19 +272,47 @@ define void @test_vpcom(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpcom:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpcomb $3, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpcomd $3, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpcomq $3, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpcomw $3, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpcomb $3, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    vpcomd $3, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    vpcomq $3, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    vpcomw $3, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpcom:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpcomb $3, %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER12-NEXT:    vpcomd $3, %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER12-NEXT:    vpcomq $3, %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER12-NEXT:    vpcomw $3, %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER12-NEXT:    vpcomb $3, (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    vpcomd $3, (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    vpcomq $3, (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    vpcomw $3, (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_vpcom:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpcomb $3, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpcomd $3, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpcomq $3, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpcomw $3, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpcomb $3, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    vpcomd $3, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    vpcomq $3, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    vpcomw $3, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpcom:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpcomb $3, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpcomd $3, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpcomq $3, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpcomw $3, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpcomb $3, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    vpcomd $3, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    vpcomq $3, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    vpcomw $3, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpcomb $3, $1, $0, $0 \0a\09 vpcomd $3, $1, $0, $0 \0a\09 vpcomq $3, $1, $0, $0 \0a\09 vpcomw $3, $1, $0, $0 \0a\09 vpcomb $3, $2, $0, $0 \0a\09 vpcomd $3, $2, $0, $0 \0a\09 vpcomq $3, $2, $0, $0 \0a\09 vpcomw $3, $2, $0, $0", "x,x,*m,i"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2, i8 3)
   ret void
 }
@@ -190,19 +332,47 @@ define void @test_vpcomu(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpcomu:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpcomub $3, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpcomud $3, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpcomuq $3, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpcomuw $3, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpcomub $3, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    vpcomud $3, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    vpcomuq $3, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    vpcomuw $3, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpcomu:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpcomub $3, %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER12-NEXT:    vpcomud $3, %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER12-NEXT:    vpcomuq $3, %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER12-NEXT:    vpcomuw $3, %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER12-NEXT:    vpcomub $3, (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    vpcomud $3, (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    vpcomuq $3, (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    vpcomuw $3, (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_vpcomu:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpcomub $3, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpcomud $3, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpcomuq $3, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpcomuw $3, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpcomub $3, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    vpcomud $3, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    vpcomuq $3, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    vpcomuw $3, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpcomu:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpcomub $3, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpcomud $3, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpcomuq $3, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpcomuw $3, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpcomub $3, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    vpcomud $3, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    vpcomuq $3, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    vpcomuw $3, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpcomub $3, $1, $0, $0 \0a\09 vpcomud $3, $1, $0, $0 \0a\09 vpcomuq $3, $1, $0, $0 \0a\09 vpcomuw $3, $1, $0, $0 \0a\09 vpcomub $3, $2, $0, $0 \0a\09 vpcomud $3, $2, $0, $0 \0a\09 vpcomuq $3, $2, $0, $0 \0a\09 vpcomuw $3, $2, $0, $0", "x,x,*m,i"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2, i8 3)
   ret void
 }
@@ -217,14 +387,32 @@ define void @test_vpermil2pd_128(<2 x double> %a0, <2 x double> %a1, <2 x double
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpermil2pd_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpermil2pd $3, %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpermil2pd $3, %xmm2, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    vpermil2pd $3, (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpermil2pd_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpermil2pd $3, %xmm2, %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; BDVER12-NEXT:    vpermil2pd $3, %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER12-NEXT:    vpermil2pd $3, (%rdi), %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_vpermil2pd_128:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpermil2pd $3, %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpermil2pd $3, %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    vpermil2pd $3, (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpermil2pd_128:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpermil2pd $3, %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpermil2pd $3, %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    vpermil2pd $3, (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpermil2pd $4, $2, $1, $0, $0 \0a\09 vpermil2pd $4, $2, $3, $0, $0 \0a\09 vpermil2pd $4, $3, $1, $0, $0", "x,x,x,*m,i"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3, i8 3)
   ret void
 }
@@ -240,15 +428,35 @@ define void @test_vpermil2pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpermil2pd_256:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpermil2pd $3, %ymm2, %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vpermil2pd $3, %ymm2, (%rdi), %ymm0, %ymm0
-; BDVER-NEXT:    vpermil2pd $3, (%rdi), %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    vzeroupper
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpermil2pd_256:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpermil2pd $3, %ymm2, %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; BDVER12-NEXT:    vpermil2pd $3, %ymm2, (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BDVER12-NEXT:    vpermil2pd $3, (%rdi), %ymm1, %ymm0, %ymm0 # sched: [8:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    vzeroupper # sched: [100:0.33]
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_vpermil2pd_256:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpermil2pd $3, %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER3-NEXT:    vpermil2pd $3, %ymm2, (%rdi), %ymm0, %ymm0
+; BDVER3-NEXT:    vpermil2pd $3, (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    vzeroupper
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpermil2pd_256:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpermil2pd $3, %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER4-NEXT:    vpermil2pd $3, %ymm2, (%rdi), %ymm0, %ymm0
+; BDVER4-NEXT:    vpermil2pd $3, (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    vzeroupper
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpermil2pd $4, $2, $1, $0, $0 \0a\09 vpermil2pd $4, $2, $3, $0, $0 \0a\09 vpermil2pd $4, $3, $1, $0, $0", "x,x,x,*m,i"(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3, i8 3)
   ret void
 }
@@ -263,14 +471,32 @@ define void @test_vpermil2ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpermil2ps_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpermil2ps $3, %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpermil2ps $3, %xmm2, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    vpermil2ps $3, (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpermil2ps_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpermil2ps $3, %xmm2, %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; BDVER12-NEXT:    vpermil2ps $3, %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER12-NEXT:    vpermil2ps $3, (%rdi), %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_vpermil2ps_128:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpermil2ps $3, %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpermil2ps $3, %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    vpermil2ps $3, (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpermil2ps_128:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpermil2ps $3, %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpermil2ps $3, %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    vpermil2ps $3, (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpermil2ps $4, $2, $1, $0, $0 \0a\09 vpermil2ps $4, $2, $3, $0, $0 \0a\09 vpermil2ps $4, $3, $1, $0, $0", "x,x,x,*m,i"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3, i8 3)
   ret void
 }
@@ -286,15 +512,35 @@ define void @test_vpermil2ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpermil2ps_256:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpermil2ps $3, %ymm2, %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vpermil2ps $3, %ymm2, (%rdi), %ymm0, %ymm0
-; BDVER-NEXT:    vpermil2ps $3, (%rdi), %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    vzeroupper
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpermil2ps_256:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpermil2ps $3, %ymm2, %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; BDVER12-NEXT:    vpermil2ps $3, %ymm2, (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BDVER12-NEXT:    vpermil2ps $3, (%rdi), %ymm1, %ymm0, %ymm0 # sched: [8:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    vzeroupper # sched: [100:0.33]
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_vpermil2ps_256:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpermil2ps $3, %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER3-NEXT:    vpermil2ps $3, %ymm2, (%rdi), %ymm0, %ymm0
+; BDVER3-NEXT:    vpermil2ps $3, (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    vzeroupper
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpermil2ps_256:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpermil2ps $3, %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER4-NEXT:    vpermil2ps $3, %ymm2, (%rdi), %ymm0, %ymm0
+; BDVER4-NEXT:    vpermil2ps $3, (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    vzeroupper
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpermil2ps $4, $2, $1, $0, $0 \0a\09 vpermil2ps $4, $2, $3, $0, $0 \0a\09 vpermil2ps $4, $3, $1, $0, $0", "x,x,x,*m,i"(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3, i8 3)
   ret void
 }
@@ -308,13 +554,29 @@ define void @test_vphaddbd(<2 x i64> %a0, <2 x i64> *%a1) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vphaddbd:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vphaddbd %xmm0, %xmm0
-; BDVER-NEXT:    vphaddbd (%rdi), %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vphaddbd:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vphaddbd %xmm0, %xmm0 # sched: [3:1.50]
+; BDVER12-NEXT:    vphaddbd (%rdi), %xmm0 # sched: [9:1.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_vphaddbd:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vphaddbd %xmm0, %xmm0
+; BDVER3-NEXT:    vphaddbd (%rdi), %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vphaddbd:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vphaddbd %xmm0, %xmm0
+; BDVER4-NEXT:    vphaddbd (%rdi), %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vphaddbd $0, $0 \0a\09 vphaddbd $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
   ret void
 }
@@ -328,13 +590,29 @@ define void @test_vphaddbq(<2 x i64> %a0, <2 x i64> *%a1) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vphaddbq:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vphaddbq %xmm0, %xmm0
-; BDVER-NEXT:    vphaddbq (%rdi), %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vphaddbq:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vphaddbq %xmm0, %xmm0 # sched: [3:1.50]
+; BDVER12-NEXT:    vphaddbq (%rdi), %xmm0 # sched: [9:1.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_vphaddbq:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vphaddbq %xmm0, %xmm0
+; BDVER3-NEXT:    vphaddbq (%rdi), %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vphaddbq:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vphaddbq %xmm0, %xmm0
+; BDVER4-NEXT:    vphaddbq (%rdi), %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vphaddbq $0, $0 \0a\09 vphaddbq $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
   ret void
 }
@@ -348,13 +626,29 @@ define void @test_vphaddbw(<2 x i64> %a0, <2 x i64> *%a1) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vphaddbw:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vphaddbw %xmm0, %xmm0
-; BDVER-NEXT:    vphaddbw (%rdi), %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vphaddbw:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vphaddbw %xmm0, %xmm0 # sched: [3:1.50]
+; BDVER12-NEXT:    vphaddbw (%rdi), %xmm0 # sched: [9:1.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_vphaddbw:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vphaddbw %xmm0, %xmm0
+; BDVER3-NEXT:    vphaddbw (%rdi), %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vphaddbw:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vphaddbw %xmm0, %xmm0
+; BDVER4-NEXT:    vphaddbw (%rdi), %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vphaddbw $0, $0 \0a\09 vphaddbw $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
   ret void
 }
@@ -368,13 +662,29 @@ define void @test_vphadddq(<2 x i64> %a0, <2 x i64> *%a1) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vphadddq:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vphadddq %xmm0, %xmm0
-; BDVER-NEXT:    vphadddq (%rdi), %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vphadddq:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vphadddq %xmm0, %xmm0 # sched: [3:1.50]
+; BDVER12-NEXT:    vphadddq (%rdi), %xmm0 # sched: [9:1.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_vphadddq:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vphadddq %xmm0, %xmm0
+; BDVER3-NEXT:    vphadddq (%rdi), %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vphadddq:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vphadddq %xmm0, %xmm0
+; BDVER4-NEXT:    vphadddq (%rdi), %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vphadddq $0, $0 \0a\09 vphadddq $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
   ret void
 }
@@ -388,13 +698,29 @@ define void @test_vphaddubd(<2 x i64> %a0, <2 x i64> *%a1) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vphaddubd:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vphaddubd %xmm0, %xmm0
-; BDVER-NEXT:    vphaddubd (%rdi), %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vphaddubd:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vphaddubd %xmm0, %xmm0 # sched: [3:1.50]
+; BDVER12-NEXT:    vphaddubd (%rdi), %xmm0 # sched: [9:1.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_vphaddubd:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vphaddubd %xmm0, %xmm0
+; BDVER3-NEXT:    vphaddubd (%rdi), %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vphaddubd:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vphaddubd %xmm0, %xmm0
+; BDVER4-NEXT:    vphaddubd (%rdi), %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vphaddubd $0, $0 \0a\09 vphaddubd $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
   ret void
 }
@@ -408,13 +734,29 @@ define void @test_vphaddubq(<2 x i64> %a0, <2 x i64> *%a1) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vphaddubq:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vphaddubq %xmm0, %xmm0
-; BDVER-NEXT:    vphaddubq (%rdi), %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vphaddubq:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vphaddubq %xmm0, %xmm0 # sched: [3:1.50]
+; BDVER12-NEXT:    vphaddubq (%rdi), %xmm0 # sched: [9:1.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_vphaddubq:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vphaddubq %xmm0, %xmm0
+; BDVER3-NEXT:    vphaddubq (%rdi), %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vphaddubq:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vphaddubq %xmm0, %xmm0
+; BDVER4-NEXT:    vphaddubq (%rdi), %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vphaddubq $0, $0 \0a\09 vphaddubq $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
   ret void
 }
@@ -428,13 +770,29 @@ define void @test_vphaddubw(<2 x i64> %a0, <2 x i64> *%a1) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vphaddubw:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vphaddubw %xmm0, %xmm0
-; BDVER-NEXT:    vphaddubw (%rdi), %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vphaddubw:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vphaddubw %xmm0, %xmm0 # sched: [3:1.50]
+; BDVER12-NEXT:    vphaddubw (%rdi), %xmm0 # sched: [9:1.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_vphaddubw:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vphaddubw %xmm0, %xmm0
+; BDVER3-NEXT:    vphaddubw (%rdi), %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vphaddubw:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vphaddubw %xmm0, %xmm0
+; BDVER4-NEXT:    vphaddubw (%rdi), %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vphaddubw $0, $0 \0a\09 vphaddubw $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
   ret void
 }
@@ -448,13 +806,29 @@ define void @test_vphaddudq(<2 x i64> %a0, <2 x i64> *%a1) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vphaddudq:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vphaddudq %xmm0, %xmm0
-; BDVER-NEXT:    vphaddudq (%rdi), %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vphaddudq:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vphaddudq %xmm0, %xmm0 # sched: [3:1.50]
+; BDVER12-NEXT:    vphaddudq (%rdi), %xmm0 # sched: [9:1.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_vphaddudq:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vphaddudq %xmm0, %xmm0
+; BDVER3-NEXT:    vphaddudq (%rdi), %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vphaddudq:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vphaddudq %xmm0, %xmm0
+; BDVER4-NEXT:    vphaddudq (%rdi), %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vphaddudq $0, $0 \0a\09 vphaddudq $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
   ret void
 }
@@ -468,13 +842,29 @@ define void @test_vphadduwd(<2 x i64> %a0, <2 x i64> *%a1) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vphadduwd:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vphadduwd %xmm0, %xmm0
-; BDVER-NEXT:    vphadduwd (%rdi), %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vphadduwd:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vphadduwd %xmm0, %xmm0 # sched: [3:1.50]
+; BDVER12-NEXT:    vphadduwd (%rdi), %xmm0 # sched: [9:1.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_vphadduwd:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vphadduwd %xmm0, %xmm0
+; BDVER3-NEXT:    vphadduwd (%rdi), %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vphadduwd:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vphadduwd %xmm0, %xmm0
+; BDVER4-NEXT:    vphadduwd (%rdi), %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vphadduwd $0, $0 \0a\09 vphadduwd $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
   ret void
 }
@@ -488,13 +878,29 @@ define void @test_vphadduwq(<2 x i64> %a0, <2 x i64> *%a1) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vphadduwq:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vphadduwq %xmm0, %xmm0
-; BDVER-NEXT:    vphadduwq (%rdi), %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vphadduwq:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vphadduwq %xmm0, %xmm0 # sched: [3:1.50]
+; BDVER12-NEXT:    vphadduwq (%rdi), %xmm0 # sched: [9:1.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_vphadduwq:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vphadduwq %xmm0, %xmm0
+; BDVER3-NEXT:    vphadduwq (%rdi), %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vphadduwq:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vphadduwq %xmm0, %xmm0
+; BDVER4-NEXT:    vphadduwq (%rdi), %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vphadduwq $0, $0 \0a\09 vphadduwq $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
   ret void
 }
@@ -508,13 +914,29 @@ define void @test_vphaddwd(<2 x i64> %a0, <2 x i64> *%a1) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vphaddwd:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vphaddwd %xmm0, %xmm0
-; BDVER-NEXT:    vphaddwd (%rdi), %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vphaddwd:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vphaddwd %xmm0, %xmm0 # sched: [3:1.50]
+; BDVER12-NEXT:    vphaddwd (%rdi), %xmm0 # sched: [9:1.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_vphaddwd:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vphaddwd %xmm0, %xmm0
+; BDVER3-NEXT:    vphaddwd (%rdi), %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vphaddwd:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vphaddwd %xmm0, %xmm0
+; BDVER4-NEXT:    vphaddwd (%rdi), %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vphaddwd $0, $0 \0a\09 vphaddwd $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
   ret void
 }
@@ -528,13 +950,29 @@ define void @test_vphaddwq(<2 x i64> %a0, <2 x i64> *%a1) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vphaddwq:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vphaddwq %xmm0, %xmm0
-; BDVER-NEXT:    vphaddwq (%rdi), %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vphaddwq:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vphaddwq %xmm0, %xmm0 # sched: [3:1.50]
+; BDVER12-NEXT:    vphaddwq (%rdi), %xmm0 # sched: [9:1.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_vphaddwq:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vphaddwq %xmm0, %xmm0
+; BDVER3-NEXT:    vphaddwq (%rdi), %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vphaddwq:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vphaddwq %xmm0, %xmm0
+; BDVER4-NEXT:    vphaddwq (%rdi), %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vphaddwq $0, $0 \0a\09 vphaddwq $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
   ret void
 }
@@ -548,13 +986,29 @@ define void @test_vphsubbw(<2 x i64> %a0, <2 x i64> *%a1) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vphsubbw:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vphsubbw %xmm0, %xmm0
-; BDVER-NEXT:    vphsubbw (%rdi), %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vphsubbw:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vphsubbw %xmm0, %xmm0 # sched: [3:1.50]
+; BDVER12-NEXT:    vphsubbw (%rdi), %xmm0 # sched: [9:1.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_vphsubbw:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vphsubbw %xmm0, %xmm0
+; BDVER3-NEXT:    vphsubbw (%rdi), %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vphsubbw:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vphsubbw %xmm0, %xmm0
+; BDVER4-NEXT:    vphsubbw (%rdi), %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vphsubbw $0, $0 \0a\09 vphsubbw $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
   ret void
 }
@@ -568,13 +1022,29 @@ define void @test_vphsubdq(<2 x i64> %a0, <2 x i64> *%a1) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vphsubdq:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vphsubdq %xmm0, %xmm0
-; BDVER-NEXT:    vphsubdq (%rdi), %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vphsubdq:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vphsubdq %xmm0, %xmm0 # sched: [3:1.50]
+; BDVER12-NEXT:    vphsubdq (%rdi), %xmm0 # sched: [9:1.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_vphsubdq:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vphsubdq %xmm0, %xmm0
+; BDVER3-NEXT:    vphsubdq (%rdi), %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vphsubdq:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vphsubdq %xmm0, %xmm0
+; BDVER4-NEXT:    vphsubdq (%rdi), %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vphsubdq $0, $0 \0a\09 vphsubdq $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
   ret void
 }
@@ -588,13 +1058,29 @@ define void @test_vphsubwd(<2 x i64> %a0, <2 x i64> *%a1) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vphsubwd:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vphsubwd %xmm0, %xmm0
-; BDVER-NEXT:    vphsubwd (%rdi), %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vphsubwd:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vphsubwd %xmm0, %xmm0 # sched: [3:1.50]
+; BDVER12-NEXT:    vphsubwd (%rdi), %xmm0 # sched: [9:1.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_vphsubwd:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vphsubwd %xmm0, %xmm0
+; BDVER3-NEXT:    vphsubwd (%rdi), %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vphsubwd:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vphsubwd %xmm0, %xmm0
+; BDVER4-NEXT:    vphsubwd (%rdi), %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vphsubwd $0, $0 \0a\09 vphsubwd $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
   ret void
 }
@@ -608,13 +1094,29 @@ define void @test_vpmacsdd(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpmacsdd:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpmacsdd %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpmacsdd %xmm2, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpmacsdd:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpmacsdd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER12-NEXT:    vpmacsdd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_vpmacsdd:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpmacsdd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpmacsdd %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpmacsdd:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpmacsdd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpmacsdd %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpmacsdd $2, $1, $0, $0 \0a\09 vpmacsdd $2, $3, $0, $0", "x,x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3)
   ret void
 }
@@ -628,13 +1130,29 @@ define void @test_vpmacsdqh(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i6
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpmacsdqh:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpmacsdqh %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpmacsdqh %xmm2, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpmacsdqh:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpmacsdqh %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER12-NEXT:    vpmacsdqh %xmm2, (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_vpmacsdqh:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpmacsdqh %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpmacsdqh %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpmacsdqh:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpmacsdqh %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpmacsdqh %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpmacsdqh $2, $1, $0, $0 \0a\09 vpmacsdqh $2, $3, $0, $0", "x,x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3)
   ret void
 }
@@ -648,13 +1166,29 @@ define void @test_vpmacsdql(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i6
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpmacsdql:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpmacsdql %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpmacsdql %xmm2, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpmacsdql:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpmacsdql %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER12-NEXT:    vpmacsdql %xmm2, (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_vpmacsdql:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpmacsdql %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpmacsdql %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpmacsdql:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpmacsdql %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpmacsdql %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpmacsdql $2, $1, $0, $0 \0a\09 vpmacsdql $2, $3, $0, $0", "x,x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3)
   ret void
 }
@@ -668,13 +1202,29 @@ define void @test_vpmacssdd(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i6
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpmacssdd:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpmacssdd %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpmacssdd %xmm2, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpmacssdd:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpmacssdd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER12-NEXT:    vpmacssdd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_vpmacssdd:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpmacssdd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpmacssdd %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpmacssdd:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpmacssdd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpmacssdd %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpmacssdd $2, $1, $0, $0 \0a\09 vpmacssdd $2, $3, $0, $0", "x,x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3)
   ret void
 }
@@ -688,13 +1238,29 @@ define void @test_vpmacssdqh(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpmacssdqh:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpmacssdqh %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpmacssdqh %xmm2, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpmacssdqh:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpmacssdqh %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER12-NEXT:    vpmacssdqh %xmm2, (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_vpmacssdqh:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpmacssdqh %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpmacssdqh %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpmacssdqh:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpmacssdqh %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpmacssdqh %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpmacssdqh $2, $1, $0, $0 \0a\09 vpmacssdqh $2, $3, $0, $0", "x,x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3)
   ret void
 }
@@ -708,13 +1274,29 @@ define void @test_vpmacssdql(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpmacssdql:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpmacssdql %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpmacssdql %xmm2, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpmacssdql:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpmacssdql %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER12-NEXT:    vpmacssdql %xmm2, (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_vpmacssdql:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpmacssdql %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpmacssdql %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpmacssdql:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpmacssdql %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpmacssdql %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpmacssdql $2, $1, $0, $0 \0a\09 vpmacssdql $2, $3, $0, $0", "x,x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3)
   ret void
 }
@@ -728,13 +1310,29 @@ define void @test_vpmacsswd(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i6
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpmacsswd:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpmacsswd %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpmacsswd %xmm2, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpmacsswd:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpmacsswd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER12-NEXT:    vpmacsswd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_vpmacsswd:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpmacsswd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpmacsswd %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpmacsswd:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpmacsswd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpmacsswd %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpmacsswd $2, $1, $0, $0 \0a\09 vpmacsswd $2, $3, $0, $0", "x,x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3)
   ret void
 }
@@ -748,13 +1346,29 @@ define void @test_vpmacssww(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i6
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpmacssww:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpmacssww %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpmacssww %xmm2, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpmacssww:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpmacssww %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER12-NEXT:    vpmacssww %xmm2, (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_vpmacssww:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpmacssww %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpmacssww %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpmacssww:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpmacssww %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpmacssww %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpmacssww $2, $1, $0, $0 \0a\09 vpmacssww $2, $3, $0, $0", "x,x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3)
   ret void
 }
@@ -768,13 +1382,29 @@ define void @test_vpmacswd(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpmacswd:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpmacswd %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpmacswd %xmm2, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpmacswd:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpmacswd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER12-NEXT:    vpmacswd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_vpmacswd:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpmacswd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpmacswd %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpmacswd:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpmacswd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpmacswd %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpmacswd $2, $1, $0, $0 \0a\09 vpmacswd $2, $3, $0, $0", "x,x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3)
   ret void
 }
@@ -788,13 +1418,29 @@ define void @test_vpmacsww(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpmacsww:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpmacsww %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpmacsww %xmm2, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpmacsww:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpmacsww %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER12-NEXT:    vpmacsww %xmm2, (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_vpmacsww:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpmacsww %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpmacsww %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpmacsww:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpmacsww %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpmacsww %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpmacsww $2, $1, $0, $0 \0a\09 vpmacsww $2, $3, $0, $0", "x,x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3)
   ret void
 }
@@ -808,13 +1454,29 @@ define void @test_vpmadcsswd(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpmadcsswd:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpmadcsswd %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpmadcsswd %xmm2, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpmadcsswd:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpmadcsswd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER12-NEXT:    vpmadcsswd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_vpmadcsswd:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpmadcsswd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpmadcsswd %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpmadcsswd:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpmadcsswd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpmadcsswd %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpmadcsswd $2, $1, $0, $0 \0a\09 vpmadcsswd $2, $3, $0, $0", "x,x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3)
   ret void
 }
@@ -828,13 +1490,29 @@ define void @test_vpmadcswd(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i6
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpmadcswd:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpmadcswd %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpmadcswd %xmm2, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpmadcswd:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpmadcswd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER12-NEXT:    vpmadcswd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_vpmadcswd:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpmadcswd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpmadcswd %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpmadcswd:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpmadcswd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpmadcswd %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpmadcswd $2, $1, $0, $0 \0a\09 vpmadcswd $2, $3, $0, $0", "x,x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3)
   ret void
 }
@@ -849,14 +1527,32 @@ define void @test_vpperm(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64>
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpperm:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpperm (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpperm %xmm2, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpperm:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER12-NEXT:    vpperm (%rdi), %xmm1, %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    vpperm %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_vpperm:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpperm (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpperm %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpperm:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpperm (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpperm %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpperm $2, $1, $0, $0 \0A\09 vpperm $3, $1, $0, $0 \0A\09 vpperm $2, $3, $0, $0", "x,x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3)
   ret void
 }
@@ -888,31 +1584,83 @@ define void @test_vprot(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vprot:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vprotb %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vprotd %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vprotq %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vprotw %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vprotb (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    vprotd (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    vprotq (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    vprotw (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    vprotb %xmm0, (%rdi), %xmm0
-; BDVER-NEXT:    vprotd %xmm0, (%rdi), %xmm0
-; BDVER-NEXT:    vprotq %xmm0, (%rdi), %xmm0
-; BDVER-NEXT:    vprotw %xmm0, (%rdi), %xmm0
-; BDVER-NEXT:    vprotb $7, %xmm0, %xmm0
-; BDVER-NEXT:    vprotd $7, %xmm0, %xmm0
-; BDVER-NEXT:    vprotq $7, %xmm0, %xmm0
-; BDVER-NEXT:    vprotw $7, %xmm0, %xmm0
-; BDVER-NEXT:    vprotb $7, (%rdi), %xmm0
-; BDVER-NEXT:    vprotd $7, (%rdi), %xmm0
-; BDVER-NEXT:    vprotq $7, (%rdi), %xmm0
-; BDVER-NEXT:    vprotw $7, (%rdi), %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vprot:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vprotb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; BDVER12-NEXT:    vprotd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; BDVER12-NEXT:    vprotq %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; BDVER12-NEXT:    vprotw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; BDVER12-NEXT:    vprotb (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER12-NEXT:    vprotd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER12-NEXT:    vprotq (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER12-NEXT:    vprotw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER12-NEXT:    vprotb %xmm0, (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER12-NEXT:    vprotd %xmm0, (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER12-NEXT:    vprotq %xmm0, (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER12-NEXT:    vprotw %xmm0, (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER12-NEXT:    vprotb $7, %xmm0, %xmm0 # sched: [1:1.00]
+; BDVER12-NEXT:    vprotd $7, %xmm0, %xmm0 # sched: [1:1.00]
+; BDVER12-NEXT:    vprotq $7, %xmm0, %xmm0 # sched: [1:1.00]
+; BDVER12-NEXT:    vprotw $7, %xmm0, %xmm0 # sched: [1:1.00]
+; BDVER12-NEXT:    vprotb $7, (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER12-NEXT:    vprotd $7, (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER12-NEXT:    vprotq $7, (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER12-NEXT:    vprotw $7, (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_vprot:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vprotb %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vprotd %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vprotq %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vprotw %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vprotb (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    vprotd (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    vprotq (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    vprotw (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    vprotb %xmm0, (%rdi), %xmm0
+; BDVER3-NEXT:    vprotd %xmm0, (%rdi), %xmm0
+; BDVER3-NEXT:    vprotq %xmm0, (%rdi), %xmm0
+; BDVER3-NEXT:    vprotw %xmm0, (%rdi), %xmm0
+; BDVER3-NEXT:    vprotb $7, %xmm0, %xmm0
+; BDVER3-NEXT:    vprotd $7, %xmm0, %xmm0
+; BDVER3-NEXT:    vprotq $7, %xmm0, %xmm0
+; BDVER3-NEXT:    vprotw $7, %xmm0, %xmm0
+; BDVER3-NEXT:    vprotb $7, (%rdi), %xmm0
+; BDVER3-NEXT:    vprotd $7, (%rdi), %xmm0
+; BDVER3-NEXT:    vprotq $7, (%rdi), %xmm0
+; BDVER3-NEXT:    vprotw $7, (%rdi), %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vprot:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vprotb %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vprotd %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vprotq %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vprotw %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vprotb (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    vprotd (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    vprotq (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    vprotw (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    vprotb %xmm0, (%rdi), %xmm0
+; BDVER4-NEXT:    vprotd %xmm0, (%rdi), %xmm0
+; BDVER4-NEXT:    vprotq %xmm0, (%rdi), %xmm0
+; BDVER4-NEXT:    vprotw %xmm0, (%rdi), %xmm0
+; BDVER4-NEXT:    vprotb $7, %xmm0, %xmm0
+; BDVER4-NEXT:    vprotd $7, %xmm0, %xmm0
+; BDVER4-NEXT:    vprotq $7, %xmm0, %xmm0
+; BDVER4-NEXT:    vprotw $7, %xmm0, %xmm0
+; BDVER4-NEXT:    vprotb $7, (%rdi), %xmm0
+; BDVER4-NEXT:    vprotd $7, (%rdi), %xmm0
+; BDVER4-NEXT:    vprotq $7, (%rdi), %xmm0
+; BDVER4-NEXT:    vprotw $7, (%rdi), %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vprotb $1, $0, $0 \0A\09 vprotd $1, $0, $0 \0A\09 vprotq $1, $0, $0 \0A\09 vprotw $1, $0, $0 \0A\09 vprotb $2, $0, $0 \0A\09 vprotd $2, $0, $0 \0A\09 vprotq $2, $0, $0 \0A\09 vprotw $2, $0, $0 \0A\09 vprotb $0, $2, $0 \0A\09 vprotd $0, $2, $0 \0A\09 vprotq $0, $2, $0 \0A\09 vprotw $0, $2, $0 \0A\09 vprotb $3, $0, $0 \0A\09 vprotd $3, $0, $0 \0A\09 vprotq $3, $0, $0 \0A\09 vprotw $3, $0, $0 \0A\09 vprotb $3, $2, $0 \0A\09 vprotd $3, $2, $0 \0A\09 vprotq $3, $2, $0 \0A\09 vprotw $3, $2, $0", "x,x,*m,i"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2, i8 7)
   ret void
 }
@@ -936,23 +1684,59 @@ define void @test_vpsha(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpsha:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpshab %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpshad %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpshaw %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpshab (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    vpshad (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    vpshaq (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    vpshaw (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    vpshab %xmm0, (%rdi), %xmm0
-; BDVER-NEXT:    vpshad %xmm0, (%rdi), %xmm0
-; BDVER-NEXT:    vpshaq %xmm0, (%rdi), %xmm0
-; BDVER-NEXT:    vpshaw %xmm0, (%rdi), %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpsha:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpshab %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; BDVER12-NEXT:    vpshad %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; BDVER12-NEXT:    vpshaq %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; BDVER12-NEXT:    vpshaw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; BDVER12-NEXT:    vpshab (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER12-NEXT:    vpshad (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER12-NEXT:    vpshaq (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER12-NEXT:    vpshaw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER12-NEXT:    vpshab %xmm0, (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER12-NEXT:    vpshad %xmm0, (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER12-NEXT:    vpshaq %xmm0, (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER12-NEXT:    vpshaw %xmm0, (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_vpsha:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpshab %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpshad %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpshaw %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpshab (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    vpshad (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    vpshaq (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    vpshaw (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    vpshab %xmm0, (%rdi), %xmm0
+; BDVER3-NEXT:    vpshad %xmm0, (%rdi), %xmm0
+; BDVER3-NEXT:    vpshaq %xmm0, (%rdi), %xmm0
+; BDVER3-NEXT:    vpshaw %xmm0, (%rdi), %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpsha:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpshab %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpshad %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpshaw %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpshab (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    vpshad (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    vpshaq (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    vpshaw (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    vpshab %xmm0, (%rdi), %xmm0
+; BDVER4-NEXT:    vpshad %xmm0, (%rdi), %xmm0
+; BDVER4-NEXT:    vpshaq %xmm0, (%rdi), %xmm0
+; BDVER4-NEXT:    vpshaw %xmm0, (%rdi), %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpshab $1, $0, $0 \0A\09 vpshad $1, $0, $0 \0A\09 vpshaq $1, $0, $0 \0A\09 vpshaw $1, $0, $0 \0A\09 vpshab $2, $0, $0 \0A\09 vpshad $2, $0, $0 \0A\09 vpshaq $2, $0, $0 \0A\09 vpshaw $2, $0, $0 \0A\09 vpshab $0, $2, $0 \0A\09 vpshad $0, $2, $0 \0A\09 vpshaq $0, $2, $0 \0A\09 vpshaw $0, $2, $0", "x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2)
   ret void
 }
@@ -976,23 +1760,59 @@ define void @test_vpshl(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpshl:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpshld %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpshlq %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpshlb (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    vpshld (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    vpshlq (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    vpshlw (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    vpshlb %xmm0, (%rdi), %xmm0
-; BDVER-NEXT:    vpshld %xmm0, (%rdi), %xmm0
-; BDVER-NEXT:    vpshlq %xmm0, (%rdi), %xmm0
-; BDVER-NEXT:    vpshlw %xmm0, (%rdi), %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpshl:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpshlb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; BDVER12-NEXT:    vpshld %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; BDVER12-NEXT:    vpshlq %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; BDVER12-NEXT:    vpshlw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; BDVER12-NEXT:    vpshlb (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER12-NEXT:    vpshld (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER12-NEXT:    vpshlq (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER12-NEXT:    vpshlw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER12-NEXT:    vpshlb %xmm0, (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER12-NEXT:    vpshld %xmm0, (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER12-NEXT:    vpshlq %xmm0, (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER12-NEXT:    vpshlw %xmm0, (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [1:1.00]
+;
+; BDVER3-LABEL: test_vpshl:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpshld %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpshlq %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpshlb (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    vpshld (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    vpshlq (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    vpshlw (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    vpshlb %xmm0, (%rdi), %xmm0
+; BDVER3-NEXT:    vpshld %xmm0, (%rdi), %xmm0
+; BDVER3-NEXT:    vpshlq %xmm0, (%rdi), %xmm0
+; BDVER3-NEXT:    vpshlw %xmm0, (%rdi), %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpshl:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpshld %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpshlq %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpshlb (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    vpshld (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    vpshlq (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    vpshlw (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    vpshlb %xmm0, (%rdi), %xmm0
+; BDVER4-NEXT:    vpshld %xmm0, (%rdi), %xmm0
+; BDVER4-NEXT:    vpshlq %xmm0, (%rdi), %xmm0
+; BDVER4-NEXT:    vpshlw %xmm0, (%rdi), %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpshlb $1, $0, $0 \0A\09 vpshld $1, $0, $0 \0A\09 vpshlq $1, $0, $0 \0A\09 vpshlw $1, $0, $0 \0A\09 vpshlb $2, $0, $0 \0A\09 vpshld $2, $0, $0 \0A\09 vpshlq $2, $0, $0 \0A\09 vpshlw $2, $0, $0 \0A\09 vpshlb $0, $2, $0 \0A\09 vpshld $0, $2, $0 \0A\09 vpshlq $0, $2, $0 \0A\09 vpshlw $0, $2, $0", "x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2)
   ret void
 }
diff --git a/test/tools/llvm-mca/X86/BdVer2/add-sequence.s b/test/tools/llvm-mca/X86/BdVer2/add-sequence.s
new file mode 100644
index 00000000000..287095b7fb5
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/add-sequence.s
@@ -0,0 +1,95 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1000 -timeline < %s | FileCheck %s
+
+add %eax, %ecx
+add %esi, %eax
+add %eax, %edx
+
+# CHECK:      Iterations:        1000
+# CHECK-NEXT: Instructions:      3000
+# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total uOps:        3000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    2.99
+# CHECK-NEXT: IPC:               2.99
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     0.33                        addl	%eax, %ecx
+# CHECK-NEXT:  1      1     0.33                        addl	%esi, %eax
+# CHECK-NEXT:  1      1     0.33                        addl	%eax, %edx
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     addl	%eax, %ecx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     addl	%esi, %eax
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     addl	%eax, %edx
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeER .    .  .   addl	%eax, %ecx
+# CHECK-NEXT: [0,1]     DeER .    .  .   addl	%esi, %eax
+# CHECK-NEXT: [0,2]     D=eER.    .  .   addl	%eax, %edx
+# CHECK-NEXT: [1,0]     D=eER.    .  .   addl	%eax, %ecx
+# CHECK-NEXT: [1,1]     .DeER.    .  .   addl	%esi, %eax
+# CHECK-NEXT: [1,2]     .D=eER    .  .   addl	%eax, %edx
+# CHECK-NEXT: [2,0]     .D=eER    .  .   addl	%eax, %ecx
+# CHECK-NEXT: [2,1]     .D=eER    .  .   addl	%esi, %eax
+# CHECK-NEXT: [2,2]     . D=eER   .  .   addl	%eax, %edx
+# CHECK-NEXT: [3,0]     . D=eER   .  .   addl	%eax, %ecx
+# CHECK-NEXT: [3,1]     . D=eER   .  .   addl	%esi, %eax
+# CHECK-NEXT: [3,2]     . D==eER  .  .   addl	%eax, %edx
+# CHECK-NEXT: [4,0]     .  D=eER  .  .   addl	%eax, %ecx
+# CHECK-NEXT: [4,1]     .  D=eER  .  .   addl	%esi, %eax
+# CHECK-NEXT: [4,2]     .  D==eER .  .   addl	%eax, %edx
+# CHECK-NEXT: [5,0]     .  D==eER .  .   addl	%eax, %ecx
+# CHECK-NEXT: [5,1]     .   D=eER .  .   addl	%esi, %eax
+# CHECK-NEXT: [5,2]     .   D==eER.  .   addl	%eax, %edx
+# CHECK-NEXT: [6,0]     .   D==eER.  .   addl	%eax, %ecx
+# CHECK-NEXT: [6,1]     .   D==eER.  .   addl	%esi, %eax
+# CHECK-NEXT: [6,2]     .    D==eER  .   addl	%eax, %edx
+# CHECK-NEXT: [7,0]     .    D==eER  .   addl	%eax, %ecx
+# CHECK-NEXT: [7,1]     .    D==eER  .   addl	%esi, %eax
+# CHECK-NEXT: [7,2]     .    D===eER .   addl	%eax, %edx
+# CHECK-NEXT: [8,0]     .    .D==eER .   addl	%eax, %ecx
+# CHECK-NEXT: [8,1]     .    .D==eER .   addl	%esi, %eax
+# CHECK-NEXT: [8,2]     .    .D===eER.   addl	%eax, %edx
+# CHECK-NEXT: [9,0]     .    .D===eER.   addl	%eax, %ecx
+# CHECK-NEXT: [9,1]     .    . D==eER.   addl	%esi, %eax
+# CHECK-NEXT: [9,2]     .    . D===eER   addl	%eax, %edx
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     10    2.5    0.1    0.0       addl	%eax, %ecx
+# CHECK-NEXT: 1.     10    2.2    0.1    0.0       addl	%esi, %eax
+# CHECK-NEXT: 2.     10    3.0    0.0    0.0       addl	%eax, %edx
diff --git a/test/tools/llvm-mca/X86/BdVer2/clear-super-register-1.s b/test/tools/llvm-mca/X86/BdVer2/clear-super-register-1.s
new file mode 100644
index 00000000000..c8e18731a3e
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/clear-super-register-1.s
@@ -0,0 +1,63 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=100 -resource-pressure=false -timeline -timeline-max-iterations=2 < %s | FileCheck %s
+
+## Sets register RAX.
+imulq $5, %rcx, %rax
+
+## Kills the previous definition of RAX.
+## The upper portion of RAX is cleared.
+lzcnt %ecx, %eax
+
+## The AND can start immediately after the LZCNT.
+## It doesn't need to wait for the IMUL.
+and   %rcx, %rax
+bsf   %rax, %rcx
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      803
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      3     1.00                        imulq	$5, %rcx, %rax
+# CHECK-NEXT:  1      3     1.00                        lzcntl	%ecx, %eax
+# CHECK-NEXT:  1      1     0.33                        andq	%rcx, %rax
+# CHECK-NEXT:  1      3     1.00                        bsfq	%rax, %rcx
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     012345678
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeER    .    .  .   imulq	$5, %rcx, %rax
+# CHECK-NEXT: [0,1]     D=eeeER   .    .  .   lzcntl	%ecx, %eax
+# CHECK-NEXT: [0,2]     D====eER  .    .  .   andq	%rcx, %rax
+# CHECK-NEXT: [0,3]     D=====eeeER    .  .   bsfq	%rax, %rcx
+# CHECK-NEXT: [1,0]     .D=======eeeER .  .   imulq	$5, %rcx, %rax
+# CHECK-NEXT: [1,1]     .D========eeeER.  .   lzcntl	%ecx, %eax
+# CHECK-NEXT: [1,2]     .D===========eER  .   andq	%rcx, %rax
+# CHECK-NEXT: [1,3]     .D============eeeER   bsfq	%rax, %rcx
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     4.5    0.5    0.0       imulq	$5, %rcx, %rax
+# CHECK-NEXT: 1.     2     5.5    1.5    0.0       lzcntl	%ecx, %eax
+# CHECK-NEXT: 2.     2     8.5    0.0    0.0       andq	%rcx, %rax
+# CHECK-NEXT: 3.     2     9.5    0.0    0.0       bsfq	%rax, %rcx
diff --git a/test/tools/llvm-mca/X86/BdVer2/clear-super-register-2.s b/test/tools/llvm-mca/X86/BdVer2/clear-super-register-2.s
new file mode 100644
index 00000000000..99f463c3509
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/clear-super-register-2.s
@@ -0,0 +1,137 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=100 -resource-pressure=false -timeline -timeline-max-iterations=2 < %s | FileCheck %s
+
+# In this test, the VDIVPS takes 38 cycles to write to register YMM3.  The first
+# VADDPS does not depend on the VDIVPS (the WAW dependency is eliminated at
+# register renaming stage). So the first VADDPS can be executed in parallel to
+# the VDIVPS. That VADDPS also writes to register XMM3, and the upper half of
+# YMM3 is implicitly cleared. As a consequence, the definition of YMM3 from the
+# VDIVPS is killed, and the subsequent VADDPS instructions don't need to wait
+# for the VDIVPS to complete.
+# The block reciprocal throughput is limited by the VDIVPS reciprocal throughput
+# (which is 38 cycles). The sequence of VADDPS can be executed in parallel on
+# the FPA unit; their latency is "hidden" by the long latency of the VDIVPS.
+
+vdivps %ymm0, %ymm1, %ymm3
+vaddps %xmm0, %xmm1, %xmm3
+vaddps %ymm3, %ymm1, %ymm4
+vaddps %ymm3, %ymm1, %ymm4
+vaddps %ymm3, %ymm1, %ymm4
+vaddps %ymm3, %ymm1, %ymm4
+vaddps %ymm3, %ymm1, %ymm4
+vaddps %ymm3, %ymm1, %ymm4
+vaddps %ymm3, %ymm1, %ymm4
+vaddps %ymm3, %ymm1, %ymm4
+vaddps %ymm3, %ymm1, %ymm4
+vaddps %ymm3, %ymm1, %ymm4
+vaddps %ymm3, %ymm1, %ymm4
+vaddps %ymm3, %ymm1, %ymm4
+vaddps %ymm3, %ymm1, %ymm4
+vaddps %ymm3, %ymm1, %ymm4
+vaddps %ymm3, %ymm1, %ymm4
+vandps %xmm4, %xmm1, %xmm0
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1800
+# CHECK-NEXT: Total Cycles:      2804
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.71
+# CHECK-NEXT: IPC:               0.64
+# CHECK-NEXT: Block RThroughput: 28.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  3      29    28.00                       vdivps	%ymm0, %ymm1, %ymm3
+# CHECK-NEXT:  1      3     1.00                        vaddps	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  1      1     1.00                        vandps	%xmm4, %xmm1, %xmm0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER   .    .    .    .    .   .   vdivps	%ymm0, %ymm1, %ymm3
+# CHECK-NEXT: [0,1]     DeeeE--------------------------R   .    .    .    .    .   .   vaddps	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: [0,2]     .D==eeeE-----------------------R   .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,3]     .D===eeeE----------------------R   .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,4]     .D====eeeE---------------------R   .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,5]     .D=====eeeE--------------------R   .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,6]     . D=====eeeE-------------------R   .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,7]     . D======eeeE------------------R   .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,8]     . D=======eeeE-----------------R   .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,9]     . D========eeeE----------------R   .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,10]    .  D========eeeE---------------R   .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,11]    .  D=========eeeE--------------R   .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,12]    .  D==========eeeE-------------R   .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,13]    .  D===========eeeE------------R   .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,14]    .   D===========eeeE-----------R   .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,15]    .   D============eeeE----------R   .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,16]    .   D=============eeeE---------R   .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,17]    .   D================eE--------R   .    .    .    .    .   .   vandps	%xmm4, %xmm1, %xmm0
+# CHECK-NEXT: [1,0]     .    D=======================eeeeeeeeeeeeeeeeeeeeeeeeeeeeeER   vdivps	%ymm0, %ymm1, %ymm3
+# CHECK-NEXT: [1,1]     .    D================eeeE---------------------------------R   vaddps	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: [1,2]     .    .D==================eeeE------------------------------R   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [1,3]     .    .D===================eeeE-----------------------------R   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [1,4]     .    .D====================eeeE----------------------------R   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [1,5]     .    .D=====================eeeE---------------------------R   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [1,6]     .    . D=====================eeeE--------------------------R   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [1,7]     .    . D======================eeeE-------------------------R   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [1,8]     .    . D=======================eeeE------------------------R   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [1,9]     .    . D========================eeeE-----------------------R   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [1,10]    .    .  D========================eeeE----------------------R   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [1,11]    .    .  D=========================eeeE---------------------R   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [1,12]    .    .  D==========================eeeE--------------------R   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [1,13]    .    .  D===========================eeeE-------------------R   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [1,14]    .    .   D===========================eeeE------------------R   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [1,15]    .    .   D============================eeeE-----------------R   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [1,16]    .    .   D=============================eeeE----------------R   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [1,17]    .    .   D================================eE---------------R   vandps	%xmm4, %xmm1, %xmm0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     12.5   4.0    0.0       vdivps	%ymm0, %ymm1, %ymm3
+# CHECK-NEXT: 1.     2     9.0    0.5    29.5      vaddps	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 2.     2     11.0   0.0    26.5      vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 3.     2     12.0   1.0    25.5      vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 4.     2     13.0   2.0    24.5      vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 5.     2     14.0   3.0    23.5      vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 6.     2     14.0   4.0    22.5      vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 7.     2     15.0   5.0    21.5      vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 8.     2     16.0   6.0    20.5      vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 9.     2     17.0   7.0    19.5      vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 10.    2     17.0   8.0    18.5      vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 11.    2     18.0   9.0    17.5      vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 12.    2     19.0   10.0   16.5      vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 13.    2     20.0   11.0   15.5      vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 14.    2     20.0   12.0   14.5      vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 15.    2     21.0   13.0   13.5      vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 16.    2     22.0   14.0   12.5      vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 17.    2     25.0   0.0    11.5      vandps	%xmm4, %xmm1, %xmm0
diff --git a/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-cmp.s b/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-cmp.s
new file mode 100644
index 00000000000..c51fb3677dd
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-cmp.s
@@ -0,0 +1,72 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -timeline -timeline-max-iterations=3 -iterations=1500 < %s | FileCheck %s
+
+# Perf stat reports an IPC of 1.97 for this block of code.
+
+# The CMP instruction doesn't depend on the value of EAX.  It can set the flags
+# without having to read the inputs.
+
+cmp %eax, %eax
+cmovae %ebx, %eax
+
+# CHECK:      Iterations:        1500
+# CHECK-NEXT: Instructions:      3000
+# CHECK-NEXT: Total Cycles:      4503
+# CHECK-NEXT: Total uOps:        4500
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.00
+# CHECK-NEXT: IPC:               0.67
+# CHECK-NEXT: Block RThroughput: 0.8
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     0.33                        cmpl	%eax, %eax
+# CHECK-NEXT:  2      2     0.67                        cmovael	%ebx, %eax
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     cmpl	%eax, %eax
+# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     cmovael	%ebx, %eax
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     01
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeER .    ..   cmpl	%eax, %eax
+# CHECK-NEXT: [0,1]     D=eeER    ..   cmovael	%ebx, %eax
+# CHECK-NEXT: [1,0]     D===eER   ..   cmpl	%eax, %eax
+# CHECK-NEXT: [1,1]     .D===eeER ..   cmovael	%ebx, %eax
+# CHECK-NEXT: [2,0]     .D=====eER..   cmpl	%eax, %eax
+# CHECK-NEXT: [2,1]     . D=====eeER   cmovael	%ebx, %eax
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     3.7    0.3    0.0       cmpl	%eax, %eax
+# CHECK-NEXT: 1.     3     4.0    0.0    0.0       cmovael	%ebx, %eax
diff --git a/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-pcmpeq.s b/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-pcmpeq.s
new file mode 100644
index 00000000000..e72ce0c7ba5
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-pcmpeq.s
@@ -0,0 +1,87 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -timeline -timeline-max-iterations=3 -iterations=1500 < %s | FileCheck %s
+
+# perf stat reports an IPC of 2.00 for this block of code.
+
+# All of the vector packed compares from this test are dependency breaking
+# instructions. That means, there is no RAW dependency between any of the
+# instructions, and the code can be fully parallelized in hardware.
+
+vpcmpeqb %xmm0, %xmm0, %xmm1
+vpcmpeqw %xmm1, %xmm1, %xmm2
+vpcmpeqd %xmm2, %xmm2, %xmm3
+vpcmpeqq %xmm3, %xmm3, %xmm0
+
+# CHECK:      Iterations:        1500
+# CHECK-NEXT: Instructions:      6000
+# CHECK-NEXT: Total Cycles:      6003
+# CHECK-NEXT: Total uOps:        6000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.00
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     0.50                        vpcmpeqb	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT:  1      1     0.50                        vpcmpeqw	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpcmpeqd	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT:  1      1     0.50                        vpcmpeqq	%xmm3, %xmm3, %xmm0
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -      -     2.00    -     2.00    -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpcmpeqb	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vpcmpeqw	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpcmpeqd	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vpcmpeqq	%xmm3, %xmm3, %xmm0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     01234
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeER .    .   .   vpcmpeqb	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [0,1]     D=eER.    .   .   vpcmpeqw	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT: [0,2]     D==eER    .   .   vpcmpeqd	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: [0,3]     D===eER   .   .   vpcmpeqq	%xmm3, %xmm3, %xmm0
+# CHECK-NEXT: [1,0]     .D===eER  .   .   vpcmpeqb	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [1,1]     .D====eER .   .   vpcmpeqw	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT: [1,2]     .D=====eER.   .   vpcmpeqd	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: [1,3]     .D======eER   .   vpcmpeqq	%xmm3, %xmm3, %xmm0
+# CHECK-NEXT: [2,0]     . D======eER  .   vpcmpeqb	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [2,1]     . D=======eER .   vpcmpeqw	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT: [2,2]     . D========eER.   vpcmpeqd	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: [2,3]     . D=========eER   vpcmpeqq	%xmm3, %xmm3, %xmm0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     4.0    0.3    0.0       vpcmpeqb	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: 1.     3     5.0    0.0    0.0       vpcmpeqw	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT: 2.     3     6.0    0.0    0.0       vpcmpeqd	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: 3.     3     7.0    0.0    0.0       vpcmpeqq	%xmm3, %xmm3, %xmm0
diff --git a/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-pcmpgt.s b/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-pcmpgt.s
new file mode 100644
index 00000000000..463de0b8123
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-pcmpgt.s
@@ -0,0 +1,87 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -timeline -timeline-max-iterations=3 -iterations=1500 < %s | FileCheck %s
+
+# perf stat reports an IPC of 2.00 for this block of code.
+
+# All of the vector packed compares from this test are zero idioms.  These zero
+# idioms are all detected and removed by the register renamer.  That means, no
+# uOp is executed, and there is no RAW dependency for any of the packed
+# compares.
+
+vpcmpgtb %xmm0, %xmm0, %xmm1
+vpcmpgtw %xmm1, %xmm1, %xmm2
+vpcmpgtd %xmm2, %xmm2, %xmm3
+vpcmpgtq %xmm3, %xmm3, %xmm0
+
+# CHECK:      Iterations:        1500
+# CHECK-NEXT: Instructions:      6000
+# CHECK-NEXT: Total Cycles:      1501
+# CHECK-NEXT: Total uOps:        6000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    4.00
+# CHECK-NEXT: IPC:               4.00
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      0     0.25                        vpcmpgtb	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT:  1      0     0.25                        vpcmpgtw	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT:  1      0     0.25                        vpcmpgtd	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT:  1      0     0.25                        vpcmpgtq	%xmm3, %xmm3, %xmm0
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpcmpgtb	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpcmpgtw	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpcmpgtd	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpcmpgtq	%xmm3, %xmm3, %xmm0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     0123
+
+# CHECK:      [0,0]     DR .   vpcmpgtb	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [0,1]     DR .   vpcmpgtw	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT: [0,2]     DR .   vpcmpgtd	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: [0,3]     DR .   vpcmpgtq	%xmm3, %xmm3, %xmm0
+# CHECK-NEXT: [1,0]     .DR.   vpcmpgtb	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [1,1]     .DR.   vpcmpgtw	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT: [1,2]     .DR.   vpcmpgtd	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: [1,3]     .DR.   vpcmpgtq	%xmm3, %xmm3, %xmm0
+# CHECK-NEXT: [2,0]     . DR   vpcmpgtb	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [2,1]     . DR   vpcmpgtw	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT: [2,2]     . DR   vpcmpgtd	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: [2,3]     . DR   vpcmpgtq	%xmm3, %xmm3, %xmm0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     0.0    0.0    0.0       vpcmpgtb	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: 1.     3     0.0    0.0    0.0       vpcmpgtw	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT: 2.     3     0.0    0.0    0.0       vpcmpgtd	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: 3.     3     0.0    0.0    0.0       vpcmpgtq	%xmm3, %xmm3, %xmm0
diff --git a/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-sbb-1.s b/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-sbb-1.s
new file mode 100644
index 00000000000..68d24f52128
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-sbb-1.s
@@ -0,0 +1,73 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -timeline -timeline-max-iterations=3 -iterations=1500 < %s | FileCheck %s
+
+# perf stat reports an IPC of 1.00 for this code block.
+
+# Although both SBB are dependency breaking instructions, there is still an
+# implicit dependency on EFLAGS which limits the ILP. So, the hardware backend
+# can only execute one instruction per cycle.
+
+sbb %edx, %edx
+sbb %eax, %eax
+
+# CHECK:      Iterations:        1500
+# CHECK-NEXT: Instructions:      3000
+# CHECK-NEXT: Total Cycles:      6003
+# CHECK-NEXT: Total uOps:        6000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.00
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  2      2     0.67                        sbbl	%edx, %edx
+# CHECK-NEXT:  2      2     0.67                        sbbl	%eax, %eax
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     1.33   1.33    -     1.33    -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -     0.67   0.67    -     0.67    -      -     sbbl	%edx, %edx
+# CHECK-NEXT:  -      -     0.67   0.67    -     0.67    -      -     sbbl	%eax, %eax
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     01234
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .   .   sbbl	%edx, %edx
+# CHECK-NEXT: [0,1]     D==eeER   .   .   sbbl	%eax, %eax
+# CHECK-NEXT: [1,0]     .D===eeER .   .   sbbl	%edx, %edx
+# CHECK-NEXT: [1,1]     .D=====eeER   .   sbbl	%eax, %eax
+# CHECK-NEXT: [2,0]     . D======eeER .   sbbl	%edx, %edx
+# CHECK-NEXT: [2,1]     . D========eeER   sbbl	%eax, %eax
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     4.0    0.3    0.0       sbbl	%edx, %edx
+# CHECK-NEXT: 1.     3     6.0    0.0    0.0       sbbl	%eax, %eax
diff --git a/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-sbb-2.s b/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-sbb-2.s
new file mode 100644
index 00000000000..88dd23be8f3
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-sbb-2.s
@@ -0,0 +1,80 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -timeline -timeline-max-iterations=3 -iterations=1500 < %s | FileCheck %s
+
+# perf stat reports a throughput of 1.51 IPC for this block of code.
+
+# The SBB does not depend on the value of register EAX. That means, it doesn't
+# have to wait for the IMUL to write-back on EAX. However, it still depends on
+# the ADD for EFLAGS.
+
+imul %edx, %eax
+add %edx, %edx
+sbb %eax, %eax
+
+# CHECK:      Iterations:        1500
+# CHECK-NEXT: Instructions:      4500
+# CHECK-NEXT: Total Cycles:      7503
+# CHECK-NEXT: Total uOps:        6000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.80
+# CHECK-NEXT: IPC:               0.60
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      3     1.00                        imull	%edx, %eax
+# CHECK-NEXT:  1      1     0.33                        addl	%edx, %edx
+# CHECK-NEXT:  2      2     0.67                        sbbl	%eax, %eax
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     1.33   1.33    -     1.33    -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     imull	%edx, %eax
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.34    -      -     addl	%edx, %edx
+# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     sbbl	%eax, %eax
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     01234567
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeER    .    . .   imull	%edx, %eax
+# CHECK-NEXT: [0,1]     DeE--R    .    . .   addl	%edx, %edx
+# CHECK-NEXT: [0,2]     D===eeER  .    . .   sbbl	%eax, %eax
+# CHECK-NEXT: [1,0]     .D====eeeER    . .   imull	%edx, %eax
+# CHECK-NEXT: [1,1]     .DeE------R    . .   addl	%edx, %edx
+# CHECK-NEXT: [1,2]     .D=======eeER  . .   sbbl	%eax, %eax
+# CHECK-NEXT: [2,0]     . D========eeeER .   imull	%edx, %eax
+# CHECK-NEXT: [2,1]     . DeE----------R .   addl	%edx, %edx
+# CHECK-NEXT: [2,2]     . D===========eeER   sbbl	%eax, %eax
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     5.0    0.3    0.0       imull	%edx, %eax
+# CHECK-NEXT: 1.     3     1.0    0.3    6.0       addl	%edx, %edx
+# CHECK-NEXT: 2.     3     8.0    0.0    0.0       sbbl	%eax, %eax
diff --git a/test/tools/llvm-mca/X86/BdVer2/dependent-pmuld-paddd.s b/test/tools/llvm-mca/X86/BdVer2/dependent-pmuld-paddd.s
new file mode 100644
index 00000000000..bf0f19ad31e
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/dependent-pmuld-paddd.s
@@ -0,0 +1,95 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=500 -timeline < %s | FileCheck %s
+
+vpmuld %xmm0, %xmm0, %xmm1
+vpaddd %xmm1, %xmm1, %xmm0
+vpaddd %xmm0, %xmm0, %xmm3
+
+# CHECK:      Iterations:        500
+# CHECK-NEXT: Instructions:      1500
+# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      5     1.00                        vpmuldq	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT:  1      1     0.50                        vpaddd	%xmm1, %xmm1, %xmm0
+# CHECK-NEXT:  1      1     0.50                        vpaddd	%xmm0, %xmm0, %xmm3
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmuldq	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpaddd	%xmm1, %xmm1, %xmm0
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vpaddd	%xmm0, %xmm0, %xmm3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    .    .    .    .    .    .  .   vpmuldq	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [0,1]     D=====eER .    .    .    .    .    .    .    .    .    .    .  .   vpaddd	%xmm1, %xmm1, %xmm0
+# CHECK-NEXT: [0,2]     D======eER.    .    .    .    .    .    .    .    .    .    .  .   vpaddd	%xmm0, %xmm0, %xmm3
+# CHECK-NEXT: [1,0]     D======eeeeeER .    .    .    .    .    .    .    .    .    .  .   vpmuldq	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [1,1]     .D==========eER.    .    .    .    .    .    .    .    .    .  .   vpaddd	%xmm1, %xmm1, %xmm0
+# CHECK-NEXT: [1,2]     .D===========eER    .    .    .    .    .    .    .    .    .  .   vpaddd	%xmm0, %xmm0, %xmm3
+# CHECK-NEXT: [2,0]     .D===========eeeeeER.    .    .    .    .    .    .    .    .  .   vpmuldq	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [2,1]     .D================eER    .    .    .    .    .    .    .    .  .   vpaddd	%xmm1, %xmm1, %xmm0
+# CHECK-NEXT: [2,2]     . D================eER   .    .    .    .    .    .    .    .  .   vpaddd	%xmm0, %xmm0, %xmm3
+# CHECK-NEXT: [3,0]     . D================eeeeeER    .    .    .    .    .    .    .  .   vpmuldq	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [3,1]     . D=====================eER   .    .    .    .    .    .    .  .   vpaddd	%xmm1, %xmm1, %xmm0
+# CHECK-NEXT: [3,2]     . D======================eER  .    .    .    .    .    .    .  .   vpaddd	%xmm0, %xmm0, %xmm3
+# CHECK-NEXT: [4,0]     .  D=====================eeeeeER   .    .    .    .    .    .  .   vpmuldq	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [4,1]     .  D==========================eER  .    .    .    .    .    .  .   vpaddd	%xmm1, %xmm1, %xmm0
+# CHECK-NEXT: [4,2]     .  D===========================eER .    .    .    .    .    .  .   vpaddd	%xmm0, %xmm0, %xmm3
+# CHECK-NEXT: [5,0]     .  D===========================eeeeeER  .    .    .    .    .  .   vpmuldq	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [5,1]     .   D===============================eER .    .    .    .    .  .   vpaddd	%xmm1, %xmm1, %xmm0
+# CHECK-NEXT: [5,2]     .   D================================eER.    .    .    .    .  .   vpaddd	%xmm0, %xmm0, %xmm3
+# CHECK-NEXT: [6,0]     .   D================================eeeeeER .    .    .    .  .   vpmuldq	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [6,1]     .   D=====================================eER.    .    .    .  .   vpaddd	%xmm1, %xmm1, %xmm0
+# CHECK-NEXT: [6,2]     .    D=====================================eER    .    .    .  .   vpaddd	%xmm0, %xmm0, %xmm3
+# CHECK-NEXT: [7,0]     .    D=====================================eeeeeER.    .    .  .   vpmuldq	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [7,1]     .    D==========================================eER    .    .  .   vpaddd	%xmm1, %xmm1, %xmm0
+# CHECK-NEXT: [7,2]     .    D===========================================eER   .    .  .   vpaddd	%xmm0, %xmm0, %xmm3
+# CHECK-NEXT: [8,0]     .    .D==========================================eeeeeER    .  .   vpmuldq	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [8,1]     .    .D===============================================eER   .  .   vpaddd	%xmm1, %xmm1, %xmm0
+# CHECK-NEXT: [8,2]     .    .D================================================eER  .  .   vpaddd	%xmm0, %xmm0, %xmm3
+# CHECK-NEXT: [9,0]     .    .D================================================eeeeeER .   vpmuldq	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [9,1]     .    . D====================================================eER.   vpaddd	%xmm1, %xmm1, %xmm0
+# CHECK-NEXT: [9,2]     .    . D=====================================================eER   vpaddd	%xmm0, %xmm0, %xmm3
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     10    25.0   0.1    0.0       vpmuldq	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: 1.     10    29.7   0.0    0.0       vpaddd	%xmm1, %xmm1, %xmm0
+# CHECK-NEXT: 2.     10    30.5   0.0    0.0       vpaddd	%xmm0, %xmm0, %xmm3
diff --git a/test/tools/llvm-mca/X86/BdVer2/dot-product.s b/test/tools/llvm-mca/X86/BdVer2/dot-product.s
new file mode 100644
index 00000000000..079872dc2a5
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/dot-product.s
@@ -0,0 +1,74 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=300 -timeline -timeline-max-iterations=3 < %s | FileCheck %s
+
+vmulps   %xmm0, %xmm1, %xmm2
+vhaddps  %xmm2, %xmm2, %xmm3
+vhaddps  %xmm3, %xmm3, %xmm4
+
+# CHECK:      Iterations:        300
+# CHECK-NEXT: Instructions:      900
+# CHECK-NEXT: Total Cycles:      1211
+# CHECK-NEXT: Total uOps:        2100
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.73
+# CHECK-NEXT: IPC:               0.74
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      5     1.00                        vmulps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  3      5     2.00                        vhaddps	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT:  3      5     2.00                        vhaddps	%xmm3, %xmm3, %xmm4
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     1.00   2.00    -     4.00    -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmulps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     2.00    -      -     vhaddps	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -     1.00    -     2.00    -      -     vhaddps	%xmm3, %xmm3, %xmm4
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012
+
+# CHECK:      [0,0]     DeeeeeER  .    .    . .   vmulps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: [0,1]     D=====eeeeeER  .    . .   vhaddps	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: [0,2]     .D==========eeeeeER . .   vhaddps	%xmm3, %xmm3, %xmm4
+# CHECK-NEXT: [1,0]     .DeeeeeE----------R . .   vmulps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: [1,1]     . D=====eeeeeE----R . .   vhaddps	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: [1,2]     .  D==========eeeeeER .   vhaddps	%xmm3, %xmm3, %xmm4
+# CHECK-NEXT: [2,0]     .  DeeeeeE----------R .   vmulps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: [2,1]     .   D=====eeeeeE----R .   vhaddps	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: [2,2]     .    D==========eeeeeER   vhaddps	%xmm3, %xmm3, %xmm4
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     1.0    1.0    6.7       vmulps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 1.     3     6.0    0.7    2.7       vhaddps	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: 2.     3     11.0   1.0    0.0       vhaddps	%xmm3, %xmm3, %xmm4
diff --git a/test/tools/llvm-mca/X86/BdVer2/hadd-read-after-ld-1.s b/test/tools/llvm-mca/X86/BdVer2/hadd-read-after-ld-1.s
new file mode 100644
index 00000000000..e5d5140242d
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/hadd-read-after-ld-1.s
@@ -0,0 +1,44 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s
+
+vshufps $0, %xmm0, %xmm1, %xmm1
+vhaddps (%rdi), %xmm1, %xmm2
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      2
+# CHECK-NEXT: Total Cycles:      15
+# CHECK-NEXT: Total uOps:        5
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.33
+# CHECK-NEXT: IPC:               0.13
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     1.00                        vshufps	$0, %xmm0, %xmm1, %xmm1
+# CHECK-NEXT:  4      11    2.00    *                   vhaddps	(%rdi), %xmm1, %xmm2
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     01234
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeER .    .   .   vshufps	$0, %xmm0, %xmm1, %xmm1
+# CHECK-NEXT: [0,1]     .DeeeeeeeeeeeER   vhaddps	(%rdi), %xmm1, %xmm2
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       vshufps	$0, %xmm0, %xmm1, %xmm1
+# CHECK-NEXT: 1.     1     1.0    1.0    0.0       vhaddps	(%rdi), %xmm1, %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/hadd-read-after-ld-2.s b/test/tools/llvm-mca/X86/BdVer2/hadd-read-after-ld-2.s
new file mode 100644
index 00000000000..08c256596f2
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/hadd-read-after-ld-2.s
@@ -0,0 +1,44 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s
+
+vshufps $0, %xmm0, %xmm1, %xmm1
+vhaddps (%rdi), %ymm1, %ymm2
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      2
+# CHECK-NEXT: Total Cycles:      16
+# CHECK-NEXT: Total uOps:        5
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.31
+# CHECK-NEXT: IPC:               0.13
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     1.00                        vshufps	$0, %xmm0, %xmm1, %xmm1
+# CHECK-NEXT:  4      12    2.00    *                   vhaddps	(%rdi), %ymm1, %ymm2
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     012345
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeER .    .    .   vshufps	$0, %xmm0, %xmm1, %xmm1
+# CHECK-NEXT: [0,1]     .DeeeeeeeeeeeeER   vhaddps	(%rdi), %ymm1, %ymm2
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       vshufps	$0, %xmm0, %xmm1, %xmm1
+# CHECK-NEXT: 1.     1     1.0    1.0    0.0       vhaddps	(%rdi), %ymm1, %ymm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/instruction-info-view.s b/test/tools/llvm-mca/X86/BdVer2/instruction-info-view.s
new file mode 100644
index 00000000000..6a92f84c35d
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/instruction-info-view.s
@@ -0,0 +1,36 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -resource-pressure=false -instruction-info=true < %s | FileCheck %s --check-prefix=ENABLED
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -resource-pressure=false -instruction-info=false < %s | FileCheck %s -check-prefix=DISABLED
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -resource-pressure=false -instruction-info < %s | FileCheck %s -check-prefix=ENABLED
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -resource-pressure=false < %s | FileCheck %s -check-prefix=ENABLED
+
+vmulps   %xmm0, %xmm1, %xmm2
+vhaddps  %xmm2, %xmm2, %xmm3
+vhaddps  %xmm3, %xmm3, %xmm4
+
+# DISABLED-NOT: Instruction Info:
+
+
+# ENABLED:       Iterations:        100
+# ENABLED-NEXT:  Instructions:      300
+# ENABLED-NEXT:  Total Cycles:      414
+# ENABLED-NEXT:  Total uOps:        700
+
+
+# ENABLED:       Dispatch Width:    4
+# ENABLED-NEXT:  uOps Per Cycle:    1.69
+# ENABLED-NEXT:  IPC:               0.72
+# ENABLED-NEXT:  Block RThroughput: 4.0
+
+# ENABLED:       Instruction Info:
+# ENABLED-NEXT:  [1]: #uOps
+# ENABLED-NEXT:  [2]: Latency
+# ENABLED-NEXT:  [3]: RThroughput
+# ENABLED-NEXT:  [4]: MayLoad
+# ENABLED-NEXT:  [5]: MayStore
+# ENABLED-NEXT:  [6]: HasSideEffects (U)
+
+# ENABLED:       [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# ENABLED-NEXT:   1      5     1.00                        vmulps	%xmm0, %xmm1, %xmm2
+# ENABLED-NEXT:   3      5     2.00                        vhaddps	%xmm2, %xmm2, %xmm3
+# ENABLED-NEXT:   3      5     2.00                        vhaddps	%xmm3, %xmm3, %xmm4
diff --git a/test/tools/llvm-mca/X86/BdVer2/load-store-alias.s b/test/tools/llvm-mca/X86/BdVer2/load-store-alias.s
new file mode 100644
index 00000000000..fd123844d38
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/load-store-alias.s
@@ -0,0 +1,93 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=100 -timeline -timeline-max-iterations=1 -noalias=false < %s | FileCheck %s
+
+vmovaps (%rsi), %xmm0
+vmovaps %xmm0, (%rdi)
+vmovaps 16(%rsi), %xmm0
+vmovaps %xmm0, 16(%rdi)
+vmovaps 32(%rsi), %xmm0
+vmovaps %xmm0, 32(%rdi)
+vmovaps 48(%rsi), %xmm0
+vmovaps %xmm0, 48(%rdi)
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      800
+# CHECK-NEXT: Total Cycles:      2803
+# CHECK-NEXT: Total uOps:        800
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.29
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      6     0.50    *                   vmovaps	(%rsi), %xmm0
+# CHECK-NEXT:  1      1     1.00           *            vmovaps	%xmm0, (%rdi)
+# CHECK-NEXT:  1      6     0.50    *                   vmovaps	16(%rsi), %xmm0
+# CHECK-NEXT:  1      1     1.00           *            vmovaps	%xmm0, 16(%rdi)
+# CHECK-NEXT:  1      6     0.50    *                   vmovaps	32(%rsi), %xmm0
+# CHECK-NEXT:  1      1     1.00           *            vmovaps	%xmm0, 32(%rdi)
+# CHECK-NEXT:  1      6     0.50    *                   vmovaps	48(%rsi), %xmm0
+# CHECK-NEXT:  1      1     1.00           *            vmovaps	%xmm0, 48(%rdi)
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -      -      -     4.00    -      -     8.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00   vmovaps	(%rsi), %xmm0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -     1.00   vmovaps	%xmm0, (%rdi)
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00   vmovaps	16(%rsi), %xmm0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -     1.00   vmovaps	%xmm0, 16(%rdi)
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00   vmovaps	32(%rsi), %xmm0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -     1.00   vmovaps	%xmm0, 32(%rdi)
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00   vmovaps	48(%rsi), %xmm0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -     1.00   vmovaps	%xmm0, 48(%rdi)
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .   vmovaps	(%rsi), %xmm0
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .   vmovaps	%xmm0, (%rdi)
+# CHECK-NEXT: [0,2]     D=======eeeeeeER    .    .    .   vmovaps	16(%rsi), %xmm0
+# CHECK-NEXT: [0,3]     D=============eER   .    .    .   vmovaps	%xmm0, 16(%rdi)
+# CHECK-NEXT: [0,4]     .D=============eeeeeeER  .    .   vmovaps	32(%rsi), %xmm0
+# CHECK-NEXT: [0,5]     .D===================eER .    .   vmovaps	%xmm0, 32(%rdi)
+# CHECK-NEXT: [0,6]     .D====================eeeeeeER.   vmovaps	48(%rsi), %xmm0
+# CHECK-NEXT: [0,7]     .D==========================eER   vmovaps	%xmm0, 48(%rdi)
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       vmovaps	(%rsi), %xmm0
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       vmovaps	%xmm0, (%rdi)
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       vmovaps	16(%rsi), %xmm0
+# CHECK-NEXT: 3.     1     14.0   0.0    0.0       vmovaps	%xmm0, 16(%rdi)
+# CHECK-NEXT: 4.     1     14.0   0.0    0.0       vmovaps	32(%rsi), %xmm0
+# CHECK-NEXT: 5.     1     20.0   0.0    0.0       vmovaps	%xmm0, 32(%rdi)
+# CHECK-NEXT: 6.     1     21.0   0.0    0.0       vmovaps	48(%rsi), %xmm0
+# CHECK-NEXT: 7.     1     27.0   0.0    0.0       vmovaps	%xmm0, 48(%rdi)
diff --git a/test/tools/llvm-mca/X86/BdVer2/memcpy-like-test.s b/test/tools/llvm-mca/X86/BdVer2/memcpy-like-test.s
new file mode 100644
index 00000000000..107262f9497
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/memcpy-like-test.s
@@ -0,0 +1,93 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=100 -timeline -timeline-max-iterations=1 < %s | FileCheck %s
+
+vmovaps (%rsi), %xmm0
+vmovaps %xmm0, (%rdi)
+vmovaps 16(%rsi), %xmm0
+vmovaps %xmm0, 16(%rdi)
+vmovaps 32(%rsi), %xmm0
+vmovaps %xmm0, 32(%rdi)
+vmovaps 48(%rsi), %xmm0
+vmovaps %xmm0, 48(%rdi)
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      800
+# CHECK-NEXT: Total Cycles:      409
+# CHECK-NEXT: Total uOps:        800
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.96
+# CHECK-NEXT: IPC:               1.96
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      6     0.50    *                   vmovaps	(%rsi), %xmm0
+# CHECK-NEXT:  1      1     1.00           *            vmovaps	%xmm0, (%rdi)
+# CHECK-NEXT:  1      6     0.50    *                   vmovaps	16(%rsi), %xmm0
+# CHECK-NEXT:  1      1     1.00           *            vmovaps	%xmm0, 16(%rdi)
+# CHECK-NEXT:  1      6     0.50    *                   vmovaps	32(%rsi), %xmm0
+# CHECK-NEXT:  1      1     1.00           *            vmovaps	%xmm0, 32(%rdi)
+# CHECK-NEXT:  1      6     0.50    *                   vmovaps	48(%rsi), %xmm0
+# CHECK-NEXT:  1      1     1.00           *            vmovaps	%xmm0, 48(%rdi)
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -      -      -     4.00    -     3.94   4.06
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -     0.97   0.03   vmovaps	(%rsi), %xmm0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -     1.00   vmovaps	%xmm0, (%rdi)
+# CHECK-NEXT:  -      -      -      -      -      -     0.03   0.97   vmovaps	16(%rsi), %xmm0
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.97   0.03   vmovaps	%xmm0, 16(%rdi)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     vmovaps	32(%rsi), %xmm0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -     1.00   vmovaps	%xmm0, 32(%rdi)
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00   vmovaps	48(%rsi), %xmm0
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.97   0.03   vmovaps	%xmm0, 48(%rdi)
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     012
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeER . .   vmovaps	(%rsi), %xmm0
+# CHECK-NEXT: [0,1]     D======eER. .   vmovaps	%xmm0, (%rdi)
+# CHECK-NEXT: [0,2]     DeeeeeeE-R. .   vmovaps	16(%rsi), %xmm0
+# CHECK-NEXT: [0,3]     D=======eER .   vmovaps	%xmm0, 16(%rdi)
+# CHECK-NEXT: [0,4]     .DeeeeeeE-R .   vmovaps	32(%rsi), %xmm0
+# CHECK-NEXT: [0,5]     .D=======eER.   vmovaps	%xmm0, 32(%rdi)
+# CHECK-NEXT: [0,6]     .DeeeeeeE--R.   vmovaps	48(%rsi), %xmm0
+# CHECK-NEXT: [0,7]     .D========eER   vmovaps	%xmm0, 48(%rdi)
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       vmovaps	(%rsi), %xmm0
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       vmovaps	%xmm0, (%rdi)
+# CHECK-NEXT: 2.     1     1.0    1.0    1.0       vmovaps	16(%rsi), %xmm0
+# CHECK-NEXT: 3.     1     8.0    0.0    0.0       vmovaps	%xmm0, 16(%rdi)
+# CHECK-NEXT: 4.     1     1.0    1.0    1.0       vmovaps	32(%rsi), %xmm0
+# CHECK-NEXT: 5.     1     8.0    0.0    0.0       vmovaps	%xmm0, 32(%rdi)
+# CHECK-NEXT: 6.     1     1.0    1.0    2.0       vmovaps	48(%rsi), %xmm0
+# CHECK-NEXT: 7.     1     9.0    0.0    0.0       vmovaps	%xmm0, 48(%rdi)
diff --git a/test/tools/llvm-mca/X86/BdVer2/one-idioms.s b/test/tools/llvm-mca/X86/BdVer2/one-idioms.s
new file mode 100644
index 00000000000..599f0a01548
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/one-idioms.s
@@ -0,0 +1,142 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -timeline -timeline-max-iterations=1 -register-file-stats < %s | FileCheck %s
+
+# These are dependency-breaking one-idioms.
+# Much like zero-idioms, but they produce ones, and do consume resources.
+
+# perf stats reports a throughput of 2.00 IPC.
+
+pcmpeqb   %mm2, %mm2
+pcmpeqd   %mm2, %mm2
+pcmpeqw   %mm2, %mm2
+
+pcmpeqb   %xmm2, %xmm2
+pcmpeqd   %xmm2, %xmm2
+pcmpeqq   %xmm2, %xmm2
+pcmpeqw   %xmm2, %xmm2
+
+vpcmpeqb  %xmm3, %xmm3, %xmm3
+vpcmpeqd  %xmm3, %xmm3, %xmm3
+vpcmpeqq  %xmm3, %xmm3, %xmm3
+vpcmpeqw  %xmm3, %xmm3, %xmm3
+
+vpcmpeqb  %xmm3, %xmm3, %xmm5
+vpcmpeqd  %xmm3, %xmm3, %xmm5
+vpcmpeqq  %xmm3, %xmm3, %xmm5
+vpcmpeqw  %xmm3, %xmm3, %xmm5
+
+# FIXME: their handling is broken in llvm-mca.
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1500
+# CHECK-NEXT: Total Cycles:      903
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.66
+# CHECK-NEXT: IPC:               1.66
+# CHECK-NEXT: Block RThroughput: 6.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      3     1.00                        pcmpeqb	%mm2, %mm2
+# CHECK-NEXT:  1      3     1.00                        pcmpeqd	%mm2, %mm2
+# CHECK-NEXT:  1      3     1.00                        pcmpeqw	%mm2, %mm2
+# CHECK-NEXT:  1      1     0.50                        pcmpeqb	%xmm2, %xmm2
+# CHECK-NEXT:  1      1     0.50                        pcmpeqd	%xmm2, %xmm2
+# CHECK-NEXT:  1      1     0.50                        pcmpeqq	%xmm2, %xmm2
+# CHECK-NEXT:  1      1     0.50                        pcmpeqw	%xmm2, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpcmpeqb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      1     0.50                        vpcmpeqd	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      1     0.50                        vpcmpeqq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      1     0.50                        vpcmpeqw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      1     0.50                        vpcmpeqb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  1      1     0.50                        vpcmpeqd	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  1      1     0.50                        vpcmpeqq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  1      1     0.50                        vpcmpeqw	%xmm3, %xmm3, %xmm5
+
+# CHECK:      Register File statistics:
+# CHECK-NEXT: Total number of mappings created:    1500
+# CHECK-NEXT: Max number of mappings used:         168
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -      -     7.65    -     7.35    -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pcmpeqb	%mm2, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pcmpeqd	%mm2, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pcmpeqw	%mm2, %mm2
+# CHECK-NEXT:  -      -      -     0.75    -     0.25    -      -     pcmpeqb	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -     0.49    -     0.51    -      -     pcmpeqd	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -     0.64    -     0.36    -      -     pcmpeqq	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -     0.21    -     0.79    -      -     pcmpeqw	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -     0.44    -     0.56    -      -     vpcmpeqb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -     0.26    -     0.74    -      -     vpcmpeqd	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -     0.25    -     0.75    -      -     vpcmpeqq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpcmpeqw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -     0.25    -     0.75    -      -     vpcmpeqb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -     0.55    -     0.45    -      -     vpcmpeqd	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -     0.44    -     0.56    -      -     vpcmpeqq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -     0.37    -     0.63    -      -     vpcmpeqw	%xmm3, %xmm3, %xmm5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     01
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeER    ..   pcmpeqb	%mm2, %mm2
+# CHECK-NEXT: [0,1]     D===eeeER ..   pcmpeqd	%mm2, %mm2
+# CHECK-NEXT: [0,2]     D======eeeER   pcmpeqw	%mm2, %mm2
+# CHECK-NEXT: [0,3]     DeE--------R   pcmpeqb	%xmm2, %xmm2
+# CHECK-NEXT: [0,4]     .DeE-------R   pcmpeqd	%xmm2, %xmm2
+# CHECK-NEXT: [0,5]     .D=eE------R   pcmpeqq	%xmm2, %xmm2
+# CHECK-NEXT: [0,6]     .D==eE-----R   pcmpeqw	%xmm2, %xmm2
+# CHECK-NEXT: [0,7]     .DeE-------R   vpcmpeqb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,8]     . DeE------R   vpcmpeqd	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,9]     . D==eE----R   vpcmpeqq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,10]    . D===eE---R   vpcmpeqw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,11]    . D====eE--R   vpcmpeqb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,12]    .  D====eE-R   vpcmpeqd	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,13]    .  D====eE-R   vpcmpeqq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,14]    .  D=====eER   vpcmpeqw	%xmm3, %xmm3, %xmm5
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       pcmpeqb	%mm2, %mm2
+# CHECK-NEXT: 1.     1     4.0    0.0    0.0       pcmpeqd	%mm2, %mm2
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       pcmpeqw	%mm2, %mm2
+# CHECK-NEXT: 3.     1     1.0    1.0    8.0       pcmpeqb	%xmm2, %xmm2
+# CHECK-NEXT: 4.     1     1.0    0.0    7.0       pcmpeqd	%xmm2, %xmm2
+# CHECK-NEXT: 5.     1     2.0    0.0    6.0       pcmpeqq	%xmm2, %xmm2
+# CHECK-NEXT: 6.     1     3.0    0.0    5.0       pcmpeqw	%xmm2, %xmm2
+# CHECK-NEXT: 7.     1     1.0    1.0    7.0       vpcmpeqb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 8.     1     1.0    0.0    6.0       vpcmpeqd	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 9.     1     3.0    1.0    4.0       vpcmpeqq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 10.    1     4.0    0.0    3.0       vpcmpeqw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 11.    1     5.0    0.0    2.0       vpcmpeqb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 12.    1     5.0    1.0    1.0       vpcmpeqd	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 13.    1     5.0    1.0    1.0       vpcmpeqq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 14.    1     6.0    2.0    0.0       vpcmpeqw	%xmm3, %xmm3, %xmm5
diff --git a/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-2.s b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-2.s
new file mode 100644
index 00000000000..91ecc93c880
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-2.s
@@ -0,0 +1,47 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1 -resource-pressure=false -timeline < %s | FileCheck %s
+
+imul   %rax, %rbx
+lzcnt  %ax,  %bx
+add    %ecx, %ebx
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      3
+# CHECK-NEXT: Total Cycles:      8
+# CHECK-NEXT: Total uOps:        3
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.38
+# CHECK-NEXT: IPC:               0.38
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      3     1.00                        imulq	%rax, %rbx
+# CHECK-NEXT:  1      3     1.00                        lzcntw	%ax, %bx
+# CHECK-NEXT:  1      1     0.33                        addl	%ecx, %ebx
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     01234567
+
+# CHECK:      [0,0]     DeeeER .   imulq	%rax, %rbx
+# CHECK-NEXT: [0,1]     D=eeeER.   lzcntw	%ax, %bx
+# CHECK-NEXT: [0,2]     D====eER   addl	%ecx, %ebx
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       imulq	%rax, %rbx
+# CHECK-NEXT: 1.     1     2.0    2.0    0.0       lzcntw	%ax, %bx
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       addl	%ecx, %ebx
diff --git a/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-3.s b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-3.s
new file mode 100644
index 00000000000..d35a195bf35
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-3.s
@@ -0,0 +1,78 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1500 -timeline -timeline-max-iterations=3 < %s | FileCheck %s
+
+# perf stat reports a throughput of 1.00 IPC for this code snippet.
+
+# The ILP is limited by the false dependency on %dx. So, the mov cannot execute
+# in parallel with the add.
+
+add %cx, %dx
+mov %ax, %dx
+xor %bx, %dx
+
+# CHECK:      Iterations:        1500
+# CHECK-NEXT: Instructions:      4500
+# CHECK-NEXT: Total Cycles:      1504
+# CHECK-NEXT: Total uOps:        4500
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    2.99
+# CHECK-NEXT: IPC:               2.99
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     0.33                        addw	%cx, %dx
+# CHECK-NEXT:  1      1     0.33                        movw	%ax, %dx
+# CHECK-NEXT:  1      1     0.33                        xorw	%bx, %dx
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -     0.67    -      -     0.33    -      -     addw	%cx, %dx
+# CHECK-NEXT:  -      -      -     0.67    -     0.33    -      -     movw	%ax, %dx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     xorw	%bx, %dx
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     0123456
+
+# CHECK:      [0,0]     DeER ..   addw	%cx, %dx
+# CHECK-NEXT: [0,1]     DeER ..   movw	%ax, %dx
+# CHECK-NEXT: [0,2]     D=eER..   xorw	%bx, %dx
+# CHECK-NEXT: [1,0]     D==eER.   addw	%cx, %dx
+# CHECK-NEXT: [1,1]     .DeE-R.   movw	%ax, %dx
+# CHECK-NEXT: [1,2]     .D=eER.   xorw	%bx, %dx
+# CHECK-NEXT: [2,0]     .D==eER   addw	%cx, %dx
+# CHECK-NEXT: [2,1]     .DeE--R   movw	%ax, %dx
+# CHECK-NEXT: [2,2]     . DeE-R   xorw	%bx, %dx
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     2.3    0.3    0.0       addw	%cx, %dx
+# CHECK-NEXT: 1.     3     1.0    1.0    1.0       movw	%ax, %dx
+# CHECK-NEXT: 2.     3     1.7    0.0    0.3       xorw	%bx, %dx
diff --git a/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-4.s b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-4.s
new file mode 100644
index 00000000000..7cd4eb7b6ce
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-4.s
@@ -0,0 +1,79 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1500 -timeline -timeline-max-iterations=3 < %s | FileCheck %s
+
+# perf stat reports a throughput of 0.60 IPC for this code snippet.
+
+# The lzcnt cannot execute in parallel with the imul because there is a false
+# dependency on %bx.
+
+imul %ax, %bx
+lzcnt %ax, %bx
+add %cx, %bx
+
+# CHECK:      Iterations:        1500
+# CHECK-NEXT: Instructions:      4500
+# CHECK-NEXT: Total Cycles:      3005
+# CHECK-NEXT: Total uOps:        4500
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.50
+# CHECK-NEXT: IPC:               1.50
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      3     1.00                        imulw	%ax, %bx
+# CHECK-NEXT:  1      3     1.00                        lzcntw	%ax, %bx
+# CHECK-NEXT:  1      1     0.33                        addw	%cx, %bx
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     0.50   2.00    -     0.50    -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     imulw	%ax, %bx
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     lzcntw	%ax, %bx
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     addw	%cx, %bx
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     01
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeER    ..   imulw	%ax, %bx
+# CHECK-NEXT: [0,1]     D=eeeER   ..   lzcntw	%ax, %bx
+# CHECK-NEXT: [0,2]     D====eER  ..   addw	%cx, %bx
+# CHECK-NEXT: [1,0]     D=====eeeER.   imulw	%ax, %bx
+# CHECK-NEXT: [1,1]     .D=eeeE---R.   lzcntw	%ax, %bx
+# CHECK-NEXT: [1,2]     .D====eE--R.   addw	%cx, %bx
+# CHECK-NEXT: [2,0]     .D=====eeeER   imulw	%ax, %bx
+# CHECK-NEXT: [2,1]     .D==eeeE---R   lzcntw	%ax, %bx
+# CHECK-NEXT: [2,2]     . D====eE--R   addw	%cx, %bx
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     4.3    0.3    0.0       imulw	%ax, %bx
+# CHECK-NEXT: 1.     3     2.3    2.3    2.0       lzcntw	%ax, %bx
+# CHECK-NEXT: 2.     3     5.0    0.0    1.3       addw	%cx, %bx
diff --git a/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-5.s b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-5.s
new file mode 100644
index 00000000000..87098f08642
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-5.s
@@ -0,0 +1,61 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1500 -timeline -timeline-max-iterations=3 < %s | FileCheck %s
+
+# perf stat reports a throughput of 1.00 IPC for this code snippet.
+
+lzcnt %ax, %bx  ## partial register stall.
+
+# CHECK:      Iterations:        1500
+# CHECK-NEXT: Instructions:      1500
+# CHECK-NEXT: Total Cycles:      1505
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.00
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      3     1.00                        lzcntw	%ax, %bx
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     lzcntw	%ax, %bx
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     01234567
+
+# CHECK:      [0,0]     DeeeER .   lzcntw	%ax, %bx
+# CHECK-NEXT: [1,0]     D=eeeER.   lzcntw	%ax, %bx
+# CHECK-NEXT: [2,0]     D==eeeER   lzcntw	%ax, %bx
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     2.0    2.0    0.0       lzcntw	%ax, %bx
diff --git a/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-6.s b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-6.s
new file mode 100644
index 00000000000..465c26c7968
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-6.s
@@ -0,0 +1,80 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1500 -timeline -timeline-max-iterations=3 < %s | FileCheck %s
+
+# perf stat reports a throughput of 0.60 IPC for this code snippet.
+# Each lzcnt has a false dependency on %ecx; the first lzcnt has to wait on the
+# imul. However, the folded load can start immediately.
+# The last lzcnt has a false dependency on %cx. However, even in this case, the
+# folded load can start immediately.
+
+imul %edx, %ecx
+lzcnt (%rsp), %cx
+lzcnt 2(%rsp), %cx
+
+# CHECK:      Iterations:        1500
+# CHECK-NEXT: Instructions:      4500
+# CHECK-NEXT: Total Cycles:      4510
+# CHECK-NEXT: Total uOps:        7500
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.66
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      3     1.00                        imull	%edx, %ecx
+# CHECK-NEXT:  2      8     1.00    *                   lzcntw	(%rsp), %cx
+# CHECK-NEXT:  2      8     1.00    *                   lzcntw	2(%rsp), %cx
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -     2.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     imull	%edx, %ecx
+# CHECK-NEXT:  -      -      -     1.00    -      -      -     1.00   lzcntw	(%rsp), %cx
+# CHECK-NEXT:  -      -      -     1.00    -      -      -     1.00   lzcntw	2(%rsp), %cx
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     012345678
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeER    .    .  .   imull	%edx, %ecx
+# CHECK-NEXT: [0,1]     D=eeeeeeeeER   .  .   lzcntw	(%rsp), %cx
+# CHECK-NEXT: [0,2]     .D=eeeeeeeeER  .  .   lzcntw	2(%rsp), %cx
+# CHECK-NEXT: [1,0]     .D=========eeeER  .   imull	%edx, %ecx
+# CHECK-NEXT: [1,1]     . D=eeeeeeeeE--R  .   lzcntw	(%rsp), %cx
+# CHECK-NEXT: [1,2]     . D==eeeeeeeeE-R  .   lzcntw	2(%rsp), %cx
+# CHECK-NEXT: [2,0]     .  D==========eeeER   imull	%edx, %ecx
+# CHECK-NEXT: [2,1]     .  D==eeeeeeeeE---R   lzcntw	(%rsp), %cx
+# CHECK-NEXT: [2,2]     .   D==eeeeeeeeE--R   lzcntw	2(%rsp), %cx
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     7.3    0.3    0.0       imull	%edx, %ecx
+# CHECK-NEXT: 1.     3     2.3    2.3    1.7       lzcntw	(%rsp), %cx
+# CHECK-NEXT: 2.     3     2.7    2.7    1.0       lzcntw	2(%rsp), %cx
diff --git a/test/tools/llvm-mca/X86/BdVer2/partial-reg-update.s b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update.s
new file mode 100644
index 00000000000..995bb35d3ac
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update.s
@@ -0,0 +1,47 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1 -resource-pressure=false -timeline < %s | FileCheck %s
+
+imul %ax, %cx
+add  %al, %cl
+add  %ecx, %ebx
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      3
+# CHECK-NEXT: Total Cycles:      8
+# CHECK-NEXT: Total uOps:        3
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.38
+# CHECK-NEXT: IPC:               0.38
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      3     1.00                        imulw	%ax, %cx
+# CHECK-NEXT:  1      1     0.33                        addb	%al, %cl
+# CHECK-NEXT:  1      1     0.33                        addl	%ecx, %ebx
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     01234567
+
+# CHECK:      [0,0]     DeeeER .   imulw	%ax, %cx
+# CHECK-NEXT: [0,1]     D===eER.   addb	%al, %cl
+# CHECK-NEXT: [0,2]     D====eER   addl	%ecx, %ebx
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       imulw	%ax, %cx
+# CHECK-NEXT: 1.     1     4.0    0.0    0.0       addb	%al, %cl
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       addl	%ecx, %ebx
diff --git a/test/tools/llvm-mca/X86/BdVer2/pipes-fpu.s b/test/tools/llvm-mca/X86/BdVer2/pipes-fpu.s
new file mode 100644
index 00000000000..9ca1d880673
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/pipes-fpu.s
@@ -0,0 +1,99 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -timeline -timeline-max-iterations=2 < %s | FileCheck %s
+
+# VALU0/VALU1
+vpmulld     %xmm0, %xmm1, %xmm2
+vpand       %xmm0, %xmm1, %xmm2
+
+# VIMUL/STC
+vcvttps2dq  %xmm0, %xmm2
+vpclmulqdq  $0, %xmm0, %xmm1, %xmm2
+
+# FPA/FPM
+vaddps      %xmm0, %xmm1, %xmm2
+vsqrtps     %xmm0, %xmm2
+
+# FPA/FPM YMM
+vaddps      %ymm0, %ymm1, %ymm2
+vsqrtps     %ymm0, %ymm2
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      800
+# CHECK-NEXT: Total Cycles:      4256
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.23
+# CHECK-NEXT: IPC:               0.19
+# CHECK-NEXT: Block RThroughput: 42.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      5     1.00                        vpmulld	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.33                        vpand	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vcvttps2dq	%xmm0, %xmm2
+# CHECK-NEXT:  1      14    6.00                        vpclmulqdq	$0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      14    14.00                       vsqrtps	%xmm0, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  3      29    28.00                       vsqrtps	%ymm0, %ymm2
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -     42.00  6.03   3.96    -     17.01   -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmulld	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.05   0.06    -     0.89    -      -     vpand	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vcvttps2dq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.98   0.90    -     15.12   -      -     vpclmulqdq	$0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -     14.00  1.00    -      -      -      -      -     vsqrtps	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -     28.00  2.00    -      -     1.00    -      -     vsqrtps	%ymm0, %ymm2
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    ..   vpmulld	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: [0,1]     DeE----R  .    .    .    .    .    ..   vpand	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: [0,2]     DeeeE--R  .    .    .    .    .    ..   vcvttps2dq	%xmm0, %xmm2
+# CHECK-NEXT: [0,3]     D=eeeeeeeeeeeeeeER  .    .    .    ..   vpclmulqdq	$0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: [0,4]     .DeeeE-----------R  .    .    .    ..   vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: [0,5]     .DeeeeeeeeeeeeeeER  .    .    .    ..   vsqrtps	%xmm0, %xmm2
+# CHECK-NEXT: [0,6]     .D=eeeE----------R  .    .    .    ..   vaddps	%ymm0, %ymm1, %ymm2
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     1.0    1.0    79.0      vpmulld	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 1.     2     1.0    1.0    82.5      vpand	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 2.     2     1.5    1.5    80.0      vcvttps2dq	%xmm0, %xmm2
+# CHECK-NEXT: 3.     2     1.5    1.5    74.0      vpclmulqdq	$0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 4.     2     2.0    2.0    84.0      vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 5.     2     9.5    9.5    65.0      vsqrtps	%xmm0, %xmm2
+# CHECK-NEXT: 6.     2     2.5    2.5    83.0      vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 7.     2     147.5  147.5  0.0       vsqrtps	%ymm0, %ymm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/pr37790.s b/test/tools/llvm-mca/X86/BdVer2/pr37790.s
new file mode 100644
index 00000000000..2878b280a9c
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/pr37790.s
@@ -0,0 +1,43 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -lqueue=2 -iterations=2 -resource-pressure=false -timeline -timeline-max-cycles=104 < %s | FileCheck %s
+
+int3
+stmxcsr (%rsp)
+
+# CHECK:      Iterations:        2
+# CHECK-NEXT: Instructions:      4
+# CHECK-NEXT: Total Cycles:      213
+# CHECK-NEXT: Total uOps:        10
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.05
+# CHECK-NEXT: IPC:               0.02
+# CHECK-NEXT: Block RThroughput: 1.3
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      100   0.33    *      *      U     int3
+# CHECK-NEXT:  4      5     1.00    *      *      U     stmxcsr	(%rsp)
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789          0123456789          0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789          0123456789          0123456789          012
+
+# CHECK:      [0,0]     DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER   int3
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     3.0    0.5    0.0       int3
+# CHECK-NEXT: 1.     2     100.0  0.0    0.0       stmxcsr	(%rsp)
diff --git a/test/tools/llvm-mca/X86/BdVer2/rank.s b/test/tools/llvm-mca/X86/BdVer2/rank.s
new file mode 100644
index 00000000000..24f8c43676e
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/rank.s
@@ -0,0 +1,109 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -timeline -timeline-max-iterations=3 < %s | FileCheck %s
+
+add %eax, %ecx
+add %eax, %edx
+add %eax, %ebx
+add %edx, %esi
+add %ebx, %eax
+add %edx, %esi
+add %ebx, %eax
+add %ebx, %eax
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      800
+# CHECK-NEXT: Total Cycles:      403
+# CHECK-NEXT: Total uOps:        800
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.99
+# CHECK-NEXT: IPC:               1.99
+# CHECK-NEXT: Block RThroughput: 2.7
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     0.33                        addl	%eax, %ecx
+# CHECK-NEXT:  1      1     0.33                        addl	%eax, %edx
+# CHECK-NEXT:  1      1     0.33                        addl	%eax, %ebx
+# CHECK-NEXT:  1      1     0.33                        addl	%edx, %esi
+# CHECK-NEXT:  1      1     0.33                        addl	%ebx, %eax
+# CHECK-NEXT:  1      1     0.33                        addl	%edx, %esi
+# CHECK-NEXT:  1      1     0.33                        addl	%ebx, %eax
+# CHECK-NEXT:  1      1     0.33                        addl	%ebx, %eax
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     2.66   2.67    -     2.67    -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.34    -      -     addl	%eax, %ecx
+# CHECK-NEXT:  -      -     0.33   0.34    -     0.33    -      -     addl	%eax, %edx
+# CHECK-NEXT:  -      -     0.34   0.33    -     0.33    -      -     addl	%eax, %ebx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.34    -      -     addl	%edx, %esi
+# CHECK-NEXT:  -      -     0.33   0.34    -     0.33    -      -     addl	%ebx, %eax
+# CHECK-NEXT:  -      -     0.34   0.33    -     0.33    -      -     addl	%edx, %esi
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.34    -      -     addl	%ebx, %eax
+# CHECK-NEXT:  -      -     0.33   0.34    -     0.33    -      -     addl	%ebx, %eax
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     01234
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeER .    .   .   addl	%eax, %ecx
+# CHECK-NEXT: [0,1]     DeER .    .   .   addl	%eax, %edx
+# CHECK-NEXT: [0,2]     DeER .    .   .   addl	%eax, %ebx
+# CHECK-NEXT: [0,3]     D=eER.    .   .   addl	%edx, %esi
+# CHECK-NEXT: [0,4]     .DeER.    .   .   addl	%ebx, %eax
+# CHECK-NEXT: [0,5]     .D=eER    .   .   addl	%edx, %esi
+# CHECK-NEXT: [0,6]     .D=eER    .   .   addl	%ebx, %eax
+# CHECK-NEXT: [0,7]     .D==eER   .   .   addl	%ebx, %eax
+# CHECK-NEXT: [1,0]     . D==eER  .   .   addl	%eax, %ecx
+# CHECK-NEXT: [1,1]     . D==eER  .   .   addl	%eax, %edx
+# CHECK-NEXT: [1,2]     . D==eER  .   .   addl	%eax, %ebx
+# CHECK-NEXT: [1,3]     . D===eER .   .   addl	%edx, %esi
+# CHECK-NEXT: [1,4]     .  D==eER .   .   addl	%ebx, %eax
+# CHECK-NEXT: [1,5]     .  D===eER.   .   addl	%edx, %esi
+# CHECK-NEXT: [1,6]     .  D===eER.   .   addl	%ebx, %eax
+# CHECK-NEXT: [1,7]     .  D====eER   .   addl	%ebx, %eax
+# CHECK-NEXT: [2,0]     .   D====eER  .   addl	%eax, %ecx
+# CHECK-NEXT: [2,1]     .   D====eER  .   addl	%eax, %edx
+# CHECK-NEXT: [2,2]     .   D====eER  .   addl	%eax, %ebx
+# CHECK-NEXT: [2,3]     .   D=====eER .   addl	%edx, %esi
+# CHECK-NEXT: [2,4]     .    D====eER .   addl	%ebx, %eax
+# CHECK-NEXT: [2,5]     .    D=====eER.   addl	%edx, %esi
+# CHECK-NEXT: [2,6]     .    D=====eER.   addl	%ebx, %eax
+# CHECK-NEXT: [2,7]     .    D======eER   addl	%ebx, %eax
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     3.0    0.3    0.0       addl	%eax, %ecx
+# CHECK-NEXT: 1.     3     3.0    0.3    0.0       addl	%eax, %edx
+# CHECK-NEXT: 2.     3     3.0    0.3    0.0       addl	%eax, %ebx
+# CHECK-NEXT: 3.     3     4.0    0.0    0.0       addl	%edx, %esi
+# CHECK-NEXT: 4.     3     3.0    0.0    0.0       addl	%ebx, %eax
+# CHECK-NEXT: 5.     3     4.0    0.0    0.0       addl	%edx, %esi
+# CHECK-NEXT: 6.     3     4.0    0.0    0.0       addl	%ebx, %eax
+# CHECK-NEXT: 7.     3     5.0    0.0    0.0       addl	%ebx, %eax
diff --git a/test/tools/llvm-mca/X86/BdVer2/rcu-statistics.s b/test/tools/llvm-mca/X86/BdVer2/rcu-statistics.s
new file mode 100644
index 00000000000..afa5abd1cd2
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/rcu-statistics.s
@@ -0,0 +1,61 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -resource-pressure=false -retire-stats -iterations=1 < %s | FileCheck %s
+
+  vsqrtps %xmm0, %xmm2
+  vaddps  %xmm0, %xmm1, %xmm2
+  vaddps  %xmm0, %xmm1, %xmm2
+  vaddps  %xmm0, %xmm1, %xmm2
+  vaddps  %xmm0, %xmm1, %xmm2
+  vaddps  %xmm0, %xmm1, %xmm2
+  vaddps  %xmm0, %xmm1, %xmm2
+  vaddps  %xmm0, %xmm1, %xmm2
+  vaddps  %xmm0, %xmm1, %xmm2
+  vaddps  %xmm0, %xmm1, %xmm2
+  vaddps  %xmm0, %xmm1, %xmm2
+  vaddps  %xmm0, %xmm1, %xmm2
+  vaddps  %xmm0, %xmm1, %xmm2
+  vaddps  %xmm0, %xmm1, %xmm2
+  vaddps  %xmm0, %xmm1, %xmm2
+  vaddps  %xmm0, %xmm1, %xmm2
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      16
+# CHECK-NEXT: Total Cycles:      20
+# CHECK-NEXT: Total uOps:        16
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.80
+# CHECK-NEXT: IPC:               0.80
+# CHECK-NEXT: Block RThroughput: 15.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      14    14.00                       vsqrtps	%xmm0, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+
+# CHECK:      Retire Control Unit - number of cycles where we saw N instructions retired:
+# CHECK-NEXT: [# retired], [# cycles]
+# CHECK-NEXT:  0,           16  (80.0%)
+# CHECK-NEXT:  1,           3  (15.0%)
+# CHECK-NEXT:  13,          1  (5.0%)
diff --git a/test/tools/llvm-mca/X86/BdVer2/read-advance-1.s b/test/tools/llvm-mca/X86/BdVer2/read-advance-1.s
new file mode 100644
index 00000000000..1c719a84a1b
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/read-advance-1.s
@@ -0,0 +1,48 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s
+
+# The vmul can start executing 3cy in advance. That is beause the first use
+# operand (i.e. %xmm1) is a ReadAfterLd. That means, the memory operand is
+# evaluated before %xmm1.
+
+vaddps  %xmm0, %xmm0, %xmm1
+vmulps  (%rdi), %xmm1, %xmm2
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      2
+# CHECK-NEXT: Total Cycles:      14
+# CHECK-NEXT: Total uOps:        3
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.21
+# CHECK-NEXT: IPC:               0.14
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      3     1.00                        vaddps	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT:  2      11    1.00    *                   vmulps	(%rdi), %xmm1, %xmm2
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeER    .  .   vaddps	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [0,1]     DeeeeeeeeeeeER   vmulps	(%rdi), %xmm1, %xmm2
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       vaddps	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       vmulps	(%rdi), %xmm1, %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/read-advance-2.s b/test/tools/llvm-mca/X86/BdVer2/read-advance-2.s
new file mode 100644
index 00000000000..7814b000ee4
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/read-advance-2.s
@@ -0,0 +1,47 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1 -resource-pressure=0 -timeline < %s | FileCheck %s
+
+  imull  %esi
+  imull  (%rdi)
+
+# The second integer multiply can start at cycle 2 because the implicit reads
+# can start after the load operand is evaluated.
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      2
+# CHECK-NEXT: Total Cycles:      13
+# CHECK-NEXT: Total uOps:        7
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.54
+# CHECK-NEXT: IPC:               0.15
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  3      4     1.00                        imull	%esi
+# CHECK-NEXT:  4      9     1.00    *                   imull	(%rdi)
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     012
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeER   . .   imull	%esi
+# CHECK-NEXT: [0,1]     .DeeeeeeeeeER   imull	(%rdi)
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       imull	%esi
+# CHECK-NEXT: 1.     1     1.0    1.0    0.0       imull	(%rdi)
diff --git a/test/tools/llvm-mca/X86/BdVer2/read-advance-3.s b/test/tools/llvm-mca/X86/BdVer2/read-advance-3.s
new file mode 100644
index 00000000000..638f36c1711
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/read-advance-3.s
@@ -0,0 +1,47 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1 -resource-pressure=0 -timeline -dispatch=3 < %s | FileCheck %s
+
+  add %rdi, %rsi
+  add (%rsp), %rsi
+  add %rdx, %r8
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      3
+# CHECK-NEXT: Total Cycles:      9
+# CHECK-NEXT: Total uOps:        4
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.44
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 1.3
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     0.33                        addq	%rdi, %rsi
+# CHECK-NEXT:  2      6     0.50    *                   addq	(%rsp), %rsi
+# CHECK-NEXT:  1      1     0.33                        addq	%rdx, %r8
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     012345678
+
+# CHECK:      [0,0]     DeER .  .   addq	%rdi, %rsi
+# CHECK-NEXT: [0,1]     DeeeeeeER   addq	(%rsp), %rsi
+# CHECK-NEXT: [0,2]     .DeE----R   addq	%rdx, %r8
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       addq	%rdi, %rsi
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       addq	(%rsp), %rsi
+# CHECK-NEXT: 2.     1     1.0    1.0    4.0       addq	%rdx, %r8
diff --git a/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-1.s b/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-1.s
new file mode 100644
index 00000000000..990cdc4fe31
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-1.s
@@ -0,0 +1,80 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=3 -timeline -register-file-stats < %s | FileCheck %s
+
+# The register move from XMM0 to XMM1 can be eliminated at register renaming
+# stage. So, it should not consume pipeline resources.
+
+vxorps %xmm0, %xmm0, %xmm0
+vmovaps %xmm0, %xmm1
+vaddps %xmm1, %xmm1, %xmm2
+
+# CHECK:      Iterations:        3
+# CHECK-NEXT: Instructions:      9
+# CHECK-NEXT: Total Cycles:      9
+# CHECK-NEXT: Total uOps:        9
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.00
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      0     0.25                        vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT:  1      1     1.00                        vmovaps	%xmm0, %xmm1
+# CHECK-NEXT:  1      3     1.00                        vaddps	%xmm1, %xmm1, %xmm2
+
+# CHECK:      Register File statistics:
+# CHECK-NEXT: Total number of mappings created:    9
+# CHECK-NEXT: Max number of mappings used:         8
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovaps	%xmm0, %xmm1
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddps	%xmm1, %xmm1, %xmm2
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     012345678
+
+# CHECK:      [0,0]     DR   .  .   vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [0,1]     DeER .  .   vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: [0,2]     D=eeeER .   vaddps	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT: [1,0]     D-----R .   vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [1,1]     .DeE--R .   vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: [1,2]     .D=eeeER.   vaddps	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT: [2,0]     .D-----R.   vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [2,1]     .D=eE--R.   vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: [2,2]     . D=eeeER   vaddps	%xmm1, %xmm1, %xmm2
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     0.0    0.0    3.3       vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: 1.     3     1.3    1.3    1.3       vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: 2.     3     2.0    0.0    0.0       vaddps	%xmm1, %xmm1, %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-2.s b/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-2.s
new file mode 100644
index 00000000000..6f22cdc0b7e
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-2.s
@@ -0,0 +1,121 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=3 -timeline -register-file-stats < %s | FileCheck %s
+
+pxor %mm0, %mm0
+movq %mm0, %mm1
+
+xorps %xmm0, %xmm0
+movaps %xmm0, %xmm1
+movups %xmm1, %xmm2
+movapd %xmm2, %xmm3
+movupd %xmm3, %xmm4
+movdqa %xmm4, %xmm5
+movdqu %xmm5, %xmm0
+
+# CHECK:      Iterations:        3
+# CHECK-NEXT: Instructions:      27
+# CHECK-NEXT: Total Cycles:      22
+# CHECK-NEXT: Total uOps:        27
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.23
+# CHECK-NEXT: IPC:               1.23
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     0.33                        pxor	%mm0, %mm0
+# CHECK-NEXT:  1      1     0.50                        movq	%mm0, %mm1
+# CHECK-NEXT:  1      0     0.25                        xorps	%xmm0, %xmm0
+# CHECK-NEXT:  1      1     1.00                        movaps	%xmm0, %xmm1
+# CHECK-NEXT:  1      1     1.00                        movups	%xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        movapd	%xmm2, %xmm3
+# CHECK-NEXT:  1      1     1.00                        movupd	%xmm3, %xmm4
+# CHECK-NEXT:  1      1     0.33                        movdqa	%xmm4, %xmm5
+# CHECK-NEXT:  1      1     0.33                        movdqu	%xmm5, %xmm0
+
+# CHECK:      Register File statistics:
+# CHECK-NEXT: Total number of mappings created:    27
+# CHECK-NEXT: Max number of mappings used:         21
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     1.67   1.67    -     4.67    -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -      -     0.67    -     0.33    -      -     pxor	%mm0, %mm0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     movq	%mm0, %mm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     xorps	%xmm0, %xmm0
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     movaps	%xmm0, %xmm1
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     movups	%xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     movapd	%xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     movupd	%xmm3, %xmm4
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     movdqa	%xmm4, %xmm5
+# CHECK-NEXT:  -      -     0.67    -      -     0.33    -      -     movdqu	%xmm5, %xmm0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01
+
+# CHECK:      [0,0]     DeER .    .    .    ..   pxor	%mm0, %mm0
+# CHECK-NEXT: [0,1]     D=eER.    .    .    ..   movq	%mm0, %mm1
+# CHECK-NEXT: [0,2]     D---R.    .    .    ..   xorps	%xmm0, %xmm0
+# CHECK-NEXT: [0,3]     D=eER.    .    .    ..   movaps	%xmm0, %xmm1
+# CHECK-NEXT: [0,4]     .D=eER    .    .    ..   movups	%xmm1, %xmm2
+# CHECK-NEXT: [0,5]     .D==eER   .    .    ..   movapd	%xmm2, %xmm3
+# CHECK-NEXT: [0,6]     .D===eER  .    .    ..   movupd	%xmm3, %xmm4
+# CHECK-NEXT: [0,7]     .D====eER .    .    ..   movdqa	%xmm4, %xmm5
+# CHECK-NEXT: [0,8]     . D====eER.    .    ..   movdqu	%xmm5, %xmm0
+# CHECK-NEXT: [1,0]     . DeE----R.    .    ..   pxor	%mm0, %mm0
+# CHECK-NEXT: [1,1]     . D=eE---R.    .    ..   movq	%mm0, %mm1
+# CHECK-NEXT: [1,2]     . D=====ER.    .    ..   xorps	%xmm0, %xmm0
+# CHECK-NEXT: [1,3]     .  D====eER    .    ..   movaps	%xmm0, %xmm1
+# CHECK-NEXT: [1,4]     .  D=====eER   .    ..   movups	%xmm1, %xmm2
+# CHECK-NEXT: [1,5]     .  D======eER  .    ..   movapd	%xmm2, %xmm3
+# CHECK-NEXT: [1,6]     .  D=======eER .    ..   movupd	%xmm3, %xmm4
+# CHECK-NEXT: [1,7]     .   D=======eER.    ..   movdqa	%xmm4, %xmm5
+# CHECK-NEXT: [1,8]     .   D========eER    ..   movdqu	%xmm5, %xmm0
+# CHECK-NEXT: [2,0]     .   DeE--------R    ..   pxor	%mm0, %mm0
+# CHECK-NEXT: [2,1]     .   D=eE-------R    ..   movq	%mm0, %mm1
+# CHECK-NEXT: [2,2]     .    D========ER    ..   xorps	%xmm0, %xmm0
+# CHECK-NEXT: [2,3]     .    D========eER   ..   movaps	%xmm0, %xmm1
+# CHECK-NEXT: [2,4]     .    D=========eER  ..   movups	%xmm1, %xmm2
+# CHECK-NEXT: [2,5]     .    D==========eER ..   movapd	%xmm2, %xmm3
+# CHECK-NEXT: [2,6]     .    .D==========eER..   movupd	%xmm3, %xmm4
+# CHECK-NEXT: [2,7]     .    .D===========eER.   movdqa	%xmm4, %xmm5
+# CHECK-NEXT: [2,8]     .    .D============eER   movdqu	%xmm5, %xmm0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     1.0    1.0    4.0       pxor	%mm0, %mm0
+# CHECK-NEXT: 1.     3     2.0    0.0    3.3       movq	%mm0, %mm1
+# CHECK-NEXT: 2.     3     5.0    0.0    1.0       xorps	%xmm0, %xmm0
+# CHECK-NEXT: 3.     3     5.3    0.7    0.0       movaps	%xmm0, %xmm1
+# CHECK-NEXT: 4.     3     6.0    0.0    0.0       movups	%xmm1, %xmm2
+# CHECK-NEXT: 5.     3     7.0    0.0    0.0       movapd	%xmm2, %xmm3
+# CHECK-NEXT: 6.     3     7.7    0.0    0.0       movupd	%xmm3, %xmm4
+# CHECK-NEXT: 7.     3     8.3    0.0    0.0       movdqa	%xmm4, %xmm5
+# CHECK-NEXT: 8.     3     9.0    0.0    0.0       movdqu	%xmm5, %xmm0
diff --git a/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-3.s b/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-3.s
new file mode 100644
index 00000000000..202afac21ec
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-3.s
@@ -0,0 +1,106 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=3 -timeline -register-file-stats < %s | FileCheck %s
+
+vxorps  %xmm0, %xmm0, %xmm0
+vmovaps %xmm0, %xmm1
+vmovups %xmm1, %xmm2
+vmovapd %xmm2, %xmm3
+vmovupd %xmm3, %xmm4
+vmovdqa %xmm4, %xmm5
+vmovdqu %xmm5, %xmm0
+
+# CHECK:      Iterations:        3
+# CHECK-NEXT: Instructions:      21
+# CHECK-NEXT: Total Cycles:      21
+# CHECK-NEXT: Total uOps:        21
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.00
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      0     0.25                        vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT:  1      1     1.00                        vmovaps	%xmm0, %xmm1
+# CHECK-NEXT:  1      1     1.00                        vmovups	%xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vmovapd	%xmm2, %xmm3
+# CHECK-NEXT:  1      1     1.00                        vmovupd	%xmm3, %xmm4
+# CHECK-NEXT:  1      1     0.33                        vmovdqa	%xmm4, %xmm5
+# CHECK-NEXT:  1      1     0.33                        vmovdqu	%xmm5, %xmm0
+
+# CHECK:      Register File statistics:
+# CHECK-NEXT: Total number of mappings created:    21
+# CHECK-NEXT: Max number of mappings used:         17
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     1.00   1.00    -     4.00    -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovaps	%xmm0, %xmm1
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovups	%xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovapd	%xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovupd	%xmm3, %xmm4
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vmovdqa	%xmm4, %xmm5
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmovdqu	%xmm5, %xmm0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
+
+# CHECK:      [0,0]     DR   .    .    .    .   vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [0,1]     DeER .    .    .    .   vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: [0,2]     D=eER.    .    .    .   vmovups	%xmm1, %xmm2
+# CHECK-NEXT: [0,3]     D==eER    .    .    .   vmovapd	%xmm2, %xmm3
+# CHECK-NEXT: [0,4]     .D==eER   .    .    .   vmovupd	%xmm3, %xmm4
+# CHECK-NEXT: [0,5]     .D===eER  .    .    .   vmovdqa	%xmm4, %xmm5
+# CHECK-NEXT: [0,6]     .D====eER .    .    .   vmovdqu	%xmm5, %xmm0
+# CHECK-NEXT: [1,0]     .D=====ER .    .    .   vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [1,1]     . D====eER.    .    .   vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: [1,2]     . D=====eER    .    .   vmovups	%xmm1, %xmm2
+# CHECK-NEXT: [1,3]     . D======eER   .    .   vmovapd	%xmm2, %xmm3
+# CHECK-NEXT: [1,4]     . D=======eER  .    .   vmovupd	%xmm3, %xmm4
+# CHECK-NEXT: [1,5]     .  D=======eER .    .   vmovdqa	%xmm4, %xmm5
+# CHECK-NEXT: [1,6]     .  D========eER.    .   vmovdqu	%xmm5, %xmm0
+# CHECK-NEXT: [2,0]     .  D=========ER.    .   vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [2,1]     .  D=========eER    .   vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: [2,2]     .   D=========eER   .   vmovups	%xmm1, %xmm2
+# CHECK-NEXT: [2,3]     .   D==========eER  .   vmovapd	%xmm2, %xmm3
+# CHECK-NEXT: [2,4]     .   D===========eER .   vmovupd	%xmm3, %xmm4
+# CHECK-NEXT: [2,5]     .   D============eER.   vmovdqa	%xmm4, %xmm5
+# CHECK-NEXT: [2,6]     .    D============eER   vmovdqu	%xmm5, %xmm0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     5.3    0.0    0.0       vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: 1.     3     5.3    0.3    0.0       vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: 2.     3     6.0    0.0    0.0       vmovups	%xmm1, %xmm2
+# CHECK-NEXT: 3.     3     7.0    0.0    0.0       vmovapd	%xmm2, %xmm3
+# CHECK-NEXT: 4.     3     7.7    0.0    0.0       vmovupd	%xmm3, %xmm4
+# CHECK-NEXT: 5.     3     8.3    0.0    0.0       vmovdqa	%xmm4, %xmm5
+# CHECK-NEXT: 6.     3     9.0    0.0    0.0       vmovdqu	%xmm5, %xmm0
diff --git a/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-4.s b/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-4.s
new file mode 100644
index 00000000000..339ec06bcc8
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-4.s
@@ -0,0 +1,92 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=3 -timeline -register-file-stats < %s | FileCheck %s
+
+xor %eax, %eax
+mov %eax, %ebx
+mov %ebx, %ecx
+mov %ecx, %edx
+mov %edx, %eax
+
+# CHECK:      Iterations:        3
+# CHECK-NEXT: Instructions:      15
+# CHECK-NEXT: Total Cycles:      15
+# CHECK-NEXT: Total uOps:        15
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.00
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 1.3
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      0     0.25                        xorl	%eax, %eax
+# CHECK-NEXT:  1      1     0.33                        movl	%eax, %ebx
+# CHECK-NEXT:  1      1     0.33                        movl	%ebx, %ecx
+# CHECK-NEXT:  1      1     0.33                        movl	%ecx, %edx
+# CHECK-NEXT:  1      1     0.33                        movl	%edx, %eax
+
+# CHECK:      Register File statistics:
+# CHECK-NEXT: Total number of mappings created:    18
+# CHECK-NEXT: Max number of mappings used:         15
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     1.33   1.33    -     1.33    -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     xorl	%eax, %eax
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movl	%eax, %ebx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movl	%ebx, %ecx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movl	%ecx, %edx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movl	%edx, %eax
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     01234
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DR   .    .   .   xorl	%eax, %eax
+# CHECK-NEXT: [0,1]     DeER .    .   .   movl	%eax, %ebx
+# CHECK-NEXT: [0,2]     D=eER.    .   .   movl	%ebx, %ecx
+# CHECK-NEXT: [0,3]     D==eER    .   .   movl	%ecx, %edx
+# CHECK-NEXT: [0,4]     .D==eER   .   .   movl	%edx, %eax
+# CHECK-NEXT: [1,0]     .D===ER   .   .   xorl	%eax, %eax
+# CHECK-NEXT: [1,1]     .D===eER  .   .   movl	%eax, %ebx
+# CHECK-NEXT: [1,2]     .D====eER .   .   movl	%ebx, %ecx
+# CHECK-NEXT: [1,3]     . D====eER.   .   movl	%ecx, %edx
+# CHECK-NEXT: [1,4]     . D=====eER   .   movl	%edx, %eax
+# CHECK-NEXT: [2,0]     . D======ER   .   xorl	%eax, %eax
+# CHECK-NEXT: [2,1]     . D======eER  .   movl	%eax, %ebx
+# CHECK-NEXT: [2,2]     .  D======eER .   movl	%ebx, %ecx
+# CHECK-NEXT: [2,3]     .  D=======eER.   movl	%ecx, %edx
+# CHECK-NEXT: [2,4]     .  D========eER   movl	%edx, %eax
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     3.7    0.0    0.0       xorl	%eax, %eax
+# CHECK-NEXT: 1.     3     4.0    0.3    0.0       movl	%eax, %ebx
+# CHECK-NEXT: 2.     3     4.7    0.0    0.0       movl	%ebx, %ecx
+# CHECK-NEXT: 3.     3     5.3    0.0    0.0       movl	%ecx, %edx
+# CHECK-NEXT: 4.     3     6.0    0.0    0.0       movl	%edx, %eax
diff --git a/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-5.s b/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-5.s
new file mode 100644
index 00000000000..66ce02cb0fc
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-5.s
@@ -0,0 +1,92 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=3 -timeline -register-file-stats < %s | FileCheck %s
+
+xor %rax, %rax
+mov %rax, %rbx
+mov %rbx, %rcx
+mov %rcx, %rdx
+mov %rdx, %rax
+
+# CHECK:      Iterations:        3
+# CHECK-NEXT: Instructions:      15
+# CHECK-NEXT: Total Cycles:      15
+# CHECK-NEXT: Total uOps:        15
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.00
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 1.3
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      0     0.25                        xorq	%rax, %rax
+# CHECK-NEXT:  1      1     0.33                        movq	%rax, %rbx
+# CHECK-NEXT:  1      1     0.33                        movq	%rbx, %rcx
+# CHECK-NEXT:  1      1     0.33                        movq	%rcx, %rdx
+# CHECK-NEXT:  1      1     0.33                        movq	%rdx, %rax
+
+# CHECK:      Register File statistics:
+# CHECK-NEXT: Total number of mappings created:    18
+# CHECK-NEXT: Max number of mappings used:         15
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     1.33   1.33    -     1.33    -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     xorq	%rax, %rax
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movq	%rax, %rbx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movq	%rbx, %rcx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movq	%rcx, %rdx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movq	%rdx, %rax
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     01234
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DR   .    .   .   xorq	%rax, %rax
+# CHECK-NEXT: [0,1]     DeER .    .   .   movq	%rax, %rbx
+# CHECK-NEXT: [0,2]     D=eER.    .   .   movq	%rbx, %rcx
+# CHECK-NEXT: [0,3]     D==eER    .   .   movq	%rcx, %rdx
+# CHECK-NEXT: [0,4]     .D==eER   .   .   movq	%rdx, %rax
+# CHECK-NEXT: [1,0]     .D===ER   .   .   xorq	%rax, %rax
+# CHECK-NEXT: [1,1]     .D===eER  .   .   movq	%rax, %rbx
+# CHECK-NEXT: [1,2]     .D====eER .   .   movq	%rbx, %rcx
+# CHECK-NEXT: [1,3]     . D====eER.   .   movq	%rcx, %rdx
+# CHECK-NEXT: [1,4]     . D=====eER   .   movq	%rdx, %rax
+# CHECK-NEXT: [2,0]     . D======ER   .   xorq	%rax, %rax
+# CHECK-NEXT: [2,1]     . D======eER  .   movq	%rax, %rbx
+# CHECK-NEXT: [2,2]     .  D======eER .   movq	%rbx, %rcx
+# CHECK-NEXT: [2,3]     .  D=======eER.   movq	%rcx, %rdx
+# CHECK-NEXT: [2,4]     .  D========eER   movq	%rdx, %rax
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     3.7    0.0    0.0       xorq	%rax, %rax
+# CHECK-NEXT: 1.     3     4.0    0.3    0.0       movq	%rax, %rbx
+# CHECK-NEXT: 2.     3     4.7    0.0    0.0       movq	%rbx, %rcx
+# CHECK-NEXT: 3.     3     5.3    0.0    0.0       movq	%rcx, %rdx
+# CHECK-NEXT: 4.     3     6.0    0.0    0.0       movq	%rdx, %rax
diff --git a/test/tools/llvm-mca/X86/BdVer2/register-files-1.s b/test/tools/llvm-mca/X86/BdVer2/register-files-1.s
new file mode 100644
index 00000000000..d20b50dbec1
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/register-files-1.s
@@ -0,0 +1,77 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=5 -instruction-info=false -dispatch-stats -register-file-stats -timeline < %s | FileCheck %s
+
+vaddps %xmm0, %xmm0, %xmm0
+vmulps %xmm0, %xmm0, %xmm0
+
+# CHECK:      Iterations:        5
+# CHECK-NEXT: Instructions:      10
+# CHECK-NEXT: Total Cycles:      43
+# CHECK-NEXT: Total uOps:        10
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.23
+# CHECK-NEXT: IPC:               0.23
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Dynamic Dispatch Stall Cycles:
+# CHECK-NEXT: RAT     - Register unavailable:                      0
+# CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
+# CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
+# CHECK-NEXT: LQ      - Load queue full:                           0
+# CHECK-NEXT: SQ      - Store queue full:                          0
+# CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
+
+# CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
+# CHECK-NEXT: [# dispatched], [# cycles]
+# CHECK-NEXT:  0,              40  (93.0%)
+# CHECK-NEXT:  2,              1  (2.3%)
+# CHECK-NEXT:  4,              2  (4.7%)
+
+# CHECK:      Register File statistics:
+# CHECK-NEXT: Total number of mappings created:    10
+# CHECK-NEXT: Max number of mappings used:         10
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmulps	%xmm0, %xmm0, %xmm0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          012
+
+# CHECK:      [0,0]     DeeeER    .    .    .    .    .    .    . .   vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .    .    .    . .   vmulps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [1,0]     D========eeeER .    .    .    .    .    . .   vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [1,1]     D===========eeeeeER .    .    .    .    . .   vmulps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [2,0]     .D===============eeeER   .    .    .    . .   vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [2,1]     .D==================eeeeeER   .    .    . .   vmulps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [3,0]     .D=======================eeeER.    .    . .   vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [3,1]     .D==========================eeeeeER.    . .   vmulps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [4,0]     . D==============================eeeER  . .   vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [4,1]     . D=================================eeeeeER   vmulps	%xmm0, %xmm0, %xmm0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     5     16.2   0.2    0.0       vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: 1.     5     19.2   0.0    0.0       vmulps	%xmm0, %xmm0, %xmm0
diff --git a/test/tools/llvm-mca/X86/BdVer2/register-files-2.s b/test/tools/llvm-mca/X86/BdVer2/register-files-2.s
new file mode 100644
index 00000000000..bcf2a08bc02
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/register-files-2.s
@@ -0,0 +1,77 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -register-file-size=5 -iterations=5 -instruction-info=false -dispatch-stats -register-file-stats -timeline < %s | FileCheck %s
+
+vaddps %xmm0, %xmm0, %xmm0
+vmulps %xmm0, %xmm0, %xmm0
+
+# CHECK:      Iterations:        5
+# CHECK-NEXT: Instructions:      10
+# CHECK-NEXT: Total Cycles:      43
+# CHECK-NEXT: Total uOps:        10
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.23
+# CHECK-NEXT: IPC:               0.23
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Dynamic Dispatch Stall Cycles:
+# CHECK-NEXT: RAT     - Register unavailable:                      20  (46.5%)
+# CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
+# CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
+# CHECK-NEXT: LQ      - Load queue full:                           0
+# CHECK-NEXT: SQ      - Store queue full:                          0
+# CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
+
+# CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
+# CHECK-NEXT: [# dispatched], [# cycles]
+# CHECK-NEXT:  0,              36  (83.7%)
+# CHECK-NEXT:  1,              6  (14.0%)
+# CHECK-NEXT:  4,              1  (2.3%)
+
+# CHECK:      Register File statistics:
+# CHECK-NEXT: Total number of mappings created:    10
+# CHECK-NEXT: Max number of mappings used:         5
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmulps	%xmm0, %xmm0, %xmm0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          012
+
+# CHECK:      [0,0]     DeeeER    .    .    .    .    .    .    . .   vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .    .    .    . .   vmulps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [1,0]     D========eeeER .    .    .    .    .    . .   vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [1,1]     D===========eeeeeER .    .    .    .    . .   vmulps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [2,0]     .D===============eeeER   .    .    .    . .   vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [2,1]     .    D==============eeeeeER   .    .    . .   vmulps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [3,0]     .    .    D==============eeeER.    .    . .   vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [3,1]     .    .    .  D==============eeeeeER.    . .   vmulps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [4,0]     .    .    .    .  D==============eeeER  . .   vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [4,1]     .    .    .    .    .D==============eeeeeER   vmulps	%xmm0, %xmm0, %xmm0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     5     11.2   0.2    0.0       vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: 1.     5     12.2   0.0    0.0       vmulps	%xmm0, %xmm0, %xmm0
diff --git a/test/tools/llvm-mca/X86/BdVer2/register-files-3.s b/test/tools/llvm-mca/X86/BdVer2/register-files-3.s
new file mode 100644
index 00000000000..0be7dd3978e
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/register-files-3.s
@@ -0,0 +1,76 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -register-file-size=5 -iterations=2 -dispatch-stats -register-file-stats -timeline < %s | FileCheck %s
+
+idiv %eax
+
+# CHECK:      Iterations:        2
+# CHECK-NEXT: Instructions:      2
+# CHECK-NEXT: Total Cycles:      55
+# CHECK-NEXT: Total uOps:        2
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.04
+# CHECK-NEXT: IPC:               0.04
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      25    10.00                 U     idivl	%eax
+
+# CHECK:      Dynamic Dispatch Stall Cycles:
+# CHECK-NEXT: RAT     - Register unavailable:                      27  (49.1%)
+# CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
+# CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
+# CHECK-NEXT: LQ      - Load queue full:                           0
+# CHECK-NEXT: SQ      - Store queue full:                          0
+# CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
+
+# CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
+# CHECK-NEXT: [# dispatched], [# cycles]
+# CHECK-NEXT:  0,              53  (96.4%)
+# CHECK-NEXT:  1,              2  (3.6%)
+
+# CHECK:      Register File statistics:
+# CHECK-NEXT: Total number of mappings created:    6
+# CHECK-NEXT: Max number of mappings used:         3
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT: 10.00   -     1.00    -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT: 10.00   -     1.00    -      -      -      -      -     idivl	%eax
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789          01234
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeeeeeeeeeeeeeeeeeeER  .    .    .    .    .   .   idivl	%eax
+# CHECK-NEXT: [1,0]     .    .    .    .    .    . DeeeeeeeeeeeeeeeeeeeeeeeeeER   idivl	%eax
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     1.0    1.0    0.0       idivl	%eax
diff --git a/test/tools/llvm-mca/X86/BdVer2/register-files-4.s b/test/tools/llvm-mca/X86/BdVer2/register-files-4.s
new file mode 100644
index 00000000000..8ad203d0151
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/register-files-4.s
@@ -0,0 +1,60 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=22 -dispatch-stats -register-file-stats -resource-pressure=false -timeline -timeline-max-iterations=3 < %s | FileCheck %s
+
+idiv %eax
+
+# CHECK:      Iterations:        22
+# CHECK-NEXT: Instructions:      22
+# CHECK-NEXT: Total Cycles:      553
+# CHECK-NEXT: Total uOps:        22
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.04
+# CHECK-NEXT: IPC:               0.04
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      25    10.00                 U     idivl	%eax
+
+# CHECK:      Dynamic Dispatch Stall Cycles:
+# CHECK-NEXT: RAT     - Register unavailable:                      0
+# CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
+# CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
+# CHECK-NEXT: LQ      - Load queue full:                           0
+# CHECK-NEXT: SQ      - Store queue full:                          0
+# CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
+
+# CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
+# CHECK-NEXT: [# dispatched], [# cycles]
+# CHECK-NEXT:  0,              547  (98.9%)
+# CHECK-NEXT:  2,              1  (0.2%)
+# CHECK-NEXT:  4,              5  (0.9%)
+
+# CHECK:      Register File statistics:
+# CHECK-NEXT: Total number of mappings created:    66
+# CHECK-NEXT: Max number of mappings used:         66
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789          0123456789          01234567
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeeeeeeeeeeeeeeeeeeER  .    .    .    .    .    .    .    .    .    . .   idivl	%eax
+# CHECK-NEXT: [1,0]     D=========================eeeeeeeeeeeeeeeeeeeeeeeeeER  .    .    .    .    . .   idivl	%eax
+# CHECK-NEXT: [2,0]     D==================================================eeeeeeeeeeeeeeeeeeeeeeeeeER   idivl	%eax
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     26.0   0.3    0.0       idivl	%eax
diff --git a/test/tools/llvm-mca/X86/BdVer2/register-files-5.s b/test/tools/llvm-mca/X86/BdVer2/register-files-5.s
new file mode 100644
index 00000000000..31696730fd2
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/register-files-5.s
@@ -0,0 +1,143 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1 -resource-pressure=false -instruction-info=false -dispatch-stats -register-file-stats -timeline < %s | FileCheck %s
+
+  vdivps %ymm0, %ymm0, %ymm1
+  vaddps %ymm0, %ymm0, %ymm2
+  vaddps %ymm0, %ymm0, %ymm3
+  vaddps %ymm0, %ymm0, %ymm4
+  vaddps %ymm0, %ymm0, %ymm5
+  vaddps %ymm0, %ymm0, %ymm6
+  vaddps %ymm0, %ymm0, %ymm7
+  vaddps %ymm0, %ymm0, %ymm8
+  vaddps %ymm0, %ymm0, %ymm9
+  vaddps %ymm0, %ymm0, %ymm10
+  vaddps %ymm0, %ymm0, %ymm11
+  vaddps %ymm0, %ymm0, %ymm12
+  vaddps %ymm0, %ymm0, %ymm13
+  vaddps %ymm0, %ymm0, %ymm14
+  vaddps %ymm0, %ymm0, %ymm15
+  vaddps %ymm2, %ymm0, %ymm0
+  vaddps %ymm2, %ymm0, %ymm3
+  vaddps %ymm2, %ymm0, %ymm4
+  vaddps %ymm2, %ymm0, %ymm5
+  vaddps %ymm2, %ymm0, %ymm6
+  vaddps %ymm2, %ymm0, %ymm7
+  vaddps %ymm2, %ymm0, %ymm8
+  vaddps %ymm2, %ymm0, %ymm9
+  vaddps %ymm2, %ymm0, %ymm10
+  vaddps %ymm2, %ymm0, %ymm11
+  vaddps %ymm2, %ymm0, %ymm12
+  vaddps %ymm2, %ymm0, %ymm13
+  vaddps %ymm2, %ymm0, %ymm14
+  vaddps %ymm2, %ymm0, %ymm15
+  vaddps %ymm3, %ymm0, %ymm2
+  vaddps %ymm3, %ymm0, %ymm4
+  vaddps %ymm3, %ymm0, %ymm5
+  vaddps %ymm3, %ymm0, %ymm6
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      33
+# CHECK-NEXT: Total Cycles:      37
+# CHECK-NEXT: Total uOps:        35
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.95
+# CHECK-NEXT: IPC:               0.89
+# CHECK-NEXT: Block RThroughput: 32.0
+
+# CHECK:      Dynamic Dispatch Stall Cycles:
+# CHECK-NEXT: RAT     - Register unavailable:                      0
+# CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
+# CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
+# CHECK-NEXT: LQ      - Load queue full:                           0
+# CHECK-NEXT: SQ      - Store queue full:                          0
+# CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
+
+# CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
+# CHECK-NEXT: [# dispatched], [# cycles]
+# CHECK-NEXT:  0,              28  (75.7%)
+# CHECK-NEXT:  3,              1  (2.7%)
+# CHECK-NEXT:  4,              8  (21.6%)
+
+# CHECK:      Register File statistics:
+# CHECK-NEXT: Total number of mappings created:    33
+# CHECK-NEXT: Max number of mappings used:         33
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER   ..   vdivps	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT: [0,1]     DeeeE--------------------------R   ..   vaddps	%ymm0, %ymm0, %ymm2
+# CHECK-NEXT: [0,2]     .DeeeE-------------------------R   ..   vaddps	%ymm0, %ymm0, %ymm3
+# CHECK-NEXT: [0,3]     .D=eeeE------------------------R   ..   vaddps	%ymm0, %ymm0, %ymm4
+# CHECK-NEXT: [0,4]     .D==eeeE-----------------------R   ..   vaddps	%ymm0, %ymm0, %ymm5
+# CHECK-NEXT: [0,5]     .D===eeeE----------------------R   ..   vaddps	%ymm0, %ymm0, %ymm6
+# CHECK-NEXT: [0,6]     . D===eeeE---------------------R   ..   vaddps	%ymm0, %ymm0, %ymm7
+# CHECK-NEXT: [0,7]     . D=====eeeE-------------------R   ..   vaddps	%ymm0, %ymm0, %ymm8
+# CHECK-NEXT: [0,8]     . D======eeeE------------------R   ..   vaddps	%ymm0, %ymm0, %ymm9
+# CHECK-NEXT: [0,9]     . D=======eeeE-----------------R   ..   vaddps	%ymm0, %ymm0, %ymm10
+# CHECK-NEXT: [0,10]    .  D=======eeeE----------------R   ..   vaddps	%ymm0, %ymm0, %ymm11
+# CHECK-NEXT: [0,11]    .  D========eeeE---------------R   ..   vaddps	%ymm0, %ymm0, %ymm12
+# CHECK-NEXT: [0,12]    .  D=========eeeE--------------R   ..   vaddps	%ymm0, %ymm0, %ymm13
+# CHECK-NEXT: [0,13]    .  D===========eeeE------------R   ..   vaddps	%ymm0, %ymm0, %ymm14
+# CHECK-NEXT: [0,14]    .   D===========eeeE-----------R   ..   vaddps	%ymm0, %ymm0, %ymm15
+# CHECK-NEXT: [0,15]    .   D==eeeE--------------------R   ..   vaddps	%ymm2, %ymm0, %ymm0
+# CHECK-NEXT: [0,16]    .   D=========eeeE-------------R   ..   vaddps	%ymm2, %ymm0, %ymm3
+# CHECK-NEXT: [0,17]    .   D============eeeE----------R   ..   vaddps	%ymm2, %ymm0, %ymm4
+# CHECK-NEXT: [0,18]    .    D============eeeE---------R   ..   vaddps	%ymm2, %ymm0, %ymm5
+# CHECK-NEXT: [0,19]    .    D=============eeeE--------R   ..   vaddps	%ymm2, %ymm0, %ymm6
+# CHECK-NEXT: [0,20]    .    D==============eeeE-------R   ..   vaddps	%ymm2, %ymm0, %ymm7
+# CHECK-NEXT: [0,21]    .    D===============eeeE------R   ..   vaddps	%ymm2, %ymm0, %ymm8
+# CHECK-NEXT: [0,22]    .    .D===============eeeE-----R   ..   vaddps	%ymm2, %ymm0, %ymm9
+# CHECK-NEXT: [0,23]    .    .D================eeeE----R   ..   vaddps	%ymm2, %ymm0, %ymm10
+# CHECK-NEXT: [0,24]    .    .D=================eeeE---R   ..   vaddps	%ymm2, %ymm0, %ymm11
+# CHECK-NEXT: [0,25]    .    .D==================eeeE--R   ..   vaddps	%ymm2, %ymm0, %ymm12
+# CHECK-NEXT: [0,26]    .    . D==================eeeE-R   ..   vaddps	%ymm2, %ymm0, %ymm13
+# CHECK-NEXT: [0,27]    .    . D===================eeeER   ..   vaddps	%ymm2, %ymm0, %ymm14
+# CHECK-NEXT: [0,28]    .    . D====================eeeER  ..   vaddps	%ymm2, %ymm0, %ymm15
+# CHECK-NEXT: [0,29]    .    . D=====================eeeER ..   vaddps	%ymm3, %ymm0, %ymm2
+# CHECK-NEXT: [0,30]    .    .  D=====================eeeER..   vaddps	%ymm3, %ymm0, %ymm4
+# CHECK-NEXT: [0,31]    .    .  D======================eeeER.   vaddps	%ymm3, %ymm0, %ymm5
+# CHECK-NEXT: [0,32]    .    .  D=======================eeeER   vaddps	%ymm3, %ymm0, %ymm6
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       vdivps	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT: 1.     1     1.0    1.0    26.0      vaddps	%ymm0, %ymm0, %ymm2
+# CHECK-NEXT: 2.     1     1.0    1.0    25.0      vaddps	%ymm0, %ymm0, %ymm3
+# CHECK-NEXT: 3.     1     2.0    2.0    24.0      vaddps	%ymm0, %ymm0, %ymm4
+# CHECK-NEXT: 4.     1     3.0    3.0    23.0      vaddps	%ymm0, %ymm0, %ymm5
+# CHECK-NEXT: 5.     1     4.0    4.0    22.0      vaddps	%ymm0, %ymm0, %ymm6
+# CHECK-NEXT: 6.     1     4.0    4.0    21.0      vaddps	%ymm0, %ymm0, %ymm7
+# CHECK-NEXT: 7.     1     6.0    6.0    19.0      vaddps	%ymm0, %ymm0, %ymm8
+# CHECK-NEXT: 8.     1     7.0    7.0    18.0      vaddps	%ymm0, %ymm0, %ymm9
+# CHECK-NEXT: 9.     1     8.0    8.0    17.0      vaddps	%ymm0, %ymm0, %ymm10
+# CHECK-NEXT: 10.    1     8.0    8.0    16.0      vaddps	%ymm0, %ymm0, %ymm11
+# CHECK-NEXT: 11.    1     9.0    9.0    15.0      vaddps	%ymm0, %ymm0, %ymm12
+# CHECK-NEXT: 12.    1     10.0   10.0   14.0      vaddps	%ymm0, %ymm0, %ymm13
+# CHECK-NEXT: 13.    1     12.0   12.0   12.0      vaddps	%ymm0, %ymm0, %ymm14
+# CHECK-NEXT: 14.    1     12.0   12.0   11.0      vaddps	%ymm0, %ymm0, %ymm15
+# CHECK-NEXT: 15.    1     3.0    3.0    20.0      vaddps	%ymm2, %ymm0, %ymm0
+# CHECK-NEXT: 16.    1     10.0   4.0    13.0      vaddps	%ymm2, %ymm0, %ymm3
+# CHECK-NEXT: 17.    1     13.0   7.0    10.0      vaddps	%ymm2, %ymm0, %ymm4
+# CHECK-NEXT: 18.    1     13.0   8.0    9.0       vaddps	%ymm2, %ymm0, %ymm5
+# CHECK-NEXT: 19.    1     14.0   9.0    8.0       vaddps	%ymm2, %ymm0, %ymm6
+# CHECK-NEXT: 20.    1     15.0   10.0   7.0       vaddps	%ymm2, %ymm0, %ymm7
+# CHECK-NEXT: 21.    1     16.0   11.0   6.0       vaddps	%ymm2, %ymm0, %ymm8
+# CHECK-NEXT: 22.    1     16.0   12.0   5.0       vaddps	%ymm2, %ymm0, %ymm9
+# CHECK-NEXT: 23.    1     17.0   13.0   4.0       vaddps	%ymm2, %ymm0, %ymm10
+# CHECK-NEXT: 24.    1     18.0   14.0   3.0       vaddps	%ymm2, %ymm0, %ymm11
+# CHECK-NEXT: 25.    1     19.0   15.0   2.0       vaddps	%ymm2, %ymm0, %ymm12
+# CHECK-NEXT: 26.    1     19.0   16.0   1.0       vaddps	%ymm2, %ymm0, %ymm13
+# CHECK-NEXT: 27.    1     20.0   17.0   0.0       vaddps	%ymm2, %ymm0, %ymm14
+# CHECK-NEXT: 28.    1     21.0   18.0   0.0       vaddps	%ymm2, %ymm0, %ymm15
+# CHECK-NEXT: 29.    1     22.0   12.0   0.0       vaddps	%ymm3, %ymm0, %ymm2
+# CHECK-NEXT: 30.    1     22.0   13.0   0.0       vaddps	%ymm3, %ymm0, %ymm4
+# CHECK-NEXT: 31.    1     23.0   14.0   0.0       vaddps	%ymm3, %ymm0, %ymm5
+# CHECK-NEXT: 32.    1     24.0   15.0   0.0       vaddps	%ymm3, %ymm0, %ymm6
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-3dnow.s b/test/tools/llvm-mca/X86/BdVer2/resources-3dnow.s
new file mode 100644
index 00000000000..52a0968d1fd
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-3dnow.s
@@ -0,0 +1,208 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+
+femms
+
+pavgusb     %mm0, %mm2
+pavgusb     (%rax), %mm2
+
+pf2id       %mm0, %mm2
+pf2id       (%rax), %mm2
+
+pf2iw       %mm0, %mm2
+pf2iw       (%rax), %mm2
+
+pfacc       %mm0, %mm2
+pfacc       (%rax), %mm2
+
+pfadd       %mm0, %mm2
+pfadd       (%rax), %mm2
+
+pfcmpeq     %mm0, %mm2
+pfcmpeq     (%rax), %mm2
+
+pfcmpge     %mm0, %mm2
+pfcmpge     (%rax), %mm2
+
+pfcmpgt     %mm0, %mm2
+pfcmpgt     (%rax), %mm2
+
+pfmax       %mm0, %mm2
+pfmax       (%rax), %mm2
+
+pfmin       %mm0, %mm2
+pfmin       (%rax), %mm2
+
+pfmul       %mm0, %mm2
+pfmul       (%rax), %mm2
+
+pfnacc      %mm0, %mm2
+pfnacc      (%rax), %mm2
+
+pfpnacc     %mm0, %mm2
+pfpnacc     (%rax), %mm2
+
+pfrcp       %mm0, %mm2
+pfrcp       (%rax), %mm2
+
+pfrcpit1    %mm0, %mm2
+pfrcpit1    (%rax), %mm2
+
+pfrcpit2    %mm0, %mm2
+pfrcpit2    (%rax), %mm2
+
+pfrsqit1    %mm0, %mm2
+pfrsqit1    (%rax), %mm2
+
+pfrsqrt     %mm0, %mm2
+pfrsqrt     (%rax), %mm2
+
+pfsub       %mm0, %mm2
+pfsub       (%rax), %mm2
+
+pfsubr      %mm0, %mm2
+pfsubr      (%rax), %mm2
+
+pi2fd       %mm0, %mm2
+pi2fd       (%rax), %mm2
+
+pi2fw       %mm0, %mm2
+pi2fw       (%rax), %mm2
+
+pmulhrw     %mm0, %mm2
+pmulhrw     (%rax), %mm2
+
+prefetch    (%rax)
+prefetchw   (%rax)
+
+pswapd      %mm0, %mm2
+pswapd      (%rax), %mm2
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  31     31    10.33   *      *      U     femms
+# CHECK-NEXT:  1      3     1.00                        pavgusb	%mm0, %mm2
+# CHECK-NEXT:  2      8     1.00    *                   pavgusb	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        pf2id	%mm0, %mm2
+# CHECK-NEXT:  2      9     1.00    *                   pf2id	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        pf2iw	%mm0, %mm2
+# CHECK-NEXT:  2      9     1.00    *                   pf2iw	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        pfacc	%mm0, %mm2
+# CHECK-NEXT:  2      9     1.00    *                   pfacc	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        pfadd	%mm0, %mm2
+# CHECK-NEXT:  2      9     1.00    *                   pfadd	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        pfcmpeq	%mm0, %mm2
+# CHECK-NEXT:  2      9     1.00    *                   pfcmpeq	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        pfcmpge	%mm0, %mm2
+# CHECK-NEXT:  2      9     1.00    *                   pfcmpge	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        pfcmpgt	%mm0, %mm2
+# CHECK-NEXT:  2      9     1.00    *                   pfcmpgt	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        pfmax	%mm0, %mm2
+# CHECK-NEXT:  2      9     1.00    *                   pfmax	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        pfmin	%mm0, %mm2
+# CHECK-NEXT:  2      9     1.00    *                   pfmin	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        pfmul	%mm0, %mm2
+# CHECK-NEXT:  2      9     1.00    *                   pfmul	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        pfnacc	%mm0, %mm2
+# CHECK-NEXT:  2      9     1.00    *                   pfnacc	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        pfpnacc	%mm0, %mm2
+# CHECK-NEXT:  2      9     1.00    *                   pfpnacc	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        pfrcp	%mm0, %mm2
+# CHECK-NEXT:  2      9     1.00    *                   pfrcp	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        pfrcpit1	%mm0, %mm2
+# CHECK-NEXT:  2      9     1.00    *                   pfrcpit1	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        pfrcpit2	%mm0, %mm2
+# CHECK-NEXT:  2      9     1.00    *                   pfrcpit2	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        pfrsqit1	%mm0, %mm2
+# CHECK-NEXT:  2      9     1.00    *                   pfrsqit1	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        pfrsqrt	%mm0, %mm2
+# CHECK-NEXT:  2      9     1.00    *                   pfrsqrt	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        pfsub	%mm0, %mm2
+# CHECK-NEXT:  2      9     1.00    *                   pfsub	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        pfsubr	%mm0, %mm2
+# CHECK-NEXT:  2      9     1.00    *                   pfsubr	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        pi2fd	%mm0, %mm2
+# CHECK-NEXT:  2      9     1.00    *                   pi2fd	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        pi2fw	%mm0, %mm2
+# CHECK-NEXT:  2      9     1.00    *                   pi2fw	(%rax), %mm2
+# CHECK-NEXT:  1      5     1.00                        pmulhrw	%mm0, %mm2
+# CHECK-NEXT:  2      10    1.00    *                   pmulhrw	(%rax), %mm2
+# CHECK-NEXT:  1      5     0.50    *      *            prefetch	(%rax)
+# CHECK-NEXT:  1      5     0.50    *      *            prefetchw	(%rax)
+# CHECK-NEXT:  1      1     1.00                        pswapd	%mm0, %mm2
+# CHECK-NEXT:  2      6     1.00    *                   pswapd	(%rax), %mm2
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     12.33  54.33   -     12.33  13.00  13.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -     10.33  10.33   -     10.33   -      -     femms
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pavgusb	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pavgusb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pf2id	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pf2id	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pf2iw	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pf2iw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pfacc	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pfacc	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pfadd	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pfadd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pfcmpeq	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pfcmpeq	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pfcmpge	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pfcmpge	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pfcmpgt	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pfcmpgt	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pfmax	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pfmax	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pfmin	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pfmin	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pfmul	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pfmul	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pfnacc	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pfnacc	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pfpnacc	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pfpnacc	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pfrcp	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pfrcp	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pfrcpit1	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pfrcpit1	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pfrcpit2	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pfrcpit2	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pfrsqit1	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pfrsqit1	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pfrsqrt	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pfrsqrt	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pfsub	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pfsub	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pfsubr	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pfsubr	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pi2fd	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pi2fd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pi2fw	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pi2fw	(%rax), %mm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pmulhrw	%mm0, %mm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   pmulhrw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   prefetch	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   prefetchw	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     pswapd	%mm0, %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   pswapd	(%rax), %mm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-adx.s b/test/tools/llvm-mca/X86/BdVer2/resources-adx.s
new file mode 100644
index 00000000000..25f08545e4a
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-adx.s
@@ -0,0 +1,55 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+
+adcx        %ebx, %ecx
+adcx        (%rbx), %ecx
+adcx        %rbx, %rcx
+adcx        (%rbx), %rcx
+
+adox        %ebx, %ecx
+adox        (%rbx), %ecx
+adox        %rbx, %rcx
+adox        (%rbx), %rcx
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  2      2     0.67                        adcxl	%ebx, %ecx
+# CHECK-NEXT:  3      7     0.67    *                   adcxl	(%rbx), %ecx
+# CHECK-NEXT:  2      2     0.67                        adcxq	%rbx, %rcx
+# CHECK-NEXT:  3      7     0.67    *                   adcxq	(%rbx), %rcx
+# CHECK-NEXT:  2      2     0.67                        adoxl	%ebx, %ecx
+# CHECK-NEXT:  3      7     0.67    *                   adoxl	(%rbx), %ecx
+# CHECK-NEXT:  2      2     0.67                        adoxq	%rbx, %rcx
+# CHECK-NEXT:  3      7     0.67    *                   adoxq	(%rbx), %rcx
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     6.67   2.67    -     6.67   2.00   2.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     adcxl	%ebx, %ecx
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   adcxl	(%rbx), %ecx
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     adcxq	%rbx, %rcx
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   adcxq	(%rbx), %rcx
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     adoxl	%ebx, %ecx
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   adoxl	(%rbx), %ecx
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     adoxq	%rbx, %rcx
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   adoxq	(%rbx), %rcx
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-aes.s b/test/tools/llvm-mca/X86/BdVer2/resources-aes.s
new file mode 100644
index 00000000000..5720f208ecb
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-aes.s
@@ -0,0 +1,71 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+
+aesdec          %xmm0, %xmm2
+aesdec          (%rax), %xmm2
+
+aesdeclast      %xmm0, %xmm2
+aesdeclast      (%rax), %xmm2
+
+aesenc          %xmm0, %xmm2
+aesenc          (%rax), %xmm2
+
+aesenclast      %xmm0, %xmm2
+aesenclast      (%rax), %xmm2
+
+aesimc          %xmm0, %xmm2
+aesimc          (%rax), %xmm2
+
+aeskeygenassist $22, %xmm0, %xmm2
+aeskeygenassist $22, (%rax), %xmm2
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  2      7     1.00                        aesdec	%xmm0, %xmm2
+# CHECK-NEXT:  3      13    1.00    *                   aesdec	(%rax), %xmm2
+# CHECK-NEXT:  2      7     1.00                        aesdeclast	%xmm0, %xmm2
+# CHECK-NEXT:  3      13    1.00    *                   aesdeclast	(%rax), %xmm2
+# CHECK-NEXT:  2      7     1.00                        aesenc	%xmm0, %xmm2
+# CHECK-NEXT:  3      13    1.00    *                   aesenc	(%rax), %xmm2
+# CHECK-NEXT:  2      7     1.00                        aesenclast	%xmm0, %xmm2
+# CHECK-NEXT:  3      13    1.00    *                   aesenclast	(%rax), %xmm2
+# CHECK-NEXT:  2      12    2.00                        aesimc	%xmm0, %xmm2
+# CHECK-NEXT:  3      18    2.00    *                   aesimc	(%rax), %xmm2
+# CHECK-NEXT:  1      8     3.67                        aeskeygenassist	$22, %xmm0, %xmm2
+# CHECK-NEXT:  1      8     3.33    *                   aeskeygenassist	$22, (%rax), %xmm2
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     9.67   9.67    -     21.67  3.00   3.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -     0.33   0.33    -     1.33    -      -     aesdec	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     0.33   0.33    -     1.33   0.50   0.50   aesdec	(%rax), %xmm2
+# CHECK-NEXT:  -      -     0.33   0.33    -     1.33    -      -     aesdeclast	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     0.33   0.33    -     1.33   0.50   0.50   aesdeclast	(%rax), %xmm2
+# CHECK-NEXT:  -      -     0.33   0.33    -     1.33    -      -     aesenc	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     0.33   0.33    -     1.33   0.50   0.50   aesenc	(%rax), %xmm2
+# CHECK-NEXT:  -      -     0.33   0.33    -     1.33    -      -     aesenclast	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     0.33   0.33    -     1.33   0.50   0.50   aesenclast	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     2.00    -      -     aesimc	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     2.00   0.50   0.50   aesimc	(%rax), %xmm2
+# CHECK-NEXT:  -      -     3.67   3.67    -     3.67    -      -     aeskeygenassist	$22, %xmm0, %xmm2
+# CHECK-NEXT:  -      -     3.33   3.33    -     3.33   0.50   0.50   aeskeygenassist	$22, (%rax), %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-avx1.s b/test/tools/llvm-mca/X86/BdVer2/resources-avx1.s
new file mode 100644
index 00000000000..f0bf9e27294
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-avx1.s
@@ -0,0 +1,2431 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+
+vaddpd            %xmm0, %xmm1, %xmm2
+vaddpd            (%rax), %xmm1, %xmm2
+
+vaddpd            %ymm0, %ymm1, %ymm2
+vaddpd            (%rax), %ymm1, %ymm2
+
+vaddps            %xmm0, %xmm1, %xmm2
+vaddps            (%rax), %xmm1, %xmm2
+
+vaddps            %ymm0, %ymm1, %ymm2
+vaddps            (%rax), %ymm1, %ymm2
+
+vaddsd            %xmm0, %xmm1, %xmm2
+vaddsd            (%rax), %xmm1, %xmm2
+
+vaddss            %xmm0, %xmm1, %xmm2
+vaddss            (%rax), %xmm1, %xmm2
+
+vaddsubpd         %xmm0, %xmm1, %xmm2
+vaddsubpd         (%rax), %xmm1, %xmm2
+
+vaddsubpd         %ymm0, %ymm1, %ymm2
+vaddsubpd         (%rax), %ymm1, %ymm2
+
+vaddsubps         %xmm0, %xmm1, %xmm2
+vaddsubps         (%rax), %xmm1, %xmm2
+
+vaddsubps         %ymm0, %ymm1, %ymm2
+vaddsubps         (%rax), %ymm1, %ymm2
+
+vaesdec           %xmm0, %xmm1, %xmm2
+vaesdec           (%rax), %xmm1, %xmm2
+
+vaesdeclast       %xmm0, %xmm1, %xmm2
+vaesdeclast       (%rax), %xmm1, %xmm2
+
+vaesenc           %xmm0, %xmm1, %xmm2
+vaesenc           (%rax), %xmm1, %xmm2
+
+vaesenclast       %xmm0, %xmm1, %xmm2
+vaesenclast       (%rax), %xmm1, %xmm2
+
+vaesimc           %xmm0, %xmm2
+vaesimc           (%rax), %xmm2
+
+vaeskeygenassist  $22, %xmm0, %xmm2
+vaeskeygenassist  $22, (%rax), %xmm2
+
+vandnpd           %xmm0, %xmm1, %xmm2
+vandnpd           (%rax), %xmm1, %xmm2
+
+vandnpd           %ymm0, %ymm1, %ymm2
+vandnpd           (%rax), %ymm1, %ymm2
+
+vandnps           %xmm0, %xmm1, %xmm2
+vandnps           (%rax), %xmm1, %xmm2
+
+vandnps           %ymm0, %ymm1, %ymm2
+vandnps           (%rax), %ymm1, %ymm2
+
+vandpd            %xmm0, %xmm1, %xmm2
+vandpd            (%rax), %xmm1, %xmm2
+
+vandpd            %ymm0, %ymm1, %ymm2
+vandpd            (%rax), %ymm1, %ymm2
+
+vandps            %xmm0, %xmm1, %xmm2
+vandps            (%rax), %xmm1, %xmm2
+
+vandps            %ymm0, %ymm1, %ymm2
+vandps            (%rax), %ymm1, %ymm2
+
+vblendpd          $11, %xmm0, %xmm1, %xmm2
+vblendpd          $11, (%rax), %xmm1, %xmm2
+
+vblendpd          $11, %ymm0, %ymm1, %ymm2
+vblendpd          $11, (%rax), %ymm1, %ymm2
+
+vblendps          $11, %xmm0, %xmm1, %xmm2
+vblendps          $11, (%rax), %xmm1, %xmm2
+
+vblendps          $11, %ymm0, %ymm1, %ymm2
+vblendps          $11, (%rax), %ymm1, %ymm2
+
+vblendvpd         %xmm3, %xmm0, %xmm1, %xmm2
+vblendvpd         %xmm3, (%rax), %xmm1, %xmm2
+
+vblendvpd         %ymm3, %ymm0, %ymm1, %ymm2
+vblendvpd         %ymm3, (%rax), %ymm1, %ymm2
+
+vblendvps         %xmm3, %xmm0, %xmm1, %xmm2
+vblendvps         %xmm3, (%rax), %xmm1, %xmm2
+
+vblendvps         %ymm3, %ymm0, %ymm1, %ymm2
+vblendvps         %ymm3, (%rax), %ymm1, %ymm2
+
+vbroadcastf128    (%rax), %ymm2
+
+vbroadcastsd      (%rax), %ymm2
+
+vbroadcastss      (%rax), %xmm2
+vbroadcastss      (%rax), %ymm2
+
+vcmppd            $0, %xmm0, %xmm1, %xmm2
+vcmppd            $0, (%rax), %xmm1, %xmm2
+
+vcmppd            $0, %ymm0, %ymm1, %ymm2
+vcmppd            $0, (%rax), %ymm1, %ymm2
+
+vcmpps            $0, %xmm0, %xmm1, %xmm2
+vcmpps            $0, (%rax), %xmm1, %xmm2
+
+vcmpps            $0, %ymm0, %ymm1, %ymm2
+vcmpps            $0, (%rax), %ymm1, %ymm2
+
+vcmpsd            $0, %xmm0, %xmm1, %xmm2
+vcmpsd            $0, (%rax), %xmm1, %xmm2
+
+vcmpss            $0, %xmm0, %xmm1, %xmm2
+vcmpss            $0, (%rax), %xmm1, %xmm2
+
+vcomisd           %xmm0, %xmm1
+vcomisd           (%rax), %xmm1
+
+vcomiss           %xmm0, %xmm1
+vcomiss           (%rax), %xmm1
+
+vcvtdq2pd         %xmm0, %xmm2
+vcvtdq2pd         (%rax), %xmm2
+
+vcvtdq2pd         %xmm0, %ymm2
+vcvtdq2pd         (%rax), %ymm2
+
+vcvtdq2ps         %xmm0, %xmm2
+vcvtdq2ps         (%rax), %xmm2
+
+vcvtdq2ps         %ymm0, %ymm2
+vcvtdq2ps         (%rax), %ymm2
+
+vcvtpd2dqx        %xmm0, %xmm2
+vcvtpd2dqx        (%rax), %xmm2
+
+vcvtpd2dqy        %ymm0, %xmm2
+vcvtpd2dqy        (%rax), %xmm2
+
+vcvtpd2psx        %xmm0, %xmm2
+vcvtpd2psx        (%rax), %xmm2
+
+vcvtpd2psy        %ymm0, %xmm2
+vcvtpd2psy        (%rax), %xmm2
+
+vcvtps2dq         %xmm0, %xmm2
+vcvtps2dq         (%rax), %xmm2
+
+vcvtps2dq         %ymm0, %ymm2
+vcvtps2dq         (%rax), %ymm2
+
+vcvtps2pd         %xmm0, %xmm2
+vcvtps2pd         (%rax), %xmm2
+
+vcvtps2pd         %xmm0, %ymm2
+vcvtps2pd         (%rax), %ymm2
+
+vcvtsd2si         %xmm0, %ecx
+vcvtsd2si         %xmm0, %rcx
+vcvtsd2si         (%rax), %ecx
+vcvtsd2si         (%rax), %rcx
+
+vcvtsd2ss         %xmm0, %xmm1, %xmm2
+vcvtsd2ss         (%rax), %xmm1, %xmm2
+
+vcvtsi2sdl        %ecx, %xmm0, %xmm2
+vcvtsi2sdq        %rcx, %xmm0, %xmm2
+vcvtsi2sdl        (%rax), %xmm0, %xmm2
+vcvtsi2sdq        (%rax), %xmm0, %xmm2
+
+vcvtsi2ssl        %ecx, %xmm0, %xmm2
+vcvtsi2ssq        %rcx, %xmm0, %xmm2
+vcvtsi2ssl        (%rax), %xmm0, %xmm2
+vcvtsi2ssq        (%rax), %xmm0, %xmm2
+
+vcvtss2sd         %xmm0, %xmm1, %xmm2
+vcvtss2sd         (%rax), %xmm1, %xmm2
+
+vcvtss2si         %xmm0, %ecx
+vcvtss2si         %xmm0, %rcx
+vcvtss2si         (%rax), %ecx
+vcvtss2si         (%rax), %rcx
+
+vcvttpd2dqx       %xmm0, %xmm2
+vcvttpd2dqx       (%rax), %xmm2
+
+vcvttpd2dqy       %ymm0, %xmm2
+vcvttpd2dqy       (%rax), %xmm2
+
+vcvttps2dq        %xmm0, %xmm2
+vcvttps2dq        (%rax), %xmm2
+
+vcvttps2dq        %ymm0, %ymm2
+vcvttps2dq        (%rax), %ymm2
+
+vcvttsd2si        %xmm0, %ecx
+vcvttsd2si        %xmm0, %rcx
+vcvttsd2si        (%rax), %ecx
+vcvttsd2si        (%rax), %rcx
+
+vcvttss2si        %xmm0, %ecx
+vcvttss2si        %xmm0, %rcx
+vcvttss2si        (%rax), %ecx
+vcvttss2si        (%rax), %rcx
+
+vdivpd            %xmm0, %xmm1, %xmm2
+vdivpd            (%rax), %xmm1, %xmm2
+
+vdivpd            %ymm0, %ymm1, %ymm2
+vdivpd            (%rax), %ymm1, %ymm2
+
+vdivps            %xmm0, %xmm1, %xmm2
+vdivps            (%rax), %xmm1, %xmm2
+
+vdivps            %ymm0, %ymm1, %ymm2
+vdivps            (%rax), %ymm1, %ymm2
+
+vdivsd            %xmm0, %xmm1, %xmm2
+vdivsd            (%rax), %xmm1, %xmm2
+
+vdivss            %xmm0, %xmm1, %xmm2
+vdivss            (%rax), %xmm1, %xmm2
+
+vdppd             $22, %xmm0, %xmm1, %xmm2
+vdppd             $22, (%rax), %xmm1, %xmm2
+
+vdpps             $22, %xmm0, %xmm1, %xmm2
+vdpps             $22, (%rax), %xmm1, %xmm2
+
+vdpps             $22, %ymm0, %ymm1, %ymm2
+vdpps             $22, (%rax), %ymm1, %ymm2
+
+vextractf128      $1, %ymm0, %xmm2
+vextractf128      $1, %ymm0, (%rax)
+
+vextractps        $1, %xmm0, %rcx
+vextractps        $1, %xmm0, (%rax)
+
+vhaddpd           %xmm0, %xmm1, %xmm2
+vhaddpd           (%rax), %xmm1, %xmm2
+
+vhaddpd           %ymm0, %ymm1, %ymm2
+vhaddpd           (%rax), %ymm1, %ymm2
+
+vhaddps           %xmm0, %xmm1, %xmm2
+vhaddps           (%rax), %xmm1, %xmm2
+
+vhaddps           %ymm0, %ymm1, %ymm2
+vhaddps           (%rax), %ymm1, %ymm2
+
+vhsubpd           %xmm0, %xmm1, %xmm2
+vhsubpd           (%rax), %xmm1, %xmm2
+
+vhsubpd           %ymm0, %ymm1, %ymm2
+vhsubpd           (%rax), %ymm1, %ymm2
+
+vhsubps           %xmm0, %xmm1, %xmm2
+vhsubps           (%rax), %xmm1, %xmm2
+
+vhsubps           %ymm0, %ymm1, %ymm2
+vhsubps           (%rax), %ymm1, %ymm2
+
+vinsertf128       $1, %xmm0, %ymm1, %ymm2
+vinsertf128       $1, (%rax), %ymm1, %ymm2
+
+vinsertps         $1, %xmm0, %xmm1, %xmm2
+vinsertps         $1, (%rax), %xmm1, %xmm2
+
+vlddqu            (%rax), %xmm2
+vlddqu            (%rax), %ymm2
+
+vldmxcsr          (%rax)
+
+vmaskmovdqu       %xmm0, %xmm1
+
+vmaskmovpd        (%rax), %xmm0, %xmm2
+vmaskmovpd        (%rax), %ymm0, %ymm2
+
+vmaskmovpd        %xmm0, %xmm1, (%rax)
+vmaskmovpd        %ymm0, %ymm1, (%rax)
+
+vmaskmovps        (%rax), %xmm0, %xmm2
+vmaskmovps        (%rax), %ymm0, %ymm2
+
+vmaskmovps        %xmm0, %xmm1, (%rax)
+vmaskmovps        %ymm0, %ymm1, (%rax)
+
+vmaxpd            %xmm0, %xmm1, %xmm2
+vmaxpd            (%rax), %xmm1, %xmm2
+
+vmaxpd            %ymm0, %ymm1, %ymm2
+vmaxpd            (%rax), %ymm1, %ymm2
+
+vmaxps            %xmm0, %xmm1, %xmm2
+vmaxps            (%rax), %xmm1, %xmm2
+
+vmaxps            %ymm0, %ymm1, %ymm2
+vmaxps            (%rax), %ymm1, %ymm2
+
+vmaxsd            %xmm0, %xmm1, %xmm2
+vmaxsd            (%rax), %xmm1, %xmm2
+
+vmaxss            %xmm0, %xmm1, %xmm2
+vmaxss            (%rax), %xmm1, %xmm2
+
+vminpd            %xmm0, %xmm1, %xmm2
+vminpd            (%rax), %xmm1, %xmm2
+
+vminpd            %ymm0, %ymm1, %ymm2
+vminpd            (%rax), %ymm1, %ymm2
+
+vminps            %xmm0, %xmm1, %xmm2
+vminps            (%rax), %xmm1, %xmm2
+
+vminps            %ymm0, %ymm1, %ymm2
+vminps            (%rax), %ymm1, %ymm2
+
+vminsd            %xmm0, %xmm1, %xmm2
+vminsd            (%rax), %xmm1, %xmm2
+
+vminss            %xmm0, %xmm1, %xmm2
+vminss            (%rax), %xmm1, %xmm2
+
+vmovapd           %xmm0, %xmm2
+vmovapd           %xmm0, (%rax)
+vmovapd           (%rax), %xmm2
+
+vmovapd           %ymm0, %ymm2
+vmovapd           %ymm0, (%rax)
+vmovapd           (%rax), %ymm2
+
+vmovaps           %xmm0, %xmm2
+vmovaps           %xmm0, (%rax)
+vmovaps           (%rax), %xmm2
+
+vmovaps           %ymm0, %ymm2
+vmovaps           %ymm0, (%rax)
+vmovaps           (%rax), %ymm2
+
+vmovd             %eax, %xmm2
+vmovd             (%rax), %xmm2
+
+vmovd             %xmm0, %ecx
+vmovd             %xmm0, (%rax)
+
+vmovddup          %xmm0, %xmm2
+vmovddup          (%rax), %xmm2
+
+vmovddup          %ymm0, %ymm2
+vmovddup          (%rax), %ymm2
+
+vmovdqa           %xmm0, %xmm2
+vmovdqa           %xmm0, (%rax)
+vmovdqa           (%rax), %xmm2
+
+vmovdqa           %ymm0, %ymm2
+vmovdqa           %ymm0, (%rax)
+vmovdqa           (%rax), %ymm2
+
+vmovdqu           %xmm0, %xmm2
+vmovdqu           %xmm0, (%rax)
+vmovdqu           (%rax), %xmm2
+
+vmovdqu           %ymm0, %ymm2
+vmovdqu           %ymm0, (%rax)
+vmovdqu           (%rax), %ymm2
+
+vmovhlps          %xmm0, %xmm1, %xmm2
+vmovlhps          %xmm0, %xmm1, %xmm2
+
+vmovhpd           %xmm0, (%rax)
+vmovhpd           (%rax), %xmm1, %xmm2
+
+vmovhps           %xmm0, (%rax)
+vmovhps           (%rax), %xmm1, %xmm2
+
+vmovlpd           %xmm0, (%rax)
+vmovlpd           (%rax), %xmm1, %xmm2
+
+vmovlps           %xmm0, (%rax)
+vmovlps           (%rax), %xmm1, %xmm2
+
+vmovmskpd         %xmm0, %rcx
+vmovmskpd         %ymm0, %rcx
+
+vmovmskps         %xmm0, %rcx
+vmovmskps         %ymm0, %rcx
+
+vmovntdq          %xmm0, (%rax)
+vmovntdq          %ymm0, (%rax)
+
+vmovntdqa         (%rax), %xmm2
+vmovntdqa         (%rax), %ymm2
+
+vmovntpd          %xmm0, (%rax)
+vmovntpd          %ymm0, (%rax)
+
+vmovntps          %xmm0, (%rax)
+vmovntps          %ymm0, (%rax)
+
+vmovq             %xmm0, %xmm2
+
+vmovq             %rax, %xmm2
+vmovq             (%rax), %xmm2
+
+vmovq             %xmm0, %rcx
+vmovq             %xmm0, (%rax)
+
+vmovsd            %xmm0, %xmm1, %xmm2
+vmovsd            %xmm0, (%rax)
+vmovsd            (%rax), %xmm2
+
+vmovshdup         %xmm0, %xmm2
+vmovshdup         (%rax), %xmm2
+
+vmovshdup         %ymm0, %ymm2
+vmovshdup         (%rax), %ymm2
+
+vmovsldup         %xmm0, %xmm2
+vmovsldup         (%rax), %xmm2
+
+vmovsldup         %ymm0, %ymm2
+vmovsldup         (%rax), %ymm2
+
+vmovss            %xmm0, %xmm1, %xmm2
+vmovss            %xmm0, (%rax)
+vmovss            (%rax), %xmm2
+
+vmovupd           %xmm0, %xmm2
+vmovupd           %xmm0, (%rax)
+vmovupd           (%rax), %xmm2
+
+vmovupd           %ymm0, %ymm2
+vmovupd           %ymm0, (%rax)
+vmovupd           (%rax), %ymm2
+
+vmovups           %xmm0, %xmm2
+vmovups           %xmm0, (%rax)
+vmovups           (%rax), %xmm2
+
+vmovups           %ymm0, %ymm2
+vmovups           %ymm0, (%rax)
+vmovups           (%rax), %ymm2
+
+vmpsadbw          $1, %xmm0, %xmm1, %xmm2
+vmpsadbw          $1, (%rax), %xmm1, %xmm2
+
+vmulpd            %xmm0, %xmm1, %xmm2
+vmulpd            (%rax), %xmm1, %xmm2
+
+vmulpd            %ymm0, %ymm1, %ymm2
+vmulpd            (%rax), %ymm1, %ymm2
+
+vmulps            %xmm0, %xmm1, %xmm2
+vmulps            (%rax), %xmm1, %xmm2
+
+vmulps            %ymm0, %ymm1, %ymm2
+vmulps            (%rax), %ymm1, %ymm2
+
+vmulsd            %xmm0, %xmm1, %xmm2
+vmulsd            (%rax), %xmm1, %xmm2
+
+vmulss            %xmm0, %xmm1, %xmm2
+vmulss            (%rax), %xmm1, %xmm2
+
+vorpd             %xmm0, %xmm1, %xmm2
+vorpd             (%rax), %xmm1, %xmm2
+
+vorpd             %ymm0, %ymm1, %ymm2
+vorpd             (%rax), %ymm1, %ymm2
+
+vorps             %xmm0, %xmm1, %xmm2
+vorps             (%rax), %xmm1, %xmm2
+
+vorps             %ymm0, %ymm1, %ymm2
+vorps             (%rax), %ymm1, %ymm2
+
+vpabsb            %xmm0, %xmm2
+vpabsb            (%rax), %xmm2
+
+vpabsd            %xmm0, %xmm2
+vpabsd            (%rax), %xmm2
+
+vpabsw            %xmm0, %xmm2
+vpabsw            (%rax), %xmm2
+
+vpackssdw         %xmm0, %xmm1, %xmm2
+vpackssdw         (%rax), %xmm1, %xmm2
+
+vpacksswb         %xmm0, %xmm1, %xmm2
+vpacksswb         (%rax), %xmm1, %xmm2
+
+vpackusdw         %xmm0, %xmm1, %xmm2
+vpackusdw         (%rax), %xmm1, %xmm2
+
+vpackuswb         %xmm0, %xmm1, %xmm2
+vpackuswb         (%rax), %xmm1, %xmm2
+
+vpaddb            %xmm0, %xmm1, %xmm2
+vpaddb            (%rax), %xmm1, %xmm2
+
+vpaddd            %xmm0, %xmm1, %xmm2
+vpaddd            (%rax), %xmm1, %xmm2
+
+vpaddq            %xmm0, %xmm1, %xmm2
+vpaddq            (%rax), %xmm1, %xmm2
+
+vpaddsb           %xmm0, %xmm1, %xmm2
+vpaddsb           (%rax), %xmm1, %xmm2
+
+vpaddsw           %xmm0, %xmm1, %xmm2
+vpaddsw           (%rax), %xmm1, %xmm2
+
+vpaddusb          %xmm0, %xmm1, %xmm2
+vpaddusb          (%rax), %xmm1, %xmm2
+
+vpaddusw          %xmm0, %xmm1, %xmm2
+vpaddusw          (%rax), %xmm1, %xmm2
+
+vpaddw            %xmm0, %xmm1, %xmm2
+vpaddw            (%rax), %xmm1, %xmm2
+
+vpalignr          $1, %xmm0, %xmm1, %xmm2
+vpalignr          $1, (%rax), %xmm1, %xmm2
+
+vpand             %xmm0, %xmm1, %xmm2
+vpand             (%rax), %xmm1, %xmm2
+
+vpandn            %xmm0, %xmm1, %xmm2
+vpandn            (%rax), %xmm1, %xmm2
+
+vpavgb            %xmm0, %xmm1, %xmm2
+vpavgb            (%rax), %xmm1, %xmm2
+
+vpavgw            %xmm0, %xmm1, %xmm2
+vpavgw            (%rax), %xmm1, %xmm2
+
+vpblendvb         %xmm3, %xmm0, %xmm1, %xmm2
+vpblendvb         %xmm3, (%rax), %xmm1, %xmm2
+
+vpblendw          $11, %xmm0, %xmm1, %xmm2
+vpblendw          $11, (%rax), %xmm1, %xmm2
+
+vpclmulqdq        $11, %xmm0, %xmm1, %xmm2
+vpclmulqdq        $11, (%rax), %xmm1, %xmm2
+
+vpcmpeqb          %xmm0, %xmm1, %xmm2
+vpcmpeqb          (%rax), %xmm1, %xmm2
+
+vpcmpeqd          %xmm0, %xmm1, %xmm2
+vpcmpeqd          (%rax), %xmm1, %xmm2
+
+vpcmpeqq          %xmm0, %xmm1, %xmm2
+vpcmpeqq          (%rax), %xmm1, %xmm2
+
+vpcmpeqw          %xmm0, %xmm1, %xmm2
+vpcmpeqw          (%rax), %xmm1, %xmm2
+
+vpcmpestri        $1, %xmm0, %xmm2
+vpcmpestri        $1, (%rax), %xmm2
+
+vpcmpestrm        $1, %xmm0, %xmm2
+vpcmpestrm        $1, (%rax), %xmm2
+
+vpcmpgtb          %xmm0, %xmm1, %xmm2
+vpcmpgtb          (%rax), %xmm1, %xmm2
+
+vpcmpgtd          %xmm0, %xmm1, %xmm2
+vpcmpgtd          (%rax), %xmm1, %xmm2
+
+vpcmpgtq          %xmm0, %xmm1, %xmm2
+vpcmpgtq          (%rax), %xmm1, %xmm2
+
+vpcmpgtw          %xmm0, %xmm1, %xmm2
+vpcmpgtw          (%rax), %xmm1, %xmm2
+
+vpcmpistri        $1, %xmm0, %xmm2
+vpcmpistri        $1, (%rax), %xmm2
+
+vpcmpistrm        $1, %xmm0, %xmm2
+vpcmpistrm        $1, (%rax), %xmm2
+
+vperm2f128        $1, %ymm0, %ymm1, %ymm2
+vperm2f128        $1, (%rax), %ymm1, %ymm2
+
+vpermilpd         $1, %xmm0, %xmm2
+vpermilpd         $1, (%rax), %xmm2
+vpermilpd         %xmm0, %xmm1, %xmm2
+vpermilpd         (%rax), %xmm1, %xmm2
+
+vpermilpd         $1, %ymm0, %ymm2
+vpermilpd         $1, (%rax), %ymm2
+vpermilpd         %ymm0, %ymm1, %ymm2
+vpermilpd         (%rax), %ymm1, %ymm2
+
+vpermilps         $1, %xmm0, %xmm2
+vpermilps         $1, (%rax), %xmm2
+vpermilps         %xmm0, %xmm1, %xmm2
+vpermilps         (%rax), %xmm1, %xmm2
+
+vpermilps         $1, %ymm0, %ymm2
+vpermilps         $1, (%rax), %ymm2
+vpermilps         %ymm0, %ymm1, %ymm2
+vpermilps         (%rax), %ymm1, %ymm2
+
+vpextrb           $1, %xmm0, %ecx
+vpextrb           $1, %xmm0, (%rax)
+
+vpextrd           $1, %xmm0, %ecx
+vpextrd           $1, %xmm0, (%rax)
+
+vpextrq           $1, %xmm0, %rcx
+vpextrq           $1, %xmm0, (%rax)
+
+vpextrw           $1, %xmm0, %ecx
+vpextrw           $1, %xmm0, (%rax)
+
+vphaddd           %xmm0, %xmm1, %xmm2
+vphaddd           (%rax), %xmm1, %xmm2
+
+vphaddsw          %xmm0, %xmm1, %xmm2
+vphaddsw          (%rax), %xmm1, %xmm2
+
+vphaddw           %xmm0, %xmm1, %xmm2
+vphaddw           (%rax), %xmm1, %xmm2
+
+vphminposuw       %xmm0, %xmm2
+vphminposuw       (%rax), %xmm2
+
+vphsubd           %xmm0, %xmm1, %xmm2
+vphsubd           (%rax), %xmm1, %xmm2
+
+vphsubsw          %xmm0, %xmm1, %xmm2
+vphsubsw          (%rax), %xmm1, %xmm2
+
+vphsubw           %xmm0, %xmm1, %xmm2
+vphsubw           (%rax), %xmm1, %xmm2
+
+vpinsrb           $1, %eax, %xmm1, %xmm2
+vpinsrb           $1, (%rax), %xmm1, %xmm2
+
+vpinsrd           $1, %eax, %xmm1, %xmm2
+vpinsrd           $1, (%rax), %xmm1, %xmm2
+
+vpinsrq           $1, %rax, %xmm1, %xmm2
+vpinsrq           $1, (%rax), %xmm1, %xmm2
+
+vpinsrw           $1, %eax, %xmm1, %xmm2
+vpinsrw           $1, (%rax), %xmm1, %xmm2
+
+vpmaddubsw        %xmm0, %xmm1, %xmm2
+vpmaddubsw        (%rax), %xmm1, %xmm2
+
+vpmaddwd          %xmm0, %xmm1, %xmm2
+vpmaddwd          (%rax), %xmm1, %xmm2
+
+vpmaxsb           %xmm0, %xmm1, %xmm2
+vpmaxsb           (%rax), %xmm1, %xmm2
+
+vpmaxsd           %xmm0, %xmm1, %xmm2
+vpmaxsd           (%rax), %xmm1, %xmm2
+
+vpmaxsw           %xmm0, %xmm1, %xmm2
+vpmaxsw           (%rax), %xmm1, %xmm2
+
+vpmaxub           %xmm0, %xmm1, %xmm2
+vpmaxub           (%rax), %xmm1, %xmm2
+
+vpmaxud           %xmm0, %xmm1, %xmm2
+vpmaxud           (%rax), %xmm1, %xmm2
+
+vpmaxuw           %xmm0, %xmm1, %xmm2
+vpmaxuw           (%rax), %xmm1, %xmm2
+
+vpminsb           %xmm0, %xmm1, %xmm2
+vpminsb           (%rax), %xmm1, %xmm2
+
+vpminsd           %xmm0, %xmm1, %xmm2
+vpminsd           (%rax), %xmm1, %xmm2
+
+vpminsw           %xmm0, %xmm1, %xmm2
+vpminsw           (%rax), %xmm1, %xmm2
+
+vpminub           %xmm0, %xmm1, %xmm2
+vpminub           (%rax), %xmm1, %xmm2
+
+vpminud           %xmm0, %xmm1, %xmm2
+vpminud           (%rax), %xmm1, %xmm2
+
+vpminuw           %xmm0, %xmm1, %xmm2
+vpminuw           (%rax), %xmm1, %xmm2
+
+vpmovmskb         %xmm0, %rcx
+
+vpmovsxbd         %xmm0, %xmm2
+vpmovsxbd         (%rax), %xmm2
+
+vpmovsxbq         %xmm0, %xmm2
+vpmovsxbq         (%rax), %xmm2
+
+vpmovsxbw         %xmm0, %xmm2
+vpmovsxbw         (%rax), %xmm2
+
+vpmovsxdq         %xmm0, %xmm2
+vpmovsxdq         (%rax), %xmm2
+
+vpmovsxwd         %xmm0, %xmm2
+vpmovsxwd         (%rax), %xmm2
+
+vpmovsxwq         %xmm0, %xmm2
+vpmovsxwq         (%rax), %xmm2
+
+vpmovzxbd         %xmm0, %xmm2
+vpmovzxbd         (%rax), %xmm2
+
+vpmovzxbq         %xmm0, %xmm2
+vpmovzxbq         (%rax), %xmm2
+
+vpmovzxbw         %xmm0, %xmm2
+vpmovzxbw         (%rax), %xmm2
+
+vpmovzxdq         %xmm0, %xmm2
+vpmovzxdq         (%rax), %xmm2
+
+vpmovzxwd         %xmm0, %xmm2
+vpmovzxwd         (%rax), %xmm2
+
+vpmovzxwq         %xmm0, %xmm2
+vpmovzxwq         (%rax), %xmm2
+
+vpmuldq           %xmm0, %xmm1, %xmm2
+vpmuldq           (%rax), %xmm1, %xmm2
+
+vpmulhrsw         %xmm0, %xmm1, %xmm2
+vpmulhrsw         (%rax), %xmm1, %xmm2
+
+vpmulhuw          %xmm0, %xmm1, %xmm2
+vpmulhuw          (%rax), %xmm1, %xmm2
+
+vpmulhw           %xmm0, %xmm1, %xmm2
+vpmulhw           (%rax), %xmm1, %xmm2
+
+vpmulld           %xmm0, %xmm1, %xmm2
+vpmulld           (%rax), %xmm1, %xmm2
+
+vpmullw           %xmm0, %xmm1, %xmm2
+vpmullw           (%rax), %xmm1, %xmm2
+
+vpmuludq          %xmm0, %xmm1, %xmm2
+vpmuludq          (%rax), %xmm1, %xmm2
+
+vpor              %xmm0, %xmm1, %xmm2
+vpor              (%rax), %xmm1, %xmm2
+
+vpsadbw           %xmm0, %xmm1, %xmm2
+vpsadbw           (%rax), %xmm1, %xmm2
+
+vpshufb           %xmm0, %xmm1, %xmm2
+vpshufb           (%rax), %xmm1, %xmm2
+
+vpshufd           $1, %xmm0, %xmm2
+vpshufd           $1, (%rax), %xmm2
+
+vpshufhw          $1, %xmm0, %xmm2
+vpshufhw          $1, (%rax), %xmm2
+
+vpshuflw          $1, %xmm0, %xmm2
+vpshuflw          $1, (%rax), %xmm2
+
+vpsignb           %xmm0, %xmm1, %xmm2
+vpsignb           (%rax), %xmm1, %xmm2
+
+vpsignd           %xmm0, %xmm1, %xmm2
+vpsignd           (%rax), %xmm1, %xmm2
+
+vpsignw           %xmm0, %xmm1, %xmm2
+vpsignw           (%rax), %xmm1, %xmm2
+
+vpslld            $1, %xmm0, %xmm2
+vpslld            %xmm0, %xmm1, %xmm2
+vpslld            (%rax), %xmm1, %xmm2
+
+vpslldq           $1, %xmm1, %xmm2
+
+vpsllq            $1, %xmm0, %xmm2
+vpsllq            %xmm0, %xmm1, %xmm2
+vpsllq            (%rax), %xmm1, %xmm2
+
+vpsllw            $1, %xmm0, %xmm2
+vpsllw            %xmm0, %xmm1, %xmm2
+vpsllw            (%rax), %xmm1, %xmm2
+
+vpsrad            $1, %xmm0, %xmm2
+vpsrad            %xmm0, %xmm1, %xmm2
+vpsrad            (%rax), %xmm1, %xmm2
+
+vpsraw            $1, %xmm0, %xmm2
+vpsraw            %xmm0, %xmm1, %xmm2
+vpsraw            (%rax), %xmm1, %xmm2
+
+vpsrld            $1, %xmm0, %xmm2
+vpsrld            %xmm0, %xmm1, %xmm2
+vpsrld            (%rax), %xmm1, %xmm2
+
+vpsrldq           $1, %xmm1, %xmm2
+
+vpsrlq            $1, %xmm0, %xmm2
+vpsrlq            %xmm0, %xmm1, %xmm2
+vpsrlq            (%rax), %xmm1, %xmm2
+
+vpsrlw            $1, %xmm0, %xmm2
+vpsrlw            %xmm0, %xmm1, %xmm2
+vpsrlw            (%rax), %xmm1, %xmm2
+
+vpsubb            %xmm0, %xmm1, %xmm2
+vpsubb            (%rax), %xmm1, %xmm2
+
+vpsubd            %xmm0, %xmm1, %xmm2
+vpsubd            (%rax), %xmm1, %xmm2
+
+vpsubq            %xmm0, %xmm1, %xmm2
+vpsubq            (%rax), %xmm1, %xmm2
+
+vpsubsb           %xmm0, %xmm1, %xmm2
+vpsubsb           (%rax), %xmm1, %xmm2
+
+vpsubsw           %xmm0, %xmm1, %xmm2
+vpsubsw           (%rax), %xmm1, %xmm2
+
+vpsubusb          %xmm0, %xmm1, %xmm2
+vpsubusb          (%rax), %xmm1, %xmm2
+
+vpsubusw          %xmm0, %xmm1, %xmm2
+vpsubusw          (%rax), %xmm1, %xmm2
+
+vpsubw            %xmm0, %xmm1, %xmm2
+vpsubw            (%rax), %xmm1, %xmm2
+
+vptest            %xmm0, %xmm1
+vptest            (%rax), %xmm1
+
+vptest            %ymm0, %ymm1
+vptest            (%rax), %ymm1
+
+vpunpckhbw        %xmm0, %xmm1, %xmm2
+vpunpckhbw        (%rax), %xmm1, %xmm2
+
+vpunpckhdq        %xmm0, %xmm1, %xmm2
+vpunpckhdq        (%rax), %xmm1, %xmm2
+
+vpunpckhqdq       %xmm0, %xmm1, %xmm2
+vpunpckhqdq       (%rax), %xmm1, %xmm2
+
+vpunpckhwd        %xmm0, %xmm1, %xmm2
+vpunpckhwd        (%rax), %xmm1, %xmm2
+
+vpunpcklbw        %xmm0, %xmm1, %xmm2
+vpunpcklbw        (%rax), %xmm1, %xmm2
+
+vpunpckldq        %xmm0, %xmm1, %xmm2
+vpunpckldq        (%rax), %xmm1, %xmm2
+
+vpunpcklqdq       %xmm0, %xmm1, %xmm2
+vpunpcklqdq       (%rax), %xmm1, %xmm2
+
+vpunpcklwd        %xmm0, %xmm1, %xmm2
+vpunpcklwd        (%rax), %xmm1, %xmm2
+
+vpxor             %xmm0, %xmm1, %xmm2
+vpxor             (%rax), %xmm1, %xmm2
+
+vrcpps            %xmm0, %xmm2
+vrcpps            (%rax), %xmm2
+
+vrcpps            %ymm0, %ymm2
+vrcpps            (%rax), %ymm2
+
+vrcpss            %xmm0, %xmm1, %xmm2
+vrcpss            (%rax), %xmm1, %xmm2
+
+vroundpd          $1, %xmm0, %xmm2
+vroundpd          $1, (%rax), %xmm2
+
+vroundpd          $1, %ymm0, %ymm2
+vroundpd          $1, (%rax), %ymm2
+
+vroundps          $1, %xmm0, %xmm2
+vroundps          $1, (%rax), %xmm2
+
+vroundps          $1, %ymm0, %ymm2
+vroundps          $1, (%rax), %ymm2
+
+vroundsd          $1, %xmm0, %xmm1, %xmm2
+vroundsd          $1, (%rax), %xmm1, %xmm2
+
+vroundss          $1, %xmm0, %xmm1, %xmm2
+vroundss          $1, (%rax), %xmm1, %xmm2
+
+vrsqrtps          %xmm0, %xmm2
+vrsqrtps          (%rax), %xmm2
+
+vrsqrtps          %ymm0, %ymm2
+vrsqrtps          (%rax), %ymm2
+
+vrsqrtss          %xmm0, %xmm1, %xmm2
+vrsqrtss          (%rax), %xmm1, %xmm2
+
+vshufpd           $1, %xmm0, %xmm1, %xmm2
+vshufpd           $1, (%rax), %xmm1, %xmm2
+
+vshufpd           $1, %ymm0, %ymm1, %ymm2
+vshufpd           $1, (%rax), %ymm1, %ymm2
+
+vshufps           $1, %xmm0, %xmm1, %xmm2
+vshufps           $1, (%rax), %xmm1, %xmm2
+
+vshufps           $1, %ymm0, %ymm1, %ymm2
+vshufps           $1, (%rax), %ymm1, %ymm2
+
+vsqrtpd           %xmm0, %xmm2
+vsqrtpd           (%rax), %xmm2
+
+vsqrtpd           %ymm0, %ymm2
+vsqrtpd           (%rax), %ymm2
+
+vsqrtps           %xmm0, %xmm2
+vsqrtps           (%rax), %xmm2
+
+vsqrtps           %ymm0, %ymm2
+vsqrtps           (%rax), %ymm2
+
+vsqrtsd           %xmm0, %xmm1, %xmm2
+vsqrtsd           (%rax), %xmm1, %xmm2
+
+vsqrtss           %xmm0, %xmm1, %xmm2
+vsqrtss           (%rax), %xmm1, %xmm2
+
+vstmxcsr          (%rax)
+
+vsubpd            %xmm0, %xmm1, %xmm2
+vsubpd            (%rax), %xmm1, %xmm2
+
+vsubpd            %ymm0, %ymm1, %ymm2
+vsubpd            (%rax), %ymm1, %ymm2
+
+vsubps            %xmm0, %xmm1, %xmm2
+vsubps            (%rax), %xmm1, %xmm2
+
+vsubps            %ymm0, %ymm1, %ymm2
+vsubps            (%rax), %ymm1, %ymm2
+
+vsubsd            %xmm0, %xmm1, %xmm2
+vsubsd            (%rax), %xmm1, %xmm2
+
+vsubss            %xmm0, %xmm1, %xmm2
+vsubss            (%rax), %xmm1, %xmm2
+
+vtestpd          %xmm0, %xmm1
+vtestpd          (%rax), %xmm1
+
+vtestpd          %ymm0, %ymm1
+vtestpd          (%rax), %ymm1
+
+vtestps          %xmm0, %xmm1
+vtestps          (%rax), %xmm1
+
+vtestps          %ymm0, %ymm1
+vtestps          (%rax), %ymm1
+
+vucomisd          %xmm0, %xmm1
+vucomisd          (%rax), %xmm1
+
+vucomiss          %xmm0, %xmm1
+vucomiss          (%rax), %xmm1
+
+vunpckhpd         %xmm0, %xmm1, %xmm2
+vunpckhpd         (%rax), %xmm1, %xmm2
+
+vunpckhpd         %ymm0, %ymm1, %ymm2
+vunpckhpd         (%rax), %ymm1, %ymm2
+
+vunpckhps         %xmm0, %xmm1, %xmm2
+vunpckhps         (%rax), %xmm1, %xmm2
+
+vunpckhps         %ymm0, %ymm1, %ymm2
+vunpckhps         (%rax), %ymm1, %ymm2
+
+vunpcklpd         %xmm0, %xmm1, %xmm2
+vunpcklpd         (%rax), %xmm1, %xmm2
+
+vunpcklpd         %ymm0, %ymm1, %ymm2
+vunpcklpd         (%rax), %ymm1, %ymm2
+
+vunpcklps         %xmm0, %xmm1, %xmm2
+vunpcklps         (%rax), %xmm1, %xmm2
+
+vunpcklps         %ymm0, %ymm1, %ymm2
+vunpcklps         (%rax), %ymm1, %ymm2
+
+vxorpd            %xmm0, %xmm1, %xmm2
+vxorpd            (%rax), %xmm1, %xmm2
+
+vxorpd            %ymm0, %ymm1, %ymm2
+vxorpd            (%rax), %ymm1, %ymm2
+
+vxorps            %xmm0, %xmm1, %xmm2
+vxorps            (%rax), %xmm1, %xmm2
+
+vxorps            %ymm0, %ymm1, %ymm2
+vxorps            (%rax), %ymm1, %ymm2
+
+vzeroall
+vzeroupper
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      3     1.00                        vaddpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   vaddpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vaddpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vaddpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      3     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   vaddps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vaddps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      3     1.00                        vaddsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   vaddsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vaddss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   vaddss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vaddsubpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   vaddsubpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vaddsubpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vaddsubpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      3     1.00                        vaddsubps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   vaddsubps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vaddsubps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vaddsubps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  2      7     1.00                        vaesdec	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  3      13    1.00    *                   vaesdec	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      7     1.00                        vaesdeclast	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  3      13    1.00    *                   vaesdeclast	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      7     1.00                        vaesenc	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  3      13    1.00    *                   vaesenc	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      7     1.00                        vaesenclast	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  3      13    1.00    *                   vaesenclast	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      12    2.00                        vaesimc	%xmm0, %xmm2
+# CHECK-NEXT:  3      18    2.00    *                   vaesimc	(%rax), %xmm2
+# CHECK-NEXT:  1      8     3.67                        vaeskeygenassist	$22, %xmm0, %xmm2
+# CHECK-NEXT:  1      8     3.33    *                   vaeskeygenassist	$22, (%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        vandnpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     1.00    *                   vandnpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vandnpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      8     1.00    *                   vandnpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      1     1.00                        vandnps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     1.00    *                   vandnps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vandnps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      8     1.00    *                   vandnps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      1     1.00                        vandpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     1.00    *                   vandpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vandpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      8     1.00    *                   vandpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      1     1.00                        vandps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     1.00    *                   vandps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vandps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      8     1.00    *                   vandps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      1     0.50                        vblendpd	$11, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vblendpd	$11, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vblendpd	$11, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      8     0.50    *                   vblendpd	$11, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      1     0.50                        vblendps	$11, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vblendps	$11, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vblendps	$11, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      8     0.50    *                   vblendps	$11, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  2      2     1.00                        vblendvpd	%xmm3, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  3      8     1.00    *                   vblendvpd	%xmm3, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vblendvpd	%ymm3, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  3      9     1.00    *                   vblendvpd	%ymm3, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  2      2     1.00                        vblendvps	%xmm3, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  3      8     1.00    *                   vblendvps	%xmm3, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vblendvps	%ymm3, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  3      9     1.00    *                   vblendvps	%ymm3, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vbroadcastf128	(%rax), %ymm2
+# CHECK-NEXT:  1      7     0.50    *                   vbroadcastsd	(%rax), %ymm2
+# CHECK-NEXT:  1      6     0.50    *                   vbroadcastss	(%rax), %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vbroadcastss	(%rax), %ymm2
+# CHECK-NEXT:  1      3     1.00                        vcmppd	$0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   vcmppd	$0, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vcmppd	$0, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vcmppd	$0, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      3     1.00                        vcmpps	$0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   vcmpps	$0, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vcmpps	$0, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vcmpps	$0, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      3     1.00                        vcmpsd	$0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   vcmpsd	$0, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vcmpss	$0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   vcmpss	$0, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vcomisd	%xmm0, %xmm1
+# CHECK-NEXT:  3      8     1.00    *                   vcomisd	(%rax), %xmm1
+# CHECK-NEXT:  2      2     1.00                        vcomiss	%xmm0, %xmm1
+# CHECK-NEXT:  3      8     1.00    *                   vcomiss	(%rax), %xmm1
+# CHECK-NEXT:  2      4     1.00                        vcvtdq2pd	%xmm0, %xmm2
+# CHECK-NEXT:  3      10    1.00    *                   vcvtdq2pd	(%rax), %xmm2
+# CHECK-NEXT:  2      4     1.00                        vcvtdq2pd	%xmm0, %ymm2
+# CHECK-NEXT:  3      10    1.00    *                   vcvtdq2pd	(%rax), %ymm2
+# CHECK-NEXT:  1      3     1.00                        vcvtdq2ps	%xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   vcvtdq2ps	(%rax), %xmm2
+# CHECK-NEXT:  1      3     1.00                        vcvtdq2ps	%ymm0, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vcvtdq2ps	(%rax), %ymm2
+# CHECK-NEXT:  2      4     1.00                        vcvtpd2dq	%xmm0, %xmm2
+# CHECK-NEXT:  3      10    1.00    *                   vcvtpd2dqx	(%rax), %xmm2
+# CHECK-NEXT:  2      4     1.00                        vcvtpd2dq	%ymm0, %xmm2
+# CHECK-NEXT:  3      11    1.00    *                   vcvtpd2dqy	(%rax), %xmm2
+# CHECK-NEXT:  2      4     1.00                        vcvtpd2ps	%xmm0, %xmm2
+# CHECK-NEXT:  3      10    1.00    *                   vcvtpd2psx	(%rax), %xmm2
+# CHECK-NEXT:  2      4     1.00                        vcvtpd2ps	%ymm0, %xmm2
+# CHECK-NEXT:  3      11    1.00    *                   vcvtpd2psy	(%rax), %xmm2
+# CHECK-NEXT:  1      3     1.00                        vcvtps2dq	%xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   vcvtps2dq	(%rax), %xmm2
+# CHECK-NEXT:  1      3     1.00                        vcvtps2dq	%ymm0, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vcvtps2dq	(%rax), %ymm2
+# CHECK-NEXT:  2      2     1.00                        vcvtps2pd	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     1.00    *                   vcvtps2pd	(%rax), %xmm2
+# CHECK-NEXT:  2      2     1.00                        vcvtps2pd	%xmm0, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vcvtps2pd	(%rax), %ymm2
+# CHECK-NEXT:  2      5     1.00                        vcvtsd2si	%xmm0, %ecx
+# CHECK-NEXT:  2      5     1.00                        vcvtsd2si	%xmm0, %rcx
+# CHECK-NEXT:  3      10    1.00    *                   vcvtsd2si	(%rax), %ecx
+# CHECK-NEXT:  3      10    1.00    *                   vcvtsd2si	(%rax), %rcx
+# CHECK-NEXT:  2      4     1.00                        vcvtsd2ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  3      10    1.00    *                   vcvtsd2ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      4     1.00                        vcvtsi2sdl	%ecx, %xmm0, %xmm2
+# CHECK-NEXT:  2      4     1.00                        vcvtsi2sdq	%rcx, %xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   vcvtsi2sdl	(%rax), %xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   vcvtsi2sdq	(%rax), %xmm0, %xmm2
+# CHECK-NEXT:  3      5     2.00                        vcvtsi2ssl	%ecx, %xmm0, %xmm2
+# CHECK-NEXT:  3      5     2.00                        vcvtsi2ssq	%rcx, %xmm0, %xmm2
+# CHECK-NEXT:  3      10    1.00    *                   vcvtsi2ssl	(%rax), %xmm0, %xmm2
+# CHECK-NEXT:  3      10    1.00    *                   vcvtsi2ssq	(%rax), %xmm0, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vcvtss2sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     1.00    *                   vcvtss2sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     1.00                        vcvtss2si	%xmm0, %ecx
+# CHECK-NEXT:  2      5     1.00                        vcvtss2si	%xmm0, %rcx
+# CHECK-NEXT:  3      10    1.00    *                   vcvtss2si	(%rax), %ecx
+# CHECK-NEXT:  3      10    1.00    *                   vcvtss2si	(%rax), %rcx
+# CHECK-NEXT:  2      4     1.00                        vcvttpd2dq	%xmm0, %xmm2
+# CHECK-NEXT:  3      10    1.00    *                   vcvttpd2dqx	(%rax), %xmm2
+# CHECK-NEXT:  2      4     1.00                        vcvttpd2dq	%ymm0, %xmm2
+# CHECK-NEXT:  3      11    1.00    *                   vcvttpd2dqy	(%rax), %xmm2
+# CHECK-NEXT:  1      3     1.00                        vcvttps2dq	%xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   vcvttps2dq	(%rax), %xmm2
+# CHECK-NEXT:  1      3     1.00                        vcvttps2dq	%ymm0, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vcvttps2dq	(%rax), %ymm2
+# CHECK-NEXT:  2      5     1.00                        vcvttsd2si	%xmm0, %ecx
+# CHECK-NEXT:  2      5     1.00                        vcvttsd2si	%xmm0, %rcx
+# CHECK-NEXT:  3      10    1.00    *                   vcvttsd2si	(%rax), %ecx
+# CHECK-NEXT:  3      10    1.00    *                   vcvttsd2si	(%rax), %rcx
+# CHECK-NEXT:  2      5     1.00                        vcvttss2si	%xmm0, %ecx
+# CHECK-NEXT:  2      5     1.00                        vcvttss2si	%xmm0, %rcx
+# CHECK-NEXT:  3      10    1.00    *                   vcvttss2si	(%rax), %ecx
+# CHECK-NEXT:  3      10    1.00    *                   vcvttss2si	(%rax), %rcx
+# CHECK-NEXT:  1      22    22.00                       vdivpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      28    22.00   *                   vdivpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  3      45    44.00                       vdivpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  4      52    44.00   *                   vdivpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      14    14.00                       vdivps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      20    14.00   *                   vdivps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  3      29    28.00                       vdivps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  4      36    28.00   *                   vdivps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      22    22.00                       vdivsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      28    22.00   *                   vdivsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      14    14.00                       vdivss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      20    14.00   *                   vdivss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  3      9     1.00                        vdppd	$22, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  4      15    1.00    *                   vdppd	$22, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  4      12    2.00                        vdpps	$22, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  5      18    2.00    *                   vdpps	$22, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  4      12    2.00                        vdpps	$22, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  5      19    2.00    *                   vdpps	$22, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      1     1.00                        vextractf128	$1, %ymm0, %xmm2
+# CHECK-NEXT:  1      1     1.00           *            vextractf128	$1, %ymm0, (%rax)
+# CHECK-NEXT:  2      3     1.00                        vextractps	$1, %xmm0, %ecx
+# CHECK-NEXT:  3      5     1.00           *            vextractps	$1, %xmm0, (%rax)
+# CHECK-NEXT:  3      5     2.00                        vhaddpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  4      11    2.00    *                   vhaddpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  3      5     2.00                        vhaddpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  4      12    2.00    *                   vhaddpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  3      5     2.00                        vhaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  4      11    2.00    *                   vhaddps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  3      5     2.00                        vhaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  4      12    2.00    *                   vhaddps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  3      5     2.00                        vhsubpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  4      11    2.00    *                   vhsubpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  3      5     2.00                        vhsubpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  4      12    2.00    *                   vhsubpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  3      5     2.00                        vhsubps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  4      11    2.00    *                   vhsubps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  3      5     2.00                        vhsubps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  4      12    2.00    *                   vhsubps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      1     1.00                        vinsertf128	$1, %xmm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     0.50    *                   vinsertf128	$1, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      1     1.00                        vinsertps	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     1.00    *                   vinsertps	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      6     0.50    *                   vlddqu	(%rax), %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vlddqu	(%rax), %ymm2
+# CHECK-NEXT:  4      5     1.00    *      *      U     vldmxcsr	(%rax)
+# CHECK-NEXT:  1      1     1.00    *      *      U     vmaskmovdqu	%xmm0, %xmm1
+# CHECK-NEXT:  3      8     1.00    *                   vmaskmovpd	(%rax), %xmm0, %xmm2
+# CHECK-NEXT:  3      9     1.00    *                   vmaskmovpd	(%rax), %ymm0, %ymm2
+# CHECK-NEXT:  3      5     1.00    *      *            vmaskmovpd	%xmm0, %xmm1, (%rax)
+# CHECK-NEXT:  3      5     1.00    *      *            vmaskmovpd	%ymm0, %ymm1, (%rax)
+# CHECK-NEXT:  3      8     1.00    *                   vmaskmovps	(%rax), %xmm0, %xmm2
+# CHECK-NEXT:  3      9     1.00    *                   vmaskmovps	(%rax), %ymm0, %ymm2
+# CHECK-NEXT:  3      5     1.00    *      *            vmaskmovps	%xmm0, %xmm1, (%rax)
+# CHECK-NEXT:  3      5     1.00    *      *            vmaskmovps	%ymm0, %ymm1, (%rax)
+# CHECK-NEXT:  1      3     1.00                        vmaxpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   vmaxpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vmaxpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vmaxpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      3     1.00                        vmaxps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   vmaxps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vmaxps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vmaxps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      3     1.00                        vmaxsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   vmaxsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vmaxss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   vmaxss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vminpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   vminpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vminpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vminpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      3     1.00                        vminps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   vminps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vminps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vminps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      3     1.00                        vminsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   vminsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vminss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   vminss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vmovapd	%xmm0, %xmm2
+# CHECK-NEXT:  1      1     1.00           *            vmovapd	%xmm0, (%rax)
+# CHECK-NEXT:  1      6     0.50    *                   vmovapd	(%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        vmovapd	%ymm0, %ymm2
+# CHECK-NEXT:  1      1     1.00           *            vmovapd	%ymm0, (%rax)
+# CHECK-NEXT:  1      7     0.50    *                   vmovapd	(%rax), %ymm2
+# CHECK-NEXT:  1      1     1.00                        vmovaps	%xmm0, %xmm2
+# CHECK-NEXT:  1      1     1.00           *            vmovaps	%xmm0, (%rax)
+# CHECK-NEXT:  1      6     0.50    *                   vmovaps	(%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        vmovaps	%ymm0, %ymm2
+# CHECK-NEXT:  1      1     1.00           *            vmovaps	%ymm0, (%rax)
+# CHECK-NEXT:  1      7     0.50    *                   vmovaps	(%rax), %ymm2
+# CHECK-NEXT:  1      1     1.00                        vmovd	%eax, %xmm2
+# CHECK-NEXT:  1      6     0.50    *                   vmovd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     1.00                        vmovd	%xmm0, %ecx
+# CHECK-NEXT:  1      1     1.00           *            vmovd	%xmm0, (%rax)
+# CHECK-NEXT:  1      1     1.00                        vmovddup	%xmm0, %xmm2
+# CHECK-NEXT:  1      6     0.50    *                   vmovddup	(%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        vmovddup	%ymm0, %ymm2
+# CHECK-NEXT:  1      7     0.50    *                   vmovddup	(%rax), %ymm2
+# CHECK-NEXT:  1      1     0.33                        vmovdqa	%xmm0, %xmm2
+# CHECK-NEXT:  1      1     1.00           *            vmovdqa	%xmm0, (%rax)
+# CHECK-NEXT:  1      6     0.50    *                   vmovdqa	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        vmovdqa	%ymm0, %ymm2
+# CHECK-NEXT:  1      1     1.00           *            vmovdqa	%ymm0, (%rax)
+# CHECK-NEXT:  1      7     0.50    *                   vmovdqa	(%rax), %ymm2
+# CHECK-NEXT:  1      1     0.33                        vmovdqu	%xmm0, %xmm2
+# CHECK-NEXT:  1      1     1.00           *            vmovdqu	%xmm0, (%rax)
+# CHECK-NEXT:  1      6     0.50    *                   vmovdqu	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        vmovdqu	%ymm0, %ymm2
+# CHECK-NEXT:  1      1     1.00           *            vmovdqu	%ymm0, (%rax)
+# CHECK-NEXT:  1      7     0.50    *                   vmovdqu	(%rax), %ymm2
+# CHECK-NEXT:  1      1     1.00                        vmovhlps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vmovlhps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00           *            vmovhpd	%xmm0, (%rax)
+# CHECK-NEXT:  2      7     1.00    *                   vmovhpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00           *            vmovhps	%xmm0, (%rax)
+# CHECK-NEXT:  2      7     1.00    *                   vmovhps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00           *            vmovlpd	%xmm0, (%rax)
+# CHECK-NEXT:  2      7     1.00    *                   vmovlpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00           *            vmovlps	%xmm0, (%rax)
+# CHECK-NEXT:  2      7     1.00    *                   vmovlps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     1.00                        vmovmskpd	%xmm0, %ecx
+# CHECK-NEXT:  1      2     1.00                        vmovmskpd	%ymm0, %ecx
+# CHECK-NEXT:  1      2     1.00                        vmovmskps	%xmm0, %ecx
+# CHECK-NEXT:  1      2     1.00                        vmovmskps	%ymm0, %ecx
+# CHECK-NEXT:  1      1     1.00           *            vmovntdq	%xmm0, (%rax)
+# CHECK-NEXT:  1      1     1.00           *            vmovntdq	%ymm0, (%rax)
+# CHECK-NEXT:  1      6     0.50    *                   vmovntdqa	(%rax), %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vmovntdqa	(%rax), %ymm2
+# CHECK-NEXT:  1      1     1.00           *            vmovntpd	%xmm0, (%rax)
+# CHECK-NEXT:  1      1     1.00           *            vmovntpd	%ymm0, (%rax)
+# CHECK-NEXT:  1      1     1.00           *            vmovntps	%xmm0, (%rax)
+# CHECK-NEXT:  1      1     1.00           *            vmovntps	%ymm0, (%rax)
+# CHECK-NEXT:  1      1     0.33                        vmovq	%xmm0, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vmovq	%rax, %xmm2
+# CHECK-NEXT:  1      6     0.50    *                   vmovq	(%rax), %xmm2
+# CHECK-NEXT:  1      2     1.00                        vmovq	%xmm0, %rcx
+# CHECK-NEXT:  1      1     1.00           *            vmovq	%xmm0, (%rax)
+# CHECK-NEXT:  1      1     1.00                        vmovsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00           *            vmovsd	%xmm0, (%rax)
+# CHECK-NEXT:  1      6     0.50    *                   vmovsd	(%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        vmovshdup	%xmm0, %xmm2
+# CHECK-NEXT:  1      6     0.50    *                   vmovshdup	(%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        vmovshdup	%ymm0, %ymm2
+# CHECK-NEXT:  1      7     0.50    *                   vmovshdup	(%rax), %ymm2
+# CHECK-NEXT:  1      1     1.00                        vmovsldup	%xmm0, %xmm2
+# CHECK-NEXT:  1      6     0.50    *                   vmovsldup	(%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        vmovsldup	%ymm0, %ymm2
+# CHECK-NEXT:  1      7     0.50    *                   vmovsldup	(%rax), %ymm2
+# CHECK-NEXT:  1      1     1.00                        vmovss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00           *            vmovss	%xmm0, (%rax)
+# CHECK-NEXT:  1      6     0.50    *                   vmovss	(%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        vmovupd	%xmm0, %xmm2
+# CHECK-NEXT:  1      1     1.00           *            vmovupd	%xmm0, (%rax)
+# CHECK-NEXT:  1      6     0.50    *                   vmovupd	(%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        vmovupd	%ymm0, %ymm2
+# CHECK-NEXT:  1      1     1.00           *            vmovupd	%ymm0, (%rax)
+# CHECK-NEXT:  1      7     0.50    *                   vmovupd	(%rax), %ymm2
+# CHECK-NEXT:  1      1     1.00                        vmovups	%xmm0, %xmm2
+# CHECK-NEXT:  1      1     1.00           *            vmovups	%xmm0, (%rax)
+# CHECK-NEXT:  1      6     0.50    *                   vmovups	(%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        vmovups	%ymm0, %ymm2
+# CHECK-NEXT:  1      1     1.00           *            vmovups	%ymm0, (%rax)
+# CHECK-NEXT:  1      7     0.50    *                   vmovups	(%rax), %ymm2
+# CHECK-NEXT:  3      7     1.00                        vmpsadbw	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  4      13    1.00    *                   vmpsadbw	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vmulpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      11    1.00    *                   vmulpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vmulpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      12    1.00    *                   vmulpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     1.00                        vmulps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      11    1.00    *                   vmulps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vmulps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      12    1.00    *                   vmulps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     1.00                        vmulsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      11    1.00    *                   vmulsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vmulss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      11    1.00    *                   vmulss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vorpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     1.00    *                   vorpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vorpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      8     1.00    *                   vorpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      1     1.00                        vorps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     1.00    *                   vorps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vorps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      8     1.00    *                   vorps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      1     0.50                        vpabsb	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpabsb	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpabsd	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpabsd	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpabsw	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpabsw	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpackssdw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpackssdw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpacksswb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpacksswb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpackusdw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpackusdw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpackuswb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpackuswb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpaddb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpaddb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpaddd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpaddd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpaddq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpaddq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpaddsb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpaddsb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpaddsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpaddsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpaddusb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpaddusb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpaddusw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpaddusw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpaddw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpaddw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpalignr	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpalignr	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.33                        vpand	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpand	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.33                        vpandn	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpandn	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpavgb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpavgb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpavgw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpavgw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vpblendvb	%xmm3, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  3      8     1.00    *                   vpblendvb	%xmm3, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpblendw	$11, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpblendw	$11, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      14    6.00                        vpclmulqdq	$11, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      14    5.67    *                   vpclmulqdq	$11, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpcmpeqb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpcmpeqb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpcmpeqd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpcmpeqd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpcmpeqq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpcmpeqq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpcmpeqw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpcmpeqw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      4     2.67                        vpcmpestri	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      4     2.33    *                   vpcmpestri	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      11    2.67                        vpcmpestrm	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      11    2.33    *                   vpcmpestrm	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpcmpgtb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpcmpgtb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpcmpgtd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpcmpgtd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vpcmpgtq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      11    1.00    *                   vpcmpgtq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpcmpgtw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpcmpgtw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  3      11    3.00                        vpcmpistri	$1, %xmm0, %xmm2
+# CHECK-NEXT:  4      17    3.00    *                   vpcmpistri	$1, (%rax), %xmm2
+# CHECK-NEXT:  3      11    3.00                        vpcmpistrm	$1, %xmm0, %xmm2
+# CHECK-NEXT:  4      17    3.00    *                   vpcmpistrm	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        vperm2f128	$1, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      8     1.00    *                   vperm2f128	$1, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      1     1.00                        vpermilpd	$1, %xmm0, %xmm2
+# CHECK-NEXT:  2      7     1.00    *                   vpermilpd	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        vpermilpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     1.00    *                   vpermilpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vpermilpd	$1, %ymm0, %ymm2
+# CHECK-NEXT:  2      8     1.00    *                   vpermilpd	$1, (%rax), %ymm2
+# CHECK-NEXT:  1      1     1.00                        vpermilpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      8     1.00    *                   vpermilpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      1     1.00                        vpermilps	$1, %xmm0, %xmm2
+# CHECK-NEXT:  2      7     1.00    *                   vpermilps	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        vpermilps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     1.00    *                   vpermilps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vpermilps	$1, %ymm0, %ymm2
+# CHECK-NEXT:  2      8     1.00    *                   vpermilps	$1, (%rax), %ymm2
+# CHECK-NEXT:  1      1     1.00                        vpermilps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      8     1.00    *                   vpermilps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  2      3     1.00                        vpextrb	$1, %xmm0, %ecx
+# CHECK-NEXT:  3      5     1.00           *            vpextrb	$1, %xmm0, (%rax)
+# CHECK-NEXT:  2      3     1.00                        vpextrd	$1, %xmm0, %ecx
+# CHECK-NEXT:  4      5     1.00           *            vpextrd	$1, %xmm0, (%rax)
+# CHECK-NEXT:  2      3     1.00                        vpextrq	$1, %xmm0, %rcx
+# CHECK-NEXT:  4      5     1.00           *            vpextrq	$1, %xmm0, (%rax)
+# CHECK-NEXT:  2      3     1.00                        vpextrw	$1, %xmm0, %ecx
+# CHECK-NEXT:  3      5     1.00           *            vpextrw	$1, %xmm0, (%rax)
+# CHECK-NEXT:  3      3     1.50                        vphaddd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  4      9     1.50    *                   vphaddd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  3      3     1.50                        vphaddsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  4      9     1.50    *                   vphaddsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  3      3     1.50                        vphaddw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  4      9     1.50    *                   vphaddw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vphminposuw	%xmm0, %xmm2
+# CHECK-NEXT:  2      11    1.00    *                   vphminposuw	(%rax), %xmm2
+# CHECK-NEXT:  3      3     1.50                        vphsubd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  4      9     1.50    *                   vphsubd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  3      3     1.50                        vphsubsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  4      9     1.50    *                   vphsubsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  3      3     1.50                        vphsubw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  4      9     1.50    *                   vphsubw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vpinsrb	$1, %eax, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpinsrb	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vpinsrd	$1, %eax, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpinsrd	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vpinsrq	$1, %rax, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpinsrq	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vpinsrw	$1, %eax, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpinsrw	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vpmaddubsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      11    1.00    *                   vpmaddubsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vpmaddwd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      11    1.00    *                   vpmaddwd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpmaxsb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpmaxsb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpmaxsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpmaxsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpmaxsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpmaxsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpmaxub	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpmaxub	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpmaxud	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpmaxud	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpmaxuw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpmaxuw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpminsb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpminsb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpminsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpminsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpminsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpminsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpminub	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpminub	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpminud	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpminud	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpminuw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpminuw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     1.00                        vpmovmskb	%xmm0, %ecx
+# CHECK-NEXT:  1      1     0.50                        vpmovsxbd	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpmovsxbd	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpmovsxbq	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpmovsxbq	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpmovsxbw	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpmovsxbw	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpmovsxdq	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpmovsxdq	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpmovsxwd	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpmovsxwd	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpmovsxwq	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpmovsxwq	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpmovzxbd	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpmovzxbd	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpmovzxbq	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpmovzxbq	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpmovzxbw	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpmovzxbw	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpmovzxdq	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpmovzxdq	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpmovzxwd	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpmovzxwd	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpmovzxwq	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpmovzxwq	(%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        vpmuldq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      11    1.00    *                   vpmuldq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vpmulhrsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      11    1.00    *                   vpmulhrsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vpmulhuw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      11    1.00    *                   vpmulhuw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vpmulhw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      11    1.00    *                   vpmulhw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vpmulld	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      11    1.00    *                   vpmulld	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vpmullw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      11    1.00    *                   vpmullw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vpmuludq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      11    1.00    *                   vpmuludq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.33                        vpor	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpor	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vpsadbw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      11    1.00    *                   vpsadbw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpshufb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpshufb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpshufd	$1, %xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpshufd	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpshufhw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpshufhw	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpshuflw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpshuflw	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpsignb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpsignb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpsignd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpsignd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpsignw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpsignw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vpslld	$1, %xmm0, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vpslld	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  3      8     1.00    *                   vpslld	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpslldq	$1, %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vpsllq	$1, %xmm0, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vpsllq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  3      8     1.00    *                   vpsllq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vpsllw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vpsllw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  3      8     1.00    *                   vpsllw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vpsrad	$1, %xmm0, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vpsrad	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  3      8     1.00    *                   vpsrad	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vpsraw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vpsraw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  3      8     1.00    *                   vpsraw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vpsrld	$1, %xmm0, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vpsrld	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  3      8     1.00    *                   vpsrld	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpsrldq	$1, %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vpsrlq	$1, %xmm0, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vpsrlq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  3      8     1.00    *                   vpsrlq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vpsrlw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vpsrlw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  3      8     1.00    *                   vpsrlw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpsubb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpsubb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpsubd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpsubd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpsubq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpsubq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpsubsb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpsubsb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpsubsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpsubsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpsubusb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpsubusb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpsubusw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpsubusw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpsubw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpsubw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vptest	%xmm0, %xmm1
+# CHECK-NEXT:  3      8     1.00    *                   vptest	(%rax), %xmm1
+# CHECK-NEXT:  2      2     1.00                        vptest	%ymm0, %ymm1
+# CHECK-NEXT:  3      9     1.00    *                   vptest	(%rax), %ymm1
+# CHECK-NEXT:  1      1     0.50                        vpunpckhbw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpunpckhbw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpunpckhdq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpunpckhdq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpunpckhqdq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpunpckhqdq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpunpckhwd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpunpckhwd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpunpcklbw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpunpcklbw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpunpckldq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpunpckldq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpunpcklqdq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpunpcklqdq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpunpcklwd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpunpcklwd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.33                        vpxor	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   vpxor	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vrcpps	%xmm0, %xmm2
+# CHECK-NEXT:  2      11    1.00    *                   vrcpps	(%rax), %xmm2
+# CHECK-NEXT:  3      7     2.00                        vrcpps	%ymm0, %ymm2
+# CHECK-NEXT:  4      14    2.00    *                   vrcpps	(%rax), %ymm2
+# CHECK-NEXT:  1      5     1.00                        vrcpss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      11    1.00    *                   vrcpss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vroundpd	$1, %xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   vroundpd	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      3     1.00                        vroundpd	$1, %ymm0, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vroundpd	$1, (%rax), %ymm2
+# CHECK-NEXT:  1      3     1.00                        vroundps	$1, %xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   vroundps	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      3     1.00                        vroundps	$1, %ymm0, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vroundps	$1, (%rax), %ymm2
+# CHECK-NEXT:  1      3     1.00                        vroundsd	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   vroundsd	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vroundss	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   vroundss	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vrsqrtps	%xmm0, %xmm2
+# CHECK-NEXT:  2      11    1.00    *                   vrsqrtps	(%rax), %xmm2
+# CHECK-NEXT:  3      7     2.00                        vrsqrtps	%ymm0, %ymm2
+# CHECK-NEXT:  4      14    2.00    *                   vrsqrtps	(%rax), %ymm2
+# CHECK-NEXT:  1      5     1.00                        vrsqrtss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      11    1.00    *                   vrsqrtss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vshufpd	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     1.00    *                   vshufpd	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vshufpd	$1, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      8     1.00    *                   vshufpd	$1, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      1     1.00                        vshufps	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     1.00    *                   vshufps	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vshufps	$1, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      8     1.00    *                   vshufps	$1, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      21    21.00                       vsqrtpd	%xmm0, %xmm2
+# CHECK-NEXT:  2      27    21.00   *                   vsqrtpd	(%rax), %xmm2
+# CHECK-NEXT:  3      45    44.00                       vsqrtpd	%ymm0, %ymm2
+# CHECK-NEXT:  4      52    44.00   *                   vsqrtpd	(%rax), %ymm2
+# CHECK-NEXT:  1      14    14.00                       vsqrtps	%xmm0, %xmm2
+# CHECK-NEXT:  2      20    14.00   *                   vsqrtps	(%rax), %xmm2
+# CHECK-NEXT:  3      29    28.00                       vsqrtps	%ymm0, %ymm2
+# CHECK-NEXT:  4      36    28.00   *                   vsqrtps	(%rax), %ymm2
+# CHECK-NEXT:  1      21    21.00                       vsqrtsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      27    21.00   *                   vsqrtsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      14    14.00                       vsqrtss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      20    14.00   *                   vsqrtss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  4      5     1.00    *      *      U     vstmxcsr	(%rax)
+# CHECK-NEXT:  1      3     1.00                        vsubpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   vsubpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vsubpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vsubpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      3     1.00                        vsubps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   vsubps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vsubps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vsubps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      3     1.00                        vsubsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   vsubsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vsubss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   vsubss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vtestpd	%xmm0, %xmm1
+# CHECK-NEXT:  2      7     1.00    *                   vtestpd	(%rax), %xmm1
+# CHECK-NEXT:  1      1     1.00                        vtestpd	%ymm0, %ymm1
+# CHECK-NEXT:  2      8     1.00    *                   vtestpd	(%rax), %ymm1
+# CHECK-NEXT:  1      1     1.00                        vtestps	%xmm0, %xmm1
+# CHECK-NEXT:  2      7     1.00    *                   vtestps	(%rax), %xmm1
+# CHECK-NEXT:  1      1     1.00                        vtestps	%ymm0, %ymm1
+# CHECK-NEXT:  2      8     1.00    *                   vtestps	(%rax), %ymm1
+# CHECK-NEXT:  2      2     1.00                        vucomisd	%xmm0, %xmm1
+# CHECK-NEXT:  3      8     1.00    *                   vucomisd	(%rax), %xmm1
+# CHECK-NEXT:  2      2     1.00                        vucomiss	%xmm0, %xmm1
+# CHECK-NEXT:  3      8     1.00    *                   vucomiss	(%rax), %xmm1
+# CHECK-NEXT:  1      1     1.00                        vunpckhpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     1.00    *                   vunpckhpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vunpckhpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      8     1.00    *                   vunpckhpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      1     1.00                        vunpckhps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     1.00    *                   vunpckhps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vunpckhps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      8     1.00    *                   vunpckhps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      1     1.00                        vunpcklpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     1.00    *                   vunpcklpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vunpcklpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      8     1.00    *                   vunpcklpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      1     1.00                        vunpcklps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     1.00    *                   vunpcklps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vunpcklps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      8     1.00    *                   vunpcklps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      1     1.00                        vxorpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     1.00    *                   vxorpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vxorpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      8     1.00    *                   vxorpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      1     1.00                        vxorps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     1.00    *                   vxorps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vxorps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      8     1.00    *                   vxorps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  20     9     2.00    *      *      U     vzeroall
+# CHECK-NEXT:  1      100   0.33    *      *      U     vzeroupper
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -     572.00 246.83 317.33 39.00  365.83 179.50 179.50
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vaddpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vaddpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vaddps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vaddps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vaddsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vaddss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddsubpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vaddsubpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddsubpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vaddsubpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddsubps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vaddsubps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddsubps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vaddsubps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.33   0.33    -     1.33    -      -     vaesdec	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.33   0.33    -     1.33   0.50   0.50   vaesdec	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.33   0.33    -     1.33    -      -     vaesdeclast	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.33   0.33    -     1.33   0.50   0.50   vaesdeclast	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.33   0.33    -     1.33    -      -     vaesenc	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.33   0.33    -     1.33   0.50   0.50   vaesenc	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.33   0.33    -     1.33    -      -     vaesenclast	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.33   0.33    -     1.33   0.50   0.50   vaesenclast	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     2.00    -      -     vaesimc	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     2.00   0.50   0.50   vaesimc	(%rax), %xmm2
+# CHECK-NEXT:  -      -     3.67   3.67    -     3.67    -      -     vaeskeygenassist	$22, %xmm0, %xmm2
+# CHECK-NEXT:  -      -     3.33   3.33    -     3.33   0.50   0.50   vaeskeygenassist	$22, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vandnpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vandnpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vandnpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vandnpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vandnps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vandnps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vandnps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vandnps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vandpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vandpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vandpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vandpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vandps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vandps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vandps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vandps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     vblendpd	$11, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50    -      -     0.50   0.50   0.50   vblendpd	$11, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     vblendpd	$11, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50    -      -     0.50   0.50   0.50   vblendpd	$11, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     vblendps	$11, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50    -      -     0.50   0.50   0.50   vblendps	$11, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     vblendps	$11, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50    -      -     0.50   0.50   0.50   vblendps	$11, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     vblendvpd	%xmm3, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -     1.00   0.50   0.50   vblendvpd	%xmm3, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     vblendvpd	%ymm3, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     1.00    -      -     1.00   0.50   0.50   vblendvpd	%ymm3, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     vblendvps	%xmm3, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -     1.00   0.50   0.50   vblendvps	%xmm3, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     vblendvps	%ymm3, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     1.00    -      -     1.00   0.50   0.50   vblendvps	%ymm3, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vbroadcastf128	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vbroadcastsd	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vbroadcastss	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vbroadcastss	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vcmppd	$0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vcmppd	$0, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vcmppd	$0, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vcmppd	$0, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vcmpps	$0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vcmpps	$0, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vcmpps	$0, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vcmpps	$0, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vcmpsd	$0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vcmpsd	$0, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vcmpss	$0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vcmpss	$0, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     vcomisd	%xmm0, %xmm1
+# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   vcomisd	(%rax), %xmm1
+# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     vcomiss	%xmm0, %xmm1
+# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   vcomiss	(%rax), %xmm1
+# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     vcvtdq2pd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   vcvtdq2pd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     vcvtdq2pd	%xmm0, %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   vcvtdq2pd	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vcvtdq2ps	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vcvtdq2ps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vcvtdq2ps	%ymm0, %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vcvtdq2ps	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     vcvtpd2dq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   vcvtpd2dqx	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     vcvtpd2dq	%ymm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   vcvtpd2dqy	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     vcvtpd2ps	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   vcvtpd2psx	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     vcvtpd2ps	%ymm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   vcvtpd2psy	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vcvtps2dq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vcvtps2dq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vcvtps2dq	%ymm0, %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vcvtps2dq	(%rax), %ymm2
+# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     vcvtps2pd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vcvtps2pd	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     vcvtps2pd	%xmm0, %ymm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vcvtps2pd	(%rax), %ymm2
+# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     vcvtsd2si	%xmm0, %ecx
+# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     vcvtsd2si	%xmm0, %rcx
+# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   vcvtsd2si	(%rax), %ecx
+# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   vcvtsd2si	(%rax), %rcx
+# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     vcvtsd2ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   vcvtsd2ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     vcvtsi2sdl	%ecx, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     vcvtsi2sdq	%rcx, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vcvtsi2sdl	(%rax), %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vcvtsi2sdq	(%rax), %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     2.00    -      -     vcvtsi2ssl	%ecx, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     2.00    -      -     vcvtsi2ssq	%rcx, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   vcvtsi2ssl	(%rax), %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   vcvtsi2ssq	(%rax), %xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vcvtss2sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vcvtss2sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     vcvtss2si	%xmm0, %ecx
+# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     vcvtss2si	%xmm0, %rcx
+# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   vcvtss2si	(%rax), %ecx
+# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   vcvtss2si	(%rax), %rcx
+# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     vcvttpd2dq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   vcvttpd2dqx	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     vcvttpd2dq	%ymm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   vcvttpd2dqy	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vcvttps2dq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vcvttps2dq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vcvttps2dq	%ymm0, %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vcvttps2dq	(%rax), %ymm2
+# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     vcvttsd2si	%xmm0, %ecx
+# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     vcvttsd2si	%xmm0, %rcx
+# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   vcvttsd2si	(%rax), %ecx
+# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   vcvttsd2si	(%rax), %rcx
+# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     vcvttss2si	%xmm0, %ecx
+# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     vcvttss2si	%xmm0, %rcx
+# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   vcvttss2si	(%rax), %ecx
+# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   vcvttss2si	(%rax), %rcx
+# CHECK-NEXT:  -     22.00  1.00    -      -      -      -      -     vdivpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -     22.00  1.00    -      -      -     0.50   0.50   vdivpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -     44.00  2.50    -      -     0.50    -      -     vdivpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -     44.00  2.50    -      -     0.50   0.50   0.50   vdivpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -     14.00  1.00    -      -      -      -      -     vdivps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -     14.00  1.00    -      -      -     0.50   0.50   vdivps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -     28.00  2.50    -      -     0.50    -      -     vdivps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -     28.00  2.50    -      -     0.50   0.50   0.50   vdivps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -     22.00  1.00    -      -      -      -      -     vdivsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -     22.00  1.00    -      -      -     0.50   0.50   vdivsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -     14.00  1.00    -      -      -      -      -     vdivss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -     14.00  1.00    -      -      -     0.50   0.50   vdivss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -     vdppd	$22, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00   1.00    -     1.00   0.50   0.50   vdppd	$22, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00   2.00    -     1.00    -      -     vdpps	$22, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00   2.00    -     1.00   0.50   0.50   vdpps	$22, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00   2.00    -     1.00    -      -     vdpps	$22, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     1.00   2.00    -     1.00   0.50   0.50   vdpps	$22, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vextractf128	$1, %ymm0, %xmm2
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vextractf128	$1, %ymm0, (%rax)
+# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     vextractps	$1, %xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   0.50   0.50   vextractps	$1, %xmm0, (%rax)
+# CHECK-NEXT:  -      -      -     1.00    -     2.00    -      -     vhaddpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     2.00   0.50   0.50   vhaddpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     2.00    -      -     vhaddpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -     2.00   0.50   0.50   vhaddpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -     2.00    -      -     vhaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     2.00   0.50   0.50   vhaddps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     2.00    -      -     vhaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -     2.00   0.50   0.50   vhaddps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -     2.00    -      -     vhsubpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     2.00   0.50   0.50   vhsubpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     2.00    -      -     vhsubpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -     2.00   0.50   0.50   vhsubpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -     2.00    -      -     vhsubps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     2.00   0.50   0.50   vhsubps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     2.00    -      -     vhsubps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -     2.00   0.50   0.50   vhsubps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vinsertf128	$1, %xmm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50    -      -     0.50   0.50   0.50   vinsertf128	$1, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vinsertps	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vinsertps	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vlddqu	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vlddqu	(%rax), %ymm2
+# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   0.50   0.50   vldmxcsr	(%rax)
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmaskmovdqu	%xmm0, %xmm1
+# CHECK-NEXT:  -      -     1.00    -      -     1.00   0.50   0.50   vmaskmovpd	(%rax), %xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -     1.00   0.50   0.50   vmaskmovpd	(%rax), %ymm0, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50   1.00    -     0.50   0.50   vmaskmovpd	%xmm0, %xmm1, (%rax)
+# CHECK-NEXT:  -      -     0.50   0.50   1.00    -     0.50   0.50   vmaskmovpd	%ymm0, %ymm1, (%rax)
+# CHECK-NEXT:  -      -     1.00    -      -     1.00   0.50   0.50   vmaskmovps	(%rax), %xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -     1.00   0.50   0.50   vmaskmovps	(%rax), %ymm0, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50   1.00    -     0.50   0.50   vmaskmovps	%xmm0, %xmm1, (%rax)
+# CHECK-NEXT:  -      -     0.50   0.50   1.00    -     0.50   0.50   vmaskmovps	%ymm0, %ymm1, (%rax)
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vmaxpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vmaxpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vmaxpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vmaxpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vmaxps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vmaxps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vmaxps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vmaxps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vmaxsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vmaxsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vmaxss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vmaxss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vminpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vminpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vminpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vminpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vminps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vminps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vminps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vminps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vminsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vminsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vminss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vminss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovapd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovapd	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovapd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovapd	%ymm0, %ymm2
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovapd	%ymm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovapd	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovaps	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovaps	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovaps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovaps	%ymm0, %ymm2
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovaps	%ymm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovaps	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovd	%eax, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovd	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmovd	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovd	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovddup	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovddup	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovddup	%ymm0, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovddup	(%rax), %ymm2
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     vmovdqa	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovdqa	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovdqa	(%rax), %xmm2
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     vmovdqa	%ymm0, %ymm2
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovdqa	%ymm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovdqa	(%rax), %ymm2
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     vmovdqu	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovdqu	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovdqu	(%rax), %xmm2
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     vmovdqu	%ymm0, %ymm2
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovdqu	%ymm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovdqu	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovhlps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovlhps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovhpd	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vmovhpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovhps	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vmovhps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovlpd	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vmovlpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovlps	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vmovlps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmovmskpd	%xmm0, %ecx
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmovmskpd	%ymm0, %ecx
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmovmskps	%xmm0, %ecx
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmovmskps	%ymm0, %ecx
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovntdq	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovntdq	%ymm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovntdqa	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovntdqa	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovntpd	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovntpd	%ymm0, (%rax)
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovntps	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovntps	%ymm0, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     vmovq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovq	%rax, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovq	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmovq	%xmm0, %rcx
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovq	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovsd	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovshdup	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovshdup	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovshdup	%ymm0, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovshdup	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovsldup	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovsldup	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovsldup	%ymm0, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovsldup	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovss	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovss	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovupd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovupd	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovupd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovupd	%ymm0, %ymm2
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovupd	%ymm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovupd	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovups	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovups	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovups	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovups	%ymm0, %ymm2
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovups	%ymm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovups	(%rax), %ymm2
+# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -     vmpsadbw	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00   1.00    -     1.00   0.50   0.50   vmpsadbw	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmulpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vmulpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmulpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vmulpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmulps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vmulps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmulps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vmulps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmulsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vmulsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmulss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vmulss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vorpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vorpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vorpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vorpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vorps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vorps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vorps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vorps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpabsb	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpabsb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpabsd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpabsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpabsw	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpabsw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpackssdw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpackssdw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpacksswb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpacksswb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpackusdw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpackusdw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpackuswb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpackuswb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpaddb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpaddb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpaddd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpaddd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpaddq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpaddq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpaddsb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpaddsb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpaddsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpaddsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpaddusb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpaddusb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpaddusw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpaddusw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpaddw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpaddw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpalignr	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpalignr	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     vpand	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   vpand	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     vpandn	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   vpandn	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpavgb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpavgb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpavgw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpavgw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     vpblendvb	%xmm3, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   vpblendvb	%xmm3, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpblendw	$11, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpblendw	$11, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     6.00   6.00    -     6.00    -      -     vpclmulqdq	$11, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     5.67   5.67    -     5.67   0.50   0.50   vpclmulqdq	$11, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpcmpeqb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpcmpeqb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpcmpeqd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpcmpeqd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpcmpeqq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpcmpeqq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpcmpeqw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpcmpeqw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     2.67   2.67    -     2.67    -      -     vpcmpestri	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -     2.33   2.33    -     2.33   0.50   0.50   vpcmpestri	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -     2.67   2.67    -     2.67    -      -     vpcmpestrm	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -     2.33   2.33    -     2.33   0.50   0.50   vpcmpestrm	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpcmpgtb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpcmpgtb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpcmpgtd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpcmpgtd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpcmpgtq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpcmpgtq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpcmpgtw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpcmpgtw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     3.00    -      -      -      -      -     vpcmpistri	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -     3.00    -      -      -     0.50   0.50   vpcmpistri	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -     3.00    -      -      -      -      -     vpcmpistrm	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -     3.00    -      -      -     0.50   0.50   vpcmpistrm	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vperm2f128	$1, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vperm2f128	$1, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpermilpd	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vpermilpd	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpermilpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vpermilpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpermilpd	$1, %ymm0, %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vpermilpd	$1, (%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpermilpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vpermilpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpermilps	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vpermilps	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpermilps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vpermilps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpermilps	$1, %ymm0, %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vpermilps	$1, (%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpermilps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vpermilps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     vpextrb	$1, %xmm0, %ecx
+# CHECK-NEXT:  -      -      -     0.50   1.00   0.50   0.50   0.50   vpextrb	$1, %xmm0, (%rax)
+# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     vpextrd	$1, %xmm0, %ecx
+# CHECK-NEXT:  -      -     1.00   0.50   1.00   0.50   0.50   0.50   vpextrd	$1, %xmm0, (%rax)
+# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     vpextrq	$1, %xmm0, %rcx
+# CHECK-NEXT:  -      -     1.00   0.50   1.00   0.50   0.50   0.50   vpextrq	$1, %xmm0, (%rax)
+# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     vpextrw	$1, %xmm0, %ecx
+# CHECK-NEXT:  -      -      -     0.50   1.00   0.50   0.50   0.50   vpextrw	$1, %xmm0, (%rax)
+# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     vphaddd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   vphaddd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     vphaddsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   vphaddsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     vphaddw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   vphaddw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vphminposuw	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vphminposuw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     vphsubd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   vphsubd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     vphsubsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   vphsubsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     vphsubw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   vphsubw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     1.50    -      -     vpinsrb	$1, %eax, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpinsrb	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     1.50    -      -     vpinsrd	$1, %eax, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpinsrd	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     1.50    -      -     vpinsrq	$1, %rax, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpinsrq	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     1.50    -      -     vpinsrw	$1, %eax, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpinsrw	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmaddubsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmaddubsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmaddwd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmaddwd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpmaxsb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpmaxsb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpmaxsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpmaxsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpmaxsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpmaxsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpmaxub	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpmaxub	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpmaxud	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpmaxud	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpmaxuw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpmaxuw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpminsb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpminsb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpminsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpminsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpminsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpminsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpminub	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpminub	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpminud	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpminud	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpminuw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpminuw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmovmskb	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpmovsxbd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpmovsxbd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpmovsxbq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpmovsxbq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpmovsxbw	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpmovsxbw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpmovsxdq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpmovsxdq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpmovsxwd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpmovsxwd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpmovsxwq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpmovsxwq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpmovzxbd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpmovzxbd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpmovzxbq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpmovzxbq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpmovzxbw	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpmovzxbw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpmovzxdq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpmovzxdq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpmovzxwd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpmovzxwd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpmovzxwq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpmovzxwq	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmuldq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmuldq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmulhrsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmulhrsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmulhuw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmulhuw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmulhw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmulhw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmulld	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmulld	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmullw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmullw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmuludq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmuludq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     vpor	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   vpor	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpsadbw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpsadbw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpshufb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpshufb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpshufd	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpshufd	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpshufhw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpshufhw	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpshuflw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpshuflw	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpsignb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpsignb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpsignd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpsignd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpsignw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpsignw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpslld	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     vpslld	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00   0.50    -     0.50   0.50   0.50   vpslld	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpslldq	$1, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpsllq	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     vpsllq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00   0.50    -     0.50   0.50   0.50   vpsllq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpsllw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     vpsllw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00   0.50    -     0.50   0.50   0.50   vpsllw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpsrad	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     vpsrad	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00   0.50    -     0.50   0.50   0.50   vpsrad	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpsraw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     vpsraw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00   0.50    -     0.50   0.50   0.50   vpsraw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpsrld	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     vpsrld	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00   0.50    -     0.50   0.50   0.50   vpsrld	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpsrldq	$1, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpsrlq	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     vpsrlq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00   0.50    -     0.50   0.50   0.50   vpsrlq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpsrlw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     vpsrlw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00   0.50    -     0.50   0.50   0.50   vpsrlw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpsubb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpsubb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpsubd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpsubd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpsubq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpsubq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpsubsb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpsubsb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpsubsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpsubsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpsubusb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpsubusb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpsubusw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpsubusw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpsubw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpsubw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     vptest	%xmm0, %xmm1
+# CHECK-NEXT:  -      -     1.00    -      -     1.00   0.50   0.50   vptest	(%rax), %xmm1
+# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     vptest	%ymm0, %ymm1
+# CHECK-NEXT:  -      -     1.00    -      -     1.00   0.50   0.50   vptest	(%rax), %ymm1
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpunpckhbw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpunpckhbw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpunpckhdq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpunpckhdq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpunpckhqdq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpunpckhqdq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpunpckhwd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpunpckhwd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpunpcklbw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpunpcklbw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpunpckldq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpunpckldq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpunpcklqdq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpunpcklqdq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpunpcklwd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpunpcklwd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     vpxor	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   vpxor	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vrcpps	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vrcpps	(%rax), %xmm2
+# CHECK-NEXT:  -      -     2.50    -      -     0.50    -      -     vrcpps	%ymm0, %ymm2
+# CHECK-NEXT:  -      -     2.50    -      -     0.50   0.50   0.50   vrcpps	(%rax), %ymm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vrcpss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vrcpss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vroundpd	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vroundpd	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vroundpd	$1, %ymm0, %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vroundpd	$1, (%rax), %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vroundps	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vroundps	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vroundps	$1, %ymm0, %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vroundps	$1, (%rax), %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vroundsd	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vroundsd	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vroundss	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vroundss	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vrsqrtps	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vrsqrtps	(%rax), %xmm2
+# CHECK-NEXT:  -      -     2.50    -      -     0.50    -      -     vrsqrtps	%ymm0, %ymm2
+# CHECK-NEXT:  -      -     2.50    -      -     0.50   0.50   0.50   vrsqrtps	(%rax), %ymm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vrsqrtss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vrsqrtss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vshufpd	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vshufpd	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vshufpd	$1, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vshufpd	$1, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vshufps	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vshufps	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vshufps	$1, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vshufps	$1, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -     21.00  1.00    -      -      -      -      -     vsqrtpd	%xmm0, %xmm2
+# CHECK-NEXT:  -     21.00  1.00    -      -      -     0.50   0.50   vsqrtpd	(%rax), %xmm2
+# CHECK-NEXT:  -     44.00  2.50    -      -     0.50    -      -     vsqrtpd	%ymm0, %ymm2
+# CHECK-NEXT:  -     44.00  2.50    -      -     0.50   0.50   0.50   vsqrtpd	(%rax), %ymm2
+# CHECK-NEXT:  -     14.00  1.00    -      -      -      -      -     vsqrtps	%xmm0, %xmm2
+# CHECK-NEXT:  -     14.00  1.00    -      -      -     0.50   0.50   vsqrtps	(%rax), %xmm2
+# CHECK-NEXT:  -     28.00  2.50    -      -     0.50    -      -     vsqrtps	%ymm0, %ymm2
+# CHECK-NEXT:  -     28.00  2.50    -      -     0.50   0.50   0.50   vsqrtps	(%rax), %ymm2
+# CHECK-NEXT:  -     21.00  1.00    -      -      -      -      -     vsqrtsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -     21.00  1.00    -      -      -     0.50   0.50   vsqrtsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -     14.00  1.00    -      -      -      -      -     vsqrtss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -     14.00  1.00    -      -      -     0.50   0.50   vsqrtss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   0.50   0.50   vstmxcsr	(%rax)
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vsubpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vsubpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vsubpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vsubpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vsubps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vsubps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vsubps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vsubps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vsubsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vsubsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vsubss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vsubss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vtestpd	%xmm0, %xmm1
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vtestpd	(%rax), %xmm1
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vtestpd	%ymm0, %ymm1
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vtestpd	(%rax), %ymm1
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vtestps	%xmm0, %xmm1
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vtestps	(%rax), %xmm1
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vtestps	%ymm0, %ymm1
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vtestps	(%rax), %ymm1
+# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     vucomisd	%xmm0, %xmm1
+# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   vucomisd	(%rax), %xmm1
+# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     vucomiss	%xmm0, %xmm1
+# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   vucomiss	(%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vunpckhpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vunpckhpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vunpckhpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vunpckhpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vunpckhps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vunpckhps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vunpckhps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vunpckhps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vunpcklpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vunpcklpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vunpcklpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vunpcklpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vunpcklps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vunpcklps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vunpcklps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vunpcklps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vxorpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vxorpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vxorpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vxorpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vxorps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vxorps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vxorps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vxorps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -     2.00    -      -     vzeroall
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     vzeroupper
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-bmi1.s b/test/tools/llvm-mca/X86/BdVer2/resources-bmi1.s
new file mode 100644
index 00000000000..193f5537c7c
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-bmi1.s
@@ -0,0 +1,113 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+
+andn        %eax, %ebx, %ecx
+andn        (%rax), %ebx, %ecx
+
+andn        %rax, %rbx, %rcx
+andn        (%rax), %rbx, %rcx
+
+bextr       %eax, %ebx, %ecx
+bextr       %eax, (%rbx), %ecx
+
+bextr       %rax, %rbx, %rcx
+bextr       %rax, (%rbx), %rcx
+
+blsi        %eax, %ecx
+blsi        (%rax), %ecx
+
+blsi        %rax, %rcx
+blsi        (%rax), %rcx
+
+blsmsk      %eax, %ecx
+blsmsk      (%rax), %ecx
+
+blsmsk      %rax, %rcx
+blsmsk      (%rax), %rcx
+
+blsr        %eax, %ecx
+blsr        (%rax), %ecx
+
+blsr        %rax, %rcx
+blsr        (%rax), %rcx
+
+tzcnt       %eax, %ecx
+tzcnt       (%rax), %ecx
+
+tzcnt       %rax, %rcx
+tzcnt       (%rax), %rcx
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     0.33                        andnl	%eax, %ebx, %ecx
+# CHECK-NEXT:  2      6     0.50    *                   andnl	(%rax), %ebx, %ecx
+# CHECK-NEXT:  1      1     0.33                        andnq	%rax, %rbx, %rcx
+# CHECK-NEXT:  2      6     0.50    *                   andnq	(%rax), %rbx, %rcx
+# CHECK-NEXT:  2      2     1.00                        bextrl	%eax, %ebx, %ecx
+# CHECK-NEXT:  3      7     1.00    *                   bextrl	%eax, (%rbx), %ecx
+# CHECK-NEXT:  2      2     1.00                        bextrq	%rax, %rbx, %rcx
+# CHECK-NEXT:  3      7     1.00    *                   bextrq	%rax, (%rbx), %rcx
+# CHECK-NEXT:  1      1     0.33                        blsil	%eax, %ecx
+# CHECK-NEXT:  2      6     0.50    *                   blsil	(%rax), %ecx
+# CHECK-NEXT:  1      1     0.33                        blsiq	%rax, %rcx
+# CHECK-NEXT:  2      6     0.50    *                   blsiq	(%rax), %rcx
+# CHECK-NEXT:  1      1     0.33                        blsmskl	%eax, %ecx
+# CHECK-NEXT:  2      6     0.50    *                   blsmskl	(%rax), %ecx
+# CHECK-NEXT:  1      1     0.33                        blsmskq	%rax, %rcx
+# CHECK-NEXT:  2      6     0.50    *                   blsmskq	(%rax), %rcx
+# CHECK-NEXT:  1      1     0.33                        blsrl	%eax, %ecx
+# CHECK-NEXT:  2      6     0.50    *                   blsrl	(%rax), %ecx
+# CHECK-NEXT:  1      1     0.33                        blsrq	%rax, %rcx
+# CHECK-NEXT:  2      6     0.50    *                   blsrq	(%rax), %rcx
+# CHECK-NEXT:  1      3     1.00                        tzcntl	%eax, %ecx
+# CHECK-NEXT:  2      8     1.00    *                   tzcntl	(%rax), %ecx
+# CHECK-NEXT:  1      3     1.00                        tzcntq	%rax, %rcx
+# CHECK-NEXT:  2      8     1.00    *                   tzcntq	(%rax), %rcx
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     7.33   13.33   -     7.33   6.00   6.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     andnl	%eax, %ebx, %ecx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   andnl	(%rax), %ebx, %ecx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     andnq	%rax, %rbx, %rcx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   andnq	(%rax), %rbx, %rcx
+# CHECK-NEXT:  -      -     0.50   1.00    -     0.50    -      -     bextrl	%eax, %ebx, %ecx
+# CHECK-NEXT:  -      -     0.50   1.00    -     0.50   0.50   0.50   bextrl	%eax, (%rbx), %ecx
+# CHECK-NEXT:  -      -     0.50   1.00    -     0.50    -      -     bextrq	%rax, %rbx, %rcx
+# CHECK-NEXT:  -      -     0.50   1.00    -     0.50   0.50   0.50   bextrq	%rax, (%rbx), %rcx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     blsil	%eax, %ecx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   blsil	(%rax), %ecx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     blsiq	%rax, %rcx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   blsiq	(%rax), %rcx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     blsmskl	%eax, %ecx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   blsmskl	(%rax), %ecx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     blsmskq	%rax, %rcx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   blsmskq	(%rax), %rcx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     blsrl	%eax, %ecx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   blsrl	(%rax), %ecx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     blsrq	%rax, %rcx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   blsrq	(%rax), %rcx
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     tzcntl	%eax, %ecx
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   tzcntl	(%rax), %ecx
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     tzcntq	%rax, %rcx
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   tzcntq	(%rax), %rcx
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-clflushopt.s b/test/tools/llvm-mca/X86/BdVer2/resources-clflushopt.s
new file mode 100644
index 00000000000..4678467babd
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-clflushopt.s
@@ -0,0 +1,33 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+
+clflushopt (%rax)
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  4      5     1.00    *      *      U     clflushopt	(%rax)
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     0.50   0.50   1.00   1.00   0.50   0.50
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -     0.50   0.50   1.00   1.00   0.50   0.50   clflushopt	(%rax)
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-cmov.s b/test/tools/llvm-mca/X86/BdVer2/resources-cmov.s
new file mode 100644
index 00000000000..e41571428d9
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-cmov.s
@@ -0,0 +1,323 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+
+cmovow    %si, %di
+cmovnow   %si, %di
+cmovbw    %si, %di
+cmovaew   %si, %di
+cmovew    %si, %di
+cmovnew   %si, %di
+cmovbew   %si, %di
+cmovaw    %si, %di
+cmovsw    %si, %di
+cmovnsw   %si, %di
+cmovpw    %si, %di
+cmovnpw   %si, %di
+cmovlw    %si, %di
+cmovgew   %si, %di
+cmovlew   %si, %di
+cmovgw    %si, %di
+
+cmovow    (%rax), %di
+cmovnow   (%rax), %di
+cmovbw    (%rax), %di
+cmovaew   (%rax), %di
+cmovew    (%rax), %di
+cmovnew   (%rax), %di
+cmovbew   (%rax), %di
+cmovaw    (%rax), %di
+cmovsw    (%rax), %di
+cmovnsw   (%rax), %di
+cmovpw    (%rax), %di
+cmovnpw   (%rax), %di
+cmovlw    (%rax), %di
+cmovgew   (%rax), %di
+cmovlew   (%rax), %di
+cmovgw    (%rax), %di
+
+cmovol    %esi, %edi
+cmovnol   %esi, %edi
+cmovbl    %esi, %edi
+cmovael   %esi, %edi
+cmovel    %esi, %edi
+cmovnel   %esi, %edi
+cmovbel   %esi, %edi
+cmoval    %esi, %edi
+cmovsl    %esi, %edi
+cmovnsl   %esi, %edi
+cmovpl    %esi, %edi
+cmovnpl   %esi, %edi
+cmovll    %esi, %edi
+cmovgel   %esi, %edi
+cmovlel   %esi, %edi
+cmovgl    %esi, %edi
+
+cmovol    (%rax), %edi
+cmovnol   (%rax), %edi
+cmovbl    (%rax), %edi
+cmovael   (%rax), %edi
+cmovel    (%rax), %edi
+cmovnel   (%rax), %edi
+cmovbel   (%rax), %edi
+cmoval    (%rax), %edi
+cmovsl    (%rax), %edi
+cmovnsl   (%rax), %edi
+cmovpl    (%rax), %edi
+cmovnpl   (%rax), %edi
+cmovll    (%rax), %edi
+cmovgel   (%rax), %edi
+cmovlel   (%rax), %edi
+cmovgl    (%rax), %edi
+
+cmovoq    %rsi, %rdi
+cmovnoq   %rsi, %rdi
+cmovbq    %rsi, %rdi
+cmovaeq   %rsi, %rdi
+cmoveq    %rsi, %rdi
+cmovneq   %rsi, %rdi
+cmovbeq   %rsi, %rdi
+cmovaq    %rsi, %rdi
+cmovsq    %rsi, %rdi
+cmovnsq   %rsi, %rdi
+cmovpq    %rsi, %rdi
+cmovnpq   %rsi, %rdi
+cmovlq    %rsi, %rdi
+cmovgeq   %rsi, %rdi
+cmovleq   %rsi, %rdi
+cmovgq    %rsi, %rdi
+
+cmovoq    (%rax), %rdi
+cmovnoq   (%rax), %rdi
+cmovbq    (%rax), %rdi
+cmovaeq   (%rax), %rdi
+cmoveq    (%rax), %rdi
+cmovneq   (%rax), %rdi
+cmovbeq   (%rax), %rdi
+cmovaq    (%rax), %rdi
+cmovsq    (%rax), %rdi
+cmovnsq   (%rax), %rdi
+cmovpq    (%rax), %rdi
+cmovnpq   (%rax), %rdi
+cmovlq    (%rax), %rdi
+cmovgeq   (%rax), %rdi
+cmovleq   (%rax), %rdi
+cmovgq    (%rax), %rdi
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  2      2     0.67                        cmovow	%si, %di
+# CHECK-NEXT:  2      2     0.67                        cmovnow	%si, %di
+# CHECK-NEXT:  2      2     0.67                        cmovbw	%si, %di
+# CHECK-NEXT:  2      2     0.67                        cmovaew	%si, %di
+# CHECK-NEXT:  2      2     0.67                        cmovew	%si, %di
+# CHECK-NEXT:  2      2     0.67                        cmovnew	%si, %di
+# CHECK-NEXT:  3      3     1.00                        cmovbew	%si, %di
+# CHECK-NEXT:  3      3     1.00                        cmovaw	%si, %di
+# CHECK-NEXT:  2      2     0.67                        cmovsw	%si, %di
+# CHECK-NEXT:  2      2     0.67                        cmovnsw	%si, %di
+# CHECK-NEXT:  2      2     0.67                        cmovpw	%si, %di
+# CHECK-NEXT:  2      2     0.67                        cmovnpw	%si, %di
+# CHECK-NEXT:  2      2     0.67                        cmovlw	%si, %di
+# CHECK-NEXT:  2      2     0.67                        cmovgew	%si, %di
+# CHECK-NEXT:  2      2     0.67                        cmovlew	%si, %di
+# CHECK-NEXT:  2      2     0.67                        cmovgw	%si, %di
+# CHECK-NEXT:  3      7     0.67    *                   cmovow	(%rax), %di
+# CHECK-NEXT:  3      7     0.67    *                   cmovnow	(%rax), %di
+# CHECK-NEXT:  3      7     0.67    *                   cmovbw	(%rax), %di
+# CHECK-NEXT:  3      7     0.67    *                   cmovaew	(%rax), %di
+# CHECK-NEXT:  3      7     0.67    *                   cmovew	(%rax), %di
+# CHECK-NEXT:  3      7     0.67    *                   cmovnew	(%rax), %di
+# CHECK-NEXT:  4      8     1.00    *                   cmovbew	(%rax), %di
+# CHECK-NEXT:  4      8     1.00    *                   cmovaw	(%rax), %di
+# CHECK-NEXT:  3      7     0.67    *                   cmovsw	(%rax), %di
+# CHECK-NEXT:  3      7     0.67    *                   cmovnsw	(%rax), %di
+# CHECK-NEXT:  3      7     0.67    *                   cmovpw	(%rax), %di
+# CHECK-NEXT:  3      7     0.67    *                   cmovnpw	(%rax), %di
+# CHECK-NEXT:  3      7     0.67    *                   cmovlw	(%rax), %di
+# CHECK-NEXT:  3      7     0.67    *                   cmovgew	(%rax), %di
+# CHECK-NEXT:  3      7     0.67    *                   cmovlew	(%rax), %di
+# CHECK-NEXT:  3      7     0.67    *                   cmovgw	(%rax), %di
+# CHECK-NEXT:  2      2     0.67                        cmovol	%esi, %edi
+# CHECK-NEXT:  2      2     0.67                        cmovnol	%esi, %edi
+# CHECK-NEXT:  2      2     0.67                        cmovbl	%esi, %edi
+# CHECK-NEXT:  2      2     0.67                        cmovael	%esi, %edi
+# CHECK-NEXT:  2      2     0.67                        cmovel	%esi, %edi
+# CHECK-NEXT:  2      2     0.67                        cmovnel	%esi, %edi
+# CHECK-NEXT:  3      3     1.00                        cmovbel	%esi, %edi
+# CHECK-NEXT:  3      3     1.00                        cmoval	%esi, %edi
+# CHECK-NEXT:  2      2     0.67                        cmovsl	%esi, %edi
+# CHECK-NEXT:  2      2     0.67                        cmovnsl	%esi, %edi
+# CHECK-NEXT:  2      2     0.67                        cmovpl	%esi, %edi
+# CHECK-NEXT:  2      2     0.67                        cmovnpl	%esi, %edi
+# CHECK-NEXT:  2      2     0.67                        cmovll	%esi, %edi
+# CHECK-NEXT:  2      2     0.67                        cmovgel	%esi, %edi
+# CHECK-NEXT:  2      2     0.67                        cmovlel	%esi, %edi
+# CHECK-NEXT:  2      2     0.67                        cmovgl	%esi, %edi
+# CHECK-NEXT:  3      7     0.67    *                   cmovol	(%rax), %edi
+# CHECK-NEXT:  3      7     0.67    *                   cmovnol	(%rax), %edi
+# CHECK-NEXT:  3      7     0.67    *                   cmovbl	(%rax), %edi
+# CHECK-NEXT:  3      7     0.67    *                   cmovael	(%rax), %edi
+# CHECK-NEXT:  3      7     0.67    *                   cmovel	(%rax), %edi
+# CHECK-NEXT:  3      7     0.67    *                   cmovnel	(%rax), %edi
+# CHECK-NEXT:  4      8     1.00    *                   cmovbel	(%rax), %edi
+# CHECK-NEXT:  4      8     1.00    *                   cmoval	(%rax), %edi
+# CHECK-NEXT:  3      7     0.67    *                   cmovsl	(%rax), %edi
+# CHECK-NEXT:  3      7     0.67    *                   cmovnsl	(%rax), %edi
+# CHECK-NEXT:  3      7     0.67    *                   cmovpl	(%rax), %edi
+# CHECK-NEXT:  3      7     0.67    *                   cmovnpl	(%rax), %edi
+# CHECK-NEXT:  3      7     0.67    *                   cmovll	(%rax), %edi
+# CHECK-NEXT:  3      7     0.67    *                   cmovgel	(%rax), %edi
+# CHECK-NEXT:  3      7     0.67    *                   cmovlel	(%rax), %edi
+# CHECK-NEXT:  3      7     0.67    *                   cmovgl	(%rax), %edi
+# CHECK-NEXT:  2      2     0.67                        cmovoq	%rsi, %rdi
+# CHECK-NEXT:  2      2     0.67                        cmovnoq	%rsi, %rdi
+# CHECK-NEXT:  2      2     0.67                        cmovbq	%rsi, %rdi
+# CHECK-NEXT:  2      2     0.67                        cmovaeq	%rsi, %rdi
+# CHECK-NEXT:  2      2     0.67                        cmoveq	%rsi, %rdi
+# CHECK-NEXT:  2      2     0.67                        cmovneq	%rsi, %rdi
+# CHECK-NEXT:  3      3     1.00                        cmovbeq	%rsi, %rdi
+# CHECK-NEXT:  3      3     1.00                        cmovaq	%rsi, %rdi
+# CHECK-NEXT:  2      2     0.67                        cmovsq	%rsi, %rdi
+# CHECK-NEXT:  2      2     0.67                        cmovnsq	%rsi, %rdi
+# CHECK-NEXT:  2      2     0.67                        cmovpq	%rsi, %rdi
+# CHECK-NEXT:  2      2     0.67                        cmovnpq	%rsi, %rdi
+# CHECK-NEXT:  2      2     0.67                        cmovlq	%rsi, %rdi
+# CHECK-NEXT:  2      2     0.67                        cmovgeq	%rsi, %rdi
+# CHECK-NEXT:  2      2     0.67                        cmovleq	%rsi, %rdi
+# CHECK-NEXT:  2      2     0.67                        cmovgq	%rsi, %rdi
+# CHECK-NEXT:  3      7     0.67    *                   cmovoq	(%rax), %rdi
+# CHECK-NEXT:  3      7     0.67    *                   cmovnoq	(%rax), %rdi
+# CHECK-NEXT:  3      7     0.67    *                   cmovbq	(%rax), %rdi
+# CHECK-NEXT:  3      7     0.67    *                   cmovaeq	(%rax), %rdi
+# CHECK-NEXT:  3      7     0.67    *                   cmoveq	(%rax), %rdi
+# CHECK-NEXT:  3      7     0.67    *                   cmovneq	(%rax), %rdi
+# CHECK-NEXT:  4      8     1.00    *                   cmovbeq	(%rax), %rdi
+# CHECK-NEXT:  4      8     1.00    *                   cmovaq	(%rax), %rdi
+# CHECK-NEXT:  3      7     0.67    *                   cmovsq	(%rax), %rdi
+# CHECK-NEXT:  3      7     0.67    *                   cmovnsq	(%rax), %rdi
+# CHECK-NEXT:  3      7     0.67    *                   cmovpq	(%rax), %rdi
+# CHECK-NEXT:  3      7     0.67    *                   cmovnpq	(%rax), %rdi
+# CHECK-NEXT:  3      7     0.67    *                   cmovlq	(%rax), %rdi
+# CHECK-NEXT:  3      7     0.67    *                   cmovgeq	(%rax), %rdi
+# CHECK-NEXT:  3      7     0.67    *                   cmovleq	(%rax), %rdi
+# CHECK-NEXT:  3      7     0.67    *                   cmovgq	(%rax), %rdi
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     86.00  32.00   -     86.00  24.00  24.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovow	%si, %di
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovnow	%si, %di
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovbw	%si, %di
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovaew	%si, %di
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovew	%si, %di
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovnew	%si, %di
+# CHECK-NEXT:  -      -     1.33   0.33    -     1.33    -      -     cmovbew	%si, %di
+# CHECK-NEXT:  -      -     1.33   0.33    -     1.33    -      -     cmovaw	%si, %di
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovsw	%si, %di
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovnsw	%si, %di
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovpw	%si, %di
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovnpw	%si, %di
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovlw	%si, %di
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovgew	%si, %di
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovlew	%si, %di
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovgw	%si, %di
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovow	(%rax), %di
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovnow	(%rax), %di
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovbw	(%rax), %di
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovaew	(%rax), %di
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovew	(%rax), %di
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovnew	(%rax), %di
+# CHECK-NEXT:  -      -     1.33   0.33    -     1.33   0.50   0.50   cmovbew	(%rax), %di
+# CHECK-NEXT:  -      -     1.33   0.33    -     1.33   0.50   0.50   cmovaw	(%rax), %di
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovsw	(%rax), %di
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovnsw	(%rax), %di
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovpw	(%rax), %di
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovnpw	(%rax), %di
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovlw	(%rax), %di
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovgew	(%rax), %di
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovlew	(%rax), %di
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovgw	(%rax), %di
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovol	%esi, %edi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovnol	%esi, %edi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovbl	%esi, %edi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovael	%esi, %edi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovel	%esi, %edi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovnel	%esi, %edi
+# CHECK-NEXT:  -      -     1.33   0.33    -     1.33    -      -     cmovbel	%esi, %edi
+# CHECK-NEXT:  -      -     1.33   0.33    -     1.33    -      -     cmoval	%esi, %edi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovsl	%esi, %edi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovnsl	%esi, %edi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovpl	%esi, %edi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovnpl	%esi, %edi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovll	%esi, %edi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovgel	%esi, %edi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovlel	%esi, %edi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovgl	%esi, %edi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovol	(%rax), %edi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovnol	(%rax), %edi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovbl	(%rax), %edi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovael	(%rax), %edi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovel	(%rax), %edi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovnel	(%rax), %edi
+# CHECK-NEXT:  -      -     1.33   0.33    -     1.33   0.50   0.50   cmovbel	(%rax), %edi
+# CHECK-NEXT:  -      -     1.33   0.33    -     1.33   0.50   0.50   cmoval	(%rax), %edi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovsl	(%rax), %edi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovnsl	(%rax), %edi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovpl	(%rax), %edi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovnpl	(%rax), %edi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovll	(%rax), %edi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovgel	(%rax), %edi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovlel	(%rax), %edi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovgl	(%rax), %edi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovoq	%rsi, %rdi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovnoq	%rsi, %rdi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovbq	%rsi, %rdi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovaeq	%rsi, %rdi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmoveq	%rsi, %rdi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovneq	%rsi, %rdi
+# CHECK-NEXT:  -      -     1.33   0.33    -     1.33    -      -     cmovbeq	%rsi, %rdi
+# CHECK-NEXT:  -      -     1.33   0.33    -     1.33    -      -     cmovaq	%rsi, %rdi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovsq	%rsi, %rdi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovnsq	%rsi, %rdi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovpq	%rsi, %rdi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovnpq	%rsi, %rdi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovlq	%rsi, %rdi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovgeq	%rsi, %rdi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovleq	%rsi, %rdi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovgq	%rsi, %rdi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovoq	(%rax), %rdi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovnoq	(%rax), %rdi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovbq	(%rax), %rdi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovaeq	(%rax), %rdi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmoveq	(%rax), %rdi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovneq	(%rax), %rdi
+# CHECK-NEXT:  -      -     1.33   0.33    -     1.33   0.50   0.50   cmovbeq	(%rax), %rdi
+# CHECK-NEXT:  -      -     1.33   0.33    -     1.33   0.50   0.50   cmovaq	(%rax), %rdi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovsq	(%rax), %rdi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovnsq	(%rax), %rdi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovpq	(%rax), %rdi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovnpq	(%rax), %rdi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovlq	(%rax), %rdi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovgeq	(%rax), %rdi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovleq	(%rax), %rdi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovgq	(%rax), %rdi
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-cmpxchg.s b/test/tools/llvm-mca/X86/BdVer2/resources-cmpxchg.s
new file mode 100644
index 00000000000..19a220702b1
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-cmpxchg.s
@@ -0,0 +1,36 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+
+cmpxchg8b  (%rax)
+cmpxchg16b (%rax)
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  3      6     1.00    *      *            cmpxchg8b	(%rax)
+# CHECK-NEXT:  3      6     1.00    *      *            cmpxchg16b	(%rax)
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     0.67   0.67   2.00   0.67   2.00   2.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   cmpxchg8b	(%rax)
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   cmpxchg16b	(%rax)
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-f16c.s b/test/tools/llvm-mca/X86/BdVer2/resources-f16c.s
new file mode 100644
index 00000000000..7dea75f8f8f
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-f16c.s
@@ -0,0 +1,57 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+
+vcvtph2ps   %xmm0, %xmm2
+vcvtph2ps   (%rax), %xmm2
+
+vcvtph2ps   %xmm0, %ymm2
+vcvtph2ps   (%rax), %ymm2
+
+vcvtps2ph   $0, %xmm0, %xmm2
+vcvtps2ph   $0, %xmm0, (%rax)
+
+vcvtps2ph   $0, %ymm0, %xmm2
+vcvtps2ph   $0, %ymm0, (%rax)
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      3     1.00                        vcvtph2ps	%xmm0, %xmm2
+# CHECK-NEXT:  2      8     1.00    *                   vcvtph2ps	(%rax), %xmm2
+# CHECK-NEXT:  1      3     1.00                        vcvtph2ps	%xmm0, %ymm2
+# CHECK-NEXT:  2      8     1.00    *                   vcvtph2ps	(%rax), %ymm2
+# CHECK-NEXT:  1      3     1.00                        vcvtps2ph	$0, %xmm0, %xmm2
+# CHECK-NEXT:  1      4     1.00           *            vcvtps2ph	$0, %xmm0, (%rax)
+# CHECK-NEXT:  1      3     1.00                        vcvtps2ph	$0, %ymm0, %xmm2
+# CHECK-NEXT:  1      4     1.00           *            vcvtps2ph	$0, %ymm0, (%rax)
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -      -     8.00   2.00    -     2.00   2.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vcvtph2ps	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vcvtph2ps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vcvtph2ps	%xmm0, %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vcvtph2ps	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vcvtps2ph	$0, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00   1.00    -     0.50   0.50   vcvtps2ph	$0, %xmm0, (%rax)
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vcvtps2ph	$0, %ymm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00   1.00    -     0.50   0.50   vcvtps2ph	$0, %ymm0, (%rax)
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-fma.s b/test/tools/llvm-mca/X86/BdVer2/resources-fma.s
new file mode 100644
index 00000000000..05b63edb7f9
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-fma.s
@@ -0,0 +1,701 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+
+vfmadd132pd %xmm0, %xmm1, %xmm2
+vfmadd132pd (%rax), %xmm1, %xmm2
+
+vfmadd132pd %ymm0, %ymm1, %ymm2
+vfmadd132pd (%rax), %ymm1, %ymm2
+
+vfmadd213pd %xmm0, %xmm1, %xmm2
+vfmadd213pd (%rax), %xmm1, %xmm2
+
+vfmadd213pd %ymm0, %ymm1, %ymm2
+vfmadd213pd (%rax), %ymm1, %ymm2
+
+vfmadd231pd %xmm0, %xmm1, %xmm2
+vfmadd231pd (%rax), %xmm1, %xmm2
+
+vfmadd231pd %ymm0, %ymm1, %ymm2
+vfmadd231pd (%rax), %ymm1, %ymm2
+
+vfmadd132ps %xmm0, %xmm1, %xmm2
+vfmadd132ps (%rax), %xmm1, %xmm2
+
+vfmadd132ps %ymm0, %ymm1, %ymm2
+vfmadd132ps (%rax), %ymm1, %ymm2
+
+vfmadd213ps %xmm0, %xmm1, %xmm2
+vfmadd213ps (%rax), %xmm1, %xmm2
+
+vfmadd213ps %ymm0, %ymm1, %ymm2
+vfmadd213ps (%rax), %ymm1, %ymm2
+
+vfmadd231ps %xmm0, %xmm1, %xmm2
+vfmadd231ps (%rax), %xmm1, %xmm2
+
+vfmadd231ps %ymm0, %ymm1, %ymm2
+vfmadd231ps (%rax), %ymm1, %ymm2
+
+vfmadd132sd %xmm0, %xmm1, %xmm2
+vfmadd132sd (%rax), %xmm1, %xmm2
+
+vfmadd213sd %xmm0, %xmm1, %xmm2
+vfmadd213sd (%rax), %xmm1, %xmm2
+
+vfmadd231sd %xmm0, %xmm1, %xmm2
+vfmadd231sd (%rax), %xmm1, %xmm2
+
+vfmadd132ss %xmm0, %xmm1, %xmm2
+vfmadd132ss (%rax), %xmm1, %xmm2
+
+vfmadd213ss %xmm0, %xmm1, %xmm2
+vfmadd213ss (%rax), %xmm1, %xmm2
+
+vfmadd231ss %xmm0, %xmm1, %xmm2
+vfmadd231ss (%rax), %xmm1, %xmm2
+
+vfmaddsub132pd %xmm0, %xmm1, %xmm2
+vfmaddsub132pd (%rax), %xmm1, %xmm2
+
+vfmaddsub132pd %ymm0, %ymm1, %ymm2
+vfmaddsub132pd (%rax), %ymm1, %ymm2
+
+vfmaddsub213pd %xmm0, %xmm1, %xmm2
+vfmaddsub213pd (%rax), %xmm1, %xmm2
+
+vfmaddsub213pd %ymm0, %ymm1, %ymm2
+vfmaddsub213pd (%rax), %ymm1, %ymm2
+
+vfmaddsub231pd %xmm0, %xmm1, %xmm2
+vfmaddsub231pd (%rax), %xmm1, %xmm2
+
+vfmaddsub231pd %ymm0, %ymm1, %ymm2
+vfmaddsub231pd (%rax), %ymm1, %ymm2
+
+vfmaddsub132ps %xmm0, %xmm1, %xmm2
+vfmaddsub132ps (%rax), %xmm1, %xmm2
+
+vfmaddsub132ps %ymm0, %ymm1, %ymm2
+vfmaddsub132ps (%rax), %ymm1, %ymm2
+
+vfmaddsub213ps %xmm0, %xmm1, %xmm2
+vfmaddsub213ps (%rax), %xmm1, %xmm2
+
+vfmaddsub213ps %ymm0, %ymm1, %ymm2
+vfmaddsub213ps (%rax), %ymm1, %ymm2
+
+vfmaddsub231ps %xmm0, %xmm1, %xmm2
+vfmaddsub231ps (%rax), %xmm1, %xmm2
+
+vfmaddsub231ps %ymm0, %ymm1, %ymm2
+vfmaddsub231ps (%rax), %ymm1, %ymm2
+
+vfmsub132pd %xmm0, %xmm1, %xmm2
+vfmsub132pd (%rax), %xmm1, %xmm2
+
+vfmsub132pd %ymm0, %ymm1, %ymm2
+vfmsub132pd (%rax), %ymm1, %ymm2
+
+vfmsub213pd %xmm0, %xmm1, %xmm2
+vfmsub213pd (%rax), %xmm1, %xmm2
+
+vfmsub213pd %ymm0, %ymm1, %ymm2
+vfmsub213pd (%rax), %ymm1, %ymm2
+
+vfmsub231pd %xmm0, %xmm1, %xmm2
+vfmsub231pd (%rax), %xmm1, %xmm2
+
+vfmsub231pd %ymm0, %ymm1, %ymm2
+vfmsub231pd (%rax), %ymm1, %ymm2
+
+vfmsub132ps %xmm0, %xmm1, %xmm2
+vfmsub132ps (%rax), %xmm1, %xmm2
+
+vfmsub132ps %ymm0, %ymm1, %ymm2
+vfmsub132ps (%rax), %ymm1, %ymm2
+
+vfmsub213ps %xmm0, %xmm1, %xmm2
+vfmsub213ps (%rax), %xmm1, %xmm2
+
+vfmsub213ps %ymm0, %ymm1, %ymm2
+vfmsub213ps (%rax), %ymm1, %ymm2
+
+vfmsub231ps %xmm0, %xmm1, %xmm2
+vfmsub231ps (%rax), %xmm1, %xmm2
+
+vfmsub231ps %ymm0, %ymm1, %ymm2
+vfmsub231ps (%rax), %ymm1, %ymm2
+
+vfmsub132sd %xmm0, %xmm1, %xmm2
+vfmsub132sd (%rax), %xmm1, %xmm2
+
+vfmsub213sd %xmm0, %xmm1, %xmm2
+vfmsub213sd (%rax), %xmm1, %xmm2
+
+vfmsub231sd %xmm0, %xmm1, %xmm2
+vfmsub231sd (%rax), %xmm1, %xmm2
+
+vfmsub132ss %xmm0, %xmm1, %xmm2
+vfmsub132ss (%rax), %xmm1, %xmm2
+
+vfmsub213ss %xmm0, %xmm1, %xmm2
+vfmsub213ss (%rax), %xmm1, %xmm2
+
+vfmsub231ss %xmm0, %xmm1, %xmm2
+vfmsub231ss (%rax), %xmm1, %xmm2
+
+vfmsubadd132pd %xmm0, %xmm1, %xmm2
+vfmsubadd132pd (%rax), %xmm1, %xmm2
+
+vfmsubadd132pd %ymm0, %ymm1, %ymm2
+vfmsubadd132pd (%rax), %ymm1, %ymm2
+
+vfmsubadd213pd %xmm0, %xmm1, %xmm2
+vfmsubadd213pd (%rax), %xmm1, %xmm2
+
+vfmsubadd213pd %ymm0, %ymm1, %ymm2
+vfmsubadd213pd (%rax), %ymm1, %ymm2
+
+vfmsubadd231pd %xmm0, %xmm1, %xmm2
+vfmsubadd231pd (%rax), %xmm1, %xmm2
+
+vfmsubadd231pd %ymm0, %ymm1, %ymm2
+vfmsubadd231pd (%rax), %ymm1, %ymm2
+
+vfmsubadd132ps %xmm0, %xmm1, %xmm2
+vfmsubadd132ps (%rax), %xmm1, %xmm2
+
+vfmsubadd132ps %ymm0, %ymm1, %ymm2
+vfmsubadd132ps (%rax), %ymm1, %ymm2
+
+vfmsubadd213ps %xmm0, %xmm1, %xmm2
+vfmsubadd213ps (%rax), %xmm1, %xmm2
+
+vfmsubadd213ps %ymm0, %ymm1, %ymm2
+vfmsubadd213ps (%rax), %ymm1, %ymm2
+
+vfmsubadd231ps %xmm0, %xmm1, %xmm2
+vfmsubadd231ps (%rax), %xmm1, %xmm2
+
+vfmsubadd231ps %ymm0, %ymm1, %ymm2
+vfmsubadd231ps (%rax), %ymm1, %ymm2
+
+vfnmadd132pd %xmm0, %xmm1, %xmm2
+vfnmadd132pd (%rax), %xmm1, %xmm2
+
+vfnmadd132pd %ymm0, %ymm1, %ymm2
+vfnmadd132pd (%rax), %ymm1, %ymm2
+
+vfnmadd213pd %xmm0, %xmm1, %xmm2
+vfnmadd213pd (%rax), %xmm1, %xmm2
+
+vfnmadd213pd %ymm0, %ymm1, %ymm2
+vfnmadd213pd (%rax), %ymm1, %ymm2
+
+vfnmadd231pd %xmm0, %xmm1, %xmm2
+vfnmadd231pd (%rax), %xmm1, %xmm2
+
+vfnmadd231pd %ymm0, %ymm1, %ymm2
+vfnmadd231pd (%rax), %ymm1, %ymm2
+
+vfnmadd132ps %xmm0, %xmm1, %xmm2
+vfnmadd132ps (%rax), %xmm1, %xmm2
+
+vfnmadd132ps %ymm0, %ymm1, %ymm2
+vfnmadd132ps (%rax), %ymm1, %ymm2
+
+vfnmadd213ps %xmm0, %xmm1, %xmm2
+vfnmadd213ps (%rax), %xmm1, %xmm2
+
+vfnmadd213ps %ymm0, %ymm1, %ymm2
+vfnmadd213ps (%rax), %ymm1, %ymm2
+
+vfnmadd231ps %xmm0, %xmm1, %xmm2
+vfnmadd231ps (%rax), %xmm1, %xmm2
+
+vfnmadd231ps %ymm0, %ymm1, %ymm2
+vfnmadd231ps (%rax), %ymm1, %ymm2
+
+vfnmadd132sd %xmm0, %xmm1, %xmm2
+vfnmadd132sd (%rax), %xmm1, %xmm2
+
+vfnmadd213sd %xmm0, %xmm1, %xmm2
+vfnmadd213sd (%rax), %xmm1, %xmm2
+
+vfnmadd231sd %xmm0, %xmm1, %xmm2
+vfnmadd231sd (%rax), %xmm1, %xmm2
+
+vfnmadd132ss %xmm0, %xmm1, %xmm2
+vfnmadd132ss (%rax), %xmm1, %xmm2
+
+vfnmadd213ss %xmm0, %xmm1, %xmm2
+vfnmadd213ss (%rax), %xmm1, %xmm2
+
+vfnmadd231ss %xmm0, %xmm1, %xmm2
+vfnmadd231ss (%rax), %xmm1, %xmm2
+
+vfnmsub132pd %xmm0, %xmm1, %xmm2
+vfnmsub132pd (%rax), %xmm1, %xmm2
+
+vfnmsub132pd %ymm0, %ymm1, %ymm2
+vfnmsub132pd (%rax), %ymm1, %ymm2
+
+vfnmsub213pd %xmm0, %xmm1, %xmm2
+vfnmsub213pd (%rax), %xmm1, %xmm2
+
+vfnmsub213pd %ymm0, %ymm1, %ymm2
+vfnmsub213pd (%rax), %ymm1, %ymm2
+
+vfnmsub231pd %xmm0, %xmm1, %xmm2
+vfnmsub231pd (%rax), %xmm1, %xmm2
+
+vfnmsub231pd %ymm0, %ymm1, %ymm2
+vfnmsub231pd (%rax), %ymm1, %ymm2
+
+vfnmsub132ps %xmm0, %xmm1, %xmm2
+vfnmsub132ps (%rax), %xmm1, %xmm2
+
+vfnmsub132ps %ymm0, %ymm1, %ymm2
+vfnmsub132ps (%rax), %ymm1, %ymm2
+
+vfnmsub213ps %xmm0, %xmm1, %xmm2
+vfnmsub213ps (%rax), %xmm1, %xmm2
+
+vfnmsub213ps %ymm0, %ymm1, %ymm2
+vfnmsub213ps (%rax), %ymm1, %ymm2
+
+vfnmsub231ps %xmm0, %xmm1, %xmm2
+vfnmsub231ps (%rax), %xmm1, %xmm2
+
+vfnmsub231ps %ymm0, %ymm1, %ymm2
+vfnmsub231ps (%rax), %ymm1, %ymm2
+
+vfnmsub132sd %xmm0, %xmm1, %xmm2
+vfnmsub132sd (%rax), %xmm1, %xmm2
+
+vfnmsub213sd %xmm0, %xmm1, %xmm2
+vfnmsub213sd (%rax), %xmm1, %xmm2
+
+vfnmsub231sd %xmm0, %xmm1, %xmm2
+vfnmsub231sd (%rax), %xmm1, %xmm2
+
+vfnmsub132ss %xmm0, %xmm1, %xmm2
+vfnmsub132ss (%rax), %xmm1, %xmm2
+
+vfnmsub213ss %xmm0, %xmm1, %xmm2
+vfnmsub213ss (%rax), %xmm1, %xmm2
+
+vfnmsub231ss %xmm0, %xmm1, %xmm2
+vfnmsub231ss (%rax), %xmm1, %xmm2
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      5     0.50                        vfmadd132pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmadd132pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmadd132pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmadd132pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmadd213pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmadd213pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmadd213pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmadd213pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmadd231pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmadd231pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmadd231pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmadd231pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmadd132ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmadd132ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmadd132ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmadd132ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmadd213ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmadd213ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmadd213ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmadd213ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmadd231ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmadd231ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmadd231ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmadd231ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmadd132sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmadd132sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmadd213sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmadd213sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmadd231sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmadd231sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmadd132ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmadd132ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmadd213ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmadd213ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmadd231ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmadd231ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmaddsub132pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmaddsub132pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmaddsub132pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmaddsub132pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmaddsub213pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmaddsub213pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmaddsub213pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmaddsub213pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmaddsub231pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmaddsub231pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmaddsub231pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmaddsub231pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmaddsub132ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmaddsub132ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmaddsub132ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmaddsub132ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmaddsub213ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmaddsub213ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmaddsub213ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmaddsub213ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmaddsub231ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmaddsub231ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmaddsub231ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmaddsub231ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmsub132pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmsub132pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmsub132pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmsub132pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmsub213pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmsub213pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmsub213pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmsub213pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmsub231pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmsub231pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmsub231pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmsub231pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmsub132ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmsub132ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmsub132ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmsub132ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmsub213ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmsub213ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmsub213ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmsub213ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmsub231ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmsub231ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmsub231ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmsub231ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmsub132sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmsub132sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmsub213sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmsub213sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmsub231sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmsub231sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmsub132ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmsub132ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmsub213ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmsub213ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmsub231ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmsub231ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmsubadd132pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmsubadd132pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmsubadd132pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmsubadd132pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmsubadd213pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmsubadd213pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmsubadd213pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmsubadd213pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmsubadd231pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmsubadd231pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmsubadd231pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmsubadd231pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmsubadd132ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmsubadd132ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmsubadd132ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmsubadd132ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmsubadd213ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmsubadd213ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmsubadd213ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmsubadd213ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmsubadd231ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmsubadd231ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmsubadd231ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    0.50    *                   vfmsubadd231ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfnmadd132pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfnmadd132pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfnmadd132pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    0.50    *                   vfnmadd132pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfnmadd213pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfnmadd213pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfnmadd213pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    0.50    *                   vfnmadd213pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfnmadd231pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfnmadd231pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfnmadd231pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    0.50    *                   vfnmadd231pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfnmadd132ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfnmadd132ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfnmadd132ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    0.50    *                   vfnmadd132ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfnmadd213ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfnmadd213ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfnmadd213ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    0.50    *                   vfnmadd213ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfnmadd231ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfnmadd231ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfnmadd231ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    0.50    *                   vfnmadd231ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfnmadd132sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfnmadd132sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfnmadd213sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfnmadd213sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfnmadd231sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfnmadd231sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfnmadd132ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfnmadd132ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfnmadd213ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfnmadd213ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfnmadd231ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfnmadd231ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfnmsub132pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfnmsub132pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfnmsub132pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    0.50    *                   vfnmsub132pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfnmsub213pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfnmsub213pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfnmsub213pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    0.50    *                   vfnmsub213pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfnmsub231pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfnmsub231pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfnmsub231pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    0.50    *                   vfnmsub231pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfnmsub132ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfnmsub132ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfnmsub132ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    0.50    *                   vfnmsub132ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfnmsub213ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfnmsub213ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfnmsub213ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    0.50    *                   vfnmsub213ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfnmsub231ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfnmsub231ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfnmsub231ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    0.50    *                   vfnmsub231ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfnmsub132sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfnmsub132sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfnmsub213sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfnmsub213sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfnmsub231sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfnmsub231sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfnmsub132ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfnmsub132ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfnmsub213ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfnmsub213ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfnmsub231ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      10    0.50    *                   vfnmsub231ss	(%rax), %xmm1, %xmm2
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     96.00  96.00   -      -     48.00  48.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmadd132pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmadd132pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmadd132pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmadd132pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmadd213pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmadd213pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmadd213pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmadd213pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmadd231pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmadd231pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmadd231pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmadd231pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmadd132ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmadd132ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmadd132ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmadd132ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmadd213ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmadd213ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmadd213ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmadd213ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmadd231ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmadd231ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmadd231ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmadd231ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmadd132sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmadd132sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmadd213sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmadd213sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmadd231sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmadd231sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmadd132ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmadd132ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmadd213ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmadd213ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmadd231ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmadd231ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddsub132pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsub132pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddsub132pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsub132pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddsub213pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsub213pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddsub213pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsub213pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddsub231pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsub231pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddsub231pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsub231pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddsub132ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsub132ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddsub132ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsub132ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddsub213ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsub213ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddsub213ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsub213ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddsub231ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsub231ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddsub231ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsub231ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsub132pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsub132pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsub132pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsub132pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsub213pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsub213pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsub213pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsub213pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsub231pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsub231pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsub231pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsub231pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsub132ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsub132ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsub132ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsub132ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsub213ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsub213ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsub213ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsub213ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsub231ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsub231ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsub231ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsub231ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsub132sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsub132sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsub213sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsub213sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsub231sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsub231sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsub132ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsub132ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsub213ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsub213ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsub231ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsub231ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubadd132pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubadd132pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubadd132pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubadd132pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubadd213pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubadd213pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubadd213pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubadd213pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubadd231pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubadd231pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubadd231pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubadd231pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubadd132ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubadd132ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubadd132ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubadd132ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubadd213ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubadd213ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubadd213ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubadd213ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubadd231ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubadd231ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubadd231ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubadd231ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmadd132pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmadd132pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmadd132pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmadd132pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmadd213pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmadd213pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmadd213pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmadd213pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmadd231pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmadd231pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmadd231pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmadd231pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmadd132ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmadd132ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmadd132ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmadd132ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmadd213ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmadd213ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmadd213ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmadd213ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmadd231ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmadd231ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmadd231ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmadd231ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmadd132sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmadd132sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmadd213sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmadd213sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmadd231sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmadd231sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmadd132ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmadd132ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmadd213ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmadd213ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmadd231ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmadd231ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsub132pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsub132pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsub132pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsub132pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsub213pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsub213pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsub213pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsub213pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsub231pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsub231pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsub231pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsub231pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsub132ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsub132ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsub132ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsub132ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsub213ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsub213ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsub213ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsub213ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsub231ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsub231ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsub231ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsub231ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsub132sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsub132sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsub213sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsub213sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsub231sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsub231sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsub132ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsub132ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsub213ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsub213ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsub231ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsub231ss	(%rax), %xmm1, %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-fma4.s b/test/tools/llvm-mca/X86/BdVer2/resources-fma4.s
new file mode 100644
index 00000000000..cc428167b23
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-fma4.s
@@ -0,0 +1,349 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+
+vfmaddpd    %xmm0, %xmm1, %xmm2, %xmm3
+vfmaddpd    (%rax), %xmm1, %xmm2, %xmm3
+vfmaddpd    %xmm0, (%rax), %xmm2, %xmm3
+
+vfmaddpd    %ymm0, %ymm1, %ymm2, %ymm3
+vfmaddpd    (%rax), %ymm1, %ymm2, %ymm3
+vfmaddpd    %ymm0, (%rax), %ymm2, %ymm3
+
+vfmaddps    %xmm0, %xmm1, %xmm2, %xmm3
+vfmaddps    (%rax), %xmm1, %xmm2, %xmm3
+vfmaddps    %xmm0, (%rax), %xmm2, %xmm3
+
+vfmaddps    %ymm0, %ymm1, %ymm2, %ymm3
+vfmaddps    (%rax), %ymm1, %ymm2, %ymm3
+vfmaddps    %ymm0, (%rax), %ymm2, %ymm3
+
+vfmaddsd    %xmm0, %xmm1, %xmm2, %xmm3
+vfmaddsd    (%rax), %xmm1, %xmm2, %xmm3
+vfmaddsd    %xmm0, (%rax), %xmm2, %xmm3
+
+vfmaddss    %xmm0, %xmm1, %xmm2, %xmm3
+vfmaddss    (%rax), %xmm1, %xmm2, %xmm3
+vfmaddss    %xmm0, (%rax), %xmm2, %xmm3
+
+vfmaddsubpd %xmm0, %xmm1, %xmm2, %xmm3
+vfmaddsubpd (%rax), %xmm1, %xmm2, %xmm3
+vfmaddsubpd %xmm0, (%rax), %xmm2, %xmm3
+
+vfmaddsubpd %ymm0, %ymm1, %ymm2, %ymm3
+vfmaddsubpd (%rax), %ymm1, %ymm2, %ymm3
+vfmaddsubpd %ymm0, (%rax), %ymm2, %ymm3
+
+vfmaddsubps %xmm0, %xmm1, %xmm2, %xmm3
+vfmaddsubps (%rax), %xmm1, %xmm2, %xmm3
+vfmaddsubps %xmm0, (%rax), %xmm2, %xmm3
+
+vfmaddsubps %ymm0, %ymm1, %ymm2, %ymm3
+vfmaddsubps (%rax), %ymm1, %ymm2, %ymm3
+vfmaddsubps %ymm0, (%rax), %ymm2, %ymm3
+
+vfmsubaddpd %xmm0, %xmm1, %xmm2, %xmm3
+vfmsubaddpd (%rax), %xmm1, %xmm2, %xmm3
+vfmsubaddpd %xmm0, (%rax), %xmm2, %xmm3
+
+vfmsubaddpd %ymm0, %ymm1, %ymm2, %ymm3
+vfmsubaddpd (%rax), %ymm1, %ymm2, %ymm3
+vfmsubaddpd %ymm0, (%rax), %ymm2, %ymm3
+
+vfmsubaddps %xmm0, %xmm1, %xmm2, %xmm3
+vfmsubaddps (%rax), %xmm1, %xmm2, %xmm3
+vfmsubaddps %xmm0, (%rax), %xmm2, %xmm3
+
+vfmsubaddps %ymm0, %ymm1, %ymm2, %ymm3
+vfmsubaddps (%rax), %ymm1, %ymm2, %ymm3
+vfmsubaddps %ymm0, (%rax), %ymm2, %ymm3
+
+vfmsubpd    %xmm0, %xmm1, %xmm2, %xmm3
+vfmsubpd    (%rax), %xmm1, %xmm2, %xmm3
+vfmsubpd    %xmm0, (%rax), %xmm2, %xmm3
+
+vfmsubpd    %ymm0, %ymm1, %ymm2, %ymm3
+vfmsubpd    (%rax), %ymm1, %ymm2, %ymm3
+vfmsubpd    %ymm0, (%rax), %ymm2, %ymm3
+
+vfmsubps    %xmm0, %xmm1, %xmm2, %xmm3
+vfmsubps    (%rax), %xmm1, %xmm2, %xmm3
+vfmsubps    %xmm0, (%rax), %xmm2, %xmm3
+
+vfmsubps    %ymm0, %ymm1, %ymm2, %ymm3
+vfmsubps    (%rax), %ymm1, %ymm2, %ymm3
+vfmsubps    %ymm0, (%rax), %ymm2, %ymm3
+
+vfmsubsd    %xmm0, %xmm1, %xmm2, %xmm3
+vfmsubsd    (%rax), %xmm1, %xmm2, %xmm3
+vfmsubsd    %xmm0, (%rax), %xmm2, %xmm3
+
+vfmsubss    %xmm0, %xmm1, %xmm2, %xmm3
+vfmsubss    (%rax), %xmm1, %xmm2, %xmm3
+vfmsubss    %xmm0, (%rax), %xmm2, %xmm3
+
+vfnmaddpd   %xmm0, %xmm1, %xmm2, %xmm3
+vfnmaddpd   (%rax), %xmm1, %xmm2, %xmm3
+vfnmaddpd   %xmm0, (%rax), %xmm2, %xmm3
+
+vfnmaddpd   %ymm0, %ymm1, %ymm2, %ymm3
+vfnmaddpd   (%rax), %ymm1, %ymm2, %ymm3
+vfnmaddpd   %ymm0, (%rax), %ymm2, %ymm3
+
+vfnmaddps   %xmm0, %xmm1, %xmm2, %xmm3
+vfnmaddps   (%rax), %xmm1, %xmm2, %xmm3
+vfnmaddps   %xmm0, (%rax), %xmm2, %xmm3
+
+vfnmaddps   %ymm0, %ymm1, %ymm2, %ymm3
+vfnmaddps   (%rax), %ymm1, %ymm2, %ymm3
+vfnmaddps   %ymm0, (%rax), %ymm2, %ymm3
+
+vfnmaddsd   %xmm0, %xmm1, %xmm2, %xmm3
+vfnmaddsd   (%rax), %xmm1, %xmm2, %xmm3
+vfnmaddsd   %xmm0, (%rax), %xmm2, %xmm3
+
+vfnmaddss   %xmm0, %xmm1, %xmm2, %xmm3
+vfnmaddss   (%rax), %xmm1, %xmm2, %xmm3
+vfnmaddss   %xmm0, (%rax), %xmm2, %xmm3
+
+vfnmsubpd   %xmm0, %xmm1, %xmm2, %xmm3
+vfnmsubpd   (%rax), %xmm1, %xmm2, %xmm3
+vfnmsubpd   %xmm0, (%rax), %xmm2, %xmm3
+
+vfnmsubpd   %ymm0, %ymm1, %ymm2, %ymm3
+vfnmsubpd   (%rax), %ymm1, %ymm2, %ymm3
+vfnmsubpd   %ymm0, (%rax), %ymm2, %ymm3
+
+vfnmsubps   %xmm0, %xmm1, %xmm2, %xmm3
+vfnmsubps   (%rax), %xmm1, %xmm2, %xmm3
+vfnmsubps   %xmm0, (%rax), %xmm2, %xmm3
+
+vfnmsubps   %ymm0, %ymm1, %ymm2, %ymm3
+vfnmsubps   (%rax), %ymm1, %ymm2, %ymm3
+vfnmsubps   %ymm0, (%rax), %ymm2, %ymm3
+
+vfnmsubsd   %xmm0, %xmm1, %xmm2, %xmm3
+vfnmsubsd   (%rax), %xmm1, %xmm2, %xmm3
+vfnmsubsd   %xmm0, (%rax), %xmm2, %xmm3
+
+vfnmsubss   %xmm0, %xmm1, %xmm2, %xmm3
+vfnmsubss   (%rax), %xmm1, %xmm2, %xmm3
+vfnmsubss   %xmm0, (%rax), %xmm2, %xmm3
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      5     0.50                        vfmaddpd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      10    0.50    *                   vfmaddpd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      10    0.50    *                   vfmaddpd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  1      5     0.50                        vfmaddpd	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    0.50    *                   vfmaddpd	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    0.50    *                   vfmaddpd	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  1      5     0.50                        vfmaddps	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      10    0.50    *                   vfmaddps	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      10    0.50    *                   vfmaddps	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  1      5     0.50                        vfmaddps	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    0.50    *                   vfmaddps	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    0.50    *                   vfmaddps	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  1      5     0.50                        vfmaddsd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      10    0.50    *                   vfmaddsd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      10    0.50    *                   vfmaddsd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  1      5     0.50                        vfmaddss	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      10    0.50    *                   vfmaddss	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      10    0.50    *                   vfmaddss	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  1      5     0.50                        vfmaddsubpd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      10    0.50    *                   vfmaddsubpd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      10    0.50    *                   vfmaddsubpd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  1      5     0.50                        vfmaddsubpd	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    0.50    *                   vfmaddsubpd	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    0.50    *                   vfmaddsubpd	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  1      5     0.50                        vfmaddsubps	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      10    0.50    *                   vfmaddsubps	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      10    0.50    *                   vfmaddsubps	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  1      5     0.50                        vfmaddsubps	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    0.50    *                   vfmaddsubps	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    0.50    *                   vfmaddsubps	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  1      5     0.50                        vfmsubaddpd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      10    0.50    *                   vfmsubaddpd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      10    0.50    *                   vfmsubaddpd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  1      5     0.50                        vfmsubaddpd	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    0.50    *                   vfmsubaddpd	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    0.50    *                   vfmsubaddpd	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  1      5     0.50                        vfmsubaddps	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      10    0.50    *                   vfmsubaddps	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      10    0.50    *                   vfmsubaddps	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  1      5     0.50                        vfmsubaddps	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    0.50    *                   vfmsubaddps	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    0.50    *                   vfmsubaddps	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  1      5     0.50                        vfmsubpd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      10    0.50    *                   vfmsubpd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      10    0.50    *                   vfmsubpd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  1      5     0.50                        vfmsubpd	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    0.50    *                   vfmsubpd	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    0.50    *                   vfmsubpd	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  1      5     0.50                        vfmsubps	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      10    0.50    *                   vfmsubps	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      10    0.50    *                   vfmsubps	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  1      5     0.50                        vfmsubps	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    0.50    *                   vfmsubps	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    0.50    *                   vfmsubps	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  1      5     0.50                        vfmsubsd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      10    0.50    *                   vfmsubsd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      10    0.50    *                   vfmsubsd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  1      5     0.50                        vfmsubss	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      10    0.50    *                   vfmsubss	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      10    0.50    *                   vfmsubss	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  1      5     0.50                        vfnmaddpd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      10    0.50    *                   vfnmaddpd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      10    0.50    *                   vfnmaddpd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  1      5     0.50                        vfnmaddpd	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    0.50    *                   vfnmaddpd	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    0.50    *                   vfnmaddpd	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  1      5     0.50                        vfnmaddps	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      10    0.50    *                   vfnmaddps	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      10    0.50    *                   vfnmaddps	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  1      5     0.50                        vfnmaddps	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    0.50    *                   vfnmaddps	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    0.50    *                   vfnmaddps	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  1      5     0.50                        vfnmaddsd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      10    0.50    *                   vfnmaddsd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      10    0.50    *                   vfnmaddsd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  1      5     0.50                        vfnmaddss	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      10    0.50    *                   vfnmaddss	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      10    0.50    *                   vfnmaddss	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  1      5     0.50                        vfnmsubpd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      10    0.50    *                   vfnmsubpd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      10    0.50    *                   vfnmsubpd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  1      5     0.50                        vfnmsubpd	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    0.50    *                   vfnmsubpd	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    0.50    *                   vfnmsubpd	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  1      5     0.50                        vfnmsubps	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      10    0.50    *                   vfnmsubps	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      10    0.50    *                   vfnmsubps	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  1      5     0.50                        vfnmsubps	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    0.50    *                   vfnmsubps	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    0.50    *                   vfnmsubps	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  1      5     0.50                        vfnmsubsd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      10    0.50    *                   vfnmsubsd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      10    0.50    *                   vfnmsubsd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  1      5     0.50                        vfnmsubss	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      10    0.50    *                   vfnmsubss	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      10    0.50    *                   vfnmsubss	%xmm0, (%rax), %xmm2, %xmm3
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     48.00  48.00   -      -     32.00  32.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddpd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddpd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddpd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddpd	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddpd	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddpd	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddps	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddps	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddps	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddps	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddps	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddps	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddsd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddss	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddss	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddss	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddsubpd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsubpd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsubpd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddsubpd	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsubpd	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsubpd	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddsubps	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsubps	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsubps	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddsubps	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsubps	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsubps	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubaddpd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubaddpd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubaddpd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubaddpd	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubaddpd	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubaddpd	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubaddps	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubaddps	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubaddps	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubaddps	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubaddps	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubaddps	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubpd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubpd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubpd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubpd	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubpd	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubpd	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubps	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubps	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubps	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubps	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubps	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubps	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubsd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubsd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubsd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubss	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubss	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubss	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmaddpd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmaddpd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmaddpd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmaddpd	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmaddpd	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmaddpd	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmaddps	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmaddps	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmaddps	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmaddps	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmaddps	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmaddps	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmaddsd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmaddsd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmaddsd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmaddss	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmaddss	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmaddss	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsubpd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsubpd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsubpd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsubpd	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsubpd	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsubpd	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsubps	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsubps	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsubps	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsubps	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsubps	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsubps	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsubsd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsubsd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsubsd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsubss	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsubss	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsubss	%xmm0, (%rax), %xmm2, %xmm3
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-lea.s b/test/tools/llvm-mca/X86/BdVer2/resources-lea.s
new file mode 100644
index 00000000000..455fbe0411e
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-lea.s
@@ -0,0 +1,437 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+
+lea 0(), %cx
+lea 0(), %ecx
+lea 0(), %rcx
+lea (%eax), %cx
+lea (%eax), %ecx
+lea (%eax), %rcx
+lea (%rax), %cx
+lea (%rax), %ecx
+lea (%rax), %rcx
+lea (, %ebx), %cx
+lea (, %ebx), %ecx
+lea (, %ebx), %rcx
+lea (, %rbx), %cx
+lea (, %rbx), %ecx
+lea (, %rbx), %rcx
+lea (, %ebx, 1), %cx
+lea (, %ebx, 1), %ecx
+lea (, %ebx, 1), %rcx
+lea (, %rbx, 1), %cx
+lea (, %rbx, 1), %ecx
+lea (, %rbx, 1), %rcx
+lea (, %ebx, 2), %cx
+lea (, %ebx, 2), %ecx
+lea (, %ebx, 2), %rcx
+lea (, %rbx, 2), %cx
+lea (, %rbx, 2), %ecx
+lea (, %rbx, 2), %rcx
+lea (%eax, %ebx), %cx
+lea (%eax, %ebx), %ecx
+lea (%eax, %ebx), %rcx
+lea (%rax, %rbx), %cx
+lea (%rax, %rbx), %ecx
+lea (%rax, %rbx), %rcx
+lea (%eax, %ebx, 1), %cx
+lea (%eax, %ebx, 1), %ecx
+lea (%eax, %ebx, 1), %rcx
+lea (%rax, %rbx, 1), %cx
+lea (%rax, %rbx, 1), %ecx
+lea (%rax, %rbx, 1), %rcx
+lea (%eax, %ebx, 2), %cx
+lea (%eax, %ebx, 2), %ecx
+lea (%eax, %ebx, 2), %rcx
+lea (%rax, %rbx, 2), %cx
+lea (%rax, %rbx, 2), %ecx
+lea (%rax, %rbx, 2), %rcx
+
+lea -16(), %cx
+lea -16(), %ecx
+lea -16(), %rcx
+lea -16(%eax), %cx
+lea -16(%eax), %ecx
+lea -16(%eax), %rcx
+lea -16(%rax), %cx
+lea -16(%rax), %ecx
+lea -16(%rax), %rcx
+lea -16(, %ebx), %cx
+lea -16(, %ebx), %ecx
+lea -16(, %ebx), %rcx
+lea -16(, %rbx), %cx
+lea -16(, %rbx), %ecx
+lea -16(, %rbx), %rcx
+lea -16(, %ebx, 1), %cx
+lea -16(, %ebx, 1), %ecx
+lea -16(, %ebx, 1), %rcx
+lea -16(, %rbx, 1), %cx
+lea -16(, %rbx, 1), %ecx
+lea -16(, %rbx, 1), %rcx
+lea -16(, %ebx, 2), %cx
+lea -16(, %ebx, 2), %ecx
+lea -16(, %ebx, 2), %rcx
+lea -16(, %rbx, 2), %cx
+lea -16(, %rbx, 2), %ecx
+lea -16(, %rbx, 2), %rcx
+lea -16(%eax, %ebx), %cx
+lea -16(%eax, %ebx), %ecx
+lea -16(%eax, %ebx), %rcx
+lea -16(%rax, %rbx), %cx
+lea -16(%rax, %rbx), %ecx
+lea -16(%rax, %rbx), %rcx
+lea -16(%eax, %ebx, 1), %cx
+lea -16(%eax, %ebx, 1), %ecx
+lea -16(%eax, %ebx, 1), %rcx
+lea -16(%rax, %rbx, 1), %cx
+lea -16(%rax, %rbx, 1), %ecx
+lea -16(%rax, %rbx, 1), %rcx
+lea -16(%eax, %ebx, 2), %cx
+lea -16(%eax, %ebx, 2), %ecx
+lea -16(%eax, %ebx, 2), %rcx
+lea -16(%rax, %rbx, 2), %cx
+lea -16(%rax, %rbx, 2), %ecx
+lea -16(%rax, %rbx, 2), %rcx
+
+lea 1024(), %cx
+lea 1024(), %ecx
+lea 1024(), %rcx
+lea 1024(%eax), %cx
+lea 1024(%eax), %ecx
+lea 1024(%eax), %rcx
+lea 1024(%rax), %cx
+lea 1024(%rax), %ecx
+lea 1024(%rax), %rcx
+lea 1024(, %ebx), %cx
+lea 1024(, %ebx), %ecx
+lea 1024(, %ebx), %rcx
+lea 1024(, %rbx), %cx
+lea 1024(, %rbx), %ecx
+lea 1024(, %rbx), %rcx
+lea 1024(, %ebx, 1), %cx
+lea 1024(, %ebx, 1), %ecx
+lea 1024(, %ebx, 1), %rcx
+lea 1024(, %rbx, 1), %cx
+lea 1024(, %rbx, 1), %ecx
+lea 1024(, %rbx, 1), %rcx
+lea 1024(, %ebx, 2), %cx
+lea 1024(, %ebx, 2), %ecx
+lea 1024(, %ebx, 2), %rcx
+lea 1024(, %rbx, 2), %cx
+lea 1024(, %rbx, 2), %ecx
+lea 1024(, %rbx, 2), %rcx
+lea 1024(%eax, %ebx), %cx
+lea 1024(%eax, %ebx), %ecx
+lea 1024(%eax, %ebx), %rcx
+lea 1024(%rax, %rbx), %cx
+lea 1024(%rax, %rbx), %ecx
+lea 1024(%rax, %rbx), %rcx
+lea 1024(%eax, %ebx, 1), %cx
+lea 1024(%eax, %ebx, 1), %ecx
+lea 1024(%eax, %ebx, 1), %rcx
+lea 1024(%rax, %rbx, 1), %cx
+lea 1024(%rax, %rbx, 1), %ecx
+lea 1024(%rax, %rbx, 1), %rcx
+lea 1024(%eax, %ebx, 2), %cx
+lea 1024(%eax, %ebx, 2), %ecx
+lea 1024(%eax, %ebx, 2), %rcx
+lea 1024(%rax, %rbx, 2), %cx
+lea 1024(%rax, %rbx, 2), %ecx
+lea 1024(%rax, %rbx, 2), %rcx
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     0.50                        leaw	0, %cx
+# CHECK-NEXT:  1      1     0.50                        leal	0, %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	0, %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	(%eax), %cx
+# CHECK-NEXT:  1      1     0.50                        leal	(%eax), %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	(%eax), %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	(%rax), %cx
+# CHECK-NEXT:  1      1     0.50                        leal	(%rax), %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	(%rax), %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	(,%ebx), %cx
+# CHECK-NEXT:  1      1     0.50                        leal	(,%ebx), %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	(,%ebx), %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	(,%rbx), %cx
+# CHECK-NEXT:  1      1     0.50                        leal	(,%rbx), %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	(,%rbx), %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	(,%ebx), %cx
+# CHECK-NEXT:  1      1     0.50                        leal	(,%ebx), %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	(,%ebx), %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	(,%rbx), %cx
+# CHECK-NEXT:  1      1     0.50                        leal	(,%rbx), %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	(,%rbx), %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	(,%ebx,2), %cx
+# CHECK-NEXT:  1      1     0.50                        leal	(,%ebx,2), %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	(,%ebx,2), %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	(,%rbx,2), %cx
+# CHECK-NEXT:  1      1     0.50                        leal	(,%rbx,2), %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	(,%rbx,2), %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	(%eax,%ebx), %cx
+# CHECK-NEXT:  1      1     0.50                        leal	(%eax,%ebx), %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	(%eax,%ebx), %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	(%rax,%rbx), %cx
+# CHECK-NEXT:  1      1     0.50                        leal	(%rax,%rbx), %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	(%rax,%rbx), %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	(%eax,%ebx), %cx
+# CHECK-NEXT:  1      1     0.50                        leal	(%eax,%ebx), %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	(%eax,%ebx), %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	(%rax,%rbx), %cx
+# CHECK-NEXT:  1      1     0.50                        leal	(%rax,%rbx), %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	(%rax,%rbx), %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	(%eax,%ebx,2), %cx
+# CHECK-NEXT:  1      1     0.50                        leal	(%eax,%ebx,2), %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	(%eax,%ebx,2), %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	(%rax,%rbx,2), %cx
+# CHECK-NEXT:  1      1     0.50                        leal	(%rax,%rbx,2), %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	(%rax,%rbx,2), %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	-16, %cx
+# CHECK-NEXT:  1      1     0.50                        leal	-16, %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	-16, %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	-16(%eax), %cx
+# CHECK-NEXT:  1      1     0.50                        leal	-16(%eax), %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	-16(%eax), %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	-16(%rax), %cx
+# CHECK-NEXT:  1      1     0.50                        leal	-16(%rax), %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	-16(%rax), %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	-16(,%ebx), %cx
+# CHECK-NEXT:  1      1     0.50                        leal	-16(,%ebx), %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	-16(,%ebx), %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	-16(,%rbx), %cx
+# CHECK-NEXT:  1      1     0.50                        leal	-16(,%rbx), %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	-16(,%rbx), %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	-16(,%ebx), %cx
+# CHECK-NEXT:  1      1     0.50                        leal	-16(,%ebx), %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	-16(,%ebx), %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	-16(,%rbx), %cx
+# CHECK-NEXT:  1      1     0.50                        leal	-16(,%rbx), %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	-16(,%rbx), %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	-16(,%ebx,2), %cx
+# CHECK-NEXT:  1      1     0.50                        leal	-16(,%ebx,2), %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	-16(,%ebx,2), %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	-16(,%rbx,2), %cx
+# CHECK-NEXT:  1      1     0.50                        leal	-16(,%rbx,2), %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	-16(,%rbx,2), %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	-16(%eax,%ebx), %cx
+# CHECK-NEXT:  1      1     0.50                        leal	-16(%eax,%ebx), %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	-16(%eax,%ebx), %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	-16(%rax,%rbx), %cx
+# CHECK-NEXT:  1      1     0.50                        leal	-16(%rax,%rbx), %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	-16(%rax,%rbx), %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	-16(%eax,%ebx), %cx
+# CHECK-NEXT:  1      1     0.50                        leal	-16(%eax,%ebx), %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	-16(%eax,%ebx), %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	-16(%rax,%rbx), %cx
+# CHECK-NEXT:  1      1     0.50                        leal	-16(%rax,%rbx), %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	-16(%rax,%rbx), %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	-16(%eax,%ebx,2), %cx
+# CHECK-NEXT:  1      1     0.50                        leal	-16(%eax,%ebx,2), %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	-16(%eax,%ebx,2), %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	-16(%rax,%rbx,2), %cx
+# CHECK-NEXT:  1      1     0.50                        leal	-16(%rax,%rbx,2), %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	-16(%rax,%rbx,2), %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	1024, %cx
+# CHECK-NEXT:  1      1     0.50                        leal	1024, %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	1024, %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	1024(%eax), %cx
+# CHECK-NEXT:  1      1     0.50                        leal	1024(%eax), %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	1024(%eax), %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	1024(%rax), %cx
+# CHECK-NEXT:  1      1     0.50                        leal	1024(%rax), %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	1024(%rax), %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	1024(,%ebx), %cx
+# CHECK-NEXT:  1      1     0.50                        leal	1024(,%ebx), %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	1024(,%ebx), %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	1024(,%rbx), %cx
+# CHECK-NEXT:  1      1     0.50                        leal	1024(,%rbx), %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	1024(,%rbx), %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	1024(,%ebx), %cx
+# CHECK-NEXT:  1      1     0.50                        leal	1024(,%ebx), %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	1024(,%ebx), %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	1024(,%rbx), %cx
+# CHECK-NEXT:  1      1     0.50                        leal	1024(,%rbx), %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	1024(,%rbx), %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	1024(,%ebx,2), %cx
+# CHECK-NEXT:  1      1     0.50                        leal	1024(,%ebx,2), %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	1024(,%ebx,2), %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	1024(,%rbx,2), %cx
+# CHECK-NEXT:  1      1     0.50                        leal	1024(,%rbx,2), %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	1024(,%rbx,2), %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	1024(%eax,%ebx), %cx
+# CHECK-NEXT:  1      1     0.50                        leal	1024(%eax,%ebx), %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	1024(%eax,%ebx), %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	1024(%rax,%rbx), %cx
+# CHECK-NEXT:  1      1     0.50                        leal	1024(%rax,%rbx), %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	1024(%rax,%rbx), %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	1024(%eax,%ebx), %cx
+# CHECK-NEXT:  1      1     0.50                        leal	1024(%eax,%ebx), %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	1024(%eax,%ebx), %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	1024(%rax,%rbx), %cx
+# CHECK-NEXT:  1      1     0.50                        leal	1024(%rax,%rbx), %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	1024(%rax,%rbx), %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	1024(%eax,%ebx,2), %cx
+# CHECK-NEXT:  1      1     0.50                        leal	1024(%eax,%ebx,2), %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	1024(%eax,%ebx,2), %rcx
+# CHECK-NEXT:  1      1     0.50                        leaw	1024(%rax,%rbx,2), %cx
+# CHECK-NEXT:  1      1     0.50                        leal	1024(%rax,%rbx,2), %ecx
+# CHECK-NEXT:  1      1     0.50                        leaq	1024(%rax,%rbx,2), %rcx
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     67.50  67.50   -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	0, %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	0, %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	0, %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	(%eax), %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	(%eax), %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	(%eax), %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	(%rax), %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	(%rax), %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	(%rax), %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	(,%ebx), %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	(,%ebx), %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	(,%ebx), %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	(,%rbx), %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	(,%rbx), %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	(,%rbx), %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	(,%ebx), %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	(,%ebx), %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	(,%ebx), %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	(,%rbx), %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	(,%rbx), %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	(,%rbx), %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	(,%ebx,2), %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	(,%ebx,2), %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	(,%ebx,2), %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	(,%rbx,2), %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	(,%rbx,2), %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	(,%rbx,2), %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	(%eax,%ebx), %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	(%eax,%ebx), %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	(%eax,%ebx), %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	(%rax,%rbx), %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	(%rax,%rbx), %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	(%rax,%rbx), %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	(%eax,%ebx), %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	(%eax,%ebx), %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	(%eax,%ebx), %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	(%rax,%rbx), %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	(%rax,%rbx), %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	(%rax,%rbx), %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	(%eax,%ebx,2), %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	(%eax,%ebx,2), %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	(%eax,%ebx,2), %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	(%rax,%rbx,2), %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	(%rax,%rbx,2), %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	(%rax,%rbx,2), %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	-16, %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	-16, %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	-16, %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	-16(%eax), %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	-16(%eax), %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	-16(%eax), %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	-16(%rax), %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	-16(%rax), %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	-16(%rax), %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	-16(,%ebx), %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	-16(,%ebx), %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	-16(,%ebx), %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	-16(,%rbx), %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	-16(,%rbx), %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	-16(,%rbx), %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	-16(,%ebx), %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	-16(,%ebx), %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	-16(,%ebx), %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	-16(,%rbx), %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	-16(,%rbx), %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	-16(,%rbx), %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	-16(,%ebx,2), %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	-16(,%ebx,2), %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	-16(,%ebx,2), %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	-16(,%rbx,2), %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	-16(,%rbx,2), %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	-16(,%rbx,2), %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	-16(%eax,%ebx), %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	-16(%eax,%ebx), %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	-16(%eax,%ebx), %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	-16(%rax,%rbx), %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	-16(%rax,%rbx), %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	-16(%rax,%rbx), %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	-16(%eax,%ebx), %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	-16(%eax,%ebx), %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	-16(%eax,%ebx), %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	-16(%rax,%rbx), %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	-16(%rax,%rbx), %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	-16(%rax,%rbx), %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	-16(%eax,%ebx,2), %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	-16(%eax,%ebx,2), %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	-16(%eax,%ebx,2), %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	-16(%rax,%rbx,2), %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	-16(%rax,%rbx,2), %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	-16(%rax,%rbx,2), %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	1024, %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	1024, %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	1024, %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	1024(%eax), %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	1024(%eax), %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	1024(%eax), %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	1024(%rax), %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	1024(%rax), %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	1024(%rax), %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	1024(,%ebx), %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	1024(,%ebx), %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	1024(,%ebx), %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	1024(,%rbx), %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	1024(,%rbx), %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	1024(,%rbx), %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	1024(,%ebx), %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	1024(,%ebx), %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	1024(,%ebx), %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	1024(,%rbx), %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	1024(,%rbx), %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	1024(,%rbx), %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	1024(,%ebx,2), %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	1024(,%ebx,2), %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	1024(,%ebx,2), %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	1024(,%rbx,2), %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	1024(,%rbx,2), %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	1024(,%rbx,2), %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	1024(%eax,%ebx), %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	1024(%eax,%ebx), %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	1024(%eax,%ebx), %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	1024(%rax,%rbx), %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	1024(%rax,%rbx), %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	1024(%rax,%rbx), %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	1024(%eax,%ebx), %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	1024(%eax,%ebx), %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	1024(%eax,%ebx), %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	1024(%rax,%rbx), %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	1024(%rax,%rbx), %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	1024(%rax,%rbx), %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	1024(%eax,%ebx,2), %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	1024(%eax,%ebx,2), %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	1024(%eax,%ebx,2), %rcx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	1024(%rax,%rbx,2), %cx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	1024(%rax,%rbx,2), %ecx
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	1024(%rax,%rbx,2), %rcx
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-lzcnt.s b/test/tools/llvm-mca/X86/BdVer2/resources-lzcnt.s
new file mode 100644
index 00000000000..6ac0945d4d8
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-lzcnt.s
@@ -0,0 +1,50 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+
+lzcntw      %cx, %cx
+lzcntw      (%rax), %cx
+
+lzcntl      %eax, %ecx
+lzcntl      (%rax), %ecx
+
+lzcntq      %rax, %rcx
+lzcntq      (%rax), %rcx
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      3     1.00                        lzcntw	%cx, %cx
+# CHECK-NEXT:  2      8     1.00    *                   lzcntw	(%rax), %cx
+# CHECK-NEXT:  1      3     1.00                        lzcntl	%eax, %ecx
+# CHECK-NEXT:  2      8     1.00    *                   lzcntl	(%rax), %ecx
+# CHECK-NEXT:  1      3     1.00                        lzcntq	%rax, %rcx
+# CHECK-NEXT:  2      8     1.00    *                   lzcntq	(%rax), %rcx
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -      -     6.00    -      -     1.50   1.50
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     lzcntw	%cx, %cx
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   lzcntw	(%rax), %cx
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     lzcntl	%eax, %ecx
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   lzcntl	(%rax), %ecx
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     lzcntq	%rax, %rcx
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   lzcntq	(%rax), %rcx
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-mmx.s b/test/tools/llvm-mca/X86/BdVer2/resources-mmx.s
new file mode 100644
index 00000000000..8c9644b6494
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-mmx.s
@@ -0,0 +1,393 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+
+emms
+
+movd        %eax, %mm2
+movd        (%rax), %mm2
+
+movd        %mm0, %ecx
+movd        %mm0, (%rax)
+
+movq        %rax, %mm2
+movq        (%rax), %mm2
+
+movq        %mm0, %rcx
+movq        %mm0, (%rax)
+
+packsswb    %mm0, %mm2
+packsswb    (%rax), %mm2
+
+packssdw    %mm0, %mm2
+packssdw    (%rax), %mm2
+
+packuswb    %mm0, %mm2
+packuswb    (%rax), %mm2
+
+paddb       %mm0, %mm2
+paddb       (%rax), %mm2
+
+paddd       %mm0, %mm2
+paddd       (%rax), %mm2
+
+paddsb      %mm0, %mm2
+paddsb      (%rax), %mm2
+
+paddsw      %mm0, %mm2
+paddsw      (%rax), %mm2
+
+paddusb     %mm0, %mm2
+paddusb     (%rax), %mm2
+
+paddusw     %mm0, %mm2
+paddusw     (%rax), %mm2
+
+paddw       %mm0, %mm2
+paddw       (%rax), %mm2
+
+pand        %mm0, %mm2
+pand        (%rax), %mm2
+
+pandn       %mm0, %mm2
+pandn       (%rax), %mm2
+
+pcmpeqb     %mm0, %mm2
+pcmpeqb     (%rax), %mm2
+
+pcmpeqd     %mm0, %mm2
+pcmpeqd     (%rax), %mm2
+
+pcmpeqw     %mm0, %mm2
+pcmpeqw     (%rax), %mm2
+
+pcmpgtb     %mm0, %mm2
+pcmpgtb     (%rax), %mm2
+
+pcmpgtd     %mm0, %mm2
+pcmpgtd     (%rax), %mm2
+
+pcmpgtw     %mm0, %mm2
+pcmpgtw     (%rax), %mm2
+
+pmaddwd     %mm0, %mm2
+pmaddwd     (%rax), %mm2
+
+pmulhw      %mm0, %mm2
+pmulhw      (%rax), %mm2
+
+pmullw      %mm0, %mm2
+pmullw      (%rax), %mm2
+
+por         %mm0, %mm2
+por         (%rax), %mm2
+
+pslld       $1, %mm2
+pslld       %mm0, %mm2
+pslld       (%rax), %mm2
+
+psllq       $1, %mm2
+psllq       %mm0, %mm2
+psllq       (%rax), %mm2
+
+psllw       $1, %mm2
+psllw       %mm0, %mm2
+psllw       (%rax), %mm2
+
+psrad       $1, %mm2
+psrad       %mm0, %mm2
+psrad       (%rax), %mm2
+
+psraw       $1, %mm2
+psraw       %mm0, %mm2
+psraw       (%rax), %mm2
+
+psrld       $1, %mm2
+psrld       %mm0, %mm2
+psrld       (%rax), %mm2
+
+psrlq       $1, %mm2
+psrlq       %mm0, %mm2
+psrlq       (%rax), %mm2
+
+psrlw       $1, %mm2
+psrlw       %mm0, %mm2
+psrlw       (%rax), %mm2
+
+psubb       %mm0, %mm2
+psubb       (%rax), %mm2
+
+psubd       %mm0, %mm2
+psubd       (%rax), %mm2
+
+psubsb      %mm0, %mm2
+psubsb      (%rax), %mm2
+
+psubsw      %mm0, %mm2
+psubsw      (%rax), %mm2
+
+psubusb     %mm0, %mm2
+psubusb     (%rax), %mm2
+
+psubusw     %mm0, %mm2
+psubusw     (%rax), %mm2
+
+psubw       %mm0, %mm2
+psubw       (%rax), %mm2
+
+punpckhbw   %mm0, %mm2
+punpckhbw   (%rax), %mm2
+
+punpckhdq   %mm0, %mm2
+punpckhdq   (%rax), %mm2
+
+punpckhwd   %mm0, %mm2
+punpckhwd   (%rax), %mm2
+
+punpcklbw   %mm0, %mm2
+punpcklbw   (%rax), %mm2
+
+punpckldq   %mm0, %mm2
+punpckldq   (%rax), %mm2
+
+punpcklwd   %mm0, %mm2
+punpcklwd   (%rax), %mm2
+
+pxor        %mm0, %mm2
+pxor        (%rax), %mm2
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  31     31    10.33   *      *      U     emms
+# CHECK-NEXT:  1      1     1.00                        movd	%eax, %mm2
+# CHECK-NEXT:  1      5     0.50    *                   movd	(%rax), %mm2
+# CHECK-NEXT:  1      2     1.00                        movd	%mm0, %ecx
+# CHECK-NEXT:  1      1     1.00           *      U     movd	%mm0, (%rax)
+# CHECK-NEXT:  1      1     1.00                        movq	%rax, %mm2
+# CHECK-NEXT:  1      5     0.50    *                   movq	(%rax), %mm2
+# CHECK-NEXT:  1      2     1.00                        movq	%mm0, %rcx
+# CHECK-NEXT:  1      1     1.00           *            movq	%mm0, (%rax)
+# CHECK-NEXT:  1      1     1.00                        packsswb	%mm0, %mm2
+# CHECK-NEXT:  2      6     1.00    *                   packsswb	(%rax), %mm2
+# CHECK-NEXT:  1      1     1.00                        packssdw	%mm0, %mm2
+# CHECK-NEXT:  2      6     1.00    *                   packssdw	(%rax), %mm2
+# CHECK-NEXT:  1      1     1.00                        packuswb	%mm0, %mm2
+# CHECK-NEXT:  2      6     1.00    *                   packuswb	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        paddb	%mm0, %mm2
+# CHECK-NEXT:  2      8     1.00    *                   paddb	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        paddd	%mm0, %mm2
+# CHECK-NEXT:  2      8     1.00    *                   paddd	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        paddsb	%mm0, %mm2
+# CHECK-NEXT:  2      8     1.00    *                   paddsb	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        paddsw	%mm0, %mm2
+# CHECK-NEXT:  2      8     1.00    *                   paddsw	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        paddusb	%mm0, %mm2
+# CHECK-NEXT:  2      8     1.00    *                   paddusb	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        paddusw	%mm0, %mm2
+# CHECK-NEXT:  2      8     1.00    *                   paddusw	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        paddw	%mm0, %mm2
+# CHECK-NEXT:  2      8     1.00    *                   paddw	(%rax), %mm2
+# CHECK-NEXT:  1      1     0.33                        pand	%mm0, %mm2
+# CHECK-NEXT:  2      6     0.50    *                   pand	(%rax), %mm2
+# CHECK-NEXT:  1      1     0.33                        pandn	%mm0, %mm2
+# CHECK-NEXT:  2      6     0.50    *                   pandn	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        pcmpeqb	%mm0, %mm2
+# CHECK-NEXT:  2      8     1.00    *                   pcmpeqb	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        pcmpeqd	%mm0, %mm2
+# CHECK-NEXT:  2      8     1.00    *                   pcmpeqd	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        pcmpeqw	%mm0, %mm2
+# CHECK-NEXT:  2      8     1.00    *                   pcmpeqw	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        pcmpgtb	%mm0, %mm2
+# CHECK-NEXT:  2      8     1.00    *                   pcmpgtb	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        pcmpgtd	%mm0, %mm2
+# CHECK-NEXT:  2      8     1.00    *                   pcmpgtd	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        pcmpgtw	%mm0, %mm2
+# CHECK-NEXT:  2      8     1.00    *                   pcmpgtw	(%rax), %mm2
+# CHECK-NEXT:  1      5     1.00                        pmaddwd	%mm0, %mm2
+# CHECK-NEXT:  2      10    1.00    *                   pmaddwd	(%rax), %mm2
+# CHECK-NEXT:  1      5     1.00                        pmulhw	%mm0, %mm2
+# CHECK-NEXT:  2      10    1.00    *                   pmulhw	(%rax), %mm2
+# CHECK-NEXT:  1      5     1.00                        pmullw	%mm0, %mm2
+# CHECK-NEXT:  2      10    1.00    *                   pmullw	(%rax), %mm2
+# CHECK-NEXT:  1      1     0.33                        por	%mm0, %mm2
+# CHECK-NEXT:  2      6     0.50    *                   por	(%rax), %mm2
+# CHECK-NEXT:  1      1     1.00                        pslld	$1, %mm2
+# CHECK-NEXT:  1      1     1.00                        pslld	%mm0, %mm2
+# CHECK-NEXT:  2      6     1.00    *                   pslld	(%rax), %mm2
+# CHECK-NEXT:  1      1     1.00                        psllq	$1, %mm2
+# CHECK-NEXT:  1      1     1.00                        psllq	%mm0, %mm2
+# CHECK-NEXT:  2      6     1.00    *                   psllq	(%rax), %mm2
+# CHECK-NEXT:  1      1     1.00                        psllw	$1, %mm2
+# CHECK-NEXT:  1      1     1.00                        psllw	%mm0, %mm2
+# CHECK-NEXT:  2      6     1.00    *                   psllw	(%rax), %mm2
+# CHECK-NEXT:  1      1     1.00                        psrad	$1, %mm2
+# CHECK-NEXT:  1      1     1.00                        psrad	%mm0, %mm2
+# CHECK-NEXT:  2      6     1.00    *                   psrad	(%rax), %mm2
+# CHECK-NEXT:  1      1     1.00                        psraw	$1, %mm2
+# CHECK-NEXT:  1      1     1.00                        psraw	%mm0, %mm2
+# CHECK-NEXT:  2      6     1.00    *                   psraw	(%rax), %mm2
+# CHECK-NEXT:  1      1     1.00                        psrld	$1, %mm2
+# CHECK-NEXT:  1      1     1.00                        psrld	%mm0, %mm2
+# CHECK-NEXT:  2      6     1.00    *                   psrld	(%rax), %mm2
+# CHECK-NEXT:  1      1     1.00                        psrlq	$1, %mm2
+# CHECK-NEXT:  1      1     1.00                        psrlq	%mm0, %mm2
+# CHECK-NEXT:  2      6     1.00    *                   psrlq	(%rax), %mm2
+# CHECK-NEXT:  1      1     1.00                        psrlw	$1, %mm2
+# CHECK-NEXT:  1      1     1.00                        psrlw	%mm0, %mm2
+# CHECK-NEXT:  2      6     1.00    *                   psrlw	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        psubb	%mm0, %mm2
+# CHECK-NEXT:  2      8     1.00    *                   psubb	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        psubd	%mm0, %mm2
+# CHECK-NEXT:  2      8     1.00    *                   psubd	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        psubsb	%mm0, %mm2
+# CHECK-NEXT:  2      8     1.00    *                   psubsb	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        psubsw	%mm0, %mm2
+# CHECK-NEXT:  2      8     1.00    *                   psubsw	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        psubusb	%mm0, %mm2
+# CHECK-NEXT:  2      8     1.00    *                   psubusb	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        psubusw	%mm0, %mm2
+# CHECK-NEXT:  2      8     1.00    *                   psubusw	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        psubw	%mm0, %mm2
+# CHECK-NEXT:  2      8     1.00    *                   psubw	(%rax), %mm2
+# CHECK-NEXT:  1      1     1.00                        punpckhbw	%mm0, %mm2
+# CHECK-NEXT:  2      6     1.00    *                   punpckhbw	(%rax), %mm2
+# CHECK-NEXT:  1      1     1.00                        punpckhdq	%mm0, %mm2
+# CHECK-NEXT:  2      6     1.00    *                   punpckhdq	(%rax), %mm2
+# CHECK-NEXT:  1      1     1.00                        punpckhwd	%mm0, %mm2
+# CHECK-NEXT:  2      6     1.00    *                   punpckhwd	(%rax), %mm2
+# CHECK-NEXT:  1      1     1.00                        punpcklbw	%mm0, %mm2
+# CHECK-NEXT:  2      6     1.00    *                   punpcklbw	(%rax), %mm2
+# CHECK-NEXT:  1      1     1.00                        punpckldq	%mm0, %mm2
+# CHECK-NEXT:  2      6     1.00    *                   punpckldq	(%rax), %mm2
+# CHECK-NEXT:  1      1     1.00                        punpcklwd	%mm0, %mm2
+# CHECK-NEXT:  2      6     1.00    *                   punpcklwd	(%rax), %mm2
+# CHECK-NEXT:  1      1     0.33                        pxor	%mm0, %mm2
+# CHECK-NEXT:  2      6     0.50    *                   pxor	(%rax), %mm2
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     21.00  53.00  2.00   57.00  24.00  24.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -     10.33  10.33   -     10.33   -      -     emms
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     movd	%eax, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movd	(%rax), %mm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     movd	%mm0, %ecx
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movd	%mm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     movq	%rax, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movq	(%rax), %mm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     movq	%mm0, %rcx
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movq	%mm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     packsswb	%mm0, %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   packsswb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     packssdw	%mm0, %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   packssdw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     packuswb	%mm0, %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   packuswb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     paddb	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   paddb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     paddd	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   paddd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     paddsb	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   paddsb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     paddsw	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   paddsw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     paddusb	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   paddusb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     paddusw	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   paddusw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     paddw	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   paddw	(%rax), %mm2
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     pand	%mm0, %mm2
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   pand	(%rax), %mm2
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     pandn	%mm0, %mm2
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   pandn	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pcmpeqb	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pcmpeqb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pcmpeqd	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pcmpeqd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pcmpeqw	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pcmpeqw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pcmpgtb	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pcmpgtb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pcmpgtd	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pcmpgtd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pcmpgtw	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pcmpgtw	(%rax), %mm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pmaddwd	%mm0, %mm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   pmaddwd	(%rax), %mm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pmulhw	%mm0, %mm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   pmulhw	(%rax), %mm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pmullw	%mm0, %mm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   pmullw	(%rax), %mm2
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     por	%mm0, %mm2
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   por	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     pslld	$1, %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     pslld	%mm0, %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   pslld	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     psllq	$1, %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     psllq	%mm0, %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   psllq	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     psllw	$1, %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     psllw	%mm0, %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   psllw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     psrad	$1, %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     psrad	%mm0, %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   psrad	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     psraw	$1, %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     psraw	%mm0, %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   psraw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     psrld	$1, %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     psrld	%mm0, %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   psrld	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     psrlq	$1, %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     psrlq	%mm0, %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   psrlq	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     psrlw	$1, %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     psrlw	%mm0, %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   psrlw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     psubb	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   psubb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     psubd	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   psubd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     psubsb	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   psubsb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     psubsw	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   psubsw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     psubusb	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   psubusb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     psubusw	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   psubusw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     psubw	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   psubw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     punpckhbw	%mm0, %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   punpckhbw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     punpckhdq	%mm0, %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   punpckhdq	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     punpckhwd	%mm0, %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   punpckhwd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     punpcklbw	%mm0, %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   punpcklbw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     punpckldq	%mm0, %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   punpckldq	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     punpcklwd	%mm0, %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   punpcklwd	(%rax), %mm2
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     pxor	%mm0, %mm2
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   pxor	(%rax), %mm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-movbe.s b/test/tools/llvm-mca/X86/BdVer2/resources-movbe.s
new file mode 100644
index 00000000000..aa8641484e1
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-movbe.s
@@ -0,0 +1,50 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+
+movbe  %cx, (%rax)
+movbe  (%rax), %cx
+
+movbe  %ecx, (%rax)
+movbe  (%rax), %ecx
+
+movbe  %rcx, (%rax)
+movbe  (%rax), %rcx
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     1.00           *            movbew	%cx, (%rax)
+# CHECK-NEXT:  2      6     0.50    *                   movbew	(%rax), %cx
+# CHECK-NEXT:  1      1     1.00           *            movbel	%ecx, (%rax)
+# CHECK-NEXT:  2      6     0.50    *                   movbel	(%rax), %ecx
+# CHECK-NEXT:  1      1     1.00           *            movbeq	%rcx, (%rax)
+# CHECK-NEXT:  2      6     0.50    *                   movbeq	(%rax), %rcx
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     1.00   1.00   3.00   1.00   3.00   3.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movbew	%cx, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   movbew	(%rax), %cx
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movbel	%ecx, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   movbel	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movbeq	%rcx, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   movbeq	(%rax), %rcx
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-pclmul.s b/test/tools/llvm-mca/X86/BdVer2/resources-pclmul.s
new file mode 100644
index 00000000000..12f879b5fb0
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-pclmul.s
@@ -0,0 +1,36 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+
+pclmulqdq     $11, %xmm0, %xmm2
+pclmulqdq     $11, (%rax), %xmm2
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      14    6.00                        pclmulqdq	$11, %xmm0, %xmm2
+# CHECK-NEXT:  1      14    5.67    *                   pclmulqdq	$11, (%rax), %xmm2
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     11.67  11.67   -     11.67  0.50   0.50
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -     6.00   6.00    -     6.00    -      -     pclmulqdq	$11, %xmm0, %xmm2
+# CHECK-NEXT:  -      -     5.67   5.67    -     5.67   0.50   0.50   pclmulqdq	$11, (%rax), %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-popcnt.s b/test/tools/llvm-mca/X86/BdVer2/resources-popcnt.s
new file mode 100644
index 00000000000..c24ce8869f9
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-popcnt.s
@@ -0,0 +1,50 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+
+popcntw     %cx, %cx
+popcntw     (%rax), %cx
+
+popcntl     %eax, %ecx
+popcntl     (%rax), %ecx
+
+popcntq     %rax, %rcx
+popcntq     (%rax), %rcx
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      3     1.00                        popcntw	%cx, %cx
+# CHECK-NEXT:  2      9     1.00    *                   popcntw	(%rax), %cx
+# CHECK-NEXT:  1      3     1.00                        popcntl	%eax, %ecx
+# CHECK-NEXT:  2      9     1.00    *                   popcntl	(%rax), %ecx
+# CHECK-NEXT:  1      3     1.00                        popcntq	%rax, %rcx
+# CHECK-NEXT:  2      9     1.00    *                   popcntq	(%rax), %rcx
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -      -     6.00    -      -     1.50   1.50
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     popcntw	%cx, %cx
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   popcntw	(%rax), %cx
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     popcntl	%eax, %ecx
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   popcntl	(%rax), %ecx
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     popcntq	%rax, %rcx
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   popcntq	(%rax), %rcx
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-prefetchw.s b/test/tools/llvm-mca/X86/BdVer2/resources-prefetchw.s
new file mode 100644
index 00000000000..b44b28c3725
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-prefetchw.s
@@ -0,0 +1,36 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+
+prefetch    (%rax)
+prefetchw   (%rax)
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      5     0.50    *      *            prefetch	(%rax)
+# CHECK-NEXT:  1      5     0.50    *      *            prefetchw	(%rax)
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   1.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   prefetch	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   prefetchw	(%rax)
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-sse1.s b/test/tools/llvm-mca/X86/BdVer2/resources-sse1.s
new file mode 100644
index 00000000000..cc4d6ed0b43
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-sse1.s
@@ -0,0 +1,461 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+
+addps       %xmm0, %xmm2
+addps       (%rax), %xmm2
+
+addss       %xmm0, %xmm2
+addss       (%rax), %xmm2
+
+andnps      %xmm0, %xmm2
+andnps      (%rax), %xmm2
+
+andps       %xmm0, %xmm2
+andps       (%rax), %xmm2
+
+cmpps       $0, %xmm0, %xmm2
+cmpps       $0, (%rax), %xmm2
+
+cmpss       $0, %xmm0, %xmm2
+cmpss       $0, (%rax), %xmm2
+
+comiss      %xmm0, %xmm1
+comiss      (%rax), %xmm1
+
+cvtpi2ps    %mm0, %xmm2
+cvtpi2ps    (%rax), %xmm2
+
+cvtps2pi    %xmm0, %mm2
+cvtps2pi    (%rax), %mm2
+
+cvtsi2ss    %ecx, %xmm2
+cvtsi2ss    %rcx, %xmm2
+cvtsi2ss    (%rax), %xmm2
+cvtsi2ss    (%rax), %xmm2
+
+cvtss2si    %xmm0, %ecx
+cvtss2si    %xmm0, %rcx
+cvtss2si    (%rax), %ecx
+cvtss2si    (%rax), %rcx
+
+cvttps2pi   %xmm0, %mm2
+cvttps2pi   (%rax), %mm2
+
+cvttss2si   %xmm0, %ecx
+cvttss2si   %xmm0, %rcx
+cvttss2si   (%rax), %ecx
+cvttss2si   (%rax), %rcx
+
+divps       %xmm0, %xmm2
+divps       (%rax), %xmm2
+
+divss       %xmm0, %xmm2
+divss       (%rax), %xmm2
+
+ldmxcsr     (%rax)
+
+maskmovq    %mm0, %mm1
+
+maxps       %xmm0, %xmm2
+maxps       (%rax), %xmm2
+
+maxss       %xmm0, %xmm2
+maxss       (%rax), %xmm2
+
+minps       %xmm0, %xmm2
+minps       (%rax), %xmm2
+
+minss       %xmm0, %xmm2
+minss       (%rax), %xmm2
+
+movaps      %xmm0, %xmm2
+movaps      %xmm0, (%rax)
+movaps      (%rax), %xmm2
+
+movhlps     %xmm0, %xmm2
+movlhps     %xmm0, %xmm2
+
+movhps      %xmm0, (%rax)
+movhps      (%rax), %xmm2
+
+movlps      %xmm0, (%rax)
+movlps      (%rax), %xmm2
+
+movmskps    %xmm0, %rcx
+
+movntps     %xmm0, (%rax)
+movntq      %mm0, (%rax)
+
+movss       %xmm0, %xmm2
+movss       %xmm0, (%rax)
+movss       (%rax), %xmm2
+
+movups      %xmm0, %xmm2
+movups      %xmm0, (%rax)
+movups      (%rax), %xmm2
+
+mulps       %xmm0, %xmm2
+mulps       (%rax), %xmm2
+
+mulss       %xmm0, %xmm2
+mulss       (%rax), %xmm2
+
+orps        %xmm0, %xmm2
+orps        (%rax), %xmm2
+
+pavgb       %mm0, %mm2
+pavgb       (%rax), %mm2
+
+pavgw       %mm0, %mm2
+pavgw       (%rax), %mm2
+
+pextrw      $1, %mm0, %rcx
+
+pinsrw      $1, %rax, %mm2
+pinsrw      $1, (%rax), %mm2
+
+pmaxsw      %mm0, %mm2
+pmaxsw      (%rax), %mm2
+
+pmaxub      %mm0, %mm2
+pmaxub      (%rax), %mm2
+
+pminsw      %mm0, %mm2
+pminsw      (%rax), %mm2
+
+pminub      %mm0, %mm2
+pminub      (%rax), %mm2
+
+pmovmskb    %xmm0, %rcx
+
+pmulhuw     %mm0, %mm2
+pmulhuw     (%rax), %mm2
+
+prefetcht0  (%rax)
+prefetcht1  (%rax)
+prefetcht2  (%rax)
+prefetchnta (%rax)
+
+psadbw      %mm0, %mm2
+psadbw      (%rax), %mm2
+
+pshufw      $1, %mm0, %mm2
+pshufw      $1, (%rax), %mm2
+
+rcpps       %xmm0, %xmm2
+rcpps       (%rax), %xmm2
+
+rcpss       %xmm0, %xmm2
+rcpss       (%rax), %xmm2
+
+rsqrtps     %xmm0, %xmm2
+rsqrtps     (%rax), %xmm2
+
+rsqrtss     %xmm0, %xmm2
+rsqrtss     (%rax), %xmm2
+
+sfence
+
+shufps      $1, %xmm0, %xmm2
+shufps      $1, (%rax), %xmm2
+
+sqrtps      %xmm0, %xmm2
+sqrtps      (%rax), %xmm2
+
+sqrtss      %xmm0, %xmm2
+sqrtss      (%rax), %xmm2
+
+stmxcsr     (%rax)
+
+subps       %xmm0, %xmm2
+subps       (%rax), %xmm2
+
+subss       %xmm0, %xmm2
+subss       (%rax), %xmm2
+
+ucomiss     %xmm0, %xmm1
+ucomiss     (%rax), %xmm1
+
+unpckhps    %xmm0, %xmm2
+unpckhps    (%rax), %xmm2
+
+unpcklps    %xmm0, %xmm2
+unpcklps    (%rax), %xmm2
+
+xorps       %xmm0, %xmm2
+xorps       (%rax), %xmm2
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      3     1.00                        addps	%xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   addps	(%rax), %xmm2
+# CHECK-NEXT:  1      3     1.00                        addss	%xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   addss	(%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        andnps	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     1.00    *                   andnps	(%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        andps	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     1.00    *                   andps	(%rax), %xmm2
+# CHECK-NEXT:  1      3     1.00                        cmpps	$0, %xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   cmpps	$0, (%rax), %xmm2
+# CHECK-NEXT:  1      3     1.00                        cmpss	$0, %xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   cmpss	$0, (%rax), %xmm2
+# CHECK-NEXT:  2      2     1.00                        comiss	%xmm0, %xmm1
+# CHECK-NEXT:  3      8     1.00    *                   comiss	(%rax), %xmm1
+# CHECK-NEXT:  1      3     1.00                        cvtpi2ps	%mm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   cvtpi2ps	(%rax), %xmm2
+# CHECK-NEXT:  1      3     1.00                        cvtps2pi	%xmm0, %mm2
+# CHECK-NEXT:  2      9     1.00    *                   cvtps2pi	(%rax), %mm2
+# CHECK-NEXT:  3      5     2.00                        cvtsi2ssl	%ecx, %xmm2
+# CHECK-NEXT:  3      5     2.00                        cvtsi2ssq	%rcx, %xmm2
+# CHECK-NEXT:  3      10    1.00    *                   cvtsi2ssl	(%rax), %xmm2
+# CHECK-NEXT:  3      10    1.00    *                   cvtsi2ssl	(%rax), %xmm2
+# CHECK-NEXT:  2      5     1.00                        cvtss2si	%xmm0, %ecx
+# CHECK-NEXT:  2      5     1.00                        cvtss2si	%xmm0, %rcx
+# CHECK-NEXT:  3      9     1.00    *                   cvtss2si	(%rax), %ecx
+# CHECK-NEXT:  3      9     1.00    *                   cvtss2si	(%rax), %rcx
+# CHECK-NEXT:  1      3     1.00                        cvttps2pi	%xmm0, %mm2
+# CHECK-NEXT:  2      9     1.00    *                   cvttps2pi	(%rax), %mm2
+# CHECK-NEXT:  2      5     1.00                        cvttss2si	%xmm0, %ecx
+# CHECK-NEXT:  2      5     1.00                        cvttss2si	%xmm0, %rcx
+# CHECK-NEXT:  3      9     1.00    *                   cvttss2si	(%rax), %ecx
+# CHECK-NEXT:  3      9     1.00    *                   cvttss2si	(%rax), %rcx
+# CHECK-NEXT:  1      14    14.00                       divps	%xmm0, %xmm2
+# CHECK-NEXT:  2      20    14.00   *                   divps	(%rax), %xmm2
+# CHECK-NEXT:  1      14    14.00                       divss	%xmm0, %xmm2
+# CHECK-NEXT:  2      20    14.00   *                   divss	(%rax), %xmm2
+# CHECK-NEXT:  4      5     1.00    *      *      U     ldmxcsr	(%rax)
+# CHECK-NEXT:  1      1     1.00    *      *      U     maskmovq	%mm0, %mm1
+# CHECK-NEXT:  1      3     1.00                        maxps	%xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   maxps	(%rax), %xmm2
+# CHECK-NEXT:  1      3     1.00                        maxss	%xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   maxss	(%rax), %xmm2
+# CHECK-NEXT:  1      3     1.00                        minps	%xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   minps	(%rax), %xmm2
+# CHECK-NEXT:  1      3     1.00                        minss	%xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   minss	(%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        movaps	%xmm0, %xmm2
+# CHECK-NEXT:  1      1     1.00           *            movaps	%xmm0, (%rax)
+# CHECK-NEXT:  1      6     0.50    *                   movaps	(%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        movhlps	%xmm0, %xmm2
+# CHECK-NEXT:  1      1     1.00                        movlhps	%xmm0, %xmm2
+# CHECK-NEXT:  1      1     1.00           *            movhps	%xmm0, (%rax)
+# CHECK-NEXT:  2      7     1.00    *                   movhps	(%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00           *            movlps	%xmm0, (%rax)
+# CHECK-NEXT:  2      7     1.00    *                   movlps	(%rax), %xmm2
+# CHECK-NEXT:  1      2     1.00                        movmskps	%xmm0, %ecx
+# CHECK-NEXT:  1      1     1.00           *            movntps	%xmm0, (%rax)
+# CHECK-NEXT:  1      1     1.00    *      *      U     movntq	%mm0, (%rax)
+# CHECK-NEXT:  1      1     1.00                        movss	%xmm0, %xmm2
+# CHECK-NEXT:  1      1     1.00           *            movss	%xmm0, (%rax)
+# CHECK-NEXT:  1      6     0.50    *                   movss	(%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        movups	%xmm0, %xmm2
+# CHECK-NEXT:  1      1     1.00           *            movups	%xmm0, (%rax)
+# CHECK-NEXT:  1      6     0.50    *                   movups	(%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        mulps	%xmm0, %xmm2
+# CHECK-NEXT:  2      11    1.00    *                   mulps	(%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        mulss	%xmm0, %xmm2
+# CHECK-NEXT:  2      11    1.00    *                   mulss	(%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        orps	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     1.00    *                   orps	(%rax), %xmm2
+# CHECK-NEXT:  1      3     1.00                        pavgb	%mm0, %mm2
+# CHECK-NEXT:  2      8     1.00    *                   pavgb	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        pavgw	%mm0, %mm2
+# CHECK-NEXT:  2      8     1.00    *                   pavgw	(%rax), %mm2
+# CHECK-NEXT:  2      3     1.00                        pextrw	$1, %mm0, %ecx
+# CHECK-NEXT:  2      2     1.00                        pinsrw	$1, %eax, %mm2
+# CHECK-NEXT:  2      7     0.50    *                   pinsrw	$1, (%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        pmaxsw	%mm0, %mm2
+# CHECK-NEXT:  2      8     1.00    *                   pmaxsw	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        pmaxub	%mm0, %mm2
+# CHECK-NEXT:  2      8     1.00    *                   pmaxub	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        pminsw	%mm0, %mm2
+# CHECK-NEXT:  2      8     1.00    *                   pminsw	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        pminub	%mm0, %mm2
+# CHECK-NEXT:  2      8     1.00    *                   pminub	(%rax), %mm2
+# CHECK-NEXT:  1      2     1.00                        pmovmskb	%xmm0, %ecx
+# CHECK-NEXT:  1      5     1.00                        pmulhuw	%mm0, %mm2
+# CHECK-NEXT:  2      10    1.00    *                   pmulhuw	(%rax), %mm2
+# CHECK-NEXT:  1      5     0.50    *      *            prefetcht0	(%rax)
+# CHECK-NEXT:  1      5     0.50    *      *            prefetcht1	(%rax)
+# CHECK-NEXT:  1      5     0.50    *      *            prefetcht2	(%rax)
+# CHECK-NEXT:  1      5     0.50    *      *            prefetchnta	(%rax)
+# CHECK-NEXT:  1      5     1.00                        psadbw	%mm0, %mm2
+# CHECK-NEXT:  2      10    1.00    *                   psadbw	(%rax), %mm2
+# CHECK-NEXT:  1      1     1.00                        pshufw	$1, %mm0, %mm2
+# CHECK-NEXT:  2      6     1.00    *                   pshufw	$1, (%rax), %mm2
+# CHECK-NEXT:  1      5     1.00                        rcpps	%xmm0, %xmm2
+# CHECK-NEXT:  2      11    1.00    *                   rcpps	(%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        rcpss	%xmm0, %xmm2
+# CHECK-NEXT:  2      11    1.00    *                   rcpss	(%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        rsqrtps	%xmm0, %xmm2
+# CHECK-NEXT:  2      11    1.00    *                   rsqrtps	(%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        rsqrtss	%xmm0, %xmm2
+# CHECK-NEXT:  2      11    1.00    *                   rsqrtss	(%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00    *      *      U     sfence
+# CHECK-NEXT:  1      1     1.00                        shufps	$1, %xmm0, %xmm2
+# CHECK-NEXT:  2      7     1.00    *                   shufps	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      14    14.00                       sqrtps	%xmm0, %xmm2
+# CHECK-NEXT:  2      20    14.00   *                   sqrtps	(%rax), %xmm2
+# CHECK-NEXT:  1      14    14.00                       sqrtss	%xmm0, %xmm2
+# CHECK-NEXT:  2      20    14.00   *                   sqrtss	(%rax), %xmm2
+# CHECK-NEXT:  4      5     1.00    *      *      U     stmxcsr	(%rax)
+# CHECK-NEXT:  1      3     1.00                        subps	%xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   subps	(%rax), %xmm2
+# CHECK-NEXT:  1      3     1.00                        subss	%xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   subss	(%rax), %xmm2
+# CHECK-NEXT:  2      2     1.00                        ucomiss	%xmm0, %xmm1
+# CHECK-NEXT:  3      8     1.00    *                   ucomiss	(%rax), %xmm1
+# CHECK-NEXT:  1      1     1.00                        unpckhps	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     1.00    *                   unpckhps	(%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        unpcklps	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     1.00    *                   unpcklps	(%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        xorps	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     1.00    *                   xorps	(%rax), %xmm2
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -     112.00 41.00  55.50  10.00  34.50  33.50  33.50
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     addps	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   addps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     addss	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   addss	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     andnps	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   andnps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     andps	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   andps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     cmpps	$0, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   cmpps	$0, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     cmpss	$0, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   cmpss	$0, (%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     comiss	%xmm0, %xmm1
+# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   comiss	(%rax), %xmm1
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     cvtpi2ps	%mm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   cvtpi2ps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     cvtps2pi	%xmm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   cvtps2pi	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -     2.00    -      -     cvtsi2ssl	%ecx, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     2.00    -      -     cvtsi2ssq	%rcx, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   cvtsi2ssl	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   cvtsi2ssl	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     cvtss2si	%xmm0, %ecx
+# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     cvtss2si	%xmm0, %rcx
+# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   cvtss2si	(%rax), %ecx
+# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   cvtss2si	(%rax), %rcx
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     cvttps2pi	%xmm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   cvttps2pi	(%rax), %mm2
+# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     cvttss2si	%xmm0, %ecx
+# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     cvttss2si	%xmm0, %rcx
+# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   cvttss2si	(%rax), %ecx
+# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   cvttss2si	(%rax), %rcx
+# CHECK-NEXT:  -     14.00  1.00    -      -      -      -      -     divps	%xmm0, %xmm2
+# CHECK-NEXT:  -     14.00  1.00    -      -      -     0.50   0.50   divps	(%rax), %xmm2
+# CHECK-NEXT:  -     14.00  1.00    -      -      -      -      -     divss	%xmm0, %xmm2
+# CHECK-NEXT:  -     14.00  1.00    -      -      -     0.50   0.50   divss	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   0.50   0.50   ldmxcsr	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     maskmovq	%mm0, %mm1
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     maxps	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   maxps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     maxss	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   maxss	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     minps	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   minps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     minss	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   minss	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     movaps	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movaps	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movaps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     movhlps	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     movlhps	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movhps	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   movhps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movlps	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   movlps	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     movmskps	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movntps	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movntq	%mm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     movss	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movss	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movss	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     movups	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movups	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movups	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     mulps	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   mulps	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     mulss	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   mulss	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     orps	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   orps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pavgb	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pavgb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pavgw	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pavgw	(%rax), %mm2
+# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     pextrw	$1, %mm0, %ecx
+# CHECK-NEXT:  -      -      -     0.50    -     1.50    -      -     pinsrw	$1, %eax, %mm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pinsrw	$1, (%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pmaxsw	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pmaxsw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pmaxub	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pmaxub	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pminsw	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pminsw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pminub	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pminub	(%rax), %mm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pmovmskb	%xmm0, %ecx
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pmulhuw	%mm0, %mm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   pmulhuw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   prefetcht0	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   prefetcht1	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   prefetcht2	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   prefetchnta	(%rax)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     psadbw	%mm0, %mm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   psadbw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     pshufw	$1, %mm0, %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   pshufw	$1, (%rax), %mm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     rcpps	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   rcpps	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     rcpss	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   rcpss	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     rsqrtps	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   rsqrtps	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     rsqrtss	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   rsqrtss	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   sfence
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     shufps	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   shufps	$1, (%rax), %xmm2
+# CHECK-NEXT:  -     14.00  1.00    -      -      -      -      -     sqrtps	%xmm0, %xmm2
+# CHECK-NEXT:  -     14.00  1.00    -      -      -     0.50   0.50   sqrtps	(%rax), %xmm2
+# CHECK-NEXT:  -     14.00  1.00    -      -      -      -      -     sqrtss	%xmm0, %xmm2
+# CHECK-NEXT:  -     14.00  1.00    -      -      -     0.50   0.50   sqrtss	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   0.50   0.50   stmxcsr	(%rax)
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     subps	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   subps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     subss	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   subss	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     ucomiss	%xmm0, %xmm1
+# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   ucomiss	(%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     unpckhps	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   unpckhps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     unpcklps	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   unpcklps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     xorps	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   xorps	(%rax), %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-sse2.s b/test/tools/llvm-mca/X86/BdVer2/resources-sse2.s
new file mode 100644
index 00000000000..30534807cb3
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-sse2.s
@@ -0,0 +1,949 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+
+addpd       %xmm0, %xmm2
+addpd       (%rax), %xmm2
+
+addsd       %xmm0, %xmm2
+addsd       (%rax), %xmm2
+
+andnpd      %xmm0, %xmm2
+andnpd      (%rax), %xmm2
+
+andpd       %xmm0, %xmm2
+andpd       (%rax), %xmm2
+
+clflush     (%rax)
+
+cmppd       $0, %xmm0, %xmm2
+cmppd       $0, (%rax), %xmm2
+
+cmpsd       $0, %xmm0, %xmm2
+cmpsd       $0, (%rax), %xmm2
+
+comisd      %xmm0, %xmm1
+comisd      (%rax), %xmm1
+
+cvtdq2pd    %xmm0, %xmm2
+cvtdq2pd    (%rax), %xmm2
+
+cvtdq2ps    %xmm0, %xmm2
+cvtdq2ps    (%rax), %xmm2
+
+cvtpd2dq    %xmm0, %xmm2
+cvtpd2dq    (%rax), %xmm2
+
+cvtpd2pi    %xmm0, %mm2
+cvtpd2pi    (%rax), %mm2
+
+cvtpd2ps    %xmm0, %xmm2
+cvtpd2ps    (%rax), %xmm2
+
+cvtpi2pd    %mm0, %xmm2
+cvtpi2pd    (%rax), %xmm2
+
+cvtps2dq    %xmm0, %xmm2
+cvtps2dq    (%rax), %xmm2
+
+cvtps2pd    %xmm0, %xmm2
+cvtps2pd    (%rax), %xmm2
+
+cvtsd2si    %xmm0, %ecx
+cvtsd2si    %xmm0, %rcx
+cvtsd2si    (%rax), %ecx
+cvtsd2si    (%rax), %rcx
+
+cvtsd2ss    %xmm0, %xmm2
+cvtsd2ss    (%rax), %xmm2
+
+cvtsi2sd    %ecx, %xmm2
+cvtsi2sd    %rcx, %xmm2
+cvtsi2sd    (%rax), %xmm2
+cvtsi2sd    (%rax), %xmm2
+
+cvtss2sd    %xmm0, %xmm2
+cvtss2sd    (%rax), %xmm2
+
+cvttpd2dq   %xmm0, %xmm2
+cvttpd2dq   (%rax), %xmm2
+
+cvttpd2pi   %xmm0, %mm2
+cvttpd2pi   (%rax), %mm2
+
+cvttps2dq   %xmm0, %xmm2
+cvttps2dq   (%rax), %xmm2
+
+cvttsd2si   %xmm0, %ecx
+cvttsd2si   %xmm0, %rcx
+cvttsd2si   (%rax), %ecx
+cvttsd2si   (%rax), %rcx
+
+divpd       %xmm0, %xmm2
+divpd       (%rax), %xmm2
+
+divsd       %xmm0, %xmm2
+divsd       (%rax), %xmm2
+
+lfence
+
+maskmovdqu  %xmm0, %xmm1
+
+maxpd       %xmm0, %xmm2
+maxpd       (%rax), %xmm2
+
+maxsd       %xmm0, %xmm2
+maxsd       (%rax), %xmm2
+
+minpd       %xmm0, %xmm2
+minpd       (%rax), %xmm2
+
+minsd       %xmm0, %xmm2
+minsd       (%rax), %xmm2
+
+movapd      %xmm0, %xmm2
+movapd      %xmm0, (%rax)
+movapd      (%rax), %xmm2
+
+movd        %eax, %xmm2
+movd        (%rax), %xmm2
+
+movd        %xmm0, %ecx
+movd        %xmm0, (%rax)
+
+movdqa      %xmm0, %xmm2
+movdqa      %xmm0, (%rax)
+movdqa      (%rax), %xmm2
+
+movdqu      %xmm0, %xmm2
+movdqu      %xmm0, (%rax)
+movdqu      (%rax), %xmm2
+
+movdq2q     %xmm0, %mm2
+
+movhpd      %xmm0, (%rax)
+movhpd      (%rax), %xmm2
+
+movlpd      %xmm0, (%rax)
+movlpd      (%rax), %xmm2
+
+movmskpd    %xmm0, %rcx
+
+movntil     %eax, (%rax)
+movntiq     %rax, (%rax)
+
+movntdq     %xmm0, (%rax)
+movntpd     %xmm0, (%rax)
+
+movq        %xmm0, %xmm2
+
+movq        %rax, %xmm2
+movq        (%rax), %xmm2
+
+movq        %xmm0, %rcx
+movq        %xmm0, (%rax)
+
+movq2dq     %mm0, %xmm2
+
+movsd       %xmm0, %xmm2
+movsd       %xmm0, (%rax)
+movsd       (%rax), %xmm2
+
+movupd      %xmm0, %xmm2
+movupd      %xmm0, (%rax)
+movupd      (%rax), %xmm2
+
+mulpd       %xmm0, %xmm2
+mulpd       (%rax), %xmm2
+
+mulsd       %xmm0, %xmm2
+mulsd       (%rax), %xmm2
+
+orpd        %xmm0, %xmm2
+orpd        (%rax), %xmm2
+
+packssdw    %xmm0, %xmm2
+packssdw    (%rax), %xmm2
+
+packsswb    %xmm0, %xmm2
+packsswb    (%rax), %xmm2
+
+packuswb    %xmm0, %xmm2
+packuswb    (%rax), %xmm2
+
+paddb       %xmm0, %xmm2
+paddb       (%rax), %xmm2
+
+paddd       %xmm0, %xmm2
+paddd       (%rax), %xmm2
+
+paddq       %mm0, %mm2
+paddq       (%rax), %mm2
+
+paddq       %xmm0, %xmm2
+paddq       (%rax), %xmm2
+
+paddsb      %xmm0, %xmm2
+paddsb      (%rax), %xmm2
+
+paddsw      %xmm0, %xmm2
+paddsw      (%rax), %xmm2
+
+paddusb     %xmm0, %xmm2
+paddusb     (%rax), %xmm2
+
+paddusw     %xmm0, %xmm2
+paddusw     (%rax), %xmm2
+
+paddw       %xmm0, %xmm2
+paddw       (%rax), %xmm2
+
+pand        %xmm0, %xmm2
+pand        (%rax), %xmm2
+
+pandn       %xmm0, %xmm2
+pandn       (%rax), %xmm2
+
+pavgb       %xmm0, %xmm2
+pavgb       (%rax), %xmm2
+
+pavgw       %xmm0, %xmm2
+pavgw       (%rax), %xmm2
+
+pcmpeqb     %xmm0, %xmm2
+pcmpeqb     (%rax), %xmm2
+
+pcmpeqd     %xmm0, %xmm2
+pcmpeqd     (%rax), %xmm2
+
+pcmpeqw     %xmm0, %xmm2
+pcmpeqw     (%rax), %xmm2
+
+pcmpgtb     %xmm0, %xmm2
+pcmpgtb     (%rax), %xmm2
+
+pcmpgtd     %xmm0, %xmm2
+pcmpgtd     (%rax), %xmm2
+
+pcmpgtw     %xmm0, %xmm2
+pcmpgtw     (%rax), %xmm2
+
+pextrw      $1, %xmm0, %rcx
+
+pmaddwd     %xmm0, %xmm2
+pmaddwd     (%rax), %xmm2
+
+pmaxsw      %xmm0, %xmm2
+pmaxsw      (%rax), %xmm2
+
+pmaxub      %xmm0, %xmm2
+pmaxub      (%rax), %xmm2
+
+pminsw      %xmm0, %xmm2
+pminsw      (%rax), %xmm2
+
+pminub      %xmm0, %xmm2
+pminub      (%rax), %xmm2
+
+pmovmskb    %xmm0, %rcx
+
+pmulhuw     %xmm0, %xmm2
+pmulhuw     (%rax), %xmm2
+
+pmulhw      %xmm0, %xmm2
+pmulhw      (%rax), %xmm2
+
+pmullw      %xmm0, %xmm2
+pmullw      (%rax), %xmm2
+
+pmuludq     %mm0, %mm2
+pmuludq     (%rax), %mm2
+
+pmuludq     %xmm0, %xmm2
+pmuludq     (%rax), %xmm2
+
+por         %xmm0, %xmm2
+por         (%rax), %xmm2
+
+psadbw      %xmm0, %xmm2
+psadbw      (%rax), %xmm2
+
+pshufd      $1, %xmm0, %xmm2
+pshufd      $1, (%rax), %xmm2
+
+pshufhw     $1, %xmm0, %xmm2
+pshufhw     $1, (%rax), %xmm2
+
+pshuflw     $1, %xmm0, %xmm2
+pshuflw     $1, (%rax), %xmm2
+
+pslld       $1, %xmm2
+pslld       %xmm0, %xmm2
+pslld       (%rax), %xmm2
+
+pslldq      $1, %xmm2
+
+psllq       $1, %xmm2
+psllq       %xmm0, %xmm2
+psllq       (%rax), %xmm2
+
+psllw       $1, %xmm2
+psllw       %xmm0, %xmm2
+psllw       (%rax), %xmm2
+
+psrad       $1, %xmm2
+psrad       %xmm0, %xmm2
+psrad       (%rax), %xmm2
+
+psraw       $1, %xmm2
+psraw       %xmm0, %xmm2
+psraw       (%rax), %xmm2
+
+psrld       $1, %xmm2
+psrld       %xmm0, %xmm2
+psrld       (%rax), %xmm2
+
+psrldq      $1, %xmm2
+
+psrlq       $1, %xmm2
+psrlq       %xmm0, %xmm2
+psrlq       (%rax), %xmm2
+
+psrlw       $1, %xmm2
+psrlw       %xmm0, %xmm2
+psrlw       (%rax), %xmm2
+
+psubb       %xmm0, %xmm2
+psubb       (%rax), %xmm2
+
+psubd       %xmm0, %xmm2
+psubd       (%rax), %xmm2
+
+psubq       %mm0, %mm2
+psubq       (%rax), %mm2
+
+psubq       %xmm0, %xmm2
+psubq       (%rax), %xmm2
+
+psubsb      %xmm0, %xmm2
+psubsb      (%rax), %xmm2
+
+psubsw      %xmm0, %xmm2
+psubsw      (%rax), %xmm2
+
+psubusb     %xmm0, %xmm2
+psubusb     (%rax), %xmm2
+
+psubusw     %xmm0, %xmm2
+psubusw     (%rax), %xmm2
+
+psubw       %xmm0, %xmm2
+psubw       (%rax), %xmm2
+
+punpckhbw   %xmm0, %xmm2
+punpckhbw   (%rax), %xmm2
+
+punpckhdq   %xmm0, %xmm2
+punpckhdq   (%rax), %xmm2
+
+punpckhqdq  %xmm0, %xmm2
+punpckhqdq  (%rax), %xmm2
+
+punpckhwd   %xmm0, %xmm2
+punpckhwd   (%rax), %xmm2
+
+punpcklbw   %xmm0, %xmm2
+punpcklbw   (%rax), %xmm2
+
+punpckldq   %xmm0, %xmm2
+punpckldq   (%rax), %xmm2
+
+punpcklqdq  %xmm0, %xmm2
+punpcklqdq  (%rax), %xmm2
+
+punpcklwd   %xmm0, %xmm2
+punpcklwd   (%rax), %xmm2
+
+pxor        %xmm0, %xmm2
+pxor        (%rax), %xmm2
+
+shufpd      $1, %xmm0, %xmm2
+shufpd      $1, (%rax), %xmm2
+
+sqrtpd      %xmm0, %xmm2
+sqrtpd      (%rax), %xmm2
+
+sqrtsd      %xmm0, %xmm2
+sqrtsd      (%rax), %xmm2
+
+subpd       %xmm0, %xmm2
+subpd       (%rax), %xmm2
+
+subsd       %xmm0, %xmm2
+subsd       (%rax), %xmm2
+
+ucomisd     %xmm0, %xmm1
+ucomisd     (%rax), %xmm1
+
+unpckhpd    %xmm0, %xmm2
+unpckhpd    (%rax), %xmm2
+
+unpcklpd    %xmm0, %xmm2
+unpcklpd    (%rax), %xmm2
+
+xorpd       %xmm0, %xmm2
+xorpd       (%rax), %xmm2
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      3     1.00                        addpd	%xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   addpd	(%rax), %xmm2
+# CHECK-NEXT:  1      3     1.00                        addsd	%xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   addsd	(%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        andnpd	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     1.00    *                   andnpd	(%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        andpd	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     1.00    *                   andpd	(%rax), %xmm2
+# CHECK-NEXT:  4      5     1.00    *      *      U     clflush	(%rax)
+# CHECK-NEXT:  1      3     1.00                        cmppd	$0, %xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   cmppd	$0, (%rax), %xmm2
+# CHECK-NEXT:  1      3     1.00                        cmpsd	$0, %xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   cmpsd	$0, (%rax), %xmm2
+# CHECK-NEXT:  2      2     1.00                        comisd	%xmm0, %xmm1
+# CHECK-NEXT:  3      8     1.00    *                   comisd	(%rax), %xmm1
+# CHECK-NEXT:  2      4     1.00                        cvtdq2pd	%xmm0, %xmm2
+# CHECK-NEXT:  3      10    1.00    *                   cvtdq2pd	(%rax), %xmm2
+# CHECK-NEXT:  1      3     1.00                        cvtdq2ps	%xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   cvtdq2ps	(%rax), %xmm2
+# CHECK-NEXT:  2      4     1.00                        cvtpd2dq	%xmm0, %xmm2
+# CHECK-NEXT:  3      10    1.00    *                   cvtpd2dq	(%rax), %xmm2
+# CHECK-NEXT:  2      4     1.00                        cvtpd2pi	%xmm0, %mm2
+# CHECK-NEXT:  3      10    1.00    *                   cvtpd2pi	(%rax), %mm2
+# CHECK-NEXT:  2      4     1.00                        cvtpd2ps	%xmm0, %xmm2
+# CHECK-NEXT:  3      10    1.00    *                   cvtpd2ps	(%rax), %xmm2
+# CHECK-NEXT:  2      4     1.00                        cvtpi2pd	%mm0, %xmm2
+# CHECK-NEXT:  3      10    1.00    *                   cvtpi2pd	(%rax), %xmm2
+# CHECK-NEXT:  1      3     1.00                        cvtps2dq	%xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   cvtps2dq	(%rax), %xmm2
+# CHECK-NEXT:  2      2     1.00                        cvtps2pd	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     1.00    *                   cvtps2pd	(%rax), %xmm2
+# CHECK-NEXT:  2      5     1.00                        cvtsd2si	%xmm0, %ecx
+# CHECK-NEXT:  2      5     1.00                        cvtsd2si	%xmm0, %rcx
+# CHECK-NEXT:  3      9     1.00    *                   cvtsd2si	(%rax), %ecx
+# CHECK-NEXT:  3      9     1.00    *                   cvtsd2si	(%rax), %rcx
+# CHECK-NEXT:  2      4     1.00                        cvtsd2ss	%xmm0, %xmm2
+# CHECK-NEXT:  3      10    1.00    *                   cvtsd2ss	(%rax), %xmm2
+# CHECK-NEXT:  2      4     1.00                        cvtsi2sdl	%ecx, %xmm2
+# CHECK-NEXT:  2      4     1.00                        cvtsi2sdq	%rcx, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   cvtsi2sdl	(%rax), %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   cvtsi2sdl	(%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        cvtss2sd	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     1.00    *                   cvtss2sd	(%rax), %xmm2
+# CHECK-NEXT:  2      4     1.00                        cvttpd2dq	%xmm0, %xmm2
+# CHECK-NEXT:  3      10    1.00    *                   cvttpd2dq	(%rax), %xmm2
+# CHECK-NEXT:  2      4     1.00                        cvttpd2pi	%xmm0, %mm2
+# CHECK-NEXT:  3      10    1.00    *                   cvttpd2pi	(%rax), %mm2
+# CHECK-NEXT:  1      3     1.00                        cvttps2dq	%xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   cvttps2dq	(%rax), %xmm2
+# CHECK-NEXT:  2      5     1.00                        cvttsd2si	%xmm0, %ecx
+# CHECK-NEXT:  2      5     1.00                        cvttsd2si	%xmm0, %rcx
+# CHECK-NEXT:  3      9     1.00    *                   cvttsd2si	(%rax), %ecx
+# CHECK-NEXT:  3      9     1.00    *                   cvttsd2si	(%rax), %rcx
+# CHECK-NEXT:  1      22    22.00                       divpd	%xmm0, %xmm2
+# CHECK-NEXT:  2      28    22.00   *                   divpd	(%rax), %xmm2
+# CHECK-NEXT:  1      22    22.00                       divsd	%xmm0, %xmm2
+# CHECK-NEXT:  2      28    22.00   *                   divsd	(%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00    *      *      U     lfence
+# CHECK-NEXT:  1      1     1.00    *      *      U     maskmovdqu	%xmm0, %xmm1
+# CHECK-NEXT:  1      3     1.00                        maxpd	%xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   maxpd	(%rax), %xmm2
+# CHECK-NEXT:  1      3     1.00                        maxsd	%xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   maxsd	(%rax), %xmm2
+# CHECK-NEXT:  1      3     1.00                        minpd	%xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   minpd	(%rax), %xmm2
+# CHECK-NEXT:  1      3     1.00                        minsd	%xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   minsd	(%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        movapd	%xmm0, %xmm2
+# CHECK-NEXT:  1      1     1.00           *            movapd	%xmm0, (%rax)
+# CHECK-NEXT:  1      6     0.50    *                   movapd	(%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        movd	%eax, %xmm2
+# CHECK-NEXT:  1      6     0.50    *                   movd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     1.00                        movd	%xmm0, %ecx
+# CHECK-NEXT:  1      1     1.00           *            movd	%xmm0, (%rax)
+# CHECK-NEXT:  1      1     0.33                        movdqa	%xmm0, %xmm2
+# CHECK-NEXT:  1      1     1.00           *            movdqa	%xmm0, (%rax)
+# CHECK-NEXT:  1      6     0.50    *                   movdqa	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.33                        movdqu	%xmm0, %xmm2
+# CHECK-NEXT:  1      1     1.00           *            movdqu	%xmm0, (%rax)
+# CHECK-NEXT:  1      6     0.50    *                   movdqu	(%rax), %xmm2
+# CHECK-NEXT:  2      2     1.00                        movdq2q	%xmm0, %mm2
+# CHECK-NEXT:  1      1     1.00           *            movhpd	%xmm0, (%rax)
+# CHECK-NEXT:  2      7     1.00    *                   movhpd	(%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00           *            movlpd	%xmm0, (%rax)
+# CHECK-NEXT:  2      7     1.00    *                   movlpd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     1.00                        movmskpd	%xmm0, %ecx
+# CHECK-NEXT:  1      1     1.00           *            movntil	%eax, (%rax)
+# CHECK-NEXT:  1      1     1.00           *            movntiq	%rax, (%rax)
+# CHECK-NEXT:  1      1     1.00           *            movntdq	%xmm0, (%rax)
+# CHECK-NEXT:  1      1     1.00           *            movntpd	%xmm0, (%rax)
+# CHECK-NEXT:  1      1     0.33                        movq	%xmm0, %xmm2
+# CHECK-NEXT:  1      1     1.00                        movq	%rax, %xmm2
+# CHECK-NEXT:  1      6     0.50    *                   movq	(%rax), %xmm2
+# CHECK-NEXT:  1      2     1.00                        movq	%xmm0, %rcx
+# CHECK-NEXT:  1      1     1.00           *            movq	%xmm0, (%rax)
+# CHECK-NEXT:  1      1     0.33                        movq2dq	%mm0, %xmm2
+# CHECK-NEXT:  1      1     1.00                        movsd	%xmm0, %xmm2
+# CHECK-NEXT:  1      1     1.00           *            movsd	%xmm0, (%rax)
+# CHECK-NEXT:  1      6     0.50    *                   movsd	(%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        movupd	%xmm0, %xmm2
+# CHECK-NEXT:  1      1     1.00           *            movupd	%xmm0, (%rax)
+# CHECK-NEXT:  1      6     0.50    *                   movupd	(%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        mulpd	%xmm0, %xmm2
+# CHECK-NEXT:  2      11    1.00    *                   mulpd	(%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        mulsd	%xmm0, %xmm2
+# CHECK-NEXT:  2      11    1.00    *                   mulsd	(%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        orpd	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     1.00    *                   orpd	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        packssdw	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   packssdw	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        packsswb	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   packsswb	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        packuswb	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   packuswb	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        paddb	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   paddb	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        paddd	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   paddd	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        paddq	%mm0, %mm2
+# CHECK-NEXT:  2      7     0.50    *                   paddq	(%rax), %mm2
+# CHECK-NEXT:  1      1     0.50                        paddq	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   paddq	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        paddsb	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   paddsb	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        paddsw	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   paddsw	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        paddusb	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   paddusb	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        paddusw	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   paddusw	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        paddw	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   paddw	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.33                        pand	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pand	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.33                        pandn	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pandn	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        pavgb	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pavgb	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        pavgw	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pavgw	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        pcmpeqb	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pcmpeqb	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        pcmpeqd	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pcmpeqd	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        pcmpeqw	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pcmpeqw	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        pcmpgtb	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pcmpgtb	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        pcmpgtd	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pcmpgtd	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        pcmpgtw	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pcmpgtw	(%rax), %xmm2
+# CHECK-NEXT:  2      3     1.00                        pextrw	$1, %xmm0, %ecx
+# CHECK-NEXT:  1      5     1.00                        pmaddwd	%xmm0, %xmm2
+# CHECK-NEXT:  2      11    1.00    *                   pmaddwd	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        pmaxsw	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pmaxsw	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        pmaxub	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pmaxub	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        pminsw	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pminsw	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        pminub	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pminub	(%rax), %xmm2
+# CHECK-NEXT:  1      2     1.00                        pmovmskb	%xmm0, %ecx
+# CHECK-NEXT:  1      5     1.00                        pmulhuw	%xmm0, %xmm2
+# CHECK-NEXT:  2      11    1.00    *                   pmulhuw	(%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        pmulhw	%xmm0, %xmm2
+# CHECK-NEXT:  2      11    1.00    *                   pmulhw	(%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        pmullw	%xmm0, %xmm2
+# CHECK-NEXT:  2      11    1.00    *                   pmullw	(%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        pmuludq	%mm0, %mm2
+# CHECK-NEXT:  2      10    1.00    *                   pmuludq	(%rax), %mm2
+# CHECK-NEXT:  1      5     1.00                        pmuludq	%xmm0, %xmm2
+# CHECK-NEXT:  2      11    1.00    *                   pmuludq	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.33                        por	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   por	(%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        psadbw	%xmm0, %xmm2
+# CHECK-NEXT:  2      11    1.00    *                   psadbw	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        pshufd	$1, %xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pshufd	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        pshufhw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pshufhw	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        pshuflw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pshuflw	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        pslld	$1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        pslld	%xmm0, %xmm2
+# CHECK-NEXT:  3      8     1.00    *                   pslld	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        pslldq	$1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        psllq	$1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        psllq	%xmm0, %xmm2
+# CHECK-NEXT:  3      8     1.00    *                   psllq	(%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        psllw	$1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        psllw	%xmm0, %xmm2
+# CHECK-NEXT:  3      8     1.00    *                   psllw	(%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        psrad	$1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        psrad	%xmm0, %xmm2
+# CHECK-NEXT:  3      8     1.00    *                   psrad	(%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        psraw	$1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        psraw	%xmm0, %xmm2
+# CHECK-NEXT:  3      8     1.00    *                   psraw	(%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        psrld	$1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        psrld	%xmm0, %xmm2
+# CHECK-NEXT:  3      8     1.00    *                   psrld	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        psrldq	$1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        psrlq	$1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        psrlq	%xmm0, %xmm2
+# CHECK-NEXT:  3      8     1.00    *                   psrlq	(%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        psrlw	$1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        psrlw	%xmm0, %xmm2
+# CHECK-NEXT:  3      8     1.00    *                   psrlw	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        psubb	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   psubb	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        psubd	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   psubd	(%rax), %xmm2
+# CHECK-NEXT:  1      3     1.00                        psubq	%mm0, %mm2
+# CHECK-NEXT:  2      8     1.00    *                   psubq	(%rax), %mm2
+# CHECK-NEXT:  1      1     0.50                        psubq	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   psubq	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        psubsb	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   psubsb	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        psubsw	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   psubsw	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        psubusb	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   psubusb	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        psubusw	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   psubusw	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        psubw	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   psubw	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        punpckhbw	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   punpckhbw	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        punpckhdq	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   punpckhdq	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        punpckhqdq	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   punpckhqdq	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        punpckhwd	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   punpckhwd	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        punpcklbw	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   punpcklbw	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        punpckldq	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   punpckldq	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        punpcklqdq	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   punpcklqdq	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        punpcklwd	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   punpcklwd	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.33                        pxor	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pxor	(%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        shufpd	$1, %xmm0, %xmm2
+# CHECK-NEXT:  2      7     1.00    *                   shufpd	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      21    21.00                       sqrtpd	%xmm0, %xmm2
+# CHECK-NEXT:  2      27    21.00   *                   sqrtpd	(%rax), %xmm2
+# CHECK-NEXT:  1      21    21.00                       sqrtsd	%xmm0, %xmm2
+# CHECK-NEXT:  2      27    21.00   *                   sqrtsd	(%rax), %xmm2
+# CHECK-NEXT:  1      3     1.00                        subpd	%xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   subpd	(%rax), %xmm2
+# CHECK-NEXT:  1      3     1.00                        subsd	%xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   subsd	(%rax), %xmm2
+# CHECK-NEXT:  2      2     1.00                        ucomisd	%xmm0, %xmm1
+# CHECK-NEXT:  3      8     1.00    *                   ucomisd	(%rax), %xmm1
+# CHECK-NEXT:  1      1     1.00                        unpckhpd	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     1.00    *                   unpckhpd	(%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        unpcklpd	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     1.00    *                   unpcklpd	(%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        xorpd	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     1.00    *                   xorpd	(%rax), %xmm2
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -     172.00 75.83  117.33 16.00  98.83  66.00  66.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     addpd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   addpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     addsd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   addsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     andnpd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   andnpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     andpd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   andpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -     0.50   0.50   1.00   1.00   0.50   0.50   clflush	(%rax)
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     cmppd	$0, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   cmppd	$0, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     cmpsd	$0, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   cmpsd	$0, (%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     comisd	%xmm0, %xmm1
+# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   comisd	(%rax), %xmm1
+# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     cvtdq2pd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   cvtdq2pd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     cvtdq2ps	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   cvtdq2ps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     cvtpd2dq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   cvtpd2dq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     cvtpd2pi	%xmm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   cvtpd2pi	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     cvtpd2ps	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   cvtpd2ps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     cvtpi2pd	%mm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   cvtpi2pd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     cvtps2dq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   cvtps2dq	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     cvtps2pd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   cvtps2pd	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     cvtsd2si	%xmm0, %ecx
+# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     cvtsd2si	%xmm0, %rcx
+# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   cvtsd2si	(%rax), %ecx
+# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   cvtsd2si	(%rax), %rcx
+# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     cvtsd2ss	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   cvtsd2ss	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     cvtsi2sdl	%ecx, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     cvtsi2sdq	%rcx, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   cvtsi2sdl	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   cvtsi2sdl	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     cvtss2sd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   cvtss2sd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     cvttpd2dq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   cvttpd2dq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     cvttpd2pi	%xmm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   cvttpd2pi	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     cvttps2dq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   cvttps2dq	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     cvttsd2si	%xmm0, %ecx
+# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     cvttsd2si	%xmm0, %rcx
+# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   cvttsd2si	(%rax), %ecx
+# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   cvttsd2si	(%rax), %rcx
+# CHECK-NEXT:  -     22.00  1.00    -      -      -      -      -     divpd	%xmm0, %xmm2
+# CHECK-NEXT:  -     22.00  1.00    -      -      -     0.50   0.50   divpd	(%rax), %xmm2
+# CHECK-NEXT:  -     22.00  1.00    -      -      -      -      -     divsd	%xmm0, %xmm2
+# CHECK-NEXT:  -     22.00  1.00    -      -      -     0.50   0.50   divsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   lfence
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   maskmovdqu	%xmm0, %xmm1
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     maxpd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   maxpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     maxsd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   maxsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     minpd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   minpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     minsd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   minsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     movapd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movapd	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movapd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     movd	%eax, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movd	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     movd	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movd	%xmm0, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movdqa	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movdqa	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movdqa	(%rax), %xmm2
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movdqu	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movdqu	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movdqu	(%rax), %xmm2
+# CHECK-NEXT:  -      -     0.33   0.33    -     1.33    -      -     movdq2q	%xmm0, %mm2
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movhpd	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   movhpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movlpd	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   movlpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     movmskpd	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movntil	%eax, (%rax)
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movntiq	%rax, (%rax)
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movntdq	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movntpd	%xmm0, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     movq	%rax, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movq	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     movq	%xmm0, %rcx
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movq	%xmm0, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movq2dq	%mm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     movsd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movsd	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     movupd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movupd	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movupd	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     mulpd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   mulpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     mulsd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   mulsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     orpd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   orpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     packssdw	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   packssdw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     packsswb	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   packsswb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     packuswb	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   packuswb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     paddb	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   paddb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     paddd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   paddd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     paddq	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   paddq	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     paddq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   paddq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     paddsb	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   paddsb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     paddsw	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   paddsw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     paddusb	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   paddusb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     paddusw	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   paddusw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     paddw	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   paddw	(%rax), %xmm2
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     pand	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   pand	(%rax), %xmm2
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     pandn	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   pandn	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pavgb	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pavgb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pavgw	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pavgw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pcmpeqb	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pcmpeqb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pcmpeqd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pcmpeqd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pcmpeqw	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pcmpeqw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pcmpgtb	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pcmpgtb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pcmpgtd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pcmpgtd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pcmpgtw	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pcmpgtw	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     pextrw	$1, %xmm0, %ecx
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pmaddwd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   pmaddwd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pmaxsw	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pmaxsw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pmaxub	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pmaxub	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pminsw	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pminsw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pminub	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pminub	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pmovmskb	%xmm0, %ecx
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pmulhuw	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   pmulhuw	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pmulhw	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   pmulhw	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pmullw	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   pmullw	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pmuludq	%mm0, %mm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   pmuludq	(%rax), %mm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pmuludq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   pmuludq	(%rax), %xmm2
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     por	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   por	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     psadbw	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   psadbw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pshufd	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pshufd	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pshufhw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pshufhw	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pshuflw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pshuflw	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pslld	$1, %xmm2
+# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     pslld	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00   0.50    -     0.50   0.50   0.50   pslld	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pslldq	$1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     psllq	$1, %xmm2
+# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     psllq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00   0.50    -     0.50   0.50   0.50   psllq	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     psllw	$1, %xmm2
+# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     psllw	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00   0.50    -     0.50   0.50   0.50   psllw	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     psrad	$1, %xmm2
+# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     psrad	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00   0.50    -     0.50   0.50   0.50   psrad	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     psraw	$1, %xmm2
+# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     psraw	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00   0.50    -     0.50   0.50   0.50   psraw	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     psrld	$1, %xmm2
+# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     psrld	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00   0.50    -     0.50   0.50   0.50   psrld	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     psrldq	$1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     psrlq	$1, %xmm2
+# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     psrlq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00   0.50    -     0.50   0.50   0.50   psrlq	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     psrlw	$1, %xmm2
+# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     psrlw	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00   0.50    -     0.50   0.50   0.50   psrlw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     psubb	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   psubb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     psubd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   psubd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     psubq	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   psubq	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     psubq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   psubq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     psubsb	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   psubsb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     psubsw	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   psubsw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     psubusb	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   psubusb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     psubusw	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   psubusw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     psubw	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   psubw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     punpckhbw	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   punpckhbw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     punpckhdq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   punpckhdq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     punpckhqdq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   punpckhqdq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     punpckhwd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   punpckhwd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     punpcklbw	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   punpcklbw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     punpckldq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   punpckldq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     punpcklqdq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   punpcklqdq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     punpcklwd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   punpcklwd	(%rax), %xmm2
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     pxor	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   pxor	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     shufpd	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   shufpd	$1, (%rax), %xmm2
+# CHECK-NEXT:  -     21.00  1.00    -      -      -      -      -     sqrtpd	%xmm0, %xmm2
+# CHECK-NEXT:  -     21.00  1.00    -      -      -     0.50   0.50   sqrtpd	(%rax), %xmm2
+# CHECK-NEXT:  -     21.00  1.00    -      -      -      -      -     sqrtsd	%xmm0, %xmm2
+# CHECK-NEXT:  -     21.00  1.00    -      -      -     0.50   0.50   sqrtsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     subpd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   subpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     subsd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   subsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     ucomisd	%xmm0, %xmm1
+# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   ucomisd	(%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     unpckhpd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   unpckhpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     unpcklpd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   unpcklpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     xorpd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   xorpd	(%rax), %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-sse3.s b/test/tools/llvm-mca/X86/BdVer2/resources-sse3.s
new file mode 100644
index 00000000000..8438e1a7a84
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-sse3.s
@@ -0,0 +1,96 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+
+addsubpd  %xmm0, %xmm2
+addsubpd  (%rax),  %xmm2
+
+addsubps  %xmm0, %xmm2
+addsubps  (%rax), %xmm2
+
+haddpd    %xmm0, %xmm2
+haddpd    (%rax), %xmm2
+
+haddps    %xmm0, %xmm2
+haddps    (%rax), %xmm2
+
+hsubpd    %xmm0, %xmm2
+hsubpd    (%rax), %xmm2
+
+hsubps    %xmm0, %xmm2
+hsubps    (%rax), %xmm2
+
+lddqu     (%rax), %xmm2
+
+movddup   %xmm0, %xmm2
+movddup   (%rax), %xmm2
+
+movshdup  %xmm0, %xmm2
+movshdup  (%rax), %xmm2
+
+movsldup  %xmm0, %xmm2
+movsldup  (%rax), %xmm2
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      3     1.00                        addsubpd	%xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   addsubpd	(%rax), %xmm2
+# CHECK-NEXT:  1      3     1.00                        addsubps	%xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   addsubps	(%rax), %xmm2
+# CHECK-NEXT:  3      5     2.00                        haddpd	%xmm0, %xmm2
+# CHECK-NEXT:  4      11    2.00    *                   haddpd	(%rax), %xmm2
+# CHECK-NEXT:  3      5     2.00                        haddps	%xmm0, %xmm2
+# CHECK-NEXT:  4      11    2.00    *                   haddps	(%rax), %xmm2
+# CHECK-NEXT:  3      5     2.00                        hsubpd	%xmm0, %xmm2
+# CHECK-NEXT:  4      11    2.00    *                   hsubpd	(%rax), %xmm2
+# CHECK-NEXT:  3      5     2.00                        hsubps	%xmm0, %xmm2
+# CHECK-NEXT:  4      11    2.00    *                   hsubps	(%rax), %xmm2
+# CHECK-NEXT:  1      6     0.50    *                   lddqu	(%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        movddup	%xmm0, %xmm2
+# CHECK-NEXT:  1      6     0.50    *                   movddup	(%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        movshdup	%xmm0, %xmm2
+# CHECK-NEXT:  1      6     0.50    *                   movshdup	(%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        movsldup	%xmm0, %xmm2
+# CHECK-NEXT:  1      6     0.50    *                   movsldup	(%rax), %xmm2
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -      -     12.00   -     19.00  5.00   5.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     addsubpd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   addsubpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     addsubps	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   addsubps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     2.00    -      -     haddpd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     2.00   0.50   0.50   haddpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     2.00    -      -     haddps	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     2.00   0.50   0.50   haddps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     2.00    -      -     hsubpd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     2.00   0.50   0.50   hsubpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     2.00    -      -     hsubps	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     2.00   0.50   0.50   hsubps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   lddqu	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     movddup	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movddup	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     movshdup	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movshdup	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     movsldup	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movsldup	(%rax), %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-sse41.s b/test/tools/llvm-mca/X86/BdVer2/resources-sse41.s
new file mode 100644
index 00000000000..08c6ccfde8f
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-sse41.s
@@ -0,0 +1,366 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+
+blendpd     $11, %xmm0, %xmm2
+blendpd     $11, (%rax), %xmm2
+
+blendps     $11, %xmm0, %xmm2
+blendps     $11, (%rax), %xmm2
+
+blendvpd    %xmm0, %xmm2
+blendvpd    (%rax), %xmm2
+
+blendvps    %xmm0, %xmm2
+blendvps    (%rax), %xmm2
+
+dppd        $22, %xmm0, %xmm2
+dppd        $22, (%rax), %xmm2
+
+dpps        $22, %xmm0, %xmm2
+dpps        $22, (%rax), %xmm2
+
+extractps   $1, %xmm0, %rcx
+extractps   $1, %xmm0, (%rax)
+
+insertps    $1, %xmm0, %xmm2
+insertps    $1, (%rax), %xmm2
+
+movntdqa    (%rax), %xmm2
+
+mpsadbw     $1, %xmm0, %xmm2
+mpsadbw     $1, (%rax), %xmm2
+
+packusdw    %xmm0, %xmm2
+packusdw    (%rax), %xmm2
+
+pblendvb    %xmm0, %xmm2
+pblendvb    (%rax), %xmm2
+
+pblendw     $11, %xmm0, %xmm2
+pblendw     $11, (%rax), %xmm2
+
+pcmpeqq     %xmm0, %xmm2
+pcmpeqq     (%rax), %xmm2
+
+pextrb      $1, %xmm0, %ecx
+pextrb      $1, %xmm0, (%rax)
+
+pextrd      $1, %xmm0, %ecx
+pextrd      $1, %xmm0, (%rax)
+
+pextrq      $1, %xmm0, %rcx
+pextrq      $1, %xmm0, (%rax)
+
+pextrw      $1, %xmm0, (%rax)
+
+phminposuw  %xmm0, %xmm2
+phminposuw  (%rax), %xmm2
+
+pinsrb      $1, %eax, %xmm1
+pinsrb      $1, (%rax), %xmm1
+
+pinsrd      $1, %eax, %xmm1
+pinsrd      $1, (%rax), %xmm1
+
+pinsrq      $1, %rax, %xmm1
+pinsrq      $1, (%rax), %xmm1
+
+pmaxsb      %xmm0, %xmm2
+pmaxsb      (%rax), %xmm2
+
+pmaxsd      %xmm0, %xmm2
+pmaxsd      (%rax), %xmm2
+
+pmaxud      %xmm0, %xmm2
+pmaxud      (%rax), %xmm2
+
+pmaxuw      %xmm0, %xmm2
+pmaxuw      (%rax), %xmm2
+
+pminsb      %xmm0, %xmm2
+pminsb      (%rax), %xmm2
+
+pminsd      %xmm0, %xmm2
+pminsd      (%rax), %xmm2
+
+pminud      %xmm0, %xmm2
+pminud      (%rax), %xmm2
+
+pminuw      %xmm0, %xmm2
+pminuw      (%rax), %xmm2
+
+pmovsxbd    %xmm0, %xmm2
+pmovsxbd    (%rax), %xmm2
+
+pmovsxbq    %xmm0, %xmm2
+pmovsxbq    (%rax), %xmm2
+
+pmovsxbw    %xmm0, %xmm2
+pmovsxbw    (%rax), %xmm2
+
+pmovsxdq    %xmm0, %xmm2
+pmovsxdq    (%rax), %xmm2
+
+pmovsxwd    %xmm0, %xmm2
+pmovsxwd    (%rax), %xmm2
+
+pmovsxwq    %xmm0, %xmm2
+pmovsxwq    (%rax), %xmm2
+
+pmovzxbd    %xmm0, %xmm2
+pmovzxbd    (%rax), %xmm2
+
+pmovzxbq    %xmm0, %xmm2
+pmovzxbq    (%rax), %xmm2
+
+pmovzxbw    %xmm0, %xmm2
+pmovzxbw    (%rax), %xmm2
+
+pmovzxdq    %xmm0, %xmm2
+pmovzxdq    (%rax), %xmm2
+
+pmovzxwd    %xmm0, %xmm2
+pmovzxwd    (%rax), %xmm2
+
+pmovzxwq    %xmm0, %xmm2
+pmovzxwq    (%rax), %xmm2
+
+pmuldq      %xmm0, %xmm2
+pmuldq      (%rax), %xmm2
+
+pmulld      %xmm0, %xmm2
+pmulld      (%rax), %xmm2
+
+ptest       %xmm0, %xmm1
+ptest       (%rax), %xmm1
+
+roundpd     $1, %xmm0, %xmm2
+roundpd     $1, (%rax), %xmm2
+
+roundps     $1, %xmm0, %xmm2
+roundps     $1, (%rax), %xmm2
+
+roundsd     $1, %xmm0, %xmm2
+roundsd     $1, (%rax), %xmm2
+
+roundss     $1, %xmm0, %xmm2
+roundss     $1, (%rax), %xmm2
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     0.50                        blendpd	$11, %xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   blendpd	$11, (%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        blendps	$11, %xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   blendps	$11, (%rax), %xmm2
+# CHECK-NEXT:  2      2     1.00                        blendvpd	%xmm0, %xmm0, %xmm2
+# CHECK-NEXT:  3      8     1.00    *                   blendvpd	%xmm0, (%rax), %xmm2
+# CHECK-NEXT:  2      2     1.00                        blendvps	%xmm0, %xmm0, %xmm2
+# CHECK-NEXT:  3      8     1.00    *                   blendvps	%xmm0, (%rax), %xmm2
+# CHECK-NEXT:  3      9     1.00                        dppd	$22, %xmm0, %xmm2
+# CHECK-NEXT:  4      15    1.00    *                   dppd	$22, (%rax), %xmm2
+# CHECK-NEXT:  4      12    2.00                        dpps	$22, %xmm0, %xmm2
+# CHECK-NEXT:  5      18    2.00    *                   dpps	$22, (%rax), %xmm2
+# CHECK-NEXT:  2      3     1.00                        extractps	$1, %xmm0, %ecx
+# CHECK-NEXT:  3      5     1.00           *            extractps	$1, %xmm0, (%rax)
+# CHECK-NEXT:  1      1     1.00                        insertps	$1, %xmm0, %xmm2
+# CHECK-NEXT:  2      7     1.00    *                   insertps	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      6     0.50    *                   movntdqa	(%rax), %xmm2
+# CHECK-NEXT:  3      7     1.00                        mpsadbw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  4      13    1.00    *                   mpsadbw	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        packusdw	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   packusdw	(%rax), %xmm2
+# CHECK-NEXT:  2      2     1.00                        pblendvb	%xmm0, %xmm0, %xmm2
+# CHECK-NEXT:  3      8     1.00    *                   pblendvb	%xmm0, (%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        pblendw	$11, %xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pblendw	$11, (%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        pcmpeqq	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pcmpeqq	(%rax), %xmm2
+# CHECK-NEXT:  2      3     1.00                        pextrb	$1, %xmm0, %ecx
+# CHECK-NEXT:  3      5     1.00           *            pextrb	$1, %xmm0, (%rax)
+# CHECK-NEXT:  2      3     1.00                        pextrd	$1, %xmm0, %ecx
+# CHECK-NEXT:  4      5     1.00           *            pextrd	$1, %xmm0, (%rax)
+# CHECK-NEXT:  2      3     1.00                        pextrq	$1, %xmm0, %rcx
+# CHECK-NEXT:  4      5     1.00           *            pextrq	$1, %xmm0, (%rax)
+# CHECK-NEXT:  3      5     1.00           *            pextrw	$1, %xmm0, (%rax)
+# CHECK-NEXT:  1      5     1.00                        phminposuw	%xmm0, %xmm2
+# CHECK-NEXT:  2      11    1.00    *                   phminposuw	(%rax), %xmm2
+# CHECK-NEXT:  2      2     1.00                        pinsrb	$1, %eax, %xmm1
+# CHECK-NEXT:  2      7     0.50    *                   pinsrb	$1, (%rax), %xmm1
+# CHECK-NEXT:  2      2     1.00                        pinsrd	$1, %eax, %xmm1
+# CHECK-NEXT:  2      7     0.50    *                   pinsrd	$1, (%rax), %xmm1
+# CHECK-NEXT:  2      2     1.00                        pinsrq	$1, %rax, %xmm1
+# CHECK-NEXT:  2      7     0.50    *                   pinsrq	$1, (%rax), %xmm1
+# CHECK-NEXT:  1      1     0.50                        pmaxsb	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pmaxsb	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        pmaxsd	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pmaxsd	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        pmaxud	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pmaxud	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        pmaxuw	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pmaxuw	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        pminsb	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pminsb	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        pminsd	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pminsd	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        pminud	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pminud	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        pminuw	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pminuw	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        pmovsxbd	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pmovsxbd	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        pmovsxbq	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pmovsxbq	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        pmovsxbw	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pmovsxbw	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        pmovsxdq	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pmovsxdq	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        pmovsxwd	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pmovsxwd	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        pmovsxwq	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pmovsxwq	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        pmovzxbd	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pmovzxbd	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        pmovzxbq	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pmovzxbq	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        pmovzxbw	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pmovzxbw	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        pmovzxdq	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pmovzxdq	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        pmovzxwd	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pmovzxwd	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        pmovzxwq	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pmovzxwq	(%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        pmuldq	%xmm0, %xmm2
+# CHECK-NEXT:  2      11    1.00    *                   pmuldq	(%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        pmulld	%xmm0, %xmm2
+# CHECK-NEXT:  2      11    1.00    *                   pmulld	(%rax), %xmm2
+# CHECK-NEXT:  2      2     1.00                        ptest	%xmm0, %xmm1
+# CHECK-NEXT:  3      8     1.00    *                   ptest	(%rax), %xmm1
+# CHECK-NEXT:  1      3     1.00                        roundpd	$1, %xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   roundpd	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      3     1.00                        roundps	$1, %xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   roundps	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      3     1.00                        roundsd	$1, %xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   roundsd	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      3     1.00                        roundss	$1, %xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   roundss	$1, (%rax), %xmm2
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     26.00  47.50  5.00   52.50  24.50  24.50
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     blendpd	$11, %xmm0, %xmm2
+# CHECK-NEXT:  -      -     0.50    -      -     0.50   0.50   0.50   blendpd	$11, (%rax), %xmm2
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     blendps	$11, %xmm0, %xmm2
+# CHECK-NEXT:  -      -     0.50    -      -     0.50   0.50   0.50   blendps	$11, (%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     blendvpd	%xmm0, %xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -     1.00   0.50   0.50   blendvpd	%xmm0, (%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     blendvps	%xmm0, %xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -     1.00   0.50   0.50   blendvps	%xmm0, (%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -     dppd	$22, %xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00   1.00    -     1.00   0.50   0.50   dppd	$22, (%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00   2.00    -     1.00    -      -     dpps	$22, %xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00   2.00    -     1.00   0.50   0.50   dpps	$22, (%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     extractps	$1, %xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   0.50   0.50   extractps	$1, %xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     insertps	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   insertps	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movntdqa	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -     mpsadbw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00   1.00    -     1.00   0.50   0.50   mpsadbw	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     packusdw	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   packusdw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     pblendvb	%xmm0, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   pblendvb	%xmm0, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pblendw	$11, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pblendw	$11, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pcmpeqq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pcmpeqq	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     pextrb	$1, %xmm0, %ecx
+# CHECK-NEXT:  -      -      -     0.50   1.00   0.50   0.50   0.50   pextrb	$1, %xmm0, (%rax)
+# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     pextrd	$1, %xmm0, %ecx
+# CHECK-NEXT:  -      -     1.00   0.50   1.00   0.50   0.50   0.50   pextrd	$1, %xmm0, (%rax)
+# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     pextrq	$1, %xmm0, %rcx
+# CHECK-NEXT:  -      -     1.00   0.50   1.00   0.50   0.50   0.50   pextrq	$1, %xmm0, (%rax)
+# CHECK-NEXT:  -      -      -     0.50   1.00   0.50   0.50   0.50   pextrw	$1, %xmm0, (%rax)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     phminposuw	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   phminposuw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     1.50    -      -     pinsrb	$1, %eax, %xmm1
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pinsrb	$1, (%rax), %xmm1
+# CHECK-NEXT:  -      -      -     0.50    -     1.50    -      -     pinsrd	$1, %eax, %xmm1
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pinsrd	$1, (%rax), %xmm1
+# CHECK-NEXT:  -      -      -     0.50    -     1.50    -      -     pinsrq	$1, %rax, %xmm1
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pinsrq	$1, (%rax), %xmm1
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pmaxsb	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pmaxsb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pmaxsd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pmaxsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pmaxud	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pmaxud	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pmaxuw	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pmaxuw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pminsb	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pminsb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pminsd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pminsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pminud	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pminud	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pminuw	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pminuw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pmovsxbd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pmovsxbd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pmovsxbq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pmovsxbq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pmovsxbw	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pmovsxbw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pmovsxdq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pmovsxdq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pmovsxwd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pmovsxwd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pmovsxwq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pmovsxwq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pmovzxbd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pmovzxbd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pmovzxbq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pmovzxbq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pmovzxbw	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pmovzxbw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pmovzxdq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pmovzxdq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pmovzxwd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pmovzxwd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pmovzxwq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pmovzxwq	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pmuldq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   pmuldq	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pmulld	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   pmulld	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     ptest	%xmm0, %xmm1
+# CHECK-NEXT:  -      -     1.00    -      -     1.00   0.50   0.50   ptest	(%rax), %xmm1
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     roundpd	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   roundpd	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     roundps	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   roundps	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     roundsd	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   roundsd	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     roundss	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   roundss	$1, (%rax), %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-sse42.s b/test/tools/llvm-mca/X86/BdVer2/resources-sse42.s
new file mode 100644
index 00000000000..935c5e3d7f3
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-sse42.s
@@ -0,0 +1,99 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+
+crc32b      %al, %ecx
+crc32b      (%rax), %ecx
+
+crc32l      %eax, %ecx
+crc32l      (%rax), %ecx
+
+crc32w      %ax, %ecx
+crc32w      (%rax), %ecx
+
+crc32b      %al, %rcx
+crc32b      (%rax), %rcx
+
+crc32q      %rax, %rcx
+crc32q      (%rax), %rcx
+
+pcmpestri   $1, %xmm0, %xmm2
+pcmpestri   $1, (%rax), %xmm2
+
+pcmpestrm   $1, %xmm0, %xmm2
+pcmpestrm   $1, (%rax), %xmm2
+
+pcmpistri   $1, %xmm0, %xmm2
+pcmpistri   $1, (%rax), %xmm2
+
+pcmpistrm   $1, %xmm0, %xmm2
+pcmpistrm   $1, (%rax), %xmm2
+
+pcmpgtq     %xmm0, %xmm2
+pcmpgtq     (%rax), %xmm2
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      3     1.00                        crc32b	%al, %ecx
+# CHECK-NEXT:  2      8     1.00    *                   crc32b	(%rax), %ecx
+# CHECK-NEXT:  1      3     1.00                        crc32l	%eax, %ecx
+# CHECK-NEXT:  2      8     1.00    *                   crc32l	(%rax), %ecx
+# CHECK-NEXT:  1      3     1.00                        crc32w	%ax, %ecx
+# CHECK-NEXT:  2      8     1.00    *                   crc32w	(%rax), %ecx
+# CHECK-NEXT:  1      3     1.00                        crc32b	%al, %rcx
+# CHECK-NEXT:  2      8     1.00    *                   crc32b	(%rax), %rcx
+# CHECK-NEXT:  1      3     1.00                        crc32q	%rax, %rcx
+# CHECK-NEXT:  2      8     1.00    *                   crc32q	(%rax), %rcx
+# CHECK-NEXT:  1      4     2.67                        pcmpestri	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      4     2.33    *                   pcmpestri	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      11    2.67                        pcmpestrm	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      11    2.33    *                   pcmpestrm	$1, (%rax), %xmm2
+# CHECK-NEXT:  3      11    3.00                        pcmpistri	$1, %xmm0, %xmm2
+# CHECK-NEXT:  4      17    3.00    *                   pcmpistri	$1, (%rax), %xmm2
+# CHECK-NEXT:  3      11    3.00                        pcmpistrm	$1, %xmm0, %xmm2
+# CHECK-NEXT:  4      17    3.00    *                   pcmpistrm	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        pcmpgtq	%xmm0, %xmm2
+# CHECK-NEXT:  2      11    1.00    *                   pcmpgtq	(%rax), %xmm2
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     24.00  20.00   -     10.00  5.00   5.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     crc32b	%al, %ecx
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   crc32b	(%rax), %ecx
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     crc32l	%eax, %ecx
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   crc32l	(%rax), %ecx
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     crc32w	%ax, %ecx
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   crc32w	(%rax), %ecx
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     crc32b	%al, %rcx
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   crc32b	(%rax), %rcx
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     crc32q	%rax, %rcx
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   crc32q	(%rax), %rcx
+# CHECK-NEXT:  -      -     2.67   2.67    -     2.67    -      -     pcmpestri	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -     2.33   2.33    -     2.33   0.50   0.50   pcmpestri	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -     2.67   2.67    -     2.67    -      -     pcmpestrm	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -     2.33   2.33    -     2.33   0.50   0.50   pcmpestrm	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -     3.00    -      -      -      -      -     pcmpistri	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -     3.00    -      -      -     0.50   0.50   pcmpistri	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -     3.00    -      -      -      -      -     pcmpistrm	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -     3.00    -      -      -     0.50   0.50   pcmpistrm	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pcmpgtq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   pcmpgtq	(%rax), %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-sse4a.s b/test/tools/llvm-mca/X86/BdVer2/resources-sse4a.s
new file mode 100644
index 00000000000..f4b9c94d48a
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-sse4a.s
@@ -0,0 +1,50 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+
+extrq       %xmm0, %xmm2
+extrq       $22, $2, %xmm2
+
+insertq     %xmm0, %xmm2
+insertq     $22, $22, %xmm0, %xmm2
+
+movntsd     %xmm0, (%rax)
+movntss     %xmm0, (%rax)
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     0.50                        extrq	%xmm0, %xmm2
+# CHECK-NEXT:  1      1     0.50                        extrq	$22, $2, %xmm2
+# CHECK-NEXT:  1      1     0.50                        insertq	%xmm0, %xmm2
+# CHECK-NEXT:  1      1     0.50                        insertq	$22, $22, %xmm0, %xmm2
+# CHECK-NEXT:  1      1     1.00           *            movntsd	%xmm0, (%rax)
+# CHECK-NEXT:  1      1     1.00           *            movntss	%xmm0, (%rax)
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -      -     2.00   2.00   2.00   1.00   1.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     extrq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     extrq	$22, $2, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     insertq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     insertq	$22, $22, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movntsd	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movntss	%xmm0, (%rax)
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-ssse3.s b/test/tools/llvm-mca/X86/BdVer2/resources-ssse3.s
new file mode 100644
index 00000000000..c341022a288
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-ssse3.s
@@ -0,0 +1,253 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+
+pabsb       %mm0, %mm2
+pabsb       (%rax), %mm2
+
+pabsb       %xmm0, %xmm2
+pabsb       (%rax), %xmm2
+
+pabsd       %mm0, %mm2
+pabsd       (%rax), %mm2
+
+pabsd       %xmm0, %xmm2
+pabsd       (%rax), %xmm2
+
+pabsw       %mm0, %mm2
+pabsw       (%rax), %mm2
+
+pabsw       %xmm0, %xmm2
+pabsw       (%rax), %xmm2
+
+palignr     $1, %mm0, %mm2
+palignr     $1, (%rax), %mm2
+
+palignr     $1, %xmm0, %xmm2
+palignr     $1, (%rax), %xmm2
+
+phaddd      %mm0, %mm2
+phaddd      (%rax), %mm2
+
+phaddd      %xmm0, %xmm2
+phaddd      (%rax), %xmm2
+
+phaddsw     %mm0, %mm2
+phaddsw     (%rax), %mm2
+
+phaddsw     %xmm0, %xmm2
+phaddsw     (%rax), %xmm2
+
+phaddw      %mm0, %mm2
+phaddw      (%rax), %mm2
+
+phaddw      %xmm0, %xmm2
+phaddw      (%rax), %xmm2
+
+phsubd      %mm0, %mm2
+phsubd      (%rax), %mm2
+
+phsubd      %xmm0, %xmm2
+phsubd      (%rax), %xmm2
+
+phsubsw     %mm0, %mm2
+phsubsw     (%rax), %mm2
+
+phsubsw     %xmm0, %xmm2
+phsubsw     (%rax), %xmm2
+
+phsubw      %mm0, %mm2
+phsubw      (%rax), %mm2
+
+phsubw      %xmm0, %xmm2
+phsubw      (%rax), %xmm2
+
+pmaddubsw   %mm0, %mm2
+pmaddubsw   (%rax), %mm2
+
+pmaddubsw   %xmm0, %xmm2
+pmaddubsw   (%rax), %xmm2
+
+pmulhrsw    %mm0, %mm2
+pmulhrsw    (%rax), %mm2
+
+pmulhrsw    %xmm0, %xmm2
+pmulhrsw    (%rax), %xmm2
+
+pshufb      %mm0, %mm2
+pshufb      (%rax), %mm2
+
+pshufb      %xmm0, %xmm2
+pshufb      (%rax), %xmm2
+
+psignb      %mm0, %mm2
+psignb      (%rax), %mm2
+
+psignb      %xmm0, %xmm2
+psignb      (%rax), %xmm2
+
+psignd      %mm0, %mm2
+psignd      (%rax), %mm2
+
+psignd      %xmm0, %xmm2
+psignd      (%rax), %xmm2
+
+psignw      %mm0, %mm2
+psignw      (%rax), %mm2
+
+psignw      %xmm0, %xmm2
+psignw      (%rax), %xmm2
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     0.50                        pabsb	%mm0, %mm2
+# CHECK-NEXT:  2      6     0.50    *                   pabsb	(%rax), %mm2
+# CHECK-NEXT:  1      1     0.50                        pabsb	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pabsb	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        pabsd	%mm0, %mm2
+# CHECK-NEXT:  2      6     0.50    *                   pabsd	(%rax), %mm2
+# CHECK-NEXT:  1      1     0.50                        pabsd	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pabsd	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        pabsw	%mm0, %mm2
+# CHECK-NEXT:  2      6     0.50    *                   pabsw	(%rax), %mm2
+# CHECK-NEXT:  1      1     0.50                        pabsw	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pabsw	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        palignr	$1, %mm0, %mm2
+# CHECK-NEXT:  2      6     0.50    *                   palignr	$1, (%rax), %mm2
+# CHECK-NEXT:  1      1     0.50                        palignr	$1, %xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   palignr	$1, (%rax), %xmm2
+# CHECK-NEXT:  3      3     1.50                        phaddd	%mm0, %mm2
+# CHECK-NEXT:  4      8     1.50    *                   phaddd	(%rax), %mm2
+# CHECK-NEXT:  3      3     1.50                        phaddd	%xmm0, %xmm2
+# CHECK-NEXT:  4      9     1.50    *                   phaddd	(%rax), %xmm2
+# CHECK-NEXT:  3      3     1.50                        phaddsw	%mm0, %mm2
+# CHECK-NEXT:  4      8     1.50    *                   phaddsw	(%rax), %mm2
+# CHECK-NEXT:  3      3     1.50                        phaddsw	%xmm0, %xmm2
+# CHECK-NEXT:  4      9     1.50    *                   phaddsw	(%rax), %xmm2
+# CHECK-NEXT:  3      3     1.50                        phaddw	%mm0, %mm2
+# CHECK-NEXT:  4      8     1.50    *                   phaddw	(%rax), %mm2
+# CHECK-NEXT:  3      3     1.50                        phaddw	%xmm0, %xmm2
+# CHECK-NEXT:  4      9     1.50    *                   phaddw	(%rax), %xmm2
+# CHECK-NEXT:  3      3     1.50                        phsubd	%mm0, %mm2
+# CHECK-NEXT:  4      8     1.50    *                   phsubd	(%rax), %mm2
+# CHECK-NEXT:  3      3     1.50                        phsubd	%xmm0, %xmm2
+# CHECK-NEXT:  4      9     1.50    *                   phsubd	(%rax), %xmm2
+# CHECK-NEXT:  3      3     1.50                        phsubsw	%mm0, %mm2
+# CHECK-NEXT:  4      8     1.50    *                   phsubsw	(%rax), %mm2
+# CHECK-NEXT:  3      3     1.50                        phsubsw	%xmm0, %xmm2
+# CHECK-NEXT:  4      9     1.50    *                   phsubsw	(%rax), %xmm2
+# CHECK-NEXT:  3      3     1.50                        phsubw	%mm0, %mm2
+# CHECK-NEXT:  4      8     1.50    *                   phsubw	(%rax), %mm2
+# CHECK-NEXT:  3      3     1.50                        phsubw	%xmm0, %xmm2
+# CHECK-NEXT:  4      9     1.50    *                   phsubw	(%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        pmaddubsw	%mm0, %mm2
+# CHECK-NEXT:  2      10    1.00    *                   pmaddubsw	(%rax), %mm2
+# CHECK-NEXT:  1      5     1.00                        pmaddubsw	%xmm0, %xmm2
+# CHECK-NEXT:  2      11    1.00    *                   pmaddubsw	(%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        pmulhrsw	%mm0, %mm2
+# CHECK-NEXT:  2      10    1.00    *                   pmulhrsw	(%rax), %mm2
+# CHECK-NEXT:  1      5     1.00                        pmulhrsw	%xmm0, %xmm2
+# CHECK-NEXT:  2      11    1.00    *                   pmulhrsw	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        pshufb	%mm0, %mm2
+# CHECK-NEXT:  2      6     0.50    *                   pshufb	(%rax), %mm2
+# CHECK-NEXT:  1      1     0.50                        pshufb	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   pshufb	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        psignb	%mm0, %mm2
+# CHECK-NEXT:  2      6     0.50    *                   psignb	(%rax), %mm2
+# CHECK-NEXT:  1      1     0.50                        psignb	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   psignb	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        psignd	%mm0, %mm2
+# CHECK-NEXT:  2      6     0.50    *                   psignd	(%rax), %mm2
+# CHECK-NEXT:  1      1     0.50                        psignd	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   psignd	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        psignw	%mm0, %mm2
+# CHECK-NEXT:  2      6     0.50    *                   psignw	(%rax), %mm2
+# CHECK-NEXT:  1      1     0.50                        psignw	%xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   psignw	(%rax), %xmm2
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     8.00   52.00   -     52.00  16.00  16.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pabsb	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pabsb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pabsb	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pabsb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pabsd	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pabsd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pabsd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pabsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pabsw	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pabsw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pabsw	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pabsw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     palignr	$1, %mm0, %mm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   palignr	$1, (%rax), %mm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     palignr	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   palignr	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     phaddd	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   phaddd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     phaddd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   phaddd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     phaddsw	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   phaddsw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     phaddsw	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   phaddsw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     phaddw	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   phaddw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     phaddw	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   phaddw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     phsubd	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   phsubd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     phsubd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   phsubd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     phsubsw	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   phsubsw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     phsubsw	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   phsubsw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     phsubw	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   phsubw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     phsubw	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   phsubw	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pmaddubsw	%mm0, %mm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   pmaddubsw	(%rax), %mm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pmaddubsw	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   pmaddubsw	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pmulhrsw	%mm0, %mm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   pmulhrsw	(%rax), %mm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pmulhrsw	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   pmulhrsw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pshufb	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pshufb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pshufb	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pshufb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     psignb	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   psignb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     psignb	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   psignb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     psignd	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   psignd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     psignd	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   psignd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     psignw	%mm0, %mm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   psignw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     psignw	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   psignw	(%rax), %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-tbm.s b/test/tools/llvm-mca/X86/BdVer2/resources-tbm.s
new file mode 100644
index 00000000000..ebe9975654c
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-tbm.s
@@ -0,0 +1,169 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+
+bextr        $8192, %ebx, %ecx
+bextr        $8192, (%rbx), %ecx
+
+bextr        $16384, %rbx, %rcx
+bextr        $16384, (%rbx), %rcx
+
+blcfill      %eax, %ecx
+blcfill      (%rax), %ecx
+
+blcfill      %rax, %rcx
+blcfill      (%rax), %rcx
+
+blci         %eax, %ecx
+blci         (%rax), %ecx
+
+blci         %rax, %rcx
+blci         (%rax), %rcx
+
+blcic        %eax, %ecx
+blcic        (%rax), %ecx
+
+blcic        %rax, %rcx
+blcic        (%rax), %rcx
+
+blcmsk       %eax, %ecx
+blcmsk       (%rax), %ecx
+
+blcmsk       %rax, %rcx
+blcmsk       (%rax), %rcx
+
+blcs         %eax, %ecx
+blcs         (%rax), %ecx
+
+blcs         %rax, %rcx
+blcs         (%rax), %rcx
+
+blsfill      %eax, %ecx
+blsfill      (%rax), %ecx
+
+blsfill      %rax, %rcx
+blsfill      (%rax), %rcx
+
+blsic        %eax, %ecx
+blsic        (%rax), %ecx
+
+blsic        %rax, %rcx
+blsic        (%rax), %rcx
+
+t1mskc       %eax, %ecx
+t1mskc       (%rax), %ecx
+
+t1mskc       %rax, %rcx
+t1mskc       (%rax), %rcx
+
+tzmsk        %eax, %ecx
+tzmsk        (%rax), %ecx
+
+tzmsk        %rax, %rcx
+tzmsk        (%rax), %rcx
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  2      2     1.00                        bextrl	$8192, %ebx, %ecx
+# CHECK-NEXT:  3      7     1.00    *                   bextrl	$8192, (%rbx), %ecx
+# CHECK-NEXT:  2      2     1.00                        bextrq	$16384, %rbx, %rcx
+# CHECK-NEXT:  3      7     1.00    *                   bextrq	$16384, (%rbx), %rcx
+# CHECK-NEXT:  1      1     0.33                        blcfilll	%eax, %ecx
+# CHECK-NEXT:  2      6     0.50    *                   blcfilll	(%rax), %ecx
+# CHECK-NEXT:  1      1     0.33                        blcfillq	%rax, %rcx
+# CHECK-NEXT:  2      6     0.50    *                   blcfillq	(%rax), %rcx
+# CHECK-NEXT:  1      1     0.33                        blcil	%eax, %ecx
+# CHECK-NEXT:  2      6     0.50    *                   blcil	(%rax), %ecx
+# CHECK-NEXT:  1      1     0.33                        blciq	%rax, %rcx
+# CHECK-NEXT:  2      6     0.50    *                   blciq	(%rax), %rcx
+# CHECK-NEXT:  1      1     0.33                        blcicl	%eax, %ecx
+# CHECK-NEXT:  2      6     0.50    *                   blcicl	(%rax), %ecx
+# CHECK-NEXT:  1      1     0.33                        blcicq	%rax, %rcx
+# CHECK-NEXT:  2      6     0.50    *                   blcicq	(%rax), %rcx
+# CHECK-NEXT:  1      1     0.33                        blcmskl	%eax, %ecx
+# CHECK-NEXT:  2      6     0.50    *                   blcmskl	(%rax), %ecx
+# CHECK-NEXT:  1      1     0.33                        blcmskq	%rax, %rcx
+# CHECK-NEXT:  2      6     0.50    *                   blcmskq	(%rax), %rcx
+# CHECK-NEXT:  1      1     0.33                        blcsl	%eax, %ecx
+# CHECK-NEXT:  2      6     0.50    *                   blcsl	(%rax), %ecx
+# CHECK-NEXT:  1      1     0.33                        blcsq	%rax, %rcx
+# CHECK-NEXT:  2      6     0.50    *                   blcsq	(%rax), %rcx
+# CHECK-NEXT:  1      1     0.33                        blsfilll	%eax, %ecx
+# CHECK-NEXT:  2      6     0.50    *                   blsfilll	(%rax), %ecx
+# CHECK-NEXT:  1      1     0.33                        blsfillq	%rax, %rcx
+# CHECK-NEXT:  2      6     0.50    *                   blsfillq	(%rax), %rcx
+# CHECK-NEXT:  1      1     0.33                        blsicl	%eax, %ecx
+# CHECK-NEXT:  2      6     0.50    *                   blsicl	(%rax), %ecx
+# CHECK-NEXT:  1      1     0.33                        blsicq	%rax, %rcx
+# CHECK-NEXT:  2      6     0.50    *                   blsicq	(%rax), %rcx
+# CHECK-NEXT:  1      1     0.33                        t1mskcl	%eax, %ecx
+# CHECK-NEXT:  2      6     0.50    *                   t1mskcl	(%rax), %ecx
+# CHECK-NEXT:  1      1     0.33                        t1mskcq	%rax, %rcx
+# CHECK-NEXT:  2      6     0.50    *                   t1mskcq	(%rax), %rcx
+# CHECK-NEXT:  1      1     0.33                        tzmskl	%eax, %ecx
+# CHECK-NEXT:  2      6     0.50    *                   tzmskl	(%rax), %ecx
+# CHECK-NEXT:  1      1     0.33                        tzmskq	%rax, %rcx
+# CHECK-NEXT:  2      6     0.50    *                   tzmskq	(%rax), %rcx
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     14.00  16.00   -     14.00  10.00  10.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -     0.50   1.00    -     0.50    -      -     bextrl	$8192, %ebx, %ecx
+# CHECK-NEXT:  -      -     0.50   1.00    -     0.50   0.50   0.50   bextrl	$8192, (%rbx), %ecx
+# CHECK-NEXT:  -      -     0.50   1.00    -     0.50    -      -     bextrq	$16384, %rbx, %rcx
+# CHECK-NEXT:  -      -     0.50   1.00    -     0.50   0.50   0.50   bextrq	$16384, (%rbx), %rcx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     blcfilll	%eax, %ecx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   blcfilll	(%rax), %ecx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     blcfillq	%rax, %rcx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   blcfillq	(%rax), %rcx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     blcil	%eax, %ecx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   blcil	(%rax), %ecx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     blciq	%rax, %rcx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   blciq	(%rax), %rcx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     blcicl	%eax, %ecx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   blcicl	(%rax), %ecx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     blcicq	%rax, %rcx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   blcicq	(%rax), %rcx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     blcmskl	%eax, %ecx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   blcmskl	(%rax), %ecx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     blcmskq	%rax, %rcx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   blcmskq	(%rax), %rcx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     blcsl	%eax, %ecx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   blcsl	(%rax), %ecx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     blcsq	%rax, %rcx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   blcsq	(%rax), %rcx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     blsfilll	%eax, %ecx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   blsfilll	(%rax), %ecx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     blsfillq	%rax, %rcx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   blsfillq	(%rax), %rcx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     blsicl	%eax, %ecx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   blsicl	(%rax), %ecx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     blsicq	%rax, %rcx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   blsicq	(%rax), %rcx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     t1mskcl	%eax, %ecx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   t1mskcl	(%rax), %ecx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     t1mskcq	%rax, %rcx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   t1mskcq	(%rax), %rcx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     tzmskl	%eax, %ecx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   tzmskl	(%rax), %ecx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     tzmskq	%rax, %rcx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   tzmskq	(%rax), %rcx
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-x86_32.s b/test/tools/llvm-mca/X86/BdVer2/resources-x86_32.s
new file mode 100644
index 00000000000..b4672620cf4
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-x86_32.s
@@ -0,0 +1,78 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=i686-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+
+aaa
+
+aad
+aad $7
+
+aam
+aam $7
+
+aas
+
+bound %bx, (%eax)
+bound %ebx, (%eax)
+
+daa
+
+das
+
+into
+
+leave
+
+salc
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      100   0.33                        aaa
+# CHECK-NEXT:  1      100   0.33                        aad
+# CHECK-NEXT:  1      100   0.33                        aad	$7
+# CHECK-NEXT:  1      100   0.33                        aam
+# CHECK-NEXT:  1      100   0.33                        aam	$7
+# CHECK-NEXT:  1      100   0.33                        aas
+# CHECK-NEXT:  1      100   0.33                  U     bound	%bx, (%eax)
+# CHECK-NEXT:  1      100   0.33                  U     bound	%ebx, (%eax)
+# CHECK-NEXT:  1      100   0.33                        daa
+# CHECK-NEXT:  1      100   0.33                        das
+# CHECK-NEXT:  1      100   0.33                  U     into
+# CHECK-NEXT:  3      7     0.67    *                   leave
+# CHECK-NEXT:  1      1     0.33                  U     salc
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     4.67   4.67    -     4.67   0.50   0.50
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     aaa
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     aad
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     aad	$7
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     aam
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     aam	$7
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     aas
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     bound	%bx, (%eax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     bound	%ebx, (%eax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     daa
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     das
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     into
+# CHECK-NEXT:  -      -     0.67   0.67    -     0.67   0.50   0.50   leave
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     salc
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-x86_64.s b/test/tools/llvm-mca/X86/BdVer2/resources-x86_64.s
new file mode 100644
index 00000000000..2ab041c3de6
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-x86_64.s
@@ -0,0 +1,2372 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+
+adcb $7, %al
+adcb $7, %dil
+adcb $7, (%rax)
+adcb %sil, %dil
+adcb %sil, (%rax)
+adcb (%rax), %dil
+
+adcw $511, %ax
+adcw $511, %di
+adcw $511, (%rax)
+adcw $7, %di
+adcw $7, (%rax)
+adcw %si, %di
+adcw %si, (%rax)
+adcw (%rax), %di
+
+adcl $665536, %eax
+adcl $665536, %edi
+adcl $665536, (%rax)
+adcl $7, %edi
+adcl $7, (%rax)
+adcl %esi, %edi
+adcl %esi, (%rax)
+adcl (%rax), %edi
+
+adcq $665536, %rax
+adcq $665536, %rdi
+adcq $665536, (%rax)
+adcq $7, %rdi
+adcq $7, (%rax)
+adcq %rsi, %rdi
+adcq %rsi, (%rax)
+adcq (%rax), %rdi
+
+addb $7, %al
+addb $7, %dil
+addb $7, (%rax)
+addb %sil, %dil
+addb %sil, (%rax)
+addb (%rax), %dil
+
+addw $511, %ax
+addw $511, %di
+addw $511, (%rax)
+addw $7, %di
+addw $7, (%rax)
+addw %si, %di
+addw %si, (%rax)
+addw (%rax), %di
+
+addl $665536, %eax
+addl $665536, %edi
+addl $665536, (%rax)
+addl $7, %edi
+addl $7, (%rax)
+addl %esi, %edi
+addl %esi, (%rax)
+addl (%rax), %edi
+
+addq $665536, %rax
+addq $665536, %rdi
+addq $665536, (%rax)
+addq $7, %rdi
+addq $7, (%rax)
+addq %rsi, %rdi
+addq %rsi, (%rax)
+addq (%rax), %rdi
+
+andb $7, %al
+andb $7, %dil
+andb $7, (%rax)
+andb %sil, %dil
+andb %sil, (%rax)
+andb (%rax), %dil
+
+andw $511, %ax
+andw $511, %di
+andw $511, (%rax)
+andw $7, %di
+andw $7, (%rax)
+andw %si, %di
+andw %si, (%rax)
+andw (%rax), %di
+
+andl $665536, %eax
+andl $665536, %edi
+andl $665536, (%rax)
+andl $7, %edi
+andl $7, (%rax)
+andl %esi, %edi
+andl %esi, (%rax)
+andl (%rax), %edi
+
+andq $665536, %rax
+andq $665536, %rdi
+andq $665536, (%rax)
+andq $7, %rdi
+andq $7, (%rax)
+andq %rsi, %rdi
+andq %rsi, (%rax)
+andq (%rax), %rdi
+
+bsfw %si, %di
+bsrw %si, %di
+bsfw (%rax), %di
+bsrw (%rax), %di
+
+bsfl %esi, %edi
+bsrl %esi, %edi
+bsfl (%rax), %edi
+bsrl (%rax), %edi
+
+bsfq %rsi, %rdi
+bsrq %rsi, %rdi
+bsfq (%rax), %rdi
+bsrq (%rax), %rdi
+
+bswap %eax
+bswap %rax
+
+btw  %si, %di
+btcw %si, %di
+btrw %si, %di
+btsw %si, %di
+btw  %si, (%rax)
+btcw %si, (%rax)
+btrw %si, (%rax)
+btsw %si, (%rax)
+btw  $7, %di
+btcw $7, %di
+btrw $7, %di
+btsw $7, %di
+btw  $7, (%rax)
+btcw $7, (%rax)
+btrw $7, (%rax)
+btsw $7, (%rax)
+
+btl  %esi, %edi
+btcl %esi, %edi
+btrl %esi, %edi
+btsl %esi, %edi
+btl  %esi, (%rax)
+btcl %esi, (%rax)
+btrl %esi, (%rax)
+btsl %esi, (%rax)
+btl  $7, %edi
+btcl $7, %edi
+btrl $7, %edi
+btsl $7, %edi
+btl  $7, (%rax)
+btcl $7, (%rax)
+btrl $7, (%rax)
+btsl $7, (%rax)
+
+btq  %rsi, %rdi
+btcq %rsi, %rdi
+btrq %rsi, %rdi
+btsq %rsi, %rdi
+btq  %rsi, (%rax)
+btcq %rsi, (%rax)
+btrq %rsi, (%rax)
+btsq %rsi, (%rax)
+btq  $7, %rdi
+btcq $7, %rdi
+btrq $7, %rdi
+btsq $7, %rdi
+btq  $7, (%rax)
+btcq $7, (%rax)
+btrq $7, (%rax)
+btsq $7, (%rax)
+
+cbw
+cwde
+cdqe
+cwd
+cdq
+cqo
+
+clc
+cld
+cmc
+
+cmpb $7, %al
+cmpb $7, %dil
+cmpb $7, (%rax)
+cmpb %sil, %dil
+cmpb %sil, (%rax)
+cmpb (%rax), %dil
+
+cmpw $511, %ax
+cmpw $511, %di
+cmpw $511, (%rax)
+cmpw $7, %di
+cmpw $7, (%rax)
+cmpw %si, %di
+cmpw %si, (%rax)
+cmpw (%rax), %di
+
+cmpl $665536, %eax
+cmpl $665536, %edi
+cmpl $665536, (%rax)
+cmpl $7, %edi
+cmpl $7, (%rax)
+cmpl %esi, %edi
+cmpl %esi, (%rax)
+cmpl (%rax), %edi
+
+cmpq $665536, %rax
+cmpq $665536, %rdi
+cmpq $665536, (%rax)
+cmpq $7, %rdi
+cmpq $7, (%rax)
+cmpq %rsi, %rdi
+cmpq %rsi, (%rax)
+cmpq (%rax), %rdi
+
+cmpsb
+cmpsw
+cmpsl
+cmpsq
+
+cmpxchgb %cl, %bl
+cmpxchgb %cl, (%rbx)
+
+cmpxchgw %cx, %bx
+cmpxchgw %cx, (%rbx)
+
+cmpxchgl %ecx, %ebx
+cmpxchgl %ecx, (%rbx)
+
+cmpxchgq %rcx, %rbx
+cmpxchgq %rcx, (%rbx)
+
+cpuid
+
+decb %dil
+decb (%rax)
+decw %di
+decw (%rax)
+decl %edi
+decl (%rax)
+decq %rdi
+decq (%rax)
+
+divb %dil
+divb (%rax)
+divw %si
+divw (%rax)
+divl %edx
+divl (%rax)
+divq %rcx
+divq (%rax)
+
+idivb %dil
+idivb (%rax)
+idivw %si
+idivw (%rax)
+idivl %edx
+idivl (%rax)
+idivq %rcx
+idivq (%rax)
+
+imulb %dil
+imulb (%rax)
+
+imulw %di
+imulw (%rax)
+imulw %si, %di
+imulw (%rax), %di
+imulw $511, %si, %di
+imulw $511, (%rax), %di
+imulw $7, %si, %di
+imulw $7, (%rax), %di
+
+imull %edi
+imull (%rax)
+imull %esi, %edi
+imull (%rax), %edi
+imull $665536, %esi, %edi
+imull $665536, (%rax), %edi
+imull $7, %esi, %edi
+imull $7, (%rax), %edi
+
+imulq %rdi
+imulq (%rax)
+imulq %rsi, %rdi
+imulq (%rax), %rdi
+imulq $665536, %rsi, %rdi
+imulq $665536, (%rax), %rdi
+imulq $7, %rsi, %rdi
+imulq $7, (%rax), %rdi
+
+inb $7,  %al
+inb %dx, %al
+inw $7,  %ax
+inw %dx, %ax
+inl $7,  %eax
+inl %dx, %eax
+
+incb %dil
+incb (%rax)
+incw %di
+incw (%rax)
+incl %edi
+incl (%rax)
+incq %rdi
+incq (%rax)
+
+insb
+insw
+insl
+
+int $7
+
+lahf
+
+lodsb
+lodsw
+lodsl
+lodsq
+
+movsb
+movsw
+movsl
+movsq
+
+movsbw %al, %di
+movzbw %al, %di
+movsbw (%rax), %di
+movzbw (%rax), %di
+movsbl %al, %edi
+movzbl %al, %edi
+movsbl (%rax), %edi
+movzbl (%rax), %edi
+movsbq %al, %rdi
+movzbq %al, %rdi
+movsbq (%rax), %rdi
+movzbq (%rax), %rdi
+
+movswl %ax, %edi
+movzwl %ax, %edi
+movswl (%rax), %edi
+movzwl (%rax), %edi
+movswq %ax, %rdi
+movzwq %ax, %rdi
+movswq (%rax), %rdi
+movzwq (%rax), %rdi
+
+movslq %eax, %rdi
+movslq (%rax), %rdi
+
+mulb %dil
+mulb (%rax)
+mulw %si
+mulw (%rax)
+mull %edx
+mull (%rax)
+mulq %rcx
+mulq (%rax)
+
+negb %dil
+negb (%r8)
+negw %si
+negw (%r9)
+negl %edx
+negl (%rax)
+negq %rcx
+negq (%r10)
+
+nop
+nopw %di
+nopw (%rcx)
+nopl %esi
+nopl (%r8)
+nopq %rdx
+nopq (%r9)
+
+notb %dil
+notb (%r8)
+notw %si
+notw (%r9)
+notl %edx
+notl (%rax)
+notq %rcx
+notq (%r10)
+
+orb $7, %al
+orb $7, %dil
+orb $7, (%rax)
+orb %sil, %dil
+orb %sil, (%rax)
+orb (%rax), %dil
+
+orw $511, %ax
+orw $511, %di
+orw $511, (%rax)
+orw $7, %di
+orw $7, (%rax)
+orw %si, %di
+orw %si, (%rax)
+orw (%rax), %di
+
+orl $665536, %eax
+orl $665536, %edi
+orl $665536, (%rax)
+orl $7, %edi
+orl $7, (%rax)
+orl %esi, %edi
+orl %esi, (%rax)
+orl (%rax), %edi
+
+orq $665536, %rax
+orq $665536, %rdi
+orq $665536, (%rax)
+orq $7, %rdi
+orq $7, (%rax)
+orq %rsi, %rdi
+orq %rsi, (%rax)
+orq (%rax), %rdi
+
+outb %al,  $7
+outb %al,  %dx
+outw %ax,  $7
+outw %ax,  %dx
+outl %eax, $7
+outl %eax, %dx
+
+outsb
+outsw
+outsl
+
+pause
+
+rclb %dil
+rcrb %dil
+rclb (%rax)
+rcrb (%rax)
+rclb $7, %dil
+rcrb $7, %dil
+rclb $7, (%rax)
+rcrb $7, (%rax)
+rclb %cl, %dil
+rcrb %cl, %dil
+rclb %cl, (%rax)
+rcrb %cl, (%rax)
+
+rclw %di
+rcrw %di
+rclw (%rax)
+rcrw (%rax)
+rclw $7, %di
+rcrw $7, %di
+rclw $7, (%rax)
+rcrw $7, (%rax)
+rclw %cl, %di
+rcrw %cl, %di
+rclw %cl, (%rax)
+rcrw %cl, (%rax)
+
+rcll %edi
+rcrl %edi
+rcll (%rax)
+rcrl (%rax)
+rcll $7, %edi
+rcrl $7, %edi
+rcll $7, (%rax)
+rcrl $7, (%rax)
+rcll %cl, %edi
+rcrl %cl, %edi
+rcll %cl, (%rax)
+rcrl %cl, (%rax)
+
+rclq %rdi
+rcrq %rdi
+rclq (%rax)
+rcrq (%rax)
+rclq $7, %rdi
+rcrq $7, %rdi
+rclq $7, (%rax)
+rcrq $7, (%rax)
+rclq %cl, %rdi
+rcrq %cl, %rdi
+rclq %cl, (%rax)
+rcrq %cl, (%rax)
+
+rolb %dil
+rorb %dil
+rolb (%rax)
+rorb (%rax)
+rolb $7, %dil
+rorb $7, %dil
+rolb $7, (%rax)
+rorb $7, (%rax)
+rolb %cl, %dil
+rorb %cl, %dil
+rolb %cl, (%rax)
+rorb %cl, (%rax)
+
+rolw %di
+rorw %di
+rolw (%rax)
+rorw (%rax)
+rolw $7, %di
+rorw $7, %di
+rolw $7, (%rax)
+rorw $7, (%rax)
+rolw %cl, %di
+rorw %cl, %di
+rolw %cl, (%rax)
+rorw %cl, (%rax)
+
+roll %edi
+rorl %edi
+roll (%rax)
+rorl (%rax)
+roll $7, %edi
+rorl $7, %edi
+roll $7, (%rax)
+rorl $7, (%rax)
+roll %cl, %edi
+rorl %cl, %edi
+roll %cl, (%rax)
+rorl %cl, (%rax)
+
+rolq %rdi
+rorq %rdi
+rolq (%rax)
+rorq (%rax)
+rolq $7, %rdi
+rorq $7, %rdi
+rolq $7, (%rax)
+rorq $7, (%rax)
+rolq %cl, %rdi
+rorq %cl, %rdi
+rolq %cl, (%rax)
+rorq %cl, (%rax)
+
+sahf
+
+sarb %dil
+shlb %dil
+shrb %dil
+sarb (%rax)
+shlb (%rax)
+shrb (%rax)
+sarb $7, %dil
+shlb $7, %dil
+shrb $7, %dil
+sarb $7, (%rax)
+shlb $7, (%rax)
+shrb $7, (%rax)
+sarb %cl, %dil
+shlb %cl, %dil
+shrb %cl, %dil
+sarb %cl, (%rax)
+shlb %cl, (%rax)
+shrb %cl, (%rax)
+
+sarw %di
+shlw %di
+shrw %di
+sarw (%rax)
+shlw (%rax)
+shrw (%rax)
+sarw $7, %di
+shlw $7, %di
+shrw $7, %di
+sarw $7, (%rax)
+shlw $7, (%rax)
+shrw $7, (%rax)
+sarw %cl, %di
+shlw %cl, %di
+shrw %cl, %di
+sarw %cl, (%rax)
+shlw %cl, (%rax)
+shrw %cl, (%rax)
+
+sarl %edi
+shll %edi
+shrl %edi
+sarl (%rax)
+shll (%rax)
+shrl (%rax)
+sarl $7, %edi
+shll $7, %edi
+shrl $7, %edi
+sarl $7, (%rax)
+shll $7, (%rax)
+shrl $7, (%rax)
+sarl %cl, %edi
+shll %cl, %edi
+shrl %cl, %edi
+sarl %cl, (%rax)
+shll %cl, (%rax)
+shrl %cl, (%rax)
+
+sarq %rdi
+shlq %rdi
+shrq %rdi
+sarq (%rax)
+shlq (%rax)
+shrq (%rax)
+sarq $7, %rdi
+shlq $7, %rdi
+shrq $7, %rdi
+sarq $7, (%rax)
+shlq $7, (%rax)
+shrq $7, (%rax)
+sarq %cl, %rdi
+shlq %cl, %rdi
+shrq %cl, %rdi
+sarq %cl, (%rax)
+shlq %cl, (%rax)
+shrq %cl, (%rax)
+
+sbbb $7, %al
+sbbb $7, %dil
+sbbb $7, (%rax)
+sbbb %sil, %dil
+sbbb %sil, (%rax)
+sbbb (%rax), %dil
+
+sbbw $511, %ax
+sbbw $511, %di
+sbbw $511, (%rax)
+sbbw $7, %di
+sbbw $7, (%rax)
+sbbw %si, %di
+sbbw %si, (%rax)
+sbbw (%rax), %di
+
+sbbl $665536, %eax
+sbbl $665536, %edi
+sbbl $665536, (%rax)
+sbbl $7, %edi
+sbbl $7, (%rax)
+sbbl %esi, %edi
+sbbl %esi, (%rax)
+sbbl (%rax), %edi
+
+sbbq $665536, %rax
+sbbq $665536, %rdi
+sbbq $665536, (%rax)
+sbbq $7, %rdi
+sbbq $7, (%rax)
+sbbq %rsi, %rdi
+sbbq %rsi, (%rax)
+sbbq (%rax), %rdi
+
+scasb
+scasw
+scasl
+scasq
+
+seto  %al
+seto  (%rax)
+setno %al
+setno (%rax)
+setb  %al
+setb  (%rax)
+setnb %al
+setnb (%rax)
+setz  %al
+setz  (%rax)
+setnz %al
+setnz (%rax)
+seta  %al
+seta  (%rax)
+setna %al
+setna (%rax)
+sets  %al
+sets  (%rax)
+setns %al
+setns (%rax)
+setp  %al
+setp  (%rax)
+setnp %al
+setnp (%rax)
+setl  %al
+setl  (%rax)
+setnl %al
+setnl (%rax)
+setg  %al
+setg  (%rax)
+setng %al
+setng (%rax)
+
+shldw %cl, %si, %di
+shrdw %cl, %si, %di
+shldw %cl, %si, (%rax)
+shrdw %cl, %si, (%rax)
+shldw $7, %si, %di
+shrdw $7, %si, %di
+shldw $7, %si, (%rax)
+shrdw $7, %si, (%rax)
+
+shldl %cl, %esi, %edi
+shrdl %cl, %esi, %edi
+shldl %cl, %esi, (%rax)
+shrdl %cl, %esi, (%rax)
+shldl $7, %esi, %edi
+shrdl $7, %esi, %edi
+shldl $7, %esi, (%rax)
+shrdl $7, %esi, (%rax)
+
+shldq %cl, %rsi, %rdi
+shrdq %cl, %rsi, %rdi
+shldq %cl, %rsi, (%rax)
+shrdq %cl, %rsi, (%rax)
+shldq $7, %rsi, %rdi
+shrdq $7, %rsi, %rdi
+shldq $7, %rsi, (%rax)
+shrdq $7, %rsi, (%rax)
+
+stc
+std
+
+stosb
+stosw
+stosl
+stosq
+
+subb $7, %al
+subb $7, %dil
+subb $7, (%rax)
+subb %sil, %dil
+subb %sil, (%rax)
+subb (%rax), %dil
+
+subw $511, %ax
+subw $511, %di
+subw $511, (%rax)
+subw $7, %di
+subw $7, (%rax)
+subw %si, %di
+subw %si, (%rax)
+subw (%rax), %di
+
+subl $665536, %eax
+subl $665536, %edi
+subl $665536, (%rax)
+subl $7, %edi
+subl $7, (%rax)
+subl %esi, %edi
+subl %esi, (%rax)
+subl (%rax), %edi
+
+subq $665536, %rax
+subq $665536, %rdi
+subq $665536, (%rax)
+subq $7, %rdi
+subq $7, (%rax)
+subq %rsi, %rdi
+subq %rsi, (%rax)
+subq (%rax), %rdi
+
+testb $7, %al
+testb $7, %dil
+testb $7, (%rax)
+testb %sil, %dil
+testb %sil, (%rax)
+
+testw $511, %ax
+testw $511, %di
+testw $511, (%rax)
+testw $7, %di
+testw $7, (%rax)
+testw %si, %di
+testw %si, (%rax)
+
+testl $665536, %eax
+testl $665536, %edi
+testl $665536, (%rax)
+testl $7, %edi
+testl $7, (%rax)
+testl %esi, %edi
+testl %esi, (%rax)
+
+testq $665536, %rax
+testq $665536, %rdi
+testq $665536, (%rax)
+testq $7, %rdi
+testq $7, (%rax)
+testq %rsi, %rdi
+testq %rsi, (%rax)
+
+ud2
+
+xaddb %bl, %cl
+xaddb %bl, (%rcx)
+
+xaddw %bx, %cx
+xaddw %ax, (%rbx)
+
+xaddl %ebx, %ecx
+xaddl %eax, (%rbx)
+
+xaddq %rbx, %rcx
+xaddq %rax, (%rbx)
+
+xchgb %bl, %cl
+xchgb %bl, (%rbx)
+
+xchgw %ax, %bx
+xchgw %bx, %cx
+xchgw %ax, (%rbx)
+
+xchgl %eax, %ebx
+xchgl %ebx, %ecx
+xchgl %eax, (%rbx)
+
+xchgq %rax, %rbx
+xchgq %rbx, %rcx
+xchgq %rax, (%rbx)
+
+xlatb
+
+xorb $7, %al
+xorb $7, %dil
+xorb $7, (%rax)
+xorb %sil, %dil
+xorb %sil, (%rax)
+xorb (%rax), %dil
+
+xorw $511, %ax
+xorw $511, %di
+xorw $511, (%rax)
+xorw $7, %di
+xorw $7, (%rax)
+xorw %si, %di
+xorw %si, (%rax)
+xorw (%rax), %di
+
+xorl $665536, %eax
+xorl $665536, %edi
+xorl $665536, (%rax)
+xorl $7, %edi
+xorl $7, (%rax)
+xorl %esi, %edi
+xorl %esi, (%rax)
+xorl (%rax), %edi
+
+xorq $665536, %rax
+xorq $665536, %rdi
+xorq $665536, (%rax)
+xorq $7, %rdi
+xorq $7, (%rax)
+xorq %rsi, %rdi
+xorq %rsi, (%rax)
+xorq (%rax), %rdi
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  2      2     0.67                        adcb	$7, %al
+# CHECK-NEXT:  2      2     0.67                        adcb	$7, %dil
+# CHECK-NEXT:  6      9     1.00    *      *            adcb	$7, (%rax)
+# CHECK-NEXT:  2      2     0.67                        adcb	%sil, %dil
+# CHECK-NEXT:  6      9     1.00    *      *            adcb	%sil, (%rax)
+# CHECK-NEXT:  3      7     0.67    *                   adcb	(%rax), %dil
+# CHECK-NEXT:  2      2     0.67                        adcw	$511, %ax
+# CHECK-NEXT:  2      2     0.67                        adcw	$511, %di
+# CHECK-NEXT:  6      9     1.00    *      *            adcw	$511, (%rax)
+# CHECK-NEXT:  2      2     0.67                        adcw	$7, %di
+# CHECK-NEXT:  6      9     1.00    *      *            adcw	$7, (%rax)
+# CHECK-NEXT:  2      2     0.67                        adcw	%si, %di
+# CHECK-NEXT:  6      9     1.00    *      *            adcw	%si, (%rax)
+# CHECK-NEXT:  3      7     0.67    *                   adcw	(%rax), %di
+# CHECK-NEXT:  2      2     0.67                        adcl	$665536, %eax
+# CHECK-NEXT:  2      2     0.67                        adcl	$665536, %edi
+# CHECK-NEXT:  6      9     1.00    *      *            adcl	$665536, (%rax)
+# CHECK-NEXT:  2      2     0.67                        adcl	$7, %edi
+# CHECK-NEXT:  6      9     1.00    *      *            adcl	$7, (%rax)
+# CHECK-NEXT:  2      2     0.67                        adcl	%esi, %edi
+# CHECK-NEXT:  6      9     1.00    *      *            adcl	%esi, (%rax)
+# CHECK-NEXT:  3      7     0.67    *                   adcl	(%rax), %edi
+# CHECK-NEXT:  2      2     0.67                        adcq	$665536, %rax
+# CHECK-NEXT:  2      2     0.67                        adcq	$665536, %rdi
+# CHECK-NEXT:  6      9     1.00    *      *            adcq	$665536, (%rax)
+# CHECK-NEXT:  2      2     0.67                        adcq	$7, %rdi
+# CHECK-NEXT:  6      9     1.00    *      *            adcq	$7, (%rax)
+# CHECK-NEXT:  2      2     0.67                        adcq	%rsi, %rdi
+# CHECK-NEXT:  6      9     1.00    *      *            adcq	%rsi, (%rax)
+# CHECK-NEXT:  3      7     0.67    *                   adcq	(%rax), %rdi
+# CHECK-NEXT:  1      1     0.33                        addb	$7, %al
+# CHECK-NEXT:  1      1     0.33                        addb	$7, %dil
+# CHECK-NEXT:  3      7     1.00    *      *            addb	$7, (%rax)
+# CHECK-NEXT:  1      1     0.33                        addb	%sil, %dil
+# CHECK-NEXT:  3      7     1.00    *      *            addb	%sil, (%rax)
+# CHECK-NEXT:  2      6     0.50    *                   addb	(%rax), %dil
+# CHECK-NEXT:  1      1     0.33                        addw	$511, %ax
+# CHECK-NEXT:  1      1     0.33                        addw	$511, %di
+# CHECK-NEXT:  3      7     1.00    *      *            addw	$511, (%rax)
+# CHECK-NEXT:  1      1     0.33                        addw	$7, %di
+# CHECK-NEXT:  3      7     1.00    *      *            addw	$7, (%rax)
+# CHECK-NEXT:  1      1     0.33                        addw	%si, %di
+# CHECK-NEXT:  3      7     1.00    *      *            addw	%si, (%rax)
+# CHECK-NEXT:  2      6     0.50    *                   addw	(%rax), %di
+# CHECK-NEXT:  1      1     0.33                        addl	$665536, %eax
+# CHECK-NEXT:  1      1     0.33                        addl	$665536, %edi
+# CHECK-NEXT:  3      7     1.00    *      *            addl	$665536, (%rax)
+# CHECK-NEXT:  1      1     0.33                        addl	$7, %edi
+# CHECK-NEXT:  3      7     1.00    *      *            addl	$7, (%rax)
+# CHECK-NEXT:  1      1     0.33                        addl	%esi, %edi
+# CHECK-NEXT:  3      7     1.00    *      *            addl	%esi, (%rax)
+# CHECK-NEXT:  2      6     0.50    *                   addl	(%rax), %edi
+# CHECK-NEXT:  1      1     0.33                        addq	$665536, %rax
+# CHECK-NEXT:  1      1     0.33                        addq	$665536, %rdi
+# CHECK-NEXT:  3      7     1.00    *      *            addq	$665536, (%rax)
+# CHECK-NEXT:  1      1     0.33                        addq	$7, %rdi
+# CHECK-NEXT:  3      7     1.00    *      *            addq	$7, (%rax)
+# CHECK-NEXT:  1      1     0.33                        addq	%rsi, %rdi
+# CHECK-NEXT:  3      7     1.00    *      *            addq	%rsi, (%rax)
+# CHECK-NEXT:  2      6     0.50    *                   addq	(%rax), %rdi
+# CHECK-NEXT:  1      1     0.33                        andb	$7, %al
+# CHECK-NEXT:  1      1     0.33                        andb	$7, %dil
+# CHECK-NEXT:  3      7     1.00    *      *            andb	$7, (%rax)
+# CHECK-NEXT:  1      1     0.33                        andb	%sil, %dil
+# CHECK-NEXT:  3      7     1.00    *      *            andb	%sil, (%rax)
+# CHECK-NEXT:  2      6     0.50    *                   andb	(%rax), %dil
+# CHECK-NEXT:  1      1     0.33                        andw	$511, %ax
+# CHECK-NEXT:  1      1     0.33                        andw	$511, %di
+# CHECK-NEXT:  3      7     1.00    *      *            andw	$511, (%rax)
+# CHECK-NEXT:  1      1     0.33                        andw	$7, %di
+# CHECK-NEXT:  3      7     1.00    *      *            andw	$7, (%rax)
+# CHECK-NEXT:  1      1     0.33                        andw	%si, %di
+# CHECK-NEXT:  3      7     1.00    *      *            andw	%si, (%rax)
+# CHECK-NEXT:  2      6     0.50    *                   andw	(%rax), %di
+# CHECK-NEXT:  1      1     0.33                        andl	$665536, %eax
+# CHECK-NEXT:  1      1     0.33                        andl	$665536, %edi
+# CHECK-NEXT:  3      7     1.00    *      *            andl	$665536, (%rax)
+# CHECK-NEXT:  1      1     0.33                        andl	$7, %edi
+# CHECK-NEXT:  3      7     1.00    *      *            andl	$7, (%rax)
+# CHECK-NEXT:  1      1     0.33                        andl	%esi, %edi
+# CHECK-NEXT:  3      7     1.00    *      *            andl	%esi, (%rax)
+# CHECK-NEXT:  2      6     0.50    *                   andl	(%rax), %edi
+# CHECK-NEXT:  1      1     0.33                        andq	$665536, %rax
+# CHECK-NEXT:  1      1     0.33                        andq	$665536, %rdi
+# CHECK-NEXT:  3      7     1.00    *      *            andq	$665536, (%rax)
+# CHECK-NEXT:  1      1     0.33                        andq	$7, %rdi
+# CHECK-NEXT:  3      7     1.00    *      *            andq	$7, (%rax)
+# CHECK-NEXT:  1      1     0.33                        andq	%rsi, %rdi
+# CHECK-NEXT:  3      7     1.00    *      *            andq	%rsi, (%rax)
+# CHECK-NEXT:  2      6     0.50    *                   andq	(%rax), %rdi
+# CHECK-NEXT:  1      3     1.00                        bsfw	%si, %di
+# CHECK-NEXT:  1      3     1.00                        bsrw	%si, %di
+# CHECK-NEXT:  2      8     1.00    *                   bsfw	(%rax), %di
+# CHECK-NEXT:  2      8     1.00    *                   bsrw	(%rax), %di
+# CHECK-NEXT:  1      3     1.00                        bsfl	%esi, %edi
+# CHECK-NEXT:  1      3     1.00                        bsrl	%esi, %edi
+# CHECK-NEXT:  2      8     1.00    *                   bsfl	(%rax), %edi
+# CHECK-NEXT:  2      8     1.00    *                   bsrl	(%rax), %edi
+# CHECK-NEXT:  1      3     1.00                        bsfq	%rsi, %rdi
+# CHECK-NEXT:  1      3     1.00                        bsrq	%rsi, %rdi
+# CHECK-NEXT:  2      8     1.00    *                   bsfq	(%rax), %rdi
+# CHECK-NEXT:  2      8     1.00    *                   bsrq	(%rax), %rdi
+# CHECK-NEXT:  1      1     1.00                        bswapl	%eax
+# CHECK-NEXT:  2      2     1.00                        bswapq	%rax
+# CHECK-NEXT:  1      1     0.50                        btw	%si, %di
+# CHECK-NEXT:  1      1     0.50                        btcw	%si, %di
+# CHECK-NEXT:  1      1     0.50                        btrw	%si, %di
+# CHECK-NEXT:  1      1     0.50                        btsw	%si, %di
+# CHECK-NEXT:  6      9     1.00    *                   btw	%si, (%rax)
+# CHECK-NEXT:  6      9     1.00    *      *            btcw	%si, (%rax)
+# CHECK-NEXT:  6      9     1.00    *      *            btrw	%si, (%rax)
+# CHECK-NEXT:  6      9     1.00    *      *            btsw	%si, (%rax)
+# CHECK-NEXT:  1      1     0.50                        btw	$7, %di
+# CHECK-NEXT:  1      1     0.50                        btcw	$7, %di
+# CHECK-NEXT:  1      1     0.50                        btrw	$7, %di
+# CHECK-NEXT:  1      1     0.50                        btsw	$7, %di
+# CHECK-NEXT:  2      6     0.50    *                   btw	$7, (%rax)
+# CHECK-NEXT:  4      7     1.00    *      *            btcw	$7, (%rax)
+# CHECK-NEXT:  4      7     1.00    *      *            btrw	$7, (%rax)
+# CHECK-NEXT:  4      7     1.00    *      *            btsw	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        btl	%esi, %edi
+# CHECK-NEXT:  1      1     0.50                        btcl	%esi, %edi
+# CHECK-NEXT:  1      1     0.50                        btrl	%esi, %edi
+# CHECK-NEXT:  1      1     0.50                        btsl	%esi, %edi
+# CHECK-NEXT:  6      9     1.00    *                   btl	%esi, (%rax)
+# CHECK-NEXT:  6      9     1.00    *      *            btcl	%esi, (%rax)
+# CHECK-NEXT:  6      9     1.00    *      *            btrl	%esi, (%rax)
+# CHECK-NEXT:  6      9     1.00    *      *            btsl	%esi, (%rax)
+# CHECK-NEXT:  1      1     0.50                        btl	$7, %edi
+# CHECK-NEXT:  1      1     0.50                        btcl	$7, %edi
+# CHECK-NEXT:  1      1     0.50                        btrl	$7, %edi
+# CHECK-NEXT:  1      1     0.50                        btsl	$7, %edi
+# CHECK-NEXT:  2      6     0.50    *                   btl	$7, (%rax)
+# CHECK-NEXT:  4      7     1.00    *      *            btcl	$7, (%rax)
+# CHECK-NEXT:  4      7     1.00    *      *            btrl	$7, (%rax)
+# CHECK-NEXT:  4      7     1.00    *      *            btsl	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        btq	%rsi, %rdi
+# CHECK-NEXT:  1      1     0.50                        btcq	%rsi, %rdi
+# CHECK-NEXT:  1      1     0.50                        btrq	%rsi, %rdi
+# CHECK-NEXT:  1      1     0.50                        btsq	%rsi, %rdi
+# CHECK-NEXT:  6      9     1.00    *                   btq	%rsi, (%rax)
+# CHECK-NEXT:  6      9     1.00    *      *            btcq	%rsi, (%rax)
+# CHECK-NEXT:  6      9     1.00    *      *            btrq	%rsi, (%rax)
+# CHECK-NEXT:  6      9     1.00    *      *            btsq	%rsi, (%rax)
+# CHECK-NEXT:  1      1     0.50                        btq	$7, %rdi
+# CHECK-NEXT:  1      1     0.50                        btcq	$7, %rdi
+# CHECK-NEXT:  1      1     0.50                        btrq	$7, %rdi
+# CHECK-NEXT:  1      1     0.50                        btsq	$7, %rdi
+# CHECK-NEXT:  2      6     0.50    *                   btq	$7, (%rax)
+# CHECK-NEXT:  4      7     1.00    *      *            btcq	$7, (%rax)
+# CHECK-NEXT:  4      7     1.00    *      *            btrq	$7, (%rax)
+# CHECK-NEXT:  4      7     1.00    *      *            btsq	$7, (%rax)
+# CHECK-NEXT:  1      1     0.33                        cbtw
+# CHECK-NEXT:  1      1     0.33                        cwtl
+# CHECK-NEXT:  1      1     0.33                        cltq
+# CHECK-NEXT:  2      2     1.00                        cwtd
+# CHECK-NEXT:  1      1     0.50                        cltd
+# CHECK-NEXT:  1      1     0.50                        cqto
+# CHECK-NEXT:  1      1     0.25                  U     clc
+# CHECK-NEXT:  1      1     0.33                  U     cld
+# CHECK-NEXT:  1      1     0.33                  U     cmc
+# CHECK-NEXT:  1      1     0.33                        cmpb	$7, %al
+# CHECK-NEXT:  1      1     0.33                        cmpb	$7, %dil
+# CHECK-NEXT:  2      6     0.50    *                   cmpb	$7, (%rax)
+# CHECK-NEXT:  1      1     0.33                        cmpb	%sil, %dil
+# CHECK-NEXT:  2      6     0.50    *                   cmpb	%sil, (%rax)
+# CHECK-NEXT:  2      6     0.50    *                   cmpb	(%rax), %dil
+# CHECK-NEXT:  1      1     0.33                        cmpw	$511, %ax
+# CHECK-NEXT:  1      1     0.33                        cmpw	$511, %di
+# CHECK-NEXT:  2      6     0.50    *                   cmpw	$511, (%rax)
+# CHECK-NEXT:  1      1     0.33                        cmpw	$7, %di
+# CHECK-NEXT:  2      6     0.50    *                   cmpw	$7, (%rax)
+# CHECK-NEXT:  1      1     0.33                        cmpw	%si, %di
+# CHECK-NEXT:  2      6     0.50    *                   cmpw	%si, (%rax)
+# CHECK-NEXT:  2      6     0.50    *                   cmpw	(%rax), %di
+# CHECK-NEXT:  1      1     0.33                        cmpl	$665536, %eax
+# CHECK-NEXT:  1      1     0.33                        cmpl	$665536, %edi
+# CHECK-NEXT:  2      6     0.50    *                   cmpl	$665536, (%rax)
+# CHECK-NEXT:  1      1     0.33                        cmpl	$7, %edi
+# CHECK-NEXT:  2      6     0.50    *                   cmpl	$7, (%rax)
+# CHECK-NEXT:  1      1     0.33                        cmpl	%esi, %edi
+# CHECK-NEXT:  2      6     0.50    *                   cmpl	%esi, (%rax)
+# CHECK-NEXT:  2      6     0.50    *                   cmpl	(%rax), %edi
+# CHECK-NEXT:  1      1     0.33                        cmpq	$665536, %rax
+# CHECK-NEXT:  1      1     0.33                        cmpq	$665536, %rdi
+# CHECK-NEXT:  2      6     0.50    *                   cmpq	$665536, (%rax)
+# CHECK-NEXT:  1      1     0.33                        cmpq	$7, %rdi
+# CHECK-NEXT:  2      6     0.50    *                   cmpq	$7, (%rax)
+# CHECK-NEXT:  1      1     0.33                        cmpq	%rsi, %rdi
+# CHECK-NEXT:  2      6     0.50    *                   cmpq	%rsi, (%rax)
+# CHECK-NEXT:  2      6     0.50    *                   cmpq	(%rax), %rdi
+# CHECK-NEXT:  5      8     1.00                  U     cmpsb	%es:(%rdi), (%rsi)
+# CHECK-NEXT:  5      8     1.00                  U     cmpsw	%es:(%rdi), (%rsi)
+# CHECK-NEXT:  5      8     1.00                  U     cmpsl	%es:(%rdi), (%rsi)
+# CHECK-NEXT:  5      8     1.00                  U     cmpsq	%es:(%rdi), (%rsi)
+# CHECK-NEXT:  4      5     1.33                        cmpxchgb	%cl, %bl
+# CHECK-NEXT:  6      8     2.00    *      *            cmpxchgb	%cl, (%rbx)
+# CHECK-NEXT:  4      5     1.33                        cmpxchgw	%cx, %bx
+# CHECK-NEXT:  6      8     2.00    *      *            cmpxchgw	%cx, (%rbx)
+# CHECK-NEXT:  4      5     1.33                        cmpxchgl	%ecx, %ebx
+# CHECK-NEXT:  6      8     2.00    *      *            cmpxchgl	%ecx, (%rbx)
+# CHECK-NEXT:  4      5     1.33                        cmpxchgq	%rcx, %rbx
+# CHECK-NEXT:  6      8     2.00    *      *            cmpxchgq	%rcx, (%rbx)
+# CHECK-NEXT:  1      100   0.33                  U     cpuid
+# CHECK-NEXT:  1      1     0.33                        decb	%dil
+# CHECK-NEXT:  3      7     1.00    *      *            decb	(%rax)
+# CHECK-NEXT:  1      1     0.33                        decw	%di
+# CHECK-NEXT:  3      7     1.00    *      *            decw	(%rax)
+# CHECK-NEXT:  1      1     0.33                        decl	%edi
+# CHECK-NEXT:  3      7     1.00    *      *            decl	(%rax)
+# CHECK-NEXT:  1      1     0.33                        decq	%rdi
+# CHECK-NEXT:  3      7     1.00    *      *            decq	(%rax)
+# CHECK-NEXT:  1      25    10.00                 U     divb	%dil
+# CHECK-NEXT:  2      30    10.00   *             U     divb	(%rax)
+# CHECK-NEXT:  1      25    10.00                 U     divw	%si
+# CHECK-NEXT:  2      30    10.00   *             U     divw	(%rax)
+# CHECK-NEXT:  1      25    10.00                 U     divl	%edx
+# CHECK-NEXT:  2      30    10.00   *             U     divl	(%rax)
+# CHECK-NEXT:  1      25    10.00                 U     divq	%rcx
+# CHECK-NEXT:  2      30    10.00   *             U     divq	(%rax)
+# CHECK-NEXT:  1      25    10.00                 U     idivb	%dil
+# CHECK-NEXT:  2      30    10.00   *             U     idivb	(%rax)
+# CHECK-NEXT:  1      25    10.00                 U     idivw	%si
+# CHECK-NEXT:  2      30    10.00   *             U     idivw	(%rax)
+# CHECK-NEXT:  1      25    10.00                 U     idivl	%edx
+# CHECK-NEXT:  2      30    10.00   *             U     idivl	(%rax)
+# CHECK-NEXT:  1      25    10.00                 U     idivq	%rcx
+# CHECK-NEXT:  2      30    10.00   *             U     idivq	(%rax)
+# CHECK-NEXT:  1      3     1.00                        imulb	%dil
+# CHECK-NEXT:  2      8     1.00    *                   imulb	(%rax)
+# CHECK-NEXT:  4      4     1.33                        imulw	%di
+# CHECK-NEXT:  5      9     1.33    *                   imulw	(%rax)
+# CHECK-NEXT:  1      3     1.00                        imulw	%si, %di
+# CHECK-NEXT:  2      8     1.00    *                   imulw	(%rax), %di
+# CHECK-NEXT:  2      4     1.00                        imulw	$511, %si, %di
+# CHECK-NEXT:  3      8     1.00    *                   imulw	$511, (%rax), %di
+# CHECK-NEXT:  2      4     1.00                        imulw	$7, %si, %di
+# CHECK-NEXT:  3      8     1.00    *                   imulw	$7, (%rax), %di
+# CHECK-NEXT:  3      4     1.00                        imull	%edi
+# CHECK-NEXT:  4      9     1.00    *                   imull	(%rax)
+# CHECK-NEXT:  1      3     1.00                        imull	%esi, %edi
+# CHECK-NEXT:  2      8     1.00    *                   imull	(%rax), %edi
+# CHECK-NEXT:  1      3     1.00                        imull	$665536, %esi, %edi
+# CHECK-NEXT:  2      8     1.00    *                   imull	$665536, (%rax), %edi
+# CHECK-NEXT:  1      3     1.00                        imull	$7, %esi, %edi
+# CHECK-NEXT:  2      8     1.00    *                   imull	$7, (%rax), %edi
+# CHECK-NEXT:  2      4     1.00                        imulq	%rdi
+# CHECK-NEXT:  3      9     1.00    *                   imulq	(%rax)
+# CHECK-NEXT:  1      3     1.00                        imulq	%rsi, %rdi
+# CHECK-NEXT:  2      8     1.00    *                   imulq	(%rax), %rdi
+# CHECK-NEXT:  1      3     1.00                        imulq	$665536, %rsi, %rdi
+# CHECK-NEXT:  2      8     1.00    *                   imulq	$665536, (%rax), %rdi
+# CHECK-NEXT:  1      3     1.00                        imulq	$7, %rsi, %rdi
+# CHECK-NEXT:  2      8     1.00    *                   imulq	$7, (%rax), %rdi
+# CHECK-NEXT:  1      100   0.33                  U     inb	$7, %al
+# CHECK-NEXT:  1      100   0.33                  U     inb	%dx, %al
+# CHECK-NEXT:  1      100   0.33                  U     inw	$7, %ax
+# CHECK-NEXT:  1      100   0.33                  U     inw	%dx, %ax
+# CHECK-NEXT:  1      100   0.33                  U     inl	$7, %eax
+# CHECK-NEXT:  1      100   0.33                  U     inl	%dx, %eax
+# CHECK-NEXT:  1      1     0.33                        incb	%dil
+# CHECK-NEXT:  3      7     1.00    *      *            incb	(%rax)
+# CHECK-NEXT:  1      1     0.33                        incw	%di
+# CHECK-NEXT:  3      7     1.00    *      *            incw	(%rax)
+# CHECK-NEXT:  1      1     0.33                        incl	%edi
+# CHECK-NEXT:  3      7     1.00    *      *            incl	(%rax)
+# CHECK-NEXT:  1      1     0.33                        incq	%rdi
+# CHECK-NEXT:  3      7     1.00    *      *            incq	(%rax)
+# CHECK-NEXT:  1      100   0.33                  U     insb	%dx, %es:(%rdi)
+# CHECK-NEXT:  1      100   0.33                  U     insw	%dx, %es:(%rdi)
+# CHECK-NEXT:  1      100   0.33                  U     insl	%dx, %es:(%rdi)
+# CHECK-NEXT:  1      100   0.33    *      *      U     int	$7
+# CHECK-NEXT:  1      1     0.50                        lahf
+# CHECK-NEXT:  3      7     0.67                  U     lodsb	(%rsi), %al
+# CHECK-NEXT:  3      7     0.67                  U     lodsw	(%rsi), %ax
+# CHECK-NEXT:  2      6     0.50                  U     lodsl	(%rsi), %eax
+# CHECK-NEXT:  2      6     0.50                  U     lodsq	(%rsi), %rax
+# CHECK-NEXT:  5      8     1.00                  U     movsb	(%rsi), %es:(%rdi)
+# CHECK-NEXT:  5      8     1.00                  U     movsw	(%rsi), %es:(%rdi)
+# CHECK-NEXT:  5      8     1.00                  U     movsl	(%rsi), %es:(%rdi)
+# CHECK-NEXT:  5      8     1.00                  U     movsq	(%rsi), %es:(%rdi)
+# CHECK-NEXT:  1      1     0.33                        movsbw	%al, %di
+# CHECK-NEXT:  1      1     0.33                        movzbw	%al, %di
+# CHECK-NEXT:  1      5     0.50    *                   movsbw	(%rax), %di
+# CHECK-NEXT:  1      5     0.50    *                   movzbw	(%rax), %di
+# CHECK-NEXT:  1      1     0.33                        movsbl	%al, %edi
+# CHECK-NEXT:  1      1     0.33                        movzbl	%al, %edi
+# CHECK-NEXT:  1      5     0.50    *                   movsbl	(%rax), %edi
+# CHECK-NEXT:  1      5     0.50    *                   movzbl	(%rax), %edi
+# CHECK-NEXT:  1      1     0.33                        movsbq	%al, %rdi
+# CHECK-NEXT:  1      1     0.33                        movzbq	%al, %rdi
+# CHECK-NEXT:  1      5     0.50    *                   movsbq	(%rax), %rdi
+# CHECK-NEXT:  1      5     0.50    *                   movzbq	(%rax), %rdi
+# CHECK-NEXT:  1      1     0.33                        movswl	%ax, %edi
+# CHECK-NEXT:  1      1     0.33                        movzwl	%ax, %edi
+# CHECK-NEXT:  1      5     0.50    *                   movswl	(%rax), %edi
+# CHECK-NEXT:  1      5     0.50    *                   movzwl	(%rax), %edi
+# CHECK-NEXT:  1      1     0.33                        movswq	%ax, %rdi
+# CHECK-NEXT:  1      1     0.33                        movzwq	%ax, %rdi
+# CHECK-NEXT:  1      5     0.50    *                   movswq	(%rax), %rdi
+# CHECK-NEXT:  1      5     0.50    *                   movzwq	(%rax), %rdi
+# CHECK-NEXT:  1      1     0.33                        movslq	%eax, %rdi
+# CHECK-NEXT:  1      5     0.50    *                   movslq	(%rax), %rdi
+# CHECK-NEXT:  1      3     1.00                        mulb	%dil
+# CHECK-NEXT:  2      8     1.00    *                   mulb	(%rax)
+# CHECK-NEXT:  4      4     1.33                        mulw	%si
+# CHECK-NEXT:  5      9     1.33    *                   mulw	(%rax)
+# CHECK-NEXT:  3      4     1.00                        mull	%edx
+# CHECK-NEXT:  4      9     1.00    *                   mull	(%rax)
+# CHECK-NEXT:  2      4     1.00                        mulq	%rcx
+# CHECK-NEXT:  3      9     1.00    *                   mulq	(%rax)
+# CHECK-NEXT:  1      1     0.33                        negb	%dil
+# CHECK-NEXT:  3      7     1.00    *      *            negb	(%r8)
+# CHECK-NEXT:  1      1     0.33                        negw	%si
+# CHECK-NEXT:  3      7     1.00    *      *            negw	(%r9)
+# CHECK-NEXT:  1      1     0.33                        negl	%edx
+# CHECK-NEXT:  3      7     1.00    *      *            negl	(%rax)
+# CHECK-NEXT:  1      1     0.33                        negq	%rcx
+# CHECK-NEXT:  3      7     1.00    *      *            negq	(%r10)
+# CHECK-NEXT:  1      1     0.25                        nop
+# CHECK-NEXT:  1      1     0.25                        nopw	%di
+# CHECK-NEXT:  1      1     0.25                        nopw	(%rcx)
+# CHECK-NEXT:  1      1     0.25                        nopl	%esi
+# CHECK-NEXT:  1      1     0.25                        nopl	(%r8)
+# CHECK-NEXT:  1      1     0.25                        nopq	%rdx
+# CHECK-NEXT:  1      1     0.25                        nopq	(%r9)
+# CHECK-NEXT:  1      1     0.33                        notb	%dil
+# CHECK-NEXT:  3      7     1.00    *      *            notb	(%r8)
+# CHECK-NEXT:  1      1     0.33                        notw	%si
+# CHECK-NEXT:  3      7     1.00    *      *            notw	(%r9)
+# CHECK-NEXT:  1      1     0.33                        notl	%edx
+# CHECK-NEXT:  3      7     1.00    *      *            notl	(%rax)
+# CHECK-NEXT:  1      1     0.33                        notq	%rcx
+# CHECK-NEXT:  3      7     1.00    *      *            notq	(%r10)
+# CHECK-NEXT:  1      1     0.33                        orb	$7, %al
+# CHECK-NEXT:  1      1     0.33                        orb	$7, %dil
+# CHECK-NEXT:  3      7     1.00    *      *            orb	$7, (%rax)
+# CHECK-NEXT:  1      1     0.33                        orb	%sil, %dil
+# CHECK-NEXT:  3      7     1.00    *      *            orb	%sil, (%rax)
+# CHECK-NEXT:  2      6     0.50    *                   orb	(%rax), %dil
+# CHECK-NEXT:  1      1     0.33                        orw	$511, %ax
+# CHECK-NEXT:  1      1     0.33                        orw	$511, %di
+# CHECK-NEXT:  3      7     1.00    *      *            orw	$511, (%rax)
+# CHECK-NEXT:  1      1     0.33                        orw	$7, %di
+# CHECK-NEXT:  3      7     1.00    *      *            orw	$7, (%rax)
+# CHECK-NEXT:  1      1     0.33                        orw	%si, %di
+# CHECK-NEXT:  3      7     1.00    *      *            orw	%si, (%rax)
+# CHECK-NEXT:  2      6     0.50    *                   orw	(%rax), %di
+# CHECK-NEXT:  1      1     0.33                        orl	$665536, %eax
+# CHECK-NEXT:  1      1     0.33                        orl	$665536, %edi
+# CHECK-NEXT:  3      7     1.00    *      *            orl	$665536, (%rax)
+# CHECK-NEXT:  1      1     0.33                        orl	$7, %edi
+# CHECK-NEXT:  3      7     1.00    *      *            orl	$7, (%rax)
+# CHECK-NEXT:  1      1     0.33                        orl	%esi, %edi
+# CHECK-NEXT:  3      7     1.00    *      *            orl	%esi, (%rax)
+# CHECK-NEXT:  2      6     0.50    *                   orl	(%rax), %edi
+# CHECK-NEXT:  1      1     0.33                        orq	$665536, %rax
+# CHECK-NEXT:  1      1     0.33                        orq	$665536, %rdi
+# CHECK-NEXT:  3      7     1.00    *      *            orq	$665536, (%rax)
+# CHECK-NEXT:  1      1     0.33                        orq	$7, %rdi
+# CHECK-NEXT:  3      7     1.00    *      *            orq	$7, (%rax)
+# CHECK-NEXT:  1      1     0.33                        orq	%rsi, %rdi
+# CHECK-NEXT:  3      7     1.00    *      *            orq	%rsi, (%rax)
+# CHECK-NEXT:  2      6     0.50    *                   orq	(%rax), %rdi
+# CHECK-NEXT:  1      100   0.33                  U     outb	%al, $7
+# CHECK-NEXT:  1      100   0.33                  U     outb	%al, %dx
+# CHECK-NEXT:  1      100   0.33                  U     outw	%ax, $7
+# CHECK-NEXT:  1      100   0.33                  U     outw	%ax, %dx
+# CHECK-NEXT:  1      100   0.33                  U     outl	%eax, $7
+# CHECK-NEXT:  1      100   0.33                  U     outl	%eax, %dx
+# CHECK-NEXT:  1      100   0.33                  U     outsb	(%rsi), %dx
+# CHECK-NEXT:  1      100   0.33                  U     outsw	(%rsi), %dx
+# CHECK-NEXT:  1      100   0.33                  U     outsl	(%rsi), %dx
+# CHECK-NEXT:  4      4     1.33    *      *      U     pause
+# CHECK-NEXT:  3      2     1.50                        rclb	%dil
+# CHECK-NEXT:  3      2     1.50                        rcrb	%dil
+# CHECK-NEXT:  11     11    3.50           *            rclb	(%rax)
+# CHECK-NEXT:  11     11    3.50           *            rcrb	(%rax)
+# CHECK-NEXT:  8      5     4.00                        rclb	$7, %dil
+# CHECK-NEXT:  8      5     4.00                        rcrb	$7, %dil
+# CHECK-NEXT:  11     11    3.50           *            rclb	$7, (%rax)
+# CHECK-NEXT:  11     11    3.50           *            rcrb	$7, (%rax)
+# CHECK-NEXT:  8      5     4.00                        rclb	%cl, %dil
+# CHECK-NEXT:  8      5     4.00                        rcrb	%cl, %dil
+# CHECK-NEXT:  11     11    3.50           *            rclb	%cl, (%rax)
+# CHECK-NEXT:  11     11    3.50           *            rcrb	%cl, (%rax)
+# CHECK-NEXT:  3      2     1.50                        rclw	%di
+# CHECK-NEXT:  3      2     1.50                        rcrw	%di
+# CHECK-NEXT:  11     11    3.50           *            rclw	(%rax)
+# CHECK-NEXT:  11     11    3.50           *            rcrw	(%rax)
+# CHECK-NEXT:  8      5     4.00                        rclw	$7, %di
+# CHECK-NEXT:  8      5     4.00                        rcrw	$7, %di
+# CHECK-NEXT:  11     11    3.50           *            rclw	$7, (%rax)
+# CHECK-NEXT:  11     11    3.50           *            rcrw	$7, (%rax)
+# CHECK-NEXT:  8      5     4.00                        rclw	%cl, %di
+# CHECK-NEXT:  8      5     4.00                        rcrw	%cl, %di
+# CHECK-NEXT:  11     11    3.50           *            rclw	%cl, (%rax)
+# CHECK-NEXT:  11     11    3.50           *            rcrw	%cl, (%rax)
+# CHECK-NEXT:  3      2     1.50                        rcll	%edi
+# CHECK-NEXT:  3      2     1.50                        rcrl	%edi
+# CHECK-NEXT:  11     11    3.50           *            rcll	(%rax)
+# CHECK-NEXT:  11     11    3.50           *            rcrl	(%rax)
+# CHECK-NEXT:  8      5     4.00                        rcll	$7, %edi
+# CHECK-NEXT:  8      5     4.00                        rcrl	$7, %edi
+# CHECK-NEXT:  11     11    3.50           *            rcll	$7, (%rax)
+# CHECK-NEXT:  11     11    3.50           *            rcrl	$7, (%rax)
+# CHECK-NEXT:  8      5     4.00                        rcll	%cl, %edi
+# CHECK-NEXT:  8      5     4.00                        rcrl	%cl, %edi
+# CHECK-NEXT:  11     11    3.50           *            rcll	%cl, (%rax)
+# CHECK-NEXT:  11     11    3.50           *            rcrl	%cl, (%rax)
+# CHECK-NEXT:  3      2     1.50                        rclq	%rdi
+# CHECK-NEXT:  3      2     1.50                        rcrq	%rdi
+# CHECK-NEXT:  11     11    3.50           *            rclq	(%rax)
+# CHECK-NEXT:  11     11    3.50           *            rcrq	(%rax)
+# CHECK-NEXT:  8      5     4.00                        rclq	$7, %rdi
+# CHECK-NEXT:  8      5     4.00                        rcrq	$7, %rdi
+# CHECK-NEXT:  11     11    3.50           *            rclq	$7, (%rax)
+# CHECK-NEXT:  11     11    3.50           *            rcrq	$7, (%rax)
+# CHECK-NEXT:  8      5     4.00                        rclq	%cl, %rdi
+# CHECK-NEXT:  8      5     4.00                        rcrq	%cl, %rdi
+# CHECK-NEXT:  11     11    3.50           *            rclq	%cl, (%rax)
+# CHECK-NEXT:  11     11    3.50           *            rcrq	%cl, (%rax)
+# CHECK-NEXT:  2      2     1.00                        rolb	%dil
+# CHECK-NEXT:  2      2     1.00                        rorb	%dil
+# CHECK-NEXT:  5      8     1.00    *      *            rolb	(%rax)
+# CHECK-NEXT:  5      8     1.00    *      *            rorb	(%rax)
+# CHECK-NEXT:  2      2     1.00                        rolb	$7, %dil
+# CHECK-NEXT:  2      2     1.00                        rorb	$7, %dil
+# CHECK-NEXT:  5      8     1.00    *      *            rolb	$7, (%rax)
+# CHECK-NEXT:  5      8     1.00    *      *            rorb	$7, (%rax)
+# CHECK-NEXT:  3      3     1.50                        rolb	%cl, %dil
+# CHECK-NEXT:  3      3     1.50                        rorb	%cl, %dil
+# CHECK-NEXT:  6      9     1.50    *      *            rolb	%cl, (%rax)
+# CHECK-NEXT:  6      9     1.50    *      *            rorb	%cl, (%rax)
+# CHECK-NEXT:  2      2     1.00                        rolw	%di
+# CHECK-NEXT:  2      2     1.00                        rorw	%di
+# CHECK-NEXT:  5      8     1.00    *      *            rolw	(%rax)
+# CHECK-NEXT:  5      8     1.00    *      *            rorw	(%rax)
+# CHECK-NEXT:  2      2     1.00                        rolw	$7, %di
+# CHECK-NEXT:  2      2     1.00                        rorw	$7, %di
+# CHECK-NEXT:  5      8     1.00    *      *            rolw	$7, (%rax)
+# CHECK-NEXT:  5      8     1.00    *      *            rorw	$7, (%rax)
+# CHECK-NEXT:  3      3     1.50                        rolw	%cl, %di
+# CHECK-NEXT:  3      3     1.50                        rorw	%cl, %di
+# CHECK-NEXT:  6      9     1.50    *      *            rolw	%cl, (%rax)
+# CHECK-NEXT:  6      9     1.50    *      *            rorw	%cl, (%rax)
+# CHECK-NEXT:  2      2     1.00                        roll	%edi
+# CHECK-NEXT:  2      2     1.00                        rorl	%edi
+# CHECK-NEXT:  5      8     1.00    *      *            roll	(%rax)
+# CHECK-NEXT:  5      8     1.00    *      *            rorl	(%rax)
+# CHECK-NEXT:  2      2     1.00                        roll	$7, %edi
+# CHECK-NEXT:  2      2     1.00                        rorl	$7, %edi
+# CHECK-NEXT:  5      8     1.00    *      *            roll	$7, (%rax)
+# CHECK-NEXT:  5      8     1.00    *      *            rorl	$7, (%rax)
+# CHECK-NEXT:  3      3     1.50                        roll	%cl, %edi
+# CHECK-NEXT:  3      3     1.50                        rorl	%cl, %edi
+# CHECK-NEXT:  6      9     1.50    *      *            roll	%cl, (%rax)
+# CHECK-NEXT:  6      9     1.50    *      *            rorl	%cl, (%rax)
+# CHECK-NEXT:  2      2     1.00                        rolq	%rdi
+# CHECK-NEXT:  2      2     1.00                        rorq	%rdi
+# CHECK-NEXT:  5      8     1.00    *      *            rolq	(%rax)
+# CHECK-NEXT:  5      8     1.00    *      *            rorq	(%rax)
+# CHECK-NEXT:  2      2     1.00                        rolq	$7, %rdi
+# CHECK-NEXT:  2      2     1.00                        rorq	$7, %rdi
+# CHECK-NEXT:  5      8     1.00    *      *            rolq	$7, (%rax)
+# CHECK-NEXT:  5      8     1.00    *      *            rorq	$7, (%rax)
+# CHECK-NEXT:  3      3     1.50                        rolq	%cl, %rdi
+# CHECK-NEXT:  3      3     1.50                        rorq	%cl, %rdi
+# CHECK-NEXT:  6      9     1.50    *      *            rolq	%cl, (%rax)
+# CHECK-NEXT:  6      9     1.50    *      *            rorq	%cl, (%rax)
+# CHECK-NEXT:  1      1     0.50                        sahf
+# CHECK-NEXT:  1      1     0.50                        sarb	%dil
+# CHECK-NEXT:  1      1     0.50                        shlb	%dil
+# CHECK-NEXT:  1      1     0.50                        shrb	%dil
+# CHECK-NEXT:  4      7     1.00    *      *            sarb	(%rax)
+# CHECK-NEXT:  4      7     1.00    *      *            shlb	(%rax)
+# CHECK-NEXT:  4      7     1.00    *      *            shrb	(%rax)
+# CHECK-NEXT:  1      1     0.50                        sarb	$7, %dil
+# CHECK-NEXT:  1      1     0.50                        shlb	$7, %dil
+# CHECK-NEXT:  1      1     0.50                        shrb	$7, %dil
+# CHECK-NEXT:  4      7     1.00    *      *            sarb	$7, (%rax)
+# CHECK-NEXT:  4      7     1.00    *      *            shlb	$7, (%rax)
+# CHECK-NEXT:  4      7     1.00    *      *            shrb	$7, (%rax)
+# CHECK-NEXT:  3      3     1.50                        sarb	%cl, %dil
+# CHECK-NEXT:  3      3     1.50                        shlb	%cl, %dil
+# CHECK-NEXT:  3      3     1.50                        shrb	%cl, %dil
+# CHECK-NEXT:  6      9     1.50    *      *            sarb	%cl, (%rax)
+# CHECK-NEXT:  6      9     1.50    *      *            shlb	%cl, (%rax)
+# CHECK-NEXT:  6      9     1.50    *      *            shrb	%cl, (%rax)
+# CHECK-NEXT:  1      1     0.50                        sarw	%di
+# CHECK-NEXT:  1      1     0.50                        shlw	%di
+# CHECK-NEXT:  1      1     0.50                        shrw	%di
+# CHECK-NEXT:  4      7     1.00    *      *            sarw	(%rax)
+# CHECK-NEXT:  4      7     1.00    *      *            shlw	(%rax)
+# CHECK-NEXT:  4      7     1.00    *      *            shrw	(%rax)
+# CHECK-NEXT:  1      1     0.50                        sarw	$7, %di
+# CHECK-NEXT:  1      1     0.50                        shlw	$7, %di
+# CHECK-NEXT:  1      1     0.50                        shrw	$7, %di
+# CHECK-NEXT:  4      7     1.00    *      *            sarw	$7, (%rax)
+# CHECK-NEXT:  4      7     1.00    *      *            shlw	$7, (%rax)
+# CHECK-NEXT:  4      7     1.00    *      *            shrw	$7, (%rax)
+# CHECK-NEXT:  3      3     1.50                        sarw	%cl, %di
+# CHECK-NEXT:  3      3     1.50                        shlw	%cl, %di
+# CHECK-NEXT:  3      3     1.50                        shrw	%cl, %di
+# CHECK-NEXT:  6      9     1.50    *      *            sarw	%cl, (%rax)
+# CHECK-NEXT:  6      9     1.50    *      *            shlw	%cl, (%rax)
+# CHECK-NEXT:  6      9     1.50    *      *            shrw	%cl, (%rax)
+# CHECK-NEXT:  1      1     0.50                        sarl	%edi
+# CHECK-NEXT:  1      1     0.50                        shll	%edi
+# CHECK-NEXT:  1      1     0.50                        shrl	%edi
+# CHECK-NEXT:  4      7     1.00    *      *            sarl	(%rax)
+# CHECK-NEXT:  4      7     1.00    *      *            shll	(%rax)
+# CHECK-NEXT:  4      7     1.00    *      *            shrl	(%rax)
+# CHECK-NEXT:  1      1     0.50                        sarl	$7, %edi
+# CHECK-NEXT:  1      1     0.50                        shll	$7, %edi
+# CHECK-NEXT:  1      1     0.50                        shrl	$7, %edi
+# CHECK-NEXT:  4      7     1.00    *      *            sarl	$7, (%rax)
+# CHECK-NEXT:  4      7     1.00    *      *            shll	$7, (%rax)
+# CHECK-NEXT:  4      7     1.00    *      *            shrl	$7, (%rax)
+# CHECK-NEXT:  3      3     1.50                        sarl	%cl, %edi
+# CHECK-NEXT:  3      3     1.50                        shll	%cl, %edi
+# CHECK-NEXT:  3      3     1.50                        shrl	%cl, %edi
+# CHECK-NEXT:  6      9     1.50    *      *            sarl	%cl, (%rax)
+# CHECK-NEXT:  6      9     1.50    *      *            shll	%cl, (%rax)
+# CHECK-NEXT:  6      9     1.50    *      *            shrl	%cl, (%rax)
+# CHECK-NEXT:  1      1     0.50                        sarq	%rdi
+# CHECK-NEXT:  1      1     0.50                        shlq	%rdi
+# CHECK-NEXT:  1      1     0.50                        shrq	%rdi
+# CHECK-NEXT:  4      7     1.00    *      *            sarq	(%rax)
+# CHECK-NEXT:  4      7     1.00    *      *            shlq	(%rax)
+# CHECK-NEXT:  4      7     1.00    *      *            shrq	(%rax)
+# CHECK-NEXT:  1      1     0.50                        sarq	$7, %rdi
+# CHECK-NEXT:  1      1     0.50                        shlq	$7, %rdi
+# CHECK-NEXT:  1      1     0.50                        shrq	$7, %rdi
+# CHECK-NEXT:  4      7     1.00    *      *            sarq	$7, (%rax)
+# CHECK-NEXT:  4      7     1.00    *      *            shlq	$7, (%rax)
+# CHECK-NEXT:  4      7     1.00    *      *            shrq	$7, (%rax)
+# CHECK-NEXT:  3      3     1.50                        sarq	%cl, %rdi
+# CHECK-NEXT:  3      3     1.50                        shlq	%cl, %rdi
+# CHECK-NEXT:  3      3     1.50                        shrq	%cl, %rdi
+# CHECK-NEXT:  6      9     1.50    *      *            sarq	%cl, (%rax)
+# CHECK-NEXT:  6      9     1.50    *      *            shlq	%cl, (%rax)
+# CHECK-NEXT:  6      9     1.50    *      *            shrq	%cl, (%rax)
+# CHECK-NEXT:  2      2     0.67                        sbbb	$7, %al
+# CHECK-NEXT:  2      2     0.67                        sbbb	$7, %dil
+# CHECK-NEXT:  6      9     1.00    *      *            sbbb	$7, (%rax)
+# CHECK-NEXT:  2      2     0.67                        sbbb	%sil, %dil
+# CHECK-NEXT:  6      9     1.00    *      *            sbbb	%sil, (%rax)
+# CHECK-NEXT:  3      7     0.67    *                   sbbb	(%rax), %dil
+# CHECK-NEXT:  2      2     0.67                        sbbw	$511, %ax
+# CHECK-NEXT:  2      2     0.67                        sbbw	$511, %di
+# CHECK-NEXT:  6      9     1.00    *      *            sbbw	$511, (%rax)
+# CHECK-NEXT:  2      2     0.67                        sbbw	$7, %di
+# CHECK-NEXT:  6      9     1.00    *      *            sbbw	$7, (%rax)
+# CHECK-NEXT:  2      2     0.67                        sbbw	%si, %di
+# CHECK-NEXT:  6      9     1.00    *      *            sbbw	%si, (%rax)
+# CHECK-NEXT:  3      7     0.67    *                   sbbw	(%rax), %di
+# CHECK-NEXT:  2      2     0.67                        sbbl	$665536, %eax
+# CHECK-NEXT:  2      2     0.67                        sbbl	$665536, %edi
+# CHECK-NEXT:  6      9     1.00    *      *            sbbl	$665536, (%rax)
+# CHECK-NEXT:  2      2     0.67                        sbbl	$7, %edi
+# CHECK-NEXT:  6      9     1.00    *      *            sbbl	$7, (%rax)
+# CHECK-NEXT:  2      2     0.67                        sbbl	%esi, %edi
+# CHECK-NEXT:  6      9     1.00    *      *            sbbl	%esi, (%rax)
+# CHECK-NEXT:  3      7     0.67    *                   sbbl	(%rax), %edi
+# CHECK-NEXT:  2      2     0.67                        sbbq	$665536, %rax
+# CHECK-NEXT:  2      2     0.67                        sbbq	$665536, %rdi
+# CHECK-NEXT:  6      9     1.00    *      *            sbbq	$665536, (%rax)
+# CHECK-NEXT:  2      2     0.67                        sbbq	$7, %rdi
+# CHECK-NEXT:  6      9     1.00    *      *            sbbq	$7, (%rax)
+# CHECK-NEXT:  2      2     0.67                        sbbq	%rsi, %rdi
+# CHECK-NEXT:  6      9     1.00    *      *            sbbq	%rsi, (%rax)
+# CHECK-NEXT:  3      7     0.67    *                   sbbq	(%rax), %rdi
+# CHECK-NEXT:  2      2     0.67                  U     scasb	%es:(%rdi), %al
+# CHECK-NEXT:  2      2     0.67                  U     scasw	%es:(%rdi), %ax
+# CHECK-NEXT:  2      2     0.67                  U     scasl	%es:(%rdi), %eax
+# CHECK-NEXT:  2      2     0.67                  U     scasq	%es:(%rdi), %rax
+# CHECK-NEXT:  1      1     0.50                        seto	%al
+# CHECK-NEXT:  3      2     1.00           *            seto	(%rax)
+# CHECK-NEXT:  1      1     0.50                        setno	%al
+# CHECK-NEXT:  3      2     1.00           *            setno	(%rax)
+# CHECK-NEXT:  1      1     0.50                        setb	%al
+# CHECK-NEXT:  3      2     1.00           *            setb	(%rax)
+# CHECK-NEXT:  1      1     0.50                        setae	%al
+# CHECK-NEXT:  3      2     1.00           *            setae	(%rax)
+# CHECK-NEXT:  1      1     0.50                        sete	%al
+# CHECK-NEXT:  3      2     1.00           *            sete	(%rax)
+# CHECK-NEXT:  1      1     0.50                        setne	%al
+# CHECK-NEXT:  3      2     1.00           *            setne	(%rax)
+# CHECK-NEXT:  2      2     1.00                        seta	%al
+# CHECK-NEXT:  4      3     1.00           *            seta	(%rax)
+# CHECK-NEXT:  2      2     1.00                        setbe	%al
+# CHECK-NEXT:  4      3     1.00           *            setbe	(%rax)
+# CHECK-NEXT:  1      1     0.50                        sets	%al
+# CHECK-NEXT:  3      2     1.00           *            sets	(%rax)
+# CHECK-NEXT:  1      1     0.50                        setns	%al
+# CHECK-NEXT:  3      2     1.00           *            setns	(%rax)
+# CHECK-NEXT:  1      1     0.50                        setp	%al
+# CHECK-NEXT:  3      2     1.00           *            setp	(%rax)
+# CHECK-NEXT:  1      1     0.50                        setnp	%al
+# CHECK-NEXT:  3      2     1.00           *            setnp	(%rax)
+# CHECK-NEXT:  1      1     0.50                        setl	%al
+# CHECK-NEXT:  3      2     1.00           *            setl	(%rax)
+# CHECK-NEXT:  1      1     0.50                        setge	%al
+# CHECK-NEXT:  3      2     1.00           *            setge	(%rax)
+# CHECK-NEXT:  1      1     0.50                        setg	%al
+# CHECK-NEXT:  3      2     1.00           *            setg	(%rax)
+# CHECK-NEXT:  1      1     0.50                        setle	%al
+# CHECK-NEXT:  3      2     1.00           *            setle	(%rax)
+# CHECK-NEXT:  4      4     1.50                        shldw	%cl, %si, %di
+# CHECK-NEXT:  4      4     1.50                        shrdw	%cl, %si, %di
+# CHECK-NEXT:  7      10    1.50    *      *            shldw	%cl, %si, (%rax)
+# CHECK-NEXT:  7      10    1.50    *      *            shrdw	%cl, %si, (%rax)
+# CHECK-NEXT:  2      2     0.67                        shldw	$7, %si, %di
+# CHECK-NEXT:  2      2     0.67                        shrdw	$7, %si, %di
+# CHECK-NEXT:  5      8     1.00    *      *            shldw	$7, %si, (%rax)
+# CHECK-NEXT:  5      8     1.00    *      *            shrdw	$7, %si, (%rax)
+# CHECK-NEXT:  4      4     1.50                        shldl	%cl, %esi, %edi
+# CHECK-NEXT:  4      4     1.50                        shrdl	%cl, %esi, %edi
+# CHECK-NEXT:  7      10    1.50    *      *            shldl	%cl, %esi, (%rax)
+# CHECK-NEXT:  7      10    1.50    *      *            shrdl	%cl, %esi, (%rax)
+# CHECK-NEXT:  2      2     0.67                        shldl	$7, %esi, %edi
+# CHECK-NEXT:  2      2     0.67                        shrdl	$7, %esi, %edi
+# CHECK-NEXT:  5      8     1.00    *      *            shldl	$7, %esi, (%rax)
+# CHECK-NEXT:  5      8     1.00    *      *            shrdl	$7, %esi, (%rax)
+# CHECK-NEXT:  4      4     1.50                        shldq	%cl, %rsi, %rdi
+# CHECK-NEXT:  4      4     1.50                        shrdq	%cl, %rsi, %rdi
+# CHECK-NEXT:  7      10    1.50    *      *            shldq	%cl, %rsi, (%rax)
+# CHECK-NEXT:  7      10    1.50    *      *            shrdq	%cl, %rsi, (%rax)
+# CHECK-NEXT:  2      2     0.67                        shldq	$7, %rsi, %rdi
+# CHECK-NEXT:  2      2     0.67                        shrdq	$7, %rsi, %rdi
+# CHECK-NEXT:  5      8     1.00    *      *            shldq	$7, %rsi, (%rax)
+# CHECK-NEXT:  5      8     1.00    *      *            shrdq	$7, %rsi, (%rax)
+# CHECK-NEXT:  1      1     0.33                  U     stc
+# CHECK-NEXT:  1      1     0.33                  U     std
+# CHECK-NEXT:  3      5     1.00                  U     stosb	%al, %es:(%rdi)
+# CHECK-NEXT:  3      5     1.00                  U     stosw	%ax, %es:(%rdi)
+# CHECK-NEXT:  3      5     1.00                  U     stosl	%eax, %es:(%rdi)
+# CHECK-NEXT:  3      5     1.00                  U     stosq	%rax, %es:(%rdi)
+# CHECK-NEXT:  1      1     0.33                        subb	$7, %al
+# CHECK-NEXT:  1      1     0.33                        subb	$7, %dil
+# CHECK-NEXT:  3      7     1.00    *      *            subb	$7, (%rax)
+# CHECK-NEXT:  1      1     0.33                        subb	%sil, %dil
+# CHECK-NEXT:  3      7     1.00    *      *            subb	%sil, (%rax)
+# CHECK-NEXT:  2      6     0.50    *                   subb	(%rax), %dil
+# CHECK-NEXT:  1      1     0.33                        subw	$511, %ax
+# CHECK-NEXT:  1      1     0.33                        subw	$511, %di
+# CHECK-NEXT:  3      7     1.00    *      *            subw	$511, (%rax)
+# CHECK-NEXT:  1      1     0.33                        subw	$7, %di
+# CHECK-NEXT:  3      7     1.00    *      *            subw	$7, (%rax)
+# CHECK-NEXT:  1      1     0.33                        subw	%si, %di
+# CHECK-NEXT:  3      7     1.00    *      *            subw	%si, (%rax)
+# CHECK-NEXT:  2      6     0.50    *                   subw	(%rax), %di
+# CHECK-NEXT:  1      1     0.33                        subl	$665536, %eax
+# CHECK-NEXT:  1      1     0.33                        subl	$665536, %edi
+# CHECK-NEXT:  3      7     1.00    *      *            subl	$665536, (%rax)
+# CHECK-NEXT:  1      1     0.33                        subl	$7, %edi
+# CHECK-NEXT:  3      7     1.00    *      *            subl	$7, (%rax)
+# CHECK-NEXT:  1      1     0.33                        subl	%esi, %edi
+# CHECK-NEXT:  3      7     1.00    *      *            subl	%esi, (%rax)
+# CHECK-NEXT:  2      6     0.50    *                   subl	(%rax), %edi
+# CHECK-NEXT:  1      1     0.33                        subq	$665536, %rax
+# CHECK-NEXT:  1      1     0.33                        subq	$665536, %rdi
+# CHECK-NEXT:  3      7     1.00    *      *            subq	$665536, (%rax)
+# CHECK-NEXT:  1      1     0.33                        subq	$7, %rdi
+# CHECK-NEXT:  3      7     1.00    *      *            subq	$7, (%rax)
+# CHECK-NEXT:  1      1     0.33                        subq	%rsi, %rdi
+# CHECK-NEXT:  3      7     1.00    *      *            subq	%rsi, (%rax)
+# CHECK-NEXT:  2      6     0.50    *                   subq	(%rax), %rdi
+# CHECK-NEXT:  1      1     0.33                        testb	$7, %al
+# CHECK-NEXT:  1      1     0.33                        testb	$7, %dil
+# CHECK-NEXT:  2      6     0.50    *                   testb	$7, (%rax)
+# CHECK-NEXT:  1      1     0.33                        testb	%sil, %dil
+# CHECK-NEXT:  2      6     0.50    *                   testb	%sil, (%rax)
+# CHECK-NEXT:  1      1     0.33                        testw	$511, %ax
+# CHECK-NEXT:  1      1     0.33                        testw	$511, %di
+# CHECK-NEXT:  2      6     0.50    *                   testw	$511, (%rax)
+# CHECK-NEXT:  1      1     0.33                        testw	$7, %di
+# CHECK-NEXT:  2      6     0.50    *                   testw	$7, (%rax)
+# CHECK-NEXT:  1      1     0.33                        testw	%si, %di
+# CHECK-NEXT:  2      6     0.50    *                   testw	%si, (%rax)
+# CHECK-NEXT:  1      1     0.33                        testl	$665536, %eax
+# CHECK-NEXT:  1      1     0.33                        testl	$665536, %edi
+# CHECK-NEXT:  2      6     0.50    *                   testl	$665536, (%rax)
+# CHECK-NEXT:  1      1     0.33                        testl	$7, %edi
+# CHECK-NEXT:  2      6     0.50    *                   testl	$7, (%rax)
+# CHECK-NEXT:  1      1     0.33                        testl	%esi, %edi
+# CHECK-NEXT:  2      6     0.50    *                   testl	%esi, (%rax)
+# CHECK-NEXT:  1      1     0.33                        testq	$665536, %rax
+# CHECK-NEXT:  1      1     0.33                        testq	$665536, %rdi
+# CHECK-NEXT:  2      6     0.50    *                   testq	$665536, (%rax)
+# CHECK-NEXT:  1      1     0.33                        testq	$7, %rdi
+# CHECK-NEXT:  2      6     0.50    *                   testq	$7, (%rax)
+# CHECK-NEXT:  1      1     0.33                        testq	%rsi, %rdi
+# CHECK-NEXT:  2      6     0.50    *                   testq	%rsi, (%rax)
+# CHECK-NEXT:  1      100   0.33    *             U     ud2
+# CHECK-NEXT:  3      2     1.00                        xaddb	%bl, %cl
+# CHECK-NEXT:  5      8     1.00    *      *            xaddb	%bl, (%rcx)
+# CHECK-NEXT:  3      2     1.00                        xaddw	%bx, %cx
+# CHECK-NEXT:  5      8     1.00    *      *            xaddw	%ax, (%rbx)
+# CHECK-NEXT:  3      2     1.00                        xaddl	%ebx, %ecx
+# CHECK-NEXT:  5      8     1.00    *      *            xaddl	%eax, (%rbx)
+# CHECK-NEXT:  3      2     1.00                        xaddq	%rbx, %rcx
+# CHECK-NEXT:  5      8     1.00    *      *            xaddq	%rax, (%rbx)
+# CHECK-NEXT:  3      2     1.00                        xchgb	%bl, %cl
+# CHECK-NEXT:  3      6     1.00    *      *            xchgb	%bl, (%rbx)
+# CHECK-NEXT:  3      2     1.00                        xchgw	%bx, %ax
+# CHECK-NEXT:  3      2     1.00                        xchgw	%bx, %cx
+# CHECK-NEXT:  3      6     1.00    *      *            xchgw	%ax, (%rbx)
+# CHECK-NEXT:  3      2     1.00                        xchgl	%ebx, %eax
+# CHECK-NEXT:  3      2     1.00                        xchgl	%ebx, %ecx
+# CHECK-NEXT:  3      6     1.00    *      *            xchgl	%eax, (%rbx)
+# CHECK-NEXT:  3      2     1.00                        xchgq	%rbx, %rax
+# CHECK-NEXT:  3      2     1.00                        xchgq	%rbx, %rcx
+# CHECK-NEXT:  3      6     1.00    *      *            xchgq	%rax, (%rbx)
+# CHECK-NEXT:  1      5     0.50    *                   xlatb
+# CHECK-NEXT:  1      1     0.33                        xorb	$7, %al
+# CHECK-NEXT:  1      1     0.33                        xorb	$7, %dil
+# CHECK-NEXT:  3      7     1.00    *      *            xorb	$7, (%rax)
+# CHECK-NEXT:  1      1     0.33                        xorb	%sil, %dil
+# CHECK-NEXT:  3      7     1.00    *      *            xorb	%sil, (%rax)
+# CHECK-NEXT:  2      6     0.50    *                   xorb	(%rax), %dil
+# CHECK-NEXT:  1      1     0.33                        xorw	$511, %ax
+# CHECK-NEXT:  1      1     0.33                        xorw	$511, %di
+# CHECK-NEXT:  3      7     1.00    *      *            xorw	$511, (%rax)
+# CHECK-NEXT:  1      1     0.33                        xorw	$7, %di
+# CHECK-NEXT:  3      7     1.00    *      *            xorw	$7, (%rax)
+# CHECK-NEXT:  1      1     0.33                        xorw	%si, %di
+# CHECK-NEXT:  3      7     1.00    *      *            xorw	%si, (%rax)
+# CHECK-NEXT:  2      6     0.50    *                   xorw	(%rax), %di
+# CHECK-NEXT:  1      1     0.33                        xorl	$665536, %eax
+# CHECK-NEXT:  1      1     0.33                        xorl	$665536, %edi
+# CHECK-NEXT:  3      7     1.00    *      *            xorl	$665536, (%rax)
+# CHECK-NEXT:  1      1     0.33                        xorl	$7, %edi
+# CHECK-NEXT:  3      7     1.00    *      *            xorl	$7, (%rax)
+# CHECK-NEXT:  1      1     0.33                        xorl	%esi, %edi
+# CHECK-NEXT:  3      7     1.00    *      *            xorl	%esi, (%rax)
+# CHECK-NEXT:  2      6     0.50    *                   xorl	(%rax), %edi
+# CHECK-NEXT:  1      1     0.33                        xorq	$665536, %rax
+# CHECK-NEXT:  1      1     0.33                        xorq	$665536, %rdi
+# CHECK-NEXT:  3      7     1.00    *      *            xorq	$665536, (%rax)
+# CHECK-NEXT:  1      1     0.33                        xorq	$7, %rdi
+# CHECK-NEXT:  3      7     1.00    *      *            xorq	$7, (%rax)
+# CHECK-NEXT:  1      1     0.33                        xorq	%rsi, %rdi
+# CHECK-NEXT:  3      7     1.00    *      *            xorq	%rsi, (%rax)
+# CHECK-NEXT:  2      6     0.50    *                   xorq	(%rax), %rdi
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT: 160.00  -     571.83 221.33 222.00 571.83 316.00 316.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     adcb	$7, %al
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     adcb	$7, %dil
+# CHECK-NEXT:  -      -     1.00   1.00   1.00   1.00   1.00   1.00   adcb	$7, (%rax)
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     adcb	%sil, %dil
+# CHECK-NEXT:  -      -     1.33   0.33   1.00   1.33   1.00   1.00   adcb	%sil, (%rax)
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   adcb	(%rax), %dil
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     adcw	$511, %ax
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     adcw	$511, %di
+# CHECK-NEXT:  -      -     1.00   1.00   1.00   1.00   1.00   1.00   adcw	$511, (%rax)
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     adcw	$7, %di
+# CHECK-NEXT:  -      -     1.00   1.00   1.00   1.00   1.00   1.00   adcw	$7, (%rax)
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     adcw	%si, %di
+# CHECK-NEXT:  -      -     1.33   0.33   1.00   1.33   1.00   1.00   adcw	%si, (%rax)
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   adcw	(%rax), %di
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     adcl	$665536, %eax
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     adcl	$665536, %edi
+# CHECK-NEXT:  -      -     1.00   1.00   1.00   1.00   1.00   1.00   adcl	$665536, (%rax)
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     adcl	$7, %edi
+# CHECK-NEXT:  -      -     1.00   1.00   1.00   1.00   1.00   1.00   adcl	$7, (%rax)
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     adcl	%esi, %edi
+# CHECK-NEXT:  -      -     1.33   0.33   1.00   1.33   1.00   1.00   adcl	%esi, (%rax)
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   adcl	(%rax), %edi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     adcq	$665536, %rax
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     adcq	$665536, %rdi
+# CHECK-NEXT:  -      -     1.00   1.00   1.00   1.00   1.00   1.00   adcq	$665536, (%rax)
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     adcq	$7, %rdi
+# CHECK-NEXT:  -      -     1.00   1.00   1.00   1.00   1.00   1.00   adcq	$7, (%rax)
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     adcq	%rsi, %rdi
+# CHECK-NEXT:  -      -     1.33   0.33   1.00   1.33   1.00   1.00   adcq	%rsi, (%rax)
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   adcq	(%rax), %rdi
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     addb	$7, %al
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     addb	$7, %dil
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   addb	$7, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     addb	%sil, %dil
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   addb	%sil, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   addb	(%rax), %dil
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     addw	$511, %ax
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     addw	$511, %di
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   addw	$511, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     addw	$7, %di
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   addw	$7, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     addw	%si, %di
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   addw	%si, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   addw	(%rax), %di
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     addl	$665536, %eax
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     addl	$665536, %edi
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   addl	$665536, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     addl	$7, %edi
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   addl	$7, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     addl	%esi, %edi
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   addl	%esi, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   addl	(%rax), %edi
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     addq	$665536, %rax
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     addq	$665536, %rdi
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   addq	$665536, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     addq	$7, %rdi
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   addq	$7, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     addq	%rsi, %rdi
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   addq	%rsi, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   addq	(%rax), %rdi
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     andb	$7, %al
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     andb	$7, %dil
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   andb	$7, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     andb	%sil, %dil
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   andb	%sil, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   andb	(%rax), %dil
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     andw	$511, %ax
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     andw	$511, %di
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   andw	$511, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     andw	$7, %di
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   andw	$7, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     andw	%si, %di
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   andw	%si, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   andw	(%rax), %di
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     andl	$665536, %eax
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     andl	$665536, %edi
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   andl	$665536, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     andl	$7, %edi
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   andl	$7, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     andl	%esi, %edi
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   andl	%esi, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   andl	(%rax), %edi
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     andq	$665536, %rax
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     andq	$665536, %rdi
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   andq	$665536, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     andq	$7, %rdi
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   andq	$7, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     andq	%rsi, %rdi
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   andq	%rsi, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   andq	(%rax), %rdi
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     bsfw	%si, %di
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     bsrw	%si, %di
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   bsfw	(%rax), %di
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   bsrw	(%rax), %di
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     bsfl	%esi, %edi
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     bsrl	%esi, %edi
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   bsfl	(%rax), %edi
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   bsrl	(%rax), %edi
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     bsfq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     bsrq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   bsfq	(%rax), %rdi
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   bsrq	(%rax), %rdi
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     bswapl	%eax
+# CHECK-NEXT:  -      -     0.50   1.00    -     0.50    -      -     bswapq	%rax
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btw	%si, %di
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btcw	%si, %di
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btrw	%si, %di
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btsw	%si, %di
+# CHECK-NEXT:  -      -     0.83   0.33   1.00   1.83   1.00   1.00   btw	%si, (%rax)
+# CHECK-NEXT:  -      -     0.83   0.33   1.00   1.83   1.00   1.00   btcw	%si, (%rax)
+# CHECK-NEXT:  -      -     0.83   0.33   1.00   1.83   1.00   1.00   btrw	%si, (%rax)
+# CHECK-NEXT:  -      -     0.83   0.33   1.00   1.83   1.00   1.00   btsw	%si, (%rax)
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btw	$7, %di
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btcw	$7, %di
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btrw	$7, %di
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btsw	$7, %di
+# CHECK-NEXT:  -      -     0.50    -      -     0.50   0.50   0.50   btw	$7, (%rax)
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   btcw	$7, (%rax)
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   btrw	$7, (%rax)
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   btsw	$7, (%rax)
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btl	%esi, %edi
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btcl	%esi, %edi
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btrl	%esi, %edi
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btsl	%esi, %edi
+# CHECK-NEXT:  -      -     0.83   0.33   1.00   1.83   1.00   1.00   btl	%esi, (%rax)
+# CHECK-NEXT:  -      -     0.83   0.33   1.00   1.83   1.00   1.00   btcl	%esi, (%rax)
+# CHECK-NEXT:  -      -     0.83   0.33   1.00   1.83   1.00   1.00   btrl	%esi, (%rax)
+# CHECK-NEXT:  -      -     0.83   0.33   1.00   1.83   1.00   1.00   btsl	%esi, (%rax)
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btl	$7, %edi
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btcl	$7, %edi
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btrl	$7, %edi
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btsl	$7, %edi
+# CHECK-NEXT:  -      -     0.50    -      -     0.50   0.50   0.50   btl	$7, (%rax)
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   btcl	$7, (%rax)
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   btrl	$7, (%rax)
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   btsl	$7, (%rax)
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btq	%rsi, %rdi
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btcq	%rsi, %rdi
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btrq	%rsi, %rdi
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btsq	%rsi, %rdi
+# CHECK-NEXT:  -      -     0.83   0.33   1.00   1.83   1.00   1.00   btq	%rsi, (%rax)
+# CHECK-NEXT:  -      -     0.83   0.33   1.00   1.83   1.00   1.00   btcq	%rsi, (%rax)
+# CHECK-NEXT:  -      -     0.83   0.33   1.00   1.83   1.00   1.00   btrq	%rsi, (%rax)
+# CHECK-NEXT:  -      -     0.83   0.33   1.00   1.83   1.00   1.00   btsq	%rsi, (%rax)
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btq	$7, %rdi
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btcq	$7, %rdi
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btrq	$7, %rdi
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btsq	$7, %rdi
+# CHECK-NEXT:  -      -     0.50    -      -     0.50   0.50   0.50   btq	$7, (%rax)
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   btcq	$7, (%rax)
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   btrq	$7, (%rax)
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   btsq	$7, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     cbtw
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     cwtl
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     cltq
+# CHECK-NEXT:  -      -     1.33   0.33    -     0.33    -      -     cwtd
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     cltd
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     cqto
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     clc
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     cld
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     cmc
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     cmpb	$7, %al
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     cmpb	$7, %dil
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   cmpb	$7, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     cmpb	%sil, %dil
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   cmpb	%sil, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   cmpb	(%rax), %dil
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     cmpw	$511, %ax
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     cmpw	$511, %di
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   cmpw	$511, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     cmpw	$7, %di
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   cmpw	$7, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     cmpw	%si, %di
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   cmpw	%si, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   cmpw	(%rax), %di
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     cmpl	$665536, %eax
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     cmpl	$665536, %edi
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   cmpl	$665536, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     cmpl	$7, %edi
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   cmpl	$7, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     cmpl	%esi, %edi
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   cmpl	%esi, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   cmpl	(%rax), %edi
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     cmpq	$665536, %rax
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     cmpq	$665536, %rdi
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   cmpq	$665536, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     cmpq	$7, %rdi
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   cmpq	$7, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     cmpq	%rsi, %rdi
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   cmpq	%rsi, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   cmpq	(%rax), %rdi
+# CHECK-NEXT:  -      -     1.00   1.00    -     1.00   1.00   1.00   cmpsb	%es:(%rdi), (%rsi)
+# CHECK-NEXT:  -      -     1.00   1.00    -     1.00   1.00   1.00   cmpsw	%es:(%rdi), (%rsi)
+# CHECK-NEXT:  -      -     1.00   1.00    -     1.00   1.00   1.00   cmpsl	%es:(%rdi), (%rsi)
+# CHECK-NEXT:  -      -     1.00   1.00    -     1.00   1.00   1.00   cmpsq	%es:(%rdi), (%rsi)
+# CHECK-NEXT:  -      -     1.50   1.00    -     1.50    -      -     cmpxchgb	%cl, %bl
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   2.33   1.00   1.00   cmpxchgb	%cl, (%rbx)
+# CHECK-NEXT:  -      -     1.50   1.00    -     1.50    -      -     cmpxchgw	%cx, %bx
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   2.33   1.00   1.00   cmpxchgw	%cx, (%rbx)
+# CHECK-NEXT:  -      -     1.50   1.00    -     1.50    -      -     cmpxchgl	%ecx, %ebx
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   2.33   1.00   1.00   cmpxchgl	%ecx, (%rbx)
+# CHECK-NEXT:  -      -     1.50   1.00    -     1.50    -      -     cmpxchgq	%rcx, %rbx
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   2.33   1.00   1.00   cmpxchgq	%rcx, (%rbx)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     cpuid
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     decb	%dil
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   decb	(%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     decw	%di
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   decw	(%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     decl	%edi
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   decl	(%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     decq	%rdi
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   decq	(%rax)
+# CHECK-NEXT: 10.00   -     1.00    -      -      -      -      -     divb	%dil
+# CHECK-NEXT: 10.00   -     1.00    -      -      -     0.50   0.50   divb	(%rax)
+# CHECK-NEXT: 10.00   -     1.00    -      -      -      -      -     divw	%si
+# CHECK-NEXT: 10.00   -     1.00    -      -      -     0.50   0.50   divw	(%rax)
+# CHECK-NEXT: 10.00   -     1.00    -      -      -      -      -     divl	%edx
+# CHECK-NEXT: 10.00   -     1.00    -      -      -     0.50   0.50   divl	(%rax)
+# CHECK-NEXT: 10.00   -     1.00    -      -      -      -      -     divq	%rcx
+# CHECK-NEXT: 10.00   -     1.00    -      -      -     0.50   0.50   divq	(%rax)
+# CHECK-NEXT: 10.00   -     1.00    -      -      -      -      -     idivb	%dil
+# CHECK-NEXT: 10.00   -     1.00    -      -      -     0.50   0.50   idivb	(%rax)
+# CHECK-NEXT: 10.00   -     1.00    -      -      -      -      -     idivw	%si
+# CHECK-NEXT: 10.00   -     1.00    -      -      -     0.50   0.50   idivw	(%rax)
+# CHECK-NEXT: 10.00   -     1.00    -      -      -      -      -     idivl	%edx
+# CHECK-NEXT: 10.00   -     1.00    -      -      -     0.50   0.50   idivl	(%rax)
+# CHECK-NEXT: 10.00   -     1.00    -      -      -      -      -     idivq	%rcx
+# CHECK-NEXT: 10.00   -     1.00    -      -      -     0.50   0.50   idivq	(%rax)
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     imulb	%dil
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   imulb	(%rax)
+# CHECK-NEXT:  -      -     1.17   1.67    -     1.17    -      -     imulw	%di
+# CHECK-NEXT:  -      -     1.17   1.67    -     1.17   0.50   0.50   imulw	(%rax)
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     imulw	%si, %di
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   imulw	(%rax), %di
+# CHECK-NEXT:  -      -     0.33   1.33    -     0.33    -      -     imulw	$511, %si, %di
+# CHECK-NEXT:  -      -     0.33   1.33    -     0.33   0.50   0.50   imulw	$511, (%rax), %di
+# CHECK-NEXT:  -      -     0.33   1.33    -     0.33    -      -     imulw	$7, %si, %di
+# CHECK-NEXT:  -      -     0.33   1.33    -     0.33   0.50   0.50   imulw	$7, (%rax), %di
+# CHECK-NEXT:  -      -     0.83   1.33    -     0.83    -      -     imull	%edi
+# CHECK-NEXT:  -      -     0.83   1.33    -     0.83   0.50   0.50   imull	(%rax)
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     imull	%esi, %edi
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   imull	(%rax), %edi
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     imull	$665536, %esi, %edi
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   imull	$665536, (%rax), %edi
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     imull	$7, %esi, %edi
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   imull	$7, (%rax), %edi
+# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     imulq	%rdi
+# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   imulq	(%rax)
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     imulq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   imulq	(%rax), %rdi
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     imulq	$665536, %rsi, %rdi
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   imulq	$665536, (%rax), %rdi
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     imulq	$7, %rsi, %rdi
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   imulq	$7, (%rax), %rdi
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     inb	$7, %al
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     inb	%dx, %al
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     inw	$7, %ax
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     inw	%dx, %ax
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     inl	$7, %eax
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     inl	%dx, %eax
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     incb	%dil
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   incb	(%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     incw	%di
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   incw	(%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     incl	%edi
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   incl	(%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     incq	%rdi
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   incq	(%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     insb	%dx, %es:(%rdi)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     insw	%dx, %es:(%rdi)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     insl	%dx, %es:(%rdi)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     int	$7
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     lahf
+# CHECK-NEXT:  -      -     0.67   0.67    -     0.67   0.50   0.50   lodsb	(%rsi), %al
+# CHECK-NEXT:  -      -     0.67   0.67    -     0.67   0.50   0.50   lodsw	(%rsi), %ax
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   lodsl	(%rsi), %eax
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   lodsq	(%rsi), %rax
+# CHECK-NEXT:  -      -     0.67   0.67   1.00   0.67   1.00   1.00   movsb	(%rsi), %es:(%rdi)
+# CHECK-NEXT:  -      -     0.67   0.67   1.00   0.67   1.00   1.00   movsw	(%rsi), %es:(%rdi)
+# CHECK-NEXT:  -      -     0.67   0.67   1.00   0.67   1.00   1.00   movsl	(%rsi), %es:(%rdi)
+# CHECK-NEXT:  -      -     0.67   0.67   1.00   0.67   1.00   1.00   movsq	(%rsi), %es:(%rdi)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movsbw	%al, %di
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movzbw	%al, %di
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movsbw	(%rax), %di
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movzbw	(%rax), %di
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movsbl	%al, %edi
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movzbl	%al, %edi
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movsbl	(%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movzbl	(%rax), %edi
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movsbq	%al, %rdi
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movzbq	%al, %rdi
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movsbq	(%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movzbq	(%rax), %rdi
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movswl	%ax, %edi
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movzwl	%ax, %edi
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movswl	(%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movzwl	(%rax), %edi
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movswq	%ax, %rdi
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movzwq	%ax, %rdi
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movswq	(%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movzwq	(%rax), %rdi
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movslq	%eax, %rdi
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movslq	(%rax), %rdi
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     mulb	%dil
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   mulb	(%rax)
+# CHECK-NEXT:  -      -     1.17   1.67    -     1.17    -      -     mulw	%si
+# CHECK-NEXT:  -      -     1.17   1.67    -     1.17   0.50   0.50   mulw	(%rax)
+# CHECK-NEXT:  -      -     0.83   1.33    -     0.83    -      -     mull	%edx
+# CHECK-NEXT:  -      -     0.83   1.33    -     0.83   0.50   0.50   mull	(%rax)
+# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     mulq	%rcx
+# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   mulq	(%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     negb	%dil
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   negb	(%r8)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     negw	%si
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   negw	(%r9)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     negl	%edx
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   negl	(%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     negq	%rcx
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   negq	(%r10)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     nop
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     nopw	%di
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     nopw	(%rcx)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     nopl	%esi
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     nopl	(%r8)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     nopq	%rdx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     nopq	(%r9)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     notb	%dil
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   notb	(%r8)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     notw	%si
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   notw	(%r9)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     notl	%edx
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   notl	(%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     notq	%rcx
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   notq	(%r10)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     orb	$7, %al
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     orb	$7, %dil
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   orb	$7, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     orb	%sil, %dil
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   orb	%sil, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   orb	(%rax), %dil
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     orw	$511, %ax
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     orw	$511, %di
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   orw	$511, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     orw	$7, %di
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   orw	$7, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     orw	%si, %di
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   orw	%si, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   orw	(%rax), %di
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     orl	$665536, %eax
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     orl	$665536, %edi
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   orl	$665536, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     orl	$7, %edi
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   orl	$7, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     orl	%esi, %edi
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   orl	%esi, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   orl	(%rax), %edi
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     orq	$665536, %rax
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     orq	$665536, %rdi
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   orq	$665536, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     orq	$7, %rdi
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   orq	$7, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     orq	%rsi, %rdi
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   orq	%rsi, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   orq	(%rax), %rdi
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     outb	%al, $7
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     outb	%al, %dx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     outw	%ax, $7
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     outw	%ax, %dx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     outl	%eax, $7
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     outl	%eax, %dx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     outsb	(%rsi), %dx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     outsw	(%rsi), %dx
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     outsl	(%rsi), %dx
+# CHECK-NEXT:  -      -     1.00   1.00    -     2.00    -      -     pause
+# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     rclb	%dil
+# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     rcrb	%dil
+# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rclb	(%rax)
+# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rcrb	(%rax)
+# CHECK-NEXT:  -      -     4.00    -      -     4.00    -      -     rclb	$7, %dil
+# CHECK-NEXT:  -      -     4.00    -      -     4.00    -      -     rcrb	$7, %dil
+# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rclb	$7, (%rax)
+# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rcrb	$7, (%rax)
+# CHECK-NEXT:  -      -     4.00    -      -     4.00    -      -     rclb	%cl, %dil
+# CHECK-NEXT:  -      -     4.00    -      -     4.00    -      -     rcrb	%cl, %dil
+# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rclb	%cl, (%rax)
+# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rcrb	%cl, (%rax)
+# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     rclw	%di
+# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     rcrw	%di
+# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rclw	(%rax)
+# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rcrw	(%rax)
+# CHECK-NEXT:  -      -     4.00    -      -     4.00    -      -     rclw	$7, %di
+# CHECK-NEXT:  -      -     4.00    -      -     4.00    -      -     rcrw	$7, %di
+# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rclw	$7, (%rax)
+# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rcrw	$7, (%rax)
+# CHECK-NEXT:  -      -     4.00    -      -     4.00    -      -     rclw	%cl, %di
+# CHECK-NEXT:  -      -     4.00    -      -     4.00    -      -     rcrw	%cl, %di
+# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rclw	%cl, (%rax)
+# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rcrw	%cl, (%rax)
+# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     rcll	%edi
+# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     rcrl	%edi
+# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rcll	(%rax)
+# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rcrl	(%rax)
+# CHECK-NEXT:  -      -     4.00    -      -     4.00    -      -     rcll	$7, %edi
+# CHECK-NEXT:  -      -     4.00    -      -     4.00    -      -     rcrl	$7, %edi
+# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rcll	$7, (%rax)
+# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rcrl	$7, (%rax)
+# CHECK-NEXT:  -      -     4.00    -      -     4.00    -      -     rcll	%cl, %edi
+# CHECK-NEXT:  -      -     4.00    -      -     4.00    -      -     rcrl	%cl, %edi
+# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rcll	%cl, (%rax)
+# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rcrl	%cl, (%rax)
+# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     rclq	%rdi
+# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     rcrq	%rdi
+# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rclq	(%rax)
+# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rcrq	(%rax)
+# CHECK-NEXT:  -      -     4.00    -      -     4.00    -      -     rclq	$7, %rdi
+# CHECK-NEXT:  -      -     4.00    -      -     4.00    -      -     rcrq	$7, %rdi
+# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rclq	$7, (%rax)
+# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rcrq	$7, (%rax)
+# CHECK-NEXT:  -      -     4.00    -      -     4.00    -      -     rclq	%cl, %rdi
+# CHECK-NEXT:  -      -     4.00    -      -     4.00    -      -     rcrq	%cl, %rdi
+# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rclq	%cl, (%rax)
+# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rcrq	%cl, (%rax)
+# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     rolb	%dil
+# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     rorb	%dil
+# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   1.00   1.00   rolb	(%rax)
+# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   1.00   1.00   rorb	(%rax)
+# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     rolb	$7, %dil
+# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     rorb	$7, %dil
+# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   1.00   1.00   rolb	$7, (%rax)
+# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   1.00   1.00   rorb	$7, (%rax)
+# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     rolb	%cl, %dil
+# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     rorb	%cl, %dil
+# CHECK-NEXT:  -      -     1.50    -     1.00   1.50   1.00   1.00   rolb	%cl, (%rax)
+# CHECK-NEXT:  -      -     1.50    -     1.00   1.50   1.00   1.00   rorb	%cl, (%rax)
+# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     rolw	%di
+# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     rorw	%di
+# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   1.00   1.00   rolw	(%rax)
+# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   1.00   1.00   rorw	(%rax)
+# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     rolw	$7, %di
+# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     rorw	$7, %di
+# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   1.00   1.00   rolw	$7, (%rax)
+# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   1.00   1.00   rorw	$7, (%rax)
+# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     rolw	%cl, %di
+# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     rorw	%cl, %di
+# CHECK-NEXT:  -      -     1.50    -     1.00   1.50   1.00   1.00   rolw	%cl, (%rax)
+# CHECK-NEXT:  -      -     1.50    -     1.00   1.50   1.00   1.00   rorw	%cl, (%rax)
+# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     roll	%edi
+# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     rorl	%edi
+# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   1.00   1.00   roll	(%rax)
+# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   1.00   1.00   rorl	(%rax)
+# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     roll	$7, %edi
+# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     rorl	$7, %edi
+# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   1.00   1.00   roll	$7, (%rax)
+# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   1.00   1.00   rorl	$7, (%rax)
+# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     roll	%cl, %edi
+# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     rorl	%cl, %edi
+# CHECK-NEXT:  -      -     1.50    -     1.00   1.50   1.00   1.00   roll	%cl, (%rax)
+# CHECK-NEXT:  -      -     1.50    -     1.00   1.50   1.00   1.00   rorl	%cl, (%rax)
+# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     rolq	%rdi
+# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     rorq	%rdi
+# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   1.00   1.00   rolq	(%rax)
+# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   1.00   1.00   rorq	(%rax)
+# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     rolq	$7, %rdi
+# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     rorq	$7, %rdi
+# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   1.00   1.00   rolq	$7, (%rax)
+# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   1.00   1.00   rorq	$7, (%rax)
+# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     rolq	%cl, %rdi
+# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     rorq	%cl, %rdi
+# CHECK-NEXT:  -      -     1.50    -     1.00   1.50   1.00   1.00   rolq	%cl, (%rax)
+# CHECK-NEXT:  -      -     1.50    -     1.00   1.50   1.00   1.00   rorq	%cl, (%rax)
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     sahf
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     sarb	%dil
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     shlb	%dil
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     shrb	%dil
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   sarb	(%rax)
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   shlb	(%rax)
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   shrb	(%rax)
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     sarb	$7, %dil
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     shlb	$7, %dil
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     shrb	$7, %dil
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   sarb	$7, (%rax)
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   shlb	$7, (%rax)
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   shrb	$7, (%rax)
+# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     sarb	%cl, %dil
+# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     shlb	%cl, %dil
+# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     shrb	%cl, %dil
+# CHECK-NEXT:  -      -     1.50    -     1.00   1.50   1.00   1.00   sarb	%cl, (%rax)
+# CHECK-NEXT:  -      -     1.50    -     1.00   1.50   1.00   1.00   shlb	%cl, (%rax)
+# CHECK-NEXT:  -      -     1.50    -     1.00   1.50   1.00   1.00   shrb	%cl, (%rax)
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     sarw	%di
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     shlw	%di
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     shrw	%di
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   sarw	(%rax)
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   shlw	(%rax)
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   shrw	(%rax)
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     sarw	$7, %di
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     shlw	$7, %di
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     shrw	$7, %di
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   sarw	$7, (%rax)
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   shlw	$7, (%rax)
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   shrw	$7, (%rax)
+# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     sarw	%cl, %di
+# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     shlw	%cl, %di
+# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     shrw	%cl, %di
+# CHECK-NEXT:  -      -     1.50    -     1.00   1.50   1.00   1.00   sarw	%cl, (%rax)
+# CHECK-NEXT:  -      -     1.50    -     1.00   1.50   1.00   1.00   shlw	%cl, (%rax)
+# CHECK-NEXT:  -      -     1.50    -     1.00   1.50   1.00   1.00   shrw	%cl, (%rax)
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     sarl	%edi
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     shll	%edi
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     shrl	%edi
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   sarl	(%rax)
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   shll	(%rax)
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   shrl	(%rax)
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     sarl	$7, %edi
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     shll	$7, %edi
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     shrl	$7, %edi
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   sarl	$7, (%rax)
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   shll	$7, (%rax)
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   shrl	$7, (%rax)
+# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     sarl	%cl, %edi
+# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     shll	%cl, %edi
+# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     shrl	%cl, %edi
+# CHECK-NEXT:  -      -     1.50    -     1.00   1.50   1.00   1.00   sarl	%cl, (%rax)
+# CHECK-NEXT:  -      -     1.50    -     1.00   1.50   1.00   1.00   shll	%cl, (%rax)
+# CHECK-NEXT:  -      -     1.50    -     1.00   1.50   1.00   1.00   shrl	%cl, (%rax)
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     sarq	%rdi
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     shlq	%rdi
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     shrq	%rdi
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   sarq	(%rax)
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   shlq	(%rax)
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   shrq	(%rax)
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     sarq	$7, %rdi
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     shlq	$7, %rdi
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     shrq	$7, %rdi
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   sarq	$7, (%rax)
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   shlq	$7, (%rax)
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   shrq	$7, (%rax)
+# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     sarq	%cl, %rdi
+# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     shlq	%cl, %rdi
+# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     shrq	%cl, %rdi
+# CHECK-NEXT:  -      -     1.50    -     1.00   1.50   1.00   1.00   sarq	%cl, (%rax)
+# CHECK-NEXT:  -      -     1.50    -     1.00   1.50   1.00   1.00   shlq	%cl, (%rax)
+# CHECK-NEXT:  -      -     1.50    -     1.00   1.50   1.00   1.00   shrq	%cl, (%rax)
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     sbbb	$7, %al
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     sbbb	$7, %dil
+# CHECK-NEXT:  -      -     1.00   1.00   1.00   1.00   1.00   1.00   sbbb	$7, (%rax)
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     sbbb	%sil, %dil
+# CHECK-NEXT:  -      -     1.33   0.33   1.00   1.33   1.00   1.00   sbbb	%sil, (%rax)
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   sbbb	(%rax), %dil
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     sbbw	$511, %ax
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     sbbw	$511, %di
+# CHECK-NEXT:  -      -     1.00   1.00   1.00   1.00   1.00   1.00   sbbw	$511, (%rax)
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     sbbw	$7, %di
+# CHECK-NEXT:  -      -     1.00   1.00   1.00   1.00   1.00   1.00   sbbw	$7, (%rax)
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     sbbw	%si, %di
+# CHECK-NEXT:  -      -     1.33   0.33   1.00   1.33   1.00   1.00   sbbw	%si, (%rax)
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   sbbw	(%rax), %di
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     sbbl	$665536, %eax
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     sbbl	$665536, %edi
+# CHECK-NEXT:  -      -     1.00   1.00   1.00   1.00   1.00   1.00   sbbl	$665536, (%rax)
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     sbbl	$7, %edi
+# CHECK-NEXT:  -      -     1.00   1.00   1.00   1.00   1.00   1.00   sbbl	$7, (%rax)
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     sbbl	%esi, %edi
+# CHECK-NEXT:  -      -     1.33   0.33   1.00   1.33   1.00   1.00   sbbl	%esi, (%rax)
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   sbbl	(%rax), %edi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     sbbq	$665536, %rax
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     sbbq	$665536, %rdi
+# CHECK-NEXT:  -      -     1.00   1.00   1.00   1.00   1.00   1.00   sbbq	$665536, (%rax)
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     sbbq	$7, %rdi
+# CHECK-NEXT:  -      -     1.00   1.00   1.00   1.00   1.00   1.00   sbbq	$7, (%rax)
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     sbbq	%rsi, %rdi
+# CHECK-NEXT:  -      -     1.33   0.33   1.00   1.33   1.00   1.00   sbbq	%rsi, (%rax)
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   sbbq	(%rax), %rdi
+# CHECK-NEXT:  -      -     0.67   0.67    -     0.67    -      -     scasb	%es:(%rdi), %al
+# CHECK-NEXT:  -      -     0.67   0.67    -     0.67    -      -     scasw	%es:(%rdi), %ax
+# CHECK-NEXT:  -      -     0.67   0.67    -     0.67    -      -     scasl	%es:(%rdi), %eax
+# CHECK-NEXT:  -      -     0.67   0.67    -     0.67    -      -     scasq	%es:(%rdi), %rax
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     seto	%al
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   0.50   0.50   seto	(%rax)
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     setno	%al
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   0.50   0.50   setno	(%rax)
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     setb	%al
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   0.50   0.50   setb	(%rax)
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     setae	%al
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   0.50   0.50   setae	(%rax)
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     sete	%al
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   0.50   0.50   sete	(%rax)
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     setne	%al
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   0.50   0.50   setne	(%rax)
+# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     seta	%al
+# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   0.50   0.50   seta	(%rax)
+# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     setbe	%al
+# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   0.50   0.50   setbe	(%rax)
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     sets	%al
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   0.50   0.50   sets	(%rax)
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     setns	%al
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   0.50   0.50   setns	(%rax)
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     setp	%al
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   0.50   0.50   setp	(%rax)
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     setnp	%al
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   0.50   0.50   setnp	(%rax)
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     setl	%al
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   0.50   0.50   setl	(%rax)
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     setge	%al
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   0.50   0.50   setge	(%rax)
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     setg	%al
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   0.50   0.50   setg	(%rax)
+# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     setle	%al
+# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   0.50   0.50   setle	(%rax)
+# CHECK-NEXT:  -      -     1.83   0.33    -     1.83    -      -     shldw	%cl, %si, %di
+# CHECK-NEXT:  -      -     1.83   0.33    -     1.83    -      -     shrdw	%cl, %si, %di
+# CHECK-NEXT:  -      -     1.83   0.33   1.00   1.83   1.00   1.00   shldw	%cl, %si, (%rax)
+# CHECK-NEXT:  -      -     1.83   0.33   1.00   1.83   1.00   1.00   shrdw	%cl, %si, (%rax)
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     shldw	$7, %si, %di
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     shrdw	$7, %si, %di
+# CHECK-NEXT:  -      -     0.83   0.33   1.00   0.83   1.00   1.00   shldw	$7, %si, (%rax)
+# CHECK-NEXT:  -      -     0.83   0.33   1.00   0.83   1.00   1.00   shrdw	$7, %si, (%rax)
+# CHECK-NEXT:  -      -     1.83   0.33    -     1.83    -      -     shldl	%cl, %esi, %edi
+# CHECK-NEXT:  -      -     1.83   0.33    -     1.83    -      -     shrdl	%cl, %esi, %edi
+# CHECK-NEXT:  -      -     1.83   0.33   1.00   1.83   1.00   1.00   shldl	%cl, %esi, (%rax)
+# CHECK-NEXT:  -      -     1.83   0.33   1.00   1.83   1.00   1.00   shrdl	%cl, %esi, (%rax)
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     shldl	$7, %esi, %edi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     shrdl	$7, %esi, %edi
+# CHECK-NEXT:  -      -     0.83   0.33   1.00   0.83   1.00   1.00   shldl	$7, %esi, (%rax)
+# CHECK-NEXT:  -      -     0.83   0.33   1.00   0.83   1.00   1.00   shrdl	$7, %esi, (%rax)
+# CHECK-NEXT:  -      -     1.83   0.33    -     1.83    -      -     shldq	%cl, %rsi, %rdi
+# CHECK-NEXT:  -      -     1.83   0.33    -     1.83    -      -     shrdq	%cl, %rsi, %rdi
+# CHECK-NEXT:  -      -     1.83   0.33   1.00   1.83   1.00   1.00   shldq	%cl, %rsi, (%rax)
+# CHECK-NEXT:  -      -     1.83   0.33   1.00   1.83   1.00   1.00   shrdq	%cl, %rsi, (%rax)
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     shldq	$7, %rsi, %rdi
+# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     shrdq	$7, %rsi, %rdi
+# CHECK-NEXT:  -      -     0.83   0.33   1.00   0.83   1.00   1.00   shldq	$7, %rsi, (%rax)
+# CHECK-NEXT:  -      -     0.83   0.33   1.00   0.83   1.00   1.00   shrdq	$7, %rsi, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     stc
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     std
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   0.50   0.50   stosb	%al, %es:(%rdi)
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   0.50   0.50   stosw	%ax, %es:(%rdi)
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   0.50   0.50   stosl	%eax, %es:(%rdi)
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   0.50   0.50   stosq	%rax, %es:(%rdi)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     subb	$7, %al
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     subb	$7, %dil
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   subb	$7, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     subb	%sil, %dil
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   subb	%sil, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   subb	(%rax), %dil
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     subw	$511, %ax
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     subw	$511, %di
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   subw	$511, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     subw	$7, %di
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   subw	$7, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     subw	%si, %di
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   subw	%si, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   subw	(%rax), %di
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     subl	$665536, %eax
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     subl	$665536, %edi
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   subl	$665536, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     subl	$7, %edi
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   subl	$7, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     subl	%esi, %edi
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   subl	%esi, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   subl	(%rax), %edi
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     subq	$665536, %rax
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     subq	$665536, %rdi
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   subq	$665536, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     subq	$7, %rdi
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   subq	$7, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     subq	%rsi, %rdi
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   subq	%rsi, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   subq	(%rax), %rdi
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     testb	$7, %al
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     testb	$7, %dil
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   testb	$7, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     testb	%sil, %dil
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   testb	%sil, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     testw	$511, %ax
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     testw	$511, %di
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   testw	$511, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     testw	$7, %di
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   testw	$7, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     testw	%si, %di
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   testw	%si, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     testl	$665536, %eax
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     testl	$665536, %edi
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   testl	$665536, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     testl	$7, %edi
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   testl	$7, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     testl	%esi, %edi
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   testl	%esi, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     testq	$665536, %rax
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     testq	$665536, %rdi
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   testq	$665536, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     testq	$7, %rdi
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   testq	$7, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     testq	%rsi, %rdi
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   testq	%rsi, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     ud2
+# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -     xaddb	%bl, %cl
+# CHECK-NEXT:  -      -     0.67   0.67   1.00   0.67   1.00   1.00   xaddb	%bl, (%rcx)
+# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -     xaddw	%bx, %cx
+# CHECK-NEXT:  -      -     0.67   0.67   1.00   0.67   1.00   1.00   xaddw	%ax, (%rbx)
+# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -     xaddl	%ebx, %ecx
+# CHECK-NEXT:  -      -     0.67   0.67   1.00   0.67   1.00   1.00   xaddl	%eax, (%rbx)
+# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -     xaddq	%rbx, %rcx
+# CHECK-NEXT:  -      -     0.67   0.67   1.00   0.67   1.00   1.00   xaddq	%rax, (%rbx)
+# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -     xchgb	%bl, %cl
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   xchgb	%bl, (%rbx)
+# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -     xchgw	%bx, %ax
+# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -     xchgw	%bx, %cx
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   xchgw	%ax, (%rbx)
+# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -     xchgl	%ebx, %eax
+# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -     xchgl	%ebx, %ecx
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   xchgl	%eax, (%rbx)
+# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -     xchgq	%rbx, %rax
+# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -     xchgq	%rbx, %rcx
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   xchgq	%rax, (%rbx)
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   xlatb
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     xorb	$7, %al
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     xorb	$7, %dil
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   xorb	$7, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     xorb	%sil, %dil
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   xorb	%sil, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   xorb	(%rax), %dil
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     xorw	$511, %ax
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     xorw	$511, %di
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   xorw	$511, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     xorw	$7, %di
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   xorw	$7, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     xorw	%si, %di
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   xorw	%si, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   xorw	(%rax), %di
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     xorl	$665536, %eax
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     xorl	$665536, %edi
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   xorl	$665536, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     xorl	$7, %edi
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   xorl	$7, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     xorl	%esi, %edi
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   xorl	%esi, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   xorl	(%rax), %edi
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     xorq	$665536, %rax
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     xorq	$665536, %rdi
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   xorq	$665536, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     xorq	$7, %rdi
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   xorq	$7, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     xorq	%rsi, %rdi
+# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   xorq	%rsi, (%rax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   xorq	(%rax), %rdi
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-x87.s b/test/tools/llvm-mca/X86/BdVer2/resources-x87.s
new file mode 100644
index 00000000000..1cba9a7d77f
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-x87.s
@@ -0,0 +1,521 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+
+f2xm1
+
+fabs
+
+fadd %st(0), %st(1)
+fadd %st(2)
+fadds (%ecx)
+faddl (%ecx)
+faddp %st(1)
+faddp %st(2)
+fiadds (%ecx)
+fiaddl (%ecx)
+
+fbld (%ecx)
+fbstp (%eax)
+
+fchs
+
+fnclex
+
+fcmovb %st(1), %st(0)
+fcmovbe %st(1), %st(0)
+fcmove %st(1), %st(0)
+fcmovnb %st(1), %st(0)
+fcmovnbe %st(1), %st(0)
+fcmovne %st(1), %st(0)
+fcmovnu %st(1), %st(0)
+fcmovu %st(1), %st(0)
+
+fcom %st(1)
+fcom %st(3)
+fcoms (%ecx)
+fcoml (%eax)
+fcomp %st(1)
+fcomp %st(3)
+fcomps (%ecx)
+fcompl (%eax)
+fcompp
+
+fcomi %st(3)
+fcompi %st(3)
+
+fcos
+
+fdecstp
+
+fdiv %st(0), %st(1)
+fdiv %st(2)
+fdivs (%ecx)
+fdivl (%eax)
+fdivp %st(1)
+fdivp %st(2)
+fidivs (%ecx)
+fidivl (%eax)
+
+fdivr %st(0), %st(1)
+fdivr %st(2)
+fdivrs (%ecx)
+fdivrl (%eax)
+fdivrp %st(1)
+fdivrp %st(2)
+fidivrs (%ecx)
+fidivrl (%eax)
+
+ffree %st(0)
+
+ficoms (%ecx)
+ficoml (%eax)
+ficomps (%ecx)
+ficompl (%eax)
+
+filds (%edx)
+fildl (%ecx)
+fildll (%eax)
+
+fincstp
+
+fninit
+
+fists (%edx)
+fistl (%ecx)
+fistps (%edx)
+fistpl (%ecx)
+fistpll (%eax)
+
+fisttps (%edx)
+fisttpl (%ecx)
+fisttpll (%eax)
+
+fld %st(0)
+flds (%edx)
+fldl (%ecx)
+fldt (%eax)
+
+fldcw (%eax)
+fldenv (%eax)
+
+fld1
+fldl2e
+fldl2t
+fldlg2
+fldln2
+fldpi
+fldz
+
+fmul %st(0), %st(1)
+fmul %st(2)
+fmuls (%ecx)
+fmull (%eax)
+fmulp %st(1)
+fmulp %st(2)
+fimuls (%ecx)
+fimull (%eax)
+
+fnop
+
+fpatan
+
+fprem
+fprem1
+
+fptan
+
+frndint
+
+frstor (%eax)
+
+fnsave (%eax)
+
+fscale
+
+fsin
+
+fsincos
+
+fsqrt
+
+fst %st(0)
+fsts (%edx)
+fstl (%ecx)
+fstp %st(0)
+fstpl (%edx)
+fstpl (%ecx)
+fstpt (%eax)
+
+fnstcw (%eax)
+fnstenv (%eax)
+fnstsw (%eax)
+
+frstor (%eax)
+fsave (%eax)
+
+fsub %st(0), %st(1)
+fsub %st(2)
+fsubs (%ecx)
+fsubl (%eax)
+fsubp %st(1)
+fsubp %st(2)
+fisubs (%ecx)
+fisubl (%eax)
+
+fsubr %st(0), %st(1)
+fsubr %st(2)
+fsubrs (%ecx)
+fsubrl (%eax)
+fsubrp %st(1)
+fsubrp %st(2)
+fisubrs (%ecx)
+fisubrl (%eax)
+
+ftst
+
+fucom %st(1)
+fucom %st(3)
+fucomp %st(1)
+fucomp %st(3)
+fucompp
+
+fucomi %st(3)
+fucompi %st(3)
+
+fwait
+
+fxam
+
+fxch %st(1)
+fxch %st(3)
+
+fxrstor (%eax)
+fxsave (%eax)
+
+fxtract
+
+fyl2x
+fyl2xp1
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      100   0.33                  U     f2xm1
+# CHECK-NEXT:  1      1     1.00                  U     fabs
+# CHECK-NEXT:  1      3     1.00                  U     fadd	%st(0), %st(1)
+# CHECK-NEXT:  1      3     1.00                  U     fadd	%st(2)
+# CHECK-NEXT:  2      10    1.00    *             U     fadds	(%ecx)
+# CHECK-NEXT:  2      10    1.00    *             U     faddl	(%ecx)
+# CHECK-NEXT:  1      3     1.00                  U     faddp	%st(1)
+# CHECK-NEXT:  1      3     1.00                  U     faddp	%st(2)
+# CHECK-NEXT:  3      13    2.00    *             U     fiadds	(%ecx)
+# CHECK-NEXT:  3      13    2.00    *             U     fiaddl	(%ecx)
+# CHECK-NEXT:  1      100   0.33                  U     fbld	(%ecx)
+# CHECK-NEXT:  1      100   0.33                  U     fbstp	(%eax)
+# CHECK-NEXT:  1      1     1.00                  U     fchs
+# CHECK-NEXT:  1      100   0.33                  U     fnclex
+# CHECK-NEXT:  3      3     2.00                  U     fcmovb	%st(1), %st(0)
+# CHECK-NEXT:  3      3     2.00                  U     fcmovbe	%st(1), %st(0)
+# CHECK-NEXT:  3      3     2.00                  U     fcmove	%st(1), %st(0)
+# CHECK-NEXT:  3      3     2.00                  U     fcmovnb	%st(1), %st(0)
+# CHECK-NEXT:  3      3     2.00                  U     fcmovnbe	%st(1), %st(0)
+# CHECK-NEXT:  3      3     2.00                  U     fcmovne	%st(1), %st(0)
+# CHECK-NEXT:  3      3     2.00                  U     fcmovnu	%st(1), %st(0)
+# CHECK-NEXT:  3      3     2.00                  U     fcmovu	%st(1), %st(0)
+# CHECK-NEXT:  1      1     1.00                  U     fcom	%st(1)
+# CHECK-NEXT:  1      1     1.00                  U     fcom	%st(3)
+# CHECK-NEXT:  2      8     1.00                  U     fcoms	(%ecx)
+# CHECK-NEXT:  2      8     1.00                  U     fcoml	(%eax)
+# CHECK-NEXT:  1      1     1.00                  U     fcomp	%st(1)
+# CHECK-NEXT:  1      1     1.00                  U     fcomp	%st(3)
+# CHECK-NEXT:  2      8     1.00                  U     fcomps	(%ecx)
+# CHECK-NEXT:  2      8     1.00                  U     fcompl	(%eax)
+# CHECK-NEXT:  1      100   0.33                  U     fcompp
+# CHECK-NEXT:  3      3     1.00                  U     fcomi	%st(3)
+# CHECK-NEXT:  3      3     1.00                  U     fcompi	%st(3)
+# CHECK-NEXT:  1      100   0.33                  U     fcos
+# CHECK-NEXT:  1      1     1.00                  U     fdecstp
+# CHECK-NEXT:  1      14    14.00                 U     fdiv	%st(0), %st(1)
+# CHECK-NEXT:  1      14    14.00                 U     fdiv	%st(2)
+# CHECK-NEXT:  2      31    1.00    *             U     fdivs	(%ecx)
+# CHECK-NEXT:  2      31    1.00    *             U     fdivl	(%eax)
+# CHECK-NEXT:  1      14    14.00                 U     fdivp	%st(1)
+# CHECK-NEXT:  1      14    14.00                 U     fdivp	%st(2)
+# CHECK-NEXT:  3      34    1.00    *             U     fidivs	(%ecx)
+# CHECK-NEXT:  3      34    1.00    *             U     fidivl	(%eax)
+# CHECK-NEXT:  1      14    14.00                 U     fdivr	%st(0), %st(1)
+# CHECK-NEXT:  1      14    14.00                 U     fdivr	%st(2)
+# CHECK-NEXT:  2      31    1.00    *             U     fdivrs	(%ecx)
+# CHECK-NEXT:  2      31    1.00    *             U     fdivrl	(%eax)
+# CHECK-NEXT:  1      14    14.00                 U     fdivrp	%st(1)
+# CHECK-NEXT:  1      14    14.00                 U     fdivrp	%st(2)
+# CHECK-NEXT:  3      34    1.00    *             U     fidivrs	(%ecx)
+# CHECK-NEXT:  3      34    1.00    *             U     fidivrl	(%eax)
+# CHECK-NEXT:  1      1     1.00                  U     ffree	%st(0)
+# CHECK-NEXT:  3      11    2.00                  U     ficoms	(%ecx)
+# CHECK-NEXT:  3      11    2.00                  U     ficoml	(%eax)
+# CHECK-NEXT:  3      11    2.00                  U     ficomps	(%ecx)
+# CHECK-NEXT:  3      11    2.00                  U     ficompl	(%eax)
+# CHECK-NEXT:  2      10    1.00    *             U     filds	(%edx)
+# CHECK-NEXT:  2      10    1.00    *             U     fildl	(%ecx)
+# CHECK-NEXT:  2      10    1.00    *             U     fildll	(%eax)
+# CHECK-NEXT:  1      1     1.00                  U     fincstp
+# CHECK-NEXT:  4      5     1.33                  U     fninit
+# CHECK-NEXT:  4      9     1.00           *      U     fists	(%edx)
+# CHECK-NEXT:  4      9     1.00           *      U     fistl	(%ecx)
+# CHECK-NEXT:  4      9     1.00           *      U     fistps	(%edx)
+# CHECK-NEXT:  4      9     1.00           *      U     fistpl	(%ecx)
+# CHECK-NEXT:  4      9     1.00           *      U     fistpll	(%eax)
+# CHECK-NEXT:  3      5     1.00           *      U     fisttps	(%edx)
+# CHECK-NEXT:  3      5     1.00           *      U     fisttpl	(%ecx)
+# CHECK-NEXT:  3      5     1.00           *      U     fisttpll	(%eax)
+# CHECK-NEXT:  1      1     1.00                  U     fld	%st(0)
+# CHECK-NEXT:  3      9     1.00    *             U     flds	(%edx)
+# CHECK-NEXT:  3      9     1.00    *             U     fldl	(%ecx)
+# CHECK-NEXT:  3      9     1.00    *             U     fldt	(%eax)
+# CHECK-NEXT:  5      8     2.00    *             U     fldcw	(%eax)
+# CHECK-NEXT:  1      100   0.33                  U     fldenv	(%eax)
+# CHECK-NEXT:  2      1     1.00                  U     fld1
+# CHECK-NEXT:  2      1     1.00                  U     fldl2e
+# CHECK-NEXT:  2      1     1.00                  U     fldl2t
+# CHECK-NEXT:  2      1     1.00                  U     fldlg2
+# CHECK-NEXT:  2      1     1.00                  U     fldln2
+# CHECK-NEXT:  2      1     1.00                  U     fldpi
+# CHECK-NEXT:  1      1     1.00                  U     fldz
+# CHECK-NEXT:  1      5     1.00                  U     fmul	%st(0), %st(1)
+# CHECK-NEXT:  1      5     1.00                  U     fmul	%st(2)
+# CHECK-NEXT:  2      12    1.00    *             U     fmuls	(%ecx)
+# CHECK-NEXT:  2      12    1.00    *             U     fmull	(%eax)
+# CHECK-NEXT:  1      5     1.00                  U     fmulp	%st(1)
+# CHECK-NEXT:  1      5     1.00                  U     fmulp	%st(2)
+# CHECK-NEXT:  3      15    1.00    *             U     fimuls	(%ecx)
+# CHECK-NEXT:  3      15    1.00    *             U     fimull	(%eax)
+# CHECK-NEXT:  1      1     1.00                  U     fnop
+# CHECK-NEXT:  1      100   0.33                  U     fpatan
+# CHECK-NEXT:  1      100   0.33                  U     fprem
+# CHECK-NEXT:  1      100   0.33                  U     fprem1
+# CHECK-NEXT:  1      100   0.33                  U     fptan
+# CHECK-NEXT:  1      100   0.33                  U     frndint
+# CHECK-NEXT:  1      100   0.33                  U     frstor	(%eax)
+# CHECK-NEXT:  1      100   0.33                  U     fnsave	(%eax)
+# CHECK-NEXT:  1      100   0.33                  U     fscale
+# CHECK-NEXT:  1      100   0.33                  U     fsin
+# CHECK-NEXT:  1      100   0.33                  U     fsincos
+# CHECK-NEXT:  1      24    24.00                 U     fsqrt
+# CHECK-NEXT:  1      1     1.00                  U     fst	%st(0)
+# CHECK-NEXT:  3      6     1.00           *      U     fsts	(%edx)
+# CHECK-NEXT:  3      6     1.00           *      U     fstl	(%ecx)
+# CHECK-NEXT:  1      1     1.00                  U     fstp	%st(0)
+# CHECK-NEXT:  3      6     1.00           *      U     fstpl	(%edx)
+# CHECK-NEXT:  3      6     1.00           *      U     fstpl	(%ecx)
+# CHECK-NEXT:  3      6     1.00           *      U     fstpt	(%eax)
+# CHECK-NEXT:  4      7     1.00           *      U     fnstcw	(%eax)
+# CHECK-NEXT:  1      100   0.33                  U     fnstenv	(%eax)
+# CHECK-NEXT:  4      7     1.00                  U     fnstsw	(%eax)
+# CHECK-NEXT:  1      100   0.33                  U     frstor	(%eax)
+# CHECK-NEXT:  1      100   0.33                  U     wait
+# CHECK-NEXT:  1      100   0.33                  U     fnsave	(%eax)
+# CHECK-NEXT:  1      3     1.00                  U     fsub	%st(0), %st(1)
+# CHECK-NEXT:  1      3     1.00                  U     fsub	%st(2)
+# CHECK-NEXT:  2      10    1.00    *             U     fsubs	(%ecx)
+# CHECK-NEXT:  2      10    1.00    *             U     fsubl	(%eax)
+# CHECK-NEXT:  1      3     1.00                  U     fsubp	%st(1)
+# CHECK-NEXT:  1      3     1.00                  U     fsubp	%st(2)
+# CHECK-NEXT:  3      13    2.00    *             U     fisubs	(%ecx)
+# CHECK-NEXT:  3      13    2.00    *             U     fisubl	(%eax)
+# CHECK-NEXT:  1      3     1.00                  U     fsubr	%st(0), %st(1)
+# CHECK-NEXT:  1      3     1.00                  U     fsubr	%st(2)
+# CHECK-NEXT:  2      10    1.00    *             U     fsubrs	(%ecx)
+# CHECK-NEXT:  2      10    1.00    *             U     fsubrl	(%eax)
+# CHECK-NEXT:  1      3     1.00                  U     fsubrp	%st(1)
+# CHECK-NEXT:  1      3     1.00                  U     fsubrp	%st(2)
+# CHECK-NEXT:  3      13    2.00    *             U     fisubrs	(%ecx)
+# CHECK-NEXT:  3      13    2.00    *             U     fisubrl	(%eax)
+# CHECK-NEXT:  1      3     1.00                  U     ftst
+# CHECK-NEXT:  1      1     1.00                  U     fucom	%st(1)
+# CHECK-NEXT:  1      1     1.00                  U     fucom	%st(3)
+# CHECK-NEXT:  1      1     1.00                  U     fucomp	%st(1)
+# CHECK-NEXT:  1      1     1.00                  U     fucomp	%st(3)
+# CHECK-NEXT:  1      3     1.00                  U     fucompp
+# CHECK-NEXT:  3      3     1.00                  U     fucomi	%st(3)
+# CHECK-NEXT:  3      3     1.00                  U     fucompi	%st(3)
+# CHECK-NEXT:  1      100   0.33                  U     wait
+# CHECK-NEXT:  1      100   0.33                  U     fxam
+# CHECK-NEXT:  1      1     0.33                  U     fxch	%st(1)
+# CHECK-NEXT:  1      1     0.33                  U     fxch	%st(3)
+# CHECK-NEXT:  5      5     2.00    *      *      U     fxrstor	(%eax)
+# CHECK-NEXT:  1      100   0.33    *      *      U     fxsave	(%eax)
+# CHECK-NEXT:  1      100   0.33                  U     fxtract
+# CHECK-NEXT:  1      100   0.33                  U     fyl2x
+# CHECK-NEXT:  1      100   0.33                  U     fyl2xp1
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -     136.00 52.67  90.67  17.00  54.67  34.00  34.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     f2xm1
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     fabs
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     fadd	%st(0), %st(1)
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     fadd	%st(2)
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   fadds	(%ecx)
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   faddl	(%ecx)
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     faddp	%st(1)
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     faddp	%st(2)
+# CHECK-NEXT:  -      -      -     2.00    -      -     0.50   0.50   fiadds	(%ecx)
+# CHECK-NEXT:  -      -      -     2.00    -      -     0.50   0.50   fiaddl	(%ecx)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fbld	(%ecx)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fbstp	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     fchs
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fnclex
+# CHECK-NEXT:  -      -     0.50    -      -     2.50    -      -     fcmovb	%st(1), %st(0)
+# CHECK-NEXT:  -      -     0.50    -      -     2.50    -      -     fcmovbe	%st(1), %st(0)
+# CHECK-NEXT:  -      -     0.50    -      -     2.50    -      -     fcmove	%st(1), %st(0)
+# CHECK-NEXT:  -      -     0.50    -      -     2.50    -      -     fcmovnb	%st(1), %st(0)
+# CHECK-NEXT:  -      -     0.50    -      -     2.50    -      -     fcmovnbe	%st(1), %st(0)
+# CHECK-NEXT:  -      -     0.50    -      -     2.50    -      -     fcmovne	%st(1), %st(0)
+# CHECK-NEXT:  -      -     0.50    -      -     2.50    -      -     fcmovnu	%st(1), %st(0)
+# CHECK-NEXT:  -      -     0.50    -      -     2.50    -      -     fcmovu	%st(1), %st(0)
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     fcom	%st(1)
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     fcom	%st(3)
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   fcoms	(%ecx)
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   fcoml	(%eax)
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     fcomp	%st(1)
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     fcomp	%st(3)
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   fcomps	(%ecx)
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   fcompl	(%eax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fcompp
+# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -     fcomi	%st(3)
+# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -     fcompi	%st(3)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fcos
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     fdecstp
+# CHECK-NEXT:  -     14.00  1.00    -      -      -      -      -     fdiv	%st(0), %st(1)
+# CHECK-NEXT:  -     14.00  1.00    -      -      -      -      -     fdiv	%st(2)
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   fdivs	(%ecx)
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   fdivl	(%eax)
+# CHECK-NEXT:  -     14.00  1.00    -      -      -      -      -     fdivp	%st(1)
+# CHECK-NEXT:  -     14.00  1.00    -      -      -      -      -     fdivp	%st(2)
+# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   fidivs	(%ecx)
+# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   fidivl	(%eax)
+# CHECK-NEXT:  -     14.00  1.00    -      -      -      -      -     fdivr	%st(0), %st(1)
+# CHECK-NEXT:  -     14.00  1.00    -      -      -      -      -     fdivr	%st(2)
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   fdivrs	(%ecx)
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   fdivrl	(%eax)
+# CHECK-NEXT:  -     14.00  1.00    -      -      -      -      -     fdivrp	%st(1)
+# CHECK-NEXT:  -     14.00  1.00    -      -      -      -      -     fdivrp	%st(2)
+# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   fidivrs	(%ecx)
+# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   fidivrl	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     ffree	%st(0)
+# CHECK-NEXT:  -      -      -     2.00    -      -     0.50   0.50   ficoms	(%ecx)
+# CHECK-NEXT:  -      -      -     2.00    -      -     0.50   0.50   ficoml	(%eax)
+# CHECK-NEXT:  -      -      -     2.00    -      -     0.50   0.50   ficomps	(%ecx)
+# CHECK-NEXT:  -      -      -     2.00    -      -     0.50   0.50   ficompl	(%eax)
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   filds	(%edx)
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   fildl	(%ecx)
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   fildll	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     fincstp
+# CHECK-NEXT:  -      -     1.00   1.00    -     2.00    -      -     fninit
+# CHECK-NEXT:  -      -      -     1.00   1.00    -     1.00   1.00   fists	(%edx)
+# CHECK-NEXT:  -      -      -     1.00   1.00    -     1.00   1.00   fistl	(%ecx)
+# CHECK-NEXT:  -      -      -     1.00   1.00    -     1.00   1.00   fistps	(%edx)
+# CHECK-NEXT:  -      -      -     1.00   1.00    -     1.00   1.00   fistpl	(%ecx)
+# CHECK-NEXT:  -      -      -     1.00   1.00    -     1.00   1.00   fistpll	(%eax)
+# CHECK-NEXT:  -      -      -     1.00   1.00    -     0.50   0.50   fisttps	(%edx)
+# CHECK-NEXT:  -      -      -     1.00   1.00    -     0.50   0.50   fisttpl	(%ecx)
+# CHECK-NEXT:  -      -      -     1.00   1.00    -     0.50   0.50   fisttpll	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     fld	%st(0)
+# CHECK-NEXT:  -      -     0.50   0.50    -     1.00   0.50   0.50   flds	(%edx)
+# CHECK-NEXT:  -      -     0.50   0.50    -     1.00   0.50   0.50   fldl	(%ecx)
+# CHECK-NEXT:  -      -     0.50   0.50    -     1.00   0.50   0.50   fldt	(%eax)
+# CHECK-NEXT:  -      -      -      -     1.00   2.00   1.00   1.00   fldcw	(%eax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fldenv	(%eax)
+# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     fld1
+# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     fldl2e
+# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     fldl2t
+# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     fldlg2
+# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     fldln2
+# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     fldpi
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     fldz
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     fmul	%st(0), %st(1)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     fmul	%st(2)
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   fmuls	(%ecx)
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   fmull	(%eax)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     fmulp	%st(1)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     fmulp	%st(2)
+# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   fimuls	(%ecx)
+# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   fimull	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     fnop
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fpatan
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fprem
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fprem1
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fptan
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     frndint
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     frstor	(%eax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fnsave	(%eax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fscale
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fsin
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fsincos
+# CHECK-NEXT:  -     24.00  1.00    -      -      -      -      -     fsqrt
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     fst	%st(0)
+# CHECK-NEXT:  -      -      -      -     1.00    -     1.00   1.00   fsts	(%edx)
+# CHECK-NEXT:  -      -      -      -     1.00    -     1.00   1.00   fstl	(%ecx)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     fstp	%st(0)
+# CHECK-NEXT:  -      -      -      -     1.00    -     1.00   1.00   fstpl	(%edx)
+# CHECK-NEXT:  -      -      -      -     1.00    -     1.00   1.00   fstpl	(%ecx)
+# CHECK-NEXT:  -      -      -      -     1.00    -     1.00   1.00   fstpt	(%eax)
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00   1.00   fnstcw	(%eax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fnstenv	(%eax)
+# CHECK-NEXT:  -      -     1.00    -     1.00    -     1.00   1.00   fnstsw	(%eax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     frstor	(%eax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     wait
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fnsave	(%eax)
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     fsub	%st(0), %st(1)
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     fsub	%st(2)
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   fsubs	(%ecx)
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   fsubl	(%eax)
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     fsubp	%st(1)
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     fsubp	%st(2)
+# CHECK-NEXT:  -      -      -     2.00    -      -     0.50   0.50   fisubs	(%ecx)
+# CHECK-NEXT:  -      -      -     2.00    -      -     0.50   0.50   fisubl	(%eax)
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     fsubr	%st(0), %st(1)
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     fsubr	%st(2)
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   fsubrs	(%ecx)
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   fsubrl	(%eax)
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     fsubrp	%st(1)
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     fsubrp	%st(2)
+# CHECK-NEXT:  -      -      -     2.00    -      -     0.50   0.50   fisubrs	(%ecx)
+# CHECK-NEXT:  -      -      -     2.00    -      -     0.50   0.50   fisubrl	(%eax)
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     ftst
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     fucom	%st(1)
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     fucom	%st(3)
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     fucomp	%st(1)
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     fucomp	%st(3)
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     fucompp
+# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -     fucomi	%st(3)
+# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -     fucompi	%st(3)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     wait
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fxam
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fxch	%st(1)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fxch	%st(3)
+# CHECK-NEXT:  -      -     0.50   0.50   1.00   2.00   0.50   0.50   fxrstor	(%eax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fxsave	(%eax)
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fxtract
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fyl2x
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fyl2xp1
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-xop.s b/test/tools/llvm-mca/X86/BdVer2/resources-xop.s
new file mode 100644
index 00000000000..61f39f07d78
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-xop.s
@@ -0,0 +1,534 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+
+vfrczpd %xmm0, %xmm3
+vfrczpd (%rax), %xmm3
+
+vfrczpd %ymm0, %ymm3
+vfrczpd (%rax), %ymm3
+
+vfrczps %xmm0, %xmm3
+vfrczps (%rax), %xmm3
+
+vfrczps %ymm0, %ymm3
+vfrczps (%rax), %ymm3
+
+vfrczsd %xmm0, %xmm3
+vfrczsd (%rax), %xmm3
+
+vfrczss %xmm0, %xmm3
+vfrczss (%rax), %xmm3
+
+vpcmov %xmm0, %xmm1, %xmm2, %xmm3
+vpcmov (%rax), %xmm0, %xmm1, %xmm3
+vpcmov %xmm0, (%rax), %xmm1, %xmm3
+
+vpcmov %ymm0, %ymm1, %ymm2, %ymm3
+vpcmov (%rax), %ymm0, %ymm1, %ymm3
+vpcmov %ymm0, (%rax), %ymm1, %ymm3
+
+vpcomb $0, %xmm0, %xmm1, %xmm3
+vpcomb $0, (%rax), %xmm0, %xmm3
+
+vpcomd $0, %xmm0, %xmm1, %xmm3
+vpcomd $0, (%rax), %xmm0, %xmm3
+
+vpcomq $0, %xmm0, %xmm1, %xmm3
+vpcomq $0, (%rax), %xmm0, %xmm3
+
+vpcomub $0, %xmm0, %xmm1, %xmm3
+vpcomub $0, (%rax), %xmm0, %xmm3
+
+vpcomud $0, %xmm0, %xmm1, %xmm3
+vpcomud $0, (%rax), %xmm0, %xmm3
+
+vpcomuq $0, %xmm0, %xmm1, %xmm3
+vpcomuq $0, (%rax), %xmm0, %xmm3
+
+vpcomuw $0, %xmm0, %xmm1, %xmm3
+vpcomuw $0, (%rax), %xmm0, %xmm3
+
+vpcomw $0, %xmm0, %xmm1, %xmm3
+vpcomw $0, (%rax), %xmm0, %xmm3
+
+vpermil2pd $0, %xmm0, %xmm1, %xmm2, %xmm3
+vpermil2pd $0, (%rax), %xmm0, %xmm1, %xmm3
+vpermil2pd $0, %xmm0, (%rax), %xmm1, %xmm3
+
+vpermil2pd $0, %ymm0, %ymm1, %ymm2, %ymm3
+vpermil2pd $0, (%rax), %ymm0, %ymm1, %ymm3
+vpermil2pd $0, %ymm0, (%rax), %ymm1, %ymm3
+
+vpermil2ps $0, %xmm0, %xmm1, %xmm2, %xmm3
+vpermil2ps $0, (%rax), %xmm0, %xmm1, %xmm3
+vpermil2ps $0, %xmm0, (%rax), %xmm1, %xmm3
+
+vpermil2ps $0, %ymm0, %ymm1, %ymm2, %ymm3
+vpermil2ps $0, (%rax), %ymm0, %ymm1, %ymm3
+vpermil2ps $0, %ymm0, (%rax), %ymm1, %ymm3
+
+vphaddbd %xmm0, %xmm3
+vphaddbd (%rax), %xmm3
+
+vphaddbq %xmm0, %xmm3
+vphaddbq (%rax), %xmm3
+
+vphaddbw %xmm0, %xmm3
+vphaddbw (%rax), %xmm3
+
+vphadddq %xmm0, %xmm3
+vphadddq (%rax), %xmm3
+
+vphaddubd %xmm0, %xmm3
+vphaddubd (%rax), %xmm3
+
+vphaddubq %xmm0, %xmm3
+vphaddubq (%rax), %xmm3
+
+vphaddubw %xmm0, %xmm3
+vphaddubw (%rax), %xmm3
+
+vphaddudq %xmm0, %xmm3
+vphaddudq (%rax), %xmm3
+
+vphadduwd %xmm0, %xmm3
+vphadduwd (%rax), %xmm3
+
+vphadduwq %xmm0, %xmm3
+vphadduwq (%rax), %xmm3
+
+vphaddwd %xmm0, %xmm3
+vphaddwd (%rax), %xmm3
+
+vphaddwq %xmm0, %xmm3
+vphaddwq (%rax), %xmm3
+
+vphsubbw %xmm0, %xmm3
+vphsubbw (%rax), %xmm3
+
+vphsubdq %xmm0, %xmm3
+vphsubdq (%rax), %xmm3
+
+vphsubwd %xmm0, %xmm3
+vphsubwd (%rax), %xmm3
+
+vpmacsdd %xmm0, %xmm1, %xmm2, %xmm3
+vpmacsdd %xmm0, (%rax), %xmm1, %xmm3
+
+vpmacsdqh %xmm0, %xmm1, %xmm2, %xmm3
+vpmacsdqh %xmm0, (%rax), %xmm1, %xmm3
+
+vpmacsdql %xmm0, %xmm1, %xmm2, %xmm3
+vpmacsdql %xmm0, (%rax), %xmm1, %xmm3
+
+vpmacssdd %xmm0, %xmm1, %xmm2, %xmm3
+vpmacssdd %xmm0, (%rax), %xmm1, %xmm3
+
+vpmacssdqh %xmm0, %xmm1, %xmm2, %xmm3
+vpmacssdqh %xmm0, (%rax), %xmm1, %xmm3
+
+vpmacssdql %xmm0, %xmm1, %xmm2, %xmm3
+vpmacssdql %xmm0, (%rax), %xmm1, %xmm3
+
+vpmacsswd %xmm0, %xmm1, %xmm2, %xmm3
+vpmacsswd %xmm0, (%rax), %xmm1, %xmm3
+
+vpmacssww %xmm0, %xmm1, %xmm2, %xmm3
+vpmacssww %xmm0, (%rax), %xmm1, %xmm3
+
+vpmacswd %xmm0, %xmm1, %xmm2, %xmm3
+vpmacswd %xmm0, (%rax), %xmm1, %xmm3
+
+vpmacsww %xmm0, %xmm1, %xmm2, %xmm3
+vpmacsww %xmm0, (%rax), %xmm1, %xmm3
+
+vpmadcsswd %xmm0, %xmm1, %xmm2, %xmm3
+vpmadcsswd %xmm0, (%rax), %xmm1, %xmm3
+
+vpmadcswd %xmm0, %xmm1, %xmm2, %xmm3
+vpmadcswd %xmm0, (%rax), %xmm1, %xmm3
+
+vpperm %xmm0, %xmm1, %xmm2, %xmm3
+vpperm (%rax), %xmm0, %xmm1, %xmm3
+vpperm %xmm0, (%rax), %xmm1, %xmm3
+
+vprotb %xmm0, %xmm1, %xmm3
+vprotb (%rax), %xmm0, %xmm3
+vprotb %xmm0, (%rax), %xmm3
+
+vprotb $0, %xmm0, %xmm3
+vprotb $0, (%rax), %xmm3
+
+vprotd %xmm0, %xmm1, %xmm3
+vprotd (%rax), %xmm0, %xmm3
+vprotd %xmm0, (%rax), %xmm3
+
+vprotd $0, %xmm0, %xmm3
+vprotd $0, (%rax), %xmm3
+
+vprotq %xmm0, %xmm1, %xmm3
+vprotq (%rax), %xmm0, %xmm3
+vprotq %xmm0, (%rax), %xmm3
+
+vprotq $0, %xmm0, %xmm3
+vprotq $0, (%rax), %xmm3
+
+vprotw %xmm0, %xmm1, %xmm3
+vprotw (%rax), %xmm0, %xmm3
+vprotw %xmm0, (%rax), %xmm3
+
+vprotw $0, %xmm0, %xmm3
+vprotw $0, (%rax), %xmm3
+
+vpshab %xmm0, %xmm1, %xmm3
+vpshab (%rax), %xmm0, %xmm3
+vpshab %xmm0, (%rax), %xmm3
+
+vpshad %xmm0, %xmm1, %xmm3
+vpshad (%rax), %xmm0, %xmm3
+vpshad %xmm0, (%rax), %xmm3
+
+vpshaq %xmm0, %xmm1, %xmm3
+vpshaq (%rax), %xmm0, %xmm3
+vpshaq %xmm0, (%rax), %xmm3
+
+vpshaw %xmm0, %xmm1, %xmm3
+vpshaw (%rax), %xmm0, %xmm3
+vpshaw %xmm0, (%rax), %xmm3
+
+vpshlb %xmm0, %xmm1, %xmm3
+vpshlb (%rax), %xmm0, %xmm3
+vpshlb %xmm0, (%rax), %xmm3
+
+vpshld %xmm0, %xmm1, %xmm3
+vpshld (%rax), %xmm0, %xmm3
+vpshld %xmm0, (%rax), %xmm3
+
+vpshlq %xmm0, %xmm1, %xmm3
+vpshlq (%rax), %xmm0, %xmm3
+vpshlq %xmm0, (%rax), %xmm3
+
+vpshlw %xmm0, %xmm1, %xmm3
+vpshlw (%rax), %xmm0, %xmm3
+vpshlw %xmm0, (%rax), %xmm3
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      3     1.00                        vfrczpd	%xmm0, %xmm3
+# CHECK-NEXT:  2      9     1.00    *                   vfrczpd	(%rax), %xmm3
+# CHECK-NEXT:  1      3     1.00                        vfrczpd	%ymm0, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfrczpd	(%rax), %ymm3
+# CHECK-NEXT:  1      3     1.00                        vfrczps	%xmm0, %xmm3
+# CHECK-NEXT:  2      9     1.00    *                   vfrczps	(%rax), %xmm3
+# CHECK-NEXT:  1      3     1.00                        vfrczps	%ymm0, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfrczps	(%rax), %ymm3
+# CHECK-NEXT:  1      3     1.00                        vfrczsd	%xmm0, %xmm3
+# CHECK-NEXT:  2      9     1.00    *                   vfrczsd	(%rax), %xmm3
+# CHECK-NEXT:  1      3     1.00                        vfrczss	%xmm0, %xmm3
+# CHECK-NEXT:  2      9     1.00    *                   vfrczss	(%rax), %xmm3
+# CHECK-NEXT:  1      1     0.50                        vpcmov	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      7     0.50    *                   vpcmov	(%rax), %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  2      7     0.50    *                   vpcmov	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  1      1     1.00                        vpcmov	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      8     1.00    *                   vpcmov	(%rax), %ymm0, %ymm1, %ymm3
+# CHECK-NEXT:  2      8     1.00    *                   vpcmov	%ymm0, (%rax), %ymm1, %ymm3
+# CHECK-NEXT:  1      1     0.50                        vpcomb	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  2      7     0.50    *                   vpcomb	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      1     0.50                        vpcomd	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  2      7     0.50    *                   vpcomd	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      1     0.50                        vpcomq	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  2      7     0.50    *                   vpcomq	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      1     0.50                        vpcomub	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  2      7     0.50    *                   vpcomub	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      1     0.50                        vpcomud	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  2      7     0.50    *                   vpcomud	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      1     0.50                        vpcomuq	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  2      7     0.50    *                   vpcomuq	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      1     0.50                        vpcomuw	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  2      7     0.50    *                   vpcomuw	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      1     0.50                        vpcomw	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  2      7     0.50    *                   vpcomw	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      1     1.00                        vpermil2pd	$0, %xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      7     1.00    *                   vpermil2pd	$0, (%rax), %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  2      7     1.00    *                   vpermil2pd	$0, %xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  1      1     1.00                        vpermil2pd	$0, %ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      8     1.00    *                   vpermil2pd	$0, (%rax), %ymm0, %ymm1, %ymm3
+# CHECK-NEXT:  2      8     1.00    *                   vpermil2pd	$0, %ymm0, (%rax), %ymm1, %ymm3
+# CHECK-NEXT:  1      1     1.00                        vpermil2ps	$0, %xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      7     1.00    *                   vpermil2ps	$0, (%rax), %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  2      7     1.00    *                   vpermil2ps	$0, %xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  1      1     1.00                        vpermil2ps	$0, %ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      8     1.00    *                   vpermil2ps	$0, (%rax), %ymm0, %ymm1, %ymm3
+# CHECK-NEXT:  2      8     1.00    *                   vpermil2ps	$0, %ymm0, (%rax), %ymm1, %ymm3
+# CHECK-NEXT:  3      3     1.50                        vphaddbd	%xmm0, %xmm3
+# CHECK-NEXT:  4      9     1.50    *                   vphaddbd	(%rax), %xmm3
+# CHECK-NEXT:  3      3     1.50                        vphaddbq	%xmm0, %xmm3
+# CHECK-NEXT:  4      9     1.50    *                   vphaddbq	(%rax), %xmm3
+# CHECK-NEXT:  3      3     1.50                        vphaddbw	%xmm0, %xmm3
+# CHECK-NEXT:  4      9     1.50    *                   vphaddbw	(%rax), %xmm3
+# CHECK-NEXT:  3      3     1.50                        vphadddq	%xmm0, %xmm3
+# CHECK-NEXT:  4      9     1.50    *                   vphadddq	(%rax), %xmm3
+# CHECK-NEXT:  3      3     1.50                        vphaddubd	%xmm0, %xmm3
+# CHECK-NEXT:  4      9     1.50    *                   vphaddubd	(%rax), %xmm3
+# CHECK-NEXT:  3      3     1.50                        vphaddubq	%xmm0, %xmm3
+# CHECK-NEXT:  4      9     1.50    *                   vphaddubq	(%rax), %xmm3
+# CHECK-NEXT:  3      3     1.50                        vphaddubw	%xmm0, %xmm3
+# CHECK-NEXT:  4      9     1.50    *                   vphaddubw	(%rax), %xmm3
+# CHECK-NEXT:  3      3     1.50                        vphaddudq	%xmm0, %xmm3
+# CHECK-NEXT:  4      9     1.50    *                   vphaddudq	(%rax), %xmm3
+# CHECK-NEXT:  3      3     1.50                        vphadduwd	%xmm0, %xmm3
+# CHECK-NEXT:  4      9     1.50    *                   vphadduwd	(%rax), %xmm3
+# CHECK-NEXT:  3      3     1.50                        vphadduwq	%xmm0, %xmm3
+# CHECK-NEXT:  4      9     1.50    *                   vphadduwq	(%rax), %xmm3
+# CHECK-NEXT:  3      3     1.50                        vphaddwd	%xmm0, %xmm3
+# CHECK-NEXT:  4      9     1.50    *                   vphaddwd	(%rax), %xmm3
+# CHECK-NEXT:  3      3     1.50                        vphaddwq	%xmm0, %xmm3
+# CHECK-NEXT:  4      9     1.50    *                   vphaddwq	(%rax), %xmm3
+# CHECK-NEXT:  3      3     1.50                        vphsubbw	%xmm0, %xmm3
+# CHECK-NEXT:  4      9     1.50    *                   vphsubbw	(%rax), %xmm3
+# CHECK-NEXT:  3      3     1.50                        vphsubdq	%xmm0, %xmm3
+# CHECK-NEXT:  4      9     1.50    *                   vphsubdq	(%rax), %xmm3
+# CHECK-NEXT:  3      3     1.50                        vphsubwd	%xmm0, %xmm3
+# CHECK-NEXT:  4      9     1.50    *                   vphsubwd	(%rax), %xmm3
+# CHECK-NEXT:  1      5     1.00                        vpmacsdd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      11    1.00    *                   vpmacsdd	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  1      5     1.00                        vpmacsdqh	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      11    1.00    *                   vpmacsdqh	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  1      5     1.00                        vpmacsdql	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      11    1.00    *                   vpmacsdql	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  1      5     1.00                        vpmacssdd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      11    1.00    *                   vpmacssdd	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  1      5     1.00                        vpmacssdqh	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      11    1.00    *                   vpmacssdqh	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  1      5     1.00                        vpmacssdql	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      11    1.00    *                   vpmacssdql	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  1      5     1.00                        vpmacsswd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      11    1.00    *                   vpmacsswd	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  1      5     1.00                        vpmacssww	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      11    1.00    *                   vpmacssww	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  1      5     1.00                        vpmacswd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      11    1.00    *                   vpmacswd	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  1      5     1.00                        vpmacsww	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      11    1.00    *                   vpmacsww	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  1      5     1.00                        vpmadcsswd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      11    1.00    *                   vpmadcsswd	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  1      5     1.00                        vpmadcswd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      11    1.00    *                   vpmadcswd	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  1      1     0.50                        vpperm	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  2      7     0.50    *                   vpperm	(%rax), %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  2      7     0.50    *                   vpperm	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  1      1     1.00                        vprotb	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  2      7     1.00    *                   vprotb	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  2      7     1.00    *                   vprotb	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  1      1     1.00                        vprotb	$0, %xmm0, %xmm3
+# CHECK-NEXT:  2      7     1.00    *                   vprotb	$0, (%rax), %xmm3
+# CHECK-NEXT:  1      1     1.00                        vprotd	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  2      7     1.00    *                   vprotd	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  2      7     1.00    *                   vprotd	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  1      1     1.00                        vprotd	$0, %xmm0, %xmm3
+# CHECK-NEXT:  2      7     1.00    *                   vprotd	$0, (%rax), %xmm3
+# CHECK-NEXT:  1      1     1.00                        vprotq	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  2      7     1.00    *                   vprotq	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  2      7     1.00    *                   vprotq	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  1      1     1.00                        vprotq	$0, %xmm0, %xmm3
+# CHECK-NEXT:  2      7     1.00    *                   vprotq	$0, (%rax), %xmm3
+# CHECK-NEXT:  1      1     1.00                        vprotw	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  2      7     1.00    *                   vprotw	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  2      7     1.00    *                   vprotw	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  1      1     1.00                        vprotw	$0, %xmm0, %xmm3
+# CHECK-NEXT:  2      7     1.00    *                   vprotw	$0, (%rax), %xmm3
+# CHECK-NEXT:  1      1     1.00                        vpshab	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  2      7     1.00    *                   vpshab	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  2      7     1.00    *                   vpshab	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  1      1     1.00                        vpshad	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  2      7     1.00    *                   vpshad	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  2      7     1.00    *                   vpshad	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  1      1     1.00                        vpshaq	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  2      7     1.00    *                   vpshaq	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  2      7     1.00    *                   vpshaq	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  1      1     1.00                        vpshaw	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  2      7     1.00    *                   vpshaw	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  2      7     1.00    *                   vpshaw	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  1      1     1.00                        vpshlb	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  2      7     1.00    *                   vpshlb	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  2      7     1.00    *                   vpshlb	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  1      1     1.00                        vpshld	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  2      7     1.00    *                   vpshld	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  2      7     1.00    *                   vpshld	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  1      1     1.00                        vpshlq	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  2      7     1.00    *                   vpshlq	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  2      7     1.00    *                   vpshlq	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  1      1     1.00                        vpshlw	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  2      7     1.00    *                   vpshlw	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  2      7     1.00    *                   vpshlw	%xmm0, (%rax), %xmm3
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     68.00  68.00   -     71.00  41.50  41.50
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vfrczpd	%xmm0, %xmm3
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vfrczpd	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vfrczpd	%ymm0, %ymm3
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vfrczpd	(%rax), %ymm3
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vfrczps	%xmm0, %xmm3
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vfrczps	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vfrczps	%ymm0, %ymm3
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vfrczps	(%rax), %ymm3
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vfrczsd	%xmm0, %xmm3
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vfrczsd	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vfrczss	%xmm0, %xmm3
+# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vfrczss	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpcmov	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpcmov	(%rax), %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpcmov	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpcmov	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vpcmov	(%rax), %ymm0, %ymm1, %ymm3
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vpcmov	%ymm0, (%rax), %ymm1, %ymm3
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpcomb	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpcomb	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpcomd	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpcomd	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpcomq	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpcomq	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpcomub	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpcomub	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpcomud	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpcomud	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpcomuq	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpcomuq	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpcomuw	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpcomuw	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpcomw	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpcomw	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpermil2pd	$0, %xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vpermil2pd	$0, (%rax), %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vpermil2pd	$0, %xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpermil2pd	$0, %ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vpermil2pd	$0, (%rax), %ymm0, %ymm1, %ymm3
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vpermil2pd	$0, %ymm0, (%rax), %ymm1, %ymm3
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpermil2ps	$0, %xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vpermil2ps	$0, (%rax), %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vpermil2ps	$0, %xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpermil2ps	$0, %ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vpermil2ps	$0, (%rax), %ymm0, %ymm1, %ymm3
+# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vpermil2ps	$0, %ymm0, (%rax), %ymm1, %ymm3
+# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     vphaddbd	%xmm0, %xmm3
+# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   vphaddbd	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     vphaddbq	%xmm0, %xmm3
+# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   vphaddbq	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     vphaddbw	%xmm0, %xmm3
+# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   vphaddbw	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     vphadddq	%xmm0, %xmm3
+# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   vphadddq	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     vphaddubd	%xmm0, %xmm3
+# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   vphaddubd	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     vphaddubq	%xmm0, %xmm3
+# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   vphaddubq	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     vphaddubw	%xmm0, %xmm3
+# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   vphaddubw	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     vphaddudq	%xmm0, %xmm3
+# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   vphaddudq	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     vphadduwd	%xmm0, %xmm3
+# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   vphadduwd	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     vphadduwq	%xmm0, %xmm3
+# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   vphadduwq	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     vphaddwd	%xmm0, %xmm3
+# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   vphaddwd	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     vphaddwq	%xmm0, %xmm3
+# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   vphaddwq	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     vphsubbw	%xmm0, %xmm3
+# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   vphsubbw	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     vphsubdq	%xmm0, %xmm3
+# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   vphsubdq	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     vphsubwd	%xmm0, %xmm3
+# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   vphsubwd	(%rax), %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmacsdd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmacsdd	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmacsdqh	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmacsdqh	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmacsdql	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmacsdql	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmacssdd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmacssdd	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmacssdqh	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmacssdqh	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmacssdql	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmacssdql	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmacsswd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmacsswd	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmacssww	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmacssww	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmacswd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmacswd	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmacsww	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmacsww	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmadcsswd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmadcsswd	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmadcswd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmadcswd	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpperm	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpperm	(%rax), %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpperm	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vprotb	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vprotb	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vprotb	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vprotb	$0, %xmm0, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vprotb	$0, (%rax), %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vprotd	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vprotd	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vprotd	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vprotd	$0, %xmm0, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vprotd	$0, (%rax), %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vprotq	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vprotq	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vprotq	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vprotq	$0, %xmm0, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vprotq	$0, (%rax), %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vprotw	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vprotw	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vprotw	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vprotw	$0, %xmm0, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vprotw	$0, (%rax), %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpshab	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpshab	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpshab	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpshad	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpshad	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpshad	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpshaq	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpshaq	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpshaq	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpshaw	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpshaw	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpshaw	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpshlb	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpshlb	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpshlb	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpshld	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpshld	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpshld	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpshlq	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpshlq	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpshlq	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpshlw	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpshlw	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpshlw	%xmm0, (%rax), %xmm3
diff --git a/test/tools/llvm-mca/X86/BdVer2/scheduler-queue-usage.s b/test/tools/llvm-mca/X86/BdVer2/scheduler-queue-usage.s
new file mode 100644
index 00000000000..b2a8f4ac33e
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/scheduler-queue-usage.s
@@ -0,0 +1,60 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1 -scheduler-stats < %s | FileCheck %s
+
+vmulps (%rsi), %xmm0, %xmm0
+add  %rsi, %rsi
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      2
+# CHECK-NEXT: Total Cycles:      14
+# CHECK-NEXT: Total uOps:        3
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.21
+# CHECK-NEXT: IPC:               0.14
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  2      11    1.00    *                   vmulps	(%rsi), %xmm0, %xmm0
+# CHECK-NEXT:  1      1     0.33                        addq	%rsi, %rsi
+
+# CHECK:      Schedulers - number of cycles where we saw N instructions issued:
+# CHECK-NEXT: [# issued], [# cycles]
+# CHECK-NEXT:  0,          13  (92.9%)
+# CHECK-NEXT:  2,          1  (7.1%)
+
+# CHECK:      Scheduler's queue usage:
+# CHECK-NEXT: [1] Resource name.
+# CHECK-NEXT: [2] Average number of used buffer entries.
+# CHECK-NEXT: [3] Maximum number of used buffer entries.
+# CHECK-NEXT: [4] Total number of buffer entries.
+
+# CHECK:       [1]            [2]        [3]        [4]
+# CHECK-NEXT: SBPortAny        0          2          54
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     1.00    -      -     1.00    -     1.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -     1.00    -      -      -      -     1.00   vmulps	(%rsi), %xmm0, %xmm0
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     addq	%rsi, %rsi
diff --git a/test/tools/llvm-mca/X86/BdVer2/simple-test.s b/test/tools/llvm-mca/X86/BdVer2/simple-test.s
new file mode 100644
index 00000000000..f0ff718c9e7
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/simple-test.s
@@ -0,0 +1,43 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=100 < %s | FileCheck %s
+
+add %edi, %eax
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      100
+# CHECK-NEXT: Total Cycles:      103
+# CHECK-NEXT: Total uOps:        100
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.97
+# CHECK-NEXT: IPC:               0.97
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     0.33                        addl	%edi, %eax
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.34    -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -     0.33   0.33    -     0.34    -      -     addl	%edi, %eax
diff --git a/test/tools/llvm-mca/X86/BdVer2/vbroadcast-operand-latency.s b/test/tools/llvm-mca/X86/BdVer2/vbroadcast-operand-latency.s
new file mode 100644
index 00000000000..ee54b757da2
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/vbroadcast-operand-latency.s
@@ -0,0 +1,67 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -timeline -timeline-max-iterations=3 < %s | FileCheck %s
+
+leaq 8(%rsp, %rdi, 2), %rax
+vbroadcastss (%rax), %ymm0
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      60
+# CHECK-NEXT: Total uOps:        200
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    3.33
+# CHECK-NEXT: IPC:               3.33
+# CHECK-NEXT: Block RThroughput: 0.5
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     0.50                        leaq	8(%rsp,%rdi,2), %rax
+# CHECK-NEXT:  1      7     0.50    *                   vbroadcastss	(%rax), %ymm0
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	8(%rsp,%rdi,2), %rax
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vbroadcastss	(%rax), %ymm0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     01
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeER .    ..   leaq	8(%rsp,%rdi,2), %rax
+# CHECK-NEXT: [0,1]     D=eeeeeeeER.   vbroadcastss	(%rax), %ymm0
+# CHECK-NEXT: [1,0]     DeE-------R.   leaq	8(%rsp,%rdi,2), %rax
+# CHECK-NEXT: [1,1]     D=eeeeeeeER.   vbroadcastss	(%rax), %ymm0
+# CHECK-NEXT: [2,0]     .DeE------R.   leaq	8(%rsp,%rdi,2), %rax
+# CHECK-NEXT: [2,1]     .D=eeeeeeeER   vbroadcastss	(%rax), %ymm0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     1.0    1.0    4.3       leaq	8(%rsp,%rdi,2), %rax
+# CHECK-NEXT: 1.     3     2.0    0.0    0.0       vbroadcastss	(%rax), %ymm0
diff --git a/test/tools/llvm-mca/X86/BdVer2/vec-logic-read-after-ld-1.s b/test/tools/llvm-mca/X86/BdVer2/vec-logic-read-after-ld-1.s
new file mode 100644
index 00000000000..721d276f2f4
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/vec-logic-read-after-ld-1.s
@@ -0,0 +1,43 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s
+
+vaddps %xmm0, %xmm0, %xmm1
+vandps (%rdi), %xmm1, %xmm2
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      2
+# CHECK-NEXT: Total Cycles:      10
+# CHECK-NEXT: Total uOps:        3
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.30
+# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      3     1.00                        vaddps	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT:  2      7     1.00    *                   vandps	(%rdi), %xmm1, %xmm2
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeER   .   vaddps	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [0,1]     DeeeeeeeER   vandps	(%rdi), %xmm1, %xmm2
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       vaddps	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       vandps	(%rdi), %xmm1, %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/vec-logic-read-after-ld-2.s b/test/tools/llvm-mca/X86/BdVer2/vec-logic-read-after-ld-2.s
new file mode 100644
index 00000000000..4768971eb52
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/vec-logic-read-after-ld-2.s
@@ -0,0 +1,44 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s
+
+vaddps %ymm0, %ymm0, %ymm1
+vandps (%rdi), %ymm1, %ymm2
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      2
+# CHECK-NEXT: Total Cycles:      11
+# CHECK-NEXT: Total uOps:        3
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.27
+# CHECK-NEXT: IPC:               0.18
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT:  2      8     1.00    *                   vandps	(%rdi), %ymm1, %ymm2
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeER    .   vaddps	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT: [0,1]     DeeeeeeeeER   vandps	(%rdi), %ymm1, %ymm2
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       vaddps	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       vandps	(%rdi), %ymm1, %ymm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/xop-super-registers-1.s b/test/tools/llvm-mca/X86/BdVer2/xop-super-registers-1.s
new file mode 100644
index 00000000000..d7d99861cfb
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/xop-super-registers-1.s
@@ -0,0 +1,89 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -timeline -timeline-max-iterations=2 < %s | FileCheck %s
+
+  vmulps  %ymm0, %ymm1, %ymm2
+  vfrczpd %xmm1, %xmm2
+  vmulps  %ymm2, %ymm3, %ymm4
+  vaddps  %ymm4, %ymm5, %ymm6
+  vmulps  %ymm6, %ymm3, %ymm4
+  vaddps  %ymm4, %ymm5, %ymm0
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      600
+# CHECK-NEXT: Total Cycles:      318
+# CHECK-NEXT: Total uOps:        600
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.89
+# CHECK-NEXT: IPC:               1.89
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      5     1.00                        vmulps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  1      3     1.00                        vfrczpd	%xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vmulps	%ymm2, %ymm3, %ymm4
+# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm4, %ymm5, %ymm6
+# CHECK-NEXT:  1      5     1.00                        vmulps	%ymm6, %ymm3, %ymm4
+# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm4, %ymm5, %ymm0
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     3.00   3.00    -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmulps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vfrczpd	%xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmulps	%ymm2, %ymm3, %ymm4
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddps	%ymm4, %ymm5, %ymm6
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmulps	%ymm6, %ymm3, %ymm4
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddps	%ymm4, %ymm5, %ymm0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    ..   vmulps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: [0,1]     DeeeE--R  .    .    .    ..   vfrczpd	%xmm1, %xmm2
+# CHECK-NEXT: [0,2]     D===eeeeeER    .    .    ..   vmulps	%ymm2, %ymm3, %ymm4
+# CHECK-NEXT: [0,3]     D========eeeER .    .    ..   vaddps	%ymm4, %ymm5, %ymm6
+# CHECK-NEXT: [0,4]     .D==========eeeeeER .    ..   vmulps	%ymm6, %ymm3, %ymm4
+# CHECK-NEXT: [0,5]     .D===============eeeER   ..   vaddps	%ymm4, %ymm5, %ymm0
+# CHECK-NEXT: [1,0]     .D==================eeeeeER   vmulps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: [1,1]     .DeeeE--------------------R   vfrczpd	%xmm1, %xmm2
+# CHECK-NEXT: [1,2]     . D==eeeeeE---------------R   vmulps	%ymm2, %ymm3, %ymm4
+# CHECK-NEXT: [1,3]     . D=======eeeE------------R   vaddps	%ymm4, %ymm5, %ymm6
+# CHECK-NEXT: [1,4]     . D==========eeeeeE-------R   vmulps	%ymm6, %ymm3, %ymm4
+# CHECK-NEXT: [1,5]     . D===============eeeE----R   vaddps	%ymm4, %ymm5, %ymm0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     10.0   0.5    0.0       vmulps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.     2     1.0    1.0    11.0      vfrczpd	%xmm1, %xmm2
+# CHECK-NEXT: 2.     2     3.5    0.0    7.5       vmulps	%ymm2, %ymm3, %ymm4
+# CHECK-NEXT: 3.     2     8.5    0.0    6.0       vaddps	%ymm4, %ymm5, %ymm6
+# CHECK-NEXT: 4.     2     11.0   0.0    3.5       vmulps	%ymm6, %ymm3, %ymm4
+# CHECK-NEXT: 5.     2     16.0   0.0    2.0       vaddps	%ymm4, %ymm5, %ymm0
diff --git a/test/tools/llvm-mca/X86/BdVer2/xop-super-registers-2.s b/test/tools/llvm-mca/X86/BdVer2/xop-super-registers-2.s
new file mode 100644
index 00000000000..ba59a86a048
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/xop-super-registers-2.s
@@ -0,0 +1,89 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -timeline -timeline-max-iterations=2 < %s | FileCheck %s
+
+  vmulps     %ymm0, %ymm1, %ymm2
+  vpermil2pd $16, %xmm3, %xmm5, %xmm1, %xmm2
+  vmulps     %ymm2, %ymm3, %ymm4
+  vaddps     %ymm4, %ymm5, %ymm6
+  vmulps     %ymm6, %ymm3, %ymm4
+  vaddps     %ymm4, %ymm5, %ymm0
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      600
+# CHECK-NEXT: Total Cycles:      316
+# CHECK-NEXT: Total uOps:        600
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.90
+# CHECK-NEXT: IPC:               1.90
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      5     1.00                        vmulps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  1      1     1.00                        vpermil2pd	$16, %xmm3, %xmm5, %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vmulps	%ymm2, %ymm3, %ymm4
+# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm4, %ymm5, %ymm6
+# CHECK-NEXT:  1      5     1.00                        vmulps	%ymm6, %ymm3, %ymm4
+# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm4, %ymm5, %ymm0
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     3.00   2.00    -     1.00    -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmulps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpermil2pd	$16, %xmm3, %xmm5, %xmm1, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmulps	%ymm2, %ymm3, %ymm4
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddps	%ymm4, %ymm5, %ymm6
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmulps	%ymm6, %ymm3, %ymm4
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddps	%ymm4, %ymm5, %ymm0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .   .   vmulps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: [0,1]     DeE----R  .    .    .   .   vpermil2pd	$16, %xmm3, %xmm5, %xmm1, %xmm2
+# CHECK-NEXT: [0,2]     D=eeeeeER .    .    .   .   vmulps	%ymm2, %ymm3, %ymm4
+# CHECK-NEXT: [0,3]     D======eeeER   .    .   .   vaddps	%ymm4, %ymm5, %ymm6
+# CHECK-NEXT: [0,4]     .D========eeeeeER   .   .   vmulps	%ymm6, %ymm3, %ymm4
+# CHECK-NEXT: [0,5]     .D=============eeeER.   .   vaddps	%ymm4, %ymm5, %ymm0
+# CHECK-NEXT: [1,0]     .D================eeeeeER   vmulps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: [1,1]     .DeE--------------------R   vpermil2pd	$16, %xmm3, %xmm5, %xmm1, %xmm2
+# CHECK-NEXT: [1,2]     . DeeeeeE---------------R   vmulps	%ymm2, %ymm3, %ymm4
+# CHECK-NEXT: [1,3]     . D=====eeeE------------R   vaddps	%ymm4, %ymm5, %ymm6
+# CHECK-NEXT: [1,4]     . D========eeeeeE-------R   vmulps	%ymm6, %ymm3, %ymm4
+# CHECK-NEXT: [1,5]     . D=============eeeE----R   vaddps	%ymm4, %ymm5, %ymm0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     9.0    0.5    0.0       vmulps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.     2     1.0    1.0    12.0      vpermil2pd	$16, %xmm3, %xmm5, %xmm1, %xmm2
+# CHECK-NEXT: 2.     2     1.5    0.0    7.5       vmulps	%ymm2, %ymm3, %ymm4
+# CHECK-NEXT: 3.     2     6.5    0.0    6.0       vaddps	%ymm4, %ymm5, %ymm6
+# CHECK-NEXT: 4.     2     9.0    0.0    3.5       vmulps	%ymm6, %ymm3, %ymm4
+# CHECK-NEXT: 5.     2     14.0   0.0    2.0       vaddps	%ymm4, %ymm5, %ymm0
diff --git a/test/tools/llvm-mca/X86/BdVer2/zero-idioms-avx-256.s b/test/tools/llvm-mca/X86/BdVer2/zero-idioms-avx-256.s
new file mode 100644
index 00000000000..8290cacdf05
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/zero-idioms-avx-256.s
@@ -0,0 +1,365 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -timeline -timeline-max-iterations=3 < %s | FileCheck %s
+
+# TODO: Fix the processor resource usage for zero-idiom YMM XOR instructions.
+#       Those vector XOR instructions should only consume 1cy of JFPU1 (instead
+#       of 2cy).
+
+# LLVM-MCA-BEGIN ZERO-IDIOM-1
+
+vaddps %ymm0, %ymm0, %ymm1
+vxorps %ymm1, %ymm1, %ymm1
+vblendps $2, %ymm1, %ymm2, %ymm3
+
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN ZERO-IDIOM-2
+
+vaddpd %ymm0, %ymm0, %ymm1
+vxorpd %ymm1, %ymm1, %ymm1
+vblendpd $2, %ymm1, %ymm2, %ymm3
+
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN ZERO-IDIOM-3
+vaddps %ymm0, %ymm1, %ymm2
+vandnps %ymm2, %ymm2, %ymm3
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN ZERO-IDIOM-4
+vaddps %ymm0, %ymm1, %ymm2
+vandnps %ymm2, %ymm2, %ymm3
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN ZERO-IDIOM-5
+vperm2f128 $136, %ymm0, %ymm0, %ymm1
+vaddps  %ymm1, %ymm1, %ymm0
+# LLVM-MCA-END
+
+# CHECK:      [0] Code Region - ZERO-IDIOM-1
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      300
+# CHECK-NEXT: Total Cycles:      107
+# CHECK-NEXT: Total uOps:        300
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    2.80
+# CHECK-NEXT: IPC:               2.80
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT:  1      1     1.00                        vxorps	%ymm1, %ymm1, %ymm1
+# CHECK-NEXT:  1      1     0.50                        vblendps	$2, %ymm1, %ymm2, %ymm3
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddps	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vxorps	%ymm1, %ymm1, %ymm1
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vblendps	$2, %ymm1, %ymm2, %ymm3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeER   .   vaddps	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT: [0,1]     D===eER  .   vxorps	%ymm1, %ymm1, %ymm1
+# CHECK-NEXT: [0,2]     D====eER .   vblendps	$2, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: [1,0]     D=eeeE-R .   vaddps	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT: [1,1]     .D===eER .   vxorps	%ymm1, %ymm1, %ymm1
+# CHECK-NEXT: [1,2]     .D====eER.   vblendps	$2, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: [2,0]     .D=eeeE-R.   vaddps	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT: [2,1]     .D====eER.   vxorps	%ymm1, %ymm1, %ymm1
+# CHECK-NEXT: [2,2]     . D====eER   vblendps	$2, %ymm1, %ymm2, %ymm3
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     1.7    1.7    0.7       vaddps	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT: 1.     3     4.3    0.0    0.0       vxorps	%ymm1, %ymm1, %ymm1
+# CHECK-NEXT: 2.     3     5.0    0.0    0.0       vblendps	$2, %ymm1, %ymm2, %ymm3
+
+# CHECK:      [1] Code Region - ZERO-IDIOM-2
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      300
+# CHECK-NEXT: Total Cycles:      107
+# CHECK-NEXT: Total uOps:        300
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    2.80
+# CHECK-NEXT: IPC:               2.80
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      3     1.00                        vaddpd	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT:  1      1     1.00                        vxorpd	%ymm1, %ymm1, %ymm1
+# CHECK-NEXT:  1      1     0.50                        vblendpd	$2, %ymm1, %ymm2, %ymm3
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddpd	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vxorpd	%ymm1, %ymm1, %ymm1
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vblendpd	$2, %ymm1, %ymm2, %ymm3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeER   .   vaddpd	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT: [0,1]     D===eER  .   vxorpd	%ymm1, %ymm1, %ymm1
+# CHECK-NEXT: [0,2]     D====eER .   vblendpd	$2, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: [1,0]     D=eeeE-R .   vaddpd	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT: [1,1]     .D===eER .   vxorpd	%ymm1, %ymm1, %ymm1
+# CHECK-NEXT: [1,2]     .D====eER.   vblendpd	$2, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: [2,0]     .D=eeeE-R.   vaddpd	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT: [2,1]     .D====eER.   vxorpd	%ymm1, %ymm1, %ymm1
+# CHECK-NEXT: [2,2]     . D====eER   vblendpd	$2, %ymm1, %ymm2, %ymm3
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     1.7    1.7    0.7       vaddpd	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT: 1.     3     4.3    0.0    0.0       vxorpd	%ymm1, %ymm1, %ymm1
+# CHECK-NEXT: 2.     3     5.0    0.0    0.0       vblendpd	$2, %ymm1, %ymm2, %ymm3
+
+# CHECK:      [2] Code Region - ZERO-IDIOM-3
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      106
+# CHECK-NEXT: Total uOps:        200
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.89
+# CHECK-NEXT: IPC:               1.89
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  1      1     1.00                        vandnps	%ymm2, %ymm2, %ymm3
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vandnps	%ymm2, %ymm2, %ymm3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     012345678
+
+# CHECK:      [0,0]     DeeeER  .   vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: [0,1]     D===eER .   vandnps	%ymm2, %ymm2, %ymm3
+# CHECK-NEXT: [1,0]     D=eeeER .   vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: [1,1]     D====eER.   vandnps	%ymm2, %ymm2, %ymm3
+# CHECK-NEXT: [2,0]     .D=eeeER.   vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: [2,1]     .D====eER   vandnps	%ymm2, %ymm2, %ymm3
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     1.7    1.7    0.0       vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.     3     4.7    0.0    0.0       vandnps	%ymm2, %ymm2, %ymm3
+
+# CHECK:      [3] Code Region - ZERO-IDIOM-4
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      106
+# CHECK-NEXT: Total uOps:        200
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.89
+# CHECK-NEXT: IPC:               1.89
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  1      1     1.00                        vandnps	%ymm2, %ymm2, %ymm3
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vandnps	%ymm2, %ymm2, %ymm3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     012345678
+
+# CHECK:      [0,0]     DeeeER  .   vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: [0,1]     D===eER .   vandnps	%ymm2, %ymm2, %ymm3
+# CHECK-NEXT: [1,0]     D=eeeER .   vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: [1,1]     D====eER.   vandnps	%ymm2, %ymm2, %ymm3
+# CHECK-NEXT: [2,0]     .D=eeeER.   vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: [2,1]     .D====eER   vandnps	%ymm2, %ymm2, %ymm3
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     1.7    1.7    0.0       vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.     3     4.7    0.0    0.0       vandnps	%ymm2, %ymm2, %ymm3
+
+# CHECK:      [4] Code Region - ZERO-IDIOM-5
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      403
+# CHECK-NEXT: Total uOps:        200
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     1.00                        vperm2f128	$136, %ymm0, %ymm0, %ymm1
+# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm1, %ymm1, %ymm0
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vperm2f128	$136, %ymm0, %ymm0, %ymm1
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddps	%ymm1, %ymm1, %ymm0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     01234
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeER .    .   .   vperm2f128	$136, %ymm0, %ymm0, %ymm1
+# CHECK-NEXT: [0,1]     D=eeeER   .   .   vaddps	%ymm1, %ymm1, %ymm0
+# CHECK-NEXT: [1,0]     D====eER  .   .   vperm2f128	$136, %ymm0, %ymm0, %ymm1
+# CHECK-NEXT: [1,1]     D=====eeeER   .   vaddps	%ymm1, %ymm1, %ymm0
+# CHECK-NEXT: [2,0]     .D=======eER  .   vperm2f128	$136, %ymm0, %ymm0, %ymm1
+# CHECK-NEXT: [2,1]     .D========eeeER   vaddps	%ymm1, %ymm1, %ymm0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     4.7    0.3    0.0       vperm2f128	$136, %ymm0, %ymm0, %ymm1
+# CHECK-NEXT: 1.     3     5.7    0.0    0.0       vaddps	%ymm1, %ymm1, %ymm0
diff --git a/test/tools/llvm-mca/X86/BdVer2/zero-idioms.s b/test/tools/llvm-mca/X86/BdVer2/zero-idioms.s
new file mode 100644
index 00000000000..034542e655b
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/zero-idioms.s
@@ -0,0 +1,427 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -timeline -register-file-stats -iterations=1 < %s | FileCheck %s
+
+subl  %eax, %eax
+subq  %rax, %rax
+xorl  %eax, %eax
+xorq  %rax, %rax
+
+pcmpgtb   %mm2, %mm2
+pcmpgtd   %mm2, %mm2
+# pcmpgtq   %mm2, %mm2 # invalid operand for instruction
+pcmpgtw   %mm2, %mm2
+
+pcmpgtb   %xmm2, %xmm2
+pcmpgtd   %xmm2, %xmm2
+pcmpgtq   %xmm2, %xmm2
+pcmpgtw   %xmm2, %xmm2
+
+vpcmpgtb  %xmm3, %xmm3, %xmm3
+vpcmpgtd  %xmm3, %xmm3, %xmm3
+vpcmpgtq  %xmm3, %xmm3, %xmm3
+vpcmpgtw  %xmm3, %xmm3, %xmm3
+
+vpcmpgtb  %xmm3, %xmm3, %xmm5
+vpcmpgtd  %xmm3, %xmm3, %xmm5
+vpcmpgtq  %xmm3, %xmm3, %xmm5
+vpcmpgtw  %xmm3, %xmm3, %xmm5
+
+psubb   %mm2, %mm2
+psubd   %mm2, %mm2
+psubq   %mm2, %mm2
+psubw   %mm2, %mm2
+psubb   %xmm2, %xmm2
+psubd   %xmm2, %xmm2
+psubq   %xmm2, %xmm2
+psubw   %xmm2, %xmm2
+vpsubb  %xmm3, %xmm3, %xmm3
+vpsubd  %xmm3, %xmm3, %xmm3
+vpsubq  %xmm3, %xmm3, %xmm3
+vpsubw  %xmm3, %xmm3, %xmm3
+
+vpsubb  %xmm3, %xmm3, %xmm5
+vpsubd  %xmm3, %xmm3, %xmm5
+vpsubq  %xmm3, %xmm3, %xmm5
+vpsubw  %xmm3, %xmm3, %xmm5
+
+psubsb   %mm2, %mm2
+psubsw   %mm2, %mm2
+psubsb   %xmm2, %xmm2
+psubsw   %xmm2, %xmm2
+vpsubsb  %xmm3, %xmm3, %xmm3
+vpsubsw  %xmm3, %xmm3, %xmm3
+
+vpsubsb  %xmm3, %xmm3, %xmm5
+vpsubsw  %xmm3, %xmm3, %xmm5
+
+psubusb   %mm2, %mm2
+psubusw   %mm2, %mm2
+psubusb   %xmm2, %xmm2
+psubusw   %xmm2, %xmm2
+vpsubusb  %xmm3, %xmm3, %xmm3
+vpsubusw  %xmm3, %xmm3, %xmm3
+
+vpsubsb  %xmm3, %xmm3, %xmm5
+vpsubsw  %xmm3, %xmm3, %xmm5
+
+andnps  %xmm0, %xmm0
+andnpd  %xmm1, %xmm1
+vandnps %xmm2, %xmm2, %xmm2
+vandnpd %xmm1, %xmm1, %xmm1
+pandn   %mm2, %mm2
+pandn   %xmm2, %xmm2
+vpandn  %xmm3, %xmm3, %xmm3
+
+vandnps %xmm2, %xmm2, %xmm5
+vandnpd %xmm1, %xmm1, %xmm5
+vpandn  %xmm3, %xmm3, %xmm5
+
+xorps  %xmm0, %xmm0
+xorpd  %xmm1, %xmm1
+vxorps %xmm2, %xmm2, %xmm2
+vxorpd %xmm1, %xmm1, %xmm1
+pxor   %mm2, %mm2
+pxor   %xmm2, %xmm2
+vpxor  %xmm3, %xmm3, %xmm3
+
+vxorps %xmm4, %xmm4, %xmm5
+vxorpd %xmm1, %xmm1, %xmm3
+vpxor  %xmm3, %xmm3, %xmm5
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      71
+# CHECK-NEXT: Total Cycles:      39
+# CHECK-NEXT: Total uOps:        71
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.82
+# CHECK-NEXT: IPC:               1.82
+# CHECK-NEXT: Block RThroughput: 17.8
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      0     0.25                        subl	%eax, %eax
+# CHECK-NEXT:  1      0     0.25                        subq	%rax, %rax
+# CHECK-NEXT:  1      0     0.25                        xorl	%eax, %eax
+# CHECK-NEXT:  1      0     0.25                        xorq	%rax, %rax
+# CHECK-NEXT:  1      3     1.00                        pcmpgtb	%mm2, %mm2
+# CHECK-NEXT:  1      3     1.00                        pcmpgtd	%mm2, %mm2
+# CHECK-NEXT:  1      3     1.00                        pcmpgtw	%mm2, %mm2
+# CHECK-NEXT:  1      0     0.25                        pcmpgtb	%xmm2, %xmm2
+# CHECK-NEXT:  1      0     0.25                        pcmpgtd	%xmm2, %xmm2
+# CHECK-NEXT:  1      0     0.25                        pcmpgtq	%xmm2, %xmm2
+# CHECK-NEXT:  1      0     0.25                        pcmpgtw	%xmm2, %xmm2
+# CHECK-NEXT:  1      0     0.25                        vpcmpgtb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      0     0.25                        vpcmpgtd	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      0     0.25                        vpcmpgtq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      0     0.25                        vpcmpgtw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      0     0.25                        vpcmpgtb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  1      0     0.25                        vpcmpgtd	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  1      0     0.25                        vpcmpgtq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  1      0     0.25                        vpcmpgtw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  1      3     1.00                        psubb	%mm2, %mm2
+# CHECK-NEXT:  1      3     1.00                        psubd	%mm2, %mm2
+# CHECK-NEXT:  1      3     1.00                        psubq	%mm2, %mm2
+# CHECK-NEXT:  1      3     1.00                        psubw	%mm2, %mm2
+# CHECK-NEXT:  1      0     0.25                        psubb	%xmm2, %xmm2
+# CHECK-NEXT:  1      0     0.25                        psubd	%xmm2, %xmm2
+# CHECK-NEXT:  1      0     0.25                        psubq	%xmm2, %xmm2
+# CHECK-NEXT:  1      0     0.25                        psubw	%xmm2, %xmm2
+# CHECK-NEXT:  1      0     0.25                        vpsubb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      0     0.25                        vpsubd	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      0     0.25                        vpsubq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      0     0.25                        vpsubw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      0     0.25                        vpsubb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  1      0     0.25                        vpsubd	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  1      0     0.25                        vpsubq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  1      0     0.25                        vpsubw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  1      3     1.00                        psubsb	%mm2, %mm2
+# CHECK-NEXT:  1      3     1.00                        psubsw	%mm2, %mm2
+# CHECK-NEXT:  1      1     0.50                        psubsb	%xmm2, %xmm2
+# CHECK-NEXT:  1      1     0.50                        psubsw	%xmm2, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpsubsb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      1     0.50                        vpsubsw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      1     0.50                        vpsubsb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  1      1     0.50                        vpsubsw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  1      3     1.00                        psubusb	%mm2, %mm2
+# CHECK-NEXT:  1      3     1.00                        psubusw	%mm2, %mm2
+# CHECK-NEXT:  1      1     0.50                        psubusb	%xmm2, %xmm2
+# CHECK-NEXT:  1      1     0.50                        psubusw	%xmm2, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vpsubusb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      1     0.50                        vpsubusw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      1     0.50                        vpsubsb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  1      1     0.50                        vpsubsw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  1      1     1.00                        andnps	%xmm0, %xmm0
+# CHECK-NEXT:  1      1     1.00                        andnpd	%xmm1, %xmm1
+# CHECK-NEXT:  1      1     1.00                        vandnps	%xmm2, %xmm2, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vandnpd	%xmm1, %xmm1, %xmm1
+# CHECK-NEXT:  1      1     0.33                        pandn	%mm2, %mm2
+# CHECK-NEXT:  1      1     0.33                        pandn	%xmm2, %xmm2
+# CHECK-NEXT:  1      1     0.33                        vpandn	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      1     1.00                        vandnps	%xmm2, %xmm2, %xmm5
+# CHECK-NEXT:  1      1     1.00                        vandnpd	%xmm1, %xmm1, %xmm5
+# CHECK-NEXT:  1      1     0.33                        vpandn	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  1      0     0.25                        xorps	%xmm0, %xmm0
+# CHECK-NEXT:  1      0     0.25                        xorpd	%xmm1, %xmm1
+# CHECK-NEXT:  1      0     0.25                        vxorps	%xmm2, %xmm2, %xmm2
+# CHECK-NEXT:  1      0     0.25                        vxorpd	%xmm1, %xmm1, %xmm1
+# CHECK-NEXT:  1      1     0.33                        pxor	%mm2, %mm2
+# CHECK-NEXT:  1      0     0.25                        pxor	%xmm2, %xmm2
+# CHECK-NEXT:  1      0     0.25                        vpxor	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      0     0.25                        vxorps	%xmm4, %xmm4, %xmm5
+# CHECK-NEXT:  1      0     0.25                        vxorpd	%xmm1, %xmm1, %xmm3
+# CHECK-NEXT:  1      0     0.25                        vpxor	%xmm3, %xmm3, %xmm5
+
+# CHECK:      Register File statistics:
+# CHECK-NEXT: Total number of mappings created:    75
+# CHECK-NEXT: Max number of mappings used:         51
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SBDivider
+# CHECK-NEXT: [1]   - SBFPDivider
+# CHECK-NEXT: [2]   - SBPort0
+# CHECK-NEXT: [3]   - SBPort1
+# CHECK-NEXT: [4]   - SBPort4
+# CHECK-NEXT: [5]   - SBPort5
+# CHECK-NEXT: [6.0] - SBPort23
+# CHECK-NEXT: [6.1] - SBPort23
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
+# CHECK-NEXT:  -      -     5.00   16.00   -     13.00   -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     subl	%eax, %eax
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     subq	%rax, %rax
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     xorl	%eax, %eax
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     xorq	%rax, %rax
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pcmpgtb	%mm2, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pcmpgtd	%mm2, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pcmpgtw	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     pcmpgtb	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     pcmpgtd	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     pcmpgtq	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     pcmpgtw	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpcmpgtb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpcmpgtd	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpcmpgtq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpcmpgtw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpcmpgtb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpcmpgtd	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpcmpgtq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpcmpgtw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     psubb	%mm2, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     psubd	%mm2, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     psubq	%mm2, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     psubw	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     psubb	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     psubd	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     psubq	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     psubw	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpsubb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpsubd	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpsubq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpsubw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpsubb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpsubd	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpsubq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpsubw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     psubsb	%mm2, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     psubsw	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     psubsb	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     psubsw	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vpsubsb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpsubsw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vpsubsb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpsubsw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     psubusb	%mm2, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     psubusw	%mm2, %mm2
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     psubusb	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     psubusw	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpsubusb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vpsubusw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpsubsb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vpsubsw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     andnps	%xmm0, %xmm0
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     andnpd	%xmm1, %xmm1
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vandnps	%xmm2, %xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vandnpd	%xmm1, %xmm1, %xmm1
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pandn	%mm2, %mm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pandn	%xmm2, %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpandn	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vandnps	%xmm2, %xmm2, %xmm5
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vandnpd	%xmm1, %xmm1, %xmm5
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpandn	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     xorps	%xmm0, %xmm0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     xorpd	%xmm1, %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     vxorps	%xmm2, %xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     vxorpd	%xmm1, %xmm1, %xmm1
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pxor	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     pxor	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpxor	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     vxorps	%xmm4, %xmm4, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     vxorpd	%xmm1, %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpxor	%xmm3, %xmm3, %xmm5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DR   .    .    .    .    .    .    .  .   subl	%eax, %eax
+# CHECK-NEXT: [0,1]     DR   .    .    .    .    .    .    .  .   subq	%rax, %rax
+# CHECK-NEXT: [0,2]     DR   .    .    .    .    .    .    .  .   xorl	%eax, %eax
+# CHECK-NEXT: [0,3]     DR   .    .    .    .    .    .    .  .   xorq	%rax, %rax
+# CHECK-NEXT: [0,4]     .DeeeER   .    .    .    .    .    .  .   pcmpgtb	%mm2, %mm2
+# CHECK-NEXT: [0,5]     .D===eeeER.    .    .    .    .    .  .   pcmpgtd	%mm2, %mm2
+# CHECK-NEXT: [0,6]     .D======eeeER  .    .    .    .    .  .   pcmpgtw	%mm2, %mm2
+# CHECK-NEXT: [0,7]     .D----------R  .    .    .    .    .  .   pcmpgtb	%xmm2, %xmm2
+# CHECK-NEXT: [0,8]     . D---------R  .    .    .    .    .  .   pcmpgtd	%xmm2, %xmm2
+# CHECK-NEXT: [0,9]     . D---------R  .    .    .    .    .  .   pcmpgtq	%xmm2, %xmm2
+# CHECK-NEXT: [0,10]    . D---------R  .    .    .    .    .  .   pcmpgtw	%xmm2, %xmm2
+# CHECK-NEXT: [0,11]    . D---------R  .    .    .    .    .  .   vpcmpgtb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,12]    .  D--------R  .    .    .    .    .  .   vpcmpgtd	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,13]    .  D--------R  .    .    .    .    .  .   vpcmpgtq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,14]    .  D--------R  .    .    .    .    .  .   vpcmpgtw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,15]    .  D--------R  .    .    .    .    .  .   vpcmpgtb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,16]    .   D-------R  .    .    .    .    .  .   vpcmpgtd	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,17]    .   D-------R  .    .    .    .    .  .   vpcmpgtq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,18]    .   D-------R  .    .    .    .    .  .   vpcmpgtw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,19]    .   D======eeeER    .    .    .    .  .   psubb	%mm2, %mm2
+# CHECK-NEXT: [0,20]    .    D========eeeER .    .    .    .  .   psubd	%mm2, %mm2
+# CHECK-NEXT: [0,21]    .    D===========eeeER   .    .    .  .   psubq	%mm2, %mm2
+# CHECK-NEXT: [0,22]    .    D==============eeeER.    .    .  .   psubw	%mm2, %mm2
+# CHECK-NEXT: [0,23]    .    D------------------R.    .    .  .   psubb	%xmm2, %xmm2
+# CHECK-NEXT: [0,24]    .    .D-----------------R.    .    .  .   psubd	%xmm2, %xmm2
+# CHECK-NEXT: [0,25]    .    .D-----------------R.    .    .  .   psubq	%xmm2, %xmm2
+# CHECK-NEXT: [0,26]    .    .D-----------------R.    .    .  .   psubw	%xmm2, %xmm2
+# CHECK-NEXT: [0,27]    .    .D-----------------R.    .    .  .   vpsubb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,28]    .    . D----------------R.    .    .  .   vpsubd	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,29]    .    . D----------------R.    .    .  .   vpsubq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,30]    .    . D----------------R.    .    .  .   vpsubw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,31]    .    . D----------------R.    .    .  .   vpsubb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,32]    .    .  D---------------R.    .    .  .   vpsubd	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,33]    .    .  D---------------R.    .    .  .   vpsubq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,34]    .    .  D---------------R.    .    .  .   vpsubw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,35]    .    .  D==============eeeER  .    .  .   psubsb	%mm2, %mm2
+# CHECK-NEXT: [0,36]    .    .   D================eeeER    .  .   psubsw	%mm2, %mm2
+# CHECK-NEXT: [0,37]    .    .   DeE------------------R    .  .   psubsb	%xmm2, %xmm2
+# CHECK-NEXT: [0,38]    .    .   D==eE----------------R    .  .   psubsw	%xmm2, %xmm2
+# CHECK-NEXT: [0,39]    .    .   DeE------------------R    .  .   vpsubsb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,40]    .    .    DeE-----------------R    .  .   vpsubsw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,41]    .    .    D=eE----------------R    .  .   vpsubsb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,42]    .    .    D==eE---------------R    .  .   vpsubsw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,43]    .    .    D==================eeeER .  .   psubusb	%mm2, %mm2
+# CHECK-NEXT: [0,44]    .    .    .D====================eeeER .   psubusw	%mm2, %mm2
+# CHECK-NEXT: [0,45]    .    .    .D=eE---------------------R .   psubusb	%xmm2, %xmm2
+# CHECK-NEXT: [0,46]    .    .    .D==eE--------------------R .   psubusw	%xmm2, %xmm2
+# CHECK-NEXT: [0,47]    .    .    .D===eE-------------------R .   vpsubusb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,48]    .    .    . D===eE------------------R .   vpsubusw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,49]    .    .    . D====eE-----------------R .   vpsubsb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,50]    .    .    . D=====eE----------------R .   vpsubsw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,51]    .    .    . D===eE------------------R .   andnps	%xmm0, %xmm0
+# CHECK-NEXT: [0,52]    .    .    .  D====eE----------------R .   andnpd	%xmm1, %xmm1
+# CHECK-NEXT: [0,53]    .    .    .  D======eE--------------R .   vandnps	%xmm2, %xmm2, %xmm2
+# CHECK-NEXT: [0,54]    .    .    .  D=====eE---------------R .   vandnpd	%xmm1, %xmm1, %xmm1
+# CHECK-NEXT: [0,55]    .    .    .  D=====================eER.   pandn	%mm2, %mm2
+# CHECK-NEXT: [0,56]    .    .    .   D======eE--------------R.   pandn	%xmm2, %xmm2
+# CHECK-NEXT: [0,57]    .    .    .   D==eE------------------R.   vpandn	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,58]    .    .    .   D=======eE-------------R.   vandnps	%xmm2, %xmm2, %xmm5
+# CHECK-NEXT: [0,59]    .    .    .   D======eE--------------R.   vandnpd	%xmm1, %xmm1, %xmm5
+# CHECK-NEXT: [0,60]    .    .    .    D==eE-----------------R.   vpandn	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,61]    .    .    .    D=E-------------------R.   xorps	%xmm0, %xmm0
+# CHECK-NEXT: [0,62]    .    .    .    D====E----------------R.   xorpd	%xmm1, %xmm1
+# CHECK-NEXT: [0,63]    .    .    .    D======E--------------R.   vxorps	%xmm2, %xmm2, %xmm2
+# CHECK-NEXT: [0,64]    .    .    .    .D===E----------------R.   vxorpd	%xmm1, %xmm1, %xmm1
+# CHECK-NEXT: [0,65]    .    .    .    .D===================eER   pxor	%mm2, %mm2
+# CHECK-NEXT: [0,66]    .    .    .    .D=====E---------------R   pxor	%xmm2, %xmm2
+# CHECK-NEXT: [0,67]    .    .    .    .D=E-------------------R   vpxor	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,68]    .    .    .    . D--------------------R   vxorps	%xmm4, %xmm4, %xmm5
+# CHECK-NEXT: [0,69]    .    .    .    . D==E-----------------R   vxorpd	%xmm1, %xmm1, %xmm3
+# CHECK-NEXT: [0,70]    .    .    .    . D==E-----------------R   vpxor	%xmm3, %xmm3, %xmm5
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       subl	%eax, %eax
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       subq	%rax, %rax
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       xorl	%eax, %eax
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       xorq	%rax, %rax
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       pcmpgtb	%mm2, %mm2
+# CHECK-NEXT: 5.     1     4.0    0.0    0.0       pcmpgtd	%mm2, %mm2
+# CHECK-NEXT: 6.     1     7.0    0.0    0.0       pcmpgtw	%mm2, %mm2
+# CHECK-NEXT: 7.     1     0.0    0.0    10.0      pcmpgtb	%xmm2, %xmm2
+# CHECK-NEXT: 8.     1     0.0    0.0    9.0       pcmpgtd	%xmm2, %xmm2
+# CHECK-NEXT: 9.     1     0.0    0.0    9.0       pcmpgtq	%xmm2, %xmm2
+# CHECK-NEXT: 10.    1     0.0    0.0    9.0       pcmpgtw	%xmm2, %xmm2
+# CHECK-NEXT: 11.    1     0.0    0.0    9.0       vpcmpgtb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 12.    1     0.0    0.0    8.0       vpcmpgtd	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 13.    1     0.0    0.0    8.0       vpcmpgtq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 14.    1     0.0    0.0    8.0       vpcmpgtw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 15.    1     0.0    0.0    8.0       vpcmpgtb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 16.    1     0.0    0.0    7.0       vpcmpgtd	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 17.    1     0.0    0.0    7.0       vpcmpgtq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 18.    1     0.0    0.0    7.0       vpcmpgtw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 19.    1     7.0    0.0    0.0       psubb	%mm2, %mm2
+# CHECK-NEXT: 20.    1     9.0    0.0    0.0       psubd	%mm2, %mm2
+# CHECK-NEXT: 21.    1     12.0   0.0    0.0       psubq	%mm2, %mm2
+# CHECK-NEXT: 22.    1     15.0   0.0    0.0       psubw	%mm2, %mm2
+# CHECK-NEXT: 23.    1     0.0    0.0    18.0      psubb	%xmm2, %xmm2
+# CHECK-NEXT: 24.    1     0.0    0.0    17.0      psubd	%xmm2, %xmm2
+# CHECK-NEXT: 25.    1     0.0    0.0    17.0      psubq	%xmm2, %xmm2
+# CHECK-NEXT: 26.    1     0.0    0.0    17.0      psubw	%xmm2, %xmm2
+# CHECK-NEXT: 27.    1     0.0    0.0    17.0      vpsubb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 28.    1     0.0    0.0    16.0      vpsubd	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 29.    1     0.0    0.0    16.0      vpsubq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 30.    1     0.0    0.0    16.0      vpsubw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 31.    1     0.0    0.0    16.0      vpsubb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 32.    1     0.0    0.0    15.0      vpsubd	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 33.    1     0.0    0.0    15.0      vpsubq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 34.    1     0.0    0.0    15.0      vpsubw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 35.    1     15.0   0.0    0.0       psubsb	%mm2, %mm2
+# CHECK-NEXT: 36.    1     17.0   0.0    0.0       psubsw	%mm2, %mm2
+# CHECK-NEXT: 37.    1     1.0    1.0    18.0      psubsb	%xmm2, %xmm2
+# CHECK-NEXT: 38.    1     3.0    1.0    16.0      psubsw	%xmm2, %xmm2
+# CHECK-NEXT: 39.    1     1.0    1.0    18.0      vpsubsb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 40.    1     1.0    0.0    17.0      vpsubsw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 41.    1     2.0    0.0    16.0      vpsubsb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 42.    1     3.0    1.0    15.0      vpsubsw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 43.    1     19.0   0.0    0.0       psubusb	%mm2, %mm2
+# CHECK-NEXT: 44.    1     21.0   0.0    0.0       psubusw	%mm2, %mm2
+# CHECK-NEXT: 45.    1     2.0    0.0    21.0      psubusb	%xmm2, %xmm2
+# CHECK-NEXT: 46.    1     3.0    0.0    20.0      psubusw	%xmm2, %xmm2
+# CHECK-NEXT: 47.    1     4.0    3.0    19.0      vpsubusb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 48.    1     4.0    0.0    18.0      vpsubusw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 49.    1     5.0    0.0    17.0      vpsubsb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 50.    1     6.0    1.0    16.0      vpsubsw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 51.    1     4.0    4.0    18.0      andnps	%xmm0, %xmm0
+# CHECK-NEXT: 52.    1     5.0    5.0    16.0      andnpd	%xmm1, %xmm1
+# CHECK-NEXT: 53.    1     7.0    5.0    14.0      vandnps	%xmm2, %xmm2, %xmm2
+# CHECK-NEXT: 54.    1     6.0    0.0    15.0      vandnpd	%xmm1, %xmm1, %xmm1
+# CHECK-NEXT: 55.    1     22.0   0.0    0.0       pandn	%mm2, %mm2
+# CHECK-NEXT: 56.    1     7.0    0.0    14.0      pandn	%xmm2, %xmm2
+# CHECK-NEXT: 57.    1     3.0    0.0    18.0      vpandn	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 58.    1     8.0    0.0    13.0      vandnps	%xmm2, %xmm2, %xmm5
+# CHECK-NEXT: 59.    1     7.0    1.0    14.0      vandnpd	%xmm1, %xmm1, %xmm5
+# CHECK-NEXT: 60.    1     3.0    0.0    17.0      vpandn	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 61.    1     2.0    0.0    19.0      xorps	%xmm0, %xmm0
+# CHECK-NEXT: 62.    1     5.0    0.0    16.0      xorpd	%xmm1, %xmm1
+# CHECK-NEXT: 63.    1     7.0    0.0    14.0      vxorps	%xmm2, %xmm2, %xmm2
+# CHECK-NEXT: 64.    1     4.0    0.0    16.0      vxorpd	%xmm1, %xmm1, %xmm1
+# CHECK-NEXT: 65.    1     20.0   0.0    0.0       pxor	%mm2, %mm2
+# CHECK-NEXT: 66.    1     6.0    0.0    15.0      pxor	%xmm2, %xmm2
+# CHECK-NEXT: 67.    1     2.0    0.0    19.0      vpxor	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 68.    1     0.0    0.0    20.0      vxorps	%xmm4, %xmm4, %xmm5
+# CHECK-NEXT: 69.    1     3.0    0.0    17.0      vxorpd	%xmm1, %xmm1, %xmm3
+# CHECK-NEXT: 70.    1     3.0    0.0    17.0      vpxor	%xmm3, %xmm3, %xmm5
diff --git a/test/tools/llvm-mca/X86/bextr-read-after-ld.s b/test/tools/llvm-mca/X86/bextr-read-after-ld.s
index 4e4e23231b9..c356fe7976b 100644
--- a/test/tools/llvm-mca/X86/bextr-read-after-ld.s
+++ b/test/tools/llvm-mca/X86/bextr-read-after-ld.s
@@ -2,6 +2,7 @@
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=HASWELL
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=broadwell -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=BDWELL
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=SKYLAKE
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=BDVER2
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=BTVER2
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=znver1 -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=ZNVER1
 
@@ -11,6 +12,9 @@ bextrl	%esi, (%rdi), %eax
 # ALL:          Iterations:        1
 # ALL-NEXT:     Instructions:      2
 
+# BDVER2-NEXT:  Total Cycles:      10
+# BDVER2-NEXT:  Total uOps:        4
+
 # BDWELL-NEXT:  Total Cycles:      10
 # BDWELL-NEXT:  Total uOps:        4
 
@@ -26,6 +30,11 @@ bextrl	%esi, (%rdi), %eax
 # ZNVER1-NEXT:  Total Cycles:      8
 # ZNVER1-NEXT:  Total uOps:        3
 
+# BDVER2:       Dispatch Width:    4
+# BDVER2-NEXT:  uOps Per Cycle:    0.40
+# BDVER2-NEXT:  IPC:               0.20
+# BDVER2-NEXT:  Block RThroughput: 1.0
+
 # BDWELL:       Dispatch Width:    4
 # BDWELL-NEXT:  uOps Per Cycle:    0.40
 # BDWELL-NEXT:  IPC:               0.20
@@ -61,6 +70,9 @@ bextrl	%esi, (%rdi), %eax
 
 # ALL:          [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 
+# BDVER2-NEXT:   1      1     0.33                        addl	%edi, %esi
+# BDVER2-NEXT:   3      7     1.00    *                   bextrl	%esi, (%rdi), %eax
+
 # BDWELL-NEXT:   1      1     0.25                        addl	%edi, %esi
 # BDWELL-NEXT:   3      7     0.50    *                   bextrl	%esi, (%rdi), %eax
 
@@ -78,12 +90,16 @@ bextrl	%esi, (%rdi), %eax
 
 # ALL:          Timeline view:
 
+# BDVER2-NEXT:  Index     0123456789
 # BDWELL-NEXT:  Index     0123456789
 # BTVER2-NEXT:  Index     0123456
 # HASWELL-NEXT: Index     0123456789
 # SKYLAKE-NEXT: Index     0123456789
 # ZNVER1-NEXT:  Index     01234567
 
+# BDVER2:       [0,0]     DeER .   .   addl	%edi, %esi
+# BDVER2-NEXT:  [0,1]     DeeeeeeeER   bextrl	%esi, (%rdi), %eax
+
 # BDWELL:       [0,0]     DeER .   .   addl	%edi, %esi
 # BDWELL-NEXT:  [0,1]     DeeeeeeeER   bextrl	%esi, (%rdi), %eax
 
diff --git a/test/tools/llvm-mca/X86/cpus.s b/test/tools/llvm-mca/X86/cpus.s
index 47e1e83c543..e666307d1a0 100644
--- a/test/tools/llvm-mca/X86/cpus.s
+++ b/test/tools/llvm-mca/X86/cpus.s
@@ -1,4 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -resource-pressure=false -instruction-info=false < %s | FileCheck --check-prefix=ALL --check-prefix=BDVER2 %s
 # RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 -resource-pressure=false -instruction-info=false < %s | FileCheck --check-prefix=ALL --check-prefix=BTVER2 %s
 # RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=znver1 -resource-pressure=false -instruction-info=false < %s | FileCheck --check-prefix=ALL --check-prefix=ZNVER1 %s
 # RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge -resource-pressure=false -instruction-info=false < %s | FileCheck --check-prefix=ALL --check-prefix=SANDYBRIDGE %s
@@ -17,6 +18,11 @@ add %edi, %eax
 # ALL-NEXT:         Total Cycles:      103
 # ALL-NEXT:         Total uOps:        100
 
+# BDVER2:           Dispatch Width:    4
+# BDVER2-NEXT:      uOps Per Cycle:    0.97
+# BDVER2-NEXT:      IPC:               0.97
+# BDVER2-NEXT:      Block RThroughput: 0.3
+
 # BROADWELL:        Dispatch Width:    4
 # BROADWELL-NEXT:   uOps Per Cycle:    0.97
 # BROADWELL-NEXT:   IPC:               0.97
diff --git a/test/tools/llvm-mca/X86/read-after-ld-1.s b/test/tools/llvm-mca/X86/read-after-ld-1.s
index 1478eba77de..6c68ad13116 100644
--- a/test/tools/llvm-mca/X86/read-after-ld-1.s
+++ b/test/tools/llvm-mca/X86/read-after-ld-1.s
@@ -3,6 +3,7 @@
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1 -resource-pressure=false -instruction-info=false -timeline < %s | FileCheck %s -check-prefix=ALL -check-prefix=HASWELL
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=broadwell -iterations=1 -resource-pressure=false -instruction-info=false -timeline < %s | FileCheck %s -check-prefix=ALL -check-prefix=BDWELL
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake -iterations=1 -resource-pressure=false -instruction-info=false -timeline < %s | FileCheck %s -check-prefix=ALL -check-prefix=SKYLAKE
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1 -resource-pressure=false -instruction-info=false -timeline < %s | FileCheck %s -check-prefix=ALL -check-prefix=BDVER2
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1 -resource-pressure=false -instruction-info=false -timeline < %s | FileCheck %s -check-prefix=ALL -check-prefix=BTVER2
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=znver1 -iterations=1 -resource-pressure=false -instruction-info=false -timeline < %s | FileCheck %s -check-prefix=ALL -check-prefix=ZNVER1
 
@@ -12,6 +13,9 @@ vaddps  (%rax), %xmm1, %xmm1
 # ALL:          Iterations:        1
 # ALL-NEXT:     Instructions:      2
 
+# BDVER2-NEXT:  Total Cycles:      20
+# BDVER2-NEXT:  Total uOps:        3
+
 # BDWELL-NEXT:  Total Cycles:      17
 # BDWELL-NEXT:  Total uOps:        3
 
@@ -30,6 +34,11 @@ vaddps  (%rax), %xmm1, %xmm1
 # ZNVER1-NEXT:  Total Cycles:      20
 # ZNVER1-NEXT:  Total uOps:        2
 
+# BDVER2:       Dispatch Width:    4
+# BDVER2-NEXT:  uOps Per Cycle:    0.15
+# BDVER2-NEXT:  IPC:               0.10
+# BDVER2-NEXT:  Block RThroughput: 14.0
+
 # BDWELL:       Dispatch Width:    4
 # BDWELL-NEXT:  uOps Per Cycle:    0.18
 # BDWELL-NEXT:  IPC:               0.12
@@ -62,6 +71,9 @@ vaddps  (%rax), %xmm1, %xmm1
 
 # ALL:          Timeline view:
 
+# BDVER2-NEXT:                      0123456789
+# BDVER2-NEXT:  Index     0123456789
+
 # BDWELL-NEXT:                      0123456
 # BDWELL-NEXT:  Index     0123456789
 
@@ -80,6 +92,9 @@ vaddps  (%rax), %xmm1, %xmm1
 # ZNVER1-NEXT:                      0123456789
 # ZNVER1-NEXT:  Index     0123456789
 
+# BDVER2:       [0,0]     DeeeeeeeeeeeeeeER  .   vdivps	%xmm0, %xmm1, %xmm1
+# BDVER2-NEXT:  [0,1]     D========eeeeeeeeeER   vaddps	(%rax), %xmm1, %xmm1
+
 # BDWELL:       [0,0]     DeeeeeeeeeeeER ..   vdivps	%xmm0, %xmm1, %xmm1
 # BDWELL-NEXT:  [0,1]     D======eeeeeeeeER   vaddps	(%rax), %xmm1, %xmm1
 
@@ -107,6 +122,7 @@ vaddps  (%rax), %xmm1, %xmm1
 # ALL:                [0]    [1]    [2]    [3]
 # ALL-NEXT:     0.     1     1.0    1.0    0.0       vdivps	%xmm0, %xmm1, %xmm1
 
+# BDVER2-NEXT:  1.     1     9.0    0.0    0.0       vaddps	(%rax), %xmm1, %xmm1
 # BDWELL-NEXT:  1.     1     7.0    0.0    0.0       vaddps	(%rax), %xmm1, %xmm1
 # BTVER2-NEXT:  1.     1     15.0   0.0    0.0       vaddps	(%rax), %xmm1, %xmm1
 # HASWELL-NEXT: 1.     1     8.0    0.0    0.0       vaddps	(%rax), %xmm1, %xmm1
diff --git a/test/tools/llvm-mca/X86/register-file-statistics.s b/test/tools/llvm-mca/X86/register-file-statistics.s
index e605ea94f4a..914eeaa82dd 100644
--- a/test/tools/llvm-mca/X86/register-file-statistics.s
+++ b/test/tools/llvm-mca/X86/register-file-statistics.s
@@ -1,4 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1 -all-stats=false -all-views=false -register-file-stats < %s | FileCheck --check-prefixes=ALL %s
 # RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1 -all-stats=false -all-views=false -register-file-stats < %s | FileCheck --check-prefixes=ALL,BTVER2 %s
 # RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=znver1 -iterations=1 -all-stats=false -all-views=false -register-file-stats < %s | FileCheck --check-prefixes=ALL,ZNVER1 %s
 # RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge -iterations=1 -all-stats=false -all-views=false -register-file-stats < %s | FileCheck --check-prefixes=ALL %s
diff --git a/test/tools/llvm-mca/X86/scheduler-queue-usage.s b/test/tools/llvm-mca/X86/scheduler-queue-usage.s
index e22f4a51887..d99a76bf833 100644
--- a/test/tools/llvm-mca/X86/scheduler-queue-usage.s
+++ b/test/tools/llvm-mca/X86/scheduler-queue-usage.s
@@ -1,4 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1 -all-stats=false -all-views=false -scheduler-stats < %s | FileCheck --check-prefixes=ALL,BDVER2 %s
 # RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1 -all-stats=false -all-views=false -scheduler-stats < %s | FileCheck --check-prefixes=ALL,BTVER2 %s
 # RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=znver1 -iterations=1 -all-stats=false -all-views=false -scheduler-stats < %s | FileCheck --check-prefixes=ALL,ZNVER1 %s
 # RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge -iterations=1 -all-stats=false -all-views=false -scheduler-stats < %s | FileCheck --check-prefixes=ALL,SNB %s
@@ -17,6 +18,12 @@ xor %eax, %ebx
 # ALL-NEXT:         0,          3  (75.0%)
 # ALL-NEXT:         1,          1  (25.0%)
 
+# BDVER2:          Scheduler's queue usage:
+# BDVER2-NEXT:     [1] Resource name.
+# BDVER2-NEXT:     [2] Average number of used buffer entries.
+# BDVER2-NEXT:     [3] Maximum number of used buffer entries.
+# BDVER2-NEXT:     [4] Total number of buffer entries.
+
 # BDW:             Scheduler's queue usage:
 # BDW-NEXT:        [1] Resource name.
 # BDW-NEXT:        [2] Average number of used buffer entries.
@@ -74,6 +81,9 @@ xor %eax, %ebx
 # ZNVER1-NEXT:     [3] Maximum number of used buffer entries.
 # ZNVER1-NEXT:     [4] Total number of buffer entries.
 
+# BDVER2:           [1]            [2]        [3]        [4]
+# BDVER2-NEXT:     SBPortAny        0          1          54
+
 # BDW:              [1]            [2]        [3]        [4]
 # BDW-NEXT:        BWPortAny        0          1          60
 
diff --git a/test/tools/llvm-mca/X86/sqrt-rsqrt-rcp-memop.s b/test/tools/llvm-mca/X86/sqrt-rsqrt-rcp-memop.s
index 66b87e72df2..e4531c990f0 100644
--- a/test/tools/llvm-mca/X86/sqrt-rsqrt-rcp-memop.s
+++ b/test/tools/llvm-mca/X86/sqrt-rsqrt-rcp-memop.s
@@ -1,4 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# ZZZ: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1 -all-views=false -timeline < %s | FileCheck %s -check-prefix=ALL -check-prefix=BDVER2
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1 -all-views=false -timeline < %s | FileCheck %s -check-prefix=ALL -check-prefix=BTVER2
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=znver1 -iterations=1 -all-views=false -timeline < %s | FileCheck %s -check-prefix=ALL -check-prefix=ZNVER1
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1 -all-views=false -timeline < %s | FileCheck %s -check-prefix=ALL -check-prefix=HASWELL
diff --git a/test/tools/llvm-mca/X86/variable-blend-read-after-ld-1.s b/test/tools/llvm-mca/X86/variable-blend-read-after-ld-1.s
index 3a2f4d260f2..c2e28922e3a 100644
--- a/test/tools/llvm-mca/X86/variable-blend-read-after-ld-1.s
+++ b/test/tools/llvm-mca/X86/variable-blend-read-after-ld-1.s
@@ -9,6 +9,8 @@
 
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake -iterations=1 -timeline -instruction-info=false -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=SKYLAKE
 
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1 -timeline -instruction-info=false -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=BDVER2
+
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1 -timeline -instruction-info=false -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=BTVER2
 
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=znver1 -iterations=1 -timeline -instruction-info=false -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=ZNVER1
@@ -19,6 +21,9 @@ vblendvps %xmm1, (%rdi), %xmm2, %xmm3
 # ALL:          Iterations:        1
 # ALL-NEXT:     Instructions:      2
 
+# BDVER2-NEXT:  Total Cycles:      11
+# BDVER2-NEXT:  Total uOps:        4
+
 # BDWELL-NEXT:  Total Cycles:      10
 # BDWELL-NEXT:  Total uOps:        4
 
@@ -40,6 +45,11 @@ vblendvps %xmm1, (%rdi), %xmm2, %xmm3
 # ZNVER1-NEXT:  Total Cycles:      11
 # ZNVER1-NEXT:  Total uOps:        2
 
+# BDVER2:       Dispatch Width:    4
+# BDVER2-NEXT:  uOps Per Cycle:    0.36
+# BDVER2-NEXT:  IPC:               0.18
+# BDVER2-NEXT:  Block RThroughput: 1.0
+
 # BDWELL:       Dispatch Width:    4
 # BDWELL-NEXT:  uOps Per Cycle:    0.40
 # BDWELL-NEXT:  IPC:               0.20
@@ -75,6 +85,10 @@ vblendvps %xmm1, (%rdi), %xmm2, %xmm3
 # ZNVER1-NEXT:  IPC:               0.18
 # ZNVER1-NEXT:  Block RThroughput: 1.0
 
+# BDVER2:       Timeline view:
+# BDVER2-NEXT:                      0
+# BDVER2-NEXT:  Index     0123456789
+
 # BDWELL:       Timeline view:
 # BDWELL-NEXT:  Index     0123456789
 
@@ -102,6 +116,9 @@ vblendvps %xmm1, (%rdi), %xmm2, %xmm3
 # ZNVER1-NEXT:                      0
 # ZNVER1-NEXT:  Index     0123456789
 
+# BDVER2:       [0,0]     DeeeER    .   vaddps	%xmm0, %xmm0, %xmm1
+# BDVER2-NEXT:  [0,1]     DeeeeeeeeER   vblendvps	%xmm1, (%rdi), %xmm2, %xmm3
+
 # BDWELL:       [0,0]     DeeeER   .   vaddps	%xmm0, %xmm0, %xmm1
 # BDWELL-NEXT:  [0,1]     DeeeeeeeER   vblendvps	%xmm1, (%rdi), %xmm2, %xmm3
 
@@ -132,6 +149,7 @@ vblendvps %xmm1, (%rdi), %xmm2, %xmm3
 # ALL:                [0]    [1]    [2]    [3]
 # ALL-NEXT:     0.     1     1.0    1.0    0.0       vaddps	%xmm0, %xmm0, %xmm1
 
+# BDVER2-NEXT:  1.     1     1.0    0.0    0.0       vblendvps	%xmm1, (%rdi), %xmm2, %xmm3
 # BDWELL-NEXT:  1.     1     1.0    0.0    0.0       vblendvps	%xmm1, (%rdi), %xmm2, %xmm3
 # BTVER2-NEXT:  1.     1     1.0    1.0    0.0       vblendvps	%xmm1, (%rdi), %xmm2, %xmm3
 # HASWELL-NEXT: 1.     1     1.0    0.0    0.0       vblendvps	%xmm1, (%rdi), %xmm2, %xmm3
diff --git a/test/tools/llvm-mca/X86/variable-blend-read-after-ld-2.s b/test/tools/llvm-mca/X86/variable-blend-read-after-ld-2.s
index fd581e0debf..0aa71425e94 100644
--- a/test/tools/llvm-mca/X86/variable-blend-read-after-ld-2.s
+++ b/test/tools/llvm-mca/X86/variable-blend-read-after-ld-2.s
@@ -9,6 +9,8 @@
 
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake -iterations=1 -timeline -instruction-info=false -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=SKYLAKE
 
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1 -timeline -instruction-info=false -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=BDVER2
+
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1 -timeline -instruction-info=false -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=BTVER2
 
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=znver1 -iterations=1 -timeline -instruction-info=false -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=ZNVER1
@@ -19,6 +21,9 @@ vblendvps %xmm1, (%rdi), %xmm2, %xmm3
 # ALL:          Iterations:        1
 # ALL-NEXT:     Instructions:      2
 
+# BDVER2-NEXT:  Total Cycles:      11
+# BDVER2-NEXT:  Total uOps:        4
+
 # BDWELL-NEXT:  Total Cycles:      10
 # BDWELL-NEXT:  Total uOps:        4
 
@@ -40,6 +45,11 @@ vblendvps %xmm1, (%rdi), %xmm2, %xmm3
 # ZNVER1-NEXT:  Total Cycles:      11
 # ZNVER1-NEXT:  Total uOps:        2
 
+# BDVER2:       Dispatch Width:    4
+# BDVER2-NEXT:  uOps Per Cycle:    0.36
+# BDVER2-NEXT:  IPC:               0.18
+# BDVER2-NEXT:  Block RThroughput: 1.0
+
 # BDWELL:       Dispatch Width:    4
 # BDWELL-NEXT:  uOps Per Cycle:    0.40
 # BDWELL-NEXT:  IPC:               0.20
@@ -75,6 +85,10 @@ vblendvps %xmm1, (%rdi), %xmm2, %xmm3
 # ZNVER1-NEXT:  IPC:               0.18
 # ZNVER1-NEXT:  Block RThroughput: 1.0
 
+# BDVER2:       Timeline view:
+# BDVER2-NEXT:                      0
+# BDVER2-NEXT:  Index     0123456789
+
 # BDWELL:       Timeline view:
 # BDWELL-NEXT:  Index     0123456789
 
@@ -102,6 +116,9 @@ vblendvps %xmm1, (%rdi), %xmm2, %xmm3
 # ZNVER1-NEXT:                      0
 # ZNVER1-NEXT:  Index     0123456789
 
+# BDVER2:       [0,0]     DeeeER    .   vaddps	%xmm0, %xmm0, %xmm2
+# BDVER2-NEXT:  [0,1]     DeeeeeeeeER   vblendvps	%xmm1, (%rdi), %xmm2, %xmm3
+
 # BDWELL:       [0,0]     DeeeER   .   vaddps	%xmm0, %xmm0, %xmm2
 # BDWELL-NEXT:  [0,1]     DeeeeeeeER   vblendvps	%xmm1, (%rdi), %xmm2, %xmm3
 
@@ -132,6 +149,7 @@ vblendvps %xmm1, (%rdi), %xmm2, %xmm3
 # ALL:                [0]    [1]    [2]    [3]
 # ALL-NEXT:     0.     1     1.0    1.0    0.0       vaddps	%xmm0, %xmm0, %xmm2
 
+# BDVER2-NEXT:  1.     1     1.0    0.0    0.0       vblendvps	%xmm1, (%rdi), %xmm2, %xmm3
 # BDWELL-NEXT:  1.     1     1.0    0.0    0.0       vblendvps	%xmm1, (%rdi), %xmm2, %xmm3
 # BTVER2-NEXT:  1.     1     1.0    1.0    0.0       vblendvps	%xmm1, (%rdi), %xmm2, %xmm3
 # HASWELL-NEXT: 1.     1     1.0    0.0    0.0       vblendvps	%xmm1, (%rdi), %xmm2, %xmm3
-- 
GitLab


From 3a8afbd8a1a153d49969b13c1d8ba3858d860acd Mon Sep 17 00:00:00 2001
From: Roman Lebedev <lebedev.ri@gmail.com>
Date: Sat, 27 Oct 2018 20:46:30 +0000
Subject: [PATCH 0677/1116] AMD BdVer2 (Piledriver) Initial Scheduler model

Summary:
# Overview
This is somewhat partial.
* Latencies are good {F7371125}
  * All of these remaining inconsistencies //appear// to be noise/noisy/flaky.
* NumMicroOps are somewhat good {F7371158}
  * Most of the remaining inconsistencies are from `Ld` / `Ld_ReadAfterLd` classes
* Actual unit occupation (pipes, `ResourceCycles`) are undiscovered lands, i did not really look there.
  They are basically verbatum copy from `btver2`
* Many `InstRW`. And there are still inconsistencies left...

To be noted:
I think this is the first new schedule profile produced with the new next-gen tools like llvm-exegesis!

# Benchmark
I realize that isn't what was suggested, but i'll start with some "internal" public real-world benchmark i understand - [[ https://github.com/darktable-org/rawspeed | RawSpeed raw image decoding library ]].
Diff (the exact clang from trunk without/with this patch):
```
Comparing /home/lebedevri/rawspeed/build-old/src/utilities/rsbench/rsbench to /home/lebedevri/rawspeed/build-new/src/utilities/rsbench/rsbench
Benchmark                                                                                        Time             CPU      Time Old      Time New       CPU Old       CPU New
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Canon/EOS 5D Mark II/09.canon.sraw1.cr2/threads:8/real_time_pvalue                             0.0000          0.0000      U Test, Repetitions: 25 vs 25
Canon/EOS 5D Mark II/09.canon.sraw1.cr2/threads:8/real_time_mean                              -0.0607         -0.0604           234           219           233           219
Canon/EOS 5D Mark II/09.canon.sraw1.cr2/threads:8/real_time_median                            -0.0630         -0.0626           233           219           233           219
Canon/EOS 5D Mark II/09.canon.sraw1.cr2/threads:8/real_time_stddev                            +0.2581         +0.2587             1             2             1             2
Canon/EOS 5D Mark II/10.canon.sraw2.cr2/threads:8/real_time_pvalue                             0.0000          0.0000      U Test, Repetitions: 25 vs 25
Canon/EOS 5D Mark II/10.canon.sraw2.cr2/threads:8/real_time_mean                              -0.0770         -0.0767           144           133           144           133
Canon/EOS 5D Mark II/10.canon.sraw2.cr2/threads:8/real_time_median                            -0.0767         -0.0763           144           133           144           133
Canon/EOS 5D Mark II/10.canon.sraw2.cr2/threads:8/real_time_stddev                            -0.4170         -0.4156             1             0             1             0
Canon/EOS 5DS/2K4A9927.CR2/threads:8/real_time_pvalue                                          0.0000          0.0000      U Test, Repetitions: 25 vs 25
Canon/EOS 5DS/2K4A9927.CR2/threads:8/real_time_mean                                           -0.0271         -0.0270           463           450           463           450
Canon/EOS 5DS/2K4A9927.CR2/threads:8/real_time_median                                         -0.0093         -0.0093           453           449           453           449
Canon/EOS 5DS/2K4A9927.CR2/threads:8/real_time_stddev                                         -0.7280         -0.7280            13             4            13             4
Canon/EOS 5DS/2K4A9928.CR2/threads:8/real_time_pvalue                                          0.0004          0.0004      U Test, Repetitions: 25 vs 25
Canon/EOS 5DS/2K4A9928.CR2/threads:8/real_time_mean                                           -0.0065         -0.0065           569           565           569           565
Canon/EOS 5DS/2K4A9928.CR2/threads:8/real_time_median                                         -0.0077         -0.0077           569           564           569           564
Canon/EOS 5DS/2K4A9928.CR2/threads:8/real_time_stddev                                         +1.0077         +1.0068             2             5             2             5
Canon/EOS 5DS/2K4A9929.CR2/threads:8/real_time_pvalue                                          0.0220          0.0199      U Test, Repetitions: 25 vs 25
Canon/EOS 5DS/2K4A9929.CR2/threads:8/real_time_mean                                           +0.0006         +0.0007           312           312           312           312
Canon/EOS 5DS/2K4A9929.CR2/threads:8/real_time_median                                         +0.0031         +0.0032           311           312           311           312
Canon/EOS 5DS/2K4A9929.CR2/threads:8/real_time_stddev                                         -0.7069         -0.7072             4             1             4             1
Canon/EOS 10D/CRW_7673.CRW/threads:8/real_time_pvalue                                          0.0004          0.0004      U Test, Repetitions: 25 vs 25
Canon/EOS 10D/CRW_7673.CRW/threads:8/real_time_mean                                           -0.0015         -0.0015           141           141           141           141
Canon/EOS 10D/CRW_7673.CRW/threads:8/real_time_median                                         -0.0010         -0.0011           141           141           141           141
Canon/EOS 10D/CRW_7673.CRW/threads:8/real_time_stddev                                         -0.1486         -0.1456             0             0             0             0
Canon/EOS 40D/_MG_0154.CR2/threads:8/real_time_pvalue                                          0.6139          0.8766      U Test, Repetitions: 25 vs 25
Canon/EOS 40D/_MG_0154.CR2/threads:8/real_time_mean                                           -0.0008         -0.0005            60            60            60            60
Canon/EOS 40D/_MG_0154.CR2/threads:8/real_time_median                                         -0.0006         -0.0002            60            60            60            60
Canon/EOS 40D/_MG_0154.CR2/threads:8/real_time_stddev                                         -0.1467         -0.1390             0             0             0             0
Canon/EOS 77D/IMG_4049.CR2/threads:8/real_time_pvalue                                          0.0137          0.0137      U Test, Repetitions: 25 vs 25
Canon/EOS 77D/IMG_4049.CR2/threads:8/real_time_mean                                           +0.0002         +0.0002           275           275           275           275
Canon/EOS 77D/IMG_4049.CR2/threads:8/real_time_median                                         -0.0015         -0.0014           275           275           275           275
Canon/EOS 77D/IMG_4049.CR2/threads:8/real_time_stddev                                         +3.3687         +3.3587             0             2             0             2
Canon/PowerShot G1/crw_1693.crw/threads:8/real_time_pvalue                                     0.4041          0.3933      U Test, Repetitions: 25 vs 25
Canon/PowerShot G1/crw_1693.crw/threads:8/real_time_mean                                      +0.0004         +0.0004            67            67            67            67
Canon/PowerShot G1/crw_1693.crw/threads:8/real_time_median                                    -0.0000         -0.0000            67            67            67            67
Canon/PowerShot G1/crw_1693.crw/threads:8/real_time_stddev                                    +0.1947         +0.1995             0             0             0             0
Fujifilm/GFX 50S/20170525_0037TEST.RAF/threads:8/real_time_pvalue                              0.0074          0.0001      U Test, Repetitions: 25 vs 25
Fujifilm/GFX 50S/20170525_0037TEST.RAF/threads:8/real_time_mean                               -0.0092         +0.0074           547           542            25            25
Fujifilm/GFX 50S/20170525_0037TEST.RAF/threads:8/real_time_median                             -0.0054         +0.0115           544           541            25            25
Fujifilm/GFX 50S/20170525_0037TEST.RAF/threads:8/real_time_stddev                             -0.4086         -0.3486             8             5             0             0
Fujifilm/X-Pro2/_DSF3051.RAF/threads:8/real_time_pvalue                                        0.3320          0.0000      U Test, Repetitions: 25 vs 25
Fujifilm/X-Pro2/_DSF3051.RAF/threads:8/real_time_mean                                         +0.0015         +0.0204           218           218            12            12
Fujifilm/X-Pro2/_DSF3051.RAF/threads:8/real_time_median                                       +0.0001         +0.0203           218           218            12            12
Fujifilm/X-Pro2/_DSF3051.RAF/threads:8/real_time_stddev                                       +0.2259         +0.2023             1             1             0             0
GoPro/HERO6 Black/GOPR9172.GPR/threads:8/real_time_pvalue                                      0.0000          0.0001      U Test, Repetitions: 25 vs 25
GoPro/HERO6 Black/GOPR9172.GPR/threads:8/real_time_mean                                       -0.0209         -0.0179            96            94            90            88
GoPro/HERO6 Black/GOPR9172.GPR/threads:8/real_time_median                                     -0.0182         -0.0155            95            93            90            88
GoPro/HERO6 Black/GOPR9172.GPR/threads:8/real_time_stddev                                     -0.6164         -0.2703             2             1             2             1
Kodak/DCS Pro 14nx/D7465857.DCR/threads:8/real_time_pvalue                                     0.0000          0.0000      U Test, Repetitions: 25 vs 25
Kodak/DCS Pro 14nx/D7465857.DCR/threads:8/real_time_mean                                      -0.0098         -0.0098           176           175           176           175
Kodak/DCS Pro 14nx/D7465857.DCR/threads:8/real_time_median                                    -0.0126         -0.0126           176           174           176           174
Kodak/DCS Pro 14nx/D7465857.DCR/threads:8/real_time_stddev                                    +6.9789         +6.9157             0             2             0             2
Nikon/D850/Nikon-D850-14bit-lossless-compressed.NEF/threads:8/real_time_pvalue                 0.0000          0.0000      U Test, Repetitions: 25 vs 25
Nikon/D850/Nikon-D850-14bit-lossless-compressed.NEF/threads:8/real_time_mean                  -0.0237         -0.0238           474           463           474           463
Nikon/D850/Nikon-D850-14bit-lossless-compressed.NEF/threads:8/real_time_median                -0.0267         -0.0267           473           461           473           461
Nikon/D850/Nikon-D850-14bit-lossless-compressed.NEF/threads:8/real_time_stddev                +0.7179         +0.7178             3             5             3             5
Olympus/E-M1MarkII/Olympus_EM1mk2__HIRES_50MP.ORF/threads:8/real_time_pvalue                   0.6837          0.6554      U Test, Repetitions: 25 vs 25
Olympus/E-M1MarkII/Olympus_EM1mk2__HIRES_50MP.ORF/threads:8/real_time_mean                    -0.0014         -0.0013          1375          1373          1375          1373
Olympus/E-M1MarkII/Olympus_EM1mk2__HIRES_50MP.ORF/threads:8/real_time_median                  +0.0018         +0.0019          1371          1374          1371          1374
Olympus/E-M1MarkII/Olympus_EM1mk2__HIRES_50MP.ORF/threads:8/real_time_stddev                  -0.7457         -0.7382            11             3            10             3
Panasonic/DC-G9/P1000476.RW2/threads:8/real_time_pvalue                                        0.0000          0.0000      U Test, Repetitions: 25 vs 25
Panasonic/DC-G9/P1000476.RW2/threads:8/real_time_mean                                         -0.0080         -0.0289            22            22            10            10
Panasonic/DC-G9/P1000476.RW2/threads:8/real_time_median                                       -0.0070         -0.0287            22            22            10            10
Panasonic/DC-G9/P1000476.RW2/threads:8/real_time_stddev                                       +1.0977         +0.6614             0             0             0             0
Panasonic/DC-GH5/_T012014.RW2/threads:8/real_time_pvalue                                       0.0000          0.0000      U Test, Repetitions: 25 vs 25
Panasonic/DC-GH5/_T012014.RW2/threads:8/real_time_mean                                        +0.0132         +0.0967            35            36            10            11
Panasonic/DC-GH5/_T012014.RW2/threads:8/real_time_median                                      +0.0132         +0.0956            35            36            10            11
Panasonic/DC-GH5/_T012014.RW2/threads:8/real_time_stddev                                      -0.0407         -0.1695             0             0             0             0
Panasonic/DC-GH5S/P1022085.RW2/threads:8/real_time_pvalue                                      0.0000          0.0000      U Test, Repetitions: 25 vs 25
Panasonic/DC-GH5S/P1022085.RW2/threads:8/real_time_mean                                       +0.0331         +0.1307            13            13             6             6
Panasonic/DC-GH5S/P1022085.RW2/threads:8/real_time_median                                     +0.0430         +0.1373            12            13             6             6
Panasonic/DC-GH5S/P1022085.RW2/threads:8/real_time_stddev                                     -0.9006         -0.8847             1             0             0             0
Pentax/645Z/IMGP2837.PEF/threads:8/real_time_pvalue                                            0.0016          0.0010      U Test, Repetitions: 25 vs 25
Pentax/645Z/IMGP2837.PEF/threads:8/real_time_mean                                             -0.0023         -0.0024           395           394           395           394
Pentax/645Z/IMGP2837.PEF/threads:8/real_time_median                                           -0.0029         -0.0030           395           394           395           393
Pentax/645Z/IMGP2837.PEF/threads:8/real_time_stddev                                           -0.0275         -0.0375             1             1             1             1
Phase One/P65/CF027310.IIQ/threads:8/real_time_pvalue                                          0.0232          0.0000      U Test, Repetitions: 25 vs 25
Phase One/P65/CF027310.IIQ/threads:8/real_time_mean                                           -0.0047         +0.0039           114           113            28            28
Phase One/P65/CF027310.IIQ/threads:8/real_time_median                                         -0.0050         +0.0037           114           113            28            28
Phase One/P65/CF027310.IIQ/threads:8/real_time_stddev                                         -0.0599         -0.2683             1             1             0             0
Samsung/NX1/2016-07-23-142101_sam_9364.srw/threads:8/real_time_pvalue                          0.0000          0.0000      U Test, Repetitions: 25 vs 25
Samsung/NX1/2016-07-23-142101_sam_9364.srw/threads:8/real_time_mean                           +0.0206         +0.0207           405           414           405           414
Samsung/NX1/2016-07-23-142101_sam_9364.srw/threads:8/real_time_median                         +0.0204         +0.0205           405           414           405           414
Samsung/NX1/2016-07-23-142101_sam_9364.srw/threads:8/real_time_stddev                         +0.2155         +0.2212             1             1             1             1
Samsung/NX30/2015-03-07-163604_sam_7204.srw/threads:8/real_time_pvalue                         0.0000          0.0000      U Test, Repetitions: 25 vs 25
Samsung/NX30/2015-03-07-163604_sam_7204.srw/threads:8/real_time_mean                          -0.0109         -0.0108           147           145           147           145
Samsung/NX30/2015-03-07-163604_sam_7204.srw/threads:8/real_time_median                        -0.0104         -0.0103           147           145           147           145
Samsung/NX30/2015-03-07-163604_sam_7204.srw/threads:8/real_time_stddev                        -0.4919         -0.4800             0             0             0             0
Samsung/NX3000/_3184416.SRW/threads:8/real_time_pvalue                                         0.0000          0.0000      U Test, Repetitions: 25 vs 25
Samsung/NX3000/_3184416.SRW/threads:8/real_time_mean                                          -0.0149         -0.0147           220           217           220           217
Samsung/NX3000/_3184416.SRW/threads:8/real_time_median                                        -0.0173         -0.0169           221           217           220           217
Samsung/NX3000/_3184416.SRW/threads:8/real_time_stddev                                        +1.0337         +1.0341             1             3             1             3
Sony/DSLR-A350/DSC05472.ARW/threads:8/real_time_pvalue                                         0.0001          0.0001      U Test, Repetitions: 25 vs 25
Sony/DSLR-A350/DSC05472.ARW/threads:8/real_time_mean                                          -0.0019         -0.0019           194           193           194           193
Sony/DSLR-A350/DSC05472.ARW/threads:8/real_time_median                                        -0.0021         -0.0021           194           193           194           193
Sony/DSLR-A350/DSC05472.ARW/threads:8/real_time_stddev                                        -0.4441         -0.4282             0             0             0             0
Sony/ILCE-7RM2/14-bit-compressed.ARW/threads:8/real_time_pvalue                                0.0000          0.4263      U Test, Repetitions: 25 vs 25
Sony/ILCE-7RM2/14-bit-compressed.ARW/threads:8/real_time_mean                                 +0.0258         -0.0006            81            83            19            19
Sony/ILCE-7RM2/14-bit-compressed.ARW/threads:8/real_time_median                               +0.0235         -0.0011            81            82            19            19
Sony/ILCE-7RM2/14-bit-compressed.ARW/threads:8/real_time_stddev                               +0.1634         +0.1070             1             1             0             0
```
{F7443905}
If we look at the `_mean`s, the time column, the biggest win is `-7.7%` (`Canon/EOS 5D Mark II/10.canon.sraw2.cr2`),
and the biggest loose is `+3.3%` (`Panasonic/DC-GH5S/P1022085.RW2`);
Overall: mean `-0.7436%`, median `-0.23%`, `cbrt(sum(time^3))` = `-8.73%`
Looks good so far i'd say.

llvm-exegesis details:
{F7371117} {F7371125}
{F7371128} {F7371144} {F7371158}

Reviewers: craig.topper, RKSimon, andreadb, courbet, avt77, spatel, GGanesh

Reviewed By: andreadb

Subscribers: javed.absar, gbedwell, jfb, llvm-commits

Differential Revision: https://reviews.llvm.org/D52779

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345463 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86.td                         |    5 +-
 lib/Target/X86/X86PfmCounters.td              |   12 +
 lib/Target/X86/X86ScheduleBdVer2.td           | 1278 ++++++++
 test/CodeGen/X86/aes-schedule.ll              |   84 +-
 test/CodeGen/X86/avx-schedule.ll              |  626 ++--
 test/CodeGen/X86/avx-vzeroupper.ll            |    8 +-
 test/CodeGen/X86/bmi-schedule.ll              |   94 +-
 test/CodeGen/X86/cmov-schedule.ll             |  368 +--
 test/CodeGen/X86/f16c-schedule.ll             |   30 +-
 test/CodeGen/X86/fma-schedule.ll              |  162 +-
 test/CodeGen/X86/fma.ll                       | 1672 +++++-----
 test/CodeGen/X86/fma4-schedule.ll             |  140 +-
 test/CodeGen/X86/lea32-schedule.ll            |   24 +-
 test/CodeGen/X86/lea64-schedule.ll            |   24 +-
 test/CodeGen/X86/lwp-intrinsics.ll            |   81 +-
 test/CodeGen/X86/lwp-schedule.ll              |   48 +-
 test/CodeGen/X86/lzcnt-schedule.ll            |   26 +-
 test/CodeGen/X86/memset.ll                    |    5 +-
 test/CodeGen/X86/mmx-schedule.ll              |  702 ++--
 test/CodeGen/X86/popcnt-schedule.ll           |   26 +-
 test/CodeGen/X86/recip-fastmath.ll            |   70 +-
 test/CodeGen/X86/recip-fastmath2.ll           |  128 +-
 test/CodeGen/X86/schedule-x86-64-shld.ll      |  108 +-
 test/CodeGen/X86/schedule-x86_32.ll           |  162 +-
 test/CodeGen/X86/schedule-x86_64.ll           | 1648 +++++-----
 test/CodeGen/X86/small-byval-memcpy.ll        |    2 +-
 test/CodeGen/X86/sse-schedule.ll              |  668 ++--
 test/CodeGen/X86/sse2-schedule.ll             | 1554 ++++-----
 test/CodeGen/X86/sse3-schedule.ll             |  162 +-
 test/CodeGen/X86/sse41-schedule.ll            |  590 ++--
 test/CodeGen/X86/sse42-schedule.ll            |  194 +-
 test/CodeGen/X86/sse4a-schedule.ll            |   26 +-
 test/CodeGen/X86/ssse3-schedule.ll            |  182 +-
 test/CodeGen/X86/tbm-schedule.ll              |  130 +-
 test/CodeGen/X86/wide-fma-contraction.ll      |    2 +-
 test/CodeGen/X86/x87-schedule.ll              |  394 +--
 test/CodeGen/X86/xop-schedule.ll              |  374 +--
 test/tools/llvm-mca/X86/BdVer2/add-sequence.s |  124 +-
 .../X86/BdVer2/clear-super-register-1.s       |   46 +-
 .../X86/BdVer2/clear-super-register-2.s       |  158 +-
 .../X86/BdVer2/dependency-breaking-cmp.s      |   73 +-
 .../X86/BdVer2/dependency-breaking-pcmpeq.s   |   92 +-
 .../X86/BdVer2/dependency-breaking-pcmpgt.s   |   86 +-
 .../X86/BdVer2/dependency-breaking-sbb-1.s    |   71 +-
 .../X86/BdVer2/dependency-breaking-sbb-2.s    |   84 +-
 .../X86/BdVer2/dependent-pmuld-paddd.s        |  114 +-
 test/tools/llvm-mca/X86/BdVer2/dot-product.s  |   82 +-
 .../X86/BdVer2/hadd-read-after-ld-1.s         |   20 +-
 .../X86/BdVer2/hadd-read-after-ld-2.s         |   22 +-
 .../X86/BdVer2/instruction-info-view.s        |   20 +-
 .../llvm-mca/X86/BdVer2/load-store-alias.s    |  100 +-
 .../llvm-mca/X86/BdVer2/memcpy-like-test.s    |   88 +-
 test/tools/llvm-mca/X86/BdVer2/one-idioms.s   |  174 +-
 .../X86/BdVer2/partial-reg-update-2.s         |   31 +-
 .../X86/BdVer2/partial-reg-update-3.s         |   83 +-
 .../X86/BdVer2/partial-reg-update-4.s         |   86 +-
 .../X86/BdVer2/partial-reg-update-5.s         |   58 +-
 .../X86/BdVer2/partial-reg-update-6.s         |   84 +-
 .../llvm-mca/X86/BdVer2/partial-reg-update.s  |   26 +-
 test/tools/llvm-mca/X86/BdVer2/pipes-fpu.s    |  121 +-
 test/tools/llvm-mca/X86/BdVer2/pr37790.s      |   23 +-
 test/tools/llvm-mca/X86/BdVer2/rank.s         |  142 +-
 .../llvm-mca/X86/BdVer2/rcu-statistics.s      |   49 +-
 .../llvm-mca/X86/BdVer2/read-advance-1.s      |   20 +-
 .../llvm-mca/X86/BdVer2/read-advance-2.s      |   22 +-
 .../llvm-mca/X86/BdVer2/read-advance-3.s      |   28 +-
 .../X86/BdVer2/reg-move-elimination-1.s       |   93 +-
 .../X86/BdVer2/reg-move-elimination-2.s       |  170 +-
 .../X86/BdVer2/reg-move-elimination-3.s       |  144 +-
 .../X86/BdVer2/reg-move-elimination-4.s       |  118 +-
 .../X86/BdVer2/reg-move-elimination-5.s       |  118 +-
 .../llvm-mca/X86/BdVer2/register-files-1.s    |   90 +-
 .../llvm-mca/X86/BdVer2/register-files-2.s    |   92 +-
 .../llvm-mca/X86/BdVer2/register-files-3.s    |   76 +-
 .../llvm-mca/X86/BdVer2/register-files-4.s    |   39 +-
 .../llvm-mca/X86/BdVer2/register-files-5.s    |  166 +-
 .../llvm-mca/X86/BdVer2/resources-3dnow.s     |  236 +-
 .../tools/llvm-mca/X86/BdVer2/resources-adx.s |   68 +-
 .../tools/llvm-mca/X86/BdVer2/resources-aes.s |   84 +-
 .../llvm-mca/X86/BdVer2/resources-avx1.s      | 2760 ++++++++--------
 .../llvm-mca/X86/BdVer2/resources-bmi1.s      |  120 +-
 .../X86/BdVer2/resources-clflushopt.s         |   40 +-
 .../llvm-mca/X86/BdVer2/resources-cmov.s      |  420 +--
 .../llvm-mca/X86/BdVer2/resources-cmpxchg.s   |   44 +-
 .../llvm-mca/X86/BdVer2/resources-f16c.s      |   68 +-
 .../tools/llvm-mca/X86/BdVer2/resources-fma.s |  684 ++--
 .../llvm-mca/X86/BdVer2/resources-fma4.s      |  380 +--
 .../tools/llvm-mca/X86/BdVer2/resources-lea.s |  576 ++--
 .../llvm-mca/X86/BdVer2/resources-lzcnt.s     |   60 +-
 .../tools/llvm-mca/X86/BdVer2/resources-mmx.s |  452 +--
 .../llvm-mca/X86/BdVer2/resources-movbe.s     |   60 +-
 .../llvm-mca/X86/BdVer2/resources-pclmul.s    |   44 +-
 .../llvm-mca/X86/BdVer2/resources-popcnt.s    |   60 +-
 .../llvm-mca/X86/BdVer2/resources-prefetchw.s |   40 +-
 .../llvm-mca/X86/BdVer2/resources-sse1.s      |  508 +--
 .../llvm-mca/X86/BdVer2/resources-sse2.s      | 1072 +++---
 .../llvm-mca/X86/BdVer2/resources-sse3.s      |  112 +-
 .../llvm-mca/X86/BdVer2/resources-sse41.s     |  420 +--
 .../llvm-mca/X86/BdVer2/resources-sse42.s     |  116 +-
 .../llvm-mca/X86/BdVer2/resources-sse4a.s     |   60 +-
 .../llvm-mca/X86/BdVer2/resources-ssse3.s     |  292 +-
 .../tools/llvm-mca/X86/BdVer2/resources-tbm.s |  160 +-
 .../llvm-mca/X86/BdVer2/resources-x86_32.s    |   88 +-
 .../llvm-mca/X86/BdVer2/resources-x86_64.s    | 2900 +++++++++--------
 .../tools/llvm-mca/X86/BdVer2/resources-x87.s |  616 ++--
 .../tools/llvm-mca/X86/BdVer2/resources-xop.s |  624 ++--
 .../X86/BdVer2/scheduler-queue-usage.s        |   61 +-
 test/tools/llvm-mca/X86/BdVer2/simple-test.s  |   42 +-
 .../X86/BdVer2/vbroadcast-operand-latency.s   |   72 +-
 .../X86/BdVer2/vec-logic-read-after-ld-1.s    |   12 +-
 .../X86/BdVer2/vec-logic-read-after-ld-2.s    |   21 +-
 .../X86/BdVer2/xop-super-registers-1.s        |  110 +-
 .../X86/BdVer2/xop-super-registers-2.s        |  110 +-
 .../llvm-mca/X86/BdVer2/zero-idioms-avx-256.s |  378 ++-
 test/tools/llvm-mca/X86/BdVer2/zero-idioms.s  |  552 ++--
 test/tools/llvm-mca/X86/bextr-read-after-ld.s |   20 +-
 test/tools/llvm-mca/X86/cpus.s                |    4 +-
 test/tools/llvm-mca/X86/read-after-ld-1.s     |   20 +-
 .../llvm-mca/X86/scheduler-queue-usage.s      |    7 +-
 .../tools/llvm-mca/X86/sqrt-rsqrt-rcp-memop.s |   26 +-
 .../X86/variable-blend-read-after-ld-1.s      |   17 +-
 .../X86/variable-blend-read-after-ld-2.s      |   17 +-
 122 files changed, 15983 insertions(+), 13692 deletions(-)
 create mode 100644 lib/Target/X86/X86ScheduleBdVer2.td

diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 94da74225b1..3034b6618df 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -449,6 +449,7 @@ include "X86SchedHaswell.td"
 include "X86SchedBroadwell.td"
 include "X86ScheduleSLM.td"
 include "X86ScheduleZnver1.td"
+include "X86ScheduleBdVer2.td"
 include "X86ScheduleBtVer2.td"
 include "X86SchedSkylakeClient.td"
 include "X86SchedSkylakeServer.td"
@@ -1010,7 +1011,7 @@ def : ProcessorModel<"btver2", BtVer2Model, [
 ]>;
 
 // Bulldozer
-def : Proc<"bdver1", [
+def : ProcessorModel<"bdver1", BdVer2Model, [
   FeatureX87,
   FeatureCMOV,
   FeatureXOP,
@@ -1035,7 +1036,7 @@ def : Proc<"bdver1", [
   FeatureMacroFusion
 ]>;
 // Piledriver
-def : Proc<"bdver2", [
+def : ProcessorModel<"bdver2", BdVer2Model, [
   FeatureX87,
   FeatureCMOV,
   FeatureXOP,
diff --git a/lib/Target/X86/X86PfmCounters.td b/lib/Target/X86/X86PfmCounters.td
index 9e0f0c4f64a..c57798e621e 100644
--- a/lib/Target/X86/X86PfmCounters.td
+++ b/lib/Target/X86/X86PfmCounters.td
@@ -91,6 +91,18 @@ def SkylakeServerPfmCounters : ProcPfmCounters {
 }
 def : PfmCountersBinding<"skylake-avx512", SkylakeServerPfmCounters>;
 
+def BdVer2PfmCounters : ProcPfmCounters {
+  let CycleCounter = PfmCounter<"cpu_clk_unhalted">;
+  let UopsCounter = PfmCounter<"retired_uops">;
+  let IssueCounters = [
+    PfmIssueCounter<"PdFPU0", "dispatched_fpu_ops:ops_pipe0 + dispatched_fpu_ops:ops_dual_pipe0">,
+    PfmIssueCounter<"PdFPU1", "dispatched_fpu_ops:ops_pipe1 + dispatched_fpu_ops:ops_dual_pipe1">,
+    PfmIssueCounter<"PdFPU2", "dispatched_fpu_ops:ops_pipe2 + dispatched_fpu_ops:ops_dual_pipe2">,
+    PfmIssueCounter<"PdFPU3", "dispatched_fpu_ops:ops_pipe3 + dispatched_fpu_ops:ops_dual_pipe3">
+  ];
+}
+def : PfmCountersBinding<"bdver2", BdVer2PfmCounters>;
+
 def BtVer2PfmCounters : ProcPfmCounters {
   let CycleCounter = PfmCounter<"cpu_clk_unhalted">;
   let UopsCounter = PfmCounter<"retired_uops">;
diff --git a/lib/Target/X86/X86ScheduleBdVer2.td b/lib/Target/X86/X86ScheduleBdVer2.td
new file mode 100644
index 00000000000..bc5d112c2f4
--- /dev/null
+++ b/lib/Target/X86/X86ScheduleBdVer2.td
@@ -0,0 +1,1278 @@
+//=- X86ScheduleBdVer2.td - X86 BdVer2 (Piledriver) Scheduling * tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for AMD bdver2 (Piledriver) to support
+// instruction scheduling and other instruction cost heuristics.
+// Based on:
+//  * AMD Software Optimization Guide for AMD Family 15h Processors.
+//    https://support.amd.com/TechDocs/47414_15h_sw_opt_guide.pdf
+//  * The microarchitecture of Intel, AMD and VIA CPUs, By Agner Fog
+//    http://www.agner.org/optimize/microarchitecture.pdf
+//  * https://www.realworldtech.com/bulldozer/
+//    Yes, that is for Bulldozer aka bdver1, not Piledriver aka bdver2.
+//
+//===----------------------------------------------------------------------===//
+
+def BdVer2Model : SchedMachineModel {
+  let IssueWidth = 4; // Up to 4 IPC can be decoded, issued, retired.
+  let MicroOpBufferSize = 128; // RCU reorder buffer size, which is unconfirmed.
+  let LoopMicroOpBufferSize = -1; // There does not seem to be a loop buffer.
+  let LoadLatency = 4; // L1 data cache has a 4-cycle load-to-use latency.
+  let HighLatency = 25; // FIXME: any better choice?
+  let MispredictPenalty = 20; // Minimum branch misdirection penalty.
+
+  let PostRAScheduler = 1; // Enable Post RegAlloc Scheduler pass.
+
+  // FIXME: Incomplete. This flag is set to allow the scheduler to assign
+  //        a default model to unrecognized opcodes.
+  let CompleteModel = 0;
+} // SchedMachineModel
+
+let SchedModel = BdVer2Model in {
+
+
+//===----------------------------------------------------------------------===//
+// Pipes
+//===----------------------------------------------------------------------===//
+
+// There are total of eight pipes.
+
+//===----------------------------------------------------------------------===//
+// Integer execution pipes
+//
+
+// Two EX (ALU) pipes.
+def PdEX0  : ProcResource<1>; // ALU, Integer Pipe0
+def PdEX1  : ProcResource<1>; // ALU, Integer Pipe1
+def PdEX01 : ProcResGroup<[PdEX0, PdEX1]>;
+
+// Two AGLU pipes, identical.
+def PdAGLU01 : ProcResource<2>; // AGU, Integer Pipe[23]
+
+//===----------------------------------------------------------------------===//
+// Floating point execution pipes
+//
+
+// Four FPU pipes.
+
+def PdFPU0 : ProcResource<1>; // Vector/FPU Pipe0
+def PdFPU1 : ProcResource<1>; // Vector/FPU Pipe1
+def PdFPU2 : ProcResource<1>; // Vector/FPU Pipe2
+def PdFPU3 : ProcResource<1>; // Vector/FPU Pipe3
+
+// FPU grouping
+def PdFPU01 : ProcResGroup<[PdFPU0, PdFPU1]>;
+def PdFPU23 : ProcResGroup<[PdFPU2, PdFPU3]>;
+
+
+//===----------------------------------------------------------------------===//
+// RCU
+//===----------------------------------------------------------------------===//
+
+// The Retire Control Unit on Piledriver can retire up to 4 macro-ops per cycle.
+// On the other hand, the RCU reorder buffer size for Piledriver does not
+// seem be specified in any trustworthy source.
+// But as per https://www.realworldtech.com/bulldozer/6/ the Bulldozer had
+// RCU reorder buffer size of 128. So that is a good guess for now.
+def PdRCU : RetireControlUnit<128, 4>;
+
+
+//===----------------------------------------------------------------------===//
+// Pipelines
+//===----------------------------------------------------------------------===//
+
+// There are total of two pipelines, each one with it's own scheduler.
+
+//===----------------------------------------------------------------------===//
+// Integer Pipeline Scheduling
+//
+
+// There is one Integer Scheduler per core.
+
+// Integer physical register file has 96 registers of 64-bit.
+def PdIntegerPRF : RegisterFile<96, [GR64, CCR]>;
+
+// Unified Integer, Memory Scheduler has 40 entries.
+def PdEX : ProcResGroup<[PdEX0, PdEX1, PdAGLU01]> {
+  // Up to 4 IPC can be decoded, issued, retired.
+  let BufferSize = 40;
+}
+
+
+//===----------------------------------------------------------------------===//
+// FPU Pipeline Scheduling
+//
+
+// The FPU unit is shared between the two cores.
+
+// FP physical register file has 160 registers of 128-bit.
+// Operations on 256-bit data types are cracked into two COPs.
+def PdFpuPRF : RegisterFile<160, [VR64, VR128, VR256], [1, 1, 2]>;
+
+// Unified FP Scheduler has 64 entries,
+def PdFPU : ProcResGroup<[PdFPU0, PdFPU1, PdFPU2, PdFPU3]> {
+  // Up to 4 IPC can be decoded, issued, retired.
+  let BufferSize = 64;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Functional units
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Load-Store Units
+//
+
+// FIXME: does this even make sense?
+
+def PdLoad  : ProcResGroup<[PdAGLU01]> {
+  // For Piledriver, the load queue is 40 entries deep.
+  let BufferSize = 40;
+}
+
+def PdStore : ProcResGroup<[PdAGLU01]> {
+  // For Piledriver, the store queue is 24 entries deep.
+  let BufferSize = 24;
+}
+
+//===----------------------------------------------------------------------===//
+// Integer Execution Units
+//
+
+def PdDiv    : ProcResource<1>; // PdEX0; unpipelined integer division
+def PdCount  : ProcResource<1>; // PdEX0; POPCNT, LZCOUNT
+
+def PdMul    : ProcResource<1>; // PdEX1; integer multiplication
+def PdBranch : ProcResource<1>; // PdEX1; JMP, fused branches
+
+//===----------------------------------------------------------------------===//
+// Floating-Point Units
+//
+
+// Two FMAC/FPFMA units.
+def PdFPFMA  : ProcResource<2>; // PdFPU0, PdFPU1
+
+// One 128-bit integer multiply-accumulate unit.
+def PdFPMMA  : ProcResource<1>; // PdFPU0
+
+// One fp conversion unit.
+def PdFPCVT  : ProcResource<1>; // PdFPU0
+
+// One unit for shuffles, packs, permutes, shifts.
+def PdFPXBR  : ProcResource<1>; // PdFPU1
+
+// Two 128-bit packed integer units.
+def PdFPMAL  : ProcResource<2>; // PdFPU2, PdFPU3
+
+// One FP store unit.
+def PdFPSTO  : ProcResource<1>; // PdFPU3
+
+
+//===----------------------------------------------------------------------===//
+// Basic helper classes.
+//===----------------------------------------------------------------------===//
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when dispatched by the schedulers.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+multiclass PdWriteRes<SchedWrite SchedRW,
+                      list<ProcResourceKind> ExePorts, int Lat = 1,
+                      list<int> Res = [], int UOps = 1> {
+  def : WriteRes<SchedRW, ExePorts> {
+    let Latency = Lat;
+    let ResourceCycles = Res;
+    let NumMicroOps = UOps;
+  }
+}
+
+multiclass __pdWriteResPair<X86FoldableSchedWrite SchedRW,
+                            list<ProcResourceKind> ExePorts, int Lat,
+                            list<int> Res, int UOps,
+                            int LoadLat, int LoadRes, int LoadUOps> {
+  defm : PdWriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
+
+  defm : PdWriteRes<SchedRW.Folded,
+                    !listconcat([PdLoad], ExePorts),
+                    !add(Lat, LoadLat),
+                    !if(!and(!empty(Res), !eq(LoadRes, 1)),
+                      [],
+                      !listconcat([LoadRes], Res)),
+                    !add(UOps, LoadUOps)>;
+}
+
+multiclass PdWriteResExPair<X86FoldableSchedWrite SchedRW,
+                            list<ProcResourceKind> ExePorts, int Lat = 1,
+                            list<int> Res = [], int UOps = 1,
+                            int LoadUOps = 0> {
+  defm : __pdWriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
+                          /*LoadLat*/4, /*LoadRes*/1, LoadUOps>;
+}
+
+multiclass PdWriteResXMMPair<X86FoldableSchedWrite SchedRW,
+                             list<ProcResourceKind> ExePorts, int Lat = 1,
+                             list<int> Res = [], int UOps = 1,
+                             int LoadUOps = 0> {
+  defm : __pdWriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
+                           /*LoadLat*/5, /*LoadRes*/1, LoadUOps>;
+}
+
+multiclass PdWriteResYMMPair<X86FoldableSchedWrite SchedRW,
+                             list<ProcResourceKind> ExePorts, int Lat,
+                             list<int> Res, int UOps = 2,
+                             int LoadUOps = 0> {
+  defm : __pdWriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
+                           /*LoadLat*/5, /*LoadRes*/2, LoadUOps>;
+}
+
+//===----------------------------------------------------------------------===//
+// Here be dragons.
+//===----------------------------------------------------------------------===//
+
+// L1 data cache has a 4-cycle load-to-use latency, so ReadAfterLd registers
+// needn't be available until 4 cycles after the memory operand.
+def : ReadAdvance<ReadAfterLd, 4>;
+
+// Vector loads are 5 cycles, so ReadAfterVec*Ld registers needn't be available
+// until 5 cycles after the memory operand.
+def : ReadAdvance<ReadAfterVecLd, 5>;
+def : ReadAdvance<ReadAfterVecXLd, 5>;
+def : ReadAdvance<ReadAfterVecYLd, 5>;
+
+// A folded store needs a cycle on the PdStore for the store data.
+def : WriteRes<WriteRMW, [PdStore]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Loads, stores, and moves, not folded with other operations.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteLoad,    [PdLoad]> { let Latency = 5; }
+def : WriteRes<WriteStore,   [PdStore]>;
+def : WriteRes<WriteStoreNT, [PdStore]>;
+def : WriteRes<WriteMove,    [PdEX01]>;
+
+// Load/store MXCSR.
+// FIXME: These are copy and pasted from WriteLoad/Store.
+def : WriteRes<WriteLDMXCSR, [PdLoad]> { let Latency = 5; }
+def : WriteRes<WriteSTMXCSR, [PdStore]> { let NumMicroOps = 2; }
+
+// Treat misc copies as a move.
+def : InstRW<[WriteMove], (instrs COPY)>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Idioms that clear a register, like xorps %xmm0, %xmm0.
+// These can often bypass execution ports completely.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteZero, [/*No ExePorts*/]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Branches don't produce values, so they have no latency, but they still
+// consume resources. Indirect branches can fold loads.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : PdWriteResExPair<WriteJump,  [PdEX1, PdBranch]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Special case scheduling classes.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteSystem,     [PdEX01]> { let Latency = 100; }
+def : WriteRes<WriteMicrocoded, [PdEX01]> { let Latency = 100; }
+def : WriteRes<WriteFence,      [PdStore]>;
+
+def PdWriteXLAT : SchedWriteRes<[PdEX01]> {
+  let Latency = 6;
+}
+def : InstRW<[PdWriteXLAT], (instrs XLAT)>;
+
+def PdWriteLARrr : SchedWriteRes<[PdEX01]> {
+  let Latency = 184;
+  let NumMicroOps = 45;
+}
+def : InstRW<[PdWriteLARrr], (instregex "LAR(16|32|64)rr",
+                                        "LSL(16|32|64)rr")>;
+
+// Nops don't have dependencies, so there's no actual latency, but we set this
+// to '1' to tell the scheduler that the nop uses an ALU slot for a cycle.
+def : WriteRes<WriteNop, [PdEX01]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Arithmetic.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : PdWriteResExPair<WriteALU,     [PdEX01]>;
+
+def PdWriteLXADD : SchedWriteRes<[PdEX01]> {
+  let Latency = 6;
+  let NumMicroOps = 4;
+}
+def : InstRW<[PdWriteLXADD], (instrs LXADD8, LXADD16, LXADD32, LXADD64)>;
+
+def PdWriteBMI1 : SchedWriteRes<[PdEX01]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteBMI1],
+             (instrs BLCFILL32rr, BLCFILL64rr, BLCI32rr, BLCI64rr,
+                     BLCIC32rr, BLCIC64rr, BLCMSK32rr, BLCMSK64rr,
+                     BLCS32rr, BLCS64rr, BLSFILL32rr, BLSFILL64rr,
+                     BLSIC32rr, BLSIC64rr, T1MSKC32rr, T1MSKC64rr,
+                     TZMSK32rr, TZMSK64rr)>;
+
+def PdWriteBMI1m : SchedWriteRes<[PdEX01]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteBMI1m],
+             (instrs BLCFILL32rm, BLCFILL64rm, BLCI32rm, BLCI64rm,
+                     BLCIC32rm, BLCIC64rm, BLCMSK32rm, BLCMSK64rm,
+                     BLCS32rm, BLCS64rm, BLSFILL32rm, BLSFILL64rm,
+                     BLSIC32rm, BLSIC64rm, T1MSKC32rm, T1MSKC64rm,
+                     TZMSK32rm, TZMSK64rm)>;
+
+defm : PdWriteResExPair<WriteADC,    [PdEX01],                  1,  [2]>;
+
+defm : PdWriteRes<WriteBSWAP32,      [PdEX1]>;
+defm : PdWriteRes<WriteBSWAP64,      [PdEX1]>;
+defm : PdWriteRes<WriteCMPXCHG,      [PdEX1],                   3,  [],       5>;
+defm : PdWriteRes<WriteCMPXCHGRMW,   [PdEX1, PdStore, PdLoad],  3,  [], 2>;
+defm : PdWriteRes<WriteXCHG,         [PdEX1],                   1,  [],       2>;
+
+def PdWriteCMPXCHG8rr : SchedWriteRes<[PdEX1]> {
+  let Latency = 3;
+  let NumMicroOps = 3;
+}
+def : InstRW<[PdWriteCMPXCHG8rr], (instrs CMPXCHG8rr)>;
+
+def PdWriteCMPXCHG8rm : SchedWriteRes<[PdEX1]> {
+  let Latency = 3;
+  let NumMicroOps = 5;
+}
+def : InstRW<[PdWriteCMPXCHG8rm], (instrs CMPXCHG8rm)>;
+
+def PdWriteCMPXCHG16rm_CMPXCHG32rm_CMPXCHG64rm : SchedWriteRes<[PdEX1]> {
+  let Latency = 3;
+  let NumMicroOps = 6;
+}
+def : InstRW<[PdWriteCMPXCHG16rm_CMPXCHG32rm_CMPXCHG64rm],
+             (instrs CMPXCHG16rm, CMPXCHG32rm, CMPXCHG64rm)>;
+
+def PdWriteCMPXCHG8B : SchedWriteRes<[PdEX1]> {
+  let Latency = 3;
+  let NumMicroOps = 18;
+}
+def : InstRW<[PdWriteCMPXCHG8B], (instrs CMPXCHG8B)>;
+
+def PdWriteCMPXCHG16B : SchedWriteRes<[PdEX1]> {
+  let Latency = 3;
+  let NumMicroOps = 22;
+}
+def : InstRW<[PdWriteCMPXCHG16B], (instrs CMPXCHG16B)>;
+
+def PdWriteXCHG16rr : SchedWriteRes<[PdEX1]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteXCHG16rr], (instrs XCHG16rr)>;
+
+def PdWriteXADD : SchedWriteRes<[PdEX1]> {
+  let Latency = 2;
+  let NumMicroOps = 4;
+}
+def : InstRW<[PdWriteXADD], (instrs XADD8rr, XADD16rr, XADD32rr, XADD64rr)>;
+
+def PdWriteXADDm : SchedWriteRes<[PdEX1]> {
+let Latency = 6;
+let NumMicroOps = 4;
+}
+def : InstRW<[PdWriteXADDm], (instrs XADD8rm, XADD16rm, XADD32rm, XADD64rm)>;
+
+defm : PdWriteResExPair<WriteIMul8,     [PdEX1, PdMul],          4>;
+defm : PdWriteResExPair<WriteIMul16,    [PdEX1, PdMul],          4,  [],    2>;
+defm : PdWriteResExPair<WriteIMul16Imm, [PdEX1, PdMul],          5,  [],    2>;
+defm : PdWriteResExPair<WriteIMul16Reg, [PdEX1, PdMul],          4>;
+defm : PdWriteResExPair<WriteIMul32,    [PdEX1, PdMul],          4>;
+defm : PdWriteResExPair<WriteIMul32Imm, [PdEX1, PdMul],          4,  [],    1, 1>;
+defm : PdWriteResExPair<WriteIMul32Reg, [PdEX1, PdMul],          4>;
+defm : PdWriteResExPair<WriteIMul64,    [PdEX1, PdMul],          6,  [1, 4]>;
+defm : PdWriteResExPair<WriteIMul64Imm, [PdEX1, PdMul],          6,  [1, 4],1, 1>;
+defm : PdWriteResExPair<WriteIMul64Reg, [PdEX1, PdMul],          6,  [1, 4]>;
+defm : X86WriteResUnsupported<WriteIMulH>; // BMI2 MULX
+
+defm : PdWriteResExPair<WriteDiv8,    [PdEX1, PdDiv],           12,  [1, 12]>;
+defm : PdWriteResExPair<WriteDiv16,   [PdEX1, PdDiv],           15,  [1, 15],   2>;
+defm : PdWriteResExPair<WriteDiv32,   [PdEX1, PdDiv],           14,  [1, 14],   2>;
+defm : PdWriteResExPair<WriteDiv64,   [PdEX1, PdDiv],           14,  [1, 14],   2>;
+
+defm : PdWriteResExPair<WriteIDiv8,   [PdEX1, PdDiv],           12,  [1, 12]>;
+defm : PdWriteResExPair<WriteIDiv16,  [PdEX1, PdDiv],           15,  [1, 17],   2>;
+defm : PdWriteResExPair<WriteIDiv32,  [PdEX1, PdDiv],           14,  [1, 25],   2>;
+defm : PdWriteResExPair<WriteIDiv64,  [PdEX1, PdDiv],           14,  [1, 14],   2>;
+
+defm : PdWriteResExPair<WriteCRC32,   [PdEX01],                  3,  [4],       3>;
+
+def PdWriteCRC32r32r16 : SchedWriteRes<[PdEX01]> {
+  let Latency = 5;
+  let ResourceCycles = [4];
+  let NumMicroOps = 5;
+}
+def : InstRW<[PdWriteCRC32r32r16], (instrs CRC32r32r16)>;
+
+def PdWriteCRC32r32r32 : SchedWriteRes<[PdEX01]> {
+  let Latency = 6;
+  let ResourceCycles = [4];
+  let NumMicroOps = 7;
+}
+def : InstRW<[PdWriteCRC32r32r32], (instrs CRC32r32r32)>;
+
+def PdWriteCRC32r64r64 : SchedWriteRes<[PdEX01]> {
+  let Latency = 10;
+  let ResourceCycles = [4];
+  let NumMicroOps = 11;
+}
+def : InstRW<[PdWriteCRC32r64r64], (instrs CRC32r64r64)>;
+
+defm : PdWriteResExPair<WriteCMOV,    [PdEX01]>; // Conditional move.
+defm : PdWriteResExPair<WriteCMOV2,   [PdEX01], 1, [], 1, 1>; // Conditional (CF + ZF flag) move.
+
+def : InstRW<[WriteCMOV2.Folded], (instrs CMOVG16rm, CMOVG32rm, CMOVG64rm,
+                                          CMOVGE16rm, CMOVGE32rm, CMOVGE64rm,
+                                          CMOVL16rm, CMOVL32rm, CMOVL64rm,
+                                          CMOVLE16rm, CMOVLE32rm, CMOVLE64rm)>;
+
+defm : PdWriteRes<WriteFCMOV,        [PdFPU0, PdFPFMA]>; // x87 conditional move.
+
+def : WriteRes<WriteSETCC,           [PdEX01]>; // Setcc.
+def : WriteRes<WriteSETCCStore,      [PdEX01, PdStore]>;
+
+def PdWriteSETGEmSETGmSETLEmSETLm : SchedWriteRes<[PdEX01]> {
+  let ResourceCycles = [2];
+  let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteSETGEmSETGmSETLEmSETLm], (instrs SETGEm, SETGm,
+                                                      SETLEm, SETLm)>;
+
+defm : PdWriteRes<WriteLAHFSAHF,      [PdEX01],          2,  [],     2>;
+
+def WriteLAHF : SchedWriteRes<[PdEX01]> {
+  let Latency = 2;
+  let NumMicroOps = 4;
+}
+def : InstRW<[WriteLAHF], (instrs LAHF)>;
+
+def WriteSAHF : SchedWriteRes<[PdEX01]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}
+def : InstRW<[WriteSAHF], (instrs SAHF)>;
+
+defm : PdWriteRes<WriteBitTest,          [PdEX01],         1, [1],     1>;
+defm : PdWriteRes<WriteBitTestImmLd,     [PdEX01, PdLoad], 5, [1, 1],  1>;
+defm : PdWriteRes<WriteBitTestRegLd,     [PdEX01, PdLoad], 5, [1, 1],  7>;
+defm : PdWriteRes<WriteBitTestSet,       [PdEX01],         2, [1],     2>;
+defm : PdWriteRes<WriteBitTestSetImmLd,  [PdEX01, PdLoad], 6, [1, 1],  4>;
+defm : PdWriteRes<WriteBitTestSetImmRMW, [PdEX01, PdLoad], 6, [1, 1],  4>;
+defm : PdWriteRes<WriteBitTestSetRegLd,  [PdEX01, PdLoad], 6, [1, 1], 10>;
+defm : PdWriteRes<WriteBitTestSetRegRMW, [PdEX01, PdLoad], 6, [1, 1], 10>;
+
+// This is for simple LEAs with one or two input operands.
+// FIXME: SAGU 3-operand LEA
+def : WriteRes<WriteLEA,              [PdEX01]> { let NumMicroOps = 2; }
+
+// Bit counts.
+defm : PdWriteResExPair<WriteBSF,     [PdEX01],          3,  [4],     6, 2>;
+defm : PdWriteResExPair<WriteBSR,     [PdEX01],          4,  [4],     7, 2>;
+defm : PdWriteResExPair<WritePOPCNT,  [PdEX01],          4>;
+defm : PdWriteResExPair<WriteLZCNT,   [PdEX01],          2,  [],      2>;
+defm : PdWriteResExPair<WriteTZCNT,   [PdEX01],          2,  [2],     2>;
+
+// BMI1 BEXTR, BMI2 BZHI
+defm : PdWriteResExPair<WriteBEXTR,   [PdEX01],          2,  [],     2>;
+defm : PdWriteResExPair<WriteBLS,     [PdEX01],          2,  [],     2>;
+defm : PdWriteResExPair<WriteBZHI,    [PdEX01]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Integer shifts and rotates.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : PdWriteResExPair<WriteShift,    [PdEX01]>;
+defm : PdWriteResExPair<WriteShiftCL,  [PdEX01]>;
+defm : PdWriteResExPair<WriteRotate,   [PdEX01]>;
+defm : PdWriteResExPair<WriteRotateCL, [PdEX01]>;
+
+def PdWriteRCL8rCL : SchedWriteRes<[PdEX01]> {
+  let Latency = 12;
+  let NumMicroOps = 26;
+}
+def : InstRW<[PdWriteRCL8rCL], (instrs RCL8rCL)>;
+
+def PdWriteRCR8ri : SchedWriteRes<[PdEX01]> {
+  let Latency = 12;
+  let NumMicroOps = 23;
+}
+def : InstRW<[PdWriteRCR8ri], (instrs RCR8ri)>;
+
+def PdWriteRCR8rCL : SchedWriteRes<[PdEX01]> {
+  let Latency = 11;
+  let NumMicroOps = 24;
+}
+def : InstRW<[PdWriteRCR8rCL], (instrs RCR8rCL)>;
+
+def PdWriteRCL16rCL : SchedWriteRes<[PdEX01]> {
+  let Latency = 10;
+  let NumMicroOps = 22;
+}
+def : InstRW<[PdWriteRCL16rCL], (instrs RCL16rCL)>;
+
+def PdWriteRCR16ri : SchedWriteRes<[PdEX01]> {
+  let Latency = 10;
+  let NumMicroOps = 19;
+}
+def : InstRW<[PdWriteRCR16ri], (instrs RCR16ri)>;
+
+def PdWriteRCL32rCLRCL64rCL : SchedWriteRes<[PdEX01]> {
+  let Latency = 7;
+  let NumMicroOps = 17;
+}
+def : InstRW<[PdWriteRCL32rCLRCL64rCL], (instrs RCL32rCL, RCL64rCL)>;
+
+def PdWriteRCR64rCL : SchedWriteRes<[PdEX01]> {
+  let Latency = 7;
+  let NumMicroOps = 16;
+}
+def : InstRW<[PdWriteRCR64rCL], (instrs RCR64rCL)>;
+
+def PdWriteRCR32rCL : SchedWriteRes<[PdEX01]> {
+  let Latency = 7;
+  let NumMicroOps = 16;
+}
+def : InstRW<[PdWriteRCR32rCL ], (instrs RCR32rCL)>;
+
+def PdWriteRCR32riRCR64ri : SchedWriteRes<[PdEX01]> {
+  let Latency = 7;
+  let NumMicroOps = 15;
+}
+def : InstRW<[PdWriteRCR32riRCR64ri], (instrs RCR32ri, RCR64ri)>;
+
+
+def PdWriteRCR16rCL : SchedWriteRes<[PdEX01]> {
+  let Latency = 9;
+  let NumMicroOps = 20;
+}
+def : InstRW<[PdWriteRCR16rCL], (instrs RCR16rCL)>;
+
+def PdWriteRCL16ri : SchedWriteRes<[PdEX01]> {
+  let Latency = 11;
+  let NumMicroOps = 21;
+}
+def : InstRW<[PdWriteRCL16ri], (instrs RCL16ri)>;
+
+def PdWriteRCL3264ri : SchedWriteRes<[PdEX01]> {
+  let Latency = 8;
+  let NumMicroOps = 16;
+}
+def : InstRW<[PdWriteRCL3264ri], (instrs RCL32ri, RCL64ri)>;
+
+def PdWriteRCL8ri : SchedWriteRes<[PdEX01]> {
+  let Latency = 13;
+  let NumMicroOps = 25;
+}
+def : InstRW<[PdWriteRCL8ri], (instrs RCL8ri)>;
+
+// SHLD/SHRD.
+defm : PdWriteRes<WriteSHDrri,       [PdEX01],         4, [6], 6>;
+defm : PdWriteRes<WriteSHDrrcl,      [PdEX01],         4, [8], 7>;
+
+def PdWriteSHLD32rri8SHRD16rri8 : SchedWriteRes<[PdEX01]> {
+  let Latency = 3;
+  let ResourceCycles = [6];
+  let NumMicroOps = 6;
+}
+def : InstRW<[PdWriteSHLD32rri8SHRD16rri8 ], (instrs SHLD32rri8, SHRD16rri8)>;
+
+def PdWriteSHLD16rrCLSHLD32rrCLSHRD32rrCL : SchedWriteRes<[PdEX01]> {
+  let Latency = 4;
+  let ResourceCycles = [8];
+  let NumMicroOps = 7;
+}
+def : InstRW<[PdWriteSHLD16rrCLSHLD32rrCLSHRD32rrCL], (instrs SHLD16rrCL,
+                                                              SHLD32rrCL,
+                                                              SHRD32rrCL)>;
+
+defm : PdWriteRes<WriteSHDmri,       [PdLoad, PdEX01], 4, [1, 22], 8>;
+defm : PdWriteRes<WriteSHDmrcl,      [PdLoad, PdEX01], 4, [1, 22], 8>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Floating point. This covers both scalar and vector operations.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : PdWriteRes<WriteFLD0,               [PdFPU1, PdFPSTO], 3>;
+defm : PdWriteRes<WriteFLD1,               [PdFPU1, PdFPSTO], 3>;
+defm : PdWriteRes<WriteFLDC,               [PdFPU1, PdFPSTO], 3>;
+
+defm : PdWriteRes<WriteFLoad,              [PdLoad, PdFPU01, PdFPFMA], 5>;
+defm : PdWriteRes<WriteFLoadX,             [PdLoad, PdFPU01, PdFPFMA], 5>;
+defm : PdWriteRes<WriteFLoadY,             [PdLoad, PdFPU01, PdFPFMA], 5, [], 2>;
+
+defm : PdWriteRes<WriteFMaskedLoad,        [PdLoad, PdFPU01, PdFPFMA], 6, [1, 1, 2]>;
+defm : PdWriteRes<WriteFMaskedLoadY,       [PdLoad, PdFPU01, PdFPFMA], 6, [2, 2, 4], 2>;
+
+defm : PdWriteRes<WriteFStore,             [PdStore, PdFPU1,  PdFPSTO], 2>;
+defm : PdWriteRes<WriteFStoreX,            [PdStore, PdFPU1,  PdFPSTO]>;
+defm : PdWriteRes<WriteFStoreY,            [PdStore, PdFPU1,  PdFPSTO], 1, [], 4>;
+
+def PdWriteMOVHPm : SchedWriteRes<[PdStore, PdFPU1,  PdFPSTO]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteMOVHPm], (instrs MOVHPDmr, MOVHPSmr, VMOVHPDmr, VMOVHPSmr)>;
+
+def PdWriteVMOVUPDYmrVMOVUPSYmr : SchedWriteRes<[PdStore, PdFPU1,  PdFPSTO]> {
+  let NumMicroOps = 8;
+}
+def : InstRW<[PdWriteVMOVUPDYmrVMOVUPSYmr], (instrs VMOVUPDYmr, VMOVUPSYmr)>;
+
+defm : PdWriteRes<WriteFStoreNT,           [PdStore, PdFPU1,  PdFPSTO], 3>;
+defm : PdWriteRes<WriteFStoreNTX,          [PdStore, PdFPU1,  PdFPSTO], 3>;
+defm : PdWriteRes<WriteFStoreNTY,          [PdStore, PdFPU1,  PdFPSTO], 3, [2, 2, 2], 4>;
+
+defm : PdWriteRes<WriteFMaskedStore,       [PdStore, PdFPU01, PdFPFMA], 6, [1, 1, 4], 18>;
+defm : PdWriteRes<WriteFMaskedStoreY,      [PdStore, PdFPU01, PdFPFMA], 6, [2, 2, 4], 34>;
+
+defm : PdWriteRes<WriteFMove,              [PdFPU01, PdFPFMA]>;
+defm : PdWriteRes<WriteFMoveX,             [PdFPU01, PdFPFMA]>;
+defm : PdWriteRes<WriteFMoveY,             [PdFPU01, PdFPFMA], 2, [2, 2], 2>;
+
+defm : PdWriteRes<WriteEMMS,               [PdFPU01, PdFPFMA], 2>;
+
+defm : PdWriteResXMMPair<WriteFAdd,         [PdFPU0, PdFPFMA],  5>;
+defm : PdWriteResXMMPair<WriteFAddX,        [PdFPU0, PdFPFMA],  5>;
+defm : PdWriteResYMMPair<WriteFAddY,        [PdFPU0, PdFPFMA],  5, [2, 1]>;
+defm : X86WriteResPairUnsupported<WriteFAddZ>;
+
+defm : PdWriteResXMMPair<WriteFAdd64,       [PdFPU0, PdFPFMA],  5>;
+defm : PdWriteResXMMPair<WriteFAdd64X,      [PdFPU0, PdFPFMA],  5>;
+defm : PdWriteResYMMPair<WriteFAdd64Y,      [PdFPU0, PdFPFMA],  5, [2, 1]>;
+defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
+
+defm : PdWriteResXMMPair<WriteFCmp,         [PdFPU0, PdFPFMA],  2>;
+defm : PdWriteResXMMPair<WriteFCmpX,        [PdFPU0, PdFPFMA],  2>;
+defm : PdWriteResYMMPair<WriteFCmpY,        [PdFPU0, PdFPFMA],  2, [2, 1]>;
+defm : X86WriteResPairUnsupported<WriteFCmpZ>;
+
+defm : PdWriteResXMMPair<WriteFCmp64,       [PdFPU0, PdFPFMA],  2>;
+defm : PdWriteResXMMPair<WriteFCmp64X,      [PdFPU0, PdFPFMA],  2>;
+defm : PdWriteResYMMPair<WriteFCmp64Y,      [PdFPU0, PdFPFMA],  2, [2, 1]>;
+defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
+
+defm : PdWriteResXMMPair<WriteFCom,         [PdFPU0, PdFPFMA, PdEX0], 1, [], 2>;
+
+def PdWriteFCOMPm : SchedWriteRes<[PdFPU1, PdFPFMA]> {
+  let Latency = 6;
+}
+def : InstRW<[PdWriteFCOMPm], (instrs FCOM32m, FCOM64m, FCOMP32m, FCOMP64m)>;
+
+def PdWriteTST_F_UCOM_FPPr : SchedWriteRes<[PdFPU1, PdFPFMA]>;
+def : InstRW<[PdWriteTST_F_UCOM_FPPr], (instrs TST_F, UCOM_FPPr)>;
+
+defm : PdWriteResXMMPair<WriteFMul,         [PdFPU1, PdFPFMA],  5>;
+defm : PdWriteResXMMPair<WriteFMulX,        [PdFPU1, PdFPFMA],  5>;
+defm : PdWriteResYMMPair<WriteFMulY,        [PdFPU1, PdFPFMA],  5, [2, 1]>;
+defm : X86WriteResPairUnsupported<WriteFMulZ>;
+
+defm : PdWriteResXMMPair<WriteFMul64,       [PdFPU1, PdFPFMA],  5>;
+defm : PdWriteResXMMPair<WriteFMul64X,      [PdFPU1, PdFPFMA],  5>;
+defm : PdWriteResYMMPair<WriteFMul64Y,      [PdFPU1, PdFPFMA],  5, [2, 1]>;
+defm : X86WriteResPairUnsupported<WriteFMul64Z>;
+
+defm : PdWriteResXMMPair<WriteFMA,          [PdFPU, PdFPFMA], 5>;
+defm : PdWriteResXMMPair<WriteFMAX,         [PdFPU, PdFPFMA], 5>;
+defm : PdWriteResYMMPair<WriteFMAY,         [PdFPU, PdFPFMA], 5,   [1, 1]>;
+defm : X86WriteResPairUnsupported<WriteFMAZ>;
+
+
+defm : PdWriteResXMMPair<WriteDPPD,         [PdFPU1, PdFPFMA], 15, [1, 3],  15, 2>;
+
+defm : PdWriteResXMMPair<WriteDPPS,         [PdFPU1, PdFPFMA], 25, [1, 3],  16, 2>;
+defm : PdWriteResYMMPair<WriteDPPSY,        [PdFPU1, PdFPFMA], 27, [2, 6], /*or 29*/ 25, 4>;
+defm : X86WriteResPairUnsupported<WriteDPPSZ>;
+
+def PdWriteVDPPSrri : SchedWriteRes<[PdFPU1, PdFPFMA]> {
+  let Latency = 25;
+  let ResourceCycles = [1, 3];
+  let NumMicroOps = 17;
+}
+def : InstRW<[PdWriteVDPPSrri], (instrs VDPPSrri)>;
+
+defm : PdWriteResXMMPair<WriteFRcp,         [PdFPU1, PdFPFMA],  5>;
+defm : PdWriteResXMMPair<WriteFRcpX,        [PdFPU1, PdFPFMA],  5>;
+defm : PdWriteResYMMPair<WriteFRcpY,        [PdFPU1, PdFPFMA],  5, [2, 1]>;
+defm : X86WriteResPairUnsupported<WriteFRcpZ>;
+
+defm : PdWriteResXMMPair<WriteFRsqrt,       [PdFPU1, PdFPFMA],  5>;
+defm : PdWriteResXMMPair<WriteFRsqrtX,      [PdFPU1, PdFPFMA],  5>;
+defm : PdWriteResYMMPair<WriteFRsqrtY,      [PdFPU1, PdFPFMA],  5, [2, 1]>;
+defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
+
+defm : PdWriteResXMMPair<WriteFDiv,         [PdFPU1, PdFPFMA], 9, [1, 19]>;
+defm : PdWriteResXMMPair<WriteFDivX,        [PdFPU1, PdFPFMA], 9, [1, 19]>;
+defm : PdWriteResYMMPair<WriteFDivY,        [PdFPU1, PdFPFMA], 9, [2, 38]>;
+defm : X86WriteResPairUnsupported<WriteFDivZ>;
+
+defm : PdWriteResXMMPair<WriteFDiv64,       [PdFPU1, PdFPFMA], 9, [1, 19]>;
+defm : PdWriteResXMMPair<WriteFDiv64X,      [PdFPU1, PdFPFMA], 9, [1, 19]>;
+defm : PdWriteResYMMPair<WriteFDiv64Y,      [PdFPU1, PdFPFMA], 9, [2, 38]>;
+defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
+
+defm : PdWriteResXMMPair<WriteFSqrt,        [PdFPU1, PdFPFMA], 9, [1, 21]>;
+defm : PdWriteResXMMPair<WriteFSqrtX,       [PdFPU1, PdFPFMA], 9, [1, 21]>;
+defm : PdWriteResYMMPair<WriteFSqrtY,       [PdFPU1, PdFPFMA], 9, [2, 42]>;
+defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
+
+defm : PdWriteResXMMPair<WriteFSqrt64,      [PdFPU1, PdFPFMA], 9, [1, 27]>;
+defm : PdWriteResXMMPair<WriteFSqrt64X,     [PdFPU1, PdFPFMA], 9, [1, 27]>;
+defm : PdWriteResYMMPair<WriteFSqrt64Y,     [PdFPU1, PdFPFMA], 9, [2, 54]>;
+defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
+
+defm : PdWriteResXMMPair<WriteFSqrt80,      [PdFPU1, PdFPFMA],  1, [1, 35]>;
+defm : PdWriteResXMMPair<WriteFSign,        [PdFPU1, PdFPFMA]>;
+
+defm : PdWriteResXMMPair<WriteFRnd,         [PdFPU1, PdFPSTO],  4>;
+defm : PdWriteResYMMPair<WriteFRndY,        [PdFPU1, PdFPSTO],  4, [2, 1], 2>;
+defm : X86WriteResPairUnsupported<WriteFRndZ>;
+
+def PdWriteVFRCZ : SchedWriteRes<[PdFPU1, PdFPSTO]> {
+  let Latency = 10;
+  let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteVFRCZ], (instrs VFRCZPDrr, VFRCZPSrr,
+                                     VFRCZSDrr, VFRCZSSrr)>;
+
+def PdWriteVFRCZm : SchedWriteRes<[PdFPU1, PdFPSTO]> {
+  let Latency = 15;
+  let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteVFRCZm], (instrs VFRCZPDrm, VFRCZPSrm,
+                                      VFRCZSDrm, VFRCZSSrm)>;
+
+def PdWriteVFRCZY : SchedWriteRes<[PdFPU1, PdFPSTO]> {
+  let Latency = 10;
+  let ResourceCycles = [2, 1];
+  let NumMicroOps = 4;
+}
+def : InstRW<[PdWriteVFRCZY], (instrs VFRCZPSYrr, VFRCZPDYrr)>;
+
+def PdWriteVFRCZYm : SchedWriteRes<[PdFPU1, PdFPSTO]> {
+  let Latency = 15;
+  let ResourceCycles = [2, 1];
+  let NumMicroOps = 8;
+}
+def : InstRW<[PdWriteVFRCZYm], (instrs VFRCZPSYrm, VFRCZPDYrm)>;
+
+defm : PdWriteResXMMPair<WriteFLogic,       [PdFPU01, PdFPFMA],  2>;
+defm : PdWriteResYMMPair<WriteFLogicY,      [PdFPU01, PdFPFMA],  2, [2, 2]>;
+defm : X86WriteResPairUnsupported<WriteFLogicZ>;
+
+defm : PdWriteResXMMPair<WriteFTest,        [PdFPU0, PdFPFMA, PdEX0],  1, [], 2>;
+defm : PdWriteResYMMPair<WriteFTestY,       [PdFPU01, PdFPFMA, PdEX0], 1, [2, 2, 1], 4, 2>;
+defm : X86WriteResPairUnsupported<WriteFTestZ>;
+
+defm : PdWriteResXMMPair<WriteFShuffle,     [PdFPU01, PdFPFMA],  2>;
+defm : PdWriteResYMMPair<WriteFShuffleY,    [PdFPU01, PdFPFMA],  2, [2, 2], 2>;
+defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
+
+def PdWriteVBROADCASTF128 : SchedWriteRes<[PdFPU01, PdFPFMA]> {
+  let Latency = 7;
+  let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteVBROADCASTF128], (instrs VBROADCASTF128)>;
+
+defm : PdWriteResXMMPair<WriteFVarShuffle,  [PdFPU01, PdFPFMA],  3, [1, 4]>;
+defm : PdWriteResYMMPair<WriteFVarShuffleY, [PdFPU01, PdFPFMA],  3, [2, 6], 2>;
+defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
+
+defm : PdWriteResXMMPair<WriteFBlend,       [PdFPU01, PdFPFMA],  2>;
+defm : PdWriteResYMMPair<WriteFBlendY,      [PdFPU01, PdFPFMA],  2, [2, 2], 2>;
+defm : X86WriteResPairUnsupported<WriteFBlendZ>;
+
+defm : PdWriteResXMMPair<WriteFVarBlend,    [PdFPU01, PdFPFMA],  2, [1, 4]>;
+defm : PdWriteResYMMPair<WriteFVarBlendY,   [PdFPU01, PdFPFMA],  2, [2, 6], 2>;
+defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
+
+defm : PdWriteResXMMPair<WriteFShuffle256,  [PdFPU01, PdFPFMA],  2, [], 2>;
+defm : X86WriteResPairUnsupported<WriteFVarShuffle256>;
+
+def PdWriteVEXTRACTF128rr : SchedWriteRes<[PdFPU01, PdFPFMA]> {
+  let Latency = 2;
+}
+def : InstRW<[PdWriteVEXTRACTF128rr], (instrs VEXTRACTF128rr)>;
+
+def PdWriteVEXTRACTF128mr : SchedWriteRes<[PdFPU01, PdFPFMA]> {
+  let Latency = 7;
+  let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteVEXTRACTF128mr], (instrs VEXTRACTF128mr)>;
+
+def PdWriteVPERM2F128rr : SchedWriteRes<[PdFPU01, PdFPFMA]> {
+  let Latency = 4;
+  let NumMicroOps = 8;
+}
+def : InstRW<[PdWriteVPERM2F128rr], (instrs VPERM2F128rr)>;
+
+def PdWriteVPERM2F128rm : SchedWriteRes<[PdFPU01, PdFPFMA]> {
+  let Latency = 8; // 4 + 4
+  let NumMicroOps = 10;
+}
+def : InstRW<[PdWriteVPERM2F128rm], (instrs VPERM2F128rm)>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Conversions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : PdWriteResXMMPair<WriteCvtSS2I,   [PdFPU1, PdFPSTO, PdFPFMA, PdEX0], 13, [], 2>;
+
+defm : PdWriteResXMMPair<WriteCvtPS2I,   [PdFPU1, PdFPSTO], 4>;
+defm : PdWriteResYMMPair<WriteCvtPS2IY,  [PdFPU1, PdFPSTO], 4, [2, 1]>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
+
+defm : PdWriteResXMMPair<WriteCvtSD2I,   [PdFPU1, PdFPSTO, PdFPFMA, PdEX0], 13, [], 2>;
+
+defm : PdWriteResXMMPair<WriteCvtPD2I,   [PdFPU1, PdFPSTO],          8, [],        2>;
+defm : PdWriteResYMMPair<WriteCvtPD2IY,  [PdFPU1, PdFPSTO, PdFPFMA], 8, [2, 1, 1], 4>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
+
+def PdWriteMMX_CVTTPD2PIirr : SchedWriteRes<[PdFPU1, PdFPSTO]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteMMX_CVTTPD2PIirr], (instrs MMX_CVTTPD2PIirr)>;
+
+// FIXME: f+3 ST, LD+STC latency
+defm : PdWriteResXMMPair<WriteCvtI2SS,   [PdFPU1, PdFPSTO], 4, [], 2>;
+// FIXME: .Folded version is one NumMicroOp *less*..
+
+defm : PdWriteResXMMPair<WriteCvtI2PS,   [PdFPU1, PdFPSTO], 4>;
+defm : PdWriteResYMMPair<WriteCvtI2PSY,  [PdFPU1, PdFPSTO], 4, [2, 1]>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
+
+defm : PdWriteResXMMPair<WriteCvtI2SD,   [PdFPU1, PdFPSTO], 4, [], 2>;
+// FIXME: .Folded version is one NumMicroOp *less*..
+
+def WriteCVTSI642SDrr : SchedWriteRes<[PdFPU1, PdFPSTO]> {
+  let Latency = 13;
+  let NumMicroOps = 2;
+}
+def : InstRW<[WriteCVTSI642SDrr], (instrs CVTSI642SDrr, CVTSI642SSrr)>;
+
+defm : PdWriteResXMMPair<WriteCvtI2PD,   [PdFPU1, PdFPSTO], 8, [],     2>;
+defm : PdWriteResYMMPair<WriteCvtI2PDY,  [PdFPU1, PdFPSTO], 8, [2, 1], 4, 1>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
+
+defm : PdWriteResXMMPair<WriteCvtSS2SD,  [PdFPU1, PdFPSTO], 4>;
+
+defm : PdWriteResXMMPair<WriteCvtPS2PD,  [PdFPU1, PdFPSTO], 8, [],     2>;
+defm : PdWriteResYMMPair<WriteCvtPS2PDY, [PdFPU1, PdFPSTO], 8, [2, 1], 4, 1>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;
+
+defm : PdWriteResXMMPair<WriteCvtSD2SS,  [PdFPU1, PdFPSTO], 4>;
+
+defm : PdWriteResXMMPair<WriteCvtPD2PS,  [PdFPU1, PdFPSTO],          8, [],        2>;
+defm : PdWriteResYMMPair<WriteCvtPD2PSY, [PdFPU1, PdFPSTO, PdFPFMA], 8, [2, 1, 1], 4>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;
+
+def WriteMMX_CVTPD2PIirrMMX_CVTPI2PDirr : SchedWriteRes<[PdFPU1, PdFPSTO]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+}
+def : InstRW<[WriteMMX_CVTPD2PIirrMMX_CVTPI2PDirr], (instrs MMX_CVTPD2PIirr,
+                                                            MMX_CVTPI2PDirr)>;
+
+def WriteMMX_CVTPI2PSirr : SchedWriteRes<[PdFPU1, PdFPSTO]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+}
+def : InstRW<[WriteMMX_CVTPI2PSirr], (instrs MMX_CVTPI2PSirr)>;
+
+defm : PdWriteResXMMPair<WriteCvtPH2PS,  [PdFPU1, PdFPSTO], 8, [],     2, 1>;
+defm : PdWriteResYMMPair<WriteCvtPH2PSY, [PdFPU1, PdFPSTO], 8, [2, 1], 4, 3>;
+defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>;
+
+defm : PdWriteRes<WriteCvtPS2PH,        [PdFPU1, PdFPSTO],          8, [],        2>;
+defm : PdWriteRes<WriteCvtPS2PHY,       [PdFPU1, PdFPSTO, PdFPFMA], 8, [2, 1, 1], 4>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
+
+defm : PdWriteRes<WriteCvtPS2PHSt,      [PdFPU1, PdFPSTO, PdStore],          4, [],           3>;
+defm : PdWriteRes<WriteCvtPS2PHYSt,     [PdFPU1, PdFPSTO, PdFPFMA, PdStore], 4, [2, 1, 1, 1], 4>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Vector integer operations.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : PdWriteRes<WriteVecLoad,             [PdLoad, PdFPU01, PdFPMAL], 5>;
+defm : PdWriteRes<WriteVecLoadX,            [PdLoad, PdFPU01, PdFPMAL], 5>;
+defm : PdWriteRes<WriteVecLoadY,            [PdLoad, PdFPU01, PdFPMAL], 5, [], 2>;
+
+defm : PdWriteRes<WriteVecLoadNT,           [PdLoad, PdFPU01, PdFPMAL], 5>;
+defm : PdWriteRes<WriteVecLoadNTY,          [PdLoad, PdFPU01, PdFPMAL], 5>;
+
+defm : PdWriteRes<WriteVecMaskedLoad,       [PdLoad, PdFPU01, PdFPMAL], 6, [1, 1, 2]>;
+defm : PdWriteRes<WriteVecMaskedLoadY,      [PdLoad, PdFPU01, PdFPMAL], 6, [2, 2, 4], 2>;
+
+defm : PdWriteRes<WriteVecStore,            [PdStore, PdFPU1,   PdFPSTO], 2>;
+defm : PdWriteRes<WriteVecStoreX,           [PdStore, PdFPU1,   PdFPSTO]>;
+defm : PdWriteRes<WriteVecStoreY,           [PdStore, PdFPU1,   PdFPSTO], 1, [], 4>;
+
+def PdWriteVMOVDQUYmr : SchedWriteRes<[PdStore, PdFPU1,   PdFPSTO]> {
+  let NumMicroOps = 8;
+}
+def : InstRW<[PdWriteVMOVDQUYmr], (instrs VMOVDQUYmr)>;
+
+defm : PdWriteRes<WriteVecStoreNT,          [PdStore, PdFPU1,   PdFPSTO], 2>;
+defm : PdWriteRes<WriteVecStoreNTY,         [PdStore, PdFPU1,   PdFPSTO], 2, [2, 2, 2], 4>;
+
+defm : PdWriteRes<WriteVecMaskedStore,      [PdStore, PdFPU01, PdFPMAL], 6, [1, 1, 4]>;
+defm : PdWriteRes<WriteVecMaskedStoreY,     [PdStore, PdFPU01, PdFPMAL], 6, [2, 2, 4], 2>;
+
+defm : PdWriteRes<WriteVecMove,             [PdFPU01, PdFPMAL], 2>;
+defm : PdWriteRes<WriteVecMoveX,            [PdFPU01, PdFPMAL], 2>;
+defm : PdWriteRes<WriteVecMoveY,            [PdFPU01, PdFPMAL], 2, [2, 2], 2>;
+
+defm : PdWriteRes<WriteVecMoveToGpr,        [PdFPU0, PdFPFMA, PdEX0], 10>;
+defm : PdWriteRes<WriteVecMoveFromGpr,      [PdFPU01, PdFPFMA], 10, [], 2>;
+
+defm : PdWriteResXMMPair<WriteVecALU,        [PdFPU01, PdFPMAL], 2>;
+defm : PdWriteResXMMPair<WriteVecALUX,       [PdFPU01, PdFPMAL], 2>;
+defm : X86WriteResPairUnsupported<WriteVecALUY>;
+defm : X86WriteResPairUnsupported<WriteVecALUZ>;
+
+defm : PdWriteResXMMPair<WriteVecShift,      [PdFPU01, PdFPMAL], 3>;
+defm : PdWriteResXMMPair<WriteVecShiftX,     [PdFPU01, PdFPMAL], 3>;
+defm : X86WriteResPairUnsupported<WriteVecShiftY>;
+defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
+
+defm : PdWriteResXMMPair<WriteVecShiftImm,   [PdFPU01, PdFPMAL], 2>;
+defm : PdWriteResXMMPair<WriteVecShiftImmX,  [PdFPU01, PdFPMAL], 2>;
+defm : X86WriteResPairUnsupported<WriteVecShiftImmY>;
+defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
+
+defm : PdWriteResXMMPair<WriteVecIMul,       [PdFPU0, PdFPMMA], 4>;
+defm : PdWriteResXMMPair<WriteVecIMulX,      [PdFPU0, PdFPMMA], 4>;
+defm : X86WriteResPairUnsupported<WriteVecIMulY>;
+defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
+
+defm : PdWriteResXMMPair<WritePMULLD,        [PdFPU0, PdFPU01, PdFPMMA, PdFPMAL], 5, [2, 1, 2, 1]>;
+defm : X86WriteResPairUnsupported<WritePMULLDY>;
+defm : X86WriteResPairUnsupported<WritePMULLDZ>;
+
+def JWriteVPMACS : SchedWriteRes<[PdFPU0, PdFPU01, PdFPMMA, PdFPMAL]> {
+  let Latency = 4;
+  let ResourceCycles = [2, 1, 2, 1];
+}
+def : InstRW<[JWriteVPMACS], (instrs VPMACSDQHrr, VPMACSDQLrr, VPMACSSDQHrr,
+                                     VPMACSSDQLrr)>;
+
+defm : PdWriteResXMMPair<WriteMPSAD,         [PdFPU0, PdFPMMA], 9, [1, 2], 9>;
+defm : X86WriteResPairUnsupported<WriteMPSADY>;
+defm : X86WriteResPairUnsupported<WriteMPSADZ>;
+
+defm : PdWriteResXMMPair<WritePSADBW,        [PdFPU01, PdFPMAL], 4, [], 2>;
+defm : PdWriteResXMMPair<WritePSADBWX,       [PdFPU01, PdFPMAL], 4, [], 2>;
+defm : X86WriteResPairUnsupported<WritePSADBWY>;
+defm : X86WriteResPairUnsupported<WritePSADBWZ>;
+
+defm : PdWriteResXMMPair<WritePHMINPOS,      [PdFPU0,  PdFPMAL], 4, [], 2>;
+
+defm : PdWriteResXMMPair<WriteShuffle,       [PdFPU01, PdFPMAL], 2>;
+defm : PdWriteResXMMPair<WriteShuffleX,      [PdFPU01, PdFPMAL], 2>;
+defm : PdWriteResYMMPair<WriteShuffleY,      [PdFPU01, PdFPMAL], 2,   [1, 1]>;
+defm : X86WriteResPairUnsupported<WriteShuffleZ>;
+
+defm : PdWriteResXMMPair<WriteVarShuffle,    [PdFPU01, PdFPMAL], 3, [1, 4]>;
+defm : PdWriteResXMMPair<WriteVarShuffleX,   [PdFPU01, PdFPMAL], 3, [1, 4]>;
+defm : X86WriteResPairUnsupported<WriteVarShuffleY>;
+defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
+
+defm : PdWriteResXMMPair<WriteBlend,         [PdFPU01, PdFPMAL], 2>;
+defm : X86WriteResPairUnsupported<WriteBlendY>;
+defm : X86WriteResPairUnsupported<WriteBlendZ>;
+
+defm : PdWriteResXMMPair<WriteVarBlend,      [PdFPU01, PdFPMAL], 2, [1, 4]>;
+defm : X86WriteResPairUnsupported<WriteVarBlendY>;
+defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
+
+defm : PdWriteResXMMPair<WriteVecLogic,      [PdFPU01, PdFPMAL], 2>;
+defm : PdWriteResXMMPair<WriteVecLogicX,     [PdFPU01, PdFPMAL], 2>;
+defm : X86WriteResPairUnsupported<WriteVecLogicY>;
+defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
+
+defm : PdWriteResXMMPair<WriteVecTest,       [PdFPU0, PdFPFMA, PdEX0],  1, [], 2>;
+defm : PdWriteResYMMPair<WriteVecTestY,      [PdFPU01, PdFPFMA, PdEX0], 1, [2, 2, 1], 4, 2>;
+defm : X86WriteResPairUnsupported<WriteVecTestZ>;
+
+defm : PdWriteResXMMPair<WriteShuffle256,    [PdFPU01, PdFPMAL]>;
+defm : PdWriteResXMMPair<WriteVarShuffle256, [PdFPU01, PdFPMAL]>;
+
+defm : PdWriteResXMMPair<WriteVarVecShift,   [PdFPU01, PdFPMAL], 3>;
+defm : X86WriteResPairUnsupported<WriteVarVecShiftY>;
+defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Vector insert/extract operations.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : PdWriteRes<WriteVecInsert,    [PdFPU01, PdFPMAL], 2, [], 2>;
+defm : PdWriteRes<WriteVecInsertLd,  [PdFPU01, PdFPMAL, PdLoad], 6, [], 2>;
+
+defm : PdWriteRes<WriteVecExtract,   [PdFPU0, PdFPFMA, PdEX0], 13, [], 2>;
+defm : PdWriteRes<WriteVecExtractSt, [PdFPU1, PdFPSTO, PdStore], 13, [], 2>;
+
+def PdWriteEXTRQ : SchedWriteRes<[PdFPU01, PdFPMAL]> {
+  let Latency = 3;
+}
+def : InstRW<[PdWriteEXTRQ], (instrs EXTRQ, EXTRQI)>;
+
+////////////////////////////////////////////////////////////////////////////////
+// SSE42 String instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : PdWriteResXMMPair<WritePCmpIStrI, [PdFPU1, PdFPFMA, PdEX0], 14, [1, 2, 1], 7, 1>;
+defm : PdWriteResXMMPair<WritePCmpIStrM, [PdFPU1, PdFPFMA, PdEX0],  6, [1, 2, 1], 7, 2>;
+
+defm : PdWriteResXMMPair<WritePCmpEStrI, [PdFPU1, PdStore, PdLoad, PdFPMAL, PdFPFMA, PdEX0], 15, [1, 2, 6, 4, 1, 1], 27, 1>;
+defm : PdWriteResXMMPair<WritePCmpEStrM, [PdFPU1, PdStore, PdLoad, PdFPMAL, PdFPFMA, PdEX0], 10, [1, 2, 6, 4, 1, 1], 27, 1>;
+
+////////////////////////////////////////////////////////////////////////////////
+// MOVMSK Instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : PdWriteRes<WriteFMOVMSK, [PdFPU0, PdFPFMA, PdEX0],   10, [], 2>;
+
+defm : PdWriteRes<WriteVecMOVMSK, [PdFPU0, PdFPFMA, PdEX0], 13, [], 2>;
+defm : X86WriteResUnsupported<WriteVecMOVMSKY>;
+// defm : X86WriteResUnsupported<WriteVecMOVMSKZ>;
+
+defm : PdWriteRes<WriteMMXMOVMSK, [PdFPU0, PdFPFMA, PdEX0], 10, [], 2>;
+
+////////////////////////////////////////////////////////////////////////////////
+// AES Instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : PdWriteResXMMPair<WriteAESIMC,    [PdFPU0, PdFPMMA], 5>;
+defm : PdWriteResXMMPair<WriteAESKeyGen, [PdFPU0, PdFPMMA], 5>;
+defm : PdWriteResXMMPair<WriteAESDecEnc, [PdFPU0, PdFPMMA], 9, [], 2>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Horizontal add/sub  instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : PdWriteResXMMPair<WriteFHAdd,  [PdFPU0, PdFPFMA], 11, [],     3, 1>;
+defm : PdWriteResYMMPair<WriteFHAddY, [PdFPU0, PdFPFMA], 11, [2, 1], 8, 2>;
+defm : X86WriteResPairUnsupported<WriteFHAddZ>;
+
+defm : PdWriteResXMMPair<WritePHAdd,  [PdFPU01, PdFPMAL], 5, [], 3, 1>;
+defm : PdWriteResXMMPair<WritePHAddX, [PdFPU01, PdFPMAL], 2>;
+defm : X86WriteResPairUnsupported<WritePHAddY>;
+defm : X86WriteResPairUnsupported<WritePHAddZ>;
+
+def : InstRW<[WritePHAdd], (instrs PHADDDrr, PHSUBDrr,
+                                   PHADDWrr, PHSUBWrr,
+                                   PHADDSWrr, PHSUBSWrr,
+                                   VPHADDDrr, VPHSUBDrr,
+                                   VPHADDWrr, VPHSUBWrr,
+                                   VPHADDSWrr, VPHSUBSWrr)>;
+
+def : InstRW<[WritePHAdd.Folded], (instrs PHADDDrm, PHSUBDrm,
+                                          PHADDWrm, PHSUBWrm,
+                                          PHADDSWrm, PHSUBSWrm,
+                                          VPHADDDrm, VPHSUBDrm,
+                                          VPHADDWrm, VPHSUBWrm,
+                                          VPHADDSWrm, VPHSUBSWrm)>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Carry-less multiplication instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : PdWriteResXMMPair<WriteCLMul, [PdFPU0, PdFPMMA], 12, [], 5, 1>;
+
+def PdWriteVPCLMULQDQrr : SchedWriteRes<[PdFPU0, PdFPMMA]> {
+  let Latency = 13;
+  let NumMicroOps = 6;
+}
+def : InstRW<[PdWriteVPCLMULQDQrr], (instrs VPCLMULQDQrr)>;
+
+////////////////////////////////////////////////////////////////////////////////
+// SSE4A instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+def PdWriteINSERTQ : SchedWriteRes<[PdFPU01, PdFPMAL]> {
+  let Latency = 3;
+  let ResourceCycles = [1, 4];
+}
+def : InstRW<[PdWriteINSERTQ], (instrs INSERTQ, INSERTQI)>;
+
+////////////////////////////////////////////////////////////////////////////////
+// AVX instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+def PdWriteVBROADCASTYLd : SchedWriteRes<[PdLoad, PdFPU01, PdFPFMA]> {
+  let Latency = 6;
+  let ResourceCycles = [1, 2, 4];
+  let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteVBROADCASTYLd, ReadAfterLd], (instrs VBROADCASTSDYrm,
+                                                          VBROADCASTSSYrm)>;
+
+def PdWriteVZEROALL : SchedWriteRes<[]> {
+  let Latency = 90;
+  let NumMicroOps = 32;
+}
+def : InstRW<[PdWriteVZEROALL], (instrs VZEROALL)>;
+
+def PdWriteVZEROUPPER : SchedWriteRes<[]> {
+  let Latency = 46;
+  let NumMicroOps = 16;
+}
+def : InstRW<[PdWriteVZEROUPPER], (instrs VZEROUPPER)>;
+
+///////////////////////////////////////////////////////////////////////////////
+//  SchedWriteVariant definitions.
+///////////////////////////////////////////////////////////////////////////////
+
+def PdWriteZeroLatency : SchedWriteRes<[]> {
+  let Latency = 0;
+}
+
+def PdWriteZeroIdiom : SchedWriteVariant<[
+  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
+  SchedVar<MCSchedPredicate<TruePred>,           [WriteALU]>
+]>;
+def : InstRW<[PdWriteZeroIdiom], (instrs SUB32rr, SUB64rr,
+                                         XOR32rr, XOR64rr)>;
+
+def PdWriteFZeroIdiom : SchedWriteVariant<[
+  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
+  SchedVar<MCSchedPredicate<TruePred>,           [WriteFLogic]>
+]>;
+def : InstRW<[PdWriteFZeroIdiom], (instrs XORPSrr,  VXORPSrr,
+                                          XORPDrr,  VXORPDrr,
+                                          ANDNPSrr, VANDNPSrr,
+                                          ANDNPDrr, VANDNPDrr)>;
+
+// VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr "zero-idioms" have latency of 1.
+
+def PdWriteVZeroIdiomLogic : SchedWriteVariant<[
+  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
+  SchedVar<MCSchedPredicate<TruePred>,           [WriteVecLogic]>
+]>;
+def : InstRW<[PdWriteVZeroIdiomLogic], (instrs MMX_PXORirr, MMX_PANDNirr)>;
+
+def PdWriteVZeroIdiomLogicX : SchedWriteVariant<[
+  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
+  SchedVar<MCSchedPredicate<TruePred>,           [WriteVecLogicX]>
+]>;
+def : InstRW<[PdWriteVZeroIdiomLogicX], (instrs PXORrr,  VPXORrr,
+                                                PANDNrr, VPANDNrr)>;
+
+def PdWriteVZeroIdiomALU : SchedWriteVariant<[
+  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
+  SchedVar<MCSchedPredicate<TruePred>,           [WriteVecALU]>
+]>;
+def : InstRW<[PdWriteVZeroIdiomALU], (instrs MMX_PSUBBirr,   MMX_PSUBDirr,
+                                             MMX_PSUBQirr,   MMX_PSUBWirr,
+                                             MMX_PCMPGTBirr,
+                                             MMX_PCMPGTDirr,
+                                             MMX_PCMPGTWirr)>;
+
+def PdWriteVZeroIdiomALUX : SchedWriteVariant<[
+    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
+    SchedVar<MCSchedPredicate<TruePred>,           [WriteVecALUX]>
+]>;
+def : InstRW<[PdWriteVZeroIdiomALUX], (instrs PSUBBrr, VPSUBBrr,
+                                              PSUBDrr, VPSUBDrr,
+                                              PSUBQrr, VPSUBQrr,
+                                              PSUBWrr, VPSUBWrr,
+                                              PCMPGTBrr, VPCMPGTBrr,
+                                              PCMPGTDrr, VPCMPGTDrr,
+                                              PCMPGTWrr, VPCMPGTWrr)>;
+
+///////////////////////////////////////////////////////////////////////////////
+// Dependency breaking instructions.
+///////////////////////////////////////////////////////////////////////////////
+
+// VPCMPGTQ, but not PCMPGTQ!
+
+def : IsZeroIdiomFunction<[
+  // GPR Zero-idioms.
+  DepBreakingClass<[ SUB32rr, SUB64rr, XOR32rr, XOR64rr ], ZeroIdiomPredicate>,
+
+  // MMX Zero-idioms.
+  DepBreakingClass<[
+    MMX_PXORirr, MMX_PANDNirr, MMX_PSUBBirr,
+    MMX_PSUBDirr, MMX_PSUBQirr, MMX_PSUBWirr,
+    MMX_PSUBSBirr, MMX_PSUBSWirr, MMX_PSUBUSBirr, MMX_PSUBUSWirr,
+    MMX_PCMPGTBirr, MMX_PCMPGTDirr, MMX_PCMPGTWirr
+  ], ZeroIdiomPredicate>,
+
+  // SSE Zero-idioms.
+  DepBreakingClass<[
+    // fp variants.
+    XORPSrr, XORPDrr, ANDNPSrr, ANDNPDrr,
+
+    // int variants.
+    PXORrr, PANDNrr,
+    PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
+    PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr,
+    PCMPGTBrr, PCMPGTDrr, PCMPGTWrr
+  ], ZeroIdiomPredicate>,
+
+  // AVX Zero-idioms.
+  DepBreakingClass<[
+    // xmm fp variants.
+    VXORPSrr, VXORPDrr, VANDNPSrr, VANDNPDrr,
+
+    // xmm int variants.
+    VPXORrr, VPANDNrr,
+    VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
+    VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr,
+    VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,
+
+    // ymm variants.
+    VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr
+  ], ZeroIdiomPredicate>
+]>;
+
+def : IsDepBreakingFunction<[
+  // GPR
+  DepBreakingClass<[ SBB32rr, SBB64rr ], ZeroIdiomPredicate>,
+  DepBreakingClass<[ CMP32rr, CMP64rr ], CheckSameRegOperand<0, 1> >,
+
+  // MMX
+  DepBreakingClass<[
+    MMX_PCMPEQBirr, MMX_PCMPEQDirr, MMX_PCMPEQWirr
+  ], ZeroIdiomPredicate>,
+
+  // SSE
+  DepBreakingClass<[
+    PCMPEQBrr, PCMPEQWrr, PCMPEQDrr
+    // But not PCMPEQQrr.
+  ], ZeroIdiomPredicate>,
+
+  // AVX
+  DepBreakingClass<[
+    VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr
+    // But not VPCMPEQQrr.
+  ], ZeroIdiomPredicate>
+]>;
+
+
+} // SchedModel
diff --git a/test/CodeGen/X86/aes-schedule.ll b/test/CodeGen/X86/aes-schedule.ll
index 2328279c79b..c622899ca09 100644
--- a/test/CodeGen/X86/aes-schedule.ll
+++ b/test/CodeGen/X86/aes-schedule.ll
@@ -14,8 +14,8 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake -mattr=+aes,-avx2 | FileCheck %s --check-prefixes=CHECK,SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=+aes,-avx  | FileCheck %s --check-prefixes=CHECK,SKX-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=+aes,-avx2 | FileCheck %s --check-prefixes=CHECK,SKX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+aes,-avx  | FileCheck %s --check-prefixes=CHECK,BDVER2-SSE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+avx -mattr=+aes,-avx2 | FileCheck %s --check-prefixes=CHECK,BDVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=+aes,-avx  | FileCheck %s --check-prefixes=CHECK,BDVER2-SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=+aes,-avx2 | FileCheck %s --check-prefixes=CHECK,BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=+aes,-avx  | FileCheck %s --check-prefixes=CHECK,BTVER2-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=+aes,-avx2 | FileCheck %s --check-prefixes=CHECK,BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 -mattr=+aes,-avx  | FileCheck %s --check-prefixes=CHECK,ZNVER1-SSE
@@ -96,15 +96,15 @@ define <2 x i64> @test_aesdec(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_aesdec:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    aesdec %xmm1, %xmm0 # sched: [7:1.00]
-; BDVER2-SSE-NEXT:    aesdec (%rdi), %xmm0 # sched: [13:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    aesdec %xmm1, %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    aesdec (%rdi), %xmm0 # sched: [14:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_aesdec:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vaesdec %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
-; BDVER2-NEXT:    vaesdec (%rdi), %xmm0, %xmm0 # sched: [13:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vaesdec %xmm1, %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    vaesdec (%rdi), %xmm0, %xmm0 # sched: [14:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_aesdec:
 ; BTVER2-SSE:       # %bb.0:
@@ -211,15 +211,15 @@ define <2 x i64> @test_aesdeclast(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2)
 ;
 ; BDVER2-SSE-LABEL: test_aesdeclast:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    aesdeclast %xmm1, %xmm0 # sched: [7:1.00]
-; BDVER2-SSE-NEXT:    aesdeclast (%rdi), %xmm0 # sched: [13:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    aesdeclast %xmm1, %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    aesdeclast (%rdi), %xmm0 # sched: [14:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_aesdeclast:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vaesdeclast %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
-; BDVER2-NEXT:    vaesdeclast (%rdi), %xmm0, %xmm0 # sched: [13:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vaesdeclast %xmm1, %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    vaesdeclast (%rdi), %xmm0, %xmm0 # sched: [14:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_aesdeclast:
 ; BTVER2-SSE:       # %bb.0:
@@ -326,15 +326,15 @@ define <2 x i64> @test_aesenc(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_aesenc:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    aesenc %xmm1, %xmm0 # sched: [7:1.00]
-; BDVER2-SSE-NEXT:    aesenc (%rdi), %xmm0 # sched: [13:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    aesenc %xmm1, %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    aesenc (%rdi), %xmm0 # sched: [14:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_aesenc:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vaesenc %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
-; BDVER2-NEXT:    vaesenc (%rdi), %xmm0, %xmm0 # sched: [13:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vaesenc %xmm1, %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    vaesenc (%rdi), %xmm0, %xmm0 # sched: [14:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_aesenc:
 ; BTVER2-SSE:       # %bb.0:
@@ -441,15 +441,15 @@ define <2 x i64> @test_aesenclast(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2)
 ;
 ; BDVER2-SSE-LABEL: test_aesenclast:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    aesenclast %xmm1, %xmm0 # sched: [7:1.00]
-; BDVER2-SSE-NEXT:    aesenclast (%rdi), %xmm0 # sched: [13:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    aesenclast %xmm1, %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    aesenclast (%rdi), %xmm0 # sched: [14:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_aesenclast:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vaesenclast %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
-; BDVER2-NEXT:    vaesenclast (%rdi), %xmm0, %xmm0 # sched: [13:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vaesenclast %xmm1, %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    vaesenclast (%rdi), %xmm0, %xmm0 # sched: [14:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_aesenclast:
 ; BTVER2-SSE:       # %bb.0:
@@ -569,17 +569,17 @@ define <2 x i64> @test_aesimc(<2 x i64> %a0, <2 x i64> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_aesimc:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    aesimc %xmm0, %xmm1 # sched: [12:2.00]
-; BDVER2-SSE-NEXT:    aesimc (%rdi), %xmm0 # sched: [18:2.00]
-; BDVER2-SSE-NEXT:    por %xmm1, %xmm0 # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    aesimc %xmm0, %xmm1 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    aesimc (%rdi), %xmm0 # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    por %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_aesimc:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vaesimc %xmm0, %xmm0 # sched: [12:2.00]
-; BDVER2-NEXT:    vaesimc (%rdi), %xmm1 # sched: [18:2.00]
-; BDVER2-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vaesimc (%rdi), %xmm1 # sched: [10:1.00]
+; BDVER2-NEXT:    vaesimc %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_aesimc:
 ; BTVER2-SSE:       # %bb.0:
@@ -703,17 +703,17 @@ define <2 x i64> @test_aeskeygenassist(<2 x i64> %a0, <2 x i64> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_aeskeygenassist:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    aeskeygenassist $7, %xmm0, %xmm1 # sched: [8:3.67]
-; BDVER2-SSE-NEXT:    aeskeygenassist $7, (%rdi), %xmm0 # sched: [8:3.33]
-; BDVER2-SSE-NEXT:    por %xmm1, %xmm0 # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    aeskeygenassist $7, %xmm0, %xmm1 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    aeskeygenassist $7, (%rdi), %xmm0 # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    por %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_aeskeygenassist:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vaeskeygenassist $7, %xmm0, %xmm0 # sched: [8:3.67]
-; BDVER2-NEXT:    vaeskeygenassist $7, (%rdi), %xmm1 # sched: [8:3.33]
-; BDVER2-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vaeskeygenassist $7, (%rdi), %xmm1 # sched: [10:1.00]
+; BDVER2-NEXT:    vaeskeygenassist $7, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_aeskeygenassist:
 ; BTVER2-SSE:       # %bb.0:
diff --git a/test/CodeGen/X86/avx-schedule.ll b/test/CodeGen/X86/avx-schedule.ll
index 4902044c766..c9481ccdbf9 100644
--- a/test/CodeGen/X86/avx-schedule.ll
+++ b/test/CodeGen/X86/avx-schedule.ll
@@ -6,7 +6,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell -mattr=-avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake -mattr=-avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+avx -mattr=-avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=-avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 -mattr=-avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
 
@@ -49,9 +49,9 @@ define <4 x double> @test_addpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ;
 ; BDVER2-LABEL: test_addpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vaddpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vaddpd (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_addpd:
 ; BTVER2:       # %bb.0:
@@ -109,9 +109,9 @@ define <8 x float> @test_addps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
 ;
 ; BDVER2-LABEL: test_addps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vaddps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vaddps (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_addps:
 ; BTVER2:       # %bb.0:
@@ -169,9 +169,9 @@ define <4 x double> @test_addsubpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
 ;
 ; BDVER2-LABEL: test_addsubpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_addsubpd:
 ; BTVER2:       # %bb.0:
@@ -230,9 +230,9 @@ define <8 x float> @test_addsubps(<8 x float> %a0, <8 x float> %a1, <8 x float>
 ;
 ; BDVER2-LABEL: test_addsubps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vaddsubps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vaddsubps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vaddsubps (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_addsubps:
 ; BTVER2:       # %bb.0:
@@ -297,10 +297,10 @@ define <4 x double> @test_andnotpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
 ;
 ; BDVER2-LABEL: test_andnotpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; BDVER2-NEXT:    vandnpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
-; BDVER2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vandnpd %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vandnpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BDVER2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_andnotpd:
 ; BTVER2:       # %bb.0:
@@ -373,10 +373,10 @@ define <8 x float> @test_andnotps(<8 x float> %a0, <8 x float> %a1, <8 x float>
 ;
 ; BDVER2-LABEL: test_andnotps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vandnps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; BDVER2-NEXT:    vandnps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
-; BDVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vandnps %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vandnps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BDVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_andnotps:
 ; BTVER2:       # %bb.0:
@@ -449,10 +449,10 @@ define <4 x double> @test_andpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ;
 ; BDVER2-LABEL: test_andpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vandpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; BDVER2-NEXT:    vandpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
-; BDVER2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vandpd %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vandpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BDVER2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_andpd:
 ; BTVER2:       # %bb.0:
@@ -523,10 +523,10 @@ define <8 x float> @test_andps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
 ;
 ; BDVER2-LABEL: test_andps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vandps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; BDVER2-NEXT:    vandps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
-; BDVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vandps %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vandps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BDVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_andps:
 ; BTVER2:       # %bb.0:
@@ -597,10 +597,10 @@ define <4 x double> @test_blendpd(<4 x double> %a0, <4 x double> %a1, <4 x doubl
 ;
 ; BDVER2-LABEL: test_blendpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.50]
-; BDVER2-NEXT:    vblendpd {{.*#+}} ymm1 = ymm0[0,1],mem[2,3] sched: [8:0.50]
-; BDVER2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [2:1.00]
+; BDVER2-NEXT:    vblendpd {{.*#+}} ymm1 = ymm0[0,1],mem[2,3] sched: [7:1.00]
+; BDVER2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_blendpd:
 ; BTVER2:       # %bb.0:
@@ -667,10 +667,10 @@ define <8 x float> @test_blendps(<8 x float> %a0, <8 x float> %a1, <8 x float> *
 ;
 ; BDVER2-LABEL: test_blendps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:0.50]
-; BDVER2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],mem[2],ymm1[3],mem[4,5,6],ymm1[7] sched: [8:0.50]
-; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [2:1.00]
+; BDVER2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],mem[2],ymm1[3],mem[4,5,6],ymm1[7] sched: [7:1.00]
+; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_blendps:
 ; BTVER2:       # %bb.0:
@@ -731,9 +731,9 @@ define <4 x double> @test_blendvpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
 ;
 ; BDVER2-LABEL: test_blendvpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
-; BDVER2-NEXT:    vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:3.00]
+; BDVER2-NEXT:    vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [7:3.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_blendvpd:
 ; BTVER2:       # %bb.0:
@@ -792,9 +792,9 @@ define <8 x float> @test_blendvps(<8 x float> %a0, <8 x float> %a1, <8 x float>
 ;
 ; BDVER2-LABEL: test_blendvps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
-; BDVER2-NEXT:    vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:3.00]
+; BDVER2-NEXT:    vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [7:3.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_blendvps:
 ; BTVER2:       # %bb.0:
@@ -847,8 +847,8 @@ define <8 x float> @test_broadcastf128(<4 x float> *%a0) {
 ;
 ; BDVER2-LABEL: test_broadcastf128:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [7:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_broadcastf128:
 ; BTVER2:       # %bb.0:
@@ -897,8 +897,8 @@ define <4 x double> @test_broadcastsd_ymm(double *%a0) {
 ;
 ; BDVER2-LABEL: test_broadcastsd_ymm:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vbroadcastsd (%rdi), %ymm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vbroadcastsd (%rdi), %ymm0 # sched: [6:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_broadcastsd_ymm:
 ; BTVER2:       # %bb.0:
@@ -948,8 +948,8 @@ define <4 x float> @test_broadcastss(float *%a0) {
 ;
 ; BDVER2-LABEL: test_broadcastss:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vbroadcastss (%rdi), %xmm0 # sched: [6:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vbroadcastss (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_broadcastss:
 ; BTVER2:       # %bb.0:
@@ -999,8 +999,8 @@ define <8 x float> @test_broadcastss_ymm(float *%a0) {
 ;
 ; BDVER2-LABEL: test_broadcastss_ymm:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vbroadcastss (%rdi), %ymm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vbroadcastss (%rdi), %ymm0 # sched: [6:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_broadcastss_ymm:
 ; BTVER2:       # %bb.0:
@@ -1062,10 +1062,10 @@ define <4 x double> @test_cmppd(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ;
 ; BDVER2-LABEL: test_cmppd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
-; BDVER2-NEXT:    vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
-; BDVER2-NEXT:    vorpd %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm1 # sched: [2:2.00]
+; BDVER2-NEXT:    vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [7:2.00]
+; BDVER2-NEXT:    vorpd %ymm0, %ymm1, %ymm0 # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_cmppd:
 ; BTVER2:       # %bb.0:
@@ -1135,10 +1135,10 @@ define <8 x float> @test_cmpps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
 ;
 ; BDVER2-LABEL: test_cmpps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vcmpeqps %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
-; BDVER2-NEXT:    vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
-; BDVER2-NEXT:    vorps %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vcmpeqps %ymm1, %ymm0, %ymm1 # sched: [2:2.00]
+; BDVER2-NEXT:    vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [7:2.00]
+; BDVER2-NEXT:    vorps %ymm0, %ymm1, %ymm0 # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_cmpps:
 ; BTVER2:       # %bb.0:
@@ -1208,10 +1208,10 @@ define <4 x double> @test_cvtdq2pd(<4 x i32> %a0, <4 x i32> *%a1) {
 ;
 ; BDVER2-LABEL: test_cvtdq2pd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vcvtdq2pd %xmm0, %ymm0 # sched: [4:1.00]
-; BDVER2-NEXT:    vcvtdq2pd (%rdi), %ymm1 # sched: [10:1.00]
-; BDVER2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vcvtdq2pd (%rdi), %ymm1 # sched: [13:2.00]
+; BDVER2-NEXT:    vcvtdq2pd %xmm0, %ymm0 # sched: [8:2.00]
+; BDVER2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_cvtdq2pd:
 ; BTVER2:       # %bb.0:
@@ -1280,10 +1280,10 @@ define <8 x float> @test_cvtdq2ps(<8 x i32> %a0, <8 x i32> *%a1) {
 ;
 ; BDVER2-LABEL: test_cvtdq2ps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vcvtdq2ps (%rdi), %ymm1 # sched: [10:1.00]
-; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vcvtdq2ps (%rdi), %ymm1 # sched: [9:2.00]
+; BDVER2-NEXT:    vcvtdq2ps %ymm0, %ymm0 # sched: [4:2.00]
+; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_cvtdq2ps:
 ; BTVER2:       # %bb.0:
@@ -1350,10 +1350,10 @@ define <8 x i32> @test_cvtpd2dq(<4 x double> %a0, <4 x double> *%a1) {
 ;
 ; BDVER2-LABEL: test_cvtpd2dq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vcvtpd2dq %ymm0, %xmm0 # sched: [4:1.00]
-; BDVER2-NEXT:    vcvtpd2dqy (%rdi), %xmm1 # sched: [11:1.00]
-; BDVER2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vcvtpd2dqy (%rdi), %xmm1 # sched: [13:2.00]
+; BDVER2-NEXT:    vcvtpd2dq %ymm0, %xmm0 # sched: [8:2.00]
+; BDVER2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_cvtpd2dq:
 ; BTVER2:       # %bb.0:
@@ -1421,10 +1421,10 @@ define <8 x i32> @test_cvttpd2dq(<4 x double> %a0, <4 x double> *%a1) {
 ;
 ; BDVER2-LABEL: test_cvttpd2dq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vcvttpd2dq %ymm0, %xmm0 # sched: [4:1.00]
-; BDVER2-NEXT:    vcvttpd2dqy (%rdi), %xmm1 # sched: [11:1.00]
-; BDVER2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vcvttpd2dqy (%rdi), %xmm1 # sched: [13:2.00]
+; BDVER2-NEXT:    vcvttpd2dq %ymm0, %xmm0 # sched: [8:2.00]
+; BDVER2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_cvttpd2dq:
 ; BTVER2:       # %bb.0:
@@ -1491,10 +1491,10 @@ define <8 x float> @test_cvtpd2ps(<4 x double> %a0, <4 x double> *%a1) {
 ;
 ; BDVER2-LABEL: test_cvtpd2ps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vcvtpd2ps %ymm0, %xmm0 # sched: [4:1.00]
-; BDVER2-NEXT:    vcvtpd2psy (%rdi), %xmm1 # sched: [11:1.00]
-; BDVER2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vcvtpd2psy (%rdi), %xmm1 # sched: [13:2.00]
+; BDVER2-NEXT:    vcvtpd2ps %ymm0, %xmm0 # sched: [8:2.00]
+; BDVER2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_cvtpd2ps:
 ; BTVER2:       # %bb.0:
@@ -1561,10 +1561,10 @@ define <8 x i32> @test_cvtps2dq(<8 x float> %a0, <8 x float> *%a1) {
 ;
 ; BDVER2-LABEL: test_cvtps2dq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vcvtps2dq %ymm0, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vcvtps2dq (%rdi), %ymm1 # sched: [10:1.00]
-; BDVER2-NEXT:    vorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vcvtps2dq (%rdi), %ymm1 # sched: [9:2.00]
+; BDVER2-NEXT:    vcvtps2dq %ymm0, %ymm0 # sched: [4:2.00]
+; BDVER2-NEXT:    vorpd %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_cvtps2dq:
 ; BTVER2:       # %bb.0:
@@ -1632,10 +1632,10 @@ define <8 x i32> @test_cvttps2dq(<8 x float> %a0, <8 x float> *%a1) {
 ;
 ; BDVER2-LABEL: test_cvttps2dq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vcvttps2dq %ymm0, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vcvttps2dq (%rdi), %ymm1 # sched: [10:1.00]
-; BDVER2-NEXT:    vorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vcvttps2dq (%rdi), %ymm1 # sched: [9:2.00]
+; BDVER2-NEXT:    vcvttps2dq %ymm0, %ymm0 # sched: [4:2.00]
+; BDVER2-NEXT:    vorps %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_cvttps2dq:
 ; BTVER2:       # %bb.0:
@@ -1696,9 +1696,9 @@ define <4 x double> @test_divpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ;
 ; BDVER2-LABEL: test_divpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vdivpd %ymm1, %ymm0, %ymm0 # sched: [45:44.00]
-; BDVER2-NEXT:    vdivpd (%rdi), %ymm0, %ymm0 # sched: [52:44.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vdivpd %ymm1, %ymm0, %ymm0 # sched: [9:19.00]
+; BDVER2-NEXT:    vdivpd (%rdi), %ymm0, %ymm0 # sched: [14:19.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_divpd:
 ; BTVER2:       # %bb.0:
@@ -1756,9 +1756,9 @@ define <8 x float> @test_divps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
 ;
 ; BDVER2-LABEL: test_divps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vdivps %ymm1, %ymm0, %ymm0 # sched: [29:28.00]
-; BDVER2-NEXT:    vdivps (%rdi), %ymm0, %ymm0 # sched: [36:28.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vdivps %ymm1, %ymm0, %ymm0 # sched: [9:19.00]
+; BDVER2-NEXT:    vdivps (%rdi), %ymm0, %ymm0 # sched: [14:19.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_divps:
 ; BTVER2:       # %bb.0:
@@ -1816,9 +1816,9 @@ define <8 x float> @test_dpps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2
 ;
 ; BDVER2-LABEL: test_dpps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [12:2.00]
-; BDVER2-NEXT:    vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [19:2.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [27:3.00]
+; BDVER2-NEXT:    vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [32:3.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_dpps:
 ; BTVER2:       # %bb.0:
@@ -1883,10 +1883,10 @@ define <4 x float> @test_extractf128(<8 x float> %a0, <8 x float> %a1, <4 x floa
 ;
 ; BDVER2-LABEL: test_extractf128:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vextractf128 $1, %ymm0, %xmm0 # sched: [1:1.00]
-; BDVER2-NEXT:    vextractf128 $1, %ymm1, (%rdi) # sched: [1:1.00]
-; BDVER2-NEXT:    vzeroupper # sched: [100:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vextractf128 $1, %ymm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vextractf128 $1, %ymm1, (%rdi) # sched: [7:0.50]
+; BDVER2-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_extractf128:
 ; BTVER2:       # %bb.0:
@@ -1945,9 +1945,9 @@ define <4 x double> @test_haddpd(<4 x double> %a0, <4 x double> %a1, <4 x double
 ;
 ; BDVER2-LABEL: test_haddpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
-; BDVER2-NEXT:    vhaddpd (%rdi), %ymm0, %ymm0 # sched: [12:2.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0 # sched: [11:2.00]
+; BDVER2-NEXT:    vhaddpd (%rdi), %ymm0, %ymm0 # sched: [16:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_haddpd:
 ; BTVER2:       # %bb.0:
@@ -2006,9 +2006,9 @@ define <8 x float> @test_haddps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%
 ;
 ; BDVER2-LABEL: test_haddps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vhaddps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
-; BDVER2-NEXT:    vhaddps (%rdi), %ymm0, %ymm0 # sched: [12:2.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vhaddps %ymm1, %ymm0, %ymm0 # sched: [11:2.00]
+; BDVER2-NEXT:    vhaddps (%rdi), %ymm0, %ymm0 # sched: [16:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_haddps:
 ; BTVER2:       # %bb.0:
@@ -2067,9 +2067,9 @@ define <4 x double> @test_hsubpd(<4 x double> %a0, <4 x double> %a1, <4 x double
 ;
 ; BDVER2-LABEL: test_hsubpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
-; BDVER2-NEXT:    vhsubpd (%rdi), %ymm0, %ymm0 # sched: [12:2.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0 # sched: [11:2.00]
+; BDVER2-NEXT:    vhsubpd (%rdi), %ymm0, %ymm0 # sched: [16:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_hsubpd:
 ; BTVER2:       # %bb.0:
@@ -2128,9 +2128,9 @@ define <8 x float> @test_hsubps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%
 ;
 ; BDVER2-LABEL: test_hsubps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vhsubps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
-; BDVER2-NEXT:    vhsubps (%rdi), %ymm0, %ymm0 # sched: [12:2.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vhsubps %ymm1, %ymm0, %ymm0 # sched: [11:2.00]
+; BDVER2-NEXT:    vhsubps (%rdi), %ymm0, %ymm0 # sched: [16:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_hsubps:
 ; BTVER2:       # %bb.0:
@@ -2195,10 +2195,10 @@ define <8 x float> @test_insertf128(<8 x float> %a0, <4 x float> %a1, <4 x float
 ;
 ; BDVER2-LABEL: test_insertf128:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [1:1.00]
+; BDVER2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [2:0.50]
 ; BDVER2-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
-; BDVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_insertf128:
 ; BTVER2:       # %bb.0:
@@ -2255,8 +2255,8 @@ define <32 x i8> @test_lddqu(i8* %a0) {
 ;
 ; BDVER2-LABEL: test_lddqu:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vlddqu (%rdi), %ymm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vlddqu (%rdi), %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_lddqu:
 ; BTVER2:       # %bb.0:
@@ -2317,10 +2317,10 @@ define <2 x double> @test_maskmovpd(i8* %a0, <2 x i64> %a1, <2 x double> %a2) {
 ;
 ; BDVER2-LABEL: test_maskmovpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [8:1.00]
-; BDVER2-NEXT:    vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [5:1.00]
-; BDVER2-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [6:1.00]
+; BDVER2-NEXT:    vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [6:2.00]
+; BDVER2-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_maskmovpd:
 ; BTVER2:       # %bb.0:
@@ -2387,10 +2387,10 @@ define <4 x double> @test_maskmovpd_ymm(i8* %a0, <4 x i64> %a1, <4 x double> %a2
 ;
 ; BDVER2-LABEL: test_maskmovpd_ymm:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [9:1.00]
-; BDVER2-NEXT:    vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [5:1.00]
-; BDVER2-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [6:2.00]
+; BDVER2-NEXT:    vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [6:2.00]
+; BDVER2-NEXT:    vmovapd %ymm2, %ymm0 # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_maskmovpd_ymm:
 ; BTVER2:       # %bb.0:
@@ -2457,10 +2457,10 @@ define <4 x float> @test_maskmovps(i8* %a0, <4 x i32> %a1, <4 x float> %a2) {
 ;
 ; BDVER2-LABEL: test_maskmovps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [8:1.00]
-; BDVER2-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [5:1.00]
-; BDVER2-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [6:1.00]
+; BDVER2-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [6:2.00]
+; BDVER2-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_maskmovps:
 ; BTVER2:       # %bb.0:
@@ -2527,10 +2527,10 @@ define <8 x float> @test_maskmovps_ymm(i8* %a0, <8 x i32> %a1, <8 x float> %a2)
 ;
 ; BDVER2-LABEL: test_maskmovps_ymm:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [9:1.00]
-; BDVER2-NEXT:    vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [5:1.00]
-; BDVER2-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [6:2.00]
+; BDVER2-NEXT:    vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [6:2.00]
+; BDVER2-NEXT:    vmovaps %ymm2, %ymm0 # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_maskmovps_ymm:
 ; BTVER2:       # %bb.0:
@@ -2591,9 +2591,9 @@ define <4 x double> @test_maxpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ;
 ; BDVER2-LABEL: test_maxpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmaxpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vmaxpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmaxpd %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
+; BDVER2-NEXT:    vmaxpd (%rdi), %ymm0, %ymm0 # sched: [7:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_maxpd:
 ; BTVER2:       # %bb.0:
@@ -2652,9 +2652,9 @@ define <8 x float> @test_maxps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
 ;
 ; BDVER2-LABEL: test_maxps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmaxps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vmaxps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmaxps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
+; BDVER2-NEXT:    vmaxps (%rdi), %ymm0, %ymm0 # sched: [7:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_maxps:
 ; BTVER2:       # %bb.0:
@@ -2713,9 +2713,9 @@ define <4 x double> @test_minpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ;
 ; BDVER2-LABEL: test_minpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vminpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vminpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vminpd %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
+; BDVER2-NEXT:    vminpd (%rdi), %ymm0, %ymm0 # sched: [7:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_minpd:
 ; BTVER2:       # %bb.0:
@@ -2774,9 +2774,9 @@ define <8 x float> @test_minps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
 ;
 ; BDVER2-LABEL: test_minps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vminps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vminps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vminps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
+; BDVER2-NEXT:    vminps (%rdi), %ymm0, %ymm0 # sched: [7:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_minps:
 ; BTVER2:       # %bb.0:
@@ -2841,10 +2841,10 @@ define <4 x double> @test_movapd(<4 x double> *%a0, <4 x double> *%a1) {
 ;
 ; BDVER2-LABEL: test_movapd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovapd (%rdi), %ymm0 # sched: [7:0.50]
-; BDVER2-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vmovapd (%rdi), %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [5:2.00]
 ; BDVER2-NEXT:    vmovapd %ymm0, (%rsi) # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_movapd:
 ; BTVER2:       # %bb.0:
@@ -2910,10 +2910,10 @@ define <8 x float> @test_movaps(<8 x float> *%a0, <8 x float> *%a1) {
 ;
 ; BDVER2-LABEL: test_movaps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovaps (%rdi), %ymm0 # sched: [7:0.50]
-; BDVER2-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vmovaps (%rdi), %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [5:2.00]
 ; BDVER2-NEXT:    vmovaps %ymm0, (%rsi) # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_movaps:
 ; BTVER2:       # %bb.0:
@@ -2979,10 +2979,10 @@ define <4 x double> @test_movddup(<4 x double> %a0, <4 x double> *%a1) {
 ;
 ; BDVER2-LABEL: test_movddup:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:1.00]
-; BDVER2-NEXT:    vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [7:0.50]
-; BDVER2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [7:1.00]
+; BDVER2-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [2:1.00]
+; BDVER2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_movddup:
 ; BTVER2:       # %bb.0:
@@ -3043,9 +3043,9 @@ define i32 @test_movmskpd(<4 x double> %a0) {
 ;
 ; BDVER2-LABEL: test_movmskpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovmskpd %ymm0, %eax # sched: [2:1.00]
-; BDVER2-NEXT:    vzeroupper # sched: [100:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmovmskpd %ymm0, %eax # sched: [10:1.00]
+; BDVER2-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_movmskpd:
 ; BTVER2:       # %bb.0:
@@ -3101,9 +3101,9 @@ define i32 @test_movmskps(<8 x float> %a0) {
 ;
 ; BDVER2-LABEL: test_movmskps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovmskps %ymm0, %eax # sched: [2:1.00]
-; BDVER2-NEXT:    vzeroupper # sched: [100:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmovmskps %ymm0, %eax # sched: [10:1.00]
+; BDVER2-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_movmskps:
 ; BTVER2:       # %bb.0:
@@ -3172,10 +3172,10 @@ define void @test_movntdq(<4 x i64> %a0, <4 x i64> *%a1) {
 ; BDVER2-LABEL: test_movntdq:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    vmovntdq %ymm0, (%rdi) # sched: [1:1.00]
+; BDVER2-NEXT:    vmovntdq %ymm0, (%rdi) # sched: [2:2.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    vzeroupper # sched: [100:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_movntdq:
 ; BTVER2:       # %bb.0:
@@ -3234,9 +3234,9 @@ define <4 x double> @test_movntpd(<4 x double> %a0, <4 x double> *%a1) {
 ;
 ; BDVER2-LABEL: test_movntpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vmovntpd %ymm0, (%rdi) # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vmovntpd %ymm0, (%rdi) # sched: [3:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_movntpd:
 ; BTVER2:       # %bb.0:
@@ -3293,9 +3293,9 @@ define <8 x float> @test_movntps(<8 x float> %a0, <8 x float> *%a1) {
 ;
 ; BDVER2-LABEL: test_movntps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vmovntps %ymm0, (%rdi) # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vmovntps %ymm0, (%rdi) # sched: [3:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_movntps:
 ; BTVER2:       # %bb.0:
@@ -3358,10 +3358,10 @@ define <8 x float> @test_movshdup(<8 x float> %a0, <8 x float> *%a1) {
 ;
 ; BDVER2-LABEL: test_movshdup:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:1.00]
-; BDVER2-NEXT:    vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [7:0.50]
-; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [7:1.00]
+; BDVER2-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [2:1.00]
+; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_movshdup:
 ; BTVER2:       # %bb.0:
@@ -3428,10 +3428,10 @@ define <8 x float> @test_movsldup(<8 x float> %a0, <8 x float> *%a1) {
 ;
 ; BDVER2-LABEL: test_movsldup:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [1:1.00]
-; BDVER2-NEXT:    vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [7:0.50]
-; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [7:1.00]
+; BDVER2-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [2:1.00]
+; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_movsldup:
 ; BTVER2:       # %bb.0:
@@ -3500,10 +3500,10 @@ define <4 x double> @test_movupd(<4 x double> *%a0, <4 x double> *%a1) {
 ;
 ; BDVER2-LABEL: test_movupd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovupd (%rdi), %ymm0 # sched: [7:0.50]
-; BDVER2-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vmovupd (%rdi), %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [5:2.00]
 ; BDVER2-NEXT:    vmovupd %ymm0, (%rsi) # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_movupd:
 ; BTVER2:       # %bb.0:
@@ -3571,10 +3571,10 @@ define <8 x float> @test_movups(<8 x float> *%a0, <8 x float> *%a1) {
 ;
 ; BDVER2-LABEL: test_movups:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovups (%rdi), %ymm0 # sched: [7:0.50]
-; BDVER2-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vmovups (%rdi), %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [5:2.00]
 ; BDVER2-NEXT:    vmovups %ymm0, (%rsi) # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_movups:
 ; BTVER2:       # %bb.0:
@@ -3634,9 +3634,9 @@ define <4 x double> @test_mulpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ;
 ; BDVER2-LABEL: test_mulpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmulpd %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; BDVER2-NEXT:    vmulpd (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmulpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vmulpd (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_mulpd:
 ; BTVER2:       # %bb.0:
@@ -3694,9 +3694,9 @@ define <8 x float> @test_mulps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
 ;
 ; BDVER2-LABEL: test_mulps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; BDVER2-NEXT:    vmulps (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vmulps (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_mulps:
 ; BTVER2:       # %bb.0:
@@ -3760,10 +3760,10 @@ define <4 x double> @orpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2)
 ;
 ; BDVER2-LABEL: orpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; BDVER2-NEXT:    vorpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
-; BDVER2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vorpd %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vorpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BDVER2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: orpd:
 ; BTVER2:       # %bb.0:
@@ -3834,10 +3834,10 @@ define <8 x float> @test_orps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2
 ;
 ; BDVER2-LABEL: test_orps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; BDVER2-NEXT:    vorps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
-; BDVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vorps %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vorps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BDVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_orps:
 ; BTVER2:       # %bb.0:
@@ -3908,10 +3908,10 @@ define <4 x double> @test_perm2f128(<4 x double> %a0, <4 x double> %a1, <4 x dou
 ;
 ; BDVER2-LABEL: test_perm2f128:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; BDVER2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
-; BDVER2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1] sched: [4:0.50]
+; BDVER2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:0.50]
+; BDVER2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_perm2f128:
 ; BTVER2:       # %bb.0:
@@ -3978,10 +3978,10 @@ define <2 x double> @test_permilpd(<2 x double> %a0, <2 x double> *%a1) {
 ;
 ; BDVER2-LABEL: test_permilpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0] sched: [1:1.00]
-; BDVER2-NEXT:    vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [7:1.00]
-; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [7:0.50]
+; BDVER2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0] sched: [2:0.50]
+; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_permilpd:
 ; BTVER2:       # %bb.0:
@@ -4048,10 +4048,10 @@ define <4 x double> @test_permilpd_ymm(<4 x double> %a0, <4 x double> *%a1) {
 ;
 ; BDVER2-LABEL: test_permilpd_ymm:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [1:1.00]
-; BDVER2-NEXT:    vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [8:1.00]
-; BDVER2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [7:1.00]
+; BDVER2-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [2:1.00]
+; BDVER2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_permilpd_ymm:
 ; BTVER2:       # %bb.0:
@@ -4118,10 +4118,10 @@ define <4 x float> @test_permilps(<4 x float> %a0, <4 x float> *%a1) {
 ;
 ; BDVER2-LABEL: test_permilps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] sched: [1:1.00]
-; BDVER2-NEXT:    vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [7:1.00]
-; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [7:0.50]
+; BDVER2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] sched: [2:0.50]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_permilps:
 ; BTVER2:       # %bb.0:
@@ -4188,10 +4188,10 @@ define <8 x float> @test_permilps_ymm(<8 x float> %a0, <8 x float> *%a1) {
 ;
 ; BDVER2-LABEL: test_permilps_ymm:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:1.00]
-; BDVER2-NEXT:    vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [8:1.00]
-; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [7:1.00]
+; BDVER2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [2:1.00]
+; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_permilps_ymm:
 ; BTVER2:       # %bb.0:
@@ -4252,9 +4252,9 @@ define <2 x double> @test_permilvarpd(<2 x double> %a0, <2 x i64> %a1, <2 x i64>
 ;
 ; BDVER2-LABEL: test_permilvarpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; BDVER2-NEXT:    vpermilpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
+; BDVER2-NEXT:    vpermilpd (%rdi), %xmm0, %xmm0 # sched: [8:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_permilvarpd:
 ; BTVER2:       # %bb.0:
@@ -4313,9 +4313,9 @@ define <4 x double> @test_permilvarpd_ymm(<4 x double> %a0, <4 x i64> %a1, <4 x
 ;
 ; BDVER2-LABEL: test_permilvarpd_ymm:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; BDVER2-NEXT:    vpermilpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0 # sched: [3:3.00]
+; BDVER2-NEXT:    vpermilpd (%rdi), %ymm0, %ymm0 # sched: [8:3.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_permilvarpd_ymm:
 ; BTVER2:       # %bb.0:
@@ -4374,9 +4374,9 @@ define <4 x float> @test_permilvarps(<4 x float> %a0, <4 x i32> %a1, <4 x i32> *
 ;
 ; BDVER2-LABEL: test_permilvarps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpermilps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; BDVER2-NEXT:    vpermilps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpermilps %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
+; BDVER2-NEXT:    vpermilps (%rdi), %xmm0, %xmm0 # sched: [8:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_permilvarps:
 ; BTVER2:       # %bb.0:
@@ -4435,9 +4435,9 @@ define <8 x float> @test_permilvarps_ymm(<8 x float> %a0, <8 x i32> %a1, <8 x i3
 ;
 ; BDVER2-LABEL: test_permilvarps_ymm:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpermilps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; BDVER2-NEXT:    vpermilps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpermilps %ymm1, %ymm0, %ymm0 # sched: [3:3.00]
+; BDVER2-NEXT:    vpermilps (%rdi), %ymm0, %ymm0 # sched: [8:3.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_permilvarps_ymm:
 ; BTVER2:       # %bb.0:
@@ -4502,10 +4502,10 @@ define <8 x float> @test_rcpps(<8 x float> %a0, <8 x float> *%a1) {
 ;
 ; BDVER2-LABEL: test_rcpps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vrcpps (%rdi), %ymm1 # sched: [14:2.00]
-; BDVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [7:2.00]
-; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vrcpps (%rdi), %ymm1 # sched: [10:2.00]
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_rcpps:
 ; BTVER2:       # %bb.0:
@@ -4573,10 +4573,10 @@ define <4 x double> @test_roundpd(<4 x double> %a0, <4 x double> *%a1) {
 ;
 ; BDVER2-LABEL: test_roundpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vroundpd $7, %ymm0, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vroundpd $7, (%rdi), %ymm1 # sched: [10:1.00]
-; BDVER2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vroundpd $7, (%rdi), %ymm1 # sched: [9:2.00]
+; BDVER2-NEXT:    vroundpd $7, %ymm0, %ymm0 # sched: [4:2.00]
+; BDVER2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_roundpd:
 ; BTVER2:       # %bb.0:
@@ -4644,10 +4644,10 @@ define <8 x float> @test_roundps(<8 x float> %a0, <8 x float> *%a1) {
 ;
 ; BDVER2-LABEL: test_roundps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vroundps $7, %ymm0, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vroundps $7, (%rdi), %ymm1 # sched: [10:1.00]
-; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vroundps $7, (%rdi), %ymm1 # sched: [9:2.00]
+; BDVER2-NEXT:    vroundps $7, %ymm0, %ymm0 # sched: [4:2.00]
+; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_roundps:
 ; BTVER2:       # %bb.0:
@@ -4715,10 +4715,10 @@ define <8 x float> @test_rsqrtps(<8 x float> %a0, <8 x float> *%a1) {
 ;
 ; BDVER2-LABEL: test_rsqrtps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vrsqrtps (%rdi), %ymm1 # sched: [14:2.00]
-; BDVER2-NEXT:    vrsqrtps %ymm0, %ymm0 # sched: [7:2.00]
-; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vrsqrtps (%rdi), %ymm1 # sched: [10:2.00]
+; BDVER2-NEXT:    vrsqrtps %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_rsqrtps:
 ; BTVER2:       # %bb.0:
@@ -4786,10 +4786,10 @@ define <4 x double> @test_shufpd(<4 x double> %a0, <4 x double> %a1, <4 x double
 ;
 ; BDVER2-LABEL: test_shufpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:1.00]
-; BDVER2-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [8:1.00]
-; BDVER2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [2:1.00]
+; BDVER2-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [7:1.00]
+; BDVER2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_shufpd:
 ; BTVER2:       # %bb.0:
@@ -4856,10 +4856,10 @@ define <8 x float> @test_shufps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%
 ;
 ; BDVER2-LABEL: test_shufps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] sched: [1:1.00]
-; BDVER2-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,3],mem[0,0],ymm1[4,7],mem[4,4] sched: [8:1.00]
-; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] sched: [2:1.00]
+; BDVER2-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,3],mem[0,0],ymm1[4,7],mem[4,4] sched: [7:1.00]
+; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_shufps:
 ; BTVER2:       # %bb.0:
@@ -4926,10 +4926,10 @@ define <4 x double> @test_sqrtpd(<4 x double> %a0, <4 x double> *%a1) {
 ;
 ; BDVER2-LABEL: test_sqrtpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vsqrtpd (%rdi), %ymm1 # sched: [52:44.00]
-; BDVER2-NEXT:    vsqrtpd %ymm0, %ymm0 # sched: [45:44.00]
-; BDVER2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vsqrtpd (%rdi), %ymm1 # sched: [14:27.00]
+; BDVER2-NEXT:    vsqrtpd %ymm0, %ymm0 # sched: [9:27.00]
+; BDVER2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_sqrtpd:
 ; BTVER2:       # %bb.0:
@@ -4997,10 +4997,10 @@ define <8 x float> @test_sqrtps(<8 x float> %a0, <8 x float> *%a1) {
 ;
 ; BDVER2-LABEL: test_sqrtps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vsqrtps (%rdi), %ymm1 # sched: [36:28.00]
-; BDVER2-NEXT:    vsqrtps %ymm0, %ymm0 # sched: [29:28.00]
-; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vsqrtps (%rdi), %ymm1 # sched: [14:21.00]
+; BDVER2-NEXT:    vsqrtps %ymm0, %ymm0 # sched: [9:21.00]
+; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_sqrtps:
 ; BTVER2:       # %bb.0:
@@ -5062,9 +5062,9 @@ define <4 x double> @test_subpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ;
 ; BDVER2-LABEL: test_subpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vsubpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vsubpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vsubpd (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_subpd:
 ; BTVER2:       # %bb.0:
@@ -5122,9 +5122,9 @@ define <8 x float> @test_subps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
 ;
 ; BDVER2-LABEL: test_subps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vsubps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vsubps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vsubps (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_subps:
 ; BTVER2:       # %bb.0:
@@ -5203,9 +5203,9 @@ define i32 @test_testpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
 ; BDVER2-NEXT:    xorl %eax, %eax # sched: [0:0.25]
 ; BDVER2-NEXT:    vtestpd %xmm1, %xmm0 # sched: [1:1.00]
 ; BDVER2-NEXT:    setb %al # sched: [1:0.50]
-; BDVER2-NEXT:    vtestpd (%rdi), %xmm0 # sched: [7:1.00]
-; BDVER2-NEXT:    adcl $0, %eax # sched: [2:0.67]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vtestpd (%rdi), %xmm0 # sched: [6:1.00]
+; BDVER2-NEXT:    adcl $0, %eax # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_testpd:
 ; BTVER2:       # %bb.0:
@@ -5298,10 +5298,10 @@ define i32 @test_testpd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a
 ; BDVER2-NEXT:    xorl %eax, %eax # sched: [0:0.25]
 ; BDVER2-NEXT:    vtestpd %ymm1, %ymm0 # sched: [1:1.00]
 ; BDVER2-NEXT:    setb %al # sched: [1:0.50]
-; BDVER2-NEXT:    vtestpd (%rdi), %ymm0 # sched: [8:1.00]
-; BDVER2-NEXT:    adcl $0, %eax # sched: [2:0.67]
-; BDVER2-NEXT:    vzeroupper # sched: [100:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vtestpd (%rdi), %ymm0 # sched: [6:1.00]
+; BDVER2-NEXT:    adcl $0, %eax # sched: [1:1.00]
+; BDVER2-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_testpd_ymm:
 ; BTVER2:       # %bb.0:
@@ -5389,9 +5389,9 @@ define i32 @test_testps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
 ; BDVER2-NEXT:    xorl %eax, %eax # sched: [0:0.25]
 ; BDVER2-NEXT:    vtestps %xmm1, %xmm0 # sched: [1:1.00]
 ; BDVER2-NEXT:    setb %al # sched: [1:0.50]
-; BDVER2-NEXT:    vtestps (%rdi), %xmm0 # sched: [7:1.00]
-; BDVER2-NEXT:    adcl $0, %eax # sched: [2:0.67]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vtestps (%rdi), %xmm0 # sched: [6:1.00]
+; BDVER2-NEXT:    adcl $0, %eax # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_testps:
 ; BTVER2:       # %bb.0:
@@ -5484,10 +5484,10 @@ define i32 @test_testps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2)
 ; BDVER2-NEXT:    xorl %eax, %eax # sched: [0:0.25]
 ; BDVER2-NEXT:    vtestps %ymm1, %ymm0 # sched: [1:1.00]
 ; BDVER2-NEXT:    setb %al # sched: [1:0.50]
-; BDVER2-NEXT:    vtestps (%rdi), %ymm0 # sched: [8:1.00]
-; BDVER2-NEXT:    adcl $0, %eax # sched: [2:0.67]
-; BDVER2-NEXT:    vzeroupper # sched: [100:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vtestps (%rdi), %ymm0 # sched: [6:1.00]
+; BDVER2-NEXT:    adcl $0, %eax # sched: [1:1.00]
+; BDVER2-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_testps_ymm:
 ; BTVER2:       # %bb.0:
@@ -5560,10 +5560,10 @@ define <4 x double> @test_unpckhpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
 ;
 ; BDVER2-LABEL: test_unpckhpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
-; BDVER2-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [8:1.00]
-; BDVER2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [2:1.00]
+; BDVER2-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [7:1.00]
+; BDVER2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_unpckhpd:
 ; BTVER2:       # %bb.0:
@@ -5624,9 +5624,9 @@ define <8 x float> @test_unpckhps(<8 x float> %a0, <8 x float> %a1, <8 x float>
 ;
 ; BDVER2-LABEL: test_unpckhps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
-; BDVER2-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [2:1.00]
+; BDVER2-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_unpckhps:
 ; BTVER2:       # %bb.0:
@@ -5690,10 +5690,10 @@ define <4 x double> @test_unpcklpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
 ;
 ; BDVER2-LABEL: test_unpcklpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
-; BDVER2-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [8:1.00]
-; BDVER2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [2:1.00]
+; BDVER2-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [7:1.00]
+; BDVER2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_unpcklpd:
 ; BTVER2:       # %bb.0:
@@ -5754,9 +5754,9 @@ define <8 x float> @test_unpcklps(<8 x float> %a0, <8 x float> %a1, <8 x float>
 ;
 ; BDVER2-LABEL: test_unpcklps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; BDVER2-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [2:1.00]
+; BDVER2-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_unpcklps:
 ; BTVER2:       # %bb.0:
@@ -5820,10 +5820,10 @@ define <4 x double> @test_xorpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ;
 ; BDVER2-LABEL: test_xorpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; BDVER2-NEXT:    vxorpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
-; BDVER2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vxorpd %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vxorpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BDVER2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_xorpd:
 ; BTVER2:       # %bb.0:
@@ -5894,10 +5894,10 @@ define <8 x float> @test_xorps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
 ;
 ; BDVER2-LABEL: test_xorps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vxorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; BDVER2-NEXT:    vxorps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
-; BDVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vxorps %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vxorps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BDVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_xorps:
 ; BTVER2:       # %bb.0:
@@ -5956,8 +5956,8 @@ define void @test_zeroall() {
 ;
 ; BDVER2-LABEL: test_zeroall:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vzeroall # sched: [9:2.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vzeroall # sched: [90:8.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_zeroall:
 ; BTVER2:       # %bb.0:
@@ -6006,8 +6006,8 @@ define void @test_zeroupper() {
 ;
 ; BDVER2-LABEL: test_zeroupper:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vzeroupper # sched: [100:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_zeroupper:
 ; BTVER2:       # %bb.0:
@@ -6086,12 +6086,12 @@ define void @test_avx256_zero_idioms() {
 ; BDVER2-LABEL: test_avx256_zero_idioms:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    vxorps %ymm0, %ymm0, %ymm0 # sched: [1:1.00]
-; BDVER2-NEXT:    vxorpd %ymm1, %ymm1, %ymm1 # sched: [1:1.00]
-; BDVER2-NEXT:    vandnps %ymm2, %ymm2, %ymm2 # sched: [1:1.00]
-; BDVER2-NEXT:    vandnpd %ymm3, %ymm3, %ymm3 # sched: [1:1.00]
+; BDVER2-NEXT:    vxorps %ymm0, %ymm0, %ymm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vxorpd %ymm1, %ymm1, %ymm1 # sched: [2:1.00]
+; BDVER2-NEXT:    vandnps %ymm2, %ymm2, %ymm2 # sched: [2:1.00]
+; BDVER2-NEXT:    vandnpd %ymm3, %ymm3, %ymm3 # sched: [2:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_avx256_zero_idioms:
 ; BTVER2:       # %bb.0:
diff --git a/test/CodeGen/X86/avx-vzeroupper.ll b/test/CodeGen/X86/avx-vzeroupper.ll
index 824a3ffba6b..26248bdae0a 100644
--- a/test/CodeGen/X86/avx-vzeroupper.ll
+++ b/test/CodeGen/X86/avx-vzeroupper.ll
@@ -2,7 +2,7 @@
 ; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=VZ --check-prefix=AVX
 ; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=VZ --check-prefix=AVX512
 ; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-partial-ymm-or-zmm-write | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=FAST-ymm-zmm
-; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=BDVER2
+; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mcpu=bdver2 | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=BDVER2
 ; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=BTVER2
 
 declare i32 @foo()
@@ -60,8 +60,8 @@ define <8 x float> @test01(<4 x float> %a, <4 x float> %b, <8 x float> %c) nounw
 ; BDVER2-LABEL: test01:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    subq $56, %rsp
-; BDVER2-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
 ; BDVER2-NEXT:    vmovaps {{.*}}(%rip), %xmm0
+; BDVER2-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
 ; BDVER2-NEXT:    vzeroupper
 ; BDVER2-NEXT:    callq do_sse
 ; BDVER2-NEXT:    vmovaps %xmm0, {{.*}}(%rip)
@@ -203,8 +203,8 @@ define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind {
 ; BDVER2-NEXT:    testl %eax, %eax
 ; BDVER2-NEXT:    jne .LBB3_1
 ; BDVER2-NEXT:  # %bb.2: # %for.body.preheader
-; BDVER2-NEXT:    movl $4, %ebx
 ; BDVER2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; BDVER2-NEXT:    movl $4, %ebx
 ; BDVER2-NEXT:    .p2align 4, 0x90
 ; BDVER2-NEXT:  .LBB3_3: # %for.body
 ; BDVER2-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -214,7 +214,7 @@ define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind {
 ; BDVER2-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; BDVER2-NEXT:    vzeroupper
 ; BDVER2-NEXT:    callq do_sse
-; BDVER2-NEXT:    addl $-1, %ebx
+; BDVER2-NEXT:    decl %ebx
 ; BDVER2-NEXT:    jne .LBB3_3
 ; BDVER2-NEXT:  # %bb.4: # %for.end
 ; BDVER2-NEXT:    addq $16, %rsp
diff --git a/test/CodeGen/X86/bmi-schedule.ll b/test/CodeGen/X86/bmi-schedule.ll
index 174efd2cfe9..5b5b388c100 100644
--- a/test/CodeGen/X86/bmi-schedule.ll
+++ b/test/CodeGen/X86/bmi-schedule.ll
@@ -4,7 +4,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl     | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+bmi  | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2  | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2  | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1  | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
 
@@ -39,10 +39,10 @@ define i32 @test_andn_i32(i32 %a0, i32 %a1, i32 *%a2) {
 ;
 ; BDVER2-LABEL: test_andn_i32:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    andnl %esi, %edi, %ecx # sched: [1:0.33]
-; BDVER2-NEXT:    andnl (%rdx), %edi, %eax # sched: [6:0.50]
-; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    andnl (%rdx), %edi, %eax # sched: [5:0.50]
+; BDVER2-NEXT:    andnl %esi, %edi, %ecx # sched: [1:0.50]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_andn_i32:
 ; BTVER2:       # %bb.0:
@@ -96,10 +96,10 @@ define i64 @test_andn_i64(i64 %a0, i64 %a1, i64 *%a2) {
 ;
 ; BDVER2-LABEL: test_andn_i64:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    andnq %rsi, %rdi, %rcx # sched: [1:0.33]
-; BDVER2-NEXT:    andnq (%rdx), %rdi, %rax # sched: [6:0.50]
-; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    andnq (%rdx), %rdi, %rax # sched: [5:0.50]
+; BDVER2-NEXT:    andnq %rsi, %rdi, %rcx # sched: [1:0.50]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_andn_i64:
 ; BTVER2:       # %bb.0:
@@ -153,10 +153,10 @@ define i32 @test_bextr_i32(i32 %a0, i32 %a1, i32 *%a2) {
 ;
 ; BDVER2-LABEL: test_bextr_i32:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    bextrl %edi, (%rdx), %ecx # sched: [7:1.00]
-; BDVER2-NEXT:    bextrl %edi, %esi, %eax # sched: [2:1.00]
-; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    bextrl %edi, (%rdx), %ecx # sched: [6:0.50]
+; BDVER2-NEXT:    bextrl %edi, %esi, %eax # sched: [2:0.50]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_bextr_i32:
 ; BTVER2:       # %bb.0:
@@ -210,10 +210,10 @@ define i64 @test_bextr_i64(i64 %a0, i64 %a1, i64 *%a2) {
 ;
 ; BDVER2-LABEL: test_bextr_i64:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    bextrq %rdi, (%rdx), %rcx # sched: [7:1.00]
-; BDVER2-NEXT:    bextrq %rdi, %rsi, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    bextrq %rdi, (%rdx), %rcx # sched: [6:0.50]
+; BDVER2-NEXT:    bextrq %rdi, %rsi, %rax # sched: [2:0.50]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_bextr_i64:
 ; BTVER2:       # %bb.0:
@@ -268,9 +268,9 @@ define i32 @test_blsi_i32(i32 %a0, i32 *%a1) {
 ; BDVER2-LABEL: test_blsi_i32:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    blsil (%rsi), %ecx # sched: [6:0.50]
-; BDVER2-NEXT:    blsil %edi, %eax # sched: [1:0.33]
-; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    blsil %edi, %eax # sched: [2:0.50]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_blsi_i32:
 ; BTVER2:       # %bb.0:
@@ -326,9 +326,9 @@ define i64 @test_blsi_i64(i64 %a0, i64 *%a1) {
 ; BDVER2-LABEL: test_blsi_i64:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    blsiq (%rsi), %rcx # sched: [6:0.50]
-; BDVER2-NEXT:    blsiq %rdi, %rax # sched: [1:0.33]
-; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    blsiq %rdi, %rax # sched: [2:0.50]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_blsi_i64:
 ; BTVER2:       # %bb.0:
@@ -384,9 +384,9 @@ define i32 @test_blsmsk_i32(i32 %a0, i32 *%a1) {
 ; BDVER2-LABEL: test_blsmsk_i32:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    blsmskl (%rsi), %ecx # sched: [6:0.50]
-; BDVER2-NEXT:    blsmskl %edi, %eax # sched: [1:0.33]
-; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    blsmskl %edi, %eax # sched: [2:0.50]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_blsmsk_i32:
 ; BTVER2:       # %bb.0:
@@ -442,9 +442,9 @@ define i64 @test_blsmsk_i64(i64 %a0, i64 *%a1) {
 ; BDVER2-LABEL: test_blsmsk_i64:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    blsmskq (%rsi), %rcx # sched: [6:0.50]
-; BDVER2-NEXT:    blsmskq %rdi, %rax # sched: [1:0.33]
-; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    blsmskq %rdi, %rax # sched: [2:0.50]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_blsmsk_i64:
 ; BTVER2:       # %bb.0:
@@ -500,9 +500,9 @@ define i32 @test_blsr_i32(i32 %a0, i32 *%a1) {
 ; BDVER2-LABEL: test_blsr_i32:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    blsrl (%rsi), %ecx # sched: [6:0.50]
-; BDVER2-NEXT:    blsrl %edi, %eax # sched: [1:0.33]
-; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    blsrl %edi, %eax # sched: [2:0.50]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_blsr_i32:
 ; BTVER2:       # %bb.0:
@@ -558,9 +558,9 @@ define i64 @test_blsr_i64(i64 %a0, i64 *%a1) {
 ; BDVER2-LABEL: test_blsr_i64:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    blsrq (%rsi), %rcx # sched: [6:0.50]
-; BDVER2-NEXT:    blsrq %rdi, %rax # sched: [1:0.33]
-; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    blsrq %rdi, %rax # sched: [2:0.50]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_blsr_i64:
 ; BTVER2:       # %bb.0:
@@ -619,11 +619,11 @@ define i16 @test_cttz_i16(i16 zeroext %a0, i16 *%a1) {
 ;
 ; BDVER2-LABEL: test_cttz_i16:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    tzcntw (%rsi), %cx # sched: [8:1.00]
-; BDVER2-NEXT:    tzcntw %di, %ax # sched: [3:1.00]
-; BDVER2-NEXT:    orl %ecx, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    tzcntw (%rsi), %cx # sched: [6:1.00]
+; BDVER2-NEXT:    tzcntw %di, %ax # sched: [2:1.00]
+; BDVER2-NEXT:    orl %ecx, %eax # sched: [1:0.50]
 ; BDVER2-NEXT:    # kill: def $ax killed $ax killed $eax
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_cttz_i16:
 ; BTVER2:       # %bb.0:
@@ -679,10 +679,10 @@ define i32 @test_cttz_i32(i32 %a0, i32 *%a1) {
 ;
 ; BDVER2-LABEL: test_cttz_i32:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    tzcntl (%rsi), %ecx # sched: [8:1.00]
-; BDVER2-NEXT:    tzcntl %edi, %eax # sched: [3:1.00]
-; BDVER2-NEXT:    orl %ecx, %eax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    tzcntl (%rsi), %ecx # sched: [6:1.00]
+; BDVER2-NEXT:    tzcntl %edi, %eax # sched: [2:1.00]
+; BDVER2-NEXT:    orl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_cttz_i32:
 ; BTVER2:       # %bb.0:
@@ -736,10 +736,10 @@ define i64 @test_cttz_i64(i64 %a0, i64 *%a1) {
 ;
 ; BDVER2-LABEL: test_cttz_i64:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    tzcntq (%rsi), %rcx # sched: [8:1.00]
-; BDVER2-NEXT:    tzcntq %rdi, %rax # sched: [3:1.00]
-; BDVER2-NEXT:    orq %rcx, %rax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    tzcntq (%rsi), %rcx # sched: [6:1.00]
+; BDVER2-NEXT:    tzcntq %rdi, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    orq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_cttz_i64:
 ; BTVER2:       # %bb.0:
diff --git a/test/CodeGen/X86/cmov-schedule.ll b/test/CodeGen/X86/cmov-schedule.ll
index 93c771e305a..de3e8637a18 100644
--- a/test/CodeGen/X86/cmov-schedule.ll
+++ b/test/CodeGen/X86/cmov-schedule.ll
@@ -8,7 +8,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
 
@@ -544,68 +544,68 @@ define void @test_cmov_16(i16 %a0, i16 %a1, i16 *%a2) optsize {
 ; BDVER2-LABEL: test_cmov_16:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    cmovow %si, %di # sched: [2:0.67]
-; BDVER2-NEXT:    cmovnow %si, %di # sched: [2:0.67]
-; BDVER2-NEXT:    cmovbw %si, %di # sched: [2:0.67]
-; BDVER2-NEXT:    cmovbw %si, %di # sched: [2:0.67]
-; BDVER2-NEXT:    cmovbw %si, %di # sched: [2:0.67]
-; BDVER2-NEXT:    cmovaew %si, %di # sched: [2:0.67]
-; BDVER2-NEXT:    cmovaew %si, %di # sched: [2:0.67]
-; BDVER2-NEXT:    cmovaew %si, %di # sched: [2:0.67]
-; BDVER2-NEXT:    cmovew %si, %di # sched: [2:0.67]
-; BDVER2-NEXT:    cmovew %si, %di # sched: [2:0.67]
-; BDVER2-NEXT:    cmovnew %si, %di # sched: [2:0.67]
-; BDVER2-NEXT:    cmovnew %si, %di # sched: [2:0.67]
-; BDVER2-NEXT:    cmovbew %si, %di # sched: [3:1.00]
-; BDVER2-NEXT:    cmovbew %si, %di # sched: [3:1.00]
-; BDVER2-NEXT:    cmovaw %si, %di # sched: [3:1.00]
-; BDVER2-NEXT:    cmovaw %si, %di # sched: [3:1.00]
-; BDVER2-NEXT:    cmovsw %si, %di # sched: [2:0.67]
-; BDVER2-NEXT:    cmovnsw %si, %di # sched: [2:0.67]
-; BDVER2-NEXT:    cmovpw %si, %di # sched: [2:0.67]
-; BDVER2-NEXT:    cmovpw %si, %di # sched: [2:0.67]
-; BDVER2-NEXT:    cmovnpw %si, %di # sched: [2:0.67]
-; BDVER2-NEXT:    cmovnpw %si, %di # sched: [2:0.67]
-; BDVER2-NEXT:    cmovlw %si, %di # sched: [2:0.67]
-; BDVER2-NEXT:    cmovlw %si, %di # sched: [2:0.67]
-; BDVER2-NEXT:    cmovgew %si, %di # sched: [2:0.67]
-; BDVER2-NEXT:    cmovgew %si, %di # sched: [2:0.67]
-; BDVER2-NEXT:    cmovlew %si, %di # sched: [2:0.67]
-; BDVER2-NEXT:    cmovlew %si, %di # sched: [2:0.67]
-; BDVER2-NEXT:    cmovgw %si, %di # sched: [2:0.67]
-; BDVER2-NEXT:    cmovgw %si, %di # sched: [2:0.67]
-; BDVER2-NEXT:    cmovow (%rdx), %di # sched: [7:0.67]
-; BDVER2-NEXT:    cmovnow (%rdx), %di # sched: [7:0.67]
-; BDVER2-NEXT:    cmovbw (%rdx), %di # sched: [7:0.67]
-; BDVER2-NEXT:    cmovbw (%rdx), %di # sched: [7:0.67]
-; BDVER2-NEXT:    cmovbw (%rdx), %di # sched: [7:0.67]
-; BDVER2-NEXT:    cmovaew (%rdx), %di # sched: [7:0.67]
-; BDVER2-NEXT:    cmovaew (%rdx), %di # sched: [7:0.67]
-; BDVER2-NEXT:    cmovaew (%rdx), %di # sched: [7:0.67]
-; BDVER2-NEXT:    cmovew (%rdx), %di # sched: [7:0.67]
-; BDVER2-NEXT:    cmovew (%rdx), %di # sched: [7:0.67]
-; BDVER2-NEXT:    cmovnew (%rdx), %di # sched: [7:0.67]
-; BDVER2-NEXT:    cmovnew (%rdx), %di # sched: [7:0.67]
-; BDVER2-NEXT:    cmovbew (%rdx), %di # sched: [8:1.00]
-; BDVER2-NEXT:    cmovbew (%rdx), %di # sched: [8:1.00]
-; BDVER2-NEXT:    cmovaw (%rdx), %di # sched: [8:1.00]
-; BDVER2-NEXT:    cmovaw (%rdx), %di # sched: [8:1.00]
-; BDVER2-NEXT:    cmovsw (%rdx), %di # sched: [7:0.67]
-; BDVER2-NEXT:    cmovnsw (%rdx), %di # sched: [7:0.67]
-; BDVER2-NEXT:    cmovpw (%rdx), %di # sched: [7:0.67]
-; BDVER2-NEXT:    cmovpw (%rdx), %di # sched: [7:0.67]
-; BDVER2-NEXT:    cmovnpw (%rdx), %di # sched: [7:0.67]
-; BDVER2-NEXT:    cmovnpw (%rdx), %di # sched: [7:0.67]
-; BDVER2-NEXT:    cmovlw (%rdx), %di # sched: [7:0.67]
-; BDVER2-NEXT:    cmovlw (%rdx), %di # sched: [7:0.67]
-; BDVER2-NEXT:    cmovgew (%rdx), %di # sched: [7:0.67]
-; BDVER2-NEXT:    cmovgew (%rdx), %di # sched: [7:0.67]
-; BDVER2-NEXT:    cmovlew (%rdx), %di # sched: [7:0.67]
-; BDVER2-NEXT:    cmovlew (%rdx), %di # sched: [7:0.67]
-; BDVER2-NEXT:    cmovgw (%rdx), %di # sched: [7:0.67]
-; BDVER2-NEXT:    cmovgw (%rdx), %di # sched: [7:0.67]
+; BDVER2-NEXT:    cmovow %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovnow %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovbw %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovbw %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovbw %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovaew %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovaew %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovaew %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovew %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovew %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovnew %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovnew %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovbew %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovbew %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovaw %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovaw %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovsw %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovnsw %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovpw %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovpw %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovnpw %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovnpw %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovlw %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovlw %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovgew %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovgew %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovlew %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovlew %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovgw %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovgw %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovow (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovnow (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovbw (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovbw (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovbw (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovaew (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovaew (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovaew (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovew (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovew (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovnew (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovnew (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovbew (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovbew (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovaw (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovaw (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovsw (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovnsw (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovpw (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovpw (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovnpw (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovnpw (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovlw (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovlw (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovgew (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovgew (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovlew (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovlew (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovgw (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovgw (%rdx), %di # sched: [5:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_cmov_16:
 ; BTVER2:       # %bb.0:
@@ -1274,68 +1274,68 @@ define void @test_cmov_32(i32 %a0, i32 %a1, i32 *%a2) optsize {
 ; BDVER2-LABEL: test_cmov_32:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    cmovol %esi, %edi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovnol %esi, %edi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovbl %esi, %edi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovbl %esi, %edi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovbl %esi, %edi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovael %esi, %edi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovael %esi, %edi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovael %esi, %edi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovel %esi, %edi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovel %esi, %edi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovnel %esi, %edi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovnel %esi, %edi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovbel %esi, %edi # sched: [3:1.00]
-; BDVER2-NEXT:    cmovbel %esi, %edi # sched: [3:1.00]
-; BDVER2-NEXT:    cmoval %esi, %edi # sched: [3:1.00]
-; BDVER2-NEXT:    cmoval %esi, %edi # sched: [3:1.00]
-; BDVER2-NEXT:    cmovsl %esi, %edi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovnsl %esi, %edi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovpl %esi, %edi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovpl %esi, %edi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovnpl %esi, %edi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovnpl %esi, %edi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovll %esi, %edi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovll %esi, %edi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovgel %esi, %edi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovgel %esi, %edi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovlel %esi, %edi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovlel %esi, %edi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovgl %esi, %edi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovgl %esi, %edi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovol (%rdx), %edi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovnol (%rdx), %edi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovbl (%rdx), %edi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovbl (%rdx), %edi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovbl (%rdx), %edi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovael (%rdx), %edi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovael (%rdx), %edi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovael (%rdx), %edi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovel (%rdx), %edi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovel (%rdx), %edi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovnel (%rdx), %edi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovnel (%rdx), %edi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovbel (%rdx), %edi # sched: [8:1.00]
-; BDVER2-NEXT:    cmovbel (%rdx), %edi # sched: [8:1.00]
-; BDVER2-NEXT:    cmoval (%rdx), %edi # sched: [8:1.00]
-; BDVER2-NEXT:    cmoval (%rdx), %edi # sched: [8:1.00]
-; BDVER2-NEXT:    cmovsl (%rdx), %edi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovnsl (%rdx), %edi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovpl (%rdx), %edi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovpl (%rdx), %edi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovnpl (%rdx), %edi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovnpl (%rdx), %edi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovll (%rdx), %edi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovll (%rdx), %edi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovgel (%rdx), %edi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovgel (%rdx), %edi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovlel (%rdx), %edi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovlel (%rdx), %edi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovgl (%rdx), %edi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovgl (%rdx), %edi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovol %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovnol %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovbl %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovbl %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovbl %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovael %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovael %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovael %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovel %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovel %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovnel %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovnel %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovbel %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovbel %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmoval %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmoval %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovsl %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovnsl %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovpl %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovpl %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovnpl %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovnpl %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovll %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovll %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovgel %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovgel %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovlel %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovlel %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovgl %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovgl %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovol (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovnol (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovbl (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovbl (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovbl (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovael (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovael (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovael (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovel (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovel (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovnel (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovnel (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovbel (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovbel (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmoval (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmoval (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovsl (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovnsl (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovpl (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovpl (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovnpl (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovnpl (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovll (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovll (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovgel (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovgel (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovlel (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovlel (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovgl (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovgl (%rdx), %edi # sched: [5:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_cmov_32:
 ; BTVER2:       # %bb.0:
@@ -2004,68 +2004,68 @@ define void @test_cmov_64(i64 %a0, i64 %a1, i64 *%a2) optsize {
 ; BDVER2-LABEL: test_cmov_64:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    cmovoq %rsi, %rdi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovnoq %rsi, %rdi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovbq %rsi, %rdi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovbq %rsi, %rdi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovbq %rsi, %rdi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovaeq %rsi, %rdi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovaeq %rsi, %rdi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovaeq %rsi, %rdi # sched: [2:0.67]
-; BDVER2-NEXT:    cmoveq %rsi, %rdi # sched: [2:0.67]
-; BDVER2-NEXT:    cmoveq %rsi, %rdi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovneq %rsi, %rdi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovneq %rsi, %rdi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovbeq %rsi, %rdi # sched: [3:1.00]
-; BDVER2-NEXT:    cmovbeq %rsi, %rdi # sched: [3:1.00]
-; BDVER2-NEXT:    cmovaq %rsi, %rdi # sched: [3:1.00]
-; BDVER2-NEXT:    cmovaq %rsi, %rdi # sched: [3:1.00]
-; BDVER2-NEXT:    cmovsq %rsi, %rdi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovnsq %rsi, %rdi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovpq %rsi, %rdi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovpq %rsi, %rdi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovnpq %rsi, %rdi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovnpq %rsi, %rdi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovlq %rsi, %rdi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovlq %rsi, %rdi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovgeq %rsi, %rdi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovgeq %rsi, %rdi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovleq %rsi, %rdi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovleq %rsi, %rdi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovgq %rsi, %rdi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovgq %rsi, %rdi # sched: [2:0.67]
-; BDVER2-NEXT:    cmovoq (%rdx), %rdi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovnoq (%rdx), %rdi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovbq (%rdx), %rdi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovbq (%rdx), %rdi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovbq (%rdx), %rdi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovaeq (%rdx), %rdi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovaeq (%rdx), %rdi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovaeq (%rdx), %rdi # sched: [7:0.67]
-; BDVER2-NEXT:    cmoveq (%rdx), %rdi # sched: [7:0.67]
-; BDVER2-NEXT:    cmoveq (%rdx), %rdi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovneq (%rdx), %rdi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovneq (%rdx), %rdi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovbeq (%rdx), %rdi # sched: [8:1.00]
-; BDVER2-NEXT:    cmovbeq (%rdx), %rdi # sched: [8:1.00]
-; BDVER2-NEXT:    cmovaq (%rdx), %rdi # sched: [8:1.00]
-; BDVER2-NEXT:    cmovaq (%rdx), %rdi # sched: [8:1.00]
-; BDVER2-NEXT:    cmovsq (%rdx), %rdi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovnsq (%rdx), %rdi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovpq (%rdx), %rdi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovpq (%rdx), %rdi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovnpq (%rdx), %rdi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovnpq (%rdx), %rdi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovlq (%rdx), %rdi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovlq (%rdx), %rdi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovgeq (%rdx), %rdi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovgeq (%rdx), %rdi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovleq (%rdx), %rdi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovleq (%rdx), %rdi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovgq (%rdx), %rdi # sched: [7:0.67]
-; BDVER2-NEXT:    cmovgq (%rdx), %rdi # sched: [7:0.67]
+; BDVER2-NEXT:    cmovoq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovnoq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovbq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovbq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovbq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovaeq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovaeq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovaeq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmoveq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmoveq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovneq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovneq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovbeq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovbeq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovaq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovaq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovsq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovnsq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovpq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovpq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovnpq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovnpq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovlq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovlq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovgeq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovgeq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovleq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovleq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovgq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovgq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovoq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovnoq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovbq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovbq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovbq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovaeq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovaeq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovaeq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmoveq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmoveq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovneq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovneq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovbeq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovbeq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovaq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovaq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovsq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovnsq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovpq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovpq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovnpq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovnpq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovlq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovlq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovgeq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovgeq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovleq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovleq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovgq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovgq (%rdx), %rdi # sched: [5:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_cmov_64:
 ; BTVER2:       # %bb.0:
diff --git a/test/CodeGen/X86/f16c-schedule.ll b/test/CodeGen/X86/f16c-schedule.ll
index db183e1ef5f..534c63f708c 100644
--- a/test/CodeGen/X86/f16c-schedule.ll
+++ b/test/CodeGen/X86/f16c-schedule.ll
@@ -4,7 +4,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+f16c | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
 
@@ -46,10 +46,10 @@ define <4 x float> @test_vcvtph2ps_128(<8 x i16> %a0, <8 x i16> *%a1) {
 ;
 ; BDVER2-LABEL: test_vcvtph2ps_128:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vcvtph2ps (%rdi), %xmm1 # sched: [8:1.00]
-; BDVER2-NEXT:    vcvtph2ps %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vcvtph2ps (%rdi), %xmm1 # sched: [13:1.00]
+; BDVER2-NEXT:    vcvtph2ps %xmm0, %xmm0 # sched: [8:1.00]
+; BDVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_vcvtph2ps_128:
 ; BTVER2:       # %bb.0:
@@ -110,10 +110,10 @@ define <8 x float> @test_vcvtph2ps_256(<8 x i16> %a0, <8 x i16> *%a1) {
 ;
 ; BDVER2-LABEL: test_vcvtph2ps_256:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vcvtph2ps (%rdi), %ymm1 # sched: [8:1.00]
-; BDVER2-NEXT:    vcvtph2ps %xmm0, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vcvtph2ps (%rdi), %ymm1 # sched: [13:2.00]
+; BDVER2-NEXT:    vcvtph2ps %xmm0, %ymm0 # sched: [8:2.00]
+; BDVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_vcvtph2ps_256:
 ; BTVER2:       # %bb.0:
@@ -169,9 +169,9 @@ define <8 x i16> @test_vcvtps2ph_128(<4 x float> %a0, <4 x float> %a1, <4 x i16>
 ;
 ; BDVER2-LABEL: test_vcvtps2ph_128:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vcvtps2ph $0, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vcvtps2ph $0, %xmm0, %xmm0 # sched: [8:1.00]
 ; BDVER2-NEXT:    vcvtps2ph $0, %xmm1, (%rdi) # sched: [4:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_vcvtps2ph_128:
 ; BTVER2:       # %bb.0:
@@ -230,10 +230,10 @@ define <8 x i16> @test_vcvtps2ph_256(<8 x float> %a0, <8 x float> %a1, <8 x i16>
 ;
 ; BDVER2-LABEL: test_vcvtps2ph_256:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vcvtps2ph $0, %ymm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vcvtps2ph $0, %ymm1, (%rdi) # sched: [4:1.00]
-; BDVER2-NEXT:    vzeroupper # sched: [100:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vcvtps2ph $0, %ymm0, %xmm0 # sched: [8:2.00]
+; BDVER2-NEXT:    vcvtps2ph $0, %ymm1, (%rdi) # sched: [4:2.00]
+; BDVER2-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_vcvtps2ph_256:
 ; BTVER2:       # %bb.0:
diff --git a/test/CodeGen/X86/fma-schedule.ll b/test/CodeGen/X86/fma-schedule.ll
index 82ea0ce7a4a..6cdc615b231 100644
--- a/test/CodeGen/X86/fma-schedule.ll
+++ b/test/CodeGen/X86/fma-schedule.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+fma | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+fma | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
@@ -35,7 +35,7 @@ define void @test_vfmaddpd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; BDVER2-NEXT:    vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + mem sched: [10:0.50]
 ; BDVER2-NEXT:    vfmadd231pd {{.*#+}} xmm0 = (xmm1 * mem) + xmm0 sched: [10:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; HASWELL-LABEL: test_vfmaddpd_128:
 ; HASWELL:       # %bb.0:
@@ -132,12 +132,12 @@ define void @test_vfmaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; BDVER2-NEXT:    vfmadd132pd {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm1 sched: [5:0.50]
 ; BDVER2-NEXT:    vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2 sched: [5:0.50]
 ; BDVER2-NEXT:    vfmadd231pd {{.*#+}} ymm0 = (ymm1 * ymm2) + ymm0 sched: [5:0.50]
-; BDVER2-NEXT:    vfmadd132pd {{.*#+}} ymm0 = (ymm0 * mem) + ymm1 sched: [10:0.50]
-; BDVER2-NEXT:    vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + mem sched: [10:0.50]
-; BDVER2-NEXT:    vfmadd231pd {{.*#+}} ymm0 = (ymm1 * mem) + ymm0 sched: [10:0.50]
+; BDVER2-NEXT:    vfmadd132pd {{.*#+}} ymm0 = (ymm0 * mem) + ymm1 sched: [10:1.00]
+; BDVER2-NEXT:    vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + mem sched: [10:1.00]
+; BDVER2-NEXT:    vfmadd231pd {{.*#+}} ymm0 = (ymm1 * mem) + ymm0 sched: [10:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    vzeroupper # sched: [100:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; HASWELL-LABEL: test_vfmaddpd_256:
 ; HASWELL:       # %bb.0:
@@ -242,7 +242,7 @@ define void @test_vfmaddps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2
 ; BDVER2-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + mem sched: [10:0.50]
 ; BDVER2-NEXT:    vfmadd231ps {{.*#+}} xmm0 = (xmm1 * mem) + xmm0 sched: [10:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; HASWELL-LABEL: test_vfmaddps_128:
 ; HASWELL:       # %bb.0:
@@ -339,12 +339,12 @@ define void @test_vfmaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2
 ; BDVER2-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm1 sched: [5:0.50]
 ; BDVER2-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2 sched: [5:0.50]
 ; BDVER2-NEXT:    vfmadd231ps {{.*#+}} ymm0 = (ymm1 * ymm2) + ymm0 sched: [5:0.50]
-; BDVER2-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * mem) + ymm1 sched: [10:0.50]
-; BDVER2-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + mem sched: [10:0.50]
-; BDVER2-NEXT:    vfmadd231ps {{.*#+}} ymm0 = (ymm1 * mem) + ymm0 sched: [10:0.50]
+; BDVER2-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * mem) + ymm1 sched: [10:1.00]
+; BDVER2-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + mem sched: [10:1.00]
+; BDVER2-NEXT:    vfmadd231ps {{.*#+}} ymm0 = (ymm1 * mem) + ymm0 sched: [10:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    vzeroupper # sched: [100:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; HASWELL-LABEL: test_vfmaddps_256:
 ; HASWELL:       # %bb.0:
@@ -449,7 +449,7 @@ define void @test_vfmaddsd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; BDVER2-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + mem sched: [10:0.50]
 ; BDVER2-NEXT:    vfmadd231sd {{.*#+}} xmm0 = (xmm1 * mem) + xmm0 sched: [10:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; HASWELL-LABEL: test_vfmaddsd_128:
 ; HASWELL:       # %bb.0:
@@ -549,7 +549,7 @@ define void @test_vfmaddss_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2
 ; BDVER2-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + mem sched: [10:0.50]
 ; BDVER2-NEXT:    vfmadd231ss {{.*#+}} xmm0 = (xmm1 * mem) + xmm0 sched: [10:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; HASWELL-LABEL: test_vfmaddss_128:
 ; HASWELL:       # %bb.0:
@@ -653,7 +653,7 @@ define void @test_vfmaddsubpd_128(<2 x double> %a0, <2 x double> %a1, <2 x doubl
 ; BDVER2-NEXT:    vfmaddsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) +/- mem sched: [10:0.50]
 ; BDVER2-NEXT:    vfmaddsub231pd {{.*#+}} xmm0 = (xmm1 * mem) +/- xmm0 sched: [10:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; HASWELL-LABEL: test_vfmaddsubpd_128:
 ; HASWELL:       # %bb.0:
@@ -750,12 +750,12 @@ define void @test_vfmaddsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x doubl
 ; BDVER2-NEXT:    vfmaddsub132pd {{.*#+}} ymm0 = (ymm0 * ymm2) +/- ymm1 sched: [5:0.50]
 ; BDVER2-NEXT:    vfmaddsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2 sched: [5:0.50]
 ; BDVER2-NEXT:    vfmaddsub231pd {{.*#+}} ymm0 = (ymm1 * ymm2) +/- ymm0 sched: [5:0.50]
-; BDVER2-NEXT:    vfmaddsub132pd {{.*#+}} ymm0 = (ymm0 * mem) +/- ymm1 sched: [10:0.50]
-; BDVER2-NEXT:    vfmaddsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) +/- mem sched: [10:0.50]
-; BDVER2-NEXT:    vfmaddsub231pd {{.*#+}} ymm0 = (ymm1 * mem) +/- ymm0 sched: [10:0.50]
+; BDVER2-NEXT:    vfmaddsub132pd {{.*#+}} ymm0 = (ymm0 * mem) +/- ymm1 sched: [10:1.00]
+; BDVER2-NEXT:    vfmaddsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) +/- mem sched: [10:1.00]
+; BDVER2-NEXT:    vfmaddsub231pd {{.*#+}} ymm0 = (ymm1 * mem) +/- ymm0 sched: [10:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    vzeroupper # sched: [100:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; HASWELL-LABEL: test_vfmaddsubpd_256:
 ; HASWELL:       # %bb.0:
@@ -860,7 +860,7 @@ define void @test_vfmaddsubps_128(<4 x float> %a0, <4 x float> %a1, <4 x float>
 ; BDVER2-NEXT:    vfmaddsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) +/- mem sched: [10:0.50]
 ; BDVER2-NEXT:    vfmaddsub231ps {{.*#+}} xmm0 = (xmm1 * mem) +/- xmm0 sched: [10:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; HASWELL-LABEL: test_vfmaddsubps_128:
 ; HASWELL:       # %bb.0:
@@ -957,12 +957,12 @@ define void @test_vfmaddsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float>
 ; BDVER2-NEXT:    vfmaddsub132ps {{.*#+}} ymm0 = (ymm0 * ymm2) +/- ymm1 sched: [5:0.50]
 ; BDVER2-NEXT:    vfmaddsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2 sched: [5:0.50]
 ; BDVER2-NEXT:    vfmaddsub231ps {{.*#+}} ymm0 = (ymm1 * ymm2) +/- ymm0 sched: [5:0.50]
-; BDVER2-NEXT:    vfmaddsub132ps {{.*#+}} ymm0 = (ymm0 * mem) +/- ymm1 sched: [10:0.50]
-; BDVER2-NEXT:    vfmaddsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) +/- mem sched: [10:0.50]
-; BDVER2-NEXT:    vfmaddsub231ps {{.*#+}} ymm0 = (ymm1 * mem) +/- ymm0 sched: [10:0.50]
+; BDVER2-NEXT:    vfmaddsub132ps {{.*#+}} ymm0 = (ymm0 * mem) +/- ymm1 sched: [10:1.00]
+; BDVER2-NEXT:    vfmaddsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) +/- mem sched: [10:1.00]
+; BDVER2-NEXT:    vfmaddsub231ps {{.*#+}} ymm0 = (ymm1 * mem) +/- ymm0 sched: [10:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    vzeroupper # sched: [100:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; HASWELL-LABEL: test_vfmaddsubps_256:
 ; HASWELL:       # %bb.0:
@@ -1071,7 +1071,7 @@ define void @test_vfmsubaddpd_128(<2 x double> %a0, <2 x double> %a1, <2 x doubl
 ; BDVER2-NEXT:    vfmsubadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ mem sched: [10:0.50]
 ; BDVER2-NEXT:    vfmsubadd231pd {{.*#+}} xmm0 = (xmm1 * mem) -/+ xmm0 sched: [10:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; HASWELL-LABEL: test_vfmsubaddpd_128:
 ; HASWELL:       # %bb.0:
@@ -1168,12 +1168,12 @@ define void @test_vfmsubaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x doubl
 ; BDVER2-NEXT:    vfmsubadd132pd {{.*#+}} ymm0 = (ymm0 * ymm2) -/+ ymm1 sched: [5:0.50]
 ; BDVER2-NEXT:    vfmsubadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2 sched: [5:0.50]
 ; BDVER2-NEXT:    vfmsubadd231pd {{.*#+}} ymm0 = (ymm1 * ymm2) -/+ ymm0 sched: [5:0.50]
-; BDVER2-NEXT:    vfmsubadd132pd {{.*#+}} ymm0 = (ymm0 * mem) -/+ ymm1 sched: [10:0.50]
-; BDVER2-NEXT:    vfmsubadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ mem sched: [10:0.50]
-; BDVER2-NEXT:    vfmsubadd231pd {{.*#+}} ymm0 = (ymm1 * mem) -/+ ymm0 sched: [10:0.50]
+; BDVER2-NEXT:    vfmsubadd132pd {{.*#+}} ymm0 = (ymm0 * mem) -/+ ymm1 sched: [10:1.00]
+; BDVER2-NEXT:    vfmsubadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ mem sched: [10:1.00]
+; BDVER2-NEXT:    vfmsubadd231pd {{.*#+}} ymm0 = (ymm1 * mem) -/+ ymm0 sched: [10:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    vzeroupper # sched: [100:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; HASWELL-LABEL: test_vfmsubaddpd_256:
 ; HASWELL:       # %bb.0:
@@ -1278,7 +1278,7 @@ define void @test_vfmsubaddps_128(<4 x float> %a0, <4 x float> %a1, <4 x float>
 ; BDVER2-NEXT:    vfmsubadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ mem sched: [10:0.50]
 ; BDVER2-NEXT:    vfmsubadd231ps {{.*#+}} xmm0 = (xmm1 * mem) -/+ xmm0 sched: [10:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; HASWELL-LABEL: test_vfmsubaddps_128:
 ; HASWELL:       # %bb.0:
@@ -1375,12 +1375,12 @@ define void @test_vfmsubaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float>
 ; BDVER2-NEXT:    vfmsubadd132ps {{.*#+}} ymm0 = (ymm0 * ymm2) -/+ ymm1 sched: [5:0.50]
 ; BDVER2-NEXT:    vfmsubadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2 sched: [5:0.50]
 ; BDVER2-NEXT:    vfmsubadd231ps {{.*#+}} ymm0 = (ymm1 * ymm2) -/+ ymm0 sched: [5:0.50]
-; BDVER2-NEXT:    vfmsubadd132ps {{.*#+}} ymm0 = (ymm0 * mem) -/+ ymm1 sched: [10:0.50]
-; BDVER2-NEXT:    vfmsubadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ mem sched: [10:0.50]
-; BDVER2-NEXT:    vfmsubadd231ps {{.*#+}} ymm0 = (ymm1 * mem) -/+ ymm0 sched: [10:0.50]
+; BDVER2-NEXT:    vfmsubadd132ps {{.*#+}} ymm0 = (ymm0 * mem) -/+ ymm1 sched: [10:1.00]
+; BDVER2-NEXT:    vfmsubadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ mem sched: [10:1.00]
+; BDVER2-NEXT:    vfmsubadd231ps {{.*#+}} ymm0 = (ymm1 * mem) -/+ ymm0 sched: [10:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    vzeroupper # sched: [100:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; HASWELL-LABEL: test_vfmsubaddps_256:
 ; HASWELL:       # %bb.0:
@@ -1489,7 +1489,7 @@ define void @test_vfmsubpd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; BDVER2-NEXT:    vfmsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) - mem sched: [10:0.50]
 ; BDVER2-NEXT:    vfmsub231pd {{.*#+}} xmm0 = (xmm1 * mem) - xmm0 sched: [10:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; HASWELL-LABEL: test_vfmsubpd_128:
 ; HASWELL:       # %bb.0:
@@ -1586,12 +1586,12 @@ define void @test_vfmsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; BDVER2-NEXT:    vfmsub132pd {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm1 sched: [5:0.50]
 ; BDVER2-NEXT:    vfmsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2 sched: [5:0.50]
 ; BDVER2-NEXT:    vfmsub231pd {{.*#+}} ymm0 = (ymm1 * ymm2) - ymm0 sched: [5:0.50]
-; BDVER2-NEXT:    vfmsub132pd {{.*#+}} ymm0 = (ymm0 * mem) - ymm1 sched: [10:0.50]
-; BDVER2-NEXT:    vfmsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) - mem sched: [10:0.50]
-; BDVER2-NEXT:    vfmsub231pd {{.*#+}} ymm0 = (ymm1 * mem) - ymm0 sched: [10:0.50]
+; BDVER2-NEXT:    vfmsub132pd {{.*#+}} ymm0 = (ymm0 * mem) - ymm1 sched: [10:1.00]
+; BDVER2-NEXT:    vfmsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) - mem sched: [10:1.00]
+; BDVER2-NEXT:    vfmsub231pd {{.*#+}} ymm0 = (ymm1 * mem) - ymm0 sched: [10:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    vzeroupper # sched: [100:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; HASWELL-LABEL: test_vfmsubpd_256:
 ; HASWELL:       # %bb.0:
@@ -1696,7 +1696,7 @@ define void @test_vfmsubps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2
 ; BDVER2-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - mem sched: [10:0.50]
 ; BDVER2-NEXT:    vfmsub231ps {{.*#+}} xmm0 = (xmm1 * mem) - xmm0 sched: [10:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; HASWELL-LABEL: test_vfmsubps_128:
 ; HASWELL:       # %bb.0:
@@ -1793,12 +1793,12 @@ define void @test_vfmsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2
 ; BDVER2-NEXT:    vfmsub132ps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm1 sched: [5:0.50]
 ; BDVER2-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2 sched: [5:0.50]
 ; BDVER2-NEXT:    vfmsub231ps {{.*#+}} ymm0 = (ymm1 * ymm2) - ymm0 sched: [5:0.50]
-; BDVER2-NEXT:    vfmsub132ps {{.*#+}} ymm0 = (ymm0 * mem) - ymm1 sched: [10:0.50]
-; BDVER2-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - mem sched: [10:0.50]
-; BDVER2-NEXT:    vfmsub231ps {{.*#+}} ymm0 = (ymm1 * mem) - ymm0 sched: [10:0.50]
+; BDVER2-NEXT:    vfmsub132ps {{.*#+}} ymm0 = (ymm0 * mem) - ymm1 sched: [10:1.00]
+; BDVER2-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - mem sched: [10:1.00]
+; BDVER2-NEXT:    vfmsub231ps {{.*#+}} ymm0 = (ymm1 * mem) - ymm0 sched: [10:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    vzeroupper # sched: [100:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; HASWELL-LABEL: test_vfmsubps_256:
 ; HASWELL:       # %bb.0:
@@ -1903,7 +1903,7 @@ define void @test_vfmsubsd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; BDVER2-NEXT:    vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - mem sched: [10:0.50]
 ; BDVER2-NEXT:    vfmsub231sd {{.*#+}} xmm0 = (xmm1 * mem) - xmm0 sched: [10:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; HASWELL-LABEL: test_vfmsubsd_128:
 ; HASWELL:       # %bb.0:
@@ -2003,7 +2003,7 @@ define void @test_vfmsubss_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2
 ; BDVER2-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - mem sched: [10:0.50]
 ; BDVER2-NEXT:    vfmsub231ss {{.*#+}} xmm0 = (xmm1 * mem) - xmm0 sched: [10:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; HASWELL-LABEL: test_vfmsubss_128:
 ; HASWELL:       # %bb.0:
@@ -2107,7 +2107,7 @@ define void @test_vfnmaddpd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; BDVER2-NEXT:    vfnmadd213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
 ; BDVER2-NEXT:    vfnmadd231pd {{.*#+}} xmm0 = -(xmm1 * mem) + xmm0 sched: [10:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; HASWELL-LABEL: test_vfnmaddpd_128:
 ; HASWELL:       # %bb.0:
@@ -2204,12 +2204,12 @@ define void @test_vfnmaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; BDVER2-NEXT:    vfnmadd132pd {{.*#+}} ymm0 = -(ymm0 * ymm2) + ymm1 sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmadd213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmadd231pd {{.*#+}} ymm0 = -(ymm1 * ymm2) + ymm0 sched: [5:0.50]
-; BDVER2-NEXT:    vfnmadd132pd {{.*#+}} ymm0 = -(ymm0 * mem) + ymm1 sched: [10:0.50]
-; BDVER2-NEXT:    vfnmadd213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem sched: [10:0.50]
-; BDVER2-NEXT:    vfnmadd231pd {{.*#+}} ymm0 = -(ymm1 * mem) + ymm0 sched: [10:0.50]
+; BDVER2-NEXT:    vfnmadd132pd {{.*#+}} ymm0 = -(ymm0 * mem) + ymm1 sched: [10:1.00]
+; BDVER2-NEXT:    vfnmadd213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem sched: [10:1.00]
+; BDVER2-NEXT:    vfnmadd231pd {{.*#+}} ymm0 = -(ymm1 * mem) + ymm0 sched: [10:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    vzeroupper # sched: [100:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; HASWELL-LABEL: test_vfnmaddpd_256:
 ; HASWELL:       # %bb.0:
@@ -2314,7 +2314,7 @@ define void @test_vfnmaddps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a
 ; BDVER2-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
 ; BDVER2-NEXT:    vfnmadd231ps {{.*#+}} xmm0 = -(xmm1 * mem) + xmm0 sched: [10:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; HASWELL-LABEL: test_vfnmaddps_128:
 ; HASWELL:       # %bb.0:
@@ -2411,12 +2411,12 @@ define void @test_vfnmaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a
 ; BDVER2-NEXT:    vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm2) + ymm1 sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmadd231ps {{.*#+}} ymm0 = -(ymm1 * ymm2) + ymm0 sched: [5:0.50]
-; BDVER2-NEXT:    vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * mem) + ymm1 sched: [10:0.50]
-; BDVER2-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem sched: [10:0.50]
-; BDVER2-NEXT:    vfnmadd231ps {{.*#+}} ymm0 = -(ymm1 * mem) + ymm0 sched: [10:0.50]
+; BDVER2-NEXT:    vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * mem) + ymm1 sched: [10:1.00]
+; BDVER2-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem sched: [10:1.00]
+; BDVER2-NEXT:    vfnmadd231ps {{.*#+}} ymm0 = -(ymm1 * mem) + ymm0 sched: [10:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    vzeroupper # sched: [100:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; HASWELL-LABEL: test_vfnmaddps_256:
 ; HASWELL:       # %bb.0:
@@ -2521,7 +2521,7 @@ define void @test_vfnmaddsd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; BDVER2-NEXT:    vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
 ; BDVER2-NEXT:    vfnmadd231sd {{.*#+}} xmm0 = -(xmm1 * mem) + xmm0 sched: [10:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; HASWELL-LABEL: test_vfnmaddsd_128:
 ; HASWELL:       # %bb.0:
@@ -2621,7 +2621,7 @@ define void @test_vfnmaddss_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a
 ; BDVER2-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
 ; BDVER2-NEXT:    vfnmadd231ss {{.*#+}} xmm0 = -(xmm1 * mem) + xmm0 sched: [10:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; HASWELL-LABEL: test_vfnmaddss_128:
 ; HASWELL:       # %bb.0:
@@ -2725,7 +2725,7 @@ define void @test_vfnmsubpd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; BDVER2-NEXT:    vfnmsub213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) - mem sched: [10:0.50]
 ; BDVER2-NEXT:    vfnmsub231pd {{.*#+}} xmm0 = -(xmm1 * mem) - xmm0 sched: [10:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; HASWELL-LABEL: test_vfnmsubpd_128:
 ; HASWELL:       # %bb.0:
@@ -2822,12 +2822,12 @@ define void @test_vfnmsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; BDVER2-NEXT:    vfnmsub132pd {{.*#+}} ymm0 = -(ymm0 * ymm2) - ymm1 sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmsub213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) - ymm2 sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmsub231pd {{.*#+}} ymm0 = -(ymm1 * ymm2) - ymm0 sched: [5:0.50]
-; BDVER2-NEXT:    vfnmsub132pd {{.*#+}} ymm0 = -(ymm0 * mem) - ymm1 sched: [10:0.50]
-; BDVER2-NEXT:    vfnmsub213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) - mem sched: [10:0.50]
-; BDVER2-NEXT:    vfnmsub231pd {{.*#+}} ymm0 = -(ymm1 * mem) - ymm0 sched: [10:0.50]
+; BDVER2-NEXT:    vfnmsub132pd {{.*#+}} ymm0 = -(ymm0 * mem) - ymm1 sched: [10:1.00]
+; BDVER2-NEXT:    vfnmsub213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) - mem sched: [10:1.00]
+; BDVER2-NEXT:    vfnmsub231pd {{.*#+}} ymm0 = -(ymm1 * mem) - ymm0 sched: [10:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    vzeroupper # sched: [100:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; HASWELL-LABEL: test_vfnmsubpd_256:
 ; HASWELL:       # %bb.0:
@@ -2932,7 +2932,7 @@ define void @test_vfnmsubps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a
 ; BDVER2-NEXT:    vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - mem sched: [10:0.50]
 ; BDVER2-NEXT:    vfnmsub231ps {{.*#+}} xmm0 = -(xmm1 * mem) - xmm0 sched: [10:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; HASWELL-LABEL: test_vfnmsubps_128:
 ; HASWELL:       # %bb.0:
@@ -3029,12 +3029,12 @@ define void @test_vfnmsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a
 ; BDVER2-NEXT:    vfnmsub132ps {{.*#+}} ymm0 = -(ymm0 * ymm2) - ymm1 sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmsub213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) - ymm2 sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmsub231ps {{.*#+}} ymm0 = -(ymm1 * ymm2) - ymm0 sched: [5:0.50]
-; BDVER2-NEXT:    vfnmsub132ps {{.*#+}} ymm0 = -(ymm0 * mem) - ymm1 sched: [10:0.50]
-; BDVER2-NEXT:    vfnmsub213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) - mem sched: [10:0.50]
-; BDVER2-NEXT:    vfnmsub231ps {{.*#+}} ymm0 = -(ymm1 * mem) - ymm0 sched: [10:0.50]
+; BDVER2-NEXT:    vfnmsub132ps {{.*#+}} ymm0 = -(ymm0 * mem) - ymm1 sched: [10:1.00]
+; BDVER2-NEXT:    vfnmsub213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) - mem sched: [10:1.00]
+; BDVER2-NEXT:    vfnmsub231ps {{.*#+}} ymm0 = -(ymm1 * mem) - ymm0 sched: [10:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    vzeroupper # sched: [100:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; HASWELL-LABEL: test_vfnmsubps_256:
 ; HASWELL:       # %bb.0:
@@ -3139,7 +3139,7 @@ define void @test_vfnmsubsd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; BDVER2-NEXT:    vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - mem sched: [10:0.50]
 ; BDVER2-NEXT:    vfnmsub231sd {{.*#+}} xmm0 = -(xmm1 * mem) - xmm0 sched: [10:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; HASWELL-LABEL: test_vfnmsubsd_128:
 ; HASWELL:       # %bb.0:
@@ -3239,7 +3239,7 @@ define void @test_vfnmsubss_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a
 ; BDVER2-NEXT:    vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - mem sched: [10:0.50]
 ; BDVER2-NEXT:    vfnmsub231ss {{.*#+}} xmm0 = -(xmm1 * mem) - xmm0 sched: [10:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; HASWELL-LABEL: test_vfnmsubss_128:
 ; HASWELL:       # %bb.0:
diff --git a/test/CodeGen/X86/fma.ll b/test/CodeGen/X86/fma.ll
index 149053ce056..c894a9f3d40 100644
--- a/test/CodeGen/X86/fma.ll
+++ b/test/CodeGen/X86/fma.ll
@@ -247,76 +247,6 @@ define <4 x float> @test_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #
 ; FMA32-NEXT:    ## xmm0 = (xmm1 * xmm0) + xmm2
 ; FMA32-NEXT:    retl ## encoding: [0xc3]
 ;
-; FMACALL32-LABEL: test_v4f32:
-; FMACALL32:       ## %bb.0: ## %entry
-; FMACALL32-NEXT:    subl $108, %esp ## encoding: [0x83,0xec,0x6c]
-; FMACALL32-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x54,0x24,0x40]
-; FMACALL32-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x4c,0x24,0x30]
-; FMACALL32-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x20]
-; FMACALL32-NEXT:    vextractps $2, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x08,0x02]
-; FMACALL32-NEXT:    vextractps $2, %xmm1, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x4c,0x24,0x04,0x02]
-; FMACALL32-NEXT:    vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0x7c,0x24,0x60]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40]
-; FMACALL32-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x30]
-; FMACALL32-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x20]
-; FMACALL32-NEXT:    vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0x7c,0x24,0x54]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40]
-; FMACALL32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x30]
-; FMACALL32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x20]
-; FMACALL32-NEXT:    vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40]
-; FMACALL32-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x30]
-; FMACALL32-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x20]
-; FMACALL32-NEXT:    vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x1c]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0x6c,0x24,0x54]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x18]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0x6c,0x24,0x60]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x14]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x10]
-; FMACALL32-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x1c]
-; FMACALL32-NEXT:    ## xmm0 = mem[0],zero,zero,zero
-; FMACALL32-NEXT:    vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x18,0x10]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; FMACALL32-NEXT:    vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x14,0x20]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; FMACALL32-NEXT:    vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x10,0x30]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
-; FMACALL32-NEXT:    addl $108, %esp ## encoding: [0x83,0xc4,0x6c]
-; FMACALL32-NEXT:    retl ## encoding: [0xc3]
-;
 ; FMA64-LABEL: test_v4f32:
 ; FMA64:       ## %bb.0: ## %entry
 ; FMA64-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0xc2]
@@ -407,6 +337,76 @@ define <4 x float> @test_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #
 ; AVX512VL-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa8,0xc2]
 ; AVX512VL-NEXT:    ## xmm0 = (xmm1 * xmm0) + xmm2
 ; AVX512VL-NEXT:    retq ## encoding: [0xc3]
+;
+; FMACALL32_BDVER2-LABEL: test_v4f32:
+; FMACALL32_BDVER2:       ## %bb.0: ## %entry
+; FMACALL32_BDVER2-NEXT:    subl $108, %esp ## encoding: [0x83,0xec,0x6c]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x54,0x24,0x40]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x4c,0x24,0x30]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x20]
+; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x08,0x02]
+; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm1, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x4c,0x24,0x04,0x02]
+; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0x7c,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x54,0x24,0x30]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x20]
+; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01]
+; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x01]
+; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x01]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0x7c,0x24,0x54]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40]
+; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x30]
+; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x20]
+; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x54,0x24,0x30]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x20]
+; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03]
+; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x03]
+; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x03]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x1c]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0x6c,0x24,0x54]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x18]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0x6c,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x14]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x10]
+; FMACALL32_BDVER2-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x1c]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = mem[0],zero,zero,zero
+; FMACALL32_BDVER2-NEXT:    vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x18,0x10]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; FMACALL32_BDVER2-NEXT:    vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x14,0x20]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; FMACALL32_BDVER2-NEXT:    vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x10,0x30]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
+; FMACALL32_BDVER2-NEXT:    addl $108, %esp ## encoding: [0x83,0xc4,0x6c]
+; FMACALL32_BDVER2-NEXT:    retl ## encoding: [0xc3]
 entry:
   %call = call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c)
   ret <4 x float> %call
@@ -419,165 +419,6 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c) #
 ; FMA32-NEXT:    ## ymm0 = (ymm1 * ymm0) + ymm2
 ; FMA32-NEXT:    retl ## encoding: [0xc3]
 ;
-; FMACALL32-LABEL: test_v8f32:
-; FMACALL32:       ## %bb.0: ## %entry
-; FMACALL32-NEXT:    subl $316, %esp ## encoding: [0x81,0xec,0x3c,0x01,0x00,0x00]
-; FMACALL32-NEXT:    ## imm = 0x13C
-; FMACALL32-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x11,0x94,0x24,0x00,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x11,0x8c,0x24,0xe0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x11,0x84,0x24,0xc0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractf128 $1, %ymm2, %xmm3 ## encoding: [0xc4,0xe3,0x7d,0x19,0xd3,0x01]
-; FMACALL32-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x5c,0x24,0x60]
-; FMACALL32-NEXT:    vextractps $2, %xmm3, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x5c,0x24,0x08,0x02]
-; FMACALL32-NEXT:    vextractf128 $1, %ymm1, %xmm2 ## encoding: [0xc4,0xe3,0x7d,0x19,0xca,0x01]
-; FMACALL32-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x54,0x24,0x50]
-; FMACALL32-NEXT:    vextractps $2, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x02]
-; FMACALL32-NEXT:    vextractf128 $1, %ymm0, %xmm1 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc1,0x01]
-; FMACALL32-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x4c,0x24,0x40]
-; FMACALL32-NEXT:    vextractps $2, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x02]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xbc,0x24,0xb4,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60]
-; FMACALL32-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50]
-; FMACALL32-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40]
-; FMACALL32-NEXT:    vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xbc,0x24,0xa8,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60]
-; FMACALL32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50]
-; FMACALL32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40]
-; FMACALL32-NEXT:    vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x9c,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x00,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03]
-; FMACALL32-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xe0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03]
-; FMACALL32-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xc0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x90,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x00,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02]
-; FMACALL32-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xe0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x02]
-; FMACALL32-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xc0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x84,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x00,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01]
-; FMACALL32-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xe0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01]
-; FMACALL32-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xc0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0x7c,0x24,0x78]
-; FMACALL32-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x00,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08]
-; FMACALL32-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xe0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04]
-; FMACALL32-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xc0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60]
-; FMACALL32-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50]
-; FMACALL32-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40]
-; FMACALL32-NEXT:    vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x3c]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0x6c,0x24,0x78]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x38]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xac,0x24,0x84,0x00,0x00,0x00]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x34]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xac,0x24,0x90,0x00,0x00,0x00]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x30]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xac,0x24,0x9c,0x00,0x00,0x00]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x2c]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xac,0x24,0xa8,0x00,0x00,0x00]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x28]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xac,0x24,0xb4,0x00,0x00,0x00]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x24]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x20]
-; FMACALL32-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x3c]
-; FMACALL32-NEXT:    ## xmm0 = mem[0],zero,zero,zero
-; FMACALL32-NEXT:    vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x38,0x10]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; FMACALL32-NEXT:    vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x34,0x20]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; FMACALL32-NEXT:    vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x30,0x30]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
-; FMACALL32-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x2c]
-; FMACALL32-NEXT:    ## xmm1 = mem[0],zero,zero,zero
-; FMACALL32-NEXT:    vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x28,0x10]
-; FMACALL32-NEXT:    ## xmm1 = xmm1[0],mem[0],xmm1[2,3]
-; FMACALL32-NEXT:    vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x24,0x20]
-; FMACALL32-NEXT:    ## xmm1 = xmm1[0,1],mem[0],xmm1[3]
-; FMACALL32-NEXT:    vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x20,0x30]
-; FMACALL32-NEXT:    ## xmm1 = xmm1[0,1,2],mem[0]
-; FMACALL32-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01]
-; FMACALL32-NEXT:    addl $316, %esp ## encoding: [0x81,0xc4,0x3c,0x01,0x00,0x00]
-; FMACALL32-NEXT:    ## imm = 0x13C
-; FMACALL32-NEXT:    retl ## encoding: [0xc3]
-;
 ; FMA64-LABEL: test_v8f32:
 ; FMA64:       ## %bb.0: ## %entry
 ; FMA64-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x75,0xa8,0xc2]
@@ -745,6 +586,165 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c) #
 ; AVX512VL-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xa8,0xc2]
 ; AVX512VL-NEXT:    ## ymm0 = (ymm1 * ymm0) + ymm2
 ; AVX512VL-NEXT:    retq ## encoding: [0xc3]
+;
+; FMACALL32_BDVER2-LABEL: test_v8f32:
+; FMACALL32_BDVER2:       ## %bb.0: ## %entry
+; FMACALL32_BDVER2-NEXT:    subl $316, %esp ## encoding: [0x81,0xec,0x3c,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    ## imm = 0x13C
+; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm2, %xmm3 ## encoding: [0xc4,0xe3,0x7d,0x19,0xd3,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x11,0x94,0x24,0x00,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm1, %xmm2 ## encoding: [0xc4,0xe3,0x7d,0x19,0xca,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x11,0x8c,0x24,0xe0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm0, %xmm1 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc1,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x11,0x84,0x24,0xc0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x5c,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm3, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x5c,0x24,0x08,0x02]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x54,0x24,0x50]
+; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x02]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x4c,0x24,0x40]
+; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x02]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0xb4,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x54,0x24,0x50]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x40]
+; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01]
+; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x01]
+; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x01]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0xa8,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50]
+; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40]
+; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x9c,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x00,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03]
+; FMACALL32_BDVER2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xe0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03]
+; FMACALL32_BDVER2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xc0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x90,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x00,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02]
+; FMACALL32_BDVER2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xe0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x02]
+; FMACALL32_BDVER2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xc0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x84,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x00,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xe0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xc0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0x7c,0x24,0x78]
+; FMACALL32_BDVER2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x00,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08]
+; FMACALL32_BDVER2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xe0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04]
+; FMACALL32_BDVER2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xc0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x54,0x24,0x50]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x40]
+; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03]
+; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x03]
+; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x03]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x3c]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0x6c,0x24,0x78]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x38]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0x84,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x34]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0x90,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x30]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0x9c,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x2c]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0xa8,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x28]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0xb4,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x24]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x20]
+; FMACALL32_BDVER2-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x3c]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = mem[0],zero,zero,zero
+; FMACALL32_BDVER2-NEXT:    vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x38,0x10]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; FMACALL32_BDVER2-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x2c]
+; FMACALL32_BDVER2-NEXT:    ## xmm1 = mem[0],zero,zero,zero
+; FMACALL32_BDVER2-NEXT:    vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x28,0x10]
+; FMACALL32_BDVER2-NEXT:    ## xmm1 = xmm1[0],mem[0],xmm1[2,3]
+; FMACALL32_BDVER2-NEXT:    vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x34,0x20]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; FMACALL32_BDVER2-NEXT:    vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x24,0x20]
+; FMACALL32_BDVER2-NEXT:    ## xmm1 = xmm1[0,1],mem[0],xmm1[3]
+; FMACALL32_BDVER2-NEXT:    vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x30,0x30]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
+; FMACALL32_BDVER2-NEXT:    vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x20,0x30]
+; FMACALL32_BDVER2-NEXT:    ## xmm1 = xmm1[0,1,2],mem[0]
+; FMACALL32_BDVER2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01]
+; FMACALL32_BDVER2-NEXT:    addl $316, %esp ## encoding: [0x81,0xc4,0x3c,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    ## imm = 0x13C
+; FMACALL32_BDVER2-NEXT:    retl ## encoding: [0xc3]
 entry:
   %call = call <8 x float> @llvm.fma.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c)
   ret <8 x float> %call
@@ -765,321 +765,6 @@ define <16 x float> @test_v16f32(<16 x float> %a, <16 x float> %b, <16 x float>
 ; FMA32-NEXT:    popl %ebp ## encoding: [0x5d]
 ; FMA32-NEXT:    retl ## encoding: [0xc3]
 ;
-; FMACALL32-LABEL: test_v16f32:
-; FMACALL32:       ## %bb.0: ## %entry
-; FMACALL32-NEXT:    pushl %ebp ## encoding: [0x55]
-; FMACALL32-NEXT:    movl %esp, %ebp ## encoding: [0x89,0xe5]
-; FMACALL32-NEXT:    andl $-32, %esp ## encoding: [0x83,0xe4,0xe0]
-; FMACALL32-NEXT:    subl $448, %esp ## encoding: [0x81,0xec,0xc0,0x01,0x00,0x00]
-; FMACALL32-NEXT:    ## imm = 0x1C0
-; FMACALL32-NEXT:    vmovaps %ymm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x29,0x5c,0x24,0x60]
-; FMACALL32-NEXT:    vmovaps %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x29,0x94,0x24,0x60,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps %ymm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x29,0x8c,0x24,0x80,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x29,0x84,0x24,0x80,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28]
-; FMACALL32-NEXT:    vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
-; FMACALL32-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0xc0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02]
-; FMACALL32-NEXT:    vextractf128 $1, %ymm3, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xd8,0x01]
-; FMACALL32-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0xb0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x02]
-; FMACALL32-NEXT:    vextractf128 $1, %ymm1, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc8,0x01]
-; FMACALL32-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0xa0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x54,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xc0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xb0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xa0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x48,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xc0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xb0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xa0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x3c,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28]
-; FMACALL32-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60]
-; FMACALL32-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x30,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28]
-; FMACALL32-NEXT:    vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60]
-; FMACALL32-NEXT:    vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x02]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x24,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28]
-; FMACALL32-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60]
-; FMACALL32-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x18,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28]
-; FMACALL32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60]
-; FMACALL32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x0c,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08]
-; FMACALL32-NEXT:    vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
-; FMACALL32-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0x80,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
-; FMACALL32-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x60]
-; FMACALL32-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
-; FMACALL32-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x50]
-; FMACALL32-NEXT:    vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x00,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x80,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60]
-; FMACALL32-NEXT:    vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x02]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50]
-; FMACALL32-NEXT:    vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xbc,0x24,0xf4,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x80,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60]
-; FMACALL32-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50]
-; FMACALL32-NEXT:    vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xbc,0x24,0xe8,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x80,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60]
-; FMACALL32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50]
-; FMACALL32-NEXT:    vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x80,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08]
-; FMACALL32-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0x7c,0x24,0x60]
-; FMACALL32-NEXT:    vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08]
-; FMACALL32-NEXT:    vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x02]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0x7c,0x24,0x50]
-; FMACALL32-NEXT:    vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08]
-; FMACALL32-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xbc,0x24,0xdc,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08]
-; FMACALL32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xc0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xb0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xa0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x2c]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xac,0x24,0xdc,0x00,0x00,0x00]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x28]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0x6c,0x24,0x50]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x24]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0x6c,0x24,0x60]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x20]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xac,0x24,0x80,0x00,0x00,0x00]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x1c]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xac,0x24,0xe8,0x00,0x00,0x00]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x18]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xac,0x24,0xf4,0x00,0x00,0x00]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x14]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xac,0x24,0x00,0x01,0x00,0x00]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x10]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xac,0x24,0x0c,0x01,0x00,0x00]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x4c]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xac,0x24,0x18,0x01,0x00,0x00]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x48]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xac,0x24,0x24,0x01,0x00,0x00]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x44]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xac,0x24,0x30,0x01,0x00,0x00]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x40]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xac,0x24,0x3c,0x01,0x00,0x00]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x3c]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xac,0x24,0x48,0x01,0x00,0x00]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x38]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xac,0x24,0x54,0x01,0x00,0x00]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x34]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x30]
-; FMACALL32-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x2c]
-; FMACALL32-NEXT:    ## xmm0 = mem[0],zero,zero,zero
-; FMACALL32-NEXT:    vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x28,0x10]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; FMACALL32-NEXT:    vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x24,0x20]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; FMACALL32-NEXT:    vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x20,0x30]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
-; FMACALL32-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x1c]
-; FMACALL32-NEXT:    ## xmm1 = mem[0],zero,zero,zero
-; FMACALL32-NEXT:    vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x18,0x10]
-; FMACALL32-NEXT:    ## xmm1 = xmm1[0],mem[0],xmm1[2,3]
-; FMACALL32-NEXT:    vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x14,0x20]
-; FMACALL32-NEXT:    ## xmm1 = xmm1[0,1],mem[0],xmm1[3]
-; FMACALL32-NEXT:    vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x10,0x30]
-; FMACALL32-NEXT:    ## xmm1 = xmm1[0,1,2],mem[0]
-; FMACALL32-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01]
-; FMACALL32-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x4c]
-; FMACALL32-NEXT:    ## xmm1 = mem[0],zero,zero,zero
-; FMACALL32-NEXT:    vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x48,0x10]
-; FMACALL32-NEXT:    ## xmm1 = xmm1[0],mem[0],xmm1[2,3]
-; FMACALL32-NEXT:    vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x44,0x20]
-; FMACALL32-NEXT:    ## xmm1 = xmm1[0,1],mem[0],xmm1[3]
-; FMACALL32-NEXT:    vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x40,0x30]
-; FMACALL32-NEXT:    ## xmm1 = xmm1[0,1,2],mem[0]
-; FMACALL32-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfa,0x10,0x54,0x24,0x3c]
-; FMACALL32-NEXT:    ## xmm2 = mem[0],zero,zero,zero
-; FMACALL32-NEXT:    vinsertps $16, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x38,0x10]
-; FMACALL32-NEXT:    ## xmm2 = xmm2[0],mem[0],xmm2[2,3]
-; FMACALL32-NEXT:    vinsertps $32, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x34,0x20]
-; FMACALL32-NEXT:    ## xmm2 = xmm2[0,1],mem[0],xmm2[3]
-; FMACALL32-NEXT:    vinsertps $48, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x30,0x30]
-; FMACALL32-NEXT:    ## xmm2 = xmm2[0,1,2],mem[0]
-; FMACALL32-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1 ## encoding: [0xc4,0xe3,0x75,0x18,0xca,0x01]
-; FMACALL32-NEXT:    movl %ebp, %esp ## encoding: [0x89,0xec]
-; FMACALL32-NEXT:    popl %ebp ## encoding: [0x5d]
-; FMACALL32-NEXT:    retl ## encoding: [0xc3]
-;
 ; FMA64-LABEL: test_v16f32:
 ; FMA64:       ## %bb.0: ## %entry
 ; FMA64-NEXT:    vfmadd213ps %ymm4, %ymm2, %ymm0 ## encoding: [0xc4,0xe2,0x6d,0xa8,0xc4]
@@ -1378,6 +1063,321 @@ define <16 x float> @test_v16f32(<16 x float> %a, <16 x float> %b, <16 x float>
 ; AVX512VL-NEXT:    vfmadd213ps %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0xa8,0xc2]
 ; AVX512VL-NEXT:    ## zmm0 = (zmm1 * zmm0) + zmm2
 ; AVX512VL-NEXT:    retq ## encoding: [0xc3]
+;
+; FMACALL32_BDVER2-LABEL: test_v16f32:
+; FMACALL32_BDVER2:       ## %bb.0: ## %entry
+; FMACALL32_BDVER2-NEXT:    pushl %ebp ## encoding: [0x55]
+; FMACALL32_BDVER2-NEXT:    movl %esp, %ebp ## encoding: [0x89,0xe5]
+; FMACALL32_BDVER2-NEXT:    andl $-32, %esp ## encoding: [0x83,0xe4,0xe0]
+; FMACALL32_BDVER2-NEXT:    subl $448, %esp ## encoding: [0x81,0xec,0xc0,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    ## imm = 0x1C0
+; FMACALL32_BDVER2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x29,0x84,0x24,0x80,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28]
+; FMACALL32_BDVER2-NEXT:    vmovaps %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x29,0x94,0x24,0x60,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm3, %xmm2 ## encoding: [0xc4,0xe3,0x7d,0x19,0xda,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovaps %ymm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x29,0x8c,0x24,0x80,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm1, %xmm1 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc9,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovaps %ymm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x29,0x5c,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x94,0x24,0xb0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x02]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x8c,0x24,0xa0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x02]
+; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0xc0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x54,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xc0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x94,0x24,0xb0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x8c,0x24,0xa0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01]
+; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x01]
+; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x01]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x48,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xc0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xb0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xa0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28]
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x3c,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28]
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x30,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x02]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28]
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x24,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28]
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x18,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08]
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x0c,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0x80,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x50]
+; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x00,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x80,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x54,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x50]
+; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02]
+; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x02]
+; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x02]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0xf4,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x80,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x54,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x50]
+; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01]
+; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x01]
+; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x01]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0xe8,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x80,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50]
+; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08]
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x80,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08]
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0x7c,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x02]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08]
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0x7c,0x24,0x50]
+; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08]
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0xdc,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xc0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x94,0x24,0xb0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x8c,0x24,0xa0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03]
+; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x03]
+; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x03]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x2c]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0xdc,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x28]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0x6c,0x24,0x50]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x24]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0x6c,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x20]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0x80,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x1c]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0xe8,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x18]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0xf4,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x14]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0x00,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x10]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0x0c,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x4c]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0x18,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x48]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0x24,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x44]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0x30,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x40]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0x3c,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x3c]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0x48,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x38]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0x54,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x34]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x30]
+; FMACALL32_BDVER2-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x2c]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = mem[0],zero,zero,zero
+; FMACALL32_BDVER2-NEXT:    vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x28,0x10]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; FMACALL32_BDVER2-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x1c]
+; FMACALL32_BDVER2-NEXT:    ## xmm1 = mem[0],zero,zero,zero
+; FMACALL32_BDVER2-NEXT:    vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x18,0x10]
+; FMACALL32_BDVER2-NEXT:    ## xmm1 = xmm1[0],mem[0],xmm1[2,3]
+; FMACALL32_BDVER2-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfa,0x10,0x54,0x24,0x3c]
+; FMACALL32_BDVER2-NEXT:    ## xmm2 = mem[0],zero,zero,zero
+; FMACALL32_BDVER2-NEXT:    vinsertps $16, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x38,0x10]
+; FMACALL32_BDVER2-NEXT:    ## xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; FMACALL32_BDVER2-NEXT:    vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x24,0x20]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; FMACALL32_BDVER2-NEXT:    vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x14,0x20]
+; FMACALL32_BDVER2-NEXT:    ## xmm1 = xmm1[0,1],mem[0],xmm1[3]
+; FMACALL32_BDVER2-NEXT:    vinsertps $32, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x34,0x20]
+; FMACALL32_BDVER2-NEXT:    ## xmm2 = xmm2[0,1],mem[0],xmm2[3]
+; FMACALL32_BDVER2-NEXT:    vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x20,0x30]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
+; FMACALL32_BDVER2-NEXT:    vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x10,0x30]
+; FMACALL32_BDVER2-NEXT:    ## xmm1 = xmm1[0,1,2],mem[0]
+; FMACALL32_BDVER2-NEXT:    vinsertps $48, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x30,0x30]
+; FMACALL32_BDVER2-NEXT:    ## xmm2 = xmm2[0,1,2],mem[0]
+; FMACALL32_BDVER2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x4c]
+; FMACALL32_BDVER2-NEXT:    ## xmm1 = mem[0],zero,zero,zero
+; FMACALL32_BDVER2-NEXT:    vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x48,0x10]
+; FMACALL32_BDVER2-NEXT:    ## xmm1 = xmm1[0],mem[0],xmm1[2,3]
+; FMACALL32_BDVER2-NEXT:    vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x44,0x20]
+; FMACALL32_BDVER2-NEXT:    ## xmm1 = xmm1[0,1],mem[0],xmm1[3]
+; FMACALL32_BDVER2-NEXT:    vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x40,0x30]
+; FMACALL32_BDVER2-NEXT:    ## xmm1 = xmm1[0,1,2],mem[0]
+; FMACALL32_BDVER2-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1 ## encoding: [0xc4,0xe3,0x75,0x18,0xca,0x01]
+; FMACALL32_BDVER2-NEXT:    movl %ebp, %esp ## encoding: [0x89,0xec]
+; FMACALL32_BDVER2-NEXT:    popl %ebp ## encoding: [0x5d]
+; FMACALL32_BDVER2-NEXT:    retl ## encoding: [0xc3]
 entry:
   %call = call <16 x float> @llvm.fma.v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %c)
   ret <16 x float> %call
@@ -1390,41 +1390,6 @@ define <2 x double> @test_v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %
 ; FMA32-NEXT:    ## xmm0 = (xmm1 * xmm0) + xmm2
 ; FMA32-NEXT:    retl ## encoding: [0xc3]
 ;
-; FMACALL32-LABEL: test_v2f64:
-; FMACALL32:       ## %bb.0: ## %entry
-; FMACALL32-NEXT:    subl $108, %esp ## encoding: [0x83,0xec,0x6c]
-; FMACALL32-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x54,0x24,0x30]
-; FMACALL32-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x4c,0x24,0x40]
-; FMACALL32-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x50]
-; FMACALL32-NEXT:    vmovlps %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x54,0x24,0x10]
-; FMACALL32-NEXT:    vmovlhps %xmm1, %xmm0, %xmm2 ## encoding: [0xc5,0xf8,0x16,0xd1]
-; FMACALL32-NEXT:    ## xmm2 = xmm0[0],xmm1[0]
-; FMACALL32-NEXT:    vmovups %xmm2, (%esp) ## encoding: [0xc5,0xf8,0x11,0x14,0x24]
-; FMACALL32-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    vmovapd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf9,0x28,0x44,0x24,0x30]
-; FMACALL32-NEXT:    vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40]
-; FMACALL32-NEXT:    vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x12,0x44,0x24,0x58]
-; FMACALL32-NEXT:    ## xmm0 = mem[0,1],xmm0[2,3]
-; FMACALL32-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
-; FMACALL32-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x28]
-; FMACALL32-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x20]
-; FMACALL32-NEXT:    vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x28]
-; FMACALL32-NEXT:    ## xmm0 = mem[0],zero
-; FMACALL32-NEXT:    vmovhpd {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x16,0x44,0x24,0x20]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[0],mem[0]
-; FMACALL32-NEXT:    addl $108, %esp ## encoding: [0x83,0xc4,0x6c]
-; FMACALL32-NEXT:    retl ## encoding: [0xc3]
-;
 ; FMA64-LABEL: test_v2f64:
 ; FMA64:       ## %bb.0: ## %entry
 ; FMA64-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0xf1,0xa8,0xc2]
@@ -1477,6 +1442,41 @@ define <2 x double> @test_v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %
 ; AVX512VL-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa8,0xc2]
 ; AVX512VL-NEXT:    ## xmm0 = (xmm1 * xmm0) + xmm2
 ; AVX512VL-NEXT:    retq ## encoding: [0xc3]
+;
+; FMACALL32_BDVER2-LABEL: test_v2f64:
+; FMACALL32_BDVER2:       ## %bb.0: ## %entry
+; FMACALL32_BDVER2-NEXT:    subl $108, %esp ## encoding: [0x83,0xec,0x6c]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x50]
+; FMACALL32_BDVER2-NEXT:    vmovlhps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0xc1]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[0],xmm1[0]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x54,0x24,0x30]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x4c,0x24,0x40]
+; FMACALL32_BDVER2-NEXT:    vmovlps %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x54,0x24,0x10]
+; FMACALL32_BDVER2-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
+; FMACALL32_BDVER2-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    vmovapd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf9,0x28,0x44,0x24,0x30]
+; FMACALL32_BDVER2-NEXT:    vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40]
+; FMACALL32_BDVER2-NEXT:    vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x12,0x44,0x24,0x58]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = mem[0,1],xmm0[2,3]
+; FMACALL32_BDVER2-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
+; FMACALL32_BDVER2-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x28]
+; FMACALL32_BDVER2-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x20]
+; FMACALL32_BDVER2-NEXT:    vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x28]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = mem[0],zero
+; FMACALL32_BDVER2-NEXT:    vmovhpd {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x16,0x44,0x24,0x20]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[0],mem[0]
+; FMACALL32_BDVER2-NEXT:    addl $108, %esp ## encoding: [0x83,0xc4,0x6c]
+; FMACALL32_BDVER2-NEXT:    retl ## encoding: [0xc3]
 entry:
   %call = call <2 x double> @llvm.fma.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c)
   ret <2 x double> %call
@@ -1489,90 +1489,6 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %
 ; FMA32-NEXT:    ## ymm0 = (ymm1 * ymm0) + ymm2
 ; FMA32-NEXT:    retl ## encoding: [0xc3]
 ;
-; FMACALL32-LABEL: test_v4f64:
-; FMACALL32:       ## %bb.0: ## %entry
-; FMACALL32-NEXT:    subl $252, %esp ## encoding: [0x81,0xec,0xfc,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x11,0x94,0x24,0xc0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x11,0x8c,0x24,0xa0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x11,0x84,0x24,0x80,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractf128 $1, %ymm2, %xmm3 ## encoding: [0xc4,0xe3,0x7d,0x19,0xd3,0x01]
-; FMACALL32-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x5c,0x24,0x70]
-; FMACALL32-NEXT:    vmovlps %xmm3, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x5c,0x24,0x10]
-; FMACALL32-NEXT:    vextractf128 $1, %ymm1, %xmm2 ## encoding: [0xc4,0xe3,0x7d,0x19,0xca,0x01]
-; FMACALL32-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x54,0x24,0x50]
-; FMACALL32-NEXT:    vextractf128 $1, %ymm0, %xmm1 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc1,0x01]
-; FMACALL32-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x4c,0x24,0x60]
-; FMACALL32-NEXT:    vmovlhps %xmm2, %xmm1, %xmm0 ## encoding: [0xc5,0xf0,0x16,0xc2]
-; FMACALL32-NEXT:    ## xmm0 = xmm1[0],xmm2[0]
-; FMACALL32-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0x7c,0x24,0x44]
-; FMACALL32-NEXT:    vmovupd {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfd,0x10,0x84,0x24,0xc0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
-; FMACALL32-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x80,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vunpckhpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf9,0x15,0x84,0x24,0xa0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[1],mem[1]
-; FMACALL32-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0x7c,0x24,0x38]
-; FMACALL32-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xc0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10]
-; FMACALL32-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x80,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vunpcklpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf9,0x14,0x84,0x24,0xa0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[0],mem[0]
-; FMACALL32-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    vmovapd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf9,0x28,0x44,0x24,0x70]
-; FMACALL32-NEXT:    vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50]
-; FMACALL32-NEXT:    vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x12,0x44,0x24,0x68]
-; FMACALL32-NEXT:    ## xmm0 = mem[0,1],xmm0[2,3]
-; FMACALL32-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
-; FMACALL32-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x30]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0x6c,0x24,0x38]
-; FMACALL32-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x28]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0x6c,0x24,0x44]
-; FMACALL32-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x20]
-; FMACALL32-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x18]
-; FMACALL32-NEXT:    vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x30]
-; FMACALL32-NEXT:    ## xmm0 = mem[0],zero
-; FMACALL32-NEXT:    vmovhpd {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x16,0x44,0x24,0x28]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[0],mem[0]
-; FMACALL32-NEXT:    vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x20]
-; FMACALL32-NEXT:    ## xmm1 = mem[0],zero
-; FMACALL32-NEXT:    vmovhpd {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x16,0x4c,0x24,0x18]
-; FMACALL32-NEXT:    ## xmm1 = xmm1[0],mem[0]
-; FMACALL32-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01]
-; FMACALL32-NEXT:    addl $252, %esp ## encoding: [0x81,0xc4,0xfc,0x00,0x00,0x00]
-; FMACALL32-NEXT:    retl ## encoding: [0xc3]
-;
 ; FMA64-LABEL: test_v4f64:
 ; FMA64:       ## %bb.0: ## %entry
 ; FMA64-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0xf5,0xa8,0xc2]
@@ -1664,6 +1580,90 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %
 ; AVX512VL-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa8,0xc2]
 ; AVX512VL-NEXT:    ## ymm0 = (ymm1 * ymm0) + ymm2
 ; AVX512VL-NEXT:    retq ## encoding: [0xc3]
+;
+; FMACALL32_BDVER2-LABEL: test_v4f64:
+; FMACALL32_BDVER2:       ## %bb.0: ## %entry
+; FMACALL32_BDVER2-NEXT:    subl $252, %esp ## encoding: [0x81,0xec,0xfc,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm2, %xmm3 ## encoding: [0xc4,0xe3,0x7d,0x19,0xd3,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x11,0x94,0x24,0xc0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x11,0x84,0x24,0x80,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm1, %xmm2 ## encoding: [0xc4,0xe3,0x7d,0x19,0xca,0x01]
+; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x11,0x8c,0x24,0xa0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    vmovlhps %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0xc2]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[0],xmm2[0]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x5c,0x24,0x70]
+; FMACALL32_BDVER2-NEXT:    vmovlps %xmm3, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x5c,0x24,0x10]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x54,0x24,0x50]
+; FMACALL32_BDVER2-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0x7c,0x24,0x44]
+; FMACALL32_BDVER2-NEXT:    vmovupd {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfd,0x10,0x84,0x24,0xc0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
+; FMACALL32_BDVER2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x80,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vunpckhpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf9,0x15,0x84,0x24,0xa0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[1],mem[1]
+; FMACALL32_BDVER2-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0x7c,0x24,0x38]
+; FMACALL32_BDVER2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xc0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10]
+; FMACALL32_BDVER2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x80,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vunpcklpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf9,0x14,0x84,0x24,0xa0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[0],mem[0]
+; FMACALL32_BDVER2-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    vmovapd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf9,0x28,0x44,0x24,0x70]
+; FMACALL32_BDVER2-NEXT:    vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50]
+; FMACALL32_BDVER2-NEXT:    vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x12,0x44,0x24,0x68]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = mem[0,1],xmm0[2,3]
+; FMACALL32_BDVER2-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
+; FMACALL32_BDVER2-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x30]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0x6c,0x24,0x38]
+; FMACALL32_BDVER2-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x28]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0x6c,0x24,0x44]
+; FMACALL32_BDVER2-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x20]
+; FMACALL32_BDVER2-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x18]
+; FMACALL32_BDVER2-NEXT:    vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x30]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = mem[0],zero
+; FMACALL32_BDVER2-NEXT:    vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x20]
+; FMACALL32_BDVER2-NEXT:    ## xmm1 = mem[0],zero
+; FMACALL32_BDVER2-NEXT:    vmovhpd {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x16,0x44,0x24,0x28]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[0],mem[0]
+; FMACALL32_BDVER2-NEXT:    vmovhpd {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x16,0x4c,0x24,0x18]
+; FMACALL32_BDVER2-NEXT:    ## xmm1 = xmm1[0],mem[0]
+; FMACALL32_BDVER2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01]
+; FMACALL32_BDVER2-NEXT:    addl $252, %esp ## encoding: [0x81,0xc4,0xfc,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    retl ## encoding: [0xc3]
 entry:
   %call = call <4 x double> @llvm.fma.v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c)
   ret <4 x double> %call
@@ -1684,179 +1684,6 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %
 ; FMA32-NEXT:    popl %ebp ## encoding: [0x5d]
 ; FMA32-NEXT:    retl ## encoding: [0xc3]
 ;
-; FMACALL32-LABEL: test_v8f64:
-; FMACALL32:       ## %bb.0: ## %entry
-; FMACALL32-NEXT:    pushl %ebp ## encoding: [0x55]
-; FMACALL32-NEXT:    movl %esp, %ebp ## encoding: [0x89,0xe5]
-; FMACALL32-NEXT:    andl $-32, %esp ## encoding: [0x83,0xe4,0xe0]
-; FMACALL32-NEXT:    subl $384, %esp ## encoding: [0x81,0xec,0x80,0x01,0x00,0x00]
-; FMACALL32-NEXT:    ## imm = 0x180
-; FMACALL32-NEXT:    vmovaps %ymm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x29,0x9c,0x24,0xc0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x29,0x94,0x24,0x00,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps %ymm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x29,0x8c,0x24,0xa0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x29,0x84,0x24,0xe0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28]
-; FMACALL32-NEXT:    vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
-; FMACALL32-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0x60,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10]
-; FMACALL32-NEXT:    vextractf128 $1, %ymm3, %xmm2 ## encoding: [0xc4,0xe3,0x7d,0x19,0xda,0x01]
-; FMACALL32-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x94,0x24,0x40,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vextractf128 $1, %ymm1, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc8,0x01]
-; FMACALL32-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0x50,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vmovlhps %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0xc2]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[0],xmm2[0]
-; FMACALL32-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x94,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovapd 40(%ebp), %ymm0 ## encoding: [0xc5,0xfd,0x28,0x45,0x28]
-; FMACALL32-NEXT:    vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xa0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vunpckhpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf9,0x15,0x84,0x24,0xc0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[1],mem[1]
-; FMACALL32-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x88,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28]
-; FMACALL32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xa0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vunpcklpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf9,0x14,0x84,0x24,0xc0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[0],mem[0]
-; FMACALL32-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xbc,0x24,0xc0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08]
-; FMACALL32-NEXT:    vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
-; FMACALL32-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf9,0x29,0x44,0x24,0x30]
-; FMACALL32-NEXT:    vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x00,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vextractf128 $1, %ymm0, %xmm1 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc1,0x01]
-; FMACALL32-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x8c,0x24,0x30,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xe0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
-; FMACALL32-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x20]
-; FMACALL32-NEXT:    vunpckhpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x15,0xc1]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[1],xmm1[1]
-; FMACALL32-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xbc,0x24,0xa0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x30]
-; FMACALL32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x20]
-; FMACALL32-NEXT:    vunpcklpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf9,0x14,0x84,0x24,0x30,0x01,0x00,0x00]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[0],mem[0]
-; FMACALL32-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
-; FMACALL32-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0x7c,0x24,0x30]
-; FMACALL32-NEXT:    vmovapd 8(%ebp), %ymm0 ## encoding: [0xc5,0xfd,0x28,0x45,0x08]
-; FMACALL32-NEXT:    vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xe0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vunpckhpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf9,0x15,0x84,0x24,0x00,0x01,0x00,0x00]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[1],mem[1]
-; FMACALL32-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0x7c,0x24,0x20]
-; FMACALL32-NEXT:    vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08]
-; FMACALL32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xe0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vunpcklpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf9,0x14,0x84,0x24,0x00,0x01,0x00,0x00]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[0],mem[0]
-; FMACALL32-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    vmovapd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf9,0x28,0x84,0x24,0x60,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x40,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x12,0x84,0x24,0x58,0x01,0x00,0x00]
-; FMACALL32-NEXT:    ## xmm0 = mem[0,1],xmm0[2,3]
-; FMACALL32-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
-; FMACALL32-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x60]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0x6c,0x24,0x20]
-; FMACALL32-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x58]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0x6c,0x24,0x30]
-; FMACALL32-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x50]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xac,0x24,0xa0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x48]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xac,0x24,0xc0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x9c,0x24,0x80,0x00,0x00,0x00]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xac,0x24,0x88,0x00,0x00,0x00]
-; FMACALL32-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x78]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xac,0x24,0x94,0x00,0x00,0x00]
-; FMACALL32-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x70]
-; FMACALL32-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x68]
-; FMACALL32-NEXT:    vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x60]
-; FMACALL32-NEXT:    ## xmm0 = mem[0],zero
-; FMACALL32-NEXT:    vmovhpd {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x16,0x44,0x24,0x58]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[0],mem[0]
-; FMACALL32-NEXT:    vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x50]
-; FMACALL32-NEXT:    ## xmm1 = mem[0],zero
-; FMACALL32-NEXT:    vmovhpd {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x16,0x4c,0x24,0x48]
-; FMACALL32-NEXT:    ## xmm1 = xmm1[0],mem[0]
-; FMACALL32-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01]
-; FMACALL32-NEXT:    vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x8c,0x24,0x80,0x00,0x00,0x00]
-; FMACALL32-NEXT:    ## xmm1 = mem[0],zero
-; FMACALL32-NEXT:    vmovhpd {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x16,0x4c,0x24,0x78]
-; FMACALL32-NEXT:    ## xmm1 = xmm1[0],mem[0]
-; FMACALL32-NEXT:    vmovsd {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfb,0x10,0x54,0x24,0x70]
-; FMACALL32-NEXT:    ## xmm2 = mem[0],zero
-; FMACALL32-NEXT:    vmovhpd {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0x16,0x54,0x24,0x68]
-; FMACALL32-NEXT:    ## xmm2 = xmm2[0],mem[0]
-; FMACALL32-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1 ## encoding: [0xc4,0xe3,0x75,0x18,0xca,0x01]
-; FMACALL32-NEXT:    movl %ebp, %esp ## encoding: [0x89,0xec]
-; FMACALL32-NEXT:    popl %ebp ## encoding: [0x5d]
-; FMACALL32-NEXT:    retl ## encoding: [0xc3]
-;
 ; FMA64-LABEL: test_v8f64:
 ; FMA64:       ## %bb.0: ## %entry
 ; FMA64-NEXT:    vfmadd213pd %ymm4, %ymm2, %ymm0 ## encoding: [0xc4,0xe2,0xed,0xa8,0xc4]
@@ -2011,6 +1838,179 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %
 ; AVX512VL-NEXT:    vfmadd213pd %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x48,0xa8,0xc2]
 ; AVX512VL-NEXT:    ## zmm0 = (zmm1 * zmm0) + zmm2
 ; AVX512VL-NEXT:    retq ## encoding: [0xc3]
+;
+; FMACALL32_BDVER2-LABEL: test_v8f64:
+; FMACALL32_BDVER2:       ## %bb.0: ## %entry
+; FMACALL32_BDVER2-NEXT:    pushl %ebp ## encoding: [0x55]
+; FMACALL32_BDVER2-NEXT:    movl %esp, %ebp ## encoding: [0x89,0xe5]
+; FMACALL32_BDVER2-NEXT:    andl $-32, %esp ## encoding: [0x83,0xe4,0xe0]
+; FMACALL32_BDVER2-NEXT:    subl $384, %esp ## encoding: [0x81,0xec,0x80,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    ## imm = 0x180
+; FMACALL32_BDVER2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x29,0x84,0x24,0xe0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28]
+; FMACALL32_BDVER2-NEXT:    vmovaps %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x29,0x94,0x24,0x00,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm3, %xmm2 ## encoding: [0xc4,0xe3,0x7d,0x19,0xda,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovaps %ymm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x29,0x9c,0x24,0xc0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps %ymm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x29,0x8c,0x24,0xa0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x94,0x24,0x40,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0x60,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10]
+; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm1, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc8,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0x50,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovlhps %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0xc2]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[0],xmm2[0]
+; FMACALL32_BDVER2-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    vmovapd 40(%ebp), %ymm0 ## encoding: [0xc5,0xfd,0x28,0x45,0x28]
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x94,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xa0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vunpckhpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf9,0x15,0x84,0x24,0xc0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[1],mem[1]
+; FMACALL32_BDVER2-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28]
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x88,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xa0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vunpcklpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf9,0x14,0x84,0x24,0xc0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[0],mem[0]
+; FMACALL32_BDVER2-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08]
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0xc0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf9,0x29,0x44,0x24,0x30]
+; FMACALL32_BDVER2-NEXT:    vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x00,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm0, %xmm1 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc1,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xe0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x8c,0x24,0x30,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x20]
+; FMACALL32_BDVER2-NEXT:    vunpckhpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x15,0xc1]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[1],xmm1[1]
+; FMACALL32_BDVER2-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0xa0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x30]
+; FMACALL32_BDVER2-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x20]
+; FMACALL32_BDVER2-NEXT:    vunpcklpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf9,0x14,0x84,0x24,0x30,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[0],mem[0]
+; FMACALL32_BDVER2-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
+; FMACALL32_BDVER2-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    vmovapd 8(%ebp), %ymm0 ## encoding: [0xc5,0xfd,0x28,0x45,0x08]
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0x7c,0x24,0x30]
+; FMACALL32_BDVER2-NEXT:    vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xe0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vunpckhpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf9,0x15,0x84,0x24,0x00,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[1],mem[1]
+; FMACALL32_BDVER2-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08]
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0x7c,0x24,0x20]
+; FMACALL32_BDVER2-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xe0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vunpcklpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf9,0x14,0x84,0x24,0x00,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[0],mem[0]
+; FMACALL32_BDVER2-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    vmovapd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf9,0x28,0x84,0x24,0x60,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x40,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x12,0x84,0x24,0x58,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = mem[0,1],xmm0[2,3]
+; FMACALL32_BDVER2-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
+; FMACALL32_BDVER2-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0x6c,0x24,0x20]
+; FMACALL32_BDVER2-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x58]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0x6c,0x24,0x30]
+; FMACALL32_BDVER2-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x50]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0xa0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x48]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0xc0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x9c,0x24,0x80,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0x88,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x78]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0x94,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x70]
+; FMACALL32_BDVER2-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x68]
+; FMACALL32_BDVER2-NEXT:    vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = mem[0],zero
+; FMACALL32_BDVER2-NEXT:    vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x50]
+; FMACALL32_BDVER2-NEXT:    ## xmm1 = mem[0],zero
+; FMACALL32_BDVER2-NEXT:    vmovhpd {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x16,0x44,0x24,0x58]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[0],mem[0]
+; FMACALL32_BDVER2-NEXT:    vmovhpd {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x16,0x4c,0x24,0x48]
+; FMACALL32_BDVER2-NEXT:    ## xmm1 = xmm1[0],mem[0]
+; FMACALL32_BDVER2-NEXT:    vmovsd {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfb,0x10,0x54,0x24,0x70]
+; FMACALL32_BDVER2-NEXT:    ## xmm2 = mem[0],zero
+; FMACALL32_BDVER2-NEXT:    vmovhpd {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0x16,0x54,0x24,0x68]
+; FMACALL32_BDVER2-NEXT:    ## xmm2 = xmm2[0],mem[0]
+; FMACALL32_BDVER2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x8c,0x24,0x80,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    ## xmm1 = mem[0],zero
+; FMACALL32_BDVER2-NEXT:    vmovhpd {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x16,0x4c,0x24,0x78]
+; FMACALL32_BDVER2-NEXT:    ## xmm1 = xmm1[0],mem[0]
+; FMACALL32_BDVER2-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1 ## encoding: [0xc4,0xe3,0x75,0x18,0xca,0x01]
+; FMACALL32_BDVER2-NEXT:    movl %ebp, %esp ## encoding: [0x89,0xec]
+; FMACALL32_BDVER2-NEXT:    popl %ebp ## encoding: [0x5d]
+; FMACALL32_BDVER2-NEXT:    retl ## encoding: [0xc3]
 entry:
   %call = call <8 x double> @llvm.fma.v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %c)
   ret <8 x double> %call
diff --git a/test/CodeGen/X86/fma4-schedule.ll b/test/CodeGen/X86/fma4-schedule.ll
index f2e2caf14ba..c8b5debd3fb 100644
--- a/test/CodeGen/X86/fma4-schedule.ll
+++ b/test/CodeGen/X86/fma4-schedule.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+fma4 | FileCheck %s --check-prefixes=CHECK,GENERIC
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+fma4 | FileCheck %s --check-prefixes=CHECK,BDVER,BDVER12,BDVER1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+fma4 -mattr=-fma  | FileCheck %s --check-prefixes=CHECK,BDVER,BDVER12,BDVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver1              | FileCheck %s --check-prefixes=CHECK,BDVER,BDVER12,BDVER1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=-fma  | FileCheck %s --check-prefixes=CHECK,BDVER,BDVER12,BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver3 -mattr=-fma  | FileCheck %s --check-prefixes=CHECK,BDVER,BDVER34,BDVER3
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver4 -mattr=-fma  | FileCheck %s --check-prefixes=CHECK,BDVER,BDVER34,BDVER4
 
@@ -26,7 +26,7 @@ define void @test_vfmaddpd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; BDVER12-NEXT:    vfmaddpd (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER12-NEXT:    vfmaddpd %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER34-LABEL: test_vfmaddpd_128:
 ; BDVER34:       # %bb.0:
@@ -55,11 +55,11 @@ define void @test_vfmaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
 ; BDVER12-NEXT:    vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
-; BDVER12-NEXT:    vfmaddpd (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:0.50]
-; BDVER12-NEXT:    vfmaddpd %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfmaddpd (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER12-NEXT:    vfmaddpd %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    vzeroupper # sched: [100:0.33]
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER34-LABEL: test_vfmaddpd_256:
 ; BDVER34:       # %bb.0:
@@ -91,7 +91,7 @@ define void @test_vfmaddps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2
 ; BDVER12-NEXT:    vfmaddps (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER12-NEXT:    vfmaddps %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER34-LABEL: test_vfmaddps_128:
 ; BDVER34:       # %bb.0:
@@ -120,11 +120,11 @@ define void @test_vfmaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
 ; BDVER12-NEXT:    vfmaddps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
-; BDVER12-NEXT:    vfmaddps (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:0.50]
-; BDVER12-NEXT:    vfmaddps %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfmaddps (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER12-NEXT:    vfmaddps %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    vzeroupper # sched: [100:0.33]
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER34-LABEL: test_vfmaddps_256:
 ; BDVER34:       # %bb.0:
@@ -156,7 +156,7 @@ define void @test_vfmaddsd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; BDVER12-NEXT:    vfmaddsd (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER12-NEXT:    vfmaddsd %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER34-LABEL: test_vfmaddsd_128:
 ; BDVER34:       # %bb.0:
@@ -187,7 +187,7 @@ define void @test_vfmaddss_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2
 ; BDVER12-NEXT:    vfmaddss (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER12-NEXT:    vfmaddss %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER34-LABEL: test_vfmaddss_128:
 ; BDVER34:       # %bb.0:
@@ -222,7 +222,7 @@ define void @test_vfmaddsubpd_128(<2 x double> %a0, <2 x double> %a1, <2 x doubl
 ; BDVER12-NEXT:    vfmaddsubpd (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER12-NEXT:    vfmaddsubpd %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER34-LABEL: test_vfmaddsubpd_128:
 ; BDVER34:       # %bb.0:
@@ -251,11 +251,11 @@ define void @test_vfmaddsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x doubl
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
 ; BDVER12-NEXT:    vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
-; BDVER12-NEXT:    vfmaddsubpd (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:0.50]
-; BDVER12-NEXT:    vfmaddsubpd %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfmaddsubpd (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER12-NEXT:    vfmaddsubpd %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    vzeroupper # sched: [100:0.33]
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER34-LABEL: test_vfmaddsubpd_256:
 ; BDVER34:       # %bb.0:
@@ -287,7 +287,7 @@ define void @test_vfmaddsubps_128(<4 x float> %a0, <4 x float> %a1, <4 x float>
 ; BDVER12-NEXT:    vfmaddsubps (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER12-NEXT:    vfmaddsubps %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER34-LABEL: test_vfmaddsubps_128:
 ; BDVER34:       # %bb.0:
@@ -316,11 +316,11 @@ define void @test_vfmaddsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float>
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
 ; BDVER12-NEXT:    vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
-; BDVER12-NEXT:    vfmaddsubps (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:0.50]
-; BDVER12-NEXT:    vfmaddsubps %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfmaddsubps (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER12-NEXT:    vfmaddsubps %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    vzeroupper # sched: [100:0.33]
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER34-LABEL: test_vfmaddsubps_256:
 ; BDVER34:       # %bb.0:
@@ -356,7 +356,7 @@ define void @test_vfmsubaddpd_128(<2 x double> %a0, <2 x double> %a1, <2 x doubl
 ; BDVER12-NEXT:    vfmsubaddpd (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER12-NEXT:    vfmsubaddpd %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER34-LABEL: test_vfmsubaddpd_128:
 ; BDVER34:       # %bb.0:
@@ -385,11 +385,11 @@ define void @test_vfmsubaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x doubl
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
 ; BDVER12-NEXT:    vfmsubaddpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
-; BDVER12-NEXT:    vfmsubaddpd (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:0.50]
-; BDVER12-NEXT:    vfmsubaddpd %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfmsubaddpd (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER12-NEXT:    vfmsubaddpd %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    vzeroupper # sched: [100:0.33]
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER34-LABEL: test_vfmsubaddpd_256:
 ; BDVER34:       # %bb.0:
@@ -421,7 +421,7 @@ define void @test_vfmsubaddps_128(<4 x float> %a0, <4 x float> %a1, <4 x float>
 ; BDVER12-NEXT:    vfmsubaddps (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER12-NEXT:    vfmsubaddps %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER34-LABEL: test_vfmsubaddps_128:
 ; BDVER34:       # %bb.0:
@@ -450,11 +450,11 @@ define void @test_vfmsubaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float>
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
 ; BDVER12-NEXT:    vfmsubaddps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
-; BDVER12-NEXT:    vfmsubaddps (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:0.50]
-; BDVER12-NEXT:    vfmsubaddps %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfmsubaddps (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER12-NEXT:    vfmsubaddps %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    vzeroupper # sched: [100:0.33]
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER34-LABEL: test_vfmsubaddps_256:
 ; BDVER34:       # %bb.0:
@@ -490,7 +490,7 @@ define void @test_vfmsubpd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; BDVER12-NEXT:    vfmsubpd (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER12-NEXT:    vfmsubpd %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER34-LABEL: test_vfmsubpd_128:
 ; BDVER34:       # %bb.0:
@@ -519,11 +519,11 @@ define void @test_vfmsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
 ; BDVER12-NEXT:    vfmsubpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
-; BDVER12-NEXT:    vfmsubpd (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:0.50]
-; BDVER12-NEXT:    vfmsubpd %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfmsubpd (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER12-NEXT:    vfmsubpd %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    vzeroupper # sched: [100:0.33]
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER34-LABEL: test_vfmsubpd_256:
 ; BDVER34:       # %bb.0:
@@ -555,7 +555,7 @@ define void @test_vfmsubps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2
 ; BDVER12-NEXT:    vfmsubps (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER12-NEXT:    vfmsubps %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER34-LABEL: test_vfmsubps_128:
 ; BDVER34:       # %bb.0:
@@ -584,11 +584,11 @@ define void @test_vfmsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
 ; BDVER12-NEXT:    vfmsubps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
-; BDVER12-NEXT:    vfmsubps (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:0.50]
-; BDVER12-NEXT:    vfmsubps %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfmsubps (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER12-NEXT:    vfmsubps %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    vzeroupper # sched: [100:0.33]
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER34-LABEL: test_vfmsubps_256:
 ; BDVER34:       # %bb.0:
@@ -620,7 +620,7 @@ define void @test_vfmsubsd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; BDVER12-NEXT:    vfmsubsd (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER12-NEXT:    vfmsubsd %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER34-LABEL: test_vfmsubsd_128:
 ; BDVER34:       # %bb.0:
@@ -651,7 +651,7 @@ define void @test_vfmsubss_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2
 ; BDVER12-NEXT:    vfmsubss (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER12-NEXT:    vfmsubss %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER34-LABEL: test_vfmsubss_128:
 ; BDVER34:       # %bb.0:
@@ -686,7 +686,7 @@ define void @test_vfnmaddpd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; BDVER12-NEXT:    vfnmaddpd (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER12-NEXT:    vfnmaddpd %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER34-LABEL: test_vfnmaddpd_128:
 ; BDVER34:       # %bb.0:
@@ -715,11 +715,11 @@ define void @test_vfnmaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
 ; BDVER12-NEXT:    vfnmaddpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
-; BDVER12-NEXT:    vfnmaddpd (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:0.50]
-; BDVER12-NEXT:    vfnmaddpd %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfnmaddpd (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER12-NEXT:    vfnmaddpd %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    vzeroupper # sched: [100:0.33]
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER34-LABEL: test_vfnmaddpd_256:
 ; BDVER34:       # %bb.0:
@@ -751,7 +751,7 @@ define void @test_vfnmaddps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a
 ; BDVER12-NEXT:    vfnmaddps (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER12-NEXT:    vfnmaddps %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER34-LABEL: test_vfnmaddps_128:
 ; BDVER34:       # %bb.0:
@@ -780,11 +780,11 @@ define void @test_vfnmaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
 ; BDVER12-NEXT:    vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
-; BDVER12-NEXT:    vfnmaddps (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:0.50]
-; BDVER12-NEXT:    vfnmaddps %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfnmaddps (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER12-NEXT:    vfnmaddps %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    vzeroupper # sched: [100:0.33]
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER34-LABEL: test_vfnmaddps_256:
 ; BDVER34:       # %bb.0:
@@ -816,7 +816,7 @@ define void @test_vfnmaddsd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; BDVER12-NEXT:    vfnmaddsd (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER12-NEXT:    vfnmaddsd %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER34-LABEL: test_vfnmaddsd_128:
 ; BDVER34:       # %bb.0:
@@ -847,7 +847,7 @@ define void @test_vfnmaddss_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a
 ; BDVER12-NEXT:    vfnmaddss (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER12-NEXT:    vfnmaddss %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER34-LABEL: test_vfnmaddss_128:
 ; BDVER34:       # %bb.0:
@@ -882,7 +882,7 @@ define void @test_vfnmsubpd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; BDVER12-NEXT:    vfnmsubpd (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER12-NEXT:    vfnmsubpd %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER34-LABEL: test_vfnmsubpd_128:
 ; BDVER34:       # %bb.0:
@@ -911,11 +911,11 @@ define void @test_vfnmsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
 ; BDVER12-NEXT:    vfnmsubpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
-; BDVER12-NEXT:    vfnmsubpd (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:0.50]
-; BDVER12-NEXT:    vfnmsubpd %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfnmsubpd (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER12-NEXT:    vfnmsubpd %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    vzeroupper # sched: [100:0.33]
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER34-LABEL: test_vfnmsubpd_256:
 ; BDVER34:       # %bb.0:
@@ -947,7 +947,7 @@ define void @test_vfnmsubps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a
 ; BDVER12-NEXT:    vfnmsubps (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER12-NEXT:    vfnmsubps %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER34-LABEL: test_vfnmsubps_128:
 ; BDVER34:       # %bb.0:
@@ -976,11 +976,11 @@ define void @test_vfnmsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
 ; BDVER12-NEXT:    vfnmsubps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
-; BDVER12-NEXT:    vfnmsubps (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:0.50]
-; BDVER12-NEXT:    vfnmsubps %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfnmsubps (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER12-NEXT:    vfnmsubps %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    vzeroupper # sched: [100:0.33]
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER34-LABEL: test_vfnmsubps_256:
 ; BDVER34:       # %bb.0:
@@ -1012,7 +1012,7 @@ define void @test_vfnmsubsd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; BDVER12-NEXT:    vfnmsubsd (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER12-NEXT:    vfnmsubsd %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER34-LABEL: test_vfnmsubsd_128:
 ; BDVER34:       # %bb.0:
@@ -1043,7 +1043,7 @@ define void @test_vfnmsubss_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a
 ; BDVER12-NEXT:    vfnmsubss (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER12-NEXT:    vfnmsubss %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER34-LABEL: test_vfnmsubss_128:
 ; BDVER34:       # %bb.0:
diff --git a/test/CodeGen/X86/lea32-schedule.ll b/test/CodeGen/X86/lea32-schedule.ll
index ab509f57463..1e8ebfb766b 100644
--- a/test/CodeGen/X86/lea32-schedule.ll
+++ b/test/CodeGen/X86/lea32-schedule.ll
@@ -8,7 +8,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell   | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake     | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl         | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=-slow-3ops-lea      | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2      | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2      | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1      | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
 
@@ -65,7 +65,7 @@ define i32 @test_lea_offset(i32) {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    # kill: def $edi killed $edi def $rdi
 ; BDVER2-NEXT:    leal -24(%rdi), %eax # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_lea_offset:
 ; BTVER2:       # %bb.0:
@@ -135,7 +135,7 @@ define i32 @test_lea_offset_big(i32) {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    # kill: def $edi killed $edi def $rdi
 ; BDVER2-NEXT:    leal 1024(%rdi), %eax # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_lea_offset_big:
 ; BTVER2:       # %bb.0:
@@ -214,7 +214,7 @@ define i32 @test_lea_add(i32, i32) {
 ; BDVER2-NEXT:    # kill: def $esi killed $esi def $rsi
 ; BDVER2-NEXT:    # kill: def $edi killed $edi def $rdi
 ; BDVER2-NEXT:    leal (%rdi,%rsi), %eax # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_lea_add:
 ; BTVER2:       # %bb.0:
@@ -299,7 +299,7 @@ define i32 @test_lea_add_offset(i32, i32) {
 ; BDVER2-NEXT:    # kill: def $esi killed $esi def $rsi
 ; BDVER2-NEXT:    # kill: def $edi killed $edi def $rdi
 ; BDVER2-NEXT:    leal 16(%rdi,%rsi), %eax # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_lea_add_offset:
 ; BTVER2:       # %bb.0:
@@ -390,7 +390,7 @@ define i32 @test_lea_add_offset_big(i32, i32) {
 ; BDVER2-NEXT:    # kill: def $esi killed $esi def $rsi
 ; BDVER2-NEXT:    # kill: def $edi killed $edi def $rdi
 ; BDVER2-NEXT:    leal -4096(%rdi,%rsi), %eax # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_lea_add_offset_big:
 ; BTVER2:       # %bb.0:
@@ -463,7 +463,7 @@ define i32 @test_lea_mul(i32) {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    # kill: def $edi killed $edi def $rdi
 ; BDVER2-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_lea_mul:
 ; BTVER2:       # %bb.0:
@@ -538,7 +538,7 @@ define i32 @test_lea_mul_offset(i32) {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    # kill: def $edi killed $edi def $rdi
 ; BDVER2-NEXT:    leal -32(%rdi,%rdi,2), %eax # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_lea_mul_offset:
 ; BTVER2:       # %bb.0:
@@ -619,7 +619,7 @@ define i32 @test_lea_mul_offset_big(i32) {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    # kill: def $edi killed $edi def $rdi
 ; BDVER2-NEXT:    leal 10000(%rdi,%rdi,8), %eax # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_lea_mul_offset_big:
 ; BTVER2:       # %bb.0:
@@ -698,7 +698,7 @@ define i32 @test_lea_add_scale(i32, i32) {
 ; BDVER2-NEXT:    # kill: def $esi killed $esi def $rsi
 ; BDVER2-NEXT:    # kill: def $edi killed $edi def $rdi
 ; BDVER2-NEXT:    leal (%rdi,%rsi,2), %eax # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_lea_add_scale:
 ; BTVER2:       # %bb.0:
@@ -784,7 +784,7 @@ define i32 @test_lea_add_scale_offset(i32, i32) {
 ; BDVER2-NEXT:    # kill: def $esi killed $esi def $rsi
 ; BDVER2-NEXT:    # kill: def $edi killed $edi def $rdi
 ; BDVER2-NEXT:    leal 96(%rdi,%rsi,4), %eax # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_lea_add_scale_offset:
 ; BTVER2:       # %bb.0:
@@ -876,7 +876,7 @@ define i32 @test_lea_add_scale_offset_big(i32, i32) {
 ; BDVER2-NEXT:    # kill: def $esi killed $esi def $rsi
 ; BDVER2-NEXT:    # kill: def $edi killed $edi def $rdi
 ; BDVER2-NEXT:    leal -1200(%rdi,%rsi,8), %eax # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_lea_add_scale_offset_big:
 ; BTVER2:       # %bb.0:
diff --git a/test/CodeGen/X86/lea64-schedule.ll b/test/CodeGen/X86/lea64-schedule.ll
index 82269aaeadd..cac9d2b5062 100644
--- a/test/CodeGen/X86/lea64-schedule.ll
+++ b/test/CodeGen/X86/lea64-schedule.ll
@@ -8,7 +8,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell   | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake     | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl         | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=-slow-3ops-lea      | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2      | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2      | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1      | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
 
@@ -57,7 +57,7 @@ define i64 @test_lea_offset(i64) {
 ; BDVER2-LABEL: test_lea_offset:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    leaq -24(%rdi), %rax # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_lea_offset:
 ; BTVER2:       # %bb.0:
@@ -117,7 +117,7 @@ define i64 @test_lea_offset_big(i64) {
 ; BDVER2-LABEL: test_lea_offset_big:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    leaq 1024(%rdi), %rax # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_lea_offset_big:
 ; BTVER2:       # %bb.0:
@@ -178,7 +178,7 @@ define i64 @test_lea_add(i64, i64) {
 ; BDVER2-LABEL: test_lea_add:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    leaq (%rdi,%rsi), %rax # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_lea_add:
 ; BTVER2:       # %bb.0:
@@ -243,7 +243,7 @@ define i64 @test_lea_add_offset(i64, i64) {
 ; BDVER2-LABEL: test_lea_add_offset:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    leaq 16(%rdi,%rsi), %rax # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_lea_add_offset:
 ; BTVER2:       # %bb.0:
@@ -314,7 +314,7 @@ define i64 @test_lea_add_offset_big(i64, i64) {
 ; BDVER2-LABEL: test_lea_add_offset_big:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    leaq -4096(%rdi,%rsi), %rax # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_lea_add_offset_big:
 ; BTVER2:       # %bb.0:
@@ -375,7 +375,7 @@ define i64 @test_lea_mul(i64) {
 ; BDVER2-LABEL: test_lea_mul:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_lea_mul:
 ; BTVER2:       # %bb.0:
@@ -440,7 +440,7 @@ define i64 @test_lea_mul_offset(i64) {
 ; BDVER2-LABEL: test_lea_mul_offset:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    leaq -32(%rdi,%rdi,2), %rax # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_lea_mul_offset:
 ; BTVER2:       # %bb.0:
@@ -511,7 +511,7 @@ define i64 @test_lea_mul_offset_big(i64) {
 ; BDVER2-LABEL: test_lea_mul_offset_big:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    leaq 10000(%rdi,%rdi,8), %rax # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_lea_mul_offset_big:
 ; BTVER2:       # %bb.0:
@@ -572,7 +572,7 @@ define i64 @test_lea_add_scale(i64, i64) {
 ; BDVER2-LABEL: test_lea_add_scale:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    leaq (%rdi,%rsi,2), %rax # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_lea_add_scale:
 ; BTVER2:       # %bb.0:
@@ -638,7 +638,7 @@ define i64 @test_lea_add_scale_offset(i64, i64) {
 ; BDVER2-LABEL: test_lea_add_scale_offset:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    leaq 96(%rdi,%rsi,4), %rax # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_lea_add_scale_offset:
 ; BTVER2:       # %bb.0:
@@ -710,7 +710,7 @@ define i64 @test_lea_add_scale_offset_big(i64, i64) {
 ; BDVER2-LABEL: test_lea_add_scale_offset_big:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    leaq -1200(%rdi,%rsi,8), %rax # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_lea_add_scale_offset_big:
 ; BTVER2:       # %bb.0:
diff --git a/test/CodeGen/X86/lwp-intrinsics.ll b/test/CodeGen/X86/lwp-intrinsics.ll
index 2d293651bcb..a9b8a65d2dd 100644
--- a/test/CodeGen/X86/lwp-intrinsics.ll
+++ b/test/CodeGen/X86/lwp-intrinsics.ll
@@ -40,14 +40,41 @@ define i8* @test_slwpcb(i8 *%a0) nounwind {
 }
 
 define i8 @test_lwpins32_rri(i32 %a0, i32 %a1) nounwind {
-; X86-LABEL: test_lwpins32_rri:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    addl %ecx, %ecx
-; X86-NEXT:    lwpins $-1985229329, %ecx, %eax # imm = 0x89ABCDEF
-; X86-NEXT:    setb %al
-; X86-NEXT:    retl
+; X86_BDVER1-LABEL: test_lwpins32_rri:
+; X86_BDVER1:       # %bb.0:
+; X86_BDVER1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86_BDVER1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86_BDVER1-NEXT:    addl %ecx, %ecx
+; X86_BDVER1-NEXT:    lwpins $-1985229329, %ecx, %eax # imm = 0x89ABCDEF
+; X86_BDVER1-NEXT:    setb %al
+; X86_BDVER1-NEXT:    retl
+;
+; X86_BDVER2-LABEL: test_lwpins32_rri:
+; X86_BDVER2:       # %bb.0:
+; X86_BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86_BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86_BDVER2-NEXT:    addl %ecx, %ecx
+; X86_BDVER2-NEXT:    lwpins $-1985229329, %ecx, %eax # imm = 0x89ABCDEF
+; X86_BDVER2-NEXT:    setb %al
+; X86_BDVER2-NEXT:    retl
+;
+; X86_BDVER3-LABEL: test_lwpins32_rri:
+; X86_BDVER3:       # %bb.0:
+; X86_BDVER3-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86_BDVER3-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86_BDVER3-NEXT:    addl %ecx, %ecx
+; X86_BDVER3-NEXT:    lwpins $-1985229329, %ecx, %eax # imm = 0x89ABCDEF
+; X86_BDVER3-NEXT:    setb %al
+; X86_BDVER3-NEXT:    retl
+;
+; X86_BDVER4-LABEL: test_lwpins32_rri:
+; X86_BDVER4:       # %bb.0:
+; X86_BDVER4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86_BDVER4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86_BDVER4-NEXT:    addl %ecx, %ecx
+; X86_BDVER4-NEXT:    lwpins $-1985229329, %ecx, %eax # imm = 0x89ABCDEF
+; X86_BDVER4-NEXT:    setb %al
+; X86_BDVER4-NEXT:    retl
 ;
 ; X64-LABEL: test_lwpins32_rri:
 ; X64:       # %bb.0:
@@ -80,13 +107,37 @@ define i8 @test_lwpins32_rmi(i32 %a0, i32 *%p1) nounwind {
 }
 
 define void @test_lwpval32_rri(i32 %a0, i32 %a1) nounwind {
-; X86-LABEL: test_lwpval32_rri:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    addl %ecx, %ecx
-; X86-NEXT:    lwpval $-19088744, %ecx, %eax # imm = 0xFEDCBA98
-; X86-NEXT:    retl
+; X86_BDVER1-LABEL: test_lwpval32_rri:
+; X86_BDVER1:       # %bb.0:
+; X86_BDVER1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86_BDVER1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86_BDVER1-NEXT:    addl %ecx, %ecx
+; X86_BDVER1-NEXT:    lwpval $-19088744, %ecx, %eax # imm = 0xFEDCBA98
+; X86_BDVER1-NEXT:    retl
+;
+; X86_BDVER2-LABEL: test_lwpval32_rri:
+; X86_BDVER2:       # %bb.0:
+; X86_BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86_BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86_BDVER2-NEXT:    addl %ecx, %ecx
+; X86_BDVER2-NEXT:    lwpval $-19088744, %ecx, %eax # imm = 0xFEDCBA98
+; X86_BDVER2-NEXT:    retl
+;
+; X86_BDVER3-LABEL: test_lwpval32_rri:
+; X86_BDVER3:       # %bb.0:
+; X86_BDVER3-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86_BDVER3-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86_BDVER3-NEXT:    addl %ecx, %ecx
+; X86_BDVER3-NEXT:    lwpval $-19088744, %ecx, %eax # imm = 0xFEDCBA98
+; X86_BDVER3-NEXT:    retl
+;
+; X86_BDVER4-LABEL: test_lwpval32_rri:
+; X86_BDVER4:       # %bb.0:
+; X86_BDVER4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86_BDVER4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86_BDVER4-NEXT:    addl %ecx, %ecx
+; X86_BDVER4-NEXT:    lwpval $-19088744, %ecx, %eax # imm = 0xFEDCBA98
+; X86_BDVER4-NEXT:    retl
 ;
 ; X64-LABEL: test_lwpval32_rri:
 ; X64:       # %bb.0:
diff --git a/test/CodeGen/X86/lwp-schedule.ll b/test/CodeGen/X86/lwp-schedule.ll
index 11699e7d37f..c10282cfb8e 100644
--- a/test/CodeGen/X86/lwp-schedule.ll
+++ b/test/CodeGen/X86/lwp-schedule.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule -mcpu=x86-64 -mattr=+lwp | FileCheck %s --check-prefix=GENERIC
-; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule -mcpu=x86-64 -mattr=+lwp | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER12 --check-prefix=BDVER1
-; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule -mcpu=x86-64 -mattr=+lwp | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER12 --check-prefix=BDVER2
+; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule -mcpu=bdver1 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER12 --check-prefix=BDVER1
+; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule -mcpu=bdver2 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER12 --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule -mcpu=bdver3 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER3
 ; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule -mcpu=bdver4 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER4
 
@@ -13,8 +13,8 @@ define void @test_llwpcb(i8 *%a0) nounwind {
 ;
 ; BDVER12-LABEL: test_llwpcb:
 ; BDVER12:       # %bb.0:
-; BDVER12-NEXT:    llwpcb %rdi # sched: [100:0.33]
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    llwpcb %rdi # sched: [100:0.50]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_llwpcb:
 ; BDVER3:       # %bb.0:
@@ -37,8 +37,8 @@ define i8* @test_slwpcb(i8 *%a0) nounwind {
 ;
 ; BDVER12-LABEL: test_slwpcb:
 ; BDVER12:       # %bb.0:
-; BDVER12-NEXT:    slwpcb %rax # sched: [100:0.33]
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    slwpcb %rax # sched: [100:0.50]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_slwpcb:
 ; BDVER3:       # %bb.0:
@@ -64,11 +64,11 @@ define i8 @test_lwpins32_rri(i32 %a0, i32 %a1) nounwind {
 ;
 ; BDVER12-LABEL: test_lwpins32_rri:
 ; BDVER12:       # %bb.0:
-; BDVER12-NEXT:    addl %esi, %esi # sched: [1:0.33]
+; BDVER12-NEXT:    addl %esi, %esi # sched: [1:0.50]
 ; BDVER12-NEXT:    lwpins $-1985229329, %esi, %edi # imm = 0x89ABCDEF
-; BDVER12-NEXT:    # sched: [100:0.33]
+; BDVER12-NEXT:    # sched: [100:0.50]
 ; BDVER12-NEXT:    setb %al # sched: [1:0.50]
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_lwpins32_rri:
 ; BDVER3:       # %bb.0:
@@ -99,9 +99,9 @@ define i8 @test_lwpins32_rmi(i32 %a0, i32 *%p1) nounwind {
 ; BDVER12-LABEL: test_lwpins32_rmi:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    lwpins $1985229328, (%rsi), %edi # imm = 0x76543210
-; BDVER12-NEXT:    # sched: [100:0.33]
+; BDVER12-NEXT:    # sched: [100:0.50]
 ; BDVER12-NEXT:    setb %al # sched: [1:0.50]
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_lwpins32_rmi:
 ; BDVER3:       # %bb.0:
@@ -130,9 +130,9 @@ define i8 @test_lwpins64_rri(i64 %a0, i32 %a1) nounwind {
 ; BDVER12-LABEL: test_lwpins64_rri:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    lwpins $-1985229329, %esi, %rdi # imm = 0x89ABCDEF
-; BDVER12-NEXT:    # sched: [100:0.33]
+; BDVER12-NEXT:    # sched: [100:0.50]
 ; BDVER12-NEXT:    setb %al # sched: [1:0.50]
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_lwpins64_rri:
 ; BDVER3:       # %bb.0:
@@ -160,9 +160,9 @@ define i8 @test_lwpins64_rmi(i64 %a0, i32 *%p1) nounwind {
 ; BDVER12-LABEL: test_lwpins64_rmi:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    lwpins $1985229328, (%rsi), %rdi # imm = 0x76543210
-; BDVER12-NEXT:    # sched: [100:0.33]
+; BDVER12-NEXT:    # sched: [100:0.50]
 ; BDVER12-NEXT:    setb %al # sched: [1:0.50]
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_lwpins64_rmi:
 ; BDVER3:       # %bb.0:
@@ -190,10 +190,10 @@ define void @test_lwpval32_rri(i32 %a0, i32 %a1) nounwind {
 ;
 ; BDVER12-LABEL: test_lwpval32_rri:
 ; BDVER12:       # %bb.0:
-; BDVER12-NEXT:    addl %esi, %esi # sched: [1:0.33]
+; BDVER12-NEXT:    addl %esi, %esi # sched: [1:0.50]
 ; BDVER12-NEXT:    lwpval $-19088744, %esi, %edi # imm = 0xFEDCBA98
-; BDVER12-NEXT:    # sched: [100:0.33]
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    # sched: [100:0.50]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_lwpval32_rri:
 ; BDVER3:       # %bb.0:
@@ -221,8 +221,8 @@ define void @test_lwpval32_rmi(i32 %a0, i32 *%p1) nounwind {
 ; BDVER12-LABEL: test_lwpval32_rmi:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    lwpval $305419896, (%rsi), %edi # imm = 0x12345678
-; BDVER12-NEXT:    # sched: [100:0.33]
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    # sched: [100:0.50]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_lwpval32_rmi:
 ; BDVER3:       # %bb.0:
@@ -248,8 +248,8 @@ define void @test_lwpval64_rri(i64 %a0, i32 %a1) nounwind {
 ; BDVER12-LABEL: test_lwpval64_rri:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    lwpval $-19088744, %esi, %rdi # imm = 0xFEDCBA98
-; BDVER12-NEXT:    # sched: [100:0.33]
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    # sched: [100:0.50]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_lwpval64_rri:
 ; BDVER3:       # %bb.0:
@@ -274,8 +274,8 @@ define void @test_lwpval64_rmi(i64 %a0, i32 *%p1) nounwind {
 ; BDVER12-LABEL: test_lwpval64_rmi:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    lwpval $305419896, (%rsi), %rdi # imm = 0x12345678
-; BDVER12-NEXT:    # sched: [100:0.33]
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    # sched: [100:0.50]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_lwpval64_rmi:
 ; BDVER3:       # %bb.0:
diff --git a/test/CodeGen/X86/lzcnt-schedule.ll b/test/CodeGen/X86/lzcnt-schedule.ll
index 15622ad3426..d8f9416b92b 100644
--- a/test/CodeGen/X86/lzcnt-schedule.ll
+++ b/test/CodeGen/X86/lzcnt-schedule.ll
@@ -4,7 +4,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake   | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl       | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+lzcnt    | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2    | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2    | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1    | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
 
@@ -43,11 +43,11 @@ define i16 @test_ctlz_i16(i16 zeroext %a0, i16 *%a1) {
 ;
 ; BDVER2-LABEL: test_ctlz_i16:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    lzcntw (%rsi), %cx # sched: [8:1.00]
-; BDVER2-NEXT:    lzcntw %di, %ax # sched: [3:1.00]
-; BDVER2-NEXT:    orl %ecx, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    lzcntw (%rsi), %cx # sched: [6:0.50]
+; BDVER2-NEXT:    lzcntw %di, %ax # sched: [2:0.50]
+; BDVER2-NEXT:    orl %ecx, %eax # sched: [1:0.50]
 ; BDVER2-NEXT:    # kill: def $ax killed $ax killed $eax
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_ctlz_i16:
 ; BTVER2:       # %bb.0:
@@ -103,10 +103,10 @@ define i32 @test_ctlz_i32(i32 %a0, i32 *%a1) {
 ;
 ; BDVER2-LABEL: test_ctlz_i32:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    lzcntl (%rsi), %ecx # sched: [8:1.00]
-; BDVER2-NEXT:    lzcntl %edi, %eax # sched: [3:1.00]
-; BDVER2-NEXT:    orl %ecx, %eax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    lzcntl (%rsi), %ecx # sched: [6:0.50]
+; BDVER2-NEXT:    lzcntl %edi, %eax # sched: [2:0.50]
+; BDVER2-NEXT:    orl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_ctlz_i32:
 ; BTVER2:       # %bb.0:
@@ -160,10 +160,10 @@ define i64 @test_ctlz_i64(i64 %a0, i64 *%a1) {
 ;
 ; BDVER2-LABEL: test_ctlz_i64:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    lzcntq (%rsi), %rcx # sched: [8:1.00]
-; BDVER2-NEXT:    lzcntq %rdi, %rax # sched: [3:1.00]
-; BDVER2-NEXT:    orq %rcx, %rax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    lzcntq (%rsi), %rcx # sched: [6:0.50]
+; BDVER2-NEXT:    lzcntq %rdi, %rax # sched: [2:0.50]
+; BDVER2-NEXT:    orq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_ctlz_i64:
 ; BTVER2:       # %bb.0:
diff --git a/test/CodeGen/X86/memset.ll b/test/CodeGen/X86/memset.ll
index 6d5c4cd0f8a..02fd8806254 100644
--- a/test/CodeGen/X86/memset.ll
+++ b/test/CodeGen/X86/memset.ll
@@ -22,7 +22,6 @@ define void @t() nounwind  {
 ; X86-NEXT:    calll _foo
 ; X86-NEXT:    addl $44, %esp
 ; X86-NEXT:    retl
-; X86-NEXT:    ## -- End function
 ;
 ; XMM-LABEL: t:
 ; XMM:       ## %bb.0: ## %entry
@@ -35,7 +34,6 @@ define void @t() nounwind  {
 ; XMM-NEXT:    calll _foo
 ; XMM-NEXT:    addl $60, %esp
 ; XMM-NEXT:    retl
-; XMM-NEXT:    ## -- End function
 ;
 ; YMM-LABEL: t:
 ; YMM:       ## %bb.0: ## %entry
@@ -44,15 +42,14 @@ define void @t() nounwind  {
 ; YMM-NEXT:    andl $-32, %esp
 ; YMM-NEXT:    subl $96, %esp
 ; YMM-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; YMM-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%esp)
 ; YMM-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; YMM-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%esp)
 ; YMM-NEXT:    movl %eax, (%esp)
 ; YMM-NEXT:    vzeroupper
 ; YMM-NEXT:    calll _foo
 ; YMM-NEXT:    movl %ebp, %esp
 ; YMM-NEXT:    popl %ebp
 ; YMM-NEXT:    retl
-; YMM-NEXT:    ## -- End function
 entry:
 	%up_mvd = alloca [8 x %struct.x]		; <[8 x %struct.x]*> [#uses=2]
 	%up_mvd116 = getelementptr [8 x %struct.x], [8 x %struct.x]* %up_mvd, i32 0, i32 0		; <%struct.x*> [#uses=1]
diff --git a/test/CodeGen/X86/mmx-schedule.ll b/test/CodeGen/X86/mmx-schedule.ll
index 0fec25a8fa6..6a8a487d7c1 100644
--- a/test/CodeGen/X86/mmx-schedule.ll
+++ b/test/CodeGen/X86/mmx-schedule.ll
@@ -8,7 +8,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+avx -mattr=+ssse3 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
 
@@ -79,11 +79,11 @@ define i64 @test_cvtpd2pi(<2 x double> %a0, <2 x double>* %a1) optsize {
 ;
 ; BDVER2-LABEL: test_cvtpd2pi:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    cvtpd2pi (%rdi), %mm0 # sched: [10:1.00]
-; BDVER2-NEXT:    cvtpd2pi %xmm0, %mm1 # sched: [4:1.00]
-; BDVER2-NEXT:    por %mm1, %mm0 # sched: [1:0.33]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    cvtpd2pi (%rdi), %mm1 # sched: [13:1.00]
+; BDVER2-NEXT:    cvtpd2pi %xmm0, %mm0 # sched: [6:1.00]
+; BDVER2-NEXT:    por %mm0, %mm1 # sched: [2:0.50]
+; BDVER2-NEXT:    movq %mm1, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_cvtpd2pi:
 ; BTVER2:       # %bb.0:
@@ -168,10 +168,10 @@ define <2 x double> @test_cvtpi2pd(x86_mmx %a0, x86_mmx* %a1) optsize {
 ;
 ; BDVER2-LABEL: test_cvtpi2pd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    cvtpi2pd %mm0, %xmm0 # sched: [4:1.00]
-; BDVER2-NEXT:    cvtpi2pd (%rdi), %xmm1 # sched: [10:1.00]
-; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    cvtpi2pd (%rdi), %xmm1 # sched: [13:1.00]
+; BDVER2-NEXT:    cvtpi2pd %mm0, %xmm0 # sched: [6:1.00]
+; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_cvtpi2pd:
 ; BTVER2:       # %bb.0:
@@ -253,10 +253,10 @@ define <4 x float> @test_cvtpi2ps(x86_mmx %a0, x86_mmx* %a1, <4 x float> %a2, <4
 ;
 ; BDVER2-LABEL: test_cvtpi2ps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    cvtpi2ps %mm0, %xmm0 # sched: [3:1.00]
 ; BDVER2-NEXT:    cvtpi2ps (%rdi), %xmm1 # sched: [9:1.00]
-; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    cvtpi2ps %mm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_cvtpi2ps:
 ; BTVER2:       # %bb.0:
@@ -346,11 +346,11 @@ define i64 @test_cvtps2pi(<4 x float> %a0, <4 x float>* %a1) optsize {
 ;
 ; BDVER2-LABEL: test_cvtps2pi:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    cvtps2pi %xmm0, %mm0 # sched: [3:1.00]
 ; BDVER2-NEXT:    cvtps2pi (%rdi), %mm1 # sched: [9:1.00]
-; BDVER2-NEXT:    por %mm0, %mm1 # sched: [1:0.33]
-; BDVER2-NEXT:    movq %mm1, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    cvtps2pi %xmm0, %mm0 # sched: [4:1.00]
+; BDVER2-NEXT:    por %mm0, %mm1 # sched: [2:0.50]
+; BDVER2-NEXT:    movq %mm1, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_cvtps2pi:
 ; BTVER2:       # %bb.0:
@@ -443,11 +443,11 @@ define i64 @test_cvttpd2pi(<2 x double> %a0, <2 x double>* %a1) optsize {
 ;
 ; BDVER2-LABEL: test_cvttpd2pi:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    cvttpd2pi (%rdi), %mm0 # sched: [10:1.00]
-; BDVER2-NEXT:    cvttpd2pi %xmm0, %mm1 # sched: [4:1.00]
-; BDVER2-NEXT:    por %mm1, %mm0 # sched: [1:0.33]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    cvttpd2pi (%rdi), %mm1 # sched: [13:1.00]
+; BDVER2-NEXT:    cvttpd2pi %xmm0, %mm0 # sched: [6:1.00]
+; BDVER2-NEXT:    por %mm0, %mm1 # sched: [2:0.50]
+; BDVER2-NEXT:    movq %mm1, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_cvttpd2pi:
 ; BTVER2:       # %bb.0:
@@ -540,11 +540,11 @@ define i64 @test_cvttps2pi(<4 x float> %a0, <4 x float>* %a1) optsize {
 ;
 ; BDVER2-LABEL: test_cvttps2pi:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    cvttps2pi %xmm0, %mm0 # sched: [3:1.00]
 ; BDVER2-NEXT:    cvttps2pi (%rdi), %mm1 # sched: [9:1.00]
-; BDVER2-NEXT:    por %mm0, %mm1 # sched: [1:0.33]
-; BDVER2-NEXT:    movq %mm1, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    cvttps2pi %xmm0, %mm0 # sched: [4:1.00]
+; BDVER2-NEXT:    por %mm0, %mm1 # sched: [2:0.50]
+; BDVER2-NEXT:    movq %mm1, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_cvttps2pi:
 ; BTVER2:       # %bb.0:
@@ -613,8 +613,8 @@ define void @test_emms() optsize {
 ;
 ; BDVER2-LABEL: test_emms:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    emms # sched: [31:10.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    emms # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_emms:
 ; BTVER2:       # %bb.0:
@@ -673,8 +673,8 @@ define void @test_maskmovq(x86_mmx %a0, x86_mmx %a1, i8* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_maskmovq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    maskmovq %mm1, %mm0 # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    maskmovq %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_maskmovq:
 ; BTVER2:       # %bb.0:
@@ -781,14 +781,14 @@ define i32 @test_movd(x86_mmx %a0, i32 %a1, i32 *%a2) {
 ;
 ; BDVER2-LABEL: test_movd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    movd %edi, %mm1 # sched: [1:1.00]
+; BDVER2-NEXT:    movd %edi, %mm1 # sched: [10:0.50]
 ; BDVER2-NEXT:    movd (%rsi), %mm2 # sched: [5:0.50]
-; BDVER2-NEXT:    paddd %mm1, %mm2 # sched: [3:1.00]
-; BDVER2-NEXT:    paddd %mm2, %mm0 # sched: [3:1.00]
-; BDVER2-NEXT:    movd %mm2, %ecx # sched: [2:1.00]
-; BDVER2-NEXT:    movd %mm0, %eax # sched: [2:1.00]
-; BDVER2-NEXT:    movl %ecx, (%rsi) # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    paddd %mm1, %mm2 # sched: [2:0.50]
+; BDVER2-NEXT:    paddd %mm2, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    movd %mm2, %ecx # sched: [10:1.00]
+; BDVER2-NEXT:    movd %mm0, %eax # sched: [10:1.00]
+; BDVER2-NEXT:    movl %ecx, (%rsi) # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_movd:
 ; BTVER2:       # %bb.0:
@@ -885,10 +885,10 @@ define i64 @test_movdq2q(<2 x i64> %a0) optsize {
 ;
 ; BDVER2-LABEL: test_movdq2q:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    movdq2q %xmm0, %mm0 # sched: [2:1.00]
-; BDVER2-NEXT:    paddd %mm0, %mm0 # sched: [3:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    movdq2q %xmm0, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    paddd %mm0, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_movdq2q:
 ; BTVER2:       # %bb.0:
@@ -953,8 +953,8 @@ define void @test_movntq(x86_mmx* %a0, x86_mmx %a1) optsize {
 ;
 ; BDVER2-LABEL: test_movntq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    movntq %mm0, (%rdi) # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    movntq %mm0, (%rdi) # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_movntq:
 ; BTVER2:       # %bb.0:
@@ -1032,9 +1032,9 @@ define void @test_movq(i64 *%a0) {
 ; BDVER2-LABEL: test_movq:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    movq (%rdi), %mm0 # sched: [5:0.50]
-; BDVER2-NEXT:    paddd %mm0, %mm0 # sched: [3:1.00]
-; BDVER2-NEXT:    movq %mm0, (%rdi) # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    paddd %mm0, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    movq %mm0, (%rdi) # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_movq:
 ; BTVER2:       # %bb.0:
@@ -1100,8 +1100,8 @@ define <2 x i64> @test_movq2dq(x86_mmx %a0) optsize {
 ;
 ; BDVER2-LABEL: test_movq2dq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    movq2dq %mm0, %xmm0 # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    movq2dq %mm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_movq2dq:
 ; BTVER2:       # %bb.0:
@@ -1176,10 +1176,10 @@ define i64 @test_pabsb(x86_mmx *%a0) optsize {
 ;
 ; BDVER2-LABEL: test_pabsb:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    pabsb (%rdi), %mm0 # sched: [6:0.50]
-; BDVER2-NEXT:    pabsb %mm0, %mm0 # sched: [1:0.50]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    pabsb (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    pabsb %mm0, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_pabsb:
 ; BTVER2:       # %bb.0:
@@ -1261,10 +1261,10 @@ define i64 @test_pabsd(x86_mmx *%a0) optsize {
 ;
 ; BDVER2-LABEL: test_pabsd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    pabsd (%rdi), %mm0 # sched: [6:0.50]
-; BDVER2-NEXT:    pabsd %mm0, %mm0 # sched: [1:0.50]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    pabsd (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    pabsd %mm0, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_pabsd:
 ; BTVER2:       # %bb.0:
@@ -1346,10 +1346,10 @@ define i64 @test_pabsw(x86_mmx *%a0) optsize {
 ;
 ; BDVER2-LABEL: test_pabsw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    pabsw (%rdi), %mm0 # sched: [6:0.50]
-; BDVER2-NEXT:    pabsw %mm0, %mm0 # sched: [1:0.50]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    pabsw (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    pabsw %mm0, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_pabsw:
 ; BTVER2:       # %bb.0:
@@ -1431,10 +1431,10 @@ define i64 @test_packssdw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_packssdw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    packssdw %mm1, %mm0 # sched: [1:1.00]
-; BDVER2-NEXT:    packssdw (%rdi), %mm0 # sched: [6:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    packssdw %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    packssdw (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_packssdw:
 ; BTVER2:       # %bb.0:
@@ -1516,10 +1516,10 @@ define i64 @test_packsswb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_packsswb:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    packsswb %mm1, %mm0 # sched: [1:1.00]
-; BDVER2-NEXT:    packsswb (%rdi), %mm0 # sched: [6:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    packsswb %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    packsswb (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_packsswb:
 ; BTVER2:       # %bb.0:
@@ -1601,10 +1601,10 @@ define i64 @test_packuswb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_packuswb:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    packuswb %mm1, %mm0 # sched: [1:1.00]
-; BDVER2-NEXT:    packuswb (%rdi), %mm0 # sched: [6:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    packuswb %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    packuswb (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_packuswb:
 ; BTVER2:       # %bb.0:
@@ -1686,10 +1686,10 @@ define i64 @test_paddb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_paddb:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    paddb %mm1, %mm0 # sched: [3:1.00]
-; BDVER2-NEXT:    paddb (%rdi), %mm0 # sched: [8:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    paddb %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    paddb (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_paddb:
 ; BTVER2:       # %bb.0:
@@ -1771,10 +1771,10 @@ define i64 @test_paddd(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_paddd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    paddd %mm1, %mm0 # sched: [3:1.00]
-; BDVER2-NEXT:    paddd (%rdi), %mm0 # sched: [8:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    paddd %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    paddd (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_paddd:
 ; BTVER2:       # %bb.0:
@@ -1856,10 +1856,10 @@ define i64 @test_paddq(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_paddq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    paddq %mm1, %mm0 # sched: [1:0.50]
+; BDVER2-NEXT:    paddq %mm1, %mm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    paddq (%rdi), %mm0 # sched: [7:0.50]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_paddq:
 ; BTVER2:       # %bb.0:
@@ -1941,10 +1941,10 @@ define i64 @test_paddsb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_paddsb:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    paddsb %mm1, %mm0 # sched: [3:1.00]
-; BDVER2-NEXT:    paddsb (%rdi), %mm0 # sched: [8:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    paddsb %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    paddsb (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_paddsb:
 ; BTVER2:       # %bb.0:
@@ -2026,10 +2026,10 @@ define i64 @test_paddsw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_paddsw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    paddsw %mm1, %mm0 # sched: [3:1.00]
-; BDVER2-NEXT:    paddsw (%rdi), %mm0 # sched: [8:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    paddsw %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    paddsw (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_paddsw:
 ; BTVER2:       # %bb.0:
@@ -2111,10 +2111,10 @@ define i64 @test_paddusb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_paddusb:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    paddusb %mm1, %mm0 # sched: [3:1.00]
-; BDVER2-NEXT:    paddusb (%rdi), %mm0 # sched: [8:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    paddusb %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    paddusb (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_paddusb:
 ; BTVER2:       # %bb.0:
@@ -2196,10 +2196,10 @@ define i64 @test_paddusw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_paddusw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    paddusw %mm1, %mm0 # sched: [3:1.00]
-; BDVER2-NEXT:    paddusw (%rdi), %mm0 # sched: [8:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    paddusw %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    paddusw (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_paddusw:
 ; BTVER2:       # %bb.0:
@@ -2281,10 +2281,10 @@ define i64 @test_paddw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_paddw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    paddw %mm1, %mm0 # sched: [3:1.00]
-; BDVER2-NEXT:    paddw (%rdi), %mm0 # sched: [8:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    paddw %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    paddw (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_paddw:
 ; BTVER2:       # %bb.0:
@@ -2366,10 +2366,10 @@ define i64 @test_palignr(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_palignr:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    palignr $1, %mm1, %mm0 # sched: [1:0.50]
-; BDVER2-NEXT:    palignr $1, (%rdi), %mm0 # sched: [6:0.50]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    palignr $1, %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    palignr $1, (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_palignr:
 ; BTVER2:       # %bb.0:
@@ -2451,10 +2451,10 @@ define i64 @test_pand(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_pand:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    pand %mm1, %mm0 # sched: [1:0.33]
-; BDVER2-NEXT:    pand (%rdi), %mm0 # sched: [6:0.50]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    pand %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    pand (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_pand:
 ; BTVER2:       # %bb.0:
@@ -2536,10 +2536,10 @@ define i64 @test_pandn(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_pandn:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    pandn %mm1, %mm0 # sched: [1:0.33]
-; BDVER2-NEXT:    pandn (%rdi), %mm0 # sched: [6:0.50]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    pandn %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    pandn (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_pandn:
 ; BTVER2:       # %bb.0:
@@ -2621,10 +2621,10 @@ define i64 @test_pavgb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_pavgb:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    pavgb %mm1, %mm0 # sched: [3:1.00]
-; BDVER2-NEXT:    pavgb (%rdi), %mm0 # sched: [8:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    pavgb %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    pavgb (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_pavgb:
 ; BTVER2:       # %bb.0:
@@ -2706,10 +2706,10 @@ define i64 @test_pavgw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_pavgw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    pavgw %mm1, %mm0 # sched: [3:1.00]
-; BDVER2-NEXT:    pavgw (%rdi), %mm0 # sched: [8:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    pavgw %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    pavgw (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_pavgw:
 ; BTVER2:       # %bb.0:
@@ -2791,10 +2791,10 @@ define i64 @test_pcmpeqb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_pcmpeqb:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    pcmpeqb %mm1, %mm0 # sched: [3:1.00]
-; BDVER2-NEXT:    pcmpeqb (%rdi), %mm0 # sched: [8:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    pcmpeqb %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    pcmpeqb (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_pcmpeqb:
 ; BTVER2:       # %bb.0:
@@ -2876,10 +2876,10 @@ define i64 @test_pcmpeqd(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_pcmpeqd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    pcmpeqd %mm1, %mm0 # sched: [3:1.00]
-; BDVER2-NEXT:    pcmpeqd (%rdi), %mm0 # sched: [8:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    pcmpeqd %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    pcmpeqd (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_pcmpeqd:
 ; BTVER2:       # %bb.0:
@@ -2961,10 +2961,10 @@ define i64 @test_pcmpeqw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_pcmpeqw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    pcmpeqw %mm1, %mm0 # sched: [3:1.00]
-; BDVER2-NEXT:    pcmpeqw (%rdi), %mm0 # sched: [8:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    pcmpeqw %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    pcmpeqw (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_pcmpeqw:
 ; BTVER2:       # %bb.0:
@@ -3046,10 +3046,10 @@ define i64 @test_pcmpgtb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_pcmpgtb:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    pcmpgtb %mm1, %mm0 # sched: [3:1.00]
-; BDVER2-NEXT:    pcmpgtb (%rdi), %mm0 # sched: [8:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    pcmpgtb %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    pcmpgtb (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_pcmpgtb:
 ; BTVER2:       # %bb.0:
@@ -3131,10 +3131,10 @@ define i64 @test_pcmpgtd(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_pcmpgtd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    pcmpgtd %mm1, %mm0 # sched: [3:1.00]
-; BDVER2-NEXT:    pcmpgtd (%rdi), %mm0 # sched: [8:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    pcmpgtd %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    pcmpgtd (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_pcmpgtd:
 ; BTVER2:       # %bb.0:
@@ -3216,10 +3216,10 @@ define i64 @test_pcmpgtw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_pcmpgtw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    pcmpgtw %mm1, %mm0 # sched: [3:1.00]
-; BDVER2-NEXT:    pcmpgtw (%rdi), %mm0 # sched: [8:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    pcmpgtw %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    pcmpgtw (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_pcmpgtw:
 ; BTVER2:       # %bb.0:
@@ -3285,8 +3285,8 @@ define i32 @test_pextrw(x86_mmx %a0) optsize {
 ;
 ; BDVER2-LABEL: test_pextrw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    pextrw $0, %mm0, %eax # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    pextrw $0, %mm0, %eax # sched: [13:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_pextrw:
 ; BTVER2:       # %bb.0:
@@ -3361,10 +3361,10 @@ define i64 @test_phaddd(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_phaddd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    phaddd %mm1, %mm0 # sched: [3:1.50]
-; BDVER2-NEXT:    phaddd (%rdi), %mm0 # sched: [8:1.50]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    phaddd %mm1, %mm0 # sched: [5:0.50]
+; BDVER2-NEXT:    phaddd (%rdi), %mm0 # sched: [10:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_phaddd:
 ; BTVER2:       # %bb.0:
@@ -3446,10 +3446,10 @@ define i64 @test_phaddsw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_phaddsw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    phaddsw %mm1, %mm0 # sched: [3:1.50]
-; BDVER2-NEXT:    phaddsw (%rdi), %mm0 # sched: [8:1.50]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    phaddsw %mm1, %mm0 # sched: [5:0.50]
+; BDVER2-NEXT:    phaddsw (%rdi), %mm0 # sched: [10:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_phaddsw:
 ; BTVER2:       # %bb.0:
@@ -3531,10 +3531,10 @@ define i64 @test_phaddw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_phaddw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    phaddw %mm1, %mm0 # sched: [3:1.50]
-; BDVER2-NEXT:    phaddw (%rdi), %mm0 # sched: [8:1.50]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    phaddw %mm1, %mm0 # sched: [5:0.50]
+; BDVER2-NEXT:    phaddw (%rdi), %mm0 # sched: [10:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_phaddw:
 ; BTVER2:       # %bb.0:
@@ -3616,10 +3616,10 @@ define i64 @test_phsubd(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_phsubd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    phsubd %mm1, %mm0 # sched: [3:1.50]
-; BDVER2-NEXT:    phsubd (%rdi), %mm0 # sched: [8:1.50]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    phsubd %mm1, %mm0 # sched: [5:0.50]
+; BDVER2-NEXT:    phsubd (%rdi), %mm0 # sched: [10:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_phsubd:
 ; BTVER2:       # %bb.0:
@@ -3701,10 +3701,10 @@ define i64 @test_phsubsw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_phsubsw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    phsubsw %mm1, %mm0 # sched: [3:1.50]
-; BDVER2-NEXT:    phsubsw (%rdi), %mm0 # sched: [8:1.50]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    phsubsw %mm1, %mm0 # sched: [5:0.50]
+; BDVER2-NEXT:    phsubsw (%rdi), %mm0 # sched: [10:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_phsubsw:
 ; BTVER2:       # %bb.0:
@@ -3786,10 +3786,10 @@ define i64 @test_phsubw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_phsubw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    phsubw %mm1, %mm0 # sched: [3:1.50]
-; BDVER2-NEXT:    phsubw (%rdi), %mm0 # sched: [8:1.50]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    phsubw %mm1, %mm0 # sched: [5:0.50]
+; BDVER2-NEXT:    phsubw (%rdi), %mm0 # sched: [10:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_phsubw:
 ; BTVER2:       # %bb.0:
@@ -3879,11 +3879,11 @@ define i64 @test_pinsrw(x86_mmx %a0, i32 %a1, i16* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_pinsrw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    pinsrw $0, %edi, %mm0 # sched: [2:1.00]
 ; BDVER2-NEXT:    movswl (%rsi), %eax # sched: [5:0.50]
-; BDVER2-NEXT:    pinsrw $1, %eax, %mm0 # sched: [2:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    pinsrw $0, %edi, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    pinsrw $1, %eax, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_pinsrw:
 ; BTVER2:       # %bb.0:
@@ -3968,10 +3968,10 @@ define i64 @test_pmaddwd(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_pmaddwd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    pmaddwd %mm1, %mm0 # sched: [5:1.00]
-; BDVER2-NEXT:    pmaddwd (%rdi), %mm0 # sched: [10:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    pmaddwd %mm1, %mm0 # sched: [4:1.00]
+; BDVER2-NEXT:    pmaddwd (%rdi), %mm0 # sched: [9:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_pmaddwd:
 ; BTVER2:       # %bb.0:
@@ -4053,10 +4053,10 @@ define i64 @test_pmaddubsw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_pmaddubsw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    pmaddubsw %mm1, %mm0 # sched: [5:1.00]
-; BDVER2-NEXT:    pmaddubsw (%rdi), %mm0 # sched: [10:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    pmaddubsw %mm1, %mm0 # sched: [4:1.00]
+; BDVER2-NEXT:    pmaddubsw (%rdi), %mm0 # sched: [9:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_pmaddubsw:
 ; BTVER2:       # %bb.0:
@@ -4138,10 +4138,10 @@ define i64 @test_pmaxsw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_pmaxsw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    pmaxsw %mm1, %mm0 # sched: [3:1.00]
-; BDVER2-NEXT:    pmaxsw (%rdi), %mm0 # sched: [8:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    pmaxsw %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    pmaxsw (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_pmaxsw:
 ; BTVER2:       # %bb.0:
@@ -4223,10 +4223,10 @@ define i64 @test_pmaxub(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_pmaxub:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    pmaxub %mm1, %mm0 # sched: [3:1.00]
-; BDVER2-NEXT:    pmaxub (%rdi), %mm0 # sched: [8:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    pmaxub %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    pmaxub (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_pmaxub:
 ; BTVER2:       # %bb.0:
@@ -4308,10 +4308,10 @@ define i64 @test_pminsw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_pminsw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    pminsw %mm1, %mm0 # sched: [3:1.00]
-; BDVER2-NEXT:    pminsw (%rdi), %mm0 # sched: [8:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    pminsw %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    pminsw (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_pminsw:
 ; BTVER2:       # %bb.0:
@@ -4393,10 +4393,10 @@ define i64 @test_pminub(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_pminub:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    pminub %mm1, %mm0 # sched: [3:1.00]
-; BDVER2-NEXT:    pminub (%rdi), %mm0 # sched: [8:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    pminub %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    pminub (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_pminub:
 ; BTVER2:       # %bb.0:
@@ -4462,8 +4462,8 @@ define i32 @test_pmovmskb(x86_mmx %a0) optsize {
 ;
 ; BDVER2-LABEL: test_pmovmskb:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    pmovmskb %mm0, %eax # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    pmovmskb %mm0, %eax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_pmovmskb:
 ; BTVER2:       # %bb.0:
@@ -4538,10 +4538,10 @@ define i64 @test_pmulhrsw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_pmulhrsw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    pmulhrsw %mm1, %mm0 # sched: [5:1.00]
-; BDVER2-NEXT:    pmulhrsw (%rdi), %mm0 # sched: [10:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    pmulhrsw %mm1, %mm0 # sched: [4:1.00]
+; BDVER2-NEXT:    pmulhrsw (%rdi), %mm0 # sched: [9:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_pmulhrsw:
 ; BTVER2:       # %bb.0:
@@ -4623,10 +4623,10 @@ define i64 @test_pmulhw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_pmulhw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    pmulhw %mm1, %mm0 # sched: [5:1.00]
-; BDVER2-NEXT:    pmulhw (%rdi), %mm0 # sched: [10:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    pmulhw %mm1, %mm0 # sched: [4:1.00]
+; BDVER2-NEXT:    pmulhw (%rdi), %mm0 # sched: [9:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_pmulhw:
 ; BTVER2:       # %bb.0:
@@ -4708,10 +4708,10 @@ define i64 @test_pmulhuw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_pmulhuw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    pmulhuw %mm1, %mm0 # sched: [5:1.00]
-; BDVER2-NEXT:    pmulhuw (%rdi), %mm0 # sched: [10:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    pmulhuw %mm1, %mm0 # sched: [4:1.00]
+; BDVER2-NEXT:    pmulhuw (%rdi), %mm0 # sched: [9:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_pmulhuw:
 ; BTVER2:       # %bb.0:
@@ -4793,10 +4793,10 @@ define i64 @test_pmullw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_pmullw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    pmullw %mm1, %mm0 # sched: [5:1.00]
-; BDVER2-NEXT:    pmullw (%rdi), %mm0 # sched: [10:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    pmullw %mm1, %mm0 # sched: [4:1.00]
+; BDVER2-NEXT:    pmullw (%rdi), %mm0 # sched: [9:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_pmullw:
 ; BTVER2:       # %bb.0:
@@ -4878,10 +4878,10 @@ define i64 @test_pmuludq(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_pmuludq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    pmuludq %mm1, %mm0 # sched: [5:1.00]
-; BDVER2-NEXT:    pmuludq (%rdi), %mm0 # sched: [10:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    pmuludq %mm1, %mm0 # sched: [4:1.00]
+; BDVER2-NEXT:    pmuludq (%rdi), %mm0 # sched: [9:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_pmuludq:
 ; BTVER2:       # %bb.0:
@@ -4963,10 +4963,10 @@ define i64 @test_por(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_por:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    por %mm1, %mm0 # sched: [1:0.33]
-; BDVER2-NEXT:    por (%rdi), %mm0 # sched: [6:0.50]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    por %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    por (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_por:
 ; BTVER2:       # %bb.0:
@@ -5048,10 +5048,10 @@ define i64 @test_psadbw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_psadbw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    psadbw %mm1, %mm0 # sched: [5:1.00]
-; BDVER2-NEXT:    psadbw (%rdi), %mm0 # sched: [10:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    psadbw %mm1, %mm0 # sched: [4:0.50]
+; BDVER2-NEXT:    psadbw (%rdi), %mm0 # sched: [9:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_psadbw:
 ; BTVER2:       # %bb.0:
@@ -5133,10 +5133,10 @@ define i64 @test_pshufb(x86_mmx %a0, x86_mmx %a1, x86_mmx *%a2) optsize {
 ;
 ; BDVER2-LABEL: test_pshufb:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    pshufb %mm1, %mm0 # sched: [1:0.50]
-; BDVER2-NEXT:    pshufb (%rdi), %mm0 # sched: [6:0.50]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    pshufb %mm1, %mm0 # sched: [3:2.00]
+; BDVER2-NEXT:    pshufb (%rdi), %mm0 # sched: [8:2.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_pshufb:
 ; BTVER2:       # %bb.0:
@@ -5218,10 +5218,10 @@ define i64 @test_pshufw(x86_mmx *%a0) optsize {
 ;
 ; BDVER2-LABEL: test_pshufw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    pshufw $0, (%rdi), %mm0 # mm0 = mem[0,0,0,0] sched: [6:1.00]
-; BDVER2-NEXT:    pshufw $0, %mm0, %mm0 # mm0 = mm0[0,0,0,0] sched: [1:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    pshufw $0, (%rdi), %mm0 # mm0 = mem[0,0,0,0] sched: [7:0.50]
+; BDVER2-NEXT:    pshufw $0, %mm0, %mm0 # mm0 = mm0[0,0,0,0] sched: [2:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_pshufw:
 ; BTVER2:       # %bb.0:
@@ -5303,10 +5303,10 @@ define i64 @test_psignb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_psignb:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    psignb %mm1, %mm0 # sched: [1:0.50]
-; BDVER2-NEXT:    psignb (%rdi), %mm0 # sched: [6:0.50]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    psignb %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    psignb (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_psignb:
 ; BTVER2:       # %bb.0:
@@ -5388,10 +5388,10 @@ define i64 @test_psignd(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_psignd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    psignd %mm1, %mm0 # sched: [1:0.50]
-; BDVER2-NEXT:    psignd (%rdi), %mm0 # sched: [6:0.50]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    psignd %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    psignd (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_psignd:
 ; BTVER2:       # %bb.0:
@@ -5473,10 +5473,10 @@ define i64 @test_psignw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_psignw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    psignw %mm1, %mm0 # sched: [1:0.50]
-; BDVER2-NEXT:    psignw (%rdi), %mm0 # sched: [6:0.50]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    psignw %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    psignw (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_psignw:
 ; BTVER2:       # %bb.0:
@@ -5566,11 +5566,11 @@ define i64 @test_pslld(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_pslld:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    pslld %mm1, %mm0 # sched: [1:1.00]
-; BDVER2-NEXT:    pslld (%rdi), %mm0 # sched: [6:1.00]
-; BDVER2-NEXT:    pslld $7, %mm0 # sched: [1:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    pslld %mm1, %mm0 # sched: [3:0.50]
+; BDVER2-NEXT:    pslld (%rdi), %mm0 # sched: [8:0.50]
+; BDVER2-NEXT:    pslld $7, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_pslld:
 ; BTVER2:       # %bb.0:
@@ -5664,11 +5664,11 @@ define i64 @test_psllq(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_psllq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    psllq %mm1, %mm0 # sched: [1:1.00]
-; BDVER2-NEXT:    psllq (%rdi), %mm0 # sched: [6:1.00]
-; BDVER2-NEXT:    psllq $7, %mm0 # sched: [1:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    psllq %mm1, %mm0 # sched: [3:0.50]
+; BDVER2-NEXT:    psllq (%rdi), %mm0 # sched: [8:0.50]
+; BDVER2-NEXT:    psllq $7, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_psllq:
 ; BTVER2:       # %bb.0:
@@ -5762,11 +5762,11 @@ define i64 @test_psllw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_psllw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    psllw %mm1, %mm0 # sched: [1:1.00]
-; BDVER2-NEXT:    psllw (%rdi), %mm0 # sched: [6:1.00]
-; BDVER2-NEXT:    psllw $7, %mm0 # sched: [1:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    psllw %mm1, %mm0 # sched: [3:0.50]
+; BDVER2-NEXT:    psllw (%rdi), %mm0 # sched: [8:0.50]
+; BDVER2-NEXT:    psllw $7, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_psllw:
 ; BTVER2:       # %bb.0:
@@ -5860,11 +5860,11 @@ define i64 @test_psrad(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_psrad:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    psrad %mm1, %mm0 # sched: [1:1.00]
-; BDVER2-NEXT:    psrad (%rdi), %mm0 # sched: [6:1.00]
-; BDVER2-NEXT:    psrad $7, %mm0 # sched: [1:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    psrad %mm1, %mm0 # sched: [3:0.50]
+; BDVER2-NEXT:    psrad (%rdi), %mm0 # sched: [8:0.50]
+; BDVER2-NEXT:    psrad $7, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_psrad:
 ; BTVER2:       # %bb.0:
@@ -5958,11 +5958,11 @@ define i64 @test_psraw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_psraw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    psraw %mm1, %mm0 # sched: [1:1.00]
-; BDVER2-NEXT:    psraw (%rdi), %mm0 # sched: [6:1.00]
-; BDVER2-NEXT:    psraw $7, %mm0 # sched: [1:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    psraw %mm1, %mm0 # sched: [3:0.50]
+; BDVER2-NEXT:    psraw (%rdi), %mm0 # sched: [8:0.50]
+; BDVER2-NEXT:    psraw $7, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_psraw:
 ; BTVER2:       # %bb.0:
@@ -6056,11 +6056,11 @@ define i64 @test_psrld(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_psrld:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    psrld %mm1, %mm0 # sched: [1:1.00]
-; BDVER2-NEXT:    psrld (%rdi), %mm0 # sched: [6:1.00]
-; BDVER2-NEXT:    psrld $7, %mm0 # sched: [1:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    psrld %mm1, %mm0 # sched: [3:0.50]
+; BDVER2-NEXT:    psrld (%rdi), %mm0 # sched: [8:0.50]
+; BDVER2-NEXT:    psrld $7, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_psrld:
 ; BTVER2:       # %bb.0:
@@ -6154,11 +6154,11 @@ define i64 @test_psrlq(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_psrlq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    psrlq %mm1, %mm0 # sched: [1:1.00]
-; BDVER2-NEXT:    psrlq (%rdi), %mm0 # sched: [6:1.00]
-; BDVER2-NEXT:    psrlq $7, %mm0 # sched: [1:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    psrlq %mm1, %mm0 # sched: [3:0.50]
+; BDVER2-NEXT:    psrlq (%rdi), %mm0 # sched: [8:0.50]
+; BDVER2-NEXT:    psrlq $7, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_psrlq:
 ; BTVER2:       # %bb.0:
@@ -6252,11 +6252,11 @@ define i64 @test_psrlw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_psrlw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    psrlw %mm1, %mm0 # sched: [1:1.00]
-; BDVER2-NEXT:    psrlw (%rdi), %mm0 # sched: [6:1.00]
-; BDVER2-NEXT:    psrlw $7, %mm0 # sched: [1:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    psrlw %mm1, %mm0 # sched: [3:0.50]
+; BDVER2-NEXT:    psrlw (%rdi), %mm0 # sched: [8:0.50]
+; BDVER2-NEXT:    psrlw $7, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_psrlw:
 ; BTVER2:       # %bb.0:
@@ -6342,10 +6342,10 @@ define i64 @test_psubb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_psubb:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    psubb %mm1, %mm0 # sched: [3:1.00]
-; BDVER2-NEXT:    psubb (%rdi), %mm0 # sched: [8:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    psubb %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    psubb (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_psubb:
 ; BTVER2:       # %bb.0:
@@ -6427,10 +6427,10 @@ define i64 @test_psubd(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_psubd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    psubd %mm1, %mm0 # sched: [3:1.00]
-; BDVER2-NEXT:    psubd (%rdi), %mm0 # sched: [8:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    psubd %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    psubd (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_psubd:
 ; BTVER2:       # %bb.0:
@@ -6512,10 +6512,10 @@ define i64 @test_psubq(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_psubq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    psubq %mm1, %mm0 # sched: [3:1.00]
-; BDVER2-NEXT:    psubq (%rdi), %mm0 # sched: [8:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    psubq %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    psubq (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_psubq:
 ; BTVER2:       # %bb.0:
@@ -6597,10 +6597,10 @@ define i64 @test_psubsb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_psubsb:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    psubsb %mm1, %mm0 # sched: [3:1.00]
-; BDVER2-NEXT:    psubsb (%rdi), %mm0 # sched: [8:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    psubsb %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    psubsb (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_psubsb:
 ; BTVER2:       # %bb.0:
@@ -6682,10 +6682,10 @@ define i64 @test_psubsw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_psubsw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    psubsw %mm1, %mm0 # sched: [3:1.00]
-; BDVER2-NEXT:    psubsw (%rdi), %mm0 # sched: [8:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    psubsw %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    psubsw (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_psubsw:
 ; BTVER2:       # %bb.0:
@@ -6767,10 +6767,10 @@ define i64 @test_psubusb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_psubusb:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    psubusb %mm1, %mm0 # sched: [3:1.00]
-; BDVER2-NEXT:    psubusb (%rdi), %mm0 # sched: [8:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    psubusb %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    psubusb (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_psubusb:
 ; BTVER2:       # %bb.0:
@@ -6852,10 +6852,10 @@ define i64 @test_psubusw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_psubusw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    psubusw %mm1, %mm0 # sched: [3:1.00]
-; BDVER2-NEXT:    psubusw (%rdi), %mm0 # sched: [8:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    psubusw %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    psubusw (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_psubusw:
 ; BTVER2:       # %bb.0:
@@ -6937,10 +6937,10 @@ define i64 @test_psubw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_psubw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    psubw %mm1, %mm0 # sched: [3:1.00]
-; BDVER2-NEXT:    psubw (%rdi), %mm0 # sched: [8:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    psubw %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    psubw (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_psubw:
 ; BTVER2:       # %bb.0:
@@ -7022,10 +7022,10 @@ define i64 @test_punpckhbw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_punpckhbw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    punpckhbw %mm1, %mm0 # mm0 = mm0[4],mm1[4],mm0[5],mm1[5],mm0[6],mm1[6],mm0[7],mm1[7] sched: [1:1.00]
-; BDVER2-NEXT:    punpckhbw (%rdi), %mm0 # mm0 = mm0[4],mem[4],mm0[5],mem[5],mm0[6],mem[6],mm0[7],mem[7] sched: [6:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    punpckhbw %mm1, %mm0 # mm0 = mm0[4],mm1[4],mm0[5],mm1[5],mm0[6],mm1[6],mm0[7],mm1[7] sched: [2:0.50]
+; BDVER2-NEXT:    punpckhbw (%rdi), %mm0 # mm0 = mm0[4],mem[4],mm0[5],mem[5],mm0[6],mem[6],mm0[7],mem[7] sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_punpckhbw:
 ; BTVER2:       # %bb.0:
@@ -7107,10 +7107,10 @@ define i64 @test_punpckhdq(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_punpckhdq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    punpckhdq %mm1, %mm0 # mm0 = mm0[1],mm1[1] sched: [1:1.00]
-; BDVER2-NEXT:    punpckhdq (%rdi), %mm0 # mm0 = mm0[1],mem[1] sched: [6:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    punpckhdq %mm1, %mm0 # mm0 = mm0[1],mm1[1] sched: [2:0.50]
+; BDVER2-NEXT:    punpckhdq (%rdi), %mm0 # mm0 = mm0[1],mem[1] sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_punpckhdq:
 ; BTVER2:       # %bb.0:
@@ -7192,10 +7192,10 @@ define i64 @test_punpckhwd(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_punpckhwd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    punpckhwd %mm1, %mm0 # mm0 = mm0[2],mm1[2],mm0[3],mm1[3] sched: [1:1.00]
-; BDVER2-NEXT:    punpckhwd (%rdi), %mm0 # mm0 = mm0[2],mem[2],mm0[3],mem[3] sched: [6:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    punpckhwd %mm1, %mm0 # mm0 = mm0[2],mm1[2],mm0[3],mm1[3] sched: [2:0.50]
+; BDVER2-NEXT:    punpckhwd (%rdi), %mm0 # mm0 = mm0[2],mem[2],mm0[3],mem[3] sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_punpckhwd:
 ; BTVER2:       # %bb.0:
@@ -7277,10 +7277,10 @@ define i64 @test_punpcklbw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_punpcklbw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    punpcklbw %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1],mm0[2],mm1[2],mm0[3],mm1[3] sched: [1:1.00]
-; BDVER2-NEXT:    punpcklbw (%rdi), %mm0 # mm0 = mm0[0],mem[0],mm0[1],mem[1],mm0[2],mem[2],mm0[3],mem[3] sched: [6:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    punpcklbw %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1],mm0[2],mm1[2],mm0[3],mm1[3] sched: [2:0.50]
+; BDVER2-NEXT:    punpcklbw (%rdi), %mm0 # mm0 = mm0[0],mem[0],mm0[1],mem[1],mm0[2],mem[2],mm0[3],mem[3] sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_punpcklbw:
 ; BTVER2:       # %bb.0:
@@ -7362,10 +7362,10 @@ define i64 @test_punpckldq(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_punpckldq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    punpckldq %mm1, %mm0 # mm0 = mm0[0],mm1[0] sched: [1:1.00]
-; BDVER2-NEXT:    punpckldq (%rdi), %mm0 # mm0 = mm0[0],mem[0] sched: [6:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    punpckldq %mm1, %mm0 # mm0 = mm0[0],mm1[0] sched: [2:0.50]
+; BDVER2-NEXT:    punpckldq (%rdi), %mm0 # mm0 = mm0[0],mem[0] sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_punpckldq:
 ; BTVER2:       # %bb.0:
@@ -7447,10 +7447,10 @@ define i64 @test_punpcklwd(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_punpcklwd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1] sched: [1:1.00]
-; BDVER2-NEXT:    punpcklwd (%rdi), %mm0 # mm0 = mm0[0],mem[0],mm0[1],mem[1] sched: [6:1.00]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1] sched: [2:0.50]
+; BDVER2-NEXT:    punpcklwd (%rdi), %mm0 # mm0 = mm0[0],mem[0],mm0[1],mem[1] sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_punpcklwd:
 ; BTVER2:       # %bb.0:
@@ -7532,10 +7532,10 @@ define i64 @test_pxor(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BDVER2-LABEL: test_pxor:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    pxor %mm1, %mm0 # sched: [1:0.33]
-; BDVER2-NEXT:    pxor (%rdi), %mm0 # sched: [6:0.50]
-; BDVER2-NEXT:    movq %mm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    pxor %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    pxor (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_pxor:
 ; BTVER2:       # %bb.0:
diff --git a/test/CodeGen/X86/popcnt-schedule.ll b/test/CodeGen/X86/popcnt-schedule.ll
index d84d489a068..a039ba01a23 100644
--- a/test/CodeGen/X86/popcnt-schedule.ll
+++ b/test/CodeGen/X86/popcnt-schedule.ll
@@ -8,7 +8,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell   | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake     | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl         | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+popcnt      | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2      | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2      | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1      | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
 
@@ -63,11 +63,11 @@ define i16 @test_ctpop_i16(i16 zeroext %a0, i16 *%a1) {
 ;
 ; BDVER2-LABEL: test_ctpop_i16:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    popcntw (%rsi), %cx # sched: [9:1.00]
-; BDVER2-NEXT:    popcntw %di, %ax # sched: [3:1.00]
-; BDVER2-NEXT:    orl %ecx, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    popcntw (%rsi), %cx # sched: [8:0.50]
+; BDVER2-NEXT:    popcntw %di, %ax # sched: [4:0.50]
+; BDVER2-NEXT:    orl %ecx, %eax # sched: [1:0.50]
 ; BDVER2-NEXT:    # kill: def $ax killed $ax killed $eax
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_ctpop_i16:
 ; BTVER2:       # %bb.0:
@@ -137,10 +137,10 @@ define i32 @test_ctpop_i32(i32 %a0, i32 *%a1) {
 ;
 ; BDVER2-LABEL: test_ctpop_i32:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    popcntl (%rsi), %ecx # sched: [9:1.00]
-; BDVER2-NEXT:    popcntl %edi, %eax # sched: [3:1.00]
-; BDVER2-NEXT:    orl %ecx, %eax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    popcntl (%rsi), %ecx # sched: [8:0.50]
+; BDVER2-NEXT:    popcntl %edi, %eax # sched: [4:0.50]
+; BDVER2-NEXT:    orl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_ctpop_i32:
 ; BTVER2:       # %bb.0:
@@ -208,10 +208,10 @@ define i64 @test_ctpop_i64(i64 %a0, i64 *%a1) {
 ;
 ; BDVER2-LABEL: test_ctpop_i64:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    popcntq (%rsi), %rcx # sched: [9:1.00]
-; BDVER2-NEXT:    popcntq %rdi, %rax # sched: [3:1.00]
-; BDVER2-NEXT:    orq %rcx, %rax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    popcntq (%rsi), %rcx # sched: [8:0.50]
+; BDVER2-NEXT:    popcntq %rdi, %rax # sched: [4:0.50]
+; BDVER2-NEXT:    orq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_ctpop_i64:
 ; BTVER2:       # %bb.0:
diff --git a/test/CodeGen/X86/recip-fastmath.ll b/test/CodeGen/X86/recip-fastmath.ll
index 167abccc339..9e68636f904 100644
--- a/test/CodeGen/X86/recip-fastmath.ll
+++ b/test/CodeGen/X86/recip-fastmath.ll
@@ -2,7 +2,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2     | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE-RECIP
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx      | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX-RECIP
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=FMA-RECIP
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+fma4 -mattr=+avx -print-schedule      | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BDVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -print-schedule      | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 -print-schedule      | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=SANDY
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -print-schedule     | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL
@@ -40,9 +40,9 @@ define float @f32_no_estimate(float %x) #0 {
 ;
 ; BDVER2-LABEL: f32_no_estimate:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50]
-; BDVER2-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # sched: [14:14.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
+; BDVER2-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # sched: [9:9.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: f32_no_estimate:
 ; BTVER2:       # %bb.0:
@@ -117,7 +117,7 @@ define float @f32_one_step(float %x) #1 {
 ; BDVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
 ; BDVER2-NEXT:    vfnmaddss {{.*}}(%rip), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER2-NEXT:    vfmaddss %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: f32_one_step:
 ; BTVER2:       # %bb.0:
@@ -219,12 +219,12 @@ define float @f32_two_step(float %x) #2 {
 ; BDVER2-LABEL: f32_two_step:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; BDVER2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
+; BDVER2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmaddss %xmm2, %xmm1, %xmm0, %xmm3 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfmaddss %xmm1, %xmm3, %xmm1, %xmm1 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfmaddss %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: f32_two_step:
 ; BTVER2:       # %bb.0:
@@ -326,9 +326,9 @@ define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 {
 ;
 ; BDVER2-LABEL: v4f32_no_estimate:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovaps {{.*#+}} xmm1 = [1,1,1,1] sched: [6:0.50]
-; BDVER2-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # sched: [14:14.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmovaps {{.*#+}} xmm1 = [1,1,1,1] sched: [5:0.50]
+; BDVER2-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # sched: [9:9.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: v4f32_no_estimate:
 ; BTVER2:       # %bb.0:
@@ -403,7 +403,7 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
 ; BDVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
 ; BDVER2-NEXT:    vfnmaddps {{.*}}(%rip), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER2-NEXT:    vfmaddps %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: v4f32_one_step:
 ; BTVER2:       # %bb.0:
@@ -507,12 +507,12 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
 ; BDVER2-LABEL: v4f32_two_step:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; BDVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; BDVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1,1,1,1] sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmaddps %xmm2, %xmm1, %xmm0, %xmm3 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfmaddps %xmm1, %xmm3, %xmm1, %xmm1 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfmaddps %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: v4f32_two_step:
 ; BTVER2:       # %bb.0:
@@ -617,9 +617,9 @@ define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
 ;
 ; BDVER2-LABEL: v8f32_no_estimate:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovaps {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
-; BDVER2-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # sched: [29:28.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [5:0.50]
+; BDVER2-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # sched: [9:19.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: v8f32_no_estimate:
 ; BTVER2:       # %bb.0:
@@ -698,10 +698,10 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
 ;
 ; BDVER2-LABEL: v8f32_one_step:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; BDVER2-NEXT:    vfnmaddps {{.*}}(%rip), %ymm1, %ymm0, %ymm0 # sched: [10:0.50]
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [5:2.00]
+; BDVER2-NEXT:    vfnmaddps {{.*}}(%rip), %ymm1, %ymm0, %ymm0 # sched: [10:1.00]
 ; BDVER2-NEXT:    vfmaddps %ymm1, %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: v8f32_one_step:
 ; BTVER2:       # %bb.0:
@@ -817,13 +817,13 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
 ;
 ; BDVER2-LABEL: v8f32_two_step:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; BDVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [5:2.00]
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmaddps %ymm2, %ymm1, %ymm0, %ymm3 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfmaddps %ymm1, %ymm3, %ymm1, %ymm1 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfmaddps %ymm1, %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: v8f32_two_step:
 ; BTVER2:       # %bb.0:
@@ -936,10 +936,10 @@ define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 {
 ;
 ; BDVER2-LABEL: v16f32_no_estimate:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
-; BDVER2-NEXT:    vdivps %ymm0, %ymm2, %ymm0 # sched: [29:28.00]
-; BDVER2-NEXT:    vdivps %ymm1, %ymm2, %ymm1 # sched: [29:28.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:0.50]
+; BDVER2-NEXT:    vdivps %ymm0, %ymm2, %ymm0 # sched: [9:19.00]
+; BDVER2-NEXT:    vdivps %ymm1, %ymm2, %ymm1 # sched: [9:19.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: v16f32_no_estimate:
 ; BTVER2:       # %bb.0:
@@ -1045,14 +1045,14 @@ define <16 x float> @v16f32_one_step(<16 x float> %x) #1 {
 ;
 ; BDVER2-LABEL: v16f32_one_step:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vrcpps %ymm0, %ymm2 # sched: [7:2.00]
-; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
-; BDVER2-NEXT:    vrcpps %ymm1, %ymm4 # sched: [7:2.00]
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm2 # sched: [5:2.00]
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [5:0.50]
+; BDVER2-NEXT:    vrcpps %ymm1, %ymm4 # sched: [5:2.00]
 ; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm0, %ymm0 # sched: [5:0.50]
-; BDVER2-NEXT:    vfmaddps %ymm2, %ymm0, %ymm2, %ymm0 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm4, %ymm1, %ymm1 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %ymm2, %ymm0, %ymm2, %ymm0 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfmaddps %ymm4, %ymm1, %ymm4, %ymm1 # sched: [5:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: v16f32_one_step:
 ; BTVER2:       # %bb.0:
@@ -1226,18 +1226,18 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 {
 ;
 ; BDVER2-LABEL: v16f32_two_step:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vrcpps %ymm0, %ymm2 # sched: [7:2.00]
-; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm2 # sched: [5:2.00]
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm0, %ymm4 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfmaddps %ymm2, %ymm4, %ymm2, %ymm2 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm0, %ymm0 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfmaddps %ymm2, %ymm0, %ymm2, %ymm0 # sched: [5:0.50]
-; BDVER2-NEXT:    vrcpps %ymm1, %ymm2 # sched: [7:2.00]
+; BDVER2-NEXT:    vrcpps %ymm1, %ymm2 # sched: [5:2.00]
 ; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm1, %ymm4 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfmaddps %ymm2, %ymm4, %ymm2, %ymm2 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm1, %ymm1 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfmaddps %ymm2, %ymm1, %ymm2, %ymm1 # sched: [5:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: v16f32_two_step:
 ; BTVER2:       # %bb.0:
diff --git a/test/CodeGen/X86/recip-fastmath2.ll b/test/CodeGen/X86/recip-fastmath2.ll
index f669d5dc3f5..2a773f44956 100644
--- a/test/CodeGen/X86/recip-fastmath2.ll
+++ b/test/CodeGen/X86/recip-fastmath2.ll
@@ -2,7 +2,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -print-schedule      | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE-RECIP
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -print-schedule       | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX-RECIP
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -print-schedule  | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=FMA-RECIP
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+fma4 -mattr=+avx -print-schedule      | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BDVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -print-schedule      | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 -print-schedule      | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=SANDY
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -print-schedule     | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL
@@ -34,8 +34,8 @@ define float @f32_no_step_2(float %x) #3 {
 ; BDVER2-LABEL: f32_no_step_2:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
-; BDVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: f32_no_step_2:
 ; BTVER2:       # %bb.0:
@@ -113,8 +113,8 @@ define float @f32_one_step_2(float %x) #1 {
 ; BDVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
 ; BDVER2-NEXT:    vfnmaddss {{.*}}(%rip), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER2-NEXT:    vfmaddss %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; BDVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: f32_one_step_2:
 ; BTVER2:       # %bb.0:
@@ -216,9 +216,9 @@ define float @f32_one_step_2_divs(float %x) #1 {
 ; BDVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
 ; BDVER2-NEXT:    vfnmaddss {{.*}}(%rip), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER2-NEXT:    vfmaddss %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; BDVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:1.00]
+; BDVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:1.00]
 ; BDVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: f32_one_step_2_divs:
 ; BTVER2:       # %bb.0:
@@ -336,13 +336,13 @@ define float @f32_two_step_2(float %x) #2 {
 ; BDVER2-LABEL: f32_two_step_2:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; BDVER2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
+; BDVER2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmaddss %xmm2, %xmm1, %xmm0, %xmm3 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfmaddss %xmm1, %xmm3, %xmm1, %xmm1 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfmaddss %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; BDVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: f32_two_step_2:
 ; BTVER2:       # %bb.0:
@@ -465,8 +465,8 @@ define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
 ; BDVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
 ; BDVER2-NEXT:    vfnmaddps {{.*}}(%rip), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER2-NEXT:    vfmaddps %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; BDVER2-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: v4f32_one_step2:
 ; BTVER2:       # %bb.0:
@@ -570,9 +570,9 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
 ; BDVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
 ; BDVER2-NEXT:    vfnmaddps {{.*}}(%rip), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
 ; BDVER2-NEXT:    vfmaddps %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; BDVER2-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:1.00]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:1.00]
 ; BDVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: v4f32_one_step_2_divs:
 ; BTVER2:       # %bb.0:
@@ -692,13 +692,13 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
 ; BDVER2-LABEL: v4f32_two_step2:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; BDVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; BDVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1,1,1,1] sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmaddps %xmm2, %xmm1, %xmm0, %xmm3 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfmaddps %xmm1, %xmm3, %xmm1, %xmm1 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfmaddps %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; BDVER2-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: v4f32_two_step2:
 ; BTVER2:       # %bb.0:
@@ -826,11 +826,11 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
 ;
 ; BDVER2-LABEL: v8f32_one_step2:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; BDVER2-NEXT:    vfnmaddps {{.*}}(%rip), %ymm1, %ymm0, %ymm0 # sched: [10:0.50]
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [5:2.00]
+; BDVER2-NEXT:    vfnmaddps {{.*}}(%rip), %ymm1, %ymm0, %ymm0 # sched: [10:1.00]
 ; BDVER2-NEXT:    vfmaddps %ymm1, %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
-; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [10:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: v8f32_one_step2:
 ; BTVER2:       # %bb.0:
@@ -940,12 +940,12 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
 ;
 ; BDVER2-LABEL: v8f32_one_step_2_divs:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; BDVER2-NEXT:    vfnmaddps {{.*}}(%rip), %ymm1, %ymm0, %ymm0 # sched: [10:0.50]
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [5:2.00]
+; BDVER2-NEXT:    vfnmaddps {{.*}}(%rip), %ymm1, %ymm0, %ymm0 # sched: [10:1.00]
 ; BDVER2-NEXT:    vfmaddps %ymm1, %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
-; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [12:1.00]
-; BDVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [10:2.00]
+; BDVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: v8f32_one_step_2_divs:
 ; BTVER2:       # %bb.0:
@@ -1078,14 +1078,14 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
 ;
 ; BDVER2-LABEL: v8f32_two_step2:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; BDVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [5:2.00]
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmaddps %ymm2, %ymm1, %ymm0, %ymm3 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfmaddps %ymm1, %ymm3, %ymm1, %ymm1 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfmaddps %ymm1, %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
-; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [10:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: v8f32_two_step2:
 ; BTVER2:       # %bb.0:
@@ -1190,8 +1190,8 @@ define <8 x float> @v8f32_no_step(<8 x float> %x) #3 {
 ;
 ; BDVER2-LABEL: v8f32_no_step:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [7:2.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: v8f32_no_step:
 ; BTVER2:       # %bb.0:
@@ -1249,9 +1249,9 @@ define <8 x float> @v8f32_no_step2(<8 x float> %x) #3 {
 ;
 ; BDVER2-LABEL: v8f32_no_step2:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [7:2.00]
-; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [10:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: v8f32_no_step2:
 ; BTVER2:       # %bb.0:
@@ -1361,16 +1361,16 @@ define <16 x float> @v16f32_one_step2(<16 x float> %x) #1 {
 ;
 ; BDVER2-LABEL: v16f32_one_step2:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vrcpps %ymm1, %ymm2 # sched: [7:2.00]
-; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
-; BDVER2-NEXT:    vrcpps %ymm0, %ymm4 # sched: [7:2.00]
+; BDVER2-NEXT:    vrcpps %ymm1, %ymm2 # sched: [5:2.00]
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [5:0.50]
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm4 # sched: [5:2.00]
 ; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm1, %ymm1 # sched: [5:0.50]
-; BDVER2-NEXT:    vfmaddps %ymm2, %ymm1, %ymm2, %ymm1 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm4, %ymm0, %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %ymm2, %ymm1, %ymm2, %ymm1 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfmaddps %ymm4, %ymm0, %ymm4, %ymm0 # sched: [5:0.50]
-; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
-; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [10:2.00]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [10:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: v16f32_one_step2:
 ; BTVER2:       # %bb.0:
@@ -1532,18 +1532,18 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 {
 ;
 ; BDVER2-LABEL: v16f32_one_step_2_divs:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vrcpps %ymm0, %ymm2 # sched: [7:2.00]
-; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm2 # sched: [5:2.00]
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm0, %ymm0 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfmaddps %ymm2, %ymm0, %ymm2, %ymm0 # sched: [5:0.50]
-; BDVER2-NEXT:    vrcpps %ymm1, %ymm2 # sched: [7:2.00]
+; BDVER2-NEXT:    vrcpps %ymm1, %ymm2 # sched: [5:2.00]
 ; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm1, %ymm1 # sched: [5:0.50]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm3 # sched: [10:2.00]
 ; BDVER2-NEXT:    vfmaddps %ymm2, %ymm1, %ymm2, %ymm1 # sched: [5:0.50]
-; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm2 # sched: [12:1.00]
-; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm3 # sched: [12:1.00]
-; BDVER2-NEXT:    vmulps %ymm0, %ymm3, %ymm0 # sched: [5:1.00]
-; BDVER2-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [5:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm2 # sched: [10:2.00]
+; BDVER2-NEXT:    vmulps %ymm0, %ymm3, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: v16f32_one_step_2_divs:
 ; BTVER2:       # %bb.0:
@@ -1745,20 +1745,20 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
 ;
 ; BDVER2-LABEL: v16f32_two_step2:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vrcpps %ymm1, %ymm2 # sched: [7:2.00]
-; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; BDVER2-NEXT:    vrcpps %ymm1, %ymm2 # sched: [5:2.00]
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm1, %ymm4 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfmaddps %ymm2, %ymm4, %ymm2, %ymm2 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm1, %ymm1 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfmaddps %ymm2, %ymm1, %ymm2, %ymm1 # sched: [5:0.50]
-; BDVER2-NEXT:    vrcpps %ymm0, %ymm2 # sched: [7:2.00]
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm2 # sched: [5:2.00]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [10:2.00]
 ; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm0, %ymm4 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfmaddps %ymm2, %ymm4, %ymm2, %ymm2 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm0, %ymm0 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfmaddps %ymm2, %ymm0, %ymm2, %ymm0 # sched: [5:0.50]
-; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
-; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [10:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: v16f32_two_step2:
 ; BTVER2:       # %bb.0:
@@ -1904,9 +1904,9 @@ define <16 x float> @v16f32_no_step(<16 x float> %x) #3 {
 ;
 ; BDVER2-LABEL: v16f32_no_step:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [7:2.00]
-; BDVER2-NEXT:    vrcpps %ymm1, %ymm1 # sched: [7:2.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vrcpps %ymm1, %ymm1 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: v16f32_no_step:
 ; BTVER2:       # %bb.0:
@@ -1976,11 +1976,11 @@ define <16 x float> @v16f32_no_step2(<16 x float> %x) #3 {
 ;
 ; BDVER2-LABEL: v16f32_no_step2:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vrcpps %ymm1, %ymm1 # sched: [7:2.00]
-; BDVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [7:2.00]
-; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
-; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vrcpps %ymm1, %ymm1 # sched: [5:2.00]
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [10:2.00]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [10:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: v16f32_no_step2:
 ; BTVER2:       # %bb.0:
diff --git a/test/CodeGen/X86/schedule-x86-64-shld.ll b/test/CodeGen/X86/schedule-x86-64-shld.ll
index a2e280126b4..315a497bc3c 100644
--- a/test/CodeGen/X86/schedule-x86-64-shld.ll
+++ b/test/CodeGen/X86/schedule-x86-64-shld.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+avx -mattr=+slow-shld | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER12 --check-prefix=BDVER1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+avx -mattr=+slow-shld | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER12 --check-prefix=BDVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER12 --check-prefix=BDVER1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER12 --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
 
 
@@ -19,9 +19,9 @@ define i64 @lshift10_optsize(i64 %a, i64 %b) nounwind readnone optsize {
 ;
 ; BDVER12-LABEL: lshift10_optsize:
 ; BDVER12:       # %bb.0: # %entry
-; BDVER12-NEXT:    movq %rdi, %rax # sched: [1:0.33]
-; BDVER12-NEXT:    shldq $10, %rsi, %rax # sched: [2:0.67]
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; BDVER12-NEXT:    shldq $10, %rsi, %rax # sched: [4:3.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: lshift10_optsize:
 ; BTVER2:       # %bb.0: # %entry
@@ -47,7 +47,7 @@ define i64 @lshift10(i64 %a, i64 %b) nounwind readnone {
 ; BDVER12-NEXT:    shlq $10, %rdi # sched: [1:0.50]
 ; BDVER12-NEXT:    shrq $54, %rsi # sched: [1:0.50]
 ; BDVER12-NEXT:    leaq (%rsi,%rdi), %rax # sched: [1:0.50]
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: lshift10:
 ; BTVER2:       # %bb.0: # %entry
@@ -77,9 +77,9 @@ define i64 @rshift10_optsize(i64 %a, i64 %b) nounwind readnone optsize {
 ;
 ; BDVER12-LABEL: rshift10_optsize:
 ; BDVER12:       # %bb.0: # %entry
-; BDVER12-NEXT:    movq %rdi, %rax # sched: [1:0.33]
-; BDVER12-NEXT:    shrdq $62, %rsi, %rax # sched: [2:0.67]
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; BDVER12-NEXT:    shrdq $62, %rsi, %rax # sched: [4:3.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: rshift10_optsize:
 ; BTVER2:       # %bb.0: # %entry
@@ -105,7 +105,7 @@ define i64 @rshift10(i64 %a, i64 %b) nounwind readnone {
 ; BDVER12:       # %bb.0: # %entry
 ; BDVER12-NEXT:    shrq $62, %rdi # sched: [1:0.50]
 ; BDVER12-NEXT:    leaq (%rdi,%rsi,4), %rax # sched: [1:0.50]
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: rshift10:
 ; BTVER2:       # %bb.0: # %entry
@@ -135,11 +135,11 @@ define i64 @lshift_cl_optsize(i64 %a, i64 %b, i64 %c) nounwind readnone optsize
 ;
 ; BDVER12-LABEL: lshift_cl_optsize:
 ; BDVER12:       # %bb.0: # %entry
-; BDVER12-NEXT:    movq %rdx, %rcx # sched: [1:0.33]
-; BDVER12-NEXT:    movq %rdi, %rax # sched: [1:0.33]
+; BDVER12-NEXT:    movq %rdx, %rcx # sched: [1:0.50]
+; BDVER12-NEXT:    movq %rdi, %rax # sched: [1:0.50]
 ; BDVER12-NEXT:    # kill: def $cl killed $cl killed $rcx
-; BDVER12-NEXT:    shldq %cl, %rsi, %rax # sched: [4:1.50]
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    shldq %cl, %rsi, %rax # sched: [4:4.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: lshift_cl_optsize:
 ; BTVER2:       # %bb.0: # %entry
@@ -167,14 +167,14 @@ define i64 @lshift_cl(i64 %a, i64 %b, i64 %c) nounwind readnone {
 ;
 ; BDVER12-LABEL: lshift_cl:
 ; BDVER12:       # %bb.0: # %entry
-; BDVER12-NEXT:    movq %rdx, %rcx # sched: [1:0.33]
-; BDVER12-NEXT:    movq %rsi, %rax # sched: [1:0.33]
-; BDVER12-NEXT:    shlq %cl, %rdi # sched: [3:1.50]
-; BDVER12-NEXT:    negl %ecx # sched: [1:0.33]
+; BDVER12-NEXT:    movq %rdx, %rcx # sched: [1:0.50]
+; BDVER12-NEXT:    movq %rsi, %rax # sched: [1:0.50]
+; BDVER12-NEXT:    shlq %cl, %rdi # sched: [1:0.50]
+; BDVER12-NEXT:    negl %ecx # sched: [1:0.50]
 ; BDVER12-NEXT:    # kill: def $cl killed $cl killed $rcx
-; BDVER12-NEXT:    shrq %cl, %rax # sched: [3:1.50]
-; BDVER12-NEXT:    orq %rdi, %rax # sched: [1:0.33]
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    shrq %cl, %rax # sched: [1:0.50]
+; BDVER12-NEXT:    orq %rdi, %rax # sched: [1:0.50]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: lshift_cl:
 ; BTVER2:       # %bb.0: # %entry
@@ -211,11 +211,11 @@ define i64 @rshift_cl_optsize(i64 %a, i64 %b, i64 %c) nounwind readnone optsize
 ;
 ; BDVER12-LABEL: rshift_cl_optsize:
 ; BDVER12:       # %bb.0: # %entry
-; BDVER12-NEXT:    movq %rdx, %rcx # sched: [1:0.33]
-; BDVER12-NEXT:    movq %rdi, %rax # sched: [1:0.33]
+; BDVER12-NEXT:    movq %rdx, %rcx # sched: [1:0.50]
+; BDVER12-NEXT:    movq %rdi, %rax # sched: [1:0.50]
 ; BDVER12-NEXT:    # kill: def $cl killed $cl killed $rcx
-; BDVER12-NEXT:    shrdq %cl, %rsi, %rax # sched: [4:1.50]
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    shrdq %cl, %rsi, %rax # sched: [4:4.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: rshift_cl_optsize:
 ; BTVER2:       # %bb.0: # %entry
@@ -243,14 +243,14 @@ define i64 @rshift_cl(i64 %a, i64 %b, i64 %c) nounwind readnone {
 ;
 ; BDVER12-LABEL: rshift_cl:
 ; BDVER12:       # %bb.0: # %entry
-; BDVER12-NEXT:    movq %rdx, %rcx # sched: [1:0.33]
-; BDVER12-NEXT:    movq %rsi, %rax # sched: [1:0.33]
-; BDVER12-NEXT:    shrq %cl, %rdi # sched: [3:1.50]
-; BDVER12-NEXT:    negl %ecx # sched: [1:0.33]
+; BDVER12-NEXT:    movq %rdx, %rcx # sched: [1:0.50]
+; BDVER12-NEXT:    movq %rsi, %rax # sched: [1:0.50]
+; BDVER12-NEXT:    shrq %cl, %rdi # sched: [1:0.50]
+; BDVER12-NEXT:    negl %ecx # sched: [1:0.50]
 ; BDVER12-NEXT:    # kill: def $cl killed $cl killed $rcx
-; BDVER12-NEXT:    shlq %cl, %rax # sched: [3:1.50]
-; BDVER12-NEXT:    orq %rdi, %rax # sched: [1:0.33]
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    shlq %cl, %rax # sched: [1:0.50]
+; BDVER12-NEXT:    orq %rdi, %rax # sched: [1:0.50]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: rshift_cl:
 ; BTVER2:       # %bb.0: # %entry
@@ -287,10 +287,10 @@ define void @lshift_mem_cl_optsize(i64 %a, i64 %c) nounwind readnone optsize {
 ;
 ; BDVER12-LABEL: lshift_mem_cl_optsize:
 ; BDVER12:       # %bb.0: # %entry
-; BDVER12-NEXT:    movq %rsi, %rcx # sched: [1:0.33]
+; BDVER12-NEXT:    movq %rsi, %rcx # sched: [1:0.50]
 ; BDVER12-NEXT:    # kill: def $cl killed $cl killed $rcx
-; BDVER12-NEXT:    shldq %cl, %rdi, {{.*}}(%rip) # sched: [10:1.50]
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    shldq %cl, %rdi, {{.*}}(%rip) # sched: [4:11.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: lshift_mem_cl_optsize:
 ; BTVER2:       # %bb.0: # %entry
@@ -318,15 +318,15 @@ define void @lshift_mem_cl(i64 %a, i64 %c) nounwind readnone {
 ;
 ; BDVER12-LABEL: lshift_mem_cl:
 ; BDVER12:       # %bb.0: # %entry
-; BDVER12-NEXT:    movq %rsi, %rcx # sched: [1:0.33]
 ; BDVER12-NEXT:    movq {{.*}}(%rip), %rax # sched: [5:0.50]
-; BDVER12-NEXT:    shlq %cl, %rax # sched: [3:1.50]
-; BDVER12-NEXT:    negl %ecx # sched: [1:0.33]
+; BDVER12-NEXT:    movq %rsi, %rcx # sched: [1:0.50]
+; BDVER12-NEXT:    shlq %cl, %rax # sched: [1:0.50]
+; BDVER12-NEXT:    negl %ecx # sched: [1:0.50]
 ; BDVER12-NEXT:    # kill: def $cl killed $cl killed $rcx
-; BDVER12-NEXT:    shrq %cl, %rdi # sched: [3:1.50]
-; BDVER12-NEXT:    orq %rax, %rdi # sched: [1:0.33]
-; BDVER12-NEXT:    movq %rdi, {{.*}}(%rip) # sched: [1:1.00]
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    shrq %cl, %rdi # sched: [1:0.50]
+; BDVER12-NEXT:    orq %rax, %rdi # sched: [1:0.50]
+; BDVER12-NEXT:    movq %rdi, {{.*}}(%rip) # sched: [1:0.50]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: lshift_mem_cl:
 ; BTVER2:       # %bb.0: # %entry
@@ -358,11 +358,11 @@ define void @lshift_mem(i64 %a) nounwind readnone {
 ; BDVER12-LABEL: lshift_mem:
 ; BDVER12:       # %bb.0: # %entry
 ; BDVER12-NEXT:    movq {{.*}}(%rip), %rax # sched: [5:0.50]
-; BDVER12-NEXT:    shlq $10, %rax # sched: [1:0.50]
 ; BDVER12-NEXT:    shrq $54, %rdi # sched: [1:0.50]
-; BDVER12-NEXT:    orq %rax, %rdi # sched: [1:0.33]
-; BDVER12-NEXT:    movq %rdi, {{.*}}(%rip) # sched: [1:1.00]
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    shlq $10, %rax # sched: [1:0.50]
+; BDVER12-NEXT:    orq %rax, %rdi # sched: [1:0.50]
+; BDVER12-NEXT:    movq %rdi, {{.*}}(%rip) # sched: [1:0.50]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: lshift_mem:
 ; BTVER2:       # %bb.0: # %entry
@@ -389,8 +389,8 @@ define void @lshift_mem_optsize(i64 %a) nounwind readnone optsize {
 ;
 ; BDVER12-LABEL: lshift_mem_optsize:
 ; BDVER12:       # %bb.0: # %entry
-; BDVER12-NEXT:    shldq $10, %rdi, {{.*}}(%rip) # sched: [8:1.00]
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    shldq $10, %rdi, {{.*}}(%rip) # sched: [4:11.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: lshift_mem_optsize:
 ; BTVER2:       # %bb.0: # %entry
@@ -418,9 +418,9 @@ define void @lshift_mem_b(i64 %b) nounwind readnone {
 ; BDVER12-NEXT:    movq {{.*}}(%rip), %rax # sched: [5:0.50]
 ; BDVER12-NEXT:    shlq $10, %rdi # sched: [1:0.50]
 ; BDVER12-NEXT:    shrq $54, %rax # sched: [1:0.50]
-; BDVER12-NEXT:    orq %rdi, %rax # sched: [1:0.33]
-; BDVER12-NEXT:    movq %rax, {{.*}}(%rip) # sched: [1:1.00]
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    orq %rdi, %rax # sched: [1:0.50]
+; BDVER12-NEXT:    movq %rax, {{.*}}(%rip) # sched: [1:0.50]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: lshift_mem_b:
 ; BTVER2:       # %bb.0: # %entry
@@ -450,9 +450,9 @@ define void @lshift_mem_b_optsize(i64 %b) nounwind readnone optsize {
 ; BDVER12-LABEL: lshift_mem_b_optsize:
 ; BDVER12:       # %bb.0: # %entry
 ; BDVER12-NEXT:    movq {{.*}}(%rip), %rax # sched: [5:0.50]
-; BDVER12-NEXT:    shrdq $54, %rdi, %rax # sched: [2:0.67]
-; BDVER12-NEXT:    movq %rax, {{.*}}(%rip) # sched: [1:1.00]
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    shrdq $54, %rdi, %rax # sched: [4:3.00]
+; BDVER12-NEXT:    movq %rax, {{.*}}(%rip) # sched: [1:0.50]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: lshift_mem_b_optsize:
 ; BTVER2:       # %bb.0: # %entry
diff --git a/test/CodeGen/X86/schedule-x86_32.ll b/test/CodeGen/X86/schedule-x86_32.ll
index 6aff5a34a41..757a022839b 100644
--- a/test/CodeGen/X86/schedule-x86_32.ll
+++ b/test/CodeGen/X86/schedule-x86_32.ll
@@ -8,7 +8,7 @@
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
-; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=x86-64 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=bdver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
 
@@ -81,9 +81,9 @@ define i8 @test_aaa(i8 %a0) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    aaa # sched: [100:0.33]
+; BDVER2-NEXT:    aaa # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_aaa:
 ; BTVER2:       # %bb.0:
@@ -181,10 +181,10 @@ define void @test_aad(i16 %a0) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    aad # sched: [100:0.33]
-; BDVER2-NEXT:    aad $16 # sched: [100:0.33]
+; BDVER2-NEXT:    aad # sched: [100:0.50]
+; BDVER2-NEXT:    aad $16 # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_aad:
 ; BTVER2:       # %bb.0:
@@ -284,10 +284,10 @@ define void @test_aam(i8 %a0) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    aam # sched: [100:0.33]
-; BDVER2-NEXT:    aam $16 # sched: [100:0.33]
+; BDVER2-NEXT:    aam # sched: [100:0.50]
+; BDVER2-NEXT:    aam $16 # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_aam:
 ; BTVER2:       # %bb.0:
@@ -379,9 +379,9 @@ define i8 @test_aas(i8 %a0) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    aas # sched: [100:0.33]
+; BDVER2-NEXT:    aas # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_aas:
 ; BTVER2:       # %bb.0:
@@ -480,9 +480,9 @@ define void @test_arpl(i16 %a0, i16 *%a1) optsize {
 ; BDVER2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    arpl %ax, (%ecx) # sched: [100:0.33]
+; BDVER2-NEXT:    arpl %ax, (%ecx) # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_arpl:
 ; BTVER2:       # %bb.0:
@@ -644,7 +644,7 @@ define void @test_bound(i16 %a0, i16 *%a1, i32 %a2, i32 *%a3) optsize {
 ;
 ; BDVER2-LABEL: test_bound:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    pushl %esi # sched: [5:1.00]
+; BDVER2-NEXT:    pushl %esi # sched: [1:0.50]
 ; BDVER2-NEXT:    .cfi_def_cfa_offset 8
 ; BDVER2-NEXT:    .cfi_offset %esi, -8
 ; BDVER2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
@@ -652,12 +652,12 @@ define void @test_bound(i16 %a0, i16 *%a1, i32 %a2, i32 *%a3) optsize {
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %esi # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    bound %ax, (%esi) # sched: [100:0.33]
-; BDVER2-NEXT:    bound %ecx, (%edx) # sched: [100:0.33]
+; BDVER2-NEXT:    bound %ax, (%esi) # sched: [100:0.50]
+; BDVER2-NEXT:    bound %ecx, (%edx) # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    popl %esi # sched: [6:0.50]
+; BDVER2-NEXT:    popl %esi # sched: [5:0.50]
 ; BDVER2-NEXT:    .cfi_def_cfa_offset 4
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_bound:
 ; BTVER2:       # %bb.0:
@@ -767,9 +767,9 @@ define i8 @test_daa(i8 %a0) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    daa # sched: [100:0.33]
+; BDVER2-NEXT:    daa # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_daa:
 ; BTVER2:       # %bb.0:
@@ -859,9 +859,9 @@ define i8 @test_das(i8 %a0) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    das # sched: [100:0.33]
+; BDVER2-NEXT:    das # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_das:
 ; BTVER2:       # %bb.0:
@@ -968,10 +968,10 @@ define void @test_dec16(i16 %a0, i16* %a1) optsize {
 ; BDVER2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    decw %ax # sched: [1:0.33]
-; BDVER2-NEXT:    decw (%ecx) # sched: [7:1.00]
+; BDVER2-NEXT:    decw %ax # sched: [1:0.50]
+; BDVER2-NEXT:    decw (%ecx) # sched: [6:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_dec16:
 ; BTVER2:       # %bb.0:
@@ -1081,10 +1081,10 @@ define void @test_dec32(i32 %a0, i32* %a1) optsize {
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    decl %eax # sched: [1:0.33]
-; BDVER2-NEXT:    decl (%ecx) # sched: [7:1.00]
+; BDVER2-NEXT:    decl %eax # sched: [1:0.50]
+; BDVER2-NEXT:    decl (%ecx) # sched: [6:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_dec32:
 ; BTVER2:       # %bb.0:
@@ -1195,10 +1195,10 @@ define void @test_inc16(i16 %a0, i16* %a1) optsize {
 ; BDVER2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    incw %ax # sched: [1:0.33]
-; BDVER2-NEXT:    incw (%ecx) # sched: [7:1.00]
+; BDVER2-NEXT:    incw %ax # sched: [1:0.50]
+; BDVER2-NEXT:    incw (%ecx) # sched: [6:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_inc16:
 ; BTVER2:       # %bb.0:
@@ -1308,10 +1308,10 @@ define void @test_inc32(i32 %a0, i32* %a1) optsize {
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    incl %eax # sched: [1:0.33]
-; BDVER2-NEXT:    incl (%ecx) # sched: [7:1.00]
+; BDVER2-NEXT:    incl %eax # sched: [1:0.50]
+; BDVER2-NEXT:    incl (%ecx) # sched: [6:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_inc32:
 ; BTVER2:       # %bb.0:
@@ -1396,9 +1396,9 @@ define void @test_into() optsize {
 ; BDVER2-LABEL: test_into:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    into # sched: [100:0.33]
+; BDVER2-NEXT:    into # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_into:
 ; BTVER2:       # %bb.0:
@@ -1496,10 +1496,10 @@ define void @test_jcxz_jecxz() optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:  JXTGT:
-; BDVER2-NEXT:    jcxz JXTGT # sched: [2:1.00]
-; BDVER2-NEXT:    jecxz JXTGT # sched: [2:1.00]
+; BDVER2-NEXT:    jcxz JXTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jecxz JXTGT # sched: [1:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_jcxz_jecxz:
 ; BTVER2:       # %bb.0:
@@ -1584,9 +1584,9 @@ define void @test_leave() optsize {
 ; BDVER2-LABEL: test_leave:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    leave # sched: [7:0.67]
+; BDVER2-NEXT:    leave # sched: [1:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_leave:
 ; BTVER2:       # %bb.0:
@@ -1747,19 +1747,19 @@ define void @test_pop_push() optsize {
 ; BDVER2-LABEL: test_pop_push:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    popl %ds # sched: [100:0.33]
-; BDVER2-NEXT:    popl %es # sched: [100:0.33]
-; BDVER2-NEXT:    popl %ss # sched: [100:0.33]
-; BDVER2-NEXT:    popl %fs # sched: [100:0.33]
-; BDVER2-NEXT:    popl %gs # sched: [100:0.33]
-; BDVER2-NEXT:    pushl %cs # sched: [100:0.33]
-; BDVER2-NEXT:    pushl %ds # sched: [100:0.33]
-; BDVER2-NEXT:    pushl %es # sched: [100:0.33]
-; BDVER2-NEXT:    pushl %ss # sched: [100:0.33]
-; BDVER2-NEXT:    pushl %fs # sched: [100:0.33]
-; BDVER2-NEXT:    pushl %gs # sched: [100:0.33]
+; BDVER2-NEXT:    popl %ds # sched: [100:0.50]
+; BDVER2-NEXT:    popl %es # sched: [100:0.50]
+; BDVER2-NEXT:    popl %ss # sched: [100:0.50]
+; BDVER2-NEXT:    popl %fs # sched: [100:0.50]
+; BDVER2-NEXT:    popl %gs # sched: [100:0.50]
+; BDVER2-NEXT:    pushl %cs # sched: [100:0.50]
+; BDVER2-NEXT:    pushl %ds # sched: [100:0.50]
+; BDVER2-NEXT:    pushl %es # sched: [100:0.50]
+; BDVER2-NEXT:    pushl %ss # sched: [100:0.50]
+; BDVER2-NEXT:    pushl %fs # sched: [100:0.50]
+; BDVER2-NEXT:    pushl %gs # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_pop_push:
 ; BTVER2:       # %bb.0:
@@ -1922,15 +1922,15 @@ define i16 @test_pop_push_16(i16 %a0, i16 *%a1) optsize {
 ; BDVER2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    popw %ax # sched: [6:0.50]
-; BDVER2-NEXT:    popw (%ecx) # sched: [6:0.50]
-; BDVER2-NEXT:    pushw %ax # sched: [5:1.00]
-; BDVER2-NEXT:    pushw (%ecx) # sched: [5:1.00]
+; BDVER2-NEXT:    popw %ax # sched: [5:0.50]
+; BDVER2-NEXT:    popw (%ecx) # sched: [6:1.00]
+; BDVER2-NEXT:    pushw %ax # sched: [1:0.50]
+; BDVER2-NEXT:    pushw (%ecx) # sched: [6:1.00]
 ; BDVER2-NEXT:    pushw $4095 # imm = 0xFFF
-; BDVER2-NEXT:    # sched: [1:1.00]
-; BDVER2-NEXT:    pushw $7 # sched: [1:1.00]
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    pushw $7 # sched: [1:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_pop_push_16:
 ; BTVER2:       # %bb.0:
@@ -2089,15 +2089,15 @@ define i32 @test_pop_push_32(i32 %a0, i32 *%a1) optsize {
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    popl %eax # sched: [6:0.50]
-; BDVER2-NEXT:    popl (%ecx) # sched: [6:0.50]
-; BDVER2-NEXT:    pushl %eax # sched: [5:1.00]
-; BDVER2-NEXT:    pushl (%ecx) # sched: [5:1.00]
+; BDVER2-NEXT:    popl %eax # sched: [5:0.50]
+; BDVER2-NEXT:    popl (%ecx) # sched: [6:1.00]
+; BDVER2-NEXT:    pushl %eax # sched: [1:0.50]
+; BDVER2-NEXT:    pushl (%ecx) # sched: [6:1.00]
 ; BDVER2-NEXT:    pushl $4095 # imm = 0xFFF
-; BDVER2-NEXT:    # sched: [1:1.00]
-; BDVER2-NEXT:    pushl $7 # sched: [1:1.00]
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    pushl $7 # sched: [1:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_pop_push_32:
 ; BTVER2:       # %bb.0:
@@ -2218,10 +2218,10 @@ define void @test_popa_popf_pusha_pushf() optsize {
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    popal # sched: [5:0.50]
 ; BDVER2-NEXT:    popfl # sched: [5:0.50]
-; BDVER2-NEXT:    pushal # sched: [1:1.00]
-; BDVER2-NEXT:    pushfl # sched: [1:1.00]
+; BDVER2-NEXT:    pushal # sched: [1:0.50]
+; BDVER2-NEXT:    pushfl # sched: [1:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_popa_popf_pusha_pushf:
 ; BTVER2:       # %bb.0:
@@ -2344,14 +2344,14 @@ define void @test_ret() optsize {
 ; BDVER2-LABEL: test_ret:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ; BDVER2-NEXT:    retl $4095 # imm = 0xFFF
-; BDVER2-NEXT:    # sched: [6:1.00]
-; BDVER2-NEXT:    lretl # sched: [6:1.00]
+; BDVER2-NEXT:    # sched: [5:1.00]
+; BDVER2-NEXT:    lretl # sched: [5:1.00]
 ; BDVER2-NEXT:    lretl $4095 # imm = 0xFFF
-; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    # sched: [5:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_ret:
 ; BTVER2:       # %bb.0:
@@ -2440,9 +2440,9 @@ define i8 @test_salc() optsize {
 ; BDVER2-LABEL: test_salc:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    salc # sched: [1:0.33]
+; BDVER2-NEXT:    salc # sched: [1:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_salc:
 ; BTVER2:       # %bb.0:
@@ -2567,11 +2567,11 @@ define void @test_xchg_32(i32 %a0, i32 %a1, i32 *%a2) optsize {
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    xchgl %eax, %eax # sched: [2:1.00]
-; BDVER2-NEXT:    xchgl %ecx, %eax # sched: [2:1.00]
-; BDVER2-NEXT:    xchgl %eax, (%edx) # sched: [6:1.00]
+; BDVER2-NEXT:    xchgl %eax, %eax # sched: [1:1.00]
+; BDVER2-NEXT:    xchgl %ecx, %eax # sched: [1:1.00]
+; BDVER2-NEXT:    xchgl %eax, (%edx) # sched: [5:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_xchg_32:
 ; BTVER2:       # %bb.0:
diff --git a/test/CodeGen/X86/schedule-x86_64.ll b/test/CodeGen/X86/schedule-x86_64.ll
index 4cd50969ca9..18541184eb8 100644
--- a/test/CodeGen/X86/schedule-x86_64.ll
+++ b/test/CodeGen/X86/schedule-x86_64.ll
@@ -8,7 +8,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
 
@@ -112,14 +112,14 @@ define void @test_adc_8(i8 %a0, i8* %a1, i8 %a2) optsize {
 ; BDVER2-LABEL: test_adc_8:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    adcb $7, %al # sched: [2:0.67]
-; BDVER2-NEXT:    adcb $7, %dil # sched: [2:0.67]
-; BDVER2-NEXT:    adcb $7, (%rsi) # sched: [9:1.00]
-; BDVER2-NEXT:    adcb %dl, %dil # sched: [2:0.67]
-; BDVER2-NEXT:    adcb %dil, (%rsi) # sched: [9:1.00]
-; BDVER2-NEXT:    adcb (%rsi), %dil # sched: [7:0.67]
+; BDVER2-NEXT:    adcb $7, %al # sched: [1:1.00]
+; BDVER2-NEXT:    adcb $7, %dil # sched: [1:1.00]
+; BDVER2-NEXT:    adcb $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    adcb %dl, %dil # sched: [1:1.00]
+; BDVER2-NEXT:    adcb %dil, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    adcb (%rsi), %dil # sched: [5:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_adc_8:
 ; BTVER2:       # %bb.0:
@@ -288,18 +288,18 @@ define void @test_adc_16(i16 %a0, i16* %a1, i16 %a2) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    adcw $511, %ax # imm = 0x1FF
-; BDVER2-NEXT:    # sched: [2:0.67]
+; BDVER2-NEXT:    # sched: [1:1.00]
 ; BDVER2-NEXT:    adcw $511, %di # imm = 0x1FF
-; BDVER2-NEXT:    # sched: [2:0.67]
+; BDVER2-NEXT:    # sched: [1:1.00]
 ; BDVER2-NEXT:    adcw $511, (%rsi) # imm = 0x1FF
-; BDVER2-NEXT:    # sched: [9:1.00]
-; BDVER2-NEXT:    adcw $7, %di # sched: [2:0.67]
-; BDVER2-NEXT:    adcw $7, (%rsi) # sched: [9:1.00]
-; BDVER2-NEXT:    adcw %dx, %di # sched: [2:0.67]
-; BDVER2-NEXT:    adcw %di, (%rsi) # sched: [9:1.00]
-; BDVER2-NEXT:    adcw (%rsi), %di # sched: [7:0.67]
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    adcw $7, %di # sched: [1:1.00]
+; BDVER2-NEXT:    adcw $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    adcw %dx, %di # sched: [1:1.00]
+; BDVER2-NEXT:    adcw %di, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    adcw (%rsi), %di # sched: [5:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_adc_16:
 ; BTVER2:       # %bb.0:
@@ -478,18 +478,18 @@ define void @test_adc_32(i32 %a0, i32* %a1, i32 %a2) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    adcl $665536, %eax # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [2:0.67]
+; BDVER2-NEXT:    # sched: [1:1.00]
 ; BDVER2-NEXT:    adcl $665536, %edi # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [2:0.67]
+; BDVER2-NEXT:    # sched: [1:1.00]
 ; BDVER2-NEXT:    adcl $665536, (%rsi) # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [9:1.00]
-; BDVER2-NEXT:    adcl $7, %edi # sched: [2:0.67]
-; BDVER2-NEXT:    adcl $7, (%rsi) # sched: [9:1.00]
-; BDVER2-NEXT:    adcl %edx, %edi # sched: [2:0.67]
-; BDVER2-NEXT:    adcl %edi, (%rsi) # sched: [9:1.00]
-; BDVER2-NEXT:    adcl (%rsi), %edi # sched: [7:0.67]
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    adcl $7, %edi # sched: [1:1.00]
+; BDVER2-NEXT:    adcl $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    adcl %edx, %edi # sched: [1:1.00]
+; BDVER2-NEXT:    adcl %edi, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    adcl (%rsi), %edi # sched: [5:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_adc_32:
 ; BTVER2:       # %bb.0:
@@ -668,18 +668,18 @@ define void @test_adc_64(i64 %a0, i64* %a1, i64 %a2) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    adcq $665536, %rax # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [2:0.67]
+; BDVER2-NEXT:    # sched: [1:1.00]
 ; BDVER2-NEXT:    adcq $665536, %rdi # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [2:0.67]
+; BDVER2-NEXT:    # sched: [1:1.00]
 ; BDVER2-NEXT:    adcq $665536, (%rsi) # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [9:1.00]
-; BDVER2-NEXT:    adcq $7, %rdi # sched: [2:0.67]
-; BDVER2-NEXT:    adcq $7, (%rsi) # sched: [9:1.00]
-; BDVER2-NEXT:    adcq %rdx, %rdi # sched: [2:0.67]
-; BDVER2-NEXT:    adcq %rdi, (%rsi) # sched: [9:1.00]
-; BDVER2-NEXT:    adcq (%rsi), %rdi # sched: [7:0.67]
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    adcq $7, %rdi # sched: [1:1.00]
+; BDVER2-NEXT:    adcq $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    adcq %rdx, %rdi # sched: [1:1.00]
+; BDVER2-NEXT:    adcq %rdi, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    adcq (%rsi), %rdi # sched: [5:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_adc_64:
 ; BTVER2:       # %bb.0:
@@ -818,14 +818,14 @@ define void @test_add_8(i8 %a0, i8* %a1, i8 %a2) optsize {
 ; BDVER2-LABEL: test_add_8:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    addb $7, %al # sched: [1:0.33]
-; BDVER2-NEXT:    addb $7, %dil # sched: [1:0.33]
-; BDVER2-NEXT:    addb $7, (%rsi) # sched: [7:1.00]
-; BDVER2-NEXT:    addb %dl, %dil # sched: [1:0.33]
-; BDVER2-NEXT:    addb %dil, (%rsi) # sched: [7:1.00]
-; BDVER2-NEXT:    addb (%rsi), %dil # sched: [6:0.50]
+; BDVER2-NEXT:    addb $7, %al # sched: [1:0.50]
+; BDVER2-NEXT:    addb $7, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    addb $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    addb %dl, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    addb %dil, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    addb (%rsi), %dil # sched: [5:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_add_8:
 ; BTVER2:       # %bb.0:
@@ -994,18 +994,18 @@ define void @test_add_16(i16 %a0, i16* %a1, i16 %a2) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    addw $511, %ax # imm = 0x1FF
-; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    # sched: [1:0.50]
 ; BDVER2-NEXT:    addw $511, %di # imm = 0x1FF
-; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    # sched: [1:0.50]
 ; BDVER2-NEXT:    addw $511, (%rsi) # imm = 0x1FF
-; BDVER2-NEXT:    # sched: [7:1.00]
-; BDVER2-NEXT:    addw $7, %di # sched: [1:0.33]
-; BDVER2-NEXT:    addw $7, (%rsi) # sched: [7:1.00]
-; BDVER2-NEXT:    addw %dx, %di # sched: [1:0.33]
-; BDVER2-NEXT:    addw %di, (%rsi) # sched: [7:1.00]
-; BDVER2-NEXT:    addw (%rsi), %di # sched: [6:0.50]
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    addw $7, %di # sched: [1:0.50]
+; BDVER2-NEXT:    addw $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    addw %dx, %di # sched: [1:0.50]
+; BDVER2-NEXT:    addw %di, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    addw (%rsi), %di # sched: [5:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_add_16:
 ; BTVER2:       # %bb.0:
@@ -1184,18 +1184,18 @@ define void @test_add_32(i32 %a0, i32* %a1, i32 %a2) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    addl $665536, %eax # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    # sched: [1:0.50]
 ; BDVER2-NEXT:    addl $665536, %edi # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    # sched: [1:0.50]
 ; BDVER2-NEXT:    addl $665536, (%rsi) # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [7:1.00]
-; BDVER2-NEXT:    addl $7, %edi # sched: [1:0.33]
-; BDVER2-NEXT:    addl $7, (%rsi) # sched: [7:1.00]
-; BDVER2-NEXT:    addl %edx, %edi # sched: [1:0.33]
-; BDVER2-NEXT:    addl %edi, (%rsi) # sched: [7:1.00]
-; BDVER2-NEXT:    addl (%rsi), %edi # sched: [6:0.50]
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    addl $7, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    addl $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    addl %edx, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    addl %edi, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    addl (%rsi), %edi # sched: [5:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_add_32:
 ; BTVER2:       # %bb.0:
@@ -1374,18 +1374,18 @@ define void @test_add_64(i64 %a0, i64* %a1, i64 %a2) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    addq $665536, %rax # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    # sched: [1:0.50]
 ; BDVER2-NEXT:    addq $665536, %rdi # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    # sched: [1:0.50]
 ; BDVER2-NEXT:    addq $665536, (%rsi) # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [7:1.00]
-; BDVER2-NEXT:    addq $7, %rdi # sched: [1:0.33]
-; BDVER2-NEXT:    addq $7, (%rsi) # sched: [7:1.00]
-; BDVER2-NEXT:    addq %rdx, %rdi # sched: [1:0.33]
-; BDVER2-NEXT:    addq %rdi, (%rsi) # sched: [7:1.00]
-; BDVER2-NEXT:    addq (%rsi), %rdi # sched: [6:0.50]
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    addq $7, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    addq $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    addq %rdx, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    addq %rdi, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    addq (%rsi), %rdi # sched: [5:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_add_64:
 ; BTVER2:       # %bb.0:
@@ -1524,14 +1524,14 @@ define void @test_and_8(i8 %a0, i8* %a1, i8 %a2) optsize {
 ; BDVER2-LABEL: test_and_8:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    andb $7, %al # sched: [1:0.33]
-; BDVER2-NEXT:    andb $7, %dil # sched: [1:0.33]
-; BDVER2-NEXT:    andb $7, (%rsi) # sched: [7:1.00]
-; BDVER2-NEXT:    andb %dl, %dil # sched: [1:0.33]
-; BDVER2-NEXT:    andb %dil, (%rsi) # sched: [7:1.00]
-; BDVER2-NEXT:    andb (%rsi), %dil # sched: [6:0.50]
+; BDVER2-NEXT:    andb $7, %al # sched: [1:0.50]
+; BDVER2-NEXT:    andb $7, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    andb $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    andb %dl, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    andb %dil, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    andb (%rsi), %dil # sched: [5:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_and_8:
 ; BTVER2:       # %bb.0:
@@ -1700,18 +1700,18 @@ define void @test_and_16(i16 %a0, i16* %a1, i16 %a2) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    andw $511, %ax # imm = 0x1FF
-; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    # sched: [1:0.50]
 ; BDVER2-NEXT:    andw $511, %di # imm = 0x1FF
-; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    # sched: [1:0.50]
 ; BDVER2-NEXT:    andw $511, (%rsi) # imm = 0x1FF
-; BDVER2-NEXT:    # sched: [7:1.00]
-; BDVER2-NEXT:    andw $7, %di # sched: [1:0.33]
-; BDVER2-NEXT:    andw $7, (%rsi) # sched: [7:1.00]
-; BDVER2-NEXT:    andw %dx, %di # sched: [1:0.33]
-; BDVER2-NEXT:    andw %di, (%rsi) # sched: [7:1.00]
-; BDVER2-NEXT:    andw (%rsi), %di # sched: [6:0.50]
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    andw $7, %di # sched: [1:0.50]
+; BDVER2-NEXT:    andw $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    andw %dx, %di # sched: [1:0.50]
+; BDVER2-NEXT:    andw %di, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    andw (%rsi), %di # sched: [5:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_and_16:
 ; BTVER2:       # %bb.0:
@@ -1890,18 +1890,18 @@ define void @test_and_32(i32 %a0, i32* %a1, i32 %a2) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    andl $665536, %eax # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    # sched: [1:0.50]
 ; BDVER2-NEXT:    andl $665536, %edi # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    # sched: [1:0.50]
 ; BDVER2-NEXT:    andl $665536, (%rsi) # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [7:1.00]
-; BDVER2-NEXT:    andl $7, %edi # sched: [1:0.33]
-; BDVER2-NEXT:    andl $7, (%rsi) # sched: [7:1.00]
-; BDVER2-NEXT:    andl %edx, %edi # sched: [1:0.33]
-; BDVER2-NEXT:    andl %edi, (%rsi) # sched: [7:1.00]
-; BDVER2-NEXT:    andl (%rsi), %edi # sched: [6:0.50]
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    andl $7, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    andl $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    andl %edx, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    andl %edi, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    andl (%rsi), %edi # sched: [5:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_and_32:
 ; BTVER2:       # %bb.0:
@@ -2080,18 +2080,18 @@ define void @test_and_64(i64 %a0, i64* %a1, i64 %a2) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    andq $665536, %rax # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    # sched: [1:0.50]
 ; BDVER2-NEXT:    andq $665536, %rdi # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    # sched: [1:0.50]
 ; BDVER2-NEXT:    andq $665536, (%rsi) # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [7:1.00]
-; BDVER2-NEXT:    andq $7, %rdi # sched: [1:0.33]
-; BDVER2-NEXT:    andq $7, (%rsi) # sched: [7:1.00]
-; BDVER2-NEXT:    andq %rdx, %rdi # sched: [1:0.33]
-; BDVER2-NEXT:    andq %rdi, (%rsi) # sched: [7:1.00]
-; BDVER2-NEXT:    andq (%rsi), %rdi # sched: [6:0.50]
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    andq $7, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    andq $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    andq %rdx, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    andq %rdi, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    andq (%rsi), %rdi # sched: [5:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_and_64:
 ; BTVER2:       # %bb.0:
@@ -2214,12 +2214,12 @@ define i16 @test_bsf16(i16 %a0, i16* %a1) optsize {
 ; BDVER2-LABEL: test_bsf16:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    bsfw %di, %ax # sched: [3:1.00]
-; BDVER2-NEXT:    bsfw (%rsi), %cx # sched: [8:1.00]
+; BDVER2-NEXT:    bsfw %di, %ax # sched: [3:2.00]
+; BDVER2-NEXT:    bsfw (%rsi), %cx # sched: [7:2.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    orl %ecx, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    orl %ecx, %eax # sched: [1:0.50]
 ; BDVER2-NEXT:    # kill: def $ax killed $ax killed $eax
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_bsf16:
 ; BTVER2:       # %bb.0:
@@ -2322,11 +2322,11 @@ define i32 @test_bsf32(i32 %a0, i32* %a1) optsize {
 ; BDVER2-LABEL: test_bsf32:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    bsfl %edi, %eax # sched: [3:1.00]
-; BDVER2-NEXT:    bsfl (%rsi), %ecx # sched: [8:1.00]
+; BDVER2-NEXT:    bsfl %edi, %eax # sched: [3:2.00]
+; BDVER2-NEXT:    bsfl (%rsi), %ecx # sched: [7:2.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    orl %ecx, %eax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    orl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_bsf32:
 ; BTVER2:       # %bb.0:
@@ -2427,11 +2427,11 @@ define i64 @test_bsf64(i64 %a0, i64* %a1) optsize {
 ; BDVER2-LABEL: test_bsf64:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    bsfq %rdi, %rax # sched: [3:1.00]
-; BDVER2-NEXT:    bsfq (%rsi), %rcx # sched: [8:1.00]
+; BDVER2-NEXT:    bsfq %rdi, %rax # sched: [3:2.00]
+; BDVER2-NEXT:    bsfq (%rsi), %rcx # sched: [7:2.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    orq %rcx, %rax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    orq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_bsf64:
 ; BTVER2:       # %bb.0:
@@ -2541,12 +2541,12 @@ define i16 @test_bsr16(i16 %a0, i16* %a1) optsize {
 ; BDVER2-LABEL: test_bsr16:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    bsrw %di, %ax # sched: [3:1.00]
-; BDVER2-NEXT:    bsrw (%rsi), %cx # sched: [8:1.00]
+; BDVER2-NEXT:    bsrw %di, %ax # sched: [4:2.00]
+; BDVER2-NEXT:    bsrw (%rsi), %cx # sched: [8:2.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    orl %ecx, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    orl %ecx, %eax # sched: [1:0.50]
 ; BDVER2-NEXT:    # kill: def $ax killed $ax killed $eax
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_bsr16:
 ; BTVER2:       # %bb.0:
@@ -2649,11 +2649,11 @@ define i32 @test_bsr32(i32 %a0, i32* %a1) optsize {
 ; BDVER2-LABEL: test_bsr32:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    bsrl %edi, %eax # sched: [3:1.00]
-; BDVER2-NEXT:    bsrl (%rsi), %ecx # sched: [8:1.00]
+; BDVER2-NEXT:    bsrl %edi, %eax # sched: [4:2.00]
+; BDVER2-NEXT:    bsrl (%rsi), %ecx # sched: [8:2.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    orl %ecx, %eax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    orl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_bsr32:
 ; BTVER2:       # %bb.0:
@@ -2754,11 +2754,11 @@ define i64 @test_bsr64(i64 %a0, i64* %a1) optsize {
 ; BDVER2-LABEL: test_bsr64:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    bsrq %rdi, %rax # sched: [3:1.00]
-; BDVER2-NEXT:    bsrq (%rsi), %rcx # sched: [8:1.00]
+; BDVER2-NEXT:    bsrq %rdi, %rax # sched: [4:2.00]
+; BDVER2-NEXT:    bsrq (%rsi), %rcx # sched: [8:2.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    orq %rcx, %rax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    orq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_bsr64:
 ; BTVER2:       # %bb.0:
@@ -2835,9 +2835,9 @@ define i32 @test_bswap32(i32 %a0) optsize {
 ;
 ; BDVER2-LABEL: test_bswap32:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    movl %edi, %eax # sched: [1:0.33]
+; BDVER2-NEXT:    movl %edi, %eax # sched: [1:0.50]
 ; BDVER2-NEXT:    bswapl %eax # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_bswap32:
 ; BTVER2:       # %bb.0:
@@ -2904,9 +2904,9 @@ define i64 @test_bswap64(i64 %a0) optsize {
 ;
 ; BDVER2-LABEL: test_bswap64:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    movq %rdi, %rax # sched: [1:0.33]
-; BDVER2-NEXT:    bswapq %rax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    bswapq %rax # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_bswap64:
 ; BTVER2:       # %bb.0:
@@ -3104,23 +3104,23 @@ define void @test_bt_btc_btr_bts_16(i16 %a0, i16 %a1, i16 *%a2) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    btw %si, %di # sched: [1:0.50]
-; BDVER2-NEXT:    btcw %si, %di # sched: [1:0.50]
-; BDVER2-NEXT:    btrw %si, %di # sched: [1:0.50]
-; BDVER2-NEXT:    btsw %si, %di # sched: [1:0.50]
-; BDVER2-NEXT:    btw %si, (%rdx) # sched: [9:1.00]
-; BDVER2-NEXT:    btcw %si, (%rdx) # sched: [9:1.00]
-; BDVER2-NEXT:    btrw %si, (%rdx) # sched: [9:1.00]
-; BDVER2-NEXT:    btsw %si, (%rdx) # sched: [9:1.00]
+; BDVER2-NEXT:    btcw %si, %di # sched: [2:0.50]
+; BDVER2-NEXT:    btrw %si, %di # sched: [2:0.50]
+; BDVER2-NEXT:    btsw %si, %di # sched: [2:0.50]
+; BDVER2-NEXT:    btw %si, (%rdx) # sched: [5:0.50]
+; BDVER2-NEXT:    btcw %si, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    btrw %si, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    btsw %si, (%rdx) # sched: [7:1.00]
 ; BDVER2-NEXT:    btw $7, %di # sched: [1:0.50]
-; BDVER2-NEXT:    btcw $7, %di # sched: [1:0.50]
-; BDVER2-NEXT:    btrw $7, %di # sched: [1:0.50]
-; BDVER2-NEXT:    btsw $7, %di # sched: [1:0.50]
-; BDVER2-NEXT:    btw $7, (%rdx) # sched: [6:0.50]
+; BDVER2-NEXT:    btcw $7, %di # sched: [2:0.50]
+; BDVER2-NEXT:    btrw $7, %di # sched: [2:0.50]
+; BDVER2-NEXT:    btsw $7, %di # sched: [2:0.50]
+; BDVER2-NEXT:    btw $7, (%rdx) # sched: [5:0.50]
 ; BDVER2-NEXT:    btcw $7, (%rdx) # sched: [7:1.00]
 ; BDVER2-NEXT:    btrw $7, (%rdx) # sched: [7:1.00]
 ; BDVER2-NEXT:    btsw $7, (%rdx) # sched: [7:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_bt_btc_btr_bts_16:
 ; BTVER2:       # %bb.0:
@@ -3349,23 +3349,23 @@ define void @test_bt_btc_btr_bts_32(i32 %a0, i32 %a1, i32 *%a2) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    btl %esi, %edi # sched: [1:0.50]
-; BDVER2-NEXT:    btcl %esi, %edi # sched: [1:0.50]
-; BDVER2-NEXT:    btrl %esi, %edi # sched: [1:0.50]
-; BDVER2-NEXT:    btsl %esi, %edi # sched: [1:0.50]
-; BDVER2-NEXT:    btl %esi, (%rdx) # sched: [9:1.00]
-; BDVER2-NEXT:    btcl %esi, (%rdx) # sched: [9:1.00]
-; BDVER2-NEXT:    btrl %esi, (%rdx) # sched: [9:1.00]
-; BDVER2-NEXT:    btsl %esi, (%rdx) # sched: [9:1.00]
+; BDVER2-NEXT:    btcl %esi, %edi # sched: [2:0.50]
+; BDVER2-NEXT:    btrl %esi, %edi # sched: [2:0.50]
+; BDVER2-NEXT:    btsl %esi, %edi # sched: [2:0.50]
+; BDVER2-NEXT:    btl %esi, (%rdx) # sched: [5:0.50]
+; BDVER2-NEXT:    btcl %esi, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    btrl %esi, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    btsl %esi, (%rdx) # sched: [7:1.00]
 ; BDVER2-NEXT:    btl $7, %edi # sched: [1:0.50]
-; BDVER2-NEXT:    btcl $7, %edi # sched: [1:0.50]
-; BDVER2-NEXT:    btrl $7, %edi # sched: [1:0.50]
-; BDVER2-NEXT:    btsl $7, %edi # sched: [1:0.50]
-; BDVER2-NEXT:    btl $7, (%rdx) # sched: [6:0.50]
+; BDVER2-NEXT:    btcl $7, %edi # sched: [2:0.50]
+; BDVER2-NEXT:    btrl $7, %edi # sched: [2:0.50]
+; BDVER2-NEXT:    btsl $7, %edi # sched: [2:0.50]
+; BDVER2-NEXT:    btl $7, (%rdx) # sched: [5:0.50]
 ; BDVER2-NEXT:    btcl $7, (%rdx) # sched: [7:1.00]
 ; BDVER2-NEXT:    btrl $7, (%rdx) # sched: [7:1.00]
 ; BDVER2-NEXT:    btsl $7, (%rdx) # sched: [7:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_bt_btc_btr_bts_32:
 ; BTVER2:       # %bb.0:
@@ -3594,23 +3594,23 @@ define void @test_bt_btc_btr_bts_64(i64 %a0, i64 %a1, i64 *%a2) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    btq %rsi, %rdi # sched: [1:0.50]
-; BDVER2-NEXT:    btcq %rsi, %rdi # sched: [1:0.50]
-; BDVER2-NEXT:    btrq %rsi, %rdi # sched: [1:0.50]
-; BDVER2-NEXT:    btsq %rsi, %rdi # sched: [1:0.50]
-; BDVER2-NEXT:    btq %rsi, (%rdx) # sched: [9:1.00]
-; BDVER2-NEXT:    btcq %rsi, (%rdx) # sched: [9:1.00]
-; BDVER2-NEXT:    btrq %rsi, (%rdx) # sched: [9:1.00]
-; BDVER2-NEXT:    btsq %rsi, (%rdx) # sched: [9:1.00]
+; BDVER2-NEXT:    btcq %rsi, %rdi # sched: [2:0.50]
+; BDVER2-NEXT:    btrq %rsi, %rdi # sched: [2:0.50]
+; BDVER2-NEXT:    btsq %rsi, %rdi # sched: [2:0.50]
+; BDVER2-NEXT:    btq %rsi, (%rdx) # sched: [5:0.50]
+; BDVER2-NEXT:    btcq %rsi, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    btrq %rsi, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    btsq %rsi, (%rdx) # sched: [7:1.00]
 ; BDVER2-NEXT:    btq $7, %rdi # sched: [1:0.50]
-; BDVER2-NEXT:    btcq $7, %rdi # sched: [1:0.50]
-; BDVER2-NEXT:    btrq $7, %rdi # sched: [1:0.50]
-; BDVER2-NEXT:    btsq $7, %rdi # sched: [1:0.50]
-; BDVER2-NEXT:    btq $7, (%rdx) # sched: [6:0.50]
+; BDVER2-NEXT:    btcq $7, %rdi # sched: [2:0.50]
+; BDVER2-NEXT:    btrq $7, %rdi # sched: [2:0.50]
+; BDVER2-NEXT:    btsq $7, %rdi # sched: [2:0.50]
+; BDVER2-NEXT:    btq $7, (%rdx) # sched: [5:0.50]
 ; BDVER2-NEXT:    btcq $7, (%rdx) # sched: [7:1.00]
 ; BDVER2-NEXT:    btrq $7, (%rdx) # sched: [7:1.00]
 ; BDVER2-NEXT:    btsq $7, (%rdx) # sched: [7:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_bt_btc_btr_bts_64:
 ; BTVER2:       # %bb.0:
@@ -3761,14 +3761,14 @@ define void @test_cbw_cdq_cdqe_cqo_cwd_cwde() optsize {
 ; BDVER2-LABEL: test_cbw_cdq_cdqe_cqo_cwd_cwde:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    cbtw # sched: [1:0.33]
+; BDVER2-NEXT:    cbtw # sched: [1:0.50]
 ; BDVER2-NEXT:    cltd # sched: [1:0.50]
-; BDVER2-NEXT:    cltq # sched: [1:0.33]
+; BDVER2-NEXT:    cltq # sched: [1:0.50]
 ; BDVER2-NEXT:    cqto # sched: [1:0.50]
-; BDVER2-NEXT:    cwtd # sched: [2:1.00]
-; BDVER2-NEXT:    cwtl # sched: [1:0.33]
+; BDVER2-NEXT:    cwtd # sched: [1:0.50]
+; BDVER2-NEXT:    cwtl # sched: [1:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_cbw_cdq_cdqe_cqo_cwd_cwde:
 ; BTVER2:       # %bb.0:
@@ -3873,11 +3873,11 @@ define void @test_clc_cld_cmc() optsize {
 ; BDVER2-LABEL: test_clc_cld_cmc:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    clc # sched: [1:0.25]
-; BDVER2-NEXT:    cld # sched: [1:0.33]
-; BDVER2-NEXT:    cmc # sched: [1:0.33]
+; BDVER2-NEXT:    clc # sched: [1:0.50]
+; BDVER2-NEXT:    cld # sched: [1:0.50]
+; BDVER2-NEXT:    cmc # sched: [1:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_clc_cld_cmc:
 ; BTVER2:       # %bb.0:
@@ -4000,14 +4000,14 @@ define void @test_cmp_8(i8 %a0, i8* %a1) optsize {
 ; BDVER2-LABEL: test_cmp_8:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    cmpb $7, %al # sched: [1:0.33]
-; BDVER2-NEXT:    cmpb $7, %dil # sched: [1:0.33]
-; BDVER2-NEXT:    cmpb $7, (%rsi) # sched: [6:0.50]
-; BDVER2-NEXT:    cmpb %dil, %dil # sched: [1:0.33]
-; BDVER2-NEXT:    cmpb %dil, (%rsi) # sched: [6:0.50]
-; BDVER2-NEXT:    cmpb (%rsi), %dil # sched: [6:0.50]
+; BDVER2-NEXT:    cmpb $7, %al # sched: [1:0.50]
+; BDVER2-NEXT:    cmpb $7, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    cmpb $7, (%rsi) # sched: [5:0.50]
+; BDVER2-NEXT:    cmpb %dil, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    cmpb %dil, (%rsi) # sched: [5:0.50]
+; BDVER2-NEXT:    cmpb (%rsi), %dil # sched: [5:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_cmp_8:
 ; BTVER2:       # %bb.0:
@@ -4176,18 +4176,18 @@ define void @test_cmp_16(i16 %a0, i16* %a1) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    cmpw $511, %ax # imm = 0x1FF
-; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    # sched: [1:0.50]
 ; BDVER2-NEXT:    cmpw $511, %di # imm = 0x1FF
-; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    # sched: [1:0.50]
 ; BDVER2-NEXT:    cmpw $511, (%rsi) # imm = 0x1FF
-; BDVER2-NEXT:    # sched: [6:0.50]
-; BDVER2-NEXT:    cmpw $7, %di # sched: [1:0.33]
-; BDVER2-NEXT:    cmpw $7, (%rsi) # sched: [6:0.50]
-; BDVER2-NEXT:    cmpw %di, %di # sched: [1:0.33]
-; BDVER2-NEXT:    cmpw %di, (%rsi) # sched: [6:0.50]
-; BDVER2-NEXT:    cmpw (%rsi), %di # sched: [6:0.50]
+; BDVER2-NEXT:    # sched: [5:0.50]
+; BDVER2-NEXT:    cmpw $7, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmpw $7, (%rsi) # sched: [5:0.50]
+; BDVER2-NEXT:    cmpw %di, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmpw %di, (%rsi) # sched: [5:0.50]
+; BDVER2-NEXT:    cmpw (%rsi), %di # sched: [5:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_cmp_16:
 ; BTVER2:       # %bb.0:
@@ -4366,18 +4366,18 @@ define void @test_cmp_32(i32 %a0, i32* %a1) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    cmpl $665536, %eax # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    # sched: [1:0.50]
 ; BDVER2-NEXT:    cmpl $665536, %edi # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    # sched: [1:0.50]
 ; BDVER2-NEXT:    cmpl $665536, (%rsi) # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [6:0.50]
-; BDVER2-NEXT:    cmpl $7, %edi # sched: [1:0.33]
-; BDVER2-NEXT:    cmpl $7, (%rsi) # sched: [6:0.50]
-; BDVER2-NEXT:    cmpl %edi, %edi # sched: [1:0.33]
-; BDVER2-NEXT:    cmpl %edi, (%rsi) # sched: [6:0.50]
-; BDVER2-NEXT:    cmpl (%rsi), %edi # sched: [6:0.50]
+; BDVER2-NEXT:    # sched: [5:0.50]
+; BDVER2-NEXT:    cmpl $7, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmpl $7, (%rsi) # sched: [5:0.50]
+; BDVER2-NEXT:    cmpl %edi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmpl %edi, (%rsi) # sched: [5:0.50]
+; BDVER2-NEXT:    cmpl (%rsi), %edi # sched: [5:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_cmp_32:
 ; BTVER2:       # %bb.0:
@@ -4556,18 +4556,18 @@ define void @test_cmp_64(i64 %a0, i64* %a1) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    cmpq $665536, %rax # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    # sched: [1:0.50]
 ; BDVER2-NEXT:    cmpq $665536, %rdi # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    # sched: [1:0.50]
 ; BDVER2-NEXT:    cmpq $665536, (%rsi) # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [6:0.50]
-; BDVER2-NEXT:    cmpq $7, %rdi # sched: [1:0.33]
-; BDVER2-NEXT:    cmpq $7, (%rsi) # sched: [6:0.50]
-; BDVER2-NEXT:    cmpq %rdi, %rdi # sched: [1:0.33]
-; BDVER2-NEXT:    cmpq %rdi, (%rsi) # sched: [6:0.50]
-; BDVER2-NEXT:    cmpq (%rsi), %rdi # sched: [6:0.50]
+; BDVER2-NEXT:    # sched: [5:0.50]
+; BDVER2-NEXT:    cmpq $7, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmpq $7, (%rsi) # sched: [5:0.50]
+; BDVER2-NEXT:    cmpq %rdi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmpq %rdi, (%rsi) # sched: [5:0.50]
+; BDVER2-NEXT:    cmpq (%rsi), %rdi # sched: [5:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_cmp_64:
 ; BTVER2:       # %bb.0:
@@ -4690,12 +4690,12 @@ define void @test_cmps() optsize {
 ; BDVER2-LABEL: test_cmps:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    cmpsb %es:(%rdi), (%rsi) # sched: [8:1.00]
-; BDVER2-NEXT:    cmpsw %es:(%rdi), (%rsi) # sched: [8:1.00]
-; BDVER2-NEXT:    cmpsl %es:(%rdi), (%rsi) # sched: [8:1.00]
-; BDVER2-NEXT:    cmpsq %es:(%rdi), (%rsi) # sched: [8:1.00]
+; BDVER2-NEXT:    cmpsb %es:(%rdi), (%rsi) # sched: [100:0.50]
+; BDVER2-NEXT:    cmpsw %es:(%rdi), (%rsi) # sched: [100:0.50]
+; BDVER2-NEXT:    cmpsl %es:(%rdi), (%rsi) # sched: [100:0.50]
+; BDVER2-NEXT:    cmpsq %es:(%rdi), (%rsi) # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_cmps:
 ; BTVER2:       # %bb.0:
@@ -4788,10 +4788,10 @@ define void @test_cmpxchg_8(i8 %a0, i8 %a1, i8 *%a2) optsize {
 ; BDVER2-LABEL: test_cmpxchg_8:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    cmpxchgb %dil, %sil # sched: [5:1.33]
-; BDVER2-NEXT:    cmpxchgb %dil, (%rdx) # sched: [8:2.00]
+; BDVER2-NEXT:    cmpxchgb %dil, %sil # sched: [3:1.00]
+; BDVER2-NEXT:    cmpxchgb %dil, (%rdx) # sched: [3:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_cmpxchg_8:
 ; BTVER2:       # %bb.0:
@@ -4879,10 +4879,10 @@ define void @test_cmpxchg_16(i16 %a0, i16 %a1, i16 *%a2) optsize {
 ; BDVER2-LABEL: test_cmpxchg_16:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    cmpxchgw %di, %si # sched: [5:1.33]
-; BDVER2-NEXT:    cmpxchgw %di, (%rdx) # sched: [8:2.00]
+; BDVER2-NEXT:    cmpxchgw %di, %si # sched: [3:1.00]
+; BDVER2-NEXT:    cmpxchgw %di, (%rdx) # sched: [3:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_cmpxchg_16:
 ; BTVER2:       # %bb.0:
@@ -4970,10 +4970,10 @@ define void @test_cmpxchg_32(i32 %a0, i32 %a1, i32 *%a2) optsize {
 ; BDVER2-LABEL: test_cmpxchg_32:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    cmpxchgl %edi, %esi # sched: [5:1.33]
-; BDVER2-NEXT:    cmpxchgl %edi, (%rdx) # sched: [8:2.00]
+; BDVER2-NEXT:    cmpxchgl %edi, %esi # sched: [3:1.00]
+; BDVER2-NEXT:    cmpxchgl %edi, (%rdx) # sched: [3:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_cmpxchg_32:
 ; BTVER2:       # %bb.0:
@@ -5061,10 +5061,10 @@ define void @test_cmpxchg_64(i64 %a0, i64 %a1, i64 *%a2) optsize {
 ; BDVER2-LABEL: test_cmpxchg_64:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    cmpxchgq %rdi, %rsi # sched: [5:1.33]
-; BDVER2-NEXT:    cmpxchgq %rdi, (%rdx) # sched: [8:2.00]
+; BDVER2-NEXT:    cmpxchgq %rdi, %rsi # sched: [3:1.00]
+; BDVER2-NEXT:    cmpxchgq %rdi, (%rdx) # sched: [3:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_cmpxchg_64:
 ; BTVER2:       # %bb.0:
@@ -5152,10 +5152,10 @@ define void @test_cmpxchg8b_cmpxchg16b(i8 *%a0) optsize {
 ; BDVER2-LABEL: test_cmpxchg8b_cmpxchg16b:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    cmpxchg8b (%rdi) # sched: [6:1.00]
-; BDVER2-NEXT:    cmpxchg16b (%rdi) # sched: [6:1.00]
+; BDVER2-NEXT:    cmpxchg8b (%rdi) # sched: [3:1.00]
+; BDVER2-NEXT:    cmpxchg16b (%rdi) # sched: [3:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_cmpxchg8b_cmpxchg16b:
 ; BTVER2:       # %bb.0:
@@ -5236,9 +5236,9 @@ define void @test_cpuid() optsize {
 ; BDVER2-LABEL: test_cpuid:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    cpuid # sched: [100:0.33]
+; BDVER2-NEXT:    cpuid # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_cpuid:
 ; BTVER2:       # %bb.0:
@@ -5325,10 +5325,10 @@ define void @test_dec8(i8 %a0, i8* %a1) optsize {
 ; BDVER2-LABEL: test_dec8:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    decb %dil # sched: [1:0.33]
-; BDVER2-NEXT:    decb (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    decb %dil # sched: [1:0.50]
+; BDVER2-NEXT:    decb (%rsi) # sched: [6:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_dec8:
 ; BTVER2:       # %bb.0:
@@ -5416,10 +5416,10 @@ define void @test_dec16(i16 %a0, i16* %a1) optsize {
 ; BDVER2-LABEL: test_dec16:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    decw %di # sched: [1:0.33]
-; BDVER2-NEXT:    decw (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    decw %di # sched: [1:0.50]
+; BDVER2-NEXT:    decw (%rsi) # sched: [6:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_dec16:
 ; BTVER2:       # %bb.0:
@@ -5507,10 +5507,10 @@ define void @test_dec32(i32 %a0, i32* %a1) optsize {
 ; BDVER2-LABEL: test_dec32:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    decl %edi # sched: [1:0.33]
-; BDVER2-NEXT:    decl (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    decl %edi # sched: [1:0.50]
+; BDVER2-NEXT:    decl (%rsi) # sched: [6:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_dec32:
 ; BTVER2:       # %bb.0:
@@ -5598,10 +5598,10 @@ define void @test_dec64(i64 %a0, i64* %a1) optsize {
 ; BDVER2-LABEL: test_dec64:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    decq %rdi # sched: [1:0.33]
-; BDVER2-NEXT:    decq (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    decq %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    decq (%rsi) # sched: [6:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_dec64:
 ; BTVER2:       # %bb.0:
@@ -5756,16 +5756,16 @@ define void @test_div(i8 %a0, i16 %a1, i32 %a2, i64 %a3, i8 *%p0, i16 *%p1, i32
 ; BDVER2-NEXT:    movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
 ; BDVER2-NEXT:    movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    divb %dil # sched: [25:10.00]
-; BDVER2-NEXT:    divb (%r8) # sched: [30:10.00]
-; BDVER2-NEXT:    divw %si # sched: [25:10.00]
-; BDVER2-NEXT:    divw (%r9) # sched: [30:10.00]
-; BDVER2-NEXT:    divl %edx # sched: [25:10.00]
-; BDVER2-NEXT:    divl (%rax) # sched: [30:10.00]
-; BDVER2-NEXT:    divq %rcx # sched: [25:10.00]
-; BDVER2-NEXT:    divq (%r10) # sched: [30:10.00]
+; BDVER2-NEXT:    divb %dil # sched: [12:12.00]
+; BDVER2-NEXT:    divb (%r8) # sched: [16:12.00]
+; BDVER2-NEXT:    divw %si # sched: [15:15.00]
+; BDVER2-NEXT:    divw (%r9) # sched: [19:15.00]
+; BDVER2-NEXT:    divl %edx # sched: [14:14.00]
+; BDVER2-NEXT:    divl (%rax) # sched: [18:14.00]
+; BDVER2-NEXT:    divq %rcx # sched: [14:14.00]
+; BDVER2-NEXT:    divq (%r10) # sched: [18:14.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_div:
 ; BTVER2:       # %bb.0:
@@ -5871,9 +5871,9 @@ define void @test_enter() optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    enter $7, $4095 # imm = 0xFFF
-; BDVER2-NEXT:    # sched: [100:0.33]
+; BDVER2-NEXT:    # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_enter:
 ; BTVER2:       # %bb.0:
@@ -6028,16 +6028,16 @@ define void @test_idiv(i8 %a0, i16 %a1, i32 %a2, i64 %a3, i8 *%p0, i16 *%p1, i32
 ; BDVER2-NEXT:    movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
 ; BDVER2-NEXT:    movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    idivb %dil # sched: [25:10.00]
-; BDVER2-NEXT:    idivb (%r8) # sched: [30:10.00]
-; BDVER2-NEXT:    idivw %si # sched: [25:10.00]
-; BDVER2-NEXT:    idivw (%r9) # sched: [30:10.00]
-; BDVER2-NEXT:    idivl %edx # sched: [25:10.00]
-; BDVER2-NEXT:    idivl (%rax) # sched: [30:10.00]
-; BDVER2-NEXT:    idivq %rcx # sched: [25:10.00]
-; BDVER2-NEXT:    idivq (%r10) # sched: [30:10.00]
+; BDVER2-NEXT:    idivb %dil # sched: [12:12.00]
+; BDVER2-NEXT:    idivb (%r8) # sched: [16:12.00]
+; BDVER2-NEXT:    idivw %si # sched: [15:17.00]
+; BDVER2-NEXT:    idivw (%r9) # sched: [19:17.00]
+; BDVER2-NEXT:    idivl %edx # sched: [14:25.00]
+; BDVER2-NEXT:    idivl (%rax) # sched: [18:25.00]
+; BDVER2-NEXT:    idivq %rcx # sched: [14:14.00]
+; BDVER2-NEXT:    idivq (%r10) # sched: [18:14.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_idiv:
 ; BTVER2:       # %bb.0:
@@ -6142,10 +6142,10 @@ define void @test_imul_8(i8 %a0, i8* %a1) optsize {
 ; BDVER2-LABEL: test_imul_8:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    imulb %dil # sched: [3:1.00]
+; BDVER2-NEXT:    imulb %dil # sched: [4:1.00]
 ; BDVER2-NEXT:    imulb (%rsi) # sched: [8:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_imul_8:
 ; BTVER2:       # %bb.0:
@@ -6297,18 +6297,18 @@ define void @test_imul_16(i16 %a0, i16* %a1, i16 %a2) optsize {
 ; BDVER2-LABEL: test_imul_16:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    imulw %di # sched: [4:1.33]
-; BDVER2-NEXT:    imulw (%rsi) # sched: [9:1.33]
-; BDVER2-NEXT:    imulw %dx, %di # sched: [3:1.00]
+; BDVER2-NEXT:    imulw %di # sched: [4:1.00]
+; BDVER2-NEXT:    imulw (%rsi) # sched: [8:1.00]
+; BDVER2-NEXT:    imulw %dx, %di # sched: [4:1.00]
 ; BDVER2-NEXT:    imulw (%rsi), %di # sched: [8:1.00]
 ; BDVER2-NEXT:    imulw $511, %di, %di # imm = 0x1FF
-; BDVER2-NEXT:    # sched: [4:1.00]
+; BDVER2-NEXT:    # sched: [5:1.00]
 ; BDVER2-NEXT:    imulw $511, (%rsi), %di # imm = 0x1FF
-; BDVER2-NEXT:    # sched: [8:1.00]
-; BDVER2-NEXT:    imulw $7, %di, %di # sched: [4:1.00]
-; BDVER2-NEXT:    imulw $7, (%rsi), %di # sched: [8:1.00]
+; BDVER2-NEXT:    # sched: [9:1.00]
+; BDVER2-NEXT:    imulw $7, %di, %di # sched: [5:1.00]
+; BDVER2-NEXT:    imulw $7, (%rsi), %di # sched: [9:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_imul_16:
 ; BTVER2:       # %bb.0:
@@ -6477,17 +6477,17 @@ define void @test_imul_32(i32 %a0, i32* %a1, i32 %a2) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    imull %edi # sched: [4:1.00]
-; BDVER2-NEXT:    imull (%rsi) # sched: [9:1.00]
-; BDVER2-NEXT:    imull %edx, %edi # sched: [3:1.00]
+; BDVER2-NEXT:    imull (%rsi) # sched: [8:1.00]
+; BDVER2-NEXT:    imull %edx, %edi # sched: [4:1.00]
 ; BDVER2-NEXT:    imull (%rsi), %edi # sched: [8:1.00]
 ; BDVER2-NEXT:    imull $665536, %edi, %edi # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [3:1.00]
+; BDVER2-NEXT:    # sched: [4:1.00]
 ; BDVER2-NEXT:    imull $665536, (%rsi), %edi # imm = 0xA27C0
 ; BDVER2-NEXT:    # sched: [8:1.00]
-; BDVER2-NEXT:    imull $7, %edi, %edi # sched: [3:1.00]
+; BDVER2-NEXT:    imull $7, %edi, %edi # sched: [4:1.00]
 ; BDVER2-NEXT:    imull $7, (%rsi), %edi # sched: [8:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_imul_32:
 ; BTVER2:       # %bb.0:
@@ -6655,18 +6655,18 @@ define void @test_imul_64(i64 %a0, i64* %a1, i64 %a2) optsize {
 ; BDVER2-LABEL: test_imul_64:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    imulq %rdi # sched: [4:1.00]
-; BDVER2-NEXT:    imulq (%rsi) # sched: [9:1.00]
-; BDVER2-NEXT:    imulq %rdx, %rdi # sched: [3:1.00]
-; BDVER2-NEXT:    imulq (%rsi), %rdi # sched: [8:1.00]
+; BDVER2-NEXT:    imulq %rdi # sched: [6:4.00]
+; BDVER2-NEXT:    imulq (%rsi) # sched: [10:4.00]
+; BDVER2-NEXT:    imulq %rdx, %rdi # sched: [6:4.00]
+; BDVER2-NEXT:    imulq (%rsi), %rdi # sched: [10:4.00]
 ; BDVER2-NEXT:    imulq $665536, %rdi, %rdi # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [3:1.00]
+; BDVER2-NEXT:    # sched: [6:4.00]
 ; BDVER2-NEXT:    imulq $665536, (%rsi), %rdi # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [8:1.00]
-; BDVER2-NEXT:    imulq $7, %rdi, %rdi # sched: [3:1.00]
-; BDVER2-NEXT:    imulq $7, (%rsi), %rdi # sched: [8:1.00]
+; BDVER2-NEXT:    # sched: [10:4.00]
+; BDVER2-NEXT:    imulq $7, %rdi, %rdi # sched: [6:4.00]
+; BDVER2-NEXT:    imulq $7, (%rsi), %rdi # sched: [10:4.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_imul_64:
 ; BTVER2:       # %bb.0:
@@ -6803,14 +6803,14 @@ define void @test_in() optsize {
 ; BDVER2-LABEL: test_in:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    inb $7, %al # sched: [100:0.33]
-; BDVER2-NEXT:    inw $7, %ax # sched: [100:0.33]
-; BDVER2-NEXT:    inl $7, %eax # sched: [100:0.33]
-; BDVER2-NEXT:    inb %dx, %al # sched: [100:0.33]
-; BDVER2-NEXT:    inw %dx, %ax # sched: [100:0.33]
-; BDVER2-NEXT:    inl %dx, %eax # sched: [100:0.33]
+; BDVER2-NEXT:    inb $7, %al # sched: [100:0.50]
+; BDVER2-NEXT:    inw $7, %ax # sched: [100:0.50]
+; BDVER2-NEXT:    inl $7, %eax # sched: [100:0.50]
+; BDVER2-NEXT:    inb %dx, %al # sched: [100:0.50]
+; BDVER2-NEXT:    inw %dx, %ax # sched: [100:0.50]
+; BDVER2-NEXT:    inl %dx, %eax # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_in:
 ; BTVER2:       # %bb.0:
@@ -6907,10 +6907,10 @@ define void @test_inc8(i8 %a0, i8* %a1) optsize {
 ; BDVER2-LABEL: test_inc8:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    incb %dil # sched: [1:0.33]
-; BDVER2-NEXT:    incb (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    incb %dil # sched: [1:0.50]
+; BDVER2-NEXT:    incb (%rsi) # sched: [6:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_inc8:
 ; BTVER2:       # %bb.0:
@@ -6998,10 +6998,10 @@ define void @test_inc16(i16 %a0, i16* %a1) optsize {
 ; BDVER2-LABEL: test_inc16:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    incw %di # sched: [1:0.33]
-; BDVER2-NEXT:    incw (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    incw %di # sched: [1:0.50]
+; BDVER2-NEXT:    incw (%rsi) # sched: [6:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_inc16:
 ; BTVER2:       # %bb.0:
@@ -7089,10 +7089,10 @@ define void @test_inc32(i32 %a0, i32* %a1) optsize {
 ; BDVER2-LABEL: test_inc32:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    incl %edi # sched: [1:0.33]
-; BDVER2-NEXT:    incl (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    incl %edi # sched: [1:0.50]
+; BDVER2-NEXT:    incl (%rsi) # sched: [6:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_inc32:
 ; BTVER2:       # %bb.0:
@@ -7180,10 +7180,10 @@ define void @test_inc64(i64 %a0, i64* %a1) optsize {
 ; BDVER2-LABEL: test_inc64:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    incq %rdi # sched: [1:0.33]
-; BDVER2-NEXT:    incq (%rsi) # sched: [7:1.00]
+; BDVER2-NEXT:    incq %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    incq (%rsi) # sched: [6:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_inc64:
 ; BTVER2:       # %bb.0:
@@ -7280,11 +7280,11 @@ define void @test_ins() optsize {
 ; BDVER2-LABEL: test_ins:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    insb %dx, %es:(%rdi) # sched: [100:0.33]
-; BDVER2-NEXT:    insw %dx, %es:(%rdi) # sched: [100:0.33]
-; BDVER2-NEXT:    insl %dx, %es:(%rdi) # sched: [100:0.33]
+; BDVER2-NEXT:    insb %dx, %es:(%rdi) # sched: [100:0.50]
+; BDVER2-NEXT:    insw %dx, %es:(%rdi) # sched: [100:0.50]
+; BDVER2-NEXT:    insl %dx, %es:(%rdi) # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_ins:
 ; BTVER2:       # %bb.0:
@@ -7367,9 +7367,9 @@ define void @test_int() optsize {
 ; BDVER2-LABEL: test_int:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    int $7 # sched: [100:0.33]
+; BDVER2-NEXT:    int $7 # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_int:
 ; BTVER2:       # %bb.0:
@@ -7456,10 +7456,10 @@ define void @test_invlpg_invlpga(i8 *%a0) optsize {
 ; BDVER2-LABEL: test_invlpg_invlpga:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    invlpg (%rdi) # sched: [100:0.33]
-; BDVER2-NEXT:    invlpga %rax, %ecx # sched: [100:0.33]
+; BDVER2-NEXT:    invlpg (%rdi) # sched: [100:0.50]
+; BDVER2-NEXT:    invlpga %rax, %ecx # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_invlpg_invlpga:
 ; BTVER2:       # %bb.0:
@@ -7812,7 +7812,7 @@ define void @test_jcc() optsize {
 ; BDVER2-NEXT:    jg JCCTGT # sched: [1:1.00]
 ; BDVER2-NEXT:    jg JCCTGT # sched: [1:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_jcc:
 ; BTVER2:       # %bb.0:
@@ -7968,10 +7968,10 @@ define void @test_jecxz_jrcxz() optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:  JXTGT:
-; BDVER2-NEXT:    jecxz JXTGT # sched: [2:1.00]
-; BDVER2-NEXT:    jrcxz JXTGT # sched: [2:1.00]
+; BDVER2-NEXT:    jecxz JXTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jrcxz JXTGT # sched: [1:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_jecxz_jrcxz:
 ; BTVER2:       # %bb.0:
@@ -8064,10 +8064,10 @@ define void @test_lahf_sahf() optsize {
 ; BDVER2-LABEL: test_lahf_sahf:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    lahf # sched: [1:0.50]
-; BDVER2-NEXT:    sahf # sched: [1:0.50]
+; BDVER2-NEXT:    lahf # sched: [2:0.50]
+; BDVER2-NEXT:    sahf # sched: [2:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_lahf_sahf:
 ; BTVER2:       # %bb.0:
@@ -8156,9 +8156,9 @@ define void @test_leave() optsize {
 ; BDVER2-LABEL: test_leave:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    leave # sched: [7:0.67]
+; BDVER2-NEXT:    leave # sched: [1:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_leave:
 ; BTVER2:       # %bb.0:
@@ -8261,12 +8261,12 @@ define void @test_lods() optsize {
 ; BDVER2-LABEL: test_lods:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    lodsb (%rsi), %al # sched: [7:0.67]
-; BDVER2-NEXT:    lodsw (%rsi), %ax # sched: [7:0.67]
-; BDVER2-NEXT:    lodsl (%rsi), %eax # sched: [6:0.50]
-; BDVER2-NEXT:    lodsq (%rsi), %rax # sched: [6:0.50]
+; BDVER2-NEXT:    lodsb (%rsi), %al # sched: [100:0.50]
+; BDVER2-NEXT:    lodsw (%rsi), %ax # sched: [100:0.50]
+; BDVER2-NEXT:    lodsl (%rsi), %eax # sched: [100:0.50]
+; BDVER2-NEXT:    lodsq (%rsi), %rax # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_lods:
 ; BTVER2:       # %bb.0:
@@ -8380,7 +8380,7 @@ define void @test_loop() optsize {
 ; BDVER2-NEXT:    loope LTGT # sched: [1:1.00]
 ; BDVER2-NEXT:    loopne LTGT # sched: [1:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_loop:
 ; BTVER2:       # %bb.0:
@@ -8475,10 +8475,10 @@ define void @test_movnti(i32 %a0, i32 *%a1, i64 %a2, i64 *%a3) optsize {
 ; BDVER2-LABEL: test_movnti:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    movntil %edi, (%rsi) # sched: [1:1.00]
-; BDVER2-NEXT:    movntiq %rdx, (%rcx) # sched: [1:1.00]
+; BDVER2-NEXT:    movntil %edi, (%rsi) # sched: [1:0.50]
+; BDVER2-NEXT:    movntiq %rdx, (%rcx) # sched: [1:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_movnti:
 ; BTVER2:       # %bb.0:
@@ -8583,12 +8583,12 @@ define void @test_movs() optsize {
 ; BDVER2-LABEL: test_movs:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    movsb (%rsi), %es:(%rdi) # sched: [8:1.00]
-; BDVER2-NEXT:    movsw (%rsi), %es:(%rdi) # sched: [8:1.00]
-; BDVER2-NEXT:    movsl (%rsi), %es:(%rdi) # sched: [8:1.00]
-; BDVER2-NEXT:    movsq (%rsi), %es:(%rdi) # sched: [8:1.00]
+; BDVER2-NEXT:    movsb (%rsi), %es:(%rdi) # sched: [100:0.50]
+; BDVER2-NEXT:    movsw (%rsi), %es:(%rdi) # sched: [100:0.50]
+; BDVER2-NEXT:    movsl (%rsi), %es:(%rdi) # sched: [100:0.50]
+; BDVER2-NEXT:    movsq (%rsi), %es:(%rdi) # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_movs:
 ; BTVER2:       # %bb.0:
@@ -8692,11 +8692,11 @@ define i64 @test_movslq(i32 %a0, i32 *%a1) optsize {
 ; BDVER2-LABEL: test_movslq:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    movslq %edi, %rax # sched: [1:0.33]
+; BDVER2-NEXT:    movslq %edi, %rax # sched: [1:0.50]
 ; BDVER2-NEXT:    movslq (%rsi), %rcx # sched: [5:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    orq %rcx, %rax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    orq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_movslq:
 ; BTVER2:       # %bb.0:
@@ -8856,16 +8856,16 @@ define void @test_mul(i8 %a0, i16 %a1, i32 %a2, i64 %a3, i8 *%p0, i16 *%p1, i32
 ; BDVER2-NEXT:    movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
 ; BDVER2-NEXT:    movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    mulb %dil # sched: [3:1.00]
+; BDVER2-NEXT:    mulb %dil # sched: [4:1.00]
 ; BDVER2-NEXT:    mulb (%r8) # sched: [8:1.00]
-; BDVER2-NEXT:    mulw %si # sched: [4:1.33]
-; BDVER2-NEXT:    mulw (%r9) # sched: [9:1.33]
+; BDVER2-NEXT:    mulw %si # sched: [4:1.00]
+; BDVER2-NEXT:    mulw (%r9) # sched: [8:1.00]
 ; BDVER2-NEXT:    mull %edx # sched: [4:1.00]
-; BDVER2-NEXT:    mull (%rax) # sched: [9:1.00]
-; BDVER2-NEXT:    mulq %rcx # sched: [4:1.00]
-; BDVER2-NEXT:    mulq (%r10) # sched: [9:1.00]
+; BDVER2-NEXT:    mull (%rax) # sched: [8:1.00]
+; BDVER2-NEXT:    mulq %rcx # sched: [6:4.00]
+; BDVER2-NEXT:    mulq (%r10) # sched: [10:4.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_mul:
 ; BTVER2:       # %bb.0:
@@ -9036,16 +9036,16 @@ define void @test_neg(i8 %a0, i16 %a1, i32 %a2, i64 %a3, i8 *%p0, i16 *%p1, i32
 ; BDVER2-NEXT:    movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
 ; BDVER2-NEXT:    movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    negb %dil # sched: [1:0.33]
-; BDVER2-NEXT:    negb (%r8) # sched: [7:1.00]
-; BDVER2-NEXT:    negw %si # sched: [1:0.33]
-; BDVER2-NEXT:    negw (%r9) # sched: [7:1.00]
-; BDVER2-NEXT:    negl %edx # sched: [1:0.33]
-; BDVER2-NEXT:    negl (%rax) # sched: [7:1.00]
-; BDVER2-NEXT:    negq %rcx # sched: [1:0.33]
-; BDVER2-NEXT:    negq (%r10) # sched: [7:1.00]
+; BDVER2-NEXT:    negb %dil # sched: [1:0.50]
+; BDVER2-NEXT:    negb (%r8) # sched: [6:1.00]
+; BDVER2-NEXT:    negw %si # sched: [1:0.50]
+; BDVER2-NEXT:    negw (%r9) # sched: [6:1.00]
+; BDVER2-NEXT:    negl %edx # sched: [1:0.50]
+; BDVER2-NEXT:    negl (%rax) # sched: [6:1.00]
+; BDVER2-NEXT:    negq %rcx # sched: [1:0.50]
+; BDVER2-NEXT:    negq (%r10) # sched: [6:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_neg:
 ; BTVER2:       # %bb.0:
@@ -9190,15 +9190,15 @@ define void @test_nop(i16 %a0, i32 %a1, i64 %a2, i16 *%p0, i32 *%p1, i64 *%p2) o
 ; BDVER2-LABEL: test_nop:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    nop # sched: [1:0.25]
-; BDVER2-NEXT:    nopw %di # sched: [1:0.25]
-; BDVER2-NEXT:    nopw (%rcx) # sched: [1:0.25]
-; BDVER2-NEXT:    nopl %esi # sched: [1:0.25]
-; BDVER2-NEXT:    nopl (%r8) # sched: [1:0.25]
-; BDVER2-NEXT:    nopq %rdx # sched: [1:0.25]
-; BDVER2-NEXT:    nopq (%r9) # sched: [1:0.25]
+; BDVER2-NEXT:    nop # sched: [1:0.50]
+; BDVER2-NEXT:    nopw %di # sched: [1:0.50]
+; BDVER2-NEXT:    nopw (%rcx) # sched: [1:0.50]
+; BDVER2-NEXT:    nopl %esi # sched: [1:0.50]
+; BDVER2-NEXT:    nopl (%r8) # sched: [1:0.50]
+; BDVER2-NEXT:    nopq %rdx # sched: [1:0.50]
+; BDVER2-NEXT:    nopq (%r9) # sched: [1:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_nop:
 ; BTVER2:       # %bb.0:
@@ -9363,16 +9363,16 @@ define void @test_not(i8 %a0, i16 %a1, i32 %a2, i64 %a3, i8 *%p0, i16 *%p1, i32
 ; BDVER2-NEXT:    movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
 ; BDVER2-NEXT:    movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    notb %dil # sched: [1:0.33]
-; BDVER2-NEXT:    notb (%r8) # sched: [7:1.00]
-; BDVER2-NEXT:    notw %si # sched: [1:0.33]
-; BDVER2-NEXT:    notw (%r9) # sched: [7:1.00]
-; BDVER2-NEXT:    notl %edx # sched: [1:0.33]
-; BDVER2-NEXT:    notl (%rax) # sched: [7:1.00]
-; BDVER2-NEXT:    notq %rcx # sched: [1:0.33]
-; BDVER2-NEXT:    notq (%r10) # sched: [7:1.00]
+; BDVER2-NEXT:    notb %dil # sched: [1:0.50]
+; BDVER2-NEXT:    notb (%r8) # sched: [6:1.00]
+; BDVER2-NEXT:    notw %si # sched: [1:0.50]
+; BDVER2-NEXT:    notw (%r9) # sched: [6:1.00]
+; BDVER2-NEXT:    notl %edx # sched: [1:0.50]
+; BDVER2-NEXT:    notl (%rax) # sched: [6:1.00]
+; BDVER2-NEXT:    notq %rcx # sched: [1:0.50]
+; BDVER2-NEXT:    notq (%r10) # sched: [6:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_not:
 ; BTVER2:       # %bb.0:
@@ -9509,14 +9509,14 @@ define void @test_or_8(i8 %a0, i8* %a1, i8 %a2) optsize {
 ; BDVER2-LABEL: test_or_8:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    orb $7, %al # sched: [1:0.33]
-; BDVER2-NEXT:    orb $7, %dil # sched: [1:0.33]
-; BDVER2-NEXT:    orb $7, (%rsi) # sched: [7:1.00]
-; BDVER2-NEXT:    orb %dl, %dil # sched: [1:0.33]
-; BDVER2-NEXT:    orb %dil, (%rsi) # sched: [7:1.00]
-; BDVER2-NEXT:    orb (%rsi), %dil # sched: [6:0.50]
+; BDVER2-NEXT:    orb $7, %al # sched: [1:0.50]
+; BDVER2-NEXT:    orb $7, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    orb $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    orb %dl, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    orb %dil, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    orb (%rsi), %dil # sched: [5:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_or_8:
 ; BTVER2:       # %bb.0:
@@ -9685,18 +9685,18 @@ define void @test_or_16(i16 %a0, i16* %a1, i16 %a2) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    orw $511, %ax # imm = 0x1FF
-; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    # sched: [1:0.50]
 ; BDVER2-NEXT:    orw $511, %di # imm = 0x1FF
-; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    # sched: [1:0.50]
 ; BDVER2-NEXT:    orw $511, (%rsi) # imm = 0x1FF
-; BDVER2-NEXT:    # sched: [7:1.00]
-; BDVER2-NEXT:    orw $7, %di # sched: [1:0.33]
-; BDVER2-NEXT:    orw $7, (%rsi) # sched: [7:1.00]
-; BDVER2-NEXT:    orw %dx, %di # sched: [1:0.33]
-; BDVER2-NEXT:    orw %di, (%rsi) # sched: [7:1.00]
-; BDVER2-NEXT:    orw (%rsi), %di # sched: [6:0.50]
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    orw $7, %di # sched: [1:0.50]
+; BDVER2-NEXT:    orw $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    orw %dx, %di # sched: [1:0.50]
+; BDVER2-NEXT:    orw %di, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    orw (%rsi), %di # sched: [5:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_or_16:
 ; BTVER2:       # %bb.0:
@@ -9875,18 +9875,18 @@ define void @test_or_32(i32 %a0, i32* %a1, i32 %a2) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    orl $665536, %eax # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    # sched: [1:0.50]
 ; BDVER2-NEXT:    orl $665536, %edi # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    # sched: [1:0.50]
 ; BDVER2-NEXT:    orl $665536, (%rsi) # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [7:1.00]
-; BDVER2-NEXT:    orl $7, %edi # sched: [1:0.33]
-; BDVER2-NEXT:    orl $7, (%rsi) # sched: [7:1.00]
-; BDVER2-NEXT:    orl %edx, %edi # sched: [1:0.33]
-; BDVER2-NEXT:    orl %edi, (%rsi) # sched: [7:1.00]
-; BDVER2-NEXT:    orl (%rsi), %edi # sched: [6:0.50]
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    orl $7, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    orl $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    orl %edx, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    orl %edi, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    orl (%rsi), %edi # sched: [5:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_or_32:
 ; BTVER2:       # %bb.0:
@@ -10065,18 +10065,18 @@ define void @test_or_64(i64 %a0, i64* %a1, i64 %a2) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    orq $665536, %rax # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    # sched: [1:0.50]
 ; BDVER2-NEXT:    orq $665536, %rdi # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    # sched: [1:0.50]
 ; BDVER2-NEXT:    orq $665536, (%rsi) # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [7:1.00]
-; BDVER2-NEXT:    orq $7, %rdi # sched: [1:0.33]
-; BDVER2-NEXT:    orq $7, (%rsi) # sched: [7:1.00]
-; BDVER2-NEXT:    orq %rdx, %rdi # sched: [1:0.33]
-; BDVER2-NEXT:    orq %rdi, (%rsi) # sched: [7:1.00]
-; BDVER2-NEXT:    orq (%rsi), %rdi # sched: [6:0.50]
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    orq $7, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    orq $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    orq %rdx, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    orq %rdi, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    orq (%rsi), %rdi # sched: [5:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_or_64:
 ; BTVER2:       # %bb.0:
@@ -10215,14 +10215,14 @@ define void @test_out() optsize {
 ; BDVER2-LABEL: test_out:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    outb %al, $7 # sched: [100:0.33]
-; BDVER2-NEXT:    outw %ax, $7 # sched: [100:0.33]
-; BDVER2-NEXT:    outl %eax, $7 # sched: [100:0.33]
-; BDVER2-NEXT:    outb %al, %dx # sched: [100:0.33]
-; BDVER2-NEXT:    outw %ax, %dx # sched: [100:0.33]
-; BDVER2-NEXT:    outl %eax, %dx # sched: [100:0.33]
+; BDVER2-NEXT:    outb %al, $7 # sched: [100:0.50]
+; BDVER2-NEXT:    outw %ax, $7 # sched: [100:0.50]
+; BDVER2-NEXT:    outl %eax, $7 # sched: [100:0.50]
+; BDVER2-NEXT:    outb %al, %dx # sched: [100:0.50]
+; BDVER2-NEXT:    outw %ax, %dx # sched: [100:0.50]
+; BDVER2-NEXT:    outl %eax, %dx # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_out:
 ; BTVER2:       # %bb.0:
@@ -10327,11 +10327,11 @@ define void @test_outs() optsize {
 ; BDVER2-LABEL: test_outs:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    outsb (%rsi), %dx # sched: [100:0.33]
-; BDVER2-NEXT:    outsw (%rsi), %dx # sched: [100:0.33]
-; BDVER2-NEXT:    outsl (%rsi), %dx # sched: [100:0.33]
+; BDVER2-NEXT:    outsb (%rsi), %dx # sched: [100:0.50]
+; BDVER2-NEXT:    outsw (%rsi), %dx # sched: [100:0.50]
+; BDVER2-NEXT:    outsl (%rsi), %dx # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_outs:
 ; BTVER2:       # %bb.0:
@@ -10414,9 +10414,9 @@ define void @test_pause() optsize {
 ; BDVER2-LABEL: test_pause:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    pause # sched: [4:1.33]
+; BDVER2-NEXT:    pause # sched: [1:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_pause:
 ; BTVER2:       # %bb.0:
@@ -10519,12 +10519,12 @@ define void @test_pop_push() optsize {
 ; BDVER2-LABEL: test_pop_push:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    popq %fs # sched: [100:0.33]
-; BDVER2-NEXT:    popq %gs # sched: [100:0.33]
-; BDVER2-NEXT:    pushq %fs # sched: [3:1.00]
-; BDVER2-NEXT:    pushq %gs # sched: [5:1.00]
+; BDVER2-NEXT:    popq %fs # sched: [100:0.50]
+; BDVER2-NEXT:    popq %gs # sched: [100:0.50]
+; BDVER2-NEXT:    pushq %fs # sched: [100:0.50]
+; BDVER2-NEXT:    pushq %gs # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_pop_push:
 ; BTVER2:       # %bb.0:
@@ -10656,15 +10656,15 @@ define i16 @test_pop_push_16(i16 %a0, i16 *%a1) optsize {
 ; BDVER2-LABEL: test_pop_push_16:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    popw %ax # sched: [6:0.50]
-; BDVER2-NEXT:    popw (%rsi) # sched: [6:0.50]
-; BDVER2-NEXT:    pushw %di # sched: [5:1.00]
-; BDVER2-NEXT:    pushw (%rsi) # sched: [5:1.00]
+; BDVER2-NEXT:    popw %ax # sched: [5:0.50]
+; BDVER2-NEXT:    popw (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    pushw %di # sched: [1:0.50]
+; BDVER2-NEXT:    pushw (%rsi) # sched: [6:1.00]
 ; BDVER2-NEXT:    pushw $4095 # imm = 0xFFF
-; BDVER2-NEXT:    # sched: [1:1.00]
-; BDVER2-NEXT:    pushw $7 # sched: [1:1.00]
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    pushw $7 # sched: [1:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_pop_push_16:
 ; BTVER2:       # %bb.0:
@@ -10802,15 +10802,15 @@ define i64 @test_pop_push_64(i64 %a0, i64 *%a1) optsize {
 ; BDVER2-LABEL: test_pop_push_64:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    popq %rax # sched: [6:0.50]
-; BDVER2-NEXT:    popq (%rsi) # sched: [6:0.50]
-; BDVER2-NEXT:    pushq %rdi # sched: [5:1.00]
-; BDVER2-NEXT:    pushq (%rsi) # sched: [5:1.00]
+; BDVER2-NEXT:    popq %rax # sched: [5:0.50]
+; BDVER2-NEXT:    popq (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    pushq %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    pushq (%rsi) # sched: [6:1.00]
 ; BDVER2-NEXT:    pushq $4095 # imm = 0xFFF
-; BDVER2-NEXT:    # sched: [1:1.00]
-; BDVER2-NEXT:    pushq $7 # sched: [5:1.00]
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    pushq $7 # sched: [1:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_pop_push_64:
 ; BTVER2:       # %bb.0:
@@ -10910,9 +10910,9 @@ define void @test_popf_pushf() optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    popfq # sched: [5:0.50]
-; BDVER2-NEXT:    pushfq # sched: [5:1.00]
+; BDVER2-NEXT:    pushfq # sched: [1:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_popf_pushf:
 ; BTVER2:       # %bb.0:
@@ -11081,20 +11081,20 @@ define void @test_rcl_rcr_8(i8 %a0, i8 %a1, i8 *%a2) optsize {
 ; BDVER2-LABEL: test_rcl_rcr_8:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    rclb %dil # sched: [2:1.50]
-; BDVER2-NEXT:    rcrb %dil # sched: [2:1.50]
-; BDVER2-NEXT:    rclb (%rdx) # sched: [11:3.50]
-; BDVER2-NEXT:    rcrb (%rdx) # sched: [11:3.50]
-; BDVER2-NEXT:    rclb $7, %dil # sched: [5:4.00]
-; BDVER2-NEXT:    rcrb $7, %dil # sched: [5:4.00]
-; BDVER2-NEXT:    rclb $7, (%rdx) # sched: [11:3.50]
-; BDVER2-NEXT:    rcrb $7, (%rdx) # sched: [11:3.50]
-; BDVER2-NEXT:    rclb %cl, %dil # sched: [5:4.00]
-; BDVER2-NEXT:    rcrb %cl, %dil # sched: [5:4.00]
-; BDVER2-NEXT:    rclb %cl, (%rdx) # sched: [11:3.50]
-; BDVER2-NEXT:    rcrb %cl, (%rdx) # sched: [11:3.50]
+; BDVER2-NEXT:    rclb %dil # sched: [1:0.50]
+; BDVER2-NEXT:    rcrb %dil # sched: [1:0.50]
+; BDVER2-NEXT:    rclb (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rcrb (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rclb $7, %dil # sched: [13:0.50]
+; BDVER2-NEXT:    rcrb $7, %dil # sched: [12:0.50]
+; BDVER2-NEXT:    rclb $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rcrb $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rclb %cl, %dil # sched: [12:0.50]
+; BDVER2-NEXT:    rcrb %cl, %dil # sched: [11:0.50]
+; BDVER2-NEXT:    rclb %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rcrb %cl, (%rdx) # sched: [5:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_rcl_rcr_8:
 ; BTVER2:       # %bb.0:
@@ -11282,20 +11282,20 @@ define void @test_rcl_rcr_16(i16 %a0, i16 %a1, i16 *%a2) optsize {
 ; BDVER2-LABEL: test_rcl_rcr_16:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    rclw %di # sched: [2:1.50]
-; BDVER2-NEXT:    rcrw %di # sched: [2:1.50]
-; BDVER2-NEXT:    rclw (%rdx) # sched: [11:3.50]
-; BDVER2-NEXT:    rcrw (%rdx) # sched: [11:3.50]
-; BDVER2-NEXT:    rclw $7, %di # sched: [5:4.00]
-; BDVER2-NEXT:    rcrw $7, %di # sched: [5:4.00]
-; BDVER2-NEXT:    rclw $7, (%rdx) # sched: [11:3.50]
-; BDVER2-NEXT:    rcrw $7, (%rdx) # sched: [11:3.50]
-; BDVER2-NEXT:    rclw %cl, %di # sched: [5:4.00]
-; BDVER2-NEXT:    rcrw %cl, %di # sched: [5:4.00]
-; BDVER2-NEXT:    rclw %cl, (%rdx) # sched: [11:3.50]
-; BDVER2-NEXT:    rcrw %cl, (%rdx) # sched: [11:3.50]
+; BDVER2-NEXT:    rclw %di # sched: [1:0.50]
+; BDVER2-NEXT:    rcrw %di # sched: [1:0.50]
+; BDVER2-NEXT:    rclw (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rcrw (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rclw $7, %di # sched: [11:0.50]
+; BDVER2-NEXT:    rcrw $7, %di # sched: [10:0.50]
+; BDVER2-NEXT:    rclw $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rcrw $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rclw %cl, %di # sched: [10:0.50]
+; BDVER2-NEXT:    rcrw %cl, %di # sched: [9:0.50]
+; BDVER2-NEXT:    rclw %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rcrw %cl, (%rdx) # sched: [5:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_rcl_rcr_16:
 ; BTVER2:       # %bb.0:
@@ -11483,20 +11483,20 @@ define void @test_rcl_rcr_32(i32 %a0, i32 %a1, i32 *%a2) optsize {
 ; BDVER2-LABEL: test_rcl_rcr_32:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    rcll %edi # sched: [2:1.50]
-; BDVER2-NEXT:    rcrl %edi # sched: [2:1.50]
-; BDVER2-NEXT:    rcll (%rdx) # sched: [11:3.50]
-; BDVER2-NEXT:    rcrl (%rdx) # sched: [11:3.50]
-; BDVER2-NEXT:    rcll $7, %edi # sched: [5:4.00]
-; BDVER2-NEXT:    rcrl $7, %edi # sched: [5:4.00]
-; BDVER2-NEXT:    rcll $7, (%rdx) # sched: [11:3.50]
-; BDVER2-NEXT:    rcrl $7, (%rdx) # sched: [11:3.50]
-; BDVER2-NEXT:    rcll %cl, %edi # sched: [5:4.00]
-; BDVER2-NEXT:    rcrl %cl, %edi # sched: [5:4.00]
-; BDVER2-NEXT:    rcll %cl, (%rdx) # sched: [11:3.50]
-; BDVER2-NEXT:    rcrl %cl, (%rdx) # sched: [11:3.50]
+; BDVER2-NEXT:    rcll %edi # sched: [1:0.50]
+; BDVER2-NEXT:    rcrl %edi # sched: [1:0.50]
+; BDVER2-NEXT:    rcll (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rcrl (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rcll $7, %edi # sched: [8:0.50]
+; BDVER2-NEXT:    rcrl $7, %edi # sched: [7:0.50]
+; BDVER2-NEXT:    rcll $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rcrl $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rcll %cl, %edi # sched: [7:0.50]
+; BDVER2-NEXT:    rcrl %cl, %edi # sched: [7:0.50]
+; BDVER2-NEXT:    rcll %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rcrl %cl, (%rdx) # sched: [5:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_rcl_rcr_32:
 ; BTVER2:       # %bb.0:
@@ -11684,20 +11684,20 @@ define void @test_rcl_rcr_64(i64 %a0, i64 %a1, i64 *%a2) optsize {
 ; BDVER2-LABEL: test_rcl_rcr_64:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    rclq %rdi # sched: [2:1.50]
-; BDVER2-NEXT:    rcrq %rdi # sched: [2:1.50]
-; BDVER2-NEXT:    rclq (%rdx) # sched: [11:3.50]
-; BDVER2-NEXT:    rcrq (%rdx) # sched: [11:3.50]
-; BDVER2-NEXT:    rclq $7, %rdi # sched: [5:4.00]
-; BDVER2-NEXT:    rcrq $7, %rdi # sched: [5:4.00]
-; BDVER2-NEXT:    rclq $7, (%rdx) # sched: [11:3.50]
-; BDVER2-NEXT:    rcrq $7, (%rdx) # sched: [11:3.50]
-; BDVER2-NEXT:    rclq %cl, %rdi # sched: [5:4.00]
-; BDVER2-NEXT:    rcrq %cl, %rdi # sched: [5:4.00]
-; BDVER2-NEXT:    rclq %cl, (%rdx) # sched: [11:3.50]
-; BDVER2-NEXT:    rcrq %cl, (%rdx) # sched: [11:3.50]
+; BDVER2-NEXT:    rclq %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    rcrq %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    rclq (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rcrq (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rclq $7, %rdi # sched: [8:0.50]
+; BDVER2-NEXT:    rcrq $7, %rdi # sched: [7:0.50]
+; BDVER2-NEXT:    rclq $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rcrq $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rclq %cl, %rdi # sched: [7:0.50]
+; BDVER2-NEXT:    rcrq %cl, %rdi # sched: [7:0.50]
+; BDVER2-NEXT:    rclq %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rcrq %cl, (%rdx) # sched: [5:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_rcl_rcr_64:
 ; BTVER2:       # %bb.0:
@@ -11806,10 +11806,10 @@ define void @test_rdmsr_wrmsr() optsize {
 ; BDVER2-LABEL: test_rdmsr_wrmsr:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    rdmsr # sched: [100:0.33]
-; BDVER2-NEXT:    wrmsr # sched: [100:0.33]
+; BDVER2-NEXT:    rdmsr # sched: [100:0.50]
+; BDVER2-NEXT:    wrmsr # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_rdmsr_wrmsr:
 ; BTVER2:       # %bb.0:
@@ -11890,9 +11890,9 @@ define void @test_rdpmc() optsize {
 ; BDVER2-LABEL: test_rdpmc:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    rdpmc # sched: [100:0.33]
+; BDVER2-NEXT:    rdpmc # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_rdpmc:
 ; BTVER2:       # %bb.0:
@@ -11979,10 +11979,10 @@ define void @test_rdtsc_rdtscp() optsize {
 ; BDVER2-LABEL: test_rdtsc_rdtscp:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    rdtsc # sched: [100:0.33]
-; BDVER2-NEXT:    rdtscp # sched: [100:0.33]
+; BDVER2-NEXT:    rdtsc # sched: [100:0.50]
+; BDVER2-NEXT:    rdtscp # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_rdtsc_rdtscp:
 ; BTVER2:       # %bb.0:
@@ -12103,14 +12103,14 @@ define void @test_ret() optsize {
 ; BDVER2-LABEL: test_ret:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ; BDVER2-NEXT:    retq $4095 # imm = 0xFFF
-; BDVER2-NEXT:    # sched: [6:1.00]
-; BDVER2-NEXT:    lretl # sched: [6:1.00]
+; BDVER2-NEXT:    # sched: [5:1.00]
+; BDVER2-NEXT:    lretl # sched: [5:1.00]
 ; BDVER2-NEXT:    lretl $4095 # imm = 0xFFF
-; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    # sched: [5:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_ret:
 ; BTVER2:       # %bb.0:
@@ -12287,20 +12287,20 @@ define void @test_rol_ror_8(i8 %a0, i8 %a1, i8 *%a2) optsize {
 ; BDVER2-LABEL: test_rol_ror_8:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    rolb %dil # sched: [2:1.00]
-; BDVER2-NEXT:    rorb %dil # sched: [2:1.00]
-; BDVER2-NEXT:    rolb (%rdx) # sched: [8:1.00]
-; BDVER2-NEXT:    rorb (%rdx) # sched: [8:1.00]
-; BDVER2-NEXT:    rolb $7, %dil # sched: [2:1.00]
-; BDVER2-NEXT:    rorb $7, %dil # sched: [2:1.00]
-; BDVER2-NEXT:    rolb $7, (%rdx) # sched: [8:1.00]
-; BDVER2-NEXT:    rorb $7, (%rdx) # sched: [8:1.00]
-; BDVER2-NEXT:    rolb %cl, %dil # sched: [3:1.50]
-; BDVER2-NEXT:    rorb %cl, %dil # sched: [3:1.50]
-; BDVER2-NEXT:    rolb %cl, (%rdx) # sched: [9:1.50]
-; BDVER2-NEXT:    rorb %cl, (%rdx) # sched: [9:1.50]
+; BDVER2-NEXT:    rolb %dil # sched: [1:0.50]
+; BDVER2-NEXT:    rorb %dil # sched: [1:0.50]
+; BDVER2-NEXT:    rolb (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rorb (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rolb $7, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    rorb $7, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    rolb $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rorb $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rolb %cl, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    rorb %cl, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    rolb %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rorb %cl, (%rdx) # sched: [5:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_rol_ror_8:
 ; BTVER2:       # %bb.0:
@@ -12488,20 +12488,20 @@ define void @test_rol_ror_16(i16 %a0, i16 %a1, i16 *%a2) optsize {
 ; BDVER2-LABEL: test_rol_ror_16:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    rolw %di # sched: [2:1.00]
-; BDVER2-NEXT:    rorw %di # sched: [2:1.00]
-; BDVER2-NEXT:    rolw (%rdx) # sched: [8:1.00]
-; BDVER2-NEXT:    rorw (%rdx) # sched: [8:1.00]
-; BDVER2-NEXT:    rolw $7, %di # sched: [2:1.00]
-; BDVER2-NEXT:    rorw $7, %di # sched: [2:1.00]
-; BDVER2-NEXT:    rolw $7, (%rdx) # sched: [8:1.00]
-; BDVER2-NEXT:    rorw $7, (%rdx) # sched: [8:1.00]
-; BDVER2-NEXT:    rolw %cl, %di # sched: [3:1.50]
-; BDVER2-NEXT:    rorw %cl, %di # sched: [3:1.50]
-; BDVER2-NEXT:    rolw %cl, (%rdx) # sched: [9:1.50]
-; BDVER2-NEXT:    rorw %cl, (%rdx) # sched: [9:1.50]
+; BDVER2-NEXT:    rolw %di # sched: [1:0.50]
+; BDVER2-NEXT:    rorw %di # sched: [1:0.50]
+; BDVER2-NEXT:    rolw (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rorw (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rolw $7, %di # sched: [1:0.50]
+; BDVER2-NEXT:    rorw $7, %di # sched: [1:0.50]
+; BDVER2-NEXT:    rolw $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rorw $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rolw %cl, %di # sched: [1:0.50]
+; BDVER2-NEXT:    rorw %cl, %di # sched: [1:0.50]
+; BDVER2-NEXT:    rolw %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rorw %cl, (%rdx) # sched: [5:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_rol_ror_16:
 ; BTVER2:       # %bb.0:
@@ -12689,20 +12689,20 @@ define void @test_rol_ror_32(i32 %a0, i32 %a1, i32 *%a2) optsize {
 ; BDVER2-LABEL: test_rol_ror_32:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    roll %edi # sched: [2:1.00]
-; BDVER2-NEXT:    rorl %edi # sched: [2:1.00]
-; BDVER2-NEXT:    roll (%rdx) # sched: [8:1.00]
-; BDVER2-NEXT:    rorl (%rdx) # sched: [8:1.00]
-; BDVER2-NEXT:    roll $7, %edi # sched: [2:1.00]
-; BDVER2-NEXT:    rorl $7, %edi # sched: [2:1.00]
-; BDVER2-NEXT:    roll $7, (%rdx) # sched: [8:1.00]
-; BDVER2-NEXT:    rorl $7, (%rdx) # sched: [8:1.00]
-; BDVER2-NEXT:    roll %cl, %edi # sched: [3:1.50]
-; BDVER2-NEXT:    rorl %cl, %edi # sched: [3:1.50]
-; BDVER2-NEXT:    roll %cl, (%rdx) # sched: [9:1.50]
-; BDVER2-NEXT:    rorl %cl, (%rdx) # sched: [9:1.50]
+; BDVER2-NEXT:    roll %edi # sched: [1:0.50]
+; BDVER2-NEXT:    rorl %edi # sched: [1:0.50]
+; BDVER2-NEXT:    roll (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rorl (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    roll $7, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    rorl $7, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    roll $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rorl $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    roll %cl, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    rorl %cl, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    roll %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rorl %cl, (%rdx) # sched: [5:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_rol_ror_32:
 ; BTVER2:       # %bb.0:
@@ -12890,20 +12890,20 @@ define void @test_rol_ror_64(i64 %a0, i64 %a1, i64 *%a2) optsize {
 ; BDVER2-LABEL: test_rol_ror_64:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    rolq %rdi # sched: [2:1.00]
-; BDVER2-NEXT:    rorq %rdi # sched: [2:1.00]
-; BDVER2-NEXT:    rolq (%rdx) # sched: [8:1.00]
-; BDVER2-NEXT:    rorq (%rdx) # sched: [8:1.00]
-; BDVER2-NEXT:    rolq $7, %rdi # sched: [2:1.00]
-; BDVER2-NEXT:    rorq $7, %rdi # sched: [2:1.00]
-; BDVER2-NEXT:    rolq $7, (%rdx) # sched: [8:1.00]
-; BDVER2-NEXT:    rorq $7, (%rdx) # sched: [8:1.00]
-; BDVER2-NEXT:    rolq %cl, %rdi # sched: [3:1.50]
-; BDVER2-NEXT:    rorq %cl, %rdi # sched: [3:1.50]
-; BDVER2-NEXT:    rolq %cl, (%rdx) # sched: [9:1.50]
-; BDVER2-NEXT:    rorq %cl, (%rdx) # sched: [9:1.50]
+; BDVER2-NEXT:    rolq %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    rorq %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    rolq (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rorq (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rolq $7, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    rorq $7, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    rolq $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rorq $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rolq %cl, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    rorq %cl, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    rolq %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rorq %cl, (%rdx) # sched: [5:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_rol_ror_64:
 ; BTVER2:       # %bb.0:
@@ -13143,23 +13143,23 @@ define void @test_sar_shl_shr_8(i8 %a0, i8 %a1, i8 *%a2) optsize {
 ; BDVER2-NEXT:    sarb %dil # sched: [1:0.50]
 ; BDVER2-NEXT:    shlb %dil # sched: [1:0.50]
 ; BDVER2-NEXT:    shrb %dil # sched: [1:0.50]
-; BDVER2-NEXT:    sarb (%rdx) # sched: [7:1.00]
-; BDVER2-NEXT:    shlb (%rdx) # sched: [7:1.00]
-; BDVER2-NEXT:    shrb (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    sarb (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shlb (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shrb (%rdx) # sched: [5:1.00]
 ; BDVER2-NEXT:    sarb $7, %dil # sched: [1:0.50]
 ; BDVER2-NEXT:    shlb $7, %dil # sched: [1:0.50]
 ; BDVER2-NEXT:    shrb $7, %dil # sched: [1:0.50]
-; BDVER2-NEXT:    sarb $7, (%rdx) # sched: [7:1.00]
-; BDVER2-NEXT:    shlb $7, (%rdx) # sched: [7:1.00]
-; BDVER2-NEXT:    shrb $7, (%rdx) # sched: [7:1.00]
-; BDVER2-NEXT:    sarb %cl, %dil # sched: [3:1.50]
-; BDVER2-NEXT:    shlb %cl, %dil # sched: [3:1.50]
-; BDVER2-NEXT:    shrb %cl, %dil # sched: [3:1.50]
-; BDVER2-NEXT:    sarb %cl, (%rdx) # sched: [9:1.50]
-; BDVER2-NEXT:    shlb %cl, (%rdx) # sched: [9:1.50]
-; BDVER2-NEXT:    shrb %cl, (%rdx) # sched: [9:1.50]
+; BDVER2-NEXT:    sarb $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shlb $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shrb $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    sarb %cl, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    shlb %cl, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    shrb %cl, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    sarb %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shlb %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shrb %cl, (%rdx) # sched: [5:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_sar_shl_shr_8:
 ; BTVER2:       # %bb.0:
@@ -13410,23 +13410,23 @@ define void @test_sar_shl_shr_16(i16 %a0, i16 %a1, i16 *%a2) optsize {
 ; BDVER2-NEXT:    sarw %di # sched: [1:0.50]
 ; BDVER2-NEXT:    shlw %di # sched: [1:0.50]
 ; BDVER2-NEXT:    shrw %di # sched: [1:0.50]
-; BDVER2-NEXT:    sarw (%rdx) # sched: [7:1.00]
-; BDVER2-NEXT:    shlw (%rdx) # sched: [7:1.00]
-; BDVER2-NEXT:    shrw (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    sarw (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shlw (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shrw (%rdx) # sched: [5:1.00]
 ; BDVER2-NEXT:    sarw $7, %di # sched: [1:0.50]
 ; BDVER2-NEXT:    shlw $7, %di # sched: [1:0.50]
 ; BDVER2-NEXT:    shrw $7, %di # sched: [1:0.50]
-; BDVER2-NEXT:    sarw $7, (%rdx) # sched: [7:1.00]
-; BDVER2-NEXT:    shlw $7, (%rdx) # sched: [7:1.00]
-; BDVER2-NEXT:    shrw $7, (%rdx) # sched: [7:1.00]
-; BDVER2-NEXT:    sarw %cl, %di # sched: [3:1.50]
-; BDVER2-NEXT:    shlw %cl, %di # sched: [3:1.50]
-; BDVER2-NEXT:    shrw %cl, %di # sched: [3:1.50]
-; BDVER2-NEXT:    sarw %cl, (%rdx) # sched: [9:1.50]
-; BDVER2-NEXT:    shlw %cl, (%rdx) # sched: [9:1.50]
-; BDVER2-NEXT:    shrw %cl, (%rdx) # sched: [9:1.50]
+; BDVER2-NEXT:    sarw $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shlw $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shrw $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    sarw %cl, %di # sched: [1:0.50]
+; BDVER2-NEXT:    shlw %cl, %di # sched: [1:0.50]
+; BDVER2-NEXT:    shrw %cl, %di # sched: [1:0.50]
+; BDVER2-NEXT:    sarw %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shlw %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shrw %cl, (%rdx) # sched: [5:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_sar_shl_shr_16:
 ; BTVER2:       # %bb.0:
@@ -13677,23 +13677,23 @@ define void @test_sar_shl_shr_32(i32 %a0, i32 %a1, i32 *%a2) optsize {
 ; BDVER2-NEXT:    sarl %edi # sched: [1:0.50]
 ; BDVER2-NEXT:    shll %edi # sched: [1:0.50]
 ; BDVER2-NEXT:    shrl %edi # sched: [1:0.50]
-; BDVER2-NEXT:    sarl (%rdx) # sched: [7:1.00]
-; BDVER2-NEXT:    shll (%rdx) # sched: [7:1.00]
-; BDVER2-NEXT:    shrl (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    sarl (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shll (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shrl (%rdx) # sched: [5:1.00]
 ; BDVER2-NEXT:    sarl $7, %edi # sched: [1:0.50]
 ; BDVER2-NEXT:    shll $7, %edi # sched: [1:0.50]
 ; BDVER2-NEXT:    shrl $7, %edi # sched: [1:0.50]
-; BDVER2-NEXT:    sarl $7, (%rdx) # sched: [7:1.00]
-; BDVER2-NEXT:    shll $7, (%rdx) # sched: [7:1.00]
-; BDVER2-NEXT:    shrl $7, (%rdx) # sched: [7:1.00]
-; BDVER2-NEXT:    sarl %cl, %edi # sched: [3:1.50]
-; BDVER2-NEXT:    shll %cl, %edi # sched: [3:1.50]
-; BDVER2-NEXT:    shrl %cl, %edi # sched: [3:1.50]
-; BDVER2-NEXT:    sarl %cl, (%rdx) # sched: [9:1.50]
-; BDVER2-NEXT:    shll %cl, (%rdx) # sched: [9:1.50]
-; BDVER2-NEXT:    shrl %cl, (%rdx) # sched: [9:1.50]
+; BDVER2-NEXT:    sarl $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shll $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shrl $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    sarl %cl, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    shll %cl, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    shrl %cl, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    sarl %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shll %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shrl %cl, (%rdx) # sched: [5:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_sar_shl_shr_32:
 ; BTVER2:       # %bb.0:
@@ -13944,23 +13944,23 @@ define void @test_sar_shl_shr_64(i64 %a0, i64 %a1, i64 *%a2) optsize {
 ; BDVER2-NEXT:    sarq %rdi # sched: [1:0.50]
 ; BDVER2-NEXT:    shlq %rdi # sched: [1:0.50]
 ; BDVER2-NEXT:    shrq %rdi # sched: [1:0.50]
-; BDVER2-NEXT:    sarq (%rdx) # sched: [7:1.00]
-; BDVER2-NEXT:    shlq (%rdx) # sched: [7:1.00]
-; BDVER2-NEXT:    shrq (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    sarq (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shlq (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shrq (%rdx) # sched: [5:1.00]
 ; BDVER2-NEXT:    sarq $7, %rdi # sched: [1:0.50]
 ; BDVER2-NEXT:    shlq $7, %rdi # sched: [1:0.50]
 ; BDVER2-NEXT:    shrq $7, %rdi # sched: [1:0.50]
-; BDVER2-NEXT:    sarq $7, (%rdx) # sched: [7:1.00]
-; BDVER2-NEXT:    shlq $7, (%rdx) # sched: [7:1.00]
-; BDVER2-NEXT:    shrq $7, (%rdx) # sched: [7:1.00]
-; BDVER2-NEXT:    sarq %cl, %rdi # sched: [3:1.50]
-; BDVER2-NEXT:    shlq %cl, %rdi # sched: [3:1.50]
-; BDVER2-NEXT:    shrq %cl, %rdi # sched: [3:1.50]
-; BDVER2-NEXT:    sarq %cl, (%rdx) # sched: [9:1.50]
-; BDVER2-NEXT:    shlq %cl, (%rdx) # sched: [9:1.50]
-; BDVER2-NEXT:    shrq %cl, (%rdx) # sched: [9:1.50]
+; BDVER2-NEXT:    sarq $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shlq $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shrq $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    sarq %cl, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    shlq %cl, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    shrq %cl, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    sarq %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shlq %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shrq %cl, (%rdx) # sched: [5:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_sar_shl_shr_64:
 ; BTVER2:       # %bb.0:
@@ -14113,14 +14113,14 @@ define void @test_sbb_8(i8 %a0, i8* %a1, i8 %a2) optsize {
 ; BDVER2-LABEL: test_sbb_8:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    sbbb $7, %al # sched: [2:0.67]
-; BDVER2-NEXT:    sbbb $7, %dil # sched: [2:0.67]
-; BDVER2-NEXT:    sbbb $7, (%rsi) # sched: [9:1.00]
-; BDVER2-NEXT:    sbbb %dl, %dil # sched: [2:0.67]
-; BDVER2-NEXT:    sbbb %dil, (%rsi) # sched: [9:1.00]
-; BDVER2-NEXT:    sbbb (%rsi), %dil # sched: [7:0.67]
+; BDVER2-NEXT:    sbbb $7, %al # sched: [1:1.00]
+; BDVER2-NEXT:    sbbb $7, %dil # sched: [1:1.00]
+; BDVER2-NEXT:    sbbb $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    sbbb %dl, %dil # sched: [1:1.00]
+; BDVER2-NEXT:    sbbb %dil, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    sbbb (%rsi), %dil # sched: [5:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_sbb_8:
 ; BTVER2:       # %bb.0:
@@ -14289,18 +14289,18 @@ define void @test_sbb_16(i16 %a0, i16* %a1, i16 %a2) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    sbbw $511, %ax # imm = 0x1FF
-; BDVER2-NEXT:    # sched: [2:0.67]
+; BDVER2-NEXT:    # sched: [1:1.00]
 ; BDVER2-NEXT:    sbbw $511, %di # imm = 0x1FF
-; BDVER2-NEXT:    # sched: [2:0.67]
+; BDVER2-NEXT:    # sched: [1:1.00]
 ; BDVER2-NEXT:    sbbw $511, (%rsi) # imm = 0x1FF
-; BDVER2-NEXT:    # sched: [9:1.00]
-; BDVER2-NEXT:    sbbw $7, %di # sched: [2:0.67]
-; BDVER2-NEXT:    sbbw $7, (%rsi) # sched: [9:1.00]
-; BDVER2-NEXT:    sbbw %dx, %di # sched: [2:0.67]
-; BDVER2-NEXT:    sbbw %di, (%rsi) # sched: [9:1.00]
-; BDVER2-NEXT:    sbbw (%rsi), %di # sched: [7:0.67]
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    sbbw $7, %di # sched: [1:1.00]
+; BDVER2-NEXT:    sbbw $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    sbbw %dx, %di # sched: [1:1.00]
+; BDVER2-NEXT:    sbbw %di, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    sbbw (%rsi), %di # sched: [5:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_sbb_16:
 ; BTVER2:       # %bb.0:
@@ -14479,18 +14479,18 @@ define void @test_sbb_32(i32 %a0, i32* %a1, i32 %a2) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    sbbl $665536, %eax # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [2:0.67]
+; BDVER2-NEXT:    # sched: [1:1.00]
 ; BDVER2-NEXT:    sbbl $665536, %edi # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [2:0.67]
+; BDVER2-NEXT:    # sched: [1:1.00]
 ; BDVER2-NEXT:    sbbl $665536, (%rsi) # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [9:1.00]
-; BDVER2-NEXT:    sbbl $7, %edi # sched: [2:0.67]
-; BDVER2-NEXT:    sbbl $7, (%rsi) # sched: [9:1.00]
-; BDVER2-NEXT:    sbbl %edx, %edi # sched: [2:0.67]
-; BDVER2-NEXT:    sbbl %edi, (%rsi) # sched: [9:1.00]
-; BDVER2-NEXT:    sbbl (%rsi), %edi # sched: [7:0.67]
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    sbbl $7, %edi # sched: [1:1.00]
+; BDVER2-NEXT:    sbbl $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    sbbl %edx, %edi # sched: [1:1.00]
+; BDVER2-NEXT:    sbbl %edi, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    sbbl (%rsi), %edi # sched: [5:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_sbb_32:
 ; BTVER2:       # %bb.0:
@@ -14669,18 +14669,18 @@ define void @test_sbb_64(i64 %a0, i64* %a1, i64 %a2) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    sbbq $665536, %rax # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [2:0.67]
+; BDVER2-NEXT:    # sched: [1:1.00]
 ; BDVER2-NEXT:    sbbq $665536, %rdi # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [2:0.67]
+; BDVER2-NEXT:    # sched: [1:1.00]
 ; BDVER2-NEXT:    sbbq $665536, (%rsi) # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [9:1.00]
-; BDVER2-NEXT:    sbbq $7, %rdi # sched: [2:0.67]
-; BDVER2-NEXT:    sbbq $7, (%rsi) # sched: [9:1.00]
-; BDVER2-NEXT:    sbbq %rdx, %rdi # sched: [2:0.67]
-; BDVER2-NEXT:    sbbq %rdi, (%rsi) # sched: [9:1.00]
-; BDVER2-NEXT:    sbbq (%rsi), %rdi # sched: [7:0.67]
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    sbbq $7, %rdi # sched: [1:1.00]
+; BDVER2-NEXT:    sbbq $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    sbbq %rdx, %rdi # sched: [1:1.00]
+; BDVER2-NEXT:    sbbq %rdi, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    sbbq (%rsi), %rdi # sched: [5:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_sbb_64:
 ; BTVER2:       # %bb.0:
@@ -14803,12 +14803,12 @@ define void @test_scas() optsize {
 ; BDVER2-LABEL: test_scas:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    scasb %es:(%rdi), %al # sched: [2:0.67]
-; BDVER2-NEXT:    scasw %es:(%rdi), %ax # sched: [2:0.67]
-; BDVER2-NEXT:    scasl %es:(%rdi), %eax # sched: [2:0.67]
-; BDVER2-NEXT:    scasq %es:(%rdi), %rax # sched: [2:0.67]
+; BDVER2-NEXT:    scasb %es:(%rdi), %al # sched: [100:0.50]
+; BDVER2-NEXT:    scasw %es:(%rdi), %ax # sched: [100:0.50]
+; BDVER2-NEXT:    scasl %es:(%rdi), %eax # sched: [100:0.50]
+; BDVER2-NEXT:    scasq %es:(%rdi), %rax # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_scas:
 ; BTVER2:       # %bb.0:
@@ -15147,8 +15147,8 @@ define void @test_setcc(i8 %a0, i8 *%a1) optsize {
 ; BDVER2-NEXT:    setae %dil # sched: [1:0.50]
 ; BDVER2-NEXT:    sete %dil # sched: [1:0.50]
 ; BDVER2-NEXT:    setne %dil # sched: [1:0.50]
-; BDVER2-NEXT:    setbe %dil # sched: [2:1.00]
-; BDVER2-NEXT:    seta %dil # sched: [2:1.00]
+; BDVER2-NEXT:    setbe %dil # sched: [1:0.50]
+; BDVER2-NEXT:    seta %dil # sched: [1:0.50]
 ; BDVER2-NEXT:    sets %dil # sched: [1:0.50]
 ; BDVER2-NEXT:    setns %dil # sched: [1:0.50]
 ; BDVER2-NEXT:    setp %dil # sched: [1:0.50]
@@ -15157,24 +15157,24 @@ define void @test_setcc(i8 %a0, i8 *%a1) optsize {
 ; BDVER2-NEXT:    setge %dil # sched: [1:0.50]
 ; BDVER2-NEXT:    setle %dil # sched: [1:0.50]
 ; BDVER2-NEXT:    setg %dil # sched: [1:0.50]
-; BDVER2-NEXT:    seto (%rsi) # sched: [2:1.00]
-; BDVER2-NEXT:    setno (%rsi) # sched: [2:1.00]
-; BDVER2-NEXT:    setb (%rsi) # sched: [2:1.00]
-; BDVER2-NEXT:    setae (%rsi) # sched: [2:1.00]
-; BDVER2-NEXT:    sete (%rsi) # sched: [2:1.00]
-; BDVER2-NEXT:    setne (%rsi) # sched: [2:1.00]
-; BDVER2-NEXT:    setbe (%rsi) # sched: [3:1.00]
-; BDVER2-NEXT:    seta (%rsi) # sched: [3:1.00]
-; BDVER2-NEXT:    sets (%rsi) # sched: [2:1.00]
-; BDVER2-NEXT:    setns (%rsi) # sched: [2:1.00]
-; BDVER2-NEXT:    setp (%rsi) # sched: [2:1.00]
-; BDVER2-NEXT:    setnp (%rsi) # sched: [2:1.00]
-; BDVER2-NEXT:    setl (%rsi) # sched: [2:1.00]
-; BDVER2-NEXT:    setge (%rsi) # sched: [2:1.00]
-; BDVER2-NEXT:    setle (%rsi) # sched: [2:1.00]
-; BDVER2-NEXT:    setg (%rsi) # sched: [2:1.00]
+; BDVER2-NEXT:    seto (%rsi) # sched: [1:0.50]
+; BDVER2-NEXT:    setno (%rsi) # sched: [1:0.50]
+; BDVER2-NEXT:    setb (%rsi) # sched: [1:0.50]
+; BDVER2-NEXT:    setae (%rsi) # sched: [1:0.50]
+; BDVER2-NEXT:    sete (%rsi) # sched: [1:0.50]
+; BDVER2-NEXT:    setne (%rsi) # sched: [1:0.50]
+; BDVER2-NEXT:    setbe (%rsi) # sched: [1:0.50]
+; BDVER2-NEXT:    seta (%rsi) # sched: [1:0.50]
+; BDVER2-NEXT:    sets (%rsi) # sched: [1:0.50]
+; BDVER2-NEXT:    setns (%rsi) # sched: [1:0.50]
+; BDVER2-NEXT:    setp (%rsi) # sched: [1:0.50]
+; BDVER2-NEXT:    setnp (%rsi) # sched: [1:0.50]
+; BDVER2-NEXT:    setl (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    setge (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    setle (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    setg (%rsi) # sched: [1:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_setcc:
 ; BTVER2:       # %bb.0:
@@ -15373,16 +15373,16 @@ define void @test_shld_shrd_16(i16 %a0, i16 %a1, i16 *%a2) optsize {
 ; BDVER2-LABEL: test_shld_shrd_16:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    shldw %cl, %si, %di # sched: [4:1.50]
-; BDVER2-NEXT:    shrdw %cl, %si, %di # sched: [4:1.50]
-; BDVER2-NEXT:    shldw %cl, %si, (%rdx) # sched: [10:1.50]
-; BDVER2-NEXT:    shrdw %cl, %si, (%rdx) # sched: [10:1.50]
-; BDVER2-NEXT:    shldw $7, %si, %di # sched: [2:0.67]
-; BDVER2-NEXT:    shrdw $7, %si, %di # sched: [2:0.67]
-; BDVER2-NEXT:    shldw $7, %si, (%rdx) # sched: [8:1.00]
-; BDVER2-NEXT:    shrdw $7, %si, (%rdx) # sched: [8:1.00]
+; BDVER2-NEXT:    shldw %cl, %si, %di # sched: [4:4.00]
+; BDVER2-NEXT:    shrdw %cl, %si, %di # sched: [4:4.00]
+; BDVER2-NEXT:    shldw %cl, %si, (%rdx) # sched: [4:11.00]
+; BDVER2-NEXT:    shrdw %cl, %si, (%rdx) # sched: [4:11.00]
+; BDVER2-NEXT:    shldw $7, %si, %di # sched: [4:3.00]
+; BDVER2-NEXT:    shrdw $7, %si, %di # sched: [3:3.00]
+; BDVER2-NEXT:    shldw $7, %si, (%rdx) # sched: [4:11.00]
+; BDVER2-NEXT:    shrdw $7, %si, (%rdx) # sched: [4:11.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_shld_shrd_16:
 ; BTVER2:       # %bb.0:
@@ -15530,16 +15530,16 @@ define void @test_shld_shrd_32(i32 %a0, i32 %a1, i32 *%a2) optsize {
 ; BDVER2-LABEL: test_shld_shrd_32:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    shldl %cl, %esi, %edi # sched: [4:1.50]
-; BDVER2-NEXT:    shrdl %cl, %esi, %edi # sched: [4:1.50]
-; BDVER2-NEXT:    shldl %cl, %esi, (%rdx) # sched: [10:1.50]
-; BDVER2-NEXT:    shrdl %cl, %esi, (%rdx) # sched: [10:1.50]
-; BDVER2-NEXT:    shldl $7, %esi, %edi # sched: [2:0.67]
-; BDVER2-NEXT:    shrdl $7, %esi, %edi # sched: [2:0.67]
-; BDVER2-NEXT:    shldl $7, %esi, (%rdx) # sched: [8:1.00]
-; BDVER2-NEXT:    shrdl $7, %esi, (%rdx) # sched: [8:1.00]
+; BDVER2-NEXT:    shldl %cl, %esi, %edi # sched: [4:4.00]
+; BDVER2-NEXT:    shrdl %cl, %esi, %edi # sched: [4:4.00]
+; BDVER2-NEXT:    shldl %cl, %esi, (%rdx) # sched: [4:11.00]
+; BDVER2-NEXT:    shrdl %cl, %esi, (%rdx) # sched: [4:11.00]
+; BDVER2-NEXT:    shldl $7, %esi, %edi # sched: [3:3.00]
+; BDVER2-NEXT:    shrdl $7, %esi, %edi # sched: [4:3.00]
+; BDVER2-NEXT:    shldl $7, %esi, (%rdx) # sched: [4:11.00]
+; BDVER2-NEXT:    shrdl $7, %esi, (%rdx) # sched: [4:11.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_shld_shrd_32:
 ; BTVER2:       # %bb.0:
@@ -15687,16 +15687,16 @@ define void @test_shld_shrd_64(i64 %a0, i64 %a1, i64 *%a2) optsize {
 ; BDVER2-LABEL: test_shld_shrd_64:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    shldq %cl, %rsi, %rdi # sched: [4:1.50]
-; BDVER2-NEXT:    shrdq %cl, %rsi, %rdi # sched: [4:1.50]
-; BDVER2-NEXT:    shldq %cl, %rsi, (%rdx) # sched: [10:1.50]
-; BDVER2-NEXT:    shrdq %cl, %rsi, (%rdx) # sched: [10:1.50]
-; BDVER2-NEXT:    shldq $7, %rsi, %rdi # sched: [2:0.67]
-; BDVER2-NEXT:    shrdq $7, %rsi, %rdi # sched: [2:0.67]
-; BDVER2-NEXT:    shldq $7, %rsi, (%rdx) # sched: [8:1.00]
-; BDVER2-NEXT:    shrdq $7, %rsi, (%rdx) # sched: [8:1.00]
+; BDVER2-NEXT:    shldq %cl, %rsi, %rdi # sched: [4:4.00]
+; BDVER2-NEXT:    shrdq %cl, %rsi, %rdi # sched: [4:4.00]
+; BDVER2-NEXT:    shldq %cl, %rsi, (%rdx) # sched: [4:11.00]
+; BDVER2-NEXT:    shrdq %cl, %rsi, (%rdx) # sched: [4:11.00]
+; BDVER2-NEXT:    shldq $7, %rsi, %rdi # sched: [4:3.00]
+; BDVER2-NEXT:    shrdq $7, %rsi, %rdi # sched: [4:3.00]
+; BDVER2-NEXT:    shldq $7, %rsi, (%rdx) # sched: [4:11.00]
+; BDVER2-NEXT:    shrdq $7, %rsi, (%rdx) # sched: [4:11.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_shld_shrd_64:
 ; BTVER2:       # %bb.0:
@@ -15801,10 +15801,10 @@ define void @test_stc_std() optsize {
 ; BDVER2-LABEL: test_stc_std:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    stc # sched: [1:0.33]
-; BDVER2-NEXT:    std # sched: [1:0.33]
+; BDVER2-NEXT:    stc # sched: [1:0.50]
+; BDVER2-NEXT:    std # sched: [1:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_stc_std:
 ; BTVER2:       # %bb.0:
@@ -15912,12 +15912,12 @@ define void @test_stos() optsize {
 ; BDVER2-LABEL: test_stos:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    stosb %al, %es:(%rdi) # sched: [5:1.00]
-; BDVER2-NEXT:    stosw %ax, %es:(%rdi) # sched: [5:1.00]
-; BDVER2-NEXT:    stosl %eax, %es:(%rdi) # sched: [5:1.00]
-; BDVER2-NEXT:    stosq %rax, %es:(%rdi) # sched: [5:1.00]
+; BDVER2-NEXT:    stosb %al, %es:(%rdi) # sched: [100:0.50]
+; BDVER2-NEXT:    stosw %ax, %es:(%rdi) # sched: [100:0.50]
+; BDVER2-NEXT:    stosl %eax, %es:(%rdi) # sched: [100:0.50]
+; BDVER2-NEXT:    stosq %rax, %es:(%rdi) # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_stos:
 ; BTVER2:       # %bb.0:
@@ -16044,14 +16044,14 @@ define void @test_sub_8(i8 %a0, i8* %a1, i8 %a2) optsize {
 ; BDVER2-LABEL: test_sub_8:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    subb $7, %al # sched: [1:0.33]
-; BDVER2-NEXT:    subb $7, %dil # sched: [1:0.33]
-; BDVER2-NEXT:    subb $7, (%rsi) # sched: [7:1.00]
-; BDVER2-NEXT:    subb %dl, %dil # sched: [1:0.33]
-; BDVER2-NEXT:    subb %dil, (%rsi) # sched: [7:1.00]
-; BDVER2-NEXT:    subb (%rsi), %dil # sched: [6:0.50]
+; BDVER2-NEXT:    subb $7, %al # sched: [1:0.50]
+; BDVER2-NEXT:    subb $7, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    subb $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    subb %dl, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    subb %dil, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    subb (%rsi), %dil # sched: [5:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_sub_8:
 ; BTVER2:       # %bb.0:
@@ -16220,18 +16220,18 @@ define void @test_sub_16(i16 %a0, i16* %a1, i16 %a2) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    subw $511, %ax # imm = 0x1FF
-; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    # sched: [1:0.50]
 ; BDVER2-NEXT:    subw $511, %di # imm = 0x1FF
-; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    # sched: [1:0.50]
 ; BDVER2-NEXT:    subw $511, (%rsi) # imm = 0x1FF
-; BDVER2-NEXT:    # sched: [7:1.00]
-; BDVER2-NEXT:    subw $7, %di # sched: [1:0.33]
-; BDVER2-NEXT:    subw $7, (%rsi) # sched: [7:1.00]
-; BDVER2-NEXT:    subw %dx, %di # sched: [1:0.33]
-; BDVER2-NEXT:    subw %di, (%rsi) # sched: [7:1.00]
-; BDVER2-NEXT:    subw (%rsi), %di # sched: [6:0.50]
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    subw $7, %di # sched: [1:0.50]
+; BDVER2-NEXT:    subw $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    subw %dx, %di # sched: [1:0.50]
+; BDVER2-NEXT:    subw %di, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    subw (%rsi), %di # sched: [5:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_sub_16:
 ; BTVER2:       # %bb.0:
@@ -16410,18 +16410,18 @@ define void @test_sub_32(i32 %a0, i32* %a1, i32 %a2) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    subl $665536, %eax # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    # sched: [1:0.50]
 ; BDVER2-NEXT:    subl $665536, %edi # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    # sched: [1:0.50]
 ; BDVER2-NEXT:    subl $665536, (%rsi) # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [7:1.00]
-; BDVER2-NEXT:    subl $7, %edi # sched: [1:0.33]
-; BDVER2-NEXT:    subl $7, (%rsi) # sched: [7:1.00]
-; BDVER2-NEXT:    subl %edx, %edi # sched: [1:0.33]
-; BDVER2-NEXT:    subl %edi, (%rsi) # sched: [7:1.00]
-; BDVER2-NEXT:    subl (%rsi), %edi # sched: [6:0.50]
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    subl $7, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    subl $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    subl %edx, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    subl %edi, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    subl (%rsi), %edi # sched: [5:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_sub_32:
 ; BTVER2:       # %bb.0:
@@ -16600,18 +16600,18 @@ define void @test_sub_64(i64 %a0, i64* %a1, i64 %a2) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    subq $665536, %rax # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    # sched: [1:0.50]
 ; BDVER2-NEXT:    subq $665536, %rdi # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    # sched: [1:0.50]
 ; BDVER2-NEXT:    subq $665536, (%rsi) # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [7:1.00]
-; BDVER2-NEXT:    subq $7, %rdi # sched: [1:0.33]
-; BDVER2-NEXT:    subq $7, (%rsi) # sched: [7:1.00]
-; BDVER2-NEXT:    subq %rdx, %rdi # sched: [1:0.33]
-; BDVER2-NEXT:    subq %rdi, (%rsi) # sched: [7:1.00]
-; BDVER2-NEXT:    subq (%rsi), %rdi # sched: [6:0.50]
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    subq $7, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    subq $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    subq %rdx, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    subq %rdi, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    subq (%rsi), %rdi # sched: [5:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_sub_64:
 ; BTVER2:       # %bb.0:
@@ -16748,13 +16748,13 @@ define void @test_test_8(i8 %a0, i8* %a1) optsize {
 ; BDVER2-LABEL: test_test_8:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    testb $7, %al # sched: [1:0.33]
-; BDVER2-NEXT:    testb $7, %dil # sched: [1:0.33]
-; BDVER2-NEXT:    testb $7, (%rsi) # sched: [6:0.50]
-; BDVER2-NEXT:    testb %dil, %dil # sched: [1:0.33]
-; BDVER2-NEXT:    testb %dil, (%rsi) # sched: [6:0.50]
+; BDVER2-NEXT:    testb $7, %al # sched: [1:0.50]
+; BDVER2-NEXT:    testb $7, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    testb $7, (%rsi) # sched: [5:0.50]
+; BDVER2-NEXT:    testb %dil, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    testb %dil, (%rsi) # sched: [5:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_test_8:
 ; BTVER2:       # %bb.0:
@@ -16897,15 +16897,15 @@ define void @test_test_16(i16 %a0, i16* %a1) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    testw $511, %ax # imm = 0x1FF
-; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    # sched: [1:0.50]
 ; BDVER2-NEXT:    testw $511, %di # imm = 0x1FF
-; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    # sched: [1:0.50]
 ; BDVER2-NEXT:    testw $511, (%rsi) # imm = 0x1FF
-; BDVER2-NEXT:    # sched: [6:0.50]
-; BDVER2-NEXT:    testw %di, %di # sched: [1:0.33]
-; BDVER2-NEXT:    testw %di, (%rsi) # sched: [6:0.50]
+; BDVER2-NEXT:    # sched: [5:0.50]
+; BDVER2-NEXT:    testw %di, %di # sched: [1:0.50]
+; BDVER2-NEXT:    testw %di, (%rsi) # sched: [5:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_test_16:
 ; BTVER2:       # %bb.0:
@@ -17054,15 +17054,15 @@ define void @test_test_32(i32 %a0, i32* %a1) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    testl $665536, %eax # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    # sched: [1:0.50]
 ; BDVER2-NEXT:    testl $665536, %edi # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    # sched: [1:0.50]
 ; BDVER2-NEXT:    testl $665536, (%rsi) # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [6:0.50]
-; BDVER2-NEXT:    testl %edi, %edi # sched: [1:0.33]
-; BDVER2-NEXT:    testl %edi, (%rsi) # sched: [6:0.50]
+; BDVER2-NEXT:    # sched: [5:0.50]
+; BDVER2-NEXT:    testl %edi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    testl %edi, (%rsi) # sched: [5:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_test_32:
 ; BTVER2:       # %bb.0:
@@ -17211,15 +17211,15 @@ define void @test_test_64(i64 %a0, i64* %a1) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    testq $665536, %rax # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    # sched: [1:0.50]
 ; BDVER2-NEXT:    testq $665536, %rdi # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    # sched: [1:0.50]
 ; BDVER2-NEXT:    testq $665536, (%rsi) # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [6:0.50]
-; BDVER2-NEXT:    testq %rdi, %rdi # sched: [1:0.33]
-; BDVER2-NEXT:    testq %rdi, (%rsi) # sched: [6:0.50]
+; BDVER2-NEXT:    # sched: [5:0.50]
+; BDVER2-NEXT:    testq %rdi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    testq %rdi, (%rsi) # sched: [5:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_test_64:
 ; BTVER2:       # %bb.0:
@@ -17313,9 +17313,9 @@ define void @test_ud2() optsize {
 ; BDVER2-LABEL: test_ud2:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    ud2 # sched: [100:0.33]
+; BDVER2-NEXT:    ud2 # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_ud2:
 ; BTVER2:       # %bb.0:
@@ -17411,9 +17411,9 @@ define void @test_xadd_8(i8 %a0, i8 %a1, i8 *%a2) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    xaddb %dil, %sil # sched: [2:1.00]
-; BDVER2-NEXT:    xaddb %dil, (%rdx) # sched: [8:1.00]
+; BDVER2-NEXT:    xaddb %dil, (%rdx) # sched: [6:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_xadd_8:
 ; BTVER2:       # %bb.0:
@@ -17502,9 +17502,9 @@ define void @test_xadd_16(i16 %a0, i16 %a1, i16 *%a2) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    xaddw %di, %si # sched: [2:1.00]
-; BDVER2-NEXT:    xaddw %di, (%rdx) # sched: [8:1.00]
+; BDVER2-NEXT:    xaddw %di, (%rdx) # sched: [6:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_xadd_16:
 ; BTVER2:       # %bb.0:
@@ -17593,9 +17593,9 @@ define void @test_xadd_32(i32 %a0, i32 %a1, i32 *%a2) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    xaddl %edi, %esi # sched: [2:1.00]
-; BDVER2-NEXT:    xaddl %edi, (%rdx) # sched: [8:1.00]
+; BDVER2-NEXT:    xaddl %edi, (%rdx) # sched: [6:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_xadd_32:
 ; BTVER2:       # %bb.0:
@@ -17684,9 +17684,9 @@ define void @test_xadd_64(i64 %a0, i64 %a1, i64 *%a2) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    xaddq %rdi, %rsi # sched: [2:1.00]
-; BDVER2-NEXT:    xaddq %rdi, (%rdx) # sched: [8:1.00]
+; BDVER2-NEXT:    xaddq %rdi, (%rdx) # sched: [6:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_xadd_64:
 ; BTVER2:       # %bb.0:
@@ -17775,10 +17775,10 @@ define void @test_xchg_8(i8 %a0, i8 %a1, i8 *%a2) optsize {
 ; BDVER2-LABEL: test_xchg_8:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    xchgb %sil, %dil # sched: [2:1.00]
-; BDVER2-NEXT:    xchgb %dil, (%rdx) # sched: [6:1.00]
+; BDVER2-NEXT:    xchgb %sil, %dil # sched: [1:1.00]
+; BDVER2-NEXT:    xchgb %dil, (%rdx) # sched: [5:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_xchg_8:
 ; BTVER2:       # %bb.0:
@@ -17874,11 +17874,11 @@ define void @test_xchg_16(i16 %a0, i16 %a1, i16 *%a2) optsize {
 ; BDVER2-LABEL: test_xchg_16:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    xchgw %di, %ax # sched: [2:1.00]
+; BDVER2-NEXT:    xchgw %di, %ax # sched: [1:1.00]
 ; BDVER2-NEXT:    xchgw %si, %di # sched: [2:1.00]
-; BDVER2-NEXT:    xchgw %di, (%rdx) # sched: [6:1.00]
+; BDVER2-NEXT:    xchgw %di, (%rdx) # sched: [5:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_xchg_16:
 ; BTVER2:       # %bb.0:
@@ -17976,11 +17976,11 @@ define void @test_xchg_32(i32 %a0, i32 %a1, i32 *%a2) optsize {
 ; BDVER2-LABEL: test_xchg_32:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    xchgl %edi, %eax # sched: [2:1.00]
-; BDVER2-NEXT:    xchgl %esi, %edi # sched: [2:1.00]
-; BDVER2-NEXT:    xchgl %edi, (%rdx) # sched: [6:1.00]
+; BDVER2-NEXT:    xchgl %edi, %eax # sched: [1:1.00]
+; BDVER2-NEXT:    xchgl %esi, %edi # sched: [1:1.00]
+; BDVER2-NEXT:    xchgl %edi, (%rdx) # sched: [5:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_xchg_32:
 ; BTVER2:       # %bb.0:
@@ -18078,11 +18078,11 @@ define void @test_xchg_64(i64 %a0, i64 %a1, i64 *%a2) optsize {
 ; BDVER2-LABEL: test_xchg_64:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    xchgq %rdi, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    xchgq %rsi, %rdi # sched: [2:1.00]
-; BDVER2-NEXT:    xchgq %rdi, (%rdx) # sched: [6:1.00]
+; BDVER2-NEXT:    xchgq %rdi, %rax # sched: [1:1.00]
+; BDVER2-NEXT:    xchgq %rsi, %rdi # sched: [1:1.00]
+; BDVER2-NEXT:    xchgq %rdi, (%rdx) # sched: [5:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_xchg_64:
 ; BTVER2:       # %bb.0:
@@ -18165,9 +18165,9 @@ define void @test_xlat() optsize {
 ; BDVER2-LABEL: test_xlat:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    xlatb # sched: [5:0.50]
+; BDVER2-NEXT:    xlatb # sched: [6:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_xlat:
 ; BTVER2:       # %bb.0:
@@ -18286,14 +18286,14 @@ define void @test_xor_8(i8 %a0, i8* %a1, i8 %a2) optsize {
 ; BDVER2-LABEL: test_xor_8:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    xorb $7, %al # sched: [1:0.33]
-; BDVER2-NEXT:    xorb $7, %dil # sched: [1:0.33]
-; BDVER2-NEXT:    xorb $7, (%rsi) # sched: [7:1.00]
-; BDVER2-NEXT:    xorb %dl, %dil # sched: [1:0.33]
-; BDVER2-NEXT:    xorb %dil, (%rsi) # sched: [7:1.00]
-; BDVER2-NEXT:    xorb (%rsi), %dil # sched: [6:0.50]
+; BDVER2-NEXT:    xorb $7, %al # sched: [1:0.50]
+; BDVER2-NEXT:    xorb $7, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    xorb $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    xorb %dl, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    xorb %dil, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    xorb (%rsi), %dil # sched: [5:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_xor_8:
 ; BTVER2:       # %bb.0:
@@ -18462,18 +18462,18 @@ define void @test_xor_16(i16 %a0, i16* %a1, i16 %a2) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    xorw $511, %ax # imm = 0x1FF
-; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    # sched: [1:0.50]
 ; BDVER2-NEXT:    xorw $511, %di # imm = 0x1FF
-; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    # sched: [1:0.50]
 ; BDVER2-NEXT:    xorw $511, (%rsi) # imm = 0x1FF
-; BDVER2-NEXT:    # sched: [7:1.00]
-; BDVER2-NEXT:    xorw $7, %di # sched: [1:0.33]
-; BDVER2-NEXT:    xorw $7, (%rsi) # sched: [7:1.00]
-; BDVER2-NEXT:    xorw %dx, %di # sched: [1:0.33]
-; BDVER2-NEXT:    xorw %di, (%rsi) # sched: [7:1.00]
-; BDVER2-NEXT:    xorw (%rsi), %di # sched: [6:0.50]
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    xorw $7, %di # sched: [1:0.50]
+; BDVER2-NEXT:    xorw $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    xorw %dx, %di # sched: [1:0.50]
+; BDVER2-NEXT:    xorw %di, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    xorw (%rsi), %di # sched: [5:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_xor_16:
 ; BTVER2:       # %bb.0:
@@ -18652,18 +18652,18 @@ define void @test_xor_32(i32 %a0, i32* %a1, i32 %a2) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    xorl $665536, %eax # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    # sched: [1:0.50]
 ; BDVER2-NEXT:    xorl $665536, %edi # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    # sched: [1:0.50]
 ; BDVER2-NEXT:    xorl $665536, (%rsi) # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [7:1.00]
-; BDVER2-NEXT:    xorl $7, %edi # sched: [1:0.33]
-; BDVER2-NEXT:    xorl $7, (%rsi) # sched: [7:1.00]
-; BDVER2-NEXT:    xorl %edx, %edi # sched: [1:0.33]
-; BDVER2-NEXT:    xorl %edi, (%rsi) # sched: [7:1.00]
-; BDVER2-NEXT:    xorl (%rsi), %edi # sched: [6:0.50]
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    xorl $7, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    xorl $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    xorl %edx, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    xorl %edi, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    xorl (%rsi), %edi # sched: [5:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_xor_32:
 ; BTVER2:       # %bb.0:
@@ -18842,18 +18842,18 @@ define void @test_xor_64(i64 %a0, i64* %a1, i64 %a2) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    xorq $665536, %rax # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    # sched: [1:0.50]
 ; BDVER2-NEXT:    xorq $665536, %rdi # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [1:0.33]
+; BDVER2-NEXT:    # sched: [1:0.50]
 ; BDVER2-NEXT:    xorq $665536, (%rsi) # imm = 0xA27C0
-; BDVER2-NEXT:    # sched: [7:1.00]
-; BDVER2-NEXT:    xorq $7, %rdi # sched: [1:0.33]
-; BDVER2-NEXT:    xorq $7, (%rsi) # sched: [7:1.00]
-; BDVER2-NEXT:    xorq %rdx, %rdi # sched: [1:0.33]
-; BDVER2-NEXT:    xorq %rdi, (%rsi) # sched: [7:1.00]
-; BDVER2-NEXT:    xorq (%rsi), %rdi # sched: [6:0.50]
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    xorq $7, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    xorq $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    xorq %rdx, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    xorq %rdi, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    xorq (%rsi), %rdi # sched: [5:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_xor_64:
 ; BTVER2:       # %bb.0:
diff --git a/test/CodeGen/X86/small-byval-memcpy.ll b/test/CodeGen/X86/small-byval-memcpy.ll
index b8c38d9f396..12498123e3a 100644
--- a/test/CodeGen/X86/small-byval-memcpy.ll
+++ b/test/CodeGen/X86/small-byval-memcpy.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | FileCheck %s --check-prefix=CORE2
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=nehalem | FileCheck %s --check-prefix=NEHALEM
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=x86-64 | FileCheck %s --check-prefix=BDVER2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=bdver2 | FileCheck %s --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2
 
 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1)
diff --git a/test/CodeGen/X86/sse-schedule.ll b/test/CodeGen/X86/sse-schedule.ll
index d36546e8799..aca2ec8c5f5 100644
--- a/test/CodeGen/X86/sse-schedule.ll
+++ b/test/CodeGen/X86/sse-schedule.ll
@@ -14,8 +14,8 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-sse3 | FileCheck %s --check-prefixes=CHECK,SKX-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,SKX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=-sse3 | FileCheck %s --check-prefixes=CHECK,BDVER2-SSE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+avx -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,BDVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=-sse3 | FileCheck %s --check-prefixes=CHECK,BDVER2-SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-sse3 | FileCheck %s --check-prefixes=CHECK,BTVER2-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 -mattr=-sse3 | FileCheck %s --check-prefixes=CHECK,ZNVER1-SSE
@@ -104,15 +104,15 @@ define <4 x float> @test_addps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
 ;
 ; BDVER2-SSE-LABEL: test_addps:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    addps (%rdi), %xmm0 # sched: [9:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    addps (%rdi), %xmm0 # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_addps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vaddps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vaddps (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_addps:
 ; BTVER2-SSE:       # %bb.0:
@@ -224,15 +224,15 @@ define float @test_addss(float %a0, float %a1, float *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_addss:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    addss %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    addss (%rdi), %xmm0 # sched: [9:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    addss %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    addss (%rdi), %xmm0 # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_addss:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vaddss (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vaddss (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_addss:
 ; BTVER2-SSE:       # %bb.0:
@@ -348,15 +348,15 @@ define <4 x float> @test_andps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
 ;
 ; BDVER2-SSE-LABEL: test_andps:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    andps %xmm1, %xmm0 # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    andps (%rdi), %xmm0 # sched: [7:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    andps %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    andps (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_andps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vandps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; BDVER2-NEXT:    vandps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vandps %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vandps (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_andps:
 ; BTVER2-SSE:       # %bb.0:
@@ -476,15 +476,15 @@ define <4 x float> @test_andnotps(<4 x float> %a0, <4 x float> %a1, <4 x float>
 ;
 ; BDVER2-SSE-LABEL: test_andnotps:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    andnps %xmm1, %xmm0 # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    andnps (%rdi), %xmm0 # sched: [7:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    andnps %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    andnps (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_andnotps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vandnps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; BDVER2-NEXT:    vandnps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vandnps %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vandnps (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_andnotps:
 ; BTVER2-SSE:       # %bb.0:
@@ -615,17 +615,17 @@ define <4 x float> @test_cmpps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
 ;
 ; BDVER2-SSE-LABEL: test_cmpps:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    cmpeqps %xmm0, %xmm1 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    cmpeqps (%rdi), %xmm0 # sched: [9:1.00]
-; BDVER2-SSE-NEXT:    orps %xmm1, %xmm0 # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    cmpeqps %xmm0, %xmm1 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    cmpeqps (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER2-SSE-NEXT:    orps %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_cmpps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
-; BDVER2-NEXT:    vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; BDVER2-NEXT:    vorps %xmm0, %xmm1, %xmm0 # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm1 # sched: [2:1.00]
+; BDVER2-NEXT:    vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER2-NEXT:    vorps %xmm0, %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_cmpps:
 ; BTVER2-SSE:       # %bb.0:
@@ -745,15 +745,15 @@ define float @test_cmpss(float %a0, float %a1, float *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_cmpss:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    cmpeqss %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    cmpeqss (%rdi), %xmm0 # sched: [9:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    cmpeqss %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    cmpeqss (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_cmpss:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vcmpeqss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vcmpeqss (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vcmpeqss %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vcmpeqss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_cmpss:
 ; BTVER2-SSE:       # %bb.0:
@@ -974,31 +974,31 @@ define i32 @test_comiss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_comiss:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    comiss %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    comiss %xmm1, %xmm0 # sched: [1:1.00]
 ; BDVER2-SSE-NEXT:    setnp %al # sched: [1:0.50]
 ; BDVER2-SSE-NEXT:    sete %cl # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    andb %al, %cl # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    comiss (%rdi), %xmm0 # sched: [8:1.00]
+; BDVER2-SSE-NEXT:    andb %al, %cl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    comiss (%rdi), %xmm0 # sched: [6:1.00]
 ; BDVER2-SSE-NEXT:    setnp %al # sched: [1:0.50]
 ; BDVER2-SSE-NEXT:    sete %dl # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    andb %al, %dl # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    orb %cl, %dl # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    movzbl %dl, %eax # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    andb %al, %dl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    orb %cl, %dl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    movzbl %dl, %eax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_comiss:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vcomiss %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vcomiss %xmm1, %xmm0 # sched: [1:1.00]
 ; BDVER2-NEXT:    setnp %al # sched: [1:0.50]
 ; BDVER2-NEXT:    sete %cl # sched: [1:0.50]
-; BDVER2-NEXT:    andb %al, %cl # sched: [1:0.33]
-; BDVER2-NEXT:    vcomiss (%rdi), %xmm0 # sched: [8:1.00]
+; BDVER2-NEXT:    andb %al, %cl # sched: [1:0.50]
+; BDVER2-NEXT:    vcomiss (%rdi), %xmm0 # sched: [6:1.00]
 ; BDVER2-NEXT:    setnp %al # sched: [1:0.50]
 ; BDVER2-NEXT:    sete %dl # sched: [1:0.50]
-; BDVER2-NEXT:    andb %al, %dl # sched: [1:0.33]
-; BDVER2-NEXT:    orb %cl, %dl # sched: [1:0.33]
-; BDVER2-NEXT:    movzbl %dl, %eax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    andb %al, %dl # sched: [1:0.50]
+; BDVER2-NEXT:    orb %cl, %dl # sched: [1:0.50]
+; BDVER2-NEXT:    movzbl %dl, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_comiss:
 ; BTVER2-SSE:       # %bb.0:
@@ -1157,17 +1157,17 @@ define float @test_cvtsi2ss(i32 %a0, i32 *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_cvtsi2ss:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    cvtsi2ssl %edi, %xmm1 # sched: [5:2.00]
-; BDVER2-SSE-NEXT:    cvtsi2ssl (%rsi), %xmm0 # sched: [10:1.00]
-; BDVER2-SSE-NEXT:    addss %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    cvtsi2ssl (%rsi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    cvtsi2ssl %edi, %xmm1 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    addss %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_cvtsi2ss:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vcvtsi2ssl %edi, %xmm0, %xmm0 # sched: [5:2.00]
-; BDVER2-NEXT:    vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [10:1.00]
-; BDVER2-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vcvtsi2ssl %edi, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [9:1.00]
+; BDVER2-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_cvtsi2ss:
 ; BTVER2-SSE:       # %bb.0:
@@ -1297,17 +1297,17 @@ define float @test_cvtsi2ssq(i64 %a0, i64 *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_cvtsi2ssq:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    cvtsi2ssq %rdi, %xmm1 # sched: [5:2.00]
-; BDVER2-SSE-NEXT:    cvtsi2ssq (%rsi), %xmm0 # sched: [10:1.00]
-; BDVER2-SSE-NEXT:    addss %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    cvtsi2ssq %rdi, %xmm1 # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    cvtsi2ssq (%rsi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    addss %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_cvtsi2ssq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [5:2.00]
-; BDVER2-NEXT:    vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [10:1.00]
-; BDVER2-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [9:1.00]
+; BDVER2-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_cvtsi2ssq:
 ; BTVER2-SSE:       # %bb.0:
@@ -1437,17 +1437,17 @@ define i32 @test_cvtss2si(float %a0, float *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_cvtss2si:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    cvtss2si %xmm0, %ecx # sched: [5:1.00]
-; BDVER2-SSE-NEXT:    cvtss2si (%rdi), %eax # sched: [9:1.00]
-; BDVER2-SSE-NEXT:    addl %ecx, %eax # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    cvtss2si (%rdi), %eax # sched: [18:1.00]
+; BDVER2-SSE-NEXT:    cvtss2si %xmm0, %ecx # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_cvtss2si:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vcvtss2si %xmm0, %ecx # sched: [5:1.00]
-; BDVER2-NEXT:    vcvtss2si (%rdi), %eax # sched: [10:1.00]
-; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vcvtss2si (%rdi), %eax # sched: [18:1.00]
+; BDVER2-NEXT:    vcvtss2si %xmm0, %ecx # sched: [13:1.00]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_cvtss2si:
 ; BTVER2-SSE:       # %bb.0:
@@ -1580,17 +1580,17 @@ define i64 @test_cvtss2siq(float %a0, float *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_cvtss2siq:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    cvtss2si %xmm0, %rcx # sched: [5:1.00]
-; BDVER2-SSE-NEXT:    cvtss2si (%rdi), %rax # sched: [9:1.00]
-; BDVER2-SSE-NEXT:    addq %rcx, %rax # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    cvtss2si (%rdi), %rax # sched: [18:1.00]
+; BDVER2-SSE-NEXT:    cvtss2si %xmm0, %rcx # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_cvtss2siq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vcvtss2si %xmm0, %rcx # sched: [5:1.00]
-; BDVER2-NEXT:    vcvtss2si (%rdi), %rax # sched: [10:1.00]
-; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vcvtss2si (%rdi), %rax # sched: [18:1.00]
+; BDVER2-NEXT:    vcvtss2si %xmm0, %rcx # sched: [13:1.00]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_cvtss2siq:
 ; BTVER2-SSE:       # %bb.0:
@@ -1723,17 +1723,17 @@ define i32 @test_cvttss2si(float %a0, float *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_cvttss2si:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    cvttss2si %xmm0, %ecx # sched: [5:1.00]
-; BDVER2-SSE-NEXT:    cvttss2si (%rdi), %eax # sched: [9:1.00]
-; BDVER2-SSE-NEXT:    addl %ecx, %eax # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    cvttss2si (%rdi), %eax # sched: [18:1.00]
+; BDVER2-SSE-NEXT:    cvttss2si %xmm0, %ecx # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_cvttss2si:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vcvttss2si %xmm0, %ecx # sched: [5:1.00]
-; BDVER2-NEXT:    vcvttss2si (%rdi), %eax # sched: [10:1.00]
-; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vcvttss2si (%rdi), %eax # sched: [18:1.00]
+; BDVER2-NEXT:    vcvttss2si %xmm0, %ecx # sched: [13:1.00]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_cvttss2si:
 ; BTVER2-SSE:       # %bb.0:
@@ -1863,17 +1863,17 @@ define i64 @test_cvttss2siq(float %a0, float *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_cvttss2siq:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    cvttss2si %xmm0, %rcx # sched: [5:1.00]
-; BDVER2-SSE-NEXT:    cvttss2si (%rdi), %rax # sched: [9:1.00]
-; BDVER2-SSE-NEXT:    addq %rcx, %rax # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    cvttss2si (%rdi), %rax # sched: [18:1.00]
+; BDVER2-SSE-NEXT:    cvttss2si %xmm0, %rcx # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_cvttss2siq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vcvttss2si %xmm0, %rcx # sched: [5:1.00]
-; BDVER2-NEXT:    vcvttss2si (%rdi), %rax # sched: [10:1.00]
-; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vcvttss2si (%rdi), %rax # sched: [18:1.00]
+; BDVER2-NEXT:    vcvttss2si %xmm0, %rcx # sched: [13:1.00]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_cvttss2siq:
 ; BTVER2-SSE:       # %bb.0:
@@ -1990,15 +1990,15 @@ define <4 x float> @test_divps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
 ;
 ; BDVER2-SSE-LABEL: test_divps:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    divps %xmm1, %xmm0 # sched: [14:14.00]
-; BDVER2-SSE-NEXT:    divps (%rdi), %xmm0 # sched: [20:14.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    divps %xmm1, %xmm0 # sched: [9:9.50]
+; BDVER2-SSE-NEXT:    divps (%rdi), %xmm0 # sched: [14:9.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_divps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vdivps %xmm1, %xmm0, %xmm0 # sched: [14:14.00]
-; BDVER2-NEXT:    vdivps (%rdi), %xmm0, %xmm0 # sched: [20:14.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vdivps %xmm1, %xmm0, %xmm0 # sched: [9:9.50]
+; BDVER2-NEXT:    vdivps (%rdi), %xmm0, %xmm0 # sched: [14:9.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_divps:
 ; BTVER2-SSE:       # %bb.0:
@@ -2110,15 +2110,15 @@ define float @test_divss(float %a0, float %a1, float *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_divss:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    divss %xmm1, %xmm0 # sched: [14:14.00]
-; BDVER2-SSE-NEXT:    divss (%rdi), %xmm0 # sched: [20:14.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    divss %xmm1, %xmm0 # sched: [9:9.50]
+; BDVER2-SSE-NEXT:    divss (%rdi), %xmm0 # sched: [14:9.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_divss:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vdivss %xmm1, %xmm0, %xmm0 # sched: [14:14.00]
-; BDVER2-NEXT:    vdivss (%rdi), %xmm0, %xmm0 # sched: [20:14.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vdivss %xmm1, %xmm0, %xmm0 # sched: [9:9.50]
+; BDVER2-NEXT:    vdivss (%rdi), %xmm0, %xmm0 # sched: [14:9.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_divss:
 ; BTVER2-SSE:       # %bb.0:
@@ -2230,15 +2230,15 @@ define void @test_ldmxcsr(i32 %a0) {
 ;
 ; BDVER2-SSE-LABEL: test_ldmxcsr:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    ldmxcsr -{{[0-9]+}}(%rsp) # sched: [5:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    ldmxcsr -{{[0-9]+}}(%rsp) # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_ldmxcsr:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
-; BDVER2-NEXT:    vldmxcsr -{{[0-9]+}}(%rsp) # sched: [5:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:0.50]
+; BDVER2-NEXT:    vldmxcsr -{{[0-9]+}}(%rsp) # sched: [5:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_ldmxcsr:
 ; BTVER2-SSE:       # %bb.0:
@@ -2352,15 +2352,15 @@ define <4 x float> @test_maxps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
 ;
 ; BDVER2-SSE-LABEL: test_maxps:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    maxps %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    maxps (%rdi), %xmm0 # sched: [9:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    maxps %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    maxps (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_maxps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmaxps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vmaxps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmaxps %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vmaxps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_maxps:
 ; BTVER2-SSE:       # %bb.0:
@@ -2473,15 +2473,15 @@ define <4 x float> @test_maxss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
 ;
 ; BDVER2-SSE-LABEL: test_maxss:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    maxss %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    maxss (%rdi), %xmm0 # sched: [9:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    maxss %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    maxss (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_maxss:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmaxss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vmaxss (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmaxss %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vmaxss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_maxss:
 ; BTVER2-SSE:       # %bb.0:
@@ -2594,15 +2594,15 @@ define <4 x float> @test_minps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
 ;
 ; BDVER2-SSE-LABEL: test_minps:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    minps %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    minps (%rdi), %xmm0 # sched: [9:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    minps %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    minps (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_minps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vminps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vminps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vminps %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vminps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_minps:
 ; BTVER2-SSE:       # %bb.0:
@@ -2715,15 +2715,15 @@ define <4 x float> @test_minss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
 ;
 ; BDVER2-SSE-LABEL: test_minss:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    minss %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    minss (%rdi), %xmm0 # sched: [9:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    minss %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    minss (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_minss:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vminss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vminss (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vminss %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vminss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_minss:
 ; BTVER2-SSE:       # %bb.0:
@@ -2849,17 +2849,17 @@ define void @test_movaps(<4 x float> *%a0, <4 x float> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_movaps:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movaps (%rdi), %xmm0 # sched: [6:0.50]
-; BDVER2-SSE-NEXT:    addps %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    movaps (%rdi), %xmm0 # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    addps %xmm0, %xmm0 # sched: [5:1.00]
 ; BDVER2-SSE-NEXT:    movaps %xmm0, (%rsi) # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_movaps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovaps (%rdi), %xmm0 # sched: [6:0.50]
-; BDVER2-NEXT:    vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vmovaps (%rdi), %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vaddps %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
 ; BDVER2-NEXT:    vmovaps %xmm0, (%rsi) # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_movaps:
 ; BTVER2-SSE:       # %bb.0:
@@ -2970,13 +2970,13 @@ define <4 x float> @test_movhlps(<4 x float> %a0, <4 x float> %a1) {
 ;
 ; BDVER2-SSE-LABEL: test_movhlps:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_movhlps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_movhlps:
 ; BTVER2-SSE:       # %bb.0:
@@ -3111,19 +3111,19 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2)
 ;
 ; BDVER2-SSE-LABEL: test_movhps:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
-; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    movhps %xmm0, (%rdi) # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    movaps %xmm1, %xmm0 # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    movhps %xmm0, (%rdi) # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    movaps %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_movhps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
-; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vmovhpd %xmm0, (%rdi) # sched: [1:1.00]
-; BDVER2-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:0.50]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vmovhpd %xmm0, (%rdi) # sched: [2:1.00]
+; BDVER2-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_movhps:
 ; BTVER2-SSE:       # %bb.0:
@@ -3249,15 +3249,15 @@ define <4 x float> @test_movlhps(<4 x float> %a0, <4 x float> %a1) {
 ;
 ; BDVER2-SSE-LABEL: test_movlhps:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
-; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_movlhps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
-; BDVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [2:0.50]
+; BDVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_movlhps:
 ; BTVER2-SSE:       # %bb.0:
@@ -3395,19 +3395,19 @@ define <4 x float> @test_movlps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2)
 ;
 ; BDVER2-SSE-LABEL: test_movlps:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [7:1.00]
-; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    movlps %xmm0, (%rdi) # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    movaps %xmm1, %xmm0 # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    movlps %xmm0, (%rdi) # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    movaps %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_movlps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [7:1.00]
-; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vmovlps %xmm0, (%rdi) # sched: [1:1.00]
-; BDVER2-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [7:0.50]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vmovlps %xmm0, (%rdi) # sched: [2:1.00]
+; BDVER2-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_movlps:
 ; BTVER2-SSE:       # %bb.0:
@@ -3520,13 +3520,13 @@ define i32 @test_movmskps(<4 x float> %a0) {
 ;
 ; BDVER2-SSE-LABEL: test_movmskps:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movmskps %xmm0, %eax # sched: [2:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movmskps %xmm0, %eax # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_movmskps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovmskps %xmm0, %eax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmovmskps %xmm0, %eax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_movmskps:
 ; BTVER2-SSE:       # %bb.0:
@@ -3626,13 +3626,13 @@ define void @test_movntps(<4 x float> %a0, <4 x float> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_movntps:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movntps %xmm0, (%rdi) # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movntps %xmm0, (%rdi) # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_movntps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovntps %xmm0, (%rdi) # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmovntps %xmm0, (%rdi) # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_movntps:
 ; BTVER2-SSE:       # %bb.0:
@@ -3751,17 +3751,17 @@ define void @test_movss_mem(float* %a0, float* %a1) {
 ;
 ; BDVER2-SSE-LABEL: test_movss_mem:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [6:0.50]
-; BDVER2-SSE-NEXT:    addss %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    movss %xmm0, (%rsi) # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50]
+; BDVER2-SSE-NEXT:    addss %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    movss %xmm0, (%rsi) # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_movss_mem:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [6:0.50]
-; BDVER2-NEXT:    vaddss %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vmovss %xmm0, (%rsi) # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50]
+; BDVER2-NEXT:    vaddss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vmovss %xmm0, (%rsi) # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_movss_mem:
 ; BTVER2-SSE:       # %bb.0:
@@ -3870,13 +3870,13 @@ define <4 x float> @test_movss_reg(<4 x float> %a0, <4 x float> %a1) {
 ;
 ; BDVER2-SSE-LABEL: test_movss_reg:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_movss_reg:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_movss_reg:
 ; BTVER2-SSE:       # %bb.0:
@@ -3995,17 +3995,17 @@ define void @test_movups(<4 x float> *%a0, <4 x float> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_movups:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movups (%rdi), %xmm0 # sched: [6:0.50]
-; BDVER2-SSE-NEXT:    addps %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    movups (%rdi), %xmm0 # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    addps %xmm0, %xmm0 # sched: [5:1.00]
 ; BDVER2-SSE-NEXT:    movups %xmm0, (%rsi) # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_movups:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovups (%rdi), %xmm0 # sched: [6:0.50]
-; BDVER2-NEXT:    vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vmovups (%rdi), %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vaddps %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
 ; BDVER2-NEXT:    vmovups %xmm0, (%rsi) # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_movups:
 ; BTVER2-SSE:       # %bb.0:
@@ -4122,14 +4122,14 @@ define <4 x float> @test_mulps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
 ; BDVER2-SSE-LABEL: test_mulps:
 ; BDVER2-SSE:       # %bb.0:
 ; BDVER2-SSE-NEXT:    mulps %xmm1, %xmm0 # sched: [5:1.00]
-; BDVER2-SSE-NEXT:    mulps (%rdi), %xmm0 # sched: [11:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    mulps (%rdi), %xmm0 # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_mulps:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; BDVER2-NEXT:    vmulps (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmulps (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_mulps:
 ; BTVER2-SSE:       # %bb.0:
@@ -4242,14 +4242,14 @@ define float @test_mulss(float %a0, float %a1, float *%a2) {
 ; BDVER2-SSE-LABEL: test_mulss:
 ; BDVER2-SSE:       # %bb.0:
 ; BDVER2-SSE-NEXT:    mulss %xmm1, %xmm0 # sched: [5:1.00]
-; BDVER2-SSE-NEXT:    mulss (%rdi), %xmm0 # sched: [11:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    mulss (%rdi), %xmm0 # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_mulss:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; BDVER2-NEXT:    vmulss (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmulss (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_mulss:
 ; BTVER2-SSE:       # %bb.0:
@@ -4365,15 +4365,15 @@ define <4 x float> @test_orps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2
 ;
 ; BDVER2-SSE-LABEL: test_orps:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    orps %xmm1, %xmm0 # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    orps (%rdi), %xmm0 # sched: [7:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    orps %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    orps (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_orps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vorps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; BDVER2-NEXT:    vorps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vorps %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vorps (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_orps:
 ; BTVER2-SSE:       # %bb.0:
@@ -4547,7 +4547,7 @@ define void @test_prefetch(i8* %a0) optsize {
 ; BDVER2-SSE-NEXT:    prefetcht1 (%rdi) # sched: [5:0.50]
 ; BDVER2-SSE-NEXT:    prefetcht2 (%rdi) # sched: [5:0.50]
 ; BDVER2-SSE-NEXT:    #NO_APP
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_prefetch:
 ; BDVER2:       # %bb.0:
@@ -4557,7 +4557,7 @@ define void @test_prefetch(i8* %a0) optsize {
 ; BDVER2-NEXT:    prefetcht1 (%rdi) # sched: [5:0.50]
 ; BDVER2-NEXT:    prefetcht2 (%rdi) # sched: [5:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_prefetch:
 ; BTVER2-SSE:       # %bb.0:
@@ -4699,16 +4699,16 @@ define <4 x float> @test_rcpps(<4 x float> %a0, <4 x float> *%a1) {
 ; BDVER2-SSE-LABEL: test_rcpps:
 ; BDVER2-SSE:       # %bb.0:
 ; BDVER2-SSE-NEXT:    rcpps %xmm0, %xmm1 # sched: [5:1.00]
-; BDVER2-SSE-NEXT:    rcpps (%rdi), %xmm0 # sched: [11:1.00]
-; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    rcpps (%rdi), %xmm0 # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_rcpps:
 ; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpps (%rdi), %xmm1 # sched: [10:1.00]
 ; BDVER2-NEXT:    vrcpps %xmm0, %xmm0 # sched: [5:1.00]
-; BDVER2-NEXT:    vrcpps (%rdi), %xmm1 # sched: [11:1.00]
-; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_rcpps:
 ; BTVER2-SSE:       # %bb.0:
@@ -4854,19 +4854,19 @@ define <4 x float> @test_rcpss(float %a0, float *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_rcpss:
 ; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
 ; BDVER2-SSE-NEXT:    rcpss %xmm0, %xmm0 # sched: [5:1.00]
-; BDVER2-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50]
 ; BDVER2-SSE-NEXT:    rcpss %xmm1, %xmm1 # sched: [5:1.00]
-; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_rcpss:
 ; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
 ; BDVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
-; BDVER2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50]
 ; BDVER2-NEXT:    vrcpss %xmm1, %xmm1, %xmm1 # sched: [5:1.00]
-; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_rcpss:
 ; BTVER2-SSE:       # %bb.0:
@@ -5006,16 +5006,16 @@ define <4 x float> @test_rsqrtps(<4 x float> %a0, <4 x float> *%a1) {
 ; BDVER2-SSE-LABEL: test_rsqrtps:
 ; BDVER2-SSE:       # %bb.0:
 ; BDVER2-SSE-NEXT:    rsqrtps %xmm0, %xmm1 # sched: [5:1.00]
-; BDVER2-SSE-NEXT:    rsqrtps (%rdi), %xmm0 # sched: [11:1.00]
-; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    rsqrtps (%rdi), %xmm0 # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_rsqrtps:
 ; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrsqrtps (%rdi), %xmm1 # sched: [10:1.00]
 ; BDVER2-NEXT:    vrsqrtps %xmm0, %xmm0 # sched: [5:1.00]
-; BDVER2-NEXT:    vrsqrtps (%rdi), %xmm1 # sched: [11:1.00]
-; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_rsqrtps:
 ; BTVER2-SSE:       # %bb.0:
@@ -5161,19 +5161,19 @@ define <4 x float> @test_rsqrtss(float %a0, float *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_rsqrtss:
 ; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
 ; BDVER2-SSE-NEXT:    rsqrtss %xmm0, %xmm0 # sched: [5:1.00]
-; BDVER2-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50]
 ; BDVER2-SSE-NEXT:    rsqrtss %xmm1, %xmm1 # sched: [5:1.00]
-; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_rsqrtss:
 ; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
 ; BDVER2-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
-; BDVER2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50]
 ; BDVER2-NEXT:    vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [5:1.00]
-; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_rsqrtss:
 ; BTVER2-SSE:       # %bb.0:
@@ -5290,13 +5290,13 @@ define void @test_sfence() {
 ;
 ; BDVER2-SSE-LABEL: test_sfence:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    sfence # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    sfence # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_sfence:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    sfence # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    sfence # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_sfence:
 ; BTVER2-SSE:       # %bb.0:
@@ -5416,17 +5416,17 @@ define <4 x float> @test_shufps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%
 ;
 ; BDVER2-SSE-LABEL: test_shufps:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [1:1.00]
-; BDVER2-SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],mem[0,0] sched: [7:1.00]
-; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],mem[0,0] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_shufps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [1:1.00]
-; BDVER2-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,3],mem[0,0] sched: [7:1.00]
-; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [2:0.50]
+; BDVER2-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,3],mem[0,0] sched: [7:0.50]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_shufps:
 ; BTVER2-SSE:       # %bb.0:
@@ -5557,17 +5557,17 @@ define <4 x float> @test_sqrtps(<4 x float> %a0, <4 x float> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_sqrtps:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    sqrtps %xmm0, %xmm1 # sched: [14:14.00]
-; BDVER2-SSE-NEXT:    sqrtps (%rdi), %xmm0 # sched: [20:14.00]
-; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    sqrtps %xmm0, %xmm1 # sched: [9:10.50]
+; BDVER2-SSE-NEXT:    sqrtps (%rdi), %xmm0 # sched: [14:10.50]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_sqrtps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vsqrtps %xmm0, %xmm0 # sched: [14:14.00]
-; BDVER2-NEXT:    vsqrtps (%rdi), %xmm1 # sched: [20:14.00]
-; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vsqrtps (%rdi), %xmm1 # sched: [14:10.50]
+; BDVER2-NEXT:    vsqrtps %xmm0, %xmm0 # sched: [9:10.50]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_sqrtps:
 ; BTVER2-SSE:       # %bb.0:
@@ -5713,19 +5713,19 @@ define <4 x float> @test_sqrtss(<4 x float> %a0, <4 x float> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_sqrtss:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    sqrtss %xmm0, %xmm0 # sched: [14:14.00]
-; BDVER2-SSE-NEXT:    movaps (%rdi), %xmm1 # sched: [6:0.50]
-; BDVER2-SSE-NEXT:    sqrtss %xmm1, %xmm1 # sched: [14:14.00]
-; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movaps (%rdi), %xmm1 # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    sqrtss %xmm0, %xmm0 # sched: [9:10.50]
+; BDVER2-SSE-NEXT:    sqrtss %xmm1, %xmm1 # sched: [9:10.50]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_sqrtss:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 # sched: [14:14.00]
-; BDVER2-NEXT:    vmovaps (%rdi), %xmm1 # sched: [6:0.50]
-; BDVER2-NEXT:    vsqrtss %xmm1, %xmm1, %xmm1 # sched: [14:14.00]
-; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmovaps (%rdi), %xmm1 # sched: [5:0.50]
+; BDVER2-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 # sched: [9:10.50]
+; BDVER2-NEXT:    vsqrtss %xmm1, %xmm1, %xmm1 # sched: [9:10.50]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_sqrtss:
 ; BTVER2-SSE:       # %bb.0:
@@ -5847,15 +5847,15 @@ define i32 @test_stmxcsr() {
 ;
 ; BDVER2-SSE-LABEL: test_stmxcsr:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    stmxcsr -{{[0-9]+}}(%rsp) # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    stmxcsr -{{[0-9]+}}(%rsp) # sched: [1:0.50]
 ; BDVER2-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # sched: [5:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_stmxcsr:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vstmxcsr -{{[0-9]+}}(%rsp) # sched: [5:1.00]
+; BDVER2-NEXT:    vstmxcsr -{{[0-9]+}}(%rsp) # sched: [1:0.50]
 ; BDVER2-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # sched: [5:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_stmxcsr:
 ; BTVER2-SSE:       # %bb.0:
@@ -5969,15 +5969,15 @@ define <4 x float> @test_subps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
 ;
 ; BDVER2-SSE-LABEL: test_subps:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    subps %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    subps (%rdi), %xmm0 # sched: [9:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    subps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    subps (%rdi), %xmm0 # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_subps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vsubps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vsubps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vsubps (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_subps:
 ; BTVER2-SSE:       # %bb.0:
@@ -6089,15 +6089,15 @@ define float @test_subss(float %a0, float %a1, float *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_subss:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    subss %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    subss (%rdi), %xmm0 # sched: [9:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    subss %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    subss (%rdi), %xmm0 # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_subss:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vsubss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vsubss (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vsubss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vsubss (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_subss:
 ; BTVER2-SSE:       # %bb.0:
@@ -6313,31 +6313,31 @@ define i32 @test_ucomiss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_ucomiss:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    ucomiss %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    ucomiss %xmm1, %xmm0 # sched: [1:1.00]
 ; BDVER2-SSE-NEXT:    setnp %al # sched: [1:0.50]
 ; BDVER2-SSE-NEXT:    sete %cl # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    andb %al, %cl # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    ucomiss (%rdi), %xmm0 # sched: [8:1.00]
+; BDVER2-SSE-NEXT:    andb %al, %cl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    ucomiss (%rdi), %xmm0 # sched: [6:1.00]
 ; BDVER2-SSE-NEXT:    setnp %al # sched: [1:0.50]
 ; BDVER2-SSE-NEXT:    sete %dl # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    andb %al, %dl # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    orb %cl, %dl # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    movzbl %dl, %eax # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    andb %al, %dl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    orb %cl, %dl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    movzbl %dl, %eax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_ucomiss:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vucomiss %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vucomiss %xmm1, %xmm0 # sched: [1:1.00]
 ; BDVER2-NEXT:    setnp %al # sched: [1:0.50]
 ; BDVER2-NEXT:    sete %cl # sched: [1:0.50]
-; BDVER2-NEXT:    andb %al, %cl # sched: [1:0.33]
-; BDVER2-NEXT:    vucomiss (%rdi), %xmm0 # sched: [8:1.00]
+; BDVER2-NEXT:    andb %al, %cl # sched: [1:0.50]
+; BDVER2-NEXT:    vucomiss (%rdi), %xmm0 # sched: [6:1.00]
 ; BDVER2-NEXT:    setnp %al # sched: [1:0.50]
 ; BDVER2-NEXT:    sete %dl # sched: [1:0.50]
-; BDVER2-NEXT:    andb %al, %dl # sched: [1:0.33]
-; BDVER2-NEXT:    orb %cl, %dl # sched: [1:0.33]
-; BDVER2-NEXT:    movzbl %dl, %eax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    andb %al, %dl # sched: [1:0.50]
+; BDVER2-NEXT:    orb %cl, %dl # sched: [1:0.50]
+; BDVER2-NEXT:    movzbl %dl, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_ucomiss:
 ; BTVER2-SSE:       # %bb.0:
@@ -6496,17 +6496,17 @@ define <4 x float> @test_unpckhps(<4 x float> %a0, <4 x float> %a1, <4 x float>
 ;
 ; BDVER2-SSE-LABEL: test_unpckhps:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; BDVER2-SSE-NEXT:    unpckhps {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [7:1.00]
-; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    unpckhps {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_unpckhps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; BDVER2-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [7:1.00]
-; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [2:0.50]
+; BDVER2-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [7:0.50]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_unpckhps:
 ; BTVER2-SSE:       # %bb.0:
@@ -6636,17 +6636,17 @@ define <4 x float> @test_unpcklps(<4 x float> %a0, <4 x float> %a1, <4 x float>
 ;
 ; BDVER2-SSE-LABEL: test_unpcklps:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; BDVER2-SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [7:1.00]
-; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_unpcklps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; BDVER2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [7:1.00]
-; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [2:0.50]
+; BDVER2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [7:0.50]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_unpcklps:
 ; BTVER2-SSE:       # %bb.0:
@@ -6767,15 +6767,15 @@ define <4 x float> @test_xorps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
 ;
 ; BDVER2-SSE-LABEL: test_xorps:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    xorps %xmm1, %xmm0 # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    xorps (%rdi), %xmm0 # sched: [7:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    xorps %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    xorps (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_xorps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vxorps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; BDVER2-NEXT:    vxorps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vxorps %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vxorps (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_xorps:
 ; BTVER2-SSE:       # %bb.0:
@@ -6923,19 +6923,19 @@ define <4 x float> @test_fnop() nounwind {
 ;
 ; BDVER2-SSE-LABEL: test_fnop:
 ; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    xorps %xmm0, %xmm0 # sched: [0:0.25]
 ; BDVER2-SSE-NEXT:    #APP
-; BDVER2-SSE-NEXT:    nop # sched: [1:0.25]
+; BDVER2-SSE-NEXT:    nop # sched: [1:0.50]
 ; BDVER2-SSE-NEXT:    #NO_APP
-; BDVER2-SSE-NEXT:    xorps %xmm0, %xmm0 # sched: [0:0.25]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_fnop:
 ; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vxorps %xmm0, %xmm0, %xmm0 # sched: [0:0.25]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    nop # sched: [1:0.25]
+; BDVER2-NEXT:    nop # sched: [1:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    vxorps %xmm0, %xmm0, %xmm0 # sched: [0:0.25]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_fnop:
 ; BTVER2-SSE:       # %bb.0:
diff --git a/test/CodeGen/X86/sse2-schedule.ll b/test/CodeGen/X86/sse2-schedule.ll
index b6079121206..d2ee1e09d08 100644
--- a/test/CodeGen/X86/sse2-schedule.ll
+++ b/test/CodeGen/X86/sse2-schedule.ll
@@ -14,8 +14,8 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-sse3 | FileCheck %s --check-prefixes=CHECK,SKX-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,SKX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=-sse3 | FileCheck %s --check-prefixes=CHECK,BDVER2-SSE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+avx,+xop -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,BDVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=-sse3 | FileCheck %s --check-prefixes=CHECK,BDVER2-SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-sse3 | FileCheck %s --check-prefixes=CHECK,BTVER2-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 -mattr=-sse3 | FileCheck %s --check-prefixes=CHECK,ZNVER1-SSE
@@ -102,15 +102,15 @@ define <2 x double> @test_addpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ;
 ; BDVER2-SSE-LABEL: test_addpd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    addpd (%rdi), %xmm0 # sched: [9:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    addpd (%rdi), %xmm0 # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_addpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vaddpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vaddpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_addpd:
 ; BTVER2-SSE:       # %bb.0:
@@ -222,15 +222,15 @@ define double @test_addsd(double %a0, double %a1, double *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_addsd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    addsd %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    addsd (%rdi), %xmm0 # sched: [9:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    addsd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    addsd (%rdi), %xmm0 # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_addsd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vaddsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vaddsd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_addsd:
 ; BTVER2-SSE:       # %bb.0:
@@ -355,17 +355,17 @@ define <2 x double> @test_andpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ;
 ; BDVER2-SSE-LABEL: test_andpd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    andpd %xmm1, %xmm0 # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    andpd (%rdi), %xmm0 # sched: [7:1.00]
-; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    andpd %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    andpd (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_andpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vandpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; BDVER2-NEXT:    vandpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; BDVER2-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vandpd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vandpd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_andpd:
 ; BTVER2-SSE:       # %bb.0:
@@ -499,17 +499,17 @@ define <2 x double> @test_andnotpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
 ;
 ; BDVER2-SSE-LABEL: test_andnotpd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    andnpd %xmm1, %xmm0 # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    andnpd (%rdi), %xmm0 # sched: [7:1.00]
-; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    andnpd %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    andnpd (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_andnotpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vandnpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; BDVER2-NEXT:    vandnpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; BDVER2-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vandnpd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vandnpd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_andnotpd:
 ; BTVER2-SSE:       # %bb.0:
@@ -625,13 +625,13 @@ define void @test_clflush(i8* %p){
 ;
 ; BDVER2-SSE-LABEL: test_clflush:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    clflush (%rdi) # sched: [5:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    clflush (%rdi) # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_clflush:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    clflush (%rdi) # sched: [5:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    clflush (%rdi) # sched: [5:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_clflush:
 ; BTVER2-SSE:       # %bb.0:
@@ -751,17 +751,17 @@ define <2 x double> @test_cmppd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ;
 ; BDVER2-SSE-LABEL: test_cmppd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    cmpeqpd %xmm0, %xmm1 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    cmpeqpd (%rdi), %xmm0 # sched: [9:1.00]
-; BDVER2-SSE-NEXT:    orpd %xmm1, %xmm0 # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    cmpeqpd %xmm0, %xmm1 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    cmpeqpd (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER2-SSE-NEXT:    orpd %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_cmppd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
-; BDVER2-NEXT:    vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; BDVER2-NEXT:    vorpd %xmm0, %xmm1, %xmm0 # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm1 # sched: [2:1.00]
+; BDVER2-NEXT:    vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER2-NEXT:    vorpd %xmm0, %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_cmppd:
 ; BTVER2-SSE:       # %bb.0:
@@ -880,15 +880,15 @@ define double @test_cmpsd(double %a0, double %a1, double *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_cmpsd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    cmpeqsd %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    cmpeqsd (%rdi), %xmm0 # sched: [9:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    cmpeqsd %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    cmpeqsd (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_cmpsd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vcmpeqsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vcmpeqsd %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_cmpsd:
 ; BTVER2-SSE:       # %bb.0:
@@ -1109,31 +1109,31 @@ define i32 @test_comisd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_comisd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    comisd %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    comisd %xmm1, %xmm0 # sched: [1:1.00]
 ; BDVER2-SSE-NEXT:    setnp %al # sched: [1:0.50]
 ; BDVER2-SSE-NEXT:    sete %cl # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    andb %al, %cl # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    comisd (%rdi), %xmm0 # sched: [8:1.00]
+; BDVER2-SSE-NEXT:    andb %al, %cl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    comisd (%rdi), %xmm0 # sched: [6:1.00]
 ; BDVER2-SSE-NEXT:    setnp %al # sched: [1:0.50]
 ; BDVER2-SSE-NEXT:    sete %dl # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    andb %al, %dl # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    orb %cl, %dl # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    movzbl %dl, %eax # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    andb %al, %dl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    orb %cl, %dl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    movzbl %dl, %eax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_comisd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vcomisd %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vcomisd %xmm1, %xmm0 # sched: [1:1.00]
 ; BDVER2-NEXT:    setnp %al # sched: [1:0.50]
 ; BDVER2-NEXT:    sete %cl # sched: [1:0.50]
-; BDVER2-NEXT:    andb %al, %cl # sched: [1:0.33]
-; BDVER2-NEXT:    vcomisd (%rdi), %xmm0 # sched: [8:1.00]
+; BDVER2-NEXT:    andb %al, %cl # sched: [1:0.50]
+; BDVER2-NEXT:    vcomisd (%rdi), %xmm0 # sched: [6:1.00]
 ; BDVER2-NEXT:    setnp %al # sched: [1:0.50]
 ; BDVER2-NEXT:    sete %dl # sched: [1:0.50]
-; BDVER2-NEXT:    andb %al, %dl # sched: [1:0.33]
-; BDVER2-NEXT:    orb %cl, %dl # sched: [1:0.33]
-; BDVER2-NEXT:    movzbl %dl, %eax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    andb %al, %dl # sched: [1:0.50]
+; BDVER2-NEXT:    orb %cl, %dl # sched: [1:0.50]
+; BDVER2-NEXT:    movzbl %dl, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_comisd:
 ; BTVER2-SSE:       # %bb.0:
@@ -1294,17 +1294,17 @@ define <2 x double> @test_cvtdq2pd(<4 x i32> %a0, <4 x i32> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_cvtdq2pd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    cvtdq2pd %xmm0, %xmm1 # sched: [4:1.00]
-; BDVER2-SSE-NEXT:    cvtdq2pd (%rdi), %xmm0 # sched: [10:1.00]
-; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    cvtdq2pd %xmm0, %xmm1 # sched: [8:1.00]
+; BDVER2-SSE-NEXT:    cvtdq2pd (%rdi), %xmm0 # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_cvtdq2pd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vcvtdq2pd %xmm0, %xmm0 # sched: [4:1.00]
-; BDVER2-NEXT:    vcvtdq2pd (%rdi), %xmm1 # sched: [10:1.00]
-; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vcvtdq2pd (%rdi), %xmm1 # sched: [13:1.00]
+; BDVER2-NEXT:    vcvtdq2pd %xmm0, %xmm0 # sched: [8:1.00]
+; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_cvtdq2pd:
 ; BTVER2-SSE:       # %bb.0:
@@ -1437,17 +1437,17 @@ define <4 x float> @test_cvtdq2ps(<4 x i32> %a0, <4 x i32> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_cvtdq2ps:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    cvtdq2ps %xmm0, %xmm1 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    cvtdq2ps %xmm0, %xmm1 # sched: [4:1.00]
 ; BDVER2-SSE-NEXT:    cvtdq2ps (%rdi), %xmm0 # sched: [9:1.00]
-; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_cvtdq2ps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00]
 ; BDVER2-NEXT:    vcvtdq2ps (%rdi), %xmm1 # sched: [9:1.00]
-; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vcvtdq2ps %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_cvtdq2ps:
 ; BTVER2-SSE:       # %bb.0:
@@ -1579,17 +1579,17 @@ define <4 x i32> @test_cvtpd2dq(<2 x double> %a0, <2 x double> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_cvtpd2dq:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    cvtpd2dq %xmm0, %xmm1 # sched: [4:1.00]
-; BDVER2-SSE-NEXT:    cvtpd2dq (%rdi), %xmm0 # sched: [10:1.00]
-; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    cvtpd2dq %xmm0, %xmm1 # sched: [8:1.00]
+; BDVER2-SSE-NEXT:    cvtpd2dq (%rdi), %xmm0 # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_cvtpd2dq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vcvtpd2dq %xmm0, %xmm0 # sched: [4:1.00]
-; BDVER2-NEXT:    vcvtpd2dqx (%rdi), %xmm1 # sched: [10:1.00]
-; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vcvtpd2dqx (%rdi), %xmm1 # sched: [13:1.00]
+; BDVER2-NEXT:    vcvtpd2dq %xmm0, %xmm0 # sched: [8:1.00]
+; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_cvtpd2dq:
 ; BTVER2-SSE:       # %bb.0:
@@ -1722,17 +1722,17 @@ define <4 x float> @test_cvtpd2ps(<2 x double> %a0, <2 x double> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_cvtpd2ps:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    cvtpd2ps %xmm0, %xmm1 # sched: [4:1.00]
-; BDVER2-SSE-NEXT:    cvtpd2ps (%rdi), %xmm0 # sched: [10:1.00]
-; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    cvtpd2ps %xmm0, %xmm1 # sched: [8:1.00]
+; BDVER2-SSE-NEXT:    cvtpd2ps (%rdi), %xmm0 # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_cvtpd2ps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vcvtpd2ps %xmm0, %xmm0 # sched: [4:1.00]
-; BDVER2-NEXT:    vcvtpd2psx (%rdi), %xmm1 # sched: [10:1.00]
-; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vcvtpd2psx (%rdi), %xmm1 # sched: [13:1.00]
+; BDVER2-NEXT:    vcvtpd2ps %xmm0, %xmm0 # sched: [8:1.00]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_cvtpd2ps:
 ; BTVER2-SSE:       # %bb.0:
@@ -1864,17 +1864,17 @@ define <4 x i32> @test_cvtps2dq(<4 x float> %a0, <4 x float> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_cvtps2dq:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    cvtps2dq %xmm0, %xmm1 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    cvtps2dq %xmm0, %xmm1 # sched: [4:1.00]
 ; BDVER2-SSE-NEXT:    cvtps2dq (%rdi), %xmm0 # sched: [9:1.00]
-; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_cvtps2dq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vcvtps2dq %xmm0, %xmm0 # sched: [3:1.00]
 ; BDVER2-NEXT:    vcvtps2dq (%rdi), %xmm1 # sched: [9:1.00]
-; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vcvtps2dq %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_cvtps2dq:
 ; BTVER2-SSE:       # %bb.0:
@@ -2006,17 +2006,17 @@ define <2 x double> @test_cvtps2pd(<4 x float> %a0, <4 x float> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_cvtps2pd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    cvtps2pd %xmm0, %xmm1 # sched: [2:1.00]
-; BDVER2-SSE-NEXT:    cvtps2pd (%rdi), %xmm0 # sched: [7:1.00]
-; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    cvtps2pd %xmm0, %xmm1 # sched: [8:1.00]
+; BDVER2-SSE-NEXT:    cvtps2pd (%rdi), %xmm0 # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_cvtps2pd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vcvtps2pd %xmm0, %xmm0 # sched: [2:1.00]
-; BDVER2-NEXT:    vcvtps2pd (%rdi), %xmm1 # sched: [7:1.00]
-; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vcvtps2pd (%rdi), %xmm1 # sched: [13:1.00]
+; BDVER2-NEXT:    vcvtps2pd %xmm0, %xmm0 # sched: [8:1.00]
+; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_cvtps2pd:
 ; BTVER2-SSE:       # %bb.0:
@@ -2148,17 +2148,17 @@ define i32 @test_cvtsd2si(double %a0, double *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_cvtsd2si:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    cvtsd2si %xmm0, %ecx # sched: [5:1.00]
-; BDVER2-SSE-NEXT:    cvtsd2si (%rdi), %eax # sched: [9:1.00]
-; BDVER2-SSE-NEXT:    addl %ecx, %eax # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    cvtsd2si (%rdi), %eax # sched: [18:1.00]
+; BDVER2-SSE-NEXT:    cvtsd2si %xmm0, %ecx # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_cvtsd2si:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vcvtsd2si %xmm0, %ecx # sched: [5:1.00]
-; BDVER2-NEXT:    vcvtsd2si (%rdi), %eax # sched: [10:1.00]
-; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vcvtsd2si (%rdi), %eax # sched: [18:1.00]
+; BDVER2-NEXT:    vcvtsd2si %xmm0, %ecx # sched: [13:1.00]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_cvtsd2si:
 ; BTVER2-SSE:       # %bb.0:
@@ -2291,17 +2291,17 @@ define i64 @test_cvtsd2siq(double %a0, double *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_cvtsd2siq:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    cvtsd2si %xmm0, %rcx # sched: [5:1.00]
-; BDVER2-SSE-NEXT:    cvtsd2si (%rdi), %rax # sched: [9:1.00]
-; BDVER2-SSE-NEXT:    addq %rcx, %rax # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    cvtsd2si (%rdi), %rax # sched: [18:1.00]
+; BDVER2-SSE-NEXT:    cvtsd2si %xmm0, %rcx # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_cvtsd2siq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vcvtsd2si %xmm0, %rcx # sched: [5:1.00]
-; BDVER2-NEXT:    vcvtsd2si (%rdi), %rax # sched: [10:1.00]
-; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vcvtsd2si (%rdi), %rax # sched: [18:1.00]
+; BDVER2-NEXT:    vcvtsd2si %xmm0, %rcx # sched: [13:1.00]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_cvtsd2siq:
 ; BTVER2-SSE:       # %bb.0:
@@ -2449,18 +2449,18 @@ define float @test_cvtsd2ss(double %a0, double *%a1) {
 ; BDVER2-SSE-LABEL: test_cvtsd2ss:
 ; BDVER2-SSE:       # %bb.0:
 ; BDVER2-SSE-NEXT:    cvtsd2ss %xmm0, %xmm1 # sched: [4:1.00]
-; BDVER2-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero sched: [6:0.50]
+; BDVER2-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero sched: [5:0.50]
 ; BDVER2-SSE-NEXT:    cvtsd2ss %xmm0, %xmm0 # sched: [4:1.00]
-; BDVER2-SSE-NEXT:    addss %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    addss %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_cvtsd2ss:
 ; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [5:0.50]
 ; BDVER2-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [4:1.00]
-; BDVER2-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [6:0.50]
 ; BDVER2-NEXT:    vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [4:1.00]
-; BDVER2-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_cvtsd2ss:
 ; BTVER2-SSE:       # %bb.0:
@@ -2594,17 +2594,17 @@ define double @test_cvtsi2sd(i32 %a0, i32 *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_cvtsi2sd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    cvtsi2sdl %edi, %xmm1 # sched: [4:1.00]
 ; BDVER2-SSE-NEXT:    cvtsi2sdl (%rsi), %xmm0 # sched: [9:1.00]
-; BDVER2-SSE-NEXT:    addsd %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    cvtsi2sdl %edi, %xmm1 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    addsd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_cvtsi2sd:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    vcvtsi2sdl %edi, %xmm0, %xmm0 # sched: [4:1.00]
 ; BDVER2-NEXT:    vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [9:1.00]
-; BDVER2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_cvtsi2sd:
 ; BTVER2-SSE:       # %bb.0:
@@ -2734,17 +2734,17 @@ define double @test_cvtsi2sdq(i64 %a0, i64 *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_cvtsi2sdq:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    cvtsi2sdq %rdi, %xmm1 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    cvtsi2sdq %rdi, %xmm1 # sched: [13:1.00]
 ; BDVER2-SSE-NEXT:    cvtsi2sdq (%rsi), %xmm0 # sched: [9:1.00]
-; BDVER2-SSE-NEXT:    addsd %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    addsd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_cvtsi2sdq:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    vcvtsi2sdq %rdi, %xmm0, %xmm0 # sched: [4:1.00]
 ; BDVER2-NEXT:    vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [9:1.00]
-; BDVER2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_cvtsi2sdq:
 ; BTVER2-SSE:       # %bb.0:
@@ -2890,19 +2890,19 @@ define double @test_cvtss2sd(float %a0, float *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_cvtss2sd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    cvtss2sd %xmm0, %xmm1 # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [6:0.50]
-; BDVER2-SSE-NEXT:    cvtss2sd %xmm0, %xmm0 # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    addsd %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    cvtss2sd %xmm0, %xmm1 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50]
+; BDVER2-SSE-NEXT:    cvtss2sd %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    addsd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_cvtss2sd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [1:1.00]
-; BDVER2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50]
-; BDVER2-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [1:1.00]
-; BDVER2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
+; BDVER2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [4:1.00]
+; BDVER2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_cvtss2sd:
 ; BTVER2-SSE:       # %bb.0:
@@ -3038,17 +3038,17 @@ define <4 x i32> @test_cvttpd2dq(<2 x double> %a0, <2 x double> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_cvttpd2dq:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    cvttpd2dq %xmm0, %xmm1 # sched: [4:1.00]
-; BDVER2-SSE-NEXT:    cvttpd2dq (%rdi), %xmm0 # sched: [10:1.00]
-; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    cvttpd2dq %xmm0, %xmm1 # sched: [8:1.00]
+; BDVER2-SSE-NEXT:    cvttpd2dq (%rdi), %xmm0 # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_cvttpd2dq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vcvttpd2dq %xmm0, %xmm0 # sched: [4:1.00]
-; BDVER2-NEXT:    vcvttpd2dqx (%rdi), %xmm1 # sched: [10:1.00]
-; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vcvttpd2dqx (%rdi), %xmm1 # sched: [13:1.00]
+; BDVER2-NEXT:    vcvttpd2dq %xmm0, %xmm0 # sched: [8:1.00]
+; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_cvttpd2dq:
 ; BTVER2-SSE:       # %bb.0:
@@ -3181,17 +3181,17 @@ define <4 x i32> @test_cvttps2dq(<4 x float> %a0, <4 x float> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_cvttps2dq:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    cvttps2dq %xmm0, %xmm1 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    cvttps2dq %xmm0, %xmm1 # sched: [4:1.00]
 ; BDVER2-SSE-NEXT:    cvttps2dq (%rdi), %xmm0 # sched: [9:1.00]
-; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_cvttps2dq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vcvttps2dq %xmm0, %xmm0 # sched: [3:1.00]
 ; BDVER2-NEXT:    vcvttps2dq (%rdi), %xmm1 # sched: [9:1.00]
-; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vcvttps2dq %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_cvttps2dq:
 ; BTVER2-SSE:       # %bb.0:
@@ -3321,17 +3321,17 @@ define i32 @test_cvttsd2si(double %a0, double *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_cvttsd2si:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    cvttsd2si %xmm0, %ecx # sched: [5:1.00]
-; BDVER2-SSE-NEXT:    cvttsd2si (%rdi), %eax # sched: [9:1.00]
-; BDVER2-SSE-NEXT:    addl %ecx, %eax # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    cvttsd2si (%rdi), %eax # sched: [18:1.00]
+; BDVER2-SSE-NEXT:    cvttsd2si %xmm0, %ecx # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_cvttsd2si:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vcvttsd2si %xmm0, %ecx # sched: [5:1.00]
-; BDVER2-NEXT:    vcvttsd2si (%rdi), %eax # sched: [10:1.00]
-; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vcvttsd2si (%rdi), %eax # sched: [18:1.00]
+; BDVER2-NEXT:    vcvttsd2si %xmm0, %ecx # sched: [13:1.00]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_cvttsd2si:
 ; BTVER2-SSE:       # %bb.0:
@@ -3461,17 +3461,17 @@ define i64 @test_cvttsd2siq(double %a0, double *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_cvttsd2siq:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    cvttsd2si %xmm0, %rcx # sched: [5:1.00]
-; BDVER2-SSE-NEXT:    cvttsd2si (%rdi), %rax # sched: [9:1.00]
-; BDVER2-SSE-NEXT:    addq %rcx, %rax # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    cvttsd2si (%rdi), %rax # sched: [18:1.00]
+; BDVER2-SSE-NEXT:    cvttsd2si %xmm0, %rcx # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_cvttsd2siq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vcvttsd2si %xmm0, %rcx # sched: [5:1.00]
-; BDVER2-NEXT:    vcvttsd2si (%rdi), %rax # sched: [10:1.00]
-; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vcvttsd2si (%rdi), %rax # sched: [18:1.00]
+; BDVER2-NEXT:    vcvttsd2si %xmm0, %rcx # sched: [13:1.00]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_cvttsd2siq:
 ; BTVER2-SSE:       # %bb.0:
@@ -3588,15 +3588,15 @@ define <2 x double> @test_divpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ;
 ; BDVER2-SSE-LABEL: test_divpd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    divpd %xmm1, %xmm0 # sched: [22:22.00]
-; BDVER2-SSE-NEXT:    divpd (%rdi), %xmm0 # sched: [28:22.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    divpd %xmm1, %xmm0 # sched: [9:9.50]
+; BDVER2-SSE-NEXT:    divpd (%rdi), %xmm0 # sched: [14:9.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_divpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vdivpd %xmm1, %xmm0, %xmm0 # sched: [22:22.00]
-; BDVER2-NEXT:    vdivpd (%rdi), %xmm0, %xmm0 # sched: [28:22.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vdivpd %xmm1, %xmm0, %xmm0 # sched: [9:9.50]
+; BDVER2-NEXT:    vdivpd (%rdi), %xmm0, %xmm0 # sched: [14:9.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_divpd:
 ; BTVER2-SSE:       # %bb.0:
@@ -3708,15 +3708,15 @@ define double @test_divsd(double %a0, double %a1, double *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_divsd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    divsd %xmm1, %xmm0 # sched: [22:22.00]
-; BDVER2-SSE-NEXT:    divsd (%rdi), %xmm0 # sched: [28:22.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    divsd %xmm1, %xmm0 # sched: [9:9.50]
+; BDVER2-SSE-NEXT:    divsd (%rdi), %xmm0 # sched: [14:9.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_divsd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vdivsd %xmm1, %xmm0, %xmm0 # sched: [22:22.00]
-; BDVER2-NEXT:    vdivsd (%rdi), %xmm0, %xmm0 # sched: [28:22.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vdivsd %xmm1, %xmm0, %xmm0 # sched: [9:9.50]
+; BDVER2-NEXT:    vdivsd (%rdi), %xmm0, %xmm0 # sched: [14:9.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_divsd:
 ; BTVER2-SSE:       # %bb.0:
@@ -3821,13 +3821,13 @@ define void @test_lfence() {
 ;
 ; BDVER2-SSE-LABEL: test_lfence:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    lfence # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    lfence # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_lfence:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    lfence # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    lfence # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_lfence:
 ; BTVER2-SSE:       # %bb.0:
@@ -3927,13 +3927,13 @@ define void @test_mfence() {
 ;
 ; BDVER2-SSE-LABEL: test_mfence:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    mfence # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    mfence # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_mfence:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    mfence # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    mfence # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_mfence:
 ; BTVER2-SSE:       # %bb.0:
@@ -4032,12 +4032,12 @@ define void @test_maskmovdqu(<16 x i8> %a0, <16 x i8> %a1, i8* %a2) {
 ; BDVER2-SSE-LABEL: test_maskmovdqu:
 ; BDVER2-SSE:       # %bb.0:
 ; BDVER2-SSE-NEXT:    maskmovdqu %xmm1, %xmm0 # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_maskmovdqu:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    vmaskmovdqu %xmm1, %xmm0 # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_maskmovdqu:
 ; BTVER2-SSE:       # %bb.0:
@@ -4144,15 +4144,15 @@ define <2 x double> @test_maxpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ;
 ; BDVER2-SSE-LABEL: test_maxpd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    maxpd %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    maxpd (%rdi), %xmm0 # sched: [9:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    maxpd %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    maxpd (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_maxpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vmaxpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vmaxpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_maxpd:
 ; BTVER2-SSE:       # %bb.0:
@@ -4265,15 +4265,15 @@ define <2 x double> @test_maxsd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ;
 ; BDVER2-SSE-LABEL: test_maxsd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    maxsd %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    maxsd (%rdi), %xmm0 # sched: [9:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    maxsd %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    maxsd (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_maxsd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vmaxsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vmaxsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_maxsd:
 ; BTVER2-SSE:       # %bb.0:
@@ -4386,15 +4386,15 @@ define <2 x double> @test_minpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ;
 ; BDVER2-SSE-LABEL: test_minpd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    minpd %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    minpd (%rdi), %xmm0 # sched: [9:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    minpd %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    minpd (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_minpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vminpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vminpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vminpd %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vminpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_minpd:
 ; BTVER2-SSE:       # %bb.0:
@@ -4507,15 +4507,15 @@ define <2 x double> @test_minsd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ;
 ; BDVER2-SSE-LABEL: test_minsd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    minsd %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    minsd (%rdi), %xmm0 # sched: [9:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    minsd %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    minsd (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_minsd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vminsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vminsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vminsd %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vminsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_minsd:
 ; BTVER2-SSE:       # %bb.0:
@@ -4641,17 +4641,17 @@ define void @test_movapd(<2 x double> *%a0, <2 x double> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_movapd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movapd (%rdi), %xmm0 # sched: [6:0.50]
-; BDVER2-SSE-NEXT:    addpd %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    movapd (%rdi), %xmm0 # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    addpd %xmm0, %xmm0 # sched: [5:1.00]
 ; BDVER2-SSE-NEXT:    movapd %xmm0, (%rsi) # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_movapd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovapd (%rdi), %xmm0 # sched: [6:0.50]
-; BDVER2-NEXT:    vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vmovapd (%rdi), %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vaddpd %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
 ; BDVER2-NEXT:    vmovapd %xmm0, (%rsi) # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_movapd:
 ; BTVER2-SSE:       # %bb.0:
@@ -4780,17 +4780,17 @@ define void @test_movdqa(<2 x i64> *%a0, <2 x i64> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_movdqa:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movdqa (%rdi), %xmm0 # sched: [6:0.50]
-; BDVER2-SSE-NEXT:    paddq %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    movdqa (%rdi), %xmm0 # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    paddq %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    movdqa %xmm0, (%rsi) # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_movdqa:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [6:0.50]
-; BDVER2-NEXT:    vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vpaddq %xmm0, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vmovdqa %xmm0, (%rsi) # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_movdqa:
 ; BTVER2-SSE:       # %bb.0:
@@ -4919,17 +4919,17 @@ define void @test_movdqu(<2 x i64> *%a0, <2 x i64> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_movdqu:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movdqu (%rdi), %xmm0 # sched: [6:0.50]
-; BDVER2-SSE-NEXT:    paddq %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    movdqu (%rdi), %xmm0 # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    paddq %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    movdqu %xmm0, (%rsi) # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_movdqu:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovdqu (%rdi), %xmm0 # sched: [6:0.50]
-; BDVER2-NEXT:    vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vmovdqu (%rdi), %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vpaddq %xmm0, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vmovdqu %xmm0, (%rsi) # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_movdqu:
 ; BTVER2-SSE:       # %bb.0:
@@ -5097,23 +5097,23 @@ define i32 @test_movd(<4 x i32> %a0, i32 %a1, i32 *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_movd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movd %edi, %xmm1 # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
-; BDVER2-SSE-NEXT:    paddd %xmm0, %xmm1 # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    paddd %xmm0, %xmm2 # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    movd %xmm2, %eax # sched: [2:1.00]
-; BDVER2-SSE-NEXT:    movd %xmm1, (%rsi) # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
+; BDVER2-SSE-NEXT:    movd %edi, %xmm1 # sched: [10:0.50]
+; BDVER2-SSE-NEXT:    paddd %xmm0, %xmm2 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    movd %xmm2, %eax # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    paddd %xmm0, %xmm1 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    movd %xmm1, (%rsi) # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_movd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovd %edi, %xmm1 # sched: [1:1.00]
-; BDVER2-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
-; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; BDVER2-NEXT:    vpaddd %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER2-NEXT:    vmovd %xmm0, %eax # sched: [2:1.00]
-; BDVER2-NEXT:    vmovd %xmm1, (%rsi) # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmovd %edi, %xmm1 # sched: [10:0.50]
+; BDVER2-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
+; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm1 # sched: [2:0.50]
+; BDVER2-NEXT:    vpaddd %xmm2, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vmovd %xmm0, %eax # sched: [10:1.00]
+; BDVER2-NEXT:    vmovd %xmm1, (%rsi) # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_movd:
 ; BTVER2-SSE:       # %bb.0:
@@ -5298,23 +5298,23 @@ define i64 @test_movd_64(<2 x i64> %a0, i64 %a1, i64 *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_movd_64:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movq %rdi, %xmm1 # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero sched: [6:0.50]
-; BDVER2-SSE-NEXT:    paddq %xmm0, %xmm1 # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    paddq %xmm0, %xmm2 # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    movq %xmm2, %rax # sched: [2:1.00]
-; BDVER2-SSE-NEXT:    movq %xmm1, (%rsi) # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero sched: [5:0.50]
+; BDVER2-SSE-NEXT:    movq %rdi, %xmm1 # sched: [10:0.50]
+; BDVER2-SSE-NEXT:    paddq %xmm0, %xmm2 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    movq %xmm2, %rax # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    paddq %xmm0, %xmm1 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    movq %xmm1, (%rsi) # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_movd_64:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovq %rdi, %xmm1 # sched: [1:1.00]
-; BDVER2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero sched: [6:0.50]
-; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; BDVER2-NEXT:    vpaddq %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER2-NEXT:    vmovq %xmm0, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    vmovq %xmm1, (%rsi) # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmovq %rdi, %xmm1 # sched: [10:0.50]
+; BDVER2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero sched: [5:0.50]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm1 # sched: [2:0.50]
+; BDVER2-NEXT:    vpaddq %xmm2, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vmovq %xmm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    vmovq %xmm1, (%rsi) # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_movd_64:
 ; BTVER2-SSE:       # %bb.0:
@@ -5474,19 +5474,19 @@ define <2 x double> @test_movhpd(<2 x double> %a0, <2 x double> %a1, x86_mmx *%a
 ;
 ; BDVER2-SSE-LABEL: test_movhpd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
-; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    movhpd %xmm0, (%rdi) # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    movapd %xmm1, %xmm0 # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    movhpd %xmm0, (%rdi) # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    movapd %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_movhpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
-; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vmovhpd %xmm0, (%rdi) # sched: [1:1.00]
-; BDVER2-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:0.50]
+; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vmovhpd %xmm0, (%rdi) # sched: [2:1.00]
+; BDVER2-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_movhpd:
 ; BTVER2-SSE:       # %bb.0:
@@ -5636,19 +5636,19 @@ define <2 x double> @test_movlpd(<2 x double> %a0, <2 x double> %a1, x86_mmx *%a
 ;
 ; BDVER2-SSE-LABEL: test_movlpd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [7:1.00]
-; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    movlpd %xmm0, (%rdi) # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    movapd %xmm1, %xmm0 # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    movlpd %xmm0, (%rdi) # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    movapd %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_movlpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [7:1.00]
-; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vmovlpd %xmm0, (%rdi) # sched: [1:1.00]
-; BDVER2-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [7:0.50]
+; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vmovlpd %xmm0, (%rdi) # sched: [2:1.00]
+; BDVER2-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_movlpd:
 ; BTVER2-SSE:       # %bb.0:
@@ -5760,13 +5760,13 @@ define i32 @test_movmskpd(<2 x double> %a0) {
 ;
 ; BDVER2-SSE-LABEL: test_movmskpd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movmskpd %xmm0, %eax # sched: [2:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movmskpd %xmm0, %eax # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_movmskpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovmskpd %xmm0, %eax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmovmskpd %xmm0, %eax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_movmskpd:
 ; BTVER2-SSE:       # %bb.0:
@@ -5875,15 +5875,15 @@ define void @test_movntdqa(<2 x i64> %a0, <2 x i64> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_movntdqa:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    paddq %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    movntdq %xmm0, (%rdi) # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    paddq %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    movntdq %xmm0, (%rdi) # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_movntdqa:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER2-NEXT:    vmovntdq %xmm0, (%rdi) # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpaddq %xmm0, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vmovntdq %xmm0, (%rdi) # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_movntdqa:
 ; BTVER2-SSE:       # %bb.0:
@@ -5994,15 +5994,15 @@ define void @test_movntpd(<2 x double> %a0, <2 x double> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_movntpd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    addpd %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    movntpd %xmm0, (%rdi) # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    addpd %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    movntpd %xmm0, (%rdi) # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_movntpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vmovntpd %xmm0, (%rdi) # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vaddpd %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vmovntpd %xmm0, (%rdi) # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_movntpd:
 ; BTVER2-SSE:       # %bb.0:
@@ -6126,17 +6126,17 @@ define <2 x i64> @test_movq_mem(<2 x i64> %a0, i64 *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_movq_mem:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero sched: [6:0.50]
-; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    movq %xmm0, (%rdi) # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero sched: [5:0.50]
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    movq %xmm0, (%rdi) # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_movq_mem:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero sched: [6:0.50]
-; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER2-NEXT:    vmovq %xmm0, (%rdi) # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero sched: [5:0.50]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vmovq %xmm0, (%rdi) # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_movq_mem:
 ; BTVER2-SSE:       # %bb.0:
@@ -6256,15 +6256,15 @@ define <2 x i64> @test_movq_reg(<2 x i64> %a0, <2 x i64> %a1) {
 ;
 ; BDVER2-SSE-LABEL: test_movq_reg:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.33]
-; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero sched: [2:0.50]
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_movq_reg:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.33]
-; BDVER2-NEXT:    vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero sched: [2:0.50]
+; BDVER2-NEXT:    vpaddq %xmm0, %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_movq_reg:
 ; BTVER2-SSE:       # %bb.0:
@@ -6388,17 +6388,17 @@ define void @test_movsd_mem(double* %a0, double* %a1) {
 ;
 ; BDVER2-SSE-LABEL: test_movsd_mem:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero sched: [6:0.50]
-; BDVER2-SSE-NEXT:    addsd %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    movsd %xmm0, (%rsi) # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero sched: [5:0.50]
+; BDVER2-SSE-NEXT:    addsd %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    movsd %xmm0, (%rsi) # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_movsd_mem:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [6:0.50]
-; BDVER2-NEXT:    vaddsd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vmovsd %xmm0, (%rsi) # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [5:0.50]
+; BDVER2-NEXT:    vaddsd %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vmovsd %xmm0, (%rsi) # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_movsd_mem:
 ; BTVER2-SSE:       # %bb.0:
@@ -6513,14 +6513,14 @@ define <2 x double> @test_movsd_reg(<2 x double> %a0, <2 x double> %a1) {
 ;
 ; BDVER2-SSE-LABEL: test_movsd_reg:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] sched: [1:1.00]
-; BDVER2-SSE-NEXT:    movaps %xmm1, %xmm0 # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    movaps %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_movsd_reg:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_movsd_reg:
 ; BTVER2-SSE:       # %bb.0:
@@ -6641,17 +6641,17 @@ define void @test_movupd(<2 x double> *%a0, <2 x double> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_movupd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movupd (%rdi), %xmm0 # sched: [6:0.50]
-; BDVER2-SSE-NEXT:    addpd %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    movupd (%rdi), %xmm0 # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    addpd %xmm0, %xmm0 # sched: [5:1.00]
 ; BDVER2-SSE-NEXT:    movupd %xmm0, (%rsi) # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_movupd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovupd (%rdi), %xmm0 # sched: [6:0.50]
-; BDVER2-NEXT:    vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
+; BDVER2-NEXT:    vmovupd (%rdi), %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vaddpd %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
 ; BDVER2-NEXT:    vmovupd %xmm0, (%rsi) # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_movupd:
 ; BTVER2-SSE:       # %bb.0:
@@ -6768,14 +6768,14 @@ define <2 x double> @test_mulpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; BDVER2-SSE-LABEL: test_mulpd:
 ; BDVER2-SSE:       # %bb.0:
 ; BDVER2-SSE-NEXT:    mulpd %xmm1, %xmm0 # sched: [5:1.00]
-; BDVER2-SSE-NEXT:    mulpd (%rdi), %xmm0 # sched: [11:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    mulpd (%rdi), %xmm0 # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_mulpd:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    vmulpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; BDVER2-NEXT:    vmulpd (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmulpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_mulpd:
 ; BTVER2-SSE:       # %bb.0:
@@ -6888,14 +6888,14 @@ define double @test_mulsd(double %a0, double %a1, double *%a2) {
 ; BDVER2-SSE-LABEL: test_mulsd:
 ; BDVER2-SSE:       # %bb.0:
 ; BDVER2-SSE-NEXT:    mulsd %xmm1, %xmm0 # sched: [5:1.00]
-; BDVER2-SSE-NEXT:    mulsd (%rdi), %xmm0 # sched: [11:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    mulsd (%rdi), %xmm0 # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_mulsd:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    vmulsd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; BDVER2-NEXT:    vmulsd (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmulsd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_mulsd:
 ; BTVER2-SSE:       # %bb.0:
@@ -7020,17 +7020,17 @@ define <2 x double> @test_orpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ;
 ; BDVER2-SSE-LABEL: test_orpd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    orpd %xmm1, %xmm0 # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    orpd (%rdi), %xmm0 # sched: [7:1.00]
-; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    orpd %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    orpd (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_orpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vorpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; BDVER2-NEXT:    vorpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; BDVER2-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vorpd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vorpd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_orpd:
 ; BTVER2-SSE:       # %bb.0:
@@ -7155,15 +7155,15 @@ define <8 x i16> @test_packssdw(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_packssdw:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    packssdw %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    packssdw %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    packssdw (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_packssdw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpackssdw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_packssdw:
 ; BTVER2-SSE:       # %bb.0:
@@ -7281,15 +7281,15 @@ define <16 x i8> @test_packsswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_packsswb:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    packsswb %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    packsswb %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    packsswb (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_packsswb:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpacksswb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_packsswb:
 ; BTVER2-SSE:       # %bb.0:
@@ -7407,15 +7407,15 @@ define <16 x i8> @test_packuswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_packuswb:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    packuswb %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    packuswb %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    packuswb (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_packuswb:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpackuswb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_packuswb:
 ; BTVER2-SSE:       # %bb.0:
@@ -7533,15 +7533,15 @@ define <16 x i8> @test_paddb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_paddb:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    paddb %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    paddb %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    paddb (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_paddb:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpaddb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_paddb:
 ; BTVER2-SSE:       # %bb.0:
@@ -7657,15 +7657,15 @@ define <4 x i32> @test_paddd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_paddd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    paddd (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_paddd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpaddd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_paddd:
 ; BTVER2-SSE:       # %bb.0:
@@ -7777,15 +7777,15 @@ define <2 x i64> @test_paddq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_paddq:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    paddq (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_paddq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpaddq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_paddq:
 ; BTVER2-SSE:       # %bb.0:
@@ -7901,15 +7901,15 @@ define <16 x i8> @test_paddsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_paddsb:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    paddsb %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    paddsb %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    paddsb (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_paddsb:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpaddsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpaddsb %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpaddsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_paddsb:
 ; BTVER2-SSE:       # %bb.0:
@@ -8026,15 +8026,15 @@ define <8 x i16> @test_paddsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_paddsw:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    paddsw %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    paddsw %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    paddsw (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_paddsw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpaddsw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpaddsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_paddsw:
 ; BTVER2-SSE:       # %bb.0:
@@ -8151,15 +8151,15 @@ define <16 x i8> @test_paddusb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_paddusb:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    paddusb %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    paddusb %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    paddusb (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_paddusb:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpaddusb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_paddusb:
 ; BTVER2-SSE:       # %bb.0:
@@ -8276,15 +8276,15 @@ define <8 x i16> @test_paddusw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_paddusw:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    paddusw %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    paddusw %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    paddusw (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_paddusw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpaddusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpaddusw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpaddusw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_paddusw:
 ; BTVER2-SSE:       # %bb.0:
@@ -8401,15 +8401,15 @@ define <8 x i16> @test_paddw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_paddw:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    paddw %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    paddw %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    paddw (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_paddw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpaddw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_paddw:
 ; BTVER2-SSE:       # %bb.0:
@@ -8534,17 +8534,17 @@ define <2 x i64> @test_pand(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pand:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pand %xmm1, %xmm0 # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    pand %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    pand (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pand:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; BDVER2-NEXT:    vpand %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpand (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pand:
 ; BTVER2-SSE:       # %bb.0:
@@ -8690,19 +8690,19 @@ define <2 x i64> @test_pandn(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pandn:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pandn %xmm1, %xmm0 # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    movdqa %xmm0, %xmm1 # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    pandn %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    movdqa %xmm0, %xmm1 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    pandn (%rdi), %xmm1 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    paddq %xmm0, %xmm1 # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    movdqa %xmm1, %xmm0 # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    paddq %xmm0, %xmm1 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    movdqa %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pandn:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpandn %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; BDVER2-NEXT:    vpandn %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpandn (%rdi), %xmm0, %xmm1 # sched: [7:0.50]
-; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pandn:
 ; BTVER2-SSE:       # %bb.0:
@@ -8829,15 +8829,15 @@ define <16 x i8> @test_pavgb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pavgb:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pavgb %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pavgb %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    pavgb (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pavgb:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpavgb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpavgb %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpavgb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pavgb:
 ; BTVER2-SSE:       # %bb.0:
@@ -8963,15 +8963,15 @@ define <8 x i16> @test_pavgw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pavgw:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pavgw %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pavgw %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    pavgw (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pavgw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpavgw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpavgw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpavgw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pavgw:
 ; BTVER2-SSE:       # %bb.0:
@@ -9108,17 +9108,17 @@ define <16 x i8> @test_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pcmpeqb:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pcmpeqb %xmm0, %xmm1 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pcmpeqb %xmm0, %xmm1 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    pcmpeqb (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    por %xmm1, %xmm0 # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    por %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pcmpeqb:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpcomeqb %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
+; BDVER2-NEXT:    vpcomeqb %xmm1, %xmm0, %xmm1 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpcomeqb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pcmpeqb:
 ; BTVER2-SSE:       # %bb.0:
@@ -9251,17 +9251,17 @@ define <4 x i32> @test_pcmpeqd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pcmpeqd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pcmpeqd %xmm0, %xmm1 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pcmpeqd %xmm0, %xmm1 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    pcmpeqd (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    por %xmm1, %xmm0 # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    por %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pcmpeqd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpcomeqd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
+; BDVER2-NEXT:    vpcomeqd %xmm1, %xmm0, %xmm1 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpcomeqd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pcmpeqd:
 ; BTVER2-SSE:       # %bb.0:
@@ -9394,17 +9394,17 @@ define <8 x i16> @test_pcmpeqw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pcmpeqw:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pcmpeqw %xmm0, %xmm1 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pcmpeqw %xmm0, %xmm1 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    pcmpeqw (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    por %xmm1, %xmm0 # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    por %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pcmpeqw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpcomeqw %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
+; BDVER2-NEXT:    vpcomeqw %xmm1, %xmm0, %xmm1 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpcomeqw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pcmpeqw:
 ; BTVER2-SSE:       # %bb.0:
@@ -9543,18 +9543,18 @@ define <16 x i8> @test_pcmpgtb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pcmpgtb:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movdqa %xmm0, %xmm2 # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    pcmpgtb %xmm1, %xmm2 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    movdqa %xmm0, %xmm2 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    pcmpgtb (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    por %xmm2, %xmm0 # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    pcmpgtb %xmm1, %xmm2 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    por %xmm2, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pcmpgtb:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpcomgtb %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
+; BDVER2-NEXT:    vpcomgtb %xmm1, %xmm0, %xmm1 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpcomgtb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pcmpgtb:
 ; BTVER2-SSE:       # %bb.0:
@@ -9695,18 +9695,18 @@ define <4 x i32> @test_pcmpgtd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pcmpgtd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movdqa %xmm0, %xmm2 # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    pcmpgtd %xmm1, %xmm2 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    movdqa %xmm0, %xmm2 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    pcmpeqd (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    por %xmm2, %xmm0 # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    pcmpgtd %xmm1, %xmm2 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    por %xmm2, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pcmpgtd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpcomgtd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
+; BDVER2-NEXT:    vpcomgtd %xmm1, %xmm0, %xmm1 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpcomeqd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pcmpgtd:
 ; BTVER2-SSE:       # %bb.0:
@@ -9847,18 +9847,18 @@ define <8 x i16> @test_pcmpgtw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pcmpgtw:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movdqa %xmm0, %xmm2 # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    pcmpgtw %xmm1, %xmm2 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    movdqa %xmm0, %xmm2 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    pcmpgtw (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    por %xmm2, %xmm0 # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    pcmpgtw %xmm1, %xmm2 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    por %xmm2, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pcmpgtw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpcomgtw %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
+; BDVER2-NEXT:    vpcomgtw %xmm1, %xmm0, %xmm1 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpcomgtw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pcmpgtw:
 ; BTVER2-SSE:       # %bb.0:
@@ -9978,15 +9978,15 @@ define i16 @test_pextrw(<8 x i16> %a0) {
 ;
 ; BDVER2-SSE-LABEL: test_pextrw:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pextrw $6, %xmm0, %eax # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    pextrw $6, %xmm0, %eax # sched: [13:1.00]
 ; BDVER2-SSE-NEXT:    # kill: def $ax killed $ax killed $eax
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pextrw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpextrw $6, %xmm0, %eax # sched: [3:1.00]
+; BDVER2-NEXT:    vpextrw $6, %xmm0, %eax # sched: [13:1.00]
 ; BDVER2-NEXT:    # kill: def $ax killed $ax killed $eax
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pextrw:
 ; BTVER2-SSE:       # %bb.0:
@@ -10100,15 +10100,15 @@ define <8 x i16> @test_pinsrw(<8 x i16> %a0, i16 %a1, i16 *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pinsrw:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pinsrw $1, %edi, %xmm0 # sched: [2:1.00]
-; BDVER2-SSE-NEXT:    pinsrw $3, (%rsi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    pinsrw $1, %edi, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pinsrw $3, (%rsi), %xmm0 # sched: [6:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pinsrw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [2:1.00]
-; BDVER2-NEXT:    vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [6:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pinsrw:
 ; BTVER2-SSE:       # %bb.0:
@@ -10220,15 +10220,15 @@ define <4 x i32> @test_pmaddwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pmaddwd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pmaddwd %xmm1, %xmm0 # sched: [5:1.00]
-; BDVER2-SSE-NEXT:    pmaddwd (%rdi), %xmm0 # sched: [11:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    pmaddwd %xmm1, %xmm0 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    pmaddwd (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pmaddwd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; BDVER2-NEXT:    vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pmaddwd:
 ; BTVER2-SSE:       # %bb.0:
@@ -10346,15 +10346,15 @@ define <8 x i16> @test_pmaxsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pmaxsw:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pmaxsw %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pmaxsw %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    pmaxsw (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pmaxsw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pmaxsw:
 ; BTVER2-SSE:       # %bb.0:
@@ -10471,15 +10471,15 @@ define <16 x i8> @test_pmaxub(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pmaxub:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pmaxub %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pmaxub %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    pmaxub (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pmaxub:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpmaxub (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pmaxub:
 ; BTVER2-SSE:       # %bb.0:
@@ -10596,15 +10596,15 @@ define <8 x i16> @test_pminsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pminsw:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pminsw %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pminsw %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    pminsw (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pminsw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpminsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpminsw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpminsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pminsw:
 ; BTVER2-SSE:       # %bb.0:
@@ -10721,15 +10721,15 @@ define <16 x i8> @test_pminub(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pminub:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pminub %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pminub %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    pminub (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pminub:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpminub %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpminub %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpminub (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pminub:
 ; BTVER2-SSE:       # %bb.0:
@@ -10831,13 +10831,13 @@ define i32 @test_pmovmskb(<16 x i8> %a0) {
 ;
 ; BDVER2-SSE-LABEL: test_pmovmskb:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pmovmskb %xmm0, %eax # sched: [2:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    pmovmskb %xmm0, %eax # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pmovmskb:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpmovmskb %xmm0, %eax # sched: [2:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpmovmskb %xmm0, %eax # sched: [13:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pmovmskb:
 ; BTVER2-SSE:       # %bb.0:
@@ -10944,15 +10944,15 @@ define <8 x i16> @test_pmulhuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pmulhuw:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pmulhuw %xmm1, %xmm0 # sched: [5:1.00]
-; BDVER2-SSE-NEXT:    pmulhuw (%rdi), %xmm0 # sched: [11:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    pmulhuw %xmm1, %xmm0 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    pmulhuw (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pmulhuw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; BDVER2-NEXT:    vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pmulhuw:
 ; BTVER2-SSE:       # %bb.0:
@@ -11065,15 +11065,15 @@ define <8 x i16> @test_pmulhw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pmulhw:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pmulhw %xmm1, %xmm0 # sched: [5:1.00]
-; BDVER2-SSE-NEXT:    pmulhw (%rdi), %xmm0 # sched: [11:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    pmulhw %xmm1, %xmm0 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    pmulhw (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pmulhw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; BDVER2-NEXT:    vpmulhw (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vpmulhw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pmulhw:
 ; BTVER2-SSE:       # %bb.0:
@@ -11186,15 +11186,15 @@ define <8 x i16> @test_pmullw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pmullw:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pmullw %xmm1, %xmm0 # sched: [5:1.00]
-; BDVER2-SSE-NEXT:    pmullw (%rdi), %xmm0 # sched: [11:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    pmullw %xmm1, %xmm0 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    pmullw (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pmullw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; BDVER2-NEXT:    vpmullw (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vpmullw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pmullw:
 ; BTVER2-SSE:       # %bb.0:
@@ -11306,15 +11306,15 @@ define <2 x i64> @test_pmuludq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pmuludq:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pmuludq %xmm1, %xmm0 # sched: [5:1.00]
-; BDVER2-SSE-NEXT:    pmuludq (%rdi), %xmm0 # sched: [11:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    pmuludq %xmm1, %xmm0 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    pmuludq (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pmuludq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; BDVER2-NEXT:    vpmuludq (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vpmuludq (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pmuludq:
 ; BTVER2-SSE:       # %bb.0:
@@ -11441,17 +11441,17 @@ define <2 x i64> @test_por(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_por:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    por %xmm1, %xmm0 # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    por %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    por (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_por:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; BDVER2-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpor (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_por:
 ; BTVER2-SSE:       # %bb.0:
@@ -11568,15 +11568,15 @@ define <2 x i64> @test_psadbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_psadbw:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    psadbw %xmm1, %xmm0 # sched: [5:1.00]
-; BDVER2-SSE-NEXT:    psadbw (%rdi), %xmm0 # sched: [11:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    psadbw %xmm1, %xmm0 # sched: [4:0.50]
+; BDVER2-SSE-NEXT:    psadbw (%rdi), %xmm0 # sched: [9:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_psadbw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; BDVER2-NEXT:    vpsadbw (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; BDVER2-NEXT:    vpsadbw (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_psadbw:
 ; BTVER2-SSE:       # %bb.0:
@@ -11706,17 +11706,17 @@ define <4 x i32> @test_pshufd(<4 x i32> %a0, <4 x i32> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_pshufd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[3,2,1,0] sched: [7:0.50]
-; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pshufd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [1:0.50]
 ; BDVER2-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [7:0.50]
-; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [2:0.50]
+; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pshufd:
 ; BTVER2-SSE:       # %bb.0:
@@ -11849,17 +11849,17 @@ define <8 x i16> @test_pshufhw(<8 x i16> %a0, <8 x i16> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_pshufhw:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,5,4,7,6] sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,5,4,7,6] sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    pshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,6,5,4] sched: [7:0.50]
-; BDVER2-SSE-NEXT:    paddw %xmm1, %xmm0 # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    paddw %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pshufhw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:0.50]
 ; BDVER2-NEXT:    vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [7:0.50]
-; BDVER2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [2:0.50]
+; BDVER2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pshufhw:
 ; BTVER2-SSE:       # %bb.0:
@@ -11992,17 +11992,17 @@ define <8 x i16> @test_pshuflw(<8 x i16> %a0, <8 x i16> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_pshuflw:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[1,0,3,2,4,5,6,7] sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[1,0,3,2,4,5,6,7] sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = mem[3,2,1,0,4,5,6,7] sched: [7:0.50]
-; BDVER2-SSE-NEXT:    paddw %xmm1, %xmm0 # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    paddw %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pshuflw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [1:0.50]
 ; BDVER2-NEXT:    vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [7:0.50]
-; BDVER2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [2:0.50]
+; BDVER2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pshuflw:
 ; BTVER2-SSE:       # %bb.0:
@@ -12132,17 +12132,17 @@ define <4 x i32> @test_pslld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pslld:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pslld %xmm1, %xmm0 # sched: [2:1.00]
-; BDVER2-SSE-NEXT:    pslld (%rdi), %xmm0 # sched: [8:1.00]
-; BDVER2-SSE-NEXT:    pslld $2, %xmm0 # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    pslld %xmm1, %xmm0 # sched: [3:0.50]
+; BDVER2-SSE-NEXT:    pslld (%rdi), %xmm0 # sched: [8:0.50]
+; BDVER2-SSE-NEXT:    pslld $2, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pslld:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpslld %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; BDVER2-NEXT:    vpslld (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
-; BDVER2-NEXT:    vpslld $2, %xmm0, %xmm0 # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpslld %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BDVER2-NEXT:    vpslld (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BDVER2-NEXT:    vpslld $2, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pslld:
 ; BTVER2-SSE:       # %bb.0:
@@ -12254,13 +12254,13 @@ define <4 x i32> @test_pslldq(<4 x i32> %a0) {
 ;
 ; BDVER2-SSE-LABEL: test_pslldq:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pslldq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pslldq:
 ; BTVER2-SSE:       # %bb.0:
@@ -12379,17 +12379,17 @@ define <2 x i64> @test_psllq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_psllq:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    psllq %xmm1, %xmm0 # sched: [2:1.00]
-; BDVER2-SSE-NEXT:    psllq (%rdi), %xmm0 # sched: [8:1.00]
-; BDVER2-SSE-NEXT:    psllq $2, %xmm0 # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    psllq %xmm1, %xmm0 # sched: [3:0.50]
+; BDVER2-SSE-NEXT:    psllq (%rdi), %xmm0 # sched: [8:0.50]
+; BDVER2-SSE-NEXT:    psllq $2, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_psllq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpsllq %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; BDVER2-NEXT:    vpsllq (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
-; BDVER2-NEXT:    vpsllq $2, %xmm0, %xmm0 # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpsllq %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BDVER2-NEXT:    vpsllq (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BDVER2-NEXT:    vpsllq $2, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_psllq:
 ; BTVER2-SSE:       # %bb.0:
@@ -12521,17 +12521,17 @@ define <8 x i16> @test_psllw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_psllw:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    psllw %xmm1, %xmm0 # sched: [2:1.00]
-; BDVER2-SSE-NEXT:    psllw (%rdi), %xmm0 # sched: [8:1.00]
-; BDVER2-SSE-NEXT:    psllw $2, %xmm0 # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    psllw %xmm1, %xmm0 # sched: [3:0.50]
+; BDVER2-SSE-NEXT:    psllw (%rdi), %xmm0 # sched: [8:0.50]
+; BDVER2-SSE-NEXT:    psllw $2, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_psllw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpsllw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; BDVER2-NEXT:    vpsllw (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
-; BDVER2-NEXT:    vpsllw $2, %xmm0, %xmm0 # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpsllw %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BDVER2-NEXT:    vpsllw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BDVER2-NEXT:    vpsllw $2, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_psllw:
 ; BTVER2-SSE:       # %bb.0:
@@ -12663,17 +12663,17 @@ define <4 x i32> @test_psrad(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_psrad:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    psrad %xmm1, %xmm0 # sched: [2:1.00]
-; BDVER2-SSE-NEXT:    psrad (%rdi), %xmm0 # sched: [8:1.00]
-; BDVER2-SSE-NEXT:    psrad $2, %xmm0 # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    psrad %xmm1, %xmm0 # sched: [3:0.50]
+; BDVER2-SSE-NEXT:    psrad (%rdi), %xmm0 # sched: [8:0.50]
+; BDVER2-SSE-NEXT:    psrad $2, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_psrad:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpsrad %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; BDVER2-NEXT:    vpsrad (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
-; BDVER2-NEXT:    vpsrad $2, %xmm0, %xmm0 # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpsrad %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BDVER2-NEXT:    vpsrad (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BDVER2-NEXT:    vpsrad $2, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_psrad:
 ; BTVER2-SSE:       # %bb.0:
@@ -12805,17 +12805,17 @@ define <8 x i16> @test_psraw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_psraw:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    psraw %xmm1, %xmm0 # sched: [2:1.00]
-; BDVER2-SSE-NEXT:    psraw (%rdi), %xmm0 # sched: [8:1.00]
-; BDVER2-SSE-NEXT:    psraw $2, %xmm0 # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    psraw %xmm1, %xmm0 # sched: [3:0.50]
+; BDVER2-SSE-NEXT:    psraw (%rdi), %xmm0 # sched: [8:0.50]
+; BDVER2-SSE-NEXT:    psraw $2, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_psraw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpsraw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; BDVER2-NEXT:    vpsraw (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
-; BDVER2-NEXT:    vpsraw $2, %xmm0, %xmm0 # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpsraw %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BDVER2-NEXT:    vpsraw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BDVER2-NEXT:    vpsraw $2, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_psraw:
 ; BTVER2-SSE:       # %bb.0:
@@ -12947,17 +12947,17 @@ define <4 x i32> @test_psrld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_psrld:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    psrld %xmm1, %xmm0 # sched: [2:1.00]
-; BDVER2-SSE-NEXT:    psrld (%rdi), %xmm0 # sched: [8:1.00]
-; BDVER2-SSE-NEXT:    psrld $2, %xmm0 # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    psrld %xmm1, %xmm0 # sched: [3:0.50]
+; BDVER2-SSE-NEXT:    psrld (%rdi), %xmm0 # sched: [8:0.50]
+; BDVER2-SSE-NEXT:    psrld $2, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_psrld:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpsrld %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; BDVER2-NEXT:    vpsrld (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
-; BDVER2-NEXT:    vpsrld $2, %xmm0, %xmm0 # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpsrld %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BDVER2-NEXT:    vpsrld (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BDVER2-NEXT:    vpsrld $2, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_psrld:
 ; BTVER2-SSE:       # %bb.0:
@@ -13069,13 +13069,13 @@ define <4 x i32> @test_psrldq(<4 x i32> %a0) {
 ;
 ; BDVER2-SSE-LABEL: test_psrldq:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_psrldq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_psrldq:
 ; BTVER2-SSE:       # %bb.0:
@@ -13194,17 +13194,17 @@ define <2 x i64> @test_psrlq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_psrlq:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    psrlq %xmm1, %xmm0 # sched: [2:1.00]
-; BDVER2-SSE-NEXT:    psrlq (%rdi), %xmm0 # sched: [8:1.00]
-; BDVER2-SSE-NEXT:    psrlq $2, %xmm0 # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    psrlq %xmm1, %xmm0 # sched: [3:0.50]
+; BDVER2-SSE-NEXT:    psrlq (%rdi), %xmm0 # sched: [8:0.50]
+; BDVER2-SSE-NEXT:    psrlq $2, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_psrlq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; BDVER2-NEXT:    vpsrlq (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
-; BDVER2-NEXT:    vpsrlq $2, %xmm0, %xmm0 # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BDVER2-NEXT:    vpsrlq (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BDVER2-NEXT:    vpsrlq $2, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_psrlq:
 ; BTVER2-SSE:       # %bb.0:
@@ -13336,17 +13336,17 @@ define <8 x i16> @test_psrlw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_psrlw:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    psrlw %xmm1, %xmm0 # sched: [2:1.00]
-; BDVER2-SSE-NEXT:    psrlw (%rdi), %xmm0 # sched: [8:1.00]
-; BDVER2-SSE-NEXT:    psrlw $2, %xmm0 # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    psrlw %xmm1, %xmm0 # sched: [3:0.50]
+; BDVER2-SSE-NEXT:    psrlw (%rdi), %xmm0 # sched: [8:0.50]
+; BDVER2-SSE-NEXT:    psrlw $2, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_psrlw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; BDVER2-NEXT:    vpsrlw (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
-; BDVER2-NEXT:    vpsrlw $2, %xmm0, %xmm0 # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BDVER2-NEXT:    vpsrlw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BDVER2-NEXT:    vpsrlw $2, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_psrlw:
 ; BTVER2-SSE:       # %bb.0:
@@ -13469,15 +13469,15 @@ define <16 x i8> @test_psubb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_psubb:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    psubb %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    psubb %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    psubb (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_psubb:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpsubb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_psubb:
 ; BTVER2-SSE:       # %bb.0:
@@ -13593,15 +13593,15 @@ define <4 x i32> @test_psubd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_psubd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    psubd %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    psubd %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    psubd (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_psubd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpsubd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_psubd:
 ; BTVER2-SSE:       # %bb.0:
@@ -13713,15 +13713,15 @@ define <2 x i64> @test_psubq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_psubq:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    psubq %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    psubq %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    psubq (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_psubq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpsubq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpsubq %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpsubq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_psubq:
 ; BTVER2-SSE:       # %bb.0:
@@ -13837,15 +13837,15 @@ define <16 x i8> @test_psubsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_psubsb:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    psubsb %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    psubsb %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    psubsb (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_psubsb:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpsubsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpsubsb %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpsubsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_psubsb:
 ; BTVER2-SSE:       # %bb.0:
@@ -13962,15 +13962,15 @@ define <8 x i16> @test_psubsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_psubsw:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    psubsw %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    psubsw %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    psubsw (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_psubsw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpsubsw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpsubsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_psubsw:
 ; BTVER2-SSE:       # %bb.0:
@@ -14087,15 +14087,15 @@ define <16 x i8> @test_psubusb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_psubusb:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    psubusb %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    psubusb %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    psubusb (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_psubusb:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpsubusb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_psubusb:
 ; BTVER2-SSE:       # %bb.0:
@@ -14212,15 +14212,15 @@ define <8 x i16> @test_psubusw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_psubusw:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    psubusw %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    psubusw %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    psubusw (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_psubusw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpsubusw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_psubusw:
 ; BTVER2-SSE:       # %bb.0:
@@ -14337,15 +14337,15 @@ define <8 x i16> @test_psubw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_psubw:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    psubw %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    psubw %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    psubw (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_psubw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpsubw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpsubw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_psubw:
 ; BTVER2-SSE:       # %bb.0:
@@ -14461,15 +14461,15 @@ define <16 x i8> @test_punpckhbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_punpckhbw:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:0.50]
+; BDVER2-SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_punpckhbw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:0.50]
+; BDVER2-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [2:0.50]
 ; BDVER2-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_punpckhbw:
 ; BTVER2-SSE:       # %bb.0:
@@ -14596,17 +14596,17 @@ define <4 x i32> @test_punpckhdq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_punpckhdq:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50]
+; BDVER2-SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [7:0.50]
-; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_punpckhdq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50]
+; BDVER2-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [2:0.50]
 ; BDVER2-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [7:0.50]
-; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_punpckhdq:
 ; BTVER2-SSE:       # %bb.0:
@@ -14736,17 +14736,17 @@ define <2 x i64> @test_punpckhqdq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2)
 ;
 ; BDVER2-SSE-LABEL: test_punpckhqdq:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:0.50]
+; BDVER2-SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [7:0.50]
-; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_punpckhqdq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:0.50]
+; BDVER2-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [2:0.50]
 ; BDVER2-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [7:0.50]
-; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_punpckhqdq:
 ; BTVER2-SSE:       # %bb.0:
@@ -14867,15 +14867,15 @@ define <8 x i16> @test_punpckhwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_punpckhwd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50]
+; BDVER2-SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_punpckhwd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50]
+; BDVER2-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [2:0.50]
 ; BDVER2-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_punpckhwd:
 ; BTVER2-SSE:       # %bb.0:
@@ -14991,15 +14991,15 @@ define <16 x i8> @test_punpcklbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_punpcklbw:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50]
+; BDVER2-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_punpcklbw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50]
+; BDVER2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [2:0.50]
 ; BDVER2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_punpcklbw:
 ; BTVER2-SSE:       # %bb.0:
@@ -15126,17 +15126,17 @@ define <4 x i32> @test_punpckldq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_punpckldq:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:0.50]
+; BDVER2-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [7:0.50]
-; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_punpckldq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:0.50]
+; BDVER2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [2:0.50]
 ; BDVER2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [7:0.50]
-; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_punpckldq:
 ; BTVER2-SSE:       # %bb.0:
@@ -15266,17 +15266,17 @@ define <2 x i64> @test_punpcklqdq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2)
 ;
 ; BDVER2-SSE-LABEL: test_punpcklqdq:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50]
+; BDVER2-SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:0.50]
-; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_punpcklqdq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50]
+; BDVER2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [2:0.50]
 ; BDVER2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:0.50]
-; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_punpcklqdq:
 ; BTVER2-SSE:       # %bb.0:
@@ -15397,15 +15397,15 @@ define <8 x i16> @test_punpcklwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_punpcklwd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50]
+; BDVER2-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_punpcklwd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50]
+; BDVER2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [2:0.50]
 ; BDVER2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_punpcklwd:
 ; BTVER2-SSE:       # %bb.0:
@@ -15530,17 +15530,17 @@ define <2 x i64> @test_pxor(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pxor:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pxor %xmm1, %xmm0 # sched: [1:0.33]
+; BDVER2-SSE-NEXT:    pxor %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    pxor (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pxor:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpxor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; BDVER2-NEXT:    vpxor %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpxor (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pxor:
 ; BTVER2-SSE:       # %bb.0:
@@ -15670,17 +15670,17 @@ define <2 x double> @test_shufpd(<2 x double> %a0, <2 x double> %a1, <2 x double
 ;
 ; BDVER2-SSE-LABEL: test_shufpd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [1:1.00]
-; BDVER2-SSE-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [7:1.00]
-; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_shufpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [1:1.00]
-; BDVER2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [7:1.00]
-; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [2:0.50]
+; BDVER2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [7:0.50]
+; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_shufpd:
 ; BTVER2-SSE:       # %bb.0:
@@ -15811,17 +15811,17 @@ define <2 x double> @test_sqrtpd(<2 x double> %a0, <2 x double> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_sqrtpd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    sqrtpd %xmm0, %xmm1 # sched: [21:21.00]
-; BDVER2-SSE-NEXT:    sqrtpd (%rdi), %xmm0 # sched: [27:21.00]
-; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    sqrtpd %xmm0, %xmm1 # sched: [9:13.50]
+; BDVER2-SSE-NEXT:    sqrtpd (%rdi), %xmm0 # sched: [14:13.50]
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_sqrtpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vsqrtpd %xmm0, %xmm0 # sched: [21:21.00]
-; BDVER2-NEXT:    vsqrtpd (%rdi), %xmm1 # sched: [27:21.00]
-; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vsqrtpd (%rdi), %xmm1 # sched: [14:13.50]
+; BDVER2-NEXT:    vsqrtpd %xmm0, %xmm0 # sched: [9:13.50]
+; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_sqrtpd:
 ; BTVER2-SSE:       # %bb.0:
@@ -15967,19 +15967,19 @@ define <2 x double> @test_sqrtsd(<2 x double> %a0, <2 x double> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_sqrtsd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    sqrtsd %xmm0, %xmm0 # sched: [21:21.00]
-; BDVER2-SSE-NEXT:    movapd (%rdi), %xmm1 # sched: [6:0.50]
-; BDVER2-SSE-NEXT:    sqrtsd %xmm1, %xmm1 # sched: [21:21.00]
-; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movapd (%rdi), %xmm1 # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    sqrtsd %xmm0, %xmm0 # sched: [9:13.50]
+; BDVER2-SSE-NEXT:    sqrtsd %xmm1, %xmm1 # sched: [9:13.50]
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_sqrtsd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [21:21.00]
-; BDVER2-NEXT:    vmovapd (%rdi), %xmm1 # sched: [6:0.50]
-; BDVER2-NEXT:    vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [21:21.00]
-; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmovapd (%rdi), %xmm1 # sched: [5:0.50]
+; BDVER2-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [9:13.50]
+; BDVER2-NEXT:    vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [9:13.50]
+; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_sqrtsd:
 ; BTVER2-SSE:       # %bb.0:
@@ -16101,15 +16101,15 @@ define <2 x double> @test_subpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ;
 ; BDVER2-SSE-LABEL: test_subpd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    subpd %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    subpd (%rdi), %xmm0 # sched: [9:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    subpd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    subpd (%rdi), %xmm0 # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_subpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vsubpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vsubpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vsubpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_subpd:
 ; BTVER2-SSE:       # %bb.0:
@@ -16221,15 +16221,15 @@ define double @test_subsd(double %a0, double %a1, double *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_subsd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    subsd %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    subsd (%rdi), %xmm0 # sched: [9:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    subsd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    subsd (%rdi), %xmm0 # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_subsd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vsubsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vsubsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vsubsd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vsubsd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_subsd:
 ; BTVER2-SSE:       # %bb.0:
@@ -16445,31 +16445,31 @@ define i32 @test_ucomisd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2)
 ;
 ; BDVER2-SSE-LABEL: test_ucomisd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    ucomisd %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    ucomisd %xmm1, %xmm0 # sched: [1:1.00]
 ; BDVER2-SSE-NEXT:    setnp %al # sched: [1:0.50]
 ; BDVER2-SSE-NEXT:    sete %cl # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    andb %al, %cl # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    ucomisd (%rdi), %xmm0 # sched: [8:1.00]
+; BDVER2-SSE-NEXT:    andb %al, %cl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    ucomisd (%rdi), %xmm0 # sched: [6:1.00]
 ; BDVER2-SSE-NEXT:    setnp %al # sched: [1:0.50]
 ; BDVER2-SSE-NEXT:    sete %dl # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    andb %al, %dl # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    orb %cl, %dl # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    movzbl %dl, %eax # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    andb %al, %dl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    orb %cl, %dl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    movzbl %dl, %eax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_ucomisd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vucomisd %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vucomisd %xmm1, %xmm0 # sched: [1:1.00]
 ; BDVER2-NEXT:    setnp %al # sched: [1:0.50]
 ; BDVER2-NEXT:    sete %cl # sched: [1:0.50]
-; BDVER2-NEXT:    andb %al, %cl # sched: [1:0.33]
-; BDVER2-NEXT:    vucomisd (%rdi), %xmm0 # sched: [8:1.00]
+; BDVER2-NEXT:    andb %al, %cl # sched: [1:0.50]
+; BDVER2-NEXT:    vucomisd (%rdi), %xmm0 # sched: [6:1.00]
 ; BDVER2-NEXT:    setnp %al # sched: [1:0.50]
 ; BDVER2-NEXT:    sete %dl # sched: [1:0.50]
-; BDVER2-NEXT:    andb %al, %dl # sched: [1:0.33]
-; BDVER2-NEXT:    orb %cl, %dl # sched: [1:0.33]
-; BDVER2-NEXT:    movzbl %dl, %eax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    andb %al, %dl # sched: [1:0.50]
+; BDVER2-NEXT:    orb %cl, %dl # sched: [1:0.50]
+; BDVER2-NEXT:    movzbl %dl, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_ucomisd:
 ; BTVER2-SSE:       # %bb.0:
@@ -16628,17 +16628,17 @@ define <2 x double> @test_unpckhpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
 ;
 ; BDVER2-SSE-LABEL: test_unpckhpd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
-; BDVER2-SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [7:1.00]
-; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_unpckhpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
-; BDVER2-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [7:1.00]
-; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [2:0.50]
+; BDVER2-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [7:0.50]
+; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_unpckhpd:
 ; BTVER2-SSE:       # %bb.0:
@@ -16776,18 +16776,18 @@ define <2 x double> @test_unpcklpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
 ;
 ; BDVER2-SSE-LABEL: test_unpcklpd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movapd %xmm0, %xmm2 # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] sched: [1:1.00]
-; BDVER2-SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [7:1.00]
-; BDVER2-SSE-NEXT:    addpd %xmm2, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movapd %xmm0, %xmm2 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    addpd %xmm2, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_unpcklpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0] sched: [1:1.00]
-; BDVER2-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [7:1.00]
-; BDVER2-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0] sched: [2:0.50]
+; BDVER2-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [7:0.50]
+; BDVER2-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_unpcklpd:
 ; BTVER2-SSE:       # %bb.0:
@@ -16919,17 +16919,17 @@ define <2 x double> @test_xorpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ;
 ; BDVER2-SSE-LABEL: test_xorpd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    xorpd %xmm1, %xmm0 # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    xorpd (%rdi), %xmm0 # sched: [7:1.00]
-; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    xorpd %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    xorpd (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_xorpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vxorpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; BDVER2-NEXT:    vxorpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; BDVER2-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vxorpd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vxorpd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_xorpd:
 ; BTVER2-SSE:       # %bb.0:
diff --git a/test/CodeGen/X86/sse3-schedule.ll b/test/CodeGen/X86/sse3-schedule.ll
index c9583a14292..1c3419a35ff 100644
--- a/test/CodeGen/X86/sse3-schedule.ll
+++ b/test/CodeGen/X86/sse3-schedule.ll
@@ -14,9 +14,9 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake -mattr=-avx2  | FileCheck %s --check-prefixes=CHECK,SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-ssse3 | FileCheck %s --check-prefixes=CHECK,SKX-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-avx2  | FileCheck %s --check-prefixes=CHECK,SKX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+sse3 -mattr=-ssse3 | FileCheck %s --check-prefixes=CHECK,BDVER2-SSE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+avx -mattr=+sse3 -mattr=-avx2  | FileCheck %s --check-prefixes=CHECK,BDVER2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=+sse3 -mattr=-ssse3 | FileCheck %s --check-prefixes=CHECK,BTVER2-SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=-ssse3 | FileCheck %s --check-prefixes=CHECK,BDVER2-SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=-avx2  | FileCheck %s --check-prefixes=CHECK,BDVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-ssse3 | FileCheck %s --check-prefixes=CHECK,BTVER2-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-avx2  | FileCheck %s --check-prefixes=CHECK,BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 -mattr=-ssse3 | FileCheck %s --check-prefixes=CHECK,ZNVER1-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 -mattr=-avx2  | FileCheck %s --check-prefixes=CHECK,ZNVER1
@@ -102,15 +102,15 @@ define <2 x double> @test_addsubpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
 ;
 ; BDVER2-SSE-LABEL: test_addsubpd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    addsubpd %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    addsubpd (%rdi), %xmm0 # sched: [9:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    addsubpd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    addsubpd (%rdi), %xmm0 # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_addsubpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vaddsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vaddsubpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_addsubpd:
 ; BTVER2-SSE:       # %bb.0:
@@ -223,15 +223,15 @@ define <4 x float> @test_addsubps(<4 x float> %a0, <4 x float> %a1, <4 x float>
 ;
 ; BDVER2-SSE-LABEL: test_addsubps:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    addsubps %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    addsubps (%rdi), %xmm0 # sched: [9:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    addsubps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    addsubps (%rdi), %xmm0 # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_addsubps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    vaddsubps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vaddsubps (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_addsubps:
 ; BTVER2-SSE:       # %bb.0:
@@ -344,15 +344,15 @@ define <2 x double> @test_haddpd(<2 x double> %a0, <2 x double> %a1, <2 x double
 ;
 ; BDVER2-SSE-LABEL: test_haddpd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    haddpd %xmm1, %xmm0 # sched: [5:2.00]
-; BDVER2-SSE-NEXT:    haddpd (%rdi), %xmm0 # sched: [11:2.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    haddpd %xmm1, %xmm0 # sched: [11:1.00]
+; BDVER2-SSE-NEXT:    haddpd (%rdi), %xmm0 # sched: [16:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_haddpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
-; BDVER2-NEXT:    vhaddpd (%rdi), %xmm0, %xmm0 # sched: [11:2.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER2-NEXT:    vhaddpd (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_haddpd:
 ; BTVER2-SSE:       # %bb.0:
@@ -465,15 +465,15 @@ define <4 x float> @test_haddps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%
 ;
 ; BDVER2-SSE-LABEL: test_haddps:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    haddps %xmm1, %xmm0 # sched: [5:2.00]
-; BDVER2-SSE-NEXT:    haddps (%rdi), %xmm0 # sched: [11:2.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    haddps %xmm1, %xmm0 # sched: [11:1.00]
+; BDVER2-SSE-NEXT:    haddps (%rdi), %xmm0 # sched: [16:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_haddps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vhaddps %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
-; BDVER2-NEXT:    vhaddps (%rdi), %xmm0, %xmm0 # sched: [11:2.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vhaddps %xmm1, %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER2-NEXT:    vhaddps (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_haddps:
 ; BTVER2-SSE:       # %bb.0:
@@ -586,15 +586,15 @@ define <2 x double> @test_hsubpd(<2 x double> %a0, <2 x double> %a1, <2 x double
 ;
 ; BDVER2-SSE-LABEL: test_hsubpd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    hsubpd %xmm1, %xmm0 # sched: [5:2.00]
-; BDVER2-SSE-NEXT:    hsubpd (%rdi), %xmm0 # sched: [11:2.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    hsubpd %xmm1, %xmm0 # sched: [11:1.00]
+; BDVER2-SSE-NEXT:    hsubpd (%rdi), %xmm0 # sched: [16:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_hsubpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vhsubpd %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
-; BDVER2-NEXT:    vhsubpd (%rdi), %xmm0, %xmm0 # sched: [11:2.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vhsubpd %xmm1, %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER2-NEXT:    vhsubpd (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_hsubpd:
 ; BTVER2-SSE:       # %bb.0:
@@ -707,15 +707,15 @@ define <4 x float> @test_hsubps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%
 ;
 ; BDVER2-SSE-LABEL: test_hsubps:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    hsubps %xmm1, %xmm0 # sched: [5:2.00]
-; BDVER2-SSE-NEXT:    hsubps (%rdi), %xmm0 # sched: [11:2.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    hsubps %xmm1, %xmm0 # sched: [11:1.00]
+; BDVER2-SSE-NEXT:    hsubps (%rdi), %xmm0 # sched: [16:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_hsubps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vhsubps %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
-; BDVER2-NEXT:    vhsubps (%rdi), %xmm0, %xmm0 # sched: [11:2.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vhsubps %xmm1, %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER2-NEXT:    vhsubps (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_hsubps:
 ; BTVER2-SSE:       # %bb.0:
@@ -817,13 +817,13 @@ define <16 x i8> @test_lddqu(i8* %a0) {
 ;
 ; BDVER2-SSE-LABEL: test_lddqu:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    lddqu (%rdi), %xmm0 # sched: [6:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    lddqu (%rdi), %xmm0 # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_lddqu:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vlddqu (%rdi), %xmm0 # sched: [6:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vlddqu (%rdi), %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_lddqu:
 ; BTVER2-SSE:       # %bb.0:
@@ -943,17 +943,17 @@ define void @test_monitor(i8* %a0, i32 %a1, i32 %a2) {
 ;
 ; BDVER2-SSE-LABEL: test_monitor:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movl %esi, %ecx # sched: [1:0.33]
 ; BDVER2-SSE-NEXT:    leaq (%rdi), %rax # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    monitor # sched: [100:0.33]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movl %esi, %ecx # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    monitor # sched: [100:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_monitor:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    movl %esi, %ecx # sched: [1:0.33]
 ; BDVER2-NEXT:    leaq (%rdi), %rax # sched: [1:0.50]
-; BDVER2-NEXT:    monitor # sched: [100:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    movl %esi, %ecx # sched: [1:0.50]
+; BDVER2-NEXT:    monitor # sched: [100:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_monitor:
 ; BTVER2-SSE:       # %bb.0:
@@ -1082,17 +1082,17 @@ define <2 x double> @test_movddup(<2 x double> %a0, <2 x double> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_movddup:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movddup {{.*#+}} xmm1 = xmm0[0,0] sched: [1:1.00]
-; BDVER2-SSE-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0] sched: [6:0.50]
-; BDVER2-SSE-NEXT:    subpd %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movddup {{.*#+}} xmm1 = xmm0[0,0] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    subpd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_movddup:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:1.00]
-; BDVER2-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [6:0.50]
-; BDVER2-NEXT:    vsubpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [7:0.50]
+; BDVER2-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [2:0.50]
+; BDVER2-NEXT:    vsubpd %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_movddup:
 ; BTVER2-SSE:       # %bb.0:
@@ -1223,17 +1223,17 @@ define <4 x float> @test_movshdup(<4 x float> %a0, <4 x float> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_movshdup:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] sched: [1:1.00]
-; BDVER2-SSE-NEXT:    movshdup {{.*#+}} xmm0 = mem[1,1,3,3] sched: [6:0.50]
-; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    movshdup {{.*#+}} xmm0 = mem[1,1,3,3] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_movshdup:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [1:1.00]
-; BDVER2-NEXT:    vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [6:0.50]
-; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [7:0.50]
+; BDVER2-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [2:0.50]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_movshdup:
 ; BTVER2-SSE:       # %bb.0:
@@ -1364,17 +1364,17 @@ define <4 x float> @test_movsldup(<4 x float> %a0, <4 x float> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_movsldup:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2] sched: [1:1.00]
-; BDVER2-SSE-NEXT:    movsldup {{.*#+}} xmm0 = mem[0,0,2,2] sched: [6:0.50]
-; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    movsldup {{.*#+}} xmm0 = mem[0,0,2,2] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_movsldup:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [1:1.00]
-; BDVER2-NEXT:    vmovsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [6:0.50]
-; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmovsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [7:0.50]
+; BDVER2-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [2:0.50]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_movsldup:
 ; BTVER2-SSE:       # %bb.0:
@@ -1504,17 +1504,17 @@ define void @test_mwait(i32 %a0, i32 %a1) {
 ;
 ; BDVER2-SSE-LABEL: test_mwait:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movl %esi, %eax # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    movl %edi, %ecx # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    mwait # sched: [100:0.33]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movl %esi, %eax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    movl %edi, %ecx # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    mwait # sched: [100:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_mwait:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    movl %esi, %eax # sched: [1:0.33]
-; BDVER2-NEXT:    movl %edi, %ecx # sched: [1:0.33]
-; BDVER2-NEXT:    mwait # sched: [100:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    movl %esi, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    movl %edi, %ecx # sched: [1:0.50]
+; BDVER2-NEXT:    mwait # sched: [100:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_mwait:
 ; BTVER2-SSE:       # %bb.0:
diff --git a/test/CodeGen/X86/sse41-schedule.ll b/test/CodeGen/X86/sse41-schedule.ll
index ace3d16ea71..160b780ac37 100644
--- a/test/CodeGen/X86/sse41-schedule.ll
+++ b/test/CodeGen/X86/sse41-schedule.ll
@@ -13,8 +13,8 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake -mattr=-avx2   | FileCheck %s --check-prefixes=CHECK,SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-sse4.2 | FileCheck %s --check-prefixes=CHECK,SKX-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-avx2   | FileCheck %s --check-prefixes=CHECK,SKX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+sse4.1 -mattr=-sse4.2 | FileCheck %s --check-prefixes=CHECK,BDVER2-SSE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+avx,+xop -mattr=+sse4.1 -mattr=-avx2   | FileCheck %s --check-prefixes=CHECK,BDVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=-sse4.2 | FileCheck %s --check-prefixes=CHECK,BDVER2-SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=-avx2   | FileCheck %s --check-prefixes=CHECK,BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-sse4.2 | FileCheck %s --check-prefixes=CHECK,BTVER2-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-avx2   | FileCheck %s --check-prefixes=CHECK,BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 -mattr=-sse4.2 | FileCheck %s --check-prefixes=CHECK,ZNVER1-SSE
@@ -107,17 +107,17 @@ define <2 x double> @test_blendpd(<2 x double> %a0, <2 x double> %a1, <2 x doubl
 ;
 ; BDVER2-SSE-LABEL: test_blendpd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    blendpd {{.*#+}} xmm1 = xmm0[0],xmm1[1] sched: [1:0.50]
+; BDVER2-SSE-NEXT:    blendpd {{.*#+}} xmm1 = xmm0[0],xmm1[1] sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [7:0.50]
-; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_blendpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vblendpd {{.*#+}} xmm1 = xmm0[0],xmm1[1] sched: [1:0.50]
+; BDVER2-NEXT:    vblendpd {{.*#+}} xmm1 = xmm0[0],xmm1[1] sched: [2:0.50]
 ; BDVER2-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [7:0.50]
-; BDVER2-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_blendpd:
 ; BTVER2-SSE:       # %bb.0:
@@ -240,17 +240,17 @@ define <4 x float> @test_blendps(<4 x float> %a0, <4 x float> %a1, <4 x float> *
 ;
 ; BDVER2-SSE-LABEL: test_blendps:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.50]
+; BDVER2-SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    blendps {{.*#+}} xmm1 = xmm1[0],mem[1],xmm1[2,3] sched: [7:0.50]
-; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_blendps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.50]
+; BDVER2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [2:0.50]
 ; BDVER2-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],mem[1],xmm1[2,3] sched: [7:0.50]
-; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_blendps:
 ; BTVER2-SSE:       # %bb.0:
@@ -382,18 +382,18 @@ define <2 x double> @test_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
 ;
 ; BDVER2-SSE-LABEL: test_blendvpd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movapd %xmm0, %xmm3 # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    movaps %xmm2, %xmm0 # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    blendvpd %xmm0, %xmm1, %xmm3 # sched: [2:1.00]
-; BDVER2-SSE-NEXT:    blendvpd %xmm0, (%rdi), %xmm3 # sched: [8:1.00]
-; BDVER2-SSE-NEXT:    movapd %xmm3, %xmm0 # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movapd %xmm0, %xmm3 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    movaps %xmm2, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    blendvpd %xmm0, %xmm1, %xmm3 # sched: [2:2.00]
+; BDVER2-SSE-NEXT:    blendvpd %xmm0, (%rdi), %xmm3 # sched: [7:2.00]
+; BDVER2-SSE-NEXT:    movapd %xmm3, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_blendvpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; BDVER2-NEXT:    vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00]
+; BDVER2-NEXT:    vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_blendvpd:
 ; BTVER2-SSE:       # %bb.0:
@@ -527,18 +527,18 @@ define <4 x float> @test_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float>
 ;
 ; BDVER2-SSE-LABEL: test_blendvps:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movaps %xmm0, %xmm3 # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    movaps %xmm2, %xmm0 # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    blendvps %xmm0, %xmm1, %xmm3 # sched: [2:1.00]
-; BDVER2-SSE-NEXT:    blendvps %xmm0, (%rdi), %xmm3 # sched: [8:1.00]
-; BDVER2-SSE-NEXT:    movaps %xmm3, %xmm0 # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movaps %xmm0, %xmm3 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    movaps %xmm2, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    blendvps %xmm0, %xmm1, %xmm3 # sched: [2:2.00]
+; BDVER2-SSE-NEXT:    blendvps %xmm0, (%rdi), %xmm3 # sched: [7:2.00]
+; BDVER2-SSE-NEXT:    movaps %xmm3, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_blendvps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; BDVER2-NEXT:    vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00]
+; BDVER2-NEXT:    vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_blendvps:
 ; BTVER2-SSE:       # %bb.0:
@@ -651,15 +651,15 @@ define <2 x double> @test_dppd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ;
 ; BDVER2-SSE-LABEL: test_dppd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    dppd $7, %xmm1, %xmm0 # sched: [9:1.00]
-; BDVER2-SSE-NEXT:    dppd $7, (%rdi), %xmm0 # sched: [15:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    dppd $7, %xmm1, %xmm0 # sched: [15:1.50]
+; BDVER2-SSE-NEXT:    dppd $7, (%rdi), %xmm0 # sched: [20:1.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_dppd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [9:1.00]
-; BDVER2-NEXT:    vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [15:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [15:1.50]
+; BDVER2-NEXT:    vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [20:1.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_dppd:
 ; BTVER2-SSE:       # %bb.0:
@@ -766,15 +766,15 @@ define <4 x float> @test_dpps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2
 ;
 ; BDVER2-SSE-LABEL: test_dpps:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    dpps $7, %xmm1, %xmm0 # sched: [12:2.00]
-; BDVER2-SSE-NEXT:    dpps $7, (%rdi), %xmm0 # sched: [18:2.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    dpps $7, %xmm1, %xmm0 # sched: [25:1.50]
+; BDVER2-SSE-NEXT:    dpps $7, (%rdi), %xmm0 # sched: [30:1.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_dpps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [12:2.00]
-; BDVER2-NEXT:    vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [18:2.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [25:1.50]
+; BDVER2-NEXT:    vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [30:1.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_dpps:
 ; BTVER2-SSE:       # %bb.0:
@@ -881,15 +881,15 @@ define i32 @test_extractps(<4 x float> %a0, i32 *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_extractps:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    extractps $3, %xmm0, %eax # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    extractps $1, %xmm0, (%rdi) # sched: [5:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    extractps $3, %xmm0, %eax # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    extractps $1, %xmm0, (%rdi) # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_extractps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vextractps $3, %xmm0, %eax # sched: [3:1.00]
-; BDVER2-NEXT:    vextractps $1, %xmm0, (%rdi) # sched: [5:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vextractps $3, %xmm0, %eax # sched: [13:1.00]
+; BDVER2-NEXT:    vextractps $1, %xmm0, (%rdi) # sched: [13:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_extractps:
 ; BTVER2-SSE:       # %bb.0:
@@ -997,15 +997,15 @@ define <4 x float> @test_insertps(<4 x float> %a0, <4 x float> %a1, float *%a2)
 ;
 ; BDVER2-SSE-LABEL: test_insertps:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:1.00]
-; BDVER2-SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [7:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_insertps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:1.00]
-; BDVER2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [7:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [2:0.50]
+; BDVER2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_insertps:
 ; BTVER2-SSE:       # %bb.0:
@@ -1100,13 +1100,13 @@ define <2 x i64> @test_movntdqa(i8* %a0) {
 ;
 ; BDVER2-SSE-LABEL: test_movntdqa:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movntdqa (%rdi), %xmm0 # sched: [6:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movntdqa (%rdi), %xmm0 # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_movntdqa:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovntdqa (%rdi), %xmm0 # sched: [6:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmovntdqa (%rdi), %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_movntdqa:
 ; BTVER2-SSE:       # %bb.0:
@@ -1207,15 +1207,15 @@ define <8 x i16> @test_mpsadbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_mpsadbw:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    mpsadbw $7, %xmm1, %xmm0 # sched: [7:1.00]
-; BDVER2-SSE-NEXT:    mpsadbw $7, (%rdi), %xmm0 # sched: [13:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    mpsadbw $7, %xmm1, %xmm0 # sched: [9:2.00]
+; BDVER2-SSE-NEXT:    mpsadbw $7, (%rdi), %xmm0 # sched: [14:2.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_mpsadbw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
-; BDVER2-NEXT:    vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [13:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [9:2.00]
+; BDVER2-NEXT:    vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [14:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_mpsadbw:
 ; BTVER2-SSE:       # %bb.0:
@@ -1323,15 +1323,15 @@ define <8 x i16> @test_packusdw(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_packusdw:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    packusdw %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    packusdw %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    packusdw (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_packusdw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpackusdw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_packusdw:
 ; BTVER2-SSE:       # %bb.0:
@@ -1460,18 +1460,18 @@ define <16 x i8> @test_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2, <16
 ;
 ; BDVER2-SSE-LABEL: test_pblendvb:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movdqa %xmm0, %xmm3 # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    movaps %xmm2, %xmm0 # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    pblendvb %xmm0, %xmm1, %xmm3 # sched: [2:1.00]
-; BDVER2-SSE-NEXT:    pblendvb %xmm0, (%rdi), %xmm3 # sched: [8:1.00]
-; BDVER2-SSE-NEXT:    movdqa %xmm3, %xmm0 # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movdqa %xmm0, %xmm3 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    movaps %xmm2, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pblendvb %xmm0, %xmm1, %xmm3 # sched: [2:2.00]
+; BDVER2-SSE-NEXT:    pblendvb %xmm0, (%rdi), %xmm3 # sched: [7:2.00]
+; BDVER2-SSE-NEXT:    movdqa %xmm3, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pblendvb:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; BDVER2-NEXT:    vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00]
+; BDVER2-NEXT:    vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pblendvb:
 ; BTVER2-SSE:       # %bb.0:
@@ -1596,17 +1596,17 @@ define <8 x i16> @test_pblendw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pblendw:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],mem[2,3],xmm1[4,5,6],mem[7] sched: [7:0.50]
-; BDVER2-SSE-NEXT:    paddw %xmm1, %xmm0 # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    paddw %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pblendw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:0.50]
+; BDVER2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [2:0.50]
 ; BDVER2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],mem[2,3],xmm1[4,5,6],mem[7] sched: [7:0.50]
-; BDVER2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pblendw:
 ; BTVER2-SSE:       # %bb.0:
@@ -1717,15 +1717,15 @@ define <2 x i64> @test_pcmpeqq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pcmpeqq:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pcmpeqq %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pcmpeqq %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    pcmpeqq (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pcmpeqq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpcomeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpcomeqq %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpcomeqq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pcmpeqq:
 ; BTVER2-SSE:       # %bb.0:
@@ -1833,15 +1833,15 @@ define i32 @test_pextrb(<16 x i8> %a0, i8 *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_pextrb:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pextrb $3, %xmm0, %eax # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    pextrb $1, %xmm0, (%rdi) # sched: [5:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    pextrb $3, %xmm0, %eax # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    pextrb $1, %xmm0, (%rdi) # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pextrb:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpextrb $3, %xmm0, %eax # sched: [3:1.00]
-; BDVER2-NEXT:    vpextrb $1, %xmm0, (%rdi) # sched: [5:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpextrb $3, %xmm0, %eax # sched: [13:1.00]
+; BDVER2-NEXT:    vpextrb $1, %xmm0, (%rdi) # sched: [13:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pextrb:
 ; BTVER2-SSE:       # %bb.0:
@@ -1960,17 +1960,17 @@ define i32 @test_pextrd(<4 x i32> %a0, i32 *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_pextrd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    paddd %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    pextrd $3, %xmm0, %eax # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    pextrd $1, %xmm0, (%rdi) # sched: [5:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    paddd %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pextrd $3, %xmm0, %eax # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    pextrd $1, %xmm0, (%rdi) # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pextrd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpaddd %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER2-NEXT:    vpextrd $3, %xmm0, %eax # sched: [3:1.00]
-; BDVER2-NEXT:    vpextrd $1, %xmm0, (%rdi) # sched: [5:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpaddd %xmm0, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpextrd $3, %xmm0, %eax # sched: [13:1.00]
+; BDVER2-NEXT:    vpextrd $1, %xmm0, (%rdi) # sched: [13:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pextrd:
 ; BTVER2-SSE:       # %bb.0:
@@ -2081,15 +2081,15 @@ define i64 @test_pextrq(<2 x i64> %a0, <2 x i64> %a1, i64 *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pextrq:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pextrq $1, %xmm0, %rax # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    pextrq $1, %xmm0, (%rdi) # sched: [5:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    pextrq $1, %xmm0, %rax # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    pextrq $1, %xmm0, (%rdi) # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pextrq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpextrq $1, %xmm0, %rax # sched: [3:1.00]
-; BDVER2-NEXT:    vpextrq $1, %xmm0, (%rdi) # sched: [5:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpextrq $1, %xmm0, %rax # sched: [13:1.00]
+; BDVER2-NEXT:    vpextrq $1, %xmm0, (%rdi) # sched: [13:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pextrq:
 ; BTVER2-SSE:       # %bb.0:
@@ -2195,15 +2195,15 @@ define i32 @test_pextrw(<8 x i16> %a0, i16 *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_pextrw:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pextrw $3, %xmm0, %eax # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    pextrw $1, %xmm0, (%rdi) # sched: [5:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    pextrw $3, %xmm0, %eax # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    pextrw $1, %xmm0, (%rdi) # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pextrw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpextrw $3, %xmm0, %eax # sched: [3:1.00]
-; BDVER2-NEXT:    vpextrw $1, %xmm0, (%rdi) # sched: [5:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpextrw $3, %xmm0, %eax # sched: [13:1.00]
+; BDVER2-NEXT:    vpextrw $1, %xmm0, (%rdi) # sched: [13:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pextrw:
 ; BTVER2-SSE:       # %bb.0:
@@ -2310,15 +2310,15 @@ define <8 x i16> @test_phminposuw(<8 x i16> *%a0) {
 ;
 ; BDVER2-SSE-LABEL: test_phminposuw:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    phminposuw (%rdi), %xmm0 # sched: [11:1.00]
-; BDVER2-SSE-NEXT:    phminposuw %xmm0, %xmm0 # sched: [5:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    phminposuw (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    phminposuw %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_phminposuw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vphminposuw (%rdi), %xmm0 # sched: [11:1.00]
-; BDVER2-NEXT:    vphminposuw %xmm0, %xmm0 # sched: [5:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vphminposuw (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    vphminposuw %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_phminposuw:
 ; BTVER2-SSE:       # %bb.0:
@@ -2425,15 +2425,15 @@ define <16 x i8> @test_pinsrb(<16 x i8> %a0, i8 %a1, i8 *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pinsrb:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pinsrb $1, %edi, %xmm0 # sched: [2:1.00]
-; BDVER2-SSE-NEXT:    pinsrb $3, (%rsi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    pinsrb $1, %edi, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pinsrb $3, (%rsi), %xmm0 # sched: [6:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pinsrb:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [2:1.00]
-; BDVER2-NEXT:    vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [6:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pinsrb:
 ; BTVER2-SSE:       # %bb.0:
@@ -2539,15 +2539,15 @@ define <4 x i32> @test_pinsrd(<4 x i32> %a0, i32 %a1, i32 *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pinsrd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pinsrd $1, %edi, %xmm0 # sched: [2:1.00]
-; BDVER2-SSE-NEXT:    pinsrd $3, (%rsi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    pinsrd $1, %edi, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pinsrd $3, (%rsi), %xmm0 # sched: [6:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pinsrd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [2:1.00]
-; BDVER2-NEXT:    vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [6:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pinsrd:
 ; BTVER2-SSE:       # %bb.0:
@@ -2665,17 +2665,17 @@ define <2 x i64> @test_pinsrq(<2 x i64> %a0, <2 x i64> %a1, i64 %a2, i64 *%a3) {
 ;
 ; BDVER2-SSE-LABEL: test_pinsrq:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pinsrq $1, %rdi, %xmm0 # sched: [2:1.00]
-; BDVER2-SSE-NEXT:    pinsrq $1, (%rsi), %xmm1 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    pinsrq $1, (%rsi), %xmm1 # sched: [6:0.50]
+; BDVER2-SSE-NEXT:    pinsrq $1, %rdi, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pinsrq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [2:1.00]
-; BDVER2-NEXT:    vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [7:0.50]
-; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [6:0.50]
+; BDVER2-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pinsrq:
 ; BTVER2-SSE:       # %bb.0:
@@ -2786,15 +2786,15 @@ define <16 x i8> @test_pmaxsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pmaxsb:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pmaxsb %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pmaxsb %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    pmaxsb (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pmaxsb:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pmaxsb:
 ; BTVER2-SSE:       # %bb.0:
@@ -2901,15 +2901,15 @@ define <4 x i32> @test_pmaxsd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pmaxsd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pmaxsd %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pmaxsd %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    pmaxsd (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pmaxsd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pmaxsd:
 ; BTVER2-SSE:       # %bb.0:
@@ -3016,15 +3016,15 @@ define <4 x i32> @test_pmaxud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pmaxud:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pmaxud %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pmaxud %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    pmaxud (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pmaxud:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpmaxud (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pmaxud:
 ; BTVER2-SSE:       # %bb.0:
@@ -3131,15 +3131,15 @@ define <8 x i16> @test_pmaxuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pmaxuw:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pmaxuw %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pmaxuw %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    pmaxuw (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pmaxuw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pmaxuw:
 ; BTVER2-SSE:       # %bb.0:
@@ -3246,15 +3246,15 @@ define <16 x i8> @test_pminsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pminsb:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pminsb %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pminsb %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    pminsb (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pminsb:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpminsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpminsb %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpminsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pminsb:
 ; BTVER2-SSE:       # %bb.0:
@@ -3361,15 +3361,15 @@ define <4 x i32> @test_pminsd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pminsd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pminsd %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pminsd %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    pminsd (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pminsd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpminsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpminsd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpminsd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pminsd:
 ; BTVER2-SSE:       # %bb.0:
@@ -3476,15 +3476,15 @@ define <4 x i32> @test_pminud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pminud:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pminud %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pminud %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    pminud (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pminud:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpminud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpminud %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpminud (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pminud:
 ; BTVER2-SSE:       # %bb.0:
@@ -3591,15 +3591,15 @@ define <8 x i16> @test_pminuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pminuw:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pminuw %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pminuw %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    pminuw (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pminuw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpminuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpminuw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpminuw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pminuw:
 ; BTVER2-SSE:       # %bb.0:
@@ -3719,17 +3719,17 @@ define <8 x i16> @test_pmovsxbw(<16 x i8> %a0, <8 x i8> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_pmovsxbw:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pmovsxbw %xmm0, %xmm1 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pmovsxbw %xmm0, %xmm1 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    pmovsxbw (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    paddw %xmm1, %xmm0 # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    paddw %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pmovsxbw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpmovsxbw %xmm0, %xmm0 # sched: [1:0.50]
 ; BDVER2-NEXT:    vpmovsxbw (%rdi), %xmm1 # sched: [7:0.50]
-; BDVER2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpmovsxbw %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pmovsxbw:
 ; BTVER2-SSE:       # %bb.0:
@@ -3854,17 +3854,17 @@ define <4 x i32> @test_pmovsxbd(<16 x i8> %a0, <4 x i8> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_pmovsxbd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pmovsxbd %xmm0, %xmm1 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pmovsxbd %xmm0, %xmm1 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    pmovsxbd (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pmovsxbd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpmovsxbd %xmm0, %xmm0 # sched: [1:0.50]
 ; BDVER2-NEXT:    vpmovsxbd (%rdi), %xmm1 # sched: [7:0.50]
-; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpmovsxbd %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pmovsxbd:
 ; BTVER2-SSE:       # %bb.0:
@@ -3989,17 +3989,17 @@ define <2 x i64> @test_pmovsxbq(<16 x i8> %a0, <2 x i8> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_pmovsxbq:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pmovsxbq %xmm0, %xmm1 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pmovsxbq %xmm0, %xmm1 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    pmovsxbq (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pmovsxbq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpmovsxbq %xmm0, %xmm0 # sched: [1:0.50]
 ; BDVER2-NEXT:    vpmovsxbq (%rdi), %xmm1 # sched: [7:0.50]
-; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpmovsxbq %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pmovsxbq:
 ; BTVER2-SSE:       # %bb.0:
@@ -4124,17 +4124,17 @@ define <2 x i64> @test_pmovsxdq(<4 x i32> %a0, <2 x i32> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_pmovsxdq:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pmovsxdq %xmm0, %xmm1 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pmovsxdq %xmm0, %xmm1 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    pmovsxdq (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pmovsxdq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpmovsxdq %xmm0, %xmm0 # sched: [1:0.50]
 ; BDVER2-NEXT:    vpmovsxdq (%rdi), %xmm1 # sched: [7:0.50]
-; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpmovsxdq %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pmovsxdq:
 ; BTVER2-SSE:       # %bb.0:
@@ -4259,17 +4259,17 @@ define <4 x i32> @test_pmovsxwd(<8 x i16> %a0, <4 x i16> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_pmovsxwd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pmovsxwd %xmm0, %xmm1 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pmovsxwd %xmm0, %xmm1 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    pmovsxwd (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pmovsxwd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpmovsxwd %xmm0, %xmm0 # sched: [1:0.50]
 ; BDVER2-NEXT:    vpmovsxwd (%rdi), %xmm1 # sched: [7:0.50]
-; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpmovsxwd %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pmovsxwd:
 ; BTVER2-SSE:       # %bb.0:
@@ -4394,17 +4394,17 @@ define <2 x i64> @test_pmovsxwq(<8 x i16> %a0, <2 x i16> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_pmovsxwq:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pmovsxwq %xmm0, %xmm1 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pmovsxwq %xmm0, %xmm1 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    pmovsxwq (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pmovsxwq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpmovsxwq %xmm0, %xmm0 # sched: [1:0.50]
 ; BDVER2-NEXT:    vpmovsxwq (%rdi), %xmm1 # sched: [7:0.50]
-; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpmovsxwq %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pmovsxwq:
 ; BTVER2-SSE:       # %bb.0:
@@ -4529,17 +4529,17 @@ define <8 x i16> @test_pmovzxbw(<16 x i8> %a0, <8 x i8> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_pmovzxbw:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [7:0.50]
-; BDVER2-SSE-NEXT:    paddw %xmm1, %xmm0 # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    paddw %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pmovzxbw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:0.50]
 ; BDVER2-NEXT:    vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [7:0.50]
-; BDVER2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [2:0.50]
+; BDVER2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pmovzxbw:
 ; BTVER2-SSE:       # %bb.0:
@@ -4664,17 +4664,17 @@ define <4 x i32> @test_pmovzxbd(<16 x i8> %a0, <4 x i8> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_pmovzxbd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [7:0.50]
-; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pmovzxbd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:0.50]
 ; BDVER2-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [7:0.50]
-; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [2:0.50]
+; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pmovzxbd:
 ; BTVER2-SSE:       # %bb.0:
@@ -4799,17 +4799,17 @@ define <2 x i64> @test_pmovzxbq(<16 x i8> %a0, <2 x i8> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_pmovzxbq:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [7:0.50]
-; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pmovzxbq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:0.50]
 ; BDVER2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [7:0.50]
-; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [2:0.50]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pmovzxbq:
 ; BTVER2-SSE:       # %bb.0:
@@ -4934,17 +4934,17 @@ define <2 x i64> @test_pmovzxdq(<4 x i32> %a0, <2 x i32> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_pmovzxdq:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero sched: [7:0.50]
-; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pmovzxdq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:0.50]
 ; BDVER2-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [7:0.50]
-; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [2:0.50]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pmovzxdq:
 ; BTVER2-SSE:       # %bb.0:
@@ -5069,17 +5069,17 @@ define <4 x i32> @test_pmovzxwd(<8 x i16> %a0, <4 x i16> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_pmovzxwd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [7:0.50]
-; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pmovzxwd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:0.50]
 ; BDVER2-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [7:0.50]
-; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [2:0.50]
+; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pmovzxwd:
 ; BTVER2-SSE:       # %bb.0:
@@ -5204,17 +5204,17 @@ define <2 x i64> @test_pmovzxwq(<8 x i16> %a0, <2 x i16> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_pmovzxwq:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [7:0.50]
-; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pmovzxwq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:0.50]
 ; BDVER2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [7:0.50]
-; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [2:0.50]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pmovzxwq:
 ; BTVER2-SSE:       # %bb.0:
@@ -5338,17 +5338,17 @@ define <2 x i64> @test_pmuldq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x
 ;
 ; BDVER2-SSE-LABEL: test_pmuldq:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pmuldq %xmm1, %xmm0 # sched: [5:1.00]
-; BDVER2-SSE-NEXT:    pmuldq (%rdi), %xmm2 # sched: [11:1.00]
-; BDVER2-SSE-NEXT:    por %xmm2, %xmm0 # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    pmuldq (%rdi), %xmm2 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    pmuldq %xmm1, %xmm0 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    por %xmm2, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pmuldq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; BDVER2-NEXT:    vpmuldq (%rdi), %xmm2, %xmm1 # sched: [11:1.00]
-; BDVER2-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpmuldq (%rdi), %xmm2, %xmm2 # sched: [9:1.00]
+; BDVER2-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vpor %xmm2, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pmuldq:
 ; BTVER2-SSE:       # %bb.0:
@@ -5460,15 +5460,15 @@ define <4 x i32> @test_pmulld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pmulld:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pmulld %xmm1, %xmm0 # sched: [5:1.00]
-; BDVER2-SSE-NEXT:    pmulld (%rdi), %xmm0 # sched: [11:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    pmulld %xmm1, %xmm0 # sched: [5:2.00]
+; BDVER2-SSE-NEXT:    pmulld (%rdi), %xmm0 # sched: [10:2.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pmulld:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; BDVER2-NEXT:    vpmulld (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vpmulld (%rdi), %xmm0, %xmm0 # sched: [10:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pmulld:
 ; BTVER2-SSE:       # %bb.0:
@@ -5622,23 +5622,23 @@ define i32 @test_ptest(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_ptest:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    ptest %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    ptest %xmm1, %xmm0 # sched: [1:1.00]
 ; BDVER2-SSE-NEXT:    setb %al # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    ptest (%rdi), %xmm0 # sched: [8:1.00]
+; BDVER2-SSE-NEXT:    ptest (%rdi), %xmm0 # sched: [6:1.00]
 ; BDVER2-SSE-NEXT:    setb %cl # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    andb %al, %cl # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    movzbl %cl, %eax # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    andb %al, %cl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    movzbl %cl, %eax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_ptest:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vptest %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vptest %xmm1, %xmm0 # sched: [1:1.00]
 ; BDVER2-NEXT:    setb %al # sched: [1:0.50]
-; BDVER2-NEXT:    vptest (%rdi), %xmm0 # sched: [8:1.00]
+; BDVER2-NEXT:    vptest (%rdi), %xmm0 # sched: [6:1.00]
 ; BDVER2-NEXT:    setb %cl # sched: [1:0.50]
-; BDVER2-NEXT:    andb %al, %cl # sched: [1:0.33]
-; BDVER2-NEXT:    movzbl %cl, %eax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    andb %al, %cl # sched: [1:0.50]
+; BDVER2-NEXT:    movzbl %cl, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_ptest:
 ; BTVER2-SSE:       # %bb.0:
@@ -5776,17 +5776,17 @@ define <2 x double> @test_roundpd(<2 x double> %a0, <2 x double> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_roundpd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    roundpd $7, %xmm0, %xmm1 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    roundpd $7, %xmm0, %xmm1 # sched: [4:1.00]
 ; BDVER2-SSE-NEXT:    roundpd $7, (%rdi), %xmm0 # sched: [9:1.00]
-; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_roundpd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vroundpd $7, %xmm0, %xmm0 # sched: [3:1.00]
 ; BDVER2-NEXT:    vroundpd $7, (%rdi), %xmm1 # sched: [9:1.00]
-; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vroundpd $7, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_roundpd:
 ; BTVER2-SSE:       # %bb.0:
@@ -5912,17 +5912,17 @@ define <4 x float> @test_roundps(<4 x float> %a0, <4 x float> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_roundps:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    roundps $7, %xmm0, %xmm1 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    roundps $7, %xmm0, %xmm1 # sched: [4:1.00]
 ; BDVER2-SSE-NEXT:    roundps $7, (%rdi), %xmm0 # sched: [9:1.00]
-; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_roundps:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vroundps $7, %xmm0, %xmm0 # sched: [3:1.00]
 ; BDVER2-NEXT:    vroundps $7, (%rdi), %xmm1 # sched: [9:1.00]
-; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vroundps $7, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_roundps:
 ; BTVER2-SSE:       # %bb.0:
@@ -6053,18 +6053,18 @@ define <2 x double> @test_roundsd(<2 x double> %a0, <2 x double> %a1, <2 x doubl
 ;
 ; BDVER2-SSE-LABEL: test_roundsd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movapd %xmm0, %xmm2 # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    roundsd $7, %xmm1, %xmm2 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    movapd %xmm0, %xmm2 # sched: [1:0.50]
 ; BDVER2-SSE-NEXT:    roundsd $7, (%rdi), %xmm0 # sched: [9:1.00]
-; BDVER2-SSE-NEXT:    addpd %xmm2, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    roundsd $7, %xmm1, %xmm2 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    addpd %xmm2, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_roundsd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vroundsd $7, %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
+; BDVER2-NEXT:    vroundsd $7, %xmm1, %xmm0, %xmm1 # sched: [4:1.00]
 ; BDVER2-NEXT:    vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; BDVER2-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_roundsd:
 ; BTVER2-SSE:       # %bb.0:
@@ -6197,18 +6197,18 @@ define <4 x float> @test_roundss(<4 x float> %a0, <4 x float> %a1, <4 x float> *
 ;
 ; BDVER2-SSE-LABEL: test_roundss:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movaps %xmm0, %xmm2 # sched: [1:1.00]
-; BDVER2-SSE-NEXT:    roundss $7, %xmm1, %xmm2 # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    movaps %xmm0, %xmm2 # sched: [1:0.50]
 ; BDVER2-SSE-NEXT:    roundss $7, (%rdi), %xmm0 # sched: [9:1.00]
-; BDVER2-SSE-NEXT:    addps %xmm2, %xmm0 # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    roundss $7, %xmm1, %xmm2 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    addps %xmm2, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_roundss:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vroundss $7, %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
+; BDVER2-NEXT:    vroundss $7, %xmm1, %xmm0, %xmm1 # sched: [4:1.00]
 ; BDVER2-NEXT:    vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; BDVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_roundss:
 ; BTVER2-SSE:       # %bb.0:
diff --git a/test/CodeGen/X86/sse42-schedule.ll b/test/CodeGen/X86/sse42-schedule.ll
index 088494b86dc..c08e17aa2be 100644
--- a/test/CodeGen/X86/sse42-schedule.ll
+++ b/test/CodeGen/X86/sse42-schedule.ll
@@ -13,8 +13,8 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-avx  | FileCheck %s --check-prefixes=CHECK,SKX-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,SKX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+sse4.2,+pclmul -mattr=-avx  | FileCheck %s --check-prefixes=CHECK,BDVER2-SSE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+avx,+xop -mattr=+sse4.2,+pclmul -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,BDVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=-avx  | FileCheck %s --check-prefixes=CHECK,BDVER2-SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-avx  | FileCheck %s --check-prefixes=CHECK,BTVER2-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 -mattr=-avx  | FileCheck %s --check-prefixes=CHECK,ZNVER1-SSE
@@ -107,17 +107,17 @@ define i32 @crc32_32_8(i32 %a0, i8 %a1, i8 *%a2) {
 ;
 ; BDVER2-SSE-LABEL: crc32_32_8:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movl %edi, %eax # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    crc32b %sil, %eax # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    crc32b (%rdx), %eax # sched: [8:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movl %edi, %eax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    crc32b %sil, %eax # sched: [3:2.00]
+; BDVER2-SSE-NEXT:    crc32b (%rdx), %eax # sched: [7:2.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: crc32_32_8:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    movl %edi, %eax # sched: [1:0.33]
-; BDVER2-NEXT:    crc32b %sil, %eax # sched: [3:1.00]
-; BDVER2-NEXT:    crc32b (%rdx), %eax # sched: [8:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    movl %edi, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    crc32b %sil, %eax # sched: [3:2.00]
+; BDVER2-NEXT:    crc32b (%rdx), %eax # sched: [7:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: crc32_32_8:
 ; BTVER2-SSE:       # %bb.0:
@@ -240,17 +240,17 @@ define i32 @crc32_32_16(i32 %a0, i16 %a1, i16 *%a2) {
 ;
 ; BDVER2-SSE-LABEL: crc32_32_16:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movl %edi, %eax # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    crc32w %si, %eax # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    crc32w (%rdx), %eax # sched: [8:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movl %edi, %eax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    crc32w %si, %eax # sched: [5:2.00]
+; BDVER2-SSE-NEXT:    crc32w (%rdx), %eax # sched: [7:2.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: crc32_32_16:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    movl %edi, %eax # sched: [1:0.33]
-; BDVER2-NEXT:    crc32w %si, %eax # sched: [3:1.00]
-; BDVER2-NEXT:    crc32w (%rdx), %eax # sched: [8:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    movl %edi, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    crc32w %si, %eax # sched: [5:2.00]
+; BDVER2-NEXT:    crc32w (%rdx), %eax # sched: [7:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: crc32_32_16:
 ; BTVER2-SSE:       # %bb.0:
@@ -373,17 +373,17 @@ define i32 @crc32_32_32(i32 %a0, i32 %a1, i32 *%a2) {
 ;
 ; BDVER2-SSE-LABEL: crc32_32_32:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movl %edi, %eax # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    crc32l %esi, %eax # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    crc32l (%rdx), %eax # sched: [8:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movl %edi, %eax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    crc32l %esi, %eax # sched: [6:2.00]
+; BDVER2-SSE-NEXT:    crc32l (%rdx), %eax # sched: [7:2.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: crc32_32_32:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    movl %edi, %eax # sched: [1:0.33]
-; BDVER2-NEXT:    crc32l %esi, %eax # sched: [3:1.00]
-; BDVER2-NEXT:    crc32l (%rdx), %eax # sched: [8:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    movl %edi, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    crc32l %esi, %eax # sched: [6:2.00]
+; BDVER2-NEXT:    crc32l (%rdx), %eax # sched: [7:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: crc32_32_32:
 ; BTVER2-SSE:       # %bb.0:
@@ -506,17 +506,17 @@ define i64 @crc32_64_8(i64 %a0, i8 %a1, i8 *%a2) nounwind {
 ;
 ; BDVER2-SSE-LABEL: crc32_64_8:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movq %rdi, %rax # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    crc32b %sil, %eax # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    crc32b (%rdx), %eax # sched: [8:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    crc32b %sil, %eax # sched: [3:2.00]
+; BDVER2-SSE-NEXT:    crc32b (%rdx), %eax # sched: [7:2.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: crc32_64_8:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    movq %rdi, %rax # sched: [1:0.33]
-; BDVER2-NEXT:    crc32b %sil, %eax # sched: [3:1.00]
-; BDVER2-NEXT:    crc32b (%rdx), %eax # sched: [8:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    crc32b %sil, %eax # sched: [3:2.00]
+; BDVER2-NEXT:    crc32b (%rdx), %eax # sched: [7:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: crc32_64_8:
 ; BTVER2-SSE:       # %bb.0:
@@ -639,17 +639,17 @@ define i64 @crc32_64_64(i64 %a0, i64 %a1, i64 *%a2) {
 ;
 ; BDVER2-SSE-LABEL: crc32_64_64:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movq %rdi, %rax # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    crc32q %rsi, %rax # sched: [3:1.00]
-; BDVER2-SSE-NEXT:    crc32q (%rdx), %rax # sched: [8:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    crc32q %rsi, %rax # sched: [10:2.00]
+; BDVER2-SSE-NEXT:    crc32q (%rdx), %rax # sched: [7:2.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: crc32_64_64:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    movq %rdi, %rax # sched: [1:0.33]
-; BDVER2-NEXT:    crc32q %rsi, %rax # sched: [3:1.00]
-; BDVER2-NEXT:    crc32q (%rdx), %rax # sched: [8:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    crc32q %rsi, %rax # sched: [10:2.00]
+; BDVER2-NEXT:    crc32q (%rdx), %rax # sched: [7:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: crc32_64_64:
 ; BTVER2-SSE:       # %bb.0:
@@ -844,29 +844,29 @@ define i32 @test_pcmpestri(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pcmpestri:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movl $7, %eax # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    movl $7, %edx # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    pcmpestri $7, %xmm1, %xmm0 # sched: [4:2.67]
-; BDVER2-SSE-NEXT:    movl %ecx, %esi # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    movl $7, %eax # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    movl $7, %edx # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    pcmpestri $7, (%rdi), %xmm0 # sched: [4:2.33]
+; BDVER2-SSE-NEXT:    movl $7, %eax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    movl $7, %edx # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pcmpestri $7, %xmm1, %xmm0 # sched: [15:4.00]
+; BDVER2-SSE-NEXT:    movl $7, %eax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    movl $7, %edx # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    movl %ecx, %esi # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pcmpestri $7, (%rdi), %xmm0 # sched: [20:4.50]
 ; BDVER2-SSE-NEXT:    # kill: def $ecx killed $ecx def $rcx
 ; BDVER2-SSE-NEXT:    leal (%rcx,%rsi), %eax # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pcmpestri:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    movl $7, %eax # sched: [1:0.33]
-; BDVER2-NEXT:    movl $7, %edx # sched: [1:0.33]
-; BDVER2-NEXT:    vpcmpestri $7, %xmm1, %xmm0 # sched: [4:2.67]
-; BDVER2-NEXT:    movl %ecx, %esi # sched: [1:0.33]
-; BDVER2-NEXT:    movl $7, %eax # sched: [1:0.33]
-; BDVER2-NEXT:    movl $7, %edx # sched: [1:0.33]
-; BDVER2-NEXT:    vpcmpestri $7, (%rdi), %xmm0 # sched: [4:2.33]
+; BDVER2-NEXT:    movl $7, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    movl $7, %edx # sched: [1:0.50]
+; BDVER2-NEXT:    vpcmpestri $7, %xmm1, %xmm0 # sched: [15:4.00]
+; BDVER2-NEXT:    movl $7, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    movl $7, %edx # sched: [1:0.50]
+; BDVER2-NEXT:    movl %ecx, %esi # sched: [1:0.50]
+; BDVER2-NEXT:    vpcmpestri $7, (%rdi), %xmm0 # sched: [20:4.50]
 ; BDVER2-NEXT:    # kill: def $ecx killed $ecx def $rcx
 ; BDVER2-NEXT:    leal (%rcx,%rsi), %eax # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pcmpestri:
 ; BTVER2-SSE:       # %bb.0:
@@ -1050,23 +1050,23 @@ define <16 x i8> @test_pcmpestrm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pcmpestrm:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    movl $7, %eax # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    movl $7, %edx # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    pcmpestrm $7, %xmm1, %xmm0 # sched: [11:2.67]
-; BDVER2-SSE-NEXT:    movl $7, %eax # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    movl $7, %edx # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    pcmpestrm $7, (%rdi), %xmm0 # sched: [11:2.33]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movl $7, %eax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    movl $7, %edx # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pcmpestrm $7, %xmm1, %xmm0 # sched: [10:4.00]
+; BDVER2-SSE-NEXT:    movl $7, %eax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    movl $7, %edx # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pcmpestrm $7, (%rdi), %xmm0 # sched: [15:4.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pcmpestrm:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    movl $7, %eax # sched: [1:0.33]
-; BDVER2-NEXT:    movl $7, %edx # sched: [1:0.33]
-; BDVER2-NEXT:    vpcmpestrm $7, %xmm1, %xmm0 # sched: [11:2.67]
-; BDVER2-NEXT:    movl $7, %eax # sched: [1:0.33]
-; BDVER2-NEXT:    movl $7, %edx # sched: [1:0.33]
-; BDVER2-NEXT:    vpcmpestrm $7, (%rdi), %xmm0 # sched: [11:2.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    movl $7, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    movl $7, %edx # sched: [1:0.50]
+; BDVER2-NEXT:    vpcmpestrm $7, %xmm1, %xmm0 # sched: [10:4.00]
+; BDVER2-NEXT:    movl $7, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    movl $7, %edx # sched: [1:0.50]
+; BDVER2-NEXT:    vpcmpestrm $7, (%rdi), %xmm0 # sched: [15:4.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pcmpestrm:
 ; BTVER2-SSE:       # %bb.0:
@@ -1225,21 +1225,21 @@ define i32 @test_pcmpistri(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pcmpistri:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pcmpistri $7, %xmm1, %xmm0 # sched: [11:3.00]
-; BDVER2-SSE-NEXT:    movl %ecx, %eax # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    pcmpistri $7, (%rdi), %xmm0 # sched: [17:3.00]
+; BDVER2-SSE-NEXT:    pcmpistri $7, %xmm1, %xmm0 # sched: [14:1.00]
+; BDVER2-SSE-NEXT:    movl %ecx, %eax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pcmpistri $7, (%rdi), %xmm0 # sched: [19:1.00]
 ; BDVER2-SSE-NEXT:    # kill: def $ecx killed $ecx def $rcx
 ; BDVER2-SSE-NEXT:    leal (%rcx,%rax), %eax # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pcmpistri:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpcmpistri $7, %xmm1, %xmm0 # sched: [11:3.00]
-; BDVER2-NEXT:    movl %ecx, %eax # sched: [1:0.33]
-; BDVER2-NEXT:    vpcmpistri $7, (%rdi), %xmm0 # sched: [17:3.00]
+; BDVER2-NEXT:    vpcmpistri $7, %xmm1, %xmm0 # sched: [14:1.00]
+; BDVER2-NEXT:    movl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    vpcmpistri $7, (%rdi), %xmm0 # sched: [19:1.00]
 ; BDVER2-NEXT:    # kill: def $ecx killed $ecx def $rcx
 ; BDVER2-NEXT:    leal (%rcx,%rax), %eax # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pcmpistri:
 ; BTVER2-SSE:       # %bb.0:
@@ -1359,15 +1359,15 @@ define <16 x i8> @test_pcmpistrm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pcmpistrm:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pcmpistrm $7, %xmm1, %xmm0 # sched: [11:3.00]
-; BDVER2-SSE-NEXT:    pcmpistrm $7, (%rdi), %xmm0 # sched: [17:3.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    pcmpistrm $7, %xmm1, %xmm0 # sched: [6:1.00]
+; BDVER2-SSE-NEXT:    pcmpistrm $7, (%rdi), %xmm0 # sched: [11:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pcmpistrm:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpcmpistrm $7, %xmm1, %xmm0 # sched: [11:3.00]
-; BDVER2-NEXT:    vpcmpistrm $7, (%rdi), %xmm0 # sched: [17:3.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpcmpistrm $7, %xmm1, %xmm0 # sched: [6:1.00]
+; BDVER2-NEXT:    vpcmpistrm $7, (%rdi), %xmm0 # sched: [11:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pcmpistrm:
 ; BTVER2-SSE:       # %bb.0:
@@ -1474,15 +1474,15 @@ define <2 x i64> @test_pcmpgtq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pcmpgtq:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pcmpgtq %xmm1, %xmm0 # sched: [5:1.00]
-; BDVER2-SSE-NEXT:    pcmpgtq (%rdi), %xmm0 # sched: [11:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    pcmpgtq %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pcmpgtq (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pcmpgtq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpcomgtq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpcomgtq %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpcomgtq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pcmpgtq:
 ; BTVER2-SSE:       # %bb.0:
@@ -1590,15 +1590,15 @@ define <2 x i64> @test_pclmulqdq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pclmulqdq:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pclmulqdq $0, %xmm1, %xmm0 # sched: [14:6.00]
-; BDVER2-SSE-NEXT:    pclmulqdq $0, (%rdi), %xmm0 # sched: [14:5.67]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    pclmulqdq $0, %xmm1, %xmm0 # sched: [12:1.00]
+; BDVER2-SSE-NEXT:    pclmulqdq $0, (%rdi), %xmm0 # sched: [17:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pclmulqdq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpclmulqdq $0, %xmm1, %xmm0, %xmm0 # sched: [14:6.00]
-; BDVER2-NEXT:    vpclmulqdq $0, (%rdi), %xmm0, %xmm0 # sched: [14:5.67]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpclmulqdq $0, %xmm1, %xmm0, %xmm0 # sched: [13:1.00]
+; BDVER2-NEXT:    vpclmulqdq $0, (%rdi), %xmm0, %xmm0 # sched: [17:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pclmulqdq:
 ; BTVER2-SSE:       # %bb.0:
diff --git a/test/CodeGen/X86/sse4a-schedule.ll b/test/CodeGen/X86/sse4a-schedule.ll
index 29ad2688b48..ad76845a73d 100644
--- a/test/CodeGen/X86/sse4a-schedule.ll
+++ b/test/CodeGen/X86/sse4a-schedule.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+sse4a | FileCheck %s --check-prefix=GENERIC
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+sse4a | FileCheck %s --check-prefix=BDVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 | FileCheck %s --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=ZNVER1
 
@@ -12,8 +12,8 @@ define <2 x i64> @test_extrq(<2 x i64> %a0, <16 x i8> %a1) {
 ;
 ; BDVER2-LABEL: test_extrq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    extrq %xmm1, %xmm0 # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    extrq %xmm1, %xmm0 # sched: [3:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_extrq:
 ; BTVER2:       # %bb.0:
@@ -37,8 +37,8 @@ define <2 x i64> @test_extrqi(<2 x i64> %a0) {
 ;
 ; BDVER2-LABEL: test_extrqi:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    extrq $2, $3, %xmm0 # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    extrq $2, $3, %xmm0 # sched: [3:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_extrqi:
 ; BTVER2:       # %bb.0:
@@ -62,8 +62,8 @@ define <2 x i64> @test_insertq(<2 x i64> %a0, <2 x i64> %a1) {
 ;
 ; BDVER2-LABEL: test_insertq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    insertq %xmm1, %xmm0 # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    insertq %xmm1, %xmm0 # sched: [3:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_insertq:
 ; BTVER2:       # %bb.0:
@@ -87,8 +87,8 @@ define <2 x i64> @test_insertqi(<2 x i64> %a0, <2 x i64> %a1) {
 ;
 ; BDVER2-LABEL: test_insertqi:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    insertq $6, $5, %xmm1, %xmm0 # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    insertq $6, $5, %xmm1, %xmm0 # sched: [3:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_insertqi:
 ; BTVER2:       # %bb.0:
@@ -112,8 +112,8 @@ define void @test_movntsd(i8* %p, <2 x double> %a) {
 ;
 ; BDVER2-LABEL: test_movntsd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    movntsd %xmm0, (%rdi) # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    movntsd %xmm0, (%rdi) # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_movntsd:
 ; BTVER2:       # %bb.0:
@@ -137,8 +137,8 @@ define void @test_movntss(i8* %p, <4 x float> %a) {
 ;
 ; BDVER2-LABEL: test_movntss:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    movntss %xmm0, (%rdi) # sched: [1:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    movntss %xmm0, (%rdi) # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_movntss:
 ; BTVER2:       # %bb.0:
diff --git a/test/CodeGen/X86/ssse3-schedule.ll b/test/CodeGen/X86/ssse3-schedule.ll
index 9f1f7d69624..5c8bd2dc843 100644
--- a/test/CodeGen/X86/ssse3-schedule.ll
+++ b/test/CodeGen/X86/ssse3-schedule.ll
@@ -14,8 +14,8 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake -mattr=-avx2   | FileCheck %s --check-prefixes=CHECK,SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-sse4.1 | FileCheck %s --check-prefixes=CHECK,SKX-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-avx2   | FileCheck %s --check-prefixes=CHECK,SKX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+ssse3 -mattr=-sse4.1 | FileCheck %s --check-prefixes=CHECK,BDVER2-SSE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+avx -mattr=+ssse3 -mattr=-avx2   | FileCheck %s --check-prefixes=CHECK,BDVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=-sse4.1 | FileCheck %s --check-prefixes=CHECK,BDVER2-SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=-avx2   | FileCheck %s --check-prefixes=CHECK,BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-sse4.1 | FileCheck %s --check-prefixes=CHECK,BTVER2-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-avx2   | FileCheck %s --check-prefixes=CHECK,BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 -mattr=-sse4.1 | FileCheck %s --check-prefixes=CHECK,ZNVER1-SSE
@@ -117,17 +117,17 @@ define <16 x i8> @test_pabsb(<16 x i8> %a0, <16 x i8> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_pabsb:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pabsb %xmm0, %xmm1 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pabsb %xmm0, %xmm1 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    pabsb (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    por %xmm1, %xmm0 # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    por %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pabsb:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpabsb %xmm0, %xmm0 # sched: [1:0.50]
 ; BDVER2-NEXT:    vpabsb (%rdi), %xmm1 # sched: [7:0.50]
-; BDVER2-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpabsb %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pabsb:
 ; BTVER2-SSE:       # %bb.0:
@@ -260,17 +260,17 @@ define <4 x i32> @test_pabsd(<4 x i32> %a0, <4 x i32> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_pabsd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pabsd %xmm0, %xmm1 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pabsd %xmm0, %xmm1 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    pabsd (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    por %xmm1, %xmm0 # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    por %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pabsd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpabsd %xmm0, %xmm0 # sched: [1:0.50]
 ; BDVER2-NEXT:    vpabsd (%rdi), %xmm1 # sched: [7:0.50]
-; BDVER2-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpabsd %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pabsd:
 ; BTVER2-SSE:       # %bb.0:
@@ -403,17 +403,17 @@ define <8 x i16> @test_pabsw(<8 x i16> %a0, <8 x i16> *%a1) {
 ;
 ; BDVER2-SSE-LABEL: test_pabsw:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pabsw %xmm0, %xmm1 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pabsw %xmm0, %xmm1 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    pabsw (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    por %xmm1, %xmm0 # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    por %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pabsw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpabsw %xmm0, %xmm0 # sched: [1:0.50]
 ; BDVER2-NEXT:    vpabsw (%rdi), %xmm1 # sched: [7:0.50]
-; BDVER2-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpabsw %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pabsw:
 ; BTVER2-SSE:       # %bb.0:
@@ -541,16 +541,16 @@ define <8 x i16> @test_palignr(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_palignr:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:0.50]
+; BDVER2-SSE-NEXT:    palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    palignr {{.*#+}} xmm1 = mem[14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [7:0.50]
-; BDVER2-SSE-NEXT:    movdqa %xmm1, %xmm0 # sched: [1:0.33]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    movdqa %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_palignr:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:0.50]
+; BDVER2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [2:0.50]
 ; BDVER2-NEXT:    vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_palignr:
 ; BTVER2-SSE:       # %bb.0:
@@ -664,15 +664,15 @@ define <4 x i32> @test_phaddd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_phaddd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    phaddd %xmm1, %xmm0 # sched: [3:1.50]
-; BDVER2-SSE-NEXT:    phaddd (%rdi), %xmm0 # sched: [9:1.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    phaddd %xmm1, %xmm0 # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    phaddd (%rdi), %xmm0 # sched: [10:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_phaddd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vphaddd %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
-; BDVER2-NEXT:    vphaddd (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vphaddd %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vphaddd (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_phaddd:
 ; BTVER2-SSE:       # %bb.0:
@@ -785,15 +785,15 @@ define <8 x i16> @test_phaddsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_phaddsw:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    phaddsw %xmm1, %xmm0 # sched: [3:1.50]
-; BDVER2-SSE-NEXT:    phaddsw (%rdi), %xmm0 # sched: [9:1.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    phaddsw %xmm1, %xmm0 # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    phaddsw (%rdi), %xmm0 # sched: [10:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_phaddsw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vphaddsw %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
-; BDVER2-NEXT:    vphaddsw (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vphaddsw %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vphaddsw (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_phaddsw:
 ; BTVER2-SSE:       # %bb.0:
@@ -906,15 +906,15 @@ define <8 x i16> @test_phaddw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_phaddw:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    phaddw %xmm1, %xmm0 # sched: [3:1.50]
-; BDVER2-SSE-NEXT:    phaddw (%rdi), %xmm0 # sched: [9:1.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    phaddw %xmm1, %xmm0 # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    phaddw (%rdi), %xmm0 # sched: [10:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_phaddw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vphaddw %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
-; BDVER2-NEXT:    vphaddw (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vphaddw %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vphaddw (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_phaddw:
 ; BTVER2-SSE:       # %bb.0:
@@ -1027,15 +1027,15 @@ define <4 x i32> @test_phsubd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_phsubd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    phsubd %xmm1, %xmm0 # sched: [3:1.50]
-; BDVER2-SSE-NEXT:    phsubd (%rdi), %xmm0 # sched: [9:1.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    phsubd %xmm1, %xmm0 # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    phsubd (%rdi), %xmm0 # sched: [10:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_phsubd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vphsubd %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
-; BDVER2-NEXT:    vphsubd (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vphsubd %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vphsubd (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_phsubd:
 ; BTVER2-SSE:       # %bb.0:
@@ -1148,15 +1148,15 @@ define <8 x i16> @test_phsubsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_phsubsw:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    phsubsw %xmm1, %xmm0 # sched: [3:1.50]
-; BDVER2-SSE-NEXT:    phsubsw (%rdi), %xmm0 # sched: [9:1.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    phsubsw %xmm1, %xmm0 # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    phsubsw (%rdi), %xmm0 # sched: [10:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_phsubsw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vphsubsw %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
-; BDVER2-NEXT:    vphsubsw (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vphsubsw %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vphsubsw (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_phsubsw:
 ; BTVER2-SSE:       # %bb.0:
@@ -1269,15 +1269,15 @@ define <8 x i16> @test_phsubw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_phsubw:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    phsubw %xmm1, %xmm0 # sched: [3:1.50]
-; BDVER2-SSE-NEXT:    phsubw (%rdi), %xmm0 # sched: [9:1.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    phsubw %xmm1, %xmm0 # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    phsubw (%rdi), %xmm0 # sched: [10:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_phsubw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vphsubw %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
-; BDVER2-NEXT:    vphsubw (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vphsubw %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vphsubw (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_phsubw:
 ; BTVER2-SSE:       # %bb.0:
@@ -1390,15 +1390,15 @@ define <8 x i16> @test_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pmaddubsw:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pmaddubsw %xmm1, %xmm0 # sched: [5:1.00]
-; BDVER2-SSE-NEXT:    pmaddubsw (%rdi), %xmm0 # sched: [11:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    pmaddubsw %xmm1, %xmm0 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    pmaddubsw (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pmaddubsw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; BDVER2-NEXT:    vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pmaddubsw:
 ; BTVER2-SSE:       # %bb.0:
@@ -1512,15 +1512,15 @@ define <8 x i16> @test_pmulhrsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pmulhrsw:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pmulhrsw %xmm1, %xmm0 # sched: [5:1.00]
-; BDVER2-SSE-NEXT:    pmulhrsw (%rdi), %xmm0 # sched: [11:1.00]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    pmulhrsw %xmm1, %xmm0 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    pmulhrsw (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pmulhrsw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; BDVER2-NEXT:    vpmulhrsw (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vpmulhrsw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pmulhrsw:
 ; BTVER2-SSE:       # %bb.0:
@@ -1633,15 +1633,15 @@ define <16 x i8> @test_pshufb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_pshufb:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    pshufb %xmm1, %xmm0 # sched: [1:0.50]
-; BDVER2-SSE-NEXT:    pshufb (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    pshufb %xmm1, %xmm0 # sched: [3:2.00]
+; BDVER2-SSE-NEXT:    pshufb (%rdi), %xmm0 # sched: [8:2.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_pshufb:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpshufb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER2-NEXT:    vpshufb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    vpshufb %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
+; BDVER2-NEXT:    vpshufb (%rdi), %xmm0, %xmm0 # sched: [8:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pshufb:
 ; BTVER2-SSE:       # %bb.0:
@@ -1758,15 +1758,15 @@ define <16 x i8> @test_psignb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_psignb:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    psignb %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    psignb %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    psignb (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_psignb:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpsignb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpsignb %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpsignb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_psignb:
 ; BTVER2-SSE:       # %bb.0:
@@ -1883,15 +1883,15 @@ define <4 x i32> @test_psignd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_psignd:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    psignd %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    psignd %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    psignd (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_psignd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpsignd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpsignd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpsignd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_psignd:
 ; BTVER2-SSE:       # %bb.0:
@@ -2008,15 +2008,15 @@ define <8 x i16> @test_psignw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ;
 ; BDVER2-SSE-LABEL: test_psignw:
 ; BDVER2-SSE:       # %bb.0:
-; BDVER2-SSE-NEXT:    psignw %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    psignw %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-SSE-NEXT:    psignw (%rdi), %xmm0 # sched: [7:0.50]
-; BDVER2-SSE-NEXT:    retq # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER2-LABEL: test_psignw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpsignw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    vpsignw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    vpsignw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_psignw:
 ; BTVER2-SSE:       # %bb.0:
diff --git a/test/CodeGen/X86/tbm-schedule.ll b/test/CodeGen/X86/tbm-schedule.ll
index 5c73c4b49dc..b8f9bb08f3e 100644
--- a/test/CodeGen/X86/tbm-schedule.ll
+++ b/test/CodeGen/X86/tbm-schedule.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+tbm | FileCheck %s --check-prefix=GENERIC
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+tbm | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver3 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER3
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver4 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER4
 
@@ -16,12 +16,12 @@ define i32 @test_x86_tbm_bextri_u32(i32 %a0, i32* nocapture %p1) nounwind {
 ;
 ; BDVER2-LABEL: test_x86_tbm_bextri_u32:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    bextrl $3076, %edi, %ecx # imm = 0xC04
-; BDVER2-NEXT:    # sched: [2:1.00]
 ; BDVER2-NEXT:    bextrl $3076, (%rsi), %eax # imm = 0xC04
-; BDVER2-NEXT:    # sched: [7:1.00]
-; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    # sched: [6:0.50]
+; BDVER2-NEXT:    bextrl $3076, %edi, %ecx # imm = 0xC04
+; BDVER2-NEXT:    # sched: [2:0.50]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_x86_tbm_bextri_u32:
 ; BDVER3:       # %bb.0:
@@ -57,12 +57,12 @@ define i64 @test_x86_tbm_bextri_u64(i64 %a0, i64* nocapture %p1) nounwind {
 ;
 ; BDVER2-LABEL: test_x86_tbm_bextri_u64:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    bextrl $3076, %edi, %ecx # imm = 0xC04
-; BDVER2-NEXT:    # sched: [2:1.00]
 ; BDVER2-NEXT:    bextrl $3076, (%rsi), %eax # imm = 0xC04
-; BDVER2-NEXT:    # sched: [7:1.00]
-; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    # sched: [6:0.50]
+; BDVER2-NEXT:    bextrl $3076, %edi, %ecx # imm = 0xC04
+; BDVER2-NEXT:    # sched: [2:0.50]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_x86_tbm_bextri_u64:
 ; BDVER3:       # %bb.0:
@@ -96,10 +96,10 @@ define i32 @test_x86_tbm_blcfill_u32(i32 %a0, i32* nocapture %p1) nounwind {
 ;
 ; BDVER2-LABEL: test_x86_tbm_blcfill_u32:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    blcfilll %edi, %ecx # sched: [1:0.33]
 ; BDVER2-NEXT:    blcfilll (%rsi), %eax # sched: [6:0.50]
-; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    blcfilll %edi, %ecx # sched: [2:0.50]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_x86_tbm_blcfill_u32:
 ; BDVER3:       # %bb.0:
@@ -133,10 +133,10 @@ define i64 @test_x86_tbm_blcfill_u64(i64 %a0, i64* nocapture %p1) nounwind {
 ;
 ; BDVER2-LABEL: test_x86_tbm_blcfill_u64:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    blcfillq %rdi, %rcx # sched: [1:0.33]
 ; BDVER2-NEXT:    blcfillq (%rsi), %rax # sched: [6:0.50]
-; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    blcfillq %rdi, %rcx # sched: [2:0.50]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_x86_tbm_blcfill_u64:
 ; BDVER3:       # %bb.0:
@@ -170,10 +170,10 @@ define i32 @test_x86_tbm_blci_u32(i32 %a0, i32* nocapture %p1) nounwind {
 ;
 ; BDVER2-LABEL: test_x86_tbm_blci_u32:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    blcil %edi, %ecx # sched: [1:0.33]
 ; BDVER2-NEXT:    blcil (%rsi), %eax # sched: [6:0.50]
-; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    blcil %edi, %ecx # sched: [2:0.50]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_x86_tbm_blci_u32:
 ; BDVER3:       # %bb.0:
@@ -209,10 +209,10 @@ define i64 @test_x86_tbm_blci_u64(i64 %a0, i64* nocapture %p1) nounwind {
 ;
 ; BDVER2-LABEL: test_x86_tbm_blci_u64:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    blciq %rdi, %rcx # sched: [1:0.33]
 ; BDVER2-NEXT:    blciq (%rsi), %rax # sched: [6:0.50]
-; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    blciq %rdi, %rcx # sched: [2:0.50]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_x86_tbm_blci_u64:
 ; BDVER3:       # %bb.0:
@@ -248,10 +248,10 @@ define i32 @test_x86_tbm_blcic_u32(i32 %a0, i32* nocapture %p1) nounwind {
 ;
 ; BDVER2-LABEL: test_x86_tbm_blcic_u32:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    blcicl %edi, %ecx # sched: [1:0.33]
 ; BDVER2-NEXT:    blcicl (%rsi), %eax # sched: [6:0.50]
-; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    blcicl %edi, %ecx # sched: [2:0.50]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_x86_tbm_blcic_u32:
 ; BDVER3:       # %bb.0:
@@ -287,10 +287,10 @@ define i64 @test_x86_tbm_blcic_u64(i64 %a0, i64* nocapture %p1) nounwind {
 ;
 ; BDVER2-LABEL: test_x86_tbm_blcic_u64:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    blcicq %rdi, %rcx # sched: [1:0.33]
 ; BDVER2-NEXT:    blcicq (%rsi), %rax # sched: [6:0.50]
-; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    blcicq %rdi, %rcx # sched: [2:0.50]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_x86_tbm_blcic_u64:
 ; BDVER3:       # %bb.0:
@@ -326,10 +326,10 @@ define i32 @test_x86_tbm_blcmsk_u32(i32 %a0, i32* nocapture %p1) nounwind {
 ;
 ; BDVER2-LABEL: test_x86_tbm_blcmsk_u32:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    blcmskl %edi, %ecx # sched: [1:0.33]
 ; BDVER2-NEXT:    blcmskl (%rsi), %eax # sched: [6:0.50]
-; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    blcmskl %edi, %ecx # sched: [2:0.50]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_x86_tbm_blcmsk_u32:
 ; BDVER3:       # %bb.0:
@@ -363,10 +363,10 @@ define i64 @test_x86_tbm_blcmsk_u64(i64 %a0, i64* nocapture %p1) nounwind {
 ;
 ; BDVER2-LABEL: test_x86_tbm_blcmsk_u64:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    blcmskq %rdi, %rcx # sched: [1:0.33]
 ; BDVER2-NEXT:    blcmskq (%rsi), %rax # sched: [6:0.50]
-; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    blcmskq %rdi, %rcx # sched: [2:0.50]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_x86_tbm_blcmsk_u64:
 ; BDVER3:       # %bb.0:
@@ -400,10 +400,10 @@ define i32 @test_x86_tbm_blcs_u32(i32 %a0, i32* nocapture %p1) nounwind {
 ;
 ; BDVER2-LABEL: test_x86_tbm_blcs_u32:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    blcsl %edi, %ecx # sched: [1:0.33]
 ; BDVER2-NEXT:    blcsl (%rsi), %eax # sched: [6:0.50]
-; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    blcsl %edi, %ecx # sched: [2:0.50]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_x86_tbm_blcs_u32:
 ; BDVER3:       # %bb.0:
@@ -437,10 +437,10 @@ define i64 @test_x86_tbm_blcs_u64(i64 %a0, i64* nocapture %p1) nounwind {
 ;
 ; BDVER2-LABEL: test_x86_tbm_blcs_u64:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    blcsq %rdi, %rcx # sched: [1:0.33]
 ; BDVER2-NEXT:    blcsq (%rsi), %rax # sched: [6:0.50]
-; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    blcsq %rdi, %rcx # sched: [2:0.50]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_x86_tbm_blcs_u64:
 ; BDVER3:       # %bb.0:
@@ -474,10 +474,10 @@ define i32 @test_x86_tbm_blsfill_u32(i32 %a0, i32* nocapture %p1) nounwind {
 ;
 ; BDVER2-LABEL: test_x86_tbm_blsfill_u32:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    blsfilll %edi, %ecx # sched: [1:0.33]
 ; BDVER2-NEXT:    blsfilll (%rsi), %eax # sched: [6:0.50]
-; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    blsfilll %edi, %ecx # sched: [2:0.50]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_x86_tbm_blsfill_u32:
 ; BDVER3:       # %bb.0:
@@ -511,10 +511,10 @@ define i64 @test_x86_tbm_blsfill_u64(i64 %a0, i64* nocapture %p1) nounwind {
 ;
 ; BDVER2-LABEL: test_x86_tbm_blsfill_u64:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    blsfillq %rdi, %rcx # sched: [1:0.33]
 ; BDVER2-NEXT:    blsfillq (%rsi), %rax # sched: [6:0.50]
-; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    blsfillq %rdi, %rcx # sched: [2:0.50]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_x86_tbm_blsfill_u64:
 ; BDVER3:       # %bb.0:
@@ -548,10 +548,10 @@ define i32 @test_x86_tbm_blsic_u32(i32 %a0, i32* nocapture %p1) nounwind {
 ;
 ; BDVER2-LABEL: test_x86_tbm_blsic_u32:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    blsicl %edi, %ecx # sched: [1:0.33]
 ; BDVER2-NEXT:    blsicl (%rsi), %eax # sched: [6:0.50]
-; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    blsicl %edi, %ecx # sched: [2:0.50]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_x86_tbm_blsic_u32:
 ; BDVER3:       # %bb.0:
@@ -587,10 +587,10 @@ define i64 @test_x86_tbm_blsic_u64(i64 %a0, i64* nocapture %p1) nounwind {
 ;
 ; BDVER2-LABEL: test_x86_tbm_blsic_u64:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    blsicq %rdi, %rcx # sched: [1:0.33]
 ; BDVER2-NEXT:    blsicq (%rsi), %rax # sched: [6:0.50]
-; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    blsicq %rdi, %rcx # sched: [2:0.50]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_x86_tbm_blsic_u64:
 ; BDVER3:       # %bb.0:
@@ -626,10 +626,10 @@ define i32 @test_x86_tbm_t1mskc_u32(i32 %a0, i32* nocapture %p1) nounwind {
 ;
 ; BDVER2-LABEL: test_x86_tbm_t1mskc_u32:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    t1mskcl %edi, %ecx # sched: [1:0.33]
 ; BDVER2-NEXT:    t1mskcl (%rsi), %eax # sched: [6:0.50]
-; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    t1mskcl %edi, %ecx # sched: [2:0.50]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_x86_tbm_t1mskc_u32:
 ; BDVER3:       # %bb.0:
@@ -665,10 +665,10 @@ define i64 @test_x86_tbm_t1mskc_u64(i64 %a0, i64* nocapture %p1) nounwind {
 ;
 ; BDVER2-LABEL: test_x86_tbm_t1mskc_u64:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    t1mskcq %rdi, %rcx # sched: [1:0.33]
 ; BDVER2-NEXT:    t1mskcq (%rsi), %rax # sched: [6:0.50]
-; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    t1mskcq %rdi, %rcx # sched: [2:0.50]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_x86_tbm_t1mskc_u64:
 ; BDVER3:       # %bb.0:
@@ -704,10 +704,10 @@ define i32 @test_x86_tbm_tzmsk_u32(i32 %a0, i32* nocapture %p1) nounwind {
 ;
 ; BDVER2-LABEL: test_x86_tbm_tzmsk_u32:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    tzmskl %edi, %ecx # sched: [1:0.33]
 ; BDVER2-NEXT:    tzmskl (%rsi), %eax # sched: [6:0.50]
-; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    tzmskl %edi, %ecx # sched: [2:0.50]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_x86_tbm_tzmsk_u32:
 ; BDVER3:       # %bb.0:
@@ -743,10 +743,10 @@ define i64 @test_x86_tbm_tzmsk_u64(i64 %a0, i64* nocapture %p1) nounwind {
 ;
 ; BDVER2-LABEL: test_x86_tbm_tzmsk_u64:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    tzmskq %rdi, %rcx # sched: [1:0.33]
 ; BDVER2-NEXT:    tzmskq (%rsi), %rax # sched: [6:0.50]
-; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.33]
-; BDVER2-NEXT:    retq # sched: [1:1.00]
+; BDVER2-NEXT:    tzmskq %rdi, %rcx # sched: [2:0.50]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_x86_tbm_tzmsk_u64:
 ; BDVER3:       # %bb.0:
diff --git a/test/CodeGen/X86/wide-fma-contraction.ll b/test/CodeGen/X86/wide-fma-contraction.ll
index 3ee09dd8f80..d15ced21e95 100644
--- a/test/CodeGen/X86/wide-fma-contraction.ll
+++ b/test/CodeGen/X86/wide-fma-contraction.ll
@@ -30,8 +30,8 @@ define <16 x float> @fmafunc(<16 x float> %a, <16 x float> %b, <16 x float> %c)
 ; CHECK-NOFMA-NEXT:    andl $-32, %esp
 ; CHECK-NOFMA-NEXT:    subl $32, %esp
 ; CHECK-NOFMA-NEXT:    vmulps %ymm2, %ymm0, %ymm0
-; CHECK-NOFMA-NEXT:    vaddps 8(%ebp), %ymm0, %ymm0
 ; CHECK-NOFMA-NEXT:    vmulps %ymm3, %ymm1, %ymm1
+; CHECK-NOFMA-NEXT:    vaddps 8(%ebp), %ymm0, %ymm0
 ; CHECK-NOFMA-NEXT:    vaddps 40(%ebp), %ymm1, %ymm1
 ; CHECK-NOFMA-NEXT:    movl %ebp, %esp
 ; CHECK-NOFMA-NEXT:    popl %ebp
diff --git a/test/CodeGen/X86/x87-schedule.ll b/test/CodeGen/X86/x87-schedule.ll
index 8a338f20748..937a2c4561b 100644
--- a/test/CodeGen/X86/x87-schedule.ll
+++ b/test/CodeGen/X86/x87-schedule.ll
@@ -8,7 +8,7 @@
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
-; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=x86-64 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=bdver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
 
@@ -72,9 +72,9 @@ define void @test_f2xm1() optsize {
 ; BDVER2-LABEL: test_f2xm1:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    f2xm1 # sched: [100:0.33]
+; BDVER2-NEXT:    f2xm1 # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_f2xm1:
 ; BTVER2:       # %bb.0:
@@ -155,7 +155,7 @@ define void @test_fabs() optsize {
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    fabs # sched: [1:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fabs:
 ; BTVER2:       # %bb.0:
@@ -276,12 +276,12 @@ define void @test_fadd(float *%a0, double *%a1) optsize {
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    fadd %st(0), %st(1) # sched: [3:1.00]
-; BDVER2-NEXT:    fadd %st(2) # sched: [3:1.00]
+; BDVER2-NEXT:    fadd %st(0), %st(1) # sched: [5:1.00]
+; BDVER2-NEXT:    fadd %st(2) # sched: [5:1.00]
 ; BDVER2-NEXT:    fadds (%ecx) # sched: [10:1.00]
 ; BDVER2-NEXT:    faddl (%eax) # sched: [10:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fadd:
 ; BTVER2:       # %bb.0:
@@ -412,12 +412,12 @@ define void @test_faddp_fiadd(i16 *%a0, i32 *%a1) optsize {
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    faddp %st(1) # sched: [3:1.00]
-; BDVER2-NEXT:    faddp %st(2) # sched: [3:1.00]
-; BDVER2-NEXT:    fiadds (%ecx) # sched: [13:2.00]
-; BDVER2-NEXT:    fiaddl (%eax) # sched: [13:2.00]
+; BDVER2-NEXT:    faddp %st(1) # sched: [5:1.00]
+; BDVER2-NEXT:    faddp %st(2) # sched: [5:1.00]
+; BDVER2-NEXT:    fiadds (%ecx) # sched: [10:1.00]
+; BDVER2-NEXT:    fiaddl (%eax) # sched: [10:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_faddp_fiadd:
 ; BTVER2:       # %bb.0:
@@ -523,10 +523,10 @@ define void @test_fbld_fbstp(i8* %a0) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    fbld (%eax) # sched: [100:0.33]
-; BDVER2-NEXT:    fbstp (%eax) # sched: [100:0.33]
+; BDVER2-NEXT:    fbld (%eax) # sched: [100:0.50]
+; BDVER2-NEXT:    fbstp (%eax) # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fbld_fbstp:
 ; BTVER2:       # %bb.0:
@@ -611,7 +611,7 @@ define void @test_fchs() optsize {
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    fchs # sched: [1:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fchs:
 ; BTVER2:       # %bb.0:
@@ -698,10 +698,10 @@ define void @test_fclex() optsize {
 ; BDVER2-LABEL: test_fclex:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    wait # sched: [100:0.33]
-; BDVER2-NEXT:    fnclex # sched: [100:0.33]
+; BDVER2-NEXT:    wait # sched: [100:0.50]
+; BDVER2-NEXT:    fnclex # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fclex:
 ; BTVER2:       # %bb.0:
@@ -782,9 +782,9 @@ define void @test_fnclex() optsize {
 ; BDVER2-LABEL: test_fnclex:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    fnclex # sched: [100:0.33]
+; BDVER2-NEXT:    fnclex # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fnclex:
 ; BTVER2:       # %bb.0:
@@ -919,16 +919,16 @@ define void @test_fcmov() optsize {
 ; BDVER2-LABEL: test_fcmov:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    fcmovb %st(1), %st(0) # sched: [3:2.00]
-; BDVER2-NEXT:    fcmovbe %st(1), %st(0) # sched: [3:2.00]
-; BDVER2-NEXT:    fcmove %st(1), %st(0) # sched: [3:2.00]
-; BDVER2-NEXT:    fcmovnb %st(1), %st(0) # sched: [3:2.00]
-; BDVER2-NEXT:    fcmovnbe %st(1), %st(0) # sched: [3:2.00]
-; BDVER2-NEXT:    fcmovne %st(1), %st(0) # sched: [3:2.00]
-; BDVER2-NEXT:    fcmovnu %st(1), %st(0) # sched: [3:2.00]
-; BDVER2-NEXT:    fcmovu %st(1), %st(0) # sched: [3:2.00]
+; BDVER2-NEXT:    fcmovb %st(1), %st(0) # sched: [1:1.00]
+; BDVER2-NEXT:    fcmovbe %st(1), %st(0) # sched: [1:1.00]
+; BDVER2-NEXT:    fcmove %st(1), %st(0) # sched: [1:1.00]
+; BDVER2-NEXT:    fcmovnb %st(1), %st(0) # sched: [1:1.00]
+; BDVER2-NEXT:    fcmovnbe %st(1), %st(0) # sched: [1:1.00]
+; BDVER2-NEXT:    fcmovne %st(1), %st(0) # sched: [1:1.00]
+; BDVER2-NEXT:    fcmovnu %st(1), %st(0) # sched: [1:1.00]
+; BDVER2-NEXT:    fcmovu %st(1), %st(0) # sched: [1:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fcmov:
 ; BTVER2:       # %bb.0:
@@ -1065,10 +1065,10 @@ define void @test_fcom(float *%a0, double *%a1) optsize {
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    fcom %st(1) # sched: [1:1.00]
 ; BDVER2-NEXT:    fcom %st(3) # sched: [1:1.00]
-; BDVER2-NEXT:    fcoms (%ecx) # sched: [8:1.00]
-; BDVER2-NEXT:    fcoml (%eax) # sched: [8:1.00]
+; BDVER2-NEXT:    fcoms (%ecx) # sched: [6:1.00]
+; BDVER2-NEXT:    fcoml (%eax) # sched: [6:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fcom:
 ; BTVER2:       # %bb.0:
@@ -1209,11 +1209,11 @@ define void @test_fcomp_fcompp(float *%a0, double *%a1) optsize {
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    fcomp %st(1) # sched: [1:1.00]
 ; BDVER2-NEXT:    fcomp %st(3) # sched: [1:1.00]
-; BDVER2-NEXT:    fcomps (%ecx) # sched: [8:1.00]
-; BDVER2-NEXT:    fcompl (%eax) # sched: [8:1.00]
-; BDVER2-NEXT:    fcompp # sched: [100:0.33]
+; BDVER2-NEXT:    fcomps (%ecx) # sched: [6:1.00]
+; BDVER2-NEXT:    fcompl (%eax) # sched: [6:1.00]
+; BDVER2-NEXT:    fcompp # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fcomp_fcompp:
 ; BTVER2:       # %bb.0:
@@ -1312,10 +1312,10 @@ define void @test_fcomi_fcomip() optsize {
 ; BDVER2-LABEL: test_fcomi_fcomip:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    fcomi %st(3) # sched: [3:1.00]
-; BDVER2-NEXT:    fcompi %st(3) # sched: [3:1.00]
+; BDVER2-NEXT:    fcomi %st(3) # sched: [1:1.00]
+; BDVER2-NEXT:    fcompi %st(3) # sched: [1:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fcomi_fcomip:
 ; BTVER2:       # %bb.0:
@@ -1396,9 +1396,9 @@ define void @test_fcos() optsize {
 ; BDVER2-LABEL: test_fcos:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    fcos # sched: [100:0.33]
+; BDVER2-NEXT:    fcos # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fcos:
 ; BTVER2:       # %bb.0:
@@ -1477,9 +1477,9 @@ define void @test_fdecstp() optsize {
 ; BDVER2-LABEL: test_fdecstp:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    fdecstp # sched: [1:1.00]
+; BDVER2-NEXT:    fdecstp # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fdecstp:
 ; BTVER2:       # %bb.0:
@@ -1600,12 +1600,12 @@ define void @test_fdiv(float *%a0, double *%a1) optsize {
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    fdiv %st(0), %st(1) # sched: [14:14.00]
-; BDVER2-NEXT:    fdiv %st(2) # sched: [14:14.00]
-; BDVER2-NEXT:    fdivs (%ecx) # sched: [31:1.00]
-; BDVER2-NEXT:    fdivl (%eax) # sched: [31:1.00]
+; BDVER2-NEXT:    fdiv %st(0), %st(1) # sched: [9:9.50]
+; BDVER2-NEXT:    fdiv %st(2) # sched: [9:9.50]
+; BDVER2-NEXT:    fdivs (%ecx) # sched: [14:9.50]
+; BDVER2-NEXT:    fdivl (%eax) # sched: [14:9.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fdiv:
 ; BTVER2:       # %bb.0:
@@ -1736,12 +1736,12 @@ define void @test_fdivp_fidiv(i16 *%a0, i32 *%a1) optsize {
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    fdivp %st(1) # sched: [14:14.00]
-; BDVER2-NEXT:    fdivp %st(2) # sched: [14:14.00]
-; BDVER2-NEXT:    fidivs (%ecx) # sched: [34:1.00]
-; BDVER2-NEXT:    fidivl (%eax) # sched: [34:1.00]
+; BDVER2-NEXT:    fdivp %st(1) # sched: [9:9.50]
+; BDVER2-NEXT:    fdivp %st(2) # sched: [9:9.50]
+; BDVER2-NEXT:    fidivs (%ecx) # sched: [14:9.50]
+; BDVER2-NEXT:    fidivl (%eax) # sched: [14:9.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fdivp_fidiv:
 ; BTVER2:       # %bb.0:
@@ -1872,12 +1872,12 @@ define void @test_fdivr(float *%a0, double *%a1) optsize {
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    fdivr %st(0), %st(1) # sched: [14:14.00]
-; BDVER2-NEXT:    fdivr %st(2) # sched: [14:14.00]
-; BDVER2-NEXT:    fdivrs (%ecx) # sched: [31:1.00]
-; BDVER2-NEXT:    fdivrl (%eax) # sched: [31:1.00]
+; BDVER2-NEXT:    fdivr %st(0), %st(1) # sched: [9:9.50]
+; BDVER2-NEXT:    fdivr %st(2) # sched: [9:9.50]
+; BDVER2-NEXT:    fdivrs (%ecx) # sched: [14:9.50]
+; BDVER2-NEXT:    fdivrl (%eax) # sched: [14:9.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fdivr:
 ; BTVER2:       # %bb.0:
@@ -2008,12 +2008,12 @@ define void @test_fdivrp_fidivr(i16 *%a0, i32 *%a1) optsize {
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    fdivrp %st(1) # sched: [14:14.00]
-; BDVER2-NEXT:    fdivrp %st(2) # sched: [14:14.00]
-; BDVER2-NEXT:    fidivrs (%ecx) # sched: [34:1.00]
-; BDVER2-NEXT:    fidivrl (%eax) # sched: [34:1.00]
+; BDVER2-NEXT:    fdivrp %st(1) # sched: [9:9.50]
+; BDVER2-NEXT:    fdivrp %st(2) # sched: [9:9.50]
+; BDVER2-NEXT:    fidivrs (%ecx) # sched: [14:9.50]
+; BDVER2-NEXT:    fidivrl (%eax) # sched: [14:9.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fdivrp_fidivr:
 ; BTVER2:       # %bb.0:
@@ -2102,9 +2102,9 @@ define void @test_ffree() optsize {
 ; BDVER2-LABEL: test_ffree:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    ffree %st(0) # sched: [1:1.00]
+; BDVER2-NEXT:    ffree %st(0) # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_ffree:
 ; BTVER2:       # %bb.0:
@@ -2225,12 +2225,12 @@ define void @test_ficom(i16 *%a0, i32 *%a1) optsize {
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    ficoms (%ecx) # sched: [11:2.00]
-; BDVER2-NEXT:    ficoml (%eax) # sched: [11:2.00]
-; BDVER2-NEXT:    ficomps (%ecx) # sched: [11:2.00]
-; BDVER2-NEXT:    ficompl (%eax) # sched: [11:2.00]
+; BDVER2-NEXT:    ficoms (%ecx) # sched: [6:1.00]
+; BDVER2-NEXT:    ficoml (%eax) # sched: [6:1.00]
+; BDVER2-NEXT:    ficomps (%ecx) # sched: [6:1.00]
+; BDVER2-NEXT:    ficompl (%eax) # sched: [6:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_ficom:
 ; BTVER2:       # %bb.0:
@@ -2362,11 +2362,11 @@ define void @test_fild(i16 *%a0, i32 *%a1, i64 *%a2) optsize {
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    filds (%edx) # sched: [10:1.00]
-; BDVER2-NEXT:    fildl (%ecx) # sched: [10:1.00]
-; BDVER2-NEXT:    fildll (%eax) # sched: [10:1.00]
+; BDVER2-NEXT:    filds (%edx) # sched: [5:0.50]
+; BDVER2-NEXT:    fildl (%ecx) # sched: [5:0.50]
+; BDVER2-NEXT:    fildll (%eax) # sched: [5:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fild:
 ; BTVER2:       # %bb.0:
@@ -2455,9 +2455,9 @@ define void @test_fincstp() optsize {
 ; BDVER2-LABEL: test_fincstp:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    fincstp # sched: [1:1.00]
+; BDVER2-NEXT:    fincstp # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fincstp:
 ; BTVER2:       # %bb.0:
@@ -2544,10 +2544,10 @@ define void @test_finit() optsize {
 ; BDVER2-LABEL: test_finit:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    wait # sched: [100:0.33]
-; BDVER2-NEXT:    fninit # sched: [5:1.33]
+; BDVER2-NEXT:    wait # sched: [100:0.50]
+; BDVER2-NEXT:    fninit # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_finit:
 ; BTVER2:       # %bb.0:
@@ -2628,9 +2628,9 @@ define void @test_fninit() optsize {
 ; BDVER2-LABEL: test_fninit:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    fninit # sched: [5:1.33]
+; BDVER2-NEXT:    fninit # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fninit:
 ; BTVER2:       # %bb.0:
@@ -2792,16 +2792,16 @@ define void @test_fist_fistp_fisttp(i16* %a0, i32* %a1, i64 *%a2) optsize {
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    fists (%edx) # sched: [9:1.00]
-; BDVER2-NEXT:    fistl (%ecx) # sched: [9:1.00]
-; BDVER2-NEXT:    fistps (%edx) # sched: [9:1.00]
-; BDVER2-NEXT:    fistpl (%ecx) # sched: [9:1.00]
-; BDVER2-NEXT:    fistpll (%eax) # sched: [9:1.00]
-; BDVER2-NEXT:    fisttps (%edx) # sched: [5:1.00]
-; BDVER2-NEXT:    fisttpl (%ecx) # sched: [5:1.00]
-; BDVER2-NEXT:    fisttpll (%eax) # sched: [5:1.00]
+; BDVER2-NEXT:    fists (%edx) # sched: [1:0.50]
+; BDVER2-NEXT:    fistl (%ecx) # sched: [1:0.50]
+; BDVER2-NEXT:    fistps (%edx) # sched: [1:0.50]
+; BDVER2-NEXT:    fistpl (%ecx) # sched: [1:0.50]
+; BDVER2-NEXT:    fistpll (%eax) # sched: [1:0.50]
+; BDVER2-NEXT:    fisttps (%edx) # sched: [1:0.50]
+; BDVER2-NEXT:    fisttpl (%ecx) # sched: [1:0.50]
+; BDVER2-NEXT:    fisttpll (%eax) # sched: [1:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fist_fistp_fisttp:
 ; BTVER2:       # %bb.0:
@@ -2951,12 +2951,12 @@ define void @test_fld(i16* %a0, i32* %a1, i64 *%a2) optsize {
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    fld %st(0) # sched: [1:1.00]
-; BDVER2-NEXT:    flds (%edx) # sched: [9:1.00]
-; BDVER2-NEXT:    fldl (%ecx) # sched: [9:1.00]
-; BDVER2-NEXT:    fldt (%eax) # sched: [9:1.00]
+; BDVER2-NEXT:    fld %st(0) # sched: [1:0.50]
+; BDVER2-NEXT:    flds (%edx) # sched: [5:0.50]
+; BDVER2-NEXT:    fldl (%ecx) # sched: [5:0.50]
+; BDVER2-NEXT:    fldt (%eax) # sched: [5:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fld:
 ; BTVER2:       # %bb.0:
@@ -3064,10 +3064,10 @@ define void @test_fldcw_fldenv(i8* %a0) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    fldcw (%eax) # sched: [8:2.00]
-; BDVER2-NEXT:    fldenv (%eax) # sched: [100:0.33]
+; BDVER2-NEXT:    fldcw (%eax) # sched: [5:0.50]
+; BDVER2-NEXT:    fldenv (%eax) # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fldcw_fldenv:
 ; BTVER2:       # %bb.0:
@@ -3198,15 +3198,15 @@ define void @test_fld1_fldl2e_fldl2t_fldlg2_fldln2_fldpi_fldz() optsize {
 ; BDVER2-LABEL: test_fld1_fldl2e_fldl2t_fldlg2_fldln2_fldpi_fldz:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    fld1 # sched: [1:1.00]
-; BDVER2-NEXT:    fldl2e # sched: [1:1.00]
-; BDVER2-NEXT:    fldl2t # sched: [1:1.00]
-; BDVER2-NEXT:    fldlg2 # sched: [1:1.00]
-; BDVER2-NEXT:    fldln2 # sched: [1:1.00]
-; BDVER2-NEXT:    fldpi # sched: [1:1.00]
-; BDVER2-NEXT:    fldz # sched: [1:1.00]
+; BDVER2-NEXT:    fld1 # sched: [3:1.00]
+; BDVER2-NEXT:    fldl2e # sched: [3:1.00]
+; BDVER2-NEXT:    fldl2t # sched: [3:1.00]
+; BDVER2-NEXT:    fldlg2 # sched: [3:1.00]
+; BDVER2-NEXT:    fldln2 # sched: [3:1.00]
+; BDVER2-NEXT:    fldpi # sched: [3:1.00]
+; BDVER2-NEXT:    fldz # sched: [3:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fld1_fldl2e_fldl2t_fldlg2_fldln2_fldpi_fldz:
 ; BTVER2:       # %bb.0:
@@ -3341,10 +3341,10 @@ define void @test_fmul(float *%a0, double *%a1) optsize {
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    fmul %st(0), %st(1) # sched: [5:1.00]
 ; BDVER2-NEXT:    fmul %st(2) # sched: [5:1.00]
-; BDVER2-NEXT:    fmuls (%ecx) # sched: [12:1.00]
-; BDVER2-NEXT:    fmull (%eax) # sched: [12:1.00]
+; BDVER2-NEXT:    fmuls (%ecx) # sched: [10:1.00]
+; BDVER2-NEXT:    fmull (%eax) # sched: [10:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fmul:
 ; BTVER2:       # %bb.0:
@@ -3477,10 +3477,10 @@ define void @test_fmulp_fimul(i16 *%a0, i32 *%a1) optsize {
 ; BDVER2-NEXT:    #APP
 ; BDVER2-NEXT:    fmulp %st(1) # sched: [5:1.00]
 ; BDVER2-NEXT:    fmulp %st(2) # sched: [5:1.00]
-; BDVER2-NEXT:    fimuls (%ecx) # sched: [15:1.00]
-; BDVER2-NEXT:    fimull (%eax) # sched: [15:1.00]
+; BDVER2-NEXT:    fimuls (%ecx) # sched: [10:1.00]
+; BDVER2-NEXT:    fimull (%eax) # sched: [10:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fmulp_fimul:
 ; BTVER2:       # %bb.0:
@@ -3569,9 +3569,9 @@ define void @test_fnop() optsize {
 ; BDVER2-LABEL: test_fnop:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    fnop # sched: [1:1.00]
+; BDVER2-NEXT:    fnop # sched: [1:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fnop:
 ; BTVER2:       # %bb.0:
@@ -3650,9 +3650,9 @@ define void @test_fpatan() optsize {
 ; BDVER2-LABEL: test_fpatan:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    fpatan # sched: [100:0.33]
+; BDVER2-NEXT:    fpatan # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fpatan:
 ; BTVER2:       # %bb.0:
@@ -3739,10 +3739,10 @@ define void @test_fprem_fprem1() optsize {
 ; BDVER2-LABEL: test_fprem_fprem1:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    fprem # sched: [100:0.33]
-; BDVER2-NEXT:    fprem1 # sched: [100:0.33]
+; BDVER2-NEXT:    fprem # sched: [100:0.50]
+; BDVER2-NEXT:    fprem1 # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fprem_fprem1:
 ; BTVER2:       # %bb.0:
@@ -3823,9 +3823,9 @@ define void @test_fptan() optsize {
 ; BDVER2-LABEL: test_fptan:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    fptan # sched: [100:0.33]
+; BDVER2-NEXT:    fptan # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fptan:
 ; BTVER2:       # %bb.0:
@@ -3904,9 +3904,9 @@ define void @test_frndint() optsize {
 ; BDVER2-LABEL: test_frndint:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    frndint # sched: [100:0.33]
+; BDVER2-NEXT:    frndint # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_frndint:
 ; BTVER2:       # %bb.0:
@@ -3994,9 +3994,9 @@ define void @test_frstor(i8* %a0) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    frstor (%eax) # sched: [100:0.33]
+; BDVER2-NEXT:    frstor (%eax) # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_frstor:
 ; BTVER2:       # %bb.0:
@@ -4094,10 +4094,10 @@ define void @test_fsave(i8* %a0) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    wait # sched: [100:0.33]
-; BDVER2-NEXT:    fnsave (%eax) # sched: [100:0.33]
+; BDVER2-NEXT:    wait # sched: [100:0.50]
+; BDVER2-NEXT:    fnsave (%eax) # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fsave:
 ; BTVER2:       # %bb.0:
@@ -4189,9 +4189,9 @@ define void @test_fnsave(i8* %a0) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    fnsave (%eax) # sched: [100:0.33]
+; BDVER2-NEXT:    fnsave (%eax) # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fnsave:
 ; BTVER2:       # %bb.0:
@@ -4272,9 +4272,9 @@ define void @test_fscale() optsize {
 ; BDVER2-LABEL: test_fscale:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    fscale # sched: [100:0.33]
+; BDVER2-NEXT:    fscale # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fscale:
 ; BTVER2:       # %bb.0:
@@ -4353,9 +4353,9 @@ define void @test_fsin() optsize {
 ; BDVER2-LABEL: test_fsin:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    fsin # sched: [100:0.33]
+; BDVER2-NEXT:    fsin # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fsin:
 ; BTVER2:       # %bb.0:
@@ -4434,9 +4434,9 @@ define void @test_fsincos() optsize {
 ; BDVER2-LABEL: test_fsincos:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    fsincos # sched: [100:0.33]
+; BDVER2-NEXT:    fsincos # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fsincos:
 ; BTVER2:       # %bb.0:
@@ -4515,9 +4515,9 @@ define void @test_fsqrt() optsize {
 ; BDVER2-LABEL: test_fsqrt:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    fsqrt # sched: [24:24.00]
+; BDVER2-NEXT:    fsqrt # sched: [1:17.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fsqrt:
 ; BTVER2:       # %bb.0:
@@ -4671,15 +4671,15 @@ define void @test_fst_fstp(i16* %a0, i32* %a1, i64 *%a2) optsize {
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    fst %st(0) # sched: [1:1.00]
-; BDVER2-NEXT:    fsts (%edx) # sched: [6:1.00]
-; BDVER2-NEXT:    fstl (%ecx) # sched: [6:1.00]
-; BDVER2-NEXT:    fstp %st(0) # sched: [1:1.00]
-; BDVER2-NEXT:    fstpl (%edx) # sched: [6:1.00]
-; BDVER2-NEXT:    fstpl (%ecx) # sched: [6:1.00]
-; BDVER2-NEXT:    fstpt (%eax) # sched: [6:1.00]
+; BDVER2-NEXT:    fst %st(0) # sched: [1:0.50]
+; BDVER2-NEXT:    fsts (%edx) # sched: [1:0.50]
+; BDVER2-NEXT:    fstl (%ecx) # sched: [1:0.50]
+; BDVER2-NEXT:    fstp %st(0) # sched: [1:0.50]
+; BDVER2-NEXT:    fstpl (%edx) # sched: [1:0.50]
+; BDVER2-NEXT:    fstpl (%ecx) # sched: [1:0.50]
+; BDVER2-NEXT:    fstpt (%eax) # sched: [1:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fst_fstp:
 ; BTVER2:       # %bb.0:
@@ -4825,14 +4825,14 @@ define void @test_fstcw_fstenv_fstsw(i8* %a0) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    wait # sched: [100:0.33]
-; BDVER2-NEXT:    fnstcw (%eax) # sched: [7:1.00]
-; BDVER2-NEXT:    wait # sched: [100:0.33]
-; BDVER2-NEXT:    fnstenv (%eax) # sched: [100:0.33]
-; BDVER2-NEXT:    wait # sched: [100:0.33]
-; BDVER2-NEXT:    fnstsw (%eax) # sched: [7:1.00]
+; BDVER2-NEXT:    wait # sched: [100:0.50]
+; BDVER2-NEXT:    fnstcw (%eax) # sched: [1:0.50]
+; BDVER2-NEXT:    wait # sched: [100:0.50]
+; BDVER2-NEXT:    fnstenv (%eax) # sched: [100:0.50]
+; BDVER2-NEXT:    wait # sched: [100:0.50]
+; BDVER2-NEXT:    fnstsw (%eax) # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fstcw_fstenv_fstsw:
 ; BTVER2:       # %bb.0:
@@ -4948,11 +4948,11 @@ define void @test_fnstcw_fnstenv_fnstsw(i8* %a0) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    fnstcw (%eax) # sched: [7:1.00]
-; BDVER2-NEXT:    fnstenv (%eax) # sched: [100:0.33]
-; BDVER2-NEXT:    fnstsw (%eax) # sched: [7:1.00]
+; BDVER2-NEXT:    fnstcw (%eax) # sched: [1:0.50]
+; BDVER2-NEXT:    fnstenv (%eax) # sched: [100:0.50]
+; BDVER2-NEXT:    fnstsw (%eax) # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fnstcw_fnstenv_fnstsw:
 ; BTVER2:       # %bb.0:
@@ -5079,12 +5079,12 @@ define void @test_fsub(float *%a0, double *%a1) optsize {
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    fsub %st(0), %st(1) # sched: [3:1.00]
-; BDVER2-NEXT:    fsub %st(2) # sched: [3:1.00]
+; BDVER2-NEXT:    fsub %st(0), %st(1) # sched: [5:1.00]
+; BDVER2-NEXT:    fsub %st(2) # sched: [5:1.00]
 ; BDVER2-NEXT:    fsubs (%ecx) # sched: [10:1.00]
 ; BDVER2-NEXT:    fsubl (%eax) # sched: [10:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fsub:
 ; BTVER2:       # %bb.0:
@@ -5215,12 +5215,12 @@ define void @test_fsubp_fisub(i16 *%a0, i32 *%a1) optsize {
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    fsubp %st(1) # sched: [3:1.00]
-; BDVER2-NEXT:    fsubp %st(2) # sched: [3:1.00]
-; BDVER2-NEXT:    fisubs (%ecx) # sched: [13:2.00]
-; BDVER2-NEXT:    fisubl (%eax) # sched: [13:2.00]
+; BDVER2-NEXT:    fsubp %st(1) # sched: [5:1.00]
+; BDVER2-NEXT:    fsubp %st(2) # sched: [5:1.00]
+; BDVER2-NEXT:    fisubs (%ecx) # sched: [10:1.00]
+; BDVER2-NEXT:    fisubl (%eax) # sched: [10:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fsubp_fisub:
 ; BTVER2:       # %bb.0:
@@ -5351,12 +5351,12 @@ define void @test_fsubr(float *%a0, double *%a1) optsize {
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    fsubr %st(0), %st(1) # sched: [3:1.00]
-; BDVER2-NEXT:    fsubr %st(2) # sched: [3:1.00]
+; BDVER2-NEXT:    fsubr %st(0), %st(1) # sched: [5:1.00]
+; BDVER2-NEXT:    fsubr %st(2) # sched: [5:1.00]
 ; BDVER2-NEXT:    fsubrs (%ecx) # sched: [10:1.00]
 ; BDVER2-NEXT:    fsubrl (%eax) # sched: [10:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fsubr:
 ; BTVER2:       # %bb.0:
@@ -5487,12 +5487,12 @@ define void @test_fsubrp_fisubr(i16 *%a0, i32 *%a1) optsize {
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    fsubrp %st(1) # sched: [3:1.00]
-; BDVER2-NEXT:    fsubrp %st(2) # sched: [3:1.00]
-; BDVER2-NEXT:    fisubrs (%ecx) # sched: [13:2.00]
-; BDVER2-NEXT:    fisubrl (%eax) # sched: [13:2.00]
+; BDVER2-NEXT:    fsubrp %st(1) # sched: [5:1.00]
+; BDVER2-NEXT:    fsubrp %st(2) # sched: [5:1.00]
+; BDVER2-NEXT:    fisubrs (%ecx) # sched: [10:1.00]
+; BDVER2-NEXT:    fisubrl (%eax) # sched: [10:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fsubrp_fisubr:
 ; BTVER2:       # %bb.0:
@@ -5581,9 +5581,9 @@ define void @test_ftst() optsize {
 ; BDVER2-LABEL: test_ftst:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    ftst # sched: [3:1.00]
+; BDVER2-NEXT:    ftst # sched: [1:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_ftst:
 ; BTVER2:       # %bb.0:
@@ -5698,9 +5698,9 @@ define void @test_fucom_fucomp_fucompp() optsize {
 ; BDVER2-NEXT:    fucom %st(3) # sched: [1:1.00]
 ; BDVER2-NEXT:    fucomp %st(1) # sched: [1:1.00]
 ; BDVER2-NEXT:    fucomp %st(3) # sched: [1:1.00]
-; BDVER2-NEXT:    fucompp # sched: [3:1.00]
+; BDVER2-NEXT:    fucompp # sched: [1:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fucom_fucomp_fucompp:
 ; BTVER2:       # %bb.0:
@@ -5795,10 +5795,10 @@ define void @test_fucomi_fucomip() optsize {
 ; BDVER2-LABEL: test_fucomi_fucomip:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    fucomi %st(3) # sched: [3:1.00]
-; BDVER2-NEXT:    fucompi %st(3) # sched: [3:1.00]
+; BDVER2-NEXT:    fucomi %st(3) # sched: [1:1.00]
+; BDVER2-NEXT:    fucompi %st(3) # sched: [1:1.00]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fucomi_fucomip:
 ; BTVER2:       # %bb.0:
@@ -5879,9 +5879,9 @@ define void @test_fwait() optsize {
 ; BDVER2-LABEL: test_fwait:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    wait # sched: [100:0.33]
+; BDVER2-NEXT:    wait # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fwait:
 ; BTVER2:       # %bb.0:
@@ -5960,9 +5960,9 @@ define void @test_fxam() optsize {
 ; BDVER2-LABEL: test_fxam:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    fxam # sched: [100:0.33]
+; BDVER2-NEXT:    fxam # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fxam:
 ; BTVER2:       # %bb.0:
@@ -6049,10 +6049,10 @@ define void @test_fxch() optsize {
 ; BDVER2-LABEL: test_fxch:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    fxch %st(1) # sched: [1:0.33]
-; BDVER2-NEXT:    fxch %st(3) # sched: [1:0.33]
+; BDVER2-NEXT:    fxch %st(1) # sched: [1:0.50]
+; BDVER2-NEXT:    fxch %st(3) # sched: [1:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fxch:
 ; BTVER2:       # %bb.0:
@@ -6150,10 +6150,10 @@ define void @test_fxrstor_fxsave(i8* %a0) optsize {
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    fxrstor (%eax) # sched: [5:2.00]
-; BDVER2-NEXT:    fxsave (%eax) # sched: [100:0.33]
+; BDVER2-NEXT:    fxrstor (%eax) # sched: [100:0.50]
+; BDVER2-NEXT:    fxsave (%eax) # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fxrstor_fxsave:
 ; BTVER2:       # %bb.0:
@@ -6236,9 +6236,9 @@ define void @test_fxtract() optsize {
 ; BDVER2-LABEL: test_fxtract:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    fxtract # sched: [100:0.33]
+; BDVER2-NEXT:    fxtract # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fxtract:
 ; BTVER2:       # %bb.0:
@@ -6317,9 +6317,9 @@ define void @test_fyl2x() optsize {
 ; BDVER2-LABEL: test_fyl2x:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    fyl2x # sched: [100:0.33]
+; BDVER2-NEXT:    fyl2x # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fyl2x:
 ; BTVER2:       # %bb.0:
@@ -6398,9 +6398,9 @@ define void @test_fyl2xp1() optsize {
 ; BDVER2-LABEL: test_fyl2xp1:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    #APP
-; BDVER2-NEXT:    fyl2xp1 # sched: [100:0.33]
+; BDVER2-NEXT:    fyl2xp1 # sched: [100:0.50]
 ; BDVER2-NEXT:    #NO_APP
-; BDVER2-NEXT:    retl # sched: [6:1.00]
+; BDVER2-NEXT:    retl # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: test_fyl2xp1:
 ; BTVER2:       # %bb.0:
diff --git a/test/CodeGen/X86/xop-schedule.ll b/test/CodeGen/X86/xop-schedule.ll
index ffa3152f926..ba0073bc63d 100644
--- a/test/CodeGen/X86/xop-schedule.ll
+++ b/test/CodeGen/X86/xop-schedule.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+xop | FileCheck %s --check-prefix=GENERIC
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+xop | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER12 --check-prefix=BDVER1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+xop | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER12 --check-prefix=BDVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver1 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER12 --check-prefix=BDVER1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER12 --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver3 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER3
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver4 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER4
 
@@ -20,13 +20,13 @@ define void @test_vfrczpd(<2 x double> %a0, <4 x double> %a1, <2 x double> *%a2,
 ; BDVER12-LABEL: test_vfrczpd:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
-; BDVER12-NEXT:    vfrczpd %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER12-NEXT:    vfrczpd %ymm1, %ymm1 # sched: [3:1.00]
-; BDVER12-NEXT:    vfrczpd (%rdi), %xmm0 # sched: [9:1.00]
-; BDVER12-NEXT:    vfrczpd (%rsi), %ymm1 # sched: [10:1.00]
+; BDVER12-NEXT:    vfrczpd %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER12-NEXT:    vfrczpd %ymm1, %ymm1 # sched: [10:2.00]
+; BDVER12-NEXT:    vfrczpd (%rdi), %xmm0 # sched: [15:1.00]
+; BDVER12-NEXT:    vfrczpd (%rsi), %ymm1 # sched: [15:2.00]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    vzeroupper # sched: [100:0.33]
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_vfrczpd:
 ; BDVER3:       # %bb.0:
@@ -68,13 +68,13 @@ define void @test_vfrczps(<4 x float> %a0, <4 x double> %a1, <4 x float> *%a2, <
 ; BDVER12-LABEL: test_vfrczps:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
-; BDVER12-NEXT:    vfrczps %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER12-NEXT:    vfrczps %ymm1, %ymm1 # sched: [3:1.00]
-; BDVER12-NEXT:    vfrczps (%rdi), %xmm0 # sched: [9:1.00]
-; BDVER12-NEXT:    vfrczps (%rsi), %ymm1 # sched: [10:1.00]
+; BDVER12-NEXT:    vfrczps %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER12-NEXT:    vfrczps %ymm1, %ymm1 # sched: [10:2.00]
+; BDVER12-NEXT:    vfrczps (%rdi), %xmm0 # sched: [15:1.00]
+; BDVER12-NEXT:    vfrczps (%rsi), %ymm1 # sched: [15:2.00]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    vzeroupper # sched: [100:0.33]
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_vfrczps:
 ; BDVER3:       # %bb.0:
@@ -113,10 +113,10 @@ define void @test_vfrczsd(<2 x double> %a0, <2 x double> *%a1) {
 ; BDVER12-LABEL: test_vfrczsd:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
-; BDVER12-NEXT:    vfrczsd %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER12-NEXT:    vfrczsd (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER12-NEXT:    vfrczsd %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER12-NEXT:    vfrczsd (%rdi), %xmm0 # sched: [15:1.00]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_vfrczsd:
 ; BDVER3:       # %bb.0:
@@ -149,10 +149,10 @@ define void @test_vfrczss(<4 x float> %a0, <4 x double> *%a1) {
 ; BDVER12-LABEL: test_vfrczss:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
-; BDVER12-NEXT:    vfrczss %xmm0, %xmm0 # sched: [3:1.00]
-; BDVER12-NEXT:    vfrczss (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER12-NEXT:    vfrczss %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER12-NEXT:    vfrczss (%rdi), %xmm0 # sched: [15:1.00]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_vfrczss:
 ; BDVER3:       # %bb.0:
@@ -186,11 +186,11 @@ define void @test_vpcmov_128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i
 ; BDVER12-LABEL: test_vpcmov_128:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
-; BDVER12-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER12-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER12-NEXT:    vpcmov (%rdi), %xmm1, %xmm0, %xmm0 # sched: [7:0.50]
 ; BDVER12-NEXT:    vpcmov %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_vpcmov_128:
 ; BDVER3:       # %bb.0:
@@ -227,12 +227,12 @@ define void @test_vpcmov_256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2, <4 x i
 ; BDVER12-LABEL: test_vpcmov_256:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
-; BDVER12-NEXT:    vpcmov %ymm2, %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; BDVER12-NEXT:    vpcmov (%rdi), %ymm1, %ymm0, %ymm0 # sched: [8:1.00]
-; BDVER12-NEXT:    vpcmov %ymm2, (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BDVER12-NEXT:    vpcmov %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vpcmov (%rdi), %ymm1, %ymm0, %ymm0 # sched: [7:1.00]
+; BDVER12-NEXT:    vpcmov %ymm2, (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    vzeroupper # sched: [100:0.33]
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_vpcmov_256:
 ; BDVER3:       # %bb.0:
@@ -275,16 +275,16 @@ define void @test_vpcom(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; BDVER12-LABEL: test_vpcom:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
-; BDVER12-NEXT:    vpcomb $3, %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER12-NEXT:    vpcomd $3, %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER12-NEXT:    vpcomq $3, %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER12-NEXT:    vpcomw $3, %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER12-NEXT:    vpcomb $3, %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vpcomd $3, %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vpcomq $3, %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vpcomw $3, %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER12-NEXT:    vpcomb $3, (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; BDVER12-NEXT:    vpcomd $3, (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; BDVER12-NEXT:    vpcomq $3, (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; BDVER12-NEXT:    vpcomw $3, (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_vpcom:
 ; BDVER3:       # %bb.0:
@@ -335,16 +335,16 @@ define void @test_vpcomu(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; BDVER12-LABEL: test_vpcomu:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
-; BDVER12-NEXT:    vpcomub $3, %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER12-NEXT:    vpcomud $3, %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER12-NEXT:    vpcomuq $3, %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER12-NEXT:    vpcomuw $3, %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BDVER12-NEXT:    vpcomub $3, %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vpcomud $3, %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vpcomuq $3, %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vpcomuw $3, %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
 ; BDVER12-NEXT:    vpcomub $3, (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; BDVER12-NEXT:    vpcomud $3, (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; BDVER12-NEXT:    vpcomuq $3, (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; BDVER12-NEXT:    vpcomuw $3, (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_vpcomu:
 ; BDVER3:       # %bb.0:
@@ -390,11 +390,11 @@ define void @test_vpermil2pd_128(<2 x double> %a0, <2 x double> %a1, <2 x double
 ; BDVER12-LABEL: test_vpermil2pd_128:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
-; BDVER12-NEXT:    vpermil2pd $3, %xmm2, %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; BDVER12-NEXT:    vpermil2pd $3, %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; BDVER12-NEXT:    vpermil2pd $3, (%rdi), %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER12-NEXT:    vpermil2pd $3, %xmm2, %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
+; BDVER12-NEXT:    vpermil2pd $3, %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:2.00]
+; BDVER12-NEXT:    vpermil2pd $3, (%rdi), %xmm1, %xmm0, %xmm0 # sched: [8:2.00]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_vpermil2pd_128:
 ; BDVER3:       # %bb.0:
@@ -431,12 +431,12 @@ define void @test_vpermil2pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double
 ; BDVER12-LABEL: test_vpermil2pd_256:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
-; BDVER12-NEXT:    vpermil2pd $3, %ymm2, %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; BDVER12-NEXT:    vpermil2pd $3, %ymm2, (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
-; BDVER12-NEXT:    vpermil2pd $3, (%rdi), %ymm1, %ymm0, %ymm0 # sched: [8:1.00]
+; BDVER12-NEXT:    vpermil2pd $3, %ymm2, %ymm1, %ymm0, %ymm0 # sched: [3:3.00]
+; BDVER12-NEXT:    vpermil2pd $3, %ymm2, (%rdi), %ymm0, %ymm0 # sched: [8:3.00]
+; BDVER12-NEXT:    vpermil2pd $3, (%rdi), %ymm1, %ymm0, %ymm0 # sched: [8:3.00]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    vzeroupper # sched: [100:0.33]
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_vpermil2pd_256:
 ; BDVER3:       # %bb.0:
@@ -474,11 +474,11 @@ define void @test_vpermil2ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %
 ; BDVER12-LABEL: test_vpermil2ps_128:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
-; BDVER12-NEXT:    vpermil2ps $3, %xmm2, %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; BDVER12-NEXT:    vpermil2ps $3, %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; BDVER12-NEXT:    vpermil2ps $3, (%rdi), %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER12-NEXT:    vpermil2ps $3, %xmm2, %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
+; BDVER12-NEXT:    vpermil2ps $3, %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:2.00]
+; BDVER12-NEXT:    vpermil2ps $3, (%rdi), %xmm1, %xmm0, %xmm0 # sched: [8:2.00]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_vpermil2ps_128:
 ; BDVER3:       # %bb.0:
@@ -515,12 +515,12 @@ define void @test_vpermil2ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %
 ; BDVER12-LABEL: test_vpermil2ps_256:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
-; BDVER12-NEXT:    vpermil2ps $3, %ymm2, %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; BDVER12-NEXT:    vpermil2ps $3, %ymm2, (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
-; BDVER12-NEXT:    vpermil2ps $3, (%rdi), %ymm1, %ymm0, %ymm0 # sched: [8:1.00]
+; BDVER12-NEXT:    vpermil2ps $3, %ymm2, %ymm1, %ymm0, %ymm0 # sched: [3:3.00]
+; BDVER12-NEXT:    vpermil2ps $3, %ymm2, (%rdi), %ymm0, %ymm0 # sched: [8:3.00]
+; BDVER12-NEXT:    vpermil2ps $3, (%rdi), %ymm1, %ymm0, %ymm0 # sched: [8:3.00]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    vzeroupper # sched: [100:0.33]
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_vpermil2ps_256:
 ; BDVER3:       # %bb.0:
@@ -557,10 +557,10 @@ define void @test_vphaddbd(<2 x i64> %a0, <2 x i64> *%a1) {
 ; BDVER12-LABEL: test_vphaddbd:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
-; BDVER12-NEXT:    vphaddbd %xmm0, %xmm0 # sched: [3:1.50]
-; BDVER12-NEXT:    vphaddbd (%rdi), %xmm0 # sched: [9:1.50]
+; BDVER12-NEXT:    vphaddbd %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vphaddbd (%rdi), %xmm0 # sched: [7:0.50]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_vphaddbd:
 ; BDVER3:       # %bb.0:
@@ -593,10 +593,10 @@ define void @test_vphaddbq(<2 x i64> %a0, <2 x i64> *%a1) {
 ; BDVER12-LABEL: test_vphaddbq:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
-; BDVER12-NEXT:    vphaddbq %xmm0, %xmm0 # sched: [3:1.50]
-; BDVER12-NEXT:    vphaddbq (%rdi), %xmm0 # sched: [9:1.50]
+; BDVER12-NEXT:    vphaddbq %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vphaddbq (%rdi), %xmm0 # sched: [7:0.50]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_vphaddbq:
 ; BDVER3:       # %bb.0:
@@ -629,10 +629,10 @@ define void @test_vphaddbw(<2 x i64> %a0, <2 x i64> *%a1) {
 ; BDVER12-LABEL: test_vphaddbw:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
-; BDVER12-NEXT:    vphaddbw %xmm0, %xmm0 # sched: [3:1.50]
-; BDVER12-NEXT:    vphaddbw (%rdi), %xmm0 # sched: [9:1.50]
+; BDVER12-NEXT:    vphaddbw %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vphaddbw (%rdi), %xmm0 # sched: [7:0.50]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_vphaddbw:
 ; BDVER3:       # %bb.0:
@@ -665,10 +665,10 @@ define void @test_vphadddq(<2 x i64> %a0, <2 x i64> *%a1) {
 ; BDVER12-LABEL: test_vphadddq:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
-; BDVER12-NEXT:    vphadddq %xmm0, %xmm0 # sched: [3:1.50]
-; BDVER12-NEXT:    vphadddq (%rdi), %xmm0 # sched: [9:1.50]
+; BDVER12-NEXT:    vphadddq %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vphadddq (%rdi), %xmm0 # sched: [7:0.50]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_vphadddq:
 ; BDVER3:       # %bb.0:
@@ -701,10 +701,10 @@ define void @test_vphaddubd(<2 x i64> %a0, <2 x i64> *%a1) {
 ; BDVER12-LABEL: test_vphaddubd:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
-; BDVER12-NEXT:    vphaddubd %xmm0, %xmm0 # sched: [3:1.50]
-; BDVER12-NEXT:    vphaddubd (%rdi), %xmm0 # sched: [9:1.50]
+; BDVER12-NEXT:    vphaddubd %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vphaddubd (%rdi), %xmm0 # sched: [7:0.50]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_vphaddubd:
 ; BDVER3:       # %bb.0:
@@ -737,10 +737,10 @@ define void @test_vphaddubq(<2 x i64> %a0, <2 x i64> *%a1) {
 ; BDVER12-LABEL: test_vphaddubq:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
-; BDVER12-NEXT:    vphaddubq %xmm0, %xmm0 # sched: [3:1.50]
-; BDVER12-NEXT:    vphaddubq (%rdi), %xmm0 # sched: [9:1.50]
+; BDVER12-NEXT:    vphaddubq %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vphaddubq (%rdi), %xmm0 # sched: [7:0.50]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_vphaddubq:
 ; BDVER3:       # %bb.0:
@@ -773,10 +773,10 @@ define void @test_vphaddubw(<2 x i64> %a0, <2 x i64> *%a1) {
 ; BDVER12-LABEL: test_vphaddubw:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
-; BDVER12-NEXT:    vphaddubw %xmm0, %xmm0 # sched: [3:1.50]
-; BDVER12-NEXT:    vphaddubw (%rdi), %xmm0 # sched: [9:1.50]
+; BDVER12-NEXT:    vphaddubw %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vphaddubw (%rdi), %xmm0 # sched: [7:0.50]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_vphaddubw:
 ; BDVER3:       # %bb.0:
@@ -809,10 +809,10 @@ define void @test_vphaddudq(<2 x i64> %a0, <2 x i64> *%a1) {
 ; BDVER12-LABEL: test_vphaddudq:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
-; BDVER12-NEXT:    vphaddudq %xmm0, %xmm0 # sched: [3:1.50]
-; BDVER12-NEXT:    vphaddudq (%rdi), %xmm0 # sched: [9:1.50]
+; BDVER12-NEXT:    vphaddudq %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vphaddudq (%rdi), %xmm0 # sched: [7:0.50]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_vphaddudq:
 ; BDVER3:       # %bb.0:
@@ -845,10 +845,10 @@ define void @test_vphadduwd(<2 x i64> %a0, <2 x i64> *%a1) {
 ; BDVER12-LABEL: test_vphadduwd:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
-; BDVER12-NEXT:    vphadduwd %xmm0, %xmm0 # sched: [3:1.50]
-; BDVER12-NEXT:    vphadduwd (%rdi), %xmm0 # sched: [9:1.50]
+; BDVER12-NEXT:    vphadduwd %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vphadduwd (%rdi), %xmm0 # sched: [7:0.50]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_vphadduwd:
 ; BDVER3:       # %bb.0:
@@ -881,10 +881,10 @@ define void @test_vphadduwq(<2 x i64> %a0, <2 x i64> *%a1) {
 ; BDVER12-LABEL: test_vphadduwq:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
-; BDVER12-NEXT:    vphadduwq %xmm0, %xmm0 # sched: [3:1.50]
-; BDVER12-NEXT:    vphadduwq (%rdi), %xmm0 # sched: [9:1.50]
+; BDVER12-NEXT:    vphadduwq %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vphadduwq (%rdi), %xmm0 # sched: [7:0.50]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_vphadduwq:
 ; BDVER3:       # %bb.0:
@@ -917,10 +917,10 @@ define void @test_vphaddwd(<2 x i64> %a0, <2 x i64> *%a1) {
 ; BDVER12-LABEL: test_vphaddwd:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
-; BDVER12-NEXT:    vphaddwd %xmm0, %xmm0 # sched: [3:1.50]
-; BDVER12-NEXT:    vphaddwd (%rdi), %xmm0 # sched: [9:1.50]
+; BDVER12-NEXT:    vphaddwd %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vphaddwd (%rdi), %xmm0 # sched: [7:0.50]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_vphaddwd:
 ; BDVER3:       # %bb.0:
@@ -953,10 +953,10 @@ define void @test_vphaddwq(<2 x i64> %a0, <2 x i64> *%a1) {
 ; BDVER12-LABEL: test_vphaddwq:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
-; BDVER12-NEXT:    vphaddwq %xmm0, %xmm0 # sched: [3:1.50]
-; BDVER12-NEXT:    vphaddwq (%rdi), %xmm0 # sched: [9:1.50]
+; BDVER12-NEXT:    vphaddwq %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vphaddwq (%rdi), %xmm0 # sched: [7:0.50]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_vphaddwq:
 ; BDVER3:       # %bb.0:
@@ -989,10 +989,10 @@ define void @test_vphsubbw(<2 x i64> %a0, <2 x i64> *%a1) {
 ; BDVER12-LABEL: test_vphsubbw:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
-; BDVER12-NEXT:    vphsubbw %xmm0, %xmm0 # sched: [3:1.50]
-; BDVER12-NEXT:    vphsubbw (%rdi), %xmm0 # sched: [9:1.50]
+; BDVER12-NEXT:    vphsubbw %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vphsubbw (%rdi), %xmm0 # sched: [7:0.50]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_vphsubbw:
 ; BDVER3:       # %bb.0:
@@ -1025,10 +1025,10 @@ define void @test_vphsubdq(<2 x i64> %a0, <2 x i64> *%a1) {
 ; BDVER12-LABEL: test_vphsubdq:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
-; BDVER12-NEXT:    vphsubdq %xmm0, %xmm0 # sched: [3:1.50]
-; BDVER12-NEXT:    vphsubdq (%rdi), %xmm0 # sched: [9:1.50]
+; BDVER12-NEXT:    vphsubdq %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vphsubdq (%rdi), %xmm0 # sched: [7:0.50]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_vphsubdq:
 ; BDVER3:       # %bb.0:
@@ -1061,10 +1061,10 @@ define void @test_vphsubwd(<2 x i64> %a0, <2 x i64> *%a1) {
 ; BDVER12-LABEL: test_vphsubwd:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
-; BDVER12-NEXT:    vphsubwd %xmm0, %xmm0 # sched: [3:1.50]
-; BDVER12-NEXT:    vphsubwd (%rdi), %xmm0 # sched: [9:1.50]
+; BDVER12-NEXT:    vphsubwd %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vphsubwd (%rdi), %xmm0 # sched: [7:0.50]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_vphsubwd:
 ; BDVER3:       # %bb.0:
@@ -1097,10 +1097,10 @@ define void @test_vpmacsdd(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64
 ; BDVER12-LABEL: test_vpmacsdd:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
-; BDVER12-NEXT:    vpmacsdd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; BDVER12-NEXT:    vpmacsdd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER12-NEXT:    vpmacsdd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
+; BDVER12-NEXT:    vpmacsdd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [10:2.00]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_vpmacsdd:
 ; BDVER3:       # %bb.0:
@@ -1133,10 +1133,10 @@ define void @test_vpmacsdqh(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i6
 ; BDVER12-LABEL: test_vpmacsdqh:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
-; BDVER12-NEXT:    vpmacsdqh %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; BDVER12-NEXT:    vpmacsdqh %xmm2, (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER12-NEXT:    vpmacsdqh %xmm2, %xmm1, %xmm0, %xmm0 # sched: [4:2.00]
+; BDVER12-NEXT:    vpmacsdqh %xmm2, (%rdi), %xmm0, %xmm0 # sched: [10:2.00]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_vpmacsdqh:
 ; BDVER3:       # %bb.0:
@@ -1169,10 +1169,10 @@ define void @test_vpmacsdql(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i6
 ; BDVER12-LABEL: test_vpmacsdql:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
-; BDVER12-NEXT:    vpmacsdql %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; BDVER12-NEXT:    vpmacsdql %xmm2, (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER12-NEXT:    vpmacsdql %xmm2, %xmm1, %xmm0, %xmm0 # sched: [4:2.00]
+; BDVER12-NEXT:    vpmacsdql %xmm2, (%rdi), %xmm0, %xmm0 # sched: [10:2.00]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_vpmacsdql:
 ; BDVER3:       # %bb.0:
@@ -1205,10 +1205,10 @@ define void @test_vpmacssdd(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i6
 ; BDVER12-LABEL: test_vpmacssdd:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
-; BDVER12-NEXT:    vpmacssdd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; BDVER12-NEXT:    vpmacssdd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER12-NEXT:    vpmacssdd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
+; BDVER12-NEXT:    vpmacssdd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [10:2.00]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_vpmacssdd:
 ; BDVER3:       # %bb.0:
@@ -1241,10 +1241,10 @@ define void @test_vpmacssdqh(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i
 ; BDVER12-LABEL: test_vpmacssdqh:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
-; BDVER12-NEXT:    vpmacssdqh %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; BDVER12-NEXT:    vpmacssdqh %xmm2, (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER12-NEXT:    vpmacssdqh %xmm2, %xmm1, %xmm0, %xmm0 # sched: [4:2.00]
+; BDVER12-NEXT:    vpmacssdqh %xmm2, (%rdi), %xmm0, %xmm0 # sched: [10:2.00]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_vpmacssdqh:
 ; BDVER3:       # %bb.0:
@@ -1277,10 +1277,10 @@ define void @test_vpmacssdql(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i
 ; BDVER12-LABEL: test_vpmacssdql:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
-; BDVER12-NEXT:    vpmacssdql %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; BDVER12-NEXT:    vpmacssdql %xmm2, (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER12-NEXT:    vpmacssdql %xmm2, %xmm1, %xmm0, %xmm0 # sched: [4:2.00]
+; BDVER12-NEXT:    vpmacssdql %xmm2, (%rdi), %xmm0, %xmm0 # sched: [10:2.00]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_vpmacssdql:
 ; BDVER3:       # %bb.0:
@@ -1313,10 +1313,10 @@ define void @test_vpmacsswd(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i6
 ; BDVER12-LABEL: test_vpmacsswd:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
-; BDVER12-NEXT:    vpmacsswd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; BDVER12-NEXT:    vpmacsswd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER12-NEXT:    vpmacsswd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER12-NEXT:    vpmacsswd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_vpmacsswd:
 ; BDVER3:       # %bb.0:
@@ -1349,10 +1349,10 @@ define void @test_vpmacssww(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i6
 ; BDVER12-LABEL: test_vpmacssww:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
-; BDVER12-NEXT:    vpmacssww %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; BDVER12-NEXT:    vpmacssww %xmm2, (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER12-NEXT:    vpmacssww %xmm2, %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER12-NEXT:    vpmacssww %xmm2, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_vpmacssww:
 ; BDVER3:       # %bb.0:
@@ -1385,10 +1385,10 @@ define void @test_vpmacswd(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64
 ; BDVER12-LABEL: test_vpmacswd:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
-; BDVER12-NEXT:    vpmacswd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; BDVER12-NEXT:    vpmacswd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER12-NEXT:    vpmacswd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER12-NEXT:    vpmacswd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_vpmacswd:
 ; BDVER3:       # %bb.0:
@@ -1421,10 +1421,10 @@ define void @test_vpmacsww(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64
 ; BDVER12-LABEL: test_vpmacsww:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
-; BDVER12-NEXT:    vpmacsww %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; BDVER12-NEXT:    vpmacsww %xmm2, (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER12-NEXT:    vpmacsww %xmm2, %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER12-NEXT:    vpmacsww %xmm2, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_vpmacsww:
 ; BDVER3:       # %bb.0:
@@ -1457,10 +1457,10 @@ define void @test_vpmadcsswd(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i
 ; BDVER12-LABEL: test_vpmadcsswd:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
-; BDVER12-NEXT:    vpmadcsswd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; BDVER12-NEXT:    vpmadcsswd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER12-NEXT:    vpmadcsswd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER12-NEXT:    vpmadcsswd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_vpmadcsswd:
 ; BDVER3:       # %bb.0:
@@ -1493,10 +1493,10 @@ define void @test_vpmadcswd(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i6
 ; BDVER12-LABEL: test_vpmadcswd:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
-; BDVER12-NEXT:    vpmadcswd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; BDVER12-NEXT:    vpmadcswd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER12-NEXT:    vpmadcswd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER12-NEXT:    vpmadcswd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_vpmadcswd:
 ; BDVER3:       # %bb.0:
@@ -1530,11 +1530,11 @@ define void @test_vpperm(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64>
 ; BDVER12-LABEL: test_vpperm:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
-; BDVER12-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BDVER12-NEXT:    vpperm (%rdi), %xmm1, %xmm0, %xmm0 # sched: [7:0.50]
-; BDVER12-NEXT:    vpperm %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
+; BDVER12-NEXT:    vpperm (%rdi), %xmm1, %xmm0, %xmm0 # sched: [8:2.00]
+; BDVER12-NEXT:    vpperm %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:2.00]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_vpperm:
 ; BDVER3:       # %bb.0:
@@ -1587,28 +1587,28 @@ define void @test_vprot(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; BDVER12-LABEL: test_vprot:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
-; BDVER12-NEXT:    vprotb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; BDVER12-NEXT:    vprotd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; BDVER12-NEXT:    vprotq %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; BDVER12-NEXT:    vprotw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; BDVER12-NEXT:    vprotb (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; BDVER12-NEXT:    vprotd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; BDVER12-NEXT:    vprotq (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; BDVER12-NEXT:    vprotw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; BDVER12-NEXT:    vprotb %xmm0, (%rdi), %xmm0 # sched: [7:1.00]
-; BDVER12-NEXT:    vprotd %xmm0, (%rdi), %xmm0 # sched: [7:1.00]
-; BDVER12-NEXT:    vprotq %xmm0, (%rdi), %xmm0 # sched: [7:1.00]
-; BDVER12-NEXT:    vprotw %xmm0, (%rdi), %xmm0 # sched: [7:1.00]
-; BDVER12-NEXT:    vprotb $7, %xmm0, %xmm0 # sched: [1:1.00]
-; BDVER12-NEXT:    vprotd $7, %xmm0, %xmm0 # sched: [1:1.00]
-; BDVER12-NEXT:    vprotq $7, %xmm0, %xmm0 # sched: [1:1.00]
-; BDVER12-NEXT:    vprotw $7, %xmm0, %xmm0 # sched: [1:1.00]
-; BDVER12-NEXT:    vprotb $7, (%rdi), %xmm0 # sched: [7:1.00]
-; BDVER12-NEXT:    vprotd $7, (%rdi), %xmm0 # sched: [7:1.00]
-; BDVER12-NEXT:    vprotq $7, (%rdi), %xmm0 # sched: [7:1.00]
-; BDVER12-NEXT:    vprotw $7, (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER12-NEXT:    vprotb %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BDVER12-NEXT:    vprotd %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BDVER12-NEXT:    vprotq %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BDVER12-NEXT:    vprotw %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BDVER12-NEXT:    vprotb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vprotd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vprotq (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vprotw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vprotb %xmm0, (%rdi), %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vprotd %xmm0, (%rdi), %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vprotq %xmm0, (%rdi), %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vprotw %xmm0, (%rdi), %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vprotb $7, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vprotd $7, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vprotq $7, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vprotw $7, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vprotb $7, (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    vprotd $7, (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    vprotq $7, (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    vprotw $7, (%rdi), %xmm0 # sched: [7:0.50]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_vprot:
 ; BDVER3:       # %bb.0:
@@ -1687,20 +1687,20 @@ define void @test_vpsha(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; BDVER12-LABEL: test_vpsha:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
-; BDVER12-NEXT:    vpshab %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; BDVER12-NEXT:    vpshad %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; BDVER12-NEXT:    vpshaq %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; BDVER12-NEXT:    vpshaw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; BDVER12-NEXT:    vpshab (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; BDVER12-NEXT:    vpshad (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; BDVER12-NEXT:    vpshaq (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; BDVER12-NEXT:    vpshaw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; BDVER12-NEXT:    vpshab %xmm0, (%rdi), %xmm0 # sched: [7:1.00]
-; BDVER12-NEXT:    vpshad %xmm0, (%rdi), %xmm0 # sched: [7:1.00]
-; BDVER12-NEXT:    vpshaq %xmm0, (%rdi), %xmm0 # sched: [7:1.00]
-; BDVER12-NEXT:    vpshaw %xmm0, (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER12-NEXT:    vpshab %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BDVER12-NEXT:    vpshad %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BDVER12-NEXT:    vpshaq %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BDVER12-NEXT:    vpshaw %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BDVER12-NEXT:    vpshab (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vpshad (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vpshaq (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vpshaw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vpshab %xmm0, (%rdi), %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vpshad %xmm0, (%rdi), %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vpshaq %xmm0, (%rdi), %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vpshaw %xmm0, (%rdi), %xmm0 # sched: [8:0.50]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_vpsha:
 ; BDVER3:       # %bb.0:
@@ -1763,20 +1763,20 @@ define void @test_vpshl(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; BDVER12-LABEL: test_vpshl:
 ; BDVER12:       # %bb.0:
 ; BDVER12-NEXT:    #APP
-; BDVER12-NEXT:    vpshlb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; BDVER12-NEXT:    vpshld %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; BDVER12-NEXT:    vpshlq %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; BDVER12-NEXT:    vpshlw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; BDVER12-NEXT:    vpshlb (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; BDVER12-NEXT:    vpshld (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; BDVER12-NEXT:    vpshlq (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; BDVER12-NEXT:    vpshlw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; BDVER12-NEXT:    vpshlb %xmm0, (%rdi), %xmm0 # sched: [7:1.00]
-; BDVER12-NEXT:    vpshld %xmm0, (%rdi), %xmm0 # sched: [7:1.00]
-; BDVER12-NEXT:    vpshlq %xmm0, (%rdi), %xmm0 # sched: [7:1.00]
-; BDVER12-NEXT:    vpshlw %xmm0, (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER12-NEXT:    vpshlb %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BDVER12-NEXT:    vpshld %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BDVER12-NEXT:    vpshlq %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BDVER12-NEXT:    vpshlw %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BDVER12-NEXT:    vpshlb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vpshld (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vpshlq (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vpshlw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vpshlb %xmm0, (%rdi), %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vpshld %xmm0, (%rdi), %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vpshlq %xmm0, (%rdi), %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vpshlw %xmm0, (%rdi), %xmm0 # sched: [8:0.50]
 ; BDVER12-NEXT:    #NO_APP
-; BDVER12-NEXT:    retq # sched: [1:1.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BDVER3-LABEL: test_vpshl:
 ; BDVER3:       # %bb.0:
diff --git a/test/tools/llvm-mca/X86/BdVer2/add-sequence.s b/test/tools/llvm-mca/X86/BdVer2/add-sequence.s
index 287095b7fb5..004def6ab71 100644
--- a/test/tools/llvm-mca/X86/BdVer2/add-sequence.s
+++ b/test/tools/llvm-mca/X86/BdVer2/add-sequence.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1000 -timeline < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1000 -timeline < %s | FileCheck %s
 
 add %eax, %ecx
 add %esi, %eax
@@ -7,13 +7,13 @@ add %eax, %edx
 
 # CHECK:      Iterations:        1000
 # CHECK-NEXT: Instructions:      3000
-# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total Cycles:      1504
 # CHECK-NEXT: Total uOps:        3000
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    2.99
-# CHECK-NEXT: IPC:               2.99
-# CHECK-NEXT: Block RThroughput: 1.0
+# CHECK-NEXT: uOps Per Cycle:    1.99
+# CHECK-NEXT: IPC:               1.99
+# CHECK-NEXT: Block RThroughput: 1.5
 
 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps
@@ -24,64 +24,76 @@ add %eax, %edx
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      1     0.33                        addl	%eax, %ecx
-# CHECK-NEXT:  1      1     0.33                        addl	%esi, %eax
-# CHECK-NEXT:  1      1     0.33                        addl	%eax, %edx
+# CHECK-NEXT:  1      1     0.50                        addl	%eax, %ecx
+# CHECK-NEXT:  1      1     0.50                        addl	%esi, %eax
+# CHECK-NEXT:  1      1     0.50                        addl	%eax, %edx
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -     1.50   1.50    -      -      -      -      -      -      -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     addl	%eax, %ecx
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     addl	%esi, %eax
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     addl	%eax, %edx
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%eax, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%esi, %eax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%eax, %edx
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123
+# CHECK-NEXT:                     01234567
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeER .    .  .   addl	%eax, %ecx
-# CHECK-NEXT: [0,1]     DeER .    .  .   addl	%esi, %eax
-# CHECK-NEXT: [0,2]     D=eER.    .  .   addl	%eax, %edx
-# CHECK-NEXT: [1,0]     D=eER.    .  .   addl	%eax, %ecx
-# CHECK-NEXT: [1,1]     .DeER.    .  .   addl	%esi, %eax
-# CHECK-NEXT: [1,2]     .D=eER    .  .   addl	%eax, %edx
-# CHECK-NEXT: [2,0]     .D=eER    .  .   addl	%eax, %ecx
-# CHECK-NEXT: [2,1]     .D=eER    .  .   addl	%esi, %eax
-# CHECK-NEXT: [2,2]     . D=eER   .  .   addl	%eax, %edx
-# CHECK-NEXT: [3,0]     . D=eER   .  .   addl	%eax, %ecx
-# CHECK-NEXT: [3,1]     . D=eER   .  .   addl	%esi, %eax
-# CHECK-NEXT: [3,2]     . D==eER  .  .   addl	%eax, %edx
-# CHECK-NEXT: [4,0]     .  D=eER  .  .   addl	%eax, %ecx
-# CHECK-NEXT: [4,1]     .  D=eER  .  .   addl	%esi, %eax
-# CHECK-NEXT: [4,2]     .  D==eER .  .   addl	%eax, %edx
-# CHECK-NEXT: [5,0]     .  D==eER .  .   addl	%eax, %ecx
-# CHECK-NEXT: [5,1]     .   D=eER .  .   addl	%esi, %eax
-# CHECK-NEXT: [5,2]     .   D==eER.  .   addl	%eax, %edx
-# CHECK-NEXT: [6,0]     .   D==eER.  .   addl	%eax, %ecx
-# CHECK-NEXT: [6,1]     .   D==eER.  .   addl	%esi, %eax
-# CHECK-NEXT: [6,2]     .    D==eER  .   addl	%eax, %edx
-# CHECK-NEXT: [7,0]     .    D==eER  .   addl	%eax, %ecx
-# CHECK-NEXT: [7,1]     .    D==eER  .   addl	%esi, %eax
-# CHECK-NEXT: [7,2]     .    D===eER .   addl	%eax, %edx
-# CHECK-NEXT: [8,0]     .    .D==eER .   addl	%eax, %ecx
-# CHECK-NEXT: [8,1]     .    .D==eER .   addl	%esi, %eax
-# CHECK-NEXT: [8,2]     .    .D===eER.   addl	%eax, %edx
-# CHECK-NEXT: [9,0]     .    .D===eER.   addl	%eax, %ecx
-# CHECK-NEXT: [9,1]     .    . D==eER.   addl	%esi, %eax
-# CHECK-NEXT: [9,2]     .    . D===eER   addl	%eax, %edx
+# CHECK:      [0,0]     DeER .    .    . .   addl	%eax, %ecx
+# CHECK-NEXT: [0,1]     DeER .    .    . .   addl	%esi, %eax
+# CHECK-NEXT: [0,2]     D=eER.    .    . .   addl	%eax, %edx
+# CHECK-NEXT: [1,0]     D==eER    .    . .   addl	%eax, %ecx
+# CHECK-NEXT: [1,1]     .DeE-R    .    . .   addl	%esi, %eax
+# CHECK-NEXT: [1,2]     .D=eER    .    . .   addl	%eax, %edx
+# CHECK-NEXT: [2,0]     .D==eER   .    . .   addl	%eax, %ecx
+# CHECK-NEXT: [2,1]     .D==eER   .    . .   addl	%esi, %eax
+# CHECK-NEXT: [2,2]     . D==eER  .    . .   addl	%eax, %edx
+# CHECK-NEXT: [3,0]     . D===eER .    . .   addl	%eax, %ecx
+# CHECK-NEXT: [3,1]     . D==eE-R .    . .   addl	%esi, %eax
+# CHECK-NEXT: [3,2]     . D===eER .    . .   addl	%eax, %edx
+# CHECK-NEXT: [4,0]     .  D===eER.    . .   addl	%eax, %ecx
+# CHECK-NEXT: [4,1]     .  D===eER.    . .   addl	%esi, %eax
+# CHECK-NEXT: [4,2]     .  D====eER    . .   addl	%eax, %edx
+# CHECK-NEXT: [5,0]     .  D=====eER   . .   addl	%eax, %ecx
+# CHECK-NEXT: [5,1]     .   D===eE-R   . .   addl	%esi, %eax
+# CHECK-NEXT: [5,2]     .   D====eER   . .   addl	%eax, %edx
+# CHECK-NEXT: [6,0]     .   D=====eER  . .   addl	%eax, %ecx
+# CHECK-NEXT: [6,1]     .   D=====eER  . .   addl	%esi, %eax
+# CHECK-NEXT: [6,2]     .    D=====eER . .   addl	%eax, %edx
+# CHECK-NEXT: [7,0]     .    D======eER. .   addl	%eax, %ecx
+# CHECK-NEXT: [7,1]     .    D=====eE-R. .   addl	%esi, %eax
+# CHECK-NEXT: [7,2]     .    D======eER. .   addl	%eax, %edx
+# CHECK-NEXT: [8,0]     .    .D======eER .   addl	%eax, %ecx
+# CHECK-NEXT: [8,1]     .    .D======eER .   addl	%esi, %eax
+# CHECK-NEXT: [8,2]     .    .D=======eER.   addl	%eax, %edx
+# CHECK-NEXT: [9,0]     .    .D========eER   addl	%eax, %ecx
+# CHECK-NEXT: [9,1]     .    . D======eE-R   addl	%esi, %eax
+# CHECK-NEXT: [9,2]     .    . D=======eER   addl	%eax, %edx
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -90,6 +102,6 @@ add %eax, %edx
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     10    2.5    0.1    0.0       addl	%eax, %ecx
-# CHECK-NEXT: 1.     10    2.2    0.1    0.0       addl	%esi, %eax
-# CHECK-NEXT: 2.     10    3.0    0.0    0.0       addl	%eax, %edx
+# CHECK-NEXT: 0.     10    5.0    0.6    0.0       addl	%eax, %ecx
+# CHECK-NEXT: 1.     10    4.2    0.5    0.5       addl	%esi, %eax
+# CHECK-NEXT: 2.     10    5.0    0.0    0.0       addl	%eax, %edx
diff --git a/test/tools/llvm-mca/X86/BdVer2/clear-super-register-1.s b/test/tools/llvm-mca/X86/BdVer2/clear-super-register-1.s
index c8e18731a3e..973ce7d8a04 100644
--- a/test/tools/llvm-mca/X86/BdVer2/clear-super-register-1.s
+++ b/test/tools/llvm-mca/X86/BdVer2/clear-super-register-1.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=100 -resource-pressure=false -timeline -timeline-max-iterations=2 < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=100 -resource-pressure=false -timeline -timeline-max-iterations=2 < %s | FileCheck %s
 
 ## Sets register RAX.
 imulq $5, %rcx, %rax
@@ -15,13 +15,13 @@ bsf   %rax, %rcx
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      803
-# CHECK-NEXT: Total uOps:        400
+# CHECK-NEXT: Total Cycles:      702
+# CHECK-NEXT: Total uOps:        1000
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.50
-# CHECK-NEXT: Block RThroughput: 3.0
+# CHECK-NEXT: uOps Per Cycle:    1.42
+# CHECK-NEXT: IPC:               0.57
+# CHECK-NEXT: Block RThroughput: 4.0
 
 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps
@@ -32,23 +32,23 @@ bsf   %rax, %rcx
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      3     1.00                        imulq	$5, %rcx, %rax
-# CHECK-NEXT:  1      3     1.00                        lzcntl	%ecx, %eax
-# CHECK-NEXT:  1      1     0.33                        andq	%rcx, %rax
-# CHECK-NEXT:  1      3     1.00                        bsfq	%rax, %rcx
+# CHECK-NEXT:  1      6     4.00                        imulq	$5, %rcx, %rax
+# CHECK-NEXT:  2      2     0.50                        lzcntl	%ecx, %eax
+# CHECK-NEXT:  1      1     0.50                        andq	%rcx, %rax
+# CHECK-NEXT:  6      3     2.00                        bsfq	%rax, %rcx
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012345678
+# CHECK-NEXT:                     012345
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeER    .    .  .   imulq	$5, %rcx, %rax
-# CHECK-NEXT: [0,1]     D=eeeER   .    .  .   lzcntl	%ecx, %eax
-# CHECK-NEXT: [0,2]     D====eER  .    .  .   andq	%rcx, %rax
-# CHECK-NEXT: [0,3]     D=====eeeER    .  .   bsfq	%rax, %rcx
-# CHECK-NEXT: [1,0]     .D=======eeeER .  .   imulq	$5, %rcx, %rax
-# CHECK-NEXT: [1,1]     .D========eeeER.  .   lzcntl	%ecx, %eax
-# CHECK-NEXT: [1,2]     .D===========eER  .   andq	%rcx, %rax
-# CHECK-NEXT: [1,3]     .D============eeeER   bsfq	%rax, %rcx
+# CHECK:      [0,0]     DeeeeeeER .    .   imulq	$5, %rcx, %rax
+# CHECK-NEXT: [0,1]     DeeE----R .    .   lzcntl	%ecx, %eax
+# CHECK-NEXT: [0,2]     D==eE---R .    .   andq	%rcx, %rax
+# CHECK-NEXT: [0,3]     .D==eeeER .    .   bsfq	%rax, %rcx
+# CHECK-NEXT: [1,0]     . D====eeeeeeER.   imulq	$5, %rcx, %rax
+# CHECK-NEXT: [1,1]     .  D====eeE---R.   lzcntl	%ecx, %eax
+# CHECK-NEXT: [1,2]     .  D======eE--R.   andq	%rcx, %rax
+# CHECK-NEXT: [1,3]     .   D======eeeER   bsfq	%rax, %rcx
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -57,7 +57,7 @@ bsf   %rax, %rcx
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     4.5    0.5    0.0       imulq	$5, %rcx, %rax
-# CHECK-NEXT: 1.     2     5.5    1.5    0.0       lzcntl	%ecx, %eax
-# CHECK-NEXT: 2.     2     8.5    0.0    0.0       andq	%rcx, %rax
-# CHECK-NEXT: 3.     2     9.5    0.0    0.0       bsfq	%rax, %rcx
+# CHECK-NEXT: 0.     2     3.0    0.5    0.0       imulq	$5, %rcx, %rax
+# CHECK-NEXT: 1.     2     3.0    1.0    3.5       lzcntl	%ecx, %eax
+# CHECK-NEXT: 2.     2     5.0    0.0    2.5       andq	%rcx, %rax
+# CHECK-NEXT: 3.     2     5.0    0.0    0.0       bsfq	%rax, %rcx
diff --git a/test/tools/llvm-mca/X86/BdVer2/clear-super-register-2.s b/test/tools/llvm-mca/X86/BdVer2/clear-super-register-2.s
index 99f463c3509..edbe726a9cb 100644
--- a/test/tools/llvm-mca/X86/BdVer2/clear-super-register-2.s
+++ b/test/tools/llvm-mca/X86/BdVer2/clear-super-register-2.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=100 -resource-pressure=false -timeline -timeline-max-iterations=2 < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=100 -resource-pressure=false -timeline -timeline-max-iterations=2 < %s | FileCheck %s
 
 # In this test, the VDIVPS takes 38 cycles to write to register YMM3.  The first
 # VADDPS does not depend on the VDIVPS (the WAW dependency is eliminated at
@@ -33,13 +33,13 @@ vandps %xmm4, %xmm1, %xmm0
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1800
-# CHECK-NEXT: Total Cycles:      2804
-# CHECK-NEXT: Total uOps:        2000
+# CHECK-NEXT: Total Cycles:      4003
+# CHECK-NEXT: Total uOps:        3400
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    0.71
-# CHECK-NEXT: IPC:               0.64
-# CHECK-NEXT: Block RThroughput: 28.0
+# CHECK-NEXT: uOps Per Cycle:    0.85
+# CHECK-NEXT: IPC:               0.45
+# CHECK-NEXT: Block RThroughput: 31.0
 
 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps
@@ -50,65 +50,63 @@ vandps %xmm4, %xmm1, %xmm0
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  3      29    28.00                       vdivps	%ymm0, %ymm1, %ymm3
-# CHECK-NEXT:  1      3     1.00                        vaddps	%xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT:  1      1     1.00                        vandps	%xmm4, %xmm1, %xmm0
+# CHECK-NEXT:  2      9     19.00                       vdivps	%ymm0, %ymm1, %ymm3
+# CHECK-NEXT:  1      5     1.00                        vaddps	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  1      2     0.50                        vandps	%xmm4, %xmm1, %xmm0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123456789
+# CHECK-NEXT:                     0123456789          0123456789          0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER   .    .    .    .    .   .   vdivps	%ymm0, %ymm1, %ymm3
-# CHECK-NEXT: [0,1]     DeeeE--------------------------R   .    .    .    .    .   .   vaddps	%xmm0, %xmm1, %xmm3
-# CHECK-NEXT: [0,2]     .D==eeeE-----------------------R   .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: [0,3]     .D===eeeE----------------------R   .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: [0,4]     .D====eeeE---------------------R   .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: [0,5]     .D=====eeeE--------------------R   .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: [0,6]     . D=====eeeE-------------------R   .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: [0,7]     . D======eeeE------------------R   .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: [0,8]     . D=======eeeE-----------------R   .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: [0,9]     . D========eeeE----------------R   .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: [0,10]    .  D========eeeE---------------R   .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: [0,11]    .  D=========eeeE--------------R   .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: [0,12]    .  D==========eeeE-------------R   .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: [0,13]    .  D===========eeeE------------R   .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: [0,14]    .   D===========eeeE-----------R   .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: [0,15]    .   D============eeeE----------R   .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: [0,16]    .   D=============eeeE---------R   .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: [0,17]    .   D================eE--------R   .    .    .    .    .   .   vandps	%xmm4, %xmm1, %xmm0
-# CHECK-NEXT: [1,0]     .    D=======================eeeeeeeeeeeeeeeeeeeeeeeeeeeeeER   vdivps	%ymm0, %ymm1, %ymm3
-# CHECK-NEXT: [1,1]     .    D================eeeE---------------------------------R   vaddps	%xmm0, %xmm1, %xmm3
-# CHECK-NEXT: [1,2]     .    .D==================eeeE------------------------------R   vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: [1,3]     .    .D===================eeeE-----------------------------R   vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: [1,4]     .    .D====================eeeE----------------------------R   vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: [1,5]     .    .D=====================eeeE---------------------------R   vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: [1,6]     .    . D=====================eeeE--------------------------R   vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: [1,7]     .    . D======================eeeE-------------------------R   vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: [1,8]     .    . D=======================eeeE------------------------R   vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: [1,9]     .    . D========================eeeE-----------------------R   vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: [1,10]    .    .  D========================eeeE----------------------R   vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: [1,11]    .    .  D=========================eeeE---------------------R   vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: [1,12]    .    .  D==========================eeeE--------------------R   vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: [1,13]    .    .  D===========================eeeE-------------------R   vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: [1,14]    .    .   D===========================eeeE------------------R   vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: [1,15]    .    .   D============================eeeE-----------------R   vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: [1,16]    .    .   D=============================eeeE----------------R   vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: [1,17]    .    .   D================================eE---------------R   vandps	%xmm4, %xmm1, %xmm0
+# CHECK:      [0,0]     DeeeeeeeeeER   .    .    .    .    .    .    .    .    .    .    .    .    .   .   vdivps	%ymm0, %ymm1, %ymm3
+# CHECK-NEXT: [0,1]     DeeeeeE----R   .    .    .    .    .    .    .    .    .    .    .    .    .   .   vaddps	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: [0,2]     .D====eeeeeER  .    .    .    .    .    .    .    .    .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,3]     .D======eeeeeER.    .    .    .    .    .    .    .    .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,4]     . D=======eeeeeER   .    .    .    .    .    .    .    .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,5]     . D=========eeeeeER .    .    .    .    .    .    .    .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,6]     .  D==========eeeeeER    .    .    .    .    .    .    .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,7]     .  D============eeeeeER  .    .    .    .    .    .    .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,8]     .   D=============eeeeeER.    .    .    .    .    .    .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,9]     .   D===============eeeeeER   .    .    .    .    .    .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,10]    .    D================eeeeeER .    .    .    .    .    .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,11]    .    D==================eeeeeER    .    .    .    .    .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,12]    .    .D===================eeeeeER  .    .    .    .    .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,13]    .    .D=====================eeeeeER.    .    .    .    .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,14]    .    . D======================eeeeeER   .    .    .    .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,15]    .    . D========================eeeeeER .    .    .    .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,16]    .    .  D=========================eeeeeER    .    .    .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,17]    .    .  D==============================eeER  .    .    .    .    .    .    .   .   vandps	%xmm4, %xmm1, %xmm0
+# CHECK-NEXT: [1,0]     .    .   D===============================eeeeeeeeeER   .    .    .    .    .   .   vdivps	%ymm0, %ymm1, %ymm3
+# CHECK-NEXT: [1,1]     .    .   D===============================eeeeeE----R   .    .    .    .    .   .   vaddps	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: [1,2]     .    .    D===================================eeeeeER  .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [1,3]     .    .    D=====================================eeeeeER.    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [1,4]     .    .    .D======================================eeeeeER   .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [1,5]     .    .    .D========================================eeeeeER .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [1,6]     .    .    . D=========================================eeeeeER    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [1,7]     .    .    . D===========================================eeeeeER  .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [1,8]     .    .    .  D============================================eeeeeER.    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [1,9]     .    .    .  D==============================================eeeeeER   .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [1,10]    .    .    .   D===============================================eeeeeER .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [1,11]    .    .    .   D=================================================eeeeeER    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [1,12]    .    .    .    D==================================================eeeeeER  .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [1,13]    .    .    .    D====================================================eeeeeER.   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [1,14]    .    .    .    .D=====================================================eeeeeER  .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [1,15]    .    .    .    .D=======================================================eeeeeER.   vaddps	%ymm3, %ymm1, %ymm4
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -117,21 +115,21 @@ vandps %xmm4, %xmm1, %xmm0
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     12.5   4.0    0.0       vdivps	%ymm0, %ymm1, %ymm3
-# CHECK-NEXT: 1.     2     9.0    0.5    29.5      vaddps	%xmm0, %xmm1, %xmm3
-# CHECK-NEXT: 2.     2     11.0   0.0    26.5      vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: 3.     2     12.0   1.0    25.5      vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: 4.     2     13.0   2.0    24.5      vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: 5.     2     14.0   3.0    23.5      vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: 6.     2     14.0   4.0    22.5      vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: 7.     2     15.0   5.0    21.5      vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: 8.     2     16.0   6.0    20.5      vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: 9.     2     17.0   7.0    19.5      vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: 10.    2     17.0   8.0    18.5      vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: 11.    2     18.0   9.0    17.5      vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: 12.    2     19.0   10.0   16.5      vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: 13.    2     20.0   11.0   15.5      vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: 14.    2     20.0   12.0   14.5      vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: 15.    2     21.0   13.0   13.5      vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: 16.    2     22.0   14.0   12.5      vaddps	%ymm3, %ymm1, %ymm4
-# CHECK-NEXT: 17.    2     25.0   0.0    11.5      vandps	%xmm4, %xmm1, %xmm0
+# CHECK-NEXT: 0.     2     16.5   0.5    0.0       vdivps	%ymm0, %ymm1, %ymm3
+# CHECK-NEXT: 1.     2     16.5   0.5    4.0       vaddps	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 2.     2     20.5   0.0    0.0       vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 3.     2     22.5   2.0    0.0       vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 4.     2     23.5   4.0    0.0       vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 5.     2     25.5   6.0    0.0       vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 6.     2     26.5   8.0    0.0       vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 7.     2     28.5   10.0   0.0       vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 8.     2     29.5   12.0   0.0       vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 9.     2     31.5   14.0   0.0       vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 10.    2     32.5   16.0   0.0       vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 11.    2     34.5   18.0   0.0       vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 12.    2     35.5   20.0   0.0       vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 13.    2     37.5   22.0   0.0       vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 14.    2     38.5   23.5   0.0       vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 15.    2     40.5   25.5   0.0       vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 16.    2     41.5   27.0   0.0       vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 17.    2     46.5   0.0    0.0       vandps	%xmm4, %xmm1, %xmm0
diff --git a/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-cmp.s b/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-cmp.s
index c51fb3677dd..12bf3748cb1 100644
--- a/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-cmp.s
+++ b/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-cmp.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -timeline -timeline-max-iterations=3 -iterations=1500 < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -timeline -timeline-max-iterations=3 -iterations=1500 < %s | FileCheck %s
 
 # Perf stat reports an IPC of 1.97 for this block of code.
 
@@ -11,13 +11,13 @@ cmovae %ebx, %eax
 
 # CHECK:      Iterations:        1500
 # CHECK-NEXT: Instructions:      3000
-# CHECK-NEXT: Total Cycles:      4503
-# CHECK-NEXT: Total uOps:        4500
+# CHECK-NEXT: Total Cycles:      1504
+# CHECK-NEXT: Total uOps:        3000
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    1.00
-# CHECK-NEXT: IPC:               0.67
-# CHECK-NEXT: Block RThroughput: 0.8
+# CHECK-NEXT: uOps Per Cycle:    1.99
+# CHECK-NEXT: IPC:               1.99
+# CHECK-NEXT: Block RThroughput: 1.0
 
 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps
@@ -28,38 +28,49 @@ cmovae %ebx, %eax
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      1     0.33                        cmpl	%eax, %eax
-# CHECK-NEXT:  2      2     0.67                        cmovael	%ebx, %eax
+# CHECK-NEXT:  1      1     0.50                        cmpl	%eax, %eax
+# CHECK-NEXT:  1      1     0.50                        cmovael	%ebx, %eax
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     cmpl	%eax, %eax
-# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     cmovael	%ebx, %eax
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmpl	%eax, %eax
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovael	%ebx, %eax
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     01
-# CHECK-NEXT: Index     0123456789
+# CHECK-NEXT: Index     0123456
 
-# CHECK:      [0,0]     DeER .    ..   cmpl	%eax, %eax
-# CHECK-NEXT: [0,1]     D=eeER    ..   cmovael	%ebx, %eax
-# CHECK-NEXT: [1,0]     D===eER   ..   cmpl	%eax, %eax
-# CHECK-NEXT: [1,1]     .D===eeER ..   cmovael	%ebx, %eax
-# CHECK-NEXT: [2,0]     .D=====eER..   cmpl	%eax, %eax
-# CHECK-NEXT: [2,1]     . D=====eeER   cmovael	%ebx, %eax
+# CHECK:      [0,0]     DeER ..   cmpl	%eax, %eax
+# CHECK-NEXT: [0,1]     D=eER..   cmovael	%ebx, %eax
+# CHECK-NEXT: [1,0]     DeE-R..   cmpl	%eax, %eax
+# CHECK-NEXT: [1,1]     D==eER.   cmovael	%ebx, %eax
+# CHECK-NEXT: [2,0]     .DeE-R.   cmpl	%eax, %eax
+# CHECK-NEXT: [2,1]     .D==eER   cmovael	%ebx, %eax
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -68,5 +79,5 @@ cmovae %ebx, %eax
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     3     3.7    0.3    0.0       cmpl	%eax, %eax
-# CHECK-NEXT: 1.     3     4.0    0.0    0.0       cmovael	%ebx, %eax
+# CHECK-NEXT: 0.     3     1.0    1.0    0.7       cmpl	%eax, %eax
+# CHECK-NEXT: 1.     3     2.7    0.0    0.0       cmovael	%ebx, %eax
diff --git a/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-pcmpeq.s b/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-pcmpeq.s
index e72ce0c7ba5..4f869e656f3 100644
--- a/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-pcmpeq.s
+++ b/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-pcmpeq.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -timeline -timeline-max-iterations=3 -iterations=1500 < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -timeline -timeline-max-iterations=3 -iterations=1500 < %s | FileCheck %s
 
 # perf stat reports an IPC of 2.00 for this block of code.
 
@@ -14,12 +14,12 @@ vpcmpeqq %xmm3, %xmm3, %xmm0
 
 # CHECK:      Iterations:        1500
 # CHECK-NEXT: Instructions:      6000
-# CHECK-NEXT: Total Cycles:      6003
+# CHECK-NEXT: Total Cycles:      3005
 # CHECK-NEXT: Total uOps:        6000
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    1.00
-# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: uOps Per Cycle:    2.00
+# CHECK-NEXT: IPC:               2.00
 # CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Instruction Info:
@@ -31,48 +31,60 @@ vpcmpeqq %xmm3, %xmm3, %xmm0
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      1     0.50                        vpcmpeqb	%xmm0, %xmm0, %xmm1
-# CHECK-NEXT:  1      1     0.50                        vpcmpeqw	%xmm1, %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpcmpeqd	%xmm2, %xmm2, %xmm3
-# CHECK-NEXT:  1      1     0.50                        vpcmpeqq	%xmm3, %xmm3, %xmm0
+# CHECK-NEXT:  1      2     0.50                        vpcmpeqb	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT:  1      2     0.50                        vpcmpeqw	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpcmpeqd	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT:  1      2     0.50                        vpcmpeqq	%xmm3, %xmm3, %xmm0
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -      -     2.00    -     2.00    -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     2.00   2.00    -      -     2.00   2.00    -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpcmpeqb	%xmm0, %xmm0, %xmm1
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vpcmpeqw	%xmm1, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpcmpeqd	%xmm2, %xmm2, %xmm3
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vpcmpeqq	%xmm3, %xmm3, %xmm0
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     1.00    -      -      -      -      -     vpcmpeqb	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     1.00    -      -      -      -     vpcmpeqw	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -      -     vpcmpeqd	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -     vpcmpeqq	%xmm3, %xmm3, %xmm0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     01234
+# CHECK-NEXT:                     0
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeER .    .   .   vpcmpeqb	%xmm0, %xmm0, %xmm1
-# CHECK-NEXT: [0,1]     D=eER.    .   .   vpcmpeqw	%xmm1, %xmm1, %xmm2
-# CHECK-NEXT: [0,2]     D==eER    .   .   vpcmpeqd	%xmm2, %xmm2, %xmm3
-# CHECK-NEXT: [0,3]     D===eER   .   .   vpcmpeqq	%xmm3, %xmm3, %xmm0
-# CHECK-NEXT: [1,0]     .D===eER  .   .   vpcmpeqb	%xmm0, %xmm0, %xmm1
-# CHECK-NEXT: [1,1]     .D====eER .   .   vpcmpeqw	%xmm1, %xmm1, %xmm2
-# CHECK-NEXT: [1,2]     .D=====eER.   .   vpcmpeqd	%xmm2, %xmm2, %xmm3
-# CHECK-NEXT: [1,3]     .D======eER   .   vpcmpeqq	%xmm3, %xmm3, %xmm0
-# CHECK-NEXT: [2,0]     . D======eER  .   vpcmpeqb	%xmm0, %xmm0, %xmm1
-# CHECK-NEXT: [2,1]     . D=======eER .   vpcmpeqw	%xmm1, %xmm1, %xmm2
-# CHECK-NEXT: [2,2]     . D========eER.   vpcmpeqd	%xmm2, %xmm2, %xmm3
-# CHECK-NEXT: [2,3]     . D=========eER   vpcmpeqq	%xmm3, %xmm3, %xmm0
+# CHECK:      [0,0]     DeeER.    .   vpcmpeqb	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [0,1]     D=eeER    .   vpcmpeqw	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT: [0,2]     DeeE-R    .   vpcmpeqd	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: [0,3]     D==eeER   .   vpcmpeqq	%xmm3, %xmm3, %xmm0
+# CHECK-NEXT: [1,0]     .DeeE-R   .   vpcmpeqb	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [1,1]     .D==eeER  .   vpcmpeqw	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT: [1,2]     .D=eeE-R  .   vpcmpeqd	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: [1,3]     .D===eeER .   vpcmpeqq	%xmm3, %xmm3, %xmm0
+# CHECK-NEXT: [2,0]     . D=eeE-R .   vpcmpeqb	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [2,1]     . D===eeER.   vpcmpeqw	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT: [2,2]     . D==eeE-R.   vpcmpeqd	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: [2,3]     . D====eeER   vpcmpeqq	%xmm3, %xmm3, %xmm0
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -81,7 +93,7 @@ vpcmpeqq %xmm3, %xmm3, %xmm0
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     3     4.0    0.3    0.0       vpcmpeqb	%xmm0, %xmm0, %xmm1
-# CHECK-NEXT: 1.     3     5.0    0.0    0.0       vpcmpeqw	%xmm1, %xmm1, %xmm2
-# CHECK-NEXT: 2.     3     6.0    0.0    0.0       vpcmpeqd	%xmm2, %xmm2, %xmm3
-# CHECK-NEXT: 3.     3     7.0    0.0    0.0       vpcmpeqq	%xmm3, %xmm3, %xmm0
+# CHECK-NEXT: 0.     3     1.3    1.3    0.7       vpcmpeqb	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: 1.     3     3.0    3.0    0.0       vpcmpeqw	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT: 2.     3     2.0    2.0    1.0       vpcmpeqd	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: 3.     3     4.0    0.0    0.0       vpcmpeqq	%xmm3, %xmm3, %xmm0
diff --git a/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-pcmpgt.s b/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-pcmpgt.s
index 463de0b8123..019d3fd5067 100644
--- a/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-pcmpgt.s
+++ b/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-pcmpgt.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -timeline -timeline-max-iterations=3 -iterations=1500 < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -timeline -timeline-max-iterations=3 -iterations=1500 < %s | FileCheck %s
 
 # perf stat reports an IPC of 2.00 for this block of code.
 
@@ -15,12 +15,12 @@ vpcmpgtq %xmm3, %xmm3, %xmm0
 
 # CHECK:      Iterations:        1500
 # CHECK-NEXT: Instructions:      6000
-# CHECK-NEXT: Total Cycles:      1501
+# CHECK-NEXT: Total Cycles:      1504
 # CHECK-NEXT: Total uOps:        6000
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    4.00
-# CHECK-NEXT: IPC:               4.00
+# CHECK-NEXT: uOps Per Cycle:    3.99
+# CHECK-NEXT: IPC:               3.99
 # CHECK-NEXT: Block RThroughput: 1.0
 
 # CHECK:      Instruction Info:
@@ -35,44 +35,56 @@ vpcmpgtq %xmm3, %xmm3, %xmm0
 # CHECK-NEXT:  1      0     0.25                        vpcmpgtb	%xmm0, %xmm0, %xmm1
 # CHECK-NEXT:  1      0     0.25                        vpcmpgtw	%xmm1, %xmm1, %xmm2
 # CHECK-NEXT:  1      0     0.25                        vpcmpgtd	%xmm2, %xmm2, %xmm3
-# CHECK-NEXT:  1      0     0.25                        vpcmpgtq	%xmm3, %xmm3, %xmm0
+# CHECK-NEXT:  1      2     0.50                        vpcmpgtq	%xmm3, %xmm3, %xmm0
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -      -      -      -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpcmpgtb	%xmm0, %xmm0, %xmm1
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpcmpgtw	%xmm1, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpcmpgtd	%xmm2, %xmm2, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpcmpgtq	%xmm3, %xmm3, %xmm0
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpcmpgtb	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpcmpgtw	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpcmpgtd	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     vpcmpgtq	%xmm3, %xmm3, %xmm0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT: Index     0123
+# CHECK-NEXT: Index     0123456
 
-# CHECK:      [0,0]     DR .   vpcmpgtb	%xmm0, %xmm0, %xmm1
-# CHECK-NEXT: [0,1]     DR .   vpcmpgtw	%xmm1, %xmm1, %xmm2
-# CHECK-NEXT: [0,2]     DR .   vpcmpgtd	%xmm2, %xmm2, %xmm3
-# CHECK-NEXT: [0,3]     DR .   vpcmpgtq	%xmm3, %xmm3, %xmm0
-# CHECK-NEXT: [1,0]     .DR.   vpcmpgtb	%xmm0, %xmm0, %xmm1
-# CHECK-NEXT: [1,1]     .DR.   vpcmpgtw	%xmm1, %xmm1, %xmm2
-# CHECK-NEXT: [1,2]     .DR.   vpcmpgtd	%xmm2, %xmm2, %xmm3
-# CHECK-NEXT: [1,3]     .DR.   vpcmpgtq	%xmm3, %xmm3, %xmm0
-# CHECK-NEXT: [2,0]     . DR   vpcmpgtb	%xmm0, %xmm0, %xmm1
-# CHECK-NEXT: [2,1]     . DR   vpcmpgtw	%xmm1, %xmm1, %xmm2
-# CHECK-NEXT: [2,2]     . DR   vpcmpgtd	%xmm2, %xmm2, %xmm3
-# CHECK-NEXT: [2,3]     . DR   vpcmpgtq	%xmm3, %xmm3, %xmm0
+# CHECK:      [0,0]     DR   ..   vpcmpgtb	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [0,1]     DR   ..   vpcmpgtw	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT: [0,2]     DR   ..   vpcmpgtd	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: [0,3]     DeeER..   vpcmpgtq	%xmm3, %xmm3, %xmm0
+# CHECK-NEXT: [1,0]     .D--R..   vpcmpgtb	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [1,1]     .D--R..   vpcmpgtw	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT: [1,2]     .D--R..   vpcmpgtd	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: [1,3]     .DeeER.   vpcmpgtq	%xmm3, %xmm3, %xmm0
+# CHECK-NEXT: [2,0]     . D--R.   vpcmpgtb	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [2,1]     . D--R.   vpcmpgtw	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT: [2,2]     . D--R.   vpcmpgtd	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: [2,3]     . DeeER   vpcmpgtq	%xmm3, %xmm3, %xmm0
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -81,7 +93,7 @@ vpcmpgtq %xmm3, %xmm3, %xmm0
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     3     0.0    0.0    0.0       vpcmpgtb	%xmm0, %xmm0, %xmm1
-# CHECK-NEXT: 1.     3     0.0    0.0    0.0       vpcmpgtw	%xmm1, %xmm1, %xmm2
-# CHECK-NEXT: 2.     3     0.0    0.0    0.0       vpcmpgtd	%xmm2, %xmm2, %xmm3
-# CHECK-NEXT: 3.     3     0.0    0.0    0.0       vpcmpgtq	%xmm3, %xmm3, %xmm0
+# CHECK-NEXT: 0.     3     0.0    0.0    1.3       vpcmpgtb	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: 1.     3     0.0    0.0    1.3       vpcmpgtw	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT: 2.     3     0.0    0.0    1.3       vpcmpgtd	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: 3.     3     1.0    1.0    0.0       vpcmpgtq	%xmm3, %xmm3, %xmm0
diff --git a/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-sbb-1.s b/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-sbb-1.s
index 68d24f52128..0503bd8552b 100644
--- a/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-sbb-1.s
+++ b/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-sbb-1.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -timeline -timeline-max-iterations=3 -iterations=1500 < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -timeline -timeline-max-iterations=3 -iterations=1500 < %s | FileCheck %s
 
 # perf stat reports an IPC of 1.00 for this code block.
 
@@ -12,13 +12,13 @@ sbb %eax, %eax
 
 # CHECK:      Iterations:        1500
 # CHECK-NEXT: Instructions:      3000
-# CHECK-NEXT: Total Cycles:      6003
-# CHECK-NEXT: Total uOps:        6000
+# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total uOps:        3000
 
 # CHECK:      Dispatch Width:    4
 # CHECK-NEXT: uOps Per Cycle:    1.00
-# CHECK-NEXT: IPC:               0.50
-# CHECK-NEXT: Block RThroughput: 1.0
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps
@@ -29,38 +29,49 @@ sbb %eax, %eax
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  2      2     0.67                        sbbl	%edx, %edx
-# CHECK-NEXT:  2      2     0.67                        sbbl	%eax, %eax
+# CHECK-NEXT:  1      1     1.00                        sbbl	%edx, %edx
+# CHECK-NEXT:  1      1     1.00                        sbbl	%eax, %eax
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -     1.33   1.33    -     1.33    -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -     0.67   0.67    -     0.67    -      -     sbbl	%edx, %edx
-# CHECK-NEXT:  -      -     0.67   0.67    -     0.67    -      -     sbbl	%eax, %eax
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbl	%edx, %edx
+# CHECK-NEXT:  -      -      -      -      -     2.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     sbbl	%eax, %eax
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     01234
-# CHECK-NEXT: Index     0123456789
+# CHECK-NEXT: Index     012345678
 
-# CHECK:      [0,0]     DeeER.    .   .   sbbl	%edx, %edx
-# CHECK-NEXT: [0,1]     D==eeER   .   .   sbbl	%eax, %eax
-# CHECK-NEXT: [1,0]     .D===eeER .   .   sbbl	%edx, %edx
-# CHECK-NEXT: [1,1]     .D=====eeER   .   sbbl	%eax, %eax
-# CHECK-NEXT: [2,0]     . D======eeER .   sbbl	%edx, %edx
-# CHECK-NEXT: [2,1]     . D========eeER   sbbl	%eax, %eax
+# CHECK:      [0,0]     DeER .  .   sbbl	%edx, %edx
+# CHECK-NEXT: [0,1]     D=eER.  .   sbbl	%eax, %eax
+# CHECK-NEXT: [1,0]     D==eER  .   sbbl	%edx, %edx
+# CHECK-NEXT: [1,1]     D===eER .   sbbl	%eax, %eax
+# CHECK-NEXT: [2,0]     .D===eER.   sbbl	%edx, %edx
+# CHECK-NEXT: [2,1]     .D====eER   sbbl	%eax, %eax
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -69,5 +80,5 @@ sbb %eax, %eax
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     3     4.0    0.3    0.0       sbbl	%edx, %edx
-# CHECK-NEXT: 1.     3     6.0    0.0    0.0       sbbl	%eax, %eax
+# CHECK-NEXT: 0.     3     2.7    0.3    0.0       sbbl	%edx, %edx
+# CHECK-NEXT: 1.     3     3.7    0.0    0.0       sbbl	%eax, %eax
diff --git a/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-sbb-2.s b/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-sbb-2.s
index 88dd23be8f3..ba29a29e7dc 100644
--- a/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-sbb-2.s
+++ b/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-sbb-2.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -timeline -timeline-max-iterations=3 -iterations=1500 < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -timeline -timeline-max-iterations=3 -iterations=1500 < %s | FileCheck %s
 
 # perf stat reports a throughput of 1.51 IPC for this block of code.
 
@@ -13,13 +13,13 @@ sbb %eax, %eax
 
 # CHECK:      Iterations:        1500
 # CHECK-NEXT: Instructions:      4500
-# CHECK-NEXT: Total Cycles:      7503
-# CHECK-NEXT: Total uOps:        6000
+# CHECK-NEXT: Total Cycles:      3006
+# CHECK-NEXT: Total uOps:        4500
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    0.80
-# CHECK-NEXT: IPC:               0.60
-# CHECK-NEXT: Block RThroughput: 1.0
+# CHECK-NEXT: uOps Per Cycle:    1.50
+# CHECK-NEXT: IPC:               1.50
+# CHECK-NEXT: Block RThroughput: 1.5
 
 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps
@@ -30,43 +30,55 @@ sbb %eax, %eax
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      3     1.00                        imull	%edx, %eax
-# CHECK-NEXT:  1      1     0.33                        addl	%edx, %edx
-# CHECK-NEXT:  2      2     0.67                        sbbl	%eax, %eax
+# CHECK-NEXT:  1      4     1.00                        imull	%edx, %eax
+# CHECK-NEXT:  1      1     0.50                        addl	%edx, %edx
+# CHECK-NEXT:  1      1     1.00                        sbbl	%eax, %eax
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -     1.33   1.33    -     1.33    -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     imull	%edx, %eax
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.34    -      -     addl	%edx, %edx
-# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     sbbl	%eax, %eax
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imull	%edx, %eax
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%edx, %edx
+# CHECK-NEXT:  -      -      -      -      -     2.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     sbbl	%eax, %eax
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     01234567
+# CHECK-NEXT:                     01
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeER    .    . .   imull	%edx, %eax
-# CHECK-NEXT: [0,1]     DeE--R    .    . .   addl	%edx, %edx
-# CHECK-NEXT: [0,2]     D===eeER  .    . .   sbbl	%eax, %eax
-# CHECK-NEXT: [1,0]     .D====eeeER    . .   imull	%edx, %eax
-# CHECK-NEXT: [1,1]     .DeE------R    . .   addl	%edx, %edx
-# CHECK-NEXT: [1,2]     .D=======eeER  . .   sbbl	%eax, %eax
-# CHECK-NEXT: [2,0]     . D========eeeER .   imull	%edx, %eax
-# CHECK-NEXT: [2,1]     . DeE----------R .   addl	%edx, %edx
-# CHECK-NEXT: [2,2]     . D===========eeER   sbbl	%eax, %eax
+# CHECK:      [0,0]     D=eeeeER  ..   imull	%edx, %eax
+# CHECK-NEXT: [0,1]     DeE----R  ..   addl	%edx, %edx
+# CHECK-NEXT: [0,2]     D==eE--R  ..   sbbl	%eax, %eax
+# CHECK-NEXT: [1,0]     D===eeeeER..   imull	%edx, %eax
+# CHECK-NEXT: [1,1]     .DeE-----R..   addl	%edx, %edx
+# CHECK-NEXT: [1,2]     .D===eE--R..   sbbl	%eax, %eax
+# CHECK-NEXT: [2,0]     .D====eeeeER   imull	%edx, %eax
+# CHECK-NEXT: [2,1]     .D=eE------R   addl	%edx, %edx
+# CHECK-NEXT: [2,2]     . D====eE--R   sbbl	%eax, %eax
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -75,6 +87,6 @@ sbb %eax, %eax
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     3     5.0    0.3    0.0       imull	%edx, %eax
-# CHECK-NEXT: 1.     3     1.0    0.3    6.0       addl	%edx, %edx
-# CHECK-NEXT: 2.     3     8.0    0.0    0.0       sbbl	%eax, %eax
+# CHECK-NEXT: 0.     3     3.7    0.7    0.0       imull	%edx, %eax
+# CHECK-NEXT: 1.     3     1.3    0.3    5.0       addl	%edx, %edx
+# CHECK-NEXT: 2.     3     4.0    2.0    2.0       sbbl	%eax, %eax
diff --git a/test/tools/llvm-mca/X86/BdVer2/dependent-pmuld-paddd.s b/test/tools/llvm-mca/X86/BdVer2/dependent-pmuld-paddd.s
index bf0f19ad31e..bd5b724bbd1 100644
--- a/test/tools/llvm-mca/X86/BdVer2/dependent-pmuld-paddd.s
+++ b/test/tools/llvm-mca/X86/BdVer2/dependent-pmuld-paddd.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=500 -timeline < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=500 -timeline < %s | FileCheck %s
 
 vpmuld %xmm0, %xmm0, %xmm1
 vpaddd %xmm1, %xmm1, %xmm0
@@ -7,7 +7,7 @@ vpaddd %xmm0, %xmm0, %xmm3
 
 # CHECK:      Iterations:        500
 # CHECK-NEXT: Instructions:      1500
-# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total Cycles:      3005
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    4
@@ -24,64 +24,76 @@ vpaddd %xmm0, %xmm0, %xmm3
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      5     1.00                        vpmuldq	%xmm0, %xmm0, %xmm1
-# CHECK-NEXT:  1      1     0.50                        vpaddd	%xmm1, %xmm1, %xmm0
-# CHECK-NEXT:  1      1     0.50                        vpaddd	%xmm0, %xmm0, %xmm3
+# CHECK-NEXT:  1      4     1.00                        vpmuldq	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT:  1      2     0.50                        vpaddd	%xmm1, %xmm1, %xmm0
+# CHECK-NEXT:  1      2     0.50                        vpaddd	%xmm0, %xmm0, %xmm3
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     2.00   1.00    -     1.50   1.50    -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmuldq	%xmm0, %xmm0, %xmm1
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpaddd	%xmm1, %xmm1, %xmm0
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vpaddd	%xmm0, %xmm0, %xmm3
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmuldq	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     vpaddd	%xmm1, %xmm1, %xmm0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -     vpaddd	%xmm0, %xmm0, %xmm3
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789          01234
 
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    .    .    .    .    .    .  .   vpmuldq	%xmm0, %xmm0, %xmm1
-# CHECK-NEXT: [0,1]     D=====eER .    .    .    .    .    .    .    .    .    .    .  .   vpaddd	%xmm1, %xmm1, %xmm0
-# CHECK-NEXT: [0,2]     D======eER.    .    .    .    .    .    .    .    .    .    .  .   vpaddd	%xmm0, %xmm0, %xmm3
-# CHECK-NEXT: [1,0]     D======eeeeeER .    .    .    .    .    .    .    .    .    .  .   vpmuldq	%xmm0, %xmm0, %xmm1
-# CHECK-NEXT: [1,1]     .D==========eER.    .    .    .    .    .    .    .    .    .  .   vpaddd	%xmm1, %xmm1, %xmm0
-# CHECK-NEXT: [1,2]     .D===========eER    .    .    .    .    .    .    .    .    .  .   vpaddd	%xmm0, %xmm0, %xmm3
-# CHECK-NEXT: [2,0]     .D===========eeeeeER.    .    .    .    .    .    .    .    .  .   vpmuldq	%xmm0, %xmm0, %xmm1
-# CHECK-NEXT: [2,1]     .D================eER    .    .    .    .    .    .    .    .  .   vpaddd	%xmm1, %xmm1, %xmm0
-# CHECK-NEXT: [2,2]     . D================eER   .    .    .    .    .    .    .    .  .   vpaddd	%xmm0, %xmm0, %xmm3
-# CHECK-NEXT: [3,0]     . D================eeeeeER    .    .    .    .    .    .    .  .   vpmuldq	%xmm0, %xmm0, %xmm1
-# CHECK-NEXT: [3,1]     . D=====================eER   .    .    .    .    .    .    .  .   vpaddd	%xmm1, %xmm1, %xmm0
-# CHECK-NEXT: [3,2]     . D======================eER  .    .    .    .    .    .    .  .   vpaddd	%xmm0, %xmm0, %xmm3
-# CHECK-NEXT: [4,0]     .  D=====================eeeeeER   .    .    .    .    .    .  .   vpmuldq	%xmm0, %xmm0, %xmm1
-# CHECK-NEXT: [4,1]     .  D==========================eER  .    .    .    .    .    .  .   vpaddd	%xmm1, %xmm1, %xmm0
-# CHECK-NEXT: [4,2]     .  D===========================eER .    .    .    .    .    .  .   vpaddd	%xmm0, %xmm0, %xmm3
-# CHECK-NEXT: [5,0]     .  D===========================eeeeeER  .    .    .    .    .  .   vpmuldq	%xmm0, %xmm0, %xmm1
-# CHECK-NEXT: [5,1]     .   D===============================eER .    .    .    .    .  .   vpaddd	%xmm1, %xmm1, %xmm0
-# CHECK-NEXT: [5,2]     .   D================================eER.    .    .    .    .  .   vpaddd	%xmm0, %xmm0, %xmm3
-# CHECK-NEXT: [6,0]     .   D================================eeeeeER .    .    .    .  .   vpmuldq	%xmm0, %xmm0, %xmm1
-# CHECK-NEXT: [6,1]     .   D=====================================eER.    .    .    .  .   vpaddd	%xmm1, %xmm1, %xmm0
-# CHECK-NEXT: [6,2]     .    D=====================================eER    .    .    .  .   vpaddd	%xmm0, %xmm0, %xmm3
-# CHECK-NEXT: [7,0]     .    D=====================================eeeeeER.    .    .  .   vpmuldq	%xmm0, %xmm0, %xmm1
-# CHECK-NEXT: [7,1]     .    D==========================================eER    .    .  .   vpaddd	%xmm1, %xmm1, %xmm0
-# CHECK-NEXT: [7,2]     .    D===========================================eER   .    .  .   vpaddd	%xmm0, %xmm0, %xmm3
-# CHECK-NEXT: [8,0]     .    .D==========================================eeeeeER    .  .   vpmuldq	%xmm0, %xmm0, %xmm1
-# CHECK-NEXT: [8,1]     .    .D===============================================eER   .  .   vpaddd	%xmm1, %xmm1, %xmm0
-# CHECK-NEXT: [8,2]     .    .D================================================eER  .  .   vpaddd	%xmm0, %xmm0, %xmm3
-# CHECK-NEXT: [9,0]     .    .D================================================eeeeeER .   vpmuldq	%xmm0, %xmm0, %xmm1
-# CHECK-NEXT: [9,1]     .    . D====================================================eER.   vpaddd	%xmm1, %xmm1, %xmm0
-# CHECK-NEXT: [9,2]     .    . D=====================================================eER   vpaddd	%xmm0, %xmm0, %xmm3
+# CHECK:      [0,0]     DeeeeER   .    .    .    .    .    .    .    .    .    .    .   .   vpmuldq	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [0,1]     D====eeER .    .    .    .    .    .    .    .    .    .    .   .   vpaddd	%xmm1, %xmm1, %xmm0
+# CHECK-NEXT: [0,2]     D======eeER    .    .    .    .    .    .    .    .    .    .   .   vpaddd	%xmm0, %xmm0, %xmm3
+# CHECK-NEXT: [1,0]     D======eeeeER  .    .    .    .    .    .    .    .    .    .   .   vpmuldq	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [1,1]     .D=========eeER.    .    .    .    .    .    .    .    .    .   .   vpaddd	%xmm1, %xmm1, %xmm0
+# CHECK-NEXT: [1,2]     .D===========eeER   .    .    .    .    .    .    .    .    .   .   vpaddd	%xmm0, %xmm0, %xmm3
+# CHECK-NEXT: [2,0]     .D===========eeeeER .    .    .    .    .    .    .    .    .   .   vpmuldq	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [2,1]     .D===============eeER    .    .    .    .    .    .    .    .   .   vpaddd	%xmm1, %xmm1, %xmm0
+# CHECK-NEXT: [2,2]     . D================eeER  .    .    .    .    .    .    .    .   .   vpaddd	%xmm0, %xmm0, %xmm3
+# CHECK-NEXT: [3,0]     . D================eeeeER.    .    .    .    .    .    .    .   .   vpmuldq	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [3,1]     . D====================eeER   .    .    .    .    .    .    .   .   vpaddd	%xmm1, %xmm1, %xmm0
+# CHECK-NEXT: [3,2]     . D======================eeER .    .    .    .    .    .    .   .   vpaddd	%xmm0, %xmm0, %xmm3
+# CHECK-NEXT: [4,0]     .  D=====================eeeeER    .    .    .    .    .    .   .   vpmuldq	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [4,1]     .  D=========================eeER  .    .    .    .    .    .   .   vpaddd	%xmm1, %xmm1, %xmm0
+# CHECK-NEXT: [4,2]     .  D===========================eeER.    .    .    .    .    .   .   vpaddd	%xmm0, %xmm0, %xmm3
+# CHECK-NEXT: [5,0]     .  D===========================eeeeER   .    .    .    .    .   .   vpmuldq	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [5,1]     .   D==============================eeER .    .    .    .    .   .   vpaddd	%xmm1, %xmm1, %xmm0
+# CHECK-NEXT: [5,2]     .   D================================eeER    .    .    .    .   .   vpaddd	%xmm0, %xmm0, %xmm3
+# CHECK-NEXT: [6,0]     .   D================================eeeeER  .    .    .    .   .   vpmuldq	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [6,1]     .   D====================================eeER.    .    .    .   .   vpaddd	%xmm1, %xmm1, %xmm0
+# CHECK-NEXT: [6,2]     .    D=====================================eeER   .    .    .   .   vpaddd	%xmm0, %xmm0, %xmm3
+# CHECK-NEXT: [7,0]     .    D=====================================eeeeER .    .    .   .   vpmuldq	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [7,1]     .    D=========================================eeER    .    .   .   vpaddd	%xmm1, %xmm1, %xmm0
+# CHECK-NEXT: [7,2]     .    D===========================================eeER  .    .   .   vpaddd	%xmm0, %xmm0, %xmm3
+# CHECK-NEXT: [8,0]     .    .D==========================================eeeeER.    .   .   vpmuldq	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [8,1]     .    .D==============================================eeER   .   .   vpaddd	%xmm1, %xmm1, %xmm0
+# CHECK-NEXT: [8,2]     .    .D================================================eeER .   .   vpaddd	%xmm0, %xmm0, %xmm3
+# CHECK-NEXT: [9,0]     .    .D================================================eeeeER   .   vpmuldq	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [9,1]     .    . D===================================================eeER .   vpaddd	%xmm1, %xmm1, %xmm0
+# CHECK-NEXT: [9,2]     .    . D=====================================================eeER   vpaddd	%xmm0, %xmm0, %xmm3
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -91,5 +103,5 @@ vpaddd %xmm0, %xmm0, %xmm3
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     10    25.0   0.1    0.0       vpmuldq	%xmm0, %xmm0, %xmm1
-# CHECK-NEXT: 1.     10    29.7   0.0    0.0       vpaddd	%xmm1, %xmm1, %xmm0
+# CHECK-NEXT: 1.     10    28.7   0.0    0.0       vpaddd	%xmm1, %xmm1, %xmm0
 # CHECK-NEXT: 2.     10    30.5   0.0    0.0       vpaddd	%xmm0, %xmm0, %xmm3
diff --git a/test/tools/llvm-mca/X86/BdVer2/dot-product.s b/test/tools/llvm-mca/X86/BdVer2/dot-product.s
index 079872dc2a5..d83cda27b0a 100644
--- a/test/tools/llvm-mca/X86/BdVer2/dot-product.s
+++ b/test/tools/llvm-mca/X86/BdVer2/dot-product.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=300 -timeline -timeline-max-iterations=3 < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=300 -timeline -timeline-max-iterations=3 < %s | FileCheck %s
 
 vmulps   %xmm0, %xmm1, %xmm2
 vhaddps  %xmm2, %xmm2, %xmm3
@@ -7,13 +7,13 @@ vhaddps  %xmm3, %xmm3, %xmm4
 
 # CHECK:      Iterations:        300
 # CHECK-NEXT: Instructions:      900
-# CHECK-NEXT: Total Cycles:      1211
+# CHECK-NEXT: Total Cycles:      627
 # CHECK-NEXT: Total uOps:        2100
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    1.73
-# CHECK-NEXT: IPC:               0.74
-# CHECK-NEXT: Block RThroughput: 4.0
+# CHECK-NEXT: uOps Per Cycle:    3.35
+# CHECK-NEXT: IPC:               1.44
+# CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps
@@ -25,42 +25,54 @@ vhaddps  %xmm3, %xmm3, %xmm4
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 # CHECK-NEXT:  1      5     1.00                        vmulps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  3      5     2.00                        vhaddps	%xmm2, %xmm2, %xmm3
-# CHECK-NEXT:  3      5     2.00                        vhaddps	%xmm3, %xmm3, %xmm4
+# CHECK-NEXT:  3      11    1.00                        vhaddps	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT:  3      11    1.00                        vhaddps	%xmm3, %xmm3, %xmm4
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -     1.00   2.00    -     4.00    -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.49   1.51    -      -      -      -     2.00   1.00    -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmulps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     2.00    -      -     vhaddps	%xmm2, %xmm2, %xmm3
-# CHECK-NEXT:  -      -      -     1.00    -     2.00    -      -     vhaddps	%xmm3, %xmm3, %xmm4
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vmulps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vhaddps	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.49   0.51    -      -      -      -     1.00    -      -      -      -      -     vhaddps	%xmm3, %xmm3, %xmm4
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeeER  .    .    . .   vmulps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: [0,1]     D=====eeeeeER  .    . .   vhaddps	%xmm2, %xmm2, %xmm3
-# CHECK-NEXT: [0,2]     .D==========eeeeeER . .   vhaddps	%xmm3, %xmm3, %xmm4
-# CHECK-NEXT: [1,0]     .DeeeeeE----------R . .   vmulps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: [1,1]     . D=====eeeeeE----R . .   vhaddps	%xmm2, %xmm2, %xmm3
-# CHECK-NEXT: [1,2]     .  D==========eeeeeER .   vhaddps	%xmm3, %xmm3, %xmm4
-# CHECK-NEXT: [2,0]     .  DeeeeeE----------R .   vmulps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: [2,1]     .   D=====eeeeeE----R .   vhaddps	%xmm2, %xmm2, %xmm3
-# CHECK-NEXT: [2,2]     .    D==========eeeeeER   vhaddps	%xmm3, %xmm3, %xmm4
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    . .   vmulps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: [0,1]     D=====eeeeeeeeeeeER .    .    . .   vhaddps	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: [0,2]     .D===============eeeeeeeeeeeER. .   vhaddps	%xmm3, %xmm3, %xmm4
+# CHECK-NEXT: [1,0]     .DeeeeeE---------------------R. .   vmulps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: [1,1]     . D====eeeeeeeeeeeE----------R. .   vhaddps	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: [1,2]     .  D==============eeeeeeeeeeeER .   vhaddps	%xmm3, %xmm3, %xmm4
+# CHECK-NEXT: [2,0]     .  DeeeeeE--------------------R .   vmulps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: [2,1]     .   D====eeeeeeeeeeeE---------R .   vhaddps	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: [2,2]     .    D==============eeeeeeeeeeeER   vhaddps	%xmm3, %xmm3, %xmm4
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -69,6 +81,6 @@ vhaddps  %xmm3, %xmm3, %xmm4
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     3     1.0    1.0    6.7       vmulps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 1.     3     6.0    0.7    2.7       vhaddps	%xmm2, %xmm2, %xmm3
-# CHECK-NEXT: 2.     3     11.0   1.0    0.0       vhaddps	%xmm3, %xmm3, %xmm4
+# CHECK-NEXT: 0.     3     1.0    1.0    13.7      vmulps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 1.     3     5.3    0.0    6.3       vhaddps	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: 2.     3     15.3   0.0    0.0       vhaddps	%xmm3, %xmm3, %xmm4
diff --git a/test/tools/llvm-mca/X86/BdVer2/hadd-read-after-ld-1.s b/test/tools/llvm-mca/X86/BdVer2/hadd-read-after-ld-1.s
index e5d5140242d..c2ea467ef54 100644
--- a/test/tools/llvm-mca/X86/BdVer2/hadd-read-after-ld-1.s
+++ b/test/tools/llvm-mca/X86/BdVer2/hadd-read-after-ld-1.s
@@ -1,18 +1,18 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s
 
 vshufps $0, %xmm0, %xmm1, %xmm1
 vhaddps (%rdi), %xmm1, %xmm2
 
 # CHECK:      Iterations:        1
 # CHECK-NEXT: Instructions:      2
-# CHECK-NEXT: Total Cycles:      15
+# CHECK-NEXT: Total Cycles:      20
 # CHECK-NEXT: Total uOps:        5
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    0.33
-# CHECK-NEXT: IPC:               0.13
-# CHECK-NEXT: Block RThroughput: 3.0
+# CHECK-NEXT: uOps Per Cycle:    0.25
+# CHECK-NEXT: IPC:               0.10
+# CHECK-NEXT: Block RThroughput: 1.3
 
 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps
@@ -23,15 +23,15 @@ vhaddps (%rdi), %xmm1, %xmm2
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      1     1.00                        vshufps	$0, %xmm0, %xmm1, %xmm1
-# CHECK-NEXT:  4      11    2.00    *                   vhaddps	(%rdi), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vshufps	$0, %xmm0, %xmm1, %xmm1
+# CHECK-NEXT:  4      16    1.00    *                   vhaddps	(%rdi), %xmm1, %xmm2
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     01234
+# CHECK-NEXT:                     0123456789
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeER .    .   .   vshufps	$0, %xmm0, %xmm1, %xmm1
-# CHECK-NEXT: [0,1]     .DeeeeeeeeeeeER   vhaddps	(%rdi), %xmm1, %xmm2
+# CHECK:      [0,0]     DeeER.    .    .   .   vshufps	$0, %xmm0, %xmm1, %xmm1
+# CHECK-NEXT: [0,1]     .DeeeeeeeeeeeeeeeeER   vhaddps	(%rdi), %xmm1, %xmm2
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
diff --git a/test/tools/llvm-mca/X86/BdVer2/hadd-read-after-ld-2.s b/test/tools/llvm-mca/X86/BdVer2/hadd-read-after-ld-2.s
index 08c256596f2..8988498705a 100644
--- a/test/tools/llvm-mca/X86/BdVer2/hadd-read-after-ld-2.s
+++ b/test/tools/llvm-mca/X86/BdVer2/hadd-read-after-ld-2.s
@@ -1,18 +1,18 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s
 
 vshufps $0, %xmm0, %xmm1, %xmm1
 vhaddps (%rdi), %ymm1, %ymm2
 
 # CHECK:      Iterations:        1
 # CHECK-NEXT: Instructions:      2
-# CHECK-NEXT: Total Cycles:      16
-# CHECK-NEXT: Total uOps:        5
+# CHECK-NEXT: Total Cycles:      20
+# CHECK-NEXT: Total uOps:        11
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    0.31
-# CHECK-NEXT: IPC:               0.13
-# CHECK-NEXT: Block RThroughput: 3.0
+# CHECK-NEXT: uOps Per Cycle:    0.55
+# CHECK-NEXT: IPC:               0.10
+# CHECK-NEXT: Block RThroughput: 2.8
 
 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps
@@ -23,15 +23,15 @@ vhaddps (%rdi), %ymm1, %ymm2
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      1     1.00                        vshufps	$0, %xmm0, %xmm1, %xmm1
-# CHECK-NEXT:  4      12    2.00    *                   vhaddps	(%rdi), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     0.50                        vshufps	$0, %xmm0, %xmm1, %xmm1
+# CHECK-NEXT:  10     16    2.00    *                   vhaddps	(%rdi), %ymm1, %ymm2
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012345
+# CHECK-NEXT:                     0123456789
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeER .    .    .   vshufps	$0, %xmm0, %xmm1, %xmm1
-# CHECK-NEXT: [0,1]     .DeeeeeeeeeeeeER   vhaddps	(%rdi), %ymm1, %ymm2
+# CHECK:      [0,0]     DeeER.    .    .   .   vshufps	$0, %xmm0, %xmm1, %xmm1
+# CHECK-NEXT: [0,1]     .DeeeeeeeeeeeeeeeeER   vhaddps	(%rdi), %ymm1, %ymm2
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
diff --git a/test/tools/llvm-mca/X86/BdVer2/instruction-info-view.s b/test/tools/llvm-mca/X86/BdVer2/instruction-info-view.s
index 6a92f84c35d..dfa9aaa6d89 100644
--- a/test/tools/llvm-mca/X86/BdVer2/instruction-info-view.s
+++ b/test/tools/llvm-mca/X86/BdVer2/instruction-info-view.s
@@ -1,8 +1,8 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -resource-pressure=false -instruction-info=true < %s | FileCheck %s --check-prefix=ENABLED
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -resource-pressure=false -instruction-info=false < %s | FileCheck %s -check-prefix=DISABLED
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -resource-pressure=false -instruction-info < %s | FileCheck %s -check-prefix=ENABLED
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -resource-pressure=false < %s | FileCheck %s -check-prefix=ENABLED
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -resource-pressure=false -instruction-info=true < %s | FileCheck %s --check-prefix=ENABLED
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -resource-pressure=false -instruction-info=false < %s | FileCheck %s -check-prefix=DISABLED
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -resource-pressure=false -instruction-info < %s | FileCheck %s -check-prefix=ENABLED
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -resource-pressure=false < %s | FileCheck %s -check-prefix=ENABLED
 
 vmulps   %xmm0, %xmm1, %xmm2
 vhaddps  %xmm2, %xmm2, %xmm3
@@ -13,14 +13,14 @@ vhaddps  %xmm3, %xmm3, %xmm4
 
 # ENABLED:       Iterations:        100
 # ENABLED-NEXT:  Instructions:      300
-# ENABLED-NEXT:  Total Cycles:      414
+# ENABLED-NEXT:  Total Cycles:      228
 # ENABLED-NEXT:  Total uOps:        700
 
 
 # ENABLED:       Dispatch Width:    4
-# ENABLED-NEXT:  uOps Per Cycle:    1.69
-# ENABLED-NEXT:  IPC:               0.72
-# ENABLED-NEXT:  Block RThroughput: 4.0
+# ENABLED-NEXT:  uOps Per Cycle:    3.07
+# ENABLED-NEXT:  IPC:               1.32
+# ENABLED-NEXT:  Block RThroughput: 2.0
 
 # ENABLED:       Instruction Info:
 # ENABLED-NEXT:  [1]: #uOps
@@ -32,5 +32,5 @@ vhaddps  %xmm3, %xmm3, %xmm4
 
 # ENABLED:       [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 # ENABLED-NEXT:   1      5     1.00                        vmulps	%xmm0, %xmm1, %xmm2
-# ENABLED-NEXT:   3      5     2.00                        vhaddps	%xmm2, %xmm2, %xmm3
-# ENABLED-NEXT:   3      5     2.00                        vhaddps	%xmm3, %xmm3, %xmm4
+# ENABLED-NEXT:   3      11    1.00                        vhaddps	%xmm2, %xmm2, %xmm3
+# ENABLED-NEXT:   3      11    1.00                        vhaddps	%xmm3, %xmm3, %xmm4
diff --git a/test/tools/llvm-mca/X86/BdVer2/load-store-alias.s b/test/tools/llvm-mca/X86/BdVer2/load-store-alias.s
index fd123844d38..90d0d392977 100644
--- a/test/tools/llvm-mca/X86/BdVer2/load-store-alias.s
+++ b/test/tools/llvm-mca/X86/BdVer2/load-store-alias.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=100 -timeline -timeline-max-iterations=1 -noalias=false < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=100 -timeline -timeline-max-iterations=1 -noalias=false < %s | FileCheck %s
 
 vmovaps (%rsi), %xmm0
 vmovaps %xmm0, (%rdi)
@@ -12,12 +12,12 @@ vmovaps %xmm0, 48(%rdi)
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      800
-# CHECK-NEXT: Total Cycles:      2803
+# CHECK-NEXT: Total Cycles:      2403
 # CHECK-NEXT: Total uOps:        800
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    0.29
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    0.33
+# CHECK-NEXT: IPC:               0.33
 # CHECK-NEXT: Block RThroughput: 4.0
 
 # CHECK:      Instruction Info:
@@ -29,52 +29,64 @@ vmovaps %xmm0, 48(%rdi)
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      6     0.50    *                   vmovaps	(%rsi), %xmm0
+# CHECK-NEXT:  1      5     0.50    *                   vmovaps	(%rsi), %xmm0
 # CHECK-NEXT:  1      1     1.00           *            vmovaps	%xmm0, (%rdi)
-# CHECK-NEXT:  1      6     0.50    *                   vmovaps	16(%rsi), %xmm0
+# CHECK-NEXT:  1      5     0.50    *                   vmovaps	16(%rsi), %xmm0
 # CHECK-NEXT:  1      1     1.00           *            vmovaps	%xmm0, 16(%rdi)
-# CHECK-NEXT:  1      6     0.50    *                   vmovaps	32(%rsi), %xmm0
+# CHECK-NEXT:  1      5     0.50    *                   vmovaps	32(%rsi), %xmm0
 # CHECK-NEXT:  1      1     1.00           *            vmovaps	%xmm0, 32(%rdi)
-# CHECK-NEXT:  1      6     0.50    *                   vmovaps	48(%rsi), %xmm0
+# CHECK-NEXT:  1      5     0.50    *                   vmovaps	48(%rsi), %xmm0
 # CHECK-NEXT:  1      1     1.00           *            vmovaps	%xmm0, 48(%rdi)
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -      -      -     4.00    -      -     8.00
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -     8.00    -      -      -      -      -      -      -     4.00    -      -      -     4.00   3.99   4.01    -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -     1.00   vmovaps	(%rsi), %xmm0
-# CHECK-NEXT:  -      -      -      -     1.00    -      -     1.00   vmovaps	%xmm0, (%rdi)
-# CHECK-NEXT:  -      -      -      -      -      -      -     1.00   vmovaps	16(%rsi), %xmm0
-# CHECK-NEXT:  -      -      -      -     1.00    -      -     1.00   vmovaps	%xmm0, 16(%rdi)
-# CHECK-NEXT:  -      -      -      -      -      -      -     1.00   vmovaps	32(%rsi), %xmm0
-# CHECK-NEXT:  -      -      -      -     1.00    -      -     1.00   vmovaps	%xmm0, 32(%rdi)
-# CHECK-NEXT:  -      -      -      -      -      -      -     1.00   vmovaps	48(%rsi), %xmm0
-# CHECK-NEXT:  -      -      -      -     1.00    -      -     1.00   vmovaps	%xmm0, 48(%rdi)
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     1.00    -      -      -      -     0.99   0.01    -      -      -      -     vmovaps	(%rsi), %xmm0
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovaps	%xmm0, (%rdi)
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     1.00    -      -      -      -     1.00    -      -      -      -      -     vmovaps	16(%rsi), %xmm0
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovaps	%xmm0, 16(%rdi)
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     1.00    -      -      -      -     1.00    -      -      -      -      -     vmovaps	32(%rsi), %xmm0
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovaps	%xmm0, 32(%rdi)
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     1.00    -      -      -      -     1.00    -      -      -      -      -     vmovaps	48(%rsi), %xmm0
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovaps	%xmm0, 48(%rdi)
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456
 
-# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .   vmovaps	(%rsi), %xmm0
-# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .   vmovaps	%xmm0, (%rdi)
-# CHECK-NEXT: [0,2]     D=======eeeeeeER    .    .    .   vmovaps	16(%rsi), %xmm0
-# CHECK-NEXT: [0,3]     D=============eER   .    .    .   vmovaps	%xmm0, 16(%rdi)
-# CHECK-NEXT: [0,4]     .D=============eeeeeeER  .    .   vmovaps	32(%rsi), %xmm0
-# CHECK-NEXT: [0,5]     .D===================eER .    .   vmovaps	%xmm0, 32(%rdi)
-# CHECK-NEXT: [0,6]     .D====================eeeeeeER.   vmovaps	48(%rsi), %xmm0
-# CHECK-NEXT: [0,7]     .D==========================eER   vmovaps	%xmm0, 48(%rdi)
+# CHECK:      [0,0]     DeeeeeER  .    .    .    ..   vmovaps	(%rsi), %xmm0
+# CHECK-NEXT: [0,1]     D=====eER .    .    .    ..   vmovaps	%xmm0, (%rdi)
+# CHECK-NEXT: [0,2]     D======eeeeeER .    .    ..   vmovaps	16(%rsi), %xmm0
+# CHECK-NEXT: [0,3]     D===========eER.    .    ..   vmovaps	%xmm0, 16(%rdi)
+# CHECK-NEXT: [0,4]     .D===========eeeeeER.    ..   vmovaps	32(%rsi), %xmm0
+# CHECK-NEXT: [0,5]     .D================eER    ..   vmovaps	%xmm0, 32(%rdi)
+# CHECK-NEXT: [0,6]     .D=================eeeeeER.   vmovaps	48(%rsi), %xmm0
+# CHECK-NEXT: [0,7]     .D======================eER   vmovaps	%xmm0, 48(%rdi)
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -84,10 +96,10 @@ vmovaps %xmm0, 48(%rdi)
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       vmovaps	(%rsi), %xmm0
-# CHECK-NEXT: 1.     1     7.0    0.0    0.0       vmovaps	%xmm0, (%rdi)
-# CHECK-NEXT: 2.     1     8.0    0.0    0.0       vmovaps	16(%rsi), %xmm0
-# CHECK-NEXT: 3.     1     14.0   0.0    0.0       vmovaps	%xmm0, 16(%rdi)
-# CHECK-NEXT: 4.     1     14.0   0.0    0.0       vmovaps	32(%rsi), %xmm0
-# CHECK-NEXT: 5.     1     20.0   0.0    0.0       vmovaps	%xmm0, 32(%rdi)
-# CHECK-NEXT: 6.     1     21.0   0.0    0.0       vmovaps	48(%rsi), %xmm0
-# CHECK-NEXT: 7.     1     27.0   0.0    0.0       vmovaps	%xmm0, 48(%rdi)
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       vmovaps	%xmm0, (%rdi)
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       vmovaps	16(%rsi), %xmm0
+# CHECK-NEXT: 3.     1     12.0   0.0    0.0       vmovaps	%xmm0, 16(%rdi)
+# CHECK-NEXT: 4.     1     12.0   0.0    0.0       vmovaps	32(%rsi), %xmm0
+# CHECK-NEXT: 5.     1     17.0   0.0    0.0       vmovaps	%xmm0, 32(%rdi)
+# CHECK-NEXT: 6.     1     18.0   0.0    0.0       vmovaps	48(%rsi), %xmm0
+# CHECK-NEXT: 7.     1     23.0   0.0    0.0       vmovaps	%xmm0, 48(%rdi)
diff --git a/test/tools/llvm-mca/X86/BdVer2/memcpy-like-test.s b/test/tools/llvm-mca/X86/BdVer2/memcpy-like-test.s
index 107262f9497..b69f77b3693 100644
--- a/test/tools/llvm-mca/X86/BdVer2/memcpy-like-test.s
+++ b/test/tools/llvm-mca/X86/BdVer2/memcpy-like-test.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=100 -timeline -timeline-max-iterations=1 < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=100 -timeline -timeline-max-iterations=1 < %s | FileCheck %s
 
 vmovaps (%rsi), %xmm0
 vmovaps %xmm0, (%rdi)
@@ -12,7 +12,7 @@ vmovaps %xmm0, 48(%rdi)
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      800
-# CHECK-NEXT: Total Cycles:      409
+# CHECK-NEXT: Total Cycles:      408
 # CHECK-NEXT: Total uOps:        800
 
 # CHECK:      Dispatch Width:    4
@@ -29,52 +29,64 @@ vmovaps %xmm0, 48(%rdi)
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      6     0.50    *                   vmovaps	(%rsi), %xmm0
+# CHECK-NEXT:  1      5     0.50    *                   vmovaps	(%rsi), %xmm0
 # CHECK-NEXT:  1      1     1.00           *            vmovaps	%xmm0, (%rdi)
-# CHECK-NEXT:  1      6     0.50    *                   vmovaps	16(%rsi), %xmm0
+# CHECK-NEXT:  1      5     0.50    *                   vmovaps	16(%rsi), %xmm0
 # CHECK-NEXT:  1      1     1.00           *            vmovaps	%xmm0, 16(%rdi)
-# CHECK-NEXT:  1      6     0.50    *                   vmovaps	32(%rsi), %xmm0
+# CHECK-NEXT:  1      5     0.50    *                   vmovaps	32(%rsi), %xmm0
 # CHECK-NEXT:  1      1     1.00           *            vmovaps	%xmm0, 32(%rdi)
-# CHECK-NEXT:  1      6     0.50    *                   vmovaps	48(%rsi), %xmm0
+# CHECK-NEXT:  1      5     0.50    *                   vmovaps	48(%rsi), %xmm0
 # CHECK-NEXT:  1      1     1.00           *            vmovaps	%xmm0, 48(%rdi)
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -      -      -     4.00    -     3.94   4.06
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 4.05   3.95    -      -      -      -      -      -     3.95   0.05    -      -      -     4.00   3.95   4.05    -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -     0.97   0.03   vmovaps	(%rsi), %xmm0
-# CHECK-NEXT:  -      -      -      -     1.00    -      -     1.00   vmovaps	%xmm0, (%rdi)
-# CHECK-NEXT:  -      -      -      -      -      -     0.03   0.97   vmovaps	16(%rsi), %xmm0
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.97   0.03   vmovaps	%xmm0, 16(%rdi)
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     vmovaps	32(%rsi), %xmm0
-# CHECK-NEXT:  -      -      -      -     1.00    -      -     1.00   vmovaps	%xmm0, 32(%rdi)
-# CHECK-NEXT:  -      -      -      -      -      -      -     1.00   vmovaps	48(%rsi), %xmm0
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.97   0.03   vmovaps	%xmm0, 48(%rdi)
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -     0.97   0.03    -      -      -      -     0.97   0.03    -      -      -      -     vmovaps	(%rsi), %xmm0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovaps	%xmm0, (%rdi)
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -     1.00    -      -      -      -      -     1.00    -      -      -      -      -     vmovaps	16(%rsi), %xmm0
+# CHECK-NEXT: 0.02   0.98    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovaps	%xmm0, 16(%rdi)
+# CHECK-NEXT: 0.02   0.98    -      -      -      -      -      -     1.00    -      -      -      -      -     0.98   0.02    -      -      -      -     vmovaps	32(%rsi), %xmm0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovaps	%xmm0, 32(%rdi)
+# CHECK-NEXT: 0.98   0.02    -      -      -      -      -      -     0.98   0.02    -      -      -      -     1.00    -      -      -      -      -     vmovaps	48(%rsi), %xmm0
+# CHECK-NEXT: 0.03   0.97    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovaps	%xmm0, 48(%rdi)
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012
+# CHECK-NEXT:                     01
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER . .   vmovaps	(%rsi), %xmm0
-# CHECK-NEXT: [0,1]     D======eER. .   vmovaps	%xmm0, (%rdi)
-# CHECK-NEXT: [0,2]     DeeeeeeE-R. .   vmovaps	16(%rsi), %xmm0
-# CHECK-NEXT: [0,3]     D=======eER .   vmovaps	%xmm0, 16(%rdi)
-# CHECK-NEXT: [0,4]     .DeeeeeeE-R .   vmovaps	32(%rsi), %xmm0
-# CHECK-NEXT: [0,5]     .D=======eER.   vmovaps	%xmm0, 32(%rdi)
-# CHECK-NEXT: [0,6]     .DeeeeeeE--R.   vmovaps	48(%rsi), %xmm0
-# CHECK-NEXT: [0,7]     .D========eER   vmovaps	%xmm0, 48(%rdi)
+# CHECK:      [0,0]     DeeeeeER  ..   vmovaps	(%rsi), %xmm0
+# CHECK-NEXT: [0,1]     D=====eER ..   vmovaps	%xmm0, (%rdi)
+# CHECK-NEXT: [0,2]     DeeeeeE-R ..   vmovaps	16(%rsi), %xmm0
+# CHECK-NEXT: [0,3]     D======eER..   vmovaps	%xmm0, 16(%rdi)
+# CHECK-NEXT: [0,4]     .DeeeeeE-R..   vmovaps	32(%rsi), %xmm0
+# CHECK-NEXT: [0,5]     .D======eER.   vmovaps	%xmm0, 32(%rdi)
+# CHECK-NEXT: [0,6]     .DeeeeeE--R.   vmovaps	48(%rsi), %xmm0
+# CHECK-NEXT: [0,7]     .D=======eER   vmovaps	%xmm0, 48(%rdi)
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -84,10 +96,10 @@ vmovaps %xmm0, 48(%rdi)
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       vmovaps	(%rsi), %xmm0
-# CHECK-NEXT: 1.     1     7.0    0.0    0.0       vmovaps	%xmm0, (%rdi)
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       vmovaps	%xmm0, (%rdi)
 # CHECK-NEXT: 2.     1     1.0    1.0    1.0       vmovaps	16(%rsi), %xmm0
-# CHECK-NEXT: 3.     1     8.0    0.0    0.0       vmovaps	%xmm0, 16(%rdi)
+# CHECK-NEXT: 3.     1     7.0    0.0    0.0       vmovaps	%xmm0, 16(%rdi)
 # CHECK-NEXT: 4.     1     1.0    1.0    1.0       vmovaps	32(%rsi), %xmm0
-# CHECK-NEXT: 5.     1     8.0    0.0    0.0       vmovaps	%xmm0, 32(%rdi)
+# CHECK-NEXT: 5.     1     7.0    0.0    0.0       vmovaps	%xmm0, 32(%rdi)
 # CHECK-NEXT: 6.     1     1.0    1.0    2.0       vmovaps	48(%rsi), %xmm0
-# CHECK-NEXT: 7.     1     9.0    0.0    0.0       vmovaps	%xmm0, 48(%rdi)
+# CHECK-NEXT: 7.     1     8.0    0.0    0.0       vmovaps	%xmm0, 48(%rdi)
diff --git a/test/tools/llvm-mca/X86/BdVer2/one-idioms.s b/test/tools/llvm-mca/X86/BdVer2/one-idioms.s
index 599f0a01548..c2e0debcf35 100644
--- a/test/tools/llvm-mca/X86/BdVer2/one-idioms.s
+++ b/test/tools/llvm-mca/X86/BdVer2/one-idioms.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -timeline -timeline-max-iterations=1 -register-file-stats < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -timeline -timeline-max-iterations=1 -register-file-stats < %s | FileCheck %s
 
 # These are dependency-breaking one-idioms.
 # Much like zero-idioms, but they produce ones, and do consume resources.
@@ -29,13 +29,13 @@ vpcmpeqw  %xmm3, %xmm3, %xmm5
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1500
-# CHECK-NEXT: Total Cycles:      903
+# CHECK-NEXT: Total Cycles:      754
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    1.66
-# CHECK-NEXT: IPC:               1.66
-# CHECK-NEXT: Block RThroughput: 6.0
+# CHECK-NEXT: uOps Per Cycle:    1.99
+# CHECK-NEXT: IPC:               1.99
+# CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps
@@ -46,77 +46,99 @@ vpcmpeqw  %xmm3, %xmm3, %xmm5
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      3     1.00                        pcmpeqb	%mm2, %mm2
-# CHECK-NEXT:  1      3     1.00                        pcmpeqd	%mm2, %mm2
-# CHECK-NEXT:  1      3     1.00                        pcmpeqw	%mm2, %mm2
-# CHECK-NEXT:  1      1     0.50                        pcmpeqb	%xmm2, %xmm2
-# CHECK-NEXT:  1      1     0.50                        pcmpeqd	%xmm2, %xmm2
-# CHECK-NEXT:  1      1     0.50                        pcmpeqq	%xmm2, %xmm2
-# CHECK-NEXT:  1      1     0.50                        pcmpeqw	%xmm2, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpcmpeqb	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  1      1     0.50                        vpcmpeqd	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  1      1     0.50                        vpcmpeqq	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  1      1     0.50                        vpcmpeqw	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  1      1     0.50                        vpcmpeqb	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  1      1     0.50                        vpcmpeqd	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  1      1     0.50                        vpcmpeqq	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  1      1     0.50                        vpcmpeqw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  1      2     0.50                        pcmpeqb	%mm2, %mm2
+# CHECK-NEXT:  1      2     0.50                        pcmpeqd	%mm2, %mm2
+# CHECK-NEXT:  1      2     0.50                        pcmpeqw	%mm2, %mm2
+# CHECK-NEXT:  1      2     0.50                        pcmpeqb	%xmm2, %xmm2
+# CHECK-NEXT:  1      2     0.50                        pcmpeqd	%xmm2, %xmm2
+# CHECK-NEXT:  1      2     0.50                        pcmpeqq	%xmm2, %xmm2
+# CHECK-NEXT:  1      2     0.50                        pcmpeqw	%xmm2, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpcmpeqb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      2     0.50                        vpcmpeqd	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      2     0.50                        vpcmpeqq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      2     0.50                        vpcmpeqw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      2     0.50                        vpcmpeqb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  1      2     0.50                        vpcmpeqd	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  1      2     0.50                        vpcmpeqq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  1      2     0.50                        vpcmpeqw	%xmm3, %xmm3, %xmm5
 
 # CHECK:      Register File statistics:
 # CHECK-NEXT: Total number of mappings created:    1500
-# CHECK-NEXT: Max number of mappings used:         168
+# CHECK-NEXT: Max number of mappings used:         72
+
+# CHECK:      *  Register File #1 -- PdFpuPRF:
+# CHECK-NEXT:    Number of physical registers:     160
+# CHECK-NEXT:    Total number of mappings created: 1500
+# CHECK-NEXT:    Max number of mappings used:      72
+
+# CHECK:      *  Register File #2 -- PdIntegerPRF:
+# CHECK-NEXT:    Number of physical registers:     96
+# CHECK-NEXT:    Total number of mappings created: 0
+# CHECK-NEXT:    Max number of mappings used:      0
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -      -     7.65    -     7.35    -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     7.50   7.50    -      -     7.50   7.50    -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pcmpeqb	%mm2, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pcmpeqd	%mm2, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pcmpeqw	%mm2, %mm2
-# CHECK-NEXT:  -      -      -     0.75    -     0.25    -      -     pcmpeqb	%xmm2, %xmm2
-# CHECK-NEXT:  -      -      -     0.49    -     0.51    -      -     pcmpeqd	%xmm2, %xmm2
-# CHECK-NEXT:  -      -      -     0.64    -     0.36    -      -     pcmpeqq	%xmm2, %xmm2
-# CHECK-NEXT:  -      -      -     0.21    -     0.79    -      -     pcmpeqw	%xmm2, %xmm2
-# CHECK-NEXT:  -      -      -     0.44    -     0.56    -      -     vpcmpeqb	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  -      -      -     0.26    -     0.74    -      -     vpcmpeqd	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  -      -      -     0.25    -     0.75    -      -     vpcmpeqq	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpcmpeqw	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  -      -      -     0.25    -     0.75    -      -     vpcmpeqb	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  -      -      -     0.55    -     0.45    -      -     vpcmpeqd	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  -      -      -     0.44    -     0.56    -      -     vpcmpeqq	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  -      -      -     0.37    -     0.63    -      -     vpcmpeqw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqb	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqd	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqw	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqb	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqd	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -      -     1.00    -      -      -      -     pcmpeqq	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqw	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     1.00    -      -      -      -      -     vpcmpeqd	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqd	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqw	%xmm3, %xmm3, %xmm5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     01
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeER    ..   pcmpeqb	%mm2, %mm2
-# CHECK-NEXT: [0,1]     D===eeeER ..   pcmpeqd	%mm2, %mm2
-# CHECK-NEXT: [0,2]     D======eeeER   pcmpeqw	%mm2, %mm2
-# CHECK-NEXT: [0,3]     DeE--------R   pcmpeqb	%xmm2, %xmm2
-# CHECK-NEXT: [0,4]     .DeE-------R   pcmpeqd	%xmm2, %xmm2
-# CHECK-NEXT: [0,5]     .D=eE------R   pcmpeqq	%xmm2, %xmm2
-# CHECK-NEXT: [0,6]     .D==eE-----R   pcmpeqw	%xmm2, %xmm2
-# CHECK-NEXT: [0,7]     .DeE-------R   vpcmpeqb	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,8]     . DeE------R   vpcmpeqd	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,9]     . D==eE----R   vpcmpeqq	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,10]    . D===eE---R   vpcmpeqw	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,11]    . D====eE--R   vpcmpeqb	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,12]    .  D====eE-R   vpcmpeqd	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,13]    .  D====eE-R   vpcmpeqq	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,14]    .  D=====eER   vpcmpeqw	%xmm3, %xmm3, %xmm5
+# CHECK:      [0,0]     DeeER.    ..   pcmpeqb	%mm2, %mm2
+# CHECK-NEXT: [0,1]     DeeER.    ..   pcmpeqd	%mm2, %mm2
+# CHECK-NEXT: [0,2]     D=eeER    ..   pcmpeqw	%mm2, %mm2
+# CHECK-NEXT: [0,3]     D==eeER   ..   pcmpeqb	%xmm2, %xmm2
+# CHECK-NEXT: [0,4]     .DeeE-R   ..   pcmpeqd	%xmm2, %xmm2
+# CHECK-NEXT: [0,5]     .D==eeER  ..   pcmpeqq	%xmm2, %xmm2
+# CHECK-NEXT: [0,6]     .D=eeE-R  ..   pcmpeqw	%xmm2, %xmm2
+# CHECK-NEXT: [0,7]     .D===eeER ..   vpcmpeqb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,8]     . D=eeE-R ..   vpcmpeqd	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,9]     . D===eeER..   vpcmpeqq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,10]    . D==eeE-R..   vpcmpeqw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,11]    . D===eeER..   vpcmpeqb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,12]    .  D===eeER.   vpcmpeqd	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,13]    .  D===eeER.   vpcmpeqq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,14]    .  D====eeER   vpcmpeqw	%xmm3, %xmm3, %xmm5
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -126,17 +148,17 @@ vpcmpeqw  %xmm3, %xmm3, %xmm5
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       pcmpeqb	%mm2, %mm2
-# CHECK-NEXT: 1.     1     4.0    0.0    0.0       pcmpeqd	%mm2, %mm2
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       pcmpeqw	%mm2, %mm2
-# CHECK-NEXT: 3.     1     1.0    1.0    8.0       pcmpeqb	%xmm2, %xmm2
-# CHECK-NEXT: 4.     1     1.0    0.0    7.0       pcmpeqd	%xmm2, %xmm2
-# CHECK-NEXT: 5.     1     2.0    0.0    6.0       pcmpeqq	%xmm2, %xmm2
-# CHECK-NEXT: 6.     1     3.0    0.0    5.0       pcmpeqw	%xmm2, %xmm2
-# CHECK-NEXT: 7.     1     1.0    1.0    7.0       vpcmpeqb	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 8.     1     1.0    0.0    6.0       vpcmpeqd	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 9.     1     3.0    1.0    4.0       vpcmpeqq	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 10.    1     4.0    0.0    3.0       vpcmpeqw	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 11.    1     5.0    0.0    2.0       vpcmpeqb	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 12.    1     5.0    1.0    1.0       vpcmpeqd	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 13.    1     5.0    1.0    1.0       vpcmpeqq	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 14.    1     6.0    2.0    0.0       vpcmpeqw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 1.     1     1.0    1.0    0.0       pcmpeqd	%mm2, %mm2
+# CHECK-NEXT: 2.     1     2.0    2.0    0.0       pcmpeqw	%mm2, %mm2
+# CHECK-NEXT: 3.     1     3.0    3.0    0.0       pcmpeqb	%xmm2, %xmm2
+# CHECK-NEXT: 4.     1     1.0    1.0    1.0       pcmpeqd	%xmm2, %xmm2
+# CHECK-NEXT: 5.     1     3.0    0.0    0.0       pcmpeqq	%xmm2, %xmm2
+# CHECK-NEXT: 6.     1     2.0    2.0    1.0       pcmpeqw	%xmm2, %xmm2
+# CHECK-NEXT: 7.     1     4.0    4.0    0.0       vpcmpeqb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 8.     1     2.0    2.0    1.0       vpcmpeqd	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 9.     1     4.0    0.0    0.0       vpcmpeqq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 10.    1     3.0    3.0    1.0       vpcmpeqw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 11.    1     4.0    4.0    0.0       vpcmpeqb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 12.    1     4.0    4.0    0.0       vpcmpeqd	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 13.    1     4.0    0.0    0.0       vpcmpeqq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 14.    1     5.0    5.0    0.0       vpcmpeqw	%xmm3, %xmm3, %xmm5
diff --git a/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-2.s b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-2.s
index 91ecc93c880..e5dcf7d761f 100644
--- a/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-2.s
+++ b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-2.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1 -resource-pressure=false -timeline < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1 -resource-pressure=false -timeline < %s | FileCheck %s
 
 imul   %rax, %rbx
 lzcnt  %ax,  %bx
@@ -7,13 +7,13 @@ add    %ecx, %ebx
 
 # CHECK:      Iterations:        1
 # CHECK-NEXT: Instructions:      3
-# CHECK-NEXT: Total Cycles:      8
-# CHECK-NEXT: Total uOps:        3
+# CHECK-NEXT: Total Cycles:      11
+# CHECK-NEXT: Total uOps:        4
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    0.38
-# CHECK-NEXT: IPC:               0.38
-# CHECK-NEXT: Block RThroughput: 2.0
+# CHECK-NEXT: uOps Per Cycle:    0.36
+# CHECK-NEXT: IPC:               0.27
+# CHECK-NEXT: Block RThroughput: 4.0
 
 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps
@@ -24,16 +24,17 @@ add    %ecx, %ebx
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      3     1.00                        imulq	%rax, %rbx
-# CHECK-NEXT:  1      3     1.00                        lzcntw	%ax, %bx
-# CHECK-NEXT:  1      1     0.33                        addl	%ecx, %ebx
+# CHECK-NEXT:  1      6     4.00                        imulq	%rax, %rbx
+# CHECK-NEXT:  2      2     0.50                        lzcntw	%ax, %bx
+# CHECK-NEXT:  1      1     0.50                        addl	%ecx, %ebx
 
 # CHECK:      Timeline view:
-# CHECK-NEXT: Index     01234567
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeER .   imulq	%rax, %rbx
-# CHECK-NEXT: [0,1]     D=eeeER.   lzcntw	%ax, %bx
-# CHECK-NEXT: [0,2]     D====eER   addl	%ecx, %ebx
+# CHECK:      [0,0]     DeeeeeeER .   imulq	%rax, %rbx
+# CHECK-NEXT: [0,1]     D=====eeER.   lzcntw	%ax, %bx
+# CHECK-NEXT: [0,2]     D=======eER   addl	%ecx, %ebx
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -43,5 +44,5 @@ add    %ecx, %ebx
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       imulq	%rax, %rbx
-# CHECK-NEXT: 1.     1     2.0    2.0    0.0       lzcntw	%ax, %bx
-# CHECK-NEXT: 2.     1     5.0    0.0    0.0       addl	%ecx, %ebx
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       lzcntw	%ax, %bx
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       addl	%ecx, %ebx
diff --git a/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-3.s b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-3.s
index d35a195bf35..4aad4729a5f 100644
--- a/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-3.s
+++ b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-3.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1500 -timeline -timeline-max-iterations=3 < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1500 -timeline -timeline-max-iterations=3 < %s | FileCheck %s
 
 # perf stat reports a throughput of 1.00 IPC for this code snippet.
 
@@ -12,13 +12,13 @@ xor %bx, %dx
 
 # CHECK:      Iterations:        1500
 # CHECK-NEXT: Instructions:      4500
-# CHECK-NEXT: Total Cycles:      1504
+# CHECK-NEXT: Total Cycles:      4503
 # CHECK-NEXT: Total uOps:        4500
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    2.99
-# CHECK-NEXT: IPC:               2.99
-# CHECK-NEXT: Block RThroughput: 1.0
+# CHECK-NEXT: uOps Per Cycle:    1.00
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 1.5
 
 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps
@@ -29,42 +29,55 @@ xor %bx, %dx
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      1     0.33                        addw	%cx, %dx
-# CHECK-NEXT:  1      1     0.33                        movw	%ax, %dx
-# CHECK-NEXT:  1      1     0.33                        xorw	%bx, %dx
+# CHECK-NEXT:  1      1     0.50                        addw	%cx, %dx
+# CHECK-NEXT:  1      1     0.50                        movw	%ax, %dx
+# CHECK-NEXT:  1      1     0.50                        xorw	%bx, %dx
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -     1.50   1.50    -      -      -      -      -      -      -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -     0.67    -      -     0.33    -      -     addw	%cx, %dx
-# CHECK-NEXT:  -      -      -     0.67    -     0.33    -      -     movw	%ax, %dx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     xorw	%bx, %dx
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addw	%cx, %dx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movw	%ax, %dx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorw	%bx, %dx
 
 # CHECK:      Timeline view:
-# CHECK-NEXT: Index     0123456
+# CHECK-NEXT:                     01
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeER ..   addw	%cx, %dx
-# CHECK-NEXT: [0,1]     DeER ..   movw	%ax, %dx
-# CHECK-NEXT: [0,2]     D=eER..   xorw	%bx, %dx
-# CHECK-NEXT: [1,0]     D==eER.   addw	%cx, %dx
-# CHECK-NEXT: [1,1]     .DeE-R.   movw	%ax, %dx
-# CHECK-NEXT: [1,2]     .D=eER.   xorw	%bx, %dx
-# CHECK-NEXT: [2,0]     .D==eER   addw	%cx, %dx
-# CHECK-NEXT: [2,1]     .DeE--R   movw	%ax, %dx
-# CHECK-NEXT: [2,2]     . DeE-R   xorw	%bx, %dx
+# CHECK:      [0,0]     DeER .    ..   addw	%cx, %dx
+# CHECK-NEXT: [0,1]     D=eER.    ..   movw	%ax, %dx
+# CHECK-NEXT: [0,2]     D==eER    ..   xorw	%bx, %dx
+# CHECK-NEXT: [1,0]     D===eER   ..   addw	%cx, %dx
+# CHECK-NEXT: [1,1]     .D===eER  ..   movw	%ax, %dx
+# CHECK-NEXT: [1,2]     .D====eER ..   xorw	%bx, %dx
+# CHECK-NEXT: [2,0]     .D=====eER..   addw	%cx, %dx
+# CHECK-NEXT: [2,1]     .D======eER.   movw	%ax, %dx
+# CHECK-NEXT: [2,2]     . D======eER   xorw	%bx, %dx
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -73,6 +86,6 @@ xor %bx, %dx
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     3     2.3    0.3    0.0       addw	%cx, %dx
-# CHECK-NEXT: 1.     3     1.0    1.0    1.0       movw	%ax, %dx
-# CHECK-NEXT: 2.     3     1.7    0.0    0.3       xorw	%bx, %dx
+# CHECK-NEXT: 0.     3     3.7    0.3    0.0       addw	%cx, %dx
+# CHECK-NEXT: 1.     3     4.3    0.0    0.0       movw	%ax, %dx
+# CHECK-NEXT: 2.     3     5.0    0.0    0.0       xorw	%bx, %dx
diff --git a/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-4.s b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-4.s
index 7cd4eb7b6ce..6194ecbb127 100644
--- a/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-4.s
+++ b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-4.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1500 -timeline -timeline-max-iterations=3 < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1500 -timeline -timeline-max-iterations=3 < %s | FileCheck %s
 
 # perf stat reports a throughput of 0.60 IPC for this code snippet.
 
@@ -12,13 +12,13 @@ add %cx, %bx
 
 # CHECK:      Iterations:        1500
 # CHECK-NEXT: Instructions:      4500
-# CHECK-NEXT: Total Cycles:      3005
-# CHECK-NEXT: Total uOps:        4500
+# CHECK-NEXT: Total Cycles:      9003
+# CHECK-NEXT: Total uOps:        6000
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    1.50
-# CHECK-NEXT: IPC:               1.50
-# CHECK-NEXT: Block RThroughput: 2.0
+# CHECK-NEXT: uOps Per Cycle:    0.67
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 1.0
 
 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps
@@ -29,43 +29,55 @@ add %cx, %bx
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      3     1.00                        imulw	%ax, %bx
-# CHECK-NEXT:  1      3     1.00                        lzcntw	%ax, %bx
-# CHECK-NEXT:  1      1     0.33                        addw	%cx, %bx
+# CHECK-NEXT:  1      4     1.00                        imulw	%ax, %bx
+# CHECK-NEXT:  2      2     0.50                        lzcntw	%ax, %bx
+# CHECK-NEXT:  1      1     0.50                        addw	%cx, %bx
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -     0.50   2.00    -     0.50    -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -     1.50   1.50    -      -      -      -      -      -      -      -      -      -      -      -     1.00
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     imulw	%ax, %bx
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     lzcntw	%ax, %bx
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     addw	%cx, %bx
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imulw	%ax, %bx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     lzcntw	%ax, %bx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addw	%cx, %bx
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     01
-# CHECK-NEXT: Index     0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeER    ..   imulw	%ax, %bx
-# CHECK-NEXT: [0,1]     D=eeeER   ..   lzcntw	%ax, %bx
-# CHECK-NEXT: [0,2]     D====eER  ..   addw	%cx, %bx
-# CHECK-NEXT: [1,0]     D=====eeeER.   imulw	%ax, %bx
-# CHECK-NEXT: [1,1]     .D=eeeE---R.   lzcntw	%ax, %bx
-# CHECK-NEXT: [1,2]     .D====eE--R.   addw	%cx, %bx
-# CHECK-NEXT: [2,0]     .D=====eeeER   imulw	%ax, %bx
-# CHECK-NEXT: [2,1]     .D==eeeE---R   lzcntw	%ax, %bx
-# CHECK-NEXT: [2,2]     . D====eE--R   addw	%cx, %bx
+# CHECK:      [0,0]     DeeeeER   .    .    .   imulw	%ax, %bx
+# CHECK-NEXT: [0,1]     D===eeER  .    .    .   lzcntw	%ax, %bx
+# CHECK-NEXT: [0,2]     D=====eER .    .    .   addw	%cx, %bx
+# CHECK-NEXT: [1,0]     .D=====eeeeER  .    .   imulw	%ax, %bx
+# CHECK-NEXT: [1,1]     .D========eeER .    .   lzcntw	%ax, %bx
+# CHECK-NEXT: [1,2]     .D==========eER.    .   addw	%cx, %bx
+# CHECK-NEXT: [2,0]     . D==========eeeeER .   imulw	%ax, %bx
+# CHECK-NEXT: [2,1]     . D=============eeER.   lzcntw	%ax, %bx
+# CHECK-NEXT: [2,2]     . D===============eER   addw	%cx, %bx
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -74,6 +86,6 @@ add %cx, %bx
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     3     4.3    0.3    0.0       imulw	%ax, %bx
-# CHECK-NEXT: 1.     3     2.3    2.3    2.0       lzcntw	%ax, %bx
-# CHECK-NEXT: 2.     3     5.0    0.0    1.3       addw	%cx, %bx
+# CHECK-NEXT: 0.     3     6.0    0.3    0.0       imulw	%ax, %bx
+# CHECK-NEXT: 1.     3     9.0    0.0    0.0       lzcntw	%ax, %bx
+# CHECK-NEXT: 2.     3     11.0   0.0    0.0       addw	%cx, %bx
diff --git a/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-5.s b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-5.s
index 87098f08642..ee892a4231f 100644
--- a/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-5.s
+++ b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-5.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1500 -timeline -timeline-max-iterations=3 < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1500 -timeline -timeline-max-iterations=3 < %s | FileCheck %s
 
 # perf stat reports a throughput of 1.00 IPC for this code snippet.
 
@@ -7,13 +7,13 @@ lzcnt %ax, %bx  ## partial register stall.
 
 # CHECK:      Iterations:        1500
 # CHECK-NEXT: Instructions:      1500
-# CHECK-NEXT: Total Cycles:      1505
-# CHECK-NEXT: Total uOps:        1500
+# CHECK-NEXT: Total Cycles:      1504
+# CHECK-NEXT: Total uOps:        3000
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    1.00
+# CHECK-NEXT: uOps Per Cycle:    1.99
 # CHECK-NEXT: IPC:               1.00
-# CHECK-NEXT: Block RThroughput: 1.0
+# CHECK-NEXT: Block RThroughput: 0.5
 
 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps
@@ -24,32 +24,44 @@ lzcnt %ax, %bx  ## partial register stall.
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      3     1.00                        lzcntw	%ax, %bx
+# CHECK-NEXT:  2      2     0.50                        lzcntw	%ax, %bx
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     lzcntw	%ax, %bx
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     lzcntw	%ax, %bx
 
 # CHECK:      Timeline view:
-# CHECK-NEXT: Index     01234567
+# CHECK-NEXT: Index     0123456
 
-# CHECK:      [0,0]     DeeeER .   lzcntw	%ax, %bx
-# CHECK-NEXT: [1,0]     D=eeeER.   lzcntw	%ax, %bx
-# CHECK-NEXT: [2,0]     D==eeeER   lzcntw	%ax, %bx
+# CHECK:      [0,0]     DeeER..   lzcntw	%ax, %bx
+# CHECK-NEXT: [1,0]     D=eeER.   lzcntw	%ax, %bx
+# CHECK-NEXT: [2,0]     .D=eeER   lzcntw	%ax, %bx
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -58,4 +70,4 @@ lzcnt %ax, %bx  ## partial register stall.
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     3     2.0    2.0    0.0       lzcntw	%ax, %bx
+# CHECK-NEXT: 0.     3     1.7    0.3    0.0       lzcntw	%ax, %bx
diff --git a/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-6.s b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-6.s
index 465c26c7968..8723744aaa6 100644
--- a/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-6.s
+++ b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-6.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1500 -timeline -timeline-max-iterations=3 < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1500 -timeline -timeline-max-iterations=3 < %s | FileCheck %s
 
 # perf stat reports a throughput of 0.60 IPC for this code snippet.
 # Each lzcnt has a false dependency on %ecx; the first lzcnt has to wait on the
@@ -13,13 +13,13 @@ lzcnt 2(%rsp), %cx
 
 # CHECK:      Iterations:        1500
 # CHECK-NEXT: Instructions:      4500
-# CHECK-NEXT: Total Cycles:      4510
+# CHECK-NEXT: Total Cycles:      10503
 # CHECK-NEXT: Total uOps:        7500
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    1.66
-# CHECK-NEXT: IPC:               1.00
-# CHECK-NEXT: Block RThroughput: 3.0
+# CHECK-NEXT: uOps Per Cycle:    0.71
+# CHECK-NEXT: IPC:               0.43
+# CHECK-NEXT: Block RThroughput: 1.3
 
 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps
@@ -30,43 +30,55 @@ lzcnt 2(%rsp), %cx
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      3     1.00                        imull	%edx, %ecx
-# CHECK-NEXT:  2      8     1.00    *                   lzcntw	(%rsp), %cx
-# CHECK-NEXT:  2      8     1.00    *                   lzcntw	2(%rsp), %cx
+# CHECK-NEXT:  1      4     1.00                        imull	%edx, %ecx
+# CHECK-NEXT:  2      6     0.50    *                   lzcntw	(%rsp), %cx
+# CHECK-NEXT:  2      6     0.50    *                   lzcntw	2(%rsp), %cx
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -      -     3.00    -      -      -     2.00
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -     2.00    -      -      -     1.50   1.50    -      -      -      -      -      -      -      -      -      -      -      -     1.00
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     imull	%edx, %ecx
-# CHECK-NEXT:  -      -      -     1.00    -      -      -     1.00   lzcntw	(%rsp), %cx
-# CHECK-NEXT:  -      -      -     1.00    -      -      -     1.00   lzcntw	2(%rsp), %cx
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imull	%edx, %ecx
+# CHECK-NEXT:  -     1.00    -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     lzcntw	(%rsp), %cx
+# CHECK-NEXT:  -     1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     lzcntw	2(%rsp), %cx
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012345678
-# CHECK-NEXT: Index     0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123
 
-# CHECK:      [0,0]     DeeeER    .    .  .   imull	%edx, %ecx
-# CHECK-NEXT: [0,1]     D=eeeeeeeeER   .  .   lzcntw	(%rsp), %cx
-# CHECK-NEXT: [0,2]     .D=eeeeeeeeER  .  .   lzcntw	2(%rsp), %cx
-# CHECK-NEXT: [1,0]     .D=========eeeER  .   imull	%edx, %ecx
-# CHECK-NEXT: [1,1]     . D=eeeeeeeeE--R  .   lzcntw	(%rsp), %cx
-# CHECK-NEXT: [1,2]     . D==eeeeeeeeE-R  .   lzcntw	2(%rsp), %cx
-# CHECK-NEXT: [2,0]     .  D==========eeeER   imull	%edx, %ecx
-# CHECK-NEXT: [2,1]     .  D==eeeeeeeeE---R   lzcntw	(%rsp), %cx
-# CHECK-NEXT: [2,2]     .   D==eeeeeeeeE--R   lzcntw	2(%rsp), %cx
+# CHECK:      [0,0]     DeeeeER   .    .    .  .   imull	%edx, %ecx
+# CHECK-NEXT: [0,1]     DeeeeeeER .    .    .  .   lzcntw	(%rsp), %cx
+# CHECK-NEXT: [0,2]     .DeeeeeeER.    .    .  .   lzcntw	2(%rsp), %cx
+# CHECK-NEXT: [1,0]     .D======eeeeER .    .  .   imull	%edx, %ecx
+# CHECK-NEXT: [1,1]     . D=====eeeeeeER    .  .   lzcntw	(%rsp), %cx
+# CHECK-NEXT: [1,2]     . D======eeeeeeER   .  .   lzcntw	2(%rsp), %cx
+# CHECK-NEXT: [2,0]     .  D===========eeeeER  .   imull	%edx, %ecx
+# CHECK-NEXT: [2,1]     .  D===========eeeeeeER.   lzcntw	(%rsp), %cx
+# CHECK-NEXT: [2,2]     .   D===========eeeeeeER   lzcntw	2(%rsp), %cx
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -75,6 +87,6 @@ lzcnt 2(%rsp), %cx
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     3     7.3    0.3    0.0       imull	%edx, %ecx
-# CHECK-NEXT: 1.     3     2.3    2.3    1.7       lzcntw	(%rsp), %cx
-# CHECK-NEXT: 2.     3     2.7    2.7    1.0       lzcntw	2(%rsp), %cx
+# CHECK-NEXT: 0.     3     6.7    0.3    0.0       imull	%edx, %ecx
+# CHECK-NEXT: 1.     3     6.3    0.0    0.0       lzcntw	(%rsp), %cx
+# CHECK-NEXT: 2.     3     6.7    0.0    0.0       lzcntw	2(%rsp), %cx
diff --git a/test/tools/llvm-mca/X86/BdVer2/partial-reg-update.s b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update.s
index 995bb35d3ac..6c1146b2224 100644
--- a/test/tools/llvm-mca/X86/BdVer2/partial-reg-update.s
+++ b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1 -resource-pressure=false -timeline < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1 -resource-pressure=false -timeline < %s | FileCheck %s
 
 imul %ax, %cx
 add  %al, %cl
@@ -7,12 +7,12 @@ add  %ecx, %ebx
 
 # CHECK:      Iterations:        1
 # CHECK-NEXT: Instructions:      3
-# CHECK-NEXT: Total Cycles:      8
+# CHECK-NEXT: Total Cycles:      9
 # CHECK-NEXT: Total uOps:        3
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    0.38
-# CHECK-NEXT: IPC:               0.38
+# CHECK-NEXT: uOps Per Cycle:    0.33
+# CHECK-NEXT: IPC:               0.33
 # CHECK-NEXT: Block RThroughput: 1.0
 
 # CHECK:      Instruction Info:
@@ -24,16 +24,16 @@ add  %ecx, %ebx
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      3     1.00                        imulw	%ax, %cx
-# CHECK-NEXT:  1      1     0.33                        addb	%al, %cl
-# CHECK-NEXT:  1      1     0.33                        addl	%ecx, %ebx
+# CHECK-NEXT:  1      4     1.00                        imulw	%ax, %cx
+# CHECK-NEXT:  1      1     0.50                        addb	%al, %cl
+# CHECK-NEXT:  1      1     0.50                        addl	%ecx, %ebx
 
 # CHECK:      Timeline view:
-# CHECK-NEXT: Index     01234567
+# CHECK-NEXT: Index     012345678
 
-# CHECK:      [0,0]     DeeeER .   imulw	%ax, %cx
-# CHECK-NEXT: [0,1]     D===eER.   addb	%al, %cl
-# CHECK-NEXT: [0,2]     D====eER   addl	%ecx, %ebx
+# CHECK:      [0,0]     DeeeeER .   imulw	%ax, %cx
+# CHECK-NEXT: [0,1]     D====eER.   addb	%al, %cl
+# CHECK-NEXT: [0,2]     D=====eER   addl	%ecx, %ebx
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -43,5 +43,5 @@ add  %ecx, %ebx
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       imulw	%ax, %cx
-# CHECK-NEXT: 1.     1     4.0    0.0    0.0       addb	%al, %cl
-# CHECK-NEXT: 2.     1     5.0    0.0    0.0       addl	%ecx, %ebx
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       addb	%al, %cl
+# CHECK-NEXT: 2.     1     6.0    0.0    0.0       addl	%ecx, %ebx
diff --git a/test/tools/llvm-mca/X86/BdVer2/pipes-fpu.s b/test/tools/llvm-mca/X86/BdVer2/pipes-fpu.s
index 9ca1d880673..86fee396350 100644
--- a/test/tools/llvm-mca/X86/BdVer2/pipes-fpu.s
+++ b/test/tools/llvm-mca/X86/BdVer2/pipes-fpu.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -timeline -timeline-max-iterations=2 < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -timeline -timeline-max-iterations=2 < %s | FileCheck %s
 
 # VALU0/VALU1
 vpmulld     %xmm0, %xmm1, %xmm2
@@ -19,13 +19,13 @@ vsqrtps     %ymm0, %ymm2
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      800
-# CHECK-NEXT: Total Cycles:      4256
-# CHECK-NEXT: Total uOps:        1000
+# CHECK-NEXT: Total Cycles:      3244
+# CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    0.23
-# CHECK-NEXT: IPC:               0.19
-# CHECK-NEXT: Block RThroughput: 42.0
+# CHECK-NEXT: uOps Per Cycle:    0.46
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 32.5
 
 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps
@@ -36,51 +36,72 @@ vsqrtps     %ymm0, %ymm2
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      5     1.00                        vpmulld	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.33                        vpand	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vcvttps2dq	%xmm0, %xmm2
-# CHECK-NEXT:  1      14    6.00                        vpclmulqdq	$0, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vaddps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      14    14.00                       vsqrtps	%xmm0, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  3      29    28.00                       vsqrtps	%ymm0, %ymm2
+# CHECK-NEXT:  1      5     2.00                        vpmulld	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpand	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      4     1.00                        vcvttps2dq	%xmm0, %xmm2
+# CHECK-NEXT:  6      13    1.00                        vpclmulqdq	$0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      9     10.50                       vsqrtps	%xmm0, %xmm2
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      9     21.00                       vsqrtps	%ymm0, %ymm2
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -     42.00  6.03   3.96    -     17.01   -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     32.71  32.29   -     2.00   3.00   1.00   6.00   6.00    -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmulld	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.05   0.06    -     0.89    -      -     vpand	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vcvttps2dq	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.98   0.90    -     15.12   -      -     vpclmulqdq	$0, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -     14.00  1.00    -      -      -      -      -     vsqrtps	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -     28.00  2.00    -      -     1.00    -      -     vsqrtps	%ymm0, %ymm2
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00   2.00    -     2.00   1.00    -      -      -      -     vpmulld	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -     vpand	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvttps2dq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpclmulqdq	$0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     10.71  10.29   -      -      -      -      -     1.00    -      -      -      -     vsqrtps	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     21.00  21.00   -      -      -      -      -     2.00    -      -      -      -     vsqrtps	%ymm0, %ymm2
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789          0123456789          012345678
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    ..   vpmulld	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: [0,1]     DeE----R  .    .    .    .    .    ..   vpand	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: [0,2]     DeeeE--R  .    .    .    .    .    ..   vcvttps2dq	%xmm0, %xmm2
-# CHECK-NEXT: [0,3]     D=eeeeeeeeeeeeeeER  .    .    .    ..   vpclmulqdq	$0, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: [0,4]     .DeeeE-----------R  .    .    .    ..   vaddps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: [0,5]     .DeeeeeeeeeeeeeeER  .    .    .    ..   vsqrtps	%xmm0, %xmm2
-# CHECK-NEXT: [0,6]     .D=eeeE----------R  .    .    .    ..   vaddps	%ymm0, %ymm1, %ymm2
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    .    .    .    .    .  .   vpmulld	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: [0,1]     D=eeE--R  .    .    .    .    .    .    .    .    .    .  .   vpand	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: [0,2]     D==eeeeER .    .    .    .    .    .    .    .    .    .  .   vcvttps2dq	%xmm0, %xmm2
+# CHECK-NEXT: [0,3]     .D=eeeeeeeeeeeeeER  .    .    .    .    .    .    .    .  .   vpclmulqdq	$0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: [0,4]     . D=eeeeeE-------R  .    .    .    .    .    .    .    .  .   vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: [0,5]     . D=eeeeeeeeeE---R  .    .    .    .    .    .    .    .  .   vsqrtps	%xmm0, %xmm2
+# CHECK-NEXT: [0,6]     .  D=eeeeeE------R  .    .    .    .    .    .    .    .  .   vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: [0,7]     .  D==eeeeeeeeeE--R .    .    .    .    .    .    .    .  .   vsqrtps	%ymm0, %ymm2
+# CHECK-NEXT: [1,0]     .   D===eeeeeE----R .    .    .    .    .    .    .    .  .   vpmulld	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: [1,1]     .   DeeE----------R .    .    .    .    .    .    .    .  .   vpand	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: [1,2]     .   D====eeeeE----R .    .    .    .    .    .    .    .  .   vcvttps2dq	%xmm0, %xmm2
+# CHECK-NEXT: [1,3]     .    D=eeeeeeeeeeeeeER   .    .    .    .    .    .    .  .   vpclmulqdq	$0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: [1,4]     .    .D==================eeeeeER   .    .    .    .    .  .   vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: [1,5]     .    .D===================eeeeeeeeeER   .    .    .    .  .   vsqrtps	%xmm0, %xmm2
+# CHECK-NEXT: [1,6]     .    . D=======================================eeeeeER .  .   vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: [1,7]     .    . D========================================eeeeeeeeeER   vsqrtps	%ymm0, %ymm2
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -89,11 +110,11 @@ vsqrtps     %ymm0, %ymm2
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     1.0    1.0    79.0      vpmulld	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 1.     2     1.0    1.0    82.5      vpand	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 2.     2     1.5    1.5    80.0      vcvttps2dq	%xmm0, %xmm2
-# CHECK-NEXT: 3.     2     1.5    1.5    74.0      vpclmulqdq	$0, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 4.     2     2.0    2.0    84.0      vaddps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 5.     2     9.5    9.5    65.0      vsqrtps	%xmm0, %xmm2
-# CHECK-NEXT: 6.     2     2.5    2.5    83.0      vaddps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 7.     2     147.5  147.5  0.0       vsqrtps	%ymm0, %ymm2
+# CHECK-NEXT: 0.     2     2.5    2.5    2.0       vpmulld	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 1.     2     1.5    1.5    6.0       vpand	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 2.     2     4.0    4.0    2.0       vcvttps2dq	%xmm0, %xmm2
+# CHECK-NEXT: 3.     2     2.0    2.0    0.0       vpclmulqdq	$0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 4.     2     10.5   10.5   3.5       vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 5.     2     11.0   11.0   1.5       vsqrtps	%xmm0, %xmm2
+# CHECK-NEXT: 6.     2     21.0   21.0   3.0       vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 7.     2     22.0   22.0   1.0       vsqrtps	%ymm0, %ymm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/pr37790.s b/test/tools/llvm-mca/X86/BdVer2/pr37790.s
index 2878b280a9c..2471c42e445 100644
--- a/test/tools/llvm-mca/X86/BdVer2/pr37790.s
+++ b/test/tools/llvm-mca/X86/BdVer2/pr37790.s
@@ -1,18 +1,18 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -lqueue=2 -iterations=2 -resource-pressure=false -timeline -timeline-max-cycles=104 < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -lqueue=2 -iterations=2 -resource-pressure=false -timeline -timeline-max-cycles=104 < %s | FileCheck %s
 
 int3
 stmxcsr (%rsp)
 
 # CHECK:      Iterations:        2
 # CHECK-NEXT: Instructions:      4
-# CHECK-NEXT: Total Cycles:      213
-# CHECK-NEXT: Total uOps:        10
+# CHECK-NEXT: Total Cycles:      205
+# CHECK-NEXT: Total uOps:        6
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    0.05
+# CHECK-NEXT: uOps Per Cycle:    0.03
 # CHECK-NEXT: IPC:               0.02
-# CHECK-NEXT: Block RThroughput: 1.3
+# CHECK-NEXT: Block RThroughput: 0.8
 
 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps
@@ -23,14 +23,15 @@ stmxcsr (%rsp)
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      100   0.33    *      *      U     int3
-# CHECK-NEXT:  4      5     1.00    *      *      U     stmxcsr	(%rsp)
+# CHECK-NEXT:  1      100   0.50    *      *      U     int3
+# CHECK-NEXT:  2      1     0.50    *      *      U     stmxcsr	(%rsp)
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456789          0123456789          0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123456789          0123456789          0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789          0123456789          0123456789          0123
 
-# CHECK:      [0,0]     DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER   int3
+# CHECK:      [0,0]     DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER.   int3
+# CHECK-NEXT: [0,1]     D====================================================================================================eER   stmxcsr	(%rsp)
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -39,5 +40,5 @@ stmxcsr (%rsp)
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     3.0    0.5    0.0       int3
-# CHECK-NEXT: 1.     2     100.0  0.0    0.0       stmxcsr	(%rsp)
+# CHECK-NEXT: 0.     2     1.0    0.5    0.0       int3
+# CHECK-NEXT: 1.     2     100.5  0.0    0.0       stmxcsr	(%rsp)
diff --git a/test/tools/llvm-mca/X86/BdVer2/rank.s b/test/tools/llvm-mca/X86/BdVer2/rank.s
index 24f8c43676e..87f7d527c03 100644
--- a/test/tools/llvm-mca/X86/BdVer2/rank.s
+++ b/test/tools/llvm-mca/X86/BdVer2/rank.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -timeline -timeline-max-iterations=3 < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -timeline -timeline-max-iterations=3 < %s | FileCheck %s
 
 add %eax, %ecx
 add %eax, %edx
@@ -12,13 +12,13 @@ add %ebx, %eax
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      800
-# CHECK-NEXT: Total Cycles:      403
+# CHECK-NEXT: Total Cycles:      503
 # CHECK-NEXT: Total uOps:        800
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    1.99
-# CHECK-NEXT: IPC:               1.99
-# CHECK-NEXT: Block RThroughput: 2.7
+# CHECK-NEXT: uOps Per Cycle:    1.59
+# CHECK-NEXT: IPC:               1.59
+# CHECK-NEXT: Block RThroughput: 4.0
 
 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps
@@ -29,68 +29,80 @@ add %ebx, %eax
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      1     0.33                        addl	%eax, %ecx
-# CHECK-NEXT:  1      1     0.33                        addl	%eax, %edx
-# CHECK-NEXT:  1      1     0.33                        addl	%eax, %ebx
-# CHECK-NEXT:  1      1     0.33                        addl	%edx, %esi
-# CHECK-NEXT:  1      1     0.33                        addl	%ebx, %eax
-# CHECK-NEXT:  1      1     0.33                        addl	%edx, %esi
-# CHECK-NEXT:  1      1     0.33                        addl	%ebx, %eax
-# CHECK-NEXT:  1      1     0.33                        addl	%ebx, %eax
+# CHECK-NEXT:  1      1     0.50                        addl	%eax, %ecx
+# CHECK-NEXT:  1      1     0.50                        addl	%eax, %edx
+# CHECK-NEXT:  1      1     0.50                        addl	%eax, %ebx
+# CHECK-NEXT:  1      1     0.50                        addl	%edx, %esi
+# CHECK-NEXT:  1      1     0.50                        addl	%ebx, %eax
+# CHECK-NEXT:  1      1     0.50                        addl	%edx, %esi
+# CHECK-NEXT:  1      1     0.50                        addl	%ebx, %eax
+# CHECK-NEXT:  1      1     0.50                        addl	%ebx, %eax
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -     2.66   2.67    -     2.67    -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -     4.00   4.00    -      -      -      -      -      -      -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.34    -      -     addl	%eax, %ecx
-# CHECK-NEXT:  -      -     0.33   0.34    -     0.33    -      -     addl	%eax, %edx
-# CHECK-NEXT:  -      -     0.34   0.33    -     0.33    -      -     addl	%eax, %ebx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.34    -      -     addl	%edx, %esi
-# CHECK-NEXT:  -      -     0.33   0.34    -     0.33    -      -     addl	%ebx, %eax
-# CHECK-NEXT:  -      -     0.34   0.33    -     0.33    -      -     addl	%edx, %esi
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.34    -      -     addl	%ebx, %eax
-# CHECK-NEXT:  -      -     0.33   0.34    -     0.33    -      -     addl	%ebx, %eax
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%eax, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.01   0.99    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%eax, %edx
+# CHECK-NEXT:  -      -      -      -      -     0.99   0.01    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%eax, %ebx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%edx, %esi
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%ebx, %eax
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%edx, %esi
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%ebx, %eax
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%ebx, %eax
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     01234
+# CHECK-NEXT:                     01234567
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeER .    .   .   addl	%eax, %ecx
-# CHECK-NEXT: [0,1]     DeER .    .   .   addl	%eax, %edx
-# CHECK-NEXT: [0,2]     DeER .    .   .   addl	%eax, %ebx
-# CHECK-NEXT: [0,3]     D=eER.    .   .   addl	%edx, %esi
-# CHECK-NEXT: [0,4]     .DeER.    .   .   addl	%ebx, %eax
-# CHECK-NEXT: [0,5]     .D=eER    .   .   addl	%edx, %esi
-# CHECK-NEXT: [0,6]     .D=eER    .   .   addl	%ebx, %eax
-# CHECK-NEXT: [0,7]     .D==eER   .   .   addl	%ebx, %eax
-# CHECK-NEXT: [1,0]     . D==eER  .   .   addl	%eax, %ecx
-# CHECK-NEXT: [1,1]     . D==eER  .   .   addl	%eax, %edx
-# CHECK-NEXT: [1,2]     . D==eER  .   .   addl	%eax, %ebx
-# CHECK-NEXT: [1,3]     . D===eER .   .   addl	%edx, %esi
-# CHECK-NEXT: [1,4]     .  D==eER .   .   addl	%ebx, %eax
-# CHECK-NEXT: [1,5]     .  D===eER.   .   addl	%edx, %esi
-# CHECK-NEXT: [1,6]     .  D===eER.   .   addl	%ebx, %eax
-# CHECK-NEXT: [1,7]     .  D====eER   .   addl	%ebx, %eax
-# CHECK-NEXT: [2,0]     .   D====eER  .   addl	%eax, %ecx
-# CHECK-NEXT: [2,1]     .   D====eER  .   addl	%eax, %edx
-# CHECK-NEXT: [2,2]     .   D====eER  .   addl	%eax, %ebx
-# CHECK-NEXT: [2,3]     .   D=====eER .   addl	%edx, %esi
-# CHECK-NEXT: [2,4]     .    D====eER .   addl	%ebx, %eax
-# CHECK-NEXT: [2,5]     .    D=====eER.   addl	%edx, %esi
-# CHECK-NEXT: [2,6]     .    D=====eER.   addl	%ebx, %eax
-# CHECK-NEXT: [2,7]     .    D======eER   addl	%ebx, %eax
+# CHECK:      [0,0]     DeER .    .    . .   addl	%eax, %ecx
+# CHECK-NEXT: [0,1]     DeER .    .    . .   addl	%eax, %edx
+# CHECK-NEXT: [0,2]     D=eER.    .    . .   addl	%eax, %ebx
+# CHECK-NEXT: [0,3]     D=eER.    .    . .   addl	%edx, %esi
+# CHECK-NEXT: [0,4]     .D=eER    .    . .   addl	%ebx, %eax
+# CHECK-NEXT: [0,5]     .D=eER    .    . .   addl	%edx, %esi
+# CHECK-NEXT: [0,6]     .D==eER   .    . .   addl	%ebx, %eax
+# CHECK-NEXT: [0,7]     .D===eER  .    . .   addl	%ebx, %eax
+# CHECK-NEXT: [1,0]     . D====eER.    . .   addl	%eax, %ecx
+# CHECK-NEXT: [1,1]     . D===eE-R.    . .   addl	%eax, %edx
+# CHECK-NEXT: [1,2]     . D===eE-R.    . .   addl	%eax, %ebx
+# CHECK-NEXT: [1,3]     . D====eER.    . .   addl	%edx, %esi
+# CHECK-NEXT: [1,4]     .  D====eER    . .   addl	%ebx, %eax
+# CHECK-NEXT: [1,5]     .  D====eER    . .   addl	%edx, %esi
+# CHECK-NEXT: [1,6]     .  D=====eER   . .   addl	%ebx, %eax
+# CHECK-NEXT: [1,7]     .  D======eER  . .   addl	%ebx, %eax
+# CHECK-NEXT: [2,0]     .   D=======eER. .   addl	%eax, %ecx
+# CHECK-NEXT: [2,1]     .   D======eE-R. .   addl	%eax, %edx
+# CHECK-NEXT: [2,2]     .   D======eE-R. .   addl	%eax, %ebx
+# CHECK-NEXT: [2,3]     .   D=======eER. .   addl	%edx, %esi
+# CHECK-NEXT: [2,4]     .    D=======eER .   addl	%ebx, %eax
+# CHECK-NEXT: [2,5]     .    D=======eER .   addl	%edx, %esi
+# CHECK-NEXT: [2,6]     .    D========eER.   addl	%ebx, %eax
+# CHECK-NEXT: [2,7]     .    D=========eER   addl	%ebx, %eax
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -99,11 +111,11 @@ add %ebx, %eax
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     3     3.0    0.3    0.0       addl	%eax, %ecx
-# CHECK-NEXT: 1.     3     3.0    0.3    0.0       addl	%eax, %edx
-# CHECK-NEXT: 2.     3     3.0    0.3    0.0       addl	%eax, %ebx
-# CHECK-NEXT: 3.     3     4.0    0.0    0.0       addl	%edx, %esi
-# CHECK-NEXT: 4.     3     3.0    0.0    0.0       addl	%ebx, %eax
-# CHECK-NEXT: 5.     3     4.0    0.0    0.0       addl	%edx, %esi
-# CHECK-NEXT: 6.     3     4.0    0.0    0.0       addl	%ebx, %eax
-# CHECK-NEXT: 7.     3     5.0    0.0    0.0       addl	%ebx, %eax
+# CHECK-NEXT: 0.     3     4.7    1.0    0.0       addl	%eax, %ecx
+# CHECK-NEXT: 1.     3     4.0    0.3    0.7       addl	%eax, %edx
+# CHECK-NEXT: 2.     3     4.3    0.7    0.7       addl	%eax, %ebx
+# CHECK-NEXT: 3.     3     5.0    0.0    0.0       addl	%edx, %esi
+# CHECK-NEXT: 4.     3     5.0    0.7    0.0       addl	%ebx, %eax
+# CHECK-NEXT: 5.     3     5.0    0.0    0.0       addl	%edx, %esi
+# CHECK-NEXT: 6.     3     6.0    0.0    0.0       addl	%ebx, %eax
+# CHECK-NEXT: 7.     3     7.0    0.0    0.0       addl	%ebx, %eax
diff --git a/test/tools/llvm-mca/X86/BdVer2/rcu-statistics.s b/test/tools/llvm-mca/X86/BdVer2/rcu-statistics.s
index afa5abd1cd2..42467f7b3a1 100644
--- a/test/tools/llvm-mca/X86/BdVer2/rcu-statistics.s
+++ b/test/tools/llvm-mca/X86/BdVer2/rcu-statistics.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -resource-pressure=false -retire-stats -iterations=1 < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -resource-pressure=false -retire-stats -iterations=1 < %s | FileCheck %s
 
   vsqrtps %xmm0, %xmm2
   vaddps  %xmm0, %xmm1, %xmm2
@@ -20,13 +20,13 @@
 
 # CHECK:      Iterations:        1
 # CHECK-NEXT: Instructions:      16
-# CHECK-NEXT: Total Cycles:      20
+# CHECK-NEXT: Total Cycles:      22
 # CHECK-NEXT: Total uOps:        16
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    0.80
-# CHECK-NEXT: IPC:               0.80
-# CHECK-NEXT: Block RThroughput: 15.0
+# CHECK-NEXT: uOps Per Cycle:    0.73
+# CHECK-NEXT: IPC:               0.73
+# CHECK-NEXT: Block RThroughput: 18.0
 
 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps
@@ -37,25 +37,26 @@
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      14    14.00                       vsqrtps	%xmm0, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vaddps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vaddps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vaddps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vaddps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vaddps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vaddps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vaddps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vaddps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vaddps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vaddps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vaddps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vaddps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vaddps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vaddps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      9     10.50                       vsqrtps	%xmm0, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vaddps	%xmm0, %xmm1, %xmm2
 
 # CHECK:      Retire Control Unit - number of cycles where we saw N instructions retired:
 # CHECK-NEXT: [# retired], [# cycles]
-# CHECK-NEXT:  0,           16  (80.0%)
-# CHECK-NEXT:  1,           3  (15.0%)
-# CHECK-NEXT:  13,          1  (5.0%)
+# CHECK-NEXT:  0,           11  (50.0%)
+# CHECK-NEXT:  1,           9  (40.9%)
+# CHECK-NEXT:  3,           1  (4.5%)
+# CHECK-NEXT:  4,           1  (4.5%)
diff --git a/test/tools/llvm-mca/X86/BdVer2/read-advance-1.s b/test/tools/llvm-mca/X86/BdVer2/read-advance-1.s
index 1c719a84a1b..912b11b2ddd 100644
--- a/test/tools/llvm-mca/X86/BdVer2/read-advance-1.s
+++ b/test/tools/llvm-mca/X86/BdVer2/read-advance-1.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s
 
 # The vmul can start executing 3cy in advance. That is beause the first use
 # operand (i.e. %xmm1) is a ReadAfterLd. That means, the memory operand is
@@ -10,12 +10,12 @@ vmulps  (%rdi), %xmm1, %xmm2
 
 # CHECK:      Iterations:        1
 # CHECK-NEXT: Instructions:      2
-# CHECK-NEXT: Total Cycles:      14
-# CHECK-NEXT: Total uOps:        3
+# CHECK-NEXT: Total Cycles:      13
+# CHECK-NEXT: Total uOps:        2
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    0.21
-# CHECK-NEXT: IPC:               0.14
+# CHECK-NEXT: uOps Per Cycle:    0.15
+# CHECK-NEXT: IPC:               0.15
 # CHECK-NEXT: Block RThroughput: 1.0
 
 # CHECK:      Instruction Info:
@@ -27,15 +27,15 @@ vmulps  (%rdi), %xmm1, %xmm2
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      3     1.00                        vaddps	%xmm0, %xmm0, %xmm1
-# CHECK-NEXT:  2      11    1.00    *                   vmulps	(%rdi), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vaddps	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT:  1      10    1.00    *                   vmulps	(%rdi), %xmm1, %xmm2
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123
+# CHECK-NEXT:                     012
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeER    .  .   vaddps	%xmm0, %xmm0, %xmm1
-# CHECK-NEXT: [0,1]     DeeeeeeeeeeeER   vmulps	(%rdi), %xmm1, %xmm2
+# CHECK:      [0,0]     DeeeeeER  . .   vaddps	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [0,1]     DeeeeeeeeeeER   vmulps	(%rdi), %xmm1, %xmm2
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
diff --git a/test/tools/llvm-mca/X86/BdVer2/read-advance-2.s b/test/tools/llvm-mca/X86/BdVer2/read-advance-2.s
index 7814b000ee4..7f2d1ae9c6c 100644
--- a/test/tools/llvm-mca/X86/BdVer2/read-advance-2.s
+++ b/test/tools/llvm-mca/X86/BdVer2/read-advance-2.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1 -resource-pressure=0 -timeline < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1 -resource-pressure=0 -timeline < %s | FileCheck %s
 
   imull  %esi
   imull  (%rdi)
@@ -9,12 +9,12 @@
 
 # CHECK:      Iterations:        1
 # CHECK-NEXT: Instructions:      2
-# CHECK-NEXT: Total Cycles:      13
-# CHECK-NEXT: Total uOps:        7
+# CHECK-NEXT: Total Cycles:      12
+# CHECK-NEXT: Total uOps:        2
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    0.54
-# CHECK-NEXT: IPC:               0.15
+# CHECK-NEXT: uOps Per Cycle:    0.17
+# CHECK-NEXT: IPC:               0.17
 # CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Instruction Info:
@@ -26,15 +26,15 @@
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  3      4     1.00                        imull	%esi
-# CHECK-NEXT:  4      9     1.00    *                   imull	(%rdi)
+# CHECK-NEXT:  1      4     1.00                        imull	%esi
+# CHECK-NEXT:  1      8     1.00    *                   imull	(%rdi)
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012
+# CHECK-NEXT:                     01
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeER   . .   imull	%esi
-# CHECK-NEXT: [0,1]     .DeeeeeeeeeER   imull	(%rdi)
+# CHECK:      [0,0]     DeeeeER   ..   imull	%esi
+# CHECK-NEXT: [0,1]     D=eeeeeeeeER   imull	(%rdi)
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -44,4 +44,4 @@
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       imull	%esi
-# CHECK-NEXT: 1.     1     1.0    1.0    0.0       imull	(%rdi)
+# CHECK-NEXT: 1.     1     2.0    1.0    0.0       imull	(%rdi)
diff --git a/test/tools/llvm-mca/X86/BdVer2/read-advance-3.s b/test/tools/llvm-mca/X86/BdVer2/read-advance-3.s
index 638f36c1711..44cea0a4253 100644
--- a/test/tools/llvm-mca/X86/BdVer2/read-advance-3.s
+++ b/test/tools/llvm-mca/X86/BdVer2/read-advance-3.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1 -resource-pressure=0 -timeline -dispatch=3 < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1 -resource-pressure=0 -timeline -dispatch=3 < %s | FileCheck %s
 
   add %rdi, %rsi
   add (%rsp), %rsi
@@ -7,13 +7,13 @@
 
 # CHECK:      Iterations:        1
 # CHECK-NEXT: Instructions:      3
-# CHECK-NEXT: Total Cycles:      9
-# CHECK-NEXT: Total uOps:        4
+# CHECK-NEXT: Total Cycles:      8
+# CHECK-NEXT: Total uOps:        3
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.44
-# CHECK-NEXT: IPC:               0.33
-# CHECK-NEXT: Block RThroughput: 1.3
+# CHECK-NEXT: uOps Per Cycle:    0.38
+# CHECK-NEXT: IPC:               0.38
+# CHECK-NEXT: Block RThroughput: 1.5
 
 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps
@@ -24,16 +24,16 @@
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      1     0.33                        addq	%rdi, %rsi
-# CHECK-NEXT:  2      6     0.50    *                   addq	(%rsp), %rsi
-# CHECK-NEXT:  1      1     0.33                        addq	%rdx, %r8
+# CHECK-NEXT:  1      1     0.50                        addq	%rdi, %rsi
+# CHECK-NEXT:  1      5     0.50    *                   addq	(%rsp), %rsi
+# CHECK-NEXT:  1      1     0.50                        addq	%rdx, %r8
 
 # CHECK:      Timeline view:
-# CHECK-NEXT: Index     012345678
+# CHECK-NEXT: Index     01234567
 
-# CHECK:      [0,0]     DeER .  .   addq	%rdi, %rsi
-# CHECK-NEXT: [0,1]     DeeeeeeER   addq	(%rsp), %rsi
-# CHECK-NEXT: [0,2]     .DeE----R   addq	%rdx, %r8
+# CHECK:      [0,0]     DeER . .   addq	%rdi, %rsi
+# CHECK-NEXT: [0,1]     DeeeeeER   addq	(%rsp), %rsi
+# CHECK-NEXT: [0,2]     D=eE---R   addq	%rdx, %r8
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -44,4 +44,4 @@
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       addq	%rdi, %rsi
 # CHECK-NEXT: 1.     1     1.0    0.0    0.0       addq	(%rsp), %rsi
-# CHECK-NEXT: 2.     1     1.0    1.0    4.0       addq	%rdx, %r8
+# CHECK-NEXT: 2.     1     2.0    2.0    3.0       addq	%rdx, %r8
diff --git a/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-1.s b/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-1.s
index 990cdc4fe31..19737e85d19 100644
--- a/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-1.s
+++ b/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-1.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=3 -timeline -register-file-stats < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=3 -timeline -register-file-stats < %s | FileCheck %s
 
 # The register move from XMM0 to XMM1 can be eliminated at register renaming
 # stage. So, it should not consume pipeline resources.
@@ -10,12 +10,12 @@ vaddps %xmm1, %xmm1, %xmm2
 
 # CHECK:      Iterations:        3
 # CHECK-NEXT: Instructions:      9
-# CHECK-NEXT: Total Cycles:      9
+# CHECK-NEXT: Total Cycles:      11
 # CHECK-NEXT: Total uOps:        9
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    1.00
-# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: uOps Per Cycle:    0.82
+# CHECK-NEXT: IPC:               0.82
 # CHECK-NEXT: Block RThroughput: 1.0
 
 # CHECK:      Instruction Info:
@@ -28,45 +28,68 @@ vaddps %xmm1, %xmm1, %xmm2
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 # CHECK-NEXT:  1      0     0.25                        vxorps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT:  1      1     1.00                        vmovaps	%xmm0, %xmm1
-# CHECK-NEXT:  1      3     1.00                        vaddps	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vmovaps	%xmm0, %xmm1
+# CHECK-NEXT:  1      5     1.00                        vaddps	%xmm1, %xmm1, %xmm2
 
 # CHECK:      Register File statistics:
-# CHECK-NEXT: Total number of mappings created:    9
-# CHECK-NEXT: Max number of mappings used:         8
+# CHECK-NEXT: Total number of mappings created:    6
+# CHECK-NEXT: Max number of mappings used:         6
+
+# CHECK:      *  Register File #1 -- PdFpuPRF:
+# CHECK-NEXT:    Number of physical registers:     160
+# CHECK-NEXT:    Total number of mappings created: 6
+# CHECK-NEXT:    Max number of mappings used:      6
+
+# CHECK:      *  Register File #2 -- PdIntegerPRF:
+# CHECK-NEXT:    Number of physical registers:     96
+# CHECK-NEXT:    Total number of mappings created: 0
+# CHECK-NEXT:    Max number of mappings used:      0
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.67   1.33    -      -      -      -     1.00   1.00    -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     vxorps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovaps	%xmm0, %xmm1
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddps	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.33   0.67    -      -      -      -      -     1.00    -      -      -      -     vmovaps	%xmm0, %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.33   0.67    -      -      -      -     1.00    -      -      -      -      -     vaddps	%xmm1, %xmm1, %xmm2
 
 # CHECK:      Timeline view:
-# CHECK-NEXT: Index     012345678
-
-# CHECK:      [0,0]     DR   .  .   vxorps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT: [0,1]     DeER .  .   vmovaps	%xmm0, %xmm1
-# CHECK-NEXT: [0,2]     D=eeeER .   vaddps	%xmm1, %xmm1, %xmm2
-# CHECK-NEXT: [1,0]     D-----R .   vxorps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT: [1,1]     .DeE--R .   vmovaps	%xmm0, %xmm1
-# CHECK-NEXT: [1,2]     .D=eeeER.   vaddps	%xmm1, %xmm1, %xmm2
-# CHECK-NEXT: [2,0]     .D-----R.   vxorps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT: [2,1]     .D=eE--R.   vmovaps	%xmm0, %xmm1
-# CHECK-NEXT: [2,2]     . D=eeeER   vaddps	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DR   .    .   vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [0,1]     DeER .    .   vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: [0,2]     D=eeeeeER .   vaddps	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT: [1,0]     D-------R .   vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [1,1]     .DeE----R .   vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: [1,2]     .D=eeeeeER.   vaddps	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT: [2,0]     .D-------R.   vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [2,1]     .D=eE----R.   vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: [2,2]     . D=eeeeeER   vaddps	%xmm1, %xmm1, %xmm2
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -75,6 +98,6 @@ vaddps %xmm1, %xmm1, %xmm2
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     3     0.0    0.0    3.3       vxorps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT: 1.     3     1.3    1.3    1.3       vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: 0.     3     0.0    0.0    4.7       vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: 1.     3     1.3    1.3    2.7       vmovaps	%xmm0, %xmm1
 # CHECK-NEXT: 2.     3     2.0    0.0    0.0       vaddps	%xmm1, %xmm1, %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-2.s b/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-2.s
index 6f22cdc0b7e..ee9fddec673 100644
--- a/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-2.s
+++ b/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-2.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=3 -timeline -register-file-stats < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=3 -timeline -register-file-stats < %s | FileCheck %s
 
 pxor %mm0, %mm0
 movq %mm0, %mm1
@@ -14,13 +14,13 @@ movdqu %xmm5, %xmm0
 
 # CHECK:      Iterations:        3
 # CHECK-NEXT: Instructions:      27
-# CHECK-NEXT: Total Cycles:      22
+# CHECK-NEXT: Total Cycles:      18
 # CHECK-NEXT: Total uOps:        27
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    1.23
-# CHECK-NEXT: IPC:               1.23
-# CHECK-NEXT: Block RThroughput: 4.0
+# CHECK-NEXT: uOps Per Cycle:    1.50
+# CHECK-NEXT: IPC:               1.50
+# CHECK-NEXT: Block RThroughput: 3.5
 
 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps
@@ -31,77 +31,99 @@ movdqu %xmm5, %xmm0
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      1     0.33                        pxor	%mm0, %mm0
-# CHECK-NEXT:  1      1     0.50                        movq	%mm0, %mm1
+# CHECK-NEXT:  1      0     0.25                        pxor	%mm0, %mm0
+# CHECK-NEXT:  1      2     0.50                        movq	%mm0, %mm1
 # CHECK-NEXT:  1      0     0.25                        xorps	%xmm0, %xmm0
-# CHECK-NEXT:  1      1     1.00                        movaps	%xmm0, %xmm1
-# CHECK-NEXT:  1      1     1.00                        movups	%xmm1, %xmm2
-# CHECK-NEXT:  1      1     1.00                        movapd	%xmm2, %xmm3
-# CHECK-NEXT:  1      1     1.00                        movupd	%xmm3, %xmm4
-# CHECK-NEXT:  1      1     0.33                        movdqa	%xmm4, %xmm5
-# CHECK-NEXT:  1      1     0.33                        movdqu	%xmm5, %xmm0
+# CHECK-NEXT:  1      1     0.50                        movaps	%xmm0, %xmm1
+# CHECK-NEXT:  1      1     0.50                        movups	%xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        movapd	%xmm2, %xmm3
+# CHECK-NEXT:  1      1     0.50                        movupd	%xmm3, %xmm4
+# CHECK-NEXT:  1      2     0.50                        movdqa	%xmm4, %xmm5
+# CHECK-NEXT:  1      2     0.50                        movdqu	%xmm5, %xmm0
 
 # CHECK:      Register File statistics:
-# CHECK-NEXT: Total number of mappings created:    27
-# CHECK-NEXT: Max number of mappings used:         21
+# CHECK-NEXT: Total number of mappings created:    21
+# CHECK-NEXT: Max number of mappings used:         16
+
+# CHECK:      *  Register File #1 -- PdFpuPRF:
+# CHECK-NEXT:    Number of physical registers:     160
+# CHECK-NEXT:    Total number of mappings created: 21
+# CHECK-NEXT:    Max number of mappings used:      16
+
+# CHECK:      *  Register File #2 -- PdIntegerPRF:
+# CHECK-NEXT:    Number of physical registers:     96
+# CHECK-NEXT:    Total number of mappings created: 0
+# CHECK-NEXT:    Max number of mappings used:      0
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -     1.67   1.67    -     4.67    -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.67   1.33    -     3.00    -      -     3.33   3.67    -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -      -     0.67    -     0.33    -      -     pxor	%mm0, %mm0
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     movq	%mm0, %mm1
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     xorps	%xmm0, %xmm0
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     movaps	%xmm0, %xmm1
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     movups	%xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     movapd	%xmm2, %xmm3
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     movupd	%xmm3, %xmm4
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     movdqa	%xmm4, %xmm5
-# CHECK-NEXT:  -      -     0.67    -      -     0.33    -      -     movdqu	%xmm5, %xmm0
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pxor	%mm0, %mm0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -     movq	%mm0, %mm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorps	%xmm0, %xmm0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.67   0.33    -      -      -      -     0.33   0.67    -      -      -      -     movaps	%xmm0, %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.67   0.33    -      -      -      -     0.33   0.67    -      -      -      -     movups	%xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.67   0.33    -      -      -      -     1.00    -      -      -      -      -     movapd	%xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.67   0.33    -      -      -      -     0.33   0.67    -      -      -      -     movupd	%xmm3, %xmm4
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     1.00    -      -      -      -      -     movdqa	%xmm4, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     0.33   0.67    -      -      -      -     movdqu	%xmm5, %xmm0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01
-
-# CHECK:      [0,0]     DeER .    .    .    ..   pxor	%mm0, %mm0
-# CHECK-NEXT: [0,1]     D=eER.    .    .    ..   movq	%mm0, %mm1
-# CHECK-NEXT: [0,2]     D---R.    .    .    ..   xorps	%xmm0, %xmm0
-# CHECK-NEXT: [0,3]     D=eER.    .    .    ..   movaps	%xmm0, %xmm1
-# CHECK-NEXT: [0,4]     .D=eER    .    .    ..   movups	%xmm1, %xmm2
-# CHECK-NEXT: [0,5]     .D==eER   .    .    ..   movapd	%xmm2, %xmm3
-# CHECK-NEXT: [0,6]     .D===eER  .    .    ..   movupd	%xmm3, %xmm4
-# CHECK-NEXT: [0,7]     .D====eER .    .    ..   movdqa	%xmm4, %xmm5
-# CHECK-NEXT: [0,8]     . D====eER.    .    ..   movdqu	%xmm5, %xmm0
-# CHECK-NEXT: [1,0]     . DeE----R.    .    ..   pxor	%mm0, %mm0
-# CHECK-NEXT: [1,1]     . D=eE---R.    .    ..   movq	%mm0, %mm1
-# CHECK-NEXT: [1,2]     . D=====ER.    .    ..   xorps	%xmm0, %xmm0
-# CHECK-NEXT: [1,3]     .  D====eER    .    ..   movaps	%xmm0, %xmm1
-# CHECK-NEXT: [1,4]     .  D=====eER   .    ..   movups	%xmm1, %xmm2
-# CHECK-NEXT: [1,5]     .  D======eER  .    ..   movapd	%xmm2, %xmm3
-# CHECK-NEXT: [1,6]     .  D=======eER .    ..   movupd	%xmm3, %xmm4
-# CHECK-NEXT: [1,7]     .   D=======eER.    ..   movdqa	%xmm4, %xmm5
-# CHECK-NEXT: [1,8]     .   D========eER    ..   movdqu	%xmm5, %xmm0
-# CHECK-NEXT: [2,0]     .   DeE--------R    ..   pxor	%mm0, %mm0
-# CHECK-NEXT: [2,1]     .   D=eE-------R    ..   movq	%mm0, %mm1
-# CHECK-NEXT: [2,2]     .    D========ER    ..   xorps	%xmm0, %xmm0
-# CHECK-NEXT: [2,3]     .    D========eER   ..   movaps	%xmm0, %xmm1
-# CHECK-NEXT: [2,4]     .    D=========eER  ..   movups	%xmm1, %xmm2
-# CHECK-NEXT: [2,5]     .    D==========eER ..   movapd	%xmm2, %xmm3
-# CHECK-NEXT: [2,6]     .    .D==========eER..   movupd	%xmm3, %xmm4
-# CHECK-NEXT: [2,7]     .    .D===========eER.   movdqa	%xmm4, %xmm5
-# CHECK-NEXT: [2,8]     .    .D============eER   movdqu	%xmm5, %xmm0
+# CHECK-NEXT:                     01234567
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DR   .    .    . .   pxor	%mm0, %mm0
+# CHECK-NEXT: [0,1]     DeeER.    .    . .   movq	%mm0, %mm1
+# CHECK-NEXT: [0,2]     D---R.    .    . .   xorps	%xmm0, %xmm0
+# CHECK-NEXT: [0,3]     DeE-R.    .    . .   movaps	%xmm0, %xmm1
+# CHECK-NEXT: [0,4]     .DeER.    .    . .   movups	%xmm1, %xmm2
+# CHECK-NEXT: [0,5]     .D=eER    .    . .   movapd	%xmm2, %xmm3
+# CHECK-NEXT: [0,6]     .D==eER   .    . .   movupd	%xmm3, %xmm4
+# CHECK-NEXT: [0,7]     .D===eeER .    . .   movdqa	%xmm4, %xmm5
+# CHECK-NEXT: [0,8]     . D====eeER    . .   movdqu	%xmm5, %xmm0
+# CHECK-NEXT: [1,0]     . D-------R    . .   pxor	%mm0, %mm0
+# CHECK-NEXT: [1,1]     . DeeE----R    . .   movq	%mm0, %mm1
+# CHECK-NEXT: [1,2]     . D-------R    . .   xorps	%xmm0, %xmm0
+# CHECK-NEXT: [1,3]     .  DeE-----R   . .   movaps	%xmm0, %xmm1
+# CHECK-NEXT: [1,4]     .  D=eE----R   . .   movups	%xmm1, %xmm2
+# CHECK-NEXT: [1,5]     .  D==eE---R   . .   movapd	%xmm2, %xmm3
+# CHECK-NEXT: [1,6]     .  D===eE--R   . .   movupd	%xmm3, %xmm4
+# CHECK-NEXT: [1,7]     .   D===eeE-R  . .   movdqa	%xmm4, %xmm5
+# CHECK-NEXT: [1,8]     .   D=====eeER . .   movdqu	%xmm5, %xmm0
+# CHECK-NEXT: [2,0]     .   D--------R . .   pxor	%mm0, %mm0
+# CHECK-NEXT: [2,1]     .   D=eeE----R . .   movq	%mm0, %mm1
+# CHECK-NEXT: [2,2]     .    D-------R . .   xorps	%xmm0, %xmm0
+# CHECK-NEXT: [2,3]     .    D==eE----R. .   movaps	%xmm0, %xmm1
+# CHECK-NEXT: [2,4]     .    D===eE---R. .   movups	%xmm1, %xmm2
+# CHECK-NEXT: [2,5]     .    D====eE--R. .   movapd	%xmm2, %xmm3
+# CHECK-NEXT: [2,6]     .    .D====eE-R. .   movupd	%xmm3, %xmm4
+# CHECK-NEXT: [2,7]     .    .D=====eeER .   movdqa	%xmm4, %xmm5
+# CHECK-NEXT: [2,8]     .    .D=======eeER   movdqu	%xmm5, %xmm0
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -110,12 +132,12 @@ movdqu %xmm5, %xmm0
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     3     1.0    1.0    4.0       pxor	%mm0, %mm0
-# CHECK-NEXT: 1.     3     2.0    0.0    3.3       movq	%mm0, %mm1
-# CHECK-NEXT: 2.     3     5.0    0.0    1.0       xorps	%xmm0, %xmm0
-# CHECK-NEXT: 3.     3     5.3    0.7    0.0       movaps	%xmm0, %xmm1
-# CHECK-NEXT: 4.     3     6.0    0.0    0.0       movups	%xmm1, %xmm2
-# CHECK-NEXT: 5.     3     7.0    0.0    0.0       movapd	%xmm2, %xmm3
-# CHECK-NEXT: 6.     3     7.7    0.0    0.0       movupd	%xmm3, %xmm4
-# CHECK-NEXT: 7.     3     8.3    0.0    0.0       movdqa	%xmm4, %xmm5
-# CHECK-NEXT: 8.     3     9.0    0.0    0.0       movdqu	%xmm5, %xmm0
+# CHECK-NEXT: 0.     3     0.0    0.0    5.0       pxor	%mm0, %mm0
+# CHECK-NEXT: 1.     3     1.3    1.3    2.7       movq	%mm0, %mm1
+# CHECK-NEXT: 2.     3     0.0    0.0    5.7       xorps	%xmm0, %xmm0
+# CHECK-NEXT: 3.     3     1.7    1.7    3.3       movaps	%xmm0, %xmm1
+# CHECK-NEXT: 4.     3     2.3    0.0    2.3       movups	%xmm1, %xmm2
+# CHECK-NEXT: 5.     3     3.3    0.0    1.7       movapd	%xmm2, %xmm3
+# CHECK-NEXT: 6.     3     4.0    0.0    1.0       movupd	%xmm3, %xmm4
+# CHECK-NEXT: 7.     3     4.7    0.0    0.3       movdqa	%xmm4, %xmm5
+# CHECK-NEXT: 8.     3     6.3    0.0    0.0       movdqu	%xmm5, %xmm0
diff --git a/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-3.s b/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-3.s
index 202afac21ec..ada52545a9b 100644
--- a/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-3.s
+++ b/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-3.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=3 -timeline -register-file-stats < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=3 -timeline -register-file-stats < %s | FileCheck %s
 
 vxorps  %xmm0, %xmm0, %xmm0
 vmovaps %xmm0, %xmm1
@@ -11,13 +11,13 @@ vmovdqu %xmm5, %xmm0
 
 # CHECK:      Iterations:        3
 # CHECK-NEXT: Instructions:      21
-# CHECK-NEXT: Total Cycles:      21
+# CHECK-NEXT: Total Cycles:      17
 # CHECK-NEXT: Total uOps:        21
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    1.00
-# CHECK-NEXT: IPC:               1.00
-# CHECK-NEXT: Block RThroughput: 4.0
+# CHECK-NEXT: uOps Per Cycle:    1.24
+# CHECK-NEXT: IPC:               1.24
+# CHECK-NEXT: Block RThroughput: 3.0
 
 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps
@@ -29,66 +29,88 @@ vmovdqu %xmm5, %xmm0
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 # CHECK-NEXT:  1      0     0.25                        vxorps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT:  1      1     1.00                        vmovaps	%xmm0, %xmm1
-# CHECK-NEXT:  1      1     1.00                        vmovups	%xmm1, %xmm2
-# CHECK-NEXT:  1      1     1.00                        vmovapd	%xmm2, %xmm3
-# CHECK-NEXT:  1      1     1.00                        vmovupd	%xmm3, %xmm4
-# CHECK-NEXT:  1      1     0.33                        vmovdqa	%xmm4, %xmm5
-# CHECK-NEXT:  1      1     0.33                        vmovdqu	%xmm5, %xmm0
+# CHECK-NEXT:  1      1     0.50                        vmovaps	%xmm0, %xmm1
+# CHECK-NEXT:  1      1     0.50                        vmovups	%xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vmovapd	%xmm2, %xmm3
+# CHECK-NEXT:  1      1     0.50                        vmovupd	%xmm3, %xmm4
+# CHECK-NEXT:  1      2     0.50                        vmovdqa	%xmm4, %xmm5
+# CHECK-NEXT:  1      2     0.50                        vmovdqu	%xmm5, %xmm0
 
 # CHECK:      Register File statistics:
-# CHECK-NEXT: Total number of mappings created:    21
-# CHECK-NEXT: Max number of mappings used:         17
+# CHECK-NEXT: Total number of mappings created:    18
+# CHECK-NEXT: Max number of mappings used:         15
+
+# CHECK:      *  Register File #1 -- PdFpuPRF:
+# CHECK-NEXT:    Number of physical registers:     160
+# CHECK-NEXT:    Total number of mappings created: 18
+# CHECK-NEXT:    Max number of mappings used:      15
+
+# CHECK:      *  Register File #2 -- PdIntegerPRF:
+# CHECK-NEXT:    Number of physical registers:     96
+# CHECK-NEXT:    Total number of mappings created: 0
+# CHECK-NEXT:    Max number of mappings used:      0
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -     1.00   1.00    -     4.00    -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00   1.33   0.67    -      -     3.00   3.00    -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     vxorps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovaps	%xmm0, %xmm1
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovups	%xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovapd	%xmm2, %xmm3
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovupd	%xmm3, %xmm4
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vmovdqa	%xmm4, %xmm5
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmovdqu	%xmm5, %xmm0
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.67   0.33    -      -      -      -     0.67   0.33    -      -      -      -     vmovaps	%xmm0, %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.33   0.67    -      -      -      -     0.67   0.33    -      -      -      -     vmovups	%xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.33   0.67    -      -      -      -     0.33   0.67    -      -      -      -     vmovapd	%xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.67   0.33    -      -      -      -     0.33   0.67    -      -      -      -     vmovupd	%xmm3, %xmm4
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.67   0.33    -      -     0.33   0.67    -      -      -      -     vmovdqa	%xmm4, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.67   0.33    -      -     0.67   0.33    -      -      -      -     vmovdqu	%xmm5, %xmm0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DR   .    .    .    .   vxorps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT: [0,1]     DeER .    .    .    .   vmovaps	%xmm0, %xmm1
-# CHECK-NEXT: [0,2]     D=eER.    .    .    .   vmovups	%xmm1, %xmm2
-# CHECK-NEXT: [0,3]     D==eER    .    .    .   vmovapd	%xmm2, %xmm3
-# CHECK-NEXT: [0,4]     .D==eER   .    .    .   vmovupd	%xmm3, %xmm4
-# CHECK-NEXT: [0,5]     .D===eER  .    .    .   vmovdqa	%xmm4, %xmm5
-# CHECK-NEXT: [0,6]     .D====eER .    .    .   vmovdqu	%xmm5, %xmm0
-# CHECK-NEXT: [1,0]     .D=====ER .    .    .   vxorps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT: [1,1]     . D====eER.    .    .   vmovaps	%xmm0, %xmm1
-# CHECK-NEXT: [1,2]     . D=====eER    .    .   vmovups	%xmm1, %xmm2
-# CHECK-NEXT: [1,3]     . D======eER   .    .   vmovapd	%xmm2, %xmm3
-# CHECK-NEXT: [1,4]     . D=======eER  .    .   vmovupd	%xmm3, %xmm4
-# CHECK-NEXT: [1,5]     .  D=======eER .    .   vmovdqa	%xmm4, %xmm5
-# CHECK-NEXT: [1,6]     .  D========eER.    .   vmovdqu	%xmm5, %xmm0
-# CHECK-NEXT: [2,0]     .  D=========ER.    .   vxorps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT: [2,1]     .  D=========eER    .   vmovaps	%xmm0, %xmm1
-# CHECK-NEXT: [2,2]     .   D=========eER   .   vmovups	%xmm1, %xmm2
-# CHECK-NEXT: [2,3]     .   D==========eER  .   vmovapd	%xmm2, %xmm3
-# CHECK-NEXT: [2,4]     .   D===========eER .   vmovupd	%xmm3, %xmm4
-# CHECK-NEXT: [2,5]     .   D============eER.   vmovdqa	%xmm4, %xmm5
-# CHECK-NEXT: [2,6]     .    D============eER   vmovdqu	%xmm5, %xmm0
+# CHECK:      [0,0]     DR   .    .    ..   vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [0,1]     DeER .    .    ..   vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: [0,2]     D=eER.    .    ..   vmovups	%xmm1, %xmm2
+# CHECK-NEXT: [0,3]     D==eER    .    ..   vmovapd	%xmm2, %xmm3
+# CHECK-NEXT: [0,4]     .D==eER   .    ..   vmovupd	%xmm3, %xmm4
+# CHECK-NEXT: [0,5]     .D===eeER .    ..   vmovdqa	%xmm4, %xmm5
+# CHECK-NEXT: [0,6]     .D=====eeER    ..   vmovdqu	%xmm5, %xmm0
+# CHECK-NEXT: [1,0]     .D--------R    ..   vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [1,1]     . DeE-----R    ..   vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: [1,2]     . D=eE----R    ..   vmovups	%xmm1, %xmm2
+# CHECK-NEXT: [1,3]     . D==eE----R   ..   vmovapd	%xmm2, %xmm3
+# CHECK-NEXT: [1,4]     . D===eE---R   ..   vmovupd	%xmm3, %xmm4
+# CHECK-NEXT: [1,5]     .  D===eeE-R   ..   vmovdqa	%xmm4, %xmm5
+# CHECK-NEXT: [1,6]     .  D=====eeER  ..   vmovdqu	%xmm5, %xmm0
+# CHECK-NEXT: [2,0]     .  D--------R  ..   vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [2,1]     .  D==eE----R  ..   vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: [2,2]     .   D===eE--R  ..   vmovups	%xmm1, %xmm2
+# CHECK-NEXT: [2,3]     .   D====eE--R ..   vmovapd	%xmm2, %xmm3
+# CHECK-NEXT: [2,4]     .   D=====eE-R ..   vmovupd	%xmm3, %xmm4
+# CHECK-NEXT: [2,5]     .   D======eeER..   vmovdqa	%xmm4, %xmm5
+# CHECK-NEXT: [2,6]     .    D=======eeER   vmovdqu	%xmm5, %xmm0
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -97,10 +119,10 @@ vmovdqu %xmm5, %xmm0
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     3     5.3    0.0    0.0       vxorps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT: 1.     3     5.3    0.3    0.0       vmovaps	%xmm0, %xmm1
-# CHECK-NEXT: 2.     3     6.0    0.0    0.0       vmovups	%xmm1, %xmm2
-# CHECK-NEXT: 3.     3     7.0    0.0    0.0       vmovapd	%xmm2, %xmm3
-# CHECK-NEXT: 4.     3     7.7    0.0    0.0       vmovupd	%xmm3, %xmm4
-# CHECK-NEXT: 5.     3     8.3    0.0    0.0       vmovdqa	%xmm4, %xmm5
-# CHECK-NEXT: 6.     3     9.0    0.0    0.0       vmovdqu	%xmm5, %xmm0
+# CHECK-NEXT: 0.     3     0.0    0.0    5.3       vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: 1.     3     1.7    1.7    3.0       vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: 2.     3     2.7    0.3    2.0       vmovups	%xmm1, %xmm2
+# CHECK-NEXT: 3.     3     3.7    0.0    2.0       vmovapd	%xmm2, %xmm3
+# CHECK-NEXT: 4.     3     4.3    0.0    1.3       vmovupd	%xmm3, %xmm4
+# CHECK-NEXT: 5.     3     5.0    0.0    0.3       vmovdqa	%xmm4, %xmm5
+# CHECK-NEXT: 6.     3     6.7    0.0    0.0       vmovdqu	%xmm5, %xmm0
diff --git a/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-4.s b/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-4.s
index 339ec06bcc8..e651ff0becb 100644
--- a/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-4.s
+++ b/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-4.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=3 -timeline -register-file-stats < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=3 -timeline -register-file-stats < %s | FileCheck %s
 
 xor %eax, %eax
 mov %eax, %ebx
@@ -9,13 +9,13 @@ mov %edx, %eax
 
 # CHECK:      Iterations:        3
 # CHECK-NEXT: Instructions:      15
-# CHECK-NEXT: Total Cycles:      15
+# CHECK-NEXT: Total Cycles:      11
 # CHECK-NEXT: Total uOps:        15
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    1.00
-# CHECK-NEXT: IPC:               1.00
-# CHECK-NEXT: Block RThroughput: 1.3
+# CHECK-NEXT: uOps Per Cycle:    1.36
+# CHECK-NEXT: IPC:               1.36
+# CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps
@@ -27,56 +27,78 @@ mov %edx, %eax
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 # CHECK-NEXT:  1      0     0.25                        xorl	%eax, %eax
-# CHECK-NEXT:  1      1     0.33                        movl	%eax, %ebx
-# CHECK-NEXT:  1      1     0.33                        movl	%ebx, %ecx
-# CHECK-NEXT:  1      1     0.33                        movl	%ecx, %edx
-# CHECK-NEXT:  1      1     0.33                        movl	%edx, %eax
+# CHECK-NEXT:  1      1     0.50                        movl	%eax, %ebx
+# CHECK-NEXT:  1      1     0.50                        movl	%ebx, %ecx
+# CHECK-NEXT:  1      1     0.50                        movl	%ecx, %edx
+# CHECK-NEXT:  1      1     0.50                        movl	%edx, %eax
 
 # CHECK:      Register File statistics:
-# CHECK-NEXT: Total number of mappings created:    18
-# CHECK-NEXT: Max number of mappings used:         15
+# CHECK-NEXT: Total number of mappings created:    12
+# CHECK-NEXT: Max number of mappings used:         11
+
+# CHECK:      *  Register File #1 -- PdFpuPRF:
+# CHECK-NEXT:    Number of physical registers:     160
+# CHECK-NEXT:    Total number of mappings created: 0
+# CHECK-NEXT:    Max number of mappings used:      0
+
+# CHECK:      *  Register File #2 -- PdIntegerPRF:
+# CHECK-NEXT:    Number of physical registers:     96
+# CHECK-NEXT:    Total number of mappings created: 12
+# CHECK-NEXT:    Max number of mappings used:      11
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -     1.33   1.33    -     1.33    -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     xorl	%eax, %eax
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movl	%eax, %ebx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movl	%ebx, %ecx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movl	%ecx, %edx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movl	%edx, %eax
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorl	%eax, %eax
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     movl	%eax, %ebx
+# CHECK-NEXT:  -      -      -      -      -     0.67   0.33    -      -      -      -      -      -      -      -      -      -      -      -      -     movl	%ebx, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.33   0.67    -      -      -      -      -      -      -      -      -      -      -      -      -     movl	%ecx, %edx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     movl	%edx, %eax
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     01234
+# CHECK-NEXT:                     0
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DR   .    .   .   xorl	%eax, %eax
-# CHECK-NEXT: [0,1]     DeER .    .   .   movl	%eax, %ebx
-# CHECK-NEXT: [0,2]     D=eER.    .   .   movl	%ebx, %ecx
-# CHECK-NEXT: [0,3]     D==eER    .   .   movl	%ecx, %edx
-# CHECK-NEXT: [0,4]     .D==eER   .   .   movl	%edx, %eax
-# CHECK-NEXT: [1,0]     .D===ER   .   .   xorl	%eax, %eax
-# CHECK-NEXT: [1,1]     .D===eER  .   .   movl	%eax, %ebx
-# CHECK-NEXT: [1,2]     .D====eER .   .   movl	%ebx, %ecx
-# CHECK-NEXT: [1,3]     . D====eER.   .   movl	%ecx, %edx
-# CHECK-NEXT: [1,4]     . D=====eER   .   movl	%edx, %eax
-# CHECK-NEXT: [2,0]     . D======ER   .   xorl	%eax, %eax
-# CHECK-NEXT: [2,1]     . D======eER  .   movl	%eax, %ebx
-# CHECK-NEXT: [2,2]     .  D======eER .   movl	%ebx, %ecx
-# CHECK-NEXT: [2,3]     .  D=======eER.   movl	%ecx, %edx
-# CHECK-NEXT: [2,4]     .  D========eER   movl	%edx, %eax
+# CHECK:      [0,0]     DR   .    .   xorl	%eax, %eax
+# CHECK-NEXT: [0,1]     DeER .    .   movl	%eax, %ebx
+# CHECK-NEXT: [0,2]     D=eER.    .   movl	%ebx, %ecx
+# CHECK-NEXT: [0,3]     D==eER    .   movl	%ecx, %edx
+# CHECK-NEXT: [0,4]     .D==eER   .   movl	%edx, %eax
+# CHECK-NEXT: [1,0]     .D----R   .   xorl	%eax, %eax
+# CHECK-NEXT: [1,1]     .DeE--R   .   movl	%eax, %ebx
+# CHECK-NEXT: [1,2]     .D=eE-R   .   movl	%ebx, %ecx
+# CHECK-NEXT: [1,3]     . D=eE-R  .   movl	%ecx, %edx
+# CHECK-NEXT: [1,4]     . D==eER  .   movl	%edx, %eax
+# CHECK-NEXT: [2,0]     . D----R  .   xorl	%eax, %eax
+# CHECK-NEXT: [2,1]     . D==eER  .   movl	%eax, %ebx
+# CHECK-NEXT: [2,2]     .  D==eER .   movl	%ebx, %ecx
+# CHECK-NEXT: [2,3]     .  D===eER.   movl	%ecx, %edx
+# CHECK-NEXT: [2,4]     .  D====eER   movl	%edx, %eax
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -85,8 +107,8 @@ mov %edx, %eax
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     3     3.7    0.0    0.0       xorl	%eax, %eax
-# CHECK-NEXT: 1.     3     4.0    0.3    0.0       movl	%eax, %ebx
-# CHECK-NEXT: 2.     3     4.7    0.0    0.0       movl	%ebx, %ecx
-# CHECK-NEXT: 3.     3     5.3    0.0    0.0       movl	%ecx, %edx
-# CHECK-NEXT: 4.     3     6.0    0.0    0.0       movl	%edx, %eax
+# CHECK-NEXT: 0.     3     0.0    0.0    2.7       xorl	%eax, %eax
+# CHECK-NEXT: 1.     3     1.7    1.7    0.7       movl	%eax, %ebx
+# CHECK-NEXT: 2.     3     2.3    0.0    0.3       movl	%ebx, %ecx
+# CHECK-NEXT: 3.     3     3.0    0.0    0.3       movl	%ecx, %edx
+# CHECK-NEXT: 4.     3     3.7    0.0    0.0       movl	%edx, %eax
diff --git a/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-5.s b/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-5.s
index 66ce02cb0fc..188eb5dd158 100644
--- a/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-5.s
+++ b/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-5.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=3 -timeline -register-file-stats < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=3 -timeline -register-file-stats < %s | FileCheck %s
 
 xor %rax, %rax
 mov %rax, %rbx
@@ -9,13 +9,13 @@ mov %rdx, %rax
 
 # CHECK:      Iterations:        3
 # CHECK-NEXT: Instructions:      15
-# CHECK-NEXT: Total Cycles:      15
+# CHECK-NEXT: Total Cycles:      11
 # CHECK-NEXT: Total uOps:        15
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    1.00
-# CHECK-NEXT: IPC:               1.00
-# CHECK-NEXT: Block RThroughput: 1.3
+# CHECK-NEXT: uOps Per Cycle:    1.36
+# CHECK-NEXT: IPC:               1.36
+# CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps
@@ -27,56 +27,78 @@ mov %rdx, %rax
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 # CHECK-NEXT:  1      0     0.25                        xorq	%rax, %rax
-# CHECK-NEXT:  1      1     0.33                        movq	%rax, %rbx
-# CHECK-NEXT:  1      1     0.33                        movq	%rbx, %rcx
-# CHECK-NEXT:  1      1     0.33                        movq	%rcx, %rdx
-# CHECK-NEXT:  1      1     0.33                        movq	%rdx, %rax
+# CHECK-NEXT:  1      1     0.50                        movq	%rax, %rbx
+# CHECK-NEXT:  1      1     0.50                        movq	%rbx, %rcx
+# CHECK-NEXT:  1      1     0.50                        movq	%rcx, %rdx
+# CHECK-NEXT:  1      1     0.50                        movq	%rdx, %rax
 
 # CHECK:      Register File statistics:
-# CHECK-NEXT: Total number of mappings created:    18
-# CHECK-NEXT: Max number of mappings used:         15
+# CHECK-NEXT: Total number of mappings created:    12
+# CHECK-NEXT: Max number of mappings used:         11
+
+# CHECK:      *  Register File #1 -- PdFpuPRF:
+# CHECK-NEXT:    Number of physical registers:     160
+# CHECK-NEXT:    Total number of mappings created: 0
+# CHECK-NEXT:    Max number of mappings used:      0
+
+# CHECK:      *  Register File #2 -- PdIntegerPRF:
+# CHECK-NEXT:    Number of physical registers:     96
+# CHECK-NEXT:    Total number of mappings created: 12
+# CHECK-NEXT:    Max number of mappings used:      11
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -     1.33   1.33    -     1.33    -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     xorq	%rax, %rax
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movq	%rax, %rbx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movq	%rbx, %rcx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movq	%rcx, %rdx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movq	%rdx, %rax
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorq	%rax, %rax
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     movq	%rax, %rbx
+# CHECK-NEXT:  -      -      -      -      -     0.67   0.33    -      -      -      -      -      -      -      -      -      -      -      -      -     movq	%rbx, %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.33   0.67    -      -      -      -      -      -      -      -      -      -      -      -      -     movq	%rcx, %rdx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     movq	%rdx, %rax
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     01234
+# CHECK-NEXT:                     0
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DR   .    .   .   xorq	%rax, %rax
-# CHECK-NEXT: [0,1]     DeER .    .   .   movq	%rax, %rbx
-# CHECK-NEXT: [0,2]     D=eER.    .   .   movq	%rbx, %rcx
-# CHECK-NEXT: [0,3]     D==eER    .   .   movq	%rcx, %rdx
-# CHECK-NEXT: [0,4]     .D==eER   .   .   movq	%rdx, %rax
-# CHECK-NEXT: [1,0]     .D===ER   .   .   xorq	%rax, %rax
-# CHECK-NEXT: [1,1]     .D===eER  .   .   movq	%rax, %rbx
-# CHECK-NEXT: [1,2]     .D====eER .   .   movq	%rbx, %rcx
-# CHECK-NEXT: [1,3]     . D====eER.   .   movq	%rcx, %rdx
-# CHECK-NEXT: [1,4]     . D=====eER   .   movq	%rdx, %rax
-# CHECK-NEXT: [2,0]     . D======ER   .   xorq	%rax, %rax
-# CHECK-NEXT: [2,1]     . D======eER  .   movq	%rax, %rbx
-# CHECK-NEXT: [2,2]     .  D======eER .   movq	%rbx, %rcx
-# CHECK-NEXT: [2,3]     .  D=======eER.   movq	%rcx, %rdx
-# CHECK-NEXT: [2,4]     .  D========eER   movq	%rdx, %rax
+# CHECK:      [0,0]     DR   .    .   xorq	%rax, %rax
+# CHECK-NEXT: [0,1]     DeER .    .   movq	%rax, %rbx
+# CHECK-NEXT: [0,2]     D=eER.    .   movq	%rbx, %rcx
+# CHECK-NEXT: [0,3]     D==eER    .   movq	%rcx, %rdx
+# CHECK-NEXT: [0,4]     .D==eER   .   movq	%rdx, %rax
+# CHECK-NEXT: [1,0]     .D----R   .   xorq	%rax, %rax
+# CHECK-NEXT: [1,1]     .DeE--R   .   movq	%rax, %rbx
+# CHECK-NEXT: [1,2]     .D=eE-R   .   movq	%rbx, %rcx
+# CHECK-NEXT: [1,3]     . D=eE-R  .   movq	%rcx, %rdx
+# CHECK-NEXT: [1,4]     . D==eER  .   movq	%rdx, %rax
+# CHECK-NEXT: [2,0]     . D----R  .   xorq	%rax, %rax
+# CHECK-NEXT: [2,1]     . D==eER  .   movq	%rax, %rbx
+# CHECK-NEXT: [2,2]     .  D==eER .   movq	%rbx, %rcx
+# CHECK-NEXT: [2,3]     .  D===eER.   movq	%rcx, %rdx
+# CHECK-NEXT: [2,4]     .  D====eER   movq	%rdx, %rax
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -85,8 +107,8 @@ mov %rdx, %rax
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     3     3.7    0.0    0.0       xorq	%rax, %rax
-# CHECK-NEXT: 1.     3     4.0    0.3    0.0       movq	%rax, %rbx
-# CHECK-NEXT: 2.     3     4.7    0.0    0.0       movq	%rbx, %rcx
-# CHECK-NEXT: 3.     3     5.3    0.0    0.0       movq	%rcx, %rdx
-# CHECK-NEXT: 4.     3     6.0    0.0    0.0       movq	%rdx, %rax
+# CHECK-NEXT: 0.     3     0.0    0.0    2.7       xorq	%rax, %rax
+# CHECK-NEXT: 1.     3     1.7    1.7    0.7       movq	%rax, %rbx
+# CHECK-NEXT: 2.     3     2.3    0.0    0.3       movq	%rbx, %rcx
+# CHECK-NEXT: 3.     3     3.0    0.0    0.3       movq	%rcx, %rdx
+# CHECK-NEXT: 4.     3     3.7    0.0    0.0       movq	%rdx, %rax
diff --git a/test/tools/llvm-mca/X86/BdVer2/register-files-1.s b/test/tools/llvm-mca/X86/BdVer2/register-files-1.s
index d20b50dbec1..70685f1726a 100644
--- a/test/tools/llvm-mca/X86/BdVer2/register-files-1.s
+++ b/test/tools/llvm-mca/X86/BdVer2/register-files-1.s
@@ -1,17 +1,17 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=5 -instruction-info=false -dispatch-stats -register-file-stats -timeline < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=5 -instruction-info=false -dispatch-stats -register-file-stats -timeline < %s | FileCheck %s
 
 vaddps %xmm0, %xmm0, %xmm0
 vmulps %xmm0, %xmm0, %xmm0
 
 # CHECK:      Iterations:        5
 # CHECK-NEXT: Instructions:      10
-# CHECK-NEXT: Total Cycles:      43
+# CHECK-NEXT: Total Cycles:      53
 # CHECK-NEXT: Total uOps:        10
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    0.23
-# CHECK-NEXT: IPC:               0.23
+# CHECK-NEXT: uOps Per Cycle:    0.19
+# CHECK-NEXT: IPC:               0.19
 # CHECK-NEXT: Block RThroughput: 1.0
 
 # CHECK:      Dynamic Dispatch Stall Cycles:
@@ -24,47 +24,69 @@ vmulps %xmm0, %xmm0, %xmm0
 
 # CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
 # CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT:  0,              40  (93.0%)
-# CHECK-NEXT:  2,              1  (2.3%)
-# CHECK-NEXT:  4,              2  (4.7%)
+# CHECK-NEXT:  0,              50  (94.3%)
+# CHECK-NEXT:  2,              1  (1.9%)
+# CHECK-NEXT:  4,              2  (3.8%)
 
 # CHECK:      Register File statistics:
 # CHECK-NEXT: Total number of mappings created:    10
 # CHECK-NEXT: Max number of mappings used:         10
 
+# CHECK:      *  Register File #1 -- PdFpuPRF:
+# CHECK-NEXT:    Number of physical registers:     160
+# CHECK-NEXT:    Total number of mappings created: 10
+# CHECK-NEXT:    Max number of mappings used:      10
+
+# CHECK:      *  Register File #2 -- PdIntegerPRF:
+# CHECK-NEXT:    Number of physical registers:     96
+# CHECK-NEXT:    Total number of mappings created: 0
+# CHECK-NEXT:    Max number of mappings used:      0
+
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00    -      -      -      -     1.00   1.00    -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmulps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     1.00    -      -      -      -      -     vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -      -      -      -     1.00    -      -      -      -     vmulps	%xmm0, %xmm0, %xmm0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          012
+# CHECK-NEXT:                     0123456789          0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeER    .    .    .    .    .    .    . .   vaddps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .    .    .    . .   vmulps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT: [1,0]     D========eeeER .    .    .    .    .    . .   vaddps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT: [1,1]     D===========eeeeeER .    .    .    .    . .   vmulps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT: [2,0]     .D===============eeeER   .    .    .    . .   vaddps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT: [2,1]     .D==================eeeeeER   .    .    . .   vmulps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT: [3,0]     .D=======================eeeER.    .    . .   vaddps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT: [3,1]     .D==========================eeeeeER.    . .   vmulps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT: [4,0]     . D==============================eeeER  . .   vaddps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT: [4,1]     . D=================================eeeeeER   vmulps	%xmm0, %xmm0, %xmm0
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    .    .    .    . .   vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    .    .    .    .    .    . .   vmulps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [1,0]     D==========eeeeeER  .    .    .    .    .    .    . .   vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [1,1]     D===============eeeeeER  .    .    .    .    .    . .   vmulps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [2,0]     .D===================eeeeeER  .    .    .    .    . .   vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [2,1]     .D========================eeeeeER  .    .    .    . .   vmulps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [3,0]     .D=============================eeeeeER  .    .    . .   vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [3,1]     .D==================================eeeeeER  .    . .   vmulps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [4,0]     . D======================================eeeeeER  . .   vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [4,1]     . D===========================================eeeeeER   vmulps	%xmm0, %xmm0, %xmm0
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -73,5 +95,5 @@ vmulps %xmm0, %xmm0, %xmm0
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     5     16.2   0.2    0.0       vaddps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT: 1.     5     19.2   0.0    0.0       vmulps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: 0.     5     20.2   0.2    0.0       vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: 1.     5     25.2   0.0    0.0       vmulps	%xmm0, %xmm0, %xmm0
diff --git a/test/tools/llvm-mca/X86/BdVer2/register-files-2.s b/test/tools/llvm-mca/X86/BdVer2/register-files-2.s
index bcf2a08bc02..354876befcd 100644
--- a/test/tools/llvm-mca/X86/BdVer2/register-files-2.s
+++ b/test/tools/llvm-mca/X86/BdVer2/register-files-2.s
@@ -1,21 +1,21 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -register-file-size=5 -iterations=5 -instruction-info=false -dispatch-stats -register-file-stats -timeline < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -register-file-size=5 -iterations=5 -instruction-info=false -dispatch-stats -register-file-stats -timeline < %s | FileCheck %s
 
 vaddps %xmm0, %xmm0, %xmm0
 vmulps %xmm0, %xmm0, %xmm0
 
 # CHECK:      Iterations:        5
 # CHECK-NEXT: Instructions:      10
-# CHECK-NEXT: Total Cycles:      43
+# CHECK-NEXT: Total Cycles:      53
 # CHECK-NEXT: Total uOps:        10
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    0.23
-# CHECK-NEXT: IPC:               0.23
+# CHECK-NEXT: uOps Per Cycle:    0.19
+# CHECK-NEXT: IPC:               0.19
 # CHECK-NEXT: Block RThroughput: 1.0
 
 # CHECK:      Dynamic Dispatch Stall Cycles:
-# CHECK-NEXT: RAT     - Register unavailable:                      20  (46.5%)
+# CHECK-NEXT: RAT     - Register unavailable:                      26  (49.1%)
 # CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
 # CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
 # CHECK-NEXT: LQ      - Load queue full:                           0
@@ -24,47 +24,69 @@ vmulps %xmm0, %xmm0, %xmm0
 
 # CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
 # CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT:  0,              36  (83.7%)
-# CHECK-NEXT:  1,              6  (14.0%)
-# CHECK-NEXT:  4,              1  (2.3%)
+# CHECK-NEXT:  0,              46  (86.8%)
+# CHECK-NEXT:  1,              6  (11.3%)
+# CHECK-NEXT:  4,              1  (1.9%)
 
 # CHECK:      Register File statistics:
 # CHECK-NEXT: Total number of mappings created:    10
 # CHECK-NEXT: Max number of mappings used:         5
 
+# CHECK:      *  Register File #1 -- PdFpuPRF:
+# CHECK-NEXT:    Number of physical registers:     160
+# CHECK-NEXT:    Total number of mappings created: 10
+# CHECK-NEXT:    Max number of mappings used:      5
+
+# CHECK:      *  Register File #2 -- PdIntegerPRF:
+# CHECK-NEXT:    Number of physical registers:     96
+# CHECK-NEXT:    Total number of mappings created: 0
+# CHECK-NEXT:    Max number of mappings used:      0
+
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00    -      -      -      -     1.00   1.00    -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmulps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     1.00    -      -      -      -      -     vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -      -      -      -     1.00    -      -      -      -     vmulps	%xmm0, %xmm0, %xmm0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          012
+# CHECK-NEXT:                     0123456789          0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeER    .    .    .    .    .    .    . .   vaddps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .    .    .    . .   vmulps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT: [1,0]     D========eeeER .    .    .    .    .    . .   vaddps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT: [1,1]     D===========eeeeeER .    .    .    .    . .   vmulps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT: [2,0]     .D===============eeeER   .    .    .    . .   vaddps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT: [2,1]     .    D==============eeeeeER   .    .    . .   vmulps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT: [3,0]     .    .    D==============eeeER.    .    . .   vaddps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT: [3,1]     .    .    .  D==============eeeeeER.    . .   vmulps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT: [4,0]     .    .    .    .  D==============eeeER  . .   vaddps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT: [4,1]     .    .    .    .    .D==============eeeeeER   vmulps	%xmm0, %xmm0, %xmm0
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    .    .    .    . .   vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    .    .    .    .    .    . .   vmulps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [1,0]     D==========eeeeeER  .    .    .    .    .    .    . .   vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [1,1]     D===============eeeeeER  .    .    .    .    .    . .   vmulps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [2,0]     .D===================eeeeeER  .    .    .    .    . .   vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [2,1]     .    . D==================eeeeeER  .    .    .    . .   vmulps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [3,0]     .    .    . D==================eeeeeER  .    .    . .   vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [3,1]     .    .    .    . D==================eeeeeER  .    . .   vmulps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [4,0]     .    .    .    .    . D==================eeeeeER  . .   vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [4,1]     .    .    .    .    .    . D==================eeeeeER   vmulps	%xmm0, %xmm0, %xmm0
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -73,5 +95,5 @@ vmulps %xmm0, %xmm0, %xmm0
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     5     11.2   0.2    0.0       vaddps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT: 1.     5     12.2   0.0    0.0       vmulps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: 0.     5     14.0   0.2    0.0       vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: 1.     5     15.8   0.0    0.0       vmulps	%xmm0, %xmm0, %xmm0
diff --git a/test/tools/llvm-mca/X86/BdVer2/register-files-3.s b/test/tools/llvm-mca/X86/BdVer2/register-files-3.s
index 0be7dd3978e..a5f5746d7f9 100644
--- a/test/tools/llvm-mca/X86/BdVer2/register-files-3.s
+++ b/test/tools/llvm-mca/X86/BdVer2/register-files-3.s
@@ -1,17 +1,17 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -register-file-size=5 -iterations=2 -dispatch-stats -register-file-stats -timeline < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -register-file-size=5 -iterations=2 -dispatch-stats -register-file-stats -timeline < %s | FileCheck %s
 
 idiv %eax
 
 # CHECK:      Iterations:        2
 # CHECK-NEXT: Instructions:      2
-# CHECK-NEXT: Total Cycles:      55
-# CHECK-NEXT: Total uOps:        2
+# CHECK-NEXT: Total Cycles:      42
+# CHECK-NEXT: Total uOps:        4
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    0.04
-# CHECK-NEXT: IPC:               0.04
-# CHECK-NEXT: Block RThroughput: 10.0
+# CHECK-NEXT: uOps Per Cycle:    0.10
+# CHECK-NEXT: IPC:               0.05
+# CHECK-NEXT: Block RThroughput: 25.0
 
 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps
@@ -22,10 +22,10 @@ idiv %eax
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      25    10.00                 U     idivl	%eax
+# CHECK-NEXT:  2      14    25.00                 U     idivl	%eax
 
 # CHECK:      Dynamic Dispatch Stall Cycles:
-# CHECK-NEXT: RAT     - Register unavailable:                      27  (49.1%)
+# CHECK-NEXT: RAT     - Register unavailable:                      16  (38.1%)
 # CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
 # CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
 # CHECK-NEXT: LQ      - Load queue full:                           0
@@ -34,37 +34,59 @@ idiv %eax
 
 # CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
 # CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT:  0,              53  (96.4%)
-# CHECK-NEXT:  1,              2  (3.6%)
+# CHECK-NEXT:  0,              40  (95.2%)
+# CHECK-NEXT:  2,              2  (4.8%)
 
 # CHECK:      Register File statistics:
 # CHECK-NEXT: Total number of mappings created:    6
 # CHECK-NEXT: Max number of mappings used:         3
 
+# CHECK:      *  Register File #1 -- PdFpuPRF:
+# CHECK-NEXT:    Number of physical registers:     160
+# CHECK-NEXT:    Total number of mappings created: 0
+# CHECK-NEXT:    Max number of mappings used:      0
+
+# CHECK:      *  Register File #2 -- PdIntegerPRF:
+# CHECK-NEXT:    Number of physical registers:     96
+# CHECK-NEXT:    Total number of mappings created: 6
+# CHECK-NEXT:    Max number of mappings used:      3
+
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT: 10.00   -     1.00    -      -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -     25.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT: 10.00   -     1.00    -      -      -      -      -     idivl	%eax
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -     25.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     idivl	%eax
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789          01234
-# CHECK-NEXT: Index     0123456789          0123456789          0123456789
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          01
 
-# CHECK:      [0,0]     DeeeeeeeeeeeeeeeeeeeeeeeeeER  .    .    .    .    .   .   idivl	%eax
-# CHECK-NEXT: [1,0]     .    .    .    .    .    . DeeeeeeeeeeeeeeeeeeeeeeeeeER   idivl	%eax
+# CHECK:      [0,0]     DeeeeeeeeeeeeeeER   .    .    .    .    ..   idivl	%eax
+# CHECK-NEXT: [1,0]     .    .    .    .D=========eeeeeeeeeeeeeeER   idivl	%eax
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -73,4 +95,4 @@ idiv %eax
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     1.0    1.0    0.0       idivl	%eax
+# CHECK-NEXT: 0.     2     5.5    5.5    0.0       idivl	%eax
diff --git a/test/tools/llvm-mca/X86/BdVer2/register-files-4.s b/test/tools/llvm-mca/X86/BdVer2/register-files-4.s
index 8ad203d0151..09c9e4af7e0 100644
--- a/test/tools/llvm-mca/X86/BdVer2/register-files-4.s
+++ b/test/tools/llvm-mca/X86/BdVer2/register-files-4.s
@@ -1,17 +1,17 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=22 -dispatch-stats -register-file-stats -resource-pressure=false -timeline -timeline-max-iterations=3 < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=22 -dispatch-stats -register-file-stats -resource-pressure=false -timeline -timeline-max-iterations=3 < %s | FileCheck %s
 
 idiv %eax
 
 # CHECK:      Iterations:        22
 # CHECK-NEXT: Instructions:      22
-# CHECK-NEXT: Total Cycles:      553
-# CHECK-NEXT: Total uOps:        22
+# CHECK-NEXT: Total Cycles:      542
+# CHECK-NEXT: Total uOps:        44
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    0.04
+# CHECK-NEXT: uOps Per Cycle:    0.08
 # CHECK-NEXT: IPC:               0.04
-# CHECK-NEXT: Block RThroughput: 10.0
+# CHECK-NEXT: Block RThroughput: 25.0
 
 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps
@@ -22,7 +22,7 @@ idiv %eax
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      25    10.00                 U     idivl	%eax
+# CHECK-NEXT:  2      14    25.00                 U     idivl	%eax
 
 # CHECK:      Dynamic Dispatch Stall Cycles:
 # CHECK-NEXT: RAT     - Register unavailable:                      0
@@ -34,21 +34,30 @@ idiv %eax
 
 # CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
 # CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT:  0,              547  (98.9%)
-# CHECK-NEXT:  2,              1  (0.2%)
-# CHECK-NEXT:  4,              5  (0.9%)
+# CHECK-NEXT:  0,              531  (98.0%)
+# CHECK-NEXT:  4,              11  (2.0%)
 
 # CHECK:      Register File statistics:
 # CHECK-NEXT: Total number of mappings created:    66
 # CHECK-NEXT: Max number of mappings used:         66
 
+# CHECK:      *  Register File #1 -- PdFpuPRF:
+# CHECK-NEXT:    Number of physical registers:     160
+# CHECK-NEXT:    Total number of mappings created: 0
+# CHECK-NEXT:    Max number of mappings used:      0
+
+# CHECK:      *  Register File #2 -- PdIntegerPRF:
+# CHECK-NEXT:    Number of physical registers:     96
+# CHECK-NEXT:    Total number of mappings created: 66
+# CHECK-NEXT:    Max number of mappings used:      66
+
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789          0123456789          01234567
-# CHECK-NEXT: Index     0123456789          0123456789          0123456789          0123456789
+# CHECK-NEXT:                     0123456789          0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789          0123456
 
-# CHECK:      [0,0]     DeeeeeeeeeeeeeeeeeeeeeeeeeER  .    .    .    .    .    .    .    .    .    . .   idivl	%eax
-# CHECK-NEXT: [1,0]     D=========================eeeeeeeeeeeeeeeeeeeeeeeeeER  .    .    .    .    . .   idivl	%eax
-# CHECK-NEXT: [2,0]     D==================================================eeeeeeeeeeeeeeeeeeeeeeeeeER   idivl	%eax
+# CHECK:      [0,0]     DeeeeeeeeeeeeeeER   .    .    .    .    .    .    .    .    .    ..   idivl	%eax
+# CHECK-NEXT: [1,0]     D=========================eeeeeeeeeeeeeeER   .    .    .    .    ..   idivl	%eax
+# CHECK-NEXT: [2,0]     .D=================================================eeeeeeeeeeeeeeER   idivl	%eax
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -57,4 +66,4 @@ idiv %eax
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     3     26.0   0.3    0.0       idivl	%eax
+# CHECK-NEXT: 0.     3     25.7   7.7    0.0       idivl	%eax
diff --git a/test/tools/llvm-mca/X86/BdVer2/register-files-5.s b/test/tools/llvm-mca/X86/BdVer2/register-files-5.s
index 31696730fd2..28922197333 100644
--- a/test/tools/llvm-mca/X86/BdVer2/register-files-5.s
+++ b/test/tools/llvm-mca/X86/BdVer2/register-files-5.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1 -resource-pressure=false -instruction-info=false -dispatch-stats -register-file-stats -timeline < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1 -resource-pressure=false -instruction-info=false -dispatch-stats -register-file-stats -timeline < %s | FileCheck %s
 
   vdivps %ymm0, %ymm0, %ymm1
   vaddps %ymm0, %ymm0, %ymm2
@@ -37,13 +37,13 @@
 
 # CHECK:      Iterations:        1
 # CHECK-NEXT: Instructions:      33
-# CHECK-NEXT: Total Cycles:      37
-# CHECK-NEXT: Total uOps:        35
+# CHECK-NEXT: Total Cycles:      70
+# CHECK-NEXT: Total uOps:        66
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    0.95
-# CHECK-NEXT: IPC:               0.89
-# CHECK-NEXT: Block RThroughput: 32.0
+# CHECK-NEXT: uOps Per Cycle:    0.94
+# CHECK-NEXT: IPC:               0.47
+# CHECK-NEXT: Block RThroughput: 64.0
 
 # CHECK:      Dynamic Dispatch Stall Cycles:
 # CHECK-NEXT: RAT     - Register unavailable:                      0
@@ -55,51 +55,61 @@
 
 # CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
 # CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT:  0,              28  (75.7%)
-# CHECK-NEXT:  3,              1  (2.7%)
-# CHECK-NEXT:  4,              8  (21.6%)
+# CHECK-NEXT:  0,              53  (75.7%)
+# CHECK-NEXT:  2,              1  (1.4%)
+# CHECK-NEXT:  4,              16  (22.9%)
 
 # CHECK:      Register File statistics:
-# CHECK-NEXT: Total number of mappings created:    33
-# CHECK-NEXT: Max number of mappings used:         33
+# CHECK-NEXT: Total number of mappings created:    66
+# CHECK-NEXT: Max number of mappings used:         54
+
+# CHECK:      *  Register File #1 -- PdFpuPRF:
+# CHECK-NEXT:    Number of physical registers:     160
+# CHECK-NEXT:    Total number of mappings created: 66
+# CHECK-NEXT:    Max number of mappings used:      54
+
+# CHECK:      *  Register File #2 -- PdIntegerPRF:
+# CHECK-NEXT:    Number of physical registers:     96
+# CHECK-NEXT:    Total number of mappings created: 0
+# CHECK-NEXT:    Max number of mappings used:      0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789          0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER   ..   vdivps	%ymm0, %ymm0, %ymm1
-# CHECK-NEXT: [0,1]     DeeeE--------------------------R   ..   vaddps	%ymm0, %ymm0, %ymm2
-# CHECK-NEXT: [0,2]     .DeeeE-------------------------R   ..   vaddps	%ymm0, %ymm0, %ymm3
-# CHECK-NEXT: [0,3]     .D=eeeE------------------------R   ..   vaddps	%ymm0, %ymm0, %ymm4
-# CHECK-NEXT: [0,4]     .D==eeeE-----------------------R   ..   vaddps	%ymm0, %ymm0, %ymm5
-# CHECK-NEXT: [0,5]     .D===eeeE----------------------R   ..   vaddps	%ymm0, %ymm0, %ymm6
-# CHECK-NEXT: [0,6]     . D===eeeE---------------------R   ..   vaddps	%ymm0, %ymm0, %ymm7
-# CHECK-NEXT: [0,7]     . D=====eeeE-------------------R   ..   vaddps	%ymm0, %ymm0, %ymm8
-# CHECK-NEXT: [0,8]     . D======eeeE------------------R   ..   vaddps	%ymm0, %ymm0, %ymm9
-# CHECK-NEXT: [0,9]     . D=======eeeE-----------------R   ..   vaddps	%ymm0, %ymm0, %ymm10
-# CHECK-NEXT: [0,10]    .  D=======eeeE----------------R   ..   vaddps	%ymm0, %ymm0, %ymm11
-# CHECK-NEXT: [0,11]    .  D========eeeE---------------R   ..   vaddps	%ymm0, %ymm0, %ymm12
-# CHECK-NEXT: [0,12]    .  D=========eeeE--------------R   ..   vaddps	%ymm0, %ymm0, %ymm13
-# CHECK-NEXT: [0,13]    .  D===========eeeE------------R   ..   vaddps	%ymm0, %ymm0, %ymm14
-# CHECK-NEXT: [0,14]    .   D===========eeeE-----------R   ..   vaddps	%ymm0, %ymm0, %ymm15
-# CHECK-NEXT: [0,15]    .   D==eeeE--------------------R   ..   vaddps	%ymm2, %ymm0, %ymm0
-# CHECK-NEXT: [0,16]    .   D=========eeeE-------------R   ..   vaddps	%ymm2, %ymm0, %ymm3
-# CHECK-NEXT: [0,17]    .   D============eeeE----------R   ..   vaddps	%ymm2, %ymm0, %ymm4
-# CHECK-NEXT: [0,18]    .    D============eeeE---------R   ..   vaddps	%ymm2, %ymm0, %ymm5
-# CHECK-NEXT: [0,19]    .    D=============eeeE--------R   ..   vaddps	%ymm2, %ymm0, %ymm6
-# CHECK-NEXT: [0,20]    .    D==============eeeE-------R   ..   vaddps	%ymm2, %ymm0, %ymm7
-# CHECK-NEXT: [0,21]    .    D===============eeeE------R   ..   vaddps	%ymm2, %ymm0, %ymm8
-# CHECK-NEXT: [0,22]    .    .D===============eeeE-----R   ..   vaddps	%ymm2, %ymm0, %ymm9
-# CHECK-NEXT: [0,23]    .    .D================eeeE----R   ..   vaddps	%ymm2, %ymm0, %ymm10
-# CHECK-NEXT: [0,24]    .    .D=================eeeE---R   ..   vaddps	%ymm2, %ymm0, %ymm11
-# CHECK-NEXT: [0,25]    .    .D==================eeeE--R   ..   vaddps	%ymm2, %ymm0, %ymm12
-# CHECK-NEXT: [0,26]    .    . D==================eeeE-R   ..   vaddps	%ymm2, %ymm0, %ymm13
-# CHECK-NEXT: [0,27]    .    . D===================eeeER   ..   vaddps	%ymm2, %ymm0, %ymm14
-# CHECK-NEXT: [0,28]    .    . D====================eeeER  ..   vaddps	%ymm2, %ymm0, %ymm15
-# CHECK-NEXT: [0,29]    .    . D=====================eeeER ..   vaddps	%ymm3, %ymm0, %ymm2
-# CHECK-NEXT: [0,30]    .    .  D=====================eeeER..   vaddps	%ymm3, %ymm0, %ymm4
-# CHECK-NEXT: [0,31]    .    .  D======================eeeER.   vaddps	%ymm3, %ymm0, %ymm5
-# CHECK-NEXT: [0,32]    .    .  D=======================eeeER   vaddps	%ymm3, %ymm0, %ymm6
+# CHECK:      [0,0]     DeeeeeeeeeER   .    .    .    .    .    .    .    .    .    .    .   .   vdivps	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT: [0,1]     DeeeeeE----R   .    .    .    .    .    .    .    .    .    .    .   .   vaddps	%ymm0, %ymm0, %ymm2
+# CHECK-NEXT: [0,2]     .D=eeeeeE--R   .    .    .    .    .    .    .    .    .    .    .   .   vaddps	%ymm0, %ymm0, %ymm3
+# CHECK-NEXT: [0,3]     .D===eeeeeER   .    .    .    .    .    .    .    .    .    .    .   .   vaddps	%ymm0, %ymm0, %ymm4
+# CHECK-NEXT: [0,4]     . D====eeeeeER .    .    .    .    .    .    .    .    .    .    .   .   vaddps	%ymm0, %ymm0, %ymm5
+# CHECK-NEXT: [0,5]     . D======eeeeeER    .    .    .    .    .    .    .    .    .    .   .   vaddps	%ymm0, %ymm0, %ymm6
+# CHECK-NEXT: [0,6]     .  D=======eeeeeER  .    .    .    .    .    .    .    .    .    .   .   vaddps	%ymm0, %ymm0, %ymm7
+# CHECK-NEXT: [0,7]     .  D===========eeeeeER   .    .    .    .    .    .    .    .    .   .   vaddps	%ymm0, %ymm0, %ymm8
+# CHECK-NEXT: [0,8]     .   D============eeeeeER .    .    .    .    .    .    .    .    .   .   vaddps	%ymm0, %ymm0, %ymm9
+# CHECK-NEXT: [0,9]     .   D==============eeeeeER    .    .    .    .    .    .    .    .   .   vaddps	%ymm0, %ymm0, %ymm10
+# CHECK-NEXT: [0,10]    .    D===============eeeeeER  .    .    .    .    .    .    .    .   .   vaddps	%ymm0, %ymm0, %ymm11
+# CHECK-NEXT: [0,11]    .    D=================eeeeeER.    .    .    .    .    .    .    .   .   vaddps	%ymm0, %ymm0, %ymm12
+# CHECK-NEXT: [0,12]    .    .D==================eeeeeER   .    .    .    .    .    .    .   .   vaddps	%ymm0, %ymm0, %ymm13
+# CHECK-NEXT: [0,13]    .    .D======================eeeeeER    .    .    .    .    .    .   .   vaddps	%ymm0, %ymm0, %ymm14
+# CHECK-NEXT: [0,14]    .    . D=======================eeeeeER  .    .    .    .    .    .   .   vaddps	%ymm0, %ymm0, %ymm15
+# CHECK-NEXT: [0,15]    .    . D=====eeeeeE------------------R  .    .    .    .    .    .   .   vaddps	%ymm2, %ymm0, %ymm0
+# CHECK-NEXT: [0,16]    .    .  D==================eeeeeE----R  .    .    .    .    .    .   .   vaddps	%ymm2, %ymm0, %ymm3
+# CHECK-NEXT: [0,17]    .    .  D========================eeeeeER.    .    .    .    .    .   .   vaddps	%ymm2, %ymm0, %ymm4
+# CHECK-NEXT: [0,18]    .    .   D=========================eeeeeER   .    .    .    .    .   .   vaddps	%ymm2, %ymm0, %ymm5
+# CHECK-NEXT: [0,19]    .    .   D===========================eeeeeER .    .    .    .    .   .   vaddps	%ymm2, %ymm0, %ymm6
+# CHECK-NEXT: [0,20]    .    .    D============================eeeeeER    .    .    .    .   .   vaddps	%ymm2, %ymm0, %ymm7
+# CHECK-NEXT: [0,21]    .    .    D==============================eeeeeER  .    .    .    .   .   vaddps	%ymm2, %ymm0, %ymm8
+# CHECK-NEXT: [0,22]    .    .    .D===============================eeeeeER.    .    .    .   .   vaddps	%ymm2, %ymm0, %ymm9
+# CHECK-NEXT: [0,23]    .    .    .D=================================eeeeeER   .    .    .   .   vaddps	%ymm2, %ymm0, %ymm10
+# CHECK-NEXT: [0,24]    .    .    . D==================================eeeeeER .    .    .   .   vaddps	%ymm2, %ymm0, %ymm11
+# CHECK-NEXT: [0,25]    .    .    . D====================================eeeeeER    .    .   .   vaddps	%ymm2, %ymm0, %ymm12
+# CHECK-NEXT: [0,26]    .    .    .  D=====================================eeeeeER  .    .   .   vaddps	%ymm2, %ymm0, %ymm13
+# CHECK-NEXT: [0,27]    .    .    .  D=======================================eeeeeER.    .   .   vaddps	%ymm2, %ymm0, %ymm14
+# CHECK-NEXT: [0,28]    .    .    .   D========================================eeeeeER   .   .   vaddps	%ymm2, %ymm0, %ymm15
+# CHECK-NEXT: [0,29]    .    .    .   D==========================================eeeeeER .   .   vaddps	%ymm3, %ymm0, %ymm2
+# CHECK-NEXT: [0,30]    .    .    .    D===========================================eeeeeER   .   vaddps	%ymm3, %ymm0, %ymm4
+# CHECK-NEXT: [0,31]    .    .    .    D=============================================eeeeeER .   vaddps	%ymm3, %ymm0, %ymm5
+# CHECK-NEXT: [0,32]    .    .    .    .D==============================================eeeeeER   vaddps	%ymm3, %ymm0, %ymm6
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -109,35 +119,35 @@
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       vdivps	%ymm0, %ymm0, %ymm1
-# CHECK-NEXT: 1.     1     1.0    1.0    26.0      vaddps	%ymm0, %ymm0, %ymm2
-# CHECK-NEXT: 2.     1     1.0    1.0    25.0      vaddps	%ymm0, %ymm0, %ymm3
-# CHECK-NEXT: 3.     1     2.0    2.0    24.0      vaddps	%ymm0, %ymm0, %ymm4
-# CHECK-NEXT: 4.     1     3.0    3.0    23.0      vaddps	%ymm0, %ymm0, %ymm5
-# CHECK-NEXT: 5.     1     4.0    4.0    22.0      vaddps	%ymm0, %ymm0, %ymm6
-# CHECK-NEXT: 6.     1     4.0    4.0    21.0      vaddps	%ymm0, %ymm0, %ymm7
-# CHECK-NEXT: 7.     1     6.0    6.0    19.0      vaddps	%ymm0, %ymm0, %ymm8
-# CHECK-NEXT: 8.     1     7.0    7.0    18.0      vaddps	%ymm0, %ymm0, %ymm9
-# CHECK-NEXT: 9.     1     8.0    8.0    17.0      vaddps	%ymm0, %ymm0, %ymm10
-# CHECK-NEXT: 10.    1     8.0    8.0    16.0      vaddps	%ymm0, %ymm0, %ymm11
-# CHECK-NEXT: 11.    1     9.0    9.0    15.0      vaddps	%ymm0, %ymm0, %ymm12
-# CHECK-NEXT: 12.    1     10.0   10.0   14.0      vaddps	%ymm0, %ymm0, %ymm13
-# CHECK-NEXT: 13.    1     12.0   12.0   12.0      vaddps	%ymm0, %ymm0, %ymm14
-# CHECK-NEXT: 14.    1     12.0   12.0   11.0      vaddps	%ymm0, %ymm0, %ymm15
-# CHECK-NEXT: 15.    1     3.0    3.0    20.0      vaddps	%ymm2, %ymm0, %ymm0
-# CHECK-NEXT: 16.    1     10.0   4.0    13.0      vaddps	%ymm2, %ymm0, %ymm3
-# CHECK-NEXT: 17.    1     13.0   7.0    10.0      vaddps	%ymm2, %ymm0, %ymm4
-# CHECK-NEXT: 18.    1     13.0   8.0    9.0       vaddps	%ymm2, %ymm0, %ymm5
-# CHECK-NEXT: 19.    1     14.0   9.0    8.0       vaddps	%ymm2, %ymm0, %ymm6
-# CHECK-NEXT: 20.    1     15.0   10.0   7.0       vaddps	%ymm2, %ymm0, %ymm7
-# CHECK-NEXT: 21.    1     16.0   11.0   6.0       vaddps	%ymm2, %ymm0, %ymm8
-# CHECK-NEXT: 22.    1     16.0   12.0   5.0       vaddps	%ymm2, %ymm0, %ymm9
-# CHECK-NEXT: 23.    1     17.0   13.0   4.0       vaddps	%ymm2, %ymm0, %ymm10
-# CHECK-NEXT: 24.    1     18.0   14.0   3.0       vaddps	%ymm2, %ymm0, %ymm11
-# CHECK-NEXT: 25.    1     19.0   15.0   2.0       vaddps	%ymm2, %ymm0, %ymm12
-# CHECK-NEXT: 26.    1     19.0   16.0   1.0       vaddps	%ymm2, %ymm0, %ymm13
-# CHECK-NEXT: 27.    1     20.0   17.0   0.0       vaddps	%ymm2, %ymm0, %ymm14
-# CHECK-NEXT: 28.    1     21.0   18.0   0.0       vaddps	%ymm2, %ymm0, %ymm15
-# CHECK-NEXT: 29.    1     22.0   12.0   0.0       vaddps	%ymm3, %ymm0, %ymm2
-# CHECK-NEXT: 30.    1     22.0   13.0   0.0       vaddps	%ymm3, %ymm0, %ymm4
-# CHECK-NEXT: 31.    1     23.0   14.0   0.0       vaddps	%ymm3, %ymm0, %ymm5
-# CHECK-NEXT: 32.    1     24.0   15.0   0.0       vaddps	%ymm3, %ymm0, %ymm6
+# CHECK-NEXT: 1.     1     1.0    1.0    4.0       vaddps	%ymm0, %ymm0, %ymm2
+# CHECK-NEXT: 2.     1     2.0    2.0    2.0       vaddps	%ymm0, %ymm0, %ymm3
+# CHECK-NEXT: 3.     1     4.0    4.0    0.0       vaddps	%ymm0, %ymm0, %ymm4
+# CHECK-NEXT: 4.     1     5.0    5.0    0.0       vaddps	%ymm0, %ymm0, %ymm5
+# CHECK-NEXT: 5.     1     7.0    7.0    0.0       vaddps	%ymm0, %ymm0, %ymm6
+# CHECK-NEXT: 6.     1     8.0    8.0    0.0       vaddps	%ymm0, %ymm0, %ymm7
+# CHECK-NEXT: 7.     1     12.0   12.0   0.0       vaddps	%ymm0, %ymm0, %ymm8
+# CHECK-NEXT: 8.     1     13.0   13.0   0.0       vaddps	%ymm0, %ymm0, %ymm9
+# CHECK-NEXT: 9.     1     15.0   15.0   0.0       vaddps	%ymm0, %ymm0, %ymm10
+# CHECK-NEXT: 10.    1     16.0   16.0   0.0       vaddps	%ymm0, %ymm0, %ymm11
+# CHECK-NEXT: 11.    1     18.0   18.0   0.0       vaddps	%ymm0, %ymm0, %ymm12
+# CHECK-NEXT: 12.    1     19.0   19.0   0.0       vaddps	%ymm0, %ymm0, %ymm13
+# CHECK-NEXT: 13.    1     23.0   23.0   0.0       vaddps	%ymm0, %ymm0, %ymm14
+# CHECK-NEXT: 14.    1     24.0   24.0   0.0       vaddps	%ymm0, %ymm0, %ymm15
+# CHECK-NEXT: 15.    1     6.0    6.0    18.0      vaddps	%ymm2, %ymm0, %ymm0
+# CHECK-NEXT: 16.    1     19.0   9.0    4.0       vaddps	%ymm2, %ymm0, %ymm3
+# CHECK-NEXT: 17.    1     25.0   15.0   0.0       vaddps	%ymm2, %ymm0, %ymm4
+# CHECK-NEXT: 18.    1     26.0   17.0   0.0       vaddps	%ymm2, %ymm0, %ymm5
+# CHECK-NEXT: 19.    1     28.0   19.0   0.0       vaddps	%ymm2, %ymm0, %ymm6
+# CHECK-NEXT: 20.    1     29.0   21.0   0.0       vaddps	%ymm2, %ymm0, %ymm7
+# CHECK-NEXT: 21.    1     31.0   23.0   0.0       vaddps	%ymm2, %ymm0, %ymm8
+# CHECK-NEXT: 22.    1     32.0   25.0   0.0       vaddps	%ymm2, %ymm0, %ymm9
+# CHECK-NEXT: 23.    1     34.0   27.0   0.0       vaddps	%ymm2, %ymm0, %ymm10
+# CHECK-NEXT: 24.    1     35.0   29.0   0.0       vaddps	%ymm2, %ymm0, %ymm11
+# CHECK-NEXT: 25.    1     37.0   31.0   0.0       vaddps	%ymm2, %ymm0, %ymm12
+# CHECK-NEXT: 26.    1     38.0   33.0   0.0       vaddps	%ymm2, %ymm0, %ymm13
+# CHECK-NEXT: 27.    1     40.0   35.0   0.0       vaddps	%ymm2, %ymm0, %ymm14
+# CHECK-NEXT: 28.    1     41.0   37.0   0.0       vaddps	%ymm2, %ymm0, %ymm15
+# CHECK-NEXT: 29.    1     43.0   25.0   0.0       vaddps	%ymm3, %ymm0, %ymm2
+# CHECK-NEXT: 30.    1     44.0   27.0   0.0       vaddps	%ymm3, %ymm0, %ymm4
+# CHECK-NEXT: 31.    1     46.0   29.0   0.0       vaddps	%ymm3, %ymm0, %ymm5
+# CHECK-NEXT: 32.    1     47.0   31.0   0.0       vaddps	%ymm3, %ymm0, %ymm6
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-3dnow.s b/test/tools/llvm-mca/X86/BdVer2/resources-3dnow.s
index 52a0968d1fd..12d3e6f2cc0 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-3dnow.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-3dnow.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
 
 femms
 
@@ -87,122 +87,134 @@ pswapd      (%rax), %mm2
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  31     31    10.33   *      *      U     femms
-# CHECK-NEXT:  1      3     1.00                        pavgusb	%mm0, %mm2
-# CHECK-NEXT:  2      8     1.00    *                   pavgusb	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        pf2id	%mm0, %mm2
-# CHECK-NEXT:  2      9     1.00    *                   pf2id	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        pf2iw	%mm0, %mm2
-# CHECK-NEXT:  2      9     1.00    *                   pf2iw	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        pfacc	%mm0, %mm2
-# CHECK-NEXT:  2      9     1.00    *                   pfacc	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        pfadd	%mm0, %mm2
-# CHECK-NEXT:  2      9     1.00    *                   pfadd	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        pfcmpeq	%mm0, %mm2
-# CHECK-NEXT:  2      9     1.00    *                   pfcmpeq	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        pfcmpge	%mm0, %mm2
-# CHECK-NEXT:  2      9     1.00    *                   pfcmpge	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        pfcmpgt	%mm0, %mm2
-# CHECK-NEXT:  2      9     1.00    *                   pfcmpgt	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        pfmax	%mm0, %mm2
-# CHECK-NEXT:  2      9     1.00    *                   pfmax	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        pfmin	%mm0, %mm2
-# CHECK-NEXT:  2      9     1.00    *                   pfmin	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        pfmul	%mm0, %mm2
-# CHECK-NEXT:  2      9     1.00    *                   pfmul	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        pfnacc	%mm0, %mm2
-# CHECK-NEXT:  2      9     1.00    *                   pfnacc	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        pfpnacc	%mm0, %mm2
-# CHECK-NEXT:  2      9     1.00    *                   pfpnacc	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        pfrcp	%mm0, %mm2
-# CHECK-NEXT:  2      9     1.00    *                   pfrcp	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        pfrcpit1	%mm0, %mm2
-# CHECK-NEXT:  2      9     1.00    *                   pfrcpit1	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        pfrcpit2	%mm0, %mm2
-# CHECK-NEXT:  2      9     1.00    *                   pfrcpit2	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        pfrsqit1	%mm0, %mm2
-# CHECK-NEXT:  2      9     1.00    *                   pfrsqit1	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        pfrsqrt	%mm0, %mm2
-# CHECK-NEXT:  2      9     1.00    *                   pfrsqrt	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        pfsub	%mm0, %mm2
-# CHECK-NEXT:  2      9     1.00    *                   pfsub	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        pfsubr	%mm0, %mm2
-# CHECK-NEXT:  2      9     1.00    *                   pfsubr	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        pi2fd	%mm0, %mm2
-# CHECK-NEXT:  2      9     1.00    *                   pi2fd	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        pi2fw	%mm0, %mm2
-# CHECK-NEXT:  2      9     1.00    *                   pi2fw	(%rax), %mm2
-# CHECK-NEXT:  1      5     1.00                        pmulhrw	%mm0, %mm2
-# CHECK-NEXT:  2      10    1.00    *                   pmulhrw	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50    *      *      U     femms
+# CHECK-NEXT:  1      2     0.50                        pavgusb	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   pavgusb	(%rax), %mm2
+# CHECK-NEXT:  1      4     1.00                        pf2id	%mm0, %mm2
+# CHECK-NEXT:  1      9     1.00    *                   pf2id	(%rax), %mm2
+# CHECK-NEXT:  1      4     1.00                        pf2iw	%mm0, %mm2
+# CHECK-NEXT:  1      9     1.00    *                   pf2iw	(%rax), %mm2
+# CHECK-NEXT:  1      5     1.00                        pfacc	%mm0, %mm2
+# CHECK-NEXT:  1      10    1.00    *                   pfacc	(%rax), %mm2
+# CHECK-NEXT:  1      5     1.00                        pfadd	%mm0, %mm2
+# CHECK-NEXT:  1      10    1.00    *                   pfadd	(%rax), %mm2
+# CHECK-NEXT:  1      5     1.00                        pfcmpeq	%mm0, %mm2
+# CHECK-NEXT:  1      10    1.00    *                   pfcmpeq	(%rax), %mm2
+# CHECK-NEXT:  1      5     1.00                        pfcmpge	%mm0, %mm2
+# CHECK-NEXT:  1      10    1.00    *                   pfcmpge	(%rax), %mm2
+# CHECK-NEXT:  1      5     1.00                        pfcmpgt	%mm0, %mm2
+# CHECK-NEXT:  1      10    1.00    *                   pfcmpgt	(%rax), %mm2
+# CHECK-NEXT:  1      5     1.00                        pfmax	%mm0, %mm2
+# CHECK-NEXT:  1      10    1.00    *                   pfmax	(%rax), %mm2
+# CHECK-NEXT:  1      5     1.00                        pfmin	%mm0, %mm2
+# CHECK-NEXT:  1      10    1.00    *                   pfmin	(%rax), %mm2
+# CHECK-NEXT:  1      5     1.00                        pfmul	%mm0, %mm2
+# CHECK-NEXT:  1      10    1.00    *                   pfmul	(%rax), %mm2
+# CHECK-NEXT:  1      5     1.00                        pfnacc	%mm0, %mm2
+# CHECK-NEXT:  1      10    1.00    *                   pfnacc	(%rax), %mm2
+# CHECK-NEXT:  1      5     1.00                        pfpnacc	%mm0, %mm2
+# CHECK-NEXT:  1      10    1.00    *                   pfpnacc	(%rax), %mm2
+# CHECK-NEXT:  1      5     1.00                        pfrcp	%mm0, %mm2
+# CHECK-NEXT:  1      10    1.00    *                   pfrcp	(%rax), %mm2
+# CHECK-NEXT:  1      5     1.00                        pfrcpit1	%mm0, %mm2
+# CHECK-NEXT:  1      10    1.00    *                   pfrcpit1	(%rax), %mm2
+# CHECK-NEXT:  1      5     1.00                        pfrcpit2	%mm0, %mm2
+# CHECK-NEXT:  1      10    1.00    *                   pfrcpit2	(%rax), %mm2
+# CHECK-NEXT:  1      5     1.00                        pfrsqit1	%mm0, %mm2
+# CHECK-NEXT:  1      10    1.00    *                   pfrsqit1	(%rax), %mm2
+# CHECK-NEXT:  1      5     1.00                        pfrsqrt	%mm0, %mm2
+# CHECK-NEXT:  1      10    1.00    *                   pfrsqrt	(%rax), %mm2
+# CHECK-NEXT:  1      5     1.00                        pfsub	%mm0, %mm2
+# CHECK-NEXT:  1      10    1.00    *                   pfsub	(%rax), %mm2
+# CHECK-NEXT:  1      5     1.00                        pfsubr	%mm0, %mm2
+# CHECK-NEXT:  1      10    1.00    *                   pfsubr	(%rax), %mm2
+# CHECK-NEXT:  1      4     1.00                        pi2fd	%mm0, %mm2
+# CHECK-NEXT:  1      9     1.00    *                   pi2fd	(%rax), %mm2
+# CHECK-NEXT:  1      4     1.00                        pi2fw	%mm0, %mm2
+# CHECK-NEXT:  1      9     1.00    *                   pi2fw	(%rax), %mm2
+# CHECK-NEXT:  1      4     1.00                        pmulhrw	%mm0, %mm2
+# CHECK-NEXT:  1      9     1.00    *                   pmulhrw	(%rax), %mm2
 # CHECK-NEXT:  1      5     0.50    *      *            prefetch	(%rax)
 # CHECK-NEXT:  1      5     0.50    *      *            prefetchw	(%rax)
-# CHECK-NEXT:  1      1     1.00                        pswapd	%mm0, %mm2
-# CHECK-NEXT:  2      6     1.00    *                   pswapd	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        pswapd	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   pswapd	(%rax), %mm2
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -     12.33  54.33   -     12.33  13.00  13.00
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 13.00  13.00   -      -      -      -      -      -     17.50  17.50  2.00   2.00   2.00   8.00   38.50  10.50   -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -     10.33  10.33   -     10.33   -      -     femms
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pavgusb	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pavgusb	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pf2id	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pf2id	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pf2iw	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pf2iw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pfacc	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pfacc	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pfadd	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pfadd	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pfcmpeq	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pfcmpeq	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pfcmpge	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pfcmpge	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pfcmpgt	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pfcmpgt	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pfmax	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pfmax	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pfmin	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pfmin	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pfmul	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pfmul	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pfnacc	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pfnacc	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pfpnacc	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pfpnacc	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pfrcp	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pfrcp	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pfrcpit1	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pfrcpit1	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pfrcpit2	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pfrcpit2	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pfrsqit1	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pfrsqit1	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pfrsqrt	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pfrsqrt	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pfsub	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pfsub	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pfsubr	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pfsubr	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pi2fd	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pi2fd	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pi2fw	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pi2fw	(%rax), %mm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pmulhrw	%mm0, %mm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   pmulhrw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   prefetch	(%rax)
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   prefetchw	(%rax)
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     pswapd	%mm0, %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   pswapd	(%rax), %mm2
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     femms
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pavgusb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pavgusb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     pf2id	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     pf2id	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     pf2iw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     pf2iw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfacc	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfacc	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfadd	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfadd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfcmpeq	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfcmpeq	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfcmpge	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfcmpge	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfcmpgt	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfcmpgt	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfmax	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfmax	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfmin	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfmin	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfmul	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfmul	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfnacc	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfnacc	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfpnacc	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfpnacc	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfrcp	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfrcp	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfrcpit1	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfrcpit1	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfrcpit2	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfrcpit2	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfrsqit1	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfrsqit1	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfrsqrt	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfrsqrt	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfsub	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfsub	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfsubr	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfsubr	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     pi2fd	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     pi2fd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     pi2fw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     pi2fw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmulhrw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmulhrw	(%rax), %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     prefetch	(%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     prefetchw	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pswapd	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pswapd	(%rax), %mm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-adx.s b/test/tools/llvm-mca/X86/BdVer2/resources-adx.s
index 25f08545e4a..a24213966ed 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-adx.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-adx.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
 
 adcx        %ebx, %ecx
 adcx        (%rbx), %ecx
@@ -20,36 +20,48 @@ adox        (%rbx), %rcx
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  2      2     0.67                        adcxl	%ebx, %ecx
-# CHECK-NEXT:  3      7     0.67    *                   adcxl	(%rbx), %ecx
-# CHECK-NEXT:  2      2     0.67                        adcxq	%rbx, %rcx
-# CHECK-NEXT:  3      7     0.67    *                   adcxq	(%rbx), %rcx
-# CHECK-NEXT:  2      2     0.67                        adoxl	%ebx, %ecx
-# CHECK-NEXT:  3      7     0.67    *                   adoxl	(%rbx), %ecx
-# CHECK-NEXT:  2      2     0.67                        adoxq	%rbx, %rcx
-# CHECK-NEXT:  3      7     0.67    *                   adoxq	(%rbx), %rcx
+# CHECK-NEXT:  1      1     1.00                        adcxl	%ebx, %ecx
+# CHECK-NEXT:  1      5     1.00    *                   adcxl	(%rbx), %ecx
+# CHECK-NEXT:  1      1     1.00                        adcxq	%rbx, %rcx
+# CHECK-NEXT:  1      5     1.00    *                   adcxq	(%rbx), %rcx
+# CHECK-NEXT:  1      1     1.00                        adoxl	%ebx, %ecx
+# CHECK-NEXT:  1      5     1.00    *                   adoxl	(%rbx), %ecx
+# CHECK-NEXT:  1      1     1.00                        adoxq	%rbx, %rcx
+# CHECK-NEXT:  1      5     1.00    *                   adoxq	(%rbx), %rcx
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -     6.67   2.67    -     6.67   2.00   2.00
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 2.00   2.00    -      -      -     8.00   8.00    -      -      -      -      -      -      -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     adcxl	%ebx, %ecx
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   adcxl	(%rbx), %ecx
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     adcxq	%rbx, %rcx
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   adcxq	(%rbx), %rcx
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     adoxl	%ebx, %ecx
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   adoxl	(%rbx), %ecx
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     adoxq	%rbx, %rcx
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   adoxq	(%rbx), %rcx
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcxl	%ebx, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcxl	(%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcxq	%rbx, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcxq	(%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adoxl	%ebx, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adoxl	(%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adoxq	%rbx, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adoxq	(%rbx), %rcx
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-aes.s b/test/tools/llvm-mca/X86/BdVer2/resources-aes.s
index 5720f208ecb..c8d400142df 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-aes.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-aes.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
 
 aesdec          %xmm0, %xmm2
 aesdec          (%rax), %xmm2
@@ -28,44 +28,56 @@ aeskeygenassist $22, (%rax), %xmm2
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  2      7     1.00                        aesdec	%xmm0, %xmm2
-# CHECK-NEXT:  3      13    1.00    *                   aesdec	(%rax), %xmm2
-# CHECK-NEXT:  2      7     1.00                        aesdeclast	%xmm0, %xmm2
-# CHECK-NEXT:  3      13    1.00    *                   aesdeclast	(%rax), %xmm2
-# CHECK-NEXT:  2      7     1.00                        aesenc	%xmm0, %xmm2
-# CHECK-NEXT:  3      13    1.00    *                   aesenc	(%rax), %xmm2
-# CHECK-NEXT:  2      7     1.00                        aesenclast	%xmm0, %xmm2
-# CHECK-NEXT:  3      13    1.00    *                   aesenclast	(%rax), %xmm2
-# CHECK-NEXT:  2      12    2.00                        aesimc	%xmm0, %xmm2
-# CHECK-NEXT:  3      18    2.00    *                   aesimc	(%rax), %xmm2
-# CHECK-NEXT:  1      8     3.67                        aeskeygenassist	$22, %xmm0, %xmm2
-# CHECK-NEXT:  1      8     3.33    *                   aeskeygenassist	$22, (%rax), %xmm2
+# CHECK-NEXT:  2      9     1.00                        aesdec	%xmm0, %xmm2
+# CHECK-NEXT:  2      14    1.00    *                   aesdec	(%rax), %xmm2
+# CHECK-NEXT:  2      9     1.00                        aesdeclast	%xmm0, %xmm2
+# CHECK-NEXT:  2      14    1.00    *                   aesdeclast	(%rax), %xmm2
+# CHECK-NEXT:  2      9     1.00                        aesenc	%xmm0, %xmm2
+# CHECK-NEXT:  2      14    1.00    *                   aesenc	(%rax), %xmm2
+# CHECK-NEXT:  2      9     1.00                        aesenclast	%xmm0, %xmm2
+# CHECK-NEXT:  2      14    1.00    *                   aesenclast	(%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        aesimc	%xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   aesimc	(%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        aeskeygenassist	$22, %xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   aeskeygenassist	$22, (%rax), %xmm2
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -     9.67   9.67    -     21.67  3.00   3.00
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 3.00   3.00    -      -      -      -      -      -      -      -      -      -     12.00   -     12.00   -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -     0.33   0.33    -     1.33    -      -     aesdec	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     0.33   0.33    -     1.33   0.50   0.50   aesdec	(%rax), %xmm2
-# CHECK-NEXT:  -      -     0.33   0.33    -     1.33    -      -     aesdeclast	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     0.33   0.33    -     1.33   0.50   0.50   aesdeclast	(%rax), %xmm2
-# CHECK-NEXT:  -      -     0.33   0.33    -     1.33    -      -     aesenc	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     0.33   0.33    -     1.33   0.50   0.50   aesenc	(%rax), %xmm2
-# CHECK-NEXT:  -      -     0.33   0.33    -     1.33    -      -     aesenclast	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     0.33   0.33    -     1.33   0.50   0.50   aesenclast	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     2.00    -      -     aesimc	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     2.00   0.50   0.50   aesimc	(%rax), %xmm2
-# CHECK-NEXT:  -      -     3.67   3.67    -     3.67    -      -     aeskeygenassist	$22, %xmm0, %xmm2
-# CHECK-NEXT:  -      -     3.33   3.33    -     3.33   0.50   0.50   aeskeygenassist	$22, (%rax), %xmm2
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     aesdec	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     aesdec	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     aesdeclast	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     aesdeclast	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     aesenc	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     aesenc	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     aesenclast	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     aesenclast	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     aesimc	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     aesimc	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     aeskeygenassist	$22, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     aeskeygenassist	$22, (%rax), %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-avx1.s b/test/tools/llvm-mca/X86/BdVer2/resources-avx1.s
index f0bf9e27294..837127d4e58 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-avx1.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-avx1.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
 
 vaddpd            %xmm0, %xmm1, %xmm2
 vaddpd            (%rax), %xmm1, %xmm2
@@ -1030,1402 +1030,1414 @@ vzeroupper
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      3     1.00                        vaddpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   vaddpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vaddpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    1.00    *                   vaddpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      3     1.00                        vaddps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   vaddps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    1.00    *                   vaddps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      3     1.00                        vaddsd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   vaddsd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vaddss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   vaddss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vaddsubpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   vaddsubpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vaddsubpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    1.00    *                   vaddsubpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      3     1.00                        vaddsubps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   vaddsubps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vaddsubps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    1.00    *                   vaddsubps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  2      7     1.00                        vaesdec	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  3      13    1.00    *                   vaesdec	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  2      7     1.00                        vaesdeclast	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  3      13    1.00    *                   vaesdeclast	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  2      7     1.00                        vaesenc	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  3      13    1.00    *                   vaesenc	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  2      7     1.00                        vaesenclast	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  3      13    1.00    *                   vaesenclast	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  2      12    2.00                        vaesimc	%xmm0, %xmm2
-# CHECK-NEXT:  3      18    2.00    *                   vaesimc	(%rax), %xmm2
-# CHECK-NEXT:  1      8     3.67                        vaeskeygenassist	$22, %xmm0, %xmm2
-# CHECK-NEXT:  1      8     3.33    *                   vaeskeygenassist	$22, (%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        vandnpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     1.00    *                   vandnpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     1.00                        vandnpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      8     1.00    *                   vandnpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      1     1.00                        vandnps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     1.00    *                   vandnps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     1.00                        vandnps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      8     1.00    *                   vandnps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      1     1.00                        vandpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     1.00    *                   vandpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     1.00                        vandpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      8     1.00    *                   vandpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      1     1.00                        vandps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     1.00    *                   vandps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     1.00                        vandps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      8     1.00    *                   vandps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      1     0.50                        vblendpd	$11, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vblendpd	$11, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vblendpd	$11, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      8     0.50    *                   vblendpd	$11, (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      1     0.50                        vblendps	$11, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vblendps	$11, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vblendps	$11, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      8     0.50    *                   vblendps	$11, (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  2      2     1.00                        vblendvpd	%xmm3, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  3      8     1.00    *                   vblendvpd	%xmm3, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  2      2     1.00                        vblendvpd	%ymm3, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  3      9     1.00    *                   vblendvpd	%ymm3, (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  2      2     1.00                        vblendvps	%xmm3, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  3      8     1.00    *                   vblendvps	%xmm3, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  2      2     1.00                        vblendvps	%ymm3, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  3      9     1.00    *                   vblendvps	%ymm3, (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  2      7     1.00    *                   vbroadcastf128	(%rax), %ymm2
-# CHECK-NEXT:  1      7     0.50    *                   vbroadcastsd	(%rax), %ymm2
-# CHECK-NEXT:  1      6     0.50    *                   vbroadcastss	(%rax), %xmm2
-# CHECK-NEXT:  1      7     0.50    *                   vbroadcastss	(%rax), %ymm2
-# CHECK-NEXT:  1      3     1.00                        vcmppd	$0, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   vcmppd	$0, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vcmppd	$0, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    1.00    *                   vcmppd	$0, (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      3     1.00                        vcmpps	$0, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   vcmpps	$0, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vcmpps	$0, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    1.00    *                   vcmpps	$0, (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      3     1.00                        vcmpsd	$0, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   vcmpsd	$0, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vcmpss	$0, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   vcmpss	$0, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  2      2     1.00                        vcomisd	%xmm0, %xmm1
-# CHECK-NEXT:  3      8     1.00    *                   vcomisd	(%rax), %xmm1
-# CHECK-NEXT:  2      2     1.00                        vcomiss	%xmm0, %xmm1
-# CHECK-NEXT:  3      8     1.00    *                   vcomiss	(%rax), %xmm1
-# CHECK-NEXT:  2      4     1.00                        vcvtdq2pd	%xmm0, %xmm2
-# CHECK-NEXT:  3      10    1.00    *                   vcvtdq2pd	(%rax), %xmm2
-# CHECK-NEXT:  2      4     1.00                        vcvtdq2pd	%xmm0, %ymm2
-# CHECK-NEXT:  3      10    1.00    *                   vcvtdq2pd	(%rax), %ymm2
-# CHECK-NEXT:  1      3     1.00                        vcvtdq2ps	%xmm0, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   vcvtdq2ps	(%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        vcvtdq2ps	%ymm0, %ymm2
-# CHECK-NEXT:  2      10    1.00    *                   vcvtdq2ps	(%rax), %ymm2
-# CHECK-NEXT:  2      4     1.00                        vcvtpd2dq	%xmm0, %xmm2
-# CHECK-NEXT:  3      10    1.00    *                   vcvtpd2dqx	(%rax), %xmm2
-# CHECK-NEXT:  2      4     1.00                        vcvtpd2dq	%ymm0, %xmm2
-# CHECK-NEXT:  3      11    1.00    *                   vcvtpd2dqy	(%rax), %xmm2
-# CHECK-NEXT:  2      4     1.00                        vcvtpd2ps	%xmm0, %xmm2
-# CHECK-NEXT:  3      10    1.00    *                   vcvtpd2psx	(%rax), %xmm2
-# CHECK-NEXT:  2      4     1.00                        vcvtpd2ps	%ymm0, %xmm2
-# CHECK-NEXT:  3      11    1.00    *                   vcvtpd2psy	(%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        vcvtps2dq	%xmm0, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   vcvtps2dq	(%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        vcvtps2dq	%ymm0, %ymm2
-# CHECK-NEXT:  2      10    1.00    *                   vcvtps2dq	(%rax), %ymm2
-# CHECK-NEXT:  2      2     1.00                        vcvtps2pd	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     1.00    *                   vcvtps2pd	(%rax), %xmm2
-# CHECK-NEXT:  2      2     1.00                        vcvtps2pd	%xmm0, %ymm2
-# CHECK-NEXT:  2      7     1.00    *                   vcvtps2pd	(%rax), %ymm2
-# CHECK-NEXT:  2      5     1.00                        vcvtsd2si	%xmm0, %ecx
-# CHECK-NEXT:  2      5     1.00                        vcvtsd2si	%xmm0, %rcx
-# CHECK-NEXT:  3      10    1.00    *                   vcvtsd2si	(%rax), %ecx
-# CHECK-NEXT:  3      10    1.00    *                   vcvtsd2si	(%rax), %rcx
-# CHECK-NEXT:  2      4     1.00                        vcvtsd2ss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  3      10    1.00    *                   vcvtsd2ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vaddpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   vaddpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     2.00                        vaddpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    2.00    *                   vaddpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   vaddps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    2.00    *                   vaddps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     1.00                        vaddsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   vaddsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vaddss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   vaddss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vaddsubpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   vaddsubpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     2.00                        vaddsubpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    2.00    *                   vaddsubpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     1.00                        vaddsubps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   vaddsubps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     2.00                        vaddsubps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    2.00    *                   vaddsubps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  2      9     1.00                        vaesdec	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      14    1.00    *                   vaesdec	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      9     1.00                        vaesdeclast	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      14    1.00    *                   vaesdeclast	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      9     1.00                        vaesenc	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      14    1.00    *                   vaesenc	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      9     1.00                        vaesenclast	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      14    1.00    *                   vaesenclast	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vaesimc	%xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   vaesimc	(%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        vaeskeygenassist	$22, %xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   vaeskeygenassist	$22, (%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        vandnpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vandnpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vandnpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vandnpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     0.50                        vandnps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vandnps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vandnps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vandnps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     0.50                        vandpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vandpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vandpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vandpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     0.50                        vandps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vandps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vandps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vandps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     0.50                        vblendpd	$11, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vblendpd	$11, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vblendpd	$11, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vblendpd	$11, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     0.50                        vblendps	$11, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vblendps	$11, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vblendps	$11, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vblendps	$11, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     2.00                        vblendvpd	%xmm3, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     2.00    *                   vblendvpd	%xmm3, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     3.00                        vblendvpd	%ymm3, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     3.00    *                   vblendvpd	%ymm3, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     2.00                        vblendvps	%xmm3, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     2.00    *                   vblendvps	%xmm3, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     3.00                        vblendvps	%ymm3, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     3.00    *                   vblendvps	%ymm3, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  2      7     0.50    *                   vbroadcastf128	(%rax), %ymm2
+# CHECK-NEXT:  2      6     2.00    *                   vbroadcastsd	(%rax), %ymm2
+# CHECK-NEXT:  1      7     0.50    *                   vbroadcastss	(%rax), %xmm2
+# CHECK-NEXT:  2      6     2.00    *                   vbroadcastss	(%rax), %ymm2
+# CHECK-NEXT:  1      2     1.00                        vcmppd	$0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   vcmppd	$0, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     2.00                        vcmppd	$0, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     2.00    *                   vcmppd	$0, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     1.00                        vcmpps	$0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   vcmpps	$0, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     2.00                        vcmpps	$0, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     2.00    *                   vcmpps	$0, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     1.00                        vcmpsd	$0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   vcmpsd	$0, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     1.00                        vcmpss	$0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   vcmpss	$0, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      1     1.00                        vcomisd	%xmm0, %xmm1
+# CHECK-NEXT:  2      6     1.00    *                   vcomisd	(%rax), %xmm1
+# CHECK-NEXT:  2      1     1.00                        vcomiss	%xmm0, %xmm1
+# CHECK-NEXT:  2      6     1.00    *                   vcomiss	(%rax), %xmm1
+# CHECK-NEXT:  2      8     1.00                        vcvtdq2pd	%xmm0, %xmm2
+# CHECK-NEXT:  2      13    1.00    *                   vcvtdq2pd	(%rax), %xmm2
+# CHECK-NEXT:  4      8     2.00                        vcvtdq2pd	%xmm0, %ymm2
+# CHECK-NEXT:  5      13    2.00    *                   vcvtdq2pd	(%rax), %ymm2
+# CHECK-NEXT:  1      4     1.00                        vcvtdq2ps	%xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   vcvtdq2ps	(%rax), %xmm2
+# CHECK-NEXT:  2      4     2.00                        vcvtdq2ps	%ymm0, %ymm2
+# CHECK-NEXT:  2      9     2.00    *                   vcvtdq2ps	(%rax), %ymm2
+# CHECK-NEXT:  2      8     1.00                        vcvtpd2dq	%xmm0, %xmm2
+# CHECK-NEXT:  2      13    1.00    *                   vcvtpd2dqx	(%rax), %xmm2
+# CHECK-NEXT:  4      8     2.00                        vcvtpd2dq	%ymm0, %xmm2
+# CHECK-NEXT:  4      13    2.00    *                   vcvtpd2dqy	(%rax), %xmm2
+# CHECK-NEXT:  2      8     1.00                        vcvtpd2ps	%xmm0, %xmm2
+# CHECK-NEXT:  2      13    1.00    *                   vcvtpd2psx	(%rax), %xmm2
+# CHECK-NEXT:  4      8     2.00                        vcvtpd2ps	%ymm0, %xmm2
+# CHECK-NEXT:  4      13    2.00    *                   vcvtpd2psy	(%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        vcvtps2dq	%xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   vcvtps2dq	(%rax), %xmm2
+# CHECK-NEXT:  2      4     2.00                        vcvtps2dq	%ymm0, %ymm2
+# CHECK-NEXT:  2      9     2.00    *                   vcvtps2dq	(%rax), %ymm2
+# CHECK-NEXT:  2      8     1.00                        vcvtps2pd	%xmm0, %xmm2
+# CHECK-NEXT:  2      13    1.00    *                   vcvtps2pd	(%rax), %xmm2
+# CHECK-NEXT:  4      8     2.00                        vcvtps2pd	%xmm0, %ymm2
+# CHECK-NEXT:  5      13    2.00    *                   vcvtps2pd	(%rax), %ymm2
+# CHECK-NEXT:  2      13    1.00                        vcvtsd2si	%xmm0, %ecx
+# CHECK-NEXT:  2      13    1.00                        vcvtsd2si	%xmm0, %rcx
+# CHECK-NEXT:  2      18    1.00    *                   vcvtsd2si	(%rax), %ecx
+# CHECK-NEXT:  2      18    1.00    *                   vcvtsd2si	(%rax), %rcx
+# CHECK-NEXT:  1      4     1.00                        vcvtsd2ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   vcvtsd2ss	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  2      4     1.00                        vcvtsi2sdl	%ecx, %xmm0, %xmm2
 # CHECK-NEXT:  2      4     1.00                        vcvtsi2sdq	%rcx, %xmm0, %xmm2
 # CHECK-NEXT:  2      9     1.00    *                   vcvtsi2sdl	(%rax), %xmm0, %xmm2
 # CHECK-NEXT:  2      9     1.00    *                   vcvtsi2sdq	(%rax), %xmm0, %xmm2
-# CHECK-NEXT:  3      5     2.00                        vcvtsi2ssl	%ecx, %xmm0, %xmm2
-# CHECK-NEXT:  3      5     2.00                        vcvtsi2ssq	%rcx, %xmm0, %xmm2
-# CHECK-NEXT:  3      10    1.00    *                   vcvtsi2ssl	(%rax), %xmm0, %xmm2
-# CHECK-NEXT:  3      10    1.00    *                   vcvtsi2ssq	(%rax), %xmm0, %xmm2
-# CHECK-NEXT:  1      1     1.00                        vcvtss2sd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     1.00    *                   vcvtss2sd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  2      5     1.00                        vcvtss2si	%xmm0, %ecx
-# CHECK-NEXT:  2      5     1.00                        vcvtss2si	%xmm0, %rcx
-# CHECK-NEXT:  3      10    1.00    *                   vcvtss2si	(%rax), %ecx
-# CHECK-NEXT:  3      10    1.00    *                   vcvtss2si	(%rax), %rcx
-# CHECK-NEXT:  2      4     1.00                        vcvttpd2dq	%xmm0, %xmm2
-# CHECK-NEXT:  3      10    1.00    *                   vcvttpd2dqx	(%rax), %xmm2
-# CHECK-NEXT:  2      4     1.00                        vcvttpd2dq	%ymm0, %xmm2
-# CHECK-NEXT:  3      11    1.00    *                   vcvttpd2dqy	(%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        vcvttps2dq	%xmm0, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   vcvttps2dq	(%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        vcvttps2dq	%ymm0, %ymm2
-# CHECK-NEXT:  2      10    1.00    *                   vcvttps2dq	(%rax), %ymm2
-# CHECK-NEXT:  2      5     1.00                        vcvttsd2si	%xmm0, %ecx
-# CHECK-NEXT:  2      5     1.00                        vcvttsd2si	%xmm0, %rcx
-# CHECK-NEXT:  3      10    1.00    *                   vcvttsd2si	(%rax), %ecx
-# CHECK-NEXT:  3      10    1.00    *                   vcvttsd2si	(%rax), %rcx
-# CHECK-NEXT:  2      5     1.00                        vcvttss2si	%xmm0, %ecx
-# CHECK-NEXT:  2      5     1.00                        vcvttss2si	%xmm0, %rcx
-# CHECK-NEXT:  3      10    1.00    *                   vcvttss2si	(%rax), %ecx
-# CHECK-NEXT:  3      10    1.00    *                   vcvttss2si	(%rax), %rcx
-# CHECK-NEXT:  1      22    22.00                       vdivpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      28    22.00   *                   vdivpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  3      45    44.00                       vdivpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  4      52    44.00   *                   vdivpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      14    14.00                       vdivps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      20    14.00   *                   vdivps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  3      29    28.00                       vdivps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  4      36    28.00   *                   vdivps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      22    22.00                       vdivsd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      28    22.00   *                   vdivsd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      14    14.00                       vdivss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      20    14.00   *                   vdivss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  3      9     1.00                        vdppd	$22, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  4      15    1.00    *                   vdppd	$22, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  4      12    2.00                        vdpps	$22, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  5      18    2.00    *                   vdpps	$22, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  4      12    2.00                        vdpps	$22, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  5      19    2.00    *                   vdpps	$22, (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      1     1.00                        vextractf128	$1, %ymm0, %xmm2
-# CHECK-NEXT:  1      1     1.00           *            vextractf128	$1, %ymm0, (%rax)
-# CHECK-NEXT:  2      3     1.00                        vextractps	$1, %xmm0, %ecx
-# CHECK-NEXT:  3      5     1.00           *            vextractps	$1, %xmm0, (%rax)
-# CHECK-NEXT:  3      5     2.00                        vhaddpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  4      11    2.00    *                   vhaddpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  3      5     2.00                        vhaddpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  4      12    2.00    *                   vhaddpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  3      5     2.00                        vhaddps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  4      11    2.00    *                   vhaddps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  3      5     2.00                        vhaddps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  4      12    2.00    *                   vhaddps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  3      5     2.00                        vhsubpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  4      11    2.00    *                   vhsubpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  3      5     2.00                        vhsubpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  4      12    2.00    *                   vhsubpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  3      5     2.00                        vhsubps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  4      11    2.00    *                   vhsubps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  3      5     2.00                        vhsubps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  4      12    2.00    *                   vhsubps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      1     1.00                        vinsertf128	$1, %xmm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      4     1.00                        vcvtsi2ssl	%ecx, %xmm0, %xmm2
+# CHECK-NEXT:  2      4     1.00                        vcvtsi2ssq	%rcx, %xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   vcvtsi2ssl	(%rax), %xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   vcvtsi2ssq	(%rax), %xmm0, %xmm2
+# CHECK-NEXT:  1      4     1.00                        vcvtss2sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   vcvtss2sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      13    1.00                        vcvtss2si	%xmm0, %ecx
+# CHECK-NEXT:  2      13    1.00                        vcvtss2si	%xmm0, %rcx
+# CHECK-NEXT:  2      18    1.00    *                   vcvtss2si	(%rax), %ecx
+# CHECK-NEXT:  2      18    1.00    *                   vcvtss2si	(%rax), %rcx
+# CHECK-NEXT:  2      8     1.00                        vcvttpd2dq	%xmm0, %xmm2
+# CHECK-NEXT:  2      13    1.00    *                   vcvttpd2dqx	(%rax), %xmm2
+# CHECK-NEXT:  4      8     2.00                        vcvttpd2dq	%ymm0, %xmm2
+# CHECK-NEXT:  4      13    2.00    *                   vcvttpd2dqy	(%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        vcvttps2dq	%xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   vcvttps2dq	(%rax), %xmm2
+# CHECK-NEXT:  2      4     2.00                        vcvttps2dq	%ymm0, %ymm2
+# CHECK-NEXT:  2      9     2.00    *                   vcvttps2dq	(%rax), %ymm2
+# CHECK-NEXT:  2      13    1.00                        vcvttsd2si	%xmm0, %ecx
+# CHECK-NEXT:  2      13    1.00                        vcvttsd2si	%xmm0, %rcx
+# CHECK-NEXT:  2      18    1.00    *                   vcvttsd2si	(%rax), %ecx
+# CHECK-NEXT:  2      18    1.00    *                   vcvttsd2si	(%rax), %rcx
+# CHECK-NEXT:  2      13    1.00                        vcvttss2si	%xmm0, %ecx
+# CHECK-NEXT:  2      13    1.00                        vcvttss2si	%xmm0, %rcx
+# CHECK-NEXT:  2      18    1.00    *                   vcvttss2si	(%rax), %ecx
+# CHECK-NEXT:  2      18    1.00    *                   vcvttss2si	(%rax), %rcx
+# CHECK-NEXT:  1      9     9.50                        vdivpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      14    9.50    *                   vdivpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      9     19.00                       vdivpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      14    19.00   *                   vdivpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      9     9.50                        vdivps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      14    9.50    *                   vdivps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      9     19.00                       vdivps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      14    19.00   *                   vdivps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      9     9.50                        vdivsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      14    9.50    *                   vdivsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      9     9.50                        vdivss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      14    9.50    *                   vdivss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  15     15    1.50                        vdppd	$22, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  17     20    1.50    *                   vdppd	$22, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  17     25    1.50                        vdpps	$22, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  18     30    1.50    *                   vdpps	$22, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  25     27    3.00                        vdpps	$22, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  29     32    3.00    *                   vdpps	$22, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     0.50                        vextractf128	$1, %ymm0, %xmm2
+# CHECK-NEXT:  2      7     0.50           *            vextractf128	$1, %ymm0, (%rax)
+# CHECK-NEXT:  2      13    1.00                        vextractps	$1, %xmm0, %ecx
+# CHECK-NEXT:  2      13    1.00           *            vextractps	$1, %xmm0, (%rax)
+# CHECK-NEXT:  3      11    1.00                        vhaddpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  4      16    1.00    *                   vhaddpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  8      11    2.00                        vhaddpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  10     16    2.00    *                   vhaddpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  3      11    1.00                        vhaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  4      16    1.00    *                   vhaddps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  8      11    2.00                        vhaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  10     16    2.00    *                   vhaddps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  3      11    1.00                        vhsubpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  4      16    1.00    *                   vhsubpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  8      11    2.00                        vhsubpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  10     16    2.00    *                   vhsubpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  3      11    1.00                        vhsubps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  4      16    1.00    *                   vhsubps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  8      11    2.00                        vhsubps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  10     16    2.00    *                   vhsubps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  2      2     0.50                        vinsertf128	$1, %xmm0, %ymm1, %ymm2
 # CHECK-NEXT:  2      7     0.50    *                   vinsertf128	$1, (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      1     1.00                        vinsertps	$1, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     1.00    *                   vinsertps	$1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      6     0.50    *                   vlddqu	(%rax), %xmm2
-# CHECK-NEXT:  1      7     0.50    *                   vlddqu	(%rax), %ymm2
-# CHECK-NEXT:  4      5     1.00    *      *      U     vldmxcsr	(%rax)
+# CHECK-NEXT:  1      2     0.50                        vinsertps	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vinsertps	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50    *                   vlddqu	(%rax), %xmm2
+# CHECK-NEXT:  2      5     0.50    *                   vlddqu	(%rax), %ymm2
+# CHECK-NEXT:  1      5     0.50    *      *      U     vldmxcsr	(%rax)
 # CHECK-NEXT:  1      1     1.00    *      *      U     vmaskmovdqu	%xmm0, %xmm1
-# CHECK-NEXT:  3      8     1.00    *                   vmaskmovpd	(%rax), %xmm0, %xmm2
-# CHECK-NEXT:  3      9     1.00    *                   vmaskmovpd	(%rax), %ymm0, %ymm2
-# CHECK-NEXT:  3      5     1.00    *      *            vmaskmovpd	%xmm0, %xmm1, (%rax)
-# CHECK-NEXT:  3      5     1.00    *      *            vmaskmovpd	%ymm0, %ymm1, (%rax)
-# CHECK-NEXT:  3      8     1.00    *                   vmaskmovps	(%rax), %xmm0, %xmm2
-# CHECK-NEXT:  3      9     1.00    *                   vmaskmovps	(%rax), %ymm0, %ymm2
-# CHECK-NEXT:  3      5     1.00    *      *            vmaskmovps	%xmm0, %xmm1, (%rax)
-# CHECK-NEXT:  3      5     1.00    *      *            vmaskmovps	%ymm0, %ymm1, (%rax)
-# CHECK-NEXT:  1      3     1.00                        vmaxpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   vmaxpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vmaxpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    1.00    *                   vmaxpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      3     1.00                        vmaxps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   vmaxps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vmaxps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    1.00    *                   vmaxps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      3     1.00                        vmaxsd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   vmaxsd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vmaxss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   vmaxss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vminpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   vminpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vminpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    1.00    *                   vminpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      3     1.00                        vminps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   vminps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vminps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    1.00    *                   vminps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      3     1.00                        vminsd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   vminsd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vminss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   vminss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     1.00                        vmovapd	%xmm0, %xmm2
+# CHECK-NEXT:  1      6     1.00    *                   vmaskmovpd	(%rax), %xmm0, %xmm2
+# CHECK-NEXT:  2      6     2.00    *                   vmaskmovpd	(%rax), %ymm0, %ymm2
+# CHECK-NEXT:  18     6     2.00    *      *            vmaskmovpd	%xmm0, %xmm1, (%rax)
+# CHECK-NEXT:  34     6     2.00    *      *            vmaskmovpd	%ymm0, %ymm1, (%rax)
+# CHECK-NEXT:  1      6     1.00    *                   vmaskmovps	(%rax), %xmm0, %xmm2
+# CHECK-NEXT:  2      6     2.00    *                   vmaskmovps	(%rax), %ymm0, %ymm2
+# CHECK-NEXT:  18     6     2.00    *      *            vmaskmovps	%xmm0, %xmm1, (%rax)
+# CHECK-NEXT:  34     6     2.00    *      *            vmaskmovps	%ymm0, %ymm1, (%rax)
+# CHECK-NEXT:  1      2     1.00                        vmaxpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   vmaxpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     2.00                        vmaxpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     2.00    *                   vmaxpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     1.00                        vmaxps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   vmaxps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     2.00                        vmaxps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     2.00    *                   vmaxps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     1.00                        vmaxsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   vmaxsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     1.00                        vmaxss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   vmaxss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     1.00                        vminpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   vminpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     2.00                        vminpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     2.00    *                   vminpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     1.00                        vminps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   vminps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     2.00                        vminps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     2.00    *                   vminps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     1.00                        vminsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   vminsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     1.00                        vminss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   vminss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vmovapd	%xmm0, %xmm2
 # CHECK-NEXT:  1      1     1.00           *            vmovapd	%xmm0, (%rax)
-# CHECK-NEXT:  1      6     0.50    *                   vmovapd	(%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        vmovapd	%ymm0, %ymm2
-# CHECK-NEXT:  1      1     1.00           *            vmovapd	%ymm0, (%rax)
-# CHECK-NEXT:  1      7     0.50    *                   vmovapd	(%rax), %ymm2
-# CHECK-NEXT:  1      1     1.00                        vmovaps	%xmm0, %xmm2
+# CHECK-NEXT:  1      5     0.50    *                   vmovapd	(%rax), %xmm2
+# CHECK-NEXT:  2      2     1.00                        vmovapd	%ymm0, %ymm2
+# CHECK-NEXT:  4      1     1.00           *            vmovapd	%ymm0, (%rax)
+# CHECK-NEXT:  2      5     0.50    *                   vmovapd	(%rax), %ymm2
+# CHECK-NEXT:  1      1     0.50                        vmovaps	%xmm0, %xmm2
 # CHECK-NEXT:  1      1     1.00           *            vmovaps	%xmm0, (%rax)
-# CHECK-NEXT:  1      6     0.50    *                   vmovaps	(%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        vmovaps	%ymm0, %ymm2
-# CHECK-NEXT:  1      1     1.00           *            vmovaps	%ymm0, (%rax)
-# CHECK-NEXT:  1      7     0.50    *                   vmovaps	(%rax), %ymm2
-# CHECK-NEXT:  1      1     1.00                        vmovd	%eax, %xmm2
-# CHECK-NEXT:  1      6     0.50    *                   vmovd	(%rax), %xmm2
-# CHECK-NEXT:  1      2     1.00                        vmovd	%xmm0, %ecx
-# CHECK-NEXT:  1      1     1.00           *            vmovd	%xmm0, (%rax)
-# CHECK-NEXT:  1      1     1.00                        vmovddup	%xmm0, %xmm2
-# CHECK-NEXT:  1      6     0.50    *                   vmovddup	(%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        vmovddup	%ymm0, %ymm2
-# CHECK-NEXT:  1      7     0.50    *                   vmovddup	(%rax), %ymm2
-# CHECK-NEXT:  1      1     0.33                        vmovdqa	%xmm0, %xmm2
+# CHECK-NEXT:  1      5     0.50    *                   vmovaps	(%rax), %xmm2
+# CHECK-NEXT:  2      2     1.00                        vmovaps	%ymm0, %ymm2
+# CHECK-NEXT:  4      1     1.00           *            vmovaps	%ymm0, (%rax)
+# CHECK-NEXT:  2      5     0.50    *                   vmovaps	(%rax), %ymm2
+# CHECK-NEXT:  2      10    0.50                        vmovd	%eax, %xmm2
+# CHECK-NEXT:  1      5     0.50    *                   vmovd	(%rax), %xmm2
+# CHECK-NEXT:  1      10    1.00                        vmovd	%xmm0, %ecx
+# CHECK-NEXT:  1      2     1.00           *            vmovd	%xmm0, (%rax)
+# CHECK-NEXT:  1      2     0.50                        vmovddup	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vmovddup	(%rax), %xmm2
+# CHECK-NEXT:  2      2     1.00                        vmovddup	%ymm0, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vmovddup	(%rax), %ymm2
+# CHECK-NEXT:  1      2     0.50                        vmovdqa	%xmm0, %xmm2
 # CHECK-NEXT:  1      1     1.00           *            vmovdqa	%xmm0, (%rax)
-# CHECK-NEXT:  1      6     0.50    *                   vmovdqa	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        vmovdqa	%ymm0, %ymm2
-# CHECK-NEXT:  1      1     1.00           *            vmovdqa	%ymm0, (%rax)
-# CHECK-NEXT:  1      7     0.50    *                   vmovdqa	(%rax), %ymm2
-# CHECK-NEXT:  1      1     0.33                        vmovdqu	%xmm0, %xmm2
+# CHECK-NEXT:  1      5     0.50    *                   vmovdqa	(%rax), %xmm2
+# CHECK-NEXT:  2      2     1.00                        vmovdqa	%ymm0, %ymm2
+# CHECK-NEXT:  4      1     1.00           *            vmovdqa	%ymm0, (%rax)
+# CHECK-NEXT:  2      5     0.50    *                   vmovdqa	(%rax), %ymm2
+# CHECK-NEXT:  1      2     0.50                        vmovdqu	%xmm0, %xmm2
 # CHECK-NEXT:  1      1     1.00           *            vmovdqu	%xmm0, (%rax)
-# CHECK-NEXT:  1      6     0.50    *                   vmovdqu	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        vmovdqu	%ymm0, %ymm2
-# CHECK-NEXT:  1      1     1.00           *            vmovdqu	%ymm0, (%rax)
-# CHECK-NEXT:  1      7     0.50    *                   vmovdqu	(%rax), %ymm2
-# CHECK-NEXT:  1      1     1.00                        vmovhlps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      1     1.00                        vmovlhps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      1     1.00           *            vmovhpd	%xmm0, (%rax)
-# CHECK-NEXT:  2      7     1.00    *                   vmovhpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     1.00           *            vmovhps	%xmm0, (%rax)
-# CHECK-NEXT:  2      7     1.00    *                   vmovhps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     1.00           *            vmovlpd	%xmm0, (%rax)
-# CHECK-NEXT:  2      7     1.00    *                   vmovlpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     1.00           *            vmovlps	%xmm0, (%rax)
-# CHECK-NEXT:  2      7     1.00    *                   vmovlps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      2     1.00                        vmovmskpd	%xmm0, %ecx
-# CHECK-NEXT:  1      2     1.00                        vmovmskpd	%ymm0, %ecx
-# CHECK-NEXT:  1      2     1.00                        vmovmskps	%xmm0, %ecx
-# CHECK-NEXT:  1      2     1.00                        vmovmskps	%ymm0, %ecx
-# CHECK-NEXT:  1      1     1.00           *            vmovntdq	%xmm0, (%rax)
-# CHECK-NEXT:  1      1     1.00           *            vmovntdq	%ymm0, (%rax)
-# CHECK-NEXT:  1      6     0.50    *                   vmovntdqa	(%rax), %xmm2
-# CHECK-NEXT:  1      7     0.50    *                   vmovntdqa	(%rax), %ymm2
-# CHECK-NEXT:  1      1     1.00           *            vmovntpd	%xmm0, (%rax)
-# CHECK-NEXT:  1      1     1.00           *            vmovntpd	%ymm0, (%rax)
-# CHECK-NEXT:  1      1     1.00           *            vmovntps	%xmm0, (%rax)
-# CHECK-NEXT:  1      1     1.00           *            vmovntps	%ymm0, (%rax)
-# CHECK-NEXT:  1      1     0.33                        vmovq	%xmm0, %xmm2
-# CHECK-NEXT:  1      1     1.00                        vmovq	%rax, %xmm2
-# CHECK-NEXT:  1      6     0.50    *                   vmovq	(%rax), %xmm2
-# CHECK-NEXT:  1      2     1.00                        vmovq	%xmm0, %rcx
-# CHECK-NEXT:  1      1     1.00           *            vmovq	%xmm0, (%rax)
-# CHECK-NEXT:  1      1     1.00                        vmovsd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      1     1.00           *            vmovsd	%xmm0, (%rax)
-# CHECK-NEXT:  1      6     0.50    *                   vmovsd	(%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        vmovshdup	%xmm0, %xmm2
-# CHECK-NEXT:  1      6     0.50    *                   vmovshdup	(%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        vmovshdup	%ymm0, %ymm2
-# CHECK-NEXT:  1      7     0.50    *                   vmovshdup	(%rax), %ymm2
-# CHECK-NEXT:  1      1     1.00                        vmovsldup	%xmm0, %xmm2
-# CHECK-NEXT:  1      6     0.50    *                   vmovsldup	(%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        vmovsldup	%ymm0, %ymm2
-# CHECK-NEXT:  1      7     0.50    *                   vmovsldup	(%rax), %ymm2
-# CHECK-NEXT:  1      1     1.00                        vmovss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      1     1.00           *            vmovss	%xmm0, (%rax)
-# CHECK-NEXT:  1      6     0.50    *                   vmovss	(%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        vmovupd	%xmm0, %xmm2
+# CHECK-NEXT:  1      5     0.50    *                   vmovdqu	(%rax), %xmm2
+# CHECK-NEXT:  2      2     1.00                        vmovdqu	%ymm0, %ymm2
+# CHECK-NEXT:  8      1     1.00           *            vmovdqu	%ymm0, (%rax)
+# CHECK-NEXT:  2      5     0.50    *                   vmovdqu	(%rax), %ymm2
+# CHECK-NEXT:  1      2     0.50                        vmovhlps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vmovlhps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00           *            vmovhpd	%xmm0, (%rax)
+# CHECK-NEXT:  1      7     0.50    *                   vmovhpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00           *            vmovhps	%xmm0, (%rax)
+# CHECK-NEXT:  1      7     0.50    *                   vmovhps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     1.00           *            vmovlpd	%xmm0, (%rax)
+# CHECK-NEXT:  1      7     0.50    *                   vmovlpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     1.00           *            vmovlps	%xmm0, (%rax)
+# CHECK-NEXT:  1      7     0.50    *                   vmovlps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      10    1.00                        vmovmskpd	%xmm0, %ecx
+# CHECK-NEXT:  2      10    1.00                        vmovmskpd	%ymm0, %ecx
+# CHECK-NEXT:  2      10    1.00                        vmovmskps	%xmm0, %ecx
+# CHECK-NEXT:  2      10    1.00                        vmovmskps	%ymm0, %ecx
+# CHECK-NEXT:  1      2     1.00           *            vmovntdq	%xmm0, (%rax)
+# CHECK-NEXT:  4      2     2.00           *            vmovntdq	%ymm0, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   vmovntdqa	(%rax), %xmm2
+# CHECK-NEXT:  1      5     0.50    *                   vmovntdqa	(%rax), %ymm2
+# CHECK-NEXT:  1      3     1.00           *            vmovntpd	%xmm0, (%rax)
+# CHECK-NEXT:  4      3     2.00           *            vmovntpd	%ymm0, (%rax)
+# CHECK-NEXT:  1      3     1.00           *            vmovntps	%xmm0, (%rax)
+# CHECK-NEXT:  4      3     2.00           *            vmovntps	%ymm0, (%rax)
+# CHECK-NEXT:  1      2     0.50                        vmovq	%xmm0, %xmm2
+# CHECK-NEXT:  2      10    0.50                        vmovq	%rax, %xmm2
+# CHECK-NEXT:  1      5     0.50    *                   vmovq	(%rax), %xmm2
+# CHECK-NEXT:  1      10    1.00                        vmovq	%xmm0, %rcx
+# CHECK-NEXT:  1      2     1.00           *            vmovq	%xmm0, (%rax)
+# CHECK-NEXT:  1      2     0.50                        vmovsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      2     1.00           *            vmovsd	%xmm0, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   vmovsd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        vmovshdup	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vmovshdup	(%rax), %xmm2
+# CHECK-NEXT:  2      2     1.00                        vmovshdup	%ymm0, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vmovshdup	(%rax), %ymm2
+# CHECK-NEXT:  1      2     0.50                        vmovsldup	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vmovsldup	(%rax), %xmm2
+# CHECK-NEXT:  2      2     1.00                        vmovsldup	%ymm0, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vmovsldup	(%rax), %ymm2
+# CHECK-NEXT:  1      2     0.50                        vmovss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      2     1.00           *            vmovss	%xmm0, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   vmovss	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        vmovupd	%xmm0, %xmm2
 # CHECK-NEXT:  1      1     1.00           *            vmovupd	%xmm0, (%rax)
-# CHECK-NEXT:  1      6     0.50    *                   vmovupd	(%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        vmovupd	%ymm0, %ymm2
-# CHECK-NEXT:  1      1     1.00           *            vmovupd	%ymm0, (%rax)
-# CHECK-NEXT:  1      7     0.50    *                   vmovupd	(%rax), %ymm2
-# CHECK-NEXT:  1      1     1.00                        vmovups	%xmm0, %xmm2
+# CHECK-NEXT:  1      5     0.50    *                   vmovupd	(%rax), %xmm2
+# CHECK-NEXT:  2      2     1.00                        vmovupd	%ymm0, %ymm2
+# CHECK-NEXT:  8      1     1.00           *            vmovupd	%ymm0, (%rax)
+# CHECK-NEXT:  2      5     0.50    *                   vmovupd	(%rax), %ymm2
+# CHECK-NEXT:  1      1     0.50                        vmovups	%xmm0, %xmm2
 # CHECK-NEXT:  1      1     1.00           *            vmovups	%xmm0, (%rax)
-# CHECK-NEXT:  1      6     0.50    *                   vmovups	(%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        vmovups	%ymm0, %ymm2
-# CHECK-NEXT:  1      1     1.00           *            vmovups	%ymm0, (%rax)
-# CHECK-NEXT:  1      7     0.50    *                   vmovups	(%rax), %ymm2
-# CHECK-NEXT:  3      7     1.00                        vmpsadbw	$1, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  4      13    1.00    *                   vmpsadbw	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50    *                   vmovups	(%rax), %xmm2
+# CHECK-NEXT:  2      2     1.00                        vmovups	%ymm0, %ymm2
+# CHECK-NEXT:  8      1     1.00           *            vmovups	%ymm0, (%rax)
+# CHECK-NEXT:  2      5     0.50    *                   vmovups	(%rax), %ymm2
+# CHECK-NEXT:  9      9     2.00                        vmpsadbw	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  9      14    2.00    *                   vmpsadbw	$1, (%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      5     1.00                        vmulpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   vmulpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     1.00                        vmulpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      12    1.00    *                   vmulpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      10    1.00    *                   vmulpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     2.00                        vmulpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    2.00    *                   vmulpd	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      5     1.00                        vmulps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   vmulps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     1.00                        vmulps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      12    1.00    *                   vmulps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      10    1.00    *                   vmulps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     2.00                        vmulps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    2.00    *                   vmulps	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      5     1.00                        vmulsd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   vmulsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   vmulsd	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      5     1.00                        vmulss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   vmulss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     1.00                        vorpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     1.00    *                   vorpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     1.00                        vorpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      8     1.00    *                   vorpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      1     1.00                        vorps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     1.00    *                   vorps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     1.00                        vorps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      8     1.00    *                   vorps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      1     0.50                        vpabsb	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpabsb	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpabsd	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpabsd	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpabsw	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpabsw	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpackssdw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpackssdw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpacksswb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpacksswb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpackusdw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpackusdw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpackuswb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpackuswb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpaddb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpaddb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpaddd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpaddd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpaddq	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpaddq	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpaddsb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpaddsb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpaddsw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpaddsw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpaddusb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpaddusb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpaddusw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpaddusw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpaddw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpaddw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpalignr	$1, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpalignr	$1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.33                        vpand	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpand	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.33                        vpandn	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpandn	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpavgb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpavgb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpavgw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpavgw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  2      2     1.00                        vpblendvb	%xmm3, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  3      8     1.00    *                   vpblendvb	%xmm3, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpblendw	$11, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpblendw	$11, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      14    6.00                        vpclmulqdq	$11, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      14    5.67    *                   vpclmulqdq	$11, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpcmpeqb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpcmpeqb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpcmpeqd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpcmpeqd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpcmpeqq	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpcmpeqq	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpcmpeqw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpcmpeqw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      4     2.67                        vpcmpestri	$1, %xmm0, %xmm2
-# CHECK-NEXT:  1      4     2.33    *                   vpcmpestri	$1, (%rax), %xmm2
-# CHECK-NEXT:  1      11    2.67                        vpcmpestrm	$1, %xmm0, %xmm2
-# CHECK-NEXT:  1      11    2.33    *                   vpcmpestrm	$1, (%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpcmpgtb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpcmpgtb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpcmpgtd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpcmpgtd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     1.00                        vpcmpgtq	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   vpcmpgtq	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpcmpgtw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpcmpgtw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  3      11    3.00                        vpcmpistri	$1, %xmm0, %xmm2
-# CHECK-NEXT:  4      17    3.00    *                   vpcmpistri	$1, (%rax), %xmm2
-# CHECK-NEXT:  3      11    3.00                        vpcmpistrm	$1, %xmm0, %xmm2
-# CHECK-NEXT:  4      17    3.00    *                   vpcmpistrm	$1, (%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        vperm2f128	$1, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      8     1.00    *                   vperm2f128	$1, (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      1     1.00                        vpermilpd	$1, %xmm0, %xmm2
-# CHECK-NEXT:  2      7     1.00    *                   vpermilpd	$1, (%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        vpermilpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     1.00    *                   vpermilpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     1.00                        vpermilpd	$1, %ymm0, %ymm2
-# CHECK-NEXT:  2      8     1.00    *                   vpermilpd	$1, (%rax), %ymm2
-# CHECK-NEXT:  1      1     1.00                        vpermilpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      8     1.00    *                   vpermilpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      1     1.00                        vpermilps	$1, %xmm0, %xmm2
-# CHECK-NEXT:  2      7     1.00    *                   vpermilps	$1, (%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        vpermilps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     1.00    *                   vpermilps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     1.00                        vpermilps	$1, %ymm0, %ymm2
-# CHECK-NEXT:  2      8     1.00    *                   vpermilps	$1, (%rax), %ymm2
-# CHECK-NEXT:  1      1     1.00                        vpermilps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      8     1.00    *                   vpermilps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  2      3     1.00                        vpextrb	$1, %xmm0, %ecx
-# CHECK-NEXT:  3      5     1.00           *            vpextrb	$1, %xmm0, (%rax)
-# CHECK-NEXT:  2      3     1.00                        vpextrd	$1, %xmm0, %ecx
-# CHECK-NEXT:  4      5     1.00           *            vpextrd	$1, %xmm0, (%rax)
-# CHECK-NEXT:  2      3     1.00                        vpextrq	$1, %xmm0, %rcx
-# CHECK-NEXT:  4      5     1.00           *            vpextrq	$1, %xmm0, (%rax)
-# CHECK-NEXT:  2      3     1.00                        vpextrw	$1, %xmm0, %ecx
-# CHECK-NEXT:  3      5     1.00           *            vpextrw	$1, %xmm0, (%rax)
-# CHECK-NEXT:  3      3     1.50                        vphaddd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  4      9     1.50    *                   vphaddd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  3      3     1.50                        vphaddsw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  4      9     1.50    *                   vphaddsw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  3      3     1.50                        vphaddw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  4      9     1.50    *                   vphaddw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     1.00                        vphminposuw	%xmm0, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   vphminposuw	(%rax), %xmm2
-# CHECK-NEXT:  3      3     1.50                        vphsubd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  4      9     1.50    *                   vphsubd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  3      3     1.50                        vphsubsw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  4      9     1.50    *                   vphsubsw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  3      3     1.50                        vphsubw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  4      9     1.50    *                   vphsubw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  2      2     1.00                        vpinsrb	$1, %eax, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpinsrb	$1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  2      2     1.00                        vpinsrd	$1, %eax, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpinsrd	$1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  2      2     1.00                        vpinsrq	$1, %rax, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpinsrq	$1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  2      2     1.00                        vpinsrw	$1, %eax, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpinsrw	$1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     1.00                        vpmaddubsw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   vpmaddubsw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     1.00                        vpmaddwd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   vpmaddwd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpmaxsb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpmaxsb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpmaxsd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpmaxsd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpmaxsw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpmaxsw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpmaxub	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpmaxub	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpmaxud	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpmaxud	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpmaxuw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpmaxuw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpminsb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpminsb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpminsd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpminsd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpminsw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpminsw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpminub	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpminub	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpminud	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpminud	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpminuw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpminuw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      2     1.00                        vpmovmskb	%xmm0, %ecx
-# CHECK-NEXT:  1      1     0.50                        vpmovsxbd	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpmovsxbd	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpmovsxbq	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpmovsxbq	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpmovsxbw	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpmovsxbw	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpmovsxdq	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpmovsxdq	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpmovsxwd	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpmovsxwd	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpmovsxwq	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpmovsxwq	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpmovzxbd	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpmovzxbd	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpmovzxbq	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpmovzxbq	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpmovzxbw	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpmovzxbw	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpmovzxdq	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpmovzxdq	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpmovzxwd	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpmovzxwd	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpmovzxwq	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpmovzxwq	(%rax), %xmm2
-# CHECK-NEXT:  1      5     1.00                        vpmuldq	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   vpmuldq	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     1.00                        vpmulhrsw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   vpmulhrsw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     1.00                        vpmulhuw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   vpmulhuw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     1.00                        vpmulhw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   vpmulhw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     1.00                        vpmulld	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   vpmulld	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     1.00                        vpmullw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   vpmullw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     1.00                        vpmuludq	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   vpmuludq	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.33                        vpor	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpor	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     1.00                        vpsadbw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   vpsadbw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpshufb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpshufb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpshufd	$1, %xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpshufd	$1, (%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpshufhw	$1, %xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpshufhw	$1, (%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpshuflw	$1, %xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpshuflw	$1, (%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpsignb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpsignb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpsignd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpsignd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpsignw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpsignw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     1.00                        vpslld	$1, %xmm0, %xmm2
-# CHECK-NEXT:  2      2     1.00                        vpslld	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  3      8     1.00    *                   vpslld	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpslldq	$1, %xmm1, %xmm2
-# CHECK-NEXT:  1      1     1.00                        vpsllq	$1, %xmm0, %xmm2
-# CHECK-NEXT:  2      2     1.00                        vpsllq	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  3      8     1.00    *                   vpsllq	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     1.00                        vpsllw	$1, %xmm0, %xmm2
-# CHECK-NEXT:  2      2     1.00                        vpsllw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  3      8     1.00    *                   vpsllw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     1.00                        vpsrad	$1, %xmm0, %xmm2
-# CHECK-NEXT:  2      2     1.00                        vpsrad	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  3      8     1.00    *                   vpsrad	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     1.00                        vpsraw	$1, %xmm0, %xmm2
-# CHECK-NEXT:  2      2     1.00                        vpsraw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  3      8     1.00    *                   vpsraw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     1.00                        vpsrld	$1, %xmm0, %xmm2
-# CHECK-NEXT:  2      2     1.00                        vpsrld	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  3      8     1.00    *                   vpsrld	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpsrldq	$1, %xmm1, %xmm2
-# CHECK-NEXT:  1      1     1.00                        vpsrlq	$1, %xmm0, %xmm2
-# CHECK-NEXT:  2      2     1.00                        vpsrlq	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  3      8     1.00    *                   vpsrlq	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     1.00                        vpsrlw	$1, %xmm0, %xmm2
-# CHECK-NEXT:  2      2     1.00                        vpsrlw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  3      8     1.00    *                   vpsrlw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpsubb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpsubb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpsubd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpsubd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpsubq	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpsubq	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpsubsb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpsubsb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpsubsw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpsubsw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpsubusb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpsubusb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpsubusw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpsubusw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpsubw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpsubw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  2      2     1.00                        vptest	%xmm0, %xmm1
-# CHECK-NEXT:  3      8     1.00    *                   vptest	(%rax), %xmm1
-# CHECK-NEXT:  2      2     1.00                        vptest	%ymm0, %ymm1
-# CHECK-NEXT:  3      9     1.00    *                   vptest	(%rax), %ymm1
-# CHECK-NEXT:  1      1     0.50                        vpunpckhbw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpunpckhbw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpunpckhdq	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpunpckhdq	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpunpckhqdq	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpunpckhqdq	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpunpckhwd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpunpckhwd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpunpcklbw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpunpcklbw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpunpckldq	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpunpckldq	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpunpcklqdq	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpunpcklqdq	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpunpcklwd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpunpcklwd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.33                        vpxor	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   vpxor	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   vmulss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vorpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vorpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vorpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vorpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     0.50                        vorps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vorps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vorps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vorps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     0.50                        vpabsb	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpabsb	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpabsd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpabsd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpabsw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpabsw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpackssdw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpackssdw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpacksswb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpacksswb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpackusdw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpackusdw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpackuswb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpackuswb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpaddb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpaddb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpaddd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpaddd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpaddq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpaddq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpaddsb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpaddsb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpaddsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpaddsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpaddusb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpaddusb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpaddusw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpaddusw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpaddw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpaddw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpalignr	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpalignr	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpand	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpand	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpandn	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpandn	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpavgb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpavgb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpavgw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpavgw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     2.00                        vpblendvb	%xmm3, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     2.00    *                   vpblendvb	%xmm3, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpblendw	$11, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpblendw	$11, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  6      13    1.00                        vpclmulqdq	$11, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  6      17    1.00    *                   vpclmulqdq	$11, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpcmpeqb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpcmpeqb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpcmpeqd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpcmpeqd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpcmpeqq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpcmpeqq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpcmpeqw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpcmpeqw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  27     15    4.00                        vpcmpestri	$1, %xmm0, %xmm2
+# CHECK-NEXT:  28     20    4.50    *                   vpcmpestri	$1, (%rax), %xmm2
+# CHECK-NEXT:  27     10    4.00                        vpcmpestrm	$1, %xmm0, %xmm2
+# CHECK-NEXT:  28     15    4.50    *                   vpcmpestrm	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpcmpgtb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpcmpgtb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpcmpgtd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpcmpgtd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpcmpgtq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpcmpgtq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpcmpgtw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpcmpgtw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  7      14    1.00                        vpcmpistri	$1, %xmm0, %xmm2
+# CHECK-NEXT:  8      19    1.00    *                   vpcmpistri	$1, (%rax), %xmm2
+# CHECK-NEXT:  7      6     1.00                        vpcmpistrm	$1, %xmm0, %xmm2
+# CHECK-NEXT:  9      11    1.00    *                   vpcmpistrm	$1, (%rax), %xmm2
+# CHECK-NEXT:  8      4     0.50                        vperm2f128	$1, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  10     8     0.50    *                   vperm2f128	$1, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     0.50                        vpermilpd	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpermilpd	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      3     2.00                        vpermilpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      8     2.00    *                   vpermilpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vpermilpd	$1, %ymm0, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vpermilpd	$1, (%rax), %ymm2
+# CHECK-NEXT:  2      3     3.00                        vpermilpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      8     3.00    *                   vpermilpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     0.50                        vpermilps	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpermilps	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      3     2.00                        vpermilps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      8     2.00    *                   vpermilps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vpermilps	$1, %ymm0, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vpermilps	$1, (%rax), %ymm2
+# CHECK-NEXT:  2      3     3.00                        vpermilps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      8     3.00    *                   vpermilps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  2      13    1.00                        vpextrb	$1, %xmm0, %ecx
+# CHECK-NEXT:  2      13    1.00           *            vpextrb	$1, %xmm0, (%rax)
+# CHECK-NEXT:  2      13    1.00                        vpextrd	$1, %xmm0, %ecx
+# CHECK-NEXT:  2      13    1.00           *            vpextrd	$1, %xmm0, (%rax)
+# CHECK-NEXT:  2      13    1.00                        vpextrq	$1, %xmm0, %rcx
+# CHECK-NEXT:  2      13    1.00           *            vpextrq	$1, %xmm0, (%rax)
+# CHECK-NEXT:  2      13    1.00                        vpextrw	$1, %xmm0, %ecx
+# CHECK-NEXT:  2      13    1.00           *            vpextrw	$1, %xmm0, (%rax)
+# CHECK-NEXT:  3      5     0.50                        vphaddd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  4      10    0.50    *                   vphaddd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  3      5     0.50                        vphaddsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  4      10    0.50    *                   vphaddsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  3      5     0.50                        vphaddw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  4      10    0.50    *                   vphaddw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      4     1.00                        vphminposuw	%xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   vphminposuw	(%rax), %xmm2
+# CHECK-NEXT:  3      5     0.50                        vphsubd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  4      10    0.50    *                   vphsubd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  3      5     0.50                        vphsubsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  4      10    0.50    *                   vphsubsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  3      5     0.50                        vphsubw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  4      10    0.50    *                   vphsubw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     0.50                        vpinsrb	$1, %eax, %xmm1, %xmm2
+# CHECK-NEXT:  2      6     0.50    *                   vpinsrb	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     0.50                        vpinsrd	$1, %eax, %xmm1, %xmm2
+# CHECK-NEXT:  2      6     0.50    *                   vpinsrd	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     0.50                        vpinsrq	$1, %rax, %xmm1, %xmm2
+# CHECK-NEXT:  2      6     0.50    *                   vpinsrq	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     0.50                        vpinsrw	$1, %eax, %xmm1, %xmm2
+# CHECK-NEXT:  2      6     0.50    *                   vpinsrw	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      4     1.00                        vpmaddubsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   vpmaddubsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      4     1.00                        vpmaddwd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   vpmaddwd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpmaxsb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpmaxsb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpmaxsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpmaxsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpmaxsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpmaxsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpmaxub	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpmaxub	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpmaxud	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpmaxud	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpmaxuw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpmaxuw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpminsb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpminsb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpminsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpminsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpminsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpminsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpminub	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpminub	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpminud	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpminud	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpminuw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpminuw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      13    1.00                        vpmovmskb	%xmm0, %ecx
+# CHECK-NEXT:  1      2     0.50                        vpmovsxbd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpmovsxbd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpmovsxbq	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpmovsxbq	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpmovsxbw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpmovsxbw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpmovsxdq	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpmovsxdq	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpmovsxwd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpmovsxwd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpmovsxwq	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpmovsxwq	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpmovzxbd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpmovzxbd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpmovzxbq	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpmovzxbq	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpmovzxbw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpmovzxbw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpmovzxdq	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpmovzxdq	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpmovzxwd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpmovzxwd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpmovzxwq	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpmovzxwq	(%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        vpmuldq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   vpmuldq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      4     1.00                        vpmulhrsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   vpmulhrsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      4     1.00                        vpmulhuw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   vpmulhuw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      4     1.00                        vpmulhw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   vpmulhw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     2.00                        vpmulld	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    2.00    *                   vpmulld	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      4     1.00                        vpmullw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   vpmullw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      4     1.00                        vpmuludq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   vpmuludq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpor	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpor	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      4     0.50                        vpsadbw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      9     0.50    *                   vpsadbw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      3     2.00                        vpshufb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      8     2.00    *                   vpshufb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpshufd	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpshufd	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpshufhw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpshufhw	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpshuflw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpshuflw	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpsignb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpsignb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpsignd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpsignd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpsignw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpsignw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpslld	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      3     0.50                        vpslld	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      8     0.50    *                   vpslld	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpslldq	$1, %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpsllq	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      3     0.50                        vpsllq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      8     0.50    *                   vpsllq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpsllw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      3     0.50                        vpsllw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      8     0.50    *                   vpsllw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpsrad	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      3     0.50                        vpsrad	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      8     0.50    *                   vpsrad	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpsraw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      3     0.50                        vpsraw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      8     0.50    *                   vpsraw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpsrld	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      3     0.50                        vpsrld	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      8     0.50    *                   vpsrld	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpsrldq	$1, %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpsrlq	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      3     0.50                        vpsrlq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      8     0.50    *                   vpsrlq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpsrlw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      3     0.50                        vpsrlw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      8     0.50    *                   vpsrlw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpsubb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpsubb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpsubd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpsubd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpsubq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpsubq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpsubsb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpsubsb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpsubsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpsubsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpsubusb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpsubusb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpsubusw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpsubusw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpsubw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpsubw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      1     1.00                        vptest	%xmm0, %xmm1
+# CHECK-NEXT:  2      6     1.00    *                   vptest	(%rax), %xmm1
+# CHECK-NEXT:  4      1     1.00                        vptest	%ymm0, %ymm1
+# CHECK-NEXT:  6      6     1.00    *                   vptest	(%rax), %ymm1
+# CHECK-NEXT:  1      2     0.50                        vpunpckhbw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpunpckhbw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpunpckhdq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpunpckhdq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpunpckhqdq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpunpckhqdq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpunpckhwd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpunpckhwd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpunpcklbw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpunpcklbw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpunpckldq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpunpckldq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpunpcklqdq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpunpcklqdq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpunpcklwd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpunpcklwd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpxor	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpxor	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      5     1.00                        vrcpps	%xmm0, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   vrcpps	(%rax), %xmm2
-# CHECK-NEXT:  3      7     2.00                        vrcpps	%ymm0, %ymm2
-# CHECK-NEXT:  4      14    2.00    *                   vrcpps	(%rax), %ymm2
+# CHECK-NEXT:  1      10    1.00    *                   vrcpps	(%rax), %xmm2
+# CHECK-NEXT:  2      5     2.00                        vrcpps	%ymm0, %ymm2
+# CHECK-NEXT:  2      10    2.00    *                   vrcpps	(%rax), %ymm2
 # CHECK-NEXT:  1      5     1.00                        vrcpss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   vrcpss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vroundpd	$1, %xmm0, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   vroundpd	$1, (%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        vroundpd	$1, %ymm0, %ymm2
-# CHECK-NEXT:  2      10    1.00    *                   vroundpd	$1, (%rax), %ymm2
-# CHECK-NEXT:  1      3     1.00                        vroundps	$1, %xmm0, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   vroundps	$1, (%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        vroundps	$1, %ymm0, %ymm2
-# CHECK-NEXT:  2      10    1.00    *                   vroundps	$1, (%rax), %ymm2
-# CHECK-NEXT:  1      3     1.00                        vroundsd	$1, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   vroundsd	$1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vroundss	$1, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   vroundss	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   vrcpss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      4     1.00                        vroundpd	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   vroundpd	$1, (%rax), %xmm2
+# CHECK-NEXT:  2      4     2.00                        vroundpd	$1, %ymm0, %ymm2
+# CHECK-NEXT:  2      9     2.00    *                   vroundpd	$1, (%rax), %ymm2
+# CHECK-NEXT:  1      4     1.00                        vroundps	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   vroundps	$1, (%rax), %xmm2
+# CHECK-NEXT:  2      4     2.00                        vroundps	$1, %ymm0, %ymm2
+# CHECK-NEXT:  2      9     2.00    *                   vroundps	$1, (%rax), %ymm2
+# CHECK-NEXT:  1      4     1.00                        vroundsd	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   vroundsd	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      4     1.00                        vroundss	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   vroundss	$1, (%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      5     1.00                        vrsqrtps	%xmm0, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   vrsqrtps	(%rax), %xmm2
-# CHECK-NEXT:  3      7     2.00                        vrsqrtps	%ymm0, %ymm2
-# CHECK-NEXT:  4      14    2.00    *                   vrsqrtps	(%rax), %ymm2
+# CHECK-NEXT:  1      10    1.00    *                   vrsqrtps	(%rax), %xmm2
+# CHECK-NEXT:  2      5     2.00                        vrsqrtps	%ymm0, %ymm2
+# CHECK-NEXT:  2      10    2.00    *                   vrsqrtps	(%rax), %ymm2
 # CHECK-NEXT:  1      5     1.00                        vrsqrtss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   vrsqrtss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     1.00                        vshufpd	$1, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     1.00    *                   vshufpd	$1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     1.00                        vshufpd	$1, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      8     1.00    *                   vshufpd	$1, (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      1     1.00                        vshufps	$1, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     1.00    *                   vshufps	$1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     1.00                        vshufps	$1, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      8     1.00    *                   vshufps	$1, (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      21    21.00                       vsqrtpd	%xmm0, %xmm2
-# CHECK-NEXT:  2      27    21.00   *                   vsqrtpd	(%rax), %xmm2
-# CHECK-NEXT:  3      45    44.00                       vsqrtpd	%ymm0, %ymm2
-# CHECK-NEXT:  4      52    44.00   *                   vsqrtpd	(%rax), %ymm2
-# CHECK-NEXT:  1      14    14.00                       vsqrtps	%xmm0, %xmm2
-# CHECK-NEXT:  2      20    14.00   *                   vsqrtps	(%rax), %xmm2
-# CHECK-NEXT:  3      29    28.00                       vsqrtps	%ymm0, %ymm2
-# CHECK-NEXT:  4      36    28.00   *                   vsqrtps	(%rax), %ymm2
-# CHECK-NEXT:  1      21    21.00                       vsqrtsd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      27    21.00   *                   vsqrtsd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      14    14.00                       vsqrtss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      20    14.00   *                   vsqrtss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  4      5     1.00    *      *      U     vstmxcsr	(%rax)
-# CHECK-NEXT:  1      3     1.00                        vsubpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   vsubpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vsubpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    1.00    *                   vsubpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      3     1.00                        vsubps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   vsubps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vsubps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    1.00    *                   vsubps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      3     1.00                        vsubsd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   vsubsd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vsubss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   vsubss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     1.00                        vtestpd	%xmm0, %xmm1
-# CHECK-NEXT:  2      7     1.00    *                   vtestpd	(%rax), %xmm1
-# CHECK-NEXT:  1      1     1.00                        vtestpd	%ymm0, %ymm1
-# CHECK-NEXT:  2      8     1.00    *                   vtestpd	(%rax), %ymm1
-# CHECK-NEXT:  1      1     1.00                        vtestps	%xmm0, %xmm1
-# CHECK-NEXT:  2      7     1.00    *                   vtestps	(%rax), %xmm1
-# CHECK-NEXT:  1      1     1.00                        vtestps	%ymm0, %ymm1
-# CHECK-NEXT:  2      8     1.00    *                   vtestps	(%rax), %ymm1
-# CHECK-NEXT:  2      2     1.00                        vucomisd	%xmm0, %xmm1
-# CHECK-NEXT:  3      8     1.00    *                   vucomisd	(%rax), %xmm1
-# CHECK-NEXT:  2      2     1.00                        vucomiss	%xmm0, %xmm1
-# CHECK-NEXT:  3      8     1.00    *                   vucomiss	(%rax), %xmm1
-# CHECK-NEXT:  1      1     1.00                        vunpckhpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     1.00    *                   vunpckhpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     1.00                        vunpckhpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      8     1.00    *                   vunpckhpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      1     1.00                        vunpckhps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     1.00    *                   vunpckhps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     1.00                        vunpckhps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      8     1.00    *                   vunpckhps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      1     1.00                        vunpcklpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     1.00    *                   vunpcklpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     1.00                        vunpcklpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      8     1.00    *                   vunpcklpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      1     1.00                        vunpcklps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     1.00    *                   vunpcklps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     1.00                        vunpcklps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      8     1.00    *                   vunpcklps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      1     1.00                        vxorpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     1.00    *                   vxorpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     1.00                        vxorpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      8     1.00    *                   vxorpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      1     1.00                        vxorps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      7     1.00    *                   vxorps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     1.00                        vxorps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      8     1.00    *                   vxorps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  20     9     2.00    *      *      U     vzeroall
-# CHECK-NEXT:  1      100   0.33    *      *      U     vzeroupper
+# CHECK-NEXT:  1      10    1.00    *                   vrsqrtss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vshufpd	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vshufpd	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vshufpd	$1, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vshufpd	$1, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     0.50                        vshufps	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vshufps	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vshufps	$1, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vshufps	$1, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      9     13.50                       vsqrtpd	%xmm0, %xmm2
+# CHECK-NEXT:  1      14    13.50   *                   vsqrtpd	(%rax), %xmm2
+# CHECK-NEXT:  2      9     27.00                       vsqrtpd	%ymm0, %ymm2
+# CHECK-NEXT:  2      14    27.00   *                   vsqrtpd	(%rax), %ymm2
+# CHECK-NEXT:  1      9     10.50                       vsqrtps	%xmm0, %xmm2
+# CHECK-NEXT:  1      14    10.50   *                   vsqrtps	(%rax), %xmm2
+# CHECK-NEXT:  2      9     21.00                       vsqrtps	%ymm0, %ymm2
+# CHECK-NEXT:  2      14    21.00   *                   vsqrtps	(%rax), %ymm2
+# CHECK-NEXT:  1      9     13.50                       vsqrtsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      14    13.50   *                   vsqrtsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      9     10.50                       vsqrtss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      14    10.50   *                   vsqrtss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      1     0.50    *      *      U     vstmxcsr	(%rax)
+# CHECK-NEXT:  1      5     1.00                        vsubpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   vsubpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     2.00                        vsubpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    2.00    *                   vsubpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     1.00                        vsubps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   vsubps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     2.00                        vsubps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    2.00    *                   vsubps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     1.00                        vsubsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   vsubsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vsubss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   vsubss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      1     1.00                        vtestpd	%xmm0, %xmm1
+# CHECK-NEXT:  2      6     1.00    *                   vtestpd	(%rax), %xmm1
+# CHECK-NEXT:  4      1     1.00                        vtestpd	%ymm0, %ymm1
+# CHECK-NEXT:  6      6     1.00    *                   vtestpd	(%rax), %ymm1
+# CHECK-NEXT:  2      1     1.00                        vtestps	%xmm0, %xmm1
+# CHECK-NEXT:  2      6     1.00    *                   vtestps	(%rax), %xmm1
+# CHECK-NEXT:  4      1     1.00                        vtestps	%ymm0, %ymm1
+# CHECK-NEXT:  6      6     1.00    *                   vtestps	(%rax), %ymm1
+# CHECK-NEXT:  2      1     1.00                        vucomisd	%xmm0, %xmm1
+# CHECK-NEXT:  2      6     1.00    *                   vucomisd	(%rax), %xmm1
+# CHECK-NEXT:  2      1     1.00                        vucomiss	%xmm0, %xmm1
+# CHECK-NEXT:  2      6     1.00    *                   vucomiss	(%rax), %xmm1
+# CHECK-NEXT:  1      2     0.50                        vunpckhpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vunpckhpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vunpckhpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vunpckhpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     0.50                        vunpckhps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vunpckhps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vunpckhps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vunpckhps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     0.50                        vunpcklpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vunpcklpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vunpcklpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vunpcklpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     0.50                        vunpcklps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vunpcklps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vunpcklps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vunpcklps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     0.50                        vxorpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vxorpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vxorpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vxorpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     0.50                        vxorps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vxorps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vxorps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vxorps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  32     90    8.00    *      *      U     vzeroall
+# CHECK-NEXT:  16     46    4.00    *      *      U     vzeroupper
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -     572.00 246.83 317.33 39.00  365.83 179.50 179.50
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 229.00 229.00  -      -      -     56.00   -      -     588.00 588.00 127.50 127.50 38.00  107.00 402.50 429.50  -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vaddpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vaddpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vaddps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vaddps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddsd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vaddsd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vaddss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddsubpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vaddsubpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddsubpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vaddsubpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddsubps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vaddsubps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddsubps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vaddsubps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.33   0.33    -     1.33    -      -     vaesdec	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.33   0.33    -     1.33   0.50   0.50   vaesdec	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.33   0.33    -     1.33    -      -     vaesdeclast	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.33   0.33    -     1.33   0.50   0.50   vaesdeclast	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.33   0.33    -     1.33    -      -     vaesenc	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.33   0.33    -     1.33   0.50   0.50   vaesenc	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.33   0.33    -     1.33    -      -     vaesenclast	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.33   0.33    -     1.33   0.50   0.50   vaesenclast	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     2.00    -      -     vaesimc	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     2.00   0.50   0.50   vaesimc	(%rax), %xmm2
-# CHECK-NEXT:  -      -     3.67   3.67    -     3.67    -      -     vaeskeygenassist	$22, %xmm0, %xmm2
-# CHECK-NEXT:  -      -     3.33   3.33    -     3.33   0.50   0.50   vaeskeygenassist	$22, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vandnpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vandnpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vandnpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vandnpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vandnps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vandnps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vandnps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vandnps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vandpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vandpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vandpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vandpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vandps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vandps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vandps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vandps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     vblendpd	$11, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50    -      -     0.50   0.50   0.50   vblendpd	$11, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     vblendpd	$11, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50    -      -     0.50   0.50   0.50   vblendpd	$11, (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     vblendps	$11, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50    -      -     0.50   0.50   0.50   vblendps	$11, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     vblendps	$11, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50    -      -     0.50   0.50   0.50   vblendps	$11, (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     vblendvpd	%xmm3, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -     1.00   0.50   0.50   vblendvpd	%xmm3, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     vblendvpd	%ymm3, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     1.00    -      -     1.00   0.50   0.50   vblendvpd	%ymm3, (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     vblendvps	%xmm3, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -     1.00   0.50   0.50   vblendvps	%xmm3, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     vblendvps	%ymm3, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     1.00    -      -     1.00   0.50   0.50   vblendvps	%ymm3, (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vbroadcastf128	(%rax), %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vbroadcastsd	(%rax), %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vbroadcastss	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vbroadcastss	(%rax), %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vcmppd	$0, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vcmppd	$0, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vcmppd	$0, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vcmppd	$0, (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vcmpps	$0, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vcmpps	$0, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vcmpps	$0, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vcmpps	$0, (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vcmpsd	$0, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vcmpsd	$0, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vcmpss	$0, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vcmpss	$0, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     vcomisd	%xmm0, %xmm1
-# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   vcomisd	(%rax), %xmm1
-# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     vcomiss	%xmm0, %xmm1
-# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   vcomiss	(%rax), %xmm1
-# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     vcvtdq2pd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   vcvtdq2pd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     vcvtdq2pd	%xmm0, %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   vcvtdq2pd	(%rax), %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vcvtdq2ps	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vcvtdq2ps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vcvtdq2ps	%ymm0, %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vcvtdq2ps	(%rax), %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     vcvtpd2dq	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   vcvtpd2dqx	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     vcvtpd2dq	%ymm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   vcvtpd2dqy	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     vcvtpd2ps	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   vcvtpd2psx	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     vcvtpd2ps	%ymm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   vcvtpd2psy	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vcvtps2dq	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vcvtps2dq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vcvtps2dq	%ymm0, %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vcvtps2dq	(%rax), %ymm2
-# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     vcvtps2pd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vcvtps2pd	(%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     vcvtps2pd	%xmm0, %ymm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vcvtps2pd	(%rax), %ymm2
-# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     vcvtsd2si	%xmm0, %ecx
-# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     vcvtsd2si	%xmm0, %rcx
-# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   vcvtsd2si	(%rax), %ecx
-# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   vcvtsd2si	(%rax), %rcx
-# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     vcvtsd2ss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   vcvtsd2ss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     vcvtsi2sdl	%ecx, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     vcvtsi2sdq	%rcx, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vcvtsi2sdl	(%rax), %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vcvtsi2sdq	(%rax), %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     2.00    -      -     vcvtsi2ssl	%ecx, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     2.00    -      -     vcvtsi2ssq	%rcx, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   vcvtsi2ssl	(%rax), %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   vcvtsi2ssq	(%rax), %xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vcvtss2sd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vcvtss2sd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     vcvtss2si	%xmm0, %ecx
-# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     vcvtss2si	%xmm0, %rcx
-# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   vcvtss2si	(%rax), %ecx
-# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   vcvtss2si	(%rax), %rcx
-# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     vcvttpd2dq	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   vcvttpd2dqx	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     vcvttpd2dq	%ymm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   vcvttpd2dqy	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vcvttps2dq	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vcvttps2dq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vcvttps2dq	%ymm0, %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vcvttps2dq	(%rax), %ymm2
-# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     vcvttsd2si	%xmm0, %ecx
-# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     vcvttsd2si	%xmm0, %rcx
-# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   vcvttsd2si	(%rax), %ecx
-# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   vcvttsd2si	(%rax), %rcx
-# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     vcvttss2si	%xmm0, %ecx
-# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     vcvttss2si	%xmm0, %rcx
-# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   vcvttss2si	(%rax), %ecx
-# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   vcvttss2si	(%rax), %rcx
-# CHECK-NEXT:  -     22.00  1.00    -      -      -      -      -     vdivpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -     22.00  1.00    -      -      -     0.50   0.50   vdivpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -     44.00  2.50    -      -     0.50    -      -     vdivpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -     44.00  2.50    -      -     0.50   0.50   0.50   vdivpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -     14.00  1.00    -      -      -      -      -     vdivps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -     14.00  1.00    -      -      -     0.50   0.50   vdivps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -     28.00  2.50    -      -     0.50    -      -     vdivps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -     28.00  2.50    -      -     0.50   0.50   0.50   vdivps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -     22.00  1.00    -      -      -      -      -     vdivsd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -     22.00  1.00    -      -      -     0.50   0.50   vdivsd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -     14.00  1.00    -      -      -      -      -     vdivss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -     14.00  1.00    -      -      -     0.50   0.50   vdivss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -     vdppd	$22, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00   1.00    -     1.00   0.50   0.50   vdppd	$22, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00   2.00    -     1.00    -      -     vdpps	$22, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00   2.00    -     1.00   0.50   0.50   vdpps	$22, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00   2.00    -     1.00    -      -     vdpps	$22, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     1.00   2.00    -     1.00   0.50   0.50   vdpps	$22, (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vextractf128	$1, %ymm0, %xmm2
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vextractf128	$1, %ymm0, (%rax)
-# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     vextractps	$1, %xmm0, %ecx
-# CHECK-NEXT:  -      -      -      -     1.00   1.00   0.50   0.50   vextractps	$1, %xmm0, (%rax)
-# CHECK-NEXT:  -      -      -     1.00    -     2.00    -      -     vhaddpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     2.00   0.50   0.50   vhaddpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     2.00    -      -     vhaddpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -     2.00   0.50   0.50   vhaddpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -     2.00    -      -     vhaddps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     2.00   0.50   0.50   vhaddps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     2.00    -      -     vhaddps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -     2.00   0.50   0.50   vhaddps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -     2.00    -      -     vhsubpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     2.00   0.50   0.50   vhsubpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     2.00    -      -     vhsubpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -     2.00   0.50   0.50   vhsubpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -     2.00    -      -     vhsubps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     2.00   0.50   0.50   vhsubps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     2.00    -      -     vhsubps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -     2.00   0.50   0.50   vhsubps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vinsertf128	$1, %xmm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50    -      -     0.50   0.50   0.50   vinsertf128	$1, (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vinsertps	$1, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vinsertps	$1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vlddqu	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vlddqu	(%rax), %ymm2
-# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   0.50   0.50   vldmxcsr	(%rax)
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmaskmovdqu	%xmm0, %xmm1
-# CHECK-NEXT:  -      -     1.00    -      -     1.00   0.50   0.50   vmaskmovpd	(%rax), %xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -     1.00   0.50   0.50   vmaskmovpd	(%rax), %ymm0, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50   1.00    -     0.50   0.50   vmaskmovpd	%xmm0, %xmm1, (%rax)
-# CHECK-NEXT:  -      -     0.50   0.50   1.00    -     0.50   0.50   vmaskmovpd	%ymm0, %ymm1, (%rax)
-# CHECK-NEXT:  -      -     1.00    -      -     1.00   0.50   0.50   vmaskmovps	(%rax), %xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -     1.00   0.50   0.50   vmaskmovps	(%rax), %ymm0, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50   1.00    -     0.50   0.50   vmaskmovps	%xmm0, %xmm1, (%rax)
-# CHECK-NEXT:  -      -     0.50   0.50   1.00    -     0.50   0.50   vmaskmovps	%ymm0, %ymm1, (%rax)
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vmaxpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vmaxpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vmaxpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vmaxpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vmaxps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vmaxps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vmaxps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vmaxps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vmaxsd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vmaxsd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vmaxss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vmaxss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vminpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vminpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vminpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vminpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vminps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vminps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vminps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vminps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vminsd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vminsd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vminss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vminss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovapd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovapd	%xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovapd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovapd	%ymm0, %ymm2
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovapd	%ymm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovapd	(%rax), %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovaps	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovaps	%xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovaps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovaps	%ymm0, %ymm2
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovaps	%ymm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovaps	(%rax), %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovd	%eax, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovd	(%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmovd	%xmm0, %ecx
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovd	%xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovddup	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovddup	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovddup	%ymm0, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovddup	(%rax), %ymm2
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     vmovdqa	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovdqa	%xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovdqa	(%rax), %xmm2
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     vmovdqa	%ymm0, %ymm2
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovdqa	%ymm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovdqa	(%rax), %ymm2
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     vmovdqu	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovdqu	%xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovdqu	(%rax), %xmm2
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     vmovdqu	%ymm0, %ymm2
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovdqu	%ymm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovdqu	(%rax), %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovhlps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovlhps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovhpd	%xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vmovhpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovhps	%xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vmovhps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovlpd	%xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vmovlpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovlps	%xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vmovlps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmovmskpd	%xmm0, %ecx
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmovmskpd	%ymm0, %ecx
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmovmskps	%xmm0, %ecx
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmovmskps	%ymm0, %ecx
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovntdq	%xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovntdq	%ymm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovntdqa	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovntdqa	(%rax), %ymm2
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovntpd	%xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovntpd	%ymm0, (%rax)
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovntps	%xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovntps	%ymm0, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     vmovq	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovq	%rax, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovq	(%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmovq	%xmm0, %rcx
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovq	%xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovsd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovsd	%xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovsd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovshdup	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovshdup	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovshdup	%ymm0, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovshdup	(%rax), %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovsldup	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovsldup	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovsldup	%ymm0, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovsldup	(%rax), %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovss	%xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovss	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovupd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovupd	%xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovupd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovupd	%ymm0, %ymm2
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovupd	%ymm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovupd	(%rax), %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovups	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovups	%xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovups	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovups	%ymm0, %ymm2
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovups	%ymm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovups	(%rax), %ymm2
-# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -     vmpsadbw	$1, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00   1.00    -     1.00   0.50   0.50   vmpsadbw	$1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmulpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vmulpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmulpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vmulpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmulps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vmulps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmulps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vmulps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmulsd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vmulsd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmulss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vmulss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vorpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vorpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vorpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vorpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vorps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vorps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vorps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vorps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpabsb	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpabsb	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpabsd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpabsd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpabsw	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpabsw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpackssdw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpackssdw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpacksswb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpacksswb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpackusdw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpackusdw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpackuswb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpackuswb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpaddb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpaddb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpaddd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpaddd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpaddq	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpaddq	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpaddsb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpaddsb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpaddsw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpaddsw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpaddusb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpaddusb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpaddusw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpaddusw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpaddw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpaddw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpalignr	$1, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpalignr	$1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     vpand	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   vpand	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     vpandn	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   vpandn	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpavgb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpavgb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpavgw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpavgw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     vpblendvb	%xmm3, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   vpblendvb	%xmm3, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpblendw	$11, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpblendw	$11, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     6.00   6.00    -     6.00    -      -     vpclmulqdq	$11, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     5.67   5.67    -     5.67   0.50   0.50   vpclmulqdq	$11, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpcmpeqb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpcmpeqb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpcmpeqd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpcmpeqd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpcmpeqq	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpcmpeqq	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpcmpeqw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpcmpeqw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     2.67   2.67    -     2.67    -      -     vpcmpestri	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -     2.33   2.33    -     2.33   0.50   0.50   vpcmpestri	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -     2.67   2.67    -     2.67    -      -     vpcmpestrm	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -     2.33   2.33    -     2.33   0.50   0.50   vpcmpestrm	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpcmpgtb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpcmpgtb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpcmpgtd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpcmpgtd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpcmpgtq	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpcmpgtq	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpcmpgtw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpcmpgtw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     3.00    -      -      -      -      -     vpcmpistri	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -     3.00    -      -      -     0.50   0.50   vpcmpistri	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -     3.00    -      -      -      -      -     vpcmpistrm	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -     3.00    -      -      -     0.50   0.50   vpcmpistrm	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vperm2f128	$1, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vperm2f128	$1, (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpermilpd	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vpermilpd	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpermilpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vpermilpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpermilpd	$1, %ymm0, %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vpermilpd	$1, (%rax), %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpermilpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vpermilpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpermilps	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vpermilps	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpermilps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vpermilps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpermilps	$1, %ymm0, %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vpermilps	$1, (%rax), %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpermilps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vpermilps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     vpextrb	$1, %xmm0, %ecx
-# CHECK-NEXT:  -      -      -     0.50   1.00   0.50   0.50   0.50   vpextrb	$1, %xmm0, (%rax)
-# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     vpextrd	$1, %xmm0, %ecx
-# CHECK-NEXT:  -      -     1.00   0.50   1.00   0.50   0.50   0.50   vpextrd	$1, %xmm0, (%rax)
-# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     vpextrq	$1, %xmm0, %rcx
-# CHECK-NEXT:  -      -     1.00   0.50   1.00   0.50   0.50   0.50   vpextrq	$1, %xmm0, (%rax)
-# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     vpextrw	$1, %xmm0, %ecx
-# CHECK-NEXT:  -      -      -     0.50   1.00   0.50   0.50   0.50   vpextrw	$1, %xmm0, (%rax)
-# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     vphaddd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   vphaddd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     vphaddsw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   vphaddsw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     vphaddw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   vphaddw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vphminposuw	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vphminposuw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     vphsubd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   vphsubd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     vphsubsw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   vphsubsw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     vphsubw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   vphsubw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     1.50    -      -     vpinsrb	$1, %eax, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpinsrb	$1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     1.50    -      -     vpinsrd	$1, %eax, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpinsrd	$1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     1.50    -      -     vpinsrq	$1, %rax, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpinsrq	$1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     1.50    -      -     vpinsrw	$1, %eax, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpinsrw	$1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmaddubsw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmaddubsw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmaddwd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmaddwd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpmaxsb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpmaxsb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpmaxsd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpmaxsd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpmaxsw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpmaxsw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpmaxub	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpmaxub	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpmaxud	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpmaxud	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpmaxuw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpmaxuw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpminsb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpminsb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpminsd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpminsd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpminsw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpminsw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpminub	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpminub	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpminud	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpminud	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpminuw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpminuw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmovmskb	%xmm0, %ecx
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpmovsxbd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpmovsxbd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpmovsxbq	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpmovsxbq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpmovsxbw	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpmovsxbw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpmovsxdq	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpmovsxdq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpmovsxwd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpmovsxwd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpmovsxwq	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpmovsxwq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpmovzxbd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpmovzxbd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpmovzxbq	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpmovzxbq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpmovzxbw	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpmovzxbw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpmovzxdq	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpmovzxdq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpmovzxwd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpmovzxwd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpmovzxwq	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpmovzxwq	(%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmuldq	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmuldq	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmulhrsw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmulhrsw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmulhuw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmulhuw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmulhw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmulhw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmulld	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmulld	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmullw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmullw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmuludq	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmuludq	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     vpor	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   vpor	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpsadbw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpsadbw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpshufb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpshufb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpshufd	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpshufd	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpshufhw	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpshufhw	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpshuflw	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpshuflw	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpsignb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpsignb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpsignd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpsignd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpsignw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpsignw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpslld	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     vpslld	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00   0.50    -     0.50   0.50   0.50   vpslld	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpslldq	$1, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpsllq	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     vpsllq	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00   0.50    -     0.50   0.50   0.50   vpsllq	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpsllw	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     vpsllw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00   0.50    -     0.50   0.50   0.50   vpsllw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpsrad	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     vpsrad	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00   0.50    -     0.50   0.50   0.50   vpsrad	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpsraw	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     vpsraw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00   0.50    -     0.50   0.50   0.50   vpsraw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpsrld	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     vpsrld	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00   0.50    -     0.50   0.50   0.50   vpsrld	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpsrldq	$1, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpsrlq	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     vpsrlq	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00   0.50    -     0.50   0.50   0.50   vpsrlq	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpsrlw	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     vpsrlw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00   0.50    -     0.50   0.50   0.50   vpsrlw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpsubb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpsubb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpsubd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpsubd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpsubq	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpsubq	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpsubsb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpsubsb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpsubsw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpsubsw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpsubusb	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpsubusb	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpsubusw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpsubusw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpsubw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpsubw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     vptest	%xmm0, %xmm1
-# CHECK-NEXT:  -      -     1.00    -      -     1.00   0.50   0.50   vptest	(%rax), %xmm1
-# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     vptest	%ymm0, %ymm1
-# CHECK-NEXT:  -      -     1.00    -      -     1.00   0.50   0.50   vptest	(%rax), %ymm1
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpunpckhbw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpunpckhbw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpunpckhdq	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpunpckhdq	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpunpckhqdq	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpunpckhqdq	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpunpckhwd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpunpckhwd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpunpcklbw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpunpcklbw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpunpckldq	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpunpckldq	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpunpcklqdq	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpunpcklqdq	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpunpcklwd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpunpcklwd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     vpxor	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   vpxor	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vrcpps	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vrcpps	(%rax), %xmm2
-# CHECK-NEXT:  -      -     2.50    -      -     0.50    -      -     vrcpps	%ymm0, %ymm2
-# CHECK-NEXT:  -      -     2.50    -      -     0.50   0.50   0.50   vrcpps	(%rax), %ymm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vrcpss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vrcpss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vroundpd	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vroundpd	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vroundpd	$1, %ymm0, %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vroundpd	$1, (%rax), %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vroundps	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vroundps	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vroundps	$1, %ymm0, %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vroundps	$1, (%rax), %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vroundsd	$1, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vroundsd	$1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vroundss	$1, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vroundss	$1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vrsqrtps	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vrsqrtps	(%rax), %xmm2
-# CHECK-NEXT:  -      -     2.50    -      -     0.50    -      -     vrsqrtps	%ymm0, %ymm2
-# CHECK-NEXT:  -      -     2.50    -      -     0.50   0.50   0.50   vrsqrtps	(%rax), %ymm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vrsqrtss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vrsqrtss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vshufpd	$1, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vshufpd	$1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vshufpd	$1, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vshufpd	$1, (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vshufps	$1, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vshufps	$1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vshufps	$1, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vshufps	$1, (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -     21.00  1.00    -      -      -      -      -     vsqrtpd	%xmm0, %xmm2
-# CHECK-NEXT:  -     21.00  1.00    -      -      -     0.50   0.50   vsqrtpd	(%rax), %xmm2
-# CHECK-NEXT:  -     44.00  2.50    -      -     0.50    -      -     vsqrtpd	%ymm0, %ymm2
-# CHECK-NEXT:  -     44.00  2.50    -      -     0.50   0.50   0.50   vsqrtpd	(%rax), %ymm2
-# CHECK-NEXT:  -     14.00  1.00    -      -      -      -      -     vsqrtps	%xmm0, %xmm2
-# CHECK-NEXT:  -     14.00  1.00    -      -      -     0.50   0.50   vsqrtps	(%rax), %xmm2
-# CHECK-NEXT:  -     28.00  2.50    -      -     0.50    -      -     vsqrtps	%ymm0, %ymm2
-# CHECK-NEXT:  -     28.00  2.50    -      -     0.50   0.50   0.50   vsqrtps	(%rax), %ymm2
-# CHECK-NEXT:  -     21.00  1.00    -      -      -      -      -     vsqrtsd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -     21.00  1.00    -      -      -     0.50   0.50   vsqrtsd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -     14.00  1.00    -      -      -      -      -     vsqrtss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -     14.00  1.00    -      -      -     0.50   0.50   vsqrtss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   0.50   0.50   vstmxcsr	(%rax)
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vsubpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vsubpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vsubpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vsubpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vsubps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vsubps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vsubps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vsubps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vsubsd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vsubsd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vsubss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vsubss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vtestpd	%xmm0, %xmm1
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vtestpd	(%rax), %xmm1
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vtestpd	%ymm0, %ymm1
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vtestpd	(%rax), %ymm1
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vtestps	%xmm0, %xmm1
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vtestps	(%rax), %xmm1
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vtestps	%ymm0, %ymm1
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vtestps	(%rax), %ymm1
-# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     vucomisd	%xmm0, %xmm1
-# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   vucomisd	(%rax), %xmm1
-# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     vucomiss	%xmm0, %xmm1
-# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   vucomiss	(%rax), %xmm1
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vunpckhpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vunpckhpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vunpckhpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vunpckhpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vunpckhps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vunpckhps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vunpckhps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vunpckhps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vunpcklpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vunpcklpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vunpcklpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vunpcklpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vunpcklps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vunpcklps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vunpcklps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vunpcklps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vxorpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vxorpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vxorpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vxorpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vxorps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vxorps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vxorps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vxorps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -     2.00    -      -     vzeroall
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     vzeroupper
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vaddpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vaddpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vaddpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vaddpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vaddps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vaddps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vaddsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vaddsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vaddss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vaddss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vaddsubpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vaddsubpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vaddsubpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vaddsubpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vaddsubps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vaddsubps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vaddsubps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vaddsubps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vaesdec	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vaesdec	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vaesdeclast	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vaesdeclast	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vaesenc	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vaesenc	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vaesenclast	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vaesenclast	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vaesimc	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vaesimc	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vaeskeygenassist	$22, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vaeskeygenassist	$22, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vandnpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vandnpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vandnpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vandnpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vandnps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vandnps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vandnps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vandnps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vandpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vandpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vandpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vandpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vandps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vandps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vandps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vandps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vblendpd	$11, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vblendpd	$11, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vblendpd	$11, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vblendpd	$11, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vblendps	$11, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vblendps	$11, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vblendps	$11, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vblendps	$11, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vblendvpd	%xmm3, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vblendvpd	%xmm3, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -     vblendvpd	%ymm3, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -     vblendvpd	%ymm3, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vblendvps	%xmm3, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vblendvps	%xmm3, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -     vblendvps	%ymm3, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -     vblendvps	%ymm3, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vbroadcastf128	(%rax), %ymm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     1.00   1.00    -      -      -      -     vbroadcastsd	(%rax), %ymm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vbroadcastss	(%rax), %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     1.00   1.00    -      -      -      -     vbroadcastss	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vcmppd	$0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vcmppd	$0, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vcmppd	$0, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vcmppd	$0, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vcmpps	$0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vcmpps	$0, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vcmpps	$0, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vcmpps	$0, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vcmpsd	$0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vcmpsd	$0, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vcmpss	$0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vcmpss	$0, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vcomisd	%xmm0, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vcomisd	(%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vcomiss	%xmm0, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vcomiss	(%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtdq2pd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtdq2pd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vcvtdq2pd	%xmm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vcvtdq2pd	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtdq2ps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtdq2ps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vcvtdq2ps	%ymm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vcvtdq2ps	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtpd2dq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtpd2dqx	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -     1.00    -     2.00    -      -      -      -     vcvtpd2dq	%ymm0, %xmm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -     1.00    -     2.00    -      -      -      -     vcvtpd2dqy	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtpd2ps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtpd2psx	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -     1.00    -     2.00    -      -      -      -     vcvtpd2ps	%ymm0, %xmm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -     1.00    -     2.00    -      -      -      -     vcvtpd2psy	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtps2dq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtps2dq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vcvtps2dq	%ymm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vcvtps2dq	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtps2pd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtps2pd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vcvtps2pd	%xmm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vcvtps2pd	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvtsd2si	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvtsd2si	%xmm0, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvtsd2si	(%rax), %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvtsd2si	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtsd2ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtsd2ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtsi2sdl	%ecx, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtsi2sdq	%rcx, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtsi2sdl	(%rax), %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtsi2sdq	(%rax), %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtsi2ssl	%ecx, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtsi2ssq	%rcx, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtsi2ssl	(%rax), %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtsi2ssq	(%rax), %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtss2sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtss2sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvtss2si	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvtss2si	%xmm0, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvtss2si	(%rax), %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvtss2si	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvttpd2dq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvttpd2dqx	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -     1.00    -     2.00    -      -      -      -     vcvttpd2dq	%ymm0, %xmm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -     1.00    -     2.00    -      -      -      -     vcvttpd2dqy	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvttps2dq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvttps2dq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vcvttps2dq	%ymm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vcvttps2dq	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvttsd2si	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvttsd2si	%xmm0, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvttsd2si	(%rax), %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvttsd2si	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvttss2si	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvttss2si	%xmm0, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvttss2si	(%rax), %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvttss2si	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     vdivpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     vdivpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     19.00  19.00   -      -      -      -      -     2.00    -      -      -      -     vdivpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     19.00  19.00   -      -      -      -      -     2.00    -      -      -      -     vdivpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     vdivps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     vdivps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     19.00  19.00   -      -      -      -      -     2.00    -      -      -      -     vdivps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     19.00  19.00   -      -      -      -      -     2.00    -      -      -      -     vdivps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     vdivsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     vdivsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     vdivss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     vdivss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.50   1.50    -      -      -      -      -     1.00    -      -      -      -     vdppd	$22, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     1.50   1.50    -      -      -      -      -     1.00    -      -      -      -     vdppd	$22, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.50   1.50    -      -      -      -      -     1.00    -      -      -      -     vdpps	$22, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     1.50   1.50    -      -      -      -      -     1.00    -      -      -      -     vdpps	$22, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     3.00   3.00    -      -      -      -      -     2.00    -      -      -      -     vdpps	$22, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     3.00   3.00    -      -      -      -      -     2.00    -      -      -      -     vdpps	$22, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vextractf128	$1, %ymm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vextractf128	$1, %ymm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vextractps	$1, %xmm0, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vextractps	$1, %xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vhaddpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vhaddpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vhaddpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vhaddpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vhaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vhaddps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vhaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vhaddps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vhsubpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vhsubpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vhsubpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vhsubpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vhsubps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vhsubps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vhsubps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vhsubps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vinsertf128	$1, %xmm0, %ymm1, %ymm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vinsertf128	$1, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vinsertps	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vinsertps	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vlddqu	(%rax), %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vlddqu	(%rax), %ymm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vldmxcsr	(%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmaskmovdqu	%xmm0, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     1.00   1.00    -      -      -      -     0.50   0.50    -      -      -      -     vmaskmovpd	(%rax), %xmm0, %xmm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     2.00   2.00    -      -      -      -     1.00   1.00    -      -      -      -     vmaskmovpd	(%rax), %ymm0, %ymm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vmaskmovpd	%xmm0, %xmm1, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     2.00   2.00    -      -      -      -     1.00   1.00    -      -      -      -     vmaskmovpd	%ymm0, %ymm1, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     1.00   1.00    -      -      -      -     0.50   0.50    -      -      -      -     vmaskmovps	(%rax), %xmm0, %xmm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     2.00   2.00    -      -      -      -     1.00   1.00    -      -      -      -     vmaskmovps	(%rax), %ymm0, %ymm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vmaskmovps	%xmm0, %xmm1, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     2.00   2.00    -      -      -      -     1.00   1.00    -      -      -      -     vmaskmovps	%ymm0, %ymm1, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vmaxpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vmaxpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vmaxpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vmaxpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vmaxps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vmaxps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vmaxps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vmaxps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vmaxsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vmaxsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vmaxss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vmaxss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vminpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vminpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vminpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vminpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vminps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vminps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vminps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vminps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vminsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vminsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vminss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vminss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovapd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovapd	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovapd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vmovapd	%ymm0, %ymm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovapd	%ymm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovapd	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovaps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovaps	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovaps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vmovaps	%ymm0, %ymm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovaps	%ymm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovaps	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovd	%eax, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vmovd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vmovd	%xmm0, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovd	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovddup	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovddup	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vmovddup	%ymm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vmovddup	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vmovdqa	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovdqa	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vmovdqa	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00   1.00    -      -     1.00   1.00    -      -      -      -     vmovdqa	%ymm0, %ymm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovdqa	%ymm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vmovdqa	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vmovdqu	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovdqu	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vmovdqu	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00   1.00    -      -     1.00   1.00    -      -      -      -     vmovdqu	%ymm0, %ymm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovdqu	%ymm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vmovdqu	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovhlps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovlhps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovhpd	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovhpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovhps	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovhps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovlpd	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovlpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovlps	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovlps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vmovmskpd	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vmovmskpd	%ymm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vmovmskps	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vmovmskps	%ymm0, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovntdq	%xmm0, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     2.00    -     2.00    -      -      -      -     vmovntdq	%ymm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vmovntdqa	(%rax), %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vmovntdqa	(%rax), %ymm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovntpd	%xmm0, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     2.00    -     2.00    -      -      -      -     vmovntpd	%ymm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovntps	%xmm0, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     2.00    -     2.00    -      -      -      -     vmovntps	%ymm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vmovq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovq	%rax, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vmovq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vmovq	%xmm0, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovq	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovsd	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovshdup	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovshdup	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vmovshdup	%ymm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vmovshdup	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovsldup	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovsldup	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vmovsldup	%ymm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vmovsldup	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovss	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovss	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovupd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovupd	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovupd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vmovupd	%ymm0, %ymm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovupd	%ymm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovupd	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovups	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovups	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovups	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vmovups	%ymm0, %ymm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovups	%ymm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovups	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     2.00    -     1.00    -      -      -      -      -     vmpsadbw	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     2.00    -     1.00    -      -      -      -      -     vmpsadbw	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vmulpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vmulpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     2.00    -      -      -      -     vmulpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     2.00    -      -      -      -     vmulpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vmulps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vmulps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     2.00    -      -      -      -     vmulps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     2.00    -      -      -      -     vmulps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vmulsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vmulsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vmulss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vmulss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vorpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vorpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vorpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vorpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vorps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vorps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vorps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vorps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpabsb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpabsb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpabsd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpabsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpabsw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpabsw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpackssdw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpackssdw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpacksswb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpacksswb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpackusdw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpackusdw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpackuswb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpackuswb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddsb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddsb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddusb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddusb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddusw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddusw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpalignr	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpalignr	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpand	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpand	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpandn	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpandn	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpavgb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpavgb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpavgw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpavgw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     vpblendvb	%xmm3, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     vpblendvb	%xmm3, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpblendw	$11, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpblendw	$11, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpclmulqdq	$11, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpclmulqdq	$11, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT: 4.00   4.00    -      -      -     1.00    -      -     0.50   0.50   2.00   2.00    -      -      -     1.00    -      -      -      -     vpcmpestri	$1, %xmm0, %xmm2
+# CHECK-NEXT: 4.50   4.50    -      -      -     1.00    -      -     0.50   0.50   2.00   2.00    -      -      -     1.00    -      -      -      -     vpcmpestri	$1, (%rax), %xmm2
+# CHECK-NEXT: 4.00   4.00    -      -      -     1.00    -      -     0.50   0.50   2.00   2.00    -      -      -     1.00    -      -      -      -     vpcmpestrm	$1, %xmm0, %xmm2
+# CHECK-NEXT: 4.50   4.50    -      -      -     1.00    -      -     0.50   0.50   2.00   2.00    -      -      -     1.00    -      -      -      -     vpcmpestrm	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpgtb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpgtb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpgtd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpgtd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpgtq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpgtq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpgtw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpgtw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     1.00   1.00    -      -      -      -      -     1.00    -      -      -      -     vpcmpistri	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     1.00   1.00    -      -      -      -      -     1.00    -      -      -      -     vpcmpistri	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     1.00   1.00    -      -      -      -      -     1.00    -      -      -      -     vpcmpistrm	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     1.00   1.00    -      -      -      -      -     1.00    -      -      -      -     vpcmpistrm	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vperm2f128	$1, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vperm2f128	$1, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vpermilpd	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vpermilpd	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vpermilpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vpermilpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vpermilpd	$1, %ymm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vpermilpd	$1, (%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -     vpermilpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -     vpermilpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vpermilps	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vpermilps	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vpermilps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vpermilps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vpermilps	$1, %ymm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vpermilps	$1, (%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -     vpermilps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -     vpermilps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vpextrb	$1, %xmm0, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vpextrb	$1, %xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vpextrd	$1, %xmm0, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vpextrd	$1, %xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vpextrq	$1, %xmm0, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vpextrq	$1, %xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vpextrw	$1, %xmm0, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vpextrw	$1, %xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     1.00    -      -      -      -      -     vphminposuw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     1.00    -      -      -      -      -     vphminposuw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphsubd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphsubd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphsubsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphsubsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphsubw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphsubw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpinsrb	$1, %eax, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpinsrb	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpinsrd	$1, %eax, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpinsrd	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpinsrq	$1, %rax, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpinsrq	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpinsrw	$1, %eax, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpinsrw	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmaddubsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmaddubsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmaddwd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmaddwd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmaxsb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmaxsb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmaxsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmaxsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmaxsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmaxsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmaxub	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmaxub	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmaxud	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmaxud	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmaxuw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmaxuw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpminsb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpminsb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpminsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpminsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpminsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpminsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpminub	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpminub	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpminud	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpminud	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpminuw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpminuw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vpmovmskb	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovsxbd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovsxbd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovsxbq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovsxbq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovsxbw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovsxbw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovsxdq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovsxdq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovsxwd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovsxwd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovsxwq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovsxwq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovzxbd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovzxbd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovzxbq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovzxbq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovzxbw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovzxbw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovzxdq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovzxdq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovzxwd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovzxwd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovzxwq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovzxwq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmuldq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmuldq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmulhrsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmulhrsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmulhuw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmulhuw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmulhw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmulhw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     vpmulld	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     vpmulld	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmullw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmullw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmuludq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmuludq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpor	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpor	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsadbw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsadbw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     vpshufb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     vpshufb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshufd	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshufd	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshufhw	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshufhw	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshuflw	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshuflw	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsignb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsignb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsignd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsignd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsignw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsignw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpslld	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpslld	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpslld	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpslldq	$1, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsllq	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsllq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsllq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsllw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsllw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsllw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsrad	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsrad	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsrad	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsraw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsraw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsraw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsrld	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsrld	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsrld	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsrldq	$1, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsrlq	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsrlq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsrlq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsrlw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsrlw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsrlw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubsb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubsb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubusb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubusb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubusw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubusw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vptest	%xmm0, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vptest	(%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vptest	%ymm0, %ymm1
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00    -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vptest	(%rax), %ymm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpckhbw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpckhbw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpckhdq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpckhdq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpckhqdq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpckhqdq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpckhwd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpckhwd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpcklbw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpcklbw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpckldq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpckldq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpcklqdq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpcklqdq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpcklwd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpcklwd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpxor	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpxor	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vrcpps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vrcpps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     2.00    -      -      -      -     vrcpps	%ymm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     2.00    -      -      -      -     vrcpps	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vrcpss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vrcpss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vroundpd	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vroundpd	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vroundpd	$1, %ymm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vroundpd	$1, (%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vroundps	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vroundps	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vroundps	$1, %ymm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vroundps	$1, (%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vroundsd	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vroundsd	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vroundss	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vroundss	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vrsqrtps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vrsqrtps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     2.00    -      -      -      -     vrsqrtps	%ymm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     2.00    -      -      -      -     vrsqrtps	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vrsqrtss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vrsqrtss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vshufpd	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vshufpd	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vshufpd	$1, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vshufpd	$1, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vshufps	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vshufps	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vshufps	$1, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vshufps	$1, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     13.50  13.50   -      -      -      -      -     1.00    -      -      -      -     vsqrtpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     13.50  13.50   -      -      -      -      -     1.00    -      -      -      -     vsqrtpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     27.00  27.00   -      -      -      -      -     2.00    -      -      -      -     vsqrtpd	%ymm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     27.00  27.00   -      -      -      -      -     2.00    -      -      -      -     vsqrtpd	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     10.50  10.50   -      -      -      -      -     1.00    -      -      -      -     vsqrtps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     10.50  10.50   -      -      -      -      -     1.00    -      -      -      -     vsqrtps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     21.00  21.00   -      -      -      -      -     2.00    -      -      -      -     vsqrtps	%ymm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     21.00  21.00   -      -      -      -      -     2.00    -      -      -      -     vsqrtps	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     13.50  13.50   -      -      -      -      -     1.00    -      -      -      -     vsqrtsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     13.50  13.50   -      -      -      -      -     1.00    -      -      -      -     vsqrtsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     10.50  10.50   -      -      -      -      -     1.00    -      -      -      -     vsqrtss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     10.50  10.50   -      -      -      -      -     1.00    -      -      -      -     vsqrtss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vstmxcsr	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vsubpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vsubpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vsubpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vsubpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vsubps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vsubps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vsubps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vsubps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vsubsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vsubsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vsubss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vsubss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vtestpd	%xmm0, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vtestpd	(%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vtestpd	%ymm0, %ymm1
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00    -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vtestpd	(%rax), %ymm1
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vtestps	%xmm0, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vtestps	(%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vtestps	%ymm0, %ymm1
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00    -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vtestps	(%rax), %ymm1
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vucomisd	%xmm0, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vucomisd	(%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vucomiss	%xmm0, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vucomiss	(%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vunpckhpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vunpckhpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vunpckhpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vunpckhpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vunpckhps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vunpckhps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vunpckhps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vunpckhps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vunpcklpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vunpcklpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vunpcklpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vunpcklpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vunpcklps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vunpcklps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vunpcklps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vunpcklps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vxorpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vxorpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vxorpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vxorpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vxorps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vxorps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vxorps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vxorps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vzeroall
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vzeroupper
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-bmi1.s b/test/tools/llvm-mca/X86/BdVer2/resources-bmi1.s
index 193f5537c7c..f1b155346b3 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-bmi1.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-bmi1.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
 
 andn        %eax, %ebx, %ecx
 andn        (%rax), %ebx, %ecx
@@ -46,68 +46,80 @@ tzcnt       (%rax), %rcx
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      1     0.33                        andnl	%eax, %ebx, %ecx
-# CHECK-NEXT:  2      6     0.50    *                   andnl	(%rax), %ebx, %ecx
-# CHECK-NEXT:  1      1     0.33                        andnq	%rax, %rbx, %rcx
-# CHECK-NEXT:  2      6     0.50    *                   andnq	(%rax), %rbx, %rcx
-# CHECK-NEXT:  2      2     1.00                        bextrl	%eax, %ebx, %ecx
-# CHECK-NEXT:  3      7     1.00    *                   bextrl	%eax, (%rbx), %ecx
-# CHECK-NEXT:  2      2     1.00                        bextrq	%rax, %rbx, %rcx
-# CHECK-NEXT:  3      7     1.00    *                   bextrq	%rax, (%rbx), %rcx
-# CHECK-NEXT:  1      1     0.33                        blsil	%eax, %ecx
+# CHECK-NEXT:  1      1     0.50                        andnl	%eax, %ebx, %ecx
+# CHECK-NEXT:  1      5     0.50    *                   andnl	(%rax), %ebx, %ecx
+# CHECK-NEXT:  1      1     0.50                        andnq	%rax, %rbx, %rcx
+# CHECK-NEXT:  1      5     0.50    *                   andnq	(%rax), %rbx, %rcx
+# CHECK-NEXT:  2      2     0.50                        bextrl	%eax, %ebx, %ecx
+# CHECK-NEXT:  2      6     0.50    *                   bextrl	%eax, (%rbx), %ecx
+# CHECK-NEXT:  2      2     0.50                        bextrq	%rax, %rbx, %rcx
+# CHECK-NEXT:  2      6     0.50    *                   bextrq	%rax, (%rbx), %rcx
+# CHECK-NEXT:  2      2     0.50                        blsil	%eax, %ecx
 # CHECK-NEXT:  2      6     0.50    *                   blsil	(%rax), %ecx
-# CHECK-NEXT:  1      1     0.33                        blsiq	%rax, %rcx
+# CHECK-NEXT:  2      2     0.50                        blsiq	%rax, %rcx
 # CHECK-NEXT:  2      6     0.50    *                   blsiq	(%rax), %rcx
-# CHECK-NEXT:  1      1     0.33                        blsmskl	%eax, %ecx
+# CHECK-NEXT:  2      2     0.50                        blsmskl	%eax, %ecx
 # CHECK-NEXT:  2      6     0.50    *                   blsmskl	(%rax), %ecx
-# CHECK-NEXT:  1      1     0.33                        blsmskq	%rax, %rcx
+# CHECK-NEXT:  2      2     0.50                        blsmskq	%rax, %rcx
 # CHECK-NEXT:  2      6     0.50    *                   blsmskq	(%rax), %rcx
-# CHECK-NEXT:  1      1     0.33                        blsrl	%eax, %ecx
+# CHECK-NEXT:  2      2     0.50                        blsrl	%eax, %ecx
 # CHECK-NEXT:  2      6     0.50    *                   blsrl	(%rax), %ecx
-# CHECK-NEXT:  1      1     0.33                        blsrq	%rax, %rcx
+# CHECK-NEXT:  2      2     0.50                        blsrq	%rax, %rcx
 # CHECK-NEXT:  2      6     0.50    *                   blsrq	(%rax), %rcx
-# CHECK-NEXT:  1      3     1.00                        tzcntl	%eax, %ecx
-# CHECK-NEXT:  2      8     1.00    *                   tzcntl	(%rax), %ecx
-# CHECK-NEXT:  1      3     1.00                        tzcntq	%rax, %rcx
-# CHECK-NEXT:  2      8     1.00    *                   tzcntq	(%rax), %rcx
+# CHECK-NEXT:  2      2     1.00                        tzcntl	%eax, %ecx
+# CHECK-NEXT:  2      6     1.00    *                   tzcntl	(%rax), %ecx
+# CHECK-NEXT:  2      2     1.00                        tzcntq	%rax, %rcx
+# CHECK-NEXT:  2      6     1.00    *                   tzcntq	(%rax), %rcx
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -     7.33   13.33   -     7.33   6.00   6.00
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 6.00   6.00    -      -      -     14.00  14.00   -      -      -      -      -      -      -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     andnl	%eax, %ebx, %ecx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   andnl	(%rax), %ebx, %ecx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     andnq	%rax, %rbx, %rcx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   andnq	(%rax), %rbx, %rcx
-# CHECK-NEXT:  -      -     0.50   1.00    -     0.50    -      -     bextrl	%eax, %ebx, %ecx
-# CHECK-NEXT:  -      -     0.50   1.00    -     0.50   0.50   0.50   bextrl	%eax, (%rbx), %ecx
-# CHECK-NEXT:  -      -     0.50   1.00    -     0.50    -      -     bextrq	%rax, %rbx, %rcx
-# CHECK-NEXT:  -      -     0.50   1.00    -     0.50   0.50   0.50   bextrq	%rax, (%rbx), %rcx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     blsil	%eax, %ecx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   blsil	(%rax), %ecx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     blsiq	%rax, %rcx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   blsiq	(%rax), %rcx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     blsmskl	%eax, %ecx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   blsmskl	(%rax), %ecx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     blsmskq	%rax, %rcx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   blsmskq	(%rax), %rcx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     blsrl	%eax, %ecx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   blsrl	(%rax), %ecx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     blsrq	%rax, %rcx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   blsrq	(%rax), %rcx
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     tzcntl	%eax, %ecx
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   tzcntl	(%rax), %ecx
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     tzcntq	%rax, %rcx
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   tzcntq	(%rax), %rcx
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andnl	%eax, %ebx, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andnl	(%rax), %ebx, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andnq	%rax, %rbx, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andnq	(%rax), %rbx, %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     bextrl	%eax, %ebx, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     bextrl	%eax, (%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     bextrq	%rax, %rbx, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     bextrq	%rax, (%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsil	%eax, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsil	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsiq	%rax, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsiq	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsmskl	%eax, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsmskl	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsmskq	%rax, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsmskq	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsrl	%eax, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsrl	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsrq	%rax, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsrq	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     tzcntl	%eax, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     tzcntl	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     tzcntq	%rax, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     tzcntq	(%rax), %rcx
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-clflushopt.s b/test/tools/llvm-mca/X86/BdVer2/resources-clflushopt.s
index 4678467babd..1f6b9ed0b3a 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-clflushopt.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-clflushopt.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
 
 clflushopt (%rax)
 
@@ -12,22 +12,34 @@ clflushopt (%rax)
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  4      5     1.00    *      *      U     clflushopt	(%rax)
+# CHECK-NEXT:  1      5     0.50    *      *      U     clflushopt	(%rax)
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -     0.50   0.50   1.00   1.00   0.50   0.50
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -     0.50   0.50   1.00   1.00   0.50   0.50   clflushopt	(%rax)
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     clflushopt	(%rax)
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-cmov.s b/test/tools/llvm-mca/X86/BdVer2/resources-cmov.s
index e41571428d9..93151dc1a72 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-cmov.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-cmov.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
 
 cmovow    %si, %di
 cmovnow   %si, %di
@@ -112,212 +112,224 @@ cmovgq    (%rax), %rdi
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  2      2     0.67                        cmovow	%si, %di
-# CHECK-NEXT:  2      2     0.67                        cmovnow	%si, %di
-# CHECK-NEXT:  2      2     0.67                        cmovbw	%si, %di
-# CHECK-NEXT:  2      2     0.67                        cmovaew	%si, %di
-# CHECK-NEXT:  2      2     0.67                        cmovew	%si, %di
-# CHECK-NEXT:  2      2     0.67                        cmovnew	%si, %di
-# CHECK-NEXT:  3      3     1.00                        cmovbew	%si, %di
-# CHECK-NEXT:  3      3     1.00                        cmovaw	%si, %di
-# CHECK-NEXT:  2      2     0.67                        cmovsw	%si, %di
-# CHECK-NEXT:  2      2     0.67                        cmovnsw	%si, %di
-# CHECK-NEXT:  2      2     0.67                        cmovpw	%si, %di
-# CHECK-NEXT:  2      2     0.67                        cmovnpw	%si, %di
-# CHECK-NEXT:  2      2     0.67                        cmovlw	%si, %di
-# CHECK-NEXT:  2      2     0.67                        cmovgew	%si, %di
-# CHECK-NEXT:  2      2     0.67                        cmovlew	%si, %di
-# CHECK-NEXT:  2      2     0.67                        cmovgw	%si, %di
-# CHECK-NEXT:  3      7     0.67    *                   cmovow	(%rax), %di
-# CHECK-NEXT:  3      7     0.67    *                   cmovnow	(%rax), %di
-# CHECK-NEXT:  3      7     0.67    *                   cmovbw	(%rax), %di
-# CHECK-NEXT:  3      7     0.67    *                   cmovaew	(%rax), %di
-# CHECK-NEXT:  3      7     0.67    *                   cmovew	(%rax), %di
-# CHECK-NEXT:  3      7     0.67    *                   cmovnew	(%rax), %di
-# CHECK-NEXT:  4      8     1.00    *                   cmovbew	(%rax), %di
-# CHECK-NEXT:  4      8     1.00    *                   cmovaw	(%rax), %di
-# CHECK-NEXT:  3      7     0.67    *                   cmovsw	(%rax), %di
-# CHECK-NEXT:  3      7     0.67    *                   cmovnsw	(%rax), %di
-# CHECK-NEXT:  3      7     0.67    *                   cmovpw	(%rax), %di
-# CHECK-NEXT:  3      7     0.67    *                   cmovnpw	(%rax), %di
-# CHECK-NEXT:  3      7     0.67    *                   cmovlw	(%rax), %di
-# CHECK-NEXT:  3      7     0.67    *                   cmovgew	(%rax), %di
-# CHECK-NEXT:  3      7     0.67    *                   cmovlew	(%rax), %di
-# CHECK-NEXT:  3      7     0.67    *                   cmovgw	(%rax), %di
-# CHECK-NEXT:  2      2     0.67                        cmovol	%esi, %edi
-# CHECK-NEXT:  2      2     0.67                        cmovnol	%esi, %edi
-# CHECK-NEXT:  2      2     0.67                        cmovbl	%esi, %edi
-# CHECK-NEXT:  2      2     0.67                        cmovael	%esi, %edi
-# CHECK-NEXT:  2      2     0.67                        cmovel	%esi, %edi
-# CHECK-NEXT:  2      2     0.67                        cmovnel	%esi, %edi
-# CHECK-NEXT:  3      3     1.00                        cmovbel	%esi, %edi
-# CHECK-NEXT:  3      3     1.00                        cmoval	%esi, %edi
-# CHECK-NEXT:  2      2     0.67                        cmovsl	%esi, %edi
-# CHECK-NEXT:  2      2     0.67                        cmovnsl	%esi, %edi
-# CHECK-NEXT:  2      2     0.67                        cmovpl	%esi, %edi
-# CHECK-NEXT:  2      2     0.67                        cmovnpl	%esi, %edi
-# CHECK-NEXT:  2      2     0.67                        cmovll	%esi, %edi
-# CHECK-NEXT:  2      2     0.67                        cmovgel	%esi, %edi
-# CHECK-NEXT:  2      2     0.67                        cmovlel	%esi, %edi
-# CHECK-NEXT:  2      2     0.67                        cmovgl	%esi, %edi
-# CHECK-NEXT:  3      7     0.67    *                   cmovol	(%rax), %edi
-# CHECK-NEXT:  3      7     0.67    *                   cmovnol	(%rax), %edi
-# CHECK-NEXT:  3      7     0.67    *                   cmovbl	(%rax), %edi
-# CHECK-NEXT:  3      7     0.67    *                   cmovael	(%rax), %edi
-# CHECK-NEXT:  3      7     0.67    *                   cmovel	(%rax), %edi
-# CHECK-NEXT:  3      7     0.67    *                   cmovnel	(%rax), %edi
-# CHECK-NEXT:  4      8     1.00    *                   cmovbel	(%rax), %edi
-# CHECK-NEXT:  4      8     1.00    *                   cmoval	(%rax), %edi
-# CHECK-NEXT:  3      7     0.67    *                   cmovsl	(%rax), %edi
-# CHECK-NEXT:  3      7     0.67    *                   cmovnsl	(%rax), %edi
-# CHECK-NEXT:  3      7     0.67    *                   cmovpl	(%rax), %edi
-# CHECK-NEXT:  3      7     0.67    *                   cmovnpl	(%rax), %edi
-# CHECK-NEXT:  3      7     0.67    *                   cmovll	(%rax), %edi
-# CHECK-NEXT:  3      7     0.67    *                   cmovgel	(%rax), %edi
-# CHECK-NEXT:  3      7     0.67    *                   cmovlel	(%rax), %edi
-# CHECK-NEXT:  3      7     0.67    *                   cmovgl	(%rax), %edi
-# CHECK-NEXT:  2      2     0.67                        cmovoq	%rsi, %rdi
-# CHECK-NEXT:  2      2     0.67                        cmovnoq	%rsi, %rdi
-# CHECK-NEXT:  2      2     0.67                        cmovbq	%rsi, %rdi
-# CHECK-NEXT:  2      2     0.67                        cmovaeq	%rsi, %rdi
-# CHECK-NEXT:  2      2     0.67                        cmoveq	%rsi, %rdi
-# CHECK-NEXT:  2      2     0.67                        cmovneq	%rsi, %rdi
-# CHECK-NEXT:  3      3     1.00                        cmovbeq	%rsi, %rdi
-# CHECK-NEXT:  3      3     1.00                        cmovaq	%rsi, %rdi
-# CHECK-NEXT:  2      2     0.67                        cmovsq	%rsi, %rdi
-# CHECK-NEXT:  2      2     0.67                        cmovnsq	%rsi, %rdi
-# CHECK-NEXT:  2      2     0.67                        cmovpq	%rsi, %rdi
-# CHECK-NEXT:  2      2     0.67                        cmovnpq	%rsi, %rdi
-# CHECK-NEXT:  2      2     0.67                        cmovlq	%rsi, %rdi
-# CHECK-NEXT:  2      2     0.67                        cmovgeq	%rsi, %rdi
-# CHECK-NEXT:  2      2     0.67                        cmovleq	%rsi, %rdi
-# CHECK-NEXT:  2      2     0.67                        cmovgq	%rsi, %rdi
-# CHECK-NEXT:  3      7     0.67    *                   cmovoq	(%rax), %rdi
-# CHECK-NEXT:  3      7     0.67    *                   cmovnoq	(%rax), %rdi
-# CHECK-NEXT:  3      7     0.67    *                   cmovbq	(%rax), %rdi
-# CHECK-NEXT:  3      7     0.67    *                   cmovaeq	(%rax), %rdi
-# CHECK-NEXT:  3      7     0.67    *                   cmoveq	(%rax), %rdi
-# CHECK-NEXT:  3      7     0.67    *                   cmovneq	(%rax), %rdi
-# CHECK-NEXT:  4      8     1.00    *                   cmovbeq	(%rax), %rdi
-# CHECK-NEXT:  4      8     1.00    *                   cmovaq	(%rax), %rdi
-# CHECK-NEXT:  3      7     0.67    *                   cmovsq	(%rax), %rdi
-# CHECK-NEXT:  3      7     0.67    *                   cmovnsq	(%rax), %rdi
-# CHECK-NEXT:  3      7     0.67    *                   cmovpq	(%rax), %rdi
-# CHECK-NEXT:  3      7     0.67    *                   cmovnpq	(%rax), %rdi
-# CHECK-NEXT:  3      7     0.67    *                   cmovlq	(%rax), %rdi
-# CHECK-NEXT:  3      7     0.67    *                   cmovgeq	(%rax), %rdi
-# CHECK-NEXT:  3      7     0.67    *                   cmovleq	(%rax), %rdi
-# CHECK-NEXT:  3      7     0.67    *                   cmovgq	(%rax), %rdi
+# CHECK-NEXT:  1      1     0.50                        cmovow	%si, %di
+# CHECK-NEXT:  1      1     0.50                        cmovnow	%si, %di
+# CHECK-NEXT:  1      1     0.50                        cmovbw	%si, %di
+# CHECK-NEXT:  1      1     0.50                        cmovaew	%si, %di
+# CHECK-NEXT:  1      1     0.50                        cmovew	%si, %di
+# CHECK-NEXT:  1      1     0.50                        cmovnew	%si, %di
+# CHECK-NEXT:  1      1     0.50                        cmovbew	%si, %di
+# CHECK-NEXT:  1      1     0.50                        cmovaw	%si, %di
+# CHECK-NEXT:  1      1     0.50                        cmovsw	%si, %di
+# CHECK-NEXT:  1      1     0.50                        cmovnsw	%si, %di
+# CHECK-NEXT:  1      1     0.50                        cmovpw	%si, %di
+# CHECK-NEXT:  1      1     0.50                        cmovnpw	%si, %di
+# CHECK-NEXT:  1      1     0.50                        cmovlw	%si, %di
+# CHECK-NEXT:  1      1     0.50                        cmovgew	%si, %di
+# CHECK-NEXT:  1      1     0.50                        cmovlew	%si, %di
+# CHECK-NEXT:  1      1     0.50                        cmovgw	%si, %di
+# CHECK-NEXT:  1      5     0.50    *                   cmovow	(%rax), %di
+# CHECK-NEXT:  1      5     0.50    *                   cmovnow	(%rax), %di
+# CHECK-NEXT:  1      5     0.50    *                   cmovbw	(%rax), %di
+# CHECK-NEXT:  1      5     0.50    *                   cmovaew	(%rax), %di
+# CHECK-NEXT:  1      5     0.50    *                   cmovew	(%rax), %di
+# CHECK-NEXT:  1      5     0.50    *                   cmovnew	(%rax), %di
+# CHECK-NEXT:  2      5     0.50    *                   cmovbew	(%rax), %di
+# CHECK-NEXT:  2      5     0.50    *                   cmovaw	(%rax), %di
+# CHECK-NEXT:  1      5     0.50    *                   cmovsw	(%rax), %di
+# CHECK-NEXT:  1      5     0.50    *                   cmovnsw	(%rax), %di
+# CHECK-NEXT:  1      5     0.50    *                   cmovpw	(%rax), %di
+# CHECK-NEXT:  1      5     0.50    *                   cmovnpw	(%rax), %di
+# CHECK-NEXT:  2      5     0.50    *                   cmovlw	(%rax), %di
+# CHECK-NEXT:  2      5     0.50    *                   cmovgew	(%rax), %di
+# CHECK-NEXT:  2      5     0.50    *                   cmovlew	(%rax), %di
+# CHECK-NEXT:  2      5     0.50    *                   cmovgw	(%rax), %di
+# CHECK-NEXT:  1      1     0.50                        cmovol	%esi, %edi
+# CHECK-NEXT:  1      1     0.50                        cmovnol	%esi, %edi
+# CHECK-NEXT:  1      1     0.50                        cmovbl	%esi, %edi
+# CHECK-NEXT:  1      1     0.50                        cmovael	%esi, %edi
+# CHECK-NEXT:  1      1     0.50                        cmovel	%esi, %edi
+# CHECK-NEXT:  1      1     0.50                        cmovnel	%esi, %edi
+# CHECK-NEXT:  1      1     0.50                        cmovbel	%esi, %edi
+# CHECK-NEXT:  1      1     0.50                        cmoval	%esi, %edi
+# CHECK-NEXT:  1      1     0.50                        cmovsl	%esi, %edi
+# CHECK-NEXT:  1      1     0.50                        cmovnsl	%esi, %edi
+# CHECK-NEXT:  1      1     0.50                        cmovpl	%esi, %edi
+# CHECK-NEXT:  1      1     0.50                        cmovnpl	%esi, %edi
+# CHECK-NEXT:  1      1     0.50                        cmovll	%esi, %edi
+# CHECK-NEXT:  1      1     0.50                        cmovgel	%esi, %edi
+# CHECK-NEXT:  1      1     0.50                        cmovlel	%esi, %edi
+# CHECK-NEXT:  1      1     0.50                        cmovgl	%esi, %edi
+# CHECK-NEXT:  1      5     0.50    *                   cmovol	(%rax), %edi
+# CHECK-NEXT:  1      5     0.50    *                   cmovnol	(%rax), %edi
+# CHECK-NEXT:  1      5     0.50    *                   cmovbl	(%rax), %edi
+# CHECK-NEXT:  1      5     0.50    *                   cmovael	(%rax), %edi
+# CHECK-NEXT:  1      5     0.50    *                   cmovel	(%rax), %edi
+# CHECK-NEXT:  1      5     0.50    *                   cmovnel	(%rax), %edi
+# CHECK-NEXT:  2      5     0.50    *                   cmovbel	(%rax), %edi
+# CHECK-NEXT:  2      5     0.50    *                   cmoval	(%rax), %edi
+# CHECK-NEXT:  1      5     0.50    *                   cmovsl	(%rax), %edi
+# CHECK-NEXT:  1      5     0.50    *                   cmovnsl	(%rax), %edi
+# CHECK-NEXT:  1      5     0.50    *                   cmovpl	(%rax), %edi
+# CHECK-NEXT:  1      5     0.50    *                   cmovnpl	(%rax), %edi
+# CHECK-NEXT:  2      5     0.50    *                   cmovll	(%rax), %edi
+# CHECK-NEXT:  2      5     0.50    *                   cmovgel	(%rax), %edi
+# CHECK-NEXT:  2      5     0.50    *                   cmovlel	(%rax), %edi
+# CHECK-NEXT:  2      5     0.50    *                   cmovgl	(%rax), %edi
+# CHECK-NEXT:  1      1     0.50                        cmovoq	%rsi, %rdi
+# CHECK-NEXT:  1      1     0.50                        cmovnoq	%rsi, %rdi
+# CHECK-NEXT:  1      1     0.50                        cmovbq	%rsi, %rdi
+# CHECK-NEXT:  1      1     0.50                        cmovaeq	%rsi, %rdi
+# CHECK-NEXT:  1      1     0.50                        cmoveq	%rsi, %rdi
+# CHECK-NEXT:  1      1     0.50                        cmovneq	%rsi, %rdi
+# CHECK-NEXT:  1      1     0.50                        cmovbeq	%rsi, %rdi
+# CHECK-NEXT:  1      1     0.50                        cmovaq	%rsi, %rdi
+# CHECK-NEXT:  1      1     0.50                        cmovsq	%rsi, %rdi
+# CHECK-NEXT:  1      1     0.50                        cmovnsq	%rsi, %rdi
+# CHECK-NEXT:  1      1     0.50                        cmovpq	%rsi, %rdi
+# CHECK-NEXT:  1      1     0.50                        cmovnpq	%rsi, %rdi
+# CHECK-NEXT:  1      1     0.50                        cmovlq	%rsi, %rdi
+# CHECK-NEXT:  1      1     0.50                        cmovgeq	%rsi, %rdi
+# CHECK-NEXT:  1      1     0.50                        cmovleq	%rsi, %rdi
+# CHECK-NEXT:  1      1     0.50                        cmovgq	%rsi, %rdi
+# CHECK-NEXT:  1      5     0.50    *                   cmovoq	(%rax), %rdi
+# CHECK-NEXT:  1      5     0.50    *                   cmovnoq	(%rax), %rdi
+# CHECK-NEXT:  1      5     0.50    *                   cmovbq	(%rax), %rdi
+# CHECK-NEXT:  1      5     0.50    *                   cmovaeq	(%rax), %rdi
+# CHECK-NEXT:  1      5     0.50    *                   cmoveq	(%rax), %rdi
+# CHECK-NEXT:  1      5     0.50    *                   cmovneq	(%rax), %rdi
+# CHECK-NEXT:  2      5     0.50    *                   cmovbeq	(%rax), %rdi
+# CHECK-NEXT:  2      5     0.50    *                   cmovaq	(%rax), %rdi
+# CHECK-NEXT:  1      5     0.50    *                   cmovsq	(%rax), %rdi
+# CHECK-NEXT:  1      5     0.50    *                   cmovnsq	(%rax), %rdi
+# CHECK-NEXT:  1      5     0.50    *                   cmovpq	(%rax), %rdi
+# CHECK-NEXT:  1      5     0.50    *                   cmovnpq	(%rax), %rdi
+# CHECK-NEXT:  2      5     0.50    *                   cmovlq	(%rax), %rdi
+# CHECK-NEXT:  2      5     0.50    *                   cmovgeq	(%rax), %rdi
+# CHECK-NEXT:  2      5     0.50    *                   cmovleq	(%rax), %rdi
+# CHECK-NEXT:  2      5     0.50    *                   cmovgq	(%rax), %rdi
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -     86.00  32.00   -     86.00  24.00  24.00
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 24.00  24.00   -      -      -     48.00  48.00   -      -      -      -      -      -      -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovow	%si, %di
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovnow	%si, %di
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovbw	%si, %di
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovaew	%si, %di
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovew	%si, %di
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovnew	%si, %di
-# CHECK-NEXT:  -      -     1.33   0.33    -     1.33    -      -     cmovbew	%si, %di
-# CHECK-NEXT:  -      -     1.33   0.33    -     1.33    -      -     cmovaw	%si, %di
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovsw	%si, %di
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovnsw	%si, %di
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovpw	%si, %di
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovnpw	%si, %di
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovlw	%si, %di
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovgew	%si, %di
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovlew	%si, %di
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovgw	%si, %di
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovow	(%rax), %di
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovnow	(%rax), %di
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovbw	(%rax), %di
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovaew	(%rax), %di
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovew	(%rax), %di
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovnew	(%rax), %di
-# CHECK-NEXT:  -      -     1.33   0.33    -     1.33   0.50   0.50   cmovbew	(%rax), %di
-# CHECK-NEXT:  -      -     1.33   0.33    -     1.33   0.50   0.50   cmovaw	(%rax), %di
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovsw	(%rax), %di
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovnsw	(%rax), %di
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovpw	(%rax), %di
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovnpw	(%rax), %di
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovlw	(%rax), %di
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovgew	(%rax), %di
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovlew	(%rax), %di
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovgw	(%rax), %di
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovol	%esi, %edi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovnol	%esi, %edi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovbl	%esi, %edi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovael	%esi, %edi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovel	%esi, %edi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovnel	%esi, %edi
-# CHECK-NEXT:  -      -     1.33   0.33    -     1.33    -      -     cmovbel	%esi, %edi
-# CHECK-NEXT:  -      -     1.33   0.33    -     1.33    -      -     cmoval	%esi, %edi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovsl	%esi, %edi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovnsl	%esi, %edi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovpl	%esi, %edi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovnpl	%esi, %edi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovll	%esi, %edi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovgel	%esi, %edi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovlel	%esi, %edi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovgl	%esi, %edi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovol	(%rax), %edi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovnol	(%rax), %edi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovbl	(%rax), %edi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovael	(%rax), %edi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovel	(%rax), %edi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovnel	(%rax), %edi
-# CHECK-NEXT:  -      -     1.33   0.33    -     1.33   0.50   0.50   cmovbel	(%rax), %edi
-# CHECK-NEXT:  -      -     1.33   0.33    -     1.33   0.50   0.50   cmoval	(%rax), %edi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovsl	(%rax), %edi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovnsl	(%rax), %edi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovpl	(%rax), %edi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovnpl	(%rax), %edi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovll	(%rax), %edi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovgel	(%rax), %edi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovlel	(%rax), %edi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovgl	(%rax), %edi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovoq	%rsi, %rdi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovnoq	%rsi, %rdi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovbq	%rsi, %rdi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovaeq	%rsi, %rdi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmoveq	%rsi, %rdi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovneq	%rsi, %rdi
-# CHECK-NEXT:  -      -     1.33   0.33    -     1.33    -      -     cmovbeq	%rsi, %rdi
-# CHECK-NEXT:  -      -     1.33   0.33    -     1.33    -      -     cmovaq	%rsi, %rdi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovsq	%rsi, %rdi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovnsq	%rsi, %rdi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovpq	%rsi, %rdi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovnpq	%rsi, %rdi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovlq	%rsi, %rdi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovgeq	%rsi, %rdi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovleq	%rsi, %rdi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     cmovgq	%rsi, %rdi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovoq	(%rax), %rdi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovnoq	(%rax), %rdi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovbq	(%rax), %rdi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovaeq	(%rax), %rdi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmoveq	(%rax), %rdi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovneq	(%rax), %rdi
-# CHECK-NEXT:  -      -     1.33   0.33    -     1.33   0.50   0.50   cmovbeq	(%rax), %rdi
-# CHECK-NEXT:  -      -     1.33   0.33    -     1.33   0.50   0.50   cmovaq	(%rax), %rdi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovsq	(%rax), %rdi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovnsq	(%rax), %rdi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovpq	(%rax), %rdi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovnpq	(%rax), %rdi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovlq	(%rax), %rdi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovgeq	(%rax), %rdi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovleq	(%rax), %rdi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   cmovgq	(%rax), %rdi
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovow	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnow	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovbw	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovaew	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovew	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnew	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovbew	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovaw	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovsw	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnsw	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovpw	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnpw	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovlw	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovgew	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovlew	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovgw	%si, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovow	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnow	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovbw	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovaew	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovew	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnew	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovbew	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovaw	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovsw	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnsw	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovpw	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnpw	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovlw	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovgew	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovlew	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovgw	(%rax), %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovol	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnol	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovbl	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovael	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovel	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnel	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovbel	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmoval	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovsl	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnsl	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovpl	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnpl	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovll	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovgel	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovlel	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovgl	%esi, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovol	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnol	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovbl	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovael	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovel	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnel	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovbel	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmoval	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovsl	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnsl	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovpl	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnpl	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovll	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovgel	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovlel	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovgl	(%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovoq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnoq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovbq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovaeq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmoveq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovneq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovbeq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovaq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovsq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnsq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovpq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnpq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovlq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovgeq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovleq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovgq	%rsi, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovoq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnoq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovbq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovaeq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmoveq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovneq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovbeq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovaq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovsq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnsq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovpq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnpq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovlq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovgeq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovleq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovgq	(%rax), %rdi
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-cmpxchg.s b/test/tools/llvm-mca/X86/BdVer2/resources-cmpxchg.s
index 19a220702b1..d0ec04a5ee0 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-cmpxchg.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-cmpxchg.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
 
 cmpxchg8b  (%rax)
 cmpxchg16b (%rax)
@@ -13,24 +13,36 @@ cmpxchg16b (%rax)
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  3      6     1.00    *      *            cmpxchg8b	(%rax)
-# CHECK-NEXT:  3      6     1.00    *      *            cmpxchg16b	(%rax)
+# CHECK-NEXT:  18     3     1.00    *      *            cmpxchg8b	(%rax)
+# CHECK-NEXT:  22     3     1.00    *      *            cmpxchg16b	(%rax)
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -     0.67   0.67   2.00   0.67   2.00   2.00
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -      -      -      -      -      -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   cmpxchg8b	(%rax)
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   cmpxchg16b	(%rax)
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchg8b	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchg16b	(%rax)
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-f16c.s b/test/tools/llvm-mca/X86/BdVer2/resources-f16c.s
index 7dea75f8f8f..757687a4af7 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-f16c.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-f16c.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
 
 vcvtph2ps   %xmm0, %xmm2
 vcvtph2ps   (%rax), %xmm2
@@ -22,36 +22,48 @@ vcvtps2ph   $0, %ymm0, (%rax)
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      3     1.00                        vcvtph2ps	%xmm0, %xmm2
-# CHECK-NEXT:  2      8     1.00    *                   vcvtph2ps	(%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        vcvtph2ps	%xmm0, %ymm2
-# CHECK-NEXT:  2      8     1.00    *                   vcvtph2ps	(%rax), %ymm2
-# CHECK-NEXT:  1      3     1.00                        vcvtps2ph	$0, %xmm0, %xmm2
-# CHECK-NEXT:  1      4     1.00           *            vcvtps2ph	$0, %xmm0, (%rax)
-# CHECK-NEXT:  1      3     1.00                        vcvtps2ph	$0, %ymm0, %xmm2
-# CHECK-NEXT:  1      4     1.00           *            vcvtps2ph	$0, %ymm0, (%rax)
+# CHECK-NEXT:  2      8     1.00                        vcvtph2ps	%xmm0, %xmm2
+# CHECK-NEXT:  3      13    1.00    *                   vcvtph2ps	(%rax), %xmm2
+# CHECK-NEXT:  4      8     2.00                        vcvtph2ps	%xmm0, %ymm2
+# CHECK-NEXT:  7      13    2.00    *                   vcvtph2ps	(%rax), %ymm2
+# CHECK-NEXT:  2      8     1.00                        vcvtps2ph	$0, %xmm0, %xmm2
+# CHECK-NEXT:  3      4     1.00           *            vcvtps2ph	$0, %xmm0, (%rax)
+# CHECK-NEXT:  4      8     2.00                        vcvtps2ph	$0, %ymm0, %xmm2
+# CHECK-NEXT:  4      4     2.00           *            vcvtps2ph	$0, %ymm0, (%rax)
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -      -     8.00   2.00    -     2.00   2.00
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 2.50   2.50    -      -      -      -      -      -     1.00   1.00    -      -      -     8.00    -     12.00   -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vcvtph2ps	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vcvtph2ps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vcvtph2ps	%xmm0, %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vcvtph2ps	(%rax), %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vcvtps2ph	$0, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00   1.00    -     0.50   0.50   vcvtps2ph	$0, %xmm0, (%rax)
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vcvtps2ph	$0, %ymm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00   1.00    -     0.50   0.50   vcvtps2ph	$0, %ymm0, (%rax)
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtph2ps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtph2ps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vcvtph2ps	%xmm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vcvtph2ps	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtps2ph	$0, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtps2ph	$0, %xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -     1.00    -     2.00    -      -      -      -     vcvtps2ph	$0, %ymm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -     1.00    -     2.00    -      -      -      -     vcvtps2ph	$0, %ymm0, (%rax)
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-fma.s b/test/tools/llvm-mca/X86/BdVer2/resources-fma.s
index 05b63edb7f9..104b07fc5e6 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-fma.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-fma.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
 
 vfmadd132pd %xmm0, %xmm1, %xmm2
 vfmadd132pd (%rax), %xmm1, %xmm2
@@ -299,403 +299,415 @@ vfnmsub231ss (%rax), %xmm1, %xmm2
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 # CHECK-NEXT:  1      5     0.50                        vfmadd132pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmadd132pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     0.50                        vfmadd132pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmadd132pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmadd132pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmadd132pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmadd132pd	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      5     0.50                        vfmadd213pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmadd213pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     0.50                        vfmadd213pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmadd213pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmadd213pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmadd213pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmadd213pd	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      5     0.50                        vfmadd231pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmadd231pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     0.50                        vfmadd231pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmadd231pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmadd231pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmadd231pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmadd231pd	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      5     0.50                        vfmadd132ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmadd132ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     0.50                        vfmadd132ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmadd132ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmadd132ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmadd132ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmadd132ps	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      5     0.50                        vfmadd213ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmadd213ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     0.50                        vfmadd213ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmadd213ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmadd213ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmadd213ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmadd213ps	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      5     0.50                        vfmadd231ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmadd231ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     0.50                        vfmadd231ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmadd231ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmadd231ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmadd231ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmadd231ps	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      5     0.50                        vfmadd132sd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmadd132sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmadd132sd	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      5     0.50                        vfmadd213sd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmadd213sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmadd213sd	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      5     0.50                        vfmadd231sd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmadd231sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmadd231sd	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      5     0.50                        vfmadd132ss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmadd132ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmadd132ss	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      5     0.50                        vfmadd213ss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmadd213ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmadd213ss	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      5     0.50                        vfmadd231ss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmadd231ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmadd231ss	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      5     0.50                        vfmaddsub132pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmaddsub132pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     0.50                        vfmaddsub132pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmaddsub132pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmaddsub132pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmaddsub132pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmaddsub132pd	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      5     0.50                        vfmaddsub213pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmaddsub213pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     0.50                        vfmaddsub213pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmaddsub213pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmaddsub213pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmaddsub213pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmaddsub213pd	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      5     0.50                        vfmaddsub231pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmaddsub231pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     0.50                        vfmaddsub231pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmaddsub231pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmaddsub231pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmaddsub231pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmaddsub231pd	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      5     0.50                        vfmaddsub132ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmaddsub132ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     0.50                        vfmaddsub132ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmaddsub132ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmaddsub132ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmaddsub132ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmaddsub132ps	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      5     0.50                        vfmaddsub213ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmaddsub213ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     0.50                        vfmaddsub213ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmaddsub213ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmaddsub213ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmaddsub213ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmaddsub213ps	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      5     0.50                        vfmaddsub231ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmaddsub231ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     0.50                        vfmaddsub231ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmaddsub231ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmaddsub231ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmaddsub231ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmaddsub231ps	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      5     0.50                        vfmsub132pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmsub132pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     0.50                        vfmsub132pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmsub132pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmsub132pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmsub132pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmsub132pd	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      5     0.50                        vfmsub213pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmsub213pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     0.50                        vfmsub213pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmsub213pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmsub213pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmsub213pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmsub213pd	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      5     0.50                        vfmsub231pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmsub231pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     0.50                        vfmsub231pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmsub231pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmsub231pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmsub231pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmsub231pd	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      5     0.50                        vfmsub132ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmsub132ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     0.50                        vfmsub132ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmsub132ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmsub132ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmsub132ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmsub132ps	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      5     0.50                        vfmsub213ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmsub213ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     0.50                        vfmsub213ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmsub213ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmsub213ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmsub213ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmsub213ps	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      5     0.50                        vfmsub231ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmsub231ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     0.50                        vfmsub231ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmsub231ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmsub231ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmsub231ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmsub231ps	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      5     0.50                        vfmsub132sd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmsub132sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmsub132sd	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      5     0.50                        vfmsub213sd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmsub213sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmsub213sd	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      5     0.50                        vfmsub231sd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmsub231sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmsub231sd	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      5     0.50                        vfmsub132ss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmsub132ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmsub132ss	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      5     0.50                        vfmsub213ss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmsub213ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmsub213ss	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      5     0.50                        vfmsub231ss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmsub231ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmsub231ss	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      5     0.50                        vfmsubadd132pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmsubadd132pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     0.50                        vfmsubadd132pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmsubadd132pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmsubadd132pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmsubadd132pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmsubadd132pd	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      5     0.50                        vfmsubadd213pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmsubadd213pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     0.50                        vfmsubadd213pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmsubadd213pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmsubadd213pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmsubadd213pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmsubadd213pd	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      5     0.50                        vfmsubadd231pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmsubadd231pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     0.50                        vfmsubadd231pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmsubadd231pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmsubadd231pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmsubadd231pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmsubadd231pd	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      5     0.50                        vfmsubadd132ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmsubadd132ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     0.50                        vfmsubadd132ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmsubadd132ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmsubadd132ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmsubadd132ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmsubadd132ps	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      5     0.50                        vfmsubadd213ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmsubadd213ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     0.50                        vfmsubadd213ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmsubadd213ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmsubadd213ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmsubadd213ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmsubadd213ps	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      5     0.50                        vfmsubadd231ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmsubadd231ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     0.50                        vfmsubadd231ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    0.50    *                   vfmsubadd231ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmsubadd231ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmsubadd231ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmsubadd231ps	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      5     0.50                        vfnmadd132pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfnmadd132pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     0.50                        vfnmadd132pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    0.50    *                   vfnmadd132pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmadd132pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfnmadd132pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfnmadd132pd	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      5     0.50                        vfnmadd213pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfnmadd213pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     0.50                        vfnmadd213pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    0.50    *                   vfnmadd213pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmadd213pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfnmadd213pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfnmadd213pd	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      5     0.50                        vfnmadd231pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfnmadd231pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     0.50                        vfnmadd231pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    0.50    *                   vfnmadd231pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmadd231pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfnmadd231pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfnmadd231pd	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      5     0.50                        vfnmadd132ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfnmadd132ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     0.50                        vfnmadd132ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    0.50    *                   vfnmadd132ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmadd132ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfnmadd132ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfnmadd132ps	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      5     0.50                        vfnmadd213ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfnmadd213ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     0.50                        vfnmadd213ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    0.50    *                   vfnmadd213ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmadd213ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfnmadd213ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfnmadd213ps	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      5     0.50                        vfnmadd231ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfnmadd231ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     0.50                        vfnmadd231ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    0.50    *                   vfnmadd231ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmadd231ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfnmadd231ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfnmadd231ps	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      5     0.50                        vfnmadd132sd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfnmadd132sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmadd132sd	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      5     0.50                        vfnmadd213sd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfnmadd213sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmadd213sd	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      5     0.50                        vfnmadd231sd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfnmadd231sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmadd231sd	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      5     0.50                        vfnmadd132ss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfnmadd132ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmadd132ss	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      5     0.50                        vfnmadd213ss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfnmadd213ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmadd213ss	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      5     0.50                        vfnmadd231ss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfnmadd231ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmadd231ss	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      5     0.50                        vfnmsub132pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfnmsub132pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     0.50                        vfnmsub132pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    0.50    *                   vfnmsub132pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmsub132pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfnmsub132pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfnmsub132pd	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      5     0.50                        vfnmsub213pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfnmsub213pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     0.50                        vfnmsub213pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    0.50    *                   vfnmsub213pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmsub213pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfnmsub213pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfnmsub213pd	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      5     0.50                        vfnmsub231pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfnmsub231pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     0.50                        vfnmsub231pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    0.50    *                   vfnmsub231pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmsub231pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfnmsub231pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfnmsub231pd	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      5     0.50                        vfnmsub132ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfnmsub132ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     0.50                        vfnmsub132ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    0.50    *                   vfnmsub132ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmsub132ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfnmsub132ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfnmsub132ps	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      5     0.50                        vfnmsub213ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfnmsub213ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     0.50                        vfnmsub213ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    0.50    *                   vfnmsub213ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmsub213ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfnmsub213ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfnmsub213ps	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      5     0.50                        vfnmsub231ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfnmsub231ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      5     0.50                        vfnmsub231ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    0.50    *                   vfnmsub231ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmsub231ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfnmsub231ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfnmsub231ps	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      5     0.50                        vfnmsub132sd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfnmsub132sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmsub132sd	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      5     0.50                        vfnmsub213sd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfnmsub213sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmsub213sd	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      5     0.50                        vfnmsub231sd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfnmsub231sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmsub231sd	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      5     0.50                        vfnmsub132ss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfnmsub132ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmsub132ss	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      5     0.50                        vfnmsub213ss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfnmsub213ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmsub213ss	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      5     0.50                        vfnmsub231ss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vfnmsub231ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmsub231ss	(%rax), %xmm1, %xmm2
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -     96.00  96.00   -      -     48.00  48.00
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 66.00  66.00   -      -      -      -      -      -     96.00  96.00   -      -      -      -     48.00  48.00  48.00  48.00   -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmadd132pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmadd132pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmadd132pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmadd132pd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmadd213pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmadd213pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmadd213pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmadd213pd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmadd231pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmadd231pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmadd231pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmadd231pd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmadd132ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmadd132ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmadd132ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmadd132ps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmadd213ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmadd213ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmadd213ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmadd213ps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmadd231ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmadd231ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmadd231ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmadd231ps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmadd132sd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmadd132sd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmadd213sd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmadd213sd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmadd231sd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmadd231sd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmadd132ss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmadd132ss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmadd213ss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmadd213ss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmadd231ss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmadd231ss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddsub132pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsub132pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddsub132pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsub132pd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddsub213pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsub213pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddsub213pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsub213pd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddsub231pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsub231pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddsub231pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsub231pd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddsub132ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsub132ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddsub132ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsub132ps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddsub213ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsub213ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddsub213ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsub213ps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddsub231ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsub231ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddsub231ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsub231ps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsub132pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsub132pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsub132pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsub132pd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsub213pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsub213pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsub213pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsub213pd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsub231pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsub231pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsub231pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsub231pd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsub132ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsub132ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsub132ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsub132ps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsub213ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsub213ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsub213ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsub213ps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsub231ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsub231ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsub231ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsub231ps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsub132sd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsub132sd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsub213sd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsub213sd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsub231sd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsub231sd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsub132ss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsub132ss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsub213ss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsub213ss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsub231ss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsub231ss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubadd132pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubadd132pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubadd132pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubadd132pd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubadd213pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubadd213pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubadd213pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubadd213pd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubadd231pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubadd231pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubadd231pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubadd231pd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubadd132ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubadd132ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubadd132ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubadd132ps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubadd213ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubadd213ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubadd213ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubadd213ps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubadd231ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubadd231ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubadd231ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubadd231ps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmadd132pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmadd132pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmadd132pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmadd132pd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmadd213pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmadd213pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmadd213pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmadd213pd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmadd231pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmadd231pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmadd231pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmadd231pd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmadd132ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmadd132ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmadd132ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmadd132ps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmadd213ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmadd213ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmadd213ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmadd213ps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmadd231ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmadd231ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmadd231ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmadd231ps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmadd132sd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmadd132sd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmadd213sd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmadd213sd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmadd231sd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmadd231sd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmadd132ss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmadd132ss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmadd213ss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmadd213ss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmadd231ss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmadd231ss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsub132pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsub132pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsub132pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsub132pd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsub213pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsub213pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsub213pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsub213pd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsub231pd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsub231pd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsub231pd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsub231pd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsub132ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsub132ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsub132ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsub132ps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsub213ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsub213ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsub213ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsub213ps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsub231ps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsub231ps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsub231ps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsub231ps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsub132sd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsub132sd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsub213sd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsub213sd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsub231sd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsub231sd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsub132ss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsub132ss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsub213ss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsub213ss	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsub231ss	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsub231ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd132pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd132pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd132pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd132pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd213pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd213pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd213pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd213pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd231pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd231pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd231pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd231pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd132ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd132ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd132ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd132ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd213ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd213ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd213ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd213ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd231ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd231ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd231ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd231ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd132sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd132sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd213sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd213sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd231sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd231sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd132ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd132ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd213ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd213ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd231ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd231ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub132pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub132pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub132pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub132pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub213pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub213pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub213pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub213pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub231pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub231pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub231pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub231pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub132ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub132ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub132ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub132ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub213ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub213ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub213ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub213ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub231ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub231ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub231ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub231ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub132pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub132pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub132pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub132pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub213pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub213pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub213pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub213pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub231pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub231pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub231pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub231pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub132ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub132ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub132ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub132ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub213ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub213ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub213ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub213ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub231ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub231ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub231ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub231ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub132sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub132sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub213sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub213sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub231sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub231sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub132ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub132ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub213ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub213ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub231ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub231ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd132pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd132pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd132pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd132pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd213pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd213pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd213pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd213pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd231pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd231pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd231pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd231pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd132ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd132ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd132ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd132ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd213ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd213ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd213ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd213ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd231ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd231ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd231ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd231ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd132pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd132pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd132pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd132pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd213pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd213pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd213pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd213pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd231pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd231pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd231pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd231pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd132ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd132ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd132ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd132ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd213ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd213ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd213ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd213ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd231ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd231ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd231ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd231ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd132sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd132sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd213sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd213sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd231sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd231sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd132ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd132ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd213ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd213ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd231ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd231ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub132pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub132pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub132pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub132pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub213pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub213pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub213pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub213pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub231pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub231pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub231pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub231pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub132ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub132ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub132ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub132ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub213ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub213ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub213ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub213ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub231ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub231ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub231ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub231ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub132sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub132sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub213sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub213sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub231sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub231sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub132ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub132ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub213ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub213ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub231ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub231ss	(%rax), %xmm1, %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-fma4.s b/test/tools/llvm-mca/X86/BdVer2/resources-fma4.s
index cc428167b23..b45abdfd387 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-fma4.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-fma4.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
 
 vfmaddpd    %xmm0, %xmm1, %xmm2, %xmm3
 vfmaddpd    (%rax), %xmm1, %xmm2, %xmm3
@@ -139,211 +139,223 @@ vfnmsubss   %xmm0, (%rax), %xmm2, %xmm3
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 # CHECK-NEXT:  1      5     0.50                        vfmaddpd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      10    0.50    *                   vfmaddpd	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      10    0.50    *                   vfmaddpd	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  1      5     0.50                        vfmaddpd	%ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  2      10    0.50    *                   vfmaddpd	(%rax), %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  2      10    0.50    *                   vfmaddpd	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmaddpd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmaddpd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  2      5     0.50                        vfmaddpd	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfmaddpd	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfmaddpd	%ymm0, (%rax), %ymm2, %ymm3
 # CHECK-NEXT:  1      5     0.50                        vfmaddps	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      10    0.50    *                   vfmaddps	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      10    0.50    *                   vfmaddps	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  1      5     0.50                        vfmaddps	%ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  2      10    0.50    *                   vfmaddps	(%rax), %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  2      10    0.50    *                   vfmaddps	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmaddps	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmaddps	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  2      5     0.50                        vfmaddps	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfmaddps	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfmaddps	%ymm0, (%rax), %ymm2, %ymm3
 # CHECK-NEXT:  1      5     0.50                        vfmaddsd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      10    0.50    *                   vfmaddsd	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      10    0.50    *                   vfmaddsd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmaddsd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmaddsd	%xmm0, (%rax), %xmm2, %xmm3
 # CHECK-NEXT:  1      5     0.50                        vfmaddss	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      10    0.50    *                   vfmaddss	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      10    0.50    *                   vfmaddss	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmaddss	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmaddss	%xmm0, (%rax), %xmm2, %xmm3
 # CHECK-NEXT:  1      5     0.50                        vfmaddsubpd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      10    0.50    *                   vfmaddsubpd	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      10    0.50    *                   vfmaddsubpd	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  1      5     0.50                        vfmaddsubpd	%ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  2      10    0.50    *                   vfmaddsubpd	(%rax), %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  2      10    0.50    *                   vfmaddsubpd	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmaddsubpd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmaddsubpd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  2      5     0.50                        vfmaddsubpd	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfmaddsubpd	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfmaddsubpd	%ymm0, (%rax), %ymm2, %ymm3
 # CHECK-NEXT:  1      5     0.50                        vfmaddsubps	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      10    0.50    *                   vfmaddsubps	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      10    0.50    *                   vfmaddsubps	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  1      5     0.50                        vfmaddsubps	%ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  2      10    0.50    *                   vfmaddsubps	(%rax), %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  2      10    0.50    *                   vfmaddsubps	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmaddsubps	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmaddsubps	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  2      5     0.50                        vfmaddsubps	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfmaddsubps	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfmaddsubps	%ymm0, (%rax), %ymm2, %ymm3
 # CHECK-NEXT:  1      5     0.50                        vfmsubaddpd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      10    0.50    *                   vfmsubaddpd	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      10    0.50    *                   vfmsubaddpd	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  1      5     0.50                        vfmsubaddpd	%ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  2      10    0.50    *                   vfmsubaddpd	(%rax), %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  2      10    0.50    *                   vfmsubaddpd	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmsubaddpd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmsubaddpd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  2      5     0.50                        vfmsubaddpd	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfmsubaddpd	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfmsubaddpd	%ymm0, (%rax), %ymm2, %ymm3
 # CHECK-NEXT:  1      5     0.50                        vfmsubaddps	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      10    0.50    *                   vfmsubaddps	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      10    0.50    *                   vfmsubaddps	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  1      5     0.50                        vfmsubaddps	%ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  2      10    0.50    *                   vfmsubaddps	(%rax), %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  2      10    0.50    *                   vfmsubaddps	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmsubaddps	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmsubaddps	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  2      5     0.50                        vfmsubaddps	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfmsubaddps	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfmsubaddps	%ymm0, (%rax), %ymm2, %ymm3
 # CHECK-NEXT:  1      5     0.50                        vfmsubpd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      10    0.50    *                   vfmsubpd	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      10    0.50    *                   vfmsubpd	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  1      5     0.50                        vfmsubpd	%ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  2      10    0.50    *                   vfmsubpd	(%rax), %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  2      10    0.50    *                   vfmsubpd	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmsubpd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmsubpd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  2      5     0.50                        vfmsubpd	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfmsubpd	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfmsubpd	%ymm0, (%rax), %ymm2, %ymm3
 # CHECK-NEXT:  1      5     0.50                        vfmsubps	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      10    0.50    *                   vfmsubps	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      10    0.50    *                   vfmsubps	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  1      5     0.50                        vfmsubps	%ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  2      10    0.50    *                   vfmsubps	(%rax), %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  2      10    0.50    *                   vfmsubps	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmsubps	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmsubps	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  2      5     0.50                        vfmsubps	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfmsubps	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfmsubps	%ymm0, (%rax), %ymm2, %ymm3
 # CHECK-NEXT:  1      5     0.50                        vfmsubsd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      10    0.50    *                   vfmsubsd	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      10    0.50    *                   vfmsubsd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmsubsd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmsubsd	%xmm0, (%rax), %xmm2, %xmm3
 # CHECK-NEXT:  1      5     0.50                        vfmsubss	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      10    0.50    *                   vfmsubss	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      10    0.50    *                   vfmsubss	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmsubss	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmsubss	%xmm0, (%rax), %xmm2, %xmm3
 # CHECK-NEXT:  1      5     0.50                        vfnmaddpd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      10    0.50    *                   vfnmaddpd	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      10    0.50    *                   vfnmaddpd	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  1      5     0.50                        vfnmaddpd	%ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  2      10    0.50    *                   vfnmaddpd	(%rax), %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  2      10    0.50    *                   vfnmaddpd	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  1      10    0.50    *                   vfnmaddpd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfnmaddpd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  2      5     0.50                        vfnmaddpd	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfnmaddpd	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfnmaddpd	%ymm0, (%rax), %ymm2, %ymm3
 # CHECK-NEXT:  1      5     0.50                        vfnmaddps	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      10    0.50    *                   vfnmaddps	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      10    0.50    *                   vfnmaddps	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  1      5     0.50                        vfnmaddps	%ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  2      10    0.50    *                   vfnmaddps	(%rax), %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  2      10    0.50    *                   vfnmaddps	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  1      10    0.50    *                   vfnmaddps	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfnmaddps	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  2      5     0.50                        vfnmaddps	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfnmaddps	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfnmaddps	%ymm0, (%rax), %ymm2, %ymm3
 # CHECK-NEXT:  1      5     0.50                        vfnmaddsd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      10    0.50    *                   vfnmaddsd	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      10    0.50    *                   vfnmaddsd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfnmaddsd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfnmaddsd	%xmm0, (%rax), %xmm2, %xmm3
 # CHECK-NEXT:  1      5     0.50                        vfnmaddss	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      10    0.50    *                   vfnmaddss	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      10    0.50    *                   vfnmaddss	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfnmaddss	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfnmaddss	%xmm0, (%rax), %xmm2, %xmm3
 # CHECK-NEXT:  1      5     0.50                        vfnmsubpd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      10    0.50    *                   vfnmsubpd	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      10    0.50    *                   vfnmsubpd	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  1      5     0.50                        vfnmsubpd	%ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  2      10    0.50    *                   vfnmsubpd	(%rax), %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  2      10    0.50    *                   vfnmsubpd	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  1      10    0.50    *                   vfnmsubpd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfnmsubpd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  2      5     0.50                        vfnmsubpd	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfnmsubpd	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfnmsubpd	%ymm0, (%rax), %ymm2, %ymm3
 # CHECK-NEXT:  1      5     0.50                        vfnmsubps	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      10    0.50    *                   vfnmsubps	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      10    0.50    *                   vfnmsubps	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  1      5     0.50                        vfnmsubps	%ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  2      10    0.50    *                   vfnmsubps	(%rax), %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  2      10    0.50    *                   vfnmsubps	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  1      10    0.50    *                   vfnmsubps	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfnmsubps	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  2      5     0.50                        vfnmsubps	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfnmsubps	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfnmsubps	%ymm0, (%rax), %ymm2, %ymm3
 # CHECK-NEXT:  1      5     0.50                        vfnmsubsd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      10    0.50    *                   vfnmsubsd	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      10    0.50    *                   vfnmsubsd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfnmsubsd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfnmsubsd	%xmm0, (%rax), %xmm2, %xmm3
 # CHECK-NEXT:  1      5     0.50                        vfnmsubss	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      10    0.50    *                   vfnmsubss	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      10    0.50    *                   vfnmsubss	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfnmsubss	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfnmsubss	%xmm0, (%rax), %xmm2, %xmm3
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -     48.00  48.00   -      -     32.00  32.00
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 44.00  44.00   -      -      -      -      -      -     48.00  48.00   -      -      -      -     24.00  24.00  24.00  24.00   -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddpd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddpd	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddpd	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddpd	%ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddpd	(%rax), %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddpd	%ymm0, (%rax), %ymm2, %ymm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddps	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddps	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddps	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddps	%ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddps	(%rax), %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddps	%ymm0, (%rax), %ymm2, %ymm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddsd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsd	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsd	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddss	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddss	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddss	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddsubpd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsubpd	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsubpd	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddsubpd	%ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsubpd	(%rax), %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsubpd	%ymm0, (%rax), %ymm2, %ymm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddsubps	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsubps	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsubps	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmaddsubps	%ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsubps	(%rax), %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmaddsubps	%ymm0, (%rax), %ymm2, %ymm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubaddpd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubaddpd	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubaddpd	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubaddpd	%ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubaddpd	(%rax), %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubaddpd	%ymm0, (%rax), %ymm2, %ymm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubaddps	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubaddps	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubaddps	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubaddps	%ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubaddps	(%rax), %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubaddps	%ymm0, (%rax), %ymm2, %ymm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubpd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubpd	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubpd	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubpd	%ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubpd	(%rax), %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubpd	%ymm0, (%rax), %ymm2, %ymm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubps	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubps	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubps	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubps	%ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubps	(%rax), %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubps	%ymm0, (%rax), %ymm2, %ymm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubsd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubsd	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubsd	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfmsubss	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubss	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfmsubss	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmaddpd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmaddpd	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmaddpd	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmaddpd	%ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmaddpd	(%rax), %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmaddpd	%ymm0, (%rax), %ymm2, %ymm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmaddps	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmaddps	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmaddps	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmaddps	%ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmaddps	(%rax), %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmaddps	%ymm0, (%rax), %ymm2, %ymm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmaddsd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmaddsd	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmaddsd	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmaddss	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmaddss	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmaddss	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsubpd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsubpd	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsubpd	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsubpd	%ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsubpd	(%rax), %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsubpd	%ymm0, (%rax), %ymm2, %ymm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsubps	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsubps	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsubps	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsubps	%ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsubps	(%rax), %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsubps	%ymm0, (%rax), %ymm2, %ymm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsubsd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsubsd	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsubsd	%xmm0, (%rax), %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     vfnmsubss	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsubss	(%rax), %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50   vfnmsubss	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddpd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddpd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddpd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddpd	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddpd	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddpd	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddps	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddps	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddps	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddps	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddps	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddps	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddss	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddss	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddss	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsubpd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsubpd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsubpd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsubpd	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsubpd	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsubpd	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsubps	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsubps	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsubps	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsubps	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsubps	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsubps	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubaddpd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubaddpd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubaddpd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubaddpd	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubaddpd	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubaddpd	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubaddps	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubaddps	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubaddps	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubaddps	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubaddps	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubaddps	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubpd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubpd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubpd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubpd	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubpd	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubpd	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubps	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubps	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubps	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubps	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubps	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubps	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubsd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubsd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubsd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubss	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubss	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubss	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddpd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddpd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddpd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddpd	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddpd	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddpd	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddps	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddps	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddps	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddps	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddps	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddps	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddsd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddsd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddsd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddss	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddss	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddss	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubpd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubpd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubpd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubpd	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubpd	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubpd	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubps	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubps	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubps	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubps	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubps	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubps	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubsd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubsd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubsd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubss	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubss	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubss	%xmm0, (%rax), %xmm2, %xmm3
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-lea.s b/test/tools/llvm-mca/X86/BdVer2/resources-lea.s
index 455fbe0411e..246d968a32e 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-lea.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-lea.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
 
 lea 0(), %cx
 lea 0(), %ecx
@@ -148,290 +148,302 @@ lea 1024(%rax, %rbx, 2), %rcx
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      1     0.50                        leaw	0, %cx
-# CHECK-NEXT:  1      1     0.50                        leal	0, %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	0, %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	(%eax), %cx
-# CHECK-NEXT:  1      1     0.50                        leal	(%eax), %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	(%eax), %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	(%rax), %cx
-# CHECK-NEXT:  1      1     0.50                        leal	(%rax), %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	(%rax), %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	(,%ebx), %cx
-# CHECK-NEXT:  1      1     0.50                        leal	(,%ebx), %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	(,%ebx), %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	(,%rbx), %cx
-# CHECK-NEXT:  1      1     0.50                        leal	(,%rbx), %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	(,%rbx), %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	(,%ebx), %cx
-# CHECK-NEXT:  1      1     0.50                        leal	(,%ebx), %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	(,%ebx), %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	(,%rbx), %cx
-# CHECK-NEXT:  1      1     0.50                        leal	(,%rbx), %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	(,%rbx), %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	(,%ebx,2), %cx
-# CHECK-NEXT:  1      1     0.50                        leal	(,%ebx,2), %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	(,%ebx,2), %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	(,%rbx,2), %cx
-# CHECK-NEXT:  1      1     0.50                        leal	(,%rbx,2), %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	(,%rbx,2), %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	(%eax,%ebx), %cx
-# CHECK-NEXT:  1      1     0.50                        leal	(%eax,%ebx), %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	(%eax,%ebx), %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	(%rax,%rbx), %cx
-# CHECK-NEXT:  1      1     0.50                        leal	(%rax,%rbx), %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	(%rax,%rbx), %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	(%eax,%ebx), %cx
-# CHECK-NEXT:  1      1     0.50                        leal	(%eax,%ebx), %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	(%eax,%ebx), %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	(%rax,%rbx), %cx
-# CHECK-NEXT:  1      1     0.50                        leal	(%rax,%rbx), %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	(%rax,%rbx), %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	(%eax,%ebx,2), %cx
-# CHECK-NEXT:  1      1     0.50                        leal	(%eax,%ebx,2), %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	(%eax,%ebx,2), %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	(%rax,%rbx,2), %cx
-# CHECK-NEXT:  1      1     0.50                        leal	(%rax,%rbx,2), %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	(%rax,%rbx,2), %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	-16, %cx
-# CHECK-NEXT:  1      1     0.50                        leal	-16, %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	-16, %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	-16(%eax), %cx
-# CHECK-NEXT:  1      1     0.50                        leal	-16(%eax), %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	-16(%eax), %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	-16(%rax), %cx
-# CHECK-NEXT:  1      1     0.50                        leal	-16(%rax), %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	-16(%rax), %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	-16(,%ebx), %cx
-# CHECK-NEXT:  1      1     0.50                        leal	-16(,%ebx), %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	-16(,%ebx), %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	-16(,%rbx), %cx
-# CHECK-NEXT:  1      1     0.50                        leal	-16(,%rbx), %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	-16(,%rbx), %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	-16(,%ebx), %cx
-# CHECK-NEXT:  1      1     0.50                        leal	-16(,%ebx), %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	-16(,%ebx), %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	-16(,%rbx), %cx
-# CHECK-NEXT:  1      1     0.50                        leal	-16(,%rbx), %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	-16(,%rbx), %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	-16(,%ebx,2), %cx
-# CHECK-NEXT:  1      1     0.50                        leal	-16(,%ebx,2), %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	-16(,%ebx,2), %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	-16(,%rbx,2), %cx
-# CHECK-NEXT:  1      1     0.50                        leal	-16(,%rbx,2), %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	-16(,%rbx,2), %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	-16(%eax,%ebx), %cx
-# CHECK-NEXT:  1      1     0.50                        leal	-16(%eax,%ebx), %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	-16(%eax,%ebx), %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	-16(%rax,%rbx), %cx
-# CHECK-NEXT:  1      1     0.50                        leal	-16(%rax,%rbx), %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	-16(%rax,%rbx), %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	-16(%eax,%ebx), %cx
-# CHECK-NEXT:  1      1     0.50                        leal	-16(%eax,%ebx), %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	-16(%eax,%ebx), %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	-16(%rax,%rbx), %cx
-# CHECK-NEXT:  1      1     0.50                        leal	-16(%rax,%rbx), %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	-16(%rax,%rbx), %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	-16(%eax,%ebx,2), %cx
-# CHECK-NEXT:  1      1     0.50                        leal	-16(%eax,%ebx,2), %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	-16(%eax,%ebx,2), %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	-16(%rax,%rbx,2), %cx
-# CHECK-NEXT:  1      1     0.50                        leal	-16(%rax,%rbx,2), %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	-16(%rax,%rbx,2), %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	1024, %cx
-# CHECK-NEXT:  1      1     0.50                        leal	1024, %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	1024, %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	1024(%eax), %cx
-# CHECK-NEXT:  1      1     0.50                        leal	1024(%eax), %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	1024(%eax), %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	1024(%rax), %cx
-# CHECK-NEXT:  1      1     0.50                        leal	1024(%rax), %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	1024(%rax), %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	1024(,%ebx), %cx
-# CHECK-NEXT:  1      1     0.50                        leal	1024(,%ebx), %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	1024(,%ebx), %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	1024(,%rbx), %cx
-# CHECK-NEXT:  1      1     0.50                        leal	1024(,%rbx), %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	1024(,%rbx), %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	1024(,%ebx), %cx
-# CHECK-NEXT:  1      1     0.50                        leal	1024(,%ebx), %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	1024(,%ebx), %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	1024(,%rbx), %cx
-# CHECK-NEXT:  1      1     0.50                        leal	1024(,%rbx), %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	1024(,%rbx), %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	1024(,%ebx,2), %cx
-# CHECK-NEXT:  1      1     0.50                        leal	1024(,%ebx,2), %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	1024(,%ebx,2), %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	1024(,%rbx,2), %cx
-# CHECK-NEXT:  1      1     0.50                        leal	1024(,%rbx,2), %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	1024(,%rbx,2), %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	1024(%eax,%ebx), %cx
-# CHECK-NEXT:  1      1     0.50                        leal	1024(%eax,%ebx), %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	1024(%eax,%ebx), %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	1024(%rax,%rbx), %cx
-# CHECK-NEXT:  1      1     0.50                        leal	1024(%rax,%rbx), %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	1024(%rax,%rbx), %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	1024(%eax,%ebx), %cx
-# CHECK-NEXT:  1      1     0.50                        leal	1024(%eax,%ebx), %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	1024(%eax,%ebx), %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	1024(%rax,%rbx), %cx
-# CHECK-NEXT:  1      1     0.50                        leal	1024(%rax,%rbx), %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	1024(%rax,%rbx), %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	1024(%eax,%ebx,2), %cx
-# CHECK-NEXT:  1      1     0.50                        leal	1024(%eax,%ebx,2), %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	1024(%eax,%ebx,2), %rcx
-# CHECK-NEXT:  1      1     0.50                        leaw	1024(%rax,%rbx,2), %cx
-# CHECK-NEXT:  1      1     0.50                        leal	1024(%rax,%rbx,2), %ecx
-# CHECK-NEXT:  1      1     0.50                        leaq	1024(%rax,%rbx,2), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	0, %cx
+# CHECK-NEXT:  2      1     0.50                        leal	0, %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	0, %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	(%eax), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	(%eax), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	(%eax), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	(%rax), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	(%rax), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	(%rax), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	(,%ebx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	(,%ebx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	(,%ebx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	(,%rbx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	(,%rbx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	(,%rbx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	(,%ebx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	(,%ebx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	(,%ebx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	(,%rbx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	(,%rbx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	(,%rbx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	(,%ebx,2), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	(,%ebx,2), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	(,%ebx,2), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	(,%rbx,2), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	(,%rbx,2), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	(,%rbx,2), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	(%eax,%ebx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	(%eax,%ebx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	(%eax,%ebx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	(%rax,%rbx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	(%rax,%rbx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	(%rax,%rbx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	(%eax,%ebx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	(%eax,%ebx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	(%eax,%ebx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	(%rax,%rbx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	(%rax,%rbx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	(%rax,%rbx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	(%eax,%ebx,2), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	(%eax,%ebx,2), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	(%eax,%ebx,2), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	(%rax,%rbx,2), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	(%rax,%rbx,2), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	(%rax,%rbx,2), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	-16, %cx
+# CHECK-NEXT:  2      1     0.50                        leal	-16, %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	-16, %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	-16(%eax), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	-16(%eax), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	-16(%eax), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	-16(%rax), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	-16(%rax), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	-16(%rax), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	-16(,%ebx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	-16(,%ebx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	-16(,%ebx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	-16(,%rbx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	-16(,%rbx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	-16(,%rbx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	-16(,%ebx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	-16(,%ebx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	-16(,%ebx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	-16(,%rbx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	-16(,%rbx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	-16(,%rbx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	-16(,%ebx,2), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	-16(,%ebx,2), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	-16(,%ebx,2), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	-16(,%rbx,2), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	-16(,%rbx,2), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	-16(,%rbx,2), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	-16(%eax,%ebx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	-16(%eax,%ebx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	-16(%eax,%ebx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	-16(%rax,%rbx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	-16(%rax,%rbx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	-16(%rax,%rbx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	-16(%eax,%ebx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	-16(%eax,%ebx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	-16(%eax,%ebx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	-16(%rax,%rbx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	-16(%rax,%rbx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	-16(%rax,%rbx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	-16(%eax,%ebx,2), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	-16(%eax,%ebx,2), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	-16(%eax,%ebx,2), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	-16(%rax,%rbx,2), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	-16(%rax,%rbx,2), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	-16(%rax,%rbx,2), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	1024, %cx
+# CHECK-NEXT:  2      1     0.50                        leal	1024, %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	1024, %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	1024(%eax), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	1024(%eax), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	1024(%eax), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	1024(%rax), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	1024(%rax), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	1024(%rax), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	1024(,%ebx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	1024(,%ebx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	1024(,%ebx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	1024(,%rbx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	1024(,%rbx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	1024(,%rbx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	1024(,%ebx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	1024(,%ebx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	1024(,%ebx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	1024(,%rbx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	1024(,%rbx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	1024(,%rbx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	1024(,%ebx,2), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	1024(,%ebx,2), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	1024(,%ebx,2), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	1024(,%rbx,2), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	1024(,%rbx,2), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	1024(,%rbx,2), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	1024(%eax,%ebx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	1024(%eax,%ebx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	1024(%eax,%ebx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	1024(%rax,%rbx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	1024(%rax,%rbx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	1024(%rax,%rbx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	1024(%eax,%ebx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	1024(%eax,%ebx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	1024(%eax,%ebx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	1024(%rax,%rbx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	1024(%rax,%rbx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	1024(%rax,%rbx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	1024(%eax,%ebx,2), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	1024(%eax,%ebx,2), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	1024(%eax,%ebx,2), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	1024(%rax,%rbx,2), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	1024(%rax,%rbx,2), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	1024(%rax,%rbx,2), %rcx
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -     67.50  67.50   -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -     67.50  67.50   -      -      -      -      -      -      -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	0, %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	0, %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	0, %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	(%eax), %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	(%eax), %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	(%eax), %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	(%rax), %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	(%rax), %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	(%rax), %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	(,%ebx), %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	(,%ebx), %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	(,%ebx), %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	(,%rbx), %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	(,%rbx), %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	(,%rbx), %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	(,%ebx), %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	(,%ebx), %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	(,%ebx), %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	(,%rbx), %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	(,%rbx), %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	(,%rbx), %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	(,%ebx,2), %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	(,%ebx,2), %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	(,%ebx,2), %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	(,%rbx,2), %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	(,%rbx,2), %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	(,%rbx,2), %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	(%eax,%ebx), %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	(%eax,%ebx), %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	(%eax,%ebx), %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	(%rax,%rbx), %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	(%rax,%rbx), %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	(%rax,%rbx), %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	(%eax,%ebx), %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	(%eax,%ebx), %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	(%eax,%ebx), %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	(%rax,%rbx), %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	(%rax,%rbx), %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	(%rax,%rbx), %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	(%eax,%ebx,2), %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	(%eax,%ebx,2), %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	(%eax,%ebx,2), %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	(%rax,%rbx,2), %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	(%rax,%rbx,2), %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	(%rax,%rbx,2), %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	-16, %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	-16, %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	-16, %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	-16(%eax), %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	-16(%eax), %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	-16(%eax), %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	-16(%rax), %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	-16(%rax), %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	-16(%rax), %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	-16(,%ebx), %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	-16(,%ebx), %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	-16(,%ebx), %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	-16(,%rbx), %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	-16(,%rbx), %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	-16(,%rbx), %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	-16(,%ebx), %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	-16(,%ebx), %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	-16(,%ebx), %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	-16(,%rbx), %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	-16(,%rbx), %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	-16(,%rbx), %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	-16(,%ebx,2), %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	-16(,%ebx,2), %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	-16(,%ebx,2), %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	-16(,%rbx,2), %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	-16(,%rbx,2), %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	-16(,%rbx,2), %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	-16(%eax,%ebx), %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	-16(%eax,%ebx), %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	-16(%eax,%ebx), %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	-16(%rax,%rbx), %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	-16(%rax,%rbx), %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	-16(%rax,%rbx), %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	-16(%eax,%ebx), %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	-16(%eax,%ebx), %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	-16(%eax,%ebx), %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	-16(%rax,%rbx), %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	-16(%rax,%rbx), %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	-16(%rax,%rbx), %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	-16(%eax,%ebx,2), %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	-16(%eax,%ebx,2), %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	-16(%eax,%ebx,2), %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	-16(%rax,%rbx,2), %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	-16(%rax,%rbx,2), %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	-16(%rax,%rbx,2), %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	1024, %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	1024, %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	1024, %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	1024(%eax), %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	1024(%eax), %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	1024(%eax), %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	1024(%rax), %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	1024(%rax), %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	1024(%rax), %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	1024(,%ebx), %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	1024(,%ebx), %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	1024(,%ebx), %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	1024(,%rbx), %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	1024(,%rbx), %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	1024(,%rbx), %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	1024(,%ebx), %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	1024(,%ebx), %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	1024(,%ebx), %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	1024(,%rbx), %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	1024(,%rbx), %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	1024(,%rbx), %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	1024(,%ebx,2), %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	1024(,%ebx,2), %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	1024(,%ebx,2), %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	1024(,%rbx,2), %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	1024(,%rbx,2), %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	1024(,%rbx,2), %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	1024(%eax,%ebx), %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	1024(%eax,%ebx), %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	1024(%eax,%ebx), %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	1024(%rax,%rbx), %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	1024(%rax,%rbx), %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	1024(%rax,%rbx), %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	1024(%eax,%ebx), %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	1024(%eax,%ebx), %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	1024(%eax,%ebx), %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	1024(%rax,%rbx), %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	1024(%rax,%rbx), %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	1024(%rax,%rbx), %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	1024(%eax,%ebx,2), %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	1024(%eax,%ebx,2), %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	1024(%eax,%ebx,2), %rcx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaw	1024(%rax,%rbx,2), %cx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leal	1024(%rax,%rbx,2), %ecx
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	1024(%rax,%rbx,2), %rcx
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	0, %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	0, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	0, %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(%eax), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(%eax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(%eax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(%rax), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(,%ebx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(,%ebx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(,%ebx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(,%rbx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(,%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(,%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(,%ebx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(,%ebx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(,%ebx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(,%rbx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(,%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(,%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(,%ebx,2), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(,%ebx,2), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(,%ebx,2), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(,%rbx,2), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(,%rbx,2), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(,%rbx,2), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(%eax,%ebx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(%eax,%ebx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(%eax,%ebx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(%rax,%rbx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(%rax,%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(%rax,%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(%eax,%ebx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(%eax,%ebx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(%eax,%ebx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(%rax,%rbx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(%rax,%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(%rax,%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(%eax,%ebx,2), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(%eax,%ebx,2), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(%eax,%ebx,2), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(%rax,%rbx,2), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(%rax,%rbx,2), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(%rax,%rbx,2), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16, %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16, %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(%eax), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(%eax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(%eax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(%rax), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(,%ebx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(,%ebx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(,%ebx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(,%rbx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(,%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(,%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(,%ebx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(,%ebx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(,%ebx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(,%rbx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(,%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(,%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(,%ebx,2), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(,%ebx,2), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(,%ebx,2), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(,%rbx,2), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(,%rbx,2), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(,%rbx,2), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(%eax,%ebx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(%eax,%ebx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(%eax,%ebx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(%rax,%rbx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(%rax,%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(%rax,%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(%eax,%ebx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(%eax,%ebx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(%eax,%ebx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(%rax,%rbx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(%rax,%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(%rax,%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(%eax,%ebx,2), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(%eax,%ebx,2), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(%eax,%ebx,2), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(%rax,%rbx,2), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(%rax,%rbx,2), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(%rax,%rbx,2), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024, %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024, %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(%eax), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(%eax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(%eax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(%rax), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(,%ebx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(,%ebx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(,%ebx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(,%rbx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(,%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(,%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(,%ebx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(,%ebx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(,%ebx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(,%rbx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(,%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(,%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(,%ebx,2), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(,%ebx,2), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(,%ebx,2), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(,%rbx,2), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(,%rbx,2), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(,%rbx,2), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(%eax,%ebx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(%eax,%ebx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(%eax,%ebx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(%rax,%rbx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(%rax,%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(%rax,%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(%eax,%ebx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(%eax,%ebx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(%eax,%ebx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(%rax,%rbx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(%rax,%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(%rax,%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(%eax,%ebx,2), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(%eax,%ebx,2), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(%eax,%ebx,2), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(%rax,%rbx,2), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(%rax,%rbx,2), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(%rax,%rbx,2), %rcx
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-lzcnt.s b/test/tools/llvm-mca/X86/BdVer2/resources-lzcnt.s
index 6ac0945d4d8..1b2b38fb4a5 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-lzcnt.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-lzcnt.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
 
 lzcntw      %cx, %cx
 lzcntw      (%rax), %cx
@@ -19,32 +19,44 @@ lzcntq      (%rax), %rcx
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      3     1.00                        lzcntw	%cx, %cx
-# CHECK-NEXT:  2      8     1.00    *                   lzcntw	(%rax), %cx
-# CHECK-NEXT:  1      3     1.00                        lzcntl	%eax, %ecx
-# CHECK-NEXT:  2      8     1.00    *                   lzcntl	(%rax), %ecx
-# CHECK-NEXT:  1      3     1.00                        lzcntq	%rax, %rcx
-# CHECK-NEXT:  2      8     1.00    *                   lzcntq	(%rax), %rcx
+# CHECK-NEXT:  2      2     0.50                        lzcntw	%cx, %cx
+# CHECK-NEXT:  2      6     0.50    *                   lzcntw	(%rax), %cx
+# CHECK-NEXT:  2      2     0.50                        lzcntl	%eax, %ecx
+# CHECK-NEXT:  2      6     0.50    *                   lzcntl	(%rax), %ecx
+# CHECK-NEXT:  2      2     0.50                        lzcntq	%rax, %rcx
+# CHECK-NEXT:  2      6     0.50    *                   lzcntq	(%rax), %rcx
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -      -     6.00    -      -     1.50   1.50
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 1.50   1.50    -      -      -     3.00   3.00    -      -      -      -      -      -      -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     lzcntw	%cx, %cx
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   lzcntw	(%rax), %cx
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     lzcntl	%eax, %ecx
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   lzcntl	(%rax), %ecx
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     lzcntq	%rax, %rcx
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   lzcntq	(%rax), %rcx
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     lzcntw	%cx, %cx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     lzcntw	(%rax), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     lzcntl	%eax, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     lzcntl	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     lzcntq	%rax, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     lzcntq	(%rax), %rcx
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-mmx.s b/test/tools/llvm-mca/X86/BdVer2/resources-mmx.s
index 8c9644b6494..3dcc8083125 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-mmx.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-mmx.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
 
 emms
 
@@ -164,230 +164,242 @@ pxor        (%rax), %mm2
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  31     31    10.33   *      *      U     emms
-# CHECK-NEXT:  1      1     1.00                        movd	%eax, %mm2
+# CHECK-NEXT:  1      2     0.50    *      *      U     emms
+# CHECK-NEXT:  2      10    0.50                        movd	%eax, %mm2
 # CHECK-NEXT:  1      5     0.50    *                   movd	(%rax), %mm2
-# CHECK-NEXT:  1      2     1.00                        movd	%mm0, %ecx
-# CHECK-NEXT:  1      1     1.00           *      U     movd	%mm0, (%rax)
-# CHECK-NEXT:  1      1     1.00                        movq	%rax, %mm2
+# CHECK-NEXT:  1      10    1.00                        movd	%mm0, %ecx
+# CHECK-NEXT:  1      2     1.00           *      U     movd	%mm0, (%rax)
+# CHECK-NEXT:  2      10    0.50                        movq	%rax, %mm2
 # CHECK-NEXT:  1      5     0.50    *                   movq	(%rax), %mm2
-# CHECK-NEXT:  1      2     1.00                        movq	%mm0, %rcx
-# CHECK-NEXT:  1      1     1.00           *            movq	%mm0, (%rax)
-# CHECK-NEXT:  1      1     1.00                        packsswb	%mm0, %mm2
-# CHECK-NEXT:  2      6     1.00    *                   packsswb	(%rax), %mm2
-# CHECK-NEXT:  1      1     1.00                        packssdw	%mm0, %mm2
-# CHECK-NEXT:  2      6     1.00    *                   packssdw	(%rax), %mm2
-# CHECK-NEXT:  1      1     1.00                        packuswb	%mm0, %mm2
-# CHECK-NEXT:  2      6     1.00    *                   packuswb	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        paddb	%mm0, %mm2
-# CHECK-NEXT:  2      8     1.00    *                   paddb	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        paddd	%mm0, %mm2
-# CHECK-NEXT:  2      8     1.00    *                   paddd	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        paddsb	%mm0, %mm2
-# CHECK-NEXT:  2      8     1.00    *                   paddsb	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        paddsw	%mm0, %mm2
-# CHECK-NEXT:  2      8     1.00    *                   paddsw	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        paddusb	%mm0, %mm2
-# CHECK-NEXT:  2      8     1.00    *                   paddusb	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        paddusw	%mm0, %mm2
-# CHECK-NEXT:  2      8     1.00    *                   paddusw	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        paddw	%mm0, %mm2
-# CHECK-NEXT:  2      8     1.00    *                   paddw	(%rax), %mm2
-# CHECK-NEXT:  1      1     0.33                        pand	%mm0, %mm2
-# CHECK-NEXT:  2      6     0.50    *                   pand	(%rax), %mm2
-# CHECK-NEXT:  1      1     0.33                        pandn	%mm0, %mm2
-# CHECK-NEXT:  2      6     0.50    *                   pandn	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        pcmpeqb	%mm0, %mm2
-# CHECK-NEXT:  2      8     1.00    *                   pcmpeqb	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        pcmpeqd	%mm0, %mm2
-# CHECK-NEXT:  2      8     1.00    *                   pcmpeqd	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        pcmpeqw	%mm0, %mm2
-# CHECK-NEXT:  2      8     1.00    *                   pcmpeqw	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        pcmpgtb	%mm0, %mm2
-# CHECK-NEXT:  2      8     1.00    *                   pcmpgtb	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        pcmpgtd	%mm0, %mm2
-# CHECK-NEXT:  2      8     1.00    *                   pcmpgtd	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        pcmpgtw	%mm0, %mm2
-# CHECK-NEXT:  2      8     1.00    *                   pcmpgtw	(%rax), %mm2
-# CHECK-NEXT:  1      5     1.00                        pmaddwd	%mm0, %mm2
-# CHECK-NEXT:  2      10    1.00    *                   pmaddwd	(%rax), %mm2
-# CHECK-NEXT:  1      5     1.00                        pmulhw	%mm0, %mm2
-# CHECK-NEXT:  2      10    1.00    *                   pmulhw	(%rax), %mm2
-# CHECK-NEXT:  1      5     1.00                        pmullw	%mm0, %mm2
-# CHECK-NEXT:  2      10    1.00    *                   pmullw	(%rax), %mm2
-# CHECK-NEXT:  1      1     0.33                        por	%mm0, %mm2
-# CHECK-NEXT:  2      6     0.50    *                   por	(%rax), %mm2
-# CHECK-NEXT:  1      1     1.00                        pslld	$1, %mm2
-# CHECK-NEXT:  1      1     1.00                        pslld	%mm0, %mm2
-# CHECK-NEXT:  2      6     1.00    *                   pslld	(%rax), %mm2
-# CHECK-NEXT:  1      1     1.00                        psllq	$1, %mm2
-# CHECK-NEXT:  1      1     1.00                        psllq	%mm0, %mm2
-# CHECK-NEXT:  2      6     1.00    *                   psllq	(%rax), %mm2
-# CHECK-NEXT:  1      1     1.00                        psllw	$1, %mm2
-# CHECK-NEXT:  1      1     1.00                        psllw	%mm0, %mm2
-# CHECK-NEXT:  2      6     1.00    *                   psllw	(%rax), %mm2
-# CHECK-NEXT:  1      1     1.00                        psrad	$1, %mm2
-# CHECK-NEXT:  1      1     1.00                        psrad	%mm0, %mm2
-# CHECK-NEXT:  2      6     1.00    *                   psrad	(%rax), %mm2
-# CHECK-NEXT:  1      1     1.00                        psraw	$1, %mm2
-# CHECK-NEXT:  1      1     1.00                        psraw	%mm0, %mm2
-# CHECK-NEXT:  2      6     1.00    *                   psraw	(%rax), %mm2
-# CHECK-NEXT:  1      1     1.00                        psrld	$1, %mm2
-# CHECK-NEXT:  1      1     1.00                        psrld	%mm0, %mm2
-# CHECK-NEXT:  2      6     1.00    *                   psrld	(%rax), %mm2
-# CHECK-NEXT:  1      1     1.00                        psrlq	$1, %mm2
-# CHECK-NEXT:  1      1     1.00                        psrlq	%mm0, %mm2
-# CHECK-NEXT:  2      6     1.00    *                   psrlq	(%rax), %mm2
-# CHECK-NEXT:  1      1     1.00                        psrlw	$1, %mm2
-# CHECK-NEXT:  1      1     1.00                        psrlw	%mm0, %mm2
-# CHECK-NEXT:  2      6     1.00    *                   psrlw	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        psubb	%mm0, %mm2
-# CHECK-NEXT:  2      8     1.00    *                   psubb	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        psubd	%mm0, %mm2
-# CHECK-NEXT:  2      8     1.00    *                   psubd	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        psubsb	%mm0, %mm2
-# CHECK-NEXT:  2      8     1.00    *                   psubsb	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        psubsw	%mm0, %mm2
-# CHECK-NEXT:  2      8     1.00    *                   psubsw	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        psubusb	%mm0, %mm2
-# CHECK-NEXT:  2      8     1.00    *                   psubusb	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        psubusw	%mm0, %mm2
-# CHECK-NEXT:  2      8     1.00    *                   psubusw	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        psubw	%mm0, %mm2
-# CHECK-NEXT:  2      8     1.00    *                   psubw	(%rax), %mm2
-# CHECK-NEXT:  1      1     1.00                        punpckhbw	%mm0, %mm2
-# CHECK-NEXT:  2      6     1.00    *                   punpckhbw	(%rax), %mm2
-# CHECK-NEXT:  1      1     1.00                        punpckhdq	%mm0, %mm2
-# CHECK-NEXT:  2      6     1.00    *                   punpckhdq	(%rax), %mm2
-# CHECK-NEXT:  1      1     1.00                        punpckhwd	%mm0, %mm2
-# CHECK-NEXT:  2      6     1.00    *                   punpckhwd	(%rax), %mm2
-# CHECK-NEXT:  1      1     1.00                        punpcklbw	%mm0, %mm2
-# CHECK-NEXT:  2      6     1.00    *                   punpcklbw	(%rax), %mm2
-# CHECK-NEXT:  1      1     1.00                        punpckldq	%mm0, %mm2
-# CHECK-NEXT:  2      6     1.00    *                   punpckldq	(%rax), %mm2
-# CHECK-NEXT:  1      1     1.00                        punpcklwd	%mm0, %mm2
-# CHECK-NEXT:  2      6     1.00    *                   punpcklwd	(%rax), %mm2
-# CHECK-NEXT:  1      1     0.33                        pxor	%mm0, %mm2
-# CHECK-NEXT:  2      6     0.50    *                   pxor	(%rax), %mm2
+# CHECK-NEXT:  1      10    1.00                        movq	%mm0, %rcx
+# CHECK-NEXT:  1      2     1.00           *            movq	%mm0, (%rax)
+# CHECK-NEXT:  1      2     0.50                        packsswb	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   packsswb	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        packssdw	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   packssdw	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        packuswb	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   packuswb	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        paddb	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   paddb	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        paddd	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   paddd	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        paddsb	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   paddsb	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        paddsw	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   paddsw	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        paddusb	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   paddusb	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        paddusw	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   paddusw	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        paddw	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   paddw	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        pand	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   pand	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        pandn	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   pandn	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        pcmpeqb	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   pcmpeqb	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        pcmpeqd	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   pcmpeqd	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        pcmpeqw	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   pcmpeqw	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        pcmpgtb	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   pcmpgtb	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        pcmpgtd	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   pcmpgtd	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        pcmpgtw	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   pcmpgtw	(%rax), %mm2
+# CHECK-NEXT:  1      4     1.00                        pmaddwd	%mm0, %mm2
+# CHECK-NEXT:  1      9     1.00    *                   pmaddwd	(%rax), %mm2
+# CHECK-NEXT:  1      4     1.00                        pmulhw	%mm0, %mm2
+# CHECK-NEXT:  1      9     1.00    *                   pmulhw	(%rax), %mm2
+# CHECK-NEXT:  1      4     1.00                        pmullw	%mm0, %mm2
+# CHECK-NEXT:  1      9     1.00    *                   pmullw	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        por	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   por	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        pslld	$1, %mm2
+# CHECK-NEXT:  1      3     0.50                        pslld	%mm0, %mm2
+# CHECK-NEXT:  1      8     0.50    *                   pslld	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        psllq	$1, %mm2
+# CHECK-NEXT:  1      3     0.50                        psllq	%mm0, %mm2
+# CHECK-NEXT:  1      8     0.50    *                   psllq	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        psllw	$1, %mm2
+# CHECK-NEXT:  1      3     0.50                        psllw	%mm0, %mm2
+# CHECK-NEXT:  1      8     0.50    *                   psllw	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        psrad	$1, %mm2
+# CHECK-NEXT:  1      3     0.50                        psrad	%mm0, %mm2
+# CHECK-NEXT:  1      8     0.50    *                   psrad	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        psraw	$1, %mm2
+# CHECK-NEXT:  1      3     0.50                        psraw	%mm0, %mm2
+# CHECK-NEXT:  1      8     0.50    *                   psraw	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        psrld	$1, %mm2
+# CHECK-NEXT:  1      3     0.50                        psrld	%mm0, %mm2
+# CHECK-NEXT:  1      8     0.50    *                   psrld	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        psrlq	$1, %mm2
+# CHECK-NEXT:  1      3     0.50                        psrlq	%mm0, %mm2
+# CHECK-NEXT:  1      8     0.50    *                   psrlq	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        psrlw	$1, %mm2
+# CHECK-NEXT:  1      3     0.50                        psrlw	%mm0, %mm2
+# CHECK-NEXT:  1      8     0.50    *                   psrlw	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        psubb	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   psubb	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        psubd	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   psubd	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        psubsb	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   psubsb	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        psubsw	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   psubsw	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        psubusb	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   psubusb	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        psubusw	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   psubusw	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        psubw	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   psubw	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        punpckhbw	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   punpckhbw	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        punpckhdq	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   punpckhdq	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        punpckhwd	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   punpckhwd	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        punpcklbw	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   punpcklbw	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        punpckldq	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   punpckldq	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        punpcklwd	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   punpcklwd	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        pxor	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   pxor	(%rax), %mm2
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -     21.00  53.00  2.00   57.00  24.00  24.00
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 24.00  24.00   -      -      -     2.00    -      -     2.50   2.50   46.00  46.00  6.00   2.00   55.50  49.50   -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -     10.33  10.33   -     10.33   -      -     emms
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     movd	%eax, %mm2
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movd	(%rax), %mm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     movd	%mm0, %ecx
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movd	%mm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     movq	%rax, %mm2
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movq	(%rax), %mm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     movq	%mm0, %rcx
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movq	%mm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     packsswb	%mm0, %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   packsswb	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     packssdw	%mm0, %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   packssdw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     packuswb	%mm0, %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   packuswb	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     paddb	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   paddb	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     paddd	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   paddd	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     paddsb	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   paddsb	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     paddsw	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   paddsw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     paddusb	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   paddusb	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     paddusw	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   paddusw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     paddw	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   paddw	(%rax), %mm2
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     pand	%mm0, %mm2
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   pand	(%rax), %mm2
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     pandn	%mm0, %mm2
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   pandn	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pcmpeqb	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pcmpeqb	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pcmpeqd	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pcmpeqd	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pcmpeqw	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pcmpeqw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pcmpgtb	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pcmpgtb	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pcmpgtd	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pcmpgtd	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pcmpgtw	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pcmpgtw	(%rax), %mm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pmaddwd	%mm0, %mm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   pmaddwd	(%rax), %mm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pmulhw	%mm0, %mm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   pmulhw	(%rax), %mm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pmullw	%mm0, %mm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   pmullw	(%rax), %mm2
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     por	%mm0, %mm2
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   por	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     pslld	$1, %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     pslld	%mm0, %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   pslld	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     psllq	$1, %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     psllq	%mm0, %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   psllq	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     psllw	$1, %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     psllw	%mm0, %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   psllw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     psrad	$1, %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     psrad	%mm0, %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   psrad	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     psraw	$1, %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     psraw	%mm0, %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   psraw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     psrld	$1, %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     psrld	%mm0, %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   psrld	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     psrlq	$1, %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     psrlq	%mm0, %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   psrlq	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     psrlw	$1, %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     psrlw	%mm0, %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   psrlw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     psubb	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   psubb	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     psubd	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   psubd	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     psubsb	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   psubsb	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     psubsw	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   psubsw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     psubusb	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   psubusb	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     psubusw	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   psubusw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     psubw	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   psubw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     punpckhbw	%mm0, %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   punpckhbw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     punpckhdq	%mm0, %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   punpckhdq	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     punpckhwd	%mm0, %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   punpckhwd	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     punpcklbw	%mm0, %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   punpcklbw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     punpckldq	%mm0, %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   punpckldq	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     punpcklwd	%mm0, %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   punpcklwd	(%rax), %mm2
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     pxor	%mm0, %mm2
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   pxor	(%rax), %mm2
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     emms
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movd	%eax, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     movd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     movd	%mm0, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movd	%mm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movq	%rax, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     movq	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     movq	%mm0, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movq	%mm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     packsswb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     packsswb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     packssdw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     packssdw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     packuswb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     packuswb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddd	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddsb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddsb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddsw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddsw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddusb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddusb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddusw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddusw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pand	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pand	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pandn	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pandn	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqd	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpgtb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpgtb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpgtd	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpgtd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpgtw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpgtw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmaddwd	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmaddwd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmulhw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmulhw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmullw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmullw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     por	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     por	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pslld	$1, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pslld	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pslld	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psllq	$1, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psllq	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psllq	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psllw	$1, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psllw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psllw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrad	$1, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrad	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrad	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psraw	$1, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psraw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psraw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrld	$1, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrld	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrld	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrlq	$1, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrlq	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrlq	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrlw	$1, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrlw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrlw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubd	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubsb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubsb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubsw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubsw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubusb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubusb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubusw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubusw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckhbw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckhbw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckhdq	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckhdq	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckhwd	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckhwd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpcklbw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpcklbw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckldq	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckldq	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpcklwd	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpcklwd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pxor	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pxor	(%rax), %mm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-movbe.s b/test/tools/llvm-mca/X86/BdVer2/resources-movbe.s
index aa8641484e1..92367b17eef 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-movbe.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-movbe.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
 
 movbe  %cx, (%rax)
 movbe  (%rax), %cx
@@ -19,32 +19,44 @@ movbe  (%rax), %rcx
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      1     1.00           *            movbew	%cx, (%rax)
-# CHECK-NEXT:  2      6     0.50    *                   movbew	(%rax), %cx
-# CHECK-NEXT:  1      1     1.00           *            movbel	%ecx, (%rax)
-# CHECK-NEXT:  2      6     0.50    *                   movbel	(%rax), %ecx
-# CHECK-NEXT:  1      1     1.00           *            movbeq	%rcx, (%rax)
-# CHECK-NEXT:  2      6     0.50    *                   movbeq	(%rax), %rcx
+# CHECK-NEXT:  1      1     0.50           *            movbew	%cx, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   movbew	(%rax), %cx
+# CHECK-NEXT:  1      1     0.50           *            movbel	%ecx, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   movbel	(%rax), %ecx
+# CHECK-NEXT:  1      1     0.50           *            movbeq	%rcx, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   movbeq	(%rax), %rcx
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -     1.00   1.00   3.00   1.00   3.00   3.00
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 3.00   3.00    -      -      -     1.50   1.50    -      -      -      -      -      -      -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movbew	%cx, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   movbew	(%rax), %cx
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movbel	%ecx, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   movbel	(%rax), %ecx
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movbeq	%rcx, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   movbeq	(%rax), %rcx
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     movbew	%cx, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movbew	(%rax), %cx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     movbel	%ecx, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movbel	(%rax), %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     movbeq	%rcx, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movbeq	(%rax), %rcx
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-pclmul.s b/test/tools/llvm-mca/X86/BdVer2/resources-pclmul.s
index 12f879b5fb0..81bbc40143a 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-pclmul.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-pclmul.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
 
 pclmulqdq     $11, %xmm0, %xmm2
 pclmulqdq     $11, (%rax), %xmm2
@@ -13,24 +13,36 @@ pclmulqdq     $11, (%rax), %xmm2
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      14    6.00                        pclmulqdq	$11, %xmm0, %xmm2
-# CHECK-NEXT:  1      14    5.67    *                   pclmulqdq	$11, (%rax), %xmm2
+# CHECK-NEXT:  5      12    1.00                        pclmulqdq	$11, %xmm0, %xmm2
+# CHECK-NEXT:  6      17    1.00    *                   pclmulqdq	$11, (%rax), %xmm2
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -     11.67  11.67   -     11.67  0.50   0.50
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     2.00    -     2.00    -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -     6.00   6.00    -     6.00    -      -     pclmulqdq	$11, %xmm0, %xmm2
-# CHECK-NEXT:  -      -     5.67   5.67    -     5.67   0.50   0.50   pclmulqdq	$11, (%rax), %xmm2
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pclmulqdq	$11, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pclmulqdq	$11, (%rax), %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-popcnt.s b/test/tools/llvm-mca/X86/BdVer2/resources-popcnt.s
index c24ce8869f9..d31ed6cc528 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-popcnt.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-popcnt.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
 
 popcntw     %cx, %cx
 popcntw     (%rax), %cx
@@ -19,32 +19,44 @@ popcntq     (%rax), %rcx
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      3     1.00                        popcntw	%cx, %cx
-# CHECK-NEXT:  2      9     1.00    *                   popcntw	(%rax), %cx
-# CHECK-NEXT:  1      3     1.00                        popcntl	%eax, %ecx
-# CHECK-NEXT:  2      9     1.00    *                   popcntl	(%rax), %ecx
-# CHECK-NEXT:  1      3     1.00                        popcntq	%rax, %rcx
-# CHECK-NEXT:  2      9     1.00    *                   popcntq	(%rax), %rcx
+# CHECK-NEXT:  1      4     0.50                        popcntw	%cx, %cx
+# CHECK-NEXT:  1      8     0.50    *                   popcntw	(%rax), %cx
+# CHECK-NEXT:  1      4     0.50                        popcntl	%eax, %ecx
+# CHECK-NEXT:  1      8     0.50    *                   popcntl	(%rax), %ecx
+# CHECK-NEXT:  1      4     0.50                        popcntq	%rax, %rcx
+# CHECK-NEXT:  1      8     0.50    *                   popcntq	(%rax), %rcx
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -      -     6.00    -      -     1.50   1.50
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 1.50   1.50    -      -      -     3.00   3.00    -      -      -      -      -      -      -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     popcntw	%cx, %cx
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   popcntw	(%rax), %cx
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     popcntl	%eax, %ecx
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   popcntl	(%rax), %ecx
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     popcntq	%rax, %rcx
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   popcntq	(%rax), %rcx
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     popcntw	%cx, %cx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     popcntw	(%rax), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     popcntl	%eax, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     popcntl	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     popcntq	%rax, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     popcntq	(%rax), %rcx
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-prefetchw.s b/test/tools/llvm-mca/X86/BdVer2/resources-prefetchw.s
index b44b28c3725..c6973d7bb86 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-prefetchw.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-prefetchw.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
 
 prefetch    (%rax)
 prefetchw   (%rax)
@@ -17,20 +17,32 @@ prefetchw   (%rax)
 # CHECK-NEXT:  1      5     0.50    *      *            prefetchw	(%rax)
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -      -      -      -      -     1.00   1.00
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   prefetch	(%rax)
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   prefetchw	(%rax)
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     prefetch	(%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     prefetchw	(%rax)
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-sse1.s b/test/tools/llvm-mca/X86/BdVer2/resources-sse1.s
index cc4d6ed0b43..85fa5d56db9 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-sse1.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-sse1.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
 
 addps       %xmm0, %xmm2
 addps       (%rax), %xmm2
@@ -194,268 +194,280 @@ xorps       (%rax), %xmm2
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      3     1.00                        addps	%xmm0, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   addps	(%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        addss	%xmm0, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   addss	(%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        andnps	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     1.00    *                   andnps	(%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        andps	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     1.00    *                   andps	(%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        cmpps	$0, %xmm0, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   cmpps	$0, (%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        cmpss	$0, %xmm0, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   cmpss	$0, (%rax), %xmm2
-# CHECK-NEXT:  2      2     1.00                        comiss	%xmm0, %xmm1
-# CHECK-NEXT:  3      8     1.00    *                   comiss	(%rax), %xmm1
-# CHECK-NEXT:  1      3     1.00                        cvtpi2ps	%mm0, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   cvtpi2ps	(%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        cvtps2pi	%xmm0, %mm2
-# CHECK-NEXT:  2      9     1.00    *                   cvtps2pi	(%rax), %mm2
-# CHECK-NEXT:  3      5     2.00                        cvtsi2ssl	%ecx, %xmm2
-# CHECK-NEXT:  3      5     2.00                        cvtsi2ssq	%rcx, %xmm2
-# CHECK-NEXT:  3      10    1.00    *                   cvtsi2ssl	(%rax), %xmm2
-# CHECK-NEXT:  3      10    1.00    *                   cvtsi2ssl	(%rax), %xmm2
-# CHECK-NEXT:  2      5     1.00                        cvtss2si	%xmm0, %ecx
-# CHECK-NEXT:  2      5     1.00                        cvtss2si	%xmm0, %rcx
-# CHECK-NEXT:  3      9     1.00    *                   cvtss2si	(%rax), %ecx
-# CHECK-NEXT:  3      9     1.00    *                   cvtss2si	(%rax), %rcx
-# CHECK-NEXT:  1      3     1.00                        cvttps2pi	%xmm0, %mm2
-# CHECK-NEXT:  2      9     1.00    *                   cvttps2pi	(%rax), %mm2
-# CHECK-NEXT:  2      5     1.00                        cvttss2si	%xmm0, %ecx
-# CHECK-NEXT:  2      5     1.00                        cvttss2si	%xmm0, %rcx
-# CHECK-NEXT:  3      9     1.00    *                   cvttss2si	(%rax), %ecx
-# CHECK-NEXT:  3      9     1.00    *                   cvttss2si	(%rax), %rcx
-# CHECK-NEXT:  1      14    14.00                       divps	%xmm0, %xmm2
-# CHECK-NEXT:  2      20    14.00   *                   divps	(%rax), %xmm2
-# CHECK-NEXT:  1      14    14.00                       divss	%xmm0, %xmm2
-# CHECK-NEXT:  2      20    14.00   *                   divss	(%rax), %xmm2
-# CHECK-NEXT:  4      5     1.00    *      *      U     ldmxcsr	(%rax)
-# CHECK-NEXT:  1      1     1.00    *      *      U     maskmovq	%mm0, %mm1
-# CHECK-NEXT:  1      3     1.00                        maxps	%xmm0, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   maxps	(%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        maxss	%xmm0, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   maxss	(%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        minps	%xmm0, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   minps	(%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        minss	%xmm0, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   minss	(%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        movaps	%xmm0, %xmm2
+# CHECK-NEXT:  1      5     1.00                        addps	%xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   addps	(%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        addss	%xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   addss	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        andnps	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   andnps	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        andps	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   andps	(%rax), %xmm2
+# CHECK-NEXT:  1      2     1.00                        cmpps	$0, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   cmpps	$0, (%rax), %xmm2
+# CHECK-NEXT:  1      2     1.00                        cmpss	$0, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   cmpss	$0, (%rax), %xmm2
+# CHECK-NEXT:  2      1     1.00                        comiss	%xmm0, %xmm1
+# CHECK-NEXT:  2      6     1.00    *                   comiss	(%rax), %xmm1
+# CHECK-NEXT:  2      4     1.00                        cvtpi2ps	%mm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   cvtpi2ps	(%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        cvtps2pi	%xmm0, %mm2
+# CHECK-NEXT:  1      9     1.00    *                   cvtps2pi	(%rax), %mm2
+# CHECK-NEXT:  2      4     1.00                        cvtsi2ssl	%ecx, %xmm2
+# CHECK-NEXT:  2      13    1.00                        cvtsi2ssq	%rcx, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   cvtsi2ssl	(%rax), %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   cvtsi2ssl	(%rax), %xmm2
+# CHECK-NEXT:  2      13    1.00                        cvtss2si	%xmm0, %ecx
+# CHECK-NEXT:  2      13    1.00                        cvtss2si	%xmm0, %rcx
+# CHECK-NEXT:  2      18    1.00    *                   cvtss2si	(%rax), %ecx
+# CHECK-NEXT:  2      18    1.00    *                   cvtss2si	(%rax), %rcx
+# CHECK-NEXT:  1      4     1.00                        cvttps2pi	%xmm0, %mm2
+# CHECK-NEXT:  1      9     1.00    *                   cvttps2pi	(%rax), %mm2
+# CHECK-NEXT:  2      13    1.00                        cvttss2si	%xmm0, %ecx
+# CHECK-NEXT:  2      13    1.00                        cvttss2si	%xmm0, %rcx
+# CHECK-NEXT:  2      18    1.00    *                   cvttss2si	(%rax), %ecx
+# CHECK-NEXT:  2      18    1.00    *                   cvttss2si	(%rax), %rcx
+# CHECK-NEXT:  1      9     9.50                        divps	%xmm0, %xmm2
+# CHECK-NEXT:  1      14    9.50    *                   divps	(%rax), %xmm2
+# CHECK-NEXT:  1      9     9.50                        divss	%xmm0, %xmm2
+# CHECK-NEXT:  1      14    9.50    *                   divss	(%rax), %xmm2
+# CHECK-NEXT:  1      5     0.50    *      *      U     ldmxcsr	(%rax)
+# CHECK-NEXT:  1      2     0.50    *      *      U     maskmovq	%mm0, %mm1
+# CHECK-NEXT:  1      2     1.00                        maxps	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   maxps	(%rax), %xmm2
+# CHECK-NEXT:  1      2     1.00                        maxss	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   maxss	(%rax), %xmm2
+# CHECK-NEXT:  1      2     1.00                        minps	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   minps	(%rax), %xmm2
+# CHECK-NEXT:  1      2     1.00                        minss	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   minss	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        movaps	%xmm0, %xmm2
 # CHECK-NEXT:  1      1     1.00           *            movaps	%xmm0, (%rax)
-# CHECK-NEXT:  1      6     0.50    *                   movaps	(%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        movhlps	%xmm0, %xmm2
-# CHECK-NEXT:  1      1     1.00                        movlhps	%xmm0, %xmm2
-# CHECK-NEXT:  1      1     1.00           *            movhps	%xmm0, (%rax)
-# CHECK-NEXT:  2      7     1.00    *                   movhps	(%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00           *            movlps	%xmm0, (%rax)
-# CHECK-NEXT:  2      7     1.00    *                   movlps	(%rax), %xmm2
-# CHECK-NEXT:  1      2     1.00                        movmskps	%xmm0, %ecx
-# CHECK-NEXT:  1      1     1.00           *            movntps	%xmm0, (%rax)
-# CHECK-NEXT:  1      1     1.00    *      *      U     movntq	%mm0, (%rax)
-# CHECK-NEXT:  1      1     1.00                        movss	%xmm0, %xmm2
-# CHECK-NEXT:  1      1     1.00           *            movss	%xmm0, (%rax)
-# CHECK-NEXT:  1      6     0.50    *                   movss	(%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        movups	%xmm0, %xmm2
+# CHECK-NEXT:  1      5     0.50    *                   movaps	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        movhlps	%xmm0, %xmm2
+# CHECK-NEXT:  1      2     0.50                        movlhps	%xmm0, %xmm2
+# CHECK-NEXT:  2      2     1.00           *            movhps	%xmm0, (%rax)
+# CHECK-NEXT:  1      7     0.50    *                   movhps	(%rax), %xmm2
+# CHECK-NEXT:  1      2     1.00           *            movlps	%xmm0, (%rax)
+# CHECK-NEXT:  1      7     0.50    *                   movlps	(%rax), %xmm2
+# CHECK-NEXT:  2      10    1.00                        movmskps	%xmm0, %ecx
+# CHECK-NEXT:  1      3     1.00           *            movntps	%xmm0, (%rax)
+# CHECK-NEXT:  1      2     1.00    *      *      U     movntq	%mm0, (%rax)
+# CHECK-NEXT:  1      2     0.50                        movss	%xmm0, %xmm2
+# CHECK-NEXT:  1      2     1.00           *            movss	%xmm0, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   movss	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        movups	%xmm0, %xmm2
 # CHECK-NEXT:  1      1     1.00           *            movups	%xmm0, (%rax)
-# CHECK-NEXT:  1      6     0.50    *                   movups	(%rax), %xmm2
+# CHECK-NEXT:  1      5     0.50    *                   movups	(%rax), %xmm2
 # CHECK-NEXT:  1      5     1.00                        mulps	%xmm0, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   mulps	(%rax), %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   mulps	(%rax), %xmm2
 # CHECK-NEXT:  1      5     1.00                        mulss	%xmm0, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   mulss	(%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        orps	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     1.00    *                   orps	(%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        pavgb	%mm0, %mm2
-# CHECK-NEXT:  2      8     1.00    *                   pavgb	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        pavgw	%mm0, %mm2
-# CHECK-NEXT:  2      8     1.00    *                   pavgw	(%rax), %mm2
-# CHECK-NEXT:  2      3     1.00                        pextrw	$1, %mm0, %ecx
-# CHECK-NEXT:  2      2     1.00                        pinsrw	$1, %eax, %mm2
-# CHECK-NEXT:  2      7     0.50    *                   pinsrw	$1, (%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        pmaxsw	%mm0, %mm2
-# CHECK-NEXT:  2      8     1.00    *                   pmaxsw	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        pmaxub	%mm0, %mm2
-# CHECK-NEXT:  2      8     1.00    *                   pmaxub	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        pminsw	%mm0, %mm2
-# CHECK-NEXT:  2      8     1.00    *                   pminsw	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        pminub	%mm0, %mm2
-# CHECK-NEXT:  2      8     1.00    *                   pminub	(%rax), %mm2
-# CHECK-NEXT:  1      2     1.00                        pmovmskb	%xmm0, %ecx
-# CHECK-NEXT:  1      5     1.00                        pmulhuw	%mm0, %mm2
-# CHECK-NEXT:  2      10    1.00    *                   pmulhuw	(%rax), %mm2
+# CHECK-NEXT:  1      10    1.00    *                   mulss	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        orps	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   orps	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pavgb	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   pavgb	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        pavgw	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   pavgw	(%rax), %mm2
+# CHECK-NEXT:  2      13    1.00                        pextrw	$1, %mm0, %ecx
+# CHECK-NEXT:  2      2     0.50                        pinsrw	$1, %eax, %mm2
+# CHECK-NEXT:  2      6     0.50    *                   pinsrw	$1, (%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        pmaxsw	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   pmaxsw	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        pmaxub	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   pmaxub	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        pminsw	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   pminsw	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        pminub	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   pminub	(%rax), %mm2
+# CHECK-NEXT:  2      13    1.00                        pmovmskb	%xmm0, %ecx
+# CHECK-NEXT:  1      4     1.00                        pmulhuw	%mm0, %mm2
+# CHECK-NEXT:  1      9     1.00    *                   pmulhuw	(%rax), %mm2
 # CHECK-NEXT:  1      5     0.50    *      *            prefetcht0	(%rax)
 # CHECK-NEXT:  1      5     0.50    *      *            prefetcht1	(%rax)
 # CHECK-NEXT:  1      5     0.50    *      *            prefetcht2	(%rax)
 # CHECK-NEXT:  1      5     0.50    *      *            prefetchnta	(%rax)
-# CHECK-NEXT:  1      5     1.00                        psadbw	%mm0, %mm2
-# CHECK-NEXT:  2      10    1.00    *                   psadbw	(%rax), %mm2
-# CHECK-NEXT:  1      1     1.00                        pshufw	$1, %mm0, %mm2
-# CHECK-NEXT:  2      6     1.00    *                   pshufw	$1, (%rax), %mm2
+# CHECK-NEXT:  2      4     0.50                        psadbw	%mm0, %mm2
+# CHECK-NEXT:  2      9     0.50    *                   psadbw	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        pshufw	$1, %mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   pshufw	$1, (%rax), %mm2
 # CHECK-NEXT:  1      5     1.00                        rcpps	%xmm0, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   rcpps	(%rax), %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   rcpps	(%rax), %xmm2
 # CHECK-NEXT:  1      5     1.00                        rcpss	%xmm0, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   rcpss	(%rax), %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   rcpss	(%rax), %xmm2
 # CHECK-NEXT:  1      5     1.00                        rsqrtps	%xmm0, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   rsqrtps	(%rax), %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   rsqrtps	(%rax), %xmm2
 # CHECK-NEXT:  1      5     1.00                        rsqrtss	%xmm0, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   rsqrtss	(%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00    *      *      U     sfence
-# CHECK-NEXT:  1      1     1.00                        shufps	$1, %xmm0, %xmm2
-# CHECK-NEXT:  2      7     1.00    *                   shufps	$1, (%rax), %xmm2
-# CHECK-NEXT:  1      14    14.00                       sqrtps	%xmm0, %xmm2
-# CHECK-NEXT:  2      20    14.00   *                   sqrtps	(%rax), %xmm2
-# CHECK-NEXT:  1      14    14.00                       sqrtss	%xmm0, %xmm2
-# CHECK-NEXT:  2      20    14.00   *                   sqrtss	(%rax), %xmm2
-# CHECK-NEXT:  4      5     1.00    *      *      U     stmxcsr	(%rax)
-# CHECK-NEXT:  1      3     1.00                        subps	%xmm0, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   subps	(%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        subss	%xmm0, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   subss	(%rax), %xmm2
-# CHECK-NEXT:  2      2     1.00                        ucomiss	%xmm0, %xmm1
-# CHECK-NEXT:  3      8     1.00    *                   ucomiss	(%rax), %xmm1
-# CHECK-NEXT:  1      1     1.00                        unpckhps	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     1.00    *                   unpckhps	(%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        unpcklps	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     1.00    *                   unpcklps	(%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        xorps	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     1.00    *                   xorps	(%rax), %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   rsqrtss	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50    *      *      U     sfence
+# CHECK-NEXT:  1      2     0.50                        shufps	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   shufps	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      9     10.50                       sqrtps	%xmm0, %xmm2
+# CHECK-NEXT:  1      14    10.50   *                   sqrtps	(%rax), %xmm2
+# CHECK-NEXT:  1      9     10.50                       sqrtss	%xmm0, %xmm2
+# CHECK-NEXT:  1      14    10.50   *                   sqrtss	(%rax), %xmm2
+# CHECK-NEXT:  2      1     0.50    *      *      U     stmxcsr	(%rax)
+# CHECK-NEXT:  1      5     1.00                        subps	%xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   subps	(%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        subss	%xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   subss	(%rax), %xmm2
+# CHECK-NEXT:  2      1     1.00                        ucomiss	%xmm0, %xmm1
+# CHECK-NEXT:  2      6     1.00    *                   ucomiss	(%rax), %xmm1
+# CHECK-NEXT:  1      2     0.50                        unpckhps	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   unpckhps	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        unpcklps	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   unpcklps	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        xorps	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   xorps	(%rax), %xmm2
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -     112.00 41.00  55.50  10.00  34.50  33.50  33.50
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 33.50  33.50   -      -      -     15.00   -      -     115.50 115.50 9.50   9.50   2.00   25.00  50.50  66.50   -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     addps	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   addps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     addss	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   addss	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     andnps	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   andnps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     andps	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   andps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     cmpps	$0, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   cmpps	$0, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     cmpss	$0, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   cmpss	$0, (%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     comiss	%xmm0, %xmm1
-# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   comiss	(%rax), %xmm1
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     cvtpi2ps	%mm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   cvtpi2ps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     cvtps2pi	%xmm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   cvtps2pi	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -     2.00    -      -     cvtsi2ssl	%ecx, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     2.00    -      -     cvtsi2ssq	%rcx, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   cvtsi2ssl	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   cvtsi2ssl	(%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     cvtss2si	%xmm0, %ecx
-# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     cvtss2si	%xmm0, %rcx
-# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   cvtss2si	(%rax), %ecx
-# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   cvtss2si	(%rax), %rcx
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     cvttps2pi	%xmm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   cvttps2pi	(%rax), %mm2
-# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     cvttss2si	%xmm0, %ecx
-# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     cvttss2si	%xmm0, %rcx
-# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   cvttss2si	(%rax), %ecx
-# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   cvttss2si	(%rax), %rcx
-# CHECK-NEXT:  -     14.00  1.00    -      -      -      -      -     divps	%xmm0, %xmm2
-# CHECK-NEXT:  -     14.00  1.00    -      -      -     0.50   0.50   divps	(%rax), %xmm2
-# CHECK-NEXT:  -     14.00  1.00    -      -      -      -      -     divss	%xmm0, %xmm2
-# CHECK-NEXT:  -     14.00  1.00    -      -      -     0.50   0.50   divss	(%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   0.50   0.50   ldmxcsr	(%rax)
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     maskmovq	%mm0, %mm1
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     maxps	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   maxps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     maxss	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   maxss	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     minps	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   minps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     minss	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   minss	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     movaps	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movaps	%xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movaps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     movhlps	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     movlhps	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movhps	%xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   movhps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movlps	%xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   movlps	(%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     movmskps	%xmm0, %ecx
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movntps	%xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movntq	%mm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     movss	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movss	%xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movss	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     movups	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movups	%xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movups	(%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     mulps	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   mulps	(%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     mulss	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   mulss	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     orps	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   orps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pavgb	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pavgb	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pavgw	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pavgw	(%rax), %mm2
-# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     pextrw	$1, %mm0, %ecx
-# CHECK-NEXT:  -      -      -     0.50    -     1.50    -      -     pinsrw	$1, %eax, %mm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pinsrw	$1, (%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pmaxsw	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pmaxsw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pmaxub	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pmaxub	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pminsw	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pminsw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pminub	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   pminub	(%rax), %mm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pmovmskb	%xmm0, %ecx
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pmulhuw	%mm0, %mm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   pmulhuw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   prefetcht0	(%rax)
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   prefetcht1	(%rax)
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   prefetcht2	(%rax)
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   prefetchnta	(%rax)
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     psadbw	%mm0, %mm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   psadbw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     pshufw	$1, %mm0, %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   pshufw	$1, (%rax), %mm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     rcpps	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   rcpps	(%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     rcpss	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   rcpss	(%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     rsqrtps	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   rsqrtps	(%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     rsqrtss	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   rsqrtss	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   sfence
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     shufps	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   shufps	$1, (%rax), %xmm2
-# CHECK-NEXT:  -     14.00  1.00    -      -      -      -      -     sqrtps	%xmm0, %xmm2
-# CHECK-NEXT:  -     14.00  1.00    -      -      -     0.50   0.50   sqrtps	(%rax), %xmm2
-# CHECK-NEXT:  -     14.00  1.00    -      -      -      -      -     sqrtss	%xmm0, %xmm2
-# CHECK-NEXT:  -     14.00  1.00    -      -      -     0.50   0.50   sqrtss	(%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   0.50   0.50   stmxcsr	(%rax)
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     subps	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   subps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     subss	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   subss	(%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     ucomiss	%xmm0, %xmm1
-# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   ucomiss	(%rax), %xmm1
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     unpckhps	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   unpckhps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     unpcklps	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   unpcklps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     xorps	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   xorps	(%rax), %xmm2
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     addps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     addps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     addss	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     addss	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     andnps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     andnps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     andps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     andps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     cmpps	$0, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     cmpps	$0, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     cmpss	$0, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     cmpss	$0, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     comiss	%xmm0, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     comiss	(%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtpi2ps	%mm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtpi2ps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtps2pi	%xmm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtps2pi	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtsi2ssl	%ecx, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtsi2ssq	%rcx, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtsi2ssl	(%rax), %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtsi2ssl	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvtss2si	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvtss2si	%xmm0, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvtss2si	(%rax), %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvtss2si	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvttps2pi	%xmm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvttps2pi	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvttss2si	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvttss2si	%xmm0, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvttss2si	(%rax), %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvttss2si	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     divps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     divps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     divss	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     divss	(%rax), %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     ldmxcsr	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     maskmovq	%mm0, %mm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     maxps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     maxps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     maxss	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     maxss	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     minps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     minps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     minss	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     minss	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movaps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movaps	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movaps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movhlps	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movlhps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movhps	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movhps	(%rax), %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movlps	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movlps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     movmskps	%xmm0, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movntps	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movntq	%mm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movss	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movss	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movss	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movups	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movups	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movups	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     mulps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     mulps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     mulss	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     mulss	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     orps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     orps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pavgb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pavgb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pavgw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pavgw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pextrw	$1, %mm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pinsrw	$1, %eax, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pinsrw	$1, (%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxsw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxsw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxub	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxub	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminsw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminsw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminub	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminub	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pmovmskb	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmulhuw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmulhuw	(%rax), %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     prefetcht0	(%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     prefetcht1	(%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     prefetcht2	(%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     prefetchnta	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psadbw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psadbw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pshufw	$1, %mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pshufw	$1, (%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     rcpps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     rcpps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     rcpss	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     rcpss	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     rsqrtps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     rsqrtps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     rsqrtss	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     rsqrtss	(%rax), %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     sfence
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     shufps	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     shufps	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     10.50  10.50   -      -      -      -      -     1.00    -      -      -      -     sqrtps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     10.50  10.50   -      -      -      -      -     1.00    -      -      -      -     sqrtps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     10.50  10.50   -      -      -      -      -     1.00    -      -      -      -     sqrtss	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     10.50  10.50   -      -      -      -      -     1.00    -      -      -      -     sqrtss	(%rax), %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     stmxcsr	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     subps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     subps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     subss	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     subss	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     ucomiss	%xmm0, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     ucomiss	(%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     unpckhps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     unpckhps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     unpcklps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     unpcklps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     xorps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     xorps	(%rax), %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-sse2.s b/test/tools/llvm-mca/X86/BdVer2/resources-sse2.s
index 30534807cb3..23be05e554a 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-sse2.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-sse2.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
 
 addpd       %xmm0, %xmm2
 addpd       (%rax), %xmm2
@@ -402,548 +402,560 @@ xorpd       (%rax), %xmm2
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      3     1.00                        addpd	%xmm0, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   addpd	(%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        addsd	%xmm0, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   addsd	(%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        andnpd	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     1.00    *                   andnpd	(%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        andpd	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     1.00    *                   andpd	(%rax), %xmm2
-# CHECK-NEXT:  4      5     1.00    *      *      U     clflush	(%rax)
-# CHECK-NEXT:  1      3     1.00                        cmppd	$0, %xmm0, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   cmppd	$0, (%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        cmpsd	$0, %xmm0, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   cmpsd	$0, (%rax), %xmm2
-# CHECK-NEXT:  2      2     1.00                        comisd	%xmm0, %xmm1
-# CHECK-NEXT:  3      8     1.00    *                   comisd	(%rax), %xmm1
-# CHECK-NEXT:  2      4     1.00                        cvtdq2pd	%xmm0, %xmm2
-# CHECK-NEXT:  3      10    1.00    *                   cvtdq2pd	(%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        cvtdq2ps	%xmm0, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   cvtdq2ps	(%rax), %xmm2
-# CHECK-NEXT:  2      4     1.00                        cvtpd2dq	%xmm0, %xmm2
-# CHECK-NEXT:  3      10    1.00    *                   cvtpd2dq	(%rax), %xmm2
-# CHECK-NEXT:  2      4     1.00                        cvtpd2pi	%xmm0, %mm2
-# CHECK-NEXT:  3      10    1.00    *                   cvtpd2pi	(%rax), %mm2
-# CHECK-NEXT:  2      4     1.00                        cvtpd2ps	%xmm0, %xmm2
-# CHECK-NEXT:  3      10    1.00    *                   cvtpd2ps	(%rax), %xmm2
-# CHECK-NEXT:  2      4     1.00                        cvtpi2pd	%mm0, %xmm2
-# CHECK-NEXT:  3      10    1.00    *                   cvtpi2pd	(%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        cvtps2dq	%xmm0, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   cvtps2dq	(%rax), %xmm2
-# CHECK-NEXT:  2      2     1.00                        cvtps2pd	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     1.00    *                   cvtps2pd	(%rax), %xmm2
-# CHECK-NEXT:  2      5     1.00                        cvtsd2si	%xmm0, %ecx
-# CHECK-NEXT:  2      5     1.00                        cvtsd2si	%xmm0, %rcx
-# CHECK-NEXT:  3      9     1.00    *                   cvtsd2si	(%rax), %ecx
-# CHECK-NEXT:  3      9     1.00    *                   cvtsd2si	(%rax), %rcx
-# CHECK-NEXT:  2      4     1.00                        cvtsd2ss	%xmm0, %xmm2
-# CHECK-NEXT:  3      10    1.00    *                   cvtsd2ss	(%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        addpd	%xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   addpd	(%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        addsd	%xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   addsd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        andnpd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   andnpd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        andpd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   andpd	(%rax), %xmm2
+# CHECK-NEXT:  1      5     0.50    *      *      U     clflush	(%rax)
+# CHECK-NEXT:  1      2     1.00                        cmppd	$0, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   cmppd	$0, (%rax), %xmm2
+# CHECK-NEXT:  1      2     1.00                        cmpsd	$0, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   cmpsd	$0, (%rax), %xmm2
+# CHECK-NEXT:  2      1     1.00                        comisd	%xmm0, %xmm1
+# CHECK-NEXT:  2      6     1.00    *                   comisd	(%rax), %xmm1
+# CHECK-NEXT:  2      8     1.00                        cvtdq2pd	%xmm0, %xmm2
+# CHECK-NEXT:  2      13    1.00    *                   cvtdq2pd	(%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        cvtdq2ps	%xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   cvtdq2ps	(%rax), %xmm2
+# CHECK-NEXT:  2      8     1.00                        cvtpd2dq	%xmm0, %xmm2
+# CHECK-NEXT:  2      13    1.00    *                   cvtpd2dq	(%rax), %xmm2
+# CHECK-NEXT:  2      6     1.00                        cvtpd2pi	%xmm0, %mm2
+# CHECK-NEXT:  2      13    1.00    *                   cvtpd2pi	(%rax), %mm2
+# CHECK-NEXT:  2      8     1.00                        cvtpd2ps	%xmm0, %xmm2
+# CHECK-NEXT:  2      13    1.00    *                   cvtpd2ps	(%rax), %xmm2
+# CHECK-NEXT:  2      6     1.00                        cvtpi2pd	%mm0, %xmm2
+# CHECK-NEXT:  2      13    1.00    *                   cvtpi2pd	(%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        cvtps2dq	%xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   cvtps2dq	(%rax), %xmm2
+# CHECK-NEXT:  2      8     1.00                        cvtps2pd	%xmm0, %xmm2
+# CHECK-NEXT:  2      13    1.00    *                   cvtps2pd	(%rax), %xmm2
+# CHECK-NEXT:  2      13    1.00                        cvtsd2si	%xmm0, %ecx
+# CHECK-NEXT:  2      13    1.00                        cvtsd2si	%xmm0, %rcx
+# CHECK-NEXT:  2      18    1.00    *                   cvtsd2si	(%rax), %ecx
+# CHECK-NEXT:  2      18    1.00    *                   cvtsd2si	(%rax), %rcx
+# CHECK-NEXT:  1      4     1.00                        cvtsd2ss	%xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   cvtsd2ss	(%rax), %xmm2
 # CHECK-NEXT:  2      4     1.00                        cvtsi2sdl	%ecx, %xmm2
-# CHECK-NEXT:  2      4     1.00                        cvtsi2sdq	%rcx, %xmm2
+# CHECK-NEXT:  2      13    1.00                        cvtsi2sdq	%rcx, %xmm2
 # CHECK-NEXT:  2      9     1.00    *                   cvtsi2sdl	(%rax), %xmm2
 # CHECK-NEXT:  2      9     1.00    *                   cvtsi2sdl	(%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        cvtss2sd	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     1.00    *                   cvtss2sd	(%rax), %xmm2
-# CHECK-NEXT:  2      4     1.00                        cvttpd2dq	%xmm0, %xmm2
-# CHECK-NEXT:  3      10    1.00    *                   cvttpd2dq	(%rax), %xmm2
-# CHECK-NEXT:  2      4     1.00                        cvttpd2pi	%xmm0, %mm2
-# CHECK-NEXT:  3      10    1.00    *                   cvttpd2pi	(%rax), %mm2
-# CHECK-NEXT:  1      3     1.00                        cvttps2dq	%xmm0, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   cvttps2dq	(%rax), %xmm2
-# CHECK-NEXT:  2      5     1.00                        cvttsd2si	%xmm0, %ecx
-# CHECK-NEXT:  2      5     1.00                        cvttsd2si	%xmm0, %rcx
-# CHECK-NEXT:  3      9     1.00    *                   cvttsd2si	(%rax), %ecx
-# CHECK-NEXT:  3      9     1.00    *                   cvttsd2si	(%rax), %rcx
-# CHECK-NEXT:  1      22    22.00                       divpd	%xmm0, %xmm2
-# CHECK-NEXT:  2      28    22.00   *                   divpd	(%rax), %xmm2
-# CHECK-NEXT:  1      22    22.00                       divsd	%xmm0, %xmm2
-# CHECK-NEXT:  2      28    22.00   *                   divsd	(%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00    *      *      U     lfence
+# CHECK-NEXT:  1      4     1.00                        cvtss2sd	%xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   cvtss2sd	(%rax), %xmm2
+# CHECK-NEXT:  2      8     1.00                        cvttpd2dq	%xmm0, %xmm2
+# CHECK-NEXT:  2      13    1.00    *                   cvttpd2dq	(%rax), %xmm2
+# CHECK-NEXT:  2      6     1.00                        cvttpd2pi	%xmm0, %mm2
+# CHECK-NEXT:  2      13    1.00    *                   cvttpd2pi	(%rax), %mm2
+# CHECK-NEXT:  1      4     1.00                        cvttps2dq	%xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   cvttps2dq	(%rax), %xmm2
+# CHECK-NEXT:  2      13    1.00                        cvttsd2si	%xmm0, %ecx
+# CHECK-NEXT:  2      13    1.00                        cvttsd2si	%xmm0, %rcx
+# CHECK-NEXT:  2      18    1.00    *                   cvttsd2si	(%rax), %ecx
+# CHECK-NEXT:  2      18    1.00    *                   cvttsd2si	(%rax), %rcx
+# CHECK-NEXT:  1      9     9.50                        divpd	%xmm0, %xmm2
+# CHECK-NEXT:  1      14    9.50    *                   divpd	(%rax), %xmm2
+# CHECK-NEXT:  1      9     9.50                        divsd	%xmm0, %xmm2
+# CHECK-NEXT:  1      14    9.50    *                   divsd	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50    *      *      U     lfence
 # CHECK-NEXT:  1      1     1.00    *      *      U     maskmovdqu	%xmm0, %xmm1
-# CHECK-NEXT:  1      3     1.00                        maxpd	%xmm0, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   maxpd	(%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        maxsd	%xmm0, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   maxsd	(%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        minpd	%xmm0, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   minpd	(%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        minsd	%xmm0, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   minsd	(%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        movapd	%xmm0, %xmm2
+# CHECK-NEXT:  1      2     1.00                        maxpd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   maxpd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     1.00                        maxsd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   maxsd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     1.00                        minpd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   minpd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     1.00                        minsd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   minsd	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        movapd	%xmm0, %xmm2
 # CHECK-NEXT:  1      1     1.00           *            movapd	%xmm0, (%rax)
-# CHECK-NEXT:  1      6     0.50    *                   movapd	(%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        movd	%eax, %xmm2
-# CHECK-NEXT:  1      6     0.50    *                   movd	(%rax), %xmm2
-# CHECK-NEXT:  1      2     1.00                        movd	%xmm0, %ecx
-# CHECK-NEXT:  1      1     1.00           *            movd	%xmm0, (%rax)
-# CHECK-NEXT:  1      1     0.33                        movdqa	%xmm0, %xmm2
+# CHECK-NEXT:  1      5     0.50    *                   movapd	(%rax), %xmm2
+# CHECK-NEXT:  2      10    0.50                        movd	%eax, %xmm2
+# CHECK-NEXT:  1      5     0.50    *                   movd	(%rax), %xmm2
+# CHECK-NEXT:  1      10    1.00                        movd	%xmm0, %ecx
+# CHECK-NEXT:  1      2     1.00           *            movd	%xmm0, (%rax)
+# CHECK-NEXT:  1      2     0.50                        movdqa	%xmm0, %xmm2
 # CHECK-NEXT:  1      1     1.00           *            movdqa	%xmm0, (%rax)
-# CHECK-NEXT:  1      6     0.50    *                   movdqa	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.33                        movdqu	%xmm0, %xmm2
+# CHECK-NEXT:  1      5     0.50    *                   movdqa	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        movdqu	%xmm0, %xmm2
 # CHECK-NEXT:  1      1     1.00           *            movdqu	%xmm0, (%rax)
-# CHECK-NEXT:  1      6     0.50    *                   movdqu	(%rax), %xmm2
-# CHECK-NEXT:  2      2     1.00                        movdq2q	%xmm0, %mm2
-# CHECK-NEXT:  1      1     1.00           *            movhpd	%xmm0, (%rax)
-# CHECK-NEXT:  2      7     1.00    *                   movhpd	(%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00           *            movlpd	%xmm0, (%rax)
-# CHECK-NEXT:  2      7     1.00    *                   movlpd	(%rax), %xmm2
-# CHECK-NEXT:  1      2     1.00                        movmskpd	%xmm0, %ecx
-# CHECK-NEXT:  1      1     1.00           *            movntil	%eax, (%rax)
-# CHECK-NEXT:  1      1     1.00           *            movntiq	%rax, (%rax)
-# CHECK-NEXT:  1      1     1.00           *            movntdq	%xmm0, (%rax)
-# CHECK-NEXT:  1      1     1.00           *            movntpd	%xmm0, (%rax)
-# CHECK-NEXT:  1      1     0.33                        movq	%xmm0, %xmm2
-# CHECK-NEXT:  1      1     1.00                        movq	%rax, %xmm2
-# CHECK-NEXT:  1      6     0.50    *                   movq	(%rax), %xmm2
-# CHECK-NEXT:  1      2     1.00                        movq	%xmm0, %rcx
-# CHECK-NEXT:  1      1     1.00           *            movq	%xmm0, (%rax)
-# CHECK-NEXT:  1      1     0.33                        movq2dq	%mm0, %xmm2
-# CHECK-NEXT:  1      1     1.00                        movsd	%xmm0, %xmm2
-# CHECK-NEXT:  1      1     1.00           *            movsd	%xmm0, (%rax)
-# CHECK-NEXT:  1      6     0.50    *                   movsd	(%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        movupd	%xmm0, %xmm2
+# CHECK-NEXT:  1      5     0.50    *                   movdqu	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        movdq2q	%xmm0, %mm2
+# CHECK-NEXT:  2      2     1.00           *            movhpd	%xmm0, (%rax)
+# CHECK-NEXT:  1      7     0.50    *                   movhpd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     1.00           *            movlpd	%xmm0, (%rax)
+# CHECK-NEXT:  1      7     0.50    *                   movlpd	(%rax), %xmm2
+# CHECK-NEXT:  2      10    1.00                        movmskpd	%xmm0, %ecx
+# CHECK-NEXT:  1      1     0.50           *            movntil	%eax, (%rax)
+# CHECK-NEXT:  1      1     0.50           *            movntiq	%rax, (%rax)
+# CHECK-NEXT:  1      2     1.00           *            movntdq	%xmm0, (%rax)
+# CHECK-NEXT:  1      3     1.00           *            movntpd	%xmm0, (%rax)
+# CHECK-NEXT:  1      2     0.50                        movq	%xmm0, %xmm2
+# CHECK-NEXT:  2      10    0.50                        movq	%rax, %xmm2
+# CHECK-NEXT:  1      5     0.50    *                   movq	(%rax), %xmm2
+# CHECK-NEXT:  1      10    1.00                        movq	%xmm0, %rcx
+# CHECK-NEXT:  1      2     1.00           *            movq	%xmm0, (%rax)
+# CHECK-NEXT:  1      2     0.50                        movq2dq	%mm0, %xmm2
+# CHECK-NEXT:  1      2     0.50                        movsd	%xmm0, %xmm2
+# CHECK-NEXT:  1      2     1.00           *            movsd	%xmm0, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   movsd	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        movupd	%xmm0, %xmm2
 # CHECK-NEXT:  1      1     1.00           *            movupd	%xmm0, (%rax)
-# CHECK-NEXT:  1      6     0.50    *                   movupd	(%rax), %xmm2
+# CHECK-NEXT:  1      5     0.50    *                   movupd	(%rax), %xmm2
 # CHECK-NEXT:  1      5     1.00                        mulpd	%xmm0, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   mulpd	(%rax), %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   mulpd	(%rax), %xmm2
 # CHECK-NEXT:  1      5     1.00                        mulsd	%xmm0, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   mulsd	(%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        orpd	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     1.00    *                   orpd	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        packssdw	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   packssdw	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        packsswb	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   packsswb	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        packuswb	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   packuswb	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        paddb	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   paddb	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        paddd	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   paddd	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        paddq	%mm0, %mm2
-# CHECK-NEXT:  2      7     0.50    *                   paddq	(%rax), %mm2
-# CHECK-NEXT:  1      1     0.50                        paddq	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   paddq	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        paddsb	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   paddsb	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        paddsw	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   paddsw	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        paddusb	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   paddusb	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        paddusw	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   paddusw	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        paddw	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   paddw	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.33                        pand	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pand	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.33                        pandn	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pandn	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        pavgb	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pavgb	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        pavgw	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pavgw	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        pcmpeqb	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pcmpeqb	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        pcmpeqd	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pcmpeqd	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        pcmpeqw	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pcmpeqw	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        pcmpgtb	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pcmpgtb	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        pcmpgtd	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pcmpgtd	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        pcmpgtw	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pcmpgtw	(%rax), %xmm2
-# CHECK-NEXT:  2      3     1.00                        pextrw	$1, %xmm0, %ecx
-# CHECK-NEXT:  1      5     1.00                        pmaddwd	%xmm0, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   pmaddwd	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        pmaxsw	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pmaxsw	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        pmaxub	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pmaxub	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        pminsw	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pminsw	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        pminub	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pminub	(%rax), %xmm2
-# CHECK-NEXT:  1      2     1.00                        pmovmskb	%xmm0, %ecx
-# CHECK-NEXT:  1      5     1.00                        pmulhuw	%xmm0, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   pmulhuw	(%rax), %xmm2
-# CHECK-NEXT:  1      5     1.00                        pmulhw	%xmm0, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   pmulhw	(%rax), %xmm2
-# CHECK-NEXT:  1      5     1.00                        pmullw	%xmm0, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   pmullw	(%rax), %xmm2
-# CHECK-NEXT:  1      5     1.00                        pmuludq	%mm0, %mm2
-# CHECK-NEXT:  2      10    1.00    *                   pmuludq	(%rax), %mm2
-# CHECK-NEXT:  1      5     1.00                        pmuludq	%xmm0, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   pmuludq	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.33                        por	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   por	(%rax), %xmm2
-# CHECK-NEXT:  1      5     1.00                        psadbw	%xmm0, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   psadbw	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        pshufd	$1, %xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pshufd	$1, (%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        pshufhw	$1, %xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pshufhw	$1, (%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        pshuflw	$1, %xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pshuflw	$1, (%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        pslld	$1, %xmm2
-# CHECK-NEXT:  2      2     1.00                        pslld	%xmm0, %xmm2
-# CHECK-NEXT:  3      8     1.00    *                   pslld	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        pslldq	$1, %xmm2
-# CHECK-NEXT:  1      1     1.00                        psllq	$1, %xmm2
-# CHECK-NEXT:  2      2     1.00                        psllq	%xmm0, %xmm2
-# CHECK-NEXT:  3      8     1.00    *                   psllq	(%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        psllw	$1, %xmm2
-# CHECK-NEXT:  2      2     1.00                        psllw	%xmm0, %xmm2
-# CHECK-NEXT:  3      8     1.00    *                   psllw	(%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        psrad	$1, %xmm2
-# CHECK-NEXT:  2      2     1.00                        psrad	%xmm0, %xmm2
-# CHECK-NEXT:  3      8     1.00    *                   psrad	(%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        psraw	$1, %xmm2
-# CHECK-NEXT:  2      2     1.00                        psraw	%xmm0, %xmm2
-# CHECK-NEXT:  3      8     1.00    *                   psraw	(%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        psrld	$1, %xmm2
-# CHECK-NEXT:  2      2     1.00                        psrld	%xmm0, %xmm2
-# CHECK-NEXT:  3      8     1.00    *                   psrld	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        psrldq	$1, %xmm2
-# CHECK-NEXT:  1      1     1.00                        psrlq	$1, %xmm2
-# CHECK-NEXT:  2      2     1.00                        psrlq	%xmm0, %xmm2
-# CHECK-NEXT:  3      8     1.00    *                   psrlq	(%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        psrlw	$1, %xmm2
-# CHECK-NEXT:  2      2     1.00                        psrlw	%xmm0, %xmm2
-# CHECK-NEXT:  3      8     1.00    *                   psrlw	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        psubb	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   psubb	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        psubd	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   psubd	(%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        psubq	%mm0, %mm2
-# CHECK-NEXT:  2      8     1.00    *                   psubq	(%rax), %mm2
-# CHECK-NEXT:  1      1     0.50                        psubq	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   psubq	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        psubsb	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   psubsb	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        psubsw	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   psubsw	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        psubusb	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   psubusb	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        psubusw	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   psubusw	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        psubw	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   psubw	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        punpckhbw	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   punpckhbw	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        punpckhdq	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   punpckhdq	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        punpckhqdq	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   punpckhqdq	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        punpckhwd	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   punpckhwd	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        punpcklbw	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   punpcklbw	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        punpckldq	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   punpckldq	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        punpcklqdq	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   punpcklqdq	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        punpcklwd	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   punpcklwd	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.33                        pxor	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pxor	(%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        shufpd	$1, %xmm0, %xmm2
-# CHECK-NEXT:  2      7     1.00    *                   shufpd	$1, (%rax), %xmm2
-# CHECK-NEXT:  1      21    21.00                       sqrtpd	%xmm0, %xmm2
-# CHECK-NEXT:  2      27    21.00   *                   sqrtpd	(%rax), %xmm2
-# CHECK-NEXT:  1      21    21.00                       sqrtsd	%xmm0, %xmm2
-# CHECK-NEXT:  2      27    21.00   *                   sqrtsd	(%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        subpd	%xmm0, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   subpd	(%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        subsd	%xmm0, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   subsd	(%rax), %xmm2
-# CHECK-NEXT:  2      2     1.00                        ucomisd	%xmm0, %xmm1
-# CHECK-NEXT:  3      8     1.00    *                   ucomisd	(%rax), %xmm1
-# CHECK-NEXT:  1      1     1.00                        unpckhpd	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     1.00    *                   unpckhpd	(%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        unpcklpd	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     1.00    *                   unpcklpd	(%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        xorpd	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     1.00    *                   xorpd	(%rax), %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   mulsd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        orpd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   orpd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        packssdw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   packssdw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        packsswb	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   packsswb	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        packuswb	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   packuswb	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        paddb	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   paddb	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        paddd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   paddd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        paddq	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   paddq	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        paddq	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   paddq	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        paddsb	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   paddsb	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        paddsw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   paddsw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        paddusb	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   paddusb	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        paddusw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   paddusw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        paddw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   paddw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pand	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pand	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pandn	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pandn	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pavgb	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pavgb	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pavgw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pavgw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pcmpeqb	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pcmpeqb	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pcmpeqd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pcmpeqd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pcmpeqw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pcmpeqw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pcmpgtb	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pcmpgtb	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pcmpgtd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pcmpgtd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pcmpgtw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pcmpgtw	(%rax), %xmm2
+# CHECK-NEXT:  2      13    1.00                        pextrw	$1, %xmm0, %ecx
+# CHECK-NEXT:  1      4     1.00                        pmaddwd	%xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   pmaddwd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pmaxsw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pmaxsw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pmaxub	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pmaxub	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pminsw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pminsw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pminub	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pminub	(%rax), %xmm2
+# CHECK-NEXT:  2      13    1.00                        pmovmskb	%xmm0, %ecx
+# CHECK-NEXT:  1      4     1.00                        pmulhuw	%xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   pmulhuw	(%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        pmulhw	%xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   pmulhw	(%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        pmullw	%xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   pmullw	(%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        pmuludq	%mm0, %mm2
+# CHECK-NEXT:  1      9     1.00    *                   pmuludq	(%rax), %mm2
+# CHECK-NEXT:  1      4     1.00                        pmuludq	%xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   pmuludq	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        por	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   por	(%rax), %xmm2
+# CHECK-NEXT:  2      4     0.50                        psadbw	%xmm0, %xmm2
+# CHECK-NEXT:  2      9     0.50    *                   psadbw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pshufd	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pshufd	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pshufhw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pshufhw	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pshuflw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pshuflw	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pslld	$1, %xmm2
+# CHECK-NEXT:  1      3     0.50                        pslld	%xmm0, %xmm2
+# CHECK-NEXT:  1      8     0.50    *                   pslld	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pslldq	$1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        psllq	$1, %xmm2
+# CHECK-NEXT:  1      3     0.50                        psllq	%xmm0, %xmm2
+# CHECK-NEXT:  1      8     0.50    *                   psllq	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        psllw	$1, %xmm2
+# CHECK-NEXT:  1      3     0.50                        psllw	%xmm0, %xmm2
+# CHECK-NEXT:  1      8     0.50    *                   psllw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        psrad	$1, %xmm2
+# CHECK-NEXT:  1      3     0.50                        psrad	%xmm0, %xmm2
+# CHECK-NEXT:  1      8     0.50    *                   psrad	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        psraw	$1, %xmm2
+# CHECK-NEXT:  1      3     0.50                        psraw	%xmm0, %xmm2
+# CHECK-NEXT:  1      8     0.50    *                   psraw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        psrld	$1, %xmm2
+# CHECK-NEXT:  1      3     0.50                        psrld	%xmm0, %xmm2
+# CHECK-NEXT:  1      8     0.50    *                   psrld	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        psrldq	$1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        psrlq	$1, %xmm2
+# CHECK-NEXT:  1      3     0.50                        psrlq	%xmm0, %xmm2
+# CHECK-NEXT:  1      8     0.50    *                   psrlq	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        psrlw	$1, %xmm2
+# CHECK-NEXT:  1      3     0.50                        psrlw	%xmm0, %xmm2
+# CHECK-NEXT:  1      8     0.50    *                   psrlw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        psubb	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   psubb	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        psubd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   psubd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        psubq	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   psubq	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        psubq	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   psubq	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        psubsb	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   psubsb	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        psubsw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   psubsw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        psubusb	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   psubusb	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        psubusw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   psubusw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        psubw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   psubw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        punpckhbw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   punpckhbw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        punpckhdq	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   punpckhdq	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        punpckhqdq	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   punpckhqdq	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        punpckhwd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   punpckhwd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        punpcklbw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   punpcklbw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        punpckldq	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   punpckldq	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        punpcklqdq	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   punpcklqdq	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        punpcklwd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   punpcklwd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pxor	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pxor	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        shufpd	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   shufpd	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      9     13.50                       sqrtpd	%xmm0, %xmm2
+# CHECK-NEXT:  1      14    13.50   *                   sqrtpd	(%rax), %xmm2
+# CHECK-NEXT:  1      9     13.50                       sqrtsd	%xmm0, %xmm2
+# CHECK-NEXT:  1      14    13.50   *                   sqrtsd	(%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        subpd	%xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   subpd	(%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        subsd	%xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   subsd	(%rax), %xmm2
+# CHECK-NEXT:  2      1     1.00                        ucomisd	%xmm0, %xmm1
+# CHECK-NEXT:  2      6     1.00    *                   ucomisd	(%rax), %xmm1
+# CHECK-NEXT:  1      2     0.50                        unpckhpd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   unpckhpd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        unpcklpd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   unpcklpd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        xorpd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   xorpd	(%rax), %xmm2
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -     172.00 75.83  117.33 16.00  98.83  66.00  66.00
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 66.00  66.00   -      -      -     17.00   -      -     124.50 124.50 66.50  66.50  12.00  50.00  119.50 140.50  -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     addpd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   addpd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     addsd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   addsd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     andnpd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   andnpd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     andpd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   andpd	(%rax), %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50   1.00   1.00   0.50   0.50   clflush	(%rax)
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     cmppd	$0, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   cmppd	$0, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     cmpsd	$0, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   cmpsd	$0, (%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     comisd	%xmm0, %xmm1
-# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   comisd	(%rax), %xmm1
-# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     cvtdq2pd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   cvtdq2pd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     cvtdq2ps	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   cvtdq2ps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     cvtpd2dq	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   cvtpd2dq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     cvtpd2pi	%xmm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   cvtpd2pi	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     cvtpd2ps	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   cvtpd2ps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     cvtpi2pd	%mm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   cvtpi2pd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     cvtps2dq	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   cvtps2dq	(%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     cvtps2pd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   cvtps2pd	(%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     cvtsd2si	%xmm0, %ecx
-# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     cvtsd2si	%xmm0, %rcx
-# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   cvtsd2si	(%rax), %ecx
-# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   cvtsd2si	(%rax), %rcx
-# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     cvtsd2ss	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   cvtsd2ss	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     cvtsi2sdl	%ecx, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     cvtsi2sdq	%rcx, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   cvtsi2sdl	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   cvtsi2sdl	(%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     cvtss2sd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   cvtss2sd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     cvttpd2dq	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   cvttpd2dq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     cvttpd2pi	%xmm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   cvttpd2pi	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     cvttps2dq	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   cvttps2dq	(%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     cvttsd2si	%xmm0, %ecx
-# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     cvttsd2si	%xmm0, %rcx
-# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   cvttsd2si	(%rax), %ecx
-# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   cvttsd2si	(%rax), %rcx
-# CHECK-NEXT:  -     22.00  1.00    -      -      -      -      -     divpd	%xmm0, %xmm2
-# CHECK-NEXT:  -     22.00  1.00    -      -      -     0.50   0.50   divpd	(%rax), %xmm2
-# CHECK-NEXT:  -     22.00  1.00    -      -      -      -      -     divsd	%xmm0, %xmm2
-# CHECK-NEXT:  -     22.00  1.00    -      -      -     0.50   0.50   divsd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   lfence
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   maskmovdqu	%xmm0, %xmm1
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     maxpd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   maxpd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     maxsd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   maxsd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     minpd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   minpd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     minsd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   minsd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     movapd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movapd	%xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movapd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     movd	%eax, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movd	(%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     movd	%xmm0, %ecx
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movd	%xmm0, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movdqa	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movdqa	%xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movdqa	(%rax), %xmm2
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movdqu	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movdqu	%xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movdqu	(%rax), %xmm2
-# CHECK-NEXT:  -      -     0.33   0.33    -     1.33    -      -     movdq2q	%xmm0, %mm2
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movhpd	%xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   movhpd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movlpd	%xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   movlpd	(%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     movmskpd	%xmm0, %ecx
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movntil	%eax, (%rax)
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movntiq	%rax, (%rax)
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movntdq	%xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movntpd	%xmm0, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movq	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     movq	%rax, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movq	(%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     movq	%xmm0, %rcx
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movq	%xmm0, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movq2dq	%mm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     movsd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movsd	%xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movsd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     movupd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movupd	%xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movupd	(%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     mulpd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   mulpd	(%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     mulsd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   mulsd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     orpd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   orpd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     packssdw	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   packssdw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     packsswb	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   packsswb	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     packuswb	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   packuswb	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     paddb	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   paddb	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     paddd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   paddd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     paddq	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   paddq	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     paddq	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   paddq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     paddsb	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   paddsb	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     paddsw	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   paddsw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     paddusb	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   paddusb	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     paddusw	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   paddusw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     paddw	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   paddw	(%rax), %xmm2
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     pand	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   pand	(%rax), %xmm2
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     pandn	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   pandn	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pavgb	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pavgb	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pavgw	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pavgw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pcmpeqb	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pcmpeqb	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pcmpeqd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pcmpeqd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pcmpeqw	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pcmpeqw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pcmpgtb	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pcmpgtb	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pcmpgtd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pcmpgtd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pcmpgtw	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pcmpgtw	(%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     pextrw	$1, %xmm0, %ecx
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pmaddwd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   pmaddwd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pmaxsw	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pmaxsw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pmaxub	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pmaxub	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pminsw	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pminsw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pminub	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pminub	(%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pmovmskb	%xmm0, %ecx
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pmulhuw	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   pmulhuw	(%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pmulhw	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   pmulhw	(%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pmullw	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   pmullw	(%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pmuludq	%mm0, %mm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   pmuludq	(%rax), %mm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pmuludq	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   pmuludq	(%rax), %xmm2
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     por	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   por	(%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     psadbw	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   psadbw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pshufd	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pshufd	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pshufhw	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pshufhw	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pshuflw	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pshuflw	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pslld	$1, %xmm2
-# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     pslld	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00   0.50    -     0.50   0.50   0.50   pslld	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pslldq	$1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     psllq	$1, %xmm2
-# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     psllq	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00   0.50    -     0.50   0.50   0.50   psllq	(%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     psllw	$1, %xmm2
-# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     psllw	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00   0.50    -     0.50   0.50   0.50   psllw	(%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     psrad	$1, %xmm2
-# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     psrad	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00   0.50    -     0.50   0.50   0.50   psrad	(%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     psraw	$1, %xmm2
-# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     psraw	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00   0.50    -     0.50   0.50   0.50   psraw	(%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     psrld	$1, %xmm2
-# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     psrld	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00   0.50    -     0.50   0.50   0.50   psrld	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     psrldq	$1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     psrlq	$1, %xmm2
-# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     psrlq	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00   0.50    -     0.50   0.50   0.50   psrlq	(%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     psrlw	$1, %xmm2
-# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     psrlw	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00   0.50    -     0.50   0.50   0.50   psrlw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     psubb	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   psubb	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     psubd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   psubd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     psubq	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   psubq	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     psubq	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   psubq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     psubsb	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   psubsb	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     psubsw	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   psubsw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     psubusb	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   psubusb	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     psubusw	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   psubusw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     psubw	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   psubw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     punpckhbw	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   punpckhbw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     punpckhdq	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   punpckhdq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     punpckhqdq	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   punpckhqdq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     punpckhwd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   punpckhwd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     punpcklbw	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   punpcklbw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     punpckldq	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   punpckldq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     punpcklqdq	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   punpcklqdq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     punpcklwd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   punpcklwd	(%rax), %xmm2
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     pxor	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   pxor	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     shufpd	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   shufpd	$1, (%rax), %xmm2
-# CHECK-NEXT:  -     21.00  1.00    -      -      -      -      -     sqrtpd	%xmm0, %xmm2
-# CHECK-NEXT:  -     21.00  1.00    -      -      -     0.50   0.50   sqrtpd	(%rax), %xmm2
-# CHECK-NEXT:  -     21.00  1.00    -      -      -      -      -     sqrtsd	%xmm0, %xmm2
-# CHECK-NEXT:  -     21.00  1.00    -      -      -     0.50   0.50   sqrtsd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     subpd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   subpd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     subsd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   subsd	(%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     ucomisd	%xmm0, %xmm1
-# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   ucomisd	(%rax), %xmm1
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     unpckhpd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   unpckhpd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     unpcklpd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   unpcklpd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     xorpd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   xorpd	(%rax), %xmm2
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     addpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     addpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     addsd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     addsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     andnpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     andnpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     andpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     andpd	(%rax), %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     clflush	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     cmppd	$0, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     cmppd	$0, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     cmpsd	$0, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     cmpsd	$0, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     comisd	%xmm0, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     comisd	(%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtdq2pd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtdq2pd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtdq2ps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtdq2ps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtpd2dq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtpd2dq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtpd2pi	%xmm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtpd2pi	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtpd2ps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtpd2ps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtpi2pd	%mm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtpi2pd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtps2dq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtps2dq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtps2pd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtps2pd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvtsd2si	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvtsd2si	%xmm0, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvtsd2si	(%rax), %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvtsd2si	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtsd2ss	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtsd2ss	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtsi2sdl	%ecx, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtsi2sdq	%rcx, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtsi2sdl	(%rax), %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtsi2sdl	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtss2sd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtss2sd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvttpd2dq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvttpd2dq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvttpd2pi	%xmm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvttpd2pi	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvttps2dq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvttps2dq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvttsd2si	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvttsd2si	%xmm0, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvttsd2si	(%rax), %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvttsd2si	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     divpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     divpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     divsd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     divsd	(%rax), %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     lfence
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     maskmovdqu	%xmm0, %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     maxpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     maxpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     maxsd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     maxsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     minpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     minpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     minsd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     minsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movapd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movapd	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movapd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movd	%eax, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     movd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     movd	%xmm0, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movd	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     movdqa	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movdqa	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     movdqa	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     movdqu	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movdqu	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     movdqu	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     movdq2q	%xmm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movhpd	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movhpd	(%rax), %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movlpd	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movlpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     movmskpd	%xmm0, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     movntil	%eax, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     movntiq	%rax, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movntdq	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movntpd	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     movq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movq	%rax, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     movq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     movq	%xmm0, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movq	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     movq2dq	%mm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movsd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movsd	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movupd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movupd	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movupd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     mulpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     mulpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     mulsd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     mulsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     orpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     orpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     packssdw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     packssdw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     packsswb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     packsswb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     packuswb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     packuswb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddq	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddq	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddsb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddsb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddsw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddsw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddusb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddusb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddusw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddusw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pand	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pand	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pandn	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pandn	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pavgb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pavgb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pavgw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pavgw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpgtb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpgtb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpgtd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpgtd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpgtw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpgtw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pextrw	$1, %xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmaddwd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmaddwd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxsw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxsw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxub	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxub	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminsw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminsw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminub	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminub	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pmovmskb	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmulhuw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmulhuw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmulhw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmulhw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmullw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmullw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmuludq	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmuludq	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmuludq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmuludq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     por	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     por	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psadbw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psadbw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pshufd	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pshufd	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pshufhw	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pshufhw	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pshuflw	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pshuflw	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pslld	$1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pslld	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pslld	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pslldq	$1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psllq	$1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psllq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psllq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psllw	$1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psllw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psllw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrad	$1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrad	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrad	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psraw	$1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psraw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psraw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrld	$1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrld	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrld	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrldq	$1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrlq	$1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrlq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrlq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrlw	$1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrlw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrlw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubq	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubq	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubsb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubsb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubsw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubsw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubusb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubusb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubusw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubusw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckhbw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckhbw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckhdq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckhdq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckhqdq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckhqdq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckhwd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckhwd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpcklbw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpcklbw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckldq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckldq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpcklqdq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpcklqdq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpcklwd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpcklwd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pxor	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pxor	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     shufpd	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     shufpd	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     13.50  13.50   -      -      -      -      -     1.00    -      -      -      -     sqrtpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     13.50  13.50   -      -      -      -      -     1.00    -      -      -      -     sqrtpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     13.50  13.50   -      -      -      -      -     1.00    -      -      -      -     sqrtsd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     13.50  13.50   -      -      -      -      -     1.00    -      -      -      -     sqrtsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     subpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     subpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     subsd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     subsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     ucomisd	%xmm0, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     ucomisd	(%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     unpckhpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     unpckhpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     unpcklpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     unpcklpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     xorpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     xorpd	(%rax), %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-sse3.s b/test/tools/llvm-mca/X86/BdVer2/resources-sse3.s
index 8438e1a7a84..ce08757f7de 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-sse3.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-sse3.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
 
 addsubpd  %xmm0, %xmm2
 addsubpd  (%rax),  %xmm2
@@ -39,58 +39,70 @@ movsldup  (%rax), %xmm2
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      3     1.00                        addsubpd	%xmm0, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   addsubpd	(%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        addsubps	%xmm0, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   addsubps	(%rax), %xmm2
-# CHECK-NEXT:  3      5     2.00                        haddpd	%xmm0, %xmm2
-# CHECK-NEXT:  4      11    2.00    *                   haddpd	(%rax), %xmm2
-# CHECK-NEXT:  3      5     2.00                        haddps	%xmm0, %xmm2
-# CHECK-NEXT:  4      11    2.00    *                   haddps	(%rax), %xmm2
-# CHECK-NEXT:  3      5     2.00                        hsubpd	%xmm0, %xmm2
-# CHECK-NEXT:  4      11    2.00    *                   hsubpd	(%rax), %xmm2
-# CHECK-NEXT:  3      5     2.00                        hsubps	%xmm0, %xmm2
-# CHECK-NEXT:  4      11    2.00    *                   hsubps	(%rax), %xmm2
-# CHECK-NEXT:  1      6     0.50    *                   lddqu	(%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        movddup	%xmm0, %xmm2
-# CHECK-NEXT:  1      6     0.50    *                   movddup	(%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        movshdup	%xmm0, %xmm2
-# CHECK-NEXT:  1      6     0.50    *                   movshdup	(%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        movsldup	%xmm0, %xmm2
-# CHECK-NEXT:  1      6     0.50    *                   movsldup	(%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        addsubpd	%xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   addsubpd	(%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        addsubps	%xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   addsubps	(%rax), %xmm2
+# CHECK-NEXT:  3      11    1.00                        haddpd	%xmm0, %xmm2
+# CHECK-NEXT:  4      16    1.00    *                   haddpd	(%rax), %xmm2
+# CHECK-NEXT:  3      11    1.00                        haddps	%xmm0, %xmm2
+# CHECK-NEXT:  4      16    1.00    *                   haddps	(%rax), %xmm2
+# CHECK-NEXT:  3      11    1.00                        hsubpd	%xmm0, %xmm2
+# CHECK-NEXT:  4      16    1.00    *                   hsubpd	(%rax), %xmm2
+# CHECK-NEXT:  3      11    1.00                        hsubps	%xmm0, %xmm2
+# CHECK-NEXT:  4      16    1.00    *                   hsubps	(%rax), %xmm2
+# CHECK-NEXT:  1      5     0.50    *                   lddqu	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        movddup	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   movddup	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        movshdup	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   movshdup	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        movsldup	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   movsldup	(%rax), %xmm2
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -      -     12.00   -     19.00  5.00   5.00
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 5.00   5.00    -      -      -      -      -      -     9.00   9.00   0.50   0.50    -      -     15.50  3.50    -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     addsubpd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   addsubpd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     addsubps	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   addsubps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     2.00    -      -     haddpd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     2.00   0.50   0.50   haddpd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     2.00    -      -     haddps	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     2.00   0.50   0.50   haddps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     2.00    -      -     hsubpd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     2.00   0.50   0.50   hsubpd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     2.00    -      -     hsubps	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     2.00   0.50   0.50   hsubps	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   lddqu	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     movddup	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movddup	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     movshdup	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movshdup	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     movsldup	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movsldup	(%rax), %xmm2
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     addsubpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     addsubpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     addsubps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     addsubps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     haddpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     haddpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     haddps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     haddps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     hsubpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     hsubpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     hsubps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     hsubps	(%rax), %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     lddqu	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movddup	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movddup	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movshdup	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movshdup	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movsldup	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movsldup	(%rax), %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-sse41.s b/test/tools/llvm-mca/X86/BdVer2/resources-sse41.s
index 08c6ccfde8f..d1b36d10b2f 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-sse41.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-sse41.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
 
 blendpd     $11, %xmm0, %xmm2
 blendpd     $11, (%rax), %xmm2
@@ -155,212 +155,224 @@ roundss     $1, (%rax), %xmm2
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      1     0.50                        blendpd	$11, %xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   blendpd	$11, (%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        blendps	$11, %xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   blendps	$11, (%rax), %xmm2
-# CHECK-NEXT:  2      2     1.00                        blendvpd	%xmm0, %xmm0, %xmm2
-# CHECK-NEXT:  3      8     1.00    *                   blendvpd	%xmm0, (%rax), %xmm2
-# CHECK-NEXT:  2      2     1.00                        blendvps	%xmm0, %xmm0, %xmm2
-# CHECK-NEXT:  3      8     1.00    *                   blendvps	%xmm0, (%rax), %xmm2
-# CHECK-NEXT:  3      9     1.00                        dppd	$22, %xmm0, %xmm2
-# CHECK-NEXT:  4      15    1.00    *                   dppd	$22, (%rax), %xmm2
-# CHECK-NEXT:  4      12    2.00                        dpps	$22, %xmm0, %xmm2
-# CHECK-NEXT:  5      18    2.00    *                   dpps	$22, (%rax), %xmm2
-# CHECK-NEXT:  2      3     1.00                        extractps	$1, %xmm0, %ecx
-# CHECK-NEXT:  3      5     1.00           *            extractps	$1, %xmm0, (%rax)
-# CHECK-NEXT:  1      1     1.00                        insertps	$1, %xmm0, %xmm2
-# CHECK-NEXT:  2      7     1.00    *                   insertps	$1, (%rax), %xmm2
-# CHECK-NEXT:  1      6     0.50    *                   movntdqa	(%rax), %xmm2
-# CHECK-NEXT:  3      7     1.00                        mpsadbw	$1, %xmm0, %xmm2
-# CHECK-NEXT:  4      13    1.00    *                   mpsadbw	$1, (%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        packusdw	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   packusdw	(%rax), %xmm2
-# CHECK-NEXT:  2      2     1.00                        pblendvb	%xmm0, %xmm0, %xmm2
-# CHECK-NEXT:  3      8     1.00    *                   pblendvb	%xmm0, (%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        pblendw	$11, %xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pblendw	$11, (%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        pcmpeqq	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pcmpeqq	(%rax), %xmm2
-# CHECK-NEXT:  2      3     1.00                        pextrb	$1, %xmm0, %ecx
-# CHECK-NEXT:  3      5     1.00           *            pextrb	$1, %xmm0, (%rax)
-# CHECK-NEXT:  2      3     1.00                        pextrd	$1, %xmm0, %ecx
-# CHECK-NEXT:  4      5     1.00           *            pextrd	$1, %xmm0, (%rax)
-# CHECK-NEXT:  2      3     1.00                        pextrq	$1, %xmm0, %rcx
-# CHECK-NEXT:  4      5     1.00           *            pextrq	$1, %xmm0, (%rax)
-# CHECK-NEXT:  3      5     1.00           *            pextrw	$1, %xmm0, (%rax)
-# CHECK-NEXT:  1      5     1.00                        phminposuw	%xmm0, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   phminposuw	(%rax), %xmm2
-# CHECK-NEXT:  2      2     1.00                        pinsrb	$1, %eax, %xmm1
-# CHECK-NEXT:  2      7     0.50    *                   pinsrb	$1, (%rax), %xmm1
-# CHECK-NEXT:  2      2     1.00                        pinsrd	$1, %eax, %xmm1
-# CHECK-NEXT:  2      7     0.50    *                   pinsrd	$1, (%rax), %xmm1
-# CHECK-NEXT:  2      2     1.00                        pinsrq	$1, %rax, %xmm1
-# CHECK-NEXT:  2      7     0.50    *                   pinsrq	$1, (%rax), %xmm1
-# CHECK-NEXT:  1      1     0.50                        pmaxsb	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pmaxsb	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        pmaxsd	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pmaxsd	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        pmaxud	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pmaxud	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        pmaxuw	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pmaxuw	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        pminsb	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pminsb	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        pminsd	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pminsd	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        pminud	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pminud	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        pminuw	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pminuw	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        pmovsxbd	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pmovsxbd	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        pmovsxbq	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pmovsxbq	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        pmovsxbw	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pmovsxbw	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        pmovsxdq	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pmovsxdq	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        pmovsxwd	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pmovsxwd	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        pmovsxwq	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pmovsxwq	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        pmovzxbd	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pmovzxbd	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        pmovzxbq	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pmovzxbq	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        pmovzxbw	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pmovzxbw	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        pmovzxdq	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pmovzxdq	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        pmovzxwd	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pmovzxwd	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        pmovzxwq	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pmovzxwq	(%rax), %xmm2
-# CHECK-NEXT:  1      5     1.00                        pmuldq	%xmm0, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   pmuldq	(%rax), %xmm2
-# CHECK-NEXT:  1      5     1.00                        pmulld	%xmm0, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   pmulld	(%rax), %xmm2
-# CHECK-NEXT:  2      2     1.00                        ptest	%xmm0, %xmm1
-# CHECK-NEXT:  3      8     1.00    *                   ptest	(%rax), %xmm1
-# CHECK-NEXT:  1      3     1.00                        roundpd	$1, %xmm0, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   roundpd	$1, (%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        roundps	$1, %xmm0, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   roundps	$1, (%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        roundsd	$1, %xmm0, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   roundsd	$1, (%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        roundss	$1, %xmm0, %xmm2
-# CHECK-NEXT:  2      9     1.00    *                   roundss	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        blendpd	$11, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   blendpd	$11, (%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        blendps	$11, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   blendps	$11, (%rax), %xmm2
+# CHECK-NEXT:  1      2     2.00                        blendvpd	%xmm0, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     2.00    *                   blendvpd	%xmm0, (%rax), %xmm2
+# CHECK-NEXT:  1      2     2.00                        blendvps	%xmm0, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     2.00    *                   blendvps	%xmm0, (%rax), %xmm2
+# CHECK-NEXT:  15     15    1.50                        dppd	$22, %xmm0, %xmm2
+# CHECK-NEXT:  17     20    1.50    *                   dppd	$22, (%rax), %xmm2
+# CHECK-NEXT:  16     25    1.50                        dpps	$22, %xmm0, %xmm2
+# CHECK-NEXT:  18     30    1.50    *                   dpps	$22, (%rax), %xmm2
+# CHECK-NEXT:  2      13    1.00                        extractps	$1, %xmm0, %ecx
+# CHECK-NEXT:  2      13    1.00           *            extractps	$1, %xmm0, (%rax)
+# CHECK-NEXT:  1      2     0.50                        insertps	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   insertps	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      5     0.50    *                   movntdqa	(%rax), %xmm2
+# CHECK-NEXT:  9      9     2.00                        mpsadbw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  9      14    2.00    *                   mpsadbw	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        packusdw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   packusdw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     2.00                        pblendvb	%xmm0, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     2.00    *                   pblendvb	%xmm0, (%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pblendw	$11, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pblendw	$11, (%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pcmpeqq	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pcmpeqq	(%rax), %xmm2
+# CHECK-NEXT:  2      13    1.00                        pextrb	$1, %xmm0, %ecx
+# CHECK-NEXT:  2      13    1.00           *            pextrb	$1, %xmm0, (%rax)
+# CHECK-NEXT:  2      13    1.00                        pextrd	$1, %xmm0, %ecx
+# CHECK-NEXT:  2      13    1.00           *            pextrd	$1, %xmm0, (%rax)
+# CHECK-NEXT:  2      13    1.00                        pextrq	$1, %xmm0, %rcx
+# CHECK-NEXT:  2      13    1.00           *            pextrq	$1, %xmm0, (%rax)
+# CHECK-NEXT:  2      13    1.00           *            pextrw	$1, %xmm0, (%rax)
+# CHECK-NEXT:  2      4     1.00                        phminposuw	%xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   phminposuw	(%rax), %xmm2
+# CHECK-NEXT:  2      2     0.50                        pinsrb	$1, %eax, %xmm1
+# CHECK-NEXT:  2      6     0.50    *                   pinsrb	$1, (%rax), %xmm1
+# CHECK-NEXT:  2      2     0.50                        pinsrd	$1, %eax, %xmm1
+# CHECK-NEXT:  2      6     0.50    *                   pinsrd	$1, (%rax), %xmm1
+# CHECK-NEXT:  2      2     0.50                        pinsrq	$1, %rax, %xmm1
+# CHECK-NEXT:  2      6     0.50    *                   pinsrq	$1, (%rax), %xmm1
+# CHECK-NEXT:  1      2     0.50                        pmaxsb	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pmaxsb	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pmaxsd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pmaxsd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pmaxud	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pmaxud	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pmaxuw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pmaxuw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pminsb	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pminsb	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pminsd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pminsd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pminud	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pminud	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pminuw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pminuw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pmovsxbd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pmovsxbd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pmovsxbq	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pmovsxbq	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pmovsxbw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pmovsxbw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pmovsxdq	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pmovsxdq	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pmovsxwd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pmovsxwd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pmovsxwq	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pmovsxwq	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pmovzxbd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pmovzxbd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pmovzxbq	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pmovzxbq	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pmovzxbw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pmovzxbw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pmovzxdq	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pmovzxdq	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pmovzxwd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pmovzxwd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pmovzxwq	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pmovzxwq	(%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        pmuldq	%xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   pmuldq	(%rax), %xmm2
+# CHECK-NEXT:  1      5     2.00                        pmulld	%xmm0, %xmm2
+# CHECK-NEXT:  1      10    2.00    *                   pmulld	(%rax), %xmm2
+# CHECK-NEXT:  2      1     1.00                        ptest	%xmm0, %xmm1
+# CHECK-NEXT:  2      6     1.00    *                   ptest	(%rax), %xmm1
+# CHECK-NEXT:  1      4     1.00                        roundpd	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   roundpd	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        roundps	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   roundps	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        roundsd	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   roundsd	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        roundss	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   roundss	$1, (%rax), %xmm2
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -     26.00  47.50  5.00   52.50  24.50  24.50
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 24.50  24.50   -      -      -     6.00    -      -     20.00  20.00  32.50  32.50  10.00  13.00  49.50  50.50   -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     blendpd	$11, %xmm0, %xmm2
-# CHECK-NEXT:  -      -     0.50    -      -     0.50   0.50   0.50   blendpd	$11, (%rax), %xmm2
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     blendps	$11, %xmm0, %xmm2
-# CHECK-NEXT:  -      -     0.50    -      -     0.50   0.50   0.50   blendps	$11, (%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     blendvpd	%xmm0, %xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -     1.00   0.50   0.50   blendvpd	%xmm0, (%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     blendvps	%xmm0, %xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -     1.00   0.50   0.50   blendvps	%xmm0, (%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -     dppd	$22, %xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00   1.00    -     1.00   0.50   0.50   dppd	$22, (%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00   2.00    -     1.00    -      -     dpps	$22, %xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00   2.00    -     1.00   0.50   0.50   dpps	$22, (%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     extractps	$1, %xmm0, %ecx
-# CHECK-NEXT:  -      -      -      -     1.00   1.00   0.50   0.50   extractps	$1, %xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     insertps	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   insertps	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movntdqa	(%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -     mpsadbw	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00   1.00    -     1.00   0.50   0.50   mpsadbw	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     packusdw	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   packusdw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -     pblendvb	%xmm0, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -     1.00   0.50   0.50   pblendvb	%xmm0, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pblendw	$11, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pblendw	$11, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pcmpeqq	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pcmpeqq	(%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     pextrb	$1, %xmm0, %ecx
-# CHECK-NEXT:  -      -      -     0.50   1.00   0.50   0.50   0.50   pextrb	$1, %xmm0, (%rax)
-# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     pextrd	$1, %xmm0, %ecx
-# CHECK-NEXT:  -      -     1.00   0.50   1.00   0.50   0.50   0.50   pextrd	$1, %xmm0, (%rax)
-# CHECK-NEXT:  -      -     1.00   0.50    -     0.50    -      -     pextrq	$1, %xmm0, %rcx
-# CHECK-NEXT:  -      -     1.00   0.50   1.00   0.50   0.50   0.50   pextrq	$1, %xmm0, (%rax)
-# CHECK-NEXT:  -      -      -     0.50   1.00   0.50   0.50   0.50   pextrw	$1, %xmm0, (%rax)
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     phminposuw	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   phminposuw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     1.50    -      -     pinsrb	$1, %eax, %xmm1
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pinsrb	$1, (%rax), %xmm1
-# CHECK-NEXT:  -      -      -     0.50    -     1.50    -      -     pinsrd	$1, %eax, %xmm1
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pinsrd	$1, (%rax), %xmm1
-# CHECK-NEXT:  -      -      -     0.50    -     1.50    -      -     pinsrq	$1, %rax, %xmm1
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pinsrq	$1, (%rax), %xmm1
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pmaxsb	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pmaxsb	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pmaxsd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pmaxsd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pmaxud	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pmaxud	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pmaxuw	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pmaxuw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pminsb	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pminsb	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pminsd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pminsd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pminud	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pminud	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pminuw	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pminuw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pmovsxbd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pmovsxbd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pmovsxbq	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pmovsxbq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pmovsxbw	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pmovsxbw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pmovsxdq	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pmovsxdq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pmovsxwd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pmovsxwd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pmovsxwq	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pmovsxwq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pmovzxbd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pmovzxbd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pmovzxbq	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pmovzxbq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pmovzxbw	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pmovzxbw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pmovzxdq	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pmovzxdq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pmovzxwd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pmovzxwd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pmovzxwq	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pmovzxwq	(%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pmuldq	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   pmuldq	(%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pmulld	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   pmulld	(%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     ptest	%xmm0, %xmm1
-# CHECK-NEXT:  -      -     1.00    -      -     1.00   0.50   0.50   ptest	(%rax), %xmm1
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     roundpd	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   roundpd	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     roundps	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   roundps	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     roundsd	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   roundsd	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     roundss	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   roundss	$1, (%rax), %xmm2
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     blendpd	$11, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     blendpd	$11, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     blendps	$11, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     blendps	$11, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     blendvpd	%xmm0, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     blendvpd	%xmm0, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     blendvps	%xmm0, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     blendvps	%xmm0, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.50   1.50    -      -      -      -      -     1.00    -      -      -      -     dppd	$22, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     1.50   1.50    -      -      -      -      -     1.00    -      -      -      -     dppd	$22, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.50   1.50    -      -      -      -      -     1.00    -      -      -      -     dpps	$22, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     1.50   1.50    -      -      -      -      -     1.00    -      -      -      -     dpps	$22, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     extractps	$1, %xmm0, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     extractps	$1, %xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     insertps	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     insertps	$1, (%rax), %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     movntdqa	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     2.00    -     1.00    -      -      -      -      -     mpsadbw	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     2.00    -     1.00    -      -      -      -      -     mpsadbw	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     packusdw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     packusdw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     pblendvb	%xmm0, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     pblendvb	%xmm0, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pblendw	$11, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pblendw	$11, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pextrb	$1, %xmm0, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     pextrb	$1, %xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pextrd	$1, %xmm0, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     pextrd	$1, %xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pextrq	$1, %xmm0, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     pextrq	$1, %xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     pextrw	$1, %xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     1.00    -      -      -      -      -     phminposuw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     1.00    -      -      -      -      -     phminposuw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pinsrb	$1, %eax, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pinsrb	$1, (%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pinsrd	$1, %eax, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pinsrd	$1, (%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pinsrq	$1, %rax, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pinsrq	$1, (%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxsb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxsb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxsd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxud	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxud	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxuw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxuw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminsb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminsb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminsd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminud	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminud	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminuw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminuw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovsxbd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovsxbd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovsxbq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovsxbq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovsxbw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovsxbw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovsxdq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovsxdq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovsxwd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovsxwd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovsxwq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovsxwq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovzxbd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovzxbd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovzxbq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovzxbq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovzxbw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovzxbw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovzxdq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovzxdq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovzxwd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovzxwd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovzxwq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovzxwq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmuldq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmuldq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     pmulld	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     pmulld	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     ptest	%xmm0, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     ptest	(%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     roundpd	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     roundpd	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     roundps	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     roundps	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     roundsd	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     roundsd	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     roundss	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     roundss	$1, (%rax), %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-sse42.s b/test/tools/llvm-mca/X86/BdVer2/resources-sse42.s
index 935c5e3d7f3..2d3a0ef4049 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-sse42.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-sse42.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
 
 crc32b      %al, %ecx
 crc32b      (%rax), %ecx
@@ -40,60 +40,72 @@ pcmpgtq     (%rax), %xmm2
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      3     1.00                        crc32b	%al, %ecx
-# CHECK-NEXT:  2      8     1.00    *                   crc32b	(%rax), %ecx
-# CHECK-NEXT:  1      3     1.00                        crc32l	%eax, %ecx
-# CHECK-NEXT:  2      8     1.00    *                   crc32l	(%rax), %ecx
-# CHECK-NEXT:  1      3     1.00                        crc32w	%ax, %ecx
-# CHECK-NEXT:  2      8     1.00    *                   crc32w	(%rax), %ecx
-# CHECK-NEXT:  1      3     1.00                        crc32b	%al, %rcx
-# CHECK-NEXT:  2      8     1.00    *                   crc32b	(%rax), %rcx
-# CHECK-NEXT:  1      3     1.00                        crc32q	%rax, %rcx
-# CHECK-NEXT:  2      8     1.00    *                   crc32q	(%rax), %rcx
-# CHECK-NEXT:  1      4     2.67                        pcmpestri	$1, %xmm0, %xmm2
-# CHECK-NEXT:  1      4     2.33    *                   pcmpestri	$1, (%rax), %xmm2
-# CHECK-NEXT:  1      11    2.67                        pcmpestrm	$1, %xmm0, %xmm2
-# CHECK-NEXT:  1      11    2.33    *                   pcmpestrm	$1, (%rax), %xmm2
-# CHECK-NEXT:  3      11    3.00                        pcmpistri	$1, %xmm0, %xmm2
-# CHECK-NEXT:  4      17    3.00    *                   pcmpistri	$1, (%rax), %xmm2
-# CHECK-NEXT:  3      11    3.00                        pcmpistrm	$1, %xmm0, %xmm2
-# CHECK-NEXT:  4      17    3.00    *                   pcmpistrm	$1, (%rax), %xmm2
-# CHECK-NEXT:  1      5     1.00                        pcmpgtq	%xmm0, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   pcmpgtq	(%rax), %xmm2
+# CHECK-NEXT:  3      3     2.00                        crc32b	%al, %ecx
+# CHECK-NEXT:  3      7     2.00    *                   crc32b	(%rax), %ecx
+# CHECK-NEXT:  7      6     2.00                        crc32l	%eax, %ecx
+# CHECK-NEXT:  3      7     2.00    *                   crc32l	(%rax), %ecx
+# CHECK-NEXT:  5      5     2.00                        crc32w	%ax, %ecx
+# CHECK-NEXT:  3      7     2.00    *                   crc32w	(%rax), %ecx
+# CHECK-NEXT:  3      3     2.00                        crc32b	%al, %rcx
+# CHECK-NEXT:  3      7     2.00    *                   crc32b	(%rax), %rcx
+# CHECK-NEXT:  11     10    2.00                        crc32q	%rax, %rcx
+# CHECK-NEXT:  3      7     2.00    *                   crc32q	(%rax), %rcx
+# CHECK-NEXT:  27     15    4.00                        pcmpestri	$1, %xmm0, %xmm2
+# CHECK-NEXT:  28     20    4.50    *                   pcmpestri	$1, (%rax), %xmm2
+# CHECK-NEXT:  27     10    4.00                        pcmpestrm	$1, %xmm0, %xmm2
+# CHECK-NEXT:  28     15    4.50    *                   pcmpestrm	$1, (%rax), %xmm2
+# CHECK-NEXT:  7      14    1.00                        pcmpistri	$1, %xmm0, %xmm2
+# CHECK-NEXT:  8      19    1.00    *                   pcmpistri	$1, (%rax), %xmm2
+# CHECK-NEXT:  7      6     1.00                        pcmpistrm	$1, %xmm0, %xmm2
+# CHECK-NEXT:  9      11    1.00    *                   pcmpistrm	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pcmpgtq	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pcmpgtq	(%rax), %xmm2
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -     24.00  20.00   -     10.00  5.00   5.00
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 21.00  21.00   -      -      -     28.00  20.00   -     6.00   6.00   9.00   9.00    -      -     1.00   9.00    -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     crc32b	%al, %ecx
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   crc32b	(%rax), %ecx
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     crc32l	%eax, %ecx
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   crc32l	(%rax), %ecx
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     crc32w	%ax, %ecx
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   crc32w	(%rax), %ecx
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     crc32b	%al, %rcx
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   crc32b	(%rax), %rcx
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     crc32q	%rax, %rcx
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   crc32q	(%rax), %rcx
-# CHECK-NEXT:  -      -     2.67   2.67    -     2.67    -      -     pcmpestri	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -     2.33   2.33    -     2.33   0.50   0.50   pcmpestri	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -     2.67   2.67    -     2.67    -      -     pcmpestrm	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -     2.33   2.33    -     2.33   0.50   0.50   pcmpestrm	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -     3.00    -      -      -      -      -     pcmpistri	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -     3.00    -      -      -     0.50   0.50   pcmpistri	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -     3.00    -      -      -      -      -     pcmpistrm	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -     3.00    -      -      -     0.50   0.50   pcmpistrm	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pcmpgtq	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   pcmpgtq	(%rax), %xmm2
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     crc32b	%al, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     crc32b	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     crc32l	%eax, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     crc32l	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     crc32w	%ax, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     crc32w	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     crc32b	%al, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     crc32b	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     crc32q	%rax, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     crc32q	(%rax), %rcx
+# CHECK-NEXT: 4.00   4.00    -      -      -     1.00    -      -     0.50   0.50   2.00   2.00    -      -      -     1.00    -      -      -      -     pcmpestri	$1, %xmm0, %xmm2
+# CHECK-NEXT: 4.50   4.50    -      -      -     1.00    -      -     0.50   0.50   2.00   2.00    -      -      -     1.00    -      -      -      -     pcmpestri	$1, (%rax), %xmm2
+# CHECK-NEXT: 4.00   4.00    -      -      -     1.00    -      -     0.50   0.50   2.00   2.00    -      -      -     1.00    -      -      -      -     pcmpestrm	$1, %xmm0, %xmm2
+# CHECK-NEXT: 4.50   4.50    -      -      -     1.00    -      -     0.50   0.50   2.00   2.00    -      -      -     1.00    -      -      -      -     pcmpestrm	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     1.00   1.00    -      -      -      -      -     1.00    -      -      -      -     pcmpistri	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     1.00   1.00    -      -      -      -      -     1.00    -      -      -      -     pcmpistri	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     1.00   1.00    -      -      -      -      -     1.00    -      -      -      -     pcmpistrm	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     1.00   1.00    -      -      -      -      -     1.00    -      -      -      -     pcmpistrm	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpgtq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpgtq	(%rax), %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-sse4a.s b/test/tools/llvm-mca/X86/BdVer2/resources-sse4a.s
index f4b9c94d48a..55347137df4 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-sse4a.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-sse4a.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
 
 extrq       %xmm0, %xmm2
 extrq       $22, $2, %xmm2
@@ -19,32 +19,44 @@ movntss     %xmm0, (%rax)
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      1     0.50                        extrq	%xmm0, %xmm2
-# CHECK-NEXT:  1      1     0.50                        extrq	$22, $2, %xmm2
-# CHECK-NEXT:  1      1     0.50                        insertq	%xmm0, %xmm2
-# CHECK-NEXT:  1      1     0.50                        insertq	$22, $22, %xmm0, %xmm2
-# CHECK-NEXT:  1      1     1.00           *            movntsd	%xmm0, (%rax)
-# CHECK-NEXT:  1      1     1.00           *            movntss	%xmm0, (%rax)
+# CHECK-NEXT:  1      3     0.50                        extrq	%xmm0, %xmm2
+# CHECK-NEXT:  1      3     0.50                        extrq	$22, $2, %xmm2
+# CHECK-NEXT:  1      3     2.00                        insertq	%xmm0, %xmm2
+# CHECK-NEXT:  1      3     2.00                        insertq	$22, $22, %xmm0, %xmm2
+# CHECK-NEXT:  1      3     1.00           *            movntsd	%xmm0, (%rax)
+# CHECK-NEXT:  1      3     1.00           *            movntss	%xmm0, (%rax)
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -      -     2.00   2.00   2.00   1.00   1.00
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -     5.00   5.00    -     2.00   2.00   4.00    -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     extrq	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     extrq	$22, $2, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     insertq	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     insertq	$22, $22, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movntsd	%xmm0, (%rax)
-# CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   movntss	%xmm0, (%rax)
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     extrq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     extrq	$22, $2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     insertq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     insertq	$22, $22, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movntsd	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movntss	%xmm0, (%rax)
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-ssse3.s b/test/tools/llvm-mca/X86/BdVer2/resources-ssse3.s
index c341022a288..c89ef297629 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-ssse3.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-ssse3.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
 
 pabsb       %mm0, %mm2
 pabsb       (%rax), %mm2
@@ -106,148 +106,160 @@ psignw      (%rax), %xmm2
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      1     0.50                        pabsb	%mm0, %mm2
-# CHECK-NEXT:  2      6     0.50    *                   pabsb	(%rax), %mm2
-# CHECK-NEXT:  1      1     0.50                        pabsb	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pabsb	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        pabsd	%mm0, %mm2
-# CHECK-NEXT:  2      6     0.50    *                   pabsd	(%rax), %mm2
-# CHECK-NEXT:  1      1     0.50                        pabsd	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pabsd	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        pabsw	%mm0, %mm2
-# CHECK-NEXT:  2      6     0.50    *                   pabsw	(%rax), %mm2
-# CHECK-NEXT:  1      1     0.50                        pabsw	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pabsw	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        palignr	$1, %mm0, %mm2
-# CHECK-NEXT:  2      6     0.50    *                   palignr	$1, (%rax), %mm2
-# CHECK-NEXT:  1      1     0.50                        palignr	$1, %xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   palignr	$1, (%rax), %xmm2
-# CHECK-NEXT:  3      3     1.50                        phaddd	%mm0, %mm2
-# CHECK-NEXT:  4      8     1.50    *                   phaddd	(%rax), %mm2
-# CHECK-NEXT:  3      3     1.50                        phaddd	%xmm0, %xmm2
-# CHECK-NEXT:  4      9     1.50    *                   phaddd	(%rax), %xmm2
-# CHECK-NEXT:  3      3     1.50                        phaddsw	%mm0, %mm2
-# CHECK-NEXT:  4      8     1.50    *                   phaddsw	(%rax), %mm2
-# CHECK-NEXT:  3      3     1.50                        phaddsw	%xmm0, %xmm2
-# CHECK-NEXT:  4      9     1.50    *                   phaddsw	(%rax), %xmm2
-# CHECK-NEXT:  3      3     1.50                        phaddw	%mm0, %mm2
-# CHECK-NEXT:  4      8     1.50    *                   phaddw	(%rax), %mm2
-# CHECK-NEXT:  3      3     1.50                        phaddw	%xmm0, %xmm2
-# CHECK-NEXT:  4      9     1.50    *                   phaddw	(%rax), %xmm2
-# CHECK-NEXT:  3      3     1.50                        phsubd	%mm0, %mm2
-# CHECK-NEXT:  4      8     1.50    *                   phsubd	(%rax), %mm2
-# CHECK-NEXT:  3      3     1.50                        phsubd	%xmm0, %xmm2
-# CHECK-NEXT:  4      9     1.50    *                   phsubd	(%rax), %xmm2
-# CHECK-NEXT:  3      3     1.50                        phsubsw	%mm0, %mm2
-# CHECK-NEXT:  4      8     1.50    *                   phsubsw	(%rax), %mm2
-# CHECK-NEXT:  3      3     1.50                        phsubsw	%xmm0, %xmm2
-# CHECK-NEXT:  4      9     1.50    *                   phsubsw	(%rax), %xmm2
-# CHECK-NEXT:  3      3     1.50                        phsubw	%mm0, %mm2
-# CHECK-NEXT:  4      8     1.50    *                   phsubw	(%rax), %mm2
-# CHECK-NEXT:  3      3     1.50                        phsubw	%xmm0, %xmm2
-# CHECK-NEXT:  4      9     1.50    *                   phsubw	(%rax), %xmm2
-# CHECK-NEXT:  1      5     1.00                        pmaddubsw	%mm0, %mm2
-# CHECK-NEXT:  2      10    1.00    *                   pmaddubsw	(%rax), %mm2
-# CHECK-NEXT:  1      5     1.00                        pmaddubsw	%xmm0, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   pmaddubsw	(%rax), %xmm2
-# CHECK-NEXT:  1      5     1.00                        pmulhrsw	%mm0, %mm2
-# CHECK-NEXT:  2      10    1.00    *                   pmulhrsw	(%rax), %mm2
-# CHECK-NEXT:  1      5     1.00                        pmulhrsw	%xmm0, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   pmulhrsw	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        pshufb	%mm0, %mm2
-# CHECK-NEXT:  2      6     0.50    *                   pshufb	(%rax), %mm2
-# CHECK-NEXT:  1      1     0.50                        pshufb	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   pshufb	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        psignb	%mm0, %mm2
-# CHECK-NEXT:  2      6     0.50    *                   psignb	(%rax), %mm2
-# CHECK-NEXT:  1      1     0.50                        psignb	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   psignb	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        psignd	%mm0, %mm2
-# CHECK-NEXT:  2      6     0.50    *                   psignd	(%rax), %mm2
-# CHECK-NEXT:  1      1     0.50                        psignd	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   psignd	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        psignw	%mm0, %mm2
-# CHECK-NEXT:  2      6     0.50    *                   psignw	(%rax), %mm2
-# CHECK-NEXT:  1      1     0.50                        psignw	%xmm0, %xmm2
-# CHECK-NEXT:  2      7     0.50    *                   psignw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pabsb	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   pabsb	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        pabsb	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pabsb	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pabsd	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   pabsd	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        pabsd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pabsd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pabsw	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   pabsw	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        pabsw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pabsw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        palignr	$1, %mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   palignr	$1, (%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        palignr	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   palignr	$1, (%rax), %xmm2
+# CHECK-NEXT:  3      5     0.50                        phaddd	%mm0, %mm2
+# CHECK-NEXT:  4      10    0.50    *                   phaddd	(%rax), %mm2
+# CHECK-NEXT:  3      5     0.50                        phaddd	%xmm0, %xmm2
+# CHECK-NEXT:  4      10    0.50    *                   phaddd	(%rax), %xmm2
+# CHECK-NEXT:  3      5     0.50                        phaddsw	%mm0, %mm2
+# CHECK-NEXT:  4      10    0.50    *                   phaddsw	(%rax), %mm2
+# CHECK-NEXT:  3      5     0.50                        phaddsw	%xmm0, %xmm2
+# CHECK-NEXT:  4      10    0.50    *                   phaddsw	(%rax), %xmm2
+# CHECK-NEXT:  3      5     0.50                        phaddw	%mm0, %mm2
+# CHECK-NEXT:  4      10    0.50    *                   phaddw	(%rax), %mm2
+# CHECK-NEXT:  3      5     0.50                        phaddw	%xmm0, %xmm2
+# CHECK-NEXT:  4      10    0.50    *                   phaddw	(%rax), %xmm2
+# CHECK-NEXT:  3      5     0.50                        phsubd	%mm0, %mm2
+# CHECK-NEXT:  4      10    0.50    *                   phsubd	(%rax), %mm2
+# CHECK-NEXT:  3      5     0.50                        phsubd	%xmm0, %xmm2
+# CHECK-NEXT:  4      10    0.50    *                   phsubd	(%rax), %xmm2
+# CHECK-NEXT:  3      5     0.50                        phsubsw	%mm0, %mm2
+# CHECK-NEXT:  4      10    0.50    *                   phsubsw	(%rax), %mm2
+# CHECK-NEXT:  3      5     0.50                        phsubsw	%xmm0, %xmm2
+# CHECK-NEXT:  4      10    0.50    *                   phsubsw	(%rax), %xmm2
+# CHECK-NEXT:  3      5     0.50                        phsubw	%mm0, %mm2
+# CHECK-NEXT:  4      10    0.50    *                   phsubw	(%rax), %mm2
+# CHECK-NEXT:  3      5     0.50                        phsubw	%xmm0, %xmm2
+# CHECK-NEXT:  4      10    0.50    *                   phsubw	(%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        pmaddubsw	%mm0, %mm2
+# CHECK-NEXT:  1      9     1.00    *                   pmaddubsw	(%rax), %mm2
+# CHECK-NEXT:  1      4     1.00                        pmaddubsw	%xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   pmaddubsw	(%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        pmulhrsw	%mm0, %mm2
+# CHECK-NEXT:  1      9     1.00    *                   pmulhrsw	(%rax), %mm2
+# CHECK-NEXT:  1      4     1.00                        pmulhrsw	%xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   pmulhrsw	(%rax), %xmm2
+# CHECK-NEXT:  1      3     2.00                        pshufb	%mm0, %mm2
+# CHECK-NEXT:  1      8     2.00    *                   pshufb	(%rax), %mm2
+# CHECK-NEXT:  1      3     2.00                        pshufb	%xmm0, %xmm2
+# CHECK-NEXT:  1      8     2.00    *                   pshufb	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        psignb	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   psignb	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        psignb	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   psignb	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        psignd	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   psignd	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        psignd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   psignd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        psignw	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   psignw	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        psignw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   psignw	(%rax), %xmm2
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -     8.00   52.00   -     52.00  16.00  16.00
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 16.00  16.00   -      -      -      -      -      -      -      -     34.00  34.00  8.00    -     36.00  28.00   -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pabsb	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pabsb	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pabsb	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pabsb	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pabsd	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pabsd	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pabsd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pabsd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pabsw	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pabsw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pabsw	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pabsw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     palignr	$1, %mm0, %mm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   palignr	$1, (%rax), %mm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     palignr	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   palignr	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     phaddd	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   phaddd	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     phaddd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   phaddd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     phaddsw	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   phaddsw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     phaddsw	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   phaddsw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     phaddw	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   phaddw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     phaddw	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   phaddw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     phsubd	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   phsubd	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     phsubd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   phsubd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     phsubsw	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   phsubsw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     phsubsw	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   phsubsw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     phsubw	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   phsubw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     phsubw	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   phsubw	(%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pmaddubsw	%mm0, %mm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   pmaddubsw	(%rax), %mm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pmaddubsw	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   pmaddubsw	(%rax), %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pmulhrsw	%mm0, %mm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   pmulhrsw	(%rax), %mm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pmulhrsw	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   pmulhrsw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pshufb	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pshufb	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     pshufb	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   pshufb	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     psignb	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   psignb	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     psignb	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   psignb	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     psignd	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   psignd	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     psignd	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   psignd	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     psignw	%mm0, %mm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   psignw	(%rax), %mm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     psignw	%xmm0, %xmm2
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   psignw	(%rax), %xmm2
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pabsb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pabsb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pabsb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pabsb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pabsd	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pabsd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pabsd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pabsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pabsw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pabsw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pabsw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pabsw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     palignr	$1, %mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     palignr	$1, (%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     palignr	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     palignr	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phaddd	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phaddd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phaddd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phaddd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phaddsw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phaddsw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phaddsw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phaddsw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phaddw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phaddw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phaddw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phaddw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phsubd	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phsubd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phsubd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phsubd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phsubsw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phsubsw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phsubsw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phsubsw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phsubw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phsubw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phsubw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phsubw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmaddubsw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmaddubsw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmaddubsw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmaddubsw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmulhrsw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmulhrsw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmulhrsw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmulhrsw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     pshufb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     pshufb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     pshufb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     pshufb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psignb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psignb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psignb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psignb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psignd	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psignd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psignd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psignd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psignw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psignw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psignw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psignw	(%rax), %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-tbm.s b/test/tools/llvm-mca/X86/BdVer2/resources-tbm.s
index ebe9975654c..0287d973171 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-tbm.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-tbm.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
 
 bextr        $8192, %ebx, %ecx
 bextr        $8192, (%rbx), %ecx
@@ -70,100 +70,112 @@ tzmsk        (%rax), %rcx
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  2      2     1.00                        bextrl	$8192, %ebx, %ecx
-# CHECK-NEXT:  3      7     1.00    *                   bextrl	$8192, (%rbx), %ecx
-# CHECK-NEXT:  2      2     1.00                        bextrq	$16384, %rbx, %rcx
-# CHECK-NEXT:  3      7     1.00    *                   bextrq	$16384, (%rbx), %rcx
-# CHECK-NEXT:  1      1     0.33                        blcfilll	%eax, %ecx
+# CHECK-NEXT:  2      2     0.50                        bextrl	$8192, %ebx, %ecx
+# CHECK-NEXT:  2      6     0.50    *                   bextrl	$8192, (%rbx), %ecx
+# CHECK-NEXT:  2      2     0.50                        bextrq	$16384, %rbx, %rcx
+# CHECK-NEXT:  2      6     0.50    *                   bextrq	$16384, (%rbx), %rcx
+# CHECK-NEXT:  2      2     0.50                        blcfilll	%eax, %ecx
 # CHECK-NEXT:  2      6     0.50    *                   blcfilll	(%rax), %ecx
-# CHECK-NEXT:  1      1     0.33                        blcfillq	%rax, %rcx
+# CHECK-NEXT:  2      2     0.50                        blcfillq	%rax, %rcx
 # CHECK-NEXT:  2      6     0.50    *                   blcfillq	(%rax), %rcx
-# CHECK-NEXT:  1      1     0.33                        blcil	%eax, %ecx
+# CHECK-NEXT:  2      2     0.50                        blcil	%eax, %ecx
 # CHECK-NEXT:  2      6     0.50    *                   blcil	(%rax), %ecx
-# CHECK-NEXT:  1      1     0.33                        blciq	%rax, %rcx
+# CHECK-NEXT:  2      2     0.50                        blciq	%rax, %rcx
 # CHECK-NEXT:  2      6     0.50    *                   blciq	(%rax), %rcx
-# CHECK-NEXT:  1      1     0.33                        blcicl	%eax, %ecx
+# CHECK-NEXT:  2      2     0.50                        blcicl	%eax, %ecx
 # CHECK-NEXT:  2      6     0.50    *                   blcicl	(%rax), %ecx
-# CHECK-NEXT:  1      1     0.33                        blcicq	%rax, %rcx
+# CHECK-NEXT:  2      2     0.50                        blcicq	%rax, %rcx
 # CHECK-NEXT:  2      6     0.50    *                   blcicq	(%rax), %rcx
-# CHECK-NEXT:  1      1     0.33                        blcmskl	%eax, %ecx
+# CHECK-NEXT:  2      2     0.50                        blcmskl	%eax, %ecx
 # CHECK-NEXT:  2      6     0.50    *                   blcmskl	(%rax), %ecx
-# CHECK-NEXT:  1      1     0.33                        blcmskq	%rax, %rcx
+# CHECK-NEXT:  2      2     0.50                        blcmskq	%rax, %rcx
 # CHECK-NEXT:  2      6     0.50    *                   blcmskq	(%rax), %rcx
-# CHECK-NEXT:  1      1     0.33                        blcsl	%eax, %ecx
+# CHECK-NEXT:  2      2     0.50                        blcsl	%eax, %ecx
 # CHECK-NEXT:  2      6     0.50    *                   blcsl	(%rax), %ecx
-# CHECK-NEXT:  1      1     0.33                        blcsq	%rax, %rcx
+# CHECK-NEXT:  2      2     0.50                        blcsq	%rax, %rcx
 # CHECK-NEXT:  2      6     0.50    *                   blcsq	(%rax), %rcx
-# CHECK-NEXT:  1      1     0.33                        blsfilll	%eax, %ecx
+# CHECK-NEXT:  2      2     0.50                        blsfilll	%eax, %ecx
 # CHECK-NEXT:  2      6     0.50    *                   blsfilll	(%rax), %ecx
-# CHECK-NEXT:  1      1     0.33                        blsfillq	%rax, %rcx
+# CHECK-NEXT:  2      2     0.50                        blsfillq	%rax, %rcx
 # CHECK-NEXT:  2      6     0.50    *                   blsfillq	(%rax), %rcx
-# CHECK-NEXT:  1      1     0.33                        blsicl	%eax, %ecx
+# CHECK-NEXT:  2      2     0.50                        blsicl	%eax, %ecx
 # CHECK-NEXT:  2      6     0.50    *                   blsicl	(%rax), %ecx
-# CHECK-NEXT:  1      1     0.33                        blsicq	%rax, %rcx
+# CHECK-NEXT:  2      2     0.50                        blsicq	%rax, %rcx
 # CHECK-NEXT:  2      6     0.50    *                   blsicq	(%rax), %rcx
-# CHECK-NEXT:  1      1     0.33                        t1mskcl	%eax, %ecx
+# CHECK-NEXT:  2      2     0.50                        t1mskcl	%eax, %ecx
 # CHECK-NEXT:  2      6     0.50    *                   t1mskcl	(%rax), %ecx
-# CHECK-NEXT:  1      1     0.33                        t1mskcq	%rax, %rcx
+# CHECK-NEXT:  2      2     0.50                        t1mskcq	%rax, %rcx
 # CHECK-NEXT:  2      6     0.50    *                   t1mskcq	(%rax), %rcx
-# CHECK-NEXT:  1      1     0.33                        tzmskl	%eax, %ecx
+# CHECK-NEXT:  2      2     0.50                        tzmskl	%eax, %ecx
 # CHECK-NEXT:  2      6     0.50    *                   tzmskl	(%rax), %ecx
-# CHECK-NEXT:  1      1     0.33                        tzmskq	%rax, %rcx
+# CHECK-NEXT:  2      2     0.50                        tzmskq	%rax, %rcx
 # CHECK-NEXT:  2      6     0.50    *                   tzmskq	(%rax), %rcx
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -     14.00  16.00   -     14.00  10.00  10.00
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 1.00   1.00    -      -      -     20.00  20.00   -      -      -      -      -      -      -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -     0.50   1.00    -     0.50    -      -     bextrl	$8192, %ebx, %ecx
-# CHECK-NEXT:  -      -     0.50   1.00    -     0.50   0.50   0.50   bextrl	$8192, (%rbx), %ecx
-# CHECK-NEXT:  -      -     0.50   1.00    -     0.50    -      -     bextrq	$16384, %rbx, %rcx
-# CHECK-NEXT:  -      -     0.50   1.00    -     0.50   0.50   0.50   bextrq	$16384, (%rbx), %rcx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     blcfilll	%eax, %ecx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   blcfilll	(%rax), %ecx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     blcfillq	%rax, %rcx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   blcfillq	(%rax), %rcx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     blcil	%eax, %ecx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   blcil	(%rax), %ecx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     blciq	%rax, %rcx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   blciq	(%rax), %rcx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     blcicl	%eax, %ecx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   blcicl	(%rax), %ecx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     blcicq	%rax, %rcx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   blcicq	(%rax), %rcx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     blcmskl	%eax, %ecx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   blcmskl	(%rax), %ecx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     blcmskq	%rax, %rcx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   blcmskq	(%rax), %rcx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     blcsl	%eax, %ecx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   blcsl	(%rax), %ecx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     blcsq	%rax, %rcx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   blcsq	(%rax), %rcx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     blsfilll	%eax, %ecx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   blsfilll	(%rax), %ecx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     blsfillq	%rax, %rcx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   blsfillq	(%rax), %rcx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     blsicl	%eax, %ecx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   blsicl	(%rax), %ecx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     blsicq	%rax, %rcx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   blsicq	(%rax), %rcx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     t1mskcl	%eax, %ecx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   t1mskcl	(%rax), %ecx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     t1mskcq	%rax, %rcx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   t1mskcq	(%rax), %rcx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     tzmskl	%eax, %ecx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   tzmskl	(%rax), %ecx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     tzmskq	%rax, %rcx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   tzmskq	(%rax), %rcx
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     bextrl	$8192, %ebx, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     bextrl	$8192, (%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     bextrq	$16384, %rbx, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     bextrq	$16384, (%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcfilll	%eax, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcfilll	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcfillq	%rax, %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcfillq	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcil	%eax, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcil	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blciq	%rax, %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blciq	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcicl	%eax, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcicl	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcicq	%rax, %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcicq	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcmskl	%eax, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcmskl	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcmskq	%rax, %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcmskq	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcsl	%eax, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcsl	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcsq	%rax, %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcsq	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsfilll	%eax, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsfilll	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsfillq	%rax, %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsfillq	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsicl	%eax, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsicl	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsicq	%rax, %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsicq	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     t1mskcl	%eax, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     t1mskcl	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     t1mskcq	%rax, %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     t1mskcq	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     tzmskl	%eax, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     tzmskl	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     tzmskq	%rax, %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     tzmskq	(%rax), %rcx
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-x86_32.s b/test/tools/llvm-mca/X86/BdVer2/resources-x86_32.s
index b4672620cf4..5a6ee53713c 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-x86_32.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-x86_32.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=i686-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=i686-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
 
 aaa
 
@@ -33,46 +33,58 @@ salc
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      100   0.33                        aaa
-# CHECK-NEXT:  1      100   0.33                        aad
-# CHECK-NEXT:  1      100   0.33                        aad	$7
-# CHECK-NEXT:  1      100   0.33                        aam
-# CHECK-NEXT:  1      100   0.33                        aam	$7
-# CHECK-NEXT:  1      100   0.33                        aas
-# CHECK-NEXT:  1      100   0.33                  U     bound	%bx, (%eax)
-# CHECK-NEXT:  1      100   0.33                  U     bound	%ebx, (%eax)
-# CHECK-NEXT:  1      100   0.33                        daa
-# CHECK-NEXT:  1      100   0.33                        das
-# CHECK-NEXT:  1      100   0.33                  U     into
-# CHECK-NEXT:  3      7     0.67    *                   leave
-# CHECK-NEXT:  1      1     0.33                  U     salc
+# CHECK-NEXT:  1      100   0.50                        aaa
+# CHECK-NEXT:  1      100   0.50                        aad
+# CHECK-NEXT:  1      100   0.50                        aad	$7
+# CHECK-NEXT:  1      100   0.50                        aam
+# CHECK-NEXT:  1      100   0.50                        aam	$7
+# CHECK-NEXT:  1      100   0.50                        aas
+# CHECK-NEXT:  1      100   0.50                  U     bound	%bx, (%eax)
+# CHECK-NEXT:  1      100   0.50                  U     bound	%ebx, (%eax)
+# CHECK-NEXT:  1      100   0.50                        daa
+# CHECK-NEXT:  1      100   0.50                        das
+# CHECK-NEXT:  1      100   0.50                  U     into
+# CHECK-NEXT:  1      1     0.50    *                   leave
+# CHECK-NEXT:  1      1     0.50                  U     salc
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -     4.67   4.67    -     4.67   0.50   0.50
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -     6.50   6.50    -      -      -      -      -      -      -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     aaa
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     aad
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     aad	$7
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     aam
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     aam	$7
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     aas
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     bound	%bx, (%eax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     bound	%ebx, (%eax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     daa
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     das
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     into
-# CHECK-NEXT:  -      -     0.67   0.67    -     0.67   0.50   0.50   leave
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     salc
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     aaa
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     aad
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     aad	$7
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     aam
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     aam	$7
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     aas
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     bound	%bx, (%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     bound	%ebx, (%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     daa
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     das
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     into
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leave
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     salc
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-x86_64.s b/test/tools/llvm-mca/X86/BdVer2/resources-x86_64.s
index 2ab041c3de6..b72522411b8 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-x86_64.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-x86_64.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
 
 adcb $7, %al
 adcb $7, %dil
@@ -861,1512 +861,1524 @@ xorq (%rax), %rdi
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  2      2     0.67                        adcb	$7, %al
-# CHECK-NEXT:  2      2     0.67                        adcb	$7, %dil
-# CHECK-NEXT:  6      9     1.00    *      *            adcb	$7, (%rax)
-# CHECK-NEXT:  2      2     0.67                        adcb	%sil, %dil
-# CHECK-NEXT:  6      9     1.00    *      *            adcb	%sil, (%rax)
-# CHECK-NEXT:  3      7     0.67    *                   adcb	(%rax), %dil
-# CHECK-NEXT:  2      2     0.67                        adcw	$511, %ax
-# CHECK-NEXT:  2      2     0.67                        adcw	$511, %di
-# CHECK-NEXT:  6      9     1.00    *      *            adcw	$511, (%rax)
-# CHECK-NEXT:  2      2     0.67                        adcw	$7, %di
-# CHECK-NEXT:  6      9     1.00    *      *            adcw	$7, (%rax)
-# CHECK-NEXT:  2      2     0.67                        adcw	%si, %di
-# CHECK-NEXT:  6      9     1.00    *      *            adcw	%si, (%rax)
-# CHECK-NEXT:  3      7     0.67    *                   adcw	(%rax), %di
-# CHECK-NEXT:  2      2     0.67                        adcl	$665536, %eax
-# CHECK-NEXT:  2      2     0.67                        adcl	$665536, %edi
-# CHECK-NEXT:  6      9     1.00    *      *            adcl	$665536, (%rax)
-# CHECK-NEXT:  2      2     0.67                        adcl	$7, %edi
-# CHECK-NEXT:  6      9     1.00    *      *            adcl	$7, (%rax)
-# CHECK-NEXT:  2      2     0.67                        adcl	%esi, %edi
-# CHECK-NEXT:  6      9     1.00    *      *            adcl	%esi, (%rax)
-# CHECK-NEXT:  3      7     0.67    *                   adcl	(%rax), %edi
-# CHECK-NEXT:  2      2     0.67                        adcq	$665536, %rax
-# CHECK-NEXT:  2      2     0.67                        adcq	$665536, %rdi
-# CHECK-NEXT:  6      9     1.00    *      *            adcq	$665536, (%rax)
-# CHECK-NEXT:  2      2     0.67                        adcq	$7, %rdi
-# CHECK-NEXT:  6      9     1.00    *      *            adcq	$7, (%rax)
-# CHECK-NEXT:  2      2     0.67                        adcq	%rsi, %rdi
-# CHECK-NEXT:  6      9     1.00    *      *            adcq	%rsi, (%rax)
-# CHECK-NEXT:  3      7     0.67    *                   adcq	(%rax), %rdi
-# CHECK-NEXT:  1      1     0.33                        addb	$7, %al
-# CHECK-NEXT:  1      1     0.33                        addb	$7, %dil
-# CHECK-NEXT:  3      7     1.00    *      *            addb	$7, (%rax)
-# CHECK-NEXT:  1      1     0.33                        addb	%sil, %dil
-# CHECK-NEXT:  3      7     1.00    *      *            addb	%sil, (%rax)
-# CHECK-NEXT:  2      6     0.50    *                   addb	(%rax), %dil
-# CHECK-NEXT:  1      1     0.33                        addw	$511, %ax
-# CHECK-NEXT:  1      1     0.33                        addw	$511, %di
-# CHECK-NEXT:  3      7     1.00    *      *            addw	$511, (%rax)
-# CHECK-NEXT:  1      1     0.33                        addw	$7, %di
-# CHECK-NEXT:  3      7     1.00    *      *            addw	$7, (%rax)
-# CHECK-NEXT:  1      1     0.33                        addw	%si, %di
-# CHECK-NEXT:  3      7     1.00    *      *            addw	%si, (%rax)
-# CHECK-NEXT:  2      6     0.50    *                   addw	(%rax), %di
-# CHECK-NEXT:  1      1     0.33                        addl	$665536, %eax
-# CHECK-NEXT:  1      1     0.33                        addl	$665536, %edi
-# CHECK-NEXT:  3      7     1.00    *      *            addl	$665536, (%rax)
-# CHECK-NEXT:  1      1     0.33                        addl	$7, %edi
-# CHECK-NEXT:  3      7     1.00    *      *            addl	$7, (%rax)
-# CHECK-NEXT:  1      1     0.33                        addl	%esi, %edi
-# CHECK-NEXT:  3      7     1.00    *      *            addl	%esi, (%rax)
-# CHECK-NEXT:  2      6     0.50    *                   addl	(%rax), %edi
-# CHECK-NEXT:  1      1     0.33                        addq	$665536, %rax
-# CHECK-NEXT:  1      1     0.33                        addq	$665536, %rdi
-# CHECK-NEXT:  3      7     1.00    *      *            addq	$665536, (%rax)
-# CHECK-NEXT:  1      1     0.33                        addq	$7, %rdi
-# CHECK-NEXT:  3      7     1.00    *      *            addq	$7, (%rax)
-# CHECK-NEXT:  1      1     0.33                        addq	%rsi, %rdi
-# CHECK-NEXT:  3      7     1.00    *      *            addq	%rsi, (%rax)
-# CHECK-NEXT:  2      6     0.50    *                   addq	(%rax), %rdi
-# CHECK-NEXT:  1      1     0.33                        andb	$7, %al
-# CHECK-NEXT:  1      1     0.33                        andb	$7, %dil
-# CHECK-NEXT:  3      7     1.00    *      *            andb	$7, (%rax)
-# CHECK-NEXT:  1      1     0.33                        andb	%sil, %dil
-# CHECK-NEXT:  3      7     1.00    *      *            andb	%sil, (%rax)
-# CHECK-NEXT:  2      6     0.50    *                   andb	(%rax), %dil
-# CHECK-NEXT:  1      1     0.33                        andw	$511, %ax
-# CHECK-NEXT:  1      1     0.33                        andw	$511, %di
-# CHECK-NEXT:  3      7     1.00    *      *            andw	$511, (%rax)
-# CHECK-NEXT:  1      1     0.33                        andw	$7, %di
-# CHECK-NEXT:  3      7     1.00    *      *            andw	$7, (%rax)
-# CHECK-NEXT:  1      1     0.33                        andw	%si, %di
-# CHECK-NEXT:  3      7     1.00    *      *            andw	%si, (%rax)
-# CHECK-NEXT:  2      6     0.50    *                   andw	(%rax), %di
-# CHECK-NEXT:  1      1     0.33                        andl	$665536, %eax
-# CHECK-NEXT:  1      1     0.33                        andl	$665536, %edi
-# CHECK-NEXT:  3      7     1.00    *      *            andl	$665536, (%rax)
-# CHECK-NEXT:  1      1     0.33                        andl	$7, %edi
-# CHECK-NEXT:  3      7     1.00    *      *            andl	$7, (%rax)
-# CHECK-NEXT:  1      1     0.33                        andl	%esi, %edi
-# CHECK-NEXT:  3      7     1.00    *      *            andl	%esi, (%rax)
-# CHECK-NEXT:  2      6     0.50    *                   andl	(%rax), %edi
-# CHECK-NEXT:  1      1     0.33                        andq	$665536, %rax
-# CHECK-NEXT:  1      1     0.33                        andq	$665536, %rdi
-# CHECK-NEXT:  3      7     1.00    *      *            andq	$665536, (%rax)
-# CHECK-NEXT:  1      1     0.33                        andq	$7, %rdi
-# CHECK-NEXT:  3      7     1.00    *      *            andq	$7, (%rax)
-# CHECK-NEXT:  1      1     0.33                        andq	%rsi, %rdi
-# CHECK-NEXT:  3      7     1.00    *      *            andq	%rsi, (%rax)
-# CHECK-NEXT:  2      6     0.50    *                   andq	(%rax), %rdi
-# CHECK-NEXT:  1      3     1.00                        bsfw	%si, %di
-# CHECK-NEXT:  1      3     1.00                        bsrw	%si, %di
-# CHECK-NEXT:  2      8     1.00    *                   bsfw	(%rax), %di
-# CHECK-NEXT:  2      8     1.00    *                   bsrw	(%rax), %di
-# CHECK-NEXT:  1      3     1.00                        bsfl	%esi, %edi
-# CHECK-NEXT:  1      3     1.00                        bsrl	%esi, %edi
-# CHECK-NEXT:  2      8     1.00    *                   bsfl	(%rax), %edi
-# CHECK-NEXT:  2      8     1.00    *                   bsrl	(%rax), %edi
-# CHECK-NEXT:  1      3     1.00                        bsfq	%rsi, %rdi
-# CHECK-NEXT:  1      3     1.00                        bsrq	%rsi, %rdi
-# CHECK-NEXT:  2      8     1.00    *                   bsfq	(%rax), %rdi
-# CHECK-NEXT:  2      8     1.00    *                   bsrq	(%rax), %rdi
+# CHECK-NEXT:  1      1     1.00                        adcb	$7, %al
+# CHECK-NEXT:  1      1     1.00                        adcb	$7, %dil
+# CHECK-NEXT:  2      6     1.00    *      *            adcb	$7, (%rax)
+# CHECK-NEXT:  1      1     1.00                        adcb	%sil, %dil
+# CHECK-NEXT:  2      6     1.00    *      *            adcb	%sil, (%rax)
+# CHECK-NEXT:  1      5     1.00    *                   adcb	(%rax), %dil
+# CHECK-NEXT:  1      1     1.00                        adcw	$511, %ax
+# CHECK-NEXT:  1      1     1.00                        adcw	$511, %di
+# CHECK-NEXT:  2      6     1.00    *      *            adcw	$511, (%rax)
+# CHECK-NEXT:  1      1     1.00                        adcw	$7, %di
+# CHECK-NEXT:  2      6     1.00    *      *            adcw	$7, (%rax)
+# CHECK-NEXT:  1      1     1.00                        adcw	%si, %di
+# CHECK-NEXT:  2      6     1.00    *      *            adcw	%si, (%rax)
+# CHECK-NEXT:  1      5     1.00    *                   adcw	(%rax), %di
+# CHECK-NEXT:  1      1     1.00                        adcl	$665536, %eax
+# CHECK-NEXT:  1      1     1.00                        adcl	$665536, %edi
+# CHECK-NEXT:  2      6     1.00    *      *            adcl	$665536, (%rax)
+# CHECK-NEXT:  1      1     1.00                        adcl	$7, %edi
+# CHECK-NEXT:  2      6     1.00    *      *            adcl	$7, (%rax)
+# CHECK-NEXT:  1      1     1.00                        adcl	%esi, %edi
+# CHECK-NEXT:  2      6     1.00    *      *            adcl	%esi, (%rax)
+# CHECK-NEXT:  1      5     1.00    *                   adcl	(%rax), %edi
+# CHECK-NEXT:  1      1     1.00                        adcq	$665536, %rax
+# CHECK-NEXT:  1      1     1.00                        adcq	$665536, %rdi
+# CHECK-NEXT:  2      6     1.00    *      *            adcq	$665536, (%rax)
+# CHECK-NEXT:  1      1     1.00                        adcq	$7, %rdi
+# CHECK-NEXT:  2      6     1.00    *      *            adcq	$7, (%rax)
+# CHECK-NEXT:  1      1     1.00                        adcq	%rsi, %rdi
+# CHECK-NEXT:  2      6     1.00    *      *            adcq	%rsi, (%rax)
+# CHECK-NEXT:  1      5     1.00    *                   adcq	(%rax), %rdi
+# CHECK-NEXT:  1      1     0.50                        addb	$7, %al
+# CHECK-NEXT:  1      1     0.50                        addb	$7, %dil
+# CHECK-NEXT:  2      6     1.00    *      *            addb	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        addb	%sil, %dil
+# CHECK-NEXT:  2      6     1.00    *      *            addb	%sil, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   addb	(%rax), %dil
+# CHECK-NEXT:  1      1     0.50                        addw	$511, %ax
+# CHECK-NEXT:  1      1     0.50                        addw	$511, %di
+# CHECK-NEXT:  2      6     1.00    *      *            addw	$511, (%rax)
+# CHECK-NEXT:  1      1     0.50                        addw	$7, %di
+# CHECK-NEXT:  2      6     1.00    *      *            addw	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        addw	%si, %di
+# CHECK-NEXT:  2      6     1.00    *      *            addw	%si, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   addw	(%rax), %di
+# CHECK-NEXT:  1      1     0.50                        addl	$665536, %eax
+# CHECK-NEXT:  1      1     0.50                        addl	$665536, %edi
+# CHECK-NEXT:  2      6     1.00    *      *            addl	$665536, (%rax)
+# CHECK-NEXT:  1      1     0.50                        addl	$7, %edi
+# CHECK-NEXT:  2      6     1.00    *      *            addl	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        addl	%esi, %edi
+# CHECK-NEXT:  2      6     1.00    *      *            addl	%esi, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   addl	(%rax), %edi
+# CHECK-NEXT:  1      1     0.50                        addq	$665536, %rax
+# CHECK-NEXT:  1      1     0.50                        addq	$665536, %rdi
+# CHECK-NEXT:  2      6     1.00    *      *            addq	$665536, (%rax)
+# CHECK-NEXT:  1      1     0.50                        addq	$7, %rdi
+# CHECK-NEXT:  2      6     1.00    *      *            addq	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        addq	%rsi, %rdi
+# CHECK-NEXT:  2      6     1.00    *      *            addq	%rsi, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   addq	(%rax), %rdi
+# CHECK-NEXT:  1      1     0.50                        andb	$7, %al
+# CHECK-NEXT:  1      1     0.50                        andb	$7, %dil
+# CHECK-NEXT:  2      6     1.00    *      *            andb	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        andb	%sil, %dil
+# CHECK-NEXT:  2      6     1.00    *      *            andb	%sil, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   andb	(%rax), %dil
+# CHECK-NEXT:  1      1     0.50                        andw	$511, %ax
+# CHECK-NEXT:  1      1     0.50                        andw	$511, %di
+# CHECK-NEXT:  2      6     1.00    *      *            andw	$511, (%rax)
+# CHECK-NEXT:  1      1     0.50                        andw	$7, %di
+# CHECK-NEXT:  2      6     1.00    *      *            andw	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        andw	%si, %di
+# CHECK-NEXT:  2      6     1.00    *      *            andw	%si, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   andw	(%rax), %di
+# CHECK-NEXT:  1      1     0.50                        andl	$665536, %eax
+# CHECK-NEXT:  1      1     0.50                        andl	$665536, %edi
+# CHECK-NEXT:  2      6     1.00    *      *            andl	$665536, (%rax)
+# CHECK-NEXT:  1      1     0.50                        andl	$7, %edi
+# CHECK-NEXT:  2      6     1.00    *      *            andl	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        andl	%esi, %edi
+# CHECK-NEXT:  2      6     1.00    *      *            andl	%esi, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   andl	(%rax), %edi
+# CHECK-NEXT:  1      1     0.50                        andq	$665536, %rax
+# CHECK-NEXT:  1      1     0.50                        andq	$665536, %rdi
+# CHECK-NEXT:  2      6     1.00    *      *            andq	$665536, (%rax)
+# CHECK-NEXT:  1      1     0.50                        andq	$7, %rdi
+# CHECK-NEXT:  2      6     1.00    *      *            andq	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        andq	%rsi, %rdi
+# CHECK-NEXT:  2      6     1.00    *      *            andq	%rsi, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   andq	(%rax), %rdi
+# CHECK-NEXT:  6      3     2.00                        bsfw	%si, %di
+# CHECK-NEXT:  7      4     2.00                        bsrw	%si, %di
+# CHECK-NEXT:  8      7     2.00    *                   bsfw	(%rax), %di
+# CHECK-NEXT:  9      8     2.00    *                   bsrw	(%rax), %di
+# CHECK-NEXT:  6      3     2.00                        bsfl	%esi, %edi
+# CHECK-NEXT:  7      4     2.00                        bsrl	%esi, %edi
+# CHECK-NEXT:  8      7     2.00    *                   bsfl	(%rax), %edi
+# CHECK-NEXT:  9      8     2.00    *                   bsrl	(%rax), %edi
+# CHECK-NEXT:  6      3     2.00                        bsfq	%rsi, %rdi
+# CHECK-NEXT:  7      4     2.00                        bsrq	%rsi, %rdi
+# CHECK-NEXT:  8      7     2.00    *                   bsfq	(%rax), %rdi
+# CHECK-NEXT:  9      8     2.00    *                   bsrq	(%rax), %rdi
 # CHECK-NEXT:  1      1     1.00                        bswapl	%eax
-# CHECK-NEXT:  2      2     1.00                        bswapq	%rax
+# CHECK-NEXT:  1      1     1.00                        bswapq	%rax
 # CHECK-NEXT:  1      1     0.50                        btw	%si, %di
-# CHECK-NEXT:  1      1     0.50                        btcw	%si, %di
-# CHECK-NEXT:  1      1     0.50                        btrw	%si, %di
-# CHECK-NEXT:  1      1     0.50                        btsw	%si, %di
-# CHECK-NEXT:  6      9     1.00    *                   btw	%si, (%rax)
-# CHECK-NEXT:  6      9     1.00    *      *            btcw	%si, (%rax)
-# CHECK-NEXT:  6      9     1.00    *      *            btrw	%si, (%rax)
-# CHECK-NEXT:  6      9     1.00    *      *            btsw	%si, (%rax)
+# CHECK-NEXT:  2      2     0.50                        btcw	%si, %di
+# CHECK-NEXT:  2      2     0.50                        btrw	%si, %di
+# CHECK-NEXT:  2      2     0.50                        btsw	%si, %di
+# CHECK-NEXT:  7      5     0.50    *                   btw	%si, (%rax)
+# CHECK-NEXT:  11     7     1.00    *      *            btcw	%si, (%rax)
+# CHECK-NEXT:  11     7     1.00    *      *            btrw	%si, (%rax)
+# CHECK-NEXT:  11     7     1.00    *      *            btsw	%si, (%rax)
 # CHECK-NEXT:  1      1     0.50                        btw	$7, %di
-# CHECK-NEXT:  1      1     0.50                        btcw	$7, %di
-# CHECK-NEXT:  1      1     0.50                        btrw	$7, %di
-# CHECK-NEXT:  1      1     0.50                        btsw	$7, %di
-# CHECK-NEXT:  2      6     0.50    *                   btw	$7, (%rax)
-# CHECK-NEXT:  4      7     1.00    *      *            btcw	$7, (%rax)
-# CHECK-NEXT:  4      7     1.00    *      *            btrw	$7, (%rax)
-# CHECK-NEXT:  4      7     1.00    *      *            btsw	$7, (%rax)
+# CHECK-NEXT:  2      2     0.50                        btcw	$7, %di
+# CHECK-NEXT:  2      2     0.50                        btrw	$7, %di
+# CHECK-NEXT:  2      2     0.50                        btsw	$7, %di
+# CHECK-NEXT:  1      5     0.50    *                   btw	$7, (%rax)
+# CHECK-NEXT:  5      7     1.00    *      *            btcw	$7, (%rax)
+# CHECK-NEXT:  5      7     1.00    *      *            btrw	$7, (%rax)
+# CHECK-NEXT:  5      7     1.00    *      *            btsw	$7, (%rax)
 # CHECK-NEXT:  1      1     0.50                        btl	%esi, %edi
-# CHECK-NEXT:  1      1     0.50                        btcl	%esi, %edi
-# CHECK-NEXT:  1      1     0.50                        btrl	%esi, %edi
-# CHECK-NEXT:  1      1     0.50                        btsl	%esi, %edi
-# CHECK-NEXT:  6      9     1.00    *                   btl	%esi, (%rax)
-# CHECK-NEXT:  6      9     1.00    *      *            btcl	%esi, (%rax)
-# CHECK-NEXT:  6      9     1.00    *      *            btrl	%esi, (%rax)
-# CHECK-NEXT:  6      9     1.00    *      *            btsl	%esi, (%rax)
+# CHECK-NEXT:  2      2     0.50                        btcl	%esi, %edi
+# CHECK-NEXT:  2      2     0.50                        btrl	%esi, %edi
+# CHECK-NEXT:  2      2     0.50                        btsl	%esi, %edi
+# CHECK-NEXT:  7      5     0.50    *                   btl	%esi, (%rax)
+# CHECK-NEXT:  11     7     1.00    *      *            btcl	%esi, (%rax)
+# CHECK-NEXT:  11     7     1.00    *      *            btrl	%esi, (%rax)
+# CHECK-NEXT:  11     7     1.00    *      *            btsl	%esi, (%rax)
 # CHECK-NEXT:  1      1     0.50                        btl	$7, %edi
-# CHECK-NEXT:  1      1     0.50                        btcl	$7, %edi
-# CHECK-NEXT:  1      1     0.50                        btrl	$7, %edi
-# CHECK-NEXT:  1      1     0.50                        btsl	$7, %edi
-# CHECK-NEXT:  2      6     0.50    *                   btl	$7, (%rax)
-# CHECK-NEXT:  4      7     1.00    *      *            btcl	$7, (%rax)
-# CHECK-NEXT:  4      7     1.00    *      *            btrl	$7, (%rax)
-# CHECK-NEXT:  4      7     1.00    *      *            btsl	$7, (%rax)
+# CHECK-NEXT:  2      2     0.50                        btcl	$7, %edi
+# CHECK-NEXT:  2      2     0.50                        btrl	$7, %edi
+# CHECK-NEXT:  2      2     0.50                        btsl	$7, %edi
+# CHECK-NEXT:  1      5     0.50    *                   btl	$7, (%rax)
+# CHECK-NEXT:  5      7     1.00    *      *            btcl	$7, (%rax)
+# CHECK-NEXT:  5      7     1.00    *      *            btrl	$7, (%rax)
+# CHECK-NEXT:  5      7     1.00    *      *            btsl	$7, (%rax)
 # CHECK-NEXT:  1      1     0.50                        btq	%rsi, %rdi
-# CHECK-NEXT:  1      1     0.50                        btcq	%rsi, %rdi
-# CHECK-NEXT:  1      1     0.50                        btrq	%rsi, %rdi
-# CHECK-NEXT:  1      1     0.50                        btsq	%rsi, %rdi
-# CHECK-NEXT:  6      9     1.00    *                   btq	%rsi, (%rax)
-# CHECK-NEXT:  6      9     1.00    *      *            btcq	%rsi, (%rax)
-# CHECK-NEXT:  6      9     1.00    *      *            btrq	%rsi, (%rax)
-# CHECK-NEXT:  6      9     1.00    *      *            btsq	%rsi, (%rax)
+# CHECK-NEXT:  2      2     0.50                        btcq	%rsi, %rdi
+# CHECK-NEXT:  2      2     0.50                        btrq	%rsi, %rdi
+# CHECK-NEXT:  2      2     0.50                        btsq	%rsi, %rdi
+# CHECK-NEXT:  7      5     0.50    *                   btq	%rsi, (%rax)
+# CHECK-NEXT:  11     7     1.00    *      *            btcq	%rsi, (%rax)
+# CHECK-NEXT:  11     7     1.00    *      *            btrq	%rsi, (%rax)
+# CHECK-NEXT:  11     7     1.00    *      *            btsq	%rsi, (%rax)
 # CHECK-NEXT:  1      1     0.50                        btq	$7, %rdi
-# CHECK-NEXT:  1      1     0.50                        btcq	$7, %rdi
-# CHECK-NEXT:  1      1     0.50                        btrq	$7, %rdi
-# CHECK-NEXT:  1      1     0.50                        btsq	$7, %rdi
-# CHECK-NEXT:  2      6     0.50    *                   btq	$7, (%rax)
-# CHECK-NEXT:  4      7     1.00    *      *            btcq	$7, (%rax)
-# CHECK-NEXT:  4      7     1.00    *      *            btrq	$7, (%rax)
-# CHECK-NEXT:  4      7     1.00    *      *            btsq	$7, (%rax)
-# CHECK-NEXT:  1      1     0.33                        cbtw
-# CHECK-NEXT:  1      1     0.33                        cwtl
-# CHECK-NEXT:  1      1     0.33                        cltq
-# CHECK-NEXT:  2      2     1.00                        cwtd
+# CHECK-NEXT:  2      2     0.50                        btcq	$7, %rdi
+# CHECK-NEXT:  2      2     0.50                        btrq	$7, %rdi
+# CHECK-NEXT:  2      2     0.50                        btsq	$7, %rdi
+# CHECK-NEXT:  1      5     0.50    *                   btq	$7, (%rax)
+# CHECK-NEXT:  5      7     1.00    *      *            btcq	$7, (%rax)
+# CHECK-NEXT:  5      7     1.00    *      *            btrq	$7, (%rax)
+# CHECK-NEXT:  5      7     1.00    *      *            btsq	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        cbtw
+# CHECK-NEXT:  1      1     0.50                        cwtl
+# CHECK-NEXT:  1      1     0.50                        cltq
+# CHECK-NEXT:  1      1     0.50                        cwtd
 # CHECK-NEXT:  1      1     0.50                        cltd
 # CHECK-NEXT:  1      1     0.50                        cqto
-# CHECK-NEXT:  1      1     0.25                  U     clc
-# CHECK-NEXT:  1      1     0.33                  U     cld
-# CHECK-NEXT:  1      1     0.33                  U     cmc
-# CHECK-NEXT:  1      1     0.33                        cmpb	$7, %al
-# CHECK-NEXT:  1      1     0.33                        cmpb	$7, %dil
-# CHECK-NEXT:  2      6     0.50    *                   cmpb	$7, (%rax)
-# CHECK-NEXT:  1      1     0.33                        cmpb	%sil, %dil
-# CHECK-NEXT:  2      6     0.50    *                   cmpb	%sil, (%rax)
-# CHECK-NEXT:  2      6     0.50    *                   cmpb	(%rax), %dil
-# CHECK-NEXT:  1      1     0.33                        cmpw	$511, %ax
-# CHECK-NEXT:  1      1     0.33                        cmpw	$511, %di
-# CHECK-NEXT:  2      6     0.50    *                   cmpw	$511, (%rax)
-# CHECK-NEXT:  1      1     0.33                        cmpw	$7, %di
-# CHECK-NEXT:  2      6     0.50    *                   cmpw	$7, (%rax)
-# CHECK-NEXT:  1      1     0.33                        cmpw	%si, %di
-# CHECK-NEXT:  2      6     0.50    *                   cmpw	%si, (%rax)
-# CHECK-NEXT:  2      6     0.50    *                   cmpw	(%rax), %di
-# CHECK-NEXT:  1      1     0.33                        cmpl	$665536, %eax
-# CHECK-NEXT:  1      1     0.33                        cmpl	$665536, %edi
-# CHECK-NEXT:  2      6     0.50    *                   cmpl	$665536, (%rax)
-# CHECK-NEXT:  1      1     0.33                        cmpl	$7, %edi
-# CHECK-NEXT:  2      6     0.50    *                   cmpl	$7, (%rax)
-# CHECK-NEXT:  1      1     0.33                        cmpl	%esi, %edi
-# CHECK-NEXT:  2      6     0.50    *                   cmpl	%esi, (%rax)
-# CHECK-NEXT:  2      6     0.50    *                   cmpl	(%rax), %edi
-# CHECK-NEXT:  1      1     0.33                        cmpq	$665536, %rax
-# CHECK-NEXT:  1      1     0.33                        cmpq	$665536, %rdi
-# CHECK-NEXT:  2      6     0.50    *                   cmpq	$665536, (%rax)
-# CHECK-NEXT:  1      1     0.33                        cmpq	$7, %rdi
-# CHECK-NEXT:  2      6     0.50    *                   cmpq	$7, (%rax)
-# CHECK-NEXT:  1      1     0.33                        cmpq	%rsi, %rdi
-# CHECK-NEXT:  2      6     0.50    *                   cmpq	%rsi, (%rax)
-# CHECK-NEXT:  2      6     0.50    *                   cmpq	(%rax), %rdi
-# CHECK-NEXT:  5      8     1.00                  U     cmpsb	%es:(%rdi), (%rsi)
-# CHECK-NEXT:  5      8     1.00                  U     cmpsw	%es:(%rdi), (%rsi)
-# CHECK-NEXT:  5      8     1.00                  U     cmpsl	%es:(%rdi), (%rsi)
-# CHECK-NEXT:  5      8     1.00                  U     cmpsq	%es:(%rdi), (%rsi)
-# CHECK-NEXT:  4      5     1.33                        cmpxchgb	%cl, %bl
-# CHECK-NEXT:  6      8     2.00    *      *            cmpxchgb	%cl, (%rbx)
-# CHECK-NEXT:  4      5     1.33                        cmpxchgw	%cx, %bx
-# CHECK-NEXT:  6      8     2.00    *      *            cmpxchgw	%cx, (%rbx)
-# CHECK-NEXT:  4      5     1.33                        cmpxchgl	%ecx, %ebx
-# CHECK-NEXT:  6      8     2.00    *      *            cmpxchgl	%ecx, (%rbx)
-# CHECK-NEXT:  4      5     1.33                        cmpxchgq	%rcx, %rbx
-# CHECK-NEXT:  6      8     2.00    *      *            cmpxchgq	%rcx, (%rbx)
-# CHECK-NEXT:  1      100   0.33                  U     cpuid
-# CHECK-NEXT:  1      1     0.33                        decb	%dil
-# CHECK-NEXT:  3      7     1.00    *      *            decb	(%rax)
-# CHECK-NEXT:  1      1     0.33                        decw	%di
-# CHECK-NEXT:  3      7     1.00    *      *            decw	(%rax)
-# CHECK-NEXT:  1      1     0.33                        decl	%edi
-# CHECK-NEXT:  3      7     1.00    *      *            decl	(%rax)
-# CHECK-NEXT:  1      1     0.33                        decq	%rdi
-# CHECK-NEXT:  3      7     1.00    *      *            decq	(%rax)
-# CHECK-NEXT:  1      25    10.00                 U     divb	%dil
-# CHECK-NEXT:  2      30    10.00   *             U     divb	(%rax)
-# CHECK-NEXT:  1      25    10.00                 U     divw	%si
-# CHECK-NEXT:  2      30    10.00   *             U     divw	(%rax)
-# CHECK-NEXT:  1      25    10.00                 U     divl	%edx
-# CHECK-NEXT:  2      30    10.00   *             U     divl	(%rax)
-# CHECK-NEXT:  1      25    10.00                 U     divq	%rcx
-# CHECK-NEXT:  2      30    10.00   *             U     divq	(%rax)
-# CHECK-NEXT:  1      25    10.00                 U     idivb	%dil
-# CHECK-NEXT:  2      30    10.00   *             U     idivb	(%rax)
-# CHECK-NEXT:  1      25    10.00                 U     idivw	%si
-# CHECK-NEXT:  2      30    10.00   *             U     idivw	(%rax)
-# CHECK-NEXT:  1      25    10.00                 U     idivl	%edx
-# CHECK-NEXT:  2      30    10.00   *             U     idivl	(%rax)
-# CHECK-NEXT:  1      25    10.00                 U     idivq	%rcx
-# CHECK-NEXT:  2      30    10.00   *             U     idivq	(%rax)
-# CHECK-NEXT:  1      3     1.00                        imulb	%dil
-# CHECK-NEXT:  2      8     1.00    *                   imulb	(%rax)
-# CHECK-NEXT:  4      4     1.33                        imulw	%di
-# CHECK-NEXT:  5      9     1.33    *                   imulw	(%rax)
-# CHECK-NEXT:  1      3     1.00                        imulw	%si, %di
-# CHECK-NEXT:  2      8     1.00    *                   imulw	(%rax), %di
-# CHECK-NEXT:  2      4     1.00                        imulw	$511, %si, %di
-# CHECK-NEXT:  3      8     1.00    *                   imulw	$511, (%rax), %di
-# CHECK-NEXT:  2      4     1.00                        imulw	$7, %si, %di
-# CHECK-NEXT:  3      8     1.00    *                   imulw	$7, (%rax), %di
-# CHECK-NEXT:  3      4     1.00                        imull	%edi
-# CHECK-NEXT:  4      9     1.00    *                   imull	(%rax)
-# CHECK-NEXT:  1      3     1.00                        imull	%esi, %edi
-# CHECK-NEXT:  2      8     1.00    *                   imull	(%rax), %edi
-# CHECK-NEXT:  1      3     1.00                        imull	$665536, %esi, %edi
+# CHECK-NEXT:  1      1     0.50                  U     clc
+# CHECK-NEXT:  1      1     0.50                  U     cld
+# CHECK-NEXT:  1      1     0.50                  U     cmc
+# CHECK-NEXT:  1      1     0.50                        cmpb	$7, %al
+# CHECK-NEXT:  1      1     0.50                        cmpb	$7, %dil
+# CHECK-NEXT:  1      5     0.50    *                   cmpb	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        cmpb	%sil, %dil
+# CHECK-NEXT:  1      5     0.50    *                   cmpb	%sil, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   cmpb	(%rax), %dil
+# CHECK-NEXT:  1      1     0.50                        cmpw	$511, %ax
+# CHECK-NEXT:  1      1     0.50                        cmpw	$511, %di
+# CHECK-NEXT:  1      5     0.50    *                   cmpw	$511, (%rax)
+# CHECK-NEXT:  1      1     0.50                        cmpw	$7, %di
+# CHECK-NEXT:  1      5     0.50    *                   cmpw	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        cmpw	%si, %di
+# CHECK-NEXT:  1      5     0.50    *                   cmpw	%si, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   cmpw	(%rax), %di
+# CHECK-NEXT:  1      1     0.50                        cmpl	$665536, %eax
+# CHECK-NEXT:  1      1     0.50                        cmpl	$665536, %edi
+# CHECK-NEXT:  1      5     0.50    *                   cmpl	$665536, (%rax)
+# CHECK-NEXT:  1      1     0.50                        cmpl	$7, %edi
+# CHECK-NEXT:  1      5     0.50    *                   cmpl	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        cmpl	%esi, %edi
+# CHECK-NEXT:  1      5     0.50    *                   cmpl	%esi, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   cmpl	(%rax), %edi
+# CHECK-NEXT:  1      1     0.50                        cmpq	$665536, %rax
+# CHECK-NEXT:  1      1     0.50                        cmpq	$665536, %rdi
+# CHECK-NEXT:  1      5     0.50    *                   cmpq	$665536, (%rax)
+# CHECK-NEXT:  1      1     0.50                        cmpq	$7, %rdi
+# CHECK-NEXT:  1      5     0.50    *                   cmpq	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        cmpq	%rsi, %rdi
+# CHECK-NEXT:  1      5     0.50    *                   cmpq	%rsi, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   cmpq	(%rax), %rdi
+# CHECK-NEXT:  1      100   0.50                  U     cmpsb	%es:(%rdi), (%rsi)
+# CHECK-NEXT:  1      100   0.50                  U     cmpsw	%es:(%rdi), (%rsi)
+# CHECK-NEXT:  1      100   0.50                  U     cmpsl	%es:(%rdi), (%rsi)
+# CHECK-NEXT:  1      100   0.50                  U     cmpsq	%es:(%rdi), (%rsi)
+# CHECK-NEXT:  3      3     1.00                        cmpxchgb	%cl, %bl
+# CHECK-NEXT:  5      3     1.00    *      *            cmpxchgb	%cl, (%rbx)
+# CHECK-NEXT:  5      3     1.00                        cmpxchgw	%cx, %bx
+# CHECK-NEXT:  6      3     1.00    *      *            cmpxchgw	%cx, (%rbx)
+# CHECK-NEXT:  5      3     1.00                        cmpxchgl	%ecx, %ebx
+# CHECK-NEXT:  6      3     1.00    *      *            cmpxchgl	%ecx, (%rbx)
+# CHECK-NEXT:  5      3     1.00                        cmpxchgq	%rcx, %rbx
+# CHECK-NEXT:  6      3     1.00    *      *            cmpxchgq	%rcx, (%rbx)
+# CHECK-NEXT:  1      100   0.50                  U     cpuid
+# CHECK-NEXT:  1      1     0.50                        decb	%dil
+# CHECK-NEXT:  2      6     1.00    *      *            decb	(%rax)
+# CHECK-NEXT:  1      1     0.50                        decw	%di
+# CHECK-NEXT:  2      6     1.00    *      *            decw	(%rax)
+# CHECK-NEXT:  1      1     0.50                        decl	%edi
+# CHECK-NEXT:  2      6     1.00    *      *            decl	(%rax)
+# CHECK-NEXT:  1      1     0.50                        decq	%rdi
+# CHECK-NEXT:  2      6     1.00    *      *            decq	(%rax)
+# CHECK-NEXT:  1      12    12.00                 U     divb	%dil
+# CHECK-NEXT:  1      16    12.00   *             U     divb	(%rax)
+# CHECK-NEXT:  2      15    15.00                 U     divw	%si
+# CHECK-NEXT:  2      19    15.00   *             U     divw	(%rax)
+# CHECK-NEXT:  2      14    14.00                 U     divl	%edx
+# CHECK-NEXT:  2      18    14.00   *             U     divl	(%rax)
+# CHECK-NEXT:  2      14    14.00                 U     divq	%rcx
+# CHECK-NEXT:  2      18    14.00   *             U     divq	(%rax)
+# CHECK-NEXT:  1      12    12.00                 U     idivb	%dil
+# CHECK-NEXT:  1      16    12.00   *             U     idivb	(%rax)
+# CHECK-NEXT:  2      15    17.00                 U     idivw	%si
+# CHECK-NEXT:  2      19    17.00   *             U     idivw	(%rax)
+# CHECK-NEXT:  2      14    25.00                 U     idivl	%edx
+# CHECK-NEXT:  2      18    25.00   *             U     idivl	(%rax)
+# CHECK-NEXT:  2      14    14.00                 U     idivq	%rcx
+# CHECK-NEXT:  2      18    14.00   *             U     idivq	(%rax)
+# CHECK-NEXT:  1      4     1.00                        imulb	%dil
+# CHECK-NEXT:  1      8     1.00    *                   imulb	(%rax)
+# CHECK-NEXT:  2      4     1.00                        imulw	%di
+# CHECK-NEXT:  2      8     1.00    *                   imulw	(%rax)
+# CHECK-NEXT:  1      4     1.00                        imulw	%si, %di
+# CHECK-NEXT:  1      8     1.00    *                   imulw	(%rax), %di
+# CHECK-NEXT:  2      5     1.00                        imulw	$511, %si, %di
+# CHECK-NEXT:  2      9     1.00    *                   imulw	$511, (%rax), %di
+# CHECK-NEXT:  2      5     1.00                        imulw	$7, %si, %di
+# CHECK-NEXT:  2      9     1.00    *                   imulw	$7, (%rax), %di
+# CHECK-NEXT:  1      4     1.00                        imull	%edi
+# CHECK-NEXT:  1      8     1.00    *                   imull	(%rax)
+# CHECK-NEXT:  1      4     1.00                        imull	%esi, %edi
+# CHECK-NEXT:  1      8     1.00    *                   imull	(%rax), %edi
+# CHECK-NEXT:  1      4     1.00                        imull	$665536, %esi, %edi
 # CHECK-NEXT:  2      8     1.00    *                   imull	$665536, (%rax), %edi
-# CHECK-NEXT:  1      3     1.00                        imull	$7, %esi, %edi
+# CHECK-NEXT:  1      4     1.00                        imull	$7, %esi, %edi
 # CHECK-NEXT:  2      8     1.00    *                   imull	$7, (%rax), %edi
-# CHECK-NEXT:  2      4     1.00                        imulq	%rdi
-# CHECK-NEXT:  3      9     1.00    *                   imulq	(%rax)
-# CHECK-NEXT:  1      3     1.00                        imulq	%rsi, %rdi
-# CHECK-NEXT:  2      8     1.00    *                   imulq	(%rax), %rdi
-# CHECK-NEXT:  1      3     1.00                        imulq	$665536, %rsi, %rdi
-# CHECK-NEXT:  2      8     1.00    *                   imulq	$665536, (%rax), %rdi
-# CHECK-NEXT:  1      3     1.00                        imulq	$7, %rsi, %rdi
-# CHECK-NEXT:  2      8     1.00    *                   imulq	$7, (%rax), %rdi
-# CHECK-NEXT:  1      100   0.33                  U     inb	$7, %al
-# CHECK-NEXT:  1      100   0.33                  U     inb	%dx, %al
-# CHECK-NEXT:  1      100   0.33                  U     inw	$7, %ax
-# CHECK-NEXT:  1      100   0.33                  U     inw	%dx, %ax
-# CHECK-NEXT:  1      100   0.33                  U     inl	$7, %eax
-# CHECK-NEXT:  1      100   0.33                  U     inl	%dx, %eax
-# CHECK-NEXT:  1      1     0.33                        incb	%dil
-# CHECK-NEXT:  3      7     1.00    *      *            incb	(%rax)
-# CHECK-NEXT:  1      1     0.33                        incw	%di
-# CHECK-NEXT:  3      7     1.00    *      *            incw	(%rax)
-# CHECK-NEXT:  1      1     0.33                        incl	%edi
-# CHECK-NEXT:  3      7     1.00    *      *            incl	(%rax)
-# CHECK-NEXT:  1      1     0.33                        incq	%rdi
-# CHECK-NEXT:  3      7     1.00    *      *            incq	(%rax)
-# CHECK-NEXT:  1      100   0.33                  U     insb	%dx, %es:(%rdi)
-# CHECK-NEXT:  1      100   0.33                  U     insw	%dx, %es:(%rdi)
-# CHECK-NEXT:  1      100   0.33                  U     insl	%dx, %es:(%rdi)
-# CHECK-NEXT:  1      100   0.33    *      *      U     int	$7
-# CHECK-NEXT:  1      1     0.50                        lahf
-# CHECK-NEXT:  3      7     0.67                  U     lodsb	(%rsi), %al
-# CHECK-NEXT:  3      7     0.67                  U     lodsw	(%rsi), %ax
-# CHECK-NEXT:  2      6     0.50                  U     lodsl	(%rsi), %eax
-# CHECK-NEXT:  2      6     0.50                  U     lodsq	(%rsi), %rax
-# CHECK-NEXT:  5      8     1.00                  U     movsb	(%rsi), %es:(%rdi)
-# CHECK-NEXT:  5      8     1.00                  U     movsw	(%rsi), %es:(%rdi)
-# CHECK-NEXT:  5      8     1.00                  U     movsl	(%rsi), %es:(%rdi)
-# CHECK-NEXT:  5      8     1.00                  U     movsq	(%rsi), %es:(%rdi)
-# CHECK-NEXT:  1      1     0.33                        movsbw	%al, %di
-# CHECK-NEXT:  1      1     0.33                        movzbw	%al, %di
+# CHECK-NEXT:  1      6     4.00                        imulq	%rdi
+# CHECK-NEXT:  1      10    4.00    *                   imulq	(%rax)
+# CHECK-NEXT:  1      6     4.00                        imulq	%rsi, %rdi
+# CHECK-NEXT:  1      10    4.00    *                   imulq	(%rax), %rdi
+# CHECK-NEXT:  1      6     4.00                        imulq	$665536, %rsi, %rdi
+# CHECK-NEXT:  2      10    4.00    *                   imulq	$665536, (%rax), %rdi
+# CHECK-NEXT:  1      6     4.00                        imulq	$7, %rsi, %rdi
+# CHECK-NEXT:  2      10    4.00    *                   imulq	$7, (%rax), %rdi
+# CHECK-NEXT:  1      100   0.50                  U     inb	$7, %al
+# CHECK-NEXT:  1      100   0.50                  U     inb	%dx, %al
+# CHECK-NEXT:  1      100   0.50                  U     inw	$7, %ax
+# CHECK-NEXT:  1      100   0.50                  U     inw	%dx, %ax
+# CHECK-NEXT:  1      100   0.50                  U     inl	$7, %eax
+# CHECK-NEXT:  1      100   0.50                  U     inl	%dx, %eax
+# CHECK-NEXT:  1      1     0.50                        incb	%dil
+# CHECK-NEXT:  2      6     1.00    *      *            incb	(%rax)
+# CHECK-NEXT:  1      1     0.50                        incw	%di
+# CHECK-NEXT:  2      6     1.00    *      *            incw	(%rax)
+# CHECK-NEXT:  1      1     0.50                        incl	%edi
+# CHECK-NEXT:  2      6     1.00    *      *            incl	(%rax)
+# CHECK-NEXT:  1      1     0.50                        incq	%rdi
+# CHECK-NEXT:  2      6     1.00    *      *            incq	(%rax)
+# CHECK-NEXT:  1      100   0.50                  U     insb	%dx, %es:(%rdi)
+# CHECK-NEXT:  1      100   0.50                  U     insw	%dx, %es:(%rdi)
+# CHECK-NEXT:  1      100   0.50                  U     insl	%dx, %es:(%rdi)
+# CHECK-NEXT:  1      100   0.50    *      *      U     int	$7
+# CHECK-NEXT:  4      2     0.50                        lahf
+# CHECK-NEXT:  1      100   0.50                  U     lodsb	(%rsi), %al
+# CHECK-NEXT:  1      100   0.50                  U     lodsw	(%rsi), %ax
+# CHECK-NEXT:  1      100   0.50                  U     lodsl	(%rsi), %eax
+# CHECK-NEXT:  1      100   0.50                  U     lodsq	(%rsi), %rax
+# CHECK-NEXT:  1      100   0.50                  U     movsb	(%rsi), %es:(%rdi)
+# CHECK-NEXT:  1      100   0.50                  U     movsw	(%rsi), %es:(%rdi)
+# CHECK-NEXT:  1      100   0.50                  U     movsl	(%rsi), %es:(%rdi)
+# CHECK-NEXT:  1      100   0.50                  U     movsq	(%rsi), %es:(%rdi)
+# CHECK-NEXT:  1      1     0.50                        movsbw	%al, %di
+# CHECK-NEXT:  1      1     0.50                        movzbw	%al, %di
 # CHECK-NEXT:  1      5     0.50    *                   movsbw	(%rax), %di
 # CHECK-NEXT:  1      5     0.50    *                   movzbw	(%rax), %di
-# CHECK-NEXT:  1      1     0.33                        movsbl	%al, %edi
-# CHECK-NEXT:  1      1     0.33                        movzbl	%al, %edi
+# CHECK-NEXT:  1      1     0.50                        movsbl	%al, %edi
+# CHECK-NEXT:  1      1     0.50                        movzbl	%al, %edi
 # CHECK-NEXT:  1      5     0.50    *                   movsbl	(%rax), %edi
 # CHECK-NEXT:  1      5     0.50    *                   movzbl	(%rax), %edi
-# CHECK-NEXT:  1      1     0.33                        movsbq	%al, %rdi
-# CHECK-NEXT:  1      1     0.33                        movzbq	%al, %rdi
+# CHECK-NEXT:  1      1     0.50                        movsbq	%al, %rdi
+# CHECK-NEXT:  1      1     0.50                        movzbq	%al, %rdi
 # CHECK-NEXT:  1      5     0.50    *                   movsbq	(%rax), %rdi
 # CHECK-NEXT:  1      5     0.50    *                   movzbq	(%rax), %rdi
-# CHECK-NEXT:  1      1     0.33                        movswl	%ax, %edi
-# CHECK-NEXT:  1      1     0.33                        movzwl	%ax, %edi
+# CHECK-NEXT:  1      1     0.50                        movswl	%ax, %edi
+# CHECK-NEXT:  1      1     0.50                        movzwl	%ax, %edi
 # CHECK-NEXT:  1      5     0.50    *                   movswl	(%rax), %edi
 # CHECK-NEXT:  1      5     0.50    *                   movzwl	(%rax), %edi
-# CHECK-NEXT:  1      1     0.33                        movswq	%ax, %rdi
-# CHECK-NEXT:  1      1     0.33                        movzwq	%ax, %rdi
+# CHECK-NEXT:  1      1     0.50                        movswq	%ax, %rdi
+# CHECK-NEXT:  1      1     0.50                        movzwq	%ax, %rdi
 # CHECK-NEXT:  1      5     0.50    *                   movswq	(%rax), %rdi
 # CHECK-NEXT:  1      5     0.50    *                   movzwq	(%rax), %rdi
-# CHECK-NEXT:  1      1     0.33                        movslq	%eax, %rdi
+# CHECK-NEXT:  1      1     0.50                        movslq	%eax, %rdi
 # CHECK-NEXT:  1      5     0.50    *                   movslq	(%rax), %rdi
-# CHECK-NEXT:  1      3     1.00                        mulb	%dil
-# CHECK-NEXT:  2      8     1.00    *                   mulb	(%rax)
-# CHECK-NEXT:  4      4     1.33                        mulw	%si
-# CHECK-NEXT:  5      9     1.33    *                   mulw	(%rax)
-# CHECK-NEXT:  3      4     1.00                        mull	%edx
-# CHECK-NEXT:  4      9     1.00    *                   mull	(%rax)
-# CHECK-NEXT:  2      4     1.00                        mulq	%rcx
-# CHECK-NEXT:  3      9     1.00    *                   mulq	(%rax)
-# CHECK-NEXT:  1      1     0.33                        negb	%dil
-# CHECK-NEXT:  3      7     1.00    *      *            negb	(%r8)
-# CHECK-NEXT:  1      1     0.33                        negw	%si
-# CHECK-NEXT:  3      7     1.00    *      *            negw	(%r9)
-# CHECK-NEXT:  1      1     0.33                        negl	%edx
-# CHECK-NEXT:  3      7     1.00    *      *            negl	(%rax)
-# CHECK-NEXT:  1      1     0.33                        negq	%rcx
-# CHECK-NEXT:  3      7     1.00    *      *            negq	(%r10)
-# CHECK-NEXT:  1      1     0.25                        nop
-# CHECK-NEXT:  1      1     0.25                        nopw	%di
-# CHECK-NEXT:  1      1     0.25                        nopw	(%rcx)
-# CHECK-NEXT:  1      1     0.25                        nopl	%esi
-# CHECK-NEXT:  1      1     0.25                        nopl	(%r8)
-# CHECK-NEXT:  1      1     0.25                        nopq	%rdx
-# CHECK-NEXT:  1      1     0.25                        nopq	(%r9)
-# CHECK-NEXT:  1      1     0.33                        notb	%dil
-# CHECK-NEXT:  3      7     1.00    *      *            notb	(%r8)
-# CHECK-NEXT:  1      1     0.33                        notw	%si
-# CHECK-NEXT:  3      7     1.00    *      *            notw	(%r9)
-# CHECK-NEXT:  1      1     0.33                        notl	%edx
-# CHECK-NEXT:  3      7     1.00    *      *            notl	(%rax)
-# CHECK-NEXT:  1      1     0.33                        notq	%rcx
-# CHECK-NEXT:  3      7     1.00    *      *            notq	(%r10)
-# CHECK-NEXT:  1      1     0.33                        orb	$7, %al
-# CHECK-NEXT:  1      1     0.33                        orb	$7, %dil
-# CHECK-NEXT:  3      7     1.00    *      *            orb	$7, (%rax)
-# CHECK-NEXT:  1      1     0.33                        orb	%sil, %dil
-# CHECK-NEXT:  3      7     1.00    *      *            orb	%sil, (%rax)
-# CHECK-NEXT:  2      6     0.50    *                   orb	(%rax), %dil
-# CHECK-NEXT:  1      1     0.33                        orw	$511, %ax
-# CHECK-NEXT:  1      1     0.33                        orw	$511, %di
-# CHECK-NEXT:  3      7     1.00    *      *            orw	$511, (%rax)
-# CHECK-NEXT:  1      1     0.33                        orw	$7, %di
-# CHECK-NEXT:  3      7     1.00    *      *            orw	$7, (%rax)
-# CHECK-NEXT:  1      1     0.33                        orw	%si, %di
-# CHECK-NEXT:  3      7     1.00    *      *            orw	%si, (%rax)
-# CHECK-NEXT:  2      6     0.50    *                   orw	(%rax), %di
-# CHECK-NEXT:  1      1     0.33                        orl	$665536, %eax
-# CHECK-NEXT:  1      1     0.33                        orl	$665536, %edi
-# CHECK-NEXT:  3      7     1.00    *      *            orl	$665536, (%rax)
-# CHECK-NEXT:  1      1     0.33                        orl	$7, %edi
-# CHECK-NEXT:  3      7     1.00    *      *            orl	$7, (%rax)
-# CHECK-NEXT:  1      1     0.33                        orl	%esi, %edi
-# CHECK-NEXT:  3      7     1.00    *      *            orl	%esi, (%rax)
-# CHECK-NEXT:  2      6     0.50    *                   orl	(%rax), %edi
-# CHECK-NEXT:  1      1     0.33                        orq	$665536, %rax
-# CHECK-NEXT:  1      1     0.33                        orq	$665536, %rdi
-# CHECK-NEXT:  3      7     1.00    *      *            orq	$665536, (%rax)
-# CHECK-NEXT:  1      1     0.33                        orq	$7, %rdi
-# CHECK-NEXT:  3      7     1.00    *      *            orq	$7, (%rax)
-# CHECK-NEXT:  1      1     0.33                        orq	%rsi, %rdi
-# CHECK-NEXT:  3      7     1.00    *      *            orq	%rsi, (%rax)
-# CHECK-NEXT:  2      6     0.50    *                   orq	(%rax), %rdi
-# CHECK-NEXT:  1      100   0.33                  U     outb	%al, $7
-# CHECK-NEXT:  1      100   0.33                  U     outb	%al, %dx
-# CHECK-NEXT:  1      100   0.33                  U     outw	%ax, $7
-# CHECK-NEXT:  1      100   0.33                  U     outw	%ax, %dx
-# CHECK-NEXT:  1      100   0.33                  U     outl	%eax, $7
-# CHECK-NEXT:  1      100   0.33                  U     outl	%eax, %dx
-# CHECK-NEXT:  1      100   0.33                  U     outsb	(%rsi), %dx
-# CHECK-NEXT:  1      100   0.33                  U     outsw	(%rsi), %dx
-# CHECK-NEXT:  1      100   0.33                  U     outsl	(%rsi), %dx
-# CHECK-NEXT:  4      4     1.33    *      *      U     pause
-# CHECK-NEXT:  3      2     1.50                        rclb	%dil
-# CHECK-NEXT:  3      2     1.50                        rcrb	%dil
-# CHECK-NEXT:  11     11    3.50           *            rclb	(%rax)
-# CHECK-NEXT:  11     11    3.50           *            rcrb	(%rax)
-# CHECK-NEXT:  8      5     4.00                        rclb	$7, %dil
-# CHECK-NEXT:  8      5     4.00                        rcrb	$7, %dil
-# CHECK-NEXT:  11     11    3.50           *            rclb	$7, (%rax)
-# CHECK-NEXT:  11     11    3.50           *            rcrb	$7, (%rax)
-# CHECK-NEXT:  8      5     4.00                        rclb	%cl, %dil
-# CHECK-NEXT:  8      5     4.00                        rcrb	%cl, %dil
-# CHECK-NEXT:  11     11    3.50           *            rclb	%cl, (%rax)
-# CHECK-NEXT:  11     11    3.50           *            rcrb	%cl, (%rax)
-# CHECK-NEXT:  3      2     1.50                        rclw	%di
-# CHECK-NEXT:  3      2     1.50                        rcrw	%di
-# CHECK-NEXT:  11     11    3.50           *            rclw	(%rax)
-# CHECK-NEXT:  11     11    3.50           *            rcrw	(%rax)
-# CHECK-NEXT:  8      5     4.00                        rclw	$7, %di
-# CHECK-NEXT:  8      5     4.00                        rcrw	$7, %di
-# CHECK-NEXT:  11     11    3.50           *            rclw	$7, (%rax)
-# CHECK-NEXT:  11     11    3.50           *            rcrw	$7, (%rax)
-# CHECK-NEXT:  8      5     4.00                        rclw	%cl, %di
-# CHECK-NEXT:  8      5     4.00                        rcrw	%cl, %di
-# CHECK-NEXT:  11     11    3.50           *            rclw	%cl, (%rax)
-# CHECK-NEXT:  11     11    3.50           *            rcrw	%cl, (%rax)
-# CHECK-NEXT:  3      2     1.50                        rcll	%edi
-# CHECK-NEXT:  3      2     1.50                        rcrl	%edi
-# CHECK-NEXT:  11     11    3.50           *            rcll	(%rax)
-# CHECK-NEXT:  11     11    3.50           *            rcrl	(%rax)
-# CHECK-NEXT:  8      5     4.00                        rcll	$7, %edi
-# CHECK-NEXT:  8      5     4.00                        rcrl	$7, %edi
-# CHECK-NEXT:  11     11    3.50           *            rcll	$7, (%rax)
-# CHECK-NEXT:  11     11    3.50           *            rcrl	$7, (%rax)
-# CHECK-NEXT:  8      5     4.00                        rcll	%cl, %edi
-# CHECK-NEXT:  8      5     4.00                        rcrl	%cl, %edi
-# CHECK-NEXT:  11     11    3.50           *            rcll	%cl, (%rax)
-# CHECK-NEXT:  11     11    3.50           *            rcrl	%cl, (%rax)
-# CHECK-NEXT:  3      2     1.50                        rclq	%rdi
-# CHECK-NEXT:  3      2     1.50                        rcrq	%rdi
-# CHECK-NEXT:  11     11    3.50           *            rclq	(%rax)
-# CHECK-NEXT:  11     11    3.50           *            rcrq	(%rax)
-# CHECK-NEXT:  8      5     4.00                        rclq	$7, %rdi
-# CHECK-NEXT:  8      5     4.00                        rcrq	$7, %rdi
-# CHECK-NEXT:  11     11    3.50           *            rclq	$7, (%rax)
-# CHECK-NEXT:  11     11    3.50           *            rcrq	$7, (%rax)
-# CHECK-NEXT:  8      5     4.00                        rclq	%cl, %rdi
-# CHECK-NEXT:  8      5     4.00                        rcrq	%cl, %rdi
-# CHECK-NEXT:  11     11    3.50           *            rclq	%cl, (%rax)
-# CHECK-NEXT:  11     11    3.50           *            rcrq	%cl, (%rax)
-# CHECK-NEXT:  2      2     1.00                        rolb	%dil
-# CHECK-NEXT:  2      2     1.00                        rorb	%dil
-# CHECK-NEXT:  5      8     1.00    *      *            rolb	(%rax)
-# CHECK-NEXT:  5      8     1.00    *      *            rorb	(%rax)
-# CHECK-NEXT:  2      2     1.00                        rolb	$7, %dil
-# CHECK-NEXT:  2      2     1.00                        rorb	$7, %dil
-# CHECK-NEXT:  5      8     1.00    *      *            rolb	$7, (%rax)
-# CHECK-NEXT:  5      8     1.00    *      *            rorb	$7, (%rax)
-# CHECK-NEXT:  3      3     1.50                        rolb	%cl, %dil
-# CHECK-NEXT:  3      3     1.50                        rorb	%cl, %dil
-# CHECK-NEXT:  6      9     1.50    *      *            rolb	%cl, (%rax)
-# CHECK-NEXT:  6      9     1.50    *      *            rorb	%cl, (%rax)
-# CHECK-NEXT:  2      2     1.00                        rolw	%di
-# CHECK-NEXT:  2      2     1.00                        rorw	%di
-# CHECK-NEXT:  5      8     1.00    *      *            rolw	(%rax)
-# CHECK-NEXT:  5      8     1.00    *      *            rorw	(%rax)
-# CHECK-NEXT:  2      2     1.00                        rolw	$7, %di
-# CHECK-NEXT:  2      2     1.00                        rorw	$7, %di
-# CHECK-NEXT:  5      8     1.00    *      *            rolw	$7, (%rax)
-# CHECK-NEXT:  5      8     1.00    *      *            rorw	$7, (%rax)
-# CHECK-NEXT:  3      3     1.50                        rolw	%cl, %di
-# CHECK-NEXT:  3      3     1.50                        rorw	%cl, %di
-# CHECK-NEXT:  6      9     1.50    *      *            rolw	%cl, (%rax)
-# CHECK-NEXT:  6      9     1.50    *      *            rorw	%cl, (%rax)
-# CHECK-NEXT:  2      2     1.00                        roll	%edi
-# CHECK-NEXT:  2      2     1.00                        rorl	%edi
-# CHECK-NEXT:  5      8     1.00    *      *            roll	(%rax)
-# CHECK-NEXT:  5      8     1.00    *      *            rorl	(%rax)
-# CHECK-NEXT:  2      2     1.00                        roll	$7, %edi
-# CHECK-NEXT:  2      2     1.00                        rorl	$7, %edi
-# CHECK-NEXT:  5      8     1.00    *      *            roll	$7, (%rax)
-# CHECK-NEXT:  5      8     1.00    *      *            rorl	$7, (%rax)
-# CHECK-NEXT:  3      3     1.50                        roll	%cl, %edi
-# CHECK-NEXT:  3      3     1.50                        rorl	%cl, %edi
-# CHECK-NEXT:  6      9     1.50    *      *            roll	%cl, (%rax)
-# CHECK-NEXT:  6      9     1.50    *      *            rorl	%cl, (%rax)
-# CHECK-NEXT:  2      2     1.00                        rolq	%rdi
-# CHECK-NEXT:  2      2     1.00                        rorq	%rdi
-# CHECK-NEXT:  5      8     1.00    *      *            rolq	(%rax)
-# CHECK-NEXT:  5      8     1.00    *      *            rorq	(%rax)
-# CHECK-NEXT:  2      2     1.00                        rolq	$7, %rdi
-# CHECK-NEXT:  2      2     1.00                        rorq	$7, %rdi
-# CHECK-NEXT:  5      8     1.00    *      *            rolq	$7, (%rax)
-# CHECK-NEXT:  5      8     1.00    *      *            rorq	$7, (%rax)
-# CHECK-NEXT:  3      3     1.50                        rolq	%cl, %rdi
-# CHECK-NEXT:  3      3     1.50                        rorq	%cl, %rdi
-# CHECK-NEXT:  6      9     1.50    *      *            rolq	%cl, (%rax)
-# CHECK-NEXT:  6      9     1.50    *      *            rorq	%cl, (%rax)
-# CHECK-NEXT:  1      1     0.50                        sahf
+# CHECK-NEXT:  1      4     1.00                        mulb	%dil
+# CHECK-NEXT:  1      8     1.00    *                   mulb	(%rax)
+# CHECK-NEXT:  2      4     1.00                        mulw	%si
+# CHECK-NEXT:  2      8     1.00    *                   mulw	(%rax)
+# CHECK-NEXT:  1      4     1.00                        mull	%edx
+# CHECK-NEXT:  1      8     1.00    *                   mull	(%rax)
+# CHECK-NEXT:  1      6     4.00                        mulq	%rcx
+# CHECK-NEXT:  1      10    4.00    *                   mulq	(%rax)
+# CHECK-NEXT:  1      1     0.50                        negb	%dil
+# CHECK-NEXT:  2      6     1.00    *      *            negb	(%r8)
+# CHECK-NEXT:  1      1     0.50                        negw	%si
+# CHECK-NEXT:  2      6     1.00    *      *            negw	(%r9)
+# CHECK-NEXT:  1      1     0.50                        negl	%edx
+# CHECK-NEXT:  2      6     1.00    *      *            negl	(%rax)
+# CHECK-NEXT:  1      1     0.50                        negq	%rcx
+# CHECK-NEXT:  2      6     1.00    *      *            negq	(%r10)
+# CHECK-NEXT:  1      1     0.50                        nop
+# CHECK-NEXT:  1      1     0.50                        nopw	%di
+# CHECK-NEXT:  1      1     0.50                        nopw	(%rcx)
+# CHECK-NEXT:  1      1     0.50                        nopl	%esi
+# CHECK-NEXT:  1      1     0.50                        nopl	(%r8)
+# CHECK-NEXT:  1      1     0.50                        nopq	%rdx
+# CHECK-NEXT:  1      1     0.50                        nopq	(%r9)
+# CHECK-NEXT:  1      1     0.50                        notb	%dil
+# CHECK-NEXT:  2      6     1.00    *      *            notb	(%r8)
+# CHECK-NEXT:  1      1     0.50                        notw	%si
+# CHECK-NEXT:  2      6     1.00    *      *            notw	(%r9)
+# CHECK-NEXT:  1      1     0.50                        notl	%edx
+# CHECK-NEXT:  2      6     1.00    *      *            notl	(%rax)
+# CHECK-NEXT:  1      1     0.50                        notq	%rcx
+# CHECK-NEXT:  2      6     1.00    *      *            notq	(%r10)
+# CHECK-NEXT:  1      1     0.50                        orb	$7, %al
+# CHECK-NEXT:  1      1     0.50                        orb	$7, %dil
+# CHECK-NEXT:  2      6     1.00    *      *            orb	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        orb	%sil, %dil
+# CHECK-NEXT:  2      6     1.00    *      *            orb	%sil, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   orb	(%rax), %dil
+# CHECK-NEXT:  1      1     0.50                        orw	$511, %ax
+# CHECK-NEXT:  1      1     0.50                        orw	$511, %di
+# CHECK-NEXT:  2      6     1.00    *      *            orw	$511, (%rax)
+# CHECK-NEXT:  1      1     0.50                        orw	$7, %di
+# CHECK-NEXT:  2      6     1.00    *      *            orw	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        orw	%si, %di
+# CHECK-NEXT:  2      6     1.00    *      *            orw	%si, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   orw	(%rax), %di
+# CHECK-NEXT:  1      1     0.50                        orl	$665536, %eax
+# CHECK-NEXT:  1      1     0.50                        orl	$665536, %edi
+# CHECK-NEXT:  2      6     1.00    *      *            orl	$665536, (%rax)
+# CHECK-NEXT:  1      1     0.50                        orl	$7, %edi
+# CHECK-NEXT:  2      6     1.00    *      *            orl	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        orl	%esi, %edi
+# CHECK-NEXT:  2      6     1.00    *      *            orl	%esi, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   orl	(%rax), %edi
+# CHECK-NEXT:  1      1     0.50                        orq	$665536, %rax
+# CHECK-NEXT:  1      1     0.50                        orq	$665536, %rdi
+# CHECK-NEXT:  2      6     1.00    *      *            orq	$665536, (%rax)
+# CHECK-NEXT:  1      1     0.50                        orq	$7, %rdi
+# CHECK-NEXT:  2      6     1.00    *      *            orq	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        orq	%rsi, %rdi
+# CHECK-NEXT:  2      6     1.00    *      *            orq	%rsi, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   orq	(%rax), %rdi
+# CHECK-NEXT:  1      100   0.50                  U     outb	%al, $7
+# CHECK-NEXT:  1      100   0.50                  U     outb	%al, %dx
+# CHECK-NEXT:  1      100   0.50                  U     outw	%ax, $7
+# CHECK-NEXT:  1      100   0.50                  U     outw	%ax, %dx
+# CHECK-NEXT:  1      100   0.50                  U     outl	%eax, $7
+# CHECK-NEXT:  1      100   0.50                  U     outl	%eax, %dx
+# CHECK-NEXT:  1      100   0.50                  U     outsb	(%rsi), %dx
+# CHECK-NEXT:  1      100   0.50                  U     outsw	(%rsi), %dx
+# CHECK-NEXT:  1      100   0.50                  U     outsl	(%rsi), %dx
+# CHECK-NEXT:  1      1     0.50    *      *      U     pause
+# CHECK-NEXT:  1      1     0.50                        rclb	%dil
+# CHECK-NEXT:  1      1     0.50                        rcrb	%dil
+# CHECK-NEXT:  2      5     1.00           *            rclb	(%rax)
+# CHECK-NEXT:  2      5     1.00           *            rcrb	(%rax)
+# CHECK-NEXT:  25     13    0.50                        rclb	$7, %dil
+# CHECK-NEXT:  23     12    0.50                        rcrb	$7, %dil
+# CHECK-NEXT:  2      5     1.00           *            rclb	$7, (%rax)
+# CHECK-NEXT:  2      5     1.00           *            rcrb	$7, (%rax)
+# CHECK-NEXT:  26     12    0.50                        rclb	%cl, %dil
+# CHECK-NEXT:  24     11    0.50                        rcrb	%cl, %dil
+# CHECK-NEXT:  2      5     1.00           *            rclb	%cl, (%rax)
+# CHECK-NEXT:  2      5     1.00           *            rcrb	%cl, (%rax)
+# CHECK-NEXT:  1      1     0.50                        rclw	%di
+# CHECK-NEXT:  1      1     0.50                        rcrw	%di
+# CHECK-NEXT:  2      5     1.00           *            rclw	(%rax)
+# CHECK-NEXT:  2      5     1.00           *            rcrw	(%rax)
+# CHECK-NEXT:  21     11    0.50                        rclw	$7, %di
+# CHECK-NEXT:  19     10    0.50                        rcrw	$7, %di
+# CHECK-NEXT:  2      5     1.00           *            rclw	$7, (%rax)
+# CHECK-NEXT:  2      5     1.00           *            rcrw	$7, (%rax)
+# CHECK-NEXT:  22     10    0.50                        rclw	%cl, %di
+# CHECK-NEXT:  20     9     0.50                        rcrw	%cl, %di
+# CHECK-NEXT:  2      5     1.00           *            rclw	%cl, (%rax)
+# CHECK-NEXT:  2      5     1.00           *            rcrw	%cl, (%rax)
+# CHECK-NEXT:  1      1     0.50                        rcll	%edi
+# CHECK-NEXT:  1      1     0.50                        rcrl	%edi
+# CHECK-NEXT:  2      5     1.00           *            rcll	(%rax)
+# CHECK-NEXT:  2      5     1.00           *            rcrl	(%rax)
+# CHECK-NEXT:  16     8     0.50                        rcll	$7, %edi
+# CHECK-NEXT:  15     7     0.50                        rcrl	$7, %edi
+# CHECK-NEXT:  2      5     1.00           *            rcll	$7, (%rax)
+# CHECK-NEXT:  2      5     1.00           *            rcrl	$7, (%rax)
+# CHECK-NEXT:  17     7     0.50                        rcll	%cl, %edi
+# CHECK-NEXT:  16     7     0.50                        rcrl	%cl, %edi
+# CHECK-NEXT:  2      5     1.00           *            rcll	%cl, (%rax)
+# CHECK-NEXT:  2      5     1.00           *            rcrl	%cl, (%rax)
+# CHECK-NEXT:  1      1     0.50                        rclq	%rdi
+# CHECK-NEXT:  1      1     0.50                        rcrq	%rdi
+# CHECK-NEXT:  2      5     1.00           *            rclq	(%rax)
+# CHECK-NEXT:  2      5     1.00           *            rcrq	(%rax)
+# CHECK-NEXT:  16     8     0.50                        rclq	$7, %rdi
+# CHECK-NEXT:  15     7     0.50                        rcrq	$7, %rdi
+# CHECK-NEXT:  2      5     1.00           *            rclq	$7, (%rax)
+# CHECK-NEXT:  2      5     1.00           *            rcrq	$7, (%rax)
+# CHECK-NEXT:  17     7     0.50                        rclq	%cl, %rdi
+# CHECK-NEXT:  16     7     0.50                        rcrq	%cl, %rdi
+# CHECK-NEXT:  2      5     1.00           *            rclq	%cl, (%rax)
+# CHECK-NEXT:  2      5     1.00           *            rcrq	%cl, (%rax)
+# CHECK-NEXT:  1      1     0.50                        rolb	%dil
+# CHECK-NEXT:  1      1     0.50                        rorb	%dil
+# CHECK-NEXT:  2      5     1.00    *      *            rolb	(%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            rorb	(%rax)
+# CHECK-NEXT:  1      1     0.50                        rolb	$7, %dil
+# CHECK-NEXT:  1      1     0.50                        rorb	$7, %dil
+# CHECK-NEXT:  2      5     1.00    *      *            rolb	$7, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            rorb	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        rolb	%cl, %dil
+# CHECK-NEXT:  1      1     0.50                        rorb	%cl, %dil
+# CHECK-NEXT:  2      5     1.00    *      *            rolb	%cl, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            rorb	%cl, (%rax)
+# CHECK-NEXT:  1      1     0.50                        rolw	%di
+# CHECK-NEXT:  1      1     0.50                        rorw	%di
+# CHECK-NEXT:  2      5     1.00    *      *            rolw	(%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            rorw	(%rax)
+# CHECK-NEXT:  1      1     0.50                        rolw	$7, %di
+# CHECK-NEXT:  1      1     0.50                        rorw	$7, %di
+# CHECK-NEXT:  2      5     1.00    *      *            rolw	$7, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            rorw	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        rolw	%cl, %di
+# CHECK-NEXT:  1      1     0.50                        rorw	%cl, %di
+# CHECK-NEXT:  2      5     1.00    *      *            rolw	%cl, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            rorw	%cl, (%rax)
+# CHECK-NEXT:  1      1     0.50                        roll	%edi
+# CHECK-NEXT:  1      1     0.50                        rorl	%edi
+# CHECK-NEXT:  2      5     1.00    *      *            roll	(%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            rorl	(%rax)
+# CHECK-NEXT:  1      1     0.50                        roll	$7, %edi
+# CHECK-NEXT:  1      1     0.50                        rorl	$7, %edi
+# CHECK-NEXT:  2      5     1.00    *      *            roll	$7, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            rorl	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        roll	%cl, %edi
+# CHECK-NEXT:  1      1     0.50                        rorl	%cl, %edi
+# CHECK-NEXT:  2      5     1.00    *      *            roll	%cl, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            rorl	%cl, (%rax)
+# CHECK-NEXT:  1      1     0.50                        rolq	%rdi
+# CHECK-NEXT:  1      1     0.50                        rorq	%rdi
+# CHECK-NEXT:  2      5     1.00    *      *            rolq	(%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            rorq	(%rax)
+# CHECK-NEXT:  1      1     0.50                        rolq	$7, %rdi
+# CHECK-NEXT:  1      1     0.50                        rorq	$7, %rdi
+# CHECK-NEXT:  2      5     1.00    *      *            rolq	$7, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            rorq	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        rolq	%cl, %rdi
+# CHECK-NEXT:  1      1     0.50                        rorq	%cl, %rdi
+# CHECK-NEXT:  2      5     1.00    *      *            rolq	%cl, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            rorq	%cl, (%rax)
+# CHECK-NEXT:  2      2     0.50                        sahf
 # CHECK-NEXT:  1      1     0.50                        sarb	%dil
 # CHECK-NEXT:  1      1     0.50                        shlb	%dil
 # CHECK-NEXT:  1      1     0.50                        shrb	%dil
-# CHECK-NEXT:  4      7     1.00    *      *            sarb	(%rax)
-# CHECK-NEXT:  4      7     1.00    *      *            shlb	(%rax)
-# CHECK-NEXT:  4      7     1.00    *      *            shrb	(%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            sarb	(%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shlb	(%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shrb	(%rax)
 # CHECK-NEXT:  1      1     0.50                        sarb	$7, %dil
 # CHECK-NEXT:  1      1     0.50                        shlb	$7, %dil
 # CHECK-NEXT:  1      1     0.50                        shrb	$7, %dil
-# CHECK-NEXT:  4      7     1.00    *      *            sarb	$7, (%rax)
-# CHECK-NEXT:  4      7     1.00    *      *            shlb	$7, (%rax)
-# CHECK-NEXT:  4      7     1.00    *      *            shrb	$7, (%rax)
-# CHECK-NEXT:  3      3     1.50                        sarb	%cl, %dil
-# CHECK-NEXT:  3      3     1.50                        shlb	%cl, %dil
-# CHECK-NEXT:  3      3     1.50                        shrb	%cl, %dil
-# CHECK-NEXT:  6      9     1.50    *      *            sarb	%cl, (%rax)
-# CHECK-NEXT:  6      9     1.50    *      *            shlb	%cl, (%rax)
-# CHECK-NEXT:  6      9     1.50    *      *            shrb	%cl, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            sarb	$7, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shlb	$7, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shrb	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        sarb	%cl, %dil
+# CHECK-NEXT:  1      1     0.50                        shlb	%cl, %dil
+# CHECK-NEXT:  1      1     0.50                        shrb	%cl, %dil
+# CHECK-NEXT:  2      5     1.00    *      *            sarb	%cl, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shlb	%cl, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shrb	%cl, (%rax)
 # CHECK-NEXT:  1      1     0.50                        sarw	%di
 # CHECK-NEXT:  1      1     0.50                        shlw	%di
 # CHECK-NEXT:  1      1     0.50                        shrw	%di
-# CHECK-NEXT:  4      7     1.00    *      *            sarw	(%rax)
-# CHECK-NEXT:  4      7     1.00    *      *            shlw	(%rax)
-# CHECK-NEXT:  4      7     1.00    *      *            shrw	(%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            sarw	(%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shlw	(%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shrw	(%rax)
 # CHECK-NEXT:  1      1     0.50                        sarw	$7, %di
 # CHECK-NEXT:  1      1     0.50                        shlw	$7, %di
 # CHECK-NEXT:  1      1     0.50                        shrw	$7, %di
-# CHECK-NEXT:  4      7     1.00    *      *            sarw	$7, (%rax)
-# CHECK-NEXT:  4      7     1.00    *      *            shlw	$7, (%rax)
-# CHECK-NEXT:  4      7     1.00    *      *            shrw	$7, (%rax)
-# CHECK-NEXT:  3      3     1.50                        sarw	%cl, %di
-# CHECK-NEXT:  3      3     1.50                        shlw	%cl, %di
-# CHECK-NEXT:  3      3     1.50                        shrw	%cl, %di
-# CHECK-NEXT:  6      9     1.50    *      *            sarw	%cl, (%rax)
-# CHECK-NEXT:  6      9     1.50    *      *            shlw	%cl, (%rax)
-# CHECK-NEXT:  6      9     1.50    *      *            shrw	%cl, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            sarw	$7, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shlw	$7, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shrw	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        sarw	%cl, %di
+# CHECK-NEXT:  1      1     0.50                        shlw	%cl, %di
+# CHECK-NEXT:  1      1     0.50                        shrw	%cl, %di
+# CHECK-NEXT:  2      5     1.00    *      *            sarw	%cl, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shlw	%cl, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shrw	%cl, (%rax)
 # CHECK-NEXT:  1      1     0.50                        sarl	%edi
 # CHECK-NEXT:  1      1     0.50                        shll	%edi
 # CHECK-NEXT:  1      1     0.50                        shrl	%edi
-# CHECK-NEXT:  4      7     1.00    *      *            sarl	(%rax)
-# CHECK-NEXT:  4      7     1.00    *      *            shll	(%rax)
-# CHECK-NEXT:  4      7     1.00    *      *            shrl	(%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            sarl	(%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shll	(%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shrl	(%rax)
 # CHECK-NEXT:  1      1     0.50                        sarl	$7, %edi
 # CHECK-NEXT:  1      1     0.50                        shll	$7, %edi
 # CHECK-NEXT:  1      1     0.50                        shrl	$7, %edi
-# CHECK-NEXT:  4      7     1.00    *      *            sarl	$7, (%rax)
-# CHECK-NEXT:  4      7     1.00    *      *            shll	$7, (%rax)
-# CHECK-NEXT:  4      7     1.00    *      *            shrl	$7, (%rax)
-# CHECK-NEXT:  3      3     1.50                        sarl	%cl, %edi
-# CHECK-NEXT:  3      3     1.50                        shll	%cl, %edi
-# CHECK-NEXT:  3      3     1.50                        shrl	%cl, %edi
-# CHECK-NEXT:  6      9     1.50    *      *            sarl	%cl, (%rax)
-# CHECK-NEXT:  6      9     1.50    *      *            shll	%cl, (%rax)
-# CHECK-NEXT:  6      9     1.50    *      *            shrl	%cl, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            sarl	$7, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shll	$7, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shrl	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        sarl	%cl, %edi
+# CHECK-NEXT:  1      1     0.50                        shll	%cl, %edi
+# CHECK-NEXT:  1      1     0.50                        shrl	%cl, %edi
+# CHECK-NEXT:  2      5     1.00    *      *            sarl	%cl, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shll	%cl, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shrl	%cl, (%rax)
 # CHECK-NEXT:  1      1     0.50                        sarq	%rdi
 # CHECK-NEXT:  1      1     0.50                        shlq	%rdi
 # CHECK-NEXT:  1      1     0.50                        shrq	%rdi
-# CHECK-NEXT:  4      7     1.00    *      *            sarq	(%rax)
-# CHECK-NEXT:  4      7     1.00    *      *            shlq	(%rax)
-# CHECK-NEXT:  4      7     1.00    *      *            shrq	(%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            sarq	(%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shlq	(%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shrq	(%rax)
 # CHECK-NEXT:  1      1     0.50                        sarq	$7, %rdi
 # CHECK-NEXT:  1      1     0.50                        shlq	$7, %rdi
 # CHECK-NEXT:  1      1     0.50                        shrq	$7, %rdi
-# CHECK-NEXT:  4      7     1.00    *      *            sarq	$7, (%rax)
-# CHECK-NEXT:  4      7     1.00    *      *            shlq	$7, (%rax)
-# CHECK-NEXT:  4      7     1.00    *      *            shrq	$7, (%rax)
-# CHECK-NEXT:  3      3     1.50                        sarq	%cl, %rdi
-# CHECK-NEXT:  3      3     1.50                        shlq	%cl, %rdi
-# CHECK-NEXT:  3      3     1.50                        shrq	%cl, %rdi
-# CHECK-NEXT:  6      9     1.50    *      *            sarq	%cl, (%rax)
-# CHECK-NEXT:  6      9     1.50    *      *            shlq	%cl, (%rax)
-# CHECK-NEXT:  6      9     1.50    *      *            shrq	%cl, (%rax)
-# CHECK-NEXT:  2      2     0.67                        sbbb	$7, %al
-# CHECK-NEXT:  2      2     0.67                        sbbb	$7, %dil
-# CHECK-NEXT:  6      9     1.00    *      *            sbbb	$7, (%rax)
-# CHECK-NEXT:  2      2     0.67                        sbbb	%sil, %dil
-# CHECK-NEXT:  6      9     1.00    *      *            sbbb	%sil, (%rax)
-# CHECK-NEXT:  3      7     0.67    *                   sbbb	(%rax), %dil
-# CHECK-NEXT:  2      2     0.67                        sbbw	$511, %ax
-# CHECK-NEXT:  2      2     0.67                        sbbw	$511, %di
-# CHECK-NEXT:  6      9     1.00    *      *            sbbw	$511, (%rax)
-# CHECK-NEXT:  2      2     0.67                        sbbw	$7, %di
-# CHECK-NEXT:  6      9     1.00    *      *            sbbw	$7, (%rax)
-# CHECK-NEXT:  2      2     0.67                        sbbw	%si, %di
-# CHECK-NEXT:  6      9     1.00    *      *            sbbw	%si, (%rax)
-# CHECK-NEXT:  3      7     0.67    *                   sbbw	(%rax), %di
-# CHECK-NEXT:  2      2     0.67                        sbbl	$665536, %eax
-# CHECK-NEXT:  2      2     0.67                        sbbl	$665536, %edi
-# CHECK-NEXT:  6      9     1.00    *      *            sbbl	$665536, (%rax)
-# CHECK-NEXT:  2      2     0.67                        sbbl	$7, %edi
-# CHECK-NEXT:  6      9     1.00    *      *            sbbl	$7, (%rax)
-# CHECK-NEXT:  2      2     0.67                        sbbl	%esi, %edi
-# CHECK-NEXT:  6      9     1.00    *      *            sbbl	%esi, (%rax)
-# CHECK-NEXT:  3      7     0.67    *                   sbbl	(%rax), %edi
-# CHECK-NEXT:  2      2     0.67                        sbbq	$665536, %rax
-# CHECK-NEXT:  2      2     0.67                        sbbq	$665536, %rdi
-# CHECK-NEXT:  6      9     1.00    *      *            sbbq	$665536, (%rax)
-# CHECK-NEXT:  2      2     0.67                        sbbq	$7, %rdi
-# CHECK-NEXT:  6      9     1.00    *      *            sbbq	$7, (%rax)
-# CHECK-NEXT:  2      2     0.67                        sbbq	%rsi, %rdi
-# CHECK-NEXT:  6      9     1.00    *      *            sbbq	%rsi, (%rax)
-# CHECK-NEXT:  3      7     0.67    *                   sbbq	(%rax), %rdi
-# CHECK-NEXT:  2      2     0.67                  U     scasb	%es:(%rdi), %al
-# CHECK-NEXT:  2      2     0.67                  U     scasw	%es:(%rdi), %ax
-# CHECK-NEXT:  2      2     0.67                  U     scasl	%es:(%rdi), %eax
-# CHECK-NEXT:  2      2     0.67                  U     scasq	%es:(%rdi), %rax
+# CHECK-NEXT:  2      5     1.00    *      *            sarq	$7, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shlq	$7, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shrq	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        sarq	%cl, %rdi
+# CHECK-NEXT:  1      1     0.50                        shlq	%cl, %rdi
+# CHECK-NEXT:  1      1     0.50                        shrq	%cl, %rdi
+# CHECK-NEXT:  2      5     1.00    *      *            sarq	%cl, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shlq	%cl, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shrq	%cl, (%rax)
+# CHECK-NEXT:  1      1     1.00                        sbbb	$7, %al
+# CHECK-NEXT:  1      1     1.00                        sbbb	$7, %dil
+# CHECK-NEXT:  2      6     1.00    *      *            sbbb	$7, (%rax)
+# CHECK-NEXT:  1      1     1.00                        sbbb	%sil, %dil
+# CHECK-NEXT:  2      6     1.00    *      *            sbbb	%sil, (%rax)
+# CHECK-NEXT:  1      5     1.00    *                   sbbb	(%rax), %dil
+# CHECK-NEXT:  1      1     1.00                        sbbw	$511, %ax
+# CHECK-NEXT:  1      1     1.00                        sbbw	$511, %di
+# CHECK-NEXT:  2      6     1.00    *      *            sbbw	$511, (%rax)
+# CHECK-NEXT:  1      1     1.00                        sbbw	$7, %di
+# CHECK-NEXT:  2      6     1.00    *      *            sbbw	$7, (%rax)
+# CHECK-NEXT:  1      1     1.00                        sbbw	%si, %di
+# CHECK-NEXT:  2      6     1.00    *      *            sbbw	%si, (%rax)
+# CHECK-NEXT:  1      5     1.00    *                   sbbw	(%rax), %di
+# CHECK-NEXT:  1      1     1.00                        sbbl	$665536, %eax
+# CHECK-NEXT:  1      1     1.00                        sbbl	$665536, %edi
+# CHECK-NEXT:  2      6     1.00    *      *            sbbl	$665536, (%rax)
+# CHECK-NEXT:  1      1     1.00                        sbbl	$7, %edi
+# CHECK-NEXT:  2      6     1.00    *      *            sbbl	$7, (%rax)
+# CHECK-NEXT:  1      1     1.00                        sbbl	%esi, %edi
+# CHECK-NEXT:  2      6     1.00    *      *            sbbl	%esi, (%rax)
+# CHECK-NEXT:  1      5     1.00    *                   sbbl	(%rax), %edi
+# CHECK-NEXT:  1      1     1.00                        sbbq	$665536, %rax
+# CHECK-NEXT:  1      1     1.00                        sbbq	$665536, %rdi
+# CHECK-NEXT:  2      6     1.00    *      *            sbbq	$665536, (%rax)
+# CHECK-NEXT:  1      1     1.00                        sbbq	$7, %rdi
+# CHECK-NEXT:  2      6     1.00    *      *            sbbq	$7, (%rax)
+# CHECK-NEXT:  1      1     1.00                        sbbq	%rsi, %rdi
+# CHECK-NEXT:  2      6     1.00    *      *            sbbq	%rsi, (%rax)
+# CHECK-NEXT:  1      5     1.00    *                   sbbq	(%rax), %rdi
+# CHECK-NEXT:  1      100   0.50                  U     scasb	%es:(%rdi), %al
+# CHECK-NEXT:  1      100   0.50                  U     scasw	%es:(%rdi), %ax
+# CHECK-NEXT:  1      100   0.50                  U     scasl	%es:(%rdi), %eax
+# CHECK-NEXT:  1      100   0.50                  U     scasq	%es:(%rdi), %rax
 # CHECK-NEXT:  1      1     0.50                        seto	%al
-# CHECK-NEXT:  3      2     1.00           *            seto	(%rax)
+# CHECK-NEXT:  1      1     0.50           *            seto	(%rax)
 # CHECK-NEXT:  1      1     0.50                        setno	%al
-# CHECK-NEXT:  3      2     1.00           *            setno	(%rax)
+# CHECK-NEXT:  1      1     0.50           *            setno	(%rax)
 # CHECK-NEXT:  1      1     0.50                        setb	%al
-# CHECK-NEXT:  3      2     1.00           *            setb	(%rax)
+# CHECK-NEXT:  1      1     0.50           *            setb	(%rax)
 # CHECK-NEXT:  1      1     0.50                        setae	%al
-# CHECK-NEXT:  3      2     1.00           *            setae	(%rax)
+# CHECK-NEXT:  1      1     0.50           *            setae	(%rax)
 # CHECK-NEXT:  1      1     0.50                        sete	%al
-# CHECK-NEXT:  3      2     1.00           *            sete	(%rax)
+# CHECK-NEXT:  1      1     0.50           *            sete	(%rax)
 # CHECK-NEXT:  1      1     0.50                        setne	%al
-# CHECK-NEXT:  3      2     1.00           *            setne	(%rax)
-# CHECK-NEXT:  2      2     1.00                        seta	%al
-# CHECK-NEXT:  4      3     1.00           *            seta	(%rax)
-# CHECK-NEXT:  2      2     1.00                        setbe	%al
-# CHECK-NEXT:  4      3     1.00           *            setbe	(%rax)
+# CHECK-NEXT:  1      1     0.50           *            setne	(%rax)
+# CHECK-NEXT:  1      1     0.50                        seta	%al
+# CHECK-NEXT:  1      1     0.50           *            seta	(%rax)
+# CHECK-NEXT:  1      1     0.50                        setbe	%al
+# CHECK-NEXT:  1      1     0.50           *            setbe	(%rax)
 # CHECK-NEXT:  1      1     0.50                        sets	%al
-# CHECK-NEXT:  3      2     1.00           *            sets	(%rax)
+# CHECK-NEXT:  1      1     0.50           *            sets	(%rax)
 # CHECK-NEXT:  1      1     0.50                        setns	%al
-# CHECK-NEXT:  3      2     1.00           *            setns	(%rax)
+# CHECK-NEXT:  1      1     0.50           *            setns	(%rax)
 # CHECK-NEXT:  1      1     0.50                        setp	%al
-# CHECK-NEXT:  3      2     1.00           *            setp	(%rax)
+# CHECK-NEXT:  1      1     0.50           *            setp	(%rax)
 # CHECK-NEXT:  1      1     0.50                        setnp	%al
-# CHECK-NEXT:  3      2     1.00           *            setnp	(%rax)
+# CHECK-NEXT:  1      1     0.50           *            setnp	(%rax)
 # CHECK-NEXT:  1      1     0.50                        setl	%al
-# CHECK-NEXT:  3      2     1.00           *            setl	(%rax)
+# CHECK-NEXT:  2      1     1.00           *            setl	(%rax)
 # CHECK-NEXT:  1      1     0.50                        setge	%al
-# CHECK-NEXT:  3      2     1.00           *            setge	(%rax)
+# CHECK-NEXT:  2      1     1.00           *            setge	(%rax)
 # CHECK-NEXT:  1      1     0.50                        setg	%al
-# CHECK-NEXT:  3      2     1.00           *            setg	(%rax)
+# CHECK-NEXT:  2      1     1.00           *            setg	(%rax)
 # CHECK-NEXT:  1      1     0.50                        setle	%al
-# CHECK-NEXT:  3      2     1.00           *            setle	(%rax)
-# CHECK-NEXT:  4      4     1.50                        shldw	%cl, %si, %di
-# CHECK-NEXT:  4      4     1.50                        shrdw	%cl, %si, %di
-# CHECK-NEXT:  7      10    1.50    *      *            shldw	%cl, %si, (%rax)
-# CHECK-NEXT:  7      10    1.50    *      *            shrdw	%cl, %si, (%rax)
-# CHECK-NEXT:  2      2     0.67                        shldw	$7, %si, %di
-# CHECK-NEXT:  2      2     0.67                        shrdw	$7, %si, %di
-# CHECK-NEXT:  5      8     1.00    *      *            shldw	$7, %si, (%rax)
-# CHECK-NEXT:  5      8     1.00    *      *            shrdw	$7, %si, (%rax)
-# CHECK-NEXT:  4      4     1.50                        shldl	%cl, %esi, %edi
-# CHECK-NEXT:  4      4     1.50                        shrdl	%cl, %esi, %edi
-# CHECK-NEXT:  7      10    1.50    *      *            shldl	%cl, %esi, (%rax)
-# CHECK-NEXT:  7      10    1.50    *      *            shrdl	%cl, %esi, (%rax)
-# CHECK-NEXT:  2      2     0.67                        shldl	$7, %esi, %edi
-# CHECK-NEXT:  2      2     0.67                        shrdl	$7, %esi, %edi
-# CHECK-NEXT:  5      8     1.00    *      *            shldl	$7, %esi, (%rax)
-# CHECK-NEXT:  5      8     1.00    *      *            shrdl	$7, %esi, (%rax)
-# CHECK-NEXT:  4      4     1.50                        shldq	%cl, %rsi, %rdi
-# CHECK-NEXT:  4      4     1.50                        shrdq	%cl, %rsi, %rdi
-# CHECK-NEXT:  7      10    1.50    *      *            shldq	%cl, %rsi, (%rax)
-# CHECK-NEXT:  7      10    1.50    *      *            shrdq	%cl, %rsi, (%rax)
-# CHECK-NEXT:  2      2     0.67                        shldq	$7, %rsi, %rdi
-# CHECK-NEXT:  2      2     0.67                        shrdq	$7, %rsi, %rdi
-# CHECK-NEXT:  5      8     1.00    *      *            shldq	$7, %rsi, (%rax)
-# CHECK-NEXT:  5      8     1.00    *      *            shrdq	$7, %rsi, (%rax)
-# CHECK-NEXT:  1      1     0.33                  U     stc
-# CHECK-NEXT:  1      1     0.33                  U     std
-# CHECK-NEXT:  3      5     1.00                  U     stosb	%al, %es:(%rdi)
-# CHECK-NEXT:  3      5     1.00                  U     stosw	%ax, %es:(%rdi)
-# CHECK-NEXT:  3      5     1.00                  U     stosl	%eax, %es:(%rdi)
-# CHECK-NEXT:  3      5     1.00                  U     stosq	%rax, %es:(%rdi)
-# CHECK-NEXT:  1      1     0.33                        subb	$7, %al
-# CHECK-NEXT:  1      1     0.33                        subb	$7, %dil
-# CHECK-NEXT:  3      7     1.00    *      *            subb	$7, (%rax)
-# CHECK-NEXT:  1      1     0.33                        subb	%sil, %dil
-# CHECK-NEXT:  3      7     1.00    *      *            subb	%sil, (%rax)
-# CHECK-NEXT:  2      6     0.50    *                   subb	(%rax), %dil
-# CHECK-NEXT:  1      1     0.33                        subw	$511, %ax
-# CHECK-NEXT:  1      1     0.33                        subw	$511, %di
-# CHECK-NEXT:  3      7     1.00    *      *            subw	$511, (%rax)
-# CHECK-NEXT:  1      1     0.33                        subw	$7, %di
-# CHECK-NEXT:  3      7     1.00    *      *            subw	$7, (%rax)
-# CHECK-NEXT:  1      1     0.33                        subw	%si, %di
-# CHECK-NEXT:  3      7     1.00    *      *            subw	%si, (%rax)
-# CHECK-NEXT:  2      6     0.50    *                   subw	(%rax), %di
-# CHECK-NEXT:  1      1     0.33                        subl	$665536, %eax
-# CHECK-NEXT:  1      1     0.33                        subl	$665536, %edi
-# CHECK-NEXT:  3      7     1.00    *      *            subl	$665536, (%rax)
-# CHECK-NEXT:  1      1     0.33                        subl	$7, %edi
-# CHECK-NEXT:  3      7     1.00    *      *            subl	$7, (%rax)
-# CHECK-NEXT:  1      1     0.33                        subl	%esi, %edi
-# CHECK-NEXT:  3      7     1.00    *      *            subl	%esi, (%rax)
-# CHECK-NEXT:  2      6     0.50    *                   subl	(%rax), %edi
-# CHECK-NEXT:  1      1     0.33                        subq	$665536, %rax
-# CHECK-NEXT:  1      1     0.33                        subq	$665536, %rdi
-# CHECK-NEXT:  3      7     1.00    *      *            subq	$665536, (%rax)
-# CHECK-NEXT:  1      1     0.33                        subq	$7, %rdi
-# CHECK-NEXT:  3      7     1.00    *      *            subq	$7, (%rax)
-# CHECK-NEXT:  1      1     0.33                        subq	%rsi, %rdi
-# CHECK-NEXT:  3      7     1.00    *      *            subq	%rsi, (%rax)
-# CHECK-NEXT:  2      6     0.50    *                   subq	(%rax), %rdi
-# CHECK-NEXT:  1      1     0.33                        testb	$7, %al
-# CHECK-NEXT:  1      1     0.33                        testb	$7, %dil
-# CHECK-NEXT:  2      6     0.50    *                   testb	$7, (%rax)
-# CHECK-NEXT:  1      1     0.33                        testb	%sil, %dil
-# CHECK-NEXT:  2      6     0.50    *                   testb	%sil, (%rax)
-# CHECK-NEXT:  1      1     0.33                        testw	$511, %ax
-# CHECK-NEXT:  1      1     0.33                        testw	$511, %di
-# CHECK-NEXT:  2      6     0.50    *                   testw	$511, (%rax)
-# CHECK-NEXT:  1      1     0.33                        testw	$7, %di
-# CHECK-NEXT:  2      6     0.50    *                   testw	$7, (%rax)
-# CHECK-NEXT:  1      1     0.33                        testw	%si, %di
-# CHECK-NEXT:  2      6     0.50    *                   testw	%si, (%rax)
-# CHECK-NEXT:  1      1     0.33                        testl	$665536, %eax
-# CHECK-NEXT:  1      1     0.33                        testl	$665536, %edi
-# CHECK-NEXT:  2      6     0.50    *                   testl	$665536, (%rax)
-# CHECK-NEXT:  1      1     0.33                        testl	$7, %edi
-# CHECK-NEXT:  2      6     0.50    *                   testl	$7, (%rax)
-# CHECK-NEXT:  1      1     0.33                        testl	%esi, %edi
-# CHECK-NEXT:  2      6     0.50    *                   testl	%esi, (%rax)
-# CHECK-NEXT:  1      1     0.33                        testq	$665536, %rax
-# CHECK-NEXT:  1      1     0.33                        testq	$665536, %rdi
-# CHECK-NEXT:  2      6     0.50    *                   testq	$665536, (%rax)
-# CHECK-NEXT:  1      1     0.33                        testq	$7, %rdi
-# CHECK-NEXT:  2      6     0.50    *                   testq	$7, (%rax)
-# CHECK-NEXT:  1      1     0.33                        testq	%rsi, %rdi
-# CHECK-NEXT:  2      6     0.50    *                   testq	%rsi, (%rax)
-# CHECK-NEXT:  1      100   0.33    *             U     ud2
-# CHECK-NEXT:  3      2     1.00                        xaddb	%bl, %cl
-# CHECK-NEXT:  5      8     1.00    *      *            xaddb	%bl, (%rcx)
-# CHECK-NEXT:  3      2     1.00                        xaddw	%bx, %cx
-# CHECK-NEXT:  5      8     1.00    *      *            xaddw	%ax, (%rbx)
-# CHECK-NEXT:  3      2     1.00                        xaddl	%ebx, %ecx
-# CHECK-NEXT:  5      8     1.00    *      *            xaddl	%eax, (%rbx)
-# CHECK-NEXT:  3      2     1.00                        xaddq	%rbx, %rcx
-# CHECK-NEXT:  5      8     1.00    *      *            xaddq	%rax, (%rbx)
-# CHECK-NEXT:  3      2     1.00                        xchgb	%bl, %cl
-# CHECK-NEXT:  3      6     1.00    *      *            xchgb	%bl, (%rbx)
-# CHECK-NEXT:  3      2     1.00                        xchgw	%bx, %ax
-# CHECK-NEXT:  3      2     1.00                        xchgw	%bx, %cx
-# CHECK-NEXT:  3      6     1.00    *      *            xchgw	%ax, (%rbx)
-# CHECK-NEXT:  3      2     1.00                        xchgl	%ebx, %eax
-# CHECK-NEXT:  3      2     1.00                        xchgl	%ebx, %ecx
-# CHECK-NEXT:  3      6     1.00    *      *            xchgl	%eax, (%rbx)
-# CHECK-NEXT:  3      2     1.00                        xchgq	%rbx, %rax
-# CHECK-NEXT:  3      2     1.00                        xchgq	%rbx, %rcx
-# CHECK-NEXT:  3      6     1.00    *      *            xchgq	%rax, (%rbx)
-# CHECK-NEXT:  1      5     0.50    *                   xlatb
-# CHECK-NEXT:  1      1     0.33                        xorb	$7, %al
-# CHECK-NEXT:  1      1     0.33                        xorb	$7, %dil
-# CHECK-NEXT:  3      7     1.00    *      *            xorb	$7, (%rax)
-# CHECK-NEXT:  1      1     0.33                        xorb	%sil, %dil
-# CHECK-NEXT:  3      7     1.00    *      *            xorb	%sil, (%rax)
-# CHECK-NEXT:  2      6     0.50    *                   xorb	(%rax), %dil
-# CHECK-NEXT:  1      1     0.33                        xorw	$511, %ax
-# CHECK-NEXT:  1      1     0.33                        xorw	$511, %di
-# CHECK-NEXT:  3      7     1.00    *      *            xorw	$511, (%rax)
-# CHECK-NEXT:  1      1     0.33                        xorw	$7, %di
-# CHECK-NEXT:  3      7     1.00    *      *            xorw	$7, (%rax)
-# CHECK-NEXT:  1      1     0.33                        xorw	%si, %di
-# CHECK-NEXT:  3      7     1.00    *      *            xorw	%si, (%rax)
-# CHECK-NEXT:  2      6     0.50    *                   xorw	(%rax), %di
-# CHECK-NEXT:  1      1     0.33                        xorl	$665536, %eax
-# CHECK-NEXT:  1      1     0.33                        xorl	$665536, %edi
-# CHECK-NEXT:  3      7     1.00    *      *            xorl	$665536, (%rax)
-# CHECK-NEXT:  1      1     0.33                        xorl	$7, %edi
-# CHECK-NEXT:  3      7     1.00    *      *            xorl	$7, (%rax)
-# CHECK-NEXT:  1      1     0.33                        xorl	%esi, %edi
-# CHECK-NEXT:  3      7     1.00    *      *            xorl	%esi, (%rax)
-# CHECK-NEXT:  2      6     0.50    *                   xorl	(%rax), %edi
-# CHECK-NEXT:  1      1     0.33                        xorq	$665536, %rax
-# CHECK-NEXT:  1      1     0.33                        xorq	$665536, %rdi
-# CHECK-NEXT:  3      7     1.00    *      *            xorq	$665536, (%rax)
-# CHECK-NEXT:  1      1     0.33                        xorq	$7, %rdi
-# CHECK-NEXT:  3      7     1.00    *      *            xorq	$7, (%rax)
-# CHECK-NEXT:  1      1     0.33                        xorq	%rsi, %rdi
-# CHECK-NEXT:  3      7     1.00    *      *            xorq	%rsi, (%rax)
-# CHECK-NEXT:  2      6     0.50    *                   xorq	(%rax), %rdi
+# CHECK-NEXT:  2      1     1.00           *            setle	(%rax)
+# CHECK-NEXT:  7      4     4.00                        shldw	%cl, %si, %di
+# CHECK-NEXT:  7      4     4.00                        shrdw	%cl, %si, %di
+# CHECK-NEXT:  8      4     11.00   *      *            shldw	%cl, %si, (%rax)
+# CHECK-NEXT:  8      4     11.00   *      *            shrdw	%cl, %si, (%rax)
+# CHECK-NEXT:  6      4     3.00                        shldw	$7, %si, %di
+# CHECK-NEXT:  6      3     3.00                        shrdw	$7, %si, %di
+# CHECK-NEXT:  8      4     11.00   *      *            shldw	$7, %si, (%rax)
+# CHECK-NEXT:  8      4     11.00   *      *            shrdw	$7, %si, (%rax)
+# CHECK-NEXT:  7      4     4.00                        shldl	%cl, %esi, %edi
+# CHECK-NEXT:  7      4     4.00                        shrdl	%cl, %esi, %edi
+# CHECK-NEXT:  8      4     11.00   *      *            shldl	%cl, %esi, (%rax)
+# CHECK-NEXT:  8      4     11.00   *      *            shrdl	%cl, %esi, (%rax)
+# CHECK-NEXT:  6      3     3.00                        shldl	$7, %esi, %edi
+# CHECK-NEXT:  6      4     3.00                        shrdl	$7, %esi, %edi
+# CHECK-NEXT:  8      4     11.00   *      *            shldl	$7, %esi, (%rax)
+# CHECK-NEXT:  8      4     11.00   *      *            shrdl	$7, %esi, (%rax)
+# CHECK-NEXT:  7      4     4.00                        shldq	%cl, %rsi, %rdi
+# CHECK-NEXT:  7      4     4.00                        shrdq	%cl, %rsi, %rdi
+# CHECK-NEXT:  8      4     11.00   *      *            shldq	%cl, %rsi, (%rax)
+# CHECK-NEXT:  8      4     11.00   *      *            shrdq	%cl, %rsi, (%rax)
+# CHECK-NEXT:  6      4     3.00                        shldq	$7, %rsi, %rdi
+# CHECK-NEXT:  6      4     3.00                        shrdq	$7, %rsi, %rdi
+# CHECK-NEXT:  8      4     11.00   *      *            shldq	$7, %rsi, (%rax)
+# CHECK-NEXT:  8      4     11.00   *      *            shrdq	$7, %rsi, (%rax)
+# CHECK-NEXT:  1      1     0.50                  U     stc
+# CHECK-NEXT:  1      1     0.50                  U     std
+# CHECK-NEXT:  1      100   0.50                  U     stosb	%al, %es:(%rdi)
+# CHECK-NEXT:  1      100   0.50                  U     stosw	%ax, %es:(%rdi)
+# CHECK-NEXT:  1      100   0.50                  U     stosl	%eax, %es:(%rdi)
+# CHECK-NEXT:  1      100   0.50                  U     stosq	%rax, %es:(%rdi)
+# CHECK-NEXT:  1      1     0.50                        subb	$7, %al
+# CHECK-NEXT:  1      1     0.50                        subb	$7, %dil
+# CHECK-NEXT:  2      6     1.00    *      *            subb	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        subb	%sil, %dil
+# CHECK-NEXT:  2      6     1.00    *      *            subb	%sil, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   subb	(%rax), %dil
+# CHECK-NEXT:  1      1     0.50                        subw	$511, %ax
+# CHECK-NEXT:  1      1     0.50                        subw	$511, %di
+# CHECK-NEXT:  2      6     1.00    *      *            subw	$511, (%rax)
+# CHECK-NEXT:  1      1     0.50                        subw	$7, %di
+# CHECK-NEXT:  2      6     1.00    *      *            subw	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        subw	%si, %di
+# CHECK-NEXT:  2      6     1.00    *      *            subw	%si, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   subw	(%rax), %di
+# CHECK-NEXT:  1      1     0.50                        subl	$665536, %eax
+# CHECK-NEXT:  1      1     0.50                        subl	$665536, %edi
+# CHECK-NEXT:  2      6     1.00    *      *            subl	$665536, (%rax)
+# CHECK-NEXT:  1      1     0.50                        subl	$7, %edi
+# CHECK-NEXT:  2      6     1.00    *      *            subl	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        subl	%esi, %edi
+# CHECK-NEXT:  2      6     1.00    *      *            subl	%esi, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   subl	(%rax), %edi
+# CHECK-NEXT:  1      1     0.50                        subq	$665536, %rax
+# CHECK-NEXT:  1      1     0.50                        subq	$665536, %rdi
+# CHECK-NEXT:  2      6     1.00    *      *            subq	$665536, (%rax)
+# CHECK-NEXT:  1      1     0.50                        subq	$7, %rdi
+# CHECK-NEXT:  2      6     1.00    *      *            subq	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        subq	%rsi, %rdi
+# CHECK-NEXT:  2      6     1.00    *      *            subq	%rsi, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   subq	(%rax), %rdi
+# CHECK-NEXT:  1      1     0.50                        testb	$7, %al
+# CHECK-NEXT:  1      1     0.50                        testb	$7, %dil
+# CHECK-NEXT:  1      5     0.50    *                   testb	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        testb	%sil, %dil
+# CHECK-NEXT:  1      5     0.50    *                   testb	%sil, (%rax)
+# CHECK-NEXT:  1      1     0.50                        testw	$511, %ax
+# CHECK-NEXT:  1      1     0.50                        testw	$511, %di
+# CHECK-NEXT:  1      5     0.50    *                   testw	$511, (%rax)
+# CHECK-NEXT:  1      1     0.50                        testw	$7, %di
+# CHECK-NEXT:  1      5     0.50    *                   testw	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        testw	%si, %di
+# CHECK-NEXT:  1      5     0.50    *                   testw	%si, (%rax)
+# CHECK-NEXT:  1      1     0.50                        testl	$665536, %eax
+# CHECK-NEXT:  1      1     0.50                        testl	$665536, %edi
+# CHECK-NEXT:  1      5     0.50    *                   testl	$665536, (%rax)
+# CHECK-NEXT:  1      1     0.50                        testl	$7, %edi
+# CHECK-NEXT:  1      5     0.50    *                   testl	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        testl	%esi, %edi
+# CHECK-NEXT:  1      5     0.50    *                   testl	%esi, (%rax)
+# CHECK-NEXT:  1      1     0.50                        testq	$665536, %rax
+# CHECK-NEXT:  1      1     0.50                        testq	$665536, %rdi
+# CHECK-NEXT:  1      5     0.50    *                   testq	$665536, (%rax)
+# CHECK-NEXT:  1      1     0.50                        testq	$7, %rdi
+# CHECK-NEXT:  1      5     0.50    *                   testq	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        testq	%rsi, %rdi
+# CHECK-NEXT:  1      5     0.50    *                   testq	%rsi, (%rax)
+# CHECK-NEXT:  1      100   0.50    *             U     ud2
+# CHECK-NEXT:  4      2     1.00                        xaddb	%bl, %cl
+# CHECK-NEXT:  4      6     1.00    *      *            xaddb	%bl, (%rcx)
+# CHECK-NEXT:  4      2     1.00                        xaddw	%bx, %cx
+# CHECK-NEXT:  4      6     1.00    *      *            xaddw	%ax, (%rbx)
+# CHECK-NEXT:  4      2     1.00                        xaddl	%ebx, %ecx
+# CHECK-NEXT:  4      6     1.00    *      *            xaddl	%eax, (%rbx)
+# CHECK-NEXT:  4      2     1.00                        xaddq	%rbx, %rcx
+# CHECK-NEXT:  4      6     1.00    *      *            xaddq	%rax, (%rbx)
+# CHECK-NEXT:  2      1     1.00                        xchgb	%bl, %cl
+# CHECK-NEXT:  2      5     1.00    *      *            xchgb	%bl, (%rbx)
+# CHECK-NEXT:  2      1     1.00                        xchgw	%bx, %ax
+# CHECK-NEXT:  2      2     1.00                        xchgw	%bx, %cx
+# CHECK-NEXT:  2      5     1.00    *      *            xchgw	%ax, (%rbx)
+# CHECK-NEXT:  2      1     1.00                        xchgl	%ebx, %eax
+# CHECK-NEXT:  2      1     1.00                        xchgl	%ebx, %ecx
+# CHECK-NEXT:  2      5     1.00    *      *            xchgl	%eax, (%rbx)
+# CHECK-NEXT:  2      1     1.00                        xchgq	%rbx, %rax
+# CHECK-NEXT:  2      1     1.00                        xchgq	%rbx, %rcx
+# CHECK-NEXT:  2      5     1.00    *      *            xchgq	%rax, (%rbx)
+# CHECK-NEXT:  1      6     0.50    *                   xlatb
+# CHECK-NEXT:  1      1     0.50                        xorb	$7, %al
+# CHECK-NEXT:  1      1     0.50                        xorb	$7, %dil
+# CHECK-NEXT:  2      6     1.00    *      *            xorb	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        xorb	%sil, %dil
+# CHECK-NEXT:  2      6     1.00    *      *            xorb	%sil, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   xorb	(%rax), %dil
+# CHECK-NEXT:  1      1     0.50                        xorw	$511, %ax
+# CHECK-NEXT:  1      1     0.50                        xorw	$511, %di
+# CHECK-NEXT:  2      6     1.00    *      *            xorw	$511, (%rax)
+# CHECK-NEXT:  1      1     0.50                        xorw	$7, %di
+# CHECK-NEXT:  2      6     1.00    *      *            xorw	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        xorw	%si, %di
+# CHECK-NEXT:  2      6     1.00    *      *            xorw	%si, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   xorw	(%rax), %di
+# CHECK-NEXT:  1      1     0.50                        xorl	$665536, %eax
+# CHECK-NEXT:  1      1     0.50                        xorl	$665536, %edi
+# CHECK-NEXT:  2      6     1.00    *      *            xorl	$665536, (%rax)
+# CHECK-NEXT:  1      1     0.50                        xorl	$7, %edi
+# CHECK-NEXT:  2      6     1.00    *      *            xorl	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        xorl	%esi, %edi
+# CHECK-NEXT:  2      6     1.00    *      *            xorl	%esi, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   xorl	(%rax), %edi
+# CHECK-NEXT:  1      1     0.50                        xorq	$665536, %rax
+# CHECK-NEXT:  1      1     0.50                        xorq	$665536, %rdi
+# CHECK-NEXT:  2      6     1.00    *      *            xorq	$665536, (%rax)
+# CHECK-NEXT:  1      1     0.50                        xorq	$7, %rdi
+# CHECK-NEXT:  2      6     1.00    *      *            xorq	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        xorq	%rsi, %rdi
+# CHECK-NEXT:  2      6     1.00    *      *            xorq	%rsi, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   xorq	(%rax), %rdi
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT: 160.00  -     571.83 221.33 222.00 571.83 316.00 316.00
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 262.00 262.00  -      -     246.00 547.50 622.50  -      -      -      -      -      -      -      -      -      -      -      -     64.00
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     adcb	$7, %al
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     adcb	$7, %dil
-# CHECK-NEXT:  -      -     1.00   1.00   1.00   1.00   1.00   1.00   adcb	$7, (%rax)
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     adcb	%sil, %dil
-# CHECK-NEXT:  -      -     1.33   0.33   1.00   1.33   1.00   1.00   adcb	%sil, (%rax)
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   adcb	(%rax), %dil
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     adcw	$511, %ax
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     adcw	$511, %di
-# CHECK-NEXT:  -      -     1.00   1.00   1.00   1.00   1.00   1.00   adcw	$511, (%rax)
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     adcw	$7, %di
-# CHECK-NEXT:  -      -     1.00   1.00   1.00   1.00   1.00   1.00   adcw	$7, (%rax)
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     adcw	%si, %di
-# CHECK-NEXT:  -      -     1.33   0.33   1.00   1.33   1.00   1.00   adcw	%si, (%rax)
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   adcw	(%rax), %di
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     adcl	$665536, %eax
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     adcl	$665536, %edi
-# CHECK-NEXT:  -      -     1.00   1.00   1.00   1.00   1.00   1.00   adcl	$665536, (%rax)
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     adcl	$7, %edi
-# CHECK-NEXT:  -      -     1.00   1.00   1.00   1.00   1.00   1.00   adcl	$7, (%rax)
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     adcl	%esi, %edi
-# CHECK-NEXT:  -      -     1.33   0.33   1.00   1.33   1.00   1.00   adcl	%esi, (%rax)
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   adcl	(%rax), %edi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     adcq	$665536, %rax
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     adcq	$665536, %rdi
-# CHECK-NEXT:  -      -     1.00   1.00   1.00   1.00   1.00   1.00   adcq	$665536, (%rax)
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     adcq	$7, %rdi
-# CHECK-NEXT:  -      -     1.00   1.00   1.00   1.00   1.00   1.00   adcq	$7, (%rax)
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     adcq	%rsi, %rdi
-# CHECK-NEXT:  -      -     1.33   0.33   1.00   1.33   1.00   1.00   adcq	%rsi, (%rax)
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   adcq	(%rax), %rdi
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     addb	$7, %al
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     addb	$7, %dil
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   addb	$7, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     addb	%sil, %dil
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   addb	%sil, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   addb	(%rax), %dil
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     addw	$511, %ax
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     addw	$511, %di
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   addw	$511, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     addw	$7, %di
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   addw	$7, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     addw	%si, %di
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   addw	%si, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   addw	(%rax), %di
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     addl	$665536, %eax
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     addl	$665536, %edi
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   addl	$665536, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     addl	$7, %edi
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   addl	$7, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     addl	%esi, %edi
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   addl	%esi, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   addl	(%rax), %edi
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     addq	$665536, %rax
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     addq	$665536, %rdi
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   addq	$665536, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     addq	$7, %rdi
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   addq	$7, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     addq	%rsi, %rdi
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   addq	%rsi, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   addq	(%rax), %rdi
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     andb	$7, %al
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     andb	$7, %dil
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   andb	$7, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     andb	%sil, %dil
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   andb	%sil, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   andb	(%rax), %dil
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     andw	$511, %ax
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     andw	$511, %di
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   andw	$511, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     andw	$7, %di
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   andw	$7, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     andw	%si, %di
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   andw	%si, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   andw	(%rax), %di
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     andl	$665536, %eax
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     andl	$665536, %edi
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   andl	$665536, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     andl	$7, %edi
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   andl	$7, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     andl	%esi, %edi
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   andl	%esi, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   andl	(%rax), %edi
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     andq	$665536, %rax
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     andq	$665536, %rdi
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   andq	$665536, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     andq	$7, %rdi
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   andq	$7, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     andq	%rsi, %rdi
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   andq	%rsi, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   andq	(%rax), %rdi
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     bsfw	%si, %di
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     bsrw	%si, %di
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   bsfw	(%rax), %di
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   bsrw	(%rax), %di
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     bsfl	%esi, %edi
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     bsrl	%esi, %edi
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   bsfl	(%rax), %edi
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   bsrl	(%rax), %edi
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     bsfq	%rsi, %rdi
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     bsrq	%rsi, %rdi
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   bsfq	(%rax), %rdi
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   bsrq	(%rax), %rdi
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     bswapl	%eax
-# CHECK-NEXT:  -      -     0.50   1.00    -     0.50    -      -     bswapq	%rax
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btw	%si, %di
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btcw	%si, %di
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btrw	%si, %di
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btsw	%si, %di
-# CHECK-NEXT:  -      -     0.83   0.33   1.00   1.83   1.00   1.00   btw	%si, (%rax)
-# CHECK-NEXT:  -      -     0.83   0.33   1.00   1.83   1.00   1.00   btcw	%si, (%rax)
-# CHECK-NEXT:  -      -     0.83   0.33   1.00   1.83   1.00   1.00   btrw	%si, (%rax)
-# CHECK-NEXT:  -      -     0.83   0.33   1.00   1.83   1.00   1.00   btsw	%si, (%rax)
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btw	$7, %di
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btcw	$7, %di
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btrw	$7, %di
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btsw	$7, %di
-# CHECK-NEXT:  -      -     0.50    -      -     0.50   0.50   0.50   btw	$7, (%rax)
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   btcw	$7, (%rax)
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   btrw	$7, (%rax)
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   btsw	$7, (%rax)
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btl	%esi, %edi
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btcl	%esi, %edi
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btrl	%esi, %edi
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btsl	%esi, %edi
-# CHECK-NEXT:  -      -     0.83   0.33   1.00   1.83   1.00   1.00   btl	%esi, (%rax)
-# CHECK-NEXT:  -      -     0.83   0.33   1.00   1.83   1.00   1.00   btcl	%esi, (%rax)
-# CHECK-NEXT:  -      -     0.83   0.33   1.00   1.83   1.00   1.00   btrl	%esi, (%rax)
-# CHECK-NEXT:  -      -     0.83   0.33   1.00   1.83   1.00   1.00   btsl	%esi, (%rax)
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btl	$7, %edi
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btcl	$7, %edi
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btrl	$7, %edi
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btsl	$7, %edi
-# CHECK-NEXT:  -      -     0.50    -      -     0.50   0.50   0.50   btl	$7, (%rax)
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   btcl	$7, (%rax)
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   btrl	$7, (%rax)
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   btsl	$7, (%rax)
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btq	%rsi, %rdi
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btcq	%rsi, %rdi
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btrq	%rsi, %rdi
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btsq	%rsi, %rdi
-# CHECK-NEXT:  -      -     0.83   0.33   1.00   1.83   1.00   1.00   btq	%rsi, (%rax)
-# CHECK-NEXT:  -      -     0.83   0.33   1.00   1.83   1.00   1.00   btcq	%rsi, (%rax)
-# CHECK-NEXT:  -      -     0.83   0.33   1.00   1.83   1.00   1.00   btrq	%rsi, (%rax)
-# CHECK-NEXT:  -      -     0.83   0.33   1.00   1.83   1.00   1.00   btsq	%rsi, (%rax)
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btq	$7, %rdi
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btcq	$7, %rdi
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btrq	$7, %rdi
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     btsq	$7, %rdi
-# CHECK-NEXT:  -      -     0.50    -      -     0.50   0.50   0.50   btq	$7, (%rax)
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   btcq	$7, (%rax)
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   btrq	$7, (%rax)
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   btsq	$7, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     cbtw
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     cwtl
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     cltq
-# CHECK-NEXT:  -      -     1.33   0.33    -     0.33    -      -     cwtd
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     cltd
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     cqto
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     clc
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     cld
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     cmc
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     cmpb	$7, %al
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     cmpb	$7, %dil
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   cmpb	$7, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     cmpb	%sil, %dil
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   cmpb	%sil, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   cmpb	(%rax), %dil
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     cmpw	$511, %ax
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     cmpw	$511, %di
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   cmpw	$511, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     cmpw	$7, %di
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   cmpw	$7, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     cmpw	%si, %di
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   cmpw	%si, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   cmpw	(%rax), %di
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     cmpl	$665536, %eax
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     cmpl	$665536, %edi
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   cmpl	$665536, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     cmpl	$7, %edi
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   cmpl	$7, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     cmpl	%esi, %edi
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   cmpl	%esi, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   cmpl	(%rax), %edi
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     cmpq	$665536, %rax
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     cmpq	$665536, %rdi
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   cmpq	$665536, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     cmpq	$7, %rdi
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   cmpq	$7, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     cmpq	%rsi, %rdi
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   cmpq	%rsi, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   cmpq	(%rax), %rdi
-# CHECK-NEXT:  -      -     1.00   1.00    -     1.00   1.00   1.00   cmpsb	%es:(%rdi), (%rsi)
-# CHECK-NEXT:  -      -     1.00   1.00    -     1.00   1.00   1.00   cmpsw	%es:(%rdi), (%rsi)
-# CHECK-NEXT:  -      -     1.00   1.00    -     1.00   1.00   1.00   cmpsl	%es:(%rdi), (%rsi)
-# CHECK-NEXT:  -      -     1.00   1.00    -     1.00   1.00   1.00   cmpsq	%es:(%rdi), (%rsi)
-# CHECK-NEXT:  -      -     1.50   1.00    -     1.50    -      -     cmpxchgb	%cl, %bl
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   2.33   1.00   1.00   cmpxchgb	%cl, (%rbx)
-# CHECK-NEXT:  -      -     1.50   1.00    -     1.50    -      -     cmpxchgw	%cx, %bx
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   2.33   1.00   1.00   cmpxchgw	%cx, (%rbx)
-# CHECK-NEXT:  -      -     1.50   1.00    -     1.50    -      -     cmpxchgl	%ecx, %ebx
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   2.33   1.00   1.00   cmpxchgl	%ecx, (%rbx)
-# CHECK-NEXT:  -      -     1.50   1.00    -     1.50    -      -     cmpxchgq	%rcx, %rbx
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   2.33   1.00   1.00   cmpxchgq	%rcx, (%rbx)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     cpuid
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     decb	%dil
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   decb	(%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     decw	%di
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   decw	(%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     decl	%edi
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   decl	(%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     decq	%rdi
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   decq	(%rax)
-# CHECK-NEXT: 10.00   -     1.00    -      -      -      -      -     divb	%dil
-# CHECK-NEXT: 10.00   -     1.00    -      -      -     0.50   0.50   divb	(%rax)
-# CHECK-NEXT: 10.00   -     1.00    -      -      -      -      -     divw	%si
-# CHECK-NEXT: 10.00   -     1.00    -      -      -     0.50   0.50   divw	(%rax)
-# CHECK-NEXT: 10.00   -     1.00    -      -      -      -      -     divl	%edx
-# CHECK-NEXT: 10.00   -     1.00    -      -      -     0.50   0.50   divl	(%rax)
-# CHECK-NEXT: 10.00   -     1.00    -      -      -      -      -     divq	%rcx
-# CHECK-NEXT: 10.00   -     1.00    -      -      -     0.50   0.50   divq	(%rax)
-# CHECK-NEXT: 10.00   -     1.00    -      -      -      -      -     idivb	%dil
-# CHECK-NEXT: 10.00   -     1.00    -      -      -     0.50   0.50   idivb	(%rax)
-# CHECK-NEXT: 10.00   -     1.00    -      -      -      -      -     idivw	%si
-# CHECK-NEXT: 10.00   -     1.00    -      -      -     0.50   0.50   idivw	(%rax)
-# CHECK-NEXT: 10.00   -     1.00    -      -      -      -      -     idivl	%edx
-# CHECK-NEXT: 10.00   -     1.00    -      -      -     0.50   0.50   idivl	(%rax)
-# CHECK-NEXT: 10.00   -     1.00    -      -      -      -      -     idivq	%rcx
-# CHECK-NEXT: 10.00   -     1.00    -      -      -     0.50   0.50   idivq	(%rax)
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     imulb	%dil
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   imulb	(%rax)
-# CHECK-NEXT:  -      -     1.17   1.67    -     1.17    -      -     imulw	%di
-# CHECK-NEXT:  -      -     1.17   1.67    -     1.17   0.50   0.50   imulw	(%rax)
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     imulw	%si, %di
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   imulw	(%rax), %di
-# CHECK-NEXT:  -      -     0.33   1.33    -     0.33    -      -     imulw	$511, %si, %di
-# CHECK-NEXT:  -      -     0.33   1.33    -     0.33   0.50   0.50   imulw	$511, (%rax), %di
-# CHECK-NEXT:  -      -     0.33   1.33    -     0.33    -      -     imulw	$7, %si, %di
-# CHECK-NEXT:  -      -     0.33   1.33    -     0.33   0.50   0.50   imulw	$7, (%rax), %di
-# CHECK-NEXT:  -      -     0.83   1.33    -     0.83    -      -     imull	%edi
-# CHECK-NEXT:  -      -     0.83   1.33    -     0.83   0.50   0.50   imull	(%rax)
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     imull	%esi, %edi
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   imull	(%rax), %edi
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     imull	$665536, %esi, %edi
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   imull	$665536, (%rax), %edi
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     imull	$7, %esi, %edi
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   imull	$7, (%rax), %edi
-# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     imulq	%rdi
-# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   imulq	(%rax)
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     imulq	%rsi, %rdi
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   imulq	(%rax), %rdi
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     imulq	$665536, %rsi, %rdi
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   imulq	$665536, (%rax), %rdi
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     imulq	$7, %rsi, %rdi
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   imulq	$7, (%rax), %rdi
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     inb	$7, %al
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     inb	%dx, %al
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     inw	$7, %ax
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     inw	%dx, %ax
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     inl	$7, %eax
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     inl	%dx, %eax
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     incb	%dil
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   incb	(%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     incw	%di
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   incw	(%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     incl	%edi
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   incl	(%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     incq	%rdi
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   incq	(%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     insb	%dx, %es:(%rdi)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     insw	%dx, %es:(%rdi)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     insl	%dx, %es:(%rdi)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     int	$7
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     lahf
-# CHECK-NEXT:  -      -     0.67   0.67    -     0.67   0.50   0.50   lodsb	(%rsi), %al
-# CHECK-NEXT:  -      -     0.67   0.67    -     0.67   0.50   0.50   lodsw	(%rsi), %ax
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   lodsl	(%rsi), %eax
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   lodsq	(%rsi), %rax
-# CHECK-NEXT:  -      -     0.67   0.67   1.00   0.67   1.00   1.00   movsb	(%rsi), %es:(%rdi)
-# CHECK-NEXT:  -      -     0.67   0.67   1.00   0.67   1.00   1.00   movsw	(%rsi), %es:(%rdi)
-# CHECK-NEXT:  -      -     0.67   0.67   1.00   0.67   1.00   1.00   movsl	(%rsi), %es:(%rdi)
-# CHECK-NEXT:  -      -     0.67   0.67   1.00   0.67   1.00   1.00   movsq	(%rsi), %es:(%rdi)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movsbw	%al, %di
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movzbw	%al, %di
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movsbw	(%rax), %di
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movzbw	(%rax), %di
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movsbl	%al, %edi
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movzbl	%al, %edi
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movsbl	(%rax), %edi
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movzbl	(%rax), %edi
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movsbq	%al, %rdi
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movzbq	%al, %rdi
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movsbq	(%rax), %rdi
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movzbq	(%rax), %rdi
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movswl	%ax, %edi
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movzwl	%ax, %edi
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movswl	(%rax), %edi
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movzwl	(%rax), %edi
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movswq	%ax, %rdi
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movzwq	%ax, %rdi
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movswq	(%rax), %rdi
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movzwq	(%rax), %rdi
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     movslq	%eax, %rdi
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   movslq	(%rax), %rdi
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     mulb	%dil
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   mulb	(%rax)
-# CHECK-NEXT:  -      -     1.17   1.67    -     1.17    -      -     mulw	%si
-# CHECK-NEXT:  -      -     1.17   1.67    -     1.17   0.50   0.50   mulw	(%rax)
-# CHECK-NEXT:  -      -     0.83   1.33    -     0.83    -      -     mull	%edx
-# CHECK-NEXT:  -      -     0.83   1.33    -     0.83   0.50   0.50   mull	(%rax)
-# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     mulq	%rcx
-# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   mulq	(%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     negb	%dil
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   negb	(%r8)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     negw	%si
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   negw	(%r9)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     negl	%edx
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   negl	(%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     negq	%rcx
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   negq	(%r10)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     nop
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     nopw	%di
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     nopw	(%rcx)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     nopl	%esi
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     nopl	(%r8)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     nopq	%rdx
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     nopq	(%r9)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     notb	%dil
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   notb	(%r8)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     notw	%si
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   notw	(%r9)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     notl	%edx
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   notl	(%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     notq	%rcx
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   notq	(%r10)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     orb	$7, %al
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     orb	$7, %dil
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   orb	$7, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     orb	%sil, %dil
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   orb	%sil, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   orb	(%rax), %dil
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     orw	$511, %ax
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     orw	$511, %di
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   orw	$511, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     orw	$7, %di
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   orw	$7, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     orw	%si, %di
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   orw	%si, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   orw	(%rax), %di
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     orl	$665536, %eax
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     orl	$665536, %edi
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   orl	$665536, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     orl	$7, %edi
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   orl	$7, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     orl	%esi, %edi
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   orl	%esi, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   orl	(%rax), %edi
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     orq	$665536, %rax
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     orq	$665536, %rdi
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   orq	$665536, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     orq	$7, %rdi
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   orq	$7, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     orq	%rsi, %rdi
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   orq	%rsi, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   orq	(%rax), %rdi
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     outb	%al, $7
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     outb	%al, %dx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     outw	%ax, $7
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     outw	%ax, %dx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     outl	%eax, $7
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     outl	%eax, %dx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     outsb	(%rsi), %dx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     outsw	(%rsi), %dx
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     outsl	(%rsi), %dx
-# CHECK-NEXT:  -      -     1.00   1.00    -     2.00    -      -     pause
-# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     rclb	%dil
-# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     rcrb	%dil
-# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rclb	(%rax)
-# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rcrb	(%rax)
-# CHECK-NEXT:  -      -     4.00    -      -     4.00    -      -     rclb	$7, %dil
-# CHECK-NEXT:  -      -     4.00    -      -     4.00    -      -     rcrb	$7, %dil
-# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rclb	$7, (%rax)
-# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rcrb	$7, (%rax)
-# CHECK-NEXT:  -      -     4.00    -      -     4.00    -      -     rclb	%cl, %dil
-# CHECK-NEXT:  -      -     4.00    -      -     4.00    -      -     rcrb	%cl, %dil
-# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rclb	%cl, (%rax)
-# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rcrb	%cl, (%rax)
-# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     rclw	%di
-# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     rcrw	%di
-# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rclw	(%rax)
-# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rcrw	(%rax)
-# CHECK-NEXT:  -      -     4.00    -      -     4.00    -      -     rclw	$7, %di
-# CHECK-NEXT:  -      -     4.00    -      -     4.00    -      -     rcrw	$7, %di
-# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rclw	$7, (%rax)
-# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rcrw	$7, (%rax)
-# CHECK-NEXT:  -      -     4.00    -      -     4.00    -      -     rclw	%cl, %di
-# CHECK-NEXT:  -      -     4.00    -      -     4.00    -      -     rcrw	%cl, %di
-# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rclw	%cl, (%rax)
-# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rcrw	%cl, (%rax)
-# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     rcll	%edi
-# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     rcrl	%edi
-# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rcll	(%rax)
-# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rcrl	(%rax)
-# CHECK-NEXT:  -      -     4.00    -      -     4.00    -      -     rcll	$7, %edi
-# CHECK-NEXT:  -      -     4.00    -      -     4.00    -      -     rcrl	$7, %edi
-# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rcll	$7, (%rax)
-# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rcrl	$7, (%rax)
-# CHECK-NEXT:  -      -     4.00    -      -     4.00    -      -     rcll	%cl, %edi
-# CHECK-NEXT:  -      -     4.00    -      -     4.00    -      -     rcrl	%cl, %edi
-# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rcll	%cl, (%rax)
-# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rcrl	%cl, (%rax)
-# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     rclq	%rdi
-# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     rcrq	%rdi
-# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rclq	(%rax)
-# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rcrq	(%rax)
-# CHECK-NEXT:  -      -     4.00    -      -     4.00    -      -     rclq	$7, %rdi
-# CHECK-NEXT:  -      -     4.00    -      -     4.00    -      -     rcrq	$7, %rdi
-# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rclq	$7, (%rax)
-# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rcrq	$7, (%rax)
-# CHECK-NEXT:  -      -     4.00    -      -     4.00    -      -     rclq	%cl, %rdi
-# CHECK-NEXT:  -      -     4.00    -      -     4.00    -      -     rcrq	%cl, %rdi
-# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rclq	%cl, (%rax)
-# CHECK-NEXT:  -      -     3.50    -      -     3.50   2.00   2.00   rcrq	%cl, (%rax)
-# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     rolb	%dil
-# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     rorb	%dil
-# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   1.00   1.00   rolb	(%rax)
-# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   1.00   1.00   rorb	(%rax)
-# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     rolb	$7, %dil
-# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     rorb	$7, %dil
-# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   1.00   1.00   rolb	$7, (%rax)
-# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   1.00   1.00   rorb	$7, (%rax)
-# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     rolb	%cl, %dil
-# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     rorb	%cl, %dil
-# CHECK-NEXT:  -      -     1.50    -     1.00   1.50   1.00   1.00   rolb	%cl, (%rax)
-# CHECK-NEXT:  -      -     1.50    -     1.00   1.50   1.00   1.00   rorb	%cl, (%rax)
-# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     rolw	%di
-# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     rorw	%di
-# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   1.00   1.00   rolw	(%rax)
-# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   1.00   1.00   rorw	(%rax)
-# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     rolw	$7, %di
-# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     rorw	$7, %di
-# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   1.00   1.00   rolw	$7, (%rax)
-# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   1.00   1.00   rorw	$7, (%rax)
-# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     rolw	%cl, %di
-# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     rorw	%cl, %di
-# CHECK-NEXT:  -      -     1.50    -     1.00   1.50   1.00   1.00   rolw	%cl, (%rax)
-# CHECK-NEXT:  -      -     1.50    -     1.00   1.50   1.00   1.00   rorw	%cl, (%rax)
-# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     roll	%edi
-# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     rorl	%edi
-# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   1.00   1.00   roll	(%rax)
-# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   1.00   1.00   rorl	(%rax)
-# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     roll	$7, %edi
-# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     rorl	$7, %edi
-# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   1.00   1.00   roll	$7, (%rax)
-# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   1.00   1.00   rorl	$7, (%rax)
-# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     roll	%cl, %edi
-# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     rorl	%cl, %edi
-# CHECK-NEXT:  -      -     1.50    -     1.00   1.50   1.00   1.00   roll	%cl, (%rax)
-# CHECK-NEXT:  -      -     1.50    -     1.00   1.50   1.00   1.00   rorl	%cl, (%rax)
-# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     rolq	%rdi
-# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     rorq	%rdi
-# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   1.00   1.00   rolq	(%rax)
-# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   1.00   1.00   rorq	(%rax)
-# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     rolq	$7, %rdi
-# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     rorq	$7, %rdi
-# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   1.00   1.00   rolq	$7, (%rax)
-# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   1.00   1.00   rorq	$7, (%rax)
-# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     rolq	%cl, %rdi
-# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     rorq	%cl, %rdi
-# CHECK-NEXT:  -      -     1.50    -     1.00   1.50   1.00   1.00   rolq	%cl, (%rax)
-# CHECK-NEXT:  -      -     1.50    -     1.00   1.50   1.00   1.00   rorq	%cl, (%rax)
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     sahf
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     sarb	%dil
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     shlb	%dil
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     shrb	%dil
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   sarb	(%rax)
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   shlb	(%rax)
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   shrb	(%rax)
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     sarb	$7, %dil
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     shlb	$7, %dil
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     shrb	$7, %dil
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   sarb	$7, (%rax)
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   shlb	$7, (%rax)
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   shrb	$7, (%rax)
-# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     sarb	%cl, %dil
-# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     shlb	%cl, %dil
-# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     shrb	%cl, %dil
-# CHECK-NEXT:  -      -     1.50    -     1.00   1.50   1.00   1.00   sarb	%cl, (%rax)
-# CHECK-NEXT:  -      -     1.50    -     1.00   1.50   1.00   1.00   shlb	%cl, (%rax)
-# CHECK-NEXT:  -      -     1.50    -     1.00   1.50   1.00   1.00   shrb	%cl, (%rax)
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     sarw	%di
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     shlw	%di
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     shrw	%di
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   sarw	(%rax)
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   shlw	(%rax)
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   shrw	(%rax)
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     sarw	$7, %di
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     shlw	$7, %di
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     shrw	$7, %di
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   sarw	$7, (%rax)
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   shlw	$7, (%rax)
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   shrw	$7, (%rax)
-# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     sarw	%cl, %di
-# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     shlw	%cl, %di
-# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     shrw	%cl, %di
-# CHECK-NEXT:  -      -     1.50    -     1.00   1.50   1.00   1.00   sarw	%cl, (%rax)
-# CHECK-NEXT:  -      -     1.50    -     1.00   1.50   1.00   1.00   shlw	%cl, (%rax)
-# CHECK-NEXT:  -      -     1.50    -     1.00   1.50   1.00   1.00   shrw	%cl, (%rax)
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     sarl	%edi
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     shll	%edi
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     shrl	%edi
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   sarl	(%rax)
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   shll	(%rax)
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   shrl	(%rax)
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     sarl	$7, %edi
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     shll	$7, %edi
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     shrl	$7, %edi
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   sarl	$7, (%rax)
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   shll	$7, (%rax)
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   shrl	$7, (%rax)
-# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     sarl	%cl, %edi
-# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     shll	%cl, %edi
-# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     shrl	%cl, %edi
-# CHECK-NEXT:  -      -     1.50    -     1.00   1.50   1.00   1.00   sarl	%cl, (%rax)
-# CHECK-NEXT:  -      -     1.50    -     1.00   1.50   1.00   1.00   shll	%cl, (%rax)
-# CHECK-NEXT:  -      -     1.50    -     1.00   1.50   1.00   1.00   shrl	%cl, (%rax)
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     sarq	%rdi
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     shlq	%rdi
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     shrq	%rdi
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   sarq	(%rax)
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   shlq	(%rax)
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   shrq	(%rax)
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     sarq	$7, %rdi
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     shlq	$7, %rdi
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     shrq	$7, %rdi
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   sarq	$7, (%rax)
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   shlq	$7, (%rax)
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   1.00   1.00   shrq	$7, (%rax)
-# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     sarq	%cl, %rdi
-# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     shlq	%cl, %rdi
-# CHECK-NEXT:  -      -     1.50    -      -     1.50    -      -     shrq	%cl, %rdi
-# CHECK-NEXT:  -      -     1.50    -     1.00   1.50   1.00   1.00   sarq	%cl, (%rax)
-# CHECK-NEXT:  -      -     1.50    -     1.00   1.50   1.00   1.00   shlq	%cl, (%rax)
-# CHECK-NEXT:  -      -     1.50    -     1.00   1.50   1.00   1.00   shrq	%cl, (%rax)
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     sbbb	$7, %al
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     sbbb	$7, %dil
-# CHECK-NEXT:  -      -     1.00   1.00   1.00   1.00   1.00   1.00   sbbb	$7, (%rax)
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     sbbb	%sil, %dil
-# CHECK-NEXT:  -      -     1.33   0.33   1.00   1.33   1.00   1.00   sbbb	%sil, (%rax)
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   sbbb	(%rax), %dil
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     sbbw	$511, %ax
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     sbbw	$511, %di
-# CHECK-NEXT:  -      -     1.00   1.00   1.00   1.00   1.00   1.00   sbbw	$511, (%rax)
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     sbbw	$7, %di
-# CHECK-NEXT:  -      -     1.00   1.00   1.00   1.00   1.00   1.00   sbbw	$7, (%rax)
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     sbbw	%si, %di
-# CHECK-NEXT:  -      -     1.33   0.33   1.00   1.33   1.00   1.00   sbbw	%si, (%rax)
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   sbbw	(%rax), %di
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     sbbl	$665536, %eax
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     sbbl	$665536, %edi
-# CHECK-NEXT:  -      -     1.00   1.00   1.00   1.00   1.00   1.00   sbbl	$665536, (%rax)
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     sbbl	$7, %edi
-# CHECK-NEXT:  -      -     1.00   1.00   1.00   1.00   1.00   1.00   sbbl	$7, (%rax)
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     sbbl	%esi, %edi
-# CHECK-NEXT:  -      -     1.33   0.33   1.00   1.33   1.00   1.00   sbbl	%esi, (%rax)
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   sbbl	(%rax), %edi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     sbbq	$665536, %rax
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     sbbq	$665536, %rdi
-# CHECK-NEXT:  -      -     1.00   1.00   1.00   1.00   1.00   1.00   sbbq	$665536, (%rax)
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     sbbq	$7, %rdi
-# CHECK-NEXT:  -      -     1.00   1.00   1.00   1.00   1.00   1.00   sbbq	$7, (%rax)
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     sbbq	%rsi, %rdi
-# CHECK-NEXT:  -      -     1.33   0.33   1.00   1.33   1.00   1.00   sbbq	%rsi, (%rax)
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83   0.50   0.50   sbbq	(%rax), %rdi
-# CHECK-NEXT:  -      -     0.67   0.67    -     0.67    -      -     scasb	%es:(%rdi), %al
-# CHECK-NEXT:  -      -     0.67   0.67    -     0.67    -      -     scasw	%es:(%rdi), %ax
-# CHECK-NEXT:  -      -     0.67   0.67    -     0.67    -      -     scasl	%es:(%rdi), %eax
-# CHECK-NEXT:  -      -     0.67   0.67    -     0.67    -      -     scasq	%es:(%rdi), %rax
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     seto	%al
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   0.50   0.50   seto	(%rax)
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     setno	%al
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   0.50   0.50   setno	(%rax)
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     setb	%al
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   0.50   0.50   setb	(%rax)
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     setae	%al
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   0.50   0.50   setae	(%rax)
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     sete	%al
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   0.50   0.50   sete	(%rax)
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     setne	%al
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   0.50   0.50   setne	(%rax)
-# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     seta	%al
-# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   0.50   0.50   seta	(%rax)
-# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     setbe	%al
-# CHECK-NEXT:  -      -     1.00    -     1.00   1.00   0.50   0.50   setbe	(%rax)
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     sets	%al
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   0.50   0.50   sets	(%rax)
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     setns	%al
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   0.50   0.50   setns	(%rax)
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     setp	%al
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   0.50   0.50   setp	(%rax)
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     setnp	%al
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   0.50   0.50   setnp	(%rax)
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     setl	%al
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   0.50   0.50   setl	(%rax)
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     setge	%al
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   0.50   0.50   setge	(%rax)
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     setg	%al
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   0.50   0.50   setg	(%rax)
-# CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     setle	%al
-# CHECK-NEXT:  -      -     0.50    -     1.00   0.50   0.50   0.50   setle	(%rax)
-# CHECK-NEXT:  -      -     1.83   0.33    -     1.83    -      -     shldw	%cl, %si, %di
-# CHECK-NEXT:  -      -     1.83   0.33    -     1.83    -      -     shrdw	%cl, %si, %di
-# CHECK-NEXT:  -      -     1.83   0.33   1.00   1.83   1.00   1.00   shldw	%cl, %si, (%rax)
-# CHECK-NEXT:  -      -     1.83   0.33   1.00   1.83   1.00   1.00   shrdw	%cl, %si, (%rax)
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     shldw	$7, %si, %di
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     shrdw	$7, %si, %di
-# CHECK-NEXT:  -      -     0.83   0.33   1.00   0.83   1.00   1.00   shldw	$7, %si, (%rax)
-# CHECK-NEXT:  -      -     0.83   0.33   1.00   0.83   1.00   1.00   shrdw	$7, %si, (%rax)
-# CHECK-NEXT:  -      -     1.83   0.33    -     1.83    -      -     shldl	%cl, %esi, %edi
-# CHECK-NEXT:  -      -     1.83   0.33    -     1.83    -      -     shrdl	%cl, %esi, %edi
-# CHECK-NEXT:  -      -     1.83   0.33   1.00   1.83   1.00   1.00   shldl	%cl, %esi, (%rax)
-# CHECK-NEXT:  -      -     1.83   0.33   1.00   1.83   1.00   1.00   shrdl	%cl, %esi, (%rax)
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     shldl	$7, %esi, %edi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     shrdl	$7, %esi, %edi
-# CHECK-NEXT:  -      -     0.83   0.33   1.00   0.83   1.00   1.00   shldl	$7, %esi, (%rax)
-# CHECK-NEXT:  -      -     0.83   0.33   1.00   0.83   1.00   1.00   shrdl	$7, %esi, (%rax)
-# CHECK-NEXT:  -      -     1.83   0.33    -     1.83    -      -     shldq	%cl, %rsi, %rdi
-# CHECK-NEXT:  -      -     1.83   0.33    -     1.83    -      -     shrdq	%cl, %rsi, %rdi
-# CHECK-NEXT:  -      -     1.83   0.33   1.00   1.83   1.00   1.00   shldq	%cl, %rsi, (%rax)
-# CHECK-NEXT:  -      -     1.83   0.33   1.00   1.83   1.00   1.00   shrdq	%cl, %rsi, (%rax)
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     shldq	$7, %rsi, %rdi
-# CHECK-NEXT:  -      -     0.83   0.33    -     0.83    -      -     shrdq	$7, %rsi, %rdi
-# CHECK-NEXT:  -      -     0.83   0.33   1.00   0.83   1.00   1.00   shldq	$7, %rsi, (%rax)
-# CHECK-NEXT:  -      -     0.83   0.33   1.00   0.83   1.00   1.00   shrdq	$7, %rsi, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     stc
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     std
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   0.50   0.50   stosb	%al, %es:(%rdi)
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   0.50   0.50   stosw	%ax, %es:(%rdi)
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   0.50   0.50   stosl	%eax, %es:(%rdi)
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   0.50   0.50   stosq	%rax, %es:(%rdi)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     subb	$7, %al
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     subb	$7, %dil
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   subb	$7, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     subb	%sil, %dil
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   subb	%sil, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   subb	(%rax), %dil
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     subw	$511, %ax
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     subw	$511, %di
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   subw	$511, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     subw	$7, %di
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   subw	$7, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     subw	%si, %di
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   subw	%si, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   subw	(%rax), %di
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     subl	$665536, %eax
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     subl	$665536, %edi
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   subl	$665536, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     subl	$7, %edi
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   subl	$7, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     subl	%esi, %edi
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   subl	%esi, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   subl	(%rax), %edi
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     subq	$665536, %rax
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     subq	$665536, %rdi
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   subq	$665536, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     subq	$7, %rdi
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   subq	$7, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     subq	%rsi, %rdi
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   subq	%rsi, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   subq	(%rax), %rdi
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     testb	$7, %al
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     testb	$7, %dil
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   testb	$7, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     testb	%sil, %dil
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   testb	%sil, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     testw	$511, %ax
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     testw	$511, %di
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   testw	$511, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     testw	$7, %di
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   testw	$7, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     testw	%si, %di
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   testw	%si, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     testl	$665536, %eax
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     testl	$665536, %edi
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   testl	$665536, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     testl	$7, %edi
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   testl	$7, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     testl	%esi, %edi
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   testl	%esi, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     testq	$665536, %rax
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     testq	$665536, %rdi
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   testq	$665536, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     testq	$7, %rdi
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   testq	$7, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     testq	%rsi, %rdi
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   testq	%rsi, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     ud2
-# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -     xaddb	%bl, %cl
-# CHECK-NEXT:  -      -     0.67   0.67   1.00   0.67   1.00   1.00   xaddb	%bl, (%rcx)
-# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -     xaddw	%bx, %cx
-# CHECK-NEXT:  -      -     0.67   0.67   1.00   0.67   1.00   1.00   xaddw	%ax, (%rbx)
-# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -     xaddl	%ebx, %ecx
-# CHECK-NEXT:  -      -     0.67   0.67   1.00   0.67   1.00   1.00   xaddl	%eax, (%rbx)
-# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -     xaddq	%rbx, %rcx
-# CHECK-NEXT:  -      -     0.67   0.67   1.00   0.67   1.00   1.00   xaddq	%rax, (%rbx)
-# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -     xchgb	%bl, %cl
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   xchgb	%bl, (%rbx)
-# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -     xchgw	%bx, %ax
-# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -     xchgw	%bx, %cx
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   xchgw	%ax, (%rbx)
-# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -     xchgl	%ebx, %eax
-# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -     xchgl	%ebx, %ecx
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   xchgl	%eax, (%rbx)
-# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -     xchgq	%rbx, %rax
-# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -     xchgq	%rbx, %rcx
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   xchgq	%rax, (%rbx)
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   xlatb
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     xorb	$7, %al
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     xorb	$7, %dil
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   xorb	$7, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     xorb	%sil, %dil
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   xorb	%sil, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   xorb	(%rax), %dil
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     xorw	$511, %ax
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     xorw	$511, %di
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   xorw	$511, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     xorw	$7, %di
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   xorw	$7, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     xorw	%si, %di
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   xorw	%si, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   xorw	(%rax), %di
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     xorl	$665536, %eax
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     xorl	$665536, %edi
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   xorl	$665536, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     xorl	$7, %edi
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   xorl	$7, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     xorl	%esi, %edi
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   xorl	%esi, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   xorl	(%rax), %edi
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     xorq	$665536, %rax
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     xorq	$665536, %rdi
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   xorq	$665536, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     xorq	$7, %rdi
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   xorq	$7, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     xorq	%rsi, %rdi
-# CHECK-NEXT:  -      -     0.33   0.33   1.00   0.33   1.00   1.00   xorq	%rsi, (%rax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33   0.50   0.50   xorq	(%rax), %rdi
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcb	$7, %al
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcb	$7, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcb	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcb	%sil, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcb	%sil, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcb	(%rax), %dil
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcw	$511, %ax
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcw	$511, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcw	$511, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcw	$7, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcw	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcw	%si, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcw	%si, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcw	(%rax), %di
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcl	$665536, %eax
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcl	$665536, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcl	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcl	$7, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcl	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcl	%esi, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcl	%esi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcl	(%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcq	$665536, %rax
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcq	$665536, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcq	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcq	$7, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcq	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcq	%rsi, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcq	%rsi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcq	(%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addb	$7, %al
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addb	$7, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addb	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addb	%sil, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addb	%sil, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addb	(%rax), %dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addw	$511, %ax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addw	$511, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addw	$511, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addw	$7, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addw	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addw	%si, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addw	%si, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addw	(%rax), %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	$665536, %eax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	$665536, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	$7, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%esi, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%esi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	(%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addq	$665536, %rax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addq	$665536, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addq	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addq	$7, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addq	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addq	%rsi, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addq	%rsi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addq	(%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andb	$7, %al
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andb	$7, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andb	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andb	%sil, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andb	%sil, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andb	(%rax), %dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andw	$511, %ax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andw	$511, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andw	$511, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andw	$7, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andw	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andw	%si, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andw	%si, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andw	(%rax), %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andl	$665536, %eax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andl	$665536, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andl	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andl	$7, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andl	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andl	%esi, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andl	%esi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andl	(%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andq	$665536, %rax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andq	$665536, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andq	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andq	$7, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andq	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andq	%rsi, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andq	%rsi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andq	(%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     bsfw	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     bsrw	%si, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     bsfw	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     bsrw	(%rax), %di
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     bsfl	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     bsrl	%esi, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     bsfl	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     bsrl	(%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     bsfq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     bsrq	%rsi, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     bsfq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     bsrq	(%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     bswapl	%eax
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     bswapq	%rax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btw	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btcw	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btrw	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btsw	%si, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btw	%si, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btcw	%si, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btrw	%si, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btsw	%si, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btw	$7, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btcw	$7, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btrw	$7, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btsw	$7, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btw	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btcw	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btrw	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btsw	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btl	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btcl	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btrl	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btsl	%esi, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btl	%esi, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btcl	%esi, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btrl	%esi, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btsl	%esi, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btl	$7, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btcl	$7, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btrl	$7, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btsl	$7, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btl	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btcl	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btrl	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btsl	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btcq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btrq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btsq	%rsi, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btq	%rsi, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btcq	%rsi, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btrq	%rsi, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btsq	%rsi, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btq	$7, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btcq	$7, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btrq	$7, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btsq	$7, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btq	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btcq	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btrq	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btsq	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cbtw
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cwtl
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cltq
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cwtd
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cltd
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cqto
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     clc
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cld
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmc
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpb	$7, %al
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpb	$7, %dil
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpb	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpb	%sil, %dil
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpb	%sil, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpb	(%rax), %dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpw	$511, %ax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpw	$511, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpw	$511, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpw	$7, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpw	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpw	%si, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpw	%si, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpw	(%rax), %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpl	$665536, %eax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpl	$665536, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpl	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpl	$7, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpl	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpl	%esi, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpl	%esi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpl	(%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpq	$665536, %rax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpq	$665536, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpq	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpq	$7, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpq	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpq	%rsi, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpq	%rsi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpq	(%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpsb	%es:(%rdi), (%rsi)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpsw	%es:(%rdi), (%rsi)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpsl	%es:(%rdi), (%rsi)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpsq	%es:(%rdi), (%rsi)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchgb	%cl, %bl
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchgb	%cl, (%rbx)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchgw	%cx, %bx
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchgw	%cx, (%rbx)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchgl	%ecx, %ebx
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchgl	%ecx, (%rbx)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchgq	%rcx, %rbx
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchgq	%rcx, (%rbx)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cpuid
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     decb	%dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     decb	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     decw	%di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     decw	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     decl	%edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     decl	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     decq	%rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     decq	(%rax)
+# CHECK-NEXT:  -      -      -      -     12.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     divb	%dil
+# CHECK-NEXT: 0.50   0.50    -      -     12.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     divb	(%rax)
+# CHECK-NEXT:  -      -      -      -     15.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     divw	%si
+# CHECK-NEXT: 0.50   0.50    -      -     15.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     divw	(%rax)
+# CHECK-NEXT:  -      -      -      -     14.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     divl	%edx
+# CHECK-NEXT: 0.50   0.50    -      -     14.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     divl	(%rax)
+# CHECK-NEXT:  -      -      -      -     14.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     divq	%rcx
+# CHECK-NEXT: 0.50   0.50    -      -     14.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     divq	(%rax)
+# CHECK-NEXT:  -      -      -      -     12.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     idivb	%dil
+# CHECK-NEXT: 0.50   0.50    -      -     12.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     idivb	(%rax)
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     idivw	%si
+# CHECK-NEXT: 0.50   0.50    -      -     17.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     idivw	(%rax)
+# CHECK-NEXT:  -      -      -      -     25.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     idivl	%edx
+# CHECK-NEXT: 0.50   0.50    -      -     25.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     idivl	(%rax)
+# CHECK-NEXT:  -      -      -      -     14.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     idivq	%rcx
+# CHECK-NEXT: 0.50   0.50    -      -     14.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     idivq	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imulb	%dil
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imulb	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imulw	%di
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imulw	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imulw	%si, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imulw	(%rax), %di
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imulw	$511, %si, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imulw	$511, (%rax), %di
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imulw	$7, %si, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imulw	$7, (%rax), %di
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imull	%edi
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imull	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imull	%esi, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imull	(%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imull	$665536, %esi, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imull	$665536, (%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imull	$7, %esi, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imull	$7, (%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     4.00   imulq	%rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     4.00   imulq	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     4.00   imulq	%rsi, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     4.00   imulq	(%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     4.00   imulq	$665536, %rsi, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     4.00   imulq	$665536, (%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     4.00   imulq	$7, %rsi, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     4.00   imulq	$7, (%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     inb	$7, %al
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     inb	%dx, %al
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     inw	$7, %ax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     inw	%dx, %ax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     inl	$7, %eax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     inl	%dx, %eax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     incb	%dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     incb	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     incw	%di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     incw	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     incl	%edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     incl	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     incq	%rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     incq	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     insb	%dx, %es:(%rdi)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     insw	%dx, %es:(%rdi)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     insl	%dx, %es:(%rdi)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     int	$7
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     lahf
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     lodsb	(%rsi), %al
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     lodsw	(%rsi), %ax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     lodsl	(%rsi), %eax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     lodsq	(%rsi), %rax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movsb	(%rsi), %es:(%rdi)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movsw	(%rsi), %es:(%rdi)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movsl	(%rsi), %es:(%rdi)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movsq	(%rsi), %es:(%rdi)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movsbw	%al, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movzbw	%al, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movsbw	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movzbw	(%rax), %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movsbl	%al, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movzbl	%al, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movsbl	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movzbl	(%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movsbq	%al, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movzbq	%al, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movsbq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movzbq	(%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movswl	%ax, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movzwl	%ax, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movswl	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movzwl	(%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movswq	%ax, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movzwq	%ax, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movswq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movzwq	(%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movslq	%eax, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movslq	(%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   mulb	%dil
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   mulb	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   mulw	%si
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   mulw	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   mull	%edx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   mull	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     4.00   mulq	%rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     4.00   mulq	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     negb	%dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     negb	(%r8)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     negw	%si
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     negw	(%r9)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     negl	%edx
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     negl	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     negq	%rcx
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     negq	(%r10)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     nop
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     nopw	%di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     nopw	(%rcx)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     nopl	%esi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     nopl	(%r8)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     nopq	%rdx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     nopq	(%r9)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     notb	%dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     notb	(%r8)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     notw	%si
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     notw	(%r9)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     notl	%edx
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     notl	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     notq	%rcx
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     notq	(%r10)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orb	$7, %al
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orb	$7, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orb	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orb	%sil, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orb	%sil, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orb	(%rax), %dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orw	$511, %ax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orw	$511, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orw	$511, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orw	$7, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orw	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orw	%si, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orw	%si, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orw	(%rax), %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orl	$665536, %eax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orl	$665536, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orl	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orl	$7, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orl	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orl	%esi, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orl	%esi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orl	(%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orq	$665536, %rax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orq	$665536, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orq	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orq	$7, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orq	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orq	%rsi, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orq	%rsi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orq	(%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     outb	%al, $7
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     outb	%al, %dx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     outw	%ax, $7
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     outw	%ax, %dx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     outl	%eax, $7
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     outl	%eax, %dx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     outsb	(%rsi), %dx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     outsw	(%rsi), %dx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     outsl	(%rsi), %dx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     pause
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclb	%dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrb	%dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclb	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrb	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclb	$7, %dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrb	$7, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclb	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrb	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclb	%cl, %dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrb	%cl, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclb	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrb	%cl, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclw	%di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrw	%di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclw	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrw	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclw	$7, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrw	$7, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclw	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrw	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclw	%cl, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrw	%cl, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclw	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrw	%cl, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcll	%edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrl	%edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcll	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrl	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcll	$7, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrl	$7, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcll	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrl	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcll	%cl, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrl	%cl, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcll	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrl	%cl, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclq	%rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrq	%rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclq	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrq	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclq	$7, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrq	$7, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclq	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrq	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclq	%cl, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrq	%cl, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclq	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrq	%cl, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolb	%dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorb	%dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolb	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorb	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolb	$7, %dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorb	$7, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolb	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorb	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolb	%cl, %dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorb	%cl, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolb	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorb	%cl, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolw	%di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorw	%di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolw	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorw	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolw	$7, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorw	$7, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolw	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorw	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolw	%cl, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorw	%cl, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolw	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorw	%cl, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     roll	%edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorl	%edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     roll	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorl	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     roll	$7, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorl	$7, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     roll	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorl	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     roll	%cl, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorl	%cl, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     roll	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorl	%cl, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolq	%rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorq	%rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolq	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorq	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolq	$7, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorq	$7, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolq	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorq	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolq	%cl, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorq	%cl, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolq	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorq	%cl, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sahf
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarb	%dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlb	%dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrb	%dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarb	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlb	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrb	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarb	$7, %dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlb	$7, %dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrb	$7, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarb	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlb	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrb	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarb	%cl, %dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlb	%cl, %dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrb	%cl, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarb	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlb	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrb	%cl, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarw	%di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlw	%di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrw	%di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarw	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlw	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrw	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarw	$7, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlw	$7, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrw	$7, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarw	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlw	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrw	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarw	%cl, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlw	%cl, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrw	%cl, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarw	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlw	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrw	%cl, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarl	%edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shll	%edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrl	%edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarl	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shll	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrl	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarl	$7, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shll	$7, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrl	$7, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarl	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shll	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrl	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarl	%cl, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shll	%cl, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrl	%cl, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarl	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shll	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrl	%cl, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarq	%rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlq	%rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrq	%rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarq	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlq	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrq	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarq	$7, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlq	$7, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrq	$7, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarq	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlq	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrq	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarq	%cl, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlq	%cl, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrq	%cl, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarq	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlq	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrq	%cl, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbb	$7, %al
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbb	$7, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbb	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbb	%sil, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbb	%sil, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbb	(%rax), %dil
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbw	$511, %ax
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbw	$511, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbw	$511, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbw	$7, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbw	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbw	%si, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbw	%si, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbw	(%rax), %di
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbl	$665536, %eax
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbl	$665536, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbl	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbl	$7, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbl	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbl	%esi, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbl	%esi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbl	(%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbq	$665536, %rax
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbq	$665536, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbq	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbq	$7, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbq	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbq	%rsi, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbq	%rsi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbq	(%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     scasb	%es:(%rdi), %al
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     scasw	%es:(%rdi), %ax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     scasl	%es:(%rdi), %eax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     scasq	%es:(%rdi), %rax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     seto	%al
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     seto	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setno	%al
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setno	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setb	%al
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setb	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setae	%al
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setae	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sete	%al
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sete	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setne	%al
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setne	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     seta	%al
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     seta	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setbe	%al
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setbe	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sets	%al
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sets	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setns	%al
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setns	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setp	%al
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setp	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setnp	%al
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setnp	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setl	%al
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     setl	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setge	%al
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     setge	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setg	%al
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     setg	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setle	%al
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     setle	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     4.00   4.00    -      -      -      -      -      -      -      -      -      -      -      -      -     shldw	%cl, %si, %di
+# CHECK-NEXT:  -      -      -      -      -     4.00   4.00    -      -      -      -      -      -      -      -      -      -      -      -      -     shrdw	%cl, %si, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -      -     shldw	%cl, %si, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -      -     shrdw	%cl, %si, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     3.00   3.00    -      -      -      -      -      -      -      -      -      -      -      -      -     shldw	$7, %si, %di
+# CHECK-NEXT:  -      -      -      -      -     3.00   3.00    -      -      -      -      -      -      -      -      -      -      -      -      -     shrdw	$7, %si, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -      -     shldw	$7, %si, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -      -     shrdw	$7, %si, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     4.00   4.00    -      -      -      -      -      -      -      -      -      -      -      -      -     shldl	%cl, %esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     4.00   4.00    -      -      -      -      -      -      -      -      -      -      -      -      -     shrdl	%cl, %esi, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -      -     shldl	%cl, %esi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -      -     shrdl	%cl, %esi, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     3.00   3.00    -      -      -      -      -      -      -      -      -      -      -      -      -     shldl	$7, %esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     3.00   3.00    -      -      -      -      -      -      -      -      -      -      -      -      -     shrdl	$7, %esi, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -      -     shldl	$7, %esi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -      -     shrdl	$7, %esi, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     4.00   4.00    -      -      -      -      -      -      -      -      -      -      -      -      -     shldq	%cl, %rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     4.00   4.00    -      -      -      -      -      -      -      -      -      -      -      -      -     shrdq	%cl, %rsi, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -      -     shldq	%cl, %rsi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -      -     shrdq	%cl, %rsi, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     3.00   3.00    -      -      -      -      -      -      -      -      -      -      -      -      -     shldq	$7, %rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     3.00   3.00    -      -      -      -      -      -      -      -      -      -      -      -      -     shrdq	$7, %rsi, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -      -     shldq	$7, %rsi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -      -     shrdq	$7, %rsi, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     stc
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     std
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     stosb	%al, %es:(%rdi)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     stosw	%ax, %es:(%rdi)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     stosl	%eax, %es:(%rdi)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     stosq	%rax, %es:(%rdi)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subb	$7, %al
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subb	$7, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subb	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subb	%sil, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subb	%sil, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subb	(%rax), %dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subw	$511, %ax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subw	$511, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subw	$511, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subw	$7, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subw	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subw	%si, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subw	%si, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subw	(%rax), %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subl	$665536, %eax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subl	$665536, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subl	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subl	$7, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subl	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subl	%esi, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subl	%esi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subl	(%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subq	$665536, %rax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subq	$665536, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subq	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subq	$7, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subq	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subq	%rsi, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subq	%rsi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subq	(%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testb	$7, %al
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testb	$7, %dil
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testb	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testb	%sil, %dil
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testb	%sil, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testw	$511, %ax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testw	$511, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testw	$511, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testw	$7, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testw	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testw	%si, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testw	%si, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testl	$665536, %eax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testl	$665536, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testl	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testl	$7, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testl	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testl	%esi, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testl	%esi, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testq	$665536, %rax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testq	$665536, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testq	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testq	$7, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testq	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testq	%rsi, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testq	%rsi, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     ud2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xaddb	%bl, %cl
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xaddb	%bl, (%rcx)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xaddw	%bx, %cx
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xaddw	%ax, (%rbx)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xaddl	%ebx, %ecx
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xaddl	%eax, (%rbx)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xaddq	%rbx, %rcx
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xaddq	%rax, (%rbx)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xchgb	%bl, %cl
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xchgb	%bl, (%rbx)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xchgw	%bx, %ax
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xchgw	%bx, %cx
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xchgw	%ax, (%rbx)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xchgl	%ebx, %eax
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xchgl	%ebx, %ecx
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xchgl	%eax, (%rbx)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xchgq	%rbx, %rax
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xchgq	%rbx, %rcx
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xchgq	%rax, (%rbx)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xlatb
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorb	$7, %al
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorb	$7, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorb	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorb	%sil, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorb	%sil, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorb	(%rax), %dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorw	$511, %ax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorw	$511, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorw	$511, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorw	$7, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorw	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorw	%si, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorw	%si, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorw	(%rax), %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorl	$665536, %eax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorl	$665536, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorl	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorl	$7, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorl	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorl	%esi, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorl	%esi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorl	(%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorq	$665536, %rax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorq	$665536, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorq	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorq	$7, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorq	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorq	%rsi, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorq	%rsi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorq	(%rax), %rdi
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-x87.s b/test/tools/llvm-mca/X86/BdVer2/resources-x87.s
index 1cba9a7d77f..f64944cb112 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-x87.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-x87.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
 
 f2xm1
 
@@ -206,316 +206,328 @@ fyl2xp1
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      100   0.33                  U     f2xm1
+# CHECK-NEXT:  1      100   0.50                  U     f2xm1
 # CHECK-NEXT:  1      1     1.00                  U     fabs
-# CHECK-NEXT:  1      3     1.00                  U     fadd	%st(0), %st(1)
-# CHECK-NEXT:  1      3     1.00                  U     fadd	%st(2)
-# CHECK-NEXT:  2      10    1.00    *             U     fadds	(%ecx)
-# CHECK-NEXT:  2      10    1.00    *             U     faddl	(%ecx)
-# CHECK-NEXT:  1      3     1.00                  U     faddp	%st(1)
-# CHECK-NEXT:  1      3     1.00                  U     faddp	%st(2)
-# CHECK-NEXT:  3      13    2.00    *             U     fiadds	(%ecx)
-# CHECK-NEXT:  3      13    2.00    *             U     fiaddl	(%ecx)
-# CHECK-NEXT:  1      100   0.33                  U     fbld	(%ecx)
-# CHECK-NEXT:  1      100   0.33                  U     fbstp	(%eax)
+# CHECK-NEXT:  1      5     1.00                  U     fadd	%st(0), %st(1)
+# CHECK-NEXT:  1      5     1.00                  U     fadd	%st(2)
+# CHECK-NEXT:  1      10    1.00    *             U     fadds	(%ecx)
+# CHECK-NEXT:  1      10    1.00    *             U     faddl	(%ecx)
+# CHECK-NEXT:  1      5     1.00                  U     faddp	%st(1)
+# CHECK-NEXT:  1      5     1.00                  U     faddp	%st(2)
+# CHECK-NEXT:  1      10    1.00    *             U     fiadds	(%ecx)
+# CHECK-NEXT:  1      10    1.00    *             U     fiaddl	(%ecx)
+# CHECK-NEXT:  1      100   0.50                  U     fbld	(%ecx)
+# CHECK-NEXT:  1      100   0.50                  U     fbstp	(%eax)
 # CHECK-NEXT:  1      1     1.00                  U     fchs
-# CHECK-NEXT:  1      100   0.33                  U     fnclex
-# CHECK-NEXT:  3      3     2.00                  U     fcmovb	%st(1), %st(0)
-# CHECK-NEXT:  3      3     2.00                  U     fcmovbe	%st(1), %st(0)
-# CHECK-NEXT:  3      3     2.00                  U     fcmove	%st(1), %st(0)
-# CHECK-NEXT:  3      3     2.00                  U     fcmovnb	%st(1), %st(0)
-# CHECK-NEXT:  3      3     2.00                  U     fcmovnbe	%st(1), %st(0)
-# CHECK-NEXT:  3      3     2.00                  U     fcmovne	%st(1), %st(0)
-# CHECK-NEXT:  3      3     2.00                  U     fcmovnu	%st(1), %st(0)
-# CHECK-NEXT:  3      3     2.00                  U     fcmovu	%st(1), %st(0)
-# CHECK-NEXT:  1      1     1.00                  U     fcom	%st(1)
-# CHECK-NEXT:  1      1     1.00                  U     fcom	%st(3)
-# CHECK-NEXT:  2      8     1.00                  U     fcoms	(%ecx)
-# CHECK-NEXT:  2      8     1.00                  U     fcoml	(%eax)
-# CHECK-NEXT:  1      1     1.00                  U     fcomp	%st(1)
-# CHECK-NEXT:  1      1     1.00                  U     fcomp	%st(3)
-# CHECK-NEXT:  2      8     1.00                  U     fcomps	(%ecx)
-# CHECK-NEXT:  2      8     1.00                  U     fcompl	(%eax)
-# CHECK-NEXT:  1      100   0.33                  U     fcompp
-# CHECK-NEXT:  3      3     1.00                  U     fcomi	%st(3)
-# CHECK-NEXT:  3      3     1.00                  U     fcompi	%st(3)
-# CHECK-NEXT:  1      100   0.33                  U     fcos
-# CHECK-NEXT:  1      1     1.00                  U     fdecstp
-# CHECK-NEXT:  1      14    14.00                 U     fdiv	%st(0), %st(1)
-# CHECK-NEXT:  1      14    14.00                 U     fdiv	%st(2)
-# CHECK-NEXT:  2      31    1.00    *             U     fdivs	(%ecx)
-# CHECK-NEXT:  2      31    1.00    *             U     fdivl	(%eax)
-# CHECK-NEXT:  1      14    14.00                 U     fdivp	%st(1)
-# CHECK-NEXT:  1      14    14.00                 U     fdivp	%st(2)
-# CHECK-NEXT:  3      34    1.00    *             U     fidivs	(%ecx)
-# CHECK-NEXT:  3      34    1.00    *             U     fidivl	(%eax)
-# CHECK-NEXT:  1      14    14.00                 U     fdivr	%st(0), %st(1)
-# CHECK-NEXT:  1      14    14.00                 U     fdivr	%st(2)
-# CHECK-NEXT:  2      31    1.00    *             U     fdivrs	(%ecx)
-# CHECK-NEXT:  2      31    1.00    *             U     fdivrl	(%eax)
-# CHECK-NEXT:  1      14    14.00                 U     fdivrp	%st(1)
-# CHECK-NEXT:  1      14    14.00                 U     fdivrp	%st(2)
-# CHECK-NEXT:  3      34    1.00    *             U     fidivrs	(%ecx)
-# CHECK-NEXT:  3      34    1.00    *             U     fidivrl	(%eax)
-# CHECK-NEXT:  1      1     1.00                  U     ffree	%st(0)
-# CHECK-NEXT:  3      11    2.00                  U     ficoms	(%ecx)
-# CHECK-NEXT:  3      11    2.00                  U     ficoml	(%eax)
-# CHECK-NEXT:  3      11    2.00                  U     ficomps	(%ecx)
-# CHECK-NEXT:  3      11    2.00                  U     ficompl	(%eax)
-# CHECK-NEXT:  2      10    1.00    *             U     filds	(%edx)
-# CHECK-NEXT:  2      10    1.00    *             U     fildl	(%ecx)
-# CHECK-NEXT:  2      10    1.00    *             U     fildll	(%eax)
-# CHECK-NEXT:  1      1     1.00                  U     fincstp
-# CHECK-NEXT:  4      5     1.33                  U     fninit
-# CHECK-NEXT:  4      9     1.00           *      U     fists	(%edx)
-# CHECK-NEXT:  4      9     1.00           *      U     fistl	(%ecx)
-# CHECK-NEXT:  4      9     1.00           *      U     fistps	(%edx)
-# CHECK-NEXT:  4      9     1.00           *      U     fistpl	(%ecx)
-# CHECK-NEXT:  4      9     1.00           *      U     fistpll	(%eax)
-# CHECK-NEXT:  3      5     1.00           *      U     fisttps	(%edx)
-# CHECK-NEXT:  3      5     1.00           *      U     fisttpl	(%ecx)
-# CHECK-NEXT:  3      5     1.00           *      U     fisttpll	(%eax)
-# CHECK-NEXT:  1      1     1.00                  U     fld	%st(0)
-# CHECK-NEXT:  3      9     1.00    *             U     flds	(%edx)
-# CHECK-NEXT:  3      9     1.00    *             U     fldl	(%ecx)
-# CHECK-NEXT:  3      9     1.00    *             U     fldt	(%eax)
-# CHECK-NEXT:  5      8     2.00    *             U     fldcw	(%eax)
-# CHECK-NEXT:  1      100   0.33                  U     fldenv	(%eax)
-# CHECK-NEXT:  2      1     1.00                  U     fld1
-# CHECK-NEXT:  2      1     1.00                  U     fldl2e
-# CHECK-NEXT:  2      1     1.00                  U     fldl2t
-# CHECK-NEXT:  2      1     1.00                  U     fldlg2
-# CHECK-NEXT:  2      1     1.00                  U     fldln2
-# CHECK-NEXT:  2      1     1.00                  U     fldpi
-# CHECK-NEXT:  1      1     1.00                  U     fldz
+# CHECK-NEXT:  1      100   0.50                  U     fnclex
+# CHECK-NEXT:  1      1     1.00                  U     fcmovb	%st(1), %st(0)
+# CHECK-NEXT:  1      1     1.00                  U     fcmovbe	%st(1), %st(0)
+# CHECK-NEXT:  1      1     1.00                  U     fcmove	%st(1), %st(0)
+# CHECK-NEXT:  1      1     1.00                  U     fcmovnb	%st(1), %st(0)
+# CHECK-NEXT:  1      1     1.00                  U     fcmovnbe	%st(1), %st(0)
+# CHECK-NEXT:  1      1     1.00                  U     fcmovne	%st(1), %st(0)
+# CHECK-NEXT:  1      1     1.00                  U     fcmovnu	%st(1), %st(0)
+# CHECK-NEXT:  1      1     1.00                  U     fcmovu	%st(1), %st(0)
+# CHECK-NEXT:  2      1     1.00                  U     fcom	%st(1)
+# CHECK-NEXT:  2      1     1.00                  U     fcom	%st(3)
+# CHECK-NEXT:  1      6     1.00                  U     fcoms	(%ecx)
+# CHECK-NEXT:  1      6     1.00                  U     fcoml	(%eax)
+# CHECK-NEXT:  2      1     1.00                  U     fcomp	%st(1)
+# CHECK-NEXT:  2      1     1.00                  U     fcomp	%st(3)
+# CHECK-NEXT:  1      6     1.00                  U     fcomps	(%ecx)
+# CHECK-NEXT:  1      6     1.00                  U     fcompl	(%eax)
+# CHECK-NEXT:  1      100   0.50                  U     fcompp
+# CHECK-NEXT:  2      1     1.00                  U     fcomi	%st(3)
+# CHECK-NEXT:  2      1     1.00                  U     fcompi	%st(3)
+# CHECK-NEXT:  1      100   0.50                  U     fcos
+# CHECK-NEXT:  1      100   0.50                  U     fdecstp
+# CHECK-NEXT:  1      9     9.50                  U     fdiv	%st(0), %st(1)
+# CHECK-NEXT:  1      9     9.50                  U     fdiv	%st(2)
+# CHECK-NEXT:  1      14    9.50    *             U     fdivs	(%ecx)
+# CHECK-NEXT:  1      14    9.50    *             U     fdivl	(%eax)
+# CHECK-NEXT:  1      9     9.50                  U     fdivp	%st(1)
+# CHECK-NEXT:  1      9     9.50                  U     fdivp	%st(2)
+# CHECK-NEXT:  1      14    9.50    *             U     fidivs	(%ecx)
+# CHECK-NEXT:  1      14    9.50    *             U     fidivl	(%eax)
+# CHECK-NEXT:  1      9     9.50                  U     fdivr	%st(0), %st(1)
+# CHECK-NEXT:  1      9     9.50                  U     fdivr	%st(2)
+# CHECK-NEXT:  1      14    9.50    *             U     fdivrs	(%ecx)
+# CHECK-NEXT:  1      14    9.50    *             U     fdivrl	(%eax)
+# CHECK-NEXT:  1      9     9.50                  U     fdivrp	%st(1)
+# CHECK-NEXT:  1      9     9.50                  U     fdivrp	%st(2)
+# CHECK-NEXT:  1      14    9.50    *             U     fidivrs	(%ecx)
+# CHECK-NEXT:  1      14    9.50    *             U     fidivrl	(%eax)
+# CHECK-NEXT:  1      100   0.50                  U     ffree	%st(0)
+# CHECK-NEXT:  2      6     1.00                  U     ficoms	(%ecx)
+# CHECK-NEXT:  2      6     1.00                  U     ficoml	(%eax)
+# CHECK-NEXT:  2      6     1.00                  U     ficomps	(%ecx)
+# CHECK-NEXT:  2      6     1.00                  U     ficompl	(%eax)
+# CHECK-NEXT:  1      5     0.50    *             U     filds	(%edx)
+# CHECK-NEXT:  1      5     0.50    *             U     fildl	(%ecx)
+# CHECK-NEXT:  1      5     0.50    *             U     fildll	(%eax)
+# CHECK-NEXT:  1      100   0.50                  U     fincstp
+# CHECK-NEXT:  1      100   0.50                  U     fninit
+# CHECK-NEXT:  1      1     0.50           *      U     fists	(%edx)
+# CHECK-NEXT:  1      1     0.50           *      U     fistl	(%ecx)
+# CHECK-NEXT:  1      1     0.50           *      U     fistps	(%edx)
+# CHECK-NEXT:  1      1     0.50           *      U     fistpl	(%ecx)
+# CHECK-NEXT:  1      1     0.50           *      U     fistpll	(%eax)
+# CHECK-NEXT:  1      1     0.50           *      U     fisttps	(%edx)
+# CHECK-NEXT:  1      1     0.50           *      U     fisttpl	(%ecx)
+# CHECK-NEXT:  1      1     0.50           *      U     fisttpll	(%eax)
+# CHECK-NEXT:  1      1     0.50                  U     fld	%st(0)
+# CHECK-NEXT:  1      5     0.50    *             U     flds	(%edx)
+# CHECK-NEXT:  1      5     0.50    *             U     fldl	(%ecx)
+# CHECK-NEXT:  1      5     0.50    *             U     fldt	(%eax)
+# CHECK-NEXT:  1      5     0.50    *             U     fldcw	(%eax)
+# CHECK-NEXT:  1      100   0.50                  U     fldenv	(%eax)
+# CHECK-NEXT:  1      3     1.00                  U     fld1
+# CHECK-NEXT:  1      3     1.00                  U     fldl2e
+# CHECK-NEXT:  1      3     1.00                  U     fldl2t
+# CHECK-NEXT:  1      3     1.00                  U     fldlg2
+# CHECK-NEXT:  1      3     1.00                  U     fldln2
+# CHECK-NEXT:  1      3     1.00                  U     fldpi
+# CHECK-NEXT:  1      3     1.00                  U     fldz
 # CHECK-NEXT:  1      5     1.00                  U     fmul	%st(0), %st(1)
 # CHECK-NEXT:  1      5     1.00                  U     fmul	%st(2)
-# CHECK-NEXT:  2      12    1.00    *             U     fmuls	(%ecx)
-# CHECK-NEXT:  2      12    1.00    *             U     fmull	(%eax)
+# CHECK-NEXT:  1      10    1.00    *             U     fmuls	(%ecx)
+# CHECK-NEXT:  1      10    1.00    *             U     fmull	(%eax)
 # CHECK-NEXT:  1      5     1.00                  U     fmulp	%st(1)
 # CHECK-NEXT:  1      5     1.00                  U     fmulp	%st(2)
-# CHECK-NEXT:  3      15    1.00    *             U     fimuls	(%ecx)
-# CHECK-NEXT:  3      15    1.00    *             U     fimull	(%eax)
-# CHECK-NEXT:  1      1     1.00                  U     fnop
-# CHECK-NEXT:  1      100   0.33                  U     fpatan
-# CHECK-NEXT:  1      100   0.33                  U     fprem
-# CHECK-NEXT:  1      100   0.33                  U     fprem1
-# CHECK-NEXT:  1      100   0.33                  U     fptan
-# CHECK-NEXT:  1      100   0.33                  U     frndint
-# CHECK-NEXT:  1      100   0.33                  U     frstor	(%eax)
-# CHECK-NEXT:  1      100   0.33                  U     fnsave	(%eax)
-# CHECK-NEXT:  1      100   0.33                  U     fscale
-# CHECK-NEXT:  1      100   0.33                  U     fsin
-# CHECK-NEXT:  1      100   0.33                  U     fsincos
-# CHECK-NEXT:  1      24    24.00                 U     fsqrt
-# CHECK-NEXT:  1      1     1.00                  U     fst	%st(0)
-# CHECK-NEXT:  3      6     1.00           *      U     fsts	(%edx)
-# CHECK-NEXT:  3      6     1.00           *      U     fstl	(%ecx)
-# CHECK-NEXT:  1      1     1.00                  U     fstp	%st(0)
-# CHECK-NEXT:  3      6     1.00           *      U     fstpl	(%edx)
-# CHECK-NEXT:  3      6     1.00           *      U     fstpl	(%ecx)
-# CHECK-NEXT:  3      6     1.00           *      U     fstpt	(%eax)
-# CHECK-NEXT:  4      7     1.00           *      U     fnstcw	(%eax)
-# CHECK-NEXT:  1      100   0.33                  U     fnstenv	(%eax)
-# CHECK-NEXT:  4      7     1.00                  U     fnstsw	(%eax)
-# CHECK-NEXT:  1      100   0.33                  U     frstor	(%eax)
-# CHECK-NEXT:  1      100   0.33                  U     wait
-# CHECK-NEXT:  1      100   0.33                  U     fnsave	(%eax)
-# CHECK-NEXT:  1      3     1.00                  U     fsub	%st(0), %st(1)
-# CHECK-NEXT:  1      3     1.00                  U     fsub	%st(2)
-# CHECK-NEXT:  2      10    1.00    *             U     fsubs	(%ecx)
-# CHECK-NEXT:  2      10    1.00    *             U     fsubl	(%eax)
-# CHECK-NEXT:  1      3     1.00                  U     fsubp	%st(1)
-# CHECK-NEXT:  1      3     1.00                  U     fsubp	%st(2)
-# CHECK-NEXT:  3      13    2.00    *             U     fisubs	(%ecx)
-# CHECK-NEXT:  3      13    2.00    *             U     fisubl	(%eax)
-# CHECK-NEXT:  1      3     1.00                  U     fsubr	%st(0), %st(1)
-# CHECK-NEXT:  1      3     1.00                  U     fsubr	%st(2)
-# CHECK-NEXT:  2      10    1.00    *             U     fsubrs	(%ecx)
-# CHECK-NEXT:  2      10    1.00    *             U     fsubrl	(%eax)
-# CHECK-NEXT:  1      3     1.00                  U     fsubrp	%st(1)
-# CHECK-NEXT:  1      3     1.00                  U     fsubrp	%st(2)
-# CHECK-NEXT:  3      13    2.00    *             U     fisubrs	(%ecx)
-# CHECK-NEXT:  3      13    2.00    *             U     fisubrl	(%eax)
-# CHECK-NEXT:  1      3     1.00                  U     ftst
-# CHECK-NEXT:  1      1     1.00                  U     fucom	%st(1)
-# CHECK-NEXT:  1      1     1.00                  U     fucom	%st(3)
-# CHECK-NEXT:  1      1     1.00                  U     fucomp	%st(1)
-# CHECK-NEXT:  1      1     1.00                  U     fucomp	%st(3)
-# CHECK-NEXT:  1      3     1.00                  U     fucompp
-# CHECK-NEXT:  3      3     1.00                  U     fucomi	%st(3)
-# CHECK-NEXT:  3      3     1.00                  U     fucompi	%st(3)
-# CHECK-NEXT:  1      100   0.33                  U     wait
-# CHECK-NEXT:  1      100   0.33                  U     fxam
-# CHECK-NEXT:  1      1     0.33                  U     fxch	%st(1)
-# CHECK-NEXT:  1      1     0.33                  U     fxch	%st(3)
-# CHECK-NEXT:  5      5     2.00    *      *      U     fxrstor	(%eax)
-# CHECK-NEXT:  1      100   0.33    *      *      U     fxsave	(%eax)
-# CHECK-NEXT:  1      100   0.33                  U     fxtract
-# CHECK-NEXT:  1      100   0.33                  U     fyl2x
-# CHECK-NEXT:  1      100   0.33                  U     fyl2xp1
+# CHECK-NEXT:  1      10    1.00    *             U     fimuls	(%ecx)
+# CHECK-NEXT:  1      10    1.00    *             U     fimull	(%eax)
+# CHECK-NEXT:  1      1     0.50                  U     fnop
+# CHECK-NEXT:  1      100   0.50                  U     fpatan
+# CHECK-NEXT:  1      100   0.50                  U     fprem
+# CHECK-NEXT:  1      100   0.50                  U     fprem1
+# CHECK-NEXT:  1      100   0.50                  U     fptan
+# CHECK-NEXT:  1      100   0.50                  U     frndint
+# CHECK-NEXT:  1      100   0.50                  U     frstor	(%eax)
+# CHECK-NEXT:  1      100   0.50                  U     fnsave	(%eax)
+# CHECK-NEXT:  1      100   0.50                  U     fscale
+# CHECK-NEXT:  1      100   0.50                  U     fsin
+# CHECK-NEXT:  1      100   0.50                  U     fsincos
+# CHECK-NEXT:  1      1     17.50                 U     fsqrt
+# CHECK-NEXT:  1      1     0.50                  U     fst	%st(0)
+# CHECK-NEXT:  1      1     0.50           *      U     fsts	(%edx)
+# CHECK-NEXT:  1      1     0.50           *      U     fstl	(%ecx)
+# CHECK-NEXT:  1      1     0.50                  U     fstp	%st(0)
+# CHECK-NEXT:  1      1     0.50           *      U     fstpl	(%edx)
+# CHECK-NEXT:  1      1     0.50           *      U     fstpl	(%ecx)
+# CHECK-NEXT:  1      1     0.50           *      U     fstpt	(%eax)
+# CHECK-NEXT:  1      1     0.50           *      U     fnstcw	(%eax)
+# CHECK-NEXT:  1      100   0.50                  U     fnstenv	(%eax)
+# CHECK-NEXT:  1      100   0.50                  U     fnstsw	(%eax)
+# CHECK-NEXT:  1      100   0.50                  U     frstor	(%eax)
+# CHECK-NEXT:  1      100   0.50                  U     wait
+# CHECK-NEXT:  1      100   0.50                  U     fnsave	(%eax)
+# CHECK-NEXT:  1      5     1.00                  U     fsub	%st(0), %st(1)
+# CHECK-NEXT:  1      5     1.00                  U     fsub	%st(2)
+# CHECK-NEXT:  1      10    1.00    *             U     fsubs	(%ecx)
+# CHECK-NEXT:  1      10    1.00    *             U     fsubl	(%eax)
+# CHECK-NEXT:  1      5     1.00                  U     fsubp	%st(1)
+# CHECK-NEXT:  1      5     1.00                  U     fsubp	%st(2)
+# CHECK-NEXT:  1      10    1.00    *             U     fisubs	(%ecx)
+# CHECK-NEXT:  1      10    1.00    *             U     fisubl	(%eax)
+# CHECK-NEXT:  1      5     1.00                  U     fsubr	%st(0), %st(1)
+# CHECK-NEXT:  1      5     1.00                  U     fsubr	%st(2)
+# CHECK-NEXT:  1      10    1.00    *             U     fsubrs	(%ecx)
+# CHECK-NEXT:  1      10    1.00    *             U     fsubrl	(%eax)
+# CHECK-NEXT:  1      5     1.00                  U     fsubrp	%st(1)
+# CHECK-NEXT:  1      5     1.00                  U     fsubrp	%st(2)
+# CHECK-NEXT:  1      10    1.00    *             U     fisubrs	(%ecx)
+# CHECK-NEXT:  1      10    1.00    *             U     fisubrl	(%eax)
+# CHECK-NEXT:  1      1     1.00                  U     ftst
+# CHECK-NEXT:  2      1     1.00                  U     fucom	%st(1)
+# CHECK-NEXT:  2      1     1.00                  U     fucom	%st(3)
+# CHECK-NEXT:  2      1     1.00                  U     fucomp	%st(1)
+# CHECK-NEXT:  2      1     1.00                  U     fucomp	%st(3)
+# CHECK-NEXT:  1      1     1.00                  U     fucompp
+# CHECK-NEXT:  2      1     1.00                  U     fucomi	%st(3)
+# CHECK-NEXT:  2      1     1.00                  U     fucompi	%st(3)
+# CHECK-NEXT:  1      100   0.50                  U     wait
+# CHECK-NEXT:  1      100   0.50                  U     fxam
+# CHECK-NEXT:  1      1     0.50                  U     fxch	%st(1)
+# CHECK-NEXT:  1      1     0.50                  U     fxch	%st(3)
+# CHECK-NEXT:  1      100   0.50    *      *      U     fxrstor	(%eax)
+# CHECK-NEXT:  1      100   0.50    *      *      U     fxsave	(%eax)
+# CHECK-NEXT:  1      100   0.50                  U     fxtract
+# CHECK-NEXT:  1      100   0.50                  U     fyl2x
+# CHECK-NEXT:  1      100   0.50                  U     fyl2xp1
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -     136.00 52.67  90.67  17.00  54.67  34.00  34.00
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 24.00  24.00   -      -      -     36.00  20.00   -     201.50 201.50  -      -      -     7.00   48.00  40.00   -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     f2xm1
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     fabs
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     fadd	%st(0), %st(1)
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     fadd	%st(2)
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   fadds	(%ecx)
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   faddl	(%ecx)
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     faddp	%st(1)
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     faddp	%st(2)
-# CHECK-NEXT:  -      -      -     2.00    -      -     0.50   0.50   fiadds	(%ecx)
-# CHECK-NEXT:  -      -      -     2.00    -      -     0.50   0.50   fiaddl	(%ecx)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fbld	(%ecx)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fbstp	(%eax)
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     fchs
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fnclex
-# CHECK-NEXT:  -      -     0.50    -      -     2.50    -      -     fcmovb	%st(1), %st(0)
-# CHECK-NEXT:  -      -     0.50    -      -     2.50    -      -     fcmovbe	%st(1), %st(0)
-# CHECK-NEXT:  -      -     0.50    -      -     2.50    -      -     fcmove	%st(1), %st(0)
-# CHECK-NEXT:  -      -     0.50    -      -     2.50    -      -     fcmovnb	%st(1), %st(0)
-# CHECK-NEXT:  -      -     0.50    -      -     2.50    -      -     fcmovnbe	%st(1), %st(0)
-# CHECK-NEXT:  -      -     0.50    -      -     2.50    -      -     fcmovne	%st(1), %st(0)
-# CHECK-NEXT:  -      -     0.50    -      -     2.50    -      -     fcmovnu	%st(1), %st(0)
-# CHECK-NEXT:  -      -     0.50    -      -     2.50    -      -     fcmovu	%st(1), %st(0)
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     fcom	%st(1)
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     fcom	%st(3)
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   fcoms	(%ecx)
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   fcoml	(%eax)
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     fcomp	%st(1)
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     fcomp	%st(3)
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   fcomps	(%ecx)
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   fcompl	(%eax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fcompp
-# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -     fcomi	%st(3)
-# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -     fcompi	%st(3)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fcos
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     fdecstp
-# CHECK-NEXT:  -     14.00  1.00    -      -      -      -      -     fdiv	%st(0), %st(1)
-# CHECK-NEXT:  -     14.00  1.00    -      -      -      -      -     fdiv	%st(2)
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   fdivs	(%ecx)
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   fdivl	(%eax)
-# CHECK-NEXT:  -     14.00  1.00    -      -      -      -      -     fdivp	%st(1)
-# CHECK-NEXT:  -     14.00  1.00    -      -      -      -      -     fdivp	%st(2)
-# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   fidivs	(%ecx)
-# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   fidivl	(%eax)
-# CHECK-NEXT:  -     14.00  1.00    -      -      -      -      -     fdivr	%st(0), %st(1)
-# CHECK-NEXT:  -     14.00  1.00    -      -      -      -      -     fdivr	%st(2)
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   fdivrs	(%ecx)
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   fdivrl	(%eax)
-# CHECK-NEXT:  -     14.00  1.00    -      -      -      -      -     fdivrp	%st(1)
-# CHECK-NEXT:  -     14.00  1.00    -      -      -      -      -     fdivrp	%st(2)
-# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   fidivrs	(%ecx)
-# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   fidivrl	(%eax)
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     ffree	%st(0)
-# CHECK-NEXT:  -      -      -     2.00    -      -     0.50   0.50   ficoms	(%ecx)
-# CHECK-NEXT:  -      -      -     2.00    -      -     0.50   0.50   ficoml	(%eax)
-# CHECK-NEXT:  -      -      -     2.00    -      -     0.50   0.50   ficomps	(%ecx)
-# CHECK-NEXT:  -      -      -     2.00    -      -     0.50   0.50   ficompl	(%eax)
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   filds	(%edx)
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   fildl	(%ecx)
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   fildll	(%eax)
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     fincstp
-# CHECK-NEXT:  -      -     1.00   1.00    -     2.00    -      -     fninit
-# CHECK-NEXT:  -      -      -     1.00   1.00    -     1.00   1.00   fists	(%edx)
-# CHECK-NEXT:  -      -      -     1.00   1.00    -     1.00   1.00   fistl	(%ecx)
-# CHECK-NEXT:  -      -      -     1.00   1.00    -     1.00   1.00   fistps	(%edx)
-# CHECK-NEXT:  -      -      -     1.00   1.00    -     1.00   1.00   fistpl	(%ecx)
-# CHECK-NEXT:  -      -      -     1.00   1.00    -     1.00   1.00   fistpll	(%eax)
-# CHECK-NEXT:  -      -      -     1.00   1.00    -     0.50   0.50   fisttps	(%edx)
-# CHECK-NEXT:  -      -      -     1.00   1.00    -     0.50   0.50   fisttpl	(%ecx)
-# CHECK-NEXT:  -      -      -     1.00   1.00    -     0.50   0.50   fisttpll	(%eax)
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     fld	%st(0)
-# CHECK-NEXT:  -      -     0.50   0.50    -     1.00   0.50   0.50   flds	(%edx)
-# CHECK-NEXT:  -      -     0.50   0.50    -     1.00   0.50   0.50   fldl	(%ecx)
-# CHECK-NEXT:  -      -     0.50   0.50    -     1.00   0.50   0.50   fldt	(%eax)
-# CHECK-NEXT:  -      -      -      -     1.00   2.00   1.00   1.00   fldcw	(%eax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fldenv	(%eax)
-# CHECK-NEXT:  -      -     1.00    -      -     1.00    -      -     fld1
-# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     fldl2e
-# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     fldl2t
-# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     fldlg2
-# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     fldln2
-# CHECK-NEXT:  -      -     1.00   1.00    -      -      -      -     fldpi
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     fldz
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     fmul	%st(0), %st(1)
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     fmul	%st(2)
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   fmuls	(%ecx)
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   fmull	(%eax)
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     fmulp	%st(1)
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     fmulp	%st(2)
-# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   fimuls	(%ecx)
-# CHECK-NEXT:  -      -     1.00   1.00    -      -     0.50   0.50   fimull	(%eax)
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     fnop
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fpatan
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fprem
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fprem1
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fptan
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     frndint
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     frstor	(%eax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fnsave	(%eax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fscale
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fsin
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fsincos
-# CHECK-NEXT:  -     24.00  1.00    -      -      -      -      -     fsqrt
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     fst	%st(0)
-# CHECK-NEXT:  -      -      -      -     1.00    -     1.00   1.00   fsts	(%edx)
-# CHECK-NEXT:  -      -      -      -     1.00    -     1.00   1.00   fstl	(%ecx)
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     fstp	%st(0)
-# CHECK-NEXT:  -      -      -      -     1.00    -     1.00   1.00   fstpl	(%edx)
-# CHECK-NEXT:  -      -      -      -     1.00    -     1.00   1.00   fstpl	(%ecx)
-# CHECK-NEXT:  -      -      -      -     1.00    -     1.00   1.00   fstpt	(%eax)
-# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00   1.00   fnstcw	(%eax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fnstenv	(%eax)
-# CHECK-NEXT:  -      -     1.00    -     1.00    -     1.00   1.00   fnstsw	(%eax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     frstor	(%eax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     wait
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fnsave	(%eax)
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     fsub	%st(0), %st(1)
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     fsub	%st(2)
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   fsubs	(%ecx)
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   fsubl	(%eax)
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     fsubp	%st(1)
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     fsubp	%st(2)
-# CHECK-NEXT:  -      -      -     2.00    -      -     0.50   0.50   fisubs	(%ecx)
-# CHECK-NEXT:  -      -      -     2.00    -      -     0.50   0.50   fisubl	(%eax)
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     fsubr	%st(0), %st(1)
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     fsubr	%st(2)
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   fsubrs	(%ecx)
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   fsubrl	(%eax)
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     fsubrp	%st(1)
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     fsubrp	%st(2)
-# CHECK-NEXT:  -      -      -     2.00    -      -     0.50   0.50   fisubrs	(%ecx)
-# CHECK-NEXT:  -      -      -     2.00    -      -     0.50   0.50   fisubrl	(%eax)
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     ftst
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     fucom	%st(1)
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     fucom	%st(3)
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     fucomp	%st(1)
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     fucomp	%st(3)
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     fucompp
-# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -     fucomi	%st(3)
-# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -     fucompi	%st(3)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     wait
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fxam
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fxch	%st(1)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fxch	%st(3)
-# CHECK-NEXT:  -      -     0.50   0.50   1.00   2.00   0.50   0.50   fxrstor	(%eax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fxsave	(%eax)
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fxtract
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fyl2x
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.33    -      -     fyl2xp1
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     f2xm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fabs
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fadd	%st(0), %st(1)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fadd	%st(2)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fadds	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     faddl	(%ecx)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     faddp	%st(1)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     faddp	%st(2)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fiadds	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fiaddl	(%ecx)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fbld	(%ecx)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fbstp	(%eax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fchs
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fnclex
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fcmovb	%st(1), %st(0)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fcmovbe	%st(1), %st(0)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fcmove	%st(1), %st(0)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fcmovnb	%st(1), %st(0)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fcmovnbe	%st(1), %st(0)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fcmovne	%st(1), %st(0)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fcmovnu	%st(1), %st(0)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fcmovu	%st(1), %st(0)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fcom	%st(1)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fcom	%st(3)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fcoms	(%ecx)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fcoml	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fcomp	%st(1)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fcomp	%st(3)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fcomps	(%ecx)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fcompl	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fcompp
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fcomi	%st(3)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fcompi	%st(3)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fcos
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fdecstp
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fdiv	%st(0), %st(1)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fdiv	%st(2)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fdivs	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fdivl	(%eax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fdivp	%st(1)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fdivp	%st(2)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fidivs	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fidivl	(%eax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fdivr	%st(0), %st(1)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fdivr	%st(2)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fdivrs	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fdivrl	(%eax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fdivrp	%st(1)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fdivrp	%st(2)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fidivrs	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fidivrl	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     ffree	%st(0)
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     ficoms	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     ficoml	(%eax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     ficomps	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     ficompl	(%eax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     filds	(%edx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fildl	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fildll	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fincstp
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fninit
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fists	(%edx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fistl	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fistps	(%edx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fistpl	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fistpll	(%eax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fisttps	(%edx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fisttpl	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fisttpll	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fld	%st(0)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     flds	(%edx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fldl	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fldt	(%eax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fldcw	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fldenv	(%eax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     fld1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     fldl2e
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     fldl2t
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     fldlg2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     fldln2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     fldpi
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     fldz
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fmul	%st(0), %st(1)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fmul	%st(2)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fmuls	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fmull	(%eax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fmulp	%st(1)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fmulp	%st(2)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fimuls	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fimull	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fnop
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fpatan
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fprem
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fprem1
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fptan
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     frndint
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     frstor	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fnsave	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fscale
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fsin
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fsincos
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     17.50  17.50   -      -      -      -      -     1.00    -      -      -      -     fsqrt
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fst	%st(0)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fsts	(%edx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fstl	(%ecx)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fstp	%st(0)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fstpl	(%edx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fstpl	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fstpt	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fnstcw	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fnstenv	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fnstsw	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     frstor	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     wait
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fnsave	(%eax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fsub	%st(0), %st(1)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fsub	%st(2)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fsubs	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fsubl	(%eax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fsubp	%st(1)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fsubp	%st(2)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fisubs	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fisubl	(%eax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fsubr	%st(0), %st(1)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fsubr	%st(2)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fsubrs	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fsubrl	(%eax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fsubrp	%st(1)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fsubrp	%st(2)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fisubrs	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fisubrl	(%eax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     ftst
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fucom	%st(1)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fucom	%st(3)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fucomp	%st(1)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fucomp	%st(3)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fucompp
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fucomi	%st(3)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fucompi	%st(3)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     wait
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fxam
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fxch	%st(1)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fxch	%st(3)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fxrstor	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fxsave	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fxtract
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fyl2x
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fyl2xp1
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-xop.s b/test/tools/llvm-mca/X86/BdVer2/resources-xop.s
index 61f39f07d78..306917defb1 100644
--- a/test/tools/llvm-mca/X86/BdVer2/resources-xop.s
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-xop.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
 
 vfrczpd %xmm0, %xmm3
 vfrczpd (%rax), %xmm3
@@ -221,314 +221,326 @@ vpshlw %xmm0, (%rax), %xmm3
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      3     1.00                        vfrczpd	%xmm0, %xmm3
-# CHECK-NEXT:  2      9     1.00    *                   vfrczpd	(%rax), %xmm3
-# CHECK-NEXT:  1      3     1.00                        vfrczpd	%ymm0, %ymm3
-# CHECK-NEXT:  2      10    1.00    *                   vfrczpd	(%rax), %ymm3
-# CHECK-NEXT:  1      3     1.00                        vfrczps	%xmm0, %xmm3
-# CHECK-NEXT:  2      9     1.00    *                   vfrczps	(%rax), %xmm3
-# CHECK-NEXT:  1      3     1.00                        vfrczps	%ymm0, %ymm3
-# CHECK-NEXT:  2      10    1.00    *                   vfrczps	(%rax), %ymm3
-# CHECK-NEXT:  1      3     1.00                        vfrczsd	%xmm0, %xmm3
-# CHECK-NEXT:  2      9     1.00    *                   vfrczsd	(%rax), %xmm3
-# CHECK-NEXT:  1      3     1.00                        vfrczss	%xmm0, %xmm3
-# CHECK-NEXT:  2      9     1.00    *                   vfrczss	(%rax), %xmm3
-# CHECK-NEXT:  1      1     0.50                        vpcmov	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      7     0.50    *                   vpcmov	(%rax), %xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  2      7     0.50    *                   vpcmov	%xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  1      1     1.00                        vpcmov	%ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  2      8     1.00    *                   vpcmov	(%rax), %ymm0, %ymm1, %ymm3
-# CHECK-NEXT:  2      8     1.00    *                   vpcmov	%ymm0, (%rax), %ymm1, %ymm3
-# CHECK-NEXT:  1      1     0.50                        vpcomb	$0, %xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  2      7     0.50    *                   vpcomb	$0, (%rax), %xmm0, %xmm3
-# CHECK-NEXT:  1      1     0.50                        vpcomd	$0, %xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  2      7     0.50    *                   vpcomd	$0, (%rax), %xmm0, %xmm3
-# CHECK-NEXT:  1      1     0.50                        vpcomq	$0, %xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  2      7     0.50    *                   vpcomq	$0, (%rax), %xmm0, %xmm3
-# CHECK-NEXT:  1      1     0.50                        vpcomub	$0, %xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  2      7     0.50    *                   vpcomub	$0, (%rax), %xmm0, %xmm3
-# CHECK-NEXT:  1      1     0.50                        vpcomud	$0, %xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  2      7     0.50    *                   vpcomud	$0, (%rax), %xmm0, %xmm3
-# CHECK-NEXT:  1      1     0.50                        vpcomuq	$0, %xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  2      7     0.50    *                   vpcomuq	$0, (%rax), %xmm0, %xmm3
-# CHECK-NEXT:  1      1     0.50                        vpcomuw	$0, %xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  2      7     0.50    *                   vpcomuw	$0, (%rax), %xmm0, %xmm3
-# CHECK-NEXT:  1      1     0.50                        vpcomw	$0, %xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  2      7     0.50    *                   vpcomw	$0, (%rax), %xmm0, %xmm3
-# CHECK-NEXT:  1      1     1.00                        vpermil2pd	$0, %xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      7     1.00    *                   vpermil2pd	$0, (%rax), %xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  2      7     1.00    *                   vpermil2pd	$0, %xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  1      1     1.00                        vpermil2pd	$0, %ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  2      8     1.00    *                   vpermil2pd	$0, (%rax), %ymm0, %ymm1, %ymm3
-# CHECK-NEXT:  2      8     1.00    *                   vpermil2pd	$0, %ymm0, (%rax), %ymm1, %ymm3
-# CHECK-NEXT:  1      1     1.00                        vpermil2ps	$0, %xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      7     1.00    *                   vpermil2ps	$0, (%rax), %xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  2      7     1.00    *                   vpermil2ps	$0, %xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  1      1     1.00                        vpermil2ps	$0, %ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  2      8     1.00    *                   vpermil2ps	$0, (%rax), %ymm0, %ymm1, %ymm3
-# CHECK-NEXT:  2      8     1.00    *                   vpermil2ps	$0, %ymm0, (%rax), %ymm1, %ymm3
-# CHECK-NEXT:  3      3     1.50                        vphaddbd	%xmm0, %xmm3
-# CHECK-NEXT:  4      9     1.50    *                   vphaddbd	(%rax), %xmm3
-# CHECK-NEXT:  3      3     1.50                        vphaddbq	%xmm0, %xmm3
-# CHECK-NEXT:  4      9     1.50    *                   vphaddbq	(%rax), %xmm3
-# CHECK-NEXT:  3      3     1.50                        vphaddbw	%xmm0, %xmm3
-# CHECK-NEXT:  4      9     1.50    *                   vphaddbw	(%rax), %xmm3
-# CHECK-NEXT:  3      3     1.50                        vphadddq	%xmm0, %xmm3
-# CHECK-NEXT:  4      9     1.50    *                   vphadddq	(%rax), %xmm3
-# CHECK-NEXT:  3      3     1.50                        vphaddubd	%xmm0, %xmm3
-# CHECK-NEXT:  4      9     1.50    *                   vphaddubd	(%rax), %xmm3
-# CHECK-NEXT:  3      3     1.50                        vphaddubq	%xmm0, %xmm3
-# CHECK-NEXT:  4      9     1.50    *                   vphaddubq	(%rax), %xmm3
-# CHECK-NEXT:  3      3     1.50                        vphaddubw	%xmm0, %xmm3
-# CHECK-NEXT:  4      9     1.50    *                   vphaddubw	(%rax), %xmm3
-# CHECK-NEXT:  3      3     1.50                        vphaddudq	%xmm0, %xmm3
-# CHECK-NEXT:  4      9     1.50    *                   vphaddudq	(%rax), %xmm3
-# CHECK-NEXT:  3      3     1.50                        vphadduwd	%xmm0, %xmm3
-# CHECK-NEXT:  4      9     1.50    *                   vphadduwd	(%rax), %xmm3
-# CHECK-NEXT:  3      3     1.50                        vphadduwq	%xmm0, %xmm3
-# CHECK-NEXT:  4      9     1.50    *                   vphadduwq	(%rax), %xmm3
-# CHECK-NEXT:  3      3     1.50                        vphaddwd	%xmm0, %xmm3
-# CHECK-NEXT:  4      9     1.50    *                   vphaddwd	(%rax), %xmm3
-# CHECK-NEXT:  3      3     1.50                        vphaddwq	%xmm0, %xmm3
-# CHECK-NEXT:  4      9     1.50    *                   vphaddwq	(%rax), %xmm3
-# CHECK-NEXT:  3      3     1.50                        vphsubbw	%xmm0, %xmm3
-# CHECK-NEXT:  4      9     1.50    *                   vphsubbw	(%rax), %xmm3
-# CHECK-NEXT:  3      3     1.50                        vphsubdq	%xmm0, %xmm3
-# CHECK-NEXT:  4      9     1.50    *                   vphsubdq	(%rax), %xmm3
-# CHECK-NEXT:  3      3     1.50                        vphsubwd	%xmm0, %xmm3
-# CHECK-NEXT:  4      9     1.50    *                   vphsubwd	(%rax), %xmm3
-# CHECK-NEXT:  1      5     1.00                        vpmacsdd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      11    1.00    *                   vpmacsdd	%xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  1      5     1.00                        vpmacsdqh	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      11    1.00    *                   vpmacsdqh	%xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  1      5     1.00                        vpmacsdql	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      11    1.00    *                   vpmacsdql	%xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  1      5     1.00                        vpmacssdd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      11    1.00    *                   vpmacssdd	%xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  1      5     1.00                        vpmacssdqh	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      11    1.00    *                   vpmacssdqh	%xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  1      5     1.00                        vpmacssdql	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      11    1.00    *                   vpmacssdql	%xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  1      5     1.00                        vpmacsswd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      11    1.00    *                   vpmacsswd	%xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  1      5     1.00                        vpmacssww	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      11    1.00    *                   vpmacssww	%xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  1      5     1.00                        vpmacswd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      11    1.00    *                   vpmacswd	%xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  1      5     1.00                        vpmacsww	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      11    1.00    *                   vpmacsww	%xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  1      5     1.00                        vpmadcsswd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      11    1.00    *                   vpmadcsswd	%xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  1      5     1.00                        vpmadcswd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      11    1.00    *                   vpmadcswd	%xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  1      1     0.50                        vpperm	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  2      7     0.50    *                   vpperm	(%rax), %xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  2      7     0.50    *                   vpperm	%xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  1      1     1.00                        vprotb	%xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  2      7     1.00    *                   vprotb	(%rax), %xmm0, %xmm3
-# CHECK-NEXT:  2      7     1.00    *                   vprotb	%xmm0, (%rax), %xmm3
-# CHECK-NEXT:  1      1     1.00                        vprotb	$0, %xmm0, %xmm3
-# CHECK-NEXT:  2      7     1.00    *                   vprotb	$0, (%rax), %xmm3
-# CHECK-NEXT:  1      1     1.00                        vprotd	%xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  2      7     1.00    *                   vprotd	(%rax), %xmm0, %xmm3
-# CHECK-NEXT:  2      7     1.00    *                   vprotd	%xmm0, (%rax), %xmm3
-# CHECK-NEXT:  1      1     1.00                        vprotd	$0, %xmm0, %xmm3
-# CHECK-NEXT:  2      7     1.00    *                   vprotd	$0, (%rax), %xmm3
-# CHECK-NEXT:  1      1     1.00                        vprotq	%xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  2      7     1.00    *                   vprotq	(%rax), %xmm0, %xmm3
-# CHECK-NEXT:  2      7     1.00    *                   vprotq	%xmm0, (%rax), %xmm3
-# CHECK-NEXT:  1      1     1.00                        vprotq	$0, %xmm0, %xmm3
-# CHECK-NEXT:  2      7     1.00    *                   vprotq	$0, (%rax), %xmm3
-# CHECK-NEXT:  1      1     1.00                        vprotw	%xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  2      7     1.00    *                   vprotw	(%rax), %xmm0, %xmm3
-# CHECK-NEXT:  2      7     1.00    *                   vprotw	%xmm0, (%rax), %xmm3
-# CHECK-NEXT:  1      1     1.00                        vprotw	$0, %xmm0, %xmm3
-# CHECK-NEXT:  2      7     1.00    *                   vprotw	$0, (%rax), %xmm3
-# CHECK-NEXT:  1      1     1.00                        vpshab	%xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  2      7     1.00    *                   vpshab	(%rax), %xmm0, %xmm3
-# CHECK-NEXT:  2      7     1.00    *                   vpshab	%xmm0, (%rax), %xmm3
-# CHECK-NEXT:  1      1     1.00                        vpshad	%xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  2      7     1.00    *                   vpshad	(%rax), %xmm0, %xmm3
-# CHECK-NEXT:  2      7     1.00    *                   vpshad	%xmm0, (%rax), %xmm3
-# CHECK-NEXT:  1      1     1.00                        vpshaq	%xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  2      7     1.00    *                   vpshaq	(%rax), %xmm0, %xmm3
-# CHECK-NEXT:  2      7     1.00    *                   vpshaq	%xmm0, (%rax), %xmm3
-# CHECK-NEXT:  1      1     1.00                        vpshaw	%xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  2      7     1.00    *                   vpshaw	(%rax), %xmm0, %xmm3
-# CHECK-NEXT:  2      7     1.00    *                   vpshaw	%xmm0, (%rax), %xmm3
-# CHECK-NEXT:  1      1     1.00                        vpshlb	%xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  2      7     1.00    *                   vpshlb	(%rax), %xmm0, %xmm3
-# CHECK-NEXT:  2      7     1.00    *                   vpshlb	%xmm0, (%rax), %xmm3
-# CHECK-NEXT:  1      1     1.00                        vpshld	%xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  2      7     1.00    *                   vpshld	(%rax), %xmm0, %xmm3
-# CHECK-NEXT:  2      7     1.00    *                   vpshld	%xmm0, (%rax), %xmm3
-# CHECK-NEXT:  1      1     1.00                        vpshlq	%xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  2      7     1.00    *                   vpshlq	(%rax), %xmm0, %xmm3
-# CHECK-NEXT:  2      7     1.00    *                   vpshlq	%xmm0, (%rax), %xmm3
-# CHECK-NEXT:  1      1     1.00                        vpshlw	%xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  2      7     1.00    *                   vpshlw	(%rax), %xmm0, %xmm3
-# CHECK-NEXT:  2      7     1.00    *                   vpshlw	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  2      10    1.00                        vfrczpd	%xmm0, %xmm3
+# CHECK-NEXT:  2      15    1.00    *                   vfrczpd	(%rax), %xmm3
+# CHECK-NEXT:  4      10    2.00                        vfrczpd	%ymm0, %ymm3
+# CHECK-NEXT:  8      15    2.00    *                   vfrczpd	(%rax), %ymm3
+# CHECK-NEXT:  2      10    1.00                        vfrczps	%xmm0, %xmm3
+# CHECK-NEXT:  2      15    1.00    *                   vfrczps	(%rax), %xmm3
+# CHECK-NEXT:  4      10    2.00                        vfrczps	%ymm0, %ymm3
+# CHECK-NEXT:  8      15    2.00    *                   vfrczps	(%rax), %ymm3
+# CHECK-NEXT:  2      10    1.00                        vfrczsd	%xmm0, %xmm3
+# CHECK-NEXT:  2      15    1.00    *                   vfrczsd	(%rax), %xmm3
+# CHECK-NEXT:  2      10    1.00                        vfrczss	%xmm0, %xmm3
+# CHECK-NEXT:  2      15    1.00    *                   vfrczss	(%rax), %xmm3
+# CHECK-NEXT:  1      2     0.50                        vpcmov	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vpcmov	(%rax), %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vpcmov	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  2      2     0.50                        vpcmov	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      7     1.00    *                   vpcmov	(%rax), %ymm0, %ymm1, %ymm3
+# CHECK-NEXT:  2      7     1.00    *                   vpcmov	%ymm0, (%rax), %ymm1, %ymm3
+# CHECK-NEXT:  1      2     0.50                        vpcomb	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vpcomb	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      2     0.50                        vpcomd	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vpcomd	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      2     0.50                        vpcomq	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vpcomq	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      2     0.50                        vpcomub	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vpcomub	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      2     0.50                        vpcomud	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vpcomud	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      2     0.50                        vpcomuq	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vpcomuq	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      2     0.50                        vpcomuw	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vpcomuw	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      2     0.50                        vpcomw	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vpcomw	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      3     2.00                        vpermil2pd	$0, %xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      8     2.00    *                   vpermil2pd	$0, (%rax), %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      8     2.00    *                   vpermil2pd	$0, %xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  2      3     3.00                        vpermil2pd	$0, %ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      8     3.00    *                   vpermil2pd	$0, (%rax), %ymm0, %ymm1, %ymm3
+# CHECK-NEXT:  2      8     3.00    *                   vpermil2pd	$0, %ymm0, (%rax), %ymm1, %ymm3
+# CHECK-NEXT:  1      3     2.00                        vpermil2ps	$0, %xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      8     2.00    *                   vpermil2ps	$0, (%rax), %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      8     2.00    *                   vpermil2ps	$0, %xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  2      3     3.00                        vpermil2ps	$0, %ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      8     3.00    *                   vpermil2ps	$0, (%rax), %ymm0, %ymm1, %ymm3
+# CHECK-NEXT:  2      8     3.00    *                   vpermil2ps	$0, %ymm0, (%rax), %ymm1, %ymm3
+# CHECK-NEXT:  1      2     0.50                        vphaddbd	%xmm0, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vphaddbd	(%rax), %xmm3
+# CHECK-NEXT:  1      2     0.50                        vphaddbq	%xmm0, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vphaddbq	(%rax), %xmm3
+# CHECK-NEXT:  1      2     0.50                        vphaddbw	%xmm0, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vphaddbw	(%rax), %xmm3
+# CHECK-NEXT:  1      2     0.50                        vphadddq	%xmm0, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vphadddq	(%rax), %xmm3
+# CHECK-NEXT:  1      2     0.50                        vphaddubd	%xmm0, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vphaddubd	(%rax), %xmm3
+# CHECK-NEXT:  1      2     0.50                        vphaddubq	%xmm0, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vphaddubq	(%rax), %xmm3
+# CHECK-NEXT:  1      2     0.50                        vphaddubw	%xmm0, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vphaddubw	(%rax), %xmm3
+# CHECK-NEXT:  1      2     0.50                        vphaddudq	%xmm0, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vphaddudq	(%rax), %xmm3
+# CHECK-NEXT:  1      2     0.50                        vphadduwd	%xmm0, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vphadduwd	(%rax), %xmm3
+# CHECK-NEXT:  1      2     0.50                        vphadduwq	%xmm0, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vphadduwq	(%rax), %xmm3
+# CHECK-NEXT:  1      2     0.50                        vphaddwd	%xmm0, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vphaddwd	(%rax), %xmm3
+# CHECK-NEXT:  1      2     0.50                        vphaddwq	%xmm0, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vphaddwq	(%rax), %xmm3
+# CHECK-NEXT:  1      2     0.50                        vphsubbw	%xmm0, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vphsubbw	(%rax), %xmm3
+# CHECK-NEXT:  1      2     0.50                        vphsubdq	%xmm0, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vphsubdq	(%rax), %xmm3
+# CHECK-NEXT:  1      2     0.50                        vphsubwd	%xmm0, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vphsubwd	(%rax), %xmm3
+# CHECK-NEXT:  1      5     2.00                        vpmacsdd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    2.00    *                   vpmacsdd	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  1      4     2.00                        vpmacsdqh	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    2.00    *                   vpmacsdqh	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  1      4     2.00                        vpmacsdql	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    2.00    *                   vpmacsdql	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  1      5     2.00                        vpmacssdd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    2.00    *                   vpmacssdd	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  1      4     2.00                        vpmacssdqh	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    2.00    *                   vpmacssdqh	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  1      4     2.00                        vpmacssdql	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    2.00    *                   vpmacssdql	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  1      4     1.00                        vpmacsswd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      9     1.00    *                   vpmacsswd	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  1      4     1.00                        vpmacssww	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      9     1.00    *                   vpmacssww	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  1      4     1.00                        vpmacswd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      9     1.00    *                   vpmacswd	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  1      4     1.00                        vpmacsww	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      9     1.00    *                   vpmacsww	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  1      4     1.00                        vpmadcsswd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      9     1.00    *                   vpmadcsswd	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  1      4     1.00                        vpmadcswd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      9     1.00    *                   vpmadcswd	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  1      3     2.00                        vpperm	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      8     2.00    *                   vpperm	(%rax), %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      8     2.00    *                   vpperm	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  1      3     0.50                        vprotb	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vprotb	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vprotb	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  1      2     0.50                        vprotb	$0, %xmm0, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vprotb	$0, (%rax), %xmm3
+# CHECK-NEXT:  1      3     0.50                        vprotd	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vprotd	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vprotd	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  1      2     0.50                        vprotd	$0, %xmm0, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vprotd	$0, (%rax), %xmm3
+# CHECK-NEXT:  1      3     0.50                        vprotq	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vprotq	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vprotq	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  1      2     0.50                        vprotq	$0, %xmm0, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vprotq	$0, (%rax), %xmm3
+# CHECK-NEXT:  1      3     0.50                        vprotw	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vprotw	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vprotw	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  1      2     0.50                        vprotw	$0, %xmm0, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vprotw	$0, (%rax), %xmm3
+# CHECK-NEXT:  1      3     0.50                        vpshab	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vpshab	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vpshab	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  1      3     0.50                        vpshad	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vpshad	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vpshad	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  1      3     0.50                        vpshaq	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vpshaq	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vpshaq	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  1      3     0.50                        vpshaw	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vpshaw	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vpshaw	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  1      3     0.50                        vpshlb	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vpshlb	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vpshlb	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  1      3     0.50                        vpshld	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vpshld	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vpshld	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  1      3     0.50                        vpshlq	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vpshlq	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vpshlq	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  1      3     0.50                        vpshlw	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vpshlw	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vpshlw	%xmm0, (%rax), %xmm3
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -     68.00  68.00   -     71.00  41.50  41.50
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 41.50  41.50   -      -      -      -      -      -     30.00  30.00  60.00  60.00  36.00  12.00  100.50 80.50   -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vfrczpd	%xmm0, %xmm3
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vfrczpd	(%rax), %xmm3
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vfrczpd	%ymm0, %ymm3
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vfrczpd	(%rax), %ymm3
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vfrczps	%xmm0, %xmm3
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vfrczps	(%rax), %xmm3
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vfrczps	%ymm0, %ymm3
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vfrczps	(%rax), %ymm3
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vfrczsd	%xmm0, %xmm3
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vfrczsd	(%rax), %xmm3
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vfrczss	%xmm0, %xmm3
-# CHECK-NEXT:  -      -      -     1.00    -      -     0.50   0.50   vfrczss	(%rax), %xmm3
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpcmov	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpcmov	(%rax), %xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpcmov	%xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpcmov	%ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vpcmov	(%rax), %ymm0, %ymm1, %ymm3
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vpcmov	%ymm0, (%rax), %ymm1, %ymm3
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpcomb	$0, %xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpcomb	$0, (%rax), %xmm0, %xmm3
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpcomd	$0, %xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpcomd	$0, (%rax), %xmm0, %xmm3
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpcomq	$0, %xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpcomq	$0, (%rax), %xmm0, %xmm3
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpcomub	$0, %xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpcomub	$0, (%rax), %xmm0, %xmm3
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpcomud	$0, %xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpcomud	$0, (%rax), %xmm0, %xmm3
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpcomuq	$0, %xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpcomuq	$0, (%rax), %xmm0, %xmm3
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpcomuw	$0, %xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpcomuw	$0, (%rax), %xmm0, %xmm3
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpcomw	$0, %xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpcomw	$0, (%rax), %xmm0, %xmm3
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpermil2pd	$0, %xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vpermil2pd	$0, (%rax), %xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vpermil2pd	$0, %xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpermil2pd	$0, %ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vpermil2pd	$0, (%rax), %ymm0, %ymm1, %ymm3
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vpermil2pd	$0, %ymm0, (%rax), %ymm1, %ymm3
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpermil2ps	$0, %xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vpermil2ps	$0, (%rax), %xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vpermil2ps	$0, %xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpermil2ps	$0, %ymm0, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vpermil2ps	$0, (%rax), %ymm0, %ymm1, %ymm3
-# CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vpermil2ps	$0, %ymm0, (%rax), %ymm1, %ymm3
-# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     vphaddbd	%xmm0, %xmm3
-# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   vphaddbd	(%rax), %xmm3
-# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     vphaddbq	%xmm0, %xmm3
-# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   vphaddbq	(%rax), %xmm3
-# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     vphaddbw	%xmm0, %xmm3
-# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   vphaddbw	(%rax), %xmm3
-# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     vphadddq	%xmm0, %xmm3
-# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   vphadddq	(%rax), %xmm3
-# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     vphaddubd	%xmm0, %xmm3
-# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   vphaddubd	(%rax), %xmm3
-# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     vphaddubq	%xmm0, %xmm3
-# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   vphaddubq	(%rax), %xmm3
-# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     vphaddubw	%xmm0, %xmm3
-# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   vphaddubw	(%rax), %xmm3
-# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     vphaddudq	%xmm0, %xmm3
-# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   vphaddudq	(%rax), %xmm3
-# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     vphadduwd	%xmm0, %xmm3
-# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   vphadduwd	(%rax), %xmm3
-# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     vphadduwq	%xmm0, %xmm3
-# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   vphadduwq	(%rax), %xmm3
-# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     vphaddwd	%xmm0, %xmm3
-# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   vphaddwd	(%rax), %xmm3
-# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     vphaddwq	%xmm0, %xmm3
-# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   vphaddwq	(%rax), %xmm3
-# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     vphsubbw	%xmm0, %xmm3
-# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   vphsubbw	(%rax), %xmm3
-# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     vphsubdq	%xmm0, %xmm3
-# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   vphsubdq	(%rax), %xmm3
-# CHECK-NEXT:  -      -      -     1.50    -     1.50    -      -     vphsubwd	%xmm0, %xmm3
-# CHECK-NEXT:  -      -      -     1.50    -     1.50   0.50   0.50   vphsubwd	(%rax), %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmacsdd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmacsdd	%xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmacsdqh	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmacsdqh	%xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmacsdql	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmacsdql	%xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmacssdd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmacssdd	%xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmacssdqh	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmacssdqh	%xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmacssdql	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmacssdql	%xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmacsswd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmacsswd	%xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmacssww	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmacssww	%xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmacswd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmacswd	%xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmacsww	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmacsww	%xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmadcsswd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmadcsswd	%xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpmadcswd	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpmadcswd	%xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  -      -      -     0.50    -     0.50    -      -     vpperm	%xmm0, %xmm1, %xmm2, %xmm3
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpperm	(%rax), %xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  -      -      -     0.50    -     0.50   0.50   0.50   vpperm	%xmm0, (%rax), %xmm1, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vprotb	%xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vprotb	(%rax), %xmm0, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vprotb	%xmm0, (%rax), %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vprotb	$0, %xmm0, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vprotb	$0, (%rax), %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vprotd	%xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vprotd	(%rax), %xmm0, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vprotd	%xmm0, (%rax), %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vprotd	$0, %xmm0, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vprotd	$0, (%rax), %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vprotq	%xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vprotq	(%rax), %xmm0, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vprotq	%xmm0, (%rax), %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vprotq	$0, %xmm0, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vprotq	$0, (%rax), %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vprotw	%xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vprotw	(%rax), %xmm0, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vprotw	%xmm0, (%rax), %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vprotw	$0, %xmm0, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vprotw	$0, (%rax), %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpshab	%xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpshab	(%rax), %xmm0, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpshab	%xmm0, (%rax), %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpshad	%xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpshad	(%rax), %xmm0, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpshad	%xmm0, (%rax), %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpshaq	%xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpshaq	(%rax), %xmm0, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpshaq	%xmm0, (%rax), %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpshaw	%xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpshaw	(%rax), %xmm0, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpshaw	%xmm0, (%rax), %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpshlb	%xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpshlb	(%rax), %xmm0, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpshlb	%xmm0, (%rax), %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpshld	%xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpshld	(%rax), %xmm0, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpshld	%xmm0, (%rax), %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpshlq	%xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpshlq	(%rax), %xmm0, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpshlq	%xmm0, (%rax), %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpshlw	%xmm0, %xmm1, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpshlw	(%rax), %xmm0, %xmm3
-# CHECK-NEXT:  -      -     1.00    -      -      -     0.50   0.50   vpshlw	%xmm0, (%rax), %xmm3
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vfrczpd	%xmm0, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vfrczpd	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vfrczpd	%ymm0, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vfrczpd	(%rax), %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vfrczps	%xmm0, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vfrczps	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vfrczps	%ymm0, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vfrczps	(%rax), %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vfrczsd	%xmm0, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vfrczsd	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vfrczss	%xmm0, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vfrczss	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmov	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmov	(%rax), %xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmov	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmov	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmov	(%rax), %ymm0, %ymm1, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmov	%ymm0, (%rax), %ymm1, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomb	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomb	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomd	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomd	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomq	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomq	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomub	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomub	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomud	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomud	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomuq	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomuq	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomuw	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomuw	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomw	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomw	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vpermil2pd	$0, %xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vpermil2pd	$0, (%rax), %xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vpermil2pd	$0, %xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -     vpermil2pd	$0, %ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -     vpermil2pd	$0, (%rax), %ymm0, %ymm1, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -     vpermil2pd	$0, %ymm0, (%rax), %ymm1, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vpermil2ps	$0, %xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vpermil2ps	$0, (%rax), %xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vpermil2ps	$0, %xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -     vpermil2ps	$0, %ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -     vpermil2ps	$0, (%rax), %ymm0, %ymm1, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -     vpermil2ps	$0, %ymm0, (%rax), %ymm1, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddbd	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddbd	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddbq	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddbq	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddbw	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddbw	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphadddq	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphadddq	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddubd	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddubd	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddubq	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddubq	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddubw	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddubw	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddudq	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddudq	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphadduwd	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphadduwd	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphadduwq	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphadduwq	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddwd	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddwd	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddwq	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddwq	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphsubbw	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphsubbw	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphsubdq	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphsubdq	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphsubwd	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphsubwd	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     vpmacsdd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     vpmacsdd	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     vpmacsdqh	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     vpmacsdqh	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     vpmacsdql	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     vpmacsdql	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     vpmacssdd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     vpmacssdd	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     vpmacssdqh	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     vpmacssdqh	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     vpmacssdql	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     vpmacssdql	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmacsswd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmacsswd	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmacssww	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmacssww	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmacswd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmacswd	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmacsww	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmacsww	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmadcsswd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmadcsswd	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmadcswd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmadcswd	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     vpperm	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     vpperm	(%rax), %xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     vpperm	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotb	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotb	(%rax), %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotb	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotb	$0, %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotb	$0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotd	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotd	(%rax), %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotd	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotd	$0, %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotd	$0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotq	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotq	(%rax), %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotq	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotq	$0, %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotq	$0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotw	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotw	(%rax), %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotw	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotw	$0, %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotw	$0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshab	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshab	(%rax), %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshab	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshad	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshad	(%rax), %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshad	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshaq	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshaq	(%rax), %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshaq	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshaw	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshaw	(%rax), %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshaw	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshlb	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshlb	(%rax), %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshlb	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshld	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshld	(%rax), %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshld	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshlq	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshlq	(%rax), %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshlq	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshlw	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshlw	(%rax), %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshlw	%xmm0, (%rax), %xmm3
diff --git a/test/tools/llvm-mca/X86/BdVer2/scheduler-queue-usage.s b/test/tools/llvm-mca/X86/BdVer2/scheduler-queue-usage.s
index b2a8f4ac33e..f1a7a47b47a 100644
--- a/test/tools/llvm-mca/X86/BdVer2/scheduler-queue-usage.s
+++ b/test/tools/llvm-mca/X86/BdVer2/scheduler-queue-usage.s
@@ -1,17 +1,17 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1 -scheduler-stats < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1 -scheduler-stats < %s | FileCheck %s
 
 vmulps (%rsi), %xmm0, %xmm0
 add  %rsi, %rsi
 
 # CHECK:      Iterations:        1
 # CHECK-NEXT: Instructions:      2
-# CHECK-NEXT: Total Cycles:      14
-# CHECK-NEXT: Total uOps:        3
+# CHECK-NEXT: Total Cycles:      13
+# CHECK-NEXT: Total uOps:        2
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    0.21
-# CHECK-NEXT: IPC:               0.14
+# CHECK-NEXT: uOps Per Cycle:    0.15
+# CHECK-NEXT: IPC:               0.15
 # CHECK-NEXT: Block RThroughput: 1.0
 
 # CHECK:      Instruction Info:
@@ -23,13 +23,13 @@ add  %rsi, %rsi
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  2      11    1.00    *                   vmulps	(%rsi), %xmm0, %xmm0
-# CHECK-NEXT:  1      1     0.33                        addq	%rsi, %rsi
+# CHECK-NEXT:  1      10    1.00    *                   vmulps	(%rsi), %xmm0, %xmm0
+# CHECK-NEXT:  1      1     0.50                        addq	%rsi, %rsi
 
 # CHECK:      Schedulers - number of cycles where we saw N instructions issued:
 # CHECK-NEXT: [# issued], [# cycles]
-# CHECK-NEXT:  0,          13  (92.9%)
-# CHECK-NEXT:  2,          1  (7.1%)
+# CHECK-NEXT:  0,          12  (92.3%)
+# CHECK-NEXT:  2,          1  (7.7%)
 
 # CHECK:      Scheduler's queue usage:
 # CHECK-NEXT: [1] Resource name.
@@ -38,23 +38,38 @@ add  %rsi, %rsi
 # CHECK-NEXT: [4] Total number of buffer entries.
 
 # CHECK:       [1]            [2]        [3]        [4]
-# CHECK-NEXT: SBPortAny        0          2          54
+# CHECK-NEXT: PdEX             0          2          40
+# CHECK-NEXT: PdFPU            0          1          64
+# CHECK-NEXT: PdLoad           0          1          40
+# CHECK-NEXT: PdStore          0          1          24
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -     1.00    -      -     1.00    -     1.00
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -     1.00    -      -      -      -     1.00    -      -     1.00    -      -      -      -      -     1.00    -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -     1.00    -      -      -      -     1.00   vmulps	(%rsi), %xmm0, %xmm0
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     addq	%rsi, %rsi
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     1.00    -      -      -      -      -     1.00    -      -      -      -     vmulps	(%rsi), %xmm0, %xmm0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     addq	%rsi, %rsi
diff --git a/test/tools/llvm-mca/X86/BdVer2/simple-test.s b/test/tools/llvm-mca/X86/BdVer2/simple-test.s
index f0ff718c9e7..562bfbb0c07 100644
--- a/test/tools/llvm-mca/X86/BdVer2/simple-test.s
+++ b/test/tools/llvm-mca/X86/BdVer2/simple-test.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=100 < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=100 < %s | FileCheck %s
 
 add %edi, %eax
 
@@ -11,7 +11,7 @@ add %edi, %eax
 # CHECK:      Dispatch Width:    4
 # CHECK-NEXT: uOps Per Cycle:    0.97
 # CHECK-NEXT: IPC:               0.97
-# CHECK-NEXT: Block RThroughput: 0.3
+# CHECK-NEXT: Block RThroughput: 0.5
 
 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps
@@ -22,22 +22,34 @@ add %edi, %eax
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      1     0.33                        addl	%edi, %eax
+# CHECK-NEXT:  1      1     0.50                        addl	%edi, %eax
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.34    -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -     0.33   0.33    -     0.34    -      -     addl	%edi, %eax
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%edi, %eax
diff --git a/test/tools/llvm-mca/X86/BdVer2/vbroadcast-operand-latency.s b/test/tools/llvm-mca/X86/BdVer2/vbroadcast-operand-latency.s
index ee54b757da2..9ab4ab0baeb 100644
--- a/test/tools/llvm-mca/X86/BdVer2/vbroadcast-operand-latency.s
+++ b/test/tools/llvm-mca/X86/BdVer2/vbroadcast-operand-latency.s
@@ -1,18 +1,18 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -timeline -timeline-max-iterations=3 < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -timeline -timeline-max-iterations=3 < %s | FileCheck %s
 
 leaq 8(%rsp, %rdi, 2), %rax
 vbroadcastss (%rax), %ymm0
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      200
-# CHECK-NEXT: Total Cycles:      60
-# CHECK-NEXT: Total uOps:        200
+# CHECK-NEXT: Total Cycles:      206
+# CHECK-NEXT: Total uOps:        400
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    3.33
-# CHECK-NEXT: IPC:               3.33
-# CHECK-NEXT: Block RThroughput: 0.5
+# CHECK-NEXT: uOps Per Cycle:    1.94
+# CHECK-NEXT: IPC:               0.97
+# CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps
@@ -23,38 +23,50 @@ vbroadcastss (%rax), %ymm0
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      1     0.50                        leaq	8(%rsp,%rdi,2), %rax
-# CHECK-NEXT:  1      7     0.50    *                   vbroadcastss	(%rax), %ymm0
+# CHECK-NEXT:  2      1     0.50                        leaq	8(%rsp,%rdi,2), %rax
+# CHECK-NEXT:  2      6     2.00    *                   vbroadcastss	(%rax), %ymm0
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -     0.50   0.50    -      -     0.50   0.50
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -     1.00    -      -      -     0.50   0.50    -     2.00   2.00    -      -      -      -     1.00   1.00    -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -     leaq	8(%rsp,%rdi,2), %rax
-# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vbroadcastss	(%rax), %ymm0
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	8(%rsp,%rdi,2), %rax
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -     2.00   2.00    -      -      -      -     1.00   1.00    -      -      -      -     vbroadcastss	(%rax), %ymm0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     01
+# CHECK-NEXT:                     012
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeER .    ..   leaq	8(%rsp,%rdi,2), %rax
-# CHECK-NEXT: [0,1]     D=eeeeeeeER.   vbroadcastss	(%rax), %ymm0
-# CHECK-NEXT: [1,0]     DeE-------R.   leaq	8(%rsp,%rdi,2), %rax
-# CHECK-NEXT: [1,1]     D=eeeeeeeER.   vbroadcastss	(%rax), %ymm0
-# CHECK-NEXT: [2,0]     .DeE------R.   leaq	8(%rsp,%rdi,2), %rax
-# CHECK-NEXT: [2,1]     .D=eeeeeeeER   vbroadcastss	(%rax), %ymm0
+# CHECK:      [0,0]     DeER .    . .   leaq	8(%rsp,%rdi,2), %rax
+# CHECK-NEXT: [0,1]     DeeeeeeER . .   vbroadcastss	(%rax), %ymm0
+# CHECK-NEXT: [1,0]     .DeE----R . .   leaq	8(%rsp,%rdi,2), %rax
+# CHECK-NEXT: [1,1]     .DeeeeeeER. .   vbroadcastss	(%rax), %ymm0
+# CHECK-NEXT: [2,0]     . DeE----R. .   leaq	8(%rsp,%rdi,2), %rax
+# CHECK-NEXT: [2,1]     . D==eeeeeeER   vbroadcastss	(%rax), %ymm0
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -63,5 +75,5 @@ vbroadcastss (%rax), %ymm0
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     3     1.0    1.0    4.3       leaq	8(%rsp,%rdi,2), %rax
-# CHECK-NEXT: 1.     3     2.0    0.0    0.0       vbroadcastss	(%rax), %ymm0
+# CHECK-NEXT: 0.     3     1.0    1.0    2.7       leaq	8(%rsp,%rdi,2), %rax
+# CHECK-NEXT: 1.     3     1.7    0.7    0.0       vbroadcastss	(%rax), %ymm0
diff --git a/test/tools/llvm-mca/X86/BdVer2/vec-logic-read-after-ld-1.s b/test/tools/llvm-mca/X86/BdVer2/vec-logic-read-after-ld-1.s
index 721d276f2f4..70868928d17 100644
--- a/test/tools/llvm-mca/X86/BdVer2/vec-logic-read-after-ld-1.s
+++ b/test/tools/llvm-mca/X86/BdVer2/vec-logic-read-after-ld-1.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s
 
 vaddps %xmm0, %xmm0, %xmm1
 vandps (%rdi), %xmm1, %xmm2
@@ -7,10 +7,10 @@ vandps (%rdi), %xmm1, %xmm2
 # CHECK:      Iterations:        1
 # CHECK-NEXT: Instructions:      2
 # CHECK-NEXT: Total Cycles:      10
-# CHECK-NEXT: Total uOps:        3
+# CHECK-NEXT: Total uOps:        2
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    0.30
+# CHECK-NEXT: uOps Per Cycle:    0.20
 # CHECK-NEXT: IPC:               0.20
 # CHECK-NEXT: Block RThroughput: 1.0
 
@@ -23,13 +23,13 @@ vandps (%rdi), %xmm1, %xmm2
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      3     1.00                        vaddps	%xmm0, %xmm0, %xmm1
-# CHECK-NEXT:  2      7     1.00    *                   vandps	(%rdi), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vaddps	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT:  1      7     0.50    *                   vandps	(%rdi), %xmm1, %xmm2
 
 # CHECK:      Timeline view:
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeER   .   vaddps	%xmm0, %xmm0, %xmm1
+# CHECK:      [0,0]     DeeeeeER .   vaddps	%xmm0, %xmm0, %xmm1
 # CHECK-NEXT: [0,1]     DeeeeeeeER   vandps	(%rdi), %xmm1, %xmm2
 
 # CHECK:      Average Wait times (based on the timeline view):
diff --git a/test/tools/llvm-mca/X86/BdVer2/vec-logic-read-after-ld-2.s b/test/tools/llvm-mca/X86/BdVer2/vec-logic-read-after-ld-2.s
index 4768971eb52..ef72be2cbde 100644
--- a/test/tools/llvm-mca/X86/BdVer2/vec-logic-read-after-ld-2.s
+++ b/test/tools/llvm-mca/X86/BdVer2/vec-logic-read-after-ld-2.s
@@ -1,18 +1,18 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s
 
 vaddps %ymm0, %ymm0, %ymm1
 vandps (%rdi), %ymm1, %ymm2
 
 # CHECK:      Iterations:        1
 # CHECK-NEXT: Instructions:      2
-# CHECK-NEXT: Total Cycles:      11
-# CHECK-NEXT: Total uOps:        3
+# CHECK-NEXT: Total Cycles:      10
+# CHECK-NEXT: Total uOps:        4
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    0.27
-# CHECK-NEXT: IPC:               0.18
-# CHECK-NEXT: Block RThroughput: 1.0
+# CHECK-NEXT: uOps Per Cycle:    0.40
+# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps
@@ -23,15 +23,14 @@ vandps (%rdi), %ymm1, %ymm2
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm0, %ymm0, %ymm1
-# CHECK-NEXT:  2      8     1.00    *                   vandps	(%rdi), %ymm1, %ymm2
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT:  2      7     1.00    *                   vandps	(%rdi), %ymm1, %ymm2
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeER    .   vaddps	%ymm0, %ymm0, %ymm1
-# CHECK-NEXT: [0,1]     DeeeeeeeeER   vandps	(%rdi), %ymm1, %ymm2
+# CHECK:      [0,0]     DeeeeeER .   vaddps	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT: [0,1]     DeeeeeeeER   vandps	(%rdi), %ymm1, %ymm2
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
diff --git a/test/tools/llvm-mca/X86/BdVer2/xop-super-registers-1.s b/test/tools/llvm-mca/X86/BdVer2/xop-super-registers-1.s
index d7d99861cfb..678e6938bce 100644
--- a/test/tools/llvm-mca/X86/BdVer2/xop-super-registers-1.s
+++ b/test/tools/llvm-mca/X86/BdVer2/xop-super-registers-1.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -timeline -timeline-max-iterations=2 < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -timeline -timeline-max-iterations=2 < %s | FileCheck %s
 
   vmulps  %ymm0, %ymm1, %ymm2
   vfrczpd %xmm1, %xmm2
@@ -10,13 +10,13 @@
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      600
-# CHECK-NEXT: Total Cycles:      318
-# CHECK-NEXT: Total uOps:        600
+# CHECK-NEXT: Total Cycles:      717
+# CHECK-NEXT: Total uOps:        1200
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    1.89
-# CHECK-NEXT: IPC:               1.89
-# CHECK-NEXT: Block RThroughput: 3.0
+# CHECK-NEXT: uOps Per Cycle:    1.67
+# CHECK-NEXT: IPC:               0.84
+# CHECK-NEXT: Block RThroughput: 7.0
 
 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps
@@ -27,52 +27,64 @@
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      5     1.00                        vmulps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  1      3     1.00                        vfrczpd	%xmm1, %xmm2
-# CHECK-NEXT:  1      5     1.00                        vmulps	%ymm2, %ymm3, %ymm4
-# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm4, %ymm5, %ymm6
-# CHECK-NEXT:  1      5     1.00                        vmulps	%ymm6, %ymm3, %ymm4
-# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm4, %ymm5, %ymm0
+# CHECK-NEXT:  2      5     2.00                        vmulps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00                        vfrczpd	%xmm1, %xmm2
+# CHECK-NEXT:  2      5     2.00                        vmulps	%ymm2, %ymm3, %ymm4
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm4, %ymm5, %ymm6
+# CHECK-NEXT:  2      5     2.00                        vmulps	%ymm6, %ymm3, %ymm4
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm4, %ymm5, %ymm0
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -     3.00   3.00    -      -      -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.04   4.96    -      -      -     1.00   4.00   7.00    -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmulps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vfrczpd	%xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmulps	%ymm2, %ymm3, %ymm4
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddps	%ymm4, %ymm5, %ymm6
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmulps	%ymm6, %ymm3, %ymm4
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddps	%ymm4, %ymm5, %ymm0
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -      -      -      -     2.00    -      -      -      -     vmulps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vfrczpd	%xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.01   0.99    -      -      -      -      -     2.00    -      -      -      -     vmulps	%ymm2, %ymm3, %ymm4
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.01   0.99    -      -      -      -     2.00    -      -      -      -      -     vaddps	%ymm4, %ymm5, %ymm6
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.02   0.98    -      -      -      -      -     2.00    -      -      -      -     vmulps	%ymm6, %ymm3, %ymm4
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     2.00    -      -      -      -      -     vaddps	%ymm4, %ymm5, %ymm0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123456
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0
 
-# CHECK:      [0,0]     DeeeeeER  .    .    .    ..   vmulps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: [0,1]     DeeeE--R  .    .    .    ..   vfrczpd	%xmm1, %xmm2
-# CHECK-NEXT: [0,2]     D===eeeeeER    .    .    ..   vmulps	%ymm2, %ymm3, %ymm4
-# CHECK-NEXT: [0,3]     D========eeeER .    .    ..   vaddps	%ymm4, %ymm5, %ymm6
-# CHECK-NEXT: [0,4]     .D==========eeeeeER .    ..   vmulps	%ymm6, %ymm3, %ymm4
-# CHECK-NEXT: [0,5]     .D===============eeeER   ..   vaddps	%ymm4, %ymm5, %ymm0
-# CHECK-NEXT: [1,0]     .D==================eeeeeER   vmulps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: [1,1]     .DeeeE--------------------R   vfrczpd	%xmm1, %xmm2
-# CHECK-NEXT: [1,2]     . D==eeeeeE---------------R   vmulps	%ymm2, %ymm3, %ymm4
-# CHECK-NEXT: [1,3]     . D=======eeeE------------R   vaddps	%ymm4, %ymm5, %ymm6
-# CHECK-NEXT: [1,4]     . D==========eeeeeE-------R   vmulps	%ymm6, %ymm3, %ymm4
-# CHECK-NEXT: [1,5]     . D===============eeeE----R   vaddps	%ymm4, %ymm5, %ymm0
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    .    .   vmulps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: [0,1]     D==eeeeeeeeeeER.    .    .    .    .    .   vfrczpd	%xmm1, %xmm2
+# CHECK-NEXT: [0,2]     .D===========eeeeeER.    .    .    .    .   vmulps	%ymm2, %ymm3, %ymm4
+# CHECK-NEXT: [0,3]     .D================eeeeeER.    .    .    .   vaddps	%ymm4, %ymm5, %ymm6
+# CHECK-NEXT: [0,4]     . D====================eeeeeER.    .    .   vmulps	%ymm6, %ymm3, %ymm4
+# CHECK-NEXT: [0,5]     . D=========================eeeeeER.    .   vaddps	%ymm4, %ymm5, %ymm0
+# CHECK-NEXT: [1,0]     .  D=============================eeeeeER.   vmulps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: [1,1]     .  DeeeeeeeeeeE------------------------R.   vfrczpd	%xmm1, %xmm2
+# CHECK-NEXT: [1,2]     .   D==========eeeeeE------------------R.   vmulps	%ymm2, %ymm3, %ymm4
+# CHECK-NEXT: [1,3]     .   D===============eeeeeE-------------R.   vaddps	%ymm4, %ymm5, %ymm6
+# CHECK-NEXT: [1,4]     .    D===================eeeeeE---------R   vmulps	%ymm6, %ymm3, %ymm4
+# CHECK-NEXT: [1,5]     .    D========================eeeeeE----R   vaddps	%ymm4, %ymm5, %ymm0
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -81,9 +93,9 @@
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     10.0   0.5    0.0       vmulps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.     2     1.0    1.0    11.0      vfrczpd	%xmm1, %xmm2
-# CHECK-NEXT: 2.     2     3.5    0.0    7.5       vmulps	%ymm2, %ymm3, %ymm4
-# CHECK-NEXT: 3.     2     8.5    0.0    6.0       vaddps	%ymm4, %ymm5, %ymm6
-# CHECK-NEXT: 4.     2     11.0   0.0    3.5       vmulps	%ymm6, %ymm3, %ymm4
-# CHECK-NEXT: 5.     2     16.0   0.0    2.0       vaddps	%ymm4, %ymm5, %ymm0
+# CHECK-NEXT: 0.     2     15.5   0.5    0.0       vmulps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.     2     2.0    2.0    12.0      vfrczpd	%xmm1, %xmm2
+# CHECK-NEXT: 2.     2     11.5   0.5    9.0       vmulps	%ymm2, %ymm3, %ymm4
+# CHECK-NEXT: 3.     2     16.5   0.0    6.5       vaddps	%ymm4, %ymm5, %ymm6
+# CHECK-NEXT: 4.     2     20.5   0.0    4.5       vmulps	%ymm6, %ymm3, %ymm4
+# CHECK-NEXT: 5.     2     25.5   0.0    2.0       vaddps	%ymm4, %ymm5, %ymm0
diff --git a/test/tools/llvm-mca/X86/BdVer2/xop-super-registers-2.s b/test/tools/llvm-mca/X86/BdVer2/xop-super-registers-2.s
index ba59a86a048..c864c545f99 100644
--- a/test/tools/llvm-mca/X86/BdVer2/xop-super-registers-2.s
+++ b/test/tools/llvm-mca/X86/BdVer2/xop-super-registers-2.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -timeline -timeline-max-iterations=2 < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -timeline -timeline-max-iterations=2 < %s | FileCheck %s
 
   vmulps     %ymm0, %ymm1, %ymm2
   vpermil2pd $16, %xmm3, %xmm5, %xmm1, %xmm2
@@ -10,13 +10,13 @@
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      600
-# CHECK-NEXT: Total Cycles:      316
-# CHECK-NEXT: Total uOps:        600
+# CHECK-NEXT: Total Cycles:      653
+# CHECK-NEXT: Total uOps:        1100
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    1.90
-# CHECK-NEXT: IPC:               1.90
-# CHECK-NEXT: Block RThroughput: 3.0
+# CHECK-NEXT: uOps Per Cycle:    1.68
+# CHECK-NEXT: IPC:               0.92
+# CHECK-NEXT: Block RThroughput: 6.0
 
 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps
@@ -27,52 +27,64 @@
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      5     1.00                        vmulps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  1      1     1.00                        vpermil2pd	$16, %xmm3, %xmm5, %xmm1, %xmm2
-# CHECK-NEXT:  1      5     1.00                        vmulps	%ymm2, %ymm3, %ymm4
-# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm4, %ymm5, %ymm6
-# CHECK-NEXT:  1      5     1.00                        vmulps	%ymm6, %ymm3, %ymm4
-# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm4, %ymm5, %ymm0
+# CHECK-NEXT:  2      5     2.00                        vmulps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  1      3     2.00                        vpermil2pd	$16, %xmm3, %xmm5, %xmm1, %xmm2
+# CHECK-NEXT:  2      5     2.00                        vmulps	%ymm2, %ymm3, %ymm4
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm4, %ymm5, %ymm6
+# CHECK-NEXT:  2      5     2.00                        vmulps	%ymm6, %ymm3, %ymm4
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm4, %ymm5, %ymm0
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -     3.00   2.00    -     1.00    -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     4.54   4.46    -      -      -      -     4.99   6.01    -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmulps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpermil2pd	$16, %xmm3, %xmm5, %xmm1, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmulps	%ymm2, %ymm3, %ymm4
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddps	%ymm4, %ymm5, %ymm6
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vmulps	%ymm6, %ymm3, %ymm4
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddps	%ymm4, %ymm5, %ymm0
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.52   0.48    -      -      -      -      -     2.00    -      -      -      -     vmulps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.99   0.01    -      -      -      -     vpermil2pd	$16, %xmm3, %xmm5, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     2.00    -      -      -      -     vmulps	%ymm2, %ymm3, %ymm4
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vaddps	%ymm4, %ymm5, %ymm6
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     2.00    -      -      -      -     vmulps	%ymm6, %ymm3, %ymm4
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.52   0.48    -      -      -      -     2.00    -      -      -      -      -     vaddps	%ymm4, %ymm5, %ymm0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01234
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeeER  .    .    .   .   vmulps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: [0,1]     DeE----R  .    .    .   .   vpermil2pd	$16, %xmm3, %xmm5, %xmm1, %xmm2
-# CHECK-NEXT: [0,2]     D=eeeeeER .    .    .   .   vmulps	%ymm2, %ymm3, %ymm4
-# CHECK-NEXT: [0,3]     D======eeeER   .    .   .   vaddps	%ymm4, %ymm5, %ymm6
-# CHECK-NEXT: [0,4]     .D========eeeeeER   .   .   vmulps	%ymm6, %ymm3, %ymm4
-# CHECK-NEXT: [0,5]     .D=============eeeER.   .   vaddps	%ymm4, %ymm5, %ymm0
-# CHECK-NEXT: [1,0]     .D================eeeeeER   vmulps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: [1,1]     .DeE--------------------R   vpermil2pd	$16, %xmm3, %xmm5, %xmm1, %xmm2
-# CHECK-NEXT: [1,2]     . DeeeeeE---------------R   vmulps	%ymm2, %ymm3, %ymm4
-# CHECK-NEXT: [1,3]     . D=====eeeE------------R   vaddps	%ymm4, %ymm5, %ymm6
-# CHECK-NEXT: [1,4]     . D========eeeeeE-------R   vmulps	%ymm6, %ymm3, %ymm4
-# CHECK-NEXT: [1,5]     . D=============eeeE----R   vaddps	%ymm4, %ymm5, %ymm0
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    . .   vmulps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: [0,1]     DeeeE--R  .    .    .    .    . .   vpermil2pd	$16, %xmm3, %xmm5, %xmm1, %xmm2
+# CHECK-NEXT: [0,2]     .D==eeeeeER    .    .    .    . .   vmulps	%ymm2, %ymm3, %ymm4
+# CHECK-NEXT: [0,3]     .D=======eeeeeER    .    .    . .   vaddps	%ymm4, %ymm5, %ymm6
+# CHECK-NEXT: [0,4]     . D============eeeeeER   .    . .   vmulps	%ymm6, %ymm3, %ymm4
+# CHECK-NEXT: [0,5]     . D=================eeeeeER   . .   vaddps	%ymm4, %ymm5, %ymm0
+# CHECK-NEXT: [1,0]     .  D=====================eeeeeER.   vmulps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: [1,1]     .  D=eeeE----------------------R.   vpermil2pd	$16, %xmm3, %xmm5, %xmm1, %xmm2
+# CHECK-NEXT: [1,2]     .   D=====eeeeeE---------------R.   vmulps	%ymm2, %ymm3, %ymm4
+# CHECK-NEXT: [1,3]     .   D===========eeeeeE---------R.   vaddps	%ymm4, %ymm5, %ymm6
+# CHECK-NEXT: [1,4]     .    D===============eeeeeE-----R   vmulps	%ymm6, %ymm3, %ymm4
+# CHECK-NEXT: [1,5]     .    D====================eeeeeER   vaddps	%ymm4, %ymm5, %ymm0
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -81,9 +93,9 @@
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     9.0    0.5    0.0       vmulps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.     2     1.0    1.0    12.0      vpermil2pd	$16, %xmm3, %xmm5, %xmm1, %xmm2
-# CHECK-NEXT: 2.     2     1.5    0.0    7.5       vmulps	%ymm2, %ymm3, %ymm4
-# CHECK-NEXT: 3.     2     6.5    0.0    6.0       vaddps	%ymm4, %ymm5, %ymm6
-# CHECK-NEXT: 4.     2     9.0    0.0    3.5       vmulps	%ymm6, %ymm3, %ymm4
-# CHECK-NEXT: 5.     2     14.0   0.0    2.0       vaddps	%ymm4, %ymm5, %ymm0
+# CHECK-NEXT: 0.     2     11.5   0.5    0.0       vmulps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.     2     1.5    1.5    12.0      vpermil2pd	$16, %xmm3, %xmm5, %xmm1, %xmm2
+# CHECK-NEXT: 2.     2     4.5    1.0    7.5       vmulps	%ymm2, %ymm3, %ymm4
+# CHECK-NEXT: 3.     2     10.0   0.5    4.5       vaddps	%ymm4, %ymm5, %ymm6
+# CHECK-NEXT: 4.     2     14.5   0.5    2.5       vmulps	%ymm6, %ymm3, %ymm4
+# CHECK-NEXT: 5.     2     19.5   0.0    0.0       vaddps	%ymm4, %ymm5, %ymm0
diff --git a/test/tools/llvm-mca/X86/BdVer2/zero-idioms-avx-256.s b/test/tools/llvm-mca/X86/BdVer2/zero-idioms-avx-256.s
index 8290cacdf05..b98f36f3258 100644
--- a/test/tools/llvm-mca/X86/BdVer2/zero-idioms-avx-256.s
+++ b/test/tools/llvm-mca/X86/BdVer2/zero-idioms-avx-256.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -timeline -timeline-max-iterations=3 < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -timeline -timeline-max-iterations=3 < %s | FileCheck %s
 
 # TODO: Fix the processor resource usage for zero-idiom YMM XOR instructions.
 #       Those vector XOR instructions should only consume 1cy of JFPU1 (instead
@@ -40,13 +40,13 @@ vaddps  %ymm1, %ymm1, %ymm0
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      300
-# CHECK-NEXT: Total Cycles:      107
-# CHECK-NEXT: Total uOps:        300
+# CHECK-NEXT: Total Cycles:      305
+# CHECK-NEXT: Total uOps:        600
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    2.80
-# CHECK-NEXT: IPC:               2.80
-# CHECK-NEXT: Block RThroughput: 1.0
+# CHECK-NEXT: uOps Per Cycle:    1.97
+# CHECK-NEXT: IPC:               0.98
+# CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps
@@ -57,42 +57,55 @@ vaddps  %ymm1, %ymm1, %ymm0
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm0, %ymm0, %ymm1
-# CHECK-NEXT:  1      1     1.00                        vxorps	%ymm1, %ymm1, %ymm1
-# CHECK-NEXT:  1      1     0.50                        vblendps	$2, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT:  2      2     1.00                        vxorps	%ymm1, %ymm1, %ymm1
+# CHECK-NEXT:  2      2     1.00                        vblendps	$2, %ymm1, %ymm2, %ymm3
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.52   2.48    -      -      -      -     3.00   3.00    -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddps	%ymm0, %ymm0, %ymm1
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vxorps	%ymm1, %ymm1, %ymm1
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vblendps	$2, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vaddps	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     0.98   1.02    -      -      -      -     vxorps	%ymm1, %ymm1, %ymm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.02   0.98    -      -      -      -     0.02   1.98    -      -      -      -     vblendps	$2, %ymm1, %ymm2, %ymm3
 
 # CHECK:      Timeline view:
+# CHECK-NEXT:                     012
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeER   .   vaddps	%ymm0, %ymm0, %ymm1
-# CHECK-NEXT: [0,1]     D===eER  .   vxorps	%ymm1, %ymm1, %ymm1
-# CHECK-NEXT: [0,2]     D====eER .   vblendps	$2, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT: [1,0]     D=eeeE-R .   vaddps	%ymm0, %ymm0, %ymm1
-# CHECK-NEXT: [1,1]     .D===eER .   vxorps	%ymm1, %ymm1, %ymm1
-# CHECK-NEXT: [1,2]     .D====eER.   vblendps	$2, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT: [2,0]     .D=eeeE-R.   vaddps	%ymm0, %ymm0, %ymm1
-# CHECK-NEXT: [2,1]     .D====eER.   vxorps	%ymm1, %ymm1, %ymm1
-# CHECK-NEXT: [2,2]     . D====eER   vblendps	$2, %ymm1, %ymm2, %ymm3
+# CHECK:      [0,0]     DeeeeeER  . .   vaddps	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT: [0,1]     DeeE---R  . .   vxorps	%ymm1, %ymm1, %ymm1
+# CHECK-NEXT: [0,2]     .D=eeE-R  . .   vblendps	$2, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: [1,0]     .D=eeeeeER. .   vaddps	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT: [1,1]     . D==eeE-R. .   vxorps	%ymm1, %ymm1, %ymm1
+# CHECK-NEXT: [1,2]     . D====eeER .   vblendps	$2, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: [2,0]     .  D=eeeeeER.   vaddps	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT: [2,1]     .  D===eeE-R.   vxorps	%ymm1, %ymm1, %ymm1
+# CHECK-NEXT: [2,2]     .   D====eeER   vblendps	$2, %ymm1, %ymm2, %ymm3
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -101,21 +114,21 @@ vaddps  %ymm1, %ymm1, %ymm0
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     3     1.7    1.7    0.7       vaddps	%ymm0, %ymm0, %ymm1
-# CHECK-NEXT: 1.     3     4.3    0.0    0.0       vxorps	%ymm1, %ymm1, %ymm1
-# CHECK-NEXT: 2.     3     5.0    0.0    0.0       vblendps	$2, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 0.     3     1.7    1.7    0.0       vaddps	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT: 1.     3     2.7    2.7    1.7       vxorps	%ymm1, %ymm1, %ymm1
+# CHECK-NEXT: 2.     3     4.0    0.0    0.3       vblendps	$2, %ymm1, %ymm2, %ymm3
 
 # CHECK:      [1] Code Region - ZERO-IDIOM-2
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      300
-# CHECK-NEXT: Total Cycles:      107
-# CHECK-NEXT: Total uOps:        300
+# CHECK-NEXT: Total Cycles:      305
+# CHECK-NEXT: Total uOps:        600
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    2.80
-# CHECK-NEXT: IPC:               2.80
-# CHECK-NEXT: Block RThroughput: 1.0
+# CHECK-NEXT: uOps Per Cycle:    1.97
+# CHECK-NEXT: IPC:               0.98
+# CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps
@@ -126,42 +139,55 @@ vaddps  %ymm1, %ymm1, %ymm0
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      3     1.00                        vaddpd	%ymm0, %ymm0, %ymm1
-# CHECK-NEXT:  1      1     1.00                        vxorpd	%ymm1, %ymm1, %ymm1
-# CHECK-NEXT:  1      1     0.50                        vblendpd	$2, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      5     2.00                        vaddpd	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT:  2      2     1.00                        vxorpd	%ymm1, %ymm1, %ymm1
+# CHECK-NEXT:  2      2     1.00                        vblendpd	$2, %ymm1, %ymm2, %ymm3
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -     1.00   1.00    -     1.00    -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.52   2.48    -      -      -      -     3.00   3.00    -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddpd	%ymm0, %ymm0, %ymm1
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vxorpd	%ymm1, %ymm1, %ymm1
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vblendpd	$2, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vaddpd	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     0.98   1.02    -      -      -      -     vxorpd	%ymm1, %ymm1, %ymm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.02   0.98    -      -      -      -     0.02   1.98    -      -      -      -     vblendpd	$2, %ymm1, %ymm2, %ymm3
 
 # CHECK:      Timeline view:
+# CHECK-NEXT:                     012
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeER   .   vaddpd	%ymm0, %ymm0, %ymm1
-# CHECK-NEXT: [0,1]     D===eER  .   vxorpd	%ymm1, %ymm1, %ymm1
-# CHECK-NEXT: [0,2]     D====eER .   vblendpd	$2, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT: [1,0]     D=eeeE-R .   vaddpd	%ymm0, %ymm0, %ymm1
-# CHECK-NEXT: [1,1]     .D===eER .   vxorpd	%ymm1, %ymm1, %ymm1
-# CHECK-NEXT: [1,2]     .D====eER.   vblendpd	$2, %ymm1, %ymm2, %ymm3
-# CHECK-NEXT: [2,0]     .D=eeeE-R.   vaddpd	%ymm0, %ymm0, %ymm1
-# CHECK-NEXT: [2,1]     .D====eER.   vxorpd	%ymm1, %ymm1, %ymm1
-# CHECK-NEXT: [2,2]     . D====eER   vblendpd	$2, %ymm1, %ymm2, %ymm3
+# CHECK:      [0,0]     DeeeeeER  . .   vaddpd	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT: [0,1]     DeeE---R  . .   vxorpd	%ymm1, %ymm1, %ymm1
+# CHECK-NEXT: [0,2]     .D=eeE-R  . .   vblendpd	$2, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: [1,0]     .D=eeeeeER. .   vaddpd	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT: [1,1]     . D==eeE-R. .   vxorpd	%ymm1, %ymm1, %ymm1
+# CHECK-NEXT: [1,2]     . D====eeER .   vblendpd	$2, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: [2,0]     .  D=eeeeeER.   vaddpd	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT: [2,1]     .  D===eeE-R.   vxorpd	%ymm1, %ymm1, %ymm1
+# CHECK-NEXT: [2,2]     .   D====eeER   vblendpd	$2, %ymm1, %ymm2, %ymm3
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -170,21 +196,21 @@ vaddps  %ymm1, %ymm1, %ymm0
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     3     1.7    1.7    0.7       vaddpd	%ymm0, %ymm0, %ymm1
-# CHECK-NEXT: 1.     3     4.3    0.0    0.0       vxorpd	%ymm1, %ymm1, %ymm1
-# CHECK-NEXT: 2.     3     5.0    0.0    0.0       vblendpd	$2, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 0.     3     1.7    1.7    0.0       vaddpd	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT: 1.     3     2.7    2.7    1.7       vxorpd	%ymm1, %ymm1, %ymm1
+# CHECK-NEXT: 2.     3     4.0    0.0    0.3       vblendpd	$2, %ymm1, %ymm2, %ymm3
 
 # CHECK:      [2] Code Region - ZERO-IDIOM-3
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      200
-# CHECK-NEXT: Total Cycles:      106
-# CHECK-NEXT: Total uOps:        200
+# CHECK-NEXT: Total Cycles:      206
+# CHECK-NEXT: Total uOps:        400
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    1.89
-# CHECK-NEXT: IPC:               1.89
-# CHECK-NEXT: Block RThroughput: 1.0
+# CHECK-NEXT: uOps Per Cycle:    1.94
+# CHECK-NEXT: IPC:               0.97
+# CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps
@@ -195,37 +221,50 @@ vaddps  %ymm1, %ymm1, %ymm0
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  1      1     1.00                        vandnps	%ymm2, %ymm2, %ymm3
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      2     1.00                        vandnps	%ymm2, %ymm2, %ymm3
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.50   1.50    -      -      -      -     2.00   2.00    -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vandnps	%ymm2, %ymm2, %ymm3
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -     2.00    -      -      -      -     vandnps	%ymm2, %ymm2, %ymm3
 
 # CHECK:      Timeline view:
-# CHECK-NEXT: Index     012345678
+# CHECK-NEXT:                     01
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeER  .   vaddps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: [0,1]     D===eER .   vandnps	%ymm2, %ymm2, %ymm3
-# CHECK-NEXT: [1,0]     D=eeeER .   vaddps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: [1,1]     D====eER.   vandnps	%ymm2, %ymm2, %ymm3
-# CHECK-NEXT: [2,0]     .D=eeeER.   vaddps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: [2,1]     .D====eER   vandnps	%ymm2, %ymm2, %ymm3
+# CHECK:      [0,0]     DeeeeeER  ..   vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: [0,1]     DeeE---R  ..   vandnps	%ymm2, %ymm2, %ymm3
+# CHECK-NEXT: [1,0]     .D=eeeeeER..   vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: [1,1]     .D=eeE---R..   vandnps	%ymm2, %ymm2, %ymm3
+# CHECK-NEXT: [2,0]     . D==eeeeeER   vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: [2,1]     . D==eeE---R   vandnps	%ymm2, %ymm2, %ymm3
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -234,20 +273,20 @@ vaddps  %ymm1, %ymm1, %ymm0
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     3     1.7    1.7    0.0       vaddps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.     3     4.7    0.0    0.0       vandnps	%ymm2, %ymm2, %ymm3
+# CHECK-NEXT: 0.     3     2.0    2.0    0.0       vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.     3     2.0    2.0    3.0       vandnps	%ymm2, %ymm2, %ymm3
 
 # CHECK:      [3] Code Region - ZERO-IDIOM-4
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      200
-# CHECK-NEXT: Total Cycles:      106
-# CHECK-NEXT: Total uOps:        200
+# CHECK-NEXT: Total Cycles:      206
+# CHECK-NEXT: Total uOps:        400
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    1.89
-# CHECK-NEXT: IPC:               1.89
-# CHECK-NEXT: Block RThroughput: 1.0
+# CHECK-NEXT: uOps Per Cycle:    1.94
+# CHECK-NEXT: IPC:               0.97
+# CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps
@@ -258,37 +297,50 @@ vaddps  %ymm1, %ymm1, %ymm0
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  1      1     1.00                        vandnps	%ymm2, %ymm2, %ymm3
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      2     1.00                        vandnps	%ymm2, %ymm2, %ymm3
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.50   1.50    -      -      -      -     2.00   2.00    -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vandnps	%ymm2, %ymm2, %ymm3
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -     2.00    -      -      -      -     vandnps	%ymm2, %ymm2, %ymm3
 
 # CHECK:      Timeline view:
-# CHECK-NEXT: Index     012345678
+# CHECK-NEXT:                     01
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeER  .   vaddps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: [0,1]     D===eER .   vandnps	%ymm2, %ymm2, %ymm3
-# CHECK-NEXT: [1,0]     D=eeeER .   vaddps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: [1,1]     D====eER.   vandnps	%ymm2, %ymm2, %ymm3
-# CHECK-NEXT: [2,0]     .D=eeeER.   vaddps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: [2,1]     .D====eER   vandnps	%ymm2, %ymm2, %ymm3
+# CHECK:      [0,0]     DeeeeeER  ..   vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: [0,1]     DeeE---R  ..   vandnps	%ymm2, %ymm2, %ymm3
+# CHECK-NEXT: [1,0]     .D=eeeeeER..   vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: [1,1]     .D=eeE---R..   vandnps	%ymm2, %ymm2, %ymm3
+# CHECK-NEXT: [2,0]     . D==eeeeeER   vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: [2,1]     . D==eeE---R   vandnps	%ymm2, %ymm2, %ymm3
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -297,20 +349,20 @@ vaddps  %ymm1, %ymm1, %ymm0
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     3     1.7    1.7    0.0       vaddps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1.     3     4.7    0.0    0.0       vandnps	%ymm2, %ymm2, %ymm3
+# CHECK-NEXT: 0.     3     2.0    2.0    0.0       vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.     3     2.0    2.0    3.0       vandnps	%ymm2, %ymm2, %ymm3
 
 # CHECK:      [4] Code Region - ZERO-IDIOM-5
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      200
-# CHECK-NEXT: Total Cycles:      403
-# CHECK-NEXT: Total uOps:        200
+# CHECK-NEXT: Total Cycles:      903
+# CHECK-NEXT: Total uOps:        1000
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.50
-# CHECK-NEXT: Block RThroughput: 1.0
+# CHECK-NEXT: uOps Per Cycle:    1.11
+# CHECK-NEXT: IPC:               0.22
+# CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps
@@ -321,38 +373,50 @@ vaddps  %ymm1, %ymm1, %ymm0
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      1     1.00                        vperm2f128	$136, %ymm0, %ymm0, %ymm1
-# CHECK-NEXT:  1      3     1.00                        vaddps	%ymm1, %ymm1, %ymm0
+# CHECK-NEXT:  8      4     0.50                        vperm2f128	$136, %ymm0, %ymm0, %ymm1
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm1, %ymm1, %ymm0
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00    -      -      -      -     2.00   1.00    -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vperm2f128	$136, %ymm0, %ymm0, %ymm1
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vaddps	%ymm1, %ymm1, %ymm0
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -      -      -      -     1.00    -      -      -      -     vperm2f128	$136, %ymm0, %ymm0, %ymm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     2.00    -      -      -      -      -     vaddps	%ymm1, %ymm1, %ymm0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     01234
-# CHECK-NEXT: Index     0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeER .    .   .   vperm2f128	$136, %ymm0, %ymm0, %ymm1
-# CHECK-NEXT: [0,1]     D=eeeER   .   .   vaddps	%ymm1, %ymm1, %ymm0
-# CHECK-NEXT: [1,0]     D====eER  .   .   vperm2f128	$136, %ymm0, %ymm0, %ymm1
-# CHECK-NEXT: [1,1]     D=====eeeER   .   vaddps	%ymm1, %ymm1, %ymm0
-# CHECK-NEXT: [2,0]     .D=======eER  .   vperm2f128	$136, %ymm0, %ymm0, %ymm1
-# CHECK-NEXT: [2,1]     .D========eeeER   vaddps	%ymm1, %ymm1, %ymm0
+# CHECK:      [0,0]     DeeeeER   .    .    .    .   .   vperm2f128	$136, %ymm0, %ymm0, %ymm1
+# CHECK-NEXT: [0,1]     . D==eeeeeER   .    .    .   .   vaddps	%ymm1, %ymm1, %ymm0
+# CHECK-NEXT: [1,0]     .  D======eeeeER    .    .   .   vperm2f128	$136, %ymm0, %ymm0, %ymm1
+# CHECK-NEXT: [1,1]     .    D========eeeeeER    .   .   vaddps	%ymm1, %ymm1, %ymm0
+# CHECK-NEXT: [2,0]     .    .D============eeeeER.   .   vperm2f128	$136, %ymm0, %ymm0, %ymm1
+# CHECK-NEXT: [2,1]     .    .  D==============eeeeeER   vaddps	%ymm1, %ymm1, %ymm0
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -361,5 +425,5 @@ vaddps  %ymm1, %ymm1, %ymm0
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     3     4.7    0.3    0.0       vperm2f128	$136, %ymm0, %ymm0, %ymm1
-# CHECK-NEXT: 1.     3     5.7    0.0    0.0       vaddps	%ymm1, %ymm1, %ymm0
+# CHECK-NEXT: 0.     3     7.0    0.3    0.0       vperm2f128	$136, %ymm0, %ymm0, %ymm1
+# CHECK-NEXT: 1.     3     9.0    0.0    0.0       vaddps	%ymm1, %ymm1, %ymm0
diff --git a/test/tools/llvm-mca/X86/BdVer2/zero-idioms.s b/test/tools/llvm-mca/X86/BdVer2/zero-idioms.s
index 034542e655b..3f9c4dbb8f5 100644
--- a/test/tools/llvm-mca/X86/BdVer2/zero-idioms.s
+++ b/test/tools/llvm-mca/X86/BdVer2/zero-idioms.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -timeline -register-file-stats -iterations=1 < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -timeline -register-file-stats -iterations=1 < %s | FileCheck %s
 
 subl  %eax, %eax
 subq  %rax, %rax
@@ -90,12 +90,12 @@ vpxor  %xmm3, %xmm3, %xmm5
 
 # CHECK:      Iterations:        1
 # CHECK-NEXT: Instructions:      71
-# CHECK-NEXT: Total Cycles:      39
+# CHECK-NEXT: Total Cycles:      26
 # CHECK-NEXT: Total uOps:        71
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    1.82
-# CHECK-NEXT: IPC:               1.82
+# CHECK-NEXT: uOps Per Cycle:    2.73
+# CHECK-NEXT: IPC:               2.73
 # CHECK-NEXT: Block RThroughput: 17.8
 
 # CHECK:      Instruction Info:
@@ -111,25 +111,25 @@ vpxor  %xmm3, %xmm3, %xmm5
 # CHECK-NEXT:  1      0     0.25                        subq	%rax, %rax
 # CHECK-NEXT:  1      0     0.25                        xorl	%eax, %eax
 # CHECK-NEXT:  1      0     0.25                        xorq	%rax, %rax
-# CHECK-NEXT:  1      3     1.00                        pcmpgtb	%mm2, %mm2
-# CHECK-NEXT:  1      3     1.00                        pcmpgtd	%mm2, %mm2
-# CHECK-NEXT:  1      3     1.00                        pcmpgtw	%mm2, %mm2
+# CHECK-NEXT:  1      0     0.25                        pcmpgtb	%mm2, %mm2
+# CHECK-NEXT:  1      0     0.25                        pcmpgtd	%mm2, %mm2
+# CHECK-NEXT:  1      0     0.25                        pcmpgtw	%mm2, %mm2
 # CHECK-NEXT:  1      0     0.25                        pcmpgtb	%xmm2, %xmm2
 # CHECK-NEXT:  1      0     0.25                        pcmpgtd	%xmm2, %xmm2
-# CHECK-NEXT:  1      0     0.25                        pcmpgtq	%xmm2, %xmm2
+# CHECK-NEXT:  1      2     0.50                        pcmpgtq	%xmm2, %xmm2
 # CHECK-NEXT:  1      0     0.25                        pcmpgtw	%xmm2, %xmm2
 # CHECK-NEXT:  1      0     0.25                        vpcmpgtb	%xmm3, %xmm3, %xmm3
 # CHECK-NEXT:  1      0     0.25                        vpcmpgtd	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  1      0     0.25                        vpcmpgtq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      2     0.50                        vpcmpgtq	%xmm3, %xmm3, %xmm3
 # CHECK-NEXT:  1      0     0.25                        vpcmpgtw	%xmm3, %xmm3, %xmm3
 # CHECK-NEXT:  1      0     0.25                        vpcmpgtb	%xmm3, %xmm3, %xmm5
 # CHECK-NEXT:  1      0     0.25                        vpcmpgtd	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  1      0     0.25                        vpcmpgtq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  1      2     0.50                        vpcmpgtq	%xmm3, %xmm3, %xmm5
 # CHECK-NEXT:  1      0     0.25                        vpcmpgtw	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  1      3     1.00                        psubb	%mm2, %mm2
-# CHECK-NEXT:  1      3     1.00                        psubd	%mm2, %mm2
-# CHECK-NEXT:  1      3     1.00                        psubq	%mm2, %mm2
-# CHECK-NEXT:  1      3     1.00                        psubw	%mm2, %mm2
+# CHECK-NEXT:  1      0     0.25                        psubb	%mm2, %mm2
+# CHECK-NEXT:  1      0     0.25                        psubd	%mm2, %mm2
+# CHECK-NEXT:  1      0     0.25                        psubq	%mm2, %mm2
+# CHECK-NEXT:  1      0     0.25                        psubw	%mm2, %mm2
 # CHECK-NEXT:  1      0     0.25                        psubb	%xmm2, %xmm2
 # CHECK-NEXT:  1      0     0.25                        psubd	%xmm2, %xmm2
 # CHECK-NEXT:  1      0     0.25                        psubq	%xmm2, %xmm2
@@ -142,37 +142,37 @@ vpxor  %xmm3, %xmm3, %xmm5
 # CHECK-NEXT:  1      0     0.25                        vpsubd	%xmm3, %xmm3, %xmm5
 # CHECK-NEXT:  1      0     0.25                        vpsubq	%xmm3, %xmm3, %xmm5
 # CHECK-NEXT:  1      0     0.25                        vpsubw	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  1      3     1.00                        psubsb	%mm2, %mm2
-# CHECK-NEXT:  1      3     1.00                        psubsw	%mm2, %mm2
-# CHECK-NEXT:  1      1     0.50                        psubsb	%xmm2, %xmm2
-# CHECK-NEXT:  1      1     0.50                        psubsw	%xmm2, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpsubsb	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  1      1     0.50                        vpsubsw	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  1      1     0.50                        vpsubsb	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  1      1     0.50                        vpsubsw	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  1      3     1.00                        psubusb	%mm2, %mm2
-# CHECK-NEXT:  1      3     1.00                        psubusw	%mm2, %mm2
-# CHECK-NEXT:  1      1     0.50                        psubusb	%xmm2, %xmm2
-# CHECK-NEXT:  1      1     0.50                        psubusw	%xmm2, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpsubusb	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  1      1     0.50                        vpsubusw	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  1      1     0.50                        vpsubsb	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  1      1     0.50                        vpsubsw	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  1      1     1.00                        andnps	%xmm0, %xmm0
-# CHECK-NEXT:  1      1     1.00                        andnpd	%xmm1, %xmm1
-# CHECK-NEXT:  1      1     1.00                        vandnps	%xmm2, %xmm2, %xmm2
-# CHECK-NEXT:  1      1     1.00                        vandnpd	%xmm1, %xmm1, %xmm1
-# CHECK-NEXT:  1      1     0.33                        pandn	%mm2, %mm2
-# CHECK-NEXT:  1      1     0.33                        pandn	%xmm2, %xmm2
-# CHECK-NEXT:  1      1     0.33                        vpandn	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  1      1     1.00                        vandnps	%xmm2, %xmm2, %xmm5
-# CHECK-NEXT:  1      1     1.00                        vandnpd	%xmm1, %xmm1, %xmm5
-# CHECK-NEXT:  1      1     0.33                        vpandn	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  1      2     0.50                        psubsb	%mm2, %mm2
+# CHECK-NEXT:  1      2     0.50                        psubsw	%mm2, %mm2
+# CHECK-NEXT:  1      2     0.50                        psubsb	%xmm2, %xmm2
+# CHECK-NEXT:  1      2     0.50                        psubsw	%xmm2, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpsubsb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      2     0.50                        vpsubsw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      2     0.50                        vpsubsb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  1      2     0.50                        vpsubsw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  1      2     0.50                        psubusb	%mm2, %mm2
+# CHECK-NEXT:  1      2     0.50                        psubusw	%mm2, %mm2
+# CHECK-NEXT:  1      2     0.50                        psubusb	%xmm2, %xmm2
+# CHECK-NEXT:  1      2     0.50                        psubusw	%xmm2, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpsubusb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      2     0.50                        vpsubusw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      2     0.50                        vpsubsb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  1      2     0.50                        vpsubsw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  1      0     0.25                        andnps	%xmm0, %xmm0
+# CHECK-NEXT:  1      0     0.25                        andnpd	%xmm1, %xmm1
+# CHECK-NEXT:  1      0     0.25                        vandnps	%xmm2, %xmm2, %xmm2
+# CHECK-NEXT:  1      0     0.25                        vandnpd	%xmm1, %xmm1, %xmm1
+# CHECK-NEXT:  1      0     0.25                        pandn	%mm2, %mm2
+# CHECK-NEXT:  1      0     0.25                        pandn	%xmm2, %xmm2
+# CHECK-NEXT:  1      0     0.25                        vpandn	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      0     0.25                        vandnps	%xmm2, %xmm2, %xmm5
+# CHECK-NEXT:  1      0     0.25                        vandnpd	%xmm1, %xmm1, %xmm5
+# CHECK-NEXT:  1      0     0.25                        vpandn	%xmm3, %xmm3, %xmm5
 # CHECK-NEXT:  1      0     0.25                        xorps	%xmm0, %xmm0
 # CHECK-NEXT:  1      0     0.25                        xorpd	%xmm1, %xmm1
 # CHECK-NEXT:  1      0     0.25                        vxorps	%xmm2, %xmm2, %xmm2
 # CHECK-NEXT:  1      0     0.25                        vxorpd	%xmm1, %xmm1, %xmm1
-# CHECK-NEXT:  1      1     0.33                        pxor	%mm2, %mm2
+# CHECK-NEXT:  1      0     0.25                        pxor	%mm2, %mm2
 # CHECK-NEXT:  1      0     0.25                        pxor	%xmm2, %xmm2
 # CHECK-NEXT:  1      0     0.25                        vpxor	%xmm3, %xmm3, %xmm3
 # CHECK-NEXT:  1      0     0.25                        vxorps	%xmm4, %xmm4, %xmm5
@@ -180,172 +180,194 @@ vpxor  %xmm3, %xmm3, %xmm5
 # CHECK-NEXT:  1      0     0.25                        vpxor	%xmm3, %xmm3, %xmm5
 
 # CHECK:      Register File statistics:
-# CHECK-NEXT: Total number of mappings created:    75
-# CHECK-NEXT: Max number of mappings used:         51
+# CHECK-NEXT: Total number of mappings created:    1
+# CHECK-NEXT: Max number of mappings used:         1
+
+# CHECK:      *  Register File #1 -- PdFpuPRF:
+# CHECK-NEXT:    Number of physical registers:     160
+# CHECK-NEXT:    Total number of mappings created: 1
+# CHECK-NEXT:    Max number of mappings used:      1
+
+# CHECK:      *  Register File #2 -- PdIntegerPRF:
+# CHECK-NEXT:    Number of physical registers:     96
+# CHECK-NEXT:    Total number of mappings created: 0
+# CHECK-NEXT:    Max number of mappings used:      0
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SBDivider
-# CHECK-NEXT: [1]   - SBFPDivider
-# CHECK-NEXT: [2]   - SBPort0
-# CHECK-NEXT: [3]   - SBPort1
-# CHECK-NEXT: [4]   - SBPort4
-# CHECK-NEXT: [5]   - SBPort5
-# CHECK-NEXT: [6.0] - SBPort23
-# CHECK-NEXT: [6.1] - SBPort23
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
 
 # CHECK:      Resource pressure per iteration:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -      -     5.00   16.00   -     13.00   -      -
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     8.00   11.00   -      -     9.00   10.00   -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
-# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     subl	%eax, %eax
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     subq	%rax, %rax
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     xorl	%eax, %eax
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     xorq	%rax, %rax
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pcmpgtb	%mm2, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pcmpgtd	%mm2, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     pcmpgtw	%mm2, %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     pcmpgtb	%xmm2, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     pcmpgtd	%xmm2, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     pcmpgtq	%xmm2, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     pcmpgtw	%xmm2, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpcmpgtb	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpcmpgtd	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpcmpgtq	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpcmpgtw	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpcmpgtb	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpcmpgtd	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpcmpgtq	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpcmpgtw	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     psubb	%mm2, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     psubd	%mm2, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     psubq	%mm2, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     psubw	%mm2, %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     psubb	%xmm2, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     psubd	%xmm2, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     psubq	%xmm2, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     psubw	%xmm2, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpsubb	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpsubd	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpsubq	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpsubw	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpsubb	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpsubd	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpsubq	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpsubw	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     psubsb	%mm2, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     psubsw	%mm2, %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     psubsb	%xmm2, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     psubsw	%xmm2, %xmm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vpsubsb	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpsubsw	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vpsubsb	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpsubsw	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     psubusb	%mm2, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     psubusw	%mm2, %mm2
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     psubusb	%xmm2, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     psubusw	%xmm2, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpsubusb	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vpsubusw	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vpsubsb	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     vpsubsw	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     andnps	%xmm0, %xmm0
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     andnpd	%xmm1, %xmm1
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vandnps	%xmm2, %xmm2, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vandnpd	%xmm1, %xmm1, %xmm1
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pandn	%mm2, %mm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pandn	%xmm2, %xmm2
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpandn	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vandnps	%xmm2, %xmm2, %xmm5
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vandnpd	%xmm1, %xmm1, %xmm5
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vpandn	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     xorps	%xmm0, %xmm0
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     xorpd	%xmm1, %xmm1
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     vxorps	%xmm2, %xmm2, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     vxorpd	%xmm1, %xmm1, %xmm1
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     pxor	%mm2, %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     pxor	%xmm2, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpxor	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     vxorps	%xmm4, %xmm4, %xmm5
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     vxorpd	%xmm1, %xmm1, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     vpxor	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     subl	%eax, %eax
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     subq	%rax, %rax
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorl	%eax, %eax
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorq	%rax, %rax
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pcmpgtb	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pcmpgtd	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pcmpgtw	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pcmpgtb	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pcmpgtd	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -     pcmpgtq	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pcmpgtw	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpcmpgtb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpcmpgtd	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     1.00    -      -      -      -      -     vpcmpgtq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpcmpgtw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpcmpgtb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpcmpgtd	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -     vpcmpgtq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpcmpgtw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     psubb	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     psubd	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     psubq	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     psubw	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     psubb	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     psubd	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     psubq	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     psubw	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpsubb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpsubd	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpsubq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpsubw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpsubb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpsubd	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpsubq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpsubw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     1.00    -      -      -      -      -     psubsb	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -     psubsw	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -      -     psubsb	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     1.00    -      -      -      -     psubsw	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     1.00    -      -      -      -      -     vpsubsb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -     vpsubsw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -      -     vpsubsb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     1.00    -      -      -      -     vpsubsw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     1.00    -      -      -      -      -     psubusb	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -     psubusw	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -      -     psubusb	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     1.00    -      -      -      -     psubusw	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     1.00    -      -      -      -      -     vpsubusb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -     vpsubusw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -      -     vpsubsb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     1.00    -      -      -      -     vpsubsw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     andnps	%xmm0, %xmm0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     andnpd	%xmm1, %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vandnps	%xmm2, %xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vandnpd	%xmm1, %xmm1, %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pandn	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pandn	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpandn	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vandnps	%xmm2, %xmm2, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vandnpd	%xmm1, %xmm1, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpandn	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorps	%xmm0, %xmm0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorpd	%xmm1, %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vxorps	%xmm2, %xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vxorpd	%xmm1, %xmm1, %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pxor	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pxor	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpxor	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vxorps	%xmm4, %xmm4, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vxorpd	%xmm1, %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpxor	%xmm3, %xmm3, %xmm5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345678
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
 
-# CHECK:      [0,0]     DR   .    .    .    .    .    .    .  .   subl	%eax, %eax
-# CHECK-NEXT: [0,1]     DR   .    .    .    .    .    .    .  .   subq	%rax, %rax
-# CHECK-NEXT: [0,2]     DR   .    .    .    .    .    .    .  .   xorl	%eax, %eax
-# CHECK-NEXT: [0,3]     DR   .    .    .    .    .    .    .  .   xorq	%rax, %rax
-# CHECK-NEXT: [0,4]     .DeeeER   .    .    .    .    .    .  .   pcmpgtb	%mm2, %mm2
-# CHECK-NEXT: [0,5]     .D===eeeER.    .    .    .    .    .  .   pcmpgtd	%mm2, %mm2
-# CHECK-NEXT: [0,6]     .D======eeeER  .    .    .    .    .  .   pcmpgtw	%mm2, %mm2
-# CHECK-NEXT: [0,7]     .D----------R  .    .    .    .    .  .   pcmpgtb	%xmm2, %xmm2
-# CHECK-NEXT: [0,8]     . D---------R  .    .    .    .    .  .   pcmpgtd	%xmm2, %xmm2
-# CHECK-NEXT: [0,9]     . D---------R  .    .    .    .    .  .   pcmpgtq	%xmm2, %xmm2
-# CHECK-NEXT: [0,10]    . D---------R  .    .    .    .    .  .   pcmpgtw	%xmm2, %xmm2
-# CHECK-NEXT: [0,11]    . D---------R  .    .    .    .    .  .   vpcmpgtb	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,12]    .  D--------R  .    .    .    .    .  .   vpcmpgtd	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,13]    .  D--------R  .    .    .    .    .  .   vpcmpgtq	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,14]    .  D--------R  .    .    .    .    .  .   vpcmpgtw	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,15]    .  D--------R  .    .    .    .    .  .   vpcmpgtb	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,16]    .   D-------R  .    .    .    .    .  .   vpcmpgtd	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,17]    .   D-------R  .    .    .    .    .  .   vpcmpgtq	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,18]    .   D-------R  .    .    .    .    .  .   vpcmpgtw	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,19]    .   D======eeeER    .    .    .    .  .   psubb	%mm2, %mm2
-# CHECK-NEXT: [0,20]    .    D========eeeER .    .    .    .  .   psubd	%mm2, %mm2
-# CHECK-NEXT: [0,21]    .    D===========eeeER   .    .    .  .   psubq	%mm2, %mm2
-# CHECK-NEXT: [0,22]    .    D==============eeeER.    .    .  .   psubw	%mm2, %mm2
-# CHECK-NEXT: [0,23]    .    D------------------R.    .    .  .   psubb	%xmm2, %xmm2
-# CHECK-NEXT: [0,24]    .    .D-----------------R.    .    .  .   psubd	%xmm2, %xmm2
-# CHECK-NEXT: [0,25]    .    .D-----------------R.    .    .  .   psubq	%xmm2, %xmm2
-# CHECK-NEXT: [0,26]    .    .D-----------------R.    .    .  .   psubw	%xmm2, %xmm2
-# CHECK-NEXT: [0,27]    .    .D-----------------R.    .    .  .   vpsubb	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,28]    .    . D----------------R.    .    .  .   vpsubd	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,29]    .    . D----------------R.    .    .  .   vpsubq	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,30]    .    . D----------------R.    .    .  .   vpsubw	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,31]    .    . D----------------R.    .    .  .   vpsubb	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,32]    .    .  D---------------R.    .    .  .   vpsubd	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,33]    .    .  D---------------R.    .    .  .   vpsubq	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,34]    .    .  D---------------R.    .    .  .   vpsubw	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,35]    .    .  D==============eeeER  .    .  .   psubsb	%mm2, %mm2
-# CHECK-NEXT: [0,36]    .    .   D================eeeER    .  .   psubsw	%mm2, %mm2
-# CHECK-NEXT: [0,37]    .    .   DeE------------------R    .  .   psubsb	%xmm2, %xmm2
-# CHECK-NEXT: [0,38]    .    .   D==eE----------------R    .  .   psubsw	%xmm2, %xmm2
-# CHECK-NEXT: [0,39]    .    .   DeE------------------R    .  .   vpsubsb	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,40]    .    .    DeE-----------------R    .  .   vpsubsw	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,41]    .    .    D=eE----------------R    .  .   vpsubsb	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,42]    .    .    D==eE---------------R    .  .   vpsubsw	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,43]    .    .    D==================eeeER .  .   psubusb	%mm2, %mm2
-# CHECK-NEXT: [0,44]    .    .    .D====================eeeER .   psubusw	%mm2, %mm2
-# CHECK-NEXT: [0,45]    .    .    .D=eE---------------------R .   psubusb	%xmm2, %xmm2
-# CHECK-NEXT: [0,46]    .    .    .D==eE--------------------R .   psubusw	%xmm2, %xmm2
-# CHECK-NEXT: [0,47]    .    .    .D===eE-------------------R .   vpsubusb	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,48]    .    .    . D===eE------------------R .   vpsubusw	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,49]    .    .    . D====eE-----------------R .   vpsubsb	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,50]    .    .    . D=====eE----------------R .   vpsubsw	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,51]    .    .    . D===eE------------------R .   andnps	%xmm0, %xmm0
-# CHECK-NEXT: [0,52]    .    .    .  D====eE----------------R .   andnpd	%xmm1, %xmm1
-# CHECK-NEXT: [0,53]    .    .    .  D======eE--------------R .   vandnps	%xmm2, %xmm2, %xmm2
-# CHECK-NEXT: [0,54]    .    .    .  D=====eE---------------R .   vandnpd	%xmm1, %xmm1, %xmm1
-# CHECK-NEXT: [0,55]    .    .    .  D=====================eER.   pandn	%mm2, %mm2
-# CHECK-NEXT: [0,56]    .    .    .   D======eE--------------R.   pandn	%xmm2, %xmm2
-# CHECK-NEXT: [0,57]    .    .    .   D==eE------------------R.   vpandn	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,58]    .    .    .   D=======eE-------------R.   vandnps	%xmm2, %xmm2, %xmm5
-# CHECK-NEXT: [0,59]    .    .    .   D======eE--------------R.   vandnpd	%xmm1, %xmm1, %xmm5
-# CHECK-NEXT: [0,60]    .    .    .    D==eE-----------------R.   vpandn	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,61]    .    .    .    D=E-------------------R.   xorps	%xmm0, %xmm0
-# CHECK-NEXT: [0,62]    .    .    .    D====E----------------R.   xorpd	%xmm1, %xmm1
-# CHECK-NEXT: [0,63]    .    .    .    D======E--------------R.   vxorps	%xmm2, %xmm2, %xmm2
-# CHECK-NEXT: [0,64]    .    .    .    .D===E----------------R.   vxorpd	%xmm1, %xmm1, %xmm1
-# CHECK-NEXT: [0,65]    .    .    .    .D===================eER   pxor	%mm2, %mm2
-# CHECK-NEXT: [0,66]    .    .    .    .D=====E---------------R   pxor	%xmm2, %xmm2
-# CHECK-NEXT: [0,67]    .    .    .    .D=E-------------------R   vpxor	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,68]    .    .    .    . D--------------------R   vxorps	%xmm4, %xmm4, %xmm5
-# CHECK-NEXT: [0,69]    .    .    .    . D==E-----------------R   vxorpd	%xmm1, %xmm1, %xmm3
-# CHECK-NEXT: [0,70]    .    .    .    . D==E-----------------R   vpxor	%xmm3, %xmm3, %xmm5
+# CHECK:      [0,0]     DR   .    .    .    .    .   subl	%eax, %eax
+# CHECK-NEXT: [0,1]     DR   .    .    .    .    .   subq	%rax, %rax
+# CHECK-NEXT: [0,2]     DR   .    .    .    .    .   xorl	%eax, %eax
+# CHECK-NEXT: [0,3]     DR   .    .    .    .    .   xorq	%rax, %rax
+# CHECK-NEXT: [0,4]     .DR  .    .    .    .    .   pcmpgtb	%mm2, %mm2
+# CHECK-NEXT: [0,5]     .DR  .    .    .    .    .   pcmpgtd	%mm2, %mm2
+# CHECK-NEXT: [0,6]     .DR  .    .    .    .    .   pcmpgtw	%mm2, %mm2
+# CHECK-NEXT: [0,7]     .DR  .    .    .    .    .   pcmpgtb	%xmm2, %xmm2
+# CHECK-NEXT: [0,8]     . DR .    .    .    .    .   pcmpgtd	%xmm2, %xmm2
+# CHECK-NEXT: [0,9]     . DeeER   .    .    .    .   pcmpgtq	%xmm2, %xmm2
+# CHECK-NEXT: [0,10]    . D---R   .    .    .    .   pcmpgtw	%xmm2, %xmm2
+# CHECK-NEXT: [0,11]    . D---R   .    .    .    .   vpcmpgtb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,12]    .  D--R   .    .    .    .   vpcmpgtd	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,13]    .  DeeER  .    .    .    .   vpcmpgtq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,14]    .  D---R  .    .    .    .   vpcmpgtw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,15]    .  D---R  .    .    .    .   vpcmpgtb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,16]    .   D--R  .    .    .    .   vpcmpgtd	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,17]    .   DeeER .    .    .    .   vpcmpgtq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,18]    .   D---R .    .    .    .   vpcmpgtw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,19]    .   D---R .    .    .    .   psubb	%mm2, %mm2
+# CHECK-NEXT: [0,20]    .    D--R .    .    .    .   psubd	%mm2, %mm2
+# CHECK-NEXT: [0,21]    .    D---R.    .    .    .   psubq	%mm2, %mm2
+# CHECK-NEXT: [0,22]    .    D---R.    .    .    .   psubw	%mm2, %mm2
+# CHECK-NEXT: [0,23]    .    D---R.    .    .    .   psubb	%xmm2, %xmm2
+# CHECK-NEXT: [0,24]    .    .D--R.    .    .    .   psubd	%xmm2, %xmm2
+# CHECK-NEXT: [0,25]    .    .D---R    .    .    .   psubq	%xmm2, %xmm2
+# CHECK-NEXT: [0,26]    .    .D---R    .    .    .   psubw	%xmm2, %xmm2
+# CHECK-NEXT: [0,27]    .    .D---R    .    .    .   vpsubb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,28]    .    . D--R    .    .    .   vpsubd	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,29]    .    . D---R   .    .    .   vpsubq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,30]    .    . D---R   .    .    .   vpsubw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,31]    .    . D---R   .    .    .   vpsubb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,32]    .    .  D--R   .    .    .   vpsubd	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,33]    .    .  D---R  .    .    .   vpsubq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,34]    .    .  D---R  .    .    .   vpsubw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,35]    .    .  DeeER  .    .    .   psubsb	%mm2, %mm2
+# CHECK-NEXT: [0,36]    .    .   DeeER .    .    .   psubsw	%mm2, %mm2
+# CHECK-NEXT: [0,37]    .    .   DeeER .    .    .   psubsb	%xmm2, %xmm2
+# CHECK-NEXT: [0,38]    .    .   D=eeER.    .    .   psubsw	%xmm2, %xmm2
+# CHECK-NEXT: [0,39]    .    .   D=eeER.    .    .   vpsubsb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,40]    .    .    D=eeER    .    .   vpsubsw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,41]    .    .    D=eeER    .    .   vpsubsb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,42]    .    .    D==eeER   .    .   vpsubsw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,43]    .    .    D==eeER   .    .   psubusb	%mm2, %mm2
+# CHECK-NEXT: [0,44]    .    .    .D==eeER  .    .   psubusw	%mm2, %mm2
+# CHECK-NEXT: [0,45]    .    .    .D==eeER  .    .   psubusb	%xmm2, %xmm2
+# CHECK-NEXT: [0,46]    .    .    .D===eeER .    .   psubusw	%xmm2, %xmm2
+# CHECK-NEXT: [0,47]    .    .    .D===eeER .    .   vpsubusb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,48]    .    .    . D===eeER.    .   vpsubusw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,49]    .    .    . D===eeER.    .   vpsubsb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,50]    .    .    . D====eeER    .   vpsubsw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,51]    .    .    . D-------R    .   andnps	%xmm0, %xmm0
+# CHECK-NEXT: [0,52]    .    .    .  D------R    .   andnpd	%xmm1, %xmm1
+# CHECK-NEXT: [0,53]    .    .    .  D------R    .   vandnps	%xmm2, %xmm2, %xmm2
+# CHECK-NEXT: [0,54]    .    .    .  D-------R   .   vandnpd	%xmm1, %xmm1, %xmm1
+# CHECK-NEXT: [0,55]    .    .    .  D-------R   .   pandn	%mm2, %mm2
+# CHECK-NEXT: [0,56]    .    .    .   D------R   .   pandn	%xmm2, %xmm2
+# CHECK-NEXT: [0,57]    .    .    .   D------R   .   vpandn	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,58]    .    .    .   D-------R  .   vandnps	%xmm2, %xmm2, %xmm5
+# CHECK-NEXT: [0,59]    .    .    .   D-------R  .   vandnpd	%xmm1, %xmm1, %xmm5
+# CHECK-NEXT: [0,60]    .    .    .    D------R  .   vpandn	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,61]    .    .    .    D------R  .   xorps	%xmm0, %xmm0
+# CHECK-NEXT: [0,62]    .    .    .    D-------R .   xorpd	%xmm1, %xmm1
+# CHECK-NEXT: [0,63]    .    .    .    D-------R .   vxorps	%xmm2, %xmm2, %xmm2
+# CHECK-NEXT: [0,64]    .    .    .    .D------R .   vxorpd	%xmm1, %xmm1, %xmm1
+# CHECK-NEXT: [0,65]    .    .    .    .D------R .   pxor	%mm2, %mm2
+# CHECK-NEXT: [0,66]    .    .    .    .D-------R.   pxor	%xmm2, %xmm2
+# CHECK-NEXT: [0,67]    .    .    .    .D-------R.   vpxor	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,68]    .    .    .    . D------R.   vxorps	%xmm4, %xmm4, %xmm5
+# CHECK-NEXT: [0,69]    .    .    .    . D------R.   vxorpd	%xmm1, %xmm1, %xmm3
+# CHECK-NEXT: [0,70]    .    .    .    . D-------R   vpxor	%xmm3, %xmm3, %xmm5
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -358,70 +380,70 @@ vpxor  %xmm3, %xmm3, %xmm5
 # CHECK-NEXT: 1.     1     0.0    0.0    0.0       subq	%rax, %rax
 # CHECK-NEXT: 2.     1     0.0    0.0    0.0       xorl	%eax, %eax
 # CHECK-NEXT: 3.     1     0.0    0.0    0.0       xorq	%rax, %rax
-# CHECK-NEXT: 4.     1     1.0    1.0    0.0       pcmpgtb	%mm2, %mm2
-# CHECK-NEXT: 5.     1     4.0    0.0    0.0       pcmpgtd	%mm2, %mm2
-# CHECK-NEXT: 6.     1     7.0    0.0    0.0       pcmpgtw	%mm2, %mm2
-# CHECK-NEXT: 7.     1     0.0    0.0    10.0      pcmpgtb	%xmm2, %xmm2
-# CHECK-NEXT: 8.     1     0.0    0.0    9.0       pcmpgtd	%xmm2, %xmm2
-# CHECK-NEXT: 9.     1     0.0    0.0    9.0       pcmpgtq	%xmm2, %xmm2
-# CHECK-NEXT: 10.    1     0.0    0.0    9.0       pcmpgtw	%xmm2, %xmm2
-# CHECK-NEXT: 11.    1     0.0    0.0    9.0       vpcmpgtb	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 12.    1     0.0    0.0    8.0       vpcmpgtd	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 13.    1     0.0    0.0    8.0       vpcmpgtq	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 14.    1     0.0    0.0    8.0       vpcmpgtw	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 15.    1     0.0    0.0    8.0       vpcmpgtb	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 16.    1     0.0    0.0    7.0       vpcmpgtd	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 17.    1     0.0    0.0    7.0       vpcmpgtq	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 18.    1     0.0    0.0    7.0       vpcmpgtw	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 19.    1     7.0    0.0    0.0       psubb	%mm2, %mm2
-# CHECK-NEXT: 20.    1     9.0    0.0    0.0       psubd	%mm2, %mm2
-# CHECK-NEXT: 21.    1     12.0   0.0    0.0       psubq	%mm2, %mm2
-# CHECK-NEXT: 22.    1     15.0   0.0    0.0       psubw	%mm2, %mm2
-# CHECK-NEXT: 23.    1     0.0    0.0    18.0      psubb	%xmm2, %xmm2
-# CHECK-NEXT: 24.    1     0.0    0.0    17.0      psubd	%xmm2, %xmm2
-# CHECK-NEXT: 25.    1     0.0    0.0    17.0      psubq	%xmm2, %xmm2
-# CHECK-NEXT: 26.    1     0.0    0.0    17.0      psubw	%xmm2, %xmm2
-# CHECK-NEXT: 27.    1     0.0    0.0    17.0      vpsubb	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 28.    1     0.0    0.0    16.0      vpsubd	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 29.    1     0.0    0.0    16.0      vpsubq	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 30.    1     0.0    0.0    16.0      vpsubw	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 31.    1     0.0    0.0    16.0      vpsubb	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 32.    1     0.0    0.0    15.0      vpsubd	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 33.    1     0.0    0.0    15.0      vpsubq	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 34.    1     0.0    0.0    15.0      vpsubw	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 35.    1     15.0   0.0    0.0       psubsb	%mm2, %mm2
-# CHECK-NEXT: 36.    1     17.0   0.0    0.0       psubsw	%mm2, %mm2
-# CHECK-NEXT: 37.    1     1.0    1.0    18.0      psubsb	%xmm2, %xmm2
-# CHECK-NEXT: 38.    1     3.0    1.0    16.0      psubsw	%xmm2, %xmm2
-# CHECK-NEXT: 39.    1     1.0    1.0    18.0      vpsubsb	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 40.    1     1.0    0.0    17.0      vpsubsw	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 41.    1     2.0    0.0    16.0      vpsubsb	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 42.    1     3.0    1.0    15.0      vpsubsw	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 43.    1     19.0   0.0    0.0       psubusb	%mm2, %mm2
-# CHECK-NEXT: 44.    1     21.0   0.0    0.0       psubusw	%mm2, %mm2
-# CHECK-NEXT: 45.    1     2.0    0.0    21.0      psubusb	%xmm2, %xmm2
-# CHECK-NEXT: 46.    1     3.0    0.0    20.0      psubusw	%xmm2, %xmm2
-# CHECK-NEXT: 47.    1     4.0    3.0    19.0      vpsubusb	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 48.    1     4.0    0.0    18.0      vpsubusw	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 49.    1     5.0    0.0    17.0      vpsubsb	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 50.    1     6.0    1.0    16.0      vpsubsw	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 51.    1     4.0    4.0    18.0      andnps	%xmm0, %xmm0
-# CHECK-NEXT: 52.    1     5.0    5.0    16.0      andnpd	%xmm1, %xmm1
-# CHECK-NEXT: 53.    1     7.0    5.0    14.0      vandnps	%xmm2, %xmm2, %xmm2
-# CHECK-NEXT: 54.    1     6.0    0.0    15.0      vandnpd	%xmm1, %xmm1, %xmm1
-# CHECK-NEXT: 55.    1     22.0   0.0    0.0       pandn	%mm2, %mm2
-# CHECK-NEXT: 56.    1     7.0    0.0    14.0      pandn	%xmm2, %xmm2
-# CHECK-NEXT: 57.    1     3.0    0.0    18.0      vpandn	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 58.    1     8.0    0.0    13.0      vandnps	%xmm2, %xmm2, %xmm5
-# CHECK-NEXT: 59.    1     7.0    1.0    14.0      vandnpd	%xmm1, %xmm1, %xmm5
-# CHECK-NEXT: 60.    1     3.0    0.0    17.0      vpandn	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 61.    1     2.0    0.0    19.0      xorps	%xmm0, %xmm0
-# CHECK-NEXT: 62.    1     5.0    0.0    16.0      xorpd	%xmm1, %xmm1
-# CHECK-NEXT: 63.    1     7.0    0.0    14.0      vxorps	%xmm2, %xmm2, %xmm2
-# CHECK-NEXT: 64.    1     4.0    0.0    16.0      vxorpd	%xmm1, %xmm1, %xmm1
-# CHECK-NEXT: 65.    1     20.0   0.0    0.0       pxor	%mm2, %mm2
-# CHECK-NEXT: 66.    1     6.0    0.0    15.0      pxor	%xmm2, %xmm2
-# CHECK-NEXT: 67.    1     2.0    0.0    19.0      vpxor	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 68.    1     0.0    0.0    20.0      vxorps	%xmm4, %xmm4, %xmm5
-# CHECK-NEXT: 69.    1     3.0    0.0    17.0      vxorpd	%xmm1, %xmm1, %xmm3
-# CHECK-NEXT: 70.    1     3.0    0.0    17.0      vpxor	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       pcmpgtb	%mm2, %mm2
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       pcmpgtd	%mm2, %mm2
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       pcmpgtw	%mm2, %mm2
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       pcmpgtb	%xmm2, %xmm2
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       pcmpgtd	%xmm2, %xmm2
+# CHECK-NEXT: 9.     1     1.0    1.0    0.0       pcmpgtq	%xmm2, %xmm2
+# CHECK-NEXT: 10.    1     0.0    0.0    3.0       pcmpgtw	%xmm2, %xmm2
+# CHECK-NEXT: 11.    1     0.0    0.0    3.0       vpcmpgtb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 12.    1     0.0    0.0    2.0       vpcmpgtd	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 13.    1     1.0    1.0    0.0       vpcmpgtq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 14.    1     0.0    0.0    3.0       vpcmpgtw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 15.    1     0.0    0.0    3.0       vpcmpgtb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 16.    1     0.0    0.0    2.0       vpcmpgtd	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 17.    1     1.0    1.0    0.0       vpcmpgtq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 18.    1     0.0    0.0    3.0       vpcmpgtw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 19.    1     0.0    0.0    3.0       psubb	%mm2, %mm2
+# CHECK-NEXT: 20.    1     0.0    0.0    2.0       psubd	%mm2, %mm2
+# CHECK-NEXT: 21.    1     0.0    0.0    3.0       psubq	%mm2, %mm2
+# CHECK-NEXT: 22.    1     0.0    0.0    3.0       psubw	%mm2, %mm2
+# CHECK-NEXT: 23.    1     0.0    0.0    3.0       psubb	%xmm2, %xmm2
+# CHECK-NEXT: 24.    1     0.0    0.0    2.0       psubd	%xmm2, %xmm2
+# CHECK-NEXT: 25.    1     0.0    0.0    3.0       psubq	%xmm2, %xmm2
+# CHECK-NEXT: 26.    1     0.0    0.0    3.0       psubw	%xmm2, %xmm2
+# CHECK-NEXT: 27.    1     0.0    0.0    3.0       vpsubb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 28.    1     0.0    0.0    2.0       vpsubd	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 29.    1     0.0    0.0    3.0       vpsubq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 30.    1     0.0    0.0    3.0       vpsubw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 31.    1     0.0    0.0    3.0       vpsubb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 32.    1     0.0    0.0    2.0       vpsubd	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 33.    1     0.0    0.0    3.0       vpsubq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 34.    1     0.0    0.0    3.0       vpsubw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 35.    1     1.0    1.0    0.0       psubsb	%mm2, %mm2
+# CHECK-NEXT: 36.    1     1.0    1.0    0.0       psubsw	%mm2, %mm2
+# CHECK-NEXT: 37.    1     1.0    1.0    0.0       psubsb	%xmm2, %xmm2
+# CHECK-NEXT: 38.    1     2.0    2.0    0.0       psubsw	%xmm2, %xmm2
+# CHECK-NEXT: 39.    1     2.0    2.0    0.0       vpsubsb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 40.    1     2.0    2.0    0.0       vpsubsw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 41.    1     2.0    2.0    0.0       vpsubsb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 42.    1     3.0    3.0    0.0       vpsubsw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 43.    1     3.0    3.0    0.0       psubusb	%mm2, %mm2
+# CHECK-NEXT: 44.    1     3.0    3.0    0.0       psubusw	%mm2, %mm2
+# CHECK-NEXT: 45.    1     3.0    3.0    0.0       psubusb	%xmm2, %xmm2
+# CHECK-NEXT: 46.    1     4.0    4.0    0.0       psubusw	%xmm2, %xmm2
+# CHECK-NEXT: 47.    1     4.0    4.0    0.0       vpsubusb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 48.    1     4.0    4.0    0.0       vpsubusw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 49.    1     4.0    4.0    0.0       vpsubsb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 50.    1     5.0    5.0    0.0       vpsubsw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 51.    1     0.0    0.0    7.0       andnps	%xmm0, %xmm0
+# CHECK-NEXT: 52.    1     0.0    0.0    6.0       andnpd	%xmm1, %xmm1
+# CHECK-NEXT: 53.    1     0.0    0.0    6.0       vandnps	%xmm2, %xmm2, %xmm2
+# CHECK-NEXT: 54.    1     0.0    0.0    7.0       vandnpd	%xmm1, %xmm1, %xmm1
+# CHECK-NEXT: 55.    1     0.0    0.0    7.0       pandn	%mm2, %mm2
+# CHECK-NEXT: 56.    1     0.0    0.0    6.0       pandn	%xmm2, %xmm2
+# CHECK-NEXT: 57.    1     0.0    0.0    6.0       vpandn	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 58.    1     0.0    0.0    7.0       vandnps	%xmm2, %xmm2, %xmm5
+# CHECK-NEXT: 59.    1     0.0    0.0    7.0       vandnpd	%xmm1, %xmm1, %xmm5
+# CHECK-NEXT: 60.    1     0.0    0.0    6.0       vpandn	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 61.    1     0.0    0.0    6.0       xorps	%xmm0, %xmm0
+# CHECK-NEXT: 62.    1     0.0    0.0    7.0       xorpd	%xmm1, %xmm1
+# CHECK-NEXT: 63.    1     0.0    0.0    7.0       vxorps	%xmm2, %xmm2, %xmm2
+# CHECK-NEXT: 64.    1     0.0    0.0    6.0       vxorpd	%xmm1, %xmm1, %xmm1
+# CHECK-NEXT: 65.    1     0.0    0.0    6.0       pxor	%mm2, %mm2
+# CHECK-NEXT: 66.    1     0.0    0.0    7.0       pxor	%xmm2, %xmm2
+# CHECK-NEXT: 67.    1     0.0    0.0    7.0       vpxor	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 68.    1     0.0    0.0    6.0       vxorps	%xmm4, %xmm4, %xmm5
+# CHECK-NEXT: 69.    1     0.0    0.0    6.0       vxorpd	%xmm1, %xmm1, %xmm3
+# CHECK-NEXT: 70.    1     0.0    0.0    7.0       vpxor	%xmm3, %xmm3, %xmm5
diff --git a/test/tools/llvm-mca/X86/bextr-read-after-ld.s b/test/tools/llvm-mca/X86/bextr-read-after-ld.s
index c356fe7976b..4c1c38f2d10 100644
--- a/test/tools/llvm-mca/X86/bextr-read-after-ld.s
+++ b/test/tools/llvm-mca/X86/bextr-read-after-ld.s
@@ -2,7 +2,7 @@
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=HASWELL
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=broadwell -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=BDWELL
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=SKYLAKE
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=BDVER2
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=BDVER2
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=BTVER2
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=znver1 -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=ZNVER1
 
@@ -12,8 +12,8 @@ bextrl	%esi, (%rdi), %eax
 # ALL:          Iterations:        1
 # ALL-NEXT:     Instructions:      2
 
-# BDVER2-NEXT:  Total Cycles:      10
-# BDVER2-NEXT:  Total uOps:        4
+# BDVER2-NEXT:  Total Cycles:      9
+# BDVER2-NEXT:  Total uOps:        3
 
 # BDWELL-NEXT:  Total Cycles:      10
 # BDWELL-NEXT:  Total uOps:        4
@@ -31,8 +31,8 @@ bextrl	%esi, (%rdi), %eax
 # ZNVER1-NEXT:  Total uOps:        3
 
 # BDVER2:       Dispatch Width:    4
-# BDVER2-NEXT:  uOps Per Cycle:    0.40
-# BDVER2-NEXT:  IPC:               0.20
+# BDVER2-NEXT:  uOps Per Cycle:    0.33
+# BDVER2-NEXT:  IPC:               0.22
 # BDVER2-NEXT:  Block RThroughput: 1.0
 
 # BDWELL:       Dispatch Width:    4
@@ -70,8 +70,8 @@ bextrl	%esi, (%rdi), %eax
 
 # ALL:          [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 
-# BDVER2-NEXT:   1      1     0.33                        addl	%edi, %esi
-# BDVER2-NEXT:   3      7     1.00    *                   bextrl	%esi, (%rdi), %eax
+# BDVER2-NEXT:   1      1     0.50                        addl	%edi, %esi
+# BDVER2-NEXT:   2      6     0.50    *                   bextrl	%esi, (%rdi), %eax
 
 # BDWELL-NEXT:   1      1     0.25                        addl	%edi, %esi
 # BDWELL-NEXT:   3      7     0.50    *                   bextrl	%esi, (%rdi), %eax
@@ -90,15 +90,15 @@ bextrl	%esi, (%rdi), %eax
 
 # ALL:          Timeline view:
 
-# BDVER2-NEXT:  Index     0123456789
+# BDVER2-NEXT:  Index     012345678
 # BDWELL-NEXT:  Index     0123456789
 # BTVER2-NEXT:  Index     0123456
 # HASWELL-NEXT: Index     0123456789
 # SKYLAKE-NEXT: Index     0123456789
 # ZNVER1-NEXT:  Index     01234567
 
-# BDVER2:       [0,0]     DeER .   .   addl	%edi, %esi
-# BDVER2-NEXT:  [0,1]     DeeeeeeeER   bextrl	%esi, (%rdi), %eax
+# BDVER2:       [0,0]     DeER .  .   addl	%edi, %esi
+# BDVER2-NEXT:  [0,1]     DeeeeeeER   bextrl	%esi, (%rdi), %eax
 
 # BDWELL:       [0,0]     DeER .   .   addl	%edi, %esi
 # BDWELL-NEXT:  [0,1]     DeeeeeeeER   bextrl	%esi, (%rdi), %eax
diff --git a/test/tools/llvm-mca/X86/cpus.s b/test/tools/llvm-mca/X86/cpus.s
index e666307d1a0..49169f1a6fe 100644
--- a/test/tools/llvm-mca/X86/cpus.s
+++ b/test/tools/llvm-mca/X86/cpus.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -resource-pressure=false -instruction-info=false < %s | FileCheck --check-prefix=ALL --check-prefix=BDVER2 %s
+# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -resource-pressure=false -instruction-info=false < %s | FileCheck --check-prefix=ALL --check-prefix=BDVER2 %s
 # RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 -resource-pressure=false -instruction-info=false < %s | FileCheck --check-prefix=ALL --check-prefix=BTVER2 %s
 # RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=znver1 -resource-pressure=false -instruction-info=false < %s | FileCheck --check-prefix=ALL --check-prefix=ZNVER1 %s
 # RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge -resource-pressure=false -instruction-info=false < %s | FileCheck --check-prefix=ALL --check-prefix=SANDYBRIDGE %s
@@ -21,7 +21,7 @@ add %edi, %eax
 # BDVER2:           Dispatch Width:    4
 # BDVER2-NEXT:      uOps Per Cycle:    0.97
 # BDVER2-NEXT:      IPC:               0.97
-# BDVER2-NEXT:      Block RThroughput: 0.3
+# BDVER2-NEXT:      Block RThroughput: 0.5
 
 # BROADWELL:        Dispatch Width:    4
 # BROADWELL-NEXT:   uOps Per Cycle:    0.97
diff --git a/test/tools/llvm-mca/X86/read-after-ld-1.s b/test/tools/llvm-mca/X86/read-after-ld-1.s
index 6c68ad13116..0820fbc7c9b 100644
--- a/test/tools/llvm-mca/X86/read-after-ld-1.s
+++ b/test/tools/llvm-mca/X86/read-after-ld-1.s
@@ -3,7 +3,7 @@
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1 -resource-pressure=false -instruction-info=false -timeline < %s | FileCheck %s -check-prefix=ALL -check-prefix=HASWELL
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=broadwell -iterations=1 -resource-pressure=false -instruction-info=false -timeline < %s | FileCheck %s -check-prefix=ALL -check-prefix=BDWELL
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake -iterations=1 -resource-pressure=false -instruction-info=false -timeline < %s | FileCheck %s -check-prefix=ALL -check-prefix=SKYLAKE
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1 -resource-pressure=false -instruction-info=false -timeline < %s | FileCheck %s -check-prefix=ALL -check-prefix=BDVER2
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1 -resource-pressure=false -instruction-info=false -timeline < %s | FileCheck %s -check-prefix=ALL -check-prefix=BDVER2
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1 -resource-pressure=false -instruction-info=false -timeline < %s | FileCheck %s -check-prefix=ALL -check-prefix=BTVER2
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=znver1 -iterations=1 -resource-pressure=false -instruction-info=false -timeline < %s | FileCheck %s -check-prefix=ALL -check-prefix=ZNVER1
 
@@ -13,8 +13,8 @@ vaddps  (%rax), %xmm1, %xmm1
 # ALL:          Iterations:        1
 # ALL-NEXT:     Instructions:      2
 
-# BDVER2-NEXT:  Total Cycles:      20
-# BDVER2-NEXT:  Total uOps:        3
+# BDVER2-NEXT:  Total Cycles:      17
+# BDVER2-NEXT:  Total uOps:        2
 
 # BDWELL-NEXT:  Total Cycles:      17
 # BDWELL-NEXT:  Total uOps:        3
@@ -35,9 +35,9 @@ vaddps  (%rax), %xmm1, %xmm1
 # ZNVER1-NEXT:  Total uOps:        2
 
 # BDVER2:       Dispatch Width:    4
-# BDVER2-NEXT:  uOps Per Cycle:    0.15
-# BDVER2-NEXT:  IPC:               0.10
-# BDVER2-NEXT:  Block RThroughput: 14.0
+# BDVER2-NEXT:  uOps Per Cycle:    0.12
+# BDVER2-NEXT:  IPC:               0.12
+# BDVER2-NEXT:  Block RThroughput: 10.0
 
 # BDWELL:       Dispatch Width:    4
 # BDWELL-NEXT:  uOps Per Cycle:    0.18
@@ -71,7 +71,7 @@ vaddps  (%rax), %xmm1, %xmm1
 
 # ALL:          Timeline view:
 
-# BDVER2-NEXT:                      0123456789
+# BDVER2-NEXT:                      0123456
 # BDVER2-NEXT:  Index     0123456789
 
 # BDWELL-NEXT:                      0123456
@@ -92,8 +92,8 @@ vaddps  (%rax), %xmm1, %xmm1
 # ZNVER1-NEXT:                      0123456789
 # ZNVER1-NEXT:  Index     0123456789
 
-# BDVER2:       [0,0]     DeeeeeeeeeeeeeeER  .   vdivps	%xmm0, %xmm1, %xmm1
-# BDVER2-NEXT:  [0,1]     D========eeeeeeeeeER   vaddps	(%rax), %xmm1, %xmm1
+# BDVER2:       [0,0]     DeeeeeeeeeER   ..   vdivps	%xmm0, %xmm1, %xmm1
+# BDVER2-NEXT:  [0,1]     D====eeeeeeeeeeER   vaddps	(%rax), %xmm1, %xmm1
 
 # BDWELL:       [0,0]     DeeeeeeeeeeeER ..   vdivps	%xmm0, %xmm1, %xmm1
 # BDWELL-NEXT:  [0,1]     D======eeeeeeeeER   vaddps	(%rax), %xmm1, %xmm1
@@ -122,7 +122,7 @@ vaddps  (%rax), %xmm1, %xmm1
 # ALL:                [0]    [1]    [2]    [3]
 # ALL-NEXT:     0.     1     1.0    1.0    0.0       vdivps	%xmm0, %xmm1, %xmm1
 
-# BDVER2-NEXT:  1.     1     9.0    0.0    0.0       vaddps	(%rax), %xmm1, %xmm1
+# BDVER2-NEXT:  1.     1     5.0    0.0    0.0       vaddps	(%rax), %xmm1, %xmm1
 # BDWELL-NEXT:  1.     1     7.0    0.0    0.0       vaddps	(%rax), %xmm1, %xmm1
 # BTVER2-NEXT:  1.     1     15.0   0.0    0.0       vaddps	(%rax), %xmm1, %xmm1
 # HASWELL-NEXT: 1.     1     8.0    0.0    0.0       vaddps	(%rax), %xmm1, %xmm1
diff --git a/test/tools/llvm-mca/X86/scheduler-queue-usage.s b/test/tools/llvm-mca/X86/scheduler-queue-usage.s
index d99a76bf833..a1854a28219 100644
--- a/test/tools/llvm-mca/X86/scheduler-queue-usage.s
+++ b/test/tools/llvm-mca/X86/scheduler-queue-usage.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1 -all-stats=false -all-views=false -scheduler-stats < %s | FileCheck --check-prefixes=ALL,BDVER2 %s
+# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1 -all-stats=false -all-views=false -scheduler-stats < %s | FileCheck --check-prefixes=ALL,BDVER2 %s
 # RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1 -all-stats=false -all-views=false -scheduler-stats < %s | FileCheck --check-prefixes=ALL,BTVER2 %s
 # RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=znver1 -iterations=1 -all-stats=false -all-views=false -scheduler-stats < %s | FileCheck --check-prefixes=ALL,ZNVER1 %s
 # RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge -iterations=1 -all-stats=false -all-views=false -scheduler-stats < %s | FileCheck --check-prefixes=ALL,SNB %s
@@ -82,7 +82,10 @@ xor %eax, %ebx
 # ZNVER1-NEXT:     [4] Total number of buffer entries.
 
 # BDVER2:           [1]            [2]        [3]        [4]
-# BDVER2-NEXT:     SBPortAny        0          1          54
+# BDVER2-NEXT:     PdEX             0          1          40
+# BDVER2-NEXT:     PdFPU            0          0          64
+# BDVER2-NEXT:     PdLoad           0          0          40
+# BDVER2-NEXT:     PdStore          0          0          24
 
 # BDW:              [1]            [2]        [3]        [4]
 # BDW-NEXT:        BWPortAny        0          1          60
diff --git a/test/tools/llvm-mca/X86/sqrt-rsqrt-rcp-memop.s b/test/tools/llvm-mca/X86/sqrt-rsqrt-rcp-memop.s
index e4531c990f0..aa1bc886c10 100644
--- a/test/tools/llvm-mca/X86/sqrt-rsqrt-rcp-memop.s
+++ b/test/tools/llvm-mca/X86/sqrt-rsqrt-rcp-memop.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# ZZZ: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1 -all-views=false -timeline < %s | FileCheck %s -check-prefix=ALL -check-prefix=BDVER2
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1 -all-views=false -timeline < %s | FileCheck %s -check-prefix=ALL -check-prefix=BDVER2
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1 -all-views=false -timeline < %s | FileCheck %s -check-prefix=ALL -check-prefix=BTVER2
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=znver1 -iterations=1 -all-views=false -timeline < %s | FileCheck %s -check-prefix=ALL -check-prefix=ZNVER1
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1 -all-views=false -timeline < %s | FileCheck %s -check-prefix=ALL -check-prefix=HASWELL
@@ -30,6 +30,9 @@ rcpss (%rax), %xmm1
 
 # ALL:            Timeline view:
 
+# BDVER2-NEXT:                        01234567
+# BDVER2-NEXT:    Index     0123456789
+
 # BROADWELL-NEXT:                     0123456789
 # BROADWELL-NEXT: Index     0123456789
 
@@ -45,6 +48,9 @@ rcpss (%rax), %xmm1
 # ZNVER1-NEXT:                        0123456789          0
 # ZNVER1-NEXT:    Index     0123456789          0123456789
 
+# BDVER2:         [0,0]     DeER .    .    . .   leaq	8(%rsp,%rdi,2), %rax
+# BDVER2-NEXT:    [0,1]     D=eeeeeeeeeeeeeeER   sqrtss	(%rax), %xmm1
+
 # BROADWELL:      [0,0]     DeER .    .    .   .   leaq	8(%rsp,%rdi,2), %rax
 # BROADWELL-NEXT: [0,1]     D=eeeeeeeeeeeeeeeeER   sqrtss	(%rax), %xmm1
 
@@ -69,6 +75,7 @@ rcpss (%rax), %xmm1
 # ALL:                  [0]    [1]    [2]    [3]
 # ALL-NEXT:       0.     1     1.0    1.0    0.0       leaq	8(%rsp,%rdi,2), %rax
 
+# BDVER2-NEXT:    1.     1     2.0    0.0    0.0       sqrtss	(%rax), %xmm1
 # BROADWELL-NEXT: 1.     1     2.0    0.0    0.0       sqrtss	(%rax), %xmm1
 # BTVER2-NEXT:    1.     1     3.0    0.0    0.0       sqrtss	(%rax), %xmm1
 # HASWELL-NEXT:   1.     1     2.0    0.0    0.0       sqrtss	(%rax), %xmm1
@@ -79,6 +86,9 @@ rcpss (%rax), %xmm1
 
 # ALL:            Timeline view:
 
+# BDVER2-NEXT:                        01234567
+# BDVER2-NEXT:    Index     0123456789
+
 # BROADWELL-NEXT:                     0123456789
 # BROADWELL-NEXT: Index     0123456789          01234
 
@@ -94,6 +104,9 @@ rcpss (%rax), %xmm1
 # ZNVER1-NEXT:                        0123456789          0
 # ZNVER1-NEXT:    Index     0123456789          0123456789
 
+# BDVER2:         [0,0]     DeER .    .    . .   leaq	8(%rsp,%rdi,2), %rax
+# BDVER2-NEXT:    [0,1]     D=eeeeeeeeeeeeeeER   sqrtsd	(%rax), %xmm1
+
 # BROADWELL:      [0,0]     DeER .    .    .    .   .   leaq	8(%rsp,%rdi,2), %rax
 # BROADWELL-NEXT: [0,1]     D=eeeeeeeeeeeeeeeeeeeeeER   sqrtsd	(%rax), %xmm1
 
@@ -118,6 +131,7 @@ rcpss (%rax), %xmm1
 # ALL:                  [0]    [1]    [2]    [3]
 # ALL-NEXT:       0.     1     1.0    1.0    0.0       leaq	8(%rsp,%rdi,2), %rax
 
+# BDVER2-NEXT:    1.     1     2.0    0.0    0.0       sqrtsd	(%rax), %xmm1
 # BROADWELL-NEXT: 1.     1     2.0    0.0    0.0       sqrtsd	(%rax), %xmm1
 # BTVER2-NEXT:    1.     1     3.0    0.0    0.0       sqrtsd	(%rax), %xmm1
 # HASWELL-NEXT:   1.     1     2.0    0.0    0.0       sqrtsd	(%rax), %xmm1
@@ -128,6 +142,7 @@ rcpss (%rax), %xmm1
 
 # ALL:            Timeline view:
 
+# BDVER2-NEXT:                        0123
 # BROADWELL-NEXT:                     0123
 # BTVER2-NEXT:                        01
 # HASWELL-NEXT:                       0123
@@ -136,6 +151,9 @@ rcpss (%rax), %xmm1
 
 # ALL-NEXT:       Index     0123456789
 
+# BDVER2:         [0,0]     DeER .    .  .   leaq	8(%rsp,%rdi,2), %rax
+# BDVER2-NEXT:    [0,1]     D=eeeeeeeeeeER   rsqrtss	(%rax), %xmm1
+
 # BROADWELL:      [0,0]     DeER .    .  .   leaq	8(%rsp,%rdi,2), %rax
 # BROADWELL-NEXT: [0,1]     D=eeeeeeeeeeER   rsqrtss	(%rax), %xmm1
 
@@ -160,6 +178,7 @@ rcpss (%rax), %xmm1
 # ALL:                  [0]    [1]    [2]    [3]
 # ALL-NEXT:       0.     1     1.0    1.0    0.0       leaq	8(%rsp,%rdi,2), %rax
 
+# BDVER2-NEXT:    1.     1     2.0    0.0    0.0       rsqrtss	(%rax), %xmm1
 # BROADWELL-NEXT: 1.     1     2.0    0.0    0.0       rsqrtss	(%rax), %xmm1
 # BTVER2-NEXT:    1.     1     3.0    0.0    0.0       rsqrtss	(%rax), %xmm1
 # HASWELL-NEXT:   1.     1     2.0    0.0    0.0       rsqrtss	(%rax), %xmm1
@@ -170,6 +189,7 @@ rcpss (%rax), %xmm1
 
 # ALL:            Timeline view:
 
+# BDVER2-NEXT:                        0123
 # BROADWELL-NEXT:                     0123
 # BTVER2-NEXT:                        01
 # HASWELL-NEXT:                       0123
@@ -178,6 +198,9 @@ rcpss (%rax), %xmm1
 
 # ALL-NEXT:       Index     0123456789
 
+# BDVER2:         [0,0]     DeER .    .  .   leaq	8(%rsp,%rdi,2), %rax
+# BDVER2-NEXT:    [0,1]     D=eeeeeeeeeeER   rcpss	(%rax), %xmm1
+
 # BROADWELL:      [0,0]     DeER .    .  .   leaq	8(%rsp,%rdi,2), %rax
 # BROADWELL-NEXT: [0,1]     D=eeeeeeeeeeER   rcpss	(%rax), %xmm1
 
@@ -202,6 +225,7 @@ rcpss (%rax), %xmm1
 # ALL:                  [0]    [1]    [2]    [3]
 # ALL-NEXT:       0.     1     1.0    1.0    0.0       leaq	8(%rsp,%rdi,2), %rax
 
+# BDVER2-NEXT:    1.     1     2.0    0.0    0.0       rcpss	(%rax), %xmm1
 # BROADWELL-NEXT: 1.     1     2.0    0.0    0.0       rcpss	(%rax), %xmm1
 # BTVER2-NEXT:    1.     1     3.0    0.0    0.0       rcpss	(%rax), %xmm1
 # HASWELL-NEXT:   1.     1     2.0    0.0    0.0       rcpss	(%rax), %xmm1
diff --git a/test/tools/llvm-mca/X86/variable-blend-read-after-ld-1.s b/test/tools/llvm-mca/X86/variable-blend-read-after-ld-1.s
index c2e28922e3a..2a6022c9367 100644
--- a/test/tools/llvm-mca/X86/variable-blend-read-after-ld-1.s
+++ b/test/tools/llvm-mca/X86/variable-blend-read-after-ld-1.s
@@ -9,7 +9,7 @@
 
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake -iterations=1 -timeline -instruction-info=false -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=SKYLAKE
 
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1 -timeline -instruction-info=false -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=BDVER2
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1 -timeline -instruction-info=false -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=BDVER2
 
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1 -timeline -instruction-info=false -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=BTVER2
 
@@ -21,8 +21,8 @@ vblendvps %xmm1, (%rdi), %xmm2, %xmm3
 # ALL:          Iterations:        1
 # ALL-NEXT:     Instructions:      2
 
-# BDVER2-NEXT:  Total Cycles:      11
-# BDVER2-NEXT:  Total uOps:        4
+# BDVER2-NEXT:  Total Cycles:      10
+# BDVER2-NEXT:  Total uOps:        2
 
 # BDWELL-NEXT:  Total Cycles:      10
 # BDWELL-NEXT:  Total uOps:        4
@@ -46,9 +46,9 @@ vblendvps %xmm1, (%rdi), %xmm2, %xmm3
 # ZNVER1-NEXT:  Total uOps:        2
 
 # BDVER2:       Dispatch Width:    4
-# BDVER2-NEXT:  uOps Per Cycle:    0.36
-# BDVER2-NEXT:  IPC:               0.18
-# BDVER2-NEXT:  Block RThroughput: 1.0
+# BDVER2-NEXT:  uOps Per Cycle:    0.20
+# BDVER2-NEXT:  IPC:               0.20
+# BDVER2-NEXT:  Block RThroughput: 2.5
 
 # BDWELL:       Dispatch Width:    4
 # BDWELL-NEXT:  uOps Per Cycle:    0.40
@@ -86,7 +86,6 @@ vblendvps %xmm1, (%rdi), %xmm2, %xmm3
 # ZNVER1-NEXT:  Block RThroughput: 1.0
 
 # BDVER2:       Timeline view:
-# BDVER2-NEXT:                      0
 # BDVER2-NEXT:  Index     0123456789
 
 # BDWELL:       Timeline view:
@@ -116,8 +115,8 @@ vblendvps %xmm1, (%rdi), %xmm2, %xmm3
 # ZNVER1-NEXT:                      0
 # ZNVER1-NEXT:  Index     0123456789
 
-# BDVER2:       [0,0]     DeeeER    .   vaddps	%xmm0, %xmm0, %xmm1
-# BDVER2-NEXT:  [0,1]     DeeeeeeeeER   vblendvps	%xmm1, (%rdi), %xmm2, %xmm3
+# BDVER2:       [0,0]     DeeeeeER .   vaddps	%xmm0, %xmm0, %xmm1
+# BDVER2-NEXT:  [0,1]     DeeeeeeeER   vblendvps	%xmm1, (%rdi), %xmm2, %xmm3
 
 # BDWELL:       [0,0]     DeeeER   .   vaddps	%xmm0, %xmm0, %xmm1
 # BDWELL-NEXT:  [0,1]     DeeeeeeeER   vblendvps	%xmm1, (%rdi), %xmm2, %xmm3
diff --git a/test/tools/llvm-mca/X86/variable-blend-read-after-ld-2.s b/test/tools/llvm-mca/X86/variable-blend-read-after-ld-2.s
index 0aa71425e94..e4bc9048eb8 100644
--- a/test/tools/llvm-mca/X86/variable-blend-read-after-ld-2.s
+++ b/test/tools/llvm-mca/X86/variable-blend-read-after-ld-2.s
@@ -9,7 +9,7 @@
 
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake -iterations=1 -timeline -instruction-info=false -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=SKYLAKE
 
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1 -timeline -instruction-info=false -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=BDVER2
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1 -timeline -instruction-info=false -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=BDVER2
 
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1 -timeline -instruction-info=false -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=BTVER2
 
@@ -21,8 +21,8 @@ vblendvps %xmm1, (%rdi), %xmm2, %xmm3
 # ALL:          Iterations:        1
 # ALL-NEXT:     Instructions:      2
 
-# BDVER2-NEXT:  Total Cycles:      11
-# BDVER2-NEXT:  Total uOps:        4
+# BDVER2-NEXT:  Total Cycles:      10
+# BDVER2-NEXT:  Total uOps:        2
 
 # BDWELL-NEXT:  Total Cycles:      10
 # BDWELL-NEXT:  Total uOps:        4
@@ -46,9 +46,9 @@ vblendvps %xmm1, (%rdi), %xmm2, %xmm3
 # ZNVER1-NEXT:  Total uOps:        2
 
 # BDVER2:       Dispatch Width:    4
-# BDVER2-NEXT:  uOps Per Cycle:    0.36
-# BDVER2-NEXT:  IPC:               0.18
-# BDVER2-NEXT:  Block RThroughput: 1.0
+# BDVER2-NEXT:  uOps Per Cycle:    0.20
+# BDVER2-NEXT:  IPC:               0.20
+# BDVER2-NEXT:  Block RThroughput: 2.5
 
 # BDWELL:       Dispatch Width:    4
 # BDWELL-NEXT:  uOps Per Cycle:    0.40
@@ -86,7 +86,6 @@ vblendvps %xmm1, (%rdi), %xmm2, %xmm3
 # ZNVER1-NEXT:  Block RThroughput: 1.0
 
 # BDVER2:       Timeline view:
-# BDVER2-NEXT:                      0
 # BDVER2-NEXT:  Index     0123456789
 
 # BDWELL:       Timeline view:
@@ -116,8 +115,8 @@ vblendvps %xmm1, (%rdi), %xmm2, %xmm3
 # ZNVER1-NEXT:                      0
 # ZNVER1-NEXT:  Index     0123456789
 
-# BDVER2:       [0,0]     DeeeER    .   vaddps	%xmm0, %xmm0, %xmm2
-# BDVER2-NEXT:  [0,1]     DeeeeeeeeER   vblendvps	%xmm1, (%rdi), %xmm2, %xmm3
+# BDVER2:       [0,0]     DeeeeeER .   vaddps	%xmm0, %xmm0, %xmm2
+# BDVER2-NEXT:  [0,1]     DeeeeeeeER   vblendvps	%xmm1, (%rdi), %xmm2, %xmm3
 
 # BDWELL:       [0,0]     DeeeER   .   vaddps	%xmm0, %xmm0, %xmm2
 # BDWELL-NEXT:  [0,1]     DeeeeeeeER   vblendvps	%xmm1, (%rdi), %xmm2, %xmm3
-- 
GitLab


From a8bdd2f238893d4fa89d55fb8b420851215b0865 Mon Sep 17 00:00:00 2001
From: Renato Golin <renato.golin@linaro.org>
Date: Sat, 27 Oct 2018 22:13:43 +0000
Subject: [PATCH 0678/1116] Revert r344172: [LV] Add a new reduction pattern
 match

This patch has caused fast-math issues in the reduction pattern.

Will re-work and land again.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345465 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/IVDescriptors.h         |   7 +-
 lib/Analysis/IVDescriptors.cpp                |  71 +-
 test/Transforms/LoopVectorize/if-reduction.ll | 666 ------------------
 3 files changed, 7 insertions(+), 737 deletions(-)
 delete mode 100644 test/Transforms/LoopVectorize/if-reduction.ll

diff --git a/include/llvm/Analysis/IVDescriptors.h b/include/llvm/Analysis/IVDescriptors.h
index 64b4ae23cc5..d1d7e5ef022 100644
--- a/include/llvm/Analysis/IVDescriptors.h
+++ b/include/llvm/Analysis/IVDescriptors.h
@@ -140,8 +140,7 @@ public:
 
   /// Returns true if instruction I has multiple uses in Insts
   static bool hasMultipleUsesOf(Instruction *I,
-                                SmallPtrSetImpl<Instruction *> &Insts,
-                                unsigned MaxNumUses);
+                                SmallPtrSetImpl<Instruction *> &Insts);
 
   /// Returns true if all uses of the instruction I is within the Set.
   static bool areAllUsesIn(Instruction *I, SmallPtrSetImpl<Instruction *> &Set);
@@ -151,10 +150,6 @@ public:
   /// or max(X, Y).
   static InstDesc isMinMaxSelectCmpPattern(Instruction *I, InstDesc &Prev);
 
-  /// Returns a struct describing if the instruction is a
-  /// Select(FCmp(X, Y), (Z = X op PHINode), PHINode) instruction pattern.
-  static InstDesc isConditionalRdxPattern(RecurrenceKind Kind, Instruction *I);
-
   /// Returns identity corresponding to the RecurrenceKind.
   static Constant *getRecurrenceIdentity(RecurrenceKind K, Type *Tp);
 
diff --git a/lib/Analysis/IVDescriptors.cpp b/lib/Analysis/IVDescriptors.cpp
index 47bddf68f49..854a95573e9 100644
--- a/lib/Analysis/IVDescriptors.cpp
+++ b/lib/Analysis/IVDescriptors.cpp
@@ -299,17 +299,9 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind,
         return false;
     }
 
-    bool IsASelect = isa<SelectInst>(Cur);
-
-    // A conditional reduction operation must only have 2 or less uses in
-    // VisitedInsts.
-    if (IsASelect && (Kind == RK_FloatAdd || Kind == RK_FloatMult) &&
-        hasMultipleUsesOf(Cur, VisitedInsts, 2))
-      return false;
-
     // A reduction operation must only have one use of the reduction value.
-    if (!IsAPhi && !IsASelect && Kind != RK_IntegerMinMax &&
-        Kind != RK_FloatMinMax && hasMultipleUsesOf(Cur, VisitedInsts, 1))
+    if (!IsAPhi && Kind != RK_IntegerMinMax && Kind != RK_FloatMinMax &&
+        hasMultipleUsesOf(Cur, VisitedInsts))
       return false;
 
     // All inputs to a PHI node must be a reduction value.
@@ -370,8 +362,7 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind,
       } else if (!isa<PHINode>(UI) &&
                  ((!isa<FCmpInst>(UI) && !isa<ICmpInst>(UI) &&
                    !isa<SelectInst>(UI)) ||
-                  (!isConditionalRdxPattern(Kind, UI).isRecurrence() &&
-                   !isMinMaxSelectCmpPattern(UI, IgnoredVal).isRecurrence())))
+                  !isMinMaxSelectCmpPattern(UI, IgnoredVal).isRecurrence()))
         return false;
 
       // Remember that we completed the cycle.
@@ -500,52 +491,6 @@ RecurrenceDescriptor::isMinMaxSelectCmpPattern(Instruction *I, InstDesc &Prev) {
   return InstDesc(false, I);
 }
 
-/// Returns true if the select instruction has users in the compare-and-add
-/// reduction pattern below. The select instruction argument is the last one
-/// in the sequence.
-///
-/// %sum.1 = phi ...
-/// ...
-/// %cmp = fcmp pred %0, %CFP
-/// %add = fadd %0, %sum.1
-/// %sum.2 = select %cmp, %add, %sum.1
-RecurrenceDescriptor::InstDesc
-RecurrenceDescriptor::isConditionalRdxPattern(
-    RecurrenceKind Kind, Instruction *I) {
-  SelectInst *SI = dyn_cast<SelectInst>(I);
-  if (!SI)
-    return InstDesc(false, I);
-
-  CmpInst *CI = dyn_cast<CmpInst>(SI->getCondition());
-  // Only handle single use cases for now.
-  if (!CI || !CI->hasOneUse())
-    return InstDesc(false, I);
-
-  Value *TrueVal = SI->getTrueValue();
-  Value *FalseVal = SI->getFalseValue();
-  // Handle only when either of operands of select instruction is a PHI
-  // node for now.
-  if ((isa<PHINode>(*TrueVal) && isa<PHINode>(*FalseVal)) ||
-      (!isa<PHINode>(*TrueVal) && !isa<PHINode>(*FalseVal)))
-    return InstDesc(false, I);
-
-  Instruction *I1 =
-      isa<PHINode>(*TrueVal) ? dyn_cast<Instruction>(FalseVal)
-                             : dyn_cast<Instruction>(TrueVal);
-  if (!I1 || !I1->isBinaryOp())
-    return InstDesc(false, I);
-
-  Value *Op1, *Op2;
-  if (m_FAdd(m_Value(Op1), m_Value(Op2)).match(I1) ||
-      m_FSub(m_Value(Op1), m_Value(Op2)).match(I1))
-    return InstDesc(Kind == RK_FloatAdd, SI);
-
-  if (m_FMul(m_Value(Op1), m_Value(Op2)).match(I1))
-    return InstDesc(Kind == RK_FloatMult, SI);
-
-  return InstDesc(false, I);
-}
-
 RecurrenceDescriptor::InstDesc
 RecurrenceDescriptor::isRecurrenceInstr(Instruction *I, RecurrenceKind Kind,
                                         InstDesc &Prev, bool HasFunNoNaNAttr) {
@@ -575,12 +520,9 @@ RecurrenceDescriptor::isRecurrenceInstr(Instruction *I, RecurrenceKind Kind,
   case Instruction::FSub:
   case Instruction::FAdd:
     return InstDesc(Kind == RK_FloatAdd, I, UAI);
-  case Instruction::Select:
-    if (Kind == RK_FloatAdd || Kind == RK_FloatMult)
-      return isConditionalRdxPattern(Kind, I);
-    LLVM_FALLTHROUGH;
   case Instruction::FCmp:
   case Instruction::ICmp:
+  case Instruction::Select:
     if (Kind != RK_IntegerMinMax &&
         (!HasFunNoNaNAttr || Kind != RK_FloatMinMax))
       return InstDesc(false, I);
@@ -589,14 +531,13 @@ RecurrenceDescriptor::isRecurrenceInstr(Instruction *I, RecurrenceKind Kind,
 }
 
 bool RecurrenceDescriptor::hasMultipleUsesOf(
-    Instruction *I, SmallPtrSetImpl<Instruction *> &Insts,
-    unsigned MaxNumUses) {
+    Instruction *I, SmallPtrSetImpl<Instruction *> &Insts) {
   unsigned NumUses = 0;
   for (User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E;
        ++Use) {
     if (Insts.count(dyn_cast<Instruction>(*Use)))
       ++NumUses;
-    if (NumUses > MaxNumUses)
+    if (NumUses > 1)
       return true;
   }
 
diff --git a/test/Transforms/LoopVectorize/if-reduction.ll b/test/Transforms/LoopVectorize/if-reduction.ll
deleted file mode 100644
index dd9a6118337..00000000000
--- a/test/Transforms/LoopVectorize/if-reduction.ll
+++ /dev/null
@@ -1,666 +0,0 @@
-; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s | FileCheck %s
-
-target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
-
-; Float pattern:
-;   Check vectorization of reduction code which has an fadd instruction after
-;   an fcmp instruction which compares an array element and 0.
-;
-; float fcmp_0_fadd_select1(float * restrict x, const int N) {
-;   float sum = 0.
-;   for (int i = 0; i < N; ++i)
-;     if (x[i] > (float)0.)
-;       sum += x[i];
-;   return sum;
-; }
-
-; CHECK-LABEL: @fcmp_0_fadd_select1(
-; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x float> %[[V0:.*]], zeroinitializer
-; CHECK: %[[V3:.*]] = fadd fast <4 x float> %[[V0]], %[[V2:.*]]
-; CHECK: select <4 x i1> %[[V1]], <4 x float> %[[V3]], <4 x float> %[[V2]]
-define float @fcmp_0_fadd_select1(float* noalias %x, i32 %N) nounwind readonly {
-entry:
-  %cmp.1 = icmp sgt i32 %N, 0
-  br i1 %cmp.1, label %for.header, label %for.end
-
-for.header:                                       ; preds = %entry
-  %zext = zext i32 %N to i64
-  br label %for.body
-
-for.body:                                         ; preds = %header, %for.body
-  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
-  %sum.1 = phi float [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
-  %arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv
-  %0 = load float, float* %arrayidx, align 4
-  %cmp.2 = fcmp fast ogt float %0, 0.000000e+00
-  %add = fadd fast float %0, %sum.1
-  %sum.2 = select i1 %cmp.2, float %add, float %sum.1
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond = icmp eq i64 %indvars.iv.next, %zext
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body, %entry
-  %1 = phi float [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
-  ret float %1
-}
-
-; Double pattern:
-;   Check vectorization of reduction code which has an fadd instruction after
-;   an fcmp instruction which compares an array element and 0.
-;
-; double fcmp_0_fadd_select2(double * restrict x, const int N) {
-;   double sum = 0.
-;   for (int i = 0; i < N; ++i)
-;     if (x[i] > 0.)
-;       sum += x[i];
-;   return sum;
-; }
-
-; CHECK-LABEL: @fcmp_0_fadd_select2(
-; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x double> %[[V0:.*]], zeroinitializer
-; CHECK: %[[V3:.*]] = fadd fast <4 x double> %[[V0]], %[[V2:.*]]
-; CHECK: select <4 x i1> %[[V1]], <4 x double> %[[V3]], <4 x double> %[[V2]]
-define double @fcmp_0_fadd_select2(double* noalias %x, i32 %N) nounwind readonly {
-entry:
-  %cmp.1 = icmp sgt i32 %N, 0
-  br i1 %cmp.1, label %for.header, label %for.end
-
-for.header:                                       ; preds = %entry
-  %zext = zext i32 %N to i64
-  br label %for.body
-
-for.body:                                         ; preds = %header, %for.body
-  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
-  %sum.1 = phi double [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
-  %arrayidx = getelementptr inbounds double, double* %x, i64 %indvars.iv
-  %0 = load double, double* %arrayidx, align 4
-  %cmp.2 = fcmp fast ogt double %0, 0.000000e+00
-  %add = fadd fast double %0, %sum.1
-  %sum.2 = select i1 %cmp.2, double %add, double %sum.1
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond = icmp eq i64 %indvars.iv.next, %zext
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body, %entry
-  %1 = phi double [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
-  ret double %1
-}
-
-; Float pattern:
-;   Check vectorization of reduction code which has an fadd instruction after
-;   an fcmp instruction which compares an array element and a floating-point
-;   value.
-;
-; float fcmp_val_fadd_select1(float * restrict x, float y, const int N) {
-;   float sum = 0.
-;   for (int i = 0; i < N; ++i)
-;     if (x[i] > y)
-;       sum += x[i];
-;   return sum;
-; }
-
-; CHECK-LABEL: @fcmp_val_fadd_select1(
-; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x float> %[[V0:.*]], %broadcast.splat2
-; CHECK: %[[V3:.*]] = fadd fast <4 x float> %[[V0]], %[[V2:.*]]
-; CHECK: select <4 x i1> %[[V1]], <4 x float> %[[V3]], <4 x float> %[[V2]]
-define float @fcmp_val_fadd_select1(float* noalias %x, float %y, i32 %N) nounwind readonly {
-entry:
-  %cmp.1 = icmp sgt i32 %N, 0
-  br i1 %cmp.1, label %for.header, label %for.end
-
-for.header:                                       ; preds = %entry
-  %zext = zext i32 %N to i64
-  br label %for.body
-
-for.body:                                         ; preds = %header, %for.body
-  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
-  %sum.1 = phi float [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
-  %arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv
-  %0 = load float, float* %arrayidx, align 4
-  %cmp.2 = fcmp fast ogt float %0, %y
-  %add = fadd fast float %0, %sum.1
-  %sum.2 = select i1 %cmp.2, float %add, float %sum.1
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond = icmp eq i64 %indvars.iv.next, %zext
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body, %entry
-  %1 = phi float [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
-  ret float %1
-}
-
-; Double pattern:
-;   Check vectorization of reduction code which has an fadd instruction after
-;   an fcmp instruction which compares an array element and a floating-point
-;   value.
-;
-; double fcmp_val_fadd_select2(double * restrict x, double y, const int N) {
-;   double sum = 0.
-;   for (int i = 0; i < N; ++i)
-;     if (x[i] > y)
-;       sum += x[i];
-;   return sum;
-; }
-
-; CHECK-LABEL: @fcmp_val_fadd_select2(
-; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x double> %[[V0:.*]], %broadcast.splat2
-; CHECK: %[[V3:.*]] = fadd fast <4 x double> %[[V0]], %[[V2:.*]]
-; CHECK: select <4 x i1> %[[V1]], <4 x double> %[[V3]], <4 x double> %[[V2]]
-define double @fcmp_val_fadd_select2(double* noalias %x, double %y, i32 %N) nounwind readonly {
-entry:
-  %cmp.1 = icmp sgt i32 %N, 0
-  br i1 %cmp.1, label %for.header, label %for.end
-
-for.header:                                       ; preds = %entry
-  %zext = zext i32 %N to i64
-  br label %for.body
-
-for.body:                                         ; preds = %header, %for.body
-  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
-  %sum.1 = phi double [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
-  %arrayidx = getelementptr inbounds double, double* %x, i64 %indvars.iv
-  %0 = load double, double* %arrayidx, align 4
-  %cmp.2 = fcmp fast ogt double %0, %y
-  %add = fadd fast double %0, %sum.1
-  %sum.2 = select i1 %cmp.2, double %add, double %sum.1
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond = icmp eq i64 %indvars.iv.next, %zext
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body, %entry
-  %1 = phi double [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
-  ret double %1
-}
-
-; Float pattern:
-;   Check vectorization of reduction code which has an fadd instruction after
-;   an fcmp instruction which compares an array element and another array
-;   element.
-;
-; float fcmp_array_elm_fadd_select1(float * restrict x, float * restrict y,
-;                                   const int N) {
-;   float sum = 0.
-;   for (int i = 0; i < N; ++i)
-;     if (x[i] > y[i])
-;       sum += x[i];
-;   return sum;
-; }
-
-; CHECK-LABEL: @fcmp_array_elm_fadd_select1(
-; CHECK: %[[V2:.*]] = fcmp fast ogt <4 x float> %[[V0:.*]], %[[V1:.*]]
-; CHECK: %[[V4:.*]] = fadd fast <4 x float> %[[V0]], %[[V3:.*]]
-; CHECK: select <4 x i1> %[[V2]], <4 x float> %[[V4]], <4 x float> %[[V3]]
-define float @fcmp_array_elm_fadd_select1(float* noalias %x, float* noalias %y, i32 %N) nounwind readonly {
-entry:
-  %cmp.1 = icmp sgt i32 %N, 0
-  br i1 %cmp.1, label %for.header, label %for.end
-
-for.header:                                       ; preds = %entry
-  %zext = zext i32 %N to i64
-  br label %for.body
-
-for.body:                                         ; preds = %for.body, %for.header
-  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
-  %sum.1 = phi float [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
-  %arrayidx.1 = getelementptr inbounds float, float* %x, i64 %indvars.iv
-  %0 = load float, float* %arrayidx.1, align 4
-  %arrayidx.2 = getelementptr inbounds float, float* %y, i64 %indvars.iv
-  %1 = load float, float* %arrayidx.2, align 4
-  %cmp.2 = fcmp fast ogt float %0, %1
-  %add = fadd fast float %0, %sum.1
-  %sum.2 = select i1 %cmp.2, float %add, float %sum.1
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond = icmp eq i64 %indvars.iv.next, %zext
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body, %entry
-  %2 = phi float [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
-  ret float %2
-}
-
-; Double pattern:
-;   Check vectorization of reduction code which has an fadd instruction after
-;   an fcmp instruction which compares an array element and another array
-;   element.
-;
-; double fcmp_array_elm_fadd_select2(double * restrict x, double * restrict y,
-;                                    const int N) {
-;   double sum = 0.
-;   for (int i = 0; i < N; ++i)
-;     if (x[i] > y[i])
-;       sum += x[i];
-;   return sum;
-; }
-
-; CHECK-LABEL: @fcmp_array_elm_fadd_select2(
-; CHECK: %[[V2:.*]] = fcmp fast ogt <4 x double> %[[V0:.*]], %[[V1:.*]]
-; CHECK: %[[V4:.*]] = fadd fast <4 x double> %[[V0]], %[[V3:.*]]
-; CHECK: select <4 x i1> %[[V2]], <4 x double> %[[V4]], <4 x double> %[[V3]]
-define double @fcmp_array_elm_fadd_select2(double* noalias %x, double* noalias %y, i32 %N) nounwind readonly {
-entry:
-  %cmp.1 = icmp sgt i32 %N, 0
-  br i1 %cmp.1, label %for.header, label %for.end
-
-for.header:                                       ; preds = %entry
-  %zext = zext i32 %N to i64
-  br label %for.body
-
-for.body:                                         ; preds = %for.body, %for.header
-  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
-  %sum.1 = phi double [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
-  %arrayidx.1 = getelementptr inbounds double, double* %x, i64 %indvars.iv
-  %0 = load double, double* %arrayidx.1, align 4
-  %arrayidx.2 = getelementptr inbounds double, double* %y, i64 %indvars.iv
-  %1 = load double, double* %arrayidx.2, align 4
-  %cmp.2 = fcmp fast ogt double %0, %1
-  %add = fadd fast double %0, %sum.1
-  %sum.2 = select i1 %cmp.2, double %add, double %sum.1
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond = icmp eq i64 %indvars.iv.next, %zext
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body, %entry
-  %2 = phi double [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
-  ret double %2
-}
-
-; Float pattern:
-;   Check vectorization of reduction code which has an fsub instruction after
-;   an fcmp instruction which compares an array element and 0.
-;
-; float fcmp_0_fsub_select1(float * restrict x, const int N) {
-;   float sum = 0.
-;   for (int i = 0; i < N; ++i)
-;     if (x[i] > (float)0.)
-;       sum -= x[i];
-;   return sum;
-; }
-
-; CHECK-LABEL: @fcmp_0_fsub_select1(
-; CHECK: %[[V1:.*]] = fcmp ogt <4 x float> %[[V0:.*]], zeroinitializer
-; CHECK: %[[V3:.*]] = fsub <4 x float> %[[V2:.*]], %[[V0]]
-; CHECK: select <4 x i1> %[[V1]], <4 x float> %[[V3]], <4 x float> %[[V2]]
-define float @fcmp_0_fsub_select1(float* noalias %x, i32 %N) nounwind readonly {
-entry:
-  %cmp.1 = icmp sgt i32 %N, 0
-  br i1 %cmp.1, label %for.header, label %for.end
-
-for.header:                                       ; preds = %entry
-  %zext = zext i32 %N to i64
-  br label %for.body
-
-for.body:                                         ; preds = %for.body, %for.header
-  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
-  %sum.1 = phi float [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
-  %arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv
-  %0 = load float, float* %arrayidx, align 4
-  %cmp.2 = fcmp ogt float %0, 0.000000e+00
-  %sub = fsub float %sum.1, %0
-  %sum.2 = select i1 %cmp.2, float %sub, float %sum.1
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond = icmp eq i64 %indvars.iv.next, %zext
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body, %entry
-  %1 = phi float [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
-  ret float %1
-}
-
-; Double pattern:
-;   Check vectorization of reduction code which has an fsub instruction after
-;   an fcmp instruction which compares an array element and 0.
-;
-; double fcmp_0_fsub_select2(double * restrict x, const int N) {
-;   double sum = 0.
-;   for (int i = 0; i < N; ++i)
-;     if (x[i] > 0.)
-;       sum -= x[i];
-;   return sum;
-; }
-
-; CHECK-LABEL: @fcmp_0_fsub_select2(
-; CHECK: %[[V1:.*]] = fcmp ogt <4 x double> %[[V0:.*]], zeroinitializer
-; CHECK: %[[V3:.*]] = fsub <4 x double> %[[V2:.*]], %[[V0]]
-; CHECK: select <4 x i1> %[[V1]], <4 x double> %[[V3]], <4 x double> %[[V2]]
-define double @fcmp_0_fsub_select2(double* noalias %x, i32 %N) nounwind readonly {
-entry:
-  %cmp.1 = icmp sgt i32 %N, 0
-  br i1 %cmp.1, label %for.header, label %for.end
-
-for.header:                                       ; preds = %entry
-  %zext = zext i32 %N to i64
-  br label %for.body
-
-for.body:                                         ; preds = %for.body, %for.header
-  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
-  %sum.1 = phi double [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
-  %arrayidx = getelementptr inbounds double, double* %x, i64 %indvars.iv
-  %0 = load double, double* %arrayidx, align 4
-  %cmp.2 = fcmp ogt double %0, 0.000000e+00
-  %sub = fsub double %sum.1, %0
-  %sum.2 = select i1 %cmp.2, double %sub, double %sum.1
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond = icmp eq i64 %indvars.iv.next, %zext
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body, %entry
-  %1 = phi double [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
-  ret double %1
-}
-
-; Float pattern:
-;   Check vectorization of reduction code which has an fmul instruction after
-;   an fcmp instruction which compares an array element and 0.
-;
-; float fcmp_0_fmult_select1(float * restrict x, const int N) {
-;   float sum = 0.
-;   for (int i = 0; i < N; ++i)
-;     if (x[i] > (float)0.)
-;       sum *= x[i];
-;   return sum;
-; }
-
-; CHECK-LABEL: @fcmp_0_fmult_select1(
-; CHECK: %[[V1:.*]] = fcmp ogt <4 x float> %[[V0:.*]], zeroinitializer
-; CHECK: %[[V3:.*]] = fmul <4 x float> %[[V2:.*]], %[[V0]]
-; CHECK: select <4 x i1> %[[V1]], <4 x float> %[[V3]], <4 x float> %[[V2]]
-define float @fcmp_0_fmult_select1(float* noalias %x, i32 %N) nounwind readonly {
-entry:
-  %cmp.1 = icmp sgt i32 %N, 0
-  br i1 %cmp.1, label %for.header, label %for.end
-
-for.header:                                       ; preds = %entry
-  %zext = zext i32 %N to i64
-  br label %for.body
-
-for.body:                                         ; preds = %for.body, %for.header
-  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
-  %sum.1 = phi float [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
-  %arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv
-  %0 = load float, float* %arrayidx, align 4
-  %cmp.2 = fcmp ogt float %0, 0.000000e+00
-  %mult = fmul float %sum.1, %0
-  %sum.2 = select i1 %cmp.2, float %mult, float %sum.1
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond = icmp eq i64 %indvars.iv.next, %zext
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body, %entry
-  %1 = phi float [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
-  ret float %1
-}
-
-; Double pattern:
-;   Check vectorization of reduction code which has an fmul instruction after
-;   an fcmp instruction which compares an array element and 0.
-;
-; double fcmp_0_fmult_select2(double * restrict x, const int N) {
-;   double sum = 0.
-;   for (int i = 0; i < N; ++i)
-;     if (x[i] > 0.)
-;       sum *= x[i];
-;   return sum;
-; }
-
-; CHECK-LABEL: @fcmp_0_fmult_select2(
-; CHECK: %[[V1:.*]] = fcmp ogt <4 x double> %[[V0:.*]], zeroinitializer
-; CHECK: %[[V3:.*]] = fmul <4 x double> %[[V2:.*]], %[[V0]]
-; CHECK: select <4 x i1> %[[V1]], <4 x double> %[[V3]], <4 x double> %[[V2]]
-define double @fcmp_0_fmult_select2(double* noalias %x, i32 %N) nounwind readonly {
-entry:
-  %cmp.1 = icmp sgt i32 %N, 0
-  br i1 %cmp.1, label %for.header, label %for.end
-
-for.header:                                       ; preds = %entry
-  %zext = zext i32 %N to i64
-  br label %for.body
-
-for.body:                                         ; preds = %for.body, %for.header
-  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
-  %sum.1 = phi double [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
-  %arrayidx = getelementptr inbounds double, double* %x, i64 %indvars.iv
-  %0 = load double, double* %arrayidx, align 4
-  %cmp.2 = fcmp ogt double %0, 0.000000e+00
-  %mult = fmul double %sum.1, %0
-  %sum.2 = select i1 %cmp.2, double %mult, double %sum.1
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond = icmp eq i64 %indvars.iv.next, %zext
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body, %entry
-  %1 = phi double [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
-  ret double %1
-}
-
-; Float multi pattern
-;   Check vectorisation of reduction code with a pair of selects to different
-;   fadd patterns.
-;
-; float fcmp_multi(float *a, int n) {
-;   float sum=0.0;
-;   for (int i=0;i<n;i++) {
-;     if (a[i]>1.0)
-;       sum+=a[i];
-;     else if (a[i]<3.0)
-;       sum+=2*a[i];
-;     else
-;       sum+=3*a[i];
-;   }
-;   return sum;
-; }
-
-; CHECK-LABEL: @fcmp_multi(
-; CHECK: %[[C1:.*]] = fcmp ogt <4 x float> %[[V0:.*]], <float 1.000000e+00,
-; CHECK: %[[C2:.*]] = fcmp olt <4 x float> %[[V0]], <float 3.000000e+00,
-; CHECK-DAG: %[[M1:.*]] = fmul fast <4 x float> %[[V0]], <float 3.000000e+00,
-; CHECK-DAG: %[[M2:.*]] = fmul fast <4 x float> %[[V0]], <float 2.000000e+00,
-; CHECK: %[[C11:.*]] = xor <4 x i1> %[[C1]], <i1 true,
-; CHECK-DAG: %[[C12:.*]] = and <4 x i1> %[[C2]], %[[C11]]
-; CHECK-DAG: %[[C21:.*]] = xor <4 x i1> %[[C2]], <i1 true,
-; CHECK: %[[C22:.*]] = and <4 x i1> %[[C21]], %[[C11]]
-; CHECK: %[[S1:.*]] = select <4 x i1> %[[C22]], <4 x float> %[[M1]], <4 x float> %[[M2]]
-; CHECK: %[[S2:.*]] = select <4 x i1> %[[C1]], <4 x float> %[[V0]], <4 x float> %[[S1]]
-; CHECK: fadd fast <4 x float> %[[S2]],
-define float @fcmp_multi(float* nocapture readonly %a, i32 %n) nounwind readonly {
-entry:
-  %cmp10 = icmp sgt i32 %n, 0
-  br i1 %cmp10, label %for.body.preheader, label %for.end
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %n to i64
-  br label %for.body
-
-for.body:                                         ; preds = %for.inc, %for.body.preheader
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
-  %sum.011 = phi float [ 0.000000e+00, %for.body.preheader ], [ %sum.1, %for.inc ]
-  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
-  %0 = load float, float* %arrayidx, align 4
-  %cmp1 = fcmp ogt float %0, 1.000000e+00
-  br i1 %cmp1, label %for.inc, label %if.else
-
-if.else:                                          ; preds = %for.body
-  %cmp8 = fcmp olt float %0, 3.000000e+00
-  br i1 %cmp8, label %if.then10, label %if.else14
-
-if.then10:                                        ; preds = %if.else
-  %mul = fmul fast float %0, 2.000000e+00
-  br label %for.inc
-
-if.else14:                                        ; preds = %if.else
-  %mul17 = fmul fast float %0, 3.000000e+00
-  br label %for.inc
-
-for.inc:                                          ; preds = %for.body, %if.else14, %if.then10
-  %.pn = phi float [ %mul, %if.then10 ], [ %mul17, %if.else14 ], [ %0, %for.body ]
-  %sum.1 = fadd fast float %.pn, %sum.011
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.inc, %entry
-  %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %sum.1, %for.inc ]
-  ret float %sum.0.lcssa
-}
-
-; Float fadd + fsub patterns
-;   Check vectorisation of reduction code with a pair of selects to different
-;   instructions { fadd, fsub } but equivalent (change in constant).
-;
-; float fcmp_multi(float *a, int n) {
-;   float sum=0.0;
-;   for (int i=0;i<n;i++) {
-;     if (a[i]>1.0)
-;       sum+=a[i];
-;     else if (a[i]<3.0)
-;       sum-=a[i];
-;   }
-;   return sum;
-; }
-
-; CHECK-LABEL: @fcmp_fadd_fsub(
-; CHECK: %[[C1:.*]] = fcmp ogt <4 x float> %[[V0:.*]], <float 1.000000e+00,
-; CHECK: %[[C2:.*]] = fcmp olt <4 x float> %[[V0]], <float 3.000000e+00,
-; CHECK-DAG: %[[SUB:.*]] = fsub fast <4 x float>
-; CHECK-DAG: %[[ADD:.*]] = fadd fast <4 x float>
-; CHECK: %[[C11:.*]] = xor <4 x i1> %[[C1]], <i1 true,
-; CHECK-DAG: %[[C12:.*]] = and <4 x i1> %[[C2]], %[[C11]]
-; CHECK-DAG: %[[C21:.*]] = xor <4 x i1> %[[C2]], <i1 true,
-; CHECK: %[[C22:.*]] = and <4 x i1> %[[C21]], %[[C11]]
-; CHECK: %[[S1:.*]] = select <4 x i1> %[[C12]], <4 x float> %[[SUB]], <4 x float> %[[ADD]]
-; CHECK: %[[S2:.*]] = select <4 x i1> %[[C22]], {{.*}} <4 x float> %[[S1]]
-define float @fcmp_fadd_fsub(float* nocapture readonly %a, i32 %n) nounwind readonly {
-entry:
-  %cmp9 = icmp sgt i32 %n, 0
-  br i1 %cmp9, label %for.body.preheader, label %for.end
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %n to i64
-  br label %for.body
-
-for.body:                                         ; preds = %for.inc, %for.body.preheader
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
-  %sum.010 = phi float [ 0.000000e+00, %for.body.preheader ], [ %sum.1, %for.inc ]
-  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
-  %0 = load float, float* %arrayidx, align 4
-  %cmp1 = fcmp ogt float %0, 1.000000e+00
-  br i1 %cmp1, label %if.then, label %if.else
-
-if.then:                                          ; preds = %for.body
-  %add = fadd fast float %0, %sum.010
-  br label %for.inc
-
-if.else:                                          ; preds = %for.body
-  %cmp8 = fcmp olt float %0, 3.000000e+00
-  br i1 %cmp8, label %if.then10, label %for.inc
-
-if.then10:                                        ; preds = %if.else
-  %sub = fsub fast float %sum.010, %0
-  br label %for.inc
-
-for.inc:                                          ; preds = %if.then, %if.then10, %if.else
-  %sum.1 = phi float [ %add, %if.then ], [ %sub, %if.then10 ], [ %sum.010, %if.else ]
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.inc, %entry
-  %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %sum.1, %for.inc ]
-  ret float %sum.0.lcssa
-}
-
-; Float fadd + fmul patterns
-;   Check lack of vectorisation of reduction code with a pair of non-compatible
-;   instructions { fadd, fmul }.
-;
-; float fcmp_multi(float *a, int n) {
-;   float sum=0.0;
-;   for (int i=0;i<n;i++) {
-;     if (a[i]>1.0)
-;       sum+=a[i];
-;     else if (a[i]<3.0)
-;       sum*=a[i];
-;   }
-;   return sum;
-; }
-
-; CHECK-LABEL: @fcmp_fadd_fmul(
-; CHECK-NOT: <4 x float>
-define float @fcmp_fadd_fmul(float* nocapture readonly %a, i32 %n) nounwind readonly {
-entry:
-  %cmp9 = icmp sgt i32 %n, 0
-  br i1 %cmp9, label %for.body.preheader, label %for.end
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %n to i64
-  br label %for.body
-
-for.body:                                         ; preds = %for.inc, %for.body.preheader
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
-  %sum.010 = phi float [ 0.000000e+00, %for.body.preheader ], [ %sum.1, %for.inc ]
-  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
-  %0 = load float, float* %arrayidx, align 4
-  %cmp1 = fcmp ogt float %0, 1.000000e+00
-  br i1 %cmp1, label %if.then, label %if.else
-
-if.then:                                          ; preds = %for.body
-  %add = fadd fast float %0, %sum.010
-  br label %for.inc
-
-if.else:                                          ; preds = %for.body
-  %cmp8 = fcmp olt float %0, 3.000000e+00
-  br i1 %cmp8, label %if.then10, label %for.inc
-
-if.then10:                                        ; preds = %if.else
-  %mul = fmul fast float %0, %sum.010
-  br label %for.inc
-
-for.inc:                                          ; preds = %if.then, %if.then10, %if.else
-  %sum.1 = phi float [ %add, %if.then ], [ %mul, %if.then10 ], [ %sum.010, %if.else ]
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.inc, %entry
-  %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %sum.1, %for.inc ]
-  ret float %sum.0.lcssa
-}
-
-; Float fadd + store patterns
-;   Check lack of vectorisation of reduction code with a store back, given it
-;   has loop dependency on a[i].
-;
-; float fcmp_store_back(float a[], int LEN) {
-;     float sum = 0.0;
-;     for (int i = 0; i < LEN; i++) {
-;       sum += a[i];
-;       a[i] = sum;
-;     }
-;     return sum;
-; }
-
-; CHECK-LABEL: @fcmp_store_back(
-; CHECK-NOT: <4 x float>
-define float @fcmp_store_back(float* nocapture %a, i32 %LEN) nounwind readonly {
-entry:
-  %cmp7 = icmp sgt i32 %LEN, 0
-  br i1 %cmp7, label %for.body.preheader, label %for.end
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %LEN to i64
-  br label %for.body
-
-for.body:                                         ; preds = %for.body, %for.body.preheader
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-  %sum.08 = phi float [ 0.000000e+00, %for.body.preheader ], [ %add, %for.body ]
-  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
-  %0 = load float, float* %arrayidx, align 4
-  %add = fadd fast float %0, %sum.08
-  store float %add, float* %arrayidx, align 4
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body, %entry
-  %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
-  ret float %sum.0.lcssa
-}
-- 
GitLab


From 98cb136da40c6cd516d7e4ec1dcfda752f245346 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Sat, 27 Oct 2018 22:56:04 +0000
Subject: [PATCH 0679/1116] [utils] Fix _run_benchmark in
 collect_and_build_with_pgo.py

Summary: Also fix a FIXME in _build_stage1_clang: clang llvm-profdata profile are sufficient

Reviewers: george.burgess.iv

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D53795

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345466 91177308-0d34-0410-b5e6-96231b3b80d8
---
 utils/collect_and_build_with_pgo.py | 23 ++++++++---------------
 1 file changed, 8 insertions(+), 15 deletions(-)

diff --git a/utils/collect_and_build_with_pgo.py b/utils/collect_and_build_with_pgo.py
index 144eed3cc22..8cbeb2b6c51 100755
--- a/utils/collect_and_build_with_pgo.py
+++ b/utils/collect_and_build_with_pgo.py
@@ -35,14 +35,6 @@ def _run_benchmark(env, out_dir, include_debug_info):
     """The 'benchmark' we run to generate profile data."""
     target_dir = env.output_subdir('instrumentation_run')
 
-    # `check-llvm` and `check-clang` are cheap ways to increase coverage. The
-    # former lets us touch on the non-x86 backends a bit if configured, and the
-    # latter gives us more C to chew on (and will send us through diagnostic
-    # paths a fair amount, though the `if (stuff_is_broken) { diag() ... }`
-    # branches should still heavily be weighted in the not-taken direction,
-    # since we built all of LLVM/etc).
-    _build_things_in(env, out_dir, what=['check-llvm', 'check-clang'])
-
     # Building tblgen gets us coverage; don't skip it. (out_dir may also not
     # have them anyway, but that's less of an issue)
     cmake = _get_cmake_invocation_for_bootstrap_from(
@@ -56,6 +48,13 @@ def _run_benchmark(env, out_dir, include_debug_info):
     # Just build all the things. The more data we have, the better.
     _build_things_in(env, target_dir, what=['all'])
 
+    # `check-llvm` and `check-clang` are cheap ways to increase coverage. The
+    # former lets us touch on the non-x86 backends a bit if configured, and the
+    # latter gives us more C to chew on (and will send us through diagnostic
+    # paths a fair amount, though the `if (stuff_is_broken) { diag() ... }`
+    # branches should still heavily be weighted in the not-taken direction,
+    # since we built all of LLVM/etc).
+    _build_things_in(env, target_dir, what=['check-llvm', 'check-clang'])
 
 ### Script
 
@@ -252,13 +251,7 @@ def _build_stage1_clang(env):
     target_dir = env.output_subdir('stage1')
     cmake = _get_default_cmake_invocation(env)
     _run_fresh_cmake(env, cmake, target_dir)
-
-    # FIXME: The full build here is somewhat unfortunate. It's primarily
-    # because I don't know what to call libclang_rt.profile for arches that
-    # aren't x86_64 (and even then, it's in a subdir that contains clang's
-    # current version). It would be nice to figure out what target I can
-    # request to magically have libclang_rt.profile built for ${host}
-    _build_things_in(env, target_dir, what=['all'])
+    _build_things_in(env, target_dir, what=['clang', 'llvm-profdata', 'profile'])
     return target_dir
 
 
-- 
GitLab


From ff6f0b2f418aee9f1718ee8f3fcabee8f5dbc74d Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Sat, 27 Oct 2018 23:10:09 +0000
Subject: [PATCH 0680/1116] [utils] collect_and_build_with_pgo.py: revert part
 already fixed in rL345461

The change was inadvertently included in my last commit.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345467 91177308-0d34-0410-b5e6-96231b3b80d8
---
 utils/collect_and_build_with_pgo.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/utils/collect_and_build_with_pgo.py b/utils/collect_and_build_with_pgo.py
index 8cbeb2b6c51..5a8686a88b4 100755
--- a/utils/collect_and_build_with_pgo.py
+++ b/utils/collect_and_build_with_pgo.py
@@ -35,6 +35,14 @@ def _run_benchmark(env, out_dir, include_debug_info):
     """The 'benchmark' we run to generate profile data."""
     target_dir = env.output_subdir('instrumentation_run')
 
+    # `check-llvm` and `check-clang` are cheap ways to increase coverage. The
+    # former lets us touch on the non-x86 backends a bit if configured, and the
+    # latter gives us more C to chew on (and will send us through diagnostic
+    # paths a fair amount, though the `if (stuff_is_broken) { diag() ... }`
+    # branches should still heavily be weighted in the not-taken direction,
+    # since we built all of LLVM/etc).
+    _build_things_in(env, out_dir, what=['check-llvm', 'check-clang'])
+
     # Building tblgen gets us coverage; don't skip it. (out_dir may also not
     # have them anyway, but that's less of an issue)
     cmake = _get_cmake_invocation_for_bootstrap_from(
@@ -48,14 +56,6 @@ def _run_benchmark(env, out_dir, include_debug_info):
     # Just build all the things. The more data we have, the better.
     _build_things_in(env, target_dir, what=['all'])
 
-    # `check-llvm` and `check-clang` are cheap ways to increase coverage. The
-    # former lets us touch on the non-x86 backends a bit if configured, and the
-    # latter gives us more C to chew on (and will send us through diagnostic
-    # paths a fair amount, though the `if (stuff_is_broken) { diag() ... }`
-    # branches should still heavily be weighted in the not-taken direction,
-    # since we built all of LLVM/etc).
-    _build_things_in(env, target_dir, what=['check-llvm', 'check-clang'])
-
 ### Script
 
 
-- 
GitLab


From 03f7634b5d1735520ddecabab29fc0b411317cbc Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Sun, 28 Oct 2018 01:32:47 +0000
Subject: [PATCH 0681/1116] [X86] Add test cases showing missed opportunities
 for optimizing vector fcopysign when the RHS is a splat constant.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345468 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/sse1-fcopysign.ll | 84 ++++++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)
 create mode 100644 test/CodeGen/X86/sse1-fcopysign.ll

diff --git a/test/CodeGen/X86/sse1-fcopysign.ll b/test/CodeGen/X86/sse1-fcopysign.ll
new file mode 100644
index 00000000000..bd9e4708a42
--- /dev/null
+++ b/test/CodeGen/X86/sse1-fcopysign.ll
@@ -0,0 +1,84 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=-sse2,+sse | FileCheck %s --check-prefix=ALL --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=-sse2,+sse | FileCheck %s --check-prefix=ALL --check-prefix=X64
+
+define float @f32_pos(float %a, float %b) nounwind {
+; X86-LABEL: f32_pos:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    andps {{\.LCPI.*}}, %xmm0
+; X86-NEXT:    movss %xmm0, (%esp)
+; X86-NEXT:    flds (%esp)
+; X86-NEXT:    popl %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: f32_pos:
+; X64:       # %bb.0:
+; X64-NEXT:    andps {{.*}}(%rip), %xmm0
+; X64-NEXT:    retq
+  %tmp = tail call float @llvm.copysign.f32(float %a, float 1.0)
+  ret float %tmp
+}
+
+define float @f32_neg(float %a, float %b) nounwind {
+; X86-LABEL: f32_neg:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    orps {{\.LCPI.*}}, %xmm0
+; X86-NEXT:    movss %xmm0, (%esp)
+; X86-NEXT:    flds (%esp)
+; X86-NEXT:    popl %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: f32_neg:
+; X64:       # %bb.0:
+; X64-NEXT:    orps {{.*}}(%rip), %xmm0
+; X64-NEXT:    retq
+  %tmp = tail call float @llvm.copysign.f32(float %a, float -1.0)
+  ret float %tmp
+}
+
+define <4 x float> @v4f32_pos(<4 x float> %a, <4 x float> %b) nounwind {
+; X86-LABEL: v4f32_pos:
+; X86:       # %bb.0:
+; X86-NEXT:    movaps {{.*#+}} xmm1 = [1,1,1,1]
+; X86-NEXT:    andps {{\.LCPI.*}}, %xmm1
+; X86-NEXT:    andps {{\.LCPI.*}}, %xmm0
+; X86-NEXT:    orps %xmm1, %xmm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: v4f32_pos:
+; X64:       # %bb.0:
+; X64-NEXT:    movaps {{.*#+}} xmm1 = [1,1,1,1]
+; X64-NEXT:    andps {{.*}}(%rip), %xmm1
+; X64-NEXT:    andps {{.*}}(%rip), %xmm0
+; X64-NEXT:    orps %xmm1, %xmm0
+; X64-NEXT:    retq
+  %tmp = tail call <4 x float> @llvm.copysign.v4f32(<4 x float> %a, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
+  ret <4 x float> %tmp
+}
+
+define <4 x float> @v4f32_neg(<4 x float> %a, <4 x float> %b) nounwind {
+; X86-LABEL: v4f32_neg:
+; X86:       # %bb.0:
+; X86-NEXT:    movaps {{.*#+}} xmm1 = [-1,-1,-1,-1]
+; X86-NEXT:    andps {{\.LCPI.*}}, %xmm1
+; X86-NEXT:    andps {{\.LCPI.*}}, %xmm0
+; X86-NEXT:    orps %xmm1, %xmm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: v4f32_neg:
+; X64:       # %bb.0:
+; X64-NEXT:    movaps {{.*#+}} xmm1 = [-1,-1,-1,-1]
+; X64-NEXT:    andps {{.*}}(%rip), %xmm1
+; X64-NEXT:    andps {{.*}}(%rip), %xmm0
+; X64-NEXT:    orps %xmm1, %xmm0
+; X64-NEXT:    retq
+  %tmp = tail call <4 x float> @llvm.copysign.v4f32(<4 x float> %a, <4 x float> <float -1.0, float -1.0, float -1.0, float -1.0>)
+  ret <4 x float> %tmp
+}
+
+declare float @llvm.copysign.f32(float, float)
+declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>)
-- 
GitLab


From 346342d0699f9daf410ff92320dfcd349d0570c6 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Sun, 28 Oct 2018 01:32:49 +0000
Subject: [PATCH 0682/1116] [DAGCombiner] Better constant vector support for
 FCOPYSIGN.

Enable constant folding when both operands are vectors of constants.

Turn into FNEG/FABS when the RHS is a splat constant vector.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345469 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  8 ++++----
 test/CodeGen/X86/sse1-fcopysign.ll       | 16 ++--------------
 2 files changed, 6 insertions(+), 18 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 318e398211c..906223a624c 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11590,15 +11590,15 @@ static inline bool CanCombineFCOPYSIGN_EXTEND_ROUND(SDNode *N) {
 SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
-  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
-  ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+  bool N0CFP = isConstantFPBuildVectorOrConstantFP(N0);
+  bool N1CFP = isConstantFPBuildVectorOrConstantFP(N1);
   EVT VT = N->getValueType(0);
 
   if (N0CFP && N1CFP) // Constant fold
     return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1);
 
-  if (N1CFP) {
-    const APFloat &V = N1CFP->getValueAPF();
+  if (ConstantFPSDNode *N1C = isConstOrConstSplatFP(N->getOperand(1))) {
+    const APFloat &V = N1C->getValueAPF();
     // copysign(x, c1) -> fabs(x)       iff ispos(c1)
     // copysign(x, c1) -> fneg(fabs(x)) iff isneg(c1)
     if (!V.isNegative()) {
diff --git a/test/CodeGen/X86/sse1-fcopysign.ll b/test/CodeGen/X86/sse1-fcopysign.ll
index bd9e4708a42..5132573ef53 100644
--- a/test/CodeGen/X86/sse1-fcopysign.ll
+++ b/test/CodeGen/X86/sse1-fcopysign.ll
@@ -43,18 +43,12 @@ define float @f32_neg(float %a, float %b) nounwind {
 define <4 x float> @v4f32_pos(<4 x float> %a, <4 x float> %b) nounwind {
 ; X86-LABEL: v4f32_pos:
 ; X86:       # %bb.0:
-; X86-NEXT:    movaps {{.*#+}} xmm1 = [1,1,1,1]
-; X86-NEXT:    andps {{\.LCPI.*}}, %xmm1
 ; X86-NEXT:    andps {{\.LCPI.*}}, %xmm0
-; X86-NEXT:    orps %xmm1, %xmm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: v4f32_pos:
 ; X64:       # %bb.0:
-; X64-NEXT:    movaps {{.*#+}} xmm1 = [1,1,1,1]
-; X64-NEXT:    andps {{.*}}(%rip), %xmm1
 ; X64-NEXT:    andps {{.*}}(%rip), %xmm0
-; X64-NEXT:    orps %xmm1, %xmm0
 ; X64-NEXT:    retq
   %tmp = tail call <4 x float> @llvm.copysign.v4f32(<4 x float> %a, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
   ret <4 x float> %tmp
@@ -63,18 +57,12 @@ define <4 x float> @v4f32_pos(<4 x float> %a, <4 x float> %b) nounwind {
 define <4 x float> @v4f32_neg(<4 x float> %a, <4 x float> %b) nounwind {
 ; X86-LABEL: v4f32_neg:
 ; X86:       # %bb.0:
-; X86-NEXT:    movaps {{.*#+}} xmm1 = [-1,-1,-1,-1]
-; X86-NEXT:    andps {{\.LCPI.*}}, %xmm1
-; X86-NEXT:    andps {{\.LCPI.*}}, %xmm0
-; X86-NEXT:    orps %xmm1, %xmm0
+; X86-NEXT:    orps {{\.LCPI.*}}, %xmm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: v4f32_neg:
 ; X64:       # %bb.0:
-; X64-NEXT:    movaps {{.*#+}} xmm1 = [-1,-1,-1,-1]
-; X64-NEXT:    andps {{.*}}(%rip), %xmm1
-; X64-NEXT:    andps {{.*}}(%rip), %xmm0
-; X64-NEXT:    orps %xmm1, %xmm0
+; X64-NEXT:    orps {{.*}}(%rip), %xmm0
 ; X64-NEXT:    retq
   %tmp = tail call <4 x float> @llvm.copysign.v4f32(<4 x float> %a, <4 x float> <float -1.0, float -1.0, float -1.0, float -1.0>)
   ret <4 x float> %tmp
-- 
GitLab


From 5c0be92e19388bbd56543377110a443a8e0d51d3 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sun, 28 Oct 2018 13:07:25 +0000
Subject: [PATCH 0683/1116] [VectorLegalizer] Enable
 TargetLowering::expandFP_TO_UINT support.

Add vector support to TargetLowering::expandFP_TO_UINT.

This exposes an issue in X86TargetLowering::LowerVSELECT which was assuming that the select mask was the same width as the LHS/RHS ops - as long as the result is a sign splat we can easily sext/trunk this.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345473 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../SelectionDAG/LegalizeVectorOps.cpp        |  19 +-
 lib/CodeGen/SelectionDAG/TargetLowering.cpp   |   5 +
 lib/Target/X86/X86ISelLowering.cpp            |  28 +-
 test/CodeGen/X86/ftrunc.ll                    |  30 +-
 test/CodeGen/X86/vec_cast3.ll                 |  26 +-
 test/CodeGen/X86/vec_fp_to_int.ll             | 259 +++++++++---------
 6 files changed, 185 insertions(+), 182 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 6554d5a27b2..122a9856ade 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -86,9 +86,10 @@ class VectorLegalizer {
   /// operations to legalize them.
   SDValue Expand(SDValue Op);
 
-  /// Implements expansion for FNEG; falls back to UnrollVectorOp if
-  /// FSUB isn't legal.
-  ///
+  /// Implements expansion for FP_TO_UINT; falls back to UnrollVectorOp if
+  /// FP_TO_SINT isn't legal.
+  SDValue ExpandFP_TO_UINT(SDValue Op);
+
   /// Implements expansion for UINT_TO_FLOAT; falls back to UnrollVectorOp if
   /// SINT_TO_FLOAT and SHR on vectors isn't legal.
   SDValue ExpandUINT_TO_FLOAT(SDValue Op);
@@ -709,6 +710,8 @@ SDValue VectorLegalizer::Expand(SDValue Op) {
     return ExpandVSELECT(Op);
   case ISD::SELECT:
     return ExpandSELECT(Op);
+  case ISD::FP_TO_UINT:
+    return ExpandFP_TO_UINT(Op);
   case ISD::UINT_TO_FP:
     return ExpandUINT_TO_FLOAT(Op);
   case ISD::FNEG:
@@ -1018,6 +1021,16 @@ SDValue VectorLegalizer::ExpandVSELECT(SDValue Op) {
   return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Val);
 }
 
+SDValue VectorLegalizer::ExpandFP_TO_UINT(SDValue Op) {
+  // Attempt to expand using TargetLowering.
+  SDValue Result;
+  if (TLI.expandFP_TO_UINT(Op.getNode(), Result, DAG))
+    return Result;
+
+  // Otherwise go ahead and unroll.
+  return DAG.UnrollVectorOp(Op.getNode());
+}
+
 SDValue VectorLegalizer::ExpandUINT_TO_FLOAT(SDValue Op) {
   EVT VT = Op.getOperand(0).getValueType();
   SDLoc DL(Op);
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index d6e7590b8fc..cf6910f4d76 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -4147,6 +4147,11 @@ bool TargetLowering::expandFP_TO_UINT(SDNode *Node, SDValue &Result,
   EVT SetCCVT =
       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT);
 
+  // Only expand vector types if we have the appropriate vector bit operations.
+  if (DstVT.isVector() && (!isOperationLegalOrCustom(ISD::FP_TO_SINT, DstVT) ||
+                           !isOperationLegalOrCustomOrPromote(ISD::XOR, SrcVT)))
+    return false;
+
   // Expand based on maximum range of FP_TO_SINT:
   // True = fp_to_sint(Src)
   // False = 0x8000000000000000 + fp_to_sint(Src - 0x8000000000000000)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index c2ca88911d2..060b36c868e 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -15698,7 +15698,9 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
 
   // If this VSELECT has a vector if i1 as a mask, it will be directly matched
   // with patterns on the mask registers on AVX-512.
-  if (Cond.getScalarValueSizeInBits() == 1)
+  MVT CondVT = Cond.getSimpleValueType();
+  unsigned CondEltSize = Cond.getScalarValueSizeInBits();
+  if (CondEltSize == 1)
     return Op;
 
   // Variable blends are only legal from SSE4.1 onward.
@@ -15707,24 +15709,34 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
 
   SDLoc dl(Op);
   MVT VT = Op.getSimpleValueType();
+  unsigned EltSize = VT.getScalarSizeInBits();
+  unsigned NumElts = VT.getVectorNumElements();
 
   // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
   // into an i1 condition so that we can use the mask-based 512-bit blend
   // instructions.
   if (VT.getSizeInBits() == 512) {
-    // The vNi1 condition case should be handled above as it can be trivially
-    // lowered.
-    assert(Cond.getScalarValueSizeInBits() == VT.getScalarSizeInBits() &&
-           "Should have a size-matched integer condition!");
     // Build a mask by testing the condition against zero.
-    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+    MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
     SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
-                                getZeroVector(VT, Subtarget, DAG, dl),
+                                getZeroVector(CondVT, Subtarget, DAG, dl),
                                 ISD::SETNE);
     // Now return a new VSELECT using the mask.
     return DAG.getSelect(dl, VT, Mask, LHS, RHS);
   }
 
+  // SEXT/TRUNC cases where the mask doesn't match the destination size.
+  if (CondEltSize != EltSize) {
+    // If we don't have a sign splat, rely on the expansion.
+    if (CondEltSize != DAG.ComputeNumSignBits(Cond))
+      return SDValue();
+
+    MVT NewCondSVT = MVT::getIntegerVT(EltSize);
+    MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
+    Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
+    return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
+  }
+
   // Only some types will be legal on some subtargets. If we can emit a legal
   // VSELECT-matching blend, return Op, and but if we need to expand, return
   // a null value.
@@ -15743,7 +15755,7 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
   case MVT::v8i16:
   case MVT::v16i16: {
     // Bitcast everything to the vXi8 type and use a vXi8 vselect.
-    MVT CastVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
+    MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
     Cond = DAG.getBitcast(CastVT, Cond);
     LHS = DAG.getBitcast(CastVT, LHS);
     RHS = DAG.getBitcast(CastVT, RHS);
diff --git a/test/CodeGen/X86/ftrunc.ll b/test/CodeGen/X86/ftrunc.ll
index ff40f619853..26cea1d71f3 100644
--- a/test/CodeGen/X86/ftrunc.ll
+++ b/test/CodeGen/X86/ftrunc.ll
@@ -63,25 +63,19 @@ define double @trunc_unsigned_f64(double %x) #0 {
 define <4 x float> @trunc_unsigned_v4f32(<4 x float> %x) #0 {
 ; SSE2-LABEL: trunc_unsigned_v4f32:
 ; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
 ; SSE2-NEXT:    movaps %xmm0, %xmm1
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
-; SSE2-NEXT:    cvttss2si %xmm1, %rax
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    movaps %xmm0, %xmm2
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; SSE2-NEXT:    cvttss2si %xmm2, %rax
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT:    cvttss2si %xmm0, %rax
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE2-NEXT:    cvttss2si %xmm0, %rax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535]
-; SSE2-NEXT:    pand %xmm1, %xmm0
-; SSE2-NEXT:    por {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    cmpltps %xmm2, %xmm1
+; SSE2-NEXT:    cvttps2dq %xmm0, %xmm3
+; SSE2-NEXT:    subps %xmm2, %xmm0
+; SSE2-NEXT:    cvttps2dq %xmm0, %xmm0
+; SSE2-NEXT:    xorps {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    andps %xmm1, %xmm3
+; SSE2-NEXT:    andnps %xmm0, %xmm1
+; SSE2-NEXT:    orps %xmm3, %xmm1
+; SSE2-NEXT:    movaps {{.*#+}} xmm0 = [65535,65535,65535,65535]
+; SSE2-NEXT:    andps %xmm1, %xmm0
+; SSE2-NEXT:    orps {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    psrld $16, %xmm1
 ; SSE2-NEXT:    por {{.*}}(%rip), %xmm1
 ; SSE2-NEXT:    addps {{.*}}(%rip), %xmm1
diff --git a/test/CodeGen/X86/vec_cast3.ll b/test/CodeGen/X86/vec_cast3.ll
index 9af324b76bc..b805b336106 100644
--- a/test/CodeGen/X86/vec_cast3.ll
+++ b/test/CodeGen/X86/vec_cast3.ll
@@ -317,25 +317,13 @@ define <2 x i32> @cvt_v2f32_v2u32(<2 x float> %src) {
 ;
 ; CHECK-WIDE-LABEL: cvt_v2f32_v2u32:
 ; CHECK-WIDE:       ## %bb.0:
-; CHECK-WIDE-NEXT:    subl $68, %esp
-; CHECK-WIDE-NEXT:    .cfi_def_cfa_offset 72
-; CHECK-WIDE-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
-; CHECK-WIDE-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
-; CHECK-WIDE-NEXT:    vextractps $2, %xmm0, {{[0-9]+}}(%esp)
-; CHECK-WIDE-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp)
-; CHECK-WIDE-NEXT:    flds {{[0-9]+}}(%esp)
-; CHECK-WIDE-NEXT:    fisttpll {{[0-9]+}}(%esp)
-; CHECK-WIDE-NEXT:    flds {{[0-9]+}}(%esp)
-; CHECK-WIDE-NEXT:    fisttpll {{[0-9]+}}(%esp)
-; CHECK-WIDE-NEXT:    flds {{[0-9]+}}(%esp)
-; CHECK-WIDE-NEXT:    fisttpll {{[0-9]+}}(%esp)
-; CHECK-WIDE-NEXT:    flds {{[0-9]+}}(%esp)
-; CHECK-WIDE-NEXT:    fisttpll (%esp)
-; CHECK-WIDE-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-WIDE-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    vpinsrd $3, (%esp), %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    addl $68, %esp
+; CHECK-WIDE-NEXT:    vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
+; CHECK-WIDE-NEXT:    vcmpltps %xmm1, %xmm0, %xmm2
+; CHECK-WIDE-NEXT:    vsubps %xmm1, %xmm0, %xmm1
+; CHECK-WIDE-NEXT:    vcvttps2dq %xmm1, %xmm1
+; CHECK-WIDE-NEXT:    vxorps LCPI11_1, %xmm1, %xmm1
+; CHECK-WIDE-NEXT:    vcvttps2dq %xmm0, %xmm0
+; CHECK-WIDE-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; CHECK-WIDE-NEXT:    retl
   %res = fptoui <2 x float> %src to <2 x i32>
   ret <2 x i32> %res
diff --git a/test/CodeGen/X86/vec_fp_to_int.ll b/test/CodeGen/X86/vec_fp_to_int.ll
index e09cd0a43d7..84a4385f2c9 100644
--- a/test/CodeGen/X86/vec_fp_to_int.ll
+++ b/test/CodeGen/X86/vec_fp_to_int.ll
@@ -627,16 +627,36 @@ define <4 x i32> @fptoui_4f64_to_2i32(<2 x double> %a) {
 ; SSE-NEXT:    movaps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; VEX-LABEL: fptoui_4f64_to_2i32:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vcvttsd2si %xmm0, %rax
-; VEX-NEXT:    vmovd %eax, %xmm1
-; VEX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; VEX-NEXT:    vcvttsd2si %xmm0, %rax
-; VEX-NEXT:    vmovd %eax, %xmm0
-; VEX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; VEX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; VEX-NEXT:    retq
+; AVX1-LABEL: fptoui_4f64_to_2i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovapd %xmm0, %xmm0
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm1 = [2147483648,2147483648,2147483648,2147483648]
+; AVX1-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vsubpd %ymm1, %ymm0, %ymm1
+; AVX1-NEXT:    vcvttpd2dq %ymm1, %xmm1
+; AVX1-NEXT:    vxorpd {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vcvttpd2dq %ymm0, %xmm0
+; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: fptoui_4f64_to_2i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovapd %xmm0, %xmm0
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX2-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vsubpd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vcvttpd2dq %ymm1, %xmm1
+; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT:    vxorpd %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vcvttpd2dq %ymm0, %xmm0
+; AVX2-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: fptoui_4f64_to_2i32:
 ; AVX512F:       # %bb.0:
@@ -930,21 +950,34 @@ define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) {
 ; SSE-NEXT:    movaps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; VEX-LABEL: fptoui_4f64_to_4i32:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; VEX-NEXT:    vcvttsd2si %xmm1, %rax
-; VEX-NEXT:    vcvttsd2si %xmm0, %rcx
-; VEX-NEXT:    vmovd %ecx, %xmm1
-; VEX-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
-; VEX-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; VEX-NEXT:    vcvttsd2si %xmm0, %rax
-; VEX-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
-; VEX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; VEX-NEXT:    vcvttsd2si %xmm0, %rax
-; VEX-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm0
-; VEX-NEXT:    vzeroupper
-; VEX-NEXT:    retq
+; AVX1-LABEL: fptoui_4f64_to_4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm1 = [2147483648,2147483648,2147483648,2147483648]
+; AVX1-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vsubpd %ymm1, %ymm0, %ymm1
+; AVX1-NEXT:    vcvttpd2dq %ymm1, %xmm1
+; AVX1-NEXT:    vxorpd {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vcvttpd2dq %ymm0, %xmm0
+; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: fptoui_4f64_to_4i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX2-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vsubpd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vcvttpd2dq %ymm1, %xmm1
+; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT:    vxorpd %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vcvttpd2dq %ymm0, %xmm0
+; AVX2-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: fptoui_4f64_to_4i32:
 ; AVX512F:       # %bb.0:
@@ -1570,39 +1603,41 @@ define <2 x i32> @fptoui_2f32_to_2i32(<2 x float> %a) {
 define <4 x i32> @fptoui_4f32_to_4i32(<4 x float> %a) {
 ; SSE-LABEL: fptoui_4f32_to_4i32:
 ; SSE:       # %bb.0:
+; SSE-NEXT:    movaps {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
 ; SSE-NEXT:    movaps %xmm0, %xmm1
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
-; SSE-NEXT:    cvttss2si %xmm1, %rax
-; SSE-NEXT:    movd %eax, %xmm1
-; SSE-NEXT:    movaps %xmm0, %xmm2
-; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; SSE-NEXT:    cvttss2si %xmm2, %rax
-; SSE-NEXT:    movd %eax, %xmm2
-; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT:    cvttss2si %xmm0, %rax
-; SSE-NEXT:    movd %eax, %xmm1
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE-NEXT:    cvttss2si %xmm0, %rax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    cmpltps %xmm2, %xmm1
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm3
+; SSE-NEXT:    subps %xmm2, %xmm0
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
+; SSE-NEXT:    xorps {{.*}}(%rip), %xmm0
+; SSE-NEXT:    andps %xmm1, %xmm3
+; SSE-NEXT:    andnps %xmm0, %xmm1
+; SSE-NEXT:    orps %xmm3, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; VEX-LABEL: fptoui_4f32_to_4i32:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; VEX-NEXT:    vcvttss2si %xmm1, %rax
-; VEX-NEXT:    vcvttss2si %xmm0, %rcx
-; VEX-NEXT:    vmovd %ecx, %xmm1
-; VEX-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
-; VEX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; VEX-NEXT:    vcvttss2si %xmm2, %rax
-; VEX-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
-; VEX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; VEX-NEXT:    vcvttss2si %xmm0, %rax
-; VEX-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm0
-; VEX-NEXT:    retq
+; AVX1-LABEL: fptoui_4f32_to_4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
+; AVX1-NEXT:    vcmpltps %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vsubps %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
+; AVX1-NEXT:    vxorps {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vcvttps2dq %xmm0, %xmm0
+; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: fptoui_4f32_to_4i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
+; AVX2-NEXT:    vcmpltps %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vsubps %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vcvttps2dq %xmm1, %xmm1
+; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT:    vxorps %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vcvttps2dq %xmm0, %xmm0
+; AVX2-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: fptoui_4f32_to_4i32:
 ; AVX512F:       # %bb.0:
@@ -1853,95 +1888,51 @@ define <2 x i64> @fptoui_4f32_to_2i64(<4 x float> %a) {
 define <8 x i32> @fptoui_8f32_to_8i32(<8 x float> %a) {
 ; SSE-LABEL: fptoui_8f32_to_8i32:
 ; SSE:       # %bb.0:
+; SSE-NEXT:    movaps {{.*#+}} xmm4 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
 ; SSE-NEXT:    movaps %xmm0, %xmm2
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; SSE-NEXT:    cvttss2si %xmm0, %rax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    movaps %xmm2, %xmm3
-; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1]
-; SSE-NEXT:    cvttss2si %xmm3, %rax
-; SSE-NEXT:    movd %eax, %xmm3
-; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
-; SSE-NEXT:    cvttss2si %xmm2, %rax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
-; SSE-NEXT:    cvttss2si %xmm2, %rax
-; SSE-NEXT:    movd %eax, %xmm2
-; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; SSE-NEXT:    movaps %xmm1, %xmm2
-; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3]
-; SSE-NEXT:    cvttss2si %xmm2, %rax
-; SSE-NEXT:    movd %eax, %xmm2
+; SSE-NEXT:    cmpltps %xmm4, %xmm2
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm3
+; SSE-NEXT:    subps %xmm4, %xmm0
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
+; SSE-NEXT:    movaps {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
+; SSE-NEXT:    xorps %xmm5, %xmm0
+; SSE-NEXT:    andps %xmm2, %xmm3
+; SSE-NEXT:    andnps %xmm0, %xmm2
+; SSE-NEXT:    orps %xmm3, %xmm2
 ; SSE-NEXT:    movaps %xmm1, %xmm3
-; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
-; SSE-NEXT:    cvttss2si %xmm3, %rax
-; SSE-NEXT:    movd %eax, %xmm3
-; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE-NEXT:    cvttss2si %xmm1, %rax
-; SSE-NEXT:    movd %eax, %xmm2
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; SSE-NEXT:    cvttss2si %xmm1, %rax
-; SSE-NEXT:    movd %eax, %xmm1
-; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE-NEXT:    movdqa %xmm2, %xmm1
+; SSE-NEXT:    cmpltps %xmm4, %xmm3
+; SSE-NEXT:    cvttps2dq %xmm1, %xmm0
+; SSE-NEXT:    subps %xmm4, %xmm1
+; SSE-NEXT:    cvttps2dq %xmm1, %xmm1
+; SSE-NEXT:    xorps %xmm5, %xmm1
+; SSE-NEXT:    andps %xmm3, %xmm0
+; SSE-NEXT:    andnps %xmm1, %xmm3
+; SSE-NEXT:    orps %xmm0, %xmm3
+; SSE-NEXT:    movaps %xmm2, %xmm0
+; SSE-NEXT:    movaps %xmm3, %xmm1
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: fptoui_8f32_to_8i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX1-NEXT:    vcvttss2si %xmm2, %rax
-; AVX1-NEXT:    vcvttss2si %xmm1, %rcx
-; AVX1-NEXT:    vmovd %ecx, %xmm2
-; AVX1-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
-; AVX1-NEXT:    vcvttss2si %xmm3, %rax
-; AVX1-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX1-NEXT:    vcvttss2si %xmm1, %rax
-; AVX1-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm1
-; AVX1-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX1-NEXT:    vcvttss2si %xmm2, %rax
-; AVX1-NEXT:    vcvttss2si %xmm0, %rcx
-; AVX1-NEXT:    vmovd %ecx, %xmm2
-; AVX1-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
-; AVX1-NEXT:    vcvttss2si %xmm3, %rax
-; AVX1-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX1-NEXT:    vcvttss2si %xmm0, %rax
-; AVX1-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vmovaps {{.*#+}} ymm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
+; AVX1-NEXT:    vcmpltps %ymm1, %ymm0, %ymm2
+; AVX1-NEXT:    vsubps %ymm1, %ymm0, %ymm1
+; AVX1-NEXT:    vcvttps2dq %ymm1, %ymm1
+; AVX1-NEXT:    vxorps {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT:    vcvttps2dq %ymm0, %ymm0
+; AVX1-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: fptoui_8f32_to_8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX2-NEXT:    vcvttss2si %xmm2, %rax
-; AVX2-NEXT:    vcvttss2si %xmm1, %rcx
-; AVX2-NEXT:    vmovd %ecx, %xmm2
-; AVX2-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
-; AVX2-NEXT:    vcvttss2si %xmm3, %rax
-; AVX2-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX2-NEXT:    vcvttss2si %xmm1, %rax
-; AVX2-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm1
-; AVX2-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX2-NEXT:    vcvttss2si %xmm2, %rax
-; AVX2-NEXT:    vcvttss2si %xmm0, %rcx
-; AVX2-NEXT:    vmovd %ecx, %xmm2
-; AVX2-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
-; AVX2-NEXT:    vcvttss2si %xmm3, %rax
-; AVX2-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX2-NEXT:    vcvttss2si %xmm0, %rax
-; AVX2-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm0
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
+; AVX2-NEXT:    vcmpltps %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vsubps %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vcvttps2dq %ymm1, %ymm1
+; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT:    vxorps %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vcvttps2dq %ymm0, %ymm0
+; AVX2-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: fptoui_8f32_to_8i32:
-- 
GitLab


From 340f56a47cc3addffff7b5179fdfd7cc85d90d06 Mon Sep 17 00:00:00 2001
From: Roman Lebedev <lebedev.ri@gmail.com>
Date: Sun, 28 Oct 2018 13:39:01 +0000
Subject: [PATCH 0684/1116] [X86][NFC] sse2-schedule.ll: disable XOP for BdVer2
 tests

Else we are clearly testing the wrong instruction.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345474 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/sse2-schedule.ll | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/test/CodeGen/X86/sse2-schedule.ll b/test/CodeGen/X86/sse2-schedule.ll
index d2ee1e09d08..a833dcf0735 100644
--- a/test/CodeGen/X86/sse2-schedule.ll
+++ b/test/CodeGen/X86/sse2-schedule.ll
@@ -15,7 +15,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-sse3 | FileCheck %s --check-prefixes=CHECK,SKX-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,SKX
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=-sse3 | FileCheck %s --check-prefixes=CHECK,BDVER2-SSE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,BDVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=-avx2,-xop | FileCheck %s --check-prefixes=CHECK,BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-sse3 | FileCheck %s --check-prefixes=CHECK,BTVER2-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 -mattr=-sse3 | FileCheck %s --check-prefixes=CHECK,ZNVER1-SSE
@@ -9115,8 +9115,8 @@ define <16 x i8> @test_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ;
 ; BDVER2-LABEL: test_pcmpeqb:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpcomeqb %xmm1, %xmm0, %xmm1 # sched: [2:0.50]
-; BDVER2-NEXT:    vpcomeqb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm1 # sched: [2:0.50]
+; BDVER2-NEXT:    vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; BDVER2-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
@@ -9258,8 +9258,8 @@ define <4 x i32> @test_pcmpeqd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ;
 ; BDVER2-LABEL: test_pcmpeqd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpcomeqd %xmm1, %xmm0, %xmm1 # sched: [2:0.50]
-; BDVER2-NEXT:    vpcomeqd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm1 # sched: [2:0.50]
+; BDVER2-NEXT:    vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; BDVER2-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
@@ -9401,8 +9401,8 @@ define <8 x i16> @test_pcmpeqw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ;
 ; BDVER2-LABEL: test_pcmpeqw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpcomeqw %xmm1, %xmm0, %xmm1 # sched: [2:0.50]
-; BDVER2-NEXT:    vpcomeqw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm1 # sched: [2:0.50]
+; BDVER2-NEXT:    vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; BDVER2-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
@@ -9551,8 +9551,8 @@ define <16 x i8> @test_pcmpgtb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ;
 ; BDVER2-LABEL: test_pcmpgtb:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpcomgtb %xmm1, %xmm0, %xmm1 # sched: [2:0.50]
-; BDVER2-NEXT:    vpcomgtb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm1 # sched: [2:0.50]
+; BDVER2-NEXT:    vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; BDVER2-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
@@ -9703,8 +9703,8 @@ define <4 x i32> @test_pcmpgtd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ;
 ; BDVER2-LABEL: test_pcmpgtd:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpcomgtd %xmm1, %xmm0, %xmm1 # sched: [2:0.50]
-; BDVER2-NEXT:    vpcomeqd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm1 # sched: [2:0.50]
+; BDVER2-NEXT:    vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; BDVER2-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
@@ -9855,8 +9855,8 @@ define <8 x i16> @test_pcmpgtw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ;
 ; BDVER2-LABEL: test_pcmpgtw:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpcomgtw %xmm1, %xmm0, %xmm1 # sched: [2:0.50]
-; BDVER2-NEXT:    vpcomgtw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm1 # sched: [2:0.50]
+; BDVER2-NEXT:    vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; BDVER2-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [2:0.50]
 ; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
-- 
GitLab


From 459b074278f3605f89a2eb3822c30b739cabd546 Mon Sep 17 00:00:00 2001
From: Roman Lebedev <lebedev.ri@gmail.com>
Date: Sun, 28 Oct 2018 13:39:06 +0000
Subject: [PATCH 0685/1116] [X86][NFC] sse41-schedule.ll: disable XOP for
 BdVer2 tests

Else we are clearly testing the wrong instruction.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345475 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/sse41-schedule.ll | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/CodeGen/X86/sse41-schedule.ll b/test/CodeGen/X86/sse41-schedule.ll
index 160b780ac37..ea606463fc1 100644
--- a/test/CodeGen/X86/sse41-schedule.ll
+++ b/test/CodeGen/X86/sse41-schedule.ll
@@ -14,7 +14,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-sse4.2 | FileCheck %s --check-prefixes=CHECK,SKX-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-avx2   | FileCheck %s --check-prefixes=CHECK,SKX
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=-sse4.2 | FileCheck %s --check-prefixes=CHECK,BDVER2-SSE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=-avx2   | FileCheck %s --check-prefixes=CHECK,BDVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=-avx2,-xop   | FileCheck %s --check-prefixes=CHECK,BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-sse4.2 | FileCheck %s --check-prefixes=CHECK,BTVER2-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-avx2   | FileCheck %s --check-prefixes=CHECK,BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 -mattr=-sse4.2 | FileCheck %s --check-prefixes=CHECK,ZNVER1-SSE
@@ -1723,8 +1723,8 @@ define <2 x i64> @test_pcmpeqq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ;
 ; BDVER2-LABEL: test_pcmpeqq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpcomeqq %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
-; BDVER2-NEXT:    vpcomeqq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pcmpeqq:
-- 
GitLab


From b757770914c731e3e77e927950f503e6c04dc72c Mon Sep 17 00:00:00 2001
From: Roman Lebedev <lebedev.ri@gmail.com>
Date: Sun, 28 Oct 2018 13:39:10 +0000
Subject: [PATCH 0686/1116] [X86][NFC] sse42-schedule.ll: disable XOP for
 BdVer2 tests

Else we are clearly testing the wrong instruction.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345476 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/sse42-schedule.ll | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/CodeGen/X86/sse42-schedule.ll b/test/CodeGen/X86/sse42-schedule.ll
index c08e17aa2be..97dffb4db09 100644
--- a/test/CodeGen/X86/sse42-schedule.ll
+++ b/test/CodeGen/X86/sse42-schedule.ll
@@ -14,7 +14,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-avx  | FileCheck %s --check-prefixes=CHECK,SKX-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,SKX
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=-avx  | FileCheck %s --check-prefixes=CHECK,BDVER2-SSE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,BDVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=-avx2,-xop | FileCheck %s --check-prefixes=CHECK,BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-avx  | FileCheck %s --check-prefixes=CHECK,BTVER2-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 -mattr=-avx  | FileCheck %s --check-prefixes=CHECK,ZNVER1-SSE
@@ -1480,8 +1480,8 @@ define <2 x i64> @test_pcmpgtq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ;
 ; BDVER2-LABEL: test_pcmpgtq:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vpcomgtq %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
-; BDVER2-NEXT:    vpcomgtq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_pcmpgtq:
-- 
GitLab


From 1798150a2b18a21da426b2fb5223f8c2087e5010 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sun, 28 Oct 2018 15:34:35 +0000
Subject: [PATCH 0687/1116] [TargetLowering] Move i64/vXi64 to f32/vXf32
 UINT_TO_FP handling to TargetLowering::expandUINT_TO_FP.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345478 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/LegalizeDAG.cpp    |  27 ------
 lib/CodeGen/SelectionDAG/TargetLowering.cpp | 101 ++++++++++++++------
 2 files changed, 72 insertions(+), 56 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index dcc47454399..130b33d0767 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2314,7 +2314,6 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned, SDValue Op0,
                                                    EVT DestVT,
                                                    const SDLoc &dl) {
   EVT SrcVT = Op0.getValueType();
-  EVT ShiftVT = TLI.getShiftAmountTy(SrcVT, DAG.getDataLayout());
 
   // TODO: Should any fast-math-flags be set for the created nodes?
   LLVM_DEBUG(dbgs() << "Legalizing INT_TO_FP\n");
@@ -2369,32 +2368,6 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned, SDValue Op0,
   assert(!isSigned && "Legalize cannot Expand SINT_TO_FP for i64 yet");
   // Code below here assumes !isSigned without checking again.
 
-  // TODO: Generalize this for use with other types.
-  if (SrcVT == MVT::i64 && DestVT == MVT::f32) {
-    LLVM_DEBUG(dbgs() << "Converting unsigned i64 to f32\n");
-    // For unsigned conversions, convert them to signed conversions using the
-    // algorithm from the x86_64 __floatundidf in compiler_rt.
-    SDValue Fast = DAG.getNode(ISD::SINT_TO_FP, dl, DestVT, Op0);
-
-    SDValue ShiftConst = DAG.getConstant(1, dl, ShiftVT);
-    SDValue Shr = DAG.getNode(ISD::SRL, dl, SrcVT, Op0, ShiftConst);
-    SDValue AndConst = DAG.getConstant(1, dl, SrcVT);
-    SDValue And = DAG.getNode(ISD::AND, dl, SrcVT, Op0, AndConst);
-    SDValue Or = DAG.getNode(ISD::OR, dl, SrcVT, And, Shr);
-
-    SDValue SignCvt = DAG.getNode(ISD::SINT_TO_FP, dl, DestVT, Or);
-    SDValue Slow = DAG.getNode(ISD::FADD, dl, DestVT, SignCvt, SignCvt);
-
-    // TODO: This really should be implemented using a branch rather than a
-    // select.  We happen to get lucky and machinesink does the right
-    // thing most of the time.  This would be a good candidate for a
-    // pseudo-op, or, even better, for whole-function isel.
-    SDValue SignBitTest =
-        DAG.getSetCC(dl, getSetCCResultType(SrcVT), Op0,
-                     DAG.getConstant(0, dl, SrcVT), ISD::SETLT);
-    return DAG.getSelect(dl, DestVT, SignBitTest, Slow, Fast);
-  }
-
   SDValue Tmp1 = DAG.getNode(ISD::SINT_TO_FP, dl, DestVT, Op0);
 
   SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(SrcVT), Op0,
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index cf6910f4d76..103a7509835 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -4179,40 +4179,83 @@ bool TargetLowering::expandUINT_TO_FP(SDNode *Node, SDValue &Result,
   EVT SrcVT = Src.getValueType();
   EVT DstVT = Node->getValueType(0);
 
-  if (SrcVT.getScalarType() != MVT::i64 || DstVT.getScalarType() != MVT::f64)
-    return false;
-
-  // Only expand vector types if we have the appropriate vector bit operations.
-  if (SrcVT.isVector() && (!isOperationLegalOrCustom(ISD::SRL, SrcVT) ||
-                           !isOperationLegalOrCustom(ISD::FADD, DstVT) ||
-                           !isOperationLegalOrCustom(ISD::FSUB, DstVT) ||
-                           !isOperationLegalOrCustomOrPromote(ISD::OR, SrcVT) ||
-                           !isOperationLegalOrCustomOrPromote(ISD::AND, SrcVT)))
+  if (SrcVT.getScalarType() != MVT::i64)
     return false;
 
   SDLoc dl(SDValue(Node, 0));
   EVT ShiftVT = getShiftAmountTy(SrcVT, DAG.getDataLayout());
 
-  // Implementation of unsigned i64 to f64 following the algorithm in
-  // __floatundidf in compiler_rt. This implementation has the advantage
-  // of performing rounding correctly, both in the default rounding mode
-  // and in all alternate rounding modes.
-  SDValue TwoP52 = DAG.getConstant(UINT64_C(0x4330000000000000), dl, SrcVT);
-  SDValue TwoP84PlusTwoP52 =
-      DAG.getConstantFP(BitsToDouble(UINT64_C(0x4530000000100000)), dl, DstVT);
-  SDValue TwoP84 = DAG.getConstant(UINT64_C(0x4530000000000000), dl, SrcVT);
-  SDValue LoMask = DAG.getConstant(UINT64_C(0x00000000FFFFFFFF), dl, SrcVT);
-  SDValue HiShift = DAG.getConstant(32, dl, ShiftVT);
-
-  SDValue Lo = DAG.getNode(ISD::AND, dl, SrcVT, Src, LoMask);
-  SDValue Hi = DAG.getNode(ISD::SRL, dl, SrcVT, Src, HiShift);
-  SDValue LoOr = DAG.getNode(ISD::OR, dl, SrcVT, Lo, TwoP52);
-  SDValue HiOr = DAG.getNode(ISD::OR, dl, SrcVT, Hi, TwoP84);
-  SDValue LoFlt = DAG.getBitcast(DstVT, LoOr);
-  SDValue HiFlt = DAG.getBitcast(DstVT, HiOr);
-  SDValue HiSub = DAG.getNode(ISD::FSUB, dl, DstVT, HiFlt, TwoP84PlusTwoP52);
-  Result = DAG.getNode(ISD::FADD, dl, DstVT, LoFlt, HiSub);
-  return true;
+  if (DstVT.getScalarType() == MVT::f32) {
+    // Only expand vector types if we have the appropriate vector bit
+    // operations.
+    if (SrcVT.isVector() &&
+        (!isOperationLegalOrCustom(ISD::SRL, SrcVT) ||
+         !isOperationLegalOrCustom(ISD::FADD, DstVT) ||
+         !isOperationLegalOrCustom(ISD::SINT_TO_FP, SrcVT) ||
+         !isOperationLegalOrCustomOrPromote(ISD::OR, SrcVT) ||
+         !isOperationLegalOrCustomOrPromote(ISD::AND, SrcVT)))
+      return false;
+
+    // For unsigned conversions, convert them to signed conversions using the
+    // algorithm from the x86_64 __floatundidf in compiler_rt.
+    SDValue Fast = DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
+
+    SDValue ShiftConst = DAG.getConstant(1, dl, ShiftVT);
+    SDValue Shr = DAG.getNode(ISD::SRL, dl, SrcVT, Src, ShiftConst);
+    SDValue AndConst = DAG.getConstant(1, dl, SrcVT);
+    SDValue And = DAG.getNode(ISD::AND, dl, SrcVT, Src, AndConst);
+    SDValue Or = DAG.getNode(ISD::OR, dl, SrcVT, And, Shr);
+
+    SDValue SignCvt = DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Or);
+    SDValue Slow = DAG.getNode(ISD::FADD, dl, DstVT, SignCvt, SignCvt);
+
+    // TODO: This really should be implemented using a branch rather than a
+    // select.  We happen to get lucky and machinesink does the right
+    // thing most of the time.  This would be a good candidate for a
+    // pseudo-op, or, even better, for whole-function isel.
+    EVT SetCCVT =
+        getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT);
+
+    SDValue SignBitTest = DAG.getSetCC(
+        dl, SetCCVT, Src, DAG.getConstant(0, dl, SrcVT), ISD::SETLT);
+    Result = DAG.getSelect(dl, DstVT, SignBitTest, Slow, Fast);
+    return true;
+  }
+
+  if (DstVT.getScalarType() == MVT::f64) {
+    // Only expand vector types if we have the appropriate vector bit
+    // operations.
+    if (SrcVT.isVector() &&
+        (!isOperationLegalOrCustom(ISD::SRL, SrcVT) ||
+         !isOperationLegalOrCustom(ISD::FADD, DstVT) ||
+         !isOperationLegalOrCustom(ISD::FSUB, DstVT) ||
+         !isOperationLegalOrCustomOrPromote(ISD::OR, SrcVT) ||
+         !isOperationLegalOrCustomOrPromote(ISD::AND, SrcVT)))
+      return false;
+
+    // Implementation of unsigned i64 to f64 following the algorithm in
+    // __floatundidf in compiler_rt. This implementation has the advantage
+    // of performing rounding correctly, both in the default rounding mode
+    // and in all alternate rounding modes.
+    SDValue TwoP52 = DAG.getConstant(UINT64_C(0x4330000000000000), dl, SrcVT);
+    SDValue TwoP84PlusTwoP52 = DAG.getConstantFP(
+        BitsToDouble(UINT64_C(0x4530000000100000)), dl, DstVT);
+    SDValue TwoP84 = DAG.getConstant(UINT64_C(0x4530000000000000), dl, SrcVT);
+    SDValue LoMask = DAG.getConstant(UINT64_C(0x00000000FFFFFFFF), dl, SrcVT);
+    SDValue HiShift = DAG.getConstant(32, dl, ShiftVT);
+
+    SDValue Lo = DAG.getNode(ISD::AND, dl, SrcVT, Src, LoMask);
+    SDValue Hi = DAG.getNode(ISD::SRL, dl, SrcVT, Src, HiShift);
+    SDValue LoOr = DAG.getNode(ISD::OR, dl, SrcVT, Lo, TwoP52);
+    SDValue HiOr = DAG.getNode(ISD::OR, dl, SrcVT, Hi, TwoP84);
+    SDValue LoFlt = DAG.getBitcast(DstVT, LoOr);
+    SDValue HiFlt = DAG.getBitcast(DstVT, HiOr);
+    SDValue HiSub = DAG.getNode(ISD::FSUB, dl, DstVT, HiFlt, TwoP84PlusTwoP52);
+    Result = DAG.getNode(ISD::FADD, dl, DstVT, LoFlt, HiSub);
+    return true;
+  }
+
+  return false;
 }
 
 SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node,
-- 
GitLab


From 218c641bcf854439d58d2ad13769d632697644be Mon Sep 17 00:00:00 2001
From: Clement Courbet <courbet@google.com>
Date: Sun, 28 Oct 2018 19:09:14 +0000
Subject: [PATCH 0688/1116] [llvm-exegesis] Fix SNB counter definition and
 handling.

Summary: SNB is the only one that has P23 as a single proc res.

Reviewers: gchatelet

Subscribers: tschuett, llvm-commits

Differential Revision: https://reviews.llvm.org/D53766

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345480 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-exegesis/lib/BenchmarkRunner.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tools/llvm-exegesis/lib/BenchmarkRunner.cpp b/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
index 3f13c4638ec..437503f8486 100644
--- a/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
+++ b/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
@@ -61,13 +61,14 @@ private:
     // (e.g. P23 on SandyBridge).
     int64_t CounterValue = 0;
     llvm::SmallVector<llvm::StringRef, 2> CounterNames;
-    llvm::StringRef(Counters).split(CounterNames, ',');
+    llvm::StringRef(Counters).split(CounterNames, '+');
     char *const ScratchPtr = Scratch->ptr();
-    for (const auto &CounterName : CounterNames) {
+    for (auto &CounterName : CounterNames) {
+      CounterName = CounterName.trim();
       pfm::PerfEvent PerfEvent(CounterName);
       if (!PerfEvent.valid())
         llvm::report_fatal_error(
-            llvm::Twine("invalid perf event ").concat(Counters));
+            llvm::Twine("invalid perf event '").concat(CounterName).concat("'"));
       pfm::Counter Counter(PerfEvent);
       Scratch->clear();
       {
-- 
GitLab


From 5984510fd0f2ebba05b24c2ac21189e2454d4f45 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Sun, 28 Oct 2018 21:24:20 +0000
Subject: [PATCH 0689/1116] [SelectionDAG] Fix bad indentation. NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345481 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 23abbf10f53..9a27cf36380 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -4392,10 +4392,10 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
     SDValue V2 = BV2->getOperand(I);
 
     if (SVT.isInteger()) {
-        if (V1->getValueType(0).bitsGT(SVT))
-          V1 = getNode(ISD::TRUNCATE, DL, SVT, V1);
-        if (V2->getValueType(0).bitsGT(SVT))
-          V2 = getNode(ISD::TRUNCATE, DL, SVT, V2);
+      if (V1->getValueType(0).bitsGT(SVT))
+        V1 = getNode(ISD::TRUNCATE, DL, SVT, V1);
+      if (V2->getValueType(0).bitsGT(SVT))
+        V2 = getNode(ISD::TRUNCATE, DL, SVT, V2);
     }
 
     if (V1->getValueType(0) != SVT || V2->getValueType(0) != SVT)
-- 
GitLab


From a8288db805705e57558a483aa2f1418af31cedb4 Mon Sep 17 00:00:00 2001
From: Saleem Abdulrasool <compnerd@compnerd.org>
Date: Sun, 28 Oct 2018 22:30:48 +0000
Subject: [PATCH 0690/1116] Revert "Revert "DebugInfo: reduce DIE range
 verification on object files""

This reverts commit 836c763dadbd9478fa35b1a291a38bf17aa206ba.  Default
initialize the values that MSAN caught.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345482 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/DebugInfo/DWARF/DWARFVerifier.h  |  7 ++-
 lib/DebugInfo/DWARF/DWARFVerifier.cpp         | 58 ++++++++++++++-----
 .../llvm-dwarfdump/X86/debug-verify-object.s  | 57 ++++++++++++++++++
 3 files changed, 107 insertions(+), 15 deletions(-)
 create mode 100644 test/tools/llvm-dwarfdump/X86/debug-verify-object.s

diff --git a/include/llvm/DebugInfo/DWARF/DWARFVerifier.h b/include/llvm/DebugInfo/DWARF/DWARFVerifier.h
index 3ad65cf51b1..e47fbea5646 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFVerifier.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFVerifier.h
@@ -97,6 +97,9 @@ private:
   /// lies between to valid DIEs.
   std::map<uint64_t, std::set<uint32_t>> ReferenceToDIEOffsets;
   uint32_t NumDebugLineErrors = 0;
+  // Used to relax some checks that do not currently work portably
+  bool IsObjectFile;
+  bool IsMachOObject;
 
   raw_ostream &error() const;
   raw_ostream &warn() const;
@@ -286,8 +289,8 @@ private:
 
 public:
   DWARFVerifier(raw_ostream &S, DWARFContext &D,
-                DIDumpOptions DumpOpts = DIDumpOptions::getForSingleDIE())
-      : OS(S), DCtx(D), DumpOpts(std::move(DumpOpts)) {}
+                DIDumpOptions DumpOpts = DIDumpOptions::getForSingleDIE());
+
   /// Verify the information in any of the following sections, if available:
   /// .debug_abbrev, debug_abbrev.dwo
   ///
diff --git a/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/lib/DebugInfo/DWARF/DWARFVerifier.cpp
index d30600accd0..fdb71958cc6 100644
--- a/lib/DebugInfo/DWARF/DWARFVerifier.cpp
+++ b/lib/DebugInfo/DWARF/DWARFVerifier.cpp
@@ -394,20 +394,42 @@ unsigned DWARFVerifier::verifyDieRanges(const DWARFDie &Die,
   // Build RI for this DIE and check that ranges within this DIE do not
   // overlap.
   DieRangeInfo RI(Die);
-  for (auto Range : Ranges) {
-    if (!Range.valid()) {
-      ++NumErrors;
-      error() << "Invalid address range " << Range << "\n";
-      continue;
-    }
 
-    // Verify that ranges don't intersect.
-    const auto IntersectingRange = RI.insert(Range);
-    if (IntersectingRange != RI.Ranges.end()) {
-      ++NumErrors;
-      error() << "DIE has overlapping address ranges: " << Range << " and "
-              << *IntersectingRange << "\n";
-      break;
+  // TODO support object files better
+  //
+  // Some object file formats (i.e. non-MachO) support COMDAT.  ELF in
+  // particular does so by placing each function into a section.  The DWARF data
+  // for the function at that point uses a section relative DW_FORM_addrp for
+  // the DW_AT_low_pc and a DW_FORM_data4 for the offset as the DW_AT_high_pc.
+  // In such a case, when the Die is the CU, the ranges will overlap, and we
+  // will flag valid conflicting ranges as invalid.
+  //
+  // For such targets, we should read the ranges from the CU and partition them
+  // by the section id.  The ranges within a particular section should be
+  // disjoint, although the ranges across sections may overlap.  We would map
+  // the child die to the entity that it references and the section with which
+  // it is associated.  The child would then be checked against the range
+  // information for the associated section.
+  //
+  // For now, simply elide the range verification for the CU DIEs if we are
+  // processing an object file.
+
+  if (!IsObjectFile || IsMachOObject || Die.getTag() == DW_TAG_subprogram) {
+    for (auto Range : Ranges) {
+      if (!Range.valid()) {
+        ++NumErrors;
+        error() << "Invalid address range " << Range << "\n";
+        continue;
+      }
+
+      // Verify that ranges don't intersect.
+      const auto IntersectingRange = RI.insert(Range);
+      if (IntersectingRange != RI.Ranges.end()) {
+        ++NumErrors;
+        error() << "DIE has overlapping address ranges: " << Range << " and "
+                << *IntersectingRange << "\n";
+        break;
+      }
     }
   }
 
@@ -745,6 +767,16 @@ void DWARFVerifier::verifyDebugLineRows() {
   }
 }
 
+DWARFVerifier::DWARFVerifier(raw_ostream &S, DWARFContext &D,
+                             DIDumpOptions DumpOpts)
+    : OS(S), DCtx(D), DumpOpts(std::move(DumpOpts)), IsObjectFile(false),
+      IsMachOObject(false) {
+  if (const auto *F = DCtx.getDWARFObj().getFile()) {
+    IsObjectFile = F->isRelocatableObject();
+    IsMachOObject = F->isMachO();
+  }
+}
+
 bool DWARFVerifier::handleDebugLine() {
   NumDebugLineErrors = 0;
   OS << "Verifying .debug_line...\n";
diff --git a/test/tools/llvm-dwarfdump/X86/debug-verify-object.s b/test/tools/llvm-dwarfdump/X86/debug-verify-object.s
new file mode 100644
index 00000000000..be79c95c0b1
--- /dev/null
+++ b/test/tools/llvm-dwarfdump/X86/debug-verify-object.s
@@ -0,0 +1,57 @@
+# RUN: llvm-mc -triple x86_64-unknown-linux-gnu -filetype obj -o - %s | llvm-dwarfdump --verify -
+
+	.text
+
+	.section	.text.f,"ax",@progbits
+	.globl	f
+	.type	f,@function
+f:
+.Lfunc_begin0:
+	pushq	$32
+	popq	%rax
+	retq
+.Lfunc_end0:
+	.size	f, .Lfunc_end0-f
+
+	.section	.text.g,"ax",@progbits
+	.globl	g
+	.type	g,@function
+g:
+.Lfunc_begin1:
+	pushq   $64
+	popq    %rax
+	retq
+.Lfunc_end1:
+	.size	g, .Lfunc_end1-g
+
+	.section	.debug_abbrev,"",@progbits
+	.byte	1                       # Abbreviation Code
+	.byte	17                      # DW_TAG_compile_unit
+	.byte	0                       # DW_CHILDREN_no
+	.byte	17                      # DW_AT_low_pc
+	.byte	1                       # DW_FORM_addr
+	.byte	85                      # DW_AT_ranges
+	.byte	23                      # DW_FORM_sec_offset
+	.byte	0                       # EOM(1)
+	.byte	0                       # EOM(2)
+	.byte	0                       # EOM(3)
+
+	.section	.debug_info,"",@progbits
+.Lcu_begin0:
+	.long	20                      # Length of Unit
+	.short	4                       # DWARF version number
+	.long	.debug_abbrev           # Offset Into Abbrev. Section
+	.byte	8                       # Address Size (in bytes)
+	.byte	1                       # Abbrev [1] 0xb:0x1f DW_TAG_compile_unit
+	.quad	0                       # DW_AT_low_pc
+	.long	.Ldebug_ranges0         # DW_AT_ranges
+
+	.section        .debug_ranges,"",@progbits
+.Ldebug_ranges0:
+	.quad	.Lfunc_begin0
+	.quad	.Lfunc_end0
+	.quad	.Lfunc_begin1
+	.quad	.Lfunc_end1
+	.quad	0
+	.quad	0
+
-- 
GitLab


From 7577443d842345d556edb75ed7ac9b33fccb5ff2 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Sun, 28 Oct 2018 23:51:33 +0000
Subject: [PATCH 0691/1116] [X86] Add test case to show failure to handle splat
 vectors in the constant check in LowerFCOPYSIGN.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345483 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/sse1-fcopysign.ll | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/test/CodeGen/X86/sse1-fcopysign.ll b/test/CodeGen/X86/sse1-fcopysign.ll
index 5132573ef53..59598ecd5df 100644
--- a/test/CodeGen/X86/sse1-fcopysign.ll
+++ b/test/CodeGen/X86/sse1-fcopysign.ll
@@ -68,5 +68,25 @@ define <4 x float> @v4f32_neg(<4 x float> %a, <4 x float> %b) nounwind {
   ret <4 x float> %tmp
 }
 
+define <4 x float> @v4f32_const_mag(<4 x float> %a, <4 x float> %b) nounwind {
+; X86-LABEL: v4f32_const_mag:
+; X86:       # %bb.0:
+; X86-NEXT:    andps {{\.LCPI.*}}, %xmm1
+; X86-NEXT:    movaps {{.*#+}} xmm0 = [1,1,1,1]
+; X86-NEXT:    andps {{\.LCPI.*}}, %xmm0
+; X86-NEXT:    orps %xmm1, %xmm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: v4f32_const_mag:
+; X64:       # %bb.0:
+; X64-NEXT:    andps {{.*}}(%rip), %xmm1
+; X64-NEXT:    movaps {{.*#+}} xmm0 = [1,1,1,1]
+; X64-NEXT:    andps {{.*}}(%rip), %xmm0
+; X64-NEXT:    orps %xmm1, %xmm0
+; X64-NEXT:    retq
+  %tmp = tail call <4 x float> @llvm.copysign.v4f32(<4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, <4 x float> %b )
+  ret <4 x float> %tmp
+}
+
 declare float @llvm.copysign.f32(float, float)
 declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>)
-- 
GitLab


From 020d70964c63b90d99eed2c15ee4649bb3138dda Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Sun, 28 Oct 2018 23:51:35 +0000
Subject: [PATCH 0692/1116] [X86] Recognize constant splats in LowerFCOPYSIGN.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345484 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp |  2 +-
 test/CodeGen/X86/sse1-fcopysign.ll | 10 ++++------
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 060b36c868e..f2c5040b89e 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -18060,7 +18060,7 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
   // TODO: If we had general constant folding for FP logic ops, this check
   // wouldn't be necessary.
   SDValue MagBits;
-  if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Mag)) {
+  if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
     APFloat APF = Op0CN->getValueAPF();
     APF.clearSign();
     MagBits = DAG.getConstantFP(APF, dl, LogicVT);
diff --git a/test/CodeGen/X86/sse1-fcopysign.ll b/test/CodeGen/X86/sse1-fcopysign.ll
index 59598ecd5df..ed7f31e444c 100644
--- a/test/CodeGen/X86/sse1-fcopysign.ll
+++ b/test/CodeGen/X86/sse1-fcopysign.ll
@@ -71,18 +71,16 @@ define <4 x float> @v4f32_neg(<4 x float> %a, <4 x float> %b) nounwind {
 define <4 x float> @v4f32_const_mag(<4 x float> %a, <4 x float> %b) nounwind {
 ; X86-LABEL: v4f32_const_mag:
 ; X86:       # %bb.0:
-; X86-NEXT:    andps {{\.LCPI.*}}, %xmm1
-; X86-NEXT:    movaps {{.*#+}} xmm0 = [1,1,1,1]
+; X86-NEXT:    movaps %xmm1, %xmm0
 ; X86-NEXT:    andps {{\.LCPI.*}}, %xmm0
-; X86-NEXT:    orps %xmm1, %xmm0
+; X86-NEXT:    orps {{\.LCPI.*}}, %xmm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: v4f32_const_mag:
 ; X64:       # %bb.0:
-; X64-NEXT:    andps {{.*}}(%rip), %xmm1
-; X64-NEXT:    movaps {{.*#+}} xmm0 = [1,1,1,1]
+; X64-NEXT:    movaps %xmm1, %xmm0
 ; X64-NEXT:    andps {{.*}}(%rip), %xmm0
-; X64-NEXT:    orps %xmm1, %xmm0
+; X64-NEXT:    orps {{.*}}(%rip), %xmm0
 ; X64-NEXT:    retq
   %tmp = tail call <4 x float> @llvm.copysign.v4f32(<4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, <4 x float> %b )
   ret <4 x float> %tmp
-- 
GitLab


From f3ae95d632a56659f51fb2feceb61e459e272d4d Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Mon, 29 Oct 2018 04:52:04 +0000
Subject: [PATCH 0693/1116] [X86] Force floating point values in constant pool
 decoding to print in scientific notation so they can't be confused with
 integers.

When the floating point constants are whole numbers they have no decimal point so look like integers, but mean something very different in something like an 'and' instruction.

Ideally we would just print a decimal point and a 0, but I couldn't see how to make APFloat::toString do that.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345488 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86MCInstLower.cpp             |   3 +-
 test/CodeGen/X86/2011-10-19-widen_vselect.ll  |   4 +-
 test/CodeGen/X86/2011-20-21-zext-ui2fp.ll     |   2 +-
 test/CodeGen/X86/avx-basic.ll                 |   2 +-
 test/CodeGen/X86/avx-vbroadcast.ll            |   4 +-
 test/CodeGen/X86/avx-vperm2x128.ll            |   8 +-
 test/CodeGen/X86/avx2-fma-fneg-combine.ll     |   4 +-
 test/CodeGen/X86/avx2-vbroadcast.ll           |   8 +-
 test/CodeGen/X86/avx512-arith.ll              |   6 +-
 .../X86/avx512-intrinsics-fast-isel.ll        |  16 +-
 .../X86/broadcast-elm-cross-splat-vec.ll      |  84 ++++-----
 test/CodeGen/X86/buildvec-insertvec.ll        |   2 +-
 test/CodeGen/X86/combine-fabs.ll              |   4 +-
 test/CodeGen/X86/combine-fcopysign.ll         |  22 +--
 test/CodeGen/X86/cvtv2f32.ll                  |   8 +-
 test/CodeGen/X86/fma-fneg-combine.ll          |   2 +-
 test/CodeGen/X86/fma-intrinsics-fast-isel.ll  |   8 +-
 test/CodeGen/X86/fma_patterns.ll              |  60 +++----
 test/CodeGen/X86/fma_patterns_wide.ll         |  72 ++++----
 test/CodeGen/X86/fmul-combines.ll             |   4 +-
 test/CodeGen/X86/fold-vector-trunc-sitofp.ll  |   2 +-
 .../X86/insert-into-constant-vector.ll        |  68 ++++----
 test/CodeGen/X86/packss.ll                    |   2 +-
 test/CodeGen/X86/pow.ll                       |   4 +-
 test/CodeGen/X86/pr2656.ll                    |   2 +-
 test/CodeGen/X86/pr38639.ll                   |   4 +-
 test/CodeGen/X86/recip-fastmath.ll            | 164 +++++++++---------
 test/CodeGen/X86/recip-fastmath2.ll           | 162 ++++++++---------
 test/CodeGen/X86/select_const.ll              |   4 +-
 test/CodeGen/X86/splat-for-size.ll            |   8 +-
 test/CodeGen/X86/sqrt-fastmath.ll             |  44 ++---
 test/CodeGen/X86/sse2.ll                      |   4 +-
 test/CodeGen/X86/subvector-broadcast.ll       |   8 +-
 test/CodeGen/X86/v4f32-immediate.ll           |   4 +-
 test/CodeGen/X86/vec_cast3.ll                 |   4 +-
 test/CodeGen/X86/vec_floor.ll                 |  36 ++--
 test/CodeGen/X86/vec_fp_to_int.ll             |   8 +-
 test/CodeGen/X86/vec_fpext.ll                 |  12 +-
 test/CodeGen/X86/vec_int_to_fp.ll             |  14 +-
 test/CodeGen/X86/vec_ss_load_fold.ll          |   8 +-
 test/CodeGen/X86/vec_uint_to_fp-fastmath.ll   |   4 +-
 .../X86/vector-constrained-fp-intrinsics.ll   |  40 ++---
 .../X86/vector-shuffle-combining-avx.ll       |  16 +-
 .../X86/vector-shuffle-combining-avx2.ll      |   2 +-
 .../X86/vector-shuffle-combining-xop.ll       |  16 +-
 test/CodeGen/X86/vector-shuffle-combining.ll  |   8 +-
 test/CodeGen/X86/vselect-avx.ll               |   6 +-
 test/CodeGen/X86/widen_arith-6.ll             |   2 +-
 .../X86/x86-setcc-int-to-fp-combine.ll        |   4 +-
 49 files changed, 492 insertions(+), 491 deletions(-)

diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index 86495f16c3a..2816f8c62bf 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -1499,7 +1499,8 @@ static void printConstant(const APInt &Val, raw_ostream &CS) {
 
 static void printConstant(const APFloat &Flt, raw_ostream &CS) {
   SmallString<32> Str;
-  Flt.toString(Str);
+  // Force scientific notation to distinquish from integers.
+  Flt.toString(Str, 0, 0);
   CS << Str;
 }
 
diff --git a/test/CodeGen/X86/2011-10-19-widen_vselect.ll b/test/CodeGen/X86/2011-10-19-widen_vselect.ll
index d09abf5fbb1..44d9569bc57 100644
--- a/test/CodeGen/X86/2011-10-19-widen_vselect.ll
+++ b/test/CodeGen/X86/2011-10-19-widen_vselect.ll
@@ -72,7 +72,7 @@ define void @full_test() {
 ; X32-NEXT:    cvtdq2ps %xmm0, %xmm1
 ; X32-NEXT:    xorps %xmm0, %xmm0
 ; X32-NEXT:    cmpltps %xmm2, %xmm0
-; X32-NEXT:    movaps {{.*#+}} xmm3 = <1,1,u,u>
+; X32-NEXT:    movaps {{.*#+}} xmm3 = <1.0E+0,1.0E+0,u,u>
 ; X32-NEXT:    addps %xmm1, %xmm3
 ; X32-NEXT:    movaps %xmm1, %xmm4
 ; X32-NEXT:    blendvps %xmm0, %xmm3, %xmm4
@@ -92,7 +92,7 @@ define void @full_test() {
 ; X64-NEXT:    cvtdq2ps %xmm0, %xmm1
 ; X64-NEXT:    xorps %xmm0, %xmm0
 ; X64-NEXT:    cmpltps %xmm2, %xmm0
-; X64-NEXT:    movaps {{.*#+}} xmm3 = <1,1,u,u>
+; X64-NEXT:    movaps {{.*#+}} xmm3 = <1.0E+0,1.0E+0,u,u>
 ; X64-NEXT:    addps %xmm1, %xmm3
 ; X64-NEXT:    movaps %xmm1, %xmm4
 ; X64-NEXT:    blendvps %xmm0, %xmm3, %xmm4
diff --git a/test/CodeGen/X86/2011-20-21-zext-ui2fp.ll b/test/CodeGen/X86/2011-20-21-zext-ui2fp.ll
index 1b80dc9b1d2..ad3a17071d5 100644
--- a/test/CodeGen/X86/2011-20-21-zext-ui2fp.ll
+++ b/test/CodeGen/X86/2011-20-21-zext-ui2fp.ll
@@ -7,7 +7,7 @@
 define void @ui_to_fp_conv(<8 x float> * nocapture %aFOO, <8 x float>* nocapture %RET) nounwind {
 ; CHECK-LABEL: ui_to_fp_conv:
 ; CHECK:       # %bb.0: # %allocas
-; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [1,1,0,0]
+; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [1.0E+0,1.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT:    xorps %xmm1, %xmm1
 ; CHECK-NEXT:    movups %xmm1, 16(%rsi)
 ; CHECK-NEXT:    movups %xmm0, (%rsi)
diff --git a/test/CodeGen/X86/avx-basic.ll b/test/CodeGen/X86/avx-basic.ll
index 80bbf29c1c5..b85fd4e6482 100644
--- a/test/CodeGen/X86/avx-basic.ll
+++ b/test/CodeGen/X86/avx-basic.ll
@@ -92,7 +92,7 @@ define <8 x i32> @VMOVZQI2PQI([0 x float]* nocapture %aFOO) nounwind {
 define <16 x float> @fneg(<16 x float> %a) nounwind {
 ; CHECK-LABEL: fneg:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [-0,-0,-0,-0,-0,-0,-0,-0]
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; CHECK-NEXT:    vxorps %ymm2, %ymm0, %ymm0
 ; CHECK-NEXT:    vxorps %ymm2, %ymm1, %ymm1
 ; CHECK-NEXT:    retq
diff --git a/test/CodeGen/X86/avx-vbroadcast.ll b/test/CodeGen/X86/avx-vbroadcast.ll
index 5686e3abc97..b136c72366e 100644
--- a/test/CodeGen/X86/avx-vbroadcast.ll
+++ b/test/CodeGen/X86/avx-vbroadcast.ll
@@ -316,12 +316,12 @@ entry:
 define <4 x float> @_e2(float* %ptr) nounwind uwtable readnone ssp {
 ; X32-LABEL: _e2:
 ; X32:       ## %bb.0: ## %entry
-; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [-0.0078125,-0.0078125,-0.0078125,-0.0078125]
+; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: _e2:
 ; X64:       ## %bb.0: ## %entry
-; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [-0.0078125,-0.0078125,-0.0078125,-0.0078125]
+; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
 ; X64-NEXT:    retq
 entry:
    %vecinit.i = insertelement <4 x float> undef, float       0xbf80000000000000, i32 0
diff --git a/test/CodeGen/X86/avx-vperm2x128.ll b/test/CodeGen/X86/avx-vperm2x128.ll
index 0c501ea6895..f470c97e472 100644
--- a/test/CodeGen/X86/avx-vperm2x128.ll
+++ b/test/CodeGen/X86/avx-vperm2x128.ll
@@ -532,7 +532,7 @@ define <4 x double> @ld0_hi0_lo1_4f64(<4 x double> * %pa, <4 x double> %b) nounw
 ; AVX2-LABEL: ld0_hi0_lo1_4f64:
 ; AVX2:       # %bb.0: # %entry
 ; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1,1,1,1]
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 entry:
@@ -552,7 +552,7 @@ define <4 x double> @ld1_hi0_hi1_4f64(<4 x double> %a, <4 x double> * %pb) nounw
 ; AVX2-LABEL: ld1_hi0_hi1_4f64:
 ; AVX2:       # %bb.0: # %entry
 ; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1,1,1,1]
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 entry:
@@ -572,7 +572,7 @@ define <8 x float> @ld0_hi0_lo1_8f32(<8 x float> * %pa, <8 x float> %b) nounwind
 ; AVX2-LABEL: ld0_hi0_lo1_8f32:
 ; AVX2:       # %bb.0: # %entry
 ; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
-; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
+; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 entry:
@@ -592,7 +592,7 @@ define <8 x float> @ld1_hi0_hi1_8f32(<8 x float> %a, <8 x float> * %pb) nounwind
 ; AVX2-LABEL: ld1_hi0_hi1_8f32:
 ; AVX2:       # %bb.0: # %entry
 ; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
-; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
+; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 entry:
diff --git a/test/CodeGen/X86/avx2-fma-fneg-combine.ll b/test/CodeGen/X86/avx2-fma-fneg-combine.ll
index 9ef7bcf026c..d9f41539bfe 100644
--- a/test/CodeGen/X86/avx2-fma-fneg-combine.ll
+++ b/test/CodeGen/X86/avx2-fma-fneg-combine.ll
@@ -44,14 +44,14 @@ define <4 x float> @test3(<4 x float> %a, <4 x float> %b, <4 x float> %c)  {
 ; X32-LABEL: test3:
 ; X32:       # %bb.0: # %entry
 ; X32-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
-; X32-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0,-0,-0,-0]
+; X32-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; X32-NEXT:    vxorps %xmm1, %xmm0, %xmm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test3:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
-; X64-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0,-0,-0,-0]
+; X64-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; X64-NEXT:    vxorps %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    retq
 entry:
diff --git a/test/CodeGen/X86/avx2-vbroadcast.ll b/test/CodeGen/X86/avx2-vbroadcast.ll
index 5d7ac684e54..b333e9109bd 100644
--- a/test/CodeGen/X86/avx2-vbroadcast.ll
+++ b/test/CodeGen/X86/avx2-vbroadcast.ll
@@ -615,13 +615,13 @@ entry:
 define <8 x float> @V113(<8 x float> %in) nounwind uwtable readnone ssp {
 ; X32-AVX2-LABEL: V113:
 ; X32-AVX2:       ## %bb.0: ## %entry
-; X32-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125]
+; X32-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
 ; X32-AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; X32-AVX2-NEXT:    retl
 ;
 ; X64-AVX2-LABEL: V113:
 ; X64-AVX2:       ## %bb.0: ## %entry
-; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125]
+; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
 ; X64-AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; X64-AVX2-NEXT:    retq
 ;
@@ -642,12 +642,12 @@ entry:
 define <4 x float> @_e2(float* %ptr) nounwind uwtable readnone ssp {
 ; X32-LABEL: _e2:
 ; X32:       ## %bb.0:
-; X32-NEXT:    vbroadcastss {{.*#+}} xmm0 = [-0.0078125,-0.0078125,-0.0078125,-0.0078125]
+; X32-NEXT:    vbroadcastss {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: _e2:
 ; X64:       ## %bb.0:
-; X64-NEXT:    vbroadcastss {{.*#+}} xmm0 = [-0.0078125,-0.0078125,-0.0078125,-0.0078125]
+; X64-NEXT:    vbroadcastss {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
 ; X64-NEXT:    retq
   %vecinit.i = insertelement <4 x float> undef, float        0xbf80000000000000, i32 0
   %vecinit2.i = insertelement <4 x float> %vecinit.i, float  0xbf80000000000000, i32 1
diff --git a/test/CodeGen/X86/avx512-arith.ll b/test/CodeGen/X86/avx512-arith.ll
index 6bc69213d42..29793a7e0bc 100644
--- a/test/CodeGen/X86/avx512-arith.ll
+++ b/test/CodeGen/X86/avx512-arith.ll
@@ -969,7 +969,7 @@ define <16 x float>  @test_fxor(<16 x float> %a) {
 define <8 x float>  @test_fxor_8f32(<8 x float> %a) {
 ; AVX512F-LABEL: test_fxor_8f32:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-0,-0,-0,-0,-0,-0,-0,-0]
+; AVX512F-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; AVX512F-NEXT:    vxorps %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    retq
 ;
@@ -980,13 +980,13 @@ define <8 x float>  @test_fxor_8f32(<8 x float> %a) {
 ;
 ; AVX512BW-LABEL: test_fxor_8f32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-0,-0,-0,-0,-0,-0,-0,-0]
+; AVX512BW-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; AVX512BW-NEXT:    vxorps %ymm1, %ymm0, %ymm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: test_fxor_8f32:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-0,-0,-0,-0,-0,-0,-0,-0]
+; AVX512DQ-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; AVX512DQ-NEXT:    vxorps %ymm1, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
index d888ea4ac44..e04d8e31944 100644
--- a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -3156,7 +3156,7 @@ entry:
 define <8 x double> @test_mm512_fnmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
 ; CHECK-LABEL: test_mm512_fnmsub_round_pd:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [-0,-0,-0,-0,-0,-0,-0,-0]
+; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; CHECK-NEXT:    vpxorq %zmm3, %zmm0, %zmm4
 ; CHECK-NEXT:    vpxorq %zmm3, %zmm2, %zmm0
 ; CHECK-NEXT:    vfmadd231pd {rn-sae}, %zmm4, %zmm1, %zmm0
@@ -3387,7 +3387,7 @@ entry:
 define <8 x double> @test_mm512_fnmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
 ; CHECK-LABEL: test_mm512_fnmsub_pd:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [-0,-0,-0,-0,-0,-0,-0,-0]
+; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; CHECK-NEXT:    vpxorq %zmm3, %zmm0, %zmm4
 ; CHECK-NEXT:    vpxorq %zmm3, %zmm2, %zmm0
 ; CHECK-NEXT:    vfmadd231pd {{.*#+}} zmm0 = (zmm1 * zmm4) + zmm0
@@ -3613,7 +3613,7 @@ entry:
 define <16 x float> @test_mm512_fnmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
 ; CHECK-LABEL: test_mm512_fnmsub_round_ps:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0]
+; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; CHECK-NEXT:    vpxord %zmm3, %zmm0, %zmm4
 ; CHECK-NEXT:    vpxord %zmm3, %zmm2, %zmm0
 ; CHECK-NEXT:    vfmadd231ps {rn-sae}, %zmm4, %zmm1, %zmm0
@@ -3836,7 +3836,7 @@ entry:
 define <16 x float> @test_mm512_fnmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
 ; CHECK-LABEL: test_mm512_fnmsub_ps:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0]
+; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; CHECK-NEXT:    vpxord %zmm3, %zmm0, %zmm4
 ; CHECK-NEXT:    vpxord %zmm3, %zmm2, %zmm0
 ; CHECK-NEXT:    vfmadd231ps {{.*#+}} zmm0 = (zmm1 * zmm4) + zmm0
@@ -7459,7 +7459,7 @@ define double @test_mm512_mask_reduce_mul_pd(i8 zeroext %__M, <8 x double> %__W)
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movb 8(%ebp), %al
 ; X86-NEXT:    kmovw %eax, %k1
-; X86-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1]
+; X86-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; X86-NEXT:    vmovapd %zmm0, %zmm1 {%k1}
 ; X86-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
 ; X86-NEXT:    vmulpd %ymm0, %ymm1, %ymm0
@@ -7478,7 +7478,7 @@ define double @test_mm512_mask_reduce_mul_pd(i8 zeroext %__M, <8 x double> %__W)
 ; X64-LABEL: test_mm512_mask_reduce_mul_pd:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    kmovw %edi, %k1
-; X64-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1]
+; X64-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; X64-NEXT:    vmovapd %zmm0, %zmm1 {%k1}
 ; X64-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
 ; X64-NEXT:    vmulpd %ymm0, %ymm1, %ymm0
@@ -7565,7 +7565,7 @@ define float @test_mm512_mask_reduce_mul_ps(i16 zeroext %__M, <16 x float> %__W)
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X86-NEXT:    vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; X86-NEXT:    vbroadcastss {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; X86-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
 ; X86-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
 ; X86-NEXT:    vmulps %ymm0, %ymm1, %ymm0
@@ -7585,7 +7585,7 @@ define float @test_mm512_mask_reduce_mul_ps(i16 zeroext %__M, <16 x float> %__W)
 ; X64-LABEL: test_mm512_mask_reduce_mul_ps:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    kmovw %edi, %k1
-; X64-NEXT:    vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; X64-NEXT:    vbroadcastss {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; X64-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
 ; X64-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
 ; X64-NEXT:    vmulps %ymm0, %ymm1, %ymm0
diff --git a/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll b/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
index 19d3b47a659..353faabba2d 100644
--- a/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
+++ b/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
@@ -1600,7 +1600,7 @@ define <8 x i64> @f8xi64_i256(<8 x i64> %a) {
 define <4 x float> @f4xf32_f64(<4 x float> %a) {
 ; AVX-LABEL: f4xf32_f64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = [0.0078125018626451492,0.0078125018626451492]
+; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = [7.8125018626451492E-3,7.8125018626451492E-3]
 ; AVX-NEXT:    # xmm1 = mem[0,0]
 ; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vdivps %xmm0, %xmm1, %xmm0
@@ -1608,7 +1608,7 @@ define <4 x float> @f4xf32_f64(<4 x float> %a) {
 ;
 ; ALL32-LABEL: f4xf32_f64:
 ; ALL32:       # %bb.0:
-; ALL32-NEXT:    vmovddup {{.*#+}} xmm1 = [0.0078125018626451492,0.0078125018626451492]
+; ALL32-NEXT:    vmovddup {{.*#+}} xmm1 = [7.8125018626451492E-3,7.8125018626451492E-3]
 ; ALL32-NEXT:    # xmm1 = mem[0,0]
 ; ALL32-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; ALL32-NEXT:    vdivps %xmm0, %xmm1, %xmm0
@@ -1616,7 +1616,7 @@ define <4 x float> @f4xf32_f64(<4 x float> %a) {
 ;
 ; AVX-64-LABEL: f4xf32_f64:
 ; AVX-64:       # %bb.0:
-; AVX-64-NEXT:    vmovddup {{.*#+}} xmm1 = [0.0078125018626451492,0.0078125018626451492]
+; AVX-64-NEXT:    vmovddup {{.*#+}} xmm1 = [7.8125018626451492E-3,7.8125018626451492E-3]
 ; AVX-64-NEXT:    # xmm1 = mem[0,0]
 ; AVX-64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX-64-NEXT:    vdivps %xmm0, %xmm1, %xmm0
@@ -1637,21 +1637,21 @@ define <4 x float> @f4xf32_f64(<4 x float> %a) {
 define <8 x float> @f8xf32_f64(<8 x float> %a) {
 ; AVX-LABEL: f8xf32_f64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
+; AVX-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3]
 ; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vdivps %ymm0, %ymm1, %ymm0
 ; AVX-NEXT:    retl
 ;
 ; ALL32-LABEL: f8xf32_f64:
 ; ALL32:       # %bb.0:
-; ALL32-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
+; ALL32-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3]
 ; ALL32-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; ALL32-NEXT:    vdivps %ymm0, %ymm1, %ymm0
 ; ALL32-NEXT:    retl
 ;
 ; AVX-64-LABEL: f8xf32_f64:
 ; AVX-64:       # %bb.0:
-; AVX-64-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
+; AVX-64-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3]
 ; AVX-64-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX-64-NEXT:    vdivps %ymm0, %ymm1, %ymm0
 ; AVX-64-NEXT:    retq
@@ -1671,7 +1671,7 @@ define <8 x float> @f8xf32_f64(<8 x float> %a) {
 define <8 x float> @f8xf32_f128(<8 x float> %a) {
 ; AVX-LABEL: f8xf32_f128:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [4,1,2,3,4,1,2,3]
+; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
 ; AVX-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vdivps %ymm0, %ymm1, %ymm0
@@ -1679,7 +1679,7 @@ define <8 x float> @f8xf32_f128(<8 x float> %a) {
 ;
 ; ALL32-LABEL: f8xf32_f128:
 ; ALL32:       # %bb.0:
-; ALL32-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [4,1,2,3,4,1,2,3]
+; ALL32-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
 ; ALL32-NEXT:    # ymm1 = mem[0,1,0,1]
 ; ALL32-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; ALL32-NEXT:    vdivps %ymm0, %ymm1, %ymm0
@@ -1687,7 +1687,7 @@ define <8 x float> @f8xf32_f128(<8 x float> %a) {
 ;
 ; AVX-64-LABEL: f8xf32_f128:
 ; AVX-64:       # %bb.0:
-; AVX-64-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [4,1,2,3,4,1,2,3]
+; AVX-64-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
 ; AVX-64-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX-64-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX-64-NEXT:    vdivps %ymm0, %ymm1, %ymm0
@@ -1695,7 +1695,7 @@ define <8 x float> @f8xf32_f128(<8 x float> %a) {
 ;
 ; ALL64-LABEL: f8xf32_f128:
 ; ALL64:       # %bb.0:
-; ALL64-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [4,1,2,3,4,1,2,3]
+; ALL64-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
 ; ALL64-NEXT:    # ymm1 = mem[0,1,0,1]
 ; ALL64-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; ALL64-NEXT:    vdivps %ymm0, %ymm1, %ymm0
@@ -1709,7 +1709,7 @@ define <8 x float> @f8xf32_f128(<8 x float> %a) {
 define <16 x float> @f16xf32_f64(<16 x float> %a) {
 ; AVX-LABEL: f16xf32_f64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
+; AVX-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3]
 ; AVX-NEXT:    vaddps %ymm2, %ymm1, %ymm1
 ; AVX-NEXT:    vaddps %ymm2, %ymm0, %ymm0
 ; AVX-NEXT:    vdivps %ymm0, %ymm2, %ymm0
@@ -1718,7 +1718,7 @@ define <16 x float> @f16xf32_f64(<16 x float> %a) {
 ;
 ; AVX2-LABEL: f16xf32_f64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3]
 ; AVX2-NEXT:    vaddps %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vdivps %ymm0, %ymm2, %ymm0
@@ -1727,14 +1727,14 @@ define <16 x float> @f16xf32_f64(<16 x float> %a) {
 ;
 ; AVX512-LABEL: f16xf32_f64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
+; AVX512-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3]
 ; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vdivps %zmm0, %zmm1, %zmm0
 ; AVX512-NEXT:    retl
 ;
 ; AVX-64-LABEL: f16xf32_f64:
 ; AVX-64:       # %bb.0:
-; AVX-64-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
+; AVX-64-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3]
 ; AVX-64-NEXT:    vaddps %ymm2, %ymm1, %ymm1
 ; AVX-64-NEXT:    vaddps %ymm2, %ymm0, %ymm0
 ; AVX-64-NEXT:    vdivps %ymm0, %ymm2, %ymm0
@@ -1765,7 +1765,7 @@ define <16 x float> @f16xf32_f64(<16 x float> %a) {
 define <16 x float> @f16xf32_f128(<16 x float> %a) {
 ; AVX-LABEL: f16xf32_f128:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [4,1,2,3,4,1,2,3]
+; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
 ; AVX-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX-NEXT:    vaddps %ymm2, %ymm1, %ymm1
 ; AVX-NEXT:    vaddps %ymm2, %ymm0, %ymm0
@@ -1775,7 +1775,7 @@ define <16 x float> @f16xf32_f128(<16 x float> %a) {
 ;
 ; AVX2-LABEL: f16xf32_f128:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [4,1,2,3,4,1,2,3]
+; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
 ; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX2-NEXT:    vaddps %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
@@ -1785,7 +1785,7 @@ define <16 x float> @f16xf32_f128(<16 x float> %a) {
 ;
 ; AVX512-LABEL: f16xf32_f128:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm1 = [4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3]
+; AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
 ; AVX512-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vdivps %zmm0, %zmm1, %zmm0
@@ -1793,7 +1793,7 @@ define <16 x float> @f16xf32_f128(<16 x float> %a) {
 ;
 ; AVX-64-LABEL: f16xf32_f128:
 ; AVX-64:       # %bb.0:
-; AVX-64-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [4,1,2,3,4,1,2,3]
+; AVX-64-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
 ; AVX-64-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX-64-NEXT:    vaddps %ymm2, %ymm1, %ymm1
 ; AVX-64-NEXT:    vaddps %ymm2, %ymm0, %ymm0
@@ -1803,7 +1803,7 @@ define <16 x float> @f16xf32_f128(<16 x float> %a) {
 ;
 ; AVX2-64-LABEL: f16xf32_f128:
 ; AVX2-64:       # %bb.0:
-; AVX2-64-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [4,1,2,3,4,1,2,3]
+; AVX2-64-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
 ; AVX2-64-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX2-64-NEXT:    vaddps %ymm2, %ymm1, %ymm1
 ; AVX2-64-NEXT:    vaddps %ymm2, %ymm0, %ymm0
@@ -1813,7 +1813,7 @@ define <16 x float> @f16xf32_f128(<16 x float> %a) {
 ;
 ; AVX512F-64-LABEL: f16xf32_f128:
 ; AVX512F-64:       # %bb.0:
-; AVX512F-64-NEXT:    vbroadcastf32x4 {{.*#+}} zmm1 = [4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3]
+; AVX512F-64-NEXT:    vbroadcastf32x4 {{.*#+}} zmm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
 ; AVX512F-64-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-64-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; AVX512F-64-NEXT:    vdivps %zmm0, %zmm1, %zmm0
@@ -1827,7 +1827,7 @@ define <16 x float> @f16xf32_f128(<16 x float> %a) {
 define <16 x float> @f16xf32_f256(<16 x float> %a) {
 ; AVX-LABEL: f16xf32_f256:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [8,1,2,3,4,5,6,7]
+; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0]
 ; AVX-NEXT:    vaddps %ymm2, %ymm1, %ymm1
 ; AVX-NEXT:    vaddps %ymm2, %ymm0, %ymm0
 ; AVX-NEXT:    vdivps %ymm0, %ymm2, %ymm0
@@ -1836,7 +1836,7 @@ define <16 x float> @f16xf32_f256(<16 x float> %a) {
 ;
 ; AVX2-LABEL: f16xf32_f256:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = [8,1,2,3,4,5,6,7]
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0]
 ; AVX2-NEXT:    vaddps %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vdivps %ymm0, %ymm2, %ymm0
@@ -1845,7 +1845,7 @@ define <16 x float> @f16xf32_f256(<16 x float> %a) {
 ;
 ; AVX512-LABEL: f16xf32_f256:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm1 = [8,1,2,3,4,5,6,7,8,1,2,3,4,5,6,7]
+; AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm1 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0]
 ; AVX512-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
 ; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vdivps %zmm0, %zmm1, %zmm0
@@ -1853,7 +1853,7 @@ define <16 x float> @f16xf32_f256(<16 x float> %a) {
 ;
 ; AVX-64-LABEL: f16xf32_f256:
 ; AVX-64:       # %bb.0:
-; AVX-64-NEXT:    vmovaps {{.*#+}} ymm2 = [8,1,2,3,4,5,6,7]
+; AVX-64-NEXT:    vmovaps {{.*#+}} ymm2 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0]
 ; AVX-64-NEXT:    vaddps %ymm2, %ymm1, %ymm1
 ; AVX-64-NEXT:    vaddps %ymm2, %ymm0, %ymm0
 ; AVX-64-NEXT:    vdivps %ymm0, %ymm2, %ymm0
@@ -1862,7 +1862,7 @@ define <16 x float> @f16xf32_f256(<16 x float> %a) {
 ;
 ; AVX2-64-LABEL: f16xf32_f256:
 ; AVX2-64:       # %bb.0:
-; AVX2-64-NEXT:    vmovaps {{.*#+}} ymm2 = [8,1,2,3,4,5,6,7]
+; AVX2-64-NEXT:    vmovaps {{.*#+}} ymm2 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0]
 ; AVX2-64-NEXT:    vaddps %ymm2, %ymm1, %ymm1
 ; AVX2-64-NEXT:    vaddps %ymm2, %ymm0, %ymm0
 ; AVX2-64-NEXT:    vdivps %ymm0, %ymm2, %ymm0
@@ -1871,7 +1871,7 @@ define <16 x float> @f16xf32_f256(<16 x float> %a) {
 ;
 ; AVX512F-64-LABEL: f16xf32_f256:
 ; AVX512F-64:       # %bb.0:
-; AVX512F-64-NEXT:    vbroadcastf64x4 {{.*#+}} zmm1 = [8,1,2,3,4,5,6,7,8,1,2,3,4,5,6,7]
+; AVX512F-64-NEXT:    vbroadcastf64x4 {{.*#+}} zmm1 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0]
 ; AVX512F-64-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
 ; AVX512F-64-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; AVX512F-64-NEXT:    vdivps %zmm0, %zmm1, %zmm0
@@ -1885,7 +1885,7 @@ define <16 x float> @f16xf32_f256(<16 x float> %a) {
 define <4 x double> @f4xf64_f128(<4 x double> %a) {
 ; AVX-LABEL: f4xf64_f128:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [2,1,2,1]
+; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0]
 ; AVX-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vdivpd %ymm0, %ymm1, %ymm0
@@ -1893,7 +1893,7 @@ define <4 x double> @f4xf64_f128(<4 x double> %a) {
 ;
 ; ALL32-LABEL: f4xf64_f128:
 ; ALL32:       # %bb.0:
-; ALL32-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [2,1,2,1]
+; ALL32-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0]
 ; ALL32-NEXT:    # ymm1 = mem[0,1,0,1]
 ; ALL32-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; ALL32-NEXT:    vdivpd %ymm0, %ymm1, %ymm0
@@ -1901,7 +1901,7 @@ define <4 x double> @f4xf64_f128(<4 x double> %a) {
 ;
 ; AVX-64-LABEL: f4xf64_f128:
 ; AVX-64:       # %bb.0:
-; AVX-64-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [2,1,2,1]
+; AVX-64-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0]
 ; AVX-64-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX-64-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX-64-NEXT:    vdivpd %ymm0, %ymm1, %ymm0
@@ -1909,7 +1909,7 @@ define <4 x double> @f4xf64_f128(<4 x double> %a) {
 ;
 ; ALL64-LABEL: f4xf64_f128:
 ; ALL64:       # %bb.0:
-; ALL64-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [2,1,2,1]
+; ALL64-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0]
 ; ALL64-NEXT:    # ymm1 = mem[0,1,0,1]
 ; ALL64-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; ALL64-NEXT:    vdivpd %ymm0, %ymm1, %ymm0
@@ -1923,7 +1923,7 @@ define <4 x double> @f4xf64_f128(<4 x double> %a) {
 define <8 x double> @f8xf64_f128(<8 x double> %a) {
 ; AVX-LABEL: f8xf64_f128:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [2,1,2,1]
+; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0]
 ; AVX-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
 ; AVX-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
@@ -1933,7 +1933,7 @@ define <8 x double> @f8xf64_f128(<8 x double> %a) {
 ;
 ; AVX2-LABEL: f8xf64_f128:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [2,1,2,1]
+; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0]
 ; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX2-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
@@ -1943,7 +1943,7 @@ define <8 x double> @f8xf64_f128(<8 x double> %a) {
 ;
 ; AVX512-LABEL: f8xf64_f128:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm1 = [2,1,2,1,2,1,2,1]
+; AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0]
 ; AVX512-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vdivpd %zmm0, %zmm1, %zmm0
@@ -1951,7 +1951,7 @@ define <8 x double> @f8xf64_f128(<8 x double> %a) {
 ;
 ; AVX-64-LABEL: f8xf64_f128:
 ; AVX-64:       # %bb.0:
-; AVX-64-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [2,1,2,1]
+; AVX-64-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0]
 ; AVX-64-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX-64-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
 ; AVX-64-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
@@ -1961,7 +1961,7 @@ define <8 x double> @f8xf64_f128(<8 x double> %a) {
 ;
 ; AVX2-64-LABEL: f8xf64_f128:
 ; AVX2-64:       # %bb.0:
-; AVX2-64-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [2,1,2,1]
+; AVX2-64-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0]
 ; AVX2-64-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX2-64-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
 ; AVX2-64-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
@@ -1971,7 +1971,7 @@ define <8 x double> @f8xf64_f128(<8 x double> %a) {
 ;
 ; AVX512F-64-LABEL: f8xf64_f128:
 ; AVX512F-64:       # %bb.0:
-; AVX512F-64-NEXT:    vbroadcastf32x4 {{.*#+}} zmm1 = [2,1,2,1,2,1,2,1]
+; AVX512F-64-NEXT:    vbroadcastf32x4 {{.*#+}} zmm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0]
 ; AVX512F-64-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-64-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
 ; AVX512F-64-NEXT:    vdivpd %zmm0, %zmm1, %zmm0
@@ -1992,7 +1992,7 @@ define <8 x double> @f8xf64_f128(<8 x double> %a) {
 define <8 x double> @f8xf64_f256(<8 x double> %a) {
 ; AVX-LABEL: f8xf64_f256:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovapd {{.*#+}} ymm2 = [4,1,2,3]
+; AVX-NEXT:    vmovapd {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0]
 ; AVX-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
 ; AVX-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
 ; AVX-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
@@ -2001,7 +2001,7 @@ define <8 x double> @f8xf64_f256(<8 x double> %a) {
 ;
 ; AVX2-LABEL: f8xf64_f256:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovapd {{.*#+}} ymm2 = [4,1,2,3]
+; AVX2-NEXT:    vmovapd {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0]
 ; AVX2-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
@@ -2010,7 +2010,7 @@ define <8 x double> @f8xf64_f256(<8 x double> %a) {
 ;
 ; AVX512-LABEL: f8xf64_f256:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm1 = [4,1,2,3,4,1,2,3]
+; AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
 ; AVX512-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
 ; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vdivpd %zmm0, %zmm1, %zmm0
@@ -2018,7 +2018,7 @@ define <8 x double> @f8xf64_f256(<8 x double> %a) {
 ;
 ; AVX-64-LABEL: f8xf64_f256:
 ; AVX-64:       # %bb.0:
-; AVX-64-NEXT:    vmovapd {{.*#+}} ymm2 = [4,1,2,3]
+; AVX-64-NEXT:    vmovapd {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0]
 ; AVX-64-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
 ; AVX-64-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
 ; AVX-64-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
@@ -2027,7 +2027,7 @@ define <8 x double> @f8xf64_f256(<8 x double> %a) {
 ;
 ; AVX2-64-LABEL: f8xf64_f256:
 ; AVX2-64:       # %bb.0:
-; AVX2-64-NEXT:    vmovapd {{.*#+}} ymm2 = [4,1,2,3]
+; AVX2-64-NEXT:    vmovapd {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0]
 ; AVX2-64-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
 ; AVX2-64-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
 ; AVX2-64-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
@@ -2036,7 +2036,7 @@ define <8 x double> @f8xf64_f256(<8 x double> %a) {
 ;
 ; AVX512F-64-LABEL: f8xf64_f256:
 ; AVX512F-64:       # %bb.0:
-; AVX512F-64-NEXT:    vbroadcastf64x4 {{.*#+}} zmm1 = [4,1,2,3,4,1,2,3]
+; AVX512F-64-NEXT:    vbroadcastf64x4 {{.*#+}} zmm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
 ; AVX512F-64-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
 ; AVX512F-64-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
 ; AVX512F-64-NEXT:    vdivpd %zmm0, %zmm1, %zmm0
diff --git a/test/CodeGen/X86/buildvec-insertvec.ll b/test/CodeGen/X86/buildvec-insertvec.ll
index ce7614c16fb..065d87ed888 100644
--- a/test/CodeGen/X86/buildvec-insertvec.ll
+++ b/test/CodeGen/X86/buildvec-insertvec.ll
@@ -65,7 +65,7 @@ entry:
 define <2 x double> @test_negative_zero_2(<2 x double> %A) {
 ; SSE2-LABEL: test_negative_zero_2:
 ; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    movapd {{.*#+}} xmm1 = <u,-0>
+; SSE2-NEXT:    movapd {{.*#+}} xmm1 = <u,-0.0E+0>
 ; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
 ; SSE2-NEXT:    movapd %xmm1, %xmm0
 ; SSE2-NEXT:    retq
diff --git a/test/CodeGen/X86/combine-fabs.ll b/test/CodeGen/X86/combine-fabs.ll
index c71eeb39623..b779c589cf9 100644
--- a/test/CodeGen/X86/combine-fabs.ll
+++ b/test/CodeGen/X86/combine-fabs.ll
@@ -24,12 +24,12 @@ define float @combine_fabs_constant() {
 define <4 x float> @combine_vec_fabs_constant() {
 ; SSE-LABEL: combine_vec_fabs_constant:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,0,2,2]
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0.0E+0,0.0E+0,2.0E+0,2.0E+0]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_fabs_constant:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [0,0,2,2]
+; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [0.0E+0,0.0E+0,2.0E+0,2.0E+0]
 ; AVX-NEXT:    retq
   %1 = call <4 x float> @llvm.fabs.v4f32(<4 x float> <float 0.0, float -0.0, float 2.0, float -2.0>)
   ret <4 x float> %1
diff --git a/test/CodeGen/X86/combine-fcopysign.ll b/test/CodeGen/X86/combine-fcopysign.ll
index 4b416085c5d..5d27fdfa889 100644
--- a/test/CodeGen/X86/combine-fcopysign.ll
+++ b/test/CodeGen/X86/combine-fcopysign.ll
@@ -62,7 +62,7 @@ define <4 x float> @combine_vec_fcopysign_neg_constant0(<4 x float> %x) {
 ;
 ; AVX-LABEL: combine_vec_fcopysign_neg_constant0:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0,-0,-0,-0]
+; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %1 = call <4 x float> @llvm.copysign.v4f32(<4 x float> %x, <4 x float> <float -2.0, float -2.0, float -2.0, float -2.0>)
@@ -77,7 +77,7 @@ define <4 x float> @combine_vec_fcopysign_neg_constant1(<4 x float> %x) {
 ;
 ; AVX-LABEL: combine_vec_fcopysign_neg_constant1:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0,-0,-0,-0]
+; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %1 = call <4 x float> @llvm.copysign.v4f32(<4 x float> %x, <4 x float> <float -0.0, float -2.0, float -4.0, float -8.0>)
@@ -92,7 +92,7 @@ define <4 x float> @combine_vec_fcopysign_fneg_fabs_sgn(<4 x float> %x, <4 x flo
 ;
 ; AVX-LABEL: combine_vec_fcopysign_fneg_fabs_sgn:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0,-0,-0,-0]
+; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %1 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %y)
@@ -112,7 +112,7 @@ define <4 x float> @combine_vec_fcopysign_fabs_mag(<4 x float> %x, <4 x float> %
 ;
 ; AVX-LABEL: combine_vec_fcopysign_fabs_mag:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-0,-0,-0,-0]
+; AVX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; AVX-NEXT:    vandps %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
 ; AVX-NEXT:    vandps %xmm2, %xmm0, %xmm0
@@ -134,7 +134,7 @@ define <4 x float> @combine_vec_fcopysign_fneg_mag(<4 x float> %x, <4 x float> %
 ;
 ; AVX-LABEL: combine_vec_fcopysign_fneg_mag:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-0,-0,-0,-0]
+; AVX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; AVX-NEXT:    vandps %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
 ; AVX-NEXT:    vandps %xmm2, %xmm0, %xmm0
@@ -156,7 +156,7 @@ define <4 x float> @combine_vec_fcopysign_fcopysign_mag(<4 x float> %x, <4 x flo
 ;
 ; AVX-LABEL: combine_vec_fcopysign_fcopysign_mag:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-0,-0,-0,-0]
+; AVX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; AVX-NEXT:    vandps %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
 ; AVX-NEXT:    vandps %xmm2, %xmm0, %xmm0
@@ -178,7 +178,7 @@ define <4 x float> @combine_vec_fcopysign_fcopysign_sgn(<4 x float> %x, <4 x flo
 ;
 ; AVX-LABEL: combine_vec_fcopysign_fcopysign_sgn:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0,-0,-0,-0]
+; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; AVX-NEXT:    vandps %xmm1, %xmm2, %xmm1
 ; AVX-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
 ; AVX-NEXT:    vandps %xmm2, %xmm0, %xmm0
@@ -202,7 +202,7 @@ define <4 x double> @combine_vec_fcopysign_fpext_sgn(<4 x double> %x, <4 x float
 ; SSE-NEXT:    movaps {{.*#+}} xmm7
 ; SSE-NEXT:    movaps %xmm0, %xmm2
 ; SSE-NEXT:    andps %xmm7, %xmm2
-; SSE-NEXT:    movaps {{.*#+}} xmm8 = [-0,-0]
+; SSE-NEXT:    movaps {{.*#+}} xmm8 = [-0.0E+0,-0.0E+0]
 ; SSE-NEXT:    andps %xmm8, %xmm4
 ; SSE-NEXT:    orps %xmm4, %xmm2
 ; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
@@ -232,7 +232,7 @@ define <4 x double> @combine_vec_fcopysign_fpext_sgn(<4 x double> %x, <4 x float
 ; AVX-NEXT:    vbroadcastsd {{.*}}(%rip), %ymm2
 ; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX-NEXT:    vcvtps2pd %xmm1, %ymm1
-; AVX-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [-0,-0,-0,-0]
+; AVX-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    retq
@@ -249,7 +249,7 @@ define <4 x float> @combine_vec_fcopysign_fptrunc_sgn(<4 x float> %x, <4 x doubl
 ; SSE-NEXT:    movaps {{.*#+}} xmm5
 ; SSE-NEXT:    andps %xmm5, %xmm0
 ; SSE-NEXT:    cvtsd2ss %xmm1, %xmm6
-; SSE-NEXT:    movaps {{.*#+}} xmm4 = [-0,-0,-0,-0]
+; SSE-NEXT:    movaps {{.*#+}} xmm4 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; SSE-NEXT:    andps %xmm4, %xmm6
 ; SSE-NEXT:    orps %xmm6, %xmm0
 ; SSE-NEXT:    movshdup {{.*#+}} xmm6 = xmm3[1,1,3,3]
@@ -282,7 +282,7 @@ define <4 x float> @combine_vec_fcopysign_fptrunc_sgn(<4 x float> %x, <4 x doubl
 ; AVX-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
 ; AVX-NEXT:    vandpd %xmm2, %xmm0, %xmm0
 ; AVX-NEXT:    vcvtpd2ps %ymm1, %xmm1
-; AVX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-0,-0,-0,-0]
+; AVX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; AVX-NEXT:    vandpd %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vorpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/cvtv2f32.ll b/test/CodeGen/X86/cvtv2f32.ll
index 974324f4bdb..cda0047ebee 100644
--- a/test/CodeGen/X86/cvtv2f32.ll
+++ b/test/CodeGen/X86/cvtv2f32.ll
@@ -44,7 +44,7 @@ define <2 x float> @uitofp_2i32_buildvector_cvt(i32 %x, i32 %y, <2 x float> %v)
 ; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X32-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; X32-NEXT:    unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; X32-NEXT:    movapd {{.*#+}} xmm1 = [4503599627370496,4503599627370496]
+; X32-NEXT:    movapd {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
 ; X32-NEXT:    orpd %xmm1, %xmm2
 ; X32-NEXT:    subpd %xmm1, %xmm2
 ; X32-NEXT:    cvtpd2ps %xmm2, %xmm1
@@ -56,7 +56,7 @@ define <2 x float> @uitofp_2i32_buildvector_cvt(i32 %x, i32 %y, <2 x float> %v)
 ; X64-NEXT:    movd %esi, %xmm1
 ; X64-NEXT:    movd %edi, %xmm2
 ; X64-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; X64-NEXT:    movdqa {{.*#+}} xmm1 = [4503599627370496,4503599627370496]
+; X64-NEXT:    movdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
 ; X64-NEXT:    por %xmm1, %xmm2
 ; X64-NEXT:    subpd %xmm1, %xmm2
 ; X64-NEXT:    cvtpd2ps %xmm2, %xmm1
@@ -74,7 +74,7 @@ define <2 x float> @uitofp_2i32_legalized(<2 x i32> %in, <2 x float> %v) {
 ; X32:       # %bb.0:
 ; X32-NEXT:    xorps %xmm2, %xmm2
 ; X32-NEXT:    blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; X32-NEXT:    movaps {{.*#+}} xmm0 = [4503599627370496,4503599627370496]
+; X32-NEXT:    movaps {{.*#+}} xmm0 = [4.503599627370496E+15,4.503599627370496E+15]
 ; X32-NEXT:    orps %xmm0, %xmm2
 ; X32-NEXT:    subpd %xmm0, %xmm2
 ; X32-NEXT:    cvtpd2ps %xmm2, %xmm0
@@ -85,7 +85,7 @@ define <2 x float> @uitofp_2i32_legalized(<2 x i32> %in, <2 x float> %v) {
 ; X64:       # %bb.0:
 ; X64-NEXT:    xorps %xmm2, %xmm2
 ; X64-NEXT:    blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; X64-NEXT:    movaps {{.*#+}} xmm0 = [4503599627370496,4503599627370496]
+; X64-NEXT:    movaps {{.*#+}} xmm0 = [4.503599627370496E+15,4.503599627370496E+15]
 ; X64-NEXT:    orps %xmm0, %xmm2
 ; X64-NEXT:    subpd %xmm0, %xmm2
 ; X64-NEXT:    cvtpd2ps %xmm2, %xmm0
diff --git a/test/CodeGen/X86/fma-fneg-combine.ll b/test/CodeGen/X86/fma-fneg-combine.ll
index 6a148397336..35965a8b66e 100644
--- a/test/CodeGen/X86/fma-fneg-combine.ll
+++ b/test/CodeGen/X86/fma-fneg-combine.ll
@@ -141,7 +141,7 @@ define <4 x float> @test11(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 ze
 ;
 ; KNL-LABEL: test11:
 ; KNL:       # %bb.0: # %entry
-; KNL-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-0,-0,-0,-0]
+; KNL-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; KNL-NEXT:    vxorps %xmm3, %xmm2, %xmm3
 ; KNL-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
 ; KNL-NEXT:    kmovw %edi, %k1
diff --git a/test/CodeGen/X86/fma-intrinsics-fast-isel.ll b/test/CodeGen/X86/fma-intrinsics-fast-isel.ll
index fbe282f01ff..d82fe58ec40 100644
--- a/test/CodeGen/X86/fma-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/fma-intrinsics-fast-isel.ll
@@ -160,7 +160,7 @@ entry:
 define <4 x float> @test_mm_fnmsub_ps(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
 ; CHECK-LABEL: test_mm_fnmsub_ps:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vmovaps {{.*#+}} xmm3 = [-0,-0,-0,-0]
+; CHECK-NEXT:    vmovaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; CHECK-NEXT:    vxorps %xmm3, %xmm0, %xmm4
 ; CHECK-NEXT:    vxorps %xmm3, %xmm2, %xmm0
 ; CHECK-NEXT:    vfmadd231ps {{.*#+}} xmm0 = (xmm1 * xmm4) + xmm0
@@ -175,7 +175,7 @@ entry:
 define <2 x double> @test_mm_fnmsub_pd(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
 ; CHECK-LABEL: test_mm_fnmsub_pd:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vmovapd {{.*#+}} xmm3 = [-0,-0]
+; CHECK-NEXT:    vmovapd {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0]
 ; CHECK-NEXT:    vxorpd %xmm3, %xmm0, %xmm4
 ; CHECK-NEXT:    vxorpd %xmm3, %xmm2, %xmm0
 ; CHECK-NEXT:    vfmadd231pd {{.*#+}} xmm0 = (xmm1 * xmm4) + xmm0
@@ -342,7 +342,7 @@ entry:
 define <8 x float> @test_mm256_fnmsub_ps(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
 ; CHECK-LABEL: test_mm256_fnmsub_ps:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [-0,-0,-0,-0,-0,-0,-0,-0]
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; CHECK-NEXT:    vxorps %ymm3, %ymm0, %ymm4
 ; CHECK-NEXT:    vxorps %ymm3, %ymm2, %ymm0
 ; CHECK-NEXT:    vfmadd231ps {{.*#+}} ymm0 = (ymm1 * ymm4) + ymm0
@@ -357,7 +357,7 @@ entry:
 define <4 x double> @test_mm256_fnmsub_pd(<4 x double> %a, <4 x double> %b, <4 x double> %c) {
 ; CHECK-LABEL: test_mm256_fnmsub_pd:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vmovapd {{.*#+}} ymm3 = [-0,-0,-0,-0]
+; CHECK-NEXT:    vmovapd {{.*#+}} ymm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; CHECK-NEXT:    vxorpd %ymm3, %ymm0, %ymm4
 ; CHECK-NEXT:    vxorpd %ymm3, %ymm2, %ymm0
 ; CHECK-NEXT:    vfmadd231pd {{.*#+}} ymm0 = (ymm1 * ymm4) + ymm0
diff --git a/test/CodeGen/X86/fma_patterns.ll b/test/CodeGen/X86/fma_patterns.ll
index 038836bd524..e59d6b66bc6 100644
--- a/test/CodeGen/X86/fma_patterns.ll
+++ b/test/CodeGen/X86/fma_patterns.ll
@@ -791,21 +791,21 @@ define <4 x float> @test_v4f32_mul_y_add_x_negone_undefs(<4 x float> %x, <4 x fl
 define <4 x float> @test_v4f32_mul_sub_one_x_y(<4 x float> %x, <4 x float> %y) {
 ; FMA-INFS-LABEL: test_v4f32_mul_sub_one_x_y:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = [1,1,1,1]
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
 ; FMA-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; FMA-INFS-NEXT:    retq
 ;
 ; FMA4-INFS-LABEL: test_v4f32_mul_sub_one_x_y:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = [1,1,1,1]
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA4-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
 ; FMA4-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; FMA4-INFS-NEXT:    retq
 ;
 ; AVX512-INFS-LABEL: test_v4f32_mul_sub_one_x_y:
 ; AVX512-INFS:       # %bb.0:
-; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1]
+; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX512-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
 ; AVX512-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; AVX512-INFS-NEXT:    retq
@@ -832,21 +832,21 @@ define <4 x float> @test_v4f32_mul_sub_one_x_y(<4 x float> %x, <4 x float> %y) {
 define <4 x float> @test_v4f32_mul_y_sub_one_x(<4 x float> %x, <4 x float> %y) {
 ; FMA-INFS-LABEL: test_v4f32_mul_y_sub_one_x:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = [1,1,1,1]
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
 ; FMA-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
 ; FMA-INFS-NEXT:    retq
 ;
 ; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_one_x:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = [1,1,1,1]
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA4-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
 ; FMA4-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
 ; FMA4-INFS-NEXT:    retq
 ;
 ; AVX512-INFS-LABEL: test_v4f32_mul_y_sub_one_x:
 ; AVX512-INFS:       # %bb.0:
-; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1]
+; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX512-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
 ; AVX512-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
 ; AVX512-INFS-NEXT:    retq
@@ -873,21 +873,21 @@ define <4 x float> @test_v4f32_mul_y_sub_one_x(<4 x float> %x, <4 x float> %y) {
 define <4 x float> @test_v4f32_mul_y_sub_one_x_undefs(<4 x float> %x, <4 x float> %y) {
 ; FMA-INFS-LABEL: test_v4f32_mul_y_sub_one_x_undefs:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = <1,u,1,1>
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = <1.0E+0,u,1.0E+0,1.0E+0>
 ; FMA-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
 ; FMA-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
 ; FMA-INFS-NEXT:    retq
 ;
 ; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_one_x_undefs:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = <1,u,1,1>
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = <1.0E+0,u,1.0E+0,1.0E+0>
 ; FMA4-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
 ; FMA4-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
 ; FMA4-INFS-NEXT:    retq
 ;
 ; AVX512-INFS-LABEL: test_v4f32_mul_y_sub_one_x_undefs:
 ; AVX512-INFS:       # %bb.0:
-; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1]
+; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX512-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
 ; AVX512-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
 ; AVX512-INFS-NEXT:    retq
@@ -914,21 +914,21 @@ define <4 x float> @test_v4f32_mul_y_sub_one_x_undefs(<4 x float> %x, <4 x float
 define <4 x float> @test_v4f32_mul_sub_negone_x_y(<4 x float> %x, <4 x float> %y) {
 ; FMA-INFS-LABEL: test_v4f32_mul_sub_negone_x_y:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = [-1,-1,-1,-1]
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
 ; FMA-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
 ; FMA-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; FMA-INFS-NEXT:    retq
 ;
 ; FMA4-INFS-LABEL: test_v4f32_mul_sub_negone_x_y:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = [-1,-1,-1,-1]
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
 ; FMA4-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
 ; FMA4-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; FMA4-INFS-NEXT:    retq
 ;
 ; AVX512-INFS-LABEL: test_v4f32_mul_sub_negone_x_y:
 ; AVX512-INFS:       # %bb.0:
-; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-1,-1,-1,-1]
+; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
 ; AVX512-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
 ; AVX512-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; AVX512-INFS-NEXT:    retq
@@ -955,21 +955,21 @@ define <4 x float> @test_v4f32_mul_sub_negone_x_y(<4 x float> %x, <4 x float> %y
 define <4 x float> @test_v4f32_mul_y_sub_negone_x(<4 x float> %x, <4 x float> %y) {
 ; FMA-INFS-LABEL: test_v4f32_mul_y_sub_negone_x:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = [-1,-1,-1,-1]
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
 ; FMA-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
 ; FMA-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
 ; FMA-INFS-NEXT:    retq
 ;
 ; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_negone_x:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = [-1,-1,-1,-1]
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
 ; FMA4-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
 ; FMA4-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
 ; FMA4-INFS-NEXT:    retq
 ;
 ; AVX512-INFS-LABEL: test_v4f32_mul_y_sub_negone_x:
 ; AVX512-INFS:       # %bb.0:
-; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-1,-1,-1,-1]
+; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
 ; AVX512-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
 ; AVX512-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
 ; AVX512-INFS-NEXT:    retq
@@ -996,21 +996,21 @@ define <4 x float> @test_v4f32_mul_y_sub_negone_x(<4 x float> %x, <4 x float> %y
 define <4 x float> @test_v4f32_mul_y_sub_negone_x_undefs(<4 x float> %x, <4 x float> %y) {
 ; FMA-INFS-LABEL: test_v4f32_mul_y_sub_negone_x_undefs:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = <-1,-1,u,-1>
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = <-1.0E+0,-1.0E+0,u,-1.0E+0>
 ; FMA-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
 ; FMA-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
 ; FMA-INFS-NEXT:    retq
 ;
 ; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_negone_x_undefs:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = <-1,-1,u,-1>
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = <-1.0E+0,-1.0E+0,u,-1.0E+0>
 ; FMA4-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
 ; FMA4-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
 ; FMA4-INFS-NEXT:    retq
 ;
 ; AVX512-INFS-LABEL: test_v4f32_mul_y_sub_negone_x_undefs:
 ; AVX512-INFS:       # %bb.0:
-; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-1,-1,-1,-1]
+; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
 ; AVX512-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
 ; AVX512-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
 ; AVX512-INFS-NEXT:    retq
@@ -1318,7 +1318,7 @@ define float @test_f32_interp(float %x, float %y, float %t) {
 define <4 x float> @test_v4f32_interp(<4 x float> %x, <4 x float> %y, <4 x float> %t) {
 ; FMA-INFS-LABEL: test_v4f32_interp:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovaps {{.*#+}} xmm3 = [1,1,1,1]
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-INFS-NEXT:    vsubps %xmm2, %xmm3, %xmm3
 ; FMA-INFS-NEXT:    vmulps %xmm3, %xmm1, %xmm1
 ; FMA-INFS-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1
@@ -1326,7 +1326,7 @@ define <4 x float> @test_v4f32_interp(<4 x float> %x, <4 x float> %y, <4 x float
 ;
 ; FMA4-INFS-LABEL: test_v4f32_interp:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovaps {{.*#+}} xmm3 = [1,1,1,1]
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA4-INFS-NEXT:    vsubps %xmm2, %xmm3, %xmm3
 ; FMA4-INFS-NEXT:    vmulps %xmm3, %xmm1, %xmm1
 ; FMA4-INFS-NEXT:    vfmaddps %xmm1, %xmm2, %xmm0, %xmm0
@@ -1334,7 +1334,7 @@ define <4 x float> @test_v4f32_interp(<4 x float> %x, <4 x float> %y, <4 x float
 ;
 ; AVX512-INFS-LABEL: test_v4f32_interp:
 ; AVX512-INFS:       # %bb.0:
-; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} xmm3 = [1,1,1,1]
+; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX512-INFS-NEXT:    vsubps %xmm2, %xmm3, %xmm3
 ; AVX512-INFS-NEXT:    vmulps %xmm3, %xmm1, %xmm1
 ; AVX512-INFS-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1
@@ -1367,7 +1367,7 @@ define <4 x float> @test_v4f32_interp(<4 x float> %x, <4 x float> %y, <4 x float
 define <8 x float> @test_v8f32_interp(<8 x float> %x, <8 x float> %y, <8 x float> %t) {
 ; FMA-INFS-LABEL: test_v8f32_interp:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-INFS-NEXT:    vsubps %ymm2, %ymm3, %ymm3
 ; FMA-INFS-NEXT:    vmulps %ymm3, %ymm1, %ymm1
 ; FMA-INFS-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1
@@ -1375,7 +1375,7 @@ define <8 x float> @test_v8f32_interp(<8 x float> %x, <8 x float> %y, <8 x float
 ;
 ; FMA4-INFS-LABEL: test_v8f32_interp:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA4-INFS-NEXT:    vsubps %ymm2, %ymm3, %ymm3
 ; FMA4-INFS-NEXT:    vmulps %ymm3, %ymm1, %ymm1
 ; FMA4-INFS-NEXT:    vfmaddps %ymm1, %ymm2, %ymm0, %ymm0
@@ -1383,7 +1383,7 @@ define <8 x float> @test_v8f32_interp(<8 x float> %x, <8 x float> %y, <8 x float
 ;
 ; AVX512-INFS-LABEL: test_v8f32_interp:
 ; AVX512-INFS:       # %bb.0:
-; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
+; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX512-INFS-NEXT:    vsubps %ymm2, %ymm3, %ymm3
 ; AVX512-INFS-NEXT:    vmulps %ymm3, %ymm1, %ymm1
 ; AVX512-INFS-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1
@@ -1465,7 +1465,7 @@ define double @test_f64_interp(double %x, double %y, double %t) {
 define <2 x double> @test_v2f64_interp(<2 x double> %x, <2 x double> %y, <2 x double> %t) {
 ; FMA-INFS-LABEL: test_v2f64_interp:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovapd {{.*#+}} xmm3 = [1,1]
+; FMA-INFS-NEXT:    vmovapd {{.*#+}} xmm3 = [1.0E+0,1.0E+0]
 ; FMA-INFS-NEXT:    vsubpd %xmm2, %xmm3, %xmm3
 ; FMA-INFS-NEXT:    vmulpd %xmm3, %xmm1, %xmm1
 ; FMA-INFS-NEXT:    vfmadd213pd {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1
@@ -1473,7 +1473,7 @@ define <2 x double> @test_v2f64_interp(<2 x double> %x, <2 x double> %y, <2 x do
 ;
 ; FMA4-INFS-LABEL: test_v2f64_interp:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovapd {{.*#+}} xmm3 = [1,1]
+; FMA4-INFS-NEXT:    vmovapd {{.*#+}} xmm3 = [1.0E+0,1.0E+0]
 ; FMA4-INFS-NEXT:    vsubpd %xmm2, %xmm3, %xmm3
 ; FMA4-INFS-NEXT:    vmulpd %xmm3, %xmm1, %xmm1
 ; FMA4-INFS-NEXT:    vfmaddpd %xmm1, %xmm2, %xmm0, %xmm0
@@ -1481,7 +1481,7 @@ define <2 x double> @test_v2f64_interp(<2 x double> %x, <2 x double> %y, <2 x do
 ;
 ; AVX512-INFS-LABEL: test_v2f64_interp:
 ; AVX512-INFS:       # %bb.0:
-; AVX512-INFS-NEXT:    vmovapd {{.*#+}} xmm3 = [1,1]
+; AVX512-INFS-NEXT:    vmovapd {{.*#+}} xmm3 = [1.0E+0,1.0E+0]
 ; AVX512-INFS-NEXT:    vsubpd %xmm2, %xmm3, %xmm3
 ; AVX512-INFS-NEXT:    vmulpd %xmm3, %xmm1, %xmm1
 ; AVX512-INFS-NEXT:    vfmadd213pd {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1
@@ -1514,7 +1514,7 @@ define <2 x double> @test_v2f64_interp(<2 x double> %x, <2 x double> %y, <2 x do
 define <4 x double> @test_v4f64_interp(<4 x double> %x, <4 x double> %y, <4 x double> %t) {
 ; FMA-INFS-LABEL: test_v4f64_interp:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovapd {{.*#+}} ymm3 = [1,1,1,1]
+; FMA-INFS-NEXT:    vmovapd {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-INFS-NEXT:    vsubpd %ymm2, %ymm3, %ymm3
 ; FMA-INFS-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
 ; FMA-INFS-NEXT:    vfmadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1
@@ -1522,7 +1522,7 @@ define <4 x double> @test_v4f64_interp(<4 x double> %x, <4 x double> %y, <4 x do
 ;
 ; FMA4-INFS-LABEL: test_v4f64_interp:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovapd {{.*#+}} ymm3 = [1,1,1,1]
+; FMA4-INFS-NEXT:    vmovapd {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA4-INFS-NEXT:    vsubpd %ymm2, %ymm3, %ymm3
 ; FMA4-INFS-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
 ; FMA4-INFS-NEXT:    vfmaddpd %ymm1, %ymm2, %ymm0, %ymm0
@@ -1530,7 +1530,7 @@ define <4 x double> @test_v4f64_interp(<4 x double> %x, <4 x double> %y, <4 x do
 ;
 ; AVX512-INFS-LABEL: test_v4f64_interp:
 ; AVX512-INFS:       # %bb.0:
-; AVX512-INFS-NEXT:    vbroadcastsd {{.*#+}} ymm3 = [1,1,1,1]
+; AVX512-INFS-NEXT:    vbroadcastsd {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX512-INFS-NEXT:    vsubpd %ymm2, %ymm3, %ymm3
 ; AVX512-INFS-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
 ; AVX512-INFS-NEXT:    vfmadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1
diff --git a/test/CodeGen/X86/fma_patterns_wide.ll b/test/CodeGen/X86/fma_patterns_wide.ll
index 2bd64135712..bef31d8a8cc 100644
--- a/test/CodeGen/X86/fma_patterns_wide.ll
+++ b/test/CodeGen/X86/fma_patterns_wide.ll
@@ -259,7 +259,7 @@ define <8 x double> @test_8f64_fmsub_load(<8 x double>* %a0, <8 x double> %a1, <
 define <16 x float> @test_v16f32_mul_add_x_one_y(<16 x float> %x, <16 x float> %y) {
 ; FMA-INFS-LABEL: test_v16f32_mul_add_x_one_y:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1]
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-INFS-NEXT:    vaddps %ymm4, %ymm1, %ymm1
 ; FMA-INFS-NEXT:    vaddps %ymm4, %ymm0, %ymm0
 ; FMA-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
@@ -268,7 +268,7 @@ define <16 x float> @test_v16f32_mul_add_x_one_y(<16 x float> %x, <16 x float> %
 ;
 ; FMA4-INFS-LABEL: test_v16f32_mul_add_x_one_y:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1]
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA4-INFS-NEXT:    vaddps %ymm4, %ymm1, %ymm1
 ; FMA4-INFS-NEXT:    vaddps %ymm4, %ymm0, %ymm0
 ; FMA4-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
@@ -305,7 +305,7 @@ define <16 x float> @test_v16f32_mul_add_x_one_y(<16 x float> %x, <16 x float> %
 define <8 x double> @test_v8f64_mul_y_add_x_one(<8 x double> %x, <8 x double> %y) {
 ; FMA-INFS-LABEL: test_v8f64_mul_y_add_x_one:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [1,1,1,1]
+; FMA-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-INFS-NEXT:    vaddpd %ymm4, %ymm1, %ymm1
 ; FMA-INFS-NEXT:    vaddpd %ymm4, %ymm0, %ymm0
 ; FMA-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
@@ -314,7 +314,7 @@ define <8 x double> @test_v8f64_mul_y_add_x_one(<8 x double> %x, <8 x double> %y
 ;
 ; FMA4-INFS-LABEL: test_v8f64_mul_y_add_x_one:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [1,1,1,1]
+; FMA4-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA4-INFS-NEXT:    vaddpd %ymm4, %ymm1, %ymm1
 ; FMA4-INFS-NEXT:    vaddpd %ymm4, %ymm0, %ymm0
 ; FMA4-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
@@ -351,7 +351,7 @@ define <8 x double> @test_v8f64_mul_y_add_x_one(<8 x double> %x, <8 x double> %y
 define <16 x float> @test_v16f32_mul_add_x_negone_y(<16 x float> %x, <16 x float> %y) {
 ; FMA-INFS-LABEL: test_v16f32_mul_add_x_negone_y:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [-1,-1,-1,-1,-1,-1,-1,-1]
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
 ; FMA-INFS-NEXT:    vaddps %ymm4, %ymm1, %ymm1
 ; FMA-INFS-NEXT:    vaddps %ymm4, %ymm0, %ymm0
 ; FMA-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
@@ -360,7 +360,7 @@ define <16 x float> @test_v16f32_mul_add_x_negone_y(<16 x float> %x, <16 x float
 ;
 ; FMA4-INFS-LABEL: test_v16f32_mul_add_x_negone_y:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [-1,-1,-1,-1,-1,-1,-1,-1]
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
 ; FMA4-INFS-NEXT:    vaddps %ymm4, %ymm1, %ymm1
 ; FMA4-INFS-NEXT:    vaddps %ymm4, %ymm0, %ymm0
 ; FMA4-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
@@ -397,7 +397,7 @@ define <16 x float> @test_v16f32_mul_add_x_negone_y(<16 x float> %x, <16 x float
 define <8 x double> @test_v8f64_mul_y_add_x_negone(<8 x double> %x, <8 x double> %y) {
 ; FMA-INFS-LABEL: test_v8f64_mul_y_add_x_negone:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [-1,-1,-1,-1]
+; FMA-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
 ; FMA-INFS-NEXT:    vaddpd %ymm4, %ymm1, %ymm1
 ; FMA-INFS-NEXT:    vaddpd %ymm4, %ymm0, %ymm0
 ; FMA-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
@@ -406,7 +406,7 @@ define <8 x double> @test_v8f64_mul_y_add_x_negone(<8 x double> %x, <8 x double>
 ;
 ; FMA4-INFS-LABEL: test_v8f64_mul_y_add_x_negone:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [-1,-1,-1,-1]
+; FMA4-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
 ; FMA4-INFS-NEXT:    vaddpd %ymm4, %ymm1, %ymm1
 ; FMA4-INFS-NEXT:    vaddpd %ymm4, %ymm0, %ymm0
 ; FMA4-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
@@ -443,7 +443,7 @@ define <8 x double> @test_v8f64_mul_y_add_x_negone(<8 x double> %x, <8 x double>
 define <16 x float> @test_v16f32_mul_sub_one_x_y(<16 x float> %x, <16 x float> %y) {
 ; FMA-INFS-LABEL: test_v16f32_mul_sub_one_x_y:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1]
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-INFS-NEXT:    vsubps %ymm1, %ymm4, %ymm1
 ; FMA-INFS-NEXT:    vsubps %ymm0, %ymm4, %ymm0
 ; FMA-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
@@ -452,7 +452,7 @@ define <16 x float> @test_v16f32_mul_sub_one_x_y(<16 x float> %x, <16 x float> %
 ;
 ; FMA4-INFS-LABEL: test_v16f32_mul_sub_one_x_y:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1]
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA4-INFS-NEXT:    vsubps %ymm1, %ymm4, %ymm1
 ; FMA4-INFS-NEXT:    vsubps %ymm0, %ymm4, %ymm0
 ; FMA4-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
@@ -461,7 +461,7 @@ define <16 x float> @test_v16f32_mul_sub_one_x_y(<16 x float> %x, <16 x float> %
 ;
 ; AVX512-INFS-LABEL: test_v16f32_mul_sub_one_x_y:
 ; AVX512-INFS:       # %bb.0:
-; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX512-INFS-NEXT:    vsubps %zmm0, %zmm2, %zmm0
 ; AVX512-INFS-NEXT:    vmulps %zmm1, %zmm0, %zmm0
 ; AVX512-INFS-NEXT:    retq
@@ -490,7 +490,7 @@ define <16 x float> @test_v16f32_mul_sub_one_x_y(<16 x float> %x, <16 x float> %
 define <8 x double> @test_v8f64_mul_y_sub_one_x(<8 x double> %x, <8 x double> %y) {
 ; FMA-INFS-LABEL: test_v8f64_mul_y_sub_one_x:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [1,1,1,1]
+; FMA-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-INFS-NEXT:    vsubpd %ymm1, %ymm4, %ymm1
 ; FMA-INFS-NEXT:    vsubpd %ymm0, %ymm4, %ymm0
 ; FMA-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
@@ -499,7 +499,7 @@ define <8 x double> @test_v8f64_mul_y_sub_one_x(<8 x double> %x, <8 x double> %y
 ;
 ; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_one_x:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [1,1,1,1]
+; FMA4-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA4-INFS-NEXT:    vsubpd %ymm1, %ymm4, %ymm1
 ; FMA4-INFS-NEXT:    vsubpd %ymm0, %ymm4, %ymm0
 ; FMA4-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
@@ -508,7 +508,7 @@ define <8 x double> @test_v8f64_mul_y_sub_one_x(<8 x double> %x, <8 x double> %y
 ;
 ; AVX512-INFS-LABEL: test_v8f64_mul_y_sub_one_x:
 ; AVX512-INFS:       # %bb.0:
-; AVX512-INFS-NEXT:    vbroadcastsd {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1]
+; AVX512-INFS-NEXT:    vbroadcastsd {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX512-INFS-NEXT:    vsubpd %zmm0, %zmm2, %zmm0
 ; AVX512-INFS-NEXT:    vmulpd %zmm0, %zmm1, %zmm0
 ; AVX512-INFS-NEXT:    retq
@@ -537,7 +537,7 @@ define <8 x double> @test_v8f64_mul_y_sub_one_x(<8 x double> %x, <8 x double> %y
 define <16 x float> @test_v16f32_mul_sub_negone_x_y(<16 x float> %x, <16 x float> %y) {
 ; FMA-INFS-LABEL: test_v16f32_mul_sub_negone_x_y:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [-1,-1,-1,-1,-1,-1,-1,-1]
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
 ; FMA-INFS-NEXT:    vsubps %ymm1, %ymm4, %ymm1
 ; FMA-INFS-NEXT:    vsubps %ymm0, %ymm4, %ymm0
 ; FMA-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
@@ -546,7 +546,7 @@ define <16 x float> @test_v16f32_mul_sub_negone_x_y(<16 x float> %x, <16 x float
 ;
 ; FMA4-INFS-LABEL: test_v16f32_mul_sub_negone_x_y:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [-1,-1,-1,-1,-1,-1,-1,-1]
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
 ; FMA4-INFS-NEXT:    vsubps %ymm1, %ymm4, %ymm1
 ; FMA4-INFS-NEXT:    vsubps %ymm0, %ymm4, %ymm0
 ; FMA4-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
@@ -555,7 +555,7 @@ define <16 x float> @test_v16f32_mul_sub_negone_x_y(<16 x float> %x, <16 x float
 ;
 ; AVX512-INFS-LABEL: test_v16f32_mul_sub_negone_x_y:
 ; AVX512-INFS:       # %bb.0:
-; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} zmm2 = [-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1]
+; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} zmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
 ; AVX512-INFS-NEXT:    vsubps %zmm0, %zmm2, %zmm0
 ; AVX512-INFS-NEXT:    vmulps %zmm1, %zmm0, %zmm0
 ; AVX512-INFS-NEXT:    retq
@@ -584,7 +584,7 @@ define <16 x float> @test_v16f32_mul_sub_negone_x_y(<16 x float> %x, <16 x float
 define <8 x double> @test_v8f64_mul_y_sub_negone_x(<8 x double> %x, <8 x double> %y) {
 ; FMA-INFS-LABEL: test_v8f64_mul_y_sub_negone_x:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [-1,-1,-1,-1]
+; FMA-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
 ; FMA-INFS-NEXT:    vsubpd %ymm1, %ymm4, %ymm1
 ; FMA-INFS-NEXT:    vsubpd %ymm0, %ymm4, %ymm0
 ; FMA-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
@@ -593,7 +593,7 @@ define <8 x double> @test_v8f64_mul_y_sub_negone_x(<8 x double> %x, <8 x double>
 ;
 ; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_negone_x:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [-1,-1,-1,-1]
+; FMA4-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
 ; FMA4-INFS-NEXT:    vsubpd %ymm1, %ymm4, %ymm1
 ; FMA4-INFS-NEXT:    vsubpd %ymm0, %ymm4, %ymm0
 ; FMA4-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
@@ -602,7 +602,7 @@ define <8 x double> @test_v8f64_mul_y_sub_negone_x(<8 x double> %x, <8 x double>
 ;
 ; AVX512-INFS-LABEL: test_v8f64_mul_y_sub_negone_x:
 ; AVX512-INFS:       # %bb.0:
-; AVX512-INFS-NEXT:    vbroadcastsd {{.*#+}} zmm2 = [-1,-1,-1,-1,-1,-1,-1,-1]
+; AVX512-INFS-NEXT:    vbroadcastsd {{.*#+}} zmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
 ; AVX512-INFS-NEXT:    vsubpd %zmm0, %zmm2, %zmm0
 ; AVX512-INFS-NEXT:    vmulpd %zmm0, %zmm1, %zmm0
 ; AVX512-INFS-NEXT:    retq
@@ -631,7 +631,7 @@ define <8 x double> @test_v8f64_mul_y_sub_negone_x(<8 x double> %x, <8 x double>
 define <16 x float> @test_v16f32_mul_sub_x_one_y(<16 x float> %x, <16 x float> %y) {
 ; FMA-INFS-LABEL: test_v16f32_mul_sub_x_one_y:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1]
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-INFS-NEXT:    vsubps %ymm4, %ymm1, %ymm1
 ; FMA-INFS-NEXT:    vsubps %ymm4, %ymm0, %ymm0
 ; FMA-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
@@ -640,7 +640,7 @@ define <16 x float> @test_v16f32_mul_sub_x_one_y(<16 x float> %x, <16 x float> %
 ;
 ; FMA4-INFS-LABEL: test_v16f32_mul_sub_x_one_y:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1]
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA4-INFS-NEXT:    vsubps %ymm4, %ymm1, %ymm1
 ; FMA4-INFS-NEXT:    vsubps %ymm4, %ymm0, %ymm0
 ; FMA4-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
@@ -677,7 +677,7 @@ define <16 x float> @test_v16f32_mul_sub_x_one_y(<16 x float> %x, <16 x float> %
 define <8 x double> @test_v8f64_mul_y_sub_x_one(<8 x double> %x, <8 x double> %y) {
 ; FMA-INFS-LABEL: test_v8f64_mul_y_sub_x_one:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [1,1,1,1]
+; FMA-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-INFS-NEXT:    vsubpd %ymm4, %ymm1, %ymm1
 ; FMA-INFS-NEXT:    vsubpd %ymm4, %ymm0, %ymm0
 ; FMA-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
@@ -686,7 +686,7 @@ define <8 x double> @test_v8f64_mul_y_sub_x_one(<8 x double> %x, <8 x double> %y
 ;
 ; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_x_one:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [1,1,1,1]
+; FMA4-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA4-INFS-NEXT:    vsubpd %ymm4, %ymm1, %ymm1
 ; FMA4-INFS-NEXT:    vsubpd %ymm4, %ymm0, %ymm0
 ; FMA4-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
@@ -723,7 +723,7 @@ define <8 x double> @test_v8f64_mul_y_sub_x_one(<8 x double> %x, <8 x double> %y
 define <16 x float> @test_v16f32_mul_sub_x_negone_y(<16 x float> %x, <16 x float> %y) {
 ; FMA-INFS-LABEL: test_v16f32_mul_sub_x_negone_y:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [-1,-1,-1,-1,-1,-1,-1,-1]
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
 ; FMA-INFS-NEXT:    vsubps %ymm4, %ymm1, %ymm1
 ; FMA-INFS-NEXT:    vsubps %ymm4, %ymm0, %ymm0
 ; FMA-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
@@ -732,7 +732,7 @@ define <16 x float> @test_v16f32_mul_sub_x_negone_y(<16 x float> %x, <16 x float
 ;
 ; FMA4-INFS-LABEL: test_v16f32_mul_sub_x_negone_y:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [-1,-1,-1,-1,-1,-1,-1,-1]
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
 ; FMA4-INFS-NEXT:    vsubps %ymm4, %ymm1, %ymm1
 ; FMA4-INFS-NEXT:    vsubps %ymm4, %ymm0, %ymm0
 ; FMA4-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
@@ -769,7 +769,7 @@ define <16 x float> @test_v16f32_mul_sub_x_negone_y(<16 x float> %x, <16 x float
 define <8 x double> @test_v8f64_mul_y_sub_x_negone(<8 x double> %x, <8 x double> %y) {
 ; FMA-INFS-LABEL: test_v8f64_mul_y_sub_x_negone:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [-1,-1,-1,-1]
+; FMA-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
 ; FMA-INFS-NEXT:    vsubpd %ymm4, %ymm1, %ymm1
 ; FMA-INFS-NEXT:    vsubpd %ymm4, %ymm0, %ymm0
 ; FMA-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
@@ -778,7 +778,7 @@ define <8 x double> @test_v8f64_mul_y_sub_x_negone(<8 x double> %x, <8 x double>
 ;
 ; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_x_negone:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [-1,-1,-1,-1]
+; FMA4-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
 ; FMA4-INFS-NEXT:    vsubpd %ymm4, %ymm1, %ymm1
 ; FMA4-INFS-NEXT:    vsubpd %ymm4, %ymm0, %ymm0
 ; FMA4-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
@@ -819,7 +819,7 @@ define <8 x double> @test_v8f64_mul_y_sub_x_negone(<8 x double> %x, <8 x double>
 define <16 x float> @test_v16f32_interp(<16 x float> %x, <16 x float> %y, <16 x float> %t) {
 ; FMA-INFS-LABEL: test_v16f32_interp:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovaps {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1]
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-INFS-NEXT:    vsubps %ymm4, %ymm6, %ymm7
 ; FMA-INFS-NEXT:    vsubps %ymm5, %ymm6, %ymm6
 ; FMA-INFS-NEXT:    vmulps %ymm6, %ymm3, %ymm3
@@ -830,7 +830,7 @@ define <16 x float> @test_v16f32_interp(<16 x float> %x, <16 x float> %y, <16 x
 ;
 ; FMA4-INFS-LABEL: test_v16f32_interp:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovaps {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1]
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA4-INFS-NEXT:    vsubps %ymm4, %ymm6, %ymm7
 ; FMA4-INFS-NEXT:    vsubps %ymm5, %ymm6, %ymm6
 ; FMA4-INFS-NEXT:    vmulps %ymm6, %ymm3, %ymm3
@@ -841,7 +841,7 @@ define <16 x float> @test_v16f32_interp(<16 x float> %x, <16 x float> %y, <16 x
 ;
 ; AVX512-INFS-LABEL: test_v16f32_interp:
 ; AVX512-INFS:       # %bb.0:
-; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} zmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} zmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX512-INFS-NEXT:    vsubps %zmm2, %zmm3, %zmm3
 ; AVX512-INFS-NEXT:    vmulps %zmm3, %zmm1, %zmm1
 ; AVX512-INFS-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm2 * zmm0) + zmm1
@@ -878,7 +878,7 @@ define <16 x float> @test_v16f32_interp(<16 x float> %x, <16 x float> %y, <16 x
 define <8 x double> @test_v8f64_interp(<8 x double> %x, <8 x double> %y, <8 x double> %t) {
 ; FMA-INFS-LABEL: test_v8f64_interp:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovapd {{.*#+}} ymm6 = [1,1,1,1]
+; FMA-INFS-NEXT:    vmovapd {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-INFS-NEXT:    vsubpd %ymm4, %ymm6, %ymm7
 ; FMA-INFS-NEXT:    vsubpd %ymm5, %ymm6, %ymm6
 ; FMA-INFS-NEXT:    vmulpd %ymm6, %ymm3, %ymm3
@@ -889,7 +889,7 @@ define <8 x double> @test_v8f64_interp(<8 x double> %x, <8 x double> %y, <8 x do
 ;
 ; FMA4-INFS-LABEL: test_v8f64_interp:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovapd {{.*#+}} ymm6 = [1,1,1,1]
+; FMA4-INFS-NEXT:    vmovapd {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA4-INFS-NEXT:    vsubpd %ymm4, %ymm6, %ymm7
 ; FMA4-INFS-NEXT:    vsubpd %ymm5, %ymm6, %ymm6
 ; FMA4-INFS-NEXT:    vmulpd %ymm6, %ymm3, %ymm3
@@ -900,7 +900,7 @@ define <8 x double> @test_v8f64_interp(<8 x double> %x, <8 x double> %y, <8 x do
 ;
 ; AVX512-INFS-LABEL: test_v8f64_interp:
 ; AVX512-INFS:       # %bb.0:
-; AVX512-INFS-NEXT:    vbroadcastsd {{.*#+}} zmm3 = [1,1,1,1,1,1,1,1]
+; AVX512-INFS-NEXT:    vbroadcastsd {{.*#+}} zmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX512-INFS-NEXT:    vsubpd %zmm2, %zmm3, %zmm3
 ; AVX512-INFS-NEXT:    vmulpd %zmm3, %zmm1, %zmm1
 ; AVX512-INFS-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm2 * zmm0) + zmm1
@@ -1143,7 +1143,7 @@ define <8 x double> @test_v8f64_fneg_fmul_no_nsz(<8 x double> %x, <8 x double> %
 ; FMA:       # %bb.0:
 ; FMA-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
 ; FMA-NEXT:    vmulpd %ymm2, %ymm0, %ymm0
-; FMA-NEXT:    vmovapd {{.*#+}} ymm2 = [-0,-0,-0,-0]
+; FMA-NEXT:    vmovapd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; FMA-NEXT:    vxorpd %ymm2, %ymm0, %ymm0
 ; FMA-NEXT:    vxorpd %ymm2, %ymm1, %ymm1
 ; FMA-NEXT:    retq
@@ -1152,7 +1152,7 @@ define <8 x double> @test_v8f64_fneg_fmul_no_nsz(<8 x double> %x, <8 x double> %
 ; FMA4:       # %bb.0:
 ; FMA4-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
 ; FMA4-NEXT:    vmulpd %ymm2, %ymm0, %ymm0
-; FMA4-NEXT:    vmovapd {{.*#+}} ymm2 = [-0,-0,-0,-0]
+; FMA4-NEXT:    vmovapd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; FMA4-NEXT:    vxorpd %ymm2, %ymm0, %ymm0
 ; FMA4-NEXT:    vxorpd %ymm2, %ymm1, %ymm1
 ; FMA4-NEXT:    retq
diff --git a/test/CodeGen/X86/fmul-combines.ll b/test/CodeGen/X86/fmul-combines.ll
index 85f86110d05..f9843dced1b 100644
--- a/test/CodeGen/X86/fmul-combines.ll
+++ b/test/CodeGen/X86/fmul-combines.ll
@@ -61,7 +61,7 @@ define <4 x float> @fmul2_v4f32_undef(<4 x float> %x) {
 define <4 x float> @constant_fold_fmul_v4f32(<4 x float> %x) {
 ; CHECK-LABEL: constant_fold_fmul_v4f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [8,8,8,8]
+; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [8.0E+0,8.0E+0,8.0E+0,8.0E+0]
 ; CHECK-NEXT:    retq
   %y = fmul <4 x float> <float 4.0, float 4.0, float 4.0, float 4.0>, <float 2.0, float 2.0, float 2.0, float 2.0>
   ret <4 x float> %y
@@ -70,7 +70,7 @@ define <4 x float> @constant_fold_fmul_v4f32(<4 x float> %x) {
 define <4 x float> @constant_fold_fmul_v4f32_undef(<4 x float> %x) {
 ; CHECK-LABEL: constant_fold_fmul_v4f32_undef:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [8,NaN,8,NaN]
+; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [8.0E+0,NaN,8.0E+0,NaN]
 ; CHECK-NEXT:    retq
   %y = fmul <4 x float> <float 4.0, float undef, float 4.0, float 4.0>, <float 2.0, float 2.0, float 2.0, float undef>
   ret <4 x float> %y
diff --git a/test/CodeGen/X86/fold-vector-trunc-sitofp.ll b/test/CodeGen/X86/fold-vector-trunc-sitofp.ll
index e53e7f8f9c6..73c7dc1fae5 100644
--- a/test/CodeGen/X86/fold-vector-trunc-sitofp.ll
+++ b/test/CodeGen/X86/fold-vector-trunc-sitofp.ll
@@ -7,7 +7,7 @@
 define <4 x float> @test1() {
 ; CHECK-LABEL: test1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps {{.*#+}} xmm0 = [-1,0,-1,0]
+; CHECK-NEXT:    vmovaps {{.*#+}} xmm0 = [-1.0E+0,0.0E+0,-1.0E+0,0.0E+0]
 ; CHECK-NEXT:    ret{{[l|q]}}
   %1 = trunc <4 x i3> <i3 -1, i3 -22, i3 7, i3 8> to <4 x i1>
   %2 = sitofp <4 x i1> %1 to <4 x float>
diff --git a/test/CodeGen/X86/insert-into-constant-vector.ll b/test/CodeGen/X86/insert-into-constant-vector.ll
index 3c8fbc5819e..9a70bc8fffd 100644
--- a/test/CodeGen/X86/insert-into-constant-vector.ll
+++ b/test/CodeGen/X86/insert-into-constant-vector.ll
@@ -167,40 +167,40 @@ define <4 x float> @elt1_v4f32(float %x) {
 ; X32SSE2-LABEL: elt1_v4f32:
 ; X32SSE2:       # %bb.0:
 ; X32SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32SSE2-NEXT:    movaps {{.*#+}} xmm1 = <42,u,2,3>
+; X32SSE2-NEXT:    movaps {{.*#+}} xmm1 = <4.2E+1,u,2.0E+0,3.0E+0>
 ; X32SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
 ; X32SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
 ; X32SSE2-NEXT:    retl
 ;
 ; X64SSE2-LABEL: elt1_v4f32:
 ; X64SSE2:       # %bb.0:
-; X64SSE2-NEXT:    movaps {{.*#+}} xmm1 = <42,u,2,3>
+; X64SSE2-NEXT:    movaps {{.*#+}} xmm1 = <4.2E+1,u,2.0E+0,3.0E+0>
 ; X64SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
 ; X64SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
 ; X64SSE2-NEXT:    retq
 ;
 ; X32SSE4-LABEL: elt1_v4f32:
 ; X32SSE4:       # %bb.0:
-; X32SSE4-NEXT:    movaps {{.*#+}} xmm0 = <42,u,2,3>
+; X32SSE4-NEXT:    movaps {{.*#+}} xmm0 = <4.2E+1,u,2.0E+0,3.0E+0>
 ; X32SSE4-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
 ; X32SSE4-NEXT:    retl
 ;
 ; X64SSE4-LABEL: elt1_v4f32:
 ; X64SSE4:       # %bb.0:
-; X64SSE4-NEXT:    movaps {{.*#+}} xmm1 = <42,u,2,3>
+; X64SSE4-NEXT:    movaps {{.*#+}} xmm1 = <4.2E+1,u,2.0E+0,3.0E+0>
 ; X64SSE4-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[2,3]
 ; X64SSE4-NEXT:    movaps %xmm1, %xmm0
 ; X64SSE4-NEXT:    retq
 ;
 ; X32AVX-LABEL: elt1_v4f32:
 ; X32AVX:       # %bb.0:
-; X32AVX-NEXT:    vmovaps {{.*#+}} xmm0 = <42,u,2,3>
+; X32AVX-NEXT:    vmovaps {{.*#+}} xmm0 = <4.2E+1,u,2.0E+0,3.0E+0>
 ; X32AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
 ; X32AVX-NEXT:    retl
 ;
 ; X64AVX-LABEL: elt1_v4f32:
 ; X64AVX:       # %bb.0:
-; X64AVX-NEXT:    vmovaps {{.*#+}} xmm1 = <42,u,2,3>
+; X64AVX-NEXT:    vmovaps {{.*#+}} xmm1 = <4.2E+1,u,2.0E+0,3.0E+0>
 ; X64AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
 ; X64AVX-NEXT:    retq
    %ins = insertelement <4 x float> <float 42.0, float 1.0, float 2.0, float 3.0>, float %x, i32 1
@@ -210,26 +210,26 @@ define <4 x float> @elt1_v4f32(float %x) {
 define <2 x double> @elt1_v2f64(double %x) {
 ; X32SSE-LABEL: elt1_v2f64:
 ; X32SSE:       # %bb.0:
-; X32SSE-NEXT:    movapd {{.*#+}} xmm0 = <42,u>
+; X32SSE-NEXT:    movapd {{.*#+}} xmm0 = <4.2E+1,u>
 ; X32SSE-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
 ; X32SSE-NEXT:    retl
 ;
 ; X64SSE-LABEL: elt1_v2f64:
 ; X64SSE:       # %bb.0:
-; X64SSE-NEXT:    movaps {{.*#+}} xmm1 = <42,u>
+; X64SSE-NEXT:    movaps {{.*#+}} xmm1 = <4.2E+1,u>
 ; X64SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; X64SSE-NEXT:    movaps %xmm1, %xmm0
 ; X64SSE-NEXT:    retq
 ;
 ; X32AVX-LABEL: elt1_v2f64:
 ; X32AVX:       # %bb.0:
-; X32AVX-NEXT:    vmovapd {{.*#+}} xmm0 = <42,u>
+; X32AVX-NEXT:    vmovapd {{.*#+}} xmm0 = <4.2E+1,u>
 ; X32AVX-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
 ; X32AVX-NEXT:    retl
 ;
 ; X64AVX-LABEL: elt1_v2f64:
 ; X64AVX:       # %bb.0:
-; X64AVX-NEXT:    vmovaps {{.*#+}} xmm1 = <42,u>
+; X64AVX-NEXT:    vmovaps {{.*#+}} xmm1 = <4.2E+1,u>
 ; X64AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; X64AVX-NEXT:    retq
    %ins = insertelement <2 x double> <double 42.0, double 1.0>, double %x, i32 1
@@ -292,37 +292,37 @@ define <8 x float> @elt6_v8f32(float %x) {
 ; X32SSE2-LABEL: elt6_v8f32:
 ; X32SSE2:       # %bb.0:
 ; X32SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32SSE2-NEXT:    movaps {{.*#+}} xmm1 = <4,5,u,7>
+; X32SSE2-NEXT:    movaps {{.*#+}} xmm1 = <4.0E+0,5.0E+0,u,7.0E+0>
 ; X32SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
 ; X32SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
-; X32SSE2-NEXT:    movaps {{.*#+}} xmm0 = [42,1,2,3]
+; X32SSE2-NEXT:    movaps {{.*#+}} xmm0 = [4.2E+1,1.0E+0,2.0E+0,3.0E+0]
 ; X32SSE2-NEXT:    retl
 ;
 ; X64SSE2-LABEL: elt6_v8f32:
 ; X64SSE2:       # %bb.0:
-; X64SSE2-NEXT:    movaps {{.*#+}} xmm1 = <4,5,u,7>
+; X64SSE2-NEXT:    movaps {{.*#+}} xmm1 = <4.0E+0,5.0E+0,u,7.0E+0>
 ; X64SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
 ; X64SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
-; X64SSE2-NEXT:    movaps {{.*#+}} xmm0 = [42,1,2,3]
+; X64SSE2-NEXT:    movaps {{.*#+}} xmm0 = [4.2E+1,1.0E+0,2.0E+0,3.0E+0]
 ; X64SSE2-NEXT:    retq
 ;
 ; X32SSE4-LABEL: elt6_v8f32:
 ; X32SSE4:       # %bb.0:
-; X32SSE4-NEXT:    movaps {{.*#+}} xmm1 = <4,5,u,7>
+; X32SSE4-NEXT:    movaps {{.*#+}} xmm1 = <4.0E+0,5.0E+0,u,7.0E+0>
 ; X32SSE4-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
-; X32SSE4-NEXT:    movaps {{.*#+}} xmm0 = [42,1,2,3]
+; X32SSE4-NEXT:    movaps {{.*#+}} xmm0 = [4.2E+1,1.0E+0,2.0E+0,3.0E+0]
 ; X32SSE4-NEXT:    retl
 ;
 ; X64SSE4-LABEL: elt6_v8f32:
 ; X64SSE4:       # %bb.0:
-; X64SSE4-NEXT:    movaps {{.*#+}} xmm1 = <4,5,u,7>
+; X64SSE4-NEXT:    movaps {{.*#+}} xmm1 = <4.0E+0,5.0E+0,u,7.0E+0>
 ; X64SSE4-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0],xmm1[3]
-; X64SSE4-NEXT:    movaps {{.*#+}} xmm0 = [42,1,2,3]
+; X64SSE4-NEXT:    movaps {{.*#+}} xmm0 = [4.2E+1,1.0E+0,2.0E+0,3.0E+0]
 ; X64SSE4-NEXT:    retq
 ;
 ; X32AVX-LABEL: elt6_v8f32:
 ; X32AVX:       # %bb.0:
-; X32AVX-NEXT:    vmovaps {{.*#+}} ymm0 = <42,1,2,3,4,5,u,7>
+; X32AVX-NEXT:    vmovaps {{.*#+}} ymm0 = <4.2E+1,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,u,7.0E+0>
 ; X32AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X32AVX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
 ; X32AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -330,7 +330,7 @@ define <8 x float> @elt6_v8f32(float %x) {
 ;
 ; X64AVX-LABEL: elt6_v8f32:
 ; X64AVX:       # %bb.0:
-; X64AVX-NEXT:    vmovaps {{.*#+}} ymm1 = <42,1,2,3,4,5,u,7>
+; X64AVX-NEXT:    vmovaps {{.*#+}} ymm1 = <4.2E+1,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,u,7.0E+0>
 ; X64AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; X64AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1],xmm0[0],xmm2[3]
 ; X64AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -413,49 +413,49 @@ define <8 x i64> @elt5_v8i64(i64 %x) {
 define <8 x double> @elt1_v8f64(double %x) {
 ; X32SSE-LABEL: elt1_v8f64:
 ; X32SSE:       # %bb.0:
-; X32SSE-NEXT:    movapd {{.*#+}} xmm0 = <42,u>
+; X32SSE-NEXT:    movapd {{.*#+}} xmm0 = <4.2E+1,u>
 ; X32SSE-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; X32SSE-NEXT:    movaps {{.*#+}} xmm1 = [2,3]
-; X32SSE-NEXT:    movaps {{.*#+}} xmm2 = [4,5]
-; X32SSE-NEXT:    movaps {{.*#+}} xmm3 = [6,7]
+; X32SSE-NEXT:    movaps {{.*#+}} xmm1 = [2.0E+0,3.0E+0]
+; X32SSE-NEXT:    movaps {{.*#+}} xmm2 = [4.0E+0,5.0E+0]
+; X32SSE-NEXT:    movaps {{.*#+}} xmm3 = [6.0E+0,7.0E+0]
 ; X32SSE-NEXT:    retl
 ;
 ; X64SSE-LABEL: elt1_v8f64:
 ; X64SSE:       # %bb.0:
-; X64SSE-NEXT:    movaps {{.*#+}} xmm4 = <42,u>
+; X64SSE-NEXT:    movaps {{.*#+}} xmm4 = <4.2E+1,u>
 ; X64SSE-NEXT:    movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0]
-; X64SSE-NEXT:    movaps {{.*#+}} xmm1 = [2,3]
-; X64SSE-NEXT:    movaps {{.*#+}} xmm2 = [4,5]
-; X64SSE-NEXT:    movaps {{.*#+}} xmm3 = [6,7]
+; X64SSE-NEXT:    movaps {{.*#+}} xmm1 = [2.0E+0,3.0E+0]
+; X64SSE-NEXT:    movaps {{.*#+}} xmm2 = [4.0E+0,5.0E+0]
+; X64SSE-NEXT:    movaps {{.*#+}} xmm3 = [6.0E+0,7.0E+0]
 ; X64SSE-NEXT:    movaps %xmm4, %xmm0
 ; X64SSE-NEXT:    retq
 ;
 ; X32AVX2-LABEL: elt1_v8f64:
 ; X32AVX2:       # %bb.0:
-; X32AVX2-NEXT:    vmovapd {{.*#+}} ymm0 = <42,u,2,3>
+; X32AVX2-NEXT:    vmovapd {{.*#+}} ymm0 = <4.2E+1,u,2.0E+0,3.0E+0>
 ; X32AVX2-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm0[0],mem[0]
 ; X32AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
-; X32AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [4,5,6,7]
+; X32AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0]
 ; X32AVX2-NEXT:    retl
 ;
 ; X64AVX2-LABEL: elt1_v8f64:
 ; X64AVX2:       # %bb.0:
-; X64AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = <42,u,2,3>
+; X64AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = <4.2E+1,u,2.0E+0,3.0E+0>
 ; X64AVX2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; X64AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; X64AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [4,5,6,7]
+; X64AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0]
 ; X64AVX2-NEXT:    retq
 ;
 ; X32AVX512F-LABEL: elt1_v8f64:
 ; X32AVX512F:       # %bb.0:
-; X32AVX512F-NEXT:    vmovapd {{.*#+}} zmm0 = <42,u,2,3,4,5,6,7>
+; X32AVX512F-NEXT:    vmovapd {{.*#+}} zmm0 = <4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0>
 ; X32AVX512F-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm0[0],mem[0]
 ; X32AVX512F-NEXT:    vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
 ; X32AVX512F-NEXT:    retl
 ;
 ; X64AVX512F-LABEL: elt1_v8f64:
 ; X64AVX512F:       # %bb.0:
-; X64AVX512F-NEXT:    vmovaps {{.*#+}} zmm1 = <42,u,2,3,4,5,6,7>
+; X64AVX512F-NEXT:    vmovaps {{.*#+}} zmm1 = <4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0>
 ; X64AVX512F-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; X64AVX512F-NEXT:    vinsertf32x4 $0, %xmm0, %zmm1, %zmm0
 ; X64AVX512F-NEXT:    retq
diff --git a/test/CodeGen/X86/packss.ll b/test/CodeGen/X86/packss.ll
index 2a4ee1f783f..3feb0d04f04 100644
--- a/test/CodeGen/X86/packss.ll
+++ b/test/CodeGen/X86/packss.ll
@@ -166,7 +166,7 @@ define <8 x i16> @trunc_ashr_v4i64_demandedelts(<4 x i64> %a0) {
 ; X86-SSE-NEXT:    movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1]
 ; X86-SSE-NEXT:    psrlq $63, %xmm4
 ; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
-; X86-SSE-NEXT:    movapd {{.*#+}} xmm2 = [4.9406564584124654E-324,-0]
+; X86-SSE-NEXT:    movapd {{.*#+}} xmm2 = [4.9406564584124654E-324,-0.0E+0]
 ; X86-SSE-NEXT:    xorpd %xmm2, %xmm0
 ; X86-SSE-NEXT:    psubq %xmm2, %xmm0
 ; X86-SSE-NEXT:    psrlq $63, %xmm3
diff --git a/test/CodeGen/X86/pow.ll b/test/CodeGen/X86/pow.ll
index f170488cb2f..45600540289 100644
--- a/test/CodeGen/X86/pow.ll
+++ b/test/CodeGen/X86/pow.ll
@@ -56,11 +56,11 @@ define <4 x float> @pow_v4f32_one_fourth_fmf(<4 x float> %x) nounwind {
 ; CHECK-NEXT:    rsqrtps %xmm0, %xmm1
 ; CHECK-NEXT:    movaps %xmm0, %xmm2
 ; CHECK-NEXT:    mulps %xmm1, %xmm2
-; CHECK-NEXT:    movaps {{.*#+}} xmm3 = [-0.5,-0.5,-0.5,-0.5]
+; CHECK-NEXT:    movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
 ; CHECK-NEXT:    movaps %xmm2, %xmm4
 ; CHECK-NEXT:    mulps %xmm3, %xmm4
 ; CHECK-NEXT:    mulps %xmm1, %xmm2
-; CHECK-NEXT:    movaps {{.*#+}} xmm1 = [-3,-3,-3,-3]
+; CHECK-NEXT:    movaps {{.*#+}} xmm1 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
 ; CHECK-NEXT:    addps %xmm1, %xmm2
 ; CHECK-NEXT:    mulps %xmm4, %xmm2
 ; CHECK-NEXT:    xorps %xmm4, %xmm4
diff --git a/test/CodeGen/X86/pr2656.ll b/test/CodeGen/X86/pr2656.ll
index dc1fd88e425..53d1ea79f48 100644
--- a/test/CodeGen/X86/pr2656.ll
+++ b/test/CodeGen/X86/pr2656.ll
@@ -19,7 +19,7 @@ define void @foo(%struct.anon* byval %p) nounwind {
 ; CHECK-NEXT:    subl $28, %esp
 ; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NEXT:    movaps {{.*#+}} xmm2 = [-0,-0,-0,-0]
+; CHECK-NEXT:    movaps {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; CHECK-NEXT:    xorps %xmm2, %xmm0
 ; CHECK-NEXT:    cvtss2sd %xmm0, %xmm0
 ; CHECK-NEXT:    xorps %xmm2, %xmm1
diff --git a/test/CodeGen/X86/pr38639.ll b/test/CodeGen/X86/pr38639.ll
index 4218db41185..bea6c84279f 100644
--- a/test/CodeGen/X86/pr38639.ll
+++ b/test/CodeGen/X86/pr38639.ll
@@ -4,11 +4,11 @@
 define <8 x double> @test(<4 x double> %a, <4 x double> %b) {
 ; CHECK-LABEL: test:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = <u,0.82071743224100002,0.82071743224100002,0.82071743224100002>
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = <u,8.2071743224100002E-1,8.2071743224100002E-1,8.2071743224100002E-1>
 ; CHECK-NEXT:    vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; CHECK-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
-; CHECK-NEXT:    vmovaps {{.*#+}} xmm2 = [0.82071743224100002,0.82071743224100002]
+; CHECK-NEXT:    vmovaps {{.*#+}} xmm2 = [8.2071743224100002E-1,8.2071743224100002E-1]
 ; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
 ; CHECK-NEXT:    retq
   %1 = shufflevector <4 x double> %a, <4 x double> <double undef, double 0x3FEA435134576E1C, double 0x3FEA435134576E1C, double 0x3FEA435134576E1C>, <8 x i32> <i32 6, i32 5, i32 2, i32 3, i32 5, i32 1, i32 3, i32 7>
diff --git a/test/CodeGen/X86/recip-fastmath.ll b/test/CodeGen/X86/recip-fastmath.ll
index 9e68636f904..a68940eb11a 100644
--- a/test/CodeGen/X86/recip-fastmath.ll
+++ b/test/CodeGen/X86/recip-fastmath.ll
@@ -307,62 +307,62 @@ define float @f32_two_step(float %x) #2 {
 define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 {
 ; SSE-LABEL: v4f32_no_estimate:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1,1,1,1]
+; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    divps %xmm0, %xmm1
 ; SSE-NEXT:    movaps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-RECIP-LABEL: v4f32_no_estimate:
 ; AVX-RECIP:       # %bb.0:
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm1 = [1,1,1,1]
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX-RECIP-NEXT:    vdivps %xmm0, %xmm1, %xmm0
 ; AVX-RECIP-NEXT:    retq
 ;
 ; FMA-RECIP-LABEL: v4f32_no_estimate:
 ; FMA-RECIP:       # %bb.0:
-; FMA-RECIP-NEXT:    vmovaps {{.*#+}} xmm1 = [1,1,1,1]
+; FMA-RECIP-NEXT:    vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-RECIP-NEXT:    vdivps %xmm0, %xmm1, %xmm0
 ; FMA-RECIP-NEXT:    retq
 ;
 ; BDVER2-LABEL: v4f32_no_estimate:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovaps {{.*#+}} xmm1 = [1,1,1,1] sched: [5:0.50]
+; BDVER2-NEXT:    vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:0.50]
 ; BDVER2-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # sched: [9:9.50]
 ; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: v4f32_no_estimate:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} xmm1 = [1,1,1,1] sched: [5:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
 ; BTVER2-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # sched: [19:19.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; SANDY-LABEL: v4f32_no_estimate:
 ; SANDY:       # %bb.0:
-; SANDY-NEXT:    vmovaps {{.*#+}} xmm1 = [1,1,1,1] sched: [6:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; SANDY-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # sched: [14:14.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: v4f32_no_estimate:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [6:0.50]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; HASWELL-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # sched: [13:7.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: v4f32_no_estimate:
 ; HASWELL-NO-FMA:       # %bb.0:
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1,1,1,1]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; HASWELL-NO-FMA-NEXT:    vdivps %xmm0, %xmm1, %xmm0
 ; HASWELL-NO-FMA-NEXT:    retq
 ;
 ; KNL-LABEL: v4f32_no_estimate:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [6:0.50]
+; KNL-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; KNL-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # sched: [13:7.00]
 ; KNL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: v4f32_no_estimate:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [6:0.50]
+; SKX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; SKX-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # sched: [11:3.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
@@ -374,7 +374,7 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    rcpps %xmm0, %xmm2
 ; SSE-NEXT:    mulps %xmm2, %xmm0
-; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1,1,1,1]
+; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    subps %xmm0, %xmm1
 ; SSE-NEXT:    mulps %xmm2, %xmm1
 ; SSE-NEXT:    addps %xmm2, %xmm1
@@ -385,7 +385,7 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
 ; AVX-RECIP:       # %bb.0:
 ; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm1
 ; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1,1,1,1]
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX-RECIP-NEXT:    vsubps %xmm0, %xmm2, %xmm0
 ; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
 ; AVX-RECIP-NEXT:    vaddps %xmm0, %xmm1, %xmm0
@@ -407,7 +407,7 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
 ;
 ; BTVER2-LABEL: v4f32_one_step:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1,1,1,1] sched: [5:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
 ; BTVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [2:1.00]
 ; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
 ; BTVER2-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
@@ -419,7 +419,7 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
 ; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; SANDY-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
@@ -428,7 +428,7 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
 ; HASWELL-LABEL: v4f32_one_step:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
 ; HASWELL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
@@ -437,7 +437,7 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
 ; HASWELL-NO-FMA:       # %bb.0:
 ; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1
 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm2, %xmm0
 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
 ; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0
@@ -446,7 +446,7 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
 ; KNL-LABEL: v4f32_one_step:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; KNL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
 ; KNL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
 ; KNL-NEXT:    retq # sched: [7:1.00]
@@ -467,7 +467,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
 ; SSE-NEXT:    rcpps %xmm0, %xmm2
 ; SSE-NEXT:    movaps %xmm0, %xmm3
 ; SSE-NEXT:    mulps %xmm2, %xmm3
-; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1,1,1,1]
+; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    movaps %xmm1, %xmm4
 ; SSE-NEXT:    subps %xmm3, %xmm4
 ; SSE-NEXT:    mulps %xmm2, %xmm4
@@ -483,7 +483,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
 ; AVX-RECIP:       # %bb.0:
 ; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm1
 ; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm2
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm3 = [1,1,1,1]
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX-RECIP-NEXT:    vsubps %xmm2, %xmm3, %xmm2
 ; AVX-RECIP-NEXT:    vmulps %xmm2, %xmm1, %xmm2
 ; AVX-RECIP-NEXT:    vaddps %xmm2, %xmm1, %xmm1
@@ -496,7 +496,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
 ; FMA-RECIP-LABEL: v4f32_two_step:
 ; FMA-RECIP:       # %bb.0:
 ; FMA-RECIP-NEXT:    vrcpps %xmm0, %xmm1
-; FMA-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1,1,1,1]
+; FMA-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-RECIP-NEXT:    vmovaps %xmm1, %xmm3
 ; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
@@ -507,7 +507,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
 ; BDVER2-LABEL: v4f32_two_step:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; BDVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1,1,1,1] sched: [5:0.50]
+; BDVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmaddps %xmm2, %xmm1, %xmm0, %xmm3 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfmaddps %xmm1, %xmm3, %xmm1, %xmm1 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
@@ -516,7 +516,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
 ;
 ; BTVER2-LABEL: v4f32_two_step:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} xmm3 = [1,1,1,1] sched: [5:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
 ; BTVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [2:1.00]
 ; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm2 # sched: [2:1.00]
 ; BTVER2-NEXT:    vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
@@ -532,7 +532,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
 ; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm2 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} xmm3 = [1,1,1,1] sched: [6:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; SANDY-NEXT:    vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulps %xmm2, %xmm1, %xmm2 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
@@ -545,7 +545,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
 ; HASWELL-LABEL: v4f32_two_step:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; HASWELL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
 ; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
 ; HASWELL-NEXT:    vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
@@ -557,7 +557,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
 ; HASWELL-NO-FMA:       # %bb.0:
 ; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1
 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm2
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm3 = [1,1,1,1]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; HASWELL-NO-FMA-NEXT:    vsubps %xmm2, %xmm3, %xmm2
 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm2, %xmm1, %xmm2
 ; HASWELL-NO-FMA-NEXT:    vaddps %xmm2, %xmm1, %xmm1
@@ -570,7 +570,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
 ; KNL-LABEL: v4f32_two_step:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; KNL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
 ; KNL-NEXT:    vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
 ; KNL-NEXT:    vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
@@ -581,7 +581,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
 ; SKX-LABEL: v4f32_two_step:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vrcpps %xmm0, %xmm1 # sched: [4:1.00]
-; SKX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; SKX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; SKX-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [4:0.50]
 ; SKX-NEXT:    vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [4:0.50]
@@ -595,7 +595,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
 define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
 ; SSE-LABEL: v8f32_no_estimate:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1,1,1,1]
+; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    movaps %xmm2, %xmm3
 ; SSE-NEXT:    divps %xmm0, %xmm3
 ; SSE-NEXT:    divps %xmm1, %xmm2
@@ -605,55 +605,55 @@ define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
 ;
 ; AVX-RECIP-LABEL: v8f32_no_estimate:
 ; AVX-RECIP:       # %bb.0:
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX-RECIP-NEXT:    vdivps %ymm0, %ymm1, %ymm0
 ; AVX-RECIP-NEXT:    retq
 ;
 ; FMA-RECIP-LABEL: v8f32_no_estimate:
 ; FMA-RECIP:       # %bb.0:
-; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
+; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-RECIP-NEXT:    vdivps %ymm0, %ymm1, %ymm0
 ; FMA-RECIP-NEXT:    retq
 ;
 ; BDVER2-LABEL: v8f32_no_estimate:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovaps {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [5:0.50]
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:0.50]
 ; BDVER2-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # sched: [9:19.00]
 ; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: v8f32_no_estimate:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
 ; BTVER2-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # sched: [38:38.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; SANDY-LABEL: v8f32_no_estimate:
 ; SANDY:       # %bb.0:
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; SANDY-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # sched: [29:28.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: v8f32_no_estimate:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; HASWELL-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # sched: [21:14.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: v8f32_no_estimate:
 ; HASWELL-NO-FMA:       # %bb.0:
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; HASWELL-NO-FMA-NEXT:    vdivps %ymm0, %ymm1, %ymm0
 ; HASWELL-NO-FMA-NEXT:    retq
 ;
 ; KNL-LABEL: v8f32_no_estimate:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; KNL-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; KNL-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # sched: [21:14.00]
 ; KNL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: v8f32_no_estimate:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; SKX-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; SKX-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # sched: [11:5.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
@@ -665,7 +665,7 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    rcpps %xmm0, %xmm4
 ; SSE-NEXT:    mulps %xmm4, %xmm0
-; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1,1,1,1]
+; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    movaps %xmm2, %xmm3
 ; SSE-NEXT:    subps %xmm0, %xmm3
 ; SSE-NEXT:    mulps %xmm4, %xmm3
@@ -683,7 +683,7 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
 ; AVX-RECIP:       # %bb.0:
 ; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm1
 ; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm2, %ymm0
 ; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
 ; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm1, %ymm0
@@ -705,7 +705,7 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
 ;
 ; BTVER2-LABEL: v8f32_one_step:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
 ; BTVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [2:2.00]
 ; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
 ; BTVER2-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:2.00]
@@ -717,7 +717,7 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
 ; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; SANDY-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
@@ -726,7 +726,7 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
 ; HASWELL-LABEL: v8f32_one_step:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
 ; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
@@ -735,7 +735,7 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
 ; HASWELL-NO-FMA:       # %bb.0:
 ; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1
 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm2, %ymm0
 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0
 ; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0
@@ -744,7 +744,7 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
 ; KNL-LABEL: v8f32_one_step:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; KNL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
 ; KNL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50]
 ; KNL-NEXT:    retq # sched: [7:1.00]
@@ -766,7 +766,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
 ; SSE-NEXT:    rcpps %xmm0, %xmm3
 ; SSE-NEXT:    movaps %xmm0, %xmm4
 ; SSE-NEXT:    mulps %xmm3, %xmm4
-; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1,1,1,1]
+; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    movaps %xmm1, %xmm5
 ; SSE-NEXT:    subps %xmm4, %xmm5
 ; SSE-NEXT:    mulps %xmm3, %xmm5
@@ -794,7 +794,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
 ; AVX-RECIP:       # %bb.0:
 ; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm1
 ; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm2
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX-RECIP-NEXT:    vsubps %ymm2, %ymm3, %ymm2
 ; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm2
 ; AVX-RECIP-NEXT:    vaddps %ymm2, %ymm1, %ymm1
@@ -807,7 +807,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
 ; FMA-RECIP-LABEL: v8f32_two_step:
 ; FMA-RECIP:       # %bb.0:
 ; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm1
-; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
+; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-RECIP-NEXT:    vmovaps %ymm1, %ymm3
 ; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2
 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1
@@ -818,7 +818,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
 ; BDVER2-LABEL: v8f32_two_step:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [5:2.00]
-; BDVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:0.50]
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmaddps %ymm2, %ymm1, %ymm0, %ymm3 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfmaddps %ymm1, %ymm3, %ymm1, %ymm1 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
@@ -827,7 +827,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
 ;
 ; BTVER2-LABEL: v8f32_two_step:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
 ; BTVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [2:2.00]
 ; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm2 # sched: [2:2.00]
 ; BTVER2-NEXT:    vsubps %ymm2, %ymm3, %ymm2 # sched: [3:2.00]
@@ -843,7 +843,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
 ; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm2 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; SANDY-NEXT:    vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm2 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00]
@@ -856,7 +856,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
 ; HASWELL-LABEL: v8f32_two_step:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; HASWELL-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:1.00]
 ; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [5:0.50]
 ; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [5:0.50]
@@ -868,7 +868,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
 ; HASWELL-NO-FMA:       # %bb.0:
 ; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1
 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm2
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; HASWELL-NO-FMA-NEXT:    vsubps %ymm2, %ymm3, %ymm2
 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm2
 ; HASWELL-NO-FMA-NEXT:    vaddps %ymm2, %ymm1, %ymm1
@@ -881,7 +881,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
 ; KNL-LABEL: v8f32_two_step:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; KNL-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:1.00]
 ; KNL-NEXT:    vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [5:0.50]
 ; KNL-NEXT:    vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [5:0.50]
@@ -892,7 +892,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
 ; SKX-LABEL: v8f32_two_step:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vrcpps %ymm0, %ymm1 # sched: [4:1.00]
-; SKX-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; SKX-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; SKX-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:0.33]
 ; SKX-NEXT:    vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [4:0.50]
 ; SKX-NEXT:    vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [4:0.50]
@@ -906,7 +906,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
 define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 {
 ; SSE-LABEL: v16f32_no_estimate:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movaps {{.*#+}} xmm4 = [1,1,1,1]
+; SSE-NEXT:    movaps {{.*#+}} xmm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    movaps %xmm4, %xmm5
 ; SSE-NEXT:    divps %xmm0, %xmm5
 ; SSE-NEXT:    movaps %xmm4, %xmm6
@@ -922,62 +922,62 @@ define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 {
 ;
 ; AVX-RECIP-LABEL: v16f32_no_estimate:
 ; AVX-RECIP:       # %bb.0:
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX-RECIP-NEXT:    vdivps %ymm0, %ymm2, %ymm0
 ; AVX-RECIP-NEXT:    vdivps %ymm1, %ymm2, %ymm1
 ; AVX-RECIP-NEXT:    retq
 ;
 ; FMA-RECIP-LABEL: v16f32_no_estimate:
 ; FMA-RECIP:       # %bb.0:
-; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
+; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-RECIP-NEXT:    vdivps %ymm0, %ymm2, %ymm0
 ; FMA-RECIP-NEXT:    vdivps %ymm1, %ymm2, %ymm1
 ; FMA-RECIP-NEXT:    retq
 ;
 ; BDVER2-LABEL: v16f32_no_estimate:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:0.50]
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:0.50]
 ; BDVER2-NEXT:    vdivps %ymm0, %ymm2, %ymm0 # sched: [9:19.00]
 ; BDVER2-NEXT:    vdivps %ymm1, %ymm2, %ymm1 # sched: [9:19.00]
 ; BDVER2-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: v16f32_no_estimate:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
 ; BTVER2-NEXT:    vdivps %ymm0, %ymm2, %ymm0 # sched: [38:38.00]
 ; BTVER2-NEXT:    vdivps %ymm1, %ymm2, %ymm1 # sched: [38:38.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; SANDY-LABEL: v16f32_no_estimate:
 ; SANDY:       # %bb.0:
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; SANDY-NEXT:    vdivps %ymm0, %ymm2, %ymm0 # sched: [29:28.00]
 ; SANDY-NEXT:    vdivps %ymm1, %ymm2, %ymm1 # sched: [29:28.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: v16f32_no_estimate:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; HASWELL-NEXT:    vdivps %ymm0, %ymm2, %ymm0 # sched: [21:14.00]
 ; HASWELL-NEXT:    vdivps %ymm1, %ymm2, %ymm1 # sched: [21:14.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: v16f32_no_estimate:
 ; HASWELL-NO-FMA:       # %bb.0:
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; HASWELL-NO-FMA-NEXT:    vdivps %ymm0, %ymm2, %ymm0
 ; HASWELL-NO-FMA-NEXT:    vdivps %ymm1, %ymm2, %ymm1
 ; HASWELL-NO-FMA-NEXT:    retq
 ;
 ; KNL-LABEL: v16f32_no_estimate:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [10:1.00]
+; KNL-NEXT:    vbroadcastss {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [10:1.00]
 ; KNL-NEXT:    vdivps %zmm0, %zmm1, %zmm0 # sched: [21:14.00]
 ; KNL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: v16f32_no_estimate:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [8:0.50]
+; SKX-NEXT:    vbroadcastss {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [8:0.50]
 ; SKX-NEXT:    vdivps %zmm0, %zmm1, %zmm0 # sched: [18:10.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
@@ -991,7 +991,7 @@ define <16 x float> @v16f32_one_step(<16 x float> %x) #1 {
 ; SSE-NEXT:    movaps %xmm0, %xmm5
 ; SSE-NEXT:    rcpps %xmm0, %xmm6
 ; SSE-NEXT:    mulps %xmm6, %xmm5
-; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1,1,1,1]
+; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    movaps %xmm3, %xmm0
 ; SSE-NEXT:    subps %xmm5, %xmm0
 ; SSE-NEXT:    mulps %xmm6, %xmm0
@@ -1021,7 +1021,7 @@ define <16 x float> @v16f32_one_step(<16 x float> %x) #1 {
 ; AVX-RECIP:       # %bb.0:
 ; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm2
 ; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm0, %ymm0
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm3, %ymm0
 ; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm2, %ymm0
 ; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm2, %ymm0
@@ -1035,7 +1035,7 @@ define <16 x float> @v16f32_one_step(<16 x float> %x) #1 {
 ; FMA-RECIP-LABEL: v16f32_one_step:
 ; FMA-RECIP:       # %bb.0:
 ; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm2
-; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
+; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3
 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2
 ; FMA-RECIP-NEXT:    vrcpps %ymm1, %ymm2
@@ -1046,7 +1046,7 @@ define <16 x float> @v16f32_one_step(<16 x float> %x) #1 {
 ; BDVER2-LABEL: v16f32_one_step:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    vrcpps %ymm0, %ymm2 # sched: [5:2.00]
-; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [5:0.50]
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:0.50]
 ; BDVER2-NEXT:    vrcpps %ymm1, %ymm4 # sched: [5:2.00]
 ; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm0, %ymm0 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm4, %ymm1, %ymm1 # sched: [5:0.50]
@@ -1056,7 +1056,7 @@ define <16 x float> @v16f32_one_step(<16 x float> %x) #1 {
 ;
 ; BTVER2-LABEL: v16f32_one_step:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
 ; BTVER2-NEXT:    vrcpps %ymm0, %ymm2 # sched: [2:2.00]
 ; BTVER2-NEXT:    vrcpps %ymm1, %ymm4 # sched: [2:2.00]
 ; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [2:2.00]
@@ -1073,7 +1073,7 @@ define <16 x float> @v16f32_one_step(<16 x float> %x) #1 {
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vrcpps %ymm0, %ymm2 # sched: [7:2.00]
 ; SANDY-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; SANDY-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
@@ -1087,7 +1087,7 @@ define <16 x float> @v16f32_one_step(<16 x float> %x) #1 {
 ; HASWELL-LABEL: v16f32_one_step:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vrcpps %ymm0, %ymm2 # sched: [11:2.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; HASWELL-NEXT:    vrcpps %ymm1, %ymm4 # sched: [11:2.00]
 ; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3 sched: [5:0.50]
 ; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2 sched: [5:0.50]
@@ -1099,7 +1099,7 @@ define <16 x float> @v16f32_one_step(<16 x float> %x) #1 {
 ; HASWELL-NO-FMA:       # %bb.0:
 ; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm2
 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm0, %ymm0
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm3, %ymm0
 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm2, %ymm0
 ; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm2, %ymm0
@@ -1136,7 +1136,7 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 {
 ; SSE-NEXT:    rcpps %xmm0, %xmm0
 ; SSE-NEXT:    movaps %xmm1, %xmm6
 ; SSE-NEXT:    mulps %xmm0, %xmm6
-; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1,1,1,1]
+; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    movaps %xmm3, %xmm7
 ; SSE-NEXT:    subps %xmm6, %xmm7
 ; SSE-NEXT:    mulps %xmm0, %xmm7
@@ -1188,7 +1188,7 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 {
 ; AVX-RECIP:       # %bb.0:
 ; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm2
 ; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm0, %ymm3
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1]
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX-RECIP-NEXT:    vsubps %ymm3, %ymm4, %ymm3
 ; AVX-RECIP-NEXT:    vmulps %ymm3, %ymm2, %ymm3
 ; AVX-RECIP-NEXT:    vaddps %ymm3, %ymm2, %ymm2
@@ -1210,7 +1210,7 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 {
 ; FMA-RECIP-LABEL: v16f32_two_step:
 ; FMA-RECIP:       # %bb.0:
 ; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm2
-; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
+; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-RECIP-NEXT:    vmovaps %ymm2, %ymm4
 ; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm4 = -(ymm0 * ymm4) + ymm3
 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2
@@ -1227,7 +1227,7 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 {
 ; BDVER2-LABEL: v16f32_two_step:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    vrcpps %ymm0, %ymm2 # sched: [5:2.00]
-; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [5:0.50]
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm0, %ymm4 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfmaddps %ymm2, %ymm4, %ymm2, %ymm2 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm0, %ymm0 # sched: [5:0.50]
@@ -1241,7 +1241,7 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 {
 ;
 ; BTVER2-LABEL: v16f32_two_step:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
 ; BTVER2-NEXT:    vrcpps %ymm0, %ymm2 # sched: [2:2.00]
 ; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm3 # sched: [2:2.00]
 ; BTVER2-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:2.00]
@@ -1266,7 +1266,7 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 {
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vrcpps %ymm0, %ymm2 # sched: [7:2.00]
 ; SANDY-NEXT:    vmulps %ymm2, %ymm0, %ymm3 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; SANDY-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulps %ymm3, %ymm2, %ymm3 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %ymm3, %ymm2, %ymm2 # sched: [3:1.00]
@@ -1288,7 +1288,7 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 {
 ; HASWELL-LABEL: v16f32_two_step:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vrcpps %ymm0, %ymm2 # sched: [11:2.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; HASWELL-NEXT:    vmovaps %ymm2, %ymm4 # sched: [1:1.00]
 ; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm4 = -(ymm0 * ymm4) + ymm3 sched: [5:0.50]
 ; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2 sched: [5:0.50]
@@ -1306,7 +1306,7 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 {
 ; HASWELL-NO-FMA:       # %bb.0:
 ; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm2
 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm0, %ymm3
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; HASWELL-NO-FMA-NEXT:    vsubps %ymm3, %ymm4, %ymm3
 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm3, %ymm2, %ymm3
 ; HASWELL-NO-FMA-NEXT:    vaddps %ymm3, %ymm2, %ymm2
@@ -1328,7 +1328,7 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 {
 ; KNL-LABEL: v16f32_two_step:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vrcp14ps %zmm0, %zmm1 # sched: [11:2.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [10:1.00]
+; KNL-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [10:1.00]
 ; KNL-NEXT:    vmovaps %zmm1, %zmm3 # sched: [1:1.00]
 ; KNL-NEXT:    vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 sched: [5:0.50]
 ; KNL-NEXT:    vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 sched: [5:0.50]
@@ -1339,7 +1339,7 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 {
 ; SKX-LABEL: v16f32_two_step:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vrcp14ps %zmm0, %zmm1 # sched: [4:2.00]
-; SKX-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [8:0.50]
+; SKX-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [8:0.50]
 ; SKX-NEXT:    vmovaps %zmm1, %zmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 sched: [4:0.50]
 ; SKX-NEXT:    vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 sched: [4:0.50]
diff --git a/test/CodeGen/X86/recip-fastmath2.ll b/test/CodeGen/X86/recip-fastmath2.ll
index 2a773f44956..dbe2689077e 100644
--- a/test/CodeGen/X86/recip-fastmath2.ll
+++ b/test/CodeGen/X86/recip-fastmath2.ll
@@ -433,7 +433,7 @@ define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    rcpps %xmm0, %xmm2
 ; SSE-NEXT:    mulps %xmm2, %xmm0
-; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1,1,1,1]
+; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    subps %xmm0, %xmm1
 ; SSE-NEXT:    mulps %xmm2, %xmm1
 ; SSE-NEXT:    addps %xmm2, %xmm1
@@ -445,7 +445,7 @@ define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
 ; AVX-RECIP:       # %bb.0:
 ; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm1
 ; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1,1,1,1]
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX-RECIP-NEXT:    vsubps %xmm0, %xmm2, %xmm0
 ; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
 ; AVX-RECIP-NEXT:    vaddps %xmm0, %xmm1, %xmm0
@@ -470,7 +470,7 @@ define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
 ;
 ; BTVER2-LABEL: v4f32_one_step2:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1,1,1,1] sched: [5:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
 ; BTVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [2:1.00]
 ; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
 ; BTVER2-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
@@ -483,7 +483,7 @@ define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
 ; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; SANDY-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
@@ -493,7 +493,7 @@ define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
 ; HASWELL-LABEL: v4f32_one_step2:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
 ; HASWELL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
 ; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
@@ -503,7 +503,7 @@ define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
 ; HASWELL-NO-FMA:       # %bb.0:
 ; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
@@ -513,7 +513,7 @@ define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
 ; KNL-LABEL: v4f32_one_step2:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; KNL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
 ; KNL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
 ; KNL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
@@ -535,11 +535,11 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    rcpps %xmm0, %xmm1
 ; SSE-NEXT:    mulps %xmm1, %xmm0
-; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1,1,1,1]
+; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    subps %xmm0, %xmm2
 ; SSE-NEXT:    mulps %xmm1, %xmm2
 ; SSE-NEXT:    addps %xmm1, %xmm2
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,2,3,4]
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
 ; SSE-NEXT:    mulps %xmm2, %xmm0
 ; SSE-NEXT:    mulps %xmm2, %xmm0
 ; SSE-NEXT:    retq
@@ -548,7 +548,7 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
 ; AVX-RECIP:       # %bb.0:
 ; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm1
 ; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1,1,1,1]
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX-RECIP-NEXT:    vsubps %xmm0, %xmm2, %xmm0
 ; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
 ; AVX-RECIP-NEXT:    vaddps %xmm0, %xmm1, %xmm0
@@ -576,7 +576,7 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
 ;
 ; BTVER2-LABEL: v4f32_one_step_2_divs:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1,1,1,1] sched: [5:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
 ; BTVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [2:1.00]
 ; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
 ; BTVER2-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
@@ -590,7 +590,7 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
 ; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; SANDY-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
@@ -601,7 +601,7 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
 ; HASWELL-LABEL: v4f32_one_step_2_divs:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
 ; HASWELL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
 ; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:0.50]
@@ -612,7 +612,7 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
 ; HASWELL-NO-FMA:       # %bb.0:
 ; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
@@ -623,7 +623,7 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
 ; KNL-LABEL: v4f32_one_step_2_divs:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; KNL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
 ; KNL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
 ; KNL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:0.50]
@@ -649,7 +649,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
 ; SSE-NEXT:    rcpps %xmm0, %xmm2
 ; SSE-NEXT:    movaps %xmm0, %xmm3
 ; SSE-NEXT:    mulps %xmm2, %xmm3
-; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1,1,1,1]
+; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    movaps %xmm1, %xmm4
 ; SSE-NEXT:    subps %xmm3, %xmm4
 ; SSE-NEXT:    mulps %xmm2, %xmm4
@@ -666,7 +666,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
 ; AVX-RECIP:       # %bb.0:
 ; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm1
 ; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm2
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm3 = [1,1,1,1]
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX-RECIP-NEXT:    vsubps %xmm2, %xmm3, %xmm2
 ; AVX-RECIP-NEXT:    vmulps %xmm2, %xmm1, %xmm2
 ; AVX-RECIP-NEXT:    vaddps %xmm2, %xmm1, %xmm1
@@ -680,7 +680,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
 ; FMA-RECIP-LABEL: v4f32_two_step2:
 ; FMA-RECIP:       # %bb.0:
 ; FMA-RECIP-NEXT:    vrcpps %xmm0, %xmm1
-; FMA-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1,1,1,1]
+; FMA-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-RECIP-NEXT:    vmovaps %xmm1, %xmm3
 ; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
@@ -692,7 +692,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
 ; BDVER2-LABEL: v4f32_two_step2:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; BDVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1,1,1,1] sched: [5:0.50]
+; BDVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmaddps %xmm2, %xmm1, %xmm0, %xmm3 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfmaddps %xmm1, %xmm3, %xmm1, %xmm1 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
@@ -702,7 +702,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
 ;
 ; BTVER2-LABEL: v4f32_two_step2:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} xmm3 = [1,1,1,1] sched: [5:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
 ; BTVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [2:1.00]
 ; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm2 # sched: [2:1.00]
 ; BTVER2-NEXT:    vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
@@ -719,7 +719,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
 ; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm2 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} xmm3 = [1,1,1,1] sched: [6:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; SANDY-NEXT:    vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulps %xmm2, %xmm1, %xmm2 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
@@ -733,7 +733,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
 ; HASWELL-LABEL: v4f32_two_step2:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; HASWELL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
 ; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
 ; HASWELL-NEXT:    vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
@@ -746,7 +746,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
 ; HASWELL-NO-FMA:       # %bb.0:
 ; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm2 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm3 = [1,1,1,1] sched: [6:0.50]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; HASWELL-NO-FMA-NEXT:    vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm2, %xmm1, %xmm2 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
@@ -760,7 +760,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
 ; KNL-LABEL: v4f32_two_step2:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; KNL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
 ; KNL-NEXT:    vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
 ; KNL-NEXT:    vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
@@ -772,7 +772,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
 ; SKX-LABEL: v4f32_two_step2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vrcpps %xmm0, %xmm1 # sched: [4:1.00]
-; SKX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; SKX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; SKX-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [4:0.50]
 ; SKX-NEXT:    vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [4:0.50]
@@ -789,7 +789,7 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    rcpps %xmm1, %xmm4
 ; SSE-NEXT:    mulps %xmm4, %xmm1
-; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1,1,1,1]
+; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    movaps %xmm2, %xmm3
 ; SSE-NEXT:    subps %xmm1, %xmm3
 ; SSE-NEXT:    mulps %xmm4, %xmm3
@@ -809,7 +809,7 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
 ; AVX-RECIP:       # %bb.0:
 ; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm1
 ; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm2, %ymm0
 ; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
 ; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm1, %ymm0
@@ -834,7 +834,7 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
 ;
 ; BTVER2-LABEL: v8f32_one_step2:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
 ; BTVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [2:2.00]
 ; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
 ; BTVER2-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:2.00]
@@ -847,7 +847,7 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
 ; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; SANDY-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
@@ -857,7 +857,7 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
 ; HASWELL-LABEL: v8f32_one_step2:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
 ; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50]
 ; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
@@ -867,7 +867,7 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
 ; HASWELL-NO-FMA:       # %bb.0:
 ; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
@@ -877,7 +877,7 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
 ; KNL-LABEL: v8f32_one_step2:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; KNL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
 ; KNL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50]
 ; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
@@ -899,7 +899,7 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    rcpps %xmm0, %xmm2
 ; SSE-NEXT:    mulps %xmm2, %xmm0
-; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1,1,1,1]
+; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    movaps %xmm3, %xmm4
 ; SSE-NEXT:    subps %xmm0, %xmm4
 ; SSE-NEXT:    mulps %xmm2, %xmm4
@@ -909,9 +909,9 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
 ; SSE-NEXT:    subps %xmm1, %xmm3
 ; SSE-NEXT:    mulps %xmm0, %xmm3
 ; SSE-NEXT:    addps %xmm0, %xmm3
-; SSE-NEXT:    movaps {{.*#+}} xmm1 = [5,6,7,8]
+; SSE-NEXT:    movaps {{.*#+}} xmm1 = [5.0E+0,6.0E+0,7.0E+0,8.0E+0]
 ; SSE-NEXT:    mulps %xmm3, %xmm1
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,2,3,4]
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
 ; SSE-NEXT:    mulps %xmm4, %xmm0
 ; SSE-NEXT:    mulps %xmm4, %xmm0
 ; SSE-NEXT:    mulps %xmm3, %xmm1
@@ -921,7 +921,7 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
 ; AVX-RECIP:       # %bb.0:
 ; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm1
 ; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm2, %ymm0
 ; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
 ; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm1, %ymm0
@@ -949,7 +949,7 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
 ;
 ; BTVER2-LABEL: v8f32_one_step_2_divs:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
 ; BTVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [2:2.00]
 ; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
 ; BTVER2-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:2.00]
@@ -963,7 +963,7 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
 ; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; SANDY-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
@@ -974,7 +974,7 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
 ; HASWELL-LABEL: v8f32_one_step_2_divs:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
 ; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50]
 ; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [12:0.50]
@@ -985,7 +985,7 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
 ; HASWELL-NO-FMA:       # %bb.0:
 ; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
@@ -996,7 +996,7 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
 ; KNL-LABEL: v8f32_one_step_2_divs:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; KNL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
 ; KNL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50]
 ; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [12:0.50]
@@ -1023,7 +1023,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
 ; SSE-NEXT:    rcpps %xmm1, %xmm3
 ; SSE-NEXT:    movaps %xmm1, %xmm4
 ; SSE-NEXT:    mulps %xmm3, %xmm4
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,1,1,1]
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    movaps %xmm0, %xmm5
 ; SSE-NEXT:    subps %xmm4, %xmm5
 ; SSE-NEXT:    mulps %xmm3, %xmm5
@@ -1053,7 +1053,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
 ; AVX-RECIP:       # %bb.0:
 ; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm1
 ; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm2
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX-RECIP-NEXT:    vsubps %ymm2, %ymm3, %ymm2
 ; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm2
 ; AVX-RECIP-NEXT:    vaddps %ymm2, %ymm1, %ymm1
@@ -1067,7 +1067,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
 ; FMA-RECIP-LABEL: v8f32_two_step2:
 ; FMA-RECIP:       # %bb.0:
 ; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm1
-; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
+; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-RECIP-NEXT:    vmovaps %ymm1, %ymm3
 ; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2
 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1
@@ -1079,7 +1079,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
 ; BDVER2-LABEL: v8f32_two_step2:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [5:2.00]
-; BDVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:0.50]
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmaddps %ymm2, %ymm1, %ymm0, %ymm3 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfmaddps %ymm1, %ymm3, %ymm1, %ymm1 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
@@ -1089,7 +1089,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
 ;
 ; BTVER2-LABEL: v8f32_two_step2:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
 ; BTVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [2:2.00]
 ; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm2 # sched: [2:2.00]
 ; BTVER2-NEXT:    vsubps %ymm2, %ymm3, %ymm2 # sched: [3:2.00]
@@ -1106,7 +1106,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
 ; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm2 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; SANDY-NEXT:    vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm2 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00]
@@ -1120,7 +1120,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
 ; HASWELL-LABEL: v8f32_two_step2:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; HASWELL-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:1.00]
 ; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [5:0.50]
 ; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [5:0.50]
@@ -1133,7 +1133,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
 ; HASWELL-NO-FMA:       # %bb.0:
 ; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm2 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; HASWELL-NO-FMA-NEXT:    vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm2 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00]
@@ -1147,7 +1147,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
 ; KNL-LABEL: v8f32_two_step2:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; KNL-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:1.00]
 ; KNL-NEXT:    vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [5:0.50]
 ; KNL-NEXT:    vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [5:0.50]
@@ -1159,7 +1159,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
 ; SKX-LABEL: v8f32_two_step2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vrcpps %ymm0, %ymm1 # sched: [4:1.00]
-; SKX-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; SKX-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; SKX-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:0.33]
 ; SKX-NEXT:    vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [4:0.50]
 ; SKX-NEXT:    vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [4:0.50]
@@ -1300,7 +1300,7 @@ define <16 x float> @v16f32_one_step2(<16 x float> %x) #1 {
 ; SSE-NEXT:    movaps %xmm0, %xmm6
 ; SSE-NEXT:    rcpps %xmm3, %xmm2
 ; SSE-NEXT:    mulps %xmm2, %xmm4
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,1,1,1]
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    movaps %xmm0, %xmm3
 ; SSE-NEXT:    subps %xmm4, %xmm3
 ; SSE-NEXT:    mulps %xmm2, %xmm3
@@ -1333,7 +1333,7 @@ define <16 x float> @v16f32_one_step2(<16 x float> %x) #1 {
 ; AVX-RECIP:       # %bb.0:
 ; AVX-RECIP-NEXT:    vrcpps %ymm1, %ymm2
 ; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm1
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX-RECIP-NEXT:    vsubps %ymm1, %ymm3, %ymm1
 ; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm2, %ymm1
 ; AVX-RECIP-NEXT:    vaddps %ymm1, %ymm2, %ymm1
@@ -1349,7 +1349,7 @@ define <16 x float> @v16f32_one_step2(<16 x float> %x) #1 {
 ; FMA-RECIP-LABEL: v16f32_one_step2:
 ; FMA-RECIP:       # %bb.0:
 ; FMA-RECIP-NEXT:    vrcpps %ymm1, %ymm2
-; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
+; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm3
 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm2) + ymm2
 ; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm2
@@ -1362,7 +1362,7 @@ define <16 x float> @v16f32_one_step2(<16 x float> %x) #1 {
 ; BDVER2-LABEL: v16f32_one_step2:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    vrcpps %ymm1, %ymm2 # sched: [5:2.00]
-; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [5:0.50]
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:0.50]
 ; BDVER2-NEXT:    vrcpps %ymm0, %ymm4 # sched: [5:2.00]
 ; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm1, %ymm1 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm4, %ymm0, %ymm0 # sched: [5:0.50]
@@ -1374,7 +1374,7 @@ define <16 x float> @v16f32_one_step2(<16 x float> %x) #1 {
 ;
 ; BTVER2-LABEL: v16f32_one_step2:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
 ; BTVER2-NEXT:    vrcpps %ymm1, %ymm2 # sched: [2:2.00]
 ; BTVER2-NEXT:    vrcpps %ymm0, %ymm4 # sched: [2:2.00]
 ; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm1 # sched: [2:2.00]
@@ -1393,7 +1393,7 @@ define <16 x float> @v16f32_one_step2(<16 x float> %x) #1 {
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vrcpps %ymm1, %ymm2 # sched: [7:2.00]
 ; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm1 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; SANDY-NEXT:    vsubps %ymm1, %ymm3, %ymm1 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %ymm1, %ymm2, %ymm1 # sched: [3:1.00]
@@ -1409,7 +1409,7 @@ define <16 x float> @v16f32_one_step2(<16 x float> %x) #1 {
 ; HASWELL-LABEL: v16f32_one_step2:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vrcpps %ymm1, %ymm2 # sched: [11:2.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; HASWELL-NEXT:    vrcpps %ymm0, %ymm4 # sched: [11:2.00]
 ; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm3 sched: [5:0.50]
 ; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm2) + ymm2 sched: [5:0.50]
@@ -1423,7 +1423,7 @@ define <16 x float> @v16f32_one_step2(<16 x float> %x) #1 {
 ; HASWELL-NO-FMA:       # %bb.0:
 ; HASWELL-NO-FMA-NEXT:    vrcpps %ymm1, %ymm2 # sched: [11:2.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm1 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; HASWELL-NO-FMA-NEXT:    vsubps %ymm1, %ymm3, %ymm1 # sched: [3:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vaddps %ymm1, %ymm2, %ymm1 # sched: [3:1.00]
@@ -1460,7 +1460,7 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 {
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    rcpps %xmm0, %xmm6
 ; SSE-NEXT:    mulps %xmm6, %xmm0
-; SSE-NEXT:    movaps {{.*#+}} xmm4 = [1,1,1,1]
+; SSE-NEXT:    movaps {{.*#+}} xmm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    movaps %xmm4, %xmm5
 ; SSE-NEXT:    subps %xmm0, %xmm5
 ; SSE-NEXT:    mulps %xmm6, %xmm5
@@ -1482,13 +1482,13 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 {
 ; SSE-NEXT:    subps %xmm3, %xmm4
 ; SSE-NEXT:    mulps %xmm0, %xmm4
 ; SSE-NEXT:    addps %xmm0, %xmm4
-; SSE-NEXT:    movaps {{.*#+}} xmm3 = [13,14,15,16]
+; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1.3E+1,1.4E+1,1.5E+1,1.6E+1]
 ; SSE-NEXT:    mulps %xmm4, %xmm3
-; SSE-NEXT:    movaps {{.*#+}} xmm2 = [9,10,11,12]
+; SSE-NEXT:    movaps {{.*#+}} xmm2 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1]
 ; SSE-NEXT:    mulps %xmm7, %xmm2
-; SSE-NEXT:    movaps {{.*#+}} xmm1 = [5,6,7,8]
+; SSE-NEXT:    movaps {{.*#+}} xmm1 = [5.0E+0,6.0E+0,7.0E+0,8.0E+0]
 ; SSE-NEXT:    mulps %xmm6, %xmm1
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,2,3,4]
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
 ; SSE-NEXT:    mulps %xmm5, %xmm0
 ; SSE-NEXT:    mulps %xmm5, %xmm0
 ; SSE-NEXT:    mulps %xmm6, %xmm1
@@ -1500,7 +1500,7 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 {
 ; AVX-RECIP:       # %bb.0:
 ; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm2
 ; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm0, %ymm0
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm3, %ymm0
 ; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm2, %ymm0
 ; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm2, %ymm0
@@ -1518,7 +1518,7 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 {
 ; FMA-RECIP-LABEL: v16f32_one_step_2_divs:
 ; FMA-RECIP:       # %bb.0:
 ; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm2
-; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
+; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3
 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2
 ; FMA-RECIP-NEXT:    vrcpps %ymm1, %ymm2
@@ -1533,7 +1533,7 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 {
 ; BDVER2-LABEL: v16f32_one_step_2_divs:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    vrcpps %ymm0, %ymm2 # sched: [5:2.00]
-; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [5:0.50]
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm0, %ymm0 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfmaddps %ymm2, %ymm0, %ymm2, %ymm0 # sched: [5:0.50]
 ; BDVER2-NEXT:    vrcpps %ymm1, %ymm2 # sched: [5:2.00]
@@ -1547,7 +1547,7 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 {
 ;
 ; BTVER2-LABEL: v16f32_one_step_2_divs:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
 ; BTVER2-NEXT:    vrcpps %ymm0, %ymm2 # sched: [2:2.00]
 ; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [2:2.00]
 ; BTVER2-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:2.00]
@@ -1568,7 +1568,7 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 {
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vrcpps %ymm0, %ymm2 # sched: [7:2.00]
 ; SANDY-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; SANDY-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vrcpps %ymm1, %ymm4 # sched: [7:2.00]
 ; SANDY-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [5:1.00]
@@ -1586,7 +1586,7 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 {
 ; HASWELL-LABEL: v16f32_one_step_2_divs:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vrcpps %ymm0, %ymm2 # sched: [11:2.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3 sched: [5:0.50]
 ; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2 sched: [5:0.50]
 ; HASWELL-NEXT:    vrcpps %ymm1, %ymm2 # sched: [11:2.00]
@@ -1602,7 +1602,7 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 {
 ; HASWELL-NO-FMA:       # %bb.0:
 ; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm2 # sched: [11:2.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
 ; HASWELL-NO-FMA-NEXT:    vrcpps %ymm1, %ymm4 # sched: [11:2.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [5:0.50]
@@ -1647,7 +1647,7 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
 ; SSE-NEXT:    movaps %xmm0, %xmm4
 ; SSE-NEXT:    rcpps %xmm3, %xmm2
 ; SSE-NEXT:    mulps %xmm2, %xmm3
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,1,1,1]
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    movaps %xmm0, %xmm7
 ; SSE-NEXT:    subps %xmm3, %xmm7
 ; SSE-NEXT:    mulps %xmm2, %xmm7
@@ -1703,7 +1703,7 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
 ; AVX-RECIP:       # %bb.0:
 ; AVX-RECIP-NEXT:    vrcpps %ymm1, %ymm2
 ; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm3
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1]
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX-RECIP-NEXT:    vsubps %ymm3, %ymm4, %ymm3
 ; AVX-RECIP-NEXT:    vmulps %ymm3, %ymm2, %ymm3
 ; AVX-RECIP-NEXT:    vaddps %ymm3, %ymm2, %ymm2
@@ -1727,7 +1727,7 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
 ; FMA-RECIP-LABEL: v16f32_two_step2:
 ; FMA-RECIP:       # %bb.0:
 ; FMA-RECIP-NEXT:    vrcpps %ymm1, %ymm2
-; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
+; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-RECIP-NEXT:    vmovaps %ymm2, %ymm4
 ; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm4 = -(ymm1 * ymm4) + ymm3
 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2
@@ -1746,7 +1746,7 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
 ; BDVER2-LABEL: v16f32_two_step2:
 ; BDVER2:       # %bb.0:
 ; BDVER2-NEXT:    vrcpps %ymm1, %ymm2 # sched: [5:2.00]
-; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [5:0.50]
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm1, %ymm4 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfmaddps %ymm2, %ymm4, %ymm2, %ymm2 # sched: [5:0.50]
 ; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm1, %ymm1 # sched: [5:0.50]
@@ -1762,7 +1762,7 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
 ;
 ; BTVER2-LABEL: v16f32_two_step2:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
 ; BTVER2-NEXT:    vrcpps %ymm1, %ymm2 # sched: [2:2.00]
 ; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm3 # sched: [2:2.00]
 ; BTVER2-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:2.00]
@@ -1789,7 +1789,7 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vrcpps %ymm1, %ymm2 # sched: [7:2.00]
 ; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm3 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; SANDY-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulps %ymm3, %ymm2, %ymm3 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %ymm3, %ymm2, %ymm2 # sched: [3:1.00]
@@ -1813,7 +1813,7 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
 ; HASWELL-LABEL: v16f32_two_step2:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vrcpps %ymm1, %ymm2 # sched: [11:2.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; HASWELL-NEXT:    vmovaps %ymm2, %ymm4 # sched: [1:1.00]
 ; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm4 = -(ymm1 * ymm4) + ymm3 sched: [5:0.50]
 ; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2 sched: [5:0.50]
@@ -1833,7 +1833,7 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
 ; HASWELL-NO-FMA:       # %bb.0:
 ; HASWELL-NO-FMA-NEXT:    vrcpps %ymm1, %ymm2 # sched: [11:2.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm3 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; HASWELL-NO-FMA-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm3, %ymm2, %ymm3 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vaddps %ymm3, %ymm2, %ymm2 # sched: [3:1.00]
@@ -1857,7 +1857,7 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
 ; KNL-LABEL: v16f32_two_step2:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vrcp14ps %zmm0, %zmm1 # sched: [11:2.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [10:1.00]
+; KNL-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [10:1.00]
 ; KNL-NEXT:    vmovaps %zmm1, %zmm3 # sched: [1:1.00]
 ; KNL-NEXT:    vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 sched: [5:0.50]
 ; KNL-NEXT:    vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 sched: [5:0.50]
@@ -1869,7 +1869,7 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
 ; SKX-LABEL: v16f32_two_step2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vrcp14ps %zmm0, %zmm1 # sched: [4:2.00]
-; SKX-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [8:0.50]
+; SKX-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [8:0.50]
 ; SKX-NEXT:    vmovaps %zmm1, %zmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 sched: [4:0.50]
 ; SKX-NEXT:    vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 sched: [4:0.50]
diff --git a/test/CodeGen/X86/select_const.ll b/test/CodeGen/X86/select_const.ll
index dcd57d2f537..68c83391e60 100644
--- a/test/CodeGen/X86/select_const.ll
+++ b/test/CodeGen/X86/select_const.ll
@@ -468,10 +468,10 @@ define <2 x double> @sel_constants_fmul_constant_vec(i1 %cond) {
 ; CHECK-NEXT:    testb $1, %dil
 ; CHECK-NEXT:    jne .LBB37_1
 ; CHECK-NEXT:  # %bb.2:
-; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [118.83,34.539999999999999]
+; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [1.1883E+2,3.4539999999999999E+1]
 ; CHECK-NEXT:    retq
 ; CHECK-NEXT:  .LBB37_1:
-; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [-20.399999999999999,37.68]
+; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [-2.0399999999999999E+1,3.768E+1]
 ; CHECK-NEXT:    retq
   %sel = select i1 %cond, <2 x double> <double -4.0, double 12.0>, <2 x double> <double 23.3, double 11.0>
   %bo = fmul <2 x double> %sel, <double 5.1, double 3.14>
diff --git a/test/CodeGen/X86/splat-for-size.ll b/test/CodeGen/X86/splat-for-size.ll
index 7567dbcdad0..7aae59080fd 100644
--- a/test/CodeGen/X86/splat-for-size.ll
+++ b/test/CodeGen/X86/splat-for-size.ll
@@ -9,7 +9,7 @@
 define <2 x double> @splat_v2f64(<2 x double> %x) #0 {
 ; CHECK-LABEL: splat_v2f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovddup {{.*#+}} xmm1 = [1,1]
+; CHECK-NEXT:    vmovddup {{.*#+}} xmm1 = [1.0E+0,1.0E+0]
 ; CHECK-NEXT:    # xmm1 = mem[0,0]
 ; CHECK-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
@@ -20,7 +20,7 @@ define <2 x double> @splat_v2f64(<2 x double> %x) #0 {
 define <4 x double> @splat_v4f64(<4 x double> %x) #1 {
 ; CHECK-LABEL: splat_v4f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1,1,1,1]
+; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; CHECK-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    retq
   %add = fadd <4 x double> %x, <double 1.0, double 1.0, double 1.0, double 1.0>
@@ -30,7 +30,7 @@ define <4 x double> @splat_v4f64(<4 x double> %x) #1 {
 define <4 x float> @splat_v4f32(<4 x float> %x) #0 {
 ; CHECK-LABEL: splat_v4f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1,1,1,1]
+; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %add = fadd <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
@@ -40,7 +40,7 @@ define <4 x float> @splat_v4f32(<4 x float> %x) #0 {
 define <8 x float> @splat_v8f32(<8 x float> %x) #1 {
 ; CHECK-LABEL: splat_v8f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
+; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; CHECK-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    retq
   %add = fadd <8 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
diff --git a/test/CodeGen/X86/sqrt-fastmath.ll b/test/CodeGen/X86/sqrt-fastmath.ll
index 78a0514b91d..6e0273d513f 100644
--- a/test/CodeGen/X86/sqrt-fastmath.ll
+++ b/test/CodeGen/X86/sqrt-fastmath.ll
@@ -178,7 +178,7 @@ define <4 x float> @sqrt_v4f32_check_denorms(<4 x float> %x) #3 {
 ; SSE-NEXT:    rsqrtps %xmm0, %xmm2
 ; SSE-NEXT:    movaps %xmm0, %xmm1
 ; SSE-NEXT:    mulps %xmm2, %xmm1
-; SSE-NEXT:    movaps {{.*#+}} xmm3 = [-0.5,-0.5,-0.5,-0.5]
+; SSE-NEXT:    movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
 ; SSE-NEXT:    mulps %xmm1, %xmm3
 ; SSE-NEXT:    mulps %xmm2, %xmm1
 ; SSE-NEXT:    addps {{.*}}(%rip), %xmm1
@@ -208,9 +208,9 @@ define <4 x float> @sqrt_v4f32_check_denorms(<4 x float> %x) #3 {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vrsqrtps %xmm0, %xmm1
 ; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm2
-; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-3,-3,-3,-3]
+; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
 ; AVX512-NEXT:    vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3
-; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.5,-0.5,-0.5,-0.5]
+; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
 ; AVX512-NEXT:    vmulps %xmm3, %xmm1, %xmm1
 ; AVX512-NEXT:    vmulps %xmm1, %xmm2, %xmm1
 ; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
@@ -282,21 +282,21 @@ define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 {
 ; SSE-LABEL: v4f32_no_estimate:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    sqrtps %xmm0, %xmm1
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,1,1,1]
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    divps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: v4f32_no_estimate:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vsqrtps %xmm0, %xmm0
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm1 = [1,1,1,1]
+; AVX1-NEXT:    vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX1-NEXT:    vdivps %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX512-LABEL: v4f32_no_estimate:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vsqrtps %xmm0, %xmm0
-; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1,1,1,1]
+; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX512-NEXT:    vdivps %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    retq
   %sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
@@ -331,9 +331,9 @@ define <4 x float> @v4f32_estimate(<4 x float> %x) #1 {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vrsqrtps %xmm0, %xmm1
 ; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-3,-3,-3,-3]
+; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
 ; AVX512-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
-; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-0.5,-0.5,-0.5,-0.5]
+; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
 ; AVX512-NEXT:    vmulps %xmm0, %xmm2, %xmm0
 ; AVX512-NEXT:    vmulps %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    retq
@@ -347,7 +347,7 @@ define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    sqrtps %xmm1, %xmm2
 ; SSE-NEXT:    sqrtps %xmm0, %xmm3
-; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1,1,1,1]
+; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    movaps %xmm1, %xmm0
 ; SSE-NEXT:    divps %xmm3, %xmm0
 ; SSE-NEXT:    divps %xmm2, %xmm1
@@ -356,14 +356,14 @@ define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
 ; AVX1-LABEL: v8f32_no_estimate:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vsqrtps %ymm0, %ymm0
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
+; AVX1-NEXT:    vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX1-NEXT:    vdivps %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX512-LABEL: v8f32_no_estimate:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vsqrtps %ymm0, %ymm0
-; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
+; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX512-NEXT:    vdivps %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    retq
   %sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x)
@@ -375,11 +375,11 @@ define <8 x float> @v8f32_estimate(<8 x float> %x) #1 {
 ; SSE-LABEL: v8f32_estimate:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    rsqrtps %xmm0, %xmm3
-; SSE-NEXT:    movaps {{.*#+}} xmm4 = [-0.5,-0.5,-0.5,-0.5]
+; SSE-NEXT:    movaps {{.*#+}} xmm4 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
 ; SSE-NEXT:    movaps %xmm3, %xmm2
 ; SSE-NEXT:    mulps %xmm3, %xmm2
 ; SSE-NEXT:    mulps %xmm0, %xmm2
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [-3,-3,-3,-3]
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
 ; SSE-NEXT:    addps %xmm0, %xmm2
 ; SSE-NEXT:    mulps %xmm4, %xmm2
 ; SSE-NEXT:    mulps %xmm3, %xmm2
@@ -408,9 +408,9 @@ define <8 x float> @v8f32_estimate(<8 x float> %x) #1 {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vrsqrtps %ymm0, %ymm1
 ; AVX512-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-3,-3,-3,-3,-3,-3,-3,-3]
+; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
 ; AVX512-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2
-; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5]
+; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
 ; AVX512-NEXT:    vmulps %ymm0, %ymm2, %ymm0
 ; AVX512-NEXT:    vmulps %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    retq
@@ -426,7 +426,7 @@ define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 {
 ; SSE-NEXT:    sqrtps %xmm2, %xmm5
 ; SSE-NEXT:    sqrtps %xmm1, %xmm2
 ; SSE-NEXT:    sqrtps %xmm0, %xmm1
-; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1,1,1,1]
+; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    movaps %xmm3, %xmm0
 ; SSE-NEXT:    divps %xmm1, %xmm0
 ; SSE-NEXT:    movaps %xmm3, %xmm1
@@ -440,7 +440,7 @@ define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 {
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vsqrtps %ymm1, %ymm1
 ; AVX1-NEXT:    vsqrtps %ymm0, %ymm0
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
+; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX1-NEXT:    vdivps %ymm0, %ymm2, %ymm0
 ; AVX1-NEXT:    vdivps %ymm1, %ymm2, %ymm1
 ; AVX1-NEXT:    retq
@@ -448,7 +448,7 @@ define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 {
 ; AVX512-LABEL: v16f32_no_estimate:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vsqrtps %zmm0, %zmm0
-; AVX512-NEXT:    vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512-NEXT:    vbroadcastss {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX512-NEXT:    vdivps %zmm0, %zmm1, %zmm0
 ; AVX512-NEXT:    retq
   %sqrt = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %x)
@@ -462,11 +462,11 @@ define <16 x float> @v16f32_estimate(<16 x float> %x) #1 {
 ; SSE-NEXT:    movaps %xmm1, %xmm4
 ; SSE-NEXT:    movaps %xmm0, %xmm1
 ; SSE-NEXT:    rsqrtps %xmm0, %xmm5
-; SSE-NEXT:    movaps {{.*#+}} xmm6 = [-0.5,-0.5,-0.5,-0.5]
+; SSE-NEXT:    movaps {{.*#+}} xmm6 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
 ; SSE-NEXT:    movaps %xmm5, %xmm0
 ; SSE-NEXT:    mulps %xmm5, %xmm0
 ; SSE-NEXT:    mulps %xmm1, %xmm0
-; SSE-NEXT:    movaps {{.*#+}} xmm7 = [-3,-3,-3,-3]
+; SSE-NEXT:    movaps {{.*#+}} xmm7 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
 ; SSE-NEXT:    addps %xmm7, %xmm0
 ; SSE-NEXT:    mulps %xmm6, %xmm0
 ; SSE-NEXT:    mulps %xmm5, %xmm0
@@ -498,10 +498,10 @@ define <16 x float> @v16f32_estimate(<16 x float> %x) #1 {
 ; AVX1-LABEL: v16f32_estimate:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vrsqrtps %ymm0, %ymm2
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm3 = [-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5]
+; AVX1-NEXT:    vmovaps {{.*#+}} ymm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
 ; AVX1-NEXT:    vmulps %ymm2, %ymm2, %ymm4
 ; AVX1-NEXT:    vmulps %ymm4, %ymm0, %ymm0
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [-3,-3,-3,-3,-3,-3,-3,-3]
+; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
 ; AVX1-NEXT:    vaddps %ymm4, %ymm0, %ymm0
 ; AVX1-NEXT:    vmulps %ymm0, %ymm3, %ymm0
 ; AVX1-NEXT:    vmulps %ymm0, %ymm2, %ymm0
diff --git a/test/CodeGen/X86/sse2.ll b/test/CodeGen/X86/sse2.ll
index b2efb5b2933..be019aff514 100644
--- a/test/CodeGen/X86/sse2.ll
+++ b/test/CodeGen/X86/sse2.ll
@@ -395,7 +395,7 @@ define void @test12() nounwind {
 ; SSE-LABEL: test12:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movapd 0, %xmm0
-; SSE-NEXT:    movapd {{.*#+}} xmm1 = [1,1,1,1]
+; SSE-NEXT:    movapd {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
 ; SSE-NEXT:    xorps %xmm2, %xmm2
 ; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
@@ -416,7 +416,7 @@ define void @test12() nounwind {
 ; AVX512-LABEL: test12:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vmovaps 0, %xmm0
-; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1,1,1,1]
+; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX512-NEXT:    vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3]
 ; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; AVX512-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
diff --git a/test/CodeGen/X86/subvector-broadcast.ll b/test/CodeGen/X86/subvector-broadcast.ll
index 5e93944c7ac..a05288ac031 100644
--- a/test/CodeGen/X86/subvector-broadcast.ll
+++ b/test/CodeGen/X86/subvector-broadcast.ll
@@ -949,7 +949,7 @@ entry:
 define void @fallback_broadcast_v4f64_to_v8f64(<4 x double> %a, <8 x double> %b) {
 ; X32-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64:
 ; X32-AVX:       # %bb.0: # %entry
-; X32-AVX-NEXT:    vmovapd {{.*#+}} ymm3 = [1,2,3,4]
+; X32-AVX-NEXT:    vmovapd {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
 ; X32-AVX-NEXT:    vaddpd %ymm3, %ymm0, %ymm0
 ; X32-AVX-NEXT:    vaddpd %ymm3, %ymm2, %ymm2
 ; X32-AVX-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
@@ -963,7 +963,7 @@ define void @fallback_broadcast_v4f64_to_v8f64(<4 x double> %a, <8 x double> %b)
 ;
 ; X32-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64:
 ; X32-AVX512:       # %bb.0: # %entry
-; X32-AVX512-NEXT:    vmovapd {{.*#+}} ymm2 = [1,2,3,4]
+; X32-AVX512-NEXT:    vmovapd {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
 ; X32-AVX512-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
 ; X32-AVX512-NEXT:    vinsertf64x4 $1, %ymm2, %zmm2, %zmm2
 ; X32-AVX512-NEXT:    vaddpd %zmm2, %zmm1, %zmm1
@@ -975,7 +975,7 @@ define void @fallback_broadcast_v4f64_to_v8f64(<4 x double> %a, <8 x double> %b)
 ;
 ; X64-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64:
 ; X64-AVX:       # %bb.0: # %entry
-; X64-AVX-NEXT:    vmovapd {{.*#+}} ymm3 = [1,2,3,4]
+; X64-AVX-NEXT:    vmovapd {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
 ; X64-AVX-NEXT:    vaddpd %ymm3, %ymm0, %ymm0
 ; X64-AVX-NEXT:    vaddpd %ymm3, %ymm2, %ymm2
 ; X64-AVX-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
@@ -989,7 +989,7 @@ define void @fallback_broadcast_v4f64_to_v8f64(<4 x double> %a, <8 x double> %b)
 ;
 ; X64-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64:
 ; X64-AVX512:       # %bb.0: # %entry
-; X64-AVX512-NEXT:    vmovapd {{.*#+}} ymm2 = [1,2,3,4]
+; X64-AVX512-NEXT:    vmovapd {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
 ; X64-AVX512-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
 ; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm2, %zmm2, %zmm2
 ; X64-AVX512-NEXT:    vaddpd %zmm2, %zmm1, %zmm1
diff --git a/test/CodeGen/X86/v4f32-immediate.ll b/test/CodeGen/X86/v4f32-immediate.ll
index a0eb4092599..690ef825f7a 100644
--- a/test/CodeGen/X86/v4f32-immediate.ll
+++ b/test/CodeGen/X86/v4f32-immediate.ll
@@ -5,12 +5,12 @@
 define <4 x float> @foo() {
 ; X32-LABEL: foo:
 ; X32:       # %bb.0:
-; X32-NEXT:    movaps {{.*#+}} xmm0 = [3.22354245,2.29999995,1.20000005,0.100000001]
+; X32-NEXT:    movaps {{.*#+}} xmm0 = [3.22354245E+0,2.29999995E+0,1.20000005E+0,1.00000001E-1]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: foo:
 ; X64:       # %bb.0:
-; X64-NEXT:    movaps {{.*#+}} xmm0 = [3.22354245,2.29999995,1.20000005,0.100000001]
+; X64-NEXT:    movaps {{.*#+}} xmm0 = [3.22354245E+0,2.29999995E+0,1.20000005E+0,1.00000001E-1]
 ; X64-NEXT:    retq
   ret <4 x float> <float 0x4009C9D0A0000000, float 0x4002666660000000, float 0x3FF3333340000000, float 0x3FB99999A0000000>
 }
diff --git a/test/CodeGen/X86/vec_cast3.ll b/test/CodeGen/X86/vec_cast3.ll
index b805b336106..e0cc4f3e396 100644
--- a/test/CodeGen/X86/vec_cast3.ll
+++ b/test/CodeGen/X86/vec_cast3.ll
@@ -90,7 +90,7 @@ define <2 x float> @cvt_v2u32_v2f32(<2 x i32> %src) {
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; CHECK-NEXT:    vmovaps {{.*#+}} xmm1 = [4503599627370496,4503599627370496]
+; CHECK-NEXT:    vmovaps {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
 ; CHECK-NEXT:    vorps %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vsubpd %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vcvtpd2ps %xmm0, %xmm0
@@ -99,7 +99,7 @@ define <2 x float> @cvt_v2u32_v2f32(<2 x i32> %src) {
 ; CHECK-WIDE-LABEL: cvt_v2u32_v2f32:
 ; CHECK-WIDE:       ## %bb.0:
 ; CHECK-WIDE-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; CHECK-WIDE-NEXT:    vmovdqa {{.*#+}} xmm1 = [4503599627370496,4503599627370496]
+; CHECK-WIDE-NEXT:    vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
 ; CHECK-WIDE-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; CHECK-WIDE-NEXT:    vsubpd %xmm1, %xmm0, %xmm0
 ; CHECK-WIDE-NEXT:    vcvtpd2ps %xmm0, %xmm0
diff --git a/test/CodeGen/X86/vec_floor.ll b/test/CodeGen/X86/vec_floor.ll
index 3bbc468d07d..ef499af7540 100644
--- a/test/CodeGen/X86/vec_floor.ll
+++ b/test/CodeGen/X86/vec_floor.ll
@@ -703,17 +703,17 @@ declare <16 x float> @llvm.nearbyint.v16f32(<16 x float> %p)
 define <2 x double> @const_floor_v2f64() {
 ; SSE41-LABEL: const_floor_v2f64:
 ; SSE41:       ## %bb.0:
-; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [-2,2]
+; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [-2.0E+0,2.0E+0]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: const_floor_v2f64:
 ; AVX:       ## %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [-2,2]
+; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [-2.0E+0,2.0E+0]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: const_floor_v2f64:
 ; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [-2,2]
+; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [-2.0E+0,2.0E+0]
 ; AVX512-NEXT:    retq
   %t = call <2 x double> @llvm.floor.v2f64(<2 x double> <double -1.5, double 2.5>)
   ret <2 x double> %t
@@ -722,17 +722,17 @@ define <2 x double> @const_floor_v2f64() {
 define <4 x float> @const_floor_v4f32() {
 ; SSE41-LABEL: const_floor_v4f32:
 ; SSE41:       ## %bb.0:
-; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [-4,6,-9,2]
+; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [-4.0E+0,6.0E+0,-9.0E+0,2.0E+0]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: const_floor_v4f32:
 ; AVX:       ## %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [-4,6,-9,2]
+; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [-4.0E+0,6.0E+0,-9.0E+0,2.0E+0]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: const_floor_v4f32:
 ; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [-4,6,-9,2]
+; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [-4.0E+0,6.0E+0,-9.0E+0,2.0E+0]
 ; AVX512-NEXT:    retq
   %t = call <4 x float> @llvm.floor.v4f32(<4 x float> <float -3.5, float 6.0, float -9.0, float 2.5>)
   ret <4 x float> %t
@@ -741,17 +741,17 @@ define <4 x float> @const_floor_v4f32() {
 define <2 x double> @const_ceil_v2f64() {
 ; SSE41-LABEL: const_ceil_v2f64:
 ; SSE41:       ## %bb.0:
-; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [-1,3]
+; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [-1.0E+0,3.0E+0]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: const_ceil_v2f64:
 ; AVX:       ## %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [-1,3]
+; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [-1.0E+0,3.0E+0]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: const_ceil_v2f64:
 ; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [-1,3]
+; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [-1.0E+0,3.0E+0]
 ; AVX512-NEXT:    retq
   %t = call <2 x double> @llvm.ceil.v2f64(<2 x double> <double -1.5, double 2.5>)
   ret <2 x double> %t
@@ -760,17 +760,17 @@ define <2 x double> @const_ceil_v2f64() {
 define <4 x float> @const_ceil_v4f32() {
 ; SSE41-LABEL: const_ceil_v4f32:
 ; SSE41:       ## %bb.0:
-; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [-3,6,-9,3]
+; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,3.0E+0]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: const_ceil_v4f32:
 ; AVX:       ## %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [-3,6,-9,3]
+; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,3.0E+0]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: const_ceil_v4f32:
 ; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [-3,6,-9,3]
+; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,3.0E+0]
 ; AVX512-NEXT:    retq
   %t = call <4 x float> @llvm.ceil.v4f32(<4 x float> <float -3.5, float 6.0, float -9.0, float 2.5>)
   ret <4 x float> %t
@@ -779,17 +779,17 @@ define <4 x float> @const_ceil_v4f32() {
 define <2 x double> @const_trunc_v2f64() {
 ; SSE41-LABEL: const_trunc_v2f64:
 ; SSE41:       ## %bb.0:
-; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [-1,2]
+; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [-1.0E+0,2.0E+0]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: const_trunc_v2f64:
 ; AVX:       ## %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [-1,2]
+; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [-1.0E+0,2.0E+0]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: const_trunc_v2f64:
 ; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [-1,2]
+; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [-1.0E+0,2.0E+0]
 ; AVX512-NEXT:    retq
   %t = call <2 x double> @llvm.trunc.v2f64(<2 x double> <double -1.5, double 2.5>)
   ret <2 x double> %t
@@ -798,17 +798,17 @@ define <2 x double> @const_trunc_v2f64() {
 define <4 x float> @const_trunc_v4f32() {
 ; SSE41-LABEL: const_trunc_v4f32:
 ; SSE41:       ## %bb.0:
-; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [-3,6,-9,2]
+; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,2.0E+0]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: const_trunc_v4f32:
 ; AVX:       ## %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [-3,6,-9,2]
+; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,2.0E+0]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: const_trunc_v4f32:
 ; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [-3,6,-9,2]
+; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,2.0E+0]
 ; AVX512-NEXT:    retq
   %t = call <4 x float> @llvm.trunc.v4f32(<4 x float> <float -3.5, float 6.0, float -9.0, float 2.5>)
   ret <4 x float> %t
diff --git a/test/CodeGen/X86/vec_fp_to_int.ll b/test/CodeGen/X86/vec_fp_to_int.ll
index 84a4385f2c9..e80abc91cd1 100644
--- a/test/CodeGen/X86/vec_fp_to_int.ll
+++ b/test/CodeGen/X86/vec_fp_to_int.ll
@@ -630,7 +630,7 @@ define <4 x i32> @fptoui_4f64_to_2i32(<2 x double> %a) {
 ; AVX1-LABEL: fptoui_4f64_to_2i32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vmovapd %xmm0, %xmm0
-; AVX1-NEXT:    vmovapd {{.*#+}} ymm1 = [2147483648,2147483648,2147483648,2147483648]
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
 ; AVX1-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
 ; AVX1-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
@@ -645,7 +645,7 @@ define <4 x i32> @fptoui_4f64_to_2i32(<2 x double> %a) {
 ; AVX2-LABEL: fptoui_4f64_to_2i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vmovapd %xmm0, %xmm0
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
 ; AVX2-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
 ; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm3
 ; AVX2-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
@@ -952,7 +952,7 @@ define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) {
 ;
 ; AVX1-LABEL: fptoui_4f64_to_4i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovapd {{.*#+}} ymm1 = [2147483648,2147483648,2147483648,2147483648]
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
 ; AVX1-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
 ; AVX1-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
@@ -966,7 +966,7 @@ define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) {
 ;
 ; AVX2-LABEL: fptoui_4f64_to_4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
 ; AVX2-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
 ; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm3
 ; AVX2-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
diff --git a/test/CodeGen/X86/vec_fpext.ll b/test/CodeGen/X86/vec_fpext.ll
index 7bc05fb39f0..b66d5d1bfff 100644
--- a/test/CodeGen/X86/vec_fpext.ll
+++ b/test/CodeGen/X86/vec_fpext.ll
@@ -253,42 +253,42 @@ entry:
 define <2 x double> @fpext_fromconst() {
 ; X32-SSE-LABEL: fpext_fromconst:
 ; X32-SSE:       # %bb.0: # %entry
-; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,-2]
+; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.0E+0,-2.0E+0]
 ; X32-SSE-NEXT:    # encoding: [0x0f,0x28,0x05,A,A,A,A]
 ; X32-SSE-NEXT:    # fixup A - offset: 3, value: {{\.LCPI.*}}, kind: FK_Data_4
 ; X32-SSE-NEXT:    retl # encoding: [0xc3]
 ;
 ; X32-AVX-LABEL: fpext_fromconst:
 ; X32-AVX:       # %bb.0: # %entry
-; X32-AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [1,-2]
+; X32-AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [1.0E+0,-2.0E+0]
 ; X32-AVX-NEXT:    # encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
 ; X32-AVX-NEXT:    # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4
 ; X32-AVX-NEXT:    retl # encoding: [0xc3]
 ;
 ; X32-AVX512VL-LABEL: fpext_fromconst:
 ; X32-AVX512VL:       # %bb.0: # %entry
-; X32-AVX512VL-NEXT:    vmovaps {{\.LCPI.*}}, %xmm0 # EVEX TO VEX Compression xmm0 = [1,-2]
+; X32-AVX512VL-NEXT:    vmovaps {{\.LCPI.*}}, %xmm0 # EVEX TO VEX Compression xmm0 = [1.0E+0,-2.0E+0]
 ; X32-AVX512VL-NEXT:    # encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
 ; X32-AVX512VL-NEXT:    # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4
 ; X32-AVX512VL-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-SSE-LABEL: fpext_fromconst:
 ; X64-SSE:       # %bb.0: # %entry
-; X64-SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,-2]
+; X64-SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.0E+0,-2.0E+0]
 ; X64-SSE-NEXT:    # encoding: [0x0f,0x28,0x05,A,A,A,A]
 ; X64-SSE-NEXT:    # fixup A - offset: 3, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte
 ; X64-SSE-NEXT:    retq # encoding: [0xc3]
 ;
 ; X64-AVX-LABEL: fpext_fromconst:
 ; X64-AVX:       # %bb.0: # %entry
-; X64-AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [1,-2]
+; X64-AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [1.0E+0,-2.0E+0]
 ; X64-AVX-NEXT:    # encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
 ; X64-AVX-NEXT:    # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte
 ; X64-AVX-NEXT:    retq # encoding: [0xc3]
 ;
 ; X64-AVX512VL-LABEL: fpext_fromconst:
 ; X64-AVX512VL:       # %bb.0: # %entry
-; X64-AVX512VL-NEXT:    vmovaps {{.*}}(%rip), %xmm0 # EVEX TO VEX Compression xmm0 = [1,-2]
+; X64-AVX512VL-NEXT:    vmovaps {{.*}}(%rip), %xmm0 # EVEX TO VEX Compression xmm0 = [1.0E+0,-2.0E+0]
 ; X64-AVX512VL-NEXT:    # encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
 ; X64-AVX512VL-NEXT:    # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte
 ; X64-AVX512VL-NEXT:    retq # encoding: [0xc3]
diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll
index 14cce63ca96..af68937326c 100644
--- a/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/test/CodeGen/X86/vec_int_to_fp.ll
@@ -680,7 +680,7 @@ define <2 x double> @uitofp_4i32_to_2f64(<4 x i32> %a) {
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX2-NEXT:    vcvtdq2pd %xmm1, %ymm1
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [65536,65536,65536,65536]
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4]
 ; AVX2-NEXT:    vmulpd %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
@@ -954,7 +954,7 @@ define <4 x double> @uitofp_4i32_to_4f64(<4 x i32> %a) {
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrld $16, %xmm1
 ; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT:    movapd {{.*#+}} xmm2 = [65536,65536]
+; SSE2-NEXT:    movapd {{.*#+}} xmm2 = [6.5536E+4,6.5536E+4]
 ; SSE2-NEXT:    mulpd %xmm2, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
@@ -974,7 +974,7 @@ define <4 x double> @uitofp_4i32_to_4f64(<4 x i32> %a) {
 ; SSE41-NEXT:    movdqa %xmm0, %xmm1
 ; SSE41-NEXT:    psrld $16, %xmm1
 ; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT:    movapd {{.*#+}} xmm2 = [65536,65536]
+; SSE41-NEXT:    movapd {{.*#+}} xmm2 = [6.5536E+4,6.5536E+4]
 ; SSE41-NEXT:    mulpd %xmm2, %xmm1
 ; SSE41-NEXT:    pxor %xmm3, %xmm3
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
@@ -1004,7 +1004,7 @@ define <4 x double> @uitofp_4i32_to_4f64(<4 x i32> %a) {
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX2-NEXT:    vcvtdq2pd %xmm1, %ymm1
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [65536,65536,65536,65536]
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4]
 ; AVX2-NEXT:    vmulpd %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
@@ -3777,7 +3777,7 @@ define <4 x double> @uitofp_load_4i32_to_4f64(<4 x i32> *%a) {
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrld $16, %xmm1
 ; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT:    movapd {{.*#+}} xmm2 = [65536,65536]
+; SSE2-NEXT:    movapd {{.*#+}} xmm2 = [6.5536E+4,6.5536E+4]
 ; SSE2-NEXT:    mulpd %xmm2, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
@@ -3798,7 +3798,7 @@ define <4 x double> @uitofp_load_4i32_to_4f64(<4 x i32> *%a) {
 ; SSE41-NEXT:    movdqa %xmm0, %xmm1
 ; SSE41-NEXT:    psrld $16, %xmm1
 ; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT:    movapd {{.*#+}} xmm2 = [65536,65536]
+; SSE41-NEXT:    movapd {{.*#+}} xmm2 = [6.5536E+4,6.5536E+4]
 ; SSE41-NEXT:    mulpd %xmm2, %xmm1
 ; SSE41-NEXT:    pxor %xmm3, %xmm3
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
@@ -3830,7 +3830,7 @@ define <4 x double> @uitofp_load_4i32_to_4f64(<4 x i32> *%a) {
 ; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX2-NEXT:    vcvtdq2pd %xmm1, %ymm1
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [65536,65536,65536,65536]
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4]
 ; AVX2-NEXT:    vmulpd %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
diff --git a/test/CodeGen/X86/vec_ss_load_fold.ll b/test/CodeGen/X86/vec_ss_load_fold.ll
index 6b3f7acac90..1f5503067c6 100644
--- a/test/CodeGen/X86/vec_ss_load_fold.ll
+++ b/test/CodeGen/X86/vec_ss_load_fold.ll
@@ -247,22 +247,22 @@ define <4 x float> @test4(<4 x float> %A, float *%b, i32 %C) nounwind {
 define  <2 x double> @test5() nounwind uwtable readnone noinline {
 ; X32-LABEL: test5:
 ; X32:       ## %bb.0: ## %entry
-; X32-NEXT:    movaps {{.*#+}} xmm0 = [128,123.321]
+; X32-NEXT:    movaps {{.*#+}} xmm0 = [1.28E+2,1.23321E+2]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test5:
 ; X64:       ## %bb.0: ## %entry
-; X64-NEXT:    movaps {{.*#+}} xmm0 = [128,123.321]
+; X64-NEXT:    movaps {{.*#+}} xmm0 = [1.28E+2,1.23321E+2]
 ; X64-NEXT:    retq
 ;
 ; X32_AVX-LABEL: test5:
 ; X32_AVX:       ## %bb.0: ## %entry
-; X32_AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [128,123.321]
+; X32_AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [1.28E+2,1.23321E+2]
 ; X32_AVX-NEXT:    retl
 ;
 ; X64_AVX-LABEL: test5:
 ; X64_AVX:       ## %bb.0: ## %entry
-; X64_AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [128,123.321]
+; X64_AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [1.28E+2,1.23321E+2]
 ; X64_AVX-NEXT:    retq
 entry:
   %0 = tail call <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double> <double 4.569870e+02, double 1.233210e+02>, i32 128) nounwind readnone
diff --git a/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll b/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll
index 8abd26805e6..5f4489c5ed2 100644
--- a/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll
+++ b/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll
@@ -109,7 +109,7 @@ define <8 x float> @test_uitofp_v8i32_to_v8f32(<8 x i32> %arg) {
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; SSE2-NEXT:    psrld $16, %xmm2
 ; SSE2-NEXT:    cvtdq2ps %xmm2, %xmm2
-; SSE2-NEXT:    movaps {{.*#+}} xmm3 = [65536,65536,65536,65536]
+; SSE2-NEXT:    movaps {{.*#+}} xmm3 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4]
 ; SSE2-NEXT:    mulps %xmm3, %xmm2
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535]
 ; SSE2-NEXT:    pand %xmm4, %xmm0
@@ -129,7 +129,7 @@ define <8 x float> @test_uitofp_v8i32_to_v8f32(<8 x i32> %arg) {
 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
 ; SSE41-NEXT:    psrld $16, %xmm2
 ; SSE41-NEXT:    cvtdq2ps %xmm2, %xmm2
-; SSE41-NEXT:    movaps {{.*#+}} xmm3 = [65536,65536,65536,65536]
+; SSE41-NEXT:    movaps {{.*#+}} xmm3 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4]
 ; SSE41-NEXT:    mulps %xmm3, %xmm2
 ; SSE41-NEXT:    pxor %xmm4, %xmm4
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
diff --git a/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
index f13178ed5ce..44c19483154 100644
--- a/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
+++ b/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
@@ -26,13 +26,13 @@ entry:
 define <2 x double> @constrained_vector_fdiv_v2f64() {
 ; NO-FMA-LABEL: constrained_vector_fdiv_v2f64:
 ; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    movapd {{.*#+}} xmm0 = [1,2]
+; NO-FMA-NEXT:    movapd {{.*#+}} xmm0 = [1.0E+0,2.0E+0]
 ; NO-FMA-NEXT:    divpd {{.*}}(%rip), %xmm0
 ; NO-FMA-NEXT:    retq
 ;
 ; HAS-FMA-LABEL: constrained_vector_fdiv_v2f64:
 ; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} xmm0 = [1,2]
+; HAS-FMA-NEXT:    vmovapd {{.*#+}} xmm0 = [1.0E+0,2.0E+0]
 ; HAS-FMA-NEXT:    vdivpd {{.*}}(%rip), %xmm0, %xmm0
 ; HAS-FMA-NEXT:    retq
 entry:
@@ -82,7 +82,7 @@ entry:
 define <3 x double> @constrained_vector_fdiv_v3f64() {
 ; NO-FMA-LABEL: constrained_vector_fdiv_v3f64:
 ; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    movapd {{.*#+}} xmm0 = [1,2]
+; NO-FMA-NEXT:    movapd {{.*#+}} xmm0 = [1.0E+0,2.0E+0]
 ; NO-FMA-NEXT:    divpd {{.*}}(%rip), %xmm0
 ; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
 ; NO-FMA-NEXT:    divsd {{.*}}(%rip), %xmm1
@@ -96,7 +96,7 @@ define <3 x double> @constrained_vector_fdiv_v3f64() {
 ; HAS-FMA:       # %bb.0: # %entry
 ; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; HAS-FMA-NEXT:    vdivsd {{.*}}(%rip), %xmm0, %xmm0
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} xmm1 = [1,2]
+; HAS-FMA-NEXT:    vmovapd {{.*#+}} xmm1 = [1.0E+0,2.0E+0]
 ; HAS-FMA-NEXT:    vdivpd {{.*}}(%rip), %xmm1, %xmm1
 ; HAS-FMA-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; HAS-FMA-NEXT:    retq
@@ -112,16 +112,16 @@ entry:
 define <4 x double> @constrained_vector_fdiv_v4f64() {
 ; NO-FMA-LABEL: constrained_vector_fdiv_v4f64:
 ; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    movapd {{.*#+}} xmm2 = [10,10]
-; NO-FMA-NEXT:    movapd {{.*#+}} xmm0 = [1,2]
+; NO-FMA-NEXT:    movapd {{.*#+}} xmm2 = [1.0E+1,1.0E+1]
+; NO-FMA-NEXT:    movapd {{.*#+}} xmm0 = [1.0E+0,2.0E+0]
 ; NO-FMA-NEXT:    divpd %xmm2, %xmm0
-; NO-FMA-NEXT:    movapd {{.*#+}} xmm1 = [3,4]
+; NO-FMA-NEXT:    movapd {{.*#+}} xmm1 = [3.0E+0,4.0E+0]
 ; NO-FMA-NEXT:    divpd %xmm2, %xmm1
 ; NO-FMA-NEXT:    retq
 ;
 ; HAS-FMA-LABEL: constrained_vector_fdiv_v4f64:
 ; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} ymm0 = [1,2,3,4]
+; HAS-FMA-NEXT:    vmovapd {{.*#+}} ymm0 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
 ; HAS-FMA-NEXT:    vdivpd {{.*}}(%rip), %ymm0, %ymm0
 ; HAS-FMA-NEXT:    retq
 entry:
@@ -499,7 +499,7 @@ define <4 x double> @constrained_vector_fmul_v4f64() {
 ; NO-FMA-LABEL: constrained_vector_fmul_v4f64:
 ; NO-FMA:       # %bb.0: # %entry
 ; NO-FMA-NEXT:    movapd {{.*#+}} xmm1 = [1.7976931348623157E+308,1.7976931348623157E+308]
-; NO-FMA-NEXT:    movapd {{.*#+}} xmm0 = [2,3]
+; NO-FMA-NEXT:    movapd {{.*#+}} xmm0 = [2.0E+0,3.0E+0]
 ; NO-FMA-NEXT:    mulpd %xmm1, %xmm0
 ; NO-FMA-NEXT:    mulpd {{.*}}(%rip), %xmm1
 ; NO-FMA-NEXT:    retq
@@ -630,7 +630,7 @@ define <4 x double> @constrained_vector_fadd_v4f64() {
 ; NO-FMA-LABEL: constrained_vector_fadd_v4f64:
 ; NO-FMA:       # %bb.0: # %entry
 ; NO-FMA-NEXT:    movapd {{.*#+}} xmm1 = [1.7976931348623157E+308,1.7976931348623157E+308]
-; NO-FMA-NEXT:    movapd {{.*#+}} xmm0 = [1,0.10000000000000001]
+; NO-FMA-NEXT:    movapd {{.*#+}} xmm0 = [1.0E+0,1.0000000000000001E-1]
 ; NO-FMA-NEXT:    addpd %xmm1, %xmm0
 ; NO-FMA-NEXT:    addpd {{.*}}(%rip), %xmm1
 ; NO-FMA-NEXT:    retq
@@ -836,8 +836,8 @@ define <2 x double> @constrained_vector_fma_v2f64() {
 ;
 ; HAS-FMA-LABEL: constrained_vector_fma_v2f64:
 ; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} xmm1 = [1.5,0.5]
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} xmm0 = [3.5,2.5]
+; HAS-FMA-NEXT:    vmovapd {{.*#+}} xmm1 = [1.5E+0,5.0E-1]
+; HAS-FMA-NEXT:    vmovapd {{.*#+}} xmm0 = [3.5E+0,2.5E+0]
 ; HAS-FMA-NEXT:    vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + mem
 ; HAS-FMA-NEXT:    retq
 entry:
@@ -936,8 +936,8 @@ define <3 x double> @constrained_vector_fma_v3f64() {
 ; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
 ; HAS-FMA-NEXT:    vfmadd213sd {{.*#+}} xmm1 = (xmm0 * xmm1) + mem
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} xmm0 = [2.5,1.5]
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} xmm2 = [5.5,4.5]
+; HAS-FMA-NEXT:    vmovapd {{.*#+}} xmm0 = [2.5E+0,1.5E+0]
+; HAS-FMA-NEXT:    vmovapd {{.*#+}} xmm2 = [5.5E+0,4.5E+0]
 ; HAS-FMA-NEXT:    vfmadd213pd {{.*#+}} xmm2 = (xmm0 * xmm2) + mem
 ; HAS-FMA-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm0
 ; HAS-FMA-NEXT:    retq
@@ -987,8 +987,8 @@ define <4 x double> @constrained_vector_fma_v4f64() {
 ;
 ; HAS-FMA-LABEL: constrained_vector_fma_v4f64:
 ; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} ymm1 = [3.5,2.5,1.5,0.5]
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} ymm0 = [7.5,6.5,5.5,4.5]
+; HAS-FMA-NEXT:    vmovapd {{.*#+}} ymm1 = [3.5E+0,2.5E+0,1.5E+0,5.0E-1]
+; HAS-FMA-NEXT:    vmovapd {{.*#+}} ymm0 = [7.5E+0,6.5E+0,5.5E+0,4.5E+0]
 ; HAS-FMA-NEXT:    vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + mem
 ; HAS-FMA-NEXT:    retq
 entry:
@@ -1037,8 +1037,8 @@ define <4 x float> @constrained_vector_fma_v4f32() {
 ;
 ; HAS-FMA-LABEL: constrained_vector_fma_v4f32:
 ; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovaps {{.*#+}} xmm1 = [3.5,2.5,1.5,0.5]
-; HAS-FMA-NEXT:    vmovaps {{.*#+}} xmm0 = [7.5,6.5,5.5,4.5]
+; HAS-FMA-NEXT:    vmovaps {{.*#+}} xmm1 = [3.5E+0,2.5E+0,1.5E+0,5.0E-1]
+; HAS-FMA-NEXT:    vmovaps {{.*#+}} xmm0 = [7.5E+0,6.5E+0,5.5E+0,4.5E+0]
 ; HAS-FMA-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + mem
 ; HAS-FMA-NEXT:    retq
 entry:
@@ -1115,8 +1115,8 @@ define <8 x float> @constrained_vector_fma_v8f32() {
 ;
 ; HAS-FMA-LABEL: constrained_vector_fma_v8f32:
 ; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovaps {{.*#+}} ymm1 = [3.5,2.5,1.5,0.5,7.5,6.5,5.5,4.5]
-; HAS-FMA-NEXT:    vmovaps {{.*#+}} ymm0 = [7.5,6.5,5.5,4.5,11.5,10.5,9.5,8.5]
+; HAS-FMA-NEXT:    vmovaps {{.*#+}} ymm1 = [3.5E+0,2.5E+0,1.5E+0,5.0E-1,7.5E+0,6.5E+0,5.5E+0,4.5E+0]
+; HAS-FMA-NEXT:    vmovaps {{.*#+}} ymm0 = [7.5E+0,6.5E+0,5.5E+0,4.5E+0,1.15E+1,1.05E+1,9.5E+0,8.5E+0]
 ; HAS-FMA-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + mem
 ; HAS-FMA-NEXT:    retq
 entry:
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/test/CodeGen/X86/vector-shuffle-combining-avx.ll
index af3ef0894ac..ace577ce9a3 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx.ll
@@ -383,12 +383,12 @@ define <4 x float> @combine_vpermilvar_4f32_as_insertps(<4 x float> %a0) {
 define <2 x double> @constant_fold_vpermilvar_pd() {
 ; X32-LABEL: constant_fold_vpermilvar_pd:
 ; X32:       # %bb.0:
-; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [2,1]
+; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [2.0E+0,1.0E+0]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: constant_fold_vpermilvar_pd:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [2,1]
+; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [2.0E+0,1.0E+0]
 ; X64-NEXT:    retq
   %1 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> <double 1.0, double 2.0>, <2 x i64> <i64 2, i64 0>)
   ret <2 x double> %1
@@ -397,12 +397,12 @@ define <2 x double> @constant_fold_vpermilvar_pd() {
 define <4 x double> @constant_fold_vpermilvar_pd_256() {
 ; X32-LABEL: constant_fold_vpermilvar_pd_256:
 ; X32:       # %bb.0:
-; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [2,1,3,4]
+; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [2.0E+0,1.0E+0,3.0E+0,4.0E+0]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: constant_fold_vpermilvar_pd_256:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [2,1,3,4]
+; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [2.0E+0,1.0E+0,3.0E+0,4.0E+0]
 ; X64-NEXT:    retq
   %1 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> <double 1.0, double 2.0, double 3.0, double 4.0>, <4 x i64> <i64 2, i64 0, i64 0, i64 2>)
   ret <4 x double> %1
@@ -411,12 +411,12 @@ define <4 x double> @constant_fold_vpermilvar_pd_256() {
 define <4 x float> @constant_fold_vpermilvar_ps() {
 ; X32-LABEL: constant_fold_vpermilvar_ps:
 ; X32:       # %bb.0:
-; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [4,1,3,2]
+; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [4.0E+0,1.0E+0,3.0E+0,2.0E+0]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: constant_fold_vpermilvar_ps:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [4,1,3,2]
+; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [4.0E+0,1.0E+0,3.0E+0,2.0E+0]
 ; X64-NEXT:    retq
   %1 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, <4 x i32> <i32 3, i32 0, i32 2, i32 1>)
   ret <4 x float> %1
@@ -425,12 +425,12 @@ define <4 x float> @constant_fold_vpermilvar_ps() {
 define <8 x float> @constant_fold_vpermilvar_ps_256() {
 ; X32-LABEL: constant_fold_vpermilvar_ps_256:
 ; X32:       # %bb.0:
-; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [1,1,3,2,5,6,6,6]
+; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [1.0E+0,1.0E+0,3.0E+0,2.0E+0,5.0E+0,6.0E+0,6.0E+0,6.0E+0]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: constant_fold_vpermilvar_ps_256:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [1,1,3,2,5,6,6,6]
+; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [1.0E+0,1.0E+0,3.0E+0,2.0E+0,5.0E+0,6.0E+0,6.0E+0,6.0E+0]
 ; X64-NEXT:    retq
   %1 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, <8 x i32> <i32 4, i32 0, i32 2, i32 1, i32 0, i32 1, i32 1, i32 1>)
   ret <8 x float> %1
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
index ae2a5513bfd..95d53ace5d3 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -691,7 +691,7 @@ define <8 x i32> @constant_fold_permd() {
 define <8 x float> @constant_fold_permps() {
 ; CHECK-LABEL: constant_fold_permps:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [5,7,3,2,8,2,6,1]
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [5.0E+0,7.0E+0,3.0E+0,2.0E+0,8.0E+0,2.0E+0,6.0E+0,1.0E+0]
 ; CHECK-NEXT:    ret{{[l|q]}}
   %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, <8 x i32> <i32 4, i32 6, i32 2, i32 1, i32 7, i32 1, i32 5, i32 0>)
   ret <8 x float> %1
diff --git a/test/CodeGen/X86/vector-shuffle-combining-xop.ll b/test/CodeGen/X86/vector-shuffle-combining-xop.ll
index 52b5edd7abe..5fe0a2b460b 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-xop.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-xop.ll
@@ -384,12 +384,12 @@ define void @buildvector_v4f32_07z6(float %a, <4 x float> %b, <4 x float>* %ptr)
 define <2 x double> @constant_fold_vpermil2pd() {
 ; X32-LABEL: constant_fold_vpermil2pd:
 ; X32:       # %bb.0:
-; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [-2,2]
+; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [-2.0E+0,2.0E+0]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: constant_fold_vpermil2pd:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [-2,2]
+; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [-2.0E+0,2.0E+0]
 ; X64-NEXT:    retq
   %1 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> <double 1.0, double 2.0>, <2 x double> <double -2.0, double -1.0>, <2 x i64> <i64 4, i64 2>, i8 2)
   ret <2 x double> %1
@@ -398,12 +398,12 @@ define <2 x double> @constant_fold_vpermil2pd() {
 define <4 x double> @constant_fold_vpermil2pd_256() {
 ; X32-LABEL: constant_fold_vpermil2pd_256:
 ; X32:       # %bb.0:
-; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [-4,0,4,3]
+; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [-4.0E+0,0.0E+0,4.0E+0,3.0E+0]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: constant_fold_vpermil2pd_256:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [-4,0,4,3]
+; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [-4.0E+0,0.0E+0,4.0E+0,3.0E+0]
 ; X64-NEXT:    retq
   %1 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> <double 1.0, double 2.0, double 3.0, double 4.0>, <4 x double> <double -4.0, double -3.0, double -2.0, double -1.0>, <4 x i64> <i64 4, i64 8, i64 2, i64 0>, i8 2)
   ret <4 x double> %1
@@ -412,12 +412,12 @@ define <4 x double> @constant_fold_vpermil2pd_256() {
 define <4 x float> @constant_fold_vpermil2ps() {
 ; X32-LABEL: constant_fold_vpermil2ps:
 ; X32:       # %bb.0:
-; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [-4,1,3,0]
+; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [-4.0E+0,1.0E+0,3.0E+0,0.0E+0]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: constant_fold_vpermil2ps:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [-4,1,3,0]
+; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [-4.0E+0,1.0E+0,3.0E+0,0.0E+0]
 ; X64-NEXT:    retq
   %1 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, <4 x float> <float -4.0, float -3.0, float -2.0, float -1.0>, <4 x i32> <i32 4, i32 0, i32 2, i32 8>, i8 2)
   ret <4 x float> %1
@@ -426,12 +426,12 @@ define <4 x float> @constant_fold_vpermil2ps() {
 define <8 x float> @constant_fold_vpermil2ps_256() {
 ; X32-LABEL: constant_fold_vpermil2ps_256:
 ; X32:       # %bb.0:
-; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [-8,1,3,0,5,0,5,7]
+; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [-8.0E+0,1.0E+0,3.0E+0,0.0E+0,5.0E+0,0.0E+0,5.0E+0,7.0E+0]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: constant_fold_vpermil2ps_256:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [-8,1,3,0,5,0,5,7]
+; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [-8.0E+0,1.0E+0,3.0E+0,0.0E+0,5.0E+0,0.0E+0,5.0E+0,7.0E+0]
 ; X64-NEXT:    retq
   %1 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, <8 x float> <float -8.0, float -7.0, float -6.0, float -5.0, float -4.0, float -3.0, float -2.0, float -1.0>, <8 x i32> <i32 4, i32 0, i32 2, i32 8, i32 0, i32 8, i32 0, i32 2>, i8 2)
   ret <8 x float> %1
diff --git a/test/CodeGen/X86/vector-shuffle-combining.ll b/test/CodeGen/X86/vector-shuffle-combining.ll
index 01e36681400..9c7163f39da 100644
--- a/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -2642,14 +2642,14 @@ define void @combine_scalar_load_with_blend_with_zero(double* %a0, <4 x float>*
 define <4 x float> @combine_constant_insertion_v4f32(float %f) {
 ; SSE2-LABEL: combine_constant_insertion_v4f32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movaps {{.*#+}} xmm1 = <u,4,5,3>
+; SSE2-NEXT:    movaps {{.*#+}} xmm1 = <u,4.0E+0,5.0E+0,3.0E+0>
 ; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: combine_constant_insertion_v4f32:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movaps {{.*#+}} xmm1 = <u,4,5,3>
+; SSSE3-NEXT:    movaps {{.*#+}} xmm1 = <u,4.0E+0,5.0E+0,3.0E+0>
 ; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
@@ -2809,14 +2809,14 @@ define <4 x float> @PR30264(<4 x float> %x) {
 ;
 ; SSE41-LABEL: PR30264:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movaps {{.*#+}} xmm1 = <u,u,4,1>
+; SSE41-NEXT:    movaps {{.*#+}} xmm1 = <u,u,4.0E+0,1.0E+0>
 ; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm0[0],zero,xmm1[2,3]
 ; SSE41-NEXT:    movaps %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: PR30264:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} xmm1 = <u,u,4,1>
+; AVX-NEXT:    vmovaps {{.*#+}} xmm1 = <u,u,4.0E+0,1.0E+0>
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2,3]
 ; AVX-NEXT:    retq
   %shuf1 = shufflevector <4 x float> %x, <4 x float> <float undef, float 0.0, float undef, float undef>, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
diff --git a/test/CodeGen/X86/vselect-avx.ll b/test/CodeGen/X86/vselect-avx.ll
index e27493e9758..145da66558f 100644
--- a/test/CodeGen/X86/vselect-avx.ll
+++ b/test/CodeGen/X86/vselect-avx.ll
@@ -46,7 +46,7 @@ define void @test2(double** %call1559, i64 %indvars.iv4198, <4 x i1> %tmp1895) {
 ; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    movq (%rdi,%rsi,8), %rax
-; AVX1-NEXT:    vmovapd {{.*#+}} ymm1 = [0.5,0.5,0.5,0.5]
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm1 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1]
 ; AVX1-NEXT:    vblendvpd %ymm0, {{.*}}(%rip), %ymm1, %ymm0
 ; AVX1-NEXT:    vmovupd %ymm0, (%rax)
 ; AVX1-NEXT:    vzeroupper
@@ -57,8 +57,8 @@ define void @test2(double** %call1559, i64 %indvars.iv4198, <4 x i1> %tmp1895) {
 ; AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
 ; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
 ; AVX2-NEXT:    movq (%rdi,%rsi,8), %rax
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [-0.5,-0.5,-0.5,-0.5]
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [0.5,0.5,0.5,0.5]
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1]
 ; AVX2-NEXT:    vblendvpd %ymm0, %ymm1, %ymm2, %ymm0
 ; AVX2-NEXT:    vmovupd %ymm0, (%rax)
 ; AVX2-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/widen_arith-6.ll b/test/CodeGen/X86/widen_arith-6.ll
index 73b8f4ea276..c039096604e 100644
--- a/test/CodeGen/X86/widen_arith-6.ll
+++ b/test/CodeGen/X86/widen_arith-6.ll
@@ -14,7 +14,7 @@ define void @update(<3 x float>* %dst, <3 x float>* %src, i32 %n) nounwind {
 ; CHECK-NEXT:    movl $1073741824, {{[0-9]+}}(%esp) # imm = 0x40000000
 ; CHECK-NEXT:    movl $1065353216, {{[0-9]+}}(%esp) # imm = 0x3F800000
 ; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movaps {{.*#+}} xmm0 = <1976.04004,1976.04004,1976.04004,u>
+; CHECK-NEXT:    movaps {{.*#+}} xmm0 = <1.97604004E+3,1.97604004E+3,1.97604004E+3,u>
 ; CHECK-NEXT:    jmp .LBB0_1
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB0_2: # %forbody
diff --git a/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll b/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll
index e9f291d239b..70a72e7ee1a 100644
--- a/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll
+++ b/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll
@@ -52,7 +52,7 @@ define void @foo2(<4 x float>* noalias %result) nounwind {
 ; CHECK-NEXT: .long 1088421888              ## float 7
 ; CHECK-LABEL: foo2:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [4,5,6,7]
+; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0]
 ; CHECK-NEXT:    movaps %xmm0, (%rdi)
 ; CHECK-NEXT:    retq
   %val = uitofp <4 x i32> <i32 4, i32 5, i32 6, i32 7> to <4 x float>
@@ -89,7 +89,7 @@ define void @foo4(<4 x float>* noalias %result) nounwind {
 ; CHECK-NEXT: .long 1132396544              ## float 255
 ; CHECK-LABEL: foo4:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [1,127,128,255]
+; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [1.0E+0,1.27E+2,1.28E+2,2.55E+2]
 ; CHECK-NEXT:    movaps %xmm0, (%rdi)
 ; CHECK-NEXT:    retq
   %val = uitofp <4 x i8> <i8 1, i8 127, i8 -128, i8 -1> to <4 x float>
-- 
GitLab


From 3bb7e2936fb205c645b85e8aa6144b26951196d4 Mon Sep 17 00:00:00 2001
From: Sjoerd Meijer <sjoerd.meijer@arm.com>
Date: Mon, 29 Oct 2018 08:45:56 +0000
Subject: [PATCH 0694/1116] [ARM][NFC] Fix test inlineasm-X-allocation.ll

Differential Revision: https://reviews.llvm.org/D53748


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345491 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/ARM/inlineasm-X-allocation.ll | 29 +++++++++++-----------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/test/CodeGen/ARM/inlineasm-X-allocation.ll b/test/CodeGen/ARM/inlineasm-X-allocation.ll
index e88d668f5cc..b2cb932f905 100644
--- a/test/CodeGen/ARM/inlineasm-X-allocation.ll
+++ b/test/CodeGen/ARM/inlineasm-X-allocation.ll
@@ -1,21 +1,20 @@
-; RUN: llc -mtriple=armv7-none-eabi -mattr=-neon,-vfpv2 %s -o - | FileCheck %s  -check-prefix=novfp
-; RUN: llc -mtriple=armv7-none-eabi -mattr=+neon %s -float-abi=hard -o - | FileCheck %s -check-prefix=vfp
+; RUN: llc -mtriple=armv7-none-eabi -mattr=-neon,-vfp2 %s -o - | FileCheck %s  -check-prefixes=COMMON,NOVFP
+; RUN: llc -mtriple=armv7-none-eabi -mattr=+neon %s -float-abi=hard -o - | FileCheck %s -check-prefixes=COMMON,VFP
 
-; vfp-LABEL: f1
-; vfp-CHECK: vadd.f32 s0, s0, s0
+; The intent here is to test "X", which says that any operand whatsoever is allowed.
+; Using this mechanism, we want to test toggling allocating GPR or SPR registers
+; depending on whether the float registers are available. Thus, the mnemonic is
+; totally irrelevant here, which is why we use FOO and also comment it out using "@"
+; to avoid assembler errors.
 
-; In the novfp case, the compiler is forced to assign a core register.
-; Although this register class can't be used with the vadd.f32 instruction,
-; the compiler behaved as expected since it is allowed to emit anything.
+; Note that this kind of IR can be generated by a function such as:
+;  void f1(float f) {asm volatile ("@FOO $0, $0" : : "X" (f));}
 
-; novfp-LABEL: f1
-; novfp-CHECK: vadd.f32 r0, r0, r0
-
-; This can be generated by a function such as:
-;  void f1(float f) {asm volatile ("add.f32 $0, $0, $0" : : "X" (f));}
-
-define arm_aapcs_vfpcc void @f1(float %f) {
+define arm_aapcs_vfpcc void @func(float %f) {
+; COMMON-LABEL: func
+; NOVFP:        FOO r0, r0
+; VFP:          FOO s0, s0
 entry:
-  call void asm sideeffect "vadd.f32 $0, $0, $0", "X" (float %f) nounwind
+  call void asm sideeffect "@FOO $0, $0", "X" (float %f) nounwind
   ret void
 }
-- 
GitLab


From 198a3abf530cdc60ed29d9f3da0bda460bfe5bec Mon Sep 17 00:00:00 2001
From: James Henderson <jh7370@my.bristol.ac.uk>
Date: Mon, 29 Oct 2018 10:05:39 +0000
Subject: [PATCH 0695/1116] [llvm-objdump] Add '--full-contents' as alias for
 '-s'

This fixes PR39404.

Reviewed By: jhenderson

Patch by Xing Guo

Differential Revision: https://reviews.llvm.org/D53576


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345495 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/tools/llvm-objdump/full-contents.test | 47 ++++++++++++++++++++++
 tools/llvm-objdump/llvm-objdump.cpp        |  6 ++-
 2 files changed, 52 insertions(+), 1 deletion(-)
 create mode 100644 test/tools/llvm-objdump/full-contents.test

diff --git a/test/tools/llvm-objdump/full-contents.test b/test/tools/llvm-objdump/full-contents.test
new file mode 100644
index 00000000000..de0d584df32
--- /dev/null
+++ b/test/tools/llvm-objdump/full-contents.test
@@ -0,0 +1,47 @@
+# RUN: yaml2obj %s > %t
+# RUN: llvm-objdump --full-contents %t > %t.out1
+# RUN: llvm-objdump -s %t > %t.out2
+# RUN: cmp %t.out1 %t.out2
+# RUN: FileCheck %s --input-file=%t.out1
+
+# CHECK:      .bss
+# CHECK-NEXT: <skipping contents of bss section at [0000, 0040)>
+# CHECK:      .text
+# CHECK-NEXT:  0000 01234567                             .#Eg
+# CHECK:      .user-defined
+# CHECK-NEXT:  0000 76543210                             vT2.
+# CHECK:      .empty-section
+# CHECK-NEXT: <skipping contents of bss section at [0000, 0020)>
+# CHECK:      .symtab
+# CHECK:      .strtab
+# CHECK:      .shstrtab
+
+!ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_EXEC
+  Machine:         EM_X86_64
+Sections:
+  - Name:            .bss
+    Type:            SHT_NOBITS
+    Flags:           [ SHF_ALLOC ]
+    AddressAlign:    0x0000000000000010
+    Size:            64
+  - Name:            .text
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    AddressAlign:    0x0000000000000010
+    Content:         "01234567"
+    Size:            4
+  - Name:            .user-defined
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC ]
+    AddressAlign:    0x0000000000000010
+    Content:         "76543210"
+    Size:            4
+  - Name:            .empty-section
+    Type:            SHT_NOBITS
+    Flags:           [ SHF_ALLOC ]
+    AddressAlign:    0x0000000000000010
+    Size:            32
diff --git a/tools/llvm-objdump/llvm-objdump.cpp b/tools/llvm-objdump/llvm-objdump.cpp
index 7107966b18d..a4fddf3f0c7 100644
--- a/tools/llvm-objdump/llvm-objdump.cpp
+++ b/tools/llvm-objdump/llvm-objdump.cpp
@@ -116,7 +116,11 @@ DynamicRelocationsd("R", cl::desc("Alias for --dynamic-reloc"),
              cl::aliasopt(DynamicRelocations));
 
 cl::opt<bool>
-llvm::SectionContents("s", cl::desc("Display the content of each section"));
+    llvm::SectionContents("full-contents",
+                          cl::desc("Display the content of each section"));
+static cl::alias SectionContentsShort("s",
+                                      cl::desc("Alias for --full-contents"),
+                                      cl::aliasopt(SectionContents));
 
 cl::opt<bool>
 llvm::SymbolTable("t", cl::desc("Display the symbol table"));
-- 
GitLab


From 11ead03db9dd6f8378275bbd014bbc171eef8544 Mon Sep 17 00:00:00 2001
From: Greg Bedwell <greg_bedwell@sn.scee.net>
Date: Mon, 29 Oct 2018 13:24:20 +0000
Subject: [PATCH 0696/1116] [llvm-mca][UpdateTestChecks] Don't try to align
 blocks that have already been subject to alignment in
 update_mca_test_checks.py

This fixes PR39466.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345499 91177308-0d34-0410-b5e6-96231b3b80d8
---
 utils/update_mca_test_checks.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/utils/update_mca_test_checks.py b/utils/update_mca_test_checks.py
index a83186cd336..54d1cb443c3 100755
--- a/utils/update_mca_test_checks.py
+++ b/utils/update_mca_test_checks.py
@@ -267,10 +267,14 @@ def _align_matching_blocks(all_blocks, farthest_indexes):
         continue
 
       changed = False
-      while(index < farthest_indexes[block]):
-        blocks.insert(index, '')
-        index += 1
-        changed = True
+      # If the block has not already been subject to alignment (i.e. if the
+      # previous block is not empty) then insert empty blocks until the index
+      # matches the farthest index identified for that block.
+      if (index > 0) and blocks[index - 1]:
+        while(index < farthest_indexes[block]):
+          blocks.insert(index, '')
+          index += 1
+          changed = True
 
       if changed:
         # Bail out.  We'll need to re-do the farthest block analysis now that
-- 
GitLab


From ee5a5147a1466539ad34f04e8d0c77479b7e7d73 Mon Sep 17 00:00:00 2001
From: Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>
Date: Mon, 29 Oct 2018 13:29:22 +0000
Subject: [PATCH 0697/1116] [llvm-mca] Lower to mca::Instructon before the
 pipeline is run.

Before this change, the lowering of instructions from llvm::MCInst to
mca::Instruction was done as part of the first stage of the pipeline (i.e. the
FetchStage).  In particular, FetchStage was responsible for picking the next
instruction from the source sequence, and lower it to an mca::Instruction with
the help of an object of class InstrBuilder.

The dependency on InstrBuilder was problematic for a number of reasons. Class
InstrBuilder only knows how to lower from llvm::MCInst to mca::Instruction.
That means, it is hard to support a different scenario where instructions
in input are not instances of class llvm::MCInst. Even if we managed to
specialize InstrBuilder, and generalize most of its internal logic, the
dependency on InstrBuilder in FetchStage would have caused more troubles (other
than complicating the pipeline logic).

With this patch, the lowering step is done before the pipeline is run. The
pipeline is no longer responsible for lowering from MCInst to mca::Instruction.
As a consequence of this, the FetchStage no longer needs to interact with an
InstrBuilder. The mca::SourceMgr class now simply wraps a reference to a
sequence of mca::Instruction objects.
This simplifies the logic of FetchStage, and increases the usability of it.  As
a result, on a debug build, we see a 7-9% speedup; on a release build, the
speedup is around 3-4%.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345500 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-mca/include/Instruction.h       | 23 +++++-----
 tools/llvm-mca/include/SourceMgr.h         | 20 ++++-----
 tools/llvm-mca/include/Stages/FetchStage.h |  7 +---
 tools/llvm-mca/lib/Context.cpp             |  2 +-
 tools/llvm-mca/lib/Instruction.cpp         |  2 +-
 tools/llvm-mca/lib/Stages/FetchStage.cpp   | 19 ++++-----
 tools/llvm-mca/llvm-mca.cpp                | 49 +++++++++++++---------
 7 files changed, 63 insertions(+), 59 deletions(-)

diff --git a/tools/llvm-mca/include/Instruction.h b/tools/llvm-mca/include/Instruction.h
index 9d1c91ad441..bbb40c42576 100644
--- a/tools/llvm-mca/include/Instruction.h
+++ b/tools/llvm-mca/include/Instruction.h
@@ -88,7 +88,7 @@ class ReadState;
 /// register write. It also tracks how many cycles are left before the write
 /// back stage.
 class WriteState {
-  const WriteDescriptor &WD;
+  const WriteDescriptor *WD;
   // On instruction issue, this field is set equal to the write latency.
   // Before instruction issue, this field defaults to -512, a special
   // value that represents an "unknown" number of cycles.
@@ -133,14 +133,17 @@ class WriteState {
 public:
   WriteState(const WriteDescriptor &Desc, unsigned RegID,
              bool clearsSuperRegs = false, bool writesZero = false)
-      : WD(Desc), CyclesLeft(UNKNOWN_CYCLES), RegisterID(RegID),
+      : WD(&Desc), CyclesLeft(UNKNOWN_CYCLES), RegisterID(RegID),
         ClearsSuperRegs(clearsSuperRegs), WritesZero(writesZero),
         IsEliminated(false), DependentWrite(nullptr), NumWriteUsers(0U) {}
 
+  WriteState(const WriteState &Other) = default;
+  WriteState &operator=(const WriteState &Other) = default;
+
   int getCyclesLeft() const { return CyclesLeft; }
-  unsigned getWriteResourceID() const { return WD.SClassOrWriteResourceID; }
+  unsigned getWriteResourceID() const { return WD->SClassOrWriteResourceID; }
   unsigned getRegisterID() const { return RegisterID; }
-  unsigned getLatency() const { return WD.Latency; }
+  unsigned getLatency() const { return WD->Latency; }
 
   void addUser(ReadState *Use, int ReadAdvance);
 
@@ -178,7 +181,7 @@ public:
 /// A read may be dependent on more than one write. This occurs when some
 /// writes only partially update the register associated to this read.
 class ReadState {
-  const ReadDescriptor &RD;
+  const ReadDescriptor *RD;
   // Physical register identified associated to this read.
   unsigned RegisterID;
   // Number of writes that contribute to the definition of RegisterID.
@@ -202,16 +205,16 @@ class ReadState {
 
 public:
   ReadState(const ReadDescriptor &Desc, unsigned RegID)
-      : RD(Desc), RegisterID(RegID), DependentWrites(0),
+      : RD(&Desc), RegisterID(RegID), DependentWrites(0),
         CyclesLeft(UNKNOWN_CYCLES), TotalCycles(0), IsReady(true),
         IndependentFromDef(false) {}
 
-  const ReadDescriptor &getDescriptor() const { return RD; }
-  unsigned getSchedClass() const { return RD.SchedClassID; }
+  const ReadDescriptor &getDescriptor() const { return *RD; }
+  unsigned getSchedClass() const { return RD->SchedClassID; }
   unsigned getRegisterID() const { return RegisterID; }
 
   bool isReady() const { return IsReady; }
-  bool isImplicitRead() const { return RD.isImplicitRead(); }
+  bool isImplicitRead() const { return RD->isImplicitRead(); }
 
   bool isIndependentFromDef() const { return IndependentFromDef; }
   void setIndependentFromDef() { IndependentFromDef = true; }
@@ -387,8 +390,6 @@ public:
   Instruction(const InstrDesc &D)
       : InstructionBase(D), Stage(IS_INVALID), CyclesLeft(UNKNOWN_CYCLES),
         RCUTokenID(0) {}
-  Instruction(const Instruction &Other) = delete;
-  Instruction &operator=(const Instruction &Other) = delete;
 
   unsigned getRCUTokenID() const { return RCUTokenID; }
   int getCyclesLeft() const { return CyclesLeft; }
diff --git a/tools/llvm-mca/include/SourceMgr.h b/tools/llvm-mca/include/SourceMgr.h
index 12713588246..54b1a2c31ce 100644
--- a/tools/llvm-mca/include/SourceMgr.h
+++ b/tools/llvm-mca/include/SourceMgr.h
@@ -17,35 +17,35 @@
 #define LLVM_TOOLS_LLVM_MCA_SOURCEMGR_H
 
 #include "llvm/ADT/ArrayRef.h"
-#include "llvm/MC/MCInst.h"
-#include <vector>
 
 namespace mca {
 
-typedef std::pair<unsigned, const llvm::MCInst &> SourceRef;
+class Instruction;
+
+typedef std::pair<unsigned, const Instruction &> SourceRef;
 
 class SourceMgr {
-  llvm::ArrayRef<llvm::MCInst> Sequence;
+  using UniqueInst = std::unique_ptr<Instruction>;
+  llvm::ArrayRef<UniqueInst> Sequence;
   unsigned Current;
   const unsigned Iterations;
   static const unsigned DefaultIterations = 100;
 
 public:
-  SourceMgr(llvm::ArrayRef<llvm::MCInst> MCInstSequence, unsigned NumIterations)
-      : Sequence(MCInstSequence), Current(0),
-        Iterations(NumIterations ? NumIterations : DefaultIterations) {}
+  SourceMgr(llvm::ArrayRef<UniqueInst> S, unsigned Iter)
+      : Sequence(S), Current(0), Iterations(Iter ? Iter : DefaultIterations) {}
 
   unsigned getNumIterations() const { return Iterations; }
   unsigned size() const { return Sequence.size(); }
   bool hasNext() const { return Current < (Iterations * Sequence.size()); }
   void updateNext() { ++Current; }
 
-  const SourceRef peekNext() const {
+  SourceRef peekNext() const {
     assert(hasNext() && "Already at end of sequence!");
-    return SourceRef(Current, Sequence[Current % Sequence.size()]);
+    return SourceRef(Current, *Sequence[Current % Sequence.size()]);
   }
 
-  using const_iterator = llvm::ArrayRef<llvm::MCInst>::const_iterator;
+  using const_iterator = llvm::ArrayRef<UniqueInst>::const_iterator;
   const_iterator begin() const { return Sequence.begin(); }
   const_iterator end() const { return Sequence.end(); }
 };
diff --git a/tools/llvm-mca/include/Stages/FetchStage.h b/tools/llvm-mca/include/Stages/FetchStage.h
index 45e30e17b4d..a7aba2276d9 100644
--- a/tools/llvm-mca/include/Stages/FetchStage.h
+++ b/tools/llvm-mca/include/Stages/FetchStage.h
@@ -16,7 +16,6 @@
 #ifndef LLVM_TOOLS_LLVM_MCA_FETCH_STAGE_H
 #define LLVM_TOOLS_LLVM_MCA_FETCH_STAGE_H
 
-#include "InstrBuilder.h"
 #include "SourceMgr.h"
 #include "Stages/Stage.h"
 #include <map>
@@ -27,18 +26,16 @@ class FetchStage final : public Stage {
   InstRef CurrentInstruction;
   using InstMap = std::map<unsigned, std::unique_ptr<Instruction>>;
   InstMap Instructions;
-  InstrBuilder &IB;
   SourceMgr &SM;
 
   // Updates the program counter, and sets 'CurrentInstruction'.
-  llvm::Error getNextInstruction();
+  void getNextInstruction();
 
   FetchStage(const FetchStage &Other) = delete;
   FetchStage &operator=(const FetchStage &Other) = delete;
 
 public:
-  FetchStage(InstrBuilder &IB, SourceMgr &SM)
-      : CurrentInstruction(), IB(IB), SM(SM) {}
+  FetchStage(SourceMgr &SM) : CurrentInstruction(), SM(SM) {}
 
   bool isAvailable(const InstRef &IR) const override;
   bool hasWorkToComplete() const override;
diff --git a/tools/llvm-mca/lib/Context.cpp b/tools/llvm-mca/lib/Context.cpp
index c84ea73c4d2..4e30fc9de31 100644
--- a/tools/llvm-mca/lib/Context.cpp
+++ b/tools/llvm-mca/lib/Context.cpp
@@ -41,7 +41,7 @@ Context::createDefaultPipeline(const PipelineOptions &Opts, InstrBuilder &IB,
   auto HWS = llvm::make_unique<Scheduler>(SM, LSU.get());
 
   // Create the pipeline stages.
-  auto Fetch = llvm::make_unique<FetchStage>(IB, SrcMgr);
+  auto Fetch = llvm::make_unique<FetchStage>(SrcMgr);
   auto Dispatch = llvm::make_unique<DispatchStage>(STI, MRI, Opts.DispatchWidth,
                                                    *RCU, *PRF);
   auto Execute = llvm::make_unique<ExecuteStage>(*HWS);
diff --git a/tools/llvm-mca/lib/Instruction.cpp b/tools/llvm-mca/lib/Instruction.cpp
index 12b6e185ced..42f5cd38ee9 100644
--- a/tools/llvm-mca/lib/Instruction.cpp
+++ b/tools/llvm-mca/lib/Instruction.cpp
@@ -93,7 +93,7 @@ void ReadState::cycleEvent() {
 
 #ifndef NDEBUG
 void WriteState::dump() const {
-  dbgs() << "{ OpIdx=" << WD.OpIndex << ", Lat=" << getLatency() << ", RegID "
+  dbgs() << "{ OpIdx=" << WD->OpIndex << ", Lat=" << getLatency() << ", RegID "
          << getRegisterID() << ", Cycles Left=" << getCyclesLeft() << " }";
 }
 
diff --git a/tools/llvm-mca/lib/Stages/FetchStage.cpp b/tools/llvm-mca/lib/Stages/FetchStage.cpp
index 515dc15c5b3..85d06d2d183 100644
--- a/tools/llvm-mca/lib/Stages/FetchStage.cpp
+++ b/tools/llvm-mca/lib/Stages/FetchStage.cpp
@@ -14,6 +14,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "Stages/FetchStage.h"
+#include "Instruction.h"
 
 namespace mca {
 
@@ -25,20 +26,15 @@ bool FetchStage::isAvailable(const InstRef & /* unused */) const {
   return false;
 }
 
-llvm::Error FetchStage::getNextInstruction() {
+void FetchStage::getNextInstruction() {
   assert(!CurrentInstruction && "There is already an instruction to process!");
   if (!SM.hasNext())
-    return llvm::ErrorSuccess();
-  const SourceRef SR = SM.peekNext();
-  llvm::Expected<std::unique_ptr<Instruction>> InstOrErr =
-      IB.createInstruction(SR.second);
-  if (!InstOrErr)
-    return InstOrErr.takeError();
-  std::unique_ptr<Instruction> Inst = std::move(InstOrErr.get());
+    return;
+  SourceRef SR = SM.peekNext();
+  std::unique_ptr<Instruction> Inst = llvm::make_unique<Instruction>(SR.second);
   CurrentInstruction = InstRef(SR.first, Inst.get());
   Instructions[SR.first] = std::move(Inst);
   SM.updateNext();
-  return llvm::ErrorSuccess();
 }
 
 llvm::Error FetchStage::execute(InstRef & /*unused */) {
@@ -48,12 +44,13 @@ llvm::Error FetchStage::execute(InstRef & /*unused */) {
 
   // Move the program counter.
   CurrentInstruction.invalidate();
-  return getNextInstruction();
+  getNextInstruction();
+  return llvm::ErrorSuccess();
 }
 
 llvm::Error FetchStage::cycleStart() {
   if (!CurrentInstruction)
-    return getNextInstruction();
+    getNextInstruction();
   return llvm::ErrorSuccess();
 }
 
diff --git a/tools/llvm-mca/llvm-mca.cpp b/tools/llvm-mca/llvm-mca.cpp
index b89e4bd9551..8f4e0717bd2 100644
--- a/tools/llvm-mca/llvm-mca.cpp
+++ b/tools/llvm-mca/llvm-mca.cpp
@@ -328,26 +328,12 @@ static void processViewOptions() {
 }
 
 // Returns true on success.
-static bool runPipeline(mca::Pipeline &P, MCInstPrinter &MCIP,
-                        const MCSubtargetInfo &STI) {
+static bool runPipeline(mca::Pipeline &P) {
   // Handle pipeline errors here.
   if (auto Err = P.run()) {
-    if (auto NewE = handleErrors(
-            std::move(Err),
-            [&MCIP, &STI](const mca::InstructionError<MCInst> &IE) {
-              std::string InstructionStr;
-              raw_string_ostream SS(InstructionStr);
-              WithColor::error() << IE.Message << '\n';
-              MCIP.printInst(&IE.Inst, SS, "", STI);
-              SS.flush();
-              WithColor::note() << "instruction: " << InstructionStr << '\n';
-            })) {
-      // Default case.
-      WithColor::error() << toString(std::move(NewE));
-    }
+    WithColor::error() << toString(std::move(Err));
     return false;
   }
-
   return true;
 }
 
@@ -513,14 +499,37 @@ int main(int argc, char **argv) {
       TOF->os() << "\n\n";
     }
 
+    // Lower the MCInst sequence into an mca::Instruction sequence.
     ArrayRef<MCInst> Insts = Region->getInstructions();
-    mca::SourceMgr S(Region->getInstructions(),
+    std::vector<std::unique_ptr<mca::Instruction>> LoweredSequence;
+    for (const MCInst &MCI : Insts) {
+      llvm::Expected<std::unique_ptr<mca::Instruction>> Inst = IB.createInstruction(MCI);
+      if (!Inst) {
+        if (auto NewE = handleErrors(Inst.takeError(),
+            [&IP, &STI](const mca::InstructionError<MCInst> &IE) {
+              std::string InstructionStr;
+              raw_string_ostream SS(InstructionStr);
+              WithColor::error() << IE.Message << '\n';
+              IP->printInst(&IE.Inst, SS, "", *STI);
+              SS.flush();
+              WithColor::note() << "instruction: " << InstructionStr << '\n';
+            })) {
+          // Default case.
+          WithColor::error() << toString(std::move(NewE));
+        }
+        return 1;
+      }
+
+      LoweredSequence.emplace_back(std::move(Inst.get()));
+    }
+
+    mca::SourceMgr S(LoweredSequence,
                      PrintInstructionTables ? 1 : Iterations);
 
     if (PrintInstructionTables) {
       //  Create a pipeline, stages, and a printer.
       auto P = llvm::make_unique<mca::Pipeline>();
-      P->appendStage(llvm::make_unique<mca::FetchStage>(IB, S));
+      P->appendStage(llvm::make_unique<mca::FetchStage>(S));
       P->appendStage(llvm::make_unique<mca::InstructionTables>(SM));
       mca::PipelinePrinter Printer(*P);
 
@@ -532,7 +541,7 @@ int main(int argc, char **argv) {
       Printer.addView(
           llvm::make_unique<mca::ResourcePressureView>(*STI, *IP, Insts));
 
-      if (!runPipeline(*P, *IP, *STI))
+      if (!runPipeline(*P))
         return 1;
 
       Printer.printReport(TOF->os());
@@ -574,7 +583,7 @@ int main(int argc, char **argv) {
           TimelineMaxCycles));
     }
 
-    if (!runPipeline(*P, *IP, *STI))
+    if (!runPipeline(*P))
       return 1;
 
     Printer.printReport(TOF->os());
-- 
GitLab


From 578a996f812873f4dc44bcab628e730ddf51c766 Mon Sep 17 00:00:00 2001
From: Francis Visoiu Mistrih <francisvm@yahoo.com>
Date: Mon, 29 Oct 2018 13:41:46 +0000
Subject: [PATCH 0698/1116] [X86] Remove outdated test

This test breaks the X86 MachineVerifier. It looks like the MIR part is
completely useless.

The original author suggests that it can be removed.

Differential Revision: https://reviews.llvm.org/D53767

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345501 91177308-0d34-0410-b5e6-96231b3b80d8
---
 ...instruction_and_target_split_perf_nops.mir | 288 ------------------
 1 file changed, 288 deletions(-)
 delete mode 100644 test/CodeGen/X86/branch_instruction_and_target_split_perf_nops.mir

diff --git a/test/CodeGen/X86/branch_instruction_and_target_split_perf_nops.mir b/test/CodeGen/X86/branch_instruction_and_target_split_perf_nops.mir
deleted file mode 100644
index bbefc4f920a..00000000000
--- a/test/CodeGen/X86/branch_instruction_and_target_split_perf_nops.mir
+++ /dev/null
@@ -1,288 +0,0 @@
-# RUN: llc -mcpu=haswell -filetype=obj -start-before stack-protector -O2 %s -o - | llvm-objdump -d - | FileCheck %s
-
-# Test 1:
-#
-# Source C code:
-# volatile int y;
-# volatile int x;
-# 
-# int switchCase(int z, int w) {
-# 	int result = 0;
-# 	while (x > 0 && y < 0) {
-# 		switch(z) {
-# 			case 0:
-# 			result+=result*5;break;
-# 			case 1:
-# 			result--; break;
-# 			case 2:
-# 			result *= result; break;
-# 			case 3:
-# 			result <<= 7; break;
-# 			case 4:
-# 			result >>= 7; break;
-# 			case 5:
-# 			result = result * 16 | ~result; break;
-# 		}
-# 	}
-# 	return result;
-# }
-#
-# CHECK:       49:       eb 4a   jmp     74 <switchCase+0x95>
-# CHECK:       57:       eb 3c   jmp     60 <switchCase+0x95>
-# CHECK:       65:       eb 2e   jmp     46 <switchCase+0x95>
-# CHECK:       73:       eb 20   jmp     32 <switchCase+0x95>
-# CHECK:       81:       eb 12   jmp     18 <switchCase+0x95>
-# CHECK:       93:       7f 8b   jg      -117 <switchCase+0x20>
-
-# Test 2:
-#
-# Source C code:
-# 
-# int ifElse(int z) {
-# 	int w = 0;
-# 	while(1) {
-# 		if(x < 0)
-# 			w++;
-# 		else if(y > 0)
-# 			w--;
-# 		else if((x & y) == 3)
-# 			w*=2;
-# 		else if ((x | y) == 18)
-# 			w += 2;
-# 		else if ((y ^ x) == 154)
-# 			w -= 3;
-# 		else if(((y ^ x) & 1) != 0)
-# 			break;
-# 	}
-# 	return w;
-# }
-#
-# CHECK:       129:       eb 13   jmp     19 <ifElse+0x7e>
-# CHECK:       12e:       eb a0   jmp     -96 <ifElse+0x10>
-# CHECK:       132:       eb 9c   jmp     -100 <ifElse+0x10>
-# CHECK:       137:       eb 97   jmp     -105 <ifElse+0x10>
-# CHECK:       13c:       eb 92   jmp     -110 <ifElse+0x10>
---- |
-  ; ModuleID = 'D:\iusers\opaparo\dev_test\branch_instruction_and_target_split_perf_nops.ll'
-  source_filename = "D:\5C\5Ciusers\5C\5Copaparo\5C\5Cdev_test\5C\5Cbranch_instruction_and_target_split_perf_nops.c"
-  target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
-  target triple = "x86_64-pc-windows-msvc19.0.24210"
-  
-  @x = common global i32 0, align 4
-  @y = common global i32 0, align 4
-  
-  ; Function Attrs: norecurse nounwind uwtable
-  define i32 @switchCase(i32 %z, i32 %w) local_unnamed_addr #0 {
-  entry:
-    %0 = load volatile i32, i32* @x, align 4, !tbaa !3
-    %cmp19 = icmp sgt i32 %0, 0
-    br i1 %cmp19, label %land.rhs.preheader, label %while.end
-  
-  land.rhs.preheader:                               ; preds = %entry
-    br label %land.rhs
-  
-  land.rhs:                                         ; preds = %sw.epilog, %land.rhs.preheader
-    %result.020 = phi i32 [ %result.1, %sw.epilog ], [ 0, %land.rhs.preheader ]
-    %1 = load volatile i32, i32* @y, align 4, !tbaa !3
-    %cmp1 = icmp slt i32 %1, 0
-    br i1 %cmp1, label %while.body, label %while.end
-  
-  while.body:                                       ; preds = %land.rhs
-    switch i32 %z, label %sw.epilog [
-      i32 0, label %sw.bb
-      i32 1, label %sw.bb2
-      i32 2, label %sw.bb3
-      i32 3, label %sw.bb5
-      i32 4, label %sw.bb6
-      i32 5, label %sw.bb7
-    ]
-  
-  sw.bb:                                            ; preds = %while.body
-    %add = mul nsw i32 %result.020, 6
-    br label %sw.epilog
-  
-  sw.bb2:                                           ; preds = %while.body
-    %dec = add nsw i32 %result.020, -1
-    br label %sw.epilog
-  
-  sw.bb3:                                           ; preds = %while.body
-    %mul4 = mul nsw i32 %result.020, %result.020
-    br label %sw.epilog
-  
-  sw.bb5:                                           ; preds = %while.body
-    %shl = shl i32 %result.020, 7
-    br label %sw.epilog
-  
-  sw.bb6:                                           ; preds = %while.body
-    %shr = ashr i32 %result.020, 7
-    br label %sw.epilog
-  
-  sw.bb7:                                           ; preds = %while.body
-    %mul8 = shl nsw i32 %result.020, 4
-    %neg = xor i32 %result.020, -1
-    %or = or i32 %mul8, %neg
-    br label %sw.epilog
-  
-  sw.epilog:                                        ; preds = %sw.bb7, %sw.bb6, %sw.bb5, %sw.bb3, %sw.bb2, %sw.bb, %while.body
-    %result.1 = phi i32 [ %result.020, %while.body ], [ %or, %sw.bb7 ], [ %shr, %sw.bb6 ], [ %shl, %sw.bb5 ], [ %mul4, %sw.bb3 ], [ %dec, %sw.bb2 ], [ %add, %sw.bb ]
-    %2 = load volatile i32, i32* @x, align 4, !tbaa !3
-    %cmp = icmp sgt i32 %2, 0
-    br i1 %cmp, label %land.rhs, label %while.end
-  
-  while.end:                                        ; preds = %sw.epilog, %land.rhs, %entry
-    %result.0.lcssa = phi i32 [ 0, %entry ], [ %result.020, %land.rhs ], [ %result.1, %sw.epilog ]
-    ret i32 %result.0.lcssa
-  }
-  
-  ; Function Attrs: norecurse nounwind uwtable
-  define i32 @ifElse(i32 %z) local_unnamed_addr #0 {
-  entry:
-    br label %while.cond.outer
-  
-  while.cond.outer:                                 ; preds = %if.then, %if.then2, %if.then5, %if.then8, %if.then11, %entry
-    %w.0.ph = phi i32 [ 0, %entry ], [ %sub, %if.then11 ], [ %add, %if.then8 ], [ %mul, %if.then5 ], [ %dec, %if.then2 ], [ %inc, %if.then ]
-    br label %while.cond
-  
-  while.cond:                                       ; preds = %if.else12, %while.cond.outer
-    %0 = load volatile i32, i32* @x, align 4, !tbaa !3
-    %cmp = icmp slt i32 %0, 0
-    br i1 %cmp, label %if.then, label %if.else
-  
-  if.then:                                          ; preds = %while.cond
-    %inc = add nsw i32 %w.0.ph, 1
-    br label %while.cond.outer
-  
-  if.else:                                          ; preds = %while.cond
-    %1 = load volatile i32, i32* @y, align 4, !tbaa !3
-    %cmp1 = icmp sgt i32 %1, 0
-    br i1 %cmp1, label %if.then2, label %if.else3
-  
-  if.then2:                                         ; preds = %if.else
-    %dec = add nsw i32 %w.0.ph, -1
-    br label %while.cond.outer
-  
-  if.else3:                                         ; preds = %if.else
-    %2 = load volatile i32, i32* @x, align 4, !tbaa !3
-    %3 = load volatile i32, i32* @y, align 4, !tbaa !3
-    %and = and i32 %3, %2
-    %cmp4 = icmp eq i32 %and, 3
-    br i1 %cmp4, label %if.then5, label %if.else6
-  
-  if.then5:                                         ; preds = %if.else3
-    %mul = shl nsw i32 %w.0.ph, 1
-    br label %while.cond.outer
-  
-  if.else6:                                         ; preds = %if.else3
-    %4 = load volatile i32, i32* @x, align 4, !tbaa !3
-    %5 = load volatile i32, i32* @y, align 4, !tbaa !3
-    %or = or i32 %5, %4
-    %cmp7 = icmp eq i32 %or, 18
-    br i1 %cmp7, label %if.then8, label %if.else9
-  
-  if.then8:                                         ; preds = %if.else6
-    %add = add nsw i32 %w.0.ph, 2
-    br label %while.cond.outer
-  
-  if.else9:                                         ; preds = %if.else6
-    %6 = load volatile i32, i32* @y, align 4, !tbaa !3
-    %7 = load volatile i32, i32* @x, align 4, !tbaa !3
-    %xor = xor i32 %7, %6
-    %cmp10 = icmp eq i32 %xor, 154
-    br i1 %cmp10, label %if.then11, label %if.else12
-  
-  if.then11:                                        ; preds = %if.else9
-    %sub = add nsw i32 %w.0.ph, -3
-    br label %while.cond.outer
-  
-  if.else12:                                        ; preds = %if.else9
-    %8 = load volatile i32, i32* @y, align 4, !tbaa !3
-    %9 = load volatile i32, i32* @x, align 4, !tbaa !3
-    %xor13 = xor i32 %9, %8
-    %and14 = and i32 %xor13, 1
-    %cmp15 = icmp eq i32 %and14, 0
-    br i1 %cmp15, label %while.cond, label %while.end
-  
-  while.end:                                        ; preds = %if.else12
-    ret i32 %w.0.ph
-  }
-  
-  attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="haswell" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" }
-  
-  !llvm.module.flags = !{!0, !1}
-  !llvm.ident = !{!2}
-  
-  !0 = !{i32 1, !"wchar_size", i32 2}
-  !1 = !{i32 7, !"PIC Level", i32 2}
-  !2 = !{!"clang version 6.0.0 (ssh://git-amr-1.devtools.intel.com:29418/dpd_icl-llvm_clang_worldread 3789ad4283ec09df1ed8411abbb227d76e7ef8cb) (ssh://git-amr-1.devtools.intel.com:29418/dpd_icl-llvm_llvm_worldread 42897913cc9fac0d94e8636d9aed4dc193d7864e)"}
-  !3 = !{!4, !4, i64 0}
-  !4 = !{!"int", !5, i64 0}
-  !5 = !{!"omnipotent char", !6, i64 0}
-  !6 = !{!"Simple C/C++ TBAA"}
-
-...
----
-name:            switchCase
-alignment:       4
-exposesReturnsTwice: false
-legalized:       false
-regBankSelected: false
-selected:        false
-tracksRegLiveness: true
-registers:       
-liveins:         
-frameInfo:       
-  isFrameAddressTaken: false
-  isReturnAddressTaken: false
-  hasStackMap:     false
-  hasPatchPoint:   false
-  stackSize:       0
-  offsetAdjustment: 0
-  maxAlignment:    0
-  adjustsStack:    false
-  hasCalls:        false
-  stackProtector:  ''
-  maxCallFrameSize: 4294967295
-  hasOpaqueSPAdjustment: false
-  hasVAStart:      false
-  hasMustTailInVarArgFunc: false
-  savePoint:       ''
-  restorePoint:    ''
-fixedStack:      
-stack:           
-constants:       
-body:             |
-
-...
----
-name:            ifElse
-alignment:       4
-exposesReturnsTwice: false
-legalized:       false
-regBankSelected: false
-selected:        false
-tracksRegLiveness: true
-registers:       
-liveins:         
-frameInfo:       
-  isFrameAddressTaken: false
-  isReturnAddressTaken: false
-  hasStackMap:     false
-  hasPatchPoint:   false
-  stackSize:       0
-  offsetAdjustment: 0
-  maxAlignment:    0
-  adjustsStack:    false
-  hasCalls:        false
-  stackProtector:  ''
-  maxCallFrameSize: 4294967295
-  hasOpaqueSPAdjustment: false
-  hasVAStart:      false
-  hasMustTailInVarArgFunc: false
-  savePoint:       ''
-  restorePoint:    ''
-fixedStack:      
-stack:           
-constants:       
-body:             |
-
-...
-- 
GitLab


From d32696ac377e9be3f23fef965a812d8897717dc5 Mon Sep 17 00:00:00 2001
From: James Henderson <jh7370@my.bristol.ac.uk>
Date: Mon, 29 Oct 2018 14:17:08 +0000
Subject: [PATCH 0699/1116] [llvm-objdump] Don't crash when using `-a` on
 non-archives

This fixes PR39402. The crash was caused when dereferencing nullptr in
DumpObject and printArchiveChild.

Reviewed By: jhenderson

Differential Revision: https://reviews.llvm.org/D53690

Patch by Xing GUO


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345503 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../llvm-objdump/non-archive-object.test      | 25 +++++++++++++++++++
 tools/llvm-objdump/llvm-objdump.cpp           |  8 +++---
 2 files changed, 29 insertions(+), 4 deletions(-)
 create mode 100644 test/tools/llvm-objdump/non-archive-object.test

diff --git a/test/tools/llvm-objdump/non-archive-object.test b/test/tools/llvm-objdump/non-archive-object.test
new file mode 100644
index 00000000000..b1884102c02
--- /dev/null
+++ b/test/tools/llvm-objdump/non-archive-object.test
@@ -0,0 +1,25 @@
+# RUN: yaml2obj %s > %t
+# RUN: llvm-objdump -a %t | FileCheck %s
+
+# If this test has not crashed, then this test passed.
+# CHECK: file format ELF64-x86-64
+
+!ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_EXEC
+  Machine:         EM_X86_64
+Sections:
+  - Name:            .bss
+    Type:            SHT_NOBITS
+    Flags:           [ SHF_ALLOC ]
+    AddressAlign:    0x0000000000000010
+    Size:            64
+  - Name:            .text
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    AddressAlign:    0x0000000000000010
+    Content:         "01234567"
+    Size:            4
+
diff --git a/tools/llvm-objdump/llvm-objdump.cpp b/tools/llvm-objdump/llvm-objdump.cpp
index a4fddf3f0c7..463408b60c5 100644
--- a/tools/llvm-objdump/llvm-objdump.cpp
+++ b/tools/llvm-objdump/llvm-objdump.cpp
@@ -2309,8 +2309,8 @@ static void DumpObject(ObjectFile *o, const Archive *a = nullptr,
     outs() << ":\tfile format " << o->getFileFormatName() << "\n\n";
   }
 
-  if (ArchiveHeaders && !MachOOpt)
-    printArchiveChild(a->getFileName(), *c);
+  if (ArchiveHeaders && !MachOOpt && c)
+    printArchiveChild(ArchiveName, *c);
   if (Disassemble)
     DisassembleObject(o, Relocations);
   if (Relocations && !Disassemble)
@@ -2363,8 +2363,8 @@ static void DumpObject(const COFFImportFile *I, const Archive *A,
            << ":\tfile format COFF-import-file"
            << "\n\n";
 
-  if (ArchiveHeaders && !MachOOpt)
-    printArchiveChild(A->getFileName(), *C);
+  if (ArchiveHeaders && !MachOOpt && C)
+    printArchiveChild(ArchiveName, *C);
   if (SymbolTable)
     printCOFFSymbolTable(I);
 }
-- 
GitLab


From 6b8000447a600e0f4a3b071840ed142689b805e8 Mon Sep 17 00:00:00 2001
From: Michael Kruse <llvm@meinersbur.de>
Date: Mon, 29 Oct 2018 14:51:02 +0000
Subject: [PATCH 0700/1116] [git/svn] Ignore Visual Studio's
 CMakeSettings.json.

When using Visual Studio's built-in support for CMake, the CMakeSettings.json contains the build configurations (build dir, generator, toolchain, cmake variables, etc). It is specific to the build machine, therefore should not be versioned.

Differential Revision: https://reviews.llvm.org/D53775

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345504 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .gitignore | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.gitignore b/.gitignore
index 224bd2f3a9c..fd308878407 100644
--- a/.gitignore
+++ b/.gitignore
@@ -41,6 +41,8 @@ cscope.out
 autoconf/aclocal.m4
 autoconf/autom4te.cache
 /compile_commands.json
+# Visual Studio built-in CMake configuration
+/CMakeSettings.json
 
 #==============================================================================#
 # Directories to ignore (do not add trailing '/'s, they skip symlinks).
-- 
GitLab


From ef885d5463db055b2bfe7cdadd3d517801eff9f1 Mon Sep 17 00:00:00 2001
From: Robert Widmann <devteam.codafi@gmail.com>
Date: Mon, 29 Oct 2018 15:31:40 +0000
Subject: [PATCH 0701/1116] [LLVM-C] Add Builder Bindings to Common Memory
 Intrinsics

Summary: Add IRBuilder bindings for memmove, memcpy, and memset.

Reviewers: whitequark, deadalnix

Reviewed By: whitequark

Subscribers: harlanhaskins, llvm-commits

Differential Revision: https://reviews.llvm.org/D53555

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345508 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm-c/Core.h | 29 +++++++++++++++++++++++++++++
 lib/IR/Core.cpp       | 24 ++++++++++++++++++++++++
 2 files changed, 53 insertions(+)

diff --git a/include/llvm-c/Core.h b/include/llvm-c/Core.h
index 0c274b62567..c905cfbb08d 100644
--- a/include/llvm-c/Core.h
+++ b/include/llvm-c/Core.h
@@ -3473,6 +3473,35 @@ LLVMValueRef LLVMBuildNot(LLVMBuilderRef, LLVMValueRef V, const char *Name);
 LLVMValueRef LLVMBuildMalloc(LLVMBuilderRef, LLVMTypeRef Ty, const char *Name);
 LLVMValueRef LLVMBuildArrayMalloc(LLVMBuilderRef, LLVMTypeRef Ty,
                                   LLVMValueRef Val, const char *Name);
+
+/**
+ * Creates and inserts a memset to the specified pointer and the 
+ * specified value.
+ *
+ * @see llvm::IRRBuilder::CreateMemSet()
+ */
+LLVMValueRef LLVMBuildMemSet(LLVMBuilderRef B, LLVMValueRef Ptr,
+                             LLVMValueRef Val, LLVMValueRef Len,
+                             unsigned Align);
+/**
+ * Creates and inserts a memcpy between the specified pointers.
+ *
+ * @see llvm::IRRBuilder::CreateMemCpy()
+ */
+LLVMValueRef LLVMBuildMemCpy(LLVMBuilderRef B, 
+                             LLVMValueRef Dst, unsigned DstAlign,
+                             LLVMValueRef Src, unsigned SrcAlign,
+                             LLVMValueRef Size);
+/**
+ * Creates and inserts a memmove between the specified pointers.
+ *
+ * @see llvm::IRRBuilder::CreateMemMove()
+ */
+LLVMValueRef LLVMBuildMemMove(LLVMBuilderRef B, 
+                              LLVMValueRef Dst, unsigned DstAlign,
+                              LLVMValueRef Src, unsigned SrcAlign,
+                              LLVMValueRef Size);
+
 LLVMValueRef LLVMBuildAlloca(LLVMBuilderRef, LLVMTypeRef Ty, const char *Name);
 LLVMValueRef LLVMBuildArrayAlloca(LLVMBuilderRef, LLVMTypeRef Ty,
                                   LLVMValueRef Val, const char *Name);
diff --git a/lib/IR/Core.cpp b/lib/IR/Core.cpp
index 27906e68636..028f9e6199d 100644
--- a/lib/IR/Core.cpp
+++ b/lib/IR/Core.cpp
@@ -3234,6 +3234,30 @@ LLVMValueRef LLVMBuildArrayMalloc(LLVMBuilderRef B, LLVMTypeRef Ty,
   return wrap(unwrap(B)->Insert(Malloc, Twine(Name)));
 }
 
+LLVMValueRef LLVMBuildMemSet(LLVMBuilderRef B, LLVMValueRef Ptr, 
+                             LLVMValueRef Val, LLVMValueRef Len,
+                             unsigned Align) {
+  return wrap(unwrap(B)->CreateMemSet(unwrap(Ptr), unwrap(Val), unwrap(Len), Align));
+}
+
+LLVMValueRef LLVMBuildMemCpy(LLVMBuilderRef B, 
+                             LLVMValueRef Dst, unsigned DstAlign,
+                             LLVMValueRef Src, unsigned SrcAlign,
+                             LLVMValueRef Size) {
+  return wrap(unwrap(B)->CreateMemCpy(unwrap(Dst), DstAlign,
+                                      unwrap(Src), SrcAlign,
+                                      unwrap(Size)));
+}
+
+LLVMValueRef LLVMBuildMemMove(LLVMBuilderRef B,
+                              LLVMValueRef Dst, unsigned DstAlign,
+                              LLVMValueRef Src, unsigned SrcAlign,
+                              LLVMValueRef Size) {
+  return wrap(unwrap(B)->CreateMemMove(unwrap(Dst), DstAlign,
+                                       unwrap(Src), SrcAlign,
+                                       unwrap(Size)));
+}
+
 LLVMValueRef LLVMBuildAlloca(LLVMBuilderRef B, LLVMTypeRef Ty,
                              const char *Name) {
   return wrap(unwrap(B)->CreateAlloca(unwrap(Ty), nullptr, Name));
-- 
GitLab


From e7732100b88cdb00a0f036c70495a63269f1e773 Mon Sep 17 00:00:00 2001
From: Luke Cheeseman <luke.cheeseman@arm.com>
Date: Mon, 29 Oct 2018 16:26:58 +0000
Subject: [PATCH 0702/1116] [AArch64] Return address signing B key support

- Add support to generate AUTIBSP, PACIBSP, RETAB instructions for return
  address signing
- The key used to sign the function is controlled by the function attribute
  "sign-return-address-key"

Differential Revision: https://reviews.llvm.org/D51427


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345511 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64FrameLowering.cpp | 23 ++++++++++++++++++---
 test/CodeGen/AArch64/sign-return-address.ll | 23 +++++++++++++++++++++
 2 files changed, 43 insertions(+), 3 deletions(-)

diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp
index b0451ca2edb..a99dd356d4f 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -597,6 +597,17 @@ static void adaptForLdStOpt(MachineBasicBlock &MBB,
   //
 }
 
+static bool ShouldSignWithAKey(MachineFunction &MF) {
+  const Function &F = MF.getFunction();
+  if (!F.hasFnAttribute("sign-return-address-key"))
+    return true;
+
+  const StringRef Key =
+      F.getFnAttribute("sign-return-address-key").getValueAsString();
+  assert(Key.equals_lower("a_key") || Key.equals_lower("b_key"));
+  return Key.equals_lower("a_key");
+}
+
 void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
                                         MachineBasicBlock &MBB) const {
   MachineBasicBlock::iterator MBBI = MBB.begin();
@@ -620,7 +631,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
   DebugLoc DL;
 
   if (ShouldSignReturnAddress(MF)) {
-    BuildMI(MBB, MBBI, DL, TII->get(AArch64::PACIASP))
+    BuildMI(
+        MBB, MBBI, DL,
+        TII->get(ShouldSignWithAKey(MF) ? AArch64::PACIASP : AArch64::PACIBSP))
         .setMIFlag(MachineInstr::FrameSetup);
   }
 
@@ -907,10 +920,14 @@ static void InsertReturnAddressAuth(MachineFunction &MF,
   // instructions, namely RETA{A,B}, that can be used instead.
   if (Subtarget.hasV8_3aOps() && MBBI != MBB.end() &&
       MBBI->getOpcode() == AArch64::RET_ReallyLR) {
-    BuildMI(MBB, MBBI, DL, TII->get(AArch64::RETAA)).copyImplicitOps(*MBBI);
+    BuildMI(MBB, MBBI, DL,
+            TII->get(ShouldSignWithAKey(MF) ? AArch64::RETAA : AArch64::RETAB))
+        .copyImplicitOps(*MBBI);
     MBB.erase(MBBI);
   } else {
-    BuildMI(MBB, MBBI, DL, TII->get(AArch64::AUTIASP))
+    BuildMI(
+        MBB, MBBI, DL,
+        TII->get(ShouldSignWithAKey(MF) ? AArch64::AUTIASP : AArch64::AUTIBSP))
         .setMIFlag(MachineInstr::FrameDestroy);
   }
 }
diff --git a/test/CodeGen/AArch64/sign-return-address.ll b/test/CodeGen/AArch64/sign-return-address.ll
index a0c73058a30..c057c815acf 100644
--- a/test/CodeGen/AArch64/sign-return-address.ll
+++ b/test/CodeGen/AArch64/sign-return-address.ll
@@ -84,3 +84,26 @@ define fastcc void @spill_lr_and_tail_call(i64 %x) "sign-return-address"="all" {
   tail call fastcc i64 @bar(i64 %x)
   ret void
 }
+
+; CHECK-LABEL: @leaf_sign_all_a_key
+; CHECK: paciasp
+; CHECK: autiasp
+define i32 @leaf_sign_all_a_key(i32 %x) "sign-return-address"="all" "sign-return-address-key"="a_key" {
+  ret i32 %x
+}
+
+; CHECK-LABEL: @leaf_sign_all_b_key
+; CHECK: pacibsp
+; CHECK: autibsp
+define i32 @leaf_sign_all_b_key(i32 %x) "sign-return-address"="all" "sign-return-address-key"="b_key" {
+  ret i32 %x
+}
+
+; CHECK-LABEL: @leaf_sign_all_v83_b_key
+; CHECK: pacibsp
+; CHECK-NOT: ret
+; CHECK: retab
+; CHECK-NOT: ret
+define i32 @leaf_sign_all_v83_b_key(i32 %x) "sign-return-address"="all" "target-features"="+v8.3a" "sign-return-address-key"="b_key" {
+  ret i32 %x
+}
-- 
GitLab


From 5aeb36fdcb23eb0c3dabf24e46c02a4190abe630 Mon Sep 17 00:00:00 2001
From: Leonard Chan <leonardchan@google.com>
Date: Mon, 29 Oct 2018 16:54:37 +0000
Subject: [PATCH 0703/1116] [Intrinsic] Signed and Unsigned Saturation
 Subtraction Intirnsics

Add an intrinsic that takes 2 integers and perform saturation subtraction on
them.

This is a part of implementing fixed point arithmetic in clang where some of
the more complex operations will be implemented as intrinsics.

Differential Revision: https://reviews.llvm.org/D53783

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345512 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/ISDOpcodes.h             |   8 +
 include/llvm/CodeGen/TargetLowering.h         |   7 +-
 include/llvm/IR/Intrinsics.td                 |   6 +
 include/llvm/Target/TargetSelectionDAG.td     |   2 +
 lib/CodeGen/SelectionDAG/LegalizeDAG.cpp      |  10 +-
 .../SelectionDAG/LegalizeIntegerTypes.cpp     |  35 ++-
 lib/CodeGen/SelectionDAG/LegalizeTypes.h      |   4 +-
 .../SelectionDAG/LegalizeVectorOps.cpp        |   2 +
 .../SelectionDAG/LegalizeVectorTypes.cpp      |   4 +
 .../SelectionDAG/SelectionDAGBuilder.cpp      |  12 +
 .../SelectionDAG/SelectionDAGDumper.cpp       |   2 +
 lib/CodeGen/SelectionDAG/TargetLowering.cpp   |  52 ++--
 lib/CodeGen/TargetLoweringBase.cpp            |   2 +
 lib/IR/Verifier.cpp                           |  16 +-
 test/CodeGen/X86/ssub_sat.ll                  | 267 ++++++++++++++++++
 test/CodeGen/X86/usub_sat.ll                  | 158 +++++++++++
 16 files changed, 546 insertions(+), 41 deletions(-)
 create mode 100644 test/CodeGen/X86/ssub_sat.ll
 create mode 100644 test/CodeGen/X86/usub_sat.ll

diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h
index 1c0318d6a70..b8e3129ed6c 100644
--- a/include/llvm/CodeGen/ISDOpcodes.h
+++ b/include/llvm/CodeGen/ISDOpcodes.h
@@ -264,6 +264,14 @@ namespace ISD {
     /// resulting value is this minimum value.
     SADDSAT, UADDSAT,
 
+    /// RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2
+    /// integers with the same bit width (W). If the true value of LHS - RHS
+    /// exceeds the largest value that can be represented by W bits, the
+    /// resulting value is this maximum value. Otherwise, if this value is less
+    /// than the smallest value that can be represented by W bits, the
+    /// resulting value is this minimum value.
+    SSUBSAT, USUBSAT,
+
     /// Simple binary floating point operators.
     FADD, FSUB, FMUL, FDIV, FREM,
 
diff --git a/include/llvm/CodeGen/TargetLowering.h b/include/llvm/CodeGen/TargetLowering.h
index 2475a0f3686..a4356db800a 100644
--- a/include/llvm/CodeGen/TargetLowering.h
+++ b/include/llvm/CodeGen/TargetLowering.h
@@ -3736,9 +3736,10 @@ public:
   SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT,
                                   SDValue Index) const;
 
-  /// Method for building the DAG expansion of ISD::[US]ADDSAT. This method
-  /// accepts integers or vectors of integers as its arguments.
-  SDValue getExpandedSaturationAddition(SDNode *Node, SelectionDAG &DAG) const;
+  /// Method for building the DAG expansion of ISD::[US][ADD|SUB]SAT. This
+  /// method accepts integers or vectors of integers as its arguments.
+  SDValue getExpandedSaturationAdditionSubtraction(SDNode *Node,
+                                                   SelectionDAG &DAG) const;
 
   //===--------------------------------------------------------------------===//
   // Instruction Emitting Hooks
diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td
index e49fa147709..989a04d65de 100644
--- a/include/llvm/IR/Intrinsics.td
+++ b/include/llvm/IR/Intrinsics.td
@@ -716,6 +716,12 @@ def int_sadd_sat : Intrinsic<[llvm_anyint_ty],
 def int_uadd_sat : Intrinsic<[llvm_anyint_ty],
                              [LLVMMatchType<0>, LLVMMatchType<0>],
                              [IntrNoMem, IntrSpeculatable, Commutative]>;
+def int_ssub_sat : Intrinsic<[llvm_anyint_ty],
+                             [LLVMMatchType<0>, LLVMMatchType<0>],
+                             [IntrNoMem, IntrSpeculatable]>;
+def int_usub_sat : Intrinsic<[llvm_anyint_ty],
+                             [LLVMMatchType<0>, LLVMMatchType<0>],
+                             [IntrNoMem, IntrSpeculatable]>;
 
 //===------------------------- Memory Use Markers -------------------------===//
 //
diff --git a/include/llvm/Target/TargetSelectionDAG.td b/include/llvm/Target/TargetSelectionDAG.td
index dfc3ce86217..532e866be55 100644
--- a/include/llvm/Target/TargetSelectionDAG.td
+++ b/include/llvm/Target/TargetSelectionDAG.td
@@ -375,6 +375,8 @@ def umax       : SDNode<"ISD::UMAX"      , SDTIntBinOp,
 
 def saddsat    : SDNode<"ISD::SADDSAT"   , SDTIntBinOp, [SDNPCommutative]>;
 def uaddsat    : SDNode<"ISD::UADDSAT"   , SDTIntBinOp, [SDNPCommutative]>;
+def ssubsat    : SDNode<"ISD::SSUBSAT"   , SDTIntBinOp>;
+def usubsat    : SDNode<"ISD::USUBSAT"   , SDTIntBinOp>;
 
 def sext_inreg : SDNode<"ISD::SIGN_EXTEND_INREG", SDTExtInreg>;
 def sext_invec : SDNode<"ISD::SIGN_EXTEND_VECTOR_INREG", SDTExtInvec>;
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 130b33d0767..6d9e69e2d64 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1115,7 +1115,9 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
                                             Node->getValueType(0));
     break;
   case ISD::SADDSAT:
-  case ISD::UADDSAT: {
+  case ISD::UADDSAT:
+  case ISD::SSUBSAT:
+  case ISD::USUBSAT: {
     Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
     break;
   }
@@ -3254,8 +3256,10 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     break;
   }
   case ISD::SADDSAT:
-  case ISD::UADDSAT: {
-    Results.push_back(TLI.getExpandedSaturationAddition(Node, DAG));
+  case ISD::UADDSAT:
+  case ISD::SSUBSAT:
+  case ISD::USUBSAT: {
+    Results.push_back(TLI.getExpandedSaturationAdditionSubtraction(Node, DAG));
     break;
   }
   case ISD::SADDO:
diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 690a64e724b..f24659ac274 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -142,7 +142,9 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
   case ISD::SUBCARRY:    Res = PromoteIntRes_ADDSUBCARRY(N, ResNo); break;
 
   case ISD::SADDSAT:
-  case ISD::UADDSAT:     Res = PromoteIntRes_ADDSAT(N); break;
+  case ISD::UADDSAT:
+  case ISD::SSUBSAT:
+  case ISD::USUBSAT:     Res = PromoteIntRes_ADDSUBSAT(N); break;
 
   case ISD::ATOMIC_LOAD:
     Res = PromoteIntRes_Atomic0(cast<AtomicSDNode>(N)); break;
@@ -549,11 +551,11 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Overflow(SDNode *N) {
   return SDValue(Res.getNode(), 1);
 }
 
-SDValue DAGTypeLegalizer::PromoteIntRes_ADDSAT(SDNode *N) {
+SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSAT(SDNode *N) {
   // For promoting iN -> iM, this can be expanded by
   // 1. ANY_EXTEND iN to iM
   // 2. SHL by M-N
-  // 3. U/SADDSAT
+  // 3. [US][ADD|SUB]SAT
   // 4. L/ASHR by M-N
   SDLoc dl(N);
   SDValue Op1 = N->getOperand(0);
@@ -561,9 +563,20 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ADDSAT(SDNode *N) {
   unsigned OldBits = Op1.getValueSizeInBits();
 
   unsigned Opcode = N->getOpcode();
-  assert((Opcode == ISD::SADDSAT || Opcode == ISD::UADDSAT) &&
-         "Expected opcode to be SADDSAT or UADDSAT");
-  unsigned ShiftOp = Opcode == ISD::SADDSAT ? ISD::SRA : ISD::SRL;
+  unsigned ShiftOp;
+  switch (Opcode) {
+  case ISD::SADDSAT:
+  case ISD::SSUBSAT:
+    ShiftOp = ISD::SRA;
+    break;
+  case ISD::UADDSAT:
+  case ISD::USUBSAT:
+    ShiftOp = ISD::SRL;
+    break;
+  default:
+    llvm_unreachable("Expected opcode to be signed or unsigned saturation "
+                     "addition or subtraction");
+  }
 
   SDValue Op1Promoted = GetPromotedInteger(Op1);
   SDValue Op2Promoted = GetPromotedInteger(Op2);
@@ -1505,7 +1518,9 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
   case ISD::SMULO: ExpandIntRes_XMULO(N, Lo, Hi); break;
 
   case ISD::SADDSAT:
-  case ISD::UADDSAT: ExpandIntRes_ADDSAT(N, Lo, Hi); break;
+  case ISD::UADDSAT:
+  case ISD::SSUBSAT:
+  case ISD::USUBSAT: ExpandIntRes_ADDSUBSAT(N, Lo, Hi); break;
   }
 
   // If Lo/Hi is null, the sub-method took care of registering results etc.
@@ -2468,9 +2483,9 @@ void DAGTypeLegalizer::ExpandIntRes_READCYCLECOUNTER(SDNode *N, SDValue &Lo,
   ReplaceValueWith(SDValue(N, 1), R.getValue(2));
 }
 
-void DAGTypeLegalizer::ExpandIntRes_ADDSAT(SDNode *N, SDValue &Lo,
-                                           SDValue &Hi) {
-  SDValue Result = TLI.getExpandedSaturationAddition(N, DAG);
+void DAGTypeLegalizer::ExpandIntRes_ADDSUBSAT(SDNode *N, SDValue &Lo,
+                                              SDValue &Hi) {
+  SDValue Result = TLI.getExpandedSaturationAdditionSubtraction(N, DAG);
   SplitInteger(Result, Lo, Hi);
 }
 
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 605c63c72d4..8b7c57cbb3b 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -330,7 +330,7 @@ private:
   SDValue PromoteIntRes_UNDEF(SDNode *N);
   SDValue PromoteIntRes_VAARG(SDNode *N);
   SDValue PromoteIntRes_XMULO(SDNode *N, unsigned ResNo);
-  SDValue PromoteIntRes_ADDSAT(SDNode *N);
+  SDValue PromoteIntRes_ADDSUBSAT(SDNode *N);
 
   // Integer Operand Promotion.
   bool PromoteIntegerOperand(SDNode *N, unsigned OpNo);
@@ -415,7 +415,7 @@ private:
   void ExpandIntRes_SADDSUBO          (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_UADDSUBO          (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_XMULO             (SDNode *N, SDValue &Lo, SDValue &Hi);
-  void ExpandIntRes_ADDSAT            (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_ADDSUBSAT         (SDNode *N, SDValue &Lo, SDValue &Hi);
 
   void ExpandIntRes_ATOMIC_LOAD       (SDNode *N, SDValue &Lo, SDValue &Hi);
 
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 122a9856ade..109276a5cbb 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -392,6 +392,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
   case ISD::FCANONICALIZE:
   case ISD::SADDSAT:
   case ISD::UADDSAT:
+  case ISD::SSUBSAT:
+  case ISD::USUBSAT:
     Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
     break;
   case ISD::FP_ROUND_INREG:
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 1027f31d084..e7ad25155eb 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -124,6 +124,8 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
 
   case ISD::SADDSAT:
   case ISD::UADDSAT:
+  case ISD::SSUBSAT:
+  case ISD::USUBSAT:
 
   case ISD::FPOW:
   case ISD::FREM:
@@ -807,6 +809,8 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::UMAX:
   case ISD::SADDSAT:
   case ISD::UADDSAT:
+  case ISD::SSUBSAT:
+  case ISD::USUBSAT:
     SplitVecRes_BinOp(N, Lo, Hi);
     break;
   case ISD::FMA:
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 3434f24db91..ddead1d93a5 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5783,6 +5783,18 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     setValue(&I, DAG.getNode(ISD::UADDSAT, sdl, Op1.getValueType(), Op1, Op2));
     return nullptr;
   }
+  case Intrinsic::ssub_sat: {
+    SDValue Op1 = getValue(I.getArgOperand(0));
+    SDValue Op2 = getValue(I.getArgOperand(1));
+    setValue(&I, DAG.getNode(ISD::SSUBSAT, sdl, Op1.getValueType(), Op1, Op2));
+    return nullptr;
+  }
+  case Intrinsic::usub_sat: {
+    SDValue Op1 = getValue(I.getArgOperand(0));
+    SDValue Op2 = getValue(I.getArgOperand(1));
+    setValue(&I, DAG.getNode(ISD::USUBSAT, sdl, Op1.getValueType(), Op1, Op2));
+    return nullptr;
+  }
   case Intrinsic::stacksave: {
     SDValue Op = getRoot();
     Res = DAG.getNode(
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 1c9a49306c6..bae163d5386 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -286,6 +286,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
 
   case ISD::SADDSAT:                    return "saddsat";
   case ISD::UADDSAT:                    return "uaddsat";
+  case ISD::SSUBSAT:                    return "ssubsat";
+  case ISD::USUBSAT:                    return "usubsat";
 
   // Conversion operators.
   case ISD::SIGN_EXTEND:                return "sign_extend";
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 103a7509835..a356e4d728f 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -4983,11 +4983,27 @@ SDValue TargetLowering::lowerCmpEqZeroToCtlzSrl(SDValue Op,
   return SDValue();
 }
 
-SDValue TargetLowering::getExpandedSaturationAddition(SDNode *Node,
-                                                      SelectionDAG &DAG) const {
+SDValue TargetLowering::getExpandedSaturationAdditionSubtraction(
+    SDNode *Node, SelectionDAG &DAG) const {
   unsigned Opcode = Node->getOpcode();
-  assert((Opcode == ISD::SADDSAT || Opcode == ISD::UADDSAT) &&
-         "Expected method to receive SADDSAT or UADDSAT node.");
+  unsigned OverflowOp;
+  switch (Opcode) {
+  case ISD::SADDSAT:
+    OverflowOp = ISD::SADDO;
+    break;
+  case ISD::UADDSAT:
+    OverflowOp = ISD::UADDO;
+    break;
+  case ISD::SSUBSAT:
+    OverflowOp = ISD::SSUBO;
+    break;
+  case ISD::USUBSAT:
+    OverflowOp = ISD::USUBO;
+    break;
+  default:
+    llvm_unreachable("Expected method to receive signed or unsigned saturation "
+                     "addition or subtraction node.");
+  }
   assert(Node->getNumOperands() == 2 && "Expected node to have 2 operands.");
 
   SDLoc dl(Node);
@@ -5002,31 +5018,35 @@ SDValue TargetLowering::getExpandedSaturationAddition(SDNode *Node,
   assert(LHS.getValueType() == RHS.getValueType() &&
          "Expected both operands to be the same type");
 
-  unsigned OverflowOp = Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::UADDO;
   unsigned BitWidth = LHS.getValueSizeInBits();
   EVT ResultType = LHS.getValueType();
   EVT BoolVT =
       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), ResultType);
   SDValue Result =
       DAG.getNode(OverflowOp, dl, DAG.getVTList(ResultType, BoolVT), LHS, RHS);
-  SDValue Sum = Result.getValue(0);
+  SDValue SumDiff = Result.getValue(0);
   SDValue Overflow = Result.getValue(1);
   SDValue Zero = DAG.getConstant(0, dl, ResultType);
 
-  if (Opcode == ISD::SADDSAT) {
-    // SatMax -> Overflow && Sum < 0
-    // SatMin -> Overflow && Sum > 0
+  if (Opcode == ISD::UADDSAT) {
+    // Just need to check overflow for SatMax.
+    APInt MaxVal = APInt::getMaxValue(BitWidth);
+    SDValue SatMax = DAG.getConstant(MaxVal, dl, ResultType);
+    return DAG.getSelect(dl, ResultType, Overflow, SatMax, SumDiff);
+  } else if (Opcode == ISD::USUBSAT) {
+    // Just need to check overflow for SatMin.
+    APInt MinVal = APInt::getMinValue(BitWidth);
+    SDValue SatMin = DAG.getConstant(MinVal, dl, ResultType);
+    return DAG.getSelect(dl, ResultType, Overflow, SatMin, SumDiff);
+  } else {
+    // SatMax -> Overflow && SumDiff < 0
+    // SatMin -> Overflow && SumDiff >= 0
     APInt MinVal = APInt::getSignedMinValue(BitWidth);
     APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
     SDValue SatMin = DAG.getConstant(MinVal, dl, ResultType);
     SDValue SatMax = DAG.getConstant(MaxVal, dl, ResultType);
-    SDValue SumNeg = DAG.getSetCC(dl, BoolVT, Sum, Zero, ISD::SETLT);
+    SDValue SumNeg = DAG.getSetCC(dl, BoolVT, SumDiff, Zero, ISD::SETLT);
     Result = DAG.getSelect(dl, ResultType, SumNeg, SatMax, SatMin);
-    return DAG.getSelect(dl, ResultType, Overflow, Result, Sum);
-  } else {
-    // Just need to check overflow for SatMax.
-    APInt MaxVal = APInt::getMaxValue(BitWidth);
-    SDValue SatMax = DAG.getConstant(MaxVal, dl, ResultType);
-    return DAG.getSelect(dl, ResultType, Overflow, SatMax, Sum);
+    return DAG.getSelect(dl, ResultType, Overflow, Result, SumDiff);
   }
 }
diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp
index 715112edc17..30887e2d5f8 100644
--- a/lib/CodeGen/TargetLoweringBase.cpp
+++ b/lib/CodeGen/TargetLoweringBase.cpp
@@ -612,6 +612,8 @@ void TargetLoweringBase::initActions() {
     setOperationAction(ISD::ABS, VT, Expand);
     setOperationAction(ISD::SADDSAT, VT, Expand);
     setOperationAction(ISD::UADDSAT, VT, Expand);
+    setOperationAction(ISD::SSUBSAT, VT, Expand);
+    setOperationAction(ISD::USUBSAT, VT, Expand);
 
     // Overflow operations default to expand
     setOperationAction(ISD::SADDO, VT, Expand);
diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp
index ae578c91ae8..3c6defdfde3 100644
--- a/lib/IR/Verifier.cpp
+++ b/lib/IR/Verifier.cpp
@@ -4475,15 +4475,17 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) {
     break;
   }
   case Intrinsic::sadd_sat:
-  case Intrinsic::uadd_sat: {
+  case Intrinsic::uadd_sat:
+  case Intrinsic::ssub_sat:
+  case Intrinsic::usub_sat: {
     Value *Op1 = CS.getArgOperand(0);
     Value *Op2 = CS.getArgOperand(1);
-    Assert(
-        Op1->getType()->isIntOrIntVectorTy(),
-        "first operand of [us]add_sat must be an int type or vector of ints");
-    Assert(
-        Op2->getType()->isIntOrIntVectorTy(),
-        "second operand of [us]add_sat must be an int type or vector of ints");
+    Assert(Op1->getType()->isIntOrIntVectorTy(),
+           "first operand of [us][add|sub]_sat must be an int type or vector "
+           "of ints");
+    Assert(Op2->getType()->isIntOrIntVectorTy(),
+           "second operand of [us][add|sub]_sat must be an int type or vector "
+           "of ints");
     break;
   }
   };
diff --git a/test/CodeGen/X86/ssub_sat.ll b/test/CodeGen/X86/ssub_sat.ll
new file mode 100644
index 00000000000..6d9a534fad9
--- /dev/null
+++ b/test/CodeGen/X86/ssub_sat.ll
@@ -0,0 +1,267 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux | FileCheck %s
+; RUN: llc < %s -mcpu=generic -mtriple=i686 -mattr=cmov | FileCheck %s --check-prefix=CHECK32
+
+declare  i4  @llvm.ssub.sat.i4   (i4,  i4)
+declare  i32 @llvm.ssub.sat.i32  (i32, i32)
+declare  i64 @llvm.ssub.sat.i64  (i64, i64)
+declare  <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32>, <4 x i32>)
+
+define i32 @func(i32 %x, i32 %y) {
+; CHECK-LABEL: func:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    movl %edi, %ecx
+; CHECK-NEXT:    subl %esi, %ecx
+; CHECK-NEXT:    setns %al
+; CHECK-NEXT:    addl $2147483647, %eax # imm = 0x7FFFFFFF
+; CHECK-NEXT:    subl %esi, %edi
+; CHECK-NEXT:    cmovnol %edi, %eax
+; CHECK-NEXT:    retq
+;
+; CHECK32-LABEL: func:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    pushl %esi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 8
+; CHECK32-NEXT:    .cfi_offset %esi, -8
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK32-NEXT:    xorl %ecx, %ecx
+; CHECK32-NEXT:    movl %eax, %esi
+; CHECK32-NEXT:    subl %edx, %esi
+; CHECK32-NEXT:    setns %cl
+; CHECK32-NEXT:    addl $2147483647, %ecx # imm = 0x7FFFFFFF
+; CHECK32-NEXT:    subl %edx, %eax
+; CHECK32-NEXT:    cmovol %ecx, %eax
+; CHECK32-NEXT:    popl %esi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 4
+; CHECK32-NEXT:    retl
+  %tmp = call i32 @llvm.ssub.sat.i32(i32 %x, i32 %y);
+  ret i32 %tmp;
+}
+
+define i64 @func2(i64 %x, i64 %y) {
+; CHECK-LABEL: func2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    subq %rsi, %rax
+; CHECK-NEXT:    setns %cl
+; CHECK-NEXT:    movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
+; CHECK-NEXT:    addq %rcx, %rax
+; CHECK-NEXT:    subq %rsi, %rdi
+; CHECK-NEXT:    cmovnoq %rdi, %rax
+; CHECK-NEXT:    retq
+;
+; CHECK32-LABEL: func2:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    pushl %ebp
+; CHECK32-NEXT:    .cfi_def_cfa_offset 8
+; CHECK32-NEXT:    pushl %ebx
+; CHECK32-NEXT:    .cfi_def_cfa_offset 12
+; CHECK32-NEXT:    pushl %edi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK32-NEXT:    pushl %esi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 20
+; CHECK32-NEXT:    .cfi_offset %esi, -20
+; CHECK32-NEXT:    .cfi_offset %edi, -16
+; CHECK32-NEXT:    .cfi_offset %ebx, -12
+; CHECK32-NEXT:    .cfi_offset %ebp, -8
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; CHECK32-NEXT:    subl {{[0-9]+}}(%esp), %edi
+; CHECK32-NEXT:    movl %ebx, %ebp
+; CHECK32-NEXT:    sbbl %esi, %ebp
+; CHECK32-NEXT:    movl %ebp, %eax
+; CHECK32-NEXT:    sarl $31, %eax
+; CHECK32-NEXT:    xorl %ecx, %ecx
+; CHECK32-NEXT:    testl %ebp, %ebp
+; CHECK32-NEXT:    setns %cl
+; CHECK32-NEXT:    movl %ecx, %edx
+; CHECK32-NEXT:    addl $2147483647, %edx # imm = 0x7FFFFFFF
+; CHECK32-NEXT:    testl %ebx, %ebx
+; CHECK32-NEXT:    setns %bl
+; CHECK32-NEXT:    cmpb %cl, %bl
+; CHECK32-NEXT:    setne %cl
+; CHECK32-NEXT:    testl %esi, %esi
+; CHECK32-NEXT:    setns %ch
+; CHECK32-NEXT:    cmpb %ch, %bl
+; CHECK32-NEXT:    setne %ch
+; CHECK32-NEXT:    testb %cl, %ch
+; CHECK32-NEXT:    cmovel %ebp, %edx
+; CHECK32-NEXT:    cmovel %edi, %eax
+; CHECK32-NEXT:    popl %esi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK32-NEXT:    popl %edi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 12
+; CHECK32-NEXT:    popl %ebx
+; CHECK32-NEXT:    .cfi_def_cfa_offset 8
+; CHECK32-NEXT:    popl %ebp
+; CHECK32-NEXT:    .cfi_def_cfa_offset 4
+; CHECK32-NEXT:    retl
+  %tmp = call i64 @llvm.ssub.sat.i64(i64 %x, i64 %y);
+  ret i64 %tmp;
+}
+
+define i4 @func3(i4 %x, i4 %y) {
+; CHECK-LABEL: func3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    shlb $4, %sil
+; CHECK-NEXT:    shlb $4, %al
+; CHECK-NEXT:    movl %eax, %ecx
+; CHECK-NEXT:    subb %sil, %cl
+; CHECK-NEXT:    setns %cl
+; CHECK-NEXT:    subb %sil, %al
+; CHECK-NEXT:    jno .LBB2_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    addb $127, %cl
+; CHECK-NEXT:    movl %ecx, %eax
+; CHECK-NEXT:  .LBB2_2:
+; CHECK-NEXT:    sarb $4, %al
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    retq
+;
+; CHECK32-LABEL: func3:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; CHECK32-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; CHECK32-NEXT:    shlb $4, %dl
+; CHECK32-NEXT:    shlb $4, %al
+; CHECK32-NEXT:    movl %eax, %ecx
+; CHECK32-NEXT:    subb %dl, %cl
+; CHECK32-NEXT:    setns %cl
+; CHECK32-NEXT:    subb %dl, %al
+; CHECK32-NEXT:    jno .LBB2_2
+; CHECK32-NEXT:  # %bb.1:
+; CHECK32-NEXT:    addb $127, %cl
+; CHECK32-NEXT:    movl %ecx, %eax
+; CHECK32-NEXT:  .LBB2_2:
+; CHECK32-NEXT:    sarb $4, %al
+; CHECK32-NEXT:    retl
+  %tmp = call i4 @llvm.ssub.sat.i4(i4 %x, i4 %y);
+  ret i4 %tmp;
+}
+
+define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vec:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
+; CHECK-NEXT:    movd %xmm2, %ecx
+; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; CHECK-NEXT:    movd %xmm2, %r8d
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    movl %r8d, %esi
+; CHECK-NEXT:    subl %ecx, %esi
+; CHECK-NEXT:    setns %dl
+; CHECK-NEXT:    addl $2147483647, %edx # imm = 0x7FFFFFFF
+; CHECK-NEXT:    subl %ecx, %r8d
+; CHECK-NEXT:    cmovol %edx, %r8d
+; CHECK-NEXT:    movd %xmm1, %edx
+; CHECK-NEXT:    movd %xmm0, %ecx
+; CHECK-NEXT:    xorl %esi, %esi
+; CHECK-NEXT:    movl %ecx, %edi
+; CHECK-NEXT:    subl %edx, %edi
+; CHECK-NEXT:    setns %sil
+; CHECK-NEXT:    addl $2147483647, %esi # imm = 0x7FFFFFFF
+; CHECK-NEXT:    subl %edx, %ecx
+; CHECK-NEXT:    cmovol %esi, %ecx
+; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; CHECK-NEXT:    movd %xmm2, %edx
+; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; CHECK-NEXT:    movd %xmm2, %eax
+; CHECK-NEXT:    xorl %edi, %edi
+; CHECK-NEXT:    movl %eax, %esi
+; CHECK-NEXT:    subl %edx, %esi
+; CHECK-NEXT:    setns %dil
+; CHECK-NEXT:    addl $2147483647, %edi # imm = 0x7FFFFFFF
+; CHECK-NEXT:    subl %edx, %eax
+; CHECK-NEXT:    cmovol %edi, %eax
+; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; CHECK-NEXT:    movd %xmm1, %r9d
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; CHECK-NEXT:    movd %xmm0, %edx
+; CHECK-NEXT:    xorl %edi, %edi
+; CHECK-NEXT:    movl %edx, %esi
+; CHECK-NEXT:    subl %r9d, %esi
+; CHECK-NEXT:    setns %dil
+; CHECK-NEXT:    addl $2147483647, %edi # imm = 0x7FFFFFFF
+; CHECK-NEXT:    subl %r9d, %edx
+; CHECK-NEXT:    cmovol %edi, %edx
+; CHECK-NEXT:    movd %edx, %xmm0
+; CHECK-NEXT:    movd %eax, %xmm1
+; CHECK-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    movd %ecx, %xmm0
+; CHECK-NEXT:    movd %r8d, %xmm2
+; CHECK-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; CHECK-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    retq
+;
+; CHECK32-LABEL: vec:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    pushl %ebp
+; CHECK32-NEXT:    .cfi_def_cfa_offset 8
+; CHECK32-NEXT:    pushl %ebx
+; CHECK32-NEXT:    .cfi_def_cfa_offset 12
+; CHECK32-NEXT:    pushl %edi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK32-NEXT:    pushl %esi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 20
+; CHECK32-NEXT:    .cfi_offset %esi, -20
+; CHECK32-NEXT:    .cfi_offset %edi, -16
+; CHECK32-NEXT:    .cfi_offset %ebx, -12
+; CHECK32-NEXT:    .cfi_offset %ebp, -8
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK32-NEXT:    xorl %eax, %eax
+; CHECK32-NEXT:    movl %ecx, %esi
+; CHECK32-NEXT:    subl %edx, %esi
+; CHECK32-NEXT:    setns %al
+; CHECK32-NEXT:    addl $2147483647, %eax # imm = 0x7FFFFFFF
+; CHECK32-NEXT:    subl %edx, %ecx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK32-NEXT:    cmovol %eax, %ecx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK32-NEXT:    xorl %eax, %eax
+; CHECK32-NEXT:    movl %edx, %edi
+; CHECK32-NEXT:    subl %esi, %edi
+; CHECK32-NEXT:    setns %al
+; CHECK32-NEXT:    addl $2147483647, %eax # imm = 0x7FFFFFFF
+; CHECK32-NEXT:    subl %esi, %edx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK32-NEXT:    cmovol %eax, %edx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; CHECK32-NEXT:    xorl %eax, %eax
+; CHECK32-NEXT:    movl %esi, %ebx
+; CHECK32-NEXT:    subl %edi, %ebx
+; CHECK32-NEXT:    setns %al
+; CHECK32-NEXT:    addl $2147483647, %eax # imm = 0x7FFFFFFF
+; CHECK32-NEXT:    subl %edi, %esi
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; CHECK32-NEXT:    cmovol %eax, %esi
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    xorl %ebx, %ebx
+; CHECK32-NEXT:    movl %edi, %ebp
+; CHECK32-NEXT:    subl %eax, %ebp
+; CHECK32-NEXT:    setns %bl
+; CHECK32-NEXT:    addl $2147483647, %ebx # imm = 0x7FFFFFFF
+; CHECK32-NEXT:    subl %eax, %edi
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    cmovol %ebx, %edi
+; CHECK32-NEXT:    movl %ecx, 12(%eax)
+; CHECK32-NEXT:    movl %edx, 8(%eax)
+; CHECK32-NEXT:    movl %esi, 4(%eax)
+; CHECK32-NEXT:    movl %edi, (%eax)
+; CHECK32-NEXT:    popl %esi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK32-NEXT:    popl %edi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 12
+; CHECK32-NEXT:    popl %ebx
+; CHECK32-NEXT:    .cfi_def_cfa_offset 8
+; CHECK32-NEXT:    popl %ebp
+; CHECK32-NEXT:    .cfi_def_cfa_offset 4
+; CHECK32-NEXT:    retl $4
+  %tmp = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %x, <4 x i32> %y);
+  ret <4 x i32> %tmp;
+}
diff --git a/test/CodeGen/X86/usub_sat.ll b/test/CodeGen/X86/usub_sat.ll
new file mode 100644
index 00000000000..1c9a5c56c25
--- /dev/null
+++ b/test/CodeGen/X86/usub_sat.ll
@@ -0,0 +1,158 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux | FileCheck %s
+; RUN: llc < %s -mcpu=generic -mtriple=i686 -mattr=cmov | FileCheck %s --check-prefix=CHECK32
+
+declare  i4  @llvm.usub.sat.i4   (i4,  i4)
+declare  i32 @llvm.usub.sat.i32  (i32, i32)
+declare  i64 @llvm.usub.sat.i64  (i64, i64)
+declare  <4 x i32> @llvm.usub.sat.v4i32(<4 x i32>, <4 x i32>)
+
+define i32 @func(i32 %x, i32 %y) {
+; CHECK-LABEL: func:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    subl %esi, %edi
+; CHECK-NEXT:    cmovael %edi, %eax
+; CHECK-NEXT:    retq
+;
+; CHECK32-LABEL: func:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    xorl %ecx, %ecx
+; CHECK32-NEXT:    subl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    cmovbl %ecx, %eax
+; CHECK32-NEXT:    retl
+  %tmp = call i32 @llvm.usub.sat.i32(i32 %x, i32 %y);
+  ret i32 %tmp;
+}
+
+define i64 @func2(i64 %x, i64 %y) {
+; CHECK-LABEL: func2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    subq %rsi, %rdi
+; CHECK-NEXT:    cmovaeq %rdi, %rax
+; CHECK-NEXT:    retq
+;
+; CHECK32-LABEL: func2:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK32-NEXT:    xorl %ecx, %ecx
+; CHECK32-NEXT:    subl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    sbbl {{[0-9]+}}(%esp), %edx
+; CHECK32-NEXT:    cmovbl %ecx, %edx
+; CHECK32-NEXT:    cmovbl %ecx, %eax
+; CHECK32-NEXT:    retl
+  %tmp = call i64 @llvm.usub.sat.i64(i64 %x, i64 %y);
+  ret i64 %tmp;
+}
+
+define i4 @func3(i4 %x, i4 %y) {
+; CHECK-LABEL: func3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    shlb $4, %sil
+; CHECK-NEXT:    shlb $4, %al
+; CHECK-NEXT:    subb %sil, %al
+; CHECK-NEXT:    jae .LBB2_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:  .LBB2_2:
+; CHECK-NEXT:    shrb $4, %al
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    retq
+;
+; CHECK32-LABEL: func3:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; CHECK32-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; CHECK32-NEXT:    shlb $4, %cl
+; CHECK32-NEXT:    shlb $4, %al
+; CHECK32-NEXT:    subb %cl, %al
+; CHECK32-NEXT:    jae .LBB2_2
+; CHECK32-NEXT:  # %bb.1:
+; CHECK32-NEXT:    xorl %eax, %eax
+; CHECK32-NEXT:  .LBB2_2:
+; CHECK32-NEXT:    shrb $4, %al
+; CHECK32-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK32-NEXT:    retl
+  %tmp = call i4 @llvm.usub.sat.i4(i4 %x, i4 %y);
+  ret i4 %tmp;
+}
+
+define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vec:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
+; CHECK-NEXT:    movd %xmm2, %eax
+; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; CHECK-NEXT:    movd %xmm2, %ecx
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    subl %eax, %ecx
+; CHECK-NEXT:    cmovbl %edx, %ecx
+; CHECK-NEXT:    movd %ecx, %xmm2
+; CHECK-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; CHECK-NEXT:    movd %xmm3, %eax
+; CHECK-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; CHECK-NEXT:    movd %xmm3, %ecx
+; CHECK-NEXT:    subl %eax, %ecx
+; CHECK-NEXT:    cmovbl %edx, %ecx
+; CHECK-NEXT:    movd %ecx, %xmm3
+; CHECK-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; CHECK-NEXT:    movd %xmm1, %eax
+; CHECK-NEXT:    movd %xmm0, %ecx
+; CHECK-NEXT:    subl %eax, %ecx
+; CHECK-NEXT:    cmovbl %edx, %ecx
+; CHECK-NEXT:    movd %ecx, %xmm2
+; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; CHECK-NEXT:    movd %xmm1, %eax
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; CHECK-NEXT:    movd %xmm0, %ecx
+; CHECK-NEXT:    subl %eax, %ecx
+; CHECK-NEXT:    cmovbl %edx, %ecx
+; CHECK-NEXT:    movd %ecx, %xmm0
+; CHECK-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; CHECK-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; CHECK-NEXT:    movdqa %xmm2, %xmm0
+; CHECK-NEXT:    retq
+;
+; CHECK32-LABEL: vec:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    pushl %ebx
+; CHECK32-NEXT:    .cfi_def_cfa_offset 8
+; CHECK32-NEXT:    pushl %edi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 12
+; CHECK32-NEXT:    pushl %esi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK32-NEXT:    .cfi_offset %esi, -16
+; CHECK32-NEXT:    .cfi_offset %edi, -12
+; CHECK32-NEXT:    .cfi_offset %ebx, -8
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; CHECK32-NEXT:    xorl %ebx, %ebx
+; CHECK32-NEXT:    subl {{[0-9]+}}(%esp), %edi
+; CHECK32-NEXT:    cmovbl %ebx, %edi
+; CHECK32-NEXT:    subl {{[0-9]+}}(%esp), %esi
+; CHECK32-NEXT:    cmovbl %ebx, %esi
+; CHECK32-NEXT:    subl {{[0-9]+}}(%esp), %edx
+; CHECK32-NEXT:    cmovbl %ebx, %edx
+; CHECK32-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; CHECK32-NEXT:    cmovbl %ebx, %ecx
+; CHECK32-NEXT:    movl %ecx, 12(%eax)
+; CHECK32-NEXT:    movl %edx, 8(%eax)
+; CHECK32-NEXT:    movl %esi, 4(%eax)
+; CHECK32-NEXT:    movl %edi, (%eax)
+; CHECK32-NEXT:    popl %esi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 12
+; CHECK32-NEXT:    popl %edi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 8
+; CHECK32-NEXT:    popl %ebx
+; CHECK32-NEXT:    .cfi_def_cfa_offset 4
+; CHECK32-NEXT:    retl $4
+  %tmp = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %x, <4 x i32> %y);
+  ret <4 x i32> %tmp;
+}
-- 
GitLab


From fef0a760bec84d85e2e79e7dacfc2f10c9d20994 Mon Sep 17 00:00:00 2001
From: Francis Visoiu Mistrih <francisvm@yahoo.com>
Date: Mon, 29 Oct 2018 16:57:43 +0000
Subject: [PATCH 0704/1116] [X86] Enable the MachineVerifier by default

The machine verifier was disabled for x86 by default. There are now only
9 tests failing, compared to what previously was between 20 and 30.

This is a good opportunity to file bugs for all the remaining issues,
then explicitly disable the failing tests and enabling the machine
verifier by default.

This allows us to avoid adding new tests that break the verifier.

PR27481

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345513 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86TargetMachine.h                       | 4 ----
 test/CodeGen/X86/avx512-regcall-NoMask.ll               | 7 ++++---
 test/CodeGen/X86/icall-branch-funnel.ll                 | 3 ++-
 test/CodeGen/X86/indirect-branch-tracking.ll            | 3 ++-
 test/CodeGen/X86/pr38795.ll                             | 3 ++-
 test/CodeGen/X86/scheduler-backtracking.ll              | 3 ++-
 test/CodeGen/X86/sjlj-eh.ll                             | 7 ++++---
 test/CodeGen/X86/speculative-load-hardening-indirect.ll | 5 +++--
 test/CodeGen/X86/win_coreclr_chkstk.ll                  | 3 ++-
 test/DebugInfo/X86/live-debug-vars-discard-invalid.mir  | 3 ++-
 10 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h
index 5b21cd82b5b..f5b45da0c3d 100644
--- a/lib/Target/X86/X86TargetMachine.h
+++ b/lib/Target/X86/X86TargetMachine.h
@@ -53,10 +53,6 @@ public:
   TargetLoweringObjectFile *getObjFileLowering() const override {
     return TLOF.get();
   }
-
-  bool isMachineVerifierClean() const override {
-    return false;
-  }
 };
 
 } // end namespace llvm
diff --git a/test/CodeGen/X86/avx512-regcall-NoMask.ll b/test/CodeGen/X86/avx512-regcall-NoMask.ll
index ea705d16c33..5ce1705e377 100644
--- a/test/CodeGen/X86/avx512-regcall-NoMask.ll
+++ b/test/CodeGen/X86/avx512-regcall-NoMask.ll
@@ -1,7 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i386-pc-win32       -mattr=+avx512f -mattr=+avx512vl -mattr=+avx512bw -mattr=+avx512dq  | FileCheck %s --check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-win32        -mattr=+avx512f -mattr=+avx512vl -mattr=+avx512bw -mattr=+avx512dq  | FileCheck %s --check-prefix=WIN64
-; RUN: llc < %s -mtriple=x86_64-linux-gnu    -mattr=+avx512f -mattr=+avx512vl -mattr=+avx512bw -mattr=+avx512dq  | FileCheck %s --check-prefix=LINUXOSX64
+; FIXME: Fix machine verifier issues and remove -verify-machineinstrs=0. PR39437.
+; RUN: llc < %s -mtriple=i386-pc-win32       -mattr=+avx512f -mattr=+avx512vl -mattr=+avx512bw -mattr=+avx512dq -verify-machineinstrs=0  | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-win32        -mattr=+avx512f -mattr=+avx512vl -mattr=+avx512bw -mattr=+avx512dq -verify-machineinstrs=0  | FileCheck %s --check-prefix=WIN64
+; RUN: llc < %s -mtriple=x86_64-linux-gnu    -mattr=+avx512f -mattr=+avx512vl -mattr=+avx512bw -mattr=+avx512dq -verify-machineinstrs=0  | FileCheck %s --check-prefix=LINUXOSX64
 
 ; Test regcall when receiving/returning i1
 define x86_regcallcc i1 @test_argReti1(i1 %a)  {
diff --git a/test/CodeGen/X86/icall-branch-funnel.ll b/test/CodeGen/X86/icall-branch-funnel.ll
index 010734cd856..6d7e0c3d2c4 100644
--- a/test/CodeGen/X86/icall-branch-funnel.ll
+++ b/test/CodeGen/X86/icall-branch-funnel.ll
@@ -1,4 +1,5 @@
-; RUN: llc -mtriple=x86_64-unknown-linux < %s | FileCheck %s
+; FIXME: Fix machine verifier issues and remove -verify-machineinstrs=0. PR39436.
+; RUN: llc -mtriple=x86_64-unknown-linux -verify-machineinstrs=0 < %s | FileCheck %s
 
 @g = external global i8
 
diff --git a/test/CodeGen/X86/indirect-branch-tracking.ll b/test/CodeGen/X86/indirect-branch-tracking.ll
index 99d80852602..dc738bb7b54 100644
--- a/test/CodeGen/X86/indirect-branch-tracking.ll
+++ b/test/CodeGen/X86/indirect-branch-tracking.ll
@@ -1,6 +1,7 @@
 ; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefix=ALL --check-prefix=X86_64
 ; RUN: llc -mtriple=i386-unknown-unknown < %s | FileCheck %s --check-prefix=ALL --check-prefix=X86
-; RUN: llc -mtriple i386-windows-gnu -exception-model sjlj < %s | FileCheck %s --check-prefix=SJLJ
+; FIXME: Fix machine verifier issues and remove -verify-machineinstrs=0. PR39439.
+; RUN: llc -mtriple i386-windows-gnu -exception-model sjlj -verify-machineinstrs=0 < %s | FileCheck %s --check-prefix=SJLJ
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; Test1
diff --git a/test/CodeGen/X86/pr38795.ll b/test/CodeGen/X86/pr38795.ll
index 5603f056c67..6cb2a0859e3 100644
--- a/test/CodeGen/X86/pr38795.ll
+++ b/test/CodeGen/X86/pr38795.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc %s -O2 -mtriple=i386-unknown-linux-gnu -o - | FileCheck %s
+; FIXME: Fix machine verifier issues and remove -verify-machineinstrs=0. PR39440.
+; RUN: llc %s -O2 -mtriple=i386-unknown-linux-gnu -o - -verify-machineinstrs=0 | FileCheck %s
 @.str = external dso_local unnamed_addr constant [6 x i8], align 1
 @a = external dso_local local_unnamed_addr global i32, align 4
 @h = external dso_local local_unnamed_addr global i32, align 4
diff --git a/test/CodeGen/X86/scheduler-backtracking.ll b/test/CodeGen/X86/scheduler-backtracking.ll
index 0cd35114937..e558fed7436 100644
--- a/test/CodeGen/X86/scheduler-backtracking.ll
+++ b/test/CodeGen/X86/scheduler-backtracking.ll
@@ -3,7 +3,8 @@
 ; RUN: llc -mtriple=x86_64-- < %s -pre-RA-sched=list-hybrid | FileCheck %s --check-prefix=HYBRID
 ; RUN: llc -mtriple=x86_64-- < %s -pre-RA-sched=list-burr   | FileCheck %s --check-prefix=BURR
 ; RUN: llc -mtriple=x86_64-- < %s -pre-RA-sched=source      | FileCheck %s --check-prefix=SRC
-; RUN: llc -mtriple=x86_64-- < %s -pre-RA-sched=linearize   | FileCheck %s --check-prefix=LIN
+; FIXME: Fix machine verifier issues and remove -verify-machineinstrs=0. PR39452.
+; RUN: llc -mtriple=x86_64-- < %s -pre-RA-sched=linearize -verify-machineinstrs=0 | FileCheck %s --check-prefix=LIN
 
 ; PR22304 https://llvm.org/bugs/show_bug.cgi?id=22304
 ; Tests checking backtracking in source scheduler. llc used to crash on them.
diff --git a/test/CodeGen/X86/sjlj-eh.ll b/test/CodeGen/X86/sjlj-eh.ll
index 9a40b5932d4..8020e26234d 100644
--- a/test/CodeGen/X86/sjlj-eh.ll
+++ b/test/CodeGen/X86/sjlj-eh.ll
@@ -1,6 +1,7 @@
-; RUN: llc -mtriple i386-windows-gnu -exception-model sjlj -filetype asm -o - %s | FileCheck %s
-; RUN: llc -mtriple x86_64-windows-gnu -exception-model sjlj -filetype asm -o - %s | FileCheck %s -check-prefix CHECK-X64
-; RUN: llc -mtriple x86_64-linux -exception-model sjlj -filetype asm -o - %s | FileCheck %s -check-prefix CHECK-X64-LINUX
+; FIXME: Fix machine verifier issues and remove -verify-machineinstrs=0. PR39439.
+; RUN: llc -mtriple i386-windows-gnu -exception-model sjlj -filetype asm -o - %s -verify-machineinstrs=0 | FileCheck %s
+; RUN: llc -mtriple x86_64-windows-gnu -exception-model sjlj -filetype asm -o - %s -verify-machineinstrs=0 | FileCheck %s -check-prefix CHECK-X64
+; RUN: llc -mtriple x86_64-linux -exception-model sjlj -filetype asm -o - %s -verify-machineinstrs=0 | FileCheck %s -check-prefix CHECK-X64-LINUX
 
 declare void @_Z20function_that_throwsv()
 declare i32 @__gxx_personality_sj0(...)
diff --git a/test/CodeGen/X86/speculative-load-hardening-indirect.ll b/test/CodeGen/X86/speculative-load-hardening-indirect.ll
index 8761fcff5d9..0d04a85d367 100644
--- a/test/CodeGen/X86/speculative-load-hardening-indirect.ll
+++ b/test/CodeGen/X86/speculative-load-hardening-indirect.ll
@@ -1,7 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -x86-speculative-load-hardening -data-sections | FileCheck %s --check-prefix=X64
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -x86-speculative-load-hardening -relocation-model pic -data-sections | FileCheck %s --check-prefix=X64-PIC
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -x86-speculative-load-hardening -data-sections -mattr=+retpoline | FileCheck %s --check-prefix=X64-RETPOLINE
+; FIXME: Fix machine verifier issues and remove -verify-machineinstrs=0. PR39451.
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -x86-speculative-load-hardening -relocation-model pic -data-sections -verify-machineinstrs=0 | FileCheck %s --check-prefix=X64-PIC
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -x86-speculative-load-hardening -data-sections -mattr=+retpoline -verify-machineinstrs=0 | FileCheck %s --check-prefix=X64-RETPOLINE
 ;
 ; FIXME: Add support for 32-bit.
 
diff --git a/test/CodeGen/X86/win_coreclr_chkstk.ll b/test/CodeGen/X86/win_coreclr_chkstk.ll
index 24f2b2be430..54789dc32d2 100644
--- a/test/CodeGen/X86/win_coreclr_chkstk.ll
+++ b/test/CodeGen/X86/win_coreclr_chkstk.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-pc-win32-coreclr | FileCheck %s -check-prefix=WIN_X64
+; FIXME: Fix machine verifier issues and remove -verify-machineinstrs=0. PR38376.
+; RUN: llc < %s -mtriple=x86_64-pc-win32-coreclr -verify-machineinstrs=0 | FileCheck %s -check-prefix=WIN_X64
 ; RUN: llc < %s -mtriple=x86_64-pc-linux         | FileCheck %s -check-prefix=LINUX
 
 ; By default, windows CoreCLR requires an inline prologue stack expansion check
diff --git a/test/DebugInfo/X86/live-debug-vars-discard-invalid.mir b/test/DebugInfo/X86/live-debug-vars-discard-invalid.mir
index c6d743171a5..f9a81cb156a 100644
--- a/test/DebugInfo/X86/live-debug-vars-discard-invalid.mir
+++ b/test/DebugInfo/X86/live-debug-vars-discard-invalid.mir
@@ -1,4 +1,5 @@
-# RUN: llc -mtriple=x86_64-linux-gnu -start-before greedy -stop-after virtregrewriter -o - %s | FileCheck %s
+# FIXME: Fix machine verifier issues and remove -verify-machineinstrs=0. PR39481.
+# RUN: llc -mtriple=x86_64-linux-gnu -start-before greedy -stop-after virtregrewriter -o - -verify-machineinstrs=0 %s | FileCheck %s
 
 --- |
   ; ModuleID = '<stdin>'
-- 
GitLab


From 2aa608814cd2abcf0af014067b96c3ede8003c09 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Date: Mon, 29 Oct 2018 17:26:01 +0000
Subject: [PATCH 0705/1116] [AMDGPU] Match v_swap_b32

Differential Revision: https://reviews.llvm.org/D52677

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345514 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/AMDGPUSubtarget.h        |   4 +
 lib/Target/AMDGPU/SIShrinkInstructions.cpp | 171 +++++++
 test/CodeGen/AMDGPU/v_swap_b32.mir         | 564 +++++++++++++++++++++
 3 files changed, 739 insertions(+)
 create mode 100644 test/CodeGen/AMDGPU/v_swap_b32.mir

diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index ca055f6c957..681ab3a2750 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -516,6 +516,10 @@ public:
     return FMA;
   }
 
+  bool hasSwap() const {
+    return GFX9Insts;
+  }
+
   TrapHandlerAbi getTrapHandlerAbi() const {
     return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone;
   }
diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index d37ad077dd6..6e58c138a76 100644
--- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -212,6 +212,169 @@ static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) {
   }
 }
 
+// This is the same as MachineInstr::readsRegister/modifiesRegister except
+// it takes subregs into account.
+static bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
+                          unsigned Reg, unsigned SubReg,
+                          const SIRegisterInfo &TRI) {
+  for (const MachineOperand &MO : R) {
+    if (!MO.isReg())
+      continue;
+
+    if (TargetRegisterInfo::isPhysicalRegister(Reg) &&
+        TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
+      if (TRI.regsOverlap(Reg, MO.getReg()))
+        return true;
+    } else if (MO.getReg() == Reg &&
+               TargetRegisterInfo::isVirtualRegister(Reg)) {
+      LaneBitmask Overlap = TRI.getSubRegIndexLaneMask(SubReg) &
+                            TRI.getSubRegIndexLaneMask(MO.getSubReg());
+      if (Overlap.any())
+        return true;
+    }
+  }
+  return false;
+}
+
+static bool instReadsReg(const MachineInstr *MI,
+                         unsigned Reg, unsigned SubReg,
+                         const SIRegisterInfo &TRI) {
+  return instAccessReg(MI->uses(), Reg, SubReg, TRI);
+}
+
+static bool instModifiesReg(const MachineInstr *MI,
+                            unsigned Reg, unsigned SubReg,
+                            const SIRegisterInfo &TRI) {
+  return instAccessReg(MI->defs(), Reg, SubReg, TRI);
+}
+
+static TargetInstrInfo::RegSubRegPair
+getSubRegForIndex(unsigned Reg, unsigned Sub, unsigned I,
+                  const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI) {
+  if (TRI.getRegSizeInBits(Reg, MRI) != 32) {
+    if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+      Reg = TRI.getSubReg(Reg, TRI.getSubRegFromChannel(I));
+    } else {
+      LaneBitmask LM = TRI.getSubRegIndexLaneMask(Sub);
+      Sub = TRI.getSubRegFromChannel(I + countTrailingZeros(LM.getAsInteger()));
+    }
+  }
+  return TargetInstrInfo::RegSubRegPair(Reg, Sub);
+}
+
+// Match:
+// mov t, x
+// mov x, y
+// mov y, t
+//
+// =>
+//
+// mov t, x (t is potentially dead and move eliminated)
+// v_swap_b32 x, y
+//
+// Returns next valid instruction pointer if was able to create v_swap_b32.
+//
+// This shall not be done too early not to prevent possible folding which may
+// remove matched moves, and this should prefereably be done before RA to
+// release saved registers and also possibly after RA which can insert copies
+// too.
+//
+// This is really just a generic peephole that is not a canocical shrinking,
+// although requirements match the pass placement and it reduces code size too.
+static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI,
+                               const SIInstrInfo *TII) {
+  assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
+         MovT.getOpcode() == AMDGPU::COPY);
+
+  unsigned T = MovT.getOperand(0).getReg();
+  unsigned Tsub = MovT.getOperand(0).getSubReg();
+  MachineOperand &Xop = MovT.getOperand(1);
+
+  if (!Xop.isReg())
+    return nullptr;
+  unsigned X = Xop.getReg();
+  unsigned Xsub = Xop.getSubReg();
+
+  unsigned Size = TII->getOpSize(MovT, 0) / 4;
+
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+  if (!TRI.isVGPR(MRI, X))
+    return false;
+
+  for (MachineOperand &YTop : MRI.use_nodbg_operands(T)) {
+    if (YTop.getSubReg() != Tsub)
+      continue;
+
+    MachineInstr &MovY = *YTop.getParent();
+    if ((MovY.getOpcode() != AMDGPU::V_MOV_B32_e32 &&
+         MovY.getOpcode() != AMDGPU::COPY) ||
+        MovY.getOperand(1).getSubReg() != Tsub)
+      continue;
+
+    unsigned Y = MovY.getOperand(0).getReg();
+    unsigned Ysub = MovY.getOperand(0).getSubReg();
+
+    if (!TRI.isVGPR(MRI, Y) || MovT.getParent() != MovY.getParent())
+      continue;
+
+    MachineInstr *MovX = nullptr;
+    auto I = std::next(MovT.getIterator()), E = MovT.getParent()->instr_end();
+    for (auto IY = MovY.getIterator(); I != E && I != IY; ++I) {
+      if (instReadsReg(&*I, X, Xsub, TRI) ||
+          instModifiesReg(&*I, Y, Ysub, TRI) ||
+          instModifiesReg(&*I, T, Tsub, TRI) ||
+          (MovX && instModifiesReg(&*I, X, Xsub, TRI))) {
+        MovX = nullptr;
+        break;
+      }
+      if (!instReadsReg(&*I, Y, Ysub, TRI)) {
+        if (!MovX && instModifiesReg(&*I, X, Xsub, TRI)) {
+          MovX = nullptr;
+          break;
+        }
+        continue;
+      }
+      if (MovX ||
+          (I->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
+           I->getOpcode() != AMDGPU::COPY) ||
+          I->getOperand(0).getReg() != X ||
+          I->getOperand(0).getSubReg() != Xsub) {
+        MovX = nullptr;
+        break;
+      }
+      MovX = &*I;
+    }
+
+    if (!MovX || I == E)
+      continue;
+
+    LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << MovY);
+
+    for (unsigned I = 0; I < Size; ++I) {
+      TargetInstrInfo::RegSubRegPair X1, Y1;
+      X1 = getSubRegForIndex(X, Xsub, I, TRI, MRI);
+      Y1 = getSubRegForIndex(Y, Ysub, I, TRI, MRI);
+      BuildMI(*MovT.getParent(), MovX->getIterator(), MovT.getDebugLoc(),
+                TII->get(AMDGPU::V_SWAP_B32))
+        .addDef(X1.Reg, 0, X1.SubReg)
+        .addDef(Y1.Reg, 0, Y1.SubReg)
+        .addReg(Y1.Reg, 0, Y1.SubReg)
+        .addReg(X1.Reg, 0, X1.SubReg).getInstr();
+    }
+    MovX->eraseFromParent();
+    MovY.eraseFromParent();
+    MachineInstr *Next = &*std::next(MovT.getIterator());
+    if (MRI.use_nodbg_empty(T))
+      MovT.eraseFromParent();
+    else
+      Xop.setIsKill(false);
+
+    return Next;
+  }
+
+  return nullptr;
+}
+
 bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
@@ -252,6 +415,14 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
         }
       }
 
+      if (ST.hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
+                           MI.getOpcode() == AMDGPU::COPY)) {
+        if (auto *NextMI = matchSwap(MI, MRI, TII)) {
+          Next = NextMI->getIterator();
+          continue;
+        }
+      }
+
       // Combine adjacent s_nops to use the immediate operand encoding how long
       // to wait.
       //
diff --git a/test/CodeGen/AMDGPU/v_swap_b32.mir b/test/CodeGen/AMDGPU/v_swap_b32.mir
new file mode 100644
index 00000000000..f0ce14bb9dd
--- /dev/null
+++ b/test/CodeGen/AMDGPU/v_swap_b32.mir
@@ -0,0 +1,564 @@
+# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass si-shrink-instructions -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+
+# GCN-LABEL: name: swap_phys_condensed
+# GCN: bb.0:
+# GCN-NEXT: $vgpr0, $vgpr1 = V_SWAP_B32 $vgpr1, $vgpr0, implicit $exec
+# GCN-NEXT: S_SETPC_B64_return
+---
+name:            swap_phys_condensed
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
+    $vgpr2 = V_MOV_B32_e32 killed $vgpr0, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 killed $vgpr2, implicit $exec
+    S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+...
+
+# GCN-LABEL: name: swap_phys_sparse
+# GCN: bb.0:
+# GCN-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr4, implicit $exec
+# GCN-NEXT: $vgpr0, $vgpr1 = V_SWAP_B32 $vgpr1, $vgpr0, implicit $exec
+# GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $vgpr6, implicit $exec
+# GCN-NEXT: S_SETPC_B64_return
+---
+name:            swap_phys_sparse
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
+    $vgpr2 = V_MOV_B32_e32 killed $vgpr0, implicit $exec
+    $vgpr3 = V_MOV_B32_e32 killed $vgpr4, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
+    $vgpr5 = V_MOV_B32_e32 killed $vgpr6, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 killed $vgpr2, implicit $exec
+    S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+...
+
+# GCN-LABEL: name: swap_phys_liveout
+# GCN: bb.0:
+# GCN-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+# GCN-NEXT: $vgpr0, $vgpr1 = V_SWAP_B32 $vgpr1, $vgpr0, implicit $exec
+# GCN-NEXT: S_SETPC_B64_return
+---
+name:            swap_phys_liveout
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
+    $vgpr2 = V_MOV_B32_e32 killed $vgpr0, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 killed $vgpr2, implicit $exec
+    S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr2, implicit $vgpr1
+...
+
+# GCN-LABEL: name: swap_phys_b64
+# GCN: bb.0:
+# GCN-NEXT: $vgpr0, $vgpr2 = V_SWAP_B32 $vgpr2, $vgpr0, implicit $exec
+# GCN-NEXT: $vgpr1, $vgpr3 = V_SWAP_B32 $vgpr3, $vgpr1, implicit $exec
+---
+name:            swap_phys_b64
+body:             |
+  bb.0:
+    $vgpr4_vgpr5 = COPY killed $vgpr0_vgpr1
+    $vgpr0_vgpr1 = COPY killed $vgpr2_vgpr3
+    $vgpr2_vgpr3 = COPY killed $vgpr4_vgpr5
+...
+
+# GCN-LABEL: name: swap_phys_overlap_x
+# GCN: bb.0:
+# GCN-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+# GCN-NEXT: $vgpr3_vgpr4 = V_ADD_F64 0, $vgpr0_vgpr1, 0, $vgpr3_vgpr4, 0, 0, implicit $exec
+# GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
+# GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr2, implicit $exec
+---
+name:            swap_phys_overlap_x
+body:             |
+  bb.0:
+    $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr3_vgpr4 = V_ADD_F64 0, $vgpr0_vgpr1, 0, $vgpr3_vgpr4, 0, 0, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 killed $vgpr2, implicit $exec
+...
+
+# GCN-LABEL: name: swap_phys_clobber_y
+# GCN: bb.0:
+# GCN-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+# GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
+# GCN-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+# GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr2, implicit $exec
+---
+name:            swap_phys_clobber_y
+body:             |
+  bb.0:
+    $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 killed $vgpr2, implicit $exec
+    S_ENDPGM
+...
+
+# GCN-LABEL: name: swap_virt_copy_condense
+# GCN: %0:vgpr_32, %1:vgpr_32 = V_SWAP_B32 %1, %0, implicit $exec
+---
+name:            swap_virt_copy_condense
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+body:             |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %2 = COPY %0
+    %0 = COPY %1
+    %1 = COPY %2
+...
+
+# GCN-LABEL: name: swap_virt_copy_sparse
+# GCN: %0:vgpr_32, %1:vgpr_32 = V_SWAP_B32 %1, %0, implicit $exec
+---
+name:            swap_virt_copy_sparse
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+body:             |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %2 = COPY %0
+    S_NOP 0
+    %0 = COPY %1
+    S_NOP 0
+    %1 = COPY %2
+...
+
+# GCN-LABEL: name: swap_virt_copy_subreg
+# GCN: %0.sub0:vreg_64, %1.sub0:vreg_64 = V_SWAP_B32 %1.sub0, %0.sub0, implicit $exec
+---
+name:            swap_virt_copy_subreg
+registers:
+  - { id: 0, class: vreg_64 }
+  - { id: 1, class: vreg_64 }
+  - { id: 2, class: vreg_64 }
+body:             |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %2.sub0 = COPY %0.sub0
+    %2.sub1 = COPY %0.sub1
+    %0.sub0 = COPY %1.sub0
+    %0.sub1 = COPY %1.sub1
+    %1.sub0 = COPY %2.sub0
+...
+
+# GCN-LABEL: name: swap_virt_mov
+# GCN: %0:vgpr_32, %1:vgpr_32 = V_SWAP_B32 %1, %0, implicit $exec
+---
+name:            swap_virt_mov
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+body:             |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %2 = V_MOV_B32_e32 %0, implicit $exec
+    %0 = V_MOV_B32_e32 %1, implicit $exec
+    %1 = V_MOV_B32_e32 %2, implicit $exec
+...
+
+# GCN-LABEL: name: swap_virt_read_x
+# GCN: bb.0:
+# GCN-NEXT: %0:vgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %1:vgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %2:vgpr_32 = COPY %0
+# GCN-NEXT: %3:vgpr_32 = COPY %0
+# GCN-NEXT: %0:vgpr_32 = COPY %1
+# GCN-NEXT: %1:vgpr_32 = COPY %2
+# GCN-NEXT: S_ENDPGM
+
+---
+name:            swap_virt_read_x
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+  - { id: 3, class: vgpr_32 }
+body:             |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %2 = COPY %0
+    %3 = COPY %0
+    %0 = COPY %1
+    %1 = COPY %2
+    S_ENDPGM
+...
+
+# GCN-LABEL: name: swap_virt_read_t_twice
+# GCN: bb.0:
+# GCN-NEXT: %0:vgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %1:vgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %2:vgpr_32 = COPY %0
+# GCN-NEXT: %3:vgpr_32 = COPY %2
+# GCN-NEXT: %0:vgpr_32, %1:vgpr_32 = V_SWAP_B32 %1, %0, implicit $exec
+# GCN-NEXT: S_ENDPGM
+
+---
+name:            swap_virt_read_t_twice
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+  - { id: 3, class: vgpr_32 }
+body:             |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %2 = COPY %0
+    %3 = COPY %2
+    %0 = COPY %1
+    %1 = COPY %2
+    S_ENDPGM
+...
+
+# GCN-LABEL: name: swap_virt_clobber_y
+# GCN: bb.0:
+# GCN-NEXT: %0:vgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %1:vgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %2:vgpr_32 = COPY %0
+# GCN-NEXT: %0:vgpr_32 = COPY %1
+# GCN-NEXT: %1:vgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %1:vgpr_32 = COPY %2
+# GCN-NEXT: S_ENDPGM
+
+---
+name:            swap_virt_clobber_y
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+body:             |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %2 = COPY %0
+    %0 = COPY %1
+    %1 = IMPLICIT_DEF
+    %1 = COPY %2
+    S_ENDPGM
+...
+
+# GCN-LABEL: name: swap_virt_clobber_x1
+# GCN: bb.0:
+# GCN-NEXT: %0:vgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %1:vgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %2:vgpr_32 = COPY %0
+# GCN-NEXT: %0:vgpr_32 = COPY %1
+# GCN-NEXT: %0:vgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %1:vgpr_32 = COPY %2
+# GCN-NEXT: S_ENDPGM
+
+---
+name:            swap_virt_clobber_x1
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+body:             |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %2 = COPY %0
+    %0 = COPY %1
+    %0 = IMPLICIT_DEF
+    %1 = COPY %2
+    S_ENDPGM
+...
+
+# GCN-LABEL: name: swap_virt_clobber_x2
+# GCN: bb.0:
+# GCN-NEXT: %0:vgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %1:vgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %2:vgpr_32 = COPY %0
+# GCN-NEXT: %0:vgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %0:vgpr_32 = COPY %1
+# GCN-NEXT: %1:vgpr_32 = COPY %2
+# GCN-NEXT: S_ENDPGM
+
+---
+name:            swap_virt_clobber_x2
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+body:             |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %2 = COPY %0
+    %0 = IMPLICIT_DEF
+    %0 = COPY %1
+    %1 = COPY %2
+    S_ENDPGM
+...
+
+# GCN-LABEL: name: swap_virt_clobber_t
+# GCN: bb.0:
+# GCN-NEXT: %0:vgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %1:vgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %2:vgpr_32 = COPY %0
+# GCN-NEXT: %0:vgpr_32 = COPY %1
+# GCN-NEXT: %2:vgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %1:vgpr_32 = COPY %2
+# GCN-NEXT: S_ENDPGM
+
+---
+name:            swap_virt_clobber_t
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+body:             |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %2 = COPY %0
+    %0 = COPY %1
+    %2 = IMPLICIT_DEF
+    %1 = COPY %2
+    S_ENDPGM
+...
+
+# GCN-LABEL: name: swap_virt_copy_subreg_overlap_x_full
+# GCN: bb.0:
+# GCN-NEXT: %0:vreg_64 = IMPLICIT_DEF
+# GCN-NEXT: %1:vreg_64 = IMPLICIT_DEF
+# GCN-NEXT: %2.sub0:vreg_64 = COPY %0.sub0
+# GCN-NEXT: %3:vreg_64 = COPY %0
+# GCN-NEXT: %0.sub0:vreg_64 = COPY %1.sub0
+# GCN-NEXT: %1.sub0:vreg_64 = COPY %2.sub0
+---
+name:            swap_virt_copy_subreg_overlap_x_full
+registers:
+  - { id: 0, class: vreg_64 }
+  - { id: 1, class: vreg_64 }
+  - { id: 2, class: vreg_64 }
+  - { id: 3, class: vreg_64 }
+body:             |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %2.sub0 = COPY %0.sub0
+    %3 = COPY %0
+    %0.sub0 = COPY %1.sub0
+    %1.sub0 = COPY %2.sub0
+...
+
+# GCN-LABEL: name: swap_virt_copy_subreg_overlap_x_part
+# GCN: bb.0:
+# GCN-NEXT: %0:vreg_128 = IMPLICIT_DEF
+# GCN-NEXT: %1:vreg_64 = IMPLICIT_DEF
+# GCN-NEXT: %2.sub0:vreg_64 = COPY %0.sub0
+# GCN-NEXT: %3:vreg_64 = COPY %0.sub0_sub1
+# GCN-NEXT: %0.sub0:vreg_128 = COPY %1.sub0
+# GCN-NEXT: %1.sub0:vreg_64 = COPY %2.sub0
+---
+name:            swap_virt_copy_subreg_overlap_x_part
+registers:
+  - { id: 0, class: vreg_128 }
+  - { id: 1, class: vreg_64 }
+  - { id: 2, class: vreg_64 }
+  - { id: 3, class: vreg_64 }
+body:             |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %2.sub0 = COPY %0.sub0
+    %3 = COPY %0.sub0_sub1
+    %0.sub0 = COPY %1.sub0
+    %1.sub0 = COPY %2.sub0
+...
+
+# GCN-LABEL: name: swap_virt_copy_subreg_wide_y
+# GCN: bb.0:
+# GCN-NEXT: %0:vreg_64 = IMPLICIT_DEF
+# GCN-NEXT: %1:vreg_64 = IMPLICIT_DEF
+# GCN-NEXT: %2.sub0:vreg_64 = COPY %0.sub0
+# GCN-NEXT: %0.sub0:vreg_64 = COPY %1.sub0
+# GCN-NEXT: %1:vreg_64 = COPY %2
+---
+name:            swap_virt_copy_subreg_wide_y
+registers:
+  - { id: 0, class: vreg_64 }
+  - { id: 1, class: vreg_64 }
+  - { id: 2, class: vreg_64 }
+body:             |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %2.sub0 = COPY %0.sub0
+    %0.sub0 = COPY %1.sub0
+    %1 = COPY %2
+...
+
+# GCN-LABEL: name: swap_virt_b64
+# GCN: bb.0:
+# GCN-NEXT: %0:vreg_64 = IMPLICIT_DEF
+# GCN-NEXT: %1:vreg_64 = IMPLICIT_DEF
+# GCN-NEXT: %0.sub0:vreg_64, %1.sub0:vreg_64 = V_SWAP_B32 %1.sub0, %0.sub0, implicit $exec
+# GCN-NEXT: %0.sub1:vreg_64, %1.sub1:vreg_64 = V_SWAP_B32 %1.sub1, %0.sub1, implicit $exec
+---
+name:            swap_virt_b64
+registers:
+  - { id: 0, class: vreg_64 }
+  - { id: 1, class: vreg_64 }
+  - { id: 2, class: vreg_64 }
+body:             |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %2 = COPY %0
+    %0 = COPY %1
+    %1 = COPY %2
+...
+
+# GCN-LABEL: name: swap_virt_b128
+# GCN: bb.0:
+# GCN-NEXT: %0:vreg_128 = IMPLICIT_DEF
+# GCN-NEXT: %1:vreg_128 = IMPLICIT_DEF
+# GCN-NEXT: %0.sub0:vreg_128, %1.sub0:vreg_128 = V_SWAP_B32 %1.sub0, %0.sub0, implicit $exec
+# GCN-NEXT: %0.sub1:vreg_128, %1.sub1:vreg_128 = V_SWAP_B32 %1.sub1, %0.sub1, implicit $exec
+# GCN-NEXT: %0.sub2:vreg_128, %1.sub2:vreg_128 = V_SWAP_B32 %1.sub2, %0.sub2, implicit $exec
+# GCN-NEXT: %0.sub3:vreg_128, %1.sub3:vreg_128 = V_SWAP_B32 %1.sub3, %0.sub3, implicit $exec
+---
+name:            swap_virt_b128
+registers:
+  - { id: 0, class: vreg_128 }
+  - { id: 1, class: vreg_128 }
+  - { id: 2, class: vreg_128 }
+body:             |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %2 = COPY %0
+    %0 = COPY %1
+    %1 = COPY %2
+...
+
+# GCN-LABEL: name: swap_virt_b128_sub0_1
+# GCN: bb.0:
+# GCN-NEXT: %0:vreg_128 = IMPLICIT_DEF
+# GCN-NEXT: %1:vreg_128 = IMPLICIT_DEF
+# GCN-NEXT: %0.sub0:vreg_128, %1.sub0:vreg_128 = V_SWAP_B32 %1.sub0, %0.sub0, implicit $exec
+# GCN-NEXT: %0.sub1:vreg_128, %1.sub1:vreg_128 = V_SWAP_B32 %1.sub1, %0.sub1, implicit $exec
+# GCN-NEXT: S_ENDPGM
+---
+name:            swap_virt_b128_sub0_1
+registers:
+  - { id: 0, class: vreg_128 }
+  - { id: 1, class: vreg_128 }
+  - { id: 2, class: vreg_128 }
+body:             |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %2.sub0_sub1 = COPY %0.sub0_sub1
+    %0.sub0_sub1 = COPY %1.sub0_sub1
+    %1.sub0_sub1 = COPY %2.sub0_sub1
+    S_ENDPGM
+...
+
+# GCN-LABEL: name: swap_virt_b128_sub2_3
+# GCN: bb.0:
+# GCN-NEXT: %0:vreg_128 = IMPLICIT_DEF
+# GCN-NEXT: %1:vreg_128 = IMPLICIT_DEF
+# GCN-NEXT: %0.sub2:vreg_128, %1.sub2:vreg_128 = V_SWAP_B32 %1.sub2, %0.sub2, implicit $exec
+# GCN-NEXT: %0.sub3:vreg_128, %1.sub3:vreg_128 = V_SWAP_B32 %1.sub3, %0.sub3, implicit $exec
+# GCN-NEXT: S_ENDPGM
+---
+name:            swap_virt_b128_sub2_3
+registers:
+  - { id: 0, class: vreg_128 }
+  - { id: 1, class: vreg_128 }
+  - { id: 2, class: vreg_128 }
+body:             |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %2.sub2_sub3 = COPY %0.sub2_sub3
+    %0.sub2_sub3 = COPY %1.sub2_sub3
+    %1.sub2_sub3 = COPY %2.sub2_sub3
+    S_ENDPGM
+...
+
+
+# GCN-LABEL: name: swap_virt_s_to_s
+# GCN: bb.0:
+# GCN-NEXT: %0:sgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %1:sgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %2:sgpr_32 = COPY %0
+# GCN-NEXT: %0:sgpr_32 = COPY %1
+# GCN-NEXT: %1:sgpr_32 = COPY %2
+---
+name:            swap_virt_s_to_s
+registers:
+  - { id: 0, class: sgpr_32 }
+  - { id: 1, class: sgpr_32 }
+  - { id: 2, class: sgpr_32 }
+body:             |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %2 = COPY %0
+    %0 = COPY %1
+    %1 = COPY %2
+...
+
+# GCN-LABEL: name: swap_virt_copy_subreg_impdef_super
+# GCN: %0.sub0:vreg_64, %1.sub0:vreg_64 = V_SWAP_B32 %1.sub0, %0.sub0, implicit $exec
+---
+name:            swap_virt_copy_subreg_impdef_super
+registers:
+  - { id: 0, class: vreg_64 }
+  - { id: 1, class: vreg_64 }
+  - { id: 2, class: vreg_64 }
+body:             |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %2.sub0 = COPY %0.sub0, implicit-def %2, implicit $exec
+    %2.sub1 = COPY %0.sub1
+    %0.sub0 = COPY %1.sub0
+    %0.sub1 = COPY %1.sub1
+    %1.sub0 = COPY %2.sub0
+...
+
+# GCN-LABEL: name: swap_virt_copy_subreg_impuse_x
+# GCN: bb.0:
+# GCN-NEXT: %0:vreg_64 = IMPLICIT_DEF
+# GCN-NEXT: %1:vreg_64 = IMPLICIT_DEF
+# GCN-NEXT: %2.sub0:vreg_64 = COPY %0.sub0
+# GCN-NEXT: %2.sub1:vreg_64 = COPY %0.sub1
+# GCN-NEXT: %0.sub0:vreg_64 = COPY %1.sub0, implicit %0
+# GCN-NEXT: %0.sub1:vreg_64 = COPY %1.sub1
+# GCN-NEXT: %1.sub0:vreg_64 = COPY %2.sub0
+# GCN-NEXT: S_ENDPGM
+---
+name:            swap_virt_copy_subreg_impuse_x
+registers:
+  - { id: 0, class: vreg_64 }
+  - { id: 1, class: vreg_64 }
+  - { id: 2, class: vreg_64 }
+body:             |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %2.sub0 = COPY %0.sub0
+    %2.sub1 = COPY %0.sub1
+    %0.sub0 = COPY %1.sub0, implicit %0
+    %0.sub1 = COPY %1.sub1
+    %1.sub0 = COPY %2.sub0
+    S_ENDPGM
+...
-- 
GitLab


From e6817098ad0ac21f0a0275f58dde593ac3c5c406 Mon Sep 17 00:00:00 2001
From: Bryan Chan <bryan.chan@huawei.com>
Date: Mon, 29 Oct 2018 17:27:34 +0000
Subject: [PATCH 0706/1116] [AArch64] Rename FP16FML instruction format (NFC)

Rename SIMDThreeSameMult (etc.) to SIMDThreeSameVectorFML (etc.) to follow
usual naming convention, and add some comments in the .td files.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345515 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64InstrFormats.td | 124 ++++++++++++----------
 lib/Target/AArch64/AArch64InstrInfo.td    |  26 ++---
 2 files changed, 78 insertions(+), 72 deletions(-)

diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td
index d1e189362f0..aef0a7af500 100644
--- a/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/lib/Target/AArch64/AArch64InstrFormats.td
@@ -4941,46 +4941,6 @@ class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<3> size, bits<5> opcode,
   let Inst{4-0}   = Rd;
 }
 
-// ARMv8.2 Fused Multiply Add Long Instructions (Vector)
-class BaseSIMDThreeSameMult<bit Q, bit U, bit b13, bits<3> size, string asm, string kind1,
-                                 string kind2, RegisterOperand RegType,
-                                 ValueType AccumType, ValueType InputType,
-                                 SDPatternOperator OpNode> :
-        BaseSIMDThreeSameVectorTied<Q, U, size, 0b11101, RegType, asm, kind1,
-		[(set (AccumType RegType:$dst),
-              (OpNode (AccumType RegType:$Rd),
-                      (InputType RegType:$Rn),
-                      (InputType RegType:$Rm)))]> {
-  let AsmString = !strconcat(asm, "{\t$Rd" # kind1 # ", $Rn" # kind2 # ", $Rm" # kind2 # "}");
-  let Inst{13} = b13;
-}
-
-multiclass SIMDThreeSameMult<bit U, bit b13, bits<3> size, string asm, SDPatternOperator OpNode> {
-  def v4f16 : BaseSIMDThreeSameMult<0, U, b13, size, asm, ".2s", ".2h", V64,
-                                         v2f32, v4f16, OpNode>;
-  def v8f16 : BaseSIMDThreeSameMult<1, U, b13, size, asm, ".4s", ".4h", V128,
-                                         v4f32, v8f16, OpNode>;
-}
-
-class BaseSIMDThreeSameVectorDot<bit Q, bit U, string asm, string kind1,
-                                 string kind2, RegisterOperand RegType,
-                                 ValueType AccumType, ValueType InputType,
-                                 SDPatternOperator OpNode> :
-        BaseSIMDThreeSameVectorTied<Q, U, 0b100, 0b10010, RegType, asm, kind1,
-        [(set (AccumType RegType:$dst),
-              (OpNode (AccumType RegType:$Rd),
-                      (InputType RegType:$Rn),
-                      (InputType RegType:$Rm)))]> {
-  let AsmString = !strconcat(asm, "{\t$Rd" # kind1 # ", $Rn" # kind2 # ", $Rm" # kind2 # "}");
-}
-
-multiclass SIMDThreeSameVectorDot<bit U, string asm, SDPatternOperator OpNode> {
-  def v8i8  : BaseSIMDThreeSameVectorDot<0, U, asm, ".2s", ".8b", V64,
-                                         v2i32, v8i8, OpNode>;
-  def v16i8 : BaseSIMDThreeSameVectorDot<1, U, asm, ".4s", ".16b", V128,
-                                         v4i32, v16i8, OpNode>;
-}
-
 // All operand sizes distinguished in the encoding.
 multiclass SIMDThreeSameVector<bit U, bits<5> opc, string asm,
                                SDPatternOperator OpNode> {
@@ -5221,6 +5181,51 @@ multiclass SIMDLogicalThreeVectorTied<bit U, bits<2> size,
         V128:$LHS, V128:$MHS, V128:$RHS)>;
 }
 
+// ARMv8.2-A Dot Product Instructions (Vector): These instructions extract
+// bytes from S-sized elements.
+class BaseSIMDThreeSameVectorDot<bit Q, bit U, string asm, string kind1,
+                                 string kind2, RegisterOperand RegType,
+                                 ValueType AccumType, ValueType InputType,
+                                 SDPatternOperator OpNode> :
+        BaseSIMDThreeSameVectorTied<Q, U, 0b100, 0b10010, RegType, asm, kind1,
+        [(set (AccumType RegType:$dst),
+              (OpNode (AccumType RegType:$Rd),
+                      (InputType RegType:$Rn),
+                      (InputType RegType:$Rm)))]> {
+  let AsmString = !strconcat(asm, "{\t$Rd" # kind1 # ", $Rn" # kind2 # ", $Rm" # kind2 # "}");
+}
+
+multiclass SIMDThreeSameVectorDot<bit U, string asm, SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDThreeSameVectorDot<0, U, asm, ".2s", ".8b", V64,
+                                         v2i32, v8i8, OpNode>;
+  def v16i8 : BaseSIMDThreeSameVectorDot<1, U, asm, ".4s", ".16b", V128,
+                                         v4i32, v16i8, OpNode>;
+}
+
+// ARMv8.2-A Fused Multiply Add-Long Instructions (Vector): These instructions
+// select inputs from 4H vectors and accumulate outputs to a 2S vector (or from
+// 8H to 4S, when Q=1).
+class BaseSIMDThreeSameVectorFML<bit Q, bit U, bit b13, bits<3> size, string asm, string kind1,
+                                 string kind2, RegisterOperand RegType,
+                                 ValueType AccumType, ValueType InputType,
+                                 SDPatternOperator OpNode> :
+        BaseSIMDThreeSameVectorTied<Q, U, size, 0b11101, RegType, asm, kind1,
+		[(set (AccumType RegType:$dst),
+              (OpNode (AccumType RegType:$Rd),
+                      (InputType RegType:$Rn),
+                      (InputType RegType:$Rm)))]> {
+  let AsmString = !strconcat(asm, "{\t$Rd" # kind1 # ", $Rn" # kind2 # ", $Rm" # kind2 # "}");
+  let Inst{13} = b13;
+}
+
+multiclass SIMDThreeSameVectorFML<bit U, bit b13, bits<3> size, string asm,
+                                  SDPatternOperator OpNode> {
+  def v4f16 : BaseSIMDThreeSameVectorFML<0, U, b13, size, asm, ".2s", ".2h", V64,
+                                         v2f32, v4f16, OpNode>;
+  def v8f16 : BaseSIMDThreeSameVectorFML<1, U, b13, size, asm, ".4s", ".4h", V128,
+                                         v4f32, v8f16, OpNode>;
+}
+
 
 //----------------------------------------------------------------------------
 // AdvSIMD two register vector instructions.
@@ -7427,7 +7432,7 @@ class BaseSIMDIndexedTied<bit Q, bit U, bit Scalar, bits<2> size, bits<4> opc,
   let Inst{4-0}   = Rd;
 }
 
-// ARMv8.2 Index Dot product instructions
+// ARMv8.2-A Dot Product Instructions (Indexed)
 class BaseSIMDThreeSameVectorDotIndex<bit Q, bit U, string asm, string dst_kind,
                                       string lhs_kind, string rhs_kind,
                                       RegisterOperand RegType,
@@ -7446,12 +7451,20 @@ class BaseSIMDThreeSameVectorDotIndex<bit Q, bit U, string asm, string dst_kind,
   let Inst{11}    = idx{1};  // H
 }
 
-// ARMv8.2 Fused Multiply Add Long Instructions (Indexed)
-class BaseSIMDThreeSameMultIndex<bit Q, bit U, bits<4> opc, string asm,
-                                 string dst_kind, string lhs_kind,
-                                 string rhs_kind, RegisterOperand RegType,
-                                 ValueType AccumType, ValueType InputType,
-                                 SDPatternOperator OpNode> :
+multiclass SIMDThreeSameVectorDotIndex<bit U, string asm,
+                                       SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDThreeSameVectorDotIndex<0, U, asm, ".2s", ".8b", ".4b",
+                                              V64, v2i32, v8i8, OpNode>;
+  def v16i8 : BaseSIMDThreeSameVectorDotIndex<1, U, asm, ".4s", ".16b", ".4b",
+                                              V128, v4i32, v16i8, OpNode>;
+}
+
+// ARMv8.2-A Fused Multiply Add-Long Instructions (Indexed)
+class BaseSIMDThreeSameVectorFMLIndex<bit Q, bit U, bits<4> opc, string asm,
+                                      string dst_kind, string lhs_kind,
+                                      string rhs_kind, RegisterOperand RegType,
+                                      ValueType AccumType, ValueType InputType,
+                                      SDPatternOperator OpNode> :
         BaseSIMDIndexedTied<Q, U, 0, 0b10, opc, RegType, RegType, V128,
                             VectorIndexH, asm, "", dst_kind, lhs_kind, rhs_kind,
           [(set (AccumType RegType:$dst),
@@ -7466,19 +7479,12 @@ class BaseSIMDThreeSameMultIndex<bit Q, bit U, bits<4> opc, string asm,
   let Inst{20} = idx{0}; // M
 }
 
-multiclass SIMDThreeSameVectorDotIndex<bit U, string asm,
+multiclass SIMDThreeSameVectorFMLIndex<bit U, bits<4> opc, string asm,
                                        SDPatternOperator OpNode> {
-  def v8i8  : BaseSIMDThreeSameVectorDotIndex<0, U, asm, ".2s", ".8b", ".4b", V64,
-                                              v2i32, v8i8, OpNode>;
-  def v16i8 : BaseSIMDThreeSameVectorDotIndex<1, U, asm, ".4s", ".16b", ".4b", V128,
-                                              v4i32, v16i8, OpNode>;
-}
-
-multiclass SIMDThreeSameMultIndex<bit U, bits<4> opc, string asm, SDPatternOperator OpNode> {
-  def v4f16 : BaseSIMDThreeSameMultIndex<0, U, opc, asm, ".2s", ".2h", ".h", V64,
-                                         v2f32, v4f16, OpNode>;
-  def v8f16 : BaseSIMDThreeSameMultIndex<1, U, opc, asm, ".4s", ".4h", ".h", V128,
-                                         v4f32, v8f16, OpNode>;
+  def v4f16 : BaseSIMDThreeSameVectorFMLIndex<0, U, opc, asm, ".2s", ".2h", ".h",
+                                              V64, v2f32, v4f16, OpNode>;
+  def v8f16 : BaseSIMDThreeSameVectorFMLIndex<1, U, opc, asm, ".4s", ".4h", ".h",
+                                              V128, v4f32, v8f16, OpNode>;
 }
 
 multiclass SIMDFPIndexed<bit U, bits<4> opc, string asm,
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index 77461eccf3e..2dc5991d708 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -514,7 +514,7 @@ def TSB   : CRmSystemI<barrier_op, 0b010, "tsb", []> {
 }
 }
 
-// ARMv8.2 Dot Product
+// ARMv8.2-A Dot Product
 let Predicates = [HasDotProd] in {
 defm SDOT : SIMDThreeSameVectorDot<0, "sdot", int_aarch64_neon_sdot>;
 defm UDOT : SIMDThreeSameVectorDot<1, "udot", int_aarch64_neon_udot>;
@@ -522,6 +522,18 @@ defm SDOTlane : SIMDThreeSameVectorDotIndex<0, "sdot", int_aarch64_neon_sdot>;
 defm UDOTlane : SIMDThreeSameVectorDotIndex<1, "udot", int_aarch64_neon_udot>;
 }
 
+// ARMv8.2-A FP16 Fused Multiply-Add Long
+let Predicates = [HasNEON, HasFP16FML] in {
+defm FMLAL      : SIMDThreeSameVectorFML<0, 1, 0b001, "fmlal", int_aarch64_neon_fmlal>;
+defm FMLSL      : SIMDThreeSameVectorFML<0, 1, 0b101, "fmlsl", int_aarch64_neon_fmlsl>;
+defm FMLAL2     : SIMDThreeSameVectorFML<1, 0, 0b001, "fmlal2", int_aarch64_neon_fmlal2>;
+defm FMLSL2     : SIMDThreeSameVectorFML<1, 0, 0b101, "fmlsl2", int_aarch64_neon_fmlsl2>;
+defm FMLALlane  : SIMDThreeSameVectorFMLIndex<0, 0b0000, "fmlal", int_aarch64_neon_fmlal>;
+defm FMLSLlane  : SIMDThreeSameVectorFMLIndex<0, 0b0100, "fmlsl", int_aarch64_neon_fmlsl>;
+defm FMLAL2lane : SIMDThreeSameVectorFMLIndex<1, 0b1000, "fmlal2", int_aarch64_neon_fmlal2>;
+defm FMLSL2lane : SIMDThreeSameVectorFMLIndex<1, 0b1100, "fmlsl2", int_aarch64_neon_fmlsl2>;
+}
+
 // Armv8.2-A Crypto extensions
 let Predicates = [HasSHA3] in {
 def SHA512H   : CryptoRRRTied<0b0, 0b00, "sha512h">;
@@ -3484,18 +3496,6 @@ defm SQRDMLAH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10000,"sqrdmlah",
 defm SQRDMLSH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10001,"sqrdmlsh",
                                                     int_aarch64_neon_sqsub>;
 
-// FP16FML
-let Predicates = [HasNEON, HasFP16FML] in {
-defm FMLAL      : SIMDThreeSameMult<0, 1, 0b001, "fmlal", int_aarch64_neon_fmlal>;
-defm FMLSL      : SIMDThreeSameMult<0, 1, 0b101, "fmlsl", int_aarch64_neon_fmlsl>;
-defm FMLAL2     : SIMDThreeSameMult<1, 0, 0b001, "fmlal2", int_aarch64_neon_fmlal2>;
-defm FMLSL2     : SIMDThreeSameMult<1, 0, 0b101, "fmlsl2", int_aarch64_neon_fmlsl2>;
-defm FMLALlane  : SIMDThreeSameMultIndex<0, 0b0000, "fmlal", int_aarch64_neon_fmlal>;
-defm FMLSLlane  : SIMDThreeSameMultIndex<0, 0b0100, "fmlsl", int_aarch64_neon_fmlsl>;
-defm FMLAL2lane : SIMDThreeSameMultIndex<1, 0b1000, "fmlal2", int_aarch64_neon_fmlal2>;
-defm FMLSL2lane : SIMDThreeSameMultIndex<1, 0b1100, "fmlsl2", int_aarch64_neon_fmlsl2>;
-}
-
 defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>;
 defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic",
                                   BinOpFrag<(and node:$LHS, (vnot node:$RHS))> >;
-- 
GitLab


From 06c7d86e4988b7fe75fc9141b9edd31744dee9fe Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Date: Mon, 29 Oct 2018 17:53:23 +0000
Subject: [PATCH 0707/1116] [AMDGPU] Fixed return value causing warning and
 regression

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345518 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/SIShrinkInstructions.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 6e58c138a76..015773b1104 100644
--- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -299,7 +299,7 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI,
 
   const SIRegisterInfo &TRI = TII->getRegisterInfo();
   if (!TRI.isVGPR(MRI, X))
-    return false;
+    return nullptr;
 
   for (MachineOperand &YTop : MRI.use_nodbg_operands(T)) {
     if (YTop.getSubReg() != Tsub)
-- 
GitLab


From 69e1e93c55c2678b1044c1703cf6500f1ca11e1d Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Mon, 29 Oct 2018 18:17:01 +0000
Subject: [PATCH 0708/1116] [X86] Add AES to KNL CPUs to match clang.

I believe this was lost from KNL when AES was pushed from Westmere to Skylake recently. KNL used to inherit from IVB.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345519 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86.td | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 3034b6618df..74135656528 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -826,6 +826,7 @@ def KNLFeatures : ProcessorFeatures<[], [
   FeatureLAHFSAHF,
   FeatureSlow3OpsLEA,
   FeatureSlowIncDec,
+  FeatureAES,
   FeatureRDRAND,
   FeatureF16C,
   FeatureFSGSBase,
-- 
GitLab


From 125dd26c10d017f29cf8558bbe9372f743783c31 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 29 Oct 2018 18:25:48 +0000
Subject: [PATCH 0709/1116] [X86][SSE] getFauxShuffleMask - Fix shuffle mask
 adjustment for multiple inserted subvectors

Part of the issue discovered in PR39483, although its not fully exposed until I reapply rL345395 (by reverting rL345451)

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345520 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp            |  7 +-
 .../X86/vector-shuffle-combining-avx.ll       | 97 +++++++++++++++++++
 2 files changed, 100 insertions(+), 4 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index f2c5040b89e..35239b79f18 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -6379,13 +6379,12 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
       Mask.push_back(i);
     for (int i = 0; i != (int)NumSubElts; ++i) {
       int M = SubMask[i];
-      if (M < 0) {
-        Mask[i + InsertIdx] = M;
-      } else {
+      if (0 <= M) {
         int InputIdx = M / NumSubElts;
         int ExtractIdx = SubInputs[InputIdx].getConstantOperandVal(1);
-        Mask[i + InsertIdx] = (NumElts * (1 + InputIdx)) + ExtractIdx + M;
+        M = (NumElts * (1 + InputIdx)) + ExtractIdx + (M % NumSubElts);
       }
+      Mask[i + InsertIdx] = M;
     }
     // TODO - Add support for more than 1 subinput.
     return Ops.size() <= 2;
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/test/CodeGen/X86/vector-shuffle-combining-avx.ll
index ace577ce9a3..678feb8b330 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx.ll
@@ -435,3 +435,100 @@ define <8 x float> @constant_fold_vpermilvar_ps_256() {
   %1 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, <8 x i32> <i32 4, i32 0, i32 2, i32 1, i32 0, i32 1, i32 1, i32 1>)
   ret <8 x float> %1
 }
+
+define void @PR39483() {
+; X32-AVX1-LABEL: PR39483:
+; X32-AVX1:       # %bb.0: # %entry
+; X32-AVX1-NEXT:    vmovups 32, %ymm0
+; X32-AVX1-NEXT:    vmovups 64, %ymm1
+; X32-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X32-AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,3]
+; X32-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; X32-AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; X32-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; X32-AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; X32-AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; X32-AVX1-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
+; X32-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X32-AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; X32-AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X32-AVX1-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; X32-AVX1-NEXT:    vaddps %ymm1, %ymm0, %ymm0
+; X32-AVX1-NEXT:    vmovups %ymm0, (%eax)
+;
+; X32-AVX2-LABEL: PR39483:
+; X32-AVX2:       # %bb.0: # %entry
+; X32-AVX2-NEXT:    vmovups 32, %ymm0
+; X32-AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; X32-AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u>
+; X32-AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; X32-AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = mem[0,1,0,3,4,5,4,7]
+; X32-AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
+; X32-AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; X32-AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X32-AVX2-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; X32-AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
+; X32-AVX2-NEXT:    vmovups %ymm0, (%eax)
+;
+; X32-AVX512-LABEL: PR39483:
+; X32-AVX512:       # %bb.0: # %entry
+; X32-AVX512-NEXT:    vmovups 0, %zmm0
+; X32-AVX512-NEXT:    vmovups 64, %ymm1
+; X32-AVX512-NEXT:    vmovaps {{.*#+}} zmm2 = <2,5,8,11,14,17,20,23,u,u,u,u,u,u,u,u>
+; X32-AVX512-NEXT:    vpermi2ps %zmm1, %zmm0, %zmm2
+; X32-AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X32-AVX512-NEXT:    vmulps %ymm0, %ymm2, %ymm1
+; X32-AVX512-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; X32-AVX512-NEXT:    vmovups %ymm0, (%eax)
+;
+; X64-AVX1-LABEL: PR39483:
+; X64-AVX1:       # %bb.0: # %entry
+; X64-AVX1-NEXT:    vmovups 32, %ymm0
+; X64-AVX1-NEXT:    vmovups 64, %ymm1
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,3]
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; X64-AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; X64-AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; X64-AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; X64-AVX1-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; X64-AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X64-AVX1-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vaddps %ymm1, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vmovups %ymm0, (%rax)
+;
+; X64-AVX2-LABEL: PR39483:
+; X64-AVX2:       # %bb.0: # %entry
+; X64-AVX2-NEXT:    vmovups 32, %ymm0
+; X64-AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; X64-AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u>
+; X64-AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = mem[0,1,0,3,4,5,4,7]
+; X64-AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
+; X64-AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; X64-AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X64-AVX2-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vmovups %ymm0, (%rax)
+;
+; X64-AVX512-LABEL: PR39483:
+; X64-AVX512:       # %bb.0: # %entry
+; X64-AVX512-NEXT:    vmovups 0, %zmm0
+; X64-AVX512-NEXT:    vmovups 64, %ymm1
+; X64-AVX512-NEXT:    vmovaps {{.*#+}} zmm2 = <2,5,8,11,14,17,20,23,u,u,u,u,u,u,u,u>
+; X64-AVX512-NEXT:    vpermi2ps %zmm1, %zmm0, %zmm2
+; X64-AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vmulps %ymm0, %ymm2, %ymm1
+; X64-AVX512-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; X64-AVX512-NEXT:    vmovups %ymm0, (%rax)
+entry:
+  %wide.vec = load <24 x float>, <24 x float>* null, align 4
+  %strided.vec18 = shufflevector <24 x float> %wide.vec, <24 x float> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+  %0 = fmul <8 x float> %strided.vec18, zeroinitializer
+  %1 = fadd <8 x float> zeroinitializer, %0
+  store <8 x float> %1, <8 x float>* undef, align 16
+  unreachable
+}
-- 
GitLab


From 7b852b7b76c9dcb1c1893b048779d535ede0bb9c Mon Sep 17 00:00:00 2001
From: Thomas Lively <tlively@google.com>
Date: Mon, 29 Oct 2018 18:38:12 +0000
Subject: [PATCH 0710/1116] [WebAssembly] Lower away condition truncations for
 scalar selects

Reviewers: aheejin, dschuff

Subscribers: sbc100, jgravelle-google, sunfish, llvm-commits

Differential Revision: https://reviews.llvm.org/D53676

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345521 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../WebAssembly/WebAssemblyInstrFloat.td      |  7 ++++
 .../WebAssembly/WebAssemblyInstrInteger.td    |  7 ++++
 test/CodeGen/WebAssembly/select.ll            | 42 ++++++++++++++++++-
 3 files changed, 55 insertions(+), 1 deletion(-)

diff --git a/lib/Target/WebAssembly/WebAssemblyInstrFloat.td b/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
index 3c02b0f01ea..c5290f00b43 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
@@ -122,3 +122,10 @@ def : Pat<(select (i32 (seteq I32:$cond, 0)), F32:$lhs, F32:$rhs),
           (SELECT_F32 F32:$rhs, F32:$lhs, I32:$cond)>;
 def : Pat<(select (i32 (seteq I32:$cond, 0)), F64:$lhs, F64:$rhs),
           (SELECT_F64 F64:$rhs, F64:$lhs, I32:$cond)>;
+
+// The legalizer inserts an unnecessary `and 1` to make input conform
+// to getBooleanContents, which we can lower away.
+def : Pat<(select (i32 (and I32:$cond, 1)), F32:$lhs, F32:$rhs),
+          (SELECT_F32 F32:$lhs, F32:$rhs, I32:$cond)>;
+def : Pat<(select (i32 (and I32:$cond, 1)), F64:$lhs, F64:$rhs),
+          (SELECT_F64 F64:$lhs, F64:$rhs, I32:$cond)>;
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInteger.td b/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
index bd41f46214a..d5b63d64369 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
@@ -122,3 +122,10 @@ def : Pat<(select (i32 (seteq I32:$cond, 0)), I32:$lhs, I32:$rhs),
           (SELECT_I32 I32:$rhs, I32:$lhs, I32:$cond)>;
 def : Pat<(select (i32 (seteq I32:$cond, 0)), I64:$lhs, I64:$rhs),
           (SELECT_I64 I64:$rhs, I64:$lhs, I32:$cond)>;
+
+// The legalizer inserts an unnecessary `and 1` to make input conform
+// to getBooleanContents, which we can lower away.
+def : Pat<(select (i32 (and I32:$cond, 1)), I32:$lhs, I32:$rhs),
+          (SELECT_I32 I32:$lhs, I32:$rhs, I32:$cond)>;
+def : Pat<(select (i32 (and I32:$cond, 1)), I64:$lhs, I64:$rhs),
+          (SELECT_I64 I64:$lhs, I64:$rhs, I32:$cond)>;
diff --git a/test/CodeGen/WebAssembly/select.ll b/test/CodeGen/WebAssembly/select.ll
index 6f6e95f8418..99b8d45d8e2 100644
--- a/test/CodeGen/WebAssembly/select.ll
+++ b/test/CodeGen/WebAssembly/select.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers | FileCheck %s --check-prefixes CHECK,SLOW
 ; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -fast-isel -fast-isel-abort=1 | FileCheck %s
 
 ; Test that wasm select instruction is selected from LLVM select instruction.
@@ -16,6 +16,16 @@ define i32 @select_i32_bool(i1 zeroext %a, i32 %b, i32 %c) {
   ret i32 %cond
 }
 
+; CHECK-LABEL: select_i32_bool_nozext:
+; CHECK-NEXT: .param     i32, i32, i32{{$}}
+; CHECK-NEXT: .result    i32{{$}}
+; SLOW-NEXT: i32.select $push0=, $1, $2, $0{{$}}
+; SLOW-NEXT: return     $pop0{{$}}
+define i32 @select_i32_bool_nozext(i1 %a, i32 %b, i32 %c) {
+  %cond = select i1 %a, i32 %b, i32 %c
+  ret i32 %cond
+}
+
 ; CHECK-LABEL: select_i32_eq:
 ; CHECK-NEXT: .param     i32, i32, i32{{$}}
 ; CHECK-NEXT: .result    i32{{$}}
@@ -48,6 +58,16 @@ define i64 @select_i64_bool(i1 zeroext %a, i64 %b, i64 %c) {
   ret i64 %cond
 }
 
+; CHECK-LABEL: select_i64_bool_nozext:
+; CHECK-NEXT: .param     i32, i64, i64{{$}}
+; CHECK-NEXT: .result    i64{{$}}
+; SLOW-NEXT: i64.select $push0=, $1, $2, $0{{$}}
+; SLOW-NEXT: return     $pop0{{$}}
+define i64 @select_i64_bool_nozext(i1 %a, i64 %b, i64 %c) {
+  %cond = select i1 %a, i64 %b, i64 %c
+  ret i64 %cond
+}
+
 ; CHECK-LABEL: select_i64_eq:
 ; CHECK-NEXT: .param     i32, i64, i64{{$}}
 ; CHECK-NEXT: .result    i64{{$}}
@@ -80,6 +100,16 @@ define float @select_f32_bool(i1 zeroext %a, float %b, float %c) {
   ret float %cond
 }
 
+; CHECK-LABEL: select_f32_bool_nozext:
+; CHECK-NEXT: .param     i32, f32, f32{{$}}
+; CHECK-NEXT: .result    f32{{$}}
+; SLOW-NEXT: f32.select $push0=, $1, $2, $0{{$}}
+; SLOW-NEXT: return     $pop0{{$}}
+define float @select_f32_bool_nozext(i1 %a, float %b, float %c) {
+  %cond = select i1 %a, float %b, float %c
+  ret float %cond
+}
+
 ; CHECK-LABEL: select_f32_eq:
 ; CHECK-NEXT: .param     i32, f32, f32{{$}}
 ; CHECK-NEXT: .result    f32{{$}}
@@ -112,6 +142,16 @@ define double @select_f64_bool(i1 zeroext %a, double %b, double %c) {
   ret double %cond
 }
 
+; CHECK-LABEL: select_f64_bool_nozext:
+; CHECK-NEXT: .param     i32, f64, f64{{$}}
+; CHECK-NEXT: .result    f64{{$}}
+; SLOW-NEXT: f64.select $push0=, $1, $2, $0{{$}}
+; SLOW-NEXT: return     $pop0{{$}}
+define double @select_f64_bool_nozext(i1 %a, double %b, double %c) {
+  %cond = select i1 %a, double %b, double %c
+  ret double %cond
+}
+
 ; CHECK-LABEL: select_f64_eq:
 ; CHECK-NEXT: .param     i32, f64, f64{{$}}
 ; CHECK-NEXT: .result    f64{{$}}
-- 
GitLab


From c65373c0983e32bdc14fb3bb3ec8703f3332e451 Mon Sep 17 00:00:00 2001
From: Vedant Kumar <vsk@apple.com>
Date: Mon, 29 Oct 2018 19:15:39 +0000
Subject: [PATCH 0711/1116] [HotColdSplitting] Allow outlining single-block
 cold regions

It can be profitable to outline single-block cold regions because they
may be large.

Allow outlining single-block regions if they have over some threshold of
non-debug, non-terminator instructions. I chose 3 as the threshold after
experimenting with several internal frameworks.

In practice, reducing the threshold further did not give much
improvement, whereas increasing it resulted in substantial regressions.

Differential Revision: https://reviews.llvm.org/D53824

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345524 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/IPO/HotColdSplitting.cpp       | 23 ++++++-
 test/Transforms/HotColdSplit/do-not-split.ll  | 64 ++++++++++++++++---
 test/Transforms/HotColdSplit/minsize.ll       | 23 ++-----
 .../HotColdSplit/split-out-dbg-val-of-arg.ll  | 34 +++-------
 4 files changed, 90 insertions(+), 54 deletions(-)

diff --git a/lib/Transforms/IPO/HotColdSplitting.cpp b/lib/Transforms/IPO/HotColdSplitting.cpp
index 4f371a494e9..ce8a5060a3a 100644
--- a/lib/Transforms/IPO/HotColdSplitting.cpp
+++ b/lib/Transforms/IPO/HotColdSplitting.cpp
@@ -31,6 +31,7 @@
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/PassManager.h"
@@ -65,6 +66,10 @@ using namespace llvm;
 static cl::opt<bool> EnableStaticAnalyis("hot-cold-static-analysis",
                               cl::init(true), cl::Hidden);
 
+static cl::opt<unsigned> MinOutliningInstCount(
+    "min-outlining-inst-count", cl::init(3), cl::Hidden,
+    cl::desc("Minimum number of instructions needed for a single-block region "
+             "to be an outlining candidate"));
 
 namespace {
 
@@ -130,6 +135,19 @@ static bool mayExtractBlock(const BasicBlock &BB) {
   return !BB.hasAddressTaken();
 }
 
+/// Check whether \p BB has at least \p Min non-debug, non-terminator
+/// instructions.
+static bool hasMinimumInstCount(const BasicBlock &BB, unsigned Min) {
+  unsigned Count = 0;
+  for (const Instruction &I : BB) {
+    if (isa<DbgInfoIntrinsic>(&I) || &I == BB.getTerminator())
+      continue;
+    if (++Count >= Min)
+      return true;
+  }
+  return false;
+}
+
 /// Identify the maximal region of cold blocks which includes \p SinkBB.
 ///
 /// Include all blocks post-dominated by \p SinkBB, \p SinkBB itself, and all
@@ -223,9 +241,8 @@ findMaximalColdRegion(BasicBlock &SinkBB, DominatorTree &DT, PostDomTree &PDT) {
     ++SuccIt;
   }
 
-  // TODO: Consider outlining regions with just 1 block, but more than some
-  // threshold of instructions.
-  if (ColdRegion.size() == 1)
+  if (ColdRegion.size() == 1 &&
+      !hasMinimumInstCount(*ColdRegion[0], MinOutliningInstCount))
     return {};
 
   return ColdRegion;
diff --git a/test/Transforms/HotColdSplit/do-not-split.ll b/test/Transforms/HotColdSplit/do-not-split.ll
index 1f626581919..213681383ea 100644
--- a/test/Transforms/HotColdSplit/do-not-split.ll
+++ b/test/Transforms/HotColdSplit/do-not-split.ll
@@ -6,7 +6,7 @@
 
 ; The cold region is too small to split.
 ; CHECK-LABEL: @foo
-; CHECK-NOT: codeRepl
+; CHECK-NOT: foo.cold.1
 define void @foo() {
 entry:
   br i1 undef, label %if.then, label %if.end
@@ -15,21 +15,28 @@ if.then:                                          ; preds = %entry
   unreachable
 
 if.end:                                           ; preds = %entry
-  br label %if.then12
+  ret void
+}
 
-if.then12:                                        ; preds = %if.end
-  br label %cleanup40
+; The cold region is still too small to split.
+; CHECK-LABEL: @bar
+; CHECK-NOT: bar.cold.1
+define void @bar() {
+entry:
+  br i1 undef, label %if.then, label %if.end
 
-cleanup40:                                        ; preds = %if.then12
-  br label %return
+if.then:                                          ; preds = %entry
+  call void @sink()
+  call void @sink()
+  ret void
 
-return:                                           ; preds = %cleanup40
+if.end:                                           ; preds = %entry
   ret void
 }
 
 ; Make sure we don't try to outline the entire function.
 ; CHECK-LABEL: @fun
-; CHECK-NOT: codeRepl
+; CHECK-NOT: fun.cold.1
 define void @fun() {
 entry:
   br i1 undef, label %if.then, label %if.end
@@ -43,14 +50,53 @@ if.end:                                           ; preds = %entry
 
 ; Don't outline infinite loops.
 ; CHECK-LABEL: @infinite_loop
-; CHECK-NOT: codeRepl
+; CHECK-NOT: infinite_loop.cold.1
 define void @infinite_loop() {
 entry:
   br label %loop
 
 loop:
+  call void @sink()
+  call void @sink()
   call void @sink()
   br label %loop
 }
 
+; Don't count debug intrinsics towards the outlining threshold.
+; CHECK-LABEL: @dont_count_debug_intrinsics
+; CHECK-NOT: dont_count_debug_intrinsics.cold.1
+define void @dont_count_debug_intrinsics(i32 %arg1) !dbg !6 {
+entry:
+  %var = add i32 0, 0, !dbg !11
+  br i1 undef, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  ret void
+
+if.end:                                           ; preds = %entry
+  call void @llvm.dbg.value(metadata i32 %arg1, metadata !9, metadata !DIExpression()), !dbg !11
+  call void @llvm.dbg.value(metadata i32 %arg1, metadata !9, metadata !DIExpression()), !dbg !11
+  call void @sink()
+  ret void
+}
+
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+
 declare void @sink() cold
+
+!llvm.dbg.cu = !{!0}
+!llvm.debugify = !{!3, !4}
+!llvm.module.flags = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "<stdin>", directory: "/")
+!2 = !{}
+!3 = !{i32 7}
+!4 = !{i32 1}
+!5 = !{i32 2, !"Debug Info Version", i32 3}
+!6 = distinct !DISubprogram(name: "dont_count_debug_intrinsics", linkageName: "dont_count_debug_intrinsics", scope: null, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, isOptimized: true, unit: !0, retainedNodes: !8)
+!7 = !DISubroutineType(types: !2)
+!8 = !{!9}
+!9 = !DILocalVariable(name: "1", scope: !6, file: !1, line: 1, type: !10)
+!10 = !DIBasicType(name: "ty32", size: 32, encoding: DW_ATE_unsigned)
+!11 = !DILocation(line: 1, column: 1, scope: !6)
diff --git a/test/Transforms/HotColdSplit/minsize.ll b/test/Transforms/HotColdSplit/minsize.ll
index 4865fb6d024..eb42ad14af2 100644
--- a/test/Transforms/HotColdSplit/minsize.ll
+++ b/test/Transforms/HotColdSplit/minsize.ll
@@ -1,8 +1,7 @@
 ; RUN: opt -hotcoldsplit -S < %s | FileCheck %s
 
 ; CHECK-LABEL: @fun
-; CHECK: codeRepl:
-; CHECK-NEXT: call void @fun.cold.1
+; CHECK: call void @fun.cold.1
 
 define void @fun() {
 entry:
@@ -12,21 +11,13 @@ if.then:
   ret void
 
 if.else:
-  br label %if.then4
-
-if.then4:
-  br i1 undef, label %if.then5, label %if.end
-
-if.then5:
-  br label %cleanup
-
-if.end:
-  br label %cleanup
-
-cleanup:
-  %cleanup.dest.slot.0 = phi i32 [ 1, %if.then5 ], [ 0, %if.end ]
-  unreachable
+  call void @sink()
+  call void @sink()
+  call void @sink()
+  ret void
 }
 
+declare void @sink() cold
+
 ; CHECK: define {{.*}} @fun.cold.1{{.*}}#[[outlined_func_attr:[0-9]+]]
 ; CHECK: attributes #[[outlined_func_attr]] = { {{.*}}minsize
diff --git a/test/Transforms/HotColdSplit/split-out-dbg-val-of-arg.ll b/test/Transforms/HotColdSplit/split-out-dbg-val-of-arg.ll
index 459ee6712bc..b77201fe0d3 100644
--- a/test/Transforms/HotColdSplit/split-out-dbg-val-of-arg.ll
+++ b/test/Transforms/HotColdSplit/split-out-dbg-val-of-arg.ll
@@ -6,33 +6,23 @@
 define void @foo(i32 %arg1) !dbg !6 {
 entry:
   %var = add i32 0, 0, !dbg !11
-  br i1 undef, label %if.then, label %if.end, !dbg !12
+  br i1 undef, label %if.then, label %if.end
 
 if.then:                                          ; preds = %entry
-  ret void, !dbg !13
+  ret void
 
 if.end:                                           ; preds = %entry
   call void @llvm.dbg.value(metadata i32 %arg1, metadata !9, metadata !DIExpression()), !dbg !11
-  br label %if.then12, !dbg !14
-
-if.then12:                                        ; preds = %if.end
-  br label %cleanup40, !dbg !15
-
-cleanup40:                                        ; preds = %if.then12
-  br i1 undef, label %if.then5, label %if.end1, !dbg !16
-
-if.then5:
-  br label %return, !dbg !17
-
-if.end1:
-  br label %return, !dbg !18
-
-return:                                           ; preds = %cleanup40
-  unreachable, !dbg !19
+  call void @sink()
+  call void @sink()
+  call void @sink()
+  ret void
 }
 
 declare void @llvm.dbg.value(metadata, metadata, metadata)
 
+declare void @sink() cold
+
 !llvm.dbg.cu = !{!0}
 !llvm.debugify = !{!3, !4}
 !llvm.module.flags = !{!5}
@@ -49,11 +39,3 @@ declare void @llvm.dbg.value(metadata, metadata, metadata)
 !9 = !DILocalVariable(name: "1", scope: !6, file: !1, line: 1, type: !10)
 !10 = !DIBasicType(name: "ty32", size: 32, encoding: DW_ATE_unsigned)
 !11 = !DILocation(line: 1, column: 1, scope: !6)
-!12 = !DILocation(line: 2, column: 1, scope: !6)
-!13 = !DILocation(line: 3, column: 1, scope: !6)
-!14 = !DILocation(line: 4, column: 1, scope: !6)
-!15 = !DILocation(line: 5, column: 1, scope: !6)
-!16 = !DILocation(line: 6, column: 1, scope: !6)
-!17 = !DILocation(line: 7, column: 1, scope: !6)
-!18 = !DILocation(line: 8, column: 1, scope: !6)
-!19 = !DILocation(line: 9, column: 1, scope: !6)
-- 
GitLab


From fe506c93e9a77773b46c3b9e213577f0ea812555 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 29 Oct 2018 19:51:52 +0000
Subject: [PATCH 0712/1116] [X86] Set isMachineVerifierClean() back to false
 (PR27481)

Put back the isMachineVerifierClean() override removed at rL345513 to fix Windows ThinLTO tests


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345528 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86TargetMachine.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h
index f5b45da0c3d..5b21cd82b5b 100644
--- a/lib/Target/X86/X86TargetMachine.h
+++ b/lib/Target/X86/X86TargetMachine.h
@@ -53,6 +53,10 @@ public:
   TargetLoweringObjectFile *getObjFileLowering() const override {
     return TLOF.get();
   }
+
+  bool isMachineVerifierClean() const override {
+    return false;
+  }
 };
 
 } // end namespace llvm
-- 
GitLab


From 8f7dc5cb4ecf5ee1ecd4a71fd4482031f38f2665 Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze@braunis.de>
Date: Mon, 29 Oct 2018 20:10:42 +0000
Subject: [PATCH 0713/1116] Relax fast register allocator related test cases;
 NFC

- Relex hard coded registers and stack frame sizes
- Some test cleanups
- Change phi-dbg.ll to match on mir output after phi elimination instead
  of going through the whole codegen pipeline.

This is in preparation for https://reviews.llvm.org/D52010
I'm committing all the test changes upfront that work before and after
independently.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345532 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/AArch64/arm64-abi.ll             |   2 +-
 test/CodeGen/AArch64/arm64-fast-isel-icmp.ll  |  94 +++++++++-------
 .../AArch64/arm64-fast-isel-intrinsic.ll      | 102 +++++++++---------
 test/CodeGen/AArch64/arm64-fast-isel-ret.ll   |   7 +-
 test/CodeGen/AArch64/arm64-fast-isel.ll       |  10 +-
 .../AArch64/fast-isel-address-extends.ll      |   6 +-
 test/CodeGen/AArch64/fast-isel-atomic.ll      |  32 +++---
 .../AArch64/fast-isel-branch-cond-mask.ll     |   4 +-
 test/CodeGen/AArch64/phi-dbg.ll               |  39 ++++---
 test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll |   2 +-
 test/CodeGen/AMDGPU/spill-wide-sgpr.ll        |  12 +--
 test/CodeGen/ARM/fast-isel-align.ll           |   8 +-
 test/CodeGen/ARM/fast-isel-ldrh-strh-arm.ll   |  18 ++--
 test/CodeGen/ARM/fast-isel.ll                 |  20 ++--
 test/CodeGen/ARM/fp16-instructions.ll         |   2 +-
 test/CodeGen/Mips/Fast-ISel/bricmpi1.ll       |  16 +--
 test/CodeGen/Mips/Fast-ISel/callabi.ll        |   4 +-
 test/CodeGen/Mips/Fast-ISel/fastalloca.ll     |   4 +-
 test/CodeGen/Mips/Fast-ISel/logopm.ll         |   4 +-
 test/CodeGen/PowerPC/pr26180.ll               |   8 +-
 test/CodeGen/PowerPC/tls.ll                   |  10 +-
 test/CodeGen/PowerPC/vsx-spill.ll             |   8 +-
 test/CodeGen/SPARC/LeonCASAInstructionUT.ll   |   4 +-
 .../regalloc-fast-invalid-kill-flag.mir       |  14 +--
 test/CodeGen/SystemZ/rosbg-02.ll              |   2 +-
 test/CodeGen/X86/epilogue-cfi-no-fp.ll        |  26 ++---
 test/CodeGen/X86/fast-isel-extract.ll         |   6 +-
 test/CodeGen/X86/fast-isel-gep.ll             |   6 +-
 test/CodeGen/X86/fast-isel-x86-64.ll          |  10 +-
 test/CodeGen/X86/fold-sext-trunc.ll           |   2 +-
 test/CodeGen/X86/pr28489.ll                   |   4 +-
 test/CodeGen/X86/pr30813.ll                   |   6 +-
 test/CodeGen/X86/shift-i256.ll                |   2 +-
 test/CodeGen/X86/switch.ll                    |  10 +-
 test/CodeGen/X86/win32_sret.ll                |  18 ++--
 test/CodeGen/XCore/dwarf_debug.ll             |   4 +-
 test/DebugInfo/X86/parameters.ll              |   3 +-
 test/DebugInfo/X86/pieces-1.ll                |   3 +-
 test/DebugInfo/X86/sdag-split-arg.ll          |  10 +-
 test/DebugInfo/X86/vla.ll                     |   4 +-
 40 files changed, 283 insertions(+), 263 deletions(-)

diff --git a/test/CodeGen/AArch64/arm64-abi.ll b/test/CodeGen/AArch64/arm64-abi.ll
index bfc03c6b975..af99734e6a6 100644
--- a/test/CodeGen/AArch64/arm64-abi.ll
+++ b/test/CodeGen/AArch64/arm64-abi.ll
@@ -128,7 +128,7 @@ entry:
 ; CHECK-LABEL: test3
 ; CHECK: str [[REG_1:d[0-9]+]], [sp, #8]
 ; FAST-LABEL: test3
-; FAST: sub sp, sp, #48
+; FAST: sub sp, sp, #{{[0-9]+}}
 ; FAST: mov x[[ADDR:[0-9]+]], sp
 ; FAST: str [[REG_1:d[0-9]+]], [x[[ADDR]], #8]
   %0 = load <2 x i32>, <2 x i32>* %in, align 8
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-icmp.ll b/test/CodeGen/AArch64/arm64-fast-isel-icmp.ll
index 4288aa1df44..dc64123b33c 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-icmp.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-icmp.ll
@@ -4,7 +4,8 @@ define i32 @icmp_eq_imm(i32 %a) nounwind ssp {
 entry:
 ; CHECK-LABEL: icmp_eq_imm
 ; CHECK:       cmp w0, #31
-; CHECK-NEXT:  cset w0, eq
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], eq
+; CHECK-NEXT:  and w0, [[REG]], #0x1
   %cmp = icmp eq i32 %a, 31
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -14,7 +15,8 @@ define i32 @icmp_eq_neg_imm(i32 %a) nounwind ssp {
 entry:
 ; CHECK-LABEL: icmp_eq_neg_imm
 ; CHECK:       cmn w0, #7
-; CHECK-NEXT:  cset w0, eq
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], eq
+; CHECK-NEXT:  and w0, [[REG]], #0x1
   %cmp = icmp eq i32 %a, -7
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -24,7 +26,8 @@ define i32 @icmp_eq_i32(i32 %a, i32 %b) nounwind ssp {
 entry:
 ; CHECK-LABEL: icmp_eq_i32
 ; CHECK:       cmp w0, w1
-; CHECK-NEXT:  cset w0, eq
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], eq
+; CHECK-NEXT:  and w0, [[REG]], #0x1
   %cmp = icmp eq i32 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -34,7 +37,8 @@ define i32 @icmp_ne(i32 %a, i32 %b) nounwind ssp {
 entry:
 ; CHECK-LABEL: icmp_ne
 ; CHECK:       cmp w0, w1
-; CHECK-NEXT:  cset w0, ne
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], ne
+; CHECK-NEXT:  and w0, [[REG]], #0x1
   %cmp = icmp ne i32 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -64,7 +68,8 @@ define i32 @icmp_ugt(i32 %a, i32 %b) nounwind ssp {
 entry:
 ; CHECK-LABEL: icmp_ugt
 ; CHECK:       cmp w0, w1
-; CHECK-NEXT:  cset w0, hi
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], hi
+; CHECK-NEXT:  and w0, [[REG]], #0x1
   %cmp = icmp ugt i32 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -74,7 +79,8 @@ define i32 @icmp_uge(i32 %a, i32 %b) nounwind ssp {
 entry:
 ; CHECK-LABEL: icmp_uge
 ; CHECK:       cmp w0, w1
-; CHECK-NEXT:  cset w0, hs
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], hs
+; CHECK-NEXT:  and w0, [[REG]], #0x1
   %cmp = icmp uge i32 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -84,7 +90,8 @@ define i32 @icmp_ult(i32 %a, i32 %b) nounwind ssp {
 entry:
 ; CHECK-LABEL: icmp_ult
 ; CHECK:       cmp w0, w1
-; CHECK-NEXT:  cset w0, lo
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], lo
+; CHECK-NEXT:  and w0, [[REG]], #0x1
   %cmp = icmp ult i32 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -94,7 +101,8 @@ define i32 @icmp_ule(i32 %a, i32 %b) nounwind ssp {
 entry:
 ; CHECK-LABEL: icmp_ule
 ; CHECK:       cmp w0, w1
-; CHECK-NEXT:  cset w0, ls
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], ls
+; CHECK-NEXT:  and w0, [[REG]], #0x1
   %cmp = icmp ule i32 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -104,7 +112,8 @@ define i32 @icmp_sgt(i32 %a, i32 %b) nounwind ssp {
 entry:
 ; CHECK-LABEL: icmp_sgt
 ; CHECK:       cmp w0, w1
-; CHECK-NEXT:  cset w0, gt
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], gt
+; CHECK-NEXT:  and w0, [[REG]], #0x1
   %cmp = icmp sgt i32 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -114,7 +123,8 @@ define i32 @icmp_sge(i32 %a, i32 %b) nounwind ssp {
 entry:
 ; CHECK-LABEL: icmp_sge
 ; CHECK:       cmp w0, w1
-; CHECK-NEXT:  cset w0, ge
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], ge
+; CHECK-NEXT:  and w0, [[REG]], #0x1
   %cmp = icmp sge i32 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -124,7 +134,8 @@ define i32 @icmp_slt(i32 %a, i32 %b) nounwind ssp {
 entry:
 ; CHECK-LABEL: icmp_slt
 ; CHECK:       cmp w0, w1
-; CHECK-NEXT:  cset w0, lt
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], lt
+; CHECK-NEXT:  and w0, [[REG]], #0x1
   %cmp = icmp slt i32 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -134,7 +145,8 @@ define i32 @icmp_sle(i32 %a, i32 %b) nounwind ssp {
 entry:
 ; CHECK-LABEL: icmp_sle
 ; CHECK:       cmp w0, w1
-; CHECK-NEXT:  cset w0, le
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], le
+; CHECK-NEXT:  and w0, [[REG]], #0x1
   %cmp = icmp sle i32 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -144,7 +156,8 @@ define i32 @icmp_i64(i64 %a, i64 %b) nounwind ssp {
 entry:
 ; CHECK-LABEL: icmp_i64
 ; CHECK:       cmp  x0, x1
-; CHECK-NEXT:  cset w{{[0-9]+}}, le
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], le
+; CHECK-NEXT:  and w0, [[REG]], #0x1
   %cmp = icmp sle i64 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -153,9 +166,10 @@ entry:
 define zeroext i1 @icmp_eq_i16(i16 %a, i16 %b) nounwind ssp {
 entry:
 ; CHECK-LABEL: icmp_eq_i16
-; CHECK:       sxth w0, w0
-; CHECK:       cmp w0, w1, sxth
-; CHECK-NEXT:  cset w0, eq
+; CHECK:       sxth [[REG0:w[0-9]+]], w0
+; CHECK:       cmp [[REG0]], w1, sxth
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], eq
+; CHECK-NEXT:  and w0, [[REG]], #0x1
   %cmp = icmp eq i16 %a, %b
   ret i1 %cmp
 }
@@ -163,9 +177,10 @@ entry:
 define zeroext i1 @icmp_eq_i8(i8 %a, i8 %b) nounwind ssp {
 entry:
 ; CHECK-LABEL: icmp_eq_i8
-; CHECK:       sxtb w0, w0
-; CHECK-NEXT:  cmp w0, w1, sxtb
-; CHECK-NEXT:  cset w0, eq
+; CHECK:       sxtb [[REG0:w[0-9]+]], w0
+; CHECK-NEXT:  cmp [[REG0]], w1, sxtb
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], eq
+; CHECK-NEXT:  and w0, [[REG]], #0x1
   %cmp = icmp eq i8 %a, %b
   ret i1 %cmp
 }
@@ -173,9 +188,10 @@ entry:
 define i32 @icmp_i16_unsigned(i16 %a, i16 %b) nounwind {
 entry:
 ; CHECK-LABEL: icmp_i16_unsigned
-; CHECK:       uxth w0, w0
-; CHECK-NEXT:  cmp w0, w1, uxth
-; CHECK-NEXT:  cset w0, lo
+; CHECK:       uxth [[REG0:w[0-9]+]], w0
+; CHECK-NEXT:  cmp [[REG0]], w1, uxth
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], lo
+; CHECK-NEXT:  and w0, [[REG]], #0x1
   %cmp = icmp ult i16 %a, %b
   %conv2 = zext i1 %cmp to i32
   ret i32 %conv2
@@ -184,9 +200,10 @@ entry:
 define i32 @icmp_i8_signed(i8 %a, i8 %b) nounwind {
 entry:
 ; CHECK-LABEL: icmp_i8_signed
-; CHECK:       sxtb w0, w0
-; CHECK-NEXT:  cmp w0, w1, sxtb
-; CHECK-NEXT:  cset w0, gt
+; CHECK:       sxtb [[REG0:w[0-9]+]], w0
+; CHECK-NEXT:  cmp [[REG0]], w1, sxtb
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], gt
+; CHECK-NEXT:  and w0, [[REG]], #0x1
   %cmp = icmp sgt i8 %a, %b
   %conv2 = zext i1 %cmp to i32
   ret i32 %conv2
@@ -198,7 +215,8 @@ entry:
 ; CHECK:       sbfx [[REG1:w[0-9]+]], w0, #0, #1
 ; CHECK-NEXT:  sbfx [[REG2:w[0-9]+]], w1, #0, #1
 ; CHECK-NEXT:  cmp  [[REG1]], [[REG2]]
-; CHECK-NEXT:  cset w0, gt
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], gt
+; CHECK-NEXT:  and w0, [[REG]], #0x1
   %cmp = icmp sgt i1 %a, %b
   %conv2 = zext i1 %cmp to i32
   ret i32 %conv2
@@ -207,10 +225,10 @@ entry:
 define i32 @icmp_i16_signed_const(i16 %a) nounwind {
 entry:
 ; CHECK-LABEL: icmp_i16_signed_const
-; CHECK:       sxth w0, w0
-; CHECK-NEXT:  cmn w0, #233
-; CHECK-NEXT:  cset w0, lt
-; CHECK-NEXT:  and w0, w0, #0x1
+; CHECK:       sxth [[REG0:w[0-9]+]], w0
+; CHECK-NEXT:  cmn [[REG0]], #233
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], lt
+; CHECK-NEXT:  and w0, [[REG]], #0x1
   %cmp = icmp slt i16 %a, -233
   %conv2 = zext i1 %cmp to i32
   ret i32 %conv2
@@ -219,10 +237,10 @@ entry:
 define i32 @icmp_i8_signed_const(i8 %a) nounwind {
 entry:
 ; CHECK-LABEL: icmp_i8_signed_const
-; CHECK:       sxtb w0, w0
-; CHECK-NEXT:  cmp w0, #124
-; CHECK-NEXT:  cset w0, gt
-; CHECK-NEXT:  and w0, w0, #0x1
+; CHECK:       sxtb [[REG0:w[0-9]+]], w0
+; CHECK-NEXT:  cmp [[REG0]], #124
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], gt
+; CHECK-NEXT:  and w0, [[REG]], #0x1
   %cmp = icmp sgt i8 %a, 124
   %conv2 = zext i1 %cmp to i32
   ret i32 %conv2
@@ -231,10 +249,10 @@ entry:
 define i32 @icmp_i1_unsigned_const(i1 %a) nounwind {
 entry:
 ; CHECK-LABEL: icmp_i1_unsigned_const
-; CHECK:       and w0, w0, #0x1
-; CHECK-NEXT:  cmp w0, #0
-; CHECK-NEXT:  cset w0, lo
-; CHECK-NEXT:  and w0, w0, #0x1
+; CHECK:       and [[REG0:w[0-9]+]], w0, #0x1
+; CHECK-NEXT:  cmp [[REG0]], #0
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], lo
+; CHECK-NEXT:  and w0, [[REG]], #0x1
   %cmp = icmp ult i1 %a, 0
   %conv2 = zext i1 %cmp to i32
   ret i32 %conv2
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll b/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll
index 0fcd4fe752f..c1b7d790878 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll
@@ -7,8 +7,8 @@ define void @t1() {
 ; ARM64-LABEL: t1
 ; ARM64: adrp x8, _message@PAGE
 ; ARM64: add x0, x8, _message@PAGEOFF
-; ARM64: mov w9, wzr
-; ARM64: uxtb w1, w9
+; ARM64: mov [[REG:w[0-9]+]], wzr
+; ARM64: uxtb w1, [[REG]]
 ; ARM64: mov x2, #80
 ; ARM64: bl _memset
   call void @llvm.memset.p0i8.i64(i8* align 16 getelementptr inbounds ([80 x i8], [80 x i8]* @message, i32 0, i32 0), i8 0, i64 80, i1 false)
@@ -48,15 +48,15 @@ declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1)
 define void @t4() {
 ; ARM64-LABEL: t4
 ; ARM64: adrp x8, _temp@GOTPAGE
-; ARM64: ldr x8, [x8, _temp@GOTPAGEOFF]
-; ARM64: adrp x9, _message@PAGE
-; ARM64: add x9, x9, _message@PAGEOFF
-; ARM64: ldr x10, [x9]
-; ARM64: str x10, [x8]
-; ARM64: ldr x10, [x9, #8]
-; ARM64: str x10, [x8, #8]
-; ARM64: ldrb w11, [x9, #16]
-; ARM64: strb w11, [x8, #16]
+; ARM64: ldr [[REG0:x[0-9]+]], [x8, _temp@GOTPAGEOFF]
+; ARM64: adrp [[REG1:x[0-9]+]], _message@PAGE
+; ARM64: add [[REG2:x[0-9]+]], [[REG1]], _message@PAGEOFF
+; ARM64: ldr x10, {{\[}}[[REG2]]{{\]}}
+; ARM64: str x10, {{\[}}[[REG0]]{{\]}}
+; ARM64: ldr x10, {{\[}}[[REG2]], #8]
+; ARM64: str x10, {{\[}}[[REG0]], #8]
+; ARM64: ldrb [[REG3:w[0-9]+]], {{\[}}[[REG2]], #16]
+; ARM64: strb [[REG3]], {{\[}}[[REG0]], #16]
 ; ARM64: ret
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 getelementptr inbounds ([80 x i8], [80 x i8]* @temp, i32 0, i32 0), i8* align 16 getelementptr inbounds ([80 x i8], [80 x i8]* @message, i32 0, i32 0), i64 17, i1 false)
   ret void
@@ -65,15 +65,15 @@ define void @t4() {
 define void @t5() {
 ; ARM64-LABEL: t5
 ; ARM64: adrp x8, _temp@GOTPAGE
-; ARM64: ldr x8, [x8, _temp@GOTPAGEOFF]
-; ARM64: adrp x9, _message@PAGE
-; ARM64: add x9, x9, _message@PAGEOFF
-; ARM64: ldr x10, [x9]
-; ARM64: str x10, [x8]
-; ARM64: ldr x10, [x9, #8]
-; ARM64: str x10, [x8, #8]
-; ARM64: ldrb w11, [x9, #16]
-; ARM64: strb w11, [x8, #16]
+; ARM64: ldr [[REG0:x[0-9]+]], [x8, _temp@GOTPAGEOFF]
+; ARM64: adrp [[REG3:x[0-9]+]], _message@PAGE
+; ARM64: add [[REG1:x[0-9]+]], [[REG3]], _message@PAGEOFF
+; ARM64: ldr x10, {{\[}}[[REG1]]]
+; ARM64: str x10, {{\[}}[[REG0]]]
+; ARM64: ldr x10, {{\[}}[[REG1]], #8]
+; ARM64: str x10, {{\[}}[[REG0]], #8]
+; ARM64: ldrb [[REG4:w[0-9]+]], {{\[}}[[REG1]], #16]
+; ARM64: strb [[REG4]], {{\[}}[[REG0]], #16]
 ; ARM64: ret
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 getelementptr inbounds ([80 x i8], [80 x i8]* @temp, i32 0, i32 0), i8* align 8 getelementptr inbounds ([80 x i8], [80 x i8]* @message, i32 0, i32 0), i64 17, i1 false)
   ret void
@@ -82,15 +82,15 @@ define void @t5() {
 define void @t6() {
 ; ARM64-LABEL: t6
 ; ARM64: adrp x8, _temp@GOTPAGE
-; ARM64: ldr x8, [x8, _temp@GOTPAGEOFF]
-; ARM64: adrp x9, _message@PAGE
-; ARM64: add x9, x9, _message@PAGEOFF
-; ARM64: ldr w10, [x9]
-; ARM64: str w10, [x8]
-; ARM64: ldr w10, [x9, #4]
-; ARM64: str w10, [x8, #4]
-; ARM64: ldrb w10, [x9, #8]
-; ARM64: strb w10, [x8, #8]
+; ARM64: ldr [[REG0:x[0-9]+]], [x8, _temp@GOTPAGEOFF]
+; ARM64: adrp [[REG1:x[0-9]+]], _message@PAGE
+; ARM64: add [[REG2:x[0-9]+]], [[REG1]], _message@PAGEOFF
+; ARM64: ldr w10, {{\[}}[[REG2]]]
+; ARM64: str w10, {{\[}}[[REG0]]]
+; ARM64: ldr w10, {{\[}}[[REG2]], #4]
+; ARM64: str w10, {{\[}}[[REG0]], #4]
+; ARM64: ldrb [[REG3:w[0-9]+]], {{\[}}[[REG2]], #8]
+; ARM64: strb [[REG3]], {{\[}}[[REG0]], #8]
 ; ARM64: ret
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 getelementptr inbounds ([80 x i8], [80 x i8]* @temp, i32 0, i32 0), i8* align 4 getelementptr inbounds ([80 x i8], [80 x i8]* @message, i32 0, i32 0), i64 9, i1 false)
   ret void
@@ -99,17 +99,17 @@ define void @t6() {
 define void @t7() {
 ; ARM64-LABEL: t7
 ; ARM64: adrp x8, _temp@GOTPAGE
-; ARM64: ldr x8, [x8, _temp@GOTPAGEOFF]
-; ARM64: adrp x9, _message@PAGE
-; ARM64: add x9, x9, _message@PAGEOFF
-; ARM64: ldrh w10, [x9]
-; ARM64: strh w10, [x8]
-; ARM64: ldrh w10, [x9, #2]
-; ARM64: strh w10, [x8, #2]
-; ARM64: ldrh w10, [x9, #4]
-; ARM64: strh w10, [x8, #4]
-; ARM64: ldrb w10, [x9, #6]
-; ARM64: strb w10, [x8, #6]
+; ARM64: ldr [[REG0:x[0-9]+]], [x8, _temp@GOTPAGEOFF]
+; ARM64: adrp [[REG1:x[0-9]+]], _message@PAGE
+; ARM64: add [[REG2:x[0-9]+]], [[REG1]], _message@PAGEOFF
+; ARM64: ldrh w10, {{\[}}[[REG2]]]
+; ARM64: strh w10, {{\[}}[[REG0]]]
+; ARM64: ldrh w10, {{\[}}[[REG2]], #2]
+; ARM64: strh w10, {{\[}}[[REG0]], #2]
+; ARM64: ldrh w10, {{\[}}[[REG2]], #4]
+; ARM64: strh w10, {{\[}}[[REG0]], #4]
+; ARM64: ldrb [[REG3:w[0-9]+]], {{\[}}[[REG2]], #6]
+; ARM64: strb [[REG3]], {{\[}}[[REG0]], #6]
 ; ARM64: ret
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 2 getelementptr inbounds ([80 x i8], [80 x i8]* @temp, i32 0, i32 0), i8* align 2 getelementptr inbounds ([80 x i8], [80 x i8]* @message, i32 0, i32 0), i64 7, i1 false)
   ret void
@@ -118,17 +118,17 @@ define void @t7() {
 define void @t8() {
 ; ARM64-LABEL: t8
 ; ARM64: adrp x8, _temp@GOTPAGE
-; ARM64: ldr x8, [x8, _temp@GOTPAGEOFF]
-; ARM64: adrp x9, _message@PAGE
-; ARM64: add x9, x9, _message@PAGEOFF
-; ARM64: ldrb w10, [x9]
-; ARM64: strb w10, [x8]
-; ARM64: ldrb w10, [x9, #1]
-; ARM64: strb w10, [x8, #1]
-; ARM64: ldrb w10, [x9, #2]
-; ARM64: strb w10, [x8, #2]
-; ARM64: ldrb w10, [x9, #3]
-; ARM64: strb w10, [x8, #3]
+; ARM64: ldr [[REG0:x[0-9]+]], [x8, _temp@GOTPAGEOFF]
+; ARM64: adrp [[REG1:x[0-9]+]], _message@PAGE
+; ARM64: add [[REG2:x[0-9]+]], [[REG1:x[0-9]+]], _message@PAGEOFF
+; ARM64: ldrb w10, {{\[}}[[REG2]]]
+; ARM64: strb w10, {{\[}}[[REG0]]]
+; ARM64: ldrb w10, {{\[}}[[REG2]], #1]
+; ARM64: strb w10, {{\[}}[[REG0]], #1]
+; ARM64: ldrb w10, {{\[}}[[REG2]], #2]
+; ARM64: strb w10, {{\[}}[[REG0]], #2]
+; ARM64: ldrb [[REG3:w[0-9]+]], {{\[}}[[REG2]], #3]
+; ARM64: strb [[REG3]], {{\[}}[[REG0]], #3]
 ; ARM64: ret
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 getelementptr inbounds ([80 x i8], [80 x i8]* @temp, i32 0, i32 0), i8* align 1 getelementptr inbounds ([80 x i8], [80 x i8]* @message, i32 0, i32 0), i64 4, i1 false)
   ret void
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-ret.ll b/test/CodeGen/AArch64/arm64-fast-isel-ret.ll
index 9a67fff00ac..81c9933a863 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-ret.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-ret.ll
@@ -35,7 +35,7 @@ entry:
 define signext i16 @ret_i16(i16 signext %a) nounwind {
 entry:
 ; CHECK: @ret_i16
-; CHECK: sxth	w0, w0
+; CHECK: sxth	w0, {{w[0-9]+}}
   %a.addr = alloca i16, align 1
   store i16 %a, i16* %a.addr, align 1
   %0 = load i16, i16* %a.addr, align 1
@@ -45,7 +45,7 @@ entry:
 define signext i8 @ret_i8(i8 signext %a) nounwind {
 entry:
 ; CHECK: @ret_i8
-; CHECK: sxtb	w0, w0
+; CHECK: sxtb	w0, {{w[0-9]+}}
   %a.addr = alloca i8, align 1
   store i8 %a, i8* %a.addr, align 1
   %0 = load i8, i8* %a.addr, align 1
@@ -55,7 +55,8 @@ entry:
 define signext i1 @ret_i1(i1 signext %a) nounwind {
 entry:
 ; CHECK: @ret_i1
-; CHECK: and w0, w0, #0x1
+; CHECK: and [[REG:w[0-9]+]], {{w[0-9]+}}, #0x1
+; CHECK: sbfx w0, [[REG]], #0, #1
   %a.addr = alloca i1, align 1
   store i1 %a, i1* %a.addr, align 1
   %0 = load i1, i1* %a.addr, align 1
diff --git a/test/CodeGen/AArch64/arm64-fast-isel.ll b/test/CodeGen/AArch64/arm64-fast-isel.ll
index 508e36750ee..daccc86c709 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel.ll
@@ -30,11 +30,11 @@ define void @t1(i64 %a) nounwind {
 define zeroext i1 @i1(i1 %a) nounwind {
 entry:
 ; CHECK: @i1
-; CHECK: and w0, w0, #0x1
-; CHECK: strb w0, [sp, #15]
-; CHECK: ldrb w0, [sp, #15]
-; CHECK: and w0, w0, #0x1
-; CHECK: and w0, w0, #0x1
+; CHECK: and [[REG:w[0-9]+]], w0, #0x1
+; CHECK: strb [[REG]], [sp, #15]
+; CHECK: ldrb [[REG1:w[0-9]+]], [sp, #15]
+; CHECK: and [[REG2:w[0-9]+]], [[REG1]], #0x1
+; CHECK: and w0, [[REG2]], #0x1
 ; CHECK: add sp, sp, #16
 ; CHECK: ret
   %a.addr = alloca i1, align 1
diff --git a/test/CodeGen/AArch64/fast-isel-address-extends.ll b/test/CodeGen/AArch64/fast-isel-address-extends.ll
index 6a17ec502a0..8b0ffa8c10d 100644
--- a/test/CodeGen/AArch64/fast-isel-address-extends.ll
+++ b/test/CodeGen/AArch64/fast-isel-address-extends.ll
@@ -6,8 +6,10 @@ target triple = "arm64-apple-ios8.0.0"
 ; This test was trying to fold the sext %tmp142 in to the address arithmetic in %sunkaddr1.
 ; This was incorrect as %.mux isn't available in the last bb.
 
-; CHECK: sxtw [[REG:x[0-9]+]]
-; CHECK: strh wzr, {{\[}}[[REG]], {{.*}}, lsl #1]
+; CHECK: sxtw [[REG0:x[0-9]+]]
+; CHECK: str [[REG0]], [sp, [[OFFSET:#[0-9]+]]]
+; CHECK: ldr [[REG1:x[0-9]+]], [sp, [[OFFSET]]]
+; CHECK: strh wzr, [{{.*}}, [[REG1]], lsl #1]
 
 ; Function Attrs: nounwind optsize ssp
 define void @EdgeLoop(i32 %dir, i32 %edge, i32 %width, i16* %tmp89, i32 %tmp136, i16 %tmp144) #0 {
diff --git a/test/CodeGen/AArch64/fast-isel-atomic.ll b/test/CodeGen/AArch64/fast-isel-atomic.ll
index 452129e4951..240e8280572 100644
--- a/test/CodeGen/AArch64/fast-isel-atomic.ll
+++ b/test/CodeGen/AArch64/fast-isel-atomic.ll
@@ -91,8 +91,8 @@ define void @atomic_store_release_8(i8* %p, i8 %val) #0 {
 
 ; CHECK-LABEL: atomic_store_release_8_off:
 ; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT:  add x0, x0, #1
-; CHECK-NEXT:  stlrb w1, [x0]
+; CHECK-NEXT:  add [[REG0:x[0-9]+]], x0, #1
+; CHECK-NEXT:  stlrb w1, {{\[}}[[REG0]]]
 ; CHECK-NEXT:  ret
 define void @atomic_store_release_8_off(i8* %p, i8 %val) #0 {
   %tmp0 = getelementptr i8, i8* %p, i32 1
@@ -111,8 +111,8 @@ define void @atomic_store_release_16(i16* %p, i16 %val) #0 {
 
 ; CHECK-LABEL: atomic_store_release_16_off:
 ; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT:  add x0, x0, #2
-; CHECK-NEXT:  stlrh w1, [x0]
+; CHECK-NEXT:  add [[REG0:x[0-9]+]], x0, #2
+; CHECK-NEXT:  stlrh w1, {{\[}}[[REG0]]]
 ; CHECK-NEXT:  ret
 define void @atomic_store_release_16_off(i16* %p, i16 %val) #0 {
   %tmp0 = getelementptr i16, i16* %p, i32 1
@@ -131,8 +131,8 @@ define void @atomic_store_release_32(i32* %p, i32 %val) #0 {
 
 ; CHECK-LABEL: atomic_store_release_32_off:
 ; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT:  add x0, x0, #4
-; CHECK-NEXT:  stlr w1, [x0]
+; CHECK-NEXT:  add [[REG0:x[0-9]+]], x0, #4
+; CHECK-NEXT:  stlr w1, {{\[}}[[REG0]]]
 ; CHECK-NEXT:  ret
 define void @atomic_store_release_32_off(i32* %p, i32 %val) #0 {
   %tmp0 = getelementptr i32, i32* %p, i32 1
@@ -151,8 +151,8 @@ define void @atomic_store_release_64(i64* %p, i64 %val) #0 {
 
 ; CHECK-LABEL: atomic_store_release_64_off:
 ; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT:  add x0, x0, #8
-; CHECK-NEXT:  stlr x1, [x0]
+; CHECK-NEXT:  add [[REG0:x[0-9]+]], x0, #8
+; CHECK-NEXT:  stlr x1, {{\[}}[[REG0]]]
 ; CHECK-NEXT:  ret
 define void @atomic_store_release_64_off(i64* %p, i64 %val) #0 {
   %tmp0 = getelementptr i64, i64* %p, i32 1
@@ -172,8 +172,8 @@ define void @atomic_store_seq_cst_8(i8* %p, i8 %val) #0 {
 
 ; CHECK-LABEL: atomic_store_seq_cst_8_off:
 ; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT:  add x0, x0, #1
-; CHECK-NEXT:  stlrb w1, [x0]
+; CHECK-NEXT:  add [[REG0:x[0-9]+]], x0, #1
+; CHECK-NEXT:  stlrb w1, {{\[}}[[REG0]]]
 ; CHECK-NEXT:  ret
 define void @atomic_store_seq_cst_8_off(i8* %p, i8 %val) #0 {
   %tmp0 = getelementptr i8, i8* %p, i32 1
@@ -192,8 +192,8 @@ define void @atomic_store_seq_cst_16(i16* %p, i16 %val) #0 {
 
 ; CHECK-LABEL: atomic_store_seq_cst_16_off:
 ; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT:  add x0, x0, #2
-; CHECK-NEXT:  stlrh w1, [x0]
+; CHECK-NEXT:  add [[REG0:x[0-9]+]], x0, #2
+; CHECK-NEXT:  stlrh w1, {{\[}}[[REG0]]]
 ; CHECK-NEXT:  ret
 define void @atomic_store_seq_cst_16_off(i16* %p, i16 %val) #0 {
   %tmp0 = getelementptr i16, i16* %p, i32 1
@@ -212,8 +212,8 @@ define void @atomic_store_seq_cst_32(i32* %p, i32 %val) #0 {
 
 ; CHECK-LABEL: atomic_store_seq_cst_32_off:
 ; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT:  add x0, x0, #4
-; CHECK-NEXT:  stlr w1, [x0]
+; CHECK-NEXT:  add [[REG0:x[0-9]+]], x0, #4
+; CHECK-NEXT:  stlr w1, {{\[}}[[REG0]]]
 ; CHECK-NEXT:  ret
 define void @atomic_store_seq_cst_32_off(i32* %p, i32 %val) #0 {
   %tmp0 = getelementptr i32, i32* %p, i32 1
@@ -232,8 +232,8 @@ define void @atomic_store_seq_cst_64(i64* %p, i64 %val) #0 {
 
 ; CHECK-LABEL: atomic_store_seq_cst_64_off:
 ; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT:  add x0, x0, #8
-; CHECK-NEXT:  stlr x1, [x0]
+; CHECK-NEXT:  add [[REG0:x[0-9]+]], x0, #8
+; CHECK-NEXT:  stlr x1, {{\[}}[[REG0]]]
 ; CHECK-NEXT:  ret
 define void @atomic_store_seq_cst_64_off(i64* %p, i64 %val) #0 {
   %tmp0 = getelementptr i64, i64* %p, i32 1
diff --git a/test/CodeGen/AArch64/fast-isel-branch-cond-mask.ll b/test/CodeGen/AArch64/fast-isel-branch-cond-mask.ll
index 55fbf63319e..0cafd883f69 100644
--- a/test/CodeGen/AArch64/fast-isel-branch-cond-mask.ll
+++ b/test/CodeGen/AArch64/fast-isel-branch-cond-mask.ll
@@ -2,9 +2,9 @@
 
 define void @test(i64 %a, i64 %b, i2* %c) {
 ; CHECK-LABEL: test
-; CHECK:       and [[REG1:w[0-9]+]], w8, #0x3
+; CHECK:       and [[REG1:w[0-9]+]], {{w[0-9]+}}, #0x3
 ; CHECK-NEXT:  strb [[REG1]], {{\[}}x2{{\]}}
-; CHECK-NEXT:  tbz w9, #0,
+; CHECK-NEXT:  tbz {{w[0-9]+}}, #0,
  %1 = trunc i64 %a to i2
  %2 = trunc i64 %b to i1
 ; Force fast-isel to fall back to SDAG.
diff --git a/test/CodeGen/AArch64/phi-dbg.ll b/test/CodeGen/AArch64/phi-dbg.ll
index a2c97f31108..0b5c6677acd 100644
--- a/test/CodeGen/AArch64/phi-dbg.ll
+++ b/test/CodeGen/AArch64/phi-dbg.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O0 %s -mtriple=aarch64 -o - | FileCheck %s
+; RUN: llc -O0 %s -mtriple=aarch64 -stop-after=phi-node-elimination -o - | FileCheck %s
 
 ; Test that a DEBUG_VALUE node is create for variable c after the phi has been
 ; converted to a ldr.    The DEBUG_VALUE must be *after* the ldr and not before it.
@@ -15,25 +15,34 @@
 ; }
 ;
 ; Function Attrs: nounwind
-define i32 @func(i32) #0 !dbg !8 {
-  call void @llvm.dbg.value(metadata i32 %0, i64 0, metadata !12, metadata !13), !dbg !14
+; CHECK: !14 = !DILocalVariable(name: "c"
+; CHECK-LABEL: name: func
+define i32 @func(i32 %a0) #0 !dbg !8 {
+entry:
+  call void @llvm.dbg.value(metadata i32 %a0, i64 0, metadata !12, metadata !13), !dbg !14
   call void @llvm.dbg.value(metadata i32 1, i64 0, metadata !15, metadata !13), !dbg !16
-  %2 = icmp slt i32 %0, 0, !dbg !17
-  br i1 %2, label %3, label %4, !dbg !19
+  %v2 = icmp slt i32 %a0, 0, !dbg !17
+  br i1 %v2, label %bb2, label %bb3, !dbg !19
 
-; <label>:3:                                      ; preds = %1
+bb2:
   call void @llvm.dbg.value(metadata i32 12, i64 0, metadata !15, metadata !13), !dbg !16
-  br label %4, !dbg !20
+  br label %bb3, !dbg !20
 
-; <label>:4:                                      ; preds = %3, %1
-  %.0 = phi i32 [ 12, %3 ], [ 1, %1 ]
-; CHECK: ldr     w[[REG:[0-9]+]], [sp, #8]
-; CHECK-NEXT: .Ltmp
+; CHECK: bb.2.bb2:
+; CHECK:  [[REG0:%[0-9]+]]:gpr32 = MOVi32imm 12
+; CHECK:  [[PHIREG:%[0-9]+]]:gpr32 = COPY [[REG0]]
+
+bb3:
+; CHECK: bb.3.bb3:
+; CHECK:   [[PHIDEST:%[0-9]+]]:gpr32 = COPY [[PHIREG]]
+; CHECK-NEXT:   DBG_VALUE debug-use [[PHIDEST]]
+  %.0 = phi i32 [ 12, %bb2 ], [ 1, %entry ]
   call void @llvm.dbg.value(metadata i32 %.0, i64 0, metadata !15, metadata !13), !dbg !16
-; CHECK-NEXT:  //DEBUG_VALUE: func:c <- $w[[REG]]
-  %5 = add nsw i32 %.0, %0, !dbg !22
-  call void @llvm.dbg.value(metadata i32 %5, i64 0, metadata !15, metadata !13), !dbg !16
-  ret i32 %5, !dbg !23
+; CHECK: [[ADD:%[0-9]+]]:gpr32 = nsw ADDWrr [[PHIDEST]]
+; CHECK-NEXT: DBG_VALUE debug-use [[ADD]]
+  %v5 = add nsw i32 %.0, %a0, !dbg !22
+  call void @llvm.dbg.value(metadata i32 %v5, i64 0, metadata !15, metadata !13), !dbg !16
+  ret i32 %v5, !dbg !23
 }
 
 ; Function Attrs: nounwind readnone
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
index 244fa678562..9fe0c0f8fc1 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
@@ -19,7 +19,7 @@ define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2)
 ; VI-NOOPT: v_mov_b32_e32 v{{[0-9]+}}, 0
 ; VI-NEXT: s_nop 0
 ; VI-NEXT: s_nop 0
-; VI-NEXT: v_mov_b32_dpp v2, [[REG]] quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
+; VI-NEXT: v_mov_b32_dpp {{v[0-9]+}}, [[REG]] quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
 @0 = internal unnamed_addr addrspace(3) global [448 x i32] undef, align 4
 define weak_odr amdgpu_kernel void @dpp_test1(i32* %arg) local_unnamed_addr {
 bb:
diff --git a/test/CodeGen/AMDGPU/spill-wide-sgpr.ll b/test/CodeGen/AMDGPU/spill-wide-sgpr.ll
index ebba35a6689..b8824be4725 100644
--- a/test/CodeGen/AMDGPU/spill-wide-sgpr.ll
+++ b/test/CodeGen/AMDGPU/spill-wide-sgpr.ll
@@ -45,11 +45,11 @@ ret:
 
 ; ALL-LABEL: {{^}}spill_sgpr_x4:
 ; SMEM: s_add_u32 m0, s3, 0x100{{$}}
-; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[12:15], m0 ; 16-byte Folded Spill
+; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[VALS:[0-9]+:[0-9]+]]{{\]}}, m0 ; 16-byte Folded Spill
 ; SMEM: s_cbranch_scc1
 
 ; SMEM: s_add_u32 m0, s3, 0x100{{$}}
-; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[12:15], m0 ; 16-byte Folded Reload
+; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[VALS]]{{\]}}, m0 ; 16-byte Folded Reload
 ; SMEM: s_dcache_wb
 ; SMEM: s_endpgm
 
@@ -94,15 +94,15 @@ ret:
 ; ALL-LABEL: {{^}}spill_sgpr_x8:
 
 ; SMEM: s_add_u32 m0, s3, 0x100{{$}}
-; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[16:19], m0 ; 16-byte Folded Spill
+; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[VALS:[0-9]+:[0-9]+]]{{\]}}, m0 ; 16-byte Folded Spill
 ; SMEM: s_add_u32 m0, s3, 0x110{{$}}
-; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[16:19], m0 ; 16-byte Folded Spill
+; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[VALS]]{{\]}}, m0 ; 16-byte Folded Spill
 ; SMEM: s_cbranch_scc1
 
 ; SMEM: s_add_u32 m0, s3, 0x100{{$}}
-; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[16:19], m0 ; 16-byte Folded Reload
+; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[VALS]]{{\]}}, m0 ; 16-byte Folded Reload
 ; SMEM: s_add_u32 m0, s3, 0x110{{$}}
-; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[16:19], m0 ; 16-byte Folded Reload
+; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[VALS]]{{\]}}, m0 ; 16-byte Folded Reload
 
 ; SMEM: s_dcache_wb
 ; SMEM: s_endpgm
diff --git a/test/CodeGen/ARM/fast-isel-align.ll b/test/CodeGen/ARM/fast-isel-align.ll
index 71cd73a4a25..9dab0abedb6 100644
--- a/test/CodeGen/ARM/fast-isel-align.ll
+++ b/test/CodeGen/ARM/fast-isel-align.ll
@@ -26,12 +26,12 @@
 define void @unaligned_store(float %x, float %y) nounwind {
 entry:
 ; ARM: @unaligned_store
-; ARM: vmov r1, s0
-; ARM: str r1, [r0]
+; ARM: vmov [[REG:r[0-9]+]], s0
+; ARM: str [[REG]], [{{r[0-9]+}}]
 
 ; THUMB: @unaligned_store
-; THUMB: vmov r1, s0
-; THUMB: str r1, [r0]
+; THUMB: vmov [[REG:r[0-9]+]], s0
+; THUMB: str [[REG]], [{{r[0-9]+}}]
 
   %add = fadd float %x, %y
   %0 = load %struct.anon*, %struct.anon** @a, align 4
diff --git a/test/CodeGen/ARM/fast-isel-ldrh-strh-arm.ll b/test/CodeGen/ARM/fast-isel-ldrh-strh-arm.ll
index ca512970c9c..f49c907c414 100644
--- a/test/CodeGen/ARM/fast-isel-ldrh-strh-arm.ll
+++ b/test/CodeGen/ARM/fast-isel-ldrh-strh-arm.ll
@@ -82,7 +82,8 @@ entry:
 ; ARM: t9
   %add.ptr = getelementptr inbounds i16, i16* %a, i64 -8
   store i16 0, i16* %add.ptr, align 2
-; ARM: strh	r1, [r0, #-16]
+; ARM: movw [[REG0:r[0-9]+]], #0
+; ARM: strh [[REG0]], [{{r[0-9]+}}, #-16]
   ret void
 }
 
@@ -93,9 +94,10 @@ entry:
 ; ARM: t10
   %add.ptr = getelementptr inbounds i16, i16* %a, i64 -128
   store i16 0, i16* %add.ptr, align 2
-; ARM: mvn r{{[1-9]}}, #255
-; ARM: add r0, r0, r{{[1-9]}}
-; ARM: strh r{{[1-9]}}, [r0]
+; ARM: mvn r1, #255
+; ARM: add [[REG0:r[0-9]+]], r0, r1
+; ARM: movw [[REG1:r[0-9]+]], #0
+; ARM: strh [[REG1]], {{\[}}[[REG0]]]
   ret void
 }
 
@@ -104,7 +106,8 @@ entry:
 ; ARM: t11
   %add.ptr = getelementptr inbounds i16, i16* %a, i64 8
   store i16 0, i16* %add.ptr, align 2
-; ARM: strh r{{[1-9]}}, [r0, #16]
+; ARM: movw [[REG1:r[0-9]+]], #0
+; ARM: strh [[REG1]], [{{r[0-9]+}}, #16]
   ret void
 }
 
@@ -115,8 +118,9 @@ entry:
 ; ARM: t12
   %add.ptr = getelementptr inbounds i16, i16* %a, i64 128
   store i16 0, i16* %add.ptr, align 2
-; ARM: add r0, r0, #256
-; ARM: strh r{{[1-9]}}, [r0]
+; ARM: add [[REG0:r[0-9]+]], r0, #256
+; ARM: movw [[REG1:r[0-9]+]], #0
+; ARM: strh [[REG1]], {{\[}}[[REG0]]]
   ret void
 }
 
diff --git a/test/CodeGen/ARM/fast-isel.ll b/test/CodeGen/ARM/fast-isel.ll
index 502285e85df..3661beab5c0 100644
--- a/test/CodeGen/ARM/fast-isel.ll
+++ b/test/CodeGen/ARM/fast-isel.ll
@@ -149,21 +149,21 @@ define void @test4() {
 
 ; THUMB: {{(movw r0, :lower16:L_test4g\$non_lazy_ptr)|(ldr.n r0, .LCPI)}}
 ; THUMB: {{(movt r0, :upper16:L_test4g\$non_lazy_ptr)?}}
-; THUMB: ldr r0, [r0]
-; THUMB: ldr r1, [r0]
-; THUMB: adds r1, #1
-; THUMB: str r1, [r0]
+; THUMB: ldr [[REG:r[0-9]+]], [r0]
+; THUMB: ldr [[REG1:r[0-9]+]], {{\[}}[[REG]]]
+; THUMB: adds [[REG1]], #1
+; THUMB: str [[REG1]], {{\[}}[[REG]]]
 
 ; ARM-MACHO: {{(movw r0, :lower16:L_test4g\$non_lazy_ptr)|(ldr r0, .LCPI)}}
 ; ARM-MACHO: {{(movt r0, :upper16:L_test4g\$non_lazy_ptr)?}}
-; ARM-MACHO: ldr r0, [r0]
+; ARM-MACHO: ldr [[REG:r[0-9]+]], [r0]
 
-; ARM-ELF: movw r0, :lower16:test4g
-; ARM-ELF: movt r0, :upper16:test4g
+; ARM-ELF: movw [[REG:r[0-9]+]], :lower16:test4g
+; ARM-ELF: movt [[REG]], :upper16:test4g
 
-; ARM: ldr r1, [r0]
-; ARM: add r1, r1, #1
-; ARM: str r1, [r0]
+; ARM: ldr [[REG1:r[0-9]+]], {{\[}}[[REG]]]
+; ARM: add [[REG2:r[0-9]+]], [[REG1]], #1
+; ARM: str [[REG2]], {{\[}}[[REG]]]
 }
 
 ; ARM: @urem_fold
diff --git a/test/CodeGen/ARM/fp16-instructions.ll b/test/CodeGen/ARM/fp16-instructions.ll
index eb5ec5eb87d..6505d2bf673 100644
--- a/test/CodeGen/ARM/fp16-instructions.ll
+++ b/test/CodeGen/ARM/fp16-instructions.ll
@@ -1043,7 +1043,7 @@ entry:
 
 ; CHECK-SPILL-RELOAD-LABEL: fn1:
 ; CHECK-SPILL-RELOAD:       vstr.16 s0, [sp, #{{.}}]  @ 2-byte Spill
-; CHECK-SPILL-RELOAD-NEXT:  bl  fn2
+; CHECK-SPILL-RELOAD:  bl  fn2
 ; CHECK-SPILL-RELOAD-NEXT:  vldr.16 s0, [sp, #{{.}}]  @ 2-byte Reload
 }
 
diff --git a/test/CodeGen/Mips/Fast-ISel/bricmpi1.ll b/test/CodeGen/Mips/Fast-ISel/bricmpi1.ll
index 47b3c92203d..b5cf2a2030d 100644
--- a/test/CodeGen/Mips/Fast-ISel/bricmpi1.ll
+++ b/test/CodeGen/Mips/Fast-ISel/bricmpi1.ll
@@ -109,11 +109,11 @@ end:
 define void @testsgt(i32, i32) {
 ; CHECK-LABEL: testsgt:
 ; CHECK:       andi $[[REG0:[0-9]+]], $4, 1
-; CHECK:       negu $[[REG0]], $[[REG0]]
+; CHECK:       negu $[[REG2:[0-9]+]], $[[REG0]]
 ; CHECK:       andi $[[REG1:[0-9]+]], $5, 1
-; CHECK:       negu $[[REG1]], $[[REG1]]
-; CHECK:       slt $[[REG2:[0-9]+]], $[[REG1]], $[[REG0]]
-; CHECK:       bnez $[[REG2]],
+; CHECK:       negu $[[REG3:[0-9]+]], $[[REG1]]
+; CHECK:       slt $[[REG4:[0-9]+]], $[[REG3]], $[[REG2]]
+; CHECK:       bnez $[[REG4]],
   %3 = trunc i32 %0 to i1
   %4 = trunc i32 %1 to i1
   %5 = icmp sgt i1 %3, %4
@@ -169,11 +169,11 @@ end:
 define void @testsle(i32, i32) {
 ; CHECK-LABEL: testsle:
 ; CHECK:       andi $[[REG0:[0-9]+]], $4, 1
-; CHECK:       negu $[[REG0]], $[[REG0]]
+; CHECK:       negu $[[REG2:[0-9]+]], $[[REG0]]
 ; CHECK:       andi $[[REG1:[0-9]+]], $5, 1
-; CHECK:       negu $[[REG1]], $[[REG1]]
-; CHECK:       slt $[[REG2:[0-9]+]], $[[REG1]], $[[REG0]]
-; CHECK:       beqz $[[REG2]],
+; CHECK:       negu $[[REG3:[0-9]+]], $[[REG1]]
+; CHECK:       slt $[[REG4:[0-9]+]], $[[REG3]], $[[REG2]]
+; CHECK:       beqz $[[REG4]],
   %3 = trunc i32 %0 to i1
   %4 = trunc i32 %1 to i1
   %5 = icmp sle i1 %3, %4
diff --git a/test/CodeGen/Mips/Fast-ISel/callabi.ll b/test/CodeGen/Mips/Fast-ISel/callabi.ll
index 485a1986b26..f22fbcc7b73 100644
--- a/test/CodeGen/Mips/Fast-ISel/callabi.ll
+++ b/test/CodeGen/Mips/Fast-ISel/callabi.ll
@@ -180,7 +180,7 @@ define void @cxcccc() {
   ; 32R1:       sra     $7, $[[R]], 24
   ; 32R2:       seb     $7, $[[R]]
 
-  ; ALL:        lw      $25, %got(xcccc)($2)
+  ; ALL:        lw      $25, %got(xcccc)(${{[0-9]+}})
   ; ALL:        jalr    $25
   ; ALL:        jr      $ra
   call void @xcccc(i8 88, i8 44, i8 11, i8 33)
@@ -209,7 +209,7 @@ define void @cxhhhh() {
   ; 32R1:       sra     $7, $[[R]], 16
   ; 32R2:       seh     $7, $[[R]]
 
-  ; ALL:        lw      $25, %got(xhhhh)($2)
+  ; ALL:        lw      $25, %got(xhhhh)(${{[0-9]+}})
   ; ALL:        jalr    $25
   ; ALL:        jr      $ra
 
diff --git a/test/CodeGen/Mips/Fast-ISel/fastalloca.ll b/test/CodeGen/Mips/Fast-ISel/fastalloca.ll
index c420a044578..ad2a0f8f2a8 100644
--- a/test/CodeGen/Mips/Fast-ISel/fastalloca.ll
+++ b/test/CodeGen/Mips/Fast-ISel/fastalloca.ll
@@ -22,8 +22,8 @@ entry:
   %2 = load i32, i32* %x2, align 4
   store i32 %2, i32* @i, align 4
   %3 = load i32, i32* %retval
-; CHECK-DAG:    lw      $[[I_ADDR:[0-9]+]], %got(i)($[[REG_GP:[0-9]+]])
-; CHECK-DAG:    addiu   $[[A_ADDR:[0-9]+]], $sp, 8
+; CHECK:        lw      $[[I_ADDR:[0-9]+]], %got(i)($[[REG_GP:[0-9]+]])
+; CHECK:        addiu   $[[A_ADDR:[0-9]+]], $sp, 8
 ; CHECK-DAG:    sw      $[[A_ADDR]], [[A_ADDR_FI:[0-9]+]]($sp)
 ; CHECK-DAG:    lw      $[[A_ADDR2:[0-9]+]], [[A_ADDR_FI]]($sp)
 ; CHECK-DAG:    lw      $[[A_X:[0-9]+]], 0($[[A_ADDR2]])
diff --git a/test/CodeGen/Mips/Fast-ISel/logopm.ll b/test/CodeGen/Mips/Fast-ISel/logopm.ll
index 0519c07682e..ef6b5182a7a 100644
--- a/test/CodeGen/Mips/Fast-ISel/logopm.ll
+++ b/test/CodeGen/Mips/Fast-ISel/logopm.ll
@@ -245,7 +245,7 @@ entry:
 ; CHECK-DAG:    lw      $[[UC1_ADDR:[0-9]+]], %got(uc1)($[[REG_GP]])
 ; CHECK-DAG:    lbu     $[[UC1:[0-9]+]], 0($[[UC1_ADDR]])
 ; CHECK-DAG:    lbu     $[[UC2:[0-9]+]], 0($[[UC2_ADDR]])
-; CHECK-DAG:    and     $[[RES:[0-9]+]], $[[UC2]], $[[UB1]]
+; CHECK-DAG:    and     $[[RES:[0-9]+]], $[[UC2]], $[[UC1]]
 ; CHECK:        sb      $[[RES]], 0($[[UC_ADDR]])
   ret void
 }
@@ -430,7 +430,7 @@ entry:
 ; CHECK-DAG:    lw      $[[US1_ADDR:[0-9]+]], %got(us1)($[[REG_GP]])
 ; CHECK-DAG:    lhu     $[[US1:[0-9]+]], 0($[[US1_ADDR]])
 ; CHECK-DAG:    lhu     $[[US2:[0-9]+]], 0($[[US2_ADDR]])
-; CHECK-DAG:    and     $[[RES:[0-9]+]], $[[US2]], $[[UB1]]
+; CHECK-DAG:    and     $[[RES:[0-9]+]], $[[US2]], $[[US1]]
 ; CHECK:        sh      $[[RES]], 0($[[US_ADDR]])
 ; CHECK:        .end andUs
   ret void
diff --git a/test/CodeGen/PowerPC/pr26180.ll b/test/CodeGen/PowerPC/pr26180.ll
index e4cbcb8725d..d4b05dfeed6 100644
--- a/test/CodeGen/PowerPC/pr26180.ll
+++ b/test/CodeGen/PowerPC/pr26180.ll
@@ -6,9 +6,9 @@ define i32 @bad(double %x) {
   ret i32 %1
 }
 
-; CHECK: fctidz 1, 1
-; CHECK: stfd 1, [[OFF:.*]](1)
+; CHECK: fctidz [[REG0:[0-9]+]], 1
+; CHECK: stfd [[REG0]], [[OFF:.*]](1)
 ; CHECK: lwz {{[0-9]*}}, [[OFF]](1)
-; GENERIC: fctiwuz 1, 1
-; GENERIC: stfd 1, [[OFF:.*]](1)
+; GENERIC: fctiwuz [[REG0:[0-9]+]], 1
+; GENERIC: stfd [[REG0]], [[OFF:.*]](1)
 ; GENERIC: lwz {{[0-9]*}}, [[OFF]](1)
diff --git a/test/CodeGen/PowerPC/tls.ll b/test/CodeGen/PowerPC/tls.ll
index 8410e9885de..3ad93986bd4 100644
--- a/test/CodeGen/PowerPC/tls.ll
+++ b/test/CodeGen/PowerPC/tls.ll
@@ -11,12 +11,12 @@ target triple = "powerpc64-unknown-linux-gnu"
 define i32 @localexec() nounwind {
 entry:
 ;OPT0:          addis [[REG1:[0-9]+]], 13, a@tprel@ha
-;OPT0-NEXT:     addi [[REG1]], [[REG1]], a@tprel@l
-;OPT0-NEXT:     li [[REG2:[0-9]+]], 42
-;OPT0:          stw [[REG2]], 0([[REG1]])
+;OPT0-NEXT:     addi [[REG2:[0-9]+]], [[REG1]], a@tprel@l
+;OPT0-NEXT:     li [[REG3:[0-9]+]], 42
+;OPT0:          stw [[REG3]], 0([[REG2]])
 ;OPT1:          addis [[REG1:[0-9]+]], 13, a@tprel@ha
-;OPT1-NEXT:     li [[REG2:[0-9]+]], 42
-;OPT1:     stw [[REG2]], a@tprel@l([[REG1]])
+;OPT1-NEXT:     li [[REG3:[0-9]+]], 42
+;OPT1:     stw [[REG3]], a@tprel@l([[REG1]])
   store i32 42, i32* @a, align 4
   ret i32 0
 }
diff --git a/test/CodeGen/PowerPC/vsx-spill.ll b/test/CodeGen/PowerPC/vsx-spill.ll
index 3bea07f3b8d..d46664ba98d 100644
--- a/test/CodeGen/PowerPC/vsx-spill.ll
+++ b/test/CodeGen/PowerPC/vsx-spill.ll
@@ -60,8 +60,8 @@ entry:
 ; CHECK-REG: blr
 
 ; CHECK-FISL: @foo2
-; CHECK-FISL: xsadddp f1, f1, f1
-; CHECK-FISL: stxsdx f1, r1, r3
+; CHECK-FISL: xsadddp [[REG0:f[0-9]+]], f1, f1
+; CHECK-FISL: stxsdx [[REG0]], r1, r3
 ; CHECK-FISL: lxsdx f1, r1, r3
 ; CHECK-FISL: blr
 
@@ -71,8 +71,8 @@ entry:
 ; CHECK-P9-REG: blr
 
 ; CHECK-P9-FISL: @foo2
-; CHECK-P9-FISL: xsadddp f1, f1, f1
-; CHECK-P9-FISL: stfd f1, -152(r1)
+; CHECK-P9-FISL: xsadddp [[REG0:f[0-9]+]], f1, f1
+; CHECK-P9-FISL: stfd [[REG0]], -152(r1)
 ; CHECK-P9-FISL: lfd f1, -152(r1)
 ; CHECK-P9-FISL: blr
 
diff --git a/test/CodeGen/SPARC/LeonCASAInstructionUT.ll b/test/CodeGen/SPARC/LeonCASAInstructionUT.ll
index fa2fdd1c9b1..18c98091da7 100644
--- a/test/CodeGen/SPARC/LeonCASAInstructionUT.ll
+++ b/test/CodeGen/SPARC/LeonCASAInstructionUT.ll
@@ -19,7 +19,9 @@
 ; RUN: llc %s -O0 -march=sparc -mcpu=ma2x8x -o - | FileCheck %s
 
 ; CHECK-LABEL: casa_test
-; CHECK:       casa [%o0] 10, %o3, %o2
+; CHECK-DAG:   mov 1, [[R0:%[a-z0-9]+]]
+; CHECK-DAG:   mov %g0, [[R1:%[a-z0-9]+]]
+; CHECK:       casa [{{%[a-z0-9]+}}] 10, [[R1]], [[R0]]
 define void @casa_test(i32* %ptr) {
   %pair = cmpxchg i32* %ptr, i32 0, i32 1 monotonic monotonic
   %r = extractvalue { i32, i1 } %pair, 0
diff --git a/test/CodeGen/SystemZ/regalloc-fast-invalid-kill-flag.mir b/test/CodeGen/SystemZ/regalloc-fast-invalid-kill-flag.mir
index 3956ce99623..195dbb996ef 100644
--- a/test/CodeGen/SystemZ/regalloc-fast-invalid-kill-flag.mir
+++ b/test/CodeGen/SystemZ/regalloc-fast-invalid-kill-flag.mir
@@ -13,22 +13,18 @@
 name:            main
 alignment:       2
 tracksRegLiveness: true
-registers:       
-  - { id: 0, class: gr128bit }
-  - { id: 1, class: gr64bit }
-  - { id: 2, class: addr64bit }
-# CHECK: $r0q = L128
-# CHECK-NEXT: $r0l = COPY renamable $r1l
+# CHECK: $r0l = COPY renamable $r1l
 # Although R0L partially redefines R0Q, it must not mark R0Q as kill
 # because R1D is still live through that instruction.
 # CHECK-NOT: implicit killed $r0q
-# CHECK-NEXT: $r2d = COPY renamable $r1d
+# CHECK-NEXT: {{\$r[0-9]+d}} = COPY renamable $r1d
 # CHECK-NEXT: LARL
 body:             |
   bb.0:
+    %0 : gr128bit = IMPLICIT_DEF
     %0.subreg_hl32 = COPY %0.subreg_l32
-    %1 = COPY %0.subreg_l64
-    %2 = LARL @g_167
+    %1 : gr64bit = COPY %0.subreg_l64
+    %2 : addr64bit = LARL @g_167
     STC %1.subreg_l32, %2, 8, $noreg
 
 ...
diff --git a/test/CodeGen/SystemZ/rosbg-02.ll b/test/CodeGen/SystemZ/rosbg-02.ll
index fa1ac6e75ea..8a7357a5318 100644
--- a/test/CodeGen/SystemZ/rosbg-02.ll
+++ b/test/CodeGen/SystemZ/rosbg-02.ll
@@ -18,7 +18,7 @@ define void @main() {
   %7 = zext i1 %6 to i32
   %8 = load i32, i32* @g_999, align 4
   %9 = or i32 %8, %7
-; CHECK: rosbg   %r1, %r3, 63, 63, 33
+; CHECK: rosbg   {{%r[0-9]+}}, {{%r[0-9]+}}, 63, 63, 33
   store i32 %9, i32* @g_999, align 4
   ret void
 }
diff --git a/test/CodeGen/X86/epilogue-cfi-no-fp.ll b/test/CodeGen/X86/epilogue-cfi-no-fp.ll
index 6b0e79fce43..6ff0604cdba 100644
--- a/test/CodeGen/X86/epilogue-cfi-no-fp.ll
+++ b/test/CodeGen/X86/epilogue-cfi-no-fp.ll
@@ -1,33 +1,19 @@
 ; RUN: llc -O0 < %s | FileCheck %s
-
-target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
-target triple = "i686-pc-linux"
+target triple = "i686--"
 
 ; Function Attrs: noinline nounwind
 define i32 @foo(i32 %i, i32 %j, i32 %k, i32 %l, i32 %m) {
 ; CHECK-LABEL:   foo:
-; CHECK:         addl	$20, %esp
+; CHECK:         popl   %esi
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    popl	%esi
-; CHECK-NEXT:    .cfi_def_cfa_offset 12
 ; CHECK-NEXT:    popl	%edi
-; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    .cfi_def_cfa_offset 12
 ; CHECK-NEXT:    popl	%ebx
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    popl	%ebp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 4
 ; CHECK-NEXT:    retl
 entry:
-  %i.addr = alloca i32, align 4
-  %j.addr = alloca i32, align 4
-  %k.addr = alloca i32, align 4
-  %l.addr = alloca i32, align 4
-  %m.addr = alloca i32, align 4
-  store i32 %i, i32* %i.addr, align 4
-  store i32 %j, i32* %j.addr, align 4
-  store i32 %k, i32* %k.addr, align 4
-  store i32 %l, i32* %l.addr, align 4
-  store i32 %m, i32* %m.addr, align 4
+  tail call void asm sideeffect "nop", "~{eax},~{ebx},~{ecx},~{edx},~{esi},~{edi},~{ebp}"()
   ret i32 0
 }
-
-
-
diff --git a/test/CodeGen/X86/fast-isel-extract.ll b/test/CodeGen/X86/fast-isel-extract.ll
index fb20fdd0d36..62d5b440afa 100644
--- a/test/CodeGen/X86/fast-isel-extract.ll
+++ b/test/CodeGen/X86/fast-isel-extract.ll
@@ -12,7 +12,8 @@ define void @test1(i64*) nounwind ssp {
   ret void
 ; CHECK-LABEL: test1:
 ; CHECK: callq _f
-; CHECK-NEXT: addq	$10, %rax
+; CHECK-NOT: %rax
+; CHECK: addq $10, %rax
 }
 
 define void @test2(i64*) nounwind ssp {
@@ -23,7 +24,8 @@ define void @test2(i64*) nounwind ssp {
   ret void
 ; CHECK-LABEL: test2:
 ; CHECK: callq _f
-; CHECK-NEXT: addq	$10, %rdx
+; CHECK-NOT: %rdx
+; CHECK: addq $10, %rdx
 }
 
 declare %addovf @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
diff --git a/test/CodeGen/X86/fast-isel-gep.ll b/test/CodeGen/X86/fast-isel-gep.ll
index 88a22ca899d..576990d2fe5 100644
--- a/test/CodeGen/X86/fast-isel-gep.ll
+++ b/test/CodeGen/X86/fast-isel-gep.ll
@@ -24,7 +24,7 @@ define i32 @test2(i64 %t3, i32* %t1) nounwind {
        %t15 = load i32, i32* %t9            ; <i32> [#uses=1]
        ret i32 %t15
 ; X32-LABEL: test2:
-; X32:  	movl	(%edx,%ecx,4), %e
+; X32:  	movl	({{%e[a-z]+}},{{%e[a-z]+}},4), %e
 ; X32:  	ret
 
 ; X64-LABEL: test2:
@@ -81,8 +81,8 @@ define i64 @test5(i8* %A, i32 %I, i64 %B) nounwind {
   %v11 = add i64 %B, %v10
   ret i64 %v11
 ; X64-LABEL: test5:
-; X64: movslq	%e[[A1]], %rax
-; X64-NEXT: (%r[[A0]],%rax),
+; X64: movslq	%e[[A1]], [[R0:%r[a-z]+]]
+; X64-NEXT: (%r[[A0]],[[R0]]),
 ; X64: ret
 }
 
diff --git a/test/CodeGen/X86/fast-isel-x86-64.ll b/test/CodeGen/X86/fast-isel-x86-64.ll
index 7fb2670e6d1..b1f380e3a85 100644
--- a/test/CodeGen/X86/fast-isel-x86-64.ll
+++ b/test/CodeGen/X86/fast-isel-x86-64.ll
@@ -84,7 +84,7 @@ entry:
   ret i64 %mul
 
 ; CHECK-LABEL: test6:
-; CHECK: shlq	$3, %rdi
+; CHECK: shlq	$3, {{%r[a-z]+}}
 }
 
 define i32 @test7(i32 %x) nounwind ssp {
@@ -92,7 +92,7 @@ entry:
   %mul = mul nsw i32 %x, 8
   ret i32 %mul
 ; CHECK-LABEL: test7:
-; CHECK: shll	$3, %edi
+; CHECK: shll	$3, {{%e[a-z]+}}
 }
 
 
@@ -103,7 +103,7 @@ entry:
   ret i64 %add
 
 ; CHECK-LABEL: test8:
-; CHECK: addq	$7, %rdi
+; CHECK: addq	$7, {{%r[a-z]+}}
 }
 
 define i64 @test9(i64 %x) nounwind ssp {
@@ -297,8 +297,10 @@ define void @test23(i8* noalias sret %result) {
   %b = call i8* @foo23()
   ret void
 ; CHECK-LABEL: test23:
+; CHECK: movq %rdi, [[STACK:[0-9]+\(%rsp\)]]
 ; CHECK: call
-; CHECK: movq  %rdi, %rax
+; CHECK: movq [[STACK]], %rdi
+; CHECK: movq %rdi, %rax
 ; CHECK: ret
 }
 
diff --git a/test/CodeGen/X86/fold-sext-trunc.ll b/test/CodeGen/X86/fold-sext-trunc.ll
index 7cab8ebe537..475fbea4285 100644
--- a/test/CodeGen/X86/fold-sext-trunc.ll
+++ b/test/CodeGen/X86/fold-sext-trunc.ll
@@ -13,7 +13,7 @@ define void @int322(i32 %foo) !dbg !5 {
 entry:
   %val = load i64, i64* getelementptr (%0, %0* bitcast (%struct.S1* @g_10 to %0*), i32 0, i32 0), !dbg !16
   %0 = load i32, i32* getelementptr inbounds (%struct.S1, %struct.S1* @g_10, i32 0, i32 1), align 4, !dbg !17
-; MIR: renamable $rax = MOVSX64rm32 {{.*}}, @g_10 + 4,{{.*}} debug-location !17 :: (dereferenceable load 4 from `i64* getelementptr (%0, %0* bitcast (%struct.S1* @g_10 to %0*), i32 0, i32 0)` + 4)
+; MIR: renamable {{\$r[a-z]+}} = MOVSX64rm32 {{.*}}, @g_10 + 4,{{.*}} debug-location !17 :: (dereferenceable load 4 from `i64* getelementptr (%0, %0* bitcast (%struct.S1* @g_10 to %0*), i32 0, i32 0)` + 4)
   %1 = sext i32 %0 to i64, !dbg !18
   %tmp4.i = lshr i64 %val, 32, !dbg !19
   %tmp5.i = trunc i64 %tmp4.i to i32, !dbg !20
diff --git a/test/CodeGen/X86/pr28489.ll b/test/CodeGen/X86/pr28489.ll
index 898b0870b65..8ab3fbb9d91 100644
--- a/test/CodeGen/X86/pr28489.ll
+++ b/test/CodeGen/X86/pr28489.ll
@@ -3,8 +3,8 @@ declare void @g(i32, i1)
 
 ;CHECK-LABEL: f:
 ;CHECK: cmpxchg8b
-;CHECK: sete %cl
-;CHECK: movzbl %cl
+;CHECK: sete [[REG:%[abcd]l]]
+;CHECK: movzbl [[REG]]
 define void @f(i64* %arg, i64 %arg1) {
 entry:
   %tmp5 = cmpxchg i64* %arg, i64 %arg1, i64 %arg1 seq_cst seq_cst
diff --git a/test/CodeGen/X86/pr30813.ll b/test/CodeGen/X86/pr30813.ll
index b830f1e04b7..e3e096bda6c 100644
--- a/test/CodeGen/X86/pr30813.ll
+++ b/test/CodeGen/X86/pr30813.ll
@@ -1,9 +1,9 @@
 ; RUN: llc -mtriple=x86_64-linux-gnu -O0 %s -o - | FileCheck %s
 ; CHECK: patatino:
 ; CHECK:         .cfi_startproc
-; CHECK:         movzwl  (%rax), %ecx
-; CHECK:         movl    %ecx, %eax
-; CHECK:         movq    %rax, (%rdx)
+; CHECK:         movzwl  (%rax), [[REG0:%e[abcd]x]]
+; CHECK:         movl    [[REG0]], %e[[REG1C:[abcd]]]x
+; CHECK:         movq    %r[[REG1C]]x, ({{%r[abcd]x}})
 ; CHECK:         retq
 
 define void @patatino() {
diff --git a/test/CodeGen/X86/shift-i256.ll b/test/CodeGen/X86/shift-i256.ll
index 4fa3303baf0..9947d45649d 100644
--- a/test/CodeGen/X86/shift-i256.ll
+++ b/test/CodeGen/X86/shift-i256.ll
@@ -15,7 +15,7 @@ define i256 @shift2(i256 %c) nounwind
 {
   %b = shl i256 1, %c  ; %c must not be a constant
   ; Special case when %c is 0:
-  ; CHECK-X64: testb [[REG:%r[0-9]+b]], [[REG]]
+  ; CHECK-X64: testb [[REG:%(bpl|r[0-9]+b)]], {{%(bpl|r[0-9]+b)}}
   ; CHECK-X64: cmoveq
   ret i256 %b
 }
diff --git a/test/CodeGen/X86/switch.ll b/test/CodeGen/X86/switch.ll
index 95b2ed0e618..a0ebc4eeba2 100644
--- a/test/CodeGen/X86/switch.ll
+++ b/test/CodeGen/X86/switch.ll
@@ -318,15 +318,15 @@ return: ret void
 ; NOOPT-LABEL: optimal_jump_table1
 ; NOOPT: testl %edi, %edi
 ; NOOPT: je
-; NOOPT: subl $5, %eax
+; NOOPT: subl $5, [[REG:%e[abcd][xi]]]
 ; NOOPT: je
-; NOOPT: subl $6, %eax
+; NOOPT: subl $6, [[REG]]
 ; NOOPT: je
-; NOOPT: subl $12, %eax
+; NOOPT: subl $12, [[REG]]
 ; NOOPT: je
-; NOOPT: subl $13, %eax
+; NOOPT: subl $13, [[REG]]
 ; NOOPT: je
-; NOOPT: subl $15, %eax
+; NOOPT: subl $15, [[REG]]
 ; NOOPT: je
 }
 
diff --git a/test/CodeGen/X86/win32_sret.ll b/test/CodeGen/X86/win32_sret.ll
index 70fa22bb8ba..ea164ae28f8 100644
--- a/test/CodeGen/X86/win32_sret.ll
+++ b/test/CodeGen/X86/win32_sret.ll
@@ -16,7 +16,7 @@
 define void @sret1(i8* sret %x) nounwind {
 entry:
 ; WIN32-LABEL:      _sret1:
-; WIN32:      movb $42, (%eax)
+; WIN32:      movb $42, ({{%e[abcd]x}})
 ; WIN32-NOT:  popl %eax
 ; WIN32:    {{retl$}}
 
@@ -36,7 +36,7 @@ entry:
 define void @sret2(i8* sret %x, i8 %y) nounwind {
 entry:
 ; WIN32-LABEL:      _sret2:
-; WIN32:      movb {{.*}}, (%eax)
+; WIN32:      movb {{.*}}, ({{%e[abcd]x}})
 ; WIN32-NOT:  popl %eax
 ; WIN32:    {{retl$}}
 
@@ -56,8 +56,8 @@ entry:
 define void @sret3(i8* sret %x, i8* %y) nounwind {
 entry:
 ; WIN32-LABEL:      _sret3:
-; WIN32:      movb $42, (%eax)
-; WIN32-NOT:  movb $13, (%eax)
+; WIN32:      movb $42, ([[REG1:%e[abcd]x]])
+; WIN32-NOT:  movb $13, ([[REG1]])
 ; WIN32-NOT:  popl %eax
 ; WIN32:    {{retl$}}
 
@@ -81,7 +81,7 @@ entry:
 define void @sret4(%struct.S4* noalias sret %agg.result) {
 entry:
 ; WIN32-LABEL:     _sret4:
-; WIN32:     movl $42, (%eax)
+; WIN32:     movl $42, ({{%e[abcd]x}})
 ; WIN32-NOT: popl %eax
 ; WIN32:   {{retl$}}
 
@@ -118,8 +118,8 @@ entry:
 ; The address of the return structure is passed as an implicit parameter.
 ; In the -O0 build, %eax is spilled at the beginning of the function, hence we
 ; should match both 4(%esp) and 8(%esp).
-; WIN32:     {{[48]}}(%esp), %eax
-; WIN32:     movl $42, (%eax)
+; WIN32:     {{[48]}}(%esp), [[REG:%e[abcd]x]]
+; WIN32:     movl $42, ([[REG]])
 ; WIN32:     retl $4
 }
 
@@ -230,8 +230,8 @@ define void @test8_f(i64 inreg %a, i64* sret %out) {
 
 ; WIN32-LABEL: _test8_f:
 ; WIN32: movl {{[0-9]+}}(%esp), %[[out:[a-z]+]]
-; WIN32-DAG: movl %edx, 4(%[[out]])
-; WIN32-DAG: movl %eax, (%[[out]])
+; WIN32-DAG: movl {{%e[abcd]x}}, 4(%[[out]])
+; WIN32-DAG: movl {{%e[abcd]x}}, (%[[out]])
 ; WIN32: calll _clobber_eax
 ; WIN32: movl {{.*}}, %eax
 ; WIN32: retl
diff --git a/test/CodeGen/XCore/dwarf_debug.ll b/test/CodeGen/XCore/dwarf_debug.ll
index 4efd73e40a7..e0a75d25373 100644
--- a/test/CodeGen/XCore/dwarf_debug.ll
+++ b/test/CodeGen/XCore/dwarf_debug.ll
@@ -4,11 +4,11 @@
 ; target triple = "xcore"
 
 ; CHECK-LABEL: f
-; CHECK: entsp 2
+; CHECK: entsp [[S:[0-9]+]]
 ; ...the prologue...
 ; CHECK: .loc 1 2 0 prologue_end      # test.c:2:0
 ; CHECK: add r0, r0, 1
-; CHECK: retsp 2
+; CHECK: retsp [[S]]
 define i32 @f(i32 %a) !dbg !4 {
 entry:
   %a.addr = alloca i32, align 4
diff --git a/test/DebugInfo/X86/parameters.ll b/test/DebugInfo/X86/parameters.ll
index 7a5b852bde2..ed0048cc15e 100644
--- a/test/DebugInfo/X86/parameters.ll
+++ b/test/DebugInfo/X86/parameters.ll
@@ -1,7 +1,6 @@
 ; REQUIRES: object-emission
 ;
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -O0 -filetype=obj < %s > %t
-; RUN: llvm-dwarfdump -v -debug-info %t | FileCheck %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -O0 -filetype=obj %s -o - | llvm-dwarfdump -v -debug-info - | FileCheck %s
 
 ; Test case derived from compiling the following source with clang -g:
 ;
diff --git a/test/DebugInfo/X86/pieces-1.ll b/test/DebugInfo/X86/pieces-1.ll
index f91aec19f38..02b45d11fdc 100644
--- a/test/DebugInfo/X86/pieces-1.ll
+++ b/test/DebugInfo/X86/pieces-1.ll
@@ -1,5 +1,4 @@
-; RUN: llc -O0 %s -filetype=obj -o %t.o
-; RUN: llvm-dwarfdump -debug-loc %t.o | FileCheck %s
+; RUN: llc -O0 %s -filetype=obj -o - | llvm-dwarfdump -debug-loc - | FileCheck %s
 ;
 ; rdar://problem/15928306
 ;
diff --git a/test/DebugInfo/X86/sdag-split-arg.ll b/test/DebugInfo/X86/sdag-split-arg.ll
index 31cb678e8e1..745c1c366af 100644
--- a/test/DebugInfo/X86/sdag-split-arg.ll
+++ b/test/DebugInfo/X86/sdag-split-arg.ll
@@ -1,10 +1,10 @@
 ; RUN: llc -O0 -filetype=asm %s -o - | FileCheck %s
 ; Test large integral function arguments passed in multiple registers.
-; CHECK: DEBUG_VALUE: foo:bar <- [DW_OP_LLVM_fragment 64 16] $ax
-; CHECK: DEBUG_VALUE: foo:bar <- [DW_OP_LLVM_fragment 48 16] $r9w
-; CHECK: DEBUG_VALUE: foo:bar <- [DW_OP_LLVM_fragment 32 16] $r10w
-; CHECK: DEBUG_VALUE: foo:bar <- [DW_OP_LLVM_fragment 16 16] $r11w
-; CHECK: DEBUG_VALUE: foo:bar <- [DW_OP_LLVM_fragment 0 16] $bx
+; CHECK: DEBUG_VALUE: foo:bar <- [DW_OP_LLVM_fragment 64 16] ${{([a-d]x)|(si)|(di)|(bp)|(r[0-9]+w)}}
+; CHECK: DEBUG_VALUE: foo:bar <- [DW_OP_LLVM_fragment 48 16] ${{([a-d]x)|(si)|(di)|(bp)|(r[0-9]+w)}}
+; CHECK: DEBUG_VALUE: foo:bar <- [DW_OP_LLVM_fragment 32 16] ${{([a-d]x)|(si)|(di)|(bp)|(r[0-9]+w)}}
+; CHECK: DEBUG_VALUE: foo:bar <- [DW_OP_LLVM_fragment 16 16] ${{([a-d]x)|(si)|(di)|(bp)|(r[0-9]+w)}}
+; CHECK: DEBUG_VALUE: foo:bar <- [DW_OP_LLVM_fragment 0 16]  ${{([a-d]x)|(si)|(di)|(bp)|(r[0-9]+w)}}
 
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-unknown"
diff --git a/test/DebugInfo/X86/vla.ll b/test/DebugInfo/X86/vla.ll
index 6713d86769d..7d4aff8470d 100644
--- a/test/DebugInfo/X86/vla.ll
+++ b/test/DebugInfo/X86/vla.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -O0 -mtriple=x86_64-apple-darwin -filetype=asm %s -o - | FileCheck %s
 ; Ensure that we generate an indirect location for the variable length array a.
-; CHECK: ##DEBUG_VALUE: vla:a <- [DW_OP_deref] [$rcx+0]
-; CHECK: DW_OP_breg2
+; CHECK: ##DEBUG_VALUE: vla:a <- [DW_OP_deref] [{{\$r[a-z]+}}+0]
+; CHECK: DW_OP_breg{{[0-9]}}
 ; rdar://problem/13658587
 ;
 ; generated from:
-- 
GitLab


From 53e05d372d0b294e23a94b1a798f68270381572f Mon Sep 17 00:00:00 2001
From: Jessica Paquette <jpaquette@apple.com>
Date: Mon, 29 Oct 2018 20:27:07 +0000
Subject: [PATCH 0714/1116] [MachineOutliner] Inherit target features from
 parent function

If a function has target features, it may contain instructions that aren't
represented in the default set of instructions. If the outliner pulls out one
of these instructions, and the function doesn't have the right attributes
attached, we'll run into an LLVM error explaining that the target doesn't
support the necessary feature for the instruction.

This makes outlined functions inherit target features from their parents.

It also updates the machine-outliner.ll test to check that we're properly
inheriting target features.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345535 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/MachineOutliner.cpp          |  8 ++++++++
 test/CodeGen/AArch64/machine-outliner.ll | 12 +++++++++++-
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/lib/CodeGen/MachineOutliner.cpp b/lib/CodeGen/MachineOutliner.cpp
index 4b65d971a78..00856361db0 100644
--- a/lib/CodeGen/MachineOutliner.cpp
+++ b/lib/CodeGen/MachineOutliner.cpp
@@ -1202,6 +1202,14 @@ MachineOutliner::createOutlinedFunction(Module &M, const OutlinedFunction &OF,
   F->addFnAttr(Attribute::OptimizeForSize);
   F->addFnAttr(Attribute::MinSize);
 
+  // Include target features from an arbitrary candidate for the outlined
+  // function. This makes sure the outlined function knows what kinds of
+  // instructions are going into it. This is fine, since all parent functions
+  // must necessarily support the instructions that are in the outlined region.
+  const Function &ParentFn = OF.Candidates.front()->getMF()->getFunction();
+  if (ParentFn.hasFnAttribute("target-features"))
+    F->addFnAttr(ParentFn.getFnAttribute("target-features"));
+
   BasicBlock *EntryBB = BasicBlock::Create(C, "entry", F);
   IRBuilder<> Builder(EntryBB);
   Builder.CreateRetVoid();
diff --git a/test/CodeGen/AArch64/machine-outliner.ll b/test/CodeGen/AArch64/machine-outliner.ll
index 9d922c27f88..19be14d8d39 100644
--- a/test/CodeGen/AArch64/machine-outliner.ll
+++ b/test/CodeGen/AArch64/machine-outliner.ll
@@ -1,6 +1,16 @@
 ; RUN: llc -verify-machineinstrs -enable-machine-outliner -mtriple=aarch64-apple-darwin < %s | FileCheck %s
 ; RUN: llc -verify-machineinstrs -enable-machine-outliner -mtriple=aarch64-apple-darwin -mcpu=cortex-a53 -enable-misched=false < %s | FileCheck %s
 ; RUN: llc -verify-machineinstrs -enable-machine-outliner -enable-linkonceodr-outlining -mtriple=aarch64-apple-darwin < %s | FileCheck %s -check-prefix=ODR
+; RUN: llc -verify-machineinstrs -enable-machine-outliner -mtriple=aarch64-apple-darwin -stop-after=machine-outliner < %s | FileCheck %s -check-prefix=TARGET_FEATURES
+
+; Make sure that we inherit target features from functions and make sure we have
+; the right function attributes.
+; TARGET_FEATURES: define internal void @OUTLINED_FUNCTION_{{[0-9]+}}()
+; TARGET_FEATURES-SAME: #[[ATTR_NUM:[0-9]+]]
+; TARGET_FEATURES-DAG: attributes #[[ATTR_NUM]] = {
+; TARGET_FEATURES-SAME: minsize
+; TARGET_FEATURES-SAME: optsize
+; TARGET_FEATURES-SAME: "target-features"="+sse"
 
 define linkonce_odr void @fish() #0 {
   ; CHECK-LABEL: _fish:
@@ -95,4 +105,4 @@ define void @dog() #0 {
 ; CHECK-NEXT: str     w8, [sp, #8]
 ; CHECK-NEXT: ret
 
-attributes #0 = { noredzone "target-cpu"="cyclone" }
+attributes #0 = { noredzone "target-cpu"="cyclone" "target-features"="+sse" }
-- 
GitLab


From fb954c0ce162320edb37c787a1b16fb5856692de Mon Sep 17 00:00:00 2001
From: Fedor Sergeev <fedor.sergeev@azul.com>
Date: Mon, 29 Oct 2018 20:38:23 +0000
Subject: [PATCH 0715/1116] [LoopUnroll] NFC. Factor out runtime-loop.ll common
 test behavior.

Adding COMMON prefix to get common part handled there.
Needed to simplify test changes for D53440.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345538 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Transforms/LoopUnroll/runtime-loop.ll | 43 ++++++++++++----------
 1 file changed, 24 insertions(+), 19 deletions(-)

diff --git a/test/Transforms/LoopUnroll/runtime-loop.ll b/test/Transforms/LoopUnroll/runtime-loop.ll
index 720d0d76e4e..34eaa4ec333 100644
--- a/test/Transforms/LoopUnroll/runtime-loop.ll
+++ b/test/Transforms/LoopUnroll/runtime-loop.ll
@@ -1,13 +1,15 @@
-; RUN: opt < %s -S -loop-unroll -unroll-runtime=true -unroll-runtime-epilog=true  | FileCheck %s -check-prefix=EPILOG
-; RUN: opt < %s -S -loop-unroll -unroll-runtime=true -unroll-runtime-epilog=false | FileCheck %s -check-prefix=PROLOG
+; RUN: opt < %s -S -loop-unroll -unroll-runtime=true -unroll-runtime-epilog=true  | FileCheck %s -check-prefixes=EPILOG,COMMON
+; RUN: opt < %s -S -loop-unroll -unroll-runtime=true -unroll-runtime-epilog=false | FileCheck %s -check-prefixes=PROLOG,COMMON
 
-; RUN: opt < %s -S -passes='require<opt-remark-emit>,unroll' -unroll-runtime=true -unroll-runtime-epilog=true  | FileCheck %s -check-prefix=EPILOG
-; RUN: opt < %s -S -passes='require<opt-remark-emit>,unroll' -unroll-runtime=true -unroll-runtime-epilog=false | FileCheck %s -check-prefix=PROLOG
+; RUN: opt < %s -S -passes='require<opt-remark-emit>,unroll' -unroll-runtime=true -unroll-runtime-epilog=true  | FileCheck %s -check-prefixes=EPILOG,COMMON
+; RUN: opt < %s -S -passes='require<opt-remark-emit>,unroll' -unroll-runtime=true -unroll-runtime-epilog=false | FileCheck %s -check-prefixes=PROLOG,COMMON
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; Tests for unrolling loops with run-time trip counts
 
+; COMMON-LABEL: @test(
+
 ; EPILOG: %xtraiter = and i32 %n
 ; EPILOG:  %lcmp.mod = icmp ne i32 %xtraiter, 0
 ; EPILOG:  br i1 %lcmp.mod, label %for.body.epil.preheader, label %for.end.loopexit
@@ -54,11 +56,10 @@ for.end:                                          ; preds = %for.body, %entry
 ; Still try to completely unroll loops with compile-time trip counts
 ; even if the -unroll-runtime is specified
 
-; EPILOG: for.body:
-; EPILOG-NOT: for.body.epil:
-
-; PROLOG: for.body:
-; PROLOG-NOT: for.body.prol:
+; COMMON-LABEL: @test1(
+; COMMON: for.body:
+; COMMON-NOT: for.body.epil:
+; COMMON-NOT: for.body.prol:
 
 define i32 @test1(i32* nocapture %a) nounwind uwtable readonly {
 entry:
@@ -82,6 +83,7 @@ for.end:                                          ; preds = %for.body
 ; This is test 2007-05-09-UnknownTripCount.ll which can be unrolled now
 ; if the -unroll-runtime option is turned on
 
+; COMMON-LABEL: @foo(
 ; EPILOG: bb72.2:
 ; PROLOG: bb72.2:
 
@@ -105,6 +107,7 @@ cond_true138:
 
 ; Test run-time unrolling for a loop that counts down by -2.
 
+; COMMON-LABEL: @down(
 ; EPILOG: for.body.epil:
 ; EPILOG: br i1 %epil.iter.cmp, label %for.body.epil, label %for.cond.for.end_crit_edge.epilog-lcssa
 
@@ -138,6 +141,8 @@ for.end:                                          ; preds = %for.cond.for.end_cr
 }
 
 ; Test run-time unrolling disable metadata.
+; COMMON-LABEL: @test2(
+
 ; EPILOG: for.body:
 ; EPILOG-NOT: for.body.epil:
 
@@ -174,11 +179,9 @@ for.end:                                          ; preds = %for.cond.for.end_cr
 ; -runtime-unroll-multi-exit=true
 ; single exit, multiple exiting blocks.
 define void @unique_exit(i32 %arg) {
-; PROLOG: unique_exit(
-; PROLOG-NOT: .unr
+; COMMON-LABEL: @unique_exit(
+; COMMON-NOT: .unr
 
-; EPILOG: unique_exit(
-; EPILOG-NOT: .unr
 entry:
   %tmp = icmp sgt i32 undef, %arg
   br i1 %tmp, label %preheader, label %returnblock
@@ -206,11 +209,9 @@ latch:                                            ; preds = %header
 
 ; multiple exit blocks. don't unroll
 define void @multi_exit(i64 %trip, i1 %cond) {
-; PROLOG: multi_exit(
-; PROLOG-NOT: .unr
+; COMMON-LABEL: @multi_exit(
+; COMMON-NOT: .unr
 
-; EPILOG: multi_exit(
-; EPILOG-NOT: .unr
 entry:
   br label %loop_header
 
@@ -238,11 +239,15 @@ exit1:
 exit2.loopexit:
   ret void
 }
+
 !0 = distinct !{!0, !1}
 !1 = !{!"llvm.loop.unroll.runtime.disable"}
 
-; EPILOG: !0 = distinct !{!0, !1}
+; need to use LABEL here to separate function IR matching from metadata matching
+; COMMON-LABEL: {{^}}!0 =
+
+; EPILOG-SAME: distinct !{!0, !1}
 ; EPILOG: !1 = !{!"llvm.loop.unroll.disable"}
 
-; PROLOG: !0 = distinct !{!0, !1}
+; PROLOG-SAME: distinct !{!0, !1}
 ; PROLOG: !1 = !{!"llvm.loop.unroll.disable"}
-- 
GitLab


From d06dc588fe6d44648e127c08facbd04d00c60077 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Mon, 29 Oct 2018 21:05:41 +0000
Subject: [PATCH 0716/1116] [InstSimplify] add tests for abs/nabs+icmp folding;
 NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345541 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Transforms/InstSimplify/icmp-abs-nabs.ll | 401 ++++++++++++++++++
 1 file changed, 401 insertions(+)
 create mode 100644 test/Transforms/InstSimplify/icmp-abs-nabs.ll

diff --git a/test/Transforms/InstSimplify/icmp-abs-nabs.ll b/test/Transforms/InstSimplify/icmp-abs-nabs.ll
new file mode 100644
index 00000000000..1cb312bf0da
--- /dev/null
+++ b/test/Transforms/InstSimplify/icmp-abs-nabs.ll
@@ -0,0 +1,401 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instsimplify -S | FileCheck %s
+
+; This is canonical form for this IR.
+
+define i1 @abs_nsw_is_positive(i32 %x) {
+; CHECK-LABEL: @abs_nsw_is_positive(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[NEGX:%.*]] = sub nsw i32 0, [[X]]
+; CHECK-NEXT:    [[ABS:%.*]] = select i1 [[CMP]], i32 [[NEGX]], i32 [[X]]
+; CHECK-NEXT:    [[R:%.*]] = icmp sgt i32 [[ABS]], -1
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cmp = icmp slt i32 %x, 0
+  %negx = sub nsw i32 0, %x
+  %abs = select i1 %cmp, i32 %negx, i32 %x
+  %r = icmp sgt i32 %abs, -1
+  ret i1 %r
+}
+
+; Test non-canonical predicate and non-canonical form of abs().
+
+define i1 @abs_nsw_is_positive_sge(i32 %x) {
+; CHECK-LABEL: @abs_nsw_is_positive_sge(
+; CHECK-NEXT:    ret i1 true
+;
+  %cmp = icmp slt i32 %x, 1
+  %negx = sub nsw i32 0, %x
+  %abs = select i1 %cmp, i32 %negx, i32 %x
+  %r = icmp sge i32 %abs, 0
+  ret i1 %r
+}
+
+; This is a range-based analysis. Any negative constant works.
+
+define i1 @abs_nsw_is_positive_reduced_range(i32 %x) {
+; CHECK-LABEL: @abs_nsw_is_positive_reduced_range(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[NEGX:%.*]] = sub nsw i32 0, [[X]]
+; CHECK-NEXT:    [[ABS:%.*]] = select i1 [[CMP]], i32 [[NEGX]], i32 [[X]]
+; CHECK-NEXT:    [[R:%.*]] = icmp sgt i32 [[ABS]], -42
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cmp = icmp slt i32 %x, 0
+  %negx = sub nsw i32 0, %x
+  %abs = select i1 %cmp, i32 %negx, i32 %x
+  %r = icmp sgt i32 %abs, -42
+  ret i1 %r
+}
+
+; Negative test - we need 'nsw' in the abs().
+
+define i1 @abs_is_positive_reduced_range(i32 %x) {
+; CHECK-LABEL: @abs_is_positive_reduced_range(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[NEGX:%.*]] = sub i32 0, [[X]]
+; CHECK-NEXT:    [[ABS:%.*]] = select i1 [[CMP]], i32 [[NEGX]], i32 [[X]]
+; CHECK-NEXT:    [[R:%.*]] = icmp sgt i32 [[ABS]], 42
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cmp = icmp slt i32 %x, 0
+  %negx = sub i32 0, %x
+  %abs = select i1 %cmp, i32 %negx, i32 %x
+  %r = icmp sgt i32 %abs, 42
+  ret i1 %r
+}
+
+; Negative test - range intersection is not subset.
+
+define i1 @abs_nsw_is_positive_wrong_range(i32 %x) {
+; CHECK-LABEL: @abs_nsw_is_positive_wrong_range(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[NEGX:%.*]] = sub nsw i32 0, [[X]]
+; CHECK-NEXT:    [[ABS:%.*]] = select i1 [[CMP]], i32 [[NEGX]], i32 [[X]]
+; CHECK-NEXT:    [[R:%.*]] = icmp sgt i32 [[ABS]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cmp = icmp slt i32 %x, 0
+  %negx = sub nsw i32 0, %x
+  %abs = select i1 %cmp, i32 %negx, i32 %x
+  %r = icmp sgt i32 %abs, 0
+  ret i1 %r
+}
+
+; This is canonical form for this IR.
+
+define i1 @abs_nsw_is_not_negative(i32 %x) {
+; CHECK-LABEL: @abs_nsw_is_not_negative(
+; CHECK-NEXT:    ret i1 false
+;
+  %cmp = icmp slt i32 %x, 0
+  %negx = sub nsw i32 0, %x
+  %abs = select i1 %cmp, i32 %negx, i32 %x
+  %r = icmp slt i32 %abs, 0
+  ret i1 %r
+}
+
+; Test non-canonical predicate and non-canonical form of abs().
+
+define i1 @abs_nsw_is_not_negative_sle(i32 %x) {
+; CHECK-LABEL: @abs_nsw_is_not_negative_sle(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[NEGX:%.*]] = sub nsw i32 0, [[X]]
+; CHECK-NEXT:    [[ABS:%.*]] = select i1 [[CMP]], i32 [[NEGX]], i32 [[X]]
+; CHECK-NEXT:    [[R:%.*]] = icmp sle i32 [[ABS]], -1
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cmp = icmp slt i32 %x, 1
+  %negx = sub nsw i32 0, %x
+  %abs = select i1 %cmp, i32 %negx, i32 %x
+  %r = icmp sle i32 %abs, -1
+  ret i1 %r
+}
+
+; This is a range-based analysis. Any negative constant works.
+
+define i1 @abs_nsw_is_not_negative_reduced_range(i32 %x) {
+; CHECK-LABEL: @abs_nsw_is_not_negative_reduced_range(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[NEGX:%.*]] = sub nsw i32 0, [[X]]
+; CHECK-NEXT:    [[ABS:%.*]] = select i1 [[CMP]], i32 [[NEGX]], i32 [[X]]
+; CHECK-NEXT:    [[R:%.*]] = icmp slt i32 [[ABS]], -24
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cmp = icmp slt i32 %x, 0
+  %negx = sub nsw i32 0, %x
+  %abs = select i1 %cmp, i32 %negx, i32 %x
+  %r = icmp slt i32 %abs, -24
+  ret i1 %r
+}
+
+; Negative test - we need 'nsw' in the abs().
+
+define i1 @abs_is_not_negative_reduced_range(i32 %x) {
+; CHECK-LABEL: @abs_is_not_negative_reduced_range(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[NEGX:%.*]] = sub i32 0, [[X]]
+; CHECK-NEXT:    [[ABS:%.*]] = select i1 [[CMP]], i32 [[NEGX]], i32 [[X]]
+; CHECK-NEXT:    [[R:%.*]] = icmp slt i32 [[ABS]], 42
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cmp = icmp slt i32 %x, 0
+  %negx = sub i32 0, %x
+  %abs = select i1 %cmp, i32 %negx, i32 %x
+  %r = icmp slt i32 %abs, 42
+  ret i1 %r
+}
+
+; Negative test - range intersection is not empty.
+
+define i1 @abs_nsw_is_not_negative_wrong_range(i32 %x) {
+; CHECK-LABEL: @abs_nsw_is_not_negative_wrong_range(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[NEGX:%.*]] = sub nsw i32 0, [[X]]
+; CHECK-NEXT:    [[ABS:%.*]] = select i1 [[CMP]], i32 [[NEGX]], i32 [[X]]
+; CHECK-NEXT:    [[R:%.*]] = icmp sle i32 [[ABS]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cmp = icmp slt i32 %x, 0
+  %negx = sub nsw i32 0, %x
+  %abs = select i1 %cmp, i32 %negx, i32 %x
+  %r = icmp sle i32 %abs, 0
+  ret i1 %r
+}
+
+; This is canonical form for this IR. For nabs(), we don't require 'nsw'
+
+define i1 @nabs_is_negative_or_0(i32 %x) {
+; CHECK-LABEL: @nabs_is_negative_or_0(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[NEGX:%.*]] = sub i32 0, [[X]]
+; CHECK-NEXT:    [[NABS:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[NEGX]]
+; CHECK-NEXT:    [[R:%.*]] = icmp slt i32 [[NABS]], 1
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cmp = icmp slt i32 %x, 0
+  %negx = sub i32 0, %x
+  %nabs = select i1 %cmp, i32 %x, i32 %negx
+  %r = icmp slt i32 %nabs, 1
+  ret i1 %r
+}
+
+; Test non-canonical predicate and non-canonical form of nabs().
+
+define i1 @nabs_is_negative_or_0_sle(i32 %x) {
+; CHECK-LABEL: @nabs_is_negative_or_0_sle(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[NEGX:%.*]] = sub i32 0, [[X]]
+; CHECK-NEXT:    [[NABS:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[NEGX]]
+; CHECK-NEXT:    [[R:%.*]] = icmp sle i32 [[NABS]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cmp = icmp slt i32 %x, 1
+  %negx = sub i32 0, %x
+  %nabs = select i1 %cmp, i32 %x, i32 %negx
+  %r = icmp sle i32 %nabs, 0
+  ret i1 %r
+}
+
+; This is a range-based analysis. Any positive constant works.
+
+define i1 @nabs_is_negative_or_0_reduced_range(i32 %x) {
+; CHECK-LABEL: @nabs_is_negative_or_0_reduced_range(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[NEGX:%.*]] = sub i32 0, [[X]]
+; CHECK-NEXT:    [[NABS:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[NEGX]]
+; CHECK-NEXT:    [[R:%.*]] = icmp slt i32 [[NABS]], 421
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cmp = icmp slt i32 %x, 1
+  %negx = sub i32 0, %x
+  %nabs = select i1 %cmp, i32 %x, i32 %negx
+  %r = icmp slt i32 %nabs, 421
+  ret i1 %r
+}
+
+; Negative test - range intersection is not subset.
+
+define i1 @nabs_is_negative_or_0_wrong_range(i32 %x) {
+; CHECK-LABEL: @nabs_is_negative_or_0_wrong_range(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[NEGX:%.*]] = sub i32 0, [[X]]
+; CHECK-NEXT:    [[NABS:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[NEGX]]
+; CHECK-NEXT:    [[R:%.*]] = icmp slt i32 [[NABS]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cmp = icmp slt i32 %x, 1
+  %negx = sub i32 0, %x
+  %nabs = select i1 %cmp, i32 %x, i32 %negx
+  %r = icmp slt i32 %nabs, 0
+  ret i1 %r
+}
+
+; This is canonical form for this IR. For nabs(), we don't require 'nsw'
+
+define i1 @nabs_is_not_over_0(i32 %x) {
+; CHECK-LABEL: @nabs_is_not_over_0(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[NEGX:%.*]] = sub i32 0, [[X]]
+; CHECK-NEXT:    [[NABS:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[NEGX]]
+; CHECK-NEXT:    [[R:%.*]] = icmp sgt i32 [[NABS]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cmp = icmp slt i32 %x, 0
+  %negx = sub i32 0, %x
+  %nabs = select i1 %cmp, i32 %x, i32 %negx
+  %r = icmp sgt i32 %nabs, 0
+  ret i1 %r
+}
+
+; Test non-canonical predicate and non-canonical form of nabs().
+
+define i1 @nabs_is_not_over_0_sle(i32 %x) {
+; CHECK-LABEL: @nabs_is_not_over_0_sle(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[NEGX:%.*]] = sub i32 0, [[X]]
+; CHECK-NEXT:    [[NABS:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[NEGX]]
+; CHECK-NEXT:    [[R:%.*]] = icmp sge i32 [[NABS]], 1
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cmp = icmp slt i32 %x, 1
+  %negx = sub i32 0, %x
+  %nabs = select i1 %cmp, i32 %x, i32 %negx
+  %r = icmp sge i32 %nabs, 1
+  ret i1 %r
+}
+
+; This is a range-based analysis. Any positive constant works.
+
+define i1 @nabs_is_not_over_0_reduced_range(i32 %x) {
+; CHECK-LABEL: @nabs_is_not_over_0_reduced_range(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[NEGX:%.*]] = sub i32 0, [[X]]
+; CHECK-NEXT:    [[NABS:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[NEGX]]
+; CHECK-NEXT:    [[R:%.*]] = icmp sgt i32 [[NABS]], 4223
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cmp = icmp slt i32 %x, 1
+  %negx = sub i32 0, %x
+  %nabs = select i1 %cmp, i32 %x, i32 %negx
+  %r = icmp sgt i32 %nabs, 4223
+  ret i1 %r
+}
+
+; Negative test - range intersection is not subset.
+
+define i1 @nabs_is_not_over_0_wrong_range(i32 %x) {
+; CHECK-LABEL: @nabs_is_not_over_0_wrong_range(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[NEGX:%.*]] = sub i32 0, [[X]]
+; CHECK-NEXT:    [[NABS:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[NEGX]]
+; CHECK-NEXT:    [[R:%.*]] = icmp sgt i32 [[NABS]], -1
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cmp = icmp slt i32 %x, 1
+  %negx = sub i32 0, %x
+  %nabs = select i1 %cmp, i32 %x, i32 %negx
+  %r = icmp sgt i32 %nabs, -1
+  ret i1 %r
+}
+
+; More miscellaneous tests for predicates/types.
+
+; Equality predicates are ok.
+
+define i1 @abs_nsw_is_positive_eq(i32 %x) {
+; CHECK-LABEL: @abs_nsw_is_positive_eq(
+; CHECK-NEXT:    ret i1 false
+;
+  %cmp = icmp slt i32 %x, 1
+  %negx = sub nsw i32 0, %x
+  %abs = select i1 %cmp, i32 %negx, i32 %x
+  %r = icmp eq i32 %abs, -8
+  ret i1 %r
+}
+
+; An unsigned compare may work.
+
+define i1 @abs_nsw_is_positive_ult(i8 %x) {
+; CHECK-LABEL: @abs_nsw_is_positive_ult(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i8 [[X:%.*]], 0
+; CHECK-NEXT:    [[NEGX:%.*]] = sub nsw i8 0, [[X]]
+; CHECK-NEXT:    [[ABS:%.*]] = select i1 [[CMP]], i8 [[NEGX]], i8 [[X]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ult i8 [[ABS]], -117
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cmp = icmp slt i8 %x, 0
+  %negx = sub nsw i8 0, %x
+  %abs = select i1 %cmp, i8 %negx, i8 %x
+  %r = icmp ult i8 %abs, 139
+  ret i1 %r
+}
+
+; An unsigned compare may work.
+
+define i1 @abs_nsw_is_not_negative_ugt(i8 %x) {
+; CHECK-LABEL: @abs_nsw_is_not_negative_ugt(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i8 [[X:%.*]], 0
+; CHECK-NEXT:    [[NEGX:%.*]] = sub nsw i8 0, [[X]]
+; CHECK-NEXT:    [[ABS:%.*]] = select i1 [[CMP]], i8 [[NEGX]], i8 [[X]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ugt i8 [[ABS]], 127
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cmp = icmp slt i8 %x, 0
+  %negx = sub nsw i8 0, %x
+  %abs = select i1 %cmp, i8 %negx, i8 %x
+  %r = icmp ugt i8 %abs, 127
+  ret i1 %r
+}
+
+; Vector types are ok.
+
+define <2 x i1> @abs_nsw_is_not_negative_vec_splat(<2 x i32> %x) {
+; CHECK-LABEL: @abs_nsw_is_not_negative_vec_splat(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt <2 x i32> [[X:%.*]], zeroinitializer
+; CHECK-NEXT:    [[NEGX:%.*]] = sub nsw <2 x i32> zeroinitializer, [[X]]
+; CHECK-NEXT:    [[ABS:%.*]] = select <2 x i1> [[CMP]], <2 x i32> [[NEGX]], <2 x i32> [[X]]
+; CHECK-NEXT:    [[R:%.*]] = icmp slt <2 x i32> [[ABS]], <i32 -8, i32 -8>
+; CHECK-NEXT:    ret <2 x i1> [[R]]
+;
+  %cmp = icmp slt <2 x i32> %x, zeroinitializer
+  %negx = sub nsw <2 x i32> zeroinitializer, %x
+  %abs = select <2 x i1> %cmp, <2 x i32> %negx, <2 x i32> %x
+  %r = icmp slt <2 x i32> %abs, <i32 -8, i32 -8>
+  ret <2 x i1> %r
+}
+
+; Equality predicates are ok.
+
+define i1 @nabs_is_negative_or_0_ne(i8 %x) {
+; CHECK-LABEL: @nabs_is_negative_or_0_ne(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i8 [[X:%.*]], 0
+; CHECK-NEXT:    [[NEGX:%.*]] = sub i8 0, [[X]]
+; CHECK-NEXT:    [[NABS:%.*]] = select i1 [[CMP]], i8 [[X]], i8 [[NEGX]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ne i8 [[NABS]], 12
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cmp = icmp slt i8 %x, 0
+  %negx = sub i8 0, %x
+  %nabs = select i1 %cmp, i8 %x, i8 %negx
+  %r = icmp ne i8 %nabs, 12
+  ret i1 %r
+}
+
+; Vector types are ok.
+
+define <3 x i1> @nabs_is_not_over_0_sle_vec_splat(<3 x i33> %x) {
+; CHECK-LABEL: @nabs_is_not_over_0_sle_vec_splat(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt <3 x i33> [[X:%.*]], <i33 1, i33 1, i33 1>
+; CHECK-NEXT:    [[NEGX:%.*]] = sub <3 x i33> zeroinitializer, [[X]]
+; CHECK-NEXT:    [[NABS:%.*]] = select <3 x i1> [[CMP]], <3 x i33> [[X]], <3 x i33> [[NEGX]]
+; CHECK-NEXT:    [[R:%.*]] = icmp sge <3 x i33> [[NABS]], <i33 1, i33 1, i33 1>
+; CHECK-NEXT:    ret <3 x i1> [[R]]
+;
+  %cmp = icmp slt <3 x i33> %x, <i33 1, i33 1, i33 1>
+  %negx = sub <3 x i33> zeroinitializer, %x
+  %nabs = select <3 x i1> %cmp, <3 x i33> %x, <3 x i33> %negx
+  %r = icmp sge <3 x i33> %nabs, <i33 1, i33 1, i33 1>
+  ret <3 x i1> %r
+}
+
-- 
GitLab


From 93ce40bd23b2a209b214c4ed279d2d778c9e10d2 Mon Sep 17 00:00:00 2001
From: Konstantin Zhuravlyov <kzhuravl_dev@outlook.com>
Date: Mon, 29 Oct 2018 21:07:27 +0000
Subject: [PATCH 0717/1116] AMDGPU: Enable code object v3 by default

Differential Revision: https://reviews.llvm.org/D53525


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345542 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/AMDGPU.td                   | 45 ++++++++++------
 test/CodeGen/AMDGPU/addrspacecast.ll          |  4 +-
 test/CodeGen/AMDGPU/amdgpu.private-memory.ll  | 14 ++---
 .../attr-amdgpu-flat-work-group-size.ll       |  2 +-
 .../AMDGPU/call-graph-register-usage.ll       |  6 +--
 .../AMDGPU/callee-special-input-sgprs.ll      |  4 +-
 .../AMDGPU/callee-special-input-vgprs.ll      |  2 +-
 test/CodeGen/AMDGPU/debugger-emit-prologue.ll |  4 +-
 test/CodeGen/AMDGPU/elf-notes.ll              | 18 +++----
 .../flat-for-global-subtarget-feature.ll      |  4 +-
 test/CodeGen/AMDGPU/flat-scratch-reg.ll       |  6 +--
 test/CodeGen/AMDGPU/gfx902-without-xnack.ll   |  2 +-
 test/CodeGen/AMDGPU/hsa-fp-mode.ll            | 14 ++---
 test/CodeGen/AMDGPU/hsa-func.ll               | 12 ++---
 .../AMDGPU/hsa-metadata-enqueue-kernel.ll     |  4 +-
 .../AMDGPU/hsa-metadata-from-llvm-ir-full.ll  | 12 ++---
 .../AMDGPU/hsa-metadata-hidden-args.ll        |  6 +--
 test/CodeGen/AMDGPU/hsa-metadata-images.ll    |  6 +--
 .../AMDGPU/hsa-metadata-kernel-code-props.ll  |  6 +--
 .../AMDGPU/hsa-metadata-kernel-debug-props.ll |  8 +--
 test/CodeGen/AMDGPU/hsa-note-no-func.ll       | 52 +++++++++----------
 test/CodeGen/AMDGPU/hsa.ll                    | 12 ++---
 test/CodeGen/AMDGPU/kernel-args.ll            |  2 +-
 .../AMDGPU/kernel-argument-dag-lowering.ll    |  2 +-
 test/CodeGen/AMDGPU/large-alloca-compute.ll   |  4 +-
 .../AMDGPU/llvm.amdgcn.dispatch.ptr.ll        |  2 +-
 .../AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll |  2 +-
 test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll  |  2 +-
 .../AMDGPU/llvm.amdgcn.workgroup.id.ll        |  8 +--
 .../CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll |  8 +--
 test/CodeGen/AMDGPU/nop-data.ll               |  2 +-
 test/CodeGen/AMDGPU/promote-alloca-no-opts.ll |  4 +-
 .../promote-alloca-padding-size-estimate.ll   |  2 +-
 ...vgpr-spill-emergency-stack-slot-compute.ll |  4 +-
 test/MC/AMDGPU/hsa-exp.s                      |  4 +-
 test/MC/AMDGPU/hsa-text.s                     |  4 +-
 test/MC/AMDGPU/hsa.s                          |  4 +-
 test/MC/AMDGPU/hsa_code_object_isa_args.s     | 12 ++---
 test/MC/AMDGPU/hsa_isa_version_attrs.s        |  4 +-
 test/MC/AMDGPU/isa-version-hsa.s              | 14 ++---
 test/MC/AMDGPU/isa-version-pal.s              | 14 ++---
 test/MC/AMDGPU/isa-version-unk.s              | 14 ++---
 test/MC/AMDGPU/sym_option.s                   | 18 +++----
 test/Object/AMDGPU/objdump.s                  |  2 +-
 44 files changed, 195 insertions(+), 180 deletions(-)

diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index 54b6c8a7882..edbdf01a591 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -474,34 +474,41 @@ def FeatureISAVersion6_0_0 : SubtargetFeatureISAVersion <6,0,0,
   [FeatureSouthernIslands,
    FeatureFastFMAF32,
    HalfRate64Ops,
-   FeatureLDSBankCount32]>;
+   FeatureLDSBankCount32,
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion6_0_1 : SubtargetFeatureISAVersion <6,0,1,
   [FeatureSouthernIslands,
-   FeatureLDSBankCount32]>;
+   FeatureLDSBankCount32,
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion7_0_0 : SubtargetFeatureISAVersion <7,0,0,
   [FeatureSeaIslands,
-   FeatureLDSBankCount32]>;
+   FeatureLDSBankCount32,
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion7_0_1 : SubtargetFeatureISAVersion <7,0,1,
   [FeatureSeaIslands,
    HalfRate64Ops,
    FeatureLDSBankCount32,
-   FeatureFastFMAF32]>;
+   FeatureFastFMAF32,
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion7_0_2 : SubtargetFeatureISAVersion <7,0,2,
   [FeatureSeaIslands,
    FeatureLDSBankCount16,
-   FeatureFastFMAF32]>;
+   FeatureFastFMAF32,
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion7_0_3 : SubtargetFeatureISAVersion <7,0,3,
   [FeatureSeaIslands,
-   FeatureLDSBankCount16]>;
+   FeatureLDSBankCount16,
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion7_0_4 : SubtargetFeatureISAVersion <7,0,4,
   [FeatureSeaIslands,
-   FeatureLDSBankCount32]>;
+   FeatureLDSBankCount32,
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1,
   [FeatureVolcanicIslands,
@@ -509,49 +516,57 @@ def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1,
    HalfRate64Ops,
    FeatureLDSBankCount32,
    FeatureXNACK,
-   FeatureUnpackedD16VMem]>;
+   FeatureUnpackedD16VMem,
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion8_0_2 : SubtargetFeatureISAVersion <8,0,2,
   [FeatureVolcanicIslands,
    FeatureLDSBankCount32,
    FeatureSGPRInitBug,
-   FeatureUnpackedD16VMem]>;
+   FeatureUnpackedD16VMem,
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion8_0_3 : SubtargetFeatureISAVersion <8,0,3,
   [FeatureVolcanicIslands,
    FeatureLDSBankCount32,
-   FeatureUnpackedD16VMem]>;
+   FeatureUnpackedD16VMem,
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0,
   [FeatureVolcanicIslands,
    FeatureLDSBankCount16,
-   FeatureXNACK]>;
+   FeatureXNACK,
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion9_0_0 : SubtargetFeatureISAVersion <9,0,0,
   [FeatureGFX9,
    FeatureMadMixInsts,
    FeatureLDSBankCount32,
-   FeatureD16PreservesUnusedBits]>;
+   FeatureD16PreservesUnusedBits,
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion9_0_2 : SubtargetFeatureISAVersion <9,0,2,
   [FeatureGFX9,
    FeatureMadMixInsts,
    FeatureLDSBankCount32,
    FeatureXNACK,
-   FeatureD16PreservesUnusedBits]>;
+   FeatureD16PreservesUnusedBits,
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion9_0_4 : SubtargetFeatureISAVersion <9,0,4,
   [FeatureGFX9,
    FeatureLDSBankCount32,
    FeatureFmaMixInsts,
-   FeatureD16PreservesUnusedBits]>;
+   FeatureD16PreservesUnusedBits,
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion9_0_6 : SubtargetFeatureISAVersion <9,0,6,
   [FeatureGFX9,
    HalfRate64Ops,
    FeatureFmaMixInsts,
    FeatureLDSBankCount32,
-   FeatureDLInsts]>;
+   FeatureDLInsts,
+   FeatureCodeObjectV3]>;
 
 def FeatureISAVersion9_0_9 : SubtargetFeatureISAVersion <9,0,9,
   [FeatureGFX9,
diff --git a/test/CodeGen/AMDGPU/addrspacecast.ll b/test/CodeGen/AMDGPU/addrspacecast.ll
index 95bbe958e93..ea40cda4fa6 100644
--- a/test/CodeGen/AMDGPU/addrspacecast.ll
+++ b/test/CodeGen/AMDGPU/addrspacecast.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=CI %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=GFX9 %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-code-object-v3,-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=CI %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3,-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=GFX9 %s
 
 ; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast:
 ; HSA: enable_sgpr_private_segment_buffer = 1
diff --git a/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
index 023c19915c7..199a96c6443 100644
--- a/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
+++ b/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
@@ -1,10 +1,10 @@
-; RUN: llc -show-mc-encoding -mattr=+promote-alloca -disable-promote-alloca-to-vector -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -show-mc-encoding -mattr=+promote-alloca -disable-promote-alloca-to-vector -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-PROMOTE %s
-; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
-; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-ALLOCA %s
-; RUN: llc -show-mc-encoding -mattr=+promote-alloca -disable-promote-alloca-to-vector -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE-VECT -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -show-mc-encoding -mattr=-code-object-v3,+promote-alloca -disable-promote-alloca-to-vector -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -show-mc-encoding -mattr=-code-object-v3,+promote-alloca -disable-promote-alloca-to-vector -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3,-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-PROMOTE %s
+; RUN: llc -show-mc-encoding -mattr=-code-object-v3,-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -show-mc-encoding -mattr=-code-object-v3,-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri -mattr=-code-object-v3,-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-ALLOCA %s
+; RUN: llc -show-mc-encoding -mattr=-code-object-v3,+promote-alloca -disable-promote-alloca-to-vector -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-code-object-v3,-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -show-mc-encoding -mattr=-code-object-v3,+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-code-object-v3,-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE-VECT -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -show-mc-encoding -mattr=-code-object-v3,-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-code-object-v3,-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s
 
 ; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -data-layout=A5 -mcpu=kaveri -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck -enable-var-scope -check-prefix=HSAOPT -check-prefix=OPT %s
 ; RUN: opt -S -mtriple=amdgcn-unknown-unknown -data-layout=A5 -mcpu=kaveri -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck -enable-var-scope -check-prefix=NOHSAOPT -check-prefix=OPT %s
diff --git a/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll b/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
index 7fe5604c3ec..2f281cab48c 100644
--- a/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
+++ b/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=CHECK %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=HSAMD %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-code-object-v3 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=HSAMD %s
 
 ; CHECK-LABEL: {{^}}min_64_max_64:
 ; CHECK: SGPRBlocks: 0
diff --git a/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/test/CodeGen/AMDGPU/call-graph-register-usage.ll
index 21c69d9bee7..c4c30a66755 100644
--- a/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ b/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CI %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,VI-NOBUG %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=iceland -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,VI-BUG %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-code-object-v3 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=fiji -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,VI-NOBUG %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=iceland -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,VI-BUG %s
 
 ; Make sure to run a GPU with the SGPR allocation bug.
 
diff --git a/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll b/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
index 6af0795b04c..907575c1ba8 100644
--- a/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
+++ b/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,CIVI %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-code-object-v3 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,CIVI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,GFX9 %s
 
 ; GCN-LABEL: {{^}}use_dispatch_ptr:
 ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6
diff --git a/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
index 7f14a24d6da..750a0203c9b 100644
--- a/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
+++ b/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-code-object-v3 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}use_workitem_id_x:
 ; GCN: s_waitcnt
diff --git a/test/CodeGen/AMDGPU/debugger-emit-prologue.ll b/test/CodeGen/AMDGPU/debugger-emit-prologue.ll
index 46d81e57065..b416537b9f8 100644
--- a/test/CodeGen/AMDGPU/debugger-emit-prologue.ll
+++ b/test/CodeGen/AMDGPU/debugger-emit-prologue.ll
@@ -1,5 +1,5 @@
-; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+amdgpu-debugger-emit-prologue -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s --check-prefix=NOATTR
+; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-code-object-v3,+amdgpu-debugger-emit-prologue -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-code-object-v3 -verify-machineinstrs < %s | FileCheck %s --check-prefix=NOATTR
 target datalayout = "A5"
 
 ; CHECK: debug_wavefront_private_segment_offset_sgpr = [[SOFF:[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/elf-notes.ll b/test/CodeGen/AMDGPU/elf-notes.ll
index b81292bfdb9..43e569de65c 100644
--- a/test/CodeGen/AMDGPU/elf-notes.ll
+++ b/test/CodeGen/AMDGPU/elf-notes.ll
@@ -1,12 +1,12 @@
-; RUN: llc -mtriple=amdgcn-amd-unknown -mcpu=gfx802 < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK --check-prefix=GFX802 %s
-; RUN: llc -mtriple=amdgcn-amd-unknown -mcpu=iceland < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK --check-prefix=GFX802 %s
-; RUN: llc -mtriple=amdgcn-amd-unknown -mcpu=gfx802 -filetype=obj < %s | llvm-readobj -elf-output-style=GNU -notes  | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ELF --check-prefix=GFX802 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA --check-prefix=GFX802 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=iceland < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA --check-prefix=GFX802 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -filetype=obj < %s | llvm-readobj -elf-output-style=GNU -notes  | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA-ELF --check-prefix=GFX802 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx802 < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL --check-prefix=GFX802 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=iceland < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL --check-prefix=GFX802 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx802 -filetype=obj < %s | llvm-readobj -elf-output-style=GNU -notes  | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL-ELF --check-prefix=GFX802 %s
+; RUN: llc -mtriple=amdgcn-amd-unknown -mcpu=gfx802 -mattr=-code-object-v3 < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK --check-prefix=GFX802 %s
+; RUN: llc -mtriple=amdgcn-amd-unknown -mcpu=iceland -mattr=-code-object-v3 < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK --check-prefix=GFX802 %s
+; RUN: llc -mtriple=amdgcn-amd-unknown -mcpu=gfx802 -filetype=obj -mattr=-code-object-v3 < %s | llvm-readobj -elf-output-style=GNU -notes  | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ELF --check-prefix=GFX802 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -mattr=-code-object-v3 < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA --check-prefix=GFX802 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=iceland -mattr=-code-object-v3 < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA --check-prefix=GFX802 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -filetype=obj -mattr=-code-object-v3 < %s | llvm-readobj -elf-output-style=GNU -notes  | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA-ELF --check-prefix=GFX802 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx802 -mattr=-code-object-v3 < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL --check-prefix=GFX802 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=iceland -mattr=-code-object-v3 < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL --check-prefix=GFX802 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx802 -filetype=obj -mattr=-code-object-v3 < %s | llvm-readobj -elf-output-style=GNU -notes  | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL-ELF --check-prefix=GFX802 %s
 ; RUN: llc -march=r600 < %s | FileCheck --check-prefix=R600 %s
 
 ; OSABI-UNK-NOT: .hsa_code_object_version
diff --git a/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll b/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
index b2ac534a7d6..17f557b3a6c 100644
--- a/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
+++ b/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=+flat-for-global < %s | FileCheck -check-prefix=HSA -check-prefix=HSA-DEFAULT -check-prefix=ALL %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global < %s | FileCheck -check-prefix=HSA -check-prefix=HSA-NODEFAULT -check-prefix=ALL %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3,+flat-for-global < %s | FileCheck -check-prefix=HSA -check-prefix=HSA-DEFAULT -check-prefix=ALL %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3,-flat-for-global < %s | FileCheck -check-prefix=HSA -check-prefix=HSA-NODEFAULT -check-prefix=ALL %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=tonga < %s | FileCheck -check-prefix=HSA-NOADDR64 -check-prefix=ALL %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=kaveri -mattr=-flat-for-global < %s | FileCheck -check-prefix=NOHSA-DEFAULT -check-prefix=ALL %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=kaveri -mattr=+flat-for-global < %s | FileCheck -check-prefix=NOHSA-NODEFAULT -check-prefix=ALL %s
diff --git a/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/test/CodeGen/AMDGPU/flat-scratch-reg.ll
index a7664c399fb..38909d3e3e9 100644
--- a/test/CodeGen/AMDGPU/flat-scratch-reg.ll
+++ b/test/CodeGen/AMDGPU/flat-scratch-reg.ll
@@ -7,9 +7,9 @@
 ; RUN: llc -march=amdgcn -mcpu=carrizo -verify-machineinstrs < %s | FileCheck -check-prefix=VI-XNACK  -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -mcpu=stoney  -verify-machineinstrs < %s | FileCheck -check-prefix=VI-XNACK  -check-prefix=GCN %s
 
-; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-CI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-VI-NOXNACK -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=+xnack -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-VI-XNACK -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-CI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-code-object-v3,-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-VI-NOXNACK -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-code-object-v3,+xnack -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-VI-XNACK -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}no_vcc_no_flat:
 ; HSA-CI: is_xnack_enabled = 0
diff --git a/test/CodeGen/AMDGPU/gfx902-without-xnack.ll b/test/CodeGen/AMDGPU/gfx902-without-xnack.ll
index 445e112a301..8577382cff5 100644
--- a/test/CodeGen/AMDGPU/gfx902-without-xnack.ll
+++ b/test/CodeGen/AMDGPU/gfx902-without-xnack.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx902 -mattr=-xnack < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx902 -mattr=-code-object-v3,-xnack < %s | FileCheck %s
 
 ; CHECK: .hsa_code_object_isa 9,0,2,"AMD","AMDGPU"
 define amdgpu_kernel void @test_kernel(float addrspace(1)* %out0, double addrspace(1)* %out1) nounwind {
diff --git a/test/CodeGen/AMDGPU/hsa-fp-mode.ll b/test/CodeGen/AMDGPU/hsa-fp-mode.ll
index b1901cf894b..a454fa02579 100644
--- a/test/CodeGen/AMDGPU/hsa-fp-mode.ll
+++ b/test/CodeGen/AMDGPU/hsa-fp-mode.ll
@@ -70,10 +70,10 @@ define amdgpu_kernel void @test_no_dx10_clamp_vi(float addrspace(1)* %out0, doub
   ret void
 }
 
-attributes #0 = { nounwind "target-cpu"="kaveri" }
-attributes #1 = { nounwind "target-cpu"="fiji" }
-attributes #2 = { nounwind "target-features"="-fp32-denormals,+fp64-fp16-denormals" }
-attributes #3 = { nounwind "target-features"="+fp32-denormals,-fp64-fp16-denormals" }
-attributes #4 = { nounwind "target-features"="+fp32-denormals,+fp64-fp16-denormals" }
-attributes #5 = { nounwind "target-features"="-fp32-denormals,-fp64-fp16-denormals" }
-attributes #6 = { nounwind "target-cpu"="fiji" "target-features"="-dx10-clamp" }
+attributes #0 = { nounwind "target-cpu"="kaveri" "target-features"="-code-object-v3" }
+attributes #1 = { nounwind "target-cpu"="fiji" "target-features"="-code-object-v3" }
+attributes #2 = { nounwind "target-features"="-code-object-v3,-fp32-denormals,+fp64-fp16-denormals" }
+attributes #3 = { nounwind "target-features"="-code-object-v3,+fp32-denormals,-fp64-fp16-denormals" }
+attributes #4 = { nounwind "target-features"="-code-object-v3,+fp32-denormals,+fp64-fp16-denormals" }
+attributes #5 = { nounwind "target-features"="-code-object-v3,-fp32-denormals,-fp64-fp16-denormals" }
+attributes #6 = { nounwind "target-cpu"="fiji" "target-features"="-code-object-v3,-dx10-clamp" }
diff --git a/test/CodeGen/AMDGPU/hsa-func.ll b/test/CodeGen/AMDGPU/hsa-func.ll
index d117cf59ee1..76a17215b7f 100644
--- a/test/CodeGen/AMDGPU/hsa-func.ll
+++ b/test/CodeGen/AMDGPU/hsa-func.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA-CI %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo  | FileCheck --check-prefix=HSA %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo | FileCheck --check-prefix=HSA-VI %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -filetype=obj | llvm-readobj -symbols -s -sd | FileCheck --check-prefix=ELF %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri | llvm-readobj -symbols -s -sd | FileCheck %s --check-prefix=ELF
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=kaveri | FileCheck --check-prefix=HSA %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=kaveri | FileCheck --check-prefix=HSA-CI %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=carrizo  | FileCheck --check-prefix=HSA %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=carrizo | FileCheck --check-prefix=HSA-VI %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=kaveri -filetype=obj | llvm-readobj -symbols -s -sd | FileCheck --check-prefix=ELF %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=kaveri | llvm-readobj -symbols -s -sd | FileCheck %s --check-prefix=ELF
 
 ; The SHT_NOTE section contains the output from the .hsa_code_object_*
 ; directives.
diff --git a/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel.ll b/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel.ll
index 77624aaafad..f6e3d94b4dc 100644
--- a/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel.ll
+++ b/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=gfx900 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
 
 ; CHECK: ---
 ; CHECK:  Version: [ 1, 0 ]
diff --git a/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll b/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll
index 485e02da7d9..4dce2bf832e 100644
--- a/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll
+++ b/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll
@@ -1,9 +1,9 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX802 --check-prefix=NOTES %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -mattr=-code-object-v3 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -mattr=-code-object-v3 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX802 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -mattr=-code-object-v3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -mattr=-code-object-v3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
 
 %struct.A = type { i8, float }
 %opencl.image1d_t = type opaque
diff --git a/test/CodeGen/AMDGPU/hsa-metadata-hidden-args.ll b/test/CodeGen/AMDGPU/hsa-metadata-hidden-args.ll
index ed2e79684c6..6dbc1e2523d 100644
--- a/test/CodeGen/AMDGPU/hsa-metadata-hidden-args.ll
+++ b/test/CodeGen/AMDGPU/hsa-metadata-hidden-args.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=NOTES %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -mattr=-code-object-v3 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-code-object-v3 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
 
 ; CHECK: ---
 ; CHECK:  Version: [ 1, 0 ]
diff --git a/test/CodeGen/AMDGPU/hsa-metadata-images.ll b/test/CodeGen/AMDGPU/hsa-metadata-images.ll
index 00dee3b6c69..fd015998429 100644
--- a/test/CodeGen/AMDGPU/hsa-metadata-images.ll
+++ b/test/CodeGen/AMDGPU/hsa-metadata-images.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX802 --check-prefix=NOTES %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -mattr=-code-object-v3 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -mattr=-code-object-v3 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX802 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
 
 %opencl.image1d_t = type opaque
 %opencl.image1d_array_t = type opaque
diff --git a/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
index 3dc3f320db8..b5b6aa450bf 100644
--- a/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
+++ b/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=NOTES %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -mattr=-code-object-v3 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-code-object-v3 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
 
 @var = addrspace(1) global float 0.0
 
diff --git a/test/CodeGen/AMDGPU/hsa-metadata-kernel-debug-props.ll b/test/CodeGen/AMDGPU/hsa-metadata-kernel-debug-props.ll
index fab086e6cb1..7eacdc1cdab 100644
--- a/test/CodeGen/AMDGPU/hsa-metadata-kernel-debug-props.ll
+++ b/test/CodeGen/AMDGPU/hsa-metadata-kernel-debug-props.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX802 --check-prefix=NOTES %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -mattr=-code-object-v3,+amdgpu-debugger-emit-prologue,+amdgpu-debugger-insert-nops -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -mattr=-code-object-v3,+amdgpu-debugger-emit-prologue,+amdgpu-debugger-insert-nops -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX802 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3,+amdgpu-debugger-emit-prologue,+amdgpu-debugger-insert-nops -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
 target datalayout = "A5"
 
 declare void @llvm.dbg.declare(metadata, metadata, metadata)
@@ -32,7 +32,7 @@ entry:
   ret void, !dbg !25
 }
 
-attributes #0 = { noinline nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="gfx800" "target-features"="+16-bit-insts,+amdgpu-debugger-emit-prologue,+amdgpu-debugger-insert-nops,+amdgpu-debugger-reserve-regs,+dpp,+fp64-fp16-denormals,+s-memrealtime,-fp32-denormals" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="gfx800" "target-features"="+16-bit-insts,-code-object-v3,+amdgpu-debugger-emit-prologue,+amdgpu-debugger-insert-nops,+amdgpu-debugger-reserve-regs,+dpp,+fp64-fp16-denormals,+s-memrealtime,-fp32-denormals" "unsafe-fp-math"="false" "use-soft-float"="false" }
 
 !llvm.dbg.cu = !{!0}
 !opencl.ocl.version = !{!3}
diff --git a/test/CodeGen/AMDGPU/hsa-note-no-func.ll b/test/CodeGen/AMDGPU/hsa-note-no-func.ll
index e937aaca66f..39026e8c7bd 100644
--- a/test/CodeGen/AMDGPU/hsa-note-no-func.ll
+++ b/test/CodeGen/AMDGPU/hsa-note-no-func.ll
@@ -1,29 +1,29 @@
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx600 | FileCheck --check-prefix=HSA --check-prefix=HSA-SI600 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx601 | FileCheck --check-prefix=HSA --check-prefix=HSA-SI601 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx700 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI700 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx701 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI701 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx702 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI702 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx703 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI703 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx704 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI704 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=bonaire | FileCheck --check-prefix=HSA --check-prefix=HSA-CI704 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=mullins | FileCheck --check-prefix=HSA --check-prefix=HSA-CI703 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=hawaii | FileCheck --check-prefix=HSA --check-prefix=HSA-CI701 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kabini | FileCheck --check-prefix=HSA --check-prefix=HSA-CI703 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA --check-prefix=HSA-CI700 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-flat-for-global | FileCheck --check-prefix=HSA --check-prefix=HSA-VI801 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=tonga -mattr=-flat-for-global | FileCheck --check-prefix=HSA --check-prefix=HSA-VI802 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global | FileCheck --check-prefix=HSA --check-prefix=HSA-VI803 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=polaris10 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI803 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=polaris11 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI803 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx801 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI801 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx802 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI802 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx803 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI803 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx810 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI810 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX900 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx902 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX902 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx904 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX904 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx906 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX906 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx909 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX909 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=gfx600 | FileCheck --check-prefix=HSA --check-prefix=HSA-SI600 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=gfx601 | FileCheck --check-prefix=HSA --check-prefix=HSA-SI601 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=gfx700 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI700 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=gfx701 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI701 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=gfx702 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI702 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=gfx703 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI703 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=gfx704 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI704 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=bonaire | FileCheck --check-prefix=HSA --check-prefix=HSA-CI704 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=mullins | FileCheck --check-prefix=HSA --check-prefix=HSA-CI703 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=hawaii | FileCheck --check-prefix=HSA --check-prefix=HSA-CI701 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=kabini | FileCheck --check-prefix=HSA --check-prefix=HSA-CI703 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=kaveri | FileCheck --check-prefix=HSA --check-prefix=HSA-CI700 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=carrizo -mattr=-flat-for-global | FileCheck --check-prefix=HSA --check-prefix=HSA-VI801 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=tonga -mattr=-flat-for-global | FileCheck --check-prefix=HSA --check-prefix=HSA-VI802 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=fiji -mattr=-flat-for-global | FileCheck --check-prefix=HSA --check-prefix=HSA-VI803 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=polaris10 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI803 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=polaris11 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI803 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=gfx801 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI801 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=gfx802 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI802 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=gfx803 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI803 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=gfx810 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI810 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=gfx900 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX900 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=gfx902 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX902 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=gfx904 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX904 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=gfx906 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX906 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=gfx909 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX909 %s
 
 ; HSA: .hsa_code_object_version 2,1
 ; HSA-SI600: .hsa_code_object_isa 6,0,0,"AMD","AMDGPU"
diff --git a/test/CodeGen/AMDGPU/hsa.ll b/test/CodeGen/AMDGPU/hsa.ll
index 0b19fbe7d70..e23b2d922a3 100644
--- a/test/CodeGen/AMDGPU/hsa.ll
+++ b/test/CodeGen/AMDGPU/hsa.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global | FileCheck --check-prefix=HSA-CI %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo  | FileCheck --check-prefix=HSA %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-flat-for-global | FileCheck --check-prefix=HSA-VI %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -filetype=obj | llvm-readobj -symbols -s -sd | FileCheck --check-prefix=ELF %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri | llvm-readobj -symbols -s -sd | FileCheck %s --check-prefix=ELF
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 | FileCheck --check-prefix=HSA %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3,-flat-for-global | FileCheck --check-prefix=HSA-CI %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-code-object-v3 | FileCheck --check-prefix=HSA %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-code-object-v3,-flat-for-global | FileCheck --check-prefix=HSA-VI %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -filetype=obj -mattr=-code-object-v3 | llvm-readobj -symbols -s -sd | FileCheck --check-prefix=ELF %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 | llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 | llvm-readobj -symbols -s -sd | FileCheck %s --check-prefix=ELF
 
 ; The SHT_NOTE section contains the output from the .hsa_code_object_*
 ; directives.
diff --git a/test/CodeGen/AMDGPU/kernel-args.ll b/test/CodeGen/AMDGPU/kernel-args.ll
index 11067522f85..64a5fbdf00a 100644
--- a/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/test/CodeGen/AMDGPU/kernel-args.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=SI,GCN,MESA-GCN,FUNC %s
 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=VI,GCN,MESA-VI,MESA-GCN,FUNC %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-code-object-v3 -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC %s
 ; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=EG,EGCM,FUNC %s
 ; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=CM,EGCM,FUNC %s
 
diff --git a/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll b/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
index a1bb6c28e74..b7344cfb33c 100644
--- a/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
+++ b/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-ir-lower-kernel-arguments=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -amdgpu-ir-lower-kernel-arguments=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC %s
 
 ; Repeat of some problematic tests in kernel-args.ll, with the IR
 ; argument lowering pass disabled. Struct padding needs to be
diff --git a/test/CodeGen/AMDGPU/large-alloca-compute.ll b/test/CodeGen/AMDGPU/large-alloca-compute.ll
index d8cf52341e3..0343052601f 100644
--- a/test/CodeGen/AMDGPU/large-alloca-compute.ll
+++ b/test/CodeGen/AMDGPU/large-alloca-compute.ll
@@ -1,8 +1,8 @@
 ; RUN: llc -march=amdgcn -mcpu=bonaire -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s
 ; RUN: llc -march=amdgcn -mcpu=carrizo --show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 --show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=ALL %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa < %s -mattr=-flat-for-global | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s
-; RUN: llc -march=amdgcn -mcpu=carrizo -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa -mattr=-code-object-v3 < %s -mattr=-flat-for-global | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s
+; RUN: llc -march=amdgcn -mcpu=carrizo -mtriple=amdgcn-unknown-amdhsa -mattr=-code-object-v3,-flat-for-global < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s
 
 ; FIXME: align on alloca seems to be ignored for private_segment_alignment
 
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
index b6f9f951d9b..a2f2ced72e7 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 ; RUN: not llc -mtriple=amdgcn-unknown-unknown -mcpu=kaveri -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR %s
 
 ; ERROR: in function test{{.*}}: unsupported hsa intrinsic without hsa target
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
index 5853d8d8e4e..ee039a392e2 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=CO-V2,HSA,ALL %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=CO-V2,HSA,ALL %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -verify-machineinstrs < %s | FileCheck -check-prefixes=CO-V2,OS-MESA3D,MESA,ALL %s
 ; RUN: llc -mtriple=amdgcn-mesa-unknown -verify-machineinstrs < %s | FileCheck -check-prefixes=OS-UNKNOWN,MESA,ALL %s
 
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll
index f8c60451ac7..6866d9537b3 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 ; RUN: not llc -mtriple=amdgcn-unknown-unknown -mcpu=kaveri -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR %s
 
 ; ERROR: in function test{{.*}}: unsupported hsa intrinsic without hsa target
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll
index 349e7f0f0e8..377785e0ca2 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll
@@ -1,9 +1,9 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=CO-V2 -check-prefix=CI-HSA  %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=carrizo -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=CO-V2 -check-prefix=VI-HSA  %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=-code-object-v3 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=CO-V2 -check-prefix=CI-HSA  %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=-code-object-v3 -mcpu=carrizo -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=CO-V2 -check-prefix=VI-HSA  %s
 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=UNKNOWN-OS -check-prefix=SI-MESA %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=UNKNOWN-OS -check-prefix=VI-MESA %s
-; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,SI-MESA %s
-; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,VI-MESA %s
+; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mattr=-code-object-v3 -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,SI-MESA %s
+; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mattr=-code-object-v3 -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,VI-MESA %s
 
 declare i32 @llvm.amdgcn.workgroup.id.x() #0
 declare i32 @llvm.amdgcn.workgroup.id.y() #0
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
index 8b80998cab6..13e204b03a0 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
@@ -1,9 +1,9 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=CO-V2 -check-prefix=CI-HSA  %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=carrizo -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=CO-V2 -check-prefix=VI-HSA  %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=-code-object-v3 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=CO-V2 -check-prefix=CI-HSA  %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=-code-object-v3 -mcpu=carrizo -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=CO-V2 -check-prefix=VI-HSA  %s
 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=MESA -check-prefix=SI-MESA %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=MESA -check-prefix=VI-MESA %s
-; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,SI-MESA %s
-; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,VI-MESA %s
+; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mattr=-code-object-v3 -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,SI-MESA %s
+; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mattr=-code-object-v3 -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,VI-MESA %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 declare i32 @llvm.amdgcn.workitem.id.y() #0
diff --git a/test/CodeGen/AMDGPU/nop-data.ll b/test/CodeGen/AMDGPU/nop-data.ll
index 790e31c781a..4e836a398ee 100644
--- a/test/CodeGen/AMDGPU/nop-data.ll
+++ b/test/CodeGen/AMDGPU/nop-data.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -filetype=obj < %s | llvm-objdump -d - -mcpu=fiji | FileCheck %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=fiji -filetype=obj < %s | llvm-objdump -d - -mcpu=fiji | FileCheck %s
 
 ; CHECK: kernel0:
 ; CHECK-NEXT: s_endpgm
diff --git a/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll b/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll
index 6a41c3ad2e8..d7e38a602ff 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll
@@ -1,5 +1,5 @@
-; RUN: llc -O0 -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -mattr=+promote-alloca < %s | FileCheck -check-prefix=NOOPTS -check-prefix=ALL %s
-; RUN: llc -O1 -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -mattr=+promote-alloca < %s | FileCheck -check-prefix=OPTS -check-prefix=ALL %s
+; RUN: llc -O0 -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -mattr=-code-object-v3,+promote-alloca < %s | FileCheck -check-prefix=NOOPTS -check-prefix=ALL %s
+; RUN: llc -O1 -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -mattr=-code-object-v3,+promote-alloca < %s | FileCheck -check-prefix=OPTS -check-prefix=ALL %s
 
 ; ALL-LABEL: {{^}}promote_alloca_i32_array_array:
 ; NOOPTS: workgroup_group_segment_byte_size = 0{{$}}
diff --git a/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll b/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
index e8dcb50a3c1..83a608ad5f3 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -disable-promote-alloca-to-vector < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-code-object-v3 -disable-promote-alloca-to-vector < %s | FileCheck -check-prefix=GCN %s
 
 ; This shows that the amount of LDS estimate is sensitive to the order
 ; of the LDS globals.
diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
index 9cdc333cbc0..e13199d68bc 100644
--- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
+++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
@@ -1,8 +1,8 @@
 ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tahiti -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=SIMESA %s
 ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=fiji -mattr=+vgpr-spilling,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=VIMESA %s
 ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=+vgpr-spilling,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=GFX9MESA %s
-; RUN: llc -march=amdgcn  -mcpu=hawaii -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CIHSA -check-prefix=HSA %s
-; RUN: llc -march=amdgcn  -mcpu=fiji -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VIHSA -check-prefix=HSA %s
+; RUN: llc -march=amdgcn  -mcpu=hawaii -mtriple=amdgcn-unknown-amdhsa -mattr=-code-object-v3,+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CIHSA -check-prefix=HSA %s
+; RUN: llc -march=amdgcn  -mcpu=fiji -mtriple=amdgcn-unknown-amdhsa -mattr=-code-object-v3 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VIHSA -check-prefix=HSA %s
 
 ; This ends up using all 256 registers and requires register
 ; scavenging which will fail to find an unsued register.
diff --git a/test/MC/AMDGPU/hsa-exp.s b/test/MC/AMDGPU/hsa-exp.s
index b13755a19cc..8900a0638c9 100644
--- a/test/MC/AMDGPU/hsa-exp.s
+++ b/test/MC/AMDGPU/hsa-exp.s
@@ -1,5 +1,5 @@
-// RUN: llvm-mc -triple amdgcn--amdhsa -mcpu=kaveri -show-encoding %s | FileCheck %s --check-prefix=ASM
-// RUN: llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri -show-encoding %s | llvm-readobj -symbols -s -sd | FileCheck %s --check-prefix=ELF
+// RUN: llvm-mc -triple amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -show-encoding %s | FileCheck %s --check-prefix=ASM
+// RUN: llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -show-encoding %s | llvm-readobj -symbols -s -sd | FileCheck %s --check-prefix=ELF
 
 // ELF: Section {
 // ELF: Name: .text
diff --git a/test/MC/AMDGPU/hsa-text.s b/test/MC/AMDGPU/hsa-text.s
index afe696af0a2..f4463fc5936 100644
--- a/test/MC/AMDGPU/hsa-text.s
+++ b/test/MC/AMDGPU/hsa-text.s
@@ -1,5 +1,5 @@
-// RUN: llvm-mc -triple amdgcn--amdhsa -mcpu=kaveri -show-encoding %s | FileCheck %s --check-prefix=ASM
-// RUN: llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri -show-encoding %s | llvm-readobj -s -sd | FileCheck %s --check-prefix=ELF
+// RUN: llvm-mc -triple amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -show-encoding %s | FileCheck %s --check-prefix=ASM
+// RUN: llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -show-encoding %s | llvm-readobj -s -sd | FileCheck %s --check-prefix=ELF
 
 // For compatibility reasons we treat convert .text sections to .hsatext
 
diff --git a/test/MC/AMDGPU/hsa.s b/test/MC/AMDGPU/hsa.s
index 5ebc0a60e0f..0521c10e1a8 100644
--- a/test/MC/AMDGPU/hsa.s
+++ b/test/MC/AMDGPU/hsa.s
@@ -1,5 +1,5 @@
-// RUN: llvm-mc -triple amdgcn--amdhsa -mcpu=kaveri -show-encoding %s | FileCheck %s --check-prefix=ASM
-// RUN: llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri -show-encoding %s | llvm-readobj -symbols -s -sd | FileCheck %s --check-prefix=ELF
+// RUN: llvm-mc -triple amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -show-encoding %s | FileCheck %s --check-prefix=ASM
+// RUN: llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -show-encoding %s | llvm-readobj -symbols -s -sd | FileCheck %s --check-prefix=ELF
 
 // ELF: Section {
 // ELF: Name: .text
diff --git a/test/MC/AMDGPU/hsa_code_object_isa_args.s b/test/MC/AMDGPU/hsa_code_object_isa_args.s
index 1c47c83e3e9..950f32cd19a 100644
--- a/test/MC/AMDGPU/hsa_code_object_isa_args.s
+++ b/test/MC/AMDGPU/hsa_code_object_isa_args.s
@@ -1,9 +1,9 @@
-// RUN: llvm-mc -triple amdgcn--amdhsa -mcpu=kaveri -show-encoding %s | FileCheck %s --check-prefix=ASM --check-prefix=ASM_700
-// RUN: llvm-mc -triple amdgcn--amdhsa -mcpu=gfx803 -show-encoding %s | FileCheck %s --check-prefix=ASM --check-prefix=ASM_803
-// RUN: llvm-mc -triple amdgcn--amdhsa -mcpu=stoney -show-encoding %s | FileCheck %s --check-prefix=ASM --check-prefix=ASM_810
-// RUN: llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri -show-encoding %s | llvm-readobj -s -sd | FileCheck %s --check-prefix=ELF --check-prefix=ELF_700
-// RUN: llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=gfx803 -show-encoding %s | llvm-readobj -s -sd | FileCheck %s --check-prefix=ELF --check-prefix=ELF_803
-// RUN: llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=stoney -show-encoding %s | llvm-readobj -s -sd | FileCheck %s --check-prefix=ELF --check-prefix=ELF_810
+// RUN: llvm-mc -triple amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -show-encoding %s | FileCheck %s --check-prefix=ASM --check-prefix=ASM_700
+// RUN: llvm-mc -triple amdgcn--amdhsa -mcpu=gfx803 -mattr=-code-object-v3 -show-encoding %s | FileCheck %s --check-prefix=ASM --check-prefix=ASM_803
+// RUN: llvm-mc -triple amdgcn--amdhsa -mcpu=stoney -mattr=-code-object-v3 -show-encoding %s | FileCheck %s --check-prefix=ASM --check-prefix=ASM_810
+// RUN: llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -show-encoding %s | llvm-readobj -s -sd | FileCheck %s --check-prefix=ELF --check-prefix=ELF_700
+// RUN: llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=gfx803 -mattr=-code-object-v3 -show-encoding %s | llvm-readobj -s -sd | FileCheck %s --check-prefix=ELF --check-prefix=ELF_803
+// RUN: llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=stoney -mattr=-code-object-v3 -show-encoding %s | llvm-readobj -s -sd | FileCheck %s --check-prefix=ELF --check-prefix=ELF_810
 
 // ELF: SHT_NOTE
 // ELF: 0000: 04000000 08000000 01000000 414D4400
diff --git a/test/MC/AMDGPU/hsa_isa_version_attrs.s b/test/MC/AMDGPU/hsa_isa_version_attrs.s
index 631e1a45097..ddd76fcf918 100644
--- a/test/MC/AMDGPU/hsa_isa_version_attrs.s
+++ b/test/MC/AMDGPU/hsa_isa_version_attrs.s
@@ -1,5 +1,5 @@
-// RUN: llvm-mc -arch=amdgcn -mcpu=gfx801 -mattr=-fast-fmaf -show-encoding %s | FileCheck --check-prefix=GFX8 %s
-// RUN: llvm-mc -arch=amdgcn -mcpu=gfx900 -mattr=-mad-mix-insts -show-encoding %s | FileCheck --check-prefix=GFX9 %s
+// RUN: llvm-mc -arch=amdgcn -mcpu=gfx801 -mattr=-code-object-v3,-fast-fmaf -show-encoding %s | FileCheck --check-prefix=GFX8 %s
+// RUN: llvm-mc -arch=amdgcn -mcpu=gfx900 -mattr=-code-object-v3,-mad-mix-insts -show-encoding %s | FileCheck --check-prefix=GFX9 %s
 
 .hsa_code_object_isa
 // GFX8:  .hsa_code_object_isa 8,0,1,"AMD","AMDGPU"
diff --git a/test/MC/AMDGPU/isa-version-hsa.s b/test/MC/AMDGPU/isa-version-hsa.s
index 9004e1c3ac5..74e688163bc 100644
--- a/test/MC/AMDGPU/isa-version-hsa.s
+++ b/test/MC/AMDGPU/isa-version-hsa.s
@@ -1,10 +1,10 @@
-// RUN: not llvm-mc -triple amdgcn-amd-unknown -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ERR %s
-// RUN: not llvm-mc -triple amdgcn-amd-unknown -mcpu=iceland %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ERR %s
-// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx802 %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA %s
-// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=iceland %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA %s
-// RUN: not llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx803 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA-ERR %s
-// RUN: not llvm-mc -triple amdgcn-amd-amdpal -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL-ERR %s
-// RUN: not llvm-mc -triple amdgcn-amd-amdpal -mcpu=iceland %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL-ERR %s
+// RUN: not llvm-mc -triple amdgcn-amd-unknown -mattr=-code-object-v3 -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ERR %s
+// RUN: not llvm-mc -triple amdgcn-amd-unknown -mattr=-code-object-v3 -mcpu=iceland %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ERR %s
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=gfx802 %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA %s
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=iceland %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA %s
+// RUN: not llvm-mc -triple amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=gfx803 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA-ERR %s
+// RUN: not llvm-mc -triple amdgcn-amd-amdpal -mattr=-code-object-v3 -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL-ERR %s
+// RUN: not llvm-mc -triple amdgcn-amd-amdpal -mattr=-code-object-v3 -mcpu=iceland %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL-ERR %s
 
 // OSABI-HSA: .amd_amdgpu_isa "amdgcn-amd-amdhsa--gfx802"
 // OSABI-UNK-ERR: error: .amd_amdgpu_isa directive does not match triple and/or mcpu arguments specified through the command line
diff --git a/test/MC/AMDGPU/isa-version-pal.s b/test/MC/AMDGPU/isa-version-pal.s
index 42051b62c0d..a872ff84258 100644
--- a/test/MC/AMDGPU/isa-version-pal.s
+++ b/test/MC/AMDGPU/isa-version-pal.s
@@ -1,10 +1,10 @@
-// RUN: not llvm-mc -triple amdgcn-amd-unknown -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ERR %s
-// RUN: not llvm-mc -triple amdgcn-amd-unknown -mcpu=iceland %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ERR %s
-// RUN: not llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA-ERR %s
-// RUN: not llvm-mc -triple amdgcn-amd-amdhsa -mcpu=iceland %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA-ERR %s
-// RUN: llvm-mc -triple amdgcn-amd-amdpal -mcpu=gfx802 %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL %s
-// RUN: llvm-mc -triple amdgcn-amd-amdpal -mcpu=iceland %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL %s
-// RUN: not llvm-mc -triple amdgcn-amd-unknown -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ERR %s
+// RUN: not llvm-mc -triple amdgcn-amd-unknown -mattr=-code-object-v3 -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ERR %s
+// RUN: not llvm-mc -triple amdgcn-amd-unknown -mattr=-code-object-v3 -mcpu=iceland %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ERR %s
+// RUN: not llvm-mc -triple amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA-ERR %s
+// RUN: not llvm-mc -triple amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=iceland %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA-ERR %s
+// RUN: llvm-mc -triple amdgcn-amd-amdpal -mattr=-code-object-v3 -mcpu=gfx802 %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL %s
+// RUN: llvm-mc -triple amdgcn-amd-amdpal -mattr=-code-object-v3 -mcpu=iceland %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL %s
+// RUN: not llvm-mc -triple amdgcn-amd-unknown -mattr=-code-object-v3 -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ERR %s
 
 // OSABI-PAL: .amd_amdgpu_isa "amdgcn-amd-amdpal--gfx802"
 // OSABI-UNK-ERR: error: .amd_amdgpu_isa directive does not match triple and/or mcpu arguments specified through the command line
diff --git a/test/MC/AMDGPU/isa-version-unk.s b/test/MC/AMDGPU/isa-version-unk.s
index 81792ade083..2b20ecb9285 100644
--- a/test/MC/AMDGPU/isa-version-unk.s
+++ b/test/MC/AMDGPU/isa-version-unk.s
@@ -1,10 +1,10 @@
-// RUN: llvm-mc -triple amdgcn-amd-unknown -mcpu=gfx802 %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK %s
-// RUN: llvm-mc -triple amdgcn-amd-unknown -mcpu=iceland %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK %s
-// RUN: not llvm-mc -triple amdgcn-amd-unknown -mcpu=gfx803 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ERR %s
-// RUN: not llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA-ERR %s
-// RUN: not llvm-mc -triple amdgcn-amd-amdhsa -mcpu=iceland %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA-ERR %s
-// RUN: not llvm-mc -triple amdgcn-amd-amdpal -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL-ERR %s
-// RUN: not llvm-mc -triple amdgcn-amd-amdpal -mcpu=iceland %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL-ERR %s
+// RUN: llvm-mc -triple amdgcn-amd-unknown -mattr=-code-object-v3 -mcpu=gfx802 %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK %s
+// RUN: llvm-mc -triple amdgcn-amd-unknown -mattr=-code-object-v3 -mcpu=iceland %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK %s
+// RUN: not llvm-mc -triple amdgcn-amd-unknown -mattr=-code-object-v3 -mcpu=gfx803 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ERR %s
+// RUN: not llvm-mc -triple amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA-ERR %s
+// RUN: not llvm-mc -triple amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=iceland %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA-ERR %s
+// RUN: not llvm-mc -triple amdgcn-amd-amdpal -mattr=-code-object-v3 -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL-ERR %s
+// RUN: not llvm-mc -triple amdgcn-amd-amdpal -mattr=-code-object-v3 -mcpu=iceland %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL-ERR %s
 
 // OSABI-UNK: .amd_amdgpu_isa "amdgcn-amd-unknown--gfx802"
 // OSABI-UNK-ERR: error: .amd_amdgpu_isa directive does not match triple and/or mcpu arguments specified through the command line
diff --git a/test/MC/AMDGPU/sym_option.s b/test/MC/AMDGPU/sym_option.s
index 8bc9495c9ed..98b4067168e 100644
--- a/test/MC/AMDGPU/sym_option.s
+++ b/test/MC/AMDGPU/sym_option.s
@@ -1,12 +1,12 @@
-// RUN: llvm-mc -arch=amdgcn -mcpu=tahiti      %s | FileCheck %s --check-prefix=SI
-// RUN: llvm-mc -arch=amdgcn -mcpu=bonaire %s | FileCheck %s --check-prefix=BONAIRE
-// RUN: llvm-mc -arch=amdgcn -mcpu=hawaii %s | FileCheck %s --check-prefix=HAWAII
-// RUN: llvm-mc -arch=amdgcn -mcpu=kabini  %s | FileCheck %s --check-prefix=KABINI
-// RUN: llvm-mc -arch=amdgcn -mcpu=iceland %s | FileCheck %s --check-prefix=ICELAND
-// RUN: llvm-mc -arch=amdgcn -mcpu=carrizo %s | FileCheck %s --check-prefix=CARRIZO
-// RUN: llvm-mc -arch=amdgcn -mcpu=tonga %s | FileCheck %s --check-prefix=TONGA
-// RUN: llvm-mc -arch=amdgcn -mcpu=fiji %s | FileCheck %s --check-prefix=FIJI
-// RUN: llvm-mc -arch=amdgcn -mcpu=stoney  %s | FileCheck %s --check-prefix=STONEY
+// RUN: llvm-mc -arch=amdgcn -mattr=-code-object-v3 -mcpu=tahiti %s | FileCheck %s --check-prefix=SI
+// RUN: llvm-mc -arch=amdgcn -mattr=-code-object-v3 -mcpu=bonaire %s | FileCheck %s --check-prefix=BONAIRE
+// RUN: llvm-mc -arch=amdgcn -mattr=-code-object-v3 -mcpu=hawaii %s | FileCheck %s --check-prefix=HAWAII
+// RUN: llvm-mc -arch=amdgcn -mattr=-code-object-v3 -mcpu=kabini  %s | FileCheck %s --check-prefix=KABINI
+// RUN: llvm-mc -arch=amdgcn -mattr=-code-object-v3 -mcpu=iceland %s | FileCheck %s --check-prefix=ICELAND
+// RUN: llvm-mc -arch=amdgcn -mattr=-code-object-v3 -mcpu=carrizo %s | FileCheck %s --check-prefix=CARRIZO
+// RUN: llvm-mc -arch=amdgcn -mattr=-code-object-v3 -mcpu=tonga %s | FileCheck %s --check-prefix=TONGA
+// RUN: llvm-mc -arch=amdgcn -mattr=-code-object-v3 -mcpu=fiji %s | FileCheck %s --check-prefix=FIJI
+// RUN: llvm-mc -arch=amdgcn -mattr=-code-object-v3 -mcpu=stoney  %s | FileCheck %s --check-prefix=STONEY
 
 .byte .option.machine_version_major
 // SI: .byte 6
diff --git a/test/Object/AMDGPU/objdump.s b/test/Object/AMDGPU/objdump.s
index 31306ee90d8..3c3f4a11df4 100644
--- a/test/Object/AMDGPU/objdump.s
+++ b/test/Object/AMDGPU/objdump.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -arch=amdgcn -mcpu=tonga %s -filetype=obj | llvm-objdump -disassemble -arch-name=amdgcn -mcpu=tonga - | FileCheck %s
+// RUN: llvm-mc -arch=amdgcn -mcpu=tonga %s -mattr=-code-object-v3 -filetype=obj | llvm-objdump -disassemble -arch-name=amdgcn -mcpu=tonga - | FileCheck %s
 
 	.text
 
-- 
GitLab


From 0763384459e04c9363d7ebfca94405c000d23c08 Mon Sep 17 00:00:00 2001
From: Erich Keane <erich.keane@intel.com>
Date: Mon, 29 Oct 2018 21:21:55 +0000
Subject: [PATCH 0718/1116] Add parens to fix incorrect assert check.

&& has higher priority than ||, so this assert works really oddly. Add
parens to match the programmer's intent.

Change-Id: I3abe1361ee0694462190c5015779db664012f3d4

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345543 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Support/VirtualFileSystem.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/Support/VirtualFileSystem.cpp b/lib/Support/VirtualFileSystem.cpp
index c9920197fba..81ac5bbaa9c 100644
--- a/lib/Support/VirtualFileSystem.cpp
+++ b/lib/Support/VirtualFileSystem.cpp
@@ -2118,7 +2118,7 @@ std::error_code VFSFromYamlDirIterImpl::incrementExternal() {
 }
 
 std::error_code VFSFromYamlDirIterImpl::incrementContent(bool IsFirstTime) {
-  assert(IsFirstTime || Current != End && "cannot iterate past end");
+  assert((IsFirstTime || Current != End) && "cannot iterate past end");
   if (!IsFirstTime)
     ++Current;
   while (Current != End) {
-- 
GitLab


From 245f8d7f3fbf6152188205fe97e95b86f0726477 Mon Sep 17 00:00:00 2001
From: Alexander Shaposhnikov <shal1t712@gmail.com>
Date: Mon, 29 Oct 2018 21:22:58 +0000
Subject: [PATCH 0719/1116] [llvm-objcopy] Move elf-specific code into
 subfolder

In this diff the elf-specific code is moved into the subfolder ELF
(and factored out from llvm-objcopy.cpp).

Test plan: make check-all

Differential revision: https://reviews.llvm.org/D53790


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345544 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-objcopy/CMakeLists.txt       |   3 +-
 tools/llvm-objcopy/ELF/ELFObjcopy.cpp   | 503 ++++++++++++++++++++++++
 tools/llvm-objcopy/ELF/ELFObjcopy.h     |  34 ++
 tools/llvm-objcopy/{ => ELF}/Object.cpp |   0
 tools/llvm-objcopy/{ => ELF}/Object.h   |   0
 tools/llvm-objcopy/llvm-objcopy.cpp     | 472 +---------------------
 6 files changed, 540 insertions(+), 472 deletions(-)
 create mode 100644 tools/llvm-objcopy/ELF/ELFObjcopy.cpp
 create mode 100644 tools/llvm-objcopy/ELF/ELFObjcopy.h
 rename tools/llvm-objcopy/{ => ELF}/Object.cpp (100%)
 rename tools/llvm-objcopy/{ => ELF}/Object.h (100%)

diff --git a/tools/llvm-objcopy/CMakeLists.txt b/tools/llvm-objcopy/CMakeLists.txt
index 9ac7d0eb4c2..afbf7879176 100644
--- a/tools/llvm-objcopy/CMakeLists.txt
+++ b/tools/llvm-objcopy/CMakeLists.txt
@@ -17,7 +17,8 @@ add_llvm_tool(llvm-objcopy
   Buffer.cpp
   CopyConfig.cpp
   llvm-objcopy.cpp
-  Object.cpp
+  ELF/ELFObjcopy.cpp
+  ELF/Object.cpp
   DEPENDS
   ObjcopyOptsTableGen
   StripOptsTableGen
diff --git a/tools/llvm-objcopy/ELF/ELFObjcopy.cpp b/tools/llvm-objcopy/ELF/ELFObjcopy.cpp
new file mode 100644
index 00000000000..76379788205
--- /dev/null
+++ b/tools/llvm-objcopy/ELF/ELFObjcopy.cpp
@@ -0,0 +1,503 @@
+//===- ELFObjcopy.cpp -----------------------------------------------------===//
+//
+//                      The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ELFObjcopy.h"
+#include "Buffer.h"
+#include "CopyConfig.h"
+#include "llvm-objcopy.h"
+#include "Object.h"
+
+#include "llvm/ADT/BitmaskEnum.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/Object/Binary.h"
+#include "llvm/Object/ELFObjectFile.h"
+#include "llvm/Object/ELFTypes.h"
+#include "llvm/Object/Error.h"
+#include "llvm/Option/Option.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compression.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/Memory.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdlib>
+#include <functional>
+#include <iterator>
+#include <memory>
+#include <string>
+#include <system_error>
+#include <utility>
+
+namespace llvm {
+namespace objcopy {
+namespace elf {
+
+using namespace object;
+using namespace ELF;
+using SectionPred = std::function<bool(const SectionBase &Sec)>;
+
+static bool isDebugSection(const SectionBase &Sec) {
+  return StringRef(Sec.Name).startswith(".debug") ||
+         StringRef(Sec.Name).startswith(".zdebug") || Sec.Name == ".gdb_index";
+}
+
+static bool isDWOSection(const SectionBase &Sec) {
+  return StringRef(Sec.Name).endswith(".dwo");
+}
+
+static bool onlyKeepDWOPred(const Object &Obj, const SectionBase &Sec) {
+  // We can't remove the section header string table.
+  if (&Sec == Obj.SectionNames)
+    return false;
+  // Short of keeping the string table we want to keep everything that is a DWO
+  // section and remove everything else.
+  return !isDWOSection(Sec);
+}
+
+static ElfType getOutputElfType(const Binary &Bin) {
+  // Infer output ELF type from the input ELF object
+  if (isa<ELFObjectFile<ELF32LE>>(Bin))
+    return ELFT_ELF32LE;
+  if (isa<ELFObjectFile<ELF64LE>>(Bin))
+    return ELFT_ELF64LE;
+  if (isa<ELFObjectFile<ELF32BE>>(Bin))
+    return ELFT_ELF32BE;
+  if (isa<ELFObjectFile<ELF64BE>>(Bin))
+    return ELFT_ELF64BE;
+  llvm_unreachable("Invalid ELFType");
+}
+
+static ElfType getOutputElfType(const MachineInfo &MI) {
+  // Infer output ELF type from the binary arch specified
+  if (MI.Is64Bit)
+    return MI.IsLittleEndian ? ELFT_ELF64LE : ELFT_ELF64BE;
+  else
+    return MI.IsLittleEndian ? ELFT_ELF32LE : ELFT_ELF32BE;
+}
+
+static std::unique_ptr<Writer> createWriter(const CopyConfig &Config,
+                                            Object &Obj, Buffer &Buf,
+                                            ElfType OutputElfType) {
+  if (Config.OutputFormat == "binary") {
+    return llvm::make_unique<BinaryWriter>(Obj, Buf);
+  }
+  // Depending on the initial ELFT and OutputFormat we need a different Writer.
+  switch (OutputElfType) {
+  case ELFT_ELF32LE:
+    return llvm::make_unique<ELFWriter<ELF32LE>>(Obj, Buf,
+                                                 !Config.StripSections);
+  case ELFT_ELF64LE:
+    return llvm::make_unique<ELFWriter<ELF64LE>>(Obj, Buf,
+                                                 !Config.StripSections);
+  case ELFT_ELF32BE:
+    return llvm::make_unique<ELFWriter<ELF32BE>>(Obj, Buf,
+                                                 !Config.StripSections);
+  case ELFT_ELF64BE:
+    return llvm::make_unique<ELFWriter<ELF64BE>>(Obj, Buf,
+                                                 !Config.StripSections);
+  }
+  llvm_unreachable("Invalid output format");
+}
+
+static void splitDWOToFile(const CopyConfig &Config, const Reader &Reader,
+                           StringRef File, ElfType OutputElfType) {
+  auto DWOFile = Reader.create();
+  DWOFile->removeSections(
+      [&](const SectionBase &Sec) { return onlyKeepDWOPred(*DWOFile, Sec); });
+  FileBuffer FB(File);
+  auto Writer = createWriter(Config, *DWOFile, FB, OutputElfType);
+  Writer->finalize();
+  Writer->write();
+}
+
+static Error dumpSectionToFile(StringRef SecName, StringRef Filename,
+                               Object &Obj) {
+  for (auto &Sec : Obj.sections()) {
+    if (Sec.Name == SecName) {
+      if (Sec.OriginalData.size() == 0)
+        return make_error<StringError>("Can't dump section \"" + SecName +
+                                           "\": it has no contents",
+                                       object_error::parse_failed);
+      Expected<std::unique_ptr<FileOutputBuffer>> BufferOrErr =
+          FileOutputBuffer::create(Filename, Sec.OriginalData.size());
+      if (!BufferOrErr)
+        return BufferOrErr.takeError();
+      std::unique_ptr<FileOutputBuffer> Buf = std::move(*BufferOrErr);
+      std::copy(Sec.OriginalData.begin(), Sec.OriginalData.end(),
+                Buf->getBufferStart());
+      if (Error E = Buf->commit())
+        return E;
+      return Error::success();
+    }
+  }
+  return make_error<StringError>("Section not found",
+                                 object_error::parse_failed);
+}
+
+static bool isCompressed(const SectionBase &Section) {
+  const char *Magic = "ZLIB";
+  return StringRef(Section.Name).startswith(".zdebug") ||
+         (Section.OriginalData.size() > strlen(Magic) &&
+          !strncmp(reinterpret_cast<const char *>(Section.OriginalData.data()),
+                   Magic, strlen(Magic))) ||
+         (Section.Flags & ELF::SHF_COMPRESSED);
+}
+
+static bool isCompressable(const SectionBase &Section) {
+  return !isCompressed(Section) && isDebugSection(Section) &&
+         Section.Name != ".gdb_index";
+}
+
+static void replaceDebugSections(
+    const CopyConfig &Config, Object &Obj, SectionPred &RemovePred,
+    function_ref<bool(const SectionBase &)> shouldReplace,
+    function_ref<SectionBase *(const SectionBase *)> addSection) {
+  SmallVector<SectionBase *, 13> ToReplace;
+  SmallVector<RelocationSection *, 13> RelocationSections;
+  for (auto &Sec : Obj.sections()) {
+    if (RelocationSection *R = dyn_cast<RelocationSection>(&Sec)) {
+      if (shouldReplace(*R->getSection()))
+        RelocationSections.push_back(R);
+      continue;
+    }
+
+    if (shouldReplace(Sec))
+      ToReplace.push_back(&Sec);
+  }
+
+  for (SectionBase *S : ToReplace) {
+    SectionBase *NewSection = addSection(S);
+
+    for (RelocationSection *RS : RelocationSections) {
+      if (RS->getSection() == S)
+        RS->setSection(NewSection);
+    }
+  }
+
+  RemovePred = [shouldReplace, RemovePred](const SectionBase &Sec) {
+    return shouldReplace(Sec) || RemovePred(Sec);
+  };
+}
+
+// This function handles the high level operations of GNU objcopy including
+// handling command line options. It's important to outline certain properties
+// we expect to hold of the command line operations. Any operation that "keeps"
+// should keep regardless of a remove. Additionally any removal should respect
+// any previous removals. Lastly whether or not something is removed shouldn't
+// depend a) on the order the options occur in or b) on some opaque priority
+// system. The only priority is that keeps/copies overrule removes.
+static void handleArgs(const CopyConfig &Config, Object &Obj,
+                       const Reader &Reader, ElfType OutputElfType) {
+
+  if (!Config.SplitDWO.empty()) {
+    splitDWOToFile(Config, Reader, Config.SplitDWO, OutputElfType);
+  }
+
+  // TODO: update or remove symbols only if there is an option that affects
+  // them.
+  if (Obj.SymbolTable) {
+    Obj.SymbolTable->updateSymbols([&](Symbol &Sym) {
+      if ((Config.LocalizeHidden &&
+           (Sym.Visibility == STV_HIDDEN || Sym.Visibility == STV_INTERNAL)) ||
+          (!Config.SymbolsToLocalize.empty() &&
+           is_contained(Config.SymbolsToLocalize, Sym.Name)))
+        Sym.Binding = STB_LOCAL;
+
+      // Note: these two globalize flags have very similar names but different
+      // meanings:
+      //
+      // --globalize-symbol: promote a symbol to global
+      // --keep-global-symbol: all symbols except for these should be made local
+      //
+      // If --globalize-symbol is specified for a given symbol, it will be
+      // global in the output file even if it is not included via
+      // --keep-global-symbol. Because of that, make sure to check
+      // --globalize-symbol second.
+      if (!Config.SymbolsToKeepGlobal.empty() &&
+          !is_contained(Config.SymbolsToKeepGlobal, Sym.Name))
+        Sym.Binding = STB_LOCAL;
+
+      if (!Config.SymbolsToGlobalize.empty() &&
+          is_contained(Config.SymbolsToGlobalize, Sym.Name))
+        Sym.Binding = STB_GLOBAL;
+
+      if (!Config.SymbolsToWeaken.empty() &&
+          is_contained(Config.SymbolsToWeaken, Sym.Name) &&
+          Sym.Binding == STB_GLOBAL)
+        Sym.Binding = STB_WEAK;
+
+      if (Config.Weaken && Sym.Binding == STB_GLOBAL &&
+          Sym.getShndx() != SHN_UNDEF)
+        Sym.Binding = STB_WEAK;
+
+      const auto I = Config.SymbolsToRename.find(Sym.Name);
+      if (I != Config.SymbolsToRename.end())
+        Sym.Name = I->getValue();
+
+      if (!Config.SymbolsPrefix.empty() && Sym.Type != STT_SECTION)
+        Sym.Name = (Config.SymbolsPrefix + Sym.Name).str();
+    });
+
+    // The purpose of this loop is to mark symbols referenced by sections
+    // (like GroupSection or RelocationSection). This way, we know which
+    // symbols are still 'needed' and which are not.
+    if (Config.StripUnneeded) {
+      for (auto &Section : Obj.sections())
+        Section.markSymbols();
+    }
+
+    Obj.removeSymbols([&](const Symbol &Sym) {
+      if ((!Config.SymbolsToKeep.empty() &&
+           is_contained(Config.SymbolsToKeep, Sym.Name)) ||
+          (Config.KeepFileSymbols && Sym.Type == STT_FILE))
+        return false;
+
+      if (Config.DiscardAll && Sym.Binding == STB_LOCAL &&
+          Sym.getShndx() != SHN_UNDEF && Sym.Type != STT_FILE &&
+          Sym.Type != STT_SECTION)
+        return true;
+
+      if (Config.StripAll || Config.StripAllGNU)
+        return true;
+
+      if (!Config.SymbolsToRemove.empty() &&
+          is_contained(Config.SymbolsToRemove, Sym.Name)) {
+        return true;
+      }
+
+      if (Config.StripUnneeded && !Sym.Referenced &&
+          (Sym.Binding == STB_LOCAL || Sym.getShndx() == SHN_UNDEF) &&
+          Sym.Type != STT_FILE && Sym.Type != STT_SECTION)
+        return true;
+
+      return false;
+    });
+  }
+
+  SectionPred RemovePred = [](const SectionBase &) { return false; };
+
+  // Removes:
+  if (!Config.ToRemove.empty()) {
+    RemovePred = [&Config](const SectionBase &Sec) {
+      return is_contained(Config.ToRemove, Sec.Name);
+    };
+  }
+
+  if (Config.StripDWO || !Config.SplitDWO.empty())
+    RemovePred = [RemovePred](const SectionBase &Sec) {
+      return isDWOSection(Sec) || RemovePred(Sec);
+    };
+
+  if (Config.ExtractDWO)
+    RemovePred = [RemovePred, &Obj](const SectionBase &Sec) {
+      return onlyKeepDWOPred(Obj, Sec) || RemovePred(Sec);
+    };
+
+  if (Config.StripAllGNU)
+    RemovePred = [RemovePred, &Obj](const SectionBase &Sec) {
+      if (RemovePred(Sec))
+        return true;
+      if ((Sec.Flags & SHF_ALLOC) != 0)
+        return false;
+      if (&Sec == Obj.SectionNames)
+        return false;
+      switch (Sec.Type) {
+      case SHT_SYMTAB:
+      case SHT_REL:
+      case SHT_RELA:
+      case SHT_STRTAB:
+        return true;
+      }
+      return isDebugSection(Sec);
+    };
+
+  if (Config.StripSections) {
+    RemovePred = [RemovePred](const SectionBase &Sec) {
+      return RemovePred(Sec) || (Sec.Flags & SHF_ALLOC) == 0;
+    };
+  }
+
+  if (Config.StripDebug) {
+    RemovePred = [RemovePred](const SectionBase &Sec) {
+      return RemovePred(Sec) || isDebugSection(Sec);
+    };
+  }
+
+  if (Config.StripNonAlloc)
+    RemovePred = [RemovePred, &Obj](const SectionBase &Sec) {
+      if (RemovePred(Sec))
+        return true;
+      if (&Sec == Obj.SectionNames)
+        return false;
+      return (Sec.Flags & SHF_ALLOC) == 0;
+    };
+
+  if (Config.StripAll)
+    RemovePred = [RemovePred, &Obj](const SectionBase &Sec) {
+      if (RemovePred(Sec))
+        return true;
+      if (&Sec == Obj.SectionNames)
+        return false;
+      if (StringRef(Sec.Name).startswith(".gnu.warning"))
+        return false;
+      return (Sec.Flags & SHF_ALLOC) == 0;
+    };
+
+  // Explicit copies:
+  if (!Config.OnlyKeep.empty()) {
+    RemovePred = [&Config, RemovePred, &Obj](const SectionBase &Sec) {
+      // Explicitly keep these sections regardless of previous removes.
+      if (is_contained(Config.OnlyKeep, Sec.Name))
+        return false;
+
+      // Allow all implicit removes.
+      if (RemovePred(Sec))
+        return true;
+
+      // Keep special sections.
+      if (Obj.SectionNames == &Sec)
+        return false;
+      if (Obj.SymbolTable == &Sec ||
+          (Obj.SymbolTable && Obj.SymbolTable->getStrTab() == &Sec))
+        return false;
+
+      // Remove everything else.
+      return true;
+    };
+  }
+
+  if (!Config.Keep.empty()) {
+    RemovePred = [Config, RemovePred](const SectionBase &Sec) {
+      // Explicitly keep these sections regardless of previous removes.
+      if (is_contained(Config.Keep, Sec.Name))
+        return false;
+      // Otherwise defer to RemovePred.
+      return RemovePred(Sec);
+    };
+  }
+
+  // This has to be the last predicate assignment.
+  // If the option --keep-symbol has been specified
+  // and at least one of those symbols is present
+  // (equivalently, the updated symbol table is not empty)
+  // the symbol table and the string table should not be removed.
+  if ((!Config.SymbolsToKeep.empty() || Config.KeepFileSymbols) &&
+      Obj.SymbolTable && !Obj.SymbolTable->empty()) {
+    RemovePred = [&Obj, RemovePred](const SectionBase &Sec) {
+      if (&Sec == Obj.SymbolTable || &Sec == Obj.SymbolTable->getStrTab())
+        return false;
+      return RemovePred(Sec);
+    };
+  }
+
+  if (Config.CompressionType != DebugCompressionType::None)
+    replaceDebugSections(Config, Obj, RemovePred, isCompressable,
+                         [&Config, &Obj](const SectionBase *S) {
+                           return &Obj.addSection<CompressedSection>(
+                               *S, Config.CompressionType);
+                         });
+  else if (Config.DecompressDebugSections)
+    replaceDebugSections(
+        Config, Obj, RemovePred,
+        [](const SectionBase &S) { return isa<CompressedSection>(&S); },
+        [&Obj](const SectionBase *S) {
+          auto CS = cast<CompressedSection>(S);
+          return &Obj.addSection<DecompressedSection>(*CS);
+        });
+
+  Obj.removeSections(RemovePred);
+
+  if (!Config.SectionsToRename.empty()) {
+    for (auto &Sec : Obj.sections()) {
+      const auto Iter = Config.SectionsToRename.find(Sec.Name);
+      if (Iter != Config.SectionsToRename.end()) {
+        const SectionRename &SR = Iter->second;
+        Sec.Name = SR.NewName;
+        if (SR.NewFlags.hasValue()) {
+          // Preserve some flags which should not be dropped when setting flags.
+          // Also, preserve anything OS/processor dependant.
+          const uint64_t PreserveMask = ELF::SHF_COMPRESSED | ELF::SHF_EXCLUDE |
+                                        ELF::SHF_GROUP | ELF::SHF_LINK_ORDER |
+                                        ELF::SHF_MASKOS | ELF::SHF_MASKPROC |
+                                        ELF::SHF_TLS | ELF::SHF_INFO_LINK;
+          Sec.Flags = (Sec.Flags & PreserveMask) |
+                      (SR.NewFlags.getValue() & ~PreserveMask);
+        }
+      }
+    }
+  }
+
+  if (!Config.AddSection.empty()) {
+    for (const auto &Flag : Config.AddSection) {
+      auto SecPair = Flag.split("=");
+      auto SecName = SecPair.first;
+      auto File = SecPair.second;
+      auto BufOrErr = MemoryBuffer::getFile(File);
+      if (!BufOrErr)
+        reportError(File, BufOrErr.getError());
+      auto Buf = std::move(*BufOrErr);
+      auto BufPtr = reinterpret_cast<const uint8_t *>(Buf->getBufferStart());
+      auto BufSize = Buf->getBufferSize();
+      Obj.addSection<OwnedDataSection>(SecName,
+                                       ArrayRef<uint8_t>(BufPtr, BufSize));
+    }
+  }
+
+  if (!Config.DumpSection.empty()) {
+    for (const auto &Flag : Config.DumpSection) {
+      std::pair<StringRef, StringRef> SecPair = Flag.split("=");
+      StringRef SecName = SecPair.first;
+      StringRef File = SecPair.second;
+      if (Error E = dumpSectionToFile(SecName, File, Obj))
+        reportError(Config.InputFilename, std::move(E));
+    }
+  }
+
+  if (!Config.AddGnuDebugLink.empty())
+    Obj.addSection<GnuDebugLinkSection>(Config.AddGnuDebugLink);
+}
+
+void executeObjcopyOnRawBinary(const CopyConfig &Config, MemoryBuffer &In,
+                               Buffer &Out) {
+  BinaryReader Reader(Config.BinaryArch, &In);
+  std::unique_ptr<Object> Obj = Reader.create();
+
+  const ElfType OutputElfType = getOutputElfType(Config.BinaryArch);
+  handleArgs(Config, *Obj, Reader, OutputElfType);
+  std::unique_ptr<Writer> Writer =
+      createWriter(Config, *Obj, Out, OutputElfType);
+  Writer->finalize();
+  Writer->write();
+}
+
+void executeObjcopyOnBinary(const CopyConfig &Config,
+                            object::ELFObjectFileBase &In, Buffer &Out) {
+  ELFReader Reader(&In);
+  std::unique_ptr<Object> Obj = Reader.create();
+  const ElfType OutputElfType = getOutputElfType(In);
+  handleArgs(Config, *Obj, Reader, OutputElfType);
+  std::unique_ptr<Writer> Writer =
+      createWriter(Config, *Obj, Out, OutputElfType);
+  Writer->finalize();
+  Writer->write();
+}
+
+} // end namespace elf
+} // end namespace objcopy
+} // end namespace llvm
diff --git a/tools/llvm-objcopy/ELF/ELFObjcopy.h b/tools/llvm-objcopy/ELF/ELFObjcopy.h
new file mode 100644
index 00000000000..43f41c00ce5
--- /dev/null
+++ b/tools/llvm-objcopy/ELF/ELFObjcopy.h
@@ -0,0 +1,34 @@
+//===- ELFObjcopy.h ---------------------------------------------*- C++ -*-===//
+//
+//                      The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_OBJCOPY_ELFOBJCOPY_H
+#define LLVM_TOOLS_OBJCOPY_ELFOBJCOPY_H
+
+namespace llvm {
+class MemoryBuffer;
+
+namespace object {
+class ELFObjectFileBase;
+} // end namespace object
+
+namespace objcopy {
+struct CopyConfig;
+class Buffer;
+
+namespace elf {
+void executeObjcopyOnRawBinary(const CopyConfig &Config, MemoryBuffer &In,
+                               Buffer &Out);
+void executeObjcopyOnBinary(const CopyConfig &Config,
+                            object::ELFObjectFileBase &In, Buffer &Out);
+
+} // end namespace elf
+} // end namespace objcopy
+} // end namespace llvm
+
+#endif // LLVM_TOOLS_OBJCOPY_ELFOBJCOPY_H
diff --git a/tools/llvm-objcopy/Object.cpp b/tools/llvm-objcopy/ELF/Object.cpp
similarity index 100%
rename from tools/llvm-objcopy/Object.cpp
rename to tools/llvm-objcopy/ELF/Object.cpp
diff --git a/tools/llvm-objcopy/Object.h b/tools/llvm-objcopy/ELF/Object.h
similarity index 100%
rename from tools/llvm-objcopy/Object.h
rename to tools/llvm-objcopy/ELF/Object.h
diff --git a/tools/llvm-objcopy/llvm-objcopy.cpp b/tools/llvm-objcopy/llvm-objcopy.cpp
index b7e2361cc01..deaea5eff85 100644
--- a/tools/llvm-objcopy/llvm-objcopy.cpp
+++ b/tools/llvm-objcopy/llvm-objcopy.cpp
@@ -10,16 +10,12 @@
 #include "llvm-objcopy.h"
 #include "Buffer.h"
 #include "CopyConfig.h"
-#include "Object.h"
+#include "ELF/ELFObjcopy.h"
 
-#include "llvm/ADT/BitmaskEnum.h"
-#include "llvm/ADT/Optional.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
-#include "llvm/BinaryFormat/ELF.h"
-#include "llvm/MC/MCTargetOptions.h"
 #include "llvm/Object/Archive.h"
 #include "llvm/Object/ArchiveWriter.h"
 #include "llvm/Object/Binary.h"
@@ -30,13 +26,9 @@
 #include "llvm/Option/ArgList.h"
 #include "llvm/Option/Option.h"
 #include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Compression.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/ErrorOr.h"
-#include "llvm/Support/FileOutputBuffer.h"
 #include "llvm/Support/InitLLVM.h"
 #include "llvm/Support/Memory.h"
 #include "llvm/Support/Path.h"
@@ -46,8 +38,6 @@
 #include <algorithm>
 #include <cassert>
 #include <cstdlib>
-#include <functional>
-#include <iterator>
 #include <memory>
 #include <string>
 #include <system_error>
@@ -85,466 +75,6 @@ LLVM_ATTRIBUTE_NORETURN void reportError(StringRef File, Error E) {
 } // end namespace objcopy
 } // end namespace llvm
 
-// TODO: move everything enclosed in the namespace llvm::objcopy::elf
-// into separate header+cpp files.
-namespace llvm {
-namespace objcopy {
-namespace elf {
-
-using namespace object;
-using namespace ELF;
-using SectionPred = std::function<bool(const SectionBase &Sec)>;
-
-static bool isDebugSection(const SectionBase &Sec) {
-  return StringRef(Sec.Name).startswith(".debug") ||
-         StringRef(Sec.Name).startswith(".zdebug") || Sec.Name == ".gdb_index";
-}
-
-static bool isDWOSection(const SectionBase &Sec) {
-  return StringRef(Sec.Name).endswith(".dwo");
-}
-
-static bool onlyKeepDWOPred(const Object &Obj, const SectionBase &Sec) {
-  // We can't remove the section header string table.
-  if (&Sec == Obj.SectionNames)
-    return false;
-  // Short of keeping the string table we want to keep everything that is a DWO
-  // section and remove everything else.
-  return !isDWOSection(Sec);
-}
-
-static ElfType getOutputElfType(const Binary &Bin) {
-  // Infer output ELF type from the input ELF object
-  if (isa<ELFObjectFile<ELF32LE>>(Bin))
-    return ELFT_ELF32LE;
-  if (isa<ELFObjectFile<ELF64LE>>(Bin))
-    return ELFT_ELF64LE;
-  if (isa<ELFObjectFile<ELF32BE>>(Bin))
-    return ELFT_ELF32BE;
-  if (isa<ELFObjectFile<ELF64BE>>(Bin))
-    return ELFT_ELF64BE;
-  llvm_unreachable("Invalid ELFType");
-}
-
-static ElfType getOutputElfType(const MachineInfo &MI) {
-  // Infer output ELF type from the binary arch specified
-  if (MI.Is64Bit)
-    return MI.IsLittleEndian ? ELFT_ELF64LE : ELFT_ELF64BE;
-  else
-    return MI.IsLittleEndian ? ELFT_ELF32LE : ELFT_ELF32BE;
-}
-
-static std::unique_ptr<Writer> createWriter(const CopyConfig &Config,
-                                            Object &Obj, Buffer &Buf,
-                                            ElfType OutputElfType) {
-  if (Config.OutputFormat == "binary") {
-    return llvm::make_unique<BinaryWriter>(Obj, Buf);
-  }
-  // Depending on the initial ELFT and OutputFormat we need a different Writer.
-  switch (OutputElfType) {
-  case ELFT_ELF32LE:
-    return llvm::make_unique<ELFWriter<ELF32LE>>(Obj, Buf,
-                                                 !Config.StripSections);
-  case ELFT_ELF64LE:
-    return llvm::make_unique<ELFWriter<ELF64LE>>(Obj, Buf,
-                                                 !Config.StripSections);
-  case ELFT_ELF32BE:
-    return llvm::make_unique<ELFWriter<ELF32BE>>(Obj, Buf,
-                                                 !Config.StripSections);
-  case ELFT_ELF64BE:
-    return llvm::make_unique<ELFWriter<ELF64BE>>(Obj, Buf,
-                                                 !Config.StripSections);
-  }
-  llvm_unreachable("Invalid output format");
-}
-
-static void splitDWOToFile(const CopyConfig &Config, const Reader &Reader,
-                           StringRef File, ElfType OutputElfType) {
-  auto DWOFile = Reader.create();
-  DWOFile->removeSections(
-      [&](const SectionBase &Sec) { return onlyKeepDWOPred(*DWOFile, Sec); });
-  FileBuffer FB(File);
-  auto Writer = createWriter(Config, *DWOFile, FB, OutputElfType);
-  Writer->finalize();
-  Writer->write();
-}
-
-static Error dumpSectionToFile(StringRef SecName, StringRef Filename,
-                               Object &Obj) {
-  for (auto &Sec : Obj.sections()) {
-    if (Sec.Name == SecName) {
-      if (Sec.OriginalData.size() == 0)
-        return make_error<StringError>("Can't dump section \"" + SecName +
-                                           "\": it has no contents",
-                                       object_error::parse_failed);
-      Expected<std::unique_ptr<FileOutputBuffer>> BufferOrErr =
-          FileOutputBuffer::create(Filename, Sec.OriginalData.size());
-      if (!BufferOrErr)
-        return BufferOrErr.takeError();
-      std::unique_ptr<FileOutputBuffer> Buf = std::move(*BufferOrErr);
-      std::copy(Sec.OriginalData.begin(), Sec.OriginalData.end(),
-                Buf->getBufferStart());
-      if (Error E = Buf->commit())
-        return E;
-      return Error::success();
-    }
-  }
-  return make_error<StringError>("Section not found",
-                                 object_error::parse_failed);
-}
-
-static bool isCompressed(const SectionBase &Section) {
-  const char *Magic = "ZLIB";
-  return StringRef(Section.Name).startswith(".zdebug") ||
-         (Section.OriginalData.size() > strlen(Magic) &&
-          !strncmp(reinterpret_cast<const char *>(Section.OriginalData.data()),
-                   Magic, strlen(Magic))) ||
-         (Section.Flags & ELF::SHF_COMPRESSED);
-}
-
-static bool isCompressable(const SectionBase &Section) {
-  return !isCompressed(Section) && isDebugSection(Section) &&
-         Section.Name != ".gdb_index";
-}
-
-static void replaceDebugSections(
-    const CopyConfig &Config, Object &Obj, SectionPred &RemovePred,
-    function_ref<bool(const SectionBase &)> shouldReplace,
-    function_ref<SectionBase *(const SectionBase *)> addSection) {
-  SmallVector<SectionBase *, 13> ToReplace;
-  SmallVector<RelocationSection *, 13> RelocationSections;
-  for (auto &Sec : Obj.sections()) {
-    if (RelocationSection *R = dyn_cast<RelocationSection>(&Sec)) {
-      if (shouldReplace(*R->getSection()))
-        RelocationSections.push_back(R);
-      continue;
-    }
-
-    if (shouldReplace(Sec))
-      ToReplace.push_back(&Sec);
-  }
-
-  for (SectionBase *S : ToReplace) {
-    SectionBase *NewSection = addSection(S);
-
-    for (RelocationSection *RS : RelocationSections) {
-      if (RS->getSection() == S)
-        RS->setSection(NewSection);
-    }
-  }
-
-  RemovePred = [shouldReplace, RemovePred](const SectionBase &Sec) {
-    return shouldReplace(Sec) || RemovePred(Sec);
-  };
-}
-
-// This function handles the high level operations of GNU objcopy including
-// handling command line options. It's important to outline certain properties
-// we expect to hold of the command line operations. Any operation that "keeps"
-// should keep regardless of a remove. Additionally any removal should respect
-// any previous removals. Lastly whether or not something is removed shouldn't
-// depend a) on the order the options occur in or b) on some opaque priority
-// system. The only priority is that keeps/copies overrule removes.
-static void handleArgs(const CopyConfig &Config, Object &Obj,
-                       const Reader &Reader, ElfType OutputElfType) {
-
-  if (!Config.SplitDWO.empty()) {
-    splitDWOToFile(Config, Reader, Config.SplitDWO, OutputElfType);
-  }
-
-  // TODO: update or remove symbols only if there is an option that affects
-  // them.
-  if (Obj.SymbolTable) {
-    Obj.SymbolTable->updateSymbols([&](Symbol &Sym) {
-      if ((Config.LocalizeHidden &&
-           (Sym.Visibility == STV_HIDDEN || Sym.Visibility == STV_INTERNAL)) ||
-          (!Config.SymbolsToLocalize.empty() &&
-           is_contained(Config.SymbolsToLocalize, Sym.Name)))
-        Sym.Binding = STB_LOCAL;
-
-      // Note: these two globalize flags have very similar names but different
-      // meanings:
-      //
-      // --globalize-symbol: promote a symbol to global
-      // --keep-global-symbol: all symbols except for these should be made local
-      //
-      // If --globalize-symbol is specified for a given symbol, it will be
-      // global in the output file even if it is not included via
-      // --keep-global-symbol. Because of that, make sure to check
-      // --globalize-symbol second.
-      if (!Config.SymbolsToKeepGlobal.empty() &&
-          !is_contained(Config.SymbolsToKeepGlobal, Sym.Name))
-        Sym.Binding = STB_LOCAL;
-
-      if (!Config.SymbolsToGlobalize.empty() &&
-          is_contained(Config.SymbolsToGlobalize, Sym.Name))
-        Sym.Binding = STB_GLOBAL;
-
-      if (!Config.SymbolsToWeaken.empty() &&
-          is_contained(Config.SymbolsToWeaken, Sym.Name) &&
-          Sym.Binding == STB_GLOBAL)
-        Sym.Binding = STB_WEAK;
-
-      if (Config.Weaken && Sym.Binding == STB_GLOBAL &&
-          Sym.getShndx() != SHN_UNDEF)
-        Sym.Binding = STB_WEAK;
-
-      const auto I = Config.SymbolsToRename.find(Sym.Name);
-      if (I != Config.SymbolsToRename.end())
-        Sym.Name = I->getValue();
-
-      if (!Config.SymbolsPrefix.empty() && Sym.Type != STT_SECTION)
-        Sym.Name = (Config.SymbolsPrefix + Sym.Name).str();
-    });
-
-    // The purpose of this loop is to mark symbols referenced by sections
-    // (like GroupSection or RelocationSection). This way, we know which
-    // symbols are still 'needed' and which are not.
-    if (Config.StripUnneeded) {
-      for (auto &Section : Obj.sections())
-        Section.markSymbols();
-    }
-
-    Obj.removeSymbols([&](const Symbol &Sym) {
-      if ((!Config.SymbolsToKeep.empty() &&
-           is_contained(Config.SymbolsToKeep, Sym.Name)) ||
-          (Config.KeepFileSymbols && Sym.Type == STT_FILE))
-        return false;
-
-      if (Config.DiscardAll && Sym.Binding == STB_LOCAL &&
-          Sym.getShndx() != SHN_UNDEF && Sym.Type != STT_FILE &&
-          Sym.Type != STT_SECTION)
-        return true;
-
-      if (Config.StripAll || Config.StripAllGNU)
-        return true;
-
-      if (!Config.SymbolsToRemove.empty() &&
-          is_contained(Config.SymbolsToRemove, Sym.Name)) {
-        return true;
-      }
-
-      if (Config.StripUnneeded && !Sym.Referenced &&
-          (Sym.Binding == STB_LOCAL || Sym.getShndx() == SHN_UNDEF) &&
-          Sym.Type != STT_FILE && Sym.Type != STT_SECTION)
-        return true;
-
-      return false;
-    });
-  }
-
-  SectionPred RemovePred = [](const SectionBase &) { return false; };
-
-  // Removes:
-  if (!Config.ToRemove.empty()) {
-    RemovePred = [&Config](const SectionBase &Sec) {
-      return is_contained(Config.ToRemove, Sec.Name);
-    };
-  }
-
-  if (Config.StripDWO || !Config.SplitDWO.empty())
-    RemovePred = [RemovePred](const SectionBase &Sec) {
-      return isDWOSection(Sec) || RemovePred(Sec);
-    };
-
-  if (Config.ExtractDWO)
-    RemovePred = [RemovePred, &Obj](const SectionBase &Sec) {
-      return onlyKeepDWOPred(Obj, Sec) || RemovePred(Sec);
-    };
-
-  if (Config.StripAllGNU)
-    RemovePred = [RemovePred, &Obj](const SectionBase &Sec) {
-      if (RemovePred(Sec))
-        return true;
-      if ((Sec.Flags & SHF_ALLOC) != 0)
-        return false;
-      if (&Sec == Obj.SectionNames)
-        return false;
-      switch (Sec.Type) {
-      case SHT_SYMTAB:
-      case SHT_REL:
-      case SHT_RELA:
-      case SHT_STRTAB:
-        return true;
-      }
-      return isDebugSection(Sec);
-    };
-
-  if (Config.StripSections) {
-    RemovePred = [RemovePred](const SectionBase &Sec) {
-      return RemovePred(Sec) || (Sec.Flags & SHF_ALLOC) == 0;
-    };
-  }
-
-  if (Config.StripDebug) {
-    RemovePred = [RemovePred](const SectionBase &Sec) {
-      return RemovePred(Sec) || isDebugSection(Sec);
-    };
-  }
-
-  if (Config.StripNonAlloc)
-    RemovePred = [RemovePred, &Obj](const SectionBase &Sec) {
-      if (RemovePred(Sec))
-        return true;
-      if (&Sec == Obj.SectionNames)
-        return false;
-      return (Sec.Flags & SHF_ALLOC) == 0;
-    };
-
-  if (Config.StripAll)
-    RemovePred = [RemovePred, &Obj](const SectionBase &Sec) {
-      if (RemovePred(Sec))
-        return true;
-      if (&Sec == Obj.SectionNames)
-        return false;
-      if (StringRef(Sec.Name).startswith(".gnu.warning"))
-        return false;
-      return (Sec.Flags & SHF_ALLOC) == 0;
-    };
-
-  // Explicit copies:
-  if (!Config.OnlyKeep.empty()) {
-    RemovePred = [&Config, RemovePred, &Obj](const SectionBase &Sec) {
-      // Explicitly keep these sections regardless of previous removes.
-      if (is_contained(Config.OnlyKeep, Sec.Name))
-        return false;
-
-      // Allow all implicit removes.
-      if (RemovePred(Sec))
-        return true;
-
-      // Keep special sections.
-      if (Obj.SectionNames == &Sec)
-        return false;
-      if (Obj.SymbolTable == &Sec ||
-          (Obj.SymbolTable && Obj.SymbolTable->getStrTab() == &Sec))
-        return false;
-
-      // Remove everything else.
-      return true;
-    };
-  }
-
-  if (!Config.Keep.empty()) {
-    RemovePred = [Config, RemovePred](const SectionBase &Sec) {
-      // Explicitly keep these sections regardless of previous removes.
-      if (is_contained(Config.Keep, Sec.Name))
-        return false;
-      // Otherwise defer to RemovePred.
-      return RemovePred(Sec);
-    };
-  }
-
-  // This has to be the last predicate assignment.
-  // If the option --keep-symbol has been specified
-  // and at least one of those symbols is present
-  // (equivalently, the updated symbol table is not empty)
-  // the symbol table and the string table should not be removed.
-  if ((!Config.SymbolsToKeep.empty() || Config.KeepFileSymbols) &&
-      Obj.SymbolTable && !Obj.SymbolTable->empty()) {
-    RemovePred = [&Obj, RemovePred](const SectionBase &Sec) {
-      if (&Sec == Obj.SymbolTable || &Sec == Obj.SymbolTable->getStrTab())
-        return false;
-      return RemovePred(Sec);
-    };
-  }
-
-  if (Config.CompressionType != DebugCompressionType::None)
-    replaceDebugSections(Config, Obj, RemovePred, isCompressable,
-                         [&Config, &Obj](const SectionBase *S) {
-                           return &Obj.addSection<CompressedSection>(
-                               *S, Config.CompressionType);
-                         });
-  else if (Config.DecompressDebugSections)
-    replaceDebugSections(
-        Config, Obj, RemovePred,
-        [](const SectionBase &S) { return isa<CompressedSection>(&S); },
-        [&Obj](const SectionBase *S) {
-          auto CS = cast<CompressedSection>(S);
-          return &Obj.addSection<DecompressedSection>(*CS);
-        });
-
-  Obj.removeSections(RemovePred);
-
-  if (!Config.SectionsToRename.empty()) {
-    for (auto &Sec : Obj.sections()) {
-      const auto Iter = Config.SectionsToRename.find(Sec.Name);
-      if (Iter != Config.SectionsToRename.end()) {
-        const SectionRename &SR = Iter->second;
-        Sec.Name = SR.NewName;
-        if (SR.NewFlags.hasValue()) {
-          // Preserve some flags which should not be dropped when setting flags.
-          // Also, preserve anything OS/processor dependant.
-          const uint64_t PreserveMask = ELF::SHF_COMPRESSED | ELF::SHF_EXCLUDE |
-                                        ELF::SHF_GROUP | ELF::SHF_LINK_ORDER |
-                                        ELF::SHF_MASKOS | ELF::SHF_MASKPROC |
-                                        ELF::SHF_TLS | ELF::SHF_INFO_LINK;
-          Sec.Flags = (Sec.Flags & PreserveMask) |
-                      (SR.NewFlags.getValue() & ~PreserveMask);
-        }
-      }
-    }
-  }
-
-  if (!Config.AddSection.empty()) {
-    for (const auto &Flag : Config.AddSection) {
-      auto SecPair = Flag.split("=");
-      auto SecName = SecPair.first;
-      auto File = SecPair.second;
-      auto BufOrErr = MemoryBuffer::getFile(File);
-      if (!BufOrErr)
-        reportError(File, BufOrErr.getError());
-      auto Buf = std::move(*BufOrErr);
-      auto BufPtr = reinterpret_cast<const uint8_t *>(Buf->getBufferStart());
-      auto BufSize = Buf->getBufferSize();
-      Obj.addSection<OwnedDataSection>(SecName,
-                                       ArrayRef<uint8_t>(BufPtr, BufSize));
-    }
-  }
-
-  if (!Config.DumpSection.empty()) {
-    for (const auto &Flag : Config.DumpSection) {
-      std::pair<StringRef, StringRef> SecPair = Flag.split("=");
-      StringRef SecName = SecPair.first;
-      StringRef File = SecPair.second;
-      if (Error E = dumpSectionToFile(SecName, File, Obj))
-        reportError(Config.InputFilename, std::move(E));
-    }
-  }
-
-  if (!Config.AddGnuDebugLink.empty())
-    Obj.addSection<GnuDebugLinkSection>(Config.AddGnuDebugLink);
-}
-
-void executeObjcopyOnRawBinary(const CopyConfig &Config, MemoryBuffer &In,
-                               Buffer &Out) {
-  BinaryReader Reader(Config.BinaryArch, &In);
-  std::unique_ptr<Object> Obj = Reader.create();
-
-  const ElfType OutputElfType = getOutputElfType(Config.BinaryArch);
-  handleArgs(Config, *Obj, Reader, OutputElfType);
-  std::unique_ptr<Writer> Writer =
-      createWriter(Config, *Obj, Out, OutputElfType);
-  Writer->finalize();
-  Writer->write();
-}
-
-void executeObjcopyOnBinary(const CopyConfig &Config,
-                            object::ELFObjectFileBase &In, Buffer &Out) {
-  ELFReader Reader(&In);
-  std::unique_ptr<Object> Obj = Reader.create();
-  const ElfType OutputElfType = getOutputElfType(In);
-  handleArgs(Config, *Obj, Reader, OutputElfType);
-  std::unique_ptr<Writer> Writer =
-      createWriter(Config, *Obj, Out, OutputElfType);
-  Writer->finalize();
-  Writer->write();
-}
-
-} // end namespace elf
-} // end namespace objcopy
-} // end namespace llvm
-
 using namespace llvm;
 using namespace llvm::object;
 using namespace llvm::objcopy;
-- 
GitLab


From 3efd3a74afeb0ba923837ee067b8d1ca4ce2aeb6 Mon Sep 17 00:00:00 2001
From: Wolfgang Pieb <Wolfgang.Pieb@sony.com>
Date: Mon, 29 Oct 2018 22:16:47 +0000
Subject: [PATCH 0720/1116] [DWARF][NFC] Refactor range list extraction and
 dumping

The purpose of this patch is twofold:
- Fold pre-DWARF v5 functionality into v5 to eliminate the need for 2 different
  versions of range list handling. We get rid of DWARFDebugRangelist{.cpp,.h}.
- Templatize the handling of range list tables so that location list handling
  can take advantage of it as well. Location list and range list tables have the
  same basic layout.

A non-NFC version of this patch was previously submitted with r342218, but it caused
errors with some TSan tests. This patch has no functional changes. The difference to
the non-NFC patch is that there are no changes to rangelist dumping in this patch.

Differential Revision: https://reviews.llvm.org/D53545


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345546 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/DebugInfo/DWARF/DWARFContext.h   |   8 +-
 .../DebugInfo/DWARF/DWARFDebugRangeList.h     |  85 ---------
 .../llvm/DebugInfo/DWARF/DWARFDebugRnglists.h |  38 +++-
 include/llvm/DebugInfo/DWARF/DWARFListTable.h | 165 +++++++++++++-----
 include/llvm/DebugInfo/DWARF/DWARFUnit.h      |   7 -
 lib/DebugInfo/DWARF/CMakeLists.txt            |   1 -
 lib/DebugInfo/DWARF/DWARFContext.cpp          |  74 ++++----
 lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp   |  96 ----------
 lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp    |  96 +++++++---
 lib/DebugInfo/DWARF/DWARFDie.cpp              |   1 -
 lib/DebugInfo/DWARF/DWARFListTable.cpp        |  74 +++++---
 lib/DebugInfo/DWARF/DWARFUnit.cpp             | 101 ++++++-----
 tools/dsymutil/DwarfLinker.cpp                |  23 +--
 tools/dsymutil/DwarfStreamer.cpp              |  19 +-
 tools/dsymutil/DwarfStreamer.h                |   4 +-
 15 files changed, 394 insertions(+), 398 deletions(-)
 delete mode 100644 include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h
 delete mode 100644 lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp

diff --git a/include/llvm/DebugInfo/DWARF/DWARFContext.h b/include/llvm/DebugInfo/DWARF/DWARFContext.h
index 221f1f79698..13bcdd25c32 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFContext.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFContext.h
@@ -231,16 +231,16 @@ public:
   /// Get a DIE given an exact offset.
   DWARFDie getDIEForOffset(uint32_t Offset);
 
-  unsigned getMaxVersion() {
+  unsigned getMaxVersion(uint16_t DefaultVersion = 0) {
     // Ensure info units have been parsed to discover MaxVersion
     info_section_units();
-    return MaxVersion;
+    return MaxVersion ? MaxVersion : DefaultVersion;
   }
 
-  unsigned getMaxDWOVersion() {
+  unsigned getMaxDWOVersion(uint16_t DefaultVersion = 0) {
     // Ensure DWO info units have been parsed to discover MaxVersion
     dwo_info_section_units();
-    return MaxVersion;
+    return MaxVersion ? MaxVersion : DefaultVersion;
   }
 
   void setMaxVersionIfGreater(unsigned Version) {
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h b/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h
deleted file mode 100644
index bc26edf0064..00000000000
--- a/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h
+++ /dev/null
@@ -1,85 +0,0 @@
-//===- DWARFDebugRangeList.h ------------------------------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_DEBUGINFO_DWARF_DWARFDEBUGRANGELIST_H
-#define LLVM_DEBUGINFO_DWARF_DWARFDEBUGRANGELIST_H
-
-#include "llvm/DebugInfo/DWARF/DWARFAddressRange.h"
-#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h"
-#include <cassert>
-#include <cstdint>
-#include <vector>
-
-namespace llvm {
-
-class raw_ostream;
-
-class DWARFDebugRangeList {
-public:
-  struct RangeListEntry {
-    /// A beginning address offset. This address offset has the size of an
-    /// address and is relative to the applicable base address of the
-    /// compilation unit referencing this range list. It marks the beginning
-    /// of an address range.
-    uint64_t StartAddress;
-    /// An ending address offset. This address offset again has the size of
-    /// an address and is relative to the applicable base address of the
-    /// compilation unit referencing this range list. It marks the first
-    /// address past the end of the address range. The ending address must
-    /// be greater than or equal to the beginning address.
-    uint64_t EndAddress;
-    /// A section index this range belongs to.
-    uint64_t SectionIndex;
-
-    /// The end of any given range list is marked by an end of list entry,
-    /// which consists of a 0 for the beginning address offset
-    /// and a 0 for the ending address offset.
-    bool isEndOfListEntry() const {
-      return (StartAddress == 0) && (EndAddress == 0);
-    }
-
-    /// A base address selection entry consists of:
-    /// 1. The value of the largest representable address offset
-    /// (for example, 0xffffffff when the size of an address is 32 bits).
-    /// 2. An address, which defines the appropriate base address for
-    /// use in interpreting the beginning and ending address offsets of
-    /// subsequent entries of the location list.
-    bool isBaseAddressSelectionEntry(uint8_t AddressSize) const {
-      assert(AddressSize == 4 || AddressSize == 8);
-      if (AddressSize == 4)
-        return StartAddress == -1U;
-      else
-        return StartAddress == -1ULL;
-    }
-  };
-
-private:
-  /// Offset in .debug_ranges section.
-  uint32_t Offset;
-  uint8_t AddressSize;
-  std::vector<RangeListEntry> Entries;
-
-public:
-  DWARFDebugRangeList() { clear(); }
-
-  void clear();
-  void dump(raw_ostream &OS) const;
-  Error extract(const DWARFDataExtractor &data, uint32_t *offset_ptr);
-  const std::vector<RangeListEntry> &getEntries() { return Entries; }
-
-  /// getAbsoluteRanges - Returns absolute address ranges defined by this range
-  /// list. Has to be passed base address of the compile unit referencing this
-  /// range list.
-  DWARFAddressRangesVector
-  getAbsoluteRanges(llvm::Optional<SectionedAddress> BaseAddr) const;
-};
-
-} // end namespace llvm
-
-#endif // LLVM_DEBUGINFO_DWARF_DWARFDEBUGRANGELIST_H
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugRnglists.h b/include/llvm/DebugInfo/DWARF/DWARFDebugRnglists.h
index 5cc8d789e59..1f4b7717e23 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFDebugRnglists.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugRnglists.h
@@ -13,8 +13,8 @@
 #include "llvm/ADT/Optional.h"
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/DebugInfo/DIContext.h"
+#include "llvm/DebugInfo/DWARF/DWARFAddressRange.h"
 #include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h"
-#include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h"
 #include "llvm/DebugInfo/DWARF/DWARFListTable.h"
 #include <cstdint>
 #include <map>
@@ -22,6 +22,8 @@
 
 namespace llvm {
 
+struct BaseAddress;
+class DWARFContext;
 class Error;
 class raw_ostream;
 class DWARFUnit;
@@ -35,12 +37,30 @@ struct RangeListEntry : public DWARFListEntryBase {
   uint64_t Value0;
   uint64_t Value1;
 
-  Error extract(DWARFDataExtractor Data, uint32_t End, uint32_t *OffsetPtr);
-  void dump(raw_ostream &OS, uint8_t AddrSize, uint8_t MaxEncodingStringLength,
-            uint64_t &CurrentBase, DIDumpOptions DumpOpts,
+  Error extract(DWARFDataExtractor Data, uint32_t End, uint16_t Version,
+                StringRef SectionName, uint32_t *OffsetPtr, bool isDWO = false);
+  bool isEndOfList() const { return EntryKind == dwarf::DW_RLE_end_of_list; }
+  bool isBaseAddressSelectionEntry() const {
+    return EntryKind == dwarf::DW_RLE_base_address;
+  }
+  uint64_t getStartAddress() const {
+    assert((EntryKind == dwarf::DW_RLE_start_end ||
+            EntryKind == dwarf::DW_RLE_offset_pair ||
+            EntryKind == dwarf::DW_RLE_startx_length) &&
+           "Unexpected range list entry kind");
+    return Value0;
+  }
+  uint64_t getEndAddress() const {
+    assert((EntryKind == dwarf::DW_RLE_start_end ||
+            EntryKind == dwarf::DW_RLE_offset_pair) &&
+           "Unexpected range list entry kind");
+    return Value1;
+  }
+  void dump(raw_ostream &OS, DWARFContext *C, uint8_t AddrSize, 
+            uint64_t &CurrentBase, unsigned Indent, uint16_t Version,
+            uint8_t MaxEncodingStringLength, DIDumpOptions DumpOpts,
             llvm::function_ref<Optional<SectionedAddress>(uint32_t)>
                 LookupPooledAddress) const;
-  bool isSentinel() const { return EntryKind == dwarf::DW_RLE_end_of_list; }
 };
 
 /// A class representing a single rangelist.
@@ -54,10 +74,12 @@ public:
 
 class DWARFDebugRnglistTable : public DWARFListTableBase<DWARFDebugRnglist> {
 public:
-  DWARFDebugRnglistTable()
-      : DWARFListTableBase(/* SectionName    = */ ".debug_rnglists",
+  DWARFDebugRnglistTable(DWARFContext *C, StringRef SectionName,
+                         bool isDWO = false)
+      : DWARFListTableBase(C, SectionName, isDWO,
                            /* HeaderString   = */ "ranges:",
-                           /* ListTypeString = */ "range") {}
+                           /* ListTypeString = */ "range",
+                           dwarf::RangeListEncodingString) {}
 };
 
 } // end namespace llvm
diff --git a/include/llvm/DebugInfo/DWARF/DWARFListTable.h b/include/llvm/DebugInfo/DWARF/DWARFListTable.h
index 9b987314f20..66a96dfd610 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFListTable.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFListTable.h
@@ -23,6 +23,8 @@
 
 namespace llvm {
 
+class DWARFContext;
+
 /// A base class for DWARF list entries, such as range or location list
 /// entries.
 struct DWARFListEntryBase {
@@ -37,6 +39,7 @@ struct DWARFListEntryBase {
 /// A base class for lists of entries that are extracted from a particular
 /// section, such as range lists or location lists.
 template <typename ListEntryType> class DWARFListType {
+public:
   using EntryType = ListEntryType;
   using ListEntries = std::vector<EntryType>;
 
@@ -45,11 +48,26 @@ protected:
 
 public:
   const ListEntries &getEntries() const { return Entries; }
-  bool empty() const { return Entries.empty(); }
+  bool empty() const {
+    return Entries.empty() || Entries.begin()->isEndOfList();
+  }
   void clear() { Entries.clear(); }
+  uint32_t getOffset() const {
+    if (Entries.empty())
+      return 0;
+    return Entries.begin()->Offset;
+  }
+
+  /// Extract a list. The caller must pass the correct DWARF version.
+  /// The end-of-list entry is retained as the last element of the vector of
+  /// entries.
   Error extract(DWARFDataExtractor Data, uint32_t HeaderOffset, uint32_t End,
-                uint32_t *OffsetPtr, StringRef SectionName,
-                StringRef ListStringName);
+                uint16_t Version, uint32_t *OffsetPtr, StringRef SectionName,
+                StringRef ListStringName, bool isDWO = false);
+  void dump(raw_ostream &OS, DWARFContext *C, uint8_t AddressSize,
+            uint64_t BaseAddress, unsigned Indent, uint16_t Version,
+            size_t MaxEncodingStringLength,
+            DIDumpOptions DumpOpts, llvm::function_ref<Optional<SectionedAddress>(uint32_t)> LookupPooledAddress) const;
 };
 
 /// A class representing the header of a list table such as the range list
@@ -67,9 +85,9 @@ class DWARFListTableHeader {
     uint8_t AddrSize;
     /// The size in bytes of a segment selector on the target architecture.
     /// If the target system uses a flat address space, this value is 0.
-    uint8_t SegSize;
+    uint8_t SegSize = 0;
     /// The number of offsets that follow the header before the range lists.
-    uint32_t OffsetEntryCount;
+    uint32_t OffsetEntryCount = 0;
   };
 
   Header HeaderData;
@@ -78,10 +96,10 @@ class DWARFListTableHeader {
   /// FIXME: Generate the table and use the appropriate forms.
   std::vector<uint32_t> Offsets;
   /// The table's format, either DWARF32 or DWARF64.
-  dwarf::DwarfFormat Format;
+  dwarf::DwarfFormat Format = dwarf::DwarfFormat::DWARF32;
   /// The offset at which the header (and hence the table) is located within
   /// its section.
-  uint32_t HeaderOffset;
+  uint32_t HeaderOffset = 0;
   /// The name of the section the list is located in.
   StringRef SectionName;
   /// A characterization of the list for dumping purposes, e.g. "range" or
@@ -97,9 +115,19 @@ public:
     Offsets.clear();
   }
   uint32_t getHeaderOffset() const { return HeaderOffset; }
+
   uint8_t getAddrSize() const { return HeaderData.AddrSize; }
+  void setAddrSize(uint8_t AddrSize) { HeaderData.AddrSize = AddrSize; }
+
   uint32_t getLength() const { return HeaderData.Length; }
+  void setLength(uint32_t Length) { HeaderData.Length = Length; }
+
   uint16_t getVersion() const { return HeaderData.Version; }
+  void setVersion(uint16_t Version) { HeaderData.Version = Version; }
+
+  uint8_t getSegSize() const { return HeaderData.SegSize; }
+  uint32_t getOffsetEntryCount() const { return HeaderData.OffsetEntryCount; }
+
   StringRef getSectionName() const { return SectionName; }
   StringRef getListTypeString() const { return ListTypeString; }
   dwarf::DwarfFormat getFormat() const { return Format; }
@@ -116,8 +144,10 @@ public:
 
   /// Returns the length of the table, including the length field, or 0 if the
   /// length has not been determined (e.g. because the table has not yet been
-  /// parsed, or there was a problem in parsing).
-  uint32_t length() const;
+  /// parsed, or there was a problem in parsing). In fake tables, such as for
+  /// DWARF v4 and earlier, there is no header, so the length simply reflects
+  /// the size of the section.
+  uint32_t getTableLength() const;
 };
 
 /// A class representing a table of lists as specified in the DWARF v5
@@ -130,14 +160,22 @@ template <typename DWARFListType> class DWARFListTableBase {
   /// A mapping between file offsets and lists. It is used to find a particular
   /// list based on an offset (obtained from DW_AT_ranges, for example).
   std::map<uint32_t, DWARFListType> ListMap;
+  DWARFContext *Ctx;
+  /// True if this list is located in a split-DWARF (dwo or dwp) file.
+  bool isDWO;
   /// This string is displayed as a heading before the list is dumped
   /// (e.g. "ranges:").
   StringRef HeaderString;
+  /// A function returning the encoding string for a given list entry encoding,
+  /// e.g. "DW_RLE_start_end".
+  std::function<StringRef(unsigned)> EncodingString;
 
 protected:
-  DWARFListTableBase(StringRef SectionName, StringRef HeaderString,
-                     StringRef ListTypeString)
-      : Header(SectionName, ListTypeString), HeaderString(HeaderString) {}
+  DWARFListTableBase(DWARFContext *C, StringRef SectionName, bool isDWO,
+                     StringRef HeaderString, StringRef ListTypeString,
+                     std::function<StringRef(unsigned)> EncodingString)
+      : Header(SectionName, ListTypeString), Ctx(C), isDWO(isDWO),
+        HeaderString(HeaderString), EncodingString(EncodingString) {}
 
 public:
   void clear() {
@@ -148,14 +186,28 @@ public:
   Error extractHeaderAndOffsets(DWARFDataExtractor Data, uint32_t *OffsetPtr) {
     return Header.extract(Data, OffsetPtr);
   }
+
+  /// Initialize the table header to explicit values. This is used for DWARF v4
+  /// and earlier since there is no header that can be extracted from a section.
+  void setHeaderData(uint32_t Length, uint16_t Version, uint8_t AddrSize) {
+    assert(Header.getSegSize() == 0 &&
+           "Unexpected segsize in list table header.");
+    assert(Header.getOffsetEntryCount() == 0 &&
+           "Unexpected offset entry count in list table header.");
+    Header.setLength(Length);
+    Header.setVersion(Version);
+    Header.setAddrSize(AddrSize);
+  }
+
   /// Extract an entire table, including all list entries.
-  Error extract(DWARFDataExtractor Data, uint32_t *OffsetPtr);
+  Error extract(DWARFDataExtractor Data, uint16_t Version, uint32_t *OffsetPtr);
   /// Look up a list based on a given offset. Extract it and enter it into the
   /// list map if necessary.
   Expected<DWARFListType> findList(DWARFDataExtractor Data, uint32_t Offset);
 
   uint32_t getHeaderOffset() const { return Header.getHeaderOffset(); }
   uint8_t getAddrSize() const { return Header.getAddrSize(); }
+  StringRef getListTypeString() const { return Header.getListTypeString(); }
 
   void dump(raw_ostream &OS,
             llvm::function_ref<Optional<SectionedAddress>(uint32_t)>
@@ -179,25 +231,35 @@ public:
     llvm_unreachable("Invalid DWARF format (expected DWARF32 or DWARF64");
   }
 
-  uint32_t length() { return Header.length(); }
+  uint16_t getVersion() const { return Header.getVersion(); }
+  uint32_t getLength() const { return Header.getTableLength(); }
 };
 
 template <typename DWARFListType>
 Error DWARFListTableBase<DWARFListType>::extract(DWARFDataExtractor Data,
+                                                 uint16_t Version,
                                                  uint32_t *OffsetPtr) {
+  assert(Version > 0 && "DWARF version required and not given.");
   clear();
-  if (Error E = extractHeaderAndOffsets(Data, OffsetPtr))
+  // For DWARF v4 and earlier, we cannot extract a table header, so we
+  // initialize it explicitly.
+  if (Version < 5)
+    setHeaderData(Data.size(), Version, Data.getAddressSize());
+  else if (Error E = extractHeaderAndOffsets(Data, OffsetPtr))
     return E;
 
   Data.setAddressSize(Header.getAddrSize());
-  uint32_t End = getHeaderOffset() + Header.length();
+  uint32_t End = getHeaderOffset() + getLength();
+  // Extract all lists.
   while (*OffsetPtr < End) {
     DWARFListType CurrentList;
     uint32_t Off = *OffsetPtr;
-    if (Error E = CurrentList.extract(Data, getHeaderOffset(), End, OffsetPtr,
-                                      Header.getSectionName(),
-                                      Header.getListTypeString()))
+    if (Error E = CurrentList.extract(
+            Data, getHeaderOffset(), End, Header.getVersion(), OffsetPtr,
+            Header.getSectionName(), Header.getListTypeString(), isDWO)) {
+      *OffsetPtr = End;
       return E;
+    }
     ListMap[Off] = CurrentList;
   }
 
@@ -208,22 +270,25 @@ Error DWARFListTableBase<DWARFListType>::extract(DWARFDataExtractor Data,
 }
 
 template <typename ListEntryType>
-Error DWARFListType<ListEntryType>::extract(DWARFDataExtractor Data,
-                                            uint32_t HeaderOffset, uint32_t End,
-                                            uint32_t *OffsetPtr,
-                                            StringRef SectionName,
-                                            StringRef ListTypeString) {
+Error DWARFListType<ListEntryType>::extract(
+    DWARFDataExtractor Data, uint32_t HeaderOffset, uint32_t End,
+    uint16_t Version, uint32_t *OffsetPtr, StringRef SectionName,
+    StringRef ListTypeString, bool isDWO) {
   if (*OffsetPtr < HeaderOffset || *OffsetPtr >= End)
     return createStringError(errc::invalid_argument,
                        "invalid %s list offset 0x%" PRIx32,
                        ListTypeString.data(), *OffsetPtr);
   Entries.clear();
+  uint32_t StartingOffset = *OffsetPtr;
   while (*OffsetPtr < End) {
     ListEntryType Entry;
-    if (Error E = Entry.extract(Data, End, OffsetPtr))
+    if (Error E =
+            Entry.extract(Data, End, Version, SectionName, OffsetPtr, isDWO))
       return E;
+    if (Version < 5)
+      Entry.Offset = StartingOffset;
     Entries.push_back(Entry);
-    if (Entry.isSentinel())
+    if (Entry.isEndOfList())
       return Error::success();
   }
   return createStringError(errc::illegal_byte_sequence,
@@ -232,31 +297,47 @@ Error DWARFListType<ListEntryType>::extract(DWARFDataExtractor Data,
                      SectionName.data(), HeaderOffset);
 }
 
+template <typename ListEntryType>
+void DWARFListType<ListEntryType>::dump(raw_ostream &OS, DWARFContext *C,
+                                        uint8_t AddressSize,
+                                        uint64_t BaseAddress, unsigned Indent,
+                                        uint16_t Version,
+                                        size_t MaxEncodingStringLength,
+                                        DIDumpOptions DumpOpts,
+                                        llvm::function_ref<Optional<SectionedAddress>(uint32_t)> LookupPooledAddress) const {
+  uint64_t CurrentBase = BaseAddress;
+  for (const auto &Entry : Entries)
+    Entry.dump(OS, C, AddressSize, CurrentBase, Indent, Version,
+               MaxEncodingStringLength, DumpOpts, LookupPooledAddress);
+}
+
 template <typename DWARFListType>
 void DWARFListTableBase<DWARFListType>::dump(
     raw_ostream &OS,
     llvm::function_ref<Optional<SectionedAddress>(uint32_t)>
         LookupPooledAddress,
     DIDumpOptions DumpOpts) const {
-  Header.dump(OS, DumpOpts);
-  OS << HeaderString << "\n";
-
   // Determine the length of the longest encoding string we have in the table,
   // so we can align the output properly. We only need this in verbose mode.
   size_t MaxEncodingStringLength = 0;
-  if (DumpOpts.Verbose) {
-    for (const auto &List : ListMap)
-      for (const auto &Entry : List.second.getEntries())
-        MaxEncodingStringLength =
-            std::max(MaxEncodingStringLength,
-                     dwarf::RangeListEncodingString(Entry.EntryKind).size());
+  // Don't dump the fake table header we create for DWARF v4 and earlier.
+  if (Header.getVersion() > 4) {
+    Header.dump(OS, DumpOpts);
+    OS << HeaderString << '\n';
+    // Determine the length of the longest encoding string we have in the table,
+    // so we can align the output properly. We only need this in verbose mode.
+    if (DumpOpts.Verbose)
+      for (const auto &List : ListMap)
+        for (const auto &Entry : List.second.getEntries())
+          MaxEncodingStringLength = std::max(
+              MaxEncodingStringLength, EncodingString(Entry.EntryKind).size());
   }
 
   uint64_t CurrentBase = 0;
   for (const auto &List : ListMap)
-    for (const auto &Entry : List.second.getEntries())
-      Entry.dump(OS, getAddrSize(), MaxEncodingStringLength, CurrentBase,
-                 DumpOpts, LookupPooledAddress);
+    List.second.dump(OS, Ctx, getAddrSize(), CurrentBase, 0,
+                     Header.getVersion(), MaxEncodingStringLength, DumpOpts,
+                     LookupPooledAddress);
 }
 
 template <typename DWARFListType>
@@ -269,11 +350,11 @@ DWARFListTableBase<DWARFListType>::findList(DWARFDataExtractor Data,
 
   // Extract the list from the section and enter it into the list map.
   DWARFListType List;
-  uint32_t End = getHeaderOffset() + Header.length();
+  uint32_t End = getHeaderOffset() + getLength();
   uint32_t StartingOffset = Offset;
-  if (Error E =
-          List.extract(Data, getHeaderOffset(), End, &Offset,
-                       Header.getSectionName(), Header.getListTypeString()))
+  if (Error E = List.extract(Data, getHeaderOffset(), End, Header.getVersion(),
+                             &Offset, Header.getSectionName(),
+                             Header.getListTypeString(), isDWO))
     return std::move(E);
   ListMap[StartingOffset] = List;
   return List;
diff --git a/include/llvm/DebugInfo/DWARF/DWARFUnit.h b/include/llvm/DebugInfo/DWARF/DWARFUnit.h
index c3252157b0b..eb4a198dd03 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFUnit.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFUnit.h
@@ -17,7 +17,6 @@
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h"
-#include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h"
 #include "llvm/DebugInfo/DWARF/DWARFDebugRnglists.h"
 #include "llvm/DebugInfo/DWARF/DWARFDie.h"
 #include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
@@ -312,12 +311,6 @@ public:
     return DataExtractor(StringSection, false, 0);
   }
 
-  /// Extract the range list referenced by this compile unit from the
-  /// .debug_ranges section. If the extraction is unsuccessful, an error
-  /// is returned. Successful extraction requires that the compile unit
-  /// has already been extracted.
-  Error extractRangeList(uint32_t RangeListOffset,
-                         DWARFDebugRangeList &RangeList) const;
   void clear();
 
   const Optional<StrOffsetsContributionDescriptor> &
diff --git a/lib/DebugInfo/DWARF/CMakeLists.txt b/lib/DebugInfo/DWARF/CMakeLists.txt
index b4770e561f7..437c845718d 100644
--- a/lib/DebugInfo/DWARF/CMakeLists.txt
+++ b/lib/DebugInfo/DWARF/CMakeLists.txt
@@ -15,7 +15,6 @@ add_llvm_library(LLVMDebugInfoDWARF
   DWARFDebugLoc.cpp
   DWARFDebugMacro.cpp
   DWARFDebugPubTable.cpp
-  DWARFDebugRangeList.cpp
   DWARFDebugRnglists.cpp
   DWARFDie.cpp
   DWARFExpression.cpp
diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp
index a29c9c2f160..3a0f52753b0 100644
--- a/lib/DebugInfo/DWARF/DWARFContext.cpp
+++ b/lib/DebugInfo/DWARF/DWARFContext.cpp
@@ -25,7 +25,6 @@
 #include "llvm/DebugInfo/DWARF/DWARFDebugLoc.h"
 #include "llvm/DebugInfo/DWARF/DWARFDebugMacro.h"
 #include "llvm/DebugInfo/DWARF/DWARFDebugPubTable.h"
-#include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h"
 #include "llvm/DebugInfo/DWARF/DWARFDebugRnglists.h"
 #include "llvm/DebugInfo/DWARF/DWARFDie.h"
 #include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
@@ -268,26 +267,31 @@ static void dumpAddrSection(raw_ostream &OS, DWARFDataExtractor &AddrData,
   }
 }
 
-// Dump the .debug_rnglists or .debug_rnglists.dwo section (DWARF v5).
+// Dump a section that contains a sequence of tables of lists, such as range
+// or location list tables. In DWARF v5 we expect to find properly formatted
+// tables with headers. In DWARF v4 and earlier we simply expect a sequence of
+// lists, which we treat, mutatis mutandis, like DWARF v5 tables.
+template <typename ListTable>
 static void
-dumpRnglistsSection(raw_ostream &OS, DWARFDataExtractor &rnglistData,
-                    llvm::function_ref<Optional<SectionedAddress>(uint32_t)>
-                        LookupPooledAddress,
-                    DIDumpOptions DumpOpts) {
+dumpListSection(raw_ostream &OS, DWARFContext *C, StringRef SectionName,
+                uint16_t MaxVersion, DWARFDataExtractor &ListData,
+                llvm::function_ref<Optional<SectionedAddress>(uint32_t)>
+                    LookupPooledAddress,
+                DIDumpOptions DumpOpts, bool isDWO = false) {
   uint32_t Offset = 0;
-  while (rnglistData.isValidOffset(Offset)) {
-    llvm::DWARFDebugRnglistTable Rnglists;
-    uint32_t TableOffset = Offset;
-    if (Error Err = Rnglists.extract(rnglistData, &Offset)) {
+  while (ListData.isValidOffset(Offset)) {
+    ListTable Table(C, SectionName, isDWO);
+    if (Error Err = Table.extract(ListData, MaxVersion, &Offset)) {
       WithColor::error() << toString(std::move(Err)) << '\n';
-      uint64_t Length = Rnglists.length();
-      // Keep going after an error, if we can, assuming that the length field
-      // could be read. If it couldn't, stop reading the section.
-      if (Length == 0)
+      // If table extraction set Offset to 0, it indicates that we cannot
+      // continue to read the section.
+      if (Offset == 0)
         break;
-      Offset = TableOffset + Length;
+      // In DWARF v4 and earlier, dump as much of the lists as we can.
+      if (MaxVersion < 5)
+        Table.dump(OS, LookupPooledAddress, DumpOpts);
     } else {
-      Rnglists.dump(OS, LookupPooledAddress, DumpOpts);
+      Table.dump(OS, LookupPooledAddress, DumpOpts);
     }
   }
 }
@@ -508,22 +512,6 @@ void DWARFContext::dump(
     dumpAddrSection(OS, AddrData, DumpOpts, getMaxVersion(), getCUAddrSize());
   }
 
-  if (shouldDump(Explicit, ".debug_ranges", DIDT_ID_DebugRanges,
-                 DObj->getRangeSection().Data)) {
-    uint8_t savedAddressByteSize = getCUAddrSize();
-    DWARFDataExtractor rangesData(*DObj, DObj->getRangeSection(),
-                                  isLittleEndian(), savedAddressByteSize);
-    uint32_t offset = 0;
-    DWARFDebugRangeList rangeList;
-    while (rangesData.isValidOffset(offset)) {
-      if (Error E = rangeList.extract(rangesData, &offset)) {
-        WithColor::error() << toString(std::move(E)) << '\n';
-        break;
-      }
-      rangeList.dump(OS);
-    }
-  }
-
   auto LookupPooledAddress = [&](uint32_t Index) -> Optional<SectionedAddress> {
     const auto &CUs = compile_units();
     auto I = CUs.begin();
@@ -532,18 +520,32 @@ void DWARFContext::dump(
     return (*I)->getAddrOffsetSectionItem(Index);
   };
 
+  if (shouldDump(Explicit, ".debug_ranges", DIDT_ID_DebugRanges,
+                 DObj->getRangeSection().Data)) {
+    uint8_t savedAddressByteSize = getCUAddrSize();
+    DWARFDataExtractor rangesData(*DObj, DObj->getRangeSection(),
+                                  isLittleEndian(), savedAddressByteSize);
+    dumpListSection<DWARFDebugRnglistTable>(OS, this, ".debug_ranges",
+                                            /* MaxVersion = */ 4, rangesData,
+                                            LookupPooledAddress, DumpOpts);
+  }
+
   if (shouldDump(Explicit, ".debug_rnglists", DIDT_ID_DebugRnglists,
                  DObj->getRnglistsSection().Data)) {
     DWARFDataExtractor RnglistData(*DObj, DObj->getRnglistsSection(),
-                                   isLittleEndian(), 0);
-    dumpRnglistsSection(OS, RnglistData, LookupPooledAddress, DumpOpts);
+                                   isLittleEndian(), getCUAddrSize());
+    dumpListSection<DWARFDebugRnglistTable>(OS, this, ".debug_rnglists",
+                                            getMaxVersion(5), RnglistData,
+                                            LookupPooledAddress, DumpOpts);
   }
 
   if (shouldDump(ExplicitDWO, ".debug_rnglists.dwo", DIDT_ID_DebugRnglists,
                  DObj->getRnglistsDWOSection().Data)) {
     DWARFDataExtractor RnglistData(*DObj, DObj->getRnglistsDWOSection(),
-                                   isLittleEndian(), 0);
-    dumpRnglistsSection(OS, RnglistData, LookupPooledAddress, DumpOpts);
+                                   isLittleEndian(), getCUAddrSize());
+    dumpListSection<DWARFDebugRnglistTable>(OS, this, ".debug_rnglists.dwo",
+                                            getMaxVersion(5), RnglistData,
+                                            LookupPooledAddress, DumpOpts);
   }
 
   if (shouldDump(Explicit, ".debug_pubnames", DIDT_ID_DebugPubnames,
diff --git a/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp b/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp
deleted file mode 100644
index dfb913000a4..00000000000
--- a/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp
+++ /dev/null
@@ -1,96 +0,0 @@
-//===- DWARFDebugRangesList.cpp -------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h"
-#include "llvm/DebugInfo/DWARF/DWARFContext.h"
-#include "llvm/Support/Errc.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cinttypes>
-#include <cstdint>
-
-using namespace llvm;
-
-void DWARFDebugRangeList::clear() {
-  Offset = -1U;
-  AddressSize = 0;
-  Entries.clear();
-}
-
-Error DWARFDebugRangeList::extract(const DWARFDataExtractor &data,
-                                   uint32_t *offset_ptr) {
-  clear();
-  if (!data.isValidOffset(*offset_ptr))
-    return createStringError(errc::invalid_argument,
-                       "invalid range list offset 0x%" PRIx32, *offset_ptr);
-
-  AddressSize = data.getAddressSize();
-  if (AddressSize != 4 && AddressSize != 8)
-    return createStringError(errc::invalid_argument,
-                       "invalid address size: %" PRIu8, AddressSize);
-  Offset = *offset_ptr;
-  while (true) {
-    RangeListEntry Entry;
-    Entry.SectionIndex = -1ULL;
-
-    uint32_t prev_offset = *offset_ptr;
-    Entry.StartAddress = data.getRelocatedAddress(offset_ptr);
-    Entry.EndAddress =
-        data.getRelocatedAddress(offset_ptr, &Entry.SectionIndex);
-
-    // Check that both values were extracted correctly.
-    if (*offset_ptr != prev_offset + 2 * AddressSize) {
-      clear();
-      return createStringError(errc::invalid_argument,
-                         "invalid range list entry at offset 0x%" PRIx32,
-                         prev_offset);
-    }
-    if (Entry.isEndOfListEntry())
-      break;
-    Entries.push_back(Entry);
-  }
-  return Error::success();
-}
-
-void DWARFDebugRangeList::dump(raw_ostream &OS) const {
-  for (const RangeListEntry &RLE : Entries) {
-    const char *format_str = (AddressSize == 4
-                              ? "%08x %08"  PRIx64 " %08"  PRIx64 "\n"
-                              : "%08x %016" PRIx64 " %016" PRIx64 "\n");
-    OS << format(format_str, Offset, RLE.StartAddress, RLE.EndAddress);
-  }
-  OS << format("%08x <End of list>\n", Offset);
-}
-
-DWARFAddressRangesVector DWARFDebugRangeList::getAbsoluteRanges(
-    llvm::Optional<SectionedAddress> BaseAddr) const {
-  DWARFAddressRangesVector Res;
-  for (const RangeListEntry &RLE : Entries) {
-    if (RLE.isBaseAddressSelectionEntry(AddressSize)) {
-      BaseAddr = {RLE.EndAddress, RLE.SectionIndex};
-      continue;
-    }
-
-    DWARFAddressRange E;
-    E.LowPC = RLE.StartAddress;
-    E.HighPC = RLE.EndAddress;
-    E.SectionIndex = RLE.SectionIndex;
-    // Base address of a range list entry is determined by the closest preceding
-    // base address selection entry in the same range list. It defaults to the
-    // base address of the compilation unit if there is no such entry.
-    if (BaseAddr) {
-      E.LowPC += BaseAddr->Address;
-      E.HighPC += BaseAddr->Address;
-      if (E.SectionIndex == -1ULL)
-        E.SectionIndex = BaseAddr->SectionIndex;
-    }
-    Res.push_back(E);
-  }
-  return Res;
-}
diff --git a/lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp b/lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp
index cb5fb0d49da..737603bc88c 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp
@@ -13,19 +13,30 @@
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/Format.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
 
 Error RangeListEntry::extract(DWARFDataExtractor Data, uint32_t End,
-                              uint32_t *OffsetPtr) {
+                              uint16_t Version, StringRef /* SectionName */,
+                              uint32_t *OffsetPtr, bool /* isDWO */) {
   Offset = *OffsetPtr;
   SectionIndex = -1ULL;
-  // The caller should guarantee that we have at least 1 byte available, so
-  // we just assert instead of revalidate.
-  assert(*OffsetPtr < End &&
-         "not enough space to extract a rangelist encoding");
-  uint8_t Encoding = Data.getU8(OffsetPtr);
+
+  assert((Data.getAddressSize() == 4 || Data.getAddressSize() == 8) &&
+         "Unsupported address size");
+
+  // We model a DWARF v4 range list entry like DWARF v5 DW_RLE_offset_pair,
+  // since it is subject to base adjustment.
+  uint8_t Encoding = dwarf::DW_RLE_offset_pair;
+  if (Version > 4) {
+    // The caller should guarantee that we have at least 1 byte available, so
+    // we just assert instead of revalidate.
+    assert(*OffsetPtr < End &&
+           "not enough space to extract a rangelist encoding");
+    Encoding = Data.getU8(OffsetPtr);
+  }
 
   switch (Encoding) {
   case dwarf::DW_RLE_end_of_list:
@@ -61,6 +72,23 @@ Error RangeListEntry::extract(DWARFDataExtractor Data, uint32_t End,
     break;
   }
   case dwarf::DW_RLE_offset_pair: {
+    if (Version < 5) {
+      if ((End - *OffsetPtr) < unsigned(Data.getAddressSize() * 2))
+        return createStringError(
+            errc::illegal_byte_sequence,
+            "invalid range list entry at offset 0x%" PRIx32, *OffsetPtr);
+      Value0 = Data.getRelocatedAddress(OffsetPtr);
+      Value1 = Data.getRelocatedAddress(OffsetPtr, &SectionIndex);
+      // Adjust the EntryKind for end-of-list and base_address based on the
+      // contents.
+      if (Value0 == maxUIntN(Data.getAddressSize() * 8)) {
+        Encoding = dwarf::DW_RLE_base_address;
+        Value0 = Value1;
+        Value1 = 0;
+      } else if (Value0 == 0 && Value1 == 0)
+        Encoding = dwarf::DW_RLE_end_of_list;
+      break;
+    }
     uint32_t PreviousOffset = *OffsetPtr - 1;
     Value0 = Data.getULEB128(OffsetPtr);
     Value1 = Data.getULEB128(OffsetPtr);
@@ -71,7 +99,7 @@ Error RangeListEntry::extract(DWARFDataExtractor Data, uint32_t End,
                          PreviousOffset);
     break;
   }
-  case dwarf::DW_RLE_base_address: {
+  case dwarf::DW_RLE_base_address:
     if ((End - *OffsetPtr) < Data.getAddressSize())
       return createStringError(errc::invalid_argument,
                          "insufficient space remaining in table for "
@@ -79,18 +107,16 @@ Error RangeListEntry::extract(DWARFDataExtractor Data, uint32_t End,
                          *OffsetPtr - 1);
     Value0 = Data.getRelocatedAddress(OffsetPtr, &SectionIndex);
     break;
-  }
-  case dwarf::DW_RLE_start_end: {
+  case dwarf::DW_RLE_start_end:
     if ((End - *OffsetPtr) < unsigned(Data.getAddressSize() * 2))
       return createStringError(errc::invalid_argument,
                          "insufficient space remaining in table for "
                          "DW_RLE_start_end encoding "
                          "at offset 0x%" PRIx32,
                          *OffsetPtr - 1);
-    Value0 = Data.getRelocatedAddress(OffsetPtr, &SectionIndex);
-    Value1 = Data.getRelocatedAddress(OffsetPtr);
+    Value0 = Data.getRelocatedAddress(OffsetPtr);
+    Value1 = Data.getRelocatedAddress(OffsetPtr, &SectionIndex);
     break;
-  }
   case dwarf::DW_RLE_start_length: {
     uint32_t PreviousOffset = *OffsetPtr - 1;
     Value0 = Data.getRelocatedAddress(OffsetPtr, &SectionIndex);
@@ -173,8 +199,9 @@ DWARFDebugRnglist::getAbsoluteRanges(llvm::Optional<SectionedAddress> BaseAddr,
 }
 
 void RangeListEntry::dump(
-    raw_ostream &OS, uint8_t AddrSize, uint8_t MaxEncodingStringLength,
-    uint64_t &CurrentBase, DIDumpOptions DumpOpts,
+    raw_ostream &OS, DWARFContext *, uint8_t AddrSize, uint64_t &CurrentBase,
+    unsigned Indent, uint16_t Version, uint8_t MaxEncodingStringLength,
+    DIDumpOptions DumpOpts,
     llvm::function_ref<Optional<SectionedAddress>(uint32_t)>
         LookupPooledAddress) const {
   auto PrintRawEntry = [](raw_ostream &OS, const RangeListEntry &Entry,
@@ -187,21 +214,34 @@ void RangeListEntry::dump(
     }
   };
 
+  // Output indentations before we print the actual entry. We only print
+  // anything for DW_RLE_base_address when we are in verbose mode.
+  if (Version < 5 || DumpOpts.Verbose || !isBaseAddressSelectionEntry())
+    OS.indent(Indent);
+
+  // Always print the section offset in DWARF v4 and earlier.
+  if (Version < 5) {
+    OS << format("%08x", Offset);
+    DumpOpts.Verbose = false;
+  }
+
   if (DumpOpts.Verbose) {
     // Print the section offset in verbose mode.
     OS << format("0x%8.8" PRIx32 ":", Offset);
-    auto EncodingString = dwarf::RangeListEncodingString(EntryKind);
-    // Unsupported encodings should have been reported during parsing.
-    assert(!EncodingString.empty() && "Unknown range entry encoding");
-    OS << format(" [%s%*c", EncodingString.data(),
-                 MaxEncodingStringLength - EncodingString.size() + 1, ']');
-    if (EntryKind != dwarf::DW_RLE_end_of_list)
-      OS << ": ";
+    if (Version > 4) {
+      auto EncodingString = dwarf::RangeListEncodingString(EntryKind);
+      // Unsupported encodings should have been reported during parsing.
+      assert(!EncodingString.empty() && "Unknown range entry encoding");
+      OS << format(" [%s%*c", EncodingString.data(),
+                   MaxEncodingStringLength - EncodingString.size() + 1, ']');
+      if (!isEndOfList())
+        OS << ": ";
+    }
   }
 
   switch (EntryKind) {
   case dwarf::DW_RLE_end_of_list:
-    OS << (DumpOpts.Verbose ? "" : "<End of list>");
+    OS << (DumpOpts.Verbose ? "" : " <End of list>");
     break;
     //  case dwarf::DW_RLE_base_addressx:
   case dwarf::DW_RLE_base_addressx: {
@@ -217,6 +257,13 @@ void RangeListEntry::dump(
   case dwarf::DW_RLE_base_address:
     // In non-verbose mode we do not print anything for this entry.
     CurrentBase = Value0;
+    if (Version < 5) {
+      // Dump the entry in pre-DWARF v5 format, i.e. with a -1 as Value0.
+      uint64_t allOnes = maxUIntN(AddrSize * 8);
+      OS << format(" %*.*" PRIx64, AddrSize * 2, AddrSize * 2, allOnes);
+      OS << format(" %*.*" PRIx64, AddrSize * 2, AddrSize * 2, Value0);
+      break;
+    }
     if (!DumpOpts.Verbose)
       return;
     OS << format(" 0x%*.*" PRIx64, AddrSize * 2, AddrSize * 2, Value0);
@@ -226,6 +273,11 @@ void RangeListEntry::dump(
     DWARFAddressRange(Value0, Value0 + Value1).dump(OS, AddrSize, DumpOpts);
     break;
   case dwarf::DW_RLE_offset_pair:
+    if (Version < 5) {
+      OS << format(" %*.*" PRIx64, AddrSize * 2, AddrSize * 2, Value0);
+      OS << format(" %*.*" PRIx64, AddrSize * 2, AddrSize * 2, Value1);
+      break;
+    }
     PrintRawEntry(OS, *this, AddrSize, DumpOpts);
     DWARFAddressRange(Value0 + CurrentBase, Value1 + CurrentBase)
         .dump(OS, AddrSize, DumpOpts);
diff --git a/lib/DebugInfo/DWARF/DWARFDie.cpp b/lib/DebugInfo/DWARF/DWARFDie.cpp
index 31c4cd5e472..874a2ba07fa 100644
--- a/lib/DebugInfo/DWARF/DWARFDie.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDie.cpp
@@ -15,7 +15,6 @@
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h"
 #include "llvm/DebugInfo/DWARF/DWARFContext.h"
-#include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h"
 #include "llvm/DebugInfo/DWARF/DWARFExpression.h"
 #include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
 #include "llvm/DebugInfo/DWARF/DWARFUnit.h"
diff --git a/lib/DebugInfo/DWARF/DWARFListTable.cpp b/lib/DebugInfo/DWARF/DWARFListTable.cpp
index 462c036d73a..69a9231f785 100644
--- a/lib/DebugInfo/DWARF/DWARFListTable.cpp
+++ b/lib/DebugInfo/DWARF/DWARFListTable.cpp
@@ -20,30 +20,43 @@ Error DWARFListTableHeader::extract(DWARFDataExtractor Data,
                                     uint32_t *OffsetPtr) {
   HeaderOffset = *OffsetPtr;
   // Read and verify the length field.
-  if (!Data.isValidOffsetForDataOfSize(*OffsetPtr, sizeof(uint32_t)))
+  if (!Data.isValidOffsetForDataOfSize(*OffsetPtr, sizeof(uint32_t))) {
+    // By setting *OffsetPtr to 0, we indicate to the caller that
+    // we could not detemine the length of the table.
+    *OffsetPtr = 0;
     return createStringError(errc::invalid_argument,
-                       "section is not large enough to contain a "
-                       "%s table length at offset 0x%" PRIx32,
-                       SectionName.data(), *OffsetPtr);
+                             "section is not large enough to contain a "
+                             "%s table length at offset 0x%" PRIx32,
+                             SectionName.data(), HeaderOffset);
+  }
   // TODO: Add support for DWARF64.
   HeaderData.Length = Data.getU32(OffsetPtr);
-  if (HeaderData.Length == 0xffffffffu)
+  if (HeaderData.Length == 0xffffffffu) {
+    *OffsetPtr = 0;
     return createStringError(errc::not_supported,
                        "DWARF64 is not supported in %s at offset 0x%" PRIx32,
                        SectionName.data(), HeaderOffset);
+  }
+
+  uint32_t TableLength = HeaderData.Length + sizeof(uint32_t);
+  uint32_t End = HeaderOffset + TableLength;
   Format = dwarf::DwarfFormat::DWARF32;
-  if (HeaderData.Length + sizeof(uint32_t) < sizeof(Header))
+  if (TableLength < sizeof(Header)) {
+    *OffsetPtr = End;
     return createStringError(errc::invalid_argument,
-                       "%s table at offset 0x%" PRIx32
-                       " has too small length (0x%" PRIx32
-                       ") to contain a complete header",
-                       SectionName.data(), HeaderOffset, length());
-  uint32_t End = HeaderOffset + length();
-  if (!Data.isValidOffsetForDataOfSize(HeaderOffset, End - HeaderOffset))
-    return createStringError(errc::invalid_argument,
-                       "section is not large enough to contain a %s table "
-                       "of length 0x%" PRIx32 " at offset 0x%" PRIx32,
-                       SectionName.data(), length(), HeaderOffset);
+                             "%s table at offset 0x%" PRIx32
+                             " has too small length (0x%" PRIx32
+                             ") to contain a complete header",
+                             SectionName.data(), HeaderOffset, TableLength);
+  }
+  if (!Data.isValidOffsetForDataOfSize(HeaderOffset, TableLength)) {
+    *OffsetPtr = 0; // No recovery if the length exceeds the section size.
+    return createStringError(
+        errc::invalid_argument,
+        "section is not large enough to contain a %s table "
+        "of length 0x%" PRIx32 " at offset 0x%" PRIx32,
+        SectionName.data(), TableLength, HeaderOffset);
+  }
 
   HeaderData.Version = Data.getU16(OffsetPtr);
   HeaderData.AddrSize = Data.getU8(OffsetPtr);
@@ -51,27 +64,36 @@ Error DWARFListTableHeader::extract(DWARFDataExtractor Data,
   HeaderData.OffsetEntryCount = Data.getU32(OffsetPtr);
 
   // Perform basic validation of the remaining header fields.
-  if (HeaderData.Version != 5)
+  if (HeaderData.Version != 5) {
+    *OffsetPtr = End;
     return createStringError(errc::invalid_argument,
-                       "unrecognised %s table version %" PRIu16
-                       " in table at offset 0x%" PRIx32,
-                       SectionName.data(), HeaderData.Version, HeaderOffset);
-  if (HeaderData.AddrSize != 4 && HeaderData.AddrSize != 8)
+                             "unrecognised %s table version %" PRIu16
+                             " in table at offset 0x%" PRIx32,
+                             SectionName.data(), HeaderData.Version,
+                             HeaderOffset);
+  }
+  if (HeaderData.AddrSize != 4 && HeaderData.AddrSize != 8) {
+    *OffsetPtr = End;
     return createStringError(errc::not_supported,
                        "%s table at offset 0x%" PRIx32
                        " has unsupported address size %" PRIu8,
                        SectionName.data(), HeaderOffset, HeaderData.AddrSize);
-  if (HeaderData.SegSize != 0)
+  }
+  if (HeaderData.SegSize != 0) {
+    *OffsetPtr = End;
     return createStringError(errc::not_supported,
                        "%s table at offset 0x%" PRIx32
                        " has unsupported segment selector size %" PRIu8,
                        SectionName.data(), HeaderOffset, HeaderData.SegSize);
+  }
   if (End < HeaderOffset + sizeof(HeaderData) +
-                HeaderData.OffsetEntryCount * sizeof(uint32_t))
+                HeaderData.OffsetEntryCount * sizeof(uint32_t)) {
+    *OffsetPtr = End;
     return createStringError(errc::invalid_argument,
         "%s table at offset 0x%" PRIx32 " has more offset entries (%" PRIu32
         ") than there is space for",
         SectionName.data(), HeaderOffset, HeaderData.OffsetEntryCount);
+  }
   Data.setAddressSize(HeaderData.AddrSize);
   for (uint32_t I = 0; I < HeaderData.OffsetEntryCount; ++I)
     Offsets.push_back(Data.getU32(OffsetPtr));
@@ -101,9 +123,11 @@ void DWARFListTableHeader::dump(raw_ostream &OS, DIDumpOptions DumpOpts) const {
   }
 }
 
-uint32_t DWARFListTableHeader::length() const {
+uint32_t DWARFListTableHeader::getTableLength() const {
   if (HeaderData.Length == 0)
     return 0;
+  assert(HeaderData.Version > 0 &&
+         "No DWARF version in header when using getTableLength()");
   // TODO: DWARF64 support.
-  return HeaderData.Length + sizeof(uint32_t);
+  return HeaderData.Length + (HeaderData.Version > 4) * sizeof(uint32_t);
 }
diff --git a/lib/DebugInfo/DWARF/DWARFUnit.cpp b/lib/DebugInfo/DWARF/DWARFUnit.cpp
index d475c44c393..88565af1ec0 100644
--- a/lib/DebugInfo/DWARF/DWARFUnit.cpp
+++ b/lib/DebugInfo/DWARF/DWARFUnit.cpp
@@ -296,13 +296,16 @@ bool DWARFUnitHeader::extract(DWARFContext &Context,
   return true;
 }
 
-// Parse the rangelist table header, including the optional array of offsets
+// Parse a list table header, including the optional array of offsets
 // following it (DWARF v5 and later).
-static Expected<DWARFDebugRnglistTable>
-parseRngListTableHeader(DWARFDataExtractor &DA, uint32_t Offset) {
+template <typename DWARFListTable>
+static Expected<DWARFListTable>
+parseListTableHeader(DWARFDataExtractor DA, DWARFContext *C,
+                     StringRef SectionName, uint32_t Offset, bool isDWO) {
   // TODO: Support DWARF64
   // We are expected to be called with Offset 0 or pointing just past the table
   // header, which is 12 bytes long for DWARF32.
+  DWARFListTable Table(C, SectionName, isDWO);
   if (Offset > 0) {
     if (Offset < 12U)
       return createStringError(errc::invalid_argument, "Did not detect a valid"
@@ -310,20 +313,46 @@ parseRngListTableHeader(DWARFDataExtractor &DA, uint32_t Offset) {
                                Offset);
     Offset -= 12U;
   }
-  llvm::DWARFDebugRnglistTable Table;
   if (Error E = Table.extractHeaderAndOffsets(DA, &Offset))
     return std::move(E);
   return Table;
 }
 
-Error DWARFUnit::extractRangeList(uint32_t RangeListOffset,
-                                  DWARFDebugRangeList &RangeList) const {
-  // Require that compile unit is extracted.
-  assert(!DieArray.empty());
-  DWARFDataExtractor RangesData(Context.getDWARFObj(), *RangeSection,
-                                isLittleEndian, getAddressByteSize());
-  uint32_t ActualRangeListOffset = RangeSectionBase + RangeListOffset;
-  return RangeList.extract(RangesData, &ActualRangeListOffset);
+// Parse a DWARF v5 list table (e.g. either a rangelist table or a location
+// list table). For DWARF units with version 4 or earlier, we instead create
+// the table artifically by giving it a size that equals the section size.
+template <typename DWARFListTable>
+static Optional<DWARFListTable>
+setupListTable(DWARFUnit *U, const DWARFSection *Section, StringRef SectionName,
+               uint32_t &Base, bool isDWO, bool isLittleEndian) {
+  if (!Section->Data.size())
+    return None;
+  DWARFContext &Ctx = U->getContext();
+  DWARFListTable Table(&Ctx, SectionName, isDWO);
+  // Parse the list table header. Individual lists are extracted lazily.
+  DWARFDataExtractor DA(Ctx.getDWARFObj(), *Section, isLittleEndian,
+                        U->getAddressByteSize());
+  if (U->getVersion() < 5) {
+    Base = 0;
+    Table.setHeaderData(Section->Data.size(), U->getVersion(),
+                        DA.getAddressSize());
+    return Table;
+  }
+  if (auto TableOrError = parseListTableHeader<DWARFListTable>(
+          DA, &Ctx, SectionName, Base, isDWO))
+    Table = TableOrError.get();
+  else {
+    WithColor::error() << "parsing a " << Table.getListTypeString().data()
+                       << " list table: " << toString(TableOrError.takeError())
+                       << '\n';
+    return None;
+  }
+  // In a split dwarf unit, there are no attributes like DW_AT_rnglists_base or
+  // DW_AT_loclists_base that describe the table base. Adjust Base to point past
+  // the table header which is expected to start at offset 0.
+  if (isDWO)
+    Base = Table.getHeaderSize();
+  return Table;
 }
 
 void DWARFUnit::clear() {
@@ -437,35 +466,24 @@ size_t DWARFUnit::extractDIEsIfNeeded(bool CUDieOnly) {
 
     // DWARF v5 uses the .debug_rnglists and .debug_rnglists.dwo sections to
     // describe address ranges.
+    StringRef RangeSectionName = ".debug_ranges";
     if (getVersion() >= 5) {
-      if (IsDWO)
+      if (IsDWO) {
+        RangeSectionName = ".debug_rnglists.dwo";
         setRangesSection(&Context.getDWARFObj().getRnglistsDWOSection(), 0);
-      else
+      } else {
+        RangeSectionName = ".debug_rnglists";
         setRangesSection(&Context.getDWARFObj().getRnglistsSection(),
                          toSectionOffset(UnitDie.find(DW_AT_rnglists_base), 0));
-      if (RangeSection->Data.size()) {
-        // Parse the range list table header. Individual range lists are
-        // extracted lazily.
-        DWARFDataExtractor RangesDA(Context.getDWARFObj(), *RangeSection,
-                                    isLittleEndian, 0);
-        if (auto TableOrError =
-                parseRngListTableHeader(RangesDA, RangeSectionBase))
-          RngListTable = TableOrError.get();
-        else
-          WithColor::error() << "parsing a range list table: "
-                             << toString(TableOrError.takeError())
-                             << '\n';
-
-        // In a split dwarf unit, there is no DW_AT_rnglists_base attribute.
-        // Adjust RangeSectionBase to point past the table header.
-        if (IsDWO && RngListTable)
-          RangeSectionBase = RngListTable->getHeaderSize();
       }
     }
+    RngListTable = setupListTable<DWARFDebugRnglistTable>(
+        this, RangeSection, RangeSectionName, RangeSectionBase, IsDWO,
+        isLittleEndian);
 
     // Don't fall back to DW_AT_GNU_ranges_base: it should be ignored for
     // skeleton CU DIE, so that DWARF users not aware of it are not broken.
-    }
+  }
 
   return DieArray.size();
 }
@@ -503,16 +521,9 @@ bool DWARFUnit::parseDWO() {
   DWO->setAddrOffsetSection(AddrOffsetSection, AddrOffsetSectionBase);
   if (getVersion() >= 5) {
     DWO->setRangesSection(&Context.getDWARFObj().getRnglistsDWOSection(), 0);
-    DWARFDataExtractor RangesDA(Context.getDWARFObj(), *RangeSection,
-                                isLittleEndian, 0);
-    if (auto TableOrError = parseRngListTableHeader(RangesDA, RangeSectionBase))
-      DWO->RngListTable = TableOrError.get();
-    else
-      WithColor::error() << "parsing a range list table: "
-                         << toString(TableOrError.takeError())
-                         << '\n';
-    if (DWO->RngListTable)
-      DWO->RangeSectionBase = DWO->RngListTable->getHeaderSize();
+    DWO->RngListTable = setupListTable<DWARFDebugRnglistTable>(
+        DWOCU, DWO->RangeSection, ".debug_rnglists.dwo", DWO->RangeSectionBase,
+        /* isDWO =*/true, isLittleEndian);
   } else {
     auto DWORangesBase = UnitDie.getRangesBaseAttribute();
     DWO->setRangesSection(RangeSection, DWORangesBase ? *DWORangesBase : 0);
@@ -530,12 +541,6 @@ void DWARFUnit::clearDIEs(bool KeepCUDie) {
 
 Expected<DWARFAddressRangesVector>
 DWARFUnit::findRnglistFromOffset(uint32_t Offset) {
-  if (getVersion() <= 4) {
-    DWARFDebugRangeList RangeList;
-    if (Error E = extractRangeList(Offset, RangeList))
-      return std::move(E);
-    return RangeList.getAbsoluteRanges(getBaseAddress());
-  }
   if (RngListTable) {
     DWARFDataExtractor RangesData(Context.getDWARFObj(), *RangeSection,
                                   isLittleEndian, RngListTable->getAddrSize());
diff --git a/tools/dsymutil/DwarfLinker.cpp b/tools/dsymutil/DwarfLinker.cpp
index 2c5cce50132..02b0d59d59e 100644
--- a/tools/dsymutil/DwarfLinker.cpp
+++ b/tools/dsymutil/DwarfLinker.cpp
@@ -43,7 +43,6 @@
 #include "llvm/DebugInfo/DWARF/DWARFContext.h"
 #include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h"
 #include "llvm/DebugInfo/DWARF/DWARFDebugLine.h"
-#include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h"
 #include "llvm/DebugInfo/DWARF/DWARFDie.h"
 #include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
 #include "llvm/DebugInfo/DWARF/DWARFSection.h"
@@ -1576,7 +1575,7 @@ DIE *DwarfLinker::DIECloner::cloneDIE(const DWARFDie &InputDIE,
 void DwarfLinker::patchRangesForUnit(const CompileUnit &Unit,
                                      DWARFContext &OrigDwarf,
                                      const DebugMapObject &DMO) const {
-  DWARFDebugRangeList RangeList;
+  DWARFDebugRnglist RangeList;
   const auto &FunctionRanges = Unit.getFunctionRanges();
   unsigned AddressSize = Unit.getOrigUnit().getAddressByteSize();
   DWARFDataExtractor RangeExtractor(OrigDwarf.getDWARFObj(),
@@ -1596,28 +1595,30 @@ void DwarfLinker::patchRangesForUnit(const CompileUnit &Unit,
   for (const auto &RangeAttribute : Unit.getRangesAttributes()) {
     uint32_t Offset = RangeAttribute.get();
     RangeAttribute.set(Streamer->getRangesSectionSize());
-    if (Error E = RangeList.extract(RangeExtractor, &Offset)) {
+    if (Error E = RangeList.extract(RangeExtractor, /* HeaderOffset = */0,
+                                    RangeExtractor.size(),
+                                    Unit.getOrigUnit().getVersion(), &Offset,
+                                    ".debug_ranges", "range")) {
       llvm::consumeError(std::move(E));
       reportWarning("invalid range list ignored.", DMO);
       RangeList.clear();
     }
     const auto &Entries = RangeList.getEntries();
-    if (!Entries.empty()) {
-      const DWARFDebugRangeList::RangeListEntry &First = Entries.front();
-
+    if (!RangeList.empty()) {
+      const auto &First = Entries.front();
       if (CurrRange == InvalidRange ||
-          First.StartAddress + OrigLowPc < CurrRange.start() ||
-          First.StartAddress + OrigLowPc >= CurrRange.stop()) {
-        CurrRange = FunctionRanges.find(First.StartAddress + OrigLowPc);
+          First.getStartAddress() + OrigLowPc < CurrRange.start() ||
+          First.getStartAddress() + OrigLowPc >= CurrRange.stop()) {
+        CurrRange = FunctionRanges.find(First.getStartAddress() + OrigLowPc);
         if (CurrRange == InvalidRange ||
-            CurrRange.start() > First.StartAddress + OrigLowPc) {
+            CurrRange.start() > First.getStartAddress() + OrigLowPc) {
           reportWarning("no mapping for range.", DMO);
           continue;
         }
       }
     }
 
-    Streamer->emitRangesEntries(UnitPcOffset, OrigLowPc, CurrRange, Entries,
+    Streamer->emitRangesEntries(UnitPcOffset, OrigLowPc, CurrRange, RangeList,
                                 AddressSize);
   }
 }
diff --git a/tools/dsymutil/DwarfStreamer.cpp b/tools/dsymutil/DwarfStreamer.cpp
index ef798be7bdf..835a27aefef 100644
--- a/tools/dsymutil/DwarfStreamer.cpp
+++ b/tools/dsymutil/DwarfStreamer.cpp
@@ -269,28 +269,27 @@ void DwarfStreamer::emitSwiftAST(StringRef Buffer) {
 void DwarfStreamer::emitRangesEntries(
     int64_t UnitPcOffset, uint64_t OrigLowPc,
     const FunctionIntervals::const_iterator &FuncRange,
-    const std::vector<DWARFDebugRangeList::RangeListEntry> &Entries,
-    unsigned AddressSize) {
+    const DWARFDebugRnglist &RangeList, unsigned AddressSize) {
   MS->SwitchSection(MC->getObjectFileInfo()->getDwarfRangesSection());
 
   // Offset each range by the right amount.
-  int64_t PcOffset = Entries.empty() ? 0 : FuncRange.value() + UnitPcOffset;
-  for (const auto &Range : Entries) {
-    if (Range.isBaseAddressSelectionEntry(AddressSize)) {
+  int64_t PcOffset = RangeList.empty() ? 0 : FuncRange.value() + UnitPcOffset;
+  for (const auto &Range : RangeList.getEntries()) {
+    if (Range.isBaseAddressSelectionEntry()) {
       warn("unsupported base address selection operation",
            "emitting debug_ranges");
       break;
     }
     // Do not emit empty ranges.
-    if (Range.StartAddress == Range.EndAddress)
+    if (Range.isEndOfList() || Range.getStartAddress() == Range.getEndAddress())
       continue;
 
     // All range entries should lie in the function range.
-    if (!(Range.StartAddress + OrigLowPc >= FuncRange.start() &&
-          Range.EndAddress + OrigLowPc <= FuncRange.stop()))
+    if (!(Range.getStartAddress() + OrigLowPc >= FuncRange.start() &&
+          Range.getEndAddress() + OrigLowPc <= FuncRange.stop()))
       warn("inconsistent range data.", "emitting debug_ranges");
-    MS->EmitIntValue(Range.StartAddress + PcOffset, AddressSize);
-    MS->EmitIntValue(Range.EndAddress + PcOffset, AddressSize);
+    MS->EmitIntValue(Range.getStartAddress() + PcOffset, AddressSize);
+    MS->EmitIntValue(Range.getEndAddress() + PcOffset, AddressSize);
     RangesSectionSize += 2 * AddressSize;
   }
 
diff --git a/tools/dsymutil/DwarfStreamer.h b/tools/dsymutil/DwarfStreamer.h
index 679d124f4cb..2ab880d17dd 100644
--- a/tools/dsymutil/DwarfStreamer.h
+++ b/tools/dsymutil/DwarfStreamer.h
@@ -17,7 +17,7 @@
 #include "llvm/CodeGen/AccelTable.h"
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/DebugInfo/DWARF/DWARFDebugLine.h"
-#include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugRnglists.h"
 #include "llvm/MC/MCAsmBackend.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCCodeEmitter.h"
@@ -83,7 +83,7 @@ public:
   void emitRangesEntries(
       int64_t UnitPcOffset, uint64_t OrigLowPc,
       const FunctionIntervals::const_iterator &FuncRange,
-      const std::vector<DWARFDebugRangeList::RangeListEntry> &Entries,
+      const DWARFDebugRnglist &RangeList,
       unsigned AddressSize);
 
   /// Emit debug_aranges entries for \p Unit and if \p DoRangesSection is true,
-- 
GitLab


From 6be73e7eade32430d605bdfc7e4fe8546060599f Mon Sep 17 00:00:00 2001
From: Alina Sbirlea <asbirlea@google.com>
Date: Mon, 29 Oct 2018 22:25:59 +0000
Subject: [PATCH 0721/1116] [AliasSetTracker] Cleanup addPointer interface.
 [NFCI]

Summary:
Attempting to simplify the addPointer interface.
Currently there's code decomposing a MemoryLocation into (Ptr, Size, AAMDNodes) only to recreate the MemoryLocation inside the call.

Reviewers: reames, mkazantsev

Subscribers: sanjoy, jlebar, llvm-commits

Differential Revision: https://reviews.llvm.org/D53836

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345548 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/AliasSetTracker.h |  7 +------
 lib/Analysis/AliasSetTracker.cpp        | 12 ++++++------
 2 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/include/llvm/Analysis/AliasSetTracker.h b/include/llvm/Analysis/AliasSetTracker.h
index cf4981d1eb2..d24453749fe 100644
--- a/include/llvm/Analysis/AliasSetTracker.h
+++ b/include/llvm/Analysis/AliasSetTracker.h
@@ -441,12 +441,7 @@ private:
     return *Entry;
   }
 
-  AliasSet &addPointer(Value *P, LocationSize Size, const AAMDNodes &AAInfo,
-                       AliasSet::AccessLattice E);
-  AliasSet &addPointer(MemoryLocation Loc,
-                       AliasSet::AccessLattice E) {
-    return addPointer(const_cast<Value*>(Loc.Ptr), Loc.Size, Loc.AATags, E);
-  }
+  AliasSet &addPointer(MemoryLocation Loc, AliasSet::AccessLattice E);
   AliasSet *mergeAliasSetsForPointer(const Value *Ptr, LocationSize Size,
                                      const AAMDNodes &AAInfo);
 
diff --git a/lib/Analysis/AliasSetTracker.cpp b/lib/Analysis/AliasSetTracker.cpp
index 66544c51446..22c8ae20113 100644
--- a/lib/Analysis/AliasSetTracker.cpp
+++ b/lib/Analysis/AliasSetTracker.cpp
@@ -383,7 +383,7 @@ AliasSet &AliasSetTracker::getAliasSetFor(const MemoryLocation &MemLoc) {
 
 void AliasSetTracker::add(Value *Ptr, LocationSize Size,
                           const AAMDNodes &AAInfo) {
-  addPointer(Ptr, Size, AAInfo, AliasSet::NoAccess);
+  addPointer(MemoryLocation(Ptr, Size, AAInfo), AliasSet::NoAccess);
 }
 
 void AliasSetTracker::add(LoadInst *LI) {
@@ -518,8 +518,9 @@ void AliasSetTracker::add(const AliasSetTracker &AST) {
 
     // Loop over all of the pointers in this alias set.
     for (AliasSet::iterator ASI = AS.begin(), E = AS.end(); ASI != E; ++ASI)
-      addPointer(ASI.getPointer(), ASI.getSize(), ASI.getAAInfo(),
-                 (AliasSet::AccessLattice)AS.Access);
+      addPointer(
+          MemoryLocation(ASI.getPointer(), ASI.getSize(), ASI.getAAInfo()),
+          (AliasSet::AccessLattice)AS.Access);
   }
 }
 
@@ -612,10 +613,9 @@ AliasSet &AliasSetTracker::mergeAllAliasSets() {
   return *AliasAnyAS;
 }
 
-AliasSet &AliasSetTracker::addPointer(Value *P, LocationSize Size,
-                                      const AAMDNodes &AAInfo,
+AliasSet &AliasSetTracker::addPointer(MemoryLocation Loc,
                                       AliasSet::AccessLattice E) {
-  AliasSet &AS = getAliasSetFor(MemoryLocation(P, Size, AAInfo));
+  AliasSet &AS = getAliasSetFor(Loc);
   AS.Access |= E;
 
   if (!AliasAnyAS && (TotalMayAliasSetSize > SaturationThreshold)) {
-- 
GitLab


From ea8bef6b82646422409adde7444104fd90108c29 Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Mon, 29 Oct 2018 22:38:13 +0000
Subject: [PATCH 0722/1116] Remove unneeded friend declarations that clang-cl
 warns on

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345549 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp     | 2 --
 lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp | 2 --
 2 files changed, 4 deletions(-)

diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index 7ca191c86ad..9a7e34b0aeb 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -75,8 +75,6 @@ void AArch64TargetAsmStreamer::emitInst(uint32_t Inst) {
 /// by MachO. Beware!
 class AArch64ELFStreamer : public MCELFStreamer {
 public:
-  friend class AArch64TargetELFStreamer;
-
   AArch64ELFStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> TAB,
                      std::unique_ptr<MCObjectWriter> OW,
                      std::unique_ptr<MCCodeEmitter> Emitter)
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
index 7a65c7a63f1..b828ab832e9 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
@@ -22,8 +22,6 @@ class AArch64WinCOFFStreamer : public MCWinCOFFStreamer {
   Win64EH::ARM64UnwindEmitter EHStreamer;
 
 public:
-  friend class AArch64TargetWinCOFFStreamer;
-
   AArch64WinCOFFStreamer(MCContext &C, std::unique_ptr<MCAsmBackend> AB,
                          std::unique_ptr<MCCodeEmitter> CE,
                          std::unique_ptr<MCObjectWriter> OW)
-- 
GitLab


From 54d12f35afdda272641ddf0a894a6de6541270cd Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 30 Oct 2018 01:11:31 +0000
Subject: [PATCH 0723/1116] Pass TRI to printReg

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345553 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/LiveRangeCalc.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/CodeGen/LiveRangeCalc.cpp b/lib/CodeGen/LiveRangeCalc.cpp
index 04324943dfa..70e135ab1af 100644
--- a/lib/CodeGen/LiveRangeCalc.cpp
+++ b/lib/CodeGen/LiveRangeCalc.cpp
@@ -364,7 +364,7 @@ bool LiveRangeCalc::findReachingDefs(LiveRange &LR, MachineBasicBlock &UseMBB,
 #ifndef NDEBUG
     if (MBB->pred_empty()) {
       MBB->getParent()->verify();
-      errs() << "Use of " << printReg(PhysReg)
+      errs() << "Use of " << printReg(PhysReg, MRI->getTargetRegisterInfo())
              << " does not have a corresponding definition on every path:\n";
       const MachineInstr *MI = Indexes->getInstructionFromIndex(Use);
       if (MI != nullptr)
-- 
GitLab


From 03eb1d66574b3c3471d12bc2804f7147181a3759 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 30 Oct 2018 01:11:52 +0000
Subject: [PATCH 0724/1116] Fix typos in comment

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345554 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/LiveIntervals.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/llvm/CodeGen/LiveIntervals.h b/include/llvm/CodeGen/LiveIntervals.h
index 291a07a712c..16ab1dc475c 100644
--- a/include/llvm/CodeGen/LiveIntervals.h
+++ b/include/llvm/CodeGen/LiveIntervals.h
@@ -198,10 +198,10 @@ class VirtRegMap;
     void pruneValue(LiveRange &LR, SlotIndex Kill,
                     SmallVectorImpl<SlotIndex> *EndPoints);
 
-    /// This function should not be used. Its intend is to tell you that
-    /// you are doing something wrong if you call pruveValue directly on a
+    /// This function should not be used. Its intent is to tell you that you are
+    /// doing something wrong if you call pruneValue directly on a
     /// LiveInterval. Indeed, you are supposed to call pruneValue on the main
-    /// LiveRange and all the LiveRange of the subranges if any.
+    /// LiveRange and all the LiveRanges of the subranges if any.
     LLVM_ATTRIBUTE_UNUSED void pruneValue(LiveInterval &, SlotIndex,
                                           SmallVectorImpl<SlotIndex> *) {
       llvm_unreachable(
-- 
GitLab


From 3311128c5875a644deddc21750845d237eed0a25 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 30 Oct 2018 01:12:12 +0000
Subject: [PATCH 0725/1116] Remove dead declaration

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345555 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/LiveDebugVariables.h | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/lib/CodeGen/LiveDebugVariables.h b/lib/CodeGen/LiveDebugVariables.h
index aa35880b063..0060399c2b0 100644
--- a/lib/CodeGen/LiveDebugVariables.h
+++ b/lib/CodeGen/LiveDebugVariables.h
@@ -39,13 +39,6 @@ public:
   LiveDebugVariables();
   ~LiveDebugVariables() override;
 
-  /// renameRegister - Move any user variables in OldReg to NewReg:SubIdx.
-  /// @param OldReg Old virtual register that is going away.
-  /// @param NewReg New register holding the user variables.
-  /// @param SubIdx If NewReg is a virtual register, SubIdx may indicate a sub-
-  ///               register.
-  void renameRegister(unsigned OldReg, unsigned NewReg, unsigned SubIdx);
-
   /// splitRegister - Move any user variables in OldReg to the live ranges in
   /// NewRegs where they are live. Mark the values as unavailable where no new
   /// register is live.
-- 
GitLab


From 4ae1eac0ecba49e858cda21dda282d0c8993fb97 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 30 Oct 2018 01:33:14 +0000
Subject: [PATCH 0726/1116] AMDGPU: Use scavengeRegisterBackwards

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345559 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/SIInstrInfo.cpp         |  5 +-
 test/CodeGen/AMDGPU/branch-relax-spill.ll |  2 +-
 test/CodeGen/AMDGPU/branch-relaxation.ll  | 90 ++++++++++++-----------
 3 files changed, 51 insertions(+), 46 deletions(-)

diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index aa79ad8b9b3..d0d8576ade3 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1555,8 +1555,9 @@ unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
   //   buzz;
 
   RS->enterBasicBlockEnd(MBB);
-  unsigned Scav = RS->scavengeRegister(&AMDGPU::SReg_64RegClass,
-                                       MachineBasicBlock::iterator(GetPC), 0);
+  unsigned Scav = RS->scavengeRegisterBackwards(
+    AMDGPU::SReg_64RegClass,
+    MachineBasicBlock::iterator(GetPC), false, 0);
   MRI.replaceRegWith(PCReg, Scav);
   MRI.clearVirtRegs();
   RS->setRegUsed(Scav);
diff --git a/test/CodeGen/AMDGPU/branch-relax-spill.ll b/test/CodeGen/AMDGPU/branch-relax-spill.ll
index db476c21636..3d6906301d7 100644
--- a/test/CodeGen/AMDGPU/branch-relax-spill.ll
+++ b/test/CodeGen/AMDGPU/branch-relax-spill.ll
@@ -3,7 +3,7 @@
 ; FIXME: This should be able to compile, but requires inserting an
 ; extra block to restore the scavenged register.
 
-; FAIL: LLVM ERROR: Error while trying to spill VCC from class SReg_64: Cannot scavenge register without an emergency spill slot!
+; FAIL: LLVM ERROR: Error while trying to spill SGPR0_SGPR1 from class SReg_64: Cannot scavenge register without an emergency spill slot!
 
 define amdgpu_kernel void @spill(i32 addrspace(1)* %arg, i32 %cnd) #0 {
 entry:
diff --git a/test/CodeGen/AMDGPU/branch-relaxation.ll b/test/CodeGen/AMDGPU/branch-relaxation.ll
index d4284a32c0e..72c983d5d97 100644
--- a/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-s-branch-bits=4 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-s-branch-bits=4 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 
 
 ; FIXME: We should use llvm-mc for this, but we can't even parse our own output.
@@ -61,10 +61,10 @@ bb3:
 ; GCN-NEXT: s_cbranch_scc0 [[LONGBB:BB[0-9]+_[0-9]+]]
 
 ; GCN-NEXT: [[LONG_JUMP:BB[0-9]+_[0-9]+]]: ; %bb0
-; GCN-NEXT: s_getpc_b64 vcc
-; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[ENDBB:BB[0-9]+_[0-9]+]]-([[LONG_JUMP]]+4)
-; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0
-; GCN-NEXT: s_setpc_b64 vcc
+; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], [[ENDBB:BB[0-9]+_[0-9]+]]-([[LONG_JUMP]]+4)
+; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], 0
+; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
 
 ; GCN-NEXT: [[LONGBB]]:
 ; GCN-NEXT: ;;#ASMSTART
@@ -105,10 +105,10 @@ bb3:
 ; GCN: s_cbranch_vccz [[LONGBB:BB[0-9]+_[0-9]+]]
 
 ; GCN-NEXT: [[LONG_JUMP:BB[0-9]+_[0-9]+]]: ; %bb0
-; GCN-NEXT: s_getpc_b64 vcc
-; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[ENDBB:BB[0-9]+_[0-9]+]]-([[LONG_JUMP]]+4)
-; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0
-; GCN-NEXT: s_setpc_b64 vcc
+; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], [[ENDBB:BB[0-9]+_[0-9]+]]-([[LONG_JUMP]]+4)
+; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], 0
+; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
 
 ; GCN-NEXT: [[LONGBB]]:
 ; GCN: v_nop_e64
@@ -191,10 +191,11 @@ bb3:
 
 ; GCN-NEXT: [[LONG_JUMP:BB[0-9]+_[0-9]+]]: ; %bb2
 ; GCN-NEXT: ; in Loop: Header=[[LOOPBB]] Depth=1
-; GCN-NEXT: s_getpc_b64 vcc
-; GCN-NEXT: s_sub_u32 vcc_lo, vcc_lo, ([[LONG_JUMP]]+4)-[[LOOPBB]]
-; GCN-NEXT: s_subb_u32 vcc_hi, vcc_hi, 0
-; GCN-NEXT: s_setpc_b64 vcc
+
+; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; GCN-NEXT: s_sub_u32 s[[PC_LO]], s[[PC_LO]], ([[LONG_JUMP]]+4)-[[LOOPBB]]
+; GCN-NEXT: s_subb_u32 s[[PC_HI]], s[[PC_HI]], 0
+; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
 
 ; GCN-NEXT: [[ENDBB]]:
 ; GCN-NEXT: s_endpgm
@@ -225,20 +226,20 @@ bb3:
 ; GCN-NEXT: s_cbranch_scc0 [[BB2:BB[0-9]+_[0-9]+]]
 
 ; GCN-NEXT: [[LONG_JUMP0:BB[0-9]+_[0-9]+]]: ; %bb0
-; GCN-NEXT: s_getpc_b64 vcc
-; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[BB3:BB[0-9]_[0-9]+]]-([[LONG_JUMP0]]+4)
-; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0{{$}}
-; GCN-NEXT: s_setpc_b64 vcc
+; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC0_LO:[0-9]+]]:[[PC0_HI:[0-9]+]]{{\]}}
+; GCN-NEXT: s_add_u32 s[[PC0_LO]], s[[PC0_LO]], [[BB3:BB[0-9]_[0-9]+]]-([[LONG_JUMP0]]+4)
+; GCN-NEXT: s_addc_u32 s[[PC0_HI]], s[[PC0_HI]], 0{{$}}
+; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC0_LO]]:[[PC0_HI]]{{\]}}
 
 ; GCN-NEXT: [[BB2]]: ; %bb2
 ; GCN: v_mov_b32_e32 [[BB2_K:v[0-9]+]], 17
 ; GCN: buffer_store_dword [[BB2_K]]
 
 ; GCN-NEXT: [[LONG_JUMP1:BB[0-9]+_[0-9]+]]: ; %bb2
-; GCN-NEXT: s_getpc_b64 vcc
-; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[BB4:BB[0-9]_[0-9]+]]-([[LONG_JUMP1]]+4)
-; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0{{$}}
-; GCN-NEXT: s_setpc_b64 vcc
+; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC1_LO:[0-9]+]]:[[PC1_HI:[0-9]+]]{{\]}}
+; GCN-NEXT: s_add_u32 s[[PC1_LO]], s[[PC1_LO]], [[BB4:BB[0-9]_[0-9]+]]-([[LONG_JUMP1]]+4)
+; GCN-NEXT: s_addc_u32 s[[PC1_HI]], s[[PC1_HI]], 0{{$}}
+; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC1_LO]]:[[PC1_HI]]{{\]}}
 
 ; GCN: [[BB3]]: ; %bb3
 ; GCN: v_nop_e64
@@ -289,10 +290,11 @@ bb4:
 
 ; GCN-NEXT: [[LONGBB:BB[0-9]+_[0-9]+]]: ; %loop
 ; GCN-NEXT: ; in Loop: Header=[[LOOP]] Depth=1
-; GCN-NEXT: s_getpc_b64 vcc
-; GCN-NEXT: s_sub_u32 vcc_lo, vcc_lo, ([[LONGBB]]+4)-[[LOOP]]
-; GCN-NEXT: s_subb_u32 vcc_hi, vcc_hi, 0{{$}}
-; GCN-NEXT: s_setpc_b64 vcc
+
+; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; GCN-NEXT: s_sub_u32 s[[PC_LO]], s[[PC_LO]], ([[LONGBB]]+4)-[[LOOP]]
+; GCN-NEXT: s_subb_u32 s[[PC_HI]], s[[PC_HI]], 0{{$}}
+; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
 ; GCN-NEXT .Lfunc_end{{[0-9]+}}:
 define amdgpu_kernel void @uniform_unconditional_min_long_backward_branch(i32 addrspace(1)* %arg, i32 %arg1) {
 entry:
@@ -318,10 +320,11 @@ loop:
 ; GCN-NEXT: s_cbranch_scc0 [[BB1:BB[0-9]+_[0-9]+]]
 
 ; GCN-NEXT: [[LONGBB0:BB[0-9]+_[0-9]+]]: ; %bb0
-; GCN-NEXT: s_getpc_b64 vcc
-; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[BB2:BB[0-9]_[0-9]+]]-([[LONGBB0]]+4)
-; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0{{$}}
-; GCN-NEXT: s_setpc_b64 vcc
+
+; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC0_LO:[0-9]+]]:[[PC0_HI:[0-9]+]]{{\]}}
+; GCN-NEXT: s_add_u32 s[[PC0_LO]], s[[PC0_LO]], [[BB2:BB[0-9]_[0-9]+]]-([[LONGBB0]]+4)
+; GCN-NEXT: s_addc_u32 s[[PC0_HI]], s[[PC0_HI]], 0{{$}}
+; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC0_LO]]:[[PC0_HI]]{{\]}}
 
 ; GCN-NEXT: [[BB1]]: ; %bb1
 ; GCN-NEXT: s_load_dword
@@ -330,10 +333,10 @@ loop:
 ; GCN-NEXT: s_cbranch_scc0 [[BB2:BB[0-9]_[0-9]+]]
 
 ; GCN-NEXT: [[LONGBB1:BB[0-9]+_[0-9]+]]: ; %bb1
-; GCN-NEXT: s_getpc_b64 vcc
-; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[BB3:BB[0-9]+_[0-9]+]]-([[LONGBB1]]+4)
-; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0{{$}}
-; GCN-NEXT: s_setpc_b64 vcc
+; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC1_LO:[0-9]+]]:[[PC1_HI:[0-9]+]]{{\]}}
+; GCN-NEXT: s_add_u32 s[[PC1_LO]], s[[PC1_LO]], [[BB3:BB[0-9]+_[0-9]+]]-([[LONGBB1]]+4)
+; GCN-NEXT: s_addc_u32 s[[PC1_HI]], s[[PC1_HI]], 0{{$}}
+; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC1_LO]]:[[PC1_HI]]{{\]}}
 
 ; GCN-NEXT: [[BB2]]: ; %bb2
 ; GCN-NEXT: ;;#ASMSTART
@@ -389,10 +392,10 @@ bb3:
 ; GCN-NEXT: s_cbranch_execnz [[IF:BB[0-9]+_[0-9]+]]
 
 ; GCN-NEXT: [[LONGBB:BB[0-9]+_[0-9]+]]: ; %entry
-; GCN-NEXT: s_getpc_b64 vcc
-; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[BB2:BB[0-9]_[0-9]+]]-([[LONGBB]]+4)
-; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0{{$}}
-; GCN-NEXT: s_setpc_b64 vcc
+; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], [[BB2:BB[0-9]_[0-9]+]]-([[LONGBB]]+4)
+; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], 0{{$}}
+; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
 
 ; GCN-NEXT: [[IF]]: ; %if
 ; GCN: buffer_store_dword
@@ -454,10 +457,10 @@ endif:
 
 ; GCN-NEXT: [[LONGBB:BB[0-9]+_[0-9]+]]: ; %loop
 ; GCN-NEXT: ; in Loop: Header=[[LOOP_BODY]] Depth=1
-; GCN-NEXT: s_getpc_b64 vcc
-; GCN-NEXT: s_sub_u32 vcc_lo, vcc_lo, ([[LONGBB]]+4)-[[LOOP_BODY]]
-; GCN-NEXT: s_subb_u32 vcc_hi, vcc_hi, 0
-; GCN-NEXT: s_setpc_b64 vcc
+; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; GCN-NEXT: s_sub_u32 s[[PC_LO]], s[[PC_LO]], ([[LONGBB]]+4)-[[LOOP_BODY]]
+; GCN-NEXT: s_subb_u32 s[[PC_HI]], s[[PC_HI]], 0
+; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
 
 ; GCN-NEXT: [[RET]]: ; %UnifiedReturnBlock
 ; GCN-NEXT: s_endpgm
@@ -494,8 +497,9 @@ ret:
 ; GCN-NEXT: s_branch [[LONG_BR_0:BB[0-9]+_[0-9]+]]
 ; GCN-NEXT: BB{{[0-9]+_[0-9]+}}:
 
-; GCN: s_add_u32 vcc_lo, vcc_lo, [[LONG_BR_DEST0:BB[0-9]+_[0-9]+]]-(
-; GCN: s_setpc_b64
+; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, [[LONG_BR_DEST0:BB[0-9]+_[0-9]+]]-(
+; GCN-NEXT: s_addc_u32
+; GCN-NEXT: s_setpc_b64
 
 ; GCN-NEXT: [[LONG_BR_0]]:
 ; GCN-DAG: v_cmp_lt_i32
-- 
GitLab


From 3031f2125fb88bfb7fce512c68fb9564808cce57 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 30 Oct 2018 01:37:59 +0000
Subject: [PATCH 0727/1116] AMDGPU: Remove custom BUILD_VECTOR combine

This was looping in a testcase and removing it
now slightly improves a test.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345560 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/SIISelLowering.cpp          | 45 -------------------
 lib/Target/AMDGPU/SIISelLowering.h            |  1 -
 .../AMDGPU/build-vector-insert-elt-infloop.ll | 27 +++++++++++
 test/CodeGen/AMDGPU/mad-mix-hi.ll             | 12 ++---
 4 files changed, 33 insertions(+), 52 deletions(-)
 create mode 100644 test/CodeGen/AMDGPU/build-vector-insert-elt-infloop.ll

diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 3ba04831d15..13b92fc07a1 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -679,7 +679,6 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
   setTargetDAGCombine(ISD::ZERO_EXTEND);
   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
-  setTargetDAGCombine(ISD::BUILD_VECTOR);
 
   // All memory operations. Some folding on the pointer operand is done to help
   // matching the constant offsets in the addressing modes.
@@ -8133,48 +8132,6 @@ SDValue SITargetLowering::performExtractVectorEltCombine(
   return SDValue();
 }
 
-static bool convertBuildVectorCastElt(SelectionDAG &DAG,
-                                      SDValue &Lo, SDValue &Hi) {
-  if (Hi.getOpcode() == ISD::BITCAST &&
-      Hi.getOperand(0).getValueType() == MVT::f16 &&
-      (isa<ConstantSDNode>(Lo) || Lo.isUndef())) {
-    Lo = DAG.getNode(ISD::BITCAST, SDLoc(Lo), MVT::f16, Lo);
-    Hi = Hi.getOperand(0);
-    return true;
-  }
-
-  return false;
-}
-
-SDValue SITargetLowering::performBuildVectorCombine(
-  SDNode *N, DAGCombinerInfo &DCI) const {
-  SDLoc SL(N);
-
-  if (!isTypeLegal(MVT::v2i16))
-    return SDValue();
-  SelectionDAG &DAG = DCI.DAG;
-  EVT VT = N->getValueType(0);
-
-  if (VT == MVT::v2i16) {
-    SDValue Lo = N->getOperand(0);
-    SDValue Hi = N->getOperand(1);
-
-    // v2i16 build_vector (const|undef), (bitcast f16:$x)
-    // -> bitcast (v2f16 build_vector const|undef, $x
-    if (convertBuildVectorCastElt(DAG, Lo, Hi)) {
-      SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Lo, Hi  });
-      return DAG.getNode(ISD::BITCAST, SL, VT, NewVec);
-    }
-
-    if (convertBuildVectorCastElt(DAG, Hi, Lo)) {
-      SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Hi, Lo  });
-      return DAG.getNode(ISD::BITCAST, SL, VT, NewVec);
-    }
-  }
-
-  return SDValue();
-}
-
 unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
                                           const SDNode *N0,
                                           const SDNode *N1) const {
@@ -8783,8 +8740,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
   }
   case ISD::EXTRACT_VECTOR_ELT:
     return performExtractVectorEltCombine(N, DCI);
-  case ISD::BUILD_VECTOR:
-    return performBuildVectorCombine(N, DCI);
   }
   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
 }
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index bcb46ec41d1..09e0a12cce8 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -154,7 +154,6 @@ private:
   SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
-  SDValue performBuildVectorCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
   unsigned getFusedOpcode(const SelectionDAG &DAG,
                           const SDNode *N0, const SDNode *N1) const;
diff --git a/test/CodeGen/AMDGPU/build-vector-insert-elt-infloop.ll b/test/CodeGen/AMDGPU/build-vector-insert-elt-infloop.ll
new file mode 100644
index 00000000000..865dccb2791
--- /dev/null
+++ b/test/CodeGen/AMDGPU/build-vector-insert-elt-infloop.ll
@@ -0,0 +1,27 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; There was an infinite loop in DAGCombiner from a target build_vector
+; combine and a generic insert_vector_elt combine.
+
+; GCN-LABEL: {{^}}combine_loop:
+; GCN: flat_load_ushort
+; GCN: flat_store_short
+; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
+define amdgpu_kernel void @combine_loop(i16* %arg) #0 {
+bb:
+  br label %bb1
+
+bb1:
+  %tmp = phi <2 x i16> [ <i16 15360, i16 15360>, %bb ], [ %tmp5, %bb1 ]
+  %tmp2 = phi half [ 0xH0000, %bb ], [ %tmp8, %bb1 ]
+  %tmp3 = load volatile half, half* null, align 536870912
+  %tmp4 = bitcast half %tmp3 to i16
+  %tmp5 = insertelement <2 x i16> <i16 0, i16 undef>, i16 %tmp4, i32 1
+  %tmp6 = bitcast i16* %arg to half*
+  store half %tmp2, half* %tmp6, align 2
+  %tmp7 = bitcast <2 x i16> %tmp to <2 x half>
+  %tmp8 = extractelement <2 x half> %tmp7, i32 0
+  br label %bb1
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/mad-mix-hi.ll b/test/CodeGen/AMDGPU/mad-mix-hi.ll
index 53a00c240d3..6c27690fb2b 100644
--- a/test/CodeGen/AMDGPU/mad-mix-hi.ll
+++ b/test/CodeGen/AMDGPU/mad-mix-hi.ll
@@ -49,9 +49,9 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo(half %src0, half %src
 }
 
 ; GCN-LABEL: {{^}}v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
-; GFX9: v_mov_b32_e32 v3, 0
-; GFX9-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2
-; GFX9-NEXT: v_mov_b32_e32 v0, v3
+; GFX9: s_waitcnt
+; GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT: s_setpc_b64
 define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack(half %src0, half %src1, half %src2) #0 {
   %src0.ext = fpext half %src0 to float
@@ -66,9 +66,9 @@ define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack(half %src0, half %src1, ha
 }
 
 ; GCN-LABEL: {{^}}v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
-; GFX9: v_mov_b32_e32 v3, 0
-; GFX9-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2
-; GFX9-NEXT: v_mov_b32_e32 v0, v3
+; GFX9: s_waitcnt
+; GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT: s_setpc_b64
 define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext(half %src0, half %src1, half %src2) #0 {
   %src0.ext = fpext half %src0 to float
-- 
GitLab


From 1de3cb13c18a2e0af24e13a71db6622eb2ebd38e Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Tue, 30 Oct 2018 03:27:11 +0000
Subject: [PATCH 0728/1116] [X86] Stop changing f128 fand/for/fxor to v2i64.

The additional patterns don't cost us much and it seems better than changing element widths.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345564 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp    | 38 +++++++++++++--------------
 lib/Target/X86/X86InstrVecCompiler.td | 21 +++++++++++++--
 2 files changed, 38 insertions(+), 21 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 35239b79f18..94f3d8c6026 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -37745,27 +37745,27 @@ static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
   MVT VT = N->getSimpleValueType(0);
   // If we have integer vector types available, use the integer opcodes.
-  if ((VT.isVector() || VT == MVT::f128) && Subtarget.hasSSE2()) {
-    SDLoc dl(N);
+  if (!VT.isVector() || !Subtarget.hasSSE2())
+    return SDValue();
 
-    unsigned IntBits = std::min(VT.getScalarSizeInBits(), 64U);
-    MVT IntSVT = MVT::getIntegerVT(IntBits);
-    MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
+  SDLoc dl(N);
 
-    SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
-    SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
-    unsigned IntOpcode;
-    switch (N->getOpcode()) {
-    default: llvm_unreachable("Unexpected FP logic op");
-    case X86ISD::FOR: IntOpcode = ISD::OR; break;
-    case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
-    case X86ISD::FAND: IntOpcode = ISD::AND; break;
-    case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
-    }
-    SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
-    return DAG.getBitcast(VT, IntOp);
-  }
-  return SDValue();
+  unsigned IntBits = VT.getScalarSizeInBits();
+  MVT IntSVT = MVT::getIntegerVT(IntBits);
+  MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
+
+  SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
+  SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
+  unsigned IntOpcode;
+  switch (N->getOpcode()) {
+  default: llvm_unreachable("Unexpected FP logic op");
+  case X86ISD::FOR:   IntOpcode = ISD::OR; break;
+  case X86ISD::FXOR:  IntOpcode = ISD::XOR; break;
+  case X86ISD::FAND:  IntOpcode = ISD::AND; break;
+  case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
+  }
+  SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
+  return DAG.getBitcast(VT, IntOp);
 }
 
 
diff --git a/lib/Target/X86/X86InstrVecCompiler.td b/lib/Target/X86/X86InstrVecCompiler.td
index 0aeed51dde5..c417dc99b84 100644
--- a/lib/Target/X86/X86InstrVecCompiler.td
+++ b/lib/Target/X86/X86InstrVecCompiler.td
@@ -466,8 +466,6 @@ def : Pat<(loadf128 addr:$src),
           (VMOVUPSZ128rm addr:$src)>;
 }
 
-// With SSE2 the DAG combiner converts fp logic ops to integer logic ops to
-// reduce patterns.
 let Predicates = [UseSSE1] in {
 // andps is shorter than andpd or pand. andps is SSE and andpd/pand are in SSE2
 def : Pat<(f128 (X86fand VR128:$src1, (memopf128 addr:$src2))),
@@ -489,4 +487,23 @@ def : Pat<(f128 (X86fxor VR128:$src1, VR128:$src2)),
           (XORPSrr VR128:$src1, VR128:$src2)>;
 }
 
+let Predicates = [HasAVX] in {
+// andps is shorter than andpd or pand. andps is SSE and andpd/pand are in SSE2
+def : Pat<(f128 (X86fand VR128:$src1, (loadf128 addr:$src2))),
+          (VANDPSrm VR128:$src1, f128mem:$src2)>;
+
+def : Pat<(f128 (X86fand VR128:$src1, VR128:$src2)),
+          (VANDPSrr VR128:$src1, VR128:$src2)>;
 
+def : Pat<(f128 (X86for VR128:$src1, (loadf128 addr:$src2))),
+          (VORPSrm VR128:$src1, f128mem:$src2)>;
+
+def : Pat<(f128 (X86for VR128:$src1, VR128:$src2)),
+          (VORPSrr VR128:$src1, VR128:$src2)>;
+
+def : Pat<(f128 (X86fxor VR128:$src1, (loadf128 addr:$src2))),
+          (VXORPSrm VR128:$src1, f128mem:$src2)>;
+
+def : Pat<(f128 (X86fxor VR128:$src1, VR128:$src2)),
+          (VXORPSrr VR128:$src1, VR128:$src2)>;
+}
-- 
GitLab


From 141f443f6414fb710d76b61c50558330346c3139 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Tue, 30 Oct 2018 03:27:12 +0000
Subject: [PATCH 0729/1116] [X86] Cleanup the code in LowerFABSorFNEG and
 LowerFCOPYSIGN a little. NFC

Use SelectionDAG::EVTToAPFloatSemantics. Make the LogicVT calculation in LowerFABSorFNEG similar to LowerFCOPYSIGN. Use APInt::getSignedMaxValue instead of ~APInt::getSignMask.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345565 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 50 ++++++++++++------------------
 1 file changed, 20 insertions(+), 30 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 94f3d8c6026..c7d398873d2 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -17956,43 +17956,36 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
   MVT VT = Op.getSimpleValueType();
 
   bool IsF128 = (VT == MVT::f128);
+  assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
+          VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
+          VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
+         "Unexpected type in LowerFABSorFNEG");
 
   // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
   // decide if we should generate a 16-byte constant mask when we only need 4 or
   // 8 bytes for the scalar case.
 
-  MVT LogicVT;
-  MVT EltVT;
-
-  if (VT.isVector()) {
-    LogicVT = VT;
-    EltVT = VT.getVectorElementType();
-  } else if (IsF128) {
-    // SSE instructions are used for optimized f128 logical operations.
-    LogicVT = MVT::f128;
-    EltVT = VT;
-  } else {
-    // There are no scalar bitwise logical SSE/AVX instructions, so we
-    // generate a 16-byte vector constant and logic op even for the scalar case.
-    // Using a 16-byte mask allows folding the load of the mask with
-    // the logic op, so it can save (~4 bytes) on code size.
+  // There are no scalar bitwise logical SSE/AVX instructions, so we
+  // generate a 16-byte vector constant and logic op even for the scalar case.
+  // Using a 16-byte mask allows folding the load of the mask with
+  // the logic op, so it can save (~4 bytes) on code size.
+  bool IsFakeVector = !VT.isVector() && !IsF128;
+  MVT LogicVT = VT;
+  if (IsFakeVector)
     LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
-    EltVT = VT;
-  }
 
-  unsigned EltBits = EltVT.getSizeInBits();
+  unsigned EltBits = VT.getScalarSizeInBits();
   // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
-  APInt MaskElt =
-    IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits);
-  const fltSemantics &Sem =
-      EltVT == MVT::f64 ? APFloat::IEEEdouble() :
-          (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
+  APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
+                           APInt::getSignMask(EltBits);
+  const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
   SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
 
   SDValue Op0 = Op.getOperand(0);
   bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
-  unsigned LogicOp =
-    IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
+  unsigned LogicOp = IsFABS  ? X86ISD::FAND :
+                     IsFNABS ? X86ISD::FOR  :
+                               X86ISD::FXOR;
   SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
 
   if (VT.isVector() || IsF128)
@@ -18028,10 +18021,7 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
           VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
          "Unexpected type in LowerFCOPYSIGN");
 
-  MVT EltVT = VT.getScalarType();
-  const fltSemantics &Sem =
-      EltVT == MVT::f64 ? APFloat::IEEEdouble()
-                        : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
+  const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
 
   // Perform all scalar logic operations as 16-byte vectors because there are no
   // scalar FP logic instructions in SSE.
@@ -18048,7 +18038,7 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
   SDValue SignMask = DAG.getConstantFP(
       APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
   SDValue MagMask = DAG.getConstantFP(
-      APFloat(Sem, ~APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
+      APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
 
   // First, clear all bits but the sign bit from the second operand (sign).
   if (IsFakeVector)
-- 
GitLab


From 4bef8b292d3a58063a9de0017a738917ff0188bb Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Tue, 30 Oct 2018 03:27:13 +0000
Subject: [PATCH 0730/1116] [AArch64] Add test case for D53229. NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345566 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/AArch64/bitcast-promote-widen.ll | 31 +++++++++++++++++++
 1 file changed, 31 insertions(+)
 create mode 100644 test/CodeGen/AArch64/bitcast-promote-widen.ll

diff --git a/test/CodeGen/AArch64/bitcast-promote-widen.ll b/test/CodeGen/AArch64/bitcast-promote-widen.ll
new file mode 100644
index 00000000000..a81c1c6f140
--- /dev/null
+++ b/test/CodeGen/AArch64/bitcast-promote-widen.ll
@@ -0,0 +1,31 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-unknown-linux-gnu | FileCheck %s
+
+; Test cases of bitcasts where one type needs to be widened and one needs to be promoted.
+
+define <2 x i16> @bitcast_v2i16_v2f16(<2 x half> %x) {
+; CHECK-LABEL: bitcast_v2i16_v2f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16 // =16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    str s0, [sp, #12]
+; CHECK-NEXT:    ldrh w8, [sp, #12]
+; CHECK-NEXT:    ldrh w9, [sp, #14]
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    mov v0.s[1], w9
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    add sp, sp, #16 // =16
+; CHECK-NEXT:    ret
+  %y = bitcast <2 x half> %x to <2 x i16>
+  ret <2 x i16> %y
+}
+
+define <2 x half> @bitcast_v2f16_v2i16(<2 x i16> %x) {
+; CHECK-LABEL: bitcast_v2f16_v2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uzp1 v0.4h, v0.4h, v0.4h
+; CHECK-NEXT:    ret
+  %y = bitcast <2 x i16> %x to <2 x half>
+  ret <2 x half> %y
+}
-- 
GitLab


From 52ba37f41c79d6e853309e5592bc42dc2bfbf3f6 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Tue, 30 Oct 2018 03:27:15 +0000
Subject: [PATCH 0731/1116] [LegalizeTypes] Teach PromoteIntRes_BITCAST to
 better handle a bitcast with vector output type and a vector input type that
 needs to be widened

Summary: Previously if we had a bitcast vector output type that needs promotion and a vector input type that needs widening we would just do a stack store and load to handle the conversion. We can do a little better if we can widen the bitcast to a legal vector type the same size as the widened input type. Then we can do the bitcast between this widened type and the widened input type. Afterwards we can extract_subvector back to the original output and any_extend that. Type legalization will then circle back and handle promotion of the extract_subvector and the any_extend will just be removed. This will avoid going through the stack and allows us to remove a custom version of this legalization from X86.

Reviewers: efriedma, RKSimon

Reviewed By: efriedma

Subscribers: javed.absar, llvm-commits

Differential Revision: https://reviews.llvm.org/D53229

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345567 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../SelectionDAG/LegalizeIntegerTypes.cpp     | 20 +++++++++++++++++++
 lib/Target/X86/X86ISelLowering.cpp            | 10 ++--------
 test/CodeGen/AArch64/bitcast-promote-widen.ll | 14 +++++--------
 3 files changed, 27 insertions(+), 17 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index f24659ac274..2b1df0165d3 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -310,6 +310,26 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BITCAST(SDNode *N) {
     // make us bitcast between two vectors which are legalized in different ways.
     if (NOutVT.bitsEq(NInVT) && !NOutVT.isVector())
       return DAG.getNode(ISD::BITCAST, dl, NOutVT, GetWidenedVector(InOp));
+    // If the output type is also a vector and widening it to the same size
+    // as the widened input type would be a legal type, we can widen the bitcast
+    // and handle the promotion after.
+    if (NOutVT.isVector()) {
+      unsigned WidenInSize = NInVT.getSizeInBits();
+      unsigned OutSize = OutVT.getSizeInBits();
+      if (WidenInSize % OutSize == 0) {
+        unsigned Scale = WidenInSize / OutSize;
+        EVT WideOutVT = EVT::getVectorVT(*DAG.getContext(),
+                                         OutVT.getVectorElementType(),
+                                         OutVT.getVectorNumElements() * Scale);
+        if (isTypeLegal(WideOutVT)) {
+          InOp = DAG.getBitcast(WideOutVT, GetWidenedVector(InOp));
+          MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
+          InOp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OutVT, InOp,
+                             DAG.getConstant(0, dl, IdxTy));
+          return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, InOp);
+        }
+      }
+    }
   }
 
   return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT,
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index c7d398873d2..da5340a050b 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -26338,7 +26338,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
       return;
     }
 
-    if ((SrcVT != MVT::f64 && SrcVT != MVT::v2f32) ||
+    if (SrcVT != MVT::f64 ||
         (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8) ||
         getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector)
       return;
@@ -26347,13 +26347,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     EVT SVT = DstVT.getVectorElementType();
     EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
     SDValue Res;
-    if (SrcVT == MVT::f64)
-      Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
-                             MVT::v2f64, N->getOperand(0));
-    else
-      Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, N->getOperand(0),
-                        DAG.getUNDEF(MVT::v2f32));
-
+    Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, N->getOperand(0));
     Res = DAG.getBitcast(WiderVT, Res);
     Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, Res,
                       DAG.getIntPtrConstant(0, dl));
diff --git a/test/CodeGen/AArch64/bitcast-promote-widen.ll b/test/CodeGen/AArch64/bitcast-promote-widen.ll
index a81c1c6f140..74f9e9c8566 100644
--- a/test/CodeGen/AArch64/bitcast-promote-widen.ll
+++ b/test/CodeGen/AArch64/bitcast-promote-widen.ll
@@ -6,16 +6,12 @@
 define <2 x i16> @bitcast_v2i16_v2f16(<2 x half> %x) {
 ; CHECK-LABEL: bitcast_v2i16_v2f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16 // =16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    str s0, [sp, #12]
-; CHECK-NEXT:    ldrh w8, [sp, #12]
-; CHECK-NEXT:    ldrh w9, [sp, #14]
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    mov v0.s[1], w9
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT:    add sp, sp, #16 // =16
+; CHECK-NEXT:    umov w8, v0.h[0]
+; CHECK-NEXT:    fmov s1, w8
+; CHECK-NEXT:    umov w8, v0.h[1]
+; CHECK-NEXT:    mov v1.s[1], w8
+; CHECK-NEXT:    mov v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %y = bitcast <2 x half> %x to <2 x i16>
   ret <2 x i16> %y
-- 
GitLab


From 8f9fb8bab2e9b5b27fe40d700d2abe967b99fbb5 Mon Sep 17 00:00:00 2001
From: David Bolvansky <david.bolvansky@gmail.com>
Date: Tue, 30 Oct 2018 09:07:22 +0000
Subject: [PATCH 0732/1116] [DAGCombiner] Improve X div/rem Y fold if single
 bit element type

Summary: Tests by @spatel, thanks

Reviewers: spatel, RKSimon

Reviewed By: spatel

Subscribers: sdardis, atanasyan, llvm-commits, spatel

Differential Revision: https://reviews.llvm.org/D52668

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345575 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp |   7 +-
 test/CodeGen/Mips/llvm-ir/sdiv.ll        |  35 +--
 test/CodeGen/Mips/llvm-ir/srem.ll        |  35 +--
 test/CodeGen/Mips/llvm-ir/udiv.ll        |  25 +-
 test/CodeGen/Mips/llvm-ir/urem.ll        |  42 +--
 test/CodeGen/X86/combine-sdiv.ll         | 312 +----------------------
 test/CodeGen/X86/combine-srem.ll         |  64 +----
 test/CodeGen/X86/combine-udiv.ll         | 159 +-----------
 test/CodeGen/X86/combine-urem.ll         |  93 +------
 test/CodeGen/X86/pr38539.ll              | 230 ++---------------
 10 files changed, 62 insertions(+), 940 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 906223a624c..64c7dca0f6e 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3138,11 +3138,12 @@ static SDValue simplifyDivRem(SDNode *N, SelectionDAG &DAG) {
 
   // X / 1 -> X
   // X % 1 -> 0
-  if (N1C && N1C->isOne())
-    return IsDiv ? N0 : DAG.getConstant(0, DL, VT);
   // If this is a boolean op (single-bit element type), we can't have
   // division-by-zero or remainder-by-zero, so assume the divisor is 1.
-  // Similarly, if we're zero-extending a boolean divisor, then assume it's a 1.
+  // TODO: Similarly, if we're zero-extending a boolean divisor, then assume
+  // it's a 1.
+  if ((N1C && N1C->isOne()) || (VT.getScalarType() == MVT::i1))
+    return IsDiv ? N0 : DAG.getConstant(0, DL, VT);
 
   return SDValue();
 }
diff --git a/test/CodeGen/Mips/llvm-ir/sdiv.ll b/test/CodeGen/Mips/llvm-ir/sdiv.ll
index 03b831191a8..e54eaa63222 100644
--- a/test/CodeGen/Mips/llvm-ir/sdiv.ll
+++ b/test/CodeGen/Mips/llvm-ir/sdiv.ll
@@ -35,55 +35,32 @@
 define signext i1 @sdiv_i1(i1 signext %a, i1 signext %b) {
 ; GP32-LABEL: sdiv_i1:
 ; GP32:       # %bb.0: # %entry
-; GP32-NEXT:    div $zero, $4, $5
-; GP32-NEXT:    teq $5, $zero, 7
-; GP32-NEXT:    mflo $1
-; GP32-NEXT:    andi $1, $1, 1
 ; GP32-NEXT:    jr $ra
-; GP32-NEXT:    negu $2, $1
+; GP32-NEXT:    move $2, $4
 ;
 ; GP32R6-LABEL: sdiv_i1:
 ; GP32R6:       # %bb.0: # %entry
-; GP32R6-NEXT:    div $1, $4, $5
-; GP32R6-NEXT:    teq $5, $zero, 7
-; GP32R6-NEXT:    andi $1, $1, 1
 ; GP32R6-NEXT:    jr $ra
-; GP32R6-NEXT:    negu $2, $1
+; GP32R6-NEXT:    move $2, $4
 ;
 ; GP64-LABEL: sdiv_i1:
 ; GP64:       # %bb.0: # %entry
-; GP64-NEXT:    div $zero, $4, $5
-; GP64-NEXT:    teq $5, $zero, 7
-; GP64-NEXT:    mflo $1
-; GP64-NEXT:    andi $1, $1, 1
 ; GP64-NEXT:    jr $ra
-; GP64-NEXT:    negu $2, $1
+; GP64-NEXT:    move $2, $4
 ;
 ; GP64R6-LABEL: sdiv_i1:
 ; GP64R6:       # %bb.0: # %entry
-; GP64R6-NEXT:    div $1, $4, $5
-; GP64R6-NEXT:    teq $5, $zero, 7
-; GP64R6-NEXT:    andi $1, $1, 1
 ; GP64R6-NEXT:    jr $ra
-; GP64R6-NEXT:    negu $2, $1
+; GP64R6-NEXT:    move $2, $4
 ;
 ; MMR3-LABEL: sdiv_i1:
 ; MMR3:       # %bb.0: # %entry
-; MMR3-NEXT:    div $zero, $4, $5
-; MMR3-NEXT:    teq $5, $zero, 7
-; MMR3-NEXT:    mflo16 $2
-; MMR3-NEXT:    andi16 $2, $2, 1
-; MMR3-NEXT:    li16 $3, 0
-; MMR3-NEXT:    subu16 $2, $3, $2
+; MMR3-NEXT:    move $2, $4
 ; MMR3-NEXT:    jrc $ra
 ;
 ; MMR6-LABEL: sdiv_i1:
 ; MMR6:       # %bb.0: # %entry
-; MMR6-NEXT:    div $2, $4, $5
-; MMR6-NEXT:    teq $5, $zero, 7
-; MMR6-NEXT:    andi16 $2, $2, 1
-; MMR6-NEXT:    li16 $3, 0
-; MMR6-NEXT:    subu16 $2, $3, $2
+; MMR6-NEXT:    move $2, $4
 ; MMR6-NEXT:    jrc $ra
 entry:
   %r = sdiv i1 %a, %b
diff --git a/test/CodeGen/Mips/llvm-ir/srem.ll b/test/CodeGen/Mips/llvm-ir/srem.ll
index 66ee6c01bd2..ef0502c85d5 100644
--- a/test/CodeGen/Mips/llvm-ir/srem.ll
+++ b/test/CodeGen/Mips/llvm-ir/srem.ll
@@ -35,55 +35,32 @@
 define signext i1 @srem_i1(i1 signext %a, i1 signext %b) {
 ; GP32-LABEL: srem_i1:
 ; GP32:       # %bb.0: # %entry
-; GP32-NEXT:    div $zero, $4, $5
-; GP32-NEXT:    teq $5, $zero, 7
-; GP32-NEXT:    mfhi $1
-; GP32-NEXT:    andi $1, $1, 1
 ; GP32-NEXT:    jr $ra
-; GP32-NEXT:    negu $2, $1
+; GP32-NEXT:    addiu $2, $zero, 0
 ;
 ; GP32R6-LABEL: srem_i1:
 ; GP32R6:       # %bb.0: # %entry
-; GP32R6-NEXT:    mod $1, $4, $5
-; GP32R6-NEXT:    teq $5, $zero, 7
-; GP32R6-NEXT:    andi $1, $1, 1
 ; GP32R6-NEXT:    jr $ra
-; GP32R6-NEXT:    negu $2, $1
+; GP32R6-NEXT:    addiu $2, $zero, 0
 ;
 ; GP64-LABEL: srem_i1:
 ; GP64:       # %bb.0: # %entry
-; GP64-NEXT:    div $zero, $4, $5
-; GP64-NEXT:    teq $5, $zero, 7
-; GP64-NEXT:    mfhi $1
-; GP64-NEXT:    andi $1, $1, 1
 ; GP64-NEXT:    jr $ra
-; GP64-NEXT:    negu $2, $1
+; GP64-NEXT:    addiu $2, $zero, 0
 ;
 ; GP64R6-LABEL: srem_i1:
 ; GP64R6:       # %bb.0: # %entry
-; GP64R6-NEXT:    mod $1, $4, $5
-; GP64R6-NEXT:    teq $5, $zero, 7
-; GP64R6-NEXT:    andi $1, $1, 1
 ; GP64R6-NEXT:    jr $ra
-; GP64R6-NEXT:    negu $2, $1
+; GP64R6-NEXT:    addiu $2, $zero, 0
 ;
 ; MMR3-LABEL: srem_i1:
 ; MMR3:       # %bb.0: # %entry
-; MMR3-NEXT:    div $zero, $4, $5
-; MMR3-NEXT:    teq $5, $zero, 7
-; MMR3-NEXT:    mfhi16 $2
-; MMR3-NEXT:    andi16 $2, $2, 1
-; MMR3-NEXT:    li16 $3, 0
-; MMR3-NEXT:    subu16 $2, $3, $2
+; MMR3-NEXT:    li16 $2, 0
 ; MMR3-NEXT:    jrc $ra
 ;
 ; MMR6-LABEL: srem_i1:
 ; MMR6:       # %bb.0: # %entry
-; MMR6-NEXT:    mod $2, $4, $5
-; MMR6-NEXT:    teq $5, $zero, 7
-; MMR6-NEXT:    andi16 $2, $2, 1
-; MMR6-NEXT:    li16 $3, 0
-; MMR6-NEXT:    subu16 $2, $3, $2
+; MMR6-NEXT:    li16 $2, 0
 ; MMR6-NEXT:    jrc $ra
 entry:
   %r = srem i1 %a, %b
diff --git a/test/CodeGen/Mips/llvm-ir/udiv.ll b/test/CodeGen/Mips/llvm-ir/udiv.ll
index e0ba7bc770e..8694a9f92b6 100644
--- a/test/CodeGen/Mips/llvm-ir/udiv.ll
+++ b/test/CodeGen/Mips/llvm-ir/udiv.ll
@@ -35,41 +35,32 @@
 define zeroext i1 @udiv_i1(i1 zeroext %a, i1 zeroext %b) {
 ; GP32-LABEL: udiv_i1:
 ; GP32:       # %bb.0: # %entry
-; GP32-NEXT:    divu $zero, $4, $5
-; GP32-NEXT:    teq $5, $zero, 7
 ; GP32-NEXT:    jr $ra
-; GP32-NEXT:    mflo $2
+; GP32-NEXT:    move $2, $4
 ;
 ; GP32R6-LABEL: udiv_i1:
 ; GP32R6:       # %bb.0: # %entry
-; GP32R6-NEXT:    divu $2, $4, $5
-; GP32R6-NEXT:    teq $5, $zero, 7
-; GP32R6-NEXT:    jrc $ra
+; GP32R6-NEXT:    jr $ra
+; GP32R6-NEXT:    move $2, $4
 ;
 ; GP64-LABEL: udiv_i1:
 ; GP64:       # %bb.0: # %entry
-; GP64-NEXT:    divu $zero, $4, $5
-; GP64-NEXT:    teq $5, $zero, 7
 ; GP64-NEXT:    jr $ra
-; GP64-NEXT:    mflo $2
+; GP64-NEXT:    move $2, $4
 ;
 ; GP64R6-LABEL: udiv_i1:
 ; GP64R6:       # %bb.0: # %entry
-; GP64R6-NEXT:    divu $2, $4, $5
-; GP64R6-NEXT:    teq $5, $zero, 7
-; GP64R6-NEXT:    jrc $ra
+; GP64R6-NEXT:    jr $ra
+; GP64R6-NEXT:    move $2, $4
 ;
 ; MMR3-LABEL: udiv_i1:
 ; MMR3:       # %bb.0: # %entry
-; MMR3-NEXT:    divu $zero, $4, $5
-; MMR3-NEXT:    teq $5, $zero, 7
-; MMR3-NEXT:    mflo16 $2
+; MMR3-NEXT:    move $2, $4
 ; MMR3-NEXT:    jrc $ra
 ;
 ; MMR6-LABEL: udiv_i1:
 ; MMR6:       # %bb.0: # %entry
-; MMR6-NEXT:    divu $2, $4, $5
-; MMR6-NEXT:    teq $5, $zero, 7
+; MMR6-NEXT:    move $2, $4
 ; MMR6-NEXT:    jrc $ra
 entry:
   %r = udiv i1 %a, %b
diff --git a/test/CodeGen/Mips/llvm-ir/urem.ll b/test/CodeGen/Mips/llvm-ir/urem.ll
index 83830a3689b..b744f706cbf 100644
--- a/test/CodeGen/Mips/llvm-ir/urem.ll
+++ b/test/CodeGen/Mips/llvm-ir/urem.ll
@@ -35,64 +35,32 @@
 define signext i1 @urem_i1(i1 signext %a, i1 signext %b) {
 ; GP32-LABEL: urem_i1:
 ; GP32:       # %bb.0: # %entry
-; GP32-NEXT:    andi $1, $5, 1
-; GP32-NEXT:    andi $2, $4, 1
-; GP32-NEXT:    divu $zero, $2, $1
-; GP32-NEXT:    teq $1, $zero, 7
-; GP32-NEXT:    mfhi $1
-; GP32-NEXT:    andi $1, $1, 1
 ; GP32-NEXT:    jr $ra
-; GP32-NEXT:    negu $2, $1
+; GP32-NEXT:    addiu $2, $zero, 0
 ;
 ; GP32R6-LABEL: urem_i1:
 ; GP32R6:       # %bb.0: # %entry
-; GP32R6-NEXT:    andi $1, $5, 1
-; GP32R6-NEXT:    andi $2, $4, 1
-; GP32R6-NEXT:    modu $2, $2, $1
-; GP32R6-NEXT:    teq $1, $zero, 7
 ; GP32R6-NEXT:    jr $ra
-; GP32R6-NEXT:    negu $2, $2
+; GP32R6-NEXT:    addiu $2, $zero, 0
 ;
 ; GP64-LABEL: urem_i1:
 ; GP64:       # %bb.0: # %entry
-; GP64-NEXT:    andi $1, $5, 1
-; GP64-NEXT:    andi $2, $4, 1
-; GP64-NEXT:    divu $zero, $2, $1
-; GP64-NEXT:    teq $1, $zero, 7
-; GP64-NEXT:    mfhi $1
-; GP64-NEXT:    andi $1, $1, 1
 ; GP64-NEXT:    jr $ra
-; GP64-NEXT:    negu $2, $1
+; GP64-NEXT:    addiu $2, $zero, 0
 ;
 ; GP64R6-LABEL: urem_i1:
 ; GP64R6:       # %bb.0: # %entry
-; GP64R6-NEXT:    andi $1, $5, 1
-; GP64R6-NEXT:    andi $2, $4, 1
-; GP64R6-NEXT:    modu $2, $2, $1
-; GP64R6-NEXT:    teq $1, $zero, 7
 ; GP64R6-NEXT:    jr $ra
-; GP64R6-NEXT:    negu $2, $2
+; GP64R6-NEXT:    addiu $2, $zero, 0
 ;
 ; MMR3-LABEL: urem_i1:
 ; MMR3:       # %bb.0: # %entry
-; MMR3-NEXT:    andi16 $2, $5, 1
-; MMR3-NEXT:    andi16 $3, $4, 1
-; MMR3-NEXT:    divu $zero, $3, $2
-; MMR3-NEXT:    teq $2, $zero, 7
-; MMR3-NEXT:    mfhi16 $2
-; MMR3-NEXT:    andi16 $2, $2, 1
-; MMR3-NEXT:    li16 $3, 0
-; MMR3-NEXT:    subu16 $2, $3, $2
+; MMR3-NEXT:    li16 $2, 0
 ; MMR3-NEXT:    jrc $ra
 ;
 ; MMR6-LABEL: urem_i1:
 ; MMR6:       # %bb.0: # %entry
-; MMR6-NEXT:    andi16 $2, $5, 1
-; MMR6-NEXT:    andi16 $3, $4, 1
-; MMR6-NEXT:    modu $3, $3, $2
-; MMR6-NEXT:    teq $2, $zero, 7
 ; MMR6-NEXT:    li16 $2, 0
-; MMR6-NEXT:    subu16 $2, $2, $3
 ; MMR6-NEXT:    jrc $ra
 entry:
   %r = urem i1 %a, %b
diff --git a/test/CodeGen/X86/combine-sdiv.ll b/test/CodeGen/X86/combine-sdiv.ll
index 26a3cd47645..baff826858c 100644
--- a/test/CodeGen/X86/combine-sdiv.ll
+++ b/test/CodeGen/X86/combine-sdiv.ll
@@ -3294,322 +3294,16 @@ define i1 @bool_sdiv(i1 %x, i1 %y) {
 ; CHECK-LABEL: bool_sdiv:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    andb $1, %al
-; CHECK-NEXT:    negb %al
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
-; CHECK-NEXT:    cbtw
-; CHECK-NEXT:    andb $1, %sil
-; CHECK-NEXT:    negb %sil
-; CHECK-NEXT:    idivb %sil
 ; CHECK-NEXT:    retq
   %r = sdiv i1 %x, %y
   ret i1 %r
 }
 
 define <4 x i1> @boolvec_sdiv(<4 x i1> %x, <4 x i1> %y) {
-; SSE2-LABEL: boolvec_sdiv:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pslld $31, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pslld $31, %xmm0
-; SSE2-NEXT:    psrad $31, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
-; SSE2-NEXT:    movd %xmm2, %eax
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
-; SSE2-NEXT:    movd %xmm2, %ecx
-; SSE2-NEXT:    cltd
-; SSE2-NEXT:    idivl %ecx
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; SSE2-NEXT:    movd %xmm3, %eax
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
-; SSE2-NEXT:    movd %xmm3, %ecx
-; SSE2-NEXT:    cltd
-; SSE2-NEXT:    idivl %ecx
-; SSE2-NEXT:    movd %eax, %xmm3
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    movd %xmm1, %ecx
-; SSE2-NEXT:    cltd
-; SSE2-NEXT:    idivl %ecx
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT:    movd %xmm0, %ecx
-; SSE2-NEXT:    cltd
-; SSE2-NEXT:    idivl %ecx
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: boolvec_sdiv:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pslld $31, %xmm1
-; SSE41-NEXT:    psrad $31, %xmm1
-; SSE41-NEXT:    pslld $31, %xmm0
-; SSE41-NEXT:    psrad $31, %xmm0
-; SSE41-NEXT:    pextrd $1, %xmm0, %eax
-; SSE41-NEXT:    pextrd $1, %xmm1, %ecx
-; SSE41-NEXT:    cltd
-; SSE41-NEXT:    idivl %ecx
-; SSE41-NEXT:    movl %eax, %ecx
-; SSE41-NEXT:    movd %xmm0, %eax
-; SSE41-NEXT:    movd %xmm1, %esi
-; SSE41-NEXT:    cltd
-; SSE41-NEXT:    idivl %esi
-; SSE41-NEXT:    movd %eax, %xmm2
-; SSE41-NEXT:    pinsrd $1, %ecx, %xmm2
-; SSE41-NEXT:    pextrd $2, %xmm0, %eax
-; SSE41-NEXT:    pextrd $2, %xmm1, %ecx
-; SSE41-NEXT:    cltd
-; SSE41-NEXT:    idivl %ecx
-; SSE41-NEXT:    pinsrd $2, %eax, %xmm2
-; SSE41-NEXT:    pextrd $3, %xmm0, %eax
-; SSE41-NEXT:    pextrd $3, %xmm1, %ecx
-; SSE41-NEXT:    cltd
-; SSE41-NEXT:    idivl %ecx
-; SSE41-NEXT:    pinsrd $3, %eax, %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    retq
-;
-; AVX1-LABEL: boolvec_sdiv:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
-; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
-; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
-; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
-; AVX1-NEXT:    vpextrd $1, %xmm0, %eax
-; AVX1-NEXT:    vpextrd $1, %xmm1, %ecx
-; AVX1-NEXT:    cltd
-; AVX1-NEXT:    idivl %ecx
-; AVX1-NEXT:    movl %eax, %ecx
-; AVX1-NEXT:    vmovd %xmm0, %eax
-; AVX1-NEXT:    vmovd %xmm1, %esi
-; AVX1-NEXT:    cltd
-; AVX1-NEXT:    idivl %esi
-; AVX1-NEXT:    vmovd %eax, %xmm2
-; AVX1-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrd $2, %xmm0, %eax
-; AVX1-NEXT:    vpextrd $2, %xmm1, %ecx
-; AVX1-NEXT:    cltd
-; AVX1-NEXT:    idivl %ecx
-; AVX1-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrd $3, %xmm0, %eax
-; AVX1-NEXT:    vpextrd $3, %xmm1, %ecx
-; AVX1-NEXT:    cltd
-; AVX1-NEXT:    idivl %ecx
-; AVX1-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: boolvec_sdiv:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
-; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1
-; AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
-; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
-; AVX2-NEXT:    vpextrd $1, %xmm0, %eax
-; AVX2-NEXT:    vpextrd $1, %xmm1, %ecx
-; AVX2-NEXT:    cltd
-; AVX2-NEXT:    idivl %ecx
-; AVX2-NEXT:    movl %eax, %ecx
-; AVX2-NEXT:    vmovd %xmm0, %eax
-; AVX2-NEXT:    vmovd %xmm1, %esi
-; AVX2-NEXT:    cltd
-; AVX2-NEXT:    idivl %esi
-; AVX2-NEXT:    vmovd %eax, %xmm2
-; AVX2-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrd $2, %xmm0, %eax
-; AVX2-NEXT:    vpextrd $2, %xmm1, %ecx
-; AVX2-NEXT:    cltd
-; AVX2-NEXT:    idivl %ecx
-; AVX2-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrd $3, %xmm0, %eax
-; AVX2-NEXT:    vpextrd $3, %xmm1, %ecx
-; AVX2-NEXT:    cltd
-; AVX2-NEXT:    idivl %ecx
-; AVX2-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm0
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: boolvec_sdiv:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpslld $31, %xmm1, %xmm1
-; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k3
-; AVX512F-NEXT:    kshiftrw $3, %k3, %k0
-; AVX512F-NEXT:    vpslld $31, %xmm0, %xmm0
-; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k4
-; AVX512F-NEXT:    kshiftrw $3, %k4, %k1
-; AVX512F-NEXT:    kshiftrw $2, %k3, %k2
-; AVX512F-NEXT:    kshiftrw $2, %k4, %k5
-; AVX512F-NEXT:    kmovw %k5, %ecx
-; AVX512F-NEXT:    kshiftrw $1, %k3, %k5
-; AVX512F-NEXT:    kmovw %k3, %edi
-; AVX512F-NEXT:    kshiftrw $1, %k4, %k3
-; AVX512F-NEXT:    kmovw %k4, %esi
-; AVX512F-NEXT:    kmovw %k5, %edx
-; AVX512F-NEXT:    kmovw %k3, %eax
-; AVX512F-NEXT:    andb $1, %al
-; AVX512F-NEXT:    negb %al
-; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
-; AVX512F-NEXT:    cbtw
-; AVX512F-NEXT:    andb $1, %dl
-; AVX512F-NEXT:    negb %dl
-; AVX512F-NEXT:    idivb %dl
-; AVX512F-NEXT:    movl %eax, %edx
-; AVX512F-NEXT:    andb $1, %sil
-; AVX512F-NEXT:    negb %sil
-; AVX512F-NEXT:    movl %esi, %eax
-; AVX512F-NEXT:    cbtw
-; AVX512F-NEXT:    andb $1, %dil
-; AVX512F-NEXT:    negb %dil
-; AVX512F-NEXT:    idivb %dil
-; AVX512F-NEXT:    movl %eax, %esi
-; AVX512F-NEXT:    andb $1, %cl
-; AVX512F-NEXT:    negb %cl
-; AVX512F-NEXT:    movl %ecx, %eax
-; AVX512F-NEXT:    cbtw
-; AVX512F-NEXT:    kmovw %k2, %ecx
-; AVX512F-NEXT:    andb $1, %cl
-; AVX512F-NEXT:    negb %cl
-; AVX512F-NEXT:    idivb %cl
-; AVX512F-NEXT:    movl %eax, %ecx
-; AVX512F-NEXT:    kmovw %k1, %eax
-; AVX512F-NEXT:    andb $1, %al
-; AVX512F-NEXT:    negb %al
-; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
-; AVX512F-NEXT:    cbtw
-; AVX512F-NEXT:    kmovw %k0, %edi
-; AVX512F-NEXT:    andb $1, %dil
-; AVX512F-NEXT:    negb %dil
-; AVX512F-NEXT:    idivb %dil
-; AVX512F-NEXT:    # kill: def $al killed $al def $eax
-; AVX512F-NEXT:    kmovw %edx, %k0
-; AVX512F-NEXT:    kmovw %esi, %k1
-; AVX512F-NEXT:    kshiftrw $1, %k1, %k2
-; AVX512F-NEXT:    kxorw %k0, %k2, %k0
-; AVX512F-NEXT:    kshiftlw $15, %k0, %k0
-; AVX512F-NEXT:    kshiftrw $14, %k0, %k0
-; AVX512F-NEXT:    kxorw %k0, %k1, %k0
-; AVX512F-NEXT:    kshiftrw $2, %k0, %k1
-; AVX512F-NEXT:    kmovw %ecx, %k2
-; AVX512F-NEXT:    kxorw %k2, %k1, %k1
-; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
-; AVX512F-NEXT:    kshiftrw $13, %k1, %k1
-; AVX512F-NEXT:    kxorw %k1, %k0, %k0
-; AVX512F-NEXT:    kshiftlw $13, %k0, %k0
-; AVX512F-NEXT:    kshiftrw $13, %k0, %k0
-; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    kshiftlw $3, %k1, %k1
-; AVX512F-NEXT:    korw %k1, %k0, %k1
-; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512BW-LABEL: boolvec_sdiv:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpslld $31, %xmm1, %xmm1
-; AVX512BW-NEXT:    vptestmd %xmm1, %xmm1, %k3
-; AVX512BW-NEXT:    kshiftrw $3, %k3, %k0
-; AVX512BW-NEXT:    vpslld $31, %xmm0, %xmm0
-; AVX512BW-NEXT:    vptestmd %xmm0, %xmm0, %k4
-; AVX512BW-NEXT:    kshiftrw $3, %k4, %k1
-; AVX512BW-NEXT:    kshiftrw $2, %k3, %k2
-; AVX512BW-NEXT:    kshiftrw $2, %k4, %k5
-; AVX512BW-NEXT:    kmovd %k5, %ecx
-; AVX512BW-NEXT:    kshiftrw $1, %k3, %k5
-; AVX512BW-NEXT:    kmovd %k3, %edi
-; AVX512BW-NEXT:    kshiftrw $1, %k4, %k3
-; AVX512BW-NEXT:    kmovd %k4, %esi
-; AVX512BW-NEXT:    kmovd %k5, %edx
-; AVX512BW-NEXT:    kmovd %k3, %eax
-; AVX512BW-NEXT:    andb $1, %al
-; AVX512BW-NEXT:    negb %al
-; AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
-; AVX512BW-NEXT:    cbtw
-; AVX512BW-NEXT:    andb $1, %dl
-; AVX512BW-NEXT:    negb %dl
-; AVX512BW-NEXT:    idivb %dl
-; AVX512BW-NEXT:    movl %eax, %edx
-; AVX512BW-NEXT:    andb $1, %sil
-; AVX512BW-NEXT:    negb %sil
-; AVX512BW-NEXT:    movl %esi, %eax
-; AVX512BW-NEXT:    cbtw
-; AVX512BW-NEXT:    andb $1, %dil
-; AVX512BW-NEXT:    negb %dil
-; AVX512BW-NEXT:    idivb %dil
-; AVX512BW-NEXT:    movl %eax, %esi
-; AVX512BW-NEXT:    andb $1, %cl
-; AVX512BW-NEXT:    negb %cl
-; AVX512BW-NEXT:    movl %ecx, %eax
-; AVX512BW-NEXT:    cbtw
-; AVX512BW-NEXT:    kmovd %k2, %ecx
-; AVX512BW-NEXT:    andb $1, %cl
-; AVX512BW-NEXT:    negb %cl
-; AVX512BW-NEXT:    idivb %cl
-; AVX512BW-NEXT:    movl %eax, %ecx
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    andb $1, %al
-; AVX512BW-NEXT:    negb %al
-; AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
-; AVX512BW-NEXT:    cbtw
-; AVX512BW-NEXT:    kmovd %k0, %edi
-; AVX512BW-NEXT:    andb $1, %dil
-; AVX512BW-NEXT:    negb %dil
-; AVX512BW-NEXT:    idivb %dil
-; AVX512BW-NEXT:    # kill: def $al killed $al def $eax
-; AVX512BW-NEXT:    kmovd %edx, %k0
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    kshiftrw $1, %k1, %k2
-; AVX512BW-NEXT:    kxorw %k0, %k2, %k0
-; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
-; AVX512BW-NEXT:    kshiftrw $14, %k0, %k0
-; AVX512BW-NEXT:    kxorw %k0, %k1, %k0
-; AVX512BW-NEXT:    kshiftrw $2, %k0, %k1
-; AVX512BW-NEXT:    kmovd %ecx, %k2
-; AVX512BW-NEXT:    kxorw %k2, %k1, %k1
-; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
-; AVX512BW-NEXT:    kshiftrw $13, %k1, %k1
-; AVX512BW-NEXT:    kxorw %k1, %k0, %k0
-; AVX512BW-NEXT:    kshiftlw $13, %k0, %k0
-; AVX512BW-NEXT:    kshiftrw $13, %k0, %k0
-; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    kshiftlw $3, %k1, %k1
-; AVX512BW-NEXT:    korw %k1, %k0, %k1
-; AVX512BW-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; XOP-LABEL: boolvec_sdiv:
-; XOP:       # %bb.0:
-; XOP-NEXT:    vpslld $31, %xmm1, %xmm1
-; XOP-NEXT:    vpsrad $31, %xmm1, %xmm1
-; XOP-NEXT:    vpslld $31, %xmm0, %xmm0
-; XOP-NEXT:    vpsrad $31, %xmm0, %xmm0
-; XOP-NEXT:    vpextrd $1, %xmm0, %eax
-; XOP-NEXT:    vpextrd $1, %xmm1, %ecx
-; XOP-NEXT:    cltd
-; XOP-NEXT:    idivl %ecx
-; XOP-NEXT:    movl %eax, %ecx
-; XOP-NEXT:    vmovd %xmm0, %eax
-; XOP-NEXT:    vmovd %xmm1, %esi
-; XOP-NEXT:    cltd
-; XOP-NEXT:    idivl %esi
-; XOP-NEXT:    vmovd %eax, %xmm2
-; XOP-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
-; XOP-NEXT:    vpextrd $2, %xmm0, %eax
-; XOP-NEXT:    vpextrd $2, %xmm1, %ecx
-; XOP-NEXT:    cltd
-; XOP-NEXT:    idivl %ecx
-; XOP-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; XOP-NEXT:    vpextrd $3, %xmm0, %eax
-; XOP-NEXT:    vpextrd $3, %xmm1, %ecx
-; XOP-NEXT:    cltd
-; XOP-NEXT:    idivl %ecx
-; XOP-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm0
-; XOP-NEXT:    retq
+; CHECK-LABEL: boolvec_sdiv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    retq
   %r = sdiv <4 x i1> %x, %y
   ret <4 x i1> %r
 }
diff --git a/test/CodeGen/X86/combine-srem.ll b/test/CodeGen/X86/combine-srem.ll
index 7af33fea6db..71be666d6db 100644
--- a/test/CodeGen/X86/combine-srem.ll
+++ b/test/CodeGen/X86/combine-srem.ll
@@ -462,16 +462,7 @@ define i32 @ossfuzz6883() {
 define i1 @bool_srem(i1 %x, i1 %y) {
 ; CHECK-LABEL: bool_srem:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    andb $1, %al
-; CHECK-NEXT:    negb %al
-; CHECK-NEXT:    # kill: def $al killed $al killed $eax
-; CHECK-NEXT:    cbtw
-; CHECK-NEXT:    andb $1, %sil
-; CHECK-NEXT:    negb %sil
-; CHECK-NEXT:    idivb %sil
-; CHECK-NEXT:    movsbl %ah, %eax
-; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    retq
   %r = srem i1 %x, %y
   ret i1 %r
@@ -479,61 +470,12 @@ define i1 @bool_srem(i1 %x, i1 %y) {
 define <4 x i1> @boolvec_srem(<4 x i1> %x, <4 x i1> %y) {
 ; SSE-LABEL: boolvec_srem:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pslld $31, %xmm1
-; SSE-NEXT:    psrad $31, %xmm1
-; SSE-NEXT:    pslld $31, %xmm0
-; SSE-NEXT:    psrad $31, %xmm0
-; SSE-NEXT:    pextrd $1, %xmm0, %eax
-; SSE-NEXT:    pextrd $1, %xmm1, %ecx
-; SSE-NEXT:    cltd
-; SSE-NEXT:    idivl %ecx
-; SSE-NEXT:    movl %edx, %ecx
-; SSE-NEXT:    movd %xmm0, %eax
-; SSE-NEXT:    movd %xmm1, %esi
-; SSE-NEXT:    cltd
-; SSE-NEXT:    idivl %esi
-; SSE-NEXT:    movd %edx, %xmm2
-; SSE-NEXT:    pinsrd $1, %ecx, %xmm2
-; SSE-NEXT:    pextrd $2, %xmm0, %eax
-; SSE-NEXT:    pextrd $2, %xmm1, %ecx
-; SSE-NEXT:    cltd
-; SSE-NEXT:    idivl %ecx
-; SSE-NEXT:    pinsrd $2, %edx, %xmm2
-; SSE-NEXT:    pextrd $3, %xmm0, %eax
-; SSE-NEXT:    pextrd $3, %xmm1, %ecx
-; SSE-NEXT:    cltd
-; SSE-NEXT:    idivl %ecx
-; SSE-NEXT:    pinsrd $3, %edx, %xmm2
-; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    xorps %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: boolvec_srem:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpslld $31, %xmm1, %xmm1
-; AVX-NEXT:    vpsrad $31, %xmm1, %xmm1
-; AVX-NEXT:    vpslld $31, %xmm0, %xmm0
-; AVX-NEXT:    vpsrad $31, %xmm0, %xmm0
-; AVX-NEXT:    vpextrd $1, %xmm0, %eax
-; AVX-NEXT:    vpextrd $1, %xmm1, %ecx
-; AVX-NEXT:    cltd
-; AVX-NEXT:    idivl %ecx
-; AVX-NEXT:    movl %edx, %ecx
-; AVX-NEXT:    vmovd %xmm0, %eax
-; AVX-NEXT:    vmovd %xmm1, %esi
-; AVX-NEXT:    cltd
-; AVX-NEXT:    idivl %esi
-; AVX-NEXT:    vmovd %edx, %xmm2
-; AVX-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
-; AVX-NEXT:    vpextrd $2, %xmm0, %eax
-; AVX-NEXT:    vpextrd $2, %xmm1, %ecx
-; AVX-NEXT:    cltd
-; AVX-NEXT:    idivl %ecx
-; AVX-NEXT:    vpinsrd $2, %edx, %xmm2, %xmm2
-; AVX-NEXT:    vpextrd $3, %xmm0, %eax
-; AVX-NEXT:    vpextrd $3, %xmm1, %ecx
-; AVX-NEXT:    cltd
-; AVX-NEXT:    idivl %ecx
-; AVX-NEXT:    vpinsrd $3, %edx, %xmm2, %xmm0
+; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %r = srem <4 x i1> %x, %y
   ret <4 x i1> %r
diff --git a/test/CodeGen/X86/combine-udiv.ll b/test/CodeGen/X86/combine-udiv.ll
index d31d2504d65..632d1b698d1 100644
--- a/test/CodeGen/X86/combine-udiv.ll
+++ b/test/CodeGen/X86/combine-udiv.ll
@@ -911,166 +911,17 @@ define <8 x i16> @pr38477(<8 x i16> %a0) {
 define i1 @bool_udiv(i1 %x, i1 %y) {
 ; CHECK-LABEL: bool_udiv:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andb $1, %sil
-; CHECK-NEXT:    andb $1, %dil
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    # kill: def $eax killed $eax def $ax
-; CHECK-NEXT:    divb %sil
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %r = udiv i1 %x, %y
   ret i1 %r
 }
 
 define <4 x i1> @boolvec_udiv(<4 x i1> %x, <4 x i1> %y) {
-; SSE2-LABEL: boolvec_udiv:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1,1,1,1]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
-; SSE2-NEXT:    movd %xmm2, %eax
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
-; SSE2-NEXT:    movd %xmm2, %ecx
-; SSE2-NEXT:    xorl %edx, %edx
-; SSE2-NEXT:    divl %ecx
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; SSE2-NEXT:    movd %xmm3, %eax
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
-; SSE2-NEXT:    movd %xmm3, %ecx
-; SSE2-NEXT:    xorl %edx, %edx
-; SSE2-NEXT:    divl %ecx
-; SSE2-NEXT:    movd %eax, %xmm3
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    movd %xmm1, %ecx
-; SSE2-NEXT:    xorl %edx, %edx
-; SSE2-NEXT:    divl %ecx
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT:    movd %xmm0, %ecx
-; SSE2-NEXT:    xorl %edx, %edx
-; SSE2-NEXT:    divl %ecx
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: boolvec_udiv:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [1,1,1,1]
-; SSE41-NEXT:    pand %xmm2, %xmm1
-; SSE41-NEXT:    pand %xmm2, %xmm0
-; SSE41-NEXT:    pextrd $1, %xmm0, %eax
-; SSE41-NEXT:    pextrd $1, %xmm1, %ecx
-; SSE41-NEXT:    xorl %edx, %edx
-; SSE41-NEXT:    divl %ecx
-; SSE41-NEXT:    movl %eax, %ecx
-; SSE41-NEXT:    movd %xmm0, %eax
-; SSE41-NEXT:    movd %xmm1, %esi
-; SSE41-NEXT:    xorl %edx, %edx
-; SSE41-NEXT:    divl %esi
-; SSE41-NEXT:    movd %eax, %xmm2
-; SSE41-NEXT:    pinsrd $1, %ecx, %xmm2
-; SSE41-NEXT:    pextrd $2, %xmm0, %eax
-; SSE41-NEXT:    pextrd $2, %xmm1, %ecx
-; SSE41-NEXT:    xorl %edx, %edx
-; SSE41-NEXT:    divl %ecx
-; SSE41-NEXT:    pinsrd $2, %eax, %xmm2
-; SSE41-NEXT:    pextrd $3, %xmm0, %eax
-; SSE41-NEXT:    pextrd $3, %xmm1, %ecx
-; SSE41-NEXT:    xorl %edx, %edx
-; SSE41-NEXT:    divl %ecx
-; SSE41-NEXT:    pinsrd $3, %eax, %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    retq
-;
-; AVX1-LABEL: boolvec_udiv:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,1,1]
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpextrd $1, %xmm0, %eax
-; AVX1-NEXT:    vpextrd $1, %xmm1, %ecx
-; AVX1-NEXT:    xorl %edx, %edx
-; AVX1-NEXT:    divl %ecx
-; AVX1-NEXT:    movl %eax, %ecx
-; AVX1-NEXT:    vmovd %xmm0, %eax
-; AVX1-NEXT:    vmovd %xmm1, %esi
-; AVX1-NEXT:    xorl %edx, %edx
-; AVX1-NEXT:    divl %esi
-; AVX1-NEXT:    vmovd %eax, %xmm2
-; AVX1-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrd $2, %xmm0, %eax
-; AVX1-NEXT:    vpextrd $2, %xmm1, %ecx
-; AVX1-NEXT:    xorl %edx, %edx
-; AVX1-NEXT:    divl %ecx
-; AVX1-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrd $3, %xmm0, %eax
-; AVX1-NEXT:    vpextrd $3, %xmm1, %ecx
-; AVX1-NEXT:    xorl %edx, %edx
-; AVX1-NEXT:    divl %ecx
-; AVX1-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: boolvec_udiv:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1]
-; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpextrd $1, %xmm0, %eax
-; AVX2-NEXT:    vpextrd $1, %xmm1, %ecx
-; AVX2-NEXT:    xorl %edx, %edx
-; AVX2-NEXT:    divl %ecx
-; AVX2-NEXT:    movl %eax, %ecx
-; AVX2-NEXT:    vmovd %xmm0, %eax
-; AVX2-NEXT:    vmovd %xmm1, %esi
-; AVX2-NEXT:    xorl %edx, %edx
-; AVX2-NEXT:    divl %esi
-; AVX2-NEXT:    vmovd %eax, %xmm2
-; AVX2-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrd $2, %xmm0, %eax
-; AVX2-NEXT:    vpextrd $2, %xmm1, %ecx
-; AVX2-NEXT:    xorl %edx, %edx
-; AVX2-NEXT:    divl %ecx
-; AVX2-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrd $3, %xmm0, %eax
-; AVX2-NEXT:    vpextrd $3, %xmm1, %ecx
-; AVX2-NEXT:    xorl %edx, %edx
-; AVX2-NEXT:    divl %ecx
-; AVX2-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm0
-; AVX2-NEXT:    retq
-;
-; XOP-LABEL: boolvec_udiv:
-; XOP:       # %bb.0:
-; XOP-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,1,1]
-; XOP-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; XOP-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; XOP-NEXT:    vpextrd $1, %xmm0, %eax
-; XOP-NEXT:    vpextrd $1, %xmm1, %ecx
-; XOP-NEXT:    xorl %edx, %edx
-; XOP-NEXT:    divl %ecx
-; XOP-NEXT:    movl %eax, %ecx
-; XOP-NEXT:    vmovd %xmm0, %eax
-; XOP-NEXT:    vmovd %xmm1, %esi
-; XOP-NEXT:    xorl %edx, %edx
-; XOP-NEXT:    divl %esi
-; XOP-NEXT:    vmovd %eax, %xmm2
-; XOP-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
-; XOP-NEXT:    vpextrd $2, %xmm0, %eax
-; XOP-NEXT:    vpextrd $2, %xmm1, %ecx
-; XOP-NEXT:    xorl %edx, %edx
-; XOP-NEXT:    divl %ecx
-; XOP-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; XOP-NEXT:    vpextrd $3, %xmm0, %eax
-; XOP-NEXT:    vpextrd $3, %xmm1, %ecx
-; XOP-NEXT:    xorl %edx, %edx
-; XOP-NEXT:    divl %ecx
-; XOP-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm0
-; XOP-NEXT:    retq
+; CHECK-LABEL: boolvec_udiv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    retq
   %r = udiv <4 x i1> %x, %y
   ret <4 x i1> %r
 }
diff --git a/test/CodeGen/X86/combine-urem.ll b/test/CodeGen/X86/combine-urem.ll
index 11505edfb65..5629a53fd23 100644
--- a/test/CodeGen/X86/combine-urem.ll
+++ b/test/CodeGen/X86/combine-urem.ll
@@ -383,13 +383,7 @@ define <4 x i32> @combine_vec_urem_by_shl_pow2b(<4 x i32> %x, <4 x i32> %y) {
 define i1 @bool_urem(i1 %x, i1 %y) {
 ; CHECK-LABEL: bool_urem:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andb $1, %sil
-; CHECK-NEXT:    andb $1, %dil
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    # kill: def $eax killed $eax def $ax
-; CHECK-NEXT:    divb %sil
-; CHECK-NEXT:    movzbl %ah, %eax
-; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    retq
   %r = urem i1 %x, %y
   ret i1 %r
@@ -398,88 +392,13 @@ define i1 @bool_urem(i1 %x, i1 %y) {
 define <4 x i1> @boolvec_urem(<4 x i1> %x, <4 x i1> %y) {
 ; SSE-LABEL: boolvec_urem:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,1,1,1]
-; SSE-NEXT:    pand %xmm2, %xmm1
-; SSE-NEXT:    pand %xmm2, %xmm0
-; SSE-NEXT:    pextrd $1, %xmm0, %eax
-; SSE-NEXT:    pextrd $1, %xmm1, %ecx
-; SSE-NEXT:    xorl %edx, %edx
-; SSE-NEXT:    divl %ecx
-; SSE-NEXT:    movl %edx, %ecx
-; SSE-NEXT:    movd %xmm0, %eax
-; SSE-NEXT:    movd %xmm1, %esi
-; SSE-NEXT:    xorl %edx, %edx
-; SSE-NEXT:    divl %esi
-; SSE-NEXT:    movd %edx, %xmm2
-; SSE-NEXT:    pinsrd $1, %ecx, %xmm2
-; SSE-NEXT:    pextrd $2, %xmm0, %eax
-; SSE-NEXT:    pextrd $2, %xmm1, %ecx
-; SSE-NEXT:    xorl %edx, %edx
-; SSE-NEXT:    divl %ecx
-; SSE-NEXT:    pinsrd $2, %edx, %xmm2
-; SSE-NEXT:    pextrd $3, %xmm0, %eax
-; SSE-NEXT:    pextrd $3, %xmm1, %ecx
-; SSE-NEXT:    xorl %edx, %edx
-; SSE-NEXT:    divl %ecx
-; SSE-NEXT:    pinsrd $3, %edx, %xmm2
-; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    xorps %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: boolvec_urem:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,1,1]
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpextrd $1, %xmm0, %eax
-; AVX1-NEXT:    vpextrd $1, %xmm1, %ecx
-; AVX1-NEXT:    xorl %edx, %edx
-; AVX1-NEXT:    divl %ecx
-; AVX1-NEXT:    movl %edx, %ecx
-; AVX1-NEXT:    vmovd %xmm0, %eax
-; AVX1-NEXT:    vmovd %xmm1, %esi
-; AVX1-NEXT:    xorl %edx, %edx
-; AVX1-NEXT:    divl %esi
-; AVX1-NEXT:    vmovd %edx, %xmm2
-; AVX1-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrd $2, %xmm0, %eax
-; AVX1-NEXT:    vpextrd $2, %xmm1, %ecx
-; AVX1-NEXT:    xorl %edx, %edx
-; AVX1-NEXT:    divl %ecx
-; AVX1-NEXT:    vpinsrd $2, %edx, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrd $3, %xmm0, %eax
-; AVX1-NEXT:    vpextrd $3, %xmm1, %ecx
-; AVX1-NEXT:    xorl %edx, %edx
-; AVX1-NEXT:    divl %ecx
-; AVX1-NEXT:    vpinsrd $3, %edx, %xmm2, %xmm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: boolvec_urem:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1]
-; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpextrd $1, %xmm0, %eax
-; AVX2-NEXT:    vpextrd $1, %xmm1, %ecx
-; AVX2-NEXT:    xorl %edx, %edx
-; AVX2-NEXT:    divl %ecx
-; AVX2-NEXT:    movl %edx, %ecx
-; AVX2-NEXT:    vmovd %xmm0, %eax
-; AVX2-NEXT:    vmovd %xmm1, %esi
-; AVX2-NEXT:    xorl %edx, %edx
-; AVX2-NEXT:    divl %esi
-; AVX2-NEXT:    vmovd %edx, %xmm2
-; AVX2-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrd $2, %xmm0, %eax
-; AVX2-NEXT:    vpextrd $2, %xmm1, %ecx
-; AVX2-NEXT:    xorl %edx, %edx
-; AVX2-NEXT:    divl %ecx
-; AVX2-NEXT:    vpinsrd $2, %edx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrd $3, %xmm0, %eax
-; AVX2-NEXT:    vpextrd $3, %xmm1, %ecx
-; AVX2-NEXT:    xorl %edx, %edx
-; AVX2-NEXT:    divl %ecx
-; AVX2-NEXT:    vpinsrd $3, %edx, %xmm2, %xmm0
-; AVX2-NEXT:    retq
+; AVX-LABEL: boolvec_urem:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %r = urem <4 x i1> %x, %y
   ret <4 x i1> %r
 }
diff --git a/test/CodeGen/X86/pr38539.ll b/test/CodeGen/X86/pr38539.ll
index 9e16f7ca406..215d908a03f 100644
--- a/test/CodeGen/X86/pr38539.ll
+++ b/test/CodeGen/X86/pr38539.ll
@@ -6,68 +6,13 @@
 define void @f() {
 ; X64-LABEL: f:
 ; X64:       # %bb.0: # %BB
-; X64-NEXT:    pushq %rbp
-; X64-NEXT:    .cfi_def_cfa_offset 16
-; X64-NEXT:    pushq %r14
-; X64-NEXT:    .cfi_def_cfa_offset 24
-; X64-NEXT:    pushq %rbx
-; X64-NEXT:    .cfi_def_cfa_offset 32
-; X64-NEXT:    subq $16, %rsp
-; X64-NEXT:    .cfi_def_cfa_offset 48
-; X64-NEXT:    .cfi_offset %rbx, -32
-; X64-NEXT:    .cfi_offset %r14, -24
-; X64-NEXT:    .cfi_offset %rbp, -16
-; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebp
-; X64-NEXT:    movq (%rsp), %rbx
 ; X64-NEXT:    movb (%rax), %al
-; X64-NEXT:    movzbl %al, %eax
-; X64-NEXT:    # kill: def $eax killed $eax def $ax
-; X64-NEXT:    divb (%rax)
-; X64-NEXT:    movl %eax, %r14d
-; X64-NEXT:    movq %rbp, %rcx
-; X64-NEXT:    shlq $62, %rcx
-; X64-NEXT:    sarq $62, %rcx
-; X64-NEXT:    xorl %edi, %edi
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    movq %rbx, %rdx
-; X64-NEXT:    callq __modti3
-; X64-NEXT:    andl $3, %edx
+; X64-NEXT:    movb (%rax), %al
 ; X64-NEXT:    testb %al, %al
 ; X64-NEXT:    setne (%rax)
-; X64-NEXT:    cmpq %rax, %rbx
-; X64-NEXT:    sbbq %rdx, %rbp
-; X64-NEXT:    setae %dl
-; X64-NEXT:    sbbb %cl, %cl
-; X64-NEXT:    testb %al, %al
-; X64-NEXT:    setne %bl
-; X64-NEXT:    negb %dl
-; X64-NEXT:    cmpb %r14b, %al
-; X64-NEXT:    setle %al
-; X64-NEXT:    negb %al
-; X64-NEXT:    cbtw
-; X64-NEXT:    idivb %dl
-; X64-NEXT:    movsbl %ah, %eax
-; X64-NEXT:    movzbl %al, %eax
-; X64-NEXT:    andl $1, %eax
-; X64-NEXT:    shlq $4, %rax
-; X64-NEXT:    negq %rax
-; X64-NEXT:    negb %bl
-; X64-NEXT:    leaq -16(%rsp,%rax), %rax
+; X64-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
 ; X64-NEXT:    movq %rax, (%rax)
-; X64-NEXT:    movl %ecx, %eax
-; X64-NEXT:    cbtw
-; X64-NEXT:    idivb %bl
-; X64-NEXT:    movsbl %ah, %eax
-; X64-NEXT:    andb $1, %al
-; X64-NEXT:    movb %al, (%rax)
-; X64-NEXT:    addq $16, %rsp
-; X64-NEXT:    .cfi_def_cfa_offset 32
-; X64-NEXT:    popq %rbx
-; X64-NEXT:    .cfi_def_cfa_offset 24
-; X64-NEXT:    popq %r14
-; X64-NEXT:    .cfi_def_cfa_offset 16
-; X64-NEXT:    popq %rbp
-; X64-NEXT:    .cfi_def_cfa_offset 8
+; X64-NEXT:    movb $0, (%rax)
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: f:
@@ -77,75 +22,16 @@ define void @f() {
 ; X86-NEXT:    .cfi_offset %ebp, -8
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    .cfi_def_cfa_register %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $48, %esp
-; X86-NEXT:    .cfi_offset %esi, -20
-; X86-NEXT:    .cfi_offset %edi, -16
-; X86-NEXT:    .cfi_offset %ebx, -12
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movb (%eax), %al
 ; X86-NEXT:    movb (%eax), %al
-; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    # kill: def $eax killed $eax def $ax
-; X86-NEXT:    divb (%eax)
-; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    shll $30, %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    sarl $30, %ecx
-; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl %edx
-; X86-NEXT:    calll __modti3
-; X86-NEXT:    addl $32, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl $3, %eax
-; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    sbbl %eax, %esi
-; X86-NEXT:    sbbl $0, %ecx
-; X86-NEXT:    setae %dl
-; X86-NEXT:    sbbb %cl, %cl
 ; X86-NEXT:    testb %al, %al
-; X86-NEXT:    setne %ch
 ; X86-NEXT:    setne (%eax)
-; X86-NEXT:    negb %ch
-; X86-NEXT:    negb %dl
-; X86-NEXT:    cmpb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload
-; X86-NEXT:    setle %al
-; X86-NEXT:    negb %al
-; X86-NEXT:    cbtw
-; X86-NEXT:    idivb %dl
-; X86-NEXT:    movsbl %ah, %eax
-; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    andl $1, %eax
-; X86-NEXT:    negl %eax
-; X86-NEXT:    leal (%eax,%eax,2), %eax
-; X86-NEXT:    leal -4(%esp,%eax,4), %eax
+; X86-NEXT:    leal -{{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%eax)
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    cbtw
-; X86-NEXT:    idivb %ch
-; X86-NEXT:    movsbl %ah, %eax
-; X86-NEXT:    andb $1, %al
-; X86-NEXT:    movb %al, (%eax)
-; X86-NEXT:    leal -12(%ebp), %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    movb $0, (%eax)
+; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-NEXT:    retl
@@ -177,50 +63,13 @@ BB:
 define void @g() {
 ; X64-LABEL: g:
 ; X64:       # %bb.0: # %BB
-; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; X64-NEXT:    shlq $32, %rsi
-; X64-NEXT:    orq %rax, %rsi
-; X64-NEXT:    movq %rsi, %rdi
-; X64-NEXT:    shlq $30, %rdi
-; X64-NEXT:    sarq $30, %rdi
 ; X64-NEXT:    movb (%rax), %al
-; X64-NEXT:    movzbl %al, %eax
-; X64-NEXT:    # kill: def $eax killed $eax def $ax
-; X64-NEXT:    divb (%rax)
-; X64-NEXT:    movl %eax, %r8d
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    idivq %rdi
-; X64-NEXT:    movabsq $17179869183, %rax # imm = 0x3FFFFFFFF
-; X64-NEXT:    andq %rdx, %rax
+; X64-NEXT:    movb (%rax), %al
 ; X64-NEXT:    testb %al, %al
-; X64-NEXT:    setne %dil
 ; X64-NEXT:    setne (%rax)
-; X64-NEXT:    cmpq %rsi, %rax
-; X64-NEXT:    seta %dl
-; X64-NEXT:    setbe %cl
-; X64-NEXT:    negb %cl
-; X64-NEXT:    cmpb %r8b, %al
-; X64-NEXT:    setle %al
-; X64-NEXT:    negb %al
-; X64-NEXT:    cbtw
-; X64-NEXT:    idivb %cl
-; X64-NEXT:    movsbl %ah, %eax
-; X64-NEXT:    movzbl %al, %eax
-; X64-NEXT:    andl $1, %eax
-; X64-NEXT:    shlq $3, %rax
-; X64-NEXT:    negq %rax
-; X64-NEXT:    negb %dil
-; X64-NEXT:    negb %dl
-; X64-NEXT:    leaq -16(%rsp,%rax), %rax
+; X64-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
 ; X64-NEXT:    movq %rax, (%rax)
-; X64-NEXT:    movl %edx, %eax
-; X64-NEXT:    cbtw
-; X64-NEXT:    idivb %dil
-; X64-NEXT:    movsbl %ah, %eax
-; X64-NEXT:    andb $1, %al
-; X64-NEXT:    movb %al, (%rax)
+; X64-NEXT:    movb $0, (%rax)
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: g:
@@ -230,63 +79,16 @@ define void @g() {
 ; X86-NEXT:    .cfi_offset %ebp, -8
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    .cfi_def_cfa_register %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    .cfi_offset %esi, -20
-; X86-NEXT:    .cfi_offset %edi, -16
-; X86-NEXT:    .cfi_offset %ebx, -12
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl (%esp), %edi
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movb (%eax), %al
 ; X86-NEXT:    movb (%eax), %al
-; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    # kill: def $eax killed $eax def $ax
-; X86-NEXT:    divb (%eax)
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    shll $30, %eax
-; X86-NEXT:    sarl $30, %eax
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl $0
-; X86-NEXT:    calll __moddi3
-; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    andl $3, %edx
 ; X86-NEXT:    testb %al, %al
 ; X86-NEXT:    setne (%eax)
-; X86-NEXT:    cmpl %eax, %edi
-; X86-NEXT:    sbbl %edx, %esi
-; X86-NEXT:    setae %dl
-; X86-NEXT:    sbbb %cl, %cl
-; X86-NEXT:    testb %al, %al
-; X86-NEXT:    setne %ch
-; X86-NEXT:    negb %dl
-; X86-NEXT:    cmpb %bl, %al
-; X86-NEXT:    setle %al
-; X86-NEXT:    negb %al
-; X86-NEXT:    cbtw
-; X86-NEXT:    idivb %dl
-; X86-NEXT:    movsbl %ah, %eax
-; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    andl $1, %eax
-; X86-NEXT:    shll $3, %eax
-; X86-NEXT:    negl %eax
-; X86-NEXT:    negb %ch
-; X86-NEXT:    leal -8(%esp,%eax), %eax
+; X86-NEXT:    leal -{{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%eax)
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    cbtw
-; X86-NEXT:    idivb %ch
-; X86-NEXT:    movsbl %ah, %eax
-; X86-NEXT:    andb $1, %al
-; X86-NEXT:    movb %al, (%eax)
-; X86-NEXT:    leal -12(%ebp), %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    movb $0, (%eax)
+; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-NEXT:    retl
-- 
GitLab


From e55c3ed6b1fd984be5e5adaf776f46c6fb16ad05 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 30 Oct 2018 10:32:11 +0000
Subject: [PATCH 0733/1116] [SelectionDAG] Add FoldBUILD_VECTOR to simplify new
 BUILD_VECTOR nodes

Similar to FoldCONCAT_VECTORS, this patch adds FoldBUILD_VECTOR to simplify cases that can avoid the creation of the BUILD_VECTOR - if all the operands are UNDEF or if the BUILD_VECTOR simplifies to a copy.

This exposed an assumption in some AMDGPU code that getBuildVector was guaranteed to be a BUILD_VECTOR node that I've tried to handle.

Differential Revision: https://reviews.llvm.org/D53760

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345578 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/SelectionDAG.cpp     |  58 +++++++
 lib/Target/AMDGPU/R600ISelLowering.cpp        |  34 ++--
 test/CodeGen/AMDGPU/load-local-i16.ll         |   4 +-
 .../X86/clear_upper_vector_element_bits.ll    | 164 ++----------------
 test/CodeGen/X86/lower-vec-shift.ll           |   2 -
 5 files changed, 88 insertions(+), 174 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 9a27cf36380..933898b17c8 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3805,6 +3805,38 @@ bool SelectionDAG::haveNoCommonBitsSet(SDValue A, SDValue B) const {
   return (computeKnownBits(A).Zero | computeKnownBits(B).Zero).isAllOnesValue();
 }
 
+static SDValue FoldBUILD_VECTOR(const SDLoc &DL, EVT VT,
+                                ArrayRef<SDValue> Ops,
+                                SelectionDAG &DAG) {
+  int NumOps = Ops.size();
+  assert(NumOps != 0 && "Can't build an empty vector!");
+  assert(VT.getVectorNumElements() == NumOps &&
+         "Incorrect element count in BUILD_VECTOR!");
+
+  // BUILD_VECTOR of UNDEFs is UNDEF.
+  if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
+    return DAG.getUNDEF(VT);
+
+  // BUILD_VECTOR of seq extract/insert from the same vector + type is Identity.
+  SDValue IdentitySrc;
+  bool IsIdentity = true;
+  for (int i = 0; i != NumOps; ++i) {
+    if (Ops[i].getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+        Ops[i].getOperand(0).getValueType() != VT ||
+        (IdentitySrc && Ops[i].getOperand(0) != IdentitySrc) ||
+        !isa<ConstantSDNode>(Ops[i].getOperand(1)) ||
+        cast<ConstantSDNode>(Ops[i].getOperand(1))->getAPIntValue() != i) {
+      IsIdentity = false;
+      break;
+    }
+    IdentitySrc = Ops[i].getOperand(0);
+  }
+  if (IsIdentity)
+    return IdentitySrc;
+
+  return SDValue();
+}
+
 static SDValue FoldCONCAT_VECTORS(const SDLoc &DL, EVT VT,
                                   ArrayRef<SDValue> Ops,
                                   SelectionDAG &DAG) {
@@ -4059,6 +4091,13 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
   case ISD::MERGE_VALUES:
   case ISD::CONCAT_VECTORS:
     return Operand;         // Factor, merge or concat of one node?  No need.
+  case ISD::BUILD_VECTOR: {
+    // Attempt to simplify BUILD_VECTOR.
+    SDValue Ops[] = {Operand};
+    if (SDValue V = FoldBUILD_VECTOR(DL, VT, Ops, *this))
+      return V;
+    break;
+  }
   case ISD::FP_ROUND: llvm_unreachable("Invalid method to make FP_ROUND node");
   case ISD::FP_EXTEND:
     assert(VT.isFloatingPoint() &&
@@ -4548,6 +4587,13 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
     if (N2.getOpcode() == ISD::EntryToken) return N1;
     if (N1 == N2) return N1;
     break;
+  case ISD::BUILD_VECTOR: {
+    // Attempt to simplify BUILD_VECTOR.
+    SDValue Ops[] = {N1, N2};
+    if (SDValue V = FoldBUILD_VECTOR(DL, VT, Ops, *this))
+      return V;
+    break;
+  }
   case ISD::CONCAT_VECTORS: {
     // Attempt to fold CONCAT_VECTORS into BUILD_VECTOR or UNDEF.
     SDValue Ops[] = {N1, N2};
@@ -5019,6 +5065,13 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
     }
     break;
   }
+  case ISD::BUILD_VECTOR: {
+    // Attempt to simplify BUILD_VECTOR.
+    SDValue Ops[] = {N1, N2, N3};
+    if (SDValue V = FoldBUILD_VECTOR(DL, VT, Ops, *this))
+      return V;
+    break;
+  }
   case ISD::CONCAT_VECTORS: {
     // Attempt to fold CONCAT_VECTORS into BUILD_VECTOR or UNDEF.
     SDValue Ops[] = {N1, N2, N3};
@@ -6788,6 +6841,11 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
 
   switch (Opcode) {
   default: break;
+  case ISD::BUILD_VECTOR:
+    // Attempt to simplify BUILD_VECTOR.
+    if (SDValue V = FoldBUILD_VECTOR(DL, VT, Ops, *this))
+      return V;
+    break;
   case ISD::CONCAT_VECTORS:
     // Attempt to fold CONCAT_VECTORS into BUILD_VECTOR or UNDEF.
     if (SDValue V = FoldCONCAT_VECTORS(DL, VT, Ops, *this))
diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp
index 8864aabb063..e2a0f05d2b3 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -1685,14 +1685,15 @@ bool R600TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
 static SDValue CompactSwizzlableVector(
   SelectionDAG &DAG, SDValue VectorEntry,
   DenseMap<unsigned, unsigned> &RemapSwizzle) {
-  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
   assert(RemapSwizzle.empty());
-  SDValue NewBldVec[4] = {
-    VectorEntry.getOperand(0),
-    VectorEntry.getOperand(1),
-    VectorEntry.getOperand(2),
-    VectorEntry.getOperand(3)
-  };
+
+  SDLoc DL(VectorEntry);
+  EVT EltTy = VectorEntry.getValueType().getVectorElementType();
+
+  SDValue NewBldVec[4];
+  for (unsigned i = 0; i < 4; i++)
+    NewBldVec[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltTy, VectorEntry,
+                               DAG.getIntPtrConstant(i, DL));
 
   for (unsigned i = 0; i < 4; i++) {
     if (NewBldVec[i].isUndef())
@@ -1727,15 +1728,17 @@ static SDValue CompactSwizzlableVector(
 
 static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
                                 DenseMap<unsigned, unsigned> &RemapSwizzle) {
-  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
   assert(RemapSwizzle.empty());
-  SDValue NewBldVec[4] = {
-      VectorEntry.getOperand(0),
-      VectorEntry.getOperand(1),
-      VectorEntry.getOperand(2),
-      VectorEntry.getOperand(3)
-  };
-  bool isUnmovable[4] = { false, false, false, false };
+
+  SDLoc DL(VectorEntry);
+  EVT EltTy = VectorEntry.getValueType().getVectorElementType();
+
+  SDValue NewBldVec[4];
+  bool isUnmovable[4] = {false, false, false, false};
+  for (unsigned i = 0; i < 4; i++)
+    NewBldVec[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltTy, VectorEntry,
+                               DAG.getIntPtrConstant(i, DL));
+
   for (unsigned i = 0; i < 4; i++) {
     RemapSwizzle[i] = i;
     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
@@ -1766,7 +1769,6 @@ static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
 SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4],
                                             SelectionDAG &DAG,
                                             const SDLoc &DL) const {
-  assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
   // Old -> New swizzle values
   DenseMap<unsigned, unsigned> SwizzleRemap;
 
diff --git a/test/CodeGen/AMDGPU/load-local-i16.ll b/test/CodeGen/AMDGPU/load-local-i16.ll
index bcab550f6c7..5913e7275e5 100644
--- a/test/CodeGen/AMDGPU/load-local-i16.ll
+++ b/test/CodeGen/AMDGPU/load-local-i16.ll
@@ -96,8 +96,8 @@ entry:
 ; GFX9-NOT: m0
 ; SICIVI: s_mov_b32 m0
 
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:3{{$}}
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
 
 
 ; EG: LDS_READ_RET
diff --git a/test/CodeGen/X86/clear_upper_vector_element_bits.ll b/test/CodeGen/X86/clear_upper_vector_element_bits.ll
index 961ec2be59e..983c7342603 100644
--- a/test/CodeGen/X86/clear_upper_vector_element_bits.ll
+++ b/test/CodeGen/X86/clear_upper_vector_element_bits.ll
@@ -296,59 +296,10 @@ define <16 x i16> @_clearupper16xi16a(<16 x i16>) nounwind {
 }
 
 define <16 x i8> @_clearupper16xi8a(<16 x i8>) nounwind {
-; SSE2-LABEL: _clearupper16xi8a:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm3
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm4
-; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE42-LABEL: _clearupper16xi8a:
-; SSE42:       # %bb.0:
-; SSE42-NEXT:    andps {{.*}}(%rip), %xmm0
-; SSE42-NEXT:    retq
+; SSE-LABEL: _clearupper16xi8a:
+; SSE:       # %bb.0:
+; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: _clearupper16xi8a:
 ; AVX:       # %bb.0:
@@ -422,107 +373,12 @@ define <16 x i8> @_clearupper16xi8a(<16 x i8>) nounwind {
 }
 
 define <32 x i8> @_clearupper32xi8a(<32 x i8>) nounwind {
-; SSE2-LABEL: _clearupper32xi8a:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm3
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm4
-; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm3
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm5
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm4
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    movd {{.*#+}} xmm5 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm4
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm6
-; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    retq
-;
-; SSE42-LABEL: _clearupper32xi8a:
-; SSE42:       # %bb.0:
-; SSE42-NEXT:    movaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSE42-NEXT:    andps %xmm2, %xmm0
-; SSE42-NEXT:    andps %xmm2, %xmm1
-; SSE42-NEXT:    retq
+; SSE-LABEL: _clearupper32xi8a:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE-NEXT:    andps %xmm2, %xmm0
+; SSE-NEXT:    andps %xmm2, %xmm1
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: _clearupper32xi8a:
 ; AVX:       # %bb.0:
diff --git a/test/CodeGen/X86/lower-vec-shift.ll b/test/CodeGen/X86/lower-vec-shift.ll
index 31059c40648..4480642afb2 100644
--- a/test/CodeGen/X86/lower-vec-shift.ll
+++ b/test/CodeGen/X86/lower-vec-shift.ll
@@ -234,8 +234,6 @@ define <8 x i32> @test10(<8 x i32>* %a) {
 ; SSE-LABEL: test10:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    movdqa 16(%rdi), %xmm1
-; SSE-NEXT:    psrad %xmm0, %xmm1
 ; SSE-NEXT:    psrad $1, %xmm0
 ; SSE-NEXT:    retq
 ;
-- 
GitLab


From e1ffb34fd887598804c6c12bca0022a02cf95caa Mon Sep 17 00:00:00 2001
From: "Diogo N. Sampaio" <diogo.sampaio@arm.com>
Date: Tue, 30 Oct 2018 11:06:50 +0000
Subject: [PATCH 0734/1116] [AArch64] Add support for UDF instruction

Summary: Add support for AArch64 UDF instruction.
UDF - Permanently Undefined generates an Undefined
Instruction exception (ESR_ELx.EC = 0b000000).

Reviewers: DavidSpickett, javed.absar, t.p.northover

Reviewed By: javed.absar

Subscribers: nhaehnle, kristof.beyls

Differential Revision: https://reviews.llvm.org/D53319


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345581 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64InstrFormats.td | 37 +++++++++++++++++------
 lib/Target/AArch64/AArch64InstrInfo.td    |  2 ++
 test/MC/AArch64/udf.s                     | 15 +++++++++
 test/MC/AArch64/udf_not.s                 |  7 +++++
 test/MC/Disassembler/AArch64/udf.txt      | 30 ++++++++++++++++++
 5 files changed, 81 insertions(+), 10 deletions(-)
 create mode 100644 test/MC/AArch64/udf.s
 create mode 100644 test/MC/AArch64/udf_not.s
 create mode 100644 test/MC/Disassembler/AArch64/udf.txt

diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td
index aef0a7af500..ab90ea3f74a 100644
--- a/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/lib/Target/AArch64/AArch64InstrFormats.td
@@ -263,6 +263,14 @@ class SImmOperand<int width> : AsmOperandClass {
   let PredicateMethod = "isSImm<" # width # ">";
 }
 
+
+class AsmImmRange<int Low, int High> : AsmOperandClass {
+  let Name = "Imm" # Low # "_" # High;
+  let DiagnosticType = "InvalidImm" # Low # "_" # High;
+  let RenderMethod = "addImmOperands";
+  let PredicateMethod = "isImmInRange<" # Low # "," # High # ">";
+}
+
 // Authenticated loads for v8.3 can have scaled 10-bit immediate offsets.
 def SImm10s8Operand : SImmScaledMemoryIndexed<10, 8>;
 def simm10Scaled : Operand<i64> {
@@ -287,6 +295,10 @@ def uimm6 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= 0 && Imm < 64; }]> {
   let ParserMatchClass = UImm6Operand;
 }
 
+def uimm16 : Operand<i16>, ImmLeaf<i16, [{return Imm >= 0 && Imm < 65536;}]>{
+  let ParserMatchClass = AsmImmRange<0, 65535>;
+}
+
 def SImm9Operand : SImmOperand<9>;
 def simm9 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= -256 && Imm < 256; }]> {
   let ParserMatchClass = SImm9Operand;
@@ -447,13 +459,6 @@ def simm4s16 : Operand<i64>, ImmLeaf<i64,
   let DecoderMethod = "DecodeSImm<4>";
 }
 
-class AsmImmRange<int Low, int High> : AsmOperandClass {
-  let Name = "Imm" # Low # "_" # High;
-  let DiagnosticType = "InvalidImm" # Low # "_" # High;
-  let RenderMethod = "addImmOperands";
-  let PredicateMethod = "isImmInRange<" # Low # "," # High # ">";
-}
-
 def Imm1_8Operand : AsmImmRange<1, 8>;
 def Imm1_16Operand : AsmImmRange<1, 16>;
 def Imm1_32Operand : AsmImmRange<1, 32>;
@@ -708,11 +713,10 @@ def logical_imm64_not : Operand<i64> {
 }
 
 // imm0_65535 predicate - True if the immediate is in the range [0,65535].
-def Imm0_65535Operand : AsmImmRange<0, 65535>;
 def imm0_65535 : Operand<i32>, ImmLeaf<i32, [{
   return ((uint32_t)Imm) < 65536;
 }]> {
-  let ParserMatchClass = Imm0_65535Operand;
+  let ParserMatchClass = AsmImmRange<0, 65535>;
   let PrintMethod = "printImmHex";
 }
 
@@ -1937,7 +1941,7 @@ class ADRI<bit page, string asm, Operand adr, list<dag> pattern>
 //---
 
 def movimm32_imm : Operand<i32> {
-  let ParserMatchClass = Imm0_65535Operand;
+  let ParserMatchClass = AsmImmRange<0, 65535>;
   let EncoderMethod = "getMoveWideImmOpValue";
   let PrintMethod = "printImm";
 }
@@ -4082,6 +4086,19 @@ class ExceptionGeneration<bits<3> op1, bits<2> ll, string asm>
   let Inst{1-0}   = ll;
 }
 
+//---
+// UDF : Permanently UNDEFINED instructions.  Format: Opc = 0x0000, 16 bit imm.
+//--
+let hasSideEffects = 1, isTrap = 1, mayLoad = 0, mayStore = 0 in {
+class UDFType<bits<16> opc, string asm>
+  : I<(outs), (ins uimm16:$imm),
+       asm, "\t$imm", "", []>,
+    Sched<[]> {
+  bits<16> imm;
+  let Inst{31-16} = opc;
+  let Inst{15-0} = imm;
+}
+}
 let Predicates = [HasFPARMv8] in {
 
 //---
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index 2dc5991d708..37d3967df44 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -1630,6 +1630,8 @@ def : InstAlias<"dcps1", (DCPS1 0)>;
 def : InstAlias<"dcps2", (DCPS2 0)>;
 def : InstAlias<"dcps3", (DCPS3 0)>;
 
+def UDF : UDFType<0, "udf">;
+
 //===----------------------------------------------------------------------===//
 // Load instructions.
 //===----------------------------------------------------------------------===//
diff --git a/test/MC/AArch64/udf.s b/test/MC/AArch64/udf.s
new file mode 100644
index 00000000000..f257157b826
--- /dev/null
+++ b/test/MC/AArch64/udf.s
@@ -0,0 +1,15 @@
+# RUN: llvm-mc -assemble -show-encoding -triple=aarch64- %s | FileCheck %s
+# CHECK:  .text
+# CHECK-NEXT: udf #0      // encoding: [0x00,0x00,0x00,0x00]
+# CHECK-NEXT: udf #1      // encoding: [0x01,0x00,0x00,0x00]
+# CHECK-NEXT: udf #16     // encoding: [0x10,0x00,0x00,0x00]
+# CHECK-NEXT: udf #32     // encoding: [0x20,0x00,0x00,0x00]
+# CHECK-NEXT: udf #48     // encoding: [0x30,0x00,0x00,0x00]
+# CHECK-NEXT: udf #65535      // encoding: [0xff,0xff,0x00,0x00]
+.text
+udf 0
+udf 1
+udf 16
+udf 32
+udf 48
+udf 65535
diff --git a/test/MC/AArch64/udf_not.s b/test/MC/AArch64/udf_not.s
new file mode 100644
index 00000000000..55b59fe4710
--- /dev/null
+++ b/test/MC/AArch64/udf_not.s
@@ -0,0 +1,7 @@
+# RUN: not llvm-mc -assemble -show-encoding -triple=aarch64- %s 2>&1 | FileCheck %s
+udf 65536
+udf -1
+udf -768
+# CHECK:{{.*}} immediate must be an integer in range [0, 65535].
+# CHECK:{{.*}} immediate must be an integer in range [0, 65535].
+# CHECK:{{.*}} immediate must be an integer in range [0, 65535].
diff --git a/test/MC/Disassembler/AArch64/udf.txt b/test/MC/Disassembler/AArch64/udf.txt
new file mode 100644
index 00000000000..7f8385b0062
--- /dev/null
+++ b/test/MC/Disassembler/AArch64/udf.txt
@@ -0,0 +1,30 @@
+# RUN: llvm-mc -arch aarch64 -disassemble -o - %s | FileCheck %s
+# RUN: llvm-mc -arch aarch64 -disassemble -o - %s | \
+# RUN: llvm-mc -assemble -filetype=obj -arch aarch64 -o - | \
+# RUN: llvm-objdump -r -d --triple=arm64- - | \
+# RUN: FileCheck %s -check-prefix=OBJ
+[0x00,0x00,0x00,0x00]
+[0x01,0x00,0x00,0x00]
+[0x10,0x00,0x00,0x00]
+[0x20,0x00,0x00,0x00]
+[0x30,0x00,0x00,0x00]
+[0xff,0xff,0x00,0x00]
+[0x00,0xfd,0x00,0x00]
+# CHECK: .text
+# CHECK-NEXT: udf #0
+# CHECK-NEXT: udf #1
+# CHECK-NEXT: udf #16
+# CHECK-NEXT: udf #32
+# CHECK-NEXT: udf #48
+# CHECK-NEXT: udf #32767
+# CHECK-NEXT: udf #32000
+
+#OBJ: Disassembly of section .text:
+#OBJ-NEXT: $x.0:
+#OBJ-NEXT:        0:	00 00 00 00 	udf	#0
+#OBJ-NEXT:        4:	01 00 00 00 	udf	#1
+#OBJ-NEXT:        8:	10 00 00 00 	udf	#16
+#OBJ-NEXT:        c:	20 00 00 00 	udf	#32
+#OBJ-NEXT:       10:	30 00 00 00 	udf	#48
+#OBJ-NEXT:       14:	ff 7f 00 00 	udf	#32767
+#OBJ-NEXT:       18:	00 7d 00 00 	udf	#32000
-- 
GitLab


From f276cd655988dea8dd0a66597afdfe850ec07515 Mon Sep 17 00:00:00 2001
From: Roman Lebedev <lebedev.ri@gmail.com>
Date: Tue, 30 Oct 2018 11:12:29 +0000
Subject: [PATCH 0735/1116] [X86] Add extra-uses on the mask of pattern c of
 extract-{low,}bits.ll tests

Summary:
Because of the D48768, that pattern is always unfolded into pattern d,
thus we had no test coverage.

Reviewers: RKSimon, craig.topper

Reviewed By: craig.topper

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D53574

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345583 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/extract-bits.ll    | 1617 ++++++++++++++++++++-------
 test/CodeGen/X86/extract-lowbits.ll | 1083 ++++++++++++++----
 2 files changed, 2089 insertions(+), 611 deletions(-)

diff --git a/test/CodeGen/X86/extract-bits.ll b/test/CodeGen/X86/extract-bits.ll
index 4c0d62d5279..4cf3077b6d1 100644
--- a/test/CodeGen/X86/extract-bits.ll
+++ b/test/CodeGen/X86/extract-bits.ll
@@ -2801,63 +2801,133 @@ define i64 @bextr64_b5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 define i32 @bextr32_c0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr32_c0:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %edi
+; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    pushl %eax
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    shrl %cl, %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    shll %cl, %eax
+; X86-NOBMI-NEXT:    movl $-1, %esi
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NOBMI-NEXT:    shrl %cl, %eax
+; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    movl %esi, (%esp)
+; X86-NOBMI-NEXT:    calll use32
+; X86-NOBMI-NEXT:    andl %edi, %esi
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    addl $4, %esp
+; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_c0:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    pushl %edi
+; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    pushl %eax
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shll $8, %eax
-; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
+; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
+; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
+; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
+; X86-BMI1NOTBM-NEXT:    movl %esi, (%esp)
+; X86-BMI1NOTBM-NEXT:    calll use32
+; X86-BMI1NOTBM-NEXT:    andl %edi, %esi
+; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    addl $4, %esp
+; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    popl %edi
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_c0:
 ; X86-BMI1BMI2:       # %bb.0:
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, {{[0-9]+}}(%esp), %ecx
-; X86-BMI1BMI2-NEXT:    bzhil %eax, %ecx, %eax
+; X86-BMI1BMI2-NEXT:    pushl %edi
+; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    pushl %eax
+; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI1BMI2-NEXT:    shrxl %eax, {{[0-9]+}}(%esp), %edi
+; X86-BMI1BMI2-NEXT:    movl %esi, %eax
+; X86-BMI1BMI2-NEXT:    negl %eax
+; X86-BMI1BMI2-NEXT:    movl $-1, %ecx
+; X86-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %eax
+; X86-BMI1BMI2-NEXT:    movl %eax, (%esp)
+; X86-BMI1BMI2-NEXT:    calll use32
+; X86-BMI1BMI2-NEXT:    bzhil %esi, %edi, %eax
+; X86-BMI1BMI2-NEXT:    addl $4, %esp
+; X86-BMI1BMI2-NEXT:    popl %esi
+; X86-BMI1BMI2-NEXT:    popl %edi
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr32_c0:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %rbp
+; X64-NOBMI-NEXT:    pushq %rbx
+; X64-NOBMI-NEXT:    pushq %rax
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
-; X64-NOBMI-NEXT:    movl %edi, %eax
+; X64-NOBMI-NEXT:    movl %edi, %ebx
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NOBMI-NEXT:    shrl %cl, %eax
+; X64-NOBMI-NEXT:    shrl %cl, %ebx
 ; X64-NOBMI-NEXT:    negl %edx
+; X64-NOBMI-NEXT:    movl $-1, %ebp
 ; X64-NOBMI-NEXT:    movl %edx, %ecx
-; X64-NOBMI-NEXT:    shll %cl, %eax
-; X64-NOBMI-NEXT:    shrl %cl, %eax
+; X64-NOBMI-NEXT:    shrl %cl, %ebp
+; X64-NOBMI-NEXT:    movl %ebp, %edi
+; X64-NOBMI-NEXT:    callq use32
+; X64-NOBMI-NEXT:    andl %ebx, %ebp
+; X64-NOBMI-NEXT:    movl %ebp, %eax
+; X64-NOBMI-NEXT:    addq $8, %rsp
+; X64-NOBMI-NEXT:    popq %rbx
+; X64-NOBMI-NEXT:    popq %rbp
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bextr32_c0:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    pushq %rbp
+; X64-BMI1NOTBM-NEXT:    pushq %rbx
+; X64-BMI1NOTBM-NEXT:    pushq %rax
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
+; X64-BMI1NOTBM-NEXT:    movl %edi, %ebx
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
-; X64-BMI1NOTBM-NEXT:    shll $8, %edx
-; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebx
+; X64-BMI1NOTBM-NEXT:    negl %edx
+; X64-BMI1NOTBM-NEXT:    movl $-1, %ebp
+; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebp
+; X64-BMI1NOTBM-NEXT:    movl %ebp, %edi
+; X64-BMI1NOTBM-NEXT:    callq use32
+; X64-BMI1NOTBM-NEXT:    andl %ebx, %ebp
+; X64-BMI1NOTBM-NEXT:    movl %ebp, %eax
+; X64-BMI1NOTBM-NEXT:    addq $8, %rsp
+; X64-BMI1NOTBM-NEXT:    popq %rbx
+; X64-BMI1NOTBM-NEXT:    popq %rbp
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_c0:
 ; X64-BMI1BMI2:       # %bb.0:
-; X64-BMI1BMI2-NEXT:    shrxl %esi, %edi, %eax
-; X64-BMI1BMI2-NEXT:    bzhil %edx, %eax, %eax
+; X64-BMI1BMI2-NEXT:    pushq %rbp
+; X64-BMI1BMI2-NEXT:    pushq %rbx
+; X64-BMI1BMI2-NEXT:    pushq %rax
+; X64-BMI1BMI2-NEXT:    movl %edx, %ebx
+; X64-BMI1BMI2-NEXT:    shrxl %esi, %edi, %ebp
+; X64-BMI1BMI2-NEXT:    movl %edx, %eax
+; X64-BMI1BMI2-NEXT:    negl %eax
+; X64-BMI1BMI2-NEXT:    movl $-1, %ecx
+; X64-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %edi
+; X64-BMI1BMI2-NEXT:    callq use32
+; X64-BMI1BMI2-NEXT:    bzhil %ebx, %ebp, %eax
+; X64-BMI1BMI2-NEXT:    addq $8, %rsp
+; X64-BMI1BMI2-NEXT:    popq %rbx
+; X64-BMI1BMI2-NEXT:    popq %rbp
 ; X64-BMI1BMI2-NEXT:    retq
   %shifted = lshr i32 %val, %numskipbits
   %numhighbits = sub i32 32, %numlowbits
   %mask = lshr i32 -1, %numhighbits
+  call void @use32(i32 %mask)
   %masked = and i32 %mask, %shifted
   ret i32 %masked
 }
@@ -2865,65 +2935,135 @@ define i32 @bextr32_c0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 define i32 @bextr32_c1_indexzext(i32 %val, i8 %numskipbits, i8 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr32_c1_indexzext:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %edi
+; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    pushl %eax
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    shrl %cl, %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    shll %cl, %eax
+; X86-NOBMI-NEXT:    movl $-1, %esi
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NOBMI-NEXT:    shrl %cl, %eax
+; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    movl %esi, (%esp)
+; X86-NOBMI-NEXT:    calll use32
+; X86-NOBMI-NEXT:    andl %edi, %esi
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    addl $4, %esp
+; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_c1_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI1NOTBM-NEXT:    pushl %edi
+; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    pushl %eax
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shll $8, %eax
-; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
+; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
+; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
+; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
+; X86-BMI1NOTBM-NEXT:    movl %esi, (%esp)
+; X86-BMI1NOTBM-NEXT:    calll use32
+; X86-BMI1NOTBM-NEXT:    andl %edi, %esi
+; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    addl $4, %esp
+; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    popl %edi
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_c1_indexzext:
 ; X86-BMI1BMI2:       # %bb.0:
+; X86-BMI1BMI2-NEXT:    pushl %ebx
+; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    pushl %eax
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %bl
 ; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, {{[0-9]+}}(%esp), %ecx
-; X86-BMI1BMI2-NEXT:    bzhil %eax, %ecx, %eax
+; X86-BMI1BMI2-NEXT:    shrxl %eax, {{[0-9]+}}(%esp), %esi
+; X86-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X86-BMI1BMI2-NEXT:    negb %al
+; X86-BMI1BMI2-NEXT:    movl $-1, %ecx
+; X86-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %eax
+; X86-BMI1BMI2-NEXT:    movl %eax, (%esp)
+; X86-BMI1BMI2-NEXT:    calll use32
+; X86-BMI1BMI2-NEXT:    bzhil %ebx, %esi, %eax
+; X86-BMI1BMI2-NEXT:    addl $4, %esp
+; X86-BMI1BMI2-NEXT:    popl %esi
+; X86-BMI1BMI2-NEXT:    popl %ebx
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr32_c1_indexzext:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %rbp
+; X64-NOBMI-NEXT:    pushq %rbx
+; X64-NOBMI-NEXT:    pushq %rax
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
-; X64-NOBMI-NEXT:    movl %edi, %eax
+; X64-NOBMI-NEXT:    movl %edi, %ebx
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NOBMI-NEXT:    shrl %cl, %eax
+; X64-NOBMI-NEXT:    shrl %cl, %ebx
 ; X64-NOBMI-NEXT:    negb %dl
+; X64-NOBMI-NEXT:    movl $-1, %ebp
 ; X64-NOBMI-NEXT:    movl %edx, %ecx
-; X64-NOBMI-NEXT:    shll %cl, %eax
-; X64-NOBMI-NEXT:    shrl %cl, %eax
+; X64-NOBMI-NEXT:    shrl %cl, %ebp
+; X64-NOBMI-NEXT:    movl %ebp, %edi
+; X64-NOBMI-NEXT:    callq use32
+; X64-NOBMI-NEXT:    andl %ebx, %ebp
+; X64-NOBMI-NEXT:    movl %ebp, %eax
+; X64-NOBMI-NEXT:    addq $8, %rsp
+; X64-NOBMI-NEXT:    popq %rbx
+; X64-NOBMI-NEXT:    popq %rbp
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bextr32_c1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    pushq %rbp
+; X64-BMI1NOTBM-NEXT:    pushq %rbx
+; X64-BMI1NOTBM-NEXT:    pushq %rax
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
+; X64-BMI1NOTBM-NEXT:    movl %edi, %ebx
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
-; X64-BMI1NOTBM-NEXT:    shll $8, %edx
-; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebx
+; X64-BMI1NOTBM-NEXT:    negb %dl
+; X64-BMI1NOTBM-NEXT:    movl $-1, %ebp
+; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebp
+; X64-BMI1NOTBM-NEXT:    movl %ebp, %edi
+; X64-BMI1NOTBM-NEXT:    callq use32
+; X64-BMI1NOTBM-NEXT:    andl %ebx, %ebp
+; X64-BMI1NOTBM-NEXT:    movl %ebp, %eax
+; X64-BMI1NOTBM-NEXT:    addq $8, %rsp
+; X64-BMI1NOTBM-NEXT:    popq %rbx
+; X64-BMI1NOTBM-NEXT:    popq %rbp
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_c1_indexzext:
 ; X64-BMI1BMI2:       # %bb.0:
-; X64-BMI1BMI2-NEXT:    shrxl %esi, %edi, %eax
-; X64-BMI1BMI2-NEXT:    bzhil %edx, %eax, %eax
+; X64-BMI1BMI2-NEXT:    pushq %rbp
+; X64-BMI1BMI2-NEXT:    pushq %rbx
+; X64-BMI1BMI2-NEXT:    pushq %rax
+; X64-BMI1BMI2-NEXT:    movl %edx, %ebx
+; X64-BMI1BMI2-NEXT:    shrxl %esi, %edi, %ebp
+; X64-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X64-BMI1BMI2-NEXT:    negb %al
+; X64-BMI1BMI2-NEXT:    movl $-1, %ecx
+; X64-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %edi
+; X64-BMI1BMI2-NEXT:    callq use32
+; X64-BMI1BMI2-NEXT:    bzhil %ebx, %ebp, %eax
+; X64-BMI1BMI2-NEXT:    addq $8, %rsp
+; X64-BMI1BMI2-NEXT:    popq %rbx
+; X64-BMI1BMI2-NEXT:    popq %rbp
 ; X64-BMI1BMI2-NEXT:    retq
   %skip = zext i8 %numskipbits to i32
   %shifted = lshr i32 %val, %skip
   %numhighbits = sub i8 32, %numlowbits
   %sh_prom = zext i8 %numhighbits to i32
   %mask = lshr i32 -1, %sh_prom
+  call void @use32(i32 %mask)
   %masked = and i32 %mask, %shifted
   ret i32 %masked
 }
@@ -2931,68 +3071,137 @@ define i32 @bextr32_c1_indexzext(i32 %val, i8 %numskipbits, i8 %numlowbits) noun
 define i32 @bextr32_c2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr32_c2_load:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %edi
+; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    pushl %eax
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl (%eax), %eax
-; X86-NOBMI-NEXT:    shrl %cl, %eax
+; X86-NOBMI-NEXT:    movl (%eax), %edi
+; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    shll %cl, %eax
+; X86-NOBMI-NEXT:    movl $-1, %esi
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NOBMI-NEXT:    shrl %cl, %eax
+; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    movl %esi, (%esp)
+; X86-NOBMI-NEXT:    calll use32
+; X86-NOBMI-NEXT:    andl %edi, %esi
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    addl $4, %esp
+; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_c2_load:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    pushl %edi
+; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    pushl %eax
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1NOTBM-NEXT:    movl (%edx), %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shll $8, %eax
-; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    movl (%eax), %edi
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
+; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
+; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
+; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
+; X86-BMI1NOTBM-NEXT:    movl %esi, (%esp)
+; X86-BMI1NOTBM-NEXT:    calll use32
+; X86-BMI1NOTBM-NEXT:    andl %edi, %esi
+; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    addl $4, %esp
+; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    popl %edi
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_c2_load:
 ; X86-BMI1BMI2:       # %bb.0:
+; X86-BMI1BMI2-NEXT:    pushl %edi
+; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    pushl %eax
+; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %dl
-; X86-BMI1BMI2-NEXT:    shrxl %edx, (%ecx), %ecx
-; X86-BMI1BMI2-NEXT:    bzhil %eax, %ecx, %eax
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1BMI2-NEXT:    shrxl %ecx, (%eax), %edi
+; X86-BMI1BMI2-NEXT:    movl %esi, %eax
+; X86-BMI1BMI2-NEXT:    negl %eax
+; X86-BMI1BMI2-NEXT:    movl $-1, %ecx
+; X86-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %eax
+; X86-BMI1BMI2-NEXT:    movl %eax, (%esp)
+; X86-BMI1BMI2-NEXT:    calll use32
+; X86-BMI1BMI2-NEXT:    bzhil %esi, %edi, %eax
+; X86-BMI1BMI2-NEXT:    addl $4, %esp
+; X86-BMI1BMI2-NEXT:    popl %esi
+; X86-BMI1BMI2-NEXT:    popl %edi
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr32_c2_load:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %rbp
+; X64-NOBMI-NEXT:    pushq %rbx
+; X64-NOBMI-NEXT:    pushq %rax
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
-; X64-NOBMI-NEXT:    movl (%rdi), %eax
+; X64-NOBMI-NEXT:    movl (%rdi), %ebp
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NOBMI-NEXT:    shrl %cl, %eax
+; X64-NOBMI-NEXT:    shrl %cl, %ebp
 ; X64-NOBMI-NEXT:    negl %edx
+; X64-NOBMI-NEXT:    movl $-1, %ebx
 ; X64-NOBMI-NEXT:    movl %edx, %ecx
-; X64-NOBMI-NEXT:    shll %cl, %eax
-; X64-NOBMI-NEXT:    shrl %cl, %eax
+; X64-NOBMI-NEXT:    shrl %cl, %ebx
+; X64-NOBMI-NEXT:    movl %ebx, %edi
+; X64-NOBMI-NEXT:    callq use32
+; X64-NOBMI-NEXT:    andl %ebp, %ebx
+; X64-NOBMI-NEXT:    movl %ebx, %eax
+; X64-NOBMI-NEXT:    addq $8, %rsp
+; X64-NOBMI-NEXT:    popq %rbx
+; X64-NOBMI-NEXT:    popq %rbp
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bextr32_c2_load:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    pushq %rbp
+; X64-BMI1NOTBM-NEXT:    pushq %rbx
+; X64-BMI1NOTBM-NEXT:    pushq %rax
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
+; X64-BMI1NOTBM-NEXT:    movl (%rdi), %ebp
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X64-BMI1NOTBM-NEXT:    shll $8, %edx
-; X64-BMI1NOTBM-NEXT:    bextrl %edx, %eax, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebp
+; X64-BMI1NOTBM-NEXT:    negl %edx
+; X64-BMI1NOTBM-NEXT:    movl $-1, %ebx
+; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebx
+; X64-BMI1NOTBM-NEXT:    movl %ebx, %edi
+; X64-BMI1NOTBM-NEXT:    callq use32
+; X64-BMI1NOTBM-NEXT:    andl %ebp, %ebx
+; X64-BMI1NOTBM-NEXT:    movl %ebx, %eax
+; X64-BMI1NOTBM-NEXT:    addq $8, %rsp
+; X64-BMI1NOTBM-NEXT:    popq %rbx
+; X64-BMI1NOTBM-NEXT:    popq %rbp
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_c2_load:
 ; X64-BMI1BMI2:       # %bb.0:
-; X64-BMI1BMI2-NEXT:    shrxl %esi, (%rdi), %eax
-; X64-BMI1BMI2-NEXT:    bzhil %edx, %eax, %eax
+; X64-BMI1BMI2-NEXT:    pushq %rbp
+; X64-BMI1BMI2-NEXT:    pushq %rbx
+; X64-BMI1BMI2-NEXT:    pushq %rax
+; X64-BMI1BMI2-NEXT:    movl %edx, %ebx
+; X64-BMI1BMI2-NEXT:    shrxl %esi, (%rdi), %ebp
+; X64-BMI1BMI2-NEXT:    movl %edx, %eax
+; X64-BMI1BMI2-NEXT:    negl %eax
+; X64-BMI1BMI2-NEXT:    movl $-1, %ecx
+; X64-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %edi
+; X64-BMI1BMI2-NEXT:    callq use32
+; X64-BMI1BMI2-NEXT:    bzhil %ebx, %ebp, %eax
+; X64-BMI1BMI2-NEXT:    addq $8, %rsp
+; X64-BMI1BMI2-NEXT:    popq %rbx
+; X64-BMI1BMI2-NEXT:    popq %rbp
 ; X64-BMI1BMI2-NEXT:    retq
   %val = load i32, i32* %w
   %shifted = lshr i32 %val, %numskipbits
   %numhighbits = sub i32 32, %numlowbits
   %mask = lshr i32 -1, %numhighbits
+  call void @use32(i32 %mask)
   %masked = and i32 %mask, %shifted
   ret i32 %masked
 }
@@ -3000,63 +3209,131 @@ define i32 @bextr32_c2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind
 define i32 @bextr32_c3_load_indexzext(i32* %w, i8 %numskipbits, i8 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr32_c3_load_indexzext:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %edi
+; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    pushl %eax
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl (%eax), %eax
-; X86-NOBMI-NEXT:    shrl %cl, %eax
+; X86-NOBMI-NEXT:    movl (%eax), %edi
+; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    shll %cl, %eax
+; X86-NOBMI-NEXT:    movl $-1, %esi
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NOBMI-NEXT:    shrl %cl, %eax
+; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    movl %esi, (%esp)
+; X86-NOBMI-NEXT:    calll use32
+; X86-NOBMI-NEXT:    andl %edi, %esi
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    addl $4, %esp
+; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_c3_load_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI1NOTBM-NEXT:    pushl %edi
+; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    pushl %eax
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1NOTBM-NEXT:    movl (%edx), %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shll $8, %eax
-; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    movl (%eax), %edi
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
+; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
+; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
+; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
+; X86-BMI1NOTBM-NEXT:    movl %esi, (%esp)
+; X86-BMI1NOTBM-NEXT:    calll use32
+; X86-BMI1NOTBM-NEXT:    andl %edi, %esi
+; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    addl $4, %esp
+; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    popl %edi
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_c3_load_indexzext:
 ; X86-BMI1BMI2:       # %bb.0:
-; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %dl
-; X86-BMI1BMI2-NEXT:    shrxl %edx, (%ecx), %ecx
-; X86-BMI1BMI2-NEXT:    bzhil %eax, %ecx, %eax
+; X86-BMI1BMI2-NEXT:    pushl %ebx
+; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    pushl %eax
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %bl
+; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1BMI2-NEXT:    shrxl %ecx, (%eax), %esi
+; X86-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X86-BMI1BMI2-NEXT:    negb %al
+; X86-BMI1BMI2-NEXT:    movl $-1, %ecx
+; X86-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %eax
+; X86-BMI1BMI2-NEXT:    movl %eax, (%esp)
+; X86-BMI1BMI2-NEXT:    calll use32
+; X86-BMI1BMI2-NEXT:    bzhil %ebx, %esi, %eax
+; X86-BMI1BMI2-NEXT:    addl $4, %esp
+; X86-BMI1BMI2-NEXT:    popl %esi
+; X86-BMI1BMI2-NEXT:    popl %ebx
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr32_c3_load_indexzext:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %rbp
+; X64-NOBMI-NEXT:    pushq %rbx
+; X64-NOBMI-NEXT:    pushq %rax
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
-; X64-NOBMI-NEXT:    movl (%rdi), %eax
+; X64-NOBMI-NEXT:    movl (%rdi), %ebp
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NOBMI-NEXT:    shrl %cl, %eax
+; X64-NOBMI-NEXT:    shrl %cl, %ebp
 ; X64-NOBMI-NEXT:    negb %dl
+; X64-NOBMI-NEXT:    movl $-1, %ebx
 ; X64-NOBMI-NEXT:    movl %edx, %ecx
-; X64-NOBMI-NEXT:    shll %cl, %eax
-; X64-NOBMI-NEXT:    shrl %cl, %eax
+; X64-NOBMI-NEXT:    shrl %cl, %ebx
+; X64-NOBMI-NEXT:    movl %ebx, %edi
+; X64-NOBMI-NEXT:    callq use32
+; X64-NOBMI-NEXT:    andl %ebp, %ebx
+; X64-NOBMI-NEXT:    movl %ebx, %eax
+; X64-NOBMI-NEXT:    addq $8, %rsp
+; X64-NOBMI-NEXT:    popq %rbx
+; X64-NOBMI-NEXT:    popq %rbp
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bextr32_c3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    pushq %rbp
+; X64-BMI1NOTBM-NEXT:    pushq %rbx
+; X64-BMI1NOTBM-NEXT:    pushq %rax
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
+; X64-BMI1NOTBM-NEXT:    movl (%rdi), %ebp
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X64-BMI1NOTBM-NEXT:    shll $8, %edx
-; X64-BMI1NOTBM-NEXT:    bextrl %edx, %eax, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebp
+; X64-BMI1NOTBM-NEXT:    negb %dl
+; X64-BMI1NOTBM-NEXT:    movl $-1, %ebx
+; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebx
+; X64-BMI1NOTBM-NEXT:    movl %ebx, %edi
+; X64-BMI1NOTBM-NEXT:    callq use32
+; X64-BMI1NOTBM-NEXT:    andl %ebp, %ebx
+; X64-BMI1NOTBM-NEXT:    movl %ebx, %eax
+; X64-BMI1NOTBM-NEXT:    addq $8, %rsp
+; X64-BMI1NOTBM-NEXT:    popq %rbx
+; X64-BMI1NOTBM-NEXT:    popq %rbp
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_c3_load_indexzext:
 ; X64-BMI1BMI2:       # %bb.0:
-; X64-BMI1BMI2-NEXT:    shrxl %esi, (%rdi), %eax
-; X64-BMI1BMI2-NEXT:    bzhil %edx, %eax, %eax
+; X64-BMI1BMI2-NEXT:    pushq %rbp
+; X64-BMI1BMI2-NEXT:    pushq %rbx
+; X64-BMI1BMI2-NEXT:    pushq %rax
+; X64-BMI1BMI2-NEXT:    movl %edx, %ebx
+; X64-BMI1BMI2-NEXT:    shrxl %esi, (%rdi), %ebp
+; X64-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X64-BMI1BMI2-NEXT:    negb %al
+; X64-BMI1BMI2-NEXT:    movl $-1, %ecx
+; X64-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %edi
+; X64-BMI1BMI2-NEXT:    callq use32
+; X64-BMI1BMI2-NEXT:    bzhil %ebx, %ebp, %eax
+; X64-BMI1BMI2-NEXT:    addq $8, %rsp
+; X64-BMI1BMI2-NEXT:    popq %rbx
+; X64-BMI1BMI2-NEXT:    popq %rbp
 ; X64-BMI1BMI2-NEXT:    retq
   %val = load i32, i32* %w
   %skip = zext i8 %numskipbits to i32
@@ -3064,6 +3341,7 @@ define i32 @bextr32_c3_load_indexzext(i32* %w, i8 %numskipbits, i8 %numlowbits)
   %numhighbits = sub i8 32, %numlowbits
   %sh_prom = zext i8 %numhighbits to i32
   %mask = lshr i32 -1, %sh_prom
+  call void @use32(i32 %mask)
   %masked = and i32 %mask, %shifted
   ret i32 %masked
 }
@@ -3071,63 +3349,133 @@ define i32 @bextr32_c3_load_indexzext(i32* %w, i8 %numskipbits, i8 %numlowbits)
 define i32 @bextr32_c4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr32_c4_commutative:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %edi
+; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    pushl %eax
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    shrl %cl, %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    shll %cl, %eax
+; X86-NOBMI-NEXT:    movl $-1, %esi
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NOBMI-NEXT:    shrl %cl, %eax
+; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    movl %esi, (%esp)
+; X86-NOBMI-NEXT:    calll use32
+; X86-NOBMI-NEXT:    andl %edi, %esi
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    addl $4, %esp
+; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_c4_commutative:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    pushl %edi
+; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    pushl %eax
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shll $8, %eax
-; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
+; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
+; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
+; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
+; X86-BMI1NOTBM-NEXT:    movl %esi, (%esp)
+; X86-BMI1NOTBM-NEXT:    calll use32
+; X86-BMI1NOTBM-NEXT:    andl %edi, %esi
+; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    addl $4, %esp
+; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    popl %edi
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_c4_commutative:
 ; X86-BMI1BMI2:       # %bb.0:
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, {{[0-9]+}}(%esp), %ecx
-; X86-BMI1BMI2-NEXT:    bzhil %eax, %ecx, %eax
+; X86-BMI1BMI2-NEXT:    pushl %edi
+; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    pushl %eax
+; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI1BMI2-NEXT:    shrxl %eax, {{[0-9]+}}(%esp), %edi
+; X86-BMI1BMI2-NEXT:    movl %esi, %eax
+; X86-BMI1BMI2-NEXT:    negl %eax
+; X86-BMI1BMI2-NEXT:    movl $-1, %ecx
+; X86-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %eax
+; X86-BMI1BMI2-NEXT:    movl %eax, (%esp)
+; X86-BMI1BMI2-NEXT:    calll use32
+; X86-BMI1BMI2-NEXT:    bzhil %esi, %edi, %eax
+; X86-BMI1BMI2-NEXT:    addl $4, %esp
+; X86-BMI1BMI2-NEXT:    popl %esi
+; X86-BMI1BMI2-NEXT:    popl %edi
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr32_c4_commutative:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %rbp
+; X64-NOBMI-NEXT:    pushq %rbx
+; X64-NOBMI-NEXT:    pushq %rax
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
-; X64-NOBMI-NEXT:    movl %edi, %eax
+; X64-NOBMI-NEXT:    movl %edi, %ebx
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NOBMI-NEXT:    shrl %cl, %eax
+; X64-NOBMI-NEXT:    shrl %cl, %ebx
 ; X64-NOBMI-NEXT:    negl %edx
+; X64-NOBMI-NEXT:    movl $-1, %ebp
 ; X64-NOBMI-NEXT:    movl %edx, %ecx
-; X64-NOBMI-NEXT:    shll %cl, %eax
-; X64-NOBMI-NEXT:    shrl %cl, %eax
+; X64-NOBMI-NEXT:    shrl %cl, %ebp
+; X64-NOBMI-NEXT:    movl %ebp, %edi
+; X64-NOBMI-NEXT:    callq use32
+; X64-NOBMI-NEXT:    andl %ebx, %ebp
+; X64-NOBMI-NEXT:    movl %ebp, %eax
+; X64-NOBMI-NEXT:    addq $8, %rsp
+; X64-NOBMI-NEXT:    popq %rbx
+; X64-NOBMI-NEXT:    popq %rbp
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bextr32_c4_commutative:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    pushq %rbp
+; X64-BMI1NOTBM-NEXT:    pushq %rbx
+; X64-BMI1NOTBM-NEXT:    pushq %rax
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
+; X64-BMI1NOTBM-NEXT:    movl %edi, %ebx
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
-; X64-BMI1NOTBM-NEXT:    shll $8, %edx
-; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebx
+; X64-BMI1NOTBM-NEXT:    negl %edx
+; X64-BMI1NOTBM-NEXT:    movl $-1, %ebp
+; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebp
+; X64-BMI1NOTBM-NEXT:    movl %ebp, %edi
+; X64-BMI1NOTBM-NEXT:    callq use32
+; X64-BMI1NOTBM-NEXT:    andl %ebx, %ebp
+; X64-BMI1NOTBM-NEXT:    movl %ebp, %eax
+; X64-BMI1NOTBM-NEXT:    addq $8, %rsp
+; X64-BMI1NOTBM-NEXT:    popq %rbx
+; X64-BMI1NOTBM-NEXT:    popq %rbp
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_c4_commutative:
 ; X64-BMI1BMI2:       # %bb.0:
-; X64-BMI1BMI2-NEXT:    shrxl %esi, %edi, %eax
-; X64-BMI1BMI2-NEXT:    bzhil %edx, %eax, %eax
+; X64-BMI1BMI2-NEXT:    pushq %rbp
+; X64-BMI1BMI2-NEXT:    pushq %rbx
+; X64-BMI1BMI2-NEXT:    pushq %rax
+; X64-BMI1BMI2-NEXT:    movl %edx, %ebx
+; X64-BMI1BMI2-NEXT:    shrxl %esi, %edi, %ebp
+; X64-BMI1BMI2-NEXT:    movl %edx, %eax
+; X64-BMI1BMI2-NEXT:    negl %eax
+; X64-BMI1BMI2-NEXT:    movl $-1, %ecx
+; X64-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %edi
+; X64-BMI1BMI2-NEXT:    callq use32
+; X64-BMI1BMI2-NEXT:    bzhil %ebx, %ebp, %eax
+; X64-BMI1BMI2-NEXT:    addq $8, %rsp
+; X64-BMI1BMI2-NEXT:    popq %rbx
+; X64-BMI1BMI2-NEXT:    popq %rbp
 ; X64-BMI1BMI2-NEXT:    retq
   %shifted = lshr i32 %val, %numskipbits
   %numhighbits = sub i32 32, %numlowbits
   %mask = lshr i32 -1, %numhighbits
+  call void @use32(i32 %mask)
   %masked = and i32 %shifted, %mask ; swapped order
   ret i32 %masked
 }
@@ -3135,98 +3483,156 @@ define i32 @bextr32_c4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits)
 define i32 @bextr32_c5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr32_c5_skipextrauses:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %ebx
+; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    subl $8, %esp
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl %eax, %ecx
-; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    subl $16, %esp
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NOBMI-NEXT:    movl %ebx, %ecx
+; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    shll %cl, %esi
+; X86-NOBMI-NEXT:    movl $-1, %esi
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NOBMI-NEXT:    shrl %cl, %esi
-; X86-NOBMI-NEXT:    movl %eax, (%esp)
+; X86-NOBMI-NEXT:    movl %esi, (%esp)
+; X86-NOBMI-NEXT:    calll use32
+; X86-NOBMI-NEXT:    andl %edi, %esi
+; X86-NOBMI-NEXT:    movl %ebx, (%esp)
 ; X86-NOBMI-NEXT:    calll use32
 ; X86-NOBMI-NEXT:    movl %esi, %eax
-; X86-NOBMI-NEXT:    addl $8, %esp
+; X86-NOBMI-NEXT:    addl $16, %esp
 ; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    popl %edi
+; X86-NOBMI-NEXT:    popl %ebx
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_c5_skipextrauses:
 ; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    pushl %ebx
+; X86-BMI1NOTBM-NEXT:    pushl %edi
 ; X86-BMI1NOTBM-NEXT:    pushl %esi
-; X86-BMI1NOTBM-NEXT:    subl $8, %esp
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shll $8, %eax
-; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %esi
-; X86-BMI1NOTBM-NEXT:    movl %ecx, (%esp)
+; X86-BMI1NOTBM-NEXT:    subl $16, %esp
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI1NOTBM-NEXT:    movl %ebx, %ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
+; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
+; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
+; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
+; X86-BMI1NOTBM-NEXT:    movl %esi, (%esp)
+; X86-BMI1NOTBM-NEXT:    calll use32
+; X86-BMI1NOTBM-NEXT:    andl %edi, %esi
+; X86-BMI1NOTBM-NEXT:    movl %ebx, (%esp)
 ; X86-BMI1NOTBM-NEXT:    calll use32
 ; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
-; X86-BMI1NOTBM-NEXT:    addl $8, %esp
+; X86-BMI1NOTBM-NEXT:    addl $16, %esp
 ; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    popl %edi
+; X86-BMI1NOTBM-NEXT:    popl %ebx
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_c5_skipextrauses:
 ; X86-BMI1BMI2:       # %bb.0:
+; X86-BMI1BMI2-NEXT:    pushl %ebx
+; X86-BMI1BMI2-NEXT:    pushl %edi
 ; X86-BMI1BMI2-NEXT:    pushl %esi
-; X86-BMI1BMI2-NEXT:    subl $8, %esp
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, {{[0-9]+}}(%esp), %edx
-; X86-BMI1BMI2-NEXT:    bzhil %eax, %edx, %esi
-; X86-BMI1BMI2-NEXT:    movl %ecx, (%esp)
+; X86-BMI1BMI2-NEXT:    subl $16, %esp
+; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-BMI1BMI2-NEXT:    shrxl %edi, {{[0-9]+}}(%esp), %ebx
+; X86-BMI1BMI2-NEXT:    movl %esi, %eax
+; X86-BMI1BMI2-NEXT:    negl %eax
+; X86-BMI1BMI2-NEXT:    movl $-1, %ecx
+; X86-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %eax
+; X86-BMI1BMI2-NEXT:    movl %eax, (%esp)
+; X86-BMI1BMI2-NEXT:    calll use32
+; X86-BMI1BMI2-NEXT:    bzhil %esi, %ebx, %esi
+; X86-BMI1BMI2-NEXT:    movl %edi, (%esp)
 ; X86-BMI1BMI2-NEXT:    calll use32
 ; X86-BMI1BMI2-NEXT:    movl %esi, %eax
-; X86-BMI1BMI2-NEXT:    addl $8, %esp
+; X86-BMI1BMI2-NEXT:    addl $16, %esp
 ; X86-BMI1BMI2-NEXT:    popl %esi
+; X86-BMI1BMI2-NEXT:    popl %edi
+; X86-BMI1BMI2-NEXT:    popl %ebx
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr32_c5_skipextrauses:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %rbp
+; X64-NOBMI-NEXT:    pushq %r14
 ; X64-NOBMI-NEXT:    pushq %rbx
-; X64-NOBMI-NEXT:    movl %edi, %ebx
-; X64-NOBMI-NEXT:    movl %esi, %ecx
-; X64-NOBMI-NEXT:    shrl %cl, %ebx
+; X64-NOBMI-NEXT:    movl %esi, %r14d
+; X64-NOBMI-NEXT:    movl %edi, %ebp
+; X64-NOBMI-NEXT:    movl %r14d, %ecx
+; X64-NOBMI-NEXT:    shrl %cl, %ebp
 ; X64-NOBMI-NEXT:    negl %edx
+; X64-NOBMI-NEXT:    movl $-1, %ebx
 ; X64-NOBMI-NEXT:    movl %edx, %ecx
-; X64-NOBMI-NEXT:    shll %cl, %ebx
 ; X64-NOBMI-NEXT:    shrl %cl, %ebx
-; X64-NOBMI-NEXT:    movl %esi, %edi
+; X64-NOBMI-NEXT:    movl %ebx, %edi
+; X64-NOBMI-NEXT:    callq use32
+; X64-NOBMI-NEXT:    andl %ebp, %ebx
+; X64-NOBMI-NEXT:    movl %r14d, %edi
 ; X64-NOBMI-NEXT:    callq use32
 ; X64-NOBMI-NEXT:    movl %ebx, %eax
 ; X64-NOBMI-NEXT:    popq %rbx
+; X64-NOBMI-NEXT:    popq %r14
+; X64-NOBMI-NEXT:    popq %rbp
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bextr32_c5_skipextrauses:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    pushq %rbp
+; X64-BMI1NOTBM-NEXT:    pushq %r14
 ; X64-BMI1NOTBM-NEXT:    pushq %rbx
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
-; X64-BMI1NOTBM-NEXT:    shll $8, %edx
-; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %ebx
-; X64-BMI1NOTBM-NEXT:    movl %esi, %edi
+; X64-BMI1NOTBM-NEXT:    movl %esi, %r14d
+; X64-BMI1NOTBM-NEXT:    movl %edi, %ebp
+; X64-BMI1NOTBM-NEXT:    movl %r14d, %ecx
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebp
+; X64-BMI1NOTBM-NEXT:    negl %edx
+; X64-BMI1NOTBM-NEXT:    movl $-1, %ebx
+; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebx
+; X64-BMI1NOTBM-NEXT:    movl %ebx, %edi
+; X64-BMI1NOTBM-NEXT:    callq use32
+; X64-BMI1NOTBM-NEXT:    andl %ebp, %ebx
+; X64-BMI1NOTBM-NEXT:    movl %r14d, %edi
 ; X64-BMI1NOTBM-NEXT:    callq use32
 ; X64-BMI1NOTBM-NEXT:    movl %ebx, %eax
 ; X64-BMI1NOTBM-NEXT:    popq %rbx
+; X64-BMI1NOTBM-NEXT:    popq %r14
+; X64-BMI1NOTBM-NEXT:    popq %rbp
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_c5_skipextrauses:
 ; X64-BMI1BMI2:       # %bb.0:
+; X64-BMI1BMI2-NEXT:    pushq %rbp
+; X64-BMI1BMI2-NEXT:    pushq %r14
 ; X64-BMI1BMI2-NEXT:    pushq %rbx
-; X64-BMI1BMI2-NEXT:    shrxl %esi, %edi, %eax
-; X64-BMI1BMI2-NEXT:    bzhil %edx, %eax, %ebx
-; X64-BMI1BMI2-NEXT:    movl %esi, %edi
+; X64-BMI1BMI2-NEXT:    movl %edx, %ebx
+; X64-BMI1BMI2-NEXT:    movl %esi, %ebp
+; X64-BMI1BMI2-NEXT:    shrxl %esi, %edi, %r14d
+; X64-BMI1BMI2-NEXT:    movl %edx, %eax
+; X64-BMI1BMI2-NEXT:    negl %eax
+; X64-BMI1BMI2-NEXT:    movl $-1, %ecx
+; X64-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %edi
+; X64-BMI1BMI2-NEXT:    callq use32
+; X64-BMI1BMI2-NEXT:    bzhil %ebx, %r14d, %ebx
+; X64-BMI1BMI2-NEXT:    movl %ebp, %edi
 ; X64-BMI1BMI2-NEXT:    callq use32
 ; X64-BMI1BMI2-NEXT:    movl %ebx, %eax
 ; X64-BMI1BMI2-NEXT:    popq %rbx
+; X64-BMI1BMI2-NEXT:    popq %r14
+; X64-BMI1BMI2-NEXT:    popq %rbp
 ; X64-BMI1BMI2-NEXT:    retq
   %shifted = lshr i32 %val, %numskipbits
   %numhighbits = sub i32 32, %numlowbits
   %mask = lshr i32 -1, %numhighbits
+  call void @use32(i32 %mask)
   %masked = and i32 %mask, %shifted
   call void @use32(i32 %numskipbits)
   ret i32 %masked
@@ -3237,8 +3643,11 @@ define i32 @bextr32_c5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr64_c0:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %ebp
+; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    subl $12, %esp
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -3253,26 +3662,39 @@ define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT:  .LBB30_2:
 ; X86-NOBMI-NEXT:    movl $64, %ecx
 ; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    movl $-1, %eax
-; X86-NOBMI-NEXT:    movl $-1, %edx
-; X86-NOBMI-NEXT:    shrl %cl, %edx
-; X86-NOBMI-NEXT:    shrdl %cl, %eax, %eax
+; X86-NOBMI-NEXT:    movl $-1, %ebp
+; X86-NOBMI-NEXT:    movl $-1, %ebx
+; X86-NOBMI-NEXT:    shrl %cl, %ebx
+; X86-NOBMI-NEXT:    shrdl %cl, %ebp, %ebp
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB30_4
 ; X86-NOBMI-NEXT:  # %bb.3:
-; X86-NOBMI-NEXT:    movl %edx, %eax
-; X86-NOBMI-NEXT:    xorl %edx, %edx
+; X86-NOBMI-NEXT:    movl %ebx, %ebp
+; X86-NOBMI-NEXT:    xorl %ebx, %ebx
 ; X86-NOBMI-NEXT:  .LBB30_4:
-; X86-NOBMI-NEXT:    andl %edi, %edx
-; X86-NOBMI-NEXT:    andl %esi, %eax
+; X86-NOBMI-NEXT:    subl $8, %esp
+; X86-NOBMI-NEXT:    pushl %ebx
+; X86-NOBMI-NEXT:    pushl %ebp
+; X86-NOBMI-NEXT:    calll use64
+; X86-NOBMI-NEXT:    addl $16, %esp
+; X86-NOBMI-NEXT:    andl %ebp, %esi
+; X86-NOBMI-NEXT:    andl %ebx, %edi
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    movl %edi, %edx
+; X86-NOBMI-NEXT:    addl $12, %esp
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
+; X86-NOBMI-NEXT:    popl %ebx
+; X86-NOBMI-NEXT:    popl %ebp
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bextr64_c0:
 ; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    pushl %ebp
+; X86-BMI1NOTBM-NEXT:    pushl %ebx
 ; X86-BMI1NOTBM-NEXT:    pushl %edi
 ; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    subl $12, %esp
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -3287,26 +3709,39 @@ define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-NEXT:  .LBB30_2:
 ; X86-BMI1NOTBM-NEXT:    movl $64, %ecx
 ; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    movl $-1, %eax
-; X86-BMI1NOTBM-NEXT:    movl $-1, %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %eax
+; X86-BMI1NOTBM-NEXT:    movl $-1, %ebp
+; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %ebx
+; X86-BMI1NOTBM-NEXT:    shrdl %cl, %ebp, %ebp
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    je .LBB30_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
-; X86-BMI1NOTBM-NEXT:    movl %edx, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
+; X86-BMI1NOTBM-NEXT:    movl %ebx, %ebp
+; X86-BMI1NOTBM-NEXT:    xorl %ebx, %ebx
 ; X86-BMI1NOTBM-NEXT:  .LBB30_4:
-; X86-BMI1NOTBM-NEXT:    andl %edi, %edx
-; X86-BMI1NOTBM-NEXT:    andl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    subl $8, %esp
+; X86-BMI1NOTBM-NEXT:    pushl %ebx
+; X86-BMI1NOTBM-NEXT:    pushl %ebp
+; X86-BMI1NOTBM-NEXT:    calll use64
+; X86-BMI1NOTBM-NEXT:    addl $16, %esp
+; X86-BMI1NOTBM-NEXT:    andl %ebp, %esi
+; X86-BMI1NOTBM-NEXT:    andl %ebx, %edi
+; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    movl %edi, %edx
+; X86-BMI1NOTBM-NEXT:    addl $12, %esp
 ; X86-BMI1NOTBM-NEXT:    popl %esi
 ; X86-BMI1NOTBM-NEXT:    popl %edi
+; X86-BMI1NOTBM-NEXT:    popl %ebx
+; X86-BMI1NOTBM-NEXT:    popl %ebp
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr64_c0:
 ; X86-BMI1BMI2:       # %bb.0:
+; X86-BMI1BMI2-NEXT:    pushl %ebp
+; X86-BMI1BMI2-NEXT:    pushl %ebx
 ; X86-BMI1BMI2-NEXT:    pushl %edi
 ; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    subl $12, %esp
 ; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -3320,50 +3755,96 @@ define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-BMI1BMI2-NEXT:  .LBB30_2:
 ; X86-BMI1BMI2-NEXT:    movl $64, %ecx
 ; X86-BMI1BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1BMI2-NEXT:    movl $-1, %eax
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %edx
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %eax
+; X86-BMI1BMI2-NEXT:    movl $-1, %ebx
+; X86-BMI1BMI2-NEXT:    shrxl %ecx, %ebx, %ebp
+; X86-BMI1BMI2-NEXT:    shrdl %cl, %ebx, %ebx
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
 ; X86-BMI1BMI2-NEXT:    je .LBB30_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
-; X86-BMI1BMI2-NEXT:    movl %edx, %eax
-; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI1BMI2-NEXT:    movl %ebp, %ebx
+; X86-BMI1BMI2-NEXT:    xorl %ebp, %ebp
 ; X86-BMI1BMI2-NEXT:  .LBB30_4:
-; X86-BMI1BMI2-NEXT:    andl %edi, %edx
-; X86-BMI1BMI2-NEXT:    andl %esi, %eax
+; X86-BMI1BMI2-NEXT:    subl $8, %esp
+; X86-BMI1BMI2-NEXT:    pushl %ebp
+; X86-BMI1BMI2-NEXT:    pushl %ebx
+; X86-BMI1BMI2-NEXT:    calll use64
+; X86-BMI1BMI2-NEXT:    addl $16, %esp
+; X86-BMI1BMI2-NEXT:    andl %ebx, %esi
+; X86-BMI1BMI2-NEXT:    andl %ebp, %edi
+; X86-BMI1BMI2-NEXT:    movl %esi, %eax
+; X86-BMI1BMI2-NEXT:    movl %edi, %edx
+; X86-BMI1BMI2-NEXT:    addl $12, %esp
 ; X86-BMI1BMI2-NEXT:    popl %esi
 ; X86-BMI1BMI2-NEXT:    popl %edi
+; X86-BMI1BMI2-NEXT:    popl %ebx
+; X86-BMI1BMI2-NEXT:    popl %ebp
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr64_c0:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %r14
+; X64-NOBMI-NEXT:    pushq %rbx
+; X64-NOBMI-NEXT:    pushq %rax
 ; X64-NOBMI-NEXT:    movq %rsi, %rcx
-; X64-NOBMI-NEXT:    movq %rdi, %rax
+; X64-NOBMI-NEXT:    movq %rdi, %r14
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-NOBMI-NEXT:    shrq %cl, %rax
+; X64-NOBMI-NEXT:    shrq %cl, %r14
 ; X64-NOBMI-NEXT:    negl %edx
+; X64-NOBMI-NEXT:    movq $-1, %rbx
 ; X64-NOBMI-NEXT:    movl %edx, %ecx
-; X64-NOBMI-NEXT:    shlq %cl, %rax
-; X64-NOBMI-NEXT:    shrq %cl, %rax
+; X64-NOBMI-NEXT:    shrq %cl, %rbx
+; X64-NOBMI-NEXT:    movq %rbx, %rdi
+; X64-NOBMI-NEXT:    callq use64
+; X64-NOBMI-NEXT:    andq %r14, %rbx
+; X64-NOBMI-NEXT:    movq %rbx, %rax
+; X64-NOBMI-NEXT:    addq $8, %rsp
+; X64-NOBMI-NEXT:    popq %rbx
+; X64-NOBMI-NEXT:    popq %r14
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bextr64_c0:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    pushq %r14
+; X64-BMI1NOTBM-NEXT:    pushq %rbx
+; X64-BMI1NOTBM-NEXT:    pushq %rax
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
+; X64-BMI1NOTBM-NEXT:    movq %rdi, %r14
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
-; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
-; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %r14
+; X64-BMI1NOTBM-NEXT:    negl %edx
+; X64-BMI1NOTBM-NEXT:    movq $-1, %rbx
+; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rbx, %rdi
+; X64-BMI1NOTBM-NEXT:    callq use64
+; X64-BMI1NOTBM-NEXT:    andq %r14, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rbx, %rax
+; X64-BMI1NOTBM-NEXT:    addq $8, %rsp
+; X64-BMI1NOTBM-NEXT:    popq %rbx
+; X64-BMI1NOTBM-NEXT:    popq %r14
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_c0:
 ; X64-BMI1BMI2:       # %bb.0:
-; X64-BMI1BMI2-NEXT:    shrxq %rsi, %rdi, %rax
-; X64-BMI1BMI2-NEXT:    bzhiq %rdx, %rax, %rax
+; X64-BMI1BMI2-NEXT:    pushq %r14
+; X64-BMI1BMI2-NEXT:    pushq %rbx
+; X64-BMI1BMI2-NEXT:    pushq %rax
+; X64-BMI1BMI2-NEXT:    movq %rdx, %rbx
+; X64-BMI1BMI2-NEXT:    shrxq %rsi, %rdi, %r14
+; X64-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X64-BMI1BMI2-NEXT:    negl %eax
+; X64-BMI1BMI2-NEXT:    movq $-1, %rcx
+; X64-BMI1BMI2-NEXT:    shrxq %rax, %rcx, %rdi
+; X64-BMI1BMI2-NEXT:    callq use64
+; X64-BMI1BMI2-NEXT:    bzhiq %rbx, %r14, %rax
+; X64-BMI1BMI2-NEXT:    addq $8, %rsp
+; X64-BMI1BMI2-NEXT:    popq %rbx
+; X64-BMI1BMI2-NEXT:    popq %r14
 ; X64-BMI1BMI2-NEXT:    retq
   %shifted = lshr i64 %val, %numskipbits
   %numhighbits = sub i64 64, %numlowbits
   %mask = lshr i64 -1, %numhighbits
+  call void @use64(i64 %mask)
   %masked = and i64 %mask, %shifted
   ret i64 %masked
 }
@@ -3371,8 +3852,11 @@ define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 define i64 @bextr64_c1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr64_c1_indexzext:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %ebp
+; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    subl $12, %esp
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -3385,28 +3869,41 @@ define i64 @bextr64_c1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
 ; X86-NOBMI-NEXT:  .LBB31_2:
-; X86-NOBMI-NEXT:    movb $64, %cl
-; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    movl $-1, %eax
-; X86-NOBMI-NEXT:    movl $-1, %edx
-; X86-NOBMI-NEXT:    shrl %cl, %edx
-; X86-NOBMI-NEXT:    shrdl %cl, %eax, %eax
+; X86-NOBMI-NEXT:    movb $64, %cl
+; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NOBMI-NEXT:    movl $-1, %ebp
+; X86-NOBMI-NEXT:    movl $-1, %ebx
+; X86-NOBMI-NEXT:    shrl %cl, %ebx
+; X86-NOBMI-NEXT:    shrdl %cl, %ebp, %ebp
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB31_4
 ; X86-NOBMI-NEXT:  # %bb.3:
-; X86-NOBMI-NEXT:    movl %edx, %eax
-; X86-NOBMI-NEXT:    xorl %edx, %edx
+; X86-NOBMI-NEXT:    movl %ebx, %ebp
+; X86-NOBMI-NEXT:    xorl %ebx, %ebx
 ; X86-NOBMI-NEXT:  .LBB31_4:
-; X86-NOBMI-NEXT:    andl %edi, %edx
-; X86-NOBMI-NEXT:    andl %esi, %eax
+; X86-NOBMI-NEXT:    subl $8, %esp
+; X86-NOBMI-NEXT:    pushl %ebx
+; X86-NOBMI-NEXT:    pushl %ebp
+; X86-NOBMI-NEXT:    calll use64
+; X86-NOBMI-NEXT:    addl $16, %esp
+; X86-NOBMI-NEXT:    andl %ebp, %esi
+; X86-NOBMI-NEXT:    andl %ebx, %edi
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    movl %edi, %edx
+; X86-NOBMI-NEXT:    addl $12, %esp
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
+; X86-NOBMI-NEXT:    popl %ebx
+; X86-NOBMI-NEXT:    popl %ebp
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bextr64_c1_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    pushl %ebp
+; X86-BMI1NOTBM-NEXT:    pushl %ebx
 ; X86-BMI1NOTBM-NEXT:    pushl %edi
 ; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    subl $12, %esp
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -3421,26 +3918,39 @@ define i64 @bextr64_c1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun
 ; X86-BMI1NOTBM-NEXT:  .LBB31_2:
 ; X86-BMI1NOTBM-NEXT:    movb $64, %cl
 ; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl $-1, %eax
-; X86-BMI1NOTBM-NEXT:    movl $-1, %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %eax
+; X86-BMI1NOTBM-NEXT:    movl $-1, %ebp
+; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %ebx
+; X86-BMI1NOTBM-NEXT:    shrdl %cl, %ebp, %ebp
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    je .LBB31_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
-; X86-BMI1NOTBM-NEXT:    movl %edx, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
+; X86-BMI1NOTBM-NEXT:    movl %ebx, %ebp
+; X86-BMI1NOTBM-NEXT:    xorl %ebx, %ebx
 ; X86-BMI1NOTBM-NEXT:  .LBB31_4:
-; X86-BMI1NOTBM-NEXT:    andl %edi, %edx
-; X86-BMI1NOTBM-NEXT:    andl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    subl $8, %esp
+; X86-BMI1NOTBM-NEXT:    pushl %ebx
+; X86-BMI1NOTBM-NEXT:    pushl %ebp
+; X86-BMI1NOTBM-NEXT:    calll use64
+; X86-BMI1NOTBM-NEXT:    addl $16, %esp
+; X86-BMI1NOTBM-NEXT:    andl %ebp, %esi
+; X86-BMI1NOTBM-NEXT:    andl %ebx, %edi
+; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    movl %edi, %edx
+; X86-BMI1NOTBM-NEXT:    addl $12, %esp
 ; X86-BMI1NOTBM-NEXT:    popl %esi
 ; X86-BMI1NOTBM-NEXT:    popl %edi
+; X86-BMI1NOTBM-NEXT:    popl %ebx
+; X86-BMI1NOTBM-NEXT:    popl %ebp
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr64_c1_indexzext:
 ; X86-BMI1BMI2:       # %bb.0:
+; X86-BMI1BMI2-NEXT:    pushl %ebp
+; X86-BMI1BMI2-NEXT:    pushl %ebx
 ; X86-BMI1BMI2-NEXT:    pushl %edi
 ; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    subl $12, %esp
 ; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -3454,55 +3964,99 @@ define i64 @bextr64_c1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun
 ; X86-BMI1BMI2-NEXT:  .LBB31_2:
 ; X86-BMI1BMI2-NEXT:    movb $64, %cl
 ; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1BMI2-NEXT:    movl $-1, %eax
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %edx
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %eax
+; X86-BMI1BMI2-NEXT:    movl $-1, %ebx
+; X86-BMI1BMI2-NEXT:    shrxl %ecx, %ebx, %ebp
+; X86-BMI1BMI2-NEXT:    shrdl %cl, %ebx, %ebx
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
 ; X86-BMI1BMI2-NEXT:    je .LBB31_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
-; X86-BMI1BMI2-NEXT:    movl %edx, %eax
-; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI1BMI2-NEXT:    movl %ebp, %ebx
+; X86-BMI1BMI2-NEXT:    xorl %ebp, %ebp
 ; X86-BMI1BMI2-NEXT:  .LBB31_4:
-; X86-BMI1BMI2-NEXT:    andl %edi, %edx
-; X86-BMI1BMI2-NEXT:    andl %esi, %eax
+; X86-BMI1BMI2-NEXT:    subl $8, %esp
+; X86-BMI1BMI2-NEXT:    pushl %ebp
+; X86-BMI1BMI2-NEXT:    pushl %ebx
+; X86-BMI1BMI2-NEXT:    calll use64
+; X86-BMI1BMI2-NEXT:    addl $16, %esp
+; X86-BMI1BMI2-NEXT:    andl %ebx, %esi
+; X86-BMI1BMI2-NEXT:    andl %ebp, %edi
+; X86-BMI1BMI2-NEXT:    movl %esi, %eax
+; X86-BMI1BMI2-NEXT:    movl %edi, %edx
+; X86-BMI1BMI2-NEXT:    addl $12, %esp
 ; X86-BMI1BMI2-NEXT:    popl %esi
 ; X86-BMI1BMI2-NEXT:    popl %edi
+; X86-BMI1BMI2-NEXT:    popl %ebx
+; X86-BMI1BMI2-NEXT:    popl %ebp
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr64_c1_indexzext:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %r14
+; X64-NOBMI-NEXT:    pushq %rbx
+; X64-NOBMI-NEXT:    pushq %rax
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
-; X64-NOBMI-NEXT:    movq %rdi, %rax
+; X64-NOBMI-NEXT:    movq %rdi, %r14
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NOBMI-NEXT:    shrq %cl, %rax
+; X64-NOBMI-NEXT:    shrq %cl, %r14
 ; X64-NOBMI-NEXT:    negb %dl
+; X64-NOBMI-NEXT:    movq $-1, %rbx
 ; X64-NOBMI-NEXT:    movl %edx, %ecx
-; X64-NOBMI-NEXT:    shlq %cl, %rax
-; X64-NOBMI-NEXT:    shrq %cl, %rax
+; X64-NOBMI-NEXT:    shrq %cl, %rbx
+; X64-NOBMI-NEXT:    movq %rbx, %rdi
+; X64-NOBMI-NEXT:    callq use64
+; X64-NOBMI-NEXT:    andq %r14, %rbx
+; X64-NOBMI-NEXT:    movq %rbx, %rax
+; X64-NOBMI-NEXT:    addq $8, %rsp
+; X64-NOBMI-NEXT:    popq %rbx
+; X64-NOBMI-NEXT:    popq %r14
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bextr64_c1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    # kill: def $edx killed $edx def $rdx
+; X64-BMI1NOTBM-NEXT:    pushq %r14
+; X64-BMI1NOTBM-NEXT:    pushq %rbx
+; X64-BMI1NOTBM-NEXT:    pushq %rax
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
+; X64-BMI1NOTBM-NEXT:    movq %rdi, %r14
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
-; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
-; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %r14
+; X64-BMI1NOTBM-NEXT:    negb %dl
+; X64-BMI1NOTBM-NEXT:    movq $-1, %rbx
+; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rbx, %rdi
+; X64-BMI1NOTBM-NEXT:    callq use64
+; X64-BMI1NOTBM-NEXT:    andq %r14, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rbx, %rax
+; X64-BMI1NOTBM-NEXT:    addq $8, %rsp
+; X64-BMI1NOTBM-NEXT:    popq %rbx
+; X64-BMI1NOTBM-NEXT:    popq %r14
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_c1_indexzext:
 ; X64-BMI1BMI2:       # %bb.0:
-; X64-BMI1BMI2-NEXT:    # kill: def $edx killed $edx def $rdx
+; X64-BMI1BMI2-NEXT:    pushq %r14
+; X64-BMI1BMI2-NEXT:    pushq %rbx
+; X64-BMI1BMI2-NEXT:    pushq %rax
+; X64-BMI1BMI2-NEXT:    movl %edx, %ebx
 ; X64-BMI1BMI2-NEXT:    # kill: def $esi killed $esi def $rsi
-; X64-BMI1BMI2-NEXT:    shrxq %rsi, %rdi, %rax
-; X64-BMI1BMI2-NEXT:    bzhiq %rdx, %rax, %rax
+; X64-BMI1BMI2-NEXT:    shrxq %rsi, %rdi, %r14
+; X64-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X64-BMI1BMI2-NEXT:    negb %al
+; X64-BMI1BMI2-NEXT:    movq $-1, %rcx
+; X64-BMI1BMI2-NEXT:    shrxq %rax, %rcx, %rdi
+; X64-BMI1BMI2-NEXT:    callq use64
+; X64-BMI1BMI2-NEXT:    bzhiq %rbx, %r14, %rax
+; X64-BMI1BMI2-NEXT:    addq $8, %rsp
+; X64-BMI1BMI2-NEXT:    popq %rbx
+; X64-BMI1BMI2-NEXT:    popq %r14
 ; X64-BMI1BMI2-NEXT:    retq
   %skip = zext i8 %numskipbits to i64
   %shifted = lshr i64 %val, %skip
   %numhighbits = sub i8 64, %numlowbits
   %sh_prom = zext i8 %numhighbits to i64
   %mask = lshr i64 -1, %sh_prom
+  call void @use64(i64 %mask)
   %masked = and i64 %mask, %shifted
   ret i64 %masked
 }
@@ -3510,8 +4064,11 @@ define i64 @bextr64_c1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun
 define i64 @bextr64_c2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr64_c2_load:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %ebp
+; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    subl $12, %esp
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    movl (%eax), %esi
@@ -3527,26 +4084,39 @@ define i64 @bextr64_c2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-NOBMI-NEXT:  .LBB32_2:
 ; X86-NOBMI-NEXT:    movl $64, %ecx
 ; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    movl $-1, %eax
-; X86-NOBMI-NEXT:    movl $-1, %edx
-; X86-NOBMI-NEXT:    shrl %cl, %edx
-; X86-NOBMI-NEXT:    shrdl %cl, %eax, %eax
+; X86-NOBMI-NEXT:    movl $-1, %ebp
+; X86-NOBMI-NEXT:    movl $-1, %ebx
+; X86-NOBMI-NEXT:    shrl %cl, %ebx
+; X86-NOBMI-NEXT:    shrdl %cl, %ebp, %ebp
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB32_4
 ; X86-NOBMI-NEXT:  # %bb.3:
-; X86-NOBMI-NEXT:    movl %edx, %eax
-; X86-NOBMI-NEXT:    xorl %edx, %edx
+; X86-NOBMI-NEXT:    movl %ebx, %ebp
+; X86-NOBMI-NEXT:    xorl %ebx, %ebx
 ; X86-NOBMI-NEXT:  .LBB32_4:
-; X86-NOBMI-NEXT:    andl %edi, %edx
-; X86-NOBMI-NEXT:    andl %esi, %eax
+; X86-NOBMI-NEXT:    subl $8, %esp
+; X86-NOBMI-NEXT:    pushl %ebx
+; X86-NOBMI-NEXT:    pushl %ebp
+; X86-NOBMI-NEXT:    calll use64
+; X86-NOBMI-NEXT:    addl $16, %esp
+; X86-NOBMI-NEXT:    andl %ebp, %esi
+; X86-NOBMI-NEXT:    andl %ebx, %edi
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    movl %edi, %edx
+; X86-NOBMI-NEXT:    addl $12, %esp
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
+; X86-NOBMI-NEXT:    popl %ebx
+; X86-NOBMI-NEXT:    popl %ebp
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bextr64_c2_load:
 ; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    pushl %ebp
+; X86-BMI1NOTBM-NEXT:    pushl %ebx
 ; X86-BMI1NOTBM-NEXT:    pushl %edi
 ; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    subl $12, %esp
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    movl (%eax), %esi
@@ -3562,26 +4132,39 @@ define i64 @bextr64_c2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-BMI1NOTBM-NEXT:  .LBB32_2:
 ; X86-BMI1NOTBM-NEXT:    movl $64, %ecx
 ; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    movl $-1, %eax
-; X86-BMI1NOTBM-NEXT:    movl $-1, %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %eax
+; X86-BMI1NOTBM-NEXT:    movl $-1, %ebp
+; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %ebx
+; X86-BMI1NOTBM-NEXT:    shrdl %cl, %ebp, %ebp
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    je .LBB32_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
-; X86-BMI1NOTBM-NEXT:    movl %edx, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
+; X86-BMI1NOTBM-NEXT:    movl %ebx, %ebp
+; X86-BMI1NOTBM-NEXT:    xorl %ebx, %ebx
 ; X86-BMI1NOTBM-NEXT:  .LBB32_4:
-; X86-BMI1NOTBM-NEXT:    andl %edi, %edx
-; X86-BMI1NOTBM-NEXT:    andl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    subl $8, %esp
+; X86-BMI1NOTBM-NEXT:    pushl %ebx
+; X86-BMI1NOTBM-NEXT:    pushl %ebp
+; X86-BMI1NOTBM-NEXT:    calll use64
+; X86-BMI1NOTBM-NEXT:    addl $16, %esp
+; X86-BMI1NOTBM-NEXT:    andl %ebp, %esi
+; X86-BMI1NOTBM-NEXT:    andl %ebx, %edi
+; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    movl %edi, %edx
+; X86-BMI1NOTBM-NEXT:    addl $12, %esp
 ; X86-BMI1NOTBM-NEXT:    popl %esi
 ; X86-BMI1NOTBM-NEXT:    popl %edi
+; X86-BMI1NOTBM-NEXT:    popl %ebx
+; X86-BMI1NOTBM-NEXT:    popl %ebp
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr64_c2_load:
 ; X86-BMI1BMI2:       # %bb.0:
+; X86-BMI1BMI2-NEXT:    pushl %ebp
+; X86-BMI1BMI2-NEXT:    pushl %ebx
 ; X86-BMI1BMI2-NEXT:    pushl %edi
 ; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    subl $12, %esp
 ; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1BMI2-NEXT:    movl (%eax), %esi
@@ -3596,52 +4179,97 @@ define i64 @bextr64_c2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-BMI1BMI2-NEXT:  .LBB32_2:
 ; X86-BMI1BMI2-NEXT:    movl $64, %ecx
 ; X86-BMI1BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1BMI2-NEXT:    movl $-1, %eax
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %edx
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %eax
+; X86-BMI1BMI2-NEXT:    movl $-1, %ebx
+; X86-BMI1BMI2-NEXT:    shrxl %ecx, %ebx, %ebp
+; X86-BMI1BMI2-NEXT:    shrdl %cl, %ebx, %ebx
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
 ; X86-BMI1BMI2-NEXT:    je .LBB32_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
-; X86-BMI1BMI2-NEXT:    movl %edx, %eax
-; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI1BMI2-NEXT:    movl %ebp, %ebx
+; X86-BMI1BMI2-NEXT:    xorl %ebp, %ebp
 ; X86-BMI1BMI2-NEXT:  .LBB32_4:
-; X86-BMI1BMI2-NEXT:    andl %edi, %edx
-; X86-BMI1BMI2-NEXT:    andl %esi, %eax
+; X86-BMI1BMI2-NEXT:    subl $8, %esp
+; X86-BMI1BMI2-NEXT:    pushl %ebp
+; X86-BMI1BMI2-NEXT:    pushl %ebx
+; X86-BMI1BMI2-NEXT:    calll use64
+; X86-BMI1BMI2-NEXT:    addl $16, %esp
+; X86-BMI1BMI2-NEXT:    andl %ebx, %esi
+; X86-BMI1BMI2-NEXT:    andl %ebp, %edi
+; X86-BMI1BMI2-NEXT:    movl %esi, %eax
+; X86-BMI1BMI2-NEXT:    movl %edi, %edx
+; X86-BMI1BMI2-NEXT:    addl $12, %esp
 ; X86-BMI1BMI2-NEXT:    popl %esi
 ; X86-BMI1BMI2-NEXT:    popl %edi
+; X86-BMI1BMI2-NEXT:    popl %ebx
+; X86-BMI1BMI2-NEXT:    popl %ebp
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr64_c2_load:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %r14
+; X64-NOBMI-NEXT:    pushq %rbx
+; X64-NOBMI-NEXT:    pushq %rax
 ; X64-NOBMI-NEXT:    movq %rsi, %rcx
-; X64-NOBMI-NEXT:    movq (%rdi), %rax
+; X64-NOBMI-NEXT:    movq (%rdi), %r14
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-NOBMI-NEXT:    shrq %cl, %rax
+; X64-NOBMI-NEXT:    shrq %cl, %r14
 ; X64-NOBMI-NEXT:    negl %edx
+; X64-NOBMI-NEXT:    movq $-1, %rbx
 ; X64-NOBMI-NEXT:    movl %edx, %ecx
-; X64-NOBMI-NEXT:    shlq %cl, %rax
-; X64-NOBMI-NEXT:    shrq %cl, %rax
+; X64-NOBMI-NEXT:    shrq %cl, %rbx
+; X64-NOBMI-NEXT:    movq %rbx, %rdi
+; X64-NOBMI-NEXT:    callq use64
+; X64-NOBMI-NEXT:    andq %r14, %rbx
+; X64-NOBMI-NEXT:    movq %rbx, %rax
+; X64-NOBMI-NEXT:    addq $8, %rsp
+; X64-NOBMI-NEXT:    popq %rbx
+; X64-NOBMI-NEXT:    popq %r14
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bextr64_c2_load:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    pushq %r14
+; X64-BMI1NOTBM-NEXT:    pushq %rbx
+; X64-BMI1NOTBM-NEXT:    pushq %rax
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
+; X64-BMI1NOTBM-NEXT:    movq (%rdi), %r14
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
-; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rax, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %r14
+; X64-BMI1NOTBM-NEXT:    negl %edx
+; X64-BMI1NOTBM-NEXT:    movq $-1, %rbx
+; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rbx, %rdi
+; X64-BMI1NOTBM-NEXT:    callq use64
+; X64-BMI1NOTBM-NEXT:    andq %r14, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rbx, %rax
+; X64-BMI1NOTBM-NEXT:    addq $8, %rsp
+; X64-BMI1NOTBM-NEXT:    popq %rbx
+; X64-BMI1NOTBM-NEXT:    popq %r14
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_c2_load:
 ; X64-BMI1BMI2:       # %bb.0:
-; X64-BMI1BMI2-NEXT:    shrxq %rsi, (%rdi), %rax
-; X64-BMI1BMI2-NEXT:    bzhiq %rdx, %rax, %rax
+; X64-BMI1BMI2-NEXT:    pushq %r14
+; X64-BMI1BMI2-NEXT:    pushq %rbx
+; X64-BMI1BMI2-NEXT:    pushq %rax
+; X64-BMI1BMI2-NEXT:    movq %rdx, %rbx
+; X64-BMI1BMI2-NEXT:    shrxq %rsi, (%rdi), %r14
+; X64-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X64-BMI1BMI2-NEXT:    negl %eax
+; X64-BMI1BMI2-NEXT:    movq $-1, %rcx
+; X64-BMI1BMI2-NEXT:    shrxq %rax, %rcx, %rdi
+; X64-BMI1BMI2-NEXT:    callq use64
+; X64-BMI1BMI2-NEXT:    bzhiq %rbx, %r14, %rax
+; X64-BMI1BMI2-NEXT:    addq $8, %rsp
+; X64-BMI1BMI2-NEXT:    popq %rbx
+; X64-BMI1BMI2-NEXT:    popq %r14
 ; X64-BMI1BMI2-NEXT:    retq
   %val = load i64, i64* %w
   %shifted = lshr i64 %val, %numskipbits
   %numhighbits = sub i64 64, %numlowbits
   %mask = lshr i64 -1, %numhighbits
+  call void @use64(i64 %mask)
   %masked = and i64 %mask, %shifted
   ret i64 %masked
 }
@@ -3649,8 +4277,11 @@ define i64 @bextr64_c2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 define i64 @bextr64_c3_load_indexzext(i64* %w, i8 %numskipbits, i8 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr64_c3_load_indexzext:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %ebp
+; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    subl $12, %esp
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    movl (%eax), %esi
@@ -3666,26 +4297,39 @@ define i64 @bextr64_c3_load_indexzext(i64* %w, i8 %numskipbits, i8 %numlowbits)
 ; X86-NOBMI-NEXT:  .LBB33_2:
 ; X86-NOBMI-NEXT:    movb $64, %cl
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    movl $-1, %eax
-; X86-NOBMI-NEXT:    movl $-1, %edx
-; X86-NOBMI-NEXT:    shrl %cl, %edx
-; X86-NOBMI-NEXT:    shrdl %cl, %eax, %eax
+; X86-NOBMI-NEXT:    movl $-1, %ebp
+; X86-NOBMI-NEXT:    movl $-1, %ebx
+; X86-NOBMI-NEXT:    shrl %cl, %ebx
+; X86-NOBMI-NEXT:    shrdl %cl, %ebp, %ebp
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB33_4
 ; X86-NOBMI-NEXT:  # %bb.3:
-; X86-NOBMI-NEXT:    movl %edx, %eax
-; X86-NOBMI-NEXT:    xorl %edx, %edx
+; X86-NOBMI-NEXT:    movl %ebx, %ebp
+; X86-NOBMI-NEXT:    xorl %ebx, %ebx
 ; X86-NOBMI-NEXT:  .LBB33_4:
-; X86-NOBMI-NEXT:    andl %edi, %edx
-; X86-NOBMI-NEXT:    andl %esi, %eax
+; X86-NOBMI-NEXT:    subl $8, %esp
+; X86-NOBMI-NEXT:    pushl %ebx
+; X86-NOBMI-NEXT:    pushl %ebp
+; X86-NOBMI-NEXT:    calll use64
+; X86-NOBMI-NEXT:    addl $16, %esp
+; X86-NOBMI-NEXT:    andl %ebp, %esi
+; X86-NOBMI-NEXT:    andl %ebx, %edi
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    movl %edi, %edx
+; X86-NOBMI-NEXT:    addl $12, %esp
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
+; X86-NOBMI-NEXT:    popl %ebx
+; X86-NOBMI-NEXT:    popl %ebp
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bextr64_c3_load_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    pushl %ebp
+; X86-BMI1NOTBM-NEXT:    pushl %ebx
 ; X86-BMI1NOTBM-NEXT:    pushl %edi
 ; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    subl $12, %esp
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    movl (%eax), %esi
@@ -3701,26 +4345,39 @@ define i64 @bextr64_c3_load_indexzext(i64* %w, i8 %numskipbits, i8 %numlowbits)
 ; X86-BMI1NOTBM-NEXT:  .LBB33_2:
 ; X86-BMI1NOTBM-NEXT:    movb $64, %cl
 ; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl $-1, %eax
-; X86-BMI1NOTBM-NEXT:    movl $-1, %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %eax
+; X86-BMI1NOTBM-NEXT:    movl $-1, %ebp
+; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %ebx
+; X86-BMI1NOTBM-NEXT:    shrdl %cl, %ebp, %ebp
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    je .LBB33_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
-; X86-BMI1NOTBM-NEXT:    movl %edx, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
+; X86-BMI1NOTBM-NEXT:    movl %ebx, %ebp
+; X86-BMI1NOTBM-NEXT:    xorl %ebx, %ebx
 ; X86-BMI1NOTBM-NEXT:  .LBB33_4:
-; X86-BMI1NOTBM-NEXT:    andl %edi, %edx
-; X86-BMI1NOTBM-NEXT:    andl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    subl $8, %esp
+; X86-BMI1NOTBM-NEXT:    pushl %ebx
+; X86-BMI1NOTBM-NEXT:    pushl %ebp
+; X86-BMI1NOTBM-NEXT:    calll use64
+; X86-BMI1NOTBM-NEXT:    addl $16, %esp
+; X86-BMI1NOTBM-NEXT:    andl %ebp, %esi
+; X86-BMI1NOTBM-NEXT:    andl %ebx, %edi
+; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    movl %edi, %edx
+; X86-BMI1NOTBM-NEXT:    addl $12, %esp
 ; X86-BMI1NOTBM-NEXT:    popl %esi
 ; X86-BMI1NOTBM-NEXT:    popl %edi
+; X86-BMI1NOTBM-NEXT:    popl %ebx
+; X86-BMI1NOTBM-NEXT:    popl %ebp
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr64_c3_load_indexzext:
 ; X86-BMI1BMI2:       # %bb.0:
+; X86-BMI1BMI2-NEXT:    pushl %ebp
+; X86-BMI1BMI2-NEXT:    pushl %ebx
 ; X86-BMI1BMI2-NEXT:    pushl %edi
 ; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    subl $12, %esp
 ; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1BMI2-NEXT:    movl (%eax), %esi
@@ -3735,50 +4392,92 @@ define i64 @bextr64_c3_load_indexzext(i64* %w, i8 %numskipbits, i8 %numlowbits)
 ; X86-BMI1BMI2-NEXT:  .LBB33_2:
 ; X86-BMI1BMI2-NEXT:    movb $64, %cl
 ; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1BMI2-NEXT:    movl $-1, %eax
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %edx
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %eax
+; X86-BMI1BMI2-NEXT:    movl $-1, %ebx
+; X86-BMI1BMI2-NEXT:    shrxl %ecx, %ebx, %ebp
+; X86-BMI1BMI2-NEXT:    shrdl %cl, %ebx, %ebx
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
 ; X86-BMI1BMI2-NEXT:    je .LBB33_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
-; X86-BMI1BMI2-NEXT:    movl %edx, %eax
-; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI1BMI2-NEXT:    movl %ebp, %ebx
+; X86-BMI1BMI2-NEXT:    xorl %ebp, %ebp
 ; X86-BMI1BMI2-NEXT:  .LBB33_4:
-; X86-BMI1BMI2-NEXT:    andl %edi, %edx
-; X86-BMI1BMI2-NEXT:    andl %esi, %eax
+; X86-BMI1BMI2-NEXT:    subl $8, %esp
+; X86-BMI1BMI2-NEXT:    pushl %ebp
+; X86-BMI1BMI2-NEXT:    pushl %ebx
+; X86-BMI1BMI2-NEXT:    calll use64
+; X86-BMI1BMI2-NEXT:    addl $16, %esp
+; X86-BMI1BMI2-NEXT:    andl %ebx, %esi
+; X86-BMI1BMI2-NEXT:    andl %ebp, %edi
+; X86-BMI1BMI2-NEXT:    movl %esi, %eax
+; X86-BMI1BMI2-NEXT:    movl %edi, %edx
+; X86-BMI1BMI2-NEXT:    addl $12, %esp
 ; X86-BMI1BMI2-NEXT:    popl %esi
 ; X86-BMI1BMI2-NEXT:    popl %edi
+; X86-BMI1BMI2-NEXT:    popl %ebx
+; X86-BMI1BMI2-NEXT:    popl %ebp
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr64_c3_load_indexzext:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %r14
+; X64-NOBMI-NEXT:    pushq %rbx
+; X64-NOBMI-NEXT:    pushq %rax
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
-; X64-NOBMI-NEXT:    movq (%rdi), %rax
+; X64-NOBMI-NEXT:    movq (%rdi), %r14
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NOBMI-NEXT:    shrq %cl, %rax
+; X64-NOBMI-NEXT:    shrq %cl, %r14
 ; X64-NOBMI-NEXT:    negb %dl
+; X64-NOBMI-NEXT:    movq $-1, %rbx
 ; X64-NOBMI-NEXT:    movl %edx, %ecx
-; X64-NOBMI-NEXT:    shlq %cl, %rax
-; X64-NOBMI-NEXT:    shrq %cl, %rax
+; X64-NOBMI-NEXT:    shrq %cl, %rbx
+; X64-NOBMI-NEXT:    movq %rbx, %rdi
+; X64-NOBMI-NEXT:    callq use64
+; X64-NOBMI-NEXT:    andq %r14, %rbx
+; X64-NOBMI-NEXT:    movq %rbx, %rax
+; X64-NOBMI-NEXT:    addq $8, %rsp
+; X64-NOBMI-NEXT:    popq %rbx
+; X64-NOBMI-NEXT:    popq %r14
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bextr64_c3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    # kill: def $edx killed $edx def $rdx
+; X64-BMI1NOTBM-NEXT:    pushq %r14
+; X64-BMI1NOTBM-NEXT:    pushq %rbx
+; X64-BMI1NOTBM-NEXT:    pushq %rax
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
+; X64-BMI1NOTBM-NEXT:    movq (%rdi), %r14
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
-; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rax, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %r14
+; X64-BMI1NOTBM-NEXT:    negb %dl
+; X64-BMI1NOTBM-NEXT:    movq $-1, %rbx
+; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rbx, %rdi
+; X64-BMI1NOTBM-NEXT:    callq use64
+; X64-BMI1NOTBM-NEXT:    andq %r14, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rbx, %rax
+; X64-BMI1NOTBM-NEXT:    addq $8, %rsp
+; X64-BMI1NOTBM-NEXT:    popq %rbx
+; X64-BMI1NOTBM-NEXT:    popq %r14
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_c3_load_indexzext:
 ; X64-BMI1BMI2:       # %bb.0:
-; X64-BMI1BMI2-NEXT:    # kill: def $edx killed $edx def $rdx
+; X64-BMI1BMI2-NEXT:    pushq %r14
+; X64-BMI1BMI2-NEXT:    pushq %rbx
+; X64-BMI1BMI2-NEXT:    pushq %rax
+; X64-BMI1BMI2-NEXT:    movl %edx, %ebx
 ; X64-BMI1BMI2-NEXT:    # kill: def $esi killed $esi def $rsi
-; X64-BMI1BMI2-NEXT:    shrxq %rsi, (%rdi), %rax
-; X64-BMI1BMI2-NEXT:    bzhiq %rdx, %rax, %rax
+; X64-BMI1BMI2-NEXT:    shrxq %rsi, (%rdi), %r14
+; X64-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X64-BMI1BMI2-NEXT:    negb %al
+; X64-BMI1BMI2-NEXT:    movq $-1, %rcx
+; X64-BMI1BMI2-NEXT:    shrxq %rax, %rcx, %rdi
+; X64-BMI1BMI2-NEXT:    callq use64
+; X64-BMI1BMI2-NEXT:    bzhiq %rbx, %r14, %rax
+; X64-BMI1BMI2-NEXT:    addq $8, %rsp
+; X64-BMI1BMI2-NEXT:    popq %rbx
+; X64-BMI1BMI2-NEXT:    popq %r14
 ; X64-BMI1BMI2-NEXT:    retq
   %val = load i64, i64* %w
   %skip = zext i8 %numskipbits to i64
@@ -3786,6 +4485,7 @@ define i64 @bextr64_c3_load_indexzext(i64* %w, i8 %numskipbits, i8 %numlowbits)
   %numhighbits = sub i8 64, %numlowbits
   %sh_prom = zext i8 %numhighbits to i64
   %mask = lshr i64 -1, %sh_prom
+  call void @use64(i64 %mask)
   %masked = and i64 %mask, %shifted
   ret i64 %masked
 }
@@ -3793,133 +4493,208 @@ define i64 @bextr64_c3_load_indexzext(i64* %w, i8 %numskipbits, i8 %numlowbits)
 define i64 @bextr64_c4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr64_c4_commutative:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %ebp
+; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    subl $12, %esp
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI-NEXT:    movl %esi, %edx
-; X86-NOBMI-NEXT:    shrl %cl, %edx
-; X86-NOBMI-NEXT:    shrdl %cl, %esi, %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    movl %eax, %edi
+; X86-NOBMI-NEXT:    shrl %cl, %edi
+; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB34_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %edx, %eax
-; X86-NOBMI-NEXT:    xorl %edx, %edx
+; X86-NOBMI-NEXT:    movl %edi, %esi
+; X86-NOBMI-NEXT:    xorl %edi, %edi
 ; X86-NOBMI-NEXT:  .LBB34_2:
 ; X86-NOBMI-NEXT:    movl $64, %ecx
 ; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    movl $-1, %esi
-; X86-NOBMI-NEXT:    movl $-1, %edi
-; X86-NOBMI-NEXT:    shrl %cl, %edi
-; X86-NOBMI-NEXT:    shrdl %cl, %esi, %esi
+; X86-NOBMI-NEXT:    movl $-1, %ebp
+; X86-NOBMI-NEXT:    movl $-1, %ebx
+; X86-NOBMI-NEXT:    shrl %cl, %ebx
+; X86-NOBMI-NEXT:    shrdl %cl, %ebp, %ebp
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB34_4
 ; X86-NOBMI-NEXT:  # %bb.3:
-; X86-NOBMI-NEXT:    movl %edi, %esi
-; X86-NOBMI-NEXT:    xorl %edi, %edi
+; X86-NOBMI-NEXT:    movl %ebx, %ebp
+; X86-NOBMI-NEXT:    xorl %ebx, %ebx
 ; X86-NOBMI-NEXT:  .LBB34_4:
-; X86-NOBMI-NEXT:    andl %edi, %edx
-; X86-NOBMI-NEXT:    andl %esi, %eax
+; X86-NOBMI-NEXT:    subl $8, %esp
+; X86-NOBMI-NEXT:    pushl %ebx
+; X86-NOBMI-NEXT:    pushl %ebp
+; X86-NOBMI-NEXT:    calll use64
+; X86-NOBMI-NEXT:    addl $16, %esp
+; X86-NOBMI-NEXT:    andl %ebp, %esi
+; X86-NOBMI-NEXT:    andl %ebx, %edi
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    movl %edi, %edx
+; X86-NOBMI-NEXT:    addl $12, %esp
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
+; X86-NOBMI-NEXT:    popl %ebx
+; X86-NOBMI-NEXT:    popl %ebp
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bextr64_c4_commutative:
 ; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    pushl %ebp
+; X86-BMI1NOTBM-NEXT:    pushl %ebx
 ; X86-BMI1NOTBM-NEXT:    pushl %edi
 ; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    subl $12, %esp
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI1NOTBM-NEXT:    movl %esi, %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shrdl %cl, %esi, %eax
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    movl %eax, %edi
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
+; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    je .LBB34_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
-; X86-BMI1NOTBM-NEXT:    movl %edx, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
+; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
+; X86-BMI1NOTBM-NEXT:    xorl %edi, %edi
 ; X86-BMI1NOTBM-NEXT:  .LBB34_2:
 ; X86-BMI1NOTBM-NEXT:    movl $64, %ecx
 ; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
-; X86-BMI1NOTBM-NEXT:    movl $-1, %edi
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
-; X86-BMI1NOTBM-NEXT:    shrdl %cl, %esi, %esi
+; X86-BMI1NOTBM-NEXT:    movl $-1, %ebp
+; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %ebx
+; X86-BMI1NOTBM-NEXT:    shrdl %cl, %ebp, %ebp
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    je .LBB34_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
-; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
-; X86-BMI1NOTBM-NEXT:    xorl %edi, %edi
+; X86-BMI1NOTBM-NEXT:    movl %ebx, %ebp
+; X86-BMI1NOTBM-NEXT:    xorl %ebx, %ebx
 ; X86-BMI1NOTBM-NEXT:  .LBB34_4:
-; X86-BMI1NOTBM-NEXT:    andl %edi, %edx
-; X86-BMI1NOTBM-NEXT:    andl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    subl $8, %esp
+; X86-BMI1NOTBM-NEXT:    pushl %ebx
+; X86-BMI1NOTBM-NEXT:    pushl %ebp
+; X86-BMI1NOTBM-NEXT:    calll use64
+; X86-BMI1NOTBM-NEXT:    addl $16, %esp
+; X86-BMI1NOTBM-NEXT:    andl %ebp, %esi
+; X86-BMI1NOTBM-NEXT:    andl %ebx, %edi
+; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    movl %edi, %edx
+; X86-BMI1NOTBM-NEXT:    addl $12, %esp
 ; X86-BMI1NOTBM-NEXT:    popl %esi
 ; X86-BMI1NOTBM-NEXT:    popl %edi
+; X86-BMI1NOTBM-NEXT:    popl %ebx
+; X86-BMI1NOTBM-NEXT:    popl %ebp
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr64_c4_commutative:
 ; X86-BMI1BMI2:       # %bb.0:
+; X86-BMI1BMI2-NEXT:    pushl %ebp
+; X86-BMI1BMI2-NEXT:    pushl %ebx
 ; X86-BMI1BMI2-NEXT:    pushl %edi
 ; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    subl $12, %esp
 ; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %edx, %eax
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, %edx, %edx
+; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %esi
+; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %edi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
 ; X86-BMI1BMI2-NEXT:    je .LBB34_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
-; X86-BMI1BMI2-NEXT:    movl %edx, %eax
-; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI1BMI2-NEXT:    movl %edi, %esi
+; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
 ; X86-BMI1BMI2-NEXT:  .LBB34_2:
 ; X86-BMI1BMI2-NEXT:    movl $64, %ecx
 ; X86-BMI1BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1BMI2-NEXT:    movl $-1, %esi
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, %esi, %edi
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %esi, %esi
+; X86-BMI1BMI2-NEXT:    movl $-1, %ebx
+; X86-BMI1BMI2-NEXT:    shrxl %ecx, %ebx, %ebp
+; X86-BMI1BMI2-NEXT:    shrdl %cl, %ebx, %ebx
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
 ; X86-BMI1BMI2-NEXT:    je .LBB34_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
-; X86-BMI1BMI2-NEXT:    movl %edi, %esi
-; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
+; X86-BMI1BMI2-NEXT:    movl %ebp, %ebx
+; X86-BMI1BMI2-NEXT:    xorl %ebp, %ebp
 ; X86-BMI1BMI2-NEXT:  .LBB34_4:
-; X86-BMI1BMI2-NEXT:    andl %edi, %edx
-; X86-BMI1BMI2-NEXT:    andl %esi, %eax
+; X86-BMI1BMI2-NEXT:    subl $8, %esp
+; X86-BMI1BMI2-NEXT:    pushl %ebp
+; X86-BMI1BMI2-NEXT:    pushl %ebx
+; X86-BMI1BMI2-NEXT:    calll use64
+; X86-BMI1BMI2-NEXT:    addl $16, %esp
+; X86-BMI1BMI2-NEXT:    andl %ebx, %esi
+; X86-BMI1BMI2-NEXT:    andl %ebp, %edi
+; X86-BMI1BMI2-NEXT:    movl %esi, %eax
+; X86-BMI1BMI2-NEXT:    movl %edi, %edx
+; X86-BMI1BMI2-NEXT:    addl $12, %esp
 ; X86-BMI1BMI2-NEXT:    popl %esi
 ; X86-BMI1BMI2-NEXT:    popl %edi
+; X86-BMI1BMI2-NEXT:    popl %ebx
+; X86-BMI1BMI2-NEXT:    popl %ebp
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr64_c4_commutative:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %r14
+; X64-NOBMI-NEXT:    pushq %rbx
+; X64-NOBMI-NEXT:    pushq %rax
 ; X64-NOBMI-NEXT:    movq %rsi, %rcx
-; X64-NOBMI-NEXT:    movq %rdi, %rax
+; X64-NOBMI-NEXT:    movq %rdi, %r14
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-NOBMI-NEXT:    shrq %cl, %rax
+; X64-NOBMI-NEXT:    shrq %cl, %r14
 ; X64-NOBMI-NEXT:    negl %edx
+; X64-NOBMI-NEXT:    movq $-1, %rbx
 ; X64-NOBMI-NEXT:    movl %edx, %ecx
-; X64-NOBMI-NEXT:    shlq %cl, %rax
-; X64-NOBMI-NEXT:    shrq %cl, %rax
+; X64-NOBMI-NEXT:    shrq %cl, %rbx
+; X64-NOBMI-NEXT:    movq %rbx, %rdi
+; X64-NOBMI-NEXT:    callq use64
+; X64-NOBMI-NEXT:    andq %r14, %rbx
+; X64-NOBMI-NEXT:    movq %rbx, %rax
+; X64-NOBMI-NEXT:    addq $8, %rsp
+; X64-NOBMI-NEXT:    popq %rbx
+; X64-NOBMI-NEXT:    popq %r14
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bextr64_c4_commutative:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    pushq %r14
+; X64-BMI1NOTBM-NEXT:    pushq %rbx
+; X64-BMI1NOTBM-NEXT:    pushq %rax
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
+; X64-BMI1NOTBM-NEXT:    movq %rdi, %r14
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
-; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
-; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %r14
+; X64-BMI1NOTBM-NEXT:    negl %edx
+; X64-BMI1NOTBM-NEXT:    movq $-1, %rbx
+; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rbx, %rdi
+; X64-BMI1NOTBM-NEXT:    callq use64
+; X64-BMI1NOTBM-NEXT:    andq %r14, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rbx, %rax
+; X64-BMI1NOTBM-NEXT:    addq $8, %rsp
+; X64-BMI1NOTBM-NEXT:    popq %rbx
+; X64-BMI1NOTBM-NEXT:    popq %r14
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_c4_commutative:
 ; X64-BMI1BMI2:       # %bb.0:
-; X64-BMI1BMI2-NEXT:    shrxq %rsi, %rdi, %rax
-; X64-BMI1BMI2-NEXT:    bzhiq %rdx, %rax, %rax
+; X64-BMI1BMI2-NEXT:    pushq %r14
+; X64-BMI1BMI2-NEXT:    pushq %rbx
+; X64-BMI1BMI2-NEXT:    pushq %rax
+; X64-BMI1BMI2-NEXT:    movq %rdx, %rbx
+; X64-BMI1BMI2-NEXT:    shrxq %rsi, %rdi, %r14
+; X64-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X64-BMI1BMI2-NEXT:    negl %eax
+; X64-BMI1BMI2-NEXT:    movq $-1, %rcx
+; X64-BMI1BMI2-NEXT:    shrxq %rax, %rcx, %rdi
+; X64-BMI1BMI2-NEXT:    callq use64
+; X64-BMI1BMI2-NEXT:    bzhiq %rbx, %r14, %rax
+; X64-BMI1BMI2-NEXT:    addq $8, %rsp
+; X64-BMI1BMI2-NEXT:    popq %rbx
+; X64-BMI1BMI2-NEXT:    popq %r14
 ; X64-BMI1BMI2-NEXT:    retq
   %shifted = lshr i64 %val, %numskipbits
   %numhighbits = sub i64 64, %numlowbits
   %mask = lshr i64 -1, %numhighbits
+  call void @use64(i64 %mask)
   %masked = and i64 %shifted, %mask ; swapped order
   ret i64 %masked
 }
@@ -3932,37 +4707,40 @@ define i64 @bextr64_c5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
 ; X86-NOBMI-NEXT:    subl $12, %esp
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl %esi, %ebx
-; X86-NOBMI-NEXT:    movl %eax, %ecx
-; X86-NOBMI-NEXT:    shrl %cl, %ebx
-; X86-NOBMI-NEXT:    shrdl %cl, %esi, %edx
-; X86-NOBMI-NEXT:    testb $32, %al
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl %eax, %edi
+; X86-NOBMI-NEXT:    shrl %cl, %edi
+; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
+; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB35_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %ebx, %edx
-; X86-NOBMI-NEXT:    xorl %ebx, %ebx
+; X86-NOBMI-NEXT:    movl %edi, %esi
+; X86-NOBMI-NEXT:    xorl %edi, %edi
 ; X86-NOBMI-NEXT:  .LBB35_2:
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NOBMI-NEXT:    movl $64, %ecx
 ; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    movl $-1, %esi
-; X86-NOBMI-NEXT:    movl $-1, %edi
-; X86-NOBMI-NEXT:    shrl %cl, %edi
-; X86-NOBMI-NEXT:    shrdl %cl, %esi, %esi
+; X86-NOBMI-NEXT:    movl $-1, %ebx
+; X86-NOBMI-NEXT:    movl $-1, %ebp
+; X86-NOBMI-NEXT:    shrl %cl, %ebp
+; X86-NOBMI-NEXT:    shrdl %cl, %ebx, %ebx
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB35_4
 ; X86-NOBMI-NEXT:  # %bb.3:
-; X86-NOBMI-NEXT:    movl %edi, %esi
-; X86-NOBMI-NEXT:    xorl %edi, %edi
+; X86-NOBMI-NEXT:    movl %ebp, %ebx
+; X86-NOBMI-NEXT:    xorl %ebp, %ebp
 ; X86-NOBMI-NEXT:  .LBB35_4:
-; X86-NOBMI-NEXT:    andl %ebx, %edi
-; X86-NOBMI-NEXT:    andl %edx, %esi
 ; X86-NOBMI-NEXT:    subl $8, %esp
 ; X86-NOBMI-NEXT:    pushl %ebp
-; X86-NOBMI-NEXT:    pushl %eax
+; X86-NOBMI-NEXT:    pushl %ebx
+; X86-NOBMI-NEXT:    calll use64
+; X86-NOBMI-NEXT:    addl $16, %esp
+; X86-NOBMI-NEXT:    andl %ebx, %esi
+; X86-NOBMI-NEXT:    andl %ebp, %edi
+; X86-NOBMI-NEXT:    subl $8, %esp
+; X86-NOBMI-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOBMI-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NOBMI-NEXT:    calll use64
 ; X86-NOBMI-NEXT:    addl $16, %esp
 ; X86-NOBMI-NEXT:    movl %esi, %eax
@@ -3981,37 +4759,40 @@ define i64 @bextr64_c5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-BMI1NOTBM-NEXT:    pushl %edi
 ; X86-BMI1NOTBM-NEXT:    pushl %esi
 ; X86-BMI1NOTBM-NEXT:    subl $12, %esp
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl %esi, %ebx
-; X86-BMI1NOTBM-NEXT:    movl %eax, %ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %ebx
-; X86-BMI1NOTBM-NEXT:    shrdl %cl, %esi, %edx
-; X86-BMI1NOTBM-NEXT:    testb $32, %al
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    movl %eax, %edi
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
+; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %esi
+; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    je .LBB35_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
-; X86-BMI1NOTBM-NEXT:    movl %ebx, %edx
-; X86-BMI1NOTBM-NEXT:    xorl %ebx, %ebx
+; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
+; X86-BMI1NOTBM-NEXT:    xorl %edi, %edi
 ; X86-BMI1NOTBM-NEXT:  .LBB35_2:
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-BMI1NOTBM-NEXT:    movl $64, %ecx
 ; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
-; X86-BMI1NOTBM-NEXT:    movl $-1, %edi
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
-; X86-BMI1NOTBM-NEXT:    shrdl %cl, %esi, %esi
+; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
+; X86-BMI1NOTBM-NEXT:    movl $-1, %ebp
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %ebp
+; X86-BMI1NOTBM-NEXT:    shrdl %cl, %ebx, %ebx
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    je .LBB35_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
-; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
-; X86-BMI1NOTBM-NEXT:    xorl %edi, %edi
+; X86-BMI1NOTBM-NEXT:    movl %ebp, %ebx
+; X86-BMI1NOTBM-NEXT:    xorl %ebp, %ebp
 ; X86-BMI1NOTBM-NEXT:  .LBB35_4:
-; X86-BMI1NOTBM-NEXT:    andl %ebx, %edi
-; X86-BMI1NOTBM-NEXT:    andl %edx, %esi
 ; X86-BMI1NOTBM-NEXT:    subl $8, %esp
 ; X86-BMI1NOTBM-NEXT:    pushl %ebp
-; X86-BMI1NOTBM-NEXT:    pushl %eax
+; X86-BMI1NOTBM-NEXT:    pushl %ebx
+; X86-BMI1NOTBM-NEXT:    calll use64
+; X86-BMI1NOTBM-NEXT:    addl $16, %esp
+; X86-BMI1NOTBM-NEXT:    andl %ebx, %esi
+; X86-BMI1NOTBM-NEXT:    andl %ebp, %edi
+; X86-BMI1NOTBM-NEXT:    subl $8, %esp
+; X86-BMI1NOTBM-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-BMI1NOTBM-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-BMI1NOTBM-NEXT:    calll use64
 ; X86-BMI1NOTBM-NEXT:    addl $16, %esp
 ; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
@@ -4030,35 +4811,38 @@ define i64 @bextr64_c5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-BMI1BMI2-NEXT:    pushl %edi
 ; X86-BMI1BMI2-NEXT:    pushl %esi
 ; X86-BMI1BMI2-NEXT:    subl $12, %esp
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1BMI2-NEXT:    movl %eax, %ecx
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %esi, %edx
-; X86-BMI1BMI2-NEXT:    shrxl %eax, %esi, %ebx
-; X86-BMI1BMI2-NEXT:    testb $32, %al
+; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %esi
+; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %edi
+; X86-BMI1BMI2-NEXT:    testb $32, %cl
 ; X86-BMI1BMI2-NEXT:    je .LBB35_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
-; X86-BMI1BMI2-NEXT:    movl %ebx, %edx
-; X86-BMI1BMI2-NEXT:    xorl %ebx, %ebx
+; X86-BMI1BMI2-NEXT:    movl %edi, %esi
+; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
 ; X86-BMI1BMI2-NEXT:  .LBB35_2:
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-BMI1BMI2-NEXT:    movl $64, %ecx
 ; X86-BMI1BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1BMI2-NEXT:    movl $-1, %esi
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, %esi, %edi
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %esi, %esi
+; X86-BMI1BMI2-NEXT:    movl $-1, %ebp
+; X86-BMI1BMI2-NEXT:    shrxl %ecx, %ebp, %ebx
+; X86-BMI1BMI2-NEXT:    shrdl %cl, %ebp, %ebp
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
 ; X86-BMI1BMI2-NEXT:    je .LBB35_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
-; X86-BMI1BMI2-NEXT:    movl %edi, %esi
-; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
+; X86-BMI1BMI2-NEXT:    movl %ebx, %ebp
+; X86-BMI1BMI2-NEXT:    xorl %ebx, %ebx
 ; X86-BMI1BMI2-NEXT:  .LBB35_4:
-; X86-BMI1BMI2-NEXT:    andl %ebx, %edi
-; X86-BMI1BMI2-NEXT:    andl %edx, %esi
 ; X86-BMI1BMI2-NEXT:    subl $8, %esp
+; X86-BMI1BMI2-NEXT:    pushl %ebx
 ; X86-BMI1BMI2-NEXT:    pushl %ebp
-; X86-BMI1BMI2-NEXT:    pushl %eax
+; X86-BMI1BMI2-NEXT:    calll use64
+; X86-BMI1BMI2-NEXT:    addl $16, %esp
+; X86-BMI1BMI2-NEXT:    andl %ebp, %esi
+; X86-BMI1BMI2-NEXT:    andl %ebx, %edi
+; X86-BMI1BMI2-NEXT:    subl $8, %esp
+; X86-BMI1BMI2-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-BMI1BMI2-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-BMI1BMI2-NEXT:    calll use64
 ; X86-BMI1BMI2-NEXT:    addl $16, %esp
 ; X86-BMI1BMI2-NEXT:    movl %esi, %eax
@@ -4072,46 +4856,77 @@ define i64 @bextr64_c5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ;
 ; X64-NOBMI-LABEL: bextr64_c5_skipextrauses:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %r15
+; X64-NOBMI-NEXT:    pushq %r14
 ; X64-NOBMI-NEXT:    pushq %rbx
-; X64-NOBMI-NEXT:    movq %rdi, %rbx
-; X64-NOBMI-NEXT:    movl %esi, %ecx
-; X64-NOBMI-NEXT:    shrq %cl, %rbx
+; X64-NOBMI-NEXT:    movq %rsi, %r14
+; X64-NOBMI-NEXT:    movq %rdi, %r15
+; X64-NOBMI-NEXT:    movl %r14d, %ecx
+; X64-NOBMI-NEXT:    shrq %cl, %r15
 ; X64-NOBMI-NEXT:    negl %edx
+; X64-NOBMI-NEXT:    movq $-1, %rbx
 ; X64-NOBMI-NEXT:    movl %edx, %ecx
-; X64-NOBMI-NEXT:    shlq %cl, %rbx
 ; X64-NOBMI-NEXT:    shrq %cl, %rbx
-; X64-NOBMI-NEXT:    movq %rsi, %rdi
+; X64-NOBMI-NEXT:    movq %rbx, %rdi
+; X64-NOBMI-NEXT:    callq use64
+; X64-NOBMI-NEXT:    andq %r15, %rbx
+; X64-NOBMI-NEXT:    movq %r14, %rdi
 ; X64-NOBMI-NEXT:    callq use64
 ; X64-NOBMI-NEXT:    movq %rbx, %rax
 ; X64-NOBMI-NEXT:    popq %rbx
+; X64-NOBMI-NEXT:    popq %r14
+; X64-NOBMI-NEXT:    popq %r15
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bextr64_c5_skipextrauses:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    pushq %r15
+; X64-BMI1NOTBM-NEXT:    pushq %r14
 ; X64-BMI1NOTBM-NEXT:    pushq %rbx
-; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
-; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
-; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rbx
-; X64-BMI1NOTBM-NEXT:    movq %rsi, %rdi
+; X64-BMI1NOTBM-NEXT:    movq %rsi, %r14
+; X64-BMI1NOTBM-NEXT:    movq %rdi, %r15
+; X64-BMI1NOTBM-NEXT:    movl %r14d, %ecx
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %r15
+; X64-BMI1NOTBM-NEXT:    negl %edx
+; X64-BMI1NOTBM-NEXT:    movq $-1, %rbx
+; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rbx, %rdi
+; X64-BMI1NOTBM-NEXT:    callq use64
+; X64-BMI1NOTBM-NEXT:    andq %r15, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %r14, %rdi
 ; X64-BMI1NOTBM-NEXT:    callq use64
 ; X64-BMI1NOTBM-NEXT:    movq %rbx, %rax
 ; X64-BMI1NOTBM-NEXT:    popq %rbx
+; X64-BMI1NOTBM-NEXT:    popq %r14
+; X64-BMI1NOTBM-NEXT:    popq %r15
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_c5_skipextrauses:
 ; X64-BMI1BMI2:       # %bb.0:
+; X64-BMI1BMI2-NEXT:    pushq %r15
+; X64-BMI1BMI2-NEXT:    pushq %r14
 ; X64-BMI1BMI2-NEXT:    pushq %rbx
-; X64-BMI1BMI2-NEXT:    shrxq %rsi, %rdi, %rax
-; X64-BMI1BMI2-NEXT:    bzhiq %rdx, %rax, %rbx
-; X64-BMI1BMI2-NEXT:    movq %rsi, %rdi
+; X64-BMI1BMI2-NEXT:    movq %rdx, %rbx
+; X64-BMI1BMI2-NEXT:    movq %rsi, %r14
+; X64-BMI1BMI2-NEXT:    shrxq %rsi, %rdi, %r15
+; X64-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X64-BMI1BMI2-NEXT:    negl %eax
+; X64-BMI1BMI2-NEXT:    movq $-1, %rcx
+; X64-BMI1BMI2-NEXT:    shrxq %rax, %rcx, %rdi
+; X64-BMI1BMI2-NEXT:    callq use64
+; X64-BMI1BMI2-NEXT:    bzhiq %rbx, %r15, %rbx
+; X64-BMI1BMI2-NEXT:    movq %r14, %rdi
 ; X64-BMI1BMI2-NEXT:    callq use64
 ; X64-BMI1BMI2-NEXT:    movq %rbx, %rax
 ; X64-BMI1BMI2-NEXT:    popq %rbx
+; X64-BMI1BMI2-NEXT:    popq %r14
+; X64-BMI1BMI2-NEXT:    popq %r15
 ; X64-BMI1BMI2-NEXT:    retq
   %shifted = lshr i64 %val, %numskipbits
   %numhighbits = sub i64 64, %numlowbits
   %mask = lshr i64 -1, %numhighbits
+  call void @use64(i64 %mask)
   %masked = and i64 %mask, %shifted
   call void @use64(i64 %numskipbits)
   ret i64 %masked
diff --git a/test/CodeGen/X86/extract-lowbits.ll b/test/CodeGen/X86/extract-lowbits.ll
index 59865538585..8d18f29d332 100644
--- a/test/CodeGen/X86/extract-lowbits.ll
+++ b/test/CodeGen/X86/extract-lowbits.ll
@@ -1428,52 +1428,119 @@ define i64 @bzhi64_b4_commutative(i64 %val, i64 %numlowbits) nounwind {
 ; Pattern c. 32-bit
 ; ---------------------------------------------------------------------------- ;
 
+declare void @use32(i32)
+
 define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi32_c0:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    subl $8, %esp
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    shll %cl, %eax
+; X86-NOBMI-NEXT:    movl $-1, %esi
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NOBMI-NEXT:    shrl %cl, %eax
+; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    movl %esi, (%esp)
+; X86-NOBMI-NEXT:    calll use32
+; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    addl $8, %esp
+; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi32_c0:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    shll $8, %eax
-; X86-BMI1NOTBM-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    subl $8, %esp
+; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
+; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
+; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
+; X86-BMI1NOTBM-NEXT:    movl %esi, (%esp)
+; X86-BMI1NOTBM-NEXT:    calll use32
+; X86-BMI1NOTBM-NEXT:    andl {{[0-9]+}}(%esp), %esi
+; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    addl $8, %esp
+; X86-BMI1NOTBM-NEXT:    popl %esi
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_c0:
 ; X86-BMI1BMI2:       # %bb.0:
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1BMI2-NEXT:    bzhil %eax, {{[0-9]+}}(%esp), %eax
+; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    subl $8, %esp
+; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI1BMI2-NEXT:    movl %esi, %eax
+; X86-BMI1BMI2-NEXT:    negl %eax
+; X86-BMI1BMI2-NEXT:    movl $-1, %ecx
+; X86-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %eax
+; X86-BMI1BMI2-NEXT:    movl %eax, (%esp)
+; X86-BMI1BMI2-NEXT:    calll use32
+; X86-BMI1BMI2-NEXT:    bzhil %esi, {{[0-9]+}}(%esp), %eax
+; X86-BMI1BMI2-NEXT:    addl $8, %esp
+; X86-BMI1BMI2-NEXT:    popl %esi
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi32_c0:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %rbp
+; X64-NOBMI-NEXT:    pushq %rbx
+; X64-NOBMI-NEXT:    pushq %rax
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
-; X64-NOBMI-NEXT:    movl %edi, %eax
+; X64-NOBMI-NEXT:    movl %edi, %ebx
 ; X64-NOBMI-NEXT:    negl %ecx
-; X64-NOBMI-NEXT:    shll %cl, %eax
+; X64-NOBMI-NEXT:    movl $-1, %ebp
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NOBMI-NEXT:    shrl %cl, %eax
+; X64-NOBMI-NEXT:    shrl %cl, %ebp
+; X64-NOBMI-NEXT:    movl %ebp, %edi
+; X64-NOBMI-NEXT:    callq use32
+; X64-NOBMI-NEXT:    andl %ebx, %ebp
+; X64-NOBMI-NEXT:    movl %ebp, %eax
+; X64-NOBMI-NEXT:    addq $8, %rsp
+; X64-NOBMI-NEXT:    popq %rbx
+; X64-NOBMI-NEXT:    popq %rbp
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_c0:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    shll $8, %esi
-; X64-BMI1NOTBM-NEXT:    bextrl %esi, %edi, %eax
+; X64-BMI1NOTBM-NEXT:    pushq %rbp
+; X64-BMI1NOTBM-NEXT:    pushq %rbx
+; X64-BMI1NOTBM-NEXT:    pushq %rax
+; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
+; X64-BMI1NOTBM-NEXT:    movl %edi, %ebx
+; X64-BMI1NOTBM-NEXT:    negl %ecx
+; X64-BMI1NOTBM-NEXT:    movl $-1, %ebp
+; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebp
+; X64-BMI1NOTBM-NEXT:    movl %ebp, %edi
+; X64-BMI1NOTBM-NEXT:    callq use32
+; X64-BMI1NOTBM-NEXT:    andl %ebx, %ebp
+; X64-BMI1NOTBM-NEXT:    movl %ebp, %eax
+; X64-BMI1NOTBM-NEXT:    addq $8, %rsp
+; X64-BMI1NOTBM-NEXT:    popq %rbx
+; X64-BMI1NOTBM-NEXT:    popq %rbp
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_c0:
 ; X64-BMI1BMI2:       # %bb.0:
-; X64-BMI1BMI2-NEXT:    bzhil %esi, %edi, %eax
+; X64-BMI1BMI2-NEXT:    pushq %rbp
+; X64-BMI1BMI2-NEXT:    pushq %rbx
+; X64-BMI1BMI2-NEXT:    pushq %rax
+; X64-BMI1BMI2-NEXT:    movl %esi, %ebx
+; X64-BMI1BMI2-NEXT:    movl %edi, %ebp
+; X64-BMI1BMI2-NEXT:    movl %esi, %eax
+; X64-BMI1BMI2-NEXT:    negl %eax
+; X64-BMI1BMI2-NEXT:    movl $-1, %ecx
+; X64-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %edi
+; X64-BMI1BMI2-NEXT:    callq use32
+; X64-BMI1BMI2-NEXT:    bzhil %ebx, %ebp, %eax
+; X64-BMI1BMI2-NEXT:    addq $8, %rsp
+; X64-BMI1BMI2-NEXT:    popq %rbx
+; X64-BMI1BMI2-NEXT:    popq %rbp
 ; X64-BMI1BMI2-NEXT:    retq
   %numhighbits = sub i32 32, %numlowbits
   %mask = lshr i32 -1, %numhighbits
+  call void @use32(i32 %mask)
   %masked = and i32 %mask, %val
   ret i32 %masked
 }
@@ -1481,50 +1548,115 @@ define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind {
 define i32 @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi32_c1_indexzext:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    subl $8, %esp
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    shll %cl, %eax
+; X86-NOBMI-NEXT:    movl $-1, %esi
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NOBMI-NEXT:    shrl %cl, %eax
+; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    movl %esi, (%esp)
+; X86-NOBMI-NEXT:    calll use32
+; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    addl $8, %esp
+; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi32_c1_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X86-BMI1NOTBM-NEXT:    shll $8, %eax
-; X86-BMI1NOTBM-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    subl $8, %esp
+; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
+; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
+; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
+; X86-BMI1NOTBM-NEXT:    movl %esi, (%esp)
+; X86-BMI1NOTBM-NEXT:    calll use32
+; X86-BMI1NOTBM-NEXT:    andl {{[0-9]+}}(%esp), %esi
+; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    addl $8, %esp
+; X86-BMI1NOTBM-NEXT:    popl %esi
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_c1_indexzext:
 ; X86-BMI1BMI2:       # %bb.0:
-; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X86-BMI1BMI2-NEXT:    bzhil %eax, {{[0-9]+}}(%esp), %eax
+; X86-BMI1BMI2-NEXT:    pushl %ebx
+; X86-BMI1BMI2-NEXT:    subl $8, %esp
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %bl
+; X86-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X86-BMI1BMI2-NEXT:    negb %al
+; X86-BMI1BMI2-NEXT:    movl $-1, %ecx
+; X86-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %eax
+; X86-BMI1BMI2-NEXT:    movl %eax, (%esp)
+; X86-BMI1BMI2-NEXT:    calll use32
+; X86-BMI1BMI2-NEXT:    bzhil %ebx, {{[0-9]+}}(%esp), %eax
+; X86-BMI1BMI2-NEXT:    addl $8, %esp
+; X86-BMI1BMI2-NEXT:    popl %ebx
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi32_c1_indexzext:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %rbp
+; X64-NOBMI-NEXT:    pushq %rbx
+; X64-NOBMI-NEXT:    pushq %rax
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
-; X64-NOBMI-NEXT:    movl %edi, %eax
+; X64-NOBMI-NEXT:    movl %edi, %ebx
 ; X64-NOBMI-NEXT:    negb %cl
-; X64-NOBMI-NEXT:    shll %cl, %eax
+; X64-NOBMI-NEXT:    movl $-1, %ebp
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NOBMI-NEXT:    shrl %cl, %eax
+; X64-NOBMI-NEXT:    shrl %cl, %ebp
+; X64-NOBMI-NEXT:    movl %ebp, %edi
+; X64-NOBMI-NEXT:    callq use32
+; X64-NOBMI-NEXT:    andl %ebx, %ebp
+; X64-NOBMI-NEXT:    movl %ebp, %eax
+; X64-NOBMI-NEXT:    addq $8, %rsp
+; X64-NOBMI-NEXT:    popq %rbx
+; X64-NOBMI-NEXT:    popq %rbp
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_c1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    shll $8, %esi
-; X64-BMI1NOTBM-NEXT:    bextrl %esi, %edi, %eax
+; X64-BMI1NOTBM-NEXT:    pushq %rbp
+; X64-BMI1NOTBM-NEXT:    pushq %rbx
+; X64-BMI1NOTBM-NEXT:    pushq %rax
+; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
+; X64-BMI1NOTBM-NEXT:    movl %edi, %ebx
+; X64-BMI1NOTBM-NEXT:    negb %cl
+; X64-BMI1NOTBM-NEXT:    movl $-1, %ebp
+; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebp
+; X64-BMI1NOTBM-NEXT:    movl %ebp, %edi
+; X64-BMI1NOTBM-NEXT:    callq use32
+; X64-BMI1NOTBM-NEXT:    andl %ebx, %ebp
+; X64-BMI1NOTBM-NEXT:    movl %ebp, %eax
+; X64-BMI1NOTBM-NEXT:    addq $8, %rsp
+; X64-BMI1NOTBM-NEXT:    popq %rbx
+; X64-BMI1NOTBM-NEXT:    popq %rbp
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_c1_indexzext:
 ; X64-BMI1BMI2:       # %bb.0:
-; X64-BMI1BMI2-NEXT:    bzhil %esi, %edi, %eax
+; X64-BMI1BMI2-NEXT:    pushq %rbp
+; X64-BMI1BMI2-NEXT:    pushq %rbx
+; X64-BMI1BMI2-NEXT:    pushq %rax
+; X64-BMI1BMI2-NEXT:    movl %esi, %ebx
+; X64-BMI1BMI2-NEXT:    movl %edi, %ebp
+; X64-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X64-BMI1BMI2-NEXT:    negb %al
+; X64-BMI1BMI2-NEXT:    movl $-1, %ecx
+; X64-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %edi
+; X64-BMI1BMI2-NEXT:    callq use32
+; X64-BMI1BMI2-NEXT:    bzhil %ebx, %ebp, %eax
+; X64-BMI1BMI2-NEXT:    addq $8, %rsp
+; X64-BMI1BMI2-NEXT:    popq %rbx
+; X64-BMI1BMI2-NEXT:    popq %rbp
 ; X64-BMI1BMI2-NEXT:    retq
   %numhighbits = sub i8 32, %numlowbits
   %sh_prom = zext i8 %numhighbits to i32
   %mask = lshr i32 -1, %sh_prom
+  call void @use32(i32 %mask)
   %masked = and i32 %mask, %val
   ret i32 %masked
 }
@@ -1532,53 +1664,106 @@ define i32 @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits) nounwind {
 define i32 @bzhi32_c2_load(i32* %w, i32 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi32_c2_load:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    subl $8, %esp
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl (%eax), %eax
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    shll %cl, %eax
+; X86-NOBMI-NEXT:    movl $-1, %edx
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NOBMI-NEXT:    shrl %cl, %eax
+; X86-NOBMI-NEXT:    shrl %cl, %edx
+; X86-NOBMI-NEXT:    movl (%eax), %esi
+; X86-NOBMI-NEXT:    andl %edx, %esi
+; X86-NOBMI-NEXT:    movl %edx, (%esp)
+; X86-NOBMI-NEXT:    calll use32
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    addl $8, %esp
+; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi32_c2_load:
 ; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    subl $8, %esp
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shll $8, %ecx
-; X86-BMI1NOTBM-NEXT:    bextrl %ecx, (%eax), %eax
+; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
+; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    movl $-1, %edx
+; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    movl (%eax), %esi
+; X86-BMI1NOTBM-NEXT:    andl %edx, %esi
+; X86-BMI1NOTBM-NEXT:    movl %edx, (%esp)
+; X86-BMI1NOTBM-NEXT:    calll use32
+; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    addl $8, %esp
+; X86-BMI1NOTBM-NEXT:    popl %esi
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_c2_load:
 ; X86-BMI1BMI2:       # %bb.0:
+; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    subl $8, %esp
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1BMI2-NEXT:    bzhil %eax, (%ecx), %eax
+; X86-BMI1BMI2-NEXT:    bzhil %ecx, (%eax), %esi
+; X86-BMI1BMI2-NEXT:    negl %ecx
+; X86-BMI1BMI2-NEXT:    movl $-1, %eax
+; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %eax
+; X86-BMI1BMI2-NEXT:    movl %eax, (%esp)
+; X86-BMI1BMI2-NEXT:    calll use32
+; X86-BMI1BMI2-NEXT:    movl %esi, %eax
+; X86-BMI1BMI2-NEXT:    addl $8, %esp
+; X86-BMI1BMI2-NEXT:    popl %esi
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi32_c2_load:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %rbx
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
-; X64-NOBMI-NEXT:    movl (%rdi), %eax
 ; X64-NOBMI-NEXT:    negl %ecx
-; X64-NOBMI-NEXT:    shll %cl, %eax
+; X64-NOBMI-NEXT:    movl $-1, %eax
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NOBMI-NEXT:    shrl %cl, %eax
+; X64-NOBMI-NEXT:    movl (%rdi), %ebx
+; X64-NOBMI-NEXT:    andl %eax, %ebx
+; X64-NOBMI-NEXT:    movl %eax, %edi
+; X64-NOBMI-NEXT:    callq use32
+; X64-NOBMI-NEXT:    movl %ebx, %eax
+; X64-NOBMI-NEXT:    popq %rbx
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_c2_load:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    shll $8, %esi
-; X64-BMI1NOTBM-NEXT:    bextrl %esi, (%rdi), %eax
+; X64-BMI1NOTBM-NEXT:    pushq %rbx
+; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
+; X64-BMI1NOTBM-NEXT:    negl %ecx
+; X64-BMI1NOTBM-NEXT:    movl $-1, %eax
+; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    movl (%rdi), %ebx
+; X64-BMI1NOTBM-NEXT:    andl %eax, %ebx
+; X64-BMI1NOTBM-NEXT:    movl %eax, %edi
+; X64-BMI1NOTBM-NEXT:    callq use32
+; X64-BMI1NOTBM-NEXT:    movl %ebx, %eax
+; X64-BMI1NOTBM-NEXT:    popq %rbx
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_c2_load:
 ; X64-BMI1BMI2:       # %bb.0:
-; X64-BMI1BMI2-NEXT:    bzhil %esi, (%rdi), %eax
+; X64-BMI1BMI2-NEXT:    pushq %rbx
+; X64-BMI1BMI2-NEXT:    bzhil %esi, (%rdi), %ebx
+; X64-BMI1BMI2-NEXT:    negl %esi
+; X64-BMI1BMI2-NEXT:    movl $-1, %eax
+; X64-BMI1BMI2-NEXT:    shrxl %esi, %eax, %edi
+; X64-BMI1BMI2-NEXT:    callq use32
+; X64-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X64-BMI1BMI2-NEXT:    popq %rbx
 ; X64-BMI1BMI2-NEXT:    retq
   %val = load i32, i32* %w
   %numhighbits = sub i32 32, %numlowbits
   %mask = lshr i32 -1, %numhighbits
+  call void @use32(i32 %mask)
   %masked = and i32 %mask, %val
   ret i32 %masked
 }
@@ -1586,54 +1771,109 @@ define i32 @bzhi32_c2_load(i32* %w, i32 %numlowbits) nounwind {
 define i32 @bzhi32_c3_load_indexzext(i32* %w, i8 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi32_c3_load_indexzext:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    subl $8, %esp
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl (%eax), %eax
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    shll %cl, %eax
+; X86-NOBMI-NEXT:    movl $-1, %edx
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NOBMI-NEXT:    shrl %cl, %eax
+; X86-NOBMI-NEXT:    shrl %cl, %edx
+; X86-NOBMI-NEXT:    movl (%eax), %esi
+; X86-NOBMI-NEXT:    andl %edx, %esi
+; X86-NOBMI-NEXT:    movl %edx, (%esp)
+; X86-NOBMI-NEXT:    calll use32
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    addl $8, %esp
+; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi32_c3_load_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    subl $8, %esp
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    shll $8, %ecx
-; X86-BMI1NOTBM-NEXT:    bextrl %ecx, (%eax), %eax
+; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
+; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-BMI1NOTBM-NEXT:    movl $-1, %edx
+; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    movl (%eax), %esi
+; X86-BMI1NOTBM-NEXT:    andl %edx, %esi
+; X86-BMI1NOTBM-NEXT:    movl %edx, (%esp)
+; X86-BMI1NOTBM-NEXT:    calll use32
+; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    addl $8, %esp
+; X86-BMI1NOTBM-NEXT:    popl %esi
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_c3_load_indexzext:
 ; X86-BMI1BMI2:       # %bb.0:
+; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    subl $8, %esp
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1BMI2-NEXT:    bzhil %ecx, (%eax), %eax
+; X86-BMI1BMI2-NEXT:    bzhil %ecx, (%eax), %esi
+; X86-BMI1BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx def $ecx
+; X86-BMI1BMI2-NEXT:    negb %cl
+; X86-BMI1BMI2-NEXT:    movl $-1, %eax
+; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %eax
+; X86-BMI1BMI2-NEXT:    movl %eax, (%esp)
+; X86-BMI1BMI2-NEXT:    calll use32
+; X86-BMI1BMI2-NEXT:    movl %esi, %eax
+; X86-BMI1BMI2-NEXT:    addl $8, %esp
+; X86-BMI1BMI2-NEXT:    popl %esi
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi32_c3_load_indexzext:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %rbx
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
-; X64-NOBMI-NEXT:    movl (%rdi), %eax
 ; X64-NOBMI-NEXT:    negb %cl
-; X64-NOBMI-NEXT:    shll %cl, %eax
+; X64-NOBMI-NEXT:    movl $-1, %eax
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NOBMI-NEXT:    shrl %cl, %eax
+; X64-NOBMI-NEXT:    movl (%rdi), %ebx
+; X64-NOBMI-NEXT:    andl %eax, %ebx
+; X64-NOBMI-NEXT:    movl %eax, %edi
+; X64-NOBMI-NEXT:    callq use32
+; X64-NOBMI-NEXT:    movl %ebx, %eax
+; X64-NOBMI-NEXT:    popq %rbx
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_c3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    shll $8, %esi
-; X64-BMI1NOTBM-NEXT:    bextrl %esi, (%rdi), %eax
+; X64-BMI1NOTBM-NEXT:    pushq %rbx
+; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
+; X64-BMI1NOTBM-NEXT:    negb %cl
+; X64-BMI1NOTBM-NEXT:    movl $-1, %eax
+; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    movl (%rdi), %ebx
+; X64-BMI1NOTBM-NEXT:    andl %eax, %ebx
+; X64-BMI1NOTBM-NEXT:    movl %eax, %edi
+; X64-BMI1NOTBM-NEXT:    callq use32
+; X64-BMI1NOTBM-NEXT:    movl %ebx, %eax
+; X64-BMI1NOTBM-NEXT:    popq %rbx
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_c3_load_indexzext:
 ; X64-BMI1BMI2:       # %bb.0:
-; X64-BMI1BMI2-NEXT:    bzhil %esi, (%rdi), %eax
+; X64-BMI1BMI2-NEXT:    pushq %rbx
+; X64-BMI1BMI2-NEXT:    bzhil %esi, (%rdi), %ebx
+; X64-BMI1BMI2-NEXT:    # kill: def $sil killed $sil killed $esi def $esi
+; X64-BMI1BMI2-NEXT:    negb %sil
+; X64-BMI1BMI2-NEXT:    movl $-1, %eax
+; X64-BMI1BMI2-NEXT:    shrxl %esi, %eax, %edi
+; X64-BMI1BMI2-NEXT:    callq use32
+; X64-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X64-BMI1BMI2-NEXT:    popq %rbx
 ; X64-BMI1BMI2-NEXT:    retq
   %val = load i32, i32* %w
   %numhighbits = sub i8 32, %numlowbits
   %sh_prom = zext i8 %numhighbits to i32
   %mask = lshr i32 -1, %sh_prom
+  call void @use32(i32 %mask)
   %masked = and i32 %mask, %val
   ret i32 %masked
 }
@@ -1641,131 +1881,275 @@ define i32 @bzhi32_c3_load_indexzext(i32* %w, i8 %numlowbits) nounwind {
 define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi32_c4_commutative:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    subl $8, %esp
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    shll %cl, %eax
+; X86-NOBMI-NEXT:    movl $-1, %esi
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NOBMI-NEXT:    shrl %cl, %eax
+; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    movl %esi, (%esp)
+; X86-NOBMI-NEXT:    calll use32
+; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    addl $8, %esp
+; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi32_c4_commutative:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    shll $8, %eax
-; X86-BMI1NOTBM-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    subl $8, %esp
+; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
+; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
+; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
+; X86-BMI1NOTBM-NEXT:    movl %esi, (%esp)
+; X86-BMI1NOTBM-NEXT:    calll use32
+; X86-BMI1NOTBM-NEXT:    andl {{[0-9]+}}(%esp), %esi
+; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    addl $8, %esp
+; X86-BMI1NOTBM-NEXT:    popl %esi
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_c4_commutative:
 ; X86-BMI1BMI2:       # %bb.0:
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1BMI2-NEXT:    bzhil %eax, {{[0-9]+}}(%esp), %eax
+; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    subl $8, %esp
+; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI1BMI2-NEXT:    movl %esi, %eax
+; X86-BMI1BMI2-NEXT:    negl %eax
+; X86-BMI1BMI2-NEXT:    movl $-1, %ecx
+; X86-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %eax
+; X86-BMI1BMI2-NEXT:    movl %eax, (%esp)
+; X86-BMI1BMI2-NEXT:    calll use32
+; X86-BMI1BMI2-NEXT:    bzhil %esi, {{[0-9]+}}(%esp), %eax
+; X86-BMI1BMI2-NEXT:    addl $8, %esp
+; X86-BMI1BMI2-NEXT:    popl %esi
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi32_c4_commutative:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %rbp
+; X64-NOBMI-NEXT:    pushq %rbx
+; X64-NOBMI-NEXT:    pushq %rax
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
-; X64-NOBMI-NEXT:    movl %edi, %eax
+; X64-NOBMI-NEXT:    movl %edi, %ebx
 ; X64-NOBMI-NEXT:    negl %ecx
-; X64-NOBMI-NEXT:    shll %cl, %eax
+; X64-NOBMI-NEXT:    movl $-1, %ebp
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NOBMI-NEXT:    shrl %cl, %eax
+; X64-NOBMI-NEXT:    shrl %cl, %ebp
+; X64-NOBMI-NEXT:    movl %ebp, %edi
+; X64-NOBMI-NEXT:    callq use32
+; X64-NOBMI-NEXT:    andl %ebx, %ebp
+; X64-NOBMI-NEXT:    movl %ebp, %eax
+; X64-NOBMI-NEXT:    addq $8, %rsp
+; X64-NOBMI-NEXT:    popq %rbx
+; X64-NOBMI-NEXT:    popq %rbp
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_c4_commutative:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    shll $8, %esi
-; X64-BMI1NOTBM-NEXT:    bextrl %esi, %edi, %eax
+; X64-BMI1NOTBM-NEXT:    pushq %rbp
+; X64-BMI1NOTBM-NEXT:    pushq %rbx
+; X64-BMI1NOTBM-NEXT:    pushq %rax
+; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
+; X64-BMI1NOTBM-NEXT:    movl %edi, %ebx
+; X64-BMI1NOTBM-NEXT:    negl %ecx
+; X64-BMI1NOTBM-NEXT:    movl $-1, %ebp
+; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebp
+; X64-BMI1NOTBM-NEXT:    movl %ebp, %edi
+; X64-BMI1NOTBM-NEXT:    callq use32
+; X64-BMI1NOTBM-NEXT:    andl %ebx, %ebp
+; X64-BMI1NOTBM-NEXT:    movl %ebp, %eax
+; X64-BMI1NOTBM-NEXT:    addq $8, %rsp
+; X64-BMI1NOTBM-NEXT:    popq %rbx
+; X64-BMI1NOTBM-NEXT:    popq %rbp
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_c4_commutative:
 ; X64-BMI1BMI2:       # %bb.0:
-; X64-BMI1BMI2-NEXT:    bzhil %esi, %edi, %eax
+; X64-BMI1BMI2-NEXT:    pushq %rbp
+; X64-BMI1BMI2-NEXT:    pushq %rbx
+; X64-BMI1BMI2-NEXT:    pushq %rax
+; X64-BMI1BMI2-NEXT:    movl %esi, %ebx
+; X64-BMI1BMI2-NEXT:    movl %edi, %ebp
+; X64-BMI1BMI2-NEXT:    movl %esi, %eax
+; X64-BMI1BMI2-NEXT:    negl %eax
+; X64-BMI1BMI2-NEXT:    movl $-1, %ecx
+; X64-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %edi
+; X64-BMI1BMI2-NEXT:    callq use32
+; X64-BMI1BMI2-NEXT:    bzhil %ebx, %ebp, %eax
+; X64-BMI1BMI2-NEXT:    addq $8, %rsp
+; X64-BMI1BMI2-NEXT:    popq %rbx
+; X64-BMI1BMI2-NEXT:    popq %rbp
 ; X64-BMI1BMI2-NEXT:    retq
   %numhighbits = sub i32 32, %numlowbits
   %mask = lshr i32 -1, %numhighbits
+  call void @use32(i32 %mask)
   %masked = and i32 %val, %mask ; swapped order
   ret i32 %masked
 }
 
 ; 64-bit
 
+declare void @use64(i64)
+
 define i64 @bzhi64_c0(i64 %val, i64 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi64_c0:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %edi
+; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    pushl %eax
 ; X86-NOBMI-NEXT:    movl $64, %ecx
 ; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    movl $-1, %eax
-; X86-NOBMI-NEXT:    movl $-1, %edx
-; X86-NOBMI-NEXT:    shrl %cl, %edx
-; X86-NOBMI-NEXT:    shrdl %cl, %eax, %eax
+; X86-NOBMI-NEXT:    movl $-1, %esi
+; X86-NOBMI-NEXT:    movl $-1, %edi
+; X86-NOBMI-NEXT:    shrl %cl, %edi
+; X86-NOBMI-NEXT:    shrdl %cl, %esi, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB25_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %edx, %eax
-; X86-NOBMI-NEXT:    xorl %edx, %edx
+; X86-NOBMI-NEXT:    movl %edi, %esi
+; X86-NOBMI-NEXT:    xorl %edi, %edi
 ; X86-NOBMI-NEXT:  .LBB25_2:
-; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-NOBMI-NEXT:    subl $8, %esp
+; X86-NOBMI-NEXT:    pushl %edi
+; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    calll use64
+; X86-NOBMI-NEXT:    addl $16, %esp
+; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %edi
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    movl %edi, %edx
+; X86-NOBMI-NEXT:    addl $4, %esp
+; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi64_c0:
 ; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    pushl %edi
+; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    pushl %eax
 ; X86-BMI1NOTBM-NEXT:    movl $64, %ecx
 ; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    movl $-1, %eax
-; X86-BMI1NOTBM-NEXT:    movl $-1, %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %eax
+; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
+; X86-BMI1NOTBM-NEXT:    movl $-1, %edi
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
+; X86-BMI1NOTBM-NEXT:    shrdl %cl, %esi, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    je .LBB25_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
-; X86-BMI1NOTBM-NEXT:    movl %edx, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
+; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
+; X86-BMI1NOTBM-NEXT:    xorl %edi, %edi
 ; X86-BMI1NOTBM-NEXT:  .LBB25_2:
-; X86-BMI1NOTBM-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    subl $8, %esp
+; X86-BMI1NOTBM-NEXT:    pushl %edi
+; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    calll use64
+; X86-BMI1NOTBM-NEXT:    addl $16, %esp
+; X86-BMI1NOTBM-NEXT:    andl {{[0-9]+}}(%esp), %esi
+; X86-BMI1NOTBM-NEXT:    andl {{[0-9]+}}(%esp), %edi
+; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    movl %edi, %edx
+; X86-BMI1NOTBM-NEXT:    addl $4, %esp
+; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    popl %edi
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi64_c0:
 ; X86-BMI1BMI2:       # %bb.0:
+; X86-BMI1BMI2-NEXT:    pushl %edi
+; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    pushl %eax
 ; X86-BMI1BMI2-NEXT:    movl $64, %ecx
 ; X86-BMI1BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1BMI2-NEXT:    movl $-1, %eax
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %edx
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %eax
+; X86-BMI1BMI2-NEXT:    movl $-1, %esi
+; X86-BMI1BMI2-NEXT:    shrxl %ecx, %esi, %edi
+; X86-BMI1BMI2-NEXT:    shrdl %cl, %esi, %esi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
 ; X86-BMI1BMI2-NEXT:    je .LBB25_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
-; X86-BMI1BMI2-NEXT:    movl %edx, %eax
-; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI1BMI2-NEXT:    movl %edi, %esi
+; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
 ; X86-BMI1BMI2-NEXT:  .LBB25_2:
-; X86-BMI1BMI2-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X86-BMI1BMI2-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-BMI1BMI2-NEXT:    subl $8, %esp
+; X86-BMI1BMI2-NEXT:    pushl %edi
+; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    calll use64
+; X86-BMI1BMI2-NEXT:    addl $16, %esp
+; X86-BMI1BMI2-NEXT:    andl {{[0-9]+}}(%esp), %esi
+; X86-BMI1BMI2-NEXT:    andl {{[0-9]+}}(%esp), %edi
+; X86-BMI1BMI2-NEXT:    movl %esi, %eax
+; X86-BMI1BMI2-NEXT:    movl %edi, %edx
+; X86-BMI1BMI2-NEXT:    addl $4, %esp
+; X86-BMI1BMI2-NEXT:    popl %esi
+; X86-BMI1BMI2-NEXT:    popl %edi
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi64_c0:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %r14
+; X64-NOBMI-NEXT:    pushq %rbx
+; X64-NOBMI-NEXT:    pushq %rax
 ; X64-NOBMI-NEXT:    movq %rsi, %rcx
-; X64-NOBMI-NEXT:    movq %rdi, %rax
+; X64-NOBMI-NEXT:    movq %rdi, %r14
 ; X64-NOBMI-NEXT:    negl %ecx
-; X64-NOBMI-NEXT:    shlq %cl, %rax
+; X64-NOBMI-NEXT:    movq $-1, %rbx
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-NOBMI-NEXT:    shrq %cl, %rax
+; X64-NOBMI-NEXT:    shrq %cl, %rbx
+; X64-NOBMI-NEXT:    movq %rbx, %rdi
+; X64-NOBMI-NEXT:    callq use64
+; X64-NOBMI-NEXT:    andq %r14, %rbx
+; X64-NOBMI-NEXT:    movq %rbx, %rax
+; X64-NOBMI-NEXT:    addq $8, %rsp
+; X64-NOBMI-NEXT:    popq %rbx
+; X64-NOBMI-NEXT:    popq %r14
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_c0:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
-; X64-BMI1NOTBM-NEXT:    bextrq %rsi, %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    pushq %r14
+; X64-BMI1NOTBM-NEXT:    pushq %rbx
+; X64-BMI1NOTBM-NEXT:    pushq %rax
+; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
+; X64-BMI1NOTBM-NEXT:    movq %rdi, %r14
+; X64-BMI1NOTBM-NEXT:    negl %ecx
+; X64-BMI1NOTBM-NEXT:    movq $-1, %rbx
+; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rbx, %rdi
+; X64-BMI1NOTBM-NEXT:    callq use64
+; X64-BMI1NOTBM-NEXT:    andq %r14, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rbx, %rax
+; X64-BMI1NOTBM-NEXT:    addq $8, %rsp
+; X64-BMI1NOTBM-NEXT:    popq %rbx
+; X64-BMI1NOTBM-NEXT:    popq %r14
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_c0:
 ; X64-BMI1BMI2:       # %bb.0:
-; X64-BMI1BMI2-NEXT:    bzhiq %rsi, %rdi, %rax
+; X64-BMI1BMI2-NEXT:    pushq %r14
+; X64-BMI1BMI2-NEXT:    pushq %rbx
+; X64-BMI1BMI2-NEXT:    pushq %rax
+; X64-BMI1BMI2-NEXT:    movq %rsi, %rbx
+; X64-BMI1BMI2-NEXT:    movq %rdi, %r14
+; X64-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X64-BMI1BMI2-NEXT:    negl %eax
+; X64-BMI1BMI2-NEXT:    movq $-1, %rcx
+; X64-BMI1BMI2-NEXT:    shrxq %rax, %rcx, %rdi
+; X64-BMI1BMI2-NEXT:    callq use64
+; X64-BMI1BMI2-NEXT:    bzhiq %rbx, %r14, %rax
+; X64-BMI1BMI2-NEXT:    addq $8, %rsp
+; X64-BMI1BMI2-NEXT:    popq %rbx
+; X64-BMI1BMI2-NEXT:    popq %r14
 ; X64-BMI1BMI2-NEXT:    retq
   %numhighbits = sub i64 64, %numlowbits
   %mask = lshr i64 -1, %numhighbits
+  call void @use64(i64 %mask)
   %masked = and i64 %mask, %val
   ret i64 %masked
 }
@@ -1773,82 +2157,157 @@ define i64 @bzhi64_c0(i64 %val, i64 %numlowbits) nounwind {
 define i64 @bzhi64_c1_indexzext(i64 %val, i8 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi64_c1_indexzext:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %edi
+; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    pushl %eax
 ; X86-NOBMI-NEXT:    movb $64, %cl
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    movl $-1, %eax
-; X86-NOBMI-NEXT:    movl $-1, %edx
-; X86-NOBMI-NEXT:    shrl %cl, %edx
-; X86-NOBMI-NEXT:    shrdl %cl, %eax, %eax
+; X86-NOBMI-NEXT:    movl $-1, %esi
+; X86-NOBMI-NEXT:    movl $-1, %edi
+; X86-NOBMI-NEXT:    shrl %cl, %edi
+; X86-NOBMI-NEXT:    shrdl %cl, %esi, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB26_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %edx, %eax
-; X86-NOBMI-NEXT:    xorl %edx, %edx
+; X86-NOBMI-NEXT:    movl %edi, %esi
+; X86-NOBMI-NEXT:    xorl %edi, %edi
 ; X86-NOBMI-NEXT:  .LBB26_2:
-; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-NOBMI-NEXT:    subl $8, %esp
+; X86-NOBMI-NEXT:    pushl %edi
+; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    calll use64
+; X86-NOBMI-NEXT:    addl $16, %esp
+; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %edi
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    movl %edi, %edx
+; X86-NOBMI-NEXT:    addl $4, %esp
+; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi64_c1_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    pushl %edi
+; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    pushl %eax
 ; X86-BMI1NOTBM-NEXT:    movb $64, %cl
 ; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl $-1, %eax
-; X86-BMI1NOTBM-NEXT:    movl $-1, %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %eax
+; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
+; X86-BMI1NOTBM-NEXT:    movl $-1, %edi
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
+; X86-BMI1NOTBM-NEXT:    shrdl %cl, %esi, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    je .LBB26_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
-; X86-BMI1NOTBM-NEXT:    movl %edx, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
+; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
+; X86-BMI1NOTBM-NEXT:    xorl %edi, %edi
 ; X86-BMI1NOTBM-NEXT:  .LBB26_2:
-; X86-BMI1NOTBM-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    subl $8, %esp
+; X86-BMI1NOTBM-NEXT:    pushl %edi
+; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    calll use64
+; X86-BMI1NOTBM-NEXT:    addl $16, %esp
+; X86-BMI1NOTBM-NEXT:    andl {{[0-9]+}}(%esp), %esi
+; X86-BMI1NOTBM-NEXT:    andl {{[0-9]+}}(%esp), %edi
+; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    movl %edi, %edx
+; X86-BMI1NOTBM-NEXT:    addl $4, %esp
+; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    popl %edi
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi64_c1_indexzext:
 ; X86-BMI1BMI2:       # %bb.0:
+; X86-BMI1BMI2-NEXT:    pushl %edi
+; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    pushl %eax
 ; X86-BMI1BMI2-NEXT:    movb $64, %cl
 ; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1BMI2-NEXT:    movl $-1, %eax
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %edx
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %eax
+; X86-BMI1BMI2-NEXT:    movl $-1, %esi
+; X86-BMI1BMI2-NEXT:    shrxl %ecx, %esi, %edi
+; X86-BMI1BMI2-NEXT:    shrdl %cl, %esi, %esi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
 ; X86-BMI1BMI2-NEXT:    je .LBB26_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
-; X86-BMI1BMI2-NEXT:    movl %edx, %eax
-; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI1BMI2-NEXT:    movl %edi, %esi
+; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
 ; X86-BMI1BMI2-NEXT:  .LBB26_2:
-; X86-BMI1BMI2-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X86-BMI1BMI2-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-BMI1BMI2-NEXT:    subl $8, %esp
+; X86-BMI1BMI2-NEXT:    pushl %edi
+; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    calll use64
+; X86-BMI1BMI2-NEXT:    addl $16, %esp
+; X86-BMI1BMI2-NEXT:    andl {{[0-9]+}}(%esp), %esi
+; X86-BMI1BMI2-NEXT:    andl {{[0-9]+}}(%esp), %edi
+; X86-BMI1BMI2-NEXT:    movl %esi, %eax
+; X86-BMI1BMI2-NEXT:    movl %edi, %edx
+; X86-BMI1BMI2-NEXT:    addl $4, %esp
+; X86-BMI1BMI2-NEXT:    popl %esi
+; X86-BMI1BMI2-NEXT:    popl %edi
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi64_c1_indexzext:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %r14
+; X64-NOBMI-NEXT:    pushq %rbx
+; X64-NOBMI-NEXT:    pushq %rax
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
-; X64-NOBMI-NEXT:    movq %rdi, %rax
+; X64-NOBMI-NEXT:    movq %rdi, %r14
 ; X64-NOBMI-NEXT:    negb %cl
-; X64-NOBMI-NEXT:    shlq %cl, %rax
+; X64-NOBMI-NEXT:    movq $-1, %rbx
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NOBMI-NEXT:    shrq %cl, %rax
+; X64-NOBMI-NEXT:    shrq %cl, %rbx
+; X64-NOBMI-NEXT:    movq %rbx, %rdi
+; X64-NOBMI-NEXT:    callq use64
+; X64-NOBMI-NEXT:    andq %r14, %rbx
+; X64-NOBMI-NEXT:    movq %rbx, %rax
+; X64-NOBMI-NEXT:    addq $8, %rsp
+; X64-NOBMI-NEXT:    popq %rbx
+; X64-NOBMI-NEXT:    popq %r14
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_c1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    # kill: def $esi killed $esi def $rsi
-; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
-; X64-BMI1NOTBM-NEXT:    bextrq %rsi, %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    pushq %r14
+; X64-BMI1NOTBM-NEXT:    pushq %rbx
+; X64-BMI1NOTBM-NEXT:    pushq %rax
+; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
+; X64-BMI1NOTBM-NEXT:    movq %rdi, %r14
+; X64-BMI1NOTBM-NEXT:    negb %cl
+; X64-BMI1NOTBM-NEXT:    movq $-1, %rbx
+; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rbx, %rdi
+; X64-BMI1NOTBM-NEXT:    callq use64
+; X64-BMI1NOTBM-NEXT:    andq %r14, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rbx, %rax
+; X64-BMI1NOTBM-NEXT:    addq $8, %rsp
+; X64-BMI1NOTBM-NEXT:    popq %rbx
+; X64-BMI1NOTBM-NEXT:    popq %r14
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_c1_indexzext:
 ; X64-BMI1BMI2:       # %bb.0:
-; X64-BMI1BMI2-NEXT:    # kill: def $esi killed $esi def $rsi
-; X64-BMI1BMI2-NEXT:    bzhiq %rsi, %rdi, %rax
+; X64-BMI1BMI2-NEXT:    pushq %r14
+; X64-BMI1BMI2-NEXT:    pushq %rbx
+; X64-BMI1BMI2-NEXT:    pushq %rax
+; X64-BMI1BMI2-NEXT:    movl %esi, %ebx
+; X64-BMI1BMI2-NEXT:    movq %rdi, %r14
+; X64-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X64-BMI1BMI2-NEXT:    negb %al
+; X64-BMI1BMI2-NEXT:    movq $-1, %rcx
+; X64-BMI1BMI2-NEXT:    shrxq %rax, %rcx, %rdi
+; X64-BMI1BMI2-NEXT:    callq use64
+; X64-BMI1BMI2-NEXT:    bzhiq %rbx, %r14, %rax
+; X64-BMI1BMI2-NEXT:    addq $8, %rsp
+; X64-BMI1BMI2-NEXT:    popq %rbx
+; X64-BMI1BMI2-NEXT:    popq %r14
 ; X64-BMI1BMI2-NEXT:    retq
   %numhighbits = sub i8 64, %numlowbits
   %sh_prom = zext i8 %numhighbits to i64
   %mask = lshr i64 -1, %sh_prom
+  call void @use64(i64 %mask)
   %masked = and i64 %mask, %val
   ret i64 %masked
 }
@@ -1856,89 +2315,153 @@ define i64 @bzhi64_c1_indexzext(i64 %val, i8 %numlowbits) nounwind {
 define i64 @bzhi64_c2_load(i64* %w, i64 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi64_c2_load:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %ebx
+; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    movl $64, %ecx
 ; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    movl $-1, %eax
 ; X86-NOBMI-NEXT:    movl $-1, %edx
-; X86-NOBMI-NEXT:    shrl %cl, %edx
-; X86-NOBMI-NEXT:    shrdl %cl, %eax, %eax
+; X86-NOBMI-NEXT:    movl $-1, %ebx
+; X86-NOBMI-NEXT:    shrl %cl, %ebx
+; X86-NOBMI-NEXT:    shrdl %cl, %edx, %edx
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB27_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %edx, %eax
-; X86-NOBMI-NEXT:    xorl %edx, %edx
+; X86-NOBMI-NEXT:    movl %ebx, %edx
+; X86-NOBMI-NEXT:    xorl %ebx, %ebx
 ; X86-NOBMI-NEXT:  .LBB27_2:
-; X86-NOBMI-NEXT:    andl (%esi), %eax
-; X86-NOBMI-NEXT:    andl 4(%esi), %edx
+; X86-NOBMI-NEXT:    movl (%eax), %esi
+; X86-NOBMI-NEXT:    andl %edx, %esi
+; X86-NOBMI-NEXT:    movl 4(%eax), %edi
+; X86-NOBMI-NEXT:    andl %ebx, %edi
+; X86-NOBMI-NEXT:    subl $8, %esp
+; X86-NOBMI-NEXT:    pushl %ebx
+; X86-NOBMI-NEXT:    pushl %edx
+; X86-NOBMI-NEXT:    calll use64
+; X86-NOBMI-NEXT:    addl $16, %esp
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    movl %edi, %edx
 ; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    popl %edi
+; X86-NOBMI-NEXT:    popl %ebx
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi64_c2_load:
 ; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    pushl %ebx
+; X86-BMI1NOTBM-NEXT:    pushl %edi
 ; X86-BMI1NOTBM-NEXT:    pushl %esi
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    movl $64, %ecx
 ; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    movl $-1, %eax
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %eax
+; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %ebx
+; X86-BMI1NOTBM-NEXT:    shrdl %cl, %edx, %edx
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    je .LBB27_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
-; X86-BMI1NOTBM-NEXT:    movl %edx, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
+; X86-BMI1NOTBM-NEXT:    movl %ebx, %edx
+; X86-BMI1NOTBM-NEXT:    xorl %ebx, %ebx
 ; X86-BMI1NOTBM-NEXT:  .LBB27_2:
-; X86-BMI1NOTBM-NEXT:    andl (%esi), %eax
-; X86-BMI1NOTBM-NEXT:    andl 4(%esi), %edx
+; X86-BMI1NOTBM-NEXT:    movl (%eax), %esi
+; X86-BMI1NOTBM-NEXT:    andl %edx, %esi
+; X86-BMI1NOTBM-NEXT:    movl 4(%eax), %edi
+; X86-BMI1NOTBM-NEXT:    andl %ebx, %edi
+; X86-BMI1NOTBM-NEXT:    subl $8, %esp
+; X86-BMI1NOTBM-NEXT:    pushl %ebx
+; X86-BMI1NOTBM-NEXT:    pushl %edx
+; X86-BMI1NOTBM-NEXT:    calll use64
+; X86-BMI1NOTBM-NEXT:    addl $16, %esp
+; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    movl %edi, %edx
 ; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    popl %edi
+; X86-BMI1NOTBM-NEXT:    popl %ebx
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi64_c2_load:
 ; X86-BMI1BMI2:       # %bb.0:
+; X86-BMI1BMI2-NEXT:    pushl %ebx
+; X86-BMI1BMI2-NEXT:    pushl %edi
 ; X86-BMI1BMI2-NEXT:    pushl %esi
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1BMI2-NEXT:    movl $64, %ecx
 ; X86-BMI1BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1BMI2-NEXT:    movl $-1, %eax
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %edx
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %eax
+; X86-BMI1BMI2-NEXT:    movl $-1, %edx
+; X86-BMI1BMI2-NEXT:    shrxl %ecx, %edx, %ebx
+; X86-BMI1BMI2-NEXT:    shrdl %cl, %edx, %edx
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
 ; X86-BMI1BMI2-NEXT:    je .LBB27_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
-; X86-BMI1BMI2-NEXT:    movl %edx, %eax
-; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI1BMI2-NEXT:    movl %ebx, %edx
+; X86-BMI1BMI2-NEXT:    xorl %ebx, %ebx
 ; X86-BMI1BMI2-NEXT:  .LBB27_2:
-; X86-BMI1BMI2-NEXT:    andl (%esi), %eax
-; X86-BMI1BMI2-NEXT:    andl 4(%esi), %edx
+; X86-BMI1BMI2-NEXT:    movl (%eax), %esi
+; X86-BMI1BMI2-NEXT:    andl %edx, %esi
+; X86-BMI1BMI2-NEXT:    movl 4(%eax), %edi
+; X86-BMI1BMI2-NEXT:    andl %ebx, %edi
+; X86-BMI1BMI2-NEXT:    subl $8, %esp
+; X86-BMI1BMI2-NEXT:    pushl %ebx
+; X86-BMI1BMI2-NEXT:    pushl %edx
+; X86-BMI1BMI2-NEXT:    calll use64
+; X86-BMI1BMI2-NEXT:    addl $16, %esp
+; X86-BMI1BMI2-NEXT:    movl %esi, %eax
+; X86-BMI1BMI2-NEXT:    movl %edi, %edx
 ; X86-BMI1BMI2-NEXT:    popl %esi
+; X86-BMI1BMI2-NEXT:    popl %edi
+; X86-BMI1BMI2-NEXT:    popl %ebx
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi64_c2_load:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %rbx
 ; X64-NOBMI-NEXT:    movq %rsi, %rcx
-; X64-NOBMI-NEXT:    movq (%rdi), %rax
 ; X64-NOBMI-NEXT:    negl %ecx
-; X64-NOBMI-NEXT:    shlq %cl, %rax
+; X64-NOBMI-NEXT:    movq $-1, %rax
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-NOBMI-NEXT:    shrq %cl, %rax
+; X64-NOBMI-NEXT:    movq (%rdi), %rbx
+; X64-NOBMI-NEXT:    andq %rax, %rbx
+; X64-NOBMI-NEXT:    movq %rax, %rdi
+; X64-NOBMI-NEXT:    callq use64
+; X64-NOBMI-NEXT:    movq %rbx, %rax
+; X64-NOBMI-NEXT:    popq %rbx
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_c2_load:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
-; X64-BMI1NOTBM-NEXT:    bextrq %rsi, (%rdi), %rax
+; X64-BMI1NOTBM-NEXT:    pushq %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
+; X64-BMI1NOTBM-NEXT:    negl %ecx
+; X64-BMI1NOTBM-NEXT:    movq $-1, %rax
+; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rbx
+; X64-BMI1NOTBM-NEXT:    andq %rax, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rax, %rdi
+; X64-BMI1NOTBM-NEXT:    callq use64
+; X64-BMI1NOTBM-NEXT:    movq %rbx, %rax
+; X64-BMI1NOTBM-NEXT:    popq %rbx
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_c2_load:
 ; X64-BMI1BMI2:       # %bb.0:
-; X64-BMI1BMI2-NEXT:    bzhiq %rsi, (%rdi), %rax
+; X64-BMI1BMI2-NEXT:    pushq %rbx
+; X64-BMI1BMI2-NEXT:    bzhiq %rsi, (%rdi), %rbx
+; X64-BMI1BMI2-NEXT:    # kill: def $esi killed $esi killed $rsi def $rsi
+; X64-BMI1BMI2-NEXT:    negl %esi
+; X64-BMI1BMI2-NEXT:    movq $-1, %rax
+; X64-BMI1BMI2-NEXT:    shrxq %rsi, %rax, %rdi
+; X64-BMI1BMI2-NEXT:    callq use64
+; X64-BMI1BMI2-NEXT:    movq %rbx, %rax
+; X64-BMI1BMI2-NEXT:    popq %rbx
 ; X64-BMI1BMI2-NEXT:    retq
   %val = load i64, i64* %w
   %numhighbits = sub i64 64, %numlowbits
   %mask = lshr i64 -1, %numhighbits
+  call void @use64(i64 %mask)
   %masked = and i64 %mask, %val
   ret i64 %masked
 }
@@ -1946,92 +2469,155 @@ define i64 @bzhi64_c2_load(i64* %w, i64 %numlowbits) nounwind {
 define i64 @bzhi64_c3_load_indexzext(i64* %w, i8 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi64_c3_load_indexzext:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %ebx
+; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movb $64, %cl
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl $-1, %eax
-; X86-NOBMI-NEXT:    movl $-1, %edx
-; X86-NOBMI-NEXT:    shrl %cl, %edx
+; X86-NOBMI-NEXT:    movl $-1, %ebx
+; X86-NOBMI-NEXT:    shrl %cl, %ebx
 ; X86-NOBMI-NEXT:    shrdl %cl, %eax, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB28_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %edx, %eax
-; X86-NOBMI-NEXT:    xorl %edx, %edx
+; X86-NOBMI-NEXT:    movl %ebx, %eax
+; X86-NOBMI-NEXT:    xorl %ebx, %ebx
 ; X86-NOBMI-NEXT:  .LBB28_2:
-; X86-NOBMI-NEXT:    andl (%esi), %eax
-; X86-NOBMI-NEXT:    andl 4(%esi), %edx
+; X86-NOBMI-NEXT:    movl (%edx), %esi
+; X86-NOBMI-NEXT:    andl %eax, %esi
+; X86-NOBMI-NEXT:    movl 4(%edx), %edi
+; X86-NOBMI-NEXT:    andl %ebx, %edi
+; X86-NOBMI-NEXT:    subl $8, %esp
+; X86-NOBMI-NEXT:    pushl %ebx
+; X86-NOBMI-NEXT:    pushl %eax
+; X86-NOBMI-NEXT:    calll use64
+; X86-NOBMI-NEXT:    addl $16, %esp
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    movl %edi, %edx
 ; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    popl %edi
+; X86-NOBMI-NEXT:    popl %ebx
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi64_c3_load_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    pushl %ebx
+; X86-BMI1NOTBM-NEXT:    pushl %edi
 ; X86-BMI1NOTBM-NEXT:    pushl %esi
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1NOTBM-NEXT:    movb $64, %cl
 ; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %eax
-; X86-BMI1NOTBM-NEXT:    movl $-1, %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %ebx
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %eax
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    je .LBB28_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
-; X86-BMI1NOTBM-NEXT:    movl %edx, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
+; X86-BMI1NOTBM-NEXT:    movl %ebx, %eax
+; X86-BMI1NOTBM-NEXT:    xorl %ebx, %ebx
 ; X86-BMI1NOTBM-NEXT:  .LBB28_2:
-; X86-BMI1NOTBM-NEXT:    andl (%esi), %eax
-; X86-BMI1NOTBM-NEXT:    andl 4(%esi), %edx
+; X86-BMI1NOTBM-NEXT:    movl (%edx), %esi
+; X86-BMI1NOTBM-NEXT:    andl %eax, %esi
+; X86-BMI1NOTBM-NEXT:    movl 4(%edx), %edi
+; X86-BMI1NOTBM-NEXT:    andl %ebx, %edi
+; X86-BMI1NOTBM-NEXT:    subl $8, %esp
+; X86-BMI1NOTBM-NEXT:    pushl %ebx
+; X86-BMI1NOTBM-NEXT:    pushl %eax
+; X86-BMI1NOTBM-NEXT:    calll use64
+; X86-BMI1NOTBM-NEXT:    addl $16, %esp
+; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    movl %edi, %edx
 ; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    popl %edi
+; X86-BMI1NOTBM-NEXT:    popl %ebx
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi64_c3_load_indexzext:
 ; X86-BMI1BMI2:       # %bb.0:
+; X86-BMI1BMI2-NEXT:    pushl %ebx
+; X86-BMI1BMI2-NEXT:    pushl %edi
 ; X86-BMI1BMI2-NEXT:    pushl %esi
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1BMI2-NEXT:    movb $64, %cl
 ; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1BMI2-NEXT:    movl $-1, %eax
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %edx
+; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %ebx
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %eax
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
 ; X86-BMI1BMI2-NEXT:    je .LBB28_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
-; X86-BMI1BMI2-NEXT:    movl %edx, %eax
-; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X86-BMI1BMI2-NEXT:    xorl %ebx, %ebx
 ; X86-BMI1BMI2-NEXT:  .LBB28_2:
-; X86-BMI1BMI2-NEXT:    andl (%esi), %eax
-; X86-BMI1BMI2-NEXT:    andl 4(%esi), %edx
+; X86-BMI1BMI2-NEXT:    movl (%edx), %esi
+; X86-BMI1BMI2-NEXT:    andl %eax, %esi
+; X86-BMI1BMI2-NEXT:    movl 4(%edx), %edi
+; X86-BMI1BMI2-NEXT:    andl %ebx, %edi
+; X86-BMI1BMI2-NEXT:    subl $8, %esp
+; X86-BMI1BMI2-NEXT:    pushl %ebx
+; X86-BMI1BMI2-NEXT:    pushl %eax
+; X86-BMI1BMI2-NEXT:    calll use64
+; X86-BMI1BMI2-NEXT:    addl $16, %esp
+; X86-BMI1BMI2-NEXT:    movl %esi, %eax
+; X86-BMI1BMI2-NEXT:    movl %edi, %edx
 ; X86-BMI1BMI2-NEXT:    popl %esi
+; X86-BMI1BMI2-NEXT:    popl %edi
+; X86-BMI1BMI2-NEXT:    popl %ebx
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi64_c3_load_indexzext:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %rbx
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
-; X64-NOBMI-NEXT:    movq (%rdi), %rax
 ; X64-NOBMI-NEXT:    negb %cl
-; X64-NOBMI-NEXT:    shlq %cl, %rax
+; X64-NOBMI-NEXT:    movq $-1, %rax
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NOBMI-NEXT:    shrq %cl, %rax
+; X64-NOBMI-NEXT:    movq (%rdi), %rbx
+; X64-NOBMI-NEXT:    andq %rax, %rbx
+; X64-NOBMI-NEXT:    movq %rax, %rdi
+; X64-NOBMI-NEXT:    callq use64
+; X64-NOBMI-NEXT:    movq %rbx, %rax
+; X64-NOBMI-NEXT:    popq %rbx
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_c3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    # kill: def $esi killed $esi def $rsi
-; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
-; X64-BMI1NOTBM-NEXT:    bextrq %rsi, (%rdi), %rax
+; X64-BMI1NOTBM-NEXT:    pushq %rbx
+; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
+; X64-BMI1NOTBM-NEXT:    negb %cl
+; X64-BMI1NOTBM-NEXT:    movq $-1, %rax
+; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rbx
+; X64-BMI1NOTBM-NEXT:    andq %rax, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rax, %rdi
+; X64-BMI1NOTBM-NEXT:    callq use64
+; X64-BMI1NOTBM-NEXT:    movq %rbx, %rax
+; X64-BMI1NOTBM-NEXT:    popq %rbx
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_c3_load_indexzext:
 ; X64-BMI1BMI2:       # %bb.0:
+; X64-BMI1BMI2-NEXT:    pushq %rbx
 ; X64-BMI1BMI2-NEXT:    # kill: def $esi killed $esi def $rsi
-; X64-BMI1BMI2-NEXT:    bzhiq %rsi, (%rdi), %rax
+; X64-BMI1BMI2-NEXT:    bzhiq %rsi, (%rdi), %rbx
+; X64-BMI1BMI2-NEXT:    # kill: def $sil killed $sil killed $rsi def $rsi
+; X64-BMI1BMI2-NEXT:    negb %sil
+; X64-BMI1BMI2-NEXT:    movq $-1, %rax
+; X64-BMI1BMI2-NEXT:    shrxq %rsi, %rax, %rdi
+; X64-BMI1BMI2-NEXT:    callq use64
+; X64-BMI1BMI2-NEXT:    movq %rbx, %rax
+; X64-BMI1BMI2-NEXT:    popq %rbx
 ; X64-BMI1BMI2-NEXT:    retq
   %val = load i64, i64* %w
   %numhighbits = sub i8 64, %numlowbits
   %sh_prom = zext i8 %numhighbits to i64
   %mask = lshr i64 -1, %sh_prom
+  call void @use64(i64 %mask)
   %masked = and i64 %mask, %val
   ret i64 %masked
 }
@@ -2039,79 +2625,156 @@ define i64 @bzhi64_c3_load_indexzext(i64* %w, i8 %numlowbits) nounwind {
 define i64 @bzhi64_c4_commutative(i64 %val, i64 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi64_c4_commutative:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %edi
+; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    pushl %eax
 ; X86-NOBMI-NEXT:    movl $64, %ecx
 ; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    movl $-1, %eax
-; X86-NOBMI-NEXT:    movl $-1, %edx
-; X86-NOBMI-NEXT:    shrl %cl, %edx
-; X86-NOBMI-NEXT:    shrdl %cl, %eax, %eax
+; X86-NOBMI-NEXT:    movl $-1, %esi
+; X86-NOBMI-NEXT:    movl $-1, %edi
+; X86-NOBMI-NEXT:    shrl %cl, %edi
+; X86-NOBMI-NEXT:    shrdl %cl, %esi, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB29_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %edx, %eax
-; X86-NOBMI-NEXT:    xorl %edx, %edx
+; X86-NOBMI-NEXT:    movl %edi, %esi
+; X86-NOBMI-NEXT:    xorl %edi, %edi
 ; X86-NOBMI-NEXT:  .LBB29_2:
-; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-NOBMI-NEXT:    subl $8, %esp
+; X86-NOBMI-NEXT:    pushl %edi
+; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    calll use64
+; X86-NOBMI-NEXT:    addl $16, %esp
+; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %edi
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    movl %edi, %edx
+; X86-NOBMI-NEXT:    addl $4, %esp
+; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi64_c4_commutative:
 ; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    pushl %edi
+; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    pushl %eax
 ; X86-BMI1NOTBM-NEXT:    movl $64, %ecx
 ; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    movl $-1, %eax
-; X86-BMI1NOTBM-NEXT:    movl $-1, %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %eax
+; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
+; X86-BMI1NOTBM-NEXT:    movl $-1, %edi
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
+; X86-BMI1NOTBM-NEXT:    shrdl %cl, %esi, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    je .LBB29_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
-; X86-BMI1NOTBM-NEXT:    movl %edx, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
+; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
+; X86-BMI1NOTBM-NEXT:    xorl %edi, %edi
 ; X86-BMI1NOTBM-NEXT:  .LBB29_2:
-; X86-BMI1NOTBM-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    subl $8, %esp
+; X86-BMI1NOTBM-NEXT:    pushl %edi
+; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    calll use64
+; X86-BMI1NOTBM-NEXT:    addl $16, %esp
+; X86-BMI1NOTBM-NEXT:    andl {{[0-9]+}}(%esp), %esi
+; X86-BMI1NOTBM-NEXT:    andl {{[0-9]+}}(%esp), %edi
+; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    movl %edi, %edx
+; X86-BMI1NOTBM-NEXT:    addl $4, %esp
+; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    popl %edi
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi64_c4_commutative:
 ; X86-BMI1BMI2:       # %bb.0:
+; X86-BMI1BMI2-NEXT:    pushl %edi
+; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    pushl %eax
 ; X86-BMI1BMI2-NEXT:    movl $64, %ecx
 ; X86-BMI1BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1BMI2-NEXT:    movl $-1, %eax
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %edx
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %eax
+; X86-BMI1BMI2-NEXT:    movl $-1, %esi
+; X86-BMI1BMI2-NEXT:    shrxl %ecx, %esi, %edi
+; X86-BMI1BMI2-NEXT:    shrdl %cl, %esi, %esi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
 ; X86-BMI1BMI2-NEXT:    je .LBB29_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
-; X86-BMI1BMI2-NEXT:    movl %edx, %eax
-; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI1BMI2-NEXT:    movl %edi, %esi
+; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
 ; X86-BMI1BMI2-NEXT:  .LBB29_2:
-; X86-BMI1BMI2-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X86-BMI1BMI2-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-BMI1BMI2-NEXT:    subl $8, %esp
+; X86-BMI1BMI2-NEXT:    pushl %edi
+; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    calll use64
+; X86-BMI1BMI2-NEXT:    addl $16, %esp
+; X86-BMI1BMI2-NEXT:    andl {{[0-9]+}}(%esp), %esi
+; X86-BMI1BMI2-NEXT:    andl {{[0-9]+}}(%esp), %edi
+; X86-BMI1BMI2-NEXT:    movl %esi, %eax
+; X86-BMI1BMI2-NEXT:    movl %edi, %edx
+; X86-BMI1BMI2-NEXT:    addl $4, %esp
+; X86-BMI1BMI2-NEXT:    popl %esi
+; X86-BMI1BMI2-NEXT:    popl %edi
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi64_c4_commutative:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %r14
+; X64-NOBMI-NEXT:    pushq %rbx
+; X64-NOBMI-NEXT:    pushq %rax
 ; X64-NOBMI-NEXT:    movq %rsi, %rcx
-; X64-NOBMI-NEXT:    movq %rdi, %rax
+; X64-NOBMI-NEXT:    movq %rdi, %r14
 ; X64-NOBMI-NEXT:    negl %ecx
-; X64-NOBMI-NEXT:    shlq %cl, %rax
+; X64-NOBMI-NEXT:    movq $-1, %rbx
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-NOBMI-NEXT:    shrq %cl, %rax
+; X64-NOBMI-NEXT:    shrq %cl, %rbx
+; X64-NOBMI-NEXT:    movq %rbx, %rdi
+; X64-NOBMI-NEXT:    callq use64
+; X64-NOBMI-NEXT:    andq %r14, %rbx
+; X64-NOBMI-NEXT:    movq %rbx, %rax
+; X64-NOBMI-NEXT:    addq $8, %rsp
+; X64-NOBMI-NEXT:    popq %rbx
+; X64-NOBMI-NEXT:    popq %r14
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_c4_commutative:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
-; X64-BMI1NOTBM-NEXT:    bextrq %rsi, %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    pushq %r14
+; X64-BMI1NOTBM-NEXT:    pushq %rbx
+; X64-BMI1NOTBM-NEXT:    pushq %rax
+; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
+; X64-BMI1NOTBM-NEXT:    movq %rdi, %r14
+; X64-BMI1NOTBM-NEXT:    negl %ecx
+; X64-BMI1NOTBM-NEXT:    movq $-1, %rbx
+; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rbx, %rdi
+; X64-BMI1NOTBM-NEXT:    callq use64
+; X64-BMI1NOTBM-NEXT:    andq %r14, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rbx, %rax
+; X64-BMI1NOTBM-NEXT:    addq $8, %rsp
+; X64-BMI1NOTBM-NEXT:    popq %rbx
+; X64-BMI1NOTBM-NEXT:    popq %r14
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_c4_commutative:
 ; X64-BMI1BMI2:       # %bb.0:
-; X64-BMI1BMI2-NEXT:    bzhiq %rsi, %rdi, %rax
+; X64-BMI1BMI2-NEXT:    pushq %r14
+; X64-BMI1BMI2-NEXT:    pushq %rbx
+; X64-BMI1BMI2-NEXT:    pushq %rax
+; X64-BMI1BMI2-NEXT:    movq %rsi, %rbx
+; X64-BMI1BMI2-NEXT:    movq %rdi, %r14
+; X64-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X64-BMI1BMI2-NEXT:    negl %eax
+; X64-BMI1BMI2-NEXT:    movq $-1, %rcx
+; X64-BMI1BMI2-NEXT:    shrxq %rax, %rcx, %rdi
+; X64-BMI1BMI2-NEXT:    callq use64
+; X64-BMI1BMI2-NEXT:    bzhiq %rbx, %r14, %rax
+; X64-BMI1BMI2-NEXT:    addq $8, %rsp
+; X64-BMI1BMI2-NEXT:    popq %rbx
+; X64-BMI1BMI2-NEXT:    popq %r14
 ; X64-BMI1BMI2-NEXT:    retq
   %numhighbits = sub i64 64, %numlowbits
   %mask = lshr i64 -1, %numhighbits
+  call void @use64(i64 %mask)
   %masked = and i64 %val, %mask ; swapped order
   ret i64 %masked
 }
-- 
GitLab


From ae66d6c10922f7562dffabc9926901e4943b331a Mon Sep 17 00:00:00 2001
From: Roman Lebedev <lebedev.ri@gmail.com>
Date: Tue, 30 Oct 2018 11:12:34 +0000
Subject: [PATCH 0736/1116] [X86][BMI1] X86DAGToDAGISel: select BEXTR from x & 
 (-1 >> (32 - y)) pattern

Summary:
The final pattern.
There is no test changes:
* We are looking for the pattern with one-use of it's mask,
* If the mask is one-use, D48768 will unfold it into pattern d.
* Thus, the tests have extra-use on the mask.
* Thus, only the BMI2 BZHI can be tested, and it already worked.
* So there is no BMI1 test coverage, we just assume it works since it uses the same codepath.

Reviewers: craig.topper, RKSimon

Reviewed By: RKSimon

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D53575

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345584 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelDAGToDAG.cpp | 58 ++++++++++++++++++++----------
 lib/Target/X86/X86InstrInfo.td     | 40 ---------------------
 2 files changed, 40 insertions(+), 58 deletions(-)

diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 83d5be34dc7..717ecc031c0 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -2749,11 +2749,45 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
     return true;
   };
 
+  // Match potentially-truncated (bitwidth - y)
+  auto matchShiftAmt = [checkOneUse, Size, &NBits](SDValue ShiftAmt) {
+    // Skip over a truncate of the shift amount.
+    if (ShiftAmt.getOpcode() == ISD::TRUNCATE) {
+      ShiftAmt = ShiftAmt.getOperand(0);
+      // The trunc should have been the only user of the real shift amount.
+      if (!checkOneUse(ShiftAmt))
+        return false;
+    }
+    // Match the shift amount as: (bitwidth - y). It should go away, too.
+    if (ShiftAmt.getOpcode() != ISD::SUB)
+      return false;
+    auto V0 = dyn_cast<ConstantSDNode>(ShiftAmt.getOperand(0));
+    if (!V0 || V0->getZExtValue() != Size)
+      return false;
+    NBits = ShiftAmt.getOperand(1);
+    return true;
+  };
+
+  // c) x &  (-1 >> (32 - y))
+  auto matchPatternC = [&checkOneUse, matchShiftAmt](SDValue Mask) -> bool {
+    // Match `l>>`. Must only have one use!
+    if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
+      return false;
+    // We should be shifting all-ones constant.
+    if (!isAllOnesConstant(Mask.getOperand(0)))
+      return false;
+    SDValue M1 = Mask.getOperand(1);
+    // The shift amount should not be used externally.
+    if (!checkOneUse(M1))
+      return false;
+    return matchShiftAmt(M1);
+  };
+
   SDValue X;
 
   // d) x << (32 - y) >> (32 - y)
-  auto matchPatternD = [&checkOneUse, &checkTwoUse, Size, &X,
-                        &NBits](SDNode *Node) -> bool {
+  auto matchPatternD = [&checkOneUse, &checkTwoUse, matchShiftAmt,
+                        &X](SDNode *Node) -> bool {
     if (Node->getOpcode() != ISD::SRL)
       return false;
     SDValue N0 = Node->getOperand(0);
@@ -2765,28 +2799,16 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
     // There should not be any uses of the shift amount outside of the pattern.
     if (N1 != N01 || !checkTwoUse(N1))
       return false;
-    // Skip over a truncate of the shift amount.
-    if (N1->getOpcode() == ISD::TRUNCATE) {
-      N1 = N1->getOperand(0);
-      // The trunc should have been the only user of the real shift amount.
-      if (!checkOneUse(N1))
-        return false;
-    }
-    // Match the shift amount as: (bitwidth - y). It should go away, too.
-    if (N1.getOpcode() != ISD::SUB)
-      return false;
-    auto N10 = dyn_cast<ConstantSDNode>(N1.getOperand(0));
-    if (!N10 || N10->getZExtValue() != Size)
+    if (!matchShiftAmt(N1))
       return false;
     X = N0->getOperand(0);
-    NBits = N1.getOperand(1);
     return true;
   };
 
-  auto matchLowBitMask = [&matchPatternA,
-                          &matchPatternB](SDValue Mask) -> bool {
+  auto matchLowBitMask = [&matchPatternA, &matchPatternB,
+                          &matchPatternC](SDValue Mask) -> bool {
     // FIXME: pattern c.
-    return matchPatternA(Mask) || matchPatternB(Mask);
+    return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
   };
 
   if (Node->getOpcode() == ISD::AND) {
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 8d3f7c856d0..992e9543b33 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -2499,46 +2499,6 @@ let Predicates = [HasBMI2, NoTBM] in {
                              (MOV8ri (CountTrailingOnes imm:$mask)), sub_8bit))>;
 }
 
-let Predicates = [HasBMI2] in {
-  multiclass _bmi_bzhi_pattern<dag regpattern, dag mempattern, RegisterClass RC,
-                               ValueType VT, Instruction DstInst,
-                               Instruction DstMemInst> {
-    def : Pat<regpattern,
-              (DstInst RC:$src,
-                (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
-    def : Pat<mempattern,
-              (DstMemInst addr:$src,
-                (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
-  }
-
-  multiclass bmi_bzhi_patterns<RegisterClass RC, int bitwidth, ValueType VT,
-                               Instruction DstInst, X86MemOperand x86memop,
-                               Instruction DstMemInst> {
-    // x & (-1 >> (bitwidth - y))
-    defm : _bmi_bzhi_pattern<(and RC:$src, (srl -1, (sub bitwidth, GR8:$lz))),
-                             (and (x86memop addr:$src),
-                                  (srl -1, (sub bitwidth, GR8:$lz))),
-                             RC, VT, DstInst, DstMemInst>;
-  }
-
-  defm : bmi_bzhi_patterns<GR32, 32, i32, BZHI32rr, loadi32, BZHI32rm>;
-  defm : bmi_bzhi_patterns<GR64, 64, i64, BZHI64rr, loadi64, BZHI64rm>;
-
-  // x & (-1 >> (32 - y))
-  def : Pat<(and GR32:$src, (srl -1, (i8 (trunc (sub 32, GR32:$lz))))),
-            (BZHI32rr GR32:$src, GR32:$lz)>;
-  def : Pat<(and (loadi32 addr:$src), (srl -1, (i8 (trunc (sub 32, GR32:$lz))))),
-            (BZHI32rm addr:$src, GR32:$lz)>;
-
-  // x & (-1 >> (64 - y))
-  def : Pat<(and GR64:$src, (srl -1, (i8 (trunc (sub 64, GR32:$lz))))),
-            (BZHI64rr GR64:$src,
-              (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>;
-  def : Pat<(and (loadi64 addr:$src), (srl -1, (i8 (trunc (sub 64, GR32:$lz))))),
-            (BZHI64rm addr:$src,
-              (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>;
-} // HasBMI2
-
 multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC,
                          X86MemOperand x86memop, Intrinsic Int,
                          PatFrag ld_frag> {
-- 
GitLab


From 8125e33cdb819f6c4100bf310777d289c27c54d0 Mon Sep 17 00:00:00 2001
From: Nicola Zaghen <nicola.zaghen@imgtec.com>
Date: Tue, 30 Oct 2018 11:15:04 +0000
Subject: [PATCH 0737/1116] [SROA] Use offset sizes from the DataLayout instead
 of the pointer siezes.

This fixes an assertion when constant folding a GEP when the part of the offset
was in i32 (IndexSize, as per DataLayout) and part in the i64 (PointerSize) in
the newly created test case.

Differential Revision: https://reviews.llvm.org/D52609


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345585 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/SROA.cpp              | 12 ++++-----
 test/Transforms/SROA/pointer-offset-size.ll | 29 +++++++++++++++++++++
 2 files changed, 35 insertions(+), 6 deletions(-)
 create mode 100644 test/Transforms/SROA/pointer-offset-size.ll

diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp
index 0f43ee6bbd7..a8b9ee56639 100644
--- a/lib/Transforms/Scalar/SROA.cpp
+++ b/lib/Transforms/Scalar/SROA.cpp
@@ -1400,8 +1400,8 @@ static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL,
   if (Ty == TargetTy)
     return buildGEP(IRB, BasePtr, Indices, NamePrefix);
 
-  // Pointer size to use for the indices.
-  unsigned PtrSize = DL.getPointerTypeSizeInBits(BasePtr->getType());
+  // Offset size to use for the indices.
+  unsigned OffsetSize = DL.getIndexTypeSizeInBits(BasePtr->getType());
 
   // See if we can descend into a struct and locate a field with the correct
   // type.
@@ -1413,7 +1413,7 @@ static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL,
 
     if (ArrayType *ArrayTy = dyn_cast<ArrayType>(ElementTy)) {
       ElementTy = ArrayTy->getElementType();
-      Indices.push_back(IRB.getIntN(PtrSize, 0));
+      Indices.push_back(IRB.getIntN(OffsetSize, 0));
     } else if (VectorType *VectorTy = dyn_cast<VectorType>(ElementTy)) {
       ElementTy = VectorTy->getElementType();
       Indices.push_back(IRB.getInt32(0));
@@ -2377,7 +2377,7 @@ private:
 #endif
 
     return getAdjustedPtr(IRB, DL, &NewAI,
-                          APInt(DL.getPointerTypeSizeInBits(PointerTy), Offset),
+                          APInt(DL.getIndexTypeSizeInBits(PointerTy), Offset),
                           PointerTy,
 #ifndef NDEBUG
                           Twine(OldName) + "."
@@ -2899,8 +2899,8 @@ private:
     unsigned OtherAS = OtherPtrTy->getPointerAddressSpace();
 
     // Compute the relative offset for the other pointer within the transfer.
-    unsigned IntPtrWidth = DL.getPointerSizeInBits(OtherAS);
-    APInt OtherOffset(IntPtrWidth, NewBeginOffset - BeginOffset);
+    unsigned OffsetWidth = DL.getIndexSizeInBits(OtherAS);
+    APInt OtherOffset(OffsetWidth, NewBeginOffset - BeginOffset);
     unsigned OtherAlign =
       IsDest ? II.getSourceAlignment() : II.getDestAlignment();
     OtherAlign =  MinAlign(OtherAlign ? OtherAlign : 1,
diff --git a/test/Transforms/SROA/pointer-offset-size.ll b/test/Transforms/SROA/pointer-offset-size.ll
new file mode 100644
index 00000000000..c632c37988b
--- /dev/null
+++ b/test/Transforms/SROA/pointer-offset-size.ll
@@ -0,0 +1,29 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -sroa -S | FileCheck %s
+target datalayout = "e-p:64:64:64:32"
+
+%struct.test = type { %struct.basic, %struct.basic }
+%struct.basic = type { i16, i8 }
+
+define i16 @test(%struct.test* %ts2.i) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[S_SROA_0:%.*]] = alloca [3 x i8], align 2
+; CHECK-NEXT:    [[S_SROA_0_0__SROA_CAST:%.*]] = bitcast %struct.test* [[TS2_I:%.*]] to i8*
+; CHECK-NEXT:    [[S_SROA_0_0__SROA_IDX:%.*]] = getelementptr inbounds [3 x i8], [3 x i8]* [[S_SROA_0]], i32 0, i32 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 [[S_SROA_0_0__SROA_CAST]], i8* align 2 [[S_SROA_0_0__SROA_IDX]], i32 3, i1 false)
+; CHECK-NEXT:    [[X1_I_I:%.*]] = getelementptr inbounds [[STRUCT_TEST:%.*]], %struct.test* [[TS2_I]], i32 0, i32 0, i32 0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, i16* [[X1_I_I]]
+; CHECK-NEXT:    ret i16 [[TMP0]]
+;
+entry:
+  %s = alloca %struct.test
+  %0 = bitcast %struct.test* %ts2.i to i8*
+  %1 = bitcast %struct.test* %s to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %0, i8* %1, i32 3, i1 false)
+  %x1.i.i = getelementptr inbounds %struct.test, %struct.test* %ts2.i, i32 0, i32 0, i32 0
+  %2 = load i16, i16* %x1.i.i
+  ret i16 %2
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture writeonly, i8* nocapture readonly, i32, i1)
-- 
GitLab


From 7a9cc35ddd8187330514e492ed90ab53f32a54f7 Mon Sep 17 00:00:00 2001
From: "Diogo N. Sampaio" <diogo.sampaio@arm.com>
Date: Tue, 30 Oct 2018 11:39:33 +0000
Subject: [PATCH 0738/1116] [FIX][AArch64] Add support for UDF instruction

Fix wrong test files submited
in rL345581


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345587 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/MC/AArch64/udf.s                | 12 +++---------
 test/MC/Disassembler/AArch64/udf.txt | 22 +++++-----------------
 2 files changed, 8 insertions(+), 26 deletions(-)

diff --git a/test/MC/AArch64/udf.s b/test/MC/AArch64/udf.s
index f257157b826..a6a345b1ff6 100644
--- a/test/MC/AArch64/udf.s
+++ b/test/MC/AArch64/udf.s
@@ -1,15 +1,9 @@
 # RUN: llvm-mc -assemble -show-encoding -triple=aarch64- %s | FileCheck %s
 # CHECK:  .text
 # CHECK-NEXT: udf #0      // encoding: [0x00,0x00,0x00,0x00]
-# CHECK-NEXT: udf #1      // encoding: [0x01,0x00,0x00,0x00]
-# CHECK-NEXT: udf #16     // encoding: [0x10,0x00,0x00,0x00]
-# CHECK-NEXT: udf #32     // encoding: [0x20,0x00,0x00,0x00]
-# CHECK-NEXT: udf #48     // encoding: [0x30,0x00,0x00,0x00]
-# CHECK-NEXT: udf #65535      // encoding: [0xff,0xff,0x00,0x00]
+# CHECK-NEXT: udf #513    // encoding: [0x01,0x02,0x00,0x00]
+# CHECK-NEXT: udf #65535  // encoding: [0xff,0xff,0x00,0x00]
 .text
 udf 0
-udf 1
-udf 16
-udf 32
-udf 48
+udf 513
 udf 65535
diff --git a/test/MC/Disassembler/AArch64/udf.txt b/test/MC/Disassembler/AArch64/udf.txt
index 7f8385b0062..6b3a252f198 100644
--- a/test/MC/Disassembler/AArch64/udf.txt
+++ b/test/MC/Disassembler/AArch64/udf.txt
@@ -4,27 +4,15 @@
 # RUN: llvm-objdump -r -d --triple=arm64- - | \
 # RUN: FileCheck %s -check-prefix=OBJ
 [0x00,0x00,0x00,0x00]
-[0x01,0x00,0x00,0x00]
-[0x10,0x00,0x00,0x00]
-[0x20,0x00,0x00,0x00]
-[0x30,0x00,0x00,0x00]
+[0x01,0x02,0x00,0x00]
 [0xff,0xff,0x00,0x00]
-[0x00,0xfd,0x00,0x00]
 # CHECK: .text
 # CHECK-NEXT: udf #0
-# CHECK-NEXT: udf #1
-# CHECK-NEXT: udf #16
-# CHECK-NEXT: udf #32
-# CHECK-NEXT: udf #48
-# CHECK-NEXT: udf #32767
-# CHECK-NEXT: udf #32000
+# CHECK-NEXT: udf #513
+# CHECK-NEXT: udf #65535
 
 #OBJ: Disassembly of section .text:
 #OBJ-NEXT: $x.0:
 #OBJ-NEXT:        0:	00 00 00 00 	udf	#0
-#OBJ-NEXT:        4:	01 00 00 00 	udf	#1
-#OBJ-NEXT:        8:	10 00 00 00 	udf	#16
-#OBJ-NEXT:        c:	20 00 00 00 	udf	#32
-#OBJ-NEXT:       10:	30 00 00 00 	udf	#48
-#OBJ-NEXT:       14:	ff 7f 00 00 	udf	#32767
-#OBJ-NEXT:       18:	00 7d 00 00 	udf	#32000
+#OBJ-NEXT:        4:	01 02 00 00 	udf	#513
+#OBJ-NEXT:        8:	ff ff 00 00 	udf	#65535
-- 
GitLab


From 077c1227927d6ca6f0cfb8c98f72a33e5086fcbf Mon Sep 17 00:00:00 2001
From: James Henderson <jh7370@my.bristol.ac.uk>
Date: Tue, 30 Oct 2018 11:52:47 +0000
Subject: [PATCH 0739/1116] [llvm-size] Reject unknown radix values

This addresses https://bugs.llvm.org/show_bug.cgi?id=39403 by making
-radix an enumeration option with 8, 10, and 16 as the only accepted
values.

Reviewed by: jhenderson, MaskRay

Differential Revision: https://reviews.llvm.org/D53799

Patch by Eugene Sharygin


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345588 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-size/llvm-size.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/tools/llvm-size/llvm-size.cpp b/tools/llvm-size/llvm-size.cpp
index ed53bacc7c3..ad1aefcafcb 100644
--- a/tools/llvm-size/llvm-size.cpp
+++ b/tools/llvm-size/llvm-size.cpp
@@ -71,9 +71,11 @@ ArchFlags("arch", cl::desc("architecture(s) from a Mach-O file to dump"),
 static bool ArchAll = false;
 
 enum RadixTy { octal = 8, decimal = 10, hexadecimal = 16 };
-static cl::opt<unsigned int>
-Radix("radix", cl::desc("Print size in radix. Only 8, 10, and 16 are valid"),
-      cl::init(decimal));
+static cl::opt<RadixTy> Radix(
+    "radix", cl::desc("Print size in radix"), cl::init(decimal),
+    cl::values(clEnumValN(octal, "8", "Print size in octal"),
+               clEnumValN(decimal, "10", "Print size in decimal"),
+               clEnumValN(hexadecimal, "16", "Print size in hexadecimal")));
 
 static cl::opt<RadixTy>
 RadixShort(cl::desc("Print size in radix:"),
@@ -865,7 +867,7 @@ int main(int argc, char **argv) {
   if (OutputFormatShort.getNumOccurrences())
     OutputFormat = static_cast<OutputFormatTy>(OutputFormatShort);
   if (RadixShort.getNumOccurrences())
-    Radix = RadixShort;
+    Radix = RadixShort.getValue();
 
   for (unsigned i = 0; i < ArchFlags.size(); ++i) {
     if (ArchFlags[i] == "all") {
-- 
GitLab


From 6165c5d19d4e987c0e797927d7875032f5b4ebb7 Mon Sep 17 00:00:00 2001
From: Francis Visoiu Mistrih <francisvm@yahoo.com>
Date: Tue, 30 Oct 2018 12:07:18 +0000
Subject: [PATCH 0740/1116] [llc] Error out when -print-machineinstrs is used
 with an unknown pass

We used to assert instead of reporting an error.

PR39494

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345589 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/TargetPassConfig.cpp             | 20 +++++++++++---------
 test/CodeGen/X86/llc-print-machineinstrs.mir | 12 ++++++++++++
 2 files changed, 23 insertions(+), 9 deletions(-)
 create mode 100644 test/CodeGen/X86/llc-print-machineinstrs.mir

diff --git a/lib/CodeGen/TargetPassConfig.cpp b/lib/CodeGen/TargetPassConfig.cpp
index a3b24d1cd66..6a9c3c05f03 100644
--- a/lib/CodeGen/TargetPassConfig.cpp
+++ b/lib/CodeGen/TargetPassConfig.cpp
@@ -806,15 +806,17 @@ void TargetPassConfig::addMachinePasses() {
   AddingMachinePasses = true;
 
   // Insert a machine instr printer pass after the specified pass.
-  if (!StringRef(PrintMachineInstrs.getValue()).equals("") &&
-      !StringRef(PrintMachineInstrs.getValue()).equals("option-unspecified")) {
-    const PassRegistry *PR = PassRegistry::getPassRegistry();
-    const PassInfo *TPI = PR->getPassInfo(PrintMachineInstrs.getValue());
-    const PassInfo *IPI = PR->getPassInfo(StringRef("machineinstr-printer"));
-    assert (TPI && IPI && "Pass ID not registered!");
-    const char *TID = (const char *)(TPI->getTypeInfo());
-    const char *IID = (const char *)(IPI->getTypeInfo());
-    insertPass(TID, IID);
+  StringRef PrintMachineInstrsPassName = PrintMachineInstrs.getValue();
+  if (!PrintMachineInstrsPassName.equals("") &&
+      !PrintMachineInstrsPassName.equals("option-unspecified")) {
+    if (const PassInfo *TPI = getPassInfo(PrintMachineInstrsPassName)) {
+      const PassRegistry *PR = PassRegistry::getPassRegistry();
+      const PassInfo *IPI = PR->getPassInfo(StringRef("machineinstr-printer"));
+      assert(IPI && "failed to get \"machineinstr-printer\" PassInfo!");
+      const char *TID = (const char *)(TPI->getTypeInfo());
+      const char *IID = (const char *)(IPI->getTypeInfo());
+      insertPass(TID, IID);
+    }
   }
 
   // Print the instruction selected machine code...
diff --git a/test/CodeGen/X86/llc-print-machineinstrs.mir b/test/CodeGen/X86/llc-print-machineinstrs.mir
new file mode 100644
index 00000000000..a890840a478
--- /dev/null
+++ b/test/CodeGen/X86/llc-print-machineinstrs.mir
@@ -0,0 +1,12 @@
+# Check that -print-machineinstrs doesn't assert when it's passed an unknown pass name.
+# RUN: llc -mtriple=x86_64-- -start-before=greedy -print-machineinstrs=greedy %s -o /dev/null
+# RUN: not llc -mtriple=x86_64-- -start-before=greedy -print-machineinstrs=unknown %s -o /dev/null 2>&1 | FileCheck %s
+# CHECK: LLVM ERROR: "unknown" pass is not registered.
+
+...
+---
+name: fun
+tracksRegLiveness: true
+body: |
+  bb.0:
+    RET 0
-- 
GitLab


From 0d3744e3d7024e723cc6a09f99141f9e590d1549 Mon Sep 17 00:00:00 2001
From: Francis Visoiu Mistrih <francisvm@yahoo.com>
Date: Tue, 30 Oct 2018 12:20:17 +0000
Subject: [PATCH 0741/1116] [X86] Re-enable the machine verifier after fixing
 more tests

Was disabled again in r345528. Hopefully this the bots.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345593 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86TargetMachine.h          | 4 ----
 test/DebugInfo/Generic/linear-dbg-value.ll | 3 ++-
 test/ThinLTO/X86/cfi-devirt.ll             | 4 ++++
 test/ThinLTO/X86/devirt-after-icp.ll       | 4 ++++
 4 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h
index 5b21cd82b5b..f5b45da0c3d 100644
--- a/lib/Target/X86/X86TargetMachine.h
+++ b/lib/Target/X86/X86TargetMachine.h
@@ -53,10 +53,6 @@ public:
   TargetLoweringObjectFile *getObjFileLowering() const override {
     return TLOF.get();
   }
-
-  bool isMachineVerifierClean() const override {
-    return false;
-  }
 };
 
 } // end namespace llvm
diff --git a/test/DebugInfo/Generic/linear-dbg-value.ll b/test/DebugInfo/Generic/linear-dbg-value.ll
index 62cbc4442aa..2ea78eb3dae 100644
--- a/test/DebugInfo/Generic/linear-dbg-value.ll
+++ b/test/DebugInfo/Generic/linear-dbg-value.ll
@@ -1,4 +1,5 @@
-; RUN: llc -stop-before=expand-isel-pseudos -pre-RA-sched=linearize < %s | FileCheck %s
+; FIXME: Fix machine verifier issues and remove -verify-machineinstrs=0. PR39452.
+; RUN: llc -stop-before=expand-isel-pseudos -pre-RA-sched=linearize -verify-machineinstrs=0 < %s | FileCheck %s
 source_filename = "linear-dbg-value.ll"
 
 ; Function Attrs: nounwind readonly uwtable
diff --git a/test/ThinLTO/X86/cfi-devirt.ll b/test/ThinLTO/X86/cfi-devirt.ll
index 134da52857a..7ade794d498 100644
--- a/test/ThinLTO/X86/cfi-devirt.ll
+++ b/test/ThinLTO/X86/cfi-devirt.ll
@@ -5,7 +5,9 @@
 ; RUN: opt -thinlto-bc -o %t.o %s
 
 ; Legacy PM
+; FIXME: Fix machine verifier issues and remove -verify-machineinstrs=0. PR39436.
 ; RUN: llvm-lto2 run %t.o -save-temps -pass-remarks=. \
+; RUN:   -verify-machineinstrs=0 \
 ; RUN:   -o %t3 \
 ; RUN:   -r=%t.o,test,px \
 ; RUN:   -r=%t.o,_ZN1A1nEi,p \
@@ -22,7 +24,9 @@
 ; RUN: llvm-dis %t3.1.4.opt.bc -o - | FileCheck %s --check-prefix=CHECK-IR
 
 ; New PM
+; FIXME: Fix machine verifier issues and remove -verify-machineinstrs=0. PR39436.
 ; RUN: llvm-lto2 run %t.o -save-temps -use-new-pm -pass-remarks=. \
+; RUN:   -verify-machineinstrs=0 \
 ; RUN:   -o %t3 \
 ; RUN:   -r=%t.o,test,px \
 ; RUN:   -r=%t.o,_ZN1A1nEi,p \
diff --git a/test/ThinLTO/X86/devirt-after-icp.ll b/test/ThinLTO/X86/devirt-after-icp.ll
index b711e260c1e..987221787e2 100644
--- a/test/ThinLTO/X86/devirt-after-icp.ll
+++ b/test/ThinLTO/X86/devirt-after-icp.ll
@@ -45,7 +45,9 @@
 ; RUN: opt -thinlto-bc -o %t.o %s
 
 ; Legacy PM
+; FIXME: Fix machine verifier issues and remove -verify-machineinstrs=0. PR39436.
 ; RUN: llvm-lto2 run %t.o -save-temps -pass-remarks=. \
+; RUN:   -verify-machineinstrs=0 \
 ; RUN:   -o %t3 \
 ; RUN:   -r=%t.o,_Z3bazP1A,px \
 ; RUN:   -r=%t.o,_ZN1A3fooEv, \
@@ -63,7 +65,9 @@
 ; RUN: llvm-dis %t3.1.4.opt.bc -o - | FileCheck %s --check-prefix=CHECK-IR
 
 ; New PM
+; FIXME: Fix machine verifier issues and remove -verify-machineinstrs=0. PR39436.
 ; RUN: llvm-lto2 run %t.o -save-temps -use-new-pm -pass-remarks=. \
+; RUN:   -verify-machineinstrs=0 \
 ; RUN:   -o %t3 \
 ; RUN:   -r=%t.o,_Z3bazP1A,px \
 ; RUN:   -r=%t.o,_ZN1A3fooEv, \
-- 
GitLab


From bb950aec63a9579dc71de2329554862130f742a0 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulsson@linux.vnet.ibm.com>
Date: Tue, 30 Oct 2018 13:41:03 +0000
Subject: [PATCH 0742/1116] [SystemZ]  Improve isFoldableLoad() for Sub, SDiv
 and UDiv.

Sub, SDiv and UDiv are not commutative, so only the RHS operand can fold a
load. This patch adds a check for this.

Review: Ulrich Weigand
https://reviews.llvm.org/D53791

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345596 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../SystemZ/SystemZTargetTransformInfo.cpp    |   5 +
 .../SystemZ/memop-folding-int-arith.ll        | 215 +++++++++++++++---
 2 files changed, 184 insertions(+), 36 deletions(-)

diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 3bc87ef0225..caa3f597445 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -876,6 +876,11 @@ isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue) {
     UserI = cast<Instruction>(*UserI->user_begin());
     // Load (single use) -> trunc/extend (single use) -> UserI
   }
+  if ((UserI->getOpcode() == Instruction::Sub ||
+       UserI->getOpcode() == Instruction::SDiv ||
+       UserI->getOpcode() == Instruction::UDiv) &&
+      UserI->getOperand(1) != FoldedValue)
+    return false; // Not commutative, only RHS foldable.
   switch (UserI->getOpcode()) {
   case Instruction::Add: // SE: 16->32, 16/32->64, z14:16->64. ZE: 32->64
   case Instruction::Sub:
diff --git a/test/Analysis/CostModel/SystemZ/memop-folding-int-arith.ll b/test/Analysis/CostModel/SystemZ/memop-folding-int-arith.ll
index 8198386832e..d5c097ced62 100644
--- a/test/Analysis/CostModel/SystemZ/memop-folding-int-arith.ll
+++ b/test/Analysis/CostModel/SystemZ/memop-folding-int-arith.ll
@@ -85,7 +85,7 @@ define void @add() {
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %10 = add i32 %sext_3, undef
 }
 
-define void @sub() {
+define void @sub_lhs_mem() {
   %li32 = load i32, i32* undef
   sub i32 %li32, undef
 
@@ -131,30 +131,30 @@ define void @sub() {
 
   ret void;
 
-; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
+; A sub LHS loaded operand is *not* foldable.
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32 = load i32, i32* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %1 = sub i32 %li32, undef
-; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_0 = load i32, i32* undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_1 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_0 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_1 = load i32, i32* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %2 = sub i32 %li32_0, %li32_1
-; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %3 = sub i64 %li64, undef
-; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_0 = load i64, i64* undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_1 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_0 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_1 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = sub i64 %li64_0, %li64_1
-; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_2 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_2 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr = trunc i64 %li64_2 to i32
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %5 = sub i32 %tr, undef
-; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li16_0 = load i16, i16* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li16_0 = load i16, i16* undef
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_0 = sext i16 %li16_0 to i32
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %6 = sub i32 %sext_0, undef
-; Z13:   Cost Model: Found an estimated cost of 1 for instruction:   %li16_1 = load i16, i16* undef
-; Z14:   Cost Model: Found an estimated cost of 0 for instruction:   %li16_1 = load i16, i16* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li16_1 = load i16, i16* undef
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_1 = sext i16 %li16_1 to i64
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %7 = sub i64 %sext_1, undef
-; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_2 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_2 = load i32, i32* undef
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_2 = sext i32 %li32_2 to i64
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %8 = sub i64 %sext_2, undef
-; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_3 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_3 = load i32, i32* undef
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %zext_0 = zext i32 %li32_3 to i64
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %9 = sub i64 %zext_0, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li16_3 = load i16, i16* undef
@@ -163,6 +163,71 @@ define void @sub() {
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %10 = sub i32 %sext_3, undef
 }
 
+define void @sub_rhs_mem() {
+  %li32 = load i32, i32* undef
+  sub i32 undef, %li32
+
+  %li64 = load i64, i64* undef
+  sub i64 undef, %li64
+
+  ; Truncated load
+  %li64_2 = load i64, i64* undef
+  %tr = trunc i64 %li64_2 to i32
+  sub i32 undef, %tr
+
+  ; Sign-extended loads
+  %li16_0 = load i16, i16* undef
+  %sext_0 = sext i16 %li16_0 to i32
+  sub i32 undef, %sext_0
+
+  %li16_1 = load i16, i16* undef
+  %sext_1 = sext i16 %li16_1 to i64
+  sub i64 undef, %sext_1
+
+  %li32_2 = load i32, i32* undef
+  %sext_2 = sext i32 %li32_2 to i64
+  sub i64 undef, %sext_2
+
+  ; Zero-extended loads
+  %li32_3 = load i32, i32* undef
+  %zext_0 = zext i32 %li32_3 to i64
+  sub i64 undef, %zext_0
+
+  ; Loads with multiple uses are *not* folded
+  %li16_3 = load i16, i16* undef
+  %sext_3 = sext i16 %li16_3 to i32
+  %sext_4 = sext i16 %li16_3 to i32
+  sub i32 undef, %sext_3
+
+  ret void;
+
+; A sub RHS loaded operand is foldable.
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %1 = sub i32 undef, %li32
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %2 = sub i64 undef, %li64
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_2 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr = trunc i64 %li64_2 to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %3 = sub i32 undef, %tr
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li16_0 = load i16, i16* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_0 = sext i16 %li16_0 to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = sub i32 undef, %sext_0
+; Z13:   Cost Model: Found an estimated cost of 1 for instruction:   %li16_1 = load i16, i16* undef
+; Z14:   Cost Model: Found an estimated cost of 0 for instruction:   %li16_1 = load i16, i16* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_1 = sext i16 %li16_1 to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %5 = sub i64 undef, %sext_1
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_2 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_2 = sext i32 %li32_2 to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %6 = sub i64 undef, %sext_2
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_3 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %zext_0 = zext i32 %li32_3 to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %7 = sub i64 undef, %zext_0
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li16_3 = load i16, i16* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_3 = sext i16 %li16_3 to i32
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_4 = sext i16 %li16_3 to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %8 = sub i32 undef, %sext_3
+}
+
 define void @mul() {
   %li32 = load i32, i32* undef
   mul i32 %li32, undef
@@ -240,7 +305,7 @@ define void @mul() {
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %10 = mul i32 %sext_3, undef
 }
 
-define void @sdiv(i32 %arg32, i64 %arg64) {
+define void @sdiv_lhs(i32 %arg32, i64 %arg64) {
   %li32 = load i32, i32* undef
   sdiv i32 %li32, %arg32
 
@@ -272,29 +337,73 @@ define void @sdiv(i32 %arg32, i64 %arg64) {
   sdiv i64 %sext_1, undef
 
   ret void;
+
+; An sdiv loaded dividend (lhs) operand is *not* foldable.
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %1 = sdiv i32 %li32, %arg32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_0 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_1 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %2 = sdiv i32 %li32_0, %li32_1
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %3 = sdiv i64 %li64, %arg64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_0 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_1 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %4 = sdiv i64 %li64_0, %li64_1
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_2 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr = trunc i64 %li64_2 to i32
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %5 = sdiv i32 %tr, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_2 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_0 = sext i32 %li32_2 to i64
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %6 = sdiv i64 %sext_0, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_3 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_1 = sext i32 %li32_3 to i64
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_2 = sext i32 %li32_3 to i64
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %7 = sdiv i64 %sext_1, undef
+}
+
+define void @sdiv_rhs(i32 %arg32, i64 %arg64) {
+  %li32 = load i32, i32* undef
+  sdiv i32 %arg32, %li32
+
+  %li64 = load i64, i64* undef
+  sdiv i64 %arg64, %li64
+
+  ; Truncated load
+  %li64_2 = load i64, i64* undef
+  %tr = trunc i64 %li64_2 to i32
+  sdiv i32 undef, %tr
+
+  ; Sign-extended loads
+  %li32_2 = load i32, i32* undef
+  %sext_0 = sext i32 %li32_2 to i64
+  sdiv i64 undef, %sext_0
+
+  ; Loads with multiple uses are *not* folded
+  %li32_3 = load i32, i32* undef
+  %sext_1 = sext i32 %li32_3 to i64
+  %sext_2 = sext i32 %li32_3 to i64
+  sdiv i64 undef, %sext_1
+
+  ret void;
+
+; An sdiv loaded divisor (rhs) operand is foldable.
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
-; CHECK: Cost Model: Found an estimated cost of 21 for instruction:  %1 = sdiv i32 %li32, %arg32
-; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_0 = load i32, i32* undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_1 = load i32, i32* undef
-; CHECK: Cost Model: Found an estimated cost of 21 for instruction:  %2 = sdiv i32 %li32_0, %li32_1
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %1 = sdiv i32 %arg32, %li32
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64 = load i64, i64* undef
-; CHECK: Cost Model: Found an estimated cost of 20 for instruction:  %3 = sdiv i64 %li64, %arg64
-; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_0 = load i64, i64* undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_1 = load i64, i64* undef
-; CHECK: Cost Model: Found an estimated cost of 20 for instruction:  %4 = sdiv i64 %li64_0, %li64_1
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %2 = sdiv i64 %arg64, %li64
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_2 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr = trunc i64 %li64_2 to i32
-; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %5 = sdiv i32 %tr, undef
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %3 = sdiv i32 undef, %tr
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_2 = load i32, i32* undef
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_0 = sext i32 %li32_2 to i64
-; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %6 = sdiv i64 %sext_0, undef
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %4 = sdiv i64 undef, %sext_0
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_3 = load i32, i32* undef
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_1 = sext i32 %li32_3 to i64
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_2 = sext i32 %li32_3 to i64
-; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %7 = sdiv i64 %sext_1, undef
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %5 = sdiv i64 undef, %sext_1
 }
 
-define void @udiv(i32 %arg32, i64 %arg64) {
+define void @udiv_lhs(i32 %arg32, i64 %arg64) {
   %li32 = load i32, i32* undef
   udiv i32 %li32, %arg32
 
@@ -320,22 +429,56 @@ define void @udiv(i32 %arg32, i64 %arg64) {
   udiv i64 %li64_3, undef
 
   ret void;
+
+; An udiv loaded dividend (lhs) operand is *not* foldable.
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %1 = udiv i32 %li32, %arg32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_0 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_1 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %2 = udiv i32 %li32_0, %li32_1
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %3 = udiv i64 %li64, %arg64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_0 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_1 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %4 = udiv i64 %li64_0, %li64_1
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_2 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr_0 = trunc i64 %li64_2 to i32
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %5 = udiv i32 %tr_0, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_3 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr_1 = trunc i64 %li64_3 to i32
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %6 = udiv i64 %li64_3, undef
+}
+
+define void @udiv_rhs(i32 %arg32, i64 %arg64) {
+  %li32 = load i32, i32* undef
+  udiv i32 %arg32, %li32
+
+  %li64 = load i64, i64* undef
+  udiv i64 %arg64, %li64
+
+  ; Truncated load
+  %li64_2 = load i64, i64* undef
+  %tr_0 = trunc i64 %li64_2 to i32
+  udiv i32 undef, %tr_0
+
+  ; Loads with multiple uses are *not* folded
+  %li64_3 = load i64, i64* undef
+  %tr_1 = trunc i64 %li64_3 to i32
+  udiv i64 undef, %li64_3
+
+  ret void;
+
+; An udiv loaded divisor (rhs) operand is foldable.
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
-; CHECK: Cost Model: Found an estimated cost of 21 for instruction:  %1 = udiv i32 %li32, %arg32
-; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_0 = load i32, i32* undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_1 = load i32, i32* undef
-; CHECK: Cost Model: Found an estimated cost of 21 for instruction:  %2 = udiv i32 %li32_0, %li32_1
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %1 = udiv i32 %arg32, %li32
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64 = load i64, i64* undef
-; CHECK: Cost Model: Found an estimated cost of 21 for instruction:  %3 = udiv i64 %li64, %arg64
-; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_0 = load i64, i64* undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_1 = load i64, i64* undef
-; CHECK: Cost Model: Found an estimated cost of 21 for instruction:  %4 = udiv i64 %li64_0, %li64_1
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %2 = udiv i64 %arg64, %li64
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_2 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr_0 = trunc i64 %li64_2 to i32
-; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %5 = udiv i32 %tr_0, undef
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %3 = udiv i32 undef, %tr_0
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_3 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr_1 = trunc i64 %li64_3 to i32
-; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %6 = udiv i64 %li64_3, undef
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %4 = udiv i64 undef, %li64_3
 }
 
 define void @and() {
-- 
GitLab


From 237ef87e5bbeefb7cab5ba5a154329b377395649 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 30 Oct 2018 13:47:19 +0000
Subject: [PATCH 0743/1116] [SelectionDAG] fix build warning for mismatched
 signs in compare; NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345598 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 933898b17c8..84e955fd6f6 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3810,7 +3810,7 @@ static SDValue FoldBUILD_VECTOR(const SDLoc &DL, EVT VT,
                                 SelectionDAG &DAG) {
   int NumOps = Ops.size();
   assert(NumOps != 0 && "Can't build an empty vector!");
-  assert(VT.getVectorNumElements() == NumOps &&
+  assert(VT.getVectorNumElements() == (unsigned)NumOps &&
          "Incorrect element count in BUILD_VECTOR!");
 
   // BUILD_VECTOR of UNDEFs is UNDEF.
-- 
GitLab


From d19bba6122bae301c6522dbded9a6065b695c1f5 Mon Sep 17 00:00:00 2001
From: "Diogo N. Sampaio" <diogo.sampaio@arm.com>
Date: Tue, 30 Oct 2018 13:59:21 +0000
Subject: [PATCH 0744/1116] [FIX][AArch64] Add support for UDF instruction

Fix: Simplify test files from rL345581 failing
in windows bots.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345601 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/MC/Disassembler/AArch64/udf.txt | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/test/MC/Disassembler/AArch64/udf.txt b/test/MC/Disassembler/AArch64/udf.txt
index 6b3a252f198..445803ebe71 100644
--- a/test/MC/Disassembler/AArch64/udf.txt
+++ b/test/MC/Disassembler/AArch64/udf.txt
@@ -6,13 +6,11 @@
 [0x00,0x00,0x00,0x00]
 [0x01,0x02,0x00,0x00]
 [0xff,0xff,0x00,0x00]
-# CHECK: .text
-# CHECK-NEXT: udf #0
+
+# CHECK: udf #0
 # CHECK-NEXT: udf #513
 # CHECK-NEXT: udf #65535
 
-#OBJ: Disassembly of section .text:
-#OBJ-NEXT: $x.0:
-#OBJ-NEXT:        0:	00 00 00 00 	udf	#0
+#OBJ:             0:	00 00 00 00 	udf	#0
 #OBJ-NEXT:        4:	01 02 00 00 	udf	#513
 #OBJ-NEXT:        8:	ff ff 00 00 	udf	#65535
-- 
GitLab


From 895148a280193a472fad1431e626a003cd9cf83b Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 30 Oct 2018 14:14:34 +0000
Subject: [PATCH 0745/1116] [DAGCombiner] narrow vector binops when extraction
 is cheap

Narrowing vector binops came up in the demanded bits discussion in D52912.

I don't think we're going to be able to do this transform in IR as a canonicalization
because of the risk of creating unsupported widths for vector ops, but we already have
a DAG TLI hook to allow what I was hoping for: isExtractSubvectorCheap(). This is
currently enabled for x86, ARM, and AArch64 (although only x86 has existing regression
test diffs).

This is artificially limited to not look through bitcasts because there are so many
test diffs already, but that's marked with a TODO and is a small follow-up.

Differential Revision: https://reviews.llvm.org/D53784


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345602 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp     | 41 ++++++---
 test/CodeGen/X86/2012-04-26-sdglue.ll        | 20 ++---
 test/CodeGen/X86/avx-logic.ll                | 32 +++----
 test/CodeGen/X86/avx-vzeroupper.ll           | 12 +--
 test/CodeGen/X86/avx512-hadd-hsub.ll         | 56 ++++++------
 test/CodeGen/X86/avx512-insert-extract.ll    |  8 +-
 test/CodeGen/X86/avx512-skx-insert-subvec.ll |  4 +-
 test/CodeGen/X86/known-signbits-vector.ll    | 12 +--
 test/CodeGen/X86/madd.ll                     | 36 ++++----
 test/CodeGen/X86/min-legal-vector-width.ll   |  8 +-
 test/CodeGen/X86/sad.ll                      | 22 ++---
 test/CodeGen/X86/shrink_vmul.ll              |  4 +-
 test/CodeGen/X86/vec_int_to_fp.ll            | 36 +++-----
 test/CodeGen/X86/vector-compare-all_of.ll    | 16 ++--
 test/CodeGen/X86/vector-compare-any_of.ll    | 16 ++--
 test/CodeGen/X86/vector-reduce-add.ll        | 48 +++++------
 test/CodeGen/X86/vector-reduce-and.ll        | 72 ++++++++--------
 test/CodeGen/X86/vector-reduce-fadd-fast.ll  | 90 +++++++-------------
 test/CodeGen/X86/vector-reduce-fmul-fast.ll  | 90 +++++++-------------
 test/CodeGen/X86/vector-reduce-mul.ll        | 60 ++++++-------
 test/CodeGen/X86/vector-reduce-or.ll         | 72 ++++++++--------
 test/CodeGen/X86/vector-reduce-xor.ll        | 72 ++++++++--------
 test/CodeGen/X86/vector-rotate-256.ll        | 12 +--
 test/CodeGen/X86/vector-rotate-512.ll        | 16 ++--
 24 files changed, 387 insertions(+), 468 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 64c7dca0f6e..742ca02a03d 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16673,10 +16673,8 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
   return SDValue();
 }
 
-/// If we are extracting a subvector produced by a wide binary operator with at
-/// at least one operand that was the result of a vector concatenation, then try
-/// to use the narrow vector operands directly to avoid the concatenation and
-/// extraction.
+/// If we are extracting a subvector produced by a wide binary operator try
+/// to use a narrow binary operator and/or avoid concatenation and extraction.
 static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) {
   // TODO: Refactor with the caller (visitEXTRACT_SUBVECTOR), so we can share
   // some of these bailouts with other transforms.
@@ -16697,22 +16695,43 @@ static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) {
   if (!WideBVT.isVector())
     return SDValue();
 
+  EVT VT = Extract->getValueType(0);
+  unsigned NumElems = VT.getVectorNumElements();
+  unsigned ExtractIndex = ExtractIndexC->getZExtValue();
+  assert(ExtractIndex % NumElems == 0 &&
+         "Extract index is not a multiple of the vector length.");
+  EVT SrcVT = Extract->getOperand(0).getValueType();
+  unsigned NumSrcElems = SrcVT.getVectorNumElements();
+  unsigned NarrowingRatio = NumSrcElems / NumElems;
+
   // Bail out if the target does not support a narrower version of the binop.
   unsigned BOpcode = BinOp.getOpcode();
+  unsigned WideNumElts = WideBVT.getVectorNumElements();
   EVT NarrowBVT = EVT::getVectorVT(*DAG.getContext(), WideBVT.getScalarType(),
-                                   WideBVT.getVectorNumElements() / 2);
+                                   WideNumElts / NarrowingRatio);
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   if (!TLI.isOperationLegalOrCustomOrPromote(BOpcode, NarrowBVT))
     return SDValue();
 
+  // If extraction is cheap, we don't need to look at the binop operands
+  // for concat ops. The narrow binop alone makes this transform profitable.
+  // TODO: We're not dealing with the bitcasted pattern here. That limitation
+  // should be lifted.
+  if (Extract->getOperand(0) == BinOp && BinOp.hasOneUse() &&
+      TLI.isExtractSubvectorCheap(NarrowBVT, WideBVT, ExtractIndex)) {
+    // extract (binop B0, B1), N --> binop (extract B0, N), (extract B1, N)
+    SDLoc DL(Extract);
+    SDValue X = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,
+                            BinOp.getOperand(0), Extract->getOperand(1));
+    SDValue Y = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,
+                            BinOp.getOperand(1), Extract->getOperand(1));
+    return DAG.getNode(BOpcode, DL, NarrowBVT, X, Y,
+                       BinOp.getNode()->getFlags());
+  }
+
   // Only handle the case where we are doubling and then halving. A larger ratio
   // may require more than two narrow binops to replace the wide binop.
-  EVT VT = Extract->getValueType(0);
-  unsigned NumElems = VT.getVectorNumElements();
-  unsigned ExtractIndex = ExtractIndexC->getZExtValue();
-  assert(ExtractIndex % NumElems == 0 &&
-         "Extract index is not a multiple of the vector length.");
-  if (Extract->getOperand(0).getValueSizeInBits() != VT.getSizeInBits() * 2)
+  if (NarrowingRatio != 2)
     return SDValue();
 
   // TODO: The motivating case for this transform is an x86 AVX1 target. That
diff --git a/test/CodeGen/X86/2012-04-26-sdglue.ll b/test/CodeGen/X86/2012-04-26-sdglue.ll
index 6e5f48e4e0d..afa8bf44c20 100644
--- a/test/CodeGen/X86/2012-04-26-sdglue.ll
+++ b/test/CodeGen/X86/2012-04-26-sdglue.ll
@@ -7,23 +7,19 @@
 define void @func(<4 x float> %a, <16 x i8> %b, <16 x i8> %c, <8 x float> %d, <8 x float> %e, <8 x float>* %f) nounwind ssp {
 ; CHECK-LABEL: func:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vmovdqu 0, %xmm3
-; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm3, %ymm0
-; CHECK-NEXT:    vpalignr {{.*#+}} xmm1 = xmm3[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
-; CHECK-NEXT:    vmovdqu 32, %xmm3
-; CHECK-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3]
-; CHECK-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
-; CHECK-NEXT:    vmulps %ymm0, %ymm0, %ymm0
-; CHECK-NEXT:    vmulps %ymm1, %ymm1, %ymm1
-; CHECK-NEXT:    vaddps %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vaddps %ymm0, %ymm0, %ymm0
+; CHECK-NEXT:    vmovdqu 0, %xmm0
+; CHECK-NEXT:    vpalignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vmulps %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vmulps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vaddps %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vmulps %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
-; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    vaddps %ymm0, %ymm0, %ymm0
 ; CHECK-NEXT:    vhaddps %ymm4, %ymm0, %ymm0
 ; CHECK-NEXT:    vsubps %ymm0, %ymm0, %ymm0
-; CHECK-NEXT:    vhaddps %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    vhaddps %ymm0, %ymm2, %ymm0
 ; CHECK-NEXT:    vmovaps %ymm0, (%rdi)
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
diff --git a/test/CodeGen/X86/avx-logic.ll b/test/CodeGen/X86/avx-logic.ll
index f22c6257e45..44d0993b68d 100644
--- a/test/CodeGen/X86/avx-logic.ll
+++ b/test/CodeGen/X86/avx-logic.ll
@@ -338,17 +338,17 @@ define <8 x i32> @and_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z
 define <8 x i32> @andn_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z) {
 ; AVX1-LABEL: andn_disguised_i8_elts:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vpaddd %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,255,255]
 ; AVX1-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
 ; AVX1-NEXT:    vpandn %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; INT256-LABEL: andn_disguised_i8_elts:
@@ -417,17 +417,17 @@ define <8 x i32> @andn_constant_mask_operand_no_concat(<8 x i32> %x, <8 x i32> %
 define <8 x i32> @andn_variable_mask_operand_concat(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z, <8 x i32> %w) {
 ; AVX1-LABEL: andn_variable_mask_operand_concat:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT:    vpaddd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpandn %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm1
-; AVX1-NEXT:    vpandn %xmm1, %xmm4, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
-; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm1
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpandn %xmm2, %xmm4, %xmm1
+; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; INT256-LABEL: andn_variable_mask_operand_concat:
diff --git a/test/CodeGen/X86/avx-vzeroupper.ll b/test/CodeGen/X86/avx-vzeroupper.ll
index 26248bdae0a..3662e39a641 100644
--- a/test/CodeGen/X86/avx-vzeroupper.ll
+++ b/test/CodeGen/X86/avx-vzeroupper.ll
@@ -96,28 +96,24 @@ define <8 x float> @test01(<4 x float> %a, <4 x float> %b, <8 x float> %c) nounw
 define <4 x float> @test02(<8 x float> %a, <8 x float> %b) nounwind {
 ; VZ-LABEL: test02:
 ; VZ:       # %bb.0:
-; VZ-NEXT:    vaddps %ymm1, %ymm0, %ymm0
-; VZ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; VZ-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; VZ-NEXT:    vzeroupper
 ; VZ-NEXT:    jmp do_sse # TAILCALL
 ;
 ; FAST-ymm-zmm-LABEL: test02:
 ; FAST-ymm-zmm:       # %bb.0:
-; FAST-ymm-zmm-NEXT:    vaddps %ymm1, %ymm0, %ymm0
-; FAST-ymm-zmm-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; FAST-ymm-zmm-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; FAST-ymm-zmm-NEXT:    jmp do_sse # TAILCALL
 ;
 ; BDVER2-LABEL: test02:
 ; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
-; BDVER2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; BDVER2-NEXT:    vzeroupper
 ; BDVER2-NEXT:    jmp do_sse # TAILCALL
 ;
 ; BTVER2-LABEL: test02:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
-; BTVER2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; BTVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; BTVER2-NEXT:    jmp do_sse # TAILCALL
   %add.i = fadd <8 x float> %a, %b
   %add.low = call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %add.i, i8 0)
diff --git a/test/CodeGen/X86/avx512-hadd-hsub.ll b/test/CodeGen/X86/avx512-hadd-hsub.ll
index aed182179cf..00063521c6d 100644
--- a/test/CodeGen/X86/avx512-hadd-hsub.ll
+++ b/test/CodeGen/X86/avx512-hadd-hsub.ll
@@ -8,7 +8,7 @@ define i32 @hadd_16(<16 x i32> %x225) {
 ; KNL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; KNL-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; KNL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; KNL-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; KNL-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; KNL-NEXT:    vmovd %xmm0, %eax
 ; KNL-NEXT:    retq
 ;
@@ -17,7 +17,7 @@ define i32 @hadd_16(<16 x i32> %x225) {
 ; SKX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; SKX-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; SKX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SKX-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; SKX-NEXT:    vmovd %xmm0, %eax
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
@@ -35,7 +35,7 @@ define i32 @hsub_16(<16 x i32> %x225) {
 ; KNL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; KNL-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; KNL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; KNL-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
+; KNL-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
 ; KNL-NEXT:    vmovd %xmm0, %eax
 ; KNL-NEXT:    retq
 ;
@@ -44,7 +44,7 @@ define i32 @hsub_16(<16 x i32> %x225) {
 ; SKX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; SKX-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; SKX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SKX-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
+; SKX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
 ; SKX-NEXT:    vmovd %xmm0, %eax
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
@@ -62,8 +62,7 @@ define float @fhadd_16(<16 x float> %x225) {
 ; KNL-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; KNL-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; KNL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; KNL-NEXT:    vaddps %zmm1, %zmm0, %zmm0
-; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; KNL-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: fhadd_16:
@@ -71,8 +70,7 @@ define float @fhadd_16(<16 x float> %x225) {
 ; SKX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; SKX-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; SKX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SKX-NEXT:    vaddps %zmm1, %zmm0, %zmm0
-; SKX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -89,8 +87,7 @@ define float @fhsub_16(<16 x float> %x225) {
 ; KNL-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; KNL-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; KNL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; KNL-NEXT:    vsubps %zmm1, %zmm0, %zmm0
-; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; KNL-NEXT:    vsubps %xmm1, %xmm0, %xmm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: fhsub_16:
@@ -98,8 +95,7 @@ define float @fhsub_16(<16 x float> %x225) {
 ; SKX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; SKX-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; SKX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SKX-NEXT:    vsubps %zmm1, %zmm0, %zmm0
-; SKX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; SKX-NEXT:    vsubps %xmm1, %xmm0, %xmm0
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -180,16 +176,14 @@ define <4 x double> @fadd_noundef_low(<8 x double> %x225, <8 x double> %x227) {
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; KNL-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; KNL-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
-; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; KNL-NEXT:    vaddpd %ymm0, %ymm2, %ymm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: fadd_noundef_low:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; SKX-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
-; SKX-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; SKX-NEXT:    vaddpd %ymm0, %ymm2, %ymm0
 ; SKX-NEXT:    retq
   %x226 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   %x228 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5 ,i32 13, i32 7, i32 15>
@@ -203,16 +197,18 @@ define <4 x double> @fadd_noundef_high(<8 x double> %x225, <8 x double> %x227) {
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
 ; KNL-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
-; KNL-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
 ; KNL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; KNL-NEXT:    vextractf64x4 $1, %zmm2, %ymm1
+; KNL-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: fadd_noundef_high:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
-; SKX-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
 ; SKX-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; SKX-NEXT:    vextractf64x4 $1, %zmm2, %ymm1
+; SKX-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
 ; SKX-NEXT:    retq
   %x226 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   %x228 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5 ,i32 13, i32 7, i32 15>
@@ -227,16 +223,14 @@ define <8 x i32> @hadd_16_3_sv(<16 x i32> %x225, <16 x i32> %x227) {
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
 ; KNL-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; KNL-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
-; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; KNL-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: hadd_16_3_sv:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
 ; SKX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; SKX-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
-; SKX-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; SKX-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
 ; SKX-NEXT:    retq
   %x226 = shufflevector <16 x i32> %x225, <16 x i32> %x227, <16 x i32> <i32 0, i32 2, i32 16, i32 18
 , i32 4, i32 6, i32 20, i32 22, i32 8, i32 10, i32 24, i32 26, i32 12, i32 14, i32 28, i32 30>
@@ -253,15 +247,13 @@ define double @fadd_noundef_eel(<8 x double> %x225, <8 x double> %x227) {
 ; KNL-LABEL: fadd_noundef_eel:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; KNL-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
-; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; KNL-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: fadd_noundef_eel:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; SKX-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
-; SKX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %x226 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -277,18 +269,18 @@ define double @fsub_noundef_ee (<8 x double> %x225, <8 x double> %x227) {
 ; KNL-LABEL: fsub_noundef_ee:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vextractf32x4 $2, %zmm1, %xmm0
-; KNL-NEXT:    vbroadcastsd %xmm0, %zmm0
-; KNL-NEXT:    vsubpd %zmm1, %zmm0, %zmm0
-; KNL-NEXT:    vextractf32x4 $2, %zmm0, %xmm0
+; KNL-NEXT:    vbroadcastsd %xmm0, %zmm1
+; KNL-NEXT:    vextractf32x4 $2, %zmm1, %xmm1
+; KNL-NEXT:    vsubpd %xmm0, %xmm1, %xmm0
 ; KNL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: fsub_noundef_ee:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vextractf32x4 $2, %zmm1, %xmm0
-; SKX-NEXT:    vbroadcastsd %xmm0, %zmm0
-; SKX-NEXT:    vsubpd %zmm1, %zmm0, %zmm0
-; SKX-NEXT:    vextractf32x4 $2, %zmm0, %xmm0
+; SKX-NEXT:    vbroadcastsd %xmm0, %zmm1
+; SKX-NEXT:    vextractf32x4 $2, %zmm1, %xmm1
+; SKX-NEXT:    vsubpd %xmm0, %xmm1, %xmm0
 ; SKX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll
index e29d62b2605..6944d3ea27b 100644
--- a/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/test/CodeGen/X86/avx512-insert-extract.ll
@@ -851,7 +851,7 @@ define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y)
 ; SKX-NEXT:    kxorb %k2, %k1, %k1
 ; SKX-NEXT:    kshiftlb $7, %k1, %k1
 ; SKX-NEXT:    kshiftrb $5, %k1, %k1
-; SKX-NEXT:    kxorb %k1, %k0, %k0
+; SKX-NEXT:    kxorw %k1, %k0, %k0
 ; SKX-NEXT:    kmovd %k0, %eax
 ; SKX-NEXT:    ## kill: def $al killed $al killed $eax
 ; SKX-NEXT:    retq
@@ -890,7 +890,7 @@ define i8 @test_iinsertelement_v2i1(i32 %a, i32 %b, <2 x i64> %x , <2 x i64> %y)
 ; SKX-NEXT:    kshiftrb $7, %k0, %k0
 ; SKX-NEXT:    kmovd %eax, %k1
 ; SKX-NEXT:    kshiftlb $1, %k1, %k1
-; SKX-NEXT:    korb %k1, %k0, %k0
+; SKX-NEXT:    korw %k1, %k0, %k0
 ; SKX-NEXT:    kmovd %k0, %eax
 ; SKX-NEXT:    ## kill: def $al killed $al killed $eax
 ; SKX-NEXT:    retq
@@ -1019,8 +1019,8 @@ define zeroext i8 @test_extractelement_v64i1(<64 x i8> %a, <64 x i8> %b) {
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    vpminub %ymm3, %ymm1, %ymm0
 ; KNL-NEXT:    vpcmpeqb %ymm0, %ymm1, %ymm0
-; KNL-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
@@ -1054,8 +1054,8 @@ define zeroext i8 @extractelement_v64i1_alt(<64 x i8> %a, <64 x i8> %b) {
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    vpminub %ymm3, %ymm1, %ymm0
 ; KNL-NEXT:    vpcmpeqb %ymm0, %ymm1, %ymm0
-; KNL-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
diff --git a/test/CodeGen/X86/avx512-skx-insert-subvec.ll b/test/CodeGen/X86/avx512-skx-insert-subvec.ll
index d9dc5087f54..5338eb3c3a1 100644
--- a/test/CodeGen/X86/avx512-skx-insert-subvec.ll
+++ b/test/CodeGen/X86/avx512-skx-insert-subvec.ll
@@ -73,7 +73,7 @@ define <4 x i1> @test5(<2 x i1> %a, <2 x i1>%b) {
 ; CHECK-NEXT:    vpsllq $63, %xmm0, %xmm0
 ; CHECK-NEXT:    vpmovq2m %xmm0, %k1
 ; CHECK-NEXT:    kshiftlb $2, %k0, %k0
-; CHECK-NEXT:    korb %k0, %k1, %k0
+; CHECK-NEXT:    korw %k0, %k1, %k0
 ; CHECK-NEXT:    vpmovm2d %k0, %xmm0
 ; CHECK-NEXT:    retq
 
@@ -89,7 +89,7 @@ define <16 x i1> @test6(<2 x i1> %a, <2 x i1>%b) {
 ; CHECK-NEXT:    vpsllq $63, %xmm0, %xmm0
 ; CHECK-NEXT:    vpmovq2m %xmm0, %k1
 ; CHECK-NEXT:    kshiftlb $2, %k0, %k0
-; CHECK-NEXT:    korb %k0, %k1, %k0
+; CHECK-NEXT:    korw %k0, %k1, %k0
 ; CHECK-NEXT:    vpmovm2b %k0, %xmm0
 ; CHECK-NEXT:    retq
 
diff --git a/test/CodeGen/X86/known-signbits-vector.ll b/test/CodeGen/X86/known-signbits-vector.ll
index 679e068b965..169342a3da5 100644
--- a/test/CodeGen/X86/known-signbits-vector.ll
+++ b/test/CodeGen/X86/known-signbits-vector.ll
@@ -240,21 +240,13 @@ define <4 x double> @signbits_sext_shuffle_sitofp(<4 x i32> %a0, <4 x i64> %a1)
 define <2 x double> @signbits_ashr_concat_ashr_extract_sitofp(<2 x i64> %a0, <4 x i64> %a1) nounwind {
 ; X32-LABEL: signbits_ashr_concat_ashr_extract_sitofp:
 ; X32:       # %bb.0:
-; X32-NEXT:    vpsrad $16, %xmm0, %xmm1
-; X32-NEXT:    vpsrlq $16, %xmm0, %xmm0
-; X32-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; X32-NEXT:    vpsrlq $16, %xmm0, %xmm0
-; X32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X32-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
 ; X32-NEXT:    vcvtdq2pd %xmm0, %xmm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: signbits_ashr_concat_ashr_extract_sitofp:
 ; X64:       # %bb.0:
-; X64-NEXT:    vpsrad $16, %xmm0, %xmm1
-; X64-NEXT:    vpsrlq $16, %xmm0, %xmm0
-; X64-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; X64-NEXT:    vpsrlq $16, %xmm0, %xmm0
-; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
 ; X64-NEXT:    vcvtdq2pd %xmm0, %xmm0
 ; X64-NEXT:    retq
   %1 = ashr <2 x i64> %a0, <i64 16, i64 16>
diff --git a/test/CodeGen/X86/madd.ll b/test/CodeGen/X86/madd.ll
index ef91981e701..92e5424253f 100644
--- a/test/CodeGen/X86/madd.ll
+++ b/test/CodeGen/X86/madd.ll
@@ -156,7 +156,7 @@ define i32 @_Z10test_shortPsS_i_256(i16* nocapture readonly, i16* nocapture read
 ; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX256-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX256-NEXT:    vmovd %xmm0, %eax
 ; AVX256-NEXT:    vzeroupper
 ; AVX256-NEXT:    retq
@@ -283,7 +283,7 @@ define i32 @_Z10test_shortPsS_i_512(i16* nocapture readonly, i16* nocapture read
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -310,7 +310,7 @@ define i32 @_Z10test_shortPsS_i_512(i16* nocapture readonly, i16* nocapture read
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -476,7 +476,7 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -508,7 +508,7 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea
 ; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vmovd %xmm0, %eax
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -537,7 +537,7 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea
 ; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vmovd %xmm0, %eax
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -739,7 +739,7 @@ define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly
 ; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX256-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX256-NEXT:    vmovd %xmm0, %eax
 ; AVX256-NEXT:    vzeroupper
 ; AVX256-NEXT:    retq
@@ -875,7 +875,7 @@ define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -903,7 +903,7 @@ define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -1087,7 +1087,7 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -1121,7 +1121,7 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl
 ; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vmovd %xmm0, %eax
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -1151,7 +1151,7 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl
 ; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vmovd %xmm0, %eax
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -1355,7 +1355,7 @@ define i32 @test_unsigned_short_256(i16* nocapture readonly, i16* nocapture read
 ; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX256-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX256-NEXT:    vmovd %xmm0, %eax
 ; AVX256-NEXT:    vzeroupper
 ; AVX256-NEXT:    retq
@@ -1510,7 +1510,7 @@ define i32 @test_unsigned_short_512(i16* nocapture readonly, i16* nocapture read
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -1538,7 +1538,7 @@ define i32 @test_unsigned_short_512(i16* nocapture readonly, i16* nocapture read
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -1763,7 +1763,7 @@ define i32 @test_unsigned_short_1024(i16* nocapture readonly, i16* nocapture rea
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -1797,7 +1797,7 @@ define i32 @test_unsigned_short_1024(i16* nocapture readonly, i16* nocapture rea
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -2730,7 +2730,7 @@ define i32 @madd_double_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>*
 ; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX256-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX256-NEXT:    vmovd %xmm0, %eax
 ; AVX256-NEXT:    vzeroupper
 ; AVX256-NEXT:    retq
diff --git a/test/CodeGen/X86/min-legal-vector-width.ll b/test/CodeGen/X86/min-legal-vector-width.ll
index 5e5d74defe4..9fc12e6a094 100644
--- a/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/test/CodeGen/X86/min-legal-vector-width.ll
@@ -191,7 +191,7 @@ define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovd %xmm0, %eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -257,7 +257,7 @@ define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovd %xmm0, %eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -321,7 +321,7 @@ define i32 @sad_16i8_256() "min-legal-vector-width"="256" {
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovd %xmm0, %eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -383,7 +383,7 @@ define i32 @sad_16i8_512() "min-legal-vector-width"="512" {
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovd %xmm0, %eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
diff --git a/test/CodeGen/X86/sad.ll b/test/CodeGen/X86/sad.ll
index d7d1511d19d..51ac1d5caea 100644
--- a/test/CodeGen/X86/sad.ll
+++ b/test/CodeGen/X86/sad.ll
@@ -82,7 +82,7 @@ define i32 @sad_16i8() nounwind {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -107,7 +107,7 @@ define i32 @sad_16i8() nounwind {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -347,7 +347,7 @@ define i32 @sad_32i8() nounwind {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -374,7 +374,7 @@ define i32 @sad_32i8() nounwind {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -941,7 +941,7 @@ define i32 @sad_avx64i8() nounwind {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -989,7 +989,7 @@ define i32 @sad_avx64i8() nounwind {
 ; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vmovd %xmm0, %eax
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -1018,7 +1018,7 @@ define i32 @sad_avx64i8() nounwind {
 ; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vmovd %xmm0, %eax
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -1456,7 +1456,7 @@ define i32 @sad_unroll_nonzero_initial(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -1478,7 +1478,7 @@ define i32 @sad_unroll_nonzero_initial(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -1558,7 +1558,7 @@ define i32 @sad_double_reduction(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x i8>* %
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -1577,7 +1577,7 @@ define i32 @sad_double_reduction(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x i8>* %
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
diff --git a/test/CodeGen/X86/shrink_vmul.ll b/test/CodeGen/X86/shrink_vmul.ll
index 018aee6ad06..729fb2f567e 100644
--- a/test/CodeGen/X86/shrink_vmul.ll
+++ b/test/CodeGen/X86/shrink_vmul.ll
@@ -2475,7 +2475,7 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) {
 ; X86-AVX2-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
 ; X86-AVX2-NEXT:    movl $8199, %eax # imm = 0x2007
 ; X86-AVX2-NEXT:    vmovd %eax, %xmm2
-; X86-AVX2-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpmulld %xmm2, %xmm0, %xmm0
 ; X86-AVX2-NEXT:    vmovd %xmm0, (%eax)
 ; X86-AVX2-NEXT:    vmovdqa %ymm1, (%eax)
 ; X86-AVX2-NEXT:    popl %esi
@@ -2723,7 +2723,7 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) {
 ; X64-AVX2-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
 ; X64-AVX2-NEXT:    movl $8199, %eax # imm = 0x2007
 ; X64-AVX2-NEXT:    vmovd %eax, %xmm2
-; X64-AVX2-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpmulld %xmm2, %xmm0, %xmm0
 ; X64-AVX2-NEXT:    vmovd %xmm0, (%rax)
 ; X64-AVX2-NEXT:    vmovdqa %ymm1, (%rax)
 ; X64-AVX2-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll
index af68937326c..f28fca7b75b 100644
--- a/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/test/CodeGen/X86/vec_int_to_fp.ll
@@ -663,32 +663,16 @@ define <2 x double> @uitofp_4i32_to_2f64(<4 x i32> %a) {
 ; SSE41-NEXT:    addpd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX1-LABEL: uitofp_4i32_to_2f64:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX1-NEXT:    vcvtdq2pd %xmm1, %ymm1
-; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
-; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
-; AVX1-NEXT:    vmulpd {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: uitofp_4i32_to_2f64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vcvtdq2pd %xmm1, %ymm1
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4]
-; AVX2-NEXT:    vmulpd %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
-; AVX2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
+; VEX-LABEL: uitofp_4i32_to_2f64:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; VEX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; VEX-NEXT:    vpsrld $16, %xmm0, %xmm0
+; VEX-NEXT:    vcvtdq2pd %xmm1, %xmm1
+; VEX-NEXT:    vcvtdq2pd %xmm0, %xmm0
+; VEX-NEXT:    vmulpd {{.*}}(%rip), %xmm0, %xmm0
+; VEX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; VEX-NEXT:    retq
 ;
 ; AVX512F-LABEL: uitofp_4i32_to_2f64:
 ; AVX512F:       # %bb.0:
diff --git a/test/CodeGen/X86/vector-compare-all_of.ll b/test/CodeGen/X86/vector-compare-all_of.ll
index 1974ad5facd..fe74d07512b 100644
--- a/test/CodeGen/X86/vector-compare-all_of.ll
+++ b/test/CodeGen/X86/vector-compare-all_of.ll
@@ -64,7 +64,7 @@ define i64 @test_v4f64_sext(<4 x double> %a0, <4 x double> %a1) {
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vandpd %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vandpd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vandpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -200,7 +200,7 @@ define i32 @test_v8f32_sext(<8 x float> %a0, <8 x float> %a1) {
 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vandps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -344,7 +344,7 @@ define i64 @test_v4i64_sext(<4 x i64> %a0, <4 x i64> %a1) {
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -510,7 +510,7 @@ define i32 @test_v8i32_sext(<8 x i32> %a0, <8 x i32> %a1) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -668,7 +668,7 @@ define i16 @test_v16i16_sext(<16 x i16> %a0, <16 x i16> %a1) {
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -695,7 +695,7 @@ define i16 @test_v16i16_sext(<16 x i16> %a0, <16 x i16> %a1) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -870,7 +870,7 @@ define i8 @test_v32i8_sext(<32 x i8> %a0, <32 x i8> %a1) {
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -899,7 +899,7 @@ define i8 @test_v32i8_sext(<32 x i8> %a0, <32 x i8> %a1) {
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/vector-compare-any_of.ll b/test/CodeGen/X86/vector-compare-any_of.ll
index 92c2d0b5841..b7fa5cb64df 100644
--- a/test/CodeGen/X86/vector-compare-any_of.ll
+++ b/test/CodeGen/X86/vector-compare-any_of.ll
@@ -62,7 +62,7 @@ define i64 @test_v4f64_sext(<4 x double> %a0, <4 x double> %a1) {
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vorpd %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vorpd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vorpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -188,7 +188,7 @@ define i32 @test_v8f32_sext(<8 x float> %a0, <8 x float> %a1) {
 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vorps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -324,7 +324,7 @@ define i64 @test_v4i64_sext(<4 x i64> %a0, <4 x i64> %a1) {
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -477,7 +477,7 @@ define i32 @test_v8i32_sext(<8 x i32> %a0, <8 x i32> %a1) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -623,7 +623,7 @@ define i16 @test_v16i16_sext(<16 x i16> %a0, <16 x i16> %a1) {
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -649,7 +649,7 @@ define i16 @test_v16i16_sext(<16 x i16> %a0, <16 x i16> %a1) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -812,7 +812,7 @@ define i8 @test_v32i8_sext(<32 x i8> %a0, <32 x i8> %a1) {
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -840,7 +840,7 @@ define i8 @test_v32i8_sext(<32 x i8> %a0, <32 x i8> %a1) {
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/vector-reduce-add.ll b/test/CodeGen/X86/vector-reduce-add.ll
index 21c10c97f49..e0f6f194f50 100644
--- a/test/CodeGen/X86/vector-reduce-add.ll
+++ b/test/CodeGen/X86/vector-reduce-add.ll
@@ -59,7 +59,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, %rax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -69,7 +69,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -107,7 +107,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, %rax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -119,7 +119,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -169,7 +169,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, %rax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -182,7 +182,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -255,7 +255,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -267,7 +267,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -311,7 +311,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -325,7 +325,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -381,7 +381,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -396,7 +396,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -488,7 +488,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -503,7 +503,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -557,7 +557,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -574,7 +574,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -640,7 +640,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -658,7 +658,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -798,7 +798,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -815,7 +815,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX512-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -895,7 +895,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -914,7 +914,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX512-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -1010,7 +1010,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -1030,7 +1030,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX512-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/vector-reduce-and.ll b/test/CodeGen/X86/vector-reduce-and.ll
index 560bceb2d05..305464e3707 100644
--- a/test/CodeGen/X86/vector-reduce-and.ll
+++ b/test/CodeGen/X86/vector-reduce-and.ll
@@ -49,7 +49,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovq %xmm0, %rax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -59,7 +59,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, %rax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -69,7 +69,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -94,7 +94,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovq %xmm0, %rax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -105,7 +105,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, %rax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -117,7 +117,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -148,7 +148,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovq %xmm0, %rax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -161,7 +161,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, %rax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -174,7 +174,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -235,7 +235,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -247,7 +247,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -259,7 +259,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -288,7 +288,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -301,7 +301,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -315,7 +315,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -350,7 +350,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -365,7 +365,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -380,7 +380,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -457,7 +457,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -472,7 +472,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -487,7 +487,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -523,7 +523,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -539,7 +539,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -556,7 +556,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -598,7 +598,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -616,7 +616,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -634,7 +634,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -757,7 +757,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -774,7 +774,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -791,7 +791,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -851,7 +851,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -869,7 +869,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -888,7 +888,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -958,7 +958,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -978,7 +978,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -998,7 +998,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/vector-reduce-fadd-fast.ll b/test/CodeGen/X86/vector-reduce-fadd-fast.ll
index 281c4f28d99..9dadf969a0e 100644
--- a/test/CodeGen/X86/vector-reduce-fadd-fast.ll
+++ b/test/CodeGen/X86/vector-reduce-fadd-fast.ll
@@ -107,8 +107,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -119,8 +118,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512-NEXT:    vaddps %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float %a0, <8 x float> %a1)
@@ -161,8 +159,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) {
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -175,8 +172,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) {
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float %a0, <16 x float> %a1)
@@ -287,8 +283,7 @@ define float @test_v8f32_zero(<8 x float> %a0) {
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -299,8 +294,7 @@ define float @test_v8f32_zero(<8 x float> %a0) {
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512-NEXT:    vaddps %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float 0.0, <8 x float> %a0)
@@ -342,8 +336,7 @@ define float @test_v16f32_zero(<16 x float> %a0) {
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -356,8 +349,7 @@ define float @test_v16f32_zero(<16 x float> %a0) {
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float 0.0, <16 x float> %a0)
@@ -468,8 +460,7 @@ define float @test_v8f32_undef(<8 x float> %a0) {
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -480,8 +471,7 @@ define float @test_v8f32_undef(<8 x float> %a0) {
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512-NEXT:    vaddps %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float undef, <8 x float> %a0)
@@ -523,8 +513,7 @@ define float @test_v16f32_undef(<16 x float> %a0) {
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -537,8 +526,7 @@ define float @test_v16f32_undef(<16 x float> %a0) {
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float undef, <16 x float> %a0)
@@ -586,8 +574,7 @@ define double @test_v4f64(double %a0, <4 x double> %a1) {
 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm0
 ; AVX-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -596,8 +583,7 @@ define double @test_v4f64(double %a0, <4 x double> %a1) {
 ; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm0
 ; AVX512-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double %a0, <4 x double> %a1)
@@ -621,8 +607,7 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -633,8 +618,7 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double %a0, <8 x double> %a1)
@@ -664,8 +648,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -677,8 +660,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double %a0, <16 x double> %a1)
@@ -728,8 +710,7 @@ define double @test_v4f64_zero(<4 x double> %a0) {
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -738,8 +719,7 @@ define double @test_v4f64_zero(<4 x double> %a0) {
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double 0.0, <4 x double> %a0)
@@ -764,8 +744,7 @@ define double @test_v8f64_zero(<8 x double> %a0) {
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -776,8 +755,7 @@ define double @test_v8f64_zero(<8 x double> %a0) {
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double 0.0, <8 x double> %a0)
@@ -807,8 +785,7 @@ define double @test_v16f64_zero(<16 x double> %a0) {
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -820,8 +797,7 @@ define double @test_v16f64_zero(<16 x double> %a0) {
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double 0.0, <16 x double> %a0)
@@ -871,8 +847,7 @@ define double @test_v4f64_undef(<4 x double> %a0) {
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -881,8 +856,7 @@ define double @test_v4f64_undef(<4 x double> %a0) {
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double undef, <4 x double> %a0)
@@ -907,8 +881,7 @@ define double @test_v8f64_undef(<8 x double> %a0) {
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -919,8 +892,7 @@ define double @test_v8f64_undef(<8 x double> %a0) {
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double undef, <8 x double> %a0)
@@ -950,8 +922,7 @@ define double @test_v16f64_undef(<16 x double> %a0) {
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -963,8 +934,7 @@ define double @test_v16f64_undef(<16 x double> %a0) {
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double undef, <16 x double> %a0)
diff --git a/test/CodeGen/X86/vector-reduce-fmul-fast.ll b/test/CodeGen/X86/vector-reduce-fmul-fast.ll
index 4c093562cb5..efacbf1e3b4 100644
--- a/test/CodeGen/X86/vector-reduce-fmul-fast.ll
+++ b/test/CodeGen/X86/vector-reduce-fmul-fast.ll
@@ -107,8 +107,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -119,8 +118,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vmulps %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float %a0, <8 x float> %a1)
@@ -161,8 +159,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) {
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -175,8 +172,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) {
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float %a0, <16 x float> %a1)
@@ -287,8 +283,7 @@ define float @test_v8f32_zero(<8 x float> %a0) {
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -299,8 +294,7 @@ define float @test_v8f32_zero(<8 x float> %a0) {
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vmulps %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float 1.0, <8 x float> %a0)
@@ -342,8 +336,7 @@ define float @test_v16f32_zero(<16 x float> %a0) {
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -356,8 +349,7 @@ define float @test_v16f32_zero(<16 x float> %a0) {
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float 1.0, <16 x float> %a0)
@@ -468,8 +460,7 @@ define float @test_v8f32_undef(<8 x float> %a0) {
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -480,8 +471,7 @@ define float @test_v8f32_undef(<8 x float> %a0) {
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vmulps %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float undef, <8 x float> %a0)
@@ -523,8 +513,7 @@ define float @test_v16f32_undef(<16 x float> %a0) {
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -537,8 +526,7 @@ define float @test_v16f32_undef(<16 x float> %a0) {
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float undef, <16 x float> %a0)
@@ -586,8 +574,7 @@ define double @test_v4f64(double %a0, <4 x double> %a1) {
 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm0
 ; AVX-NEXT:    vmulpd %ymm0, %ymm1, %ymm0
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -596,8 +583,7 @@ define double @test_v4f64(double %a0, <4 x double> %a1) {
 ; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm0
 ; AVX512-NEXT:    vmulpd %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double %a0, <4 x double> %a1)
@@ -621,8 +607,7 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -633,8 +618,7 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double %a0, <8 x double> %a1)
@@ -664,8 +648,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -677,8 +660,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double %a0, <16 x double> %a1)
@@ -728,8 +710,7 @@ define double @test_v4f64_zero(<4 x double> %a0) {
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -738,8 +719,7 @@ define double @test_v4f64_zero(<4 x double> %a0) {
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double 1.0, <4 x double> %a0)
@@ -764,8 +744,7 @@ define double @test_v8f64_zero(<8 x double> %a0) {
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -776,8 +755,7 @@ define double @test_v8f64_zero(<8 x double> %a0) {
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double 1.0, <8 x double> %a0)
@@ -807,8 +785,7 @@ define double @test_v16f64_zero(<16 x double> %a0) {
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -820,8 +797,7 @@ define double @test_v16f64_zero(<16 x double> %a0) {
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double 1.0, <16 x double> %a0)
@@ -871,8 +847,7 @@ define double @test_v4f64_undef(<4 x double> %a0) {
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -881,8 +856,7 @@ define double @test_v4f64_undef(<4 x double> %a0) {
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double undef, <4 x double> %a0)
@@ -907,8 +881,7 @@ define double @test_v8f64_undef(<8 x double> %a0) {
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -919,8 +892,7 @@ define double @test_v8f64_undef(<8 x double> %a0) {
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double undef, <8 x double> %a0)
@@ -950,8 +922,7 @@ define double @test_v16f64_undef(<16 x double> %a0) {
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -963,8 +934,7 @@ define double @test_v16f64_undef(<16 x double> %a0) {
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double undef, <16 x double> %a0)
diff --git a/test/CodeGen/X86/vector-reduce-mul.ll b/test/CodeGen/X86/vector-reduce-mul.ll
index 210c076d2a6..58d712c35aa 100644
--- a/test/CodeGen/X86/vector-reduce-mul.ll
+++ b/test/CodeGen/X86/vector-reduce-mul.ll
@@ -160,7 +160,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
 ; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, %rax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -184,7 +184,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX512BW-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
 ; AVX512BW-NEXT:    vpsllq $32, %ymm2, %ymm2
 ; AVX512BW-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vmovq %xmm0, %rax
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -208,7 +208,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX512BWVL-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
 ; AVX512BWVL-NEXT:    vpsllq $32, %ymm2, %ymm2
 ; AVX512BWVL-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    vmovq %xmm0, %rax
 ; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
@@ -229,7 +229,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512DQVL-NEXT:    vpmullq %ymm1, %ymm0, %ymm0
 ; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQVL-NEXT:    vpmullq %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpmullq %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    vmovq %xmm0, %rax
 ; AVX512DQVL-NEXT:    vzeroupper
 ; AVX512DQVL-NEXT:    retq
@@ -352,7 +352,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
 ; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, %rax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -385,7 +385,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX512BW-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsllq $32, %zmm2, %zmm2
 ; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vmovq %xmm0, %rax
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -418,7 +418,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
 ; AVX512BWVL-NEXT:    vpsllq $32, %zmm2, %zmm2
 ; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    vmovq %xmm0, %rax
 ; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
@@ -442,7 +442,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512DQVL-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
 ; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQVL-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT:    vpmullq %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    vmovq %xmm0, %rax
 ; AVX512DQVL-NEXT:    vzeroupper
 ; AVX512DQVL-NEXT:    retq
@@ -655,7 +655,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
 ; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, %rax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -696,7 +696,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX512BW-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsllq $32, %zmm2, %zmm2
 ; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vmovq %xmm0, %rax
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -737,7 +737,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
 ; AVX512BWVL-NEXT:    vpsllq $32, %zmm2, %zmm2
 ; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    vmovq %xmm0, %rax
 ; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
@@ -763,7 +763,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512DQVL-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
 ; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQVL-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT:    vpmullq %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    vmovq %xmm0, %rax
 ; AVX512DQVL-NEXT:    vzeroupper
 ; AVX512DQVL-NEXT:    retq
@@ -872,7 +872,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -884,7 +884,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -955,7 +955,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -969,7 +969,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -1064,7 +1064,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -1079,7 +1079,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -1171,7 +1171,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -1186,7 +1186,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -1240,7 +1240,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -1257,7 +1257,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vmovd %xmm0, %eax
 ; AVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512BW-NEXT:    vzeroupper
@@ -1274,7 +1274,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 ; AVX512BWVL-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    vmovd %xmm0, %eax
 ; AVX512BWVL-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512BWVL-NEXT:    vzeroupper
@@ -1290,7 +1290,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512DQ-NEXT:    vmovd %xmm0, %eax
 ; AVX512DQ-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512DQ-NEXT:    vzeroupper
@@ -1306,7 +1306,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 ; AVX512DQVL-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    vmovd %xmm0, %eax
 ; AVX512DQVL-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512DQVL-NEXT:    vzeroupper
@@ -1372,7 +1372,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -1390,7 +1390,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vmovd %xmm0, %eax
 ; AVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512BW-NEXT:    vzeroupper
@@ -1408,7 +1408,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 ; AVX512BWVL-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    vmovd %xmm0, %eax
 ; AVX512BWVL-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512BWVL-NEXT:    vzeroupper
@@ -1426,7 +1426,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512DQ-NEXT:    vmovd %xmm0, %eax
 ; AVX512DQ-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512DQ-NEXT:    vzeroupper
@@ -1444,7 +1444,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 ; AVX512DQVL-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    vmovd %xmm0, %eax
 ; AVX512DQVL-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512DQVL-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/vector-reduce-or.ll b/test/CodeGen/X86/vector-reduce-or.ll
index 169394040bf..1b67c94e4ec 100644
--- a/test/CodeGen/X86/vector-reduce-or.ll
+++ b/test/CodeGen/X86/vector-reduce-or.ll
@@ -49,7 +49,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovq %xmm0, %rax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -59,7 +59,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, %rax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -69,7 +69,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -94,7 +94,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovq %xmm0, %rax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -105,7 +105,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, %rax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -117,7 +117,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -148,7 +148,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovq %xmm0, %rax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -161,7 +161,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, %rax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -174,7 +174,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -235,7 +235,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -247,7 +247,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -259,7 +259,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -288,7 +288,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -301,7 +301,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -315,7 +315,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -350,7 +350,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -365,7 +365,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -380,7 +380,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -457,7 +457,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -472,7 +472,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -487,7 +487,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -523,7 +523,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -539,7 +539,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -556,7 +556,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -598,7 +598,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -616,7 +616,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -634,7 +634,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -757,7 +757,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -774,7 +774,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -791,7 +791,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -851,7 +851,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -869,7 +869,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -888,7 +888,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -958,7 +958,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -978,7 +978,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -998,7 +998,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/vector-reduce-xor.ll b/test/CodeGen/X86/vector-reduce-xor.ll
index d1bf3e99c2c..0192ff3c923 100644
--- a/test/CodeGen/X86/vector-reduce-xor.ll
+++ b/test/CodeGen/X86/vector-reduce-xor.ll
@@ -49,7 +49,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vxorps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovq %xmm0, %rax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -59,7 +59,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, %rax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -69,7 +69,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -94,7 +94,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vxorps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovq %xmm0, %rax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -105,7 +105,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, %rax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -117,7 +117,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -148,7 +148,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vxorps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovq %xmm0, %rax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -161,7 +161,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, %rax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -174,7 +174,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -235,7 +235,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vxorps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -247,7 +247,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -259,7 +259,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -288,7 +288,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vxorps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -301,7 +301,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -315,7 +315,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpxord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpxord %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -350,7 +350,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vxorps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -365,7 +365,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -380,7 +380,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpxord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpxord %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -457,7 +457,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -472,7 +472,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -487,7 +487,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -523,7 +523,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -539,7 +539,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -556,7 +556,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -598,7 +598,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -616,7 +616,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -634,7 +634,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -757,7 +757,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -774,7 +774,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -791,7 +791,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -851,7 +851,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -869,7 +869,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -888,7 +888,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -958,7 +958,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -978,7 +978,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -998,7 +998,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/vector-rotate-256.ll b/test/CodeGen/X86/vector-rotate-256.ll
index dd5a5b01395..75fe0b322ca 100644
--- a/test/CodeGen/X86/vector-rotate-256.ll
+++ b/test/CodeGen/X86/vector-rotate-256.ll
@@ -690,8 +690,8 @@ define <16 x i16> @splatvar_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind
 ; AVX2-NEXT:    vpbroadcastw %xmm1, %ymm2
 ; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
 ; AVX2-NEXT:    vpsllw %xmm1, %ymm0, %ymm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX2-NEXT:    vpsubw %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpsubw %xmm2, %xmm3, %xmm2
 ; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX2-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
@@ -702,8 +702,8 @@ define <16 x i16> @splatvar_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind
 ; AVX512F-NEXT:    vpbroadcastw %xmm1, %ymm2
 ; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
 ; AVX512F-NEXT:    vpsllw %xmm1, %ymm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512F-NEXT:    vpsubw %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512F-NEXT:    vpsubw %xmm2, %xmm3, %xmm2
 ; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX512F-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpor %ymm0, %ymm1, %ymm0
@@ -714,8 +714,8 @@ define <16 x i16> @splatvar_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind
 ; AVX512VL-NEXT:    vpbroadcastw %xmm1, %ymm2
 ; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
 ; AVX512VL-NEXT:    vpsllw %xmm1, %ymm0, %ymm1
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT:    vpsubw %ymm2, %ymm3, %ymm2
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT:    vpsubw %xmm2, %xmm3, %xmm2
 ; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX512VL-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpor %ymm0, %ymm1, %ymm0
diff --git a/test/CodeGen/X86/vector-rotate-512.ll b/test/CodeGen/X86/vector-rotate-512.ll
index 30de8d7c908..f838f1b54db 100644
--- a/test/CodeGen/X86/vector-rotate-512.ll
+++ b/test/CodeGen/X86/vector-rotate-512.ll
@@ -316,8 +316,8 @@ define <32 x i16> @splatvar_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind
 ; AVX512F-NEXT:    vpbroadcastw %xmm2, %ymm3
 ; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX512F-NEXT:    vpsllw %xmm2, %ymm0, %ymm4
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512F-NEXT:    vpsubw %ymm3, %ymm5, %ymm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
+; AVX512F-NEXT:    vpsubw %xmm3, %xmm5, %xmm3
 ; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
 ; AVX512F-NEXT:    vpsrlw %xmm3, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpor %ymm0, %ymm4, %ymm0
@@ -331,8 +331,8 @@ define <32 x i16> @splatvar_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind
 ; AVX512VL-NEXT:    vpbroadcastw %xmm2, %ymm3
 ; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX512VL-NEXT:    vpsllw %xmm2, %ymm0, %ymm4
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm5 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT:    vpsubw %ymm3, %ymm5, %ymm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT:    vpsubw %xmm3, %xmm5, %xmm3
 ; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
 ; AVX512VL-NEXT:    vpsrlw %xmm3, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpor %ymm0, %ymm4, %ymm0
@@ -468,14 +468,14 @@ define <64 x i8> @splatvar_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ; AVX512BW-LABEL: splatvar_rotate_v64i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpbroadcastb %xmm1, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT:    vpsubb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512BW-NEXT:    vpsllw %xmm1, %zmm0, %zmm3
 ; AVX512BW-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4
 ; AVX512BW-NEXT:    vpsllw %xmm1, %zmm4, %zmm1
 ; AVX512BW-NEXT:    vpbroadcastb %xmm1, %zmm1
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
 ; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512BW-NEXT:    vpsrlw %xmm2, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpsrlw %xmm2, %zmm4, %zmm2
@@ -488,14 +488,14 @@ define <64 x i8> @splatvar_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ; AVX512VLBW-LABEL: splatvar_rotate_v64i8:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpbroadcastb %xmm1, %zmm2
-; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT:    vpsubb %zmm2, %zmm3, %zmm2
 ; AVX512VLBW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512VLBW-NEXT:    vpsllw %xmm1, %zmm0, %zmm3
 ; AVX512VLBW-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4
 ; AVX512VLBW-NEXT:    vpsllw %xmm1, %zmm4, %zmm1
 ; AVX512VLBW-NEXT:    vpbroadcastb %xmm1, %zmm1
 ; AVX512VLBW-NEXT:    vpandq %zmm1, %zmm3, %zmm1
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
 ; AVX512VLBW-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512VLBW-NEXT:    vpsrlw %xmm2, %zmm0, %zmm0
 ; AVX512VLBW-NEXT:    vpsrlw %xmm2, %zmm4, %zmm2
-- 
GitLab


From ccd4f446eb6f2890dce79a1a104f6ae4e4bc0a7d Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulsson@linux.vnet.ibm.com>
Date: Tue, 30 Oct 2018 14:34:15 +0000
Subject: [PATCH 0746/1116] [LoopVectorizer]  Fix for cost values of memory
 accesses.

This commit is a combination of two patches:

* "Fix in getScalarizationOverhead()"

   If target returns false in TTI.prefersVectorizedAddressing(), it means the
   address registers will not need to be extracted. Therefore, there should
   be no operands scalarization overhead for a load instruction.

* "Don't pass the instruction pointer from getMemInstScalarizationCost."

   Since VF is always > 1, this is a cost query for an instruction in the
   vectorized loop and it should not be evaluated within the scalar
   context of the instruction.

Review: Ulrich Weigand, Hal Finkel
https://reviews.llvm.org/D52351
https://reviews.llvm.org/D52417

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345603 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp    |  9 +++++-
 .../SystemZ/load-scalarization-cost-0.ll      | 27 ++++++++++++++++++
 .../SystemZ/load-scalarization-cost-1.ll      | 28 +++++++++++++++++++
 3 files changed, 63 insertions(+), 1 deletion(-)
 create mode 100644 test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-0.ll
 create mode 100644 test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-1.ll

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index f0a07eddc3b..006c13c233e 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2982,6 +2982,10 @@ static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
        !TTI.supportsEfficientVectorElementLoadStore()))
     Cost += TTI.getScalarizationOverhead(RetTy, true, false);
 
+  // Some targets keep addresses scalar.
+  if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
+    return Cost;
+
   if (CallInst *CI = dyn_cast<CallInst>(I)) {
     SmallVector<const Value *, 4> Operands(CI->arg_operands());
     Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
@@ -5372,6 +5376,7 @@ static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
 
 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
                                                                  unsigned VF) {
+  assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
   Type *ValTy = getMemInstValueType(I);
   auto SE = PSE.getSE();
 
@@ -5387,9 +5392,11 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
   // Get the cost of the scalar memory instruction and address computation.
   unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
 
+  // Don't pass *I here, since it is scalar but will actually be part of a
+  // vectorized loop where the user of it is a vectorized instruction.
   Cost += VF *
           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
-                              AS, I);
+                              AS);
 
   // Get the overhead of the extractelement and insertelement instructions
   // we might create due to scalarization.
diff --git a/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-0.ll b/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-0.ll
new file mode 100644
index 00000000000..1925527eacf
--- /dev/null
+++ b/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-0.ll
@@ -0,0 +1,27 @@
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \
+; RUN:   -force-vector-width=2 -debug-only=loop-vectorize \
+; RUN:   -disable-output < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+;
+; Check that a scalarized load does not get operands scalarization costs added.
+
+define void @fun(i64* %data, i64 %n, i64 %s, double* %Src) {
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %mul = mul nsw i64 %iv, %s
+  %gep = getelementptr inbounds double, double* %Src, i64 %mul
+  %bct = bitcast double* %gep to i64*
+  %ld = load i64, i64* %bct
+  %iv.next = add nuw nsw i64 %iv, 1
+  %cmp110.us = icmp slt i64 %iv.next, %n
+  br i1 %cmp110.us, label %for.body, label %for.end
+
+for.end:
+  ret void
+
+; CHECK: LV: Found an estimated cost of 2 for VF 2 For instruction:   %mul = mul nsw i64 %iv, %s
+; CHECK: LV: Found an estimated cost of 2 for VF 2 For instruction:   %ld = load i64, i64* %bct
+}
diff --git a/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-1.ll b/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-1.ll
new file mode 100644
index 00000000000..fbf8b114542
--- /dev/null
+++ b/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-1.ll
@@ -0,0 +1,28 @@
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \
+; RUN:   -force-vector-width=4 -debug-only=loop-vectorize \
+; RUN:   -enable-interleaved-mem-accesses=false -disable-output < %s 2>&1 \
+; RUN:   | FileCheck %s
+; REQUIRES: asserts
+;
+; Check that a scalarized load does not get a zero cost in a vectorized
+; loop. It can only be folded into the add operand in the scalar loop.
+
+define i32 @fun(i64* %data, i64 %n, i64 %s, i32* %Src) {
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %acc = phi i32 [ 0, %entry ], [ %acc_next, %for.body ]
+  %gep = getelementptr inbounds i32, i32* %Src, i64 %iv
+  %ld = load i32, i32* %gep
+  %acc_next = add i32 %acc, %ld
+  %iv.next = add nuw nsw i64 %iv, 2
+  %cmp110.us = icmp slt i64 %iv.next, %n
+  br i1 %cmp110.us, label %for.body, label %for.end
+
+for.end:
+  ret i32 %acc_next
+
+; CHECK: Found an estimated cost of 4 for VF 4 For instruction:   %ld = load i32, i32* %gep
+}
-- 
GitLab


From 78f5683a63cadff612b4a17efbf3773579e1575b Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulsson@linux.vnet.ibm.com>
Date: Tue, 30 Oct 2018 15:04:40 +0000
Subject: [PATCH 0747/1116] [SchedModel]  Fix for read advance cycles with
 implicit pseudo operands.

The SchedModel allows the addition of ReadAdvances to express that certain
operands of the instructions are needed at a later point than the others.

RegAlloc may add pseudo operands that are not part of the instruction
descriptor, and therefore cannot have any read advance entries. This meant
that in some cases the desired read advance was nullified by such a pseudo
operand, which still had the original latency.

This patch fixes this by making sure that such pseudo operands get a zero
latency during DAG construction.

Review: Matthias Braun, Ulrich Weigand.
https://reviews.llvm.org/D49671

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345606 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/ScheduleDAGInstrs.cpp             |  20 +-
 test/CodeGen/AMDGPU/call-argument-types.ll    |  22 +-
 .../AMDGPU/call-preserved-registers.ll        |  36 +--
 .../AMDGPU/callee-special-input-sgprs.ll      |   3 +-
 test/CodeGen/AMDGPU/indirect-addressing-si.ll |   6 +-
 test/CodeGen/AMDGPU/inline-asm.ll             |   4 +-
 test/CodeGen/AMDGPU/insert_vector_elt.ll      |   2 +-
 test/CodeGen/AMDGPU/misched-killflags.mir     |  12 +-
 test/CodeGen/AMDGPU/nested-calls.ll           |   4 +-
 .../AMDGPU/undefined-subreg-liverange.ll      |  12 +-
 .../ARM/Windows/chkstk-movw-movt-isel.ll      |   6 +-
 test/CodeGen/ARM/Windows/chkstk.ll            |   6 +-
 test/CodeGen/ARM/Windows/memset.ll            |   4 +-
 test/CodeGen/ARM/arm-and-tst-peephole.ll      |   2 +-
 test/CodeGen/ARM/arm-shrink-wrapping.ll       |  28 +-
 .../ARM/cortex-a57-misched-ldm-wrback.ll      |   4 +-
 test/CodeGen/ARM/cortex-a57-misched-ldm.ll    |   2 +-
 .../ARM/cortex-a57-misched-vldm-wrback.ll     |   4 +-
 test/CodeGen/ARM/cortex-a57-misched-vldm.ll   |   4 +-
 test/CodeGen/ARM/fp16-instructions.ll         |   4 +-
 test/CodeGen/ARM/select.ll                    |   2 +-
 test/CodeGen/ARM/twoaddrinstr.ll              |   4 +-
 test/CodeGen/ARM/vcombine.ll                  |   8 +-
 test/CodeGen/ARM/vuzp.ll                      | 242 +++++++++---------
 test/CodeGen/SystemZ/misched-readadvances.mir |  31 +++
 .../Thumb2/umulo-128-legalisation-lowering.ll |   4 +-
 .../Thumb2/umulo-64-legalisation-lowering.ll  |   4 +-
 test/CodeGen/X86/lsr-loop-exit-cond.ll        |   8 +-
 test/CodeGen/X86/memset.ll                    |   2 +-
 test/CodeGen/X86/phys-reg-local-regalloc.ll   |   4 +-
 test/CodeGen/X86/schedule-x86-64-shld.ll      |   8 +-
 test/CodeGen/X86/schedule-x86_32.ll           |  10 +-
 32 files changed, 278 insertions(+), 234 deletions(-)
 create mode 100644 test/CodeGen/SystemZ/misched-readadvances.mir

diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp
index 346f82ff95f..99406ed1496 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -234,6 +234,11 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) {
   // Ask the target if address-backscheduling is desirable, and if so how much.
   const TargetSubtargetInfo &ST = MF.getSubtarget();
 
+  // Only use any non-zero latency for real defs/uses, in contrast to
+  // "fake" operands added by regalloc.
+  const MCInstrDesc *DefMIDesc = &SU->getInstr()->getDesc();
+  bool ImplicitPseudoDef = (OperIdx >= DefMIDesc->getNumOperands() &&
+                            !DefMIDesc->hasImplicitDefOfPhysReg(MO.getReg()));
   for (MCRegAliasIterator Alias(MO.getReg(), TRI, true);
        Alias.isValid(); ++Alias) {
     if (!Uses.contains(*Alias))
@@ -257,11 +262,18 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) {
         Dep = SDep(SU, SDep::Data, *Alias);
         RegUse = UseSU->getInstr();
       }
-      Dep.setLatency(
-        SchedModel.computeOperandLatency(SU->getInstr(), OperIdx, RegUse,
-                                         UseOp));
+      const MCInstrDesc *UseMIDesc =
+          (RegUse ? &UseSU->getInstr()->getDesc() : nullptr);
+      bool ImplicitPseudoUse =
+          (UseMIDesc && UseOp >= ((int)UseMIDesc->getNumOperands()) &&
+           !UseMIDesc->hasImplicitUseOfPhysReg(*Alias));
+      if (!ImplicitPseudoDef && !ImplicitPseudoUse) {
+        Dep.setLatency(SchedModel.computeOperandLatency(SU->getInstr(), OperIdx,
+                                                        RegUse, UseOp));
+        ST.adjustSchedDependency(SU, UseSU, Dep);
+      } else
+        Dep.setLatency(0);
 
-      ST.adjustSchedDependency(SU, UseSU, Dep);
       UseSU->addPred(Dep);
     }
   }
diff --git a/test/CodeGen/AMDGPU/call-argument-types.ll b/test/CodeGen/AMDGPU/call-argument-types.ll
index 581df1c8527..84d327b6f37 100644
--- a/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -61,11 +61,11 @@ declare void @external_void_func_v16i8(<16 x i8>) #0
 
 ; MESA-DAG: s_mov_b64 s[0:1], s[36:37]
 
+; GCN: v_mov_b32_e32 v0, 1{{$}}
+; MESA-DAG: s_mov_b64 s[2:3], s[38:39]
 ; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
 ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1@rel32@lo+4
 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1@rel32@hi+4
-; GCN-DAG: v_mov_b32_e32 v0, 1{{$}}
-; MESA-DAG: s_mov_b64 s[2:3], s[38:39]
 
 ; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
 ; GCN-NEXT: s_endpgm
@@ -123,12 +123,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GCN-LABEL: {{^}}test_call_external_void_func_i8_imm:
 ; MESA-DAG: s_mov_b32 s33, s3{{$}}
 
+; GCN: v_mov_b32_e32 v0, 0x7b
+; HSA-DAG: s_mov_b32 s4, s33{{$}}
 ; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
 ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8@rel32@lo+4
 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8@rel32@hi+4
-; GCN-NEXT: v_mov_b32_e32 v0, 0x7b
 
-; HSA-DAG: s_mov_b32 s4, s33{{$}}
 ; GCN-DAG: s_mov_b32 s32, s33{{$}}
 
 ; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
@@ -144,11 +144,11 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 {
 ; MESA-DAG: s_mov_b32 s33, s3{{$}}
 
 ; GCN-DAG: buffer_load_sbyte v0
+; GCN: s_mov_b32 s4, s33
 ; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
 ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_signext@rel32@lo+4
 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_signext@rel32@hi+4
 
-; GCN-DAG: s_mov_b32 s4, s33
 ; GCN-DAG: s_mov_b32 s32, s3
 
 ; GCN: s_waitcnt vmcnt(0)
@@ -165,11 +165,11 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
 ; HSA-DAG: s_mov_b32 s33, s9{{$}}
 
 ; GCN-DAG: buffer_load_ubyte v0
+; GCN: s_mov_b32 s4, s33
 ; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
 ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_zeroext@rel32@lo+4
 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_zeroext@rel32@hi+4
 
-; GCN-DAG: s_mov_b32 s4, s33
 ; GCN-DAG: s_mov_b32 s32, s33
 
 ; GCN: s_waitcnt vmcnt(0)
@@ -197,11 +197,11 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 {
 ; MESA-DAG: s_mov_b32 s33, s3{{$}}
 
 ; GCN-DAG: buffer_load_sshort v0
+; GCN: s_mov_b32 s4, s33
 ; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
 ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_signext@rel32@lo+4
 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_signext@rel32@hi+4
 
-; GCN-DAG: s_mov_b32 s4, s33
 ; GCN-DAG: s_mov_b32 s32, s33
 
 ; GCN: s_waitcnt vmcnt(0)
@@ -218,11 +218,11 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 {
 
 
 ; GCN-DAG: buffer_load_ushort v0
+; GCN: s_mov_b32 s4, s33
 ; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
 ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_zeroext@rel32@lo+4
 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_zeroext@rel32@hi+4
 
-; GCN-DAG: s_mov_b32 s4, s33
 ; GCN-DAG: s_mov_b32 s32, s33
 
 ; GCN: s_waitcnt vmcnt(0)
@@ -237,11 +237,11 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GCN-LABEL: {{^}}test_call_external_void_func_i32_imm:
 ; MESA-DAG: s_mov_b32 s33, s3{{$}}
 
+; GCN: v_mov_b32_e32 v0, 42
+; GCN: s_mov_b32 s4, s33
 ; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
 ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i32@rel32@lo+4
 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i32@rel32@hi+4
-; GCN: v_mov_b32_e32 v0, 42
-; GCN-DAG: s_mov_b32 s4, s33
 ; GCN-DAG: s_mov_b32 s32, s33
 
 ; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
@@ -481,10 +481,10 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 {
 ; HSA-DAG: s_mov_b32 s33, s9
 ; MESA-DAG: s_mov_b32 s33, s3{{$}}
 
+; GCN-NOT: v3
 ; GCN-DAG: v_mov_b32_e32 v0, 3
 ; GCN-DAG: v_mov_b32_e32 v1, 4
 ; GCN-DAG: v_mov_b32_e32 v2, 5
-; GCN-NOT: v3
 
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 {
diff --git a/test/CodeGen/AMDGPU/call-preserved-registers.ll b/test/CodeGen/AMDGPU/call-preserved-registers.ll
index 6d1e2467d08..57bc6171d7a 100644
--- a/test/CodeGen/AMDGPU/call-preserved-registers.ll
+++ b/test/CodeGen/AMDGPU/call-preserved-registers.ll
@@ -6,10 +6,10 @@ declare void @external_void_func_void() #0
 
 ; GCN-LABEL: {{^}}test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
 ; GCN: s_mov_b32 s33, s7
-; GCN: s_getpc_b64 s[34:35]
+; GCN: s_mov_b32 s4, s33
+; GCN-NEXT: s_getpc_b64 s[34:35]
 ; GCN-NEXT: s_add_u32 s34, s34,
 ; GCN-NEXT: s_addc_u32 s35, s35,
-; GCN-NEXT: s_mov_b32 s4, s33
 ; GCN-NEXT: s_mov_b32 s32, s33
 ; GCN: s_swappc_b64 s[30:31], s[34:35]
 
@@ -129,13 +129,13 @@ define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(i32 addrspace
 
 ; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s33:
 ; GCN: s_mov_b32 s34, s9
-; GCN: ; def s33
-; GCN-NEXT: #ASMEND
-; GCN: s_getpc_b64 s[6:7]
-; GCN-NEXT: s_add_u32 s6, s6, external_void_func_void@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s7, s7, external_void_func_void@rel32@hi+4
-; GCN-NEXT: s_mov_b32 s4, s34
-; GCN-NEXT: s_mov_b32 s32, s34
+; GCN: s_mov_b32 s4, s34
+; GCN-DAG: s_mov_b32 s32, s34
+; GCN-DAG: ; def s33
+; GCN-DAG: #ASMEND
+; GCN-DAG: s_getpc_b64 s[6:7]
+; GCN-DAG: s_add_u32 s6, s6, external_void_func_void@rel32@lo+4
+; GCN-DAG: s_addc_u32 s7, s7, external_void_func_void@rel32@hi+4
 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
 ; GCN-NEXT: ;;#ASMSTART
 ; GCN-NEXT: ; use s33
@@ -150,13 +150,13 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s33(i32 addrspace(
 
 ; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v32:
 ; GCN: s_mov_b32 s33, s9
-; GCN: ; def v32
-; GCN-NEXT: #ASMEND
-; GCN: s_getpc_b64 s[6:7]
-; GCN-NEXT: s_add_u32 s6, s6, external_void_func_void@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s7, s7, external_void_func_void@rel32@hi+4
-; GCN-NEXT: s_mov_b32 s4, s33
-; GCN-NEXT: s_mov_b32 s32, s33
+; GCN: s_mov_b32 s4, s33
+; GCN-DAG: s_mov_b32 s32, s33
+; GCN-DAG: ; def v32
+; GCN-DAG: #ASMEND
+; GCN-DAG: s_getpc_b64 s[6:7]
+; GCN-DAG: s_add_u32 s6, s6, external_void_func_void@rel32@lo+4
+; GCN-DAG: s_addc_u32 s7, s7, external_void_func_void@rel32@hi+4
 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
 ; GCN-NEXT: ;;#ASMSTART
 ; GCN-NEXT: ; use v32
@@ -183,10 +183,10 @@ define void @void_func_void_clobber_s33() #2 {
 
 ; GCN-LABEL: {{^}}test_call_void_func_void_clobber_s33:
 ; GCN: s_mov_b32 s33, s7
-; GCN: s_getpc_b64
+; GCN: s_mov_b32 s4, s33
+; GCN-NEXT: s_getpc_b64
 ; GCN-NEXT: s_add_u32
 ; GCN-NEXT: s_addc_u32
-; GCN-NEXT: s_mov_b32 s4, s33
 ; GCN-NEXT: s_mov_b32 s32, s33
 ; GCN: s_swappc_b64
 ; GCN-NEXT: s_endpgm
diff --git a/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll b/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
index 907575c1ba8..e5c18062708 100644
--- a/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
+++ b/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
@@ -558,7 +558,8 @@ define void @func_use_every_sgpr_input_call_use_workgroup_id_xyz() #1 {
 
 ; GCN-LABEL: {{^}}func_use_every_sgpr_input_call_use_workgroup_id_xyz_spill:
 ; GCN: s_mov_b32 s5, s32
-; GCN: s_add_u32 s32, s32, 0x400
+
+; GCN-DAG: s_add_u32 s32, s32, 0x400
 
 ; GCN-DAG: s_mov_b32 [[SAVE_X:s[0-57-9][0-9]*]], s14
 ; GCN-DAG: s_mov_b32 [[SAVE_Y:s[0-68-9][0-9]*]], s15
diff --git a/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index 31199b47e20..8e02303377c 100644
--- a/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -386,9 +386,9 @@ bb2:
 ; GCN-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62
 
 ; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT3:[0-9]+]], s[[S_ELT3]]
-; GCN: v_mov_b32_e32 v[[VEC_ELT2:[0-9]+]], s{{[0-9]+}}
-; GCN: v_mov_b32_e32 v[[VEC_ELT1:[0-9]+]], s{{[0-9]+}}
-; GCN: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]]
+; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT2:[0-9]+]], s{{[0-9]+}}
+; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT1:3]], s{{[0-9]+}}
+; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]]
 
 ; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]:
 ; GCN-NEXT: s_waitcnt vmcnt(0)
diff --git a/test/CodeGen/AMDGPU/inline-asm.ll b/test/CodeGen/AMDGPU/inline-asm.ll
index 2856212bc89..a0563cdd319 100644
--- a/test/CodeGen/AMDGPU/inline-asm.ll
+++ b/test/CodeGen/AMDGPU/inline-asm.ll
@@ -186,8 +186,8 @@ entry:
 
 ; FIXME: Should not have intermediate sgprs
 ; CHECK-LABEL: {{^}}i64_imm_input_phys_vgpr:
-; CHECK: s_mov_b32 s1, 0
-; CHECK: s_mov_b32 s0, 0x1e240
+; CHECK-DAG: s_mov_b32 s1, 0
+; CHECK-DAG: s_mov_b32 s0, 0x1e240
 ; CHECK: v_mov_b32_e32 v0, s0
 ; CHECK: v_mov_b32_e32 v1, s1
 ; CHECK: use v[0:1]
diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.ll b/test/CodeGen/AMDGPU/insert_vector_elt.ll
index a62ad820c89..692696ff730 100644
--- a/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -352,7 +352,7 @@ endif:
 ; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0x40200000
 
 ; GCN-DAG: s_mov_b32 m0, [[SCALEDIDX]]
-; GCN: v_movreld_b32_e32 v{{[0-9]+}}, 0
+; GCN-DAG: v_movreld_b32_e32 v{{[0-9]+}}, 0
 
 ; Increment to next element folded into base register, but FileCheck
 ; can't do math expressions
diff --git a/test/CodeGen/AMDGPU/misched-killflags.mir b/test/CodeGen/AMDGPU/misched-killflags.mir
index 811ef0d1375..0c58042d508 100644
--- a/test/CodeGen/AMDGPU/misched-killflags.mir
+++ b/test/CodeGen/AMDGPU/misched-killflags.mir
@@ -26,20 +26,20 @@ body: |
     S_ENDPGM
 ...
 # CHECK-LABEL: name: func0
-# CHECK: $sgpr10 = S_MOV_B32 5
-# CHECK: $sgpr9 = S_MOV_B32 4
-# CHECK: $sgpr8 = S_MOV_B32 3
-# CHECK: $sgpr33 = S_MOV_B32 killed $sgpr7
+# CHECK-DAG: $sgpr10 = S_MOV_B32 5
+# CHECK-DAG: $sgpr9 = S_MOV_B32 4
+# CHECK-DAG: $sgpr8 = S_MOV_B32 3
+# CHECK-DAG: $sgpr33 = S_MOV_B32 killed $sgpr7
 # CHECK: $vgpr0 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr8_sgpr9_sgpr10_sgpr11
+# CHECK: $sgpr32 = S_MOV_B32 $sgpr33
 # CHECK: BUNDLE implicit-def $sgpr6_sgpr7, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $scc {
 # CHECK:   $sgpr6_sgpr7 = S_GETPC_B64
 # CHECK:   $sgpr6 = S_ADD_U32 internal $sgpr6, 0, implicit-def $scc
 # CHECK:   $sgpr7 = S_ADDC_U32 internal $sgpr7, 0, implicit-def $scc, implicit internal $scc
 # CHECK: }
-# CHECK: $sgpr4 = S_MOV_B32 $sgpr33
+# CHECK: $sgpr4 = S_MOV_B32 killed $sgpr33
 # CHECK: $vgpr1 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11
 # CHECK: $vgpr2 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11
 # CHECK: $vgpr3 = V_MOV_B32_e32 killed $sgpr11, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec
-# CHECK: $sgpr32 = S_MOV_B32 killed $sgpr33
 # CHECK: S_NOP 0, implicit killed $sgpr6_sgpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3
 # CHECK: S_ENDPGM
diff --git a/test/CodeGen/AMDGPU/nested-calls.ll b/test/CodeGen/AMDGPU/nested-calls.ll
index 462274c65e7..7fbcb9706a8 100644
--- a/test/CodeGen/AMDGPU/nested-calls.ll
+++ b/test/CodeGen/AMDGPU/nested-calls.ll
@@ -33,8 +33,8 @@ define void @test_func_call_external_void_func_i32_imm() #0 {
 ; GCN-LABEL: {{^}}test_func_call_external_void_func_i32_imm_stack_use:
 ; GCN: s_waitcnt
 ; GCN: s_mov_b32 s5, s32
-; GCN: s_add_u32 s32, s32, 0x1400{{$}}
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset
+; GCN-DAG: s_add_u32 s32, s32, 0x1400{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset
 ; GCN: s_swappc_b64
 ; GCN: s_sub_u32 s32, s32, 0x1400{{$}}
 ; GCN: s_setpc_b64
diff --git a/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll b/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
index d4c05fb5682..0d6bb661797 100644
--- a/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
+++ b/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
@@ -57,18 +57,18 @@ bb11:                                             ; preds = %bb9
 
 ; CHECK-LABEL: {{^}}partially_undef_copy:
 ; CHECK: v_mov_b32_e32 v5, 5
-; CHECK: v_mov_b32_e32 v6, 6
+; CHECK-DAG: v_mov_b32_e32 v6, 6
 
-; CHECK: v_mov_b32_e32 v[[OUTPUT_LO:[0-9]+]], v5
+; CHECK-DAG: v_mov_b32_e32 v[[OUTPUT_LO:[0-9]+]], v5
 
 ; Undef copy
-; CHECK: v_mov_b32_e32 v1, v6
+; CHECK-DAG: v_mov_b32_e32 v1, v6
 
 ; undef copy
-; CHECK: v_mov_b32_e32 v2, v7
+; CHECK-DAG: v_mov_b32_e32 v2, v7
 
-; CHECK: v_mov_b32_e32 v[[OUTPUT_HI:[0-9]+]], v8
-; CHECK: v_mov_b32_e32 v[[OUTPUT_LO]], v6
+; CHECK-DAG: v_mov_b32_e32 v[[OUTPUT_HI:[0-9]+]], v8
+; CHECK-DAG: v_mov_b32_e32 v[[OUTPUT_LO]], v6
 
 ; CHECK: buffer_store_dwordx4 v{{\[}}[[OUTPUT_LO]]:[[OUTPUT_HI]]{{\]}}
 define amdgpu_kernel void @partially_undef_copy() #0 {
diff --git a/test/CodeGen/ARM/Windows/chkstk-movw-movt-isel.ll b/test/CodeGen/ARM/Windows/chkstk-movw-movt-isel.ll
index 019298d2080..e75df160e00 100644
--- a/test/CodeGen/ARM/Windows/chkstk-movw-movt-isel.ll
+++ b/test/CodeGen/ARM/Windows/chkstk-movw-movt-isel.ll
@@ -19,9 +19,9 @@ entry:
 
 ; CHECK-LABEL: isel
 ; CHECK: push {r4, r5, r6, lr}
-; CHECK: movw r12, #0
-; CHECK: movt r12, #0
-; CHECK: movw r4, #{{\d*}}
+; CHECK-DAG: movw r12, #0
+; CHECK-DAG: movt r12, #0
+; CHECK-DAG: movw r4, #{{\d*}}
 ; CHECK: blx r12
 ; CHECK: sub.w sp, sp, r4
 
diff --git a/test/CodeGen/ARM/Windows/chkstk.ll b/test/CodeGen/ARM/Windows/chkstk.ll
index 330c1f45850..8fd41461459 100644
--- a/test/CodeGen/ARM/Windows/chkstk.ll
+++ b/test/CodeGen/ARM/Windows/chkstk.ll
@@ -16,9 +16,9 @@ entry:
 ; CHECK-DEFAULT-CODE-MODEL: 	sub.w sp, sp, r4
 
 ; CHECK-LARGE-CODE-MODEL: check_watermark:
-; CHECK-LARGE-CODE-MODEL: 	movw r12, :lower16:__chkstk
-; CHECK-LARGE-CODE-MODEL: 	movt r12, :upper16:__chkstk
-; CHECK-LARGE-CODE-MODEL: 	movw r4, #1024
+; CHECK-LARGE-CODE-MODEL-DAG: 	movw r12, :lower16:__chkstk
+; CHECK-LARGE-CODE-MODEL-DAG: 	movt r12, :upper16:__chkstk
+; CHECK-LARGE-CODE-MODEL-DAG: 	movw r4, #1024
 ; CHECK-LARGE-CODE-MODEL: 	blx r12
 ; CHECK-LARGE-CODE-MODEL: 	sub.w sp, sp, r4
 
diff --git a/test/CodeGen/ARM/Windows/memset.ll b/test/CodeGen/ARM/Windows/memset.ll
index c9b22f47a15..8cb257c1566 100644
--- a/test/CodeGen/ARM/Windows/memset.ll
+++ b/test/CodeGen/ARM/Windows/memset.ll
@@ -10,9 +10,9 @@ entry:
   unreachable
 }
 
-; CHECK: movw r0, :lower16:source
-; CHECK: movt r0, :upper16:source
 ; CHECK: movs r1, #0
 ; CHECK: mov.w r2, #512
+; CHECK: movw r0, :lower16:source
+; CHECK: movt r0, :upper16:source
 ; CHECK: memset
 
diff --git a/test/CodeGen/ARM/arm-and-tst-peephole.ll b/test/CodeGen/ARM/arm-and-tst-peephole.ll
index 8e38f18c069..b81cf443e53 100644
--- a/test/CodeGen/ARM/arm-and-tst-peephole.ll
+++ b/test/CodeGen/ARM/arm-and-tst-peephole.ll
@@ -162,8 +162,8 @@ define i32 @test_tst_assessment(i32 %a, i32 %b) {
 ;
 ; T2-LABEL: test_tst_assessment:
 ; T2:       @ %bb.0:
-; T2-NEXT:    lsls r1, r1, #31
 ; T2-NEXT:    and r0, r0, #1
+; T2-NEXT:    lsls r1, r1, #31
 ; T2-NEXT:    it ne
 ; T2-NEXT:    subne r0, #1
 ; T2-NEXT:    bx lr
diff --git a/test/CodeGen/ARM/arm-shrink-wrapping.ll b/test/CodeGen/ARM/arm-shrink-wrapping.ll
index c943f60c56d..bf4f1bd0d0c 100644
--- a/test/CodeGen/ARM/arm-shrink-wrapping.ll
+++ b/test/CodeGen/ARM/arm-shrink-wrapping.ll
@@ -104,10 +104,10 @@ declare i32 @doSomething(i32, i32*)
 ; Next BB.
 ; CHECK: [[LOOP:LBB[0-9_]+]]: @ %for.body
 ; CHECK: mov{{(\.w)?}} [[TMP:r[0-9]+]], #1
-; ARM: subs [[IV]], [[IV]], #1
-; THUMB: subs [[IV]], #1
-; ARM-NEXT: add [[SUM]], [[TMP]], [[SUM]]
-; THUMB-NEXT: add [[SUM]], [[TMP]]
+; ARM: add [[SUM]], [[TMP]], [[SUM]]
+; THUMB: add [[SUM]], [[TMP]]
+; ARM-NEXT: subs [[IV]], [[IV]], #1
+; THUMB-NEXT: subs [[IV]], #1
 ; CHECK-NEXT: bne [[LOOP]]
 ;
 ; Next BB.
@@ -169,10 +169,10 @@ declare i32 @something(...)
 ; Next BB.
 ; CHECK: [[LOOP_LABEL:LBB[0-9_]+]]: @ %for.body
 ; CHECK: mov{{(\.w)?}} [[TMP:r[0-9]+]], #1
-; ARM: subs [[IV]], [[IV]], #1
-; THUMB: subs [[IV]], #1
 ; ARM: add [[SUM]], [[TMP]], [[SUM]]
 ; THUMB: add [[SUM]], [[TMP]]
+; ARM: subs [[IV]], [[IV]], #1
+; THUMB: subs [[IV]], #1
 ; CHECK-NEXT: bne [[LOOP_LABEL]]
 ; Next BB.
 ; CHECK: @ %for.exit
@@ -228,10 +228,10 @@ for.end:                                          ; preds = %for.body
 ; Next BB.
 ; CHECK: [[LOOP:LBB[0-9_]+]]: @ %for.body
 ; CHECK: mov{{(\.w)?}} [[TMP:r[0-9]+]], #1
-; ARM: subs [[IV]], [[IV]], #1
-; THUMB: subs [[IV]], #1
-; ARM-NEXT: add [[SUM]], [[TMP]], [[SUM]]
-; THUMB-NEXT: add [[SUM]], [[TMP]]
+; ARM: add [[SUM]], [[TMP]], [[SUM]]
+; THUMB: add [[SUM]], [[TMP]]
+; ARM-NEXT: subs [[IV]], [[IV]], #1
+; THUMB-NEXT: subs [[IV]], #1
 ; CHECK-NEXT: bne [[LOOP]]
 ;
 ; Next BB.
@@ -307,10 +307,10 @@ declare void @somethingElse(...)
 ; Next BB.
 ; CHECK: [[LOOP:LBB[0-9_]+]]: @ %for.body
 ; CHECK: mov{{(\.w)?}} [[TMP:r[0-9]+]], #1
-; ARM: subs [[IV]], [[IV]], #1
-; THUMB: subs [[IV]], #1
-; ARM-NEXT: add [[SUM]], [[TMP]], [[SUM]]
-; THUMB-NEXT: add [[SUM]], [[TMP]]
+; ARM: add [[SUM]], [[TMP]], [[SUM]]
+; THUMB: add [[SUM]], [[TMP]]
+; ARM-NEXT: subs [[IV]], [[IV]], #1
+; THUMB-NEXT: subs [[IV]], #1
 ; CHECK-NEXT: bne [[LOOP]]
 ;
 ; Next BB.
diff --git a/test/CodeGen/ARM/cortex-a57-misched-ldm-wrback.ll b/test/CodeGen/ARM/cortex-a57-misched-ldm-wrback.ll
index 0ae2d5f6f2f..2c0aa98eae0 100644
--- a/test/CodeGen/ARM/cortex-a57-misched-ldm-wrback.ll
+++ b/test/CodeGen/ARM/cortex-a57-misched-ldm-wrback.ll
@@ -18,9 +18,9 @@
 ; CHECK-NEXT:  Data
 ; CHECK-SAME:  Latency=3
 ; CHECK-NEXT:  Data
-; CHECK-SAME:  Latency=3
+; CHECK-SAME:  Latency=0
 ; CHECK-NEXT:  Data
-; CHECK-SAME:  Latency=4
+; CHECK-SAME:  Latency=0
 define i32 @bar(i32 %a1, i32 %b1, i32 %c1) minsize optsize {
   %1 = load i32, i32* @a, align 4
   %2 = load i32, i32* @b, align 4
diff --git a/test/CodeGen/ARM/cortex-a57-misched-ldm.ll b/test/CodeGen/ARM/cortex-a57-misched-ldm.ll
index bc7a14b1028..02d1c2f55f9 100644
--- a/test/CodeGen/ARM/cortex-a57-misched-ldm.ll
+++ b/test/CodeGen/ARM/cortex-a57-misched-ldm.ll
@@ -11,7 +11,7 @@
 ; CHECK:       Data
 ; CHECK-SAME:  Latency=3
 ; CHECK-NEXT:  Data
-; CHECK-SAME:  Latency=3
+; CHECK-SAME:  Latency=0
 
 define i32 @foo(i32* %a) nounwind optsize {
 entry:
diff --git a/test/CodeGen/ARM/cortex-a57-misched-vldm-wrback.ll b/test/CodeGen/ARM/cortex-a57-misched-vldm-wrback.ll
index b5edcc30422..1baf472ca49 100644
--- a/test/CodeGen/ARM/cortex-a57-misched-vldm-wrback.ll
+++ b/test/CodeGen/ARM/cortex-a57-misched-vldm-wrback.ll
@@ -20,9 +20,9 @@
 ; CHECK-NEXT:  Data
 ; CHECK-SAME:  Latency=5
 ; CHECK-NEXT:  Data
-; CHECK-SAME:  Latency=5
+; CHECK-SAME:  Latency=0
 ; CHECK-NEXT:  Data
-; CHECK-SAME:  Latency=6
+; CHECK-SAME:  Latency=0
 define i32 @bar(i32* %iptr) minsize optsize {
   %1 = load double, double* @a, align 8
   %2 = load double, double* @b, align 8
diff --git a/test/CodeGen/ARM/cortex-a57-misched-vldm.ll b/test/CodeGen/ARM/cortex-a57-misched-vldm.ll
index 12c7b3270c3..8da133e806e 100644
--- a/test/CodeGen/ARM/cortex-a57-misched-vldm.ll
+++ b/test/CodeGen/ARM/cortex-a57-misched-vldm.ll
@@ -11,9 +11,9 @@
 ; CHECK:       Data
 ; CHECK-SAME:  Latency=5
 ; CHECK-NEXT:  Data
-; CHECK-SAME:  Latency=5
+; CHECK-SAME:  Latency=0
 ; CHECK-NEXT:  Data
-; CHECK-SAME:  Latency=6
+; CHECK-SAME:  Latency=0
 
 define double @foo(double* %a) nounwind optsize {
 entry:
diff --git a/test/CodeGen/ARM/fp16-instructions.ll b/test/CodeGen/ARM/fp16-instructions.ll
index 6505d2bf673..670fcf58b1e 100644
--- a/test/CodeGen/ARM/fp16-instructions.ll
+++ b/test/CodeGen/ARM/fp16-instructions.ll
@@ -935,9 +935,9 @@ entry:
 ; CHECK-SOFTFP-FP16-T32:       vmov	[[S6:s[0-9]]], r0
 ; CHECK-SOFTFP-FP16-T32:       vldr	s0, .LCP{{.*}}
 ; CHECK-SOFTFP-FP16-T32:       vcvtb.f32.f16	[[S6]], [[S6]]
-; CHECK-SOFTFP-FP16-T32:       vmov.f32	[[S2:s[0-9]]], #-2.000000e+00
-; CHECK-SOFTFP-FP16-T32:       vcmp.f32	[[S6]], s0
 ; CHECK-SOFTFP-FP16-T32:       vldr	[[S4:s[0-9]]], .LCPI{{.*}}
+; CHECK-SOFTFP-FP16-T32:       vcmp.f32	[[S6]], s0
+; CHECK-SOFTFP-FP16-T32:       vmov.f32	[[S2:s[0-9]]], #-2.000000e+00
 ; CHECK-SOFTFP-FP16-T32:       vmrs	APSR_nzcv, fpscr
 ; CHECK-SOFTFP-FP16-T32:       it eq
 ; CHECK-SOFTFP-FP16-T32:       vmoveq.f32	[[S4]], [[S2]]
diff --git a/test/CodeGen/ARM/select.ll b/test/CodeGen/ARM/select.ll
index e9394a72073..639b88183cc 100644
--- a/test/CodeGen/ARM/select.ll
+++ b/test/CodeGen/ARM/select.ll
@@ -80,8 +80,8 @@ define double @f7(double %a, double %b) {
 ; block generated, odds are good that we have close to the ideal code for this:
 ;
 ; CHECK-NEON-LABEL: f8:
-; CHECK-NEON:      movw    [[R3:r[0-9]+]], #1123
 ; CHECK-NEON:      adr     [[R2:r[0-9]+]], LCPI7_0
+; CHECK-NEON:      movw    [[R3:r[0-9]+]], #1123
 ; CHECK-NEON-NEXT: cmp     r0, [[R3]]
 ; CHECK-NEON-NEXT: it      eq
 ; CHECK-NEON-NEXT: addeq{{.*}} [[R2]], #4
diff --git a/test/CodeGen/ARM/twoaddrinstr.ll b/test/CodeGen/ARM/twoaddrinstr.ll
index f0a95c833c6..e8c52e1b58d 100644
--- a/test/CodeGen/ARM/twoaddrinstr.ll
+++ b/test/CodeGen/ARM/twoaddrinstr.ll
@@ -4,8 +4,8 @@
 define void @PR13378() nounwind {
 ; This was orriginally a crasher trying to schedule the instructions.
 ; CHECK-LABEL:      PR13378:
-; CHECK:        vld1.32
-; CHECK-NEXT:   vmov.i32
+; CHECK:        vmov.i32
+; CHECK-NEXT:   vld1.32
 ; CHECK-NEXT:   vst1.32
 ; CHECK-NEXT:   vst1.32
 ; CHECK-NEXT:   vmov.f32
diff --git a/test/CodeGen/ARM/vcombine.ll b/test/CodeGen/ARM/vcombine.ll
index c08ed81d042..de234b6879e 100644
--- a/test/CodeGen/ARM/vcombine.ll
+++ b/test/CodeGen/ARM/vcombine.ll
@@ -39,8 +39,8 @@ define <4 x i32> @vcombine32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
 ; CHECK-DAG: vldr [[LD0:d[0-9]+]], [r0]
 ; CHECK-DAG: vldr [[LD1:d[0-9]+]], [r1]
 
-; CHECK-LE: vmov r0, r1, [[LD0]]
 ; CHECK-LE: vmov r2, r3, [[LD1]]
+; CHECK-LE: vmov r0, r1, [[LD0]]
 
 ; CHECK-BE: vmov r1, r0, d16
 ; CHECK-BE: vmov r3, r2, d17
@@ -56,8 +56,8 @@ define <4 x float> @vcombinefloat(<2 x float>* %A, <2 x float>* %B) nounwind {
 ; CHECK-DAG: vldr [[LD0:d[0-9]+]], [r0]
 ; CHECK-DAG: vldr [[LD1:d[0-9]+]], [r1]
 
-; CHECK-LE: vmov r0, r1, [[LD0]]
 ; CHECK-LE: vmov r2, r3, [[LD1]]
+; CHECK-LE: vmov r0, r1, [[LD0]]
 
 ; CHECK-BE: vmov r1, r0, d16
 ; CHECK-BE: vmov r3, r2, d17
@@ -72,11 +72,11 @@ define <2 x i64> @vcombine64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
 ; CHECK-DAG: vldr [[LD0:d[0-9]+]], [r0]
 ; CHECK-DAG: vldr [[LD1:d[0-9]+]], [r1]
 
-; CHECK-LE: vmov r0, r1, [[LD0]]
 ; CHECK-LE: vmov r2, r3, [[LD1]]
+; CHECK-LE: vmov r0, r1, [[LD0]]
 
-; CHECK-BE: vmov r1, r0, [[LD0]]
 ; CHECK-BE: vmov r3, r2, [[LD1]]
+; CHECK-BE: vmov r1, r0, [[LD0]]
 	%tmp1 = load <1 x i64>, <1 x i64>* %A
 	%tmp2 = load <1 x i64>, <1 x i64>* %B
 	%tmp3 = shufflevector <1 x i64> %tmp1, <1 x i64> %tmp2, <2 x i32> <i32 0, i32 1>
diff --git a/test/CodeGen/ARM/vuzp.ll b/test/CodeGen/ARM/vuzp.ll
index 96cafdec7bf..281fe2537a4 100644
--- a/test/CodeGen/ARM/vuzp.ll
+++ b/test/CodeGen/ARM/vuzp.ll
@@ -324,23 +324,23 @@ define <8 x i8> @cmpsel_trunc(<8 x i8> %in0, <8 x i8> %in1, <8 x i32> %cmp0, <8
 ; truncate from i32 to i16 and one vmovn.i16 to perform the final truncation for i8.
 ; CHECK-LABEL: cmpsel_trunc:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:	add	r12, sp, #16
-; CHECK-NEXT: 	vld1.64	{d16, d17}, [r12]
-; CHECK-NEXT:	mov	r12, sp
-; CHECK-NEXT:	vld1.64	{d18, d19}, [r12]
-; CHECK-NEXT:	add	r12, sp, #48
-; CHECK-NEXT:	vld1.64	{d20, d21}, [r12]
-; CHECK-NEXT:	add	r12, sp, #32
-; CHECK-NEXT:	vcgt.u32	q8, q10, q8
-; CHECK-NEXT:	vld1.64	{d20, d21}, [r12]
-; CHECK-NEXT:	vcgt.u32	q9, q10, q9
-; CHECK-NEXT:	vmov	d20, r2, r3
-; CHECK-NEXT:	vmovn.i32	d17, q8
-; CHECK-NEXT:	vmovn.i32	d16, q9
-; CHECK-NEXT:	vmov	d18, r0, r1
-; CHECK-NEXT:	vmovn.i16	d16, q8
-; CHECK-NEXT:	vbsl	d16, d18, d20
-; CHECK-NEXT:	vmov	r0, r1, d16
+; CHECK-NEXT:    add r12, sp, #16
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
+; CHECK-NEXT:    mov r12, sp
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
+; CHECK-NEXT:    add r12, sp, #48
+; CHECK-NEXT:    vld1.64 {d20, d21}, [r12]
+; CHECK-NEXT:    add r12, sp, #32
+; CHECK-NEXT:    vcgt.u32 q8, q10, q8
+; CHECK-NEXT:    vld1.64 {d20, d21}, [r12]
+; CHECK-NEXT:    vcgt.u32 q9, q10, q9
+; CHECK-NEXT:    vmov d20, r2, r3
+; CHECK-NEXT:    vmovn.i32 d17, q8
+; CHECK-NEXT:    vmovn.i32 d16, q9
+; CHECK-NEXT:    vmov d18, r0, r1
+; CHECK-NEXT:    vmovn.i16 d16, q8
+; CHECK-NEXT:    vbsl d16, d18, d20
+; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    mov pc, lr
   %c = icmp ult <8 x i32> %cmp0, %cmp1
   %res = select <8 x i1> %c, <8 x i8> %in0, <8 x i8> %in1
@@ -353,28 +353,28 @@ define <8 x i8> @cmpsel_trunc(<8 x i8> %in0, <8 x i8> %in1, <8 x i32> %cmp0, <8
 define <8 x i8> @vuzp_trunc_and_shuffle(<8 x i8> %tr0, <8 x i8> %tr1,
 ; CHECK-LABEL: vuzp_trunc_and_shuffle:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:	.save	{r11, lr}
-; CHECK-NEXT:	push	{r11, lr}
-; CHECK-NEXT:	add	r12, sp, #8
-; CHECK-NEXT:	add	lr, sp, #24
-; CHECK-NEXT:	vld1.64	{d16, d17}, [r12]
-; CHECK-NEXT:	ldr	r12, [sp, #40]
-; CHECK-NEXT:	vld1.64	{d18, d19}, [lr]
-; CHECK-NEXT:	vcgt.u32	q8, q9, q8
-; CHECK-NEXT:	vld1.32	{d18[0]}, [r12:32]
-; CHECK-NEXT:	vmov.i8	d19, #0x7
-; CHECK-NEXT:	vmovl.u8	q10, d18
-; CHECK-NEXT:	vmovn.i32	d16, q8
-; CHECK-NEXT:	vneg.s8	d17, d19
-; CHECK-NEXT:	vmov	d18, r2, r3
-; CHECK-NEXT:	vuzp.8	d16, d20
-; CHECK-NEXT:	vshl.i8	d16, d16, #7
-; CHECK-NEXT:	vshl.s8	d16, d16, d17
-; CHECK-NEXT:	vmov	d17, r0, r1
-; CHECK-NEXT:	vbsl	d16, d17, d18
-; CHECK-NEXT:	vmov	r0, r1, d16
-; CHECK-NEXT:	pop	{r11, lr}
-; CHECK-NEXT:	mov	pc, lr
+; CHECK-NEXT:    .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
+; CHECK-NEXT:    add r12, sp, #8
+; CHECK-NEXT:    add lr, sp, #24
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
+; CHECK-NEXT:    ldr r12, [sp, #40]
+; CHECK-NEXT:    vld1.64 {d18, d19}, [lr]
+; CHECK-NEXT:    vcgt.u32 q8, q9, q8
+; CHECK-NEXT:    vld1.32 {d18[0]}, [r12:32]
+; CHECK-NEXT:    vmov.i8 d19, #0x7
+; CHECK-NEXT:    vmovl.u8 q10, d18
+; CHECK-NEXT:    vmovn.i32 d16, q8
+; CHECK-NEXT:    vneg.s8 d17, d19
+; CHECK-NEXT:    vmov d18, r2, r3
+; CHECK-NEXT:    vuzp.8 d16, d20
+; CHECK-NEXT:    vshl.i8 d16, d16, #7
+; CHECK-NEXT:    vshl.s8 d16, d16, d17
+; CHECK-NEXT:    vmov d17, r0, r1
+; CHECK-NEXT:    vbsl d16, d17, d18
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    pop {r11, lr}
+; CHECK-NEXT:    mov pc, lr
                          <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) {
   %cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4
   %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
@@ -389,22 +389,22 @@ define <8 x i8> @vuzp_trunc_and_shuffle(<8 x i8> %tr0, <8 x i8> %tr1,
 define <8 x i8> @vuzp_trunc_and_shuffle_undef_right(<8 x i8> %tr0, <8 x i8> %tr1,
 ; CHECK-LABEL: vuzp_trunc_and_shuffle_undef_right:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:	mov	r12, sp
-; CHECK-NEXT:	vld1.64	{d16, d17}, [r12]
-; CHECK-NEXT:	add	r12, sp, #16
-; CHECK-NEXT:	vld1.64	{d18, d19}, [r12]
-; CHECK-NEXT:	vcgt.u32	q8, q9, q8
-; CHECK-NEXT:	vmov.i8	d18, #0x7
-; CHECK-NEXT:	vmovn.i32	d16, q8
-; CHECK-NEXT:	vuzp.8	d16, d17
-; CHECK-NEXT:	vneg.s8	d17, d18
-; CHECK-NEXT:	vshl.i8	d16, d16, #7
-; CHECK-NEXT:	vmov	d18, r2, r3
-; CHECK-NEXT:	vshl.s8	d16, d16, d17
-; CHECK-NEXT:	vmov	d17, r0, r1
-; CHECK-NEXT:	vbsl	d16, d17, d18
-; CHECK-NEXT:	vmov	r0, r1, d16
-; CHECK-NEXT:	mov	pc, lr
+; CHECK-NEXT:    mov r12, sp
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
+; CHECK-NEXT:    add r12, sp, #16
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
+; CHECK-NEXT:    vcgt.u32 q8, q9, q8
+; CHECK-NEXT:    vmov.i8 d18, #0x7
+; CHECK-NEXT:    vmovn.i32 d16, q8
+; CHECK-NEXT:    vuzp.8 d16, d17
+; CHECK-NEXT:    vneg.s8 d17, d18
+; CHECK-NEXT:    vshl.i8 d16, d16, #7
+; CHECK-NEXT:    vmov d18, r2, r3
+; CHECK-NEXT:    vshl.s8 d16, d16, d17
+; CHECK-NEXT:    vmov d17, r0, r1
+; CHECK-NEXT:    vbsl d16, d17, d18
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
                          <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) {
   %cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4
   %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
@@ -417,23 +417,23 @@ define <8 x i8> @vuzp_trunc_and_shuffle_undef_right(<8 x i8> %tr0, <8 x i8> %tr1
 define <8 x i8> @vuzp_trunc_and_shuffle_undef_left(<8 x i8> %tr0, <8 x i8> %tr1,
 ; CHECK-LABEL: vuzp_trunc_and_shuffle_undef_left:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:	mov	r12, sp
-; CHECK-NEXT:	vld1.64	{d16, d17}, [r12]
-; CHECK-NEXT:	add	r12, sp, #16
-; CHECK-NEXT:	vld1.64	{d18, d19}, [r12]
-; CHECK-NEXT:	vcgt.u32	q8, q9, q8
-; CHECK-NEXT:	vldr	d18, .LCPI22_0
-; CHECK-NEXT:	vmov.i8	d19, #0x7
-; CHECK-NEXT:	vmovn.i32	d16, q8
-; CHECK-NEXT:	vtbl.8	d16, {d16}, d18
-; CHECK-NEXT:	vneg.s8	d17, d19
-; CHECK-NEXT:	vmov	d18, r2, r3
-; CHECK-NEXT:	vshl.i8	d16, d16, #7
-; CHECK-NEXT:	vshl.s8	d16, d16, d17
-; CHECK-NEXT:	vmov	d17, r0, r1
-; CHECK-NEXT:	vbsl	d16, d17, d18
-; CHECK-NEXT:	vmov	r0, r1, d16
-; CHECK-NEXT:	mov	pc, lr
+; CHECK-NEXT:    mov r12, sp
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
+; CHECK-NEXT:    add r12, sp, #16
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
+; CHECK-NEXT:    vcgt.u32 q8, q9, q8
+; CHECK-NEXT:    vldr d18, .LCPI22_0
+; CHECK-NEXT:    vmov.i8 d19, #0x7
+; CHECK-NEXT:    vmovn.i32 d16, q8
+; CHECK-NEXT:    vtbl.8 d16, {d16}, d18
+; CHECK-NEXT:    vneg.s8 d17, d19
+; CHECK-NEXT:    vmov d18, r2, r3
+; CHECK-NEXT:    vshl.i8 d16, d16, #7
+; CHECK-NEXT:    vshl.s8 d16, d16, d17
+; CHECK-NEXT:    vmov d17, r0, r1
+; CHECK-NEXT:    vbsl d16, d17, d18
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 ; CHECK-NEXT:    .p2align 3
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI22_0:
@@ -459,55 +459,55 @@ define <8 x i8> @vuzp_trunc_and_shuffle_undef_left(<8 x i8> %tr0, <8 x i8> %tr1,
 define <10 x i8> @vuzp_wide_type(<10 x i8> %tr0, <10 x i8> %tr1,
 ; CHECK-LABEL: vuzp_wide_type:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:	.save	{r4, lr}
-; CHECK-NEXT:	push	{r4, lr}
-; CHECK-NEXT:	add	r12, sp, #32
-; CHECK-NEXT:	add	lr, sp, #48
-; CHECK-NEXT:	vld1.32	{d17[0]}, [r12:32]
-; CHECK-NEXT:	add	r12, sp, #24
-; CHECK-NEXT:	vld1.32	{d16[0]}, [r12:32]
-; CHECK-NEXT:	add	r12, sp, #56
-; CHECK-NEXT:	vld1.32	{d19[0]}, [r12:32]
-; CHECK-NEXT:	ldr	r12, [sp, #68]
-; CHECK-NEXT:	vld1.32	{d18[0]}, [lr:32]
-; CHECK-NEXT:	add	lr, sp, #40
-; CHECK-NEXT:	vld1.32	{d20[0]}, [lr:32]
-; CHECK-NEXT:	ldr	r4, [r12]
-; CHECK-NEXT:	vmov.32	d23[0], r4
-; CHECK-NEXT:	add	r4, sp, #64
-; CHECK-NEXT:	vld1.32	{d24[0]}, [r4:32]
-; CHECK-NEXT:	add	r4, sp, #36
-; CHECK-NEXT:	vld1.32	{d17[1]}, [r4:32]
-; CHECK-NEXT:	add	r4, sp, #28
-; CHECK-NEXT:	vcgt.u32	q10, q12, q10
-; CHECK-NEXT:	vmov.u8	lr, d23[3]
-; CHECK-NEXT:	vld1.32	{d16[1]}, [r4:32]
-; CHECK-NEXT:	add	r4, sp, #60
-; CHECK-NEXT:	vld1.32	{d19[1]}, [r4:32]
-; CHECK-NEXT:	add	r4, sp, #52
-; CHECK-NEXT:	vld1.32	{d18[1]}, [r4:32]
-; CHECK-NEXT:	add	r4, r12, #4
-; CHECK-NEXT:	vcgt.u32	q8, q9, q8
-; CHECK-NEXT:	vmovn.i32	d19, q10
-; CHECK-NEXT:	vldr	d20, .LCPI23_0
-; CHECK-NEXT:	vmovn.i32	d18, q8
-; CHECK-NEXT:	vmovn.i16	d22, q9
-; CHECK-NEXT:	vmov.i8	q9, #0x7
-; CHECK-NEXT:	vmov.8	d17[0], lr
-; CHECK-NEXT:	vneg.s8	q9, q9
-; CHECK-NEXT:	vtbl.8	d16, {d22, d23}, d20
-; CHECK-NEXT:	vld1.8	{d17[1]}, [r4]
-; CHECK-NEXT:	add	r4, sp, #8
-; CHECK-NEXT:	vshl.i8	q8, q8, #7
-; CHECK-NEXT:	vld1.64	{d20, d21}, [r4]
-; CHECK-NEXT:	vshl.s8	q8, q8, q9
-; CHECK-NEXT:	vmov	d19, r2, r3
-; CHECK-NEXT:	vmov	d18, r0, r1
-; CHECK-NEXT:	vbsl	q8, q9, q10
-; CHECK-NEXT:	vmov	r0, r1, d16
-; CHECK-NEXT:	vmov	r2, r3, d17
-; CHECK-NEXT:	pop	{r4, lr}
-; CHECK-NEXT:	mov	pc, lr
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    add r12, sp, #32
+; CHECK-NEXT:    add lr, sp, #48
+; CHECK-NEXT:    vld1.32 {d17[0]}, [r12:32]
+; CHECK-NEXT:    add r12, sp, #24
+; CHECK-NEXT:    vld1.32 {d16[0]}, [r12:32]
+; CHECK-NEXT:    add r12, sp, #56
+; CHECK-NEXT:    vld1.32 {d19[0]}, [r12:32]
+; CHECK-NEXT:    vld1.32 {d18[0]}, [lr:32]
+; CHECK-NEXT:    add lr, sp, #40
+; CHECK-NEXT:    vld1.32 {d20[0]}, [lr:32]
+; CHECK-NEXT:    ldr r12, [sp, #68]
+; CHECK-NEXT:    ldr r4, [r12]
+; CHECK-NEXT:    vmov.32 d23[0], r4
+; CHECK-NEXT:    add r4, sp, #64
+; CHECK-NEXT:    vld1.32 {d24[0]}, [r4:32]
+; CHECK-NEXT:    add r4, sp, #36
+; CHECK-NEXT:    vcgt.u32 q10, q12, q10
+; CHECK-NEXT:    vld1.32 {d17[1]}, [r4:32]
+; CHECK-NEXT:    add r4, sp, #28
+; CHECK-NEXT:    vld1.32 {d16[1]}, [r4:32]
+; CHECK-NEXT:    add r4, sp, #60
+; CHECK-NEXT:    vld1.32 {d19[1]}, [r4:32]
+; CHECK-NEXT:    add r4, sp, #52
+; CHECK-NEXT:    vld1.32 {d18[1]}, [r4:32]
+; CHECK-NEXT:    add r4, r12, #4
+; CHECK-NEXT:    vcgt.u32 q8, q9, q8
+; CHECK-NEXT:    vmovn.i32 d19, q10
+; CHECK-NEXT:    vmov.u8 lr, d23[3]
+; CHECK-NEXT:    vldr d20, .LCPI23_0
+; CHECK-NEXT:    vmovn.i32 d18, q8
+; CHECK-NEXT:    vmovn.i16 d22, q9
+; CHECK-NEXT:    vmov.i8 q9, #0x7
+; CHECK-NEXT:    vneg.s8 q9, q9
+; CHECK-NEXT:    vmov.8 d17[0], lr
+; CHECK-NEXT:    vtbl.8 d16, {d22, d23}, d20
+; CHECK-NEXT:    vld1.8 {d17[1]}, [r4]
+; CHECK-NEXT:    add r4, sp, #8
+; CHECK-NEXT:    vshl.i8 q8, q8, #7
+; CHECK-NEXT:    vld1.64 {d20, d21}, [r4]
+; CHECK-NEXT:    vshl.s8 q8, q8, q9
+; CHECK-NEXT:    vmov d19, r2, r3
+; CHECK-NEXT:    vmov d18, r0, r1
+; CHECK-NEXT:    vbsl q8, q9, q10
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    pop {r4, lr}
+; CHECK-NEXT:    mov pc, lr
 ; CHECK-NEXT:    .p2align 3
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI23_0:
diff --git a/test/CodeGen/SystemZ/misched-readadvances.mir b/test/CodeGen/SystemZ/misched-readadvances.mir
new file mode 100644
index 00000000000..df8ca2f5f95
--- /dev/null
+++ b/test/CodeGen/SystemZ/misched-readadvances.mir
@@ -0,0 +1,31 @@
+# Check that the extra operand for the full register added by RegAlloc does
+# not have a latency that interferes with the latency adjustment
+# (ReadAdvance) for the MSY register operand.
+
+# RUN: llc %s -mtriple=s390x-linux-gnu -mcpu=z13 -start-before=machine-scheduler \
+# RUN:  -debug-only=machine-scheduler -o - 2>&1 | FileCheck %s
+# REQUIRES: asserts
+
+# CHECK: ScheduleDAGMI::schedule starting
+# CHECK: SU(4): renamable $r2l = MSR renamable $r2l(tied-def 0), renamable $r2l
+# CHECK:   Latency : 6
+# CHECK: SU(5): renamable $r2l = MSY renamable $r2l(tied-def 0), renamable $r1d, -4, $noreg, implicit $r2d
+# CHECK:   Predecessors:
+# CHECK:     SU(4): Data Latency=2 Reg=$r2l
+# CHECK:     SU(4): Data Latency=0 Reg=$r2d
+
+---
+name:            Perl_do_sv_dump
+alignment:       4
+tracksRegLiveness: true
+body:             |
+    bb.0 :
+    %1:addr64bit = IMPLICIT_DEF
+    %2:addr64bit = IMPLICIT_DEF
+    %3:vr64bit = IMPLICIT_DEF
+
+    bb.1 :
+    %2:addr64bit = ALGFI %2, 4294967291, implicit-def dead $cc
+    %2.subreg_l32:addr64bit = MSR %2.subreg_l32, %2.subreg_l32
+    %2.subreg_l32:addr64bit = MSY %2.subreg_l32, %1, -4, $noreg
+...
diff --git a/test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll b/test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll
index d639b7acbbc..5300bed0de8 100644
--- a/test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll
+++ b/test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll
@@ -88,15 +88,15 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; THUMBV7-NEXT:    orrs r3, r2
 ; THUMBV7-NEXT:    ldr r2, [sp, #80]
 ; THUMBV7-NEXT:    orr.w r1, r1, r4
+; THUMBV7-NEXT:    orr.w r1, r1, r10
 ; THUMBV7-NEXT:    it ne
 ; THUMBV7-NEXT:    movne r3, #1
-; THUMBV7-NEXT:    orr.w r1, r1, r10
 ; THUMBV7-NEXT:    orrs.w r7, r2, r11
 ; THUMBV7-NEXT:    orr.w r1, r1, r9
 ; THUMBV7-NEXT:    it ne
 ; THUMBV7-NEXT:    movne r7, #1
-; THUMBV7-NEXT:    orr.w r0, r0, r12
 ; THUMBV7-NEXT:    ands r3, r7
+; THUMBV7-NEXT:    orr.w r0, r0, r12
 ; THUMBV7-NEXT:    orrs r1, r3
 ; THUMBV7-NEXT:    orrs r0, r1
 ; THUMBV7-NEXT:    orr.w r0, r0, r8
diff --git a/test/CodeGen/Thumb2/umulo-64-legalisation-lowering.ll b/test/CodeGen/Thumb2/umulo-64-legalisation-lowering.ll
index e47e88a6832..161adf7e7d7 100644
--- a/test/CodeGen/Thumb2/umulo-64-legalisation-lowering.ll
+++ b/test/CodeGen/Thumb2/umulo-64-legalisation-lowering.ll
@@ -20,11 +20,11 @@ define { i64, i8 } @mulodi_test(i64 %l, i64 %r) unnamed_addr #0 {
 ; THUMBV7-NEXT:    it ne
 ; THUMBV7-NEXT:    movne r1, #1
 ; THUMBV7-NEXT:    cmp r5, #0
+; THUMBV7-NEXT:    and.w r1, r1, r3
 ; THUMBV7-NEXT:    it ne
 ; THUMBV7-NEXT:    movne r5, #1
-; THUMBV7-NEXT:    ands r1, r3
+; THUMBV7-NEXT:    orrs r1, r5
 ; THUMBV7-NEXT:    cmp.w lr, #0
-; THUMBV7-NEXT:    orr.w r1, r1, r5
 ; THUMBV7-NEXT:    it ne
 ; THUMBV7-NEXT:    movne.w lr, #1
 ; THUMBV7-NEXT:    orr.w r1, r1, lr
diff --git a/test/CodeGen/X86/lsr-loop-exit-cond.ll b/test/CodeGen/X86/lsr-loop-exit-cond.ll
index 0b5ce8a4ffb..7a266235109 100644
--- a/test/CodeGen/X86/lsr-loop-exit-cond.ll
+++ b/test/CodeGen/X86/lsr-loop-exit-cond.ll
@@ -97,8 +97,8 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r
 ; ATOM-NEXT:    pushq %r14
 ; ATOM-NEXT:    pushq %rbx
 ; ATOM-NEXT:    ## kill: def $ecx killed $ecx def $rcx
-; ATOM-NEXT:    movl 4(%rdx), %eax
 ; ATOM-NEXT:    movl (%rdx), %r15d
+; ATOM-NEXT:    movl 4(%rdx), %eax
 ; ATOM-NEXT:    leaq 20(%rdx), %r14
 ; ATOM-NEXT:    movq _Te0@{{.*}}(%rip), %r9
 ; ATOM-NEXT:    movq _Te1@{{.*}}(%rip), %r8
@@ -116,8 +116,8 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r
 ; ATOM-NEXT:    movzbl %bl, %eax
 ; ATOM-NEXT:    movl (%r10,%rax,4), %eax
 ; ATOM-NEXT:    xorl (%r8,%rbp,4), %r15d
-; ATOM-NEXT:    xorl -4(%r14), %r15d
 ; ATOM-NEXT:    xorl (%r9,%rdi,4), %eax
+; ATOM-NEXT:    xorl -4(%r14), %r15d
 ; ATOM-NEXT:    xorl (%r14), %eax
 ; ATOM-NEXT:    addq $16, %r14
 ; ATOM-NEXT:  LBB0_1: ## %bb
@@ -130,14 +130,14 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r
 ; ATOM-NEXT:    movzbl %dil, %edi
 ; ATOM-NEXT:    movl (%r8,%rdi,4), %ebx
 ; ATOM-NEXT:    movzbl %r15b, %edi
-; ATOM-NEXT:    movl (%r10,%rdi,4), %edi
 ; ATOM-NEXT:    xorl (%r9,%rbp,4), %ebx
+; ATOM-NEXT:    movl (%r10,%rdi,4), %edi
 ; ATOM-NEXT:    xorl -12(%r14), %ebx
 ; ATOM-NEXT:    xorl (%r9,%rax,4), %edi
 ; ATOM-NEXT:    movl %ebx, %eax
+; ATOM-NEXT:    xorl -8(%r14), %edi
 ; ATOM-NEXT:    shrl $24, %eax
 ; ATOM-NEXT:    movl (%r9,%rax,4), %r15d
-; ATOM-NEXT:    xorl -8(%r14), %edi
 ; ATOM-NEXT:    testq %r11, %r11
 ; ATOM-NEXT:    movl %edi, %eax
 ; ATOM-NEXT:    jne LBB0_2
diff --git a/test/CodeGen/X86/memset.ll b/test/CodeGen/X86/memset.ll
index 02fd8806254..02dfb34e100 100644
--- a/test/CodeGen/X86/memset.ll
+++ b/test/CodeGen/X86/memset.ll
@@ -41,8 +41,8 @@ define void @t() nounwind  {
 ; YMM-NEXT:    movl %esp, %ebp
 ; YMM-NEXT:    andl $-32, %esp
 ; YMM-NEXT:    subl $96, %esp
-; YMM-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; YMM-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; YMM-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; YMM-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%esp)
 ; YMM-NEXT:    movl %eax, (%esp)
 ; YMM-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/phys-reg-local-regalloc.ll b/test/CodeGen/X86/phys-reg-local-regalloc.ll
index a5453b9e1f8..2a129bc643b 100644
--- a/test/CodeGen/X86/phys-reg-local-regalloc.ll
+++ b/test/CodeGen/X86/phys-reg-local-regalloc.ll
@@ -20,9 +20,9 @@ entry:
 ; On Intel Atom the scheduler moves a movl instruction
 ; used for the printf call to follow movl 24(%esp), %eax
 ; ATOM: movl 24(%esp), %eax
-; ATOM: movl
-; ATOM: movl   %eax, 36(%esp)
 ; ATOM-NOT: movl
+; ATOM: movl   %eax, 36(%esp)
+; ATOM: movl
 ; ATOM: movl 28(%esp), %ebx
 ; ATOM-NOT: movl
 ; ATOM: movl   %ebx, 40(%esp)
diff --git a/test/CodeGen/X86/schedule-x86-64-shld.ll b/test/CodeGen/X86/schedule-x86-64-shld.ll
index 315a497bc3c..0e66329f7b4 100644
--- a/test/CodeGen/X86/schedule-x86-64-shld.ll
+++ b/test/CodeGen/X86/schedule-x86-64-shld.ll
@@ -135,16 +135,16 @@ define i64 @lshift_cl_optsize(i64 %a, i64 %b, i64 %c) nounwind readnone optsize
 ;
 ; BDVER12-LABEL: lshift_cl_optsize:
 ; BDVER12:       # %bb.0: # %entry
-; BDVER12-NEXT:    movq %rdx, %rcx # sched: [1:0.50]
 ; BDVER12-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; BDVER12-NEXT:    movq %rdx, %rcx # sched: [1:0.50]
 ; BDVER12-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; BDVER12-NEXT:    shldq %cl, %rsi, %rax # sched: [4:4.00]
 ; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: lshift_cl_optsize:
 ; BTVER2:       # %bb.0: # %entry
-; BTVER2-NEXT:    movq %rdx, %rcx # sched: [1:0.50]
 ; BTVER2-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; BTVER2-NEXT:    movq %rdx, %rcx # sched: [1:0.50]
 ; BTVER2-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; BTVER2-NEXT:    shldq %cl, %rsi, %rax # sched: [4:4.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
@@ -211,16 +211,16 @@ define i64 @rshift_cl_optsize(i64 %a, i64 %b, i64 %c) nounwind readnone optsize
 ;
 ; BDVER12-LABEL: rshift_cl_optsize:
 ; BDVER12:       # %bb.0: # %entry
-; BDVER12-NEXT:    movq %rdx, %rcx # sched: [1:0.50]
 ; BDVER12-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; BDVER12-NEXT:    movq %rdx, %rcx # sched: [1:0.50]
 ; BDVER12-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; BDVER12-NEXT:    shrdq %cl, %rsi, %rax # sched: [4:4.00]
 ; BDVER12-NEXT:    retq # sched: [5:1.00]
 ;
 ; BTVER2-LABEL: rshift_cl_optsize:
 ; BTVER2:       # %bb.0: # %entry
-; BTVER2-NEXT:    movq %rdx, %rcx # sched: [1:0.50]
 ; BTVER2-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; BTVER2-NEXT:    movq %rdx, %rcx # sched: [1:0.50]
 ; BTVER2-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; BTVER2-NEXT:    shrdq %cl, %rsi, %rax # sched: [4:4.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
diff --git a/test/CodeGen/X86/schedule-x86_32.ll b/test/CodeGen/X86/schedule-x86_32.ll
index 757a022839b..6b8ad906fec 100644
--- a/test/CodeGen/X86/schedule-x86_32.ll
+++ b/test/CodeGen/X86/schedule-x86_32.ll
@@ -495,8 +495,8 @@ define void @test_arpl(i16 %a0, i16 *%a1) optsize {
 ;
 ; ZNVER1-LABEL: test_arpl:
 ; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
 ; ZNVER1-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
 ; ZNVER1-NEXT:    #APP
 ; ZNVER1-NEXT:    arpl %ax, (%ecx) # sched: [100:0.25]
 ; ZNVER1-NEXT:    #NO_APP
@@ -681,10 +681,10 @@ define void @test_bound(i16 %a0, i16 *%a1, i32 %a2, i32 *%a3) optsize {
 ; ZNVER1-NEXT:    pushl %esi # sched: [1:0.50]
 ; ZNVER1-NEXT:    .cfi_def_cfa_offset 8
 ; ZNVER1-NEXT:    .cfi_offset %esi, -8
+; ZNVER1-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
 ; ZNVER1-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
 ; ZNVER1-NEXT:    movl {{[0-9]+}}(%esp), %edx # sched: [8:0.50]
 ; ZNVER1-NEXT:    movl {{[0-9]+}}(%esp), %esi # sched: [8:0.50]
-; ZNVER1-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
 ; ZNVER1-NEXT:    #APP
 ; ZNVER1-NEXT:    bound %ax, (%esi) # sched: [100:0.25]
 ; ZNVER1-NEXT:    bound %ecx, (%edx) # sched: [100:0.25]
@@ -985,8 +985,8 @@ define void @test_dec16(i16 %a0, i16* %a1) optsize {
 ;
 ; ZNVER1-LABEL: test_dec16:
 ; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
 ; ZNVER1-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
 ; ZNVER1-NEXT:    #APP
 ; ZNVER1-NEXT:    decw %ax # sched: [1:0.25]
 ; ZNVER1-NEXT:    decw (%ecx) # sched: [5:0.50]
@@ -1212,8 +1212,8 @@ define void @test_inc16(i16 %a0, i16* %a1) optsize {
 ;
 ; ZNVER1-LABEL: test_inc16:
 ; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
 ; ZNVER1-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
 ; ZNVER1-NEXT:    #APP
 ; ZNVER1-NEXT:    incw %ax # sched: [1:0.25]
 ; ZNVER1-NEXT:    incw (%ecx) # sched: [5:0.50]
@@ -1949,8 +1949,8 @@ define i16 @test_pop_push_16(i16 %a0, i16 *%a1) optsize {
 ;
 ; ZNVER1-LABEL: test_pop_push_16:
 ; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
 ; ZNVER1-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
 ; ZNVER1-NEXT:    #APP
 ; ZNVER1-NEXT:    popw %ax # sched: [8:0.50]
 ; ZNVER1-NEXT:    popw (%ecx) # sched: [5:0.50]
-- 
GitLab


From 36d678978c93e8f831c6bd03537e5dcdb6267e48 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 30 Oct 2018 15:26:39 +0000
Subject: [PATCH 0748/1116] [InstCombine] try to turn shuffle into
 insertelement

shuffle (insert ?, Scalar, IndexC), V1, Mask --> insert V1, Scalar, IndexC'

The motivating case is at least a couple of steps away: I noticed that
SLPVectorizer does not analyze shuffles as well as sequences of
insert/extract in PR34724:
https://bugs.llvm.org/show_bug.cgi?id=34724
...so SLP may fail to vectorize when source code has shuffles to start
with or instcombine has converted insert/extract to shuffles.

Independent of that, an insertelement is always a simpler op for IR
analysis vs. a shuffle, so we should transform to insert when possible.

I don't think there's any codegen concern here - if a target can't insert
a scalar directly to some fixed element in a vector (x86?), then this
should get expanded to the insert+shuffle that we started with.

Differential Revision: https://reviews.llvm.org/D53507


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345607 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstCombine/InstCombineVectorOps.cpp      | 70 +++++++++++++++++++
 .../InstCombine/insert-extract-shuffle.ll     | 39 +++++------
 2 files changed, 89 insertions(+), 20 deletions(-)

diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index 75f77779ab7..21dd7ed227a 100644
--- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -1531,6 +1531,71 @@ static Instruction *foldIdentityExtractShuffle(ShuffleVectorInst &Shuf) {
   return new ShuffleVectorInst(X, Y, ConstantVector::get(NewMask));
 }
 
+/// Try to replace a shuffle with an insertelement.
+static Instruction *foldShuffleWithInsert(ShuffleVectorInst &Shuf) {
+  Value *V0 = Shuf.getOperand(0), *V1 = Shuf.getOperand(1);
+  SmallVector<int, 16> Mask = Shuf.getShuffleMask();
+
+  // The shuffle must not change vector sizes.
+  // TODO: This restriction could be removed if the insert has only one use
+  //       (because the transform would require a new length-changing shuffle).
+  int NumElts = Mask.size();
+  if (NumElts != (int)(V0->getType()->getVectorNumElements()))
+    return nullptr;
+
+  // shuffle (insert ?, Scalar, IndexC), V1, Mask --> insert V1, Scalar, IndexC'
+  auto isShufflingScalarIntoOp1 = [&](Value *&Scalar, ConstantInt *&IndexC) {
+    // We need an insertelement with a constant index.
+    if (!match(V0, m_InsertElement(m_Value(), m_Value(Scalar),
+                                   m_ConstantInt(IndexC))))
+      return false;
+
+    // Test the shuffle mask to see if it splices the inserted scalar into the
+    // operand 1 vector of the shuffle.
+    int NewInsIndex = -1;
+    for (int i = 0; i != NumElts; ++i) {
+      // Ignore undef mask elements.
+      if (Mask[i] == -1)
+        continue;
+
+      // The shuffle takes elements of operand 1 without lane changes.
+      if (Mask[i] == NumElts + i)
+        continue;
+
+      // The shuffle must choose the inserted scalar exactly once.
+      if (NewInsIndex != -1 || Mask[i] != IndexC->getSExtValue())
+        return false;
+
+      // The shuffle is placing the inserted scalar into element i.
+      NewInsIndex = i;
+    }
+
+    assert(NewInsIndex != -1 && "Did not fold shuffle with unused operand?");
+
+    // Index is updated to the potentially translated insertion lane.
+    IndexC = ConstantInt::get(IndexC->getType(), NewInsIndex);
+    return true;
+  };
+
+  // If the shuffle is unnecessary, insert the scalar operand directly into
+  // operand 1 of the shuffle. Example:
+  // shuffle (insert ?, S, 1), V1, <1, 5, 6, 7> --> insert V1, S, 0
+  Value *Scalar;
+  ConstantInt *IndexC;
+  if (isShufflingScalarIntoOp1(Scalar, IndexC))
+    return InsertElementInst::Create(V1, Scalar, IndexC);
+
+  // Try again after commuting shuffle. Example:
+  // shuffle V0, (insert ?, S, 0), <0, 1, 2, 4> -->
+  // shuffle (insert ?, S, 0), V0, <4, 5, 6, 0> --> insert V0, S, 3
+  std::swap(V0, V1);
+  ShuffleVectorInst::commuteShuffleMask(Mask, NumElts);
+  if (isShufflingScalarIntoOp1(Scalar, IndexC))
+    return InsertElementInst::Create(V1, Scalar, IndexC);
+
+  return nullptr;
+}
+
 Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
   Value *LHS = SVI.getOperand(0);
   Value *RHS = SVI.getOperand(1);
@@ -1556,6 +1621,11 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
   if (Instruction *I = foldIdentityExtractShuffle(SVI))
     return I;
 
+  // This transform has the potential to lose undef knowledge, so it is
+  // intentionally placed after SimplifyDemandedVectorElts().
+  if (Instruction *I = foldShuffleWithInsert(SVI))
+    return I;
+
   SmallVector<int, 16> Mask = SVI.getShuffleMask();
   Type *Int32Ty = Type::getInt32Ty(SVI.getContext());
   unsigned LHSWidth = LHS->getType()->getVectorNumElements();
diff --git a/test/Transforms/InstCombine/insert-extract-shuffle.ll b/test/Transforms/InstCombine/insert-extract-shuffle.ll
index c76d88a8e9e..2de9c66d463 100644
--- a/test/Transforms/InstCombine/insert-extract-shuffle.ll
+++ b/test/Transforms/InstCombine/insert-extract-shuffle.ll
@@ -303,12 +303,11 @@ define <4 x float> @collectShuffleElts(<2 x float> %x, float %y) {
   ret <4 x float> %v3
 }
 
-; TODO: Simplest case - insert scalar into undef, then shuffle that value in place into another vector.
+; Simplest case - insert scalar into undef, then shuffle that value in place into another vector.
 
 define <4 x float> @insert_shuffle(float %x, <4 x float> %y) {
 ; CHECK-LABEL: @insert_shuffle(
-; CHECK-NEXT:    [[XV:%.*]] = insertelement <4 x float> undef, float [[X:%.*]], i32 0
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[XV]], <4 x float> [[Y:%.*]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[X:%.*]], i32 0
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %xv = insertelement <4 x float> undef, float %x, i32 0
@@ -316,12 +315,11 @@ define <4 x float> @insert_shuffle(float %x, <4 x float> %y) {
   ret <4 x float> %r
 }
 
-; TODO: Insert scalar into some element of a dummy vector, then move it to a different element in another vector.
+; Insert scalar into some element of a dummy vector, then move it to a different element in another vector.
 
 define <4 x float> @insert_shuffle_translate(float %x, <4 x float> %y) {
 ; CHECK-LABEL: @insert_shuffle_translate(
-; CHECK-NEXT:    [[XV:%.*]] = insertelement <4 x float> undef, float [[X:%.*]], i32 0
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[XV]], <4 x float> [[Y:%.*]], <4 x i32> <i32 4, i32 0, i32 6, i32 7>
+; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[X:%.*]], i32 1
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %xv = insertelement <4 x float> undef, float %x, i32 0
@@ -329,12 +327,11 @@ define <4 x float> @insert_shuffle_translate(float %x, <4 x float> %y) {
   ret <4 x float> %r
 }
 
-; TODO: The vector operand of the insert is irrelevant.
+; The vector operand of the insert is irrelevant.
 
 define <4 x float> @insert_not_undef_shuffle_translate(float %x, <4 x float> %y, <4 x float> %q) {
 ; CHECK-LABEL: @insert_not_undef_shuffle_translate(
-; CHECK-NEXT:    [[XV:%.*]] = insertelement <4 x float> undef, float [[X:%.*]], i32 3
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[XV]], <4 x float> [[Y:%.*]], <4 x i32> <i32 4, i32 5, i32 3, i32 7>
+; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[X:%.*]], i32 2
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %xv = insertelement <4 x float> %q, float %x, i32 3
@@ -342,12 +339,11 @@ define <4 x float> @insert_not_undef_shuffle_translate(float %x, <4 x float> %y,
   ret <4 x float> %r
 }
 
-; TODO: The insert may be the 2nd operand of the shuffle. The shuffle mask can include undef elements.
+; The insert may be the 2nd operand of the shuffle. The shuffle mask can include undef elements.
 
 define <4 x float> @insert_not_undef_shuffle_translate_commute(float %x, <4 x float> %y, <4 x float> %q) {
 ; CHECK-LABEL: @insert_not_undef_shuffle_translate_commute(
-; CHECK-NEXT:    [[XV:%.*]] = insertelement <4 x float> undef, float [[X:%.*]], i32 2
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[XV]], <4 x i32> <i32 0, i32 6, i32 2, i32 undef>
+; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[X:%.*]], i32 1
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %xv = insertelement <4 x float> %q, float %x, i32 2
@@ -355,13 +351,12 @@ define <4 x float> @insert_not_undef_shuffle_translate_commute(float %x, <4 x fl
   ret <4 x float> %r
 }
 
-; TODO: Both shuffle operands may be inserts - choose the correct side.
+; Both shuffle operands may be inserts - choose the correct side.
 
 define <4 x float> @insert_insert_shuffle_translate(float %x1, float %x2, <4 x float> %q) {
 ; CHECK-LABEL: @insert_insert_shuffle_translate(
-; CHECK-NEXT:    [[XV1:%.*]] = insertelement <4 x float> undef, float [[X1:%.*]], i32 0
 ; CHECK-NEXT:    [[XV2:%.*]] = insertelement <4 x float> [[Q:%.*]], float [[X2:%.*]], i32 2
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[XV1]], <4 x float> [[XV2]], <4 x i32> <i32 4, i32 0, i32 6, i32 7>
+; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> [[XV2]], float [[X1:%.*]], i32 1
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %xv1 = insertelement <4 x float> %q, float %x1, i32 0
@@ -370,13 +365,12 @@ define <4 x float> @insert_insert_shuffle_translate(float %x1, float %x2, <4 x f
   ret <4 x float> %r
 }
 
-; TODO: Both shuffle operands may be inserts - choose the correct side.
+; Both shuffle operands may be inserts - choose the correct side.
 
 define <4 x float> @insert_insert_shuffle_translate_commute(float %x1, float %x2, <4 x float> %q) {
 ; CHECK-LABEL: @insert_insert_shuffle_translate_commute(
 ; CHECK-NEXT:    [[XV1:%.*]] = insertelement <4 x float> [[Q:%.*]], float [[X1:%.*]], i32 0
-; CHECK-NEXT:    [[XV2:%.*]] = insertelement <4 x float> undef, float [[X2:%.*]], i32 2
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[XV1]], <4 x float> [[XV2]], <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> [[XV1]], float [[X2:%.*]], i32 1
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %xv1 = insertelement <4 x float> %q, float %x1, i32 0
@@ -385,6 +379,9 @@ define <4 x float> @insert_insert_shuffle_translate_commute(float %x1, float %x2
   ret <4 x float> %r
 }
 
+; Negative test - this only works if the shuffle is choosing exactly 1 element from 1 of the inputs.
+; TODO: But this could be a special-case because we're inserting into the same base vector.
+
 define <4 x float> @insert_insert_shuffle_translate_wrong_mask(float %x1, float %x2, <4 x float> %q) {
 ; CHECK-LABEL: @insert_insert_shuffle_translate_wrong_mask(
 ; CHECK-NEXT:    [[XV1:%.*]] = insertelement <4 x float> [[Q:%.*]], float [[X1:%.*]], i32 0
@@ -398,7 +395,7 @@ define <4 x float> @insert_insert_shuffle_translate_wrong_mask(float %x1, float
   ret <4 x float> %r
 }
 
-; TODO: The insert may have other uses.
+; The insert may have other uses.
 
 declare void @use(<4 x float>)
 
@@ -406,7 +403,7 @@ define <4 x float> @insert_not_undef_shuffle_translate_commute_uses(float %x, <4
 ; CHECK-LABEL: @insert_not_undef_shuffle_translate_commute_uses(
 ; CHECK-NEXT:    [[XV:%.*]] = insertelement <4 x float> [[Q:%.*]], float [[X:%.*]], i32 2
 ; CHECK-NEXT:    call void @use(<4 x float> [[XV]])
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[XV]], <4 x i32> <i32 6, i32 undef, i32 2, i32 3>
+; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[X]], i32 0
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %xv = insertelement <4 x float> %q, float %x, i32 2
@@ -415,6 +412,8 @@ define <4 x float> @insert_not_undef_shuffle_translate_commute_uses(float %x, <4
   ret <4 x float> %r
 }
 
+; Negative test - size-changing shuffle.
+
 define <5 x float> @insert_not_undef_shuffle_translate_commute_lengthen(float %x, <4 x float> %y, <4 x float> %q) {
 ; CHECK-LABEL: @insert_not_undef_shuffle_translate_commute_lengthen(
 ; CHECK-NEXT:    [[XV:%.*]] = insertelement <4 x float> undef, float [[X:%.*]], i32 2
-- 
GitLab


From 467c30721bbf0d3838a757c53455837d0f8851dc Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Tue, 30 Oct 2018 15:56:08 +0000
Subject: [PATCH 0749/1116] [llvm-mca] Move namespace mca inside llvm::

Summary: This allows to remove `using namespace llvm;` in those *.cpp files

When we want to revisit the decision (everything resides in llvm::mca::*) in the future, we can move things to a nested namespace of llvm::mca::, to conceptually make them separate from the rest of llvm::mca::*

Reviewers: andreadb, mattd

Reviewed By: andreadb

Subscribers: javed.absar, tschuett, gbedwell, llvm-commits

Differential Revision: https://reviews.llvm.org/D53407

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345612 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-mca/CodeRegion.cpp                            | 2 ++
 tools/llvm-mca/CodeRegion.h                              | 2 ++
 tools/llvm-mca/PipelinePrinter.cpp                       | 2 ++
 tools/llvm-mca/PipelinePrinter.h                         | 2 ++
 tools/llvm-mca/Views/DispatchStatistics.cpp              | 4 ++--
 tools/llvm-mca/Views/DispatchStatistics.h                | 2 ++
 tools/llvm-mca/Views/InstructionInfoView.cpp             | 4 ++--
 tools/llvm-mca/Views/InstructionInfoView.h               | 2 ++
 tools/llvm-mca/Views/RegisterFileStatistics.cpp          | 4 ++--
 tools/llvm-mca/Views/RegisterFileStatistics.h            | 2 ++
 tools/llvm-mca/Views/ResourcePressureView.cpp            | 4 ++--
 tools/llvm-mca/Views/ResourcePressureView.h              | 2 ++
 tools/llvm-mca/Views/RetireControlUnitStatistics.cpp     | 4 ++--
 tools/llvm-mca/Views/RetireControlUnitStatistics.h       | 2 ++
 tools/llvm-mca/Views/SchedulerStatistics.cpp             | 4 ++--
 tools/llvm-mca/Views/SchedulerStatistics.h               | 2 ++
 tools/llvm-mca/Views/SummaryView.cpp                     | 4 ++--
 tools/llvm-mca/Views/SummaryView.h                       | 2 ++
 tools/llvm-mca/Views/TimelineView.cpp                    | 4 ++--
 tools/llvm-mca/Views/TimelineView.h                      | 2 ++
 tools/llvm-mca/Views/View.cpp                            | 2 ++
 tools/llvm-mca/Views/View.h                              | 2 ++
 tools/llvm-mca/include/Context.h                         | 2 ++
 tools/llvm-mca/include/HWEventListener.h                 | 2 ++
 tools/llvm-mca/include/HardwareUnits/HardwareUnit.h      | 2 ++
 tools/llvm-mca/include/HardwareUnits/LSUnit.h            | 2 ++
 tools/llvm-mca/include/HardwareUnits/RegisterFile.h      | 2 ++
 tools/llvm-mca/include/HardwareUnits/ResourceManager.h   | 2 ++
 tools/llvm-mca/include/HardwareUnits/RetireControlUnit.h | 2 ++
 tools/llvm-mca/include/HardwareUnits/Scheduler.h         | 2 ++
 tools/llvm-mca/include/InstrBuilder.h                    | 2 ++
 tools/llvm-mca/include/Instruction.h                     | 2 ++
 tools/llvm-mca/include/Pipeline.h                        | 2 ++
 tools/llvm-mca/include/SourceMgr.h                       | 2 ++
 tools/llvm-mca/include/Stages/DispatchStage.h            | 2 ++
 tools/llvm-mca/include/Stages/ExecuteStage.h             | 2 ++
 tools/llvm-mca/include/Stages/FetchStage.h               | 2 ++
 tools/llvm-mca/include/Stages/InstructionTables.h        | 2 ++
 tools/llvm-mca/include/Stages/RetireStage.h              | 2 ++
 tools/llvm-mca/include/Stages/Stage.h                    | 2 ++
 tools/llvm-mca/include/Support.h                         | 2 ++
 tools/llvm-mca/lib/Context.cpp                           | 4 ++--
 tools/llvm-mca/lib/HWEventListener.cpp                   | 2 ++
 tools/llvm-mca/lib/HardwareUnits/HardwareUnit.cpp        | 2 ++
 tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp              | 4 ++--
 tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp        | 4 ++--
 tools/llvm-mca/lib/HardwareUnits/ResourceManager.cpp     | 4 ++--
 tools/llvm-mca/lib/HardwareUnits/RetireControlUnit.cpp   | 4 ++--
 tools/llvm-mca/lib/HardwareUnits/Scheduler.cpp           | 4 ++--
 tools/llvm-mca/lib/InstrBuilder.cpp                      | 4 ++--
 tools/llvm-mca/lib/Instruction.cpp                       | 4 ++--
 tools/llvm-mca/lib/Pipeline.cpp                          | 4 ++--
 tools/llvm-mca/lib/Stages/DispatchStage.cpp              | 4 ++--
 tools/llvm-mca/lib/Stages/ExecuteStage.cpp               | 4 ++--
 tools/llvm-mca/lib/Stages/FetchStage.cpp                 | 2 ++
 tools/llvm-mca/lib/Stages/InstructionTables.cpp          | 4 ++--
 tools/llvm-mca/lib/Stages/RetireStage.cpp                | 2 ++
 tools/llvm-mca/lib/Stages/Stage.cpp                      | 2 ++
 tools/llvm-mca/lib/Support.cpp                           | 4 ++--
 59 files changed, 118 insertions(+), 42 deletions(-)

diff --git a/tools/llvm-mca/CodeRegion.cpp b/tools/llvm-mca/CodeRegion.cpp
index 591c45feb6d..29a27c50c17 100644
--- a/tools/llvm-mca/CodeRegion.cpp
+++ b/tools/llvm-mca/CodeRegion.cpp
@@ -14,6 +14,7 @@
 
 #include "CodeRegion.h"
 
+namespace llvm {
 namespace mca {
 
 bool CodeRegion::isLocInRange(llvm::SMLoc Loc) const {
@@ -63,3 +64,4 @@ void CodeRegions::addInstruction(const llvm::MCInst &Instruction) {
 }
 
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/CodeRegion.h b/tools/llvm-mca/CodeRegion.h
index 21ca8da9b53..6ca2bd15128 100644
--- a/tools/llvm-mca/CodeRegion.h
+++ b/tools/llvm-mca/CodeRegion.h
@@ -41,6 +41,7 @@
 #include "llvm/Support/SourceMgr.h"
 #include <vector>
 
+namespace llvm {
 namespace mca {
 
 /// A region of assembly code.
@@ -123,5 +124,6 @@ public:
 };
 
 } // namespace mca
+} // namespace llvm
 
 #endif
diff --git a/tools/llvm-mca/PipelinePrinter.cpp b/tools/llvm-mca/PipelinePrinter.cpp
index 8b2157a8eb6..18ef45fc2a6 100644
--- a/tools/llvm-mca/PipelinePrinter.cpp
+++ b/tools/llvm-mca/PipelinePrinter.cpp
@@ -15,6 +15,7 @@
 #include "PipelinePrinter.h"
 #include "Views/View.h"
 
+namespace llvm {
 namespace mca {
 
 void PipelinePrinter::printReport(llvm::raw_ostream &OS) const {
@@ -22,3 +23,4 @@ void PipelinePrinter::printReport(llvm::raw_ostream &OS) const {
     V->printView(OS);
 }
 } // namespace mca.
+} // namespace llvm
diff --git a/tools/llvm-mca/PipelinePrinter.h b/tools/llvm-mca/PipelinePrinter.h
index a90b3a2af42..7e426383f21 100644
--- a/tools/llvm-mca/PipelinePrinter.h
+++ b/tools/llvm-mca/PipelinePrinter.h
@@ -24,6 +24,7 @@
 
 #define DEBUG_TYPE "llvm-mca"
 
+namespace llvm {
 namespace mca {
 
 /// A printer class that knows how to collects statistics on the
@@ -48,5 +49,6 @@ public:
   void printReport(llvm::raw_ostream &OS) const;
 };
 } // namespace mca
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_MCA_PIPELINEPRINTER_H
diff --git a/tools/llvm-mca/Views/DispatchStatistics.cpp b/tools/llvm-mca/Views/DispatchStatistics.cpp
index 98adcfb450d..2562c82407b 100644
--- a/tools/llvm-mca/Views/DispatchStatistics.cpp
+++ b/tools/llvm-mca/Views/DispatchStatistics.cpp
@@ -16,8 +16,7 @@
 #include "Views/DispatchStatistics.h"
 #include "llvm/Support/Format.h"
 
-using namespace llvm;
-
+namespace llvm {
 namespace mca {
 
 void DispatchStatistics::onEvent(const HWStallEvent &Event) {
@@ -84,3 +83,4 @@ void DispatchStatistics::printDispatchStalls(raw_ostream &OS) const {
 }
 
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/Views/DispatchStatistics.h b/tools/llvm-mca/Views/DispatchStatistics.h
index 0f6f75e0954..6679c81efe9 100644
--- a/tools/llvm-mca/Views/DispatchStatistics.h
+++ b/tools/llvm-mca/Views/DispatchStatistics.h
@@ -39,6 +39,7 @@
 #include "llvm/MC/MCSubtargetInfo.h"
 #include <map>
 
+namespace llvm {
 namespace mca {
 
 class DispatchStatistics : public View {
@@ -80,5 +81,6 @@ public:
   }
 };
 } // namespace mca
+} // namespace llvm
 
 #endif
diff --git a/tools/llvm-mca/Views/InstructionInfoView.cpp b/tools/llvm-mca/Views/InstructionInfoView.cpp
index 0a97e569c47..5016afb49e4 100644
--- a/tools/llvm-mca/Views/InstructionInfoView.cpp
+++ b/tools/llvm-mca/Views/InstructionInfoView.cpp
@@ -14,10 +14,9 @@
 
 #include "Views/InstructionInfoView.h"
 
+namespace llvm {
 namespace mca {
 
-using namespace llvm;
-
 void InstructionInfoView::printView(raw_ostream &OS) const {
   std::string Buffer;
   raw_string_ostream TempStream(Buffer);
@@ -87,3 +86,4 @@ void InstructionInfoView::printView(raw_ostream &OS) const {
   OS << Buffer;
 }
 } // namespace mca.
+} // namespace llvm
diff --git a/tools/llvm-mca/Views/InstructionInfoView.h b/tools/llvm-mca/Views/InstructionInfoView.h
index f7bbe6147d7..3ef95d47449 100644
--- a/tools/llvm-mca/Views/InstructionInfoView.h
+++ b/tools/llvm-mca/Views/InstructionInfoView.h
@@ -45,6 +45,7 @@
 
 #define DEBUG_TYPE "llvm-mca"
 
+namespace llvm {
 namespace mca {
 
 /// A view that prints out generic instruction information.
@@ -63,5 +64,6 @@ public:
   void printView(llvm::raw_ostream &OS) const override;
 };
 } // namespace mca
+} // namespace llvm
 
 #endif
diff --git a/tools/llvm-mca/Views/RegisterFileStatistics.cpp b/tools/llvm-mca/Views/RegisterFileStatistics.cpp
index 2697f528a0a..bd638d9795a 100644
--- a/tools/llvm-mca/Views/RegisterFileStatistics.cpp
+++ b/tools/llvm-mca/Views/RegisterFileStatistics.cpp
@@ -15,8 +15,7 @@
 #include "Views/RegisterFileStatistics.h"
 #include "llvm/Support/Format.h"
 
-using namespace llvm;
-
+namespace llvm {
 namespace mca {
 
 RegisterFileStatistics::RegisterFileStatistics(const MCSubtargetInfo &sti)
@@ -106,3 +105,4 @@ void RegisterFileStatistics::printView(raw_ostream &OS) const {
 }
 
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/Views/RegisterFileStatistics.h b/tools/llvm-mca/Views/RegisterFileStatistics.h
index 1e89d66dc50..86858d8bba8 100644
--- a/tools/llvm-mca/Views/RegisterFileStatistics.h
+++ b/tools/llvm-mca/Views/RegisterFileStatistics.h
@@ -36,6 +36,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 
+namespace llvm {
 namespace mca {
 
 class RegisterFileStatistics : public View {
@@ -58,5 +59,6 @@ public:
   void printView(llvm::raw_ostream &OS) const override;
 };
 } // namespace mca
+} // namespace llvm
 
 #endif
diff --git a/tools/llvm-mca/Views/ResourcePressureView.cpp b/tools/llvm-mca/Views/ResourcePressureView.cpp
index e7943252206..6df61840437 100644
--- a/tools/llvm-mca/Views/ResourcePressureView.cpp
+++ b/tools/llvm-mca/Views/ResourcePressureView.cpp
@@ -16,10 +16,9 @@
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/raw_ostream.h"
 
+namespace llvm {
 namespace mca {
 
-using namespace llvm;
-
 ResourcePressureView::ResourcePressureView(const llvm::MCSubtargetInfo &sti,
                                            MCInstPrinter &Printer,
                                            ArrayRef<MCInst> S)
@@ -183,3 +182,4 @@ void ResourcePressureView::printResourcePressurePerInst(raw_ostream &OS) const {
   }
 }
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/Views/ResourcePressureView.h b/tools/llvm-mca/Views/ResourcePressureView.h
index 5ee86df424b..572ce6fe6b7 100644
--- a/tools/llvm-mca/Views/ResourcePressureView.h
+++ b/tools/llvm-mca/Views/ResourcePressureView.h
@@ -65,6 +65,7 @@
 #include "llvm/MC/MCInstPrinter.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 
+namespace llvm {
 namespace mca {
 
 /// This class collects resource pressure statistics and it is able to print
@@ -98,5 +99,6 @@ public:
   }
 };
 } // namespace mca
+} // namespace llvm
 
 #endif
diff --git a/tools/llvm-mca/Views/RetireControlUnitStatistics.cpp b/tools/llvm-mca/Views/RetireControlUnitStatistics.cpp
index a9a4ac9a33d..7e2fd316c97 100644
--- a/tools/llvm-mca/Views/RetireControlUnitStatistics.cpp
+++ b/tools/llvm-mca/Views/RetireControlUnitStatistics.cpp
@@ -15,8 +15,7 @@
 #include "Views/RetireControlUnitStatistics.h"
 #include "llvm/Support/Format.h"
 
-using namespace llvm;
-
+namespace llvm {
 namespace mca {
 
 void RetireControlUnitStatistics::onEvent(const HWInstructionEvent &Event) {
@@ -47,3 +46,4 @@ void RetireControlUnitStatistics::printView(raw_ostream &OS) const {
 }
 
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/Views/RetireControlUnitStatistics.h b/tools/llvm-mca/Views/RetireControlUnitStatistics.h
index e9be542a786..9a4821ec31a 100644
--- a/tools/llvm-mca/Views/RetireControlUnitStatistics.h
+++ b/tools/llvm-mca/Views/RetireControlUnitStatistics.h
@@ -30,6 +30,7 @@
 #include "llvm/MC/MCSubtargetInfo.h"
 #include <map>
 
+namespace llvm {
 namespace mca {
 
 class RetireControlUnitStatistics : public View {
@@ -54,5 +55,6 @@ public:
   void printView(llvm::raw_ostream &OS) const override;
 };
 } // namespace mca
+} // namespace llvm
 
 #endif
diff --git a/tools/llvm-mca/Views/SchedulerStatistics.cpp b/tools/llvm-mca/Views/SchedulerStatistics.cpp
index 958b3b548f4..edd6056c1e8 100644
--- a/tools/llvm-mca/Views/SchedulerStatistics.cpp
+++ b/tools/llvm-mca/Views/SchedulerStatistics.cpp
@@ -16,8 +16,7 @@
 #include "llvm/Support/Format.h"
 #include "llvm/Support/FormattedStream.h"
 
-using namespace llvm;
-
+namespace llvm {
 namespace mca {
 
 void SchedulerStatistics::onEvent(const HWInstructionEvent &Event) {
@@ -127,3 +126,4 @@ void SchedulerStatistics::printView(raw_ostream &OS) const {
 }
 
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/Views/SchedulerStatistics.h b/tools/llvm-mca/Views/SchedulerStatistics.h
index 3515546f083..56dd3af1912 100644
--- a/tools/llvm-mca/Views/SchedulerStatistics.h
+++ b/tools/llvm-mca/Views/SchedulerStatistics.h
@@ -42,6 +42,7 @@
 #include "llvm/MC/MCSubtargetInfo.h"
 #include <map>
 
+namespace llvm {
 namespace mca {
 
 class SchedulerStatistics final : public View {
@@ -86,5 +87,6 @@ public:
   void printView(llvm::raw_ostream &OS) const override;
 };
 } // namespace mca
+} // namespace llvm
 
 #endif
diff --git a/tools/llvm-mca/Views/SummaryView.cpp b/tools/llvm-mca/Views/SummaryView.cpp
index 2007746b81f..fdf27600c93 100644
--- a/tools/llvm-mca/Views/SummaryView.cpp
+++ b/tools/llvm-mca/Views/SummaryView.cpp
@@ -18,12 +18,11 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/Format.h"
 
+namespace llvm {
 namespace mca {
 
 #define DEBUG_TYPE "llvm-mca"
 
-using namespace llvm;
-
 SummaryView::SummaryView(const MCSchedModel &Model, ArrayRef<MCInst> S,
                          unsigned Width)
     : SM(Model), Source(S), DispatchWidth(Width), LastInstructionIdx(0),
@@ -88,3 +87,4 @@ void SummaryView::printView(raw_ostream &OS) const {
   OS << Buffer;
 }
 } // namespace mca.
+} // namespace llvm
diff --git a/tools/llvm-mca/Views/SummaryView.h b/tools/llvm-mca/Views/SummaryView.h
index 8c330f28f39..f59fd4233fb 100644
--- a/tools/llvm-mca/Views/SummaryView.h
+++ b/tools/llvm-mca/Views/SummaryView.h
@@ -34,6 +34,7 @@
 #include "llvm/MC/MCSchedule.h"
 #include "llvm/Support/raw_ostream.h"
 
+namespace llvm {
 namespace mca {
 
 /// A view that collects and prints a few performance numbers.
@@ -71,5 +72,6 @@ public:
   void printView(llvm::raw_ostream &OS) const override;
 };
 } // namespace mca
+} // namespace llvm
 
 #endif
diff --git a/tools/llvm-mca/Views/TimelineView.cpp b/tools/llvm-mca/Views/TimelineView.cpp
index de347b54bd9..7d55bbc99c7 100644
--- a/tools/llvm-mca/Views/TimelineView.cpp
+++ b/tools/llvm-mca/Views/TimelineView.cpp
@@ -14,8 +14,7 @@
 
 #include "Views/TimelineView.h"
 
-using namespace llvm;
-
+namespace llvm {
 namespace mca {
 
 TimelineView::TimelineView(const MCSubtargetInfo &sti, MCInstPrinter &Printer,
@@ -292,3 +291,4 @@ void TimelineView::printTimeline(raw_ostream &OS) const {
   }
 }
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/Views/TimelineView.h b/tools/llvm-mca/Views/TimelineView.h
index 9b39a98c74a..ee981800161 100644
--- a/tools/llvm-mca/Views/TimelineView.h
+++ b/tools/llvm-mca/Views/TimelineView.h
@@ -108,6 +108,7 @@
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/raw_ostream.h"
 
+namespace llvm {
 namespace mca {
 
 /// This class listens to instruction state transition events
@@ -183,5 +184,6 @@ public:
   }
 };
 } // namespace mca
+} // namespace llvm
 
 #endif
diff --git a/tools/llvm-mca/Views/View.cpp b/tools/llvm-mca/Views/View.cpp
index 1cf4daeec84..6cfb9dd9f39 100644
--- a/tools/llvm-mca/Views/View.cpp
+++ b/tools/llvm-mca/Views/View.cpp
@@ -14,7 +14,9 @@
 
 #include "Views/View.h"
 
+namespace llvm {
 namespace mca {
 
 void View::anchor() {}
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/Views/View.h b/tools/llvm-mca/Views/View.h
index 9ba94a5da97..c332bb53938 100644
--- a/tools/llvm-mca/Views/View.h
+++ b/tools/llvm-mca/Views/View.h
@@ -19,6 +19,7 @@
 #include "HWEventListener.h"
 #include "llvm/Support/raw_ostream.h"
 
+namespace llvm {
 namespace mca {
 
 class View : public HWEventListener {
@@ -28,5 +29,6 @@ public:
   void anchor() override;
 };
 } // namespace mca
+} // namespace llvm
 
 #endif
diff --git a/tools/llvm-mca/include/Context.h b/tools/llvm-mca/include/Context.h
index 9d64ae32f1c..d383e2361be 100644
--- a/tools/llvm-mca/include/Context.h
+++ b/tools/llvm-mca/include/Context.h
@@ -25,6 +25,7 @@
 #include "llvm/MC/MCSubtargetInfo.h"
 #include <memory>
 
+namespace llvm {
 namespace mca {
 
 /// This is a convenience struct to hold the parameters necessary for creating
@@ -64,4 +65,5 @@ public:
 };
 
 } // namespace mca
+} // namespace llvm
 #endif // LLVM_TOOLS_LLVM_MCA_CONTEXT_H
diff --git a/tools/llvm-mca/include/HWEventListener.h b/tools/llvm-mca/include/HWEventListener.h
index cef78041565..81c76c5eb8d 100644
--- a/tools/llvm-mca/include/HWEventListener.h
+++ b/tools/llvm-mca/include/HWEventListener.h
@@ -19,6 +19,7 @@
 #include "Support.h"
 #include "llvm/ADT/ArrayRef.h"
 
+namespace llvm {
 namespace mca {
 
 // An HWInstructionEvent represents state changes of instructions that
@@ -151,5 +152,6 @@ private:
   virtual void anchor();
 };
 } // namespace mca
+} // namespace llvm
 
 #endif
diff --git a/tools/llvm-mca/include/HardwareUnits/HardwareUnit.h b/tools/llvm-mca/include/HardwareUnits/HardwareUnit.h
index e8c496ab967..5070418c11b 100644
--- a/tools/llvm-mca/include/HardwareUnits/HardwareUnit.h
+++ b/tools/llvm-mca/include/HardwareUnits/HardwareUnit.h
@@ -16,6 +16,7 @@
 #ifndef LLVM_TOOLS_LLVM_MCA_HARDWAREUNIT_H
 #define LLVM_TOOLS_LLVM_MCA_HARDWAREUNIT_H
 
+namespace llvm {
 namespace mca {
 
 class HardwareUnit {
@@ -28,4 +29,5 @@ public:
 };
 
 } // namespace mca
+} // namespace llvm
 #endif // LLVM_TOOLS_LLVM_MCA_HARDWAREUNIT_H
diff --git a/tools/llvm-mca/include/HardwareUnits/LSUnit.h b/tools/llvm-mca/include/HardwareUnits/LSUnit.h
index b348c973ee0..c979ac9cf82 100644
--- a/tools/llvm-mca/include/HardwareUnits/LSUnit.h
+++ b/tools/llvm-mca/include/HardwareUnits/LSUnit.h
@@ -19,6 +19,7 @@
 #include "HardwareUnits/HardwareUnit.h"
 #include <set>
 
+namespace llvm {
 namespace mca {
 
 class InstRef;
@@ -156,5 +157,6 @@ public:
 };
 
 } // namespace mca
+} // namespace llvm
 
 #endif
diff --git a/tools/llvm-mca/include/HardwareUnits/RegisterFile.h b/tools/llvm-mca/include/HardwareUnits/RegisterFile.h
index 4b8b623bfe6..5a5543ebacd 100644
--- a/tools/llvm-mca/include/HardwareUnits/RegisterFile.h
+++ b/tools/llvm-mca/include/HardwareUnits/RegisterFile.h
@@ -24,6 +24,7 @@
 #include "llvm/MC/MCSchedule.h"
 #include "llvm/Support/Error.h"
 
+namespace llvm {
 namespace mca {
 
 class ReadState;
@@ -225,5 +226,6 @@ public:
 };
 
 } // namespace mca
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_MCA_REGISTER_FILE_H
diff --git a/tools/llvm-mca/include/HardwareUnits/ResourceManager.h b/tools/llvm-mca/include/HardwareUnits/ResourceManager.h
index dfac15f53fc..bf7c1e67115 100644
--- a/tools/llvm-mca/include/HardwareUnits/ResourceManager.h
+++ b/tools/llvm-mca/include/HardwareUnits/ResourceManager.h
@@ -23,6 +23,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/MC/MCSchedule.h"
 
+namespace llvm {
 namespace mca {
 
 /// Used to notify the internal state of a processor resource.
@@ -357,5 +358,6 @@ public:
 #endif
 };
 } // namespace mca
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_MCA_RESOURCE_MANAGER_H
diff --git a/tools/llvm-mca/include/HardwareUnits/RetireControlUnit.h b/tools/llvm-mca/include/HardwareUnits/RetireControlUnit.h
index 552a2094ff1..2f7a1b1d503 100644
--- a/tools/llvm-mca/include/HardwareUnits/RetireControlUnit.h
+++ b/tools/llvm-mca/include/HardwareUnits/RetireControlUnit.h
@@ -20,6 +20,7 @@
 #include "llvm/MC/MCSchedule.h"
 #include <vector>
 
+namespace llvm {
 namespace mca {
 
 /// This class tracks which instructions are in-flight (i.e., dispatched but not
@@ -98,5 +99,6 @@ public:
 };
 
 } // namespace mca
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_MCA_RETIRE_CONTROL_UNIT_H
diff --git a/tools/llvm-mca/include/HardwareUnits/Scheduler.h b/tools/llvm-mca/include/HardwareUnits/Scheduler.h
index db124958ee5..941224c1204 100644
--- a/tools/llvm-mca/include/HardwareUnits/Scheduler.h
+++ b/tools/llvm-mca/include/HardwareUnits/Scheduler.h
@@ -22,6 +22,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/MC/MCSchedule.h"
 
+namespace llvm {
 namespace mca {
 
 class SchedulerStrategy {
@@ -209,5 +210,6 @@ public:
 #endif // !NDEBUG
 };
 } // namespace mca
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_MCA_SCHEDULER_H
diff --git a/tools/llvm-mca/include/InstrBuilder.h b/tools/llvm-mca/include/InstrBuilder.h
index 0fd97cb1ed5..ca615c053c8 100644
--- a/tools/llvm-mca/include/InstrBuilder.h
+++ b/tools/llvm-mca/include/InstrBuilder.h
@@ -23,6 +23,7 @@
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/Support/Error.h"
 
+namespace llvm {
 namespace mca {
 
 /// A builder class that knows how to construct Instruction objects.
@@ -71,5 +72,6 @@ public:
   createInstruction(const llvm::MCInst &MCI);
 };
 } // namespace mca
+} // namespace llvm
 
 #endif
diff --git a/tools/llvm-mca/include/Instruction.h b/tools/llvm-mca/include/Instruction.h
index bbb40c42576..8509af2e0ff 100644
--- a/tools/llvm-mca/include/Instruction.h
+++ b/tools/llvm-mca/include/Instruction.h
@@ -29,6 +29,7 @@
 #include <set>
 #include <vector>
 
+namespace llvm {
 namespace mca {
 
 constexpr int UNKNOWN_CYCLES = -512;
@@ -508,5 +509,6 @@ public:
 };
 
 } // namespace mca
+} // namespace llvm
 
 #endif
diff --git a/tools/llvm-mca/include/Pipeline.h b/tools/llvm-mca/include/Pipeline.h
index ad487e7564b..cb58e9a1fbd 100644
--- a/tools/llvm-mca/include/Pipeline.h
+++ b/tools/llvm-mca/include/Pipeline.h
@@ -21,6 +21,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/Error.h"
 
+namespace llvm {
 namespace mca {
 
 class HWEventListener;
@@ -70,5 +71,6 @@ public:
   void addEventListener(HWEventListener *Listener);
 };
 } // namespace mca
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_MCA_PIPELINE_H
diff --git a/tools/llvm-mca/include/SourceMgr.h b/tools/llvm-mca/include/SourceMgr.h
index 54b1a2c31ce..4a55bdba5b4 100644
--- a/tools/llvm-mca/include/SourceMgr.h
+++ b/tools/llvm-mca/include/SourceMgr.h
@@ -18,6 +18,7 @@
 
 #include "llvm/ADT/ArrayRef.h"
 
+namespace llvm {
 namespace mca {
 
 class Instruction;
@@ -51,5 +52,6 @@ public:
 };
 
 } // namespace mca
+} // namespace llvm
 
 #endif
diff --git a/tools/llvm-mca/include/Stages/DispatchStage.h b/tools/llvm-mca/include/Stages/DispatchStage.h
index 5a2ac3e6088..0153c649b42 100644
--- a/tools/llvm-mca/include/Stages/DispatchStage.h
+++ b/tools/llvm-mca/include/Stages/DispatchStage.h
@@ -27,6 +27,7 @@
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 
+namespace llvm {
 namespace mca {
 
 // Implements the hardware dispatch logic.
@@ -92,5 +93,6 @@ public:
 #endif
 };
 } // namespace mca
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_MCA_DISPATCH_STAGE_H
diff --git a/tools/llvm-mca/include/Stages/ExecuteStage.h b/tools/llvm-mca/include/Stages/ExecuteStage.h
index 63e6f0bc2b8..0f46c8a3878 100644
--- a/tools/llvm-mca/include/Stages/ExecuteStage.h
+++ b/tools/llvm-mca/include/Stages/ExecuteStage.h
@@ -23,6 +23,7 @@
 #include "Stages/Stage.h"
 #include "llvm/ADT/ArrayRef.h"
 
+namespace llvm {
 namespace mca {
 
 class ExecuteStage final : public Stage {
@@ -74,5 +75,6 @@ public:
 };
 
 } // namespace mca
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_MCA_EXECUTE_STAGE_H
diff --git a/tools/llvm-mca/include/Stages/FetchStage.h b/tools/llvm-mca/include/Stages/FetchStage.h
index a7aba2276d9..8622ab07e9e 100644
--- a/tools/llvm-mca/include/Stages/FetchStage.h
+++ b/tools/llvm-mca/include/Stages/FetchStage.h
@@ -20,6 +20,7 @@
 #include "Stages/Stage.h"
 #include <map>
 
+namespace llvm {
 namespace mca {
 
 class FetchStage final : public Stage {
@@ -45,5 +46,6 @@ public:
 };
 
 } // namespace mca
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_MCA_FETCH_STAGE_H
diff --git a/tools/llvm-mca/include/Stages/InstructionTables.h b/tools/llvm-mca/include/Stages/InstructionTables.h
index de31a7949bb..2b6e542d973 100644
--- a/tools/llvm-mca/include/Stages/InstructionTables.h
+++ b/tools/llvm-mca/include/Stages/InstructionTables.h
@@ -23,6 +23,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/MC/MCSchedule.h"
 
+namespace llvm {
 namespace mca {
 
 class InstructionTables final : public Stage {
@@ -39,5 +40,6 @@ public:
   llvm::Error execute(InstRef &IR) override;
 };
 } // namespace mca
+} // namespace llvm
 
 #endif
diff --git a/tools/llvm-mca/include/Stages/RetireStage.h b/tools/llvm-mca/include/Stages/RetireStage.h
index 2041105a194..e9975ca3bbd 100644
--- a/tools/llvm-mca/include/Stages/RetireStage.h
+++ b/tools/llvm-mca/include/Stages/RetireStage.h
@@ -21,6 +21,7 @@
 #include "HardwareUnits/RetireControlUnit.h"
 #include "Stages/Stage.h"
 
+namespace llvm {
 namespace mca {
 
 class RetireStage final : public Stage {
@@ -42,5 +43,6 @@ public:
 };
 
 } // namespace mca
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_MCA_RETIRE_STAGE_H
diff --git a/tools/llvm-mca/include/Stages/Stage.h b/tools/llvm-mca/include/Stages/Stage.h
index 5470c9cf0d9..383abbe217e 100644
--- a/tools/llvm-mca/include/Stages/Stage.h
+++ b/tools/llvm-mca/include/Stages/Stage.h
@@ -20,6 +20,7 @@
 #include "llvm/Support/Error.h"
 #include <set>
 
+namespace llvm {
 namespace mca {
 
 class InstRef;
@@ -83,4 +84,5 @@ public:
 };
 
 } // namespace mca
+} // namespace llvm
 #endif // LLVM_TOOLS_LLVM_MCA_STAGE_H
diff --git a/tools/llvm-mca/include/Support.h b/tools/llvm-mca/include/Support.h
index 9371394542d..43fb72c0229 100644
--- a/tools/llvm-mca/include/Support.h
+++ b/tools/llvm-mca/include/Support.h
@@ -20,6 +20,7 @@
 #include "llvm/MC/MCSchedule.h"
 #include "llvm/Support/Error.h"
 
+namespace llvm {
 namespace mca {
 
 template <typename T>
@@ -114,5 +115,6 @@ double computeBlockRThroughput(const llvm::MCSchedModel &SM,
                                unsigned DispatchWidth, unsigned NumMicroOps,
                                llvm::ArrayRef<unsigned> ProcResourceUsage);
 } // namespace mca
+} // namespace llvm
 
 #endif
diff --git a/tools/llvm-mca/lib/Context.cpp b/tools/llvm-mca/lib/Context.cpp
index 4e30fc9de31..5b6f52478dd 100644
--- a/tools/llvm-mca/lib/Context.cpp
+++ b/tools/llvm-mca/lib/Context.cpp
@@ -24,10 +24,9 @@
 #include "Stages/FetchStage.h"
 #include "Stages/RetireStage.h"
 
+namespace llvm {
 namespace mca {
 
-using namespace llvm;
-
 std::unique_ptr<Pipeline>
 Context::createDefaultPipeline(const PipelineOptions &Opts, InstrBuilder &IB,
                                SourceMgr &SrcMgr) {
@@ -63,3 +62,4 @@ Context::createDefaultPipeline(const PipelineOptions &Opts, InstrBuilder &IB,
 }
 
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/lib/HWEventListener.cpp b/tools/llvm-mca/lib/HWEventListener.cpp
index f27a04a9a98..3930e2555a9 100644
--- a/tools/llvm-mca/lib/HWEventListener.cpp
+++ b/tools/llvm-mca/lib/HWEventListener.cpp
@@ -14,8 +14,10 @@
 
 #include "HWEventListener.h"
 
+namespace llvm {
 namespace mca {
 
 // Anchor the vtable here.
 void HWEventListener::anchor() {}
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/lib/HardwareUnits/HardwareUnit.cpp b/tools/llvm-mca/lib/HardwareUnits/HardwareUnit.cpp
index daeda06d859..4e46ffacbd4 100644
--- a/tools/llvm-mca/lib/HardwareUnits/HardwareUnit.cpp
+++ b/tools/llvm-mca/lib/HardwareUnits/HardwareUnit.cpp
@@ -15,9 +15,11 @@
 
 #include "HardwareUnits/HardwareUnit.h"
 
+namespace llvm {
 namespace mca {
 
 // Pin the vtable with this method.
 HardwareUnit::~HardwareUnit() = default;
 
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp b/tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp
index aca90165af2..6923c6e0dc8 100644
--- a/tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp
+++ b/tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp
@@ -17,10 +17,9 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 
-using namespace llvm;
-
 #define DEBUG_TYPE "llvm-mca"
 
+namespace llvm {
 namespace mca {
 
 #ifndef NDEBUG
@@ -164,3 +163,4 @@ void LSUnit::onInstructionExecuted(const InstRef &IR) {
   }
 }
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp b/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp
index 4a2a00523ae..71aec49ce77 100644
--- a/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp
+++ b/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp
@@ -18,10 +18,9 @@
 #include "Instruction.h"
 #include "llvm/Support/Debug.h"
 
-using namespace llvm;
-
 #define DEBUG_TYPE "llvm-mca"
 
+namespace llvm {
 namespace mca {
 
 RegisterFile::RegisterFile(const MCSchedModel &SM, const MCRegisterInfo &mri,
@@ -469,3 +468,4 @@ void RegisterFile::dump() const {
 #endif
 
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/lib/HardwareUnits/ResourceManager.cpp b/tools/llvm-mca/lib/HardwareUnits/ResourceManager.cpp
index e033217d52d..e371f50ed48 100644
--- a/tools/llvm-mca/lib/HardwareUnits/ResourceManager.cpp
+++ b/tools/llvm-mca/lib/HardwareUnits/ResourceManager.cpp
@@ -18,10 +18,9 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 
+namespace llvm {
 namespace mca {
 
-using namespace llvm;
-
 #define DEBUG_TYPE "llvm-mca"
 ResourceStrategy::~ResourceStrategy() = default;
 
@@ -305,3 +304,4 @@ void ResourceManager::releaseResource(uint64_t ResourceID) {
 }
 
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/lib/HardwareUnits/RetireControlUnit.cpp b/tools/llvm-mca/lib/HardwareUnits/RetireControlUnit.cpp
index 8f543eeb8c2..0456e1d7a5b 100644
--- a/tools/llvm-mca/lib/HardwareUnits/RetireControlUnit.cpp
+++ b/tools/llvm-mca/lib/HardwareUnits/RetireControlUnit.cpp
@@ -15,10 +15,9 @@
 #include "HardwareUnits/RetireControlUnit.h"
 #include "llvm/Support/Debug.h"
 
-using namespace llvm;
-
 #define DEBUG_TYPE "llvm-mca"
 
+namespace llvm {
 namespace mca {
 
 RetireControlUnit::RetireControlUnit(const MCSchedModel &SM)
@@ -85,3 +84,4 @@ void RetireControlUnit::dump() const {
 #endif
 
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/lib/HardwareUnits/Scheduler.cpp b/tools/llvm-mca/lib/HardwareUnits/Scheduler.cpp
index 3d91cb12c2d..b1ac8d99b86 100644
--- a/tools/llvm-mca/lib/HardwareUnits/Scheduler.cpp
+++ b/tools/llvm-mca/lib/HardwareUnits/Scheduler.cpp
@@ -15,10 +15,9 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 
+namespace llvm {
 namespace mca {
 
-using namespace llvm;
-
 #define DEBUG_TYPE "llvm-mca"
 
 void Scheduler::initializeStrategy(std::unique_ptr<SchedulerStrategy> S) {
@@ -243,3 +242,4 @@ bool Scheduler::isReady(const InstRef &IR) const {
 }
 
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/lib/InstrBuilder.cpp b/tools/llvm-mca/lib/InstrBuilder.cpp
index 3704eaf6a50..535ad4d57fe 100644
--- a/tools/llvm-mca/lib/InstrBuilder.cpp
+++ b/tools/llvm-mca/lib/InstrBuilder.cpp
@@ -22,10 +22,9 @@
 
 #define DEBUG_TYPE "llvm-mca"
 
+namespace llvm {
 namespace mca {
 
-using namespace llvm;
-
 InstrBuilder::InstrBuilder(const llvm::MCSubtargetInfo &sti,
                            const llvm::MCInstrInfo &mcii,
                            const llvm::MCRegisterInfo &mri,
@@ -539,3 +538,4 @@ InstrBuilder::createInstruction(const MCInst &MCI) {
   return std::move(NewIS);
 }
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/lib/Instruction.cpp b/tools/llvm-mca/lib/Instruction.cpp
index 42f5cd38ee9..832a6199f00 100644
--- a/tools/llvm-mca/lib/Instruction.cpp
+++ b/tools/llvm-mca/lib/Instruction.cpp
@@ -16,10 +16,9 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 
+namespace llvm {
 namespace mca {
 
-using namespace llvm;
-
 void ReadState::writeStartEvent(unsigned Cycles) {
   assert(DependentWrites);
   assert(CyclesLeft == UNKNOWN_CYCLES);
@@ -181,3 +180,4 @@ void Instruction::cycleEvent() {
 const unsigned WriteRef::INVALID_IID = std::numeric_limits<unsigned>::max();
 
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/lib/Pipeline.cpp b/tools/llvm-mca/lib/Pipeline.cpp
index ad49522ad79..309f415913d 100644
--- a/tools/llvm-mca/lib/Pipeline.cpp
+++ b/tools/llvm-mca/lib/Pipeline.cpp
@@ -17,12 +17,11 @@
 #include "HWEventListener.h"
 #include "llvm/Support/Debug.h"
 
+namespace llvm {
 namespace mca {
 
 #define DEBUG_TYPE "llvm-mca"
 
-using namespace llvm;
-
 void Pipeline::addEventListener(HWEventListener *Listener) {
   if (Listener)
     Listeners.insert(Listener);
@@ -95,3 +94,4 @@ void Pipeline::notifyCycleEnd() {
     Listener->onCycleEnd();
 }
 } // namespace mca.
+} // namespace llvm
diff --git a/tools/llvm-mca/lib/Stages/DispatchStage.cpp b/tools/llvm-mca/lib/Stages/DispatchStage.cpp
index 0246151c64c..104446e711e 100644
--- a/tools/llvm-mca/lib/Stages/DispatchStage.cpp
+++ b/tools/llvm-mca/lib/Stages/DispatchStage.cpp
@@ -21,10 +21,9 @@
 #include "HardwareUnits/Scheduler.h"
 #include "llvm/Support/Debug.h"
 
-using namespace llvm;
-
 #define DEBUG_TYPE "llvm-mca"
 
+namespace llvm {
 namespace mca {
 
 void DispatchStage::notifyInstructionDispatched(const InstRef &IR,
@@ -185,3 +184,4 @@ void DispatchStage::dump() const {
 }
 #endif
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/lib/Stages/ExecuteStage.cpp b/tools/llvm-mca/lib/Stages/ExecuteStage.cpp
index 3b45a84c338..298f08a2887 100644
--- a/tools/llvm-mca/lib/Stages/ExecuteStage.cpp
+++ b/tools/llvm-mca/lib/Stages/ExecuteStage.cpp
@@ -21,10 +21,9 @@
 
 #define DEBUG_TYPE "llvm-mca"
 
+namespace llvm {
 namespace mca {
 
-using namespace llvm;
-
 HWStallEvent::GenericEventType toHWStallEventType(Scheduler::Status Status) {
   switch (Status) {
   case Scheduler::SC_LOAD_QUEUE_FULL:
@@ -217,3 +216,4 @@ void ExecuteStage::notifyReservedOrReleasedBuffers(const InstRef &IR,
 }
 
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/lib/Stages/FetchStage.cpp b/tools/llvm-mca/lib/Stages/FetchStage.cpp
index 85d06d2d183..6e91dd6121d 100644
--- a/tools/llvm-mca/lib/Stages/FetchStage.cpp
+++ b/tools/llvm-mca/lib/Stages/FetchStage.cpp
@@ -16,6 +16,7 @@
 #include "Stages/FetchStage.h"
 #include "Instruction.h"
 
+namespace llvm {
 namespace mca {
 
 bool FetchStage::hasWorkToComplete() const { return CurrentInstruction; }
@@ -69,3 +70,4 @@ llvm::Error FetchStage::cycleEnd() {
 }
 
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/lib/Stages/InstructionTables.cpp b/tools/llvm-mca/lib/Stages/InstructionTables.cpp
index 06319f857dc..33c30e7f95c 100644
--- a/tools/llvm-mca/lib/Stages/InstructionTables.cpp
+++ b/tools/llvm-mca/lib/Stages/InstructionTables.cpp
@@ -17,10 +17,9 @@
 
 #include "Stages/InstructionTables.h"
 
+namespace llvm {
 namespace mca {
 
-using namespace llvm;
-
 Error InstructionTables::execute(InstRef &IR) {
   const InstrDesc &Desc = IR.getInstruction()->getDesc();
   UsedResources.clear();
@@ -67,3 +66,4 @@ Error InstructionTables::execute(InstRef &IR) {
 }
 
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/lib/Stages/RetireStage.cpp b/tools/llvm-mca/lib/Stages/RetireStage.cpp
index 8297c9c9ea5..47eed5f2c9c 100644
--- a/tools/llvm-mca/lib/Stages/RetireStage.cpp
+++ b/tools/llvm-mca/lib/Stages/RetireStage.cpp
@@ -20,6 +20,7 @@
 
 #define DEBUG_TYPE "llvm-mca"
 
+namespace llvm {
 namespace mca {
 
 llvm::Error RetireStage::cycleStart() {
@@ -58,3 +59,4 @@ void RetireStage::notifyInstructionRetired(const InstRef &IR) const {
 }
 
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/lib/Stages/Stage.cpp b/tools/llvm-mca/lib/Stages/Stage.cpp
index e8cd74f2163..c3cfe47d24e 100644
--- a/tools/llvm-mca/lib/Stages/Stage.cpp
+++ b/tools/llvm-mca/lib/Stages/Stage.cpp
@@ -15,6 +15,7 @@
 
 #include "Stages/Stage.h"
 
+namespace llvm {
 namespace mca {
 
 // Pin the vtable here in the implementation file.
@@ -25,3 +26,4 @@ void Stage::addListener(HWEventListener *Listener) {
 }
 
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/lib/Support.cpp b/tools/llvm-mca/lib/Support.cpp
index 8f6b8a91f38..a6ff26dafb5 100644
--- a/tools/llvm-mca/lib/Support.cpp
+++ b/tools/llvm-mca/lib/Support.cpp
@@ -16,10 +16,9 @@
 #include "Support.h"
 #include "llvm/MC/MCSchedule.h"
 
+namespace llvm {
 namespace mca {
 
-using namespace llvm;
-
 void computeProcResourceMasks(const MCSchedModel &SM,
                               SmallVectorImpl<uint64_t> &Masks) {
   unsigned ProcResourceID = 0;
@@ -77,3 +76,4 @@ double computeBlockRThroughput(const MCSchedModel &SM, unsigned DispatchWidth,
 }
 
 } // namespace mca
+} // namespace llvm
-- 
GitLab


From 7e7913225715cf5248a8654220e82be5e5af7690 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 30 Oct 2018 16:21:56 +0000
Subject: [PATCH 0750/1116] [InstCombine] use getFltSemantics() instead of
 duplicating it; NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345613 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstCombine/InstCombineCompares.cpp       | 22 +++----------------
 1 file changed, 3 insertions(+), 19 deletions(-)

diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index bf8bc8818f7..afc2175992f 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -5368,33 +5368,17 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
         if (!RHSF)
           break;
 
-        const fltSemantics *Sem;
-        // FIXME: This shouldn't be here.
-        if (LHSExt->getSrcTy()->isHalfTy())
-          Sem = &APFloat::IEEEhalf();
-        else if (LHSExt->getSrcTy()->isFloatTy())
-          Sem = &APFloat::IEEEsingle();
-        else if (LHSExt->getSrcTy()->isDoubleTy())
-          Sem = &APFloat::IEEEdouble();
-        else if (LHSExt->getSrcTy()->isFP128Ty())
-          Sem = &APFloat::IEEEquad();
-        else if (LHSExt->getSrcTy()->isX86_FP80Ty())
-          Sem = &APFloat::x87DoubleExtended();
-        else if (LHSExt->getSrcTy()->isPPC_FP128Ty())
-          Sem = &APFloat::PPCDoubleDouble();
-        else
-          break;
-
+        const fltSemantics &FPSem = LHSExt->getSrcTy()->getFltSemantics();
         bool Lossy;
         APFloat F = RHSF->getValueAPF();
-        F.convert(*Sem, APFloat::rmNearestTiesToEven, &Lossy);
+        F.convert(FPSem, APFloat::rmNearestTiesToEven, &Lossy);
 
         // Avoid lossy conversions and denormals. Zero is a special case
         // that's OK to convert.
         APFloat Fabs = F;
         Fabs.clearSign();
         if (!Lossy &&
-            ((Fabs.compare(APFloat::getSmallestNormalized(*Sem)) !=
+            ((Fabs.compare(APFloat::getSmallestNormalized(FPSem)) !=
                  APFloat::cmpLessThan) || Fabs.isZero()))
 
           return new FCmpInst(Pred, LHSExt->getOperand(0),
-- 
GitLab


From 0a2198d1201baa2a410e106f638a8c45c71fc25f Mon Sep 17 00:00:00 2001
From: Jordan Rupprecht <rupprecht@google.com>
Date: Tue, 30 Oct 2018 16:23:38 +0000
Subject: [PATCH 0751/1116] [llvm-objcopy] Fix
 --keep-global-symbol/--globalize-symbol for undefined symbols.

Summary: --keep-global-symbol and --globalize-symbol don't make sense for undefined symbols, so it should be ignored for those symbols. This matches GNU objcopy behavior.

Reviewers: jhenderson, alexshap, jakehehrlich, espindola

Reviewed By: jhenderson, jakehehrlich

Subscribers: emaste, arichardson, llvm-commits

Differential Revision: https://reviews.llvm.org/D53733

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345614 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/tools/llvm-objcopy/globalize.test           | 15 ++++++++++++++-
 test/tools/llvm-objcopy/keep-global-symbols.test | 10 +++++++---
 tools/llvm-objcopy/ELF/ELFObjcopy.cpp            |  6 ++++--
 3 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/test/tools/llvm-objcopy/globalize.test b/test/tools/llvm-objcopy/globalize.test
index 5c9d62e6c08..4941cf12e92 100644
--- a/test/tools/llvm-objcopy/globalize.test
+++ b/test/tools/llvm-objcopy/globalize.test
@@ -1,5 +1,8 @@
 # RUN: yaml2obj %s > %t
-# RUN: llvm-objcopy --globalize-symbol Global --globalize-symbol Local --globalize-symbol Weak %t %t2
+# RUN: llvm-objcopy --globalize-symbol Global \
+# RUN:   --globalize-symbol Local \
+# RUN:   --globalize-symbol Weak \
+# RUN:   --globalize-symbol WeakUndef %t %t2
 # RUN: llvm-readobj -symbols %t2 | FileCheck %s
 
 !ELF
@@ -28,6 +31,7 @@ Symbols:
       Size:     8
       Section:  .text
       Value:    0x1008
+    - Name:     WeakUndef
   Global:
     - Name:     Global
       Type:     STT_FUNC
@@ -72,4 +76,13 @@ Symbols:
 #CHECK-NEXT:    Other: 0
 #CHECK-NEXT:    Section: .text
 #CHECK-NEXT:  }
+#CHECK-NEXT:  Symbol {
+#CHECK-NEXT:    Name: WeakUndef
+#CHECK-NEXT:    Value: 0x0
+#CHECK-NEXT:    Size: 0
+#CHECK-NEXT:    Binding: Weak
+#CHECK-NEXT:    Type: None
+#CHECK-NEXT:    Other: 0
+#CHECK-NEXT:    Section: Undefined
+#CHECK-NEXT:  }
 #CHECK-NEXT:]
diff --git a/test/tools/llvm-objcopy/keep-global-symbols.test b/test/tools/llvm-objcopy/keep-global-symbols.test
index 4f580b4ae8a..8ce1d7f3a2a 100644
--- a/test/tools/llvm-objcopy/keep-global-symbols.test
+++ b/test/tools/llvm-objcopy/keep-global-symbols.test
@@ -18,6 +18,8 @@
 # "Global5 Global6": Global, because it appears in %t-globals2.txt, but we only
 #     trim leading and trailing whitespace. We don't just take the first chunk
 #     that looks like a symbol.
+# Global7: Global, because even though it doesn't appear as any -G flags, does
+#     not get demoted since it's undefined.
 
 # RUN: echo Global2 > %t-globals1.txt
 # RUN: echo "  Global3  " > %t-globals2.txt
@@ -77,8 +79,9 @@ Symbols:
       Section:     .text
     - Name:        "Global5 Global6"
       Section:     .text
+    - Name:        Global7
 
-# CHECK:      Symbol table '.symtab' contains 13 entries:
+# CHECK:      Symbol table '.symtab' contains 14 entries:
 # CHECK-NEXT:    Num: Value Size Type Bind Vis Ndx Name
 # CHECK-NEXT:      0: {{.*}}  LOCAL  {{.*}}
 # CHECK-NEXT:      1: {{.*}}  LOCAL  {{.*}} Local1
@@ -91,5 +94,6 @@ Symbols:
 # CHECK-NEXT:      8: {{.*}}  GLOBAL {{.*}} Global3
 # CHECK-NEXT:      9: {{.*}}  GLOBAL {{.*}} Global4
 # CHECK-NEXT:     10: {{.*}}  GLOBAL {{.*}} Global5 Global6
-# CHECK-NEXT:     11: {{.*}}  WEAK   {{.*}} Weak1
-# CHECK-NEXT:     12: {{.*}}  GLOBAL {{.*}} Weak2
+# CHECK-NEXT:     11: {{.*}}  GLOBAL {{.*}} UND Global7
+# CHECK-NEXT:     12: {{.*}}  WEAK   {{.*}} Weak1
+# CHECK-NEXT:     13: {{.*}}  GLOBAL {{.*}} Weak2
diff --git a/tools/llvm-objcopy/ELF/ELFObjcopy.cpp b/tools/llvm-objcopy/ELF/ELFObjcopy.cpp
index 76379788205..2bad270cda7 100644
--- a/tools/llvm-objcopy/ELF/ELFObjcopy.cpp
+++ b/tools/llvm-objcopy/ELF/ELFObjcopy.cpp
@@ -230,11 +230,13 @@ static void handleArgs(const CopyConfig &Config, Object &Obj,
       // --keep-global-symbol. Because of that, make sure to check
       // --globalize-symbol second.
       if (!Config.SymbolsToKeepGlobal.empty() &&
-          !is_contained(Config.SymbolsToKeepGlobal, Sym.Name))
+          !is_contained(Config.SymbolsToKeepGlobal, Sym.Name) &&
+          Sym.getShndx() != SHN_UNDEF)
         Sym.Binding = STB_LOCAL;
 
       if (!Config.SymbolsToGlobalize.empty() &&
-          is_contained(Config.SymbolsToGlobalize, Sym.Name))
+          is_contained(Config.SymbolsToGlobalize, Sym.Name) &&
+          Sym.getShndx() != SHN_UNDEF)
         Sym.Binding = STB_GLOBAL;
 
       if (!Config.SymbolsToWeaken.empty() &&
-- 
GitLab


From 9abf4aed732782d913cfa1afd85e0d20e10674d0 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 30 Oct 2018 16:58:43 +0000
Subject: [PATCH 0752/1116] [InstSimplify] add tests for fcmp folds; NFC

This is part of a problem noted in PR39475:
https://bugs.llvm.org/show_bug.cgi?id=39475


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345615 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstSimplify/floating-point-compare.ll    | 75 ++++++++++++++++++-
 1 file changed, 73 insertions(+), 2 deletions(-)

diff --git a/test/Transforms/InstSimplify/floating-point-compare.ll b/test/Transforms/InstSimplify/floating-point-compare.ll
index eeae34c3059..a3c17b1fea0 100644
--- a/test/Transforms/InstSimplify/floating-point-compare.ll
+++ b/test/Transforms/InstSimplify/floating-point-compare.ll
@@ -234,15 +234,86 @@ define i1 @orderedLessZeroPowi(double,double) {
   ret i1 %olt
 }
 
-define i1 @orderedLessZeroUIToFP(i32) {
+define i1 @orderedLessZeroUIToFP(i32 %x) {
 ; CHECK-LABEL: @orderedLessZeroUIToFP(
 ; CHECK-NEXT:    ret i1 true
 ;
-  %a = uitofp i32 %0 to float
+  %a = uitofp i32 %x to float
   %uge = fcmp uge float %a, 0.000000e+00
   ret i1 %uge
 }
 
+define <2 x i1> @orderedLessZeroUIToFP_vec(<2 x i32> %x) {
+; CHECK-LABEL: @orderedLessZeroUIToFP_vec(
+; CHECK-NEXT:    ret <2 x i1> <i1 true, i1 true>
+;
+  %a = uitofp <2 x i32> %x to <2 x float>
+  %uge = fcmp uge <2 x float> %a, zeroinitializer
+  ret <2 x i1> %uge
+}
+
+define i1 @orderedLessZeroUIToFP_nnan(i32 %x) {
+; CHECK-LABEL: @orderedLessZeroUIToFP_nnan(
+; CHECK-NEXT:    [[A:%.*]] = uitofp i32 [[X:%.*]] to float
+; CHECK-NEXT:    [[UGE:%.*]] = fcmp nnan oge float [[A]], 0.000000e+00
+; CHECK-NEXT:    ret i1 [[UGE]]
+;
+  %a = uitofp i32 %x to float
+  %uge = fcmp nnan oge float %a, 0.000000e+00
+  ret i1 %uge
+}
+
+define <2 x i1> @orderedLessZeroUIToFP_nnan_vec(<2 x i32> %x) {
+; CHECK-LABEL: @orderedLessZeroUIToFP_nnan_vec(
+; CHECK-NEXT:    [[A:%.*]] = uitofp <2 x i32> [[X:%.*]] to <2 x float>
+; CHECK-NEXT:    [[UGE:%.*]] = fcmp nnan oge <2 x float> [[A]], zeroinitializer
+; CHECK-NEXT:    ret <2 x i1> [[UGE]]
+;
+  %a = uitofp <2 x i32> %x to <2 x float>
+  %uge = fcmp nnan oge <2 x float> %a, zeroinitializer
+  ret <2 x i1> %uge
+}
+
+define i1 @fabs_is_nan_or_positive_or_zero(double %x) {
+; CHECK-LABEL: @fabs_is_nan_or_positive_or_zero(
+; CHECK-NEXT:    ret i1 true
+;
+  %fabs = tail call double @llvm.fabs.f64(double %x)
+  %cmp = fcmp uge double %fabs, 0.0
+  ret i1 %cmp
+}
+
+define <2 x i1> @fabs_is_nan_or_positive_or_zero_vec(<2 x double> %x) {
+; CHECK-LABEL: @fabs_is_nan_or_positive_or_zero_vec(
+; CHECK-NEXT:    ret <2 x i1> <i1 true, i1 true>
+;
+  %fabs = tail call <2 x double> @llvm.fabs.v2f64(<2 x double> %x)
+  %cmp = fcmp uge <2 x double> %fabs, zeroinitializer
+  ret <2 x i1> %cmp
+}
+
+define i1 @fabs_nnan_is_positive_or_zero(double %x) {
+; CHECK-LABEL: @fabs_nnan_is_positive_or_zero(
+; CHECK-NEXT:    [[FABS:%.*]] = tail call double @llvm.fabs.f64(double [[X:%.*]])
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp nnan oge double [[FABS]], 0.000000e+00
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %fabs = tail call double @llvm.fabs.f64(double %x)
+  %cmp = fcmp nnan oge double %fabs, 0.0
+  ret i1 %cmp
+}
+
+define <2 x i1> @fabs_nnan_is_positive_or_zero_vec(<2 x double> %x) {
+; CHECK-LABEL: @fabs_nnan_is_positive_or_zero_vec(
+; CHECK-NEXT:    [[FABS:%.*]] = tail call <2 x double> @llvm.fabs.v2f64(<2 x double> [[X:%.*]])
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp nnan oge <2 x double> [[FABS]], zeroinitializer
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
+;
+  %fabs = tail call <2 x double> @llvm.fabs.v2f64(<2 x double> %x)
+  %cmp = fcmp nnan oge <2 x double> %fabs, zeroinitializer
+  ret <2 x i1> %cmp
+}
+
 define i1 @orderedLessZeroSelect(float, float) {
 ; CHECK-LABEL: @orderedLessZeroSelect(
 ; CHECK-NEXT:    ret i1 true
-- 
GitLab


From cbcde934b39b8f45d4e796f5727dafb6cb6b4f1d Mon Sep 17 00:00:00 2001
From: Volkan Keles <vkeles@apple.com>
Date: Tue, 30 Oct 2018 17:51:14 +0000
Subject: [PATCH 0753/1116] [InstCombine] Add preliminary tests for nested
 min/max combines. NFC

Summary: As requested in D53774.

Reviewers: spatel

Reviewed By: spatel

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D53875

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345616 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Transforms/InstCombine/maximum.ll | 60 +++++++++++++++++++++++++
 test/Transforms/InstCombine/maxnum.ll  | 60 +++++++++++++++++++++++++
 test/Transforms/InstCombine/minimum.ll | 62 +++++++++++++++++++++++++-
 test/Transforms/InstCombine/minnum.ll  | 62 +++++++++++++++++++++++++-
 4 files changed, 242 insertions(+), 2 deletions(-)

diff --git a/test/Transforms/InstCombine/maximum.ll b/test/Transforms/InstCombine/maximum.ll
index 302b21cf626..1b3114788a5 100644
--- a/test/Transforms/InstCombine/maximum.ll
+++ b/test/Transforms/InstCombine/maximum.ll
@@ -145,6 +145,66 @@ define float @maximum_f32_val_nan(float %x) {
   ret float %y
 }
 
+define float @maximum_f32_1_maximum_val_p0(float %x) {
+; CHECK-LABEL: @maximum_f32_1_maximum_val_p0(
+; CHECK-NEXT: [[Y:%.*]] = call float @llvm.maximum.f32(float %x, float 0.000000e+00)
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.maximum.f32(float [[Y]], float 1.000000e+00)
+; CHECK-NEXT: ret float [[RES]]
+  %y = call float @llvm.maximum.f32(float %x, float 0.0)
+  %z = call float @llvm.maximum.f32(float %y, float 1.0)
+  ret float %z
+}
+
+define float @maximum_f32_1_maximum_p0_val_fast(float %x) {
+; CHECK-LABEL: @maximum_f32_1_maximum_p0_val_fast(
+; CHECK-NEXT: [[Y:%.*]] = call float @llvm.maximum.f32(float %x, float 0.000000e+00)
+; CHECK-NEXT: [[RES:%.*]] = call fast float @llvm.maximum.f32(float [[Y]], float 1.000000e+00)
+; CHECK-NEXT: ret float [[RES]]
+  %y = call float @llvm.maximum.f32(float 0.0, float %x)
+  %z = call fast float @llvm.maximum.f32(float %y, float 1.0)
+  ret float %z
+}
+
+define float @maximum_f32_1_maximum_p0_val_nnan_ninf(float %x) {
+; CHECK-LABEL: @maximum_f32_1_maximum_p0_val_nnan_ninf(
+; CHECK-NEXT: [[Y:%.*]] = call float @llvm.maximum.f32(float %x, float 0.000000e+00)
+; CHECK-NEXT: [[RES:%.*]] = call nnan ninf float @llvm.maximum.f32(float [[Y]], float 1.000000e+00)
+; CHECK-NEXT: ret float [[RES]]
+  %y = call float @llvm.maximum.f32(float 0.0, float %x)
+  %z = call nnan ninf float @llvm.maximum.f32(float %y, float 1.0)
+  ret float %z
+}
+
+define float @maximum_f32_p0_maximum_val_n0(float %x) {
+; CHECK-LABEL: @maximum_f32_p0_maximum_val_n0(
+; CHECK-NEXT: [[Y:%.*]] = call float @llvm.maximum.f32(float %x, float -0.000000e+00)
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.maximum.f32(float [[Y]], float 0.000000e+00)
+; CHECK-NEXT: ret float [[RES]]
+  %y = call float @llvm.maximum.f32(float %x, float -0.0)
+  %z = call float @llvm.maximum.f32(float %y, float 0.0)
+  ret float %z
+}
+
+define float @maximum_f32_1_maximum_p0_val(float %x) {
+; CHECK-LABEL: @maximum_f32_1_maximum_p0_val(
+; CHECK-NEXT: [[Y:%.*]] = call float @llvm.maximum.f32(float %x, float 0.000000e+00)
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.maximum.f32(float [[Y]], float 1.000000e+00)
+; CHECK-NEXT: ret float [[RES]]
+  %y = call float @llvm.maximum.f32(float 0.0, float %x)
+  %z = call float @llvm.maximum.f32(float %y, float 1.0)
+  ret float %z
+}
+
+define <2 x float> @maximum_f32_1_maximum_val_p0_val_v2f32(<2 x float> %x) {
+; CHECK-LABEL: @maximum_f32_1_maximum_val_p0_val_v2f32(
+; CHECK-NEXT: [[Y:%.*]] = call <2 x float> @llvm.maximum.v2f32(<2 x float> %x, <2 x float> zeroinitializer)
+; CHECK-NEXT: [[RES:%.*]] = call <2 x float> @llvm.maximum.v2f32(<2 x float> [[Y]], <2 x float> <float 1.000000e+00, float 1.000000e+00>)
+; CHECK-NEXT: ret <2 x float> [[RES]]
+  %y = call <2 x float> @llvm.maximum.v2f32(<2 x float> %x, <2 x float> zeroinitializer)
+  %z = call <2 x float> @llvm.maximum.v2f32(<2 x float> %y, <2 x float><float 1.0, float 1.0>)
+  ret <2 x float> %z
+}
+
 define float @maximum4(float %x, float %y, float %z, float %w) {
 ; CHECK-LABEL: @maximum4(
 ; CHECK-NEXT:    [[A:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float [[Y:%.*]])
diff --git a/test/Transforms/InstCombine/maxnum.ll b/test/Transforms/InstCombine/maxnum.ll
index e3630ba3ea5..a621d99ac1f 100644
--- a/test/Transforms/InstCombine/maxnum.ll
+++ b/test/Transforms/InstCombine/maxnum.ll
@@ -145,6 +145,66 @@ define float @maxnum_f32_val_nan(float %x) {
   ret float %y
 }
 
+define float @maxnum_f32_1_maxnum_val_p0(float %x) {
+; CHECK-LABEL: @maxnum_f32_1_maxnum_val_p0(
+; CHECK-NEXT: [[Y:%.*]] = call float @llvm.maxnum.f32(float %x, float 0.000000e+00)
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.maxnum.f32(float [[Y]], float 1.000000e+00)
+; CHECK-NEXT: ret float [[RES]]
+  %y = call float @llvm.maxnum.f32(float %x, float 0.0)
+  %z = call float @llvm.maxnum.f32(float %y, float 1.0)
+  ret float %z
+}
+
+define float @maxnum_f32_1_maxnum_p0_val_fast(float %x) {
+; CHECK-LABEL: @maxnum_f32_1_maxnum_p0_val_fast(
+; CHECK-NEXT: [[Y:%.*]] = call float @llvm.maxnum.f32(float %x, float 0.000000e+00)
+; CHECK-NEXT: [[RES:%.*]] = call fast float @llvm.maxnum.f32(float [[Y]], float 1.000000e+00)
+; CHECK-NEXT: ret float [[RES]]
+  %y = call float @llvm.maxnum.f32(float 0.0, float %x)
+  %z = call fast float @llvm.maxnum.f32(float %y, float 1.0)
+  ret float %z
+}
+
+define float @maxnum_f32_1_maxnum_p0_val_nnan_ninf(float %x) {
+; CHECK-LABEL: @maxnum_f32_1_maxnum_p0_val_nnan_ninf(
+; CHECK-NEXT: [[Y:%.*]] = call float @llvm.maxnum.f32(float %x, float 0.000000e+00)
+; CHECK-NEXT: [[RES:%.*]] = call nnan ninf float @llvm.maxnum.f32(float [[Y]], float 1.000000e+00)
+; CHECK-NEXT: ret float [[RES]]
+  %y = call float @llvm.maxnum.f32(float 0.0, float %x)
+  %z = call nnan ninf float @llvm.maxnum.f32(float %y, float 1.0)
+  ret float %z
+}
+
+define float @maxnum_f32_p0_maxnum_val_n0(float %x) {
+; CHECK-LABEL: @maxnum_f32_p0_maxnum_val_n0(
+; CHECK-NEXT: [[Y:%.*]] = call float @llvm.maxnum.f32(float %x, float -0.000000e+00)
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.maxnum.f32(float [[Y]], float 0.000000e+00)
+; CHECK-NEXT: ret float [[RES]]
+  %y = call float @llvm.maxnum.f32(float %x, float -0.0)
+  %z = call float @llvm.maxnum.f32(float %y, float 0.0)
+  ret float %z
+}
+
+define float @maxnum_f32_1_maxnum_p0_val(float %x) {
+; CHECK-LABEL: @maxnum_f32_1_maxnum_p0_val(
+; CHECK-NEXT: [[Y:%.*]] = call float @llvm.maxnum.f32(float %x, float 0.000000e+00)
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.maxnum.f32(float [[Y]], float 1.000000e+00)
+; CHECK-NEXT: ret float [[RES]]
+  %y = call float @llvm.maxnum.f32(float 0.0, float %x)
+  %z = call float @llvm.maxnum.f32(float %y, float 1.0)
+  ret float %z
+}
+
+define <2 x float> @maxnum_f32_1_maxnum_val_p0_val_v2f32(<2 x float> %x) {
+; CHECK-LABEL: @maxnum_f32_1_maxnum_val_p0_val_v2f32(
+; CHECK-NEXT: [[Y:%.*]] = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %x, <2 x float> zeroinitializer)
+; CHECK-NEXT: [[RES:%.*]] = call <2 x float> @llvm.maxnum.v2f32(<2 x float> [[Y]], <2 x float> <float 1.000000e+00, float 1.000000e+00>)
+; CHECK-NEXT: ret <2 x float> [[RES]]
+  %y = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %x, <2 x float> zeroinitializer)
+  %z = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %y, <2 x float><float 1.0, float 1.0>)
+  ret <2 x float> %z
+}
+
 define float @maxnum4(float %x, float %y, float %z, float %w) {
 ; CHECK-LABEL: @maxnum4(
 ; CHECK-NEXT:    [[A:%.*]] = call float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]])
diff --git a/test/Transforms/InstCombine/minimum.ll b/test/Transforms/InstCombine/minimum.ll
index 858a3c1d377..6485a27fb52 100644
--- a/test/Transforms/InstCombine/minimum.ll
+++ b/test/Transforms/InstCombine/minimum.ll
@@ -2,7 +2,7 @@
 ; RUN: opt -S -instcombine < %s | FileCheck %s
 
 declare float @llvm.minimum.f32(float, float)
-declare float @llvm.minimum.v2f32(<2 x float>, <2 x float>)
+declare <2 x float> @llvm.minimum.v2f32(<2 x float>, <2 x float>)
 declare <4 x float> @llvm.minimum.v4f32(<4 x float>, <4 x float>)
 
 declare double @llvm.minimum.f64(double, double)
@@ -147,6 +147,66 @@ define float @minimum_f32_val_nan(float %x) {
   ret float %y
 }
 
+define float @minimum_f32_1_minimum_val_p0(float %x) {
+; CHECK-LABEL: @minimum_f32_1_minimum_val_p0(
+; CHECK-NEXT: [[Y:%.*]] = call float @llvm.minimum.f32(float %x, float 0.000000e+00)
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.minimum.f32(float [[Y]], float 1.000000e+00)
+; CHECK-NEXT: ret float [[RES]]
+  %y = call float @llvm.minimum.f32(float %x, float 0.0)
+  %z = call float @llvm.minimum.f32(float %y, float 1.0)
+  ret float %z
+}
+
+define float @minimum_f32_1_minimum_p0_val_fast(float %x) {
+; CHECK-LABEL: @minimum_f32_1_minimum_p0_val_fast(
+; CHECK-NEXT: [[Y:%.*]] = call float @llvm.minimum.f32(float %x, float 0.000000e+00)
+; CHECK-NEXT: [[RES:%.*]] = call fast float @llvm.minimum.f32(float [[Y]], float 1.000000e+00)
+; CHECK-NEXT: ret float [[RES]]
+  %y = call float @llvm.minimum.f32(float 0.0, float %x)
+  %z = call fast float @llvm.minimum.f32(float %y, float 1.0)
+  ret float %z
+}
+
+define float @minimum_f32_1_minimum_p0_val_nnan_ninf(float %x) {
+; CHECK-LABEL: @minimum_f32_1_minimum_p0_val_nnan_ninf(
+; CHECK-NEXT: [[Y:%.*]] = call float @llvm.minimum.f32(float %x, float 0.000000e+00)
+; CHECK-NEXT: [[RES:%.*]] = call nnan ninf float @llvm.minimum.f32(float [[Y]], float 1.000000e+00)
+; CHECK-NEXT: ret float [[RES]]
+  %y = call float @llvm.minimum.f32(float 0.0, float %x)
+  %z = call nnan ninf float @llvm.minimum.f32(float %y, float 1.0)
+  ret float %z
+}
+
+define float @minimum_f32_p0_minimum_val_n0(float %x) {
+; CHECK-LABEL: @minimum_f32_p0_minimum_val_n0(
+; CHECK-NEXT: [[Y:%.*]] = call float @llvm.minimum.f32(float %x, float -0.000000e+00)
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.minimum.f32(float [[Y]], float 0.000000e+00)
+; CHECK-NEXT: ret float [[RES]]
+  %y = call float @llvm.minimum.f32(float %x, float -0.0)
+  %z = call float @llvm.minimum.f32(float %y, float 0.0)
+  ret float %z
+}
+
+define float @minimum_f32_1_minimum_p0_val(float %x) {
+; CHECK-LABEL: @minimum_f32_1_minimum_p0_val(
+; CHECK-NEXT: [[Y:%.*]] = call float @llvm.minimum.f32(float %x, float 0.000000e+00)
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.minimum.f32(float [[Y]], float 1.000000e+00)
+; CHECK-NEXT: ret float [[RES]]
+  %y = call float @llvm.minimum.f32(float 0.0, float %x)
+  %z = call float @llvm.minimum.f32(float %y, float 1.0)
+  ret float %z
+}
+
+define <2 x float> @minimum_f32_1_minimum_val_p0_val_v2f32(<2 x float> %x) {
+; CHECK-LABEL: @minimum_f32_1_minimum_val_p0_val_v2f32(
+; CHECK-NEXT: [[Y:%.*]] = call <2 x float> @llvm.minimum.v2f32(<2 x float> %x, <2 x float> zeroinitializer)
+; CHECK-NEXT: [[RES:%.*]] = call <2 x float> @llvm.minimum.v2f32(<2 x float> [[Y]], <2 x float> <float 1.000000e+00, float 1.000000e+00>)
+; CHECK-NEXT: ret <2 x float> [[RES]]
+  %y = call <2 x float> @llvm.minimum.v2f32(<2 x float> %x, <2 x float> zeroinitializer)
+  %z = call <2 x float> @llvm.minimum.v2f32(<2 x float> %y, <2 x float><float 1.0, float 1.0>)
+  ret <2 x float> %z
+}
+
 define float @minimum4(float %x, float %y, float %z, float %w) {
 ; CHECK-LABEL: @minimum4(
 ; CHECK-NEXT:    [[A:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float [[Y:%.*]])
diff --git a/test/Transforms/InstCombine/minnum.ll b/test/Transforms/InstCombine/minnum.ll
index a5236d2e50f..00cf66103a9 100644
--- a/test/Transforms/InstCombine/minnum.ll
+++ b/test/Transforms/InstCombine/minnum.ll
@@ -2,7 +2,7 @@
 ; RUN: opt -S -instcombine < %s | FileCheck %s
 
 declare float @llvm.minnum.f32(float, float)
-declare float @llvm.minnum.v2f32(<2 x float>, <2 x float>)
+declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>)
 declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>)
 
 declare double @llvm.minnum.f64(double, double)
@@ -147,6 +147,66 @@ define float @minnum_f32_val_nan(float %x) {
   ret float %y
 }
 
+define float @minnum_f32_1_minnum_val_p0(float %x) {
+; CHECK-LABEL: @minnum_f32_1_minnum_val_p0(
+; CHECK-NEXT: [[Y:%.*]] = call float @llvm.minnum.f32(float %x, float 0.000000e+00)
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.minnum.f32(float [[Y]], float 1.000000e+00)
+; CHECK-NEXT: ret float [[RES]]
+  %y = call float @llvm.minnum.f32(float %x, float 0.0)
+  %z = call float @llvm.minnum.f32(float %y, float 1.0)
+  ret float %z
+}
+
+define float @minnum_f32_1_minnum_p0_val_fast(float %x) {
+; CHECK-LABEL: @minnum_f32_1_minnum_p0_val_fast(
+; CHECK-NEXT: [[Y:%.*]] = call float @llvm.minnum.f32(float %x, float 0.000000e+00)
+; CHECK-NEXT: [[RES:%.*]] = call fast float @llvm.minnum.f32(float [[Y]], float 1.000000e+00)
+; CHECK-NEXT: ret float [[RES]]
+  %y = call float @llvm.minnum.f32(float 0.0, float %x)
+  %z = call fast float @llvm.minnum.f32(float %y, float 1.0)
+  ret float %z
+}
+
+define float @minnum_f32_1_minnum_p0_val_nnan_ninf(float %x) {
+; CHECK-LABEL: @minnum_f32_1_minnum_p0_val_nnan_ninf(
+; CHECK-NEXT: [[Y:%.*]] = call float @llvm.minnum.f32(float %x, float 0.000000e+00)
+; CHECK-NEXT: [[RES:%.*]] = call nnan ninf float @llvm.minnum.f32(float [[Y]], float 1.000000e+00)
+; CHECK-NEXT: ret float [[RES]]
+  %y = call float @llvm.minnum.f32(float 0.0, float %x)
+  %z = call nnan ninf float @llvm.minnum.f32(float %y, float 1.0)
+  ret float %z
+}
+
+define float @minnum_f32_p0_minnum_val_n0(float %x) {
+; CHECK-LABEL: @minnum_f32_p0_minnum_val_n0(
+; CHECK-NEXT: [[Y:%.*]] = call float @llvm.minnum.f32(float %x, float -0.000000e+00)
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.minnum.f32(float [[Y]], float 0.000000e+00)
+; CHECK-NEXT: ret float [[RES]]
+  %y = call float @llvm.minnum.f32(float %x, float -0.0)
+  %z = call float @llvm.minnum.f32(float %y, float 0.0)
+  ret float %z
+}
+
+define float @minnum_f32_1_minnum_p0_val(float %x) {
+; CHECK-LABEL: @minnum_f32_1_minnum_p0_val(
+; CHECK-NEXT: [[Y:%.*]] = call float @llvm.minnum.f32(float %x, float 0.000000e+00)
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.minnum.f32(float [[Y]], float 1.000000e+00)
+; CHECK-NEXT: ret float [[RES]]
+  %y = call float @llvm.minnum.f32(float 0.0, float %x)
+  %z = call float @llvm.minnum.f32(float %y, float 1.0)
+  ret float %z
+}
+
+define <2 x float> @minnum_f32_1_minnum_val_p0_val_v2f32(<2 x float> %x) {
+; CHECK-LABEL: @minnum_f32_1_minnum_val_p0_val_v2f32(
+; CHECK-NEXT: [[Y:%.*]] = call <2 x float> @llvm.minnum.v2f32(<2 x float> %x, <2 x float> zeroinitializer)
+; CHECK-NEXT: [[RES:%.*]] = call <2 x float> @llvm.minnum.v2f32(<2 x float> [[Y]], <2 x float> <float 1.000000e+00, float 1.000000e+00>)
+; CHECK-NEXT: ret <2 x float> [[RES]]
+  %y = call <2 x float> @llvm.minnum.v2f32(<2 x float> %x, <2 x float> zeroinitializer)
+  %z = call <2 x float> @llvm.minnum.v2f32(<2 x float> %y, <2 x float><float 1.0, float 1.0>)
+  ret <2 x float> %z
+}
+
 define float @minnum4(float %x, float %y, float %z, float %w) {
 ; CHECK-LABEL: @minnum4(
 ; CHECK-NEXT:    [[A:%.*]] = call float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]])
-- 
GitLab


From 38ad550dd9adf9ceb49422c2206dcf1f769d42cb Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 30 Oct 2018 18:10:02 +0000
Subject: [PATCH 0754/1116] [TTI] Fix uses of SK_ExtractSubvector shuffle costs
 (PR39368)

Correct costings of SK_ExtractSubvector requires the SubTy argument to indicate the type/size of the extracted subvector.

Unlike the rest of the shuffle kinds this means that the main Ty argument represents the source vector type not the destination!

I've done my best to fix a number of vectorizer uses:

SLP - the reduction epilogue costs should be using a SK_PermuteSingleSrc shuffle as these all occur at the hardware vector width - we're not extracting (illegal) subvector types. This is causing the cost model diffs as SK_ExtractSubvector costs are poorly handled and tend to just return 1 at the moment.

LV - I'm not clear on what the SK_ExtractSubvector should represents for recurrences - I've used a <1 x ?> subvector extraction as that seems to match the VF delta.

Differential Revision: https://reviews.llvm.org/D53573

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345617 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/TargetTransformInfo.h   |   4 +-
 include/llvm/CodeGen/BasicTTIImpl.h           |  18 +-
 lib/Transforms/Vectorize/LoopVectorize.cpp    |   2 +-
 .../CostModel/AArch64/vector-reduce.ll        |  44 +--
 test/Analysis/CostModel/X86/reduce-add.ll     |  72 ++---
 test/Analysis/CostModel/X86/reduce-and.ll     | 293 ++++++++++++------
 test/Analysis/CostModel/X86/reduce-mul.ll     | 104 ++++---
 test/Analysis/CostModel/X86/reduce-or.ll      | 293 ++++++++++++------
 test/Analysis/CostModel/X86/reduce-smax.ll    |  40 +--
 test/Analysis/CostModel/X86/reduce-smin.ll    |  40 +--
 test/Analysis/CostModel/X86/reduce-umax.ll    |  40 +--
 test/Analysis/CostModel/X86/reduce-umin.ll    |  40 +--
 test/Analysis/CostModel/X86/reduce-xor.ll     | 293 ++++++++++++------
 test/Analysis/CostModel/X86/reduction.ll      |   4 +-
 14 files changed, 799 insertions(+), 488 deletions(-)

diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h
index c2a9d1ec195..3edbe9fb7c5 100644
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h
@@ -770,7 +770,9 @@ public:
 
   /// \return The cost of a shuffle instruction of kind Kind and of type Tp.
   /// The index and subtype parameters are used by the subvector insertion and
-  /// extraction shuffle kinds.
+  /// extraction shuffle kinds to show the insert/extract point and the type of
+  /// the subvector being inserted/extracted. 
+  /// NOTE: For subvector extractions Tp represents the source type.
   int getShuffleCost(ShuffleKind Kind, Type *Tp, int Index = 0,
                      Type *SubTp = nullptr) const;
 
diff --git a/include/llvm/CodeGen/BasicTTIImpl.h b/include/llvm/CodeGen/BasicTTIImpl.h
index 0cd38617123..daf5fbfef8e 100644
--- a/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1334,12 +1334,13 @@ public:
         LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
     while (NumVecElts > MVTLen) {
       NumVecElts /= 2;
+      Type *SubTy = VectorType::get(ScalarTy, NumVecElts);
       // Assume the pairwise shuffles add a cost.
       ShuffleCost += (IsPairwise + 1) *
                      ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
-                                                 NumVecElts, Ty);
+                                                 NumVecElts, SubTy);
       ArithCost += ConcreteTTI->getArithmeticInstrCost(Opcode, Ty);
-      Ty = VectorType::get(ScalarTy, NumVecElts);
+      Ty = SubTy;
       ++LongVectorCount;
     }
     // The minimal length of the vector is limited by the real length of vector
@@ -1347,8 +1348,8 @@ public:
     // reduction operations are performed on the vectors with the same
     // architecture-dependent length.
     ShuffleCost += (NumReduxLevels - LongVectorCount) * (IsPairwise + 1) *
-                   ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
-                                               NumVecElts, Ty);
+                   ConcreteTTI->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
+                                               0, Ty);
     ArithCost += (NumReduxLevels - LongVectorCount) *
                  ConcreteTTI->getArithmeticInstrCost(Opcode, Ty);
     return ShuffleCost + ArithCost + getScalarizationOverhead(Ty, false, true);
@@ -1381,15 +1382,16 @@ public:
         LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
     while (NumVecElts > MVTLen) {
       NumVecElts /= 2;
+      Type *SubTy = VectorType::get(ScalarTy, NumVecElts);
       // Assume the pairwise shuffles add a cost.
       ShuffleCost += (IsPairwise + 1) *
                      ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
-                                                 NumVecElts, Ty);
+                                                 NumVecElts, SubTy);
       MinMaxCost +=
           ConcreteTTI->getCmpSelInstrCost(CmpOpcode, Ty, CondTy, nullptr) +
           ConcreteTTI->getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
                                           nullptr);
-      Ty = VectorType::get(ScalarTy, NumVecElts);
+      Ty = SubTy;
       CondTy = VectorType::get(ScalarCondTy, NumVecElts);
       ++LongVectorCount;
     }
@@ -1398,8 +1400,8 @@ public:
     // reduction opertions are perfomed on the vectors with the same
     // architecture-dependent length.
     ShuffleCost += (NumReduxLevels - LongVectorCount) * (IsPairwise + 1) *
-                   ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
-                                               NumVecElts, Ty);
+                   ConcreteTTI->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
+                                               0, Ty);
     MinMaxCost +=
         (NumReduxLevels - LongVectorCount) *
         (ConcreteTTI->getCmpSelInstrCost(CmpOpcode, Ty, CondTy, nullptr) +
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 006c13c233e..ffa6b242e00 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5754,7 +5754,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
     // First-order recurrences are replaced by vector shuffles inside the loop.
     if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
       return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
-                                VectorTy, VF - 1, VectorTy);
+                                VectorTy, VF - 1, ToVectorTy(RetTy, 1));
 
     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
     // converted into select instructions. We require N - 1 selects per phi
diff --git a/test/Analysis/CostModel/AArch64/vector-reduce.ll b/test/Analysis/CostModel/AArch64/vector-reduce.ll
index 5bf50764e2e..c268a18e7f8 100644
--- a/test/Analysis/CostModel/AArch64/vector-reduce.ll
+++ b/test/Analysis/CostModel/AArch64/vector-reduce.ll
@@ -47,7 +47,7 @@ define i32 @add.i32.v4i32(<4 x i32> %v) {
 }
 
 ; COST-LABEL: umin.i8.v8i8
-; COST:       Found an estimated cost of 157 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> %v)
+; COST:       Found an estimated cost of 280 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> %v)
 ; CODE-LABEL: umin.i8.v8i8
 ; CODE:       uminv b0, v0.8b
 define i8 @umin.i8.v8i8(<8 x i8> %v) {
@@ -56,7 +56,7 @@ define i8 @umin.i8.v8i8(<8 x i8> %v) {
 }
 
 ; COST-LABEL: umin.i8.v16i8
-; COST:       Found an estimated cost of 388 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> %v)
+; COST:       Found an estimated cost of 744 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> %v)
 ; CODE-LABEL: umin.i8.v16i8
 ; CODE:       uminv b0, v0.16b
 define i8 @umin.i8.v16i8(<16 x i8> %v) {
@@ -65,7 +65,7 @@ define i8 @umin.i8.v16i8(<16 x i8> %v) {
 }
 
 ; COST-LABEL: umin.i16.v4i16
-; COST:       Found an estimated cost of 58 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> %v)
+; COST:       Found an estimated cost of 92 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> %v)
 ; CODE-LABEL: umin.i16.v4i16
 ; CODE:       uminv h0, v0.4h
 define i16 @umin.i16.v4i16(<4 x i16> %v) {
@@ -74,7 +74,7 @@ define i16 @umin.i16.v4i16(<4 x i16> %v) {
 }
 
 ; COST-LABEL: umin.i16.v8i16
-; COST:       Found an estimated cost of 157 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> %v)
+; COST:       Found an estimated cost of 280 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> %v)
 ; CODE-LABEL: umin.i16.v8i16
 ; CODE:       uminv h0, v0.8h
 define i16 @umin.i16.v8i16(<8 x i16> %v) {
@@ -83,7 +83,7 @@ define i16 @umin.i16.v8i16(<8 x i16> %v) {
 }
 
 ; COST-LABEL: umin.i32.v4i32
-; COST:       Found an estimated cost of 58 for instruction: %r = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> %v)
+; COST:       Found an estimated cost of 62 for instruction: %r = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> %v)
 ; CODE-LABEL: umin.i32.v4i32
 ; CODE:       uminv s0, v0.4s
 define i32 @umin.i32.v4i32(<4 x i32> %v) {
@@ -92,7 +92,7 @@ define i32 @umin.i32.v4i32(<4 x i32> %v) {
 }
 
 ; COST-LABEL: umax.i8.v8i8
-; COST:       Found an estimated cost of 157 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> %v)
+; COST:       Found an estimated cost of 280 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> %v)
 ; CODE-LABEL: umax.i8.v8i8
 ; CODE:       umaxv b0, v0.8b
 define i8 @umax.i8.v8i8(<8 x i8> %v) {
@@ -101,7 +101,7 @@ define i8 @umax.i8.v8i8(<8 x i8> %v) {
 }
 
 ; COST-LABEL: umax.i8.v16i8
-; COST:       Found an estimated cost of 388 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> %v)
+; COST:       Found an estimated cost of 744 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> %v)
 ; CODE-LABEL: umax.i8.v16i8
 ; CODE:       umaxv b0, v0.16b
 define i8 @umax.i8.v16i8(<16 x i8> %v) {
@@ -110,7 +110,7 @@ define i8 @umax.i8.v16i8(<16 x i8> %v) {
 }
 
 ; COST-LABEL: umax.i16.v4i16
-; COST:       Found an estimated cost of 58 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> %v)
+; COST:       Found an estimated cost of 92 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> %v)
 ; CODE-LABEL: umax.i16.v4i16
 ; CODE:       umaxv h0, v0.4h
 define i16 @umax.i16.v4i16(<4 x i16> %v) {
@@ -119,7 +119,7 @@ define i16 @umax.i16.v4i16(<4 x i16> %v) {
 }
 
 ; COST-LABEL: umax.i16.v8i16
-; COST:       Found an estimated cost of 157 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> %v)
+; COST:       Found an estimated cost of 280 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> %v)
 ; CODE-LABEL: umax.i16.v8i16
 ; CODE:       umaxv h0, v0.8h
 define i16 @umax.i16.v8i16(<8 x i16> %v) {
@@ -128,7 +128,7 @@ define i16 @umax.i16.v8i16(<8 x i16> %v) {
 }
 
 ; COST-LABEL: umax.i32.v4i32
-; COST:       Found an estimated cost of 58 for instruction: %r = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> %v)
+; COST:       Found an estimated cost of 62 for instruction: %r = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> %v)
 ; CODE-LABEL: umax.i32.v4i32
 ; CODE:       umaxv s0, v0.4s
 define i32 @umax.i32.v4i32(<4 x i32> %v) {
@@ -137,7 +137,7 @@ define i32 @umax.i32.v4i32(<4 x i32> %v) {
 }
 
 ; COST-LABEL: smin.i8.v8i8
-; COST:       Found an estimated cost of 157 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> %v)
+; COST:       Found an estimated cost of 280 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> %v)
 ; CODE-LABEL: smin.i8.v8i8
 ; CODE:       sminv b0, v0.8b
 define i8 @smin.i8.v8i8(<8 x i8> %v) {
@@ -146,7 +146,7 @@ define i8 @smin.i8.v8i8(<8 x i8> %v) {
 }
 
 ; COST-LABEL: smin.i8.v16i8
-; COST:       Found an estimated cost of 388 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> %v)
+; COST:       Found an estimated cost of 744 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> %v)
 ; CODE-LABEL: smin.i8.v16i8
 ; CODE:       sminv b0, v0.16b
 define i8 @smin.i8.v16i8(<16 x i8> %v) {
@@ -155,7 +155,7 @@ define i8 @smin.i8.v16i8(<16 x i8> %v) {
 }
 
 ; COST-LABEL: smin.i16.v4i16
-; COST:       Found an estimated cost of 58 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> %v)
+; COST:       Found an estimated cost of 92 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> %v)
 ; CODE-LABEL: smin.i16.v4i16
 ; CODE:       sminv h0, v0.4h
 define i16 @smin.i16.v4i16(<4 x i16> %v) {
@@ -164,7 +164,7 @@ define i16 @smin.i16.v4i16(<4 x i16> %v) {
 }
 
 ; COST-LABEL: smin.i16.v8i16
-; COST:       Found an estimated cost of 157 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> %v)
+; COST:       Found an estimated cost of 280 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> %v)
 ; CODE-LABEL: smin.i16.v8i16
 ; CODE:       sminv h0, v0.8h
 define i16 @smin.i16.v8i16(<8 x i16> %v) {
@@ -173,7 +173,7 @@ define i16 @smin.i16.v8i16(<8 x i16> %v) {
 }
 
 ; COST-LABEL: smin.i32.v4i32
-; COST:       Found an estimated cost of 58 for instruction: %r = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> %v)
+; COST:       Found an estimated cost of 62 for instruction: %r = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> %v)
 ; CODE-LABEL: smin.i32.v4i32
 ; CODE:       sminv s0, v0.4s
 define i32 @smin.i32.v4i32(<4 x i32> %v) {
@@ -182,7 +182,7 @@ define i32 @smin.i32.v4i32(<4 x i32> %v) {
 }
 
 ; COST-LABEL: smax.i8.v8i8
-; COST:       Found an estimated cost of 157 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> %v)
+; COST:       Found an estimated cost of 280 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> %v)
 ; CODE-LABEL: smax.i8.v8i8
 ; CODE:       smaxv b0, v0.8b
 define i8 @smax.i8.v8i8(<8 x i8> %v) {
@@ -191,7 +191,7 @@ define i8 @smax.i8.v8i8(<8 x i8> %v) {
 }
 
 ; COST-LABEL: smax.i8.v16i8
-; COST:       Found an estimated cost of 388 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> %v)
+; COST:       Found an estimated cost of 744 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> %v)
 ; CODE-LABEL: smax.i8.v16i8
 ; CODE:       smaxv b0, v0.16b
 define i8 @smax.i8.v16i8(<16 x i8> %v) {
@@ -200,7 +200,7 @@ define i8 @smax.i8.v16i8(<16 x i8> %v) {
 }
 
 ; COST-LABEL: smax.i16.v4i16
-; COST:       Found an estimated cost of 58 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> %v)
+; COST:       Found an estimated cost of 92 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> %v)
 ; CODE-LABEL: smax.i16.v4i16
 ; CODE:       smaxv h0, v0.4h
 define i16 @smax.i16.v4i16(<4 x i16> %v) {
@@ -209,7 +209,7 @@ define i16 @smax.i16.v4i16(<4 x i16> %v) {
 }
 
 ; COST-LABEL: smax.i16.v8i16
-; COST:       Found an estimated cost of 157 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> %v)
+; COST:       Found an estimated cost of 280 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> %v)
 ; CODE-LABEL: smax.i16.v8i16
 ; CODE:       smaxv h0, v0.8h
 define i16 @smax.i16.v8i16(<8 x i16> %v) {
@@ -218,7 +218,7 @@ define i16 @smax.i16.v8i16(<8 x i16> %v) {
 }
 
 ; COST-LABEL: smax.i32.v4i32
-; COST:       Found an estimated cost of 58 for instruction: %r = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> %v)
+; COST:       Found an estimated cost of 62 for instruction: %r = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> %v)
 ; CODE-LABEL: smax.i32.v4i32
 ; CODE:       smaxv s0, v0.4s
 define i32 @smax.i32.v4i32(<4 x i32> %v) {
@@ -227,7 +227,7 @@ define i32 @smax.i32.v4i32(<4 x i32> %v) {
 }
 
 ; COST-LABEL: fmin.f32.v4f32
-; COST:       Found an estimated cost of 58 for instruction: %r = call nnan float @llvm.experimental.vector.reduce.fmin.f32.v4f32(<4 x float> %v)
+; COST:       Found an estimated cost of 62 for instruction: %r = call nnan float @llvm.experimental.vector.reduce.fmin.f32.v4f32(<4 x float> %v)
 ; CODE-LABEL: fmin.f32.v4f32
 ; CODE:       fminnmv s0, v0.4s
 define float @fmin.f32.v4f32(<4 x float> %v) {
@@ -236,7 +236,7 @@ define float @fmin.f32.v4f32(<4 x float> %v) {
 }
 
 ; COST-LABEL: fmax.f32.v4f32
-; COST:       Found an estimated cost of 58 for instruction: %r = call nnan float @llvm.experimental.vector.reduce.fmax.f32.v4f32(<4 x float> %v)
+; COST:       Found an estimated cost of 62 for instruction: %r = call nnan float @llvm.experimental.vector.reduce.fmax.f32.v4f32(<4 x float> %v)
 ; CODE-LABEL: fmax.f32.v4f32
 ; CODE:       fmaxnmv s0, v0.4s
 define float @fmax.f32.v4f32(<4 x float> %v) {
diff --git a/test/Analysis/CostModel/X86/reduce-add.ll b/test/Analysis/CostModel/X86/reduce-add.ll
index 046aaf04e33..97f7a75ffa2 100644
--- a/test/Analysis/CostModel/X86/reduce-add.ll
+++ b/test/Analysis/CostModel/X86/reduce-add.ll
@@ -109,10 +109,10 @@ define i32 @reduce_i32(i32 %arg) {
 define i32 @reduce_i16(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i16'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i16'
@@ -134,25 +134,25 @@ define i32 @reduce_i16(i32 %arg) {
 ; AVX1-LABEL: 'reduce_i16'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i16'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i16'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i16'
@@ -166,9 +166,9 @@ define i32 @reduce_i16(i32 %arg) {
 ; AVX512DQ-LABEL: 'reduce_i16'
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V4  = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
@@ -181,11 +181,11 @@ define i32 @reduce_i16(i32 %arg) {
 
 define i32 @reduce_i8(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i8'
@@ -207,41 +207,41 @@ define i32 @reduce_i8(i32 %arg) {
 ; AVX1-LABEL: 'reduce_i8'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 101 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 118 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i8'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i8'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i8'
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 118 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 121 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i8'
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V8   = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
diff --git a/test/Analysis/CostModel/X86/reduce-and.ll b/test/Analysis/CostModel/X86/reduce-and.ll
index 18abdd4a6dc..1dfa0953c28 100644
--- a/test/Analysis/CostModel/X86/reduce-and.ll
+++ b/test/Analysis/CostModel/X86/reduce-and.ll
@@ -17,13 +17,21 @@ define i32 @reduce_i64(i32 %arg) {
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64> undef)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; AVX-LABEL: 'reduce_i64'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.i64.v1i64(<1 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-LABEL: 'reduce_i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.i64.v1i64(<1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.i64.v1i64(<1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i64'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.i64.v1i64(<1 x i64> undef)
@@ -50,13 +58,21 @@ define i32 @reduce_i32(i32 %arg) {
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32> undef)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; AVX-LABEL: 'reduce_i32'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-LABEL: 'reduce_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i32'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> undef)
@@ -75,28 +91,52 @@ define i32 @reduce_i32(i32 %arg) {
 }
 
 define i32 @reduce_i16(i32 %arg) {
-; SSE-LABEL: 'reduce_i16'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-LABEL: 'reduce_i16'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; AVX-LABEL: 'reduce_i16'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE42-LABEL: 'reduce_i16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i16'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i16'
@@ -110,9 +150,9 @@ define i32 @reduce_i16(i32 %arg) {
 ; AVX512DQ-LABEL: 'reduce_i16'
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V4  = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
@@ -124,44 +164,68 @@ define i32 @reduce_i16(i32 %arg) {
 }
 
 define i32 @reduce_i8(i32 %arg) {
-; SSE-LABEL: 'reduce_i8'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-LABEL: 'reduce_i8'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; AVX-LABEL: 'reduce_i8'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSSE3-LABEL: 'reduce_i8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 85 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i8'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i8'
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 118 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 121 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i8'
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V8   = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
@@ -173,59 +237,92 @@ define i32 @reduce_i8(i32 %arg) {
 }
 
 define i32 @reduce_i1(i32 %arg) {
-; SSE-LABEL: 'reduce_i1'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-LABEL: 'reduce_i1'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i1'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i1'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i1'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 85 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; AVX-LABEL: 'reduce_i1'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX2-LABEL: 'reduce_i1'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i1'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 148 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 165 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i1'
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 148 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 357 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 838 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 841 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i1'
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 148 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 165 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V1   = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
diff --git a/test/Analysis/CostModel/X86/reduce-mul.ll b/test/Analysis/CostModel/X86/reduce-mul.ll
index 1e659a180fb..97e67a92f8f 100644
--- a/test/Analysis/CostModel/X86/reduce-mul.ll
+++ b/test/Analysis/CostModel/X86/reduce-mul.ll
@@ -20,9 +20,9 @@ define i32 @reduce_i64(i32 %arg) {
 ; AVX1-LABEL: 'reduce_i64'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 152 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 154 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i64'
@@ -93,9 +93,9 @@ define i32 @reduce_i32(i32 %arg) {
 ; AVX1-LABEL: 'reduce_i32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i32'
@@ -141,10 +141,10 @@ define i32 @reduce_i32(i32 %arg) {
 define i32 @reduce_i16(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i16'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i16'
@@ -166,25 +166,25 @@ define i32 @reduce_i16(i32 %arg) {
 ; AVX1-LABEL: 'reduce_i16'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i16'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i16'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i16'
@@ -198,9 +198,9 @@ define i32 @reduce_i16(i32 %arg) {
 ; AVX512DQ-LABEL: 'reduce_i16'
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V4  = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
@@ -212,52 +212,68 @@ define i32 @reduce_i16(i32 %arg) {
 }
 
 define i32 @reduce_i8(i32 %arg) {
-; SSE-LABEL: 'reduce_i8'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 239 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-LABEL: 'reduce_i8'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 129 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 178 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 275 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 239 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 239 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i8'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 167 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 220 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 325 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 202 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 255 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 360 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i8'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 122 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 157 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 226 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 137 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 172 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 241 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i8'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 129 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 182 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 117 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 197 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i8'
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 159 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 178 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 201 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i8'
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 129 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 182 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 117 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 197 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V8   = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
diff --git a/test/Analysis/CostModel/X86/reduce-or.ll b/test/Analysis/CostModel/X86/reduce-or.ll
index 47e473147a9..13814ac2b76 100644
--- a/test/Analysis/CostModel/X86/reduce-or.ll
+++ b/test/Analysis/CostModel/X86/reduce-or.ll
@@ -17,13 +17,21 @@ define i32 @reduce_i64(i32 %arg) {
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64> undef)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; AVX-LABEL: 'reduce_i64'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.i64.v1i64(<1 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-LABEL: 'reduce_i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.i64.v1i64(<1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.i64.v1i64(<1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i64'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.i64.v1i64(<1 x i64> undef)
@@ -50,13 +58,21 @@ define i32 @reduce_i32(i32 %arg) {
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32> undef)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; AVX-LABEL: 'reduce_i32'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-LABEL: 'reduce_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i32'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32> undef)
@@ -75,28 +91,52 @@ define i32 @reduce_i32(i32 %arg) {
 }
 
 define i32 @reduce_i16(i32 %arg) {
-; SSE-LABEL: 'reduce_i16'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-LABEL: 'reduce_i16'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; AVX-LABEL: 'reduce_i16'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE42-LABEL: 'reduce_i16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i16'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i16'
@@ -110,9 +150,9 @@ define i32 @reduce_i16(i32 %arg) {
 ; AVX512DQ-LABEL: 'reduce_i16'
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V4  = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
@@ -124,44 +164,68 @@ define i32 @reduce_i16(i32 %arg) {
 }
 
 define i32 @reduce_i8(i32 %arg) {
-; SSE-LABEL: 'reduce_i8'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-LABEL: 'reduce_i8'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; AVX-LABEL: 'reduce_i8'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSSE3-LABEL: 'reduce_i8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 85 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i8'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i8'
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 118 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 121 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i8'
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V8   = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
@@ -173,59 +237,92 @@ define i32 @reduce_i8(i32 %arg) {
 }
 
 define i32 @reduce_i1(i32 %arg) {
-; SSE-LABEL: 'reduce_i1'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-LABEL: 'reduce_i1'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i1'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i1'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i1'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 85 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; AVX-LABEL: 'reduce_i1'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX2-LABEL: 'reduce_i1'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i1'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 148 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 165 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i1'
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 148 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 357 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 838 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 841 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i1'
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 148 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 165 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V1   = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
diff --git a/test/Analysis/CostModel/X86/reduce-smax.ll b/test/Analysis/CostModel/X86/reduce-smax.ll
index 23d8b2c5c18..5426c7f9c80 100644
--- a/test/Analysis/CostModel/X86/reduce-smax.ll
+++ b/test/Analysis/CostModel/X86/reduce-smax.ll
@@ -36,9 +36,9 @@ define i32 @reduce_i64(i32 %arg) {
 ; AVX1-LABEL: 'reduce_i64'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i64'
@@ -125,10 +125,10 @@ define i32 @reduce_i32(i32 %arg) {
 define i32 @reduce_i16(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i16'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i16'
@@ -150,9 +150,9 @@ define i32 @reduce_i16(i32 %arg) {
 ; AVX1-LABEL: 'reduce_i16'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 105 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 101 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i16'
@@ -197,11 +197,11 @@ define i32 @reduce_i16(i32 %arg) {
 
 define i32 @reduce_i8(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 111 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i8'
@@ -223,9 +223,9 @@ define i32 @reduce_i8(i32 %arg) {
 ; AVX1-LABEL: 'reduce_i8'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 159 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 173 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 194 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i8'
@@ -248,8 +248,8 @@ define i32 @reduce_i8(i32 %arg) {
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 211 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 253 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 258 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i8'
diff --git a/test/Analysis/CostModel/X86/reduce-smin.ll b/test/Analysis/CostModel/X86/reduce-smin.ll
index 0b3c72b9daa..b8076a98513 100644
--- a/test/Analysis/CostModel/X86/reduce-smin.ll
+++ b/test/Analysis/CostModel/X86/reduce-smin.ll
@@ -36,9 +36,9 @@ define i32 @reduce_i64(i32 %arg) {
 ; AVX1-LABEL: 'reduce_i64'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i64'
@@ -125,10 +125,10 @@ define i32 @reduce_i32(i32 %arg) {
 define i32 @reduce_i16(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i16'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i16'
@@ -150,9 +150,9 @@ define i32 @reduce_i16(i32 %arg) {
 ; AVX1-LABEL: 'reduce_i16'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 105 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 101 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i16'
@@ -197,11 +197,11 @@ define i32 @reduce_i16(i32 %arg) {
 
 define i32 @reduce_i8(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 111 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i8'
@@ -223,9 +223,9 @@ define i32 @reduce_i8(i32 %arg) {
 ; AVX1-LABEL: 'reduce_i8'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 159 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 173 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 194 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i8'
@@ -248,8 +248,8 @@ define i32 @reduce_i8(i32 %arg) {
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 211 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 253 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 258 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i8'
diff --git a/test/Analysis/CostModel/X86/reduce-umax.ll b/test/Analysis/CostModel/X86/reduce-umax.ll
index ae542a07dd3..6b947ebc225 100644
--- a/test/Analysis/CostModel/X86/reduce-umax.ll
+++ b/test/Analysis/CostModel/X86/reduce-umax.ll
@@ -36,9 +36,9 @@ define i32 @reduce_i64(i32 %arg) {
 ; AVX1-LABEL: 'reduce_i64'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i64'
@@ -125,10 +125,10 @@ define i32 @reduce_i32(i32 %arg) {
 define i32 @reduce_i16(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i16'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i16'
@@ -150,9 +150,9 @@ define i32 @reduce_i16(i32 %arg) {
 ; AVX1-LABEL: 'reduce_i16'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 105 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 101 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i16'
@@ -197,11 +197,11 @@ define i32 @reduce_i16(i32 %arg) {
 
 define i32 @reduce_i8(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 111 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i8'
@@ -223,9 +223,9 @@ define i32 @reduce_i8(i32 %arg) {
 ; AVX1-LABEL: 'reduce_i8'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 159 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 173 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 194 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i8'
@@ -248,8 +248,8 @@ define i32 @reduce_i8(i32 %arg) {
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 211 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 253 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 258 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i8'
diff --git a/test/Analysis/CostModel/X86/reduce-umin.ll b/test/Analysis/CostModel/X86/reduce-umin.ll
index 3462c6ec0c6..0fe9029bc82 100644
--- a/test/Analysis/CostModel/X86/reduce-umin.ll
+++ b/test/Analysis/CostModel/X86/reduce-umin.ll
@@ -36,9 +36,9 @@ define i32 @reduce_i64(i32 %arg) {
 ; AVX1-LABEL: 'reduce_i64'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i64'
@@ -125,10 +125,10 @@ define i32 @reduce_i32(i32 %arg) {
 define i32 @reduce_i16(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i16'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i16'
@@ -150,9 +150,9 @@ define i32 @reduce_i16(i32 %arg) {
 ; AVX1-LABEL: 'reduce_i16'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 105 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 101 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i16'
@@ -197,11 +197,11 @@ define i32 @reduce_i16(i32 %arg) {
 
 define i32 @reduce_i8(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 111 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i8'
@@ -223,9 +223,9 @@ define i32 @reduce_i8(i32 %arg) {
 ; AVX1-LABEL: 'reduce_i8'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 159 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 173 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 194 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i8'
@@ -248,8 +248,8 @@ define i32 @reduce_i8(i32 %arg) {
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 211 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 253 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 258 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i8'
diff --git a/test/Analysis/CostModel/X86/reduce-xor.ll b/test/Analysis/CostModel/X86/reduce-xor.ll
index e7eb295f63f..f8e82d05aa7 100644
--- a/test/Analysis/CostModel/X86/reduce-xor.ll
+++ b/test/Analysis/CostModel/X86/reduce-xor.ll
@@ -17,13 +17,21 @@ define i32 @reduce_i64(i32 %arg) {
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64> undef)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; AVX-LABEL: 'reduce_i64'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.xor.i64.v1i64(<1 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-LABEL: 'reduce_i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.xor.i64.v1i64(<1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.xor.i64.v1i64(<1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i64'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.xor.i64.v1i64(<1 x i64> undef)
@@ -50,13 +58,21 @@ define i32 @reduce_i32(i32 %arg) {
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32> undef)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; AVX-LABEL: 'reduce_i32'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-LABEL: 'reduce_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i32'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32> undef)
@@ -75,28 +91,52 @@ define i32 @reduce_i32(i32 %arg) {
 }
 
 define i32 @reduce_i16(i32 %arg) {
-; SSE-LABEL: 'reduce_i16'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-LABEL: 'reduce_i16'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; AVX-LABEL: 'reduce_i16'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE42-LABEL: 'reduce_i16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i16'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i16'
@@ -110,9 +150,9 @@ define i32 @reduce_i16(i32 %arg) {
 ; AVX512DQ-LABEL: 'reduce_i16'
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V4  = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
@@ -124,44 +164,68 @@ define i32 @reduce_i16(i32 %arg) {
 }
 
 define i32 @reduce_i8(i32 %arg) {
-; SSE-LABEL: 'reduce_i8'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-LABEL: 'reduce_i8'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; AVX-LABEL: 'reduce_i8'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSSE3-LABEL: 'reduce_i8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 85 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i8'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i8'
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 118 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 121 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i8'
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V8   = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
@@ -173,59 +237,92 @@ define i32 @reduce_i8(i32 %arg) {
 }
 
 define i32 @reduce_i1(i32 %arg) {
-; SSE-LABEL: 'reduce_i1'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-LABEL: 'reduce_i1'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i1'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i1'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i1'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 85 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; AVX-LABEL: 'reduce_i1'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX2-LABEL: 'reduce_i1'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i1'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 148 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 165 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i1'
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 148 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 357 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 838 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 841 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i1'
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 148 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 165 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V1   = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
diff --git a/test/Analysis/CostModel/X86/reduction.ll b/test/Analysis/CostModel/X86/reduction.ll
index 306a46d21ce..04e40d72246 100644
--- a/test/Analysis/CostModel/X86/reduction.ll
+++ b/test/Analysis/CostModel/X86/reduction.ll
@@ -614,7 +614,7 @@ define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
 ;
 ; SSSE3-LABEL: 'no_pairwise_reduction8i16'
@@ -1113,7 +1113,7 @@ define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
 ;
 ; SSSE3-LABEL: 'pairwise_reduction8i16'
-- 
GitLab


From 0a9adfeb4f04b5e0577c76c9eba322f0ba519e65 Mon Sep 17 00:00:00 2001
From: Ulrich Weigand <ulrich.weigand@de.ibm.com>
Date: Tue, 30 Oct 2018 18:20:59 +0000
Subject: [PATCH 0755/1116] [SystemZ] Simplify LRV/STRV ISD nodes

The LRV and STRV nodes carry an extra operand to indicate the
type of the memory access.  This is redundant, since the nodes
are actually of class MemIntrinsicNode and therefore hold that
same information already as MemoryVT.

NFC intended.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345618 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/SystemZ/SystemZISelLowering.cpp | 12 +++----
 lib/Target/SystemZ/SystemZISelLowering.h   | 14 ++------
 lib/Target/SystemZ/SystemZInstrInfo.td     | 19 +++++-----
 lib/Target/SystemZ/SystemZOperators.td     | 42 ++++++++++++----------
 4 files changed, 40 insertions(+), 47 deletions(-)

diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index d86737e2192..d2c33546716 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -5398,8 +5398,7 @@ SDValue SystemZTargetLowering::combineSTORE(
         BSwapOp = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), MVT::i32, BSwapOp);
 
       SDValue Ops[] = {
-        N->getOperand(0), BSwapOp, N->getOperand(2),
-        DAG.getValueType(Op1.getValueType())
+        N->getOperand(0), BSwapOp, N->getOperand(2)
       };
 
       return
@@ -5496,13 +5495,14 @@ SDValue SystemZTargetLowering::combineBSWAP(
       // Create the byte-swapping load.
       SDValue Ops[] = {
         LD->getChain(),    // Chain
-        LD->getBasePtr(),  // Ptr
-        DAG.getValueType(N->getValueType(0)) // VT
+        LD->getBasePtr()   // Ptr
       };
+      EVT LoadVT = N->getValueType(0);
+      if (LoadVT == MVT::i16)
+        LoadVT = MVT::i32;
       SDValue BSLoad =
         DAG.getMemIntrinsicNode(SystemZISD::LRV, SDLoc(N),
-                                DAG.getVTList(N->getValueType(0) == MVT::i64 ?
-                                              MVT::i64 : MVT::i32, MVT::Other),
+                                DAG.getVTList(LoadVT, MVT::Other),
                                 Ops, LD->getMemoryVT(), LD->getMemOperand());
 
       // If this is an i16 load, insert the truncate.
diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h
index 4b6be9bff0a..6a29ed62065 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/lib/Target/SystemZ/SystemZISelLowering.h
@@ -337,18 +337,8 @@ enum NodeType : unsigned {
   // Val, CC, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap)
   ATOMIC_CMP_SWAP_128,
 
-  // Byte swapping load.
-  //
-  // Operand 0: the address to load from
-  // Operand 1: the type of load (i16, i32, i64)
-  LRV,
-
-  // Byte swapping store.
-  //
-  // Operand 0: the value to store
-  // Operand 1: the address to store to
-  // Operand 2: the type of store (i16, i32, i64)
-  STRV,
+  // Byte swapping load/store.  Same operands as regular load/store.
+  LRV, STRV,
 
   // Prefetch from the second operand using the 4-bit control code in
   // the first operand.  The code is 1 for a load prefetch and 2 for
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td
index bb5b7aae883..8d3b1011d0a 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -756,16 +756,15 @@ def STMH : StoreMultipleRSY<"stmh", 0xEB26, GRH32>;
 def LRVR  : UnaryRRE<"lrvr",  0xB91F, bswap, GR32, GR32>;
 def LRVGR : UnaryRRE<"lrvgr", 0xB90F, bswap, GR64, GR64>;
 
-// Byte-swapping loads.  Unlike normal loads, these instructions are
-// allowed to access storage more than once.
-def LRVH : UnaryRXY<"lrvh", 0xE31F, z_lrvh, GR32, 2>;
-def LRV  : UnaryRXY<"lrv",  0xE31E, z_lrv,  GR32, 4>;
-def LRVG : UnaryRXY<"lrvg", 0xE30F, z_lrvg, GR64, 8>;
-
-// Likewise byte-swapping stores.
-def STRVH : StoreRXY<"strvh", 0xE33F, z_strvh, GR32, 2>;
-def STRV  : StoreRXY<"strv",  0xE33E, z_strv,  GR32, 4>;
-def STRVG : StoreRXY<"strvg", 0xE32F, z_strvg, GR64, 8>;
+// Byte-swapping loads.
+def LRVH : UnaryRXY<"lrvh", 0xE31F, z_loadbswap16, GR32, 2>;
+def LRV  : UnaryRXY<"lrv",  0xE31E, z_loadbswap32, GR32, 4>;
+def LRVG : UnaryRXY<"lrvg", 0xE30F, z_loadbswap64, GR64, 8>;
+
+// Byte-swapping stores.
+def STRVH : StoreRXY<"strvh", 0xE33F, z_storebswap16, GR32, 2>;
+def STRV  : StoreRXY<"strv",  0xE33E, z_storebswap32, GR32, 4>;
+def STRVG : StoreRXY<"strvg", 0xE32F, z_storebswap64, GR64, 8>;
 
 // Byte-swapping memory-to-memory moves.
 let mayLoad = 1, mayStore = 1 in
diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td
index 1f299d5fd76..c55a6273f5e 100644
--- a/lib/Target/SystemZ/SystemZOperators.td
+++ b/lib/Target/SystemZ/SystemZOperators.td
@@ -127,14 +127,6 @@ def SDT_ZIPM                : SDTypeProfile<1, 1,
 def SDT_ZPrefetch           : SDTypeProfile<0, 2,
                                             [SDTCisVT<0, i32>,
                                              SDTCisPtrTy<1>]>;
-def SDT_ZLoadBSwap          : SDTypeProfile<1, 2,
-                                            [SDTCisInt<0>,
-                                             SDTCisPtrTy<1>,
-                                             SDTCisVT<2, OtherVT>]>;
-def SDT_ZStoreBSwap         : SDTypeProfile<0, 3,
-                                            [SDTCisInt<0>,
-                                             SDTCisPtrTy<1>,
-                                             SDTCisVT<2, OtherVT>]>;
 def SDT_ZTBegin             : SDTypeProfile<1, 2,
                                             [SDTCisVT<0, i32>,
                                              SDTCisPtrTy<1>,
@@ -283,9 +275,9 @@ def z_subcarry_1        : SDNode<"SystemZISD::SUBCARRY", SDT_ZBinaryWithCarry>;
 def z_membarrier        : SDNode<"SystemZISD::MEMBARRIER", SDTNone,
                                  [SDNPHasChain, SDNPSideEffect]>;
 
-def z_loadbswap        : SDNode<"SystemZISD::LRV", SDT_ZLoadBSwap,
+def z_loadbswap        : SDNode<"SystemZISD::LRV", SDTLoad,
                                  [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
-def z_storebswap       : SDNode<"SystemZISD::STRV", SDT_ZStoreBSwap,
+def z_storebswap       : SDNode<"SystemZISD::STRV", SDTStore,
                                  [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
 
 def z_tdc               : SDNode<"SystemZISD::TDC", SDT_ZTest>;
@@ -429,16 +421,28 @@ def z_vsrl              : SDNode<"ISD::SRL", SDT_ZVecBinary>;
 // Pattern fragments
 //===----------------------------------------------------------------------===//
 
-def z_lrvh  : PatFrag<(ops node:$addr), (z_loadbswap node:$addr, i16)>;
-def z_lrv   : PatFrag<(ops node:$addr), (z_loadbswap node:$addr, i32)>;
-def z_lrvg  : PatFrag<(ops node:$addr), (z_loadbswap node:$addr, i64)>;
+def z_loadbswap16 : PatFrag<(ops node:$addr), (z_loadbswap node:$addr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+def z_loadbswap32 : PatFrag<(ops node:$addr), (z_loadbswap node:$addr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+def z_loadbswap64 : PatFrag<(ops node:$addr), (z_loadbswap node:$addr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
 
-def z_strvh : PatFrag<(ops node:$src, node:$addr),
-                      (z_storebswap node:$src, node:$addr, i16)>;
-def z_strv  : PatFrag<(ops node:$src, node:$addr),
-                      (z_storebswap node:$src, node:$addr, i32)>;
-def z_strvg : PatFrag<(ops node:$src, node:$addr),
-                      (z_storebswap node:$src, node:$addr, i64)>;
+def z_storebswap16 : PatFrag<(ops node:$src, node:$addr),
+                             (z_storebswap node:$src, node:$addr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+def z_storebswap32 : PatFrag<(ops node:$src, node:$addr),
+                             (z_storebswap node:$src, node:$addr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+def z_storebswap64 : PatFrag<(ops node:$src, node:$addr),
+                             (z_storebswap node:$src, node:$addr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
 
 // Fragments including CC as an implicit source.
 def z_br_ccmask
-- 
GitLab


From 92ddcaeaddf86f7169db524168175bac5febfba6 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Tue, 30 Oct 2018 18:25:28 +0000
Subject: [PATCH 0756/1116] [DebugInfo] Define base function on DWARFDie
 reverse iterators

This defines member function base on the specialization of
std::reverse_iterator for DWARFDie::iterator as required by C++
[reverse.iter.conv].

This fixes unit test DWARFDebugInfoTest.cpp under EXPENSIVE_CHECKS which
currently can't be built due to GNU C++ Library calling this member
function in debug mode.

This fixes https://llvm.org/PR38785

Patch by: Eugene Sharygin

Differential revision: https://reviews.llvm.org/D53792

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345621 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/DebugInfo/DWARF/DWARFDie.h          | 4 ++++
 unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/include/llvm/DebugInfo/DWARF/DWARFDie.h b/include/llvm/DebugInfo/DWARF/DWARFDie.h
index c77034f6348..baa47c2bfa5 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFDie.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDie.h
@@ -404,6 +404,10 @@ public:
       Die = Die.getPreviousSibling();
   }
 
+  llvm::DWARFDie::iterator base() const {
+    return llvm::DWARFDie::iterator(AtEnd ? Die : Die.getSibling());
+  }
+
   reverse_iterator<llvm::DWARFDie::iterator> &operator++() {
     assert(!AtEnd && "Incrementing rend");
     llvm::DWARFDie D = Die.getPreviousSibling();
diff --git a/unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp b/unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp
index 1be0363adb0..ffbde2df2bc 100644
--- a/unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp
+++ b/unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp
@@ -1227,6 +1227,10 @@ TEST(DWARFDebugInfo, TestRelations) {
   EXPECT_THAT(std::vector<DWARFDie>(A.rbegin(), A.rend()),
               testing::ElementsAre(D, C, B));
 
+  // Make sure conversion from reverse iterator works as expected.
+  EXPECT_EQ(A.rbegin().base(), A.end());
+  EXPECT_EQ(A.rend().base(), A.begin());
+
   // Make sure iterator is bidirectional.
   {
     auto Begin = A.begin();
-- 
GitLab


From e446f79b59cfbf41cd4a8f6b16ffb6a494def48f Mon Sep 17 00:00:00 2001
From: Zachary Turner <zturner@google.com>
Date: Tue, 30 Oct 2018 18:25:38 +0000
Subject: [PATCH 0757/1116] Fix printing bug in pdb2yaml.

We were using the wrong enum table when mapping enum values
to strings for public symbol flags.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345622 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/ObjectYAML/CodeViewYAMLSymbols.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/ObjectYAML/CodeViewYAMLSymbols.cpp b/lib/ObjectYAML/CodeViewYAMLSymbols.cpp
index 745f79cd77f..713e9a710e9 100644
--- a/lib/ObjectYAML/CodeViewYAMLSymbols.cpp
+++ b/lib/ObjectYAML/CodeViewYAMLSymbols.cpp
@@ -108,7 +108,7 @@ void ScalarBitSetTraits<ExportFlags>::bitset(IO &io, ExportFlags &Flags) {
 }
 
 void ScalarBitSetTraits<PublicSymFlags>::bitset(IO &io, PublicSymFlags &Flags) {
-  auto FlagNames = getProcSymFlagNames();
+  auto FlagNames = getPublicSymFlagNames();
   for (const auto &E : FlagNames) {
     io.bitSetCase(Flags, E.Name.str().c_str(),
                   static_cast<PublicSymFlags>(E.Value));
-- 
GitLab


From b79937c973852dcd099267d65ecc965f0b821982 Mon Sep 17 00:00:00 2001
From: Nirav Dave <niravd@google.com>
Date: Tue, 30 Oct 2018 18:26:43 +0000
Subject: [PATCH 0758/1116] [DAG] Add const variants for BaseIndexOffset
 functions.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345623 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/SelectionDAGAddressAnalysis.h    | 11 +++++++----
 .../SelectionDAG/SelectionDAGAddressAnalysis.cpp      |  7 ++++---
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h b/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h
index 580606441a9..2b2c48d57bc 100644
--- a/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h
+++ b/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h
@@ -45,18 +45,21 @@ public:
         IsIndexSignExt(IsIndexSignExt) {}
 
   SDValue getBase() { return Base; }
+  SDValue getBase() const { return Base; }
   SDValue getIndex() { return Index; }
+  SDValue getIndex() const { return Index; }
 
-  bool equalBaseIndex(BaseIndexOffset &Other, const SelectionDAG &DAG) {
+  bool equalBaseIndex(const BaseIndexOffset &Other,
+                      const SelectionDAG &DAG) const {
     int64_t Off;
     return equalBaseIndex(Other, DAG, Off);
   }
 
-  bool equalBaseIndex(BaseIndexOffset &Other, const SelectionDAG &DAG,
-                      int64_t &Off);
+  bool equalBaseIndex(const BaseIndexOffset &Other, const SelectionDAG &DAG,
+                      int64_t &Off) const;
 
   /// Parses tree in Ptr for base, index, offset addresses.
-  static BaseIndexOffset match(LSBaseSDNode *N, const SelectionDAG &DAG);
+  static BaseIndexOffset match(const LSBaseSDNode *N, const SelectionDAG &DAG);
 };
 
 } // end namespace llvm
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
index c859f16e74f..8c57f18183e 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
@@ -19,8 +19,9 @@
 
 using namespace llvm;
 
-bool BaseIndexOffset::equalBaseIndex(BaseIndexOffset &Other,
-                                     const SelectionDAG &DAG, int64_t &Off) {
+bool BaseIndexOffset::equalBaseIndex(const BaseIndexOffset &Other,
+                                     const SelectionDAG &DAG,
+                                     int64_t &Off) const {
   // Conservatively fail if we a match failed..
   if (!Base.getNode() || !Other.Base.getNode())
     return false;
@@ -75,7 +76,7 @@ bool BaseIndexOffset::equalBaseIndex(BaseIndexOffset &Other,
 }
 
 /// Parses tree in Ptr for base, index, offset addresses.
-BaseIndexOffset BaseIndexOffset::match(LSBaseSDNode *N,
+BaseIndexOffset BaseIndexOffset::match(const LSBaseSDNode *N,
                                        const SelectionDAG &DAG) {
   SDValue Ptr = N->getBasePtr();
 
-- 
GitLab


From 14842a4c81b75cc8aef8364867d888e0c71cc089 Mon Sep 17 00:00:00 2001
From: Calixte Denizet <cdenizet@mozilla.com>
Date: Tue, 30 Oct 2018 18:41:31 +0000
Subject: [PATCH 0759/1116] [GCOV] Function counters are wrong when on one line

Summary:
After commit https://reviews.llvm.org/rL344228, the function definitions have a counter but when on one line the counter is wrong (e.g. void foo() { })
I added a test in: https://reviews.llvm.org/D53601

Reviewers: marco-c

Reviewed By: marco-c

Subscribers: llvm-commits, sylvestre.ledru

Differential Revision: https://reviews.llvm.org/D53600

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345624 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Instrumentation/GCOVProfiling.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index ee546a9a828..01938a0f357 100644
--- a/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -572,9 +572,8 @@ void GCOVProfiler::emitProfileNotes() {
 
       // Add the function line number to the lines of the entry block
       // to have a counter for the function definition.
-      Func.getBlock(&EntryBlock)
-          .getFile(SP->getFilename())
-          .addLine(SP->getLine());
+      uint32_t Line = SP->getLine();
+      Func.getBlock(&EntryBlock).getFile(SP->getFilename()).addLine(Line);
 
       for (auto &BB : F) {
         GCOVBlock &Block = Func.getBlock(&BB);
@@ -587,7 +586,6 @@ void GCOVProfiler::emitProfileNotes() {
           Block.addEdge(Func.getReturnBlock());
         }
 
-        uint32_t Line = 0;
         for (auto &I : BB) {
           // Debug intrinsic locations correspond to the location of the
           // declaration, not necessarily any statements or expressions.
@@ -609,6 +607,7 @@ void GCOVProfiler::emitProfileNotes() {
           GCOVLines &Lines = Block.getFile(SP->getFilename());
           Lines.addLine(Loc.getLine());
         }
+        Line = 0;
       }
       EdgeDestinations += Func.getEdgeDestinations();
     }
-- 
GitLab


From 528c13b2d370d87a5debc5cb086f8021897da6cb Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Tue, 30 Oct 2018 18:48:42 +0000
Subject: [PATCH 0760/1116] [X86] In lowerVectorShuffleAsBroadcast, make
 peeking through CONCAT_VECTORS work correctly if we already walked through a
 bitcast that changed the element size.

The CONCAT_VECTORS case was using the original mask element count to determine how to adjust the broadcast index. But if we looked through a bitcast the original mask size doesn't tell us anything about the concat_vectors.

This patch switchs to using the concat_vectors input element count directly instead.

Differential Revision: https://reviews.llvm.org/D53823

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345626 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp        |  3 ++-
 test/CodeGen/X86/vector-shuffle-256-v8.ll | 29 +++++++++++++++++++++++
 2 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index da5340a050b..9acae2cab22 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -11238,7 +11238,8 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
       continue;
     }
     case ISD::CONCAT_VECTORS: {
-      int OperandSize = Mask.size() / V.getNumOperands();
+      int OperandSize =
+          V.getOperand(0).getSimpleValueType().getVectorNumElements();
       V = V.getOperand(BroadcastIdx / OperandSize);
       BroadcastIdx %= OperandSize;
       continue;
diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll
index 74c477300d6..addf2d2563f 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -2848,3 +2848,32 @@ entry:
   %add = add <8 x i32> %shuffle, %shuffle1
   ret <8 x i32> %add
 }
+
+; This test used to crash due to bad handling of concat_vectors after a bitcast
+; in lowerVectorShuffleAsBroadcast.
+define <8 x float> @broadcast_concat_crash(<4 x float> %x, <4 x float> %y, float %z) {
+; AVX1-LABEL: broadcast_concat_crash:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[3,3,1,1]
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2OR512VL-LABEL: broadcast_concat_crash:
+; AVX2OR512VL:       # %bb.0: # %entry
+; AVX2OR512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2OR512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,1,2,3]
+; AVX2OR512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2OR512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; AVX2OR512VL-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX2OR512VL-NEXT:    retq
+entry:
+  %tmp = shufflevector <4 x float> %x, <4 x float> %y, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %bc = bitcast <8 x float> %tmp to <4 x i64>
+  %tmp1 = extractelement <4 x i64> %bc, i32 3
+  %tmp2 = bitcast i64 %tmp1 to <2 x float>
+  %tmp4 = extractelement <2 x float> %tmp2, i32 1
+  %tmp5 = insertelement <8 x float> undef, float %tmp4, i32 4
+  %tmp6 = insertelement <8 x float> %tmp5, float %z, i32 5
+  ret <8 x float> %tmp6
+}
-- 
GitLab


From bbd45bf0b7c6ef5d76c9c4ca3f596fd9bb604e82 Mon Sep 17 00:00:00 2001
From: David Greene <greened@obbligato.org>
Date: Tue, 30 Oct 2018 19:17:51 +0000
Subject: [PATCH 0761/1116] [AArch64] Create proper memoperand for multi-vector
 stores

Re-apply r345315 with testcase fixes.

Include all of the store's source vector operands when creating the
MachineMemOperand. Previously, we were missing the first operand,
making the store size seem smaller than it really is.

Differential Revision: https://reviews.llvm.org/D52816


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345631 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64ISelLowering.cpp    |  2 +-
 .../AArch64/multi-vector-store-size.ll        | 82 +++++++++++++++++++
 2 files changed, 83 insertions(+), 1 deletion(-)
 create mode 100644 test/CodeGen/AArch64/multi-vector-store-size.ll

diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index a7a1b0a5feb..2a42d2db75d 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7972,7 +7972,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.opc = ISD::INTRINSIC_VOID;
     // Conservatively set memVT to the entire set of vectors stored.
     unsigned NumElts = 0;
-    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
+    for (unsigned ArgI = 0, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
       Type *ArgTy = I.getArgOperand(ArgI)->getType();
       if (!ArgTy->isVectorTy())
         break;
diff --git a/test/CodeGen/AArch64/multi-vector-store-size.ll b/test/CodeGen/AArch64/multi-vector-store-size.ll
new file mode 100644
index 00000000000..8764eb447a3
--- /dev/null
+++ b/test/CodeGen/AArch64/multi-vector-store-size.ll
@@ -0,0 +1,82 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -stop-after=instruction-select < %s | FileCheck %s
+
+declare void @llvm.aarch64.neon.st2.v4f32.p0f32(<4 x float>, <4 x float>, float*)
+declare void @llvm.aarch64.neon.st3.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, float*)
+declare void @llvm.aarch64.neon.st4.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, float*)
+
+declare void @llvm.aarch64.neon.st1x2.v4f32.p0f32(<4 x float>, <4 x float>, float*)
+declare void @llvm.aarch64.neon.st1x3.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, float*)
+declare void @llvm.aarch64.neon.st1x4.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, float*)
+
+declare void @llvm.aarch64.neon.st2lane.v4f32.p0f32(<4 x float>, <4 x float>, i64, float*)
+declare void @llvm.aarch64.neon.st3lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, i64, float*)
+declare void @llvm.aarch64.neon.st4lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, i64, float*)
+
+define void @addstx(float* %res, <4 x float>* %a,  <4 x float>* %b, <4 x float>* %c, <4 x float>* %d) {
+  %al = load <4 x float>, <4 x float>* %a
+  %bl = load <4 x float>, <4 x float>* %b
+  %cl = load <4 x float>, <4 x float>* %c
+  %dl = load <4 x float>, <4 x float>* %d
+
+  %ar = fadd <4 x float> %al, %bl
+  %br = fadd <4 x float> %bl, %cl
+  %cr = fadd <4 x float> %cl, %dl
+  %dr = fadd <4 x float> %dl, %al
+
+; The sizes below are conservative.  AArch64TargetLowering
+; conservatively assumes the entire vector is stored.
+  tail call void @llvm.aarch64.neon.st2.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, float* %res)
+; CHECK: ST2Twov4s {{.*}} :: (store 32 {{.*}})
+  tail call void @llvm.aarch64.neon.st3.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, float* %res)
+; CHECK: ST3Threev4s {{.*}} :: (store 48 {{.*}})
+  tail call void @llvm.aarch64.neon.st4.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, <4 x float> %dr, float* %res)
+; CHECK: ST4Fourv4s {{.*}} :: (store 64 {{.*}})
+
+  ret void
+}
+
+define void @addst1x(float* %res, <4 x float>* %a,  <4 x float>* %b, <4 x float>* %c, <4 x float>* %d) {
+  %al = load <4 x float>, <4 x float>* %a
+  %bl = load <4 x float>, <4 x float>* %b
+  %cl = load <4 x float>, <4 x float>* %c
+  %dl = load <4 x float>, <4 x float>* %d
+
+  %ar = fadd <4 x float> %al, %bl
+  %br = fadd <4 x float> %bl, %cl
+  %cr = fadd <4 x float> %cl, %dl
+  %dr = fadd <4 x float> %dl, %al
+
+; The sizes below are conservative.  AArch64TargetLowering
+; conservatively assumes the entire vector is stored.
+  tail call void @llvm.aarch64.neon.st1x2.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, float* %res)
+; CHECK: ST1Twov4s {{.*}} :: (store 32 {{.*}})
+  tail call void @llvm.aarch64.neon.st1x3.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, float* %res)
+; CHECK: ST1Threev4s {{.*}} :: (store 48 {{.*}})
+  tail call void @llvm.aarch64.neon.st1x4.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, <4 x float> %dr, float* %res)
+; CHECK: ST1Fourv4s {{.*}} :: (store 64 {{.*}})
+
+  ret void
+}
+
+define void @addstxlane(float* %res, <4 x float>* %a,  <4 x float>* %b, <4 x float>* %c, <4 x float>* %d) {
+  %al = load <4 x float>, <4 x float>* %a
+  %bl = load <4 x float>, <4 x float>* %b
+  %cl = load <4 x float>, <4 x float>* %c
+  %dl = load <4 x float>, <4 x float>* %d
+
+  %ar = fadd <4 x float> %al, %bl
+  %br = fadd <4 x float> %bl, %cl
+  %cr = fadd <4 x float> %cl, %dl
+  %dr = fadd <4 x float> %dl, %al
+
+; The sizes below are conservative.  AArch64TargetLowering
+; conservatively assumes the entire vector is stored.
+  tail call void @llvm.aarch64.neon.st2lane.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, i64 1, float* %res)
+; CHECK: ST2i32 {{.*}} :: (store 32 {{.*}})
+  tail call void @llvm.aarch64.neon.st3lane.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, i64 1, float* %res)
+; CHECK: ST3i32 {{.*}} :: (store 48 {{.*}})
+  tail call void @llvm.aarch64.neon.st4lane.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, <4 x float> %dr, i64 1, float* %res)
+; CHECK: ST4i32 {{.*}} :: (store 64 {{.*}})
+
+  ret void
+}
-- 
GitLab


From 76f9294c76f92c596c39db9e3e648ef581da6217 Mon Sep 17 00:00:00 2001
From: Eli Friedman <efriedma@codeaurora.org>
Date: Tue, 30 Oct 2018 19:24:51 +0000
Subject: [PATCH 0762/1116] [AArch64] [Windows] SEH opcodes should be
 scheduling boundaries.

Prevents the post-RA scheduler from modifying the prologue sequences
emitting by frame lowering. This is roughly similar to what we do for
other targets: TargetInstrInfo::isSchedulingBoundary checks
isPosition(), which checks for CFI_INSTRUCTION.

isSEHInstruction is taken from D50288; it'll land with whatever patch
lands first.

Differential Revision: https://reviews.llvm.org/D53851


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345634 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64InstrInfo.cpp | 34 +++++++++++++++++++++++++
 lib/Target/AArch64/AArch64InstrInfo.h   |  7 +++++
 test/CodeGen/AArch64/wineh1.mir         |  2 +-
 test/CodeGen/AArch64/wineh2.mir         |  2 +-
 test/CodeGen/AArch64/wineh3.mir         |  2 +-
 test/CodeGen/AArch64/wineh4.mir         |  2 +-
 test/CodeGen/AArch64/wineh5.mir         |  2 +-
 test/CodeGen/AArch64/wineh6.mir         |  2 +-
 test/CodeGen/AArch64/wineh7.mir         |  2 +-
 9 files changed, 48 insertions(+), 7 deletions(-)

diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index 503bda08a9c..4953892ed4a 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -1085,6 +1085,32 @@ bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) const {
   }
 }
 
+bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+    default:
+      return false;
+    case AArch64::SEH_StackAlloc:
+    case AArch64::SEH_SaveFPLR:
+    case AArch64::SEH_SaveFPLR_X:
+    case AArch64::SEH_SaveReg:
+    case AArch64::SEH_SaveReg_X:
+    case AArch64::SEH_SaveRegP:
+    case AArch64::SEH_SaveRegP_X:
+    case AArch64::SEH_SaveFReg:
+    case AArch64::SEH_SaveFReg_X:
+    case AArch64::SEH_SaveFRegP:
+    case AArch64::SEH_SaveFRegP_X:
+    case AArch64::SEH_SetFP:
+    case AArch64::SEH_AddFP:
+    case AArch64::SEH_Nop:
+    case AArch64::SEH_PrologEnd:
+    case AArch64::SEH_EpilogStart:
+    case AArch64::SEH_EpilogEnd:
+      return true;
+  }
+}
+
 bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
                                              unsigned &SrcReg, unsigned &DstReg,
                                              unsigned &SubIdx) const {
@@ -1137,6 +1163,14 @@ bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
   return false;
 }
 
+bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
+                                            const MachineBasicBlock *MBB,
+                                            const MachineFunction &MF) const {
+  if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
+    return true;
+  return isSEHInstruction(MI);
+}
+
 /// analyzeCompare - For a comparison instruction, return the source registers
 /// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
 /// Return true if the comparison instruction can be analyzed.
diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h
index 05721336df7..e8e93e64200 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/lib/Target/AArch64/AArch64InstrInfo.h
@@ -189,6 +189,10 @@ public:
                     unsigned FalseReg) const override;
   void getNoop(MCInst &NopInst) const override;
 
+  bool isSchedulingBoundary(const MachineInstr &MI,
+                            const MachineBasicBlock *MBB,
+                            const MachineFunction &MF) const override;
+
   /// analyzeCompare - For a comparison instruction, return the source registers
   /// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
   /// Return true if the comparison instruction can be analyzed.
@@ -262,6 +266,9 @@ public:
   /// Returns true if the instruction has a shift by immediate that can be
   /// executed in one cycle less.
   bool isFalkorShiftExtFast(const MachineInstr &MI) const;
+  /// Return true if the instructions is a SEH instruciton used for unwinding
+  /// on Windows.
+  static bool isSEHInstruction(const MachineInstr &MI);
 
 private:
   /// Sets the offsets on outlined instructions in \p MBB which use SP
diff --git a/test/CodeGen/AArch64/wineh1.mir b/test/CodeGen/AArch64/wineh1.mir
index 6df9c638e65..c89daf1ce22 100644
--- a/test/CodeGen/AArch64/wineh1.mir
+++ b/test/CodeGen/AArch64/wineh1.mir
@@ -1,4 +1,4 @@
-# RUN: llc -o - %s -mtriple=aarch64-windows -start-after=prologepilog -filetype=obj -disable-post-ra \
+# RUN: llc -o - %s -mtriple=aarch64-windows -start-after=prologepilog -filetype=obj  \
 # RUN:   | llvm-readobj -unwind | FileCheck %s
 # This test case checks the basic validity of the .xdata section.  It's
 # documented at:
diff --git a/test/CodeGen/AArch64/wineh2.mir b/test/CodeGen/AArch64/wineh2.mir
index 29b20963444..e2c31fd56ce 100644
--- a/test/CodeGen/AArch64/wineh2.mir
+++ b/test/CodeGen/AArch64/wineh2.mir
@@ -1,5 +1,5 @@
 # RUN: llc -o - %s -mtriple=aarch64-windows -start-after=prologepilog \
-# RUN:   -disable-post-ra -filetype=obj | llvm-readobj -unwind | FileCheck %s
+# RUN:    -filetype=obj | llvm-readobj -unwind | FileCheck %s
 # Test that the pre/post increment save of a flating point register is correct.
 
 # CHECK:        ExceptionData {
diff --git a/test/CodeGen/AArch64/wineh3.mir b/test/CodeGen/AArch64/wineh3.mir
index 6d54430cfc5..ffca6c157a3 100644
--- a/test/CodeGen/AArch64/wineh3.mir
+++ b/test/CodeGen/AArch64/wineh3.mir
@@ -1,5 +1,5 @@
 # RUN: llc -o - %s -mtriple=aarch64-windows -start-after=prologepilog \
-# RUN:   -disable-post-ra -filetype=obj | llvm-readobj -unwind | FileCheck %s
+# RUN:    -filetype=obj | llvm-readobj -unwind | FileCheck %s
 # Test that the register pairing of both general purpose and floating point
 # registers is correctly saved in the .xdata section, as well as the pre/post
 # increment of floating point register pairs.
diff --git a/test/CodeGen/AArch64/wineh4.mir b/test/CodeGen/AArch64/wineh4.mir
index 39a0d7ec694..4d4cc892c2e 100644
--- a/test/CodeGen/AArch64/wineh4.mir
+++ b/test/CodeGen/AArch64/wineh4.mir
@@ -1,5 +1,5 @@
 # RUN: llc -o - %s -mtriple=aarch64-windows -start-after=prologepilog \
-# RUN:   -disable-branch-fold -disable-post-ra -filetype=obj \
+# RUN:   -disable-branch-fold  -filetype=obj \
 # RUN: | llvm-readobj -unwind | FileCheck %s
 # Check that multiple epilgoues are correctly placed in .xdata.
 
diff --git a/test/CodeGen/AArch64/wineh5.mir b/test/CodeGen/AArch64/wineh5.mir
index f1fa6d4d47b..c47bad5d290 100644
--- a/test/CodeGen/AArch64/wineh5.mir
+++ b/test/CodeGen/AArch64/wineh5.mir
@@ -1,5 +1,5 @@
 # RUN: llc -o - %s -mtriple=aarch64-windows -start-after=prologepilog \
-# RUN:   -disable-post-ra -filetype=obj | llvm-readobj -unwind | FileCheck %s
+# RUN:    -filetype=obj | llvm-readobj -unwind | FileCheck %s
 
 # Check that that the large stack allocation is correctly represented in .xdata.
 
diff --git a/test/CodeGen/AArch64/wineh6.mir b/test/CodeGen/AArch64/wineh6.mir
index 08db6656980..fd1f9ece3a6 100644
--- a/test/CodeGen/AArch64/wineh6.mir
+++ b/test/CodeGen/AArch64/wineh6.mir
@@ -1,5 +1,5 @@
 # RUN: llc -o - %s -mtriple=aarch64-windows -start-after=prologepilog \
-# RUN:   -disable-post-ra -filetype=obj | llvm-readobj -unwind | FileCheck %s
+# RUN:    -filetype=obj | llvm-readobj -unwind | FileCheck %s
 # Check save_fplr_x, set_fp, alloc_s
 
 # CHECK: 	ExceptionData {
diff --git a/test/CodeGen/AArch64/wineh7.mir b/test/CodeGen/AArch64/wineh7.mir
index 60094539297..547c622a704 100644
--- a/test/CodeGen/AArch64/wineh7.mir
+++ b/test/CodeGen/AArch64/wineh7.mir
@@ -1,5 +1,5 @@
 # RUN: llc -o - %s -mtriple=aarch64-windows -start-after=prologepilog \
-# RUN:   -filetype=obj -disable-post-ra | llvm-readobj -unwind | FileCheck %s
+# RUN:   -filetype=obj  | llvm-readobj -unwind | FileCheck %s
 # Check AddFP
 
 # CHECK:	 ExceptionData {
-- 
GitLab


From e7db9d074e2534526f6b92efa9e22459e09b728c Mon Sep 17 00:00:00 2001
From: Bjorn Pettersson <bjorn.a.pettersson@ericsson.com>
Date: Tue, 30 Oct 2018 20:16:39 +0000
Subject: [PATCH 0763/1116] [DAGCombiner] Fix for big endian in
 ForwardStoreValueToDirectLoad

Summary:
Normalize the offset for endianess before checking
if the store cover the load in ForwardStoreValueToDirectLoad.

Without this we missed out on some optimizations for big
endian targets. If for example having a 4 bytes store followed
by a 1 byte load, loading the least significant byte from the
store, the STCoversLD check would fail (see @test4 in
test/CodeGen/AArch64/load-store-forwarding.ll).

This patch also fixes a problem seen in an out-of-tree target.
The target has i40 as a legal type, it is big endian,
and the StoreSize for i40 is 48 bits. So when normalizing
the offset for endianess we need to take the StoreSize into
account (assuming that padding added when storing into
a larger StoreSize always is added at the most significant
end).

Reviewers: niravd

Reviewed By: niravd

Subscribers: javed.absar, kristof.beyls, llvm-commits, uabelho

Differential Revision: https://reviews.llvm.org/D53776

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345636 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp      | 22 +++---
 test/CodeGen/AArch64/load-store-forwarding.ll | 77 +++++++++++++++++++
 2 files changed, 90 insertions(+), 9 deletions(-)
 create mode 100644 test/CodeGen/AArch64/load-store-forwarding.ll

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 742ca02a03d..fba2aa9cb52 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -12854,20 +12854,24 @@ SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) {
   BaseIndexOffset BasePtrLD = BaseIndexOffset::match(LD, DAG);
   BaseIndexOffset BasePtrST = BaseIndexOffset::match(ST, DAG);
   int64_t Offset;
+  if (!BasePtrST.equalBaseIndex(BasePtrLD, DAG, Offset))
+    return SDValue();
+
+  // Normalize for Endianness. After this Offset=0 will denote that the least
+  // significant bit in the loaded value maps to the least significant bit in
+  // the stored value). With Offset=n (for n > 0) the loaded value starts at the
+  // n:th least significant byte of the stored value.
+  if (DAG.getDataLayout().isBigEndian())
+    Offset = (STMemType.getStoreSizeInBits() -
+              LDMemType.getStoreSizeInBits()) / 8 - Offset;
 
+  // Check that the stored value cover all bits that are loaded.
   bool STCoversLD =
-      BasePtrST.equalBaseIndex(BasePtrLD, DAG, Offset) && (Offset >= 0) &&
-      (Offset * 8 <= LDMemType.getSizeInBits()) &&
+      (Offset >= 0) &&
       (Offset * 8 + LDMemType.getSizeInBits() <= STMemType.getSizeInBits());
-
   if (!STCoversLD)
     return SDValue();
 
-  // Normalize for Endianness.
-  if (DAG.getDataLayout().isBigEndian())
-    Offset =
-        (STMemType.getSizeInBits() - LDMemType.getSizeInBits()) / 8 - Offset;
-
   // Memory as copy space (potentially masked).
   if (Offset == 0 && LDType == STType && STMemType == LDMemType) {
     // Simple case: Direct non-truncating forwarding
@@ -12899,7 +12903,7 @@ SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) {
       continue;
     if (STMemType != LDMemType) {
       // TODO: Support vectors? This requires extract_subvector/bitcast.
-      if (!STMemType.isVector() && !LDMemType.isVector() && 
+      if (!STMemType.isVector() && !LDMemType.isVector() &&
           STMemType.isInteger() && LDMemType.isInteger())
         Val = DAG.getNode(ISD::TRUNCATE, SDLoc(LD), LDMemType, Val);
       else
diff --git a/test/CodeGen/AArch64/load-store-forwarding.ll b/test/CodeGen/AArch64/load-store-forwarding.ll
new file mode 100644
index 00000000000..e6124270169
--- /dev/null
+++ b/test/CodeGen/AArch64/load-store-forwarding.ll
@@ -0,0 +1,77 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64_be -o - %s | FileCheck %s --check-prefix CHECK-BE
+; RUN: llc -mtriple=aarch64 -o - %s | FileCheck %s --check-prefix CHECK-LE
+
+define i8 @test1(i32 %a, i8* %pa) {
+; CHECK-BE-LABEL: test1:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    str w0, [x1]
+; CHECK-BE-NEXT:    ldrb w0, [x1]
+; CHECK-BE-NEXT:    ret
+;
+; CHECK-LE-LABEL: test1:
+; CHECK-LE:       // %bb.0:
+; CHECK-LE-NEXT:    str w0, [x1]
+; CHECK-LE-NEXT:    ret
+  %p32 = bitcast i8* %pa to i32*
+  %p8 = getelementptr i8, i8* %pa, i32 0
+  store i32 %a, i32* %p32
+  %res = load i8, i8* %p8
+  ret i8 %res
+}
+
+define i8 @test2(i32 %a, i8* %pa) {
+; CHECK-BE-LABEL: test2:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    str w0, [x1]
+; CHECK-BE-NEXT:    ldrb w0, [x1, #1]
+; CHECK-BE-NEXT:    ret
+;
+; CHECK-LE-LABEL: test2:
+; CHECK-LE:       // %bb.0:
+; CHECK-LE-NEXT:    str w0, [x1]
+; CHECK-LE-NEXT:    ubfx w0, w0, #8, #8
+; CHECK-LE-NEXT:    ret
+  %p32 = bitcast i8* %pa to i32*
+  %p8 = getelementptr i8, i8* %pa, i32 1
+  store i32 %a, i32* %p32
+  %res = load i8, i8* %p8
+  ret i8 %res
+}
+
+define i8 @test3(i32 %a, i8* %pa) {
+; CHECK-BE-LABEL: test3:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    str w0, [x1]
+; CHECK-BE-NEXT:    ldrb w0, [x1, #2]
+; CHECK-BE-NEXT:    ret
+;
+; CHECK-LE-LABEL: test3:
+; CHECK-LE:       // %bb.0:
+; CHECK-LE-NEXT:    str w0, [x1]
+; CHECK-LE-NEXT:    ubfx w0, w0, #16, #8
+; CHECK-LE-NEXT:    ret
+  %p32 = bitcast i8* %pa to i32*
+  %p8 = getelementptr i8, i8* %pa, i32 2
+  store i32 %a, i32* %p32
+  %res = load i8, i8* %p8
+  ret i8 %res
+}
+
+define i8 @test4(i32 %a, i8* %pa) {
+; CHECK-BE-LABEL: test4:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    str w0, [x1]
+; CHECK-BE-NEXT:    ret
+;
+; CHECK-LE-LABEL: test4:
+; CHECK-LE:       // %bb.0:
+; CHECK-LE-NEXT:    str w0, [x1]
+; CHECK-LE-NEXT:    lsr w0, w0, #24
+; CHECK-LE-NEXT:    ret
+  %p32 = bitcast i8* %pa to i32*
+  %p8 = getelementptr i8, i8* %pa, i32 3
+  store i32 %a, i32* %p32
+  %res = load i8, i8* %p8
+  ret i8 %res
+}
-- 
GitLab


From a0a61c42505ff74c336c8265c4c8d46493808cc4 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Tue, 30 Oct 2018 20:33:58 +0000
Subject: [PATCH 0764/1116] [ScalarizeMaskedMemIntrin] Limit the scope of some
 variables that are only used inside loops.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345638 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/ScalarizeMaskedMemIntrin.cpp | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp b/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
index 1fb116e9b48..2684f92b3a9 100644
--- a/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
+++ b/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
@@ -138,8 +138,6 @@ static void scalarizeMaskedLoad(CallInst *CI) {
   IRBuilder<> Builder(CI->getContext());
   Instruction *InsertPt = CI;
   BasicBlock *IfBlock = CI->getParent();
-  BasicBlock *CondBlock = nullptr;
-  BasicBlock *PrevIfBlock = CI->getParent();
 
   Builder.SetInsertPoint(InsertPt);
   Builder.SetCurrentDebugLocation(CI->getDebugLoc());
@@ -195,7 +193,8 @@ static void scalarizeMaskedLoad(CallInst *CI) {
     //  %Elt = load i32* %EltAddr
     //  VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
     //
-    CondBlock = IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.load");
+    BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt->getIterator(),
+                                                     "cond.load");
     Builder.SetInsertPoint(InsertPt);
 
     Value *Gep =
@@ -211,7 +210,7 @@ static void scalarizeMaskedLoad(CallInst *CI) {
     Instruction *OldBr = IfBlock->getTerminator();
     BranchInst::Create(CondBlock, NewIfBlock, Predicate, OldBr);
     OldBr->eraseFromParent();
-    PrevIfBlock = IfBlock;
+    BasicBlock *PrevIfBlock = IfBlock;
     IfBlock = NewIfBlock;
 
     // Create the phi to join the new and previous value.
@@ -372,8 +371,6 @@ static void scalarizeMaskedGather(CallInst *CI) {
   IRBuilder<> Builder(CI->getContext());
   Instruction *InsertPt = CI;
   BasicBlock *IfBlock = CI->getParent();
-  BasicBlock *CondBlock = nullptr;
-  BasicBlock *PrevIfBlock = CI->getParent();
   Builder.SetInsertPoint(InsertPt);
   unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
 
@@ -416,7 +413,7 @@ static void scalarizeMaskedGather(CallInst *CI) {
     //  %Elt = load i32* %EltAddr
     //  VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
     //
-    CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load");
+    BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load");
     Builder.SetInsertPoint(InsertPt);
 
     Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
@@ -433,7 +430,7 @@ static void scalarizeMaskedGather(CallInst *CI) {
     Instruction *OldBr = IfBlock->getTerminator();
     BranchInst::Create(CondBlock, NewIfBlock, Predicate, OldBr);
     OldBr->eraseFromParent();
-    PrevIfBlock = IfBlock;
+    BasicBlock *PrevIfBlock = IfBlock;
     IfBlock = NewIfBlock;
 
     PHINode *Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
-- 
GitLab


From dfdeb96122e87f5a137bba01078a4eb891fe1f9c Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 30 Oct 2018 20:42:03 +0000
Subject: [PATCH 0765/1116] [x86] try to make test immune to better div
 optimization; NFCI

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345639 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/pr32282.ll | 34 ++++++++++++++--------------------
 1 file changed, 14 insertions(+), 20 deletions(-)

diff --git a/test/CodeGen/X86/pr32282.ll b/test/CodeGen/X86/pr32282.ll
index 7ec6a5dbf7f..6b08e2fa413 100644
--- a/test/CodeGen/X86/pr32282.ll
+++ b/test/CodeGen/X86/pr32282.ll
@@ -9,11 +9,10 @@
 @d = common global i64 zeroinitializer, align 8
 @e = common global i64 zeroinitializer, align 8
 
-define void @foo() {
+define void @foo(i64 %x) nounwind {
 ; X86-LABEL: foo:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
-; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    movl d, %eax
 ; X86-NEXT:    notl %eax
 ; X86-NEXT:    movl d+4, %ecx
@@ -26,40 +25,35 @@ define void @foo() {
 ; X86-NEXT:    addl $7, %eax
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    pushl %ecx
-; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl %eax
-; X86-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NEXT:    pushl $0
-; X86-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NEXT:    pushl $0
-; X86-NEXT:    .cfi_adjust_cfa_offset 4
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __divdi3
 ; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    .cfi_adjust_cfa_offset -16
 ; X86-NEXT:    orl %eax, %edx
 ; X86-NEXT:    setne {{[0-9]+}}(%esp)
 ; X86-NEXT:    popl %eax
-; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: foo:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq {{.*}}(%rip), %rax
-; X64-NEXT:    movabsq $3013716102212485120, %rcx # imm = 0x29D2DED3DE400000
-; X64-NEXT:    andnq %rcx, %rax, %rcx
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movq {{.*}}(%rip), %rcx
+; X64-NEXT:    movabsq $3013716102212485120, %rdx # imm = 0x29D2DED3DE400000
+; X64-NEXT:    andnq %rdx, %rcx, %rcx
 ; X64-NEXT:    shrq $21, %rcx
 ; X64-NEXT:    addq $7, %rcx
-; X64-NEXT:    movabsq $4393751543808, %rax # imm = 0x3FF00000000
-; X64-NEXT:    testq %rax, %rcx
+; X64-NEXT:    movq %rdi, %rdx
+; X64-NEXT:    orq %rcx, %rdx
+; X64-NEXT:    shrq $32, %rdx
 ; X64-NEXT:    je .LBB0_1
 ; X64-NEXT:  # %bb.2:
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    divq %rcx
+; X64-NEXT:    cqto
+; X64-NEXT:    idivq %rcx
 ; X64-NEXT:    jmp .LBB0_3
 ; X64-NEXT:  .LBB0_1:
-; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-NEXT:    divl %ecx
 ; X64-NEXT:    # kill: def $eax killed $eax def $rax
 ; X64-NEXT:  .LBB0_3:
@@ -86,7 +80,7 @@ define void @foo() {
   %18 = ashr i64 %4, %17
   %19 = and i64 %18, 9223372036854775806
   %20 = add nsw i64 7, %19
-  %21 = sdiv i64 0, %20
+  %21 = sdiv i64 %x, %20
   %22 = icmp ne i64 %21, 0
   %23 = zext i1 %22 to i8
   store i8 %23, i8* %1, align 1
-- 
GitLab


From 94a3208c98e145ff06c97057546ec006bd6f2bcf Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 30 Oct 2018 20:44:54 +0000
Subject: [PATCH 0766/1116] [x86] try to make test immune to better div
 optimization; NFCI

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345640 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/copy-eflags.ll | 51 ++++++++++++++++-----------------
 1 file changed, 25 insertions(+), 26 deletions(-)

diff --git a/test/CodeGen/X86/copy-eflags.ll b/test/CodeGen/X86/copy-eflags.ll
index 10fccacf193..836027f47bf 100644
--- a/test/CodeGen/X86/copy-eflags.ll
+++ b/test/CodeGen/X86/copy-eflags.ll
@@ -308,47 +308,46 @@ bb1:
 ; Use a particular instruction pattern in order to lower to the post-RA pseudo
 ; used to lower SETB into an SBB pattern in order to make sure that kind of
 ; usage of a copied EFLAGS continues to work.
-define void @PR37431(i32* %arg1, i8* %arg2, i8* %arg3) {
+define void @PR37431(i32* %arg1, i8* %arg2, i8* %arg3, i32 %x) nounwind {
 ; X32-LABEL: PR37431:
 ; X32:       # %bb.0: # %entry
+; X32-NEXT:    pushl %edi
 ; X32-NEXT:    pushl %esi
-; X32-NEXT:    .cfi_def_cfa_offset 8
-; X32-NEXT:    .cfi_offset %esi, -8
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl (%eax), %eax
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    sarl $31, %ecx
 ; X32-NEXT:    cmpl %eax, %eax
 ; X32-NEXT:    sbbl %ecx, %eax
-; X32-NEXT:    setb %al
-; X32-NEXT:    sbbb %cl, %cl
+; X32-NEXT:    setb %cl
+; X32-NEXT:    sbbb %dl, %dl
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    movb %cl, (%edx)
-; X32-NEXT:    movzbl %al, %eax
-; X32-NEXT:    xorl %ecx, %ecx
-; X32-NEXT:    subl %eax, %ecx
-; X32-NEXT:    xorl %eax, %eax
-; X32-NEXT:    xorl %edx, %edx
-; X32-NEXT:    idivl %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    movb %dl, (%edi)
+; X32-NEXT:    movzbl %cl, %ecx
+; X32-NEXT:    xorl %edi, %edi
+; X32-NEXT:    subl %ecx, %edi
+; X32-NEXT:    cltd
+; X32-NEXT:    idivl %edi
 ; X32-NEXT:    movb %dl, (%esi)
 ; X32-NEXT:    popl %esi
-; X32-NEXT:    .cfi_def_cfa_offset 4
+; X32-NEXT:    popl %edi
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: PR37431:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movslq (%rdi), %rax
-; X64-NEXT:    cmpq %rax, %rax
-; X64-NEXT:    sbbb %dl, %dl
-; X64-NEXT:    cmpq %rax, %rax
-; X64-NEXT:    movb %dl, (%rsi)
-; X64-NEXT:    sbbl %esi, %esi
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    idivl %esi
-; X64-NEXT:    movb %dl, (%rcx)
+; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    movq %rdx, %r8
+; X64-NEXT:    movslq (%rdi), %rdx
+; X64-NEXT:    cmpq %rdx, %rax
+; X64-NEXT:    sbbb %cl, %cl
+; X64-NEXT:    cmpq %rdx, %rax
+; X64-NEXT:    movb %cl, (%rsi)
+; X64-NEXT:    sbbl %ecx, %ecx
+; X64-NEXT:    cltd
+; X64-NEXT:    idivl %ecx
+; X64-NEXT:    movb %dl, (%r8)
 ; X64-NEXT:    retq
 entry:
   %tmp = load i32, i32* %arg1
@@ -358,7 +357,7 @@ entry:
   %tmp4 = sub i8 0, %tmp3
   store i8 %tmp4, i8* %arg2
   %tmp5 = sext i8 %tmp4 to i32
-  %tmp6 = srem i32 0, %tmp5
+  %tmp6 = srem i32 %x, %tmp5
   %tmp7 = trunc i32 %tmp6 to i8
   store i8 %tmp7, i8* %arg3
   ret void
-- 
GitLab


From 80491cca53efb2ea4868aa1ce1d148b119137423 Mon Sep 17 00:00:00 2001
From: Mandeep Singh Grang <mgrang@codeaurora.org>
Date: Tue, 30 Oct 2018 20:46:10 +0000
Subject: [PATCH 0767/1116] [COFF, ARM64] Make sure to forward arguments from
 vararg to musttail vararg

Summary:
    Thunk functions in Windows are varag functions that call a musttail function
    to pass the arguments after the fixup is done.  We need to make sure that we
    forward the arguments from the caller vararg to the callee vararg function.
    This is the same mechanism that is used for Windows on X86.

Reviewers: ssijaric, eli.friedman, TomTan, mgrang, mstorsjo, rnk, compnerd, efriedma

Reviewed By: efriedma

Subscribers: efriedma, kristof.beyls, chrib, javed.absar, llvm-commits

Differential Revision: https://reviews.llvm.org/D53843

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345641 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64ISelLowering.cpp    | 19 +++++++++++
 .../AArch64/AArch64MachineFunctionInfo.h      |  8 +++++
 test/CodeGen/AArch64/vararg-tallcall.ll       | 34 +++++++++++++++++++
 3 files changed, 61 insertions(+)
 create mode 100644 test/CodeGen/AArch64/vararg-tallcall.ll

diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2a42d2db75d..3c107016c8b 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3148,6 +3148,17 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
     // We currently pass all varargs at 8-byte alignment.
     StackOffset = ((StackOffset + 7) & ~7);
     FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
+
+    if (MFI.hasMustTailInVarArgFunc()) {
+      SmallVector<MVT, 2> RegParmTypes;
+      RegParmTypes.push_back(MVT::i64);
+      RegParmTypes.push_back(MVT::f128);
+      // Compute the set of forwarded registers. The rest are scratch.
+      SmallVectorImpl<ForwardedRegister> &Forwards =
+                                       FuncInfo->getForwardedMustTailRegParms();
+      CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
+                                               CC_AArch64_AAPCS);
+    }
   }
 
   unsigned StackArgSize = CCInfo.getNextStackOffset();
@@ -3608,6 +3619,14 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
   SmallVector<SDValue, 8> MemOpChains;
   auto PtrVT = getPointerTy(DAG.getDataLayout());
 
+  if (IsVarArg && CLI.CS && CLI.CS.isMustTailCall()) {
+    const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
+    for (const auto &F : Forwards) {
+      SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
+       RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
+    }
+  }
+
   // Walk the register/memloc assignments, inserting copies/loads.
   for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
        ++i, ++realArgIdx) {
diff --git a/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 63c0ba2811e..5183e7d3c0d 100644
--- a/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -18,6 +18,7 @@
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/MC/MCLinkerOptimizationHint.h"
 #include <cassert>
@@ -97,6 +98,9 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
   /// attribute, in which case it is set to false at construction.
   Optional<bool> HasRedZone;
 
+  /// ForwardedMustTailRegParms - A list of virtual and physical registers
+  /// that must be forwarded to every musttail call.
+  SmallVector<ForwardedRegister, 1> ForwardedMustTailRegParms;
 public:
   AArch64FunctionInfo() = default;
 
@@ -209,6 +213,10 @@ public:
     LOHRelated.insert(Args.begin(), Args.end());
   }
 
+  SmallVectorImpl<ForwardedRegister> &getForwardedMustTailRegParms() {
+    return ForwardedMustTailRegParms;
+  }
+
 private:
   // Hold the lists of LOHs.
   MILOHContainer LOHContainerSet;
diff --git a/test/CodeGen/AArch64/vararg-tallcall.ll b/test/CodeGen/AArch64/vararg-tallcall.ll
new file mode 100644
index 00000000000..28182226803
--- /dev/null
+++ b/test/CodeGen/AArch64/vararg-tallcall.ll
@@ -0,0 +1,34 @@
+; RUN: llc -mtriple=aarch64-windows-msvc %s -o - | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu %s -o - | FileCheck %s
+
+target datalayout = "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128"
+
+%class.X = type { i8 }
+%struct.B = type { i32 (...)** }
+
+$"??_9B@@$BA@AA" = comdat any
+
+; Function Attrs: noinline optnone
+define linkonce_odr void @"??_9B@@$BA@AA"(%struct.B* %this, ...) #1 comdat align 2  {
+entry:
+  %this.addr = alloca %struct.B*, align 8
+  store %struct.B* %this, %struct.B** %this.addr, align 8
+  %this1 = load %struct.B*, %struct.B** %this.addr, align 8
+  call void asm sideeffect "", "~{d0}"()
+  %0 = bitcast %struct.B* %this1 to void (%struct.B*, ...)***
+  %vtable = load void (%struct.B*, ...)**, void (%struct.B*, ...)*** %0, align 8
+  %vfn = getelementptr inbounds void (%struct.B*, ...)*, void (%struct.B*, ...)** %vtable, i64 0
+  %1 = load void (%struct.B*, ...)*, void (%struct.B*, ...)** %vfn, align 8
+  musttail call void (%struct.B*, ...) %1(%struct.B* %this1, ...)
+  ret void
+                                                  ; No predecessors!
+  ret void
+}
+
+attributes #1 = { noinline optnone "thunk" }
+
+; CHECK: mov     v16.16b, v0.16b
+; CHECK: ldr     x8, [x0]
+; CHECK: ldr     x8, [x8]
+; CHECK: mov     v0.16b, v16.16b
+; CHECK: br      x8
-- 
GitLab


From bf4357523a6d5a4ce4867f00b8a4e7ba8107a0ca Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 30 Oct 2018 20:46:23 +0000
Subject: [PATCH 0768/1116] [x86] try to make test immune to better div
 optimization; NFCI

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345642 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/copy-eflags.ll | 43 ++++++++++++++-------------------
 1 file changed, 18 insertions(+), 25 deletions(-)

diff --git a/test/CodeGen/X86/copy-eflags.ll b/test/CodeGen/X86/copy-eflags.ll
index 836027f47bf..1e9a598c651 100644
--- a/test/CodeGen/X86/copy-eflags.ll
+++ b/test/CodeGen/X86/copy-eflags.ll
@@ -200,45 +200,37 @@ else:
 ; Test a function that gets special select lowering into CFG with copied EFLAGS
 ; threaded across the CFG. This requires our EFLAGS copy rewriting to handle
 ; cross-block rewrites in at least some narrow cases.
-define void @PR37100(i8 %arg1, i16 %arg2, i64 %arg3, i8 %arg4, i8* %ptr1, i32* %ptr2) {
+define void @PR37100(i8 %arg1, i16 %arg2, i64 %arg3, i8 %arg4, i8* %ptr1, i32* %ptr2, i32 %x) nounwind {
 ; X32-LABEL: PR37100:
 ; X32:       # %bb.0: # %bb
 ; X32-NEXT:    pushl %ebp
-; X32-NEXT:    .cfi_def_cfa_offset 8
 ; X32-NEXT:    pushl %ebx
-; X32-NEXT:    .cfi_def_cfa_offset 12
 ; X32-NEXT:    pushl %edi
-; X32-NEXT:    .cfi_def_cfa_offset 16
 ; X32-NEXT:    pushl %esi
-; X32-NEXT:    .cfi_def_cfa_offset 20
-; X32-NEXT:    .cfi_offset %esi, -20
-; X32-NEXT:    .cfi_offset %edi, -16
-; X32-NEXT:    .cfi_offset %ebx, -12
-; X32-NEXT:    .cfi_offset %ebp, -8
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X32-NEXT:    movb {{[0-9]+}}(%esp), %ch
 ; X32-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X32-NEXT:    jmp .LBB3_1
 ; X32-NEXT:    .p2align 4, 0x90
 ; X32-NEXT:  .LBB3_5: # %bb1
 ; X32-NEXT:    # in Loop: Header=BB3_1 Depth=1
-; X32-NEXT:    xorl %eax, %eax
-; X32-NEXT:    xorl %edx, %edx
-; X32-NEXT:    idivl %ebp
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    cltd
+; X32-NEXT:    idivl %edi
 ; X32-NEXT:  .LBB3_1: # %bb1
 ; X32-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X32-NEXT:    movsbl %cl, %eax
 ; X32-NEXT:    movl %eax, %edx
 ; X32-NEXT:    sarl $31, %edx
-; X32-NEXT:    cmpl %eax, %esi
+; X32-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    sbbl %edx, %eax
 ; X32-NEXT:    setl %al
 ; X32-NEXT:    setl %dl
-; X32-NEXT:    movzbl %dl, %ebp
-; X32-NEXT:    negl %ebp
+; X32-NEXT:    movzbl %dl, %edi
+; X32-NEXT:    negl %edi
 ; X32-NEXT:    testb %al, %al
 ; X32-NEXT:    jne .LBB3_3
 ; X32-NEXT:  # %bb.2: # %bb1
@@ -246,33 +238,34 @@ define void @PR37100(i8 %arg1, i16 %arg2, i64 %arg3, i8 %arg4, i8* %ptr1, i32* %
 ; X32-NEXT:    movb %ch, %cl
 ; X32-NEXT:  .LBB3_3: # %bb1
 ; X32-NEXT:    # in Loop: Header=BB3_1 Depth=1
-; X32-NEXT:    movb %cl, (%ebx)
-; X32-NEXT:    movl (%edi), %edx
+; X32-NEXT:    movb %cl, (%ebp)
+; X32-NEXT:    movl (%ebx), %edx
 ; X32-NEXT:    testb %al, %al
 ; X32-NEXT:    jne .LBB3_5
 ; X32-NEXT:  # %bb.4: # %bb1
 ; X32-NEXT:    # in Loop: Header=BB3_1 Depth=1
-; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    jmp .LBB3_5
 ;
 ; X64-LABEL: PR37100:
 ; X64:       # %bb.0: # %bb
-; X64-NEXT:    movq %rdx, %r10
+; X64-NEXT:    movq %rdx, %r11
+; X64-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
 ; X64-NEXT:    jmp .LBB3_1
 ; X64-NEXT:    .p2align 4, 0x90
 ; X64-NEXT:  .LBB3_5: # %bb1
 ; X64-NEXT:    # in Loop: Header=BB3_1 Depth=1
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    movl %r10d, %eax
+; X64-NEXT:    cltd
 ; X64-NEXT:    idivl %esi
 ; X64-NEXT:  .LBB3_1: # %bb1
 ; X64-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X64-NEXT:    movsbq %dil, %rax
 ; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpq %rax, %r10
+; X64-NEXT:    cmpq %rax, %r11
 ; X64-NEXT:    setl %sil
 ; X64-NEXT:    negl %esi
-; X64-NEXT:    cmpq %rax, %r10
+; X64-NEXT:    cmpq %rax, %r11
 ; X64-NEXT:    jl .LBB3_3
 ; X64-NEXT:  # %bb.2: # %bb1
 ; X64-NEXT:    # in Loop: Header=BB3_1 Depth=1
@@ -300,7 +293,7 @@ bb1:
   store volatile i8 %tmp8, i8* %ptr1
   %tmp9 = load volatile i32, i32* %ptr2
   %tmp10 = select i1 %tmp6, i32 %tmp7, i32 %tmp9
-  %tmp11 = srem i32 0, %tmp10
+  %tmp11 = srem i32 %x, %tmp10
   %tmp12 = trunc i32 %tmp11 to i16
   br label %bb1
 }
-- 
GitLab


From 044ef75cd4e41c6ff066fd5ac179caf1334a222d Mon Sep 17 00:00:00 2001
From: Quentin Colombet <quentin.colombet@gmail.com>
Date: Tue, 30 Oct 2018 20:51:04 +0000
Subject: [PATCH 0769/1116] [InstCombine] Teach the move free before null test
 opti how to deal with noop casts

InstCombine features an optimization that essentially replaces:
if (a)
  free(a)
into:
free(a)

Right now, this optimization is gated by the minsize attribute and therefore
we only perform it if we can prove that we are going to be able to eliminate
the branch and the destination block.

However when casts are involved the optimization would fail to apply, because
the optimization was not smart enough to realize that it is possible to also
move the casts away from the destination block and that is harmless to the
performance since they are just noops.
E.g.,
foo(int *a)
if (a)
  free((char*)a)

Wouldn't be optimized by instcombine, because
- We would refuse to hoist the `bitcast i32* %a to i8` in the source block
- We would fail to see that `bitcast i32* %a to i8` and %a are the same value.

This patch fixes both these problems:
- It teaches the pattern matching of the comparison how to look
  through casts.
- It checks that whether the additional instruction in the destination block
  can be hoisted and are harmless performance-wise.
- It hoists all the code of the destination block in the source block.

Differential Revision: D53356

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345644 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstCombine/InstructionCombining.cpp      | 48 ++++++++++++++-----
 .../InstCombine/malloc-free-delete.ll         | 29 +++++++++++
 2 files changed, 65 insertions(+), 12 deletions(-)

diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
index 8506cf9baee..fd64cc58a1d 100644
--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -2322,14 +2322,14 @@ Instruction *InstCombiner::visitAllocSite(Instruction &MI) {
 /// The move is performed only if the block containing the call to free
 /// will be removed, i.e.:
 /// 1. it has only one predecessor P, and P has two successors
-/// 2. it contains the call and an unconditional branch
+/// 2. it contains the call, noops, and an unconditional branch
 /// 3. its successor is the same as its predecessor's successor
 ///
 /// The profitability is out-of concern here and this function should
 /// be called only if the caller knows this transformation would be
 /// profitable (e.g., for code size).
-static Instruction *
-tryToMoveFreeBeforeNullTest(CallInst &FI) {
+static Instruction *tryToMoveFreeBeforeNullTest(CallInst &FI,
+                                                const DataLayout &DL) {
   Value *Op = FI.getArgOperand(0);
   BasicBlock *FreeInstrBB = FI.getParent();
   BasicBlock *PredBB = FreeInstrBB->getSinglePredecessor();
@@ -2342,20 +2342,34 @@ tryToMoveFreeBeforeNullTest(CallInst &FI) {
     return nullptr;
 
   // Validate constraint #2: Does this block contains only the call to
-  //                         free and an unconditional branch?
-  // FIXME: We could check if we can speculate everything in the
-  //        predecessor block
-  if (FreeInstrBB->size() != 2)
-    return nullptr;
+  //                         free, noops, and an unconditional branch?
   BasicBlock *SuccBB;
-  if (!match(FreeInstrBB->getTerminator(), m_UnconditionalBr(SuccBB)))
+  Instruction *FreeInstrBBTerminator = FreeInstrBB->getTerminator();
+  if (!match(FreeInstrBBTerminator, m_UnconditionalBr(SuccBB)))
     return nullptr;
 
+  // If there are only 2 instructions in the block, at this point,
+  // this is the call to free and unconditional.
+  // If there are more than 2 instructions, check that they are noops
+  // i.e., they won't hurt the performance of the generated code.
+  if (FreeInstrBB->size() != 2) {
+    for (const Instruction &Inst : *FreeInstrBB) {
+      if (&Inst == &FI || &Inst == FreeInstrBBTerminator)
+        continue;
+      auto *Cast = dyn_cast<CastInst>(&Inst);
+      if (!Cast || !Cast->isNoopCast(DL))
+        return nullptr;
+    }
+  }
   // Validate the rest of constraint #1 by matching on the pred branch.
   Instruction *TI = PredBB->getTerminator();
   BasicBlock *TrueBB, *FalseBB;
   ICmpInst::Predicate Pred;
-  if (!match(TI, m_Br(m_ICmp(Pred, m_Specific(Op), m_Zero()), TrueBB, FalseBB)))
+  if (!match(TI, m_Br(m_ICmp(Pred,
+                             m_CombineOr(m_Specific(Op),
+                                         m_Specific(Op->stripPointerCasts())),
+                             m_Zero()),
+                      TrueBB, FalseBB)))
     return nullptr;
   if (Pred != ICmpInst::ICMP_EQ && Pred != ICmpInst::ICMP_NE)
     return nullptr;
@@ -2366,7 +2380,17 @@ tryToMoveFreeBeforeNullTest(CallInst &FI) {
   assert(FreeInstrBB == (Pred == ICmpInst::ICMP_EQ ? FalseBB : TrueBB) &&
          "Broken CFG: missing edge from predecessor to successor");
 
-  FI.moveBefore(TI);
+  // At this point, we know that everything in FreeInstrBB can be moved
+  // before TI.
+  for (BasicBlock::iterator It = FreeInstrBB->begin(), End = FreeInstrBB->end();
+       It != End;) {
+    Instruction &Instr = *It++;
+    if (&Instr == FreeInstrBBTerminator)
+      break;
+    Instr.moveBefore(TI);
+  }
+  assert(FreeInstrBB->size() == 1 &&
+         "Only the branch instruction should remain");
   return &FI;
 }
 
@@ -2393,7 +2417,7 @@ Instruction *InstCombiner::visitFree(CallInst &FI) {
   // into
   // free(foo);
   if (MinimizeSize)
-    if (Instruction *I = tryToMoveFreeBeforeNullTest(FI))
+    if (Instruction *I = tryToMoveFreeBeforeNullTest(FI, DL))
       return I;
 
   return nullptr;
diff --git a/test/Transforms/InstCombine/malloc-free-delete.ll b/test/Transforms/InstCombine/malloc-free-delete.ll
index e66151025b5..7e7b6d9aee5 100644
--- a/test/Transforms/InstCombine/malloc-free-delete.ll
+++ b/test/Transforms/InstCombine/malloc-free-delete.ll
@@ -257,3 +257,32 @@ define void @test11() {
   call void @_ZdlPv(i8* %call)
   ret void
 }
+
+;; Check that the optimization that moves a call to free in its predecessor
+;; block (see test6) also happens when noop casts are involved.
+; CHECK-LABEL: @test12(
+define void @test12(i32* %foo) minsize {
+; CHECK:  %tobool = icmp eq i32* %foo, null
+;; Everything before the call to free should have been moved as well.
+; CHECK-NEXT:   %bitcast = bitcast i32* %foo to i8*
+;; Call to free moved
+; CHECK-NEXT: tail call void @free(i8* %bitcast)
+; CHECK-NEXT: br i1 %tobool, label %if.end, label %if.then
+; CHECK: if.then:
+;; Block is now empty and may be simplified by simplifycfg
+; CHECK-NEXT:   br label %if.end
+; CHECK: if.end:
+; CHECK-NEXT:  ret void
+entry:
+  %tobool = icmp eq i32* %foo, null
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %bitcast = bitcast i32* %foo to i8*
+  tail call void @free(i8* %bitcast)
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  ret void
+}
+
-- 
GitLab


From ef9778473933112da9030ebf19e20b6a6fe57f4d Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 30 Oct 2018 20:52:25 +0000
Subject: [PATCH 0770/1116] [InstCombine] use 'match' to reduce code; NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345647 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstCombine/InstCombineCompares.cpp       | 182 +++++++++---------
 1 file changed, 90 insertions(+), 92 deletions(-)

diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index afc2175992f..d7af4b88a81 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -5357,104 +5357,102 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
         return nullptr;
     }
 
-  // Handle fcmp with constant RHS
-  if (Constant *RHSC = dyn_cast<Constant>(Op1)) {
-    if (Instruction *LHSI = dyn_cast<Instruction>(Op0))
-      switch (LHSI->getOpcode()) {
-      case Instruction::FPExt: {
-        // fcmp (fpext x), C -> fcmp x, (fptrunc C) if fptrunc is lossless
-        FPExtInst *LHSExt = cast<FPExtInst>(LHSI);
-        ConstantFP *RHSF = dyn_cast<ConstantFP>(RHSC);
-        if (!RHSF)
-          break;
-
-        const fltSemantics &FPSem = LHSExt->getSrcTy()->getFltSemantics();
-        bool Lossy;
-        APFloat F = RHSF->getValueAPF();
-        F.convert(FPSem, APFloat::rmNearestTiesToEven, &Lossy);
-
-        // Avoid lossy conversions and denormals. Zero is a special case
-        // that's OK to convert.
-        APFloat Fabs = F;
-        Fabs.clearSign();
-        if (!Lossy &&
-            ((Fabs.compare(APFloat::getSmallestNormalized(FPSem)) !=
-                 APFloat::cmpLessThan) || Fabs.isZero()))
-
-          return new FCmpInst(Pred, LHSExt->getOperand(0),
-                              ConstantFP::get(RHSC->getContext(), F));
+  // Handle fcmp with instruction LHS and constant RHS.
+  Instruction *LHSI;
+  Constant *RHSC;
+  if (match(Op0, m_Instruction(LHSI)) && match(Op1, m_Constant(RHSC))) {
+    switch (LHSI->getOpcode()) {
+    case Instruction::FPExt: {
+      // fcmp (fpext x), C -> fcmp x, (fptrunc C) if fptrunc is lossless
+      FPExtInst *LHSExt = cast<FPExtInst>(LHSI);
+      ConstantFP *RHSF = dyn_cast<ConstantFP>(RHSC);
+      if (!RHSF)
         break;
-      }
-      case Instruction::PHI:
-        // Only fold fcmp into the PHI if the phi and fcmp are in the same
-        // block.  If in the same block, we're encouraging jump threading.  If
-        // not, we are just pessimizing the code by making an i1 phi.
-        if (LHSI->getParent() == I.getParent())
-          if (Instruction *NV = foldOpIntoPhi(I, cast<PHINode>(LHSI)))
-            return NV;
-        break;
-      case Instruction::SIToFP:
-      case Instruction::UIToFP:
-        if (Instruction *NV = foldFCmpIntToFPConst(I, LHSI, RHSC))
-          return NV;
-        break;
-      case Instruction::FSub: {
-        // fcmp pred (fneg x), C -> fcmp swap(pred) x, -C
-        Value *Op;
-        if (match(LHSI, m_FNeg(m_Value(Op))))
-          return new FCmpInst(I.getSwappedPredicate(), Op,
-                              ConstantExpr::getFNeg(RHSC));
-        break;
-      }
-      case Instruction::FDiv:
-        if (Instruction *NV = foldFCmpReciprocalAndZero(I, LHSI, RHSC))
+
+      const fltSemantics &FPSem = LHSExt->getSrcTy()->getFltSemantics();
+      bool Lossy;
+      APFloat F = RHSF->getValueAPF();
+      F.convert(FPSem, APFloat::rmNearestTiesToEven, &Lossy);
+
+      // Avoid lossy conversions and denormals.
+      // Zero is a special case that's OK to convert.
+      APFloat Fabs = F;
+      Fabs.clearSign();
+      if (!Lossy &&
+          ((Fabs.compare(APFloat::getSmallestNormalized(FPSem)) !=
+                APFloat::cmpLessThan) || Fabs.isZero()))
+        return new FCmpInst(Pred, LHSExt->getOperand(0),
+                            ConstantFP::get(RHSC->getContext(), F));
+      break;
+    }
+    case Instruction::PHI:
+      // Only fold fcmp into the PHI if the phi and fcmp are in the same
+      // block.  If in the same block, we're encouraging jump threading.  If
+      // not, we are just pessimizing the code by making an i1 phi.
+      if (LHSI->getParent() == I.getParent())
+        if (Instruction *NV = foldOpIntoPhi(I, cast<PHINode>(LHSI)))
           return NV;
+      break;
+    case Instruction::SIToFP:
+    case Instruction::UIToFP:
+      if (Instruction *NV = foldFCmpIntToFPConst(I, LHSI, RHSC))
+        return NV;
+      break;
+    case Instruction::FSub: {
+      // fcmp pred (fneg x), C -> fcmp swap(pred) x, -C
+      Value *Op;
+      if (match(LHSI, m_FNeg(m_Value(Op))))
+        return new FCmpInst(I.getSwappedPredicate(), Op,
+                            ConstantExpr::getFNeg(RHSC));
+      break;
+    }
+    case Instruction::FDiv:
+      if (Instruction *NV = foldFCmpReciprocalAndZero(I, LHSI, RHSC))
+        return NV;
+      break;
+    case Instruction::Load:
+      if (auto *GEP = dyn_cast<GetElementPtrInst>(LHSI->getOperand(0)))
+        if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
+          if (GV->isConstant() && GV->hasDefinitiveInitializer() &&
+              !cast<LoadInst>(LHSI)->isVolatile())
+            if (Instruction *Res = foldCmpLoadFromIndexedGlobal(GEP, GV, I))
+              return Res;
+      break;
+    case Instruction::Call: {
+      if (!RHSC->isNullValue())
         break;
-      case Instruction::Load:
-        if (GetElementPtrInst *GEP =
-            dyn_cast<GetElementPtrInst>(LHSI->getOperand(0))) {
-          if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
-            if (GV->isConstant() && GV->hasDefinitiveInitializer() &&
-                !cast<LoadInst>(LHSI)->isVolatile())
-              if (Instruction *Res = foldCmpLoadFromIndexedGlobal(GEP, GV, I))
-                return Res;
-        }
-        break;
-      case Instruction::Call: {
-        if (!RHSC->isNullValue())
-          break;
 
-        CallInst *CI = cast<CallInst>(LHSI);
-        Intrinsic::ID IID = getIntrinsicForCallSite(CI, &TLI);
-        if (IID != Intrinsic::fabs)
-          break;
+      CallInst *CI = cast<CallInst>(LHSI);
+      Intrinsic::ID IID = getIntrinsicForCallSite(CI, &TLI);
+      if (IID != Intrinsic::fabs)
+        break;
 
-        // Various optimization for fabs compared with zero.
-        switch (Pred) {
-        default:
-          break;
-        // fabs(x) < 0 --> false
-        case FCmpInst::FCMP_OLT:
-          llvm_unreachable("handled by SimplifyFCmpInst");
-        // fabs(x) > 0 --> x != 0
-        case FCmpInst::FCMP_OGT:
-          return new FCmpInst(FCmpInst::FCMP_ONE, CI->getArgOperand(0), RHSC);
-        // fabs(x) <= 0 --> x == 0
-        case FCmpInst::FCMP_OLE:
-          return new FCmpInst(FCmpInst::FCMP_OEQ, CI->getArgOperand(0), RHSC);
-        // fabs(x) >= 0 --> !isnan(x)
-        case FCmpInst::FCMP_OGE:
-          return new FCmpInst(FCmpInst::FCMP_ORD, CI->getArgOperand(0), RHSC);
-        // fabs(x) == 0 --> x == 0
-        // fabs(x) != 0 --> x != 0
-        case FCmpInst::FCMP_OEQ:
-        case FCmpInst::FCMP_UEQ:
-        case FCmpInst::FCMP_ONE:
-        case FCmpInst::FCMP_UNE:
-          return new FCmpInst(Pred, CI->getArgOperand(0), RHSC);
-        }
-      }
+      // Various optimization for fabs compared with zero.
+      switch (Pred) {
+      default:
+        break;
+      // fabs(x) < 0 --> false
+      case FCmpInst::FCMP_OLT:
+        llvm_unreachable("handled by SimplifyFCmpInst");
+      // fabs(x) > 0 --> x != 0
+      case FCmpInst::FCMP_OGT:
+        return new FCmpInst(FCmpInst::FCMP_ONE, CI->getArgOperand(0), RHSC);
+      // fabs(x) <= 0 --> x == 0
+      case FCmpInst::FCMP_OLE:
+        return new FCmpInst(FCmpInst::FCMP_OEQ, CI->getArgOperand(0), RHSC);
+      // fabs(x) >= 0 --> !isnan(x)
+      case FCmpInst::FCMP_OGE:
+        return new FCmpInst(FCmpInst::FCMP_ORD, CI->getArgOperand(0), RHSC);
+      // fabs(x) == 0 --> x == 0
+      // fabs(x) != 0 --> x != 0
+      case FCmpInst::FCMP_OEQ:
+      case FCmpInst::FCMP_UEQ:
+      case FCmpInst::FCMP_ONE:
+      case FCmpInst::FCMP_UNE:
+        return new FCmpInst(Pred, CI->getArgOperand(0), RHSC);
       }
+    }
+    }
   }
 
   // fcmp pred (fneg x), (fneg y) -> fcmp swap(pred) x, y
-- 
GitLab


From 3277e77bfd3cacd2edf930801ab74ea68b48c6ed Mon Sep 17 00:00:00 2001
From: Cameron McInally <cameron.mcinally@nyu.edu>
Date: Tue, 30 Oct 2018 21:01:29 +0000
Subject: [PATCH 0771/1116] [FPEnv] [FPEnv] Add constrained intrinsics for
 MAXNUM and MINNUM

Differential Revision: https://reviews.llvm.org/D53216


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345650 91177308-0d34-0410-b5e6-96231b3b80d8
---
 docs/LangRef.rst                              |  74 +++
 include/llvm/CodeGen/ISDOpcodes.h             |   2 +-
 include/llvm/CodeGen/SelectionDAGNodes.h      |   2 +
 include/llvm/CodeGen/TargetLowering.h         |   2 +
 include/llvm/IR/IntrinsicInst.h               |   2 +
 include/llvm/IR/Intrinsics.td                 |  10 +
 lib/CodeGen/SelectionDAG/LegalizeDAG.cpp      |   4 +
 .../SelectionDAG/LegalizeVectorOps.cpp        |   4 +
 .../SelectionDAG/LegalizeVectorTypes.cpp      |   6 +
 lib/CodeGen/SelectionDAG/SelectionDAG.cpp     |   2 +
 .../SelectionDAG/SelectionDAGBuilder.cpp      |   8 +
 .../SelectionDAG/SelectionDAGDumper.cpp       |   2 +
 lib/IR/Verifier.cpp                           |   2 +
 .../X86/vector-constrained-fp-intrinsics.ll   | 519 ++++++++++++++++++
 14 files changed, 638 insertions(+), 1 deletion(-)

diff --git a/docs/LangRef.rst b/docs/LangRef.rst
index d396e3f1cbf..39134fafd46 100644
--- a/docs/LangRef.rst
+++ b/docs/LangRef.rst
@@ -14485,6 +14485,80 @@ mode is determined by the runtime floating-point environment.  The rounding
 mode argument is only intended as information to the compiler.
 
 
+'``llvm.experimental.constrained.maxnum``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <type>
+      @llvm.experimental.constrained.maxnum(<type> <op1>, <type> <op2>
+                                            metadata <rounding mode>,
+                                            metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.maxnum``' intrinsic returns the maximum 
+of the two arguments.
+
+Arguments:
+""""""""""
+
+The first two arguments and the return value are floating-point numbers 
+of the same type.
+
+The third and forth arguments specify the rounding mode and exception
+behavior as described above.
+
+Semantics:
+""""""""""
+
+This function follows the IEEE-754 semantics for maxNum. The rounding mode is
+described, not determined, by the rounding mode argument. The actual rounding
+mode is determined by the runtime floating-point environment. The rounding
+mode argument is only intended as information to the compiler.
+
+
+'``llvm.experimental.constrained.minnum``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <type>
+      @llvm.experimental.constrained.minnum(<type> <op1>, <type> <op2>
+                                            metadata <rounding mode>,
+                                            metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.minnum``' intrinsic returns the minimum
+of the two arguments.
+
+Arguments:
+""""""""""
+
+The first two arguments and the return value are floating-point numbers
+of the same type.
+
+The third and forth arguments specify the rounding mode and exception
+behavior as described above.
+
+Semantics:
+""""""""""
+
+This function follows the IEEE-754 semantics for minNum. The rounding mode is
+described, not determined, by the rounding mode argument. The actual rounding
+mode is determined by the runtime floating-point environment. The rounding
+mode argument is only intended as information to the compiler.
+
+
 General Intrinsics
 ------------------
 
diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h
index b8e3129ed6c..a023aa5b3f6 100644
--- a/include/llvm/CodeGen/ISDOpcodes.h
+++ b/include/llvm/CodeGen/ISDOpcodes.h
@@ -288,7 +288,7 @@ namespace ISD {
     /// They are used to limit optimizations while the DAG is being optimized.
     STRICT_FSQRT, STRICT_FPOW, STRICT_FPOWI, STRICT_FSIN, STRICT_FCOS,
     STRICT_FEXP, STRICT_FEXP2, STRICT_FLOG, STRICT_FLOG10, STRICT_FLOG2,
-    STRICT_FRINT, STRICT_FNEARBYINT,
+    STRICT_FRINT, STRICT_FNEARBYINT, STRICT_FMAXNUM, STRICT_FMINNUM,
 
     /// FMA - Perform a * b + c with no intermediate rounding step.
     FMA,
diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h
index 28d27b7a459..262c7b7b58c 100644
--- a/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -672,6 +672,8 @@ public:
       case ISD::STRICT_FLOG2:
       case ISD::STRICT_FRINT:
       case ISD::STRICT_FNEARBYINT:
+      case ISD::STRICT_FMAXNUM:
+      case ISD::STRICT_FMINNUM:
         return true;
     }
   }
diff --git a/include/llvm/CodeGen/TargetLowering.h b/include/llvm/CodeGen/TargetLowering.h
index a4356db800a..7a779f0b433 100644
--- a/include/llvm/CodeGen/TargetLowering.h
+++ b/include/llvm/CodeGen/TargetLowering.h
@@ -819,6 +819,8 @@ public:
       case ISD::STRICT_FLOG2: EqOpc = ISD::FLOG2; break;
       case ISD::STRICT_FRINT: EqOpc = ISD::FRINT; break;
       case ISD::STRICT_FNEARBYINT: EqOpc = ISD::FNEARBYINT; break;
+      case ISD::STRICT_FMAXNUM: EqOpc = ISD::FMAXNUM; break;
+      case ISD::STRICT_FMINNUM: EqOpc = ISD::FMINNUM; break;
     }
 
     auto Action = getOperationAction(EqOpc, VT);
diff --git a/include/llvm/IR/IntrinsicInst.h b/include/llvm/IR/IntrinsicInst.h
index 32a62a4cafc..54e344d829a 100644
--- a/include/llvm/IR/IntrinsicInst.h
+++ b/include/llvm/IR/IntrinsicInst.h
@@ -251,6 +251,8 @@ namespace llvm {
       case Intrinsic::experimental_constrained_log2:
       case Intrinsic::experimental_constrained_rint:
       case Intrinsic::experimental_constrained_nearbyint:
+      case Intrinsic::experimental_constrained_maxnum:
+      case Intrinsic::experimental_constrained_minnum:
         return true;
       default: return false;
       }
diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td
index 989a04d65de..47a66a27e38 100644
--- a/include/llvm/IR/Intrinsics.td
+++ b/include/llvm/IR/Intrinsics.td
@@ -565,6 +565,16 @@ let IntrProperties = [IntrInaccessibleMemOnly] in {
                                                          [ LLVMMatchType<0>,
                                                            llvm_metadata_ty,
                                                            llvm_metadata_ty ]>;
+  def int_experimental_constrained_maxnum : Intrinsic<[ llvm_anyfloat_ty ],
+                                                      [ LLVMMatchType<0>,
+                                                        LLVMMatchType<0>,
+                                                        llvm_metadata_ty,
+                                                        llvm_metadata_ty ]>;
+  def int_experimental_constrained_minnum : Intrinsic<[ llvm_anyfloat_ty ],
+                                                      [ LLVMMatchType<0>,
+                                                        LLVMMatchType<0>,
+                                                        llvm_metadata_ty,
+                                                        llvm_metadata_ty ]>;
 }
 // FIXME: Add intrinsics for fcmp, fptrunc, fpext, fptoui and fptosi.
 // FIXME: Add intrinsics for fabs, copysign, floor, ceil, trunc and round?
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 6d9e69e2d64..65f78773241 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1107,6 +1107,8 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
   case ISD::STRICT_FLOG2:
   case ISD::STRICT_FRINT:
   case ISD::STRICT_FNEARBYINT:
+  case ISD::STRICT_FMAXNUM:
+  case ISD::STRICT_FMINNUM:
     // These pseudo-ops get legalized as if they were their non-strict
     // equivalent.  For instance, if ISD::FSQRT is legal then ISD::STRICT_FSQRT
     // is also legal, but if ISD::FSQRT requires expansion then so does
@@ -3833,11 +3835,13 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
     break;
   }
   case ISD::FMINNUM:
+  case ISD::STRICT_FMINNUM:
     Results.push_back(ExpandFPLibCall(Node, RTLIB::FMIN_F32, RTLIB::FMIN_F64,
                                       RTLIB::FMIN_F80, RTLIB::FMIN_F128,
                                       RTLIB::FMIN_PPCF128));
     break;
   case ISD::FMAXNUM:
+  case ISD::STRICT_FMAXNUM:
     Results.push_back(ExpandFPLibCall(Node, RTLIB::FMAX_F32, RTLIB::FMAX_F64,
                                       RTLIB::FMAX_F80, RTLIB::FMAX_F128,
                                       RTLIB::FMAX_PPCF128));
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 109276a5cbb..1b68f217590 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -305,6 +305,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
   case ISD::STRICT_FLOG2:
   case ISD::STRICT_FRINT:
   case ISD::STRICT_FNEARBYINT:
+  case ISD::STRICT_FMAXNUM:
+  case ISD::STRICT_FMINNUM:
     // These pseudo-ops get legalized as if they were their non-strict
     // equivalent.  For instance, if ISD::FSQRT is legal then ISD::STRICT_FSQRT
     // is also legal, but if ISD::FSQRT requires expansion then so does
@@ -751,6 +753,8 @@ SDValue VectorLegalizer::Expand(SDValue Op) {
   case ISD::STRICT_FLOG2:
   case ISD::STRICT_FRINT:
   case ISD::STRICT_FNEARBYINT:
+  case ISD::STRICT_FMAXNUM:
+  case ISD::STRICT_FMINNUM:
     return ExpandStrictFPOp(Op);
   default:
     return DAG.UnrollVectorOp(Op.getNode());
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index e7ad25155eb..59bd751f4ec 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -164,6 +164,8 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::STRICT_FLOG2:
   case ISD::STRICT_FRINT:
   case ISD::STRICT_FNEARBYINT:
+  case ISD::STRICT_FMAXNUM:
+  case ISD::STRICT_FMINNUM:
     R = ScalarizeVecRes_StrictFPOp(N);
     break;
   }
@@ -834,6 +836,8 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::STRICT_FLOG2:
   case ISD::STRICT_FRINT:
   case ISD::STRICT_FNEARBYINT:
+  case ISD::STRICT_FMAXNUM:
+  case ISD::STRICT_FMINNUM:
     SplitVecRes_StrictFPOp(N, Lo, Hi);
     break;
   }
@@ -2400,6 +2404,8 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::STRICT_FLOG2:
   case ISD::STRICT_FRINT:
   case ISD::STRICT_FNEARBYINT:
+  case ISD::STRICT_FMAXNUM:
+  case ISD::STRICT_FMINNUM:
     Res = WidenVecRes_StrictFP(N);
     break;
 
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 84e955fd6f6..4d509c99c2e 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -7404,6 +7404,8 @@ SDNode* SelectionDAG::mutateStrictFPToFP(SDNode *Node) {
     NewOpc = ISD::FNEARBYINT;
     IsUnary = true;
     break;
+  case ISD::STRICT_FMAXNUM: NewOpc = ISD::FMAXNUM; break;
+  case ISD::STRICT_FMINNUM: NewOpc = ISD::FMINNUM; break;
   }
 
   // We're taking this node out of the chain, so we need to re-link things.
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index ddead1d93a5..5ecb2abbcbf 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5627,6 +5627,8 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
   case Intrinsic::experimental_constrained_log2:
   case Intrinsic::experimental_constrained_rint:
   case Intrinsic::experimental_constrained_nearbyint:
+  case Intrinsic::experimental_constrained_maxnum:
+  case Intrinsic::experimental_constrained_minnum:
     visitConstrainedFPIntrinsic(cast<ConstrainedFPIntrinsic>(I));
     return nullptr;
   case Intrinsic::fmuladd: {
@@ -6374,6 +6376,12 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic(
   case Intrinsic::experimental_constrained_nearbyint:
     Opcode = ISD::STRICT_FNEARBYINT;
     break;
+  case Intrinsic::experimental_constrained_maxnum:
+    Opcode = ISD::STRICT_FMAXNUM;
+    break;
+  case Intrinsic::experimental_constrained_minnum:
+    Opcode = ISD::STRICT_FMINNUM;
+    break;
   }
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SDValue Chain = getRoot();
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index bae163d5386..5c17a5d295d 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -175,7 +175,9 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   // Unary operators
   case ISD::FABS:                       return "fabs";
   case ISD::FMINNUM:                    return "fminnum";
+  case ISD::STRICT_FMINNUM:             return "strict_fminnum";
   case ISD::FMAXNUM:                    return "fmaxnum";
+  case ISD::STRICT_FMAXNUM:             return "strict_fmaxnum";
   case ISD::FMINNUM_IEEE:               return "fminnum_ieee";
   case ISD::FMAXNUM_IEEE:               return "fmaxnum_ieee";
   case ISD::FMINIMUM:                   return "fminimum";
diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp
index 3c6defdfde3..7c0381a7222 100644
--- a/lib/IR/Verifier.cpp
+++ b/lib/IR/Verifier.cpp
@@ -4104,6 +4104,8 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) {
   case Intrinsic::experimental_constrained_log2:
   case Intrinsic::experimental_constrained_rint:
   case Intrinsic::experimental_constrained_nearbyint:
+  case Intrinsic::experimental_constrained_maxnum:
+  case Intrinsic::experimental_constrained_minnum:
     visitConstrainedFPIntrinsic(
         cast<ConstrainedFPIntrinsic>(*CS.getInstruction()));
     break;
diff --git a/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
index 44c19483154..ad07e9ba12c 100644
--- a/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
+++ b/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
@@ -3668,6 +3668,515 @@ entry:
   ret <4 x double> %nearby
 }
 
+define <1 x float> @constrained_vector_maxnum_v1f32() {
+; NO-FMA-LABEL: constrained_vector_maxnum_v1f32:
+; NO-FMA:       # %bb.0: # %entry
+; NO-FMA-NEXT:    pushq %rax
+; NO-FMA-NEXT:    .cfi_def_cfa_offset 16
+; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; NO-FMA-NEXT:    callq fmaxf
+; NO-FMA-NEXT:    popq %rax
+; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
+; NO-FMA-NEXT:    retq
+;
+; HAS-FMA-LABEL: constrained_vector_maxnum_v1f32:
+; HAS-FMA:       # %bb.0: # %entry
+; HAS-FMA-NEXT:    pushq %rax
+; HAS-FMA-NEXT:    .cfi_def_cfa_offset 16
+; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; HAS-FMA-NEXT:    callq fmaxf
+; HAS-FMA-NEXT:    popq %rax
+; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
+; HAS-FMA-NEXT:    retq
+entry:
+  %max = call <1 x float> @llvm.experimental.constrained.maxnum.v1f32(
+                               <1 x float> <float 42.0>, <1 x float> <float 41.0>,
+                               metadata !"round.dynamic",
+                               metadata !"fpexcept.strict")
+  ret <1 x float> %max
+}
+
+define <2 x double> @constrained_vector_maxnum_v2f64() {
+; NO-FMA-LABEL: constrained_vector_maxnum_v2f64:
+; NO-FMA:       # %bb.0: # %entry
+; NO-FMA-NEXT:    subq $24, %rsp
+; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
+; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; NO-FMA-NEXT:    callq fmax
+; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; NO-FMA-NEXT:    callq fmax
+; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
+; NO-FMA-NEXT:    addq $24, %rsp
+; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
+; NO-FMA-NEXT:    retq
+;
+; HAS-FMA-LABEL: constrained_vector_maxnum_v2f64:
+; HAS-FMA:       # %bb.0: # %entry
+; HAS-FMA-NEXT:    subq $24, %rsp
+; HAS-FMA-NEXT:    .cfi_def_cfa_offset 32
+; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; HAS-FMA-NEXT:    callq fmax
+; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; HAS-FMA-NEXT:    callq fmax
+; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
+; HAS-FMA-NEXT:    addq $24, %rsp
+; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
+; HAS-FMA-NEXT:    retq
+entry:
+  %max = call <2 x double> @llvm.experimental.constrained.maxnum.v2f64(
+                                <2 x double> <double 43.0, double 42.0>,
+                                <2 x double> <double 41.0, double 40.0>,
+                                metadata !"round.dynamic",
+                                metadata !"fpexcept.strict")
+  ret <2 x double> %max
+}
+
+define <3 x float> @constrained_vector_maxnum_v3f32() {
+; NO-FMA-LABEL: constrained_vector_maxnum_v3f32:
+; NO-FMA:       # %bb.0: # %entry
+; NO-FMA-NEXT:    subq $40, %rsp
+; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
+; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; NO-FMA-NEXT:    callq fmaxf
+; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; NO-FMA-NEXT:    callq fmaxf
+; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; NO-FMA-NEXT:    callq fmaxf
+; NO-FMA-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; NO-FMA-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
+; NO-FMA-NEXT:    movaps %xmm1, %xmm0
+; NO-FMA-NEXT:    addq $40, %rsp
+; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
+; NO-FMA-NEXT:    retq
+;
+; HAS-FMA-LABEL: constrained_vector_maxnum_v3f32:
+; HAS-FMA:       # %bb.0: # %entry
+; HAS-FMA-NEXT:    subq $40, %rsp
+; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
+; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; HAS-FMA-NEXT:    callq fmaxf
+; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; HAS-FMA-NEXT:    callq fmaxf
+; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; HAS-FMA-NEXT:    callq fmaxf
+; HAS-FMA-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; HAS-FMA-NEXT:    vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; HAS-FMA-NEXT:    # xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; HAS-FMA-NEXT:    addq $40, %rsp
+; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
+; HAS-FMA-NEXT:    retq
+entry:
+  %max = call <3 x float> @llvm.experimental.constrained.maxnum.v3f32(
+                              <3 x float> <float 43.0, float 44.0, float 45.0>,
+                              <3 x float> <float 41.0, float 42.0, float 43.0>,
+                              metadata !"round.dynamic",
+                              metadata !"fpexcept.strict")
+  ret <3 x float> %max
+}
+
+define <3 x double> @constrained_vector_max_v3f64() {
+; NO-FMA-LABEL: constrained_vector_max_v3f64:
+; NO-FMA:       # %bb.0: # %entry
+; NO-FMA-NEXT:    subq $24, %rsp
+; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
+; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; NO-FMA-NEXT:    callq fmax
+; NO-FMA-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; NO-FMA-NEXT:    callq fmax
+; NO-FMA-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
+; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; NO-FMA-NEXT:    callq fmax
+; NO-FMA-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
+; NO-FMA-NEXT:    fldl {{[0-9]+}}(%rsp)
+; NO-FMA-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; NO-FMA-NEXT:    # xmm0 = mem[0],zero
+; NO-FMA-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
+; NO-FMA-NEXT:    # xmm1 = mem[0],zero
+; NO-FMA-NEXT:    addq $24, %rsp
+; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
+; NO-FMA-NEXT:    retq
+;
+; HAS-FMA-LABEL: constrained_vector_max_v3f64:
+; HAS-FMA:       # %bb.0: # %entry
+; HAS-FMA-NEXT:    subq $56, %rsp
+; HAS-FMA-NEXT:    .cfi_def_cfa_offset 64
+; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; HAS-FMA-NEXT:    callq fmax
+; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; HAS-FMA-NEXT:    callq fmax
+; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
+; HAS-FMA-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
+; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; HAS-FMA-NEXT:    vzeroupper
+; HAS-FMA-NEXT:    callq fmax
+; HAS-FMA-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
+; HAS-FMA-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; HAS-FMA-NEXT:    addq $56, %rsp
+; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
+; HAS-FMA-NEXT:    retq
+entry:
+  %max = call <3 x double> @llvm.experimental.constrained.maxnum.v3f64(
+                          <3 x double> <double 43.0, double 44.0, double 45.0>,
+                          <3 x double> <double 40.0, double 41.0, double 42.0>,
+                          metadata !"round.dynamic",
+                          metadata !"fpexcept.strict")
+  ret <3 x double> %max
+}
+
+define <4 x double> @constrained_vector_maxnum_v4f64() {
+; NO-FMA-LABEL: constrained_vector_maxnum_v4f64:
+; NO-FMA:       # %bb.0: # %entry
+; NO-FMA-NEXT:    subq $40, %rsp
+; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
+; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; NO-FMA-NEXT:    callq fmax
+; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; NO-FMA-NEXT:    callq fmax
+; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
+; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; NO-FMA-NEXT:    callq fmax
+; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; NO-FMA-NEXT:    callq fmax
+; NO-FMA-NEXT:    movaps %xmm0, %xmm1
+; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
+; NO-FMA-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; NO-FMA-NEXT:    addq $40, %rsp
+; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
+; NO-FMA-NEXT:    retq
+;
+; HAS-FMA-LABEL: constrained_vector_maxnum_v4f64:
+; HAS-FMA:       # %bb.0: # %entry
+; HAS-FMA-NEXT:    subq $40, %rsp
+; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
+; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; HAS-FMA-NEXT:    callq fmax
+; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; HAS-FMA-NEXT:    callq fmax
+; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
+; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; HAS-FMA-NEXT:    callq fmax
+; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; HAS-FMA-NEXT:    callq fmax
+; HAS-FMA-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
+; HAS-FMA-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
+; HAS-FMA-NEXT:    addq $40, %rsp
+; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
+; HAS-FMA-NEXT:    retq
+entry:
+  %max = call <4 x double> @llvm.experimental.constrained.maxnum.v4f64(
+                                <4 x double> <double 44.0, double 45.0,
+                                              double 46.0, double 47.0>,
+                                <4 x double> <double 40.0, double 41.0,
+                                              double 42.0, double 43.0>,
+                                metadata !"round.dynamic",
+                                metadata !"fpexcept.strict")
+  ret <4 x double> %max
+}
+
+define <1 x float> @constrained_vector_minnum_v1f32() {
+; NO-FMA-LABEL: constrained_vector_minnum_v1f32:
+; NO-FMA:       # %bb.0: # %entry
+; NO-FMA-NEXT:    pushq %rax
+; NO-FMA-NEXT:    .cfi_def_cfa_offset 16
+; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; NO-FMA-NEXT:    callq fminf
+; NO-FMA-NEXT:    popq %rax
+; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
+; NO-FMA-NEXT:    retq
+;
+; HAS-FMA-LABEL: constrained_vector_minnum_v1f32:
+; HAS-FMA:       # %bb.0: # %entry
+; HAS-FMA-NEXT:    pushq %rax
+; HAS-FMA-NEXT:    .cfi_def_cfa_offset 16
+; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; HAS-FMA-NEXT:    callq fminf
+; HAS-FMA-NEXT:    popq %rax
+; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
+; HAS-FMA-NEXT:    retq
+entry:
+  %min = call <1 x float> @llvm.experimental.constrained.minnum.v1f32(
+                               <1 x float> <float 42.0>, <1 x float> <float 41.0>,
+                               metadata !"round.dynamic",
+                               metadata !"fpexcept.strict")
+  ret <1 x float> %min
+}
+
+define <2 x double> @constrained_vector_minnum_v2f64() {
+; NO-FMA-LABEL: constrained_vector_minnum_v2f64:
+; NO-FMA:       # %bb.0: # %entry
+; NO-FMA-NEXT:    subq $24, %rsp
+; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
+; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; NO-FMA-NEXT:    callq fmin
+; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; NO-FMA-NEXT:    callq fmin
+; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
+; NO-FMA-NEXT:    addq $24, %rsp
+; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
+; NO-FMA-NEXT:    retq
+;
+; HAS-FMA-LABEL: constrained_vector_minnum_v2f64:
+; HAS-FMA:       # %bb.0: # %entry
+; HAS-FMA-NEXT:    subq $24, %rsp
+; HAS-FMA-NEXT:    .cfi_def_cfa_offset 32
+; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; HAS-FMA-NEXT:    callq fmin
+; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; HAS-FMA-NEXT:    callq fmin
+; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
+; HAS-FMA-NEXT:    addq $24, %rsp
+; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
+; HAS-FMA-NEXT:    retq
+entry:
+  %min = call <2 x double> @llvm.experimental.constrained.minnum.v2f64(
+                                <2 x double> <double 43.0, double 42.0>,
+                                <2 x double> <double 41.0, double 40.0>,
+                                metadata !"round.dynamic",
+                                metadata !"fpexcept.strict")
+  ret <2 x double> %min
+}
+
+define <3 x float> @constrained_vector_minnum_v3f32() {
+; NO-FMA-LABEL: constrained_vector_minnum_v3f32:
+; NO-FMA:       # %bb.0: # %entry
+; NO-FMA-NEXT:    subq $40, %rsp
+; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
+; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; NO-FMA-NEXT:    callq fminf
+; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; NO-FMA-NEXT:    callq fminf
+; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; NO-FMA-NEXT:    callq fminf
+; NO-FMA-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; NO-FMA-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
+; NO-FMA-NEXT:    movaps %xmm1, %xmm0
+; NO-FMA-NEXT:    addq $40, %rsp
+; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
+; NO-FMA-NEXT:    retq
+;
+; HAS-FMA-LABEL: constrained_vector_minnum_v3f32:
+; HAS-FMA:       # %bb.0: # %entry
+; HAS-FMA-NEXT:    subq $40, %rsp
+; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
+; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; HAS-FMA-NEXT:    callq fminf
+; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; HAS-FMA-NEXT:    callq fminf
+; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; HAS-FMA-NEXT:    callq fminf
+; HAS-FMA-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; HAS-FMA-NEXT:    vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; HAS-FMA-NEXT:    # xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; HAS-FMA-NEXT:    addq $40, %rsp
+; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
+; HAS-FMA-NEXT:    retq
+entry:
+  %min = call <3 x float> @llvm.experimental.constrained.minnum.v3f32(
+                              <3 x float> <float 43.0, float 44.0, float 45.0>,
+                              <3 x float> <float 41.0, float 42.0, float 43.0>,
+                              metadata !"round.dynamic",
+                              metadata !"fpexcept.strict")
+  ret <3 x float> %min
+}
+
+define <3 x double> @constrained_vector_min_v3f64() {entry:
+; NO-FMA-LABEL: constrained_vector_min_v3f64:
+; NO-FMA:       # %bb.0: # %entry
+; NO-FMA-NEXT:    subq $24, %rsp
+; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
+; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; NO-FMA-NEXT:    callq fmin
+; NO-FMA-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; NO-FMA-NEXT:    callq fmin
+; NO-FMA-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
+; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; NO-FMA-NEXT:    callq fmin
+; NO-FMA-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
+; NO-FMA-NEXT:    fldl {{[0-9]+}}(%rsp)
+; NO-FMA-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; NO-FMA-NEXT:    # xmm0 = mem[0],zero
+; NO-FMA-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
+; NO-FMA-NEXT:    # xmm1 = mem[0],zero
+; NO-FMA-NEXT:    addq $24, %rsp
+; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
+; NO-FMA-NEXT:    retq
+;
+; HAS-FMA-LABEL: constrained_vector_min_v3f64:
+; HAS-FMA:       # %bb.0: # %entry
+; HAS-FMA-NEXT:    subq $56, %rsp
+; HAS-FMA-NEXT:    .cfi_def_cfa_offset 64
+; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; HAS-FMA-NEXT:    callq fmin
+; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; HAS-FMA-NEXT:    callq fmin
+; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
+; HAS-FMA-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
+; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; HAS-FMA-NEXT:    vzeroupper
+; HAS-FMA-NEXT:    callq fmin
+; HAS-FMA-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
+; HAS-FMA-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; HAS-FMA-NEXT:    addq $56, %rsp
+; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
+; HAS-FMA-NEXT:    retq
+  %min = call <3 x double> @llvm.experimental.constrained.minnum.v3f64(
+                          <3 x double> <double 43.0, double 44.0, double 45.0>,
+                          <3 x double> <double 40.0, double 41.0, double 42.0>,
+                          metadata !"round.dynamic",
+                          metadata !"fpexcept.strict")
+  ret <3 x double> %min
+}
+
+define <4 x double> @constrained_vector_minnum_v4f64() {
+; NO-FMA-LABEL: constrained_vector_minnum_v4f64:
+; NO-FMA:       # %bb.0: # %entry
+; NO-FMA-NEXT:    subq $40, %rsp
+; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
+; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; NO-FMA-NEXT:    callq fmin
+; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; NO-FMA-NEXT:    callq fmin
+; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
+; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; NO-FMA-NEXT:    callq fmin
+; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; NO-FMA-NEXT:    callq fmin
+; NO-FMA-NEXT:    movaps %xmm0, %xmm1
+; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
+; NO-FMA-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; NO-FMA-NEXT:    addq $40, %rsp
+; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
+; NO-FMA-NEXT:    retq
+;
+; HAS-FMA-LABEL: constrained_vector_minnum_v4f64:
+; HAS-FMA:       # %bb.0: # %entry
+; HAS-FMA-NEXT:    subq $40, %rsp
+; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
+; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; HAS-FMA-NEXT:    callq fmin
+; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; HAS-FMA-NEXT:    callq fmin
+; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
+; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; HAS-FMA-NEXT:    callq fmin
+; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; HAS-FMA-NEXT:    callq fmin
+; HAS-FMA-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
+; HAS-FMA-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
+; HAS-FMA-NEXT:    addq $40, %rsp
+; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
+; HAS-FMA-NEXT:    retq
+entry:
+  %min = call <4 x double> @llvm.experimental.constrained.minnum.v4f64(
+                                <4 x double> <double 44.0, double 45.0,
+                                              double 46.0, double 47.0>,
+                                <4 x double> <double 40.0, double 41.0,
+                                              double 42.0, double 43.0>,
+                                metadata !"round.dynamic",
+                                metadata !"fpexcept.strict")
+  ret <4 x double> %min
+}
+
 ; Single width declarations
 declare <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double>, <2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double>, <2 x double>, metadata, metadata)
@@ -3688,6 +4197,8 @@ declare <2 x double> @llvm.experimental.constrained.log10.v2f64(<2 x double>, me
 declare <2 x double> @llvm.experimental.constrained.log2.v2f64(<2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.nearbyint.v2f64(<2 x double>, metadata, metadata)
+declare <2 x double> @llvm.experimental.constrained.maxnum.v2f64(<2 x double>, <2 x double>, metadata, metadata)
+declare <2 x double> @llvm.experimental.constrained.minnum.v2f64(<2 x double>, <2 x double>, metadata, metadata)
 
 ; Scalar width declarations
 declare <1 x float> @llvm.experimental.constrained.fadd.v1f32(<1 x float>, <1 x float>, metadata, metadata)
@@ -3708,6 +4219,8 @@ declare <1 x float> @llvm.experimental.constrained.log10.v1f32(<1 x float>, meta
 declare <1 x float> @llvm.experimental.constrained.log2.v1f32(<1 x float>, metadata, metadata)
 declare <1 x float> @llvm.experimental.constrained.rint.v1f32(<1 x float>, metadata, metadata)
 declare <1 x float> @llvm.experimental.constrained.nearbyint.v1f32(<1 x float>, metadata, metadata)
+declare <1 x float> @llvm.experimental.constrained.maxnum.v1f32(<1 x float>, <1 x float>, metadata, metadata)
+declare <1 x float> @llvm.experimental.constrained.minnum.v1f32(<1 x float>, <1 x float>, metadata, metadata)
 
 ; Illegal width declarations
 declare <3 x float> @llvm.experimental.constrained.fadd.v3f32(<3 x float>, <3 x float>, metadata, metadata)
@@ -3746,6 +4259,10 @@ declare <3 x float> @llvm.experimental.constrained.rint.v3f32(<3 x float>, metad
 declare <3 x double> @llvm.experimental.constrained.rint.v3f64(<3 x double>, metadata, metadata)
 declare <3 x float> @llvm.experimental.constrained.nearbyint.v3f32(<3 x float>, metadata, metadata)
 declare <3 x double> @llvm.experimental.constrained.nearbyint.v3f64(<3 x double>, metadata, metadata)
+declare <3 x float> @llvm.experimental.constrained.maxnum.v3f32(<3 x float>, <3 x float>, metadata, metadata)
+declare <3 x double> @llvm.experimental.constrained.maxnum.v3f64(<3 x double>, <3 x double>, metadata, metadata)
+declare <3 x float> @llvm.experimental.constrained.minnum.v3f32(<3 x float>, <3 x float>, metadata, metadata)
+declare <3 x double> @llvm.experimental.constrained.minnum.v3f64(<3 x double>, <3 x double>, metadata, metadata)
 
 ; Double width declarations
 declare <4 x double> @llvm.experimental.constrained.fadd.v4f64(<4 x double>, <4 x double>, metadata, metadata)
@@ -3767,3 +4284,5 @@ declare <4 x double> @llvm.experimental.constrained.log10.v4f64(<4 x double>, me
 declare <4 x double> @llvm.experimental.constrained.log2.v4f64(<4 x double>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.rint.v4f64(<4 x double>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.nearbyint.v4f64(<4 x double>, metadata, metadata)
+declare <4 x double> @llvm.experimental.constrained.maxnum.v4f64(<4 x double>, <4 x double>, metadata, metadata)
+declare <4 x double> @llvm.experimental.constrained.minnum.v4f64(<4 x double>, <4 x double>, metadata, metadata)
-- 
GitLab


From b5d6bd0e48699a374d12bd68e923ffe2b0620186 Mon Sep 17 00:00:00 2001
From: Konstantin Zhuravlyov <kzhuravl_dev@outlook.com>
Date: Tue, 30 Oct 2018 22:02:40 +0000
Subject: [PATCH 0772/1116] Revert r345542: AMDGPU: Enable code object v3 by
 default

It breaks mesa.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345662 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/AMDGPU.td                   | 45 ++++++----------
 test/CodeGen/AMDGPU/addrspacecast.ll          |  4 +-
 test/CodeGen/AMDGPU/amdgpu.private-memory.ll  | 14 ++---
 .../attr-amdgpu-flat-work-group-size.ll       |  2 +-
 .../AMDGPU/call-graph-register-usage.ll       |  6 +--
 .../AMDGPU/callee-special-input-sgprs.ll      |  4 +-
 .../AMDGPU/callee-special-input-vgprs.ll      |  2 +-
 test/CodeGen/AMDGPU/debugger-emit-prologue.ll |  4 +-
 test/CodeGen/AMDGPU/elf-notes.ll              | 18 +++----
 .../flat-for-global-subtarget-feature.ll      |  4 +-
 test/CodeGen/AMDGPU/flat-scratch-reg.ll       |  6 +--
 test/CodeGen/AMDGPU/gfx902-without-xnack.ll   |  2 +-
 test/CodeGen/AMDGPU/hsa-fp-mode.ll            | 14 ++---
 test/CodeGen/AMDGPU/hsa-func.ll               | 12 ++---
 .../AMDGPU/hsa-metadata-enqueue-kernel.ll     |  4 +-
 .../AMDGPU/hsa-metadata-from-llvm-ir-full.ll  | 12 ++---
 .../AMDGPU/hsa-metadata-hidden-args.ll        |  6 +--
 test/CodeGen/AMDGPU/hsa-metadata-images.ll    |  6 +--
 .../AMDGPU/hsa-metadata-kernel-code-props.ll  |  6 +--
 .../AMDGPU/hsa-metadata-kernel-debug-props.ll |  8 +--
 test/CodeGen/AMDGPU/hsa-note-no-func.ll       | 52 +++++++++----------
 test/CodeGen/AMDGPU/hsa.ll                    | 12 ++---
 test/CodeGen/AMDGPU/kernel-args.ll            |  2 +-
 .../AMDGPU/kernel-argument-dag-lowering.ll    |  2 +-
 test/CodeGen/AMDGPU/large-alloca-compute.ll   |  4 +-
 .../AMDGPU/llvm.amdgcn.dispatch.ptr.ll        |  2 +-
 .../AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll |  2 +-
 test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll  |  2 +-
 .../AMDGPU/llvm.amdgcn.workgroup.id.ll        |  8 +--
 .../CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll |  8 +--
 test/CodeGen/AMDGPU/nop-data.ll               |  2 +-
 test/CodeGen/AMDGPU/promote-alloca-no-opts.ll |  4 +-
 .../promote-alloca-padding-size-estimate.ll   |  2 +-
 ...vgpr-spill-emergency-stack-slot-compute.ll |  4 +-
 test/MC/AMDGPU/hsa-exp.s                      |  4 +-
 test/MC/AMDGPU/hsa-text.s                     |  4 +-
 test/MC/AMDGPU/hsa.s                          |  4 +-
 test/MC/AMDGPU/hsa_code_object_isa_args.s     | 12 ++---
 test/MC/AMDGPU/hsa_isa_version_attrs.s        |  4 +-
 test/MC/AMDGPU/isa-version-hsa.s              | 14 ++---
 test/MC/AMDGPU/isa-version-pal.s              | 14 ++---
 test/MC/AMDGPU/isa-version-unk.s              | 14 ++---
 test/MC/AMDGPU/sym_option.s                   | 18 +++----
 test/Object/AMDGPU/objdump.s                  |  2 +-
 44 files changed, 180 insertions(+), 195 deletions(-)

diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index edbdf01a591..54b6c8a7882 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -474,41 +474,34 @@ def FeatureISAVersion6_0_0 : SubtargetFeatureISAVersion <6,0,0,
   [FeatureSouthernIslands,
    FeatureFastFMAF32,
    HalfRate64Ops,
-   FeatureLDSBankCount32,
-   FeatureCodeObjectV3]>;
+   FeatureLDSBankCount32]>;
 
 def FeatureISAVersion6_0_1 : SubtargetFeatureISAVersion <6,0,1,
   [FeatureSouthernIslands,
-   FeatureLDSBankCount32,
-   FeatureCodeObjectV3]>;
+   FeatureLDSBankCount32]>;
 
 def FeatureISAVersion7_0_0 : SubtargetFeatureISAVersion <7,0,0,
   [FeatureSeaIslands,
-   FeatureLDSBankCount32,
-   FeatureCodeObjectV3]>;
+   FeatureLDSBankCount32]>;
 
 def FeatureISAVersion7_0_1 : SubtargetFeatureISAVersion <7,0,1,
   [FeatureSeaIslands,
    HalfRate64Ops,
    FeatureLDSBankCount32,
-   FeatureFastFMAF32,
-   FeatureCodeObjectV3]>;
+   FeatureFastFMAF32]>;
 
 def FeatureISAVersion7_0_2 : SubtargetFeatureISAVersion <7,0,2,
   [FeatureSeaIslands,
    FeatureLDSBankCount16,
-   FeatureFastFMAF32,
-   FeatureCodeObjectV3]>;
+   FeatureFastFMAF32]>;
 
 def FeatureISAVersion7_0_3 : SubtargetFeatureISAVersion <7,0,3,
   [FeatureSeaIslands,
-   FeatureLDSBankCount16,
-   FeatureCodeObjectV3]>;
+   FeatureLDSBankCount16]>;
 
 def FeatureISAVersion7_0_4 : SubtargetFeatureISAVersion <7,0,4,
   [FeatureSeaIslands,
-   FeatureLDSBankCount32,
-   FeatureCodeObjectV3]>;
+   FeatureLDSBankCount32]>;
 
 def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1,
   [FeatureVolcanicIslands,
@@ -516,57 +509,49 @@ def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1,
    HalfRate64Ops,
    FeatureLDSBankCount32,
    FeatureXNACK,
-   FeatureUnpackedD16VMem,
-   FeatureCodeObjectV3]>;
+   FeatureUnpackedD16VMem]>;
 
 def FeatureISAVersion8_0_2 : SubtargetFeatureISAVersion <8,0,2,
   [FeatureVolcanicIslands,
    FeatureLDSBankCount32,
    FeatureSGPRInitBug,
-   FeatureUnpackedD16VMem,
-   FeatureCodeObjectV3]>;
+   FeatureUnpackedD16VMem]>;
 
 def FeatureISAVersion8_0_3 : SubtargetFeatureISAVersion <8,0,3,
   [FeatureVolcanicIslands,
    FeatureLDSBankCount32,
-   FeatureUnpackedD16VMem,
-   FeatureCodeObjectV3]>;
+   FeatureUnpackedD16VMem]>;
 
 def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0,
   [FeatureVolcanicIslands,
    FeatureLDSBankCount16,
-   FeatureXNACK,
-   FeatureCodeObjectV3]>;
+   FeatureXNACK]>;
 
 def FeatureISAVersion9_0_0 : SubtargetFeatureISAVersion <9,0,0,
   [FeatureGFX9,
    FeatureMadMixInsts,
    FeatureLDSBankCount32,
-   FeatureD16PreservesUnusedBits,
-   FeatureCodeObjectV3]>;
+   FeatureD16PreservesUnusedBits]>;
 
 def FeatureISAVersion9_0_2 : SubtargetFeatureISAVersion <9,0,2,
   [FeatureGFX9,
    FeatureMadMixInsts,
    FeatureLDSBankCount32,
    FeatureXNACK,
-   FeatureD16PreservesUnusedBits,
-   FeatureCodeObjectV3]>;
+   FeatureD16PreservesUnusedBits]>;
 
 def FeatureISAVersion9_0_4 : SubtargetFeatureISAVersion <9,0,4,
   [FeatureGFX9,
    FeatureLDSBankCount32,
    FeatureFmaMixInsts,
-   FeatureD16PreservesUnusedBits,
-   FeatureCodeObjectV3]>;
+   FeatureD16PreservesUnusedBits]>;
 
 def FeatureISAVersion9_0_6 : SubtargetFeatureISAVersion <9,0,6,
   [FeatureGFX9,
    HalfRate64Ops,
    FeatureFmaMixInsts,
    FeatureLDSBankCount32,
-   FeatureDLInsts,
-   FeatureCodeObjectV3]>;
+   FeatureDLInsts]>;
 
 def FeatureISAVersion9_0_9 : SubtargetFeatureISAVersion <9,0,9,
   [FeatureGFX9,
diff --git a/test/CodeGen/AMDGPU/addrspacecast.ll b/test/CodeGen/AMDGPU/addrspacecast.ll
index ea40cda4fa6..95bbe958e93 100644
--- a/test/CodeGen/AMDGPU/addrspacecast.ll
+++ b/test/CodeGen/AMDGPU/addrspacecast.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-code-object-v3,-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=CI %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3,-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=GFX9 %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=CI %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=GFX9 %s
 
 ; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast:
 ; HSA: enable_sgpr_private_segment_buffer = 1
diff --git a/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
index 199a96c6443..023c19915c7 100644
--- a/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
+++ b/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
@@ -1,10 +1,10 @@
-; RUN: llc -show-mc-encoding -mattr=-code-object-v3,+promote-alloca -disable-promote-alloca-to-vector -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -show-mc-encoding -mattr=-code-object-v3,+promote-alloca -disable-promote-alloca-to-vector -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3,-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-PROMOTE %s
-; RUN: llc -show-mc-encoding -mattr=-code-object-v3,-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
-; RUN: llc -show-mc-encoding -mattr=-code-object-v3,-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri -mattr=-code-object-v3,-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-ALLOCA %s
-; RUN: llc -show-mc-encoding -mattr=-code-object-v3,+promote-alloca -disable-promote-alloca-to-vector -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-code-object-v3,-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -show-mc-encoding -mattr=-code-object-v3,+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-code-object-v3,-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE-VECT -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -show-mc-encoding -mattr=-code-object-v3,-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-code-object-v3,-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -show-mc-encoding -mattr=+promote-alloca -disable-promote-alloca-to-vector -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -show-mc-encoding -mattr=+promote-alloca -disable-promote-alloca-to-vector -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-PROMOTE %s
+; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-ALLOCA %s
+; RUN: llc -show-mc-encoding -mattr=+promote-alloca -disable-promote-alloca-to-vector -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE-VECT -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s
 
 ; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -data-layout=A5 -mcpu=kaveri -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck -enable-var-scope -check-prefix=HSAOPT -check-prefix=OPT %s
 ; RUN: opt -S -mtriple=amdgcn-unknown-unknown -data-layout=A5 -mcpu=kaveri -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck -enable-var-scope -check-prefix=NOHSAOPT -check-prefix=OPT %s
diff --git a/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll b/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
index 2f281cab48c..7fe5604c3ec 100644
--- a/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
+++ b/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=CHECK %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-code-object-v3 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=HSAMD %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=HSAMD %s
 
 ; CHECK-LABEL: {{^}}min_64_max_64:
 ; CHECK: SGPRBlocks: 0
diff --git a/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/test/CodeGen/AMDGPU/call-graph-register-usage.ll
index c4c30a66755..21c69d9bee7 100644
--- a/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ b/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-code-object-v3 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CI %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=fiji -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,VI-NOBUG %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=iceland -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,VI-BUG %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,VI-NOBUG %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=iceland -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,VI-BUG %s
 
 ; Make sure to run a GPU with the SGPR allocation bug.
 
diff --git a/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll b/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
index e5c18062708..79abb96cccf 100644
--- a/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
+++ b/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-code-object-v3 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,CIVI %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,CIVI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,GFX9 %s
 
 ; GCN-LABEL: {{^}}use_dispatch_ptr:
 ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6
diff --git a/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
index 750a0203c9b..7f14a24d6da 100644
--- a/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
+++ b/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-code-object-v3 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}use_workitem_id_x:
 ; GCN: s_waitcnt
diff --git a/test/CodeGen/AMDGPU/debugger-emit-prologue.ll b/test/CodeGen/AMDGPU/debugger-emit-prologue.ll
index b416537b9f8..46d81e57065 100644
--- a/test/CodeGen/AMDGPU/debugger-emit-prologue.ll
+++ b/test/CodeGen/AMDGPU/debugger-emit-prologue.ll
@@ -1,5 +1,5 @@
-; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-code-object-v3,+amdgpu-debugger-emit-prologue -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-code-object-v3 -verify-machineinstrs < %s | FileCheck %s --check-prefix=NOATTR
+; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+amdgpu-debugger-emit-prologue -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s --check-prefix=NOATTR
 target datalayout = "A5"
 
 ; CHECK: debug_wavefront_private_segment_offset_sgpr = [[SOFF:[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/elf-notes.ll b/test/CodeGen/AMDGPU/elf-notes.ll
index 43e569de65c..b81292bfdb9 100644
--- a/test/CodeGen/AMDGPU/elf-notes.ll
+++ b/test/CodeGen/AMDGPU/elf-notes.ll
@@ -1,12 +1,12 @@
-; RUN: llc -mtriple=amdgcn-amd-unknown -mcpu=gfx802 -mattr=-code-object-v3 < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK --check-prefix=GFX802 %s
-; RUN: llc -mtriple=amdgcn-amd-unknown -mcpu=iceland -mattr=-code-object-v3 < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK --check-prefix=GFX802 %s
-; RUN: llc -mtriple=amdgcn-amd-unknown -mcpu=gfx802 -filetype=obj -mattr=-code-object-v3 < %s | llvm-readobj -elf-output-style=GNU -notes  | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ELF --check-prefix=GFX802 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -mattr=-code-object-v3 < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA --check-prefix=GFX802 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=iceland -mattr=-code-object-v3 < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA --check-prefix=GFX802 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -filetype=obj -mattr=-code-object-v3 < %s | llvm-readobj -elf-output-style=GNU -notes  | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA-ELF --check-prefix=GFX802 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx802 -mattr=-code-object-v3 < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL --check-prefix=GFX802 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=iceland -mattr=-code-object-v3 < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL --check-prefix=GFX802 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx802 -filetype=obj -mattr=-code-object-v3 < %s | llvm-readobj -elf-output-style=GNU -notes  | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL-ELF --check-prefix=GFX802 %s
+; RUN: llc -mtriple=amdgcn-amd-unknown -mcpu=gfx802 < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK --check-prefix=GFX802 %s
+; RUN: llc -mtriple=amdgcn-amd-unknown -mcpu=iceland < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK --check-prefix=GFX802 %s
+; RUN: llc -mtriple=amdgcn-amd-unknown -mcpu=gfx802 -filetype=obj < %s | llvm-readobj -elf-output-style=GNU -notes  | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ELF --check-prefix=GFX802 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA --check-prefix=GFX802 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=iceland < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA --check-prefix=GFX802 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -filetype=obj < %s | llvm-readobj -elf-output-style=GNU -notes  | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA-ELF --check-prefix=GFX802 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx802 < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL --check-prefix=GFX802 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=iceland < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL --check-prefix=GFX802 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx802 -filetype=obj < %s | llvm-readobj -elf-output-style=GNU -notes  | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL-ELF --check-prefix=GFX802 %s
 ; RUN: llc -march=r600 < %s | FileCheck --check-prefix=R600 %s
 
 ; OSABI-UNK-NOT: .hsa_code_object_version
diff --git a/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll b/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
index 17f557b3a6c..b2ac534a7d6 100644
--- a/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
+++ b/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3,+flat-for-global < %s | FileCheck -check-prefix=HSA -check-prefix=HSA-DEFAULT -check-prefix=ALL %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3,-flat-for-global < %s | FileCheck -check-prefix=HSA -check-prefix=HSA-NODEFAULT -check-prefix=ALL %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=+flat-for-global < %s | FileCheck -check-prefix=HSA -check-prefix=HSA-DEFAULT -check-prefix=ALL %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global < %s | FileCheck -check-prefix=HSA -check-prefix=HSA-NODEFAULT -check-prefix=ALL %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=tonga < %s | FileCheck -check-prefix=HSA-NOADDR64 -check-prefix=ALL %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=kaveri -mattr=-flat-for-global < %s | FileCheck -check-prefix=NOHSA-DEFAULT -check-prefix=ALL %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=kaveri -mattr=+flat-for-global < %s | FileCheck -check-prefix=NOHSA-NODEFAULT -check-prefix=ALL %s
diff --git a/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/test/CodeGen/AMDGPU/flat-scratch-reg.ll
index 38909d3e3e9..a7664c399fb 100644
--- a/test/CodeGen/AMDGPU/flat-scratch-reg.ll
+++ b/test/CodeGen/AMDGPU/flat-scratch-reg.ll
@@ -7,9 +7,9 @@
 ; RUN: llc -march=amdgcn -mcpu=carrizo -verify-machineinstrs < %s | FileCheck -check-prefix=VI-XNACK  -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -mcpu=stoney  -verify-machineinstrs < %s | FileCheck -check-prefix=VI-XNACK  -check-prefix=GCN %s
 
-; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-CI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-code-object-v3,-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-VI-NOXNACK -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-code-object-v3,+xnack -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-VI-XNACK -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-CI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-VI-NOXNACK -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=+xnack -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-VI-XNACK -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}no_vcc_no_flat:
 ; HSA-CI: is_xnack_enabled = 0
diff --git a/test/CodeGen/AMDGPU/gfx902-without-xnack.ll b/test/CodeGen/AMDGPU/gfx902-without-xnack.ll
index 8577382cff5..445e112a301 100644
--- a/test/CodeGen/AMDGPU/gfx902-without-xnack.ll
+++ b/test/CodeGen/AMDGPU/gfx902-without-xnack.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx902 -mattr=-code-object-v3,-xnack < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx902 -mattr=-xnack < %s | FileCheck %s
 
 ; CHECK: .hsa_code_object_isa 9,0,2,"AMD","AMDGPU"
 define amdgpu_kernel void @test_kernel(float addrspace(1)* %out0, double addrspace(1)* %out1) nounwind {
diff --git a/test/CodeGen/AMDGPU/hsa-fp-mode.ll b/test/CodeGen/AMDGPU/hsa-fp-mode.ll
index a454fa02579..b1901cf894b 100644
--- a/test/CodeGen/AMDGPU/hsa-fp-mode.ll
+++ b/test/CodeGen/AMDGPU/hsa-fp-mode.ll
@@ -70,10 +70,10 @@ define amdgpu_kernel void @test_no_dx10_clamp_vi(float addrspace(1)* %out0, doub
   ret void
 }
 
-attributes #0 = { nounwind "target-cpu"="kaveri" "target-features"="-code-object-v3" }
-attributes #1 = { nounwind "target-cpu"="fiji" "target-features"="-code-object-v3" }
-attributes #2 = { nounwind "target-features"="-code-object-v3,-fp32-denormals,+fp64-fp16-denormals" }
-attributes #3 = { nounwind "target-features"="-code-object-v3,+fp32-denormals,-fp64-fp16-denormals" }
-attributes #4 = { nounwind "target-features"="-code-object-v3,+fp32-denormals,+fp64-fp16-denormals" }
-attributes #5 = { nounwind "target-features"="-code-object-v3,-fp32-denormals,-fp64-fp16-denormals" }
-attributes #6 = { nounwind "target-cpu"="fiji" "target-features"="-code-object-v3,-dx10-clamp" }
+attributes #0 = { nounwind "target-cpu"="kaveri" }
+attributes #1 = { nounwind "target-cpu"="fiji" }
+attributes #2 = { nounwind "target-features"="-fp32-denormals,+fp64-fp16-denormals" }
+attributes #3 = { nounwind "target-features"="+fp32-denormals,-fp64-fp16-denormals" }
+attributes #4 = { nounwind "target-features"="+fp32-denormals,+fp64-fp16-denormals" }
+attributes #5 = { nounwind "target-features"="-fp32-denormals,-fp64-fp16-denormals" }
+attributes #6 = { nounwind "target-cpu"="fiji" "target-features"="-dx10-clamp" }
diff --git a/test/CodeGen/AMDGPU/hsa-func.ll b/test/CodeGen/AMDGPU/hsa-func.ll
index 76a17215b7f..d117cf59ee1 100644
--- a/test/CodeGen/AMDGPU/hsa-func.ll
+++ b/test/CodeGen/AMDGPU/hsa-func.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=kaveri | FileCheck --check-prefix=HSA %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=kaveri | FileCheck --check-prefix=HSA-CI %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=carrizo  | FileCheck --check-prefix=HSA %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=carrizo | FileCheck --check-prefix=HSA-VI %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=kaveri -filetype=obj | llvm-readobj -symbols -s -sd | FileCheck --check-prefix=ELF %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=kaveri | llvm-readobj -symbols -s -sd | FileCheck %s --check-prefix=ELF
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA-CI %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo  | FileCheck --check-prefix=HSA %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo | FileCheck --check-prefix=HSA-VI %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -filetype=obj | llvm-readobj -symbols -s -sd | FileCheck --check-prefix=ELF %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri | llvm-readobj -symbols -s -sd | FileCheck %s --check-prefix=ELF
 
 ; The SHT_NOTE section contains the output from the .hsa_code_object_*
 ; directives.
diff --git a/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel.ll b/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel.ll
index f6e3d94b4dc..77624aaafad 100644
--- a/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel.ll
+++ b/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=gfx900 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
 
 ; CHECK: ---
 ; CHECK:  Version: [ 1, 0 ]
diff --git a/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll b/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll
index 4dce2bf832e..485e02da7d9 100644
--- a/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll
+++ b/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll
@@ -1,9 +1,9 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -mattr=-code-object-v3 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -mattr=-code-object-v3 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX802 --check-prefix=NOTES %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -mattr=-code-object-v3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -mattr=-code-object-v3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX802 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
 
 %struct.A = type { i8, float }
 %opencl.image1d_t = type opaque
diff --git a/test/CodeGen/AMDGPU/hsa-metadata-hidden-args.ll b/test/CodeGen/AMDGPU/hsa-metadata-hidden-args.ll
index 6dbc1e2523d..ed2e79684c6 100644
--- a/test/CodeGen/AMDGPU/hsa-metadata-hidden-args.ll
+++ b/test/CodeGen/AMDGPU/hsa-metadata-hidden-args.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -mattr=-code-object-v3 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-code-object-v3 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=NOTES %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
 
 ; CHECK: ---
 ; CHECK:  Version: [ 1, 0 ]
diff --git a/test/CodeGen/AMDGPU/hsa-metadata-images.ll b/test/CodeGen/AMDGPU/hsa-metadata-images.ll
index fd015998429..00dee3b6c69 100644
--- a/test/CodeGen/AMDGPU/hsa-metadata-images.ll
+++ b/test/CodeGen/AMDGPU/hsa-metadata-images.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -mattr=-code-object-v3 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -mattr=-code-object-v3 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX802 --check-prefix=NOTES %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX802 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
 
 %opencl.image1d_t = type opaque
 %opencl.image1d_array_t = type opaque
diff --git a/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
index b5b6aa450bf..3dc3f320db8 100644
--- a/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
+++ b/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -mattr=-code-object-v3 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-code-object-v3 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=NOTES %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
 
 @var = addrspace(1) global float 0.0
 
diff --git a/test/CodeGen/AMDGPU/hsa-metadata-kernel-debug-props.ll b/test/CodeGen/AMDGPU/hsa-metadata-kernel-debug-props.ll
index 7eacdc1cdab..fab086e6cb1 100644
--- a/test/CodeGen/AMDGPU/hsa-metadata-kernel-debug-props.ll
+++ b/test/CodeGen/AMDGPU/hsa-metadata-kernel-debug-props.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -mattr=-code-object-v3,+amdgpu-debugger-emit-prologue,+amdgpu-debugger-insert-nops -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -mattr=-code-object-v3,+amdgpu-debugger-emit-prologue,+amdgpu-debugger-insert-nops -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX802 --check-prefix=NOTES %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3,+amdgpu-debugger-emit-prologue,+amdgpu-debugger-insert-nops -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX802 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
 target datalayout = "A5"
 
 declare void @llvm.dbg.declare(metadata, metadata, metadata)
@@ -32,7 +32,7 @@ entry:
   ret void, !dbg !25
 }
 
-attributes #0 = { noinline nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="gfx800" "target-features"="+16-bit-insts,-code-object-v3,+amdgpu-debugger-emit-prologue,+amdgpu-debugger-insert-nops,+amdgpu-debugger-reserve-regs,+dpp,+fp64-fp16-denormals,+s-memrealtime,-fp32-denormals" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="gfx800" "target-features"="+16-bit-insts,+amdgpu-debugger-emit-prologue,+amdgpu-debugger-insert-nops,+amdgpu-debugger-reserve-regs,+dpp,+fp64-fp16-denormals,+s-memrealtime,-fp32-denormals" "unsafe-fp-math"="false" "use-soft-float"="false" }
 
 !llvm.dbg.cu = !{!0}
 !opencl.ocl.version = !{!3}
diff --git a/test/CodeGen/AMDGPU/hsa-note-no-func.ll b/test/CodeGen/AMDGPU/hsa-note-no-func.ll
index 39026e8c7bd..e937aaca66f 100644
--- a/test/CodeGen/AMDGPU/hsa-note-no-func.ll
+++ b/test/CodeGen/AMDGPU/hsa-note-no-func.ll
@@ -1,29 +1,29 @@
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=gfx600 | FileCheck --check-prefix=HSA --check-prefix=HSA-SI600 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=gfx601 | FileCheck --check-prefix=HSA --check-prefix=HSA-SI601 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=gfx700 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI700 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=gfx701 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI701 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=gfx702 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI702 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=gfx703 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI703 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=gfx704 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI704 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=bonaire | FileCheck --check-prefix=HSA --check-prefix=HSA-CI704 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=mullins | FileCheck --check-prefix=HSA --check-prefix=HSA-CI703 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=hawaii | FileCheck --check-prefix=HSA --check-prefix=HSA-CI701 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=kabini | FileCheck --check-prefix=HSA --check-prefix=HSA-CI703 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=kaveri | FileCheck --check-prefix=HSA --check-prefix=HSA-CI700 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=carrizo -mattr=-flat-for-global | FileCheck --check-prefix=HSA --check-prefix=HSA-VI801 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=tonga -mattr=-flat-for-global | FileCheck --check-prefix=HSA --check-prefix=HSA-VI802 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=fiji -mattr=-flat-for-global | FileCheck --check-prefix=HSA --check-prefix=HSA-VI803 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=polaris10 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI803 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=polaris11 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI803 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=gfx801 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI801 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=gfx802 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI802 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=gfx803 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI803 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=gfx810 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI810 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=gfx900 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX900 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=gfx902 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX902 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=gfx904 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX904 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=gfx906 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX906 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=gfx909 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX909 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx600 | FileCheck --check-prefix=HSA --check-prefix=HSA-SI600 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx601 | FileCheck --check-prefix=HSA --check-prefix=HSA-SI601 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx700 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI700 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx701 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI701 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx702 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI702 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx703 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI703 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx704 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI704 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=bonaire | FileCheck --check-prefix=HSA --check-prefix=HSA-CI704 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=mullins | FileCheck --check-prefix=HSA --check-prefix=HSA-CI703 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=hawaii | FileCheck --check-prefix=HSA --check-prefix=HSA-CI701 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kabini | FileCheck --check-prefix=HSA --check-prefix=HSA-CI703 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA --check-prefix=HSA-CI700 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-flat-for-global | FileCheck --check-prefix=HSA --check-prefix=HSA-VI801 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=tonga -mattr=-flat-for-global | FileCheck --check-prefix=HSA --check-prefix=HSA-VI802 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global | FileCheck --check-prefix=HSA --check-prefix=HSA-VI803 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=polaris10 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI803 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=polaris11 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI803 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx801 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI801 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx802 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI802 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx803 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI803 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx810 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI810 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX900 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx902 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX902 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx904 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX904 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx906 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX906 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx909 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX909 %s
 
 ; HSA: .hsa_code_object_version 2,1
 ; HSA-SI600: .hsa_code_object_isa 6,0,0,"AMD","AMDGPU"
diff --git a/test/CodeGen/AMDGPU/hsa.ll b/test/CodeGen/AMDGPU/hsa.ll
index e23b2d922a3..0b19fbe7d70 100644
--- a/test/CodeGen/AMDGPU/hsa.ll
+++ b/test/CodeGen/AMDGPU/hsa.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 | FileCheck --check-prefix=HSA %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3,-flat-for-global | FileCheck --check-prefix=HSA-CI %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-code-object-v3 | FileCheck --check-prefix=HSA %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-code-object-v3,-flat-for-global | FileCheck --check-prefix=HSA-VI %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -filetype=obj -mattr=-code-object-v3 | llvm-readobj -symbols -s -sd | FileCheck --check-prefix=ELF %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 | llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 | llvm-readobj -symbols -s -sd | FileCheck %s --check-prefix=ELF
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global | FileCheck --check-prefix=HSA-CI %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo  | FileCheck --check-prefix=HSA %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-flat-for-global | FileCheck --check-prefix=HSA-VI %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -filetype=obj | llvm-readobj -symbols -s -sd | FileCheck --check-prefix=ELF %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri | llvm-readobj -symbols -s -sd | FileCheck %s --check-prefix=ELF
 
 ; The SHT_NOTE section contains the output from the .hsa_code_object_*
 ; directives.
diff --git a/test/CodeGen/AMDGPU/kernel-args.ll b/test/CodeGen/AMDGPU/kernel-args.ll
index 64a5fbdf00a..11067522f85 100644
--- a/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/test/CodeGen/AMDGPU/kernel-args.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=SI,GCN,MESA-GCN,FUNC %s
 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=VI,GCN,MESA-VI,MESA-GCN,FUNC %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-code-object-v3 -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC %s
 ; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=EG,EGCM,FUNC %s
 ; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=CM,EGCM,FUNC %s
 
diff --git a/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll b/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
index b7344cfb33c..a1bb6c28e74 100644
--- a/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
+++ b/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -amdgpu-ir-lower-kernel-arguments=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-ir-lower-kernel-arguments=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC %s
 
 ; Repeat of some problematic tests in kernel-args.ll, with the IR
 ; argument lowering pass disabled. Struct padding needs to be
diff --git a/test/CodeGen/AMDGPU/large-alloca-compute.ll b/test/CodeGen/AMDGPU/large-alloca-compute.ll
index 0343052601f..d8cf52341e3 100644
--- a/test/CodeGen/AMDGPU/large-alloca-compute.ll
+++ b/test/CodeGen/AMDGPU/large-alloca-compute.ll
@@ -1,8 +1,8 @@
 ; RUN: llc -march=amdgcn -mcpu=bonaire -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s
 ; RUN: llc -march=amdgcn -mcpu=carrizo --show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 --show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=ALL %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa -mattr=-code-object-v3 < %s -mattr=-flat-for-global | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s
-; RUN: llc -march=amdgcn -mcpu=carrizo -mtriple=amdgcn-unknown-amdhsa -mattr=-code-object-v3,-flat-for-global < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa < %s -mattr=-flat-for-global | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s
+; RUN: llc -march=amdgcn -mcpu=carrizo -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s
 
 ; FIXME: align on alloca seems to be ignored for private_segment_alignment
 
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
index a2f2ced72e7..b6f9f951d9b 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 ; RUN: not llc -mtriple=amdgcn-unknown-unknown -mcpu=kaveri -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR %s
 
 ; ERROR: in function test{{.*}}: unsupported hsa intrinsic without hsa target
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
index ee039a392e2..5853d8d8e4e 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=CO-V2,HSA,ALL %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=CO-V2,HSA,ALL %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -verify-machineinstrs < %s | FileCheck -check-prefixes=CO-V2,OS-MESA3D,MESA,ALL %s
 ; RUN: llc -mtriple=amdgcn-mesa-unknown -verify-machineinstrs < %s | FileCheck -check-prefixes=OS-UNKNOWN,MESA,ALL %s
 
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll
index 6866d9537b3..f8c60451ac7 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 ; RUN: not llc -mtriple=amdgcn-unknown-unknown -mcpu=kaveri -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR %s
 
 ; ERROR: in function test{{.*}}: unsupported hsa intrinsic without hsa target
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll
index 377785e0ca2..349e7f0f0e8 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll
@@ -1,9 +1,9 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=-code-object-v3 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=CO-V2 -check-prefix=CI-HSA  %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=-code-object-v3 -mcpu=carrizo -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=CO-V2 -check-prefix=VI-HSA  %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=CO-V2 -check-prefix=CI-HSA  %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=carrizo -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=CO-V2 -check-prefix=VI-HSA  %s
 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=UNKNOWN-OS -check-prefix=SI-MESA %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=UNKNOWN-OS -check-prefix=VI-MESA %s
-; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mattr=-code-object-v3 -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,SI-MESA %s
-; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mattr=-code-object-v3 -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,VI-MESA %s
+; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,SI-MESA %s
+; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,VI-MESA %s
 
 declare i32 @llvm.amdgcn.workgroup.id.x() #0
 declare i32 @llvm.amdgcn.workgroup.id.y() #0
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
index 13e204b03a0..8b80998cab6 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
@@ -1,9 +1,9 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=-code-object-v3 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=CO-V2 -check-prefix=CI-HSA  %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=-code-object-v3 -mcpu=carrizo -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=CO-V2 -check-prefix=VI-HSA  %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=CO-V2 -check-prefix=CI-HSA  %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=carrizo -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=CO-V2 -check-prefix=VI-HSA  %s
 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=MESA -check-prefix=SI-MESA %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=MESA -check-prefix=VI-MESA %s
-; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mattr=-code-object-v3 -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,SI-MESA %s
-; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mattr=-code-object-v3 -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,VI-MESA %s
+; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,SI-MESA %s
+; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,VI-MESA %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 declare i32 @llvm.amdgcn.workitem.id.y() #0
diff --git a/test/CodeGen/AMDGPU/nop-data.ll b/test/CodeGen/AMDGPU/nop-data.ll
index 4e836a398ee..790e31c781a 100644
--- a/test/CodeGen/AMDGPU/nop-data.ll
+++ b/test/CodeGen/AMDGPU/nop-data.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=fiji -filetype=obj < %s | llvm-objdump -d - -mcpu=fiji | FileCheck %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -filetype=obj < %s | llvm-objdump -d - -mcpu=fiji | FileCheck %s
 
 ; CHECK: kernel0:
 ; CHECK-NEXT: s_endpgm
diff --git a/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll b/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll
index d7e38a602ff..6a41c3ad2e8 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll
@@ -1,5 +1,5 @@
-; RUN: llc -O0 -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -mattr=-code-object-v3,+promote-alloca < %s | FileCheck -check-prefix=NOOPTS -check-prefix=ALL %s
-; RUN: llc -O1 -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -mattr=-code-object-v3,+promote-alloca < %s | FileCheck -check-prefix=OPTS -check-prefix=ALL %s
+; RUN: llc -O0 -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -mattr=+promote-alloca < %s | FileCheck -check-prefix=NOOPTS -check-prefix=ALL %s
+; RUN: llc -O1 -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -mattr=+promote-alloca < %s | FileCheck -check-prefix=OPTS -check-prefix=ALL %s
 
 ; ALL-LABEL: {{^}}promote_alloca_i32_array_array:
 ; NOOPTS: workgroup_group_segment_byte_size = 0{{$}}
diff --git a/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll b/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
index 83a608ad5f3..e8dcb50a3c1 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-code-object-v3 -disable-promote-alloca-to-vector < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -disable-promote-alloca-to-vector < %s | FileCheck -check-prefix=GCN %s
 
 ; This shows that the amount of LDS estimate is sensitive to the order
 ; of the LDS globals.
diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
index e13199d68bc..9cdc333cbc0 100644
--- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
+++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
@@ -1,8 +1,8 @@
 ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tahiti -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=SIMESA %s
 ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=fiji -mattr=+vgpr-spilling,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=VIMESA %s
 ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=+vgpr-spilling,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=GFX9MESA %s
-; RUN: llc -march=amdgcn  -mcpu=hawaii -mtriple=amdgcn-unknown-amdhsa -mattr=-code-object-v3,+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CIHSA -check-prefix=HSA %s
-; RUN: llc -march=amdgcn  -mcpu=fiji -mtriple=amdgcn-unknown-amdhsa -mattr=-code-object-v3 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VIHSA -check-prefix=HSA %s
+; RUN: llc -march=amdgcn  -mcpu=hawaii -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CIHSA -check-prefix=HSA %s
+; RUN: llc -march=amdgcn  -mcpu=fiji -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VIHSA -check-prefix=HSA %s
 
 ; This ends up using all 256 registers and requires register
 ; scavenging which will fail to find an unsued register.
diff --git a/test/MC/AMDGPU/hsa-exp.s b/test/MC/AMDGPU/hsa-exp.s
index 8900a0638c9..b13755a19cc 100644
--- a/test/MC/AMDGPU/hsa-exp.s
+++ b/test/MC/AMDGPU/hsa-exp.s
@@ -1,5 +1,5 @@
-// RUN: llvm-mc -triple amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -show-encoding %s | FileCheck %s --check-prefix=ASM
-// RUN: llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -show-encoding %s | llvm-readobj -symbols -s -sd | FileCheck %s --check-prefix=ELF
+// RUN: llvm-mc -triple amdgcn--amdhsa -mcpu=kaveri -show-encoding %s | FileCheck %s --check-prefix=ASM
+// RUN: llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri -show-encoding %s | llvm-readobj -symbols -s -sd | FileCheck %s --check-prefix=ELF
 
 // ELF: Section {
 // ELF: Name: .text
diff --git a/test/MC/AMDGPU/hsa-text.s b/test/MC/AMDGPU/hsa-text.s
index f4463fc5936..afe696af0a2 100644
--- a/test/MC/AMDGPU/hsa-text.s
+++ b/test/MC/AMDGPU/hsa-text.s
@@ -1,5 +1,5 @@
-// RUN: llvm-mc -triple amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -show-encoding %s | FileCheck %s --check-prefix=ASM
-// RUN: llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -show-encoding %s | llvm-readobj -s -sd | FileCheck %s --check-prefix=ELF
+// RUN: llvm-mc -triple amdgcn--amdhsa -mcpu=kaveri -show-encoding %s | FileCheck %s --check-prefix=ASM
+// RUN: llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri -show-encoding %s | llvm-readobj -s -sd | FileCheck %s --check-prefix=ELF
 
 // For compatibility reasons we treat convert .text sections to .hsatext
 
diff --git a/test/MC/AMDGPU/hsa.s b/test/MC/AMDGPU/hsa.s
index 0521c10e1a8..5ebc0a60e0f 100644
--- a/test/MC/AMDGPU/hsa.s
+++ b/test/MC/AMDGPU/hsa.s
@@ -1,5 +1,5 @@
-// RUN: llvm-mc -triple amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -show-encoding %s | FileCheck %s --check-prefix=ASM
-// RUN: llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -show-encoding %s | llvm-readobj -symbols -s -sd | FileCheck %s --check-prefix=ELF
+// RUN: llvm-mc -triple amdgcn--amdhsa -mcpu=kaveri -show-encoding %s | FileCheck %s --check-prefix=ASM
+// RUN: llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri -show-encoding %s | llvm-readobj -symbols -s -sd | FileCheck %s --check-prefix=ELF
 
 // ELF: Section {
 // ELF: Name: .text
diff --git a/test/MC/AMDGPU/hsa_code_object_isa_args.s b/test/MC/AMDGPU/hsa_code_object_isa_args.s
index 950f32cd19a..1c47c83e3e9 100644
--- a/test/MC/AMDGPU/hsa_code_object_isa_args.s
+++ b/test/MC/AMDGPU/hsa_code_object_isa_args.s
@@ -1,9 +1,9 @@
-// RUN: llvm-mc -triple amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -show-encoding %s | FileCheck %s --check-prefix=ASM --check-prefix=ASM_700
-// RUN: llvm-mc -triple amdgcn--amdhsa -mcpu=gfx803 -mattr=-code-object-v3 -show-encoding %s | FileCheck %s --check-prefix=ASM --check-prefix=ASM_803
-// RUN: llvm-mc -triple amdgcn--amdhsa -mcpu=stoney -mattr=-code-object-v3 -show-encoding %s | FileCheck %s --check-prefix=ASM --check-prefix=ASM_810
-// RUN: llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -show-encoding %s | llvm-readobj -s -sd | FileCheck %s --check-prefix=ELF --check-prefix=ELF_700
-// RUN: llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=gfx803 -mattr=-code-object-v3 -show-encoding %s | llvm-readobj -s -sd | FileCheck %s --check-prefix=ELF --check-prefix=ELF_803
-// RUN: llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=stoney -mattr=-code-object-v3 -show-encoding %s | llvm-readobj -s -sd | FileCheck %s --check-prefix=ELF --check-prefix=ELF_810
+// RUN: llvm-mc -triple amdgcn--amdhsa -mcpu=kaveri -show-encoding %s | FileCheck %s --check-prefix=ASM --check-prefix=ASM_700
+// RUN: llvm-mc -triple amdgcn--amdhsa -mcpu=gfx803 -show-encoding %s | FileCheck %s --check-prefix=ASM --check-prefix=ASM_803
+// RUN: llvm-mc -triple amdgcn--amdhsa -mcpu=stoney -show-encoding %s | FileCheck %s --check-prefix=ASM --check-prefix=ASM_810
+// RUN: llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri -show-encoding %s | llvm-readobj -s -sd | FileCheck %s --check-prefix=ELF --check-prefix=ELF_700
+// RUN: llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=gfx803 -show-encoding %s | llvm-readobj -s -sd | FileCheck %s --check-prefix=ELF --check-prefix=ELF_803
+// RUN: llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=stoney -show-encoding %s | llvm-readobj -s -sd | FileCheck %s --check-prefix=ELF --check-prefix=ELF_810
 
 // ELF: SHT_NOTE
 // ELF: 0000: 04000000 08000000 01000000 414D4400
diff --git a/test/MC/AMDGPU/hsa_isa_version_attrs.s b/test/MC/AMDGPU/hsa_isa_version_attrs.s
index ddd76fcf918..631e1a45097 100644
--- a/test/MC/AMDGPU/hsa_isa_version_attrs.s
+++ b/test/MC/AMDGPU/hsa_isa_version_attrs.s
@@ -1,5 +1,5 @@
-// RUN: llvm-mc -arch=amdgcn -mcpu=gfx801 -mattr=-code-object-v3,-fast-fmaf -show-encoding %s | FileCheck --check-prefix=GFX8 %s
-// RUN: llvm-mc -arch=amdgcn -mcpu=gfx900 -mattr=-code-object-v3,-mad-mix-insts -show-encoding %s | FileCheck --check-prefix=GFX9 %s
+// RUN: llvm-mc -arch=amdgcn -mcpu=gfx801 -mattr=-fast-fmaf -show-encoding %s | FileCheck --check-prefix=GFX8 %s
+// RUN: llvm-mc -arch=amdgcn -mcpu=gfx900 -mattr=-mad-mix-insts -show-encoding %s | FileCheck --check-prefix=GFX9 %s
 
 .hsa_code_object_isa
 // GFX8:  .hsa_code_object_isa 8,0,1,"AMD","AMDGPU"
diff --git a/test/MC/AMDGPU/isa-version-hsa.s b/test/MC/AMDGPU/isa-version-hsa.s
index 74e688163bc..9004e1c3ac5 100644
--- a/test/MC/AMDGPU/isa-version-hsa.s
+++ b/test/MC/AMDGPU/isa-version-hsa.s
@@ -1,10 +1,10 @@
-// RUN: not llvm-mc -triple amdgcn-amd-unknown -mattr=-code-object-v3 -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ERR %s
-// RUN: not llvm-mc -triple amdgcn-amd-unknown -mattr=-code-object-v3 -mcpu=iceland %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ERR %s
-// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=gfx802 %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA %s
-// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=iceland %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA %s
-// RUN: not llvm-mc -triple amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=gfx803 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA-ERR %s
-// RUN: not llvm-mc -triple amdgcn-amd-amdpal -mattr=-code-object-v3 -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL-ERR %s
-// RUN: not llvm-mc -triple amdgcn-amd-amdpal -mattr=-code-object-v3 -mcpu=iceland %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL-ERR %s
+// RUN: not llvm-mc -triple amdgcn-amd-unknown -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ERR %s
+// RUN: not llvm-mc -triple amdgcn-amd-unknown -mcpu=iceland %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ERR %s
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx802 %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA %s
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=iceland %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA %s
+// RUN: not llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx803 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA-ERR %s
+// RUN: not llvm-mc -triple amdgcn-amd-amdpal -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL-ERR %s
+// RUN: not llvm-mc -triple amdgcn-amd-amdpal -mcpu=iceland %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL-ERR %s
 
 // OSABI-HSA: .amd_amdgpu_isa "amdgcn-amd-amdhsa--gfx802"
 // OSABI-UNK-ERR: error: .amd_amdgpu_isa directive does not match triple and/or mcpu arguments specified through the command line
diff --git a/test/MC/AMDGPU/isa-version-pal.s b/test/MC/AMDGPU/isa-version-pal.s
index a872ff84258..42051b62c0d 100644
--- a/test/MC/AMDGPU/isa-version-pal.s
+++ b/test/MC/AMDGPU/isa-version-pal.s
@@ -1,10 +1,10 @@
-// RUN: not llvm-mc -triple amdgcn-amd-unknown -mattr=-code-object-v3 -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ERR %s
-// RUN: not llvm-mc -triple amdgcn-amd-unknown -mattr=-code-object-v3 -mcpu=iceland %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ERR %s
-// RUN: not llvm-mc -triple amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA-ERR %s
-// RUN: not llvm-mc -triple amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=iceland %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA-ERR %s
-// RUN: llvm-mc -triple amdgcn-amd-amdpal -mattr=-code-object-v3 -mcpu=gfx802 %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL %s
-// RUN: llvm-mc -triple amdgcn-amd-amdpal -mattr=-code-object-v3 -mcpu=iceland %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL %s
-// RUN: not llvm-mc -triple amdgcn-amd-unknown -mattr=-code-object-v3 -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ERR %s
+// RUN: not llvm-mc -triple amdgcn-amd-unknown -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ERR %s
+// RUN: not llvm-mc -triple amdgcn-amd-unknown -mcpu=iceland %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ERR %s
+// RUN: not llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA-ERR %s
+// RUN: not llvm-mc -triple amdgcn-amd-amdhsa -mcpu=iceland %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA-ERR %s
+// RUN: llvm-mc -triple amdgcn-amd-amdpal -mcpu=gfx802 %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL %s
+// RUN: llvm-mc -triple amdgcn-amd-amdpal -mcpu=iceland %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL %s
+// RUN: not llvm-mc -triple amdgcn-amd-unknown -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ERR %s
 
 // OSABI-PAL: .amd_amdgpu_isa "amdgcn-amd-amdpal--gfx802"
 // OSABI-UNK-ERR: error: .amd_amdgpu_isa directive does not match triple and/or mcpu arguments specified through the command line
diff --git a/test/MC/AMDGPU/isa-version-unk.s b/test/MC/AMDGPU/isa-version-unk.s
index 2b20ecb9285..81792ade083 100644
--- a/test/MC/AMDGPU/isa-version-unk.s
+++ b/test/MC/AMDGPU/isa-version-unk.s
@@ -1,10 +1,10 @@
-// RUN: llvm-mc -triple amdgcn-amd-unknown -mattr=-code-object-v3 -mcpu=gfx802 %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK %s
-// RUN: llvm-mc -triple amdgcn-amd-unknown -mattr=-code-object-v3 -mcpu=iceland %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK %s
-// RUN: not llvm-mc -triple amdgcn-amd-unknown -mattr=-code-object-v3 -mcpu=gfx803 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ERR %s
-// RUN: not llvm-mc -triple amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA-ERR %s
-// RUN: not llvm-mc -triple amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=iceland %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA-ERR %s
-// RUN: not llvm-mc -triple amdgcn-amd-amdpal -mattr=-code-object-v3 -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL-ERR %s
-// RUN: not llvm-mc -triple amdgcn-amd-amdpal -mattr=-code-object-v3 -mcpu=iceland %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL-ERR %s
+// RUN: llvm-mc -triple amdgcn-amd-unknown -mcpu=gfx802 %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK %s
+// RUN: llvm-mc -triple amdgcn-amd-unknown -mcpu=iceland %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK %s
+// RUN: not llvm-mc -triple amdgcn-amd-unknown -mcpu=gfx803 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ERR %s
+// RUN: not llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA-ERR %s
+// RUN: not llvm-mc -triple amdgcn-amd-amdhsa -mcpu=iceland %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA-ERR %s
+// RUN: not llvm-mc -triple amdgcn-amd-amdpal -mcpu=gfx802 %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL-ERR %s
+// RUN: not llvm-mc -triple amdgcn-amd-amdpal -mcpu=iceland %s 2>&1 | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL-ERR %s
 
 // OSABI-UNK: .amd_amdgpu_isa "amdgcn-amd-unknown--gfx802"
 // OSABI-UNK-ERR: error: .amd_amdgpu_isa directive does not match triple and/or mcpu arguments specified through the command line
diff --git a/test/MC/AMDGPU/sym_option.s b/test/MC/AMDGPU/sym_option.s
index 98b4067168e..8bc9495c9ed 100644
--- a/test/MC/AMDGPU/sym_option.s
+++ b/test/MC/AMDGPU/sym_option.s
@@ -1,12 +1,12 @@
-// RUN: llvm-mc -arch=amdgcn -mattr=-code-object-v3 -mcpu=tahiti %s | FileCheck %s --check-prefix=SI
-// RUN: llvm-mc -arch=amdgcn -mattr=-code-object-v3 -mcpu=bonaire %s | FileCheck %s --check-prefix=BONAIRE
-// RUN: llvm-mc -arch=amdgcn -mattr=-code-object-v3 -mcpu=hawaii %s | FileCheck %s --check-prefix=HAWAII
-// RUN: llvm-mc -arch=amdgcn -mattr=-code-object-v3 -mcpu=kabini  %s | FileCheck %s --check-prefix=KABINI
-// RUN: llvm-mc -arch=amdgcn -mattr=-code-object-v3 -mcpu=iceland %s | FileCheck %s --check-prefix=ICELAND
-// RUN: llvm-mc -arch=amdgcn -mattr=-code-object-v3 -mcpu=carrizo %s | FileCheck %s --check-prefix=CARRIZO
-// RUN: llvm-mc -arch=amdgcn -mattr=-code-object-v3 -mcpu=tonga %s | FileCheck %s --check-prefix=TONGA
-// RUN: llvm-mc -arch=amdgcn -mattr=-code-object-v3 -mcpu=fiji %s | FileCheck %s --check-prefix=FIJI
-// RUN: llvm-mc -arch=amdgcn -mattr=-code-object-v3 -mcpu=stoney  %s | FileCheck %s --check-prefix=STONEY
+// RUN: llvm-mc -arch=amdgcn -mcpu=tahiti      %s | FileCheck %s --check-prefix=SI
+// RUN: llvm-mc -arch=amdgcn -mcpu=bonaire %s | FileCheck %s --check-prefix=BONAIRE
+// RUN: llvm-mc -arch=amdgcn -mcpu=hawaii %s | FileCheck %s --check-prefix=HAWAII
+// RUN: llvm-mc -arch=amdgcn -mcpu=kabini  %s | FileCheck %s --check-prefix=KABINI
+// RUN: llvm-mc -arch=amdgcn -mcpu=iceland %s | FileCheck %s --check-prefix=ICELAND
+// RUN: llvm-mc -arch=amdgcn -mcpu=carrizo %s | FileCheck %s --check-prefix=CARRIZO
+// RUN: llvm-mc -arch=amdgcn -mcpu=tonga %s | FileCheck %s --check-prefix=TONGA
+// RUN: llvm-mc -arch=amdgcn -mcpu=fiji %s | FileCheck %s --check-prefix=FIJI
+// RUN: llvm-mc -arch=amdgcn -mcpu=stoney  %s | FileCheck %s --check-prefix=STONEY
 
 .byte .option.machine_version_major
 // SI: .byte 6
diff --git a/test/Object/AMDGPU/objdump.s b/test/Object/AMDGPU/objdump.s
index 3c3f4a11df4..31306ee90d8 100644
--- a/test/Object/AMDGPU/objdump.s
+++ b/test/Object/AMDGPU/objdump.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -arch=amdgcn -mcpu=tonga %s -mattr=-code-object-v3 -filetype=obj | llvm-objdump -disassemble -arch-name=amdgcn -mcpu=tonga - | FileCheck %s
+// RUN: llvm-mc -arch=amdgcn -mcpu=tonga %s -filetype=obj | llvm-objdump -disassemble -arch-name=amdgcn -mcpu=tonga - | FileCheck %s
 
 	.text
 
-- 
GitLab


From fbcb9d60a26cd9aa10b5fb290d00206631d1cba5 Mon Sep 17 00:00:00 2001
From: David Bolvansky <david.bolvansky@gmail.com>
Date: Tue, 30 Oct 2018 22:08:13 +0000
Subject: [PATCH 0773/1116] [ARM][NFC] Make tests immune to better div
 optimizations

Summary: Related to D52504

Reviewers: spatel

Reviewed By: spatel

Subscribers: javed.absar, kristof.beyls, chrib, llvm-commits

Differential Revision: https://reviews.llvm.org/D53901

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345665 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/ARM/fold-sext-sextload.ll   | 17 ++++++++---------
 test/CodeGen/ARM/fold-zext-zextload.ll   | 13 ++++++-------
 test/CodeGen/ARM/vector-extend-narrow.ll |  4 ++--
 3 files changed, 16 insertions(+), 18 deletions(-)

diff --git a/test/CodeGen/ARM/fold-sext-sextload.ll b/test/CodeGen/ARM/fold-sext-sextload.ll
index 484e93f59d4..96e2e78a47d 100644
--- a/test/CodeGen/ARM/fold-sext-sextload.ll
+++ b/test/CodeGen/ARM/fold-sext-sextload.ll
@@ -1,15 +1,14 @@
 ; RUN: llc -mtriple armv7 %s -stop-before=livedebugvalues -o - | FileCheck %s
 
-define <4 x i8> @i(<4 x i8>*) !dbg !8 {
-  %2 = load <4 x i8>, <4 x i8>* %0, align 4, !dbg !14
+define <4 x i8> @i(<4 x i8>*, <4 x i8>) !dbg !8 {
+  %3 = load <4 x i8>, <4 x i8>* %0, align 4, !dbg !14
   ; CHECK: $[[reg:.*]] = VLD1LNd32 {{.*}} debug-location !14 :: (load 4 from %ir.0)
-  ; CHECK-NEXT: VMOVLsv8i16 {{.*}} $[[reg]], {{.*}} debug-location !14
-  ; CHECK-NEXT: VMOVLsv4i32 {{.*}} $[[reg]], {{.*}} debug-location !14
-
-  %3 = sdiv <4 x i8> zeroinitializer, %2, !dbg !15
-  call void @llvm.dbg.value(metadata <4 x i8> %2, metadata !11, metadata !DIExpression()), !dbg !14
-  call void @llvm.dbg.value(metadata <4 x i8> %3, metadata !13, metadata !DIExpression()), !dbg !15
-  ret <4 x i8> %3, !dbg !16
+  ; CHECK: VMOVLsv8i16 {{.*}} $[[reg]], {{.*}} debug-location !14
+  ; CHECK: VMOVLsv4i32 {{.*}} $[[reg]], {{.*}} debug-location !14
+  %4 = sdiv <4 x i8> %1, %3, !dbg !15
+  call void @llvm.dbg.value(metadata <4 x i8> %3, metadata !11, metadata !DIExpression()), !dbg !14
+  call void @llvm.dbg.value(metadata <4 x i8> %4, metadata !13, metadata !DIExpression()), !dbg !15
+  ret <4 x i8> %4, !dbg !16
 }
 
 declare void @llvm.dbg.value(metadata, metadata, metadata)
diff --git a/test/CodeGen/ARM/fold-zext-zextload.ll b/test/CodeGen/ARM/fold-zext-zextload.ll
index 3ff0dd885a8..25e226fda66 100644
--- a/test/CodeGen/ARM/fold-zext-zextload.ll
+++ b/test/CodeGen/ARM/fold-zext-zextload.ll
@@ -1,15 +1,14 @@
 ; RUN: llc -mtriple armv7 %s -stop-before=livedebugvalues -o - | FileCheck %s
 
-define <4 x i8> @i(<4 x i8>*) !dbg !8 {
-  %2 = load <4 x i8>, <4 x i8>* %0, align 4, !dbg !14
+define <4 x i8> @i(<4 x i8>*, <4 x i8>) !dbg !8 {
+  %3 = load <4 x i8>, <4 x i8>* %0, align 4, !dbg !14
   ; CHECK: $[[reg:.*]] = VLD1LNd32 {{.*}} debug-location !14 :: (load 4 from %ir.0)
   ; CHECK-NEXT: VMOVLuv8i16 {{.*}} $[[reg]], {{.*}} debug-location !14
   ; CHECK-NEXT: VMOVLuv4i32 {{.*}} $[[reg]], {{.*}} debug-location !14
-
-  %3 = udiv <4 x i8> zeroinitializer, %2, !dbg !15
-  call void @llvm.dbg.value(metadata <4 x i8> %2, metadata !11, metadata !DIExpression()), !dbg !14
-  call void @llvm.dbg.value(metadata <4 x i8> %3, metadata !13, metadata !DIExpression()), !dbg !15
-  ret <4 x i8> %3, !dbg !16
+  %4 = udiv <4 x i8> %1, %3, !dbg !15
+  call void @llvm.dbg.value(metadata <4 x i8> %3, metadata !11, metadata !DIExpression()), !dbg !14
+  call void @llvm.dbg.value(metadata <4 x i8> %4, metadata !13, metadata !DIExpression()), !dbg !15
+  ret <4 x i8> %4, !dbg !16
 }
 
 declare void @llvm.dbg.value(metadata, metadata, metadata)
diff --git a/test/CodeGen/ARM/vector-extend-narrow.ll b/test/CodeGen/ARM/vector-extend-narrow.ll
index d054bfda615..1aaffcc302d 100644
--- a/test/CodeGen/ARM/vector-extend-narrow.ll
+++ b/test/CodeGen/ARM/vector-extend-narrow.ll
@@ -48,7 +48,7 @@ define <4 x i8> @h(<4 x float> %v) {
 }
 
 ; CHECK-LABEL: i:
-define <4 x i8> @i(<4 x i8>* %x) {
+define <4 x i8> @i(<4 x i8>* %x, <4 x i8> %y) {
 ; Note: vld1 here is reasonably important. Mixing VFP and NEON
 ; instructions is bad on some cores
   ; CHECK: vld1
@@ -59,7 +59,7 @@ define <4 x i8> @i(<4 x i8>* %x) {
   ; CHECK: vmul
   ; CHECK: vmovn
   %1 = load <4 x i8>, <4 x i8>* %x, align 4
-  %2 = sdiv <4 x i8> zeroinitializer, %1
+  %2 = sdiv <4 x i8> %y, %1
   ret <4 x i8> %2
 }
 ; CHECK-LABEL: j:
-- 
GitLab


From 83c015bd7346755833a1ffbf06d8d6c8a7e4adca Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze@braunis.de>
Date: Tue, 30 Oct 2018 23:28:27 +0000
Subject: [PATCH 0774/1116] MachineOperand/MIParser: Do not print debug-use
 flag, infer it

The debug-use flag must be set exactly for uses on DBG_VALUEs.  This is
so obvious that it can be trivially inferred while parsing. This will
reduce noise when printing while omitting an information that has little
value to the user.

The parser will keep recognizing the flag for compatibility with old
`.mir` files.

Differential Revision: https://reviews.llvm.org/D53903

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345671 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/MIRParser/MIParser.cpp            |  2 +
 lib/CodeGen/MachineOperand.cpp                |  4 +-
 test/CodeGen/AArch64/GlobalISel/debug-cpp.ll  |  2 +-
 .../CodeGen/AArch64/GlobalISel/debug-insts.ll |  6 +-
 .../GlobalISel/regbankselect-dbg-value.mir    |  4 +-
 .../AArch64/GlobalISel/select-dbg-value.mir   |  8 +-
 test/CodeGen/AArch64/phi-dbg.ll               |  4 +-
 ...ld-immediate-operand-shrink-with-carry.mir |  4 +-
 test/CodeGen/AMDGPU/regcoalesce-dbg.mir       |  2 +-
 test/CodeGen/AMDGPU/sched-crash-dbg-value.mir |  4 +-
 test/CodeGen/ARM/ARMLoadStoreDBG.mir          | 22 +++---
 test/CodeGen/ARM/dbg-range-extension.mir      | 52 ++++++-------
 test/CodeGen/ARM/sched-it-debug-nodes.mir     | 22 +++---
 test/CodeGen/Hexagon/early-if-debug.mir       | 20 ++---
 test/CodeGen/MIR/X86/diexpr-win32.mir         |  6 +-
 .../MIR/X86/instructions-debug-location.mir   |  6 +-
 test/CodeGen/MIR/X86/pr38773.mir              |  4 +-
 test/CodeGen/PowerPC/debuginfo-split-int.ll   |  4 +-
 test/CodeGen/PowerPC/debuginfo-stackarg.ll    |  2 +-
 .../X86/coalesce-dbg-value-subreg-rewrite.mir |  4 +-
 .../dbg-changes-codegen-branch-folding2.mir   |  2 +-
 test/CodeGen/X86/dbg-value-superreg-copy.mir  |  2 +-
 test/CodeGen/X86/lea-opt-with-debug.mir       |  4 +-
 test/CodeGen/X86/machine-cp-debug.mir         |  2 +-
 test/CodeGen/X86/opt_phis.mir                 |  2 +-
 test/CodeGen/X86/post-ra-sched-with-debug.mir | 14 ++--
 test/CodeGen/X86/postra-ignore-dbg-instrs.mir |  6 +-
 test/CodeGen/X86/shrink_wrap_dbg_value.mir    |  8 +-
 .../compiler-gen-bbs-livedebugvalues.ll       | 10 +--
 test/DebugInfo/ARM/sdag-split-arg1.ll         |  2 +-
 test/DebugInfo/MIR/AArch64/clobber-sp.mir     | 12 +--
 .../MIR/ARM/live-debug-values-reg-copy.mir    | 10 +--
 .../MIR/ARM/split-superreg-complex.mir        |  2 +-
 .../MIR/ARM/split-superreg-piece.mir          |  2 +-
 test/DebugInfo/MIR/ARM/split-superreg.mir     |  2 +-
 test/DebugInfo/MIR/Mips/last-inst-bundled.mir | 10 +--
 .../MIR/Mips/live-debug-values-reg-copy.mir   | 20 ++---
 test/DebugInfo/MIR/X86/bit-piece-dh.mir       |  2 +-
 test/DebugInfo/MIR/X86/kill-after-spill.mir   | 12 +--
 .../MIR/X86/live-debug-values-3preds.mir      | 68 ++++++++---------
 .../MIR/X86/live-debug-values-reg-copy.mir    |  8 +-
 .../MIR/X86/live-debug-values-spill.mir       | 74 +++++++++----------
 test/DebugInfo/MIR/X86/live-debug-values.mir  | 18 ++---
 .../live-debug-vars-unused-arg-debugonly.mir  | 16 ++--
 .../MIR/X86/live-debug-vars-unused-arg.mir    | 18 ++---
 .../MIR/X86/livedebugvalues-limit.mir         | 28 +++----
 test/DebugInfo/MIR/X86/mlicm-hoist.mir        |  8 +-
 test/DebugInfo/MIR/X86/regcoalescer.mir       |  4 +-
 test/DebugInfo/MSP430/sdagsplit-1.ll          |  8 +-
 .../WebAssembly/dbg-value-live-interval.ll    |  2 +-
 .../DebugInfo/WebAssembly/dbg-value-move-2.ll |  2 +-
 test/DebugInfo/WebAssembly/dbg-value-move.ll  |  2 +-
 test/DebugInfo/X86/bbjoin.ll                  |  8 +-
 .../X86/live-debug-vars-discard-invalid.mir   | 38 +++++-----
 test/DebugInfo/X86/live-debug-vars-dse.mir    |  2 +-
 test/DebugInfo/X86/live-debug-vars-index.mir  |  8 +-
 test/DebugInfo/X86/pr34545.ll                 | 10 +--
 test/DebugInfo/X86/sdag-combine.ll            |  2 +-
 test/DebugInfo/X86/sdag-dangling-dbgvalue.ll  | 24 +++---
 test/DebugInfo/X86/sdag-dbgvalue-phi-use-1.ll |  8 +-
 test/DebugInfo/X86/sdag-dbgvalue-phi-use-2.ll | 12 +--
 test/DebugInfo/X86/sdag-dbgvalue-phi-use-3.ll | 12 +--
 test/DebugInfo/X86/sdag-dbgvalue-phi-use-4.ll | 10 +--
 test/DebugInfo/X86/sdag-legalize-multires.ll  |  4 +-
 test/DebugInfo/X86/sdag-salvage-add.ll        |  4 +-
 test/DebugInfo/X86/sdagsplit-1.ll             |  4 +-
 66 files changed, 357 insertions(+), 351 deletions(-)

diff --git a/lib/CodeGen/MIRParser/MIParser.cpp b/lib/CodeGen/MIRParser/MIParser.cpp
index da758da873c..1a6174bf9ee 100644
--- a/lib/CodeGen/MIRParser/MIParser.cpp
+++ b/lib/CodeGen/MIRParser/MIParser.cpp
@@ -752,6 +752,8 @@ bool MIParser::parse(MachineInstr *&MI) {
     Optional<unsigned> TiedDefIdx;
     if (parseMachineOperandAndTargetFlags(MO, TiedDefIdx))
       return true;
+    if (OpCode == TargetOpcode::DBG_VALUE && MO.isReg())
+      MO.setIsDebug();
     Operands.push_back(
         ParsedMachineOperand(MO, Loc, Token.location(), TiedDefIdx));
     if (Token.isNewlineOrEOF() || Token.is(MIToken::coloncolon) ||
diff --git a/lib/CodeGen/MachineOperand.cpp b/lib/CodeGen/MachineOperand.cpp
index 97d32a5d66a..4fe51f66248 100644
--- a/lib/CodeGen/MachineOperand.cpp
+++ b/lib/CodeGen/MachineOperand.cpp
@@ -744,10 +744,10 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
       OS << "undef ";
     if (isEarlyClobber())
       OS << "early-clobber ";
-    if (isDebug())
-      OS << "debug-use ";
     if (TargetRegisterInfo::isPhysicalRegister(getReg()) && isRenamable())
       OS << "renamable ";
+    // isDebug() is exactly true for register operands of a DBG_VALUE. So we
+    // simply infer it when parsing and do not need to print it.
 
     const MachineRegisterInfo *MRI = nullptr;
     if (TargetRegisterInfo::isVirtualRegister(Reg)) {
diff --git a/test/CodeGen/AArch64/GlobalISel/debug-cpp.ll b/test/CodeGen/AArch64/GlobalISel/debug-cpp.ll
index e603af678de..caf0a2eebca 100644
--- a/test/CodeGen/AArch64/GlobalISel/debug-cpp.ll
+++ b/test/CodeGen/AArch64/GlobalISel/debug-cpp.ll
@@ -18,7 +18,7 @@ target triple = "aarch64-unknown-linux-gnu"
 %struct.NTCopy = type { i32 }
 
 ; CHECK-LABEL: name: _Z3foo6NTCopy
-; CHECK: DBG_VALUE debug-use %{{[0-9]+}}(p0), 0, !23, !DIExpression(), debug-location !24
+; CHECK: DBG_VALUE %{{[0-9]+}}(p0), 0, !23, !DIExpression(), debug-location !24
 ; Function Attrs: noinline nounwind optnone
 define dso_local i32 @_Z3foo6NTCopy(%struct.NTCopy* %o) #0 !dbg !7 {
 entry:
diff --git a/test/CodeGen/AArch64/GlobalISel/debug-insts.ll b/test/CodeGen/AArch64/GlobalISel/debug-insts.ll
index 256eb37f6d4..2945d65d3e2 100644
--- a/test/CodeGen/AArch64/GlobalISel/debug-insts.ll
+++ b/test/CodeGen/AArch64/GlobalISel/debug-insts.ll
@@ -15,7 +15,7 @@ entry:
 }
 
 ; CHECK-LABEL: name: debug_declare_vla
-; CHECK: DBG_VALUE debug-use %{{[0-9]+}}(p0), 0, !14, !DIExpression(), debug-location !15
+; CHECK: DBG_VALUE %{{[0-9]+}}(p0), 0, !14, !DIExpression(), debug-location !15
 define void @debug_declare_vla(i32 %in) #0 !dbg !13 {
 entry:
   %vla.addr = alloca i32, i32 %in
@@ -27,10 +27,10 @@ entry:
 ; CHECK: [[IN:%[0-9]+]]:_(s32) = COPY $w0
 define void @debug_value(i32 %in) #0 !dbg !16 {
   %addr = alloca i32
-; CHECK: DBG_VALUE debug-use [[IN]](s32), debug-use $noreg, !17, !DIExpression(), debug-location !18
+; CHECK: DBG_VALUE [[IN]](s32), $noreg, !17, !DIExpression(), debug-location !18
   call void @llvm.dbg.value(metadata i32 %in, i64 0, metadata !17, metadata !DIExpression()), !dbg !18
   store i32 %in, i32* %addr
-; CHECK: DBG_VALUE debug-use %1(p0), debug-use $noreg, !17, !DIExpression(DW_OP_deref), debug-location !18
+; CHECK: DBG_VALUE %1(p0), $noreg, !17, !DIExpression(DW_OP_deref), debug-location !18
   call void @llvm.dbg.value(metadata i32* %addr, i64 0, metadata !17, metadata !DIExpression(DW_OP_deref)), !dbg !18
 ; CHECK: DBG_VALUE 123, 0, !17, !DIExpression(), debug-location !18
   call void @llvm.dbg.value(metadata i32 123, i64 0, metadata !17, metadata !DIExpression()), !dbg !18
diff --git a/test/CodeGen/AArch64/GlobalISel/regbankselect-dbg-value.mir b/test/CodeGen/AArch64/GlobalISel/regbankselect-dbg-value.mir
index 1528a809771..c64e2f78ab3 100644
--- a/test/CodeGen/AArch64/GlobalISel/regbankselect-dbg-value.mir
+++ b/test/CodeGen/AArch64/GlobalISel/regbankselect-dbg-value.mir
@@ -36,8 +36,8 @@ body: |
   bb.0:
     liveins: $w0
     %0:_(s32) = COPY $w0
-    ; CHECK: DBG_VALUE debug-use %0(s32), debug-use $noreg, !7, !DIExpression(), debug-location !9
-    DBG_VALUE debug-use %0(s32), debug-use $noreg, !7, !DIExpression(), debug-location !9
+    ; CHECK: DBG_VALUE %0(s32), $noreg, !7, !DIExpression(), debug-location !9
+    DBG_VALUE %0(s32), $noreg, !7, !DIExpression(), debug-location !9
 
     ; CHECK: DBG_VALUE $noreg, 0, !7, !DIExpression(), debug-location !9
     DBG_VALUE $noreg, 0, !7, !DIExpression(), debug-location !9
diff --git a/test/CodeGen/AArch64/GlobalISel/select-dbg-value.mir b/test/CodeGen/AArch64/GlobalISel/select-dbg-value.mir
index f75d5629478..72fbfad89c2 100644
--- a/test/CodeGen/AArch64/GlobalISel/select-dbg-value.mir
+++ b/test/CodeGen/AArch64/GlobalISel/select-dbg-value.mir
@@ -46,11 +46,11 @@ body: |
     ; CHECK: [[COPY:%[0-9]+]]:gpr32 = COPY $w0
     ; CHECK: [[ADDWrr:%[0-9]+]]:gpr32 = ADDWrr [[COPY]], [[COPY]]
     ; CHECK: $w0 = COPY [[ADDWrr]]
-    ; CHECK: DBG_VALUE debug-use [[ADDWrr]], debug-use $noreg, !7, !DIExpression(), debug-location !9
+    ; CHECK: DBG_VALUE [[ADDWrr]], $noreg, !7, !DIExpression(), debug-location !9
     %0:gpr(s32) = COPY $w0
     %1:gpr(s32) = G_ADD %0, %0
     $w0 = COPY %1(s32)
-    DBG_VALUE debug-use %1(s32), debug-use $noreg, !7, !DIExpression(), debug-location !9
+    DBG_VALUE %1(s32), $noreg, !7, !DIExpression(), debug-location !9
 ...
 
 ---
@@ -62,7 +62,7 @@ body: |
     liveins: $w0
     ; CHECK-LABEL: name: test_dbg_value_dead
     ; CHECK-NOT: COPY
-    ; CHECK: DBG_VALUE debug-use $noreg, debug-use $noreg, !7, !DIExpression(), debug-location !9
+    ; CHECK: DBG_VALUE $noreg, $noreg, !7, !DIExpression(), debug-location !9
     %0:gpr(s32) = COPY $w0
-    DBG_VALUE debug-use %0(s32), debug-use $noreg, !7, !DIExpression(), debug-location !9
+    DBG_VALUE %0(s32), $noreg, !7, !DIExpression(), debug-location !9
 ...
diff --git a/test/CodeGen/AArch64/phi-dbg.ll b/test/CodeGen/AArch64/phi-dbg.ll
index 0b5c6677acd..4f7c005f802 100644
--- a/test/CodeGen/AArch64/phi-dbg.ll
+++ b/test/CodeGen/AArch64/phi-dbg.ll
@@ -35,11 +35,11 @@ bb2:
 bb3:
 ; CHECK: bb.3.bb3:
 ; CHECK:   [[PHIDEST:%[0-9]+]]:gpr32 = COPY [[PHIREG]]
-; CHECK-NEXT:   DBG_VALUE debug-use [[PHIDEST]]
+; CHECK-NEXT:   DBG_VALUE [[PHIDEST]]
   %.0 = phi i32 [ 12, %bb2 ], [ 1, %entry ]
   call void @llvm.dbg.value(metadata i32 %.0, i64 0, metadata !15, metadata !13), !dbg !16
 ; CHECK: [[ADD:%[0-9]+]]:gpr32 = nsw ADDWrr [[PHIDEST]]
-; CHECK-NEXT: DBG_VALUE debug-use [[ADD]]
+; CHECK-NEXT: DBG_VALUE [[ADD]]
   %v5 = add nsw i32 %.0, %a0, !dbg !22
   call void @llvm.dbg.value(metadata i32 %v5, i64 0, metadata !15, metadata !13), !dbg !16
   ret i32 %v5, !dbg !23
diff --git a/test/CodeGen/AMDGPU/fold-immediate-operand-shrink-with-carry.mir b/test/CodeGen/AMDGPU/fold-immediate-operand-shrink-with-carry.mir
index ec0faf7ab66..f3a1168885f 100644
--- a/test/CodeGen/AMDGPU/fold-immediate-operand-shrink-with-carry.mir
+++ b/test/CodeGen/AMDGPU/fold-immediate-operand-shrink-with-carry.mir
@@ -63,7 +63,7 @@ body:             |
     ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 12345
     ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
     ; GCN: [[V_ADD_I32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_I32_e32 [[S_MOV_B32_]], [[DEF]], implicit-def $vcc, implicit $exec
-    ; GCN: DBG_VALUE debug-use %5:sreg_64_xexec, debug-use $noreg
+    ; GCN: DBG_VALUE %5:sreg_64_xexec, $noreg
     ; GCN: S_ENDPGM implicit [[V_ADD_I32_e32_]]
     %0:sreg_32_xm0 = S_MOV_B32 12345
     %1:vgpr_32 = IMPLICIT_DEF
@@ -71,7 +71,7 @@ body:             |
     %3:vgpr_32 = IMPLICIT_DEF
 
     %4:vgpr_32, %5:sreg_64_xexec = V_ADD_I32_e64 %0, %1, implicit $exec
-    DBG_VALUE debug-use %5, debug-use $noreg
+    DBG_VALUE %5, $noreg
     S_ENDPGM implicit %4
 
 ...
diff --git a/test/CodeGen/AMDGPU/regcoalesce-dbg.mir b/test/CodeGen/AMDGPU/regcoalesce-dbg.mir
index 693b7d827c5..d69bbda463c 100644
--- a/test/CodeGen/AMDGPU/regcoalesce-dbg.mir
+++ b/test/CodeGen/AMDGPU/regcoalesce-dbg.mir
@@ -5,7 +5,7 @@
 # a slot index.
 
 # CHECK: %13.sub2:sgpr_128 = S_MOV_B32 0
-# CHECK: DBG_VALUE{{.*}}debug-use %13.sub2
+# CHECK: DBG_VALUE{{.*}} %13.sub2
 
 --- |
   define amdgpu_kernel void @test(i32 addrspace(1)* %out) { ret void }
diff --git a/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir b/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir
index 182096305f4..3d52f5aad04 100644
--- a/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir
+++ b/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir
@@ -169,7 +169,7 @@
 ---
 
 # CHECK: name: sched_dbg_value_crash
-# CHECK: DBG_VALUE debug-use %99, debug-use $noreg, !5, !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef), debug-location !8
+# CHECK: DBG_VALUE %99, $noreg, !5, !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef), debug-location !8
 
 name:            sched_dbg_value_crash
 alignment:       0
@@ -319,7 +319,7 @@ body:             |
     %124:vgpr_32 = IMPLICIT_DEF
     %125:vgpr_32 = IMPLICIT_DEF
     %126:vgpr_32 = IMPLICIT_DEF
-    DBG_VALUE debug-use %103, debug-use _, !5, !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef), debug-location !8
+    DBG_VALUE %103, _, !5, !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef), debug-location !8
     ADJCALLSTACKUP 0, 0, implicit-def $sgpr32, implicit $sgpr32
     %127:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
     $sgpr4 = COPY $sgpr101
diff --git a/test/CodeGen/ARM/ARMLoadStoreDBG.mir b/test/CodeGen/ARM/ARMLoadStoreDBG.mir
index 76f1523f779..ce33dcf52ec 100644
--- a/test/CodeGen/ARM/ARMLoadStoreDBG.mir
+++ b/test/CodeGen/ARM/ARMLoadStoreDBG.mir
@@ -120,19 +120,19 @@ body:             |
   bb.0.entry:
     liveins: $r0, $r1, $r2, $r3, $lr, $r7
 
-    DBG_VALUE debug-use $r0, debug-use $noreg, !18, !27, debug-location !28
-    DBG_VALUE debug-use $r1, debug-use $noreg, !19, !27, debug-location !28
-    DBG_VALUE debug-use $r2, debug-use $noreg, !20, !27, debug-location !28
-    DBG_VALUE debug-use $r3, debug-use $noreg, !21, !27, debug-location !28
+    DBG_VALUE $r0, $noreg, !18, !27, debug-location !28
+    DBG_VALUE $r1, $noreg, !19, !27, debug-location !28
+    DBG_VALUE $r2, $noreg, !20, !27, debug-location !28
+    DBG_VALUE $r3, $noreg, !21, !27, debug-location !28
     t2CMPri $r3, 4, 14, $noreg, implicit-def $cpsr, debug-location !31
     t2Bcc %bb.2.if.end, 2, killed $cpsr
 
   bb.1:
     liveins: $lr, $r7
 
-    DBG_VALUE debug-use $r1, debug-use $noreg, !19, !27, debug-location !28
+    DBG_VALUE $r1, $noreg, !19, !27, debug-location !28
     $r0 = t2MOVi -1, 14, $noreg, $noreg
-    DBG_VALUE debug-use $r1, debug-use $noreg, !19, !27, debug-location !28
+    DBG_VALUE $r1, $noreg, !19, !27, debug-location !28
     tBX_RET 14, $noreg, implicit $r0, debug-location !34
 
   bb.2.if.end:
@@ -142,12 +142,12 @@ body:             |
     frame-setup CFI_INSTRUCTION def_cfa_offset 8
     frame-setup CFI_INSTRUCTION offset $lr, -4
     frame-setup CFI_INSTRUCTION offset $r7, -8
-    DBG_VALUE debug-use $r0, debug-use $noreg, !18, !27, debug-location !28
-    DBG_VALUE debug-use $r1, debug-use $noreg, !19, !27, debug-location !28
-    DBG_VALUE debug-use $r2, debug-use $noreg, !20, !27, debug-location !28
-    DBG_VALUE debug-use $r3, debug-use $noreg, !21, !27, debug-location !28
+    DBG_VALUE $r0, $noreg, !18, !27, debug-location !28
+    DBG_VALUE $r1, $noreg, !19, !27, debug-location !28
+    DBG_VALUE $r2, $noreg, !20, !27, debug-location !28
+    DBG_VALUE $r3, $noreg, !21, !27, debug-location !28
     $r1 = COPY killed $r2, debug-location !32
-    DBG_VALUE debug-use $r1, debug-use $noreg, !19, !27, debug-location !28
+    DBG_VALUE $r1, $noreg, !19, !27, debug-location !28
     $r2 = COPY killed $r3, debug-location !32
     tBL 14, $noreg, @g, csr_aapcs, implicit-def dead $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit-def $sp, debug-location !32
     $r0 = t2MOVi 0, 14, $noreg, $noreg
diff --git a/test/CodeGen/ARM/dbg-range-extension.mir b/test/CodeGen/ARM/dbg-range-extension.mir
index 0dd9ed2b207..f2b174a8ac0 100644
--- a/test/CodeGen/ARM/dbg-range-extension.mir
+++ b/test/CodeGen/ARM/dbg-range-extension.mir
@@ -23,37 +23,37 @@
 # CHECK: [[VAR_I:![0-9]+]] = !DILocalVariable(name: "i",
 
 # CHECK: bb.0.entry
-# CHECK: DBG_VALUE debug-use $r0, debug-use $noreg, [[VAR_A]]
-# CHECK: DBG_VALUE debug-use [[REG_A:\$r[0-9]+]], debug-use $noreg, [[VAR_A]]
-# CHECK: DBG_VALUE debug-use [[REG_B:\$r[0-9]+]], debug-use $noreg, [[VAR_B]]
+# CHECK: DBG_VALUE $r0, $noreg, [[VAR_A]]
+# CHECK: DBG_VALUE [[REG_A:\$r[0-9]+]], $noreg, [[VAR_A]]
+# CHECK: DBG_VALUE [[REG_B:\$r[0-9]+]], $noreg, [[VAR_B]]
 
 # CHECK: bb.1.if.then
-# CHECK: DBG_VALUE debug-use [[REG_B]], debug-use $noreg, [[VAR_B]]
-# CHECK: DBG_VALUE debug-use [[REG_A]], debug-use $noreg, [[VAR_A]]
-# CHECK: DBG_VALUE debug-use [[REG_C:\$r[0-9]+]], debug-use $noreg, [[VAR_C]]
+# CHECK: DBG_VALUE [[REG_B]], $noreg, [[VAR_B]]
+# CHECK: DBG_VALUE [[REG_A]], $noreg, [[VAR_A]]
+# CHECK: DBG_VALUE [[REG_C:\$r[0-9]+]], $noreg, [[VAR_C]]
 # CHECK: DBG_VALUE 1, 0, [[VAR_I]]
 
 # CHECK: bb.2.for.body
-# CHECK: DBG_VALUE debug-use [[REG_I:\$r[0-9]+]], debug-use $noreg, [[VAR_I]]
-# CHECK: DBG_VALUE debug-use [[REG_C]], debug-use $noreg, [[VAR_C]]
-# CHECK: DBG_VALUE debug-use [[REG_B]], debug-use $noreg, [[VAR_B]]
-# CHECK: DBG_VALUE debug-use [[REG_A]], debug-use $noreg, [[VAR_A]]
-# CHECK: DBG_VALUE debug-use [[REG_I]], debug-use $noreg, [[VAR_I]]
+# CHECK: DBG_VALUE [[REG_I:\$r[0-9]+]], $noreg, [[VAR_I]]
+# CHECK: DBG_VALUE [[REG_C]], $noreg, [[VAR_C]]
+# CHECK: DBG_VALUE [[REG_B]], $noreg, [[VAR_B]]
+# CHECK: DBG_VALUE [[REG_A]], $noreg, [[VAR_A]]
+# CHECK: DBG_VALUE [[REG_I]], $noreg, [[VAR_I]]
 
 # CHECK: bb.3.for.cond
-# CHECK: DBG_VALUE debug-use [[REG_C]], debug-use $noreg, [[VAR_C]]
-# CHECK: DBG_VALUE debug-use [[REG_B]], debug-use $noreg, [[VAR_B]]
-# CHECK: DBG_VALUE debug-use [[REG_A]], debug-use $noreg, [[VAR_A]]
-# CHECK: DBG_VALUE debug-use [[REG_I]], debug-use $noreg, [[VAR_I]]
+# CHECK: DBG_VALUE [[REG_C]], $noreg, [[VAR_C]]
+# CHECK: DBG_VALUE [[REG_B]], $noreg, [[VAR_B]]
+# CHECK: DBG_VALUE [[REG_A]], $noreg, [[VAR_A]]
+# CHECK: DBG_VALUE [[REG_I]], $noreg, [[VAR_I]]
 
 # CHECK: bb.4.for.cond.cleanup
-# CHECK: DBG_VALUE debug-use [[REG_C]], debug-use $noreg, [[VAR_C]]
-# CHECK: DBG_VALUE debug-use [[REG_B]], debug-use $noreg, [[VAR_B]]
-# CHECK: DBG_VALUE debug-use [[REG_A]], debug-use $noreg, [[VAR_A]]
+# CHECK: DBG_VALUE [[REG_C]], $noreg, [[VAR_C]]
+# CHECK: DBG_VALUE [[REG_B]], $noreg, [[VAR_B]]
+# CHECK: DBG_VALUE [[REG_A]], $noreg, [[VAR_A]]
 
 # CHECK: bb.5.if.end
-# CHECK: DBG_VALUE debug-use [[REG_B]], debug-use $noreg, [[VAR_B]]
-# CHECK: DBG_VALUE debug-use [[REG_A]], debug-use $noreg, [[VAR_A]]
+# CHECK: DBG_VALUE [[REG_B]], $noreg, [[VAR_B]]
+# CHECK: DBG_VALUE [[REG_A]], $noreg, [[VAR_A]]
 --- |
   ; ModuleID = '/data/kwalker/work/OpenSource-llvm/llvm/test/CodeGen/ARM/dbg-range-extension.ll'
   source_filename = "/data/kwalker/work/OpenSource-llvm/llvm/test/CodeGen/ARM/dbg-range-extension.ll"
@@ -219,14 +219,14 @@ body:             |
     frame-setup CFI_INSTRUCTION offset $r6, -16
     frame-setup CFI_INSTRUCTION offset $r5, -20
     frame-setup CFI_INSTRUCTION offset $r4, -24
-    DBG_VALUE debug-use $r0, debug-use $noreg, !13, !20, debug-location !21
+    DBG_VALUE $r0, $noreg, !13, !20, debug-location !21
     $r4 = MOVr killed $r0, 14, $noreg, $noreg
-    DBG_VALUE debug-use $r4, debug-use $noreg, !13, !20, debug-location !21
+    DBG_VALUE $r4, $noreg, !13, !20, debug-location !21
     $r0 = MOVi 10, 14, $noreg, _, debug-location !22
     $r1 = MOVi 11, 14, $noreg, _, debug-location !22
     BL @func2, csr_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $r0, implicit killed $r1, implicit-def $sp, implicit-def $r0, debug-location !22
     $r5 = MOVr killed $r0, 14, $noreg, _, debug-location !22
-    DBG_VALUE debug-use $r5, debug-use $noreg, !14, !20, debug-location !23
+    DBG_VALUE $r5, $noreg, !14, !20, debug-location !23
     CMPri $r4, 0, 14, $noreg, implicit-def $cpsr, debug-location !25
     Bcc %bb.5.if.end, 0, killed $cpsr
   
@@ -237,7 +237,7 @@ body:             |
     $r1 = MOVi 13, 14, $noreg, _, debug-location !26
     BL @func2, csr_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $r0, implicit killed $r1, implicit-def $sp, implicit-def $r0, debug-location !26
     $r6 = MOVr killed $r0, 14, $noreg, _, debug-location !26
-    DBG_VALUE debug-use $r6, debug-use $noreg, !15, !20, debug-location !27
+    DBG_VALUE $r6, $noreg, !15, !20, debug-location !27
     $r7 = MOVi 1, 14, $noreg, $noreg
     DBG_VALUE 1, 0, !18, !20, debug-location !28
     B %bb.3.for.cond
@@ -249,12 +249,12 @@ body:             |
     $r0 = MOVr $r7, 14, $noreg, _, debug-location !36
     BL @func2, csr_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $r0, implicit killed $r1, implicit-def $sp, implicit-def dead $r0, debug-location !36
     $r7 = ADDri killed $r7, 1, 14, $noreg, _, debug-location !38
-    DBG_VALUE debug-use $r7, debug-use $noreg, !18, !20, debug-location !28
+    DBG_VALUE $r7, $noreg, !18, !20, debug-location !28
   
   bb.3.for.cond:
     liveins: $r4, $r5, $r6, $r7
   
-    DBG_VALUE debug-use $r7, debug-use $noreg, !18, !20, debug-location !28
+    DBG_VALUE $r7, $noreg, !18, !20, debug-location !28
     CMPrr $r7, $r4, 14, $noreg, implicit-def $cpsr, debug-location !33
     Bcc %bb.2.for.body, 11, killed $cpsr, debug-location !33
   
diff --git a/test/CodeGen/ARM/sched-it-debug-nodes.mir b/test/CodeGen/ARM/sched-it-debug-nodes.mir
index 8d0688ef01d..ec42e7df3b2 100644
--- a/test/CodeGen/ARM/sched-it-debug-nodes.mir
+++ b/test/CodeGen/ARM/sched-it-debug-nodes.mir
@@ -33,7 +33,7 @@
   ; hopefully, triggering an assert).
 
   ; CHECK: BUNDLE implicit-def dead $itstate{{.*}} {
-  ; CHECK: DBG_VALUE debug-use $r1, debug-use $noreg, !"u"
+  ; CHECK: DBG_VALUE $r1, $noreg, !"u"
   ; CHECK-NOT: DBG_VALUE killed $r1, $noreg, !"u"
 
   declare arm_aapcscc void @g(%struct.s*, i8*, i32) #1
@@ -131,23 +131,23 @@ body:             |
   bb.0.entry:
     liveins: $r0, $r1, $r2, $r3, $lr, $r7
 
-    DBG_VALUE debug-use $r0, debug-use $noreg, !18, !27, debug-location !28
-    DBG_VALUE debug-use $r1, debug-use $noreg, !19, !27, debug-location !28
-    DBG_VALUE debug-use $r2, debug-use $noreg, !20, !27, debug-location !28
-    DBG_VALUE debug-use $r3, debug-use $noreg, !21, !27, debug-location !28
+    DBG_VALUE $r0, $noreg, !18, !27, debug-location !28
+    DBG_VALUE $r1, $noreg, !19, !27, debug-location !28
+    DBG_VALUE $r2, $noreg, !20, !27, debug-location !28
+    DBG_VALUE $r3, $noreg, !21, !27, debug-location !28
     t2CMPri $r3, 4, 14, $noreg, implicit-def $cpsr, debug-location !31
-    DBG_VALUE debug-use $r1, debug-use $noreg, !19, !27, debug-location !28
+    DBG_VALUE $r1, $noreg, !19, !27, debug-location !28
     $r0 = t2MOVi -1, 3, $cpsr, $noreg, implicit undef $r0
-    DBG_VALUE debug-use $r1, debug-use $noreg, !19, !27, debug-location !28
+    DBG_VALUE $r1, $noreg, !19, !27, debug-location !28
     tBX_RET 3, $cpsr, implicit $r0, debug-location !34
     $sp = frame-setup t2STMDB_UPD $sp, 14, $noreg, killed $r7, killed $lr
     frame-setup CFI_INSTRUCTION def_cfa_offset 8
     frame-setup CFI_INSTRUCTION offset $lr, -4
     frame-setup CFI_INSTRUCTION offset $r7, -8
-    DBG_VALUE debug-use $r0, debug-use $noreg, !18, !27, debug-location !28
-    DBG_VALUE debug-use $r1, debug-use $noreg, !19, !27, debug-location !28
-    DBG_VALUE debug-use $r2, debug-use $noreg, !20, !27, debug-location !28
-    DBG_VALUE debug-use $r3, debug-use $noreg, !21, !27, debug-location !28
+    DBG_VALUE $r0, $noreg, !18, !27, debug-location !28
+    DBG_VALUE $r1, $noreg, !19, !27, debug-location !28
+    DBG_VALUE $r2, $noreg, !20, !27, debug-location !28
+    DBG_VALUE $r3, $noreg, !21, !27, debug-location !28
     $r1 = tMOVr killed $r2, 14, $noreg, debug-location !32
     $r2 = tMOVr killed $r3, 14, $noreg, debug-location !32
     tBL 14, $noreg, @g, csr_aapcs, implicit-def dead $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit-def $sp, debug-location !32
diff --git a/test/CodeGen/Hexagon/early-if-debug.mir b/test/CodeGen/Hexagon/early-if-debug.mir
index 27e6124d352..b76f41019a0 100644
--- a/test/CodeGen/Hexagon/early-if-debug.mir
+++ b/test/CodeGen/Hexagon/early-if-debug.mir
@@ -6,11 +6,11 @@
 # CHECK: %0:intregs = COPY $r0
 # CHECK: %1:predregs = C2_cmpeqi %0, 0
 # CHECK: %2:intregs = A2_tfrsi 123
-# CHECK: DBG_VALUE debug-use %0, debug-use $noreg
-# CHECK: DBG_VALUE debug-use %0, debug-use $noreg
-# CHECK: DBG_VALUE debug-use %0, debug-use $noreg
-# CHECK: DBG_VALUE debug-use %0, debug-use $noreg
-# CHECK: DBG_VALUE debug-use %0, debug-use $noreg
+# CHECK: DBG_VALUE %0, $noreg
+# CHECK: DBG_VALUE %0, $noreg
+# CHECK: DBG_VALUE %0, $noreg
+# CHECK: DBG_VALUE %0, $noreg
+# CHECK: DBG_VALUE %0, $noreg
 # CHECK: %3:intregs = A2_tfrsi 321
 # CHECK: %5:intregs = C2_mux %1, %2, %3
 
@@ -40,11 +40,11 @@ body:             |
     J2_jump %bb.1, implicit-def dead $pc
 
   bb.1:
-    DBG_VALUE debug-use %0, debug-use $noreg, !1, !1
-    DBG_VALUE debug-use %0, debug-use $noreg, !1, !1
-    DBG_VALUE debug-use %0, debug-use $noreg, !1, !1
-    DBG_VALUE debug-use %0, debug-use $noreg, !1, !1
-    DBG_VALUE debug-use %0, debug-use $noreg, !1, !1
+    DBG_VALUE %0, $noreg, !1, !1
+    DBG_VALUE %0, $noreg, !1, !1
+    DBG_VALUE %0, $noreg, !1, !1
+    DBG_VALUE %0, $noreg, !1, !1
+    DBG_VALUE %0, $noreg, !1, !1
     %3 = A2_tfrsi 321
 
   bb.2:
diff --git a/test/CodeGen/MIR/X86/diexpr-win32.mir b/test/CodeGen/MIR/X86/diexpr-win32.mir
index 3388ef714d4..384c6bf57c9 100644
--- a/test/CodeGen/MIR/X86/diexpr-win32.mir
+++ b/test/CodeGen/MIR/X86/diexpr-win32.mir
@@ -193,8 +193,8 @@ body:             |
     CFI_INSTRUCTION def_cfa_offset 8
     CFI_INSTRUCTION offset $esi, -8
     $esi = MOV32rm $esp, 1, _, 8, _ :: (load 4 from %fixed-stack.2)
-    DBG_VALUE debug-use $esp, 0, !26, !10, debug-location !25
-    DBG_VALUE debug-use $esp, 0, !23, !DIExpression(DW_OP_plus_uconst, 8, DW_OP_deref), debug-location !25
+    DBG_VALUE $esp, 0, !26, !10, debug-location !25
+    DBG_VALUE $esp, 0, !23, !DIExpression(DW_OP_plus_uconst, 8, DW_OP_deref), debug-location !25
     CALLpcrel32 @getString, csr_32, implicit $esp, implicit-def $esp, implicit-def $eax, debug-location !29
     $ecx = MOV32rm $eax, 1, _, 0, _, debug-location !29 :: (dereferenceable load 4 from %ir.1)
     $edx = MOV32rm $eax, 1, _, 4, _, debug-location !29 :: (dereferenceable load 4 from %ir.1 + 4)
@@ -245,7 +245,7 @@ body:             |
   bb.0.entry:
     $eax = MOV32rm $esp, 1, _, 4, _ :: (load 4 from %fixed-stack.1)
     $eax = MOV32rm killed $eax, 1, _, 0, _, debug-location !34 :: (load 4 from %ir.0)
-    DBG_VALUE debug-use $eax, 0, !35, !DIExpression(DW_OP_constu, 4, DW_OP_minus), debug-location !34
+    DBG_VALUE $eax, 0, !35, !DIExpression(DW_OP_constu, 4, DW_OP_minus), debug-location !34
     $eax = ADD32rm killed $eax, $esp, 1, _, 8, _, implicit-def dead $eflags, debug-location !36 :: (load 4 from %fixed-stack.0)
     RET 0, $eax, debug-location !36
 
diff --git a/test/CodeGen/MIR/X86/instructions-debug-location.mir b/test/CodeGen/MIR/X86/instructions-debug-location.mir
index ec819628f44..8b6c5cbf526 100644
--- a/test/CodeGen/MIR/X86/instructions-debug-location.mir
+++ b/test/CodeGen/MIR/X86/instructions-debug-location.mir
@@ -59,10 +59,14 @@ stack:
 body: |
   bb.0.entry:
     liveins: $edi
-    ; CHECK: DBG_VALUE debug-use $noreg, 0, !11, !DIExpression(), debug-location !12
+    ; CHECK: DBG_VALUE $noreg, 0, !11, !DIExpression(), debug-location !12
+    ; CHECK: DBG_VALUE $noreg, 0, !11, !DIExpression(), debug-location !12
     ; CHECK: $eax = COPY %0, debug-location !13
     ; CHECK: RETQ $eax, debug-location !13
     %0 = COPY $edi
+    DBG_VALUE _, 0, !12, !DIExpression(), debug-location !13
+    ; Test whether debug-use is still recognized for compatibility with old
+    ; files.
     DBG_VALUE debug-use _, 0, !12, !DIExpression(), debug-location !13
     MOV32mr %stack.0.x.addr, 1, _, 0, _, %0
     $eax = COPY %0, debug-location !14
diff --git a/test/CodeGen/MIR/X86/pr38773.mir b/test/CodeGen/MIR/X86/pr38773.mir
index 0cf0bb25b9e..19b0debf297 100644
--- a/test/CodeGen/MIR/X86/pr38773.mir
+++ b/test/CodeGen/MIR/X86/pr38773.mir
@@ -97,8 +97,8 @@ body:             |
     IDIV32r killed renamable $ecx, implicit-def $eax, implicit-def dead $edx, implicit-def dead $eflags, implicit $eax, implicit killed $edx
     renamable $ecx = COPY $eax
     ; CHECK:        IDIV32r killed renamable $ecx
-    ; CHECK-NEXT:   DBG_VALUE debug-use $eax, debug-use $noreg, !12, !DIExpression(), debug-location !13
-    DBG_VALUE debug-use $ecx, debug-use $noreg, !12, !DIExpression(), debug-location !13
+    ; CHECK-NEXT:   DBG_VALUE $eax, $noreg, !12, !DIExpression(), debug-location !13
+    DBG_VALUE $ecx, $noreg, !12, !DIExpression(), debug-location !13
     $eax = COPY killed renamable $ecx
     RET 0, $eax
 
diff --git a/test/CodeGen/PowerPC/debuginfo-split-int.ll b/test/CodeGen/PowerPC/debuginfo-split-int.ll
index 5a1e409441b..e12d5e5d220 100644
--- a/test/CodeGen/PowerPC/debuginfo-split-int.ll
+++ b/test/CodeGen/PowerPC/debuginfo-split-int.ll
@@ -27,9 +27,9 @@ target triple = "ppc32"
 ;
 ; High 32 bits in R3, low 32 bits in R4
 ; CHECK: %0:gprc = COPY $r3
-; CHECK: DBG_VALUE debug-use %0, debug-use $noreg, [[DL]], !DIExpression(DW_OP_LLVM_fragment, 0, 32)
+; CHECK: DBG_VALUE %0, $noreg, [[DL]], !DIExpression(DW_OP_LLVM_fragment, 0, 32)
 ; CHECK: %1:gprc = COPY $r4
-; CHECK: DBG_VALUE debug-use %1, debug-use $noreg, [[DL]], !DIExpression(DW_OP_LLVM_fragment, 32, 32)
+; CHECK: DBG_VALUE %1, $noreg, [[DL]], !DIExpression(DW_OP_LLVM_fragment, 32, 32)
 define void @bar() local_unnamed_addr #0 !dbg !6 {
   %1 = alloca i64, align 8
   %2 = tail call i64 @foo()
diff --git a/test/CodeGen/PowerPC/debuginfo-stackarg.ll b/test/CodeGen/PowerPC/debuginfo-stackarg.ll
index 3830589b4cb..b49f363ed80 100644
--- a/test/CodeGen/PowerPC/debuginfo-stackarg.ll
+++ b/test/CodeGen/PowerPC/debuginfo-stackarg.ll
@@ -34,7 +34,7 @@ define i64 @foo(i64 %bar1, i64 %bar2, i64 %bar3, i64 %bar4, i64 %bar5) local_unn
 ; We expect to find a DBG_VALUE refering to the metadata id for bar5, using the lowest
 ; of the two fixed stack offsets found earlier.
 ; CHECK-LABEL: body:
-; CHECK: DBG_VALUE debug-use $r1, 0, !17, !DIExpression(DW_OP_plus_uconst, 8)
+; CHECK: DBG_VALUE $r1, 0, !17, !DIExpression(DW_OP_plus_uconst, 8)
 entry:
   tail call void @llvm.dbg.value(metadata i64 %bar1, metadata !13, metadata !DIExpression()), !dbg !18
   tail call void @llvm.dbg.value(metadata i64 %bar2, metadata !14, metadata !DIExpression()), !dbg !19
diff --git a/test/CodeGen/X86/coalesce-dbg-value-subreg-rewrite.mir b/test/CodeGen/X86/coalesce-dbg-value-subreg-rewrite.mir
index 3c339bb013d..8fa3d82a966 100644
--- a/test/CodeGen/X86/coalesce-dbg-value-subreg-rewrite.mir
+++ b/test/CodeGen/X86/coalesce-dbg-value-subreg-rewrite.mir
@@ -36,7 +36,7 @@ tracksRegLiveness: true
 body:             |
   bb.0.entry:
     %0:gr16_abcd = MOV16ri 0
-    DBG_VALUE debug-use %0.sub_8bit:gr16_abcd, debug-use $noreg, !11, !DIExpression(), debug-location !13
+    DBG_VALUE %0.sub_8bit:gr16_abcd, $noreg, !11, !DIExpression(), debug-location !13
     undef %6.sub_8bit:gr16_abcd = COPY killed %0.sub_8bit
     dead $dx = COPY killed %6
 
@@ -48,4 +48,4 @@ body:             |
 #
 # CHECK:      bb.0.entry:
 # CHECK-NEXT:    $dx = MOV16ri 0
-# CHECK-NEXT:    DBG_VALUE debug-use $dl,
+# CHECK-NEXT:    DBG_VALUE $dl,
diff --git a/test/CodeGen/X86/dbg-changes-codegen-branch-folding2.mir b/test/CodeGen/X86/dbg-changes-codegen-branch-folding2.mir
index 1a9221ae9e9..def14391a51 100644
--- a/test/CodeGen/X86/dbg-changes-codegen-branch-folding2.mir
+++ b/test/CodeGen/X86/dbg-changes-codegen-branch-folding2.mir
@@ -207,7 +207,7 @@ body:             |
     liveins: $rdi
   
     dead renamable $al = MOV8rm $rsp, 1, $noreg, -121, $noreg 
-    DBG_VALUE debug-use $al, debug-use $noreg, !16, !DIExpression(), debug-location !19
+    DBG_VALUE $al, $noreg, !16, !DIExpression(), debug-location !19
     renamable $ecx = XOR32rr undef $ecx, undef $ecx, implicit-def dead $eflags
     JMP_1 %bb.1
 ...
diff --git a/test/CodeGen/X86/dbg-value-superreg-copy.mir b/test/CodeGen/X86/dbg-value-superreg-copy.mir
index cd316dea88b..0a8af06b9ab 100644
--- a/test/CodeGen/X86/dbg-value-superreg-copy.mir
+++ b/test/CodeGen/X86/dbg-value-superreg-copy.mir
@@ -37,7 +37,7 @@ body:             |
     %0:gr16_abcd = MOV16ri 1
 
   bb.1:
-    DBG_VALUE debug-use %0.sub_8bit_hi, debug-use $noreg, !7, !DIExpression(), debug-location !9
+    DBG_VALUE %0.sub_8bit_hi, $noreg, !7, !DIExpression(), debug-location !9
     %1:gr16 = COPY %0
     %2:gr16 = COPY %0
 
diff --git a/test/CodeGen/X86/lea-opt-with-debug.mir b/test/CodeGen/X86/lea-opt-with-debug.mir
index 34525d73ea7..a1cf2041db6 100644
--- a/test/CodeGen/X86/lea-opt-with-debug.mir
+++ b/test/CodeGen/X86/lea-opt-with-debug.mir
@@ -98,7 +98,7 @@ body:             |
     ; CHECK: %3:gr64_nosp = LEA64r %2, 2, %2, 0, $noreg, debug-location !13
     ; CHECK-NEXT: %4:gr64 = LEA64r %1, 4, %3, 0, $noreg, debug-location !13
     ; CHECK-NOT: %0:gr64 = LEA64r %1, 4, %3, 8, $noreg, debug-location !14
-    ; CHECK: DBG_VALUE debug-use %4, debug-use $noreg, !11, !DIExpression(DW_OP_plus_uconst, 8, DW_OP_stack_value), debug-location !15
+    ; CHECK: DBG_VALUE %4, $noreg, !11, !DIExpression(DW_OP_plus_uconst, 8, DW_OP_stack_value), debug-location !15
 
     %1 = MOV64rm $rip, 1, $noreg, @c, $noreg, debug-location !13 :: (dereferenceable load 8 from @c)
     %2 = MOVSX64rm32 $rip, 1, $noreg, @a, $noreg, debug-location !13 :: (dereferenceable load 4 from @a)
@@ -107,7 +107,7 @@ body:             |
     %5 = COPY %4.sub_32bit, debug-location !13
     MOV32mr $rip, 1, $noreg, @d, $noreg, killed %5, debug-location !13 :: (store 4 into @d)
     %0 = LEA64r %1, 4, %3, 8, $noreg, debug-location !14
-    DBG_VALUE debug-use %0, debug-use $noreg, !11, !DIExpression(), debug-location !15
+    DBG_VALUE %0, $noreg, !11, !DIExpression(), debug-location !15
 
     ; CHECK-LABEL: bb.1 (%ir-block.8):
     ; CHECK: %6:gr32 = MOV32rm %4, 1, $noreg, 8, $noreg, debug-location !17 :: (load 4 from %ir.7)
diff --git a/test/CodeGen/X86/machine-cp-debug.mir b/test/CodeGen/X86/machine-cp-debug.mir
index a7fcd9801e7..a3230e8910c 100644
--- a/test/CodeGen/X86/machine-cp-debug.mir
+++ b/test/CodeGen/X86/machine-cp-debug.mir
@@ -19,5 +19,5 @@ body: |
   bb.0:
     liveins: $eax
     $ebx = COPY $eax
-    DBG_VALUE debug-use $ebx, debug-use _, !1, !1
+    DBG_VALUE $ebx, _, !1, !1
 ...
diff --git a/test/CodeGen/X86/opt_phis.mir b/test/CodeGen/X86/opt_phis.mir
index e282a92e201..f00ee76385d 100644
--- a/test/CodeGen/X86/opt_phis.mir
+++ b/test/CodeGen/X86/opt_phis.mir
@@ -27,7 +27,7 @@ body:             |
 
   bb.1:
     %1:gr32 = PHI %0, %bb.0, %2, %bb.1
-    DBG_VALUE debug-use %1, debug-use _, !7, !DIExpression(), debug-location !6
+    DBG_VALUE %1, _, !7, !DIExpression(), debug-location !6
     %2:gr32 = IMPLICIT_DEF
     JMP_1 %bb.1
 ...
diff --git a/test/CodeGen/X86/post-ra-sched-with-debug.mir b/test/CodeGen/X86/post-ra-sched-with-debug.mir
index f4f69b60279..079374752b1 100644
--- a/test/CodeGen/X86/post-ra-sched-with-debug.mir
+++ b/test/CodeGen/X86/post-ra-sched-with-debug.mir
@@ -251,8 +251,8 @@ body:             |
     liveins: $esi, $rdi, $r14, $rbx, $rbp
 
     ; CHECK:      [[REGISTER:\$r[a-z0-9]+]] = LEA64r {{\$r[a-z0-9]+}}, 1, $noreg, -20, $noreg
-    ; CHECK-NEXT: DBG_VALUE debug-use [[REGISTER]], debug-use $noreg, ![[J_VAR]], !DIExpression(), debug-location ![[J_LOC]]
-    ; CHECK-NEXT: DBG_VALUE debug-use [[REGISTER]], debug-use $noreg, ![[I_VAR]], !DIExpression(), debug-location ![[I_LOC]]
+    ; CHECK-NEXT: DBG_VALUE [[REGISTER]], $noreg, ![[J_VAR]], !DIExpression(), debug-location ![[J_LOC]]
+    ; CHECK-NEXT: DBG_VALUE [[REGISTER]], $noreg, ![[I_VAR]], !DIExpression(), debug-location ![[I_LOC]]
 
     frame-setup PUSH64r killed $rbp, implicit-def $rsp, implicit $rsp
     CFI_INSTRUCTION def_cfa_offset 16
@@ -268,8 +268,8 @@ body:             |
     $rbx = MOV64rr $rdi
     CALL64pcrel32 @_ZN1lC2Ei, csr_64, implicit $rsp, implicit $rdi, implicit $esi, implicit-def $rsp
     $rdi = LEA64r $rbx, 1, $noreg, 8, $noreg
-    DBG_VALUE debug-use $rdi, debug-use $noreg, !20, !17, debug-location !27
-    DBG_VALUE debug-use $rdi, debug-use $noreg, !10, !17, debug-location !18
+    DBG_VALUE $rdi, $noreg, !20, !17, debug-location !27
+    DBG_VALUE $rdi, $noreg, !10, !17, debug-location !18
     $rax = MOV64rm $rbx, 1, $noreg, 16, $noreg :: (load 8)
     MOV64mr $rbx, 1, $noreg, 8, $noreg, killed $rax :: (store 8)
     MOV64mr $rbx, 1, $noreg, 24, $noreg, $rdi :: (store 8)
@@ -286,9 +286,9 @@ body:             |
     $rsi = CMOVNE64rr killed $rsi, $rdx, implicit killed $eflags
     $rsi = OR64rr killed $rsi, killed $rcx, implicit-def $eflags
     $rcx = LEA64r $rbp, 1, $noreg, -20, $noreg
-    DBG_VALUE debug-use $rcx, debug-use $noreg, !46, !17, debug-location !48
-    DBG_VALUE debug-use $rcx, debug-use $noreg, !39, !17, debug-location !44
-    DBG_VALUE debug-use $rbp, -20, !29, !17, debug-location !36
+    DBG_VALUE $rcx, $noreg, !46, !17, debug-location !48
+    DBG_VALUE $rcx, $noreg, !39, !17, debug-location !44
+    DBG_VALUE $rbp, -20, !29, !17, debug-location !36
     $rcx = CMOVNE64rr killed $rcx, killed $rdx, implicit killed $eflags
     $rcx = OR64rr killed $rcx, killed $rsi, implicit-def dead $eflags
     $rdx = MOVSX64rm32 $rbx, 1, $noreg, 0, $noreg :: (load 4, align 8)
diff --git a/test/CodeGen/X86/postra-ignore-dbg-instrs.mir b/test/CodeGen/X86/postra-ignore-dbg-instrs.mir
index 0286e6e68bf..25e6992b7b7 100644
--- a/test/CodeGen/X86/postra-ignore-dbg-instrs.mir
+++ b/test/CodeGen/X86/postra-ignore-dbg-instrs.mir
@@ -62,7 +62,7 @@
 # CHECK-NOT: $eax = COPY $edi
 # CHECK: bb.1:
 # CHECK: renamable $eax = COPY $edi
-# CHECK-NEXT: DBG_VALUE debug-use $eax,
+# CHECK-NEXT: DBG_VALUE $eax,
 # CHECK: bb.2:
 name:            x1
 alignment:       4
@@ -71,9 +71,9 @@ body: |
   bb.0:
     successors: %bb.2, %bb.1; %bb.2, %bb.1
     liveins: $edi
-    DBG_VALUE debug-use $edi, debug-use $noreg, !14, !DIExpression(), debug-location !16
+    DBG_VALUE $edi, $noreg, !14, !DIExpression(), debug-location !16
     renamable $eax = COPY $edi
-    DBG_VALUE debug-use $eax, debug-use $noreg, !14, !DIExpression(), debug-location !16
+    DBG_VALUE $eax, $noreg, !14, !DIExpression(), debug-location !16
     CMP32mi8 $rip, 1, $noreg, @x0, $noreg, 0, implicit-def $eflags, debug-location !16
     JE_1 %bb.2, implicit killed $eflags, debug-location !16
     JMP_1 %bb.1, debug-location !16
diff --git a/test/CodeGen/X86/shrink_wrap_dbg_value.mir b/test/CodeGen/X86/shrink_wrap_dbg_value.mir
index 429ea72db8e..6943033c565 100644
--- a/test/CodeGen/X86/shrink_wrap_dbg_value.mir
+++ b/test/CodeGen/X86/shrink_wrap_dbg_value.mir
@@ -136,8 +136,8 @@ body:             |
     successors: %bb.4(0x40000000), %bb.1(0x40000000)
     liveins: $ecx, $edx
   
-    DBG_VALUE debug-use $edx, debug-use $noreg, !15, !DIExpression(), debug-location !25
-    DBG_VALUE debug-use $ecx, debug-use $noreg, !16, !DIExpression(), debug-location !26
+    DBG_VALUE $edx, $noreg, !15, !DIExpression(), debug-location !25
+    DBG_VALUE $ecx, $noreg, !16, !DIExpression(), debug-location !26
     $eax = COPY $ecx
     DBG_VALUE %fixed-stack.0, 0, !16, !DIExpression(), debug-location !26
     DBG_VALUE %fixed-stack.1, 0, !15, !DIExpression(), debug-location !25
@@ -149,9 +149,9 @@ body:             |
     successors: %bb.2(0x80000000)
   
     $esi = MOV32rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load 4 from %fixed-stack.0)
-    DBG_VALUE debug-use $esi, debug-use $noreg, !13, !DIExpression(), debug-location !19
+    DBG_VALUE $esi, $noreg, !13, !DIExpression(), debug-location !19
     $edi = MOV32rm %fixed-stack.1, 1, $noreg, 0, $noreg :: (load 4 from %fixed-stack.1)
-    DBG_VALUE debug-use $edi, debug-use $noreg, !14, !DIExpression(), debug-location !20
+    DBG_VALUE $edi, $noreg, !14, !DIExpression(), debug-location !20
     $edi = DEC32r killed $edi, implicit-def dead $eflags, debug-location !30
     $ebx = LEA32r %fixed-stack.1, 1, $noreg, 0, $noreg
   
diff --git a/test/DebugInfo/AArch64/compiler-gen-bbs-livedebugvalues.ll b/test/DebugInfo/AArch64/compiler-gen-bbs-livedebugvalues.ll
index 0606ddf6087..5c2fe8447a6 100644
--- a/test/DebugInfo/AArch64/compiler-gen-bbs-livedebugvalues.ll
+++ b/test/DebugInfo/AArch64/compiler-gen-bbs-livedebugvalues.ll
@@ -12,28 +12,28 @@ entry:
 ; CHECK-LABEL: bb.0.entry:
   %var = add i32 %x, 1, !dbg !12
   call void @llvm.dbg.value(metadata i32 %var, metadata !9, metadata !DIExpression()), !dbg !12
-; CHECK: DBG_VALUE debug-use renamable $w0, debug-use $noreg, !9, !DIExpression(), debug-location !12
+; CHECK: DBG_VALUE renamable $w0, $noreg, !9, !DIExpression(), debug-location !12
 ; CHECK-NEXT: STRWui killed $w0, $sp, 3 :: (store 4 into %stack.0)
-; CHECK-NEXT: DBG_VALUE debug-use $sp, 0, !9, !DIExpression(DW_OP_plus_uconst, 12)
+; CHECK-NEXT: DBG_VALUE $sp, 0, !9, !DIExpression(DW_OP_plus_uconst, 12)
 
   br label %artificial-bb-1, !dbg !13
 
 artificial-bb-1:                                  ; preds = %entry
 ; CHECK-LABEL: bb.1.artificial-bb-1:
-; CHECK: DBG_VALUE debug-use $sp, 0, !9, !DIExpression(DW_OP_plus_uconst, 12)
+; CHECK: DBG_VALUE $sp, 0, !9, !DIExpression(DW_OP_plus_uconst, 12)
 
   br label %artificial-bb-2
 
 artificial-bb-2:                                  ; preds = %artificial-bb-1
 ; CHECK-LABEL: bb.2.artificial-bb-2:
-; CHECK: DBG_VALUE debug-use $sp, 0, !9, !DIExpression(DW_OP_plus_uconst, 12)
+; CHECK: DBG_VALUE $sp, 0, !9, !DIExpression(DW_OP_plus_uconst, 12)
 
   %invisible = add i32 %var, 1
   br label %return, !dbg !14
 
 return:                                           ; preds = %artificial-bb-2
 ; CHECK-LABEL: bb.3.return:
-; CHECK: DBG_VALUE debug-use $sp, 0, !9, !DIExpression(DW_OP_plus_uconst, 12)
+; CHECK: DBG_VALUE $sp, 0, !9, !DIExpression(DW_OP_plus_uconst, 12)
 
   call void @use(i32 %var)
   ret void, !dbg !15
diff --git a/test/DebugInfo/ARM/sdag-split-arg1.ll b/test/DebugInfo/ARM/sdag-split-arg1.ll
index 90834a44ba8..78cdc4dd4bd 100644
--- a/test/DebugInfo/ARM/sdag-split-arg1.ll
+++ b/test/DebugInfo/ARM/sdag-split-arg1.ll
@@ -7,7 +7,7 @@ entry:
   %0 = bitcast double %a to i64
   %extract.t84 = trunc i64 %0 to i32
   tail call void @llvm.dbg.value(metadata i32 %extract.t84, metadata !8, metadata !DIExpression(DW_OP_LLVM_fragment, 0, 32)), !dbg !12
-  ; CHECK: DBG_VALUE debug-use $r0, debug-use $noreg, !6, !DIExpression(DW_OP_LLVM_fragment, 0, 32)
+  ; CHECK: DBG_VALUE $r0, $noreg, !6, !DIExpression(DW_OP_LLVM_fragment, 0, 32)
   %r.sroa.0.0.insert.ext35 = zext i32 %extract.t84 to i64
   ret i64 %r.sroa.0.0.insert.ext35
 }
diff --git a/test/DebugInfo/MIR/AArch64/clobber-sp.mir b/test/DebugInfo/MIR/AArch64/clobber-sp.mir
index 222bbd798ba..4594065cc29 100644
--- a/test/DebugInfo/MIR/AArch64/clobber-sp.mir
+++ b/test/DebugInfo/MIR/AArch64/clobber-sp.mir
@@ -145,11 +145,11 @@ body:             |
     $sp = frame-setup SUBXri $sp, 32, 0
     frame-setup STPXi killed $fp, killed $lr, $sp, 2 :: (store 8 into %stack.3), (store 8 into %stack.2)
     $fp = frame-setup ADDXri $sp, 16, 0
-    DBG_VALUE debug-use $w0, debug-use _, !19, !22, debug-location !23
+    DBG_VALUE $w0, _, !19, !22, debug-location !23
     STURWi killed $w0, $fp, -4 :: (store 4 into %stack.0.x.addr)
-    DBG_VALUE debug-use $w1, debug-use _, !20, !22, debug-location !28
+    DBG_VALUE $w1, _, !20, !22, debug-location !28
     STRWui killed $w1, $sp, 2, debug-location !30 :: (store 4 into %stack.1)
-    DBG_VALUE debug-use $sp, 0, !20, !36, debug-location !28
+    DBG_VALUE $sp, 0, !20, !36, debug-location !28
     BL @g, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $d0, implicit killed $d1, implicit killed $d2, implicit killed $d3, implicit-def $sp, debug-location !30
     $w0 = LDRWui $sp, 2, debug-location !33 :: (load 4 from %stack.1)
     CBZW killed $w0, %bb.2.if.end, debug-location !33
@@ -157,13 +157,13 @@ body:             |
   bb.1.if.then:
     successors: %bb.2.if.end(0x80000000)
 
-    DBG_VALUE debug-use $sp, 0, !20, !36, debug-location !28
+    DBG_VALUE $sp, 0, !20, !36, debug-location !28
     $x0 = SUBXri $fp, 4, 0
-    DBG_VALUE debug-use $x0, debug-use _, !19, !22, debug-location !23
+    DBG_VALUE $x0, _, !19, !22, debug-location !23
     BL @h, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $x0, debug-location !34
 
   bb.2.if.end:
-    DBG_VALUE debug-use $sp, 0, !20, !36, debug-location !28
+    DBG_VALUE $sp, 0, !20, !36, debug-location !28
     $w8 = MOVZWi 0, 0
     $x0 = ORRXrs $xzr, undef $x8, 0, implicit killed $w8, debug-location !35
     $fp, $lr = LDPXi $sp, 2, debug-location !35 :: (load 8 from %stack.3), (load 8 from %stack.2)
diff --git a/test/DebugInfo/MIR/ARM/live-debug-values-reg-copy.mir b/test/DebugInfo/MIR/ARM/live-debug-values-reg-copy.mir
index e14c0a47051..e29420e27d5 100644
--- a/test/DebugInfo/MIR/ARM/live-debug-values-reg-copy.mir
+++ b/test/DebugInfo/MIR/ARM/live-debug-values-reg-copy.mir
@@ -5,9 +5,9 @@
 # to another. The altered instructions are labeled below.
 #
 # CHECK: ![[ARG1:.*]] = !DILocalVariable(name: "arg1"
-# CHECK: DBG_VALUE debug-use $r4, debug-use $noreg, ![[ARG1]], !DIExpression(), debug-location
+# CHECK: DBG_VALUE $r4, $noreg, ![[ARG1]], !DIExpression(), debug-location
 # CHECK: $r5 = MOVr killed $r4, 14, $noreg, $noreg, debug-location
-# CHECK-NEXT: DBG_VALUE debug-use $r5, debug-use $noreg, ![[ARG1]], !DIExpression(), debug-location
+# CHECK-NEXT: DBG_VALUE $r5, $noreg, ![[ARG1]], !DIExpression(), debug-location
 --- |
   ; ModuleID = 'live-debug-values-reg-copy.ll'
   source_filename = "live-debug-values-reg-copy.c"
@@ -119,8 +119,8 @@ body:             |
     frame-setup CFI_INSTRUCTION offset $r11, -8
     frame-setup CFI_INSTRUCTION offset $r5, -12
     frame-setup CFI_INSTRUCTION offset $r4, -16
-    DBG_VALUE debug-use $r0, debug-use $noreg, !13, !DIExpression(), debug-location !16
-    DBG_VALUE debug-use $r0, debug-use $noreg, !13, !DIExpression(), debug-location !16
+    DBG_VALUE $r0, $noreg, !13, !DIExpression(), debug-location !16
+    DBG_VALUE $r0, $noreg, !13, !DIExpression(), debug-location !16
     CMPri renamable $r0, 10, 14, $noreg, implicit-def $cpsr, debug-location !16
     Bcc %bb.2, 13, killed $cpsr, debug-location !16
   
@@ -132,7 +132,7 @@ body:             |
   
   bb.2.if.else:
     renamable $r4 = ADDri killed renamable $r0, 10, 14, $noreg, $noreg, debug-location !16
-    DBG_VALUE debug-use $r4, debug-use $noreg, !13, !DIExpression(), debug-location !16
+    DBG_VALUE $r4, $noreg, !13, !DIExpression(), debug-location !16
     $r0 = MOVr $r4, 14, $noreg, $noreg, debug-location !16
     BL @externFunc2, csr_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $r0, implicit-def $sp, implicit-def $r0, debug-location !16
     $r5 = MOVr killed $r0, 14, $noreg, $noreg, debug-location !16
diff --git a/test/DebugInfo/MIR/ARM/split-superreg-complex.mir b/test/DebugInfo/MIR/ARM/split-superreg-complex.mir
index 89472ec1da0..868321bab2a 100644
--- a/test/DebugInfo/MIR/ARM/split-superreg-complex.mir
+++ b/test/DebugInfo/MIR/ARM/split-superreg-complex.mir
@@ -113,7 +113,7 @@ body:             |
     tBL 14, _, @v, csr_ios, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $r0, implicit-def $r1, implicit-def $r2, implicit-def $r3, debug-location !19
     $d1 = VMOVDRR killed $r2, killed $r3, 14, _, implicit-def $q0, debug-location !19
     $d0 = VMOVDRR killed $r0, killed $r1, 14, _, implicit killed $q0, implicit-def $q0, debug-location !19
-    DBG_VALUE debug-use $q0, debug-use _, !14, !20, debug-location !21
+    DBG_VALUE $q0, _, !14, !20, debug-location !21
     $s4 = VMOVS $s1, 14, _, implicit-def $d2, debug-location !24
     $d0 = VADDfd $d0, killed $d2, 14, _, implicit killed $q0, debug-location !24
     $r0 = VMOVRS $s0, 14, _, implicit killed $d0, debug-location !25
diff --git a/test/DebugInfo/MIR/ARM/split-superreg-piece.mir b/test/DebugInfo/MIR/ARM/split-superreg-piece.mir
index 945fc09d6e4..69b4f7a07a4 100644
--- a/test/DebugInfo/MIR/ARM/split-superreg-piece.mir
+++ b/test/DebugInfo/MIR/ARM/split-superreg-piece.mir
@@ -113,7 +113,7 @@ body:             |
     tBL 14, _, @v, csr_ios, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $r0, implicit-def $r1, implicit-def $r2, implicit-def $r3, debug-location !19
     $d1 = VMOVDRR killed $r2, killed $r3, 14, _, implicit-def $q0, debug-location !19
     $d0 = VMOVDRR killed $r0, killed $r1, 14, _, implicit killed $q0, implicit-def $q0, debug-location !19
-    DBG_VALUE debug-use $q0, debug-use _, !14, !20, debug-location !21
+    DBG_VALUE $q0, _, !14, !20, debug-location !21
     $s4 = VMOVS $s1, 14, _, implicit-def $d2, debug-location !24
     $d0 = VADDfd $d0, killed $d2, 14, _, implicit killed $q0, debug-location !24
     $r0 = VMOVRS $s0, 14, _, implicit killed $d0, debug-location !25
diff --git a/test/DebugInfo/MIR/ARM/split-superreg.mir b/test/DebugInfo/MIR/ARM/split-superreg.mir
index a87c33485bc..39b8b4341fa 100644
--- a/test/DebugInfo/MIR/ARM/split-superreg.mir
+++ b/test/DebugInfo/MIR/ARM/split-superreg.mir
@@ -113,7 +113,7 @@ body:             |
     tBL 14, _, @v, csr_ios, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $r0, implicit-def $r1, implicit-def $r2, implicit-def $r3, debug-location !19
     $d1 = VMOVDRR killed $r2, killed $r3, 14, _, implicit-def $q0, debug-location !19
     $d0 = VMOVDRR killed $r0, killed $r1, 14, _, implicit killed $q0, implicit-def $q0, debug-location !19
-    DBG_VALUE debug-use $q0, debug-use _, !14, !20, debug-location !21
+    DBG_VALUE $q0, _, !14, !20, debug-location !21
     $s4 = VMOVS $s1, 14, _, implicit-def $d2, debug-location !24
     $d0 = VADDfd $d0, killed $d2, 14, _, implicit killed $q0, debug-location !24
     $r0 = VMOVRS $s0, 14, _, implicit killed $d0, debug-location !25
diff --git a/test/DebugInfo/MIR/Mips/last-inst-bundled.mir b/test/DebugInfo/MIR/Mips/last-inst-bundled.mir
index e228c8876a5..b1239697b7b 100644
--- a/test/DebugInfo/MIR/Mips/last-inst-bundled.mir
+++ b/test/DebugInfo/MIR/Mips/last-inst-bundled.mir
@@ -21,7 +21,7 @@
 #
 # Check that last bundled instruction of block gets recognized as end of basic block.
 # CHECK: bb.2.if.end
-# CHECK-NEXT: DBG_VALUE debug-use $s0, debug-use $noreg, !12, !DIExpression(), debug-location !17
+# CHECK-NEXT: DBG_VALUE $s0, $noreg, !12, !DIExpression(), debug-location !17
 
 --- |
   ; ModuleID = '<stdin>'
@@ -161,15 +161,15 @@ body:             |
     SW killed $s0, $sp, 24 :: (store 4 into %stack.2)
     CFI_INSTRUCTION offset $ra_64, -4
     CFI_INSTRUCTION offset $s0_64, -8
-    DBG_VALUE debug-use $a0, debug-use $noreg, !12, !DIExpression(), debug-location !17
+    DBG_VALUE $a0, $noreg, !12, !DIExpression(), debug-location !17
     $s0 = OR $a0, $zero
-    DBG_VALUE debug-use $s0, debug-use $noreg, !12, !DIExpression(), debug-location !17
-    DBG_VALUE debug-use $sp, 0, !13, !DIExpression(DW_OP_plus_uconst, 20), debug-location !19
+    DBG_VALUE $s0, $noreg, !12, !DIExpression(), debug-location !17
+    DBG_VALUE $sp, 0, !13, !DIExpression(DW_OP_plus_uconst, 20), debug-location !19
     JAL @set_cond, csr_o32, implicit-def dead $ra, implicit $a0, implicit $a1, implicit-def $sp, debug-location !20 {
       renamable $a1 = LEA_ADDiu $sp, 20
     }
     renamable $at = LW $sp, 20, debug-location !21 :: (dereferenceable load 4 from %ir.condition, !tbaa !23)
-    DBG_VALUE debug-use $at, debug-use $noreg, !13, !DIExpression(), debug-location !19
+    DBG_VALUE $at, $noreg, !13, !DIExpression(), debug-location !19
     BEQ killed renamable $at, $zero, %bb.2, implicit-def $at, debug-location !27 {
       NOP debug-location !27
     }
diff --git a/test/DebugInfo/MIR/Mips/live-debug-values-reg-copy.mir b/test/DebugInfo/MIR/Mips/live-debug-values-reg-copy.mir
index 70a85c075ec..dd009b8de45 100644
--- a/test/DebugInfo/MIR/Mips/live-debug-values-reg-copy.mir
+++ b/test/DebugInfo/MIR/Mips/live-debug-values-reg-copy.mir
@@ -6,12 +6,12 @@
 #
 # CHECK: ![[ARG1:.*]] = !DILocalVariable(name: "arg1"
 # CHECK: ![[ARG2:.*]] = !DILocalVariable(name: "arg2"
-# CHECK: DBG_VALUE debug-use $s0_64, debug-use $noreg, ![[ARG2]], !DIExpression(), debug-location
+# CHECK: DBG_VALUE $s0_64, $noreg, ![[ARG2]], !DIExpression(), debug-location
 # CHECK: $s1_64 = OR64 killed $s0_64, $zero_64, debug-location
-# CHECK-NEXT: DBG_VALUE debug-use $s1_64, debug-use $noreg, ![[ARG2]], !DIExpression(), debug-location
-# CHECK: DBG_VALUE debug-use $f24, debug-use $noreg, ![[ARG1]], !DIExpression(), debug-location
+# CHECK-NEXT: DBG_VALUE $s1_64, $noreg, ![[ARG2]], !DIExpression(), debug-location
+# CHECK: DBG_VALUE $f24, $noreg, ![[ARG1]], !DIExpression(), debug-location
 # CHECK: $f26 = FMOV_S killed $f24, debug-location
-# CHECK-NEXT: DBG_VALUE debug-use $f26, debug-use $noreg, ![[ARG1]], !DIExpression(), debug-location
+# CHECK-NEXT: DBG_VALUE $f26, $noreg, ![[ARG1]], !DIExpression(), debug-location
 
 --- |
   ; ModuleID = 'live-debug-values-reg-copy.ll'
@@ -161,11 +161,11 @@ body:             |
     CFI_INSTRUCTION offset $d24_64, -12
     CFI_INSTRUCTION offset $ra_64, -24
     CFI_INSTRUCTION offset $s0_64, -32
-    DBG_VALUE debug-use $f12, debug-use $noreg, !14, !DIExpression(), debug-location !19
-    DBG_VALUE debug-use $a1_64, debug-use $noreg, !15, !DIExpression(), debug-location !19
-    DBG_VALUE debug-use $s0, debug-use $noreg, !15, !DIExpression(), debug-location !19
-    DBG_VALUE debug-use $s0_64, debug-use $noreg, !15, !DIExpression(), debug-location !19
-    DBG_VALUE debug-use $f12, debug-use $noreg, !14, !DIExpression(), debug-location !19
+    DBG_VALUE $f12, $noreg, !14, !DIExpression(), debug-location !19
+    DBG_VALUE $a1_64, $noreg, !15, !DIExpression(), debug-location !19
+    DBG_VALUE $s0, $noreg, !15, !DIExpression(), debug-location !19
+    DBG_VALUE $s0_64, $noreg, !15, !DIExpression(), debug-location !19
+    DBG_VALUE $f12, $noreg, !14, !DIExpression(), debug-location !19
     renamable $d0_64 = CVT_D64_S renamable $f12, debug-location !19
     renamable $at_64 = LUi64 target-flags(mips-highest) %const.0
     renamable $at_64 = DADDiu killed renamable $at_64, target-flags(mips-higher) %const.0
@@ -211,7 +211,7 @@ body:             |
     renamable $at_64 = DSLL killed renamable $at_64, 16
     renamable $f0 = LWC1 killed renamable $at_64, target-flags(mips-abs-lo) %const.1, debug-location !19 :: (load 4 from constant-pool)
     renamable $f24 = FADD_S killed renamable $f12, killed renamable $f0, debug-location !19
-    DBG_VALUE debug-use $f24, debug-use $noreg, !14, !DIExpression(), debug-location !19
+    DBG_VALUE $f24, $noreg, !14, !DIExpression(), debug-location !19
     JAL @externFunc2, csr_n64, implicit-def dead $ra, implicit $f12, implicit-def $sp, implicit-def $f0, debug-location !19 {
       $f12 = FMOV_S $f24, debug-location !19
     }
diff --git a/test/DebugInfo/MIR/X86/bit-piece-dh.mir b/test/DebugInfo/MIR/X86/bit-piece-dh.mir
index 8c74f8395fe..e8100b71eff 100644
--- a/test/DebugInfo/MIR/X86/bit-piece-dh.mir
+++ b/test/DebugInfo/MIR/X86/bit-piece-dh.mir
@@ -88,7 +88,7 @@ body:             |
     CFI_INSTRUCTION offset $rbp, -16
     $rbp = frame-setup MOV64rr $rsp
     CFI_INSTRUCTION def_cfa_register $rbp
-    DBG_VALUE debug-use $dh, debug-use _, !14, !15, debug-location !16
+    DBG_VALUE $dh, _, !14, !15, debug-location !16
     $edi = SHR32ri killed $edi, 8, implicit-def dead $eflags, debug-location !17
     $eax = MOVSX32rr8 $dil, implicit killed $edi, debug-location !20
     $rbp = POP64r implicit-def $rsp, implicit $rsp, debug-location !20
diff --git a/test/DebugInfo/MIR/X86/kill-after-spill.mir b/test/DebugInfo/MIR/X86/kill-after-spill.mir
index e9c03938f2b..5110dc349be 100644
--- a/test/DebugInfo/MIR/X86/kill-after-spill.mir
+++ b/test/DebugInfo/MIR/X86/kill-after-spill.mir
@@ -14,8 +14,8 @@
 # ...
 #
 # CHECK: bb.1.if.end:
-# CHECK: DBG_VALUE debug-use $rbp, 0, !37, !DIExpression(DW_OP_constu, 44, DW_OP_minus), debug-location !58
-# CHECK-NOT: DBG_VALUE debug-use $rbp, 0, !36, !DIExpression(DW_OP_constu, 48, DW_OP_minus), debug-location !57
+# CHECK: DBG_VALUE $rbp, 0, !37, !DIExpression(DW_OP_constu, 44, DW_OP_minus), debug-location !58
+# CHECK-NOT: DBG_VALUE $rbp, 0, !36, !DIExpression(DW_OP_constu, 48, DW_OP_minus), debug-location !57
 
 --- |
   ; ModuleID = '<stdin>'
@@ -274,12 +274,12 @@ body:             |
     CFI_INSTRUCTION offset $r13, -40
     CFI_INSTRUCTION offset $r14, -32
     CFI_INSTRUCTION offset $r15, -24
-    DBG_VALUE debug-use $edi, debug-use $noreg, !36, !DIExpression(), debug-location !57
-    DBG_VALUE debug-use $esi, debug-use $noreg, !37, !DIExpression(), debug-location !58
+    DBG_VALUE $edi, $noreg, !36, !DIExpression(), debug-location !57
+    DBG_VALUE $esi, $noreg, !37, !DIExpression(), debug-location !58
     $ebx = MOV32rr $esi
-    DBG_VALUE debug-use $ebx, debug-use $noreg, !37, !DIExpression(), debug-location !58
+    DBG_VALUE $ebx, $noreg, !37, !DIExpression(), debug-location !58
     $r15d = MOV32rr $edi
-    DBG_VALUE debug-use $r15d, debug-use $noreg, !36, !DIExpression(), debug-location !57
+    DBG_VALUE $r15d, $noreg, !36, !DIExpression(), debug-location !57
     renamable $r14 = MOV64ri -9223372036854775808
     $edi = MOV32rr $ebx
     CALL64pcrel32 @func1, csr_64, implicit $rsp, implicit $ssp, implicit $edi, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
diff --git a/test/DebugInfo/MIR/X86/live-debug-values-3preds.mir b/test/DebugInfo/MIR/X86/live-debug-values-3preds.mir
index 1f62a0f8136..8bc340721be 100644
--- a/test/DebugInfo/MIR/X86/live-debug-values-3preds.mir
+++ b/test/DebugInfo/MIR/X86/live-debug-values-3preds.mir
@@ -31,9 +31,9 @@
 # DBG_VALUE for variables "x", "y" and "z" are extended into %bb.9 from its
 # predecessors %bb.0, %bb.2 and %bb.8.
 # CHECK:      bb.9.for.end:
-# CHECK-DAG:  DBG_VALUE debug-use $edi, debug-use $noreg, ![[X_VAR]], !DIExpression(), debug-location !{{[0-9]+}}
-# CHECK-DAG:  DBG_VALUE debug-use $esi, debug-use $noreg, ![[Y_VAR]], !DIExpression(), debug-location !{{[0-9]+}}
-# CHECK-DAG:  DBG_VALUE debug-use $edx, debug-use $noreg, ![[Z_VAR]], !DIExpression(), debug-location !{{[0-9]+}}
+# CHECK-DAG:  DBG_VALUE $edi, $noreg, ![[X_VAR]], !DIExpression(), debug-location !{{[0-9]+}}
+# CHECK-DAG:  DBG_VALUE $esi, $noreg, ![[Y_VAR]], !DIExpression(), debug-location !{{[0-9]+}}
+# CHECK-DAG:  DBG_VALUE $edx, $noreg, ![[Z_VAR]], !DIExpression(), debug-location !{{[0-9]+}}
 # CHECK:      RET
 
 --- |
@@ -186,10 +186,10 @@ body:             |
     successors: %bb.1.for.body.preheader(20), %bb.9.for.end(12)
     liveins: $ecx, $edi, $edx, $esi
   
-    DBG_VALUE debug-use $edi, debug-use _, !9, !17, debug-location !18
-    DBG_VALUE debug-use $esi, debug-use _, !10, !17, debug-location !19
-    DBG_VALUE debug-use $edx, debug-use _, !11, !17, debug-location !21
-    DBG_VALUE debug-use $ecx, debug-use _, !12, !17, debug-location !23
+    DBG_VALUE $edi, _, !9, !17, debug-location !18
+    DBG_VALUE $esi, _, !10, !17, debug-location !19
+    DBG_VALUE $edx, _, !11, !17, debug-location !21
+    DBG_VALUE $ecx, _, !12, !17, debug-location !23
     DBG_VALUE 0, 0, !13, !17, debug-location !25
     $r8d = MOV32rr $esi, debug-location !26
     $r8d = IMUL32rr killed $r8d, $edi, implicit-def dead $eflags, debug-location !26
@@ -200,10 +200,10 @@ body:             |
     successors: %bb.3.for.body(0)
     liveins: $ecx, $edi, $edx, $esi, $r8d
   
-    DBG_VALUE debug-use $edi, debug-use _, !9, !17, debug-location !18
-    DBG_VALUE debug-use $esi, debug-use _, !10, !17, debug-location !19
-    DBG_VALUE debug-use $edx, debug-use _, !11, !17, debug-location !21
-    DBG_VALUE debug-use $ecx, debug-use _, !12, !17, debug-location !23
+    DBG_VALUE $edi, _, !9, !17, debug-location !18
+    DBG_VALUE $esi, _, !10, !17, debug-location !19
+    DBG_VALUE $edx, _, !11, !17, debug-location !21
+    DBG_VALUE $ecx, _, !12, !17, debug-location !23
     DBG_VALUE 0, 0, !13, !17, debug-location !25
     $eax = XOR32rr undef $eax, undef $eax, implicit-def dead $eflags
   
@@ -211,10 +211,10 @@ body:             |
     successors: %bb.4.if.then(4), %bb.5.if.end(124)
     liveins: $eax, $ecx, $edi, $edx, $esi, $r8d
   
-    DBG_VALUE debug-use $edi, debug-use _, !9, !17, debug-location !18
-    DBG_VALUE debug-use $esi, debug-use _, !10, !17, debug-location !19
-    DBG_VALUE debug-use $edx, debug-use _, !11, !17, debug-location !21
-    DBG_VALUE debug-use $ecx, debug-use _, !12, !17, debug-location !23
+    DBG_VALUE $edi, _, !9, !17, debug-location !18
+    DBG_VALUE $esi, _, !10, !17, debug-location !19
+    DBG_VALUE $edx, _, !11, !17, debug-location !21
+    DBG_VALUE $ecx, _, !12, !17, debug-location !23
     DBG_VALUE 0, 0, !13, !17, debug-location !25
     TEST32rr $edi, $edi, implicit-def $eflags, debug-location !35
     JG_1 %bb.4.if.then, implicit $eflags
@@ -223,10 +223,10 @@ body:             |
     successors: %bb.6.if.then.4(4), %bb.7.if.end.6(124)
     liveins: $eax, $ecx, $edi, $edx, $esi, $r8d
   
-    DBG_VALUE debug-use $edi, debug-use _, !9, !17, debug-location !18
-    DBG_VALUE debug-use $esi, debug-use _, !10, !17, debug-location !19
-    DBG_VALUE debug-use $edx, debug-use _, !11, !17, debug-location !21
-    DBG_VALUE debug-use $ecx, debug-use _, !12, !17, debug-location !23
+    DBG_VALUE $edi, _, !9, !17, debug-location !18
+    DBG_VALUE $esi, _, !10, !17, debug-location !19
+    DBG_VALUE $edx, _, !11, !17, debug-location !21
+    DBG_VALUE $ecx, _, !12, !17, debug-location !23
     DBG_VALUE 0, 0, !13, !17, debug-location !25
     TEST32rr $esi, $esi, implicit-def $eflags, debug-location !39
     JG_1 %bb.6.if.then.4, implicit $eflags
@@ -235,10 +235,10 @@ body:             |
     successors: %bb.8.if.then.8(4), %bb.2.for.cond(124)
     liveins: $eax, $ecx, $edi, $edx, $esi, $r8d
   
-    DBG_VALUE debug-use $edi, debug-use _, !9, !17, debug-location !18
-    DBG_VALUE debug-use $esi, debug-use _, !10, !17, debug-location !19
-    DBG_VALUE debug-use $edx, debug-use _, !11, !17, debug-location !21
-    DBG_VALUE debug-use $ecx, debug-use _, !12, !17, debug-location !23
+    DBG_VALUE $edi, _, !9, !17, debug-location !18
+    DBG_VALUE $esi, _, !10, !17, debug-location !19
+    DBG_VALUE $edx, _, !11, !17, debug-location !21
+    DBG_VALUE $ecx, _, !12, !17, debug-location !23
     DBG_VALUE 0, 0, !13, !17, debug-location !25
     TEST32rr $edx, $edx, implicit-def $eflags, debug-location !45
     JG_1 %bb.8.if.then.8, implicit $eflags
@@ -247,13 +247,13 @@ body:             |
     successors: %bb.3.for.body(124), %bb.9.for.end(4)
     liveins: $eax, $ecx, $edi, $edx, $esi, $r8d
   
-    DBG_VALUE debug-use $edi, debug-use _, !9, !17, debug-location !18
-    DBG_VALUE debug-use $esi, debug-use _, !10, !17, debug-location !19
-    DBG_VALUE debug-use $edx, debug-use _, !11, !17, debug-location !21
-    DBG_VALUE debug-use $ecx, debug-use _, !12, !17, debug-location !23
+    DBG_VALUE $edi, _, !9, !17, debug-location !18
+    DBG_VALUE $esi, _, !10, !17, debug-location !19
+    DBG_VALUE $edx, _, !11, !17, debug-location !21
+    DBG_VALUE $ecx, _, !12, !17, debug-location !23
     DBG_VALUE 0, 0, !13, !17, debug-location !25
     $eax = INC32r killed $eax, implicit-def dead $eflags, debug-location !44
-    DBG_VALUE debug-use $eax, debug-use _, !13, !17, debug-location !25
+    DBG_VALUE $eax, _, !13, !17, debug-location !25
     CMP32rr $eax, $r8d, implicit-def $eflags, debug-location !31
     JL_1 %bb.3.for.body, implicit $eflags
     JMP_1 %bb.9.for.end
@@ -261,8 +261,8 @@ body:             |
   bb.4.if.then:
     liveins: $ecx, $edi
   
-    DBG_VALUE debug-use $edi, debug-use _, !9, !17, debug-location !18
-    DBG_VALUE debug-use $ecx, debug-use _, !12, !17, debug-location !23
+    DBG_VALUE $edi, _, !9, !17, debug-location !18
+    DBG_VALUE $ecx, _, !12, !17, debug-location !23
     DBG_VALUE 0, 0, !13, !17, debug-location !25
     $ecx = IMUL32rr killed $ecx, killed $edi, implicit-def dead $eflags, debug-location !36
     DBG_VALUE 0, 0, !13, !17, debug-location !25
@@ -272,8 +272,8 @@ body:             |
   bb.6.if.then.4:
     liveins: $ecx, $esi
   
-    DBG_VALUE debug-use $esi, debug-use _, !10, !17, debug-location !19
-    DBG_VALUE debug-use $ecx, debug-use _, !12, !17, debug-location !23
+    DBG_VALUE $esi, _, !10, !17, debug-location !19
+    DBG_VALUE $ecx, _, !12, !17, debug-location !23
     DBG_VALUE 0, 0, !13, !17, debug-location !25
     $ecx = IMUL32rr killed $ecx, killed $esi, implicit-def dead $eflags, debug-location !40
     DBG_VALUE 0, 0, !13, !17, debug-location !25
@@ -284,8 +284,8 @@ body:             |
     successors: %bb.9.for.end(0)
     liveins: $ecx, $edx
   
-    DBG_VALUE debug-use $edx, debug-use _, !11, !17, debug-location !21
-    DBG_VALUE debug-use $ecx, debug-use _, !12, !17, debug-location !23
+    DBG_VALUE $edx, _, !11, !17, debug-location !21
+    DBG_VALUE $ecx, _, !12, !17, debug-location !23
     DBG_VALUE 0, 0, !13, !17, debug-location !25
     $ecx = IMUL32rr killed $ecx, killed $edx, implicit-def dead $eflags, debug-location !46
   
diff --git a/test/DebugInfo/MIR/X86/live-debug-values-reg-copy.mir b/test/DebugInfo/MIR/X86/live-debug-values-reg-copy.mir
index 3e3d0992ac8..edc2a2624ee 100644
--- a/test/DebugInfo/MIR/X86/live-debug-values-reg-copy.mir
+++ b/test/DebugInfo/MIR/X86/live-debug-values-reg-copy.mir
@@ -5,9 +5,9 @@
 # to another. The altered instructions are labeled below.
 #
 # CHECK: ![[ARG1:.*]] = !DILocalVariable(name: "arg1"
-# CHECK: DBG_VALUE debug-use $ebx, debug-use $noreg, ![[ARG1]], !DIExpression(), debug-location
+# CHECK: DBG_VALUE $ebx, $noreg, ![[ARG1]], !DIExpression(), debug-location
 # CHECK: $r12d = MOV32rr killed $ebx, implicit-def $r12
-# CHECK-NEXT: DBG_VALUE debug-use $r12d, debug-use $noreg, ![[ARG1]], !DIExpression(), debug-location
+# CHECK-NEXT: DBG_VALUE $r12d, $noreg, ![[ARG1]], !DIExpression(), debug-location
 --- |
   ; ModuleID = 'live-debug-values-reg-copy.ll'
   source_filename = "live-debug-values-reg-copy.c"
@@ -148,9 +148,9 @@ body:             |
     CFI_INSTRUCTION def_cfa_offset 32
     CFI_INSTRUCTION offset $rbx, -24
     CFI_INSTRUCTION offset $rbp, -16
-    DBG_VALUE debug-use $edi, debug-use $noreg, !12, !DIExpression(), debug-location !15
+    DBG_VALUE $edi, $noreg, !12, !DIExpression(), debug-location !15
     $ebx = MOV32rr $edi, implicit-def $rbx
-    DBG_VALUE debug-use $ebx, debug-use $noreg, !12, !DIExpression(), debug-location !15
+    DBG_VALUE $ebx, $noreg, !12, !DIExpression(), debug-location !15
     renamable $rdi = LEA64r $rsp, 1, $noreg, 4, $noreg
     CALL64pcrel32 @init, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp, debug-location !15
     renamable $edi = MOV32rm $rsp, 1, $noreg, 4, $noreg :: (dereferenceable load 4 from %ir.local1, !tbaa !20)
diff --git a/test/DebugInfo/MIR/X86/live-debug-values-spill.mir b/test/DebugInfo/MIR/X86/live-debug-values-spill.mir
index fb83963a4e9..78a9a01dda7 100644
--- a/test/DebugInfo/MIR/X86/live-debug-values-spill.mir
+++ b/test/DebugInfo/MIR/X86/live-debug-values-spill.mir
@@ -53,35 +53,35 @@
 #
 # GENERATE:      bb.1.if.end:
 # GENERATE:      MOV32mr $rbp, 1, $noreg, -48, $noreg, killed $edx :: (store 4 into %stack.5)
-# GENERATE-NEXT: DBG_VALUE debug-use $rbp, 0, ![[INT0]], !DIExpression(DW_OP_constu, 48, DW_OP_minus)
+# GENERATE-NEXT: DBG_VALUE $rbp, 0, ![[INT0]], !DIExpression(DW_OP_constu, 48, DW_OP_minus)
 # GENERATE:      MOV32mr $rbp, 1, $noreg, -52, $noreg, killed $r8d :: (store 4 into %stack.4)
-# GENERATE-NEXT: DBG_VALUE debug-use $rbp, 0, ![[INTB]], !DIExpression(DW_OP_constu, 52, DW_OP_minus)
+# GENERATE-NEXT: DBG_VALUE $rbp, 0, ![[INTB]], !DIExpression(DW_OP_constu, 52, DW_OP_minus)
 # GENERATE:      MOV32mr $rbp, 1, $noreg, -56, $noreg, killed $esi :: (store 4 into %stack.3)
-# GENERATE-NEXT: DBG_VALUE debug-use $rbp, 0, ![[INTD]], !DIExpression(DW_OP_constu, 56, DW_OP_minus)
+# GENERATE-NEXT: DBG_VALUE $rbp, 0, ![[INTD]], !DIExpression(DW_OP_constu, 56, DW_OP_minus)
 #
 # Check that the spill locations that are valid at the end of bb.1.if.end are
 # propagated to subsequent BBs.
 #
 # GENERATE:      bb.2.if.then4:
 # GENERATE-NOT:  bb.3:
-# GENERATE-DAG:  DBG_VALUE debug-use $rbp, 0, ![[INTD]], !DIExpression(DW_OP_constu, 56, DW_OP_minus)
-# GENERATE-DAG:  DBG_VALUE debug-use $rbp, 0, ![[INTB]], !DIExpression(DW_OP_constu, 52, DW_OP_minus)
+# GENERATE-DAG:  DBG_VALUE $rbp, 0, ![[INTD]], !DIExpression(DW_OP_constu, 56, DW_OP_minus)
+# GENERATE-DAG:  DBG_VALUE $rbp, 0, ![[INTB]], !DIExpression(DW_OP_constu, 52, DW_OP_minus)
 #
 # GENERATE:      bb.3:
 # GENERATE-NOT:  bb.4.if.end13:
-# GENERATE-DAG:  DBG_VALUE debug-use $rbp, 0, ![[INTD]], !DIExpression(DW_OP_constu, 56, DW_OP_minus)
-# GENERATE-DAG:  DBG_VALUE debug-use $rbp, 0, ![[INTB]], !DIExpression(DW_OP_constu, 52, DW_OP_minus)
+# GENERATE-DAG:  DBG_VALUE $rbp, 0, ![[INTD]], !DIExpression(DW_OP_constu, 56, DW_OP_minus)
+# GENERATE-DAG:  DBG_VALUE $rbp, 0, ![[INTB]], !DIExpression(DW_OP_constu, 52, DW_OP_minus)
 #
 # GENERATE:      bb.4.if.end13:
 # GENERATE-NOT:  bb.5.cleanup:
-# GENERATE-DAG:  DBG_VALUE debug-use $rbp, 0, ![[INTD]], !DIExpression(DW_OP_constu, 56, DW_OP_minus)
-# GENERATE-DAG:  DBG_VALUE debug-use $rbp, 0, ![[INTB]], !DIExpression(DW_OP_constu, 52, DW_OP_minus)
+# GENERATE-DAG:  DBG_VALUE $rbp, 0, ![[INTD]], !DIExpression(DW_OP_constu, 56, DW_OP_minus)
+# GENERATE-DAG:  DBG_VALUE $rbp, 0, ![[INTB]], !DIExpression(DW_OP_constu, 52, DW_OP_minus)
 # 
 # Check that the spill location rbp-48 (the variable int0) is not propagated 
 # because int0 is redefined within the same basic block.
 #
 # TERMINATE:     bb.2.if.then4:
-# TERMINATE-NOT: DBG_VALUE debug-use $rbp, -48,
+# TERMINATE-NOT: DBG_VALUE $rbp, -48,
 --- |
   ; ModuleID = '<stdin>'
   source_filename = "spill1.c"
@@ -369,31 +369,31 @@ body:             |
     CFI_INSTRUCTION offset $r13, -40
     CFI_INSTRUCTION offset $r14, -32
     CFI_INSTRUCTION offset $r15, -24
-    DBG_VALUE debug-use $edi, debug-use _, !24, !38, debug-location !39
-    DBG_VALUE debug-use $esi, debug-use _, !25, !38, debug-location !40
-    DBG_VALUE debug-use $edx, debug-use _, !26, !38, debug-location !41
-    DBG_VALUE debug-use $ecx, debug-use _, !27, !38, debug-location !42
-    DBG_VALUE debug-use $r8d, debug-use _, !28, !38, debug-location !43
-    DBG_VALUE debug-use $r9d, debug-use _, !29, !38, debug-location !44
+    DBG_VALUE $edi, _, !24, !38, debug-location !39
+    DBG_VALUE $esi, _, !25, !38, debug-location !40
+    DBG_VALUE $edx, _, !26, !38, debug-location !41
+    DBG_VALUE $ecx, _, !27, !38, debug-location !42
+    DBG_VALUE $r8d, _, !28, !38, debug-location !43
+    DBG_VALUE $r9d, _, !29, !38, debug-location !44
     $r14d = MOV32rr $r8d
-    DBG_VALUE debug-use $r14d, debug-use _, !28, !38, debug-location !43
+    DBG_VALUE $r14d, _, !28, !38, debug-location !43
     $r12d = MOV32rr $esi
-    DBG_VALUE debug-use $r12d, debug-use _, !25, !38, debug-location !40
+    DBG_VALUE $r12d, _, !25, !38, debug-location !40
     $eax = MOV32rr $edi
-    DBG_VALUE debug-use $eax, debug-use _, !24, !38, debug-location !39
+    DBG_VALUE $eax, _, !24, !38, debug-location !39
     $r13d = MOV32rm $rip, 1, _, @glob0, _, debug-location !46 :: (dereferenceable load 4 from @glob0, !tbaa !47)
-    DBG_VALUE debug-use $r13d, debug-use _, !31, !38, debug-location !51
+    DBG_VALUE $r13d, _, !31, !38, debug-location !51
     $r8d = MOV32rm $rip, 1, _, @glob1, _, debug-location !52 :: (dereferenceable load 4 from @glob1, !tbaa !47)
-    DBG_VALUE debug-use $r8d, debug-use _, !32, !38, debug-location !53
+    DBG_VALUE $r8d, _, !32, !38, debug-location !53
     $r15d = MOV32rm $rip, 1, _, @glob2, _, debug-location !54 :: (dereferenceable load 4 from @glob2, !tbaa !47)
-    DBG_VALUE debug-use $r15d, debug-use _, !33, !38, debug-location !55
+    DBG_VALUE $r15d, _, !33, !38, debug-location !55
     $esi = MOV32rm $rip, 1, _, @glob3, _, debug-location !56 :: (dereferenceable load 4 from @glob3, !tbaa !47)
-    DBG_VALUE debug-use $esi, debug-use _, !34, !38, debug-location !57
+    DBG_VALUE $esi, _, !34, !38, debug-location !57
     $ebx = MOV32rm $rip, 1, _, @glob4, _, debug-location !59 :: (dereferenceable load 4 from @glob4, !tbaa !47)
-    DBG_VALUE debug-use $ebx, debug-use _, !35, !38, debug-location !60
+    DBG_VALUE $ebx, _, !35, !38, debug-location !60
     MOV32mr $rbp, 1, _, -44, _, $ebx, debug-location !60 :: (store 4 into %ir.inte, !tbaa !47)
     $edi = MOV32rm $rip, 1, _, @glob5, _, debug-location !62 :: (dereferenceable load 4 from @glob5, !tbaa !47)
-    DBG_VALUE debug-use $edi, debug-use _, !36, !38, debug-location !63
+    DBG_VALUE $edi, _, !36, !38, debug-location !63
     MOV32mr $rbp, 1, _, -60, _, $edi, debug-location !63 :: (store 4 into %ir.intf, !tbaa !47)
     TEST32rr killed $eax, $eax, implicit-def $eflags, debug-location !67
     JNE_1 %bb.5.cleanup, implicit $eflags
@@ -405,11 +405,11 @@ body:             |
     MOV32mr $rbp, 1, _, -48, _, killed $edx :: (store 4 into %stack.5)
     MOV32mr $rbp, 1, _, -52, _, killed $r8d :: (store 4 into %stack.4)
     MOV32mr $rbp, 1, _, -56, _, killed $esi :: (store 4 into %stack.3)
-    DBG_VALUE debug-use _, debug-use _, !30, !38, debug-location !45
+    DBG_VALUE _, _, !30, !38, debug-location !45
     $r14d = ADD32rr killed $r14d, killed $ecx, implicit-def dead $eflags, debug-location !68
     $r14d = ADD32rr killed $r14d, killed $r9d, implicit-def dead $eflags, debug-location !69
     $r14d = IMUL32rm killed $r14d, $rbp, 1, _, 16, _, implicit-def dead $eflags, debug-location !70 :: (load 4 from %fixed-stack.6, align 16)
-    DBG_VALUE debug-use $r14d, debug-use _, !26, !38, debug-location !41
+    DBG_VALUE $r14d, _, !26, !38, debug-location !41
     CALL64pcrel32 @use, csr_64, implicit $rsp, implicit $edi, implicit-def $rsp, debug-location !72
     $edi = MOV32rr killed $ebx, debug-location !73
     CALL64pcrel32 @use, csr_64, implicit $rsp, implicit $edi, implicit-def $rsp, debug-location !73
@@ -421,21 +421,21 @@ body:             |
     liveins: $r14d, $r15d, $rbp
   
     $rdi = LEA64r $rbp, 1, _, -44, _
-    DBG_VALUE debug-use $rbp, -44, !35, !38, debug-location !60
+    DBG_VALUE $rbp, -44, !35, !38, debug-location !60
     $rsi = LEA64r $rbp, 1, _, -60, _
-    DBG_VALUE debug-use $rbp, -60, !36, !38, debug-location !63
+    DBG_VALUE $rbp, -60, !36, !38, debug-location !63
     $rdx = LEA64r $rbp, 1, _, -64, _
-    DBG_VALUE debug-use $rbp, -64, !37, !38, debug-location !78
+    DBG_VALUE $rbp, -64, !37, !38, debug-location !78
     CALL64pcrel32 @set, csr_64, implicit $rsp, implicit $rdi, implicit $rsi, implicit $rdx, implicit-def $rsp, debug-location !79
     $eax = MOV32rm $rbp, 1, _, -44, _, debug-location !81 :: (dereferenceable load 4 from %ir.inte, !tbaa !47)
-    DBG_VALUE debug-use $eax, debug-use _, !35, !38, debug-location !60
+    DBG_VALUE $eax, _, !35, !38, debug-location !60
     $r15d = ADD32rm killed $r15d, $rbp, 1, _, -52, _, implicit-def dead $eflags, debug-location !82 :: (load 4 from %stack.4)
     $r15d = IMUL32rr killed $r15d, $eax, implicit-def dead $eflags, debug-location !82
     $r15d = ADD32rm killed $r15d, $rbp, 1, _, -56, _, implicit-def dead $eflags, debug-location !83 :: (load 4 from %stack.3)
     $r15d = IMUL32rr killed $r15d, killed $eax, implicit-def dead $eflags, debug-location !84
-    DBG_VALUE debug-use $r15d, debug-use _, !31, !38, debug-location !51
+    DBG_VALUE $r15d, _, !31, !38, debug-location !51
     $r13d = MOV32rr killed $r15d
-    DBG_VALUE debug-use $r13d, debug-use _, !31, !38, debug-location !51
+    DBG_VALUE $r13d, _, !31, !38, debug-location !51
     JMP_1 %bb.4.if.end13
   
   bb.2:
@@ -443,17 +443,17 @@ body:             |
     liveins: $r13d, $r14d, $rbp
   
     $r14d = ADD32rm killed $r14d, $rbp, 1, _, -48, _, implicit-def dead $eflags, debug-location !71 :: (load 4 from %stack.5)
-    DBG_VALUE debug-use $r14d, debug-use _, !26, !38, debug-location !41
+    DBG_VALUE $r14d, _, !26, !38, debug-location !41
   
   bb.4.if.end13:
     successors: %bb.5.cleanup(0x80000000)
     liveins: $r13d, $r14d, $rbp
   
-    DBG_VALUE debug-use $r14d, debug-use _, !26, !38, debug-location !41
-    DBG_VALUE debug-use $r13d, debug-use _, !31, !38, debug-location !51
+    DBG_VALUE $r14d, _, !26, !38, debug-location !41
+    DBG_VALUE $r13d, _, !31, !38, debug-location !51
     $r13d = IMUL32rm killed $r13d, $rbp, 1, _, 16, _, implicit-def dead $eflags, debug-location !86 :: (load 4 from %fixed-stack.6, align 16)
     $r13d = ADD32rr killed $r13d, killed $r14d, implicit-def dead $eflags, debug-location !87
-    DBG_VALUE debug-use $r13d, debug-use _, !26, !38, debug-location !41
+    DBG_VALUE $r13d, _, !26, !38, debug-location !41
     $edi = MOV32rr killed $r13d, debug-location !88
     CALL64pcrel32 @use, csr_64, implicit $rsp, implicit $edi, implicit-def $rsp, debug-location !88
   
diff --git a/test/DebugInfo/MIR/X86/live-debug-values.mir b/test/DebugInfo/MIR/X86/live-debug-values.mir
index c3558aaed31..5245285da5e 100644
--- a/test/DebugInfo/MIR/X86/live-debug-values.mir
+++ b/test/DebugInfo/MIR/X86/live-debug-values.mir
@@ -35,7 +35,7 @@
 # CHECK: ![[N_VAR:[0-9]+]] = !DILocalVariable(name: "n",{{.*}})
 #
 # CHECK:      bb.5.if.end.7:
-# CHECK:        DBG_VALUE debug-use $ebx, debug-use $noreg, ![[N_VAR]], !DIExpression(), debug-location !{{[0-9]+}}
+# CHECK:        DBG_VALUE $ebx, $noreg, ![[N_VAR]], !DIExpression(), debug-location !{{[0-9]+}}
 
 
 --- |
@@ -193,10 +193,10 @@ body:             |
     frame-setup PUSH64r killed $rbx, implicit-def $rsp, implicit $rsp
     CFI_INSTRUCTION def_cfa_offset 16
     CFI_INSTRUCTION offset $rbx, -16
-    DBG_VALUE debug-use $edi, debug-use _, !12, !20, debug-location !21
-    DBG_VALUE debug-use $rsi, debug-use _, !13, !20, debug-location !22
+    DBG_VALUE $edi, _, !12, !20, debug-location !21
+    DBG_VALUE $rsi, _, !13, !20, debug-location !22
     $eax = MOV32rr $edi
-    DBG_VALUE debug-use $eax, debug-use _, !12, !20, debug-location !21
+    DBG_VALUE $eax, _, !12, !20, debug-location !21
     $edi = MOV32ri 2
     CMP32ri8 killed $eax, 2, implicit-def $eflags, debug-location !26
     JNE_1 %bb.2.if.end, implicit $eflags
@@ -205,12 +205,12 @@ body:             |
     successors: %bb.2.if.end(0)
     liveins: $rsi
   
-    DBG_VALUE debug-use $rsi, debug-use _, !13, !20, debug-location !22
+    DBG_VALUE $rsi, _, !13, !20, debug-location !22
     $rdi = MOV64rm killed $rsi, 1, _, 8, _, debug-location !27 :: (load 8 from %ir.arrayidx, !tbaa !28)
     dead $eax = XOR32rr undef $eax, undef $eax, implicit-def dead $eflags, implicit-def $al, debug-location !32
     CALL64pcrel32 @atoi, csr_64, implicit $rsp, implicit $rdi, implicit $al, implicit-def $rsp, implicit-def $eax, debug-location !32
     $edi = MOV32rr $eax, debug-location !32
-    DBG_VALUE debug-use $edi, debug-use _, !14, !20, debug-location !33
+    DBG_VALUE $edi, _, !14, !20, debug-location !33
   
   bb.2.if.end:
     successors: %bb.3.if.then.3(16), %bb.4.if.else.5(16)
@@ -218,7 +218,7 @@ body:             |
   
     CALL64pcrel32 @change, csr_64, implicit $rsp, implicit $edi, implicit-def $rsp, implicit-def $eax, debug-location !34
     $ebx = MOV32rr $eax, debug-location !34
-    DBG_VALUE debug-use $ebx, debug-use _, !14, !20, debug-location !33
+    DBG_VALUE $ebx, _, !14, !20, debug-location !33
     CMP32ri8 $ebx, 11, implicit-def $eflags, debug-location !37
     JL_1 %bb.4.if.else.5, implicit killed $eflags, debug-location !37
   
@@ -226,7 +226,7 @@ body:             |
     successors: %bb.5.if.end.7(0)
     liveins: $ebx
   
-    DBG_VALUE debug-use $ebx, debug-use _, !14, !20, debug-location !33
+    DBG_VALUE $ebx, _, !14, !20, debug-location !33
     $edi = MOV32rr $ebx, debug-location !38
     CALL64pcrel32 @modify, csr_64, implicit $rsp, implicit $edi, implicit-def $rsp, implicit-def $eax, debug-location !38
     $ecx = MOV32rr $eax, debug-location !38
@@ -237,7 +237,7 @@ body:             |
     successors: %bb.5.if.end.7(0)
     liveins: $ebx
   
-    DBG_VALUE debug-use $ebx, debug-use _, !14, !20, debug-location !33
+    DBG_VALUE $ebx, _, !14, !20, debug-location !33
     $edi = MOV32rr killed $ebx, debug-location !42
     CALL64pcrel32 @inc, csr_64, implicit $rsp, implicit $edi, implicit-def $rsp, implicit-def $eax, debug-location !42
     $ecx = MOV32rr $eax, debug-location !42
diff --git a/test/DebugInfo/MIR/X86/live-debug-vars-unused-arg-debugonly.mir b/test/DebugInfo/MIR/X86/live-debug-vars-unused-arg-debugonly.mir
index f8d3603b79d..4a2f96c5d5e 100644
--- a/test/DebugInfo/MIR/X86/live-debug-vars-unused-arg-debugonly.mir
+++ b/test/DebugInfo/MIR/X86/live-debug-vars-unused-arg-debugonly.mir
@@ -130,12 +130,12 @@ stack:
 constants:
 body:             |
   bb.0.entry:
-    DBG_VALUE debug-use $edi, debug-use _, !21, !DIExpression(), debug-location !25
-    DBG_VALUE debug-use $rsi, debug-use _, !22, !DIExpression(), debug-location !26
+    DBG_VALUE $edi, _, !21, !DIExpression(), debug-location !25
+    DBG_VALUE $rsi, _, !22, !DIExpression(), debug-location !26
     %2 = MOV32rm $rip, 1, _, @bar, _, debug-location !27 :: (dereferenceable load 4 from `i32* getelementptr inbounds ([2 x i32], [2 x i32]* @bar, i64 0, i64 0)`, !tbaa !28)
-    DBG_VALUE debug-use %2, debug-use _, !23, !DIExpression(), debug-location !32
+    DBG_VALUE %2, _, !23, !DIExpression(), debug-location !32
     %3 = MOV32rm $rip, 1, _, @bar + 4, _, debug-location !33 :: (dereferenceable load 4 from `i32* getelementptr inbounds ([2 x i32], [2 x i32]* @bar, i64 0, i64 1)`, !tbaa !28)
-    DBG_VALUE debug-use %3, debug-use _, !24, !DIExpression(), debug-location !34
+    DBG_VALUE %3, _, !24, !DIExpression(), debug-location !34
     ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp, debug-location !35
     $edi = COPY %2, debug-location !35
     $esi = COPY %3, debug-location !35
@@ -153,11 +153,11 @@ body:             |
 # not cover the whole BB.
 #
 # CHECKDBG-LABEL: ********** EMITTING LIVE DEBUG VARIABLES **********
-# CHECKDBG-NEXT: !"argc,5"        [0B;0e):0 Loc0=debug-use $edi
+# CHECKDBG-NEXT: !"argc,5"        [0B;0e):0 Loc0=$edi
 # CHECKDBG-NEXT:         [0B;0e):0 %bb.0-160B
-# CHECKDBG-NEXT: !"argv,5"        [0B;0e):0 Loc0=debug-use $rsi
+# CHECKDBG-NEXT: !"argv,5"        [0B;0e):0 Loc0=$rsi
 # CHECKDBG-NEXT:         [0B;0e):0 %bb.0-160B
-# CHECKDBG-NEXT: !"a0,7"  [16r;64r):0 Loc0=debug-use %2
+# CHECKDBG-NEXT: !"a0,7"  [16r;64r):0 Loc0=%2
 # CHECKDBG-NEXT:         [16r;64r):0 %bb.0-160B
-# CHECKDBG-NEXT: !"a1,8"  [32r;80r):0 Loc0=debug-use %3
+# CHECKDBG-NEXT: !"a1,8"  [32r;80r):0 Loc0=%3
 # CHECKDBG-NEXT:         [32r;80r):0 %bb.0-160B
diff --git a/test/DebugInfo/MIR/X86/live-debug-vars-unused-arg.mir b/test/DebugInfo/MIR/X86/live-debug-vars-unused-arg.mir
index 430dbb742d4..ac0d519ddfe 100644
--- a/test/DebugInfo/MIR/X86/live-debug-vars-unused-arg.mir
+++ b/test/DebugInfo/MIR/X86/live-debug-vars-unused-arg.mir
@@ -128,12 +128,12 @@ stack:
 constants:
 body:             |
   bb.0.entry:
-    DBG_VALUE debug-use $edi, debug-use _, !21, !DIExpression(), debug-location !25
-    DBG_VALUE debug-use $rsi, debug-use _, !22, !DIExpression(), debug-location !26
+    DBG_VALUE $edi, _, !21, !DIExpression(), debug-location !25
+    DBG_VALUE $rsi, _, !22, !DIExpression(), debug-location !26
     %2 = MOV32rm $rip, 1, _, @bar, _, debug-location !27 :: (dereferenceable load 4 from `i32* getelementptr inbounds ([2 x i32], [2 x i32]* @bar, i64 0, i64 0)`, !tbaa !28)
-    DBG_VALUE debug-use %2, debug-use _, !23, !DIExpression(), debug-location !32
+    DBG_VALUE %2, _, !23, !DIExpression(), debug-location !32
     %3 = MOV32rm $rip, 1, _, @bar + 4, _, debug-location !33 :: (dereferenceable load 4 from `i32* getelementptr inbounds ([2 x i32], [2 x i32]* @bar, i64 0, i64 1)`, !tbaa !28)
-    DBG_VALUE debug-use %3, debug-use _, !24, !DIExpression(), debug-location !34
+    DBG_VALUE %3, _, !24, !DIExpression(), debug-location !34
     ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp, debug-location !35
     $edi = COPY %2, debug-location !35
     $esi = COPY %3, debug-location !35
@@ -150,9 +150,9 @@ body:             |
 # CHECKMIR: ![[ARGV:[0-9]+]] = !DILocalVariable(name: "argv", arg: 2
 # CHECKMIR: name:            main
 # CHECKMIR: body:
-# CHECKMIR: DBG_VALUE debug-use $edi, debug-use $noreg, ![[ARGC]]
-# CHECKMIR-NOT: DBG_VALUE debug-use %{{.*}}, debug-use $noreg, ![[ARGC]]
-# CHECKMIR: DBG_VALUE debug-use $rsi, debug-use $noreg, ![[ARGV]]
-# CHECKMIR-NOT: DBG_VALUE debug-use %{{.*}}, debug-use $noreg, ![[ARGC]]
-# CHECKMIR-NOT: DBG_VALUE debug-use %{{.*}}, debug-use $noreg, ![[ARGV]]
+# CHECKMIR: DBG_VALUE $edi, $noreg, ![[ARGC]]
+# CHECKMIR-NOT: DBG_VALUE %{{.*}}, $noreg, ![[ARGC]]
+# CHECKMIR: DBG_VALUE $rsi, $noreg, ![[ARGV]]
+# CHECKMIR-NOT: DBG_VALUE %{{.*}}, $noreg, ![[ARGC]]
+# CHECKMIR-NOT: DBG_VALUE %{{.*}}, $noreg, ![[ARGV]]
 
diff --git a/test/DebugInfo/MIR/X86/livedebugvalues-limit.mir b/test/DebugInfo/MIR/X86/livedebugvalues-limit.mir
index 6c78a76a328..509d16a736c 100644
--- a/test/DebugInfo/MIR/X86/livedebugvalues-limit.mir
+++ b/test/DebugInfo/MIR/X86/livedebugvalues-limit.mir
@@ -25,13 +25,13 @@
   ; CHECK: ![[CS3]] = distinct !DILocation(line: 8, column: 3, scope: !{{[0-9]+}})
   ;
   ; CHECK:  bb.1.if.then:
-  ; CHECK:      DBG_VALUE debug-use $ebx, debug-use $noreg, ![[I_VAR]], !DIExpression(), debug-location ![[I_LOC]]
-  ; CHECK-NOT:  DBG_VALUE debug-use $ebx, debug-use $noreg, ![[A_VAR]], !DIExpression(), debug-location
-  ; CHECK:      DBG_VALUE debug-use $ebx, debug-use $noreg, ![[A_VAR]], !DIExpression(), debug-location ![[INLCS2]]
+  ; CHECK:      DBG_VALUE $ebx, $noreg, ![[I_VAR]], !DIExpression(), debug-location ![[I_LOC]]
+  ; CHECK-NOT:  DBG_VALUE $ebx, $noreg, ![[A_VAR]], !DIExpression(), debug-location
+  ; CHECK:      DBG_VALUE $ebx, $noreg, ![[A_VAR]], !DIExpression(), debug-location ![[INLCS2]]
   ; CHECK: bb.2.if.end:
-  ; CHECK:     DBG_VALUE debug-use $ebx, debug-use $noreg, ![[I_VAR]], !DIExpression(), debug-location ![[I_LOC]]
-  ; CHECK-NOT: DBG_VALUE debug-use $ebx, debug-use $noreg, ![[A_VAR]], !DIExpression(), debug-location
-  ; CHECK:     DBG_VALUE debug-use $ebx, debug-use $noreg, ![[A_VAR]], !DIExpression(), debug-location ![[INLCS3]]
+  ; CHECK:     DBG_VALUE $ebx, $noreg, ![[I_VAR]], !DIExpression(), debug-location ![[I_LOC]]
+  ; CHECK-NOT: DBG_VALUE $ebx, $noreg, ![[A_VAR]], !DIExpression(), debug-location
+  ; CHECK:     DBG_VALUE $ebx, $noreg, ![[A_VAR]], !DIExpression(), debug-location ![[INLCS3]]
   ;
   ; ModuleID = 'livedebugvalues-limit.ll'
   source_filename = "livedebugvalues-limit.c"
@@ -159,7 +159,7 @@ body:             |
     CFI_INSTRUCTION offset $rbp, -16
     $rbp = frame-setup MOV64rr $rsp
     CFI_INSTRUCTION def_cfa_register $rbp
-    DBG_VALUE debug-use $edi, debug-use _, !12, !13, debug-location !14
+    DBG_VALUE $edi, _, !12, !13, debug-location !14
     $rbp = POP64r implicit-def $rsp, implicit $rsp, debug-location !15
     TAILJMPd64 @sink, csr_64, implicit $rsp, implicit $rsp, implicit $edi, debug-location !15
 
@@ -208,10 +208,10 @@ body:             |
     frame-setup PUSH64r killed $rbx, implicit-def $rsp, implicit $rsp
     frame-setup PUSH64r undef $rax, implicit-def $rsp, implicit $rsp
     CFI_INSTRUCTION offset $rbx, -24
-    DBG_VALUE debug-use $edi, debug-use _, !19, !13, debug-location !20
+    DBG_VALUE $edi, _, !19, !13, debug-location !20
     $ebx = MOV32rr $edi
-    DBG_VALUE debug-use $ebx, debug-use _, !12, !13, debug-location !21
-    DBG_VALUE debug-use $ebx, debug-use _, !19, !13, debug-location !20
+    DBG_VALUE $ebx, _, !12, !13, debug-location !21
+    DBG_VALUE $ebx, _, !19, !13, debug-location !20
     CALL64pcrel32 @sink, csr_64, implicit $rsp, implicit $edi, implicit-def $rsp, debug-location !23
     TEST32rr $ebx, $ebx, implicit-def $eflags, debug-location !24
     JE_1 %bb.2.if.end, implicit $eflags
@@ -220,18 +220,18 @@ body:             |
     successors: %bb.2.if.end
     liveins: $ebx, $rbp
   
-    DBG_VALUE debug-use $ebx, debug-use _, !19, !13, debug-location !20
-    DBG_VALUE debug-use $ebx, debug-use _, !12, !13, debug-location !27
+    DBG_VALUE $ebx, _, !19, !13, debug-location !20
+    DBG_VALUE $ebx, _, !12, !13, debug-location !27
     $edi = MOV32rr $ebx, debug-location !29
     CALL64pcrel32 @sink, csr_64, implicit $rsp, implicit $edi, implicit-def $rsp, debug-location !29
   
   bb.2.if.end:
     liveins: $ebx, $rbp
   
-    DBG_VALUE debug-use $ebx, debug-use _, !19, !13, debug-location !20
+    DBG_VALUE $ebx, _, !19, !13, debug-location !20
     $edi = MOV32rr killed $ebx, debug-location !33
     $rsp = ADD64ri8 $rsp, 8, implicit-def dead $eflags, debug-location !33
-    DBG_VALUE debug-use $ebx, debug-use _, !12, !13, debug-location !31
+    DBG_VALUE $ebx, _, !12, !13, debug-location !31
     $rbx = POP64r implicit-def $rsp, implicit $rsp, debug-location !33
     $rbp = POP64r implicit-def $rsp, implicit $rsp, debug-location !33
     TAILJMPd64 @sink, csr_64, implicit $rsp, implicit $rsp, implicit $edi, debug-location !33
diff --git a/test/DebugInfo/MIR/X86/mlicm-hoist.mir b/test/DebugInfo/MIR/X86/mlicm-hoist.mir
index 2c2f4edad4f..0797e89d2c6 100644
--- a/test/DebugInfo/MIR/X86/mlicm-hoist.mir
+++ b/test/DebugInfo/MIR/X86/mlicm-hoist.mir
@@ -122,17 +122,17 @@ body:             |
     successors: %bb.1.while.body(0x80000000)
     liveins: $rdi
 
-    DBG_VALUE debug-use $rdi, debug-use _, !16, !17, debug-location !18
+    DBG_VALUE $rdi, _, !16, !17, debug-location !18
     %2 = COPY $rdi
-    DBG_VALUE debug-use %2, debug-use _, !16, !17, debug-location !18
+    DBG_VALUE %2, _, !16, !17, debug-location !18
 
   bb.1.while.body:
     successors: %bb.1.while.body(0x80000000)
 
     %0 = PHI %2, %bb.0.entry, %1, %bb.1.while.body
-    DBG_VALUE debug-use %0, debug-use _, !16, !17, debug-location !18
+    DBG_VALUE %0, _, !16, !17, debug-location !18
     %1 = ADD64ri8 %0, 4, implicit-def dead $eflags, debug-location !20
-    DBG_VALUE debug-use %1, debug-use _, !16, !17, debug-location !18
+    DBG_VALUE %1, _, !16, !17, debug-location !18
     %3 = MOV32rm %0, 1, _, 0, _, debug-location !21 :: (load 4 from %ir.p.addr.0, !tbaa !22)
     %4 = MOV64rm $rip, 1, _, target-flags(x86-gotpcrel) @x, _, debug-location !26 :: (load 8 from got)
     MOV32mr killed %4, 1, _, 0, _, killed %3, debug-location !26 :: (store 4 into @x, !tbaa !22)
diff --git a/test/DebugInfo/MIR/X86/regcoalescer.mir b/test/DebugInfo/MIR/X86/regcoalescer.mir
index 4136d5ebe63..8601893cdc7 100644
--- a/test/DebugInfo/MIR/X86/regcoalescer.mir
+++ b/test/DebugInfo/MIR/X86/regcoalescer.mir
@@ -40,11 +40,11 @@ registers:
 body:             |
   bb.0.entry:
     %0 = MOV32r0 implicit-def dead $eflags, debug-location !19
-    DBG_VALUE debug-use %0, debug-use _, !18, !DIExpression(), debug-location !20
+    DBG_VALUE %0, _, !18, !DIExpression(), debug-location !20
     $eax = COPY killed %0, debug-location !21
     RET 0, killed $eax, debug-location !21
 
 ...
 
 # CHECK: $eax = MOV32r0
-# CHECK-NEXT: DBG_VALUE debug-use $eax
+# CHECK-NEXT: DBG_VALUE $eax
diff --git a/test/DebugInfo/MSP430/sdagsplit-1.ll b/test/DebugInfo/MSP430/sdagsplit-1.ll
index 7f2356a083f..9e77e950f83 100644
--- a/test/DebugInfo/MSP430/sdagsplit-1.ll
+++ b/test/DebugInfo/MSP430/sdagsplit-1.ll
@@ -13,10 +13,10 @@
 ;      return 0;
 ;    }
 ;
-; CHECK-DAG: DBG_VALUE debug-use $r{{[0-9]+}}, debug-use $noreg, !{{[0-9]+}}, !DIExpression(DW_OP_LLVM_fragment, 32, 16), debug-location !{{[0-9]+}}
-; CHECK-DAG: DBG_VALUE debug-use $r{{[0-9]+}}, debug-use $noreg, !{{[0-9]+}}, !DIExpression(DW_OP_LLVM_fragment, 48, 16), debug-location !{{[0-9]+}}
-; CHECK-DAG: DBG_VALUE debug-use $r{{[0-9]+}}, debug-use $noreg, !{{[0-9]+}}, !DIExpression(DW_OP_LLVM_fragment, 0, 16), debug-location !{{[0-9]+}}
-; CHECK-DAG: DBG_VALUE debug-use $r{{[0-9]+}}, debug-use $noreg, !{{[0-9]+}}, !DIExpression(DW_OP_LLVM_fragment, 16, 16), debug-location !{{[0-9]+}}
+; CHECK-DAG: DBG_VALUE $r{{[0-9]+}}, $noreg, !{{[0-9]+}}, !DIExpression(DW_OP_LLVM_fragment, 32, 16), debug-location !{{[0-9]+}}
+; CHECK-DAG: DBG_VALUE $r{{[0-9]+}}, $noreg, !{{[0-9]+}}, !DIExpression(DW_OP_LLVM_fragment, 48, 16), debug-location !{{[0-9]+}}
+; CHECK-DAG: DBG_VALUE $r{{[0-9]+}}, $noreg, !{{[0-9]+}}, !DIExpression(DW_OP_LLVM_fragment, 0, 16), debug-location !{{[0-9]+}}
+; CHECK-DAG: DBG_VALUE $r{{[0-9]+}}, $noreg, !{{[0-9]+}}, !DIExpression(DW_OP_LLVM_fragment, 16, 16), debug-location !{{[0-9]+}}
 
 ; ModuleID = 'sdagsplit-1.c'
 target datalayout = "e-m:e-p:16:16-i32:16-i64:16-f32:16-f64:16-a:8-n8:16-S16"
diff --git a/test/DebugInfo/WebAssembly/dbg-value-live-interval.ll b/test/DebugInfo/WebAssembly/dbg-value-live-interval.ll
index b79ef23fc96..1754d2a7a21 100644
--- a/test/DebugInfo/WebAssembly/dbg-value-live-interval.ll
+++ b/test/DebugInfo/WebAssembly/dbg-value-live-interval.ll
@@ -3,7 +3,7 @@
 ; CHECK: After WebAssembly Optimize Live Intervals:
 ; CHECK: bb.3.for.body.for.body_crit_edge:
 ; CHECK: [[REG:%[0-9]+]]:i32 = nsw ADD_I32 {{.*}} fib.c:7:7
-; CHECK: DBG_VALUE debug-use [[REG]]:i32, debug-use $noreg, !"a", {{.*}} fib.c:5:13
+; CHECK: DBG_VALUE [[REG]]:i32, $noreg, !"a", {{.*}} fib.c:5:13
 ; CHECK: After WebAssembly Store Results:
 
 ; ModuleID = 'fib.bc'
diff --git a/test/DebugInfo/WebAssembly/dbg-value-move-2.ll b/test/DebugInfo/WebAssembly/dbg-value-move-2.ll
index 30a87d1f3cd..90e8b66609b 100644
--- a/test/DebugInfo/WebAssembly/dbg-value-move-2.ll
+++ b/test/DebugInfo/WebAssembly/dbg-value-move-2.ll
@@ -3,7 +3,7 @@
 ; CHECK: After WebAssembly Register Stackify:
 ; CHECK: bb.2.for.body:
 ; CHECK: [[REG:%[0-9]+]]:i32 = TEE_I32 {{.*}} fib2.c:6:7
-; CHECK-NEXT: DBG_VALUE debug-use [[REG]]:i32, debug-use $noreg, !"a", {{.*}} fib2.c:2:13
+; CHECK-NEXT: DBG_VALUE [[REG]]:i32, $noreg, !"a", {{.*}} fib2.c:2:13
 ; CHECK: After WebAssembly Register Coloring:
 
 ; ModuleID = 'fib2.bc'
diff --git a/test/DebugInfo/WebAssembly/dbg-value-move.ll b/test/DebugInfo/WebAssembly/dbg-value-move.ll
index 7644b97a7b7..8514f3dcaa7 100644
--- a/test/DebugInfo/WebAssembly/dbg-value-move.ll
+++ b/test/DebugInfo/WebAssembly/dbg-value-move.ll
@@ -3,7 +3,7 @@
 ; CHECK: After WebAssembly Register Stackify:
 ; CHECK: bb.3.for.body.for.body_crit_edge:
 ; CHECK: [[REG:%[0-9]+]]:i32 = nsw ADD_I32 {{.*}} fib.c:7:7
-; CHECK-NEXT: DBG_VALUE debug-use [[REG]]:i32, debug-use $noreg, !"a", {{.*}} fib.c:5:13
+; CHECK-NEXT: DBG_VALUE [[REG]]:i32, $noreg, !"a", {{.*}} fib.c:5:13
 ; CHECK: After WebAssembly Register Coloring:
 
 ; ModuleID = 'fib.bc'
diff --git a/test/DebugInfo/X86/bbjoin.ll b/test/DebugInfo/X86/bbjoin.ll
index b3f20a9b8e3..c175108f384 100644
--- a/test/DebugInfo/X86/bbjoin.ll
+++ b/test/DebugInfo/X86/bbjoin.ll
@@ -11,12 +11,12 @@
 ; }
 ; CHECK: ![[X:.*]] = !DILocalVariable(name: "x",
 ; CHECK: bb.0.entry:
-; CHECK:   DBG_VALUE 23, debug-use $noreg, ![[X]],
-; CHECK:   DBG_VALUE debug-use $rsp, debug-use $noreg, ![[X]], !DIExpression(DW_OP_plus_uconst, 4, DW_OP_deref),
+; CHECK:   DBG_VALUE 23, $noreg, ![[X]],
+; CHECK:   DBG_VALUE $rsp, $noreg, ![[X]], !DIExpression(DW_OP_plus_uconst, 4, DW_OP_deref),
 ; CHECK: bb.1.if.then:
-; CHECK:   DBG_VALUE 43, debug-use $noreg, ![[X]],
+; CHECK:   DBG_VALUE 43, $noreg, ![[X]],
 ; CHECK: bb.2.if.end:
-; CHECK-NOT:  DBG_VALUE 23, debug-use $noreg, ![[X]],
+; CHECK-NOT:  DBG_VALUE 23, $noreg, ![[X]],
 ; CHECK:   RETQ $eax
 
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/test/DebugInfo/X86/live-debug-vars-discard-invalid.mir b/test/DebugInfo/X86/live-debug-vars-discard-invalid.mir
index f9a81cb156a..92fc740b77e 100644
--- a/test/DebugInfo/X86/live-debug-vars-discard-invalid.mir
+++ b/test/DebugInfo/X86/live-debug-vars-discard-invalid.mir
@@ -65,7 +65,7 @@ body:             |
 
   bb.1:
     ; This DBG_VALUE will be discarded (use before def of %0).
-    DBG_VALUE debug-use %0, debug-use $noreg, !18, !DIExpression(), debug-location !25
+    DBG_VALUE %0, $noreg, !18, !DIExpression(), debug-location !25
     %0:gr64 = IMPLICIT_DEF
     %0:gr64 = IMPLICIT_DEF
     %0:gr64 = IMPLICIT_DEF
@@ -73,32 +73,32 @@ body:             |
 
   bb.2:
     ; This DBG_VALUE will be discarded (%1 is defined earlier, but it is not live in, so we do not know where %1 is stored).
-    DBG_VALUE debug-use %1, debug-use $noreg, !18, !DIExpression(), debug-location !25
+    DBG_VALUE %1, $noreg, !18, !DIExpression(), debug-location !25
     %1:gr64 = IMPLICIT_DEF
     %1:gr64 = IMPLICIT_DEF
     %1:gr64 = IMPLICIT_DEF
     %1:gr64 = IMPLICIT_DEF
     ; This DBG_VALUE is kept, even if %1 is dead, it was defined in the prev instruction,
     ; so the value should be available for as long as the register allocated to %1 is live.
-    DBG_VALUE debug-use %1, debug-use $noreg, !18, !DIExpression(), debug-location !25
+    DBG_VALUE %1, $noreg, !18, !DIExpression(), debug-location !25
 
   bb.3:
     %1:gr64 = IMPLICIT_DEF
-    DBG_VALUE 0, debug-use $noreg, !23, !DIExpression(), debug-location !25
+    DBG_VALUE 0, $noreg, !23, !DIExpression(), debug-location !25
     ; This DBG_VALUE is kept, even if %1 is dead, it was defined in the prev non-dbg instruction,
     ; so the value should be available for as long as the register allocated to %1 is live.
-    DBG_VALUE debug-use %1, debug-use $noreg, !18, !DIExpression(), debug-location !25
+    DBG_VALUE %1, $noreg, !18, !DIExpression(), debug-location !25
 
   bb.4:
     ; All DBG_VALUEs here should survive. %2 is livein as it was defined in bb.0, and it has use/def in the BTS64rr instruction.
-    DBG_VALUE debug-use %2, debug-use $noreg, !18, !DIExpression(), debug-location !25
+    DBG_VALUE %2, $noreg, !18, !DIExpression(), debug-location !25
     %2:gr64 = BTS64rr %2, 0, implicit-def $eflags
-    DBG_VALUE 0, debug-use $noreg, !23, !DIExpression(), debug-location !25
-    DBG_VALUE debug-use %2, debug-use $noreg, !18, !DIExpression(), debug-location !25
+    DBG_VALUE 0, $noreg, !23, !DIExpression(), debug-location !25
+    DBG_VALUE %2, $noreg, !18, !DIExpression(), debug-location !25
     %2:gr64 = BTS64rr %2, 0, implicit-def $eflags
-    DBG_VALUE debug-use %2, debug-use $noreg, !18, !DIExpression(), debug-location !25
+    DBG_VALUE %2, $noreg, !18, !DIExpression(), debug-location !25
     %2:gr64 = BTS64rr %2, 0, implicit-def $eflags
-    DBG_VALUE debug-use %2, debug-use $noreg, !18, !DIExpression(), debug-location !25
+    DBG_VALUE %2, $noreg, !18, !DIExpression(), debug-location !25
 
   bb.5:
     RET 0, debug-location !32
@@ -107,29 +107,29 @@ body:             |
 # CHECK-LABEL: name: foobar
 
 # CHECK-LABEL: bb.1:
-# CHECK:        DBG_VALUE debug-use $noreg
+# CHECK:        DBG_VALUE $noreg
 
 # CHECK-LABEL: bb.2:
-# CHECK:        DBG_VALUE debug-use $noreg
+# CHECK:        DBG_VALUE $noreg
 # CHECK-NEXT:   dead renamable $rcx = IMPLICIT_DEF
 # CHECK-NEXT:   dead renamable $rcx = IMPLICIT_DEF
 # CHECK-NEXT:   dead renamable $rcx = IMPLICIT_DEF
 # CHECK-NEXT:   dead renamable $rcx = IMPLICIT_DEF
-# CHECK-NEXT:   DBG_VALUE debug-use $rcx, debug-use $noreg, !18, !DIExpression()
+# CHECK-NEXT:   DBG_VALUE $rcx, $noreg, !18, !DIExpression()
 
 # CHECK-LABEL: bb.3:
 # CHECK:        dead renamable $rcx = IMPLICIT_DEF
-# CHECK-NEXT:   DBG_VALUE 0, debug-use $noreg, !23, !DIExpression()
-# CHECK-NEXT:   DBG_VALUE debug-use $rcx, debug-use $noreg, !18, !DIExpression()
+# CHECK-NEXT:   DBG_VALUE 0, $noreg, !23, !DIExpression()
+# CHECK-NEXT:   DBG_VALUE $rcx, $noreg, !18, !DIExpression()
 
 # CHECK-LABEL: bb.4:
 # CHECK:        liveins: $rax
-# CHECK:        DBG_VALUE debug-use $rax, debug-use $noreg, !18, !DIExpression()
+# CHECK:        DBG_VALUE $rax, $noreg, !18, !DIExpression()
 # CHECK-NEXT:   renamable $rax = BTS64rr killed renamable $rax, 0, implicit-def $eflags
-# CHECK-NEXT:   DBG_VALUE 0, debug-use $noreg, !23, !DIExpression()
-# CHECK-NEXT:   DBG_VALUE debug-use $rax, debug-use $noreg, !18, !DIExpression()
+# CHECK-NEXT:   DBG_VALUE 0, $noreg, !23, !DIExpression()
+# CHECK-NEXT:   DBG_VALUE $rax, $noreg, !18, !DIExpression()
 # CHECK-NEXT:   renamable $rax = BTS64rr killed renamable $rax, 0, implicit-def $eflags
-# CHECK-NEXT:   DBG_VALUE debug-use $rax, debug-use $noreg, !18, !DIExpression()
+# CHECK-NEXT:   DBG_VALUE $rax, $noreg, !18, !DIExpression()
 # CHECK-NEXT:   dead renamable $rax = BTS64rr killed renamable $rax, 0, implicit-def $eflags
 
 # CHECK-LABEL: bb.5:
diff --git a/test/DebugInfo/X86/live-debug-vars-dse.mir b/test/DebugInfo/X86/live-debug-vars-dse.mir
index bf6c71fa0ff..3a82c9d377b 100644
--- a/test/DebugInfo/X86/live-debug-vars-dse.mir
+++ b/test/DebugInfo/X86/live-debug-vars-dse.mir
@@ -134,7 +134,7 @@ body:             |
     $rcx = COPY %1, debug-location !15
     CALL64pcrel32 @escape, csr_win64, implicit $rsp, implicit $ssp, implicit $rcx, implicit-def $rsp, implicit-def $ssp, debug-location !15
     ADJCALLSTACKUP64 32, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp, debug-location !15
-    DBG_VALUE 1, debug-use _, !13, !DIExpression(), debug-location !16
+    DBG_VALUE 1, _, !13, !DIExpression(), debug-location !16
     MOV32mi $rip, 1, _, @global, _, 1, debug-location !17 :: (store 4 into @global)
     DBG_VALUE %stack.0.x.addr, 0, !13, !DIExpression(), debug-location !18
     MOV32mi %stack.0.x.addr, 1, _, 0, _, 2, debug-location !18 :: (store 4 into %ir.x.addr)
diff --git a/test/DebugInfo/X86/live-debug-vars-index.mir b/test/DebugInfo/X86/live-debug-vars-index.mir
index 1a38a101d64..c4ba4950517 100644
--- a/test/DebugInfo/X86/live-debug-vars-index.mir
+++ b/test/DebugInfo/X86/live-debug-vars-index.mir
@@ -40,14 +40,14 @@ tracksRegLiveness: true
 body:             |
   bb.0:
 
-    DBG_VALUE debug-use $esi, debug-use $noreg, !13, !DIExpression(), debug-location !11
+    DBG_VALUE $esi, $noreg, !13, !DIExpression(), debug-location !11
     DBG_LABEL !8, debug-location !9
-    DBG_VALUE debug-use $edi, debug-use $noreg, !10, !DIExpression(), debug-location !11
+    DBG_VALUE $edi, $noreg, !10, !DIExpression(), debug-location !11
     RET 0, undef $eax, debug-location !12
 ...
 
 # CHECK-LABEL: name:            foo
 # CHECK: bb.0:
 # CHECK-DAG: DBG_LABEL
-# CHECK-DAG: DBG_VALUE debug-use $esi
-# CHECK-DAG: DBG_VALUE debug-use $edi
+# CHECK-DAG: DBG_VALUE $esi
+# CHECK-DAG: DBG_VALUE $edi
diff --git a/test/DebugInfo/X86/pr34545.ll b/test/DebugInfo/X86/pr34545.ll
index 8d781157d92..fe5d2a285f5 100644
--- a/test/DebugInfo/X86/pr34545.ll
+++ b/test/DebugInfo/X86/pr34545.ll
@@ -1,13 +1,13 @@
 ; RUN: llc -O1 -filetype=asm -mtriple x86_64-unknown-linux-gnu -mcpu=x86-64 -o - %s -stop-after=livedebugvars | FileCheck %s
 
 ; CHECK: $eax = MOV32rm
-; CHECK: DBG_VALUE debug-use $eax
+; CHECK: DBG_VALUE $eax
 ; CHECK: $eax = SHL32rCL killed renamable $eax
-; CHECK: DBG_VALUE debug-use $eax
-; CHECK: DBG_VALUE debug-use $rsp, 0, !{{[0-9]+}}, !DIExpression(DW_OP_constu, 4, DW_OP_minus)
-; CHECK: DBG_VALUE debug-use $eax
+; CHECK: DBG_VALUE $eax
+; CHECK: DBG_VALUE $rsp, 0, !{{[0-9]+}}, !DIExpression(DW_OP_constu, 4, DW_OP_minus)
+; CHECK: DBG_VALUE $eax
 ; CHECK: $eax = SHL32rCL killed renamable $eax
-; CHECK: DBG_VALUE debug-use $eax
+; CHECK: DBG_VALUE $eax
 ; CHECK: RETQ $eax
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/test/DebugInfo/X86/sdag-combine.ll b/test/DebugInfo/X86/sdag-combine.ll
index 3023ce751c0..c95afdb0cf0 100644
--- a/test/DebugInfo/X86/sdag-combine.ll
+++ b/test/DebugInfo/X86/sdag-combine.ll
@@ -15,7 +15,7 @@ define swiftcc void @g() #0 !dbg !5 {
 entry:
   %0 = alloca %TSb, align 1
   %1 = call swiftcc i1 @f(), !dbg !7
-  ; CHECK: DBG_VALUE debug-use $rcx, debug-use $noreg, !8, !DIExpression(), debug-location !7
+  ; CHECK: DBG_VALUE $rcx, $noreg, !8, !DIExpression(), debug-location !7
   call void @llvm.dbg.value(metadata i1 %1, metadata !8, metadata !DIExpression()), !dbg !7
   %2 = getelementptr inbounds %TSb, %TSb* %0, i32 0, i32 0, !dbg !7
   store i1 %1, i1* %2, align 1, !dbg !7
diff --git a/test/DebugInfo/X86/sdag-dangling-dbgvalue.ll b/test/DebugInfo/X86/sdag-dangling-dbgvalue.ll
index 45eff822474..0266f6108e7 100644
--- a/test/DebugInfo/X86/sdag-dangling-dbgvalue.ll
+++ b/test/DebugInfo/X86/sdag-dangling-dbgvalue.ll
@@ -62,50 +62,50 @@ target triple = "x86_64-apple-macosx10.4.0"
 
 @S = global %struct.SS { i32 23, i32 -17 }, align 4, !dbg !0
 
-; Verify that the def comes before the debug-use for foo1.
+; Verify that the def comes before the for foo1.
 ; TODO: Currently dbg.value for bar1 is dropped(?), is that expected?
 define i32 @test1() local_unnamed_addr #0 !dbg !17 {
 ; CHECK-LABEL: bb.0.entry1
 ; CHECK-NEXT:    [[REG1:%[0-9]+]]:gr64 =
-; CHECK-NEXT:    DBG_VALUE debug-use [[REG1]], debug-use $noreg, ![[FOO1]], !DIExpression()
+; CHECK-NEXT:    DBG_VALUE [[REG1]], $noreg, ![[FOO1]], !DIExpression()
 entry1:
   call void @llvm.dbg.value(metadata %struct.SS* @S, metadata !20, metadata !DIExpression()), !dbg !23
   call void @llvm.dbg.value(metadata %struct.SS* null, metadata !22, metadata !DIExpression()), !dbg !24
   ret i32 ptrtoint (%struct.SS* @S to i32), !dbg !25
 }
 
-; Verify that the def comes before the debug-use for foo2 and bar2.
+; Verify that the def comes before the for foo2 and bar2.
 define i32 @test2() local_unnamed_addr #0 !dbg !26 {
 ; CHECK-LABEL: bb.0.entry2
 ; CHECK-NEXT:    [[REG2:%[0-9]+]]:gr64 =
-; CHECK-NEXT:    DBG_VALUE debug-use [[REG2]], debug-use $noreg, ![[FOO2]], !DIExpression()
-; CHECK-NEXT:    DBG_VALUE debug-use [[REG2]], debug-use $noreg, ![[BAR2]], !DIExpression()
+; CHECK-NEXT:    DBG_VALUE [[REG2]], $noreg, ![[FOO2]], !DIExpression()
+; CHECK-NEXT:    DBG_VALUE [[REG2]], $noreg, ![[BAR2]], !DIExpression()
 entry2:
   call void @llvm.dbg.value(metadata %struct.SS* @S, metadata !28, metadata !DIExpression()), !dbg !30
   call void @llvm.dbg.value(metadata %struct.SS* @S, metadata !29, metadata !DIExpression()), !dbg !31
   ret i32 add (i32 ptrtoint (%struct.SS* @S to i32), i32 ptrtoint (%struct.SS* @S to i32)), !dbg !32
 }
 
-; Verify that the def comes before the debug-use for foo3 and bar3.
+; Verify that the def comes before the for foo3 and bar3.
 define i32 @test3() local_unnamed_addr #0 !dbg !33 {
 ; CHECK-LABEL: bb.0.entry3
 ; CHECK-NEXT:    [[REG3:%[0-9]+]]:gr64 =
-; CHECK-NEXT:    DBG_VALUE debug-use [[REG3]], debug-use $noreg, ![[BAR3]], !DIExpression()
-; CHECK-NEXT:    DBG_VALUE debug-use [[REG3]], debug-use $noreg, ![[FOO3]], !DIExpression()
+; CHECK-NEXT:    DBG_VALUE [[REG3]], $noreg, ![[BAR3]], !DIExpression()
+; CHECK-NEXT:    DBG_VALUE [[REG3]], $noreg, ![[FOO3]], !DIExpression()
 entry3:
   call void @llvm.dbg.value(metadata %struct.SS* @S, metadata !36, metadata !DIExpression()), !dbg !38
   call void @llvm.dbg.value(metadata %struct.SS* @S, metadata !35, metadata !DIExpression()), !dbg !37
   ret i32 add (i32 ptrtoint (%struct.SS* @S to i32), i32 ptrtoint (%struct.SS* @S to i32)), !dbg !39
 }
 
-; Verify that the def comes before the debug-use for bar4.
+; Verify that the def comes before the for bar4.
 ; TODO: Currently dbg.value for foo4 is dropped. It is set to null and not
 ;       used. Just like in test1 it can be discussed if there should be a
 ;       DBG_VALUE for foo4 here.
 define i32 @test4() local_unnamed_addr #0 !dbg !40 {
 ; CHECK-LABEL: bb.0.entry4
 ; CHECK-NEXT:    [[REG4:%[0-9]+]]:gr64 =
-; CHECK-NEXT:    DBG_VALUE debug-use [[REG4]], debug-use $noreg, ![[BAR4]], !DIExpression()
+; CHECK-NEXT:    DBG_VALUE [[REG4]], $noreg, ![[BAR4]], !DIExpression()
 entry4:
   call void @llvm.dbg.value(metadata %struct.SS* @S, metadata !42, metadata !DIExpression()), !dbg !44
   call void @llvm.dbg.value(metadata %struct.SS* @S, metadata !43, metadata !DIExpression()), !dbg !45
@@ -119,8 +119,8 @@ entry4:
 define i32 @test5() local_unnamed_addr #0 !dbg !47 {
 ; CHECK-LABEL: bb.0.entry5:
 ; CHECK-NEXT:    [[REG5:%[0-9]+]]:gr64 =
-; CHECK-NEXT:    DBG_VALUE debug-use [[REG5]], debug-use $noreg, ![[BAR5]], !DIExpression()
-; CHECK-NOT:     DBG_VALUE debug-use [[REG5]], debug-use $noreg, ![[FOO5]], !DIExpression()
+; CHECK-NEXT:    DBG_VALUE [[REG5]], $noreg, ![[BAR5]], !DIExpression()
+; CHECK-NOT:     DBG_VALUE [[REG5]], $noreg, ![[FOO5]], !DIExpression()
 ; CHECK:         RET
 entry5:
   call void @llvm.dbg.value(metadata %struct.SS* @S, metadata !49, metadata !DIExpression()), !dbg !51
diff --git a/test/DebugInfo/X86/sdag-dbgvalue-phi-use-1.ll b/test/DebugInfo/X86/sdag-dbgvalue-phi-use-1.ll
index 1dc51f55249..116e05746f3 100644
--- a/test/DebugInfo/X86/sdag-dbgvalue-phi-use-1.ll
+++ b/test/DebugInfo/X86/sdag-dbgvalue-phi-use-1.ll
@@ -49,7 +49,7 @@ for.body.lr.ph:                                   ; preds = %entry
 for.cond.cleanup:                                 ; preds = %for.body, %entry
 ; CHECK-LABEL: bb.{{.*}}.for.cond.cleanup:
 ; CHECK:      [[REG1:%[0-9]+]]:gr32 = PHI
-; CHECK-NEXT: DBG_VALUE debug-use [[REG1]]
+; CHECK-NEXT: DBG_VALUE [[REG1]]
   %x.0.lcssa = phi i32 [ 9, %entry ], [ %add, %for.body ]
   call void @llvm.dbg.value(metadata i32 %x.0.lcssa, metadata !15, metadata !DIExpression()), !dbg !26
   %2 = bitcast [80 x i32]* %arr to i8*, !dbg !37
@@ -63,9 +63,9 @@ for.body:                                         ; preds = %for.body.lr.ph, %fo
 ; CHECK:      [[REG2:%[0-9]+]]:gr32 = PHI
 ; CHECK-NEXT: [[REG3:%[0-9]+]]:gr32 = PHI
 ; CHECK-NEXT: [[REG4:%[0-9]+]]:gr32 = PHI
-; CHECK-NEXT: DBG_VALUE debug-use [[REG2]]
-; CHECK-NEXT: DBG_VALUE debug-use [[REG3]]
-; CHECK-NEXT: DBG_VALUE debug-use [[REG4]]
+; CHECK-NEXT: DBG_VALUE [[REG2]]
+; CHECK-NEXT: DBG_VALUE [[REG3]]
+; CHECK-NEXT: DBG_VALUE [[REG4]]
   %u.023 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
   %y.022 = phi i32 [ 13, %for.body.lr.ph ], [ %mul, %for.body ]
   %x.021 = phi i32 [ 9, %for.body.lr.ph ], [ %add, %for.body ]
diff --git a/test/DebugInfo/X86/sdag-dbgvalue-phi-use-2.ll b/test/DebugInfo/X86/sdag-dbgvalue-phi-use-2.ll
index 7958dd878f8..6c6a9597b5a 100644
--- a/test/DebugInfo/X86/sdag-dbgvalue-phi-use-2.ll
+++ b/test/DebugInfo/X86/sdag-dbgvalue-phi-use-2.ll
@@ -28,7 +28,7 @@ for.body.lr.ph:                                   ; preds = %entry
 for.cond.cleanup:                                 ; preds = %for.body, %entry
 ; CHECK-LABEL: bb.{{.*}}.for.cond.cleanup:
 ; CHECK:      [[REG1:%[0-9]+]]:gr32 = PHI
-; CHECK-NEXT: DBG_VALUE debug-use [[REG1]]
+; CHECK-NEXT: DBG_VALUE [[REG1]]
   %x.0.lcssa = phi i32 [ 9, %entry ], [ %add, %for.body ]
   call void @llvm.dbg.value(metadata i32 %x.0.lcssa, metadata !15, metadata !DIExpression()), !dbg !26
   %2 = bitcast [80 x i32]* %arr to i8*, !dbg !37
@@ -42,16 +42,16 @@ for.body:                                         ; preds = %for.body.lr.ph, %fo
 ; CHECK:      [[REG2:%[0-9]+]]:gr32 = PHI
 ; CHECK-NEXT: [[REG3:%[0-9]+]]:gr32 = PHI
 ; CHECK-NEXT: [[REG4:%[0-9]+]]:gr32 = PHI
-; CHECK-NEXT: DBG_VALUE debug-use [[REG3]], debug-use $noreg, !16
-; CHECK-NEXT: DBG_VALUE 555, debug-use $noreg, !17
+; CHECK-NEXT: DBG_VALUE [[REG3]], $noreg, !16
+; CHECK-NEXT: DBG_VALUE 555, $noreg, !17
 ; XXX: Shouldn't the following DBG_VALUE be placed after the add (ADD32rr).
-; CHECK-NEXT: DBG_VALUE debug-use [[REG2]], debug-use $noreg, !17
+; CHECK-NEXT: DBG_VALUE [[REG2]], $noreg, !17
 ; CHECK-NEXT: ADD32rr
 ; XXX: Shouldn't the following DBG_VALUE be placed after the mul (LEA etc).
-; CHECK-NEXT: DBG_VALUE 777, debug-use $noreg, !17
+; CHECK-NEXT: DBG_VALUE 777, $noreg, !17
 ; CHECK:      INC32r
 ; XXX: Shouldn't the following DBG_VALUE be placed after the icmp (the non-dead implicit def of $eflags)
-; CHECK:      DBG_VALUE debug-use [[REG4]]
+; CHECK:      DBG_VALUE [[REG4]]
 ; CHECK-NEXT: implicit-def $eflags
   %u.023 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
   %y.022 = phi i32 [ 13, %for.body.lr.ph ], [ %mul, %for.body ]
diff --git a/test/DebugInfo/X86/sdag-dbgvalue-phi-use-3.ll b/test/DebugInfo/X86/sdag-dbgvalue-phi-use-3.ll
index 83fbee5a612..4aa7243c9e9 100644
--- a/test/DebugInfo/X86/sdag-dbgvalue-phi-use-3.ll
+++ b/test/DebugInfo/X86/sdag-dbgvalue-phi-use-3.ll
@@ -65,12 +65,12 @@ for.body:                                         ; preds = %for.body.lr.ph, %fo
 ; CHECK-NEXT: [[REG5:%[0-9]+]]:gr32_nosp = PHI
 ; CHECK-NEXT: [[REG6:%[0-9]+]]:gr32 = PHI
 ; CHECK-NEXT: [[REG7:%[0-9]+]]:gr32 = PHI
-; CHECK-NEXT: DBG_VALUE debug-use [[REG2]], debug-use $noreg, !19, !DIExpression(DW_OP_LLVM_fragment, 0, 32)
-; CHECK-NEXT: DBG_VALUE debug-use [[REG3]], debug-use $noreg, !19, !DIExpression(DW_OP_LLVM_fragment, 32, 32)
-; CHECK-NEXT: DBG_VALUE debug-use [[REG4]], debug-use $noreg, !18, !DIExpression(DW_OP_LLVM_fragment, 0, 32)
-; CHECK-NEXT: DBG_VALUE debug-use [[REG5]], debug-use $noreg, !18, !DIExpression(DW_OP_LLVM_fragment, 32, 32)
-; CHECK-NEXT: DBG_VALUE debug-use [[REG6]], debug-use $noreg, !17, !DIExpression(DW_OP_LLVM_fragment, 0, 32)
-; CHECK-NEXT: DBG_VALUE debug-use [[REG7]], debug-use $noreg, !17, !DIExpression(DW_OP_LLVM_fragment, 32, 32)
+; CHECK-NEXT: DBG_VALUE [[REG2]], $noreg, !19, !DIExpression(DW_OP_LLVM_fragment, 0, 32)
+; CHECK-NEXT: DBG_VALUE [[REG3]], $noreg, !19, !DIExpression(DW_OP_LLVM_fragment, 32, 32)
+; CHECK-NEXT: DBG_VALUE [[REG4]], $noreg, !18, !DIExpression(DW_OP_LLVM_fragment, 0, 32)
+; CHECK-NEXT: DBG_VALUE [[REG5]], $noreg, !18, !DIExpression(DW_OP_LLVM_fragment, 32, 32)
+; CHECK-NEXT: DBG_VALUE [[REG6]], $noreg, !17, !DIExpression(DW_OP_LLVM_fragment, 0, 32)
+; CHECK-NEXT: DBG_VALUE [[REG7]], $noreg, !17, !DIExpression(DW_OP_LLVM_fragment, 32, 32)
   %u.023 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
   %y.022 = phi i64 [ 13, %for.body.lr.ph ], [ %mul, %for.body ]
   %x.021 = phi i64 [ 9, %for.body.lr.ph ], [ %add, %for.body ]
diff --git a/test/DebugInfo/X86/sdag-dbgvalue-phi-use-4.ll b/test/DebugInfo/X86/sdag-dbgvalue-phi-use-4.ll
index 6872c2c9d30..23674ee32cc 100644
--- a/test/DebugInfo/X86/sdag-dbgvalue-phi-use-4.ll
+++ b/test/DebugInfo/X86/sdag-dbgvalue-phi-use-4.ll
@@ -11,11 +11,11 @@
 ; CHECK:      [[REG1:%[0-9]+]]:gr32 = PHI
 ; CHECK-NEXT: [[REG2:%[0-9]+]]:gr32 = PHI
 ; CHECK-NEXT: [[REG3:%[0-9]+]]:gr32 = PHI
-; CHECK-NEXT: DBG_VALUE debug-use [[REG1]], debug-use $noreg, !13, !DIExpression(DW_OP_LLVM_fragment, 0, 32)
-; CHECK-NEXT: DBG_VALUE debug-use [[REG2]], debug-use $noreg, !13, !DIExpression(DW_OP_LLVM_fragment, 32, 32)
-; CHECK-NEXT: DBG_VALUE debug-use [[REG3]], debug-use $noreg, !13, !DIExpression(DW_OP_LLVM_fragment, 64, 16)
-; CHECK-NEXT: DBG_VALUE debug-use [[REG1]], debug-use $noreg, !12, !DIExpression(DW_OP_LLVM_fragment, 10, 32)
-; CHECK-NEXT: DBG_VALUE debug-use [[REG2]], debug-use $noreg, !12, !DIExpression(DW_OP_LLVM_fragment, 42, 13)
+; CHECK-NEXT: DBG_VALUE [[REG1]], $noreg, !13, !DIExpression(DW_OP_LLVM_fragment, 0, 32)
+; CHECK-NEXT: DBG_VALUE [[REG2]], $noreg, !13, !DIExpression(DW_OP_LLVM_fragment, 32, 32)
+; CHECK-NEXT: DBG_VALUE [[REG3]], $noreg, !13, !DIExpression(DW_OP_LLVM_fragment, 64, 16)
+; CHECK-NEXT: DBG_VALUE [[REG1]], $noreg, !12, !DIExpression(DW_OP_LLVM_fragment, 10, 32)
+; CHECK-NEXT: DBG_VALUE [[REG2]], $noreg, !12, !DIExpression(DW_OP_LLVM_fragment, 42, 13)
 ; CHECK-NOT:  DBG_VALUE
 
 target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
diff --git a/test/DebugInfo/X86/sdag-legalize-multires.ll b/test/DebugInfo/X86/sdag-legalize-multires.ll
index 0ceace388e9..f0db358b542 100644
--- a/test/DebugInfo/X86/sdag-legalize-multires.ll
+++ b/test/DebugInfo/X86/sdag-legalize-multires.ll
@@ -21,10 +21,10 @@ entry:
   %0 = call float @llvm.cos.f32(float 1.500000e+00), !dbg !13
   ; CHECK: $xmm1 = MOVAPSrr $xmm0
   call void @llvm.dbg.value(metadata float %0, metadata !15, metadata !DIExpression()), !dbg !13
-  ; CHECK: DBG_VALUE debug-use {{.*}}$xmm1, {{.*}}, ![[RSIN]], !DIExpression(),
+  ; CHECK: DBG_VALUE {{.*}}$xmm1, {{.*}}, ![[RSIN]], !DIExpression(),
   %1 = call float @llvm.sin.f32(float 1.500000e+00), !dbg !13
   call void @llvm.dbg.value(metadata float %1, metadata !11, metadata !DIExpression()), !dbg !13
-  ; CHECK: DBG_VALUE debug-use {{.*}}$xmm0, {{.*}}, ![[RCOS]], !DIExpression(),
+  ; CHECK: DBG_VALUE {{.*}}$xmm0, {{.*}}, ![[RCOS]], !DIExpression(),
   call void @g(float %0, float %1), !dbg !13
   ret void, !dbg !13
 }
diff --git a/test/DebugInfo/X86/sdag-salvage-add.ll b/test/DebugInfo/X86/sdag-salvage-add.ll
index f3f129e9bea..fda9b33d43b 100644
--- a/test/DebugInfo/X86/sdag-salvage-add.ll
+++ b/test/DebugInfo/X86/sdag-salvage-add.ll
@@ -24,9 +24,9 @@
 ;
 ; CHECK:   ![[S4:.*]] = !DILocalVariable(name: "s4", 
 ; CHECK:   ![[MYVAR:.*]] = !DILocalVariable(name: "myVar", 
-; CHECK:      DBG_VALUE debug-use $rax, debug-use $noreg, ![[MYVAR]],
+; CHECK:      DBG_VALUE $rax, $noreg, ![[MYVAR]],
 ; CHECK-SAME:           !DIExpression(DW_OP_plus_uconst, 4096, DW_OP_stack_value)
-; CHECK-NEXT: DBG_VALUE debug-use $rax, debug-use $noreg, ![[S4]],
+; CHECK-NEXT: DBG_VALUE $rax, $noreg, ![[S4]],
 ; CHECK-SAME:           !DIExpression(DW_OP_plus_uconst, 4096, DW_OP_stack_value)
 ; CHECK-NEXT: $rdi = MOV64rm killed renamable $rax, 1, $noreg, 4096, $noreg,
 
diff --git a/test/DebugInfo/X86/sdagsplit-1.ll b/test/DebugInfo/X86/sdagsplit-1.ll
index a2e02d8ad8a..87eb3b10c32 100644
--- a/test/DebugInfo/X86/sdagsplit-1.ll
+++ b/test/DebugInfo/X86/sdagsplit-1.ll
@@ -13,8 +13,8 @@
 ;      return 0;
 ;    }
 ;
-; CHECK-DAG: DBG_VALUE debug-use ${{[a-z]+}}, debug-use $noreg, !{{[0-9]+}}, !DIExpression(DW_OP_LLVM_fragment, 0, 32), debug-location !{{[0-9]+}}
-; CHECK-DAG: DBG_VALUE debug-use ${{[a-z]+}}, debug-use $noreg, !{{[0-9]+}}, !DIExpression(DW_OP_LLVM_fragment, 32, 32), debug-location !{{[0-9]+}}
+; CHECK-DAG: DBG_VALUE ${{[a-z]+}}, $noreg, !{{[0-9]+}}, !DIExpression(DW_OP_LLVM_fragment, 0, 32), debug-location !{{[0-9]+}}
+; CHECK-DAG: DBG_VALUE ${{[a-z]+}}, $noreg, !{{[0-9]+}}, !DIExpression(DW_OP_LLVM_fragment, 32, 32), debug-location !{{[0-9]+}}
 
 ; ModuleID = 'sdagsplit-1.c'
 target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
-- 
GitLab


From 1c6bfcc50c346e44d3a1c1a9b0e6da159e503362 Mon Sep 17 00:00:00 2001
From: Saleem Abdulrasool <compnerd@compnerd.org>
Date: Tue, 30 Oct 2018 23:45:27 +0000
Subject: [PATCH 0775/1116] DWARFVerifier: make the verifier more comprehensive
 for objects

Make the code do what was mentioned in the comment: only skip the CU types.
This enables the lexical blocks to be verified as well.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345675 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/DebugInfo/DWARF/DWARFVerifier.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/lib/DebugInfo/DWARF/DWARFVerifier.cpp
index fdb71958cc6..f3b242c47d7 100644
--- a/lib/DebugInfo/DWARF/DWARFVerifier.cpp
+++ b/lib/DebugInfo/DWARF/DWARFVerifier.cpp
@@ -414,7 +414,7 @@ unsigned DWARFVerifier::verifyDieRanges(const DWARFDie &Die,
   // For now, simply elide the range verification for the CU DIEs if we are
   // processing an object file.
 
-  if (!IsObjectFile || IsMachOObject || Die.getTag() == DW_TAG_subprogram) {
+  if (!IsObjectFile || IsMachOObject || Die.getTag() != DW_TAG_compile_unit) {
     for (auto Range : Ranges) {
       if (!Range.valid()) {
         ++NumErrors;
-- 
GitLab


From b3bc95870d94ab6a58a997ba3fb4ac4ed37f861a Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze@braunis.de>
Date: Wed, 31 Oct 2018 00:23:23 +0000
Subject: [PATCH 0776/1116] ADT/STLExtras: Introduce llvm::empty; NFC

This is modeled after C++17 std::empty().

Differential Revision: https://reviews.llvm.org/D53909

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345679 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/ADT/STLExtras.h                   |  6 ++++++
 lib/Analysis/LazyCallGraph.cpp                 |  2 +-
 lib/CodeGen/GlobalISel/InstructionSelector.cpp |  2 +-
 lib/CodeGen/GlobalISel/LegalizerInfo.cpp       |  3 +--
 lib/CodeGen/GlobalISel/RegBankSelect.cpp       |  2 +-
 lib/CodeGen/GlobalISel/RegisterBankInfo.cpp    |  2 +-
 lib/ExecutionEngine/Orc/ExecutionUtils.cpp     |  2 +-
 lib/IR/DebugInfo.cpp                           |  2 +-
 lib/Transforms/IPO/PartialInlining.cpp         |  4 ++--
 lib/Transforms/Scalar/NewGVN.cpp               |  2 +-
 lib/Transforms/Utils/PredicateInfo.cpp         |  4 ++--
 lib/Transforms/Utils/SimplifyCFG.cpp           |  2 +-
 unittests/ADT/STLExtrasTest.cpp                | 17 +++++++++++++++++
 13 files changed, 36 insertions(+), 14 deletions(-)

diff --git a/include/llvm/ADT/STLExtras.h b/include/llvm/ADT/STLExtras.h
index c209c4aede9..4a93ee55e76 100644
--- a/include/llvm/ADT/STLExtras.h
+++ b/include/llvm/ADT/STLExtras.h
@@ -195,6 +195,12 @@ void adl_swap(T &&lhs, T &&rhs) noexcept(
   adl_detail::adl_swap(std::forward<T>(lhs), std::forward<T>(rhs));
 }
 
+/// Test whether \p RangeOrContainer is empty. Similar to C++17 std::empty.
+template <typename T>
+constexpr bool empty(const T &RangeOrContainer) {
+  return adl_begin(RangeOrContainer) == adl_end(RangeOrContainer);
+}
+
 // mapped_iterator - This is a simple iterator adapter that causes a function to
 // be applied whenever operator* is invoked on the iterator.
 
diff --git a/lib/Analysis/LazyCallGraph.cpp b/lib/Analysis/LazyCallGraph.cpp
index b1d585bfc68..3f22ada803c 100644
--- a/lib/Analysis/LazyCallGraph.cpp
+++ b/lib/Analysis/LazyCallGraph.cpp
@@ -619,7 +619,7 @@ LazyCallGraph::RefSCC::switchInternalEdgeToCall(
 
   // If the merge range is empty, then adding the edge didn't actually form any
   // new cycles. We're done.
-  if (MergeRange.begin() == MergeRange.end()) {
+  if (empty(MergeRange)) {
     // Now that the SCC structure is finalized, flip the kind to call.
     SourceN->setEdgeKind(TargetN, Edge::Call);
     return false; // No new cycle.
diff --git a/lib/CodeGen/GlobalISel/InstructionSelector.cpp b/lib/CodeGen/GlobalISel/InstructionSelector.cpp
index 5e77fcbb0ed..38913e4afcb 100644
--- a/lib/CodeGen/GlobalISel/InstructionSelector.cpp
+++ b/lib/CodeGen/GlobalISel/InstructionSelector.cpp
@@ -80,5 +80,5 @@ bool InstructionSelector::isObviouslySafeToFold(MachineInstr &MI,
     return true;
 
   return !MI.mayLoadOrStore() && !MI.hasUnmodeledSideEffects() &&
-         MI.implicit_operands().begin() == MI.implicit_operands().end();
+         empty(MI.implicit_operands());
 }
diff --git a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
index b6ed2654bd0..ca776de0a0f 100644
--- a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
+++ b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
@@ -298,8 +298,7 @@ LegalizeRuleSet &LegalizerInfo::getActionDefinitionsBuilder(
     std::initializer_list<unsigned> Opcodes) {
   unsigned Representative = *Opcodes.begin();
 
-  assert(Opcodes.begin() != Opcodes.end() &&
-         Opcodes.begin() + 1 != Opcodes.end() &&
+  assert(!empty(Opcodes) && Opcodes.begin() + 1 != Opcodes.end() &&
          "Initializer list must have at least two opcodes");
 
   for (auto I = Opcodes.begin() + 1, E = Opcodes.end(); I != E; ++I)
diff --git a/lib/CodeGen/GlobalISel/RegBankSelect.cpp b/lib/CodeGen/GlobalISel/RegBankSelect.cpp
index 9e2d48d1dc4..6bb48dc2e8a 100644
--- a/lib/CodeGen/GlobalISel/RegBankSelect.cpp
+++ b/lib/CodeGen/GlobalISel/RegBankSelect.cpp
@@ -140,7 +140,7 @@ bool RegBankSelect::repairReg(
     return false;
   assert(ValMapping.NumBreakDowns == 1 && "Not yet implemented");
   // An empty range of new register means no repairing.
-  assert(NewVRegs.begin() != NewVRegs.end() && "We should not have to repair");
+  assert(!empty(NewVRegs) && "We should not have to repair");
 
   // Assume we are repairing a use and thus, the original reg will be
   // the source of the repairing.
diff --git a/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp b/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp
index dd15567ef1c..28404e52d6e 100644
--- a/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp
+++ b/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp
@@ -426,7 +426,7 @@ void RegisterBankInfo::applyDefaultMapping(const OperandsMapper &OpdMapper) {
            "This mapping is too complex for this function");
     iterator_range<SmallVectorImpl<unsigned>::const_iterator> NewRegs =
         OpdMapper.getVRegs(OpIdx);
-    if (NewRegs.begin() == NewRegs.end()) {
+    if (empty(NewRegs)) {
       LLVM_DEBUG(dbgs() << " has not been repaired, nothing to be done\n");
       continue;
     }
diff --git a/lib/ExecutionEngine/Orc/ExecutionUtils.cpp b/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
index d9ff07efbe9..7c3c50b4d6e 100644
--- a/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
+++ b/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
@@ -88,7 +88,7 @@ iterator_range<CtorDtorIterator> getDestructors(const Module &M) {
 }
 
 void CtorDtorRunner::add(iterator_range<CtorDtorIterator> CtorDtors) {
-  if (CtorDtors.begin() == CtorDtors.end())
+  if (empty(CtorDtors))
     return;
 
   MangleAndInterner Mangle(
diff --git a/lib/IR/DebugInfo.cpp b/lib/IR/DebugInfo.cpp
index e5fb765f778..02b7953cb5b 100644
--- a/lib/IR/DebugInfo.cpp
+++ b/lib/IR/DebugInfo.cpp
@@ -280,7 +280,7 @@ bool DebugInfoFinder::addScope(DIScope *Scope) {
 }
 
 static MDNode *stripDebugLocFromLoopID(MDNode *N) {
-  assert(N->op_begin() != N->op_end() && "Missing self reference?");
+  assert(!empty(N->operands()) && "Missing self reference?");
 
   // if there is no debug location, we do not have to rewrite this MDNode.
   if (std::none_of(N->op_begin() + 1, N->op_end(), [](const MDOperand &Op) {
diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp
index 11c4bbc437c..bcb19af85b2 100644
--- a/lib/Transforms/IPO/PartialInlining.cpp
+++ b/lib/Transforms/IPO/PartialInlining.cpp
@@ -1251,7 +1251,7 @@ std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function *F) {
   if (PSI->isFunctionEntryCold(F))
     return {false, nullptr};
 
-  if (F->user_begin() == F->user_end())
+  if (empty(F->users()))
     return {false, nullptr};
 
   OptimizationRemarkEmitter ORE(F);
@@ -1357,7 +1357,7 @@ bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
     return false;
   }
 
-  assert(Cloner.OrigFunc->user_begin() == Cloner.OrigFunc->user_end() &&
+  assert(empty(Cloner.OrigFunc->users()) &&
          "F's users should all be replaced!");
 
   std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp
index f5c1493781d..cd57ebd0c6f 100644
--- a/lib/Transforms/Scalar/NewGVN.cpp
+++ b/lib/Transforms/Scalar/NewGVN.cpp
@@ -1751,7 +1751,7 @@ NewGVN::performSymbolicPHIEvaluation(ArrayRef<ValPair> PHIOps,
     return true;
   });
   // If we are left with no operands, it's dead.
-  if (Filtered.begin() == Filtered.end()) {
+  if (empty(Filtered)) {
     // If it has undef at this point, it means there are no-non-undef arguments,
     // and thus, the value of the phi node must be undef.
     if (HasUndef) {
diff --git a/lib/Transforms/Utils/PredicateInfo.cpp b/lib/Transforms/Utils/PredicateInfo.cpp
index 9d9624850fb..585ce6b4c11 100644
--- a/lib/Transforms/Utils/PredicateInfo.cpp
+++ b/lib/Transforms/Utils/PredicateInfo.cpp
@@ -522,7 +522,7 @@ Value *PredicateInfo::materializeStack(unsigned int &Counter,
     if (isa<PredicateWithEdge>(ValInfo)) {
       IRBuilder<> B(getBranchTerminator(ValInfo));
       Function *IF = getCopyDeclaration(F.getParent(), Op->getType());
-      if (IF->user_begin() == IF->user_end())
+      if (empty(IF->users()))
         CreatedDeclarations.insert(IF);
       CallInst *PIC =
           B.CreateCall(IF, Op, Op->getName() + "." + Twine(Counter++));
@@ -534,7 +534,7 @@ Value *PredicateInfo::materializeStack(unsigned int &Counter,
              "Should not have gotten here without it being an assume");
       IRBuilder<> B(PAssume->AssumeInst);
       Function *IF = getCopyDeclaration(F.getParent(), Op->getType());
-      if (IF->user_begin() == IF->user_end())
+      if (empty(IF->users()))
         CreatedDeclarations.insert(IF);
       CallInst *PIC = B.CreateCall(IF, Op);
       PredicateMap.insert({PIC, ValInfo});
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index dd0d441a4da..849f9ee1d19 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -5260,7 +5260,7 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
 
   // Figure out the corresponding result for each case value and phi node in the
   // common destination, as well as the min and max case values.
-  assert(SI->case_begin() != SI->case_end());
+  assert(!empty(SI->cases()));
   SwitchInst::CaseIt CI = SI->case_begin();
   ConstantInt *MinCaseVal = CI->getCaseValue();
   ConstantInt *MaxCaseVal = CI->getCaseValue();
diff --git a/unittests/ADT/STLExtrasTest.cpp b/unittests/ADT/STLExtrasTest.cpp
index 427d470b61d..80e10d071ab 100644
--- a/unittests/ADT/STLExtrasTest.cpp
+++ b/unittests/ADT/STLExtrasTest.cpp
@@ -364,6 +364,23 @@ TEST(STLExtrasTest, ADLTest) {
   EXPECT_EQ(5, count);
 }
 
+TEST(STLExtrasTest, EmptyTest) {
+  std::vector<void*> V;
+  EXPECT_TRUE(empty(V));
+  V.push_back(nullptr);
+  EXPECT_FALSE(empty(V));
+
+  std::initializer_list<int> E = {};
+  std::initializer_list<int> NotE = {7, 13, 42};
+  EXPECT_TRUE(empty(E));
+  EXPECT_FALSE(empty(NotE));
+
+  auto R0 = make_range(V.begin(), V.begin());
+  EXPECT_TRUE(empty(R0));
+  auto R1 = make_range(V.begin(), V.end());
+  EXPECT_FALSE(empty(R1));
+}
+
 TEST(STLExtrasTest, EarlyIncrementTest) {
   std::list<int> L = {1, 2, 3, 4};
 
-- 
GitLab


From 72e5d0dd3b20f90576a7e35d8f1801578203b8af Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Wed, 31 Oct 2018 00:31:02 +0000
Subject: [PATCH 0777/1116] Don't duplicate function/class name at the
 beginning of the comment. NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345681 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Transforms/Utils/Cloning.h | 36 +++++++++++--------------
 1 file changed, 16 insertions(+), 20 deletions(-)

diff --git a/include/llvm/Transforms/Utils/Cloning.h b/include/llvm/Transforms/Utils/Cloning.h
index e4d6053b70b..d7dce53fc76 100644
--- a/include/llvm/Transforms/Utils/Cloning.h
+++ b/include/llvm/Transforms/Utils/Cloning.h
@@ -49,7 +49,6 @@ class ProfileSummaryInfo;
 class ReturnInst;
 
 /// Return an exact copy of the specified module
-///
 std::unique_ptr<Module> CloneModule(const Module &M);
 std::unique_ptr<Module> CloneModule(const Module &M, ValueToValueMapTy &VMap);
 
@@ -61,17 +60,15 @@ std::unique_ptr<Module>
 CloneModule(const Module &M, ValueToValueMapTy &VMap,
             function_ref<bool(const GlobalValue *)> ShouldCloneDefinition);
 
-/// ClonedCodeInfo - This struct can be used to capture information about code
+/// This struct can be used to capture information about code
 /// being cloned, while it is being cloned.
 struct ClonedCodeInfo {
-  /// ContainsCalls - This is set to true if the cloned code contains a normal
-  /// call instruction.
+  /// This is set to true if the cloned code contains a normal call instruction.
   bool ContainsCalls = false;
 
-  /// ContainsDynamicAllocas - This is set to true if the cloned code contains
-  /// a 'dynamic' alloca.  Dynamic allocas are allocas that are either not in
-  /// the entry block or they are in the entry block but are not a constant
-  /// size.
+  /// This is set to true if the cloned code contains a 'dynamic' alloca.
+  /// Dynamic allocas are allocas that are either not in the entry block or they
+  /// are in the entry block but are not a constant size.
   bool ContainsDynamicAllocas = false;
 
   /// All cloned call sites that have operand bundles attached are appended to
@@ -82,7 +79,7 @@ struct ClonedCodeInfo {
   ClonedCodeInfo() = default;
 };
 
-/// CloneBasicBlock - Return a copy of the specified basic block, but without
+/// Return a copy of the specified basic block, but without
 /// embedding the block into a particular function.  The block returned is an
 /// exact copy of the specified basic block, without any remapping having been
 /// performed.  Because of this, this is only suitable for applications where
@@ -109,13 +106,12 @@ struct ClonedCodeInfo {
 /// If you would like to collect additional information about the cloned
 /// function, you can specify a ClonedCodeInfo object with the optional fifth
 /// parameter.
-///
 BasicBlock *CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap,
                             const Twine &NameSuffix = "", Function *F = nullptr,
                             ClonedCodeInfo *CodeInfo = nullptr,
                             DebugInfoFinder *DIFinder = nullptr);
 
-/// CloneFunction - Return a copy of the specified function and add it to that
+/// Return a copy of the specified function and add it to that
 /// function's module.  Also, any references specified in the VMap are changed
 /// to refer to their mapped value instead of the original one.  If any of the
 /// arguments to the function are in the VMap, the arguments are deleted from
@@ -154,7 +150,7 @@ void CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
                                const char *NameSuffix = "",
                                ClonedCodeInfo *CodeInfo = nullptr);
 
-/// CloneAndPruneFunctionInto - This works exactly like CloneFunctionInto,
+/// This works exactly like CloneFunctionInto,
 /// except that it does some simple constant prop and DCE on the fly.  The
 /// effect of this is to copy significantly less code in cases where (for
 /// example) a function call with constant arguments is inlined, and those
@@ -172,8 +168,8 @@ void CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc,
                                ClonedCodeInfo *CodeInfo = nullptr,
                                Instruction *TheCall = nullptr);
 
-/// InlineFunctionInfo - This class captures the data input to the
-/// InlineFunction call, and records the auxiliary results produced by it.
+/// This class captures the data input to the InlineFunction call, and records
+/// the auxiliary results produced by it.
 class InlineFunctionInfo {
 public:
   explicit InlineFunctionInfo(CallGraph *cg = nullptr,
@@ -185,19 +181,19 @@ public:
       : CG(cg), GetAssumptionCache(GetAssumptionCache), PSI(PSI),
         CallerBFI(CallerBFI), CalleeBFI(CalleeBFI) {}
 
-  /// CG - If non-null, InlineFunction will update the callgraph to reflect the
+  /// If non-null, InlineFunction will update the callgraph to reflect the
   /// changes it makes.
   CallGraph *CG;
   std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
   ProfileSummaryInfo *PSI;
   BlockFrequencyInfo *CallerBFI, *CalleeBFI;
 
-  /// StaticAllocas - InlineFunction fills this in with all static allocas that
-  /// get copied into the caller.
+  /// InlineFunction fills this in with all static allocas that get copied into
+  /// the caller.
   SmallVector<AllocaInst *, 4> StaticAllocas;
 
-  /// InlinedCalls - InlineFunction fills this in with callsites that were
-  /// inlined from the callee.  This is only filled in if CG is non-null.
+  /// InlineFunction fills this in with callsites that were inlined from the
+  /// callee. This is only filled in if CG is non-null.
   SmallVector<WeakTrackingVH, 8> InlinedCalls;
 
   /// All of the new call sites inlined into the caller.
@@ -214,7 +210,7 @@ public:
   }
 };
 
-/// InlineFunction - This function inlines the called function into the basic
+/// This function inlines the called function into the basic
 /// block of the caller.  This returns false if it is not possible to inline
 /// this call.  The program is still in a well defined state if this occurs
 /// though.
-- 
GitLab


From f087eba4103aa6f6a91790c8a5e98902434b78e8 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Wed, 31 Oct 2018 00:31:06 +0000
Subject: [PATCH 0778/1116] Use the container form llvm::sort(C)

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345682 91177308-0d34-0410-b5e6-96231b3b80d8
---
 utils/TableGen/RegisterInfoEmitter.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/TableGen/RegisterInfoEmitter.cpp b/utils/TableGen/RegisterInfoEmitter.cpp
index 7c38dc55e81..ded54c828bc 100644
--- a/utils/TableGen/RegisterInfoEmitter.cpp
+++ b/utils/TableGen/RegisterInfoEmitter.cpp
@@ -296,7 +296,7 @@ EmitRegUnitPressure(raw_ostream &OS, const CodeGenRegBank &RegBank,
            PSetE = PSetIDs.end(); PSetI != PSetE; ++PSetI) {
       PSets[i].push_back(RegBank.getRegPressureSet(*PSetI).Order);
     }
-    llvm::sort(PSets[i].begin(), PSets[i].end());
+    llvm::sort(PSets[i]);
     PSetsSeqs.add(PSets[i]);
   }
 
-- 
GitLab


From 0fecaffa30522bb4dc0f191eb7b4b6af9f6825f5 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Wed, 31 Oct 2018 00:31:06 +0000
Subject: [PATCH 0779/1116] Use llvm::any_of instead std::any_of. NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345683 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/IR/SafepointIRVerifier.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lib/IR/SafepointIRVerifier.cpp b/lib/IR/SafepointIRVerifier.cpp
index d2102138d79..3596b31dd25 100644
--- a/lib/IR/SafepointIRVerifier.cpp
+++ b/lib/IR/SafepointIRVerifier.cpp
@@ -257,8 +257,7 @@ static bool containsGCPtrType(Type *Ty) {
   if (ArrayType *AT = dyn_cast<ArrayType>(Ty))
     return containsGCPtrType(AT->getElementType());
   if (StructType *ST = dyn_cast<StructType>(Ty))
-    return std::any_of(ST->subtypes().begin(), ST->subtypes().end(),
-                       containsGCPtrType);
+    return llvm::any_of(ST->subtypes(), containsGCPtrType);
   return false;
 }
 
-- 
GitLab


From fe0f4354daf9844b4d40e2ef66cf6ebb96880e33 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Wed, 31 Oct 2018 00:31:07 +0000
Subject: [PATCH 0780/1116] [llvm-objcopy] Delete a redundant override whose
 base is empty

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345684 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-objcopy/ELF/Object.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tools/llvm-objcopy/ELF/Object.h b/tools/llvm-objcopy/ELF/Object.h
index 38ef21ffec9..4aa3125f26c 100644
--- a/tools/llvm-objcopy/ELF/Object.h
+++ b/tools/llvm-objcopy/ELF/Object.h
@@ -571,7 +571,6 @@ public:
   void setFlagWord(ELF::Elf32_Word W) { FlagWord = W; }
   void addMember(SectionBase *Sec) { GroupMembers.push_back(Sec); }
 
-  void initialize(SectionTableRef SecTable) override{};
   void accept(SectionVisitor &) const override;
   void finalize() override;
   void removeSymbols(function_ref<bool(const Symbol &)> ToRemove) override;
-- 
GitLab


From daecf945cc43952a482a30a234faae59b41c3789 Mon Sep 17 00:00:00 2001
From: Wolfgang Pieb <Wolfgang.Pieb@sony.com>
Date: Wed, 31 Oct 2018 01:12:58 +0000
Subject: [PATCH 0781/1116] [DWARF] Revert r345546: Refactor range list
 extraction and dumping

This patch caused some internal tests to break which are being investigated.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345687 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/DebugInfo/DWARF/DWARFContext.h   |   8 +-
 .../DebugInfo/DWARF/DWARFDebugRangeList.h     |  85 +++++++++
 .../llvm/DebugInfo/DWARF/DWARFDebugRnglists.h |  38 +---
 include/llvm/DebugInfo/DWARF/DWARFListTable.h | 165 +++++-------------
 include/llvm/DebugInfo/DWARF/DWARFUnit.h      |   7 +
 lib/DebugInfo/DWARF/CMakeLists.txt            |   1 +
 lib/DebugInfo/DWARF/DWARFContext.cpp          |  74 ++++----
 lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp   |  96 ++++++++++
 lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp    |  96 +++-------
 lib/DebugInfo/DWARF/DWARFDie.cpp              |   1 +
 lib/DebugInfo/DWARF/DWARFListTable.cpp        |  74 +++-----
 lib/DebugInfo/DWARF/DWARFUnit.cpp             | 101 +++++------
 tools/dsymutil/DwarfLinker.cpp                |  23 ++-
 tools/dsymutil/DwarfStreamer.cpp              |  19 +-
 tools/dsymutil/DwarfStreamer.h                |   4 +-
 15 files changed, 398 insertions(+), 394 deletions(-)
 create mode 100644 include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h
 create mode 100644 lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp

diff --git a/include/llvm/DebugInfo/DWARF/DWARFContext.h b/include/llvm/DebugInfo/DWARF/DWARFContext.h
index 13bcdd25c32..221f1f79698 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFContext.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFContext.h
@@ -231,16 +231,16 @@ public:
   /// Get a DIE given an exact offset.
   DWARFDie getDIEForOffset(uint32_t Offset);
 
-  unsigned getMaxVersion(uint16_t DefaultVersion = 0) {
+  unsigned getMaxVersion() {
     // Ensure info units have been parsed to discover MaxVersion
     info_section_units();
-    return MaxVersion ? MaxVersion : DefaultVersion;
+    return MaxVersion;
   }
 
-  unsigned getMaxDWOVersion(uint16_t DefaultVersion = 0) {
+  unsigned getMaxDWOVersion() {
     // Ensure DWO info units have been parsed to discover MaxVersion
     dwo_info_section_units();
-    return MaxVersion ? MaxVersion : DefaultVersion;
+    return MaxVersion;
   }
 
   void setMaxVersionIfGreater(unsigned Version) {
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h b/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h
new file mode 100644
index 00000000000..bc26edf0064
--- /dev/null
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h
@@ -0,0 +1,85 @@
+//===- DWARFDebugRangeList.h ------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_DWARF_DWARFDEBUGRANGELIST_H
+#define LLVM_DEBUGINFO_DWARF_DWARFDEBUGRANGELIST_H
+
+#include "llvm/DebugInfo/DWARF/DWARFAddressRange.h"
+#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h"
+#include <cassert>
+#include <cstdint>
+#include <vector>
+
+namespace llvm {
+
+class raw_ostream;
+
+class DWARFDebugRangeList {
+public:
+  struct RangeListEntry {
+    /// A beginning address offset. This address offset has the size of an
+    /// address and is relative to the applicable base address of the
+    /// compilation unit referencing this range list. It marks the beginning
+    /// of an address range.
+    uint64_t StartAddress;
+    /// An ending address offset. This address offset again has the size of
+    /// an address and is relative to the applicable base address of the
+    /// compilation unit referencing this range list. It marks the first
+    /// address past the end of the address range. The ending address must
+    /// be greater than or equal to the beginning address.
+    uint64_t EndAddress;
+    /// A section index this range belongs to.
+    uint64_t SectionIndex;
+
+    /// The end of any given range list is marked by an end of list entry,
+    /// which consists of a 0 for the beginning address offset
+    /// and a 0 for the ending address offset.
+    bool isEndOfListEntry() const {
+      return (StartAddress == 0) && (EndAddress == 0);
+    }
+
+    /// A base address selection entry consists of:
+    /// 1. The value of the largest representable address offset
+    /// (for example, 0xffffffff when the size of an address is 32 bits).
+    /// 2. An address, which defines the appropriate base address for
+    /// use in interpreting the beginning and ending address offsets of
+    /// subsequent entries of the location list.
+    bool isBaseAddressSelectionEntry(uint8_t AddressSize) const {
+      assert(AddressSize == 4 || AddressSize == 8);
+      if (AddressSize == 4)
+        return StartAddress == -1U;
+      else
+        return StartAddress == -1ULL;
+    }
+  };
+
+private:
+  /// Offset in .debug_ranges section.
+  uint32_t Offset;
+  uint8_t AddressSize;
+  std::vector<RangeListEntry> Entries;
+
+public:
+  DWARFDebugRangeList() { clear(); }
+
+  void clear();
+  void dump(raw_ostream &OS) const;
+  Error extract(const DWARFDataExtractor &data, uint32_t *offset_ptr);
+  const std::vector<RangeListEntry> &getEntries() { return Entries; }
+
+  /// getAbsoluteRanges - Returns absolute address ranges defined by this range
+  /// list. Has to be passed base address of the compile unit referencing this
+  /// range list.
+  DWARFAddressRangesVector
+  getAbsoluteRanges(llvm::Optional<SectionedAddress> BaseAddr) const;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_DEBUGINFO_DWARF_DWARFDEBUGRANGELIST_H
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugRnglists.h b/include/llvm/DebugInfo/DWARF/DWARFDebugRnglists.h
index 1f4b7717e23..5cc8d789e59 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFDebugRnglists.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugRnglists.h
@@ -13,8 +13,8 @@
 #include "llvm/ADT/Optional.h"
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/DebugInfo/DIContext.h"
-#include "llvm/DebugInfo/DWARF/DWARFAddressRange.h"
 #include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h"
 #include "llvm/DebugInfo/DWARF/DWARFListTable.h"
 #include <cstdint>
 #include <map>
@@ -22,8 +22,6 @@
 
 namespace llvm {
 
-struct BaseAddress;
-class DWARFContext;
 class Error;
 class raw_ostream;
 class DWARFUnit;
@@ -37,30 +35,12 @@ struct RangeListEntry : public DWARFListEntryBase {
   uint64_t Value0;
   uint64_t Value1;
 
-  Error extract(DWARFDataExtractor Data, uint32_t End, uint16_t Version,
-                StringRef SectionName, uint32_t *OffsetPtr, bool isDWO = false);
-  bool isEndOfList() const { return EntryKind == dwarf::DW_RLE_end_of_list; }
-  bool isBaseAddressSelectionEntry() const {
-    return EntryKind == dwarf::DW_RLE_base_address;
-  }
-  uint64_t getStartAddress() const {
-    assert((EntryKind == dwarf::DW_RLE_start_end ||
-            EntryKind == dwarf::DW_RLE_offset_pair ||
-            EntryKind == dwarf::DW_RLE_startx_length) &&
-           "Unexpected range list entry kind");
-    return Value0;
-  }
-  uint64_t getEndAddress() const {
-    assert((EntryKind == dwarf::DW_RLE_start_end ||
-            EntryKind == dwarf::DW_RLE_offset_pair) &&
-           "Unexpected range list entry kind");
-    return Value1;
-  }
-  void dump(raw_ostream &OS, DWARFContext *C, uint8_t AddrSize, 
-            uint64_t &CurrentBase, unsigned Indent, uint16_t Version,
-            uint8_t MaxEncodingStringLength, DIDumpOptions DumpOpts,
+  Error extract(DWARFDataExtractor Data, uint32_t End, uint32_t *OffsetPtr);
+  void dump(raw_ostream &OS, uint8_t AddrSize, uint8_t MaxEncodingStringLength,
+            uint64_t &CurrentBase, DIDumpOptions DumpOpts,
             llvm::function_ref<Optional<SectionedAddress>(uint32_t)>
                 LookupPooledAddress) const;
+  bool isSentinel() const { return EntryKind == dwarf::DW_RLE_end_of_list; }
 };
 
 /// A class representing a single rangelist.
@@ -74,12 +54,10 @@ public:
 
 class DWARFDebugRnglistTable : public DWARFListTableBase<DWARFDebugRnglist> {
 public:
-  DWARFDebugRnglistTable(DWARFContext *C, StringRef SectionName,
-                         bool isDWO = false)
-      : DWARFListTableBase(C, SectionName, isDWO,
+  DWARFDebugRnglistTable()
+      : DWARFListTableBase(/* SectionName    = */ ".debug_rnglists",
                            /* HeaderString   = */ "ranges:",
-                           /* ListTypeString = */ "range",
-                           dwarf::RangeListEncodingString) {}
+                           /* ListTypeString = */ "range") {}
 };
 
 } // end namespace llvm
diff --git a/include/llvm/DebugInfo/DWARF/DWARFListTable.h b/include/llvm/DebugInfo/DWARF/DWARFListTable.h
index 66a96dfd610..9b987314f20 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFListTable.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFListTable.h
@@ -23,8 +23,6 @@
 
 namespace llvm {
 
-class DWARFContext;
-
 /// A base class for DWARF list entries, such as range or location list
 /// entries.
 struct DWARFListEntryBase {
@@ -39,7 +37,6 @@ struct DWARFListEntryBase {
 /// A base class for lists of entries that are extracted from a particular
 /// section, such as range lists or location lists.
 template <typename ListEntryType> class DWARFListType {
-public:
   using EntryType = ListEntryType;
   using ListEntries = std::vector<EntryType>;
 
@@ -48,26 +45,11 @@ protected:
 
 public:
   const ListEntries &getEntries() const { return Entries; }
-  bool empty() const {
-    return Entries.empty() || Entries.begin()->isEndOfList();
-  }
+  bool empty() const { return Entries.empty(); }
   void clear() { Entries.clear(); }
-  uint32_t getOffset() const {
-    if (Entries.empty())
-      return 0;
-    return Entries.begin()->Offset;
-  }
-
-  /// Extract a list. The caller must pass the correct DWARF version.
-  /// The end-of-list entry is retained as the last element of the vector of
-  /// entries.
   Error extract(DWARFDataExtractor Data, uint32_t HeaderOffset, uint32_t End,
-                uint16_t Version, uint32_t *OffsetPtr, StringRef SectionName,
-                StringRef ListStringName, bool isDWO = false);
-  void dump(raw_ostream &OS, DWARFContext *C, uint8_t AddressSize,
-            uint64_t BaseAddress, unsigned Indent, uint16_t Version,
-            size_t MaxEncodingStringLength,
-            DIDumpOptions DumpOpts, llvm::function_ref<Optional<SectionedAddress>(uint32_t)> LookupPooledAddress) const;
+                uint32_t *OffsetPtr, StringRef SectionName,
+                StringRef ListStringName);
 };
 
 /// A class representing the header of a list table such as the range list
@@ -85,9 +67,9 @@ class DWARFListTableHeader {
     uint8_t AddrSize;
     /// The size in bytes of a segment selector on the target architecture.
     /// If the target system uses a flat address space, this value is 0.
-    uint8_t SegSize = 0;
+    uint8_t SegSize;
     /// The number of offsets that follow the header before the range lists.
-    uint32_t OffsetEntryCount = 0;
+    uint32_t OffsetEntryCount;
   };
 
   Header HeaderData;
@@ -96,10 +78,10 @@ class DWARFListTableHeader {
   /// FIXME: Generate the table and use the appropriate forms.
   std::vector<uint32_t> Offsets;
   /// The table's format, either DWARF32 or DWARF64.
-  dwarf::DwarfFormat Format = dwarf::DwarfFormat::DWARF32;
+  dwarf::DwarfFormat Format;
   /// The offset at which the header (and hence the table) is located within
   /// its section.
-  uint32_t HeaderOffset = 0;
+  uint32_t HeaderOffset;
   /// The name of the section the list is located in.
   StringRef SectionName;
   /// A characterization of the list for dumping purposes, e.g. "range" or
@@ -115,19 +97,9 @@ public:
     Offsets.clear();
   }
   uint32_t getHeaderOffset() const { return HeaderOffset; }
-
   uint8_t getAddrSize() const { return HeaderData.AddrSize; }
-  void setAddrSize(uint8_t AddrSize) { HeaderData.AddrSize = AddrSize; }
-
   uint32_t getLength() const { return HeaderData.Length; }
-  void setLength(uint32_t Length) { HeaderData.Length = Length; }
-
   uint16_t getVersion() const { return HeaderData.Version; }
-  void setVersion(uint16_t Version) { HeaderData.Version = Version; }
-
-  uint8_t getSegSize() const { return HeaderData.SegSize; }
-  uint32_t getOffsetEntryCount() const { return HeaderData.OffsetEntryCount; }
-
   StringRef getSectionName() const { return SectionName; }
   StringRef getListTypeString() const { return ListTypeString; }
   dwarf::DwarfFormat getFormat() const { return Format; }
@@ -144,10 +116,8 @@ public:
 
   /// Returns the length of the table, including the length field, or 0 if the
   /// length has not been determined (e.g. because the table has not yet been
-  /// parsed, or there was a problem in parsing). In fake tables, such as for
-  /// DWARF v4 and earlier, there is no header, so the length simply reflects
-  /// the size of the section.
-  uint32_t getTableLength() const;
+  /// parsed, or there was a problem in parsing).
+  uint32_t length() const;
 };
 
 /// A class representing a table of lists as specified in the DWARF v5
@@ -160,22 +130,14 @@ template <typename DWARFListType> class DWARFListTableBase {
   /// A mapping between file offsets and lists. It is used to find a particular
   /// list based on an offset (obtained from DW_AT_ranges, for example).
   std::map<uint32_t, DWARFListType> ListMap;
-  DWARFContext *Ctx;
-  /// True if this list is located in a split-DWARF (dwo or dwp) file.
-  bool isDWO;
   /// This string is displayed as a heading before the list is dumped
   /// (e.g. "ranges:").
   StringRef HeaderString;
-  /// A function returning the encoding string for a given list entry encoding,
-  /// e.g. "DW_RLE_start_end".
-  std::function<StringRef(unsigned)> EncodingString;
 
 protected:
-  DWARFListTableBase(DWARFContext *C, StringRef SectionName, bool isDWO,
-                     StringRef HeaderString, StringRef ListTypeString,
-                     std::function<StringRef(unsigned)> EncodingString)
-      : Header(SectionName, ListTypeString), Ctx(C), isDWO(isDWO),
-        HeaderString(HeaderString), EncodingString(EncodingString) {}
+  DWARFListTableBase(StringRef SectionName, StringRef HeaderString,
+                     StringRef ListTypeString)
+      : Header(SectionName, ListTypeString), HeaderString(HeaderString) {}
 
 public:
   void clear() {
@@ -186,28 +148,14 @@ public:
   Error extractHeaderAndOffsets(DWARFDataExtractor Data, uint32_t *OffsetPtr) {
     return Header.extract(Data, OffsetPtr);
   }
-
-  /// Initialize the table header to explicit values. This is used for DWARF v4
-  /// and earlier since there is no header that can be extracted from a section.
-  void setHeaderData(uint32_t Length, uint16_t Version, uint8_t AddrSize) {
-    assert(Header.getSegSize() == 0 &&
-           "Unexpected segsize in list table header.");
-    assert(Header.getOffsetEntryCount() == 0 &&
-           "Unexpected offset entry count in list table header.");
-    Header.setLength(Length);
-    Header.setVersion(Version);
-    Header.setAddrSize(AddrSize);
-  }
-
   /// Extract an entire table, including all list entries.
-  Error extract(DWARFDataExtractor Data, uint16_t Version, uint32_t *OffsetPtr);
+  Error extract(DWARFDataExtractor Data, uint32_t *OffsetPtr);
   /// Look up a list based on a given offset. Extract it and enter it into the
   /// list map if necessary.
   Expected<DWARFListType> findList(DWARFDataExtractor Data, uint32_t Offset);
 
   uint32_t getHeaderOffset() const { return Header.getHeaderOffset(); }
   uint8_t getAddrSize() const { return Header.getAddrSize(); }
-  StringRef getListTypeString() const { return Header.getListTypeString(); }
 
   void dump(raw_ostream &OS,
             llvm::function_ref<Optional<SectionedAddress>(uint32_t)>
@@ -231,35 +179,25 @@ public:
     llvm_unreachable("Invalid DWARF format (expected DWARF32 or DWARF64");
   }
 
-  uint16_t getVersion() const { return Header.getVersion(); }
-  uint32_t getLength() const { return Header.getTableLength(); }
+  uint32_t length() { return Header.length(); }
 };
 
 template <typename DWARFListType>
 Error DWARFListTableBase<DWARFListType>::extract(DWARFDataExtractor Data,
-                                                 uint16_t Version,
                                                  uint32_t *OffsetPtr) {
-  assert(Version > 0 && "DWARF version required and not given.");
   clear();
-  // For DWARF v4 and earlier, we cannot extract a table header, so we
-  // initialize it explicitly.
-  if (Version < 5)
-    setHeaderData(Data.size(), Version, Data.getAddressSize());
-  else if (Error E = extractHeaderAndOffsets(Data, OffsetPtr))
+  if (Error E = extractHeaderAndOffsets(Data, OffsetPtr))
     return E;
 
   Data.setAddressSize(Header.getAddrSize());
-  uint32_t End = getHeaderOffset() + getLength();
-  // Extract all lists.
+  uint32_t End = getHeaderOffset() + Header.length();
   while (*OffsetPtr < End) {
     DWARFListType CurrentList;
     uint32_t Off = *OffsetPtr;
-    if (Error E = CurrentList.extract(
-            Data, getHeaderOffset(), End, Header.getVersion(), OffsetPtr,
-            Header.getSectionName(), Header.getListTypeString(), isDWO)) {
-      *OffsetPtr = End;
+    if (Error E = CurrentList.extract(Data, getHeaderOffset(), End, OffsetPtr,
+                                      Header.getSectionName(),
+                                      Header.getListTypeString()))
       return E;
-    }
     ListMap[Off] = CurrentList;
   }
 
@@ -270,25 +208,22 @@ Error DWARFListTableBase<DWARFListType>::extract(DWARFDataExtractor Data,
 }
 
 template <typename ListEntryType>
-Error DWARFListType<ListEntryType>::extract(
-    DWARFDataExtractor Data, uint32_t HeaderOffset, uint32_t End,
-    uint16_t Version, uint32_t *OffsetPtr, StringRef SectionName,
-    StringRef ListTypeString, bool isDWO) {
+Error DWARFListType<ListEntryType>::extract(DWARFDataExtractor Data,
+                                            uint32_t HeaderOffset, uint32_t End,
+                                            uint32_t *OffsetPtr,
+                                            StringRef SectionName,
+                                            StringRef ListTypeString) {
   if (*OffsetPtr < HeaderOffset || *OffsetPtr >= End)
     return createStringError(errc::invalid_argument,
                        "invalid %s list offset 0x%" PRIx32,
                        ListTypeString.data(), *OffsetPtr);
   Entries.clear();
-  uint32_t StartingOffset = *OffsetPtr;
   while (*OffsetPtr < End) {
     ListEntryType Entry;
-    if (Error E =
-            Entry.extract(Data, End, Version, SectionName, OffsetPtr, isDWO))
+    if (Error E = Entry.extract(Data, End, OffsetPtr))
       return E;
-    if (Version < 5)
-      Entry.Offset = StartingOffset;
     Entries.push_back(Entry);
-    if (Entry.isEndOfList())
+    if (Entry.isSentinel())
       return Error::success();
   }
   return createStringError(errc::illegal_byte_sequence,
@@ -297,47 +232,31 @@ Error DWARFListType<ListEntryType>::extract(
                      SectionName.data(), HeaderOffset);
 }
 
-template <typename ListEntryType>
-void DWARFListType<ListEntryType>::dump(raw_ostream &OS, DWARFContext *C,
-                                        uint8_t AddressSize,
-                                        uint64_t BaseAddress, unsigned Indent,
-                                        uint16_t Version,
-                                        size_t MaxEncodingStringLength,
-                                        DIDumpOptions DumpOpts,
-                                        llvm::function_ref<Optional<SectionedAddress>(uint32_t)> LookupPooledAddress) const {
-  uint64_t CurrentBase = BaseAddress;
-  for (const auto &Entry : Entries)
-    Entry.dump(OS, C, AddressSize, CurrentBase, Indent, Version,
-               MaxEncodingStringLength, DumpOpts, LookupPooledAddress);
-}
-
 template <typename DWARFListType>
 void DWARFListTableBase<DWARFListType>::dump(
     raw_ostream &OS,
     llvm::function_ref<Optional<SectionedAddress>(uint32_t)>
         LookupPooledAddress,
     DIDumpOptions DumpOpts) const {
+  Header.dump(OS, DumpOpts);
+  OS << HeaderString << "\n";
+
   // Determine the length of the longest encoding string we have in the table,
   // so we can align the output properly. We only need this in verbose mode.
   size_t MaxEncodingStringLength = 0;
-  // Don't dump the fake table header we create for DWARF v4 and earlier.
-  if (Header.getVersion() > 4) {
-    Header.dump(OS, DumpOpts);
-    OS << HeaderString << '\n';
-    // Determine the length of the longest encoding string we have in the table,
-    // so we can align the output properly. We only need this in verbose mode.
-    if (DumpOpts.Verbose)
-      for (const auto &List : ListMap)
-        for (const auto &Entry : List.second.getEntries())
-          MaxEncodingStringLength = std::max(
-              MaxEncodingStringLength, EncodingString(Entry.EntryKind).size());
+  if (DumpOpts.Verbose) {
+    for (const auto &List : ListMap)
+      for (const auto &Entry : List.second.getEntries())
+        MaxEncodingStringLength =
+            std::max(MaxEncodingStringLength,
+                     dwarf::RangeListEncodingString(Entry.EntryKind).size());
   }
 
   uint64_t CurrentBase = 0;
   for (const auto &List : ListMap)
-    List.second.dump(OS, Ctx, getAddrSize(), CurrentBase, 0,
-                     Header.getVersion(), MaxEncodingStringLength, DumpOpts,
-                     LookupPooledAddress);
+    for (const auto &Entry : List.second.getEntries())
+      Entry.dump(OS, getAddrSize(), MaxEncodingStringLength, CurrentBase,
+                 DumpOpts, LookupPooledAddress);
 }
 
 template <typename DWARFListType>
@@ -350,11 +269,11 @@ DWARFListTableBase<DWARFListType>::findList(DWARFDataExtractor Data,
 
   // Extract the list from the section and enter it into the list map.
   DWARFListType List;
-  uint32_t End = getHeaderOffset() + getLength();
+  uint32_t End = getHeaderOffset() + Header.length();
   uint32_t StartingOffset = Offset;
-  if (Error E = List.extract(Data, getHeaderOffset(), End, Header.getVersion(),
-                             &Offset, Header.getSectionName(),
-                             Header.getListTypeString(), isDWO))
+  if (Error E =
+          List.extract(Data, getHeaderOffset(), End, &Offset,
+                       Header.getSectionName(), Header.getListTypeString()))
     return std::move(E);
   ListMap[StartingOffset] = List;
   return List;
diff --git a/include/llvm/DebugInfo/DWARF/DWARFUnit.h b/include/llvm/DebugInfo/DWARF/DWARFUnit.h
index eb4a198dd03..c3252157b0b 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFUnit.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFUnit.h
@@ -17,6 +17,7 @@
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h"
 #include "llvm/DebugInfo/DWARF/DWARFDebugRnglists.h"
 #include "llvm/DebugInfo/DWARF/DWARFDie.h"
 #include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
@@ -311,6 +312,12 @@ public:
     return DataExtractor(StringSection, false, 0);
   }
 
+  /// Extract the range list referenced by this compile unit from the
+  /// .debug_ranges section. If the extraction is unsuccessful, an error
+  /// is returned. Successful extraction requires that the compile unit
+  /// has already been extracted.
+  Error extractRangeList(uint32_t RangeListOffset,
+                         DWARFDebugRangeList &RangeList) const;
   void clear();
 
   const Optional<StrOffsetsContributionDescriptor> &
diff --git a/lib/DebugInfo/DWARF/CMakeLists.txt b/lib/DebugInfo/DWARF/CMakeLists.txt
index 437c845718d..b4770e561f7 100644
--- a/lib/DebugInfo/DWARF/CMakeLists.txt
+++ b/lib/DebugInfo/DWARF/CMakeLists.txt
@@ -15,6 +15,7 @@ add_llvm_library(LLVMDebugInfoDWARF
   DWARFDebugLoc.cpp
   DWARFDebugMacro.cpp
   DWARFDebugPubTable.cpp
+  DWARFDebugRangeList.cpp
   DWARFDebugRnglists.cpp
   DWARFDie.cpp
   DWARFExpression.cpp
diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp
index 3a0f52753b0..a29c9c2f160 100644
--- a/lib/DebugInfo/DWARF/DWARFContext.cpp
+++ b/lib/DebugInfo/DWARF/DWARFContext.cpp
@@ -25,6 +25,7 @@
 #include "llvm/DebugInfo/DWARF/DWARFDebugLoc.h"
 #include "llvm/DebugInfo/DWARF/DWARFDebugMacro.h"
 #include "llvm/DebugInfo/DWARF/DWARFDebugPubTable.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h"
 #include "llvm/DebugInfo/DWARF/DWARFDebugRnglists.h"
 #include "llvm/DebugInfo/DWARF/DWARFDie.h"
 #include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
@@ -267,31 +268,26 @@ static void dumpAddrSection(raw_ostream &OS, DWARFDataExtractor &AddrData,
   }
 }
 
-// Dump a section that contains a sequence of tables of lists, such as range
-// or location list tables. In DWARF v5 we expect to find properly formatted
-// tables with headers. In DWARF v4 and earlier we simply expect a sequence of
-// lists, which we treat, mutatis mutandis, like DWARF v5 tables.
-template <typename ListTable>
+// Dump the .debug_rnglists or .debug_rnglists.dwo section (DWARF v5).
 static void
-dumpListSection(raw_ostream &OS, DWARFContext *C, StringRef SectionName,
-                uint16_t MaxVersion, DWARFDataExtractor &ListData,
-                llvm::function_ref<Optional<SectionedAddress>(uint32_t)>
-                    LookupPooledAddress,
-                DIDumpOptions DumpOpts, bool isDWO = false) {
+dumpRnglistsSection(raw_ostream &OS, DWARFDataExtractor &rnglistData,
+                    llvm::function_ref<Optional<SectionedAddress>(uint32_t)>
+                        LookupPooledAddress,
+                    DIDumpOptions DumpOpts) {
   uint32_t Offset = 0;
-  while (ListData.isValidOffset(Offset)) {
-    ListTable Table(C, SectionName, isDWO);
-    if (Error Err = Table.extract(ListData, MaxVersion, &Offset)) {
+  while (rnglistData.isValidOffset(Offset)) {
+    llvm::DWARFDebugRnglistTable Rnglists;
+    uint32_t TableOffset = Offset;
+    if (Error Err = Rnglists.extract(rnglistData, &Offset)) {
       WithColor::error() << toString(std::move(Err)) << '\n';
-      // If table extraction set Offset to 0, it indicates that we cannot
-      // continue to read the section.
-      if (Offset == 0)
+      uint64_t Length = Rnglists.length();
+      // Keep going after an error, if we can, assuming that the length field
+      // could be read. If it couldn't, stop reading the section.
+      if (Length == 0)
         break;
-      // In DWARF v4 and earlier, dump as much of the lists as we can.
-      if (MaxVersion < 5)
-        Table.dump(OS, LookupPooledAddress, DumpOpts);
+      Offset = TableOffset + Length;
     } else {
-      Table.dump(OS, LookupPooledAddress, DumpOpts);
+      Rnglists.dump(OS, LookupPooledAddress, DumpOpts);
     }
   }
 }
@@ -512,6 +508,22 @@ void DWARFContext::dump(
     dumpAddrSection(OS, AddrData, DumpOpts, getMaxVersion(), getCUAddrSize());
   }
 
+  if (shouldDump(Explicit, ".debug_ranges", DIDT_ID_DebugRanges,
+                 DObj->getRangeSection().Data)) {
+    uint8_t savedAddressByteSize = getCUAddrSize();
+    DWARFDataExtractor rangesData(*DObj, DObj->getRangeSection(),
+                                  isLittleEndian(), savedAddressByteSize);
+    uint32_t offset = 0;
+    DWARFDebugRangeList rangeList;
+    while (rangesData.isValidOffset(offset)) {
+      if (Error E = rangeList.extract(rangesData, &offset)) {
+        WithColor::error() << toString(std::move(E)) << '\n';
+        break;
+      }
+      rangeList.dump(OS);
+    }
+  }
+
   auto LookupPooledAddress = [&](uint32_t Index) -> Optional<SectionedAddress> {
     const auto &CUs = compile_units();
     auto I = CUs.begin();
@@ -520,32 +532,18 @@ void DWARFContext::dump(
     return (*I)->getAddrOffsetSectionItem(Index);
   };
 
-  if (shouldDump(Explicit, ".debug_ranges", DIDT_ID_DebugRanges,
-                 DObj->getRangeSection().Data)) {
-    uint8_t savedAddressByteSize = getCUAddrSize();
-    DWARFDataExtractor rangesData(*DObj, DObj->getRangeSection(),
-                                  isLittleEndian(), savedAddressByteSize);
-    dumpListSection<DWARFDebugRnglistTable>(OS, this, ".debug_ranges",
-                                            /* MaxVersion = */ 4, rangesData,
-                                            LookupPooledAddress, DumpOpts);
-  }
-
   if (shouldDump(Explicit, ".debug_rnglists", DIDT_ID_DebugRnglists,
                  DObj->getRnglistsSection().Data)) {
     DWARFDataExtractor RnglistData(*DObj, DObj->getRnglistsSection(),
-                                   isLittleEndian(), getCUAddrSize());
-    dumpListSection<DWARFDebugRnglistTable>(OS, this, ".debug_rnglists",
-                                            getMaxVersion(5), RnglistData,
-                                            LookupPooledAddress, DumpOpts);
+                                   isLittleEndian(), 0);
+    dumpRnglistsSection(OS, RnglistData, LookupPooledAddress, DumpOpts);
   }
 
   if (shouldDump(ExplicitDWO, ".debug_rnglists.dwo", DIDT_ID_DebugRnglists,
                  DObj->getRnglistsDWOSection().Data)) {
     DWARFDataExtractor RnglistData(*DObj, DObj->getRnglistsDWOSection(),
-                                   isLittleEndian(), getCUAddrSize());
-    dumpListSection<DWARFDebugRnglistTable>(OS, this, ".debug_rnglists.dwo",
-                                            getMaxVersion(5), RnglistData,
-                                            LookupPooledAddress, DumpOpts);
+                                   isLittleEndian(), 0);
+    dumpRnglistsSection(OS, RnglistData, LookupPooledAddress, DumpOpts);
   }
 
   if (shouldDump(Explicit, ".debug_pubnames", DIDT_ID_DebugPubnames,
diff --git a/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp b/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp
new file mode 100644
index 00000000000..dfb913000a4
--- /dev/null
+++ b/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp
@@ -0,0 +1,96 @@
+//===- DWARFDebugRangesList.cpp -------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h"
+#include "llvm/DebugInfo/DWARF/DWARFContext.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cinttypes>
+#include <cstdint>
+
+using namespace llvm;
+
+void DWARFDebugRangeList::clear() {
+  Offset = -1U;
+  AddressSize = 0;
+  Entries.clear();
+}
+
+Error DWARFDebugRangeList::extract(const DWARFDataExtractor &data,
+                                   uint32_t *offset_ptr) {
+  clear();
+  if (!data.isValidOffset(*offset_ptr))
+    return createStringError(errc::invalid_argument,
+                       "invalid range list offset 0x%" PRIx32, *offset_ptr);
+
+  AddressSize = data.getAddressSize();
+  if (AddressSize != 4 && AddressSize != 8)
+    return createStringError(errc::invalid_argument,
+                       "invalid address size: %" PRIu8, AddressSize);
+  Offset = *offset_ptr;
+  while (true) {
+    RangeListEntry Entry;
+    Entry.SectionIndex = -1ULL;
+
+    uint32_t prev_offset = *offset_ptr;
+    Entry.StartAddress = data.getRelocatedAddress(offset_ptr);
+    Entry.EndAddress =
+        data.getRelocatedAddress(offset_ptr, &Entry.SectionIndex);
+
+    // Check that both values were extracted correctly.
+    if (*offset_ptr != prev_offset + 2 * AddressSize) {
+      clear();
+      return createStringError(errc::invalid_argument,
+                         "invalid range list entry at offset 0x%" PRIx32,
+                         prev_offset);
+    }
+    if (Entry.isEndOfListEntry())
+      break;
+    Entries.push_back(Entry);
+  }
+  return Error::success();
+}
+
+void DWARFDebugRangeList::dump(raw_ostream &OS) const {
+  for (const RangeListEntry &RLE : Entries) {
+    const char *format_str = (AddressSize == 4
+                              ? "%08x %08"  PRIx64 " %08"  PRIx64 "\n"
+                              : "%08x %016" PRIx64 " %016" PRIx64 "\n");
+    OS << format(format_str, Offset, RLE.StartAddress, RLE.EndAddress);
+  }
+  OS << format("%08x <End of list>\n", Offset);
+}
+
+DWARFAddressRangesVector DWARFDebugRangeList::getAbsoluteRanges(
+    llvm::Optional<SectionedAddress> BaseAddr) const {
+  DWARFAddressRangesVector Res;
+  for (const RangeListEntry &RLE : Entries) {
+    if (RLE.isBaseAddressSelectionEntry(AddressSize)) {
+      BaseAddr = {RLE.EndAddress, RLE.SectionIndex};
+      continue;
+    }
+
+    DWARFAddressRange E;
+    E.LowPC = RLE.StartAddress;
+    E.HighPC = RLE.EndAddress;
+    E.SectionIndex = RLE.SectionIndex;
+    // Base address of a range list entry is determined by the closest preceding
+    // base address selection entry in the same range list. It defaults to the
+    // base address of the compilation unit if there is no such entry.
+    if (BaseAddr) {
+      E.LowPC += BaseAddr->Address;
+      E.HighPC += BaseAddr->Address;
+      if (E.SectionIndex == -1ULL)
+        E.SectionIndex = BaseAddr->SectionIndex;
+    }
+    Res.push_back(E);
+  }
+  return Res;
+}
diff --git a/lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp b/lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp
index 737603bc88c..cb5fb0d49da 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp
@@ -13,30 +13,19 @@
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/Format.h"
-#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
 
 Error RangeListEntry::extract(DWARFDataExtractor Data, uint32_t End,
-                              uint16_t Version, StringRef /* SectionName */,
-                              uint32_t *OffsetPtr, bool /* isDWO */) {
+                              uint32_t *OffsetPtr) {
   Offset = *OffsetPtr;
   SectionIndex = -1ULL;
-
-  assert((Data.getAddressSize() == 4 || Data.getAddressSize() == 8) &&
-         "Unsupported address size");
-
-  // We model a DWARF v4 range list entry like DWARF v5 DW_RLE_offset_pair,
-  // since it is subject to base adjustment.
-  uint8_t Encoding = dwarf::DW_RLE_offset_pair;
-  if (Version > 4) {
-    // The caller should guarantee that we have at least 1 byte available, so
-    // we just assert instead of revalidate.
-    assert(*OffsetPtr < End &&
-           "not enough space to extract a rangelist encoding");
-    Encoding = Data.getU8(OffsetPtr);
-  }
+  // The caller should guarantee that we have at least 1 byte available, so
+  // we just assert instead of revalidate.
+  assert(*OffsetPtr < End &&
+         "not enough space to extract a rangelist encoding");
+  uint8_t Encoding = Data.getU8(OffsetPtr);
 
   switch (Encoding) {
   case dwarf::DW_RLE_end_of_list:
@@ -72,23 +61,6 @@ Error RangeListEntry::extract(DWARFDataExtractor Data, uint32_t End,
     break;
   }
   case dwarf::DW_RLE_offset_pair: {
-    if (Version < 5) {
-      if ((End - *OffsetPtr) < unsigned(Data.getAddressSize() * 2))
-        return createStringError(
-            errc::illegal_byte_sequence,
-            "invalid range list entry at offset 0x%" PRIx32, *OffsetPtr);
-      Value0 = Data.getRelocatedAddress(OffsetPtr);
-      Value1 = Data.getRelocatedAddress(OffsetPtr, &SectionIndex);
-      // Adjust the EntryKind for end-of-list and base_address based on the
-      // contents.
-      if (Value0 == maxUIntN(Data.getAddressSize() * 8)) {
-        Encoding = dwarf::DW_RLE_base_address;
-        Value0 = Value1;
-        Value1 = 0;
-      } else if (Value0 == 0 && Value1 == 0)
-        Encoding = dwarf::DW_RLE_end_of_list;
-      break;
-    }
     uint32_t PreviousOffset = *OffsetPtr - 1;
     Value0 = Data.getULEB128(OffsetPtr);
     Value1 = Data.getULEB128(OffsetPtr);
@@ -99,7 +71,7 @@ Error RangeListEntry::extract(DWARFDataExtractor Data, uint32_t End,
                          PreviousOffset);
     break;
   }
-  case dwarf::DW_RLE_base_address:
+  case dwarf::DW_RLE_base_address: {
     if ((End - *OffsetPtr) < Data.getAddressSize())
       return createStringError(errc::invalid_argument,
                          "insufficient space remaining in table for "
@@ -107,16 +79,18 @@ Error RangeListEntry::extract(DWARFDataExtractor Data, uint32_t End,
                          *OffsetPtr - 1);
     Value0 = Data.getRelocatedAddress(OffsetPtr, &SectionIndex);
     break;
-  case dwarf::DW_RLE_start_end:
+  }
+  case dwarf::DW_RLE_start_end: {
     if ((End - *OffsetPtr) < unsigned(Data.getAddressSize() * 2))
       return createStringError(errc::invalid_argument,
                          "insufficient space remaining in table for "
                          "DW_RLE_start_end encoding "
                          "at offset 0x%" PRIx32,
                          *OffsetPtr - 1);
-    Value0 = Data.getRelocatedAddress(OffsetPtr);
-    Value1 = Data.getRelocatedAddress(OffsetPtr, &SectionIndex);
+    Value0 = Data.getRelocatedAddress(OffsetPtr, &SectionIndex);
+    Value1 = Data.getRelocatedAddress(OffsetPtr);
     break;
+  }
   case dwarf::DW_RLE_start_length: {
     uint32_t PreviousOffset = *OffsetPtr - 1;
     Value0 = Data.getRelocatedAddress(OffsetPtr, &SectionIndex);
@@ -199,9 +173,8 @@ DWARFDebugRnglist::getAbsoluteRanges(llvm::Optional<SectionedAddress> BaseAddr,
 }
 
 void RangeListEntry::dump(
-    raw_ostream &OS, DWARFContext *, uint8_t AddrSize, uint64_t &CurrentBase,
-    unsigned Indent, uint16_t Version, uint8_t MaxEncodingStringLength,
-    DIDumpOptions DumpOpts,
+    raw_ostream &OS, uint8_t AddrSize, uint8_t MaxEncodingStringLength,
+    uint64_t &CurrentBase, DIDumpOptions DumpOpts,
     llvm::function_ref<Optional<SectionedAddress>(uint32_t)>
         LookupPooledAddress) const {
   auto PrintRawEntry = [](raw_ostream &OS, const RangeListEntry &Entry,
@@ -214,34 +187,21 @@ void RangeListEntry::dump(
     }
   };
 
-  // Output indentations before we print the actual entry. We only print
-  // anything for DW_RLE_base_address when we are in verbose mode.
-  if (Version < 5 || DumpOpts.Verbose || !isBaseAddressSelectionEntry())
-    OS.indent(Indent);
-
-  // Always print the section offset in DWARF v4 and earlier.
-  if (Version < 5) {
-    OS << format("%08x", Offset);
-    DumpOpts.Verbose = false;
-  }
-
   if (DumpOpts.Verbose) {
     // Print the section offset in verbose mode.
     OS << format("0x%8.8" PRIx32 ":", Offset);
-    if (Version > 4) {
-      auto EncodingString = dwarf::RangeListEncodingString(EntryKind);
-      // Unsupported encodings should have been reported during parsing.
-      assert(!EncodingString.empty() && "Unknown range entry encoding");
-      OS << format(" [%s%*c", EncodingString.data(),
-                   MaxEncodingStringLength - EncodingString.size() + 1, ']');
-      if (!isEndOfList())
-        OS << ": ";
-    }
+    auto EncodingString = dwarf::RangeListEncodingString(EntryKind);
+    // Unsupported encodings should have been reported during parsing.
+    assert(!EncodingString.empty() && "Unknown range entry encoding");
+    OS << format(" [%s%*c", EncodingString.data(),
+                 MaxEncodingStringLength - EncodingString.size() + 1, ']');
+    if (EntryKind != dwarf::DW_RLE_end_of_list)
+      OS << ": ";
   }
 
   switch (EntryKind) {
   case dwarf::DW_RLE_end_of_list:
-    OS << (DumpOpts.Verbose ? "" : " <End of list>");
+    OS << (DumpOpts.Verbose ? "" : "<End of list>");
     break;
     //  case dwarf::DW_RLE_base_addressx:
   case dwarf::DW_RLE_base_addressx: {
@@ -257,13 +217,6 @@ void RangeListEntry::dump(
   case dwarf::DW_RLE_base_address:
     // In non-verbose mode we do not print anything for this entry.
     CurrentBase = Value0;
-    if (Version < 5) {
-      // Dump the entry in pre-DWARF v5 format, i.e. with a -1 as Value0.
-      uint64_t allOnes = maxUIntN(AddrSize * 8);
-      OS << format(" %*.*" PRIx64, AddrSize * 2, AddrSize * 2, allOnes);
-      OS << format(" %*.*" PRIx64, AddrSize * 2, AddrSize * 2, Value0);
-      break;
-    }
     if (!DumpOpts.Verbose)
       return;
     OS << format(" 0x%*.*" PRIx64, AddrSize * 2, AddrSize * 2, Value0);
@@ -273,11 +226,6 @@ void RangeListEntry::dump(
     DWARFAddressRange(Value0, Value0 + Value1).dump(OS, AddrSize, DumpOpts);
     break;
   case dwarf::DW_RLE_offset_pair:
-    if (Version < 5) {
-      OS << format(" %*.*" PRIx64, AddrSize * 2, AddrSize * 2, Value0);
-      OS << format(" %*.*" PRIx64, AddrSize * 2, AddrSize * 2, Value1);
-      break;
-    }
     PrintRawEntry(OS, *this, AddrSize, DumpOpts);
     DWARFAddressRange(Value0 + CurrentBase, Value1 + CurrentBase)
         .dump(OS, AddrSize, DumpOpts);
diff --git a/lib/DebugInfo/DWARF/DWARFDie.cpp b/lib/DebugInfo/DWARF/DWARFDie.cpp
index 874a2ba07fa..31c4cd5e472 100644
--- a/lib/DebugInfo/DWARF/DWARFDie.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDie.cpp
@@ -15,6 +15,7 @@
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h"
 #include "llvm/DebugInfo/DWARF/DWARFContext.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h"
 #include "llvm/DebugInfo/DWARF/DWARFExpression.h"
 #include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
 #include "llvm/DebugInfo/DWARF/DWARFUnit.h"
diff --git a/lib/DebugInfo/DWARF/DWARFListTable.cpp b/lib/DebugInfo/DWARF/DWARFListTable.cpp
index 69a9231f785..462c036d73a 100644
--- a/lib/DebugInfo/DWARF/DWARFListTable.cpp
+++ b/lib/DebugInfo/DWARF/DWARFListTable.cpp
@@ -20,43 +20,30 @@ Error DWARFListTableHeader::extract(DWARFDataExtractor Data,
                                     uint32_t *OffsetPtr) {
   HeaderOffset = *OffsetPtr;
   // Read and verify the length field.
-  if (!Data.isValidOffsetForDataOfSize(*OffsetPtr, sizeof(uint32_t))) {
-    // By setting *OffsetPtr to 0, we indicate to the caller that
-    // we could not detemine the length of the table.
-    *OffsetPtr = 0;
+  if (!Data.isValidOffsetForDataOfSize(*OffsetPtr, sizeof(uint32_t)))
     return createStringError(errc::invalid_argument,
-                             "section is not large enough to contain a "
-                             "%s table length at offset 0x%" PRIx32,
-                             SectionName.data(), HeaderOffset);
-  }
+                       "section is not large enough to contain a "
+                       "%s table length at offset 0x%" PRIx32,
+                       SectionName.data(), *OffsetPtr);
   // TODO: Add support for DWARF64.
   HeaderData.Length = Data.getU32(OffsetPtr);
-  if (HeaderData.Length == 0xffffffffu) {
-    *OffsetPtr = 0;
+  if (HeaderData.Length == 0xffffffffu)
     return createStringError(errc::not_supported,
                        "DWARF64 is not supported in %s at offset 0x%" PRIx32,
                        SectionName.data(), HeaderOffset);
-  }
-
-  uint32_t TableLength = HeaderData.Length + sizeof(uint32_t);
-  uint32_t End = HeaderOffset + TableLength;
   Format = dwarf::DwarfFormat::DWARF32;
-  if (TableLength < sizeof(Header)) {
-    *OffsetPtr = End;
+  if (HeaderData.Length + sizeof(uint32_t) < sizeof(Header))
     return createStringError(errc::invalid_argument,
-                             "%s table at offset 0x%" PRIx32
-                             " has too small length (0x%" PRIx32
-                             ") to contain a complete header",
-                             SectionName.data(), HeaderOffset, TableLength);
-  }
-  if (!Data.isValidOffsetForDataOfSize(HeaderOffset, TableLength)) {
-    *OffsetPtr = 0; // No recovery if the length exceeds the section size.
-    return createStringError(
-        errc::invalid_argument,
-        "section is not large enough to contain a %s table "
-        "of length 0x%" PRIx32 " at offset 0x%" PRIx32,
-        SectionName.data(), TableLength, HeaderOffset);
-  }
+                       "%s table at offset 0x%" PRIx32
+                       " has too small length (0x%" PRIx32
+                       ") to contain a complete header",
+                       SectionName.data(), HeaderOffset, length());
+  uint32_t End = HeaderOffset + length();
+  if (!Data.isValidOffsetForDataOfSize(HeaderOffset, End - HeaderOffset))
+    return createStringError(errc::invalid_argument,
+                       "section is not large enough to contain a %s table "
+                       "of length 0x%" PRIx32 " at offset 0x%" PRIx32,
+                       SectionName.data(), length(), HeaderOffset);
 
   HeaderData.Version = Data.getU16(OffsetPtr);
   HeaderData.AddrSize = Data.getU8(OffsetPtr);
@@ -64,36 +51,27 @@ Error DWARFListTableHeader::extract(DWARFDataExtractor Data,
   HeaderData.OffsetEntryCount = Data.getU32(OffsetPtr);
 
   // Perform basic validation of the remaining header fields.
-  if (HeaderData.Version != 5) {
-    *OffsetPtr = End;
+  if (HeaderData.Version != 5)
     return createStringError(errc::invalid_argument,
-                             "unrecognised %s table version %" PRIu16
-                             " in table at offset 0x%" PRIx32,
-                             SectionName.data(), HeaderData.Version,
-                             HeaderOffset);
-  }
-  if (HeaderData.AddrSize != 4 && HeaderData.AddrSize != 8) {
-    *OffsetPtr = End;
+                       "unrecognised %s table version %" PRIu16
+                       " in table at offset 0x%" PRIx32,
+                       SectionName.data(), HeaderData.Version, HeaderOffset);
+  if (HeaderData.AddrSize != 4 && HeaderData.AddrSize != 8)
     return createStringError(errc::not_supported,
                        "%s table at offset 0x%" PRIx32
                        " has unsupported address size %" PRIu8,
                        SectionName.data(), HeaderOffset, HeaderData.AddrSize);
-  }
-  if (HeaderData.SegSize != 0) {
-    *OffsetPtr = End;
+  if (HeaderData.SegSize != 0)
     return createStringError(errc::not_supported,
                        "%s table at offset 0x%" PRIx32
                        " has unsupported segment selector size %" PRIu8,
                        SectionName.data(), HeaderOffset, HeaderData.SegSize);
-  }
   if (End < HeaderOffset + sizeof(HeaderData) +
-                HeaderData.OffsetEntryCount * sizeof(uint32_t)) {
-    *OffsetPtr = End;
+                HeaderData.OffsetEntryCount * sizeof(uint32_t))
     return createStringError(errc::invalid_argument,
         "%s table at offset 0x%" PRIx32 " has more offset entries (%" PRIu32
         ") than there is space for",
         SectionName.data(), HeaderOffset, HeaderData.OffsetEntryCount);
-  }
   Data.setAddressSize(HeaderData.AddrSize);
   for (uint32_t I = 0; I < HeaderData.OffsetEntryCount; ++I)
     Offsets.push_back(Data.getU32(OffsetPtr));
@@ -123,11 +101,9 @@ void DWARFListTableHeader::dump(raw_ostream &OS, DIDumpOptions DumpOpts) const {
   }
 }
 
-uint32_t DWARFListTableHeader::getTableLength() const {
+uint32_t DWARFListTableHeader::length() const {
   if (HeaderData.Length == 0)
     return 0;
-  assert(HeaderData.Version > 0 &&
-         "No DWARF version in header when using getTableLength()");
   // TODO: DWARF64 support.
-  return HeaderData.Length + (HeaderData.Version > 4) * sizeof(uint32_t);
+  return HeaderData.Length + sizeof(uint32_t);
 }
diff --git a/lib/DebugInfo/DWARF/DWARFUnit.cpp b/lib/DebugInfo/DWARF/DWARFUnit.cpp
index 88565af1ec0..d475c44c393 100644
--- a/lib/DebugInfo/DWARF/DWARFUnit.cpp
+++ b/lib/DebugInfo/DWARF/DWARFUnit.cpp
@@ -296,16 +296,13 @@ bool DWARFUnitHeader::extract(DWARFContext &Context,
   return true;
 }
 
-// Parse a list table header, including the optional array of offsets
+// Parse the rangelist table header, including the optional array of offsets
 // following it (DWARF v5 and later).
-template <typename DWARFListTable>
-static Expected<DWARFListTable>
-parseListTableHeader(DWARFDataExtractor DA, DWARFContext *C,
-                     StringRef SectionName, uint32_t Offset, bool isDWO) {
+static Expected<DWARFDebugRnglistTable>
+parseRngListTableHeader(DWARFDataExtractor &DA, uint32_t Offset) {
   // TODO: Support DWARF64
   // We are expected to be called with Offset 0 or pointing just past the table
   // header, which is 12 bytes long for DWARF32.
-  DWARFListTable Table(C, SectionName, isDWO);
   if (Offset > 0) {
     if (Offset < 12U)
       return createStringError(errc::invalid_argument, "Did not detect a valid"
@@ -313,46 +310,20 @@ parseListTableHeader(DWARFDataExtractor DA, DWARFContext *C,
                                Offset);
     Offset -= 12U;
   }
+  llvm::DWARFDebugRnglistTable Table;
   if (Error E = Table.extractHeaderAndOffsets(DA, &Offset))
     return std::move(E);
   return Table;
 }
 
-// Parse a DWARF v5 list table (e.g. either a rangelist table or a location
-// list table). For DWARF units with version 4 or earlier, we instead create
-// the table artifically by giving it a size that equals the section size.
-template <typename DWARFListTable>
-static Optional<DWARFListTable>
-setupListTable(DWARFUnit *U, const DWARFSection *Section, StringRef SectionName,
-               uint32_t &Base, bool isDWO, bool isLittleEndian) {
-  if (!Section->Data.size())
-    return None;
-  DWARFContext &Ctx = U->getContext();
-  DWARFListTable Table(&Ctx, SectionName, isDWO);
-  // Parse the list table header. Individual lists are extracted lazily.
-  DWARFDataExtractor DA(Ctx.getDWARFObj(), *Section, isLittleEndian,
-                        U->getAddressByteSize());
-  if (U->getVersion() < 5) {
-    Base = 0;
-    Table.setHeaderData(Section->Data.size(), U->getVersion(),
-                        DA.getAddressSize());
-    return Table;
-  }
-  if (auto TableOrError = parseListTableHeader<DWARFListTable>(
-          DA, &Ctx, SectionName, Base, isDWO))
-    Table = TableOrError.get();
-  else {
-    WithColor::error() << "parsing a " << Table.getListTypeString().data()
-                       << " list table: " << toString(TableOrError.takeError())
-                       << '\n';
-    return None;
-  }
-  // In a split dwarf unit, there are no attributes like DW_AT_rnglists_base or
-  // DW_AT_loclists_base that describe the table base. Adjust Base to point past
-  // the table header which is expected to start at offset 0.
-  if (isDWO)
-    Base = Table.getHeaderSize();
-  return Table;
+Error DWARFUnit::extractRangeList(uint32_t RangeListOffset,
+                                  DWARFDebugRangeList &RangeList) const {
+  // Require that compile unit is extracted.
+  assert(!DieArray.empty());
+  DWARFDataExtractor RangesData(Context.getDWARFObj(), *RangeSection,
+                                isLittleEndian, getAddressByteSize());
+  uint32_t ActualRangeListOffset = RangeSectionBase + RangeListOffset;
+  return RangeList.extract(RangesData, &ActualRangeListOffset);
 }
 
 void DWARFUnit::clear() {
@@ -466,24 +437,35 @@ size_t DWARFUnit::extractDIEsIfNeeded(bool CUDieOnly) {
 
     // DWARF v5 uses the .debug_rnglists and .debug_rnglists.dwo sections to
     // describe address ranges.
-    StringRef RangeSectionName = ".debug_ranges";
     if (getVersion() >= 5) {
-      if (IsDWO) {
-        RangeSectionName = ".debug_rnglists.dwo";
+      if (IsDWO)
         setRangesSection(&Context.getDWARFObj().getRnglistsDWOSection(), 0);
-      } else {
-        RangeSectionName = ".debug_rnglists";
+      else
         setRangesSection(&Context.getDWARFObj().getRnglistsSection(),
                          toSectionOffset(UnitDie.find(DW_AT_rnglists_base), 0));
+      if (RangeSection->Data.size()) {
+        // Parse the range list table header. Individual range lists are
+        // extracted lazily.
+        DWARFDataExtractor RangesDA(Context.getDWARFObj(), *RangeSection,
+                                    isLittleEndian, 0);
+        if (auto TableOrError =
+                parseRngListTableHeader(RangesDA, RangeSectionBase))
+          RngListTable = TableOrError.get();
+        else
+          WithColor::error() << "parsing a range list table: "
+                             << toString(TableOrError.takeError())
+                             << '\n';
+
+        // In a split dwarf unit, there is no DW_AT_rnglists_base attribute.
+        // Adjust RangeSectionBase to point past the table header.
+        if (IsDWO && RngListTable)
+          RangeSectionBase = RngListTable->getHeaderSize();
       }
     }
-    RngListTable = setupListTable<DWARFDebugRnglistTable>(
-        this, RangeSection, RangeSectionName, RangeSectionBase, IsDWO,
-        isLittleEndian);
 
     // Don't fall back to DW_AT_GNU_ranges_base: it should be ignored for
     // skeleton CU DIE, so that DWARF users not aware of it are not broken.
-  }
+    }
 
   return DieArray.size();
 }
@@ -521,9 +503,16 @@ bool DWARFUnit::parseDWO() {
   DWO->setAddrOffsetSection(AddrOffsetSection, AddrOffsetSectionBase);
   if (getVersion() >= 5) {
     DWO->setRangesSection(&Context.getDWARFObj().getRnglistsDWOSection(), 0);
-    DWO->RngListTable = setupListTable<DWARFDebugRnglistTable>(
-        DWOCU, DWO->RangeSection, ".debug_rnglists.dwo", DWO->RangeSectionBase,
-        /* isDWO =*/true, isLittleEndian);
+    DWARFDataExtractor RangesDA(Context.getDWARFObj(), *RangeSection,
+                                isLittleEndian, 0);
+    if (auto TableOrError = parseRngListTableHeader(RangesDA, RangeSectionBase))
+      DWO->RngListTable = TableOrError.get();
+    else
+      WithColor::error() << "parsing a range list table: "
+                         << toString(TableOrError.takeError())
+                         << '\n';
+    if (DWO->RngListTable)
+      DWO->RangeSectionBase = DWO->RngListTable->getHeaderSize();
   } else {
     auto DWORangesBase = UnitDie.getRangesBaseAttribute();
     DWO->setRangesSection(RangeSection, DWORangesBase ? *DWORangesBase : 0);
@@ -541,6 +530,12 @@ void DWARFUnit::clearDIEs(bool KeepCUDie) {
 
 Expected<DWARFAddressRangesVector>
 DWARFUnit::findRnglistFromOffset(uint32_t Offset) {
+  if (getVersion() <= 4) {
+    DWARFDebugRangeList RangeList;
+    if (Error E = extractRangeList(Offset, RangeList))
+      return std::move(E);
+    return RangeList.getAbsoluteRanges(getBaseAddress());
+  }
   if (RngListTable) {
     DWARFDataExtractor RangesData(Context.getDWARFObj(), *RangeSection,
                                   isLittleEndian, RngListTable->getAddrSize());
diff --git a/tools/dsymutil/DwarfLinker.cpp b/tools/dsymutil/DwarfLinker.cpp
index 02b0d59d59e..2c5cce50132 100644
--- a/tools/dsymutil/DwarfLinker.cpp
+++ b/tools/dsymutil/DwarfLinker.cpp
@@ -43,6 +43,7 @@
 #include "llvm/DebugInfo/DWARF/DWARFContext.h"
 #include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h"
 #include "llvm/DebugInfo/DWARF/DWARFDebugLine.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h"
 #include "llvm/DebugInfo/DWARF/DWARFDie.h"
 #include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
 #include "llvm/DebugInfo/DWARF/DWARFSection.h"
@@ -1575,7 +1576,7 @@ DIE *DwarfLinker::DIECloner::cloneDIE(const DWARFDie &InputDIE,
 void DwarfLinker::patchRangesForUnit(const CompileUnit &Unit,
                                      DWARFContext &OrigDwarf,
                                      const DebugMapObject &DMO) const {
-  DWARFDebugRnglist RangeList;
+  DWARFDebugRangeList RangeList;
   const auto &FunctionRanges = Unit.getFunctionRanges();
   unsigned AddressSize = Unit.getOrigUnit().getAddressByteSize();
   DWARFDataExtractor RangeExtractor(OrigDwarf.getDWARFObj(),
@@ -1595,30 +1596,28 @@ void DwarfLinker::patchRangesForUnit(const CompileUnit &Unit,
   for (const auto &RangeAttribute : Unit.getRangesAttributes()) {
     uint32_t Offset = RangeAttribute.get();
     RangeAttribute.set(Streamer->getRangesSectionSize());
-    if (Error E = RangeList.extract(RangeExtractor, /* HeaderOffset = */0,
-                                    RangeExtractor.size(),
-                                    Unit.getOrigUnit().getVersion(), &Offset,
-                                    ".debug_ranges", "range")) {
+    if (Error E = RangeList.extract(RangeExtractor, &Offset)) {
       llvm::consumeError(std::move(E));
       reportWarning("invalid range list ignored.", DMO);
       RangeList.clear();
     }
     const auto &Entries = RangeList.getEntries();
-    if (!RangeList.empty()) {
-      const auto &First = Entries.front();
+    if (!Entries.empty()) {
+      const DWARFDebugRangeList::RangeListEntry &First = Entries.front();
+
       if (CurrRange == InvalidRange ||
-          First.getStartAddress() + OrigLowPc < CurrRange.start() ||
-          First.getStartAddress() + OrigLowPc >= CurrRange.stop()) {
-        CurrRange = FunctionRanges.find(First.getStartAddress() + OrigLowPc);
+          First.StartAddress + OrigLowPc < CurrRange.start() ||
+          First.StartAddress + OrigLowPc >= CurrRange.stop()) {
+        CurrRange = FunctionRanges.find(First.StartAddress + OrigLowPc);
         if (CurrRange == InvalidRange ||
-            CurrRange.start() > First.getStartAddress() + OrigLowPc) {
+            CurrRange.start() > First.StartAddress + OrigLowPc) {
           reportWarning("no mapping for range.", DMO);
           continue;
         }
       }
     }
 
-    Streamer->emitRangesEntries(UnitPcOffset, OrigLowPc, CurrRange, RangeList,
+    Streamer->emitRangesEntries(UnitPcOffset, OrigLowPc, CurrRange, Entries,
                                 AddressSize);
   }
 }
diff --git a/tools/dsymutil/DwarfStreamer.cpp b/tools/dsymutil/DwarfStreamer.cpp
index 835a27aefef..ef798be7bdf 100644
--- a/tools/dsymutil/DwarfStreamer.cpp
+++ b/tools/dsymutil/DwarfStreamer.cpp
@@ -269,27 +269,28 @@ void DwarfStreamer::emitSwiftAST(StringRef Buffer) {
 void DwarfStreamer::emitRangesEntries(
     int64_t UnitPcOffset, uint64_t OrigLowPc,
     const FunctionIntervals::const_iterator &FuncRange,
-    const DWARFDebugRnglist &RangeList, unsigned AddressSize) {
+    const std::vector<DWARFDebugRangeList::RangeListEntry> &Entries,
+    unsigned AddressSize) {
   MS->SwitchSection(MC->getObjectFileInfo()->getDwarfRangesSection());
 
   // Offset each range by the right amount.
-  int64_t PcOffset = RangeList.empty() ? 0 : FuncRange.value() + UnitPcOffset;
-  for (const auto &Range : RangeList.getEntries()) {
-    if (Range.isBaseAddressSelectionEntry()) {
+  int64_t PcOffset = Entries.empty() ? 0 : FuncRange.value() + UnitPcOffset;
+  for (const auto &Range : Entries) {
+    if (Range.isBaseAddressSelectionEntry(AddressSize)) {
       warn("unsupported base address selection operation",
            "emitting debug_ranges");
       break;
     }
     // Do not emit empty ranges.
-    if (Range.isEndOfList() || Range.getStartAddress() == Range.getEndAddress())
+    if (Range.StartAddress == Range.EndAddress)
       continue;
 
     // All range entries should lie in the function range.
-    if (!(Range.getStartAddress() + OrigLowPc >= FuncRange.start() &&
-          Range.getEndAddress() + OrigLowPc <= FuncRange.stop()))
+    if (!(Range.StartAddress + OrigLowPc >= FuncRange.start() &&
+          Range.EndAddress + OrigLowPc <= FuncRange.stop()))
       warn("inconsistent range data.", "emitting debug_ranges");
-    MS->EmitIntValue(Range.getStartAddress() + PcOffset, AddressSize);
-    MS->EmitIntValue(Range.getEndAddress() + PcOffset, AddressSize);
+    MS->EmitIntValue(Range.StartAddress + PcOffset, AddressSize);
+    MS->EmitIntValue(Range.EndAddress + PcOffset, AddressSize);
     RangesSectionSize += 2 * AddressSize;
   }
 
diff --git a/tools/dsymutil/DwarfStreamer.h b/tools/dsymutil/DwarfStreamer.h
index 2ab880d17dd..679d124f4cb 100644
--- a/tools/dsymutil/DwarfStreamer.h
+++ b/tools/dsymutil/DwarfStreamer.h
@@ -17,7 +17,7 @@
 #include "llvm/CodeGen/AccelTable.h"
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/DebugInfo/DWARF/DWARFDebugLine.h"
-#include "llvm/DebugInfo/DWARF/DWARFDebugRnglists.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h"
 #include "llvm/MC/MCAsmBackend.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCCodeEmitter.h"
@@ -83,7 +83,7 @@ public:
   void emitRangesEntries(
       int64_t UnitPcOffset, uint64_t OrigLowPc,
       const FunctionIntervals::const_iterator &FuncRange,
-      const DWARFDebugRnglist &RangeList,
+      const std::vector<DWARFDebugRangeList::RangeListEntry> &Entries,
       unsigned AddressSize);
 
   /// Emit debug_aranges entries for \p Unit and if \p DoRangesSection is true,
-- 
GitLab


From b6dd98ba241c016501337c75989675186f397e40 Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze@braunis.de>
Date: Wed, 31 Oct 2018 01:30:41 +0000
Subject: [PATCH 0782/1116] Try to fix ambiguities with C++17 headers in
 unittest

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345689 91177308-0d34-0410-b5e6-96231b3b80d8
---
 unittests/ADT/STLExtrasTest.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/unittests/ADT/STLExtrasTest.cpp b/unittests/ADT/STLExtrasTest.cpp
index 80e10d071ab..642dc3f6bdd 100644
--- a/unittests/ADT/STLExtrasTest.cpp
+++ b/unittests/ADT/STLExtrasTest.cpp
@@ -365,6 +365,9 @@ TEST(STLExtrasTest, ADLTest) {
 }
 
 TEST(STLExtrasTest, EmptyTest) {
+  // Try to avoid ambiguities with C++17 headers.
+  using llvm::empty;
+
   std::vector<void*> V;
   EXPECT_TRUE(empty(V));
   V.push_back(nullptr);
-- 
GitLab


From 4f27756950035dc48483bc22392180e7f3030f33 Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze@braunis.de>
Date: Wed, 31 Oct 2018 01:58:00 +0000
Subject: [PATCH 0783/1116] 2nd attempt to fix ambiguities because of ADL

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345690 91177308-0d34-0410-b5e6-96231b3b80d8
---
 unittests/ADT/STLExtrasTest.cpp | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/unittests/ADT/STLExtrasTest.cpp b/unittests/ADT/STLExtrasTest.cpp
index 642dc3f6bdd..e65e71fe485 100644
--- a/unittests/ADT/STLExtrasTest.cpp
+++ b/unittests/ADT/STLExtrasTest.cpp
@@ -365,23 +365,20 @@ TEST(STLExtrasTest, ADLTest) {
 }
 
 TEST(STLExtrasTest, EmptyTest) {
-  // Try to avoid ambiguities with C++17 headers.
-  using llvm::empty;
-
   std::vector<void*> V;
-  EXPECT_TRUE(empty(V));
+  EXPECT_TRUE(llvm::empty(V));
   V.push_back(nullptr);
-  EXPECT_FALSE(empty(V));
+  EXPECT_FALSE(llvm::empty(V));
 
   std::initializer_list<int> E = {};
   std::initializer_list<int> NotE = {7, 13, 42};
-  EXPECT_TRUE(empty(E));
-  EXPECT_FALSE(empty(NotE));
+  EXPECT_TRUE(llvm::empty(E));
+  EXPECT_FALSE(llvm::empty(NotE));
 
   auto R0 = make_range(V.begin(), V.begin());
-  EXPECT_TRUE(empty(R0));
+  EXPECT_TRUE(llvm::empty(R0));
   auto R1 = make_range(V.begin(), V.end());
-  EXPECT_FALSE(empty(R1));
+  EXPECT_FALSE(llvm::empty(R1));
 }
 
 TEST(STLExtrasTest, EarlyIncrementTest) {
-- 
GitLab


From d71ec72114694a314d6f9ad5dd1a9fa397ae47e9 Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Wed, 31 Oct 2018 05:16:14 +0000
Subject: [PATCH 0784/1116] [ORC] Fix hex printing of uint64_t values.

A plain "%x" format string will drop the high 32-bits. Use the PRIx64 macro
instead.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345696 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../Orc/OrcRemoteTargetClient.h               | 48 ++++++++++---------
 lib/ExecutionEngine/Orc/Core.cpp              | 10 ++--
 lib/ExecutionEngine/Orc/IndirectionUtils.cpp  |  2 +-
 3 files changed, 32 insertions(+), 28 deletions(-)

diff --git a/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h b/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h
index 99468e269d3..3e07f5cf374 100644
--- a/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h
+++ b/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h
@@ -118,30 +118,33 @@ public:
         Unmapped.back().RemoteCodeAddr =
             Client.reserveMem(Id, CodeSize, CodeAlign);
 
-        LLVM_DEBUG(dbgs() << "  code: "
-                          << format("0x%016x", Unmapped.back().RemoteCodeAddr)
-                          << " (" << CodeSize << " bytes, alignment "
-                          << CodeAlign << ")\n");
+        LLVM_DEBUG(
+            dbgs() << "  code: "
+                   << format("0x%016" PRIx64, Unmapped.back().RemoteCodeAddr)
+                   << " (" << CodeSize << " bytes, alignment " << CodeAlign
+                   << ")\n");
       }
 
       if (RODataSize != 0) {
         Unmapped.back().RemoteRODataAddr =
             Client.reserveMem(Id, RODataSize, RODataAlign);
 
-        LLVM_DEBUG(dbgs() << "  ro-data: "
-                          << format("0x%016x", Unmapped.back().RemoteRODataAddr)
-                          << " (" << RODataSize << " bytes, alignment "
-                          << RODataAlign << ")\n");
+        LLVM_DEBUG(
+            dbgs() << "  ro-data: "
+                   << format("0x%016" PRIx64, Unmapped.back().RemoteRODataAddr)
+                   << " (" << RODataSize << " bytes, alignment " << RODataAlign
+                   << ")\n");
       }
 
       if (RWDataSize != 0) {
         Unmapped.back().RemoteRWDataAddr =
             Client.reserveMem(Id, RWDataSize, RWDataAlign);
 
-        LLVM_DEBUG(dbgs() << "  rw-data: "
-                          << format("0x%016x", Unmapped.back().RemoteRWDataAddr)
-                          << " (" << RWDataSize << " bytes, alignment "
-                          << RWDataAlign << ")\n");
+        LLVM_DEBUG(
+            dbgs() << "  rw-data: "
+                   << format("0x%016" PRIx64, Unmapped.back().RemoteRWDataAddr)
+                   << " (" << RWDataSize << " bytes, alignment " << RWDataAlign
+                   << ")\n");
       }
     }
 
@@ -269,9 +272,9 @@ public:
       for (auto &Alloc : Allocs) {
         NextAddr = alignTo(NextAddr, Alloc.getAlign());
         Dyld.mapSectionAddress(Alloc.getLocalAddress(), NextAddr);
-        LLVM_DEBUG(dbgs() << "     "
-                          << static_cast<void *>(Alloc.getLocalAddress())
-                          << " -> " << format("0x%016x", NextAddr) << "\n");
+        LLVM_DEBUG(
+            dbgs() << "     " << static_cast<void *>(Alloc.getLocalAddress())
+                   << " -> " << format("0x%016" PRIx64, NextAddr) << "\n");
         Alloc.setRemoteAddress(NextAddr);
 
         // Only advance NextAddr if it was non-null to begin with,
@@ -293,7 +296,7 @@ public:
           LLVM_DEBUG(dbgs() << "  copying section: "
                             << static_cast<void *>(Alloc.getLocalAddress())
                             << " -> "
-                            << format("0x%016x", Alloc.getRemoteAddress())
+                            << format("0x%016" PRIx64, Alloc.getRemoteAddress())
                             << " (" << Alloc.getSize() << " bytes)\n";);
 
           if (Client.writeMem(Alloc.getRemoteAddress(), Alloc.getLocalAddress(),
@@ -306,7 +309,8 @@ public:
                           << (Permissions & sys::Memory::MF_WRITE ? 'W' : '-')
                           << (Permissions & sys::Memory::MF_EXEC ? 'X' : '-')
                           << " permissions on block: "
-                          << format("0x%016x", RemoteSegmentAddr) << "\n");
+                          << format("0x%016" PRIx64, RemoteSegmentAddr)
+                          << "\n");
         if (Client.setProtections(Id, RemoteSegmentAddr, Permissions))
           return true;
       }
@@ -510,8 +514,8 @@ public:
   /// Call the int(void) function at the given address in the target and return
   /// its result.
   Expected<int> callIntVoid(JITTargetAddress Addr) {
-    LLVM_DEBUG(dbgs() << "Calling int(*)(void) " << format("0x%016x", Addr)
-                      << "\n");
+    LLVM_DEBUG(dbgs() << "Calling int(*)(void) "
+                      << format("0x%016" PRIx64, Addr) << "\n");
     return callB<exec::CallIntVoid>(Addr);
   }
 
@@ -520,15 +524,15 @@ public:
   Expected<int> callMain(JITTargetAddress Addr,
                          const std::vector<std::string> &Args) {
     LLVM_DEBUG(dbgs() << "Calling int(*)(int, char*[]) "
-                      << format("0x%016x", Addr) << "\n");
+                      << format("0x%016" PRIx64, Addr) << "\n");
     return callB<exec::CallMain>(Addr, Args);
   }
 
   /// Call the void() function at the given address in the target and wait for
   /// it to finish.
   Error callVoidVoid(JITTargetAddress Addr) {
-    LLVM_DEBUG(dbgs() << "Calling void(*)(void) " << format("0x%016x", Addr)
-                      << "\n");
+    LLVM_DEBUG(dbgs() << "Calling void(*)(void) "
+                      << format("0x%016" PRIx64, Addr) << "\n");
     return callB<exec::CallVoidVoid>(Addr);
   }
 
diff --git a/lib/ExecutionEngine/Orc/Core.cpp b/lib/ExecutionEngine/Orc/Core.cpp
index 9cbb03734ed..f99cbec6d3b 100644
--- a/lib/ExecutionEngine/Orc/Core.cpp
+++ b/lib/ExecutionEngine/Orc/Core.cpp
@@ -170,7 +170,8 @@ raw_ostream &operator<<(raw_ostream &OS, const JITSymbolFlags &Flags) {
 }
 
 raw_ostream &operator<<(raw_ostream &OS, const JITEvaluatedSymbol &Sym) {
-  return OS << format("0x%016x", Sym.getAddress()) << " " << Sym.getFlags();
+  return OS << format("0x%016" PRIx64, Sym.getAddress()) << " "
+            << Sym.getFlags();
 }
 
 raw_ostream &operator<<(raw_ostream &OS, const SymbolFlagsMap::value_type &KV) {
@@ -1392,9 +1393,8 @@ JITDylib::lookupImpl(std::shared_ptr<AsynchronousSymbolQuery> &Q,
 
 void JITDylib::dump(raw_ostream &OS) {
   ES.runSessionLocked([&, this]() {
-    OS << "JITDylib \"" << JITDylibName
-       << "\" (ES: " << format("0x%016x", reinterpret_cast<uintptr_t>(&ES))
-       << "):\n"
+    OS << "JITDylib \"" << JITDylibName << "\" (ES: "
+       << format("0x%016" PRIx64, reinterpret_cast<uintptr_t>(&ES)) << "):\n"
        << "Search order: [";
     for (auto &KV : SearchOrder)
       OS << " (\"" << KV.first->getName() << "\", "
@@ -1405,7 +1405,7 @@ void JITDylib::dump(raw_ostream &OS) {
     for (auto &KV : Symbols) {
       OS << "    \"" << *KV.first << "\": ";
       if (auto Addr = KV.second.getAddress())
-        OS << format("0x%016x", Addr) << ", " << KV.second.getFlags();
+        OS << format("0x%016" PRIx64, Addr) << ", " << KV.second.getFlags();
       else
         OS << "<not resolved>";
       if (KV.second.getFlags().isLazy() ||
diff --git a/lib/ExecutionEngine/Orc/IndirectionUtils.cpp b/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
index af7fcddd53d..82000ec5b32 100644
--- a/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
+++ b/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
@@ -92,7 +92,7 @@ JITTargetAddress JITCompileCallbackManager::executeCompileCallback(
       {
         raw_string_ostream ErrMsgStream(ErrMsg);
         ErrMsgStream << "No compile callback for trampoline at "
-                     << format("0x%016x", TrampolineAddr);
+                     << format("0x%016" PRIx64, TrampolineAddr);
       }
       ES.reportError(
           make_error<StringError>(std::move(ErrMsg), inconvertibleErrorCode()));
-- 
GitLab


From 6ee01635feb937cff99fd5924f6b85dcb36fdc96 Mon Sep 17 00:00:00 2001
From: Kristina Brooks <kristina@nym.hush.com>
Date: Wed, 31 Oct 2018 05:45:01 +0000
Subject: [PATCH 0785/1116] [llvm-objdump] support '--syms' as an alias of -t

This adds support for '--syms' as an alias of '-t' for llvm-objdump,
fixing PR39406 (https://bugs.llvm.org/show_bug.cgi?id=39406).

Patch by Higuoxing (Xing).

Differential Revision: https://reviews.llvm.org/D53803

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345697 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/tools/llvm-objdump/symbol-table-elf.test | 47 +++++++++++++++++++
 tools/llvm-objdump/llvm-objdump.cpp           |  5 +-
 2 files changed, 50 insertions(+), 2 deletions(-)
 create mode 100644 test/tools/llvm-objdump/symbol-table-elf.test

diff --git a/test/tools/llvm-objdump/symbol-table-elf.test b/test/tools/llvm-objdump/symbol-table-elf.test
new file mode 100644
index 00000000000..fc1eccdffb7
--- /dev/null
+++ b/test/tools/llvm-objdump/symbol-table-elf.test
@@ -0,0 +1,47 @@
+# RUN: yaml2obj %s > %t
+# RUN: llvm-objdump --syms %t | FileCheck %s
+# RUN: llvm-objdump -t     %t | FileCheck %s
+
+# CHECK:      SYMBOL TABLE:
+# CHECK-NEXT: 0000000000000000         *UND*     00000000
+# CHECK-NEXT: 0000000000001004 l     F .text     00000000 lfoo
+# CHECK-NEXT: 0000000000001008 l       .text     00000000 lbar
+# CHECK-NEXT: 0000000000001004 g     F .text     00000000 foo
+# CHECK-NEXT: 0000000000001008 g       .text     00000000 bar
+
+!ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_EXEC
+  Machine:         EM_X86_64
+Sections:
+  - Name:            .bss
+    Type:            SHT_NOBITS
+    Flags:           [ SHF_ALLOC ]
+    AddressAlign:    0x0000000000000010
+    Size:            64
+  - Name:            .text
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    AddressAlign:    0x0000000000000010
+    Content:         "00000000"
+Symbols:
+   Global:
+     - Name:     foo
+       Type:     STT_FUNC
+       Section:  .text
+       Value:    0x1004
+     - Name:     bar
+       Type:     STT_OBJECT
+       Section:  .text
+       Value:    0x1008
+   Local:
+     - Name:     lfoo
+       Type:     STT_FUNC
+       Section:  .text
+       Value:    0x1004
+     - Name:     lbar
+       Type:     STT_OBJECT
+       Section:  .text
+       Value:    0x1008
diff --git a/tools/llvm-objdump/llvm-objdump.cpp b/tools/llvm-objdump/llvm-objdump.cpp
index 463408b60c5..4ad29d2143f 100644
--- a/tools/llvm-objdump/llvm-objdump.cpp
+++ b/tools/llvm-objdump/llvm-objdump.cpp
@@ -122,8 +122,9 @@ static cl::alias SectionContentsShort("s",
                                       cl::desc("Alias for --full-contents"),
                                       cl::aliasopt(SectionContents));
 
-cl::opt<bool>
-llvm::SymbolTable("t", cl::desc("Display the symbol table"));
+cl::opt<bool> llvm::SymbolTable("syms", cl::desc("Display the symbol table"));
+static cl::alias SymbolTableShort("t", cl::desc("Alias for --syms"),
+                                  cl::aliasopt(llvm::SymbolTable));
 
 cl::opt<bool>
 llvm::ExportsTrie("exports-trie", cl::desc("Display mach-o exported symbols"));
-- 
GitLab


From d75205d3b71ffef56305b0ee99473b346876dd59 Mon Sep 17 00:00:00 2001
From: Martin Storsjo <martin@martin.st>
Date: Wed, 31 Oct 2018 08:14:09 +0000
Subject: [PATCH 0786/1116] [AArch64] Mark condition flags and x16/x17 as
 clobbered when calling __chkstk

This is similar to SVN r311061 for ARM.

Differential Revision: https://reviews.llvm.org/D53878

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345698 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64FrameLowering.cpp | 6 ++++++
 test/CodeGen/AArch64/chkstk.ll              | 8 ++++++++
 2 files changed, 14 insertions(+)

diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp
index a99dd356d4f..974377d3f62 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -731,6 +731,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
       BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
           .addExternalSymbol("__chkstk")
           .addReg(AArch64::X15, RegState::Implicit)
+          .addReg(AArch64::X16, RegState::Implicit | RegState::Define | RegState::Dead)
+          .addReg(AArch64::X17, RegState::Implicit | RegState::Define | RegState::Dead)
+          .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define | RegState::Dead)
           .setMIFlags(MachineInstr::FrameSetup);
       break;
     case CodeModel::Large:
@@ -743,6 +746,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
       BuildMI(MBB, MBBI, DL, TII->get(AArch64::BLR))
           .addReg(AArch64::X16, RegState::Kill)
           .addReg(AArch64::X15, RegState::Implicit | RegState::Define)
+          .addReg(AArch64::X16, RegState::Implicit | RegState::Define | RegState::Dead)
+          .addReg(AArch64::X17, RegState::Implicit | RegState::Define | RegState::Dead)
+          .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define | RegState::Dead)
           .setMIFlags(MachineInstr::FrameSetup);
       break;
     }
diff --git a/test/CodeGen/AArch64/chkstk.ll b/test/CodeGen/AArch64/chkstk.ll
index 1c2e5528f10..1037a5fdae0 100644
--- a/test/CodeGen/AArch64/chkstk.ll
+++ b/test/CodeGen/AArch64/chkstk.ll
@@ -1,8 +1,12 @@
 ; RUN: llc -mtriple=aarch64-windows -verify-machineinstrs %s -o - \
 ; RUN:  | FileCheck -check-prefix CHECK-DEFAULT-CODE-MODEL %s
+; RUN: llc -mtriple=aarch64-windows -print-machineinstrs=prologepilog %s -o - 2>&1 \
+; RUN:  | FileCheck -check-prefix CHECK-REGSTATE %s
 
 ; RUN: llc -mtriple=aarch64-windows -verify-machineinstrs -code-model=large %s -o - \
 ; RUN:  | FileCheck -check-prefix CHECK-LARGE-CODE-MODEL %s
+; RUN: llc -mtriple=aarch64-windows -print-machineinstrs=prologepilog -code-model=large %s -o - 2>&1 \
+; RUN:  | FileCheck -check-prefix CHECK-REGSTATE-LARGE %s
 
 define void @check_watermark() {
 entry:
@@ -16,6 +20,8 @@ entry:
 ; CHECK-DEFAULT-CODE-MODEL:     bl __chkstk
 ; CHECK-DEFAULT-CODE-MODEL:     sub sp, sp, x15, lsl #4
 
+; CHECK-REGSTATE: frame-setup BL &__chkstk, implicit-def $lr, implicit $sp, implicit $x15, implicit-def dead $x16, implicit-def dead $x17, implicit-def dead $nzcv
+
 ; CHECK-LARGE-CODE-MODEL: check_watermark:
 ; CHECK-LARGE-CODE-MODEL-DAG: stp x29, x30, [sp
 ; CHECK-LARGE-CODE-MODEL-DAG: orr x15, xzr, #0x100
@@ -23,3 +29,5 @@ entry:
 ; CHECK-LARGE-CODE-MODEL-DAG: add x16, x16, __chkstk
 ; CHECK-LARGE-CODE-MODEL:     blr x16
 ; CHECK-LARGE-CODE-MODEL:     sub sp, sp, x15, lsl #4
+
+; CHECK-REGSTATE-LARGE: frame-setup BLR killed $x16, implicit-def $lr, implicit $sp, implicit-def $x15, implicit-def dead $x16, implicit-def dead $x17, implicit-def dead $nzcv
-- 
GitLab


From 2c097a41315c16916e117ce86c8d0265adc6f4d1 Mon Sep 17 00:00:00 2001
From: Sanjin Sijaric <ssijaric@codeaurora.org>
Date: Wed, 31 Oct 2018 09:27:01 +0000
Subject: [PATCH 0787/1116] [ARM64] [Windows] Exception handling support in
 frame lowering

Emit pseudo instructions indicating unwind codes corresponding to each
instruction inside the prologue/epilogue.  These are used by the MCLayer to
populate the .xdata section.

Differential Revision: https://reviews.llvm.org/D50288


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345701 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../AArch64/AArch64CallingConvention.td       |   8 +
 lib/Target/AArch64/AArch64FrameLowering.cpp   | 410 +++++++++++++++---
 lib/Target/AArch64/AArch64InstrInfo.cpp       |  23 +-
 lib/Target/AArch64/AArch64InstrInfo.h         |   2 +-
 lib/Target/AArch64/AArch64RegisterInfo.cpp    |   2 +
 lib/Target/AArch64/AArch64RegisterInfo.h      |   5 +
 test/CodeGen/AArch64/chkstk.ll                |   4 +-
 test/CodeGen/AArch64/win64_vararg.ll          |  68 +--
 test/CodeGen/AArch64/wineh-frame0.mir         |  60 +++
 test/CodeGen/AArch64/wineh-frame1.mir         |  94 ++++
 test/CodeGen/AArch64/wineh-frame2.mir         |  72 +++
 test/CodeGen/AArch64/wineh-frame3.mir         |  59 +++
 test/CodeGen/AArch64/wineh-frame4.mir         |  59 +++
 test/CodeGen/AArch64/wineh-frame5.mir         | 135 ++++++
 test/CodeGen/AArch64/wineh-frame6.mir         | 150 +++++++
 test/CodeGen/AArch64/wineh-frame7.mir         | 189 ++++++++
 test/CodeGen/AArch64/wineh-frame8.mir         |  88 ++++
 17 files changed, 1341 insertions(+), 87 deletions(-)
 create mode 100644 test/CodeGen/AArch64/wineh-frame0.mir
 create mode 100644 test/CodeGen/AArch64/wineh-frame1.mir
 create mode 100644 test/CodeGen/AArch64/wineh-frame2.mir
 create mode 100644 test/CodeGen/AArch64/wineh-frame3.mir
 create mode 100644 test/CodeGen/AArch64/wineh-frame4.mir
 create mode 100644 test/CodeGen/AArch64/wineh-frame5.mir
 create mode 100644 test/CodeGen/AArch64/wineh-frame6.mir
 create mode 100644 test/CodeGen/AArch64/wineh-frame7.mir
 create mode 100644 test/CodeGen/AArch64/wineh-frame8.mir

diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td
index 91fe3f237af..2f6cb4c8670 100644
--- a/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/lib/Target/AArch64/AArch64CallingConvention.td
@@ -288,6 +288,14 @@ def CSR_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22,
                                            D8,  D9,  D10, D11,
                                            D12, D13, D14, D15)>;
 
+// Win64 has unwinding codes for an (FP,LR) pair, save_fplr and save_fplr_x.
+// We put FP before LR, so that frame lowering logic generates (FP,LR) pairs,
+// and not (LR,FP) pairs.
+def CSR_Win_AArch64_AAPCS : CalleeSavedRegs<(add FP, LR, X19, X20, X21, X22,
+                                               X23, X24, X25, X26, X27, X28,
+                                               D8, D9, D10, D11,
+                                               D12, D13, D14, D15)>;
+
 // AArch64 PCS for vector functions (VPCS)
 // must (additionally) preserve full Q8-Q23 registers
 def CSR_AArch64_AAVPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22,
diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp
index 974377d3f62..9c85001481d 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -115,11 +115,13 @@
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/WinEHFuncInfo.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/Function.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCDwarf.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -434,12 +436,154 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
   return true;
 }
 
+// Given a load or a store instruction, generate an appropriate unwinding SEH
+// code on Windows.
+static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI,
+                                             const TargetInstrInfo &TII,
+                                             MachineInstr::MIFlag Flag) {
+  unsigned Opc = MBBI->getOpcode();
+  MachineBasicBlock *MBB = MBBI->getParent();
+  MachineFunction &MF = *MBB->getParent();
+  DebugLoc DL = MBBI->getDebugLoc();
+  unsigned ImmIdx = MBBI->getNumOperands() - 1;
+  int Imm = MBBI->getOperand(ImmIdx).getImm();
+  MachineInstrBuilder MIB;
+  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+  const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+
+  switch (Opc) {
+  default:
+    llvm_unreachable("No SEH Opcode for this instruction");
+  case AArch64::LDPDpost:
+    Imm = -Imm;
+    LLVM_FALLTHROUGH;
+  case AArch64::STPDpre: {
+    unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
+    unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(2).getReg());
+    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFRegP_X))
+              .addImm(Reg0)
+              .addImm(Reg1)
+              .addImm(Imm * 8)
+              .setMIFlag(Flag);
+    break;
+  }
+  case AArch64::LDPXpost:
+    Imm = -Imm;
+    LLVM_FALLTHROUGH;
+  case AArch64::STPXpre: {
+    unsigned Reg0 = MBBI->getOperand(1).getReg();
+    unsigned Reg1 = MBBI->getOperand(2).getReg();
+    if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
+      MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR_X))
+                .addImm(Imm * 8)
+                .setMIFlag(Flag);
+    else
+      MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP_X))
+                .addImm(RegInfo->getSEHRegNum(Reg0))
+                .addImm(RegInfo->getSEHRegNum(Reg1))
+                .addImm(Imm * 8)
+                .setMIFlag(Flag);
+    break;
+  }
+  case AArch64::LDRDpost:
+    Imm = -Imm;
+    LLVM_FALLTHROUGH;
+  case AArch64::STRDpre: {
+    unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
+    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFReg_X))
+              .addImm(Reg)
+              .addImm(Imm)
+              .setMIFlag(Flag);
+    break;
+  }
+  case AArch64::LDRXpost:
+    Imm = -Imm;
+    LLVM_FALLTHROUGH;
+  case AArch64::STRXpre: {
+    unsigned Reg =  RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
+    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg_X))
+              .addImm(Reg)
+              .addImm(Imm)
+              .setMIFlag(Flag);
+    break;
+  }
+  case AArch64::STPDi:
+  case AArch64::LDPDi: {
+    unsigned Reg0 =  RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
+    unsigned Reg1 =  RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
+    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFRegP))
+              .addImm(Reg0)
+              .addImm(Reg1)
+              .addImm(Imm * 8)
+              .setMIFlag(Flag);
+    break;
+  }
+  case AArch64::STPXi:
+  case AArch64::LDPXi: {
+    unsigned Reg0 = MBBI->getOperand(0).getReg();
+    unsigned Reg1 = MBBI->getOperand(1).getReg();
+    if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
+      MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR))
+                .addImm(Imm * 8)
+                .setMIFlag(Flag);
+    else
+      MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP))
+                .addImm(RegInfo->getSEHRegNum(Reg0))
+                .addImm(RegInfo->getSEHRegNum(Reg1))
+                .addImm(Imm * 8)
+                .setMIFlag(Flag);
+    break;
+  }
+  case AArch64::STRXui:
+  case AArch64::LDRXui: {
+    int Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
+    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg))
+              .addImm(Reg)
+              .addImm(Imm * 8)
+              .setMIFlag(Flag);
+    break;
+  }
+  case AArch64::STRDui:
+  case AArch64::LDRDui: {
+    unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
+    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFReg))
+              .addImm(Reg)
+              .addImm(Imm * 8)
+              .setMIFlag(Flag);
+    break;
+  }
+  }
+  auto I = MBB->insertAfter(MBBI, MIB);
+  return I;
+}
+
+// Fix up the SEH opcode associated with the save/restore instruction.
+static void fixupSEHOpcode(MachineBasicBlock::iterator MBBI,
+                           unsigned LocalStackSize) {
+  MachineOperand *ImmOpnd = nullptr;
+  unsigned ImmIdx = MBBI->getNumOperands() - 1;
+  switch (MBBI->getOpcode()) {
+  default:
+    llvm_unreachable("Fix the offset in the SEH instruction");
+  case AArch64::SEH_SaveFPLR:
+  case AArch64::SEH_SaveRegP:
+  case AArch64::SEH_SaveReg:
+  case AArch64::SEH_SaveFRegP:
+  case AArch64::SEH_SaveFReg:
+    ImmOpnd = &MBBI->getOperand(ImmIdx);
+    break;
+  }
+  if (ImmOpnd)
+    ImmOpnd->setImm(ImmOpnd->getImm() + LocalStackSize);
+}
+
 // Convert callee-save register save/restore instruction to do stack pointer
 // decrement/increment to allocate/deallocate the callee-save stack area by
 // converting store/load to use pre/post increment version.
 static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-    const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc) {
+    const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc,
+    bool NeedsWinCFI, bool InProlog = true) {
   // Ignore instructions that do not operate on SP, i.e. shadow call stack
   // instructions.
   while (MBBI->getOpcode() == AArch64::STRXpost ||
@@ -447,7 +591,6 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
     assert(MBBI->getOperand(0).getReg() != AArch64::SP);
     ++MBBI;
   }
-
   unsigned NewOpc;
   int Scale = 1;
   switch (MBBI->getOpcode()) {
@@ -496,6 +639,12 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
     NewOpc = AArch64::LDRQpost;
     break;
   }
+  // Get rid of the SEH code associated with the old instruction.
+  if (NeedsWinCFI) {
+    auto SEH = std::next(MBBI);
+    if (AArch64InstrInfo::isSEHInstruction(*SEH))
+      SEH->eraseFromParent();
+  }
 
   MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
   MIB.addReg(AArch64::SP, RegState::Define);
@@ -517,13 +666,22 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
   MIB.setMIFlags(MBBI->getFlags());
   MIB.setMemRefs(MBBI->memoperands());
 
+  // Generate a new SEH code that corresponds to the new instruction.
+  if (NeedsWinCFI)
+    InsertSEH(*MIB, *TII,
+              InProlog ? MachineInstr::FrameSetup : MachineInstr::FrameDestroy);
+
   return std::prev(MBB.erase(MBBI));
 }
 
 // Fixup callee-save register save/restore instructions to take into account
 // combined SP bump by adding the local stack size to the stack offsets.
 static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
-                                              unsigned LocalStackSize) {
+                                              unsigned LocalStackSize,
+                                              bool NeedsWinCFI) {
+  if (AArch64InstrInfo::isSEHInstruction(MI))
+    return;
+
   unsigned Opc = MI.getOpcode();
 
   // Ignore instructions that do not operate on SP, i.e. shadow call stack
@@ -563,6 +721,14 @@ static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
   // All generated opcodes have scaled offsets.
   assert(LocalStackSize % Scale == 0);
   OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / Scale);
+
+  if (NeedsWinCFI) {
+    auto MBBI = std::next(MachineBasicBlock::iterator(MI));
+    assert(MBBI != MI.getParent()->end() && "Expecting a valid instruction");
+    assert(AArch64InstrInfo::isSEHInstruction(*MBBI) &&
+           "Expecting a SEH instruction");
+    fixupSEHOpcode(MBBI, LocalStackSize);
+  }
 }
 
 static void adaptForLdStOpt(MachineBasicBlock &MBB,
@@ -618,9 +784,12 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   MachineModuleInfo &MMI = MF.getMMI();
   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
-  bool needsFrameMoves = MMI.hasDebugInfo() || F.needsUnwindTableEntry();
+  bool needsFrameMoves = (MMI.hasDebugInfo() || F.needsUnwindTableEntry()) &&
+                         !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
   bool HasFP = hasFP(MF);
-
+  bool NeedsWinCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
+                     F.needsUnwindTableEntry();
+  MF.setHasWinCFI(NeedsWinCFI);
   // At this point, we're going to decide whether or not the function uses a
   // redzone. In most cases, the function doesn't have a redzone so let's
   // assume that's false and set it to true in the case that there's a redzone.
@@ -645,10 +814,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
   int NumBytes = (int)MFI.getStackSize();
   if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) {
     assert(!HasFP && "unexpected function without stack frame but with FP");
-
     // All of the stack allocation is for locals.
     AFI->setLocalStackSize(NumBytes);
-
     if (!NumBytes)
       return;
     // REDZONE: If the stack size is less than 128 bytes, we don't need
@@ -658,17 +825,23 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
       ++NumRedZoneFunctions;
     } else {
       emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII,
-                      MachineInstr::FrameSetup);
-
-      // Label used to tie together the PROLOG_LABEL and the MachineMoves.
-      MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
-      // Encode the stack size of the leaf function.
-      unsigned CFIIndex = MF.addFrameInst(
-          MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes));
-      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
-          .addCFIIndex(CFIIndex)
-          .setMIFlags(MachineInstr::FrameSetup);
+                      MachineInstr::FrameSetup, false, NeedsWinCFI);
+      if (!NeedsWinCFI) {
+        // Label used to tie together the PROLOG_LABEL and the MachineMoves.
+        MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
+        // Encode the stack size of the leaf function.
+        unsigned CFIIndex = MF.addFrameInst(
+            MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes));
+        BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+            .addCFIIndex(CFIIndex)
+            .setMIFlags(MachineInstr::FrameSetup);
+      }
     }
+
+    if (NeedsWinCFI)
+      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
+          .setMIFlag(MachineInstr::FrameSetup);
+
     return;
   }
 
@@ -679,15 +852,14 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
   auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
   // All of the remaining stack allocations are for locals.
   AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
-
   bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
   if (CombineSPBump) {
     emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII,
-                    MachineInstr::FrameSetup);
+                    MachineInstr::FrameSetup, false, NeedsWinCFI);
     NumBytes = 0;
   } else if (PrologueSaveSize != 0) {
-    MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(MBB, MBBI, DL, TII,
-                                                     -PrologueSaveSize);
+    MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(
+        MBB, MBBI, DL, TII, -PrologueSaveSize, NeedsWinCFI);
     NumBytes -= PrologueSaveSize;
   }
   assert(NumBytes >= 0 && "Negative stack allocation size!?");
@@ -698,9 +870,11 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
   MachineBasicBlock::iterator End = MBB.end();
   while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup)) {
     if (CombineSPBump)
-      fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize());
+      fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(),
+                                        NeedsWinCFI);
     ++MBBI;
   }
+
   if (HasFP) {
     // Only set up FP if we actually need to. Frame pointer is fp =
     // sp - fixedobject - 16.
@@ -713,15 +887,42 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
     // Note: All stores of callee-saved registers are marked as "FrameSetup".
     // This code marks the instruction(s) that set the FP also.
     emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP, FPOffset, TII,
-                    MachineInstr::FrameSetup);
+                    MachineInstr::FrameSetup, false, NeedsWinCFI);
   }
 
   if (windowsRequiresStackProbe(MF, NumBytes)) {
     uint32_t NumWords = NumBytes >> 4;
-
-    BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), AArch64::X15)
-        .addImm(NumWords)
-        .setMIFlags(MachineInstr::FrameSetup);
+    if (NeedsWinCFI) {
+      // alloc_l can hold at most 256MB, so assume that NumBytes doesn't
+      // exceed this amount.  We need to move at most 2^24 - 1 into x15.
+      // This is at most two instructions, MOVZ follwed by MOVK.
+      // TODO: Fix to use multiple stack alloc unwind codes for stacks
+      // exceeding 256MB in size.
+      if (NumBytes >= (1 << 28))
+        report_fatal_error("Stack size cannot exceed 256MB for stack "
+                            "unwinding purposes");
+
+      uint32_t LowNumWords = NumWords & 0xFFFF;
+      BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVZXi), AArch64::X15)
+            .addImm(LowNumWords)
+            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
+            .setMIFlag(MachineInstr::FrameSetup);
+      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
+            .setMIFlag(MachineInstr::FrameSetup);
+      if ((NumWords & 0xFFFF0000) != 0) {
+          BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVKXi), AArch64::X15)
+              .addReg(AArch64::X15)
+              .addImm((NumWords & 0xFFFF0000) >> 16) // High half
+              .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16))
+              .setMIFlag(MachineInstr::FrameSetup);
+          BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
+            .setMIFlag(MachineInstr::FrameSetup);
+      }
+    } else {
+      BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), AArch64::X15)
+          .addImm(NumWords)
+          .setMIFlags(MachineInstr::FrameSetup);
+    }
 
     switch (MF.getTarget().getCodeModel()) {
     case CodeModel::Tiny:
@@ -735,6 +936,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
           .addReg(AArch64::X17, RegState::Implicit | RegState::Define | RegState::Dead)
           .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define | RegState::Dead)
           .setMIFlags(MachineInstr::FrameSetup);
+      if (NeedsWinCFI)
+        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
+            .setMIFlag(MachineInstr::FrameSetup);
       break;
     case CodeModel::Large:
       BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVaddrEXT))
@@ -742,6 +946,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
           .addExternalSymbol("__chkstk")
           .addExternalSymbol("__chkstk")
           .setMIFlags(MachineInstr::FrameSetup);
+      if (NeedsWinCFI)
+        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
+            .setMIFlag(MachineInstr::FrameSetup);
 
       BuildMI(MBB, MBBI, DL, TII->get(AArch64::BLR))
           .addReg(AArch64::X16, RegState::Kill)
@@ -750,6 +957,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
           .addReg(AArch64::X17, RegState::Implicit | RegState::Define | RegState::Dead)
           .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define | RegState::Dead)
           .setMIFlags(MachineInstr::FrameSetup);
+      if (NeedsWinCFI)
+        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
+            .setMIFlag(MachineInstr::FrameSetup);
       break;
     }
 
@@ -758,6 +968,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
         .addReg(AArch64::X15, RegState::Kill)
         .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 4))
         .setMIFlags(MachineInstr::FrameSetup);
+    if (NeedsWinCFI)
+       BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
+            .addImm(NumBytes)
+            .setMIFlag(MachineInstr::FrameSetup);
     NumBytes = 0;
   }
 
@@ -777,7 +991,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
       // the correct value here, as NumBytes also includes padding bytes,
       // which shouldn't be counted here.
       emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, -NumBytes, TII,
-                      MachineInstr::FrameSetup);
+                      MachineInstr::FrameSetup, false, NeedsWinCFI);
 
     if (NeedsRealignment) {
       const unsigned Alignment = MFI.getMaxAlignment();
@@ -800,6 +1014,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
           .addReg(scratchSPReg, RegState::Kill)
           .addImm(andMaskEncoded);
       AFI->setStackRealigned(true);
+      if (NeedsWinCFI)
+        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
+            .addImm(NumBytes & andMaskEncoded)
+            .setMIFlag(MachineInstr::FrameSetup);
     }
   }
 
@@ -813,8 +1031,17 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
   if (RegInfo->hasBasePointer(MF)) {
     TII->copyPhysReg(MBB, MBBI, DL, RegInfo->getBaseRegister(), AArch64::SP,
                      false);
+    if (NeedsWinCFI)
+      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
+          .setMIFlag(MachineInstr::FrameSetup);
   }
 
+  // The very last FrameSetup instruction indicates the end of prologue. Emit a
+  // SEH opcode indicating the prologue end.
+  if (NeedsWinCFI)
+    BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
+        .setMIFlag(MachineInstr::FrameSetup);
+
   if (needsFrameMoves) {
     const DataLayout &TD = MF.getDataLayout();
     const int StackGrowth = -TD.getPointerSize(0);
@@ -946,6 +1173,9 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   DebugLoc DL;
   bool IsTailCallReturn = false;
+  bool NeedsWinCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
+                     MF.getFunction().needsUnwindTableEntry();
+
   if (MBB.end() != MBBI) {
     DL = MBBI->getDebugLoc();
     unsigned RetOpcode = MBBI->getOpcode();
@@ -953,8 +1183,9 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
                        RetOpcode == AArch64::TCRETURNri ||
                        RetOpcode == AArch64::TCRETURNriBTI;
   }
+
   int NumBytes = MFI.getStackSize();
-  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
 
   // All calls are tail calls in GHC calling conv, and functions have no
   // prologue/epilogue.
@@ -1019,14 +1250,16 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
 
   if (!CombineSPBump && PrologueSaveSize != 0) {
     MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator());
+    while (AArch64InstrInfo::isSEHInstruction(*Pop))
+      Pop = std::prev(Pop);
     // Converting the last ldp to a post-index ldp is valid only if the last
     // ldp's offset is 0.
     const MachineOperand &OffsetOp = Pop->getOperand(Pop->getNumOperands() - 1);
     // If the offset is 0, convert it to a post-index ldp.
-    if (OffsetOp.getImm() == 0) {
-      convertCalleeSaveRestoreToSPPrePostIncDec(MBB, Pop, DL, TII,
-                                                PrologueSaveSize);
-    } else {
+    if (OffsetOp.getImm() == 0)
+      convertCalleeSaveRestoreToSPPrePostIncDec(
+          MBB, Pop, DL, TII, PrologueSaveSize, NeedsWinCFI, false);
+    else {
       // If not, make sure to emit an add after the last ldp.
       // We're doing this by transfering the size to be restored from the
       // adjustment *before* the CSR pops to the adjustment *after* the CSR
@@ -1046,14 +1279,23 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
       ++LastPopI;
       break;
     } else if (CombineSPBump)
-      fixupCalleeSaveRestoreStackOffset(*LastPopI, AFI->getLocalStackSize());
+      fixupCalleeSaveRestoreStackOffset(*LastPopI, AFI->getLocalStackSize(),
+                                        NeedsWinCFI);
   }
 
+  if (NeedsWinCFI)
+    BuildMI(MBB, LastPopI, DL, TII->get(AArch64::SEH_EpilogStart))
+        .setMIFlag(MachineInstr::FrameDestroy);
+
   // If there is a single SP update, insert it before the ret and we're done.
   if (CombineSPBump) {
     emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
-                    NumBytes + AfterCSRPopSize, TII,
-                    MachineInstr::FrameDestroy);
+                    NumBytes + AfterCSRPopSize, TII, MachineInstr::FrameDestroy,
+                    false, NeedsWinCFI);
+    if (NeedsWinCFI)
+      BuildMI(MBB, MBB.getFirstTerminator(), DL,
+              TII->get(AArch64::SEH_EpilogEnd))
+          .setMIFlag(MachineInstr::FrameDestroy);
     return;
   }
 
@@ -1081,9 +1323,15 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
       adaptForLdStOpt(MBB, MBB.getFirstTerminator(), LastPopI);
 
     emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
-                    StackRestoreBytes, TII, MachineInstr::FrameDestroy);
-    if (Done)
+                    StackRestoreBytes, TII, MachineInstr::FrameDestroy, false,
+                    NeedsWinCFI);
+    if (Done) {
+      if (NeedsWinCFI)
+        BuildMI(MBB, MBB.getFirstTerminator(), DL,
+                TII->get(AArch64::SEH_EpilogEnd))
+            .setMIFlag(MachineInstr::FrameDestroy);
       return;
+    }
 
     NumBytes = 0;
   }
@@ -1095,10 +1343,10 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
   if (MFI.hasVarSizedObjects() || AFI->isStackRealigned())
     emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP,
                     -AFI->getCalleeSavedStackSize() + 16, TII,
-                    MachineInstr::FrameDestroy);
+                    MachineInstr::FrameDestroy, false, NeedsWinCFI);
   else if (NumBytes)
     emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, NumBytes, TII,
-                    MachineInstr::FrameDestroy);
+                    MachineInstr::FrameDestroy, false, NeedsWinCFI);
 
   // This must be placed after the callee-save restore code because that code
   // assumes the SP is at the same location as it was after the callee-save save
@@ -1119,8 +1367,12 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
     adaptForLdStOpt(MBB, FirstSPPopI, LastPopI);
 
     emitFrameOffset(MBB, FirstSPPopI, DL, AArch64::SP, AArch64::SP,
-                    AfterCSRPopSize, TII, MachineInstr::FrameDestroy);
+                    AfterCSRPopSize, TII, MachineInstr::FrameDestroy, false,
+                    NeedsWinCFI);
   }
+  if (NeedsWinCFI)
+    BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd))
+        .setMIFlag(MachineInstr::FrameDestroy);
 }
 
 /// getFrameIndexReference - Provide a base+offset reference to an FI slot for
@@ -1245,6 +1497,23 @@ static bool produceCompactUnwindFrame(MachineFunction &MF) {
            Attrs.hasAttrSomewhere(Attribute::SwiftError));
 }
 
+static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2,
+                                             bool NeedsWinCFI) {
+  // If we are generating register pairs for a Windows function that requires
+  // EH support, then pair consecutive registers only.  There are no unwind
+  // opcodes for saves/restores of non-consectuve register pairs.
+  // The unwind opcodes are save_regp, save_regp_x, save_fregp, save_frepg_x.
+  // https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling
+
+  // TODO: LR can be paired with any register.  We don't support this yet in
+  // the MCLayer.  We need to add support for the save_lrpair unwind code.
+  if (!NeedsWinCFI)
+    return false;
+  if (Reg2 == Reg1 + 1)
+    return false;
+  return true;
+}
+
 namespace {
 
 struct RegPairInfo {
@@ -1269,6 +1538,8 @@ static void computeCalleeSaveRegisterPairs(
   if (CSI.empty())
     return;
 
+  bool NeedsWinCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
+                     MF.getFunction().needsUnwindTableEntry();
   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
   MachineFrameInfo &MFI = MF.getFrameInfo();
   CallingConv::ID CC = MF.getFunction().getCallingConv();
@@ -1281,7 +1552,11 @@ static void computeCalleeSaveRegisterPairs(
           (Count & 1) == 0) &&
          "Odd number of callee-saved regs to spill!");
   int Offset = AFI->getCalleeSavedStackSize();
-
+  // On Linux, we will have either one or zero non-paired register.  On Windows
+  // with CFI, we can have multiple unpaired registers in order to utilize the
+  // available unwind codes.  This flag assures that the alignment fixup is done
+  // only once, as intened.
+  bool FixupDone = false;
   for (unsigned i = 0; i < Count; ++i) {
     RegPairInfo RPI;
     RPI.Reg1 = CSI[i].getReg();
@@ -1300,11 +1575,13 @@ static void computeCalleeSaveRegisterPairs(
       unsigned NextReg = CSI[i + 1].getReg();
       switch (RPI.Type) {
       case RegPairInfo::GPR:
-        if (AArch64::GPR64RegClass.contains(NextReg))
+        if (AArch64::GPR64RegClass.contains(NextReg) &&
+            !invalidateWindowsRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI))
           RPI.Reg2 = NextReg;
         break;
       case RegPairInfo::FPR64:
-        if (AArch64::FPR64RegClass.contains(NextReg))
+        if (AArch64::FPR64RegClass.contains(NextReg) &&
+            !invalidateWindowsRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI))
           RPI.Reg2 = NextReg;
         break;
       case RegPairInfo::FPR128:
@@ -1349,8 +1626,9 @@ static void computeCalleeSaveRegisterPairs(
 
     // Round up size of non-pair to pair size if we need to pad the
     // callee-save area to ensure 16-byte alignment.
-    if (AFI->hasCalleeSaveStackFreeSpace() &&
+    if (AFI->hasCalleeSaveStackFreeSpace() && !FixupDone &&
         RPI.Type != RegPairInfo::FPR128 && !RPI.isPaired()) {
+      FixupDone = true;
       Offset -= 8;
       assert(Offset % 16 == 0);
       assert(MFI.getObjectAlignment(RPI.FrameIdx) <= 16);
@@ -1374,6 +1652,8 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
     const TargetRegisterInfo *TRI) const {
   MachineFunction &MF = *MBB.getParent();
   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  bool NeedsWinCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
+                     MF.getFunction().needsUnwindTableEntry();
   DebugLoc DL;
   SmallVector<RegPairInfo, 8> RegPairs;
 
@@ -1391,6 +1671,10 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
         .addImm(8)
         .setMIFlag(MachineInstr::FrameSetup);
 
+    if (NeedsWinCFI)
+      BuildMI(MBB, MI, DL, TII.get(AArch64::SEH_Nop))
+          .setMIFlag(MachineInstr::FrameSetup);
+
     // This instruction also makes x18 live-in to the entry block.
     MBB.addLiveIn(AArch64::X18);
   }
@@ -1436,6 +1720,17 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
                if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
                dbgs() << ")\n");
 
+    assert((!NeedsWinCFI || !(Reg1 == AArch64::LR && Reg2 == AArch64::FP)) &&
+           "Windows unwdinding requires a consecutive (FP,LR) pair");
+    // Windows unwind codes require consecutive registers if registers are
+    // paired.  Make the switch here, so that the code below will save (x,x+1)
+    // and not (x+1,x).
+    unsigned FrameIdxReg1 = RPI.FrameIdx;
+    unsigned FrameIdxReg2 = RPI.FrameIdx + 1;
+    if (NeedsWinCFI && RPI.isPaired()) {
+      std::swap(Reg1, Reg2);
+      std::swap(FrameIdxReg1, FrameIdxReg2);
+    }
     MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
     if (!MRI.isReserved(Reg1))
       MBB.addLiveIn(Reg1);
@@ -1444,7 +1739,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
         MBB.addLiveIn(Reg2);
       MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
       MIB.addMemOperand(MF.getMachineMemOperand(
-          MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx + 1),
+          MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
           MachineMemOperand::MOStore, Size, Align));
     }
     MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
@@ -1453,8 +1748,11 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
                             // where factor*scale is implicit
         .setMIFlag(MachineInstr::FrameSetup);
     MIB.addMemOperand(MF.getMachineMemOperand(
-        MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx),
+        MachinePointerInfo::getFixedStack(MF,FrameIdxReg1),
         MachineMemOperand::MOStore, Size, Align));
+    if (NeedsWinCFI)
+      InsertSEH(MIB, TII, MachineInstr::FrameSetup);
+
   }
   return true;
 }
@@ -1467,6 +1765,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   DebugLoc DL;
   SmallVector<RegPairInfo, 8> RegPairs;
+  bool NeedsWinCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
+                     MF.getFunction().needsUnwindTableEntry();
 
   if (MI != MBB.end())
     DL = MI->getDebugLoc();
@@ -1512,11 +1812,20 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
                if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
                dbgs() << ")\n");
 
+    // Windows unwind codes require consecutive registers if registers are
+    // paired.  Make the switch here, so that the code below will save (x,x+1)
+    // and not (x+1,x).
+    unsigned FrameIdxReg1 = RPI.FrameIdx;
+    unsigned FrameIdxReg2 = RPI.FrameIdx + 1;
+    if (NeedsWinCFI && RPI.isPaired()) {
+      std::swap(Reg1, Reg2);
+      std::swap(FrameIdxReg1, FrameIdxReg2);
+    }
     MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc));
     if (RPI.isPaired()) {
       MIB.addReg(Reg2, getDefRegState(true));
       MIB.addMemOperand(MF.getMachineMemOperand(
-          MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx + 1),
+          MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
           MachineMemOperand::MOLoad, Size, Align));
     }
     MIB.addReg(Reg1, getDefRegState(true))
@@ -1525,10 +1834,11 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
                             // where factor*scale is implicit
         .setMIFlag(MachineInstr::FrameDestroy);
     MIB.addMemOperand(MF.getMachineMemOperand(
-        MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx),
+        MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
         MachineMemOperand::MOLoad, Size, Align));
+    if (NeedsWinCFI)
+      InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
   };
-
   if (ReverseCSRRestoreSeq)
     for (const RegPairInfo &RPI : reverse(RegPairs))
       EmitMI(RPI);
diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index 4953892ed4a..c168184beb9 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -3060,7 +3060,8 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
                            unsigned DestReg, unsigned SrcReg, int Offset,
                            const TargetInstrInfo *TII,
-                           MachineInstr::MIFlag Flag, bool SetNZCV) {
+                           MachineInstr::MIFlag Flag, bool SetNZCV,
+                           bool NeedsWinCFI) {
   if (DestReg == SrcReg && Offset == 0)
     return;
 
@@ -3105,6 +3106,11 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB,
         .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftSize))
         .setMIFlag(Flag);
 
+   if (NeedsWinCFI && SrcReg == AArch64::SP && DestReg == AArch64::SP)
+     BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
+         .addImm(ThisVal)
+         .setMIFlag(Flag);
+
     SrcReg = DestReg;
     Offset -= ThisVal;
     if (Offset == 0)
@@ -3115,6 +3121,21 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB,
       .addImm(Offset)
       .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
       .setMIFlag(Flag);
+
+  if (NeedsWinCFI) {
+    if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
+        (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
+      if (Offset == 0)
+        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).
+                setMIFlag(Flag);
+      else
+        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP)).
+                addImm(Offset).setMIFlag(Flag);
+    } else if (DestReg == AArch64::SP) {
+      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc)).
+              addImm(Offset).setMIFlag(Flag);
+    }
+  }
 }
 
 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h
index e8e93e64200..c156df57127 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/lib/Target/AArch64/AArch64InstrInfo.h
@@ -296,7 +296,7 @@ void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                      const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
                      int Offset, const TargetInstrInfo *TII,
                      MachineInstr::MIFlag = MachineInstr::NoFlags,
-                     bool SetNZCV = false);
+                     bool SetNZCV = false,  bool NeedsWinCFI = false);
 
 /// rewriteAArch64FrameIndex - Rewrite MI to access 'Offset' bytes from the
 /// FP. Return false if the offset could not be handled directly in MI, and
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp
index ff1c1c97988..0bab5c05ba6 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -43,6 +43,8 @@ AArch64RegisterInfo::AArch64RegisterInfo(const Triple &TT)
 const MCPhysReg *
 AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
   assert(MF && "Invalid MachineFunction pointer.");
+  if (MF->getSubtarget<AArch64Subtarget>().isTargetWindows())
+    return CSR_Win_AArch64_AAPCS_SaveList;
   if (MF->getFunction().getCallingConv() == CallingConv::GHC)
     // GHC set of callee saved regs is empty as all those regs are
     // used for passing STG regs around
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.h b/lib/Target/AArch64/AArch64RegisterInfo.h
index 1c25a654f24..4653c7af59d 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.h
+++ b/lib/Target/AArch64/AArch64RegisterInfo.h
@@ -30,6 +30,11 @@ class AArch64RegisterInfo final : public AArch64GenRegisterInfo {
 public:
   AArch64RegisterInfo(const Triple &TT);
 
+  // FIXME: This should be tablegen'd like getDwarfRegNum is
+  int getSEHRegNum(unsigned i) const {
+    return getEncodingValue(i);
+  }
+
   bool isReservedReg(const MachineFunction &MF, unsigned Reg) const;
   bool isAnyArgRegReserved(const MachineFunction &MF) const;
   void emitReservedArgRegCallError(const MachineFunction &MF) const;
diff --git a/test/CodeGen/AArch64/chkstk.ll b/test/CodeGen/AArch64/chkstk.ll
index 1037a5fdae0..9689a3b9b58 100644
--- a/test/CodeGen/AArch64/chkstk.ll
+++ b/test/CodeGen/AArch64/chkstk.ll
@@ -16,7 +16,7 @@ entry:
 
 ; CHECK-DEFAULT-CODE-MODEL: check_watermark:
 ; CHECK-DEFAULT-CODE-MODEL-DAG: stp x29, x30, [sp
-; CHECK-DEFAULT-CODE-MODEL-DAG: orr x15, xzr, #0x100
+; CHECK-DEFAULT-CODE-MODEL-DAG: mov x15, #256
 ; CHECK-DEFAULT-CODE-MODEL:     bl __chkstk
 ; CHECK-DEFAULT-CODE-MODEL:     sub sp, sp, x15, lsl #4
 
@@ -24,7 +24,7 @@ entry:
 
 ; CHECK-LARGE-CODE-MODEL: check_watermark:
 ; CHECK-LARGE-CODE-MODEL-DAG: stp x29, x30, [sp
-; CHECK-LARGE-CODE-MODEL-DAG: orr x15, xzr, #0x100
+; CHECK-LARGE-CODE-MODEL-DAG: mov x15, #256
 ; CHECK-LARGE-CODE-MODEL-DAG: adrp x16, __chkstk
 ; CHECK-LARGE-CODE-MODEL-DAG: add x16, x16, __chkstk
 ; CHECK-LARGE-CODE-MODEL:     blr x16
diff --git a/test/CodeGen/AArch64/win64_vararg.ll b/test/CodeGen/AArch64/win64_vararg.ll
index 9cc9f50adb7..38da60b81a5 100644
--- a/test/CodeGen/AArch64/win64_vararg.ll
+++ b/test/CodeGen/AArch64/win64_vararg.ll
@@ -104,7 +104,7 @@ declare i64* @__local_stdio_printf_options() local_unnamed_addr #4
 
 ; CHECK-LABEL: fp
 ; CHECK: str     x21, [sp, #-96]!
-; CHECK: stp     x20, x19, [sp, #16]
+; CHECK: stp     x19, x20, [sp, #16]
 ; CHECK: stp     x29, x30, [sp, #32]
 ; CHECK: add     x29, sp, #32
 ; CHECK: add     x8, x29, #24
@@ -124,10 +124,10 @@ declare i64* @__local_stdio_printf_options() local_unnamed_addr #4
 ; CHECK: mov     x3, x19
 ; CHECK: mov     x4, xzr
 ; CHECK: bl      __stdio_common_vsprintf
-; CHECK: ldp     x29, x30, [sp, #32]
-; CHECK: ldp     x20, x19, [sp, #16]
 ; CHECK: cmp     w0, #0
 ; CHECK: csinv   w0, w0, wzr, ge
+; CHECK: ldp     x29, x30, [sp, #32]
+; CHECK: ldp     x19, x20, [sp, #16]
 ; CHECK: ldr     x21, [sp], #96
 ; CHECK: ret
 define i32 @fp(i8*, i64, i8*, ...) local_unnamed_addr #6 {
@@ -151,8 +151,8 @@ attributes #6 = { "no-frame-pointer-elim"="true" }
 
 ; CHECK-LABEL: vla
 ; CHECK: str     x23, [sp, #-112]!
-; CHECK: stp     x22, x21, [sp, #16]
-; CHECK: stp     x20, x19, [sp, #32]
+; CHECK: stp     x21, x22, [sp, #16]
+; CHECK: stp     x19, x20, [sp, #32]
 ; CHECK: stp     x29, x30, [sp, #48]
 ; CHECK: add     x29, sp, #48
 ; CHECK: add     x8, x29, #16
@@ -183,8 +183,8 @@ attributes #6 = { "no-frame-pointer-elim"="true" }
 ; CHECK: mov     sp, [[REG2]]
 ; CHECK: sub     sp, x29, #48
 ; CHECK: ldp     x29, x30, [sp, #48]
-; CHECK: ldp     x20, x19, [sp, #32]
-; CHECK: ldp     x22, x21, [sp, #16]
+; CHECK: ldp     x19, x20, [sp, #32]
+; CHECK: ldp     x21, x22, [sp, #16]
 ; CHECK: ldr     x23, [sp], #112
 ; CHECK: ret
 define void @vla(i32, i8*, ...) local_unnamed_addr {
@@ -211,32 +211,34 @@ declare i8* @llvm.stacksave()
 declare void @llvm.stackrestore(i8*)
 
 ; CHECK-LABEL: snprintf
-; CHECK: sub     sp,  sp, #96
-; CHECK: stp     x21, x20, [sp, #16]
-; CHECK: stp     x19, x30, [sp, #32]
-; CHECK: add     x8, sp, #56
-; CHECK: mov     x19, x2
-; CHECK: mov     x20, x1
-; CHECK: mov     x21, x0
-; CHECK: stp     x6, x7, [sp, #80]
-; CHECK: stp     x4, x5, [sp, #64]
-; CHECK: str     x3, [sp, #56]
-; CHECK: str     x8, [sp, #8]
-; CHECK: bl      __local_stdio_printf_options
-; CHECK: ldr     x8, [x0]
-; CHECK: add     x5, sp, #56
-; CHECK: mov     x1, x21
-; CHECK: mov     x2, x20
-; CHECK: orr     x0, x8, #0x2
-; CHECK: mov     x3, x19
-; CHECK: mov     x4, xzr
-; CHECK: bl      __stdio_common_vsprintf
-; CHECK: ldp     x19, x30, [sp, #32]
-; CHECK: ldp     x21, x20, [sp, #16]
-; CHECK: cmp     w0, #0
-; CHECK: csinv   w0, w0, wzr, ge
-; CHECK: add     sp, sp, #96
-; CHECK: ret
+; CHECK-DAG: sub     sp,  sp, #96
+; CHECK-DAG: str     x21, [sp, #16]
+; CHECK-DAG: stp     x19, x20, [sp, #24]
+; CHECK-DAG: str     x30, [sp, #40]
+; CHECK-DAG: add     x8, sp, #56
+; CHECK-DAG: mov     x19, x2
+; CHECK-DAG: mov     x20, x1
+; CHECK-DAG: mov     x21, x0
+; CHECK-DAG: stp     x6, x7, [sp, #80]
+; CHECK-DAG: stp     x4, x5, [sp, #64]
+; CHECK-DAG: str     x3, [sp, #56]
+; CHECK-DAG: str     x8, [sp, #8]
+; CHECK-DAG: bl      __local_stdio_printf_options
+; CHECK-DAG: ldr     x8, [x0]
+; CHECK-DAG: add     x5, sp, #56
+; CHECK-DAG: mov     x1, x21
+; CHECK-DAG: mov     x2, x20
+; CHECK-DAG: orr     x0, x8, #0x2
+; CHECK-DAG: mov     x3, x19
+; CHECK-DAG: mov     x4, xzr
+; CHECK-DAG: bl      __stdio_common_vsprintf
+; CHECK-DAG: ldr     x30, [sp, #40]
+; CHECK-DAG: ldp     x19, x20, [sp, #24]
+; CHECK-DAG: ldr     x21, [sp, #16]
+; CHECK-DAG: cmp     w0, #0
+; CHECK-DAG: csinv   w0, w0, wzr, ge
+; CHECK-DAG: add     sp, sp, #96
+; CHECK-DAG: ret
 define i32 @snprintf(i8*, i64, i8*, ...) local_unnamed_addr #5 {
   %4 = alloca i8*, align 8
   %5 = bitcast i8** %4 to i8*
diff --git a/test/CodeGen/AArch64/wineh-frame0.mir b/test/CodeGen/AArch64/wineh-frame0.mir
new file mode 100644
index 00000000000..b59627d7f31
--- /dev/null
+++ b/test/CodeGen/AArch64/wineh-frame0.mir
@@ -0,0 +1,60 @@
+# RUN: llc -o - %s -mtriple=aarch64-windows -start-before=prologepilog \
+# RUN:   -stop-after=prologepilog | FileCheck %s
+# Check save_regp_x, save_regp
+
+# CHECK:        early-clobber $sp = frame-setup STPXpre killed $x27, killed $x28, $sp, -10
+# CHECK-NEXT:   frame-setup SEH_SaveRegP_X 27, 28, -80
+# CHECK-NEXT:   frame-setup STPXi killed $x25, killed $x26, $sp, 2
+# CHECK-NEXT:   frame-setup SEH_SaveRegP 25, 26, 16
+# CHECK-NEXT:   frame-setup STPXi killed $x23, killed $x24, $sp, 4
+# CHECK-NEXT:   frame-setup SEH_SaveRegP 23, 24, 32
+# CHECK-NEXT:   frame-setup STPXi killed $x21, killed $x22, $sp, 6
+# CHECK-NEXT:   frame-setup SEH_SaveRegP 21, 22, 48
+# CHECK-NEXT:   frame-setup STPXi killed $x19, killed $x20, $sp, 8
+# CHECK-NEXT:   frame-setup SEH_SaveRegP 19, 20, 64
+# CHECK-NEXT:   frame-setup SEH_PrologEnd
+# CHECK:        frame-destroy SEH_EpilogStart
+# CHECK-NEXT:   $x19, $x20 = frame-destroy LDPXi $sp, 8
+# CHECK-NEXT:   frame-destroy SEH_SaveRegP 19, 20, 64
+# CHECK-NEXT:   $x21, $x22 = frame-destroy LDPXi $sp, 6
+# CHECK-NEXT:   frame-destroy SEH_SaveRegP 21, 22, 48
+# CHECK-NEXT:   $x23, $x24 = frame-destroy LDPXi $sp, 4
+# CHECK-NEXT:   frame-destroy SEH_SaveRegP 23, 24, 32
+# CHECK-NEXT:   $x25, $x26 = frame-destroy LDPXi $sp, 2
+# CHECK-NEXT:   frame-destroy SEH_SaveRegP 25, 26, 16
+# CHECK-NEXT:   early-clobber $sp, $x27, $x28 = frame-destroy LDPXpost $sp, 10
+# CHECK-NEXT:   frame-destroy SEH_SaveRegP_X 27, 28, -80
+# CHECK-NEXT:   frame-destroy SEH_EpilogEnd
+# CHECK-NEXT:   RET_ReallyLR implicit $x0
+
+...
+---
+name:            test
+alignment:       2
+tracksRegLiveness: true
+hasWinCFI: true
+liveins:
+  - { reg: '$w0' }
+frameInfo:
+  stackSize:       80
+  maxAlignment:    8
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: true
+stack:
+body:             |
+  bb.0.entry:
+    liveins: $x0, $x1, $x27, $x28, $x25, $x26, $x23, $x24, $x21, $x22, $x19, $x20
+    $x19 = ADDXrr $x0, killed $x1
+    $x20 = ADDXrr $x19, killed $x0
+    $x21 = ADDXrr $x20, killed $x19
+    $x22 = ADDXrr $x21, killed $x20
+    $x23 = ADDXrr $x22, killed $x21
+    $x24 = ADDXrr $x23, killed $x22
+    $x25 = ADDXrr $x24, killed $x23
+    $x26 = ADDXrr $x25, killed $x24
+    $x27 = ADDXrr $x26, killed $x25
+    $x28 = ADDXrr $x27, killed $x26
+    $x0 = COPY $x28
+    RET_ReallyLR implicit $x0
+
+...
diff --git a/test/CodeGen/AArch64/wineh-frame1.mir b/test/CodeGen/AArch64/wineh-frame1.mir
new file mode 100644
index 00000000000..deff40160b2
--- /dev/null
+++ b/test/CodeGen/AArch64/wineh-frame1.mir
@@ -0,0 +1,94 @@
+# RUN: llc -o - %s -mtriple=aarch64-windows -start-before=prologepilog \
+# RUN:   -stop-after=prologepilog | FileCheck %s
+# Check save_fregp_x, save_fregp
+
+# CHECK:         early-clobber $sp = frame-setup STPDpre killed $d10, killed $d11, $sp, -14
+# CHECK-NEXT:    frame-setup SEH_SaveFRegP_X 10, 11, -112
+# CHECK-NEXT:    frame-setup STPDi killed $d8, killed $d9, $sp, 2
+# CHECK-NEXT:    frame-setup SEH_SaveFRegP 8, 9, 16
+# CHECK-NEXT:    frame-setup STPXi killed $x27, killed $x28, $sp, 4
+# CHECK-NEXT:    frame-setup SEH_SaveRegP 27, 28, 32
+# CHECK-NEXT:    frame-setup STPXi killed $x25, killed $x26, $sp, 6
+# CHECK-NEXT:    frame-setup SEH_SaveRegP 25, 26, 48
+# CHECK-NEXT:    frame-setup STPXi killed $x23, killed $x24, $sp, 8
+# CHECK-NEXT:    frame-setup SEH_SaveRegP 23, 24, 64
+# CHECK-NEXT:    frame-setup STPXi killed $x21, killed $x22, $sp, 10
+# CHECK-NEXT:    frame-setup SEH_SaveRegP 21, 22, 80
+# CHECK-NEXT:    frame-setup STPXi killed $x19, killed $x20, $sp, 12
+# CHECK-NEXT:    frame-setup SEH_SaveRegP 19, 20, 96
+# CHECK-NEXT:    frame-setup SEH_PrologEnd
+# CHECK:         frame-destroy SEH_EpilogStart
+# CHECK-NEXT:    $x19, $x20 = frame-destroy LDPXi $sp, 12
+# CHECK-NEXT:    frame-destroy SEH_SaveRegP 19, 20, 96
+# CHECK-NEXT:    $x21, $x22 = frame-destroy LDPXi $sp, 10
+# CHECK-NEXT:    frame-destroy SEH_SaveRegP 21, 22, 80
+# CHECK-NEXT:    $x23, $x24 = frame-destroy LDPXi $sp, 8
+# CHECK-NEXT:    frame-destroy SEH_SaveRegP 23, 24, 64
+# CHECK-NEXT:    $x25, $x26 = frame-destroy LDPXi $sp, 6
+# CHECK-NEXT:    frame-destroy SEH_SaveRegP 25, 26, 48
+# CHECK-NEXT:    $x27, $x28 = frame-destroy LDPXi $sp, 4
+# CHECK-NEXT:    frame-destroy SEH_SaveRegP 27, 28, 32
+# CHECK-NEXT:    $d8, $d9 = frame-destroy LDPDi $sp, 2
+# CHECK-NEXT:    frame-destroy SEH_SaveFRegP 8, 9, 16
+# CHECK-NEXT:    early-clobber $sp, $d10, $d11 = frame-destroy LDPDpost $sp, 14
+# CHECK-NEXT:    frame-destroy SEH_SaveFRegP_X 10, 11, -112
+# CHECK-NEXT:    frame-destroy SEH_EpilogEnd
+# CHECK-NEXT:    RET_ReallyLR implicit $x0
+...
+---
+name:            test
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       true
+registers:
+liveins:
+  - { reg: '$w0', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       112
+  offsetAdjustment: 0
+  maxAlignment:    8
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: true
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+constants:
+body:             |
+  bb.0.entry:
+    liveins: $x0, $x1, $d0, $d1, $d10, $d11, $d8, $d9, $x27, $x28, $x25, $x26, $x23, $x24, $x21, $x22, $x19, $x20
+
+    $x19 = ADDXrr $x0, killed $x1
+    $d8 = FADDDrr killed $d0, $d1
+    $d9 = FADDDrr $d8, $d1
+    $d10 = FADDDrr $d9, $d8
+    $d11 = FADDDrr killed $d9, $d10
+    $x20 = ADDXrr $x19, killed $x0
+    $x21 = ADDXrr $x20, killed $x19
+    $x22 = ADDXrr $x21, killed $x20
+    $x23 = ADDXrr $x22, killed $x21
+    $x24 = ADDXrr $x23, killed $x22
+    $x25 = ADDXrr $x24, killed $x23
+    $x26 = ADDXrr $x25, killed $x24
+    $x27 = ADDXrr $x26, killed $x25
+    $x28 = ADDXrr $x27, killed $x26
+    $x0 = COPY $d11
+    $x0 = ADDXrr $x0, killed $x28
+    RET_ReallyLR implicit $x0
+
+...
diff --git a/test/CodeGen/AArch64/wineh-frame2.mir b/test/CodeGen/AArch64/wineh-frame2.mir
new file mode 100644
index 00000000000..ae2aaf7f27d
--- /dev/null
+++ b/test/CodeGen/AArch64/wineh-frame2.mir
@@ -0,0 +1,72 @@
+# RUN: llc -o - %s -mtriple=aarch64-windows -start-before=prologepilog \
+# RUN:   -stop-after=prologepilog | FileCheck %s
+# Check save_freg_x, save_frep, save_reg
+
+# CHECK:       early-clobber $sp = frame-setup STRDpre killed $d12, $sp, -48
+# CHECK-NEXT:  frame-setup SEH_SaveFReg_X 12, -48
+# CHECK-NEXT:  frame-setup STPDi killed $d10, killed $d11, $sp, 1
+# CHECK-NEXT:  frame-setup SEH_SaveFRegP 10, 11, 8
+# CHECK-NEXT:  frame-setup STPDi killed $d8, killed $d9, $sp, 3
+# CHECK-NEXT:  frame-setup SEH_SaveFRegP 8, 9, 24
+# CHECK-NEXT:  frame-setup STRXui killed $x19, $sp, 5
+# CHECK-NEXT:  frame-setup SEH_SaveReg 19, 40
+# CHECK-NEXT:  frame-setup SEH_PrologEnd
+# CHECK:       frame-destroy SEH_EpilogStart
+# CHECK-NEXT:  $x19 = frame-destroy LDRXui $sp, 5
+# CHECK-NEXT:  frame-destroy SEH_SaveReg 19, 40
+# CHECK-NEXT:  $d8, $d9 = frame-destroy LDPDi $sp, 3
+# CHECK-NEXT:  frame-destroy SEH_SaveFRegP 8, 9, 24
+# CHECK-NEXT:  $d10, $d11 = frame-destroy LDPDi $sp, 1
+# CHECK-NEXT:  frame-destroy SEH_SaveFRegP 10, 11, 8
+# CHECK-NEXT:  early-clobber $sp, $d12 = frame-destroy LDRDpost $sp, 48
+# CHECK-NEXT:  frame-destroy SEH_SaveFReg_X 12, -48
+# CHECK-NEXT:  frame-destroy SEH_EpilogEnd
+# CHECK-NEXT:  RET_ReallyLR implicit $x0
+...
+---
+name:            test
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       true
+registers:
+liveins:
+  - { reg: '$w0', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       112
+  offsetAdjustment: 0
+  maxAlignment:    8
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: true
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+constants:
+body:             |
+  bb.0.entry:
+    liveins: $x0, $x1, $d0, $d1, $d10, $d11, $d8, $d9
+    $x19 = ADDXrr $x0, killed $x1
+    $d8 = FADDDrr killed $d0, $d1
+    $d9 = FADDDrr $d8, $d1
+    $d10 = FADDDrr $d9, $d8
+    $d11 = FADDDrr killed $d9, $d10
+    $d12 = FADDDrr $d11, killed $d11
+    $x0 = COPY $d12
+    RET_ReallyLR implicit $x0
+
+...
diff --git a/test/CodeGen/AArch64/wineh-frame3.mir b/test/CodeGen/AArch64/wineh-frame3.mir
new file mode 100644
index 00000000000..d6e927d4bd5
--- /dev/null
+++ b/test/CodeGen/AArch64/wineh-frame3.mir
@@ -0,0 +1,59 @@
+# RUN: llc -o - %s -mtriple=aarch64-windows -start-before=prologepilog \
+# RUN:   -stop-after=prologepilog | FileCheck %s
+# Check save_reg_x, save_reg
+
+# CHECK:      early-clobber $sp = frame-setup STRXpre killed $x22, $sp, -16
+# CHECK-NEXT: frame-setup SEH_SaveReg_X 22, -16
+# CHECK-NEXT: frame-setup STRXui killed $x19, $sp, 1
+# CHECK-NEXT: frame-setup SEH_SaveReg 19, 8
+# CHECK-NEXT: frame-setup SEH_PrologEnd
+# CHECK:      frame-destroy SEH_EpilogStart
+# CHECK-NEXT: $x19 = frame-destroy LDRXui $sp, 1
+# CHECK-NEXT: frame-destroy SEH_SaveReg 19, 8
+# CHECK-NEXT: early-clobber $sp, $x22 = frame-destroy LDRXpost $sp, 16
+# CHECK-NEXT: frame-destroy SEH_SaveReg_X 22, -16
+# CHECK-NEXT: frame-destroy SEH_EpilogEnd
+# CHECK-NEXT: RET_ReallyLR implicit $x0
+...
+---
+name:            test
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       true
+registers:
+liveins:
+  - { reg: '$w0', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       112
+  offsetAdjustment: 0
+  maxAlignment:    8
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: true
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+constants:
+body:             |
+  bb.0.entry:
+    liveins: $x0, $x1
+    $x19 = ADDXrr $x0, killed $x1
+    $x22 = ADDXrr killed $x19, $x0
+    $x0 = COPY killed $x22
+    RET_ReallyLR implicit $x0
+...
diff --git a/test/CodeGen/AArch64/wineh-frame4.mir b/test/CodeGen/AArch64/wineh-frame4.mir
new file mode 100644
index 00000000000..63a8dc67779
--- /dev/null
+++ b/test/CodeGen/AArch64/wineh-frame4.mir
@@ -0,0 +1,59 @@
+# RUN: llc -o - %s -mtriple=aarch64-windows -start-before=prologepilog \
+# RUN:   -stop-after=prologepilog | FileCheck %s
+# Check save_freg_x, save_freg
+
+# CHECK:       early-clobber $sp = frame-setup STRDpre killed $d10, $sp, -16
+# CHECK-NEXT:  frame-setup SEH_SaveFReg_X 10, -16
+# CHECK-NEXT:  frame-setup STRDui killed $d8, $sp, 1 :: (store 8 into %stack.0)
+# CHECK-NEXT:  frame-setup SEH_SaveFReg 8, 8
+# CHECK-NEXT:  frame-setup SEH_PrologEnd
+# CHECK:       frame-destroy SEH_EpilogStart
+# CHECK-NEXT:  $d8 = frame-destroy LDRDui $sp, 1 :: (load 8 from %stack.0)
+# CHECK-NEXT:  frame-destroy SEH_SaveFReg 8, 8
+# CHECK-NEXT:  early-clobber $sp, $d10 = frame-destroy LDRDpost $sp, 16 :: (load 8 from %stack.1)
+# CHECK-NEXT:  frame-destroy SEH_SaveFReg_X 10, -16
+# CHECK-NEXT:  frame-destroy SEH_EpilogEnd
+# CHECK-NEXT:  RET_ReallyLR implicit $x0
+...
+---
+name:            test
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       true
+registers:
+liveins:
+  - { reg: '$w0', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       112
+  offsetAdjustment: 0
+  maxAlignment:    8
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: true
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+constants:
+body:             |
+  bb.0.entry:
+    liveins: $d0, $d1
+    $d8 = FADDDrr $d0, killed $d1
+    $d10 = FADDDrr killed $d8, $d0
+    $x0 = COPY killed $d10
+    RET_ReallyLR implicit $x0
+...
diff --git a/test/CodeGen/AArch64/wineh-frame5.mir b/test/CodeGen/AArch64/wineh-frame5.mir
new file mode 100644
index 00000000000..2a4eed4ca92
--- /dev/null
+++ b/test/CodeGen/AArch64/wineh-frame5.mir
@@ -0,0 +1,135 @@
+# RUN: llc -o - %s -mtriple=aarch64-windows -start-before=prologepilog \
+# RUN:   -stop-after=prologepilog | FileCheck %s
+# Check multiple epilogues, save_reg, save_reg_x.
+
+# CHECK-LABEL:   bb.0.entry:
+# CHECK:         early-clobber $sp = frame-setup STRXpre killed $x28, $sp, -32
+# CHECK-NEXT:    frame-setup SEH_SaveReg_X 28, -32
+# CHECK-NEXT:    frame-setup STRXui killed $x19, $sp, 1
+# CHECK-NEXT:    frame-setup SEH_SaveReg 19, 8
+# CHECK-NEXT:    frame-setup STRXui killed $lr, $sp, 2
+# CHECK-NEXT:    frame-setup SEH_SaveReg 30, 16
+# CHECK-NEXT:    $sp = frame-setup SUBXri $sp, 496, 0
+# CHECK-NEXT:    frame-setup SEH_StackAlloc 496
+# CHECK-NEXT:    frame-setup SEH_PrologEnd
+
+# CHECK-LABEL:   bb.1.if.then:
+# CHECK:         frame-destroy SEH_EpilogStart
+# CHECK-NEXT:    $sp = frame-destroy ADDXri $sp, 496, 0
+# CHECK-NEXT:    frame-destroy SEH_StackAlloc 496
+# CHECK-NEXT:    $lr = frame-destroy LDRXui $sp, 2
+# CHECK-NEXT:    frame-destroy SEH_SaveReg 30, 16
+# CHECK-NEXT:    $x19 = frame-destroy LDRXui $sp, 1
+# CHECK-NEXT:    frame-destroy SEH_SaveReg 19, 8
+# CHECK-NEXT:    early-clobber $sp, $x28 = frame-destroy LDRXpost $sp, 32
+# CHECK-NEXT:    frame-destroy SEH_SaveReg_X 28, -32
+# CHECK-NEXT:    frame-destroy SEH_EpilogEnd
+# CHECK-NEXT:    TCRETURNdi @"?func2@@YAHXZ", 0, csr_aarch64_aapcs, implicit $sp
+
+
+--- |
+  target datalayout = "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128"
+  target triple = "aarch64-unknown-windows-msvc19.11.0"
+
+  define dso_local i32 @"?func@@YAHH@Z"(i32 %i) local_unnamed_addr #0 {
+  entry:
+    %B = alloca [123 x i32], align 4
+    %call = tail call i32 @"?func2@@YAHXZ"()
+    %cmp = icmp sgt i32 %i, 2
+    br i1 %cmp, label %if.then, label %if.else
+
+  if.then:                                          ; preds = %entry
+    %call1 = tail call i32 @"?func2@@YAHXZ"()
+    ret i32 %call1
+
+  if.else:                                          ; preds = %entry
+    %0 = bitcast [123 x i32]* %B to i8*
+    call void @llvm.lifetime.start.p0i8(i64 492, i8* nonnull %0) #3
+    %arraydecay7 = bitcast [123 x i32]* %B to i32*
+    %call2 = call i32 @"?func3@@YAHPEAH@Z"(i32* nonnull %arraydecay7)
+    call void @llvm.lifetime.end.p0i8(i64 492, i8* nonnull %0) #3
+    ret i32 %call2
+  }
+
+  ; Function Attrs: argmemonly nounwind
+  declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+
+  declare dso_local i32 @"?func2@@YAHXZ"() local_unnamed_addr #2
+
+  declare dso_local i32 @"?func3@@YAHPEAH@Z"(i32*) local_unnamed_addr #2
+
+  ; Function Attrs: argmemonly nounwind
+  declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
+
+  ; Function Attrs: nounwind
+  declare void @llvm.stackprotector(i8*, i8**) #3
+
+  attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #1 = { argmemonly nounwind }
+  attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #3 = { nounwind }
+
+...
+---
+name:            '?func@@YAHH@Z'
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+registers:
+liveins:
+  - { reg: '$w0', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    4
+  adjustsStack:    true
+  hasCalls:        true
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  492
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+  - { id: 0, name: B, type: default, offset: 0, size: 492, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -492, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+constants:
+body:             |
+  bb.0.entry:
+    successors: %bb.1(0x40000000), %bb.2(0x40000000)
+    liveins: $w0
+
+    renamable $w19 = COPY $w0
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    BL @"?func2@@YAHXZ", csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def dead $w0
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    dead $wzr = SUBSWri killed renamable $w19, 3, 0, implicit-def $nzcv
+    Bcc 11, %bb.2, implicit killed $nzcv
+    B %bb.1
+
+  bb.1.if.then:
+    TCRETURNdi @"?func2@@YAHXZ", 0, csr_aarch64_aapcs, implicit $sp
+
+  bb.2.if.else:
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    $x0 = ADDXri %stack.0.B, 0, 0
+    BL @"?func3@@YAHPEAH@Z", csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp, implicit-def $w0
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    RET_ReallyLR implicit $w0
+
+...
diff --git a/test/CodeGen/AArch64/wineh-frame6.mir b/test/CodeGen/AArch64/wineh-frame6.mir
new file mode 100644
index 00000000000..b86422e8097
--- /dev/null
+++ b/test/CodeGen/AArch64/wineh-frame6.mir
@@ -0,0 +1,150 @@
+# RUN: llc -o - %s -mtriple=aarch64-windows -start-before=prologepilog \
+# RUN:   -stop-after=prologepilog | FileCheck %s
+# Test that stack probe results in Nop unwind codes in the prologue.  Test
+# save_fplr, save_reg_x and stack_alloc with multiple updates
+
+# CHECK:      early-clobber $sp = frame-setup STPXpre killed $fp, killed $lr, $sp, -2
+# CHECK-NEXT: frame-setup SEH_SaveFPLR_X -16
+# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 0, 0
+# CHECK-NEXT: frame-setup SEH_SetFP
+# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 32, 0
+# CHECK-NEXT: frame-setup SEH_StackAlloc 32
+# CHECK-NEXT: frame-setup SEH_PrologEnd
+# CHECK:      frame-destroy SEH_EpilogStart
+# CHECK-NEXT: $sp = frame-destroy ADDXri $fp, 0, 0
+# CHECK-NEXT: frame-destroy SEH_SetFP
+# CHECK-NEXT: early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2
+# CHECK-NEXT: frame-destroy SEH_SaveFPLR_X -16
+# CHECK-NEXT: frame-destroy SEH_EpilogEnd
+# CHECK-NEXT: RET_ReallyLR implicit killed $w0
+--- |
+  target datalayout = "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128"
+  target triple = "aarch64-unknown-windows-msvc19.11.0"
+
+  ; Function Attrs: noinline optnone
+  define dso_local i32 @"?func@@YAHHHHH@Z"(i32 %n, i32 %idx, i32 %b, i32 %c) #0 {
+  entry:
+    %c.addr = alloca i32, align 4
+    %b.addr = alloca i32, align 4
+    %idx.addr = alloca i32, align 4
+    %n.addr = alloca i32, align 4
+    %a = alloca i32*, align 8
+    store i32 %c, i32* %c.addr, align 4
+    store i32 %b, i32* %b.addr, align 4
+    store i32 %idx, i32* %idx.addr, align 4
+    store i32 %n, i32* %n.addr, align 4
+    %0 = load i32, i32* %n.addr, align 4
+    %conv = sext i32 %0 to i64
+    %1 = alloca i8, i64 %conv, align 16
+    %2 = bitcast i8* %1 to i32*
+    store i32* %2, i32** %a, align 8
+    %3 = load i32*, i32** %a, align 8
+    call void @"?init@@YAXPEAH@Z"(i32* %3)
+    ret i32 0
+  }
+
+  declare dso_local void @"?init@@YAXPEAH@Z"(i32*) #1
+
+  ; Function Attrs: nounwind
+  declare void @llvm.stackprotector(i8*, i8**) #2
+
+  attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #2 = { nounwind }
+
+...
+---
+name:            '?func@@YAHHHHH@Z'
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+registers:
+liveins:
+  - { reg: '$w0', virtual-reg: '' }
+  - { reg: '$w1', virtual-reg: '' }
+  - { reg: '$w2', virtual-reg: '' }
+  - { reg: '$w3', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    8
+  adjustsStack:    true
+  hasCalls:        true
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  24
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+  - { id: 0, name: c.addr, type: default, offset: 0, size: 4, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -4, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 1, name: b.addr, type: default, offset: 0, size: 4, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -8, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 2, name: idx.addr, type: default, offset: 0, size: 4, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -12, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 3, name: n.addr, type: default, offset: 0, size: 4, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -16, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 4, name: a, type: default, offset: 0, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -24, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 5, name: '', type: variable-sized, offset: 0,
+      alignment: 1, stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -24, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 6, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+constants:
+body:             |
+  bb.0.entry:
+    liveins: $w0, $w1, $w2, $w3
+
+    STRWui killed renamable $w3, %stack.0.c.addr, 0 :: (store 4 into %ir.c.addr)
+    STRWui killed renamable $w2, %stack.1.b.addr, 0 :: (store 4 into %ir.b.addr)
+    STRWui killed renamable $w1, %stack.2.idx.addr, 0 :: (store 4 into %ir.idx.addr)
+    STRWui killed renamable $w0, %stack.3.n.addr, 0 :: (store 4 into %ir.n.addr)
+    renamable $x8 = LDRSWui %stack.3.n.addr, 0 :: (dereferenceable load 4 from %ir.n.addr)
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    renamable $x8 = nuw ADDXri killed renamable $x8, 15, 0
+    renamable $x8 = UBFMXri killed renamable $x8, 4, 63
+    $x15 = COPY renamable $x8
+    STRXui killed $x8, %stack.6, 0 :: (store 8 into %stack.6)
+    BL &__chkstk, csr_aarch64_stackprobe_windows, implicit-def dead $lr, implicit $sp, implicit killed $x15
+    renamable $x8 = COPY $sp
+    $x15 = LDRXui %stack.6, 0 :: (load 8 from %stack.6)
+    renamable $x8 = SUBSXrs killed renamable $x8, killed renamable $x15, 4, implicit-def dead $nzcv
+    $sp = COPY renamable $x8
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    STRXui killed renamable $x8, %stack.4.a, 0 :: (store 8 into %ir.a)
+    renamable $x0 = LDRXui %stack.4.a, 0 :: (dereferenceable load 8 from %ir.a)
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    BL @"?init@@YAXPEAH@Z", csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit-def $sp
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    renamable $w1 = COPY $wzr
+    $w0 = COPY killed renamable $w1
+    RET_ReallyLR implicit killed $w0
+
+...
diff --git a/test/CodeGen/AArch64/wineh-frame7.mir b/test/CodeGen/AArch64/wineh-frame7.mir
new file mode 100644
index 00000000000..3e3e79eda63
--- /dev/null
+++ b/test/CodeGen/AArch64/wineh-frame7.mir
@@ -0,0 +1,189 @@
+# RUN: llc -o - %s -mtriple=aarch64-windows -start-before=prologepilog \
+# RUN:   -stop-after=prologepilog | FileCheck %s
+# Test that stack probe results in Nop unwind codes in the prologue.  Test
+# save_fplr, save_reg_x and stack_alloc with multiple updates.
+
+# CHECK:      early-clobber $sp = frame-setup STRXpre killed $x28, $sp, -32
+# CHECK-NEXT: frame-setup SEH_SaveReg_X 28, -32
+# CHECK-NEXT: frame-setup STPXi killed $fp, killed $lr, $sp, 2
+# CHECK-NEXT: frame-setup SEH_SaveFPLR 16
+# CHECK-NEXT: $x15 = frame-setup MOVZXi 56009, 0
+# CHECK-NEXT: frame-setup SEH_Nop
+# CHECK-NEXT: $x15 = frame-setup MOVKXi $x15, 2, 16
+# CHECK-NEXT: frame-setup SEH_Nop
+# CHECK-NEXT: frame-setup BL &__chkstk, implicit-def $lr, implicit $sp, implicit $x15
+# CHECK-NEXT: frame-setup SEH_Nop
+# CHECK-NEXT: $sp = frame-setup SUBXrx64 killed $sp, killed $x15, 28
+# CHECK-NEXT: frame-setup SEH_StackAlloc 2993296
+# CHECK-NEXT: frame-setup SEH_PrologEnd
+# CHECK:      frame-destroy SEH_EpilogStart
+# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 730, 12
+# CHECK-NEXT: frame-destroy SEH_StackAlloc 2990080
+# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 3216, 0
+# CHECK-NEXT: frame-destroy SEH_StackAlloc 3216
+# CHECK-NEXT: $fp, $lr = frame-destroy LDPXi $sp, 2
+# CHECK-NEXT: frame-destroy SEH_SaveFPLR 16
+# CHECK-NEXT: early-clobber $sp, $x28 = frame-destroy LDRXpost $sp, 32
+# CHECK-NEXT: frame-destroy SEH_SaveReg_X 28, -32
+# CHECK-NEXT: frame-destroy SEH_EpilogEnd
+# CHECK-NEXT: RET_ReallyLR implicit killed $w0
+--- |
+  target datalayout = "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128"
+  target triple = "aarch64-unknown-windows-msvc19.11.0"
+
+  ; Function Attrs: noinline optnone
+  define dso_local i32 @"?func@@YAHH@Z"(i32 %i) #0 {
+  entry:
+    %retval = alloca i32, align 4
+    %i.addr = alloca i32, align 4
+    %A = alloca [748193 x i32], align 4
+    %a = alloca i32, align 4
+    %B = alloca [123 x i32], align 4
+    store i32 %i, i32* %i.addr, align 4
+    %0 = load i32, i32* %i.addr, align 4
+    %add = add nsw i32 %0, 2
+    store i32 %add, i32* %a, align 4
+    %call = call i32 @"?func2@@YAHXZ"()
+    %1 = load i32, i32* %i.addr, align 4
+    %cmp = icmp sgt i32 %1, 2
+    br i1 %cmp, label %if.then, label %if.else
+
+  if.then:                                          ; preds = %entry
+    %call1 = call i32 @"?func2@@YAHXZ"()
+    store i32 %call1, i32* %retval, align 4
+    br label %return
+
+  if.else:                                          ; preds = %entry
+    %arraydecay = getelementptr inbounds [123 x i32], [123 x i32]* %B, i32 0, i32 0
+    %call2 = call i32 @"?func3@@YAHPEAH@Z"(i32* %arraydecay)
+    store i32 %call2, i32* %retval, align 4
+    br label %return
+
+  return:                                           ; preds = %if.else, %if.then
+    %2 = load i32, i32* %retval, align 4
+    ret i32 %2
+  }
+
+  declare dso_local i32 @"?func2@@YAHXZ"() #1
+
+  declare dso_local i32 @"?func3@@YAHPEAH@Z"(i32*) #1
+
+  ; Function Attrs: nounwind
+  declare void @llvm.stackprotector(i8*, i8**) #2
+
+  attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #2 = { nounwind }
+
+...
+---
+name:            '?func@@YAHH@Z'
+alignment:       2
+exposesReturnsTwice: false
+legalized:       true
+regBankSelected: true
+selected:        true
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+registers:
+liveins:
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    8
+  adjustsStack:    true
+  hasCalls:        true
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  2993276
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+  - { id: 0, name: retval, type: default, offset: 0, size: 4, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -4, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 1, name: i.addr, type: default, offset: 0, size: 4, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -8, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 2, name: A, type: default, offset: 0, size: 2992772, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -2992780, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 3, name: a, type: default, offset: 0, size: 4, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -2992784, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 4, name: B, type: default, offset: 0, size: 492, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -2993276, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 5, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 6, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+constants:
+body:             |
+  bb.1.entry:
+    successors: %bb.2(0x40000000), %bb.3(0x40000000)
+    liveins: $w0
+
+    renamable $x8 = ADDXri %stack.1.i.addr, 0, 0
+    renamable $w9 = MOVi32imm 2
+    STRWui killed renamable $w0, renamable $x8, 0 :: (store 4 into %ir.i.addr)
+    renamable $w0 = LDRWui renamable $x8, 0 :: (load 4 from %ir.i.addr)
+    renamable $w0 = ADDWri killed renamable $w0, 2, 0
+    STRWui killed renamable $w0, %stack.3.a, 0 :: (store 4 into %ir.a)
+    ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
+    STRXui killed $x8, %stack.5, 0 :: (store 8 into %stack.5)
+    STRWui killed $w9, %stack.6, 0 :: (store 4 into %stack.6)
+    BL @"?func2@@YAHXZ", csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit-def $w0
+    ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
+    $x8 = LDRXui %stack.5, 0 :: (load 8 from %stack.5)
+    renamable $w9 = LDRWui killed renamable $x8, 0 :: (load 4 from %ir.i.addr)
+    $w10 = LDRWui %stack.6, 0 :: (load 4 from %stack.6)
+    $wzr = SUBSWrr killed renamable $w9, killed renamable $w10, implicit-def $nzcv
+    renamable $w9 = CSINCWr $wzr, $wzr, 13, implicit $nzcv
+    TBNZW killed renamable $w9, 0, %bb.2
+    B %bb.3
+
+  bb.2.if.then:
+    successors: %bb.4(0x80000000)
+
+    ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
+    BL @"?func2@@YAHXZ", csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit-def $w0
+    ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
+    $x8 = LDRXui %stack.5, 0 :: (load 8 from %stack.5)
+    STRWui killed renamable $w0, killed renamable $x8, 1 :: (store 4 into %ir.retval)
+    B %bb.4
+
+  bb.3.if.else:
+    successors: %bb.4(0x80000000)
+
+    renamable $x8 = ADDXri %stack.4.B, 0, 0
+    ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
+    $x0 = COPY killed renamable $x8
+    BL @"?func3@@YAHPEAH@Z", csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit killed $x0, implicit-def $w0
+    ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
+    $x8 = LDRXui %stack.5, 0 :: (load 8 from %stack.5)
+    STRWui killed renamable $w0, killed renamable $x8, 1 :: (store 4 into %ir.retval)
+
+  bb.4.return:
+    $x8 = LDRXui %stack.5, 0 :: (load 8 from %stack.5)
+    renamable $w0 = LDRWui killed renamable $x8, 1 :: (load 4 from %ir.retval)
+    RET_ReallyLR implicit killed $w0
+
+...
diff --git a/test/CodeGen/AArch64/wineh-frame8.mir b/test/CodeGen/AArch64/wineh-frame8.mir
new file mode 100644
index 00000000000..6fc7416d6d6
--- /dev/null
+++ b/test/CodeGen/AArch64/wineh-frame8.mir
@@ -0,0 +1,88 @@
+# RUN: llc -o - %s -mtriple=aarch64-windows -start-before=prologepilog \
+# RUN:   -stop-after=prologepilog | FileCheck %s
+# Test that the frame lowering emits correct SEH updates for the case without
+# a stack frame (e.g. no callee saved registers, no frame pointer, just locals)
+
+# CHECK:      $sp = frame-setup SUBXri $sp, 16, 0
+# CHECK-NEXT: frame-setup SEH_StackAlloc 16
+# CHECK-NEXT: frame-setup SEH_PrologEnd
+# CHECK:      frame-destroy SEH_EpilogStart
+# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 16, 0
+# CHECK-NEXT: frame-destroy SEH_StackAlloc 16
+# CHECK-NEXT: frame-destroy SEH_EpilogEnd
+# CHECK-NEXT: RET_ReallyLR implicit killed $w0
+
+--- |
+  target datalayout = "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128"
+  target triple = "aarch64-unknown-windows-msvc19.11.0"
+
+  ; Function Attrs: noinline nounwind optnone uwtable
+  define dso_local i32 @"?func@@YAHH@Z"(i32 %a) #0 {
+  entry:
+    %a.addr = alloca i32, align 4
+    %b = alloca i32, align 4
+    store i32 %a, i32* %a.addr, align 4
+    store i32 2, i32* %b, align 4
+    %0 = load i32, i32* %b, align 4
+    %1 = load i32, i32* %a.addr, align 4
+    %add = add nsw i32 %0, %1
+    ret i32 %add
+  }
+
+  attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+...
+---
+name:            '?func@@YAHH@Z'
+alignment:       2
+exposesReturnsTwice: false
+legalized:       true
+regBankSelected: true
+selected:        true
+failedISel:      false
+tracksRegLiveness: true
+registers:
+liveins:
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    4
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  8
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+  - { id: 0, name: a.addr, type: default, offset: 0, size: 4, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -4, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 1, name: b, type: default, offset: 0, size: 4, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -8, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+constants:
+body:             |
+  bb.1.entry:
+    liveins: $w0
+
+    renamable $w8 = MOVi32imm 2
+    STRWui killed renamable $w0, %stack.0.a.addr, 0 :: (store 4 into %ir.a.addr)
+    STRWui killed renamable $w8, %stack.1.b, 0 :: (store 4 into %ir.b)
+    renamable $w8 = LDRWui %stack.1.b, 0 :: (load 4 from %ir.b)
+    renamable $w0 = LDRWui %stack.0.a.addr, 0 :: (load 4 from %ir.a.addr)
+    renamable $w0 = nsw ADDWrr killed renamable $w8, killed renamable $w0
+    RET_ReallyLR implicit killed $w0
+
+...
-- 
GitLab


From 761dc549d14bb5f53765ef02ac8e00dc3e0a05e1 Mon Sep 17 00:00:00 2001
From: Alexander Potapenko <glider@google.com>
Date: Wed, 31 Oct 2018 09:32:47 +0000
Subject: [PATCH 0788/1116] [MSan] another take at instrumenting inline
 assembly - now with calls

Turns out it's not always possible to figure out whether an asm()
statement argument points to a valid memory region.
One example would be per-CPU objects in the Linux kernel, for which the
addresses are calculated using the FS register and a small offset in the
.data..percpu section.
To avoid pulling all sorts of checks into the instrumentation, we replace
actual checking/unpoisoning code with calls to
msan_instrument_asm_load(ptr, size) and
msan_instrument_asm_store(ptr, size) functions in the runtime.

This patch doesn't implement the runtime hooks in compiler-rt, as there's
been no demand in assembly instrumentation for userspace apps so far.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345702 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../Instrumentation/MemorySanitizer.cpp       | 130 ++++++++--
 .../MemorySanitizer/msan_asm_conservative.ll  | 236 ++++++++++++++++++
 .../MemorySanitizer/msan_x86_bts_asm.ll       |  28 ++-
 3 files changed, 359 insertions(+), 35 deletions(-)
 create mode 100644 test/Instrumentation/MemorySanitizer/msan_asm_conservative.ll

diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 6421b6efac4..960c1f42900 100644
--- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -90,6 +90,24 @@
 /// value. It implements the store part as a simple atomic store by storing a
 /// clean shadow.
 ///
+///                      Instrumenting inline assembly.
+///
+/// For inline assembly code LLVM has little idea about which memory locations
+/// become initialized depending on the arguments. It can be possible to figure
+/// out which arguments are meant to point to inputs and outputs, but the
+/// actual semantics can be only visible at runtime. In the Linux kernel it's
+/// also possible that the arguments only indicate the offset for a base taken
+/// from a segment register, so it's dangerous to treat any asm() arguments as
+/// pointers. We take a conservative approach generating calls to
+///   __msan_instrument_asm_load(ptr, size) and
+///   __msan_instrument_asm_store(ptr, size)
+/// , which defer the memory checking/unpoisoning to the runtime library.
+/// The latter can perform more complex address checks to figure out whether
+/// it's safe to touch the shadow memory.
+/// Like with atomic operations, we call __msan_instrument_asm_store() before
+/// the assembly call, so that changes to the shadow memory will be seen by
+/// other threads together with main memory initialization.
+///
 ///                  KernelMemorySanitizer (KMSAN) implementation.
 ///
 /// The major differences between KMSAN and MSan instrumentation are:
@@ -549,6 +567,7 @@ private:
   Value *MsanMetadataPtrForLoadN, *MsanMetadataPtrForStoreN;
   Value *MsanMetadataPtrForLoad_1_8[4];
   Value *MsanMetadataPtrForStore_1_8[4];
+  Value *MsanInstrumentAsmStoreFn, *MsanInstrumentAsmLoadFn;
 
   /// Helper to choose between different MsanMetadataPtrXxx().
   Value *getKmsanShadowOriginAccessFn(bool isStore, int size);
@@ -757,6 +776,13 @@ void MemorySanitizer::initializeCallbacks(Module &M) {
                             StringRef(""), StringRef(""),
                             /*hasSideEffects=*/true);
 
+  MsanInstrumentAsmLoadFn =
+      M.getOrInsertFunction("__msan_instrument_asm_load", IRB.getVoidTy(),
+                            PointerType::get(IRB.getInt8Ty(), 0), IntptrTy);
+  MsanInstrumentAsmStoreFn =
+      M.getOrInsertFunction("__msan_instrument_asm_store", IRB.getVoidTy(),
+                            PointerType::get(IRB.getInt8Ty(), 0), IntptrTy);
+
   if (CompileKernel) {
     createKernelApi(M);
   } else {
@@ -3444,37 +3470,97 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     // Nothing to do here.
   }
 
+  void instrumentAsmArgument(Value *Operand, Instruction &I, IRBuilder<> &IRB,
+                             const DataLayout &DL, bool isOutput) {
+    // For each assembly argument, we check its value for being initialized.
+    // If the argument is a pointer, we assume it points to a single element
+    // of the corresponding type (or to a 8-byte word, if the type is unsized).
+    // Each such pointer is instrumented with a call to the runtime library.
+    Type *OpType = Operand->getType();
+    // Check the operand value itself.
+    insertShadowCheck(Operand, &I);
+    if (!OpType->isPointerTy()) {
+      assert(!isOutput);
+      return;
+    }
+    Value *Hook =
+        isOutput ? MS.MsanInstrumentAsmStoreFn : MS.MsanInstrumentAsmLoadFn;
+    Type *ElType = OpType->getPointerElementType();
+    if (!ElType->isSized())
+      return;
+    int Size = DL.getTypeStoreSize(ElType);
+    Value *Ptr = IRB.CreatePointerCast(Operand, IRB.getInt8PtrTy());
+    Value *SizeVal = ConstantInt::get(MS.IntptrTy, Size);
+    IRB.CreateCall(Hook, {Ptr, SizeVal});
+  }
+
+  /// Get the number of output arguments returned by pointers.
+  int getNumOutputArgs(InlineAsm *IA, CallInst *CI) {
+    int NumRetOutputs = 0;
+    int NumOutputs = 0;
+    Type *RetTy = dyn_cast<Value>(CI)->getType();
+    if (!RetTy->isVoidTy()) {
+      // Register outputs are returned via the CallInst return value.
+      StructType *ST = dyn_cast_or_null<StructType>(RetTy);
+      if (ST)
+        NumRetOutputs = ST->getNumElements();
+      else
+        NumRetOutputs = 1;
+    }
+    InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
+    for (size_t i = 0, n = Constraints.size(); i < n; i++) {
+      InlineAsm::ConstraintInfo Info = Constraints[i];
+      switch (Info.Type) {
+      case InlineAsm::isOutput:
+        NumOutputs++;
+        break;
+      default:
+        break;
+      }
+    }
+    return NumOutputs - NumRetOutputs;
+  }
+
   void visitAsmInstruction(Instruction &I) {
     // Conservative inline assembly handling: check for poisoned shadow of
     // asm() arguments, then unpoison the result and all the memory locations
     // pointed to by those arguments.
+    // An inline asm() statement in C++ contains lists of input and output
+    // arguments used by the assembly code. These are mapped to operands of the
+    // CallInst as follows:
+    //  - nR register outputs ("=r) are returned by value in a single structure
+    //  (SSA value of the CallInst);
+    //  - nO other outputs ("=m" and others) are returned by pointer as first
+    // nO operands of the CallInst;
+    //  - nI inputs ("r", "m" and others) are passed to CallInst as the
+    // remaining nI operands.
+    // The total number of asm() arguments in the source is nR+nO+nI, and the
+    // corresponding CallInst has nO+nI+1 operands (the last operand is the
+    // function to be called).
+    const DataLayout &DL = F.getParent()->getDataLayout();
     CallInst *CI = dyn_cast<CallInst>(&I);
-
-    for (size_t i = 0, n = CI->getNumOperands(); i < n; i++) {
+    IRBuilder<> IRB(&I);
+    InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
+    int OutputArgs = getNumOutputArgs(IA, CI);
+    // The last operand of a CallInst is the function itself.
+    int NumOperands = CI->getNumOperands() - 1;
+
+    // Check input arguments. Doing so before unpoisoning output arguments, so
+    // that we won't overwrite uninit values before checking them.
+    for (int i = OutputArgs; i < NumOperands; i++) {
       Value *Operand = CI->getOperand(i);
-      if (Operand->getType()->isSized())
-        insertShadowCheck(Operand, &I);
+      instrumentAsmArgument(Operand, I, IRB, DL, /*isOutput*/ false);
     }
-    setShadow(&I, getCleanShadow(&I));
-    setOrigin(&I, getCleanOrigin());
-    IRBuilder<> IRB(&I);
-    IRB.SetInsertPoint(I.getNextNode());
-    for (size_t i = 0, n = CI->getNumOperands(); i < n; i++) {
+    // Unpoison output arguments. This must happen before the actual InlineAsm
+    // call, so that the shadow for memory published in the asm() statement
+    // remains valid.
+    for (int i = 0; i < OutputArgs; i++) {
       Value *Operand = CI->getOperand(i);
-      Type *OpType = Operand->getType();
-      if (!OpType->isPointerTy())
-        continue;
-      Type *ElType = OpType->getPointerElementType();
-      if (!ElType->isSized())
-        continue;
-      Value *ShadowPtr, *OriginPtr;
-      std::tie(ShadowPtr, OriginPtr) = getShadowOriginPtr(
-          Operand, IRB, ElType, /*Alignment*/ 1, /*isStore*/ true);
-      Value *CShadow = getCleanShadow(ElType);
-      IRB.CreateStore(
-          CShadow,
-          IRB.CreatePointerCast(ShadowPtr, CShadow->getType()->getPointerTo()));
+      instrumentAsmArgument(Operand, I, IRB, DL, /*isOutput*/ true);
     }
+
+    setShadow(&I, getCleanShadow(&I));
+    setOrigin(&I, getCleanOrigin());
   }
 
   void visitInstruction(Instruction &I) {
diff --git a/test/Instrumentation/MemorySanitizer/msan_asm_conservative.ll b/test/Instrumentation/MemorySanitizer/msan_asm_conservative.ll
new file mode 100644
index 00000000000..4dcca26fd0b
--- /dev/null
+++ b/test/Instrumentation/MemorySanitizer/msan_asm_conservative.ll
@@ -0,0 +1,236 @@
+; Test for handling of asm constraints in MSan instrumentation.
+; RUN: opt < %s -msan -msan-check-access-address=0 -msan-handle-asm-conservative=0 -S | FileCheck -check-prefixes=CHECK,CHECK-NONCONS %s
+; RUN: opt < %s -msan -msan-check-access-address=0 -msan-handle-asm-conservative=1 -S | FileCheck -check-prefixes=CHECK,CHECK-CONS %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.pair = type { i32, i32 }
+
+@id1 = common dso_local global i32 0, align 4
+@is1 = common dso_local global i32 0, align 4
+@id2 = common dso_local global i32 0, align 4
+@is2 = common dso_local global i32 0, align 4
+@id3 = common dso_local global i32 0, align 4
+@pair2 = common dso_local global %struct.pair zeroinitializer, align 4
+@pair1 = common dso_local global %struct.pair zeroinitializer, align 4
+@c2 = common dso_local global i8 0, align 1
+@c1 = common dso_local global i8 0, align 1
+@memcpy_d1 = common dso_local global i8* (i8*, i8*, i32)* null, align 8
+@memcpy_d2 = common dso_local global i8* (i8*, i8*, i32)* null, align 8
+@memcpy_s1 = common dso_local global i8* (i8*, i8*, i32)* null, align 8
+@memcpy_s2 = common dso_local global i8* (i8*, i8*, i32)* null, align 8
+
+; The functions below were generated from a C source that contains declarations like follows:
+;   void f1() {
+;     asm("" : "=r" (id1) : "r" (is1));
+;   }
+; with corresponding input/output constraints.
+; Note that the assembly statement is always empty, as MSan doesn't look at it anyway.
+
+; One input register, one output register:
+;   asm("" : "=r" (id1) : "r" (is1));
+define dso_local void @f_1i_1o_reg() sanitize_memory {
+entry:
+  %0 = load i32, i32* @is1, align 4
+  %1 = call i32 asm "", "=r,r,~{dirflag},~{fpsr},~{flags}"(i32 %0)
+  store i32 %1, i32* @id1, align 4
+  ret void
+}
+
+; CHECK-LABEL: @f_1i_1o_reg
+; CHECK: [[IS1_F1:%.*]] = load i32, i32* @is1, align 4
+; CHECK: call void @__msan_warning_noreturn()
+; CHECK: call i32 asm "",{{.*}}(i32 [[IS1_F1]])
+; CHECK: store i32 0,{{.*}}ptrtoint (i32* @id1 to i64)
+
+
+; Two input registers, two output registers:
+;   asm("" : "=r" (id1), "=r" (id2) : "r" (is1), "r"(is2));
+define dso_local void @f_2i_2o_reg() sanitize_memory {
+entry:
+  %0 = load i32, i32* @is1, align 4
+  %1 = load i32, i32* @is2, align 4
+  %2 = call { i32, i32 } asm "", "=r,=r,r,r,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1)
+  %asmresult = extractvalue { i32, i32 } %2, 0
+  %asmresult1 = extractvalue { i32, i32 } %2, 1
+  store i32 %asmresult, i32* @id1, align 4
+  store i32 %asmresult1, i32* @id2, align 4
+  ret void
+}
+
+; CHECK-LABEL: @f_2i_2o_reg
+; CHECK: [[IS1_F2:%.*]] = load i32, i32* @is1, align 4
+; CHECK: [[IS2_F2:%.*]] = load i32, i32* @is2, align 4
+; CHECK: call void @__msan_warning_noreturn()
+; CHECK: call void @__msan_warning_noreturn()
+; CHECK: call { i32, i32 } asm "",{{.*}}(i32 [[IS1_F2]], i32 [[IS2_F2]])
+; CHECK: store i32 0,{{.*}}ptrtoint (i32* @id1 to i64)
+; CHECK: store i32 0,{{.*}}ptrtoint (i32* @id2 to i64)
+
+; Input same as output, used twice:
+;   asm("" : "=r" (id1), "=r" (id2) : "r" (id1), "r" (id2));
+define dso_local void @f_2i_2o_reuse2_reg() sanitize_memory {
+entry:
+  %0 = load i32, i32* @id1, align 4
+  %1 = load i32, i32* @id2, align 4
+  %2 = call { i32, i32 } asm "", "=r,=r,r,r,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1)
+  %asmresult = extractvalue { i32, i32 } %2, 0
+  %asmresult1 = extractvalue { i32, i32 } %2, 1
+  store i32 %asmresult, i32* @id1, align 4
+  store i32 %asmresult1, i32* @id2, align 4
+  ret void
+}
+
+; CHECK-LABEL: @f_2i_2o_reuse2_reg
+; CHECK: [[ID1_F3:%.*]] = load i32, i32* @id1, align 4
+; CHECK: [[ID2_F3:%.*]] = load i32, i32* @id2, align 4
+; CHECK: call void @__msan_warning_noreturn()
+; CHECK: call void @__msan_warning_noreturn()
+; CHECK: call { i32, i32 } asm "",{{.*}}(i32 [[ID1_F3]], i32 [[ID2_F3]])
+; CHECK: store i32 0,{{.*}}ptrtoint (i32* @id1 to i64)
+; CHECK: store i32 0,{{.*}}ptrtoint (i32* @id2 to i64)
+
+
+; One of the input registers is also an output:
+;   asm("" : "=r" (id1), "=r" (id2) : "r" (id1), "r"(is1));
+define dso_local void @f_2i_2o_reuse1_reg() sanitize_memory {
+entry:
+  %0 = load i32, i32* @id1, align 4
+  %1 = load i32, i32* @is1, align 4
+  %2 = call { i32, i32 } asm "", "=r,=r,r,r,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1)
+  %asmresult = extractvalue { i32, i32 } %2, 0
+  %asmresult1 = extractvalue { i32, i32 } %2, 1
+  store i32 %asmresult, i32* @id1, align 4
+  store i32 %asmresult1, i32* @id2, align 4
+  ret void
+}
+
+; CHECK-LABEL: @f_2i_2o_reuse1_reg
+; CHECK: [[ID1_F4:%.*]] = load i32, i32* @id1, align 4
+; CHECK: [[IS1_F4:%.*]] = load i32, i32* @is1, align 4
+; CHECK: call void @__msan_warning_noreturn()
+; CHECK: call void @__msan_warning_noreturn()
+; CHECK: call { i32, i32 } asm "",{{.*}}(i32 [[ID1_F4]], i32 [[IS1_F4]])
+; CHECK: store i32 0,{{.*}}ptrtoint (i32* @id1 to i64)
+; CHECK: store i32 0,{{.*}}ptrtoint (i32* @id2 to i64)
+
+
+; One input register, three output registers:
+;   asm("" : "=r" (id1), "=r" (id2), "=r" (id3) : "r" (is1));
+define dso_local void @f_1i_3o_reg() sanitize_memory {
+entry:
+  %0 = load i32, i32* @is1, align 4
+  %1 = call { i32, i32, i32 } asm "", "=r,=r,=r,r,~{dirflag},~{fpsr},~{flags}"(i32 %0)
+  %asmresult = extractvalue { i32, i32, i32 } %1, 0
+  %asmresult1 = extractvalue { i32, i32, i32 } %1, 1
+  %asmresult2 = extractvalue { i32, i32, i32 } %1, 2
+  store i32 %asmresult, i32* @id1, align 4
+  store i32 %asmresult1, i32* @id2, align 4
+  store i32 %asmresult2, i32* @id3, align 4
+  ret void
+}
+
+; CHECK-LABEL: @f_1i_3o_reg
+; CHECK: [[IS1_F5:%.*]] = load i32, i32* @is1, align 4
+; CHECK: call void @__msan_warning_noreturn()
+; CHECK: call { i32, i32, i32 } asm "",{{.*}}(i32 [[IS1_F5]])
+; CHECK: store i32 0,{{.*}}ptrtoint (i32* @id1 to i64)
+; CHECK: store i32 0,{{.*}}ptrtoint (i32* @id2 to i64)
+; CHECK: store i32 0,{{.*}}ptrtoint (i32* @id3 to i64)
+
+
+; 2 input memory args, 2 output memory args:
+;  asm("" : "=m" (id1), "=m" (id2) : "m" (is1), "m"(is2))
+define dso_local void @f_2i_2o_mem() sanitize_memory {
+entry:
+  call void asm "", "=*m,=*m,*m,*m,~{dirflag},~{fpsr},~{flags}"(i32* @id1, i32* @id2, i32* @is1, i32* @is2)
+  ret void
+}
+
+; CHECK-LABEL: @f_2i_2o_mem
+; CHECK-CONS: call void @__msan_instrument_asm_load({{.*}}@is1{{.*}}, i64 4)
+; CHECK-CONS: call void @__msan_instrument_asm_load({{.*}}@is2{{.*}}, i64 4)
+; CHECK-CONS: call void @__msan_instrument_asm_store({{.*}}@id1{{.*}}, i64 4)
+; CHECK-CONS: call void @__msan_instrument_asm_store({{.*}}@id2{{.*}}, i64 4)
+; CHECK: call void asm "", "=*m,=*m,*m,*m,~{dirflag},~{fpsr},~{flags}"(i32* @id1, i32* @id2, i32* @is1, i32* @is2)
+
+
+; Same input and output passed as both memory and register:
+;  asm("" : "=r" (id1), "=m"(id1) : "r"(is1), "m"(is1));
+define dso_local void @f_1i_1o_memreg() sanitize_memory {
+entry:
+  %0 = load i32, i32* @is1, align 4
+  %1 = call i32 asm "", "=r,=*m,r,*m,~{dirflag},~{fpsr},~{flags}"(i32* @id1, i32 %0, i32* @is1)
+  store i32 %1, i32* @id1, align 4
+  ret void
+}
+
+; CHECK-LABEL: @f_1i_1o_memreg
+; CHECK: [[IS1_F7:%.*]] = load i32, i32* @is1, align 4
+; CHECK-CONS: call void @__msan_instrument_asm_load({{.*}}@is1{{.*}}, i64 4)
+; CHECK-CONS: call void @__msan_instrument_asm_store({{.*}}@id1{{.*}}, i64 4)
+; CHECK: call void @__msan_warning_noreturn()
+; CHECK: call i32 asm "", "=r,=*m,r,*m,~{dirflag},~{fpsr},~{flags}"(i32* @id1, i32 [[IS1_F7]], i32* @is1)
+
+
+; Three outputs, first and last returned via regs, second via mem:
+;  asm("" : "=r" (id1), "=m"(id2), "=r" (id3):);
+define dso_local void @f_3o_reg_mem_reg() sanitize_memory {
+entry:
+  %0 = call { i32, i32 } asm "", "=r,=*m,=r,~{dirflag},~{fpsr},~{flags}"(i32* @id2)
+  %asmresult = extractvalue { i32, i32 } %0, 0
+  %asmresult1 = extractvalue { i32, i32 } %0, 1
+  store i32 %asmresult, i32* @id1, align 4
+  store i32 %asmresult1, i32* @id3, align 4
+  ret void
+}
+
+; CHECK-LABEL: @f_3o_reg_mem_reg
+; CHECK-CONS: call void @__msan_instrument_asm_store({{.*}}@id2{{.*}}), i64 4)
+; CHECK: call { i32, i32 } asm "", "=r,=*m,=r,~{dirflag},~{fpsr},~{flags}"(i32* @id2)
+
+
+; Three inputs and three outputs of different types: a pair, a char, a function pointer.
+; Everything is meant to be passed in registers, but LLVM chooses to return the integer pair by pointer:
+;  asm("" : "=r" (pair2), "=r" (c2), "=r" (memcpy_d1) : "r"(pair1), "r"(c1), "r"(memcpy_s1));
+define dso_local void @f_3i_3o_complex_reg() sanitize_memory {
+entry:
+  %0 = load i64, i64* bitcast (%struct.pair* @pair1 to i64*), align 4
+  %1 = load i8, i8* @c1, align 1
+  %2 = load i8* (i8*, i8*, i32)*, i8* (i8*, i8*, i32)** @memcpy_s1, align 8
+  %3 = call { i8, i8* (i8*, i8*, i32)* } asm "", "=*r,=r,=r,r,r,r,~{dirflag},~{fpsr},~{flags}"(%struct.pair* @pair2, i64 %0, i8 %1, i8* (i8*, i8*, i32)* %2)
+  %asmresult = extractvalue { i8, i8* (i8*, i8*, i32)* } %3, 0
+  %asmresult1 = extractvalue { i8, i8* (i8*, i8*, i32)* } %3, 1
+  store i8 %asmresult, i8* @c2, align 1
+  store i8* (i8*, i8*, i32)* %asmresult1, i8* (i8*, i8*, i32)** @memcpy_d1, align 8
+  ret void
+}
+
+; CHECK-LABEL: @f_3i_3o_complex_reg
+; CHECK: [[PAIR1_F9:%.*]] = load {{.*}} @pair1
+; CHECK: [[C1_F9:%.*]] = load {{.*}} @c1
+; CHECK: [[MEMCPY_S1_F9:%.*]] = load {{.*}} @memcpy_s1
+; CHECK-CONS: call void @__msan_instrument_asm_store({{.*}}@pair2{{.*}}, i64 8)
+; CHECK: call void @__msan_warning_noreturn()
+; CHECK: call void @__msan_warning_noreturn()
+; CHECK: call void @__msan_warning_noreturn()
+; CHECK: call { i8, i8* (i8*, i8*, i32)* } asm "", "=*r,=r,=r,r,r,r,~{dirflag},~{fpsr},~{flags}"(%struct.pair* @pair2, {{.*}}[[PAIR1_F9]], i8 [[C1_F9]], {{.*}} [[MEMCPY_S1_F9]])
+
+; Three inputs and three outputs of different types: a pair, a char, a function pointer.
+; Everything is passed in memory:
+;  asm("" : "=m" (pair2), "=m" (c2), "=m" (memcpy_d1) : "m"(pair1), "m"(c1), "m"(memcpy_s1));
+define dso_local void @f_3i_3o_complex_mem() sanitize_memory {
+entry:
+  call void asm "", "=*m,=*m,=*m,*m,*m,*m,~{dirflag},~{fpsr},~{flags}"(%struct.pair* @pair2, i8* @c2, i8* (i8*, i8*, i32)** @memcpy_d1, %struct.pair* @pair1, i8* @c1, i8* (i8*, i8*, i32)** @memcpy_s1)
+  ret void
+}
+
+; CHECK-LABEL: @f_3i_3o_complex_mem
+; CHECK-CONS: call void @__msan_instrument_asm_load({{.*}}@pair1{{.*}}, i64 8)
+; CHECK-CONS: call void @__msan_instrument_asm_load(i8* @c1, i64 1)
+; CHECK-CONS: call void @__msan_instrument_asm_load({{.*}}@memcpy_s1{{.*}}, i64 8)
+; CHECK-CONS: call void @__msan_instrument_asm_store({{.*}}@pair2{{.*}}, i64 8)
+; CHECK-CONS: call void @__msan_instrument_asm_store({{.*}}@c2{{.*}}, i64 1)
+; CHECK-CONS: call void @__msan_instrument_asm_store({{.*}}@memcpy_d1{{.*}}, i64 8)
+; CHECK: call void asm "", "=*m,=*m,=*m,*m,*m,*m,~{dirflag},~{fpsr},~{flags}"(%struct.pair* @pair2, i8* @c2, i8* (i8*, i8*, i32)** @memcpy_d1, %struct.pair* @pair1, i8* @c1, i8* (i8*, i8*, i32)** @memcpy_s1)
diff --git a/test/Instrumentation/MemorySanitizer/msan_x86_bts_asm.ll b/test/Instrumentation/MemorySanitizer/msan_x86_bts_asm.ll
index 0b9f455f1d0..7240e1086da 100644
--- a/test/Instrumentation/MemorySanitizer/msan_x86_bts_asm.ll
+++ b/test/Instrumentation/MemorySanitizer/msan_x86_bts_asm.ll
@@ -13,7 +13,7 @@ target triple = "x86_64-unknown-linux-gnu"
 ;    unsigned long *addr = &value;
 ;    asm("btsq %2, %1; setc %0" : "=qm" (bit), "=m" (addr): "Ir" (nr));
 ;    if (bit)
-;      return 0
+;      return 0;
 ;    else
 ;      return 1;
 ;  }
@@ -52,25 +52,27 @@ if.else:                                          ; preds = %entry
   ret i32 1
 }
 
-; Start with the asm call
+; Hooks for inputs usually go before the assembly statement. But here we have none,
+; because %nr is passed by value. However we check %nr for being initialized.
+; CHECK-CONS: [[NRC:%.*]] = ptrtoint i64* %nr to i64
+
+; In the conservative mode, call the store hooks for %bit and %addr:
+; CHECK-CONS: call void @__msan_instrument_asm_store(i8* %bit, i64 1)
+; CHECK-CONS: [[ADDR8S:%.*]] = bitcast i64** %addr to i8*
+; CHECK-CONS: call void @__msan_instrument_asm_store(i8* [[ADDR8S]], i64 8)
+
+; Landing pad for the %nr check above.
+; CHECK-CONS: call void @__msan_warning_noreturn()
+
 ; CHECK: call void asm "btsq $2, $1; setc $0"
 
 ; Calculating the shadow offset of %bit.
 ; CHECK: [[PTR:%.*]] = ptrtoint {{.*}} %bit to i64
-; CHECK: [[SH_NUM:%.*]] = xor i64 [[PTR]], [[OFF:[0-9]*]]
+; CHECK: [[SH_NUM:%.*]] = xor i64 [[PTR]]
 ; CHECK: [[SHADOW:%.*]] = inttoptr i64 [[SH_NUM]] {{.*}}
 
-; In the conservative mode, unpoison the shadow.
-; CHECK-CONS: store i8 0, i8* [[SHADOW]]
-; Now calculate the shadow address again, because MSan does this for every
-; shadow access.
-; CHECK-CONS: [[PTR2:%.*]] = ptrtoint {{.*}} %bit to i64
-; CHECK-CONS: [[SH_NUM2:%.*]] = xor i64 [[PTR2]], [[OFF]]
-; CHECK-CONS: [[SHADOW2:%.*]] = inttoptr i64 [[SH_NUM2]] {{.*}}
-
 ; Now load the shadow value for the boolean.
-; CHECK-NONCONS: [[MSLD:%.*]] = load {{.*}} [[SHADOW]]
-; CHECK-CONS: [[MSLD:%.*]] = load {{.*}} [[SHADOW2]]
+; CHECK: [[MSLD:%.*]] = load {{.*}} [[SHADOW]]
 ; CHECK: [[MSPROP:%.*]] = trunc i8 [[MSLD]] to i1
 
 ; Is the shadow poisoned?
-- 
GitLab


From bb1b895ac0603485c5657967abb52016bd22905a Mon Sep 17 00:00:00 2001
From: Kristina Brooks <kristina@nym.hush.com>
Date: Wed, 31 Oct 2018 09:34:08 +0000
Subject: [PATCH 0789/1116] [llvm-objdump] Add --reloc alias for -r (PR39407)

This addresses PR39407 (https://bugs.llvm.org/show_bug.cgi?id=39407)
improving compatibility with GNU binutils counterparts.

Reviewed By: kristina

Patch by Higuoxing (Xing).

Differential Revision: https://reviews.llvm.org/D53804

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345703 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/tools/llvm-objdump/relocations-elf.test | 73 ++++++++++++++++++++
 tools/llvm-objdump/llvm-objdump.cpp          |  6 +-
 2 files changed, 78 insertions(+), 1 deletion(-)
 create mode 100644 test/tools/llvm-objdump/relocations-elf.test

diff --git a/test/tools/llvm-objdump/relocations-elf.test b/test/tools/llvm-objdump/relocations-elf.test
new file mode 100644
index 00000000000..a29b3e6a6fb
--- /dev/null
+++ b/test/tools/llvm-objdump/relocations-elf.test
@@ -0,0 +1,73 @@
+# RUN: yaml2obj %s > %t
+# RUN: llvm-objdump --reloc %t | FileCheck %s
+# RUN: llvm-objdump -r      %t | FileCheck %s
+
+# CHECK: RELOCATION RECORDS FOR [.rel.text]:
+# CHECK: 0000000000000001 R_X86_64_32 glob1
+# CHECK: 0000000000000001 R_X86_64_32S glob2
+# CHECK: 0000000000000002 R_X86_64_64 loc1
+
+# CHECK: RELOCATION RECORDS FOR [.rela.text]:
+# CHECK: 0000000000000001 R_X86_64_32 glob1+1
+# CHECK: 0000000000000001 R_X86_64_32S glob2+2
+# CHECK: 0000000000000002 R_X86_64_64 loc1+3
+
+!ELF
+FileHeader: !FileHeader
+  Class: ELFCLASS64
+  Data: ELFDATA2LSB
+  Type: ET_REL
+  Machine: EM_X86_64
+
+Sections:
+- Name: .text
+  Type: SHT_PROGBITS
+  Content: "0000000000000000"
+  AddressAlign: 16
+  Flags: [SHF_ALLOC]
+
+- Name: .rel.text
+  Type: SHT_REL
+  Info: .text
+  AddressAlign: 4
+  Relocations:
+    - Offset: 0x1
+      Symbol: glob1
+      Type: R_X86_64_32
+    - Offset: 0x1
+      Symbol: glob2
+      Type: R_X86_64_32S
+    - Offset: 0x2
+      Symbol: loc1
+      Type: R_X86_64_64
+
+- Name: .rela.text
+  Type: SHT_RELA
+  Link: .symtab
+  Info: .text
+  AddressAlign: 4
+  Relocations:
+    - Offset: 0x1
+      Addend: 1
+      Symbol: glob1
+      Type: R_X86_64_32
+    - Offset: 0x1
+      Addend: 2
+      Symbol: glob2
+      Type: R_X86_64_32S
+    - Offset: 0x2
+      Addend: 3
+      Symbol: loc1
+      Type: R_X86_64_64
+
+Symbols:
+  Local:
+    - Name: loc1
+    - Name: loc2
+  Global:
+    - Name: glob1
+      Section: .text
+      Value: 0x0
+      Size: 4
+    - Name: glob2
+
diff --git a/tools/llvm-objdump/llvm-objdump.cpp b/tools/llvm-objdump/llvm-objdump.cpp
index 4ad29d2143f..18fcfef514e 100644
--- a/tools/llvm-objdump/llvm-objdump.cpp
+++ b/tools/llvm-objdump/llvm-objdump.cpp
@@ -106,7 +106,11 @@ DisassembleFunctions("df",
 static StringSet<> DisasmFuncsSet;
 
 cl::opt<bool>
-llvm::Relocations("r", cl::desc("Display the relocation entries in the file"));
+llvm::Relocations("reloc",
+                  cl::desc("Display the relocation entries in the file"));
+static cl::alias RelocationsShort("r", cl::desc("Alias for --reloc"),
+                                  cl::NotHidden,
+                                  cl::aliasopt(llvm::Relocations));
 
 cl::opt<bool>
 llvm::DynamicRelocations("dynamic-reloc",
-- 
GitLab


From a3cc6ce0cb753863efd3b169db16fbc28c78f330 Mon Sep 17 00:00:00 2001
From: Kristina Brooks <kristina@nym.hush.com>
Date: Wed, 31 Oct 2018 09:35:25 +0000
Subject: [PATCH 0790/1116] [llvm-objdump] Mark syms/t flags as NotHidden. NFC.

Slight improvement to help output of llvm-objdump that exposes the
shorter -t flag for -syms instead of it being hidden away.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345704 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-objdump/llvm-objdump.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/llvm-objdump/llvm-objdump.cpp b/tools/llvm-objdump/llvm-objdump.cpp
index 18fcfef514e..671e8a2c4f7 100644
--- a/tools/llvm-objdump/llvm-objdump.cpp
+++ b/tools/llvm-objdump/llvm-objdump.cpp
@@ -128,6 +128,7 @@ static cl::alias SectionContentsShort("s",
 
 cl::opt<bool> llvm::SymbolTable("syms", cl::desc("Display the symbol table"));
 static cl::alias SymbolTableShort("t", cl::desc("Alias for --syms"),
+                                  cl::NotHidden,
                                   cl::aliasopt(llvm::SymbolTable));
 
 cl::opt<bool>
-- 
GitLab


From 06bac6c858af2917bd8c9eeb3763042f12906d79 Mon Sep 17 00:00:00 2001
From: Dorit Nuzman <dorit.nuzman@intel.com>
Date: Wed, 31 Oct 2018 09:57:56 +0000
Subject: [PATCH 0791/1116] [LV] Support vectorization of interleave-groups
 that require an epilog under optsize using masked wide loads

Under Opt for Size, the vectorizer does not vectorize interleave-groups that
have gaps at the end of the group (such as a loop that reads only the even
elements: a[2*i]) because that implies that we'll require a scalar epilogue
(which is not allowed under Opt for Size). This patch extends the support for
masked-interleave-groups (introduced by D53011 for conditional accesses) to
also cover the case of gaps in a group of loads; Targets that enable the
masked-interleave-group feature don't have to invalidate interleave-groups of
loads with gaps; they could now use masked wide-loads and shuffles (if that's
what the cost model selects).

Reviewers: Ayal, hsaito, dcaballe, fhahn

Reviewed By: Ayal

Differential Revision: https://reviews.llvm.org/D53668


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345705 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/TargetTransformInfo.h   |  20 +-
 .../llvm/Analysis/TargetTransformInfoImpl.h   |   3 +-
 include/llvm/Analysis/VectorUtils.h           |  19 +-
 include/llvm/CodeGen/BasicTTIImpl.h           |  16 +-
 lib/Analysis/TargetTransformInfo.cpp          |   9 +-
 lib/Analysis/VectorUtils.cpp                  |  24 +-
 .../AArch64/AArch64TargetTransformInfo.cpp    |   9 +-
 .../AArch64/AArch64TargetTransformInfo.h      |   4 +-
 lib/Target/ARM/ARMTargetTransformInfo.cpp     |   8 +-
 lib/Target/ARM/ARMTargetTransformInfo.h       |   4 +-
 .../Hexagon/HexagonTargetTransformInfo.cpp    |   8 +-
 .../Hexagon/HexagonTargetTransformInfo.h      |   3 +-
 lib/Target/PowerPC/PPCTargetTransformInfo.cpp |   8 +-
 lib/Target/PowerPC/PPCTargetTransformInfo.h   |   3 +-
 .../SystemZ/SystemZTargetTransformInfo.cpp    |   8 +-
 .../SystemZ/SystemZTargetTransformInfo.h      |   4 +-
 lib/Target/X86/X86TargetTransformInfo.cpp     |  28 +-
 lib/Target/X86/X86TargetTransformInfo.h       |   9 +-
 lib/Transforms/Vectorize/LoopVectorize.cpp    | 114 +++++--
 .../x86-interleaved-accesses-masked-group.ll  | 305 ++++++++++++++----
 20 files changed, 453 insertions(+), 153 deletions(-)

diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h
index 3edbe9fb7c5..eb0e0270157 100644
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h
@@ -588,7 +588,8 @@ public:
   bool enableInterleavedAccessVectorization() const;
 
   /// Enable matching of interleaved access groups that contain predicated 
-  /// accesses and are vectorized using masked vector loads/stores.
+  /// accesses or gaps and therefore vectorized using masked
+  /// vector loads/stores.
   bool enableMaskedInterleavedAccessVectorization() const;
 
   /// Indicate that it is potentially unsafe to automatically vectorize
@@ -827,11 +828,13 @@ public:
   ///    load allows gaps)
   /// \p Alignment is the alignment of the memory operation
   /// \p AddressSpace is address space of the pointer.
-  /// \p IsMasked indicates if the memory access is predicated.
+  /// \p UseMaskForCond indicates if the memory access is predicated.
+  /// \p UseMaskForGaps indicates if gaps should be masked.
   int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
                                  ArrayRef<unsigned> Indices, unsigned Alignment,
-                                 unsigned AddressSpace, 
-                                 bool IsMasked = false) const;
+                                 unsigned AddressSpace,
+                                 bool UseMaskForCond = false,
+                                 bool UseMaskForGaps = false) const;
 
   /// Calculate the cost of performing a vector reduction.
   ///
@@ -1142,7 +1145,8 @@ public:
                                          ArrayRef<unsigned> Indices,
                                          unsigned Alignment,
                                          unsigned AddressSpace,
-                                         bool IsMasked = false) = 0;
+                                         bool UseMaskForCond = false,
+                                         bool UseMaskForGaps = false) = 0;
   virtual int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
                                          bool IsPairwiseForm) = 0;
   virtual int getMinMaxReductionCost(Type *Ty, Type *CondTy,
@@ -1484,9 +1488,11 @@ public:
   }
   int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
                                  ArrayRef<unsigned> Indices, unsigned Alignment,
-                                 unsigned AddressSpace, bool IsMasked) override {
+                                 unsigned AddressSpace, bool UseMaskForCond,
+                                 bool UseMaskForGaps) override {
     return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                           Alignment, AddressSpace, IsMasked);
+                                           Alignment, AddressSpace,
+                                           UseMaskForCond, UseMaskForGaps);
   }
   int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
                                  bool IsPairwiseForm) override {
diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h
index c64d4d36805..05e93099e12 100644
--- a/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -453,7 +453,8 @@ public:
                                       unsigned Factor,
                                       ArrayRef<unsigned> Indices,
                                       unsigned Alignment, unsigned AddressSpace,
-                                      bool IsMasked = false) {
+                                      bool UseMaskForCond = false,
+                                      bool UseMaskForGaps = false) {
     return 1;
   }
 
diff --git a/include/llvm/Analysis/VectorUtils.h b/include/llvm/Analysis/VectorUtils.h
index 85d1a01e315..797260f439a 100644
--- a/include/llvm/Analysis/VectorUtils.h
+++ b/include/llvm/Analysis/VectorUtils.h
@@ -24,6 +24,7 @@ namespace llvm {
 template <typename T> class ArrayRef;
 class DemandedBits;
 class GetElementPtrInst;
+class InterleaveGroup; 
 class Loop;
 class ScalarEvolution;
 class TargetTransformInfo;
@@ -125,6 +126,20 @@ computeMinimumValueSizes(ArrayRef<BasicBlock*> Blocks,
 /// This function always sets a (possibly null) value for each K in Kinds.
 Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL);
 
+/// Create a mask that filters the members of an interleave group where there
+/// are gaps.
+///
+/// For example, the mask for \p Group with interleave-factor 3
+/// and \p VF 4, that has only its first member present is:
+///
+///   <1,0,0,1,0,0,1,0,0,1,0,0>
+///
+/// Note: The result is a mask of 0's and 1's, as opposed to the other
+/// create[*]Mask() utilities which create a shuffle mask (mask that
+/// consists of indices).
+Constant *createBitMaskForGaps(IRBuilder<> &Builder, unsigned VF,
+                               const InterleaveGroup &Group);
+
 /// Create a mask with replicated elements.
 ///
 /// This function creates a shuffle mask for replicating each of the \p VF 
@@ -406,8 +421,8 @@ public:
   bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }
 
   /// Invalidate groups that require a scalar epilogue (due to gaps). This can
-  /// happen when we optimize for size and don't allow creating a scalar
-  /// epilogue.
+  /// happen when optimizing for size forbids a scalar epilogue, and the gap
+  /// cannot be filtered by masking the load/store.
   void invalidateGroupsRequiringScalarEpilogue();
 
 private:
diff --git a/include/llvm/CodeGen/BasicTTIImpl.h b/include/llvm/CodeGen/BasicTTIImpl.h
index daf5fbfef8e..224a41bc2b7 100644
--- a/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/include/llvm/CodeGen/BasicTTIImpl.h
@@ -804,7 +804,8 @@ public:
                                       unsigned Factor,
                                       ArrayRef<unsigned> Indices,
                                       unsigned Alignment, unsigned AddressSpace,
-                                      bool IsMasked = false) {
+                                      bool UseMaskForCond = false,
+                                      bool UseMaskForGaps = false) {
     VectorType *VT = dyn_cast<VectorType>(VecTy);
     assert(VT && "Expect a vector type for interleaved memory op");
 
@@ -816,7 +817,7 @@ public:
 
     // Firstly, the cost of load/store operation.
     unsigned Cost;
-    if (IsMasked)
+    if (UseMaskForCond || UseMaskForGaps)
       Cost = static_cast<T *>(this)->getMaskedMemoryOpCost(
           Opcode, VecTy, Alignment, AddressSpace);
     else
@@ -917,7 +918,7 @@ public:
                     ->getVectorInstrCost(Instruction::InsertElement, VT, i);
     }
 
-    if (!IsMasked)
+    if (!UseMaskForCond)
       return Cost;
 
     Type *I8Type = Type::getInt8Ty(VT->getContext());
@@ -942,6 +943,15 @@ public:
       Cost += static_cast<T *>(this)->getVectorInstrCost(
           Instruction::InsertElement, MaskVT, i);
 
+    // The Gaps mask is invariant and created outside the loop, therefore the
+    // cost of creating it is not accounted for here. However if we have both
+    // a MaskForGaps and some other mask that guards the execution of the
+    // memory access, we need to account for the cost of And-ing the two masks
+    // inside the loop.
+    if (UseMaskForGaps)
+      Cost += static_cast<T *>(this)->getArithmeticInstrCost(
+          BinaryOperator::And, MaskVT); 
+
     return Cost;
   }
 
diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp
index 867403d0ef1..6e4eb8ff0cd 100644
--- a/lib/Analysis/TargetTransformInfo.cpp
+++ b/lib/Analysis/TargetTransformInfo.cpp
@@ -519,9 +519,12 @@ int TargetTransformInfo::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
 
 int TargetTransformInfo::getInterleavedMemoryOpCost(
     unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
-    unsigned Alignment, unsigned AddressSpace, bool IsMasked) const {
-  int Cost = TTIImpl->getInterleavedMemoryOpCost(
-      Opcode, VecTy, Factor, Indices, Alignment, AddressSpace, IsMasked);
+    unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond,
+    bool UseMaskForGaps) const {
+  int Cost = TTIImpl->getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                                 Alignment, AddressSpace,
+                                                 UseMaskForCond,
+                                                 UseMaskForGaps);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
diff --git a/lib/Analysis/VectorUtils.cpp b/lib/Analysis/VectorUtils.cpp
index 8b6702c8544..38dca50e82a 100644
--- a/lib/Analysis/VectorUtils.cpp
+++ b/lib/Analysis/VectorUtils.cpp
@@ -504,6 +504,25 @@ Instruction *llvm::propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
   return Inst;
 }
 
+Constant *llvm::createBitMaskForGaps(IRBuilder<> &Builder, unsigned VF,
+                                           const InterleaveGroup &Group) {
+  // All 1's means mask is not needed.
+  if (Group.getNumMembers() == Group.getFactor())
+    return nullptr;
+
+  // TODO: support reversed access.
+  assert(!Group.isReverse() && "Reversed group not supported.");
+
+  SmallVector<Constant *, 16> Mask;
+  for (unsigned i = 0; i < VF; i++)
+    for (unsigned j = 0; j < Group.getFactor(); ++j) {
+      unsigned HasMember = Group.getMember(j) ? 1 : 0;
+      Mask.push_back(Builder.getInt1(HasMember));
+    }
+
+  return ConstantVector::get(Mask);
+}
+
 Constant *llvm::createReplicatedMask(IRBuilder<> &Builder, 
                                      unsigned ReplicationFactor, unsigned VF) {
   SmallVector<Constant *, 16> MaskVec;
@@ -935,9 +954,10 @@ void InterleavedAccessInfo::invalidateGroupsRequiringScalarEpilogue() {
   }
   for (auto *Ptr : DelSet) {
     LLVM_DEBUG(
-        dbgs() 
+        dbgs()
         << "LV: Invalidate candidate interleaved group due to gaps that "
-           "require a scalar epilogue.\n");
+           "require a scalar epilogue (not allowed under optsize) and cannot "
+           "be masked (not enabled). \n");
     releaseGroup(Ptr);
   }
 
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 77c83970f68..a256cb7c921 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -660,11 +660,13 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                                ArrayRef<unsigned> Indices,
                                                unsigned Alignment,
                                                unsigned AddressSpace,
-                                               bool IsMasked) {
+                                               bool UseMaskForCond,
+                                               bool UseMaskForGaps) {
   assert(Factor >= 2 && "Invalid interleave factor");
   assert(isa<VectorType>(VecTy) && "Expect a vector type");
 
-  if (!IsMasked && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
+  if (!UseMaskForCond && !UseMaskForGaps && 
+      Factor <= TLI->getMaxSupportedInterleaveFactor()) {
     unsigned NumElts = VecTy->getVectorNumElements();
     auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
 
@@ -677,7 +679,8 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
   }
 
   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                           Alignment, AddressSpace, IsMasked);
+                                           Alignment, AddressSpace,
+                                           UseMaskForCond, UseMaskForGaps);
 }
 
 int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h
index b3893d32850..08c1a892422 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -146,7 +146,9 @@ public:
 
   int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
                                  ArrayRef<unsigned> Indices, unsigned Alignment,
-                                 unsigned AddressSpace, bool IsMasked = false);
+                                 unsigned AddressSpace,
+                                 bool UseMaskForCond = false,
+                                 bool UseMaskForGaps = false);
 
   bool
   shouldConsiderAddressTypePromotion(const Instruction &I,
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp
index a07c1e83a3f..f72bb8632eb 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -564,7 +564,8 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                            ArrayRef<unsigned> Indices,
                                            unsigned Alignment,
                                            unsigned AddressSpace,
-                                           bool IsMasked) {
+                                           bool UseMaskForCond,
+                                           bool UseMaskForGaps) {
   assert(Factor >= 2 && "Invalid interleave factor");
   assert(isa<VectorType>(VecTy) && "Expect a vector type");
 
@@ -572,7 +573,7 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
   bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
 
   if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
-      !IsMasked) {
+      !UseMaskForCond && !UseMaskForGaps) {
     unsigned NumElts = VecTy->getVectorNumElements();
     auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
 
@@ -585,7 +586,8 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
   }
 
   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                           Alignment, AddressSpace, IsMasked);
+                                           Alignment, AddressSpace,
+                                           UseMaskForCond, UseMaskForGaps);
 }
 
 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.h b/lib/Target/ARM/ARMTargetTransformInfo.h
index 84e3055c6bc..2dd143d48a1 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -169,7 +169,9 @@ public:
 
   int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
                                  ArrayRef<unsigned> Indices, unsigned Alignment,
-                                 unsigned AddressSpace, bool IsMasked);
+                                 unsigned AddressSpace,
+                                 bool UseMaskForCond = false,
+                                 bool UseMaskForGaps = false);
 
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP);
diff --git a/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
index 79b269bccfe..4c671460c90 100644
--- a/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -206,10 +206,12 @@ unsigned HexagonTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
 
 unsigned HexagonTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode,
       Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
-      unsigned Alignment, unsigned AddressSpace, bool IsMasked) {
-  if (Indices.size() != Factor || IsMasked)
+      unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond,
+      bool UseMaskForGaps) {
+  if (Indices.size() != Factor || UseMaskForCond || UseMaskForGaps)
     return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                             Alignment, AddressSpace, IsMasked);
+                                             Alignment, AddressSpace,
+                                             UseMaskForCond, UseMaskForGaps);
   return getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, nullptr);
 }
 
diff --git a/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/lib/Target/Hexagon/HexagonTargetTransformInfo.h
index 901a91692e8..5c6f85584ec 100644
--- a/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -123,7 +123,8 @@ public:
             bool VariableMask, unsigned Alignment);
   unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
             unsigned Factor, ArrayRef<unsigned> Indices, unsigned Alignment,
-            unsigned AddressSpace, bool IsMasked);
+            unsigned AddressSpace, bool UseMaskForCond = false,
+            bool UseMaskForGaps = false);
   unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
             const Instruction *I);
   unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index f67bacc87ec..bc9bcab83a0 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -474,10 +474,12 @@ int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                            ArrayRef<unsigned> Indices,
                                            unsigned Alignment,
                                            unsigned AddressSpace,
-                                           bool IsMasked) {
-  if (IsMasked)
+                                           bool UseMaskForCond,
+                                           bool UseMaskForGaps) {
+  if (UseMaskForCond || UseMaskForGaps)
     return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                             Alignment, AddressSpace, IsMasked);
+                                             Alignment, AddressSpace,
+                                             UseMaskForCond, UseMaskForGaps);
 
   assert(isa<VectorType>(VecTy) &&
          "Expect a vector type for interleaved memory op");
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.h b/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 252d46e7a2a..9221a910288 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -91,7 +91,8 @@ public:
                                  ArrayRef<unsigned> Indices,
                                  unsigned Alignment,
                                  unsigned AddressSpace,
-                                 bool IsMasked = false);
+                                 bool UseMaskForCond = false,
+                                 bool UseMaskForGaps = false);
 
   /// @}
 };
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index caa3f597445..94db56e3738 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -969,10 +969,12 @@ int SystemZTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                                ArrayRef<unsigned> Indices,
                                                unsigned Alignment,
                                                unsigned AddressSpace,
-                                               bool IsMasked) {
-  if (IsMasked)
+                                               bool UseMaskForCond,
+                                               bool UseMaskForGaps) {
+  if (UseMaskForCond || UseMaskForGaps)
     return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                             Alignment, AddressSpace, IsMasked);
+                                             Alignment, AddressSpace,
+                                             UseMaskForCond, UseMaskForGaps);
   assert(isa<VectorType>(VecTy) &&
          "Expect a vector type for interleaved memory op");
 
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index 347a8a632f0..406f075c8a6 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -93,7 +93,9 @@ public:
                                  unsigned Factor,
                                  ArrayRef<unsigned> Indices,
                                  unsigned Alignment,
-                                 unsigned AddressSpace, bool IsMasked = false);
+                                 unsigned AddressSpace,
+                                 bool UseMaskForCond = false,
+                                 bool UseMaskForGaps = false);
   /// @}
 };
 
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 8d8bc0b35cb..ebb8aca5fb1 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2784,11 +2784,13 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
                                                ArrayRef<unsigned> Indices,
                                                unsigned Alignment,
                                                unsigned AddressSpace,
-                                               bool IsMasked) {
+                                               bool UseMaskForCond,
+                                               bool UseMaskForGaps) {
 
-  if (IsMasked)
+  if (UseMaskForCond || UseMaskForGaps)
     return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                             Alignment, AddressSpace, IsMasked);
+                                             Alignment, AddressSpace,
+                                             UseMaskForCond, UseMaskForGaps);
 
   // We currently Support only fully-interleaved groups, with no gaps.
   // TODO: Support also strided loads (interleaved-groups with gaps).
@@ -2898,11 +2900,13 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
                                                  ArrayRef<unsigned> Indices,
                                                  unsigned Alignment,
                                                  unsigned AddressSpace,
-                                                 bool IsMasked) {
+                                                 bool UseMaskForCond,
+                                                 bool UseMaskForGaps) {
 
-  if (IsMasked)
+  if (UseMaskForCond || UseMaskForGaps)
     return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                             Alignment, AddressSpace, IsMasked);
+                                             Alignment, AddressSpace,
+                                             UseMaskForCond, UseMaskForGaps);
 
   // VecTy for interleave memop is <VF*Factor x Elt>.
   // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
@@ -3021,7 +3025,8 @@ int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                            ArrayRef<unsigned> Indices,
                                            unsigned Alignment,
                                            unsigned AddressSpace,
-                                           bool IsMasked) {
+                                           bool UseMaskForCond,
+                                           bool UseMaskForGaps) {
   auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) {
     Type *EltTy = VecTy->getVectorElementType();
     if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
@@ -3033,11 +3038,14 @@ int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
   };
   if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
     return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
-                                            Alignment, AddressSpace, IsMasked);
+                                            Alignment, AddressSpace,
+                                            UseMaskForCond, UseMaskForGaps);
   if (ST->hasAVX2())
     return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices,
-                                          Alignment, AddressSpace, IsMasked);
+                                          Alignment, AddressSpace,
+                                          UseMaskForCond, UseMaskForGaps);
 
   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                           Alignment, AddressSpace, IsMasked);
+                                           Alignment, AddressSpace,
+                                           UseMaskForCond, UseMaskForGaps);
 }
diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h
index 2bd778a4211..1637592c81f 100644
--- a/lib/Target/X86/X86TargetTransformInfo.h
+++ b/lib/Target/X86/X86TargetTransformInfo.h
@@ -102,15 +102,18 @@ public:
   int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                  unsigned Factor, ArrayRef<unsigned> Indices,
                                  unsigned Alignment, unsigned AddressSpace,
-                                 bool IsMasked = false);
+                                 bool UseMaskForCond = false,
+                                 bool UseMaskForGaps = false);
   int getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
                                  unsigned Factor, ArrayRef<unsigned> Indices,
                                  unsigned Alignment, unsigned AddressSpace,
-                                 bool IsMasked = false);
+                                 bool UseMaskForCond = false,
+                                 bool UseMaskForGaps = false);
   int getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
                                  unsigned Factor, ArrayRef<unsigned> Indices,
                                  unsigned Alignment, unsigned AddressSpace,
-                                 bool IsMasked = false);
+                                 bool UseMaskForCond = false,
+                                 bool UseMaskForGaps = false);
 
   int getIntImmCost(int64_t);
 
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index ffa6b242e00..23d4a6b2166 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -172,6 +172,8 @@ static cl::opt<bool> EnableInterleavedMemAccesses(
     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
 
+/// An interleave-group may need masking if it resides in a block that needs
+/// predication, or in order to mask away gaps. 
 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
@@ -1134,11 +1136,15 @@ public:
   }
 
   /// Returns true if an interleaved group requires a scalar iteration
-  /// to handle accesses with gaps.
+  /// to handle accesses with gaps, and there is nothing preventing us from
+  /// creating a scalar epilogue.
   bool requiresScalarEpilogue() const {
-    return InterleaveInfo.requiresScalarEpilogue();
+    return IsScalarEpilogueAllowed && InterleaveInfo.requiresScalarEpilogue();
   }
 
+  /// Returns true if a scalar epilogue is not allowed due to optsize.
+  bool isScalarEpilogueAllowed() const { return IsScalarEpilogueAllowed; }
+
   /// Returns true if all loop blocks should be masked to fold tail loop.
   bool foldTailByMasking() const { return FoldTailByMasking; }
 
@@ -1229,6 +1235,15 @@ private:
   /// vectorization as a predicated block.
   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
 
+  /// Records whether it is allowed to have the original scalar loop execute at
+  /// least once. This may be needed as a fallback loop in case runtime 
+  /// aliasing/dependence checks fail, or to handle the tail/remainder
+  /// iterations when the trip count is unknown or doesn't divide by the VF,
+  /// or as a peel-loop to handle gaps in interleave-groups.
+  /// Under optsize and when the trip count is very small we don't allow any
+  /// iterations to execute in the scalar loop.
+  bool IsScalarEpilogueAllowed = true;
+
   /// All blocks of loop are to be masked to fold tail of scalar iterations.
   bool FoldTailByMasking = false;
 
@@ -1938,6 +1953,17 @@ Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
                                      "reverse");
 }
 
+// Return whether we allow using masked interleave-groups (for dealing with
+// strided loads/stores that reside in predicated blocks, or for dealing
+// with gaps).
+static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
+  // If an override option has been passed in for interleaved accesses, use it.
+  if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
+    return EnableMaskedInterleavedMemAccesses;
+
+  return TTI.enableMaskedInterleavedAccessVectorization();
+}
+
 // Try to vectorize the interleave group that \p Instr belongs to.
 //
 // E.g. Translate following interleaved load group (factor = 3):
@@ -1990,12 +2016,12 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
   unsigned Index = Group->getIndex(Instr);
 
   VectorParts Mask;
-  bool IsMaskRequired = BlockInMask;
-  if (IsMaskRequired) {
+  bool IsMaskForCondRequired = BlockInMask;
+  if (IsMaskForCondRequired) {
     Mask = *BlockInMask;
     // TODO: extend the masked interleaved-group support to reversed access.
     assert(!Group->isReverse() && "Reversed masked interleave-group "
-                                  "not supported."); 
+                                  "not supported.");
   }
 
   // If the group is reverse, adjust the index to refer to the last vector lane
@@ -2036,20 +2062,35 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
   setDebugLocFromInst(Builder, Instr);
   Value *UndefVec = UndefValue::get(VecTy);
 
+  Value *MaskForGaps = nullptr;
+  if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
+    MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
+    assert(MaskForGaps && "Mask for Gaps is required but it is null");
+  }
+
   // Vectorize the interleaved load group.
   if (isa<LoadInst>(Instr)) {
     // For each unroll part, create a wide load for the group.
     SmallVector<Value *, 2> NewLoads;
     for (unsigned Part = 0; Part < UF; Part++) {
       Instruction *NewLoad;
-      if (IsMaskRequired) {
-        auto *Undefs = UndefValue::get(Mask[Part]->getType());
-        auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
-        Value *ShuffledMask = Builder.CreateShuffleVector(
-            Mask[Part], Undefs, RepMask, "interleaved.mask");
-        NewLoad = Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(), 
-                                           ShuffledMask, UndefVec,
-                                           "wide.masked.vec");
+      if (IsMaskForCondRequired || MaskForGaps) {
+        assert(useMaskedInterleavedAccesses(*TTI) &&
+               "masked interleaved groups are not allowed.");
+        Value *GroupMask = MaskForGaps;
+        if (IsMaskForCondRequired) {
+          auto *Undefs = UndefValue::get(Mask[Part]->getType());
+          auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
+          Value *ShuffledMask = Builder.CreateShuffleVector(
+              Mask[Part], Undefs, RepMask, "interleaved.mask");
+          GroupMask = MaskForGaps
+                          ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
+                                                MaskForGaps)
+                          : ShuffledMask;
+        }
+        NewLoad =
+            Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
+                                     GroupMask, UndefVec, "wide.masked.vec");
       }
       else
         NewLoad = Builder.CreateAlignedLoad(NewPtrs[Part], 
@@ -2121,7 +2162,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
                                               "interleaved.vec");
 
     Instruction *NewStoreInstr;
-    if (IsMaskRequired) {
+    if (IsMaskForCondRequired) {
       auto *Undefs = UndefValue::get(Mask[Part]->getType());
       auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
       Value *ShuffledMask = Builder.CreateShuffleVector(
@@ -4333,29 +4374,32 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigne
   return false;
 }
 
-static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
-  if (!(EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0))
-    return TTI.enableMaskedInterleavedAccessVectorization();
-
-  // If an override option has been passed in for interleaved accesses, use it.
-  return EnableMaskedInterleavedMemAccesses;
-}
-
 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
                                                                unsigned VF) {
   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
   assert(getWideningDecision(I, VF) == CM_Unknown &&
          "Decision should not be set yet.");
-
-  if (!Legal->blockNeedsPredication(I->getParent()) ||
-      !Legal->isMaskRequired(I))
+  auto *Group = getInterleavedAccessGroup(I);
+  assert(Group && "Must have a group.");
+
+  // Check if masking is required.
+  // A Group may need masking for one of two reasons: it resides in a block that
+  // needs predication, or it was decided to use masking to deal with gaps.
+  bool PredicatedAccessRequiresMasking = 
+      Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
+  bool AccessWithGapsRequiresMasking = 
+      Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed;
+  if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
     return true;
 
-  if (!useMaskedInterleavedAccesses(TTI))
-    return false;
+  // If masked interleaving is required, we expect that the user/target had
+  // enabled it, because otherwise it either wouldn't have been created or
+  // it should have been invalidated by the CostModel.
+  assert(useMaskedInterleavedAccesses(TTI) &&
+         "Masked interleave-groups for predicated accesses are not enabled.");
 
   auto *Ty = getMemInstValueType(I);
-  return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty) 
+  return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty)
                           : TTI.isLegalMaskedStore(Ty);
 }
 
@@ -4606,9 +4650,13 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
   // Record that scalar epilogue is not allowed.
   LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
 
+  IsScalarEpilogueAllowed = !OptForSize;
+
   // We don't create an epilogue when optimizing for size.
-  // Invalidate interleave groups that require an epilogue.
-  InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
+  // Invalidate interleave groups that require an epilogue if we can't mask
+  // the interleave-group.
+  if (!useMaskedInterleavedAccesses(TTI)) 
+    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
 
   unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC);
 
@@ -5495,13 +5543,15 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
   }
 
   // Calculate the cost of the whole interleaved group.
+  bool UseMaskForGaps = 
+      Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed;
   unsigned Cost = TTI.getInterleavedMemoryOpCost(
       I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
-      Group->getAlignment(), AS, Legal->isMaskRequired(I));
+      Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
 
   if (Group->isReverse()) {
     // TODO: Add support for reversed masked interleaved access.
-    assert(!Legal->isMaskRequired(I) && 
+    assert(!Legal->isMaskRequired(I) &&
            "Reverse masked interleaved access not supported.");
     Cost += Group->getNumMembers() *
             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
diff --git a/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
index a2304e447f5..0b23d6286f9 100644
--- a/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
+++ b/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
@@ -84,26 +84,39 @@ for.end:
 
 ; Exactly the same scenario except we are now optimizing for size, therefore
 ; we check that no scalar epilogue is created. Since we can't create an epilog
-; the interleave-group is invalidated because is has gaps, so we end up
-; scalarizing.
-; (Before the fix that this test checks, we used to create an epilogue despite
-; optsize, and vectorized the access as an interleaved-group. This is now fixed,
-; and we make sure that a scalar epilogue does not exist).
+; we need the ability to mask out the gaps.
+; When enable-masked-interleaved-access is enabled, the interleave-groups will
+; be vectorized with masked wide-loads with the mask properly shuffled and
+; And-ed with the gaps mask.
 
 ;ENABLED_MASKED_STRIDED-LABEL: @masked_strided1_optsize(
-;ENABLED_MASKED_STRIDED: vector.body:
-;ENABLED_MASKED_STRIDED-NEXT:  %index = phi i32 
-;ENABLED_MASKED_STRIDED-NEXT:  %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-;ENABLED_MASKED_STRIDED-NOT:   %interleaved.mask = 
-;ENABLED_MASKED_STRIDED-NOT:   call <16 x i8> @llvm.masked.load.v16i8.p0v16i8
-;ENABLED_MASKED_STRIDED:       %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
-;ENABLED_MASKED_STRIDED-NEXT:  %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-;ENABLED_MASKED_STRIDED-NEXT:  %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0
-;ENABLED_MASKED_STRIDED-NEXT:  br i1 %[[M]], label %pred.load.if, label %pred.load.continue
-;ENABLED_MASKED_STRIDED-NOT:   %interleaved.mask = 
-;ENABLED_MASKED_STRIDED-NOT:   call <16 x i8> @llvm.masked.load.v16i8.p0v16i8
-;ENABLED_MASKED_STRIDED-NOT: for.body:
-;ENABLED_MASKED_STRIDED:     for.end:
+;ENABLED_MASKED_STRIDED-NEXT:  entry:
+;ENABLED_MASKED_STRIDED-NEXT:    [[CONV:%.*]] = zext i8 [[GUARD:%.*]] to i32
+;ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0
+;ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
+;ENABLED_MASKED_STRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
+;ENABLED_MASKED_STRIDED:       vector.body:
+;ENABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+;ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+;ENABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+;ENABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1
+;ENABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]]
+;ENABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
+;ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP0]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+;ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>
+;ENABLED_MASKED_STRIDED-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[TMP4]], <16 x i8> undef)
+;ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
+;ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <8 x i8>*
+;ENABLED_MASKED_STRIDED-NEXT:    call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP6]], i32 1, <8 x i1> [[TMP0]])
+;ENABLED_MASKED_STRIDED-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+;ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+;ENABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
+;ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP7]]
+;ENABLED_MASKED_STRIDED-NOT:   for.body:
+;ENABLED_MASKED_STRIDED:       for.end:
+;ENABLED_MASKED_STRIDED-NEXT:    ret void
+
 
 define dso_local void @masked_strided1_optsize(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr optsize {
 entry:
@@ -138,12 +151,15 @@ for.end:
 ; remainder loop into the main loop using masking) together with interleaved-
 ; groups.
 ; When masked-interleave-group is disabled the interleave-groups will be
-; invalidated during Legality checks;
-; When masked-interleave-group is enabled the interleave-groups will be
-; invalidated during cost-model checks, because we don't have a way to support
-; interleave-groups with gaps that require an epilogue using masking.
-; So in both cases we check for no epilogue and scalarized conditional accesses.
-
+; invalidated during Legality checks; So there we check for no epilogue
+; and for scalarized conditional accesses.
+; When masked-interleave-group is enabled we check that there is no epilogue,
+; and that the interleave-groups are vectorized using proper masking (with
+; shuffling of the mask feeding the wide masked load/store).
+; The mask itself is an And of two masks: one that masks away the remainder
+; iterations, and one that masks away the 'else' of the 'if' statement.
+; The shuffled mask is also And-ed with the gaps mask.
+;
 ; void masked_strided1_optsize_unknown_tc(const unsigned char* restrict p,
 ;                      unsigned char* restrict q,
 ;                      unsigned char guard,
@@ -178,21 +194,39 @@ for.end:
 
 
 ; ENABLED_MASKED_STRIDED-LABEL: @masked_strided1_optsize_unknown_tc(
+; ENABLED_MASKED_STRIDED-NEXT:  entry:
+; ENABLED_MASKED_STRIDED-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[CMP9]], label [[VECTOR_PH:%.*]], label [[FOR_END:%.*]]
+; ENABLED_MASKED_STRIDED:       vector.ph:
+; ENABLED_MASKED_STRIDED-NEXT:    [[CONV:%.*]] = zext i8 [[GUARD:%.*]] to i32
+; ENABLED_MASKED_STRIDED-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 7
+; ENABLED_MASKED_STRIDED-NEXT:    [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
+; ENABLED_MASKED_STRIDED-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1
+; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
+; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> undef, <8 x i32> zeroinitializer
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; ENABLED_MASKED_STRIDED:       vector.body:
-; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi 
-; ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], {{.*}}
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}}
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = and <8 x i1> [[TMP0]], [[TMP2]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0
-; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; ENABLED_MASKED_STRIDED:       pred.load.if:
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP5]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = load i8, i8* [[TMP6]], align 1
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP8:%.*]] = insertelement <8 x i8> undef, i8 [[TMP7]], i32 0
-; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = and <8 x i1> [[TMP0]], [[TMP3]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>
+; ENABLED_MASKED_STRIDED-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP5]], i32 1, <16 x i1> [[TMP6]], <16 x i8> undef)
+; ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <8 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT:    call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP8]], i32 1, <8 x i1> [[TMP4]])
+; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+; ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP9]], label [[FOR_END]], label [[VECTOR_BODY]]
 ; ENABLED_MASKED_STRIDED-NOT:   for.body:
 ; ENABLED_MASKED_STRIDED:       for.end:
 ; ENABLED_MASKED_STRIDED-NEXT:    ret void
@@ -231,17 +265,115 @@ for.end:
   ret void
 }
 
+; Same, with stride 3. This is to check the gaps-mask and the shuffled mask
+; with a different stride.
+; So accesses are with gaps under Optsize scenario again, with unknown trip-
+; count, in order to check the behavior of folding-the-tail (folding the
+; remainder loop into the main loop using masking) together with interleaved-
+; groups.
+; When masked-interleave-group is enabled we check that there is no epilogue,
+; and that the interleave-groups are vectorized using proper masking (with
+; shuffling of the mask feeding the wide masked load/store).
+; The mask itself is an And of two masks: one that masks away the remainder
+; iterations, and one that masks away the 'else' of the 'if' statement.
+; The shuffled mask is also And-ed with the gaps mask.
+;
+; void masked_strided3_optsize_unknown_tc(const unsigned char* restrict p,
+;                      unsigned char* restrict q,
+;                      unsigned char guard,
+;                      int n) {
+;   for(ix=0; ix < n; ++ix) {
+;     if (ix > guard) {
+;         char t = p[3*ix];
+;         q[ix] = t;
+;     }
+;   }
+; }
+
+
+; ENABLED_MASKED_STRIDED-LABEL: @masked_strided3_optsize_unknown_tc(
+; ENABLED_MASKED_STRIDED-NEXT:  entry:
+; ENABLED_MASKED_STRIDED-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[CMP9]], label [[VECTOR_PH:%.*]], label [[FOR_END:%.*]]
+; ENABLED_MASKED_STRIDED:       vector.ph:
+; ENABLED_MASKED_STRIDED-NEXT:    [[CONV:%.*]] = zext i8 [[GUARD:%.*]] to i32
+; ENABLED_MASKED_STRIDED-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 7
+; ENABLED_MASKED_STRIDED-NEXT:    [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
+; ENABLED_MASKED_STRIDED-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1
+; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
+; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> undef, <8 x i32> zeroinitializer
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
+; ENABLED_MASKED_STRIDED:       vector.body:
+; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = mul nsw i32 [[INDEX]], 3
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = and <8 x i1> [[TMP0]], [[TMP3]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to <24 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = and <24 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false>
+; ENABLED_MASKED_STRIDED-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <24 x i8> @llvm.masked.load.v24i8.p0v24i8(<24 x i8>* [[TMP5]], i32 1, <24 x i1> [[TMP6]], <24 x i8> undef)
+; ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <24 x i8> [[WIDE_MASKED_VEC]], <24 x i8> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <8 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT:    call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP8]], i32 1, <8 x i1> [[TMP4]])
+; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+; ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP9]], label [[FOR_END]], label [[VECTOR_BODY]]
+; ENABLED_MASKED_STRIDED:       for.end:
+; ENABLED_MASKED_STRIDED-NEXT:    ret void
+;
+define dso_local void @masked_strided3_optsize_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard, i32 %n) local_unnamed_addr optsize {
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+  %conv = zext i8 %guard to i32
+  br label %for.body
+
+for.body:
+  %ix.010 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
+  %cmp1 = icmp ugt i32 %ix.010, %conv
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %mul = mul nsw i32 %ix.010, 3
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.010
+  store i8 %0, i8* %arrayidx3, align 1
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %ix.010, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
 
-; Same, but the load/store are not predicated. The interleave-group is
-; invalidated here as well because we have gaps and we can't create an epilog.
-; The access is thus scalarized.
+; Back to stride 2 with gaps with a known trip count under opt for size,
+; but this time the load/store are not predicated. 
+; When enable-masked-interleaved-access is disabled, the interleave-groups will
+; be invalidated during cost-model checks because we have gaps and we can't
+; create an epilog. The access is thus scalarized.
 ; (Before the fix that this test checks, we used to create an epilogue despite
 ; optsize, and vectorized the access as an interleaved-group. This is now fixed,
 ; and we make sure that a scalar epilogue does not exist).
-; Since enable-masked-interleaved-accesses currently only affects predicated
-; accesses, the behavior is the same with this switch set/unset.
-
-
+; When enable-masked-interleaved-access is enabled, the interleave-groups will
+; be vectorized with masked wide-loads (masking away the gaps).
+;
 ; void unconditional_strided1_optsize(const unsigned char* restrict p,
 ;                                unsigned char* restrict q,
 ;                                unsigned char guard) {
@@ -259,11 +391,25 @@ for.end:
 ;DISABLED_MASKED_STRIDED:     for.end:
 
 ;ENABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize(
-;ENABLED_MASKED_STRIDED: vector.body:
-;ENABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.v16i8.p0v16i8
-;ENABLED_MASKED_STRIDED:     %{{.*}} = extractelement <8 x i32> %{{.*}}, i32 0       
-;ENABLED_MASKED_STRIDED-NOT: for.body:
-;ENABLED_MASKED_STRIDED:     for.end:
+;ENABLED_MASKED_STRIDED-NEXT:  entry:
+;ENABLED_MASKED_STRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
+;ENABLED_MASKED_STRIDED:       vector.body:
+;ENABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+;ENABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = shl nuw nsw i32 [[INDEX]], 1
+;ENABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP0]]
+;ENABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <16 x i8>*
+;ENABLED_MASKED_STRIDED-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP2]], i32 1, <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <16 x i8> undef)
+;ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;ENABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
+;ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <8 x i8>*
+;ENABLED_MASKED_STRIDED-NEXT:    store <8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP4]], align 1
+;ENABLED_MASKED_STRIDED-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+;ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
+;ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]]
+;ENABLED_MASKED_STRIDED-NOT:   for.body:
+;ENABLED_MASKED_STRIDED:       for.end:
+;ENABLED_MASKED_STRIDED-NEXT:    ret void
+
 
 define dso_local void @unconditional_strided1_optsize(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr optsize {
 entry:
@@ -289,13 +435,17 @@ for.end:
 ; Unconditioal accesses with gaps under Optsize scenario again, with unknown
 ; trip-count this time, in order to check the behavior of folding-the-tail 
 ; (folding the remainder loop into the main loop using masking) together with
-; interleaved-groups.
-; The interleave-groups will be invalidated during cost-model checks, because
-; we don't have a way to support interleave-groups with gaps that require an
-; epilogue using masking (even when interleaved-masking is enabled; this
-; is not yet supported).
-; So we check for no epilogue and for scalarized conditional accesses.
-
+; interleaved-groups. Folding-the-tail turns the accesses to conditional which
+; requires proper masking. In addition we need to mask out the gaps (all
+; because we are not allowed to use an epilog due to optsize).
+; When enable-masked-interleaved-access is disabled, the interleave-groups will
+; be invalidated during cost-model checks. So there we check for no epilogue
+; and for scalarized conditional accesses.
+; When masked-interleave-group is enabled we check that there is no epilogue,
+; and that the interleave-groups are vectorized using proper masking (with
+; shuffling of the mask feeding the wide masked load/store).
+; The shuffled mask is also And-ed with the gaps mask.
+;
 ;   for(ix=0; ix < n; ++ix) {
 ;         char t = p[2*ix];
 ;         q[ix] = t;
@@ -319,21 +469,36 @@ for.end:
 ; DISABLED_MASKED_STRIDED:       for.end:
 ; DISABLED_MASKED_STRIDED-NEXT:    ret void
 
-
 ; ENABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize_unknown_tc(
+; ENABLED_MASKED_STRIDED-NEXT:  entry:
+; ENABLED_MASKED_STRIDED-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[CMP6]], label [[VECTOR_PH:%.*]], label [[FOR_END:%.*]]
+; ENABLED_MASKED_STRIDED:       vector.ph:
+; ENABLED_MASKED_STRIDED-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 7
+; ENABLED_MASKED_STRIDED-NEXT:    [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
+; ENABLED_MASKED_STRIDED-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1
+; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> undef, <8 x i32> zeroinitializer
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; ENABLED_MASKED_STRIDED:       vector.body:
-; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 
-; ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}}
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
-; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; ENABLED_MASKED_STRIDED:       pred.load.if:
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = extractelement <8 x i32> [[TMP0]], i32 0
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP3]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = load i8, i8* [[TMP4]], align 1
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = insertelement <8 x i8> undef, i8 [[TMP5]], i32 0
-; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[INDEX]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
+; ENABLED_MASKED_STRIDED-NEXT:    [[INDUCTION:%.*]] = add <8 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = shl nuw nsw i32 [[INDEX]], 1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP0]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = icmp ule <8 x i32> [[INDUCTION]], [[BROADCAST_SPLAT2]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP1]] to <16 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>
+; ENABLED_MASKED_STRIDED-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[TMP4]], <16 x i8> undef)
+; ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <8 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT:    call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP6]], i32 1, <8 x i1> [[TMP2]])
+; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP7]], label [[FOR_END]], label [[VECTOR_BODY]]
 ; ENABLED_MASKED_STRIDED-NOT:   for.body:
 ; ENABLED_MASKED_STRIDED:       for.end:
 ; ENABLED_MASKED_STRIDED-NEXT:    ret void
-- 
GitLab


From b1ebaf21d0d719dd21ca85927195ef0629687553 Mon Sep 17 00:00:00 2001
From: Max Kazantsev <max.kazantsev@azul.com>
Date: Wed, 31 Oct 2018 10:30:50 +0000
Subject: [PATCH 0792/1116] [IndVars] Strengthen restricton in
 rewriteLoopExitValues

For some unclear reason rewriteLoopExitValues considers recalculation
after the loop profitable if it has some "soft uses" outside the loop (i.e. any
use other than call and return), even if we have proved that it has a user inside
the loop which we think will not be optimized away.

There is no existing unit test that would explain this. This patch provides an
example when rematerialisation of exit value is not profitable but it passes
this check due to presence of a "soft use" outside the loop.

It makes no sense to recalculate value on exit if we are going to compute it
due to some irremovable within the loop. This patch disallows applying this
transform in the described situation.

Differential Revision: https://reviews.llvm.org/D51581
Reviewed By: etherzhhb


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345708 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/IndVarSimplify.cpp      | 35 ++++---------------
 .../IndVarSimplify/dont-recompute.ll          | 26 ++++++++++++++
 2 files changed, 33 insertions(+), 28 deletions(-)

diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index c35478e220b..ec51ad71abc 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -595,41 +595,20 @@ bool IndVarSimplify::rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {
             !isSafeToExpand(ExitValue, *SE))
           continue;
 
-        // Computing the value outside of the loop brings no benefit if :
-        //  - it is definitely used inside the loop in a way which can not be
-        //    optimized away.
-        //  - no use outside of the loop can take advantage of hoisting the
-        //    computation out of the loop
+        // Computing the value outside of the loop brings no benefit if it is
+        // definitely used inside the loop in a way which can not be optimized
+        // away.
         if (ExitValue->getSCEVType()>=scMulExpr) {
           bool HasHardInternalUses = false;
-          bool HasSoftExternalUses = false;
           for (auto *IB : Inst->users()) {
             Instruction *UseInstr = cast<Instruction>(IB);
             unsigned Opc = UseInstr->getOpcode();
-            if (L->contains(UseInstr)) {
-              if (Opc == Instruction::Call)
-                HasHardInternalUses = true;
-            } else {
-              if (Opc == Instruction::PHI) {
-                // Do not count the Phi as a use. LCSSA may have inserted
-                // plenty of trivial ones.
-                for (auto *PB : UseInstr->users()) {
-                  unsigned PhiOpc = cast<Instruction>(PB)->getOpcode();
-                  if (PhiOpc != Instruction::Call &&
-                      PhiOpc != Instruction::Ret) {
-                    HasSoftExternalUses = true;
-                    break;
-                  }
-                }
-                continue;
-              }
-              if (Opc != Instruction::Call && Opc != Instruction::Ret) {
-                HasSoftExternalUses = true;
-                break;
-              }
+            if (L->contains(UseInstr) && Opc == Instruction::Call) {
+              HasHardInternalUses = true;
+              break;
             }
           }
-          if (HasHardInternalUses && !HasSoftExternalUses)
+          if (HasHardInternalUses)
             continue;
         }
 
diff --git a/test/Transforms/IndVarSimplify/dont-recompute.ll b/test/Transforms/IndVarSimplify/dont-recompute.ll
index 713a55154ba..c87cd6596c6 100644
--- a/test/Transforms/IndVarSimplify/dont-recompute.ll
+++ b/test/Transforms/IndVarSimplify/dont-recompute.ll
@@ -97,3 +97,29 @@ for.end:                                          ; preds = %for.body
   tail call void @func(i32 %add)
   ret void
 }
+
+; CHECK-LABEL: @test4(
+define void @test4(i32 %m) nounwind uwtable {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.06 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %a.05 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %add = add i32 %a.05, %m
+; CHECK: tail call void @func(i32 %add)
+  tail call void @func(i32 %add)
+  %inc = add nsw i32 %i.06, 1
+  %exitcond = icmp eq i32 %inc, 186
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+; CHECK: for.end:
+; CHECK-NOT: mul i32 %m, 186
+; CHECK:%add.lcssa = phi i32 [ %add, %for.body ]
+; CHECK-NEXT: %soft_use = add i32 %add.lcssa, 123
+; CHECK-NEXT: tail call void @func(i32 %soft_use)
+  %soft_use = add i32 %add, 123
+  tail call void @func(i32 %soft_use)
+  ret void
+}
-- 
GitLab


From 5ab552691a86947ec745969a2b9a3d0aa3520a50 Mon Sep 17 00:00:00 2001
From: Neil Henning <neil.henning@amd.com>
Date: Wed, 31 Oct 2018 10:34:48 +0000
Subject: [PATCH 0793/1116] [AMDGPU] support image load/store a16

Our a16 support was only enabled for sample/gather and buffer
load/store, but not for image load/store operations (which take an i16
as the pixel index rather than a half).

Fix our isel lowering and add test cases to prove it out.

Differential Revision: https://reviews.llvm.org/D53750

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345710 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/SIISelLowering.cpp          |   6 +-
 .../AMDGPU/llvm.amdgcn.image.a16.dim.ll       | 530 ++++++++++++++++++
 .../AMDGPU/llvm.amdgcn.image.load.a16.d16.ll  | 128 +++++
 .../AMDGPU/llvm.amdgcn.image.load.a16.ll      | 128 +++++
 .../AMDGPU/llvm.amdgcn.image.store.a16.d16.ll | 140 +++++
 .../AMDGPU/llvm.amdgcn.image.store.a16.ll     | 128 +++++
 test/MC/AMDGPU/mimg.s                         |  88 +++
 7 files changed, 1146 insertions(+), 2 deletions(-)
 create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll
 create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.d16.ll
 create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.ll
 create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.d16.ll
 create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.ll

diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 13b92fc07a1..e41cf6e771b 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4726,9 +4726,11 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
   // Check for 16 bit addresses and pack if true.
   unsigned DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
   MVT VAddrVT = Op.getOperand(DimIdx).getSimpleValueType();
-  if (VAddrVT.getScalarType() == MVT::f16 &&
+  const MVT VAddrScalarVT = VAddrVT.getScalarType();
+  if (((VAddrScalarVT == MVT::f16) || (VAddrScalarVT == MVT::i16)) &&
       ST->hasFeature(AMDGPU::FeatureR128A16)) {
     IsA16 = true;
+    const MVT VectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
     for (unsigned i = AddrIdx; i < (AddrIdx + NumMIVAddrs); ++i) {
       SDValue AddrLo, AddrHi;
       // Push back extra arguments.
@@ -4747,7 +4749,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
           AddrHi = Op.getOperand(i + 1);
           i++;
         }
-        AddrLo = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f16,
+        AddrLo = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorVT,
                              {AddrLo, AddrHi});
         AddrLo = DAG.getBitcast(MVT::i32, AddrLo);
       }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll
new file mode 100644
index 00000000000..96f0210825c
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll
@@ -0,0 +1,530 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+
+; GCN-LABEL: {{^}}load_1d:
+; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm a16
+define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %s = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 15, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load_2d:
+; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm a16
+define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %s = extractelement <2 x i16> %coords, i32 0
+  %t = extractelement <2 x i16> %coords, i32 1
+  %v = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 15, i16 %s, i16 %t, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load_3d:
+; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16
+define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %s = extractelement <2 x i16> %coords_lo, i32 0
+  %t = extractelement <2 x i16> %coords_lo, i32 1
+  %r = extractelement <2 x i16> %coords_hi, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i16(i32 15, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load_cube:
+; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 da{{$}}
+define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %s = extractelement <2 x i16> %coords_lo, i32 0
+  %t = extractelement <2 x i16> %coords_lo, i32 1
+  %slice = extractelement <2 x i16> %coords_hi, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i16(i32 15, i16 %s, i16 %t, i16 %slice, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load_1darray:
+; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm a16 da{{$}}
+define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %s = extractelement <2 x i16> %coords, i32 0
+  %slice = extractelement <2 x i16> %coords, i32 1
+  %v = call <4 x float> @llvm.amdgcn.image.load.1darray.v4f32.i16(i32 15, i16 %s, i16 %slice, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load_2darray:
+; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 da{{$}}
+define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %s = extractelement <2 x i16> %coords_lo, i32 0
+  %t = extractelement <2 x i16> %coords_lo, i32 1
+  %slice = extractelement <2 x i16> %coords_hi, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.2darray.v4f32.i16(i32 15, i16 %s, i16 %t, i16 %slice, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load_2dmsaa:
+; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16
+define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %s = extractelement <2 x i16> %coords_lo, i32 0
+  %t = extractelement <2 x i16> %coords_lo, i32 1
+  %fragid = extractelement <2 x i16> %coords_hi, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i16(i32 15, i16 %s, i16 %t, i16 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load_2darraymsaa:
+; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 da{{$}}
+define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %s = extractelement <2 x i16> %coords_lo, i32 0
+  %t = extractelement <2 x i16> %coords_lo, i32 1
+  %slice = extractelement <2 x i16> %coords_hi, i32 0
+  %fragid = extractelement <2 x i16> %coords_hi, i32 1
+  %v = call <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i16(i32 15, i16 %s, i16 %t, i16 %slice, i16 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load_mip_1d:
+; GCN: image_load_mip v[0:3], v0, s[0:7] dmask:0xf unorm a16
+define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %s = extractelement <2 x i16> %coords, i32 0
+  %mip = extractelement <2 x i16> %coords, i32 1
+  %v = call <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i16(i32 15, i16 %s, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load_mip_2d:
+; GCN: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16
+define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %s = extractelement <2 x i16> %coords_lo, i32 0
+  %t = extractelement <2 x i16> %coords_lo, i32 1
+  %mip = extractelement <2 x i16> %coords_hi, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i16(i32 15, i16 %s, i16 %t, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load_mip_3d:
+; GCN: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16
+define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %s = extractelement <2 x i16> %coords_lo, i32 0
+  %t = extractelement <2 x i16> %coords_lo, i32 1
+  %r = extractelement <2 x i16> %coords_hi, i32 0
+  %mip = extractelement <2 x i16> %coords_hi, i32 1
+  %v = call <4 x float> @llvm.amdgcn.image.load.mip.3d.v4f32.i16(i32 15, i16 %s, i16 %t, i16 %r, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load_mip_cube:
+; GCN: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 da{{$}}
+define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %s = extractelement <2 x i16> %coords_lo, i32 0
+  %t = extractelement <2 x i16> %coords_lo, i32 1
+  %slice = extractelement <2 x i16> %coords_hi, i32 0
+  %mip = extractelement <2 x i16> %coords_hi, i32 1
+  %v = call <4 x float> @llvm.amdgcn.image.load.mip.cube.v4f32.i16(i32 15, i16 %s, i16 %t, i16 %slice, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load_mip_1darray:
+; GCN: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 da{{$}}
+define amdgpu_ps <4 x float> @load_mip_1darray(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %s = extractelement <2 x i16> %coords_lo, i32 0
+  %slice = extractelement <2 x i16> %coords_lo, i32 1
+  %mip = extractelement <2 x i16> %coords_hi, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.mip.1darray.v4f32.i16(i32 15, i16 %s, i16 %slice, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load_mip_2darray:
+; GCN: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 da{{$}}
+define amdgpu_ps <4 x float> @load_mip_2darray(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %s = extractelement <2 x i16> %coords_lo, i32 0
+  %t = extractelement <2 x i16> %coords_lo, i32 1
+  %slice = extractelement <2 x i16> %coords_hi, i32 0
+  %mip = extractelement <2 x i16> %coords_hi, i32 1
+  %v = call <4 x float> @llvm.amdgcn.image.load.mip.2darray.v4f32.i16(i32 15, i16 %s, i16 %t, i16 %slice, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}store_1d:
+; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm a16
+define amdgpu_ps void @store_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) {
+main_body:
+  %s = extractelement <2 x i16> %coords, i32 0
+  call void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_2d:
+; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm a16
+define amdgpu_ps void @store_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) {
+main_body:
+  %s = extractelement <2 x i16> %coords, i32 0
+  %t = extractelement <2 x i16> %coords, i32 1
+  call void @llvm.amdgcn.image.store.2d.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, i16 %t, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_3d:
+; GCN: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16
+define amdgpu_ps void @store_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %s = extractelement <2 x i16> %coords_lo, i32 0
+  %t = extractelement <2 x i16> %coords_lo, i32 1
+  %r = extractelement <2 x i16> %coords_hi, i32 0
+  call void @llvm.amdgcn.image.store.3d.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_cube:
+; GCN: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16 da{{$}}
+define amdgpu_ps void @store_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %s = extractelement <2 x i16> %coords_lo, i32 0
+  %t = extractelement <2 x i16> %coords_lo, i32 1
+  %slice = extractelement <2 x i16> %coords_hi, i32 0
+  call void @llvm.amdgcn.image.store.cube.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, i16 %t, i16 %slice, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_1darray:
+; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm a16 da{{$}}
+define amdgpu_ps void @store_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) {
+main_body:
+  %s = extractelement <2 x i16> %coords, i32 0
+  %slice = extractelement <2 x i16> %coords, i32 1
+  call void @llvm.amdgcn.image.store.1darray.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, i16 %slice, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_2darray:
+; GCN: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16 da{{$}}
+define amdgpu_ps void @store_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %s = extractelement <2 x i16> %coords_lo, i32 0
+  %t = extractelement <2 x i16> %coords_lo, i32 1
+  %slice = extractelement <2 x i16> %coords_hi, i32 0
+  call void @llvm.amdgcn.image.store.2darray.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, i16 %t, i16 %slice, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_2dmsaa:
+; GCN: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16
+define amdgpu_ps void @store_2dmsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %s = extractelement <2 x i16> %coords_lo, i32 0
+  %t = extractelement <2 x i16> %coords_lo, i32 1
+  %fragid = extractelement <2 x i16> %coords_hi, i32 0
+  call void @llvm.amdgcn.image.store.2dmsaa.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, i16 %t, i16 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_2darraymsaa:
+; GCN: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16 da{{$}}
+define amdgpu_ps void @store_2darraymsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %s = extractelement <2 x i16> %coords_lo, i32 0
+  %t = extractelement <2 x i16> %coords_lo, i32 1
+  %slice = extractelement <2 x i16> %coords_hi, i32 0
+  %fragid = extractelement <2 x i16> %coords_hi, i32 1
+  call void @llvm.amdgcn.image.store.2darraymsaa.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, i16 %t, i16 %slice, i16 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_mip_1d:
+; GCN: image_store_mip v[0:3], v4, s[0:7] dmask:0xf unorm a16
+define amdgpu_ps void @store_mip_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) {
+main_body:
+  %s = extractelement <2 x i16> %coords, i32 0
+  %mip = extractelement <2 x i16> %coords, i32 1
+  call void @llvm.amdgcn.image.store.mip.1d.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_mip_2d:
+; GCN: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16
+define amdgpu_ps void @store_mip_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %s = extractelement <2 x i16> %coords_lo, i32 0
+  %t = extractelement <2 x i16> %coords_lo, i32 1
+  %mip = extractelement <2 x i16> %coords_hi, i32 0
+  call void @llvm.amdgcn.image.store.mip.2d.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, i16 %t, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_mip_3d:
+; GCN: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16
+define amdgpu_ps void @store_mip_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %s = extractelement <2 x i16> %coords_lo, i32 0
+  %t = extractelement <2 x i16> %coords_lo, i32 1
+  %r = extractelement <2 x i16> %coords_hi, i32 0
+  %mip = extractelement <2 x i16> %coords_hi, i32 1
+  call void @llvm.amdgcn.image.store.mip.3d.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, i16 %t, i16 %r, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_mip_cube:
+; GCN: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16 da{{$}}
+define amdgpu_ps void @store_mip_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %s = extractelement <2 x i16> %coords_lo, i32 0
+  %t = extractelement <2 x i16> %coords_lo, i32 1
+  %slice = extractelement <2 x i16> %coords_hi, i32 0
+  %mip = extractelement <2 x i16> %coords_hi, i32 1
+  call void @llvm.amdgcn.image.store.mip.cube.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, i16 %t, i16 %slice, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_mip_1darray:
+; GCN: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16 da{{$}}
+define amdgpu_ps void @store_mip_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %s = extractelement <2 x i16> %coords_lo, i32 0
+  %slice = extractelement <2 x i16> %coords_lo, i32 1
+  %mip = extractelement <2 x i16> %coords_hi, i32 0
+  call void @llvm.amdgcn.image.store.mip.1darray.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, i16 %slice, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_mip_2darray:
+; GCN: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16 da{{$}}
+define amdgpu_ps void @store_mip_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %s = extractelement <2 x i16> %coords_lo, i32 0
+  %t = extractelement <2 x i16> %coords_lo, i32 1
+  %slice = extractelement <2 x i16> %coords_hi, i32 0
+  %mip = extractelement <2 x i16> %coords_hi, i32 1
+  call void @llvm.amdgcn.image.store.mip.2darray.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, i16 %t, i16 %slice, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}getresinfo_1d:
+; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16
+define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %mip = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}getresinfo_2d:
+; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16
+define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %mip = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2d.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}getresinfo_3d:
+; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16
+define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %mip = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.getresinfo.3d.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}getresinfo_cube:
+; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 da{{$}}
+define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %mip = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.getresinfo.cube.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}getresinfo_1darray:
+; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 da{{$}}
+define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %mip = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.getresinfo.1darray.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}getresinfo_2darray:
+; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 da{{$}}
+define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %mip = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2darray.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}getresinfo_2dmsaa:
+; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16
+define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %mip = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2dmsaa.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}getresinfo_2darraymsaa:
+; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 da{{$}}
+define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %mip = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2darraymsaa.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load_1d_V1:
+; GCN: image_load v0, v0, s[0:7] dmask:0x8 unorm a16
+define amdgpu_ps float @load_1d_V1(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %s = extractelement <2 x i16> %coords, i32 0
+  %v = call float @llvm.amdgcn.image.load.1d.f32.i16(i32 8, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  ret float %v
+}
+
+; GCN-LABEL: {{^}}load_1d_V2:
+; GCN: image_load v[0:1], v0, s[0:7] dmask:0x9 unorm a16
+define amdgpu_ps <2 x float> @load_1d_V2(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %s = extractelement <2 x i16> %coords, i32 0
+  %v = call <2 x float> @llvm.amdgcn.image.load.1d.v2f32.i16(i32 9, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <2 x float> %v
+}
+
+; GCN-LABEL: {{^}}store_1d_V1:
+; GCN: image_store v0, v1, s[0:7] dmask:0x2 unorm a16
+define amdgpu_ps void @store_1d_V1(<8 x i32> inreg %rsrc, float %vdata, <2 x i16> %coords) {
+main_body:
+  %s = extractelement <2 x i16> %coords, i32 0
+  call void @llvm.amdgcn.image.store.1d.f32.i16(float %vdata, i32 2, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_1d_V2:
+; GCN: image_store v[0:1], v2, s[0:7] dmask:0xc unorm a16
+define amdgpu_ps void @store_1d_V2(<8 x i32> inreg %rsrc, <2 x float> %vdata, <2 x i16> %coords) {
+main_body:
+  %s = extractelement <2 x i16> %coords, i32 0
+  call void @llvm.amdgcn.image.store.1d.v2f32.i16(<2 x float> %vdata, i32 12, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}load_1d_glc:
+; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm glc a16{{$}}
+define amdgpu_ps <4 x float> @load_1d_glc(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %s = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 15, i16 %s, <8 x i32> %rsrc, i32 0, i32 1)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load_1d_slc:
+; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm slc a16{{$}}
+define amdgpu_ps <4 x float> @load_1d_slc(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %s = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 15, i16 %s, <8 x i32> %rsrc, i32 0, i32 2)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load_1d_glc_slc:
+; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm glc slc a16{{$}}
+define amdgpu_ps <4 x float> @load_1d_glc_slc(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %s = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 15, i16 %s, <8 x i32> %rsrc, i32 0, i32 3)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}store_1d_glc:
+; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm glc a16{{$}}
+define amdgpu_ps void @store_1d_glc(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) {
+main_body:
+  %s = extractelement <2 x i16> %coords, i32 0
+  call void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, <8 x i32> %rsrc, i32 0, i32 1)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_1d_slc:
+; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm slc a16{{$}}
+define amdgpu_ps void @store_1d_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) {
+main_body:
+  %s = extractelement <2 x i16> %coords, i32 0
+  call void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, <8 x i32> %rsrc, i32 0, i32 2)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_1d_glc_slc:
+; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm glc slc a16{{$}}
+define amdgpu_ps void @store_1d_glc_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) {
+main_body:
+  %s = extractelement <2 x i16> %coords, i32 0
+  call void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, <8 x i32> %rsrc, i32 0, i32 3)
+  ret void
+}
+
+; GCN-LABEL: {{^}}getresinfo_dmask0:
+; GCN-NOT: image
+; GCN: ; return to shader part epilog
+define amdgpu_ps <4 x float> @getresinfo_dmask0(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) #0 {
+main_body:
+  %mip = extractelement <2 x i16> %coords, i32 0
+  %r = call <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i16(i32 0, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %r
+}
+
+declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32, i16, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32, i16, i16, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i16(i32, i16, i16, i16, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i16(i32, i16, i16, i16, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.1darray.v4f32.i16(i32, i16, i16, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.2darray.v4f32.i16(i32, i16, i16, i16, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i16(i32, i16, i16, i16, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i16(i32, i16, i16, i16, i16, <8 x i32>, i32, i32) #1
+
+declare <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i16(i32, i16, i16, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i16(i32, i16, i16, i16, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.mip.3d.v4f32.i16(i32, i16, i16, i16, i16, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.mip.cube.v4f32.i16(i32, i16, i16, i16, i16, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.mip.1darray.v4f32.i16(i32, i16, i16, i16, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.mip.2darray.v4f32.i16(i32, i16, i16, i16, i16, <8 x i32>, i32, i32) #1
+
+declare void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float>, i32, i16, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.2d.v4f32.i16(<4 x float>, i32, i16, i16, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.3d.v4f32.i16(<4 x float>, i32, i16, i16, i16, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.cube.v4f32.i16(<4 x float>, i32, i16, i16, i16, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.1darray.v4f32.i16(<4 x float>, i32, i16, i16, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.2darray.v4f32.i16(<4 x float>, i32, i16, i16, i16, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.2dmsaa.v4f32.i16(<4 x float>, i32, i16, i16, i16, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.2darraymsaa.v4f32.i16(<4 x float>, i32, i16, i16, i16, i16, <8 x i32>, i32, i32) #0
+
+declare void @llvm.amdgcn.image.store.mip.1d.v4f32.i16(<4 x float>, i32, i16, i16, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.mip.2d.v4f32.i16(<4 x float>, i32, i16, i16, i16, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.mip.3d.v4f32.i16(<4 x float>, i32, i16, i16, i16, i16, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.mip.cube.v4f32.i16(<4 x float>, i32, i16, i16, i16, i16, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.mip.1darray.v4f32.i16(<4 x float>, i32, i16, i16, i16, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.mip.2darray.v4f32.i16(<4 x float>, i32, i16, i16, i16, i16, <8 x i32>, i32, i32) #0
+
+declare <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i16(i32, i16, <8 x i32>, i32, i32) #2
+declare <4 x float> @llvm.amdgcn.image.getresinfo.2d.v4f32.i16(i32, i16, <8 x i32>, i32, i32) #2
+declare <4 x float> @llvm.amdgcn.image.getresinfo.3d.v4f32.i16(i32, i16, <8 x i32>, i32, i32) #2
+declare <4 x float> @llvm.amdgcn.image.getresinfo.cube.v4f32.i16(i32, i16, <8 x i32>, i32, i32) #2
+declare <4 x float> @llvm.amdgcn.image.getresinfo.1darray.v4f32.i16(i32, i16, <8 x i32>, i32, i32) #2
+declare <4 x float> @llvm.amdgcn.image.getresinfo.2darray.v4f32.i16(i32, i16, <8 x i32>, i32, i32) #2
+declare <4 x float> @llvm.amdgcn.image.getresinfo.2dmsaa.v4f32.i16(i32, i16, <8 x i32>, i32, i32) #2
+declare <4 x float> @llvm.amdgcn.image.getresinfo.2darraymsaa.v4f32.i16(i32, i16, <8 x i32>, i32, i32) #2
+
+declare float @llvm.amdgcn.image.load.1d.f32.i16(i32, i16, <8 x i32>, i32, i32) #1
+declare float @llvm.amdgcn.image.load.2d.f32.i16(i32, i16, i16, <8 x i32>, i32, i32) #1
+declare <2 x float> @llvm.amdgcn.image.load.1d.v2f32.i16(i32, i16, <8 x i32>, i32, i32) #1
+declare void @llvm.amdgcn.image.store.1d.f32.i16(float, i32, i16, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.1d.v2f32.i16(<2 x float>, i32, i16, <8 x i32>, i32, i32) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
+attributes #2 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.d16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.d16.ll
new file mode 100644
index 00000000000..1fbfccb0e39
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.d16.ll
@@ -0,0 +1,128 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+
+; GCN-LABEL: {{^}}load.f16.1d:
+; GCN: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm a16 d16
+define amdgpu_ps <4 x half> @load.f16.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x half> @llvm.amdgcn.image.load.1d.v4f16.i16(i32 1, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x half> %v
+}
+
+; GCN-LABEL: {{^}}load.v2f16.1d:
+; GCN: image_load v[0:1], v0, s[0:7] dmask:0x3 unorm a16 d16
+define amdgpu_ps <4 x half> @load.v2f16.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x half> @llvm.amdgcn.image.load.1d.v4f16.i16(i32 3, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x half> %v
+}
+
+; GCN-LABEL: {{^}}load.v3f16.1d:
+; GCN: image_load v[0:1], v0, s[0:7] dmask:0x7 unorm a16 d16
+define amdgpu_ps <4 x half> @load.v3f16.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x half> @llvm.amdgcn.image.load.1d.v4f16.i16(i32 7, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x half> %v
+}
+
+; GCN-LABEL: {{^}}load.v4f16.1d:
+; GCN: image_load v[0:1], v0, s[0:7] dmask:0xf unorm a16 d16
+define amdgpu_ps <4 x half> @load.v4f16.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x half> @llvm.amdgcn.image.load.1d.v4f16.i16(i32 15, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x half> %v
+}
+
+; GCN-LABEL: {{^}}load.f16.2d:
+; GCN: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm a16 d16
+define amdgpu_ps <4 x half> @load.f16.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %y = extractelement <2 x i16> %coords, i32 1
+  %v = call <4 x half> @llvm.amdgcn.image.load.2d.v4f16.i16(i32 1, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x half> %v
+}
+
+; GCN-LABEL: {{^}}load.v2f16.2d:
+; GCN: image_load v[0:1], v0, s[0:7] dmask:0x3 unorm a16 d16
+define amdgpu_ps <4 x half> @load.v2f16.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %y = extractelement <2 x i16> %coords, i32 1
+  %v = call <4 x half> @llvm.amdgcn.image.load.2d.v4f16.i16(i32 3, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x half> %v
+}
+
+; GCN-LABEL: {{^}}load.v3f16.2d:
+; GCN: image_load v[0:1], v0, s[0:7] dmask:0x7 unorm a16 d16
+define amdgpu_ps <4 x half> @load.v3f16.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %y = extractelement <2 x i16> %coords, i32 1
+  %v = call <4 x half> @llvm.amdgcn.image.load.2d.v4f16.i16(i32 7, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x half> %v
+}
+
+; GCN-LABEL: {{^}}load.v4f16.2d:
+; GCN: image_load v[0:1], v0, s[0:7] dmask:0xf unorm a16 d16
+define amdgpu_ps <4 x half> @load.v4f16.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %y = extractelement <2 x i16> %coords, i32 1
+  %v = call <4 x half> @llvm.amdgcn.image.load.2d.v4f16.i16(i32 15, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x half> %v
+}
+
+; GCN-LABEL: {{^}}load.f16.3d:
+; GCN: image_load v[0:1], v[0:1], s[0:7] dmask:0x1 unorm a16 d16
+define amdgpu_ps <4 x half> @load.f16.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %x = extractelement <2 x i16> %coords_lo, i32 0
+  %y = extractelement <2 x i16> %coords_lo, i32 1
+  %z = extractelement <2 x i16> %coords_hi, i32 0
+  %v = call <4 x half> @llvm.amdgcn.image.load.3d.v4f16.i16(i32 1, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x half> %v
+}
+
+; GCN-LABEL: {{^}}load.v2f16.3d:
+; GCN: image_load v[0:1], v[0:1], s[0:7] dmask:0x3 unorm a16 d16
+define amdgpu_ps <4 x half> @load.v2f16.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %x = extractelement <2 x i16> %coords_lo, i32 0
+  %y = extractelement <2 x i16> %coords_lo, i32 1
+  %z = extractelement <2 x i16> %coords_hi, i32 0
+  %v = call <4 x half> @llvm.amdgcn.image.load.3d.v4f16.i16(i32 3, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x half> %v
+}
+
+; GCN-LABEL: {{^}}load.v3f16.3d:
+; GCN: image_load v[0:1], v[0:1], s[0:7] dmask:0x7 unorm a16 d16
+define amdgpu_ps <4 x half> @load.v3f16.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %x = extractelement <2 x i16> %coords_lo, i32 0
+  %y = extractelement <2 x i16> %coords_lo, i32 1
+  %z = extractelement <2 x i16> %coords_hi, i32 0
+  %v = call <4 x half> @llvm.amdgcn.image.load.3d.v4f16.i16(i32 7, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x half> %v
+}
+
+; GCN-LABEL: {{^}}load.v4f16.3d:
+; GCN: image_load v[0:1], v[0:1], s[0:7] dmask:0xf unorm a16 d16
+define amdgpu_ps <4 x half> @load.v4f16.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %x = extractelement <2 x i16> %coords_lo, i32 0
+  %y = extractelement <2 x i16> %coords_lo, i32 1
+  %z = extractelement <2 x i16> %coords_hi, i32 0
+  %v = call <4 x half> @llvm.amdgcn.image.load.3d.v4f16.i16(i32 15, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x half> %v
+}
+
+declare <4 x half> @llvm.amdgcn.image.load.1d.v4f16.i16(i32, i16, <8 x i32>, i32, i32) #2
+declare <4 x half> @llvm.amdgcn.image.load.2d.v4f16.i16(i32, i16, i16, <8 x i32>, i32, i32) #2
+declare <4 x half> @llvm.amdgcn.image.load.3d.v4f16.i16(i32, i16, i16, i16, <8 x i32>, i32, i32) #2
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.ll
new file mode 100644
index 00000000000..d857ae115a7
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.ll
@@ -0,0 +1,128 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+
+; GCN-LABEL: {{^}}load.f32.1d:
+; GCN: image_load v[0:3], v0, s[0:7] dmask:0x1 unorm a16
+define amdgpu_ps <4 x float> @load.f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 1, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load.v2f32.1d:
+; GCN: image_load v[0:3], v0, s[0:7] dmask:0x3 unorm a16
+define amdgpu_ps <4 x float> @load.v2f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 3, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load.v3f32.1d:
+; GCN: image_load v[0:3], v0, s[0:7] dmask:0x7 unorm a16
+define amdgpu_ps <4 x float> @load.v3f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 7, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load.v4f32.1d:
+; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm a16
+define amdgpu_ps <4 x float> @load.v4f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 15, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load.f32.2d:
+; GCN: image_load v[0:3], v0, s[0:7] dmask:0x1 unorm a16
+define amdgpu_ps <4 x float> @load.f32.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %y = extractelement <2 x i16> %coords, i32 1
+  %v = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 1, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load.v2f32.2d:
+; GCN: image_load v[0:3], v0, s[0:7] dmask:0x3 unorm a16
+define amdgpu_ps <4 x float> @load.v2f32.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %y = extractelement <2 x i16> %coords, i32 1
+  %v = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 3, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load.v3f32.2d:
+; GCN: image_load v[0:3], v0, s[0:7] dmask:0x7 unorm a16
+define amdgpu_ps <4 x float> @load.v3f32.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %y = extractelement <2 x i16> %coords, i32 1
+  %v = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 7, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load.v4f32.2d:
+; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm a16
+define amdgpu_ps <4 x float> @load.v4f32.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %y = extractelement <2 x i16> %coords, i32 1
+  %v = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 15, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load.f32.3d:
+; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0x1 unorm a16
+define amdgpu_ps <4 x float> @load.f32.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %x = extractelement <2 x i16> %coords_lo, i32 0
+  %y = extractelement <2 x i16> %coords_lo, i32 1
+  %z = extractelement <2 x i16> %coords_hi, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i16(i32 1, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load.v2f32.3d:
+; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0x3 unorm a16
+define amdgpu_ps <4 x float> @load.v2f32.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %x = extractelement <2 x i16> %coords_lo, i32 0
+  %y = extractelement <2 x i16> %coords_lo, i32 1
+  %z = extractelement <2 x i16> %coords_hi, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i16(i32 3, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load.v3f32.3d:
+; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0x7 unorm a16
+define amdgpu_ps <4 x float> @load.v3f32.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %x = extractelement <2 x i16> %coords_lo, i32 0
+  %y = extractelement <2 x i16> %coords_lo, i32 1
+  %z = extractelement <2 x i16> %coords_hi, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i16(i32 7, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load.v4f32.3d:
+; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16
+define amdgpu_ps <4 x float> @load.v4f32.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %x = extractelement <2 x i16> %coords_lo, i32 0
+  %y = extractelement <2 x i16> %coords_lo, i32 1
+  %z = extractelement <2 x i16> %coords_hi, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i16(i32 15, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32, i16, <8 x i32>, i32, i32) #2
+declare <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32, i16, i16, <8 x i32>, i32, i32) #2
+declare <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i16(i32, i16, i16, i16, <8 x i32>, i32, i32) #2
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.d16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.d16.ll
new file mode 100644
index 00000000000..48d26f7db20
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.d16.ll
@@ -0,0 +1,140 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+
+; GCN-LABEL: {{^}}store.f16.1d:
+; GCN: image_store v[1:2], v0, s[0:7] dmask:0x1 unorm a16 d16
+define amdgpu_ps void @store.f16.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <2 x i32> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %bitcast = bitcast <2 x i32> %val to <4 x half>
+  call void @llvm.amdgcn.image.store.1d.v4f16.i16(<4 x half> %bitcast, i32 1, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.v2f16.1d:
+; GCN: image_store v[1:2], v0, s[0:7] dmask:0x3 unorm a16 d16
+define amdgpu_ps void @store.v2f16.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <2 x i32> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %bitcast = bitcast <2 x i32> %val to <4 x half>
+  call void @llvm.amdgcn.image.store.1d.v4f16.i16(<4 x half> %bitcast, i32 3, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.v3f16.1d:
+; GCN: image_store v[1:2], v0, s[0:7] dmask:0x7 unorm a16 d16
+define amdgpu_ps void @store.v3f16.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <2 x i32> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %bitcast = bitcast <2 x i32> %val to <4 x half>
+  call void @llvm.amdgcn.image.store.1d.v4f16.i16(<4 x half> %bitcast, i32 7, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.v4f16.1d:
+; GCN: image_store v[1:2], v0, s[0:7] dmask:0xf unorm a16 d16
+define amdgpu_ps void @store.v4f16.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <2 x i32> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %bitcast = bitcast <2 x i32> %val to <4 x half>
+  call void @llvm.amdgcn.image.store.1d.v4f16.i16(<4 x half> %bitcast, i32 15, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.f16.2d:
+; GCN: image_store v[1:2], v0, s[0:7] dmask:0x1 unorm a16 d16
+define amdgpu_ps void @store.f16.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <2 x i32> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %y = extractelement <2 x i16> %coords, i32 1
+  %bitcast = bitcast <2 x i32> %val to <4 x half>
+  call void @llvm.amdgcn.image.store.2d.v4f16.i16(<4 x half> %bitcast, i32 1, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.v2f16.2d:
+; GCN: image_store v[1:2], v0, s[0:7] dmask:0x3 unorm a16 d16
+define amdgpu_ps void @store.v2f16.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <2 x i32> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %y = extractelement <2 x i16> %coords, i32 1
+  %bitcast = bitcast <2 x i32> %val to <4 x half>
+  call void @llvm.amdgcn.image.store.2d.v4f16.i16(<4 x half> %bitcast, i32 3, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.v3f16.2d:
+; GCN: image_store v[1:2], v0, s[0:7] dmask:0x7 unorm a16 d16
+define amdgpu_ps void @store.v3f16.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <2 x i32> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %y = extractelement <2 x i16> %coords, i32 1
+  %bitcast = bitcast <2 x i32> %val to <4 x half>
+  call void @llvm.amdgcn.image.store.2d.v4f16.i16(<4 x half> %bitcast, i32 7, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.v4f16.2d:
+; GCN: image_store v[1:2], v0, s[0:7] dmask:0xf unorm a16 d16
+define amdgpu_ps void @store.v4f16.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <2 x i32> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %y = extractelement <2 x i16> %coords, i32 1
+  %bitcast = bitcast <2 x i32> %val to <4 x half>
+  call void @llvm.amdgcn.image.store.2d.v4f16.i16(<4 x half> %bitcast, i32 15, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.f16.3d:
+; GCN: image_store v[2:3], v[0:1], s[0:7] dmask:0x1 unorm a16 d16
+define amdgpu_ps void @store.f16.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi, <2 x i32> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords_lo, i32 0
+  %y = extractelement <2 x i16> %coords_lo, i32 1
+  %z = extractelement <2 x i16> %coords_hi, i32 0
+  %bitcast = bitcast <2 x i32> %val to <4 x half>
+  call void @llvm.amdgcn.image.store.3d.v4f16.i16(<4 x half> %bitcast, i32 1, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.v2f16.3d:
+; GCN: image_store v[2:3], v[0:1], s[0:7] dmask:0x3 unorm a16 d16
+define amdgpu_ps void @store.v2f16.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi, <2 x i32> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords_lo, i32 0
+  %y = extractelement <2 x i16> %coords_lo, i32 1
+  %z = extractelement <2 x i16> %coords_hi, i32 0
+  %bitcast = bitcast <2 x i32> %val to <4 x half>
+  call void @llvm.amdgcn.image.store.3d.v4f16.i16(<4 x half> %bitcast, i32 3, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.v3f16.3d:
+; GCN: image_store v[2:3], v[0:1], s[0:7] dmask:0x7 unorm a16 d16
+define amdgpu_ps void @store.v3f16.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi, <2 x i32> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords_lo, i32 0
+  %y = extractelement <2 x i16> %coords_lo, i32 1
+  %z = extractelement <2 x i16> %coords_hi, i32 0
+  %bitcast = bitcast <2 x i32> %val to <4 x half>
+  call void @llvm.amdgcn.image.store.3d.v4f16.i16(<4 x half> %bitcast, i32 7, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.v4f16.3d:
+; GCN: image_store v[2:3], v[0:1], s[0:7] dmask:0xf unorm a16 d16
+define amdgpu_ps void @store.v4f16.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi, <2 x i32> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords_lo, i32 0
+  %y = extractelement <2 x i16> %coords_lo, i32 1
+  %z = extractelement <2 x i16> %coords_hi, i32 0
+  %bitcast = bitcast <2 x i32> %val to <4 x half>
+  call void @llvm.amdgcn.image.store.3d.v4f16.i16(<4 x half> %bitcast, i32 15, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+declare void @llvm.amdgcn.image.store.1d.v4f16.i16(<4 x half>, i32, i16, <8 x i32>, i32, i32) #2
+declare void @llvm.amdgcn.image.store.2d.v4f16.i16(<4 x half>, i32, i16, i16, <8 x i32>, i32, i32) #2
+declare void @llvm.amdgcn.image.store.3d.v4f16.i16(<4 x half>, i32, i16, i16, i16, <8 x i32>, i32, i32) #2
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.ll
new file mode 100644
index 00000000000..f5ec31ba781
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.ll
@@ -0,0 +1,128 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+
+; GCN-LABEL: {{^}}store.f32.1d:
+; GCN: image_store v[1:4], v0, s[0:7] dmask:0x1 unorm a16
+define amdgpu_ps void @store.f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 x float> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  call void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float> %val, i32 1, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.v2f32.1d:
+; GCN: image_store v[1:4], v0, s[0:7] dmask:0x3 unorm a16
+define amdgpu_ps void @store.v2f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 x float> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  call void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float> %val, i32 3, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.v3f32.1d:
+; GCN: image_store v[1:4], v0, s[0:7] dmask:0x7 unorm a16
+define amdgpu_ps void @store.v3f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 x float> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  call void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float> %val, i32 7, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.v4f32.1d:
+; GCN: image_store v[1:4], v0, s[0:7] dmask:0xf unorm a16
+define amdgpu_ps void @store.v4f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 x float> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  call void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float> %val, i32 15, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.f32.2d:
+; GCN: image_store v[1:4], v0, s[0:7] dmask:0x1 unorm a16
+define amdgpu_ps void @store.f32.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 x float> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %y = extractelement <2 x i16> %coords, i32 1
+  call void @llvm.amdgcn.image.store.2d.v4f32.i16(<4 x float> %val, i32 1, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.v2f32.2d:
+; GCN: image_store v[1:4], v0, s[0:7] dmask:0x3 unorm a16
+define amdgpu_ps void @store.v2f32.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 x float> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %y = extractelement <2 x i16> %coords, i32 1
+  call void @llvm.amdgcn.image.store.2d.v4f32.i16(<4 x float> %val, i32 3, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.v3f32.2d:
+; GCN: image_store v[1:4], v0, s[0:7] dmask:0x7 unorm a16
+define amdgpu_ps void @store.v3f32.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 x float> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %y = extractelement <2 x i16> %coords, i32 1
+  call void @llvm.amdgcn.image.store.2d.v4f32.i16(<4 x float> %val, i32 7, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.v4f32.2d:
+; GCN: image_store v[1:4], v0, s[0:7] dmask:0xf unorm a16
+define amdgpu_ps void @store.v4f32.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 x float> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %y = extractelement <2 x i16> %coords, i32 1
+  call void @llvm.amdgcn.image.store.2d.v4f32.i16(<4 x float> %val, i32 15, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.f32.3d:
+; GCN: image_store v[2:5], v[0:1], s[0:7] dmask:0x1 unorm a16
+define amdgpu_ps void @store.f32.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi, <4 x float> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords_lo, i32 0
+  %y = extractelement <2 x i16> %coords_lo, i32 1
+  %z = extractelement <2 x i16> %coords_hi, i32 0
+  call void @llvm.amdgcn.image.store.3d.v4f32.i16(<4 x float> %val, i32 1, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.v2f32.3d:
+; GCN: image_store v[2:5], v[0:1], s[0:7] dmask:0x3 unorm a16
+define amdgpu_ps void @store.v2f32.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi, <4 x float> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords_lo, i32 0
+  %y = extractelement <2 x i16> %coords_lo, i32 1
+  %z = extractelement <2 x i16> %coords_hi, i32 0
+  call void @llvm.amdgcn.image.store.3d.v4f32.i16(<4 x float> %val, i32 3, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.v3f32.3d:
+; GCN: image_store v[2:5], v[0:1], s[0:7] dmask:0x7 unorm a16
+define amdgpu_ps void @store.v3f32.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi, <4 x float> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords_lo, i32 0
+  %y = extractelement <2 x i16> %coords_lo, i32 1
+  %z = extractelement <2 x i16> %coords_hi, i32 0
+  call void @llvm.amdgcn.image.store.3d.v4f32.i16(<4 x float> %val, i32 7, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.v4f32.3d:
+; GCN: image_store v[2:5], v[0:1], s[0:7] dmask:0xf unorm a16
+define amdgpu_ps void @store.v4f32.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi, <4 x float> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords_lo, i32 0
+  %y = extractelement <2 x i16> %coords_lo, i32 1
+  %z = extractelement <2 x i16> %coords_hi, i32 0
+  call void @llvm.amdgcn.image.store.3d.v4f32.i16(<4 x float> %val, i32 15, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+declare void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float>, i32, i16, <8 x i32>, i32, i32) #2
+declare void @llvm.amdgcn.image.store.2d.v4f32.i16(<4 x float>, i32, i16, i16, <8 x i32>, i32, i32) #2
+declare void @llvm.amdgcn.image.store.3d.v4f32.i16(<4 x float>, i32, i16, i16, i16, <8 x i32>, i32, i32) #2
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
diff --git a/test/MC/AMDGPU/mimg.s b/test/MC/AMDGPU/mimg.s
index 95bc4c306e7..83835270a1d 100644
--- a/test/MC/AMDGPU/mimg.s
+++ b/test/MC/AMDGPU/mimg.s
@@ -157,6 +157,84 @@ image_load v[5:7], v[1:4], s[8:15] dmask:0xf tfe d16
 // GFX8_1:   image_load v[5:7], v[1:4], s[8:15] dmask:0xf tfe d16 ; encoding: [0x00,0x0f,0x01,0xf0,0x01,0x05,0x02,0x80]
 // GFX9:     image_load v[5:7], v[1:4], s[8:15] dmask:0xf tfe d16 ; encoding: [0x00,0x0f,0x01,0xf0,0x01,0x05,0x02,0x80]
 
+//===----------------------------------------------------------------------===//
+// Image Load/Store: a16
+//===----------------------------------------------------------------------===//
+
+image_load v5, v[1:2], s[8:15] unorm a16
+// GFX9:     image_load v5, v[1:2], s[8:15] unorm a16 ; encoding: [0x00,0x90,0x00,0xf0,0x01,0x05,0x02,0x00]
+// NOSICI:   error: a16 modifier is not supported on this GPU
+// NOVI:     error: a16 modifier is not supported on this GPU
+
+image_load v[5:6], v[1:2], s[8:15] dmask:0x3 unorm a16
+// GFX9:     image_load v[5:6], v[1:2], s[8:15] dmask:0x3 unorm a16 ; encoding: [0x00,0x93,0x00,0xf0,0x01,0x05,0x02,0x00]
+// NOSICI:   error: a16 modifier is not supported on this GPU
+// NOVI:     error: a16 modifier is not supported on this GPU
+
+image_load v[5:7], v[1:2], s[8:15] dmask:0x7 unorm a16
+// GFX9:     image_load v[5:7], v[1:2], s[8:15] dmask:0x7 unorm a16 ; encoding: [0x00,0x97,0x00,0xf0,0x01,0x05,0x02,0x00]
+// NOSICI:   error: a16 modifier is not supported on this GPU
+// NOVI:     error: a16 modifier is not supported on this GPU
+
+image_load v[5:8], v[1:2], s[8:15] dmask:0xf unorm a16
+// GFX9:     image_load v[5:8], v[1:2], s[8:15] dmask:0xf unorm a16 ; encoding: [0x00,0x9f,0x00,0xf0,0x01,0x05,0x02,0x00]
+// NOSICI:   error: a16 modifier is not supported on this GPU
+// NOVI:     error: a16 modifier is not supported on this GPU
+
+image_store v5, v[1:2], s[8:15] unorm a16
+// GFX9:     image_store v5, v[1:2], s[8:15] unorm a16 ; encoding: [0x00,0x90,0x20,0xf0,0x01,0x05,0x02,0x00]
+// NOSICI:   error: a16 modifier is not supported on this GPU
+// NOVI:     error: a16 modifier is not supported on this GPU
+
+image_store v[5:6], v[1:2], s[8:15] dmask:0x3 unorm a16
+// GFX9:     image_store v[5:6], v[1:2], s[8:15] dmask:0x3 unorm a16 ; encoding: [0x00,0x93,0x20,0xf0,0x01,0x05,0x02,0x00]
+// NOSICI:   error: a16 modifier is not supported on this GPU
+// NOVI:     error: a16 modifier is not supported on this GPU
+
+image_store v[5:7], v[1:2], s[8:15] dmask:0x7 unorm a16
+// GFX9:     image_store v[5:7], v[1:2], s[8:15] dmask:0x7 unorm a16 ; encoding: [0x00,0x97,0x20,0xf0,0x01,0x05,0x02,0x00]
+// NOSICI:   error: a16 modifier is not supported on this GPU
+// NOVI:     error: a16 modifier is not supported on this GPU
+
+image_store v[5:8], v[1:2], s[8:15] dmask:0xf unorm a16
+// GFX9:     image_store v[5:8], v[1:2], s[8:15] dmask:0xf unorm a16 ; encoding: [0x00,0x9f,0x20,0xf0,0x01,0x05,0x02,0x00]
+// NOSICI:   error: a16 modifier is not supported on this GPU
+// NOVI:     error: a16 modifier is not supported on this GPU
+
+/===----------------------------------------------------------------------===//
+// Image Load/Store: a16 & d16
+//===----------------------------------------------------------------------===//
+
+image_load v5, v[1:2], s[8:15] dmask:0x3 unorm a16 d16
+// GFX9:     image_load v5, v[1:2], s[8:15] dmask:0x3 unorm a16 d16 ; encoding: [0x00,0x93,0x00,0xf0,0x01,0x05,0x02,0x80]
+// NOSICI:   error: a16 modifier is not supported on this GPU
+// NOVI:     error: a16 modifier is not supported on this GPU
+
+image_load v[5:6], v[1:2], s[8:15] dmask:0x7 unorm a16 d16
+// GFX9:     image_load v[5:6], v[1:2], s[8:15] dmask:0x7 unorm a16 d16 ; encoding: [0x00,0x97,0x00,0xf0,0x01,0x05,0x02,0x80]
+// NOSICI:   error: a16 modifier is not supported on this GPU
+// NOVI:     error: a16 modifier is not supported on this GPU
+
+image_load v[5:6], v[1:2], s[8:15] dmask:0xf unorm a16 d16
+// GFX9:     image_load v[5:6], v[1:2], s[8:15] dmask:0xf unorm a16 d16 ; encoding: [0x00,0x9f,0x00,0xf0,0x01,0x05,0x02,0x80]
+// NOSICI:   error: a16 modifier is not supported on this GPU
+// NOVI:     error: a16 modifier is not supported on this GPU
+
+image_store v5, v[1:2], s[8:15] dmask:0x3 unorm a16 d16
+// GFX9:     image_store v5, v[1:2], s[8:15] dmask:0x3 unorm a16 d16 ; encoding: [0x00,0x93,0x20,0xf0,0x01,0x05,0x02,0x80]
+// NOSICI:   error: a16 modifier is not supported on this GPU
+// NOVI:     error: a16 modifier is not supported on this GPU
+
+image_store v[5:6], v[1:2], s[8:15] dmask:0x7 unorm a16 d16
+// GFX9:     image_store v[5:6], v[1:2], s[8:15] dmask:0x7 unorm a16 d16 ; encoding: [0x00,0x97,0x20,0xf0,0x01,0x05,0x02,0x80]
+// NOSICI:   error: a16 modifier is not supported on this GPU
+// NOVI:     error: a16 modifier is not supported on this GPU
+
+image_store v[5:6], v[1:2], s[8:15] dmask:0xf unorm a16 d16
+// GFX9:     image_store v[5:6], v[1:2], s[8:15] dmask:0xf unorm a16 d16 ; encoding: [0x00,0x9f,0x20,0xf0,0x01,0x05,0x02,0x80]
+// NOSICI:   error: a16 modifier is not supported on this GPU
+// NOVI:     error: a16 modifier is not supported on this GPU
+
 //===----------------------------------------------------------------------===//
 // Image Load/Store: PCK variants
 //===----------------------------------------------------------------------===//
@@ -193,6 +271,11 @@ image_load_mip_pck v5, v[1:4], s[8:15] dmask:0x1 d16
 // NOVI:   error: invalid operand for instruction
 // NOGFX9: error: invalid operand for instruction
 
+image_load_mip_pck v5, v[1:2], s[8:15] dmask:0x1 a16
+// GFX9:   image_load_mip_pck v5, v[1:2], s[8:15] dmask:0x1 a16 ; encoding: [0x00,0x81,0x10,0xf0,0x01,0x05,0x02,0x00]
+// NOSICI: error: a16 modifier is not supported on this GPU
+// NOVI:   error: a16 modifier is not supported on this GPU
+
 image_store_mip_pck v252, v2, s[12:19] dmask:0x1 unorm
 // GCN: image_store_mip_pck v252, v2, s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x2c,0xf0,0x02,0xfc,0x03,0x00]
 
@@ -216,6 +299,11 @@ image_store_mip_pck v252, v[2:5], s[12:19] dmask:0x1 d16
 // NOVI:   error: invalid operand for instruction
 // NOGFX9: error: invalid operand for instruction
 
+image_store_mip_pck v252, v[2:3], s[12:19] dmask:0x1 a16
+// GFX9:   image_store_mip_pck v252, v[2:3], s[12:19] dmask:0x1 a16 ; encoding: [0x00,0x81,0x2c,0xf0,0x02,0xfc,0x03,0x00]
+// NOSICI: error: a16 modifier is not supported on this GPU
+// NOVI:   error: a16 modifier is not supported on this GPU
+
 //===----------------------------------------------------------------------===//
 // Image Sample
 //===----------------------------------------------------------------------===//
-- 
GitLab


From c8be3e89c6ebfa34e1258568e5c202cd2fdd06a7 Mon Sep 17 00:00:00 2001
From: Florian Hahn <florian.hahn@arm.com>
Date: Wed, 31 Oct 2018 11:00:48 +0000
Subject: [PATCH 0794/1116] [ADT] Remove illegal comparison of singular
 iterators from SmallSetTest

This removes the assertion that a copy of a moved-from SmallSetIterator
equals the original, which is illegal due to SmallSetIterator including
an instance of a standard `std::set` iterator.

C++ [iterator.requirements.general] states that comparing singular
iterators has undefined result:

> Iterators can also have singular values that are not associated with
> any sequence. [...] Results of most expressions are undefined for
> singular values; the only exceptions are destroying an iterator that
> holds a singular value, the assignment of a non-singular value to an
> iterator that holds a singular value, and, for iterators that satisfy
> the Cpp17DefaultConstructible requirements, using a value-initialized
> iterator as the source of a copy or move operation.

This assertion triggers the following error in the GNU C++ Library in
debug mode under EXPENSIVE_CHECKS:

  /usr/include/c++/8.2.1/debug/safe_iterator.h:518:
  Error: attempt to compare a singular iterator to a singular iterator.

  Objects involved in the operation:
      iterator "lhs" @ 0x0x7fff86420670 {
        state = singular;
      }
      iterator "rhs" @ 0x0x7fff86420640 {
        state = singular;
      }

Patch by Eugene Sharygin.

Reviewers: fhahn, dblaikie, chandlerc

Reviewed By: fhahn, dblaikie

Differential Revision: https://reviews.llvm.org/D53793


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345712 91177308-0d34-0410-b5e6-96231b3b80d8
---
 unittests/ADT/SmallSetTest.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/unittests/ADT/SmallSetTest.cpp b/unittests/ADT/SmallSetTest.cpp
index d78a72b38f8..3391a5c83f5 100644
--- a/unittests/ADT/SmallSetTest.cpp
+++ b/unittests/ADT/SmallSetTest.cpp
@@ -142,8 +142,4 @@ TEST(SmallSetTest, IteratorIncMoveCopy) {
   auto Iter2 = s1.begin();
   Iter = std::move(Iter2);
   EXPECT_EQ("str 0", *Iter);
-
-  auto Iter3 = s1.end();
-  Iter3 = Iter2;
-  EXPECT_EQ(Iter3, Iter2);
 }
-- 
GitLab


From ce084155a27c9e8a2c11f5d46d5020ef545e1472 Mon Sep 17 00:00:00 2001
From: Max Kazantsev <max.kazantsev@azul.com>
Date: Wed, 31 Oct 2018 11:28:23 +0000
Subject: [PATCH 0795/1116] [NFC] Add tests for loop-simplifycfg for further
 development

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345713 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../LoopSimplifyCFG/constant-fold-branch.ll   | 1408 +++++++++++++++++
 1 file changed, 1408 insertions(+)
 create mode 100644 test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll

diff --git a/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll b/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
new file mode 100644
index 00000000000..69d79d5a6f1
--- /dev/null
+++ b/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
@@ -0,0 +1,1408 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -loop-simplifycfg < %s | FileCheck %s
+; RUN: opt -S -passes='require<domtree>,loop(simplify-cfg)' < %s | FileCheck %s
+; RUN: opt -S -loop-simplifycfg -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+
+; Make sure that we can eliminate a provably dead backedge.
+define i32 @dead_backedge_test_branch_loop(i32 %end) {
+; CHECK-LABEL: @dead_backedge_test_branch_loop(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_BE:%.*]], [[HEADER_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    [[I_1:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[I_1]], 100
+; CHECK-NEXT:    br i1 [[CMP1]], label [[HEADER_BACKEDGE]], label [[DEAD_BACKEDGE:%.*]]
+; CHECK:       header.backedge:
+; CHECK-NEXT:    [[I_BE]] = phi i32 [ [[I_1]], [[HEADER]] ], [ [[I_2:%.*]], [[DEAD_BACKEDGE]] ]
+; CHECK-NEXT:    br label [[HEADER]]
+; CHECK:       dead_backedge:
+; CHECK-NEXT:    [[I_2]] = add i32 [[I_1]], 10
+; CHECK-NEXT:    br i1 false, label [[HEADER_BACKEDGE]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_2_LCSSA:%.*]] = phi i32 [ [[I_2]], [[DEAD_BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_2_LCSSA]]
+;
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.1, %header], [%i.2, %dead_backedge]
+  %i.1 = add i32 %i, 1
+  %cmp1 = icmp slt i32 %i.1, 100
+  br i1 %cmp1, label %header, label %dead_backedge
+
+dead_backedge:
+  %i.2 = add i32 %i.1, 10
+  br i1 false, label %header, label %exit
+
+exit:
+  ret i32 %i.2
+}
+
+; Check that we can eliminate a triangle.
+define i32 @dead_block_test_branch_loop(i32 %end) {
+; CHECK-LABEL: @dead_block_test_branch_loop(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br i1 true, label [[BACKEDGE]], label [[DEAD:%.*]]
+; CHECK:       dead:
+; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I]], [[HEADER]] ], [ [[I_2]], [[DEAD]] ]
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_1]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA]]
+;
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  br i1 true, label %backedge, label %dead
+
+dead:
+  %i.2 = add i32 %i, 1
+  br label %backedge
+
+backedge:
+  %i.1 = phi i32 [%i, %header], [%i.2, %dead]
+  %i.inc = add i32 %i.1, 1
+  %cmp = icmp slt i32 %i.inc, %end
+  br i1 %cmp, label %header, label %exit
+
+exit:
+  ret i32 %i.inc
+}
+
+; Check that we can eliminate dead branches of a switch.
+define i32 @dead_block_test_switch_loop(i32 %end) {
+; CHECK-LABEL: @dead_block_test_switch_loop(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    switch i32 1, label [[DEAD:%.*]] [
+; CHECK-NEXT:    i32 0, label [[DEAD]]
+; CHECK-NEXT:    i32 1, label [[BACKEDGE]]
+; CHECK-NEXT:    i32 2, label [[DEAD]]
+; CHECK-NEXT:    ]
+; CHECK:       dead:
+; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I]], [[HEADER]] ], [ [[I_2]], [[DEAD]] ]
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_1]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA]]
+;
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  switch i32 1, label %dead [i32 0, label %dead
+  i32 1, label %backedge
+  i32 2, label %dead]
+
+dead:
+  %i.2 = add i32 %i, 1
+  br label %backedge
+
+backedge:
+  %i.1 = phi i32 [%i, %header], [%i.2, %dead]
+  %i.inc = add i32 %i.1, 1
+  %cmp = icmp slt i32 %i.inc, %end
+  br i1 %cmp, label %header, label %exit
+exit:
+  ret i32 %i.inc
+}
+
+; Check that we can eliminate several dead blocks.
+define i32 @dead_block_propogate_test_branch_loop(i32 %end) {
+; CHECK-LABEL: @dead_block_propogate_test_branch_loop(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br i1 true, label [[BACKEDGE]], label [[DEAD:%.*]]
+; CHECK:       dead:
+; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I]], [[HEADER]] ], [ [[I_2]], [[DEAD]] ]
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_1]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA]]
+;
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  br i1 true, label %backedge, label %dead
+
+dead:
+  %i.2 = add i32 %i, 1
+  br label %dummy
+
+dummy:
+  br label %backedge
+
+backedge:
+  %i.1 = phi i32 [%i, %header], [%i.2, %dummy]
+  %i.inc = add i32 %i.1, 1
+  %cmp = icmp slt i32 %i.inc, %end
+  br i1 %cmp, label %header, label %exit
+
+exit:
+  ret i32 %i.inc
+}
+
+; Check that we can eliminate several blocks while removing a switch.
+define i32 @dead_block_propogate_test_switch_loop(i32 %end) {
+; CHECK-LABEL: @dead_block_propogate_test_switch_loop(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    switch i32 1, label [[DEAD:%.*]] [
+; CHECK-NEXT:    i32 0, label [[DEAD]]
+; CHECK-NEXT:    i32 1, label [[BACKEDGE]]
+; CHECK-NEXT:    i32 2, label [[DEAD]]
+; CHECK-NEXT:    ]
+; CHECK:       dead:
+; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I]], [[HEADER]] ], [ [[I_2]], [[DEAD]] ]
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_1]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA]]
+;
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  switch i32 1, label %dead [i32 0, label %dead
+  i32 1, label %backedge
+  i32 2, label %dead]
+
+dead:
+  %i.2 = add i32 %i, 1
+  br label %dummy
+
+dummy:
+  br label %backedge
+
+backedge:
+  %i.1 = phi i32 [%i, %header], [%i.2, %dummy]
+  %i.inc = add i32 %i.1, 1
+  %cmp = icmp slt i32 %i.inc, %end
+  br i1 %cmp, label %header, label %exit
+
+exit:
+  ret i32 %i.inc
+}
+
+; Check that we preserve static reachibility of a dead exit block while deleting
+; a branch.
+define i32 @dead_exit_test_branch_loop(i32 %end) {
+; CHECK-LABEL: @dead_exit_test_branch_loop(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br i1 true, label [[BACKEDGE]], label [[DEAD:%.*]]
+; CHECK:       dead:
+; CHECK-NEXT:    [[I_LCSSA:%.*]] = phi i32 [ [[I]], [[HEADER]] ]
+; CHECK-NEXT:    br label [[DUMMY:%.*]]
+; CHECK:       dummy:
+; CHECK-NEXT:    br label [[EXIT:%.*]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[HEADER]], label [[EXIT_LOOPEXIT:%.*]]
+; CHECK:       exit.loopexit:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I_LCSSA]], [[DUMMY]] ], [ [[I_INC_LCSSA]], [[EXIT_LOOPEXIT]] ]
+; CHECK-NEXT:    ret i32 [[I_1]]
+;
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  br i1 true, label %backedge, label %dead
+
+dead:
+  br label %dummy
+
+dummy:
+  br label %exit
+
+backedge:
+  %i.inc = add i32 %i, 1
+  %cmp = icmp slt i32 %i.inc, %end
+  br i1 %cmp, label %header, label %exit
+
+exit:
+  %i.1 = phi i32 [%i.inc, %backedge], [%i, %dummy]
+  ret i32 %i.1
+}
+
+; Check that we preserve static reachibility of a dead exit block while deleting
+; a switch.
+define i32 @dead_exit_test_switch_loop(i32 %end) {
+; CHECK-LABEL: @dead_exit_test_switch_loop(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    switch i32 1, label [[DEAD:%.*]] [
+; CHECK-NEXT:    i32 0, label [[DEAD]]
+; CHECK-NEXT:    i32 1, label [[BACKEDGE]]
+; CHECK-NEXT:    i32 2, label [[DEAD]]
+; CHECK-NEXT:    ]
+; CHECK:       dead:
+; CHECK-NEXT:    [[I_LCSSA:%.*]] = phi i32 [ [[I]], [[HEADER]] ], [ [[I]], [[HEADER]] ], [ [[I]], [[HEADER]] ]
+; CHECK-NEXT:    br label [[DUMMY:%.*]]
+; CHECK:       dummy:
+; CHECK-NEXT:    br label [[EXIT:%.*]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[HEADER]], label [[EXIT_LOOPEXIT:%.*]]
+; CHECK:       exit.loopexit:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I_LCSSA]], [[DUMMY]] ], [ [[I_INC_LCSSA]], [[EXIT_LOOPEXIT]] ]
+; CHECK-NEXT:    ret i32 [[I_1]]
+;
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  switch i32 1, label %dead [i32 0, label %dead
+  i32 1, label %backedge
+  i32 2, label %dead]
+
+dead:
+  br label %dummy
+
+dummy:
+  br label %exit
+
+backedge:
+  %i.inc = add i32 %i, 1
+  %cmp = icmp slt i32 %i.inc, %end
+  br i1 %cmp, label %header, label %exit
+
+exit:
+  %i.1 = phi i32 [%i.inc, %backedge], [%i, %dummy]
+  ret i32 %i.1
+}
+
+; Check that we can completely eliminate the current loop, branch case.
+define i32 @dead_loop_test_branch_loop(i32 %end) {
+; CHECK-LABEL: @dead_loop_test_branch_loop(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br i1 true, label [[BACKEDGE]], label [[DEAD:%.*]]
+; CHECK:       dead:
+; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I]], [[HEADER]] ], [ [[I_2]], [[DEAD]] ]
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_1]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 false, label [[HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA]]
+;
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  br i1 true, label %backedge, label %dead
+
+dead:
+  %i.2 = add i32 %i, 1
+  br label %dummy
+
+dummy:
+  br label %backedge
+
+backedge:
+  %i.1 = phi i32 [%i, %header], [%i.2, %dummy]
+  %i.inc = add i32 %i.1, 1
+  %cmp = icmp slt i32 %i.inc, %end
+  br i1 false, label %header, label %exit
+
+exit:
+  ret i32 %i.inc
+}
+
+; Check that we can completely eliminate the current loop, switch case.
+define i32 @dead_loop_test_switch_loop(i32 %end) {
+; CHECK-LABEL: @dead_loop_test_switch_loop(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    switch i32 1, label [[DEAD:%.*]] [
+; CHECK-NEXT:    i32 0, label [[DEAD]]
+; CHECK-NEXT:    i32 1, label [[BACKEDGE]]
+; CHECK-NEXT:    i32 2, label [[DEAD]]
+; CHECK-NEXT:    ]
+; CHECK:       dead:
+; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I]], [[HEADER]] ], [ [[I_2]], [[DEAD]] ]
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_1]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 false, label [[HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA]]
+;
+preheader:
+  br label %header
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  switch i32 1, label %dead [i32 0, label %dead
+  i32 1, label %backedge
+  i32 2, label %dead]
+dead:
+  %i.2 = add i32 %i, 1
+  br label %dummy
+
+dummy:
+  br label %backedge
+
+backedge:
+  %i.1 = phi i32 [%i, %header], [%i.2, %dummy]
+  %i.inc = add i32 %i.1, 1
+  %cmp = icmp slt i32 %i.inc, %end
+  br i1 false, label %header, label %exit
+
+exit:
+  ret i32 %i.inc
+}
+
+; Check that we can delete a dead inner loop entirely.
+define i32 @dead_sub_loop_test_branch_loop(i32 %end) {
+; CHECK-LABEL: @dead_sub_loop_test_branch_loop(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br i1 true, label [[LIVE_PREHEADER:%.*]], label [[DEAD_PREHEADER:%.*]]
+; CHECK:       live_preheader:
+; CHECK-NEXT:    br label [[LIVE_LOOP:%.*]]
+; CHECK:       live_loop:
+; CHECK-NEXT:    [[A:%.*]] = phi i32 [ 0, [[LIVE_PREHEADER]] ], [ [[A_INC:%.*]], [[LIVE_LOOP]] ]
+; CHECK-NEXT:    [[A_INC]] = add i32 [[A]], 1
+; CHECK-NEXT:    [[CMP_A:%.*]] = icmp slt i32 [[A_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 [[CMP_A]], label [[LIVE_LOOP]], label [[EXIT_A:%.*]]
+; CHECK:       exit.a:
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       dead_preheader:
+; CHECK-NEXT:    br label [[DEAD_LOOP:%.*]]
+; CHECK:       dead_loop:
+; CHECK-NEXT:    [[B:%.*]] = phi i32 [ 0, [[DEAD_PREHEADER]] ], [ [[B_INC:%.*]], [[DEAD_LOOP]] ]
+; CHECK-NEXT:    [[B_INC]] = add i32 [[B]], 1
+; CHECK-NEXT:    [[CMP_B:%.*]] = icmp slt i32 [[B_INC]], [[END]]
+; CHECK-NEXT:    br i1 [[CMP_B]], label [[DEAD_LOOP]], label [[EXIT_B:%.*]]
+; CHECK:       exit.b:
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_INC]], [[END]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA]]
+;
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  br i1 true, label %live_preheader, label %dead_preheader
+
+live_preheader:
+  br label %live_loop
+
+live_loop:
+  %a = phi i32 [0, %live_preheader], [%a.inc, %live_loop]
+  %a.inc = add i32 %a, 1
+  %cmp.a = icmp slt i32 %a.inc, %end
+  br i1 %cmp.a, label %live_loop, label %exit.a
+
+exit.a:
+  br label %backedge
+
+dead_preheader:
+  br label %dead_loop
+
+dead_loop:
+  %b = phi i32 [0, %dead_preheader], [%b.inc, %dead_loop]
+  %b.inc = add i32 %b, 1
+  %cmp.b = icmp slt i32 %b.inc, %end
+  br i1 %cmp.b, label %dead_loop, label %exit.b
+
+exit.b:
+  br label %backedge
+
+backedge:
+  %i.inc = add i32 %i, 1
+  %cmp = icmp slt i32 %i.inc, %end
+  br i1 %cmp, label %header, label %exit
+
+exit:
+  ret i32 %i.inc
+}
+
+define i32 @dead_sub_loop_test_switch_loop(i32 %end) {
+; CHECK-LABEL: @dead_sub_loop_test_switch_loop(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    switch i32 1, label [[DEAD_PREHEADER:%.*]] [
+; CHECK-NEXT:    i32 0, label [[DEAD_PREHEADER]]
+; CHECK-NEXT:    i32 1, label [[LIVE_PREHEADER:%.*]]
+; CHECK-NEXT:    i32 2, label [[DEAD_PREHEADER]]
+; CHECK-NEXT:    ]
+; CHECK:       live_preheader:
+; CHECK-NEXT:    br label [[LIVE_LOOP:%.*]]
+; CHECK:       live_loop:
+; CHECK-NEXT:    [[A:%.*]] = phi i32 [ 0, [[LIVE_PREHEADER]] ], [ [[A_INC:%.*]], [[LIVE_LOOP]] ]
+; CHECK-NEXT:    [[A_INC]] = add i32 [[A]], 1
+; CHECK-NEXT:    [[CMP_A:%.*]] = icmp slt i32 [[A_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 [[CMP_A]], label [[LIVE_LOOP]], label [[EXIT_A:%.*]]
+; CHECK:       exit.a:
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       dead_preheader:
+; CHECK-NEXT:    br label [[DEAD_LOOP:%.*]]
+; CHECK:       dead_loop:
+; CHECK-NEXT:    [[B:%.*]] = phi i32 [ 0, [[DEAD_PREHEADER]] ], [ [[B_INC:%.*]], [[DEAD_LOOP]] ]
+; CHECK-NEXT:    [[B_INC]] = add i32 [[B]], 1
+; CHECK-NEXT:    [[CMP_B:%.*]] = icmp slt i32 [[B_INC]], [[END]]
+; CHECK-NEXT:    br i1 [[CMP_B]], label [[DEAD_LOOP]], label [[EXIT_B:%.*]]
+; CHECK:       exit.b:
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_INC]], [[END]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA]]
+;
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  switch i32 1, label %dead_preheader [i32 0, label %dead_preheader
+  i32 1, label %live_preheader
+  i32 2, label %dead_preheader]
+
+live_preheader:
+  br label %live_loop
+
+live_loop:
+  %a = phi i32 [0, %live_preheader], [%a.inc, %live_loop]
+  %a.inc = add i32 %a, 1
+  %cmp.a = icmp slt i32 %a.inc, %end
+  br i1 %cmp.a, label %live_loop, label %exit.a
+
+exit.a:
+  br label %backedge
+
+dead_preheader:
+  br label %dead_loop
+
+dead_loop:
+  %b = phi i32 [0, %dead_preheader], [%b.inc, %dead_loop]
+  %b.inc = add i32 %b, 1
+  %cmp.b = icmp slt i32 %b.inc, %end
+  br i1 %cmp.b, label %dead_loop, label %exit.b
+
+exit.b:
+  br label %backedge
+
+backedge:
+  %i.inc = add i32 %i, 1
+  %cmp = icmp slt i32 %i.inc, %end
+  br i1 %cmp, label %header, label %exit
+
+exit:
+  ret i32 %i.inc
+}
+
+; Check that we preserve static reachability of an exit block even if we prove
+; that the loop is infinite. Branch case.
+define i32 @inf_loop_test_branch_loop(i32 %end) {
+; CHECK-LABEL: @inf_loop_test_branch_loop(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br i1 true, label [[BACKEDGE]], label [[DEAD:%.*]]
+; CHECK:       dead:
+; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I]], [[HEADER]] ], [ [[I_2]], [[DEAD]] ]
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_1]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 true, label [[HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA]]
+;
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  br i1 true, label %backedge, label %dead
+
+dead:
+  %i.2 = add i32 %i, 1
+  br label %dummy
+
+dummy:
+  br label %backedge
+
+backedge:
+  %i.1 = phi i32 [%i, %header], [%i.2, %dummy]
+  %i.inc = add i32 %i.1, 1
+  %cmp = icmp slt i32 %i.inc, %end
+  br i1 true, label %header, label %exit
+
+exit:
+  ret i32 %i.inc
+}
+
+define i32 @inf_loop_test_switch_loop(i32 %end) {
+; CHECK-LABEL: @inf_loop_test_switch_loop(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    switch i32 1, label [[DEAD:%.*]] [
+; CHECK-NEXT:    i32 0, label [[DEAD]]
+; CHECK-NEXT:    i32 1, label [[BACKEDGE]]
+; CHECK-NEXT:    i32 2, label [[DEAD]]
+; CHECK-NEXT:    ]
+; CHECK:       dead:
+; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I]], [[HEADER]] ], [ [[I_2]], [[DEAD]] ]
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_1]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 true, label [[HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA]]
+;
+preheader:
+  br label %header
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  switch i32 1, label %dead [i32 0, label %dead
+  i32 1, label %backedge
+  i32 2, label %dead]
+dead:
+  %i.2 = add i32 %i, 1
+  br label %dummy
+dummy:
+  br label %backedge
+backedge:
+  %i.1 = phi i32 [%i, %header], [%i.2, %dummy]
+  %i.inc = add i32 %i.1, 1
+  %cmp = icmp slt i32 %i.inc, %end
+  br i1 true, label %header, label %exit
+exit:
+  ret i32 %i.inc
+}
+
+; Check that when the block is not actually dead, we don't remove it.
+define i32 @no_live_block_test_branch_loop(i1 %c, i32 %end) {
+; CHECK-LABEL: @no_live_block_test_branch_loop(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[CHECK:%.*]], label [[LIVE:%.*]]
+; CHECK:       check:
+; CHECK-NEXT:    br i1 true, label [[BACKEDGE]], label [[LIVE]]
+; CHECK:       live:
+; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I]], [[CHECK]] ], [ [[I_2]], [[LIVE]] ]
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_1]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA]]
+;
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  br i1 %c, label %check, label %live
+
+check:
+  br i1 true, label %backedge, label %live
+
+live:
+  %i.2 = add i32 %i, 1
+  br label %backedge
+
+backedge:
+  %i.1 = phi i32 [%i, %check], [%i.2, %live]
+  %i.inc = add i32 %i.1, 1
+  %cmp = icmp slt i32 %i.inc, %end
+  br i1 %cmp, label %header, label %exit
+
+exit:
+  ret i32 %i.inc
+}
+
+define i32 @no_live_block_test_switch_loop(i1 %c, i32 %end) {
+; CHECK-LABEL: @no_live_block_test_switch_loop(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[CHECK:%.*]], label [[LIVE:%.*]]
+; CHECK:       check:
+; CHECK-NEXT:    switch i32 1, label [[LIVE]] [
+; CHECK-NEXT:    i32 0, label [[LIVE]]
+; CHECK-NEXT:    i32 1, label [[BACKEDGE]]
+; CHECK-NEXT:    i32 2, label [[LIVE]]
+; CHECK-NEXT:    ]
+; CHECK:       live:
+; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I]], [[CHECK]] ], [ [[I_2]], [[LIVE]] ]
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_1]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA]]
+;
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  br i1 %c, label %check, label %live
+
+check:
+  switch i32 1, label %live [i32 0, label %live
+  i32 1, label %backedge
+  i32 2, label %live]
+
+live:
+  %i.2 = add i32 %i, 1
+  br label %backedge
+
+backedge:
+  %i.1 = phi i32 [%i, %check], [%i.2, %live]
+  %i.inc = add i32 %i.1, 1
+  %cmp = icmp slt i32 %i.inc, %end
+  br i1 %cmp, label %header, label %exit
+
+exit:
+  ret i32 %i.inc
+}
+
+; Check that we can remove part of blocks of inner loop while the loop still
+; preserves, in presence of outer loop.
+define i32 @partial_sub_loop_test_branch_loop(i32 %end) {
+; CHECK-LABEL: @partial_sub_loop_test_branch_loop(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
+; CHECK:       outer_header:
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[J_INC:%.*]], [[OUTER_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br i1 true, label [[BACKEDGE]], label [[DEAD:%.*]]
+; CHECK:       dead:
+; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I]], [[HEADER]] ], [ [[I_2]], [[DEAD]] ]
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_1]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[HEADER]], label [[OUTER_BACKEDGE]]
+; CHECK:       outer_backedge:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    [[J_INC]] = add i32 [[J]], 1
+; CHECK-NEXT:    [[CMP_J:%.*]] = icmp slt i32 [[J_INC]], [[END]]
+; CHECK-NEXT:    br i1 [[CMP_J]], label [[OUTER_HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA_LCSSA:%.*]] = phi i32 [ [[I_INC_LCSSA]], [[OUTER_BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA_LCSSA]]
+;
+entry:
+  br label %outer_header
+
+outer_header:
+  %j = phi i32 [0, %entry], [%j.inc, %outer_backedge]
+  br label %preheader
+
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  br i1 true, label %backedge, label %dead
+
+dead:
+  %i.2 = add i32 %i, 1
+  br label %backedge
+
+backedge:
+  %i.1 = phi i32 [%i, %header], [%i.2, %dead]
+  %i.inc = add i32 %i.1, 1
+  %cmp = icmp slt i32 %i.inc, %end
+  br i1 %cmp, label %header, label %outer_backedge
+
+outer_backedge:
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, %end
+  br i1 %cmp.j, label %outer_header, label %exit
+
+exit:
+  ret i32 %i.inc
+}
+
+define i32 @partial_sub_loop_test_switch_loop(i32 %end) {
+; CHECK-LABEL: @partial_sub_loop_test_switch_loop(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
+; CHECK:       outer_header:
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[J_INC:%.*]], [[OUTER_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    switch i32 1, label [[DEAD:%.*]] [
+; CHECK-NEXT:    i32 0, label [[DEAD]]
+; CHECK-NEXT:    i32 1, label [[BACKEDGE]]
+; CHECK-NEXT:    i32 2, label [[DEAD]]
+; CHECK-NEXT:    ]
+; CHECK:       dead:
+; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I]], [[HEADER]] ], [ [[I_2]], [[DEAD]] ]
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_1]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[HEADER]], label [[OUTER_BACKEDGE]]
+; CHECK:       outer_backedge:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    [[J_INC]] = add i32 [[J]], 1
+; CHECK-NEXT:    [[CMP_J:%.*]] = icmp slt i32 [[J_INC]], [[END]]
+; CHECK-NEXT:    br i1 [[CMP_J]], label [[OUTER_HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA_LCSSA:%.*]] = phi i32 [ [[I_INC_LCSSA]], [[OUTER_BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA_LCSSA]]
+;
+entry:
+  br label %outer_header
+
+outer_header:
+  %j = phi i32 [0, %entry], [%j.inc, %outer_backedge]
+  br label %preheader
+
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  switch i32 1, label %dead [i32 0, label %dead
+  i32 1, label %backedge
+  i32 2, label %dead]
+
+dead:
+  %i.2 = add i32 %i, 1
+  br label %backedge
+
+backedge:
+  %i.1 = phi i32 [%i, %header], [%i.2, %dead]
+  %i.inc = add i32 %i.1, 1
+  %cmp = icmp slt i32 %i.inc, %end
+  br i1 %cmp, label %header, label %outer_backedge
+
+outer_backedge:
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, %end
+  br i1 %cmp.j, label %outer_header, label %exit
+
+exit:
+  ret i32 %i.inc
+}
+
+; Check that we can completely delete inner loop and preserve the outer loop.
+define i32 @full_sub_loop_test_branch_loop(i32 %end) {
+; CHECK-LABEL: @full_sub_loop_test_branch_loop(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
+; CHECK:       outer_header:
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[J_INC:%.*]], [[OUTER_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[I]], [[I]]
+; CHECK-NEXT:    br i1 false, label [[BACKEDGE]], label [[DEAD:%.*]]
+; CHECK:       dead:
+; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I]], [[HEADER]] ], [ [[I_2]], [[DEAD]] ]
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_1]], 1
+; CHECK-NEXT:    br i1 false, label [[HEADER]], label [[OUTER_BACKEDGE]]
+; CHECK:       outer_backedge:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    [[J_INC]] = add i32 [[J]], 1
+; CHECK-NEXT:    [[CMP_J:%.*]] = icmp slt i32 [[J_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 [[CMP_J]], label [[OUTER_HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA_LCSSA:%.*]] = phi i32 [ [[I_INC_LCSSA]], [[OUTER_BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA_LCSSA]]
+;
+entry:
+  br label %outer_header
+
+outer_header:
+  %j = phi i32 [0, %entry], [%j.inc, %outer_backedge]
+  br label %preheader
+
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  br label  %live_part
+
+live_part:
+  %mul = mul i32 %i, %i
+  br i1 false, label %backedge, label %dead
+
+dead:
+  %i.2 = add i32 %i, 1
+  br label %backedge
+
+backedge:
+  %i.1 = phi i32 [%i, %live_part], [%i.2, %dead]
+  %i.inc = add i32 %i.1, 1
+  br i1 false, label %header, label %outer_backedge
+
+outer_backedge:
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, %end
+  br i1 %cmp.j, label %outer_header, label %exit
+
+exit:
+  ret i32 %i.inc
+}
+
+define i32 @full_sub_loop_test_switch_loop(i32 %end) {
+; CHECK-LABEL: @full_sub_loop_test_switch_loop(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
+; CHECK:       outer_header:
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[J_INC:%.*]], [[OUTER_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[I]], [[I]]
+; CHECK-NEXT:    switch i32 1, label [[DEAD:%.*]] [
+; CHECK-NEXT:    i32 0, label [[BACKEDGE]]
+; CHECK-NEXT:    ]
+; CHECK:       dead:
+; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I]], [[HEADER]] ], [ [[I_2]], [[DEAD]] ]
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_1]], 1
+; CHECK-NEXT:    switch i32 1, label [[OUTER_BACKEDGE]] [
+; CHECK-NEXT:    i32 0, label [[HEADER]]
+; CHECK-NEXT:    ]
+; CHECK:       outer_backedge:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    [[J_INC]] = add i32 [[J]], 1
+; CHECK-NEXT:    [[CMP_J:%.*]] = icmp slt i32 [[J_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 [[CMP_J]], label [[OUTER_HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA_LCSSA:%.*]] = phi i32 [ [[I_INC_LCSSA]], [[OUTER_BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA_LCSSA]]
+;
+entry:
+  br label %outer_header
+
+outer_header:
+  %j = phi i32 [0, %entry], [%j.inc, %outer_backedge]
+  br label %preheader
+
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  br label  %live_part
+
+live_part:
+  %mul = mul i32 %i, %i
+  switch i32 1, label %dead [i32 0, label %backedge]
+
+dead:
+  %i.2 = add i32 %i, 1
+  br label %backedge
+
+backedge:
+  %i.1 = phi i32 [%i, %live_part], [%i.2, %dead]
+  %i.inc = add i32 %i.1, 1
+  switch i32 1, label %outer_backedge [i32 0, label %header]
+
+outer_backedge:
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, %end
+  br i1 %cmp.j, label %outer_header, label %exit
+
+exit:
+  ret i32 %i.inc
+}
+
+; Inverted condition in live_part.
+define i32 @full_sub_loop_test_branch_loop_inverse_1(i32 %end) {
+; CHECK-LABEL: @full_sub_loop_test_branch_loop_inverse_1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
+; CHECK:       outer_header:
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[J_INC:%.*]], [[OUTER_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[I]], [[I]]
+; CHECK-NEXT:    br i1 true, label [[BACKEDGE]], label [[DEAD:%.*]]
+; CHECK:       dead:
+; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I]], [[HEADER]] ], [ [[I_2]], [[DEAD]] ]
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_1]], 1
+; CHECK-NEXT:    br i1 false, label [[HEADER]], label [[OUTER_BACKEDGE]]
+; CHECK:       outer_backedge:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    [[J_INC]] = add i32 [[J]], 1
+; CHECK-NEXT:    [[CMP_J:%.*]] = icmp slt i32 [[J_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 [[CMP_J]], label [[OUTER_HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA_LCSSA:%.*]] = phi i32 [ [[I_INC_LCSSA]], [[OUTER_BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA_LCSSA]]
+;
+entry:
+  br label %outer_header
+
+outer_header:
+  %j = phi i32 [0, %entry], [%j.inc, %outer_backedge]
+  br label %preheader
+
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  br label  %live_part
+
+live_part:
+  %mul = mul i32 %i, %i
+  br i1 true, label %backedge, label %dead
+
+dead:
+  %i.2 = add i32 %i, 1
+  br label %backedge
+
+backedge:
+  %i.1 = phi i32 [%i, %live_part], [%i.2, %dead]
+  %i.inc = add i32 %i.1, 1
+  br i1 false, label %header, label %outer_backedge
+
+outer_backedge:
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, %end
+  br i1 %cmp.j, label %outer_header, label %exit
+
+exit:
+  ret i32 %i.inc
+}
+
+define i32 @full_sub_loop_test_switch_loop_inverse_1(i32 %end) {
+; CHECK-LABEL: @full_sub_loop_test_switch_loop_inverse_1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
+; CHECK:       outer_header:
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[J_INC:%.*]], [[OUTER_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[I]], [[I]]
+; CHECK-NEXT:    switch i32 1, label [[BACKEDGE]] [
+; CHECK-NEXT:    i32 0, label [[DEAD:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       dead:
+; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I]], [[HEADER]] ], [ [[I_2]], [[DEAD]] ]
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_1]], 1
+; CHECK-NEXT:    switch i32 1, label [[OUTER_BACKEDGE]] [
+; CHECK-NEXT:    i32 0, label [[HEADER]]
+; CHECK-NEXT:    ]
+; CHECK:       outer_backedge:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    [[J_INC]] = add i32 [[J]], 1
+; CHECK-NEXT:    [[CMP_J:%.*]] = icmp slt i32 [[J_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 [[CMP_J]], label [[OUTER_HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA_LCSSA:%.*]] = phi i32 [ [[I_INC_LCSSA]], [[OUTER_BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA_LCSSA]]
+;
+entry:
+  br label %outer_header
+
+outer_header:
+  %j = phi i32 [0, %entry], [%j.inc, %outer_backedge]
+  br label %preheader
+
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  br label  %live_part
+
+live_part:
+  %mul = mul i32 %i, %i
+  switch i32 1, label %backedge [i32 0, label %dead]
+
+dead:
+  %i.2 = add i32 %i, 1
+  br label %backedge
+
+backedge:
+  %i.1 = phi i32 [%i, %live_part], [%i.2, %dead]
+  %i.inc = add i32 %i.1, 1
+  switch i32 1, label %outer_backedge [i32 0, label %header]
+
+outer_backedge:
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, %end
+  br i1 %cmp.j, label %outer_header, label %exit
+
+exit:
+  ret i32 %i.inc
+}
+
+define i32 @full_sub_loop_test_branch_loop_inverse_2(i32 %end) {
+; CHECK-LABEL: @full_sub_loop_test_branch_loop_inverse_2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
+; CHECK:       outer_header:
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[J_INC:%.*]], [[OUTER_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[I]], [[I]]
+; CHECK-NEXT:    br i1 false, label [[BACKEDGE]], label [[DEAD:%.*]]
+; CHECK:       dead:
+; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I]], [[HEADER]] ], [ [[I_2]], [[DEAD]] ]
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_1]], 1
+; CHECK-NEXT:    br i1 true, label [[HEADER]], label [[OUTER_BACKEDGE]]
+; CHECK:       outer_backedge:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    [[J_INC]] = add i32 [[J]], 1
+; CHECK-NEXT:    [[CMP_J:%.*]] = icmp slt i32 [[J_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 [[CMP_J]], label [[OUTER_HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA_LCSSA:%.*]] = phi i32 [ [[I_INC_LCSSA]], [[OUTER_BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA_LCSSA]]
+;
+entry:
+  br label %outer_header
+
+outer_header:
+  %j = phi i32 [0, %entry], [%j.inc, %outer_backedge]
+  br label %preheader
+
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  br label  %live_part
+
+live_part:
+  %mul = mul i32 %i, %i
+  br i1 false, label %backedge, label %dead
+
+dead:
+  %i.2 = add i32 %i, 1
+  br label %backedge
+
+backedge:
+  %i.1 = phi i32 [%i, %live_part], [%i.2, %dead]
+  %i.inc = add i32 %i.1, 1
+  br i1 true, label %header, label %outer_backedge
+
+outer_backedge:
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, %end
+  br i1 %cmp.j, label %outer_header, label %exit
+
+exit:
+  ret i32 %i.inc
+}
+
+define i32 @full_sub_loop_test_switch_loop_inverse_2(i32 %end) {
+; CHECK-LABEL: @full_sub_loop_test_switch_loop_inverse_2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
+; CHECK:       outer_header:
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[J_INC:%.*]], [[OUTER_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[I]], [[I]]
+; CHECK-NEXT:    switch i32 1, label [[DEAD:%.*]] [
+; CHECK-NEXT:    i32 0, label [[BACKEDGE]]
+; CHECK-NEXT:    ]
+; CHECK:       dead:
+; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I]], [[HEADER]] ], [ [[I_2]], [[DEAD]] ]
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_1]], 1
+; CHECK-NEXT:    switch i32 1, label [[HEADER]] [
+; CHECK-NEXT:    i32 0, label [[OUTER_BACKEDGE]]
+; CHECK-NEXT:    ]
+; CHECK:       outer_backedge:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    [[J_INC]] = add i32 [[J]], 1
+; CHECK-NEXT:    [[CMP_J:%.*]] = icmp slt i32 [[J_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 [[CMP_J]], label [[OUTER_HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA_LCSSA:%.*]] = phi i32 [ [[I_INC_LCSSA]], [[OUTER_BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA_LCSSA]]
+;
+entry:
+  br label %outer_header
+
+outer_header:
+  %j = phi i32 [0, %entry], [%j.inc, %outer_backedge]
+  br label %preheader
+
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  br label  %live_part
+
+live_part:
+  %mul = mul i32 %i, %i
+  switch i32 1, label %dead [i32 0, label %backedge]
+
+dead:
+  %i.2 = add i32 %i, 1
+  br label %backedge
+
+backedge:
+  %i.1 = phi i32 [%i, %live_part], [%i.2, %dead]
+  %i.inc = add i32 %i.1, 1
+  switch i32 1, label %header [i32 0, label %outer_backedge]
+
+outer_backedge:
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, %end
+  br i1 %cmp.j, label %outer_header, label %exit
+
+exit:
+  ret i32 %i.inc
+}
+
+
+define i32 @full_sub_loop_test_branch_loop_inverse_3(i32 %end) {
+; CHECK-LABEL: @full_sub_loop_test_branch_loop_inverse_3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
+; CHECK:       outer_header:
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[J_INC:%.*]], [[OUTER_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[I]], [[I]]
+; CHECK-NEXT:    br i1 true, label [[BACKEDGE]], label [[DEAD:%.*]]
+; CHECK:       dead:
+; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I]], [[HEADER]] ], [ [[I_2]], [[DEAD]] ]
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_1]], 1
+; CHECK-NEXT:    br i1 true, label [[HEADER]], label [[OUTER_BACKEDGE]]
+; CHECK:       outer_backedge:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    [[J_INC]] = add i32 [[J]], 1
+; CHECK-NEXT:    [[CMP_J:%.*]] = icmp slt i32 [[J_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 [[CMP_J]], label [[OUTER_HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA_LCSSA:%.*]] = phi i32 [ [[I_INC_LCSSA]], [[OUTER_BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA_LCSSA]]
+;
+entry:
+  br label %outer_header
+
+outer_header:
+  %j = phi i32 [0, %entry], [%j.inc, %outer_backedge]
+  br label %preheader
+
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  br label  %live_part
+
+live_part:
+  %mul = mul i32 %i, %i
+  br i1 true, label %backedge, label %dead
+
+dead:
+  %i.2 = add i32 %i, 1
+  br label %backedge
+
+backedge:
+  %i.1 = phi i32 [%i, %live_part], [%i.2, %dead]
+  %i.inc = add i32 %i.1, 1
+  br i1 true, label %header, label %outer_backedge
+
+outer_backedge:
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, %end
+  br i1 %cmp.j, label %outer_header, label %exit
+
+exit:
+  ret i32 %i.inc
+}
+
+define i32 @full_sub_loop_test_switch_loop_inverse_3(i32 %end) {
+; CHECK-LABEL: @full_sub_loop_test_switch_loop_inverse_3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
+; CHECK:       outer_header:
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[J_INC:%.*]], [[OUTER_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[I]], [[I]]
+; CHECK-NEXT:    switch i32 1, label [[BACKEDGE]] [
+; CHECK-NEXT:    i32 0, label [[DEAD:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       dead:
+; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I]], [[HEADER]] ], [ [[I_2]], [[DEAD]] ]
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_1]], 1
+; CHECK-NEXT:    switch i32 1, label [[HEADER]] [
+; CHECK-NEXT:    i32 0, label [[OUTER_BACKEDGE]]
+; CHECK-NEXT:    ]
+; CHECK:       outer_backedge:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    [[J_INC]] = add i32 [[J]], 1
+; CHECK-NEXT:    [[CMP_J:%.*]] = icmp slt i32 [[J_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 [[CMP_J]], label [[OUTER_HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA_LCSSA:%.*]] = phi i32 [ [[I_INC_LCSSA]], [[OUTER_BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA_LCSSA]]
+;
+entry:
+  br label %outer_header
+
+outer_header:
+  %j = phi i32 [0, %entry], [%j.inc, %outer_backedge]
+  br label %preheader
+
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  br label  %live_part
+
+live_part:
+  %mul = mul i32 %i, %i
+  switch i32 1, label %backedge [i32 0, label %dead]
+
+dead:
+  %i.2 = add i32 %i, 1
+  br label %backedge
+
+backedge:
+  %i.1 = phi i32 [%i, %live_part], [%i.2, %dead]
+  %i.inc = add i32 %i.1, 1
+  switch i32 1, label %header [i32 0, label %outer_backedge]
+
+outer_backedge:
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, %end
+  br i1 %cmp.j, label %outer_header, label %exit
+
+exit:
+  ret i32 %i.inc
+}
-- 
GitLab


From cf7570b9ac8113de825dab5e1dcffac180ecc310 Mon Sep 17 00:00:00 2001
From: Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>
Date: Wed, 31 Oct 2018 12:28:05 +0000
Subject: [PATCH 0796/1116] [tblgen][PredicateExpander] Add the ability to
 describe more complex constraints on instruction operands.

Before this patch, class PredicateExpander only knew how to expand simple
predicates that performed checks on instruction operands.
In particular, the new scheduling predicate syntax was not rich enough to
express checks like this one:

  Foo(MI->getOperand(0).getImm()) == ExpectedVal;

Here, the immediate operand value at index zero is passed in input to function
Foo, and ExpectedVal is compared against the value returned by function Foo.

While this predicate pattern doesn't show up in any X86 model, it shows up in
other upstream targets. So, being able to support those predicates is
fundamental if we want to be able to modernize all the scheduling models
upstream.

With this patch, we allow users to specify if a register/immediate operand value
needs to be passed in input to a function as part of the predicate check. Now,
register/immediate operand checks all derive from base class CheckOperandBase.

This patch also changes where TIIPredicate definitions are expanded by the
instructon info emitter. Before, definitions were expanded in class
XXXGenInstrInfo (where XXX is a target name).
With the introduction of this new syntax, we may want to have TIIPredicates
expanded directly in XXXInstrInfo. That is because functions used by the new
operand predicates may only exist in the derived class (i.e. XXXInstrInfo).

This patch is a non functional change for the existing scheduling models.
In future, we will be able to use this richer syntax to better describe complex
scheduling predicates, and expose them to llvm-mca.

Differential Revision: https://reviews.llvm.org/D53880


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345714 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Target/TargetInstrPredicate.td | 34 +++++++++++---
 lib/Target/X86/X86InstrInfo.cpp             |  3 ++
 lib/Target/X86/X86InstrInfo.h               |  3 ++
 utils/TableGen/InstrInfoEmitter.cpp         | 30 ++++++++++--
 utils/TableGen/PredicateExpander.cpp        | 52 +++++++++++++++++----
 utils/TableGen/PredicateExpander.h          | 13 ++++--
 6 files changed, 111 insertions(+), 24 deletions(-)

diff --git a/include/llvm/Target/TargetInstrPredicate.td b/include/llvm/Target/TargetInstrPredicate.td
index d25309a45ba..e70da009790 100644
--- a/include/llvm/Target/TargetInstrPredicate.td
+++ b/include/llvm/Target/TargetInstrPredicate.td
@@ -106,28 +106,50 @@ class CheckSameRegOperand<int First, int Second> : MCInstPredicate {
   int SecondIndex = Second;
 }
 
+// Base class for checks on register/immediate operands.
+// It allows users to define checks like:
+//    MyFunction(MI->getOperand(Index).getImm()) == Val;
+//
+// In the example above, `MyFunction` is a function that takes as input an
+// immediate operand value, and returns another value. Field `FunctionMapper` is
+// the name of the function to call on the operand value.
+class CheckOperandBase<int Index, string Fn = ""> : MCOperandPredicate<Index> {
+  string FunctionMapper = Fn;
+}
+
 // Check that the machine register operand at position `Index` references
 // register R. This predicate assumes that we already checked that the machine
 // operand at position `Index` is a register operand.
-class CheckRegOperand<int Index, Register R> : MCOperandPredicate<Index> {
+class CheckRegOperand<int Index, Register R> : CheckOperandBase<Index> {
   Register Reg = R;
 }
 
 // Check if register operand at index `Index` is the invalid register.
-class CheckInvalidRegOperand<int Index> : MCOperandPredicate<Index>;
+class CheckInvalidRegOperand<int Index> : CheckOperandBase<Index>;
 
 // Check that the operand at position `Index` is immediate `Imm`.
-class CheckImmOperand<int Index, int Imm> : MCOperandPredicate<Index> {
+// If field `FunctionMapper` is a non-empty string, then function
+// `FunctionMapper` is applied to the operand value, and the return value is then
+// compared against `Imm`.
+class CheckImmOperand<int Index, int Imm> : CheckOperandBase<Index> {
   int ImmVal = Imm;
 }
 
 // Similar to CheckImmOperand, however the immediate is not a literal number.
 // This is useful when we want to compare the value of an operand against an
 // enum value, and we know the actual integer value of that enum.
-class CheckImmOperand_s<int Index, string Value> : MCOperandPredicate<Index> {
+class CheckImmOperand_s<int Index, string Value> : CheckOperandBase<Index> {
   string ImmVal = Value;
 }
 
+// Expands to a call to `FunctionMapper` if field `FunctionMapper` is set.
+// Otherwise, it expands to a CheckNot<CheckInvalidRegOperand<Index>>.
+class CheckRegOperandSimple<int Index> : CheckOperandBase<Index>;
+
+// Expands to a call to `FunctionMapper` if field `FunctionMapper` is set.
+// Otherwise, it simply evaluates to TruePred.
+class CheckImmOperandSimple<int Index> : CheckOperandBase<Index>;
+
 // Check that the operand at position `Index` is immediate value zero.
 class CheckZeroOperand<int Index> : CheckImmOperand<Index, 0>;
 
@@ -206,13 +228,13 @@ class FunctionPredicateBase<string name, MCStatement body> {
   MCStatement Body = body;
 }
 
-// Check that a call to method `Name` in class "XXXGenInstrInfo" (where XXX is
+// Check that a call to method `Name` in class "XXXInstrInfo" (where XXX is
 // the name of a target) returns true.
 //
 // TIIPredicate definitions are used to model calls to the target-specific
 // InstrInfo. A TIIPredicate is treated specially by the InstrInfoEmitter
 // tablegen backend, which will use it to automatically generate a definition in
-// the target specific `GenInstrInfo` class.
+// the target specific `InstrInfo` class.
 //
 // There cannot be multiple TIIPredicate definitions with the same name for the
 // same target.
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index db0cb63ae69..88f2f0fffd6 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -7844,3 +7844,6 @@ X86InstrInfo::insertOutlinedCall(Module &M, MachineBasicBlock &MBB,
 
   return It;
 }
+
+#define GET_TII_HELPERS
+#include "X86GenInstrInfo.inc"
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index 85afcf8904a..f3965db4fe7 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -558,6 +558,9 @@ public:
                      MachineBasicBlock::iterator &It, MachineFunction &MF,
                      const outliner::Candidate &C) const override;
 
+#define GET_TII_HELPER_DECLS
+#include "X86GenInstrInfo.inc"
+
 protected:
   /// Commutes the operands in the given instruction by changing the operands
   /// order and/or changing the instruction's opcode and/or the immediate value
diff --git a/utils/TableGen/InstrInfoEmitter.cpp b/utils/TableGen/InstrInfoEmitter.cpp
index ef8c849e25f..55b6f192c2f 100644
--- a/utils/TableGen/InstrInfoEmitter.cpp
+++ b/utils/TableGen/InstrInfoEmitter.cpp
@@ -66,7 +66,8 @@ private:
   /// This method is used to custom expand TIIPredicate definitions.
   /// See file llvm/Target/TargetInstPredicates.td for a description of what is
   /// a TIIPredicate and how to use it.
-  void emitTIIHelperMethods(raw_ostream &OS, StringRef TargetName);
+  void emitTIIHelperMethods(raw_ostream &OS, StringRef TargetName,
+                            bool ExpandDefinition = true);
 
   /// Expand TIIPredicate definitions to functions that accept a const MCInst
   /// reference.
@@ -400,7 +401,8 @@ void InstrInfoEmitter::emitMCIIHelperMethods(raw_ostream &OS,
 }
 
 void InstrInfoEmitter::emitTIIHelperMethods(raw_ostream &OS,
-                                            StringRef TargetName) {
+                                            StringRef TargetName,
+                                            bool ExpandDefinition) {
   RecVec TIIPredicates = Records.getAllDerivedDefinitions("TIIPredicate");
   if (TIIPredicates.empty())
     return;
@@ -410,8 +412,17 @@ void InstrInfoEmitter::emitTIIHelperMethods(raw_ostream &OS,
   PE.setIndentLevel(2);
 
   for (const Record *Rec : TIIPredicates) {
-    OS << "\n  static bool " << Rec->getValueAsString("FunctionName");
-    OS << "(const MachineInstr &MI) {\n";
+    OS << "\n  " << (ExpandDefinition ? "" : "static ") << "bool ";
+    if (ExpandDefinition)
+      OS << TargetName << "InstrInfo::";
+    OS << Rec->getValueAsString("FunctionName");
+    OS << "(const MachineInstr &MI)";
+    if (!ExpandDefinition) {
+      OS << ";\n";
+      continue;
+    }
+
+    OS << " {\n";
 
     OS.indent(PE.getIndentLevel() * 2);
     PE.expandStatement(OS, Rec->getValueAsDef("Body"));
@@ -517,12 +528,21 @@ void InstrInfoEmitter::run(raw_ostream &OS) {
      << "(int CFSetupOpcode = -1, int CFDestroyOpcode = -1, int CatchRetOpcode = -1, int ReturnOpcode = -1);\n"
      << "  ~" << ClassName << "() override = default;\n";
 
-  emitTIIHelperMethods(OS, TargetName);
 
   OS << "\n};\n} // end llvm namespace\n";
 
   OS << "#endif // GET_INSTRINFO_HEADER\n\n";
 
+  OS << "#ifdef GET_TII_HELPER_DECLS\n";
+  OS << "#undef GET_TII_HELPER_DECLS\n";
+  emitTIIHelperMethods(OS, TargetName, /* ExpandDefintion = */false);
+  OS << "#endif // GET_TII_HELPER_DECLS\n\n";
+
+  OS << "#ifdef GET_TII_HELPERS\n";
+  OS << "#undef GET_TII_HELPERS\n";
+  emitTIIHelperMethods(OS, TargetName, /* ExpandDefintion = */true);
+  OS << "#endif // GET_TTI_HELPERS\n\n";
+
   OS << "#ifdef GET_INSTRINFO_CTOR_DTOR\n";
   OS << "#undef GET_INSTRINFO_CTOR_DTOR\n";
 
diff --git a/utils/TableGen/PredicateExpander.cpp b/utils/TableGen/PredicateExpander.cpp
index 83f67c023e5..ad7bf60caab 100644
--- a/utils/TableGen/PredicateExpander.cpp
+++ b/utils/TableGen/PredicateExpander.cpp
@@ -20,23 +20,43 @@ void PredicateExpander::expandTrue(raw_ostream &OS) { OS << "true"; }
 void PredicateExpander::expandFalse(raw_ostream &OS) { OS << "false"; }
 
 void PredicateExpander::expandCheckImmOperand(raw_ostream &OS, int OpIndex,
-                                              int ImmVal) {
+                                              int ImmVal,
+                                              StringRef FunctionMapper) {
+  if (!FunctionMapper.empty())
+    OS << FunctionMapper << "(";
   OS << "MI" << (isByRef() ? "." : "->") << "getOperand(" << OpIndex
-     << ").getImm() " << (shouldNegate() ? "!= " : "== ") << ImmVal;
+     << ").getImm()";
+  OS << (FunctionMapper.empty() ? " " : ") ");
+  OS << (shouldNegate() ? "!= " : "== ") << ImmVal;
 }
 
 void PredicateExpander::expandCheckImmOperand(raw_ostream &OS, int OpIndex,
-                                              StringRef ImmVal) {
+                                              StringRef ImmVal,
+                                              StringRef FunctionMapper) {
+  if (!FunctionMapper.empty())
+    OS << FunctionMapper << "(";
   OS << "MI" << (isByRef() ? "." : "->") << "getOperand(" << OpIndex
-     << ").getImm() " << (shouldNegate() ? "!= " : "== ") << ImmVal;
+     << ").getImm()";
+
+  OS << (FunctionMapper.empty() ? "" : ")");
+  if (ImmVal.empty())
+    return;
+  OS << (shouldNegate() ? " != " : " == ") << ImmVal;
 }
 
 void PredicateExpander::expandCheckRegOperand(raw_ostream &OS, int OpIndex,
-                                              const Record *Reg) {
+                                              const Record *Reg,
+                                              StringRef FunctionMapper) {
   assert(Reg->isSubClassOf("Register") && "Expected a register Record!");
 
+  if (!FunctionMapper.empty())
+    OS << FunctionMapper << "(";
   OS << "MI" << (isByRef() ? "." : "->") << "getOperand(" << OpIndex
-     << ").getReg() " << (shouldNegate() ? "!= " : "== ");
+     << ").getReg()";
+  OS << (FunctionMapper.empty() ? "" : ")");
+  if (!Reg)
+    return;
+  OS << (shouldNegate() ? " != " : " == ");
   const StringRef Str = Reg->getValueAsString("Namespace");
   if (!Str.empty())
     OS << Str << "::";
@@ -137,7 +157,7 @@ void PredicateExpander::expandPredicateSequence(raw_ostream &OS,
 void PredicateExpander::expandTIIFunctionCall(raw_ostream &OS,
                                               StringRef MethodName) {
   OS << (shouldNegate() ? "!" : "");
-  OS << TargetName << (shouldExpandForMC() ? "_MC::" : "GenInstrInfo::");
+  OS << TargetName << (shouldExpandForMC() ? "_MC::" : "InstrInfo::");
   OS << MethodName << (isByRef() ? "(MI)" : "(*MI)");
 }
 
@@ -266,18 +286,30 @@ void PredicateExpander::expandPredicate(raw_ostream &OS, const Record *Rec) {
 
   if (Rec->isSubClassOf("CheckRegOperand"))
     return expandCheckRegOperand(OS, Rec->getValueAsInt("OpIndex"),
-                                 Rec->getValueAsDef("Reg"));
+                                 Rec->getValueAsDef("Reg"),
+                                 Rec->getValueAsString("FunctionMapper"));
+
+  if (Rec->isSubClassOf("CheckRegOperandSimple"))
+    return expandCheckRegOperand(OS, Rec->getValueAsInt("OpIndex"),
+                                 nullptr,
+                                 Rec->getValueAsString("FunctionMapper"));
 
   if (Rec->isSubClassOf("CheckInvalidRegOperand"))
     return expandCheckInvalidRegOperand(OS, Rec->getValueAsInt("OpIndex"));
 
   if (Rec->isSubClassOf("CheckImmOperand"))
     return expandCheckImmOperand(OS, Rec->getValueAsInt("OpIndex"),
-                                 Rec->getValueAsInt("ImmVal"));
+                                 Rec->getValueAsInt("ImmVal"),
+                                 Rec->getValueAsString("FunctionMapper"));
 
   if (Rec->isSubClassOf("CheckImmOperand_s"))
     return expandCheckImmOperand(OS, Rec->getValueAsInt("OpIndex"),
-                                 Rec->getValueAsString("ImmVal"));
+                                 Rec->getValueAsString("ImmVal"),
+                                 Rec->getValueAsString("FunctionMapper"));
+
+  if (Rec->isSubClassOf("CheckImmOperandSimple"))
+    return expandCheckImmOperand(OS, Rec->getValueAsInt("OpIndex"), "", 
+                                 Rec->getValueAsString("FunctionMapper"));
 
   if (Rec->isSubClassOf("CheckSameRegOperand"))
     return expandCheckSameRegOperand(OS, Rec->getValueAsInt("FirstIndex"),
diff --git a/utils/TableGen/PredicateExpander.h b/utils/TableGen/PredicateExpander.h
index 255e40c4998..0f3ee6867e6 100644
--- a/utils/TableGen/PredicateExpander.h
+++ b/utils/TableGen/PredicateExpander.h
@@ -56,9 +56,16 @@ public:
   using RecVec = std::vector<Record *>;
   void expandTrue(raw_ostream &OS);
   void expandFalse(raw_ostream &OS);
-  void expandCheckImmOperand(raw_ostream &OS, int OpIndex, int ImmVal);
-  void expandCheckImmOperand(raw_ostream &OS, int OpIndex, StringRef ImmVal);
-  void expandCheckRegOperand(raw_ostream &OS, int OpIndex, const Record *Reg);
+  void expandCheckImmOperand(raw_ostream &OS, int OpIndex, int ImmVal,
+                             StringRef FunctionMapper);
+  void expandCheckImmOperand(raw_ostream &OS, int OpIndex, StringRef ImmVal,
+                             StringRef FunctionMapperer);
+  void expandCheckImmOperandSimple(raw_ostream &OS, int OpIndex,
+                                   StringRef FunctionMapper);
+  void expandCheckRegOperand(raw_ostream &OS, int OpIndex, const Record *Reg,
+                             StringRef FunctionMapper);
+  void expandCheckRegOperandSimple(raw_ostream &OS, int OpIndex,
+                                   StringRef FunctionMapper);
   void expandCheckSameRegOperand(raw_ostream &OS, int First, int Second);
   void expandCheckNumOperands(raw_ostream &OS, int NumOps);
   void expandCheckOpcode(raw_ostream &OS, const Record *Inst);
-- 
GitLab


From 893f08bea2e95509cf1a46d80f55700e53e69af2 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Wed, 31 Oct 2018 13:25:10 +0000
Subject: [PATCH 0797/1116] [InstSimplify] fold icmp based on range of abs/nabs

This is a fix for PR39475:
https://bugs.llvm.org/show_bug.cgi?id=39475

We managed to get some of these patterns using computeKnownBits in D47041, but that
can't be used for nabs(). Instead, put in some range-based logic, so we can fold
both abs/nabs with icmp with a constant value.

Alive proofs:
https://rise4fun.com/Alive/21r

Name: abs_nsw_is_positive
  %cmp = icmp slt i32 %x, 0
  %negx = sub nsw i32 0, %x
  %abs = select i1 %cmp, i32 %negx, i32 %x
  %r = icmp sgt i32 %abs, -1
    =>
  %r = i1 true

Name: abs_nsw_is_not_negative
  %cmp = icmp slt i32 %x, 0
  %negx = sub nsw i32 0, %x
  %abs = select i1 %cmp, i32 %negx, i32 %x
  %r = icmp slt i32 %abs, 0
    =>
  %r = i1 false

Name: nabs_is_negative_or_0
  %cmp = icmp slt i32 %x, 0
  %negx = sub i32 0, %x
  %nabs = select i1 %cmp, i32 %x, i32 %negx
  %r = icmp slt i32 %nabs, 1
    =>
  %r = i1 true

Name: nabs_is_not_over_0
  %cmp = icmp slt i32 %x, 0
  %negx = sub i32 0, %x
  %nabs = select i1 %cmp, i32 %x, i32 %negx
  %r = icmp sgt i32 %nabs, 0
    =>
  %r = i1 false

Differential Revision: https://reviews.llvm.org/D53844


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345717 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Analysis/InstructionSimplify.cpp          | 42 +++++++++
 test/Transforms/InstSimplify/icmp-abs-nabs.ll | 90 ++++---------------
 2 files changed, 57 insertions(+), 75 deletions(-)

diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp
index 6ff72638512..b1381932e7f 100644
--- a/lib/Analysis/InstructionSimplify.cpp
+++ b/lib/Analysis/InstructionSimplify.cpp
@@ -2996,6 +2996,45 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS,
   return nullptr;
 }
 
+static Value *simplifyICmpWithAbsNabs(CmpInst::Predicate Pred, Value *Op0,
+                                      Value *Op1) {
+  // We need a comparison with a constant.
+  const APInt *C;
+  if (!match(Op1, m_APInt(C)))
+    return nullptr;
+
+  // matchSelectPattern returns the negation part of an abs pattern in SP1.
+  // If the negate has an NSW flag, abs(INT_MIN) is undefined. Without that
+  // constraint, we can't make a contiguous range for the result of abs.
+  ICmpInst::Predicate AbsPred = ICmpInst::BAD_ICMP_PREDICATE;
+  Value *SP0, *SP1;
+  SelectPatternFlavor SPF = matchSelectPattern(Op0, SP0, SP1).Flavor;
+  if (SPF == SelectPatternFlavor::SPF_ABS &&
+      cast<Instruction>(SP1)->hasNoSignedWrap())
+    // The result of abs(X) is >= 0 (with nsw).
+    AbsPred = ICmpInst::ICMP_SGE;
+  if (SPF == SelectPatternFlavor::SPF_NABS)
+    // The result of -abs(X) is <= 0.
+    AbsPred = ICmpInst::ICMP_SLE;
+
+  if (AbsPred == ICmpInst::BAD_ICMP_PREDICATE)
+    return nullptr;
+
+  // Intersect the range of abs/nabs with the range of this icmp.
+  // If there is no intersection, the icmp must be false.
+  // If the intersection equals the range of abs/nabs, the icmp must be true.
+  APInt Zero = APInt::getNullValue(C->getBitWidth());
+  ConstantRange AbsRange = ConstantRange::makeExactICmpRegion(AbsPred, Zero);
+  ConstantRange CmpRange = ConstantRange::makeExactICmpRegion(Pred, *C);
+  ConstantRange Intersection = AbsRange.intersectWith(CmpRange);
+  if (Intersection.isEmptySet())
+    return getFalse(GetCompareTy(Op0));
+  if (Intersection == AbsRange)
+    return getTrue(GetCompareTy(Op0));
+
+  return nullptr;
+}
+
 /// Simplify integer comparisons where at least one operand of the compare
 /// matches an integer min/max idiom.
 static Value *simplifyICmpWithMinMax(CmpInst::Predicate Pred, Value *LHS,
@@ -3427,6 +3466,9 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
   if (Value *V = simplifyICmpWithMinMax(Pred, LHS, RHS, Q, MaxRecurse))
     return V;
 
+  if (Value *V = simplifyICmpWithAbsNabs(Pred, LHS, RHS))
+    return V;
+
   // Simplify comparisons of related pointers using a powerful, recursive
   // GEP-walk when we have target data available..
   if (LHS->getType()->isPointerTy())
diff --git a/test/Transforms/InstSimplify/icmp-abs-nabs.ll b/test/Transforms/InstSimplify/icmp-abs-nabs.ll
index 1cb312bf0da..52321136dcf 100644
--- a/test/Transforms/InstSimplify/icmp-abs-nabs.ll
+++ b/test/Transforms/InstSimplify/icmp-abs-nabs.ll
@@ -5,11 +5,7 @@
 
 define i1 @abs_nsw_is_positive(i32 %x) {
 ; CHECK-LABEL: @abs_nsw_is_positive(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[NEGX:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[ABS:%.*]] = select i1 [[CMP]], i32 [[NEGX]], i32 [[X]]
-; CHECK-NEXT:    [[R:%.*]] = icmp sgt i32 [[ABS]], -1
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    ret i1 true
 ;
   %cmp = icmp slt i32 %x, 0
   %negx = sub nsw i32 0, %x
@@ -35,11 +31,7 @@ define i1 @abs_nsw_is_positive_sge(i32 %x) {
 
 define i1 @abs_nsw_is_positive_reduced_range(i32 %x) {
 ; CHECK-LABEL: @abs_nsw_is_positive_reduced_range(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[NEGX:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[ABS:%.*]] = select i1 [[CMP]], i32 [[NEGX]], i32 [[X]]
-; CHECK-NEXT:    [[R:%.*]] = icmp sgt i32 [[ABS]], -42
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    ret i1 true
 ;
   %cmp = icmp slt i32 %x, 0
   %negx = sub nsw i32 0, %x
@@ -99,11 +91,7 @@ define i1 @abs_nsw_is_not_negative(i32 %x) {
 
 define i1 @abs_nsw_is_not_negative_sle(i32 %x) {
 ; CHECK-LABEL: @abs_nsw_is_not_negative_sle(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 1
-; CHECK-NEXT:    [[NEGX:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[ABS:%.*]] = select i1 [[CMP]], i32 [[NEGX]], i32 [[X]]
-; CHECK-NEXT:    [[R:%.*]] = icmp sle i32 [[ABS]], -1
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    ret i1 false
 ;
   %cmp = icmp slt i32 %x, 1
   %negx = sub nsw i32 0, %x
@@ -116,11 +104,7 @@ define i1 @abs_nsw_is_not_negative_sle(i32 %x) {
 
 define i1 @abs_nsw_is_not_negative_reduced_range(i32 %x) {
 ; CHECK-LABEL: @abs_nsw_is_not_negative_reduced_range(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[NEGX:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[ABS:%.*]] = select i1 [[CMP]], i32 [[NEGX]], i32 [[X]]
-; CHECK-NEXT:    [[R:%.*]] = icmp slt i32 [[ABS]], -24
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    ret i1 false
 ;
   %cmp = icmp slt i32 %x, 0
   %negx = sub nsw i32 0, %x
@@ -167,11 +151,7 @@ define i1 @abs_nsw_is_not_negative_wrong_range(i32 %x) {
 
 define i1 @nabs_is_negative_or_0(i32 %x) {
 ; CHECK-LABEL: @nabs_is_negative_or_0(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[NEGX:%.*]] = sub i32 0, [[X]]
-; CHECK-NEXT:    [[NABS:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[NEGX]]
-; CHECK-NEXT:    [[R:%.*]] = icmp slt i32 [[NABS]], 1
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    ret i1 true
 ;
   %cmp = icmp slt i32 %x, 0
   %negx = sub i32 0, %x
@@ -184,11 +164,7 @@ define i1 @nabs_is_negative_or_0(i32 %x) {
 
 define i1 @nabs_is_negative_or_0_sle(i32 %x) {
 ; CHECK-LABEL: @nabs_is_negative_or_0_sle(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 1
-; CHECK-NEXT:    [[NEGX:%.*]] = sub i32 0, [[X]]
-; CHECK-NEXT:    [[NABS:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[NEGX]]
-; CHECK-NEXT:    [[R:%.*]] = icmp sle i32 [[NABS]], 0
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    ret i1 true
 ;
   %cmp = icmp slt i32 %x, 1
   %negx = sub i32 0, %x
@@ -201,11 +177,7 @@ define i1 @nabs_is_negative_or_0_sle(i32 %x) {
 
 define i1 @nabs_is_negative_or_0_reduced_range(i32 %x) {
 ; CHECK-LABEL: @nabs_is_negative_or_0_reduced_range(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 1
-; CHECK-NEXT:    [[NEGX:%.*]] = sub i32 0, [[X]]
-; CHECK-NEXT:    [[NABS:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[NEGX]]
-; CHECK-NEXT:    [[R:%.*]] = icmp slt i32 [[NABS]], 421
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    ret i1 true
 ;
   %cmp = icmp slt i32 %x, 1
   %negx = sub i32 0, %x
@@ -235,11 +207,7 @@ define i1 @nabs_is_negative_or_0_wrong_range(i32 %x) {
 
 define i1 @nabs_is_not_over_0(i32 %x) {
 ; CHECK-LABEL: @nabs_is_not_over_0(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[NEGX:%.*]] = sub i32 0, [[X]]
-; CHECK-NEXT:    [[NABS:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[NEGX]]
-; CHECK-NEXT:    [[R:%.*]] = icmp sgt i32 [[NABS]], 0
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    ret i1 false
 ;
   %cmp = icmp slt i32 %x, 0
   %negx = sub i32 0, %x
@@ -252,11 +220,7 @@ define i1 @nabs_is_not_over_0(i32 %x) {
 
 define i1 @nabs_is_not_over_0_sle(i32 %x) {
 ; CHECK-LABEL: @nabs_is_not_over_0_sle(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 1
-; CHECK-NEXT:    [[NEGX:%.*]] = sub i32 0, [[X]]
-; CHECK-NEXT:    [[NABS:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[NEGX]]
-; CHECK-NEXT:    [[R:%.*]] = icmp sge i32 [[NABS]], 1
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    ret i1 false
 ;
   %cmp = icmp slt i32 %x, 1
   %negx = sub i32 0, %x
@@ -269,11 +233,7 @@ define i1 @nabs_is_not_over_0_sle(i32 %x) {
 
 define i1 @nabs_is_not_over_0_reduced_range(i32 %x) {
 ; CHECK-LABEL: @nabs_is_not_over_0_reduced_range(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 1
-; CHECK-NEXT:    [[NEGX:%.*]] = sub i32 0, [[X]]
-; CHECK-NEXT:    [[NABS:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[NEGX]]
-; CHECK-NEXT:    [[R:%.*]] = icmp sgt i32 [[NABS]], 4223
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    ret i1 false
 ;
   %cmp = icmp slt i32 %x, 1
   %negx = sub i32 0, %x
@@ -318,11 +278,7 @@ define i1 @abs_nsw_is_positive_eq(i32 %x) {
 
 define i1 @abs_nsw_is_positive_ult(i8 %x) {
 ; CHECK-LABEL: @abs_nsw_is_positive_ult(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i8 [[X:%.*]], 0
-; CHECK-NEXT:    [[NEGX:%.*]] = sub nsw i8 0, [[X]]
-; CHECK-NEXT:    [[ABS:%.*]] = select i1 [[CMP]], i8 [[NEGX]], i8 [[X]]
-; CHECK-NEXT:    [[R:%.*]] = icmp ult i8 [[ABS]], -117
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    ret i1 true
 ;
   %cmp = icmp slt i8 %x, 0
   %negx = sub nsw i8 0, %x
@@ -335,11 +291,7 @@ define i1 @abs_nsw_is_positive_ult(i8 %x) {
 
 define i1 @abs_nsw_is_not_negative_ugt(i8 %x) {
 ; CHECK-LABEL: @abs_nsw_is_not_negative_ugt(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i8 [[X:%.*]], 0
-; CHECK-NEXT:    [[NEGX:%.*]] = sub nsw i8 0, [[X]]
-; CHECK-NEXT:    [[ABS:%.*]] = select i1 [[CMP]], i8 [[NEGX]], i8 [[X]]
-; CHECK-NEXT:    [[R:%.*]] = icmp ugt i8 [[ABS]], 127
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    ret i1 false
 ;
   %cmp = icmp slt i8 %x, 0
   %negx = sub nsw i8 0, %x
@@ -352,11 +304,7 @@ define i1 @abs_nsw_is_not_negative_ugt(i8 %x) {
 
 define <2 x i1> @abs_nsw_is_not_negative_vec_splat(<2 x i32> %x) {
 ; CHECK-LABEL: @abs_nsw_is_not_negative_vec_splat(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt <2 x i32> [[X:%.*]], zeroinitializer
-; CHECK-NEXT:    [[NEGX:%.*]] = sub nsw <2 x i32> zeroinitializer, [[X]]
-; CHECK-NEXT:    [[ABS:%.*]] = select <2 x i1> [[CMP]], <2 x i32> [[NEGX]], <2 x i32> [[X]]
-; CHECK-NEXT:    [[R:%.*]] = icmp slt <2 x i32> [[ABS]], <i32 -8, i32 -8>
-; CHECK-NEXT:    ret <2 x i1> [[R]]
+; CHECK-NEXT:    ret <2 x i1> zeroinitializer
 ;
   %cmp = icmp slt <2 x i32> %x, zeroinitializer
   %negx = sub nsw <2 x i32> zeroinitializer, %x
@@ -369,11 +317,7 @@ define <2 x i1> @abs_nsw_is_not_negative_vec_splat(<2 x i32> %x) {
 
 define i1 @nabs_is_negative_or_0_ne(i8 %x) {
 ; CHECK-LABEL: @nabs_is_negative_or_0_ne(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i8 [[X:%.*]], 0
-; CHECK-NEXT:    [[NEGX:%.*]] = sub i8 0, [[X]]
-; CHECK-NEXT:    [[NABS:%.*]] = select i1 [[CMP]], i8 [[X]], i8 [[NEGX]]
-; CHECK-NEXT:    [[R:%.*]] = icmp ne i8 [[NABS]], 12
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    ret i1 true
 ;
   %cmp = icmp slt i8 %x, 0
   %negx = sub i8 0, %x
@@ -386,11 +330,7 @@ define i1 @nabs_is_negative_or_0_ne(i8 %x) {
 
 define <3 x i1> @nabs_is_not_over_0_sle_vec_splat(<3 x i33> %x) {
 ; CHECK-LABEL: @nabs_is_not_over_0_sle_vec_splat(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt <3 x i33> [[X:%.*]], <i33 1, i33 1, i33 1>
-; CHECK-NEXT:    [[NEGX:%.*]] = sub <3 x i33> zeroinitializer, [[X]]
-; CHECK-NEXT:    [[NABS:%.*]] = select <3 x i1> [[CMP]], <3 x i33> [[X]], <3 x i33> [[NEGX]]
-; CHECK-NEXT:    [[R:%.*]] = icmp sge <3 x i33> [[NABS]], <i33 1, i33 1, i33 1>
-; CHECK-NEXT:    ret <3 x i1> [[R]]
+; CHECK-NEXT:    ret <3 x i1> zeroinitializer
 ;
   %cmp = icmp slt <3 x i33> %x, <i33 1, i33 1, i33 1>
   %negx = sub <3 x i33> zeroinitializer, %x
-- 
GitLab


From 91533e99d40c335134a635deaee79c726bb618f0 Mon Sep 17 00:00:00 2001
From: Nicolai Haehnle <nhaehnle@gmail.com>
Date: Wed, 31 Oct 2018 13:26:48 +0000
Subject: [PATCH 0798/1116] AMDGPU: Remove PHI loop condition optimization
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
The optimization to early break out of loops if all threads are dead was
never fully implemented.

But the PHI node analyzing is actually causing a number of problems, so
remove all the extra code for it.

(This does actually regress code quality in a few places because it
 ends up relying more heavily on phi's of i1, which we don't do a
 great job with. However, since it fixes real bugs in the wild, we
 should take this change. I have some prototype changes to improve
 i1 lowering in general -- not just for control flow -- which should
 help recover the code quality, I just need to make those changes
 fit for general consumption. -- Nicolai)

Change-Id: I6fc6c6c8961857ac6009fcfb9f7e5e48dc23fbb1
Patch-by: Christian König <christian.koenig@amd.com>

Reviewers: arsenm, rampitec, tpr

Subscribers: kzhuravl, jvesely, wdng, yaxunl, dstuttard, t-tye, llvm-commits

Differential Revision: https://reviews.llvm.org/D53359

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345718 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/IR/IntrinsicsAMDGPU.td           |  8 --
 lib/Target/AMDGPU/AMDGPUInstrInfo.td          |  8 --
 lib/Target/AMDGPU/SIAnnotateControlFlow.cpp   | 86 +------------------
 lib/Target/AMDGPU/SIFixSGPRCopies.cpp         |  2 -
 lib/Target/AMDGPU/SIInstructions.td           | 16 ----
 lib/Target/AMDGPU/SILowerControlFlow.cpp      | 29 -------
 test/CodeGen/AMDGPU/loop_break.ll             | 70 +++++++--------
 test/CodeGen/AMDGPU/multilevel-break.ll       | 29 ++++---
 test/CodeGen/AMDGPU/nested-loop-conditions.ll | 44 +++++-----
 test/CodeGen/AMDGPU/valu-i1.ll                | 10 +--
 10 files changed, 77 insertions(+), 225 deletions(-)

diff --git a/include/llvm/IR/IntrinsicsAMDGPU.td b/include/llvm/IR/IntrinsicsAMDGPU.td
index ccf43a61c3d..67e7da7797a 100644
--- a/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1484,18 +1484,10 @@ def int_amdgcn_else : Intrinsic<[llvm_i1_ty, llvm_i64_ty],
   [llvm_i64_ty], [IntrConvergent]
 >;
 
-def int_amdgcn_break : Intrinsic<[llvm_i64_ty],
-  [llvm_i64_ty], [IntrNoMem, IntrConvergent]
->;
-
 def int_amdgcn_if_break : Intrinsic<[llvm_i64_ty],
   [llvm_i1_ty, llvm_i64_ty], [IntrNoMem, IntrConvergent]
 >;
 
-def int_amdgcn_else_break : Intrinsic<[llvm_i64_ty],
-  [llvm_i64_ty, llvm_i64_ty], [IntrNoMem, IntrConvergent]
->;
-
 def int_amdgcn_loop : Intrinsic<[llvm_i1_ty],
   [llvm_i64_ty], [IntrConvergent]
 >;
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index 7442a59e594..82644be2656 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -62,18 +62,10 @@ def AMDGPULoopOp : SDTypeProfile<0, 2,
   [SDTCisVT<0, i64>, SDTCisVT<1, OtherVT>]
 >;
 
-def AMDGPUBreakOp : SDTypeProfile<1, 1,
-  [SDTCisVT<0, i64>, SDTCisVT<1, i64>]
->;
-
 def AMDGPUIfBreakOp : SDTypeProfile<1, 2,
   [SDTCisVT<0, i64>, SDTCisVT<1, i1>, SDTCisVT<2, i64>]
 >;
 
-def AMDGPUElseBreakOp : SDTypeProfile<1, 2,
-  [SDTCisVT<0, i64>, SDTCisVT<1, i64>, SDTCisVT<2, i64>]
->;
-
 def AMDGPUAddeSubeOp : SDTypeProfile<2, 3,
   [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisVT<0, i32>, SDTCisVT<1, i1>, SDTCisVT<4, i1>]
 >;
diff --git a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index 8248dbe1b0f..90f430d5ca4 100644
--- a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -66,9 +66,7 @@ class SIAnnotateControlFlow : public FunctionPass {
 
   Function *If;
   Function *Else;
-  Function *Break;
   Function *IfBreak;
-  Function *ElseBreak;
   Function *Loop;
   Function *EndCf;
 
@@ -95,8 +93,7 @@ class SIAnnotateControlFlow : public FunctionPass {
 
   Value *
   handleLoopCondition(Value *Cond, PHINode *Broken, llvm::Loop *L,
-                      BranchInst *Term,
-                      SmallVectorImpl<WeakTrackingVH> &LoopPhiConditions);
+                      BranchInst *Term);
 
   void handleLoop(BranchInst *Term);
 
@@ -149,9 +146,7 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) {
 
   If = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if);
   Else = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_else);
-  Break = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_break);
   IfBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if_break);
-  ElseBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_else_break);
   Loop = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_loop);
   EndCf = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_end_cf);
   return false;
@@ -227,76 +222,7 @@ void SIAnnotateControlFlow::insertElse(BranchInst *Term) {
 
 /// Recursively handle the condition leading to a loop
 Value *SIAnnotateControlFlow::handleLoopCondition(
-    Value *Cond, PHINode *Broken, llvm::Loop *L, BranchInst *Term,
-    SmallVectorImpl<WeakTrackingVH> &LoopPhiConditions) {
-  // Only search through PHI nodes which are inside the loop.  If we try this
-  // with PHI nodes that are outside of the loop, we end up inserting new PHI
-  // nodes outside of the loop which depend on values defined inside the loop.
-  // This will break the module with
-  // 'Instruction does not dominate all users!' errors.
-  PHINode *Phi = nullptr;
-  if ((Phi = dyn_cast<PHINode>(Cond)) && L->contains(Phi)) {
-    BasicBlock *Parent = Phi->getParent();
-    PHINode *NewPhi = PHINode::Create(Int64, 0, "loop.phi", &Parent->front());
-    Value *Ret = NewPhi;
-
-    // Handle all non-constant incoming values first
-    for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) {
-      Value *Incoming = Phi->getIncomingValue(i);
-      BasicBlock *From = Phi->getIncomingBlock(i);
-      if (isa<ConstantInt>(Incoming)) {
-        NewPhi->addIncoming(Broken, From);
-        continue;
-      }
-
-      Phi->setIncomingValue(i, BoolFalse);
-      Value *PhiArg = handleLoopCondition(Incoming, Broken, L,
-                                          Term, LoopPhiConditions);
-      NewPhi->addIncoming(PhiArg, From);
-    }
-
-    BasicBlock *IDom = DT->getNode(Parent)->getIDom()->getBlock();
-
-    for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) {
-      Value *Incoming = Phi->getIncomingValue(i);
-      if (Incoming != BoolTrue)
-        continue;
-
-      BasicBlock *From = Phi->getIncomingBlock(i);
-      if (From == IDom) {
-        // We're in the following situation:
-        //   IDom/From
-        //      |   \
-        //      |   If-block
-        //      |   /
-        //     Parent
-        // where we want to break out of the loop if the If-block is not taken.
-        // Due to the depth-first traversal, there should be an end.cf
-        // intrinsic in Parent, and we insert an else.break before it.
-        //
-        // Note that the end.cf need not be the first non-phi instruction
-        // of parent, particularly when we're dealing with a multi-level
-        // break, but it should occur within a group of intrinsic calls
-        // at the beginning of the block.
-        CallInst *OldEnd = dyn_cast<CallInst>(Parent->getFirstInsertionPt());
-        while (OldEnd && OldEnd->getCalledFunction() != EndCf)
-          OldEnd = dyn_cast<CallInst>(OldEnd->getNextNode());
-        if (OldEnd && OldEnd->getCalledFunction() == EndCf) {
-          Value *Args[] = { OldEnd->getArgOperand(0), NewPhi };
-          Ret = CallInst::Create(ElseBreak, Args, "", OldEnd);
-          continue;
-        }
-      }
-
-      Instruction *Insert = From->getTerminator();
-      Value *PhiArg = CallInst::Create(Break, Broken, "", Insert);
-      NewPhi->setIncomingValue(i, PhiArg);
-    }
-
-    LoopPhiConditions.push_back(WeakTrackingVH(Phi));
-    return Ret;
-  }
-
+    Value *Cond, PHINode *Broken, llvm::Loop *L, BranchInst *Term) {
   if (Instruction *Inst = dyn_cast<Instruction>(Cond)) {
     BasicBlock *Parent = Inst->getParent();
     Instruction *Insert;
@@ -335,21 +261,15 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
   BasicBlock *Target = Term->getSuccessor(1);
   PHINode *Broken = PHINode::Create(Int64, 0, "phi.broken", &Target->front());
 
-  SmallVector<WeakTrackingVH, 8> LoopPhiConditions;
   Value *Cond = Term->getCondition();
   Term->setCondition(BoolTrue);
-  Value *Arg = handleLoopCondition(Cond, Broken, L, Term, LoopPhiConditions);
+  Value *Arg = handleLoopCondition(Cond, Broken, L, Term);
 
   for (BasicBlock *Pred : predecessors(Target))
     Broken->addIncoming(Pred == BB ? Arg : Int64Zero, Pred);
 
   Term->setCondition(CallInst::Create(Loop, Arg, "", Term));
 
-  for (WeakTrackingVH Val : llvm::reverse(LoopPhiConditions)) {
-    if (PHINode *Cond = cast_or_null<PHINode>(Val))
-      eraseIfUnused(Cond);
-  }
-
   push(Term->getSuccessor(0), Arg);
 }
 
diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index ed52278e441..5324cbc912d 100644
--- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -327,9 +327,7 @@ static bool phiHasBreakDef(const MachineInstr &PHI,
     switch (DefInstr->getOpcode()) {
     default:
       break;
-    case AMDGPU::SI_BREAK:
     case AMDGPU::SI_IF_BREAK:
-    case AMDGPU::SI_ELSE_BREAK:
       return true;
     case AMDGPU::PHI:
       if (phiHasBreakDef(*DefInstr, MRI, Visited))
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index 67aea73d1ca..9714203d3d7 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -264,14 +264,6 @@ def SI_END_CF : CFPseudoInstSI <
   let mayStore = 1;
 }
 
-def SI_BREAK : CFPseudoInstSI <
-  (outs SReg_64:$dst), (ins SReg_64:$src),
-  [(set i64:$dst, (int_amdgcn_break i64:$src))], 1> {
-  let Size = 4;
-  let isAsCheapAsAMove = 1;
-  let isReMaterializable = 1;
-}
-
 def SI_IF_BREAK : CFPseudoInstSI <
   (outs SReg_64:$dst), (ins SReg_64:$vcc, SReg_64:$src),
   [(set i64:$dst, (int_amdgcn_if_break i1:$vcc, i64:$src))]> {
@@ -280,14 +272,6 @@ def SI_IF_BREAK : CFPseudoInstSI <
   let isReMaterializable = 1;
 }
 
-def SI_ELSE_BREAK : CFPseudoInstSI <
-  (outs SReg_64:$dst), (ins SReg_64:$src0, SReg_64:$src1),
-  [(set i64:$dst, (int_amdgcn_else_break i64:$src0, i64:$src1))]> {
-  let Size = 4;
-  let isAsCheapAsAMove = 1;
-  let isReMaterializable = 1;
-}
-
 let Uses = [EXEC] in {
 
 multiclass PseudoInstKill <dag ins> {
diff --git a/lib/Target/AMDGPU/SILowerControlFlow.cpp b/lib/Target/AMDGPU/SILowerControlFlow.cpp
index ad30317c344..1aa1feebbda 100644
--- a/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -85,9 +85,7 @@ private:
 
   void emitIf(MachineInstr &MI);
   void emitElse(MachineInstr &MI);
-  void emitBreak(MachineInstr &MI);
   void emitIfBreak(MachineInstr &MI);
-  void emitElseBreak(MachineInstr &MI);
   void emitLoop(MachineInstr &MI);
   void emitEndCf(MachineInstr &MI);
 
@@ -329,20 +327,6 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
   LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));
 }
 
-void SILowerControlFlow::emitBreak(MachineInstr &MI) {
-  MachineBasicBlock &MBB = *MI.getParent();
-  const DebugLoc &DL = MI.getDebugLoc();
-  unsigned Dst = MI.getOperand(0).getReg();
-
-  MachineInstr *Or = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
-                         .addReg(AMDGPU::EXEC)
-                         .add(MI.getOperand(1));
-
-  if (LIS)
-    LIS->ReplaceMachineInstrInMaps(MI, *Or);
-  MI.eraseFromParent();
-}
-
 void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
   const DebugLoc &DL = MI.getDebugLoc();
@@ -384,11 +368,6 @@ void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
   MI.eraseFromParent();
 }
 
-void SILowerControlFlow::emitElseBreak(MachineInstr &MI) {
-  // Lowered in the same way as emitIfBreak above.
-  emitIfBreak(MI);
-}
-
 void SILowerControlFlow::emitLoop(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
   const DebugLoc &DL = MI.getDebugLoc();
@@ -515,18 +494,10 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
         emitElse(MI);
         break;
 
-      case AMDGPU::SI_BREAK:
-        emitBreak(MI);
-        break;
-
       case AMDGPU::SI_IF_BREAK:
         emitIfBreak(MI);
         break;
 
-      case AMDGPU::SI_ELSE_BREAK:
-        emitElseBreak(MI);
-        break;
-
       case AMDGPU::SI_LOOP:
         emitLoop(MI);
         break;
diff --git a/test/CodeGen/AMDGPU/loop_break.ll b/test/CodeGen/AMDGPU/loop_break.ll
index b2641cd4d2e..576950188d3 100644
--- a/test/CodeGen/AMDGPU/loop_break.ll
+++ b/test/CodeGen/AMDGPU/loop_break.ll
@@ -5,16 +5,17 @@
 
 ; OPT-LABEL: @break_loop(
 ; OPT: bb1:
-; OPT: call i64 @llvm.amdgcn.break(i64
+; OPT: icmp slt i32
 ; OPT-NEXT: br i1 %cmp0, label %bb4, label %Flow
 
 ; OPT: bb4:
 ; OPT: load volatile
+; OPT: icmp slt i32
 ; OPT: xor i1 %cmp1
-; OPT: call i64 @llvm.amdgcn.if.break(
 ; OPT: br label %Flow
 
 ; OPT: Flow:
+; OPT: call i64 @llvm.amdgcn.if.break(
 ; OPT: call i1 @llvm.amdgcn.loop(i64
 ; OPT: br i1 %{{[0-9]+}}, label %bb9, label %bb1
 
@@ -23,21 +24,19 @@
 
 ; TODO: Can remove exec fixes in return block
 ; GCN-LABEL: {{^}}break_loop:
-; GCN: s_mov_b64 [[INITMASK:s\[[0-9]+:[0-9]+\]]], 0{{$}}
+; GCN: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], 0{{$}}
 
 ; GCN: [[LOOP_ENTRY:BB[0-9]+_[0-9]+]]: ; %bb1
-; GCN: s_or_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec, [[INITMASK]]
 ; GCN: v_cmp_lt_i32_e32 vcc, -1
 ; GCN: s_and_b64 vcc, exec, vcc
-; GCN-NEXT: s_cbranch_vccnz [[FLOW:BB[0-9]+_[0-9]+]]
+; GCN: s_cbranch_vccnz [[FLOW:BB[0-9]+_[0-9]+]]
 
 ; GCN: ; %bb.2: ; %bb4
 ; GCN: buffer_load_dword
 ; GCN: v_cmp_ge_i32_e32 vcc,
-; GCN: s_or_b64 [[MASK]], vcc, [[INITMASK]]
 
 ; GCN: [[FLOW]]:
-; GCN: s_mov_b64 [[INITMASK]], [[MASK]]
+; GCN: s_or_b64 [[MASK]], vcc, [[MASK]]
 ; GCN: s_andn2_b64 exec, exec, [[MASK]]
 ; GCN-NEXT: s_cbranch_execnz [[LOOP_ENTRY]]
 
@@ -66,25 +65,26 @@ bb9:
 
 ; OPT-LABEL: @undef_phi_cond_break_loop(
 ; OPT: bb1:
-; OPT-NEXT: %phi.broken = phi i64 [ %loop.phi, %Flow ], [ 0, %bb ]
+; OPT-NEXT: %phi.broken = phi i64 [ %0, %Flow ], [ 0, %bb ]
 ; OPT-NEXT: %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ]
-; OPT: %0 = call i64 @llvm.amdgcn.if.break(i1 undef, i64 %phi.broken)
+; OPT-NEXT: %lsr.iv.next = add i32 %lsr.iv, 1
+; OPT-NEXT: %cmp0 = icmp slt i32 %lsr.iv.next, 0
 ; OPT-NEXT: br i1 %cmp0, label %bb4, label %Flow
 
 ; OPT: bb4:
 ; OPT-NEXT: %load = load volatile i32, i32 addrspace(1)* undef, align 4
 ; OPT-NEXT: %cmp1 = icmp sge i32 %tmp, %load
-; OPT-NEXT: %1 = call i64 @llvm.amdgcn.if.break(i1 %cmp1, i64 %phi.broken)
 ; OPT-NEXT: br label %Flow
 
 ; OPT: Flow:
-; OPT-NEXT: %loop.phi = phi i64 [ %1, %bb4 ], [ %0, %bb1 ]
 ; OPT-NEXT: %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ]
-; OPT-NEXT: %2 = call i1 @llvm.amdgcn.loop(i64 %loop.phi)
-; OPT-NEXT: br i1 %2, label %bb9, label %bb1
+; OPT-NEXT: %tmp3 = phi i1 [ %cmp1, %bb4 ], [ undef, %bb1 ]
+; OPT-NEXT: %0 = call i64 @llvm.amdgcn.if.break(i1 %tmp3, i64 %phi.broken)
+; OPT-NEXT: %1 = call i1 @llvm.amdgcn.loop(i64 %0)
+; OPT-NEXT: br i1 %1, label %bb9, label %bb1
 
 ; OPT: bb9:                                              ; preds = %Flow
-; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %loop.phi)
+; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %0)
 ; OPT-NEXT: store volatile i32 7
 ; OPT-NEXT: ret void
 define amdgpu_kernel void @undef_phi_cond_break_loop(i32 %arg) #0 {
@@ -119,25 +119,26 @@ bb9:                                              ; preds = %Flow
 
 ; OPT-LABEL: @constexpr_phi_cond_break_loop(
 ; OPT: bb1:
-; OPT-NEXT: %phi.broken = phi i64 [ %loop.phi, %Flow ], [ 0, %bb ]
+; OPT-NEXT: %phi.broken = phi i64 [ %0, %Flow ], [ 0, %bb ]
 ; OPT-NEXT: %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ]
-; OPT: %0 = call i64 @llvm.amdgcn.if.break(i1 icmp ne (i32 addrspace(3)* inttoptr (i32 4 to i32 addrspace(3)*), i32 addrspace(3)* @lds), i64 %phi.broken)
+; OPT-NEXT: %lsr.iv.next = add i32 %lsr.iv, 1
+; OPT-NEXT: %cmp0 = icmp slt i32 %lsr.iv.next, 0
 ; OPT-NEXT: br i1 %cmp0, label %bb4, label %Flow
 
 ; OPT: bb4:
 ; OPT-NEXT: %load = load volatile i32, i32 addrspace(1)* undef, align 4
 ; OPT-NEXT: %cmp1 = icmp sge i32 %tmp, %load
-; OPT-NEXT: %1 = call i64 @llvm.amdgcn.if.break(i1 %cmp1, i64 %phi.broken)
 ; OPT-NEXT: br label %Flow
 
 ; OPT: Flow:
-; OPT-NEXT: %loop.phi = phi i64 [ %1, %bb4 ], [ %0, %bb1 ]
 ; OPT-NEXT: %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ]
-; OPT-NEXT: %2 = call i1 @llvm.amdgcn.loop(i64 %loop.phi)
-; OPT-NEXT: br i1 %2, label %bb9, label %bb1
+; OPT-NEXT: %tmp3 = phi i1 [ %cmp1, %bb4 ], [ icmp ne (i32 addrspace(3)* inttoptr (i32 4 to i32 addrspace(3)*), i32 addrspace(3)* @lds), %bb1 ]
+; OPT-NEXT: %0 = call i64 @llvm.amdgcn.if.break(i1 %tmp3, i64 %phi.broken)
+; OPT-NEXT: %1 = call i1 @llvm.amdgcn.loop(i64 %0)
+; OPT-NEXT: br i1 %1, label %bb9, label %bb1
 
 ; OPT: bb9:                                              ; preds = %Flow
-; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %loop.phi)
+; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %0)
 ; OPT-NEXT: store volatile i32 7
 ; OPT-NEXT: ret void
 define amdgpu_kernel void @constexpr_phi_cond_break_loop(i32 %arg) #0 {
@@ -169,25 +170,26 @@ bb9:                                              ; preds = %Flow
 
 ; OPT-LABEL: @true_phi_cond_break_loop(
 ; OPT: bb1:
-; OPT-NEXT: %phi.broken = phi i64 [ %loop.phi, %Flow ], [ 0, %bb ]
+; OPT-NEXT: %phi.broken = phi i64 [ %0, %Flow ], [ 0, %bb ]
 ; OPT-NEXT: %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ]
-; OPT: %0 = call i64 @llvm.amdgcn.break(i64 %phi.broken)
-; OPT: br i1 %cmp0, label %bb4, label %Flow
+; OPT-NEXT: %lsr.iv.next = add i32 %lsr.iv, 1
+; OPT-NEXT: %cmp0 = icmp slt i32 %lsr.iv.next, 0
+; OPT-NEXT: br i1 %cmp0, label %bb4, label %Flow
 
 ; OPT: bb4:
 ; OPT-NEXT: %load = load volatile i32, i32 addrspace(1)* undef, align 4
 ; OPT-NEXT: %cmp1 = icmp sge i32 %tmp, %load
-; OPT-NEXT: %1 = call i64 @llvm.amdgcn.if.break(i1 %cmp1, i64 %phi.broken)
 ; OPT-NEXT: br label %Flow
 
 ; OPT: Flow:
-; OPT-NEXT: %loop.phi = phi i64 [ %1, %bb4 ], [ %0, %bb1 ]
 ; OPT-NEXT: %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ]
-; OPT-NEXT: %2 = call i1 @llvm.amdgcn.loop(i64 %loop.phi)
-; OPT-NEXT: br i1 %2, label %bb9, label %bb1
+; OPT-NEXT: %tmp3 = phi i1 [ %cmp1, %bb4 ], [ true, %bb1 ]
+; OPT-NEXT: %0 = call i64 @llvm.amdgcn.if.break(i1 %tmp3, i64 %phi.broken)
+; OPT-NEXT: %1 = call i1 @llvm.amdgcn.loop(i64 %0)
+; OPT-NEXT: br i1 %1, label %bb9, label %bb1
 
 ; OPT: bb9:                                              ; preds = %Flow
-; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %loop.phi)
+; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %0)
 ; OPT-NEXT: store volatile i32 7
 ; OPT-NEXT: ret void
 define amdgpu_kernel void @true_phi_cond_break_loop(i32 %arg) #0 {
@@ -219,7 +221,7 @@ bb9:                                              ; preds = %Flow
 
 ; OPT-LABEL: @false_phi_cond_break_loop(
 ; OPT: bb1:
-; OPT-NEXT: %phi.broken = phi i64 [ %loop.phi, %Flow ], [ 0, %bb ]
+; OPT-NEXT: %phi.broken = phi i64 [ %0, %Flow ], [ 0, %bb ]
 ; OPT-NEXT: %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ]
 ; OPT-NOT: call
 ; OPT: br i1 %cmp0, label %bb4, label %Flow
@@ -227,17 +229,17 @@ bb9:                                              ; preds = %Flow
 ; OPT: bb4:
 ; OPT-NEXT: %load = load volatile i32, i32 addrspace(1)* undef, align 4
 ; OPT-NEXT: %cmp1 = icmp sge i32 %tmp, %load
-; OPT-NEXT: %0 = call i64 @llvm.amdgcn.if.break(i1 %cmp1, i64 %phi.broken)
 ; OPT-NEXT: br label %Flow
 
 ; OPT: Flow:
-; OPT-NEXT: %loop.phi = phi i64 [ %0, %bb4 ], [ %phi.broken, %bb1 ]
 ; OPT-NEXT: %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ]
-; OPT-NEXT: %1 = call i1 @llvm.amdgcn.loop(i64 %loop.phi)
+; OPT-NEXT: %tmp3 = phi i1 [ %cmp1, %bb4 ], [ false, %bb1 ]
+; OPT-NEXT: %0 = call i64 @llvm.amdgcn.if.break(i1 %tmp3, i64 %phi.broken)
+; OPT-NEXT: %1 = call i1 @llvm.amdgcn.loop(i64 %0)
 ; OPT-NEXT: br i1 %1, label %bb9, label %bb1
 
 ; OPT: bb9:                                              ; preds = %Flow
-; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %loop.phi)
+; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %0)
 ; OPT-NEXT: store volatile i32 7
 ; OPT-NEXT: ret void
 define amdgpu_kernel void @false_phi_cond_break_loop(i32 %arg) #0 {
diff --git a/test/CodeGen/AMDGPU/multilevel-break.ll b/test/CodeGen/AMDGPU/multilevel-break.ll
index 216ca1973b5..c4e2f1e3487 100644
--- a/test/CodeGen/AMDGPU/multilevel-break.ll
+++ b/test/CodeGen/AMDGPU/multilevel-break.ll
@@ -10,11 +10,12 @@
 ;
 ; OPT: Flow:
 ;
-; Ensure two else.break calls, for both the inner and outer loops
+; Ensure two if.break calls, for both the inner and outer loops
 
-; OPT:        call i64 @llvm.amdgcn.else.break(i64 [[if_exec]],
-; OPT-NEXT:   call i64 @llvm.amdgcn.else.break(i64 [[if_exec]],
-; OPT-NEXT:   call void @llvm.amdgcn.end.cf
+; OPT:        call void @llvm.amdgcn.end.cf
+; OPT-NEXT:   call i64 @llvm.amdgcn.if.break(i1
+; OPT-NEXT:   call i1 @llvm.amdgcn.loop(i64
+; OPT-NEXT:   call i64 @llvm.amdgcn.if.break(i1
 ;
 ; OPT: Flow1:
 
@@ -30,10 +31,9 @@
 
 ; Ensure extra or eliminated
 ; GCN-NEXT: s_or_b64 exec, exec, [[SAVE_BREAK]]
-; GCN-NEXT: s_mov_b64
-; GCN-NEXT: s_and_b64 [[MASKED_SAVE_BREAK:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_BREAK]]
-; GCN-NEXT: s_or_b64 [[OR_BREAK:s\[[0-9]+:[0-9]+\]]], [[MASKED_SAVE_BREAK]], s{{\[[0-9]+:[0-9]+\]}}
-; TODO: get rid of redundant loop counter moves
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
+; GCN-NEXT: s_or_b64 [[OR_BREAK:s\[[0-9]+:[0-9]+\]]], vcc, s{{\[[0-9]+:[0-9]+\]}}
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
 ; GCN-NEXT: v_mov_b32_e32
 ; GCN-NEXT: s_andn2_b64 exec, exec, [[OR_BREAK]]
 ; GCN-NEXT: s_cbranch_execnz [[INNER_LOOP]]
@@ -43,8 +43,9 @@
 
 ; Ensure copy is eliminated
 ; GCN-NEXT: s_or_b64 exec, exec, [[OR_BREAK]]
-; GCN-NEXT: s_and_b64 [[MASKED2_SAVE_BREAK:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_BREAK]]
+; GCN-NEXT: s_and_b64 [[MASKED2_SAVE_BREAK:s\[[0-9]+:[0-9]+\]]], exec, vcc
 ; GCN-NEXT: s_or_b64 [[OUTER_OR_BREAK:s\[[0-9]+:[0-9]+\]]], [[MASKED2_SAVE_BREAK]], s{{\[[0-9]+:[0-9]+\]}}
+; GCN-NEXT: s_mov_b64
 ; GCN-NEXT: v_mov_b32_e32
 ; GCN-NEXT: s_andn2_b64 exec, exec, [[OUTER_OR_BREAK]]
 ; GCN-NEXT: s_cbranch_execnz [[OUTER_LOOP]]
@@ -71,9 +72,8 @@ ENDIF:                                            ; preds = %LOOP
 }
 
 ; OPT-LABEL: define amdgpu_kernel void @multi_if_break_loop(
-; OPT: llvm.amdgcn.break
-; OPT: llvm.amdgcn.loop
 ; OPT: llvm.amdgcn.if.break
+; OPT: llvm.amdgcn.loop
 ; OPT: llvm.amdgcn.if.break
 ; OPT: llvm.amdgcn.end.cf
 
@@ -82,9 +82,10 @@ ENDIF:                                            ; preds = %LOOP
 
 ; GCN: [[LOOP:BB[0-9]+_[0-9]+]]: ; %bb1{{$}}
 
-; Uses a copy intsead of an or
-; GCN: s_mov_b64 [[COPY:s\[[0-9]+:[0-9]+\]]], [[BREAK_REG]]
-; GCN: s_or_b64 [[BREAK_REG]], exec, [[BREAK_REG]]
+; GCN: s_or_b64 [[BREAK_REG]], vcc, [[BREAK_REG]]
+; GCN: s_andn2_b64 exec, exec, [[BREAK_REG]]
+; GCN-NEXT: s_cbranch_execnz
+
 define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 {
 bb:
   %id = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/test/CodeGen/AMDGPU/nested-loop-conditions.ll
index 8489a785310..a007c965f94 100644
--- a/test/CodeGen/AMDGPU/nested-loop-conditions.ll
+++ b/test/CodeGen/AMDGPU/nested-loop-conditions.ll
@@ -10,7 +10,7 @@
 ; IR-LABEL: @reduced_nested_loop_conditions(
 
 ; IR: bb5:
-; IR-NEXT: %phi.broken = phi i64 [ %loop.phi, %bb10 ], [ 0, %bb ]
+; IR-NEXT: %phi.broken = phi i64 [ %3, %bb10 ], [ 0, %bb ]
 ; IR-NEXT: %tmp6 = phi i32 [ 0, %bb ], [ %tmp11, %bb10 ]
 ; IR-NEXT: %tmp7 = icmp eq i32 %tmp6, 1
 ; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %tmp7)
@@ -19,25 +19,23 @@
 ; IR-NEXT: br i1 %1, label %bb8, label %Flow
 
 ; IR: bb8:
-; IR-NEXT: %3 = call i64 @llvm.amdgcn.break(i64 %phi.broken)
 ; IR-NEXT: br label %bb13
 
 ; IR: bb10:
-; IR-NEXT: %loop.phi = phi i64 [ %6, %Flow ]
-; IR-NEXT: %tmp11 = phi i32 [ %5, %Flow ]
-; IR-NEXT: %4 = call i1 @llvm.amdgcn.loop(i64 %loop.phi)
+; IR-NEXT: %tmp11 = phi i32 [ %6, %Flow ]
+; IR-NEXT: %tmp12 = phi i1 [ %5, %Flow ]
+; IR-NEXT: %3 = call i64 @llvm.amdgcn.if.break(i1 %tmp12, i64 %phi.broken)
+; IR-NEXT: %4 = call i1 @llvm.amdgcn.loop(i64 %3)
 ; IR-NEXT: br i1 %4, label %bb23, label %bb5
 
 ; IR: Flow:
-; IR-NEXT: %loop.phi1 = phi i64 [ %loop.phi2, %bb4 ], [ %phi.broken, %bb5 ]
-; IR-NEXT: %5 = phi i32 [ %tmp21, %bb4 ], [ undef, %bb5 ]
-; IR-NEXT: %6 = call i64 @llvm.amdgcn.else.break(i64 %2, i64 %loop.phi1)
+; IR-NEXT: %5 = phi i1 [ %tmp22, %bb4 ], [ true, %bb5 ]
+; IR-NEXT: %6 = phi i32 [ %tmp21, %bb4 ], [ undef, %bb5 ]
 ; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %2)
 ; IR-NEXT: br label %bb10
 
 ; IR: bb13:
-; IR-NEXT: %loop.phi3 = phi i64 [ %loop.phi4, %bb3 ], [ %3, %bb8 ]
-; IR-NEXT: %tmp14 = phi i1 [ false, %bb3 ], [ true, %bb8 ]
+; IR-NEXT: %tmp14 = phi i1 [ %tmp22, %bb3 ], [ true, %bb8 ]
 ; IR-NEXT: %tmp15 = bitcast i64 %tmp2 to <2 x i32>
 ; IR-NEXT: br i1 %tmp14, label %bb16, label %bb20
 
@@ -48,13 +46,12 @@
 ; IR-NEXT: br label %bb20
 
 ; IR: bb20:
-; IR-NEXT: %loop.phi4 = phi i64 [ %phi.broken, %bb16 ], [ %phi.broken, %bb13 ]
-; IR-NEXT: %loop.phi2 = phi i64 [ %phi.broken, %bb16 ], [ %loop.phi3, %bb13 ]
 ; IR-NEXT: %tmp21 = phi i32 [ %tmp19, %bb16 ], [ 0, %bb13 ]
+; IR-NEXT: %tmp22 = phi i1 [ false, %bb16 ], [ %tmp14, %bb13 ]
 ; IR-NEXT: br label %bb9
 
 ; IR: bb23:
-; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %loop.phi)
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %3)
 ; IR-NEXT: ret void
 
 ; GCN-LABEL: {{^}}reduced_nested_loop_conditions:
@@ -125,7 +122,7 @@ bb23:                                             ; preds = %bb10
 
 ; IR: Flow3:
 ; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %21)
-; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %13)
+; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %14)
 ; IR-NEXT: %1 = extractvalue { i1, i64 } %0, 0
 ; IR-NEXT: %2 = extractvalue { i1, i64 } %0, 1
 ; IR-NEXT: br i1 %1, label %bb4.bb13_crit_edge, label %Flow4
@@ -147,25 +144,24 @@ bb23:                                             ; preds = %bb10
 ; IR-NEXT: %8 = call { i1, i64 } @llvm.amdgcn.if(i1 %tmp15)
 
 ; IR: Flow1:
-; IR-NEXT: %loop.phi = phi i64 [ %18, %bb21 ], [ %phi.broken, %bb14 ]
 ; IR-NEXT: %11 = phi <4 x i32> [ %tmp9, %bb21 ], [ undef, %bb14 ]
 ; IR-NEXT: %12 = phi i32 [ %tmp10, %bb21 ], [ undef, %bb14 ]
-; IR-NEXT: %13 = phi i1 [ %17, %bb21 ], [ false, %bb14 ]
-; IR-NEXT: %14 = phi i1 [ false, %bb21 ], [ true, %bb14 ]
-; IR-NEXT: %15 = call i64 @llvm.amdgcn.else.break(i64 %10, i64 %loop.phi)
+; IR-NEXT: %13 = phi i1 [ %18, %bb21 ], [ true, %bb14 ]
+; IR-NEXT: %14 = phi i1 [ %18, %bb21 ], [ false, %bb14 ]
+; IR-NEXT: %15 = phi i1 [ false, %bb21 ], [ true, %bb14 ]
 ; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %10)
-; IR-NEXT: %16 = call i1 @llvm.amdgcn.loop(i64 %15)
-; IR-NEXT: br i1 %16, label %Flow2, label %bb14
+; IR-NEXT: %16 = call i64 @llvm.amdgcn.if.break(i1 %13, i64 %phi.broken)
+; IR-NEXT: %17 = call i1 @llvm.amdgcn.loop(i64 %16)
+; IR-NEXT: br i1 %17, label %Flow2, label %bb14
 
 ; IR: bb21:
 ; IR: %tmp12 = icmp slt i32 %tmp11, 9
-; IR-NEXT: %17 = xor i1 %tmp12, true
-; IR-NEXT: %18 = call i64 @llvm.amdgcn.if.break(i1 %17, i64 %phi.broken)
+; IR-NEXT: %18 = xor i1 %tmp12, true
 ; IR-NEXT: br label %Flow1
 
 ; IR: Flow2:
-; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %15)
-; IR-NEXT: %19 = call { i1, i64 } @llvm.amdgcn.if(i1 %14)
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %16)
+; IR-NEXT: %19 = call { i1, i64 } @llvm.amdgcn.if(i1 %15)
 ; IR-NEXT: %20 = extractvalue { i1, i64 } %19, 0
 ; IR-NEXT: %21 = extractvalue { i1, i64 } %19, 1
 ; IR-NEXT: br i1 %20, label %bb31.loopexit, label %Flow3
diff --git a/test/CodeGen/AMDGPU/valu-i1.ll b/test/CodeGen/AMDGPU/valu-i1.ll
index 58bd9a0cdef..3d980b749a9 100644
--- a/test/CodeGen/AMDGPU/valu-i1.ll
+++ b/test/CodeGen/AMDGPU/valu-i1.ll
@@ -212,20 +212,16 @@ exit:
 ; SI-DAG: v_cmp_ne_u32_e32 [[NEG1_CHECK_1:vcc]], -1, [[B]]
 ; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]]
 ; SI: s_and_saveexec_b64 [[ORNEG2:s\[[0-9]+:[0-9]+\]]], [[ORNEG1]]
-; SI: s_xor_b64 [[ORNEG3:s\[[0-9]+:[0-9]+\]]], exec, [[ORNEG2]]
 ; SI: s_cbranch_execz [[LABEL_FLOW:BB[0-9]+_[0-9]+]]
 
 ; SI: BB{{[0-9]+_[0-9]+}}: ; %bb20
 ; SI: buffer_store_dword
-; SI: v_cmp_ge_i64_e{{32|64}} [[CMP:s\[[0-9]+:[0-9]+\]|vcc]]
-; SI: s_or_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], [[CMP]], [[COND_STATE]]
 
 ; SI: [[LABEL_FLOW]]:
 ; SI-NEXT: ; in Loop: Header=[[LABEL_LOOP]]
-; SI-NEXT: s_or_b64 exec, exec, [[ORNEG3]]
-; SI-NEXT: s_mov_b64 [[MOVED_TMP:s\[[0-9]+:[0-9]+\]]], [[TMP]]
-; SI-NEXT: s_and_b64 [[MASKED_ORNEG3:s\[[0-9]+:[0-9]+\]]], exec, [[ORNEG3]]
-; SI-NEXT: s_or_b64 [[COND_STATE]], [[MASKED_ORNEG3]], [[MOVED_TMP]]
+; SI-NEXT: s_or_b64 exec, exec, [[ORNEG2]]
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
+; SI-NEXT: s_or_b64 [[COND_STATE]], vcc, [[COND_STATE]]
 ; SI-NEXT: s_andn2_b64 exec, exec, [[COND_STATE]]
 ; SI-NEXT: s_cbranch_execnz [[LABEL_LOOP]]
 
-- 
GitLab


From 776a459079f1e573f4013f208d46f5e1a11d4ce8 Mon Sep 17 00:00:00 2001
From: Nicolai Haehnle <nhaehnle@gmail.com>
Date: Wed, 31 Oct 2018 13:27:08 +0000
Subject: [PATCH 0799/1116] AMDGPU: Rewrite SILowerI1Copies to always stay on
 SALU

Summary:
Instead of writing boolean values temporarily into 32-bit VGPRs
if they are involved in PHIs or are observed from outside a loop,
we use bitwise masking operations to combine lane masks in a way
that is consistent with wave control flow.

Move SIFixSGPRCopies to before this pass, since that pass
incorrectly attempts to move SGPR phis to VGPRs.

This should recover most of the code quality that was lost with
the bug fix in "AMDGPU: Remove PHI loop condition optimization".

There are still some relevant cases where code quality could be
improved, in particular:

- We often introduce redundant masks with EXEC. Ideally, we'd
  have a generic computeKnownBits-like analysis to determine
  whether masks are already masked by EXEC, so we can avoid this
  masking both here and when lowering uniform control flow.

- The criterion we use to determine whether a def is observed
  from outside a loop is conservative: it doesn't check whether
  (loop) branch conditions are uniform.

Change-Id: Ibabdb373a7510e426b90deef00f5e16c5d56e64b

Reviewers: arsenm, rampitec, tpr

Subscribers: kzhuravl, jvesely, wdng, mgorny, yaxunl, dstuttard, t-tye, eraman, llvm-commits

Differential Revision: https://reviews.llvm.org/D53496

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345719 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/AMDGPUTargetMachine.cpp     |   2 +-
 lib/Target/AMDGPU/SIFixSGPRCopies.cpp         |   6 +-
 lib/Target/AMDGPU/SILowerI1Copies.cpp         | 830 ++++++++++++++++--
 .../AMDGPU/Utils/AMDGPULaneDominator.cpp      |  75 --
 lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h |  24 -
 lib/Target/AMDGPU/Utils/CMakeLists.txt        |   1 -
 test/CodeGen/AMDGPU/add_i1.ll                 |   4 +-
 test/CodeGen/AMDGPU/i1-copy-from-loop.ll      |  30 +-
 .../AMDGPU/i1-copy-phi-uniform-branch.ll      |  38 +
 test/CodeGen/AMDGPU/i1-copy-phi.ll            |  16 +-
 test/CodeGen/AMDGPU/inline-asm.ll             |   9 +-
 test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll   |  30 +-
 test/CodeGen/AMDGPU/loop_break.ll             |  33 +-
 .../AMDGPU/multi-divergent-exit-region.ll     |  47 +-
 test/CodeGen/AMDGPU/multilevel-break.ll       |  94 +-
 test/CodeGen/AMDGPU/select-opt.ll             |   1 -
 test/CodeGen/AMDGPU/sgpr-control-flow.ll      |  30 +-
 test/CodeGen/AMDGPU/si-annotate-cf.ll         |  49 +-
 test/CodeGen/AMDGPU/sub_i1.ll                 |   4 +-
 test/CodeGen/AMDGPU/valu-i1.ll                |  18 +-
 test/CodeGen/AMDGPU/waitcnt-looptest.ll       |   2 +-
 21 files changed, 1013 insertions(+), 330 deletions(-)
 delete mode 100644 lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp
 delete mode 100644 lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h
 create mode 100644 test/CodeGen/AMDGPU/i1-copy-phi-uniform-branch.ll

diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 6d39c254c73..48cde90a972 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -817,8 +817,8 @@ bool GCNPassConfig::addILPOpts() {
 
 bool GCNPassConfig::addInstSelector() {
   AMDGPUPassConfig::addInstSelector();
-  addPass(createSILowerI1CopiesPass());
   addPass(&SIFixSGPRCopiesID);
+  addPass(createSILowerI1CopiesPass());
   return false;
 }
 
diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 5324cbc912d..809f5bab469 100644
--- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -183,13 +183,15 @@ getCopyRegClasses(const MachineInstr &Copy,
 static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,
                              const TargetRegisterClass *DstRC,
                              const SIRegisterInfo &TRI) {
-  return TRI.isSGPRClass(DstRC) && TRI.hasVGPRs(SrcRC);
+  return SrcRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(DstRC) &&
+         TRI.hasVGPRs(SrcRC);
 }
 
 static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
                              const TargetRegisterClass *DstRC,
                              const SIRegisterInfo &TRI) {
-  return TRI.isSGPRClass(SrcRC) && TRI.hasVGPRs(DstRC);
+  return DstRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(SrcRC) &&
+         TRI.hasVGPRs(DstRC);
 }
 
 static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI,
diff --git a/lib/Target/AMDGPU/SILowerI1Copies.cpp b/lib/Target/AMDGPU/SILowerI1Copies.cpp
index ecc6cff407e..eb038bb5d5f 100644
--- a/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -5,37 +5,61 @@
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
-/// i1 values are usually inserted by the CFG Structurize pass and they are
-/// unique in that they can be copied from VALU to SALU registers.
-/// This is not possible for any other value type.  Since there are no
-/// MOV instructions for i1, we to use V_CMP_* and V_CNDMASK to move the i1.
-///
 //===----------------------------------------------------------------------===//
 //
+// This pass lowers all occurrences of i1 values (with a vreg_1 register class)
+// to lane masks (64-bit scalar registers). The pass assumes machine SSA form
+// and a wave-level control flow graph.
+//
+// Before this pass, values that are semantically i1 and are defined and used
+// within the same basic block are already represented as lane masks in scalar
+// registers. However, values that cross basic blocks are always transferred
+// between basic blocks in vreg_1 virtual registers and are lowered by this
+// pass.
+//
+// The only instructions that use or define vreg_1 virtual registers are COPY,
+// PHI, and IMPLICIT_DEF.
+//
+//===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "si-i1-copies"
 #include "AMDGPU.h"
 #include "AMDGPUSubtarget.h"
-#include "SIInstrInfo.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "Utils/AMDGPULaneDominator.h"
-#include "llvm/CodeGen/LiveIntervals.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSSAUpdater.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Target/TargetMachine.h"
 
+#define DEBUG_TYPE "si-i1-copies"
+
 using namespace llvm;
 
+static unsigned createLaneMaskReg(MachineFunction &MF);
+static unsigned insertUndefLaneMask(MachineBasicBlock &MBB);
+
 namespace {
 
 class SILowerI1Copies : public MachineFunctionPass {
 public:
   static char ID;
 
+private:
+  MachineFunction *MF = nullptr;
+  MachineDominatorTree *DT = nullptr;
+  MachinePostDominatorTree *PDT = nullptr;
+  MachineRegisterInfo *MRI = nullptr;
+  const GCNSubtarget *ST = nullptr;
+  const SIInstrInfo *TII = nullptr;
+
+  DenseSet<unsigned> ConstrainRegs;
+
 public:
   SILowerI1Copies() : MachineFunctionPass(ID) {
     initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry());
@@ -47,14 +71,337 @@ public:
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
+    AU.addRequired<MachineDominatorTree>();
+    AU.addRequired<MachinePostDominatorTree>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
+
+private:
+  void lowerCopiesFromI1();
+  void lowerPhis();
+  void lowerCopiesToI1();
+  bool isConstantLaneMask(unsigned Reg, bool &Val) const;
+  void buildMergeLaneMasks(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator I, const DebugLoc &DL,
+                           unsigned DstReg, unsigned PrevReg, unsigned CurReg);
+  MachineBasicBlock::iterator
+  getSaluInsertionAtEnd(MachineBasicBlock &MBB) const;
+
+  bool isLaneMaskReg(unsigned Reg) const {
+    return TII->getRegisterInfo().isSGPRReg(*MRI, Reg) &&
+           TII->getRegisterInfo().getRegSizeInBits(Reg, *MRI) ==
+               ST->getWavefrontSize();
+  }
+};
+
+/// Helper class that determines the relationship between incoming values of a
+/// phi in the control flow graph to determine where an incoming value can
+/// simply be taken as a scalar lane mask as-is, and where it needs to be
+/// merged with another, previously defined lane mask.
+///
+/// The approach is as follows:
+///  - Determine all basic blocks which, starting from the incoming blocks,
+///    a wave may reach before entering the def block (the block containing the
+///    phi).
+///  - If an incoming block has no predecessors in this set, we can take the
+///    incoming value as a scalar lane mask as-is.
+///  -- A special case of this is when the def block has a self-loop.
+///  - Otherwise, the incoming value needs to be merged with a previously
+///    defined lane mask.
+///  - If there is a path into the set of reachable blocks that does _not_ go
+///    through an incoming block where we can take the scalar lane mask as-is,
+///    we need to invent an available value for the SSAUpdater. Choices are
+///    0 and undef, with differing consequences for how to merge values etc.
+///
+/// TODO: We could use region analysis to quickly skip over SESE regions during
+///       the traversal.
+///
+class PhiIncomingAnalysis {
+  MachinePostDominatorTree &PDT;
+
+  // For each reachable basic block, whether it is a source in the induced
+  // subgraph of the CFG.
+  DenseMap<MachineBasicBlock *, bool> ReachableMap;
+  SmallVector<MachineBasicBlock *, 4> ReachableOrdered;
+  SmallVector<MachineBasicBlock *, 4> Stack;
+  SmallVector<MachineBasicBlock *, 4> Predecessors;
+
+public:
+  PhiIncomingAnalysis(MachinePostDominatorTree &PDT) : PDT(PDT) {}
+
+  /// Returns whether \p MBB is a source in the induced subgraph of reachable
+  /// blocks.
+  bool isSource(MachineBasicBlock &MBB) const {
+    return ReachableMap.find(&MBB)->second;
+  }
+
+  ArrayRef<MachineBasicBlock *> predecessors() const { return Predecessors; }
+
+  void analyze(MachineBasicBlock &DefBlock,
+               ArrayRef<MachineBasicBlock *> IncomingBlocks) {
+    assert(Stack.empty());
+    ReachableMap.clear();
+    ReachableOrdered.clear();
+    Predecessors.clear();
+
+    // Insert the def block first, so that it acts as an end point for the
+    // traversal.
+    ReachableMap.try_emplace(&DefBlock, false);
+    ReachableOrdered.push_back(&DefBlock);
+
+    for (MachineBasicBlock *MBB : IncomingBlocks) {
+      if (MBB == &DefBlock) {
+        ReachableMap[&DefBlock] = true; // self-loop on DefBlock
+        continue;
+      }
+
+      ReachableMap.try_emplace(MBB, false);
+      ReachableOrdered.push_back(MBB);
+
+      // If this block has a divergent terminator and the def block is its
+      // post-dominator, the wave may first visit the other successors.
+      bool Divergent = false;
+      for (MachineInstr &MI : MBB->terminators()) {
+        if (MI.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO ||
+            MI.getOpcode() == AMDGPU::SI_IF ||
+            MI.getOpcode() == AMDGPU::SI_ELSE ||
+            MI.getOpcode() == AMDGPU::SI_LOOP) {
+          Divergent = true;
+          break;
+        }
+      }
+
+      if (Divergent && PDT.dominates(&DefBlock, MBB)) {
+        for (MachineBasicBlock *Succ : MBB->successors())
+          Stack.push_back(Succ);
+      }
+    }
+
+    while (!Stack.empty()) {
+      MachineBasicBlock *MBB = Stack.pop_back_val();
+      if (!ReachableMap.try_emplace(MBB, false).second)
+        continue;
+      ReachableOrdered.push_back(MBB);
+
+      for (MachineBasicBlock *Succ : MBB->successors())
+        Stack.push_back(Succ);
+    }
+
+    for (MachineBasicBlock *MBB : ReachableOrdered) {
+      bool HaveReachablePred = false;
+      for (MachineBasicBlock *Pred : MBB->predecessors()) {
+        if (ReachableMap.count(Pred)) {
+          HaveReachablePred = true;
+        } else {
+          Stack.push_back(Pred);
+        }
+      }
+      if (!HaveReachablePred)
+        ReachableMap[MBB] = true;
+      if (HaveReachablePred) {
+        for (MachineBasicBlock *UnreachablePred : Stack) {
+          if (llvm::find(Predecessors, UnreachablePred) == Predecessors.end())
+            Predecessors.push_back(UnreachablePred);
+        }
+      }
+      Stack.clear();
+    }
+  }
+};
+
+/// Helper class that detects loops which require us to lower an i1 COPY into
+/// bitwise manipulation.
+///
+/// Unfortunately, we cannot use LoopInfo because LoopInfo does not distinguish
+/// between loops with the same header. Consider this example:
+///
+///  A-+-+
+///  | | |
+///  B-+ |
+///  |   |
+///  C---+
+///
+/// A is the header of a loop containing A, B, and C as far as LoopInfo is
+/// concerned. However, an i1 COPY in B that is used in C must be lowered to
+/// bitwise operations to combine results from different loop iterations when
+/// B has a divergent branch (since by default we will compile this code such
+/// that threads in a wave are merged at the entry of C).
+///
+/// The following rule is implemented to determine whether bitwise operations
+/// are required: use the bitwise lowering for a def in block B if a backward
+/// edge to B is reachable without going through the nearest common
+/// post-dominator of B and all uses of the def.
+///
+/// TODO: This rule is conservative because it does not check whether the
+///       relevant branches are actually divergent.
+///
+/// The class is designed to cache the CFG traversal so that it can be re-used
+/// for multiple defs within the same basic block.
+///
+/// TODO: We could use region analysis to quickly skip over SESE regions during
+///       the traversal.
+///
+class LoopFinder {
+  MachineDominatorTree &DT;
+  MachinePostDominatorTree &PDT;
+
+  // All visited / reachable block, tagged by level (level 0 is the def block,
+  // level 1 are all blocks reachable including but not going through the def
+  // block's IPDOM, etc.).
+  DenseMap<MachineBasicBlock *, unsigned> Visited;
+
+  // Nearest common dominator of all visited blocks by level (level 0 is the
+  // def block). Used for seeding the SSAUpdater.
+  SmallVector<MachineBasicBlock *, 4> CommonDominators;
+
+  // Post-dominator of all visited blocks.
+  MachineBasicBlock *VisitedPostDom = nullptr;
+
+  // Level at which a loop was found: 0 is not possible; 1 = a backward edge is
+  // reachable without going through the IPDOM of the def block (if the IPDOM
+  // itself has an edge to the def block, the loop level is 2), etc.
+  unsigned FoundLoopLevel = ~0u;
+
+  MachineBasicBlock *DefBlock = nullptr;
+  SmallVector<MachineBasicBlock *, 4> Stack;
+  SmallVector<MachineBasicBlock *, 4> NextLevel;
+
+public:
+  LoopFinder(MachineDominatorTree &DT, MachinePostDominatorTree &PDT)
+      : DT(DT), PDT(PDT) {}
+
+  void initialize(MachineBasicBlock &MBB) {
+    Visited.clear();
+    CommonDominators.clear();
+    Stack.clear();
+    NextLevel.clear();
+    VisitedPostDom = nullptr;
+    FoundLoopLevel = ~0u;
+
+    DefBlock = &MBB;
+  }
+
+  /// Check whether a backward edge can be reached without going through the
+  /// given \p PostDom of the def block.
+  ///
+  /// Return the level of \p PostDom if a loop was found, or 0 otherwise.
+  unsigned findLoop(MachineBasicBlock *PostDom) {
+    MachineDomTreeNode *PDNode = PDT.getNode(DefBlock);
+
+    if (!VisitedPostDom)
+      advanceLevel();
+
+    unsigned Level = 0;
+    while (PDNode->getBlock() != PostDom) {
+      if (PDNode->getBlock() == VisitedPostDom)
+        advanceLevel();
+      PDNode = PDNode->getIDom();
+      Level++;
+      if (FoundLoopLevel == Level)
+        return Level;
+    }
+
+    return 0;
+  }
+
+  /// Add undef values dominating the loop and the optionally given additional
+  /// blocks, so that the SSA updater doesn't have to search all the way to the
+  /// function entry.
+  void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater,
+                      ArrayRef<MachineBasicBlock *> Blocks = {}) {
+    assert(LoopLevel < CommonDominators.size());
+
+    MachineBasicBlock *Dom = CommonDominators[LoopLevel];
+    for (MachineBasicBlock *MBB : Blocks)
+      Dom = DT.findNearestCommonDominator(Dom, MBB);
+
+    if (!inLoopLevel(*Dom, LoopLevel, Blocks)) {
+      SSAUpdater.AddAvailableValue(Dom, insertUndefLaneMask(*Dom));
+    } else {
+      // The dominator is part of the loop or the given blocks, so add the
+      // undef value to unreachable predecessors instead.
+      for (MachineBasicBlock *Pred : Dom->predecessors()) {
+        if (!inLoopLevel(*Pred, LoopLevel, Blocks))
+          SSAUpdater.AddAvailableValue(Pred, insertUndefLaneMask(*Pred));
+      }
+    }
+  }
+
+private:
+  bool inLoopLevel(MachineBasicBlock &MBB, unsigned LoopLevel,
+                   ArrayRef<MachineBasicBlock *> Blocks) const {
+    auto DomIt = Visited.find(&MBB);
+    if (DomIt != Visited.end() && DomIt->second <= LoopLevel)
+      return true;
+
+    if (llvm::find(Blocks, &MBB) != Blocks.end())
+      return true;
+
+    return false;
+  }
+
+  void advanceLevel() {
+    MachineBasicBlock *VisitedDom;
+
+    if (!VisitedPostDom) {
+      VisitedPostDom = DefBlock;
+      VisitedDom = DefBlock;
+      Stack.push_back(DefBlock);
+    } else {
+      VisitedPostDom = PDT.getNode(VisitedPostDom)->getIDom()->getBlock();
+      VisitedDom = CommonDominators.back();
+
+      for (unsigned i = 0; i < NextLevel.size();) {
+        if (PDT.dominates(VisitedPostDom, NextLevel[i])) {
+          Stack.push_back(NextLevel[i]);
+
+          NextLevel[i] = NextLevel.back();
+          NextLevel.pop_back();
+        } else {
+          i++;
+        }
+      }
+    }
+
+    unsigned Level = CommonDominators.size();
+    while (!Stack.empty()) {
+      MachineBasicBlock *MBB = Stack.pop_back_val();
+      if (!PDT.dominates(VisitedPostDom, MBB))
+        NextLevel.push_back(MBB);
+
+      Visited[MBB] = Level;
+      VisitedDom = DT.findNearestCommonDominator(VisitedDom, MBB);
+
+      for (MachineBasicBlock *Succ : MBB->successors()) {
+        if (Succ == DefBlock) {
+          if (MBB == VisitedPostDom)
+            FoundLoopLevel = std::min(FoundLoopLevel, Level + 1);
+          else
+            FoundLoopLevel = std::min(FoundLoopLevel, Level);
+          continue;
+        }
+
+        if (Visited.try_emplace(Succ, ~0u).second) {
+          if (MBB == VisitedPostDom)
+            NextLevel.push_back(Succ);
+          else
+            Stack.push_back(Succ);
+        }
+      }
+    }
+
+    CommonDominators.push_back(VisitedDom);
+  }
 };
 
 } // End anonymous namespace.
 
-INITIALIZE_PASS(SILowerI1Copies, DEBUG_TYPE,
-                "SI Lower i1 Copies", false, false)
+INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
+                      false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
+                    false)
 
 char SILowerI1Copies::ID = 0;
 
@@ -64,104 +411,415 @@ FunctionPass *llvm::createSILowerI1CopiesPass() {
   return new SILowerI1Copies();
 }
 
-bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
+static unsigned createLaneMaskReg(MachineFunction &MF) {
   MachineRegisterInfo &MRI = MF.getRegInfo();
+  return MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+}
+
+static unsigned insertUndefLaneMask(MachineBasicBlock &MBB) {
+  MachineFunction &MF = *MBB.getParent();
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
-  const TargetRegisterInfo *TRI = &TII->getRegisterInfo();
+  unsigned UndefReg = createLaneMaskReg(MF);
+  BuildMI(MBB, MBB.getFirstTerminator(), {}, TII->get(AMDGPU::IMPLICIT_DEF),
+          UndefReg);
+  return UndefReg;
+}
 
-  std::vector<unsigned> I1Defs;
+/// Lower all instructions that def or use vreg_1 registers.
+///
+/// In a first pass, we lower COPYs from vreg_1 to vector registers, as can
+/// occur around inline assembly. We do this first, before vreg_1 registers
+/// are changed to scalar mask registers.
+///
+/// Then we lower all defs of vreg_1 registers. Phi nodes are lowered before
+/// all others, because phi lowering looks through copies and can therefore
+/// often make copy lowering unnecessary.
+bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) {
+  MF = &TheMF;
+  MRI = &MF->getRegInfo();
+  DT = &getAnalysis<MachineDominatorTree>();
+  PDT = &getAnalysis<MachinePostDominatorTree>();
 
-  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
-                                                  BI != BE; ++BI) {
+  ST = &MF->getSubtarget<GCNSubtarget>();
+  TII = ST->getInstrInfo();
 
-    MachineBasicBlock &MBB = *BI;
-    MachineBasicBlock::iterator I, Next;
-    for (I = MBB.begin(); I != MBB.end(); I = Next) {
-      Next = std::next(I);
-      MachineInstr &MI = *I;
+  lowerCopiesFromI1();
+  lowerPhis();
+  lowerCopiesToI1();
 
-      if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF) {
-        unsigned Reg = MI.getOperand(0).getReg();
-        const TargetRegisterClass *RC = MRI.getRegClass(Reg);
-        if (RC == &AMDGPU::VReg_1RegClass)
-          MRI.setRegClass(Reg, &AMDGPU::SReg_64RegClass);
-        continue;
-      }
+  for (unsigned Reg : ConstrainRegs)
+    MRI->constrainRegClass(Reg, &AMDGPU::SReg_64_XEXECRegClass);
+  ConstrainRegs.clear();
+
+  return true;
+}
 
+void SILowerI1Copies::lowerCopiesFromI1() {
+  SmallVector<MachineInstr *, 4> DeadCopies;
+
+  for (MachineBasicBlock &MBB : *MF) {
+    for (MachineInstr &MI : MBB) {
       if (MI.getOpcode() != AMDGPU::COPY)
         continue;
 
-      const MachineOperand &Dst = MI.getOperand(0);
-      const MachineOperand &Src = MI.getOperand(1);
-
-      if (!TargetRegisterInfo::isVirtualRegister(Src.getReg()) ||
-          !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
+      unsigned DstReg = MI.getOperand(0).getReg();
+      unsigned SrcReg = MI.getOperand(1).getReg();
+      if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
+          MRI->getRegClass(SrcReg) != &AMDGPU::VReg_1RegClass)
         continue;
 
-      const TargetRegisterClass *DstRC = MRI.getRegClass(Dst.getReg());
-      const TargetRegisterClass *SrcRC = MRI.getRegClass(Src.getReg());
+      if (isLaneMaskReg(DstReg) ||
+          (TargetRegisterInfo::isVirtualRegister(DstReg) &&
+           MRI->getRegClass(DstReg) == &AMDGPU::VReg_1RegClass))
+        continue;
 
+      // Copy into a 32-bit vector register.
+      LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI);
       DebugLoc DL = MI.getDebugLoc();
-      MachineInstr *DefInst = MRI.getUniqueVRegDef(Src.getReg());
-      if (DstRC == &AMDGPU::VReg_1RegClass &&
-          TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) {
-        I1Defs.push_back(Dst.getReg());
-
-        if (DefInst->getOpcode() == AMDGPU::S_MOV_B64) {
-          if (DefInst->getOperand(1).isImm()) {
-            I1Defs.push_back(Dst.getReg());
-
-            int64_t Val = DefInst->getOperand(1).getImm();
-            assert(Val == 0 || Val == -1);
-
-            BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_MOV_B32_e32))
-                .add(Dst)
-                .addImm(Val);
-            MI.eraseFromParent();
-            continue;
+
+      assert(TII->getRegisterInfo().getRegSizeInBits(DstReg, *MRI) == 32);
+      assert(!MI.getOperand(0).getSubReg());
+
+      ConstrainRegs.insert(SrcReg);
+      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+          .addImm(0)
+          .addImm(-1)
+          .addReg(SrcReg);
+      DeadCopies.push_back(&MI);
+    }
+
+    for (MachineInstr *MI : DeadCopies)
+      MI->eraseFromParent();
+    DeadCopies.clear();
+  }
+}
+
+void SILowerI1Copies::lowerPhis() {
+  MachineSSAUpdater SSAUpdater(*MF);
+  LoopFinder LF(*DT, *PDT);
+  PhiIncomingAnalysis PIA(*PDT);
+  SmallVector<MachineInstr *, 4> DeadPhis;
+  SmallVector<MachineBasicBlock *, 4> IncomingBlocks;
+  SmallVector<unsigned, 4> IncomingRegs;
+  SmallVector<unsigned, 4> IncomingUpdated;
+
+  for (MachineBasicBlock &MBB : *MF) {
+    LF.initialize(MBB);
+
+    for (MachineInstr &MI : MBB.phis()) {
+      unsigned DstReg = MI.getOperand(0).getReg();
+      if (MRI->getRegClass(DstReg) != &AMDGPU::VReg_1RegClass)
+        continue;
+
+      LLVM_DEBUG(dbgs() << "Lower PHI: " << MI);
+
+      MRI->setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
+
+      // Collect incoming values.
+      for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
+        assert(i + 1 < MI.getNumOperands());
+        unsigned IncomingReg = MI.getOperand(i).getReg();
+        MachineBasicBlock *IncomingMBB = MI.getOperand(i + 1).getMBB();
+        MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg);
+
+        if (IncomingDef->getOpcode() == AMDGPU::COPY) {
+          IncomingReg = IncomingDef->getOperand(1).getReg();
+          assert(isLaneMaskReg(IncomingReg));
+          assert(!IncomingDef->getOperand(1).getSubReg());
+        } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) {
+          continue;
+        } else {
+          assert(IncomingDef->isPHI());
+        }
+
+        IncomingBlocks.push_back(IncomingMBB);
+        IncomingRegs.push_back(IncomingReg);
+      }
+
+      // Phis in a loop that are observed outside the loop receive a simple but
+      // conservatively correct treatment.
+      MachineBasicBlock *PostDomBound = &MBB;
+      for (MachineInstr &Use : MRI->use_instructions(DstReg)) {
+        PostDomBound =
+            PDT->findNearestCommonDominator(PostDomBound, Use.getParent());
+      }
+
+      unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
+
+      SSAUpdater.Initialize(DstReg);
+
+      if (FoundLoopLevel) {
+        LF.addLoopEntries(FoundLoopLevel, SSAUpdater, IncomingBlocks);
+
+        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
+          IncomingUpdated.push_back(createLaneMaskReg(*MF));
+          SSAUpdater.AddAvailableValue(IncomingBlocks[i],
+                                       IncomingUpdated.back());
+        }
+
+        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
+          MachineBasicBlock &IMBB = *IncomingBlocks[i];
+          buildMergeLaneMasks(
+              IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
+              SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
+        }
+      } else {
+        // The phi is not observed from outside a loop. Use a more accurate
+        // lowering.
+        PIA.analyze(MBB, IncomingBlocks);
+
+        for (MachineBasicBlock *MBB : PIA.predecessors())
+          SSAUpdater.AddAvailableValue(MBB, insertUndefLaneMask(*MBB));
+
+        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
+          MachineBasicBlock &IMBB = *IncomingBlocks[i];
+          if (PIA.isSource(IMBB)) {
+            IncomingUpdated.push_back(0);
+            SSAUpdater.AddAvailableValue(&IMBB, IncomingRegs[i]);
+          } else {
+            IncomingUpdated.push_back(createLaneMaskReg(*MF));
+            SSAUpdater.AddAvailableValue(&IMBB, IncomingUpdated.back());
           }
         }
 
-        unsigned int TmpSrc = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
-        BuildMI(MBB, &MI, DL, TII->get(AMDGPU::COPY), TmpSrc)
-            .add(Src);
-        BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64))
-            .add(Dst)
-            .addImm(0)
-            .addImm(-1)
-            .addReg(TmpSrc);
-        MI.eraseFromParent();
-      } else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) &&
-                 SrcRC == &AMDGPU::VReg_1RegClass) {
-        if (DefInst->getOpcode() == AMDGPU::V_CNDMASK_B32_e64 &&
-            DefInst->getOperand(1).isImm() && DefInst->getOperand(2).isImm() &&
-            DefInst->getOperand(1).getImm() == 0 &&
-            DefInst->getOperand(2).getImm() != 0 &&
-            DefInst->getOperand(3).isReg() &&
-            TargetRegisterInfo::isVirtualRegister(
-              DefInst->getOperand(3).getReg()) &&
-            TRI->getCommonSubClass(
-              MRI.getRegClass(DefInst->getOperand(3).getReg()),
-              &AMDGPU::SGPR_64RegClass) &&
-            AMDGPU::laneDominates(DefInst->getParent(), &MBB)) {
-          BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64))
-              .add(Dst)
-              .addReg(AMDGPU::EXEC)
-              .add(DefInst->getOperand(3));
-        } else {
-          BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64))
-              .add(Dst)
-              .add(Src)
-              .addImm(0);
+        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
+          if (!IncomingUpdated[i])
+            continue;
+
+          MachineBasicBlock &IMBB = *IncomingBlocks[i];
+          buildMergeLaneMasks(
+              IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
+              SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
         }
-        MI.eraseFromParent();
+      }
+
+      unsigned NewReg = SSAUpdater.GetValueInMiddleOfBlock(&MBB);
+      if (NewReg != DstReg) {
+        MRI->replaceRegWith(NewReg, DstReg);
+
+        // Ensure that DstReg has a single def and mark the old PHI node for
+        // deletion.
+        MI.getOperand(0).setReg(NewReg);
+        DeadPhis.push_back(&MI);
+      }
+
+      IncomingBlocks.clear();
+      IncomingRegs.clear();
+      IncomingUpdated.clear();
+    }
+
+    for (MachineInstr *MI : DeadPhis)
+      MI->eraseFromParent();
+    DeadPhis.clear();
+  }
+}
+
+void SILowerI1Copies::lowerCopiesToI1() {
+  MachineSSAUpdater SSAUpdater(*MF);
+  LoopFinder LF(*DT, *PDT);
+  SmallVector<MachineInstr *, 4> DeadCopies;
+
+  for (MachineBasicBlock &MBB : *MF) {
+    LF.initialize(MBB);
+
+    for (MachineInstr &MI : MBB) {
+      if (MI.getOpcode() != AMDGPU::IMPLICIT_DEF &&
+          MI.getOpcode() != AMDGPU::COPY)
+        continue;
+
+      unsigned DstReg = MI.getOperand(0).getReg();
+      if (!TargetRegisterInfo::isVirtualRegister(DstReg) ||
+          MRI->getRegClass(DstReg) != &AMDGPU::VReg_1RegClass)
+        continue;
+
+      if (MRI->use_empty(DstReg)) {
+        DeadCopies.push_back(&MI);
+        continue;
+      }
+
+      LLVM_DEBUG(dbgs() << "Lower Other: " << MI);
+
+      MRI->setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
+      if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF)
+        continue;
+
+      DebugLoc DL = MI.getDebugLoc();
+      unsigned SrcReg = MI.getOperand(1).getReg();
+      assert(!MI.getOperand(1).getSubReg());
+
+      if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
+          !isLaneMaskReg(SrcReg)) {
+        assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
+        unsigned TmpReg = createLaneMaskReg(*MF);
+        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)
+            .addReg(SrcReg)
+            .addImm(0);
+        MI.getOperand(1).setReg(TmpReg);
+        SrcReg = TmpReg;
+      }
+
+      // Defs in a loop that are observed outside the loop must be transformed
+      // into appropriate bit manipulation.
+      MachineBasicBlock *PostDomBound = &MBB;
+      for (MachineInstr &Use : MRI->use_instructions(DstReg)) {
+        PostDomBound =
+            PDT->findNearestCommonDominator(PostDomBound, Use.getParent());
+      }
+
+      unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
+      if (FoundLoopLevel) {
+        SSAUpdater.Initialize(DstReg);
+        SSAUpdater.AddAvailableValue(&MBB, DstReg);
+        LF.addLoopEntries(FoundLoopLevel, SSAUpdater);
+
+        buildMergeLaneMasks(MBB, MI, DL, DstReg,
+                            SSAUpdater.GetValueInMiddleOfBlock(&MBB), SrcReg);
+        DeadCopies.push_back(&MI);
       }
     }
+
+    for (MachineInstr *MI : DeadCopies)
+      MI->eraseFromParent();
+    DeadCopies.clear();
   }
+}
 
-  for (unsigned Reg : I1Defs)
-    MRI.setRegClass(Reg, &AMDGPU::VGPR_32RegClass);
+bool SILowerI1Copies::isConstantLaneMask(unsigned Reg, bool &Val) const {
+  const MachineInstr *MI;
+  for (;;) {
+    MI = MRI->getUniqueVRegDef(Reg);
+    if (MI->getOpcode() != AMDGPU::COPY)
+      break;
+
+    Reg = MI->getOperand(1).getReg();
+    if (!TargetRegisterInfo::isVirtualRegister(Reg))
+      return false;
+    if (!isLaneMaskReg(Reg))
+      return false;
+  }
+
+  if (MI->getOpcode() != AMDGPU::S_MOV_B64)
+    return false;
+
+  if (!MI->getOperand(1).isImm())
+    return false;
+
+  int64_t Imm = MI->getOperand(1).getImm();
+  if (Imm == 0) {
+    Val = false;
+    return true;
+  }
+  if (Imm == -1) {
+    Val = true;
+    return true;
+  }
 
   return false;
 }
+
+static void instrDefsUsesSCC(const MachineInstr &MI, bool &Def, bool &Use) {
+  Def = false;
+  Use = false;
+
+  for (const MachineOperand &MO : MI.operands()) {
+    if (MO.isReg() && MO.getReg() == AMDGPU::SCC) {
+      if (MO.isUse())
+        Use = true;
+      else
+        Def = true;
+    }
+  }
+}
+
+/// Return a point at the end of the given \p MBB to insert SALU instructions
+/// for lane mask calculation. Take terminators and SCC into account.
+MachineBasicBlock::iterator
+SILowerI1Copies::getSaluInsertionAtEnd(MachineBasicBlock &MBB) const {
+  auto InsertionPt = MBB.getFirstTerminator();
+  bool TerminatorsUseSCC = false;
+  for (auto I = InsertionPt, E = MBB.end(); I != E; ++I) {
+    bool DefsSCC;
+    instrDefsUsesSCC(*I, DefsSCC, TerminatorsUseSCC);
+    if (TerminatorsUseSCC || DefsSCC)
+      break;
+  }
+
+  if (!TerminatorsUseSCC)
+    return InsertionPt;
+
+  while (InsertionPt != MBB.begin()) {
+    InsertionPt--;
+
+    bool DefSCC, UseSCC;
+    instrDefsUsesSCC(*InsertionPt, DefSCC, UseSCC);
+    if (DefSCC)
+      return InsertionPt;
+  }
+
+  // We should have at least seen an IMPLICIT_DEF or COPY
+  llvm_unreachable("SCC used by terminator but no def in block");
+}
+
+void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator I,
+                                          const DebugLoc &DL, unsigned DstReg,
+                                          unsigned PrevReg, unsigned CurReg) {
+  bool PrevVal;
+  bool PrevConstant = isConstantLaneMask(PrevReg, PrevVal);
+  bool CurVal;
+  bool CurConstant = isConstantLaneMask(CurReg, CurVal);
+
+  if (PrevConstant && CurConstant) {
+    if (PrevVal == CurVal) {
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(CurReg);
+    } else if (CurVal) {
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(AMDGPU::EXEC);
+    } else {
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), DstReg)
+          .addReg(AMDGPU::EXEC)
+          .addImm(-1);
+    }
+    return;
+  }
+
+  unsigned PrevMaskedReg = 0;
+  unsigned CurMaskedReg = 0;
+  if (!PrevConstant) {
+    if (CurConstant && CurVal) {
+      PrevMaskedReg = PrevReg;
+    } else {
+      PrevMaskedReg = createLaneMaskReg(*MF);
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ANDN2_B64), PrevMaskedReg)
+          .addReg(PrevReg)
+          .addReg(AMDGPU::EXEC);
+    }
+  }
+  if (!CurConstant) {
+    // TODO: check whether CurReg is already masked by EXEC
+    if (PrevConstant && PrevVal) {
+      CurMaskedReg = CurReg;
+    } else {
+      CurMaskedReg = createLaneMaskReg(*MF);
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_B64), CurMaskedReg)
+          .addReg(CurReg)
+          .addReg(AMDGPU::EXEC);
+    }
+  }
+
+  if (PrevConstant && !PrevVal) {
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
+        .addReg(CurMaskedReg);
+  } else if (CurConstant && !CurVal) {
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
+        .addReg(PrevMaskedReg);
+  } else if (PrevConstant && PrevVal) {
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ORN2_B64), DstReg)
+        .addReg(CurMaskedReg)
+        .addReg(AMDGPU::EXEC);
+  } else {
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_OR_B64), DstReg)
+        .addReg(PrevMaskedReg)
+        .addReg(CurMaskedReg ? CurMaskedReg : (unsigned)AMDGPU::EXEC);
+  }
+}
diff --git a/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp b/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp
deleted file mode 100644
index 1924f71f11c..00000000000
--- a/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp
+++ /dev/null
@@ -1,75 +0,0 @@
-//===-- AMDGPULaneDominator.cpp - Determine Lane Dominators ---------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// MBB A lane-dominates MBB B if
-// 1. A dominates B in the usual sense, i.e. every path from the entry to B
-//    goes through A, and
-// 2. whenever B executes, every active lane during that execution of B was
-//    also active during the most recent execution of A.
-//
-// The simplest example where A dominates B but does not lane-dominate it is
-// where A is a loop:
-//
-//     |
-//     +--+
-//     A  |
-//     +--+
-//     |
-//     B
-//
-// Unfortunately, the second condition is not fully captured by the control
-// flow graph when it is unstructured (as may happen when branch conditions are
-// uniform).
-//
-// The following replacement of the second condition is a conservative
-// approximation. It is an equivalent condition when the CFG is fully
-// structured:
-//
-// 2'. every cycle in the CFG that contains A also contains B.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPULaneDominator.h"
-
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-
-namespace llvm {
-
-namespace AMDGPU {
-
-// Given machine basic blocks A and B where A dominates B, check whether
-// A lane-dominates B.
-//
-// The check is conservative, i.e. there can be false-negatives.
-bool laneDominates(MachineBasicBlock *A, MachineBasicBlock *B) {
-  // Check whether A is reachable from itself without going through B.
-  DenseSet<MachineBasicBlock *> Reachable;
-  SmallVector<MachineBasicBlock *, 8> Stack;
-
-  Stack.push_back(A);
-  do {
-    MachineBasicBlock *MBB = Stack.back();
-    Stack.pop_back();
-
-    for (MachineBasicBlock *Succ : MBB->successors()) {
-      if (Succ == A)
-        return false;
-      if (Succ != B && Reachable.insert(Succ).second)
-        Stack.push_back(Succ);
-    }
-  } while (!Stack.empty());
-
-  return true;
-}
-
-} // namespace AMDGPU
-
-} // namespace llvm
diff --git a/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h b/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h
deleted file mode 100644
index 4f33a89a364..00000000000
--- a/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h
+++ /dev/null
@@ -1,24 +0,0 @@
-//===- AMDGPULaneDominator.h ------------------------------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H
-#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H
-
-namespace llvm {
-
-class MachineBasicBlock;
-
-namespace AMDGPU {
-
-bool laneDominates(MachineBasicBlock *MBBA, MachineBasicBlock *MBBB);
-
-} // end namespace AMDGPU
-} // end namespace llvm
-
-#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H
diff --git a/lib/Target/AMDGPU/Utils/CMakeLists.txt b/lib/Target/AMDGPU/Utils/CMakeLists.txt
index c5ed32e4682..01b80ebe8d3 100644
--- a/lib/Target/AMDGPU/Utils/CMakeLists.txt
+++ b/lib/Target/AMDGPU/Utils/CMakeLists.txt
@@ -2,5 +2,4 @@ add_llvm_library(LLVMAMDGPUUtils
   AMDGPUBaseInfo.cpp
   AMDKernelCodeTUtils.cpp
   AMDGPUAsmUtils.cpp
-  AMDGPULaneDominator.cpp
   )
diff --git a/test/CodeGen/AMDGPU/add_i1.ll b/test/CodeGen/AMDGPU/add_i1.ll
index fb3b69ca3bd..c5f7e3af5e3 100644
--- a/test/CodeGen/AMDGPU/add_i1.ll
+++ b/test/CodeGen/AMDGPU/add_i1.ll
@@ -21,8 +21,8 @@ define amdgpu_kernel void @add_var_imm_i1(i1 addrspace(1)* %out, i1 addrspace(1)
 }
 
 ; GCN-LABEL: {{^}}add_i1_cf:
-; GCN: v_cmp_ne_u32_e32 vcc, 0, {{v[0-9]+}}
-; GCN-NEXT: s_not_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc
+; GCN: ; %endif
+; GCN: s_not_b64
 define amdgpu_kernel void @add_i1_cf(i1 addrspace(1)* %out, i1 addrspace(1)* %a, i1 addrspace(1)* %b) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/test/CodeGen/AMDGPU/i1-copy-from-loop.ll b/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
index 84a2d3d3a7b..ae78a1ecf32 100644
--- a/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
+++ b/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
@@ -1,19 +1,25 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 ; SI-LABEL: {{^}}i1_copy_from_loop:
 ;
-; Cannot use an SGPR mask to copy %cc out of the loop, since the mask would
-; only contain the lanes that were active during the last loop iteration.
-;
 ; SI: ; %for.body
-; SI:      v_cmp_gt_u32_e64 [[SREG:s\[[0-9]+:[0-9]+\]]], 4,
-; SI:      v_cndmask_b32_e64 [[VREG:v[0-9]+]], 0, -1, [[SREG]]
-; SI-NEXT: s_cbranch_vccnz [[ENDIF:BB[0-9_]+]]
-; SI:      [[ENDIF]]:
-; SI-NOT:  [[VREG]]
-; SI:      ; %for.end
-; SI:      v_cmp_ne_u32_e32 vcc, 0, [[VREG]]
+; SI:      v_cmp_gt_u32_e64  [[CC_SREG:s\[[0-9]+:[0-9]+\]]], 4,
+; SI-DAG:  s_andn2_b64       [[CC_ACCUM:s\[[0-9]+:[0-9]+\]]], [[CC_ACCUM]], exec
+; SI-DAG:  s_and_b64         [[CC_MASK:s\[[0-9]+:[0-9]+\]]], [[CC_SREG]], exec
+; SI:      s_or_b64          [[CC_ACCUM]], [[CC_ACCUM]], [[CC_MASK]]
+
+; SI: ; %Flow1
+; SI:      s_or_b64          [[CC_ACCUM]], [[CC_ACCUM]], exec
+
+; SI: ; %Flow
+; SI-DAG:  s_andn2_b64       [[LCSSA_ACCUM:s\[[0-9]+:[0-9]+\]]], [[LCSSA_ACCUM]], exec
+; SI-DAG:  s_and_b64         [[CC_MASK2:s\[[0-9]+:[0-9]+\]]], [[CC_ACCUM]], exec
+; SI:      s_or_b64          [[LCSSA_ACCUM]], [[LCSSA_ACCUM]], [[CC_MASK2]]
+
+; SI: ; %for.end
+; SI:      s_and_saveexec_b64 {{s\[[0-9]+:[0-9]+\]}}, [[LCSSA_ACCUM]]
+
 define amdgpu_ps void @i1_copy_from_loop(<4 x i32> inreg %rsrc, i32 %tid) {
 entry:
   br label %for.body
diff --git a/test/CodeGen/AMDGPU/i1-copy-phi-uniform-branch.ll b/test/CodeGen/AMDGPU/i1-copy-phi-uniform-branch.ll
new file mode 100644
index 00000000000..0aacbbfda18
--- /dev/null
+++ b/test/CodeGen/AMDGPU/i1-copy-phi-uniform-branch.ll
@@ -0,0 +1,38 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}test_dont_clobber_scc:
+
+; GCN: ; %entry
+; GCN:      s_cmp_eq_u32    s0, 0
+; GCN:      s_cbranch_scc1  [[PREEXIT:BB[0-9_]+]]
+
+; GCN: ; %blocka
+; GCN:      s_xor_b64       s[{{[0-9:]+}}], exec, -1
+; GCN:      s_cmp_eq_u32    s1, 0
+; GCN:      s_cbranch_scc1  [[EXIT:BB[0-9_]+]]
+
+; GCN: [[PREEXIT]]:
+; GCN: [[EXIT]]:
+
+define amdgpu_vs float @test_dont_clobber_scc(i32 inreg %uni, i32 inreg %uni2) #0 {
+entry:
+  %cc.uni = icmp eq i32 %uni, 0
+  br i1 %cc.uni, label %exit, label %blocka
+
+blocka:
+  call void asm sideeffect "; dummy a", ""()
+  %cc.uni2 = icmp eq i32 %uni2, 0
+  br i1 %cc.uni2, label %exit, label %blockb
+
+blockb:
+  call void asm sideeffect "; dummy b", ""()
+  br label %exit
+
+exit:
+  %cc.phi = phi i1 [ true, %entry ], [ false, %blocka ], [ false, %blockb ]
+  call void asm sideeffect "; dummy exit", ""()
+  %r = select i1 %cc.phi, float 1.0, float 2.0
+  ret float %r
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/i1-copy-phi.ll b/test/CodeGen/AMDGPU/i1-copy-phi.ll
index 63a9f1feb6d..5b25271ce17 100644
--- a/test/CodeGen/AMDGPU/i1-copy-phi.ll
+++ b/test/CodeGen/AMDGPU/i1-copy-phi.ll
@@ -2,12 +2,16 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 ; SI-LABEL: {{^}}br_i1_phi:
-; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
-; SI: s_and_saveexec_b64
-; SI: v_mov_b32_e32 [[REG]], -1{{$}}
-; SI: v_cmp_ne_u32_e32 vcc, 0, [[REG]]
-; SI: s_and_saveexec_b64
-; SI: s_endpgm
+
+; SI: ; %bb
+; SI:    s_mov_b64           [[TMP:s\[[0-9]+:[0-9]+\]]], 0
+
+; SI: ; %bb2
+; SI:    s_mov_b64           [[TMP]], exec
+
+; SI: ; %bb3
+; SI:    s_and_saveexec_b64  {{s\[[0-9]+:[0-9]+\]}}, [[TMP]]
+
 define amdgpu_kernel void @br_i1_phi(i32 %arg) {
 bb:
   %tidig = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/test/CodeGen/AMDGPU/inline-asm.ll b/test/CodeGen/AMDGPU/inline-asm.ll
index a0563cdd319..9615efaaa93 100644
--- a/test/CodeGen/AMDGPU/inline-asm.ll
+++ b/test/CodeGen/AMDGPU/inline-asm.ll
@@ -198,7 +198,8 @@ entry:
 }
 
 ; CHECK-LABEL: {{^}}i1_imm_input_phys_vgpr:
-; CHECK: v_mov_b32_e32 v0, -1{{$}}
+; CHECK: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], -1
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, [[MASK]]
 ; CHECK: ; use v0
 define amdgpu_kernel void @i1_imm_input_phys_vgpr() {
 entry:
@@ -212,10 +213,14 @@ entry:
 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, [[LOAD]]
 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
 ; CHECK: ; use v0
+; CHECK: v_cmp_ne_u32_e32 vcc, 0, v1
+; CHECK: v_cndmask_b32_e64 [[STORE:v[0-9]+]], 0, 1, vcc
+; CHECK: {{buffer|flat}}_store_byte [[STORE]],
 define amdgpu_kernel void @i1_input_phys_vgpr() {
 entry:
   %val = load i1, i1 addrspace(1)* undef
-  call void asm sideeffect "; use $0 ", "{v0}"(i1 %val)
+  %cc = call i1 asm sideeffect "; use $1, def $0 ", "={v1}, {v0}"(i1 %val)
+  store i1 %cc, i1 addrspace(1)* undef
   ret void
 }
 
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
index 34b842d8436..63c1556212d 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
@@ -1,5 +1,5 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,SI %s
-; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
+; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
 
 ; FIXME: Enable for VI.
 
@@ -144,20 +144,24 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace
 }
 
 ; GCN-LABEL: {{^}}test_div_fmas_f32_i1_phi_vcc:
-; SI: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}}
-; SI: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc
 
-; SI: buffer_load_dword [[LOAD:v[0-9]+]]
-; SI: v_cmp_ne_u32_e32 vcc, 0, [[LOAD]]
-; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
+; SI: ; %entry
+; SI:     v_cmp_eq_u32_e64   [[CMP:s\[[0-9]+:[0-9]+\]]], 0, {{v[0-9]+}}
+; SI:     s_mov_b64          vcc, 0
+; SI:     s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[CMP]]
 
+; SI: ; %bb
+; SI:     buffer_load_dword  [[LOAD:v[0-9]+]],
+; SI:     v_cmp_ne_u32_e32   vcc, 0, [[LOAD]]
+; SI:     s_and_b64          vcc, vcc, exec
+
+; SI: ; %exit
+; SI:     s_or_b64           exec, exec, [[SAVE]]
+; SI-NOT: vcc
+; SI:     v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI:     buffer_store_dword
+; SI:     s_endpgm
 
-; SI: BB9_2:
-; SI: s_or_b64 exec, exec, [[SAVE]]
-; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
-; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-; SI: buffer_store_dword
-; SI: s_endpgm
 define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 addrspace(1)* %dummy) nounwind {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
diff --git a/test/CodeGen/AMDGPU/loop_break.ll b/test/CodeGen/AMDGPU/loop_break.ll
index 576950188d3..f37b3a3637a 100644
--- a/test/CodeGen/AMDGPU/loop_break.ll
+++ b/test/CodeGen/AMDGPU/loop_break.ll
@@ -22,23 +22,28 @@
 ; OPT: bb9:
 ; OPT: call void @llvm.amdgcn.end.cf(i64
 
-; TODO: Can remove exec fixes in return block
 ; GCN-LABEL: {{^}}break_loop:
-; GCN: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], 0{{$}}
+; GCN:      s_mov_b64         [[OUTER_MASK:s\[[0-9]+:[0-9]+\]]], 0{{$}}
 
 ; GCN: [[LOOP_ENTRY:BB[0-9]+_[0-9]+]]: ; %bb1
-; GCN: v_cmp_lt_i32_e32 vcc, -1
-; GCN: s_and_b64 vcc, exec, vcc
-; GCN: s_cbranch_vccnz [[FLOW:BB[0-9]+_[0-9]+]]
-
-; GCN: ; %bb.2: ; %bb4
-; GCN: buffer_load_dword
-; GCN: v_cmp_ge_i32_e32 vcc,
-
-; GCN: [[FLOW]]:
-; GCN: s_or_b64 [[MASK]], vcc, [[MASK]]
-; GCN: s_andn2_b64 exec, exec, [[MASK]]
-; GCN-NEXT: s_cbranch_execnz [[LOOP_ENTRY]]
+; GCN:      v_cmp_lt_i32_e32  vcc, -1
+; GCN:      s_and_b64         vcc, exec, vcc
+; GCN:      s_or_b64          [[INNER_MASK:s\[[0-9]+:[0-9]+\]]], [[INNER_MASK]], exec
+; GCN:      s_cbranch_vccnz   [[FLOW:BB[0-9]+_[0-9]+]]
+
+; GCN: ; %bb4
+; GCN:      buffer_load_dword
+; GCN:      v_cmp_ge_i32_e32  vcc,
+; GCN:      s_andn2_b64       [[INNER_MASK]], [[INNER_MASK]], exec
+; GCN:      s_and_b64         [[TMP0:s\[[0-9]+:[0-9]+\]]], vcc, exec
+; GCN:      s_or_b64          [[INNER_MASK]], [[INNER_MASK]], [[TMP0]]
+
+; GCN: [[FLOW]]: ; %Flow
+; GCN:      s_and_b64         [[TMP1:s\[[0-9]+:[0-9]+\]]], exec, [[INNER_MASK]]
+; GCN:      s_or_b64          [[TMP1]], [[TMP1]], [[OUTER_MASK]]
+; GCN:      s_mov_b64         [[OUTER_MASK]], [[TMP1]]
+; GCN:      s_andn2_b64       exec, exec, [[TMP1]]
+; GCN-NEXT: s_cbranch_execnz  [[LOOP_ENTRY]]
 
 ; GCN: ; %bb.4: ; %bb9
 ; GCN-NEXT: s_endpgm
diff --git a/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
index fbdf9832b29..679fd7c9870 100644
--- a/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
+++ b/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -59,31 +59,48 @@
 
 
 ; GCN-LABEL: {{^}}multi_divergent_region_exit_ret_ret:
-; GCN: v_cmp_lt_i32_e32 vcc, 1
-; GCN: s_and_saveexec_b64
-; GCN: s_xor_b64
 
+; GCN:      s_mov_b64           [[EXIT1:s\[[0-9]+:[0-9]+\]]], 0
+; GCN:      v_cmp_lt_i32_e32    vcc, 1,
+; GCN:      s_mov_b64           [[EXIT0:s\[[0-9]+:[0-9]+\]]], 0
+; GCN:      s_and_saveexec_b64
+; GCN:      s_xor_b64
+
+; GCN: ; %LeafBlock1
+; GCN-NEXT: s_mov_b64           [[EXIT0]], exec
+; GCN-NEXT: v_cmp_ne_u32_e32    vcc, 2,
+; GCN-NEXT: s_and_b64           [[EXIT1]], vcc, exec
+
+; GCN: ; %Flow
+; GCN-NEXT: s_or_saveexec_b64
+; GCN-NEXT: s_xor_b64
 
 ; FIXME: Why is this compare essentially repeated?
-; GCN: v_cmp_eq_u32_e32 vcc, 1, [[REG:v[0-9]+]]
-; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc
-; GCN: v_cmp_ne_u32_e32 vcc, 1, [[REG]]
-; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc
+; GCN: ; %LeafBlock
+; GCN-DAG:  v_cmp_eq_u32_e32    vcc, 1,
+; GCN-DAG:  v_cmp_ne_u32_e64    [[TMP1:s\[[0-9]+:[0-9]+\]]], 1,
+; GCN-DAG:  s_andn2_b64         [[EXIT0]], [[EXIT0]], exec
+; GCN-DAG:  s_andn2_b64         [[EXIT1]], [[EXIT1]], exec
+; GCN-DAG:  s_and_b64           [[TMP0:s\[[0-9]+:[0-9]+\]]], vcc, exec
+; GCN-DAG:  s_and_b64           [[TMP1]], [[TMP1]], exec
+; GCN-DAG:  s_or_b64            [[EXIT0]], [[EXIT0]], [[TMP0]]
+; GCN-DAG:  s_or_b64            [[EXIT1]], [[EXIT1]], [[TMP1]]
 
 ; GCN: ; %Flow4
-; GCN-NEXT: s_or_b64 exec, exec
-; GCN: v_cmp_ne_u32_e32 vcc, 0
+; GCN-NEXT: s_or_b64            exec, exec,
+; GCN-NEXT: s_and_saveexec_b64  {{s\[[0-9]+:[0-9]+\]}}, [[EXIT1]]
+; GCN-NEXT: s_xor_b64
 
 ; GCN: ; %exit1
-; GCN: ds_write_b32
+; GCN:      ds_write_b32
+; GCN:      s_andn2_b64         [[EXIT0]], [[EXIT0]], exec
 
-; GCN: %Flow5
-; GCN-NEXT: s_or_b64 exec, exec
-; GCN: v_cmp_ne_u32_e32 vcc, 0
-; GCN-NEXT: s_and_saveexec_b64
+; GCN: ; %Flow5
+; GCN-NEXT: s_or_b64            exec, exec,
+; GCN-NEXT; s_and_saveexec_b64  {{s\[[0-9]+:[0-9]+\]}}, [[EXIT0]]
 
 ; GCN: ; %exit0
-; GCN: buffer_store_dword
+; GCN:      buffer_store_dword
 
 ; GCN: ; %UnifiedReturnBlock
 ; GCN-NEXT: s_endpgm
diff --git a/test/CodeGen/AMDGPU/multilevel-break.ll b/test/CodeGen/AMDGPU/multilevel-break.ll
index c4e2f1e3487..4c1a769d599 100644
--- a/test/CodeGen/AMDGPU/multilevel-break.ll
+++ b/test/CodeGen/AMDGPU/multilevel-break.ll
@@ -21,34 +21,46 @@
 
 ; GCN-LABEL: {{^}}multi_else_break:
 
+; GCN: ; %main_body
+; GCN:      s_mov_b64           [[LEFT_OUTER:s\[[0-9]+:[0-9]+\]]], 0{{$}}
+
 ; GCN: [[OUTER_LOOP:BB[0-9]+_[0-9]+]]: ; %LOOP.outer{{$}}
+; GCN:      s_mov_b64           [[LEFT_INNER:s\[[0-9]+:[0-9]+\]]], 0{{$}}
 
 ; GCN: [[INNER_LOOP:BB[0-9]+_[0-9]+]]: ; %LOOP{{$}}
-; GCN: s_and_saveexec_b64 [[SAVE_BREAK:s\[[0-9]+:[0-9]+\]]], vcc
-
-; GCN: BB{{[0-9]+}}_{{[0-9]+}}: ; %Flow{{$}}
-; GCN-NEXT: ; in Loop: Header=[[INNER_LOOP]] Depth=2
-
-; Ensure extra or eliminated
-; GCN-NEXT: s_or_b64 exec, exec, [[SAVE_BREAK]]
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
-; GCN-NEXT: s_or_b64 [[OR_BREAK:s\[[0-9]+:[0-9]+\]]], vcc, s{{\[[0-9]+:[0-9]+\]}}
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
-; GCN-NEXT: v_mov_b32_e32
-; GCN-NEXT: s_andn2_b64 exec, exec, [[OR_BREAK]]
-; GCN-NEXT: s_cbranch_execnz [[INNER_LOOP]]
-
-; GCN: ; %bb.{{[0-9]+}}: ; %Flow2{{$}}
-; GCN-NEXT: ; in Loop: Header=[[OUTER_LOOP]] Depth=1
-
-; Ensure copy is eliminated
-; GCN-NEXT: s_or_b64 exec, exec, [[OR_BREAK]]
-; GCN-NEXT: s_and_b64 [[MASKED2_SAVE_BREAK:s\[[0-9]+:[0-9]+\]]], exec, vcc
-; GCN-NEXT: s_or_b64 [[OUTER_OR_BREAK:s\[[0-9]+:[0-9]+\]]], [[MASKED2_SAVE_BREAK]], s{{\[[0-9]+:[0-9]+\]}}
-; GCN-NEXT: s_mov_b64
-; GCN-NEXT: v_mov_b32_e32
-; GCN-NEXT: s_andn2_b64 exec, exec, [[OUTER_OR_BREAK]]
-; GCN-NEXT: s_cbranch_execnz [[OUTER_LOOP]]
+; GCN:      s_or_b64            [[BREAK_OUTER:s\[[0-9]+:[0-9]+\]]], [[BREAK_OUTER]], exec
+; GCN:      s_or_b64            [[BREAK_INNER:s\[[0-9]+:[0-9]+\]]], [[BREAK_INNER]], exec
+; GCN:      s_and_saveexec_b64  [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
+
+; FIXME: duplicate comparison
+; GCN: ; %ENDIF
+; GCN-DAG:  v_cmp_eq_u32_e32    vcc,
+; GCN-DAG:  v_cmp_ne_u32_e64    [[TMP51NEG:s\[[0-9]+:[0-9]+\]]],
+; GCN-DAG:  s_andn2_b64         [[BREAK_OUTER]], [[BREAK_OUTER]], exec
+; GCN-DAG:  s_andn2_b64         [[BREAK_INNER]], [[BREAK_INNER]], exec
+; GCN-DAG:  s_and_b64           [[TMP_EQ:s\[[0-9]+:[0-9]+\]]], vcc, exec
+; GCN-DAG:  s_and_b64           [[TMP_NE:s\[[0-9]+:[0-9]+\]]], [[TMP51NEG]], exec
+; GCN-DAG:  s_or_b64            [[BREAK_OUTER]], [[BREAK_OUTER]], [[TMP_EQ]]
+; GCN-DAG:  s_or_b64            [[BREAK_INNER]], [[BREAK_INNER]], [[TMP_NE]]
+
+; GCN: ; %Flow
+; GCN:      s_or_b64            exec, exec, [[SAVE_EXEC]]
+; GCN:      s_and_b64           [[TMP0:s\[[0-9]+:[0-9]+\]]], exec, [[BREAK_INNER]]
+; GCN:      s_or_b64            [[TMP0]], [[TMP0]], [[LEFT_INNER]]
+; GCN:      s_mov_b64           [[LEFT_INNER]], [[TMP0]]
+; GCN:      s_andn2_b64         exec, exec, [[TMP0]]
+; GCN:      s_cbranch_execnz    [[INNER_LOOP]]
+
+; GCN: ; %Flow2
+; GCN:      s_or_b64            exec, exec, [[TMP0]]
+; GCN:      s_and_b64           [[TMP1:s\[[0-9]+:[0-9]+\]]], exec, [[BREAK_OUTER]]
+; GCN:      s_or_b64            [[TMP1]], [[TMP1]], [[LEFT_OUTER]]
+; GCN:      s_mov_b64           [[LEFT_OUTER]], [[TMP1]]
+; GCN:      s_andn2_b64         exec, exec, [[TMP1]]
+; GCN:      s_cbranch_execnz    [[OUTER_LOOP]]
+
+; GCN: ; %IF
+; GCN-NEXT: s_endpgm
 define amdgpu_vs void @multi_else_break(<4 x float> %vec, i32 %ub, i32 %cont) {
 main_body:
   br label %LOOP.outer
@@ -78,12 +90,38 @@ ENDIF:                                            ; preds = %LOOP
 ; OPT: llvm.amdgcn.end.cf
 
 ; GCN-LABEL: {{^}}multi_if_break_loop:
-; GCN: s_mov_b64 [[BREAK_REG:s\[[0-9]+:[0-9]+\]]], 0{{$}}
+; GCN:      s_mov_b64          [[LEFT:s\[[0-9]+:[0-9]+\]]], 0{{$}}
 
 ; GCN: [[LOOP:BB[0-9]+_[0-9]+]]: ; %bb1{{$}}
+; GCN:      s_mov_b64          [[OLD_LEFT:s\[[0-9]+:[0-9]+\]]], [[LEFT]]
+
+; GCN: ; %LeafBlock1
+; GCN:      s_mov_b64
+; GCN:      s_mov_b64          [[BREAK:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+
+; GCN: ; %case1
+; GCN:      buffer_load_dword  [[LOAD2:v[0-9]+]],
+; GCN:      v_cmp_ge_i32_e32   vcc, {{v[0-9]+}}, [[LOAD2]]
+; GCN:      s_orn2_b64         [[BREAK]], vcc, exec
+
+; GCN: ; %Flow3
+; GCN:      s_branch           [[FLOW:BB[0-9]+_[0-9]+]]
+
+; GCN:      s_mov_b64          [[BREAK]], -1{{$}}
+
+; GCN: [[FLOW]]: ; %Flow
+
+; GCN: ; %case0
+; GCN:      buffer_load_dword  [[LOAD1:v[0-9]+]],
+; GCN-DAG:  s_andn2_b64        [[BREAK]], [[BREAK]], exec
+; GCN-DAG:  v_cmp_ge_i32_e32   vcc, {{v[0-9]+}}, [[LOAD1]]
+; GCN-DAG:  s_and_b64          [[TMP:s\[[0-9]+:[0-9]+\]]], vcc, exec
+; GCN:      s_or_b64           [[BREAK]], [[BREAK]], [[TMP]]
 
-; GCN: s_or_b64 [[BREAK_REG]], vcc, [[BREAK_REG]]
-; GCN: s_andn2_b64 exec, exec, [[BREAK_REG]]
+; GCN: ; %Flow4
+; GCN:      s_and_b64          [[BREAK]], exec, [[BREAK]]
+; GCN:      s_or_b64           [[LEFT]], [[BREAK]], [[OLD_LEFT]]
+; GCN:      s_andn2_b64        exec, exec, [[LEFT]]
 ; GCN-NEXT: s_cbranch_execnz
 
 define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 {
diff --git a/test/CodeGen/AMDGPU/select-opt.ll b/test/CodeGen/AMDGPU/select-opt.ll
index 33028f17531..f773357976c 100644
--- a/test/CodeGen/AMDGPU/select-opt.ll
+++ b/test/CodeGen/AMDGPU/select-opt.ll
@@ -137,7 +137,6 @@ define amdgpu_kernel void @opt_select_i64_or_cmp_f32(i64 addrspace(1)* %out, flo
 ; GCN: v_cmp_neq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 1.0
 ; GCN: v_cmp_neq_f32_e32 vcc, 0, v{{[0-9]+}}
 ; GCN: v_cmp_eq_f32_e32 vcc, 0, v{{[0-9]+}}
-; GCN: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
 
 define amdgpu_kernel void @regression(float addrspace(1)* %out, float %c0, float %c1) #0 {
 entry:
diff --git a/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/test/CodeGen/AMDGPU/sgpr-control-flow.ll
index 04df33b8dd4..3db6fd2d898 100644
--- a/test/CodeGen/AMDGPU/sgpr-control-flow.ll
+++ b/test/CodeGen/AMDGPU/sgpr-control-flow.ll
@@ -100,22 +100,22 @@ endif:
   ret void
 }
 
-; FIXME: Should write to different SGPR pairs instead of copying to
-; VALU for i1 phi.
-
 ; SI-LABEL: {{^}}sgpr_if_else_valu_cmp_phi_br:
-; SI: buffer_load_dword [[AVAL:v[0-9]+]]
-; SI: v_cmp_gt_i32_e32 [[CMP_IF:vcc]], 0, [[AVAL]]
-; SI: v_cndmask_b32_e64 [[V_CMP:v[0-9]+]], 0, -1, [[CMP_IF]]
-
-; SI: BB{{[0-9]+}}_2:
-; SI: buffer_load_dword [[AVAL:v[0-9]+]]
-; SI: v_cmp_eq_u32_e32 [[CMP_ELSE:vcc]], 0, [[AVAL]]
-; SI: v_cndmask_b32_e64 [[V_CMP]], 0, -1, [[CMP_ELSE]]
-
-; SI: v_cmp_ne_u32_e32 [[CMP_CMP:vcc]], 0, [[V_CMP]]
-; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP_CMP]]
-; SI: buffer_store_dword [[RESULT]]
+
+; SI: ; %else
+; SI:      buffer_load_dword  [[AVAL:v[0-9]+]]
+; SI:      v_cmp_gt_i32_e64   [[PHI:s\[[0-9]+:[0-9]+\]]], 0, [[AVAL]]
+
+; SI: ; %if
+; SI:      buffer_load_dword  [[AVAL:v[0-9]+]]
+; SI:      v_cmp_eq_u32_e32   [[CMP_ELSE:vcc]], 0, [[AVAL]]
+; SI-DAG:  s_andn2_b64        [[PHI]], [[PHI]], exec
+; SI-DAG:  s_and_b64          [[TMP:s\[[0-9]+:[0-9]+\]]], [[CMP_ELSE]], exec
+; SI:      s_or_b64           [[PHI]], [[PHI]], [[TMP]]
+
+; SI: ; %endif
+; SI:      v_cndmask_b32_e64  [[RESULT:v[0-9]+]], 0, -1, [[PHI]]
+; SI:      buffer_store_dword [[RESULT]],
 define amdgpu_kernel void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
diff --git a/test/CodeGen/AMDGPU/si-annotate-cf.ll b/test/CodeGen/AMDGPU/si-annotate-cf.ll
index 73e56593ce8..6215a486a36 100644
--- a/test/CodeGen/AMDGPU/si-annotate-cf.ll
+++ b/test/CodeGen/AMDGPU/si-annotate-cf.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=amdgcn -mcpu=verde -asm-verbose=0 -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -asm-verbose=0 -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}break_inserted_outside_of_loop:
 
@@ -27,18 +27,23 @@ ENDIF:
 
 
 ; FUNC-LABEL: {{^}}phi_cond_outside_loop:
-; FIXME: This could be folded into the s_or_b64 instruction
-; SI: s_mov_b64 [[ZERO:s\[[0-9]+:[0-9]+\]]], 0
-; SI: [[LOOP_LABEL:[A-Z0-9]+]]
-; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
 
-; SI_IF_BREAK instruction:
-; SI: s_or_b64 [[BREAK:s\[[0-9]+:[0-9]+\]]], vcc, [[ZERO]]
+; SI:     s_mov_b64         [[LEFT:s\[[0-9]+:[0-9]+\]]], 0
+; SI:     s_mov_b64         [[PHI:s\[[0-9]+:[0-9]+\]]], 0
 
-; SI_LOOP instruction:
-; SI: s_andn2_b64 exec, exec, [[BREAK]]
-; SI: s_cbranch_execnz [[LOOP_LABEL]]
-; SI: s_endpgm
+; SI: ; %else
+; SI:     v_cmp_eq_u32_e64  [[TMP:s\[[0-9]+:[0-9]+\]]],
+; SI:     s_and_b64         [[PHI]], [[TMP]], exec
+
+; SI: ; %endif
+
+; SI: [[LOOP_LABEL:BB[0-9]+_[0-9]+]]: ; %loop
+; SI:     s_mov_b64         [[TMP:s\[[0-9]+:[0-9]+\]]], [[LEFT]]
+; SI:     s_and_b64         [[TMP1:s\[[0-9]+:[0-9]+\]]], exec, [[PHI]]
+; SI:     s_or_b64          [[LEFT]], [[TMP1]], [[TMP]]
+; SI:     s_andn2_b64       exec, exec, [[LEFT]]
+; SI:     s_cbranch_execnz  [[LOOP_LABEL]]
+; SI:     s_endpgm
 
 define amdgpu_kernel void @phi_cond_outside_loop(i32 %b) {
 entry:
@@ -90,19 +95,21 @@ declare float @llvm.fabs.f32(float) nounwind readnone
 ; This broke the old AMDIL cfg structurizer
 ; FUNC-LABEL: {{^}}loop_land_info_assert:
 ; SI:      v_cmp_lt_i32_e64 [[CMP4:s\[[0-9:]+\]]], s{{[0-9]+}}, 4{{$}}
-; SI:      s_and_b64 vcc, exec, [[CMP4]]
-; SI-NEXT: s_cbranch_vccnz [[BR1:BB[0-9_]+]]
-; SI-NEXT: s_branch [[BR2:BB[0-9_]+]]
-; SI-NEXT: BB{{[0-9_]+}}:
-; SI-NEXT: buffer_store_dword
+; SI:      s_and_b64        [[CMP4M:s\[[0-9]+:[0-9]+\]]], exec, [[CMP4]]
+; SI:      s_mov_b64        vcc, [[CMP4M]]
+; SI-NEXT: s_cbranch_vccnz  [[CONVEX_EXIT:BB[0-9_]+]]
+; SI-NEXT: s_branch         [[FOR_COND_PREHDR:BB[0-9_]+]]
+
+; SI: ; %if.else
+; SI:      buffer_store_dword
 
 ; SI:      [[INFLOOP:BB[0-9]+_[0-9]+]]:
 
-; SI:      [[BR1]]:
-; SI-NEXT: s_and_b64 vcc, exec,
-; SI-NEXT: s_cbranch_vccnz [[ENDPGM:BB[0-9]+_[0-9]+]]
+; SI:      [[CONVEX_EXIT]]:
+; SI:      s_mov_b64        vcc,
+; SI-NEXT: s_cbranch_vccnz  [[ENDPGM:BB[0-9]+_[0-9]+]]
 ; SI:      s_branch [[INFLOOP]]
-; SI-NEXT: [[BR2]]:
+; SI-NEXT: [[FOR_COND_PREHDR]]:
 ; SI:      s_cbranch_vccz [[ENDPGM]]
 
 ; SI:      [[ENDPGM]]:
diff --git a/test/CodeGen/AMDGPU/sub_i1.ll b/test/CodeGen/AMDGPU/sub_i1.ll
index 70562a59f0a..6861d32dccf 100644
--- a/test/CodeGen/AMDGPU/sub_i1.ll
+++ b/test/CodeGen/AMDGPU/sub_i1.ll
@@ -21,8 +21,8 @@ define amdgpu_kernel void @sub_var_imm_i1(i1 addrspace(1)* %out, i1 addrspace(1)
 }
 
 ; GCN-LABEL: {{^}}sub_i1_cf:
-; GCN: v_cmp_ne_u32_e32 vcc, 0, {{v[0-9]+}}
-; GCN-NEXT: s_not_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc
+; GCN: ; %endif
+; GCN: s_not_b64
 define amdgpu_kernel void @sub_i1_cf(i1 addrspace(1)* %out, i1 addrspace(1)* %a, i1 addrspace(1)* %b) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/test/CodeGen/AMDGPU/valu-i1.ll b/test/CodeGen/AMDGPU/valu-i1.ll
index 3d980b749a9..ca85f0bee4c 100644
--- a/test/CodeGen/AMDGPU/valu-i1.ll
+++ b/test/CodeGen/AMDGPU/valu-i1.ll
@@ -8,23 +8,22 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 
 
 ; waitcnt should be inserted after exec modification
-; SI: v_cmp_lt_i32_e32 vcc, 0,
-; SI: v_mov_b32_e32 {{v[0-9]+}}, 0
+; SI:      v_cmp_lt_i32_e32 vcc, 0,
+; SI-NEXT: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0
+; SI-NEXT: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0
 ; SI-NEXT: s_and_saveexec_b64 [[SAVE1:s\[[0-9]+:[0-9]+\]]], vcc
 ; SI-NEXT: s_xor_b64 [[SAVE2:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE1]]
 ; SI-NEXT: ; mask branch [[FLOW_BB:BB[0-9]+_[0-9]+]]
 ; SI-NEXT: s_cbranch_execz [[FLOW_BB]]
 
 ; SI-NEXT: BB{{[0-9]+}}_1: ; %LeafBlock3
-; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1
-; SI: v_mov_b32_e32 v{{[0-9]}}, -1
-; SI: s_and_saveexec_b64
+; SI:      s_mov_b64 s[{{[0-9]:[0-9]}}], -1
+; SI:      s_and_saveexec_b64
 ; SI-NEXT: ; mask branch
 
 ; v_mov should be after exec modification
 ; SI: [[FLOW_BB]]:
 ; SI-NEXT: s_or_saveexec_b64 [[SAVE3:s\[[0-9]+:[0-9]+\]]], [[SAVE2]]
-; SI-NEXT: v_mov_b32_e32 v{{[0-9]+}}
 ; SI-NEXT: s_xor_b64 exec, exec, [[SAVE3]]
 ; SI-NEXT: ; mask branch
 ;
@@ -220,9 +219,10 @@ exit:
 ; SI: [[LABEL_FLOW]]:
 ; SI-NEXT: ; in Loop: Header=[[LABEL_LOOP]]
 ; SI-NEXT: s_or_b64 exec, exec, [[ORNEG2]]
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
-; SI-NEXT: s_or_b64 [[COND_STATE]], vcc, [[COND_STATE]]
-; SI-NEXT: s_andn2_b64 exec, exec, [[COND_STATE]]
+; SI-NEXT: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]],
+; SI-NEXT: s_or_b64 [[TMP2:s\[[0-9]+:[0-9]+\]]], [[TMP1]], [[COND_STATE]]
+; SI-NEXT: s_mov_b64 [[COND_STATE]], [[TMP2]]
+; SI-NEXT: s_andn2_b64 exec, exec, [[TMP2]]
 ; SI-NEXT: s_cbranch_execnz [[LABEL_LOOP]]
 
 ; SI: [[LABEL_EXIT]]:
diff --git a/test/CodeGen/AMDGPU/waitcnt-looptest.ll b/test/CodeGen/AMDGPU/waitcnt-looptest.ll
index a941e5fb1f7..08267b76aef 100644
--- a/test/CodeGen/AMDGPU/waitcnt-looptest.ll
+++ b/test/CodeGen/AMDGPU/waitcnt-looptest.ll
@@ -6,7 +6,7 @@
 ; GCN-LABEL: {{^}}testKernel
 ; GCN: BB0_1:
 ; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cmp_eq_f32_e64
+; GCN-NEXT: v_cmp_eq_f32_e32
 ; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-NEXT: v_cmp_eq_f32_e32
 ; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)
-- 
GitLab


From 125c8f987dd711fb344397da122f40d6b8da2568 Mon Sep 17 00:00:00 2001
From: David Bolvansky <david.bolvansky@gmail.com>
Date: Wed, 31 Oct 2018 14:18:57 +0000
Subject: [PATCH 0800/1116] [DAGCombiner] Fold 0 div/rem X to 0

Reviewers: RKSimon, spatel, javed.absar, craig.topper, t.p.northover

Reviewed By: RKSimon

Subscribers: craig.topper, llvm-commits

Differential Revision: https://reviews.llvm.org/D52504

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345721 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp |   7 +-
 test/CodeGen/X86/combine-sdiv.ll         |  86 ++----------------
 test/CodeGen/X86/combine-srem.ll         |  50 +----------
 test/CodeGen/X86/combine-udiv.ll         | 108 ++---------------------
 test/CodeGen/X86/combine-urem.ll         |  50 +----------
 5 files changed, 24 insertions(+), 277 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index fba2aa9cb52..e6ea4898717 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3128,8 +3128,11 @@ static SDValue simplifyDivRem(SDNode *N, SelectionDAG &DAG) {
   if (N0.isUndef())
     return DAG.getConstant(0, DL, VT);
 
-  // TODO: 0 / X -> 0
-  // TODO: 0 % X -> 0
+  // 0 / X -> 0
+  // 0 % X -> 0
+  ConstantSDNode *N0C = isConstOrConstSplat(N0);
+  if (N0C && N0C->isNullValue())
+    return N0;
 
   // X / X -> 1
   // X % X -> 0
diff --git a/test/CodeGen/X86/combine-sdiv.ll b/test/CodeGen/X86/combine-sdiv.ll
index baff826858c..a78ecd27c99 100644
--- a/test/CodeGen/X86/combine-sdiv.ll
+++ b/test/CodeGen/X86/combine-sdiv.ll
@@ -107,99 +107,25 @@ define <4 x i32> @combine_vec_sdiv_by_minsigned(<4 x i32> %x) {
   ret <4 x i32> %1
 }
 
-; TODO fold (sdiv 0, x) -> 0
+; fold (sdiv 0, x) -> 0
 define i32 @combine_sdiv_zero(i32 %x) {
 ; CHECK-LABEL: combine_sdiv_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    xorl %edx, %edx
-; CHECK-NEXT:    idivl %edi
 ; CHECK-NEXT:    retq
   %1 = sdiv i32 0, %x
   ret i32 %1
 }
 
 define <4 x i32> @combine_vec_sdiv_zero(<4 x i32> %x) {
-; SSE2-LABEL: combine_vec_sdiv_zero:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; SSE2-NEXT:    movd %xmm1, %ecx
-; SSE2-NEXT:    xorl %eax, %eax
-; SSE2-NEXT:    xorl %edx, %edx
-; SSE2-NEXT:    idivl %ecx
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE2-NEXT:    movd %xmm2, %ecx
-; SSE2-NEXT:    xorl %eax, %eax
-; SSE2-NEXT:    xorl %edx, %edx
-; SSE2-NEXT:    idivl %ecx
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT:    movd %xmm0, %ecx
-; SSE2-NEXT:    xorl %eax, %eax
-; SSE2-NEXT:    xorl %edx, %edx
-; SSE2-NEXT:    idivl %ecx
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE2-NEXT:    movd %xmm0, %ecx
-; SSE2-NEXT:    xorl %eax, %eax
-; SSE2-NEXT:    xorl %edx, %edx
-; SSE2-NEXT:    idivl %ecx
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: combine_vec_sdiv_zero:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pextrd $1, %xmm0, %ecx
-; SSE41-NEXT:    xorl %eax, %eax
-; SSE41-NEXT:    xorl %edx, %edx
-; SSE41-NEXT:    idivl %ecx
-; SSE41-NEXT:    movl %eax, %ecx
-; SSE41-NEXT:    movd %xmm0, %esi
-; SSE41-NEXT:    xorl %eax, %eax
-; SSE41-NEXT:    xorl %edx, %edx
-; SSE41-NEXT:    idivl %esi
-; SSE41-NEXT:    movd %eax, %xmm1
-; SSE41-NEXT:    pinsrd $1, %ecx, %xmm1
-; SSE41-NEXT:    pextrd $2, %xmm0, %ecx
-; SSE41-NEXT:    xorl %eax, %eax
-; SSE41-NEXT:    xorl %edx, %edx
-; SSE41-NEXT:    idivl %ecx
-; SSE41-NEXT:    pinsrd $2, %eax, %xmm1
-; SSE41-NEXT:    pextrd $3, %xmm0, %ecx
-; SSE41-NEXT:    xorl %eax, %eax
-; SSE41-NEXT:    xorl %edx, %edx
-; SSE41-NEXT:    idivl %ecx
-; SSE41-NEXT:    pinsrd $3, %eax, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: combine_vec_sdiv_zero:
+; SSE:       # %bb.0:
+; SSE-NEXT:    xorps %xmm0, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_sdiv_zero:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpextrd $1, %xmm0, %ecx
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    xorl %edx, %edx
-; AVX-NEXT:    idivl %ecx
-; AVX-NEXT:    movl %eax, %ecx
-; AVX-NEXT:    vmovd %xmm0, %esi
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    xorl %edx, %edx
-; AVX-NEXT:    idivl %esi
-; AVX-NEXT:    vmovd %eax, %xmm1
-; AVX-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
-; AVX-NEXT:    vpextrd $2, %xmm0, %ecx
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    xorl %edx, %edx
-; AVX-NEXT:    idivl %ecx
-; AVX-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrd $3, %xmm0, %ecx
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    xorl %edx, %edx
-; AVX-NEXT:    idivl %ecx
-; AVX-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm0
+; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %1 = sdiv <4 x i32> zeroinitializer, %x
   ret <4 x i32> %1
diff --git a/test/CodeGen/X86/combine-srem.ll b/test/CodeGen/X86/combine-srem.ll
index 71be666d6db..dab3bdcedb2 100644
--- a/test/CodeGen/X86/combine-srem.ll
+++ b/test/CodeGen/X86/combine-srem.ll
@@ -100,14 +100,11 @@ define <4 x i32> @combine_vec_srem_by_minsigned(<4 x i32> %x) {
   ret <4 x i32> %1
 }
 
-; TODO fold (srem 0, x) -> 0
+; fold (srem 0, x) -> 0
 define i32 @combine_srem_zero(i32 %x) {
 ; CHECK-LABEL: combine_srem_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    xorl %edx, %edx
-; CHECK-NEXT:    idivl %edi
-; CHECK-NEXT:    movl %edx, %eax
 ; CHECK-NEXT:    retq
   %1 = srem i32 0, %x
   ret i32 %1
@@ -116,53 +113,12 @@ define i32 @combine_srem_zero(i32 %x) {
 define <4 x i32> @combine_vec_srem_zero(<4 x i32> %x) {
 ; SSE-LABEL: combine_vec_srem_zero:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pextrd $1, %xmm0, %ecx
-; SSE-NEXT:    xorl %eax, %eax
-; SSE-NEXT:    xorl %edx, %edx
-; SSE-NEXT:    idivl %ecx
-; SSE-NEXT:    movl %edx, %ecx
-; SSE-NEXT:    movd %xmm0, %esi
-; SSE-NEXT:    xorl %eax, %eax
-; SSE-NEXT:    xorl %edx, %edx
-; SSE-NEXT:    idivl %esi
-; SSE-NEXT:    movd %edx, %xmm1
-; SSE-NEXT:    pinsrd $1, %ecx, %xmm1
-; SSE-NEXT:    pextrd $2, %xmm0, %ecx
-; SSE-NEXT:    xorl %eax, %eax
-; SSE-NEXT:    xorl %edx, %edx
-; SSE-NEXT:    idivl %ecx
-; SSE-NEXT:    pinsrd $2, %edx, %xmm1
-; SSE-NEXT:    pextrd $3, %xmm0, %ecx
-; SSE-NEXT:    xorl %eax, %eax
-; SSE-NEXT:    xorl %edx, %edx
-; SSE-NEXT:    idivl %ecx
-; SSE-NEXT:    pinsrd $3, %edx, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    xorps %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_srem_zero:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpextrd $1, %xmm0, %ecx
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    xorl %edx, %edx
-; AVX-NEXT:    idivl %ecx
-; AVX-NEXT:    movl %edx, %ecx
-; AVX-NEXT:    vmovd %xmm0, %esi
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    xorl %edx, %edx
-; AVX-NEXT:    idivl %esi
-; AVX-NEXT:    vmovd %edx, %xmm1
-; AVX-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
-; AVX-NEXT:    vpextrd $2, %xmm0, %ecx
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    xorl %edx, %edx
-; AVX-NEXT:    idivl %ecx
-; AVX-NEXT:    vpinsrd $2, %edx, %xmm1, %xmm1
-; AVX-NEXT:    vpextrd $3, %xmm0, %ecx
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    xorl %edx, %edx
-; AVX-NEXT:    idivl %ecx
-; AVX-NEXT:    vpinsrd $3, %edx, %xmm1, %xmm0
+; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %1 = srem <4 x i32> zeroinitializer, %x
   ret <4 x i32> %1
diff --git a/test/CodeGen/X86/combine-udiv.ll b/test/CodeGen/X86/combine-udiv.ll
index 632d1b698d1..34f64e52437 100644
--- a/test/CodeGen/X86/combine-udiv.ll
+++ b/test/CodeGen/X86/combine-udiv.ll
@@ -90,124 +90,30 @@ define <4 x i32> @combine_vec_udiv_by_minsigned(<4 x i32> %x) {
   ret <4 x i32> %1
 }
 
-; TODO fold (udiv 0, x) -> 0
+; fold (udiv 0, x) -> 0
 define i32 @combine_udiv_zero(i32 %x) {
 ; CHECK-LABEL: combine_udiv_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    xorl %edx, %edx
-; CHECK-NEXT:    divl %edi
 ; CHECK-NEXT:    retq
   %1 = udiv i32 0, %x
   ret i32 %1
 }
 
 define <4 x i32> @combine_vec_udiv_zero(<4 x i32> %x) {
-; SSE2-LABEL: combine_vec_udiv_zero:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; SSE2-NEXT:    movd %xmm1, %ecx
-; SSE2-NEXT:    xorl %eax, %eax
-; SSE2-NEXT:    xorl %edx, %edx
-; SSE2-NEXT:    divl %ecx
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE2-NEXT:    movd %xmm2, %ecx
-; SSE2-NEXT:    xorl %eax, %eax
-; SSE2-NEXT:    xorl %edx, %edx
-; SSE2-NEXT:    divl %ecx
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT:    movd %xmm0, %ecx
-; SSE2-NEXT:    xorl %eax, %eax
-; SSE2-NEXT:    xorl %edx, %edx
-; SSE2-NEXT:    divl %ecx
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE2-NEXT:    movd %xmm0, %ecx
-; SSE2-NEXT:    xorl %eax, %eax
-; SSE2-NEXT:    xorl %edx, %edx
-; SSE2-NEXT:    divl %ecx
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: combine_vec_udiv_zero:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pextrd $1, %xmm0, %ecx
-; SSE41-NEXT:    xorl %eax, %eax
-; SSE41-NEXT:    xorl %edx, %edx
-; SSE41-NEXT:    divl %ecx
-; SSE41-NEXT:    movl %eax, %ecx
-; SSE41-NEXT:    movd %xmm0, %esi
-; SSE41-NEXT:    xorl %eax, %eax
-; SSE41-NEXT:    xorl %edx, %edx
-; SSE41-NEXT:    divl %esi
-; SSE41-NEXT:    movd %eax, %xmm1
-; SSE41-NEXT:    pinsrd $1, %ecx, %xmm1
-; SSE41-NEXT:    pextrd $2, %xmm0, %ecx
-; SSE41-NEXT:    xorl %eax, %eax
-; SSE41-NEXT:    xorl %edx, %edx
-; SSE41-NEXT:    divl %ecx
-; SSE41-NEXT:    pinsrd $2, %eax, %xmm1
-; SSE41-NEXT:    pextrd $3, %xmm0, %ecx
-; SSE41-NEXT:    xorl %eax, %eax
-; SSE41-NEXT:    xorl %edx, %edx
-; SSE41-NEXT:    divl %ecx
-; SSE41-NEXT:    pinsrd $3, %eax, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: combine_vec_udiv_zero:
+; SSE:       # %bb.0:
+; SSE-NEXT:    xorps %xmm0, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_udiv_zero:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpextrd $1, %xmm0, %ecx
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    xorl %edx, %edx
-; AVX-NEXT:    divl %ecx
-; AVX-NEXT:    movl %eax, %ecx
-; AVX-NEXT:    vmovd %xmm0, %esi
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    xorl %edx, %edx
-; AVX-NEXT:    divl %esi
-; AVX-NEXT:    vmovd %eax, %xmm1
-; AVX-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
-; AVX-NEXT:    vpextrd $2, %xmm0, %ecx
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    xorl %edx, %edx
-; AVX-NEXT:    divl %ecx
-; AVX-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrd $3, %xmm0, %ecx
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    xorl %edx, %edx
-; AVX-NEXT:    divl %ecx
-; AVX-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm0
+; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; XOP-LABEL: combine_vec_udiv_zero:
 ; XOP:       # %bb.0:
-; XOP-NEXT:    vpextrd $1, %xmm0, %ecx
-; XOP-NEXT:    xorl %eax, %eax
-; XOP-NEXT:    xorl %edx, %edx
-; XOP-NEXT:    divl %ecx
-; XOP-NEXT:    movl %eax, %ecx
-; XOP-NEXT:    vmovd %xmm0, %esi
-; XOP-NEXT:    xorl %eax, %eax
-; XOP-NEXT:    xorl %edx, %edx
-; XOP-NEXT:    divl %esi
-; XOP-NEXT:    vmovd %eax, %xmm1
-; XOP-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
-; XOP-NEXT:    vpextrd $2, %xmm0, %ecx
-; XOP-NEXT:    xorl %eax, %eax
-; XOP-NEXT:    xorl %edx, %edx
-; XOP-NEXT:    divl %ecx
-; XOP-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
-; XOP-NEXT:    vpextrd $3, %xmm0, %ecx
-; XOP-NEXT:    xorl %eax, %eax
-; XOP-NEXT:    xorl %edx, %edx
-; XOP-NEXT:    divl %ecx
-; XOP-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm0
+; XOP-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; XOP-NEXT:    retq
   %1 = udiv <4 x i32> zeroinitializer, %x
   ret <4 x i32> %1
diff --git a/test/CodeGen/X86/combine-urem.ll b/test/CodeGen/X86/combine-urem.ll
index 5629a53fd23..b21ed8ec60c 100644
--- a/test/CodeGen/X86/combine-urem.ll
+++ b/test/CodeGen/X86/combine-urem.ll
@@ -89,14 +89,11 @@ define <4 x i32> @combine_vec_urem_by_minsigned(<4 x i32> %x) {
   ret <4 x i32> %1
 }
 
-; TODO fold (urem 0, x) -> 0
+; fold (urem 0, x) -> 0
 define i32 @combine_urem_zero(i32 %x) {
 ; CHECK-LABEL: combine_urem_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    xorl %edx, %edx
-; CHECK-NEXT:    divl %edi
-; CHECK-NEXT:    movl %edx, %eax
 ; CHECK-NEXT:    retq
   %1 = urem i32 0, %x
   ret i32 %1
@@ -105,53 +102,12 @@ define i32 @combine_urem_zero(i32 %x) {
 define <4 x i32> @combine_vec_urem_zero(<4 x i32> %x) {
 ; SSE-LABEL: combine_vec_urem_zero:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pextrd $1, %xmm0, %ecx
-; SSE-NEXT:    xorl %eax, %eax
-; SSE-NEXT:    xorl %edx, %edx
-; SSE-NEXT:    divl %ecx
-; SSE-NEXT:    movl %edx, %ecx
-; SSE-NEXT:    movd %xmm0, %esi
-; SSE-NEXT:    xorl %eax, %eax
-; SSE-NEXT:    xorl %edx, %edx
-; SSE-NEXT:    divl %esi
-; SSE-NEXT:    movd %edx, %xmm1
-; SSE-NEXT:    pinsrd $1, %ecx, %xmm1
-; SSE-NEXT:    pextrd $2, %xmm0, %ecx
-; SSE-NEXT:    xorl %eax, %eax
-; SSE-NEXT:    xorl %edx, %edx
-; SSE-NEXT:    divl %ecx
-; SSE-NEXT:    pinsrd $2, %edx, %xmm1
-; SSE-NEXT:    pextrd $3, %xmm0, %ecx
-; SSE-NEXT:    xorl %eax, %eax
-; SSE-NEXT:    xorl %edx, %edx
-; SSE-NEXT:    divl %ecx
-; SSE-NEXT:    pinsrd $3, %edx, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    xorps %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_urem_zero:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpextrd $1, %xmm0, %ecx
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    xorl %edx, %edx
-; AVX-NEXT:    divl %ecx
-; AVX-NEXT:    movl %edx, %ecx
-; AVX-NEXT:    vmovd %xmm0, %esi
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    xorl %edx, %edx
-; AVX-NEXT:    divl %esi
-; AVX-NEXT:    vmovd %edx, %xmm1
-; AVX-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
-; AVX-NEXT:    vpextrd $2, %xmm0, %ecx
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    xorl %edx, %edx
-; AVX-NEXT:    divl %ecx
-; AVX-NEXT:    vpinsrd $2, %edx, %xmm1, %xmm1
-; AVX-NEXT:    vpextrd $3, %xmm0, %ecx
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    xorl %edx, %edx
-; AVX-NEXT:    divl %ecx
-; AVX-NEXT:    vpinsrd $3, %edx, %xmm1, %xmm0
+; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %1 = urem <4 x i32> zeroinitializer, %x
   ret <4 x i32> %1
-- 
GitLab


From ea1119a1f5889a6180b6b3e6c3a1e52127064dda Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Wed, 31 Oct 2018 14:29:21 +0000
Subject: [PATCH 0801/1116] [InstSimplify] add tests for fcmp and known
 positive; NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345722 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstSimplify/floating-point-compare.ll    | 120 +++++++++++++++---
 1 file changed, 100 insertions(+), 20 deletions(-)

diff --git a/test/Transforms/InstSimplify/floating-point-compare.ll b/test/Transforms/InstSimplify/floating-point-compare.ll
index a3c17b1fea0..c6240e06e43 100644
--- a/test/Transforms/InstSimplify/floating-point-compare.ll
+++ b/test/Transforms/InstSimplify/floating-point-compare.ll
@@ -234,44 +234,84 @@ define i1 @orderedLessZeroPowi(double,double) {
   ret i1 %olt
 }
 
-define i1 @orderedLessZeroUIToFP(i32 %x) {
-; CHECK-LABEL: @orderedLessZeroUIToFP(
+define i1 @UIToFP_is_nan_or_positive_or_zero(i32 %x) {
+; CHECK-LABEL: @UIToFP_is_nan_or_positive_or_zero(
 ; CHECK-NEXT:    ret i1 true
 ;
   %a = uitofp i32 %x to float
-  %uge = fcmp uge float %a, 0.000000e+00
-  ret i1 %uge
+  %r = fcmp uge float %a, 0.000000e+00
+  ret i1 %r
 }
 
-define <2 x i1> @orderedLessZeroUIToFP_vec(<2 x i32> %x) {
-; CHECK-LABEL: @orderedLessZeroUIToFP_vec(
+define <2 x i1> @UIToFP_is_nan_or_positive_or_zero_vec(<2 x i32> %x) {
+; CHECK-LABEL: @UIToFP_is_nan_or_positive_or_zero_vec(
 ; CHECK-NEXT:    ret <2 x i1> <i1 true, i1 true>
 ;
   %a = uitofp <2 x i32> %x to <2 x float>
-  %uge = fcmp uge <2 x float> %a, zeroinitializer
-  ret <2 x i1> %uge
+  %r = fcmp uge <2 x float> %a, zeroinitializer
+  ret <2 x i1> %r
 }
 
-define i1 @orderedLessZeroUIToFP_nnan(i32 %x) {
-; CHECK-LABEL: @orderedLessZeroUIToFP_nnan(
+define i1 @UIToFP_nnan_is_positive_or_zero(i32 %x) {
+; CHECK-LABEL: @UIToFP_nnan_is_positive_or_zero(
 ; CHECK-NEXT:    [[A:%.*]] = uitofp i32 [[X:%.*]] to float
-; CHECK-NEXT:    [[UGE:%.*]] = fcmp nnan oge float [[A]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[UGE]]
+; CHECK-NEXT:    [[R:%.*]] = fcmp nnan oge float [[A]], 0.000000e+00
+; CHECK-NEXT:    ret i1 [[R]]
 ;
   %a = uitofp i32 %x to float
-  %uge = fcmp nnan oge float %a, 0.000000e+00
-  ret i1 %uge
+  %r = fcmp nnan oge float %a, 0.000000e+00
+  ret i1 %r
 }
 
-define <2 x i1> @orderedLessZeroUIToFP_nnan_vec(<2 x i32> %x) {
-; CHECK-LABEL: @orderedLessZeroUIToFP_nnan_vec(
+define <2 x i1> @UIToFP_nnan_is_positive_or_zero_vec(<2 x i32> %x) {
+; CHECK-LABEL: @UIToFP_nnan_is_positive_or_zero_vec(
 ; CHECK-NEXT:    [[A:%.*]] = uitofp <2 x i32> [[X:%.*]] to <2 x float>
-; CHECK-NEXT:    [[UGE:%.*]] = fcmp nnan oge <2 x float> [[A]], zeroinitializer
-; CHECK-NEXT:    ret <2 x i1> [[UGE]]
+; CHECK-NEXT:    [[R:%.*]] = fcmp nnan oge <2 x float> [[A]], zeroinitializer
+; CHECK-NEXT:    ret <2 x i1> [[R]]
+;
+  %a = uitofp <2 x i32> %x to <2 x float>
+  %r = fcmp nnan oge <2 x float> %a, zeroinitializer
+  ret <2 x i1> %r
+}
+
+define i1 @UIToFP_is_not_negative(i32 %x) {
+; CHECK-LABEL: @UIToFP_is_not_negative(
+; CHECK-NEXT:    ret i1 false
+;
+  %a = uitofp i32 %x to float
+  %r = fcmp olt float %a, 0.000000e+00
+  ret i1 %r
+}
+
+define <2 x i1> @UIToFP_is_not_negative_vec(<2 x i32> %x) {
+; CHECK-LABEL: @UIToFP_is_not_negative_vec(
+; CHECK-NEXT:    ret <2 x i1> zeroinitializer
 ;
   %a = uitofp <2 x i32> %x to <2 x float>
-  %uge = fcmp nnan oge <2 x float> %a, zeroinitializer
-  ret <2 x i1> %uge
+  %r = fcmp olt <2 x float> %a, zeroinitializer
+  ret <2 x i1> %r
+}
+
+define i1 @UIToFP_nnan_is_not_negative(i32 %x) {
+; CHECK-LABEL: @UIToFP_nnan_is_not_negative(
+; CHECK-NEXT:    [[A:%.*]] = uitofp i32 [[X:%.*]] to float
+; CHECK-NEXT:    [[R:%.*]] = fcmp nnan ult float [[A]], 0.000000e+00
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %a = uitofp i32 %x to float
+  %r = fcmp nnan ult float %a, 0.000000e+00
+  ret i1 %r
+}
+
+define <2 x i1> @UIToFP_nnan_is_not_negative_vec(<2 x i32> %x) {
+; CHECK-LABEL: @UIToFP_nnan_is_not_negative_vec(
+; CHECK-NEXT:    [[A:%.*]] = uitofp <2 x i32> [[X:%.*]] to <2 x float>
+; CHECK-NEXT:    [[R:%.*]] = fcmp nnan ult <2 x float> [[A]], zeroinitializer
+; CHECK-NEXT:    ret <2 x i1> [[R]]
+;
+  %a = uitofp <2 x i32> %x to <2 x float>
+  %r = fcmp nnan ult <2 x float> %a, zeroinitializer
+  ret <2 x i1> %r
 }
 
 define i1 @fabs_is_nan_or_positive_or_zero(double %x) {
@@ -314,6 +354,46 @@ define <2 x i1> @fabs_nnan_is_positive_or_zero_vec(<2 x double> %x) {
   ret <2 x i1> %cmp
 }
 
+define i1 @fabs_is_not_negative(double %x) {
+; CHECK-LABEL: @fabs_is_not_negative(
+; CHECK-NEXT:    ret i1 false
+;
+  %fabs = tail call double @llvm.fabs.f64(double %x)
+  %cmp = fcmp olt double %fabs, 0.0
+  ret i1 %cmp
+}
+
+define <2 x i1> @fabs_is_not_negative_vec(<2 x double> %x) {
+; CHECK-LABEL: @fabs_is_not_negative_vec(
+; CHECK-NEXT:    ret <2 x i1> zeroinitializer
+;
+  %fabs = tail call <2 x double> @llvm.fabs.v2f64(<2 x double> %x)
+  %cmp = fcmp olt <2 x double> %fabs, zeroinitializer
+  ret <2 x i1> %cmp
+}
+
+define i1 @fabs_nnan_is_not_negative(double %x) {
+; CHECK-LABEL: @fabs_nnan_is_not_negative(
+; CHECK-NEXT:    [[FABS:%.*]] = tail call double @llvm.fabs.f64(double [[X:%.*]])
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp nnan ult double [[FABS]], 0.000000e+00
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %fabs = tail call double @llvm.fabs.f64(double %x)
+  %cmp = fcmp nnan ult double %fabs, 0.0
+  ret i1 %cmp
+}
+
+define <2 x i1> @fabs_nnan_is_not_negative_vec(<2 x double> %x) {
+; CHECK-LABEL: @fabs_nnan_is_not_negative_vec(
+; CHECK-NEXT:    [[FABS:%.*]] = tail call <2 x double> @llvm.fabs.v2f64(<2 x double> [[X:%.*]])
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp nnan ult <2 x double> [[FABS]], zeroinitializer
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
+;
+  %fabs = tail call <2 x double> @llvm.fabs.v2f64(<2 x double> %x)
+  %cmp = fcmp nnan ult <2 x double> %fabs, zeroinitializer
+  ret <2 x i1> %cmp
+}
+
 define i1 @orderedLessZeroSelect(float, float) {
 ; CHECK-LABEL: @orderedLessZeroSelect(
 ; CHECK-NEXT:    ret i1 true
-- 
GitLab


From 2f9c8a0c1d8efc294e83bf8df8e12aa00a748f8a Mon Sep 17 00:00:00 2001
From: Fedor Sergeev <fedor.sergeev@azul.com>
Date: Wed, 31 Oct 2018 14:33:14 +0000
Subject: [PATCH 0802/1116] [LoopUnroll] allow customization for
 new-pass-manager version of LoopUnroll

Unlike its legacy counterpart new pass manager's LoopUnrollPass does
not provide any means to select which flavors of unroll to run
(runtime, peeling, partial), relying on global defaults.

In some cases having ability to run a restricted LoopUnroll that
does more than LoopFullUnroll is needed.

Introduced LoopUnrollOptions to select optional unroll behaviors.
Added 'unroll<peeling>' to PassRegistry mainly for the sake of testing.

Reviewers: chandlerc, tejohnson
Differential Revision: https://reviews.llvm.org/D53440

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345723 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../llvm/Transforms/Scalar/LoopUnrollPass.h   | 60 ++++++++++++++++++-
 lib/Passes/PassBuilder.cpp                    |  2 +-
 lib/Passes/PassRegistry.def                   |  1 +
 lib/Transforms/Scalar/LoopUnrollPass.cpp      | 21 +++----
 test/Transforms/LoopUnroll/peel-loop.ll       |  2 +
 test/Transforms/LoopUnroll/runtime-loop.ll    | 34 ++++++++++-
 6 files changed, 104 insertions(+), 16 deletions(-)

diff --git a/include/llvm/Transforms/Scalar/LoopUnrollPass.h b/include/llvm/Transforms/Scalar/LoopUnrollPass.h
index 9848e0d54f2..20c9a26b98c 100644
--- a/include/llvm/Transforms/Scalar/LoopUnrollPass.h
+++ b/include/llvm/Transforms/Scalar/LoopUnrollPass.h
@@ -10,6 +10,7 @@
 #ifndef LLVM_TRANSFORMS_SCALAR_LOOPUNROLLPASS_H
 #define LLVM_TRANSFORMS_SCALAR_LOOPUNROLLPASS_H
 
+#include "llvm/ADT/Optional.h"
 #include "llvm/Analysis/LoopAnalysisManager.h"
 #include "llvm/IR/PassManager.h"
 
@@ -30,16 +31,71 @@ public:
                         LoopStandardAnalysisResults &AR, LPMUpdater &U);
 };
 
+/// A set of parameters used to control various transforms performed by the
+/// LoopUnroll pass. Each of the boolean parameters can be set to:
+///      true - enabling the transformation.
+///      false - disabling the transformation.
+///      None - relying on a global default.
+///
+/// There is also OptLevel parameter, which is used for additional loop unroll
+/// tuning.
+///
+/// Intended use is to create a default object, modify parameters with
+/// additional setters and then pass it to LoopUnrollPass.
+///
+struct LoopUnrollOptions {
+  Optional<bool> AllowPartial;
+  Optional<bool> AllowPeeling;
+  Optional<bool> AllowRuntime;
+  Optional<bool> AllowUpperBound;
+  int OptLevel;
+
+  LoopUnrollOptions(int OptLevel = 2) : OptLevel(OptLevel) {}
+
+  /// Enables or disables partial unrolling. When disabled only full unrolling
+  /// is allowed.
+  LoopUnrollOptions &setPartial(bool Partial) {
+    AllowPartial = Partial;
+    return *this;
+  }
+
+  /// Enables or disables unrolling of loops with runtime trip count.
+  LoopUnrollOptions &setRuntime(bool Runtime) {
+    AllowRuntime = Runtime;
+    return *this;
+  }
+
+  /// Enables or disables loop peeling.
+  LoopUnrollOptions &setPeeling(bool Peeling) {
+    AllowPeeling = Peeling;
+    return *this;
+  }
+
+  /// Enables or disables the use of trip count upper bound
+  /// in loop unrolling.
+  LoopUnrollOptions &setUpperBound(bool UpperBound) {
+    AllowUpperBound = UpperBound;
+    return *this;
+  }
+
+  // Sets "optimization level" tuning parameter for loop unrolling.
+  LoopUnrollOptions &setOptLevel(int O) {
+    OptLevel = O;
+    return *this;
+  }
+};
+
 /// Loop unroll pass that will support both full and partial unrolling.
 /// It is a function pass to have access to function and module analyses.
 /// It will also put loops into canonical form (simplified and LCSSA).
 class LoopUnrollPass : public PassInfoMixin<LoopUnrollPass> {
-  const int OptLevel;
+  LoopUnrollOptions UnrollOpts;
 
 public:
   /// This uses the target information (or flags) to control the thresholds for
   /// different unrolling stategies but supports all of them.
-  explicit LoopUnrollPass(int OptLevel = 2) : OptLevel(OptLevel) {}
+  explicit LoopUnrollPass(LoopUnrollOptions UnrollOpts = {})
+      : UnrollOpts(UnrollOpts) {}
 
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
 };
diff --git a/lib/Passes/PassBuilder.cpp b/lib/Passes/PassBuilder.cpp
index c23c8c8d47a..0c6dfff06f1 100644
--- a/lib/Passes/PassBuilder.cpp
+++ b/lib/Passes/PassBuilder.cpp
@@ -830,7 +830,7 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
     OptimizePM.addPass(
         createFunctionToLoopPassAdaptor(LoopUnrollAndJamPass(Level)));
   }
-  OptimizePM.addPass(LoopUnrollPass(Level));
+  OptimizePM.addPass(LoopUnrollPass(LoopUnrollOptions(Level)));
   OptimizePM.addPass(InstCombinePass());
   OptimizePM.addPass(RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
   OptimizePM.addPass(createFunctionToLoopPassAdaptor(LICMPass(), DebugLogging));
diff --git a/lib/Passes/PassRegistry.def b/lib/Passes/PassRegistry.def
index 8de4541a772..99df2ad2719 100644
--- a/lib/Passes/PassRegistry.def
+++ b/lib/Passes/PassRegistry.def
@@ -215,6 +215,7 @@ FUNCTION_PASS("sroa", SROA())
 FUNCTION_PASS("tailcallelim", TailCallElimPass())
 FUNCTION_PASS("unreachableblockelim", UnreachableBlockElimPass())
 FUNCTION_PASS("unroll", LoopUnrollPass())
+FUNCTION_PASS("unroll<peeling;no-runtime>",LoopUnrollPass(LoopUnrollOptions().setPeeling(true).setRuntime(false)))
 FUNCTION_PASS("verify", VerifierPass())
 FUNCTION_PASS("verify<domtree>", DominatorTreeVerifierPass())
 FUNCTION_PASS("verify<loops>", LoopVerifierPass())
diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 34d2b2a8b27..d10dae124a7 100644
--- a/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -1333,23 +1333,20 @@ PreservedAnalyses LoopUnrollPass::run(Function &F,
     Loop *ParentL = L.getParentLoop();
 #endif
 
-    // The API here is quite complex to call, but there are only two interesting
-    // states we support: partial and full (or "simple") unrolling. However, to
-    // enable these things we actually pass "None" in for the optional to avoid
-    // providing an explicit choice.
-    Optional<bool> AllowPartialParam, RuntimeParam, UpperBoundParam,
-        AllowPeeling;
     // Check if the profile summary indicates that the profiled application
     // has a huge working set size, in which case we disable peeling to avoid
     // bloating it further.
+    Optional<bool> LocalAllowPeeling = UnrollOpts.AllowPeeling;
     if (PSI && PSI->hasHugeWorkingSetSize())
-      AllowPeeling = false;
+      LocalAllowPeeling = false;
     std::string LoopName = L.getName();
-    LoopUnrollResult Result =
-        tryToUnrollLoop(&L, DT, &LI, SE, TTI, AC, ORE,
-                        /*PreserveLCSSA*/ true, OptLevel, /*Count*/ None,
-                        /*Threshold*/ None, AllowPartialParam, RuntimeParam,
-                        UpperBoundParam, AllowPeeling);
+    // The API here is quite complex to call and we allow to select some
+    // flavors of unrolling during construction time (by setting UnrollOpts).
+    LoopUnrollResult Result = tryToUnrollLoop(
+        &L, DT, &LI, SE, TTI, AC, ORE,
+        /*PreserveLCSSA*/ true, UnrollOpts.OptLevel, /*Count*/ None,
+        /*Threshold*/ None, UnrollOpts.AllowPartial, UnrollOpts.AllowRuntime,
+        UnrollOpts.AllowUpperBound, LocalAllowPeeling);
     Changed |= Result != LoopUnrollResult::Unmodified;
 
     // The parent must not be damaged by unrolling!
diff --git a/test/Transforms/LoopUnroll/peel-loop.ll b/test/Transforms/LoopUnroll/peel-loop.ll
index d535414b3eb..eb3d29cb494 100644
--- a/test/Transforms/LoopUnroll/peel-loop.ll
+++ b/test/Transforms/LoopUnroll/peel-loop.ll
@@ -1,4 +1,6 @@
 ; RUN: opt < %s -S -loop-unroll -unroll-force-peel-count=3 -verify-dom-info -simplifycfg -instcombine | FileCheck %s
+; RUN: opt < %s -S -passes='require<opt-remark-emit>,unroll,simplify-cfg,instcombine' -unroll-force-peel-count=3 -verify-dom-info | FileCheck %s
+; RUN: opt < %s -S -passes='require<opt-remark-emit>,unroll<peeling;no-runtime>,simplify-cfg,instcombine' -unroll-force-peel-count=3 -verify-dom-info | FileCheck %s
 
 ; Basic loop peeling - check that we can peel-off the first 3 loop iterations
 ; when explicitly requested.
diff --git a/test/Transforms/LoopUnroll/runtime-loop.ll b/test/Transforms/LoopUnroll/runtime-loop.ll
index 34eaa4ec333..19072855d25 100644
--- a/test/Transforms/LoopUnroll/runtime-loop.ll
+++ b/test/Transforms/LoopUnroll/runtime-loop.ll
@@ -1,8 +1,16 @@
 ; RUN: opt < %s -S -loop-unroll -unroll-runtime=true -unroll-runtime-epilog=true  | FileCheck %s -check-prefixes=EPILOG,COMMON
 ; RUN: opt < %s -S -loop-unroll -unroll-runtime=true -unroll-runtime-epilog=false | FileCheck %s -check-prefixes=PROLOG,COMMON
-
+;
 ; RUN: opt < %s -S -passes='require<opt-remark-emit>,unroll' -unroll-runtime=true -unroll-runtime-epilog=true  | FileCheck %s -check-prefixes=EPILOG,COMMON
 ; RUN: opt < %s -S -passes='require<opt-remark-emit>,unroll' -unroll-runtime=true -unroll-runtime-epilog=false | FileCheck %s -check-prefixes=PROLOG,COMMON
+;
+; Restricted versions of unroll (unroll<peeling;noruntime>, unroll-full) should not be doing runtime unrolling
+; even if it is globally enabled through -unroll-runtime option
+;
+; RUN: opt < %s -S -passes='require<opt-remark-emit>,unroll<peeling;no-runtime>' -unroll-runtime=true -unroll-runtime-epilog=true  | FileCheck %s -check-prefixes=NOEPILOG,COMMON
+; RUN: opt < %s -S -passes='require<opt-remark-emit>,unroll<peeling;no-runtime>' -unroll-runtime=true -unroll-runtime-epilog=false | FileCheck %s -check-prefixes=NOPROLOG,COMMON
+; RUN: opt < %s -S -passes='require<opt-remark-emit>,loop(unroll-full)' -unroll-runtime=true -unroll-runtime-epilog=true  | FileCheck %s -check-prefixes=NOEPILOG,COMMON
+; RUN: opt < %s -S -passes='require<opt-remark-emit>,loop(unroll-full)' -unroll-runtime=true -unroll-runtime-epilog=false | FileCheck %s -check-prefixes=NOPROLOG,COMMON
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
@@ -14,22 +22,32 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ; EPILOG:  %lcmp.mod = icmp ne i32 %xtraiter, 0
 ; EPILOG:  br i1 %lcmp.mod, label %for.body.epil.preheader, label %for.end.loopexit
 
+; NOEPILOG-NOT: %xtraiter = and i32 %n
+
 ; PROLOG: %xtraiter = and i32 %n
 ; PROLOG:  %lcmp.mod = icmp ne i32 %xtraiter, 0
 ; PROLOG:  br i1 %lcmp.mod, label %for.body.prol.preheader, label %for.body.prol.loopexit
 
+; NOPROLOG-NOT: %xtraiter = and i32 %n
+
 ; EPILOG: for.body.epil:
 ; EPILOG: %indvars.iv.epil = phi i64 [ %indvars.iv.next.epil, %for.body.epil ],  [ %indvars.iv.unr, %for.body.epil.preheader ]
 ; EPILOG:  %epil.iter.sub = sub i32 %epil.iter, 1
 ; EPILOG:  %epil.iter.cmp = icmp ne i32 %epil.iter.sub, 0
 ; EPILOG:  br i1 %epil.iter.cmp, label %for.body.epil, label %for.end.loopexit.epilog-lcssa, !llvm.loop !0
 
+; NOEPILOG: for.body:
+; NOEPILOG-NOT: for.body.epil:
+
 ; PROLOG: for.body.prol:
 ; PROLOG: %indvars.iv.prol = phi i64 [ %indvars.iv.next.prol, %for.body.prol ], [ 0, %for.body.prol.preheader ]
 ; PROLOG:  %prol.iter.sub = sub i32 %prol.iter, 1
 ; PROLOG:  %prol.iter.cmp = icmp ne i32 %prol.iter.sub, 0
 ; PROLOG:  br i1 %prol.iter.cmp, label %for.body.prol, label %for.body.prol.loopexit.unr-lcssa, !llvm.loop !0
 
+; NOPROLOG: for.body:
+; NOPROLOG-NOT: for.body.prol:
+
 
 define i32 @test(i32* nocapture %a, i32 %n) nounwind uwtable readonly {
 entry:
@@ -86,6 +104,8 @@ for.end:                                          ; preds = %for.body
 ; COMMON-LABEL: @foo(
 ; EPILOG: bb72.2:
 ; PROLOG: bb72.2:
+; NOEPILOG-NOT: bb72.2:
+; NOPROLOG-NOT: bb72.2:
 
 define void @foo(i32 %trips) {
 entry:
@@ -111,9 +131,15 @@ cond_true138:
 ; EPILOG: for.body.epil:
 ; EPILOG: br i1 %epil.iter.cmp, label %for.body.epil, label %for.cond.for.end_crit_edge.epilog-lcssa
 
+; NOEPILOG: for.body:
+; NOEPILOG-NOT: for.body.epil:
+
 ; PROLOG: for.body.prol:
 ; PROLOG: br i1 %prol.iter.cmp, label %for.body.prol, label %for.body.prol.loopexit
 
+; NOPROLOG: for.body:
+; NOPROLOG-NOT: for.body.prol:
+
 define zeroext i16 @down(i16* nocapture %p, i32 %len) nounwind uwtable readonly {
 entry:
   %cmp2 = icmp eq i32 %len, 0
@@ -146,9 +172,15 @@ for.end:                                          ; preds = %for.cond.for.end_cr
 ; EPILOG: for.body:
 ; EPILOG-NOT: for.body.epil:
 
+; NOEPILOG: for.body:
+; NOEPILOG-NOT: for.body.epil:
+
 ; PROLOG: for.body:
 ; PROLOG-NOT: for.body.prol:
 
+; NOPROLOG: for.body:
+; NOPROLOG-NOT: for.body.prol:
+
 define zeroext i16 @test2(i16* nocapture %p, i32 %len) nounwind uwtable readonly {
 entry:
   %cmp2 = icmp eq i32 %len, 0
-- 
GitLab


From 1ef057dce8d1d8e69d26724202d8ccd5ed6fd093 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Wed, 31 Oct 2018 14:57:23 +0000
Subject: [PATCH 0803/1116] [InstSimplify] fold 'fcmp nnan oge X, 0.0' when X
 is not negative

This re-raises some of the open questions about how to apply and use fast-math-flags in IR from PR38086:
https://bugs.llvm.org/show_bug.cgi?id=38086
...but given the current implementation (no FMF on casts), this is likely the only way to predicate the
transform.

This is part of solving PR39475:
https://bugs.llvm.org/show_bug.cgi?id=39475

Differential Revision: https://reviews.llvm.org/D53874


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345725 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Analysis/InstructionSimplify.cpp             |  4 ++++
 .../InstCombine/InstCombineCompares.cpp          |  5 +++--
 .../InstSimplify/floating-point-compare.ll       | 16 ++++------------
 3 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp
index b1381932e7f..efe88507aef 100644
--- a/lib/Analysis/InstructionSimplify.cpp
+++ b/lib/Analysis/InstructionSimplify.cpp
@@ -3612,6 +3612,10 @@ static Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
     }
     if (C->isZero()) {
       switch (Pred) {
+      case FCmpInst::FCMP_OGE:
+        if (FMF.noNaNs() && CannotBeOrderedLessThanZero(LHS, Q.TLI))
+          return getTrue(RetTy);
+        break;
       case FCmpInst::FCMP_UGE:
         if (CannotBeOrderedLessThanZero(LHS, Q.TLI))
           return getTrue(RetTy);
diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index d7af4b88a81..d3702a4885c 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -5433,8 +5433,8 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
         break;
       // fabs(x) < 0 --> false
       case FCmpInst::FCMP_OLT:
-        llvm_unreachable("handled by SimplifyFCmpInst");
-      // fabs(x) > 0 --> x != 0
+        llvm_unreachable("fcmp should have simplified");
+          // fabs(x) > 0 --> x != 0
       case FCmpInst::FCMP_OGT:
         return new FCmpInst(FCmpInst::FCMP_ONE, CI->getArgOperand(0), RHSC);
       // fabs(x) <= 0 --> x == 0
@@ -5442,6 +5442,7 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
         return new FCmpInst(FCmpInst::FCMP_OEQ, CI->getArgOperand(0), RHSC);
       // fabs(x) >= 0 --> !isnan(x)
       case FCmpInst::FCMP_OGE:
+        assert(!I.hasNoNaNs() && "fcmp should have simplified");
         return new FCmpInst(FCmpInst::FCMP_ORD, CI->getArgOperand(0), RHSC);
       // fabs(x) == 0 --> x == 0
       // fabs(x) != 0 --> x != 0
diff --git a/test/Transforms/InstSimplify/floating-point-compare.ll b/test/Transforms/InstSimplify/floating-point-compare.ll
index c6240e06e43..7df803edc68 100644
--- a/test/Transforms/InstSimplify/floating-point-compare.ll
+++ b/test/Transforms/InstSimplify/floating-point-compare.ll
@@ -254,9 +254,7 @@ define <2 x i1> @UIToFP_is_nan_or_positive_or_zero_vec(<2 x i32> %x) {
 
 define i1 @UIToFP_nnan_is_positive_or_zero(i32 %x) {
 ; CHECK-LABEL: @UIToFP_nnan_is_positive_or_zero(
-; CHECK-NEXT:    [[A:%.*]] = uitofp i32 [[X:%.*]] to float
-; CHECK-NEXT:    [[R:%.*]] = fcmp nnan oge float [[A]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    ret i1 true
 ;
   %a = uitofp i32 %x to float
   %r = fcmp nnan oge float %a, 0.000000e+00
@@ -265,9 +263,7 @@ define i1 @UIToFP_nnan_is_positive_or_zero(i32 %x) {
 
 define <2 x i1> @UIToFP_nnan_is_positive_or_zero_vec(<2 x i32> %x) {
 ; CHECK-LABEL: @UIToFP_nnan_is_positive_or_zero_vec(
-; CHECK-NEXT:    [[A:%.*]] = uitofp <2 x i32> [[X:%.*]] to <2 x float>
-; CHECK-NEXT:    [[R:%.*]] = fcmp nnan oge <2 x float> [[A]], zeroinitializer
-; CHECK-NEXT:    ret <2 x i1> [[R]]
+; CHECK-NEXT:    ret <2 x i1> <i1 true, i1 true>
 ;
   %a = uitofp <2 x i32> %x to <2 x float>
   %r = fcmp nnan oge <2 x float> %a, zeroinitializer
@@ -334,9 +330,7 @@ define <2 x i1> @fabs_is_nan_or_positive_or_zero_vec(<2 x double> %x) {
 
 define i1 @fabs_nnan_is_positive_or_zero(double %x) {
 ; CHECK-LABEL: @fabs_nnan_is_positive_or_zero(
-; CHECK-NEXT:    [[FABS:%.*]] = tail call double @llvm.fabs.f64(double [[X:%.*]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp nnan oge double [[FABS]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %fabs = tail call double @llvm.fabs.f64(double %x)
   %cmp = fcmp nnan oge double %fabs, 0.0
@@ -345,9 +339,7 @@ define i1 @fabs_nnan_is_positive_or_zero(double %x) {
 
 define <2 x i1> @fabs_nnan_is_positive_or_zero_vec(<2 x double> %x) {
 ; CHECK-LABEL: @fabs_nnan_is_positive_or_zero_vec(
-; CHECK-NEXT:    [[FABS:%.*]] = tail call <2 x double> @llvm.fabs.v2f64(<2 x double> [[X:%.*]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp nnan oge <2 x double> [[FABS]], zeroinitializer
-; CHECK-NEXT:    ret <2 x i1> [[CMP]]
+; CHECK-NEXT:    ret <2 x i1> <i1 true, i1 true>
 ;
   %fabs = tail call <2 x double> @llvm.fabs.v2f64(<2 x double> %x)
   %cmp = fcmp nnan oge <2 x double> %fabs, zeroinitializer
-- 
GitLab


From 123a45feb6685748c5f67029a3f80a332b88bae1 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Wed, 31 Oct 2018 15:31:45 +0000
Subject: [PATCH 0804/1116] [InstCombine] add assertion that InstSimplify has
 folded a fabs+fcmp; NFC

The 'OLT' case was updated at rL266175, so I assume it was just an
oversight that 'UGE' was not included because that patch handled
both predicates in InstSimplify.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345727 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/InstCombine/InstCombineCompares.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index d3702a4885c..1ad648fe783 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -5431,10 +5431,13 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
       switch (Pred) {
       default:
         break;
-      // fabs(x) < 0 --> false
+      case FCmpInst::FCMP_UGE:
       case FCmpInst::FCMP_OLT:
+        // fabs(x) >= 0.0 --> true
+        // fabs(x) <  0.0 --> false
         llvm_unreachable("fcmp should have simplified");
-          // fabs(x) > 0 --> x != 0
+
+      // fabs(x) > 0 --> x != 0
       case FCmpInst::FCMP_OGT:
         return new FCmpInst(FCmpInst::FCMP_ONE, CI->getArgOperand(0), RHSC);
       // fabs(x) <= 0 --> x == 0
-- 
GitLab


From 5406c80642f9234bb8f15c44b3ccf1cbddba59df Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Wed, 31 Oct 2018 15:35:46 +0000
Subject: [PATCH 0805/1116] [InstSimplify] fold 'fcmp nnan ult X, 0.0' when X
 is not negative

This is the inverted case for the transform added with D53874 / rL345725.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345728 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Analysis/InstructionSimplify.cpp             |  5 ++++-
 .../InstSimplify/floating-point-compare.ll       | 16 ++++------------
 2 files changed, 8 insertions(+), 13 deletions(-)

diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp
index efe88507aef..8571dc2cf2f 100644
--- a/lib/Analysis/InstructionSimplify.cpp
+++ b/lib/Analysis/InstructionSimplify.cpp
@@ -3620,8 +3620,11 @@ static Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
         if (CannotBeOrderedLessThanZero(LHS, Q.TLI))
           return getTrue(RetTy);
         break;
+      case FCmpInst::FCMP_ULT:
+        if (FMF.noNaNs() && CannotBeOrderedLessThanZero(LHS, Q.TLI))
+          return getFalse(RetTy);
+        break;
       case FCmpInst::FCMP_OLT:
-        // X < 0
         if (CannotBeOrderedLessThanZero(LHS, Q.TLI))
           return getFalse(RetTy);
         break;
diff --git a/test/Transforms/InstSimplify/floating-point-compare.ll b/test/Transforms/InstSimplify/floating-point-compare.ll
index 7df803edc68..14e6ccee7b2 100644
--- a/test/Transforms/InstSimplify/floating-point-compare.ll
+++ b/test/Transforms/InstSimplify/floating-point-compare.ll
@@ -290,9 +290,7 @@ define <2 x i1> @UIToFP_is_not_negative_vec(<2 x i32> %x) {
 
 define i1 @UIToFP_nnan_is_not_negative(i32 %x) {
 ; CHECK-LABEL: @UIToFP_nnan_is_not_negative(
-; CHECK-NEXT:    [[A:%.*]] = uitofp i32 [[X:%.*]] to float
-; CHECK-NEXT:    [[R:%.*]] = fcmp nnan ult float [[A]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    ret i1 false
 ;
   %a = uitofp i32 %x to float
   %r = fcmp nnan ult float %a, 0.000000e+00
@@ -301,9 +299,7 @@ define i1 @UIToFP_nnan_is_not_negative(i32 %x) {
 
 define <2 x i1> @UIToFP_nnan_is_not_negative_vec(<2 x i32> %x) {
 ; CHECK-LABEL: @UIToFP_nnan_is_not_negative_vec(
-; CHECK-NEXT:    [[A:%.*]] = uitofp <2 x i32> [[X:%.*]] to <2 x float>
-; CHECK-NEXT:    [[R:%.*]] = fcmp nnan ult <2 x float> [[A]], zeroinitializer
-; CHECK-NEXT:    ret <2 x i1> [[R]]
+; CHECK-NEXT:    ret <2 x i1> zeroinitializer
 ;
   %a = uitofp <2 x i32> %x to <2 x float>
   %r = fcmp nnan ult <2 x float> %a, zeroinitializer
@@ -366,9 +362,7 @@ define <2 x i1> @fabs_is_not_negative_vec(<2 x double> %x) {
 
 define i1 @fabs_nnan_is_not_negative(double %x) {
 ; CHECK-LABEL: @fabs_nnan_is_not_negative(
-; CHECK-NEXT:    [[FABS:%.*]] = tail call double @llvm.fabs.f64(double [[X:%.*]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp nnan ult double [[FABS]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %fabs = tail call double @llvm.fabs.f64(double %x)
   %cmp = fcmp nnan ult double %fabs, 0.0
@@ -377,9 +371,7 @@ define i1 @fabs_nnan_is_not_negative(double %x) {
 
 define <2 x i1> @fabs_nnan_is_not_negative_vec(<2 x double> %x) {
 ; CHECK-LABEL: @fabs_nnan_is_not_negative_vec(
-; CHECK-NEXT:    [[FABS:%.*]] = tail call <2 x double> @llvm.fabs.v2f64(<2 x double> [[X:%.*]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp nnan ult <2 x double> [[FABS]], zeroinitializer
-; CHECK-NEXT:    ret <2 x i1> [[CMP]]
+; CHECK-NEXT:    ret <2 x i1> zeroinitializer
 ;
   %fabs = tail call <2 x double> @llvm.fabs.v2f64(<2 x double> %x)
   %cmp = fcmp nnan ult <2 x double> %fabs, zeroinitializer
-- 
GitLab


From d510dbfbdb1aa1df25f072b4730c20f493ce1c17 Mon Sep 17 00:00:00 2001
From: Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>
Date: Wed, 31 Oct 2018 15:53:28 +0000
Subject: [PATCH 0806/1116] [llvm-mca] Remove namespace prefixes made redundant
 by r345612. NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345730 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-mca/include/Context.h              |  9 ++---
 tools/llvm-mca/include/HWEventListener.h      | 19 +++++-----
 tools/llvm-mca/include/HardwareUnits/LSUnit.h |  6 +--
 .../include/HardwareUnits/RegisterFile.h      | 34 ++++++++---------
 .../include/HardwareUnits/ResourceManager.h   | 25 ++++++-------
 .../include/HardwareUnits/RetireControlUnit.h |  2 +-
 .../include/HardwareUnits/Scheduler.h         | 30 +++++++--------
 tools/llvm-mca/include/InstrBuilder.h         | 37 ++++++++-----------
 tools/llvm-mca/include/Instruction.h          | 24 ++++++------
 tools/llvm-mca/include/Pipeline.h             |  6 +--
 tools/llvm-mca/include/SourceMgr.h            |  6 +--
 tools/llvm-mca/include/Stages/DispatchStage.h | 21 +++++------
 tools/llvm-mca/include/Stages/ExecuteStage.h  | 12 +++---
 tools/llvm-mca/include/Stages/FetchStage.h    |  6 +--
 .../include/Stages/InstructionTables.h        | 10 ++---
 tools/llvm-mca/include/Stages/RetireStage.h   |  4 +-
 tools/llvm-mca/include/Stages/Stage.h         |  8 ++--
 tools/llvm-mca/include/Support.h              | 19 +++++-----
 18 files changed, 129 insertions(+), 149 deletions(-)

diff --git a/tools/llvm-mca/include/Context.h b/tools/llvm-mca/include/Context.h
index d383e2361be..ebd1528e371 100644
--- a/tools/llvm-mca/include/Context.h
+++ b/tools/llvm-mca/include/Context.h
@@ -43,13 +43,12 @@ struct PipelineOptions {
 };
 
 class Context {
-  llvm::SmallVector<std::unique_ptr<HardwareUnit>, 4> Hardware;
-  const llvm::MCRegisterInfo &MRI;
-  const llvm::MCSubtargetInfo &STI;
+  SmallVector<std::unique_ptr<HardwareUnit>, 4> Hardware;
+  const MCRegisterInfo &MRI;
+  const MCSubtargetInfo &STI;
 
 public:
-  Context(const llvm::MCRegisterInfo &R, const llvm::MCSubtargetInfo &S)
-      : MRI(R), STI(S) {}
+  Context(const MCRegisterInfo &R, const MCSubtargetInfo &S) : MRI(R), STI(S) {}
   Context(const Context &C) = delete;
   Context &operator=(const Context &C) = delete;
 
diff --git a/tools/llvm-mca/include/HWEventListener.h b/tools/llvm-mca/include/HWEventListener.h
index 81c76c5eb8d..0216fae7866 100644
--- a/tools/llvm-mca/include/HWEventListener.h
+++ b/tools/llvm-mca/include/HWEventListener.h
@@ -62,23 +62,22 @@ public:
 class HWInstructionIssuedEvent : public HWInstructionEvent {
 public:
   using ResourceRef = std::pair<uint64_t, uint64_t>;
-  HWInstructionIssuedEvent(
-      const InstRef &IR,
-      llvm::ArrayRef<std::pair<ResourceRef, ResourceCycles>> UR)
+  HWInstructionIssuedEvent(const InstRef &IR,
+                           ArrayRef<std::pair<ResourceRef, ResourceCycles>> UR)
       : HWInstructionEvent(HWInstructionEvent::Issued, IR), UsedResources(UR) {}
 
-  llvm::ArrayRef<std::pair<ResourceRef, ResourceCycles>> UsedResources;
+  ArrayRef<std::pair<ResourceRef, ResourceCycles>> UsedResources;
 };
 
 class HWInstructionDispatchedEvent : public HWInstructionEvent {
 public:
-  HWInstructionDispatchedEvent(const InstRef &IR, llvm::ArrayRef<unsigned> Regs,
+  HWInstructionDispatchedEvent(const InstRef &IR, ArrayRef<unsigned> Regs,
                                unsigned UOps)
       : HWInstructionEvent(HWInstructionEvent::Dispatched, IR),
         UsedPhysRegs(Regs), MicroOpcodes(UOps) {}
   // Number of physical register allocated for this instruction. There is one
   // entry per register file.
-  llvm::ArrayRef<unsigned> UsedPhysRegs;
+  ArrayRef<unsigned> UsedPhysRegs;
   // Number of micro opcodes dispatched.
   // This field is often set to the total number of micro-opcodes specified by
   // the instruction descriptor of IR.
@@ -93,12 +92,12 @@ public:
 
 class HWInstructionRetiredEvent : public HWInstructionEvent {
 public:
-  HWInstructionRetiredEvent(const InstRef &IR, llvm::ArrayRef<unsigned> Regs)
+  HWInstructionRetiredEvent(const InstRef &IR, ArrayRef<unsigned> Regs)
       : HWInstructionEvent(HWInstructionEvent::Retired, IR),
         FreedPhysRegs(Regs) {}
   // Number of register writes that have been architecturally committed. There
   // is one entry per register file.
-  llvm::ArrayRef<unsigned> FreedPhysRegs;
+  ArrayRef<unsigned> FreedPhysRegs;
 };
 
 // A HWStallEvent represents a pipeline stall caused by the lack of hardware
@@ -142,9 +141,9 @@ public:
   // Events generated by the Scheduler when buffered resources are
   // consumed/freed for an instruction.
   virtual void onReservedBuffers(const InstRef &Inst,
-                                 llvm::ArrayRef<unsigned> Buffers) {}
+                                 ArrayRef<unsigned> Buffers) {}
   virtual void onReleasedBuffers(const InstRef &Inst,
-                                 llvm::ArrayRef<unsigned> Buffers) {}
+                                 ArrayRef<unsigned> Buffers) {}
 
   virtual ~HWEventListener() {}
 
diff --git a/tools/llvm-mca/include/HardwareUnits/LSUnit.h b/tools/llvm-mca/include/HardwareUnits/LSUnit.h
index c979ac9cf82..6b36282ca72 100644
--- a/tools/llvm-mca/include/HardwareUnits/LSUnit.h
+++ b/tools/llvm-mca/include/HardwareUnits/LSUnit.h
@@ -129,11 +129,7 @@ public:
   void dump() const;
 #endif
 
-  enum Status {
-    LSU_AVAILABLE = 0,
-    LSU_LQUEUE_FULL,
-    LSU_SQUEUE_FULL
-  };
+  enum Status { LSU_AVAILABLE = 0, LSU_LQUEUE_FULL, LSU_SQUEUE_FULL };
 
   // Returns LSU_AVAILABLE if there are enough load/store queue entries to serve
   // IR. It also returns LSU_AVAILABLE if IR is not a memory operation.
diff --git a/tools/llvm-mca/include/HardwareUnits/RegisterFile.h b/tools/llvm-mca/include/HardwareUnits/RegisterFile.h
index 5a5543ebacd..1cca8b5294d 100644
--- a/tools/llvm-mca/include/HardwareUnits/RegisterFile.h
+++ b/tools/llvm-mca/include/HardwareUnits/RegisterFile.h
@@ -34,7 +34,7 @@ class WriteRef;
 /// Manages hardware register files, and tracks register definitions for
 /// register renaming purposes.
 class RegisterFile : public HardwareUnit {
-  const llvm::MCRegisterInfo &MRI;
+  const MCRegisterInfo &MRI;
 
   // class RegisterMappingTracker is a  physical register file (PRF) descriptor.
   // There is one RegisterMappingTracker for every PRF definition in the
@@ -85,7 +85,7 @@ class RegisterFile : public HardwareUnit {
   //
   // Users can limit the number of physical registers that are available in
   // regsiter file #0 specifying command line flag `-register-file-size=<uint>`.
-  llvm::SmallVector<RegisterMappingTracker, 4> RegisterFiles;
+  SmallVector<RegisterMappingTracker, 4> RegisterFiles;
 
   // This type is used to propagate information about the owner of a register,
   // and the cost of allocating it in the PRF. Register cost is defined as the
@@ -101,7 +101,7 @@ class RegisterFile : public HardwareUnit {
   //
   // There is a RegisterRenamingInfo object for every logical register defined
   // by the target. RegisteRenamingInfo objects are stored into vector
-  // `RegisterMappings`, and llvm::MCPhysReg IDs can be used to reference
+  // `RegisterMappings`, and MCPhysReg IDs can be used to reference
   // elements in that vector.
   //
   // Each RegisterRenamingInfo is owned by a PRF, and field `IndexPlusCost`
@@ -117,8 +117,8 @@ class RegisterFile : public HardwareUnit {
   // register definition.
   struct RegisterRenamingInfo {
     IndexPlusCostPairTy IndexPlusCost;
-    llvm::MCPhysReg RenameAs;
-    llvm::MCPhysReg AliasRegID;
+    MCPhysReg RenameAs;
+    MCPhysReg AliasRegID;
     bool AllowMoveElimination;
     RegisterRenamingInfo()
         : IndexPlusCost(std::make_pair(0U, 1U)), RenameAs(0U), AliasRegID(0U),
@@ -144,7 +144,7 @@ class RegisterFile : public HardwareUnit {
 
   // Used to track zero registers. There is one bit for each register defined by
   // the target. Bits are set for registers that are known to be zero.
-  llvm::APInt ZeroRegisters;
+  APInt ZeroRegisters;
 
   // This method creates a new register file descriptor.
   // The new register file owns all of the registers declared by register
@@ -160,41 +160,40 @@ class RegisterFile : public HardwareUnit {
   // Here FPRegisterFile contains all the registers defined by register class
   // VR128RegClass and VR256RegClass. FPRegisterFile implements 60
   // registers which can be used for register renaming purpose.
-  void addRegisterFile(const llvm::MCRegisterFileDesc &RF,
-                       llvm::ArrayRef<llvm::MCRegisterCostEntry> Entries);
+  void addRegisterFile(const MCRegisterFileDesc &RF,
+                       ArrayRef<MCRegisterCostEntry> Entries);
 
   // Consumes physical registers in each register file specified by the
   // `IndexPlusCostPairTy`. This method is called from `addRegisterMapping()`.
   void allocatePhysRegs(const RegisterRenamingInfo &Entry,
-                        llvm::MutableArrayRef<unsigned> UsedPhysRegs);
+                        MutableArrayRef<unsigned> UsedPhysRegs);
 
   // Releases previously allocated physical registers from the register file(s).
   // This method is called from `invalidateRegisterMapping()`.
   void freePhysRegs(const RegisterRenamingInfo &Entry,
-                    llvm::MutableArrayRef<unsigned> FreedPhysRegs);
+                    MutableArrayRef<unsigned> FreedPhysRegs);
 
   // Create an instance of RegisterMappingTracker for every register file
   // specified by the processor model.
   // If no register file is specified, then this method creates a default
   // register file with an unbounded number of physical registers.
-  void initialize(const llvm::MCSchedModel &SM, unsigned NumRegs);
+  void initialize(const MCSchedModel &SM, unsigned NumRegs);
 
 public:
-  RegisterFile(const llvm::MCSchedModel &SM, const llvm::MCRegisterInfo &mri,
+  RegisterFile(const MCSchedModel &SM, const MCRegisterInfo &mri,
                unsigned NumRegs = 0);
 
   // This method updates the register mappings inserting a new register
   // definition. This method is also responsible for updating the number of
   // allocated physical registers in each register file modified by the write.
   // No physical regiser is allocated if this write is from a zero-idiom.
-  void addRegisterWrite(WriteRef Write,
-                        llvm::MutableArrayRef<unsigned> UsedPhysRegs);
+  void addRegisterWrite(WriteRef Write, MutableArrayRef<unsigned> UsedPhysRegs);
 
   // Removes write \param WS from the register mappings.
   // Physical registers may be released to reflect this update.
   // No registers are released if this write is from a zero-idiom.
   void removeRegisterWrite(const WriteState &WS,
-                           llvm::MutableArrayRef<unsigned> FreedPhysRegs);
+                           MutableArrayRef<unsigned> FreedPhysRegs);
 
   // Returns true if a move from RS to WS can be eliminated.
   // On success, it updates WriteState by setting flag `WS.isEliminated`.
@@ -212,9 +211,8 @@ public:
   //
   // Current implementation can simulate up to 32 register files (including the
   // special register file at index #0).
-  unsigned isAvailable(llvm::ArrayRef<unsigned> Regs) const;
-  void collectWrites(llvm::SmallVectorImpl<WriteRef> &Writes,
-                     unsigned RegID) const;
+  unsigned isAvailable(ArrayRef<unsigned> Regs) const;
+  void collectWrites(SmallVectorImpl<WriteRef> &Writes, unsigned RegID) const;
   unsigned getNumRegisterFiles() const { return RegisterFiles.size(); }
 
   // Notify each PRF that a new cycle just started.
diff --git a/tools/llvm-mca/include/HardwareUnits/ResourceManager.h b/tools/llvm-mca/include/HardwareUnits/ResourceManager.h
index bf7c1e67115..065ead8f1a8 100644
--- a/tools/llvm-mca/include/HardwareUnits/ResourceManager.h
+++ b/tools/llvm-mca/include/HardwareUnits/ResourceManager.h
@@ -189,8 +189,7 @@ class ResourceState {
   }
 
 public:
-  ResourceState(const llvm::MCProcResourceDesc &Desc, unsigned Index,
-                uint64_t Mask);
+  ResourceState(const MCProcResourceDesc &Desc, unsigned Index, uint64_t Mask);
 
   unsigned getProcResourceID() const { return ProcResourceDescIndex; }
   uint64_t getResourceMask() const { return ResourceMask; }
@@ -211,9 +210,7 @@ public:
   /// `NumUnits` available units.
   bool isReady(unsigned NumUnits = 1) const;
 
-  bool isAResourceGroup() const {
-    return llvm::countPopulation(ResourceMask) > 1;
-  }
+  bool isAResourceGroup() const { return countPopulation(ResourceMask) > 1; }
 
   bool containsResource(uint64_t ID) const { return ResourceMask & ID; }
 
@@ -228,7 +225,7 @@ public:
   }
 
   unsigned getNumUnits() const {
-    return isAResourceGroup() ? 1U : llvm::countPopulation(ResourceSizeMask);
+    return isAResourceGroup() ? 1U : countPopulation(ResourceSizeMask);
   }
 
   /// Checks if there is an available slot in the resource buffer.
@@ -286,10 +283,10 @@ class ResourceManager {
 
   // Keeps track of which resources are busy, and how many cycles are left
   // before those become usable again.
-  llvm::SmallDenseMap<ResourceRef, unsigned> BusyResources;
+  SmallDenseMap<ResourceRef, unsigned> BusyResources;
 
   // A table to map processor resource IDs to processor resource masks.
-  llvm::SmallVector<uint64_t, 8> ProcResID2Mask;
+  SmallVector<uint64_t, 8> ProcResID2Mask;
 
   // Returns the actual resource unit that will be used.
   ResourceRef selectPipe(uint64_t ResourceID);
@@ -305,7 +302,7 @@ class ResourceManager {
                              uint64_t ResourceMask);
 
 public:
-  ResourceManager(const llvm::MCSchedModel &SM);
+  ResourceManager(const MCSchedModel &SM);
   virtual ~ResourceManager() = default;
 
   // Overrides the selection strategy for the resource at index ResourceID in
@@ -319,17 +316,17 @@ public:
 
   // Returns RS_BUFFER_AVAILABLE if buffered resources are not reserved, and if
   // there are enough available slots in the buffers.
-  ResourceStateEvent canBeDispatched(llvm::ArrayRef<uint64_t> Buffers) const;
+  ResourceStateEvent canBeDispatched(ArrayRef<uint64_t> Buffers) const;
 
   // Return the processor resource identifier associated to this Mask.
   unsigned resolveResourceMask(uint64_t Mask) const;
 
   // Consume a slot in every buffered resource from array 'Buffers'. Resource
   // units that are dispatch hazards (i.e. BufferSize=0) are marked as reserved.
-  void reserveBuffers(llvm::ArrayRef<uint64_t> Buffers);
+  void reserveBuffers(ArrayRef<uint64_t> Buffers);
 
   // Release buffer entries previously allocated by method reserveBuffers.
-  void releaseBuffers(llvm::ArrayRef<uint64_t> Buffers);
+  void releaseBuffers(ArrayRef<uint64_t> Buffers);
 
   // Reserve a processor resource. A reserved resource is not available for
   // instruction issue until it is released.
@@ -346,9 +343,9 @@ public:
 
   void issueInstruction(
       const InstrDesc &Desc,
-      llvm::SmallVectorImpl<std::pair<ResourceRef, ResourceCycles>> &Pipes);
+      SmallVectorImpl<std::pair<ResourceRef, ResourceCycles>> &Pipes);
 
-  void cycleEvent(llvm::SmallVectorImpl<ResourceRef> &ResourcesFreed);
+  void cycleEvent(SmallVectorImpl<ResourceRef> &ResourcesFreed);
 
 #ifndef NDEBUG
   void dump() const {
diff --git a/tools/llvm-mca/include/HardwareUnits/RetireControlUnit.h b/tools/llvm-mca/include/HardwareUnits/RetireControlUnit.h
index 2f7a1b1d503..12e0a1fba13 100644
--- a/tools/llvm-mca/include/HardwareUnits/RetireControlUnit.h
+++ b/tools/llvm-mca/include/HardwareUnits/RetireControlUnit.h
@@ -63,7 +63,7 @@ private:
   std::vector<RUToken> Queue;
 
 public:
-  RetireControlUnit(const llvm::MCSchedModel &SM);
+  RetireControlUnit(const MCSchedModel &SM);
 
   bool isEmpty() const { return AvailableSlots == Queue.size(); }
   bool isAvailable(unsigned Quantity = 1) const {
diff --git a/tools/llvm-mca/include/HardwareUnits/Scheduler.h b/tools/llvm-mca/include/HardwareUnits/Scheduler.h
index 941224c1204..17332b430d2 100644
--- a/tools/llvm-mca/include/HardwareUnits/Scheduler.h
+++ b/tools/llvm-mca/include/HardwareUnits/Scheduler.h
@@ -105,25 +105,25 @@ class Scheduler : public HardwareUnit {
   /// Issue an instruction without updating the ready queue.
   void issueInstructionImpl(
       InstRef &IR,
-      llvm::SmallVectorImpl<std::pair<ResourceRef, ResourceCycles>> &Pipes);
+      SmallVectorImpl<std::pair<ResourceRef, ResourceCycles>> &Pipes);
 
   // Identify instructions that have finished executing, and remove them from
   // the IssuedSet. References to executed instructions are added to input
   // vector 'Executed'.
-  void updateIssuedSet(llvm::SmallVectorImpl<InstRef> &Executed);
+  void updateIssuedSet(SmallVectorImpl<InstRef> &Executed);
 
   // Try to promote instructions from WaitSet to ReadySet.
   // Add promoted instructions to the 'Ready' vector in input.
-  void promoteToReadySet(llvm::SmallVectorImpl<InstRef> &Ready);
+  void promoteToReadySet(SmallVectorImpl<InstRef> &Ready);
 
 public:
-  Scheduler(const llvm::MCSchedModel &Model, LSUnit *Lsu)
-      : LSU(Lsu), Resources(llvm::make_unique<ResourceManager>(Model)) {
+  Scheduler(const MCSchedModel &Model, LSUnit *Lsu)
+      : LSU(Lsu), Resources(make_unique<ResourceManager>(Model)) {
     initializeStrategy(nullptr);
   }
-  Scheduler(const llvm::MCSchedModel &Model, LSUnit *Lsu,
+  Scheduler(const MCSchedModel &Model, LSUnit *Lsu,
             std::unique_ptr<SchedulerStrategy> SelectStrategy)
-      : LSU(Lsu), Resources(llvm::make_unique<ResourceManager>(Model)) {
+      : LSU(Lsu), Resources(make_unique<ResourceManager>(Model)) {
     initializeStrategy(std::move(SelectStrategy));
   }
   Scheduler(std::unique_ptr<ResourceManager> RM, LSUnit *Lsu,
@@ -168,8 +168,8 @@ public:
   /// result of this event.
   void issueInstruction(
       InstRef &IR,
-      llvm::SmallVectorImpl<std::pair<ResourceRef, ResourceCycles>> &Used,
-      llvm::SmallVectorImpl<InstRef> &Ready);
+      SmallVectorImpl<std::pair<ResourceRef, ResourceCycles>> &Used,
+      SmallVectorImpl<InstRef> &Ready);
 
   /// Returns true if IR has to be issued immediately, or if IR is a zero
   /// latency instruction.
@@ -182,9 +182,9 @@ public:
   /// have changed in state, and that are now available to new instructions.
   /// Instructions executed are added to vector Executed, while vector Ready is
   /// populated with instructions that have become ready in this new cycle.
-  void cycleEvent(llvm::SmallVectorImpl<ResourceRef> &Freed,
-                  llvm::SmallVectorImpl<InstRef> &Ready,
-                  llvm::SmallVectorImpl<InstRef> &Executed);
+  void cycleEvent(SmallVectorImpl<ResourceRef> &Freed,
+                  SmallVectorImpl<InstRef> &Ready,
+                  SmallVectorImpl<InstRef> &Executed);
 
   /// Convert a resource mask into a valid llvm processor resource identifier.
   unsigned getResourceID(uint64_t Mask) const {
@@ -203,9 +203,9 @@ public:
   // This routine performs a sanity check.  This routine should only be called
   // when we know that 'IR' is not in the scheduler's instruction queues.
   void sanityCheck(const InstRef &IR) const {
-    assert(llvm::find(WaitSet, IR) == WaitSet.end());
-    assert(llvm::find(ReadySet, IR) == ReadySet.end());
-    assert(llvm::find(IssuedSet, IR) == IssuedSet.end());
+    assert(find(WaitSet, IR) == WaitSet.end() && "Already in the wait set!");
+    assert(find(ReadySet, IR) == ReadySet.end() && "Already in the ready set!");
+    assert(find(IssuedSet, IR) == IssuedSet.end() && "Already executing!");
   }
 #endif // !NDEBUG
 };
diff --git a/tools/llvm-mca/include/InstrBuilder.h b/tools/llvm-mca/include/InstrBuilder.h
index ca615c053c8..67aa889cf7b 100644
--- a/tools/llvm-mca/include/InstrBuilder.h
+++ b/tools/llvm-mca/include/InstrBuilder.h
@@ -37,39 +37,32 @@ namespace mca {
 /// Information from the machine scheduling model is used to identify processor
 /// resources that are consumed by an instruction.
 class InstrBuilder {
-  const llvm::MCSubtargetInfo &STI;
-  const llvm::MCInstrInfo &MCII;
-  const llvm::MCRegisterInfo &MRI;
-  const llvm::MCInstrAnalysis &MCIA;
-  llvm::SmallVector<uint64_t, 8> ProcResourceMasks;
+  const MCSubtargetInfo &STI;
+  const MCInstrInfo &MCII;
+  const MCRegisterInfo &MRI;
+  const MCInstrAnalysis &MCIA;
+  SmallVector<uint64_t, 8> ProcResourceMasks;
 
-  llvm::DenseMap<unsigned short, std::unique_ptr<const InstrDesc>> Descriptors;
-  llvm::DenseMap<const llvm::MCInst *, std::unique_ptr<const InstrDesc>>
-      VariantDescriptors;
+  DenseMap<unsigned short, std::unique_ptr<const InstrDesc>> Descriptors;
+  DenseMap<const MCInst *, std::unique_ptr<const InstrDesc>> VariantDescriptors;
 
-  llvm::Expected<const InstrDesc &>
-  createInstrDescImpl(const llvm::MCInst &MCI);
-  llvm::Expected<const InstrDesc &>
-  getOrCreateInstrDesc(const llvm::MCInst &MCI);
+  Expected<const InstrDesc &> createInstrDescImpl(const MCInst &MCI);
+  Expected<const InstrDesc &> getOrCreateInstrDesc(const MCInst &MCI);
 
   InstrBuilder(const InstrBuilder &) = delete;
   InstrBuilder &operator=(const InstrBuilder &) = delete;
 
-  llvm::Error populateWrites(InstrDesc &ID, const llvm::MCInst &MCI,
-                             unsigned SchedClassID);
-  llvm::Error populateReads(InstrDesc &ID, const llvm::MCInst &MCI,
-                            unsigned SchedClassID);
-  llvm::Error verifyInstrDesc(const InstrDesc &ID,
-                              const llvm::MCInst &MCI) const;
+  Error populateWrites(InstrDesc &ID, const MCInst &MCI, unsigned SchedClassID);
+  Error populateReads(InstrDesc &ID, const MCInst &MCI, unsigned SchedClassID);
+  Error verifyInstrDesc(const InstrDesc &ID, const MCInst &MCI) const;
 
 public:
-  InstrBuilder(const llvm::MCSubtargetInfo &STI, const llvm::MCInstrInfo &MCII,
-               const llvm::MCRegisterInfo &RI, const llvm::MCInstrAnalysis &IA);
+  InstrBuilder(const MCSubtargetInfo &STI, const MCInstrInfo &MCII,
+               const MCRegisterInfo &RI, const MCInstrAnalysis &IA);
 
   void clear() { VariantDescriptors.shrink_and_clear(); }
 
-  llvm::Expected<std::unique_ptr<Instruction>>
-  createInstruction(const llvm::MCInst &MCI);
+  Expected<std::unique_ptr<Instruction>> createInstruction(const MCInst &MCI);
 };
 } // namespace mca
 } // namespace llvm
diff --git a/tools/llvm-mca/include/Instruction.h b/tools/llvm-mca/include/Instruction.h
index 8509af2e0ff..f83be1ff4bb 100644
--- a/tools/llvm-mca/include/Instruction.h
+++ b/tools/llvm-mca/include/Instruction.h
@@ -329,26 +329,26 @@ class InstructionBase {
 
   // Output dependencies.
   // One entry per each implicit and explicit register definition.
-  llvm::SmallVector<WriteState, 4> Defs;
+  SmallVector<WriteState, 4> Defs;
 
   // Input dependencies.
   // One entry per each implicit and explicit register use.
-  llvm::SmallVector<ReadState, 4> Uses;
+  SmallVector<ReadState, 4> Uses;
 
 public:
   InstructionBase(const InstrDesc &D) : Desc(D), IsOptimizableMove(false) {}
 
-  llvm::SmallVectorImpl<WriteState> &getDefs() { return Defs; }
-  const llvm::ArrayRef<WriteState> getDefs() const { return Defs; }
-  llvm::SmallVectorImpl<ReadState> &getUses() { return Uses; }
-  const llvm::ArrayRef<ReadState> getUses() const { return Uses; }
+  SmallVectorImpl<WriteState> &getDefs() { return Defs; }
+  const ArrayRef<WriteState> getDefs() const { return Defs; }
+  SmallVectorImpl<ReadState> &getUses() { return Uses; }
+  const ArrayRef<ReadState> getUses() const { return Uses; }
   const InstrDesc &getDesc() const { return Desc; }
 
   unsigned getLatency() const { return Desc.MaxLatency; }
 
   bool hasDependentUsers() const {
-    return llvm::any_of(
-        Defs, [](const WriteState &Def) { return Def.getNumUsers() > 0; });
+    return any_of(Defs,
+                  [](const WriteState &Def) { return Def.getNumUsers() > 0; });
   }
 
   unsigned getNumUsers() const {
@@ -420,8 +420,8 @@ public:
 
   bool isEliminated() const {
     return isReady() && getDefs().size() &&
-           llvm::all_of(getDefs(),
-                        [](const WriteState &W) { return W.isEliminated(); });
+           all_of(getDefs(),
+                  [](const WriteState &W) { return W.isEliminated(); });
   }
 
   // Forces a transition from state IS_AVAILABLE to state IS_EXECUTED.
@@ -458,12 +458,12 @@ public:
   void invalidate() { Data.second = nullptr; }
 
 #ifndef NDEBUG
-  void print(llvm::raw_ostream &OS) const { OS << getSourceIndex(); }
+  void print(raw_ostream &OS) const { OS << getSourceIndex(); }
 #endif
 };
 
 #ifndef NDEBUG
-inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const InstRef &IR) {
+inline raw_ostream &operator<<(raw_ostream &OS, const InstRef &IR) {
   IR.print(OS);
   return OS;
 }
diff --git a/tools/llvm-mca/include/Pipeline.h b/tools/llvm-mca/include/Pipeline.h
index cb58e9a1fbd..47ff07b2882 100644
--- a/tools/llvm-mca/include/Pipeline.h
+++ b/tools/llvm-mca/include/Pipeline.h
@@ -55,11 +55,11 @@ class Pipeline {
   Pipeline &operator=(const Pipeline &P) = delete;
 
   /// An ordered list of stages that define this instruction pipeline.
-  llvm::SmallVector<std::unique_ptr<Stage>, 8> Stages;
+  SmallVector<std::unique_ptr<Stage>, 8> Stages;
   std::set<HWEventListener *> Listeners;
   unsigned Cycles;
 
-  llvm::Error runCycle();
+  Error runCycle();
   bool hasWorkToProcess();
   void notifyCycleBegin();
   void notifyCycleEnd();
@@ -67,7 +67,7 @@ class Pipeline {
 public:
   Pipeline() : Cycles(0) {}
   void appendStage(std::unique_ptr<Stage> S);
-  llvm::Error run();
+  Error run();
   void addEventListener(HWEventListener *Listener);
 };
 } // namespace mca
diff --git a/tools/llvm-mca/include/SourceMgr.h b/tools/llvm-mca/include/SourceMgr.h
index 4a55bdba5b4..e5180107011 100644
--- a/tools/llvm-mca/include/SourceMgr.h
+++ b/tools/llvm-mca/include/SourceMgr.h
@@ -27,13 +27,13 @@ typedef std::pair<unsigned, const Instruction &> SourceRef;
 
 class SourceMgr {
   using UniqueInst = std::unique_ptr<Instruction>;
-  llvm::ArrayRef<UniqueInst> Sequence;
+  ArrayRef<UniqueInst> Sequence;
   unsigned Current;
   const unsigned Iterations;
   static const unsigned DefaultIterations = 100;
 
 public:
-  SourceMgr(llvm::ArrayRef<UniqueInst> S, unsigned Iter)
+  SourceMgr(ArrayRef<UniqueInst> S, unsigned Iter)
       : Sequence(S), Current(0), Iterations(Iter ? Iter : DefaultIterations) {}
 
   unsigned getNumIterations() const { return Iterations; }
@@ -46,7 +46,7 @@ public:
     return SourceRef(Current, *Sequence[Current % Sequence.size()]);
   }
 
-  using const_iterator = llvm::ArrayRef<UniqueInst>::const_iterator;
+  using const_iterator = ArrayRef<UniqueInst>::const_iterator;
   const_iterator begin() const { return Sequence.begin(); }
   const_iterator end() const { return Sequence.end(); }
 };
diff --git a/tools/llvm-mca/include/Stages/DispatchStage.h b/tools/llvm-mca/include/Stages/DispatchStage.h
index 0153c649b42..3595f3122cc 100644
--- a/tools/llvm-mca/include/Stages/DispatchStage.h
+++ b/tools/llvm-mca/include/Stages/DispatchStage.h
@@ -53,30 +53,29 @@ class DispatchStage final : public Stage {
   unsigned AvailableEntries;
   unsigned CarryOver;
   InstRef CarriedOver;
-  const llvm::MCSubtargetInfo &STI;
+  const MCSubtargetInfo &STI;
   RetireControlUnit &RCU;
   RegisterFile &PRF;
 
   bool checkRCU(const InstRef &IR) const;
   bool checkPRF(const InstRef &IR) const;
   bool canDispatch(const InstRef &IR) const;
-  llvm::Error dispatch(InstRef IR);
+  Error dispatch(InstRef IR);
 
-  void updateRAWDependencies(ReadState &RS, const llvm::MCSubtargetInfo &STI);
+  void updateRAWDependencies(ReadState &RS, const MCSubtargetInfo &STI);
 
   void notifyInstructionDispatched(const InstRef &IR,
-                                   llvm::ArrayRef<unsigned> UsedPhysRegs,
+                                   ArrayRef<unsigned> UsedPhysRegs,
                                    unsigned uOps) const;
 
-  void collectWrites(llvm::SmallVectorImpl<WriteRef> &Vec,
-                     unsigned RegID) const {
+  void collectWrites(SmallVectorImpl<WriteRef> &Vec, unsigned RegID) const {
     return PRF.collectWrites(Vec, RegID);
   }
 
 public:
-  DispatchStage(const llvm::MCSubtargetInfo &Subtarget,
-                const llvm::MCRegisterInfo &MRI, unsigned MaxDispatchWidth,
-                RetireControlUnit &R, RegisterFile &F)
+  DispatchStage(const MCSubtargetInfo &Subtarget, const MCRegisterInfo &MRI,
+                unsigned MaxDispatchWidth, RetireControlUnit &R,
+                RegisterFile &F)
       : DispatchWidth(MaxDispatchWidth), AvailableEntries(MaxDispatchWidth),
         CarryOver(0U), CarriedOver(), STI(Subtarget), RCU(R), PRF(F) {}
 
@@ -85,8 +84,8 @@ public:
   // The dispatch logic internally doesn't buffer instructions. So there is
   // never work to do at the beginning of every cycle.
   bool hasWorkToComplete() const override { return false; }
-  llvm::Error cycleStart() override;
-  llvm::Error execute(InstRef &IR) override;
+  Error cycleStart() override;
+  Error execute(InstRef &IR) override;
 
 #ifndef NDEBUG
   void dump() const;
diff --git a/tools/llvm-mca/include/Stages/ExecuteStage.h b/tools/llvm-mca/include/Stages/ExecuteStage.h
index 0f46c8a3878..91b24059c95 100644
--- a/tools/llvm-mca/include/Stages/ExecuteStage.h
+++ b/tools/llvm-mca/include/Stages/ExecuteStage.h
@@ -29,14 +29,14 @@ namespace mca {
 class ExecuteStage final : public Stage {
   Scheduler &HWS;
 
-  llvm::Error issueInstruction(InstRef &IR);
+  Error issueInstruction(InstRef &IR);
 
   // Called at the beginning of each cycle to issue already dispatched
   // instructions to the underlying pipelines.
-  llvm::Error issueReadyInstructions();
+  Error issueReadyInstructions();
 
   // Used to notify instructions eliminated at register renaming stage.
-  llvm::Error handleInstructionEliminated(InstRef &IR);
+  Error handleInstructionEliminated(InstRef &IR);
 
   ExecuteStage(const ExecuteStage &Other) = delete;
   ExecuteStage &operator=(const ExecuteStage &Other) = delete;
@@ -60,12 +60,12 @@ public:
   // state changes, and processor resources freed by the scheduler.
   // Instructions that transitioned to the 'Executed' state are automatically
   // moved to the next stage (i.e. RetireStage).
-  llvm::Error cycleStart() override;
-  llvm::Error execute(InstRef &IR) override;
+  Error cycleStart() override;
+  Error execute(InstRef &IR) override;
 
   void notifyInstructionIssued(
       const InstRef &IR,
-      llvm::ArrayRef<std::pair<ResourceRef, ResourceCycles>> Used) const;
+      ArrayRef<std::pair<ResourceRef, ResourceCycles>> Used) const;
   void notifyInstructionExecuted(const InstRef &IR) const;
   void notifyInstructionReady(const InstRef &IR) const;
   void notifyResourceAvailable(const ResourceRef &RR) const;
diff --git a/tools/llvm-mca/include/Stages/FetchStage.h b/tools/llvm-mca/include/Stages/FetchStage.h
index 8622ab07e9e..55bf2011b32 100644
--- a/tools/llvm-mca/include/Stages/FetchStage.h
+++ b/tools/llvm-mca/include/Stages/FetchStage.h
@@ -40,9 +40,9 @@ public:
 
   bool isAvailable(const InstRef &IR) const override;
   bool hasWorkToComplete() const override;
-  llvm::Error execute(InstRef &IR) override;
-  llvm::Error cycleStart() override;
-  llvm::Error cycleEnd() override;
+  Error execute(InstRef &IR) override;
+  Error cycleStart() override;
+  Error cycleEnd() override;
 };
 
 } // namespace mca
diff --git a/tools/llvm-mca/include/Stages/InstructionTables.h b/tools/llvm-mca/include/Stages/InstructionTables.h
index 2b6e542d973..e618d06b1b7 100644
--- a/tools/llvm-mca/include/Stages/InstructionTables.h
+++ b/tools/llvm-mca/include/Stages/InstructionTables.h
@@ -27,17 +27,17 @@ namespace llvm {
 namespace mca {
 
 class InstructionTables final : public Stage {
-  const llvm::MCSchedModel &SM;
-  llvm::SmallVector<std::pair<ResourceRef, ResourceCycles>, 4> UsedResources;
-  llvm::SmallVector<uint64_t, 8> Masks;
+  const MCSchedModel &SM;
+  SmallVector<std::pair<ResourceRef, ResourceCycles>, 4> UsedResources;
+  SmallVector<uint64_t, 8> Masks;
 
 public:
-  InstructionTables(const llvm::MCSchedModel &Model) : Stage(), SM(Model) {
+  InstructionTables(const MCSchedModel &Model) : Stage(), SM(Model) {
     computeProcResourceMasks(Model, Masks);
   }
 
   bool hasWorkToComplete() const override { return false; }
-  llvm::Error execute(InstRef &IR) override;
+  Error execute(InstRef &IR) override;
 };
 } // namespace mca
 } // namespace llvm
diff --git a/tools/llvm-mca/include/Stages/RetireStage.h b/tools/llvm-mca/include/Stages/RetireStage.h
index e9975ca3bbd..28eda40984f 100644
--- a/tools/llvm-mca/include/Stages/RetireStage.h
+++ b/tools/llvm-mca/include/Stages/RetireStage.h
@@ -37,8 +37,8 @@ public:
       : Stage(), RCU(R), PRF(F) {}
 
   bool hasWorkToComplete() const override { return !RCU.isEmpty(); }
-  llvm::Error cycleStart() override;
-  llvm::Error execute(InstRef &IR) override;
+  Error cycleStart() override;
+  Error execute(InstRef &IR) override;
   void notifyInstructionRetired(const InstRef &IR) const;
 };
 
diff --git a/tools/llvm-mca/include/Stages/Stage.h b/tools/llvm-mca/include/Stages/Stage.h
index 383abbe217e..5665fc453bf 100644
--- a/tools/llvm-mca/include/Stages/Stage.h
+++ b/tools/llvm-mca/include/Stages/Stage.h
@@ -47,13 +47,13 @@ public:
 
   /// Called once at the start of each cycle.  This can be used as a setup
   /// phase to prepare for the executions during the cycle.
-  virtual llvm::Error cycleStart() { return llvm::ErrorSuccess(); }
+  virtual Error cycleStart() { return ErrorSuccess(); }
 
   /// Called once at the end of each cycle.
-  virtual llvm::Error cycleEnd() { return llvm::ErrorSuccess(); }
+  virtual Error cycleEnd() { return ErrorSuccess(); }
 
   /// The primary action that this stage performs on instruction IR.
-  virtual llvm::Error execute(InstRef &IR) = 0;
+  virtual Error execute(InstRef &IR) = 0;
 
   void setNextInSequence(Stage *NextStage) {
     assert(!NextInSequence && "This stage already has a NextInSequence!");
@@ -68,7 +68,7 @@ public:
   ///
   /// Stages are responsible for moving instructions to their immediate
   /// successor stages.
-  llvm::Error moveToTheNextStage(InstRef &IR) {
+  Error moveToTheNextStage(InstRef &IR) {
     assert(checkNextStage(IR) && "Next stage is not ready!");
     return NextInSequence->execute(IR);
   }
diff --git a/tools/llvm-mca/include/Support.h b/tools/llvm-mca/include/Support.h
index 43fb72c0229..e7a4e33ed74 100644
--- a/tools/llvm-mca/include/Support.h
+++ b/tools/llvm-mca/include/Support.h
@@ -24,7 +24,7 @@ namespace llvm {
 namespace mca {
 
 template <typename T>
-class InstructionError : public llvm::ErrorInfo<InstructionError<T>> {
+class InstructionError : public ErrorInfo<InstructionError<T>> {
 public:
   static char ID;
   std::string Message;
@@ -33,10 +33,10 @@ public:
   InstructionError(std::string M, const T &MCI)
       : Message(std::move(M)), Inst(MCI) {}
 
-  void log(llvm::raw_ostream &OS) const override { OS << Message; }
+  void log(raw_ostream &OS) const override { OS << Message; }
 
   std::error_code convertToErrorCode() const override {
-    return llvm::inconvertibleErrorCode();
+    return inconvertibleErrorCode();
   }
 };
 
@@ -70,8 +70,7 @@ public:
     else {
       // Create a common denominator for LHS and RHS by calculating the least
       // common multiple from the GCD.
-      unsigned GCD =
-          llvm::GreatestCommonDivisor64(Denominator, RHS.Denominator);
+      unsigned GCD = GreatestCommonDivisor64(Denominator, RHS.Denominator);
       unsigned LCM = (Denominator * RHS.Denominator) / GCD;
       unsigned LHSNumerator = Numerator * (LCM / Denominator);
       unsigned RHSNumerator = RHS.Numerator * (LCM / RHS.Denominator);
@@ -104,16 +103,16 @@ public:
 ///
 /// Resource masks are used by the ResourceManager to solve set membership
 /// problems with simple bit manipulation operations.
-void computeProcResourceMasks(const llvm::MCSchedModel &SM,
-                              llvm::SmallVectorImpl<uint64_t> &Masks);
+void computeProcResourceMasks(const MCSchedModel &SM,
+                              SmallVectorImpl<uint64_t> &Masks);
 
 /// Compute the reciprocal block throughput from a set of processor resource
 /// cycles. The reciprocal block throughput is computed as the MAX between:
 ///  - NumMicroOps / DispatchWidth
 ///  - ProcResourceCycles / #ProcResourceUnits  (for every consumed resource).
-double computeBlockRThroughput(const llvm::MCSchedModel &SM,
-                               unsigned DispatchWidth, unsigned NumMicroOps,
-                               llvm::ArrayRef<unsigned> ProcResourceUsage);
+double computeBlockRThroughput(const MCSchedModel &SM, unsigned DispatchWidth,
+                               unsigned NumMicroOps,
+                               ArrayRef<unsigned> ProcResourceUsage);
 } // namespace mca
 } // namespace llvm
 
-- 
GitLab


From aaf702f58f4432a6622cb8a8903570687d8cf485 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <kparzysz@codeaurora.org>
Date: Wed, 31 Oct 2018 15:54:31 +0000
Subject: [PATCH 0807/1116] [Hexagon] Make sure not to use GP-relative
 addressing with PIC

Make sure that -relocation-model=pic prevents use of GP-relative
addressing modes.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345731 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../Hexagon/HexagonSplitConst32AndConst64.cpp |  2 +-
 .../Hexagon/HexagonTargetObjectFile.cpp       | 10 ++++-
 lib/Target/Hexagon/HexagonTargetObjectFile.h  |  2 +-
 test/CodeGen/Hexagon/pic-sdata.ll             | 37 +++++++++++++++++++
 4 files changed, 47 insertions(+), 4 deletions(-)
 create mode 100644 test/CodeGen/Hexagon/pic-sdata.ll

diff --git a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
index c41f0d3c085..55de2512094 100644
--- a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
+++ b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
@@ -63,7 +63,7 @@ bool HexagonSplitConst32AndConst64::runOnMachineFunction(MachineFunction &Fn) {
   auto &HST = Fn.getSubtarget<HexagonSubtarget>();
   auto &HTM = static_cast<const HexagonTargetMachine&>(Fn.getTarget());
   auto &TLOF = *HTM.getObjFileLowering();
-  if (HST.useSmallData() && TLOF.isSmallDataEnabled())
+  if (HST.useSmallData() && TLOF.isSmallDataEnabled(HTM))
     return false;
 
   const TargetInstrInfo *TII = HST.getInstrInfo();
diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
index e771f383dff..386cd14c827 100644
--- a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
+++ b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
@@ -199,6 +199,11 @@ MCSection *HexagonTargetObjectFile::getExplicitSectionGlobal(
 /// section.
 bool HexagonTargetObjectFile::isGlobalInSmallSection(const GlobalObject *GO,
       const TargetMachine &TM) const {
+  if (!isSmallDataEnabled(TM)) {
+    LLVM_DEBUG(dbgs() << "Small data is not available.\n");
+    return false;
+  }
+
   // Only global variables, not functions.
   LLVM_DEBUG(dbgs() << "Checking if value is in small-data, -G"
                     << SmallDataThreshold << ": \"" << GO->getName() << "\": ");
@@ -263,8 +268,9 @@ bool HexagonTargetObjectFile::isGlobalInSmallSection(const GlobalObject *GO,
   return true;
 }
 
-bool HexagonTargetObjectFile::isSmallDataEnabled() const {
-  return SmallDataThreshold > 0;
+bool HexagonTargetObjectFile::isSmallDataEnabled(const TargetMachine &TM)
+    const {
+  return SmallDataThreshold > 0 && !TM.isPositionIndependent();
 }
 
 unsigned HexagonTargetObjectFile::getSmallDataSize() const {
diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.h b/lib/Target/Hexagon/HexagonTargetObjectFile.h
index eff44f097e0..18863630fde 100644
--- a/lib/Target/Hexagon/HexagonTargetObjectFile.h
+++ b/lib/Target/Hexagon/HexagonTargetObjectFile.h
@@ -29,7 +29,7 @@ namespace llvm {
     bool isGlobalInSmallSection(const GlobalObject *GO,
                                 const TargetMachine &TM) const;
 
-    bool isSmallDataEnabled() const;
+    bool isSmallDataEnabled(const TargetMachine &TM) const;
 
     unsigned getSmallDataSize() const;
 
diff --git a/test/CodeGen/Hexagon/pic-sdata.ll b/test/CodeGen/Hexagon/pic-sdata.ll
new file mode 100644
index 00000000000..3e4dc2dc93e
--- /dev/null
+++ b/test/CodeGen/Hexagon/pic-sdata.ll
@@ -0,0 +1,37 @@
+; RUN: llc -march=hexagon -hexagon-small-data-threshold=8 -relocation-model=static < %s | FileCheck --check-prefixes=CHECK,STATIC %s
+; RUN: llc -march=hexagon -hexagon-small-data-threshold=8 -relocation-model=pic < %s | FileCheck --check-prefixes=CHECK,PIC %s
+
+; If a global has a specified section, it should probably be placed in that
+; section, but with PIC any accesses to globals in small data should still
+; go through GOT.
+
+@g0 = global i32 zeroinitializer
+@g1 = global i32 zeroinitializer, section ".sdata"
+
+; CHECK-LABEL: f0:
+; STATIC: memw(gp+#g0)
+; PIC: r[[R0:[0-9]+]] = add(pc,##_GLOBAL_OFFSET_TABLE_@PCREL)
+; PIC: = memw(r[[R0]]+##g0@GOT)
+define i32 @f0() #0 {
+  %v0 = load i32, i32* @g0
+  ret i32 %v0
+}
+
+; CHECK-LABEL: f1:
+; STATIC: memw(gp+#g1)
+; PIC: r[[R1:[0-9]+]] = add(pc,##_GLOBAL_OFFSET_TABLE_@PCREL)
+; PIC: = memw(r[[R1]]+##g1@GOT)
+define i32 @f1() #0 {
+  %v0 = load i32, i32* @g1
+  ret i32 %v0
+}
+
+; CHECK-LABEL: f2:
+; STATIC: CONST64(#123456789012345678)
+; PIC: r0 = ##-1506741426
+; PIC: r1 = ##28744523
+define i64 @f2() #0 {
+  ret i64 123456789012345678
+}
+
+attributes #0 = { nounwind "target-cpu"="hexagonv60" }
-- 
GitLab


From 22968f72bdd327050228e70780d6d08848c5d576 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Wed, 31 Oct 2018 16:34:43 +0000
Subject: [PATCH 0808/1116] [InstCombine] refactor fabs+fcmp fold; NFC

Also, remove/replace/minimize/enhance the tests for this fold.
The code drops FMF, so it needs more tests and at least 1 fix.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345734 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstCombine/InstCombineCompares.cpp       |  84 +++----
 test/Transforms/InstCombine/fcmp.ll           | 220 +++++-------------
 2 files changed, 105 insertions(+), 199 deletions(-)

diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 1ad648fe783..9155ad12598 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -5288,6 +5288,46 @@ static Instruction *foldFCmpReciprocalAndZero(FCmpInst &I, Instruction *LHSI,
   return NewFCI;
 }
 
+/// Optimize fabs(X) compared with zero.
+static Instruction *foldFabsWithFcmpZero(FCmpInst &I) {
+  Value *X;
+  if (!match(I.getOperand(0), m_Intrinsic<Intrinsic::fabs>(m_Value(X))) ||
+      !match(I.getOperand(1), m_PosZeroFP()))
+    return nullptr;
+
+  switch (I.getPredicate()) {
+  case FCmpInst::FCMP_UGE:
+  case FCmpInst::FCMP_OLT:
+    // fabs(X) >= 0.0 --> true
+    // fabs(X) <  0.0 --> false
+    llvm_unreachable("fcmp should have simplified");
+
+  case FCmpInst::FCMP_OGT:
+    // fabs(X) > 0.0 --> X != 0.0
+    return new FCmpInst(FCmpInst::FCMP_ONE, X, I.getOperand(1));
+
+  case FCmpInst::FCMP_OLE:
+    // fabs(X) <= 0.0 --> X == 0.0
+    return new FCmpInst(FCmpInst::FCMP_OEQ, X, I.getOperand(1));
+
+  case FCmpInst::FCMP_OGE:
+    // fabs(X) >= 0.0 --> !isnan(X)
+    assert(!I.hasNoNaNs() && "fcmp should have simplified");
+    return new FCmpInst(FCmpInst::FCMP_ORD, X, I.getOperand(1));
+
+  case FCmpInst::FCMP_OEQ:
+  case FCmpInst::FCMP_UEQ:
+  case FCmpInst::FCMP_ONE:
+  case FCmpInst::FCMP_UNE:
+    // fabs(X) == 0.0 --> X == 0.0
+    // fabs(X) != 0.0 --> X != 0.0
+    return new FCmpInst(I.getPredicate(), X, I.getOperand(1));
+
+  default:
+    return nullptr;
+  }
+}
+
 Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
   bool Changed = false;
 
@@ -5418,45 +5458,11 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
             if (Instruction *Res = foldCmpLoadFromIndexedGlobal(GEP, GV, I))
               return Res;
       break;
-    case Instruction::Call: {
-      if (!RHSC->isNullValue())
-        break;
-
-      CallInst *CI = cast<CallInst>(LHSI);
-      Intrinsic::ID IID = getIntrinsicForCallSite(CI, &TLI);
-      if (IID != Intrinsic::fabs)
-        break;
-
-      // Various optimization for fabs compared with zero.
-      switch (Pred) {
-      default:
-        break;
-      case FCmpInst::FCMP_UGE:
-      case FCmpInst::FCMP_OLT:
-        // fabs(x) >= 0.0 --> true
-        // fabs(x) <  0.0 --> false
-        llvm_unreachable("fcmp should have simplified");
-
-      // fabs(x) > 0 --> x != 0
-      case FCmpInst::FCMP_OGT:
-        return new FCmpInst(FCmpInst::FCMP_ONE, CI->getArgOperand(0), RHSC);
-      // fabs(x) <= 0 --> x == 0
-      case FCmpInst::FCMP_OLE:
-        return new FCmpInst(FCmpInst::FCMP_OEQ, CI->getArgOperand(0), RHSC);
-      // fabs(x) >= 0 --> !isnan(x)
-      case FCmpInst::FCMP_OGE:
-        assert(!I.hasNoNaNs() && "fcmp should have simplified");
-        return new FCmpInst(FCmpInst::FCMP_ORD, CI->getArgOperand(0), RHSC);
-      // fabs(x) == 0 --> x == 0
-      // fabs(x) != 0 --> x != 0
-      case FCmpInst::FCMP_OEQ:
-      case FCmpInst::FCMP_UEQ:
-      case FCmpInst::FCMP_ONE:
-      case FCmpInst::FCMP_UNE:
-        return new FCmpInst(Pred, CI->getArgOperand(0), RHSC);
-      }
-    }
-    }
+    case Instruction::Call:
+      if (Instruction *X = foldFabsWithFcmpZero(I))
+        return X;
+      break;
+  }
   }
 
   // fcmp pred (fneg x), (fneg y) -> fcmp swap(pred) x, y
diff --git a/test/Transforms/InstCombine/fcmp.ll b/test/Transforms/InstCombine/fcmp.ll
index ff47496abe9..c19aae4c03b 100644
--- a/test/Transforms/InstCombine/fcmp.ll
+++ b/test/Transforms/InstCombine/fcmp.ll
@@ -1,7 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -instcombine < %s | FileCheck %s
 
-declare double @llvm.fabs.f64(double) readnone
+declare half @llvm.fabs.f16(half)
+declare double @llvm.fabs.f64(double)
+declare <2 x float> @llvm.fabs.v2f32(<2 x float>)
 
 define i1 @test1(float %x, float %y) {
 ; CHECK-LABEL: @test1(
@@ -133,194 +135,92 @@ define float @test8(float %x) {
 ; Float comparison to zero shouldn't cast to double.
 }
 
-declare double @fabs(double) readnone
-
-define i32 @test9(double %a) {
-; CHECK-LABEL: @test9(
-; CHECK-NEXT:    ret i32 0
-;
-  %call = tail call double @fabs(double %a)
-  %cmp = fcmp olt double %call, 0.000000e+00
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
-}
-
-define i32 @test9_intrinsic(double %a) {
-; CHECK-LABEL: @test9_intrinsic(
-; CHECK-NEXT:    ret i32 0
-;
-  %call = tail call double @llvm.fabs.f64(double %a)
-  %cmp = fcmp olt double %call, 0.000000e+00
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
-}
-
-define i32 @test10(double %a) {
-; CHECK-LABEL: @test10(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq double [[A:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
-; CHECK-NEXT:    ret i32 [[CONV]]
+define i1 @fabs_uge(double %a) {
+; CHECK-LABEL: @fabs_uge(
+; CHECK-NEXT:    ret i1 true
 ;
-  %call = tail call double @fabs(double %a)
-  %cmp = fcmp ole double %call, 0.000000e+00
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
+  %call = call double @llvm.fabs.f64(double %a)
+  %cmp = fcmp uge double %call, 0.0
+  ret i1 %cmp
 }
 
-define i32 @test10_intrinsic(double %a) {
-; CHECK-LABEL: @test10_intrinsic(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq double [[A:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
-; CHECK-NEXT:    ret i32 [[CONV]]
+define i1 @fabs_olt(half %a) {
+; CHECK-LABEL: @fabs_olt(
+; CHECK-NEXT:    ret i1 false
 ;
-  %call = tail call double @llvm.fabs.f64(double %a)
-  %cmp = fcmp ole double %call, 0.000000e+00
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
+  %call = call half @llvm.fabs.f16(half %a)
+  %cmp = fcmp olt half %call, 0.0
+  ret i1 %cmp
 }
 
-define i32 @test11(double %a) {
-; CHECK-LABEL: @test11(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp one double [[A:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
-; CHECK-NEXT:    ret i32 [[CONV]]
+define <2 x i1> @fabs_ole(<2 x float> %a) {
+; CHECK-LABEL: @fabs_ole(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq <2 x float> [[A:%.*]], zeroinitializer
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
 ;
-  %call = tail call double @fabs(double %a)
-  %cmp = fcmp ogt double %call, 0.000000e+00
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
+  %call = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
+  %cmp = fcmp ole <2 x float> %call, zeroinitializer
+  ret <2 x i1> %cmp
 }
 
-define i32 @test11_intrinsic(double %a) {
-; CHECK-LABEL: @test11_intrinsic(
+define i1 @fabs_ogt(double %a) {
+; CHECK-LABEL: @fabs_ogt(
 ; CHECK-NEXT:    [[CMP:%.*]] = fcmp one double [[A:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
-; CHECK-NEXT:    ret i32 [[CONV]]
-;
-  %call = tail call double @llvm.fabs.f64(double %a)
-  %cmp = fcmp ogt double %call, 0.000000e+00
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
-}
-
-define i32 @test12(double %a) {
-; CHECK-LABEL: @test12(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ord double [[A:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
-; CHECK-NEXT:    ret i32 [[CONV]]
+; CHECK-NEXT:    ret i1 [[CMP]]
 ;
-  %call = tail call double @fabs(double %a)
-  %cmp = fcmp oge double %call, 0.000000e+00
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
+  %call = call double @llvm.fabs.f64(double %a)
+  %cmp = fcmp ogt double %call, 0.0
+  ret i1 %cmp
 }
 
-define i32 @test12_intrinsic(double %a) {
-; CHECK-LABEL: @test12_intrinsic(
+define i1 @fabs_oge(double %a) {
+; CHECK-LABEL: @fabs_oge(
 ; CHECK-NEXT:    [[CMP:%.*]] = fcmp ord double [[A:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
-; CHECK-NEXT:    ret i32 [[CONV]]
-;
-  %call = tail call double @llvm.fabs.f64(double %a)
-  %cmp = fcmp oge double %call, 0.000000e+00
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
-}
-
-define i32 @test13(double %a) {
-; CHECK-LABEL: @test13(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp une double [[A:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
-; CHECK-NEXT:    ret i32 [[CONV]]
-;
-  %call = tail call double @fabs(double %a)
-  %cmp = fcmp une double %call, 0.000000e+00
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
-}
-
-define i32 @test13_intrinsic(double %a) {
-; CHECK-LABEL: @test13_intrinsic(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp une double [[A:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
-; CHECK-NEXT:    ret i32 [[CONV]]
+; CHECK-NEXT:    ret i1 [[CMP]]
 ;
-  %call = tail call double @llvm.fabs.f64(double %a)
-  %cmp = fcmp une double %call, 0.000000e+00
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
+  %call = call double @llvm.fabs.f64(double %a)
+  %cmp = fcmp oge double %call, 0.0
+  ret i1 %cmp
 }
 
-define i32 @test14(double %a) {
-; CHECK-LABEL: @test14(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq double [[A:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
-; CHECK-NEXT:    ret i32 [[CONV]]
+define i1 @fabs_une(half %a) {
+; CHECK-LABEL: @fabs_une(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp une half [[A:%.*]], 0xH0000
+; CHECK-NEXT:    ret i1 [[CMP]]
 ;
-  %call = tail call double @fabs(double %a)
-  %cmp = fcmp oeq double %call, 0.000000e+00
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
+  %call = call half @llvm.fabs.f16(half %a)
+  %cmp = fcmp une half %call, 0.0
+  ret i1 %cmp
 }
 
-define i32 @test14_intrinsic(double %a) {
-; CHECK-LABEL: @test14_intrinsic(
+define i1 @fabs_oeq(double %a) {
+; CHECK-LABEL: @fabs_oeq(
 ; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq double [[A:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
-; CHECK-NEXT:    ret i32 [[CONV]]
-;
-  %call = tail call double @llvm.fabs.f64(double %a)
-  %cmp = fcmp oeq double %call, 0.000000e+00
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
-}
-
-define i32 @test15(double %a) {
-; CHECK-LABEL: @test15(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp one double [[A:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
-; CHECK-NEXT:    ret i32 [[CONV]]
+; CHECK-NEXT:    ret i1 [[CMP]]
 ;
-  %call = tail call double @fabs(double %a)
-  %cmp = fcmp one double %call, 0.000000e+00
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
+  %call = call double @llvm.fabs.f64(double %a)
+  %cmp = fcmp oeq double %call, 0.0
+  ret i1 %cmp
 }
 
-define i32 @test15_intrinsic(double %a) {
-; CHECK-LABEL: @test15_intrinsic(
+define i1 @fabs_one(double %a) {
+; CHECK-LABEL: @fabs_one(
 ; CHECK-NEXT:    [[CMP:%.*]] = fcmp one double [[A:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
-; CHECK-NEXT:    ret i32 [[CONV]]
-;
-  %call = tail call double @llvm.fabs.f64(double %a)
-  %cmp = fcmp one double %call, 0.000000e+00
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
-}
-
-define i32 @test16(double %a) {
-; CHECK-LABEL: @test16(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ueq double [[A:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
-; CHECK-NEXT:    ret i32 [[CONV]]
+; CHECK-NEXT:    ret i1 [[CMP]]
 ;
-  %call = tail call double @fabs(double %a)
-  %cmp = fcmp ueq double %call, 0.000000e+00
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
+  %call = call double @llvm.fabs.f64(double %a)
+  %cmp = fcmp one double %call, 0.0
+  ret i1 %cmp
 }
 
-define i32 @test16_intrinsic(double %a) {
-; CHECK-LABEL: @test16_intrinsic(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ueq double [[A:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
-; CHECK-NEXT:    ret i32 [[CONV]]
+define <2 x i1> @fabs_ueq(<2 x float> %a) {
+; CHECK-LABEL: @fabs_ueq(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ueq <2 x float> [[A:%.*]], zeroinitializer
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
 ;
-  %call = tail call double @llvm.fabs.f64(double %a)
-  %cmp = fcmp ueq double %call, 0.000000e+00
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
+  %call = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
+  %cmp = fcmp ueq <2 x float> %call, zeroinitializer
+  ret <2 x i1> %cmp
 }
 
 ; Don't crash.
-- 
GitLab


From 08b668a1c47135a9032d003f8b8bf94cff763f9f Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze@braunis.de>
Date: Wed, 31 Oct 2018 17:18:41 +0000
Subject: [PATCH 0809/1116] MachineModuleInfo: Initialize DbgInfoAvailable
 depending on debug_cus existing

Before this patch DbgInfoAvailable was set to true in
DwarfDebug::beginModule() or CodeViewDebug::CodeViewDebug(). This made
MIR testing weird since passes would suddenly stop dealing with debug
info just because we stopped the pipeline before the debug printers.

This patch changes the logic to initialize DbgInfoAvailable based on the
fact that debug_compile_units exist in the llvm Module. The debug
printers may then override it with false in case of debug printing being
disabled.

Differential Revision: https://reviews.llvm.org/D53885

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345740 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/AsmPrinter/CodeViewDebug.cpp |  1 +
 lib/CodeGen/AsmPrinter/DwarfDebug.cpp    |  7 +++++--
 lib/CodeGen/MachineModuleInfo.cpp        |  3 ++-
 test/CodeGen/AArch64/fast-isel-dbg.ll    | 26 ++++++++++++++++++++++++
 4 files changed, 34 insertions(+), 3 deletions(-)
 create mode 100644 test/CodeGen/AArch64/fast-isel-dbg.ll

diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
index 3b503b683a0..42259d4a62d 100644
--- a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
@@ -117,6 +117,7 @@ CodeViewDebug::CodeViewDebug(AsmPrinter *AP)
   if (!MMI->getModule()->getNamedMetadata("llvm.dbg.cu") ||
       !AP->getObjFileLowering().getCOFFDebugSymbolsSection()) {
     Asm = nullptr;
+    MMI->setDebugInfoAvailability(false);
     return;
   }
   // Tell MMI that we have debug info.
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 3a1e54812a1..2807734969a 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -701,15 +701,18 @@ sortGlobalExprs(SmallVectorImpl<DwarfCompileUnit::GlobalExpr> &GVEs) {
 void DwarfDebug::beginModule() {
   NamedRegionTimer T(DbgTimerName, DbgTimerDescription, DWARFGroupName,
                      DWARFGroupDescription, TimePassesIsEnabled);
-  if (DisableDebugInfoPrinting)
+  if (DisableDebugInfoPrinting) {
+    MMI->setDebugInfoAvailability(false);
     return;
+  }
 
   const Module *M = MMI->getModule();
 
   unsigned NumDebugCUs = std::distance(M->debug_compile_units_begin(),
                                        M->debug_compile_units_end());
   // Tell MMI whether we have debug info.
-  MMI->setDebugInfoAvailability(NumDebugCUs > 0);
+  assert(MMI->hasDebugInfo() == (NumDebugCUs > 0) &&
+         "DebugInfoAvailabilty initialized unexpectedly");
   SingleCU = NumDebugCUs == 1;
   DenseMap<DIGlobalVariable *, SmallVector<DwarfCompileUnit::GlobalExpr, 1>>
       GVMap;
diff --git a/lib/CodeGen/MachineModuleInfo.cpp b/lib/CodeGen/MachineModuleInfo.cpp
index 639cd80768f..ce556903dc0 100644
--- a/lib/CodeGen/MachineModuleInfo.cpp
+++ b/lib/CodeGen/MachineModuleInfo.cpp
@@ -206,10 +206,11 @@ MachineModuleInfo::~MachineModuleInfo() = default;
 bool MachineModuleInfo::doInitialization(Module &M) {
   ObjFileMMI = nullptr;
   CurCallSite = 0;
-  DbgInfoAvailable = UsesVAFloatArgument = UsesMorestackAddr = false;
+  UsesVAFloatArgument = UsesMorestackAddr = false;
   HasSplitStack = HasNosplitStack = false;
   AddrLabelSymbols = nullptr;
   TheModule = &M;
+  DbgInfoAvailable = !empty(M.debug_compile_units());
   return false;
 }
 
diff --git a/test/CodeGen/AArch64/fast-isel-dbg.ll b/test/CodeGen/AArch64/fast-isel-dbg.ll
new file mode 100644
index 00000000000..4d26b9142af
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-dbg.ll
@@ -0,0 +1,26 @@
+; RUN: llc -o - %s -fast-isel -stop-before=expand-isel-pseudos | FileCheck %s
+; Make sure fast-isel produces DBG_VALUE instructions even if no debug printer
+; is scheduled because of -stop-before.
+target triple="aarch64--"
+
+; CHECK-LABEL: name: func
+; CHECK: DBG_VALUE
+define void @func(i32 %a) !dbg !4 {
+  call void @llvm.dbg.declare(metadata i32 %a, metadata !5, metadata !DIExpression()), !dbg !7
+  ret void
+}
+
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #0
+attributes #0 = { nounwind readnone speculatable }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "fast-isel-dbg.ll", directory: "/")
+!2 = !{i32 2, !"Dwarf Version", i32 4}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = distinct !DISubprogram(name: "func", scope: null, isLocal: false, isDefinition: true, isOptimized: false, unit: !0)
+!5 = !DILocalVariable(name: "a", arg: 1, scope: !4, file: !1, line: 17, type: !6)
+!6 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned)
+!7 = !DILocation(line: 17, scope: !4)
-- 
GitLab


From 99c33171a136fe2ea2923ecbc3f22af99659c062 Mon Sep 17 00:00:00 2001
From: Daniel Sanders <daniel_l_sanders@apple.com>
Date: Wed, 31 Oct 2018 17:31:23 +0000
Subject: [PATCH 0810/1116] [globalisel][irtranslator] Verify that DILocations
 aren't lost in translation

Summary:
Also fix a couple bugs where DILocations are lost. EntryBuilder wasn't passing
on debug locations for PHI's, constants, GLOBAL_VALUE, etc.

Reviewers: aprantl, vsk, bogner, aditya_nandakumar, volkan, rtereshin, aemerson

Reviewed By: aemerson

Subscribers: aemerson, rovka, kristof.beyls, javed.absar, llvm-commits

Differential Revision: https://reviews.llvm.org/D53740

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345743 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/GlobalISel/IRTranslator.cpp       | 92 ++++++++++++++-----
 .../GlobalISel/irtranslator-dilocation.ll     | 51 ++++++++++
 2 files changed, 120 insertions(+), 23 deletions(-)
 create mode 100644 test/CodeGen/AArch64/GlobalISel/irtranslator-dilocation.ll

diff --git a/lib/CodeGen/GlobalISel/IRTranslator.cpp b/lib/CodeGen/GlobalISel/IRTranslator.cpp
index ab7d3a87975..aa5022cd397 100644
--- a/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -104,6 +104,36 @@ IRTranslator::IRTranslator() : MachineFunctionPass(ID) {
   initializeIRTranslatorPass(*PassRegistry::getPassRegistry());
 }
 
+#ifndef NDEBUG
+/// Verify that every instruction created has the same DILocation as the
+/// instruction being translated.
+class DILocationVerifier : MachineFunction::Delegate {
+  MachineFunction &MF;
+  const Instruction *CurrInst = nullptr;
+
+public:
+  DILocationVerifier(MachineFunction &MF) : MF(MF) { MF.setDelegate(this); }
+  ~DILocationVerifier() { MF.resetDelegate(this); }
+
+  const Instruction *getCurrentInst() const { return CurrInst; }
+  void setCurrentInst(const Instruction *Inst) { CurrInst = Inst; }
+
+  void MF_HandleInsertion(const MachineInstr &MI) override {
+    assert(getCurrentInst() && "Inserted instruction without a current MI");
+
+    // Only print the check message if we're actually checking it.
+#ifndef NDEBUG
+    LLVM_DEBUG(dbgs() << "Checking DILocation from " << *CurrInst
+                      << " was copied to " << MI);
+#endif
+    assert(CurrInst->getDebugLoc() == MI.getDebugLoc() &&
+           "Line info was not transferred to all instructions");
+  }
+  void MF_HandleRemoval(const MachineInstr &MI) override {}
+};
+#endif // ifndef NDEBUG
+
+
 void IRTranslator::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired<StackProtector>();
   AU.addRequired<TargetPassConfig>();
@@ -1468,9 +1498,16 @@ bool IRTranslator::translateAtomicRMW(const User &U,
 }
 
 void IRTranslator::finishPendingPhis() {
+#ifndef NDEBUG
+  DILocationVerifier Verifier(*MF);
+#endif // ifndef NDEBUG
   for (auto &Phi : PendingPHIs) {
     const PHINode *PI = Phi.first;
     ArrayRef<MachineInstr *> ComponentPHIs = Phi.second;
+    EntryBuilder.setDebugLoc(PI->getDebugLoc());
+#ifndef NDEBUG
+    Verifier.setCurrentInst(PI);
+#endif // ifndef NDEBUG
 
     // All MachineBasicBlocks exist, add them to the PHI. We assume IRTranslator
     // won't create extra control flow here, otherwise we need to find the
@@ -1509,6 +1546,7 @@ bool IRTranslator::valueIsSplit(const Value &V,
 
 bool IRTranslator::translate(const Instruction &Inst) {
   CurBuilder.setDebugLoc(Inst.getDebugLoc());
+  EntryBuilder.setDebugLoc(Inst.getDebugLoc());
   switch(Inst.getOpcode()) {
 #define HANDLE_INST(NUM, OPCODE, CLASS) \
     case Instruction::OPCODE: return translate##OPCODE(Inst, CurBuilder);
@@ -1684,31 +1722,39 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) {
   }
 
   // Need to visit defs before uses when translating instructions.
-  ReversePostOrderTraversal<const Function *> RPOT(&F);
-  for (const BasicBlock *BB : RPOT) {
-    MachineBasicBlock &MBB = getMBB(*BB);
-    // Set the insertion point of all the following translations to
-    // the end of this basic block.
-    CurBuilder.setMBB(MBB);
-
-    for (const Instruction &Inst : *BB) {
-      if (translate(Inst))
-        continue;
-
-      OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure",
-                                 Inst.getDebugLoc(), BB);
-      R << "unable to translate instruction: " << ore::NV("Opcode", &Inst);
-
-      if (ORE->allowExtraAnalysis("gisel-irtranslator")) {
-        std::string InstStrStorage;
-        raw_string_ostream InstStr(InstStrStorage);
-        InstStr << Inst;
+  {
+    ReversePostOrderTraversal<const Function *> RPOT(&F);
+#ifndef NDEBUG
+    DILocationVerifier Verifier(*MF);
+#endif // ifndef NDEBUG
+    for (const BasicBlock *BB : RPOT) {
+      MachineBasicBlock &MBB = getMBB(*BB);
+      // Set the insertion point of all the following translations to
+      // the end of this basic block.
+      CurBuilder.setMBB(MBB);
+
+      for (const Instruction &Inst : *BB) {
+#ifndef NDEBUG
+        Verifier.setCurrentInst(&Inst);
+#endif // ifndef NDEBUG
+        if (translate(Inst))
+          continue;
+
+        OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure",
+                                   Inst.getDebugLoc(), BB);
+        R << "unable to translate instruction: " << ore::NV("Opcode", &Inst);
+
+        if (ORE->allowExtraAnalysis("gisel-irtranslator")) {
+          std::string InstStrStorage;
+          raw_string_ostream InstStr(InstStrStorage);
+          InstStr << Inst;
+
+          R << ": '" << InstStr.str() << "'";
+        }
 
-        R << ": '" << InstStr.str() << "'";
+        reportTranslationError(*MF, *TPC, *ORE, R);
+        return false;
       }
-
-      reportTranslationError(*MF, *TPC, *ORE, R);
-      return false;
     }
   }
 
diff --git a/test/CodeGen/AArch64/GlobalISel/irtranslator-dilocation.ll b/test/CodeGen/AArch64/GlobalISel/irtranslator-dilocation.ll
new file mode 100644
index 00000000000..c6e47a3b05f
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/irtranslator-dilocation.ll
@@ -0,0 +1,51 @@
+; RUN: llc -O0 -mtriple=aarch64-apple-ios -global-isel -debug-only=irtranslator \
+; RUN:     -stop-after=irtranslator %s -o - 2>&1 | FileCheck %s
+
+; CHECK: Checking DILocation from   %retval = alloca i32, align 4 was copied to G_FRAME_INDEX
+; CHECK: Checking DILocation from   %rv = alloca i32, align 4 was copied to G_FRAME_INDEX
+; CHECK: Checking DILocation from   store i32 0, i32* %retval, align 4 was copied to G_CONSTANT
+; CHECK: Checking DILocation from   store i32 0, i32* %retval, align 4 was copied to G_STORE
+; CHECK: Checking DILocation from   store i32 0, i32* %rv, align 4, !dbg !12 was copied to G_STORE debug-location !12; t.cpp:2:5
+; CHECK: Checking DILocation from   %0 = load i32, i32* %rv, align 4, !dbg !13 was copied to G_LOAD debug-location !13; t.cpp:3:8
+; CHECK: Checking DILocation from   ret i32 %0, !dbg !14 was copied to COPY debug-location !14; t.cpp:3:1
+; CHECK: Checking DILocation from   ret i32 %0, !dbg !14 was copied to RET_ReallyLR implicit $w0, debug-location !14; t.cpp:3:1
+
+source_filename = "t.cpp"
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-unknown-linux-gnu"
+
+; Function Attrs: noinline norecurse nounwind optnone
+define dso_local i32 @main() !dbg !7 {
+entry:
+  %retval = alloca i32, align 4
+  %rv = alloca i32, align 4
+  store i32 0, i32* %retval, align 4
+  call void @llvm.dbg.declare(metadata i32* %rv, metadata !11, metadata !DIExpression()), !dbg !12
+  store i32 0, i32* %rv, align 4, !dbg !12
+  %0 = load i32, i32* %rv, align 4, !dbg !13
+  ret i32 %0, !dbg !14
+}
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 8.0.0 (trunk) (llvm/trunk 344296)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None)
+!1 = !DIFile(filename: "t.cpp", directory: "/Volumes/Data/llvm.org/svn/build")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{!"clang version 8.0.0 (trunk) (llvm/trunk 344296)"}
+!7 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
+!8 = !DISubroutineType(types: !9)
+!9 = !{!10}
+!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!11 = !DILocalVariable(name: "rv", scope: !7, file: !1, line: 2, type: !10)
+!12 = !DILocation(line: 2, column: 5, scope: !7)
+!13 = !DILocation(line: 3, column: 8, scope: !7)
+!14 = !DILocation(line: 3, column: 1, scope: !7)
+
-- 
GitLab


From 8fa1a87ff8c245b0e7dd2387c4514ab20090383e Mon Sep 17 00:00:00 2001
From: Nicolai Haehnle <nhaehnle@gmail.com>
Date: Wed, 31 Oct 2018 17:46:21 +0000
Subject: [PATCH 0811/1116] TableGen: Fix ASAN error

Summary:
As a bonus, this arguably improves the code by making it simpler.

gcc 8 on Ubuntu 18.10 reports the following:

==39667==ERROR: AddressSanitizer: stack-use-after-scope on address 0x7fffffff8ae0 at pc 0x555555dbfc68 bp 0x7fffffff8760 sp 0x7fffffff8750
WRITE of size 8 at 0x7fffffff8ae0 thread T0
    #0 0x555555dbfc67 in std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Alloc_hider::_Alloc_hider(char*, std::allocator<char>&&) /usr/include/c++/8/bits/basic_string.h:149
    #1 0x555555dbfc67 in std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >&&) /usr/include/c++/8/bits/basic_string.h:542
    #2 0x555555dbfc67 in std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > std::operator+<char, std::char_traits<char>, std::allocator<char> >(char const*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >&&) /usr/include/c++/8/bits/basic_string.h:6009
    #3 0x555555dbfc67 in searchableFieldType /home/nha/amd/build/san/llvm-src/utils/TableGen/SearchableTableEmitter.cpp:168
    (...)

Address 0x7fffffff8ae0 is located in stack of thread T0 at offset 864 in frame
    #0 0x555555dbef3f in searchableFieldType /home/nha/amd/build/san/llvm-src/utils/TableGen/SearchableTableEmitter.cpp:148

Reviewers: fhahn, simon_tatham, kparzysz

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D53931

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345749 91177308-0d34-0410-b5e6-96231b3b80d8
---
 utils/TableGen/SearchableTableEmitter.cpp | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/utils/TableGen/SearchableTableEmitter.cpp b/utils/TableGen/SearchableTableEmitter.cpp
index 61c918bd014..f98a7c74bf0 100644
--- a/utils/TableGen/SearchableTableEmitter.cpp
+++ b/utils/TableGen/SearchableTableEmitter.cpp
@@ -155,17 +155,15 @@ private:
     } else if (BitsRecTy *BI = dyn_cast<BitsRecTy>(Field.RecType)) {
       unsigned NumBits = BI->getNumBits();
       if (NumBits <= 8)
-        NumBits = 8;
-      else if (NumBits <= 16)
-        NumBits = 16;
-      else if (NumBits <= 32)
-        NumBits = 32;
-      else if (NumBits <= 64)
-        NumBits = 64;
-      else
-        PrintFatalError(Twine("bitfield '") + Field.Name +
-                        "' too large to search");
-      return "uint" + utostr(NumBits) + "_t";
+        return "uint8_t";
+      if (NumBits <= 16)
+        return "uint16_t";
+      if (NumBits <= 32)
+        return "uint32_t";
+      if (NumBits <= 64)
+        return "uint64_t";
+      PrintFatalError(Twine("bitfield '") + Field.Name +
+                      "' too large to search");
     } else if (Field.Enum || Field.IsIntrinsic || Field.IsInstruction)
       return "unsigned";
     PrintFatalError(Twine("Field '") + Field.Name + "' has unknown type '" +
-- 
GitLab


From ec248eb35c4c3e8c32aff2768d0c64ec0a62032e Mon Sep 17 00:00:00 2001
From: Matt Davis <Matthew.Davis@sony.com>
Date: Wed, 31 Oct 2018 17:47:25 +0000
Subject: [PATCH 0812/1116] [llvm-mca] Remove the verb 'assemble' from a few
 options in help. NFC.

* MCA does not assemble anything.
* Ran clang-format.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345750 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-mca/llvm-mca.cpp | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/tools/llvm-mca/llvm-mca.cpp b/tools/llvm-mca/llvm-mca.cpp
index 8f4e0717bd2..b3a4c495d7e 100644
--- a/tools/llvm-mca/llvm-mca.cpp
+++ b/tools/llvm-mca/llvm-mca.cpp
@@ -69,15 +69,13 @@ static cl::opt<std::string> OutputFilename("o", cl::desc("Output filename"),
                                            cl::value_desc("filename"));
 
 static cl::opt<std::string>
-    ArchName("march",
-             cl::desc("Target arch to assemble for, "
-                      "see -version for available targets"),
+    ArchName("march", cl::desc("Target architecture. "
+                               "See -version for available targets"),
              cl::cat(ToolOptions));
 
 static cl::opt<std::string>
     TripleName("mtriple",
-               cl::desc("Target triple to assemble for, "
-                        "see -version for available targets"),
+               cl::desc("Target triple. See -version for available targets"),
                cl::cat(ToolOptions));
 
 static cl::opt<std::string>
@@ -503,17 +501,20 @@ int main(int argc, char **argv) {
     ArrayRef<MCInst> Insts = Region->getInstructions();
     std::vector<std::unique_ptr<mca::Instruction>> LoweredSequence;
     for (const MCInst &MCI : Insts) {
-      llvm::Expected<std::unique_ptr<mca::Instruction>> Inst = IB.createInstruction(MCI);
+      llvm::Expected<std::unique_ptr<mca::Instruction>> Inst =
+          IB.createInstruction(MCI);
       if (!Inst) {
-        if (auto NewE = handleErrors(Inst.takeError(),
-            [&IP, &STI](const mca::InstructionError<MCInst> &IE) {
-              std::string InstructionStr;
-              raw_string_ostream SS(InstructionStr);
-              WithColor::error() << IE.Message << '\n';
-              IP->printInst(&IE.Inst, SS, "", *STI);
-              SS.flush();
-              WithColor::note() << "instruction: " << InstructionStr << '\n';
-            })) {
+        if (auto NewE = handleErrors(
+                Inst.takeError(),
+                [&IP, &STI](const mca::InstructionError<MCInst> &IE) {
+                  std::string InstructionStr;
+                  raw_string_ostream SS(InstructionStr);
+                  WithColor::error() << IE.Message << '\n';
+                  IP->printInst(&IE.Inst, SS, "", *STI);
+                  SS.flush();
+                  WithColor::note() << "instruction: " << InstructionStr
+                                    << '\n';
+                })) {
           // Default case.
           WithColor::error() << toString(std::move(NewE));
         }
@@ -523,8 +524,7 @@ int main(int argc, char **argv) {
       LoweredSequence.emplace_back(std::move(Inst.get()));
     }
 
-    mca::SourceMgr S(LoweredSequence,
-                     PrintInstructionTables ? 1 : Iterations);
+    mca::SourceMgr S(LoweredSequence, PrintInstructionTables ? 1 : Iterations);
 
     if (PrintInstructionTables) {
       //  Create a pipeline, stages, and a printer.
-- 
GitLab


From b5378372b8a7fd5d30a26980869516ed91bd621e Mon Sep 17 00:00:00 2001
From: Volkan Keles <vkeles@apple.com>
Date: Wed, 31 Oct 2018 17:50:52 +0000
Subject: [PATCH 0813/1116] [InstCombine] Combine nested min/max intrinsics
 with constants

Reviewers: arsenm, spatel

Reviewed By: spatel

Subscribers: lebedev.ri, wdng, llvm-commits

Differential Revision: https://reviews.llvm.org/D53774

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345751 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstCombine/InstCombineCalls.cpp          | 36 ++++++++++++++++++-
 test/Transforms/InstCombine/maximum.ll        | 18 ++++------
 test/Transforms/InstCombine/maxnum.ll         | 18 ++++------
 test/Transforms/InstCombine/minimum.ll        | 18 ++++------
 test/Transforms/InstCombine/minnum.ll         | 18 ++++------
 5 files changed, 59 insertions(+), 49 deletions(-)

diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 116b11386f0..bdd9a43be27 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -2032,6 +2032,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
       return II;
     }
 
+    Intrinsic::ID IID = II->getIntrinsicID();
     Value *X, *Y;
     if (match(Arg0, m_FNeg(m_Value(X))) && match(Arg1, m_FNeg(m_Value(Y))) &&
         (Arg0->hasOneUse() || Arg1->hasOneUse())) {
@@ -2039,7 +2040,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
       // min(-X, -Y) --> -(max(X, Y))
       // max(-X, -Y) --> -(min(X, Y))
       Intrinsic::ID NewIID;
-      switch (II->getIntrinsicID()) {
+      switch (IID) {
       case Intrinsic::maxnum:
         NewIID = Intrinsic::minnum;
         break;
@@ -2060,6 +2061,39 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
       FNeg->copyIRFlags(II);
       return FNeg;
     }
+
+    // m(m(X, C2), C1) -> m(X, C)
+    const APFloat *C1, *C2;
+    if (auto *M = dyn_cast<IntrinsicInst>(Arg0)) {
+      if (M->getIntrinsicID() == IID && match(Arg1, m_APFloat(C1)) &&
+          ((match(M->getArgOperand(0), m_Value(X)) &&
+            match(M->getArgOperand(1), m_APFloat(C2))) ||
+           (match(M->getArgOperand(1), m_Value(X)) &&
+            match(M->getArgOperand(0), m_APFloat(C2))))) {
+        APFloat Res(0.0);
+        switch (IID) {
+        case Intrinsic::maxnum:
+          Res = maxnum(*C1, *C2);
+          break;
+        case Intrinsic::minnum:
+          Res = minnum(*C1, *C2);
+          break;
+        case Intrinsic::maximum:
+          Res = maximum(*C1, *C2);
+          break;
+        case Intrinsic::minimum:
+          Res = minimum(*C1, *C2);
+          break;
+        default:
+          llvm_unreachable("unexpected intrinsic ID");
+        }
+        Instruction *NewCall = Builder.CreateBinaryIntrinsic(
+            IID, X, ConstantFP::get(Arg0->getType(), Res));
+        NewCall->copyIRFlags(II);
+        return replaceInstUsesWith(*II, NewCall);
+      }
+    }
+
     break;
   }
   case Intrinsic::fmuladd: {
diff --git a/test/Transforms/InstCombine/maximum.ll b/test/Transforms/InstCombine/maximum.ll
index 1b3114788a5..bd97a3794d4 100644
--- a/test/Transforms/InstCombine/maximum.ll
+++ b/test/Transforms/InstCombine/maximum.ll
@@ -147,8 +147,7 @@ define float @maximum_f32_val_nan(float %x) {
 
 define float @maximum_f32_1_maximum_val_p0(float %x) {
 ; CHECK-LABEL: @maximum_f32_1_maximum_val_p0(
-; CHECK-NEXT: [[Y:%.*]] = call float @llvm.maximum.f32(float %x, float 0.000000e+00)
-; CHECK-NEXT: [[RES:%.*]] = call float @llvm.maximum.f32(float [[Y]], float 1.000000e+00)
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.maximum.f32(float %x, float 1.000000e+00)
 ; CHECK-NEXT: ret float [[RES]]
   %y = call float @llvm.maximum.f32(float %x, float 0.0)
   %z = call float @llvm.maximum.f32(float %y, float 1.0)
@@ -157,8 +156,7 @@ define float @maximum_f32_1_maximum_val_p0(float %x) {
 
 define float @maximum_f32_1_maximum_p0_val_fast(float %x) {
 ; CHECK-LABEL: @maximum_f32_1_maximum_p0_val_fast(
-; CHECK-NEXT: [[Y:%.*]] = call float @llvm.maximum.f32(float %x, float 0.000000e+00)
-; CHECK-NEXT: [[RES:%.*]] = call fast float @llvm.maximum.f32(float [[Y]], float 1.000000e+00)
+; CHECK-NEXT: [[RES:%.*]] = call fast float @llvm.maximum.f32(float %x, float 1.000000e+00)
 ; CHECK-NEXT: ret float [[RES]]
   %y = call float @llvm.maximum.f32(float 0.0, float %x)
   %z = call fast float @llvm.maximum.f32(float %y, float 1.0)
@@ -167,8 +165,7 @@ define float @maximum_f32_1_maximum_p0_val_fast(float %x) {
 
 define float @maximum_f32_1_maximum_p0_val_nnan_ninf(float %x) {
 ; CHECK-LABEL: @maximum_f32_1_maximum_p0_val_nnan_ninf(
-; CHECK-NEXT: [[Y:%.*]] = call float @llvm.maximum.f32(float %x, float 0.000000e+00)
-; CHECK-NEXT: [[RES:%.*]] = call nnan ninf float @llvm.maximum.f32(float [[Y]], float 1.000000e+00)
+; CHECK-NEXT: [[RES:%.*]] = call nnan ninf float @llvm.maximum.f32(float %x, float 1.000000e+00)
 ; CHECK-NEXT: ret float [[RES]]
   %y = call float @llvm.maximum.f32(float 0.0, float %x)
   %z = call nnan ninf float @llvm.maximum.f32(float %y, float 1.0)
@@ -177,8 +174,7 @@ define float @maximum_f32_1_maximum_p0_val_nnan_ninf(float %x) {
 
 define float @maximum_f32_p0_maximum_val_n0(float %x) {
 ; CHECK-LABEL: @maximum_f32_p0_maximum_val_n0(
-; CHECK-NEXT: [[Y:%.*]] = call float @llvm.maximum.f32(float %x, float -0.000000e+00)
-; CHECK-NEXT: [[RES:%.*]] = call float @llvm.maximum.f32(float [[Y]], float 0.000000e+00)
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.maximum.f32(float %x, float 0.000000e+00)
 ; CHECK-NEXT: ret float [[RES]]
   %y = call float @llvm.maximum.f32(float %x, float -0.0)
   %z = call float @llvm.maximum.f32(float %y, float 0.0)
@@ -187,8 +183,7 @@ define float @maximum_f32_p0_maximum_val_n0(float %x) {
 
 define float @maximum_f32_1_maximum_p0_val(float %x) {
 ; CHECK-LABEL: @maximum_f32_1_maximum_p0_val(
-; CHECK-NEXT: [[Y:%.*]] = call float @llvm.maximum.f32(float %x, float 0.000000e+00)
-; CHECK-NEXT: [[RES:%.*]] = call float @llvm.maximum.f32(float [[Y]], float 1.000000e+00)
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.maximum.f32(float %x, float 1.000000e+00)
 ; CHECK-NEXT: ret float [[RES]]
   %y = call float @llvm.maximum.f32(float 0.0, float %x)
   %z = call float @llvm.maximum.f32(float %y, float 1.0)
@@ -197,8 +192,7 @@ define float @maximum_f32_1_maximum_p0_val(float %x) {
 
 define <2 x float> @maximum_f32_1_maximum_val_p0_val_v2f32(<2 x float> %x) {
 ; CHECK-LABEL: @maximum_f32_1_maximum_val_p0_val_v2f32(
-; CHECK-NEXT: [[Y:%.*]] = call <2 x float> @llvm.maximum.v2f32(<2 x float> %x, <2 x float> zeroinitializer)
-; CHECK-NEXT: [[RES:%.*]] = call <2 x float> @llvm.maximum.v2f32(<2 x float> [[Y]], <2 x float> <float 1.000000e+00, float 1.000000e+00>)
+; CHECK-NEXT: [[RES:%.*]] = call <2 x float> @llvm.maximum.v2f32(<2 x float> %x, <2 x float> <float 1.000000e+00, float 1.000000e+00>)
 ; CHECK-NEXT: ret <2 x float> [[RES]]
   %y = call <2 x float> @llvm.maximum.v2f32(<2 x float> %x, <2 x float> zeroinitializer)
   %z = call <2 x float> @llvm.maximum.v2f32(<2 x float> %y, <2 x float><float 1.0, float 1.0>)
diff --git a/test/Transforms/InstCombine/maxnum.ll b/test/Transforms/InstCombine/maxnum.ll
index a621d99ac1f..d81158c066f 100644
--- a/test/Transforms/InstCombine/maxnum.ll
+++ b/test/Transforms/InstCombine/maxnum.ll
@@ -147,8 +147,7 @@ define float @maxnum_f32_val_nan(float %x) {
 
 define float @maxnum_f32_1_maxnum_val_p0(float %x) {
 ; CHECK-LABEL: @maxnum_f32_1_maxnum_val_p0(
-; CHECK-NEXT: [[Y:%.*]] = call float @llvm.maxnum.f32(float %x, float 0.000000e+00)
-; CHECK-NEXT: [[RES:%.*]] = call float @llvm.maxnum.f32(float [[Y]], float 1.000000e+00)
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.maxnum.f32(float %x, float 1.000000e+00)
 ; CHECK-NEXT: ret float [[RES]]
   %y = call float @llvm.maxnum.f32(float %x, float 0.0)
   %z = call float @llvm.maxnum.f32(float %y, float 1.0)
@@ -157,8 +156,7 @@ define float @maxnum_f32_1_maxnum_val_p0(float %x) {
 
 define float @maxnum_f32_1_maxnum_p0_val_fast(float %x) {
 ; CHECK-LABEL: @maxnum_f32_1_maxnum_p0_val_fast(
-; CHECK-NEXT: [[Y:%.*]] = call float @llvm.maxnum.f32(float %x, float 0.000000e+00)
-; CHECK-NEXT: [[RES:%.*]] = call fast float @llvm.maxnum.f32(float [[Y]], float 1.000000e+00)
+; CHECK-NEXT: [[RES:%.*]] = call fast float @llvm.maxnum.f32(float %x, float 1.000000e+00)
 ; CHECK-NEXT: ret float [[RES]]
   %y = call float @llvm.maxnum.f32(float 0.0, float %x)
   %z = call fast float @llvm.maxnum.f32(float %y, float 1.0)
@@ -167,8 +165,7 @@ define float @maxnum_f32_1_maxnum_p0_val_fast(float %x) {
 
 define float @maxnum_f32_1_maxnum_p0_val_nnan_ninf(float %x) {
 ; CHECK-LABEL: @maxnum_f32_1_maxnum_p0_val_nnan_ninf(
-; CHECK-NEXT: [[Y:%.*]] = call float @llvm.maxnum.f32(float %x, float 0.000000e+00)
-; CHECK-NEXT: [[RES:%.*]] = call nnan ninf float @llvm.maxnum.f32(float [[Y]], float 1.000000e+00)
+; CHECK-NEXT: [[RES:%.*]] = call nnan ninf float @llvm.maxnum.f32(float %x, float 1.000000e+00)
 ; CHECK-NEXT: ret float [[RES]]
   %y = call float @llvm.maxnum.f32(float 0.0, float %x)
   %z = call nnan ninf float @llvm.maxnum.f32(float %y, float 1.0)
@@ -177,8 +174,7 @@ define float @maxnum_f32_1_maxnum_p0_val_nnan_ninf(float %x) {
 
 define float @maxnum_f32_p0_maxnum_val_n0(float %x) {
 ; CHECK-LABEL: @maxnum_f32_p0_maxnum_val_n0(
-; CHECK-NEXT: [[Y:%.*]] = call float @llvm.maxnum.f32(float %x, float -0.000000e+00)
-; CHECK-NEXT: [[RES:%.*]] = call float @llvm.maxnum.f32(float [[Y]], float 0.000000e+00)
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.maxnum.f32(float %x, float 0.000000e+00)
 ; CHECK-NEXT: ret float [[RES]]
   %y = call float @llvm.maxnum.f32(float %x, float -0.0)
   %z = call float @llvm.maxnum.f32(float %y, float 0.0)
@@ -187,8 +183,7 @@ define float @maxnum_f32_p0_maxnum_val_n0(float %x) {
 
 define float @maxnum_f32_1_maxnum_p0_val(float %x) {
 ; CHECK-LABEL: @maxnum_f32_1_maxnum_p0_val(
-; CHECK-NEXT: [[Y:%.*]] = call float @llvm.maxnum.f32(float %x, float 0.000000e+00)
-; CHECK-NEXT: [[RES:%.*]] = call float @llvm.maxnum.f32(float [[Y]], float 1.000000e+00)
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.maxnum.f32(float %x, float 1.000000e+00)
 ; CHECK-NEXT: ret float [[RES]]
   %y = call float @llvm.maxnum.f32(float 0.0, float %x)
   %z = call float @llvm.maxnum.f32(float %y, float 1.0)
@@ -197,8 +192,7 @@ define float @maxnum_f32_1_maxnum_p0_val(float %x) {
 
 define <2 x float> @maxnum_f32_1_maxnum_val_p0_val_v2f32(<2 x float> %x) {
 ; CHECK-LABEL: @maxnum_f32_1_maxnum_val_p0_val_v2f32(
-; CHECK-NEXT: [[Y:%.*]] = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %x, <2 x float> zeroinitializer)
-; CHECK-NEXT: [[RES:%.*]] = call <2 x float> @llvm.maxnum.v2f32(<2 x float> [[Y]], <2 x float> <float 1.000000e+00, float 1.000000e+00>)
+; CHECK-NEXT: [[RES:%.*]] = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %x, <2 x float> <float 1.000000e+00, float 1.000000e+00>)
 ; CHECK-NEXT: ret <2 x float> [[RES]]
   %y = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %x, <2 x float> zeroinitializer)
   %z = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %y, <2 x float><float 1.0, float 1.0>)
diff --git a/test/Transforms/InstCombine/minimum.ll b/test/Transforms/InstCombine/minimum.ll
index 6485a27fb52..32aae6417eb 100644
--- a/test/Transforms/InstCombine/minimum.ll
+++ b/test/Transforms/InstCombine/minimum.ll
@@ -149,8 +149,7 @@ define float @minimum_f32_val_nan(float %x) {
 
 define float @minimum_f32_1_minimum_val_p0(float %x) {
 ; CHECK-LABEL: @minimum_f32_1_minimum_val_p0(
-; CHECK-NEXT: [[Y:%.*]] = call float @llvm.minimum.f32(float %x, float 0.000000e+00)
-; CHECK-NEXT: [[RES:%.*]] = call float @llvm.minimum.f32(float [[Y]], float 1.000000e+00)
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.minimum.f32(float %x, float 0.000000e+00)
 ; CHECK-NEXT: ret float [[RES]]
   %y = call float @llvm.minimum.f32(float %x, float 0.0)
   %z = call float @llvm.minimum.f32(float %y, float 1.0)
@@ -159,8 +158,7 @@ define float @minimum_f32_1_minimum_val_p0(float %x) {
 
 define float @minimum_f32_1_minimum_p0_val_fast(float %x) {
 ; CHECK-LABEL: @minimum_f32_1_minimum_p0_val_fast(
-; CHECK-NEXT: [[Y:%.*]] = call float @llvm.minimum.f32(float %x, float 0.000000e+00)
-; CHECK-NEXT: [[RES:%.*]] = call fast float @llvm.minimum.f32(float [[Y]], float 1.000000e+00)
+; CHECK-NEXT: [[RES:%.*]] = call fast float @llvm.minimum.f32(float %x, float 0.000000e+00)
 ; CHECK-NEXT: ret float [[RES]]
   %y = call float @llvm.minimum.f32(float 0.0, float %x)
   %z = call fast float @llvm.minimum.f32(float %y, float 1.0)
@@ -169,8 +167,7 @@ define float @minimum_f32_1_minimum_p0_val_fast(float %x) {
 
 define float @minimum_f32_1_minimum_p0_val_nnan_ninf(float %x) {
 ; CHECK-LABEL: @minimum_f32_1_minimum_p0_val_nnan_ninf(
-; CHECK-NEXT: [[Y:%.*]] = call float @llvm.minimum.f32(float %x, float 0.000000e+00)
-; CHECK-NEXT: [[RES:%.*]] = call nnan ninf float @llvm.minimum.f32(float [[Y]], float 1.000000e+00)
+; CHECK-NEXT: [[RES:%.*]] = call nnan ninf float @llvm.minimum.f32(float %x, float 0.000000e+00)
 ; CHECK-NEXT: ret float [[RES]]
   %y = call float @llvm.minimum.f32(float 0.0, float %x)
   %z = call nnan ninf float @llvm.minimum.f32(float %y, float 1.0)
@@ -179,8 +176,7 @@ define float @minimum_f32_1_minimum_p0_val_nnan_ninf(float %x) {
 
 define float @minimum_f32_p0_minimum_val_n0(float %x) {
 ; CHECK-LABEL: @minimum_f32_p0_minimum_val_n0(
-; CHECK-NEXT: [[Y:%.*]] = call float @llvm.minimum.f32(float %x, float -0.000000e+00)
-; CHECK-NEXT: [[RES:%.*]] = call float @llvm.minimum.f32(float [[Y]], float 0.000000e+00)
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.minimum.f32(float %x, float -0.000000e+00)
 ; CHECK-NEXT: ret float [[RES]]
   %y = call float @llvm.minimum.f32(float %x, float -0.0)
   %z = call float @llvm.minimum.f32(float %y, float 0.0)
@@ -189,8 +185,7 @@ define float @minimum_f32_p0_minimum_val_n0(float %x) {
 
 define float @minimum_f32_1_minimum_p0_val(float %x) {
 ; CHECK-LABEL: @minimum_f32_1_minimum_p0_val(
-; CHECK-NEXT: [[Y:%.*]] = call float @llvm.minimum.f32(float %x, float 0.000000e+00)
-; CHECK-NEXT: [[RES:%.*]] = call float @llvm.minimum.f32(float [[Y]], float 1.000000e+00)
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.minimum.f32(float %x, float 0.000000e+00)
 ; CHECK-NEXT: ret float [[RES]]
   %y = call float @llvm.minimum.f32(float 0.0, float %x)
   %z = call float @llvm.minimum.f32(float %y, float 1.0)
@@ -199,8 +194,7 @@ define float @minimum_f32_1_minimum_p0_val(float %x) {
 
 define <2 x float> @minimum_f32_1_minimum_val_p0_val_v2f32(<2 x float> %x) {
 ; CHECK-LABEL: @minimum_f32_1_minimum_val_p0_val_v2f32(
-; CHECK-NEXT: [[Y:%.*]] = call <2 x float> @llvm.minimum.v2f32(<2 x float> %x, <2 x float> zeroinitializer)
-; CHECK-NEXT: [[RES:%.*]] = call <2 x float> @llvm.minimum.v2f32(<2 x float> [[Y]], <2 x float> <float 1.000000e+00, float 1.000000e+00>)
+; CHECK-NEXT: [[RES:%.*]] = call <2 x float> @llvm.minimum.v2f32(<2 x float> %x, <2 x float> zeroinitializer)
 ; CHECK-NEXT: ret <2 x float> [[RES]]
   %y = call <2 x float> @llvm.minimum.v2f32(<2 x float> %x, <2 x float> zeroinitializer)
   %z = call <2 x float> @llvm.minimum.v2f32(<2 x float> %y, <2 x float><float 1.0, float 1.0>)
diff --git a/test/Transforms/InstCombine/minnum.ll b/test/Transforms/InstCombine/minnum.ll
index 00cf66103a9..73b4f0c9251 100644
--- a/test/Transforms/InstCombine/minnum.ll
+++ b/test/Transforms/InstCombine/minnum.ll
@@ -149,8 +149,7 @@ define float @minnum_f32_val_nan(float %x) {
 
 define float @minnum_f32_1_minnum_val_p0(float %x) {
 ; CHECK-LABEL: @minnum_f32_1_minnum_val_p0(
-; CHECK-NEXT: [[Y:%.*]] = call float @llvm.minnum.f32(float %x, float 0.000000e+00)
-; CHECK-NEXT: [[RES:%.*]] = call float @llvm.minnum.f32(float [[Y]], float 1.000000e+00)
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.minnum.f32(float %x, float 0.000000e+00)
 ; CHECK-NEXT: ret float [[RES]]
   %y = call float @llvm.minnum.f32(float %x, float 0.0)
   %z = call float @llvm.minnum.f32(float %y, float 1.0)
@@ -159,8 +158,7 @@ define float @minnum_f32_1_minnum_val_p0(float %x) {
 
 define float @minnum_f32_1_minnum_p0_val_fast(float %x) {
 ; CHECK-LABEL: @minnum_f32_1_minnum_p0_val_fast(
-; CHECK-NEXT: [[Y:%.*]] = call float @llvm.minnum.f32(float %x, float 0.000000e+00)
-; CHECK-NEXT: [[RES:%.*]] = call fast float @llvm.minnum.f32(float [[Y]], float 1.000000e+00)
+; CHECK-NEXT: [[RES:%.*]] = call fast float @llvm.minnum.f32(float %x, float 0.000000e+00)
 ; CHECK-NEXT: ret float [[RES]]
   %y = call float @llvm.minnum.f32(float 0.0, float %x)
   %z = call fast float @llvm.minnum.f32(float %y, float 1.0)
@@ -169,8 +167,7 @@ define float @minnum_f32_1_minnum_p0_val_fast(float %x) {
 
 define float @minnum_f32_1_minnum_p0_val_nnan_ninf(float %x) {
 ; CHECK-LABEL: @minnum_f32_1_minnum_p0_val_nnan_ninf(
-; CHECK-NEXT: [[Y:%.*]] = call float @llvm.minnum.f32(float %x, float 0.000000e+00)
-; CHECK-NEXT: [[RES:%.*]] = call nnan ninf float @llvm.minnum.f32(float [[Y]], float 1.000000e+00)
+; CHECK-NEXT: [[RES:%.*]] = call nnan ninf float @llvm.minnum.f32(float %x, float 0.000000e+00)
 ; CHECK-NEXT: ret float [[RES]]
   %y = call float @llvm.minnum.f32(float 0.0, float %x)
   %z = call nnan ninf float @llvm.minnum.f32(float %y, float 1.0)
@@ -179,8 +176,7 @@ define float @minnum_f32_1_minnum_p0_val_nnan_ninf(float %x) {
 
 define float @minnum_f32_p0_minnum_val_n0(float %x) {
 ; CHECK-LABEL: @minnum_f32_p0_minnum_val_n0(
-; CHECK-NEXT: [[Y:%.*]] = call float @llvm.minnum.f32(float %x, float -0.000000e+00)
-; CHECK-NEXT: [[RES:%.*]] = call float @llvm.minnum.f32(float [[Y]], float 0.000000e+00)
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.minnum.f32(float %x, float 0.000000e+00)
 ; CHECK-NEXT: ret float [[RES]]
   %y = call float @llvm.minnum.f32(float %x, float -0.0)
   %z = call float @llvm.minnum.f32(float %y, float 0.0)
@@ -189,8 +185,7 @@ define float @minnum_f32_p0_minnum_val_n0(float %x) {
 
 define float @minnum_f32_1_minnum_p0_val(float %x) {
 ; CHECK-LABEL: @minnum_f32_1_minnum_p0_val(
-; CHECK-NEXT: [[Y:%.*]] = call float @llvm.minnum.f32(float %x, float 0.000000e+00)
-; CHECK-NEXT: [[RES:%.*]] = call float @llvm.minnum.f32(float [[Y]], float 1.000000e+00)
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.minnum.f32(float %x, float 0.000000e+00)
 ; CHECK-NEXT: ret float [[RES]]
   %y = call float @llvm.minnum.f32(float 0.0, float %x)
   %z = call float @llvm.minnum.f32(float %y, float 1.0)
@@ -199,8 +194,7 @@ define float @minnum_f32_1_minnum_p0_val(float %x) {
 
 define <2 x float> @minnum_f32_1_minnum_val_p0_val_v2f32(<2 x float> %x) {
 ; CHECK-LABEL: @minnum_f32_1_minnum_val_p0_val_v2f32(
-; CHECK-NEXT: [[Y:%.*]] = call <2 x float> @llvm.minnum.v2f32(<2 x float> %x, <2 x float> zeroinitializer)
-; CHECK-NEXT: [[RES:%.*]] = call <2 x float> @llvm.minnum.v2f32(<2 x float> [[Y]], <2 x float> <float 1.000000e+00, float 1.000000e+00>)
+; CHECK-NEXT: [[RES:%.*]] = call <2 x float> @llvm.minnum.v2f32(<2 x float> %x, <2 x float> zeroinitializer)
 ; CHECK-NEXT: ret <2 x float> [[RES]]
   %y = call <2 x float> @llvm.minnum.v2f32(<2 x float> %x, <2 x float> zeroinitializer)
   %z = call <2 x float> @llvm.minnum.v2f32(<2 x float> %y, <2 x float><float 1.0, float 1.0>)
-- 
GitLab


From da002c739749e7f1939114f1b3f019cc749f9fd3 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Wed, 31 Oct 2018 17:55:40 +0000
Subject: [PATCH 0814/1116] [InstCombine] add tests for fcmp with -0.0; NFC
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

From IEEE754: "Comparisons shall ignore the sign of zero (so +0 = −0)."


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345752 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Transforms/InstCombine/fcmp-special.ll | 57 +++++++++++++++++++++
 1 file changed, 57 insertions(+)

diff --git a/test/Transforms/InstCombine/fcmp-special.ll b/test/Transforms/InstCombine/fcmp-special.ll
index 8d131b3c2a6..5d4cc9a8616 100644
--- a/test/Transforms/InstCombine/fcmp-special.ll
+++ b/test/Transforms/InstCombine/fcmp-special.ll
@@ -161,6 +161,7 @@ define <2 x i1> @ord_vec_with_undef(<2 x double> %x) {
   %f = fcmp ord <2 x double> %x, <double 0.0, double undef>
   ret <2 x i1> %f
 }
+
 ; TODO: This could be handled in InstSimplify.
 
 define i1 @nnan_ops_to_fcmp_ord(float %x, float %y) {
@@ -185,3 +186,59 @@ define i1 @nnan_ops_to_fcmp_uno(float %x, float %y) {
   ret i1 %cmp
 }
 
+; TODO: For any predicate/type/FMF, comparison to -0.0 is the same as comparison to +0.0.
+
+define i1 @negative_zero_oeq(float %x) {
+; CHECK-LABEL: @negative_zero_oeq(
+; CHECK-NEXT:    [[R:%.*]] = fcmp oeq float [[X:%.*]], -0.000000e+00
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %r = fcmp oeq float %x, -0.0
+  ret i1 %r
+}
+
+define i1 @negative_zero_oge(double %x) {
+; CHECK-LABEL: @negative_zero_oge(
+; CHECK-NEXT:    [[R:%.*]] = fcmp nnan oge double [[X:%.*]], -0.000000e+00
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %r = fcmp nnan oge double %x, -0.0
+  ret i1 %r
+}
+
+define i1 @negative_zero_uge(half %x) {
+; CHECK-LABEL: @negative_zero_uge(
+; CHECK-NEXT:    [[R:%.*]] = fcmp fast uge half [[X:%.*]], 0xH8000
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %r = fcmp fast uge half %x, -0.0
+  ret i1 %r
+}
+
+define <2 x i1> @negative_zero_olt_vec(<2 x float> %x) {
+; CHECK-LABEL: @negative_zero_olt_vec(
+; CHECK-NEXT:    [[R:%.*]] = fcmp reassoc ninf olt <2 x float> [[X:%.*]], <float -0.000000e+00, float -0.000000e+00>
+; CHECK-NEXT:    ret <2 x i1> [[R]]
+;
+  %r = fcmp reassoc ninf olt <2 x float> %x, <float -0.0, float -0.0>
+  ret <2 x i1> %r
+}
+
+define <2 x i1> @negative_zero_une_vec_undef(<2 x double> %x) {
+; CHECK-LABEL: @negative_zero_une_vec_undef(
+; CHECK-NEXT:    [[R:%.*]] = fcmp nnan une <2 x double> [[X:%.*]], <double -0.000000e+00, double undef>
+; CHECK-NEXT:    ret <2 x i1> [[R]]
+;
+  %r = fcmp nnan une <2 x double> %x, <double -0.0, double undef>
+  ret <2 x i1> %r
+}
+
+define <2 x i1> @negative_zero_ule_vec_mixed(<2 x float> %x) {
+; CHECK-LABEL: @negative_zero_ule_vec_mixed(
+; CHECK-NEXT:    [[R:%.*]] = fcmp ule <2 x float> [[X:%.*]], <float 0.000000e+00, float -0.000000e+00>
+; CHECK-NEXT:    ret <2 x i1> [[R]]
+;
+  %r = fcmp ule <2 x float> %x, <float 0.0, float -0.0>
+  ret <2 x i1> %r
+}
+
-- 
GitLab


From 39aa3ef71c833a6f2f668b2be946494cf49eaac0 Mon Sep 17 00:00:00 2001
From: Daniel Sanders <daniel_l_sanders@apple.com>
Date: Wed, 31 Oct 2018 17:58:47 +0000
Subject: [PATCH 0815/1116] [globalisel][irtranslator] Fix test from r345743 on
 non-asserts builds.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345754 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/AArch64/GlobalISel/irtranslator-dilocation.ll | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test/CodeGen/AArch64/GlobalISel/irtranslator-dilocation.ll b/test/CodeGen/AArch64/GlobalISel/irtranslator-dilocation.ll
index c6e47a3b05f..63c5eecd8b9 100644
--- a/test/CodeGen/AArch64/GlobalISel/irtranslator-dilocation.ll
+++ b/test/CodeGen/AArch64/GlobalISel/irtranslator-dilocation.ll
@@ -1,6 +1,8 @@
 ; RUN: llc -O0 -mtriple=aarch64-apple-ios -global-isel -debug-only=irtranslator \
 ; RUN:     -stop-after=irtranslator %s -o - 2>&1 | FileCheck %s
 
+; REQUIRES: asserts
+
 ; CHECK: Checking DILocation from   %retval = alloca i32, align 4 was copied to G_FRAME_INDEX
 ; CHECK: Checking DILocation from   %rv = alloca i32, align 4 was copied to G_FRAME_INDEX
 ; CHECK: Checking DILocation from   store i32 0, i32* %retval, align 4 was copied to G_CONSTANT
-- 
GitLab


From 94e927ca2efda98bebc0cc00bd2afa1bf58be9ab Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Wed, 31 Oct 2018 18:14:14 +0000
Subject: [PATCH 0816/1116] [SelectionDAG]
 SelectionDAGLegalize::ExpandBITREVERSE - ensure we use ShiftTy

We should be using the getShiftAmountTy value type for shift amounts.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345756 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 65f78773241..9b5aa480705 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2534,22 +2534,22 @@ SDValue SelectionDAGLegalize::ExpandBITREVERSE(SDValue Op, const SDLoc &dl) {
     // swap i4: ((V & 0xF0) >> 4) | ((V & 0x0F) << 4)
     Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskHi4, dl, VT));
     Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskLo4, dl, VT));
-    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp2, DAG.getConstant(4, dl, VT));
-    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(4, dl, VT));
+    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp2, DAG.getConstant(4, dl, SHVT));
+    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(4, dl, SHVT));
     Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
 
     // swap i2: ((V & 0xCC) >> 2) | ((V & 0x33) << 2)
     Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskHi2, dl, VT));
     Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskLo2, dl, VT));
-    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp2, DAG.getConstant(2, dl, VT));
-    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(2, dl, VT));
+    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp2, DAG.getConstant(2, dl, SHVT));
+    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(2, dl, SHVT));
     Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
 
     // swap i1: ((V & 0xAA) >> 1) | ((V & 0x55) << 1)
     Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskHi1, dl, VT));
     Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskLo1, dl, VT));
-    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp2, DAG.getConstant(1, dl, VT));
-    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(1, dl, VT));
+    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp2, DAG.getConstant(1, dl, SHVT));
+    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(1, dl, SHVT));
     Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
     return Tmp;
   }
-- 
GitLab


From 0a69194ac1faa375fd4f1c63840672d02f43ddce Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Wed, 31 Oct 2018 18:17:51 +0000
Subject: [PATCH 0817/1116] [InstCombine] regenerate test checks; NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345757 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Transforms/InstCombine/minmax-fp.ll | 64 ++++++++++++------------
 1 file changed, 32 insertions(+), 32 deletions(-)

diff --git a/test/Transforms/InstCombine/minmax-fp.ll b/test/Transforms/InstCombine/minmax-fp.ll
index b94bce2dbb8..cc166d2be42 100644
--- a/test/Transforms/InstCombine/minmax-fp.ll
+++ b/test/Transforms/InstCombine/minmax-fp.ll
@@ -4,8 +4,8 @@
 ; This is the canonical form for a type-changing min/max.
 define double @t1(float %a) {
 ; CHECK-LABEL: @t1(
-; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp oge float %a, 5.000000e+00
-; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[DOTINV]], float 5.000000e+00, float %a
+; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp oge float [[A:%.*]], 5.000000e+00
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[DOTINV]], float 5.000000e+00, float [[A]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = fpext float [[TMP1]] to double
 ; CHECK-NEXT:    ret double [[TMP2]]
 ;
@@ -18,8 +18,8 @@ define double @t1(float %a) {
 ; Check this is converted into canonical form, as above.
 define double @t2(float %a) {
 ; CHECK-LABEL: @t2(
-; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp oge float %a, 5.000000e+00
-; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[DOTINV]], float 5.000000e+00, float %a
+; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp oge float [[A:%.*]], 5.000000e+00
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[DOTINV]], float 5.000000e+00, float [[A]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = fpext float [[TMP1]] to double
 ; CHECK-NEXT:    ret double [[TMP2]]
 ;
@@ -32,8 +32,8 @@ define double @t2(float %a) {
 ; Same again, with trunc.
 define float @t4(double %a) {
 ; CHECK-LABEL: @t4(
-; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp oge double %a, 5.000000e+00
-; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[DOTINV]], double 5.000000e+00, double %a
+; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp oge double [[A:%.*]], 5.000000e+00
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[DOTINV]], double 5.000000e+00, double [[A]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = fptrunc double [[TMP1]] to float
 ; CHECK-NEXT:    ret float [[TMP2]]
 ;
@@ -46,8 +46,8 @@ define float @t4(double %a) {
 ; different values, should not be converted.
 define double @t5(float %a) {
 ; CHECK-LABEL: @t5(
-; CHECK-NEXT:    [[TMP1:%.*]] = fcmp ult float %a, 5.000000e+00
-; CHECK-NEXT:    [[TMP2:%.*]] = fpext float %a to double
+; CHECK-NEXT:    [[TMP1:%.*]] = fcmp ult float [[A:%.*]], 5.000000e+00
+; CHECK-NEXT:    [[TMP2:%.*]] = fpext float [[A]] to double
 ; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[TMP1]], double [[TMP2]], double 5.001000e+00
 ; CHECK-NEXT:    ret double [[TMP3]]
 ;
@@ -60,8 +60,8 @@ define double @t5(float %a) {
 ; Signed zero, should not be converted
 define double @t6(float %a) {
 ; CHECK-LABEL: @t6(
-; CHECK-NEXT:    [[TMP1:%.*]] = fcmp ult float %a, -0.000000e+00
-; CHECK-NEXT:    [[TMP2:%.*]] = fpext float %a to double
+; CHECK-NEXT:    [[TMP1:%.*]] = fcmp ult float [[A:%.*]], -0.000000e+00
+; CHECK-NEXT:    [[TMP2:%.*]] = fpext float [[A]] to double
 ; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[TMP1]], double [[TMP2]], double 0.000000e+00
 ; CHECK-NEXT:    ret double [[TMP3]]
 ;
@@ -74,8 +74,8 @@ define double @t6(float %a) {
 ; Signed zero, should not be converted
 define double @t7(float %a) {
 ; CHECK-LABEL: @t7(
-; CHECK-NEXT:    [[TMP1:%.*]] = fcmp ult float %a, 0.000000e+00
-; CHECK-NEXT:    [[TMP2:%.*]] = fpext float %a to double
+; CHECK-NEXT:    [[TMP1:%.*]] = fcmp ult float [[A:%.*]], 0.000000e+00
+; CHECK-NEXT:    [[TMP2:%.*]] = fpext float [[A]] to double
 ; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[TMP1]], double [[TMP2]], double -0.000000e+00
 ; CHECK-NEXT:    ret double [[TMP3]]
 ;
@@ -87,8 +87,8 @@ define double @t7(float %a) {
 
 define i64 @t8(float %a) {
 ; CHECK-LABEL: @t8(
-; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp oge float %a, 5.000000e+00
-; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[DOTINV]], float 5.000000e+00, float %a
+; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp oge float [[A:%.*]], 5.000000e+00
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[DOTINV]], float 5.000000e+00, float [[A]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = fptoui float [[TMP1]] to i64
 ; CHECK-NEXT:    ret i64 [[TMP2]]
 ;
@@ -100,8 +100,8 @@ define i64 @t8(float %a) {
 
 define i8 @t9(float %a) {
 ; CHECK-LABEL: @t9(
-; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp oge float %a, 0.000000e+00
-; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[DOTINV]], float 0.000000e+00, float %a
+; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp oge float [[A:%.*]], 0.000000e+00
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[DOTINV]], float 0.000000e+00, float [[A]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = fptosi float [[TMP1]] to i8
 ; CHECK-NEXT:    ret i8 [[TMP2]]
 ;
@@ -114,8 +114,8 @@ define i8 @t9(float %a) {
   ; Either operand could be NaN, but fast modifier applied.
 define i8 @t11(float %a, float %b) {
 ; CHECK-LABEL: @t11(
-; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp fast oge float %b, %a
-; CHECK-NEXT:    [[DOTV:%.*]] = select i1 [[DOTINV]], float %a, float %b
+; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp fast oge float [[B:%.*]], [[A:%.*]]
+; CHECK-NEXT:    [[DOTV:%.*]] = select i1 [[DOTINV]], float [[A]], float [[B]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = fptosi float [[DOTV]] to i8
 ; CHECK-NEXT:    ret i8 [[TMP1]]
 ;
@@ -129,8 +129,8 @@ define i8 @t11(float %a, float %b) {
 ; Either operand could be NaN, but nnan modifier applied.
 define i8 @t12(float %a, float %b) {
 ; CHECK-LABEL: @t12(
-; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp nnan oge float %b, %a
-; CHECK-NEXT:    [[DOTV:%.*]] = select i1 [[DOTINV]], float %a, float %b
+; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp nnan oge float [[B:%.*]], [[A:%.*]]
+; CHECK-NEXT:    [[DOTV:%.*]] = select i1 [[DOTINV]], float [[A]], float [[B]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = fptosi float [[DOTV]] to i8
 ; CHECK-NEXT:    ret i8 [[TMP1]]
 ;
@@ -144,8 +144,8 @@ define i8 @t12(float %a, float %b) {
 ; Float and int values do not match.
 define i8 @t13(float %a) {
 ; CHECK-LABEL: @t13(
-; CHECK-NEXT:    [[TMP1:%.*]] = fcmp ult float %a, 1.500000e+00
-; CHECK-NEXT:    [[TMP2:%.*]] = fptosi float %a to i8
+; CHECK-NEXT:    [[TMP1:%.*]] = fcmp ult float [[A:%.*]], 1.500000e+00
+; CHECK-NEXT:    [[TMP2:%.*]] = fptosi float [[A]] to i8
 ; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[TMP1]], i8 [[TMP2]], i8 1
 ; CHECK-NEXT:    ret i8 [[TMP3]]
 ;
@@ -158,8 +158,8 @@ define i8 @t13(float %a) {
 ; %a could be -0.0, but it doesn't matter because the conversion to int is the same for 0.0 or -0.0.
 define i8 @t14(float %a) {
 ; CHECK-LABEL: @t14(
-; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp oge float %a, 0.000000e+00
-; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[DOTINV]], float 0.000000e+00, float %a
+; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp oge float [[A:%.*]], 0.000000e+00
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[DOTINV]], float 0.000000e+00, float [[A]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = fptosi float [[TMP1]] to i8
 ; CHECK-NEXT:    ret i8 [[TMP2]]
 ;
@@ -171,8 +171,8 @@ define i8 @t14(float %a) {
 
 define i8 @t14_commute(float %a) {
 ; CHECK-LABEL: @t14_commute(
-; CHECK-NEXT:    [[TMP1:%.*]] = fcmp ogt float %a, 0.000000e+00
-; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], float %a, float 0.000000e+00
+; CHECK-NEXT:    [[TMP1:%.*]] = fcmp ogt float [[A:%.*]], 0.000000e+00
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], float [[A]], float 0.000000e+00
 ; CHECK-NEXT:    [[TMP3:%.*]] = fptosi float [[TMP2]] to i8
 ; CHECK-NEXT:    ret i8 [[TMP3]]
 ;
@@ -184,8 +184,8 @@ define i8 @t14_commute(float %a) {
 
 define i8 @t15(float %a) {
 ; CHECK-LABEL: @t15(
-; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp nsz oge float %a, 0.000000e+00
-; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[DOTINV]], float 0.000000e+00, float %a
+; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp nsz oge float [[A:%.*]], 0.000000e+00
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[DOTINV]], float 0.000000e+00, float [[A]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = fptosi float [[TMP1]] to i8
 ; CHECK-NEXT:    ret i8 [[TMP2]]
 ;
@@ -197,8 +197,8 @@ define i8 @t15(float %a) {
 
 define double @t16(i32 %x) {
 ; CHECK-LABEL: @t16(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 %x, 0
-; CHECK-NEXT:    [[CST:%.*]] = sitofp i32 %x to double
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[CST:%.*]] = sitofp i32 [[X]] to double
 ; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], double [[CST]], double 5.000000e-01
 ; CHECK-NEXT:    ret double [[SEL]]
 ;
@@ -210,8 +210,8 @@ define double @t16(i32 %x) {
 
 define double @t17(i32 %x) {
 ; CHECK-LABEL: @t17(
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i32 %x, 2
-; CHECK-NEXT:    [[SEL1:%.*]] = select i1 [[TMP1]], i32 %x, i32 2
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i32 [[X:%.*]], 2
+; CHECK-NEXT:    [[SEL1:%.*]] = select i1 [[TMP1]], i32 [[X]], i32 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = sitofp i32 [[SEL1]] to double
 ; CHECK-NEXT:    ret double [[TMP2]]
 ;
-- 
GitLab


From ed6d50a5c3ba8c73215b66967755bcccc401d7ac Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Wed, 31 Oct 2018 18:19:52 +0000
Subject: [PATCH 0818/1116] Fix comment typo. NFCI.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345758 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 9b5aa480705..a96b8628ac8 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -238,7 +238,7 @@ public:
 } // end anonymous namespace
 
 /// Return a vector shuffle operation which
-/// performs the same shuffe in terms of order or result bytes, but on a type
+/// performs the same shuffle in terms of order or result bytes, but on a type
 /// whose vector element type is narrower than the original shuffle type.
 /// e.g. <v4i32> <0, 1, 0, 1> -> v8i16 <0, 1, 2, 3, 0, 1, 2, 3>
 SDValue SelectionDAGLegalize::ShuffleWithNarrowerEltType(
-- 
GitLab


From b52849d771c75c437393f3fc2b14336e6dffe42c Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Wed, 31 Oct 2018 18:46:15 +0000
Subject: [PATCH 0819/1116] [SelectionDAGISel] Suppress a
 -Wunused-but-set-variable warning in release builds. NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345761 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 90bcaa653c3..dca358032fb 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -1156,6 +1156,7 @@ static void mapWasmLandingPadIndex(MachineBasicBlock *MBB,
       }
     }
     assert(IntrFound && "wasm.landingpad.index intrinsic not found!");
+    (void)IntrFound;
   }
 }
 
-- 
GitLab


From 3c1f69b206d6aa35f30f5458110a3a9259470f51 Mon Sep 17 00:00:00 2001
From: Scott Linder <scott@scottlinder.com>
Date: Wed, 31 Oct 2018 18:54:06 +0000
Subject: [PATCH 0820/1116] [AMDGPU] Remove FeatureVGPRSpilling

This feature is only relevant to shaders, and is no longer used. When disabled,
lowering of reserved registers for shaders causes a compiler crash.

Remove the feature and add a test for compilation of shaders at OptNone.

Differential Revision: https://reviews.llvm.org/D53829


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345763 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/AMDGPU.td                   |  6 --
 lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp        |  8 +--
 lib/Target/AMDGPU/AMDGPUSubtarget.cpp         |  5 --
 lib/Target/AMDGPU/AMDGPUSubtarget.h           |  3 -
 lib/Target/AMDGPU/SIInstrInfo.cpp             | 19 ------
 lib/Target/AMDGPU/SIMachineFunctionInfo.cpp   | 15 ++---
 .../CodeGen/AMDGPU/local-stack-slot-offset.ll |  4 +-
 test/CodeGen/AMDGPU/noop-shader-O0.ll         | 66 +++++++++++++++++++
 .../schedule-vs-if-nested-loop-failure.ll     |  4 +-
 test/CodeGen/AMDGPU/scratch-simple.ll         |  6 +-
 test/CodeGen/AMDGPU/selected-stack-object.ll  | 15 -----
 test/CodeGen/AMDGPU/si-sgpr-spill.ll          |  4 +-
 test/CodeGen/AMDGPU/spill-m0.ll               | 10 +--
 ...vgpr-spill-emergency-stack-slot-compute.ll |  8 +--
 .../AMDGPU/vgpr-spill-emergency-stack-slot.ll |  6 +-
 15 files changed, 95 insertions(+), 84 deletions(-)
 create mode 100644 test/CodeGen/AMDGPU/noop-shader-O0.ll
 delete mode 100644 test/CodeGen/AMDGPU/selected-stack-object.ll

diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index 54b6c8a7882..ec351356f79 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -327,12 +327,6 @@ def FeatureEnableHugePrivateBuffer : SubtargetFeature<
   "Enable private/scratch buffer sizes greater than 128 GB"
 >;
 
-def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling",
-  "EnableVGPRSpilling",
-  "true",
-  "Enable spilling of VGPRs to scratch memory"
->;
-
 def FeatureDumpCode : SubtargetFeature <"DumpCode",
   "DumpCode",
   "true",
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 7448dd71004..d07c0516c27 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -1008,7 +1008,6 @@ static unsigned getRsrcReg(CallingConv::ID CallConv) {
 
 void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
                                          const SIProgramInfo &CurrentProgramInfo) {
-  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
 
@@ -1029,10 +1028,9 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
     OutStreamer->EmitIntValue(RsrcReg, 4);
     OutStreamer->EmitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
                               S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
-    if (STM.isVGPRSpillingEnabled(MF.getFunction())) {
-      OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
-      OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
-    }
+    OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
+    OutStreamer->EmitIntValue(
+        S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
   }
 
   if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index d34834329b5..9a7e6918d41 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -171,7 +171,6 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
     DebuggerEmitPrologue(false),
 
     EnableHugePrivateBuffer(false),
-    EnableVGPRSpilling(false),
     EnableLoadStoreOpt(false),
     EnableUnsafeDSOffsetFolding(false),
     EnableSIScheduler(false),
@@ -480,10 +479,6 @@ void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
     Policy.ShouldTrackLaneMasks = true;
 }
 
-bool GCNSubtarget::isVGPRSpillingEnabled(const Function& F) const {
-  return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
-}
-
 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
   if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
     if (SGPRs <= 80)
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 681ab3a2750..162305ddee2 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -322,7 +322,6 @@ protected:
 
   // Used as options.
   bool EnableHugePrivateBuffer;
-  bool EnableVGPRSpilling;
   bool EnableLoadStoreOpt;
   bool EnableUnsafeDSOffsetFolding;
   bool EnableSIScheduler;
@@ -748,8 +747,6 @@ public:
   void overrideSchedPolicy(MachineSchedPolicy &Policy,
                            unsigned NumRegionInstrs) const override;
 
-  bool isVGPRSpillingEnabled(const Function &F) const;
-
   unsigned getMaxNumUserSGPRs() const {
     return 16;
   }
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index d0d8576ade3..4dd06df1233 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -908,16 +908,6 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
     return;
   }
 
-  if (!ST.isVGPRSpillingEnabled(MF->getFunction())) {
-    LLVMContext &Ctx = MF->getFunction().getContext();
-    Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to"
-                  " spill register");
-    BuildMI(MBB, MI, DL, get(AMDGPU::KILL))
-      .addReg(SrcReg);
-
-    return;
-  }
-
   assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
 
   unsigned Opcode = getVGPRSpillSaveOpcode(SpillSize);
@@ -1010,15 +1000,6 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
     return;
   }
 
-  if (!ST.isVGPRSpillingEnabled(MF->getFunction())) {
-    LLVMContext &Ctx = MF->getFunction().getContext();
-    Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
-                  " restore register");
-    BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg);
-
-    return;
-  }
-
   assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
 
   unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize);
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index ee1ff85523a..181cc41bd5f 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -117,7 +117,6 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
   }
 
   const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
-  bool MaySpill = ST.isVGPRSpillingEnabled(F);
   bool HasStackObjects = FrameInfo.hasStackObjects();
 
   if (isEntryFunction()) {
@@ -126,21 +125,18 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
     if (WorkItemIDZ)
       WorkItemIDY = true;
 
-    if (HasStackObjects || MaySpill) {
-      PrivateSegmentWaveByteOffset = true;
+    PrivateSegmentWaveByteOffset = true;
 
     // HS and GS always have the scratch wave offset in SGPR5 on GFX9.
     if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 &&
         (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS))
-      ArgInfo.PrivateSegmentWaveByteOffset
-        = ArgDescriptor::createRegister(AMDGPU::SGPR5);
-    }
+      ArgInfo.PrivateSegmentWaveByteOffset =
+          ArgDescriptor::createRegister(AMDGPU::SGPR5);
   }
 
   bool isAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
   if (isAmdHsaOrMesa) {
-    if (HasStackObjects || MaySpill)
-      PrivateSegmentBuffer = true;
+    PrivateSegmentBuffer = true;
 
     if (F.hasFnAttribute("amdgpu-dispatch-ptr"))
       DispatchPtr = true;
@@ -151,8 +147,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
     if (F.hasFnAttribute("amdgpu-dispatch-id"))
       DispatchID = true;
   } else if (ST.isMesaGfxShader(F)) {
-    if (HasStackObjects || MaySpill)
-      ImplicitBufferPtr = true;
+    ImplicitBufferPtr = true;
   }
 
   if (F.hasFnAttribute("amdgpu-kernarg-segment-ptr"))
diff --git a/test/CodeGen/AMDGPU/local-stack-slot-offset.ll b/test/CodeGen/AMDGPU/local-stack-slot-offset.ll
index 928eecaae02..790715cda72 100644
--- a/test/CodeGen/AMDGPU/local-stack-slot-offset.ll
+++ b/test/CodeGen/AMDGPU/local-stack-slot-offset.ll
@@ -1,5 +1,5 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -mattr=+vgpr-spilling -mattr=-promote-alloca -verify-machineinstrs | FileCheck %s -check-prefix=CHECK
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=+vgpr-spilling -mattr=-promote-alloca -verify-machineinstrs | FileCheck %s -check-prefix=CHECK
+;RUN: llc < %s -march=amdgcn -mcpu=verde -mattr=-promote-alloca -verify-machineinstrs | FileCheck %s -check-prefix=CHECK
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca -verify-machineinstrs | FileCheck %s -check-prefix=CHECK
 
 ; Allocate two stack slots of 2052 bytes each requiring a total of 4104 bytes.
 ; Extracting the last element of each does not fit into the offset field of
diff --git a/test/CodeGen/AMDGPU/noop-shader-O0.ll b/test/CodeGen/AMDGPU/noop-shader-O0.ll
new file mode 100644
index 00000000000..af47170a4a9
--- /dev/null
+++ b/test/CodeGen/AMDGPU/noop-shader-O0.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Ensure NOOP shaders compile at OptNone.
+
+; Confirm registers reserved in SIMachineFunctionInfo are those expected during
+; lowering, even when e.g. spilling is required due to being at OptNone.
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+target triple = "amdgcn-amd-amdpal"
+
+define amdgpu_vs void @noop_vs() {
+; GCN-LABEL: noop_vs:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_endpgm
+entry:
+  ret void
+}
+
+define amdgpu_ls void @noop_ls() {
+; GCN-LABEL: noop_ls:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_endpgm
+entry:
+  ret void
+}
+
+define amdgpu_hs void @noop_hs() {
+; GCN-LABEL: noop_hs:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_endpgm
+entry:
+  ret void
+}
+
+define amdgpu_es void @noop_es() {
+; GCN-LABEL: noop_es:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_endpgm
+entry:
+  ret void
+}
+
+define amdgpu_gs void @noop_gs() {
+; GCN-LABEL: noop_gs:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_endpgm
+entry:
+  ret void
+}
+
+define amdgpu_ps void @noop_ps() {
+; GCN-LABEL: noop_ps:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_endpgm
+entry:
+  ret void
+}
+
+define amdgpu_cs void @noop_cs() {
+; GCN-LABEL: noop_cs:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_endpgm
+entry:
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll b/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll
index ff3d0fc9bfa..96ebb6f8362 100644
--- a/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll
+++ b/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll
@@ -1,5 +1,5 @@
-; RUN: llc -O0 -march=amdgcn -verify-machineinstrs -mattr=+vgpr-spilling < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=+vgpr-spilling < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -O0 -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 declare void @llvm.amdgcn.s.barrier() nounwind convergent
 
diff --git a/test/CodeGen/AMDGPU/scratch-simple.ll b/test/CodeGen/AMDGPU/scratch-simple.ll
index e4405900840..b2781a77811 100644
--- a/test/CodeGen/AMDGPU/scratch-simple.ll
+++ b/test/CodeGen/AMDGPU/scratch-simple.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=verde -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=SI %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx803 -mattr=-flat-for-global -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=SI %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX9 %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=verde -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=SI %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx803 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=SI %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX9 %s
 
 ; This used to fail due to a v_add_i32 instruction with an illegal immediate
 ; operand that was created during Local Stack Slot Allocation. Test case derived
diff --git a/test/CodeGen/AMDGPU/selected-stack-object.ll b/test/CodeGen/AMDGPU/selected-stack-object.ll
deleted file mode 100644
index 50ca59ace94..00000000000
--- a/test/CodeGen/AMDGPU/selected-stack-object.ll
+++ /dev/null
@@ -1,15 +0,0 @@
-; "Assertion failure" should be caught with both XFAIL * and +Asserts.
-; XFAIL: *
-; REQUIRES: asserts
-
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
-
-; See also local-stack-slot-bug.ll
-; This fails because a stack object is created during instruction selection.
-
-; CHECK-LABEL: {{^}}main:
-define amdgpu_ps float @main(i32 %idx) {
-main_body:
-  %v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
-  ret float %v1
-}
diff --git a/test/CodeGen/AMDGPU/si-sgpr-spill.ll b/test/CodeGen/AMDGPU/si-sgpr-spill.ll
index 683c6695322..c4964e68e28 100644
--- a/test/CodeGen/AMDGPU/si-sgpr-spill.ll
+++ b/test/CodeGen/AMDGPU/si-sgpr-spill.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=TOVGPR %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling,-mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=TOVGPR %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 ; These tests check that the compiler won't crash when it needs to spill
 ; SGPRs.
diff --git a/test/CodeGen/AMDGPU/spill-m0.ll b/test/CodeGen/AMDGPU/spill-m0.ll
index 5984d5a30f3..ab54f9096cf 100644
--- a/test/CodeGen/AMDGPU/spill-m0.ll
+++ b/test/CodeGen/AMDGPU/spill-m0.ll
@@ -1,8 +1,8 @@
-; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=1 -march=amdgcn -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOVGPR -check-prefix=GCN %s
-; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=1 -amdgpu-spill-sgpr-to-smem=0 -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling  -verify-machineinstrs < %s | FileCheck -check-prefix=TOVGPR -check-prefix=GCN %s
-; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -march=amdgcn -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOVMEM -check-prefix=GCN %s
-; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -amdgpu-spill-sgpr-to-smem=0 -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOVMEM -check-prefix=GCN %s
-; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -amdgpu-spill-sgpr-to-smem=1 -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOSMEM -check-prefix=GCN %s
+; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=1 -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=TOVGPR -check-prefix=GCN %s
+; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=1 -amdgpu-spill-sgpr-to-smem=0 -march=amdgcn -mcpu=tonga  -verify-machineinstrs < %s | FileCheck -check-prefix=TOVGPR -check-prefix=GCN %s
+; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=TOVMEM -check-prefix=GCN %s
+; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -amdgpu-spill-sgpr-to-smem=0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=TOVMEM -check-prefix=GCN %s
+; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -amdgpu-spill-sgpr-to-smem=1 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=TOSMEM -check-prefix=GCN %s
 
 ; XXX - Why does it like to use vcc?
 
diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
index 9cdc333cbc0..32607c75e67 100644
--- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
+++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tahiti -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=SIMESA %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=fiji -mattr=+vgpr-spilling,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=VIMESA %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=+vgpr-spilling,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=GFX9MESA %s
-; RUN: llc -march=amdgcn  -mcpu=hawaii -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CIHSA -check-prefix=HSA %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=SIMESA %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=VIMESA %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=GFX9MESA %s
+; RUN: llc -march=amdgcn  -mcpu=hawaii -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CIHSA -check-prefix=HSA %s
 ; RUN: llc -march=amdgcn  -mcpu=fiji -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VIHSA -check-prefix=HSA %s
 
 ; This ends up using all 256 registers and requires register
diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
index c743d6a48ae..e803bd40684 100644
--- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
+++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tahiti -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=fiji -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
 
 ; This ends up using all 255 registers and requires register
 ; scavenging which will fail to find an unsued register.
-- 
GitLab


From 6bf883d664c6b492a8b8bdce1fe634cf79b5732c Mon Sep 17 00:00:00 2001
From: Daniel Sanders <daniel_l_sanders@apple.com>
Date: Wed, 31 Oct 2018 19:49:37 +0000
Subject: [PATCH 0821/1116] [globalisel] Add comments indicating the operand
 order

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345769 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Target/GenericOpcodes.td | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/include/llvm/Target/GenericOpcodes.td b/include/llvm/Target/GenericOpcodes.td
index 399cea95107..af4fa8a1f04 100644
--- a/include/llvm/Target/GenericOpcodes.td
+++ b/include/llvm/Target/GenericOpcodes.td
@@ -649,6 +649,9 @@ def G_EXTRACT : GenericInstruction {
 // Extract multiple registers specified size, starting from blocks given by
 // indexes. This will almost certainly be mapped to sub-register COPYs after
 // register banks have been selected.
+// The output operands are always ordered from lowest bits to highest:
+//   %bits_0_7:(s8), %bits_8_15:(s8),
+//       %bits_16_23:(s8), %bits_24_31:(s8) = G_UNMERGE_VALUES %0:(s32)
 def G_UNMERGE_VALUES : GenericInstruction {
   let OutOperandList = (outs type0:$dst0, variable_ops);
   let InOperandList = (ins type1:$src);
@@ -662,7 +665,10 @@ def G_INSERT : GenericInstruction {
   let hasSideEffects = 0;
 }
 
-/// Concatenate multiple registers of the same size into a wider register.
+// Concatenate multiple registers of the same size into a wider register.
+// The input operands are always ordered from lowest bits to highest:
+//   %0:(s32) = G_MERGE_VALUES %bits_0_7:(s8), %bits_8_15:(s8),
+//                             %bits_16_23:(s8), %bits_24_31:(s8)
 def G_MERGE_VALUES : GenericInstruction {
   let OutOperandList = (outs type0:$dst);
   let InOperandList = (ins type1:$src0, variable_ops);
-- 
GitLab


From 30005234d7dd9172d4ee8d1fc63f555b2a810ae0 Mon Sep 17 00:00:00 2001
From: Scott Linder <scott@scottlinder.com>
Date: Wed, 31 Oct 2018 19:57:36 +0000
Subject: [PATCH 0822/1116] [SelectionDAG] Handle constant range [0,1) in
 lowerRangeToAssertZExt

lowerRangeToAssertZExt currently relies on something like EarlyCSE having
eliminated the constant range [0,1). At -O0 this leads to an assert.

Differential Revision: https://reviews.llvm.org/D53888


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345770 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../SelectionDAG/SelectionDAGBuilder.cpp      |  3 ++-
 test/CodeGen/AMDGPU/zext-lid.ll               | 26 ++++++++++++++-----
 2 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 5ecb2abbcbf..dac99eddec3 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7993,7 +7993,8 @@ SDValue SelectionDAGBuilder::lowerRangeToAssertZExt(SelectionDAG &DAG,
     return Op;
 
   APInt Hi = CR.getUnsignedMax();
-  unsigned Bits = Hi.getActiveBits();
+  unsigned Bits = std::max(Hi.getActiveBits(),
+                           static_cast<unsigned>(IntegerType::MIN_INT_BITS));
 
   EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
 
diff --git a/test/CodeGen/AMDGPU/zext-lid.ll b/test/CodeGen/AMDGPU/zext-lid.ll
index 9a9c1fe7550..e257980dc0e 100644
--- a/test/CodeGen/AMDGPU/zext-lid.ll
+++ b/test/CodeGen/AMDGPU/zext-lid.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -march=amdgcn < %s | FileCheck %s
+; RUN: llc -O0 -march=amdgcn < %s | FileCheck %s
 ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-intrinsics < %s | FileCheck -check-prefix=OPT %s
 
 ; CHECK-NOT: and_b32
@@ -43,10 +44,21 @@ bb:
   ret void
 }
 
+; When EarlyCSE is not run this call produces a range max with 0 active bits,
+; which is a special case as an AssertZext from width 0 is invalid.
+; OPT-LABEL: @zext_grp_size_1x1x1
+; OPT: tail call i32 @llvm.amdgcn.workitem.id.x(), !range !4
+define amdgpu_kernel void @zext_grp_size_1x1x1(i32 addrspace(1)* nocapture %arg) #0 !reqd_work_group_size !1 {
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp1 = and i32 %tmp, 1
+  store i32 %tmp1, i32 addrspace(1)* %arg, align 4
+  ret void
+}
+
 ; OPT-LABEL: @zext_grp_size_512
-; OPT: tail call i32 @llvm.amdgcn.workitem.id.x(), !range !5
-; OPT: tail call i32 @llvm.amdgcn.workitem.id.y(), !range !5
-; OPT: tail call i32 @llvm.amdgcn.workitem.id.z(), !range !5
+; OPT: tail call i32 @llvm.amdgcn.workitem.id.x(), !range !6
+; OPT: tail call i32 @llvm.amdgcn.workitem.id.y(), !range !6
+; OPT: tail call i32 @llvm.amdgcn.workitem.id.z(), !range !6
 define amdgpu_kernel void @zext_grp_size_512(i32 addrspace(1)* nocapture %arg) #1 {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -74,7 +86,7 @@ entry:
 }
 
 ; OPT-LABEL: @func_test_workitem_id_x_default_range(
-; OPT: tail call i32 @llvm.amdgcn.workitem.id.x(), !range !6
+; OPT: tail call i32 @llvm.amdgcn.workitem.id.x(), !range !7
 define void @func_test_workitem_id_x_default_range(i32 addrspace(1)* nocapture %out) #4 {
 entry:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -96,11 +108,13 @@ attributes #3 = { nounwind readnone }
 attributes #4 = { nounwind }
 
 !0 = !{i32 32, i32 4, i32 1}
+!1 = !{i32 1, i32 1, i32 1}
 
 ; OPT: !0 = !{i32 0, i32 128}
 ; OPT: !1 = !{i32 32, i32 4, i32 1}
 ; OPT: !2 = !{i32 0, i32 32}
 ; OPT: !3 = !{i32 0, i32 4}
 ; OPT: !4 = !{i32 0, i32 1}
-; OPT: !5 = !{i32 0, i32 512}
-; OPT: !6 = !{i32 0, i32 1024}
+; OPT: !5 = !{i32 1, i32 1, i32 1}
+; OPT: !6 = !{i32 0, i32 512}
+; OPT: !7 = !{i32 0, i32 1024}
-- 
GitLab


From cd627aacb4c4b981bb7a74d42c3ef20d36be201b Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Wed, 31 Oct 2018 20:03:27 +0000
Subject: [PATCH 0823/1116] [InstCombine] add tests for fmin/fmax pattern
 matching failure; NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345771 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Transforms/InstCombine/minmax-fp.ll | 46 ++++++++++++++++++++++--
 1 file changed, 44 insertions(+), 2 deletions(-)

diff --git a/test/Transforms/InstCombine/minmax-fp.ll b/test/Transforms/InstCombine/minmax-fp.ll
index cc166d2be42..292e50eb1f9 100644
--- a/test/Transforms/InstCombine/minmax-fp.ll
+++ b/test/Transforms/InstCombine/minmax-fp.ll
@@ -57,7 +57,10 @@ define double @t5(float %a) {
   ret double %3
 }
 
-; Signed zero, should not be converted
+; TODO:
+; From IEEE754: "Comparisons shall ignore the sign of zero (so +0 = −0)."
+; So the compare constant may be treated as +0.0, and we sink the fpext.
+
 define double @t6(float %a) {
 ; CHECK-LABEL: @t6(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fcmp ult float [[A:%.*]], -0.000000e+00
@@ -71,7 +74,10 @@ define double @t6(float %a) {
   ret double %3
 }
 
-; Signed zero, should not be converted
+; TODO:
+; From IEEE754: "Comparisons shall ignore the sign of zero (so +0 = −0)."
+; So the compare constant may be treated as -0.0, and we sink the fpext.
+
 define double @t7(float %a) {
 ; CHECK-LABEL: @t7(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fcmp ult float [[A:%.*]], 0.000000e+00
@@ -85,6 +91,42 @@ define double @t7(float %a) {
   ret double %3
 }
 
+; TODO:
+; min(min(x, 0.0), 0.0) --> min(x, 0.0)
+
+define float @fmin_fmin_zero_mismatch(float %x) {
+; CHECK-LABEL: @fmin_fmin_zero_mismatch(
+; CHECK-NEXT:    [[CMP1:%.*]] = fcmp olt float [[X:%.*]], -0.000000e+00
+; CHECK-NEXT:    [[MIN1:%.*]] = select i1 [[CMP1]], float [[X]], float 0.000000e+00
+; CHECK-NEXT:    [[CMP2:%.*]] = fcmp olt float [[MIN1]], 0.000000e+00
+; CHECK-NEXT:    [[MIN2:%.*]] = select i1 [[CMP2]], float [[MIN1]], float 0.000000e+00
+; CHECK-NEXT:    ret float [[MIN2]]
+;
+  %cmp1 = fcmp olt float %x, -0.0
+  %min1 = select i1 %cmp1, float %x, float 0.0
+  %cmp2 = fcmp olt float %min1, 0.0
+  %min2 = select i1 %cmp2, float %min1, float 0.0
+  ret float %min2
+}
+
+; TODO:
+; max(max(x, -0.0), -0.0) --> max(x, -0.0)
+
+define float @fmax_fmax_zero_mismatch(float %x) {
+; CHECK-LABEL: @fmax_fmax_zero_mismatch(
+; CHECK-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[X:%.*]], 0.000000e+00
+; CHECK-NEXT:    [[MAX1:%.*]] = select i1 [[CMP1]], float [[X]], float -0.000000e+00
+; CHECK-NEXT:    [[CMP2:%.*]] = fcmp olt float [[MAX1]], 0.000000e+00
+; CHECK-NEXT:    [[MAX2:%.*]] = select i1 [[CMP2]], float -0.000000e+00, float [[MAX1]]
+; CHECK-NEXT:    ret float [[MAX2]]
+;
+  %cmp1 = fcmp ogt float %x, 0.0
+  %max1 = select i1 %cmp1, float %x, float -0.0
+  %cmp2 = fcmp ogt float 0.0, %max1
+  %max2 = select i1 %cmp2, float -0.0, float %max1
+  ret float %max2
+}
+
 define i64 @t8(float %a) {
 ; CHECK-LABEL: @t8(
 ; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp oge float [[A:%.*]], 5.000000e+00
-- 
GitLab


From 792c305d691df9aeb9268bd048121f5a13da6c99 Mon Sep 17 00:00:00 2001
From: Daniel Sanders <daniel_l_sanders@apple.com>
Date: Wed, 31 Oct 2018 20:05:32 +0000
Subject: [PATCH 0824/1116] [adt] SparseBitVector::test() should be const

Summary:
Re-worked SparseBitVector's most-recently-used-word caching (CurrElementIter)
such that SparseBitVector::test() can be made const. This came up when
attempting to test individual bits in a SparseBitVector which was a member of a
const object.

The cached iterator has no bearing on the externally visible state, it's merely
a performance optimization. Therefore it has been made mutable and
FindLowerBound() has been split into a const and non-const function
(FindLowerBound/FindLowerBoundConst) for the const/non-const
interfaces.

Reviewers: rtereshin

Reviewed By: rtereshin

Subscribers: rtereshin, dexonsmith, kristina, llvm-commits

Differential Revision: https://reviews.llvm.org/D53447

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345772 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/ADT/SparseBitVector.h    | 38 ++++++++++++++++++++-------
 unittests/ADT/SparseBitVectorTest.cpp |  5 ++++
 2 files changed, 33 insertions(+), 10 deletions(-)

diff --git a/include/llvm/ADT/SparseBitVector.h b/include/llvm/ADT/SparseBitVector.h
index 4cbf40c7680..09a91b6614e 100644
--- a/include/llvm/ADT/SparseBitVector.h
+++ b/include/llvm/ADT/SparseBitVector.h
@@ -261,21 +261,33 @@ class SparseBitVector {
     BITWORD_SIZE = SparseBitVectorElement<ElementSize>::BITWORD_SIZE
   };
 
-  // Pointer to our current Element.
-  ElementListIter CurrElementIter;
+  // Pointer to our current Element. This has no visible effect on the external
+  // state of a SparseBitVector, it's just used to improve performance in the
+  // common case of testing/modifying bits with similar indices.
+  mutable ElementListIter CurrElementIter;
   ElementList Elements;
 
   // This is like std::lower_bound, except we do linear searching from the
   // current position.
-  ElementListIter FindLowerBound(unsigned ElementIndex) {
+  ElementListIter FindLowerBoundImpl(unsigned ElementIndex) const {
+
+    // We cache a non-const iterator so we're forced to resort to const_cast to
+    // get the begin/end in the case where 'this' is const. To avoid duplication
+    // of code with the only difference being whether the const cast is present
+    // 'this' is always const in this particular function and we sort out the
+    // difference in FindLowerBound and FindLowerBoundConst.
+    ElementListIter Begin =
+        const_cast<SparseBitVector<ElementSize> *>(this)->Elements.begin();
+    ElementListIter End =
+        const_cast<SparseBitVector<ElementSize> *>(this)->Elements.end();
 
     if (Elements.empty()) {
-      CurrElementIter = Elements.begin();
-      return Elements.begin();
+      CurrElementIter = Begin;
+      return CurrElementIter;
     }
 
     // Make sure our current iterator is valid.
-    if (CurrElementIter == Elements.end())
+    if (CurrElementIter == End)
       --CurrElementIter;
 
     // Search from our current iterator, either backwards or forwards,
@@ -284,17 +296,23 @@ class SparseBitVector {
     if (CurrElementIter->index() == ElementIndex) {
       return ElementIter;
     } else if (CurrElementIter->index() > ElementIndex) {
-      while (ElementIter != Elements.begin()
+      while (ElementIter != Begin
              && ElementIter->index() > ElementIndex)
         --ElementIter;
     } else {
-      while (ElementIter != Elements.end() &&
+      while (ElementIter != End &&
              ElementIter->index() < ElementIndex)
         ++ElementIter;
     }
     CurrElementIter = ElementIter;
     return ElementIter;
   }
+  ElementListConstIter FindLowerBoundConst(unsigned ElementIndex) const {
+    return FindLowerBoundImpl(ElementIndex);
+  }
+  ElementListIter FindLowerBound(unsigned ElementIndex) {
+    return FindLowerBoundImpl(ElementIndex);
+  }
 
   // Iterator to walk set bits in the bitmap.  This iterator is a lot uglier
   // than it would be, in order to be efficient.
@@ -464,12 +482,12 @@ public:
   }
 
   // Test, Reset, and Set a bit in the bitmap.
-  bool test(unsigned Idx) {
+  bool test(unsigned Idx) const {
     if (Elements.empty())
       return false;
 
     unsigned ElementIndex = Idx / ElementSize;
-    ElementListIter ElementIter = FindLowerBound(ElementIndex);
+    ElementListConstIter ElementIter = FindLowerBoundConst(ElementIndex);
 
     // If we can't find an element that is supposed to contain this bit, there
     // is nothing more to do.
diff --git a/unittests/ADT/SparseBitVectorTest.cpp b/unittests/ADT/SparseBitVectorTest.cpp
index 9d6f4f1665d..097f4a0b737 100644
--- a/unittests/ADT/SparseBitVectorTest.cpp
+++ b/unittests/ADT/SparseBitVectorTest.cpp
@@ -31,6 +31,11 @@ TEST(SparseBitVectorTest, TrivialOperation) {
   EXPECT_TRUE(Vec.test(17));
   Vec.clear();
   EXPECT_FALSE(Vec.test(17));
+
+  Vec.set(5);
+  const SparseBitVector<> ConstVec = Vec;
+  EXPECT_TRUE(ConstVec.test(5));
+  EXPECT_FALSE(ConstVec.test(17));
 }
 
 TEST(SparseBitVectorTest, IntersectWith) {
-- 
GitLab


From 63293707a9b9d81cc75bdfc8b422fb9bf89f58b1 Mon Sep 17 00:00:00 2001
From: Richard Smith <richard-llvm@metafoo.co.uk>
Date: Wed, 31 Oct 2018 20:38:41 +0000
Subject: [PATCH 0825/1116] Remove unused internal template parameter.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345773 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/ADT/iterator.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/include/llvm/ADT/iterator.h b/include/llvm/ADT/iterator.h
index cb40fc1781d..7f7ed69a005 100644
--- a/include/llvm/ADT/iterator.h
+++ b/include/llvm/ADT/iterator.h
@@ -202,9 +202,7 @@ template <
     typename ReferenceT = typename std::conditional<
         std::is_same<T, typename std::iterator_traits<
                             WrappedIteratorT>::value_type>::value,
-        typename std::iterator_traits<WrappedIteratorT>::reference, T &>::type,
-    // Don't provide these, they are mostly to act as aliases below.
-    typename WrappedTraitsT = std::iterator_traits<WrappedIteratorT>>
+        typename std::iterator_traits<WrappedIteratorT>::reference, T &>::type>
 class iterator_adaptor_base
     : public iterator_facade_base<DerivedT, IteratorCategoryT, T,
                                   DifferenceTypeT, PointerT, ReferenceT> {
-- 
GitLab


From 29d4d646370eda076199df4ba82a855670143373 Mon Sep 17 00:00:00 2001
From: Wolfgang Pieb <Wolfgang.Pieb@sony.com>
Date: Wed, 31 Oct 2018 21:05:51 +0000
Subject: [PATCH 0826/1116] [DWARF][NFC] Refactor a function to return
 Optional<> instead of bool

Minor refactor of DWARFUnit::getStringOffsetSectionItem().

Differential Revision: https://reviews.llvm.org/D53948


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345776 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/DebugInfo/DWARF/DWARFUnit.h |  2 +-
 lib/DebugInfo/DWARF/DWARFFormValue.cpp   |  8 +++++---
 lib/DebugInfo/DWARF/DWARFUnit.cpp        | 10 ++++------
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/include/llvm/DebugInfo/DWARF/DWARFUnit.h b/include/llvm/DebugInfo/DWARF/DWARFUnit.h
index c3252157b0b..458278e4282 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFUnit.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFUnit.h
@@ -304,7 +304,7 @@ public:
   }
 
   Optional<SectionedAddress> getAddrOffsetSectionItem(uint32_t Index) const;
-  bool getStringOffsetSectionItem(uint32_t Index, uint64_t &Result) const;
+  Optional<uint64_t> getStringOffsetSectionItem(uint32_t Index) const;
 
   DWARFDataExtractor getDebugInfoExtractor() const;
 
diff --git a/lib/DebugInfo/DWARF/DWARFFormValue.cpp b/lib/DebugInfo/DWARF/DWARFFormValue.cpp
index ed510a0e4cd..9226dcad39a 100644
--- a/lib/DebugInfo/DWARF/DWARFFormValue.cpp
+++ b/lib/DebugInfo/DWARF/DWARFFormValue.cpp
@@ -542,10 +542,12 @@ Optional<const char *> DWARFFormValue::getAsCString() const {
   if (Form == DW_FORM_GNU_str_index || Form == DW_FORM_strx ||
       Form == DW_FORM_strx1 || Form == DW_FORM_strx2 || Form == DW_FORM_strx3 ||
       Form == DW_FORM_strx4) {
-    uint64_t StrOffset;
-    if (!U || !U->getStringOffsetSectionItem(Offset, StrOffset))
+    if (!U)
+      return None;
+    Optional<uint64_t> StrOffset = U->getStringOffsetSectionItem(Offset);
+    if (!StrOffset)
       return None;
-    Offset = StrOffset;
+    Offset = *StrOffset;
   }
   // Prefer the Unit's string extractor, because for .dwo it will point to
   // .debug_str.dwo, while the Context's extractor always uses .debug_str.
diff --git a/lib/DebugInfo/DWARF/DWARFUnit.cpp b/lib/DebugInfo/DWARF/DWARFUnit.cpp
index d475c44c393..1caaa249bef 100644
--- a/lib/DebugInfo/DWARF/DWARFUnit.cpp
+++ b/lib/DebugInfo/DWARF/DWARFUnit.cpp
@@ -217,18 +217,16 @@ DWARFUnit::getAddrOffsetSectionItem(uint32_t Index) const {
   return {{Address, Section}};
 }
 
-bool DWARFUnit::getStringOffsetSectionItem(uint32_t Index,
-                                           uint64_t &Result) const {
+Optional<uint64_t> DWARFUnit::getStringOffsetSectionItem(uint32_t Index) const {
   if (!StringOffsetsTableContribution)
-    return false;
+    return None;
   unsigned ItemSize = getDwarfStringOffsetsByteSize();
   uint32_t Offset = getStringOffsetsBase() + Index * ItemSize;
   if (StringOffsetSection.Data.size() < Offset + ItemSize)
-    return false;
+    return None;
   DWARFDataExtractor DA(Context.getDWARFObj(), StringOffsetSection,
                         isLittleEndian, 0);
-  Result = DA.getRelocatedValue(ItemSize, &Offset);
-  return true;
+  return DA.getRelocatedValue(ItemSize, &Offset);
 }
 
 bool DWARFUnitHeader::extract(DWARFContext &Context,
-- 
GitLab


From b98edf7f1c3faa7ee5934d9e1a7829d53dadbd51 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Wed, 31 Oct 2018 21:11:59 +0000
Subject: [PATCH 0827/1116] [ValueTracking] add tests for fmin/fmax; NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345777 91177308-0d34-0410-b5e6-96231b3b80d8
---
 unittests/Analysis/ValueTrackingTest.cpp | 176 +++++++++++++++++++++++
 1 file changed, 176 insertions(+)

diff --git a/unittests/Analysis/ValueTrackingTest.cpp b/unittests/Analysis/ValueTrackingTest.cpp
index d6365176d08..ccae9d19ebb 100644
--- a/unittests/Analysis/ValueTrackingTest.cpp
+++ b/unittests/Analysis/ValueTrackingTest.cpp
@@ -149,6 +149,182 @@ TEST_F(MatchSelectPatternTest, FMinConstantZeroNsz) {
   expectPattern({SPF_FMINNUM, SPNB_RETURNS_OTHER, true});
 }
 
+TEST_F(MatchSelectPatternTest, FMinMismatchConstantZero1) {
+  parseAssembly(
+      "define float @test(float %a) {\n"
+      "  %1 = fcmp olt float -0.0, %a\n"
+      "  %A = select i1 %1, float 0.0, float %a\n"
+      "  ret float %A\n"
+      "}\n");
+  // FIXME: The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_UNKNOWN, SPNB_NA, false});
+}
+
+TEST_F(MatchSelectPatternTest, FMinMismatchConstantZero2) {
+  parseAssembly(
+      "define float @test(float %a) {\n"
+      "  %1 = fcmp ogt float %a, -0.0\n"
+      "  %A = select i1 %1, float 0.0, float %a\n"
+      "  ret float %A\n"
+      "}\n");
+  // FIXME: The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_UNKNOWN, SPNB_NA, false});
+}
+
+TEST_F(MatchSelectPatternTest, FMinMismatchConstantZero3) {
+  parseAssembly(
+      "define float @test(float %a) {\n"
+      "  %1 = fcmp olt float 0.0, %a\n"
+      "  %A = select i1 %1, float -0.0, float %a\n"
+      "  ret float %A\n"
+      "}\n");
+  // FIXME: The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_UNKNOWN, SPNB_NA, false});
+}
+
+TEST_F(MatchSelectPatternTest, FMinMismatchConstantZero4) {
+  parseAssembly(
+      "define float @test(float %a) {\n"
+      "  %1 = fcmp ogt float %a, 0.0\n"
+      "  %A = select i1 %1, float -0.0, float %a\n"
+      "  ret float %A\n"
+      "}\n");
+  // FIXME: The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_UNKNOWN, SPNB_NA, false});
+}
+
+TEST_F(MatchSelectPatternTest, FMinMismatchConstantZero5) {
+  parseAssembly(
+      "define float @test(float %a) {\n"
+      "  %1 = fcmp ogt float -0.0, %a\n"
+      "  %A = select i1 %1, float %a, float 0.0\n"
+      "  ret float %A\n"
+      "}\n");
+  // FIXME: The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_UNKNOWN, SPNB_NA, false});
+}
+
+TEST_F(MatchSelectPatternTest, FMinMismatchConstantZero6) {
+  parseAssembly(
+      "define float @test(float %a) {\n"
+      "  %1 = fcmp olt float %a, -0.0\n"
+      "  %A = select i1 %1, float %a, float 0.0\n"
+      "  ret float %A\n"
+      "}\n");
+  // FIXME: The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_UNKNOWN, SPNB_NA, false});
+}
+
+TEST_F(MatchSelectPatternTest, FMinMismatchConstantZero7) {
+  parseAssembly(
+      "define float @test(float %a) {\n"
+      "  %1 = fcmp ogt float 0.0, %a\n"
+      "  %A = select i1 %1, float %a, float -0.0\n"
+      "  ret float %A\n"
+      "}\n");
+  // FIXME: The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_UNKNOWN, SPNB_NA, false});
+}
+
+TEST_F(MatchSelectPatternTest, FMinMismatchConstantZero8) {
+  parseAssembly(
+      "define float @test(float %a) {\n"
+      "  %1 = fcmp olt float %a, 0.0\n"
+      "  %A = select i1 %1, float %a, float -0.0\n"
+      "  ret float %A\n"
+      "}\n");
+  // FIXME: The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_UNKNOWN, SPNB_NA, false});
+}
+
+TEST_F(MatchSelectPatternTest, FMaxMismatchConstantZero1) {
+  parseAssembly(
+      "define float @test(float %a) {\n"
+      "  %1 = fcmp ogt float -0.0, %a\n"
+      "  %A = select i1 %1, float 0.0, float %a\n"
+      "  ret float %A\n"
+      "}\n");
+  // FIXME: The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_UNKNOWN, SPNB_NA, false});
+}
+
+TEST_F(MatchSelectPatternTest, FMaxMismatchConstantZero2) {
+  parseAssembly(
+      "define float @test(float %a) {\n"
+      "  %1 = fcmp olt float %a, -0.0\n"
+      "  %A = select i1 %1, float 0.0, float %a\n"
+      "  ret float %A\n"
+      "}\n");
+  // FIXME: The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_UNKNOWN, SPNB_NA, false});
+}
+
+TEST_F(MatchSelectPatternTest, FMaxMismatchConstantZero3) {
+  parseAssembly(
+      "define float @test(float %a) {\n"
+      "  %1 = fcmp ogt float 0.0, %a\n"
+      "  %A = select i1 %1, float -0.0, float %a\n"
+      "  ret float %A\n"
+      "}\n");
+  // FIXME: The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_UNKNOWN, SPNB_NA, false});
+}
+
+TEST_F(MatchSelectPatternTest, FMaxMismatchConstantZero4) {
+  parseAssembly(
+      "define float @test(float %a) {\n"
+      "  %1 = fcmp olt float %a, 0.0\n"
+      "  %A = select i1 %1, float -0.0, float %a\n"
+      "  ret float %A\n"
+      "}\n");
+  // FIXME: The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_UNKNOWN, SPNB_NA, false});
+}
+
+TEST_F(MatchSelectPatternTest, FMaxMismatchConstantZero5) {
+  parseAssembly(
+      "define float @test(float %a) {\n"
+      "  %1 = fcmp olt float -0.0, %a\n"
+      "  %A = select i1 %1, float %a, float 0.0\n"
+      "  ret float %A\n"
+      "}\n");
+  // FIXME: The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_UNKNOWN, SPNB_NA, false});
+}
+
+TEST_F(MatchSelectPatternTest, FMaxMismatchConstantZero6) {
+  parseAssembly(
+      "define float @test(float %a) {\n"
+      "  %1 = fcmp ogt float %a, -0.0\n"
+      "  %A = select i1 %1, float %a, float 0.0\n"
+      "  ret float %A\n"
+      "}\n");
+  // FIXME: The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_UNKNOWN, SPNB_NA, false});
+}
+
+TEST_F(MatchSelectPatternTest, FMaxMismatchConstantZero7) {
+  parseAssembly(
+      "define float @test(float %a) {\n"
+      "  %1 = fcmp olt float 0.0, %a\n"
+      "  %A = select i1 %1, float %a, float -0.0\n"
+      "  ret float %A\n"
+      "}\n");
+  // FIXME: The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_UNKNOWN, SPNB_NA, false});
+}
+
+TEST_F(MatchSelectPatternTest, FMaxMismatchConstantZero8) {
+  parseAssembly(
+      "define float @test(float %a) {\n"
+      "  %1 = fcmp ogt float %a, 0.0\n"
+      "  %A = select i1 %1, float %a, float -0.0\n"
+      "  ret float %A\n"
+      "}\n");
+  // FIXME: The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_UNKNOWN, SPNB_NA, false});
+}
+
 TEST_F(MatchSelectPatternTest, VectorFMinimum) {
   parseAssembly(
       "define <4 x float> @test(<4 x float> %a) {\n"
-- 
GitLab


From 832efd7ec3d612735d46253fa9602ae03eb53db1 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Date: Wed, 31 Oct 2018 21:24:30 +0000
Subject: [PATCH 0828/1116] Check shouldReduceLoadWidth from SimplifySetCC

SimplifySetCC could shrink a load without checking for
profitability or legality of such shink with a target.

Added checks to prevent shrinking of aligned scalar loads
in AMDGPU below dword as scalar engine does not support it.

Differential Revision: https://reviews.llvm.org/D53846

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345778 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/TargetLowering.cpp   |  3 +-
 lib/Target/AMDGPU/AMDGPUISelLowering.cpp      | 12 ++++
 .../CodeGen/AMDGPU/setcc-limit-load-shrink.ll | 65 +++++++++++++++++++
 3 files changed, 79 insertions(+), 1 deletion(-)
 create mode 100644 test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll

diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index a356e4d728f..d5665ab67c5 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -2284,7 +2284,8 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
       }
       if (bestWidth) {
         EVT newVT = EVT::getIntegerVT(*DAG.getContext(), bestWidth);
-        if (newVT.isRound()) {
+        if (newVT.isRound() &&
+            shouldReduceLoadWidth(Lod, ISD::NON_EXTLOAD, newVT)) {
           EVT PtrType = Lod->getOperand(1).getValueType();
           SDValue Ptr = Lod->getBasePtr();
           if (bestOffset != 0)
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index a1b9198f945..9823dd7709d 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -667,6 +667,18 @@ bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
   EVT OldVT = N->getValueType(0);
   unsigned OldSize = OldVT.getStoreSizeInBits();
 
+  MemSDNode *MN = cast<MemSDNode>(N);
+  unsigned AS = MN->getAddressSpace();
+  // Do not shrink an aligned scalar load to sub-dword.
+  // Scalar engine cannot do sub-dword loads.
+  if (OldSize >= 32 && NewSize < 32 && MN->getAlignment() >= 4 &&
+      (AS == AMDGPUAS::CONSTANT_ADDRESS ||
+       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
+       (isa<LoadSDNode>(N) &&
+        AS == AMDGPUAS::GLOBAL_ADDRESS && MN->isInvariant())) &&
+      AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand()))
+    return false;
+
   // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
   // extloads, so doing one requires using a buffer_load. In cases where we
   // still couldn't use a scalar load, using the wider load shouldn't really
diff --git a/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll b/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll
new file mode 100644
index 00000000000..ae50d4f18c4
--- /dev/null
+++ b/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll
@@ -0,0 +1,65 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}const_load_no_shrink_dword_to_unaligned_byte:
+; GCN: s_load_dword [[LD:s[0-9]+]],
+; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10013
+define amdgpu_kernel void @const_load_no_shrink_dword_to_unaligned_byte(i32 addrspace(1)* %out, i32 addrspace(4)* %in, i32 %x) {
+  %ptr = getelementptr i32, i32 addrspace(4)* %in, i32 %x
+  %load = load i32, i32 addrspace(4)* %ptr, align 4
+  %and = and i32 %load, 524288
+  %cmp = icmp eq i32 %and, 0
+  %sel = select i1 %cmp, i32 0, i32 -1
+  store i32 %sel, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: const_load_no_shrink_dword_to_aligned_byte:
+; GCN: s_load_dword [[LD:s[0-9]+]],
+; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10003
+define amdgpu_kernel void @const_load_no_shrink_dword_to_aligned_byte(i32 addrspace(1)* %out, i32 addrspace(4)* %in, i32 %x) {
+  %ptr = getelementptr i32, i32 addrspace(4)* %in, i32 %x
+  %load = load i32, i32 addrspace(4)* %ptr, align 4
+  %and = and i32 %load, 8
+  %cmp = icmp eq i32 %and, 0
+  %sel = select i1 %cmp, i32 0, i32 -1
+  store i32 %sel, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: global_load_no_shrink_dword_to_unaligned_byte:
+; GCN: s_load_dword [[LD:s[0-9]+]],
+; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10013
+define amdgpu_kernel void @global_load_no_shrink_dword_to_unaligned_byte(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %x) {
+  %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %x
+  %load = load i32, i32 addrspace(1)* %ptr, align 4
+  %and = and i32 %load, 524288
+  %cmp = icmp eq i32 %and, 0
+  %sel = select i1 %cmp, i32 0, i32 -1
+  store i32 %sel, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: global_load_no_shrink_dword_to_aligned_byte:
+; GCN: s_load_dword [[LD:s[0-9]+]],
+; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10003
+define amdgpu_kernel void @global_load_no_shrink_dword_to_aligned_byte(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %x) {
+  %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %x
+  %load = load i32, i32 addrspace(1)* %ptr, align 4
+  %and = and i32 %load, 8
+  %cmp = icmp eq i32 %and, 0
+  %sel = select i1 %cmp, i32 0, i32 -1
+  store i32 %sel, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: const_load_shrink_dword_to_unaligned_byte:
+; GCN: global_load_ushort
+define amdgpu_kernel void @const_load_shrink_dword_to_unaligned_byte(i32 addrspace(1)* %out, i32 addrspace(4)* %in, i32 %x) {
+  %ptr = getelementptr i32, i32 addrspace(4)* %in, i32 %x
+  %load = load i32, i32 addrspace(4)* %ptr, align 2
+  %and = and i32 %load, 524288
+  %cmp = icmp eq i32 %and, 0
+  %sel = select i1 %cmp, i32 0, i32 -1
+  store i32 %sel, i32 addrspace(1)* %out
+  ret void
+}
-- 
GitLab


From 9268a4a0f0e110ce0a9f7c6db424f1fdfd270bea Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Wed, 31 Oct 2018 21:37:40 +0000
Subject: [PATCH 0829/1116] revert rL345717 : [InstSimplify] fold icmp based on
 range of abs/nabs

This can miscompile as shown in PR39510:
https://bugs.llvm.org/show_bug.cgi?id=39510


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345780 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Analysis/InstructionSimplify.cpp          | 42 ---------
 test/Transforms/InstSimplify/icmp-abs-nabs.ll | 90 +++++++++++++++----
 2 files changed, 75 insertions(+), 57 deletions(-)

diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp
index 8571dc2cf2f..c4b076341fc 100644
--- a/lib/Analysis/InstructionSimplify.cpp
+++ b/lib/Analysis/InstructionSimplify.cpp
@@ -2996,45 +2996,6 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS,
   return nullptr;
 }
 
-static Value *simplifyICmpWithAbsNabs(CmpInst::Predicate Pred, Value *Op0,
-                                      Value *Op1) {
-  // We need a comparison with a constant.
-  const APInt *C;
-  if (!match(Op1, m_APInt(C)))
-    return nullptr;
-
-  // matchSelectPattern returns the negation part of an abs pattern in SP1.
-  // If the negate has an NSW flag, abs(INT_MIN) is undefined. Without that
-  // constraint, we can't make a contiguous range for the result of abs.
-  ICmpInst::Predicate AbsPred = ICmpInst::BAD_ICMP_PREDICATE;
-  Value *SP0, *SP1;
-  SelectPatternFlavor SPF = matchSelectPattern(Op0, SP0, SP1).Flavor;
-  if (SPF == SelectPatternFlavor::SPF_ABS &&
-      cast<Instruction>(SP1)->hasNoSignedWrap())
-    // The result of abs(X) is >= 0 (with nsw).
-    AbsPred = ICmpInst::ICMP_SGE;
-  if (SPF == SelectPatternFlavor::SPF_NABS)
-    // The result of -abs(X) is <= 0.
-    AbsPred = ICmpInst::ICMP_SLE;
-
-  if (AbsPred == ICmpInst::BAD_ICMP_PREDICATE)
-    return nullptr;
-
-  // Intersect the range of abs/nabs with the range of this icmp.
-  // If there is no intersection, the icmp must be false.
-  // If the intersection equals the range of abs/nabs, the icmp must be true.
-  APInt Zero = APInt::getNullValue(C->getBitWidth());
-  ConstantRange AbsRange = ConstantRange::makeExactICmpRegion(AbsPred, Zero);
-  ConstantRange CmpRange = ConstantRange::makeExactICmpRegion(Pred, *C);
-  ConstantRange Intersection = AbsRange.intersectWith(CmpRange);
-  if (Intersection.isEmptySet())
-    return getFalse(GetCompareTy(Op0));
-  if (Intersection == AbsRange)
-    return getTrue(GetCompareTy(Op0));
-
-  return nullptr;
-}
-
 /// Simplify integer comparisons where at least one operand of the compare
 /// matches an integer min/max idiom.
 static Value *simplifyICmpWithMinMax(CmpInst::Predicate Pred, Value *LHS,
@@ -3466,9 +3427,6 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
   if (Value *V = simplifyICmpWithMinMax(Pred, LHS, RHS, Q, MaxRecurse))
     return V;
 
-  if (Value *V = simplifyICmpWithAbsNabs(Pred, LHS, RHS))
-    return V;
-
   // Simplify comparisons of related pointers using a powerful, recursive
   // GEP-walk when we have target data available..
   if (LHS->getType()->isPointerTy())
diff --git a/test/Transforms/InstSimplify/icmp-abs-nabs.ll b/test/Transforms/InstSimplify/icmp-abs-nabs.ll
index 52321136dcf..1cb312bf0da 100644
--- a/test/Transforms/InstSimplify/icmp-abs-nabs.ll
+++ b/test/Transforms/InstSimplify/icmp-abs-nabs.ll
@@ -5,7 +5,11 @@
 
 define i1 @abs_nsw_is_positive(i32 %x) {
 ; CHECK-LABEL: @abs_nsw_is_positive(
-; CHECK-NEXT:    ret i1 true
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[NEGX:%.*]] = sub nsw i32 0, [[X]]
+; CHECK-NEXT:    [[ABS:%.*]] = select i1 [[CMP]], i32 [[NEGX]], i32 [[X]]
+; CHECK-NEXT:    [[R:%.*]] = icmp sgt i32 [[ABS]], -1
+; CHECK-NEXT:    ret i1 [[R]]
 ;
   %cmp = icmp slt i32 %x, 0
   %negx = sub nsw i32 0, %x
@@ -31,7 +35,11 @@ define i1 @abs_nsw_is_positive_sge(i32 %x) {
 
 define i1 @abs_nsw_is_positive_reduced_range(i32 %x) {
 ; CHECK-LABEL: @abs_nsw_is_positive_reduced_range(
-; CHECK-NEXT:    ret i1 true
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[NEGX:%.*]] = sub nsw i32 0, [[X]]
+; CHECK-NEXT:    [[ABS:%.*]] = select i1 [[CMP]], i32 [[NEGX]], i32 [[X]]
+; CHECK-NEXT:    [[R:%.*]] = icmp sgt i32 [[ABS]], -42
+; CHECK-NEXT:    ret i1 [[R]]
 ;
   %cmp = icmp slt i32 %x, 0
   %negx = sub nsw i32 0, %x
@@ -91,7 +99,11 @@ define i1 @abs_nsw_is_not_negative(i32 %x) {
 
 define i1 @abs_nsw_is_not_negative_sle(i32 %x) {
 ; CHECK-LABEL: @abs_nsw_is_not_negative_sle(
-; CHECK-NEXT:    ret i1 false
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[NEGX:%.*]] = sub nsw i32 0, [[X]]
+; CHECK-NEXT:    [[ABS:%.*]] = select i1 [[CMP]], i32 [[NEGX]], i32 [[X]]
+; CHECK-NEXT:    [[R:%.*]] = icmp sle i32 [[ABS]], -1
+; CHECK-NEXT:    ret i1 [[R]]
 ;
   %cmp = icmp slt i32 %x, 1
   %negx = sub nsw i32 0, %x
@@ -104,7 +116,11 @@ define i1 @abs_nsw_is_not_negative_sle(i32 %x) {
 
 define i1 @abs_nsw_is_not_negative_reduced_range(i32 %x) {
 ; CHECK-LABEL: @abs_nsw_is_not_negative_reduced_range(
-; CHECK-NEXT:    ret i1 false
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[NEGX:%.*]] = sub nsw i32 0, [[X]]
+; CHECK-NEXT:    [[ABS:%.*]] = select i1 [[CMP]], i32 [[NEGX]], i32 [[X]]
+; CHECK-NEXT:    [[R:%.*]] = icmp slt i32 [[ABS]], -24
+; CHECK-NEXT:    ret i1 [[R]]
 ;
   %cmp = icmp slt i32 %x, 0
   %negx = sub nsw i32 0, %x
@@ -151,7 +167,11 @@ define i1 @abs_nsw_is_not_negative_wrong_range(i32 %x) {
 
 define i1 @nabs_is_negative_or_0(i32 %x) {
 ; CHECK-LABEL: @nabs_is_negative_or_0(
-; CHECK-NEXT:    ret i1 true
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[NEGX:%.*]] = sub i32 0, [[X]]
+; CHECK-NEXT:    [[NABS:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[NEGX]]
+; CHECK-NEXT:    [[R:%.*]] = icmp slt i32 [[NABS]], 1
+; CHECK-NEXT:    ret i1 [[R]]
 ;
   %cmp = icmp slt i32 %x, 0
   %negx = sub i32 0, %x
@@ -164,7 +184,11 @@ define i1 @nabs_is_negative_or_0(i32 %x) {
 
 define i1 @nabs_is_negative_or_0_sle(i32 %x) {
 ; CHECK-LABEL: @nabs_is_negative_or_0_sle(
-; CHECK-NEXT:    ret i1 true
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[NEGX:%.*]] = sub i32 0, [[X]]
+; CHECK-NEXT:    [[NABS:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[NEGX]]
+; CHECK-NEXT:    [[R:%.*]] = icmp sle i32 [[NABS]], 0
+; CHECK-NEXT:    ret i1 [[R]]
 ;
   %cmp = icmp slt i32 %x, 1
   %negx = sub i32 0, %x
@@ -177,7 +201,11 @@ define i1 @nabs_is_negative_or_0_sle(i32 %x) {
 
 define i1 @nabs_is_negative_or_0_reduced_range(i32 %x) {
 ; CHECK-LABEL: @nabs_is_negative_or_0_reduced_range(
-; CHECK-NEXT:    ret i1 true
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[NEGX:%.*]] = sub i32 0, [[X]]
+; CHECK-NEXT:    [[NABS:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[NEGX]]
+; CHECK-NEXT:    [[R:%.*]] = icmp slt i32 [[NABS]], 421
+; CHECK-NEXT:    ret i1 [[R]]
 ;
   %cmp = icmp slt i32 %x, 1
   %negx = sub i32 0, %x
@@ -207,7 +235,11 @@ define i1 @nabs_is_negative_or_0_wrong_range(i32 %x) {
 
 define i1 @nabs_is_not_over_0(i32 %x) {
 ; CHECK-LABEL: @nabs_is_not_over_0(
-; CHECK-NEXT:    ret i1 false
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[NEGX:%.*]] = sub i32 0, [[X]]
+; CHECK-NEXT:    [[NABS:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[NEGX]]
+; CHECK-NEXT:    [[R:%.*]] = icmp sgt i32 [[NABS]], 0
+; CHECK-NEXT:    ret i1 [[R]]
 ;
   %cmp = icmp slt i32 %x, 0
   %negx = sub i32 0, %x
@@ -220,7 +252,11 @@ define i1 @nabs_is_not_over_0(i32 %x) {
 
 define i1 @nabs_is_not_over_0_sle(i32 %x) {
 ; CHECK-LABEL: @nabs_is_not_over_0_sle(
-; CHECK-NEXT:    ret i1 false
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[NEGX:%.*]] = sub i32 0, [[X]]
+; CHECK-NEXT:    [[NABS:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[NEGX]]
+; CHECK-NEXT:    [[R:%.*]] = icmp sge i32 [[NABS]], 1
+; CHECK-NEXT:    ret i1 [[R]]
 ;
   %cmp = icmp slt i32 %x, 1
   %negx = sub i32 0, %x
@@ -233,7 +269,11 @@ define i1 @nabs_is_not_over_0_sle(i32 %x) {
 
 define i1 @nabs_is_not_over_0_reduced_range(i32 %x) {
 ; CHECK-LABEL: @nabs_is_not_over_0_reduced_range(
-; CHECK-NEXT:    ret i1 false
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[NEGX:%.*]] = sub i32 0, [[X]]
+; CHECK-NEXT:    [[NABS:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[NEGX]]
+; CHECK-NEXT:    [[R:%.*]] = icmp sgt i32 [[NABS]], 4223
+; CHECK-NEXT:    ret i1 [[R]]
 ;
   %cmp = icmp slt i32 %x, 1
   %negx = sub i32 0, %x
@@ -278,7 +318,11 @@ define i1 @abs_nsw_is_positive_eq(i32 %x) {
 
 define i1 @abs_nsw_is_positive_ult(i8 %x) {
 ; CHECK-LABEL: @abs_nsw_is_positive_ult(
-; CHECK-NEXT:    ret i1 true
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i8 [[X:%.*]], 0
+; CHECK-NEXT:    [[NEGX:%.*]] = sub nsw i8 0, [[X]]
+; CHECK-NEXT:    [[ABS:%.*]] = select i1 [[CMP]], i8 [[NEGX]], i8 [[X]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ult i8 [[ABS]], -117
+; CHECK-NEXT:    ret i1 [[R]]
 ;
   %cmp = icmp slt i8 %x, 0
   %negx = sub nsw i8 0, %x
@@ -291,7 +335,11 @@ define i1 @abs_nsw_is_positive_ult(i8 %x) {
 
 define i1 @abs_nsw_is_not_negative_ugt(i8 %x) {
 ; CHECK-LABEL: @abs_nsw_is_not_negative_ugt(
-; CHECK-NEXT:    ret i1 false
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i8 [[X:%.*]], 0
+; CHECK-NEXT:    [[NEGX:%.*]] = sub nsw i8 0, [[X]]
+; CHECK-NEXT:    [[ABS:%.*]] = select i1 [[CMP]], i8 [[NEGX]], i8 [[X]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ugt i8 [[ABS]], 127
+; CHECK-NEXT:    ret i1 [[R]]
 ;
   %cmp = icmp slt i8 %x, 0
   %negx = sub nsw i8 0, %x
@@ -304,7 +352,11 @@ define i1 @abs_nsw_is_not_negative_ugt(i8 %x) {
 
 define <2 x i1> @abs_nsw_is_not_negative_vec_splat(<2 x i32> %x) {
 ; CHECK-LABEL: @abs_nsw_is_not_negative_vec_splat(
-; CHECK-NEXT:    ret <2 x i1> zeroinitializer
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt <2 x i32> [[X:%.*]], zeroinitializer
+; CHECK-NEXT:    [[NEGX:%.*]] = sub nsw <2 x i32> zeroinitializer, [[X]]
+; CHECK-NEXT:    [[ABS:%.*]] = select <2 x i1> [[CMP]], <2 x i32> [[NEGX]], <2 x i32> [[X]]
+; CHECK-NEXT:    [[R:%.*]] = icmp slt <2 x i32> [[ABS]], <i32 -8, i32 -8>
+; CHECK-NEXT:    ret <2 x i1> [[R]]
 ;
   %cmp = icmp slt <2 x i32> %x, zeroinitializer
   %negx = sub nsw <2 x i32> zeroinitializer, %x
@@ -317,7 +369,11 @@ define <2 x i1> @abs_nsw_is_not_negative_vec_splat(<2 x i32> %x) {
 
 define i1 @nabs_is_negative_or_0_ne(i8 %x) {
 ; CHECK-LABEL: @nabs_is_negative_or_0_ne(
-; CHECK-NEXT:    ret i1 true
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i8 [[X:%.*]], 0
+; CHECK-NEXT:    [[NEGX:%.*]] = sub i8 0, [[X]]
+; CHECK-NEXT:    [[NABS:%.*]] = select i1 [[CMP]], i8 [[X]], i8 [[NEGX]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ne i8 [[NABS]], 12
+; CHECK-NEXT:    ret i1 [[R]]
 ;
   %cmp = icmp slt i8 %x, 0
   %negx = sub i8 0, %x
@@ -330,7 +386,11 @@ define i1 @nabs_is_negative_or_0_ne(i8 %x) {
 
 define <3 x i1> @nabs_is_not_over_0_sle_vec_splat(<3 x i33> %x) {
 ; CHECK-LABEL: @nabs_is_not_over_0_sle_vec_splat(
-; CHECK-NEXT:    ret <3 x i1> zeroinitializer
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt <3 x i33> [[X:%.*]], <i33 1, i33 1, i33 1>
+; CHECK-NEXT:    [[NEGX:%.*]] = sub <3 x i33> zeroinitializer, [[X]]
+; CHECK-NEXT:    [[NABS:%.*]] = select <3 x i1> [[CMP]], <3 x i33> [[X]], <3 x i33> [[NEGX]]
+; CHECK-NEXT:    [[R:%.*]] = icmp sge <3 x i33> [[NABS]], <i33 1, i33 1, i33 1>
+; CHECK-NEXT:    ret <3 x i1> [[R]]
 ;
   %cmp = icmp slt <3 x i33> %x, <i33 1, i33 1, i33 1>
   %negx = sub <3 x i33> zeroinitializer, %x
-- 
GitLab


From d001dd065e3c301d356fb1c0b5280c3d38439e17 Mon Sep 17 00:00:00 2001
From: Eli Friedman <efriedma@codeaurora.org>
Date: Wed, 31 Oct 2018 21:45:48 +0000
Subject: [PATCH 0830/1116] [ARM] Add missing pseudo-instruction for Thumb1
 RSBS.

Shows up rarely for 64-bit arithmetic, more frequently for the compare
patterns added in r325323.

Differential Revision: https://reviews.llvm.org/D53848


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345782 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/ARM/ARMBaseInstrInfo.cpp  |  1 +
 lib/Target/ARM/ARMInstrThumb.td      |  6 ++++
 test/CodeGen/ARM/and-load-combine.ll | 42 ++++++++++------------------
 test/CodeGen/ARM/atomic-cmpxchg.ll   |  6 ++--
 test/CodeGen/ARM/select-imm.ll       | 12 +++-----
 test/CodeGen/ARM/smml.ll             |  2 +-
 test/CodeGen/Thumb/branchless-cmp.ll |  9 ++----
 test/CodeGen/Thumb/long-setcc.ll     |  3 +-
 8 files changed, 32 insertions(+), 49 deletions(-)

diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index c9d78df4b37..bbebed59c85 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -2199,6 +2199,7 @@ static const AddSubFlagsOpcodePair AddSubFlagsOpcodeMap[] = {
   {ARM::tSUBSi8, ARM::tSUBi8},
   {ARM::tSUBSrr, ARM::tSUBrr},
   {ARM::tSBCS, ARM::tSBC},
+  {ARM::tRSBS, ARM::tRSB},
 
   {ARM::t2ADDSri, ARM::t2ADDri},
   {ARM::t2ADDSrr, ARM::t2ADDrr},
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index 8b85db7e685..3c153625b01 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -1343,6 +1343,12 @@ let hasPostISelHook = 1, Defs = [CPSR] in {
                                                            tGPR:$Rm))]>,
                 Requires<[IsThumb1Only]>,
                 Sched<[WriteALU]>;
+
+  def tRSBS   : tPseudoInst<(outs tGPR:$Rd), (ins tGPR:$Rn),
+                            2, IIC_iALUr,
+                            [(set tGPR:$Rd, CPSR, (ARMsubc 0, tGPR:$Rn))]>,
+                Requires<[IsThumb1Only]>,
+                Sched<[WriteALU]>;
 }
 
 // Sign-extend byte
diff --git a/test/CodeGen/ARM/and-load-combine.ll b/test/CodeGen/ARM/and-load-combine.ll
index 09acefad305..8f08909c816 100644
--- a/test/CodeGen/ARM/and-load-combine.ll
+++ b/test/CodeGen/ARM/and-load-combine.ll
@@ -28,8 +28,7 @@ define arm_aapcscc zeroext i1 @cmp_xor8_short_short(i16* nocapture readonly %a,
 ; THUMB1-NEXT:    ldrb r0, [r0]
 ; THUMB1-NEXT:    ldrb r1, [r1]
 ; THUMB1-NEXT:    eors r1, r0
-; THUMB1-NEXT:    movs r0, #0
-; THUMB1-NEXT:    subs r0, r0, r1
+; THUMB1-NEXT:    rsbs r0, r1, #0
 ; THUMB1-NEXT:    adcs r0, r1
 ; THUMB1-NEXT:    bx lr
 ;
@@ -74,8 +73,7 @@ define arm_aapcscc zeroext i1 @cmp_xor8_short_int(i16* nocapture readonly %a, i3
 ; THUMB1-NEXT:    ldrb r0, [r0]
 ; THUMB1-NEXT:    ldrb r1, [r1]
 ; THUMB1-NEXT:    eors r1, r0
-; THUMB1-NEXT:    movs r0, #0
-; THUMB1-NEXT:    subs r0, r0, r1
+; THUMB1-NEXT:    rsbs r0, r1, #0
 ; THUMB1-NEXT:    adcs r0, r1
 ; THUMB1-NEXT:    bx lr
 ;
@@ -121,8 +119,7 @@ define arm_aapcscc zeroext i1 @cmp_xor8_int_int(i32* nocapture readonly %a, i32*
 ; THUMB1-NEXT:    ldrb r0, [r0]
 ; THUMB1-NEXT:    ldrb r1, [r1]
 ; THUMB1-NEXT:    eors r1, r0
-; THUMB1-NEXT:    movs r0, #0
-; THUMB1-NEXT:    subs r0, r0, r1
+; THUMB1-NEXT:    rsbs r0, r1, #0
 ; THUMB1-NEXT:    adcs r0, r1
 ; THUMB1-NEXT:    bx lr
 ;
@@ -167,8 +164,7 @@ define arm_aapcscc zeroext i1 @cmp_xor16(i32* nocapture readonly %a, i32* nocapt
 ; THUMB1-NEXT:    ldrh r0, [r0]
 ; THUMB1-NEXT:    ldrh r1, [r1]
 ; THUMB1-NEXT:    eors r1, r0
-; THUMB1-NEXT:    movs r0, #0
-; THUMB1-NEXT:    subs r0, r0, r1
+; THUMB1-NEXT:    rsbs r0, r1, #0
 ; THUMB1-NEXT:    adcs r0, r1
 ; THUMB1-NEXT:    bx lr
 ;
@@ -213,8 +209,7 @@ define arm_aapcscc zeroext i1 @cmp_or8_short_short(i16* nocapture readonly %a, i
 ; THUMB1-NEXT:    ldrb r0, [r0]
 ; THUMB1-NEXT:    ldrb r1, [r1]
 ; THUMB1-NEXT:    orrs r1, r0
-; THUMB1-NEXT:    movs r0, #0
-; THUMB1-NEXT:    subs r0, r0, r1
+; THUMB1-NEXT:    rsbs r0, r1, #0
 ; THUMB1-NEXT:    adcs r0, r1
 ; THUMB1-NEXT:    bx lr
 ;
@@ -259,8 +254,7 @@ define arm_aapcscc zeroext i1 @cmp_or8_short_int(i16* nocapture readonly %a, i32
 ; THUMB1-NEXT:    ldrb r0, [r0]
 ; THUMB1-NEXT:    ldrb r1, [r1]
 ; THUMB1-NEXT:    orrs r1, r0
-; THUMB1-NEXT:    movs r0, #0
-; THUMB1-NEXT:    subs r0, r0, r1
+; THUMB1-NEXT:    rsbs r0, r1, #0
 ; THUMB1-NEXT:    adcs r0, r1
 ; THUMB1-NEXT:    bx lr
 ;
@@ -306,8 +300,7 @@ define arm_aapcscc zeroext i1 @cmp_or8_int_int(i32* nocapture readonly %a, i32*
 ; THUMB1-NEXT:    ldrb r0, [r0]
 ; THUMB1-NEXT:    ldrb r1, [r1]
 ; THUMB1-NEXT:    orrs r1, r0
-; THUMB1-NEXT:    movs r0, #0
-; THUMB1-NEXT:    subs r0, r0, r1
+; THUMB1-NEXT:    rsbs r0, r1, #0
 ; THUMB1-NEXT:    adcs r0, r1
 ; THUMB1-NEXT:    bx lr
 ;
@@ -352,8 +345,7 @@ define arm_aapcscc zeroext i1 @cmp_or16(i32* nocapture readonly %a, i32* nocaptu
 ; THUMB1-NEXT:    ldrh r0, [r0]
 ; THUMB1-NEXT:    ldrh r1, [r1]
 ; THUMB1-NEXT:    orrs r1, r0
-; THUMB1-NEXT:    movs r0, #0
-; THUMB1-NEXT:    subs r0, r0, r1
+; THUMB1-NEXT:    rsbs r0, r1, #0
 ; THUMB1-NEXT:    adcs r0, r1
 ; THUMB1-NEXT:    bx lr
 ;
@@ -398,8 +390,7 @@ define arm_aapcscc zeroext i1 @cmp_and8_short_short(i16* nocapture readonly %a,
 ; THUMB1-NEXT:    ldrb r1, [r1]
 ; THUMB1-NEXT:    ldrb r2, [r0]
 ; THUMB1-NEXT:    ands r2, r1
-; THUMB1-NEXT:    movs r0, #0
-; THUMB1-NEXT:    subs r0, r0, r2
+; THUMB1-NEXT:    rsbs r0, r2, #0
 ; THUMB1-NEXT:    adcs r0, r2
 ; THUMB1-NEXT:    bx lr
 ;
@@ -444,8 +435,7 @@ define arm_aapcscc zeroext i1 @cmp_and8_short_int(i16* nocapture readonly %a, i3
 ; THUMB1-NEXT:    ldrb r0, [r0]
 ; THUMB1-NEXT:    ldrb r1, [r1]
 ; THUMB1-NEXT:    ands r1, r0
-; THUMB1-NEXT:    movs r0, #0
-; THUMB1-NEXT:    subs r0, r0, r1
+; THUMB1-NEXT:    rsbs r0, r1, #0
 ; THUMB1-NEXT:    adcs r0, r1
 ; THUMB1-NEXT:    bx lr
 ;
@@ -491,8 +481,7 @@ define arm_aapcscc zeroext i1 @cmp_and8_int_int(i32* nocapture readonly %a, i32*
 ; THUMB1-NEXT:    ldrb r1, [r1]
 ; THUMB1-NEXT:    ldrb r2, [r0]
 ; THUMB1-NEXT:    ands r2, r1
-; THUMB1-NEXT:    movs r0, #0
-; THUMB1-NEXT:    subs r0, r0, r2
+; THUMB1-NEXT:    rsbs r0, r2, #0
 ; THUMB1-NEXT:    adcs r0, r2
 ; THUMB1-NEXT:    bx lr
 ;
@@ -537,8 +526,7 @@ define arm_aapcscc zeroext i1 @cmp_and16(i32* nocapture readonly %a, i32* nocapt
 ; THUMB1-NEXT:    ldrh r1, [r1]
 ; THUMB1-NEXT:    ldrh r2, [r0]
 ; THUMB1-NEXT:    ands r2, r1
-; THUMB1-NEXT:    movs r0, #0
-; THUMB1-NEXT:    subs r0, r0, r2
+; THUMB1-NEXT:    rsbs r0, r2, #0
 ; THUMB1-NEXT:    adcs r0, r2
 ; THUMB1-NEXT:    bx lr
 ;
@@ -881,8 +869,7 @@ define arm_aapcscc i1 @test6(i8* %x, i8 %y, i8 %z) {
 ; THUMB1-NEXT:    ands r0, r1
 ; THUMB1-NEXT:    uxtb r1, r2
 ; THUMB1-NEXT:    subs r1, r0, r1
-; THUMB1-NEXT:    movs r0, #0
-; THUMB1-NEXT:    subs r0, r0, r1
+; THUMB1-NEXT:    rsbs r0, r1, #0
 ; THUMB1-NEXT:    adcs r0, r1
 ; THUMB1-NEXT:    bx lr
 ;
@@ -929,8 +916,7 @@ define arm_aapcscc i1 @test7(i16* %x, i16 %y, i8 %z) {
 ; THUMB1-NEXT:    ands r0, r1
 ; THUMB1-NEXT:    uxtb r1, r2
 ; THUMB1-NEXT:    subs r1, r0, r1
-; THUMB1-NEXT:    movs r0, #0
-; THUMB1-NEXT:    subs r0, r0, r1
+; THUMB1-NEXT:    rsbs r0, r1, #0
 ; THUMB1-NEXT:    adcs r0, r1
 ; THUMB1-NEXT:    bx lr
 ;
diff --git a/test/CodeGen/ARM/atomic-cmpxchg.ll b/test/CodeGen/ARM/atomic-cmpxchg.ll
index fd87e50d0b7..b5214f8d67e 100644
--- a/test/CodeGen/ARM/atomic-cmpxchg.ll
+++ b/test/CodeGen/ARM/atomic-cmpxchg.ll
@@ -24,8 +24,7 @@ entry:
 ; CHECK-THUMB: bl __sync_val_compare_and_swap_1
 ; CHECK-THUMB-NOT: mov [[R1:r[0-7]]], r0
 ; CHECK-THUMB: subs [[R1:r[0-7]]], r0, {{r[0-9]+}}
-; CHECK-THUMB: movs r0, #0
-; CHECK-THUMB: subs r0, r0, [[R1]]
+; CHECK-THUMB: rsbs r0, [[R1]], #0
 ; CHECK-THUMB: adcs r0, [[R1]]
 
 ; CHECK-ARMV6-LABEL: test_cmpxchg_res_i8:
@@ -47,8 +46,7 @@ entry:
 ; CHECK-THUMBV6-NEXT:  bl __sync_val_compare_and_swap_1
 ; CHECK-THUMBV6-NEXT:  uxtb r1, r4
 ; CHECK-THUMBV6-NEXT:  subs [[R1:r[0-7]]], r0, {{r[0-9]+}}
-; CHECK-THUMBV6-NEXT:  movs r0, #0
-; CHECK-THUMBV6-NEXT:  subs r0, r0, [[R1]]
+; CHECK-THUMBV6-NEXT:  rsbs r0, [[R1]], #0
 ; CHECK-THUMBV6-NEXT:  adcs r0, [[R1]]
 
 ; CHECK-ARMV7-LABEL: test_cmpxchg_res_i8:
diff --git a/test/CodeGen/ARM/select-imm.ll b/test/CodeGen/ARM/select-imm.ll
index 04f6d252e27..1e27024e7c7 100644
--- a/test/CodeGen/ARM/select-imm.ll
+++ b/test/CodeGen/ARM/select-imm.ll
@@ -71,8 +71,7 @@ entry:
 ; ARMT2: lsr r0, r0, #5
 
 ; THUMB1-LABEL: t3:
-; THUMB1: movs r1, #0
-; THUMB1: subs r1, r1, r0
+; THUMB1: rsbs r1, r0, #0
 ; THUMB1: adcs r0, r1
 
 ; THUMB2-LABEL: t3:
@@ -116,8 +115,7 @@ entry:
 
 ; THUMB1-LABEL: t5:
 ; THUMB1-NOT: bne
-; THUMB1: movs r0, #0
-; THUMB1: subs r0, r0, r1
+; THUMB1: rsbs r0, r1, #0
 ; THUMB1: adcs r0, r1
 
 ; THUMB2-LABEL: t5:
@@ -196,8 +194,7 @@ entry:
 ; THUMB1: bl t7
 ; THUMB1: mov r1, r0
 ; THUMB1: subs r2, r4, #5
-; THUMB1: movs r0, #0
-; THUMB1: subs r0, r0, r2
+; THUMB1: rsbs r0, r2, #0
 ; THUMB1: adcs r0, r2
 
 ; THUMB2-LABEL: t8:
@@ -302,8 +299,7 @@ entry:
 ; ARMT2: lsr r0, r0, #5
 
 ; THUMB1-LABEL: t10:
-; THUMB1: movs r0, #0
-; THUMB1: subs r0, r0, r1
+; THUMB1: rsbs r0, r1, #0
 ; THUMB1: adcs r0, r1
 
 ; THUMB2-LABEL: t10:
diff --git a/test/CodeGen/ARM/smml.ll b/test/CodeGen/ARM/smml.ll
index ba996e5ddd8..79048348e9b 100644
--- a/test/CodeGen/ARM/smml.ll
+++ b/test/CodeGen/ARM/smml.ll
@@ -44,7 +44,7 @@ declare void @opaque(i32)
 define void @test_used_flags(i32 %in1, i32 %in2) {
 ; CHECK-LABEL: test_used_flags:
 ; CHECK-THUMB: movs    r2, #0
-; CHECK-THUMB: subs    r0, r2, r0
+; CHECK-THUMB: rsbs    r0, r0, #0
 ; CHECK-THUMB: sbcs    r2, r1
 ; CHECK-THUMB: bge
 ; CHECK-V6: smull [[PROD_LO:r[0-9]+]], [[PROD_HI:r[0-9]+]], r0, r1
diff --git a/test/CodeGen/Thumb/branchless-cmp.ll b/test/CodeGen/Thumb/branchless-cmp.ll
index 8435529d681..ed34d630733 100644
--- a/test/CodeGen/Thumb/branchless-cmp.ll
+++ b/test/CodeGen/Thumb/branchless-cmp.ll
@@ -20,8 +20,7 @@ entry:
 ; CHECK-LABEL: test1b:
 ; CHECK-NOT: b{{(ne)|(eq)}}
 ; CHECK:       subs    r1, r0, r1
-; CHECK-NEXT:  movs    r0, #0
-; CHECK-NEXT:  subs    r0, r0, r1
+; CHECK-NEXT:  rsbs    r0, r1, #0
 ; CHECK-NEXT:  adcs    r0, r1
 }
 
@@ -33,8 +32,7 @@ entry:
 ; CHECK-LABEL: test2a:
 ; CHECK-NOT: b{{(ne)|(eq)}}
 ; CHECK:       subs    r1, r0, r1
-; CHECK-NEXT:  movs    r0, #0
-; CHECK-NEXT:  subs    r0, r0, r1
+; CHECK-NEXT:  rsbs    r0, r1, #0
 ; CHECK-NEXT:  adcs    r0, r1
 }
 
@@ -71,8 +69,7 @@ entry:
 ; CHECK-LABEL: test3b:
 ; CHECK-NOT: b{{(ne)|(eq)}}
 ; CHECK:      subs	r0, r0, r1
-; CHECK-NEXT: movs	r1, #0
-; CHECK-NEXT: subs	r1, r1, r0
+; CHECK-NEXT: rsbs	r1, r0, #0
 ; CHECK-NEXT: adcs	r1, r0
 ; CHECK-NEXT: lsls	r0, r1, #2
 }
diff --git a/test/CodeGen/Thumb/long-setcc.ll b/test/CodeGen/Thumb/long-setcc.ll
index f077d0e4cf4..b8b9cff7b36 100644
--- a/test/CodeGen/Thumb/long-setcc.ll
+++ b/test/CodeGen/Thumb/long-setcc.ll
@@ -9,8 +9,7 @@ define i1 @t1(i64 %x) {
 
 define i1 @t2(i64 %x) {
 ; CHECK-LABEL: t2:
-; CHECK: movs  r0, #0
-; CHECK: subs  r0, r0, r1
+; CHECK: rsbs  r0, r1, #0
 ; CHECK: adcs  r0, r1
   %tmp = icmp ult i64 %x, 4294967296
   ret i1 %tmp
-- 
GitLab


From c5f8ae0a0fb0f0750004a8110d90d408f336c91b Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Wed, 31 Oct 2018 21:53:24 +0000
Subject: [PATCH 0831/1116] Revert r345165 "[X86] Bring back the MOV64r0 pseudo
 instruction"

Google is reporting regressions on some benchmarks.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345785 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86FastISel.cpp                |  32 +-
 lib/Target/X86/X86ISelDAGToDAG.cpp            |  13 +-
 lib/Target/X86/X86InstrCompiler.td            |   6 +-
 lib/Target/X86/X86InstrInfo.cpp               |  23 +-
 .../X86/X86SpeculativeLoadHardening.cpp       |  10 +-
 test/CodeGen/X86/GlobalISel/constant.ll       |   2 +-
 test/CodeGen/X86/avg.ll                       | 455 +++++++++---------
 test/CodeGen/X86/crash-O0.ll                  |   8 +-
 test/CodeGen/X86/hoist-spill.ll               |   2 +
 test/CodeGen/X86/machine-cse.ll               |  15 +-
 test/CodeGen/X86/madd.ll                      |  66 +--
 test/CodeGen/X86/mmx-arith.ll                 |  11 +-
 test/CodeGen/X86/pr32284.ll                   |  19 +-
 test/CodeGen/X86/pr32340.ll                   |  25 +-
 test/CodeGen/X86/scheduler-backtracking.ll    | 212 ++++----
 test/CodeGen/X86/spill-zero-x86_64.ll         |  75 ---
 test/CodeGen/X86/swifterror.ll                |  21 +-
 17 files changed, 472 insertions(+), 523 deletions(-)
 delete mode 100644 test/CodeGen/X86/spill-zero-x86_64.ll

diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index b87f4802473..a49ad8bd59d 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -1916,8 +1916,8 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) {
     { &X86::GR64RegClass, X86::RAX, X86::RDX, {
         { X86::IDIV64r, X86::CQO,     Copy,            X86::RAX, S }, // SDiv
         { X86::IDIV64r, X86::CQO,     Copy,            X86::RDX, S }, // SRem
-        { X86::DIV64r,  X86::MOV64r0, Copy,            X86::RAX, U }, // UDiv
-        { X86::DIV64r,  X86::MOV64r0, Copy,            X86::RDX, U }, // URem
+        { X86::DIV64r,  X86::MOV32r0, Copy,            X86::RAX, U }, // UDiv
+        { X86::DIV64r,  X86::MOV32r0, Copy,            X86::RDX, U }, // URem
       }
     }, // i64
   };
@@ -1964,22 +1964,26 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) {
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
               TII.get(OpEntry.OpSignExtend));
     else {
-      unsigned ZeroReg = createResultReg(VT == MVT::i64 ? &X86::GR64RegClass
-                                                        : &X86::GR32RegClass);
+      unsigned Zero32 = createResultReg(&X86::GR32RegClass);
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-              TII.get(OpEntry.OpSignExtend), ZeroReg);
+              TII.get(X86::MOV32r0), Zero32);
 
       // Copy the zero into the appropriate sub/super/identical physical
       // register. Unfortunately the operations needed are not uniform enough
       // to fit neatly into the table above.
-      if (VT == MVT::i16)
+      if (VT == MVT::i16) {
         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                 TII.get(Copy), TypeEntry.HighInReg)
-          .addReg(ZeroReg, 0, X86::sub_16bit);
-      else
+          .addReg(Zero32, 0, X86::sub_16bit);
+      } else if (VT == MVT::i32) {
         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                 TII.get(Copy), TypeEntry.HighInReg)
-            .addReg(ZeroReg);
+            .addReg(Zero32);
+      } else if (VT == MVT::i64) {
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                TII.get(TargetOpcode::SUBREG_TO_REG), TypeEntry.HighInReg)
+            .addImm(0).addReg(Zero32).addImm(X86::sub_32bit);
+      }
     }
   }
   // Generate the DIV/IDIV instruction.
@@ -3704,9 +3708,6 @@ unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) {
 
   uint64_t Imm = CI->getZExtValue();
   if (Imm == 0) {
-    if (VT.SimpleTy == MVT::i64)
-      return fastEmitInst_(X86::MOV64r0, &X86::GR64RegClass);
-
     unsigned SrcReg = fastEmitInst_(X86::MOV32r0, &X86::GR32RegClass);
     switch (VT.SimpleTy) {
     default: llvm_unreachable("Unexpected value type");
@@ -3719,6 +3720,13 @@ unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) {
                                         X86::sub_16bit);
     case MVT::i32:
       return SrcReg;
+    case MVT::i64: {
+      unsigned ResultReg = createResultReg(&X86::GR64RegClass);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg)
+        .addImm(0).addReg(SrcReg).addImm(X86::sub_32bit);
+      return ResultReg;
+    }
     }
   }
 
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 717ecc031c0..16819f4451c 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -3591,10 +3591,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
           SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0);
       } else {
         // Zero out the high part, effectively zero extending the input.
-        unsigned ClrOpc = NVT.SimpleTy == MVT::i64 ? X86::MOV64r0
-                                                   : X86::MOV32r0;
-        MVT ClrVT = NVT.SimpleTy == MVT::i64 ? MVT::i64 : MVT::i32;
-        SDValue ClrNode = SDValue(CurDAG->getMachineNode(ClrOpc, dl, ClrVT), 0);
+        SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0);
         switch (NVT.SimpleTy) {
         case MVT::i16:
           ClrNode =
@@ -3605,7 +3602,15 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
                       0);
           break;
         case MVT::i32:
+          break;
         case MVT::i64:
+          ClrNode =
+              SDValue(CurDAG->getMachineNode(
+                          TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
+                          CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
+                          CurDAG->getTargetConstant(X86::sub_32bit, dl,
+                                                    MVT::i32)),
+                      0);
           break;
         default:
           llvm_unreachable("Unexpected division source");
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index 2805517b747..71b43a38dc2 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -270,18 +270,16 @@ def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins), "", []>;
 // Alias instruction mapping movr0 to xor.
 // FIXME: remove when we can teach regalloc that xor reg, reg is ok.
 let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1,
-    isPseudo = 1, AddedComplexity = 10 in {
+    isPseudo = 1, AddedComplexity = 10 in
 def MOV32r0  : I<0, Pseudo, (outs GR32:$dst), (ins), "",
                  [(set GR32:$dst, 0)]>, Sched<[WriteZero]>;
-def MOV64r0  : I<0, Pseudo, (outs GR64:$dst), (ins), "",
-                 [(set GR64:$dst, 0)]>, Sched<[WriteZero]>;
-}
 
 // Other widths can also make use of the 32-bit xor, which may have a smaller
 // encoding and avoid partial register updates.
 let AddedComplexity = 10 in {
 def : Pat<(i8 0), (EXTRACT_SUBREG (MOV32r0), sub_8bit)>;
 def : Pat<(i16 0), (EXTRACT_SUBREG (MOV32r0), sub_16bit)>;
+def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)>;
 }
 
 let Predicates = [OptForSize, Not64BitMode],
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 88f2f0fffd6..ae45301f04b 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -683,10 +683,8 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
   if (ClobbersEFLAGS && !isSafeToClobberEFLAGS(MBB, I)) {
     // The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side
     // effects.
-    unsigned NewOpc = X86::MOV32ri;
     int Value;
     switch (Orig.getOpcode()) {
-    case X86::MOV64r0:  NewOpc = X86::MOV32ri64; Value = 0; break;
     case X86::MOV32r0:  Value = 0; break;
     case X86::MOV32r1:  Value = 1; break;
     case X86::MOV32r_1: Value = -1; break;
@@ -695,7 +693,7 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
     }
 
     const DebugLoc &DL = Orig.getDebugLoc();
-    BuildMI(MBB, I, DL, get(NewOpc))
+    BuildMI(MBB, I, DL, get(X86::MOV32ri))
         .add(Orig.getOperand(0))
         .addImm(Value);
   } else {
@@ -3752,9 +3750,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
       // MOV32r0 etc. are implemented with xor which clobbers condition code.
       // They are safe to move up, if the definition to EFLAGS is dead and
       // earlier instructions do not read or write EFLAGS.
-      if (!Movr0Inst &&
-          (Instr.getOpcode() == X86::MOV32r0 ||
-           Instr.getOpcode() == X86::MOV64r0) &&
+      if (!Movr0Inst && Instr.getOpcode() == X86::MOV32r0 &&
           Instr.registerDefIsDead(X86::EFLAGS, TRI)) {
         Movr0Inst = &Instr;
         continue;
@@ -4159,15 +4155,6 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   switch (MI.getOpcode()) {
   case X86::MOV32r0:
     return Expand2AddrUndef(MIB, get(X86::XOR32rr));
-  case X86::MOV64r0: {
-    const TargetRegisterInfo *TRI = &getRegisterInfo();
-    unsigned Reg = MIB->getOperand(0).getReg();
-    unsigned Reg32 = TRI->getSubReg(Reg, X86::sub_32bit);
-    MIB->getOperand(0).setReg(Reg32);
-    Expand2AddrUndef(MIB, get(X86::XOR32rr));
-    MIB.addReg(Reg, RegState::ImplicitDefine);
-    return true;
-  }
   case X86::MOV32r1:
     return expandMOV32r1(MIB, *this, /*MinusOne=*/ false);
   case X86::MOV32r_1:
@@ -4911,10 +4898,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
     isTwoAddrFold = true;
   } else {
     if (OpNum == 0) {
-      if (MI.getOpcode() == X86::MOV32r0 || MI.getOpcode() == X86::MOV64r0) {
-        unsigned NewOpc = MI.getOpcode() == X86::MOV64r0 ? X86::MOV64mi32
-                                                         : X86::MOV32mi;
-        NewMI = MakeM0Inst(*this, NewOpc, MOs, InsertPt, MI);
+      if (MI.getOpcode() == X86::MOV32r0) {
+        NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, InsertPt, MI);
         if (NewMI)
           return NewMI;
       }
diff --git a/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/lib/Target/X86/X86SpeculativeLoadHardening.cpp
index 20997ecc07d..14e4c455a08 100644
--- a/lib/Target/X86/X86SpeculativeLoadHardening.cpp
+++ b/lib/Target/X86/X86SpeculativeLoadHardening.cpp
@@ -487,14 +487,20 @@ bool X86SpeculativeLoadHardeningPass::runOnMachineFunction(
     // Otherwise, just build the predicate state itself by zeroing a register
     // as we don't need any initial state.
     PS->InitialReg = MRI->createVirtualRegister(PS->RC);
-    auto ZeroI = BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV64r0),
-                         PS->InitialReg);
+    unsigned PredStateSubReg = MRI->createVirtualRegister(&X86::GR32RegClass);
+    auto ZeroI = BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV32r0),
+                         PredStateSubReg);
     ++NumInstsInserted;
     MachineOperand *ZeroEFLAGSDefOp =
         ZeroI->findRegisterDefOperand(X86::EFLAGS);
     assert(ZeroEFLAGSDefOp && ZeroEFLAGSDefOp->isImplicit() &&
            "Must have an implicit def of EFLAGS!");
     ZeroEFLAGSDefOp->setIsDead(true);
+    BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::SUBREG_TO_REG),
+            PS->InitialReg)
+        .addImm(0)
+        .addReg(PredStateSubReg)
+        .addImm(X86::sub_32bit);
   }
 
   // We're going to need to trace predicate state throughout the function's
diff --git a/test/CodeGen/X86/GlobalISel/constant.ll b/test/CodeGen/X86/GlobalISel/constant.ll
index 2043c60f499..f6ebb70fcf5 100644
--- a/test/CodeGen/X86/GlobalISel/constant.ll
+++ b/test/CodeGen/X86/GlobalISel/constant.ll
@@ -54,7 +54,7 @@ define i64 @const_i64_i32() {
 define void @main(i32 ** %data) {
 ; ALL-LABEL: main:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    xorl %eax, %eax
+; ALL-NEXT:    movq $0, %rax
 ; ALL-NEXT:    movq %rax, (%rdi)
 ; ALL-NEXT:    retq
   store i32* null, i32** %data, align 8
diff --git a/test/CodeGen/X86/avg.ll b/test/CodeGen/X86/avg.ll
index c4b15070bad..84f1296d51c 100644
--- a/test/CodeGen/X86/avg.ll
+++ b/test/CodeGen/X86/avg.ll
@@ -2141,7 +2141,7 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX1-NEXT:    pushq %r13
 ; AVX1-NEXT:    pushq %r12
 ; AVX1-NEXT:    pushq %rbx
-; AVX1-NEXT:    subq $16, %rsp
+; AVX1-NEXT:    subq $24, %rsp
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
@@ -2152,12 +2152,12 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX1-NEXT:    vmovq %xmm5, %rbp
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm4, %rcx
-; AVX1-NEXT:    vmovq %xmm4, %rsi
+; AVX1-NEXT:    vpextrq $1, %xmm4, %rsi
+; AVX1-NEXT:    vmovq %xmm4, %rcx
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm4, %r10
+; AVX1-NEXT:    vpextrq $1, %xmm4, %r8
 ; AVX1-NEXT:    vmovq %xmm4, %r11
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
@@ -2166,7 +2166,7 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero
 ; AVX1-NEXT:    vpextrq $1, %xmm4, %r15
-; AVX1-NEXT:    vmovq %xmm4, %rdx
+; AVX1-NEXT:    vmovq %xmm4, %rdi
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
 ; AVX1-NEXT:    vpextrq $1, %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
@@ -2175,28 +2175,27 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
 ; AVX1-NEXT:    vpextrq $1, %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX1-NEXT:    vmovq %xmm3, %r9
+; AVX1-NEXT:    vmovq %xmm3, %r10
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm4, %r14
-; AVX1-NEXT:    addq %rbx, %r14
-; AVX1-NEXT:    vmovq %xmm4, %r8
-; AVX1-NEXT:    addq %rbp, %r8
+; AVX1-NEXT:    vpextrq $1, %xmm4, %rdx
+; AVX1-NEXT:    addq %rbx, %rdx
+; AVX1-NEXT:    vmovq %xmm4, %r9
+; AVX1-NEXT:    addq %rbp, %r9
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm3, %rdi
-; AVX1-NEXT:    addq %rcx, %rdi
-; AVX1-NEXT:    vmovq %xmm3, %rax
+; AVX1-NEXT:    vpextrq $1, %xmm3, %rax
 ; AVX1-NEXT:    addq %rsi, %rax
-; AVX1-NEXT:    movq %rax, %rsi
+; AVX1-NEXT:    movq %rax, %r14
+; AVX1-NEXT:    vmovq %xmm3, %rbp
+; AVX1-NEXT:    addq %rcx, %rbp
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm3, %rax
-; AVX1-NEXT:    addq %r10, %rax
-; AVX1-NEXT:    movq %rax, %r10
+; AVX1-NEXT:    vpextrq $1, %xmm3, %rsi
+; AVX1-NEXT:    addq %r8, %rsi
 ; AVX1-NEXT:    vmovq %xmm3, %rax
 ; AVX1-NEXT:    addq %r11, %rax
 ; AVX1-NEXT:    movq %rax, %r11
@@ -2204,17 +2203,17 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
 ; AVX1-NEXT:    vpextrq $1, %xmm2, %rax
 ; AVX1-NEXT:    addq %r13, %rax
-; AVX1-NEXT:    movq %rax, %rbx
+; AVX1-NEXT:    movq %rax, %rcx
 ; AVX1-NEXT:    vmovq %xmm2, %rax
 ; AVX1-NEXT:    addq %r12, %rax
-; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    movq %rax, %r8
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
 ; AVX1-NEXT:    vpextrq $1, %xmm3, %rax
 ; AVX1-NEXT:    addq %r15, %rax
-; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    movq %rax, %rbx
 ; AVX1-NEXT:    vmovq %xmm3, %rax
-; AVX1-NEXT:    addq %rdx, %rax
+; AVX1-NEXT:    addq %rdi, %rax
 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
@@ -2227,40 +2226,41 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm2, %rbp
-; AVX1-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
-; AVX1-NEXT:    vmovq %xmm2, %r15
-; AVX1-NEXT:    addq %r9, %r15
+; AVX1-NEXT:    vpextrq $1, %xmm2, %rax
+; AVX1-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    vmovq %xmm2, %r12
+; AVX1-NEXT:    addq %r10, %r12
 ; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm0, %r9
-; AVX1-NEXT:    addq %rax, %r9
-; AVX1-NEXT:    vmovq %xmm1, %rcx
-; AVX1-NEXT:    vmovq %xmm0, %rdx
-; AVX1-NEXT:    addq %rcx, %rdx
-; AVX1-NEXT:    addq $-1, %r14
-; AVX1-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    vpextrq $1, %xmm0, %r10
+; AVX1-NEXT:    addq %rax, %r10
+; AVX1-NEXT:    vmovq %xmm1, %rax
+; AVX1-NEXT:    vmovq %xmm0, %rdi
+; AVX1-NEXT:    addq %rax, %rdi
+; AVX1-NEXT:    addq $-1, %rdx
+; AVX1-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX1-NEXT:    movl $0, %eax
 ; AVX1-NEXT:    adcq $-1, %rax
 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    addq $-1, %r8
-; AVX1-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    addq $-1, %r9
+; AVX1-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX1-NEXT:    movl $0, %eax
 ; AVX1-NEXT:    adcq $-1, %rax
-; AVX1-NEXT:    movq %rax, (%rsp) # 8-byte Spill
-; AVX1-NEXT:    addq $-1, %rdi
-; AVX1-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    addq $-1, %r14
+; AVX1-NEXT:    movq %r14, (%rsp) # 8-byte Spill
 ; AVX1-NEXT:    movl $0, %eax
 ; AVX1-NEXT:    adcq $-1, %rax
 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    addq $-1, %rsi
-; AVX1-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    addq $-1, %rbp
+; AVX1-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX1-NEXT:    movl $0, %eax
 ; AVX1-NEXT:    adcq $-1, %rax
 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    addq $-1, %r10
-; AVX1-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    addq $-1, %rsi
+; AVX1-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX1-NEXT:    movl $0, %eax
 ; AVX1-NEXT:    adcq $-1, %rax
 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -2269,90 +2269,93 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX1-NEXT:    movl $0, %eax
 ; AVX1-NEXT:    adcq $-1, %rax
 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    addq $-1, %rcx
+; AVX1-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    movl $0, %ebp
+; AVX1-NEXT:    adcq $-1, %rbp
+; AVX1-NEXT:    addq $-1, %r8
+; AVX1-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    movl $0, %r15d
+; AVX1-NEXT:    adcq $-1, %r15
 ; AVX1-NEXT:    addq $-1, %rbx
 ; AVX1-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX1-NEXT:    movl $0, %eax
 ; AVX1-NEXT:    adcq $-1, %rax
 ; AVX1-NEXT:    movq %rax, %rsi
 ; AVX1-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX1-NEXT:    movl $0, %r12d
-; AVX1-NEXT:    adcq $-1, %r12
-; AVX1-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 ; AVX1-NEXT:    movl $0, %r13d
 ; AVX1-NEXT:    adcq $-1, %r13
 ; AVX1-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 ; AVX1-NEXT:    movl $0, %r14d
 ; AVX1-NEXT:    adcq $-1, %r14
-; AVX1-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX1-NEXT:    movl $0, %ebx
-; AVX1-NEXT:    adcq $-1, %rbx
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX1-NEXT:    addq $-1, %rcx
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX1-NEXT:    addq $-1, %rdx
 ; AVX1-NEXT:    movl $0, %r11d
 ; AVX1-NEXT:    adcq $-1, %r11
-; AVX1-NEXT:    addq $-1, %rbp
-; AVX1-NEXT:    movl $0, %r10d
-; AVX1-NEXT:    adcq $-1, %r10
-; AVX1-NEXT:    addq $-1, %r15
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX1-NEXT:    addq $-1, %rax
+; AVX1-NEXT:    movl $0, %ebx
+; AVX1-NEXT:    adcq $-1, %rbx
+; AVX1-NEXT:    addq $-1, %r12
+; AVX1-NEXT:    movl $0, %r9d
+; AVX1-NEXT:    adcq $-1, %r9
+; AVX1-NEXT:    addq $-1, %r10
 ; AVX1-NEXT:    movl $0, %r8d
 ; AVX1-NEXT:    adcq $-1, %r8
-; AVX1-NEXT:    addq $-1, %r9
-; AVX1-NEXT:    movl $0, %edi
-; AVX1-NEXT:    adcq $-1, %rdi
-; AVX1-NEXT:    addq $-1, %rdx
-; AVX1-NEXT:    movl $0, %eax
-; AVX1-NEXT:    adcq $-1, %rax
-; AVX1-NEXT:    shldq $63, %rdx, %rax
-; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    shldq $63, %r9, %rdi
-; AVX1-NEXT:    shldq $63, %r15, %r8
-; AVX1-NEXT:    shldq $63, %rbp, %r10
-; AVX1-NEXT:    shldq $63, %rcx, %r11
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %rdx, %rbx
+; AVX1-NEXT:    addq $-1, %rdi
+; AVX1-NEXT:    movl $0, %ecx
+; AVX1-NEXT:    adcq $-1, %rcx
+; AVX1-NEXT:    shldq $63, %rdi, %rcx
+; AVX1-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    shldq $63, %r10, %r8
+; AVX1-NEXT:    shldq $63, %r12, %r9
+; AVX1-NEXT:    shldq $63, %rax, %rbx
+; AVX1-NEXT:    shldq $63, %rdx, %r11
 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
 ; AVX1-NEXT:    shldq $63, %rdx, %r14
 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
 ; AVX1-NEXT:    shldq $63, %rdx, %r13
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %rdx, %r12
 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; AVX1-NEXT:    shldq $63, %rax, %rsi
 ; AVX1-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX1-NEXT:    shldq $63, %rax, %r15
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX1-NEXT:    shldq $63, %rax, %rbp
 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; AVX1-NEXT:    shldq $63, %rax, %rsi
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %rax, %r15
+; AVX1-NEXT:    shldq $63, %rax, %rcx
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %rcx, %rax
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %rcx, %r9
-; AVX1-NEXT:    movq (%rsp), %rcx # 8-byte Reload
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %rdx, %rcx
+; AVX1-NEXT:    shldq $63, %rax, %rdi
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; AVX1-NEXT:    movq (%rsp), %rax # 8-byte Reload
+; AVX1-NEXT:    shldq $63, %rax, %r12
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX1-NEXT:    shldq $63, %rax, %r10
 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %rbp, %rdx
-; AVX1-NEXT:    vmovq %rdx, %xmm8
-; AVX1-NEXT:    vmovq %rcx, %xmm0
-; AVX1-NEXT:    vmovq %r9, %xmm1
-; AVX1-NEXT:    vmovq %rax, %xmm11
-; AVX1-NEXT:    vmovq %r15, %xmm2
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX1-NEXT:    shldq $63, %rdx, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm8
+; AVX1-NEXT:    vmovq %r10, %xmm0
+; AVX1-NEXT:    vmovq %r12, %xmm1
+; AVX1-NEXT:    vmovq %rdi, %xmm11
+; AVX1-NEXT:    vmovq %rcx, %xmm2
 ; AVX1-NEXT:    vmovq %rsi, %xmm13
-; AVX1-NEXT:    vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 8-byte Folded Reload
-; AVX1-NEXT:    # xmm14 = mem[0],zero
-; AVX1-NEXT:    vmovq %r12, %xmm15
-; AVX1-NEXT:    vmovq %r13, %xmm9
-; AVX1-NEXT:    vmovq %r14, %xmm10
-; AVX1-NEXT:    vmovq %rbx, %xmm12
+; AVX1-NEXT:    vmovq %rbp, %xmm14
+; AVX1-NEXT:    vmovq %r15, %xmm15
+; AVX1-NEXT:    vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 8-byte Folded Reload
+; AVX1-NEXT:    # xmm9 = mem[0],zero
+; AVX1-NEXT:    vmovq %r13, %xmm10
+; AVX1-NEXT:    vmovq %r14, %xmm12
 ; AVX1-NEXT:    vmovq %r11, %xmm3
-; AVX1-NEXT:    vmovq %r10, %xmm4
-; AVX1-NEXT:    vmovq %r8, %xmm5
-; AVX1-NEXT:    vmovq %rdi, %xmm6
+; AVX1-NEXT:    vmovq %rbx, %xmm4
+; AVX1-NEXT:    vmovq %r9, %xmm5
+; AVX1-NEXT:    vmovq %r8, %xmm6
 ; AVX1-NEXT:    vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 8-byte Folded Reload
 ; AVX1-NEXT:    # xmm7 = mem[0],zero
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm8 = xmm0[0],xmm8[0]
@@ -2379,7 +2382,7 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX1-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
-; AVX1-NEXT:    addq $16, %rsp
+; AVX1-NEXT:    addq $24, %rsp
 ; AVX1-NEXT:    popq %rbx
 ; AVX1-NEXT:    popq %r12
 ; AVX1-NEXT:    popq %r13
@@ -2404,15 +2407,15 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
 ; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
 ; AVX2-NEXT:    vpextrq $1, %xmm4, %rbx
-; AVX2-NEXT:    vmovq %xmm4, %rdx
+; AVX2-NEXT:    vmovq %xmm4, %rbp
 ; AVX2-NEXT:    vpextrq $1, %xmm3, %rdi
 ; AVX2-NEXT:    vmovq %xmm3, %rcx
 ; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
 ; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX2-NEXT:    vpextrq $1, %xmm3, %r9
-; AVX2-NEXT:    vmovq %xmm3, %r10
-; AVX2-NEXT:    vpextrq $1, %xmm2, %r13
+; AVX2-NEXT:    vpextrq $1, %xmm3, %rdx
+; AVX2-NEXT:    vmovq %xmm3, %r9
+; AVX2-NEXT:    vpextrq $1, %xmm2, %r11
 ; AVX2-NEXT:    vmovq %xmm2, %r12
 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
@@ -2430,26 +2433,26 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
 ; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
-; AVX2-NEXT:    vpextrq $1, %xmm4, %rbp
-; AVX2-NEXT:    addq %rbx, %rbp
-; AVX2-NEXT:    vmovq %xmm4, %rax
-; AVX2-NEXT:    addq %rdx, %rax
-; AVX2-NEXT:    movq %rax, %r11
-; AVX2-NEXT:    vpextrq $1, %xmm3, %r8
-; AVX2-NEXT:    addq %rdi, %r8
+; AVX2-NEXT:    vpextrq $1, %xmm4, %rax
+; AVX2-NEXT:    addq %rbx, %rax
+; AVX2-NEXT:    movq %rax, %rbx
+; AVX2-NEXT:    vmovq %xmm4, %r13
+; AVX2-NEXT:    addq %rbp, %r13
+; AVX2-NEXT:    vpextrq $1, %xmm3, %r10
+; AVX2-NEXT:    addq %rdi, %r10
 ; AVX2-NEXT:    vmovq %xmm3, %r14
 ; AVX2-NEXT:    addq %rcx, %r14
 ; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
 ; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX2-NEXT:    vpextrq $1, %xmm3, %rax
-; AVX2-NEXT:    addq %r9, %rax
-; AVX2-NEXT:    movq %rax, %rbx
-; AVX2-NEXT:    vmovq %xmm3, %rax
-; AVX2-NEXT:    addq %r10, %rax
-; AVX2-NEXT:    movq %rax, %r10
-; AVX2-NEXT:    vpextrq $1, %xmm2, %rcx
-; AVX2-NEXT:    addq %r13, %rcx
+; AVX2-NEXT:    addq %rdx, %rax
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    vmovq %xmm3, %r8
+; AVX2-NEXT:    addq %r9, %r8
+; AVX2-NEXT:    vpextrq $1, %xmm2, %rax
+; AVX2-NEXT:    addq %r11, %rax
+; AVX2-NEXT:    movq %rax, %r11
 ; AVX2-NEXT:    vmovq %xmm2, %rax
 ; AVX2-NEXT:    addq %r12, %rax
 ; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -2471,8 +2474,8 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT:    vpextrq $1, %xmm2, %r12
-; AVX2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; AVX2-NEXT:    vpextrq $1, %xmm2, %rbp
+; AVX2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
 ; AVX2-NEXT:    vmovq %xmm2, %r9
 ; AVX2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
 ; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
@@ -2481,36 +2484,36 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX2-NEXT:    vmovq %xmm1, %rdx
 ; AVX2-NEXT:    vmovq %xmm0, %rsi
 ; AVX2-NEXT:    addq %rdx, %rsi
-; AVX2-NEXT:    addq $-1, %rbp
-; AVX2-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    addq $-1, %rbx
+; AVX2-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX2-NEXT:    movl $0, %eax
 ; AVX2-NEXT:    adcq $-1, %rax
 ; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    addq $-1, %r11
-; AVX2-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    addq $-1, %r13
+; AVX2-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX2-NEXT:    movl $0, %eax
 ; AVX2-NEXT:    adcq $-1, %rax
 ; AVX2-NEXT:    movq %rax, (%rsp) # 8-byte Spill
-; AVX2-NEXT:    addq $-1, %r8
-; AVX2-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    addq $-1, %r10
+; AVX2-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX2-NEXT:    movl $0, %eax
 ; AVX2-NEXT:    adcq $-1, %rax
 ; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX2-NEXT:    addq $-1, %r14
 ; AVX2-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movl $0, %ebp
-; AVX2-NEXT:    adcq $-1, %rbp
-; AVX2-NEXT:    addq $-1, %rbx
-; AVX2-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movl $0, %r13d
+; AVX2-NEXT:    adcq $-1, %r13
+; AVX2-NEXT:    addq $-1, %rcx
+; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX2-NEXT:    movl $0, %eax
 ; AVX2-NEXT:    adcq $-1, %rax
 ; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    addq $-1, %r10
-; AVX2-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    addq $-1, %r8
+; AVX2-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX2-NEXT:    movl $0, %r15d
 ; AVX2-NEXT:    adcq $-1, %r15
-; AVX2-NEXT:    addq $-1, %rcx
-; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    addq $-1, %r11
+; AVX2-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX2-NEXT:    movl $0, %ebx
 ; AVX2-NEXT:    adcq $-1, %rbx
 ; AVX2-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
@@ -2525,13 +2528,13 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX2-NEXT:    adcq $-1, %rax
 ; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX2-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT:    movl $0, %r13d
-; AVX2-NEXT:    adcq $-1, %r13
+; AVX2-NEXT:    movl $0, %r12d
+; AVX2-NEXT:    adcq $-1, %r12
 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; AVX2-NEXT:    addq $-1, %rcx
 ; AVX2-NEXT:    movl $0, %r11d
 ; AVX2-NEXT:    adcq $-1, %r11
-; AVX2-NEXT:    addq $-1, %r12
+; AVX2-NEXT:    addq $-1, %rbp
 ; AVX2-NEXT:    movl $0, %r14d
 ; AVX2-NEXT:    adcq $-1, %r14
 ; AVX2-NEXT:    addq $-1, %r9
@@ -2547,10 +2550,10 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX2-NEXT:    shldq $63, %rdi, %rdx
 ; AVX2-NEXT:    shldq $63, %r9, %r10
-; AVX2-NEXT:    shldq $63, %r12, %r14
+; AVX2-NEXT:    shldq $63, %rbp, %r14
 ; AVX2-NEXT:    shldq $63, %rcx, %r11
 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT:    shldq $63, %rcx, %r13
+; AVX2-NEXT:    shldq $63, %rcx, %r12
 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; AVX2-NEXT:    shldq $63, %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
@@ -2566,10 +2569,10 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; AVX2-NEXT:    shldq $63, %rcx, %rax
 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT:    shldq $63, %rcx, %rbp
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; AVX2-NEXT:    shldq $63, %rcx, %r13
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT:    shldq $63, %rcx, %r12
+; AVX2-NEXT:    shldq $63, %rcx, %rbp
 ; AVX2-NEXT:    movq (%rsp), %rdi # 8-byte Reload
 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; AVX2-NEXT:    shldq $63, %rcx, %rdi
@@ -2578,8 +2581,8 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX2-NEXT:    shldq $63, %rcx, %rsi
 ; AVX2-NEXT:    vmovq %rsi, %xmm8
 ; AVX2-NEXT:    vmovq %rdi, %xmm9
-; AVX2-NEXT:    vmovq %r12, %xmm10
-; AVX2-NEXT:    vmovq %rbp, %xmm11
+; AVX2-NEXT:    vmovq %rbp, %xmm10
+; AVX2-NEXT:    vmovq %r13, %xmm11
 ; AVX2-NEXT:    vmovq %rax, %xmm12
 ; AVX2-NEXT:    vmovq %r15, %xmm13
 ; AVX2-NEXT:    vmovq %rbx, %xmm14
@@ -2587,7 +2590,7 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX2-NEXT:    vmovq %r9, %xmm0
 ; AVX2-NEXT:    vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Folded Reload
 ; AVX2-NEXT:    # xmm1 = mem[0],zero
-; AVX2-NEXT:    vmovq %r13, %xmm2
+; AVX2-NEXT:    vmovq %r12, %xmm2
 ; AVX2-NEXT:    vmovq %r11, %xmm3
 ; AVX2-NEXT:    vmovq %r14, %xmm4
 ; AVX2-NEXT:    vmovq %r10, %xmm5
@@ -2644,7 +2647,7 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX512-NEXT:    pushq %r13
 ; AVX512-NEXT:    pushq %r12
 ; AVX512-NEXT:    pushq %rbx
-; AVX512-NEXT:    subq $16, %rsp
+; AVX512-NEXT:    subq $24, %rsp
 ; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
 ; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
 ; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
@@ -2657,8 +2660,8 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX512-NEXT:    vmovq %xmm3, %rsi
 ; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
 ; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512-NEXT:    vpextrq $1, %xmm3, %rcx
-; AVX512-NEXT:    vmovq %xmm3, %r10
+; AVX512-NEXT:    vpextrq $1, %xmm3, %rdx
+; AVX512-NEXT:    vmovq %xmm3, %r8
 ; AVX512-NEXT:    vpextrq $1, %xmm2, %r13
 ; AVX512-NEXT:    vmovq %xmm2, %r12
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
@@ -2666,7 +2669,7 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
 ; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
 ; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512-NEXT:    vpextrq $1, %xmm3, %rdx
+; AVX512-NEXT:    vpextrq $1, %xmm3, %r15
 ; AVX512-NEXT:    vmovq %xmm3, %r14
 ; AVX512-NEXT:    vpextrq $1, %xmm2, %r9
 ; AVX512-NEXT:    vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
@@ -2678,34 +2681,35 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
 ; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm4
-; AVX512-NEXT:    vpextrq $1, %xmm4, %r11
-; AVX512-NEXT:    addq %rbx, %r11
+; AVX512-NEXT:    vpextrq $1, %xmm4, %rax
+; AVX512-NEXT:    addq %rbx, %rax
+; AVX512-NEXT:    movq %rax, %rbx
 ; AVX512-NEXT:    vmovq %xmm4, %rax
 ; AVX512-NEXT:    addq %rbp, %rax
-; AVX512-NEXT:    movq %rax, %rbx
-; AVX512-NEXT:    vpextrq $1, %xmm3, %r8
-; AVX512-NEXT:    addq %rdi, %r8
-; AVX512-NEXT:    vmovq %xmm3, %r15
-; AVX512-NEXT:    addq %rsi, %r15
+; AVX512-NEXT:    movq %rax, %rbp
+; AVX512-NEXT:    vpextrq $1, %xmm3, %rax
+; AVX512-NEXT:    addq %rdi, %rax
+; AVX512-NEXT:    movq %rax, %rdi
+; AVX512-NEXT:    vmovq %xmm3, %r10
+; AVX512-NEXT:    addq %rsi, %r10
 ; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
 ; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512-NEXT:    vpextrq $1, %xmm3, %rdi
-; AVX512-NEXT:    addq %rcx, %rdi
+; AVX512-NEXT:    vpextrq $1, %xmm3, %rcx
+; AVX512-NEXT:    addq %rdx, %rcx
 ; AVX512-NEXT:    vmovq %xmm3, %rax
-; AVX512-NEXT:    addq %r10, %rax
-; AVX512-NEXT:    movq %rax, %r10
+; AVX512-NEXT:    addq %r8, %rax
+; AVX512-NEXT:    movq %rax, %r8
 ; AVX512-NEXT:    vpextrq $1, %xmm2, %rsi
 ; AVX512-NEXT:    addq %r13, %rsi
-; AVX512-NEXT:    vmovq %xmm2, %rax
-; AVX512-NEXT:    addq %r12, %rax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    vmovq %xmm2, %r11
+; AVX512-NEXT:    addq %r12, %r11
 ; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm1
 ; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
 ; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX512-NEXT:    vpextrq $1, %xmm3, %rax
-; AVX512-NEXT:    addq %rdx, %rax
+; AVX512-NEXT:    addq %r15, %rax
 ; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX512-NEXT:    vmovq %xmm3, %rax
 ; AVX512-NEXT:    addq %r14, %rax
@@ -2718,33 +2722,24 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
 ; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512-NEXT:    vpextrq $1, %xmm2, %rbp
-; AVX512-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; AVX512-NEXT:    vpextrq $1, %xmm2, %rax
+; AVX512-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX512-NEXT:    vmovq %xmm2, %r14
 ; AVX512-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
 ; AVX512-NEXT:    vpextrq $1, %xmm0, %rax
 ; AVX512-NEXT:    vpextrq $1, %xmm1, %r9
 ; AVX512-NEXT:    addq %rax, %r9
-; AVX512-NEXT:    vmovq %xmm0, %rcx
+; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vmovq %xmm1, %rdx
-; AVX512-NEXT:    addq %rcx, %rdx
-; AVX512-NEXT:    addq $-1, %r11
-; AVX512-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movl $0, %eax
-; AVX512-NEXT:    adcq $-1, %rax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    addq %rax, %rdx
 ; AVX512-NEXT:    addq $-1, %rbx
 ; AVX512-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX512-NEXT:    movl $0, %eax
 ; AVX512-NEXT:    adcq $-1, %rax
-; AVX512-NEXT:    movq %rax, (%rsp) # 8-byte Spill
-; AVX512-NEXT:    addq $-1, %r8
-; AVX512-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movl $0, %eax
-; AVX512-NEXT:    adcq $-1, %rax
 ; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    addq $-1, %r15
-; AVX512-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    addq $-1, %rbp
+; AVX512-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX512-NEXT:    movl $0, %eax
 ; AVX512-NEXT:    adcq $-1, %rax
 ; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -2752,94 +2747,108 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX512-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX512-NEXT:    movl $0, %eax
 ; AVX512-NEXT:    adcq $-1, %rax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq %rax, (%rsp) # 8-byte Spill
 ; AVX512-NEXT:    addq $-1, %r10
 ; AVX512-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX512-NEXT:    movl $0, %eax
 ; AVX512-NEXT:    adcq $-1, %rax
-; AVX512-NEXT:    movq %rax, %rcx
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    addq $-1, %rcx
+; AVX512-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movl $0, %eax
+; AVX512-NEXT:    adcq $-1, %rax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    addq $-1, %r8
+; AVX512-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movl $0, %eax
+; AVX512-NEXT:    adcq $-1, %rax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX512-NEXT:    addq $-1, %rsi
 ; AVX512-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movl $0, %r12d
-; AVX512-NEXT:    adcq $-1, %r12
-; AVX512-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT:    movl $0, %ebx
-; AVX512-NEXT:    adcq $-1, %rbx
-; AVX512-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 ; AVX512-NEXT:    movl $0, %r13d
 ; AVX512-NEXT:    adcq $-1, %r13
-; AVX512-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT:    addq $-1, %r11
+; AVX512-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX512-NEXT:    movl $0, %r15d
 ; AVX512-NEXT:    adcq $-1, %r15
 ; AVX512-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT:    movl $0, %r11d
-; AVX512-NEXT:    adcq $-1, %r11
+; AVX512-NEXT:    movl $0, %eax
+; AVX512-NEXT:    adcq $-1, %rax
+; AVX512-NEXT:    movq %rax, %rsi
 ; AVX512-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT:    movl $0, %r8d
-; AVX512-NEXT:    adcq $-1, %r8
+; AVX512-NEXT:    movl $0, %r12d
+; AVX512-NEXT:    adcq $-1, %r12
+; AVX512-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT:    movl $0, %ebx
+; AVX512-NEXT:    adcq $-1, %rbx
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
 ; AVX512-NEXT:    addq $-1, %rbp
+; AVX512-NEXT:    movl $0, %r11d
+; AVX512-NEXT:    adcq $-1, %r11
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT:    addq $-1, %rax
 ; AVX512-NEXT:    movl $0, %r10d
 ; AVX512-NEXT:    adcq $-1, %r10
 ; AVX512-NEXT:    addq $-1, %r14
+; AVX512-NEXT:    movl $0, %r8d
+; AVX512-NEXT:    adcq $-1, %r8
+; AVX512-NEXT:    addq $-1, %r9
 ; AVX512-NEXT:    movl $0, %edi
 ; AVX512-NEXT:    adcq $-1, %rdi
-; AVX512-NEXT:    addq $-1, %r9
-; AVX512-NEXT:    movl $0, %esi
-; AVX512-NEXT:    adcq $-1, %rsi
 ; AVX512-NEXT:    addq $-1, %rdx
-; AVX512-NEXT:    movl $0, %eax
-; AVX512-NEXT:    adcq $-1, %rax
-; AVX512-NEXT:    shldq $63, %rdx, %rax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    shldq $63, %r9, %rsi
-; AVX512-NEXT:    shldq $63, %r14, %rdi
-; AVX512-NEXT:    shldq $63, %rbp, %r10
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX512-NEXT:    shldq $63, %rdx, %r8
+; AVX512-NEXT:    movl $0, %ecx
+; AVX512-NEXT:    adcq $-1, %rcx
+; AVX512-NEXT:    shldq $63, %rdx, %rcx
+; AVX512-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    shldq $63, %r9, %rdi
+; AVX512-NEXT:    shldq $63, %r14, %r8
+; AVX512-NEXT:    shldq $63, %rax, %r10
+; AVX512-NEXT:    shldq $63, %rbp, %r11
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX512-NEXT:    shldq $63, %rdx, %r11
+; AVX512-NEXT:    shldq $63, %rdx, %rbx
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX512-NEXT:    shldq $63, %rdx, %r15
+; AVX512-NEXT:    shldq $63, %rdx, %r12
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX512-NEXT:    shldq $63, %rdx, %r13
+; AVX512-NEXT:    shldq $63, %rdx, %rsi
+; AVX512-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT:    shldq $63, %rax, %rbx
+; AVX512-NEXT:    shldq $63, %rax, %r15
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT:    shldq $63, %rax, %r12
+; AVX512-NEXT:    shldq $63, %rax, %r13
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT:    shldq $63, %rax, %rcx
-; AVX512-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    shldq $63, %rax, %rsi
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; AVX512-NEXT:    shldq $63, %rax, %rcx
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
 ; AVX512-NEXT:    shldq $63, %rdx, %rax
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX512-NEXT:    movq (%rsp), %r14 # 8-byte Reload
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
 ; AVX512-NEXT:    shldq $63, %rdx, %r14
-; AVX512-NEXT:    movq (%rsp), %r9 # 8-byte Reload
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
 ; AVX512-NEXT:    shldq $63, %rdx, %r9
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; AVX512-NEXT:    shldq $63, %rbp, %rdx
-; AVX512-NEXT:    vmovq %rdx, %xmm8
+; AVX512-NEXT:    shldq $63, %rdx, %rbp
+; AVX512-NEXT:    vmovq %rbp, %xmm8
 ; AVX512-NEXT:    vmovq %r9, %xmm9
 ; AVX512-NEXT:    vmovq %r14, %xmm10
 ; AVX512-NEXT:    vmovq %rax, %xmm11
 ; AVX512-NEXT:    vmovq %rcx, %xmm12
-; AVX512-NEXT:    vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 8-byte Folded Reload
-; AVX512-NEXT:    # xmm13 = mem[0],zero
-; AVX512-NEXT:    vmovq %r12, %xmm14
-; AVX512-NEXT:    vmovq %rbx, %xmm15
-; AVX512-NEXT:    vmovq %r13, %xmm0
-; AVX512-NEXT:    vmovq %r15, %xmm1
-; AVX512-NEXT:    vmovq %r11, %xmm2
-; AVX512-NEXT:    vmovq %r8, %xmm3
+; AVX512-NEXT:    vmovq %rsi, %xmm13
+; AVX512-NEXT:    vmovq %r13, %xmm14
+; AVX512-NEXT:    vmovq %r15, %xmm15
+; AVX512-NEXT:    vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
+; AVX512-NEXT:    # xmm0 = mem[0],zero
+; AVX512-NEXT:    vmovq %r12, %xmm1
+; AVX512-NEXT:    vmovq %rbx, %xmm2
+; AVX512-NEXT:    vmovq %r11, %xmm3
 ; AVX512-NEXT:    vmovq %r10, %xmm4
-; AVX512-NEXT:    vmovq %rdi, %xmm5
-; AVX512-NEXT:    vmovq %rsi, %xmm6
+; AVX512-NEXT:    vmovq %r8, %xmm5
+; AVX512-NEXT:    vmovq %rdi, %xmm6
 ; AVX512-NEXT:    vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 8-byte Folded Reload
 ; AVX512-NEXT:    # xmm7 = mem[0],zero
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm8 = xmm9[0],xmm8[0]
@@ -2860,7 +2869,7 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
 ; AVX512-NEXT:    vpmovdb %zmm0, (%rax)
-; AVX512-NEXT:    addq $16, %rsp
+; AVX512-NEXT:    addq $24, %rsp
 ; AVX512-NEXT:    popq %rbx
 ; AVX512-NEXT:    popq %r12
 ; AVX512-NEXT:    popq %r13
diff --git a/test/CodeGen/X86/crash-O0.ll b/test/CodeGen/X86/crash-O0.ll
index deaf19daccc..1a234d45cb2 100644
--- a/test/CodeGen/X86/crash-O0.ll
+++ b/test/CodeGen/X86/crash-O0.ll
@@ -77,11 +77,11 @@ define i64 @addressModeWith32bitIndex(i32 %V) {
 ; CHECK-NEXT:    movq %rsp, %rbp
 ; CHECK-NEXT:    .cfi_def_cfa_register %rbp
 ; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl %eax, %ecx
+; CHECK-NEXT:    movq %rcx, %rax
 ; CHECK-NEXT:    cqto
-; CHECK-NEXT:    movslq %edi, %rcx
-; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Reload
-; CHECK-NEXT:    idivq (%rsi,%rcx,8)
+; CHECK-NEXT:    movslq %edi, %rsi
+; CHECK-NEXT:    idivq (%rcx,%rsi,8)
 ; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    retq
   %gep = getelementptr i64, i64* null, i32 %V
diff --git a/test/CodeGen/X86/hoist-spill.ll b/test/CodeGen/X86/hoist-spill.ll
index 040924a6c28..6a3f5ca01e8 100644
--- a/test/CodeGen/X86/hoist-spill.ll
+++ b/test/CodeGen/X86/hoist-spill.ll
@@ -2,7 +2,9 @@
 
 ; Check no spills to the same stack slot after hoisting.
 ; CHECK: mov{{.}} %{{.*}}, [[SPOFFSET1:-?[0-9]*]](%rsp)
+; CHECK: mov{{.}} %{{.*}}, [[SPOFFSET2:-?[0-9]*]](%rsp)
 ; CHECK-NOT: mov{{.}} %{{.*}}, [[SPOFFSET1]](%rsp)
+; CHECK-NOT: mov{{.}} %{{.*}}, [[SPOFFSET2]](%rsp)
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/CodeGen/X86/machine-cse.ll b/test/CodeGen/X86/machine-cse.ll
index 8ce61be555f..b55b43fafa5 100644
--- a/test/CodeGen/X86/machine-cse.ll
+++ b/test/CodeGen/X86/machine-cse.ll
@@ -133,26 +133,25 @@ return:
 define i8* @bsd_memchr(i8* %s, i32 %a, i32 %c, i64 %n) nounwind ssp {
 ; CHECK-LABEL: bsd_memchr:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    testq %rcx, %rcx
-; CHECK-NEXT:    je .LBB3_5
+; CHECK-NEXT:    je .LBB3_4
 ; CHECK-NEXT:  # %bb.1: # %preheader
+; CHECK-NEXT:    movq %rdi, %rax
 ; CHECK-NEXT:    movzbl %dl, %edx
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB3_2: # %do.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    cmpl %edx, %esi
-; CHECK-NEXT:    je .LBB3_3
-; CHECK-NEXT:  # %bb.4: # %do.cond
+; CHECK-NEXT:    je .LBB3_5
+; CHECK-NEXT:  # %bb.3: # %do.cond
 ; CHECK-NEXT:    # in Loop: Header=BB3_2 Depth=1
-; CHECK-NEXT:    incq %rdi
+; CHECK-NEXT:    incq %rax
 ; CHECK-NEXT:    decq %rcx
 ; CHECK-NEXT:    jne .LBB3_2
+; CHECK-NEXT:  .LBB3_4:
+; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:  .LBB3_5: # %return
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:  .LBB3_3:
-; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    retq
 entry:
   %cmp = icmp eq i64 %n, 0
   br i1 %cmp, label %return, label %preheader
diff --git a/test/CodeGen/X86/madd.ll b/test/CodeGen/X86/madd.ll
index 92e5424253f..4cb6daeec1a 100644
--- a/test/CodeGen/X86/madd.ll
+++ b/test/CodeGen/X86/madd.ll
@@ -356,7 +356,7 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea
 ; SSE2-NEXT:    xorl %ecx, %ecx
 ; SSE2-NEXT:    pxor %xmm2, %xmm2
 ; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
 ; SSE2-NEXT:    pxor %xmm3, %xmm3
 ; SSE2-NEXT:    .p2align 4, 0x90
 ; SSE2-NEXT:  .LBB3_1: # %vector.body
@@ -365,18 +365,18 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea
 ; SSE2-NEXT:    movdqu 16(%rdi,%rcx,2), %xmm6
 ; SSE2-NEXT:    movdqu 32(%rdi,%rcx,2), %xmm7
 ; SSE2-NEXT:    movdqu 48(%rdi,%rcx,2), %xmm9
-; SSE2-NEXT:    movdqu (%rsi,%rcx,2), %xmm1
-; SSE2-NEXT:    pmaddwd %xmm5, %xmm1
-; SSE2-NEXT:    paddd %xmm1, %xmm2
-; SSE2-NEXT:    movdqu 16(%rsi,%rcx,2), %xmm1
-; SSE2-NEXT:    pmaddwd %xmm6, %xmm1
-; SSE2-NEXT:    paddd %xmm1, %xmm4
-; SSE2-NEXT:    movdqu 32(%rsi,%rcx,2), %xmm1
-; SSE2-NEXT:    pmaddwd %xmm7, %xmm1
-; SSE2-NEXT:    paddd %xmm1, %xmm0
-; SSE2-NEXT:    movdqu 48(%rsi,%rcx,2), %xmm1
-; SSE2-NEXT:    pmaddwd %xmm9, %xmm1
-; SSE2-NEXT:    paddd %xmm1, %xmm3
+; SSE2-NEXT:    movdqu (%rsi,%rcx,2), %xmm0
+; SSE2-NEXT:    pmaddwd %xmm5, %xmm0
+; SSE2-NEXT:    paddd %xmm0, %xmm2
+; SSE2-NEXT:    movdqu 16(%rsi,%rcx,2), %xmm0
+; SSE2-NEXT:    pmaddwd %xmm6, %xmm0
+; SSE2-NEXT:    paddd %xmm0, %xmm4
+; SSE2-NEXT:    movdqu 32(%rsi,%rcx,2), %xmm0
+; SSE2-NEXT:    pmaddwd %xmm7, %xmm0
+; SSE2-NEXT:    paddd %xmm0, %xmm1
+; SSE2-NEXT:    movdqu 48(%rsi,%rcx,2), %xmm0
+; SSE2-NEXT:    pmaddwd %xmm9, %xmm0
+; SSE2-NEXT:    paddd %xmm0, %xmm3
 ; SSE2-NEXT:    addq $16, %rcx
 ; SSE2-NEXT:    cmpq %rcx, %rax
 ; SSE2-NEXT:    jne .LBB3_1
@@ -385,14 +385,14 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea
 ; SSE2-NEXT:    paddd %xmm8, %xmm3
 ; SSE2-NEXT:    paddd %xmm4, %xmm3
 ; SSE2-NEXT:    paddd %xmm8, %xmm2
-; SSE2-NEXT:    paddd %xmm8, %xmm0
-; SSE2-NEXT:    paddd %xmm3, %xmm0
-; SSE2-NEXT:    paddd %xmm2, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    paddd %xmm8, %xmm1
+; SSE2-NEXT:    paddd %xmm3, %xmm1
+; SSE2-NEXT:    paddd %xmm2, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
 ; SSE2-NEXT:    paddd %xmm1, %xmm0
-; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT:    paddd %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
 ; SSE2-NEXT:    retq
 ;
 ; AVX1-LABEL: _Z10test_shortPsS_i_1024:
@@ -949,7 +949,7 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl
 ; SSE2-NEXT:    xorl %ecx, %ecx
 ; SSE2-NEXT:    pxor %xmm9, %xmm9
 ; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
 ; SSE2-NEXT:    pxor %xmm3, %xmm3
 ; SSE2-NEXT:    .p2align 4, 0x90
 ; SSE2-NEXT:  .LBB7_1: # %vector.body
@@ -963,9 +963,9 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl
 ; SSE2-NEXT:    movq {{.*#+}} xmm7 = mem[0],zero
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    psraw $8, %xmm7
-; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    psraw $8, %xmm1
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psraw $8, %xmm0
 ; SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    psraw $8, %xmm2
@@ -980,11 +980,11 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    psraw $8, %xmm2
 ; SSE2-NEXT:    pmaddwd %xmm7, %xmm2
-; SSE2-NEXT:    paddd %xmm2, %xmm0
+; SSE2-NEXT:    paddd %xmm2, %xmm1
 ; SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    psraw $8, %xmm2
-; SSE2-NEXT:    pmaddwd %xmm1, %xmm2
+; SSE2-NEXT:    pmaddwd %xmm0, %xmm2
 ; SSE2-NEXT:    paddd %xmm2, %xmm3
 ; SSE2-NEXT:    addq $32, %rcx
 ; SSE2-NEXT:    cmpq %rcx, %rax
@@ -994,14 +994,14 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl
 ; SSE2-NEXT:    paddd %xmm8, %xmm3
 ; SSE2-NEXT:    paddd %xmm4, %xmm3
 ; SSE2-NEXT:    paddd %xmm8, %xmm9
-; SSE2-NEXT:    paddd %xmm8, %xmm0
-; SSE2-NEXT:    paddd %xmm3, %xmm0
-; SSE2-NEXT:    paddd %xmm9, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    paddd %xmm8, %xmm1
+; SSE2-NEXT:    paddd %xmm3, %xmm1
+; SSE2-NEXT:    paddd %xmm9, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
 ; SSE2-NEXT:    paddd %xmm1, %xmm0
-; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT:    paddd %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
 ; SSE2-NEXT:    retq
 ;
 ; AVX1-LABEL: _Z9test_charPcS_i_1024:
diff --git a/test/CodeGen/X86/mmx-arith.ll b/test/CodeGen/X86/mmx-arith.ll
index 4362a193014..2d24cb8df35 100644
--- a/test/CodeGen/X86/mmx-arith.ll
+++ b/test/CodeGen/X86/mmx-arith.ll
@@ -604,13 +604,12 @@ define <1 x i64> @test3(<1 x i64>* %a, <1 x i64>* %b, i32 %count) nounwind {
 ;
 ; X64-LABEL: test3:
 ; X64:       # %bb.0: # %entry
+; X64-NEXT:    xorl %r8d, %r8d
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    testl %edx, %edx
-; X64-NEXT:    je .LBB3_3
-; X64-NEXT:  # %bb.1: # %bb26.preheader
-; X64-NEXT:    xorl %r8d, %r8d
+; X64-NEXT:    je .LBB3_2
 ; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB3_2: # %bb26
+; X64-NEXT:  .LBB3_1: # %bb26
 ; X64-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X64-NEXT:    movslq %r8d, %r8
 ; X64-NEXT:    movq (%rdi,%r8,8), %rcx
@@ -618,8 +617,8 @@ define <1 x i64> @test3(<1 x i64>* %a, <1 x i64>* %b, i32 %count) nounwind {
 ; X64-NEXT:    addq %rcx, %rax
 ; X64-NEXT:    incl %r8d
 ; X64-NEXT:    cmpl %edx, %r8d
-; X64-NEXT:    jb .LBB3_2
-; X64-NEXT:  .LBB3_3: # %bb31
+; X64-NEXT:    jb .LBB3_1
+; X64-NEXT:  .LBB3_2: # %bb31
 ; X64-NEXT:    retq
 entry:
   %tmp2942 = icmp eq i32 %count, 0
diff --git a/test/CodeGen/X86/pr32284.ll b/test/CodeGen/X86/pr32284.ll
index 878c1c5af61..ab6680cf45a 100644
--- a/test/CodeGen/X86/pr32284.ll
+++ b/test/CodeGen/X86/pr32284.ll
@@ -10,12 +10,13 @@ define void @foo() {
 ; X86-O0-LABEL: foo:
 ; X86-O0:       # %bb.0: # %entry
 ; X86-O0-NEXT:    xorl %eax, %eax
-; X86-O0-NEXT:    xorl %ecx, %ecx
+; X86-O0-NEXT:    movl %eax, %ecx
+; X86-O0-NEXT:    xorl %eax, %eax
 ; X86-O0-NEXT:    movzbl c, %edx
-; X86-O0-NEXT:    subl %edx, %ecx
-; X86-O0-NEXT:    movslq %ecx, %rsi
-; X86-O0-NEXT:    subq %rsi, %rax
-; X86-O0-NEXT:    movb %al, %dil
+; X86-O0-NEXT:    subl %edx, %eax
+; X86-O0-NEXT:    movslq %eax, %rsi
+; X86-O0-NEXT:    subq %rsi, %rcx
+; X86-O0-NEXT:    movb %cl, %dil
 ; X86-O0-NEXT:    cmpb $0, %dil
 ; X86-O0-NEXT:    setne %dil
 ; X86-O0-NEXT:    andb $1, %dil
@@ -25,13 +26,13 @@ define void @foo() {
 ; X86-O0-NEXT:    xorb $-1, %dil
 ; X86-O0-NEXT:    xorb $-1, %dil
 ; X86-O0-NEXT:    andb $1, %dil
-; X86-O0-NEXT:    movzbl %dil, %ecx
+; X86-O0-NEXT:    movzbl %dil, %eax
 ; X86-O0-NEXT:    movzbl c, %edx
-; X86-O0-NEXT:    cmpl %edx, %ecx
+; X86-O0-NEXT:    cmpl %edx, %eax
 ; X86-O0-NEXT:    setle %dil
 ; X86-O0-NEXT:    andb $1, %dil
-; X86-O0-NEXT:    movzbl %dil, %ecx
-; X86-O0-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp)
+; X86-O0-NEXT:    movzbl %dil, %eax
+; X86-O0-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
 ; X86-O0-NEXT:    retq
 ;
 ; X64-LABEL: foo:
diff --git a/test/CodeGen/X86/pr32340.ll b/test/CodeGen/X86/pr32340.ll
index 559bd8d6b5a..b530bb18c93 100644
--- a/test/CodeGen/X86/pr32340.ll
+++ b/test/CodeGen/X86/pr32340.ll
@@ -14,21 +14,22 @@ define void @foo() {
 ; X64-LABEL: foo:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    movl %eax, %ecx
 ; X64-NEXT:    movw $0, var_825
-; X64-NEXT:    movzwl var_32, %ecx
+; X64-NEXT:    movzwl var_32, %eax
 ; X64-NEXT:    movzwl var_901, %edx
-; X64-NEXT:    movl %ecx, %esi
+; X64-NEXT:    movl %eax, %esi
 ; X64-NEXT:    xorl %edx, %esi
-; X64-NEXT:    movl %ecx, %edx
+; X64-NEXT:    movl %eax, %edx
 ; X64-NEXT:    xorl %esi, %edx
-; X64-NEXT:    addl %ecx, %edx
+; X64-NEXT:    addl %eax, %edx
 ; X64-NEXT:    movslq %edx, %rdi
 ; X64-NEXT:    movq %rdi, var_826
-; X64-NEXT:    movzwl var_32, %ecx
-; X64-NEXT:    movl %ecx, %edi
-; X64-NEXT:    movzwl var_901, %ecx
-; X64-NEXT:    xorl $51981, %ecx # imm = 0xCB0D
-; X64-NEXT:    movslq %ecx, %r8
+; X64-NEXT:    movzwl var_32, %eax
+; X64-NEXT:    movl %eax, %edi
+; X64-NEXT:    movzwl var_901, %eax
+; X64-NEXT:    xorl $51981, %eax # imm = 0xCB0D
+; X64-NEXT:    movslq %eax, %r8
 ; X64-NEXT:    movabsq $-1142377792914660288, %r9 # imm = 0xF02575732E06E440
 ; X64-NEXT:    xorq %r9, %r8
 ; X64-NEXT:    movq %rdi, %r9
@@ -40,11 +41,11 @@ define void @foo() {
 ; X64-NEXT:    orq %r8, %rdi
 ; X64-NEXT:    movw %di, %r10w
 ; X64-NEXT:    movw %r10w, var_900
-; X64-NEXT:    cmpq var_28, %rax
+; X64-NEXT:    cmpq var_28, %rcx
 ; X64-NEXT:    setne %r11b
 ; X64-NEXT:    andb $1, %r11b
-; X64-NEXT:    movzbl %r11b, %ecx
-; X64-NEXT:    movw %cx, %r10w
+; X64-NEXT:    movzbl %r11b, %eax
+; X64-NEXT:    movw %ax, %r10w
 ; X64-NEXT:    movw %r10w, var_827
 ; X64-NEXT:    retq
 entry:
diff --git a/test/CodeGen/X86/scheduler-backtracking.ll b/test/CodeGen/X86/scheduler-backtracking.ll
index e558fed7436..811bd9bd031 100644
--- a/test/CodeGen/X86/scheduler-backtracking.ll
+++ b/test/CodeGen/X86/scheduler-backtracking.ll
@@ -20,18 +20,18 @@ define i256 @test1(i256 %a) nounwind {
 ; ILP-NEXT:    pushq %rbx
 ; ILP-NEXT:    movq %rcx, %r9
 ; ILP-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; ILP-NEXT:    xorl %eax, %eax
 ; ILP-NEXT:    addq $1, %rsi
 ; ILP-NEXT:    adcq $0, %rdx
 ; ILP-NEXT:    adcq $0, %r9
 ; ILP-NEXT:    adcq $0, %r8
 ; ILP-NEXT:    leal 1(%rsi,%rsi), %edi
 ; ILP-NEXT:    movl $1, %ebp
-; ILP-NEXT:    xorl %eax, %eax
-; ILP-NEXT:    xorl %r11d, %r11d
+; ILP-NEXT:    xorl %r14d, %r14d
 ; ILP-NEXT:    movl %edi, %ecx
-; ILP-NEXT:    shldq %cl, %rbp, %r11
-; ILP-NEXT:    movl $1, %r14d
-; ILP-NEXT:    shlq %cl, %r14
+; ILP-NEXT:    shldq %cl, %rbp, %r14
+; ILP-NEXT:    movl $1, %r11d
+; ILP-NEXT:    shlq %cl, %r11
 ; ILP-NEXT:    movb $-128, %r10b
 ; ILP-NEXT:    subb %dil, %r10b
 ; ILP-NEXT:    movq %r9, %r13
@@ -42,33 +42,33 @@ define i256 @test1(i256 %a) nounwind {
 ; ILP-NEXT:    xorl %r15d, %r15d
 ; ILP-NEXT:    movl %edi, %ecx
 ; ILP-NEXT:    shldq %cl, %r15, %r15
-; ILP-NEXT:    movq %rsi, %rbp
-; ILP-NEXT:    shrdq %cl, %rdx, %rbp
+; ILP-NEXT:    movq %rsi, %rbx
+; ILP-NEXT:    shrdq %cl, %rdx, %rbx
 ; ILP-NEXT:    shrq %cl, %rdx
 ; ILP-NEXT:    addb $-128, %cl
 ; ILP-NEXT:    shrdq %cl, %r8, %r9
 ; ILP-NEXT:    testb $64, %dil
-; ILP-NEXT:    cmovneq %r14, %r11
-; ILP-NEXT:    cmoveq %rbp, %rdx
+; ILP-NEXT:    cmovneq %r11, %r14
+; ILP-NEXT:    cmoveq %rbx, %rdx
 ; ILP-NEXT:    cmovneq %rax, %r15
-; ILP-NEXT:    cmovneq %rax, %r14
+; ILP-NEXT:    cmovneq %rax, %r11
 ; ILP-NEXT:    testb $64, %r10b
 ; ILP-NEXT:    cmovneq %rax, %r12
 ; ILP-NEXT:    cmovneq %rax, %r13
-; ILP-NEXT:    movl $1, %ebp
-; ILP-NEXT:    shlq %cl, %rbp
+; ILP-NEXT:    movl $1, %ebx
+; ILP-NEXT:    shlq %cl, %rbx
 ; ILP-NEXT:    orl %edx, %r13d
 ; ILP-NEXT:    xorl %edx, %edx
-; ILP-NEXT:    movl $1, %ebx
-; ILP-NEXT:    shldq %cl, %rbx, %rdx
+; ILP-NEXT:    movl $1, %ebp
+; ILP-NEXT:    shldq %cl, %rbp, %rdx
 ; ILP-NEXT:    shrq %cl, %r8
 ; ILP-NEXT:    testb $64, %cl
 ; ILP-NEXT:    cmoveq %r9, %r8
-; ILP-NEXT:    cmovneq %rbp, %rdx
-; ILP-NEXT:    cmovneq %rax, %rbp
+; ILP-NEXT:    cmovneq %rbx, %rdx
+; ILP-NEXT:    cmovneq %rax, %rbx
 ; ILP-NEXT:    testb %dil, %dil
-; ILP-NEXT:    cmovsq %rax, %r11
 ; ILP-NEXT:    cmovsq %rax, %r14
+; ILP-NEXT:    cmovsq %rax, %r11
 ; ILP-NEXT:    jns .LBB0_2
 ; ILP-NEXT:  # %bb.1:
 ; ILP-NEXT:    movl %r8d, %r13d
@@ -77,20 +77,20 @@ define i256 @test1(i256 %a) nounwind {
 ; ILP-NEXT:  # %bb.3:
 ; ILP-NEXT:    movl %r13d, %esi
 ; ILP-NEXT:  .LBB0_4:
-; ILP-NEXT:    cmovnsq %r12, %rbp
-; ILP-NEXT:    cmoveq %rax, %rbp
+; ILP-NEXT:    cmovnsq %r12, %rbx
+; ILP-NEXT:    cmoveq %rax, %rbx
 ; ILP-NEXT:    cmovnsq %r15, %rdx
 ; ILP-NEXT:    cmoveq %rax, %rdx
 ; ILP-NEXT:    testb $1, %sil
 ; ILP-NEXT:    cmovneq %rax, %rdx
 ; ILP-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; ILP-NEXT:    movq %rdx, 24(%rax)
-; ILP-NEXT:    cmovneq %rax, %rbp
-; ILP-NEXT:    movq %rbp, 16(%rax)
-; ILP-NEXT:    cmovneq %rax, %r11
-; ILP-NEXT:    movq %r11, 8(%rax)
+; ILP-NEXT:    cmovneq %rax, %rbx
+; ILP-NEXT:    movq %rbx, 16(%rax)
 ; ILP-NEXT:    cmovneq %rax, %r14
-; ILP-NEXT:    movq %r14, (%rax)
+; ILP-NEXT:    movq %r14, 8(%rax)
+; ILP-NEXT:    cmovneq %rax, %r11
+; ILP-NEXT:    movq %r11, (%rax)
 ; ILP-NEXT:    popq %rbx
 ; ILP-NEXT:    popq %r12
 ; ILP-NEXT:    popq %r13
@@ -101,6 +101,7 @@ define i256 @test1(i256 %a) nounwind {
 ;
 ; HYBRID-LABEL: test1:
 ; HYBRID:       # %bb.0:
+; HYBRID-NEXT:    pushq %rbp
 ; HYBRID-NEXT:    pushq %r15
 ; HYBRID-NEXT:    pushq %r14
 ; HYBRID-NEXT:    pushq %r13
@@ -112,82 +113,84 @@ define i256 @test1(i256 %a) nounwind {
 ; HYBRID-NEXT:    adcq $0, %rdx
 ; HYBRID-NEXT:    adcq $0, %r9
 ; HYBRID-NEXT:    adcq $0, %r8
+; HYBRID-NEXT:    xorl %r10d, %r10d
 ; HYBRID-NEXT:    leal 1(%rsi,%rsi), %edi
 ; HYBRID-NEXT:    xorl %r14d, %r14d
-; HYBRID-NEXT:    xorl %r15d, %r15d
 ; HYBRID-NEXT:    movl %edi, %ecx
-; HYBRID-NEXT:    shldq %cl, %r15, %r15
+; HYBRID-NEXT:    shldq %cl, %r14, %r14
 ; HYBRID-NEXT:    testb $64, %dil
-; HYBRID-NEXT:    cmovneq %r14, %r15
-; HYBRID-NEXT:    movl $1, %r11d
+; HYBRID-NEXT:    cmovneq %r10, %r14
+; HYBRID-NEXT:    movl $1, %ebp
 ; HYBRID-NEXT:    movl $1, %r12d
 ; HYBRID-NEXT:    shlq %cl, %r12
 ; HYBRID-NEXT:    testb $64, %dil
-; HYBRID-NEXT:    movq %r12, %r10
-; HYBRID-NEXT:    cmovneq %r14, %r10
+; HYBRID-NEXT:    movq %r12, %r11
+; HYBRID-NEXT:    cmovneq %r10, %r11
 ; HYBRID-NEXT:    movq %rsi, %rbx
 ; HYBRID-NEXT:    shrdq %cl, %rdx, %rbx
 ; HYBRID-NEXT:    shrq %cl, %rdx
 ; HYBRID-NEXT:    testb $64, %dil
 ; HYBRID-NEXT:    cmoveq %rbx, %rdx
-; HYBRID-NEXT:    xorl %r13d, %r13d
-; HYBRID-NEXT:    shldq %cl, %r11, %r13
+; HYBRID-NEXT:    xorl %r15d, %r15d
+; HYBRID-NEXT:    shldq %cl, %rbp, %r15
 ; HYBRID-NEXT:    testb $64, %dil
-; HYBRID-NEXT:    cmovneq %r12, %r13
+; HYBRID-NEXT:    cmovneq %r12, %r15
 ; HYBRID-NEXT:    movb $-128, %cl
 ; HYBRID-NEXT:    subb %dil, %cl
-; HYBRID-NEXT:    movq %r9, %rbx
-; HYBRID-NEXT:    shlq %cl, %rbx
+; HYBRID-NEXT:    movq %r9, %r13
+; HYBRID-NEXT:    shlq %cl, %r13
 ; HYBRID-NEXT:    movl $1, %r12d
-; HYBRID-NEXT:    shrdq %cl, %r14, %r12
+; HYBRID-NEXT:    shrdq %cl, %r10, %r12
 ; HYBRID-NEXT:    testb $64, %cl
-; HYBRID-NEXT:    cmovneq %r14, %r12
-; HYBRID-NEXT:    cmovneq %r14, %rbx
-; HYBRID-NEXT:    orl %edx, %ebx
+; HYBRID-NEXT:    cmovneq %r10, %r12
+; HYBRID-NEXT:    cmovneq %r10, %r13
+; HYBRID-NEXT:    orl %edx, %r13d
 ; HYBRID-NEXT:    movl %edi, %ecx
 ; HYBRID-NEXT:    addb $-128, %cl
 ; HYBRID-NEXT:    shrdq %cl, %r8, %r9
 ; HYBRID-NEXT:    shrq %cl, %r8
 ; HYBRID-NEXT:    xorl %edx, %edx
-; HYBRID-NEXT:    shldq %cl, %r11, %rdx
-; HYBRID-NEXT:    shlq %cl, %r11
+; HYBRID-NEXT:    shldq %cl, %rbp, %rdx
+; HYBRID-NEXT:    shlq %cl, %rbp
 ; HYBRID-NEXT:    testb $64, %cl
-; HYBRID-NEXT:    cmovneq %r11, %rdx
+; HYBRID-NEXT:    cmovneq %rbp, %rdx
 ; HYBRID-NEXT:    cmoveq %r9, %r8
-; HYBRID-NEXT:    cmovneq %r14, %r11
+; HYBRID-NEXT:    cmovneq %r10, %rbp
 ; HYBRID-NEXT:    testb %dil, %dil
 ; HYBRID-NEXT:    jns .LBB0_2
 ; HYBRID-NEXT:  # %bb.1:
-; HYBRID-NEXT:    movl %r8d, %ebx
+; HYBRID-NEXT:    movl %r8d, %r13d
 ; HYBRID-NEXT:  .LBB0_2:
 ; HYBRID-NEXT:    je .LBB0_4
 ; HYBRID-NEXT:  # %bb.3:
-; HYBRID-NEXT:    movl %ebx, %esi
+; HYBRID-NEXT:    movl %r13d, %esi
 ; HYBRID-NEXT:  .LBB0_4:
-; HYBRID-NEXT:    cmovsq %r14, %r13
-; HYBRID-NEXT:    cmovnsq %r12, %r11
-; HYBRID-NEXT:    cmoveq %r14, %r11
-; HYBRID-NEXT:    cmovnsq %r15, %rdx
-; HYBRID-NEXT:    cmoveq %r14, %rdx
-; HYBRID-NEXT:    cmovsq %r14, %r10
+; HYBRID-NEXT:    cmovsq %r10, %r15
+; HYBRID-NEXT:    cmovnsq %r12, %rbp
+; HYBRID-NEXT:    cmoveq %r10, %rbp
+; HYBRID-NEXT:    cmovnsq %r14, %rdx
+; HYBRID-NEXT:    cmoveq %r10, %rdx
+; HYBRID-NEXT:    cmovsq %r10, %r11
 ; HYBRID-NEXT:    testb $1, %sil
 ; HYBRID-NEXT:    cmovneq %rax, %rdx
 ; HYBRID-NEXT:    movq %rdx, 24(%rax)
+; HYBRID-NEXT:    cmovneq %rax, %rbp
+; HYBRID-NEXT:    movq %rbp, 16(%rax)
+; HYBRID-NEXT:    cmovneq %rax, %r15
+; HYBRID-NEXT:    movq %r15, 8(%rax)
 ; HYBRID-NEXT:    cmovneq %rax, %r11
-; HYBRID-NEXT:    movq %r11, 16(%rax)
-; HYBRID-NEXT:    cmovneq %rax, %r13
-; HYBRID-NEXT:    movq %r13, 8(%rax)
-; HYBRID-NEXT:    cmovneq %rax, %r10
-; HYBRID-NEXT:    movq %r10, (%rax)
+; HYBRID-NEXT:    movq %r11, (%rax)
 ; HYBRID-NEXT:    popq %rbx
 ; HYBRID-NEXT:    popq %r12
 ; HYBRID-NEXT:    popq %r13
 ; HYBRID-NEXT:    popq %r14
 ; HYBRID-NEXT:    popq %r15
+; HYBRID-NEXT:    popq %rbp
 ; HYBRID-NEXT:    retq
 ;
 ; BURR-LABEL: test1:
 ; BURR:       # %bb.0:
+; BURR-NEXT:    pushq %rbp
 ; BURR-NEXT:    pushq %r15
 ; BURR-NEXT:    pushq %r14
 ; BURR-NEXT:    pushq %r13
@@ -199,78 +202,79 @@ define i256 @test1(i256 %a) nounwind {
 ; BURR-NEXT:    adcq $0, %rdx
 ; BURR-NEXT:    adcq $0, %r9
 ; BURR-NEXT:    adcq $0, %r8
+; BURR-NEXT:    xorl %r10d, %r10d
 ; BURR-NEXT:    leal 1(%rsi,%rsi), %edi
 ; BURR-NEXT:    xorl %r14d, %r14d
-; BURR-NEXT:    xorl %r15d, %r15d
 ; BURR-NEXT:    movl %edi, %ecx
-; BURR-NEXT:    shldq %cl, %r15, %r15
+; BURR-NEXT:    shldq %cl, %r14, %r14
 ; BURR-NEXT:    testb $64, %dil
-; BURR-NEXT:    cmovneq %r14, %r15
-; BURR-NEXT:    movl $1, %r11d
+; BURR-NEXT:    cmovneq %r10, %r14
+; BURR-NEXT:    movl $1, %ebp
 ; BURR-NEXT:    movl $1, %r12d
 ; BURR-NEXT:    shlq %cl, %r12
 ; BURR-NEXT:    testb $64, %dil
-; BURR-NEXT:    movq %r12, %r10
-; BURR-NEXT:    cmovneq %r14, %r10
+; BURR-NEXT:    movq %r12, %r11
+; BURR-NEXT:    cmovneq %r10, %r11
 ; BURR-NEXT:    movq %rsi, %rbx
 ; BURR-NEXT:    shrdq %cl, %rdx, %rbx
 ; BURR-NEXT:    shrq %cl, %rdx
 ; BURR-NEXT:    testb $64, %dil
 ; BURR-NEXT:    cmoveq %rbx, %rdx
-; BURR-NEXT:    xorl %r13d, %r13d
-; BURR-NEXT:    shldq %cl, %r11, %r13
+; BURR-NEXT:    xorl %r15d, %r15d
+; BURR-NEXT:    shldq %cl, %rbp, %r15
 ; BURR-NEXT:    testb $64, %dil
-; BURR-NEXT:    cmovneq %r12, %r13
+; BURR-NEXT:    cmovneq %r12, %r15
 ; BURR-NEXT:    movb $-128, %cl
 ; BURR-NEXT:    subb %dil, %cl
-; BURR-NEXT:    movq %r9, %rbx
-; BURR-NEXT:    shlq %cl, %rbx
+; BURR-NEXT:    movq %r9, %r13
+; BURR-NEXT:    shlq %cl, %r13
 ; BURR-NEXT:    movl $1, %r12d
-; BURR-NEXT:    shrdq %cl, %r14, %r12
+; BURR-NEXT:    shrdq %cl, %r10, %r12
 ; BURR-NEXT:    testb $64, %cl
-; BURR-NEXT:    cmovneq %r14, %r12
-; BURR-NEXT:    cmovneq %r14, %rbx
-; BURR-NEXT:    orl %edx, %ebx
+; BURR-NEXT:    cmovneq %r10, %r12
+; BURR-NEXT:    cmovneq %r10, %r13
+; BURR-NEXT:    orl %edx, %r13d
 ; BURR-NEXT:    movl %edi, %ecx
 ; BURR-NEXT:    addb $-128, %cl
 ; BURR-NEXT:    shrdq %cl, %r8, %r9
 ; BURR-NEXT:    xorl %edx, %edx
-; BURR-NEXT:    shldq %cl, %r11, %rdx
+; BURR-NEXT:    shldq %cl, %rbp, %rdx
 ; BURR-NEXT:    shrq %cl, %r8
-; BURR-NEXT:    shlq %cl, %r11
+; BURR-NEXT:    shlq %cl, %rbp
 ; BURR-NEXT:    testb $64, %cl
-; BURR-NEXT:    cmovneq %r11, %rdx
+; BURR-NEXT:    cmovneq %rbp, %rdx
 ; BURR-NEXT:    cmoveq %r9, %r8
-; BURR-NEXT:    cmovneq %r14, %r11
+; BURR-NEXT:    cmovneq %r10, %rbp
 ; BURR-NEXT:    testb %dil, %dil
 ; BURR-NEXT:    jns .LBB0_2
 ; BURR-NEXT:  # %bb.1:
-; BURR-NEXT:    movl %r8d, %ebx
+; BURR-NEXT:    movl %r8d, %r13d
 ; BURR-NEXT:  .LBB0_2:
 ; BURR-NEXT:    je .LBB0_4
 ; BURR-NEXT:  # %bb.3:
-; BURR-NEXT:    movl %ebx, %esi
+; BURR-NEXT:    movl %r13d, %esi
 ; BURR-NEXT:  .LBB0_4:
-; BURR-NEXT:    cmovsq %r14, %r13
-; BURR-NEXT:    cmovnsq %r12, %r11
-; BURR-NEXT:    cmoveq %r14, %r11
-; BURR-NEXT:    cmovnsq %r15, %rdx
-; BURR-NEXT:    cmoveq %r14, %rdx
-; BURR-NEXT:    cmovsq %r14, %r10
+; BURR-NEXT:    cmovsq %r10, %r15
+; BURR-NEXT:    cmovnsq %r12, %rbp
+; BURR-NEXT:    cmoveq %r10, %rbp
+; BURR-NEXT:    cmovnsq %r14, %rdx
+; BURR-NEXT:    cmoveq %r10, %rdx
+; BURR-NEXT:    cmovsq %r10, %r11
 ; BURR-NEXT:    testb $1, %sil
 ; BURR-NEXT:    cmovneq %rax, %rdx
 ; BURR-NEXT:    movq %rdx, 24(%rax)
+; BURR-NEXT:    cmovneq %rax, %rbp
+; BURR-NEXT:    movq %rbp, 16(%rax)
+; BURR-NEXT:    cmovneq %rax, %r15
+; BURR-NEXT:    movq %r15, 8(%rax)
 ; BURR-NEXT:    cmovneq %rax, %r11
-; BURR-NEXT:    movq %r11, 16(%rax)
-; BURR-NEXT:    cmovneq %rax, %r13
-; BURR-NEXT:    movq %r13, 8(%rax)
-; BURR-NEXT:    cmovneq %rax, %r10
-; BURR-NEXT:    movq %r10, (%rax)
+; BURR-NEXT:    movq %r11, (%rax)
 ; BURR-NEXT:    popq %rbx
 ; BURR-NEXT:    popq %r12
 ; BURR-NEXT:    popq %r13
 ; BURR-NEXT:    popq %r14
 ; BURR-NEXT:    popq %r15
+; BURR-NEXT:    popq %rbp
 ; BURR-NEXT:    retq
 ;
 ; SRC-LABEL: test1:
@@ -297,8 +301,8 @@ define i256 @test1(i256 %a) nounwind {
 ; SRC-NEXT:    movl %r11d, %ecx
 ; SRC-NEXT:    shrdq %cl, %rdx, %rbp
 ; SRC-NEXT:    shrq %cl, %rdx
-; SRC-NEXT:    movl $1, %edi
 ; SRC-NEXT:    xorl %r15d, %r15d
+; SRC-NEXT:    movl $1, %edi
 ; SRC-NEXT:    xorl %r14d, %r14d
 ; SRC-NEXT:    shldq %cl, %rdi, %r14
 ; SRC-NEXT:    xorl %r13d, %r13d
@@ -906,15 +910,15 @@ define i64 @test4(i64 %a, i64 %b) nounwind {
 ; ILP-LABEL: test4:
 ; ILP:       # %bb.0:
 ; ILP-NEXT:    xorl %ecx, %ecx
+; ILP-NEXT:    xorl %edx, %edx
 ; ILP-NEXT:    addq $1, %rsi
-; ILP-NEXT:    setb %cl
+; ILP-NEXT:    setb %dl
 ; ILP-NEXT:    movl $2, %eax
-; ILP-NEXT:    xorl %edx, %edx
 ; ILP-NEXT:    cmpq %rdi, %rsi
-; ILP-NEXT:    sbbq $0, %rcx
-; ILP-NEXT:    movl $0, %ecx
-; ILP-NEXT:    sbbq $0, %rcx
 ; ILP-NEXT:    sbbq $0, %rdx
+; ILP-NEXT:    movl $0, %edx
+; ILP-NEXT:    sbbq $0, %rdx
+; ILP-NEXT:    sbbq $0, %rcx
 ; ILP-NEXT:    setae %cl
 ; ILP-NEXT:    movzbl %cl, %ecx
 ; ILP-NEXT:    subq %rcx, %rax
@@ -923,14 +927,14 @@ define i64 @test4(i64 %a, i64 %b) nounwind {
 ; HYBRID-LABEL: test4:
 ; HYBRID:       # %bb.0:
 ; HYBRID-NEXT:    xorl %eax, %eax
-; HYBRID-NEXT:    addq $1, %rsi
-; HYBRID-NEXT:    setb %al
 ; HYBRID-NEXT:    xorl %ecx, %ecx
+; HYBRID-NEXT:    addq $1, %rsi
+; HYBRID-NEXT:    setb %cl
 ; HYBRID-NEXT:    cmpq %rdi, %rsi
-; HYBRID-NEXT:    sbbq $0, %rax
-; HYBRID-NEXT:    movl $0, %eax
-; HYBRID-NEXT:    sbbq $0, %rax
 ; HYBRID-NEXT:    sbbq $0, %rcx
+; HYBRID-NEXT:    movl $0, %ecx
+; HYBRID-NEXT:    sbbq $0, %rcx
+; HYBRID-NEXT:    sbbq $0, %rax
 ; HYBRID-NEXT:    setae %al
 ; HYBRID-NEXT:    movzbl %al, %ecx
 ; HYBRID-NEXT:    movl $2, %eax
@@ -940,14 +944,14 @@ define i64 @test4(i64 %a, i64 %b) nounwind {
 ; BURR-LABEL: test4:
 ; BURR:       # %bb.0:
 ; BURR-NEXT:    xorl %eax, %eax
-; BURR-NEXT:    addq $1, %rsi
-; BURR-NEXT:    setb %al
 ; BURR-NEXT:    xorl %ecx, %ecx
+; BURR-NEXT:    addq $1, %rsi
+; BURR-NEXT:    setb %cl
 ; BURR-NEXT:    cmpq %rdi, %rsi
-; BURR-NEXT:    sbbq $0, %rax
-; BURR-NEXT:    movl $0, %eax
-; BURR-NEXT:    sbbq $0, %rax
 ; BURR-NEXT:    sbbq $0, %rcx
+; BURR-NEXT:    movl $0, %ecx
+; BURR-NEXT:    sbbq $0, %rcx
+; BURR-NEXT:    sbbq $0, %rax
 ; BURR-NEXT:    setae %al
 ; BURR-NEXT:    movzbl %al, %ecx
 ; BURR-NEXT:    movl $2, %eax
diff --git a/test/CodeGen/X86/spill-zero-x86_64.ll b/test/CodeGen/X86/spill-zero-x86_64.ll
deleted file mode 100644
index d90cca6eabd..00000000000
--- a/test/CodeGen/X86/spill-zero-x86_64.ll
+++ /dev/null
@@ -1,75 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
-
-; This test checks that we use "movq $0, (%rsp)" to spill a 0 to the stack. It
-; was reduced from a larger function.
-
-; CHECK:    movq $0, (%rsp) # 8-byte Folded Spill
-
-%struct.foo = type { i8*, i32 }
-
-declare void @pluto()
-
-define void @spam() {
-bb:
-  br label %bb13
-
-bb1:                                              ; preds = %bb18
-  call void @pluto()
-  %tmp = getelementptr inbounds %struct.foo, %struct.foo* %tmp20, i64 0, i32 1
-  %tmp2 = bitcast i32* %tmp to %struct.foo**
-  store %struct.foo* null, %struct.foo** %tmp2
-  unreachable
-
-bb3:                                              ; preds = %bb18
-  call void @pluto()
-  store i8* %tmp22, i8** undef
-  unreachable
-
-bb4:                                              ; preds = %bb18
-  call void @pluto()
-  br label %bb13
-
-bb5:                                              ; preds = %bb18
-  %tmp7 = add nsw i32 %tmp23, 1
-  store i8* %tmp22, i8** undef
-  unreachable
-
-bb8:                                              ; preds = %bb18
-  store %struct.foo* %tmp14, %struct.foo** undef
-  unreachable
-
-bb9:                                              ; preds = %bb18
-  %tmp10 = load %struct.foo*, %struct.foo** undef
-  br label %bb13
-
-bb13:                                             ; preds = %bb18, %bb9, %bb4, %bb
-  %tmp14 = phi %struct.foo* [ %tmp14, %bb18 ], [ %tmp14, %bb4 ], [ null, %bb ], [ %tmp10, %bb9 ]
-  %tmp15 = phi %struct.foo* [ %tmp26, %bb18 ], [ %tmp26, %bb4 ], [ null, %bb ], [ %tmp26, %bb9 ]
-  %tmp16 = phi i32 [ %tmp23, %bb18 ], [ %tmp23, %bb4 ], [ 0, %bb ], [ %tmp23, %bb9 ]
-  br label %bb17
-
-bb17:                                             ; preds = %bb13
-  br i1 false, label %bb27, label %bb18
-
-bb18:                                             ; preds = %bb17
-  %tmp19 = load %struct.foo*, %struct.foo** undef
-  %tmp20 = getelementptr inbounds %struct.foo, %struct.foo* %tmp19, i64 0
-  %tmp21 = getelementptr inbounds %struct.foo, %struct.foo* %tmp20, i64 0, i32 0
-  %tmp22 = load i8*, i8** %tmp21
-  %tmp23 = add nsw i32 %tmp16, -1
-  %tmp24 = getelementptr inbounds %struct.foo, %struct.foo* %tmp15, i64 0, i32 1
-  %tmp25 = bitcast i32* %tmp24 to %struct.foo**
-  %tmp26 = load %struct.foo*, %struct.foo** %tmp25
-  switch i32 undef, label %bb9 [
-    i32 1, label %bb1
-    i32 2, label %bb3
-    i32 3, label %bb4
-    i32 4, label %bb5
-    i32 5, label %bb13
-    i32 6, label %bb8
-  ]
-
-bb27:                                             ; preds = %bb17
-  ret void
-}
diff --git a/test/CodeGen/X86/swifterror.ll b/test/CodeGen/X86/swifterror.ll
index a88a714f016..cb0597f7151 100644
--- a/test/CodeGen/X86/swifterror.ll
+++ b/test/CodeGen/X86/swifterror.ll
@@ -41,7 +41,8 @@ define float @caller(i8* %error_ref) {
 ; CHECK-APPLE: callq {{.*}}free
 
 ; CHECK-O0-LABEL: caller:
-; CHECK-O0: xorl %r12d, %r12d
+; CHECK-O0: xorl
+; CHECK-O0: movl %{{.*}}, %r12d
 ; CHECK-O0: callq {{.*}}foo
 ; CHECK-O0: jne
 entry:
@@ -77,7 +78,8 @@ define float @caller2(i8* %error_ref) {
 ; CHECK-APPLE: callq {{.*}}free
 
 ; CHECK-O0-LABEL: caller2:
-; CHECK-O0: xorl %r12d, %r12d
+; CHECK-O0: xorl
+; CHECK-O0: movl %{{.*}}, %r12d
 ; CHECK-O0: callq {{.*}}foo
 ; CHECK-O0: movq %r12, [[ID:%[a-z]+]]
 ; CHECK-O0: cmpq $0, %r12
@@ -252,7 +254,8 @@ define float @caller3(i8* %error_ref) {
 ; CHECK-APPLE: callq {{.*}}free
 
 ; CHECK-O0-LABEL: caller3:
-; CHECK-O0: xorl %r12d, %r12d
+; CHECK-O0: xorl
+; CHECK-O0: movl {{.*}}, %r12d
 ; CHECK-O0: movl $1, %esi
 ; CHECK-O0: movq {{.*}}, %rdi
 ; CHECK-O0: callq {{.*}}foo_sret
@@ -310,12 +313,14 @@ define float @caller_with_multiple_swifterror_values(i8* %error_ref, i8* %error_
 ; CHECK-O0-LABEL: caller_with_multiple_swifterror_values:
 
 ; The first swifterror value:
-; CHECK-O0: xorl %r12d, %r12d
+; CHECK-O0: xorl
+; CHECK-O0: movl %{{.*}}, %r12d
 ; CHECK-O0: callq {{.*}}foo
 ; CHECK-O0: jne
 
 ; The second swifterror value:
-; CHECK-O0: xorl %r12d, %r12d
+; CHECK-O0: xorl
+; CHECK-O0: movl %{{.*}}, %r12d
 ; CHECK-O0: callq {{.*}}foo
 ; CHECK-O0: jne
 entry:
@@ -710,7 +715,8 @@ declare swiftcc void @foo2(%swift_error** swifterror)
 ; Make sure we properly assign registers during fast-isel.
 ; CHECK-O0-LABEL: testAssign
 ; CHECK-O0:        pushq   %r12
-; CHECK-O0:        xorl    %r12d, %r12d
+; CHECK-O0:        xorl    [[ZERO:%[a-z0-9]+]], [[ZERO]]
+; CHECK-O0:        movl    [[ZERO]], %r12d
 ; CHECK-O0:        callq   _foo2
 ; CHECK-O0:        movq    %r12, [[SLOT:[-a-z0-9\(\)\%]*]]
 ;
@@ -786,7 +792,8 @@ a:
 
 ; CHECK-O0-LABEL: testAssign4
 ; CHECK-O0:        callq   _foo2
-; CHECK-O0:        xorl    %eax, %eax
+; CHECK-O0:        xorl    %ecx, %ecx
+; CHECK-O0:        movl    %ecx, %eax
 ; CHECK-O0:        movq    %rax, [[SLOT:[-a-z0-9\(\)\%]*]]
 ; CHECK-O0:        movq    [[SLOT]], %rax
 ; CHECK-O0:        movq    %rax, [[SLOT2:[-a-z0-9\(\)\%]*]]
-- 
GitLab


From 844c605ee0bcd2c416a1b63e1d0d5ea033cde38c Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.menezes@samsung.com>
Date: Wed, 31 Oct 2018 21:56:49 +0000
Subject: [PATCH 0832/1116] [AArch64] Sort switch cases (NFC)

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345786 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64Subtarget.cpp | 43 +++++++++++++------------
 1 file changed, 23 insertions(+), 20 deletions(-)

diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp
index c181f4016b6..49d737bea6a 100644
--- a/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -67,16 +67,30 @@ void AArch64Subtarget::initializeProperties() {
   // this in the future so we can specify it together with the subtarget
   // features.
   switch (ARMProcFamily) {
+  case Others:
+    break;
+  case CortexA35:
+    break;
+  case CortexA53:
+    PrefFunctionAlignment = 3;
+    break;
+  case CortexA55:
+    break;
+  case CortexA57:
+    MaxInterleaveFactor = 4;
+    PrefFunctionAlignment = 4;
+    break;
+  case CortexA72:
+  case CortexA73:
+  case CortexA75:
+    PrefFunctionAlignment = 4;
+    break;
   case Cyclone:
     CacheLineSize = 64;
     PrefetchDistance = 280;
     MinPrefetchStride = 2048;
     MaxPrefetchIterationsAhead = 3;
     break;
-  case CortexA57:
-    MaxInterleaveFactor = 4;
-    PrefFunctionAlignment = 4;
-    break;
   case ExynosM1:
     MaxInterleaveFactor = 4;
     MaxJumpTableSize = 8;
@@ -98,11 +112,6 @@ void AArch64Subtarget::initializeProperties() {
     MinPrefetchStride = 2048;
     MaxPrefetchIterationsAhead = 8;
     break;
-  case Saphira:
-    MaxInterleaveFactor = 4;
-    // FIXME: remove this to enable 64-bit SLP if performance looks good.
-    MinVectorRegisterBitWidth = 128;
-    break;
   case Kryo:
     MaxInterleaveFactor = 4;
     VectorInsertExtractBaseCost = 2;
@@ -113,6 +122,11 @@ void AArch64Subtarget::initializeProperties() {
     // FIXME: remove this to enable 64-bit SLP if performance looks good.
     MinVectorRegisterBitWidth = 128;
     break;
+  case Saphira:
+    MaxInterleaveFactor = 4;
+    // FIXME: remove this to enable 64-bit SLP if performance looks good.
+    MinVectorRegisterBitWidth = 128;
+    break;
   case ThunderX2T99:
     CacheLineSize = 64;
     PrefFunctionAlignment = 3;
@@ -134,17 +148,6 @@ void AArch64Subtarget::initializeProperties() {
     // FIXME: remove this to enable 64-bit SLP if performance looks good.
     MinVectorRegisterBitWidth = 128;
     break;
-  case CortexA35: break;
-  case CortexA53:
-    PrefFunctionAlignment = 3;
-    break;
-  case CortexA55: break;
-  case CortexA72:
-  case CortexA73:
-  case CortexA75:
-    PrefFunctionAlignment = 4;
-    break;
-  case Others: break;
   }
 }
 
-- 
GitLab


From 4cf0e5ba23efbaa11b48a04b79fca8e74ec044dd Mon Sep 17 00:00:00 2001
From: George Burgess IV <george.burgess.iv@gmail.com>
Date: Wed, 31 Oct 2018 22:45:31 +0000
Subject: [PATCH 0833/1116] [InlineCost] Remove a dead constant; NFC

My `grep`-fu indicates that this hasn't been used for years. It also no
longer makes much sense to have this flavor of penalty in general, since
a call to a noreturn should mean that we're in a BB that's terminated by
`unreachable`. That case is accounted for by
CallAnalyzer::allowSizeGrowth.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345789 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/InlineCost.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/llvm/Analysis/InlineCost.h b/include/llvm/Analysis/InlineCost.h
index 529fb75bec9..4c270354b0c 100644
--- a/include/llvm/Analysis/InlineCost.h
+++ b/include/llvm/Analysis/InlineCost.h
@@ -46,7 +46,6 @@ const int IndirectCallThreshold = 100;
 const int CallPenalty = 25;
 const int LastCallToStaticBonus = 15000;
 const int ColdccPenalty = 2000;
-const int NoreturnPenalty = 10000;
 /// Do not inline functions which allocate this many bytes on the stack
 /// when the caller is recursive.
 const unsigned TotalAllocaSizeRecursiveCaller = 1024;
-- 
GitLab


From 58970cd2b44d321b07d4a65f1f06bc22cfdcad24 Mon Sep 17 00:00:00 2001
From: Eli Friedman <efriedma@codeaurora.org>
Date: Wed, 31 Oct 2018 23:03:58 +0000
Subject: [PATCH 0834/1116] [IR] Allow increasing the alignment of dso-local
 globals.

I think this is the actual important property; the previous visibility
check was an approximation.

Differential Revision: https://reviews.llvm.org/D53852


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345790 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/IR/Globals.cpp          | 2 +-
 test/CodeGen/ARM/memfunc.ll | 8 +++++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/lib/IR/Globals.cpp b/lib/IR/Globals.cpp
index 3f57b1dbfa8..cbd6450a20c 100644
--- a/lib/IR/Globals.cpp
+++ b/lib/IR/Globals.cpp
@@ -252,7 +252,7 @@ bool GlobalValue::canIncreaseAlignment() const {
   // Conservatively assume ELF if there's no parent pointer.
   bool isELF =
       (!Parent || Triple(Parent->getTargetTriple()).isOSBinFormatELF());
-  if (isELF && hasDefaultVisibility() && !hasLocalLinkage())
+  if (isELF && !isDSOLocal())
     return false;
 
   return true;
diff --git a/test/CodeGen/ARM/memfunc.ll b/test/CodeGen/ARM/memfunc.ll
index b415ff7b7f4..6c0668a53e8 100644
--- a/test/CodeGen/ARM/memfunc.ll
+++ b/test/CodeGen/ARM/memfunc.ll
@@ -388,6 +388,7 @@ entry:
 @arr7 = external global [7 x i8], align 1
 @arr8 = internal global [128 x i8] undef
 @arr9 = weak_odr global [128 x i8] undef
+@arr10 = dso_local global [8 x i8] c"\01\02\03\04\05\06\07\08", align 1
 define void @f9(i8* %dest, i32 %n) "no-frame-pointer-elim"="true" {
 entry:
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @arr1, i32 0, i32 0), i32 %n, i1 false)
@@ -399,7 +400,7 @@ entry:
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @arr7, i32 0, i32 0), i32 %n, i1 false)
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* getelementptr inbounds ([128 x i8], [128 x i8]* @arr8, i32 0, i32 0), i32 %n, i1 false)
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* getelementptr inbounds ([128 x i8], [128 x i8]* @arr9, i32 0, i32 0), i32 %n, i1 false)
-
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* getelementptr inbounds ([8 x i8], [8 x i8]* @arr10, i32 0, i32 0), i32 %n, i1 false)
   unreachable
 }
 
@@ -427,6 +428,11 @@ entry:
 ; CHECK-GNUEABI: arr8,128,16
 ; CHECK: .p2align 4
 ; CHECK: arr9:
+; CHECK-IOS: .p2align 3
+; CHECK-DARWIN: .p2align 2
+; CHECK-EABI: .p2align 2
+; CHECK-GNUEABI: .p2align 2
+; CHECK: arr10:
 
 ; CHECK-NOT: arr7:
 
-- 
GitLab


From 585b6667b4712e3c7f32401e929855b3313b4ff2 Mon Sep 17 00:00:00 2001
From: Mandeep Singh Grang <mgrang@codeaurora.org>
Date: Wed, 31 Oct 2018 23:16:20 +0000
Subject: [PATCH 0835/1116] [COFF, ARM64] Implement Intrinsic.sponentry for
 AArch64

Summary: This patch adds Intrinsic.sponentry. This intrinsic is required to correctly support setjmp for AArch64 Windows platform.

Reviewers: mgrang, TomTan, rnk, compnerd, mstorsjo, efriedma

Reviewed By: efriedma

Subscribers: majnemer, chrib, javed.absar, kristof.beyls, llvm-commits

Differential Revision: https://reviews.llvm.org/D53673

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345791 91177308-0d34-0410-b5e6-96231b3b80d8
---
 docs/LangRef.rst                              |  79 ++++++++-----
 include/llvm/CodeGen/ISDOpcodes.h             |   2 +-
 include/llvm/IR/Intrinsics.td                 |   1 +
 lib/CodeGen/SelectionDAG/LegalizeDAG.cpp      |   1 +
 .../SelectionDAG/SelectionDAGBuilder.cpp      |   4 +
 .../SelectionDAG/SelectionDAGDumper.cpp       |   1 +
 lib/Target/AArch64/AArch64FastISel.cpp        |  16 +++
 lib/Target/AArch64/AArch64ISelLowering.cpp    |  12 ++
 lib/Target/AArch64/AArch64ISelLowering.h      |   1 +
 test/CodeGen/AArch64/sponentry.ll             | 104 ++++++++++++++++++
 10 files changed, 191 insertions(+), 30 deletions(-)
 create mode 100644 test/CodeGen/AArch64/sponentry.ll

diff --git a/docs/LangRef.rst b/docs/LangRef.rst
index 39134fafd46..d57f79f0039 100644
--- a/docs/LangRef.rst
+++ b/docs/LangRef.rst
@@ -2926,7 +2926,7 @@ Simple Constants
     hexadecimal notation (see below). The assembler requires the exact
     decimal value of a floating-point constant. For example, the
     assembler accepts 1.25 but rejects 1.3 because 1.3 is a repeating
-    decimal in binary. Floating-point constants must have a 
+    decimal in binary. Floating-point constants must have a
     :ref:`floating-point <t_floating>` type.
 **Null pointer constants**
     The identifier '``null``' is recognized as a null pointer constant
@@ -3331,7 +3331,7 @@ The following is the syntax for constant expressions:
     value won't fit in the integer type, the result is a
     :ref:`poison value <poisonvalues>`.
 ``uitofp (CST to TYPE)``
-    Convert an unsigned integer constant to the corresponding 
+    Convert an unsigned integer constant to the corresponding
     floating-point constant. TYPE must be a scalar or vector floating-point
     type.  CST must be of scalar or vector integer type. Both CST and TYPE must
     be scalars, or vectors of the same number of elements.
@@ -5434,7 +5434,7 @@ Irreducible loop header weights are typically based on profile data.
 '``invariant.group``' Metadata
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-The experimental ``invariant.group`` metadata may be attached to 
+The experimental ``invariant.group`` metadata may be attached to
 ``load``/``store`` instructions referencing a single metadata with no entries.
 The existence of the ``invariant.group`` metadata on the instruction tells
 the optimizer that every ``load`` and ``store`` to the same pointer operand
@@ -6875,7 +6875,7 @@ Arguments:
 """"""""""
 
 The two arguments to the '``fadd``' instruction must be
-:ref:`floating-point <t_floating>` or :ref:`vector <t_vector>` of 
+:ref:`floating-point <t_floating>` or :ref:`vector <t_vector>` of
 floating-point values. Both arguments must have identical types.
 
 Semantics:
@@ -6883,7 +6883,7 @@ Semantics:
 
 The value produced is the floating-point sum of the two operands.
 This instruction is assumed to execute in the default :ref:`floating-point
-environment <floatenv>`. 
+environment <floatenv>`.
 This instruction can also take any number of :ref:`fast-math
 flags <fastmath>`, which are optimization hints to enable otherwise
 unsafe floating-point optimizations:
@@ -6972,7 +6972,7 @@ Arguments:
 """"""""""
 
 The two arguments to the '``fsub``' instruction must be
-:ref:`floating-point <t_floating>` or :ref:`vector <t_vector>` of 
+:ref:`floating-point <t_floating>` or :ref:`vector <t_vector>` of
 floating-point values. Both arguments must have identical types.
 
 Semantics:
@@ -6980,7 +6980,7 @@ Semantics:
 
 The value produced is the floating-point difference of the two operands.
 This instruction is assumed to execute in the default :ref:`floating-point
-environment <floatenv>`. 
+environment <floatenv>`.
 This instruction can also take any number of :ref:`fast-math
 flags <fastmath>`, which are optimization hints to enable otherwise
 unsafe floating-point optimizations:
@@ -7067,7 +7067,7 @@ Arguments:
 """"""""""
 
 The two arguments to the '``fmul``' instruction must be
-:ref:`floating-point <t_floating>` or :ref:`vector <t_vector>` of 
+:ref:`floating-point <t_floating>` or :ref:`vector <t_vector>` of
 floating-point values. Both arguments must have identical types.
 
 Semantics:
@@ -7075,7 +7075,7 @@ Semantics:
 
 The value produced is the floating-point product of the two operands.
 This instruction is assumed to execute in the default :ref:`floating-point
-environment <floatenv>`. 
+environment <floatenv>`.
 This instruction can also take any number of :ref:`fast-math
 flags <fastmath>`, which are optimization hints to enable otherwise
 unsafe floating-point optimizations:
@@ -7201,7 +7201,7 @@ Arguments:
 """"""""""
 
 The two arguments to the '``fdiv``' instruction must be
-:ref:`floating-point <t_floating>` or :ref:`vector <t_vector>` of 
+:ref:`floating-point <t_floating>` or :ref:`vector <t_vector>` of
 floating-point values. Both arguments must have identical types.
 
 Semantics:
@@ -7209,7 +7209,7 @@ Semantics:
 
 The value produced is the floating-point quotient of the two operands.
 This instruction is assumed to execute in the default :ref:`floating-point
-environment <floatenv>`. 
+environment <floatenv>`.
 This instruction can also take any number of :ref:`fast-math
 flags <fastmath>`, which are optimization hints to enable otherwise
 unsafe floating-point optimizations:
@@ -7344,7 +7344,7 @@ Arguments:
 """"""""""
 
 The two arguments to the '``frem``' instruction must be
-:ref:`floating-point <t_floating>` or :ref:`vector <t_vector>` of 
+:ref:`floating-point <t_floating>` or :ref:`vector <t_vector>` of
 floating-point values. Both arguments must have identical types.
 
 Semantics:
@@ -7352,10 +7352,10 @@ Semantics:
 
 The value produced is the floating-point remainder of the two operands.
 This is the same output as a libm '``fmod``' function, but without any
-possibility of setting ``errno``. The remainder has the same sign as the 
+possibility of setting ``errno``. The remainder has the same sign as the
 dividend.
 This instruction is assumed to execute in the default :ref:`floating-point
-environment <floatenv>`. 
+environment <floatenv>`.
 This instruction can also take any number of :ref:`fast-math
 flags <fastmath>`, which are optimization hints to enable otherwise
 unsafe floating-point optimizations:
@@ -8809,7 +8809,7 @@ Semantics:
 
 The '``fptrunc``' instruction casts a ``value`` from a larger
 :ref:`floating-point <t_floating>` type to a smaller :ref:`floating-point
-<t_floating>` type.  
+<t_floating>` type.
 This instruction is assumed to execute in the default :ref:`floating-point
 environment <floatenv>`.
 
@@ -10330,6 +10330,27 @@ of the obvious source-language caller.
 
 This intrinsic is only implemented for x86.
 
+'``llvm.sponentry``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare i8* @llvm.sponentry()
+
+Overview:
+"""""""""
+
+The '``llvm.sponentry``' intrinsic returns the stack pointer value at
+the entry of the current function calling this intrinsic.
+
+Semantics:
+""""""""""
+
+Note this intrinsic is only verified on AArch64.
+
 '``llvm.frameaddress``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -12115,11 +12136,11 @@ Overview:
 
 The '``llvm.fshl``' family of intrinsic functions performs a funnel shift left:
 the first two values are concatenated as { %a : %b } (%a is the most significant
-bits of the wide value), the combined value is shifted left, and the most 
-significant bits are extracted to produce a result that is the same size as the 
-original arguments. If the first 2 arguments are identical, this is equivalent 
-to a rotate left operation. For vector types, the operation occurs for each 
-element of the vector. The shift argument is treated as an unsigned amount 
+bits of the wide value), the combined value is shifted left, and the most
+significant bits are extracted to produce a result that is the same size as the
+original arguments. If the first 2 arguments are identical, this is equivalent
+to a rotate left operation. For vector types, the operation occurs for each
+element of the vector. The shift argument is treated as an unsigned amount
 modulo the element size of the arguments.
 
 Arguments:
@@ -12161,11 +12182,11 @@ Overview:
 
 The '``llvm.fshr``' family of intrinsic functions performs a funnel shift right:
 the first two values are concatenated as { %a : %b } (%a is the most significant
-bits of the wide value), the combined value is shifted right, and the least 
-significant bits are extracted to produce a result that is the same size as the 
-original arguments. If the first 2 arguments are identical, this is equivalent 
-to a rotate right operation. For vector types, the operation occurs for each 
-element of the vector. The shift argument is treated as an unsigned amount 
+bits of the wide value), the combined value is shifted right, and the least
+significant bits are extracted to produce a result that is the same size as the
+original arguments. If the first 2 arguments are identical, this is equivalent
+to a rotate right operation. For vector types, the operation occurs for each
+element of the vector. The shift argument is treated as an unsigned amount
 modulo the element size of the arguments.
 
 Arguments:
@@ -13446,7 +13467,7 @@ The '``llvm.masked.expandload``' intrinsic is designed for reading multiple scal
     %Tmp = call <8 x double> @llvm.masked.expandload.v8f64(double* %Bptr, <8 x i1> %Mask, <8 x double> undef)
     ; Store the result in A
     call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %Tmp, <8 x double>* %Aptr, i32 8, <8 x i1> %Mask)
-    
+
     ; %Bptr should be increased on each iteration according to the number of '1' elements in the Mask.
     %MaskI = bitcast <8 x i1> %Mask to i8
     %MaskIPopcnt = call i8 @llvm.ctpop.i8(i8 %MaskI)
@@ -13503,7 +13524,7 @@ The '``llvm.masked.compressstore``' intrinsic is designed for compressing data i
     %Tmp = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %Aptr, i32 8, <8 x i1> %Mask, <8 x double> undef)
     ; Store all selected elements consecutively in array B
     call <void> @llvm.masked.compressstore.v8f64(<8 x double> %Tmp, double* %Bptr, <8 x i1> %Mask)
-    
+
     ; %Bptr should be increased on each iteration according to the number of '1' elements in the Mask.
     %MaskI = bitcast <8 x i1> %Mask to i8
     %MaskIPopcnt = call i8 @llvm.ctpop.i8(i8 %MaskI)
@@ -14136,7 +14157,7 @@ Overview:
 
 The '``llvm.experimental.constrained.powi``' intrinsic returns the first operand
 raised to the (positive or negative) power specified by the second operand. The
-order of evaluation of multiplications is not defined. When a vector of 
+order of evaluation of multiplications is not defined. When a vector of
 floating-point type is used, the second argument remains a scalar integer value.
 
 
@@ -14462,7 +14483,7 @@ Overview:
 """""""""
 
 The '``llvm.experimental.constrained.nearbyint``' intrinsic returns the first
-operand rounded to the nearest integer. It will not raise an inexact 
+operand rounded to the nearest integer. It will not raise an inexact
 floating-point exception if the operand is not an integer.
 
 
diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h
index a023aa5b3f6..da10119f438 100644
--- a/include/llvm/CodeGen/ISDOpcodes.h
+++ b/include/llvm/CodeGen/ISDOpcodes.h
@@ -70,7 +70,7 @@ namespace ISD {
     /// of the frame or return address to return.  An index of zero corresponds
     /// to the current function's frame or return address, an index of one to
     /// the parent's frame or return address, and so on.
-    FRAMEADDR, RETURNADDR, ADDROFRETURNADDR,
+    FRAMEADDR, RETURNADDR, ADDROFRETURNADDR, SPONENTRY,
 
     /// LOCAL_RECOVER - Represents the llvm.localrecover intrinsic.
     /// Materializes the offset from the local object pointer of another
diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td
index 47a66a27e38..c965140a00b 100644
--- a/include/llvm/IR/Intrinsics.td
+++ b/include/llvm/IR/Intrinsics.td
@@ -320,6 +320,7 @@ def int_gcwrite : Intrinsic<[],
 def int_returnaddress : Intrinsic<[llvm_ptr_ty], [llvm_i32_ty], [IntrNoMem]>;
 def int_addressofreturnaddress : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>;
 def int_frameaddress  : Intrinsic<[llvm_ptr_ty], [llvm_i32_ty], [IntrNoMem]>;
+def int_sponentry  : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>;
 def int_read_register  : Intrinsic<[llvm_anyint_ty], [llvm_metadata_ty],
                                    [IntrReadMem], "llvm.read_register">;
 def int_write_register : Intrinsic<[], [llvm_metadata_ty, llvm_anyint_ty],
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index a96b8628ac8..d5fb7a0697d 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1059,6 +1059,7 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
   case ISD::FRAMEADDR:
   case ISD::RETURNADDR:
   case ISD::ADDROFRETURNADDR:
+  case ISD::SPONENTRY:
     // These operations lie about being legal: when they claim to be legal,
     // they should actually be custom-lowered.
     Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index dac99eddec3..cb650c6fc13 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5050,6 +5050,10 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     setValue(&I, DAG.getNode(ISD::ADDROFRETURNADDR, sdl,
                              TLI.getPointerTy(DAG.getDataLayout())));
     return nullptr;
+  case Intrinsic::sponentry:
+    setValue(&I, DAG.getNode(ISD::SPONENTRY, sdl,
+                             TLI.getPointerTy(DAG.getDataLayout())));
+    return nullptr;
   case Intrinsic::frameaddress:
     setValue(&I, DAG.getNode(ISD::FRAMEADDR, sdl,
                              TLI.getPointerTy(DAG.getDataLayout()),
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 5c17a5d295d..c21f2d3b717 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -124,6 +124,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   case ISD::RETURNADDR:                 return "RETURNADDR";
   case ISD::ADDROFRETURNADDR:           return "ADDROFRETURNADDR";
   case ISD::FRAMEADDR:                  return "FRAMEADDR";
+  case ISD::SPONENTRY:                  return "SPONENTRY";
   case ISD::LOCAL_RECOVER:              return "LOCAL_RECOVER";
   case ISD::READ_REGISTER:              return "READ_REGISTER";
   case ISD::WRITE_REGISTER:             return "WRITE_REGISTER";
diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp
index 5e4c5dcf09c..317c3f134db 100644
--- a/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/lib/Target/AArch64/AArch64FastISel.cpp
@@ -3450,6 +3450,22 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
     updateValueMap(II, SrcReg);
     return true;
   }
+  case Intrinsic::sponentry: {
+    MachineFrameInfo &MFI = FuncInfo.MF->getFrameInfo();
+
+    // SP = FP + Fixed Object + 16
+    MVT VT = TLI.getPointerTy(DL);
+    int FI = MFI.CreateFixedObject(4, 0, false);
+    unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(AArch64::ADDXri), ResultReg)
+            .addFrameIndex(FI)
+            .addImm(0)
+            .addImm(0);
+
+    updateValueMap(II, ResultReg);
+    return true;
+  }
   case Intrinsic::memcpy:
   case Intrinsic::memmove: {
     const auto *MTI = cast<MemTransferInst>(II);
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 3c107016c8b..f5652a9f380 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2863,6 +2863,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
     return LowerFP_EXTEND(Op, DAG);
   case ISD::FRAMEADDR:
     return LowerFRAMEADDR(Op, DAG);
+  case ISD::SPONENTRY:
+    return LowerSPONENTRY(Op, DAG);
   case ISD::RETURNADDR:
     return LowerRETURNADDR(Op, DAG);
   case ISD::INSERT_VECTOR_ELT:
@@ -5171,6 +5173,16 @@ SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
   return FrameAddr;
 }
 
+SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+
+  EVT VT = getPointerTy(DAG.getDataLayout());
+  SDLoc DL(Op);
+  int FI = MFI.CreateFixedObject(4, 0, false);
+  return DAG.getFrameIndex(FI, VT);
+}
+
 // FIXME? Maybe this could be a TableGen attribute on some registers and
 // this table could be generated automatically from RegInfo.
 unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT,
diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h
index 3e89de665a7..7b4119a21d0 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@@ -617,6 +617,7 @@ private:
   SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
diff --git a/test/CodeGen/AArch64/sponentry.ll b/test/CodeGen/AArch64/sponentry.ll
new file mode 100644
index 00000000000..5b3638a1d86
--- /dev/null
+++ b/test/CodeGen/AArch64/sponentry.ll
@@ -0,0 +1,104 @@
+; RUN: llc -mtriple=aarch64-windows-msvc -disable-fp-elim %s -o - | FileCheck %s
+; RUN: llc -mtriple=aarch64-windows-msvc -fast-isel -disable-fp-elim %s -o - | FileCheck %s
+; RUN: llc -mtriple=aarch64-windows-msvc %s -o - | FileCheck %s --check-prefix=NOFP
+; RUN: llc -mtriple=aarch64-windows-msvc -fast-isel %s -o - | FileCheck %s --check-prefix=NOFP
+
+@env2 = common dso_local global [24 x i64]* null, align 8
+
+define dso_local void @bar() {
+  %1 = call i8* @llvm.sponentry()
+  %2 = load [24 x i64]*, [24 x i64]** @env2, align 8
+  %3 = getelementptr inbounds [24 x i64], [24 x i64]* %2, i32 0, i32 0
+  %4 = bitcast i64* %3 to i8*
+  %5 = call i32 @_setjmpex(i8* %4, i8* %1) #2
+  ret void
+}
+
+; CHECK: bar:
+; CHECK: mov     x29, sp
+; CHECK: add     x1, x29, #16
+; CEHCK: bl      _setjmpex
+
+; NOFP: str     x30, [sp, #-16]!
+; NOFP: add     x1, sp, #16
+
+define dso_local void @foo([24 x i64]*) {
+  %2 = alloca [24 x i64]*, align 8
+  %3 = alloca i32, align 4
+  %4 = alloca [100 x i32], align 4
+  store [24 x i64]* %0, [24 x i64]** %2, align 8
+  %5 = call i8* @llvm.sponentry()
+  %6 = load [24 x i64]*, [24 x i64]** %2, align 8
+  %7 = getelementptr inbounds [24 x i64], [24 x i64]* %6, i32 0, i32 0
+  %8 = bitcast i64* %7 to i8*
+  %9 = call i32 @_setjmpex(i8* %8, i8* %5)
+  store i32 %9, i32* %3, align 4
+  ret void
+}
+
+; CHECK: foo:
+; CHECK: sub     sp, sp, #448
+; CHECK: add     x29, sp, #432
+; CHECK: add     x1, x29, #16
+; CEHCK: bl      _setjmpex
+
+; NOFP: sub     sp, sp, #432
+; NOFP: add     x1, sp, #432
+
+define dso_local void @var_args(i8*, ...) {
+  %2 = alloca i8*, align 8
+  %3 = alloca i8*, align 8
+  store i8* %0, i8** %2, align 8
+  %4 = bitcast i8** %3 to i8*
+  call void @llvm.va_start(i8* %4)
+  %5 = load i8*, i8** %3, align 8
+  %6 = getelementptr inbounds i8, i8* %5, i64 8
+  store i8* %6, i8** %3, align 8
+  %7 = bitcast i8* %5 to i32*
+  %8 = load i32, i32* %7, align 8
+  %9 = bitcast i8** %3 to i8*
+  call void @llvm.va_end(i8* %9)
+  %10 = call i8* @llvm.sponentry()
+  %11 = load [24 x i64]*, [24 x i64]** @env2, align 8
+  %12 = getelementptr inbounds [24 x i64], [24 x i64]* %11, i32 0, i32 0
+  %13 = bitcast i64* %12 to i8*
+  %14 = call i32 @_setjmpex(i8* %13, i8* %10) #3
+  ret void
+}
+
+; CHECK: var_args:
+; CHECK: sub     sp, sp, #96
+; CHECK: add     x29, sp, #16
+; CHECK: add     x1, x29, #80
+; CEHCK: bl      _setjmpex
+
+; NOFP: sub     sp, sp, #96
+; NOFP: add     x1, sp, #96
+
+define dso_local void @manyargs(i64 %x1, i64 %x2, i64 %x3, i64 %x4, i64 %x5, i64 %x6, i64 %x7, i64 %x8, i64 %x9, i64 %x10) {
+  %1 = call i8* @llvm.sponentry()
+  %2 = load [24 x i64]*, [24 x i64]** @env2, align 8
+  %3 = getelementptr inbounds [24 x i64], [24 x i64]* %2, i32 0, i32 0
+  %4 = bitcast i64* %3 to i8*
+  %5 = call i32 @_setjmpex(i8* %4, i8* %1) #2
+  ret void
+}
+
+; CHECK: manyargs:
+; CHECK: stp     x29, x30, [sp, #-16]!
+; CHECK: add     x1, x29, #16
+
+; NOFP: str     x30, [sp, #-16]!
+; NOFP: add     x1, sp, #16
+
+; Function Attrs: nounwind readnone
+declare i8* @llvm.sponentry()
+
+; Function Attrs: returns_twice
+declare dso_local i32 @_setjmpex(i8*, i8*)
+
+; Function Attrs: nounwind
+declare void @llvm.va_start(i8*) #1
+
+; Function Attrs: nounwind
+declare void @llvm.va_end(i8*) #1
-- 
GitLab


From d886fa4497261da972a721aee3992c052250dfb7 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Wed, 31 Oct 2018 23:36:10 +0000
Subject: [PATCH 0836/1116] [VFS] Add support for "no_push" to VFS recursive
 iterators.

The "regular" file system has a useful feature that makes it possible to
stop recursing when using the recursive directory iterators. This
functionality was missing for the VFS recursive iterator and this patch
adds that.

Differential revision: https://reviews.llvm.org/D53465

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345793 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Support/VirtualFileSystem.h    | 27 ++++---
 lib/Support/VirtualFileSystem.cpp           | 29 ++++----
 unittests/Support/VirtualFileSystemTest.cpp | 79 +++++++++++++++++++++
 3 files changed, 115 insertions(+), 20 deletions(-)

diff --git a/include/llvm/Support/VirtualFileSystem.h b/include/llvm/Support/VirtualFileSystem.h
index 7e1828062b3..282893e21dc 100644
--- a/include/llvm/Support/VirtualFileSystem.h
+++ b/include/llvm/Support/VirtualFileSystem.h
@@ -193,14 +193,22 @@ public:
 
 class FileSystem;
 
+namespace detail {
+
+/// Keeps state for the recursive_directory_iterator.
+struct RecDirIterState {
+  std::stack<directory_iterator, std::vector<directory_iterator>> Stack;
+  bool HasNoPushRequest = false;
+};
+
+} // end namespace detail
+
 /// An input iterator over the recursive contents of a virtual path,
 /// similar to llvm::sys::fs::recursive_directory_iterator.
 class recursive_directory_iterator {
-  using IterState =
-      std::stack<directory_iterator, std::vector<directory_iterator>>;
-
   FileSystem *FS;
-  std::shared_ptr<IterState> State; // Input iterator semantics on copy.
+  std::shared_ptr<detail::RecDirIterState>
+      State; // Input iterator semantics on copy.
 
 public:
   recursive_directory_iterator(FileSystem &FS, const Twine &Path,
@@ -212,8 +220,8 @@ public:
   /// Equivalent to operator++, with an error code.
   recursive_directory_iterator &increment(std::error_code &EC);
 
-  const directory_entry &operator*() const { return *State->top(); }
-  const directory_entry *operator->() const { return &*State->top(); }
+  const directory_entry &operator*() const { return *State->Stack.top(); }
+  const directory_entry *operator->() const { return &*State->Stack.top(); }
 
   bool operator==(const recursive_directory_iterator &Other) const {
     return State == Other.State; // identity
@@ -224,9 +232,12 @@ public:
 
   /// Gets the current level. Starting path is at level 0.
   int level() const {
-    assert(!State->empty() && "Cannot get level without any iteration state");
-    return State->size() - 1;
+    assert(!State->Stack.empty() &&
+           "Cannot get level without any iteration state");
+    return State->Stack.size() - 1;
   }
+
+  void no_push() { State->HasNoPushRequest = true; }
 };
 
 /// The virtual file system interface.
diff --git a/lib/Support/VirtualFileSystem.cpp b/lib/Support/VirtualFileSystem.cpp
index 81ac5bbaa9c..9440eacaa89 100644
--- a/lib/Support/VirtualFileSystem.cpp
+++ b/lib/Support/VirtualFileSystem.cpp
@@ -2157,28 +2157,33 @@ vfs::recursive_directory_iterator::recursive_directory_iterator(
     : FS(&FS_) {
   directory_iterator I = FS->dir_begin(Path, EC);
   if (I != directory_iterator()) {
-    State = std::make_shared<IterState>();
-    State->push(I);
+    State = std::make_shared<detail::RecDirIterState>();
+    State->Stack.push(I);
   }
 }
 
 vfs::recursive_directory_iterator &
 recursive_directory_iterator::increment(std::error_code &EC) {
-  assert(FS && State && !State->empty() && "incrementing past end");
-  assert(!State->top()->path().empty() && "non-canonical end iterator");
+  assert(FS && State && !State->Stack.empty() && "incrementing past end");
+  assert(!State->Stack.top()->path().empty() && "non-canonical end iterator");
   vfs::directory_iterator End;
-  if (State->top()->type() == sys::fs::file_type::directory_file) {
-    vfs::directory_iterator I = FS->dir_begin(State->top()->path(), EC);
-    if (I != End) {
-      State->push(I);
-      return *this;
+
+  if (State->HasNoPushRequest)
+    State->HasNoPushRequest = false;
+  else {
+    if (State->Stack.top()->type() == sys::fs::file_type::directory_file) {
+      vfs::directory_iterator I = FS->dir_begin(State->Stack.top()->path(), EC);
+      if (I != End) {
+        State->Stack.push(I);
+        return *this;
+      }
     }
   }
 
-  while (!State->empty() && State->top().increment(EC) == End)
-    State->pop();
+  while (!State->Stack.empty() && State->Stack.top().increment(EC) == End)
+    State->Stack.pop();
 
-  if (State->empty())
+  if (State->Stack.empty())
     State.reset(); // end iterator
 
   return *this;
diff --git a/unittests/Support/VirtualFileSystemTest.cpp b/unittests/Support/VirtualFileSystemTest.cpp
index 992704c18fa..d5c01141bba 100644
--- a/unittests/Support/VirtualFileSystemTest.cpp
+++ b/unittests/Support/VirtualFileSystemTest.cpp
@@ -478,6 +478,85 @@ TEST(VirtualFileSystemTest, BasicRealFSRecursiveIteration) {
   EXPECT_EQ(1, Counts[3]); // d
 }
 
+TEST(VirtualFileSystemTest, BasicRealFSRecursiveIterationNoPush) {
+  ScopedDir TestDirectory("virtual-file-system-test", /*Unique*/ true);
+
+  ScopedDir _a(TestDirectory + "/a");
+  ScopedDir _ab(TestDirectory + "/a/b");
+  ScopedDir _c(TestDirectory + "/c");
+  ScopedDir _cd(TestDirectory + "/c/d");
+  ScopedDir _e(TestDirectory + "/e");
+  ScopedDir _ef(TestDirectory + "/e/f");
+  ScopedDir _g(TestDirectory + "/g");
+
+  IntrusiveRefCntPtr<vfs::FileSystem> FS = vfs::getRealFileSystem();
+
+  // Test that calling no_push on entries without subdirectories has no effect.
+  {
+    std::error_code EC;
+    auto I = vfs::recursive_directory_iterator(*FS, Twine(TestDirectory), EC);
+    ASSERT_FALSE(EC);
+
+    std::vector<std::string> Contents;
+    for (auto E = vfs::recursive_directory_iterator(); !EC && I != E;
+         I.increment(EC)) {
+      Contents.push_back(I->path());
+      char last = I->path().back();
+      switch (last) {
+      case 'b':
+      case 'd':
+      case 'f':
+      case 'g':
+        I.no_push();
+        break;
+      default:
+        break;
+      }
+    }
+    EXPECT_EQ(7U, Contents.size());
+  }
+
+  // Test that calling no_push skips subdirectories.
+  {
+    std::error_code EC;
+    auto I = vfs::recursive_directory_iterator(*FS, Twine(TestDirectory), EC);
+    ASSERT_FALSE(EC);
+
+    std::vector<std::string> Contents;
+    for (auto E = vfs::recursive_directory_iterator(); !EC && I != E;
+         I.increment(EC)) {
+      Contents.push_back(I->path());
+      char last = I->path().back();
+      switch (last) {
+      case 'a':
+      case 'c':
+      case 'e':
+        I.no_push();
+        break;
+      default:
+        break;
+      }
+    }
+
+    // Check contents, which may be in any order
+    EXPECT_EQ(4U, Contents.size());
+    int Counts[7] = {0, 0, 0, 0, 0, 0, 0};
+    for (const std::string &Name : Contents) {
+      ASSERT_FALSE(Name.empty());
+      int Index = Name[Name.size() - 1] - 'a';
+      ASSERT_TRUE(Index >= 0 && Index < 7);
+      Counts[Index]++;
+    }
+    EXPECT_EQ(1, Counts[0]); // a
+    EXPECT_EQ(0, Counts[1]); // b
+    EXPECT_EQ(1, Counts[2]); // c
+    EXPECT_EQ(0, Counts[3]); // d
+    EXPECT_EQ(1, Counts[4]); // e
+    EXPECT_EQ(0, Counts[5]); // f
+    EXPECT_EQ(1, Counts[6]); // g
+  }
+}
+
 #ifdef LLVM_ON_UNIX
 TEST(VirtualFileSystemTest, BrokenSymlinkRealFSRecursiveIteration) {
   ScopedDir TestDirectory("virtual-file-system-test", /*Unique*/ true);
-- 
GitLab


From 41079ac2534408660ea7d7d2c46a871f468d8070 Mon Sep 17 00:00:00 2001
From: Thomas Lively <tlively@google.com>
Date: Wed, 31 Oct 2018 23:50:53 +0000
Subject: [PATCH 0837/1116] [WebAssembly] Handle vector IMPLICIT_DEFs.

Summary:
Also reduce the test case for implicit defs and test it with all
register classes.

Reviewers: aheejin, dschuff

Subscribers: sbc100, jgravelle-google, sunfish, llvm-commits

Differential Revision: https://reviews.llvm.org/D53855

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345794 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../WebAssembly/WebAssemblyRegStackify.cpp    |   5 +
 test/CodeGen/WebAssembly/implicit-def.ll      | 151 ++++++++++++++----
 2 files changed, 122 insertions(+), 34 deletions(-)

diff --git a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
index 4649230d454..dc2aab87593 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
@@ -118,6 +118,11 @@ static void ConvertImplicitDefToConstZero(MachineInstr *MI,
     ConstantFP *Val = cast<ConstantFP>(Constant::getNullValue(
         Type::getDoubleTy(MF.getFunction().getContext())));
     MI->addOperand(MachineOperand::CreateFPImm(Val));
+  } else if (RegClass == &WebAssembly::V128RegClass) {
+    // TODO: make splat instead of constant
+    MI->setDesc(TII->get(WebAssembly::CONST_V128_v16i8));
+    for (int I = 0; I < 16; ++I)
+      MI->addOperand(MachineOperand::CreateImm(0));
   } else {
     llvm_unreachable("Unexpected reg class");
   }
diff --git a/test/CodeGen/WebAssembly/implicit-def.ll b/test/CodeGen/WebAssembly/implicit-def.ll
index 16b4031c96b..8f7dcc8cee3 100644
--- a/test/CodeGen/WebAssembly/implicit-def.ll
+++ b/test/CodeGen/WebAssembly/implicit-def.ll
@@ -1,50 +1,133 @@
-; RUN: llc -o - %s -asm-verbose=false -wasm-keep-registers | FileCheck %s
+; RUN: llc -o - %s -asm-verbose=false -wasm-keep-registers -disable-wasm-fallthrough-return-opt -mattr=+simd128 | FileCheck %s
+
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
 target triple = "wasm32-unknown-unknown"
 
 ; Test that stackified IMPLICIT_DEF instructions are converted into
-; CONST_I32 to provide an explicit push.
-
-; CHECK:      br_if 2,
-; CHECK:      i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; CHECK-NEXT: return $pop[[L0]]{{$}}
-define i1 @f() {
-  %a = xor i1 0, 0
-  switch i1 %a, label %C [
-    i1 0, label %A
-    i1 1, label %B
-  ]
-
-A:
-  %b = xor i1 0, 0
+; CONST_XXX instructions to provide an explicit push.
+
+; CHECK-LABEL: implicit_def_i32:
+; CHECK: .LBB{{[0-9]+}}_4:{{$}}
+; CHECK-NEXT: end_block{{$}}
+; CHECK-NEXT: i32.const $push[[R:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: end_function{{$}}
+define i32 @implicit_def_i32() {
+  br i1 undef, label %A, label %X
+
+A:                                                ; preds = %0
+  %d = icmp slt i1 0, 0
+  br i1 %d, label %C, label %B
+
+B:                                                ; preds = %A
+  br label %C
+
+C:                                                ; preds = %B, %A
+  %h = phi i32 [ undef, %A ], [ 0, %B ]
+  br label %X
+
+X:                                                ; preds = %0, C
+  %i = phi i32 [ 1, %0 ], [ %h, %C ]
+  ret i32 %i
+}
+
+; CHECK-LABEL: implicit_def_i64:
+; CHECK: .LBB{{[0-9]+}}_4:{{$}}
+; CHECK-NEXT: end_block{{$}}
+; CHECK-NEXT: i64.const $push[[R:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: end_function{{$}}
+define i64 @implicit_def_i64() {
+  br i1 undef, label %A, label %X
+
+A:                                                ; preds = %0
+  %d = icmp slt i1 0, 0
+  br i1 %d, label %C, label %B
+
+B:                                                ; preds = %A
+  br label %C
+
+C:                                                ; preds = %B, %A
+  %h = phi i64 [ undef, %A ], [ 0, %B ]
   br label %X
 
-B:
-  %c = xor i1 0, 0
-  br i1 %c, label %D, label %X
+X:                                                ; preds = %0, C
+  %i = phi i64 [ 1, %0 ], [ %h, %C ]
+  ret i64 %i
+}
+
+; CHECK-LABEL: implicit_def_f32:
+; CHECK: .LBB{{[0-9]+}}_4:{{$}}
+; CHECK-NEXT: end_block{{$}}
+; CHECK-NEXT: f32.const $push[[R:[0-9]+]]=, 0x0p0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: end_function{{$}}
+define float @implicit_def_f32() {
+  br i1 undef, label %A, label %X
 
-C:
-  %d = icmp slt i32 0, 0
-  br i1 %d, label %G, label %F
+A:                                                ; preds = %0
+  %d = icmp slt i1 0, 0
+  br i1 %d, label %C, label %B
 
-D:
-  %e = xor i1 0, 0
-  br i1 %e, label %E, label %X
+B:                                                ; preds = %A
+  br label %C
 
-E:
-  %f = xor i1 0, 0
+C:                                                ; preds = %B, %A
+  %h = phi float [ undef, %A ], [ 0.0, %B ]
   br label %X
 
-F:
-  %g = xor i1 0, 0
-  br label %G
+X:                                                ; preds = %0, C
+  %i = phi float [ 1.0, %0 ], [ %h, %C ]
+  ret float %i
+}
+
+; CHECK-LABEL: implicit_def_f64:
+; CHECK: .LBB{{[0-9]+}}_4:{{$}}
+; CHECK-NEXT: end_block{{$}}
+; CHECK-NEXT: f64.const $push[[R:[0-9]+]]=, 0x0p0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: end_function{{$}}
+define double @implicit_def_f64() {
+  br i1 undef, label %A, label %X
+
+A:                                                ; preds = %0
+  %d = icmp slt i1 0, 0
+  br i1 %d, label %C, label %B
+
+B:                                                ; preds = %A
+  br label %C
 
-G:
-  %h = phi i1 [ undef, %C ], [ false, %F ]
+C:                                                ; preds = %B, %A
+  %h = phi double [ undef, %A ], [ 0.0, %B ]
   br label %X
 
-X:
-  %i = phi i1 [ true, %A ], [ true, %B ], [ true, %D ], [ true, %E ], [ %h, %G ]
-  ret i1 %i
+X:                                                ; preds = %0, C
+  %i = phi double [ 1.0, %0 ], [ %h, %C ]
+  ret double %i
 }
 
+; CHECK-LABEL: implicit_def_v4i32:
+; CHECK: .LBB{{[0-9]+}}_4:{{$}}
+; CHECK-NEXT: end_block{{$}}
+; CHECK-NEXT: v128.const $push[[R:[0-9]+]]=, 0, 0, 0, 0, 0, 0, 0, 0,
+; CHECK-SAME:                                0, 0, 0, 0, 0, 0, 0, 0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: end_function{{$}}
+define <4 x i32> @implicit_def_v4i32() {
+  br i1 undef, label %A, label %X
+
+A:                                                ; preds = %0
+  %d = icmp slt i1 0, 0
+  br i1 %d, label %C, label %B
+
+B:                                                ; preds = %A
+  br label %C
+
+C:                                                ; preds = %B, %A
+  %h = phi <4 x i32> [ undef, %A ], [ <i32 0, i32 0, i32 0, i32 0>, %B ]
+  br label %X
+
+X:                                                ; preds = %0, C
+  %i = phi <4 x i32> [ <i32 1, i32 1, i32 1, i32 1>, %0 ], [ %h, %C ]
+  ret <4 x i32> %i
+}
-- 
GitLab


From 6700bd314aa8c9729f574423e5020492849e1d97 Mon Sep 17 00:00:00 2001
From: Thomas Lively <tlively@google.com>
Date: Wed, 31 Oct 2018 23:58:20 +0000
Subject: [PATCH 0838/1116] [WebAssembly] Process p2align operands for SIMD
 loads and stores

Reviewers: aheejin, dschuff

Subscribers: sbc100, jgravelle-google, sunfish, llvm-commits

Differential Revision: https://reviews.llvm.org/D53886

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345795 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../WebAssemblySetP2AlignOperands.cpp         |  12 +
 .../WebAssembly/simd-load-store-alignment.ll  | 534 ++++++++++++++++++
 test/CodeGen/WebAssembly/simd-offset.ll       | 192 +++----
 3 files changed, 642 insertions(+), 96 deletions(-)
 create mode 100644 test/CodeGen/WebAssembly/simd-load-store-alignment.ll

diff --git a/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp b/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp
index aaa0bbcbc57..c95af88c6f4 100644
--- a/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp
+++ b/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp
@@ -89,6 +89,12 @@ bool WebAssemblySetP2AlignOperands::runOnMachineFunction(MachineFunction &MF) {
       case WebAssembly::LOAD_I64:
       case WebAssembly::LOAD_F32:
       case WebAssembly::LOAD_F64:
+      case WebAssembly::LOAD_v16i8:
+      case WebAssembly::LOAD_v8i16:
+      case WebAssembly::LOAD_v4i32:
+      case WebAssembly::LOAD_v2i64:
+      case WebAssembly::LOAD_v4f32:
+      case WebAssembly::LOAD_v2f64:
       case WebAssembly::LOAD8_S_I32:
       case WebAssembly::LOAD8_U_I32:
       case WebAssembly::LOAD16_S_I32:
@@ -164,6 +170,12 @@ bool WebAssemblySetP2AlignOperands::runOnMachineFunction(MachineFunction &MF) {
       case WebAssembly::STORE_I64:
       case WebAssembly::STORE_F32:
       case WebAssembly::STORE_F64:
+      case WebAssembly::STORE_v16i8:
+      case WebAssembly::STORE_v8i16:
+      case WebAssembly::STORE_v4i32:
+      case WebAssembly::STORE_v2i64:
+      case WebAssembly::STORE_v4f32:
+      case WebAssembly::STORE_v2f64:
       case WebAssembly::STORE8_I32:
       case WebAssembly::STORE16_I32:
       case WebAssembly::STORE8_I64:
diff --git a/test/CodeGen/WebAssembly/simd-load-store-alignment.ll b/test/CodeGen/WebAssembly/simd-load-store-alignment.ll
new file mode 100644
index 00000000000..f113840c049
--- /dev/null
+++ b/test/CodeGen/WebAssembly/simd-load-store-alignment.ll
@@ -0,0 +1,534 @@
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -wasm-enable-unimplemented-simd -mattr=+simd128 | FileCheck %s
+
+; Test loads and stores with custom alignment values.
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+; ==============================================================================
+; 16 x i8
+; ==============================================================================
+
+; CHECK-LABEL: load_v16i8_a1:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @load_v16i8_a1(<16 x i8> *%p) {
+  %v = load <16 x i8>, <16 x i8>* %p, align 1
+  ret <16 x i8> %v
+}
+
+; CHECK-LABEL: load_v16i8_a4:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=2{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @load_v16i8_a4(<16 x i8> *%p) {
+  %v = load <16 x i8>, <16 x i8>* %p, align 4
+  ret <16 x i8> %v
+}
+
+; 16 is the default alignment for v128 so no attribute is needed.
+
+; CHECK-LABEL: load_v16i8_a16:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @load_v16i8_a16(<16 x i8> *%p) {
+  %v = load <16 x i8>, <16 x i8>* %p, align 16
+  ret <16 x i8> %v
+}
+
+; 32 is greater than the default alignment so it is ignored.
+
+; CHECK-LABEL: load_v16i8_a32:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @load_v16i8_a32(<16 x i8> *%p) {
+  %v = load <16 x i8>, <16 x i8>* %p, align 32
+  ret <16 x i8> %v
+}
+
+; CHECK-LABEL: store_v16i8_a1:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0):p2align=0, $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v16i8_a1(<16 x i8> *%p, <16 x i8> %v) {
+  store <16 x i8> %v, <16 x i8>* %p, align 1
+  ret void
+}
+
+; CHECK-LABEL: store_v16i8_a4:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0):p2align=2, $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v16i8_a4(<16 x i8> *%p, <16 x i8> %v) {
+  store <16 x i8> %v, <16 x i8>* %p, align 4
+  ret void
+}
+
+; 16 is the default alignment for v128 so no attribute is needed.
+
+; CHECK-LABEL: store_v16i8_a16:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v16i8_a16(<16 x i8> *%p, <16 x i8> %v) {
+  store <16 x i8> %v, <16 x i8>* %p, align 16
+  ret void
+}
+
+; 32 is greater than the default alignment so it is ignored.
+
+; CHECK-LABEL: store_v16i8_a32:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v16i8_a32(<16 x i8> *%p, <16 x i8> %v) {
+  store <16 x i8> %v, <16 x i8>* %p, align 32
+  ret void
+}
+
+; ==============================================================================
+; 8 x i16
+; ==============================================================================
+
+; CHECK-LABEL: load_v8i16_a1:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @load_v8i16_a1(<8 x i16> *%p) {
+  %v = load <8 x i16>, <8 x i16>* %p, align 1
+  ret <8 x i16> %v
+}
+
+; CHECK-LABEL: load_v8i16_a4:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=2{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @load_v8i16_a4(<8 x i16> *%p) {
+  %v = load <8 x i16>, <8 x i16>* %p, align 4
+  ret <8 x i16> %v
+}
+
+; 8 is the default alignment for v128 so no attribute is needed.
+
+; CHECK-LABEL: load_v8i16_a16:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @load_v8i16_a16(<8 x i16> *%p) {
+  %v = load <8 x i16>, <8 x i16>* %p, align 16
+  ret <8 x i16> %v
+}
+
+; 32 is greater than the default alignment so it is ignored.
+
+; CHECK-LABEL: load_v8i16_a32:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @load_v8i16_a32(<8 x i16> *%p) {
+  %v = load <8 x i16>, <8 x i16>* %p, align 32
+  ret <8 x i16> %v
+}
+
+; CHECK-LABEL: store_v8i16_a1:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0):p2align=0, $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v8i16_a1(<8 x i16> *%p, <8 x i16> %v) {
+  store <8 x i16> %v, <8 x i16>* %p, align 1
+  ret void
+}
+
+; CHECK-LABEL: store_v8i16_a4:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0):p2align=2, $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v8i16_a4(<8 x i16> *%p, <8 x i16> %v) {
+  store <8 x i16> %v, <8 x i16>* %p, align 4
+  ret void
+}
+
+; 16 is the default alignment for v128 so no attribute is needed.
+
+; CHECK-LABEL: store_v8i16_a16:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v8i16_a16(<8 x i16> *%p, <8 x i16> %v) {
+  store <8 x i16> %v, <8 x i16>* %p, align 16
+  ret void
+}
+
+; 32 is greater than the default alignment so it is ignored.
+
+; CHECK-LABEL: store_v8i16_a32:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v8i16_a32(<8 x i16> *%p, <8 x i16> %v) {
+  store <8 x i16> %v, <8 x i16>* %p, align 32
+  ret void
+}
+
+; ==============================================================================
+; 4 x i32
+; ==============================================================================
+
+; CHECK-LABEL: load_v4i32_a1:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @load_v4i32_a1(<4 x i32> *%p) {
+  %v = load <4 x i32>, <4 x i32>* %p, align 1
+  ret <4 x i32> %v
+}
+
+; CHECK-LABEL: load_v4i32_a4:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=2{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @load_v4i32_a4(<4 x i32> *%p) {
+  %v = load <4 x i32>, <4 x i32>* %p, align 4
+  ret <4 x i32> %v
+}
+
+; 4 is the default alignment for v128 so no attribute is needed.
+
+; CHECK-LABEL: load_v4i32_a16:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @load_v4i32_a16(<4 x i32> *%p) {
+  %v = load <4 x i32>, <4 x i32>* %p, align 16
+  ret <4 x i32> %v
+}
+
+; 32 is greater than the default alignment so it is ignored.
+
+; CHECK-LABEL: load_v4i32_a32:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @load_v4i32_a32(<4 x i32> *%p) {
+  %v = load <4 x i32>, <4 x i32>* %p, align 32
+  ret <4 x i32> %v
+}
+
+; CHECK-LABEL: store_v4i32_a1:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0):p2align=0, $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v4i32_a1(<4 x i32> *%p, <4 x i32> %v) {
+  store <4 x i32> %v, <4 x i32>* %p, align 1
+  ret void
+}
+
+; CHECK-LABEL: store_v4i32_a4:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0):p2align=2, $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v4i32_a4(<4 x i32> *%p, <4 x i32> %v) {
+  store <4 x i32> %v, <4 x i32>* %p, align 4
+  ret void
+}
+
+; 16 is the default alignment for v128 so no attribute is needed.
+
+; CHECK-LABEL: store_v4i32_a16:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v4i32_a16(<4 x i32> *%p, <4 x i32> %v) {
+  store <4 x i32> %v, <4 x i32>* %p, align 16
+  ret void
+}
+
+; 32 is greater than the default alignment so it is ignored.
+
+; CHECK-LABEL: store_v4i32_a32:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v4i32_a32(<4 x i32> *%p, <4 x i32> %v) {
+  store <4 x i32> %v, <4 x i32>* %p, align 32
+  ret void
+}
+
+; ==============================================================================
+; 2 x i64
+; ==============================================================================
+
+; CHECK-LABEL: load_v2i64_a1:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @load_v2i64_a1(<2 x i64> *%p) {
+  %v = load <2 x i64>, <2 x i64>* %p, align 1
+  ret <2 x i64> %v
+}
+
+; CHECK-LABEL: load_v2i64_a4:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=2{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @load_v2i64_a4(<2 x i64> *%p) {
+  %v = load <2 x i64>, <2 x i64>* %p, align 4
+  ret <2 x i64> %v
+}
+
+; 2 is the default alignment for v128 so no attribute is needed.
+
+; CHECK-LABEL: load_v2i64_a16:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @load_v2i64_a16(<2 x i64> *%p) {
+  %v = load <2 x i64>, <2 x i64>* %p, align 16
+  ret <2 x i64> %v
+}
+
+; 32 is greater than the default alignment so it is ignored.
+
+; CHECK-LABEL: load_v2i64_a32:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @load_v2i64_a32(<2 x i64> *%p) {
+  %v = load <2 x i64>, <2 x i64>* %p, align 32
+  ret <2 x i64> %v
+}
+
+; CHECK-LABEL: store_v2i64_a1:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0):p2align=0, $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v2i64_a1(<2 x i64> *%p, <2 x i64> %v) {
+  store <2 x i64> %v, <2 x i64>* %p, align 1
+  ret void
+}
+
+; CHECK-LABEL: store_v2i64_a4:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0):p2align=2, $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v2i64_a4(<2 x i64> *%p, <2 x i64> %v) {
+  store <2 x i64> %v, <2 x i64>* %p, align 4
+  ret void
+}
+
+; 16 is the default alignment for v128 so no attribute is needed.
+
+; CHECK-LABEL: store_v2i64_a16:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v2i64_a16(<2 x i64> *%p, <2 x i64> %v) {
+  store <2 x i64> %v, <2 x i64>* %p, align 16
+  ret void
+}
+
+; 32 is greater than the default alignment so it is ignored.
+
+; CHECK-LABEL: store_v2i64_a32:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v2i64_a32(<2 x i64> *%p, <2 x i64> %v) {
+  store <2 x i64> %v, <2 x i64>* %p, align 32
+  ret void
+}
+
+; ==============================================================================
+; 4 x float
+; ==============================================================================
+
+; CHECK-LABEL: load_v4f32_a1:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <4 x float> @load_v4f32_a1(<4 x float> *%p) {
+  %v = load <4 x float>, <4 x float>* %p, align 1
+  ret <4 x float> %v
+}
+
+; CHECK-LABEL: load_v4f32_a4:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=2{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <4 x float> @load_v4f32_a4(<4 x float> *%p) {
+  %v = load <4 x float>, <4 x float>* %p, align 4
+  ret <4 x float> %v
+}
+
+; 4 is the default alignment for v128 so no attribute is needed.
+
+; CHECK-LABEL: load_v4f32_a16:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <4 x float> @load_v4f32_a16(<4 x float> *%p) {
+  %v = load <4 x float>, <4 x float>* %p, align 16
+  ret <4 x float> %v
+}
+
+; 32 is greater than the default alignment so it is ignored.
+
+; CHECK-LABEL: load_v4f32_a32:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <4 x float> @load_v4f32_a32(<4 x float> *%p) {
+  %v = load <4 x float>, <4 x float>* %p, align 32
+  ret <4 x float> %v
+}
+
+; CHECK-LABEL: store_v4f32_a1:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0):p2align=0, $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v4f32_a1(<4 x float> *%p, <4 x float> %v) {
+  store <4 x float> %v, <4 x float>* %p, align 1
+  ret void
+}
+
+; CHECK-LABEL: store_v4f32_a4:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0):p2align=2, $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v4f32_a4(<4 x float> *%p, <4 x float> %v) {
+  store <4 x float> %v, <4 x float>* %p, align 4
+  ret void
+}
+
+; 16 is the default alignment for v128 so no attribute is needed.
+
+; CHECK-LABEL: store_v4f32_a16:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v4f32_a16(<4 x float> *%p, <4 x float> %v) {
+  store <4 x float> %v, <4 x float>* %p, align 16
+  ret void
+}
+
+; 32 is greater than the default alignment so it is ignored.
+
+; CHECK-LABEL: store_v4f32_a32:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v4f32_a32(<4 x float> *%p, <4 x float> %v) {
+  store <4 x float> %v, <4 x float>* %p, align 32
+  ret void
+}
+
+; ==============================================================================
+; 2 x double
+; ==============================================================================
+
+; CHECK-LABEL: load_v2f64_a1:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <2 x double> @load_v2f64_a1(<2 x double> *%p) {
+  %v = load <2 x double>, <2 x double>* %p, align 1
+  ret <2 x double> %v
+}
+
+; CHECK-LABEL: load_v2f64_a4:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=2{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <2 x double> @load_v2f64_a4(<2 x double> *%p) {
+  %v = load <2 x double>, <2 x double>* %p, align 4
+  ret <2 x double> %v
+}
+
+; 2 is the default alignment for v128 so no attribute is needed.
+
+; CHECK-LABEL: load_v2f64_a16:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <2 x double> @load_v2f64_a16(<2 x double> *%p) {
+  %v = load <2 x double>, <2 x double>* %p, align 16
+  ret <2 x double> %v
+}
+
+; 32 is greater than the default alignment so it is ignored.
+
+; CHECK-LABEL: load_v2f64_a32:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <2 x double> @load_v2f64_a32(<2 x double> *%p) {
+  %v = load <2 x double>, <2 x double>* %p, align 32
+  ret <2 x double> %v
+}
+
+; CHECK-LABEL: store_v2f64_a1:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0):p2align=0, $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v2f64_a1(<2 x double> *%p, <2 x double> %v) {
+  store <2 x double> %v, <2 x double>* %p, align 1
+  ret void
+}
+
+; CHECK-LABEL: store_v2f64_a4:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0):p2align=2, $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v2f64_a4(<2 x double> *%p, <2 x double> %v) {
+  store <2 x double> %v, <2 x double>* %p, align 4
+  ret void
+}
+
+; 16 is the default alignment for v128 so no attribute is needed.
+
+; CHECK-LABEL: store_v2f64_a16:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v2f64_a16(<2 x double> *%p, <2 x double> %v) {
+  store <2 x double> %v, <2 x double>* %p, align 16
+  ret void
+}
+
+; 32 is greater than the default alignment so it is ignored.
+
+; CHECK-LABEL: store_v2f64_a32:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v2f64_a32(<2 x double> *%p, <2 x double> %v) {
+  store <2 x double> %v, <2 x double>* %p, align 32
+  ret void
+}
diff --git a/test/CodeGen/WebAssembly/simd-offset.ll b/test/CodeGen/WebAssembly/simd-offset.ll
index 5ce0ca94dc4..ed20225f021 100644
--- a/test/CodeGen/WebAssembly/simd-offset.ll
+++ b/test/CodeGen/WebAssembly/simd-offset.ll
@@ -14,7 +14,7 @@ target triple = "wasm32-unknown-unknown"
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param i32{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @load_v16i8(<16 x i8>* %p) {
   %v = load <16 x i8>, <16 x i8>* %p
@@ -25,7 +25,7 @@ define <16 x i8> @load_v16i8(<16 x i8>* %p) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param i32{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @load_v16i8_with_folded_offset(<16 x i8>* %p) {
   %q = ptrtoint <16 x i8>* %p to i32
@@ -39,7 +39,7 @@ define <16 x i8> @load_v16i8_with_folded_offset(<16 x i8>* %p) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param i32{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @load_v16i8_with_folded_gep_offset(<16 x i8>* %p) {
   %s = getelementptr inbounds <16 x i8>, <16 x i8>* %p, i32 1
@@ -53,7 +53,7 @@ define <16 x i8> @load_v16i8_with_folded_gep_offset(<16 x i8>* %p) {
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @load_v16i8_with_unfolded_gep_negative_offset(<16 x i8>* %p) {
   %s = getelementptr inbounds <16 x i8>, <16 x i8>* %p, i32 -1
@@ -67,7 +67,7 @@ define <16 x i8> @load_v16i8_with_unfolded_gep_negative_offset(<16 x i8>* %p) {
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @load_v16i8_with_unfolded_offset(<16 x i8>* %p) {
   %q = ptrtoint <16 x i8>* %p to i32
@@ -83,7 +83,7 @@ define <16 x i8> @load_v16i8_with_unfolded_offset(<16 x i8>* %p) {
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @load_v16i8_with_unfolded_gep_offset(<16 x i8>* %p) {
   %s = getelementptr <16 x i8>, <16 x i8>* %p, i32 1
@@ -95,7 +95,7 @@ define <16 x i8> @load_v16i8_with_unfolded_gep_offset(<16 x i8>* %p) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 32($pop[[L0]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 32($pop[[L0]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @load_v16i8_from_numeric_address() {
   %s = inttoptr i32 32 to <16 x i8>*
@@ -107,7 +107,7 @@ define <16 x i8> @load_v16i8_from_numeric_address() {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, gv_v16i8($pop[[L0]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, gv_v16i8($pop[[L0]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 @gv_v16i8 = global <16 x i8> <i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42>
 define <16 x i8> @load_v16i8_from_global_address() {
@@ -118,7 +118,7 @@ define <16 x i8> @load_v16i8_from_global_address() {
 ; CHECK-LABEL: store_v16i8:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: v128.store 0($1):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($1), $0{{$}}
 define void @store_v16i8(<16 x i8> %v, <16 x i8>* %p) {
   store <16 x i8> %v , <16 x i8>* %p
   ret void
@@ -127,7 +127,7 @@ define void @store_v16i8(<16 x i8> %v, <16 x i8>* %p) {
 ; CHECK-LABEL: store_v16i8_with_folded_offset:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: v128.store 16($1):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v16i8_with_folded_offset(<16 x i8> %v, <16 x i8>* %p) {
   %q = ptrtoint <16 x i8>* %p to i32
   %r = add nuw i32 %q, 16
@@ -139,7 +139,7 @@ define void @store_v16i8_with_folded_offset(<16 x i8> %v, <16 x i8>* %p) {
 ; CHECK-LABEL: store_v16i8_with_folded_gep_offset:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: v128.store 16($1):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v16i8_with_folded_gep_offset(<16 x i8> %v, <16 x i8>* %p) {
   %s = getelementptr inbounds <16 x i8>, <16 x i8>* %p, i32 1
   store <16 x i8> %v , <16 x i8>* %s
@@ -151,7 +151,7 @@ define void @store_v16i8_with_folded_gep_offset(<16 x i8> %v, <16 x i8>* %p) {
 ; SIMD128-NEXT: .param v128, i32{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v16i8_with_unfolded_gep_negative_offset(<16 x i8> %v, <16 x i8>* %p) {
   %s = getelementptr inbounds <16 x i8>, <16 x i8>* %p, i32 -1
   store <16 x i8> %v , <16 x i8>* %s
@@ -163,7 +163,7 @@ define void @store_v16i8_with_unfolded_gep_negative_offset(<16 x i8> %v, <16 x i
 ; SIMD128-NEXT: .param v128, i32{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v16i8_with_unfolded_offset(<16 x i8> %v, <16 x i8>* %p) {
   %s = getelementptr inbounds <16 x i8>, <16 x i8>* %p, i32 -1
   store <16 x i8> %v , <16 x i8>* %s
@@ -175,7 +175,7 @@ define void @store_v16i8_with_unfolded_offset(<16 x i8> %v, <16 x i8>* %p) {
 ; SIMD128-NEXT: .param v128, i32{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v16i8_with_unfolded_gep_offset(<16 x i8> %v, <16 x i8>* %p) {
   %s = getelementptr <16 x i8>, <16 x i8>* %p, i32 1
   store <16 x i8> %v , <16 x i8>* %s
@@ -186,7 +186,7 @@ define void @store_v16i8_with_unfolded_gep_offset(<16 x i8> %v, <16 x i8>* %p) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[R:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.store 32($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 32($pop[[R]]), $0{{$}}
 define void @store_v16i8_to_numeric_address(<16 x i8> %v) {
   %s = inttoptr i32 32 to <16 x i8>*
   store <16 x i8> %v , <16 x i8>* %s
@@ -197,7 +197,7 @@ define void @store_v16i8_to_numeric_address(<16 x i8> %v) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[R:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.store gv_v16i8($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store gv_v16i8($pop[[R]]), $0{{$}}
 define void @store_v16i8_to_global_address(<16 x i8> %v) {
   store <16 x i8> %v , <16 x i8>* @gv_v16i8
   ret void
@@ -210,7 +210,7 @@ define void @store_v16i8_to_global_address(<16 x i8> %v) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param i32{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_v8i16(<8 x i16>* %p) {
   %v = load <8 x i16>, <8 x i16>* %p
@@ -221,7 +221,7 @@ define <8 x i16> @load_v8i16(<8 x i16>* %p) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param i32{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_v8i16_with_folded_offset(<8 x i16>* %p) {
   %q = ptrtoint <8 x i16>* %p to i32
@@ -235,7 +235,7 @@ define <8 x i16> @load_v8i16_with_folded_offset(<8 x i16>* %p) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param i32{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_v8i16_with_folded_gep_offset(<8 x i16>* %p) {
   %s = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i32 1
@@ -249,7 +249,7 @@ define <8 x i16> @load_v8i16_with_folded_gep_offset(<8 x i16>* %p) {
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_v8i16_with_unfolded_gep_negative_offset(<8 x i16>* %p) {
   %s = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i32 -1
@@ -263,7 +263,7 @@ define <8 x i16> @load_v8i16_with_unfolded_gep_negative_offset(<8 x i16>* %p) {
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[L0:[0-9]+]]=, 0($pop[[L1]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[L0:[0-9]+]]=, 0($pop[[L1]]){{$}}
 ; SIMD128-NEXT: return $pop[[L0]]{{$}}
 define <8 x i16> @load_v8i16_with_unfolded_offset(<8 x i16>* %p) {
   %q = ptrtoint <8 x i16>* %p to i32
@@ -279,7 +279,7 @@ define <8 x i16> @load_v8i16_with_unfolded_offset(<8 x i16>* %p) {
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_v8i16_with_unfolded_gep_offset(<8 x i16>* %p) {
   %s = getelementptr <8 x i16>, <8 x i16>* %p, i32 1
@@ -291,7 +291,7 @@ define <8 x i16> @load_v8i16_with_unfolded_gep_offset(<8 x i16>* %p) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 32($pop[[L0]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 32($pop[[L0]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_v8i16_from_numeric_address() {
   %s = inttoptr i32 32 to <8 x i16>*
@@ -303,7 +303,7 @@ define <8 x i16> @load_v8i16_from_numeric_address() {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, gv_v8i16($pop[[L0]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, gv_v8i16($pop[[L0]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 @gv_v8i16 = global <8 x i16> <i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42>
 define <8 x i16> @load_v8i16_from_global_address() {
@@ -314,7 +314,7 @@ define <8 x i16> @load_v8i16_from_global_address() {
 ; CHECK-LABEL: store_v8i16:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: v128.store 0($1):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($1), $0{{$}}
 define void @store_v8i16(<8 x i16> %v, <8 x i16>* %p) {
   store <8 x i16> %v , <8 x i16>* %p
   ret void
@@ -323,7 +323,7 @@ define void @store_v8i16(<8 x i16> %v, <8 x i16>* %p) {
 ; CHECK-LABEL: store_v8i16_with_folded_offset:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: v128.store 16($1):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v8i16_with_folded_offset(<8 x i16> %v, <8 x i16>* %p) {
   %q = ptrtoint <8 x i16>* %p to i32
   %r = add nuw i32 %q, 16
@@ -335,7 +335,7 @@ define void @store_v8i16_with_folded_offset(<8 x i16> %v, <8 x i16>* %p) {
 ; CHECK-LABEL: store_v8i16_with_folded_gep_offset:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: v128.store 16($1):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v8i16_with_folded_gep_offset(<8 x i16> %v, <8 x i16>* %p) {
   %s = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i32 1
   store <8 x i16> %v , <8 x i16>* %s
@@ -347,7 +347,7 @@ define void @store_v8i16_with_folded_gep_offset(<8 x i16> %v, <8 x i16>* %p) {
 ; SIMD128-NEXT: .param v128, i32{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v8i16_with_unfolded_gep_negative_offset(<8 x i16> %v, <8 x i16>* %p) {
   %s = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i32 -1
   store <8 x i16> %v , <8 x i16>* %s
@@ -359,7 +359,7 @@ define void @store_v8i16_with_unfolded_gep_negative_offset(<8 x i16> %v, <8 x i1
 ; SIMD128-NEXT: .param v128, i32{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v8i16_with_unfolded_offset(<8 x i16> %v, <8 x i16>* %p) {
   %s = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i32 -1
   store <8 x i16> %v , <8 x i16>* %s
@@ -371,7 +371,7 @@ define void @store_v8i16_with_unfolded_offset(<8 x i16> %v, <8 x i16>* %p) {
 ; SIMD128-NEXT: .param v128, i32{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v8i16_with_unfolded_gep_offset(<8 x i16> %v, <8 x i16>* %p) {
   %s = getelementptr <8 x i16>, <8 x i16>* %p, i32 1
   store <8 x i16> %v , <8 x i16>* %s
@@ -382,7 +382,7 @@ define void @store_v8i16_with_unfolded_gep_offset(<8 x i16> %v, <8 x i16>* %p) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.store 32($pop[[L0]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 32($pop[[L0]]), $0{{$}}
 define void @store_v8i16_to_numeric_address(<8 x i16> %v) {
   %s = inttoptr i32 32 to <8 x i16>*
   store <8 x i16> %v , <8 x i16>* %s
@@ -393,7 +393,7 @@ define void @store_v8i16_to_numeric_address(<8 x i16> %v) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[R:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.store gv_v8i16($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store gv_v8i16($pop[[R]]), $0{{$}}
 define void @store_v8i16_to_global_address(<8 x i16> %v) {
   store <8 x i16> %v , <8 x i16>* @gv_v8i16
   ret void
@@ -406,7 +406,7 @@ define void @store_v8i16_to_global_address(<8 x i16> %v) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param i32{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_v4i32(<4 x i32>* %p) {
   %v = load <4 x i32>, <4 x i32>* %p
@@ -417,7 +417,7 @@ define <4 x i32> @load_v4i32(<4 x i32>* %p) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param i32{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_v4i32_with_folded_offset(<4 x i32>* %p) {
   %q = ptrtoint <4 x i32>* %p to i32
@@ -431,7 +431,7 @@ define <4 x i32> @load_v4i32_with_folded_offset(<4 x i32>* %p) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param i32{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_v4i32_with_folded_gep_offset(<4 x i32>* %p) {
   %s = getelementptr inbounds <4 x i32>, <4 x i32>* %p, i32 1
@@ -445,7 +445,7 @@ define <4 x i32> @load_v4i32_with_folded_gep_offset(<4 x i32>* %p) {
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_v4i32_with_unfolded_gep_negative_offset(<4 x i32>* %p) {
   %s = getelementptr inbounds <4 x i32>, <4 x i32>* %p, i32 -1
@@ -459,7 +459,7 @@ define <4 x i32> @load_v4i32_with_unfolded_gep_negative_offset(<4 x i32>* %p) {
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_v4i32_with_unfolded_offset(<4 x i32>* %p) {
   %q = ptrtoint <4 x i32>* %p to i32
@@ -475,7 +475,7 @@ define <4 x i32> @load_v4i32_with_unfolded_offset(<4 x i32>* %p) {
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_v4i32_with_unfolded_gep_offset(<4 x i32>* %p) {
   %s = getelementptr <4 x i32>, <4 x i32>* %p, i32 1
@@ -487,7 +487,7 @@ define <4 x i32> @load_v4i32_with_unfolded_gep_offset(<4 x i32>* %p) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 32($pop[[L0]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 32($pop[[L0]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_v4i32_from_numeric_address() {
   %s = inttoptr i32 32 to <4 x i32>*
@@ -499,7 +499,7 @@ define <4 x i32> @load_v4i32_from_numeric_address() {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, gv_v4i32($pop[[L0]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, gv_v4i32($pop[[L0]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 @gv_v4i32 = global <4 x i32> <i32 42, i32 42, i32 42, i32 42>
 define <4 x i32> @load_v4i32_from_global_address() {
@@ -510,7 +510,7 @@ define <4 x i32> @load_v4i32_from_global_address() {
 ; CHECK-LABEL: store_v4i32:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: v128.store 0($1):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($1), $0{{$}}
 define void @store_v4i32(<4 x i32> %v, <4 x i32>* %p) {
   store <4 x i32> %v , <4 x i32>* %p
   ret void
@@ -519,7 +519,7 @@ define void @store_v4i32(<4 x i32> %v, <4 x i32>* %p) {
 ; CHECK-LABEL: store_v4i32_with_folded_offset:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: v128.store 16($1):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v4i32_with_folded_offset(<4 x i32> %v, <4 x i32>* %p) {
   %q = ptrtoint <4 x i32>* %p to i32
   %r = add nuw i32 %q, 16
@@ -531,7 +531,7 @@ define void @store_v4i32_with_folded_offset(<4 x i32> %v, <4 x i32>* %p) {
 ; CHECK-LABEL: store_v4i32_with_folded_gep_offset:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: v128.store 16($1):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v4i32_with_folded_gep_offset(<4 x i32> %v, <4 x i32>* %p) {
   %s = getelementptr inbounds <4 x i32>, <4 x i32>* %p, i32 1
   store <4 x i32> %v , <4 x i32>* %s
@@ -543,7 +543,7 @@ define void @store_v4i32_with_folded_gep_offset(<4 x i32> %v, <4 x i32>* %p) {
 ; SIMD128-NEXT: .param v128, i32{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v4i32_with_unfolded_gep_negative_offset(<4 x i32> %v, <4 x i32>* %p) {
   %s = getelementptr inbounds <4 x i32>, <4 x i32>* %p, i32 -1
   store <4 x i32> %v , <4 x i32>* %s
@@ -555,7 +555,7 @@ define void @store_v4i32_with_unfolded_gep_negative_offset(<4 x i32> %v, <4 x i3
 ; SIMD128-NEXT: .param v128, i32{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v4i32_with_unfolded_offset(<4 x i32> %v, <4 x i32>* %p) {
   %s = getelementptr inbounds <4 x i32>, <4 x i32>* %p, i32 -1
   store <4 x i32> %v , <4 x i32>* %s
@@ -567,7 +567,7 @@ define void @store_v4i32_with_unfolded_offset(<4 x i32> %v, <4 x i32>* %p) {
 ; SIMD128-NEXT: .param v128, i32{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v4i32_with_unfolded_gep_offset(<4 x i32> %v, <4 x i32>* %p) {
   %s = getelementptr <4 x i32>, <4 x i32>* %p, i32 1
   store <4 x i32> %v , <4 x i32>* %s
@@ -578,7 +578,7 @@ define void @store_v4i32_with_unfolded_gep_offset(<4 x i32> %v, <4 x i32>* %p) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.store 32($pop[[L0]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 32($pop[[L0]]), $0{{$}}
 define void @store_v4i32_to_numeric_address(<4 x i32> %v) {
   %s = inttoptr i32 32 to <4 x i32>*
   store <4 x i32> %v , <4 x i32>* %s
@@ -589,7 +589,7 @@ define void @store_v4i32_to_numeric_address(<4 x i32> %v) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[R:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.store gv_v4i32($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store gv_v4i32($pop[[R]]), $0{{$}}
 define void @store_v4i32_to_global_address(<4 x i32> %v) {
   store <4 x i32> %v , <4 x i32>* @gv_v4i32
   ret void
@@ -603,7 +603,7 @@ define void @store_v4i32_to_global_address(<4 x i32> %v) {
 ; SIMD128-VM-NOT: v128
 ; SIMD128-NEXT: .param i32{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_v2i64(<2 x i64>* %p) {
   %v = load <2 x i64>, <2 x i64>* %p
@@ -615,7 +615,7 @@ define <2 x i64> @load_v2i64(<2 x i64>* %p) {
 ; SIMD128-VM-NOT: v128
 ; SIMD128-NEXT: .param i32{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_v2i64_with_folded_offset(<2 x i64>* %p) {
   %q = ptrtoint <2 x i64>* %p to i32
@@ -630,7 +630,7 @@ define <2 x i64> @load_v2i64_with_folded_offset(<2 x i64>* %p) {
 ; SIMD128-VM-NOT: v128
 ; SIMD128-NEXT: .param i32{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_v2i64_with_folded_gep_offset(<2 x i64>* %p) {
   %s = getelementptr inbounds <2 x i64>, <2 x i64>* %p, i32 1
@@ -645,7 +645,7 @@ define <2 x i64> @load_v2i64_with_folded_gep_offset(<2 x i64>* %p) {
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_v2i64_with_unfolded_gep_negative_offset(<2 x i64>* %p) {
   %s = getelementptr inbounds <2 x i64>, <2 x i64>* %p, i32 -1
@@ -660,7 +660,7 @@ define <2 x i64> @load_v2i64_with_unfolded_gep_negative_offset(<2 x i64>* %p) {
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_v2i64_with_unfolded_offset(<2 x i64>* %p) {
   %q = ptrtoint <2 x i64>* %p to i32
@@ -677,7 +677,7 @@ define <2 x i64> @load_v2i64_with_unfolded_offset(<2 x i64>* %p) {
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_v2i64_with_unfolded_gep_offset(<2 x i64>* %p) {
   %s = getelementptr <2 x i64>, <2 x i64>* %p, i32 1
@@ -690,7 +690,7 @@ define <2 x i64> @load_v2i64_with_unfolded_gep_offset(<2 x i64>* %p) {
 ; SIMD128-VM-NOT: v128
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 32($pop[[L0]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 32($pop[[L0]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_v2i64_from_numeric_address() {
   %s = inttoptr i32 32 to <2 x i64>*
@@ -703,7 +703,7 @@ define <2 x i64> @load_v2i64_from_numeric_address() {
 ; SIMD128-VM-NOT: v128
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, gv_v2i64($pop[[L0]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, gv_v2i64($pop[[L0]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 @gv_v2i64 = global <2 x i64> <i64 42, i64 42>
 define <2 x i64> @load_v2i64_from_global_address() {
@@ -715,7 +715,7 @@ define <2 x i64> @load_v2i64_from_global_address() {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
 ; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: v128.store 0($1):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($1), $0{{$}}
 define void @store_v2i64(<2 x i64> %v, <2 x i64>* %p) {
   store <2 x i64> %v , <2 x i64>* %p
   ret void
@@ -725,7 +725,7 @@ define void @store_v2i64(<2 x i64> %v, <2 x i64>* %p) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
 ; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: v128.store 16($1):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v2i64_with_folded_offset(<2 x i64> %v, <2 x i64>* %p) {
   %q = ptrtoint <2 x i64>* %p to i32
   %r = add nuw i32 %q, 16
@@ -738,7 +738,7 @@ define void @store_v2i64_with_folded_offset(<2 x i64> %v, <2 x i64>* %p) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
 ; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: v128.store 16($1):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v2i64_with_folded_gep_offset(<2 x i64> %v, <2 x i64>* %p) {
   %s = getelementptr inbounds <2 x i64>, <2 x i64>* %p, i32 1
   store <2 x i64> %v , <2 x i64>* %s
@@ -751,7 +751,7 @@ define void @store_v2i64_with_folded_gep_offset(<2 x i64> %v, <2 x i64>* %p) {
 ; SIMD128-NEXT: .param v128, i32{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v2i64_with_unfolded_gep_negative_offset(<2 x i64> %v, <2 x i64>* %p) {
   %s = getelementptr inbounds <2 x i64>, <2 x i64>* %p, i32 -1
   store <2 x i64> %v , <2 x i64>* %s
@@ -764,7 +764,7 @@ define void @store_v2i64_with_unfolded_gep_negative_offset(<2 x i64> %v, <2 x i6
 ; SIMD128-NEXT: .param v128, i32{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v2i64_with_unfolded_offset(<2 x i64> %v, <2 x i64>* %p) {
   %s = getelementptr inbounds <2 x i64>, <2 x i64>* %p, i32 -1
   store <2 x i64> %v , <2 x i64>* %s
@@ -777,7 +777,7 @@ define void @store_v2i64_with_unfolded_offset(<2 x i64> %v, <2 x i64>* %p) {
 ; SIMD128-NEXT: .param v128, i32{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v2i64_with_unfolded_gep_offset(<2 x i64> %v, <2 x i64>* %p) {
   %s = getelementptr <2 x i64>, <2 x i64>* %p, i32 1
   store <2 x i64> %v , <2 x i64>* %s
@@ -789,7 +789,7 @@ define void @store_v2i64_with_unfolded_gep_offset(<2 x i64> %v, <2 x i64>* %p) {
 ; SIMD128-VM-NOT: v128
 ; SIMD128-NEXT: .param v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.store 32($pop[[L0]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 32($pop[[L0]]), $0{{$}}
 define void @store_v2i64_to_numeric_address(<2 x i64> %v) {
   %s = inttoptr i32 32 to <2 x i64>*
   store <2 x i64> %v , <2 x i64>* %s
@@ -801,7 +801,7 @@ define void @store_v2i64_to_numeric_address(<2 x i64> %v) {
 ; SIMD128-VM-NOT: v128
 ; SIMD128-NEXT: .param v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[R:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.store gv_v2i64($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store gv_v2i64($pop[[R]]), $0{{$}}
 define void @store_v2i64_to_global_address(<2 x i64> %v) {
   store <2 x i64> %v , <2 x i64>* @gv_v2i64
   ret void
@@ -814,7 +814,7 @@ define void @store_v2i64_to_global_address(<2 x i64> %v) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param i32{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @load_v4f32(<4 x float>* %p) {
   %v = load <4 x float>, <4 x float>* %p
@@ -825,7 +825,7 @@ define <4 x float> @load_v4f32(<4 x float>* %p) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param i32{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @load_v4f32_with_folded_offset(<4 x float>* %p) {
   %q = ptrtoint <4 x float>* %p to i32
@@ -839,7 +839,7 @@ define <4 x float> @load_v4f32_with_folded_offset(<4 x float>* %p) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param i32{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @load_v4f32_with_folded_gep_offset(<4 x float>* %p) {
   %s = getelementptr inbounds <4 x float>, <4 x float>* %p, i32 1
@@ -853,7 +853,7 @@ define <4 x float> @load_v4f32_with_folded_gep_offset(<4 x float>* %p) {
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @load_v4f32_with_unfolded_gep_negative_offset(<4 x float>* %p) {
   %s = getelementptr inbounds <4 x float>, <4 x float>* %p, i32 -1
@@ -867,7 +867,7 @@ define <4 x float> @load_v4f32_with_unfolded_gep_negative_offset(<4 x float>* %p
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @load_v4f32_with_unfolded_offset(<4 x float>* %p) {
   %q = ptrtoint <4 x float>* %p to i32
@@ -883,7 +883,7 @@ define <4 x float> @load_v4f32_with_unfolded_offset(<4 x float>* %p) {
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @load_v4f32_with_unfolded_gep_offset(<4 x float>* %p) {
   %s = getelementptr <4 x float>, <4 x float>* %p, i32 1
@@ -895,7 +895,7 @@ define <4 x float> @load_v4f32_with_unfolded_gep_offset(<4 x float>* %p) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 32($pop[[L0]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 32($pop[[L0]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @load_v4f32_from_numeric_address() {
   %s = inttoptr i32 32 to <4 x float>*
@@ -907,7 +907,7 @@ define <4 x float> @load_v4f32_from_numeric_address() {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, gv_v4f32($pop[[L0]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, gv_v4f32($pop[[L0]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 @gv_v4f32 = global <4 x float> <float 42., float 42., float 42., float 42.>
 define <4 x float> @load_v4f32_from_global_address() {
@@ -918,7 +918,7 @@ define <4 x float> @load_v4f32_from_global_address() {
 ; CHECK-LABEL: store_v4f32:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: v128.store 0($1):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($1), $0{{$}}
 define void @store_v4f32(<4 x float> %v, <4 x float>* %p) {
   store <4 x float> %v , <4 x float>* %p
   ret void
@@ -927,7 +927,7 @@ define void @store_v4f32(<4 x float> %v, <4 x float>* %p) {
 ; CHECK-LABEL: store_v4f32_with_folded_offset:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: v128.store 16($1):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v4f32_with_folded_offset(<4 x float> %v, <4 x float>* %p) {
   %q = ptrtoint <4 x float>* %p to i32
   %r = add nuw i32 %q, 16
@@ -939,7 +939,7 @@ define void @store_v4f32_with_folded_offset(<4 x float> %v, <4 x float>* %p) {
 ; CHECK-LABEL: store_v4f32_with_folded_gep_offset:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: v128.store 16($1):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v4f32_with_folded_gep_offset(<4 x float> %v, <4 x float>* %p) {
   %s = getelementptr inbounds <4 x float>, <4 x float>* %p, i32 1
   store <4 x float> %v , <4 x float>* %s
@@ -951,7 +951,7 @@ define void @store_v4f32_with_folded_gep_offset(<4 x float> %v, <4 x float>* %p)
 ; SIMD128-NEXT: .param v128, i32{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v4f32_with_unfolded_gep_negative_offset(<4 x float> %v, <4 x float>* %p) {
   %s = getelementptr inbounds <4 x float>, <4 x float>* %p, i32 -1
   store <4 x float> %v , <4 x float>* %s
@@ -963,7 +963,7 @@ define void @store_v4f32_with_unfolded_gep_negative_offset(<4 x float> %v, <4 x
 ; SIMD128-NEXT: .param v128, i32{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v4f32_with_unfolded_offset(<4 x float> %v, <4 x float>* %p) {
   %s = getelementptr inbounds <4 x float>, <4 x float>* %p, i32 -1
   store <4 x float> %v , <4 x float>* %s
@@ -975,7 +975,7 @@ define void @store_v4f32_with_unfolded_offset(<4 x float> %v, <4 x float>* %p) {
 ; SIMD128-NEXT: .param v128, i32{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v4f32_with_unfolded_gep_offset(<4 x float> %v, <4 x float>* %p) {
   %s = getelementptr <4 x float>, <4 x float>* %p, i32 1
   store <4 x float> %v , <4 x float>* %s
@@ -986,7 +986,7 @@ define void @store_v4f32_with_unfolded_gep_offset(<4 x float> %v, <4 x float>* %
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.store 32($pop[[L0]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 32($pop[[L0]]), $0{{$}}
 define void @store_v4f32_to_numeric_address(<4 x float> %v) {
   %s = inttoptr i32 32 to <4 x float>*
   store <4 x float> %v , <4 x float>* %s
@@ -997,7 +997,7 @@ define void @store_v4f32_to_numeric_address(<4 x float> %v) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[R:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.store gv_v4f32($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store gv_v4f32($pop[[R]]), $0{{$}}
 define void @store_v4f32_to_global_address(<4 x float> %v) {
   store <4 x float> %v , <4 x float>* @gv_v4f32
   ret void
@@ -1011,7 +1011,7 @@ define void @store_v4f32_to_global_address(<4 x float> %v) {
 ; SIMD128-VM-NOT: v128
 ; SIMD128-NEXT: .param i32{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @load_v2f64(<2 x double>* %p) {
   %v = load <2 x double>, <2 x double>* %p
@@ -1023,7 +1023,7 @@ define <2 x double> @load_v2f64(<2 x double>* %p) {
 ; SIMD128-VM-NOT: v128
 ; SIMD128-NEXT: .param i32{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @load_v2f64_with_folded_offset(<2 x double>* %p) {
   %q = ptrtoint <2 x double>* %p to i32
@@ -1038,7 +1038,7 @@ define <2 x double> @load_v2f64_with_folded_offset(<2 x double>* %p) {
 ; SIMD128-VM-NOT: v128
 ; SIMD128-NEXT: .param i32{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @load_v2f64_with_folded_gep_offset(<2 x double>* %p) {
   %s = getelementptr inbounds <2 x double>, <2 x double>* %p, i32 1
@@ -1053,7 +1053,7 @@ define <2 x double> @load_v2f64_with_folded_gep_offset(<2 x double>* %p) {
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @load_v2f64_with_unfolded_gep_negative_offset(<2 x double>* %p) {
   %s = getelementptr inbounds <2 x double>, <2 x double>* %p, i32 -1
@@ -1068,7 +1068,7 @@ define <2 x double> @load_v2f64_with_unfolded_gep_negative_offset(<2 x double>*
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @load_v2f64_with_unfolded_offset(<2 x double>* %p) {
   %q = ptrtoint <2 x double>* %p to i32
@@ -1085,7 +1085,7 @@ define <2 x double> @load_v2f64_with_unfolded_offset(<2 x double>* %p) {
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @load_v2f64_with_unfolded_gep_offset(<2 x double>* %p) {
   %s = getelementptr <2 x double>, <2 x double>* %p, i32 1
@@ -1098,7 +1098,7 @@ define <2 x double> @load_v2f64_with_unfolded_gep_offset(<2 x double>* %p) {
 ; SIMD128-VM-NOT: v128
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 32($pop[[L0]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 32($pop[[L0]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @load_v2f64_from_numeric_address() {
   %s = inttoptr i32 32 to <2 x double>*
@@ -1111,7 +1111,7 @@ define <2 x double> @load_v2f64_from_numeric_address() {
 ; SIMD128-VM-NOT: v128
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, gv_v2f64($pop[[L0]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, gv_v2f64($pop[[L0]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 @gv_v2f64 = global <2 x double> <double 42., double 42.>
 define <2 x double> @load_v2f64_from_global_address() {
@@ -1123,7 +1123,7 @@ define <2 x double> @load_v2f64_from_global_address() {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
 ; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: v128.store 0($1):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($1), $0{{$}}
 define void @store_v2f64(<2 x double> %v, <2 x double>* %p) {
   store <2 x double> %v , <2 x double>* %p
   ret void
@@ -1133,7 +1133,7 @@ define void @store_v2f64(<2 x double> %v, <2 x double>* %p) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
 ; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: v128.store 16($1):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v2f64_with_folded_offset(<2 x double> %v, <2 x double>* %p) {
   %q = ptrtoint <2 x double>* %p to i32
   %r = add nuw i32 %q, 16
@@ -1146,7 +1146,7 @@ define void @store_v2f64_with_folded_offset(<2 x double> %v, <2 x double>* %p) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
 ; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: v128.store 16($1):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v2f64_with_folded_gep_offset(<2 x double> %v, <2 x double>* %p) {
   %s = getelementptr inbounds <2 x double>, <2 x double>* %p, i32 1
   store <2 x double> %v , <2 x double>* %s
@@ -1159,7 +1159,7 @@ define void @store_v2f64_with_folded_gep_offset(<2 x double> %v, <2 x double>* %
 ; SIMD128-NEXT: .param v128, i32{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v2f64_with_unfolded_gep_negative_offset(<2 x double> %v, <2 x double>* %p) {
   %s = getelementptr inbounds <2 x double>, <2 x double>* %p, i32 -1
   store <2 x double> %v , <2 x double>* %s
@@ -1172,7 +1172,7 @@ define void @store_v2f64_with_unfolded_gep_negative_offset(<2 x double> %v, <2 x
 ; SIMD128-NEXT: .param v128, i32{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v2f64_with_unfolded_offset(<2 x double> %v, <2 x double>* %p) {
   %s = getelementptr inbounds <2 x double>, <2 x double>* %p, i32 -1
   store <2 x double> %v , <2 x double>* %s
@@ -1185,7 +1185,7 @@ define void @store_v2f64_with_unfolded_offset(<2 x double> %v, <2 x double>* %p)
 ; SIMD128-NEXT: .param v128, i32{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v2f64_with_unfolded_gep_offset(<2 x double> %v, <2 x double>* %p) {
   %s = getelementptr <2 x double>, <2 x double>* %p, i32 1
   store <2 x double> %v , <2 x double>* %s
@@ -1197,7 +1197,7 @@ define void @store_v2f64_with_unfolded_gep_offset(<2 x double> %v, <2 x double>*
 ; SIMD128-VM-NOT: v128
 ; SIMD128-NEXT: .param v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.store 32($pop[[L0]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 32($pop[[L0]]), $0{{$}}
 define void @store_v2f64_to_numeric_address(<2 x double> %v) {
   %s = inttoptr i32 32 to <2 x double>*
   store <2 x double> %v , <2 x double>* %s
@@ -1209,7 +1209,7 @@ define void @store_v2f64_to_numeric_address(<2 x double> %v) {
 ; SIMD128-VM-NOT: v128
 ; SIMD128-NEXT: .param v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[R:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.store gv_v2f64($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store gv_v2f64($pop[[R]]), $0{{$}}
 define void @store_v2f64_to_global_address(<2 x double> %v) {
   store <2 x double> %v , <2 x double>* @gv_v2f64
   ret void
-- 
GitLab


From f4bf727b25104cc8b0682f1eaf21865ff7dbf05c Mon Sep 17 00:00:00 2001
From: Thomas Lively <tlively@google.com>
Date: Thu, 1 Nov 2018 00:01:02 +0000
Subject: [PATCH 0839/1116] [WebAssembly] Lower vselect

Reviewers: aheejin, dschuff

Subscribers: sbc100, jgravelle-google, sunfish, llvm-commits

Differential Revision: https://reviews.llvm.org/D53630

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345797 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../WebAssembly/WebAssemblyISelLowering.cpp   |  9 ++
 test/CodeGen/WebAssembly/simd-vselect.ll      | 90 +++++++++++++++++++
 2 files changed, 99 insertions(+)
 create mode 100644 test/CodeGen/WebAssembly/simd-vselect.ll

diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 06414c27318..70720e0c32c 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -151,6 +151,15 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
     for (auto Op : {ISD::SHL, ISD::SRA, ISD::SRL})
       setOperationAction(Op, MVT::v2i64, Custom);
 
+  // There is no select instruction for vectors
+  if (Subtarget->hasSIMD128()) {
+    for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32})
+      setOperationAction(ISD::VSELECT, T, Expand);
+    if (EnableUnimplementedWasmSIMDInstrs)
+      for (auto T : {MVT::v2i64, MVT::v2f64})
+        setOperationAction(ISD::VSELECT, T, Expand);
+  }
+
   // As a special case, these operators use the type to mean the type to
   // sign-extend from.
   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
diff --git a/test/CodeGen/WebAssembly/simd-vselect.ll b/test/CodeGen/WebAssembly/simd-vselect.ll
new file mode 100644
index 00000000000..fd020511cb1
--- /dev/null
+++ b/test/CodeGen/WebAssembly/simd-vselect.ll
@@ -0,0 +1,90 @@
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -wasm-enable-unimplemented-simd -mattr=+simd128,+sign-ext | FileCheck %s
+
+; Test that lanewise vector selects lower correctly to bitselects
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+; CHECK-LABEL: vselect_v16i8:
+; CHECK-NEXT: .param v128, v128, v128{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 7{{$}}
+; CHECK-NEXT: i8x16.shl $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
+; CHECK-NEXT: i32.const $push[[L2:[0-9]+]]=, 7{{$}}
+; CHECK-NEXT: i8x16.shr_s $push[[L3:[0-9]+]]=, $pop[[L1]], $pop[[L2]]{{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @vselect_v16i8(<16 x i1> %c, <16 x i8> %x, <16 x i8> %y) {
+  %res = select <16 x i1> %c, <16 x i8> %x, <16 x i8> %y
+  ret <16 x i8> %res
+}
+
+; CHECK-LABEL: vselect_v8i16:
+; CHECK-NEXT: .param v128, v128, v128{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 15{{$}}
+; CHECK-NEXT: i16x8.shl $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
+; CHECK-NEXT: i32.const $push[[L2:[0-9]+]]=, 15{{$}}
+; CHECK-NEXT: i16x8.shr_s $push[[L3:[0-9]+]]=, $pop[[L1]], $pop[[L2]]{{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @vselect_v8i16(<8 x i1> %c, <8 x i16> %x, <8 x i16> %y) {
+  %res = select <8 x i1> %c, <8 x i16> %x, <8 x i16> %y
+  ret <8 x i16> %res
+}
+
+; CHECK-LABEL: vselect_v4i32:
+; CHECK-NEXT: .param v128, v128, v128{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 31{{$}}
+; CHECK-NEXT: i32x4.shl $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
+; CHECK-NEXT: i32.const $push[[L2:[0-9]+]]=, 31{{$}}
+; CHECK-NEXT: i32x4.shr_s $push[[L3:[0-9]+]]=, $pop[[L1]], $pop[[L2]]{{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @vselect_v4i32(<4 x i1> %c, <4 x i32> %x, <4 x i32> %y) {
+  %res = select <4 x i1> %c, <4 x i32> %x, <4 x i32> %y
+  ret <4 x i32> %res
+}
+
+; CHECK-LABEL: vselect_v2i64:
+; CHECK-NEXT: .param v128, v128, v128{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 63{{$}}
+; CHECK-NEXT: i64x2.shl $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
+; CHECK-NEXT: i32.const $push[[L2:[0-9]+]]=, 63{{$}}
+; CHECK-NEXT: i64x2.shr_s $push[[L3:[0-9]+]]=, $pop[[L1]], $pop[[L2]]{{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @vselect_v2i64(<2 x i1> %c, <2 x i64> %x, <2 x i64> %y) {
+  %res = select <2 x i1> %c, <2 x i64> %x, <2 x i64> %y
+  ret <2 x i64> %res
+}
+
+; CHECK-LABEL: vselect_v4f32:
+; CHECK-NEXT: .param v128, v128, v128{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 31{{$}}
+; CHECK-NEXT: i32x4.shl $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
+; CHECK-NEXT: i32.const $push[[L2:[0-9]+]]=, 31{{$}}
+; CHECK-NEXT: i32x4.shr_s $push[[L3:[0-9]+]]=, $pop[[L1]], $pop[[L2]]{{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <4 x float> @vselect_v4f32(<4 x i1> %c, <4 x float> %x, <4 x float> %y) {
+  %res = select <4 x i1> %c, <4 x float> %x, <4 x float> %y
+  ret <4 x float> %res
+}
+
+; CHECK-LABEL: vselect_v2f64:
+; CHECK-NEXT: .param v128, v128, v128{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 63{{$}}
+; CHECK-NEXT: i64x2.shl $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
+; CHECK-NEXT: i32.const $push[[L2:[0-9]+]]=, 63{{$}}
+; CHECK-NEXT: i64x2.shr_s $push[[L3:[0-9]+]]=, $pop[[L1]], $pop[[L2]]{{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <2 x double> @vselect_v2f64(<2 x i1> %c, <2 x double> %x, <2 x double> %y) {
+  %res = select <2 x i1> %c, <2 x double> %x, <2 x double> %y
+  ret <2 x double> %res
+}
-- 
GitLab


From 3fe1b12fca949399a3334a072ee7f96e2b6f557e Mon Sep 17 00:00:00 2001
From: Dean Michael Berris <dberris@google.com>
Date: Thu, 1 Nov 2018 00:18:52 +0000
Subject: [PATCH 0840/1116] [XRay] Add CPU ID in Custom Event FDR Records

Summary:
This change cuts across compiler-rt and llvm, to increment the FDR log
version number to 4, and include the CPU ID in the custom event records.

This is a step towards allowing us to change the `llvm::xray::Trace`
object to start representing both custom and typed events in the stream
of records. Follow-on changes will allow us to change the kinds of
records we're presenting in the stream of traces, to incorporate the
data in custom/typed events.

A follow-on change will handle the typed event case, where it may not
fit within the 15-byte buffer for metadata records.

This work is part of the larger effort to enable writing analysis and
processing tools using a common in-memory representation of the events
found in traces. The work will focus on porting existing tools in LLVM
to use the common representation and informing the design of a
library/framework for expressing trace event analysis as C++ programs.

Reviewers: mboerger, eizan

Subscribers: hiraditya, mgrang, llvm-commits

Differential Revision: https://reviews.llvm.org/D53920

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345798 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/XRay/FDRRecords.h             | 14 +++++++++++---
 lib/XRay/FDRTraceWriter.cpp                |  7 ++++---
 lib/XRay/FileHeaderReader.cpp              |  3 +--
 lib/XRay/RecordInitializer.cpp             | 13 +++++++++++++
 lib/XRay/RecordPrinter.cpp                 |  5 +++--
 lib/XRay/Trace.cpp                         | 13 ++++++-------
 unittests/XRay/FDRProducerConsumerTest.cpp |  2 +-
 unittests/XRay/FDRRecordPrinterTest.cpp    |  4 ++--
 8 files changed, 41 insertions(+), 20 deletions(-)

diff --git a/include/llvm/XRay/FDRRecords.h b/include/llvm/XRay/FDRRecords.h
index c524dab2a33..2d47ab3cfe5 100644
--- a/include/llvm/XRay/FDRRecords.h
+++ b/include/llvm/XRay/FDRRecords.h
@@ -153,13 +153,14 @@ public:
 class CustomEventRecord : public MetadataRecord {
   int32_t Size = 0;
   uint64_t TSC = 0;
+  uint16_t CPU = 0;
   std::string Data{};
   friend class RecordInitializer;
 
 public:
   CustomEventRecord() = default;
-  explicit CustomEventRecord(uint64_t S, uint64_t T, std::string D)
-      : MetadataRecord(), Size(S), TSC(T), Data(std::move(D)) {}
+  explicit CustomEventRecord(uint64_t S, uint64_t T, uint16_t C, std::string D)
+      : MetadataRecord(), Size(S), TSC(T), CPU(C), Data(std::move(D)) {}
 
   MetadataType metadataType() const override {
     return MetadataType::CustomEvent;
@@ -167,6 +168,7 @@ public:
 
   int32_t size() const { return Size; }
   uint64_t tsc() const { return TSC; }
+  uint16_t cpu() const { return CPU; }
   StringRef data() const { return Data; }
 
   Error apply(RecordVisitor &V) override;
@@ -272,10 +274,16 @@ public:
 class RecordInitializer : public RecordVisitor {
   DataExtractor &E;
   uint32_t &OffsetPtr;
+  uint16_t Version;
 
 public:
+  static constexpr uint16_t DefaultVersion = 4u;
+
+  explicit RecordInitializer(DataExtractor &DE, uint32_t &OP, uint16_t V)
+      : RecordVisitor(), E(DE), OffsetPtr(OP), Version(V) {}
+
   explicit RecordInitializer(DataExtractor &DE, uint32_t &OP)
-      : RecordVisitor(), E(DE), OffsetPtr(OP) {}
+      : RecordInitializer(DE, OP, DefaultVersion) {}
 
   Error visit(BufferExtents &) override;
   Error visit(WallclockRecord &) override;
diff --git a/lib/XRay/FDRTraceWriter.cpp b/lib/XRay/FDRTraceWriter.cpp
index d0206e775a8..4f40593cba0 100644
--- a/lib/XRay/FDRTraceWriter.cpp
+++ b/lib/XRay/FDRTraceWriter.cpp
@@ -94,9 +94,10 @@ Error FDRTraceWriter::visit(TSCWrapRecord &R) {
 }
 
 Error FDRTraceWriter::visit(CustomEventRecord &R) {
-  if (auto E = writeMetadata<5u>(OS, R.size(), R.tsc()))
+  if (auto E = writeMetadata<5u>(OS, R.size(), R.tsc(), R.cpu()))
     return E;
-  ArrayRef<char> Bytes(R.data().data(), R.data().size());
+  auto D = R.data();
+  ArrayRef<char> Bytes(D.data(), D.size());
   OS.write(Bytes);
   return Error::success();
 }
@@ -127,7 +128,7 @@ Error FDRTraceWriter::visit(FunctionRecord &R) {
   OS.write(TypeRecordFuncId);
   OS.write(R.delta());
   return Error::success();
-} // namespace xray
+}
 
 } // namespace xray
 } // namespace llvm
diff --git a/lib/XRay/FileHeaderReader.cpp b/lib/XRay/FileHeaderReader.cpp
index 967e85f30d2..9dea217840b 100644
--- a/lib/XRay/FileHeaderReader.cpp
+++ b/lib/XRay/FileHeaderReader.cpp
@@ -63,8 +63,7 @@ Expected<XRayFileHeader> readBinaryFormatHeader(DataExtractor &HeaderExtractor,
   // Manually advance the offset pointer 16 bytes, after getting a raw memcpy
   // from the underlying data.
   OffsetPtr += 16;
-  if (FileHeader.Version != 1 && FileHeader.Version != 2 &&
-      FileHeader.Version != 3)
+  if (FileHeader.Version < 1 || FileHeader.Version > 4)
     return createStringError(std::make_error_code(std::errc::invalid_argument),
                              "Unsupported XRay file version: %d at offset %d",
                              FileHeader.Version, OffsetPtr);
diff --git a/lib/XRay/RecordInitializer.cpp b/lib/XRay/RecordInitializer.cpp
index fe76f7d79fb..2ebaa1cec26 100644
--- a/lib/XRay/RecordInitializer.cpp
+++ b/lib/XRay/RecordInitializer.cpp
@@ -118,6 +118,19 @@ Error RecordInitializer::visit(CustomEventRecord &R) {
         std::make_error_code(std::errc::invalid_argument),
         "Cannot read a custom event TSC field at offset %d.", OffsetPtr);
 
+  // For version 4 onwards, of the FDR log, we want to also capture the CPU ID
+  // of the custom event.
+  if (Version >= 4) {
+    PreReadOffset = OffsetPtr;
+    R.CPU = E.getU16(&OffsetPtr);
+    if (PreReadOffset == OffsetPtr)
+      return createStringError(
+          std::make_error_code(std::errc::invalid_argument),
+          "Missing CPU field at offset %d", OffsetPtr);
+  }
+
+  assert(OffsetPtr > BeginOffset &&
+         OffsetPtr - BeginOffset <= MetadataRecord::kMetadataBodySize);
   OffsetPtr += MetadataRecord::kMetadataBodySize - (OffsetPtr - BeginOffset);
 
   // Next we read in a fixed chunk of data from the given offset.
diff --git a/lib/XRay/RecordPrinter.cpp b/lib/XRay/RecordPrinter.cpp
index 09b25ddba25..81d77f67cc1 100644
--- a/lib/XRay/RecordPrinter.cpp
+++ b/lib/XRay/RecordPrinter.cpp
@@ -35,8 +35,9 @@ Error RecordPrinter::visit(TSCWrapRecord &R) {
 }
 
 Error RecordPrinter::visit(CustomEventRecord &R) {
-  OS << formatv("<Custom Event: tsc = {0}, size = {1}, data = '{2}'>", R.tsc(),
-                R.size(), R.data())
+  OS << formatv(
+            "<Custom Event: tsc = {0}, cpu = {1}, size = {2}, data = '{3}'>",
+            R.tsc(), R.cpu(), R.size(), R.data())
      << Delim;
   return Error::success();
 }
diff --git a/lib/XRay/Trace.cpp b/lib/XRay/Trace.cpp
index 1d7c723864d..e7b878cb83f 100644
--- a/lib/XRay/Trace.cpp
+++ b/lib/XRay/Trace.cpp
@@ -310,12 +310,11 @@ Error loadFDRLog(StringRef Data, bool IsLittleEndian,
   {
     for (auto &PTB : Index) {
       auto &Blocks = PTB.second;
-      llvm::sort(
-          Blocks,
-          [](const BlockIndexer::Block &L, const BlockIndexer::Block &R) {
-            return (L.WallclockTime->seconds() < R.WallclockTime->seconds() &&
-                    L.WallclockTime->nanos() < R.WallclockTime->nanos());
-          });
+      llvm::sort(Blocks, [](const BlockIndexer::Block &L,
+                            const BlockIndexer::Block &R) {
+        return (L.WallclockTime->seconds() < R.WallclockTime->seconds() &&
+                L.WallclockTime->nanos() < R.WallclockTime->nanos());
+      });
       auto Adder = [&](const XRayRecord &R) { Records.push_back(R); };
       TraceExpander Expander(Adder, FileHeader.Version);
       for (auto &B : Blocks) {
@@ -435,7 +434,7 @@ Expected<Trace> llvm::xray::loadTrace(const DataExtractor &DE, bool Sort) {
     }
     break;
   case FLIGHT_DATA_RECORDER_FORMAT:
-    if (Version == 1 || Version == 2 || Version == 3) {
+    if (Version >= 1 && Version <= 4) {
       if (auto E = loadFDRLog(DE.getData(), DE.isLittleEndian(), T.FileHeader,
                               T.Records))
         return std::move(E);
diff --git a/unittests/XRay/FDRProducerConsumerTest.cpp b/unittests/XRay/FDRProducerConsumerTest.cpp
index 838e6ca9bf1..09ec44db26e 100644
--- a/unittests/XRay/FDRProducerConsumerTest.cpp
+++ b/unittests/XRay/FDRProducerConsumerTest.cpp
@@ -54,7 +54,7 @@ template <> std::unique_ptr<Record> MakeRecord<WallclockRecord>() {
 }
 
 template <> std::unique_ptr<Record> MakeRecord<CustomEventRecord>() {
-  return make_unique<CustomEventRecord>(4, 1, "data");
+  return make_unique<CustomEventRecord>(4, 1, 2, "data");
 }
 
 template <> std::unique_ptr<Record> MakeRecord<CallArgRecord>() {
diff --git a/unittests/XRay/FDRRecordPrinterTest.cpp b/unittests/XRay/FDRRecordPrinterTest.cpp
index 339d4b0d428..321892e7240 100644
--- a/unittests/XRay/FDRRecordPrinterTest.cpp
+++ b/unittests/XRay/FDRRecordPrinterTest.cpp
@@ -55,11 +55,11 @@ template <> struct Helper<TSCWrapRecord> {
 
 template <> struct Helper<CustomEventRecord> {
   static std::unique_ptr<Record> construct() {
-    return make_unique<CustomEventRecord>(4, 1, "data");
+    return make_unique<CustomEventRecord>(4, 1, 2, "data");
   }
 
   static const char *expected() {
-    return "<Custom Event: tsc = 1, size = 4, data = 'data'>";
+    return "<Custom Event: tsc = 1, cpu = 2, size = 4, data = 'data'>";
   }
 };
 
-- 
GitLab


From e19f1300d602cb47372a155a71e336e6463aafc0 Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze@braunis.de>
Date: Thu, 1 Nov 2018 00:38:01 +0000
Subject: [PATCH 0841/1116] X86: Consistently declare pass initializers in
 X86.h; NFC

This avoids declaring them twice: in X86TargetMachine.cpp and the file
implementing the pass.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345801 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/ShadowCallStack.cpp              |  4 ----
 lib/Target/X86/X86.h                            | 17 +++++++++++++----
 .../X86/X86AvoidStoreForwardingBlocks.cpp       |  4 ----
 lib/Target/X86/X86CallFrameOptimization.cpp     |  4 ----
 lib/Target/X86/X86CmovConversion.cpp            |  6 ------
 lib/Target/X86/X86DomainReassignment.cpp        |  4 ----
 lib/Target/X86/X86FixupLEAs.cpp                 |  4 ----
 lib/Target/X86/X86SpeculativeLoadHardening.cpp  |  6 ------
 lib/Target/X86/X86TargetMachine.cpp             | 15 ---------------
 lib/Target/X86/X86WinEHState.cpp                |  4 ----
 10 files changed, 13 insertions(+), 55 deletions(-)

diff --git a/lib/Target/X86/ShadowCallStack.cpp b/lib/Target/X86/ShadowCallStack.cpp
index 9a39455f9dd..ab2cebcb58e 100644
--- a/lib/Target/X86/ShadowCallStack.cpp
+++ b/lib/Target/X86/ShadowCallStack.cpp
@@ -31,10 +31,6 @@
 
 using namespace llvm;
 
-namespace llvm {
-void initializeShadowCallStackPass(PassRegistry &);
-}
-
 namespace {
 
 class ShadowCallStack : public MachineFunctionPass {
diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h
index d5405703fdf..19f8e35ade0 100644
--- a/lib/Target/X86/X86.h
+++ b/lib/Target/X86/X86.h
@@ -115,8 +115,6 @@ FunctionPass *createX86FixupBWInsts();
 /// to another, when profitable.
 FunctionPass *createX86DomainReassignmentPass();
 
-void initializeFixupBWInstPassPass(PassRegistry &);
-
 /// This pass replaces EVEX encoded of AVX-512 instructiosn by VEX
 /// encoding when possible in order to reduce code size.
 FunctionPass *createX86EvexToVexInsts();
@@ -128,10 +126,21 @@ InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM,
                                                   X86Subtarget &,
                                                   X86RegisterBankInfo &);
 
-void initializeEvexToVexInstPassPass(PassRegistry &);
-
 FunctionPass *createX86SpeculativeLoadHardeningPass();
 
+void initializeEvexToVexInstPassPass(PassRegistry &);
+void initializeFixupBWInstPassPass(PassRegistry &);
+void initializeFixupLEAPassPass(PassRegistry &);
+void initializeShadowCallStackPass(PassRegistry &);
+void initializeWinEHStatePassPass(PassRegistry &);
+void initializeX86AvoidSFBPassPass(PassRegistry &);
+void initializeX86CallFrameOptimizationPass(PassRegistry &);
+void initializeX86CmovConverterPassPass(PassRegistry &);
+void initializeX86DomainReassignmentPass(PassRegistry &);
+void initializeX86ExecutionDomainFixPass(PassRegistry &);
+void initializeX86FlagsCopyLoweringPassPass(PassRegistry &);
+void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
+
 } // End llvm namespace
 
 #endif
diff --git a/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp b/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
index ab2cbfc33e1..eb9c4b3e597 100644
--- a/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
+++ b/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
@@ -52,10 +52,6 @@ using namespace llvm;
 
 #define DEBUG_TYPE "x86-avoid-SFB"
 
-namespace llvm {
-void initializeX86AvoidSFBPassPass(PassRegistry &);
-} // end namespace llvm
-
 static cl::opt<bool> DisableX86AvoidStoreForwardBlocks(
     "x86-disable-avoid-SFB", cl::Hidden,
     cl::desc("X86: Disable Store Forwarding Blocks fixup."), cl::init(false));
diff --git a/lib/Target/X86/X86CallFrameOptimization.cpp b/lib/Target/X86/X86CallFrameOptimization.cpp
index c73fd6eb144..24d7a219e75 100644
--- a/lib/Target/X86/X86CallFrameOptimization.cpp
+++ b/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -56,10 +56,6 @@ static cl::opt<bool>
                cl::desc("Avoid optimizing x86 call frames for size"),
                cl::init(false), cl::Hidden);
 
-namespace llvm {
-void initializeX86CallFrameOptimizationPass(PassRegistry &);
-}
-
 namespace {
 
 class X86CallFrameOptimization : public MachineFunctionPass {
diff --git a/lib/Target/X86/X86CmovConversion.cpp b/lib/Target/X86/X86CmovConversion.cpp
index 1c5f110d8c6..c3e76fd2a85 100644
--- a/lib/Target/X86/X86CmovConversion.cpp
+++ b/lib/Target/X86/X86CmovConversion.cpp
@@ -81,12 +81,6 @@ STATISTIC(NumOfCmovGroupCandidate, "Number of CMOV-group candidates");
 STATISTIC(NumOfLoopCandidate, "Number of CMOV-conversion profitable loops");
 STATISTIC(NumOfOptimizedCmovGroups, "Number of optimized CMOV-groups");
 
-namespace llvm {
-
-void initializeX86CmovConverterPassPass(PassRegistry &);
-
-} // end namespace llvm
-
 // This internal switch can be used to turn off the cmov/branch optimization.
 static cl::opt<bool>
     EnableCmovConverter("x86-cmov-converter",
diff --git a/lib/Target/X86/X86DomainReassignment.cpp b/lib/Target/X86/X86DomainReassignment.cpp
index 62588e9509d..7e1f1e7876c 100644
--- a/lib/Target/X86/X86DomainReassignment.cpp
+++ b/lib/Target/X86/X86DomainReassignment.cpp
@@ -31,10 +31,6 @@
 
 using namespace llvm;
 
-namespace llvm {
-void initializeX86DomainReassignmentPass(PassRegistry &);
-}
-
 #define DEBUG_TYPE "x86-domain-reassignment"
 
 STATISTIC(NumClosuresConverted, "Number of closures converted by the pass");
diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp
index 33a8baac594..ed24d6a8547 100644
--- a/lib/Target/X86/X86FixupLEAs.cpp
+++ b/lib/Target/X86/X86FixupLEAs.cpp
@@ -25,10 +25,6 @@
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
-namespace llvm {
-void initializeFixupLEAPassPass(PassRegistry &);
-}
-
 #define FIXUPLEA_DESC "X86 LEA Fixup"
 #define FIXUPLEA_NAME "x86-fixup-LEAs"
 
diff --git a/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/lib/Target/X86/X86SpeculativeLoadHardening.cpp
index 14e4c455a08..b8cb11fb862 100644
--- a/lib/Target/X86/X86SpeculativeLoadHardening.cpp
+++ b/lib/Target/X86/X86SpeculativeLoadHardening.cpp
@@ -119,12 +119,6 @@ static cl::opt<bool> HardenIndirectCallsAndJumps(
              "mitigate Spectre v1.2 style attacks."),
     cl::init(true), cl::Hidden);
 
-namespace llvm {
-
-void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
-
-} // end namespace llvm
-
 namespace {
 
 class X86SpeculativeLoadHardeningPass : public MachineFunctionPass {
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 3583a9cfb8d..6426e5b076c 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -59,21 +59,6 @@ static cl::opt<bool> EnableCondBrFoldingPass("x86-condbr-folding",
                                         "folding pass"),
                                cl::init(true), cl::Hidden);
 
-namespace llvm {
-
-void initializeWinEHStatePassPass(PassRegistry &);
-void initializeFixupLEAPassPass(PassRegistry &);
-void initializeShadowCallStackPass(PassRegistry &);
-void initializeX86CallFrameOptimizationPass(PassRegistry &);
-void initializeX86CmovConverterPassPass(PassRegistry &);
-void initializeX86ExecutionDomainFixPass(PassRegistry &);
-void initializeX86DomainReassignmentPass(PassRegistry &);
-void initializeX86AvoidSFBPassPass(PassRegistry &);
-void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
-void initializeX86FlagsCopyLoweringPassPass(PassRegistry &);
-
-} // end namespace llvm
-
 extern "C" void LLVMInitializeX86Target() {
   // Register the target.
   RegisterTargetMachine<X86TargetMachine> X(getTheX86_32Target());
diff --git a/lib/Target/X86/X86WinEHState.cpp b/lib/Target/X86/X86WinEHState.cpp
index c11e7e365a1..185deda97c1 100644
--- a/lib/Target/X86/X86WinEHState.cpp
+++ b/lib/Target/X86/X86WinEHState.cpp
@@ -34,10 +34,6 @@ using namespace llvm;
 
 #define DEBUG_TYPE "winehstate"
 
-namespace llvm {
-void initializeWinEHStatePassPass(PassRegistry &);
-}
-
 namespace {
 const int OverdefinedState = INT_MIN;
 
-- 
GitLab


From fb149994336bc03673265d67ac219f4ef249a308 Mon Sep 17 00:00:00 2001
From: Li Jia He <hljhehlj@cn.ibm.com>
Date: Thu, 1 Nov 2018 02:35:17 +0000
Subject: [PATCH 0842/1116] =?UTF-8?q?[PowerPC]=20Support=20constraint=20'w?=
 =?UTF-8?q?i'=20in=20asm=20=20=20From=20the=20gcc=20manual,=20we=20can=20s?=
 =?UTF-8?q?ee=20that=20the=20specific=20limit=20of=20wi=20inline=20asm=20i?=
 =?UTF-8?q?s=20=E2=80=9CFP=20or=20VSX=20register=20to=20hold=2064-bit=20in?=
 =?UTF-8?q?tegers=20for=20VSX=20insns=20or=20NO=5FREGS=E2=80=9D.=20The=20l?=
 =?UTF-8?q?ink=20is=C2=A0https://gcc.gnu.org/onlinedocs/gcc-8.2.0/gcc/Mach?=
 =?UTF-8?q?ine-Constraints.html#Machine-Constraints.=C2=A0We=20should=20ac?=
 =?UTF-8?q?cept=20this=20constraint.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed By: jsji

Differential Revision: https://reviews.llvm.org/D53265


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345810 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/PowerPC/PPCISelLowering.cpp    |  8 ++++++--
 test/CodeGen/PowerPC/inlineasm-vsx-reg.ll | 15 +++++++++++++++
 test/CodeGen/PowerPC/vec-asm-disabled.ll  |  9 +++++++++
 3 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index a135667beaa..4ed110e6663 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -13362,7 +13362,8 @@ PPCTargetLowering::getConstraintType(StringRef Constraint) const {
   } else if (Constraint == "wc") { // individual CR bits.
     return C_RegisterClass;
   } else if (Constraint == "wa" || Constraint == "wd" ||
-             Constraint == "wf" || Constraint == "ws") {
+             Constraint == "wf" || Constraint == "ws" ||
+             Constraint == "wi") {
     return C_RegisterClass; // VSX registers.
   }
   return TargetLowering::getConstraintType(Constraint);
@@ -13392,6 +13393,8 @@ PPCTargetLowering::getSingleConstraintMatchWeight(
     return CW_Register;
   else if (StringRef(constraint) == "ws" && type->isDoubleTy())
     return CW_Register;
+  else if (StringRef(constraint) == "wi" && type->isIntegerTy(64))
+    return CW_Register; // just hold 64-bit integers data.
 
   switch (*constraint) {
   default:
@@ -13474,7 +13477,8 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
     // An individual CR bit.
     return std::make_pair(0U, &PPC::CRBITRCRegClass);
   } else if ((Constraint == "wa" || Constraint == "wd" ||
-             Constraint == "wf") && Subtarget.hasVSX()) {
+             Constraint == "wf" || Constraint == "wi") &&
+             Subtarget.hasVSX()) {
     return std::make_pair(0U, &PPC::VSRCRegClass);
   } else if (Constraint == "ws" && Subtarget.hasVSX()) {
     if (VT == MVT::f32 && Subtarget.hasP8Vector())
diff --git a/test/CodeGen/PowerPC/inlineasm-vsx-reg.ll b/test/CodeGen/PowerPC/inlineasm-vsx-reg.ll
index 9de6358427d..0ebb4493065 100644
--- a/test/CodeGen/PowerPC/inlineasm-vsx-reg.ll
+++ b/test/CodeGen/PowerPC/inlineasm-vsx-reg.ll
@@ -12,6 +12,21 @@ entry:
 ; CHECK: #NO_APP
 }
 
+define signext i32 @foo1(<4 x float> %__A) {
+entry:
+  %0 = tail call { i32, <4 x float> } asm "xxsldwi ${1:x},${2:x},${2:x},3;\0Axscvspdp ${1:x},${1:x};\0Afctiw  $1,$1;\0Amfvsrd  $0,${1:x};\0A", "=r,=&^wi,^wa"(<4 x float> %__A)
+  %asmresult = extractvalue { i32, <4 x float> } %0, 0
+  ret i32 %asmresult
+
+; CHECK: #APP
+; CHECK: xxsldwi vs0, v2, v2, 3
+; CHECK: xscvspdp f0, f0
+; CEHCK: fctiw f0, f0
+; CHECK: mffprd r3, f0
+; CEHCK: extsw r3, r3
+; CHECK: #NO_APP
+}
+
 define double @test() {
   entry:
     %0 = tail call double asm "mtvsrd ${0:x}, 1", "=^ws,~{f0},~{f1},~{f2},~{f3},~{f4},~{f5},~{f6},~{f7},~{f8},~{f9},~{f10},~{f11},~{f12},~{f13},~{f14}"()
diff --git a/test/CodeGen/PowerPC/vec-asm-disabled.ll b/test/CodeGen/PowerPC/vec-asm-disabled.ll
index 333ccce6b89..614f3e3f03a 100644
--- a/test/CodeGen/PowerPC/vec-asm-disabled.ll
+++ b/test/CodeGen/PowerPC/vec-asm-disabled.ll
@@ -10,5 +10,14 @@ entry:
 ; CHECK: error: couldn't allocate output register for constraint 'wd'
 }
 
+define signext i32 @testi2(<4 x float> %__A) #0 {
+entry:
+  %0 = tail call { i32, <4 x float> } asm "xxsldwi ${1:x},${2:x},${2:x},3", "=^wi,=&^wi,^wi"(<4 x float> %__A) #0
+  %asmresult = extractvalue { i32, <4 x float> } %0, 0
+  ret i32 %asmresult
+
+; CHECK: error: couldn't allocate output register for constraint 'wi'
+}
+
 attributes #0 = { nounwind "target-features"="-vsx" }
 
-- 
GitLab


From fd25d2bf307043f0f4a3b8fa13a9380512d7deb7 Mon Sep 17 00:00:00 2001
From: Zachary Turner <zturner@google.com>
Date: Thu, 1 Nov 2018 04:02:41 +0000
Subject: [PATCH 0843/1116] [CodeView] Emit the correct TypeIndex for
 std::nullptr_t.

The TypeIndex used by cl.exe is 0x103, which indicates a SimpleTypeMode
of NearPointer (note the absence of the bitness, normally pointers use a
mode of NearPointer32 or NearPointer64) and a SimpleTypeKind of void.
So this is basically a void*, but without a specified size, which makes
sense given how std::nullptr_t is defined.

clang-cl was actually not emitting *anything* for this. Instead, when we
encountered std::nullptr_t in a DIType, we would actually just emit a
TypeIndex of 0, which is obviously wrong.

std::nullptr_t in DWARF is represented as a DW_TAG_unspecified_type with
a name of "decltype(nullptr)", so we add that logic along with a test,
as well as an update to the dumping code so that we no longer print
void* when dumping 0x103 (which would previously treat Void/NearPointer
no differently than Void/NearPointer64).

Differential Revision: https://reviews.llvm.org/D53957

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345811 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/DebugInfo/CodeView/TypeIndex.h |  7 ++++
 lib/CodeGen/AsmPrinter/CodeViewDebug.cpp    |  2 +
 lib/DebugInfo/CodeView/TypeIndex.cpp        |  3 ++
 test/DebugInfo/COFF/types-std-nullptr-t.ll  | 42 +++++++++++++++++++++
 4 files changed, 54 insertions(+)
 create mode 100644 test/DebugInfo/COFF/types-std-nullptr-t.ll

diff --git a/include/llvm/DebugInfo/CodeView/TypeIndex.h b/include/llvm/DebugInfo/CodeView/TypeIndex.h
index 681b5f3aca9..58463a6b13d 100644
--- a/include/llvm/DebugInfo/CodeView/TypeIndex.h
+++ b/include/llvm/DebugInfo/CodeView/TypeIndex.h
@@ -145,6 +145,13 @@ public:
     return TypeIndex(SimpleTypeKind::Void, SimpleTypeMode::NearPointer64);
   }
 
+  static TypeIndex NullptrT() {
+    // std::nullptr_t uses the pointer mode that doesn't indicate bit-width,
+    // presumably because std::nullptr_t is intended to be compatible with any
+    // pointer type.
+    return TypeIndex(SimpleTypeKind::Void, SimpleTypeMode::NearPointer);
+  }
+
   static TypeIndex SignedCharacter() {
     return TypeIndex(SimpleTypeKind::SignedCharacter);
   }
diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
index 42259d4a62d..9b2b3477be7 100644
--- a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
@@ -1516,6 +1516,8 @@ TypeIndex CodeViewDebug::lowerType(const DIType *Ty, const DIType *ClassTy) {
   case dwarf::DW_TAG_union_type:
     return lowerTypeUnion(cast<DICompositeType>(Ty));
   case dwarf::DW_TAG_unspecified_type:
+    if (Ty->getName() == "decltype(nullptr)")
+      return TypeIndex::NullptrT();
     return TypeIndex::None();
   default:
     // Use the null type index.
diff --git a/lib/DebugInfo/CodeView/TypeIndex.cpp b/lib/DebugInfo/CodeView/TypeIndex.cpp
index 24fe5fcb28d..332d67470da 100644
--- a/lib/DebugInfo/CodeView/TypeIndex.cpp
+++ b/lib/DebugInfo/CodeView/TypeIndex.cpp
@@ -74,6 +74,9 @@ StringRef TypeIndex::simpleTypeName(TypeIndex TI) {
   if (TI.isNoneType())
     return "<no type>";
 
+  if (TI == TypeIndex::NullptrT())
+    return "std::nullptr_t";
+
   // This is a simple type.
   for (const auto &SimpleTypeName : SimpleTypeNames) {
     if (SimpleTypeName.Kind == TI.getSimpleKind()) {
diff --git a/test/DebugInfo/COFF/types-std-nullptr-t.ll b/test/DebugInfo/COFF/types-std-nullptr-t.ll
new file mode 100644
index 00000000000..4d64a67860d
--- /dev/null
+++ b/test/DebugInfo/COFF/types-std-nullptr-t.ll
@@ -0,0 +1,42 @@
+; RUN: llc < %s -filetype=obj | llvm-readobj - -codeview | FileCheck %s
+
+; C++ source to regenerate:
+; $ cat foo.cpp
+; decltype(nullptr) NullPtr = nullptr;
+; $ clang hello.cpp -S -emit-llvm -g -gcodeview -o t.ll
+
+; CHECK: CodeViewDebugInfo [
+; CHECK:   Subsection [
+; CHECK:     SubSectionType: Symbols (0xF1)
+; CHECK:     GlobalData {
+; CHECK:       Kind: S_GDATA32 (0x110D)
+; CHECK:       DataOffset: ?NullPtr@@3$$TA+0x0
+; CHECK:       Type: std::nullptr_t (0x103)
+; CHECK:       DisplayName: NullPtr
+; CHECK:       LinkageName: ?NullPtr@@3$$TA
+; CHECK:     }
+
+
+; ModuleID = 'foo.cpp'
+source_filename = "foo.cpp"
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc19.15.26730"
+
+@"?NullPtr@@3$$TA" = dso_local global i8* null, align 8, !dbg !0
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!7, !8, !9, !10}
+!llvm.ident = !{!11}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "NullPtr", linkageName: "?NullPtr@@3$$TA", scope: !2, file: !3, line: 1, type: !6, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !3, producer: "clang version 8.0.0 ", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5, nameTableKind: None)
+!3 = !DIFile(filename: "foo.cpp", directory: "D:\5Csrc\5Cllvmbuild\5Ccl\5CDebug\5Cx64", checksumkind: CSK_MD5, checksum: "0d5c7c9860a17e584808c03a24a135e6")
+!4 = !{}
+!5 = !{!0}
+!6 = !DIBasicType(tag: DW_TAG_unspecified_type, name: "decltype(nullptr)")
+!7 = !{i32 2, !"CodeView", i32 1}
+!8 = !{i32 2, !"Debug Info Version", i32 3}
+!9 = !{i32 1, !"wchar_size", i32 2}
+!10 = !{i32 7, !"PIC Level", i32 2}
+!11 = !{!"clang version 8.0.0 "}
-- 
GitLab


From 27ade8ff0f1054f9bfb005ae038cc0f992217c5b Mon Sep 17 00:00:00 2001
From: Max Kazantsev <max.kazantsev@azul.com>
Date: Thu, 1 Nov 2018 06:18:27 +0000
Subject: [PATCH 0844/1116] [SCEV] Avoid redundant computations when doing
 AddRec merge

When we calculate a product of 2 AddRecs, we end up making quite massive
computations to deduce the operands of resulting AddRec. This process can
be optimized by computing all args of intermediate sum and then calling
`getAddExpr` once rather than calling `getAddExpr` with intermediate
result every time a new argument is computed.

Differential Revision: https://reviews.llvm.org/D53189
Reviewed By: rtereshin


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345813 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Analysis/ScalarEvolution.cpp                    | 11 ++++++-----
 test/Analysis/ScalarEvolution/binomial-explision.ll |  2 +-
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index 8fe500f150b..77f2467d72d 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -3060,7 +3060,7 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
       SmallVector<const SCEV*, 7> AddRecOps;
       for (int x = 0, xe = AddRec->getNumOperands() +
              OtherAddRec->getNumOperands() - 1; x != xe && !Overflow; ++x) {
-        const SCEV *Term = getZero(Ty);
+        SmallVector <const SCEV *, 7> SumOps;
         for (int y = x, ye = 2*x+1; y != ye && !Overflow; ++y) {
           uint64_t Coeff1 = Choose(x, 2*x - y, Overflow);
           for (int z = std::max(y-x, y-(int)AddRec->getNumOperands()+1),
@@ -3075,12 +3075,13 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
             const SCEV *CoeffTerm = getConstant(Ty, Coeff);
             const SCEV *Term1 = AddRec->getOperand(y-z);
             const SCEV *Term2 = OtherAddRec->getOperand(z);
-            Term = getAddExpr(Term, getMulExpr(CoeffTerm, Term1, Term2,
-                                               SCEV::FlagAnyWrap, Depth + 1),
-                              SCEV::FlagAnyWrap, Depth + 1);
+            SumOps.push_back(getMulExpr(CoeffTerm, Term1, Term2,
+                                        SCEV::FlagAnyWrap, Depth + 1));
           }
         }
-        AddRecOps.push_back(Term);
+        if (SumOps.empty())
+          SumOps.push_back(getZero(Ty));
+        AddRecOps.push_back(getAddExpr(SumOps, SCEV::FlagAnyWrap, Depth + 1));
       }
       if (!Overflow) {
         const SCEV *NewAddRec = getAddRecExpr(AddRecOps, AddRec->getLoop(),
diff --git a/test/Analysis/ScalarEvolution/binomial-explision.ll b/test/Analysis/ScalarEvolution/binomial-explision.ll
index 82d0beda6b5..ff27bfcbd76 100644
--- a/test/Analysis/ScalarEvolution/binomial-explision.ll
+++ b/test/Analysis/ScalarEvolution/binomial-explision.ll
@@ -8,7 +8,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
 define void @test(i32 %x, i64 %y, i1 %cond) {
 
 ; CHECK: %tmp19 = mul i32 %tmp17, %tmp18
-; CHECK: ((((
+; CHECK: ((((((
 ; CHECK-NOT: (((((
 ; CHECK: %tmp20 = add i32 %tmp19, %x
 
-- 
GitLab


From 2f425e9c7946b9d74e64ebbfa33c1caa36914402 Mon Sep 17 00:00:00 2001
From: Max Kazantsev <max.kazantsev@azul.com>
Date: Thu, 1 Nov 2018 06:47:01 +0000
Subject: [PATCH 0845/1116] [IndVars] Smart hard uses detection

When rewriting loop exit values, IndVars considers this transform not profitable if
the loop instruction has a loop user which it believes cannot be optimized away.
In current implementation only calls that immediately use the instruction are considered
as such.

This patch extends the definition of "hard" users to any side-effecting instructions
(which usually cannot be optimized away from the loop) and also allows handling
of not just immediate users, but use chains.

Differentlai Revision: https://reviews.llvm.org/D51584
Reviewed By: etherzhhb


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345814 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/IndVarSimplify.cpp      | 39 +++++++++-----
 test/Analysis/ScalarEvolution/pr28705.ll      |  6 +--
 .../IndVarSimplify/dont-recompute.ll          | 51 +++++++++++++++++++
 .../IndVarSimplify/lrev-existing-umin.ll      | 38 ++++++++++++++
 4 files changed, 118 insertions(+), 16 deletions(-)

diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index ec51ad71abc..3e4e0f46ca3 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -145,6 +145,7 @@ class IndVarSimplify {
   bool canLoopBeDeleted(Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet);
   bool rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter);
   bool rewriteFirstIterationLoopExitValues(Loop *L);
+  bool hasHardUserWithinLoop(const Loop *L, const Instruction *I) const;
 
   bool linearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount,
                                  PHINode *IndVar, SCEVExpander &Rewriter);
@@ -524,6 +525,29 @@ struct RewritePhi {
 // As a side effect, reduces the amount of IV processing within the loop.
 //===----------------------------------------------------------------------===//
 
+bool IndVarSimplify::hasHardUserWithinLoop(const Loop *L, const Instruction *I) const {
+  SmallPtrSet<const Instruction *, 8> Visited;
+  SmallVector<const Instruction *, 8> WorkList;
+  Visited.insert(I);
+  WorkList.push_back(I);
+  while (!WorkList.empty()) {
+    const Instruction *Curr = WorkList.pop_back_val();
+    // This use is outside the loop, nothing to do.
+    if (!L->contains(Curr))
+      continue;
+    // Do we assume it is a "hard" use which will not be eliminated easily?
+    if (Curr->mayHaveSideEffects())
+      return true;
+    // Otherwise, add all its users to worklist.
+    for (auto U : Curr->users()) {
+      auto *UI = cast<Instruction>(U);
+      if (Visited.insert(UI).second)
+        WorkList.push_back(UI);
+    }
+  }
+  return false;
+}
+
 /// Check to see if this loop has a computable loop-invariant execution count.
 /// If so, this means that we can compute the final value of any expressions
 /// that are recurrent in the loop, and substitute the exit values from the loop
@@ -598,19 +622,8 @@ bool IndVarSimplify::rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {
         // Computing the value outside of the loop brings no benefit if it is
         // definitely used inside the loop in a way which can not be optimized
         // away.
-        if (ExitValue->getSCEVType()>=scMulExpr) {
-          bool HasHardInternalUses = false;
-          for (auto *IB : Inst->users()) {
-            Instruction *UseInstr = cast<Instruction>(IB);
-            unsigned Opc = UseInstr->getOpcode();
-            if (L->contains(UseInstr) && Opc == Instruction::Call) {
-              HasHardInternalUses = true;
-              break;
-            }
-          }
-          if (HasHardInternalUses)
-            continue;
-        }
+        if (hasHardUserWithinLoop(L, Inst))
+          continue;
 
         bool HighCost = Rewriter.isHighCostExpansion(ExitValue, L, Inst);
         Value *ExitVal = Rewriter.expandCodeFor(ExitValue, PN->getType(), Inst);
diff --git a/test/Analysis/ScalarEvolution/pr28705.ll b/test/Analysis/ScalarEvolution/pr28705.ll
index 8fbc08e3ca6..9a8487a6c66 100644
--- a/test/Analysis/ScalarEvolution/pr28705.ll
+++ b/test/Analysis/ScalarEvolution/pr28705.ll
@@ -1,11 +1,11 @@
 ; PR28705
 ; RUN: opt < %s -indvars -S | FileCheck %s
 
-; Check IndVarSimplify replaces the exitval use of the induction var "%inc.i.i"
-; with "%.sroa.speculated + 1".
+; Check IndVarSimplify doesn't replace external use of the induction var
+; "%inc.i.i" with "%.sroa.speculated + 1" because it is not profitable.
 ;
 ; CHECK-LABEL: @foo(
-; CHECK: %[[EXIT:.+]] = sub i32 %.sroa.speculated, -1
+; CHECK: %[[EXIT:.+]] = phi i32 [ %inc.i.i, %for.body650 ]
 ; CHECK: %DB.sroa.9.0.lcssa = phi i32 [ 1, %entry ], [ %[[EXIT]], %loopexit ]
 ;
 define void @foo(i32 %sub.ptr.div.i, i8* %ref.i1174) local_unnamed_addr {
diff --git a/test/Transforms/IndVarSimplify/dont-recompute.ll b/test/Transforms/IndVarSimplify/dont-recompute.ll
index c87cd6596c6..22087710a9c 100644
--- a/test/Transforms/IndVarSimplify/dont-recompute.ll
+++ b/test/Transforms/IndVarSimplify/dont-recompute.ll
@@ -123,3 +123,54 @@ for.end:                                          ; preds = %for.body
   tail call void @func(i32 %soft_use)
   ret void
 }
+
+; CHECK-LABEL: @test5(
+define void @test5(i32 %m) nounwind uwtable {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.06 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %a.05 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %add = add i32 %a.05, %m
+  %soft_use = add i32 %add, 123
+; CHECK: tail call void @func(i32 %soft_use)
+  tail call void @func(i32 %soft_use)
+  %inc = add nsw i32 %i.06, 1
+  %exitcond = icmp eq i32 %inc, 186
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+; CHECK: for.end:
+; CHECK-NOT: mul i32 %m, 186
+; CHECK:%add.lcssa = phi i32 [ %add, %for.body ]
+; CHECK-NEXT: tail call void @func(i32 %add.lcssa)
+  tail call void @func(i32 %add)
+  ret void
+}
+
+; CHECK-LABEL: @test6(
+define void @test6(i32 %m, i32* %p) nounwind uwtable {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.06 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %a.05 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %add = add i32 %a.05, %m
+  %soft_use = add i32 %add, 123
+; CHECK: store i32 %soft_use, i32* %pidx
+  %pidx = getelementptr i32, i32* %p, i32 %add
+  store i32 %soft_use, i32* %pidx
+  %inc = add nsw i32 %i.06, 1
+  %exitcond = icmp eq i32 %inc, 186
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+; CHECK: for.end:
+; CHECK-NOT: mul i32 %m, 186
+; CHECK:%add.lcssa = phi i32 [ %add, %for.body ]
+; CHECK-NEXT: tail call void @func(i32 %add.lcssa)
+  tail call void @func(i32 %add)
+  ret void
+}
diff --git a/test/Transforms/IndVarSimplify/lrev-existing-umin.ll b/test/Transforms/IndVarSimplify/lrev-existing-umin.ll
index 961c9fd944d..fff76675f17 100644
--- a/test/Transforms/IndVarSimplify/lrev-existing-umin.ll
+++ b/test/Transforms/IndVarSimplify/lrev-existing-umin.ll
@@ -1,5 +1,7 @@
 ; RUN: opt -S -indvars < %s | FileCheck %s
 
+; Do not rewrite the user outside the loop because we must keep the instruction
+; inside the loop due to store. Rewrite doesn't give us any profit.
 define void @f(i32 %length.i.88, i32 %length.i, i8* %tmp12, i32 %tmp10, i8* %tmp8) {
 ; CHECK-LABEL: @f(
 not_zero11.preheader:
@@ -22,6 +24,42 @@ not_zero11:
   %tmp23 = icmp slt i32 %tmp22, %tmp14
   br i1 %tmp23, label %not_zero11, label %main.exit.selector
 
+main.exit.selector:
+; CHECK-LABEL: main.exit.selector:
+; CHECK:   %tmp22.lcssa = phi i32 [ %tmp22, %not_zero11 ]
+; CHECK:   %tmp24 = icmp slt i32 %tmp22.lcssa, %length.
+  %tmp24 = icmp slt i32 %tmp22, %length.i
+  br i1 %tmp24, label %not_zero11.postloop, label %leave
+
+leave:
+  ret void
+
+not_zero11.postloop:
+  ret void
+}
+
+; Rewrite the user outside the loop because there is no hard users inside the loop.
+define void @f1(i32 %length.i.88, i32 %length.i, i8* %tmp12, i32 %tmp10, i8* %tmp8) {
+; CHECK-LABEL: @f1(
+not_zero11.preheader:
+  %tmp13 = icmp ugt i32 %length.i, %length.i.88
+  %tmp14 = select i1 %tmp13, i32 %length.i.88, i32 %length.i
+  %tmp15 = icmp sgt i32 %tmp14, 0
+  br i1 %tmp15, label %not_zero11, label %not_zero11.postloop
+
+not_zero11:
+  %v_1 = phi i32 [ %tmp22, %not_zero11 ], [ 0, %not_zero11.preheader ]
+  %tmp16 = zext i32 %v_1 to i64
+  %tmp17 = getelementptr inbounds i8, i8* %tmp8, i64 %tmp16
+  %tmp18 = load i8, i8* %tmp17, align 1
+  %tmp19 = zext i8 %tmp18 to i32
+  %tmp20 = or i32 %tmp19, %tmp10
+  %tmp21 = trunc i32 %tmp20 to i8
+  %addr22 = getelementptr inbounds i8, i8* %tmp12, i64 %tmp16
+  %tmp22 = add nuw nsw i32 %v_1, 1
+  %tmp23 = icmp slt i32 %tmp22, %tmp14
+  br i1 %tmp23, label %not_zero11, label %main.exit.selector
+
 main.exit.selector:
 ; CHECK-LABEL: main.exit.selector:
 ; CHECK: %tmp24 = icmp slt i32 %tmp14, %length.i
-- 
GitLab


From b7d7362e652b868e42032b8bee5e853e5f90dfe1 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulsson@linux.vnet.ibm.com>
Date: Thu, 1 Nov 2018 09:01:51 +0000
Subject: [PATCH 0846/1116] [SystemZ::TTI]  Accurate costs for i1->double
 vector conversions

This factors out a new method getBoolVecToIntConversionCost() containing the
code for vector sext/zext of i1, in order to reuse it for i1 to double vector
conversions.

Review: Ulrich Weigand
https://reviews.llvm.org/D53923

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345817 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../SystemZ/SystemZTargetTransformInfo.cpp    | 43 ++++++++++++-------
 .../SystemZ/SystemZTargetTransformInfo.h      |  2 +
 test/Analysis/CostModel/SystemZ/cmp-tofp.ll   | 43 +++++++++++++++++++
 3 files changed, 73 insertions(+), 15 deletions(-)
 create mode 100644 test/Analysis/CostModel/SystemZ/cmp-tofp.ll

diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 94db56e3738..18333bfcc11 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -635,6 +635,25 @@ static Type *getCmpOpsType(const Instruction *I, unsigned VF = 1) {
   return nullptr;
 }
 
+// Get the cost of converting a boolean vector to a vector with same width
+// and element size as Dst, plus the cost of zero extending if needed.
+unsigned SystemZTTIImpl::
+getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
+                              const Instruction *I) {
+  assert (Dst->isVectorTy());
+  unsigned VF = Dst->getVectorNumElements();
+  unsigned Cost = 0;
+  // If we know what the widths of the compared operands, get any cost of
+  // converting it to match Dst. Otherwise assume same widths.
+  Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
+  if (CmpOpTy != nullptr)
+    Cost = getVectorBitmaskConversionCost(CmpOpTy, Dst);
+  if (Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP)
+    // One 'vn' per dst vector with an immediate mask.
+    Cost += getNumVectorRegs(Dst);
+  return Cost;
+}
+
 int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                      const Instruction *I) {
   unsigned DstScalarBits = Dst->getScalarSizeInBits();
@@ -666,19 +685,8 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
 
         return (NumUnpacks * NumDstVectors) + NumSrcVectorOps;
       }
-      else if (SrcScalarBits == 1) {
-        // This should be extension of a compare i1 result.
-        // If we know what the widths of the compared operands, get the
-        // cost of converting it to Dst. Otherwise assume same widths.
-        unsigned Cost = 0;
-        Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
-        if (CmpOpTy != nullptr)
-          Cost = getVectorBitmaskConversionCost(CmpOpTy, Dst);
-        if (Opcode == Instruction::ZExt)
-          // One 'vn' per dst vector with an immediate mask.
-          Cost += NumDstVectors;
-        return Cost;
-      }
+      else if (SrcScalarBits == 1)
+        return getBoolVecToIntConversionCost(Opcode, Dst, I);
     }
 
     if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP ||
@@ -687,8 +695,13 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
       // (seems to miss on differentiating on scalar/vector types).
 
       // Only 64 bit vector conversions are natively supported.
-      if (SrcScalarBits == 64 && DstScalarBits == 64)
-        return NumDstVectors;
+      if (DstScalarBits == 64) {
+        if (SrcScalarBits == 64)
+          return NumDstVectors;
+
+        if (SrcScalarBits == 1)
+          return getBoolVecToIntConversionCost(Opcode, Dst, I) + NumDstVectors;
+      }
 
       // Return the cost of multiple scalar invocation plus the cost of
       // inserting and extracting the values. Base implementation does not
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index 406f075c8a6..dd85c4ea541 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -80,6 +80,8 @@ public:
   int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
   unsigned getVectorTruncCost(Type *SrcTy, Type *DstTy);
   unsigned getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy);
+  unsigned getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
+                                         const Instruction *I);
   int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                        const Instruction *I = nullptr);
   int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
diff --git a/test/Analysis/CostModel/SystemZ/cmp-tofp.ll b/test/Analysis/CostModel/SystemZ/cmp-tofp.ll
new file mode 100644
index 00000000000..f50e3ea23cf
--- /dev/null
+++ b/test/Analysis/CostModel/SystemZ/cmp-tofp.ll
@@ -0,0 +1,43 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+;
+; Costs for conversion of i1 vectors to vectors of double.
+
+define <2 x double> @fun0(<2 x i8> %val1, <2 x i8> %val2) {
+  %cmp = icmp eq <2 x i8> %val1, %val2
+  %v = uitofp <2 x i1> %cmp to <2 x double>
+  ret <2 x double> %v
+
+; CHECK: fun0
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i8> %val1, %val2
+; CHECK: cost of 5 for instruction:   %v = uitofp <2 x i1> %cmp to <2 x double>
+}
+
+define <2 x double> @fun1(<2 x i8> %val1, <2 x i8> %val2) {
+  %cmp = icmp eq <2 x i8> %val1, %val2
+  %v = sitofp <2 x i1> %cmp to <2 x double>
+  ret <2 x double> %v
+
+; CHECK: fun1
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i8> %val1, %val2
+; CHECK: cost of 4 for instruction:   %v = sitofp <2 x i1> %cmp to <2 x double>
+}
+
+define <2 x double> @fun2(<2 x i64> %val1, <2 x i64> %val2) {
+  %cmp = icmp eq <2 x i64> %val1, %val2
+  %v = uitofp <2 x i1> %cmp to <2 x double>
+  ret <2 x double> %v
+
+; CHECK: fun2
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i64> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = uitofp <2 x i1> %cmp to <2 x double>
+}
+
+define <2 x double> @fun3(<2 x i64> %val1, <2 x i64> %val2) {
+  %cmp = icmp eq <2 x i64> %val1, %val2
+  %v = sitofp <2 x i1> %cmp to <2 x double>
+  ret <2 x double> %v
+
+; CHECK: fun3
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i64> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = sitofp <2 x i1> %cmp to <2 x double>
+}
-- 
GitLab


From e46f5df5597cf319f406630da05da1146aa447e5 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulsson@linux.vnet.ibm.com>
Date: Thu, 1 Nov 2018 09:05:32 +0000
Subject: [PATCH 0847/1116] [SystemZ::TTI]  Recognize the higher cost of scalar
 i1 -> fp conversion

Scalar i1 to fp conversions are done with a branch sequence, so it should
have a higher cost.

Review: Ulrich Weigand
https://reviews.llvm.org/D53924

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345818 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../SystemZ/SystemZTargetTransformInfo.cpp    |  4 +++-
 .../CostModel/SystemZ/cmp-tofp-scalar.ll      | 23 +++++++++++++++++++
 2 files changed, 26 insertions(+), 1 deletion(-)
 create mode 100644 test/Analysis/CostModel/SystemZ/cmp-tofp-scalar.ll

diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 18333bfcc11..e7052e2e469 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -749,7 +749,9 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     assert (!Dst->isVectorTy());
 
     if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP)
-      return (SrcScalarBits >= 32 ? 1 : 2 /*i8/i16 extend*/);
+      return (SrcScalarBits >= 32
+                ? 1
+                : SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/);
 
     if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
         Src->isIntegerTy(1)) {
diff --git a/test/Analysis/CostModel/SystemZ/cmp-tofp-scalar.ll b/test/Analysis/CostModel/SystemZ/cmp-tofp-scalar.ll
new file mode 100644
index 00000000000..6cd4ead76a5
--- /dev/null
+++ b/test/Analysis/CostModel/SystemZ/cmp-tofp-scalar.ll
@@ -0,0 +1,23 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+;
+; Costs for conversion of i1 to fp.
+
+define float @fun0(i64 %val1, i64 %val2) {
+  %cmp = icmp eq i64 %val1, %val2
+  %v = uitofp i1 %cmp to float
+  ret float %v
+
+; CHECK: fun0
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq i64 %val1, %val2
+; CHECK: cost of 5 for instruction:   %v = uitofp i1 %cmp to float
+}
+
+define double @fun1(i64 %val1, i64 %val2) {
+  %cmp = icmp eq i64 %val1, %val2
+  %v = uitofp i1 %cmp to double
+  ret double %v
+
+; CHECK: fun1
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq i64 %val1, %val2
+; CHECK: cost of 5 for instruction:   %v = uitofp i1 %cmp to double
+}
-- 
GitLab


From 1edc3c60f395a751e385c84853fe97df44ab6a0e Mon Sep 17 00:00:00 2001
From: Max Kazantsev <max.kazantsev@azul.com>
Date: Thu, 1 Nov 2018 09:42:50 +0000
Subject: [PATCH 0848/1116] [NFC] Reorganize code to prepare it for more
 transforms

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345820 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/LoopSimplifyCFG.cpp | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
index ed37fc8825d..6cac3787311 100644
--- a/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
+++ b/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
@@ -41,8 +41,8 @@ using namespace llvm;
 
 #define DEBUG_TYPE "loop-simplifycfg"
 
-static bool simplifyLoopCFG(Loop &L, DominatorTree &DT, LoopInfo &LI,
-                            ScalarEvolution &SE, MemorySSAUpdater *MSSAU) {
+static bool mergeBlocksIntoPredecessors(Loop &L, DominatorTree &DT,
+                                        LoopInfo &LI, MemorySSAUpdater *MSSAU) {
   bool Changed = false;
   DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
   // Copy blocks into a temporary array to avoid iterator invalidation issues
@@ -63,14 +63,25 @@ static bool simplifyLoopCFG(Loop &L, DominatorTree &DT, LoopInfo &LI,
     // Merge Succ into Pred and delete it.
     MergeBlockIntoPredecessor(Succ, &DTU, &LI, MSSAU);
 
-    SE.forgetTopmostLoop(&L);
-
     Changed = true;
   }
 
   return Changed;
 }
 
+static bool simplifyLoopCFG(Loop &L, DominatorTree &DT, LoopInfo &LI,
+                            ScalarEvolution &SE, MemorySSAUpdater *MSSAU) {
+  bool Changed = false;
+
+  // Eliminate unconditional branches by merging blocks into their predecessors.
+  Changed |= mergeBlocksIntoPredecessors(L, DT, LI, MSSAU);
+
+  if (Changed)
+    SE.forgetTopmostLoop(&L);
+
+  return Changed;
+}
+
 PreservedAnalyses LoopSimplifyCFGPass::run(Loop &L, LoopAnalysisManager &AM,
                                            LoopStandardAnalysisResults &AR,
                                            LPMUpdater &) {
-- 
GitLab


From eae41c39e729844adeb32ff31d6e8ca403b3b878 Mon Sep 17 00:00:00 2001
From: Stefan Maksimovic <stefan.maksimovic@mips.com>
Date: Thu, 1 Nov 2018 10:10:42 +0000
Subject: [PATCH 0849/1116] [Mips] Conditionally remove successor block

In MipsBranchExpansion::splitMBB, upon splitting
a block with two direct branches, remove the successor
of the newly created block (which inherits successors from
the original block) which is pointed to by the last
branch in the original block only if the targets of two
branches differ.

This is to fix the failing test when ran with
-verify-machineinstrs enabled.

Differential Revision: https://reviews.llvm.org/D53756


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345821 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/Mips/MipsBranchExpansion.cpp | 3 ++-
 test/CodeGen/Mips/micromips-mtc-mfc.ll  | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/lib/Target/Mips/MipsBranchExpansion.cpp b/lib/Target/Mips/MipsBranchExpansion.cpp
index f316e308be7..a8aca905965 100644
--- a/lib/Target/Mips/MipsBranchExpansion.cpp
+++ b/lib/Target/Mips/MipsBranchExpansion.cpp
@@ -271,7 +271,8 @@ void MipsBranchExpansion::splitMBB(MachineBasicBlock *MBB) {
   // Insert NewMBB and fix control flow.
   MachineBasicBlock *Tgt = getTargetMBB(*FirstBr);
   NewMBB->transferSuccessors(MBB);
-  NewMBB->removeSuccessor(Tgt, true);
+  if (Tgt != getTargetMBB(*LastBr))
+    NewMBB->removeSuccessor(Tgt, true);
   MBB->addSuccessor(NewMBB);
   MBB->addSuccessor(Tgt);
   MFp->insert(std::next(MachineFunction::iterator(MBB)), NewMBB);
diff --git a/test/CodeGen/Mips/micromips-mtc-mfc.ll b/test/CodeGen/Mips/micromips-mtc-mfc.ll
index 1db9337a982..c60b0067522 100644
--- a/test/CodeGen/Mips/micromips-mtc-mfc.ll
+++ b/test/CodeGen/Mips/micromips-mtc-mfc.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=mips -mcpu=mips32r2 -mattr=+micromips \
+; RUN: llc -mtriple=mips -mcpu=mips32r2 -mattr=+micromips -verify-machineinstrs \
 ; RUN:     -show-mc-encoding < %s | FileCheck --check-prefix=MM2 %s
-; RUN: llc -mtriple=mips -mcpu=mips32r6 -mattr=+micromips \
+; RUN: llc -mtriple=mips -mcpu=mips32r6 -mattr=+micromips -verify-machineinstrs \
 ; RUN:     -show-mc-encoding < %s | FileCheck --check-prefix=MM6 %s
 
 define double @foo(double %a, double %b) {
-- 
GitLab


From 477ccd4e50edcf297765c7bbb6464abd3533f150 Mon Sep 17 00:00:00 2001
From: Max Kazantsev <max.kazantsev@azul.com>
Date: Thu, 1 Nov 2018 10:16:06 +0000
Subject: [PATCH 0850/1116] [NFC] Specialize public API of ICFLoopSafetyInfo
 for insertions and removals

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345822 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/MustExecute.h | 18 ++++++++++++------
 lib/Analysis/MustExecute.cpp        |  8 +++++++-
 2 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/include/llvm/Analysis/MustExecute.h b/include/llvm/Analysis/MustExecute.h
index 62d9b056e88..c4005a9af95 100644
--- a/include/llvm/Analysis/MustExecute.h
+++ b/include/llvm/Analysis/MustExecute.h
@@ -125,8 +125,9 @@ public:
 
 /// This implementation of LoopSafetyInfo use ImplicitControlFlowTracking to
 /// give precise answers on "may throw" queries. This implementation uses cache
-/// that should be invalidated by calling the method dropCachedInfo whenever we
-/// modify a basic block's contents by adding or removing instructions.
+/// that should be invalidated by calling the methods insertInstructionTo and
+/// removeInstruction whenever we modify a basic block's contents by adding or
+/// removing instructions.
 class ICFLoopSafetyInfo: public LoopSafetyInfo {
   bool MayThrow = false;       // The current loop contains an instruction which
                                // may throw.
@@ -144,10 +145,15 @@ public:
                                      const DominatorTree *DT,
                                      const Loop *CurLoop) const;
 
-  /// Drops cached information regarding the implicit control flow in block
-  /// \p BB. It should be called for every block in which we add or remove any
-  /// instructions  to a block before we make queries to it.
-  void dropCachedInfo(const BasicBlock *BB);
+  /// Inform the safety info that we are planning to insert a new instruction
+  /// into the basic block \p BB. It will make all cache updates to keep it
+  /// correct after this insertion.
+  void insertInstructionTo(const BasicBlock *BB);
+
+  /// Inform safety info that we are planning to remove the instruction \p Inst
+  /// from its block. It will make all cache updates to keep it correct after
+  /// this removal.
+  void removeInstruction(const Instruction *Inst);
 
   ICFLoopSafetyInfo(DominatorTree *DT) : LoopSafetyInfo(), ICF(DT) {};
 
diff --git a/lib/Analysis/MustExecute.cpp b/lib/Analysis/MustExecute.cpp
index 64ee2a7e5b0..7507aebb527 100644
--- a/lib/Analysis/MustExecute.cpp
+++ b/lib/Analysis/MustExecute.cpp
@@ -82,10 +82,16 @@ void ICFLoopSafetyInfo::computeLoopSafetyInfo(const Loop *CurLoop) {
   computeBlockColors(CurLoop);
 }
 
-void ICFLoopSafetyInfo::dropCachedInfo(const BasicBlock *BB) {
+void ICFLoopSafetyInfo::insertInstructionTo(const BasicBlock *BB) {
   ICF.invalidateBlock(BB);
 }
 
+void ICFLoopSafetyInfo::removeInstruction(const Instruction *Inst) {
+  // TODO: So far we just conservatively drop cache, but maybe we can not do it
+  // when Inst is not an ICF instruction. Follow-up on that.
+  ICF.invalidateBlock(Inst->getParent());
+}
+
 void LoopSafetyInfo::computeBlockColors(const Loop *CurLoop) {
   // Compute funclet colors if we might sink/hoist in a function with a funclet
   // personality routine.
-- 
GitLab


From 04cce6e8dc7596c4bfab17543f8ec0c619c1d73a Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 1 Nov 2018 11:52:09 +0000
Subject: [PATCH 0851/1116] [X86][SSE] Move 2-input limit up from
 getFauxShuffleMask to resolveTargetShuffleInputs (reapplied)

Reapplying an updated version of rL345395 (reverted in rL345451), now the issues noticed in PR39483 have been fixed.

This patch allows resolveTargetShuffleInputs to remove UNDEF inputs from cases where we have more than 2 inputs.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345824 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp            |  9 +++-----
 .../X86/avx512-shuffles/partial_permute.ll    | 21 ++++++++++---------
 2 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 9acae2cab22..498a8e8178a 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -6325,9 +6325,6 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
     if (!resolveTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG) ||
         !resolveTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG))
       return false;
-    // TODO - Add support for more than 2 inputs.
-    if ((SrcInputs0.size() + SrcInputs1.size()) > 2)
-      return false;
     int MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
     SmallVector<int, 64> Mask0, Mask1;
     scaleShuffleMask<int>(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
@@ -6386,8 +6383,7 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
       }
       Mask[i + InsertIdx] = M;
     }
-    // TODO - Add support for more than 1 subinput.
-    return Ops.size() <= 2;
+    return true;
   }
   case ISD::SCALAR_TO_VECTOR: {
     // Match against a scalar_to_vector of an extract from a vector,
@@ -6580,7 +6576,8 @@ static bool resolveTargetShuffleInputs(SDValue Op,
       return false;
 
   resolveTargetShuffleInputsAndMask(Inputs, Mask);
-  return true;
+  // TODO - Add support for more than 2 inputs.
+  return Inputs.size() <= 2;
 }
 
 /// Returns the scalar element that will make up the ith
diff --git a/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index d198fe87bed..0768508cca9 100644
--- a/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -4019,11 +4019,11 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask4(<8 x double> %v
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask4:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
-; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,0,1,1]
-; CHECK-NEXT:    vmovapd %ymm1, %ymm0
+; CHECK-NEXT:    vmovapd {{.*#+}} ymm4 = [1,1,5,5]
+; CHECK-NEXT:    vpermi2pd %ymm3, %ymm0, %ymm4
+; CHECK-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vcmpeqpd %ymm0, %ymm2, %k1
+; CHECK-NEXT:    vblendmpd %ymm4, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -4034,11 +4034,12 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask4(<8 x double> %v
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
-; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,1,1]
+; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vmovapd {{.*#+}} ymm2 = [1,1,5,5]
+; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %ymm4, %ymm1, %k1
+; CHECK-NEXT:    vpermi2pd %ymm3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT:    vmovapd %ymm2, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
-- 
GitLab


From 4234638f13b107784fc82d539e640a0d82b48b1f Mon Sep 17 00:00:00 2001
From: Chad Rosier <mcrosier@codeaurora.org>
Date: Thu, 1 Nov 2018 13:45:16 +0000
Subject: [PATCH 0852/1116] [AArch64] Add support for ARMv8.4 in Saphira.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345827 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td
index e3f69c7509f..9d596a1821c 100644
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@@ -487,7 +487,7 @@ def ProcSaphira  : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
                                    FeaturePredictableSelectIsExpensive,
                                    FeatureZCZeroing,
                                    FeatureLSLFast,
-                                   HasV8_3aOps]>;
+                                   HasV8_4aOps]>;
 
 def ProcThunderX2T99  : SubtargetFeature<"thunderx2t99", "ARMProcFamily",
                                          "ThunderX2T99",
-- 
GitLab


From 5c891e3a6b9cf7ffb59dd6ab46f01d4b252f716f Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <benny.kra@googlemail.com>
Date: Thu, 1 Nov 2018 13:55:59 +0000
Subject: [PATCH 0853/1116] [ADT] Clean up SparseBitVector copying and make it
 moveable

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345829 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/ADT/SparseBitVector.h    | 39 +++++++++------------------
 unittests/ADT/SparseBitVectorTest.cpp | 16 +++++++++++
 2 files changed, 29 insertions(+), 26 deletions(-)

diff --git a/include/llvm/ADT/SparseBitVector.h b/include/llvm/ADT/SparseBitVector.h
index 09a91b6614e..84e73bcbace 100644
--- a/include/llvm/ADT/SparseBitVector.h
+++ b/include/llvm/ADT/SparseBitVector.h
@@ -261,11 +261,11 @@ class SparseBitVector {
     BITWORD_SIZE = SparseBitVectorElement<ElementSize>::BITWORD_SIZE
   };
 
+  ElementList Elements;
   // Pointer to our current Element. This has no visible effect on the external
   // state of a SparseBitVector, it's just used to improve performance in the
   // common case of testing/modifying bits with similar indices.
   mutable ElementListIter CurrElementIter;
-  ElementList Elements;
 
   // This is like std::lower_bound, except we do linear searching from the
   // current position.
@@ -441,22 +441,12 @@ class SparseBitVector {
 public:
   using iterator = SparseBitVectorIterator;
 
-  SparseBitVector() {
-    CurrElementIter = Elements.begin();
-  }
+  SparseBitVector() : Elements(), CurrElementIter(Elements.begin()) {}
 
-  // SparseBitVector copy ctor.
-  SparseBitVector(const SparseBitVector &RHS) {
-    ElementListConstIter ElementIter = RHS.Elements.begin();
-    while (ElementIter != RHS.Elements.end()) {
-      Elements.push_back(SparseBitVectorElement<ElementSize>(*ElementIter));
-      ++ElementIter;
-    }
-
-    CurrElementIter = Elements.begin ();
-  }
-
-  ~SparseBitVector() = default;
+  SparseBitVector(const SparseBitVector &RHS)
+      : Elements(RHS.Elements), CurrElementIter(Elements.begin()) {}
+  SparseBitVector(SparseBitVector &&RHS)
+      : Elements(std::move(RHS.Elements)), CurrElementIter(Elements.begin()) {}
 
   // Clear.
   void clear() {
@@ -468,16 +458,13 @@ public:
     if (this == &RHS)
       return *this;
 
-    Elements.clear();
-
-    ElementListConstIter ElementIter = RHS.Elements.begin();
-    while (ElementIter != RHS.Elements.end()) {
-      Elements.push_back(SparseBitVectorElement<ElementSize>(*ElementIter));
-      ++ElementIter;
-    }
-
-    CurrElementIter = Elements.begin ();
-
+    Elements = RHS.Elements;
+    CurrElementIter = Elements.begin();
+    return *this;
+  }
+  SparseBitVector &operator=(SparseBitVector &&RHS) {
+    Elements = std::move(RHS.Elements);
+    CurrElementIter = Elements.begin();
     return *this;
   }
 
diff --git a/unittests/ADT/SparseBitVectorTest.cpp b/unittests/ADT/SparseBitVectorTest.cpp
index 097f4a0b737..7675ddac14b 100644
--- a/unittests/ADT/SparseBitVectorTest.cpp
+++ b/unittests/ADT/SparseBitVectorTest.cpp
@@ -36,6 +36,22 @@ TEST(SparseBitVectorTest, TrivialOperation) {
   const SparseBitVector<> ConstVec = Vec;
   EXPECT_TRUE(ConstVec.test(5));
   EXPECT_FALSE(ConstVec.test(17));
+
+  Vec.set(1337);
+  EXPECT_TRUE(Vec.test(1337));
+  Vec = ConstVec;
+  EXPECT_FALSE(Vec.test(1337));
+
+  Vec.set(1337);
+  EXPECT_FALSE(Vec.empty());
+  SparseBitVector<> MovedVec(std::move(Vec));
+  EXPECT_TRUE(Vec.empty());
+  EXPECT_TRUE(MovedVec.test(5));
+  EXPECT_TRUE(MovedVec.test(1337));
+
+  Vec = std::move(MovedVec);
+  EXPECT_TRUE(MovedVec.empty());
+  EXPECT_FALSE(Vec.empty());
 }
 
 TEST(SparseBitVectorTest, IntersectWith) {
-- 
GitLab


From f097f64ecbfd4eeced3a12fe5760d5304c172eb8 Mon Sep 17 00:00:00 2001
From: Aleksandar Beserminji <abeserminji@wavecomp.com>
Date: Thu, 1 Nov 2018 13:57:54 +0000
Subject: [PATCH 0854/1116] [mips][micromips] Fix JmpLink to
 TargetExternalSymbol

When matching MipsISD::JmpLink t9, TargetExternalSymbol:i32'...',
wrong JALR16_MM is selected. This patch adds missing pattern for
JmpLink, so that JAL instruction is selected.

Differential Revision: https://reviews.llvm.org/D53366


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345830 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/Mips/MicroMipsInstrInfo.td               |  2 ++
 .../Mips/micromips-target-external-symbol-reloc.ll  | 13 +++++++++++++
 2 files changed, 15 insertions(+)
 create mode 100644 test/CodeGen/Mips/micromips-target-external-symbol-reloc.ll

diff --git a/lib/Target/Mips/MicroMipsInstrInfo.td b/lib/Target/Mips/MicroMipsInstrInfo.td
index 174a05ea7af..af380a0ec71 100644
--- a/lib/Target/Mips/MicroMipsInstrInfo.td
+++ b/lib/Target/Mips/MicroMipsInstrInfo.td
@@ -1271,6 +1271,8 @@ let AddedComplexity = 40 in
 def : MipsPat<(bswap GPR32:$rt), (ROTR_MM (WSBH_MM GPR32:$rt), 16)>,
       ISA_MICROMIPS;
 
+def : MipsPat<(MipsJmpLink (i32 texternalsym:$dst)),
+              (JAL_MM texternalsym:$dst)>, ISA_MICROMIPS32_NOT_MIPS32R6;
 def : MipsPat<(MipsTailCall (iPTR tglobaladdr:$dst)),
               (TAILCALL_MM tglobaladdr:$dst)>, ISA_MICROMIPS32_NOT_MIPS32R6;
 def : MipsPat<(MipsTailCall (iPTR texternalsym:$dst)),
diff --git a/test/CodeGen/Mips/micromips-target-external-symbol-reloc.ll b/test/CodeGen/Mips/micromips-target-external-symbol-reloc.ll
new file mode 100644
index 00000000000..df592c49cc6
--- /dev/null
+++ b/test/CodeGen/Mips/micromips-target-external-symbol-reloc.ll
@@ -0,0 +1,13 @@
+; RUN: llc -mtriple=mips-mti-linux-gnu -mcpu=mips32r2 -mattr=+micromips -stop-after=expand-isel-pseudos < %s | FileCheck %s
+
+; CHECK: JAL_MM
+; CHECK-NOT: JALR16_MM
+
+define dso_local void @foo(i32* nocapture %ar) local_unnamed_addr {
+entry:
+  %0 = bitcast i32* %ar to i8*
+  tail call void @llvm.memset.p0i8.i32(i8* align 4 %0, i8 0, i32 100, i1 false)
+  ret void
+}
+
+declare void @llvm.memset.p0i8.i32(i8* nocapture writeonly, i8, i32, i1)
-- 
GitLab


From 258eac5bd6d1aa5b9c4e13c81052a0549af4575b Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Thu, 1 Nov 2018 14:03:22 +0000
Subject: [PATCH 0855/1116] [InstSimplify] add tests for icmp fold bug
 (PR39510); NFC

Verify that set intersection/subset are not confused.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345831 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Transforms/InstSimplify/icmp-abs-nabs.ll | 35 +++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/test/Transforms/InstSimplify/icmp-abs-nabs.ll b/test/Transforms/InstSimplify/icmp-abs-nabs.ll
index 1cb312bf0da..c2e24de885f 100644
--- a/test/Transforms/InstSimplify/icmp-abs-nabs.ll
+++ b/test/Transforms/InstSimplify/icmp-abs-nabs.ll
@@ -399,3 +399,38 @@ define <3 x i1> @nabs_is_not_over_0_sle_vec_splat(<3 x i33> %x) {
   ret <3 x i1> %r
 }
 
+; Negative test - intersection does not equal absolute value range.
+; PR39510 - https://bugs.llvm.org/show_bug.cgi?id=39510
+
+define i1 @abs_no_intersection(i32 %a) {
+; CHECK-LABEL: @abs_no_intersection(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[A:%.*]], 0
+; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[A]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[A]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ne i32 [[COND]], 2
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cmp = icmp slt i32 %a, 0
+  %sub = sub nsw i32 0, %a
+  %cond = select i1 %cmp, i32 %sub, i32 %a
+  %r = icmp ne i32 %cond, 2
+  ret i1 %r
+}
+
+; Negative test - intersection does not equal absolute value range.
+
+define i1 @nabs_no_intersection(i32 %a) {
+; CHECK-LABEL: @nabs_no_intersection(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[A:%.*]], 0
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 0, [[A]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[A]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ne i32 [[COND]], -2
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cmp = icmp sgt i32 %a, 0
+  %sub = sub i32 0, %a
+  %cond = select i1 %cmp, i32 %sub, i32 %a
+  %r = icmp ne i32 %cond, -2
+  ret i1 %r
+}
+
-- 
GitLab


From 148536769ac0882f42d3f1cdeac21eecab0b17be Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Thu, 1 Nov 2018 14:07:39 +0000
Subject: [PATCH 0856/1116] [InstSimplify] fold icmp based on range of abs/nabs
 (2nd try)

This is retrying the fold from rL345717
(reverted at rL347780)
...with a fix for the miscompile
demonstrated by PR39510:
https://bugs.llvm.org/show_bug.cgi?id=39510

Original commit message:

This is a fix for PR39475:
https://bugs.llvm.org/show_bug.cgi?id=39475

We managed to get some of these patterns using computeKnownBits in https://reviews.llvm.org/D47041, but that
can't be used for nabs(). Instead, put in some range-based logic, so we can fold
both abs/nabs with icmp with a constant value.

Alive proofs:
https://rise4fun.com/Alive/21r

Name: abs_nsw_is_positive

  %cmp = icmp slt i32 %x, 0
  %negx = sub nsw i32 0, %x
  %abs = select i1 %cmp, i32 %negx, i32 %x
  %r = icmp sgt i32 %abs, -1
    =>
  %r = i1 true


Name: abs_nsw_is_not_negative

  %cmp = icmp slt i32 %x, 0
  %negx = sub nsw i32 0, %x
  %abs = select i1 %cmp, i32 %negx, i32 %x
  %r = icmp slt i32 %abs, 0
    =>
  %r = i1 false


Name: nabs_is_negative_or_0

  %cmp = icmp slt i32 %x, 0
  %negx = sub i32 0, %x
  %nabs = select i1 %cmp, i32 %x, i32 %negx
  %r = icmp slt i32 %nabs, 1
    =>
  %r = i1 true

Name: nabs_is_not_over_0

  %cmp = icmp slt i32 %x, 0
  %negx = sub i32 0, %x
  %nabs = select i1 %cmp, i32 %x, i32 %negx
  %r = icmp sgt i32 %nabs, 0
    =>
  %r = i1 false

Differential Revision: https://reviews.llvm.org/D53844


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345832 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Analysis/InstructionSimplify.cpp          | 41 +++++++++
 test/Transforms/InstSimplify/icmp-abs-nabs.ll | 90 ++++---------------
 2 files changed, 56 insertions(+), 75 deletions(-)

diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp
index c4b076341fc..db929aa7059 100644
--- a/lib/Analysis/InstructionSimplify.cpp
+++ b/lib/Analysis/InstructionSimplify.cpp
@@ -2996,6 +2996,44 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS,
   return nullptr;
 }
 
+static Value *simplifyICmpWithAbsNabs(CmpInst::Predicate Pred, Value *Op0,
+                                      Value *Op1) {
+  // We need a comparison with a constant.
+  const APInt *C;
+  if (!match(Op1, m_APInt(C)))
+    return nullptr;
+
+  // matchSelectPattern returns the negation part of an abs pattern in SP1.
+  // If the negate has an NSW flag, abs(INT_MIN) is undefined. Without that
+  // constraint, we can't make a contiguous range for the result of abs.
+  ICmpInst::Predicate AbsPred = ICmpInst::BAD_ICMP_PREDICATE;
+  Value *SP0, *SP1;
+  SelectPatternFlavor SPF = matchSelectPattern(Op0, SP0, SP1).Flavor;
+  if (SPF == SelectPatternFlavor::SPF_ABS &&
+      cast<Instruction>(SP1)->hasNoSignedWrap())
+    // The result of abs(X) is >= 0 (with nsw).
+    AbsPred = ICmpInst::ICMP_SGE;
+  if (SPF == SelectPatternFlavor::SPF_NABS)
+    // The result of -abs(X) is <= 0.
+    AbsPred = ICmpInst::ICMP_SLE;
+
+  if (AbsPred == ICmpInst::BAD_ICMP_PREDICATE)
+    return nullptr;
+
+  // If there is no intersection between abs/nabs and the range of this icmp,
+  // the icmp must be false. If the abs/nabs range is a subset of the icmp
+  // range, the icmp must be true.
+  APInt Zero = APInt::getNullValue(C->getBitWidth());
+  ConstantRange AbsRange = ConstantRange::makeExactICmpRegion(AbsPred, Zero);
+  ConstantRange CmpRange = ConstantRange::makeExactICmpRegion(Pred, *C);
+  if (AbsRange.intersectWith(CmpRange).isEmptySet())
+    return getFalse(GetCompareTy(Op0));
+  if (CmpRange.contains(AbsRange))
+    return getTrue(GetCompareTy(Op0));
+
+  return nullptr;
+}
+
 /// Simplify integer comparisons where at least one operand of the compare
 /// matches an integer min/max idiom.
 static Value *simplifyICmpWithMinMax(CmpInst::Predicate Pred, Value *LHS,
@@ -3427,6 +3465,9 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
   if (Value *V = simplifyICmpWithMinMax(Pred, LHS, RHS, Q, MaxRecurse))
     return V;
 
+  if (Value *V = simplifyICmpWithAbsNabs(Pred, LHS, RHS))
+    return V;
+
   // Simplify comparisons of related pointers using a powerful, recursive
   // GEP-walk when we have target data available..
   if (LHS->getType()->isPointerTy())
diff --git a/test/Transforms/InstSimplify/icmp-abs-nabs.ll b/test/Transforms/InstSimplify/icmp-abs-nabs.ll
index c2e24de885f..41ffc33ef57 100644
--- a/test/Transforms/InstSimplify/icmp-abs-nabs.ll
+++ b/test/Transforms/InstSimplify/icmp-abs-nabs.ll
@@ -5,11 +5,7 @@
 
 define i1 @abs_nsw_is_positive(i32 %x) {
 ; CHECK-LABEL: @abs_nsw_is_positive(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[NEGX:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[ABS:%.*]] = select i1 [[CMP]], i32 [[NEGX]], i32 [[X]]
-; CHECK-NEXT:    [[R:%.*]] = icmp sgt i32 [[ABS]], -1
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    ret i1 true
 ;
   %cmp = icmp slt i32 %x, 0
   %negx = sub nsw i32 0, %x
@@ -35,11 +31,7 @@ define i1 @abs_nsw_is_positive_sge(i32 %x) {
 
 define i1 @abs_nsw_is_positive_reduced_range(i32 %x) {
 ; CHECK-LABEL: @abs_nsw_is_positive_reduced_range(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[NEGX:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[ABS:%.*]] = select i1 [[CMP]], i32 [[NEGX]], i32 [[X]]
-; CHECK-NEXT:    [[R:%.*]] = icmp sgt i32 [[ABS]], -42
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    ret i1 true
 ;
   %cmp = icmp slt i32 %x, 0
   %negx = sub nsw i32 0, %x
@@ -99,11 +91,7 @@ define i1 @abs_nsw_is_not_negative(i32 %x) {
 
 define i1 @abs_nsw_is_not_negative_sle(i32 %x) {
 ; CHECK-LABEL: @abs_nsw_is_not_negative_sle(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 1
-; CHECK-NEXT:    [[NEGX:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[ABS:%.*]] = select i1 [[CMP]], i32 [[NEGX]], i32 [[X]]
-; CHECK-NEXT:    [[R:%.*]] = icmp sle i32 [[ABS]], -1
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    ret i1 false
 ;
   %cmp = icmp slt i32 %x, 1
   %negx = sub nsw i32 0, %x
@@ -116,11 +104,7 @@ define i1 @abs_nsw_is_not_negative_sle(i32 %x) {
 
 define i1 @abs_nsw_is_not_negative_reduced_range(i32 %x) {
 ; CHECK-LABEL: @abs_nsw_is_not_negative_reduced_range(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[NEGX:%.*]] = sub nsw i32 0, [[X]]
-; CHECK-NEXT:    [[ABS:%.*]] = select i1 [[CMP]], i32 [[NEGX]], i32 [[X]]
-; CHECK-NEXT:    [[R:%.*]] = icmp slt i32 [[ABS]], -24
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    ret i1 false
 ;
   %cmp = icmp slt i32 %x, 0
   %negx = sub nsw i32 0, %x
@@ -167,11 +151,7 @@ define i1 @abs_nsw_is_not_negative_wrong_range(i32 %x) {
 
 define i1 @nabs_is_negative_or_0(i32 %x) {
 ; CHECK-LABEL: @nabs_is_negative_or_0(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[NEGX:%.*]] = sub i32 0, [[X]]
-; CHECK-NEXT:    [[NABS:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[NEGX]]
-; CHECK-NEXT:    [[R:%.*]] = icmp slt i32 [[NABS]], 1
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    ret i1 true
 ;
   %cmp = icmp slt i32 %x, 0
   %negx = sub i32 0, %x
@@ -184,11 +164,7 @@ define i1 @nabs_is_negative_or_0(i32 %x) {
 
 define i1 @nabs_is_negative_or_0_sle(i32 %x) {
 ; CHECK-LABEL: @nabs_is_negative_or_0_sle(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 1
-; CHECK-NEXT:    [[NEGX:%.*]] = sub i32 0, [[X]]
-; CHECK-NEXT:    [[NABS:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[NEGX]]
-; CHECK-NEXT:    [[R:%.*]] = icmp sle i32 [[NABS]], 0
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    ret i1 true
 ;
   %cmp = icmp slt i32 %x, 1
   %negx = sub i32 0, %x
@@ -201,11 +177,7 @@ define i1 @nabs_is_negative_or_0_sle(i32 %x) {
 
 define i1 @nabs_is_negative_or_0_reduced_range(i32 %x) {
 ; CHECK-LABEL: @nabs_is_negative_or_0_reduced_range(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 1
-; CHECK-NEXT:    [[NEGX:%.*]] = sub i32 0, [[X]]
-; CHECK-NEXT:    [[NABS:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[NEGX]]
-; CHECK-NEXT:    [[R:%.*]] = icmp slt i32 [[NABS]], 421
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    ret i1 true
 ;
   %cmp = icmp slt i32 %x, 1
   %negx = sub i32 0, %x
@@ -235,11 +207,7 @@ define i1 @nabs_is_negative_or_0_wrong_range(i32 %x) {
 
 define i1 @nabs_is_not_over_0(i32 %x) {
 ; CHECK-LABEL: @nabs_is_not_over_0(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[NEGX:%.*]] = sub i32 0, [[X]]
-; CHECK-NEXT:    [[NABS:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[NEGX]]
-; CHECK-NEXT:    [[R:%.*]] = icmp sgt i32 [[NABS]], 0
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    ret i1 false
 ;
   %cmp = icmp slt i32 %x, 0
   %negx = sub i32 0, %x
@@ -252,11 +220,7 @@ define i1 @nabs_is_not_over_0(i32 %x) {
 
 define i1 @nabs_is_not_over_0_sle(i32 %x) {
 ; CHECK-LABEL: @nabs_is_not_over_0_sle(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 1
-; CHECK-NEXT:    [[NEGX:%.*]] = sub i32 0, [[X]]
-; CHECK-NEXT:    [[NABS:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[NEGX]]
-; CHECK-NEXT:    [[R:%.*]] = icmp sge i32 [[NABS]], 1
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    ret i1 false
 ;
   %cmp = icmp slt i32 %x, 1
   %negx = sub i32 0, %x
@@ -269,11 +233,7 @@ define i1 @nabs_is_not_over_0_sle(i32 %x) {
 
 define i1 @nabs_is_not_over_0_reduced_range(i32 %x) {
 ; CHECK-LABEL: @nabs_is_not_over_0_reduced_range(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 1
-; CHECK-NEXT:    [[NEGX:%.*]] = sub i32 0, [[X]]
-; CHECK-NEXT:    [[NABS:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[NEGX]]
-; CHECK-NEXT:    [[R:%.*]] = icmp sgt i32 [[NABS]], 4223
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    ret i1 false
 ;
   %cmp = icmp slt i32 %x, 1
   %negx = sub i32 0, %x
@@ -318,11 +278,7 @@ define i1 @abs_nsw_is_positive_eq(i32 %x) {
 
 define i1 @abs_nsw_is_positive_ult(i8 %x) {
 ; CHECK-LABEL: @abs_nsw_is_positive_ult(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i8 [[X:%.*]], 0
-; CHECK-NEXT:    [[NEGX:%.*]] = sub nsw i8 0, [[X]]
-; CHECK-NEXT:    [[ABS:%.*]] = select i1 [[CMP]], i8 [[NEGX]], i8 [[X]]
-; CHECK-NEXT:    [[R:%.*]] = icmp ult i8 [[ABS]], -117
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    ret i1 true
 ;
   %cmp = icmp slt i8 %x, 0
   %negx = sub nsw i8 0, %x
@@ -335,11 +291,7 @@ define i1 @abs_nsw_is_positive_ult(i8 %x) {
 
 define i1 @abs_nsw_is_not_negative_ugt(i8 %x) {
 ; CHECK-LABEL: @abs_nsw_is_not_negative_ugt(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i8 [[X:%.*]], 0
-; CHECK-NEXT:    [[NEGX:%.*]] = sub nsw i8 0, [[X]]
-; CHECK-NEXT:    [[ABS:%.*]] = select i1 [[CMP]], i8 [[NEGX]], i8 [[X]]
-; CHECK-NEXT:    [[R:%.*]] = icmp ugt i8 [[ABS]], 127
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    ret i1 false
 ;
   %cmp = icmp slt i8 %x, 0
   %negx = sub nsw i8 0, %x
@@ -352,11 +304,7 @@ define i1 @abs_nsw_is_not_negative_ugt(i8 %x) {
 
 define <2 x i1> @abs_nsw_is_not_negative_vec_splat(<2 x i32> %x) {
 ; CHECK-LABEL: @abs_nsw_is_not_negative_vec_splat(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt <2 x i32> [[X:%.*]], zeroinitializer
-; CHECK-NEXT:    [[NEGX:%.*]] = sub nsw <2 x i32> zeroinitializer, [[X]]
-; CHECK-NEXT:    [[ABS:%.*]] = select <2 x i1> [[CMP]], <2 x i32> [[NEGX]], <2 x i32> [[X]]
-; CHECK-NEXT:    [[R:%.*]] = icmp slt <2 x i32> [[ABS]], <i32 -8, i32 -8>
-; CHECK-NEXT:    ret <2 x i1> [[R]]
+; CHECK-NEXT:    ret <2 x i1> zeroinitializer
 ;
   %cmp = icmp slt <2 x i32> %x, zeroinitializer
   %negx = sub nsw <2 x i32> zeroinitializer, %x
@@ -369,11 +317,7 @@ define <2 x i1> @abs_nsw_is_not_negative_vec_splat(<2 x i32> %x) {
 
 define i1 @nabs_is_negative_or_0_ne(i8 %x) {
 ; CHECK-LABEL: @nabs_is_negative_or_0_ne(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i8 [[X:%.*]], 0
-; CHECK-NEXT:    [[NEGX:%.*]] = sub i8 0, [[X]]
-; CHECK-NEXT:    [[NABS:%.*]] = select i1 [[CMP]], i8 [[X]], i8 [[NEGX]]
-; CHECK-NEXT:    [[R:%.*]] = icmp ne i8 [[NABS]], 12
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    ret i1 true
 ;
   %cmp = icmp slt i8 %x, 0
   %negx = sub i8 0, %x
@@ -386,11 +330,7 @@ define i1 @nabs_is_negative_or_0_ne(i8 %x) {
 
 define <3 x i1> @nabs_is_not_over_0_sle_vec_splat(<3 x i33> %x) {
 ; CHECK-LABEL: @nabs_is_not_over_0_sle_vec_splat(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt <3 x i33> [[X:%.*]], <i33 1, i33 1, i33 1>
-; CHECK-NEXT:    [[NEGX:%.*]] = sub <3 x i33> zeroinitializer, [[X]]
-; CHECK-NEXT:    [[NABS:%.*]] = select <3 x i1> [[CMP]], <3 x i33> [[X]], <3 x i33> [[NEGX]]
-; CHECK-NEXT:    [[R:%.*]] = icmp sge <3 x i33> [[NABS]], <i33 1, i33 1, i33 1>
-; CHECK-NEXT:    ret <3 x i1> [[R]]
+; CHECK-NEXT:    ret <3 x i1> zeroinitializer
 ;
   %cmp = icmp slt <3 x i33> %x, <i33 1, i33 1, i33 1>
   %negx = sub <3 x i33> zeroinitializer, %x
-- 
GitLab


From aa868d8549ed9f3a9a7692203e55b5a12e6201b2 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 1 Nov 2018 14:57:07 +0000
Subject: [PATCH 0857/1116] [X86][X86FixupLEA] Rename processInstructionForSLM
 to processInstructionForSlowLEA (NFCI)

The function isn't SLM specific (its driven by the FeatureSlowLEA flag).

Minor tidyup prior to PR38225.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345836 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86FixupLEAs.cpp | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp
index ed24d6a8547..da5f1695750 100644
--- a/lib/Target/X86/X86FixupLEAs.cpp
+++ b/lib/Target/X86/X86FixupLEAs.cpp
@@ -58,10 +58,9 @@ class FixupLEAPass : public MachineFunctionPass {
                           MachineFunction::iterator MFI);
 
   /// Given a LEA instruction which is unprofitable
-  /// on Silvermont try to replace it with an equivalent ADD instruction
-  void processInstructionForSLM(MachineBasicBlock::iterator &I,
-                                MachineFunction::iterator MFI);
-
+  /// on SlowLEA targets try to replace it with an equivalent ADD instruction.
+  void processInstructionForSlowLEA(MachineBasicBlock::iterator &I,
+                                    MachineFunction::iterator MFI);
 
   /// Given a LEA instruction which is unprofitable
   /// on SNB+ try to replace it with other instructions.
@@ -411,8 +410,8 @@ void FixupLEAPass::seekLEAFixup(MachineOperand &p,
   }
 }
 
-void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I,
-                                            MachineFunction::iterator MFI) {
+void FixupLEAPass::processInstructionForSlowLEA(MachineBasicBlock::iterator &I,
+                                                MachineFunction::iterator MFI) {
   MachineInstr &MI = *I;
   const int Opcode = MI.getOpcode();
   if (!isLEA(Opcode))
@@ -576,7 +575,7 @@ bool FixupLEAPass::processBasicBlock(MachineFunction &MF,
 
     if (OptLEA) {
       if (MF.getSubtarget<X86Subtarget>().slowLEA())
-        processInstructionForSLM(I, MFI);
+        processInstructionForSlowLEA(I, MFI);
 
       else {
         if (MF.getSubtarget<X86Subtarget>().slow3OpsLEA()) {
-- 
GitLab


From c35d5db855b6b0e723de0cbde00f34a549ff0373 Mon Sep 17 00:00:00 2001
From: Zachary Turner <zturner@google.com>
Date: Thu, 1 Nov 2018 15:07:32 +0000
Subject: [PATCH 0858/1116] [MS Demangler] Expose the Demangler AST publicly.

LLDB would like to use this in order to build a clang AST from
a mangled name.

This is NFC otherwise.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345837 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Demangle/MicrosoftDemangle.h     | 276 ++++++++++++++++++
 .../llvm}/Demangle/MicrosoftDemangleNodes.h   |  98 -------
 lib/Demangle/MicrosoftDemangle.cpp            | 167 +----------
 lib/Demangle/MicrosoftDemangleNodes.cpp       |   2 +-
 4 files changed, 289 insertions(+), 254 deletions(-)
 create mode 100644 include/llvm/Demangle/MicrosoftDemangle.h
 rename {lib => include/llvm}/Demangle/MicrosoftDemangleNodes.h (86%)

diff --git a/include/llvm/Demangle/MicrosoftDemangle.h b/include/llvm/Demangle/MicrosoftDemangle.h
new file mode 100644
index 00000000000..b186758ebe2
--- /dev/null
+++ b/include/llvm/Demangle/MicrosoftDemangle.h
@@ -0,0 +1,276 @@
+//===------------------------- MicrosoftDemangle.h --------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEMANGLE_MICROSOFT_DEMANGLE_H
+#define LLVM_DEMANGLE_MICROSOFT_DEMANGLE_H
+
+#include "llvm/Demangle/Compiler.h"
+#include "llvm/Demangle/MicrosoftDemangleNodes.h"
+#include "llvm/Demangle/StringView.h"
+#include "llvm/Demangle/Utility.h"
+
+#include <utility>
+
+namespace llvm {
+namespace ms_demangle {
+// This memory allocator is extremely fast, but it doesn't call dtors
+// for allocated objects. That means you can't use STL containers
+// (such as std::vector) with this allocator. But it pays off --
+// the demangler is 3x faster with this allocator compared to one with
+// STL containers.
+constexpr size_t AllocUnit = 4096;
+
+class ArenaAllocator {
+  struct AllocatorNode {
+    uint8_t *Buf = nullptr;
+    size_t Used = 0;
+    size_t Capacity = 0;
+    AllocatorNode *Next = nullptr;
+  };
+
+  void addNode(size_t Capacity) {
+    AllocatorNode *NewHead = new AllocatorNode;
+    NewHead->Buf = new uint8_t[Capacity];
+    NewHead->Next = Head;
+    NewHead->Capacity = Capacity;
+    Head = NewHead;
+    NewHead->Used = 0;
+  }
+
+public:
+  ArenaAllocator() { addNode(AllocUnit); }
+
+  ~ArenaAllocator() {
+    while (Head) {
+      assert(Head->Buf);
+      delete[] Head->Buf;
+      AllocatorNode *Next = Head->Next;
+      delete Head;
+      Head = Next;
+    }
+  }
+
+  char *allocUnalignedBuffer(size_t Length) {
+    uint8_t *Buf = Head->Buf + Head->Used;
+
+    Head->Used += Length;
+    if (Head->Used > Head->Capacity) {
+      // It's possible we need a buffer which is larger than our default unit
+      // size, so we need to be careful to add a node with capacity that is at
+      // least as large as what we need.
+      addNode(std::max(AllocUnit, Length));
+      Head->Used = Length;
+      Buf = Head->Buf;
+    }
+
+    return reinterpret_cast<char *>(Buf);
+  }
+
+  template <typename T, typename... Args> T *allocArray(size_t Count) {
+
+    size_t Size = Count * sizeof(T);
+    assert(Head && Head->Buf);
+
+    size_t P = (size_t)Head->Buf + Head->Used;
+    uintptr_t AlignedP =
+        (((size_t)P + alignof(T) - 1) & ~(size_t)(alignof(T) - 1));
+    uint8_t *PP = (uint8_t *)AlignedP;
+    size_t Adjustment = AlignedP - P;
+
+    Head->Used += Size + Adjustment;
+    if (Head->Used < Head->Capacity)
+      return new (PP) T[Count]();
+
+    addNode(AllocUnit);
+    Head->Used = Size;
+    return new (Head->Buf) T[Count]();
+  }
+
+  template <typename T, typename... Args> T *alloc(Args &&... ConstructorArgs) {
+
+    size_t Size = sizeof(T);
+    assert(Head && Head->Buf);
+
+    size_t P = (size_t)Head->Buf + Head->Used;
+    uintptr_t AlignedP =
+        (((size_t)P + alignof(T) - 1) & ~(size_t)(alignof(T) - 1));
+    uint8_t *PP = (uint8_t *)AlignedP;
+    size_t Adjustment = AlignedP - P;
+
+    Head->Used += Size + Adjustment;
+    if (Head->Used < Head->Capacity)
+      return new (PP) T(std::forward<Args>(ConstructorArgs)...);
+
+    addNode(AllocUnit);
+    Head->Used = Size;
+    return new (Head->Buf) T(std::forward<Args>(ConstructorArgs)...);
+  }
+
+private:
+  AllocatorNode *Head = nullptr;
+};
+
+struct BackrefContext {
+  static constexpr size_t Max = 10;
+
+  TypeNode *FunctionParams[Max];
+  size_t FunctionParamCount = 0;
+
+  // The first 10 BackReferences in a mangled name can be back-referenced by
+  // special name @[0-9]. This is a storage for the first 10 BackReferences.
+  NamedIdentifierNode *Names[Max];
+  size_t NamesCount = 0;
+};
+
+enum class QualifierMangleMode { Drop, Mangle, Result };
+
+enum NameBackrefBehavior : uint8_t {
+  NBB_None = 0,          // don't save any names as backrefs.
+  NBB_Template = 1 << 0, // save template instanations.
+  NBB_Simple = 1 << 1,   // save simple names.
+};
+
+enum class FunctionIdentifierCodeGroup { Basic, Under, DoubleUnder };
+
+// Demangler class takes the main role in demangling symbols.
+// It has a set of functions to parse mangled symbols into Type instances.
+// It also has a set of functions to convert Type instances to strings.
+class Demangler {
+public:
+  Demangler() = default;
+  virtual ~Demangler() = default;
+
+  // You are supposed to call parse() first and then check if error is true.  If
+  // it is false, call output() to write the formatted name to the given stream.
+  SymbolNode *parse(StringView &MangledName);
+
+  TagTypeNode *parseTagUniqueName(StringView &MangledName);
+
+  // True if an error occurred.
+  bool Error = false;
+
+  void dumpBackReferences();
+
+private:
+  SymbolNode *demangleEncodedSymbol(StringView &MangledName,
+                                    QualifiedNameNode *QN);
+
+  VariableSymbolNode *demangleVariableEncoding(StringView &MangledName,
+                                               StorageClass SC);
+  FunctionSymbolNode *demangleFunctionEncoding(StringView &MangledName);
+
+  Qualifiers demanglePointerExtQualifiers(StringView &MangledName);
+
+  // Parser functions. This is a recursive-descent parser.
+  TypeNode *demangleType(StringView &MangledName, QualifierMangleMode QMM);
+  PrimitiveTypeNode *demanglePrimitiveType(StringView &MangledName);
+  CustomTypeNode *demangleCustomType(StringView &MangledName);
+  TagTypeNode *demangleClassType(StringView &MangledName);
+  PointerTypeNode *demanglePointerType(StringView &MangledName);
+  PointerTypeNode *demangleMemberPointerType(StringView &MangledName);
+  FunctionSignatureNode *demangleFunctionType(StringView &MangledName,
+                                              bool HasThisQuals);
+
+  ArrayTypeNode *demangleArrayType(StringView &MangledName);
+
+  NodeArrayNode *demangleTemplateParameterList(StringView &MangledName);
+  NodeArrayNode *demangleFunctionParameterList(StringView &MangledName);
+
+  std::pair<uint64_t, bool> demangleNumber(StringView &MangledName);
+  uint64_t demangleUnsigned(StringView &MangledName);
+  int64_t demangleSigned(StringView &MangledName);
+
+  void memorizeString(StringView s);
+  void memorizeIdentifier(IdentifierNode *Identifier);
+
+  /// Allocate a copy of \p Borrowed into memory that we own.
+  StringView copyString(StringView Borrowed);
+
+  QualifiedNameNode *demangleFullyQualifiedTypeName(StringView &MangledName);
+  QualifiedNameNode *demangleFullyQualifiedSymbolName(StringView &MangledName);
+
+  IdentifierNode *demangleUnqualifiedTypeName(StringView &MangledName,
+                                              bool Memorize);
+  IdentifierNode *demangleUnqualifiedSymbolName(StringView &MangledName,
+                                                NameBackrefBehavior NBB);
+
+  QualifiedNameNode *demangleNameScopeChain(StringView &MangledName,
+                                            IdentifierNode *UnqualifiedName);
+  IdentifierNode *demangleNameScopePiece(StringView &MangledName);
+
+  NamedIdentifierNode *demangleBackRefName(StringView &MangledName);
+  IdentifierNode *demangleTemplateInstantiationName(StringView &MangledName,
+                                                    NameBackrefBehavior NBB);
+  IdentifierNode *demangleFunctionIdentifierCode(StringView &MangledName);
+  IdentifierNode *
+  demangleFunctionIdentifierCode(StringView &MangledName,
+                                 FunctionIdentifierCodeGroup Group);
+  StructorIdentifierNode *demangleStructorIdentifier(StringView &MangledName,
+                                                     bool IsDestructor);
+  ConversionOperatorIdentifierNode *
+  demangleConversionOperatorIdentifier(StringView &MangledName);
+  LiteralOperatorIdentifierNode *
+  demangleLiteralOperatorIdentifier(StringView &MangledName);
+
+  SymbolNode *demangleSpecialIntrinsic(StringView &MangledName);
+  SpecialTableSymbolNode *
+  demangleSpecialTableSymbolNode(StringView &MangledName,
+                                 SpecialIntrinsicKind SIK);
+  LocalStaticGuardVariableNode *
+  demangleLocalStaticGuard(StringView &MangledName);
+  VariableSymbolNode *demangleUntypedVariable(ArenaAllocator &Arena,
+                                              StringView &MangledName,
+                                              StringView VariableName);
+  VariableSymbolNode *
+  demangleRttiBaseClassDescriptorNode(ArenaAllocator &Arena,
+                                      StringView &MangledName);
+  FunctionSymbolNode *demangleInitFiniStub(StringView &MangledName,
+                                           bool IsDestructor);
+
+  NamedIdentifierNode *demangleSimpleName(StringView &MangledName,
+                                          bool Memorize);
+  NamedIdentifierNode *demangleAnonymousNamespaceName(StringView &MangledName);
+  NamedIdentifierNode *demangleLocallyScopedNamePiece(StringView &MangledName);
+  EncodedStringLiteralNode *demangleStringLiteral(StringView &MangledName);
+  FunctionSymbolNode *demangleVcallThunkNode(StringView &MangledName);
+
+  StringView demangleSimpleString(StringView &MangledName, bool Memorize);
+
+  FuncClass demangleFunctionClass(StringView &MangledName);
+  CallingConv demangleCallingConvention(StringView &MangledName);
+  StorageClass demangleVariableStorageClass(StringView &MangledName);
+  void demangleThrowSpecification(StringView &MangledName);
+  wchar_t demangleWcharLiteral(StringView &MangledName);
+  uint8_t demangleCharLiteral(StringView &MangledName);
+
+  std::pair<Qualifiers, bool> demangleQualifiers(StringView &MangledName);
+
+  // Memory allocator.
+  ArenaAllocator Arena;
+
+  // A single type uses one global back-ref table for all function params.
+  // This means back-refs can even go "into" other types.  Examples:
+  //
+  //  // Second int* is a back-ref to first.
+  //  void foo(int *, int*);
+  //
+  //  // Second int* is not a back-ref to first (first is not a function param).
+  //  int* foo(int*);
+  //
+  //  // Second int* is a back-ref to first (ALL function types share the same
+  //  // back-ref map.
+  //  using F = void(*)(int*);
+  //  F G(int *);
+  BackrefContext Backrefs;
+};
+
+} // namespace ms_demangle
+} // namespace llvm
+
+#endif // LLVM_DEMANGLE_MICROSOFT_DEMANGLE_H
diff --git a/lib/Demangle/MicrosoftDemangleNodes.h b/include/llvm/Demangle/MicrosoftDemangleNodes.h
similarity index 86%
rename from lib/Demangle/MicrosoftDemangleNodes.h
rename to include/llvm/Demangle/MicrosoftDemangleNodes.h
index caa7eb3b526..1d0b66a7bf4 100644
--- a/lib/Demangle/MicrosoftDemangleNodes.h
+++ b/include/llvm/Demangle/MicrosoftDemangleNodes.h
@@ -10,104 +10,6 @@ class OutputStream;
 namespace llvm {
 namespace ms_demangle {
 
-// This memory allocator is extremely fast, but it doesn't call dtors
-// for allocated objects. That means you can't use STL containers
-// (such as std::vector) with this allocator. But it pays off --
-// the demangler is 3x faster with this allocator compared to one with
-// STL containers.
-constexpr size_t AllocUnit = 4096;
-
-class ArenaAllocator {
-  struct AllocatorNode {
-    uint8_t *Buf = nullptr;
-    size_t Used = 0;
-    size_t Capacity = 0;
-    AllocatorNode *Next = nullptr;
-  };
-
-  void addNode(size_t Capacity) {
-    AllocatorNode *NewHead = new AllocatorNode;
-    NewHead->Buf = new uint8_t[Capacity];
-    NewHead->Next = Head;
-    NewHead->Capacity = Capacity;
-    Head = NewHead;
-    NewHead->Used = 0;
-  }
-
-public:
-  ArenaAllocator() { addNode(AllocUnit); }
-
-  ~ArenaAllocator() {
-    while (Head) {
-      assert(Head->Buf);
-      delete[] Head->Buf;
-      AllocatorNode *Next = Head->Next;
-      delete Head;
-      Head = Next;
-    }
-  }
-
-  char *allocUnalignedBuffer(size_t Length) {
-    uint8_t *Buf = Head->Buf + Head->Used;
-
-    Head->Used += Length;
-    if (Head->Used > Head->Capacity) {
-      // It's possible we need a buffer which is larger than our default unit
-      // size, so we need to be careful to add a node with capacity that is at
-      // least as large as what we need.
-      addNode(std::max(AllocUnit, Length));
-      Head->Used = Length;
-      Buf = Head->Buf;
-    }
-
-    return reinterpret_cast<char *>(Buf);
-  }
-
-  template <typename T, typename... Args>
-  T *allocArray(size_t Count) {
-
-    size_t Size = Count * sizeof(T);
-    assert(Head && Head->Buf);
-
-    size_t P = (size_t)Head->Buf + Head->Used;
-    uintptr_t AlignedP =
-        (((size_t)P + alignof(T) - 1) & ~(size_t)(alignof(T) - 1));
-    uint8_t *PP = (uint8_t *)AlignedP;
-    size_t Adjustment = AlignedP - P;
-
-    Head->Used += Size + Adjustment;
-    if (Head->Used < Head->Capacity)
-      return new (PP) T[Count]();
-
-    addNode(AllocUnit);
-    Head->Used = Size;
-    return new (Head->Buf) T[Count]();
-  }
-
-  template <typename T, typename... Args> T *alloc(Args &&... ConstructorArgs) {
-
-    size_t Size = sizeof(T);
-    assert(Head && Head->Buf);
-
-    size_t P = (size_t)Head->Buf + Head->Used;
-    uintptr_t AlignedP =
-        (((size_t)P + alignof(T) - 1) & ~(size_t)(alignof(T) - 1));
-    uint8_t *PP = (uint8_t *)AlignedP;
-    size_t Adjustment = AlignedP - P;
-
-    Head->Used += Size + Adjustment;
-    if (Head->Used < Head->Capacity)
-      return new (PP) T(std::forward<Args>(ConstructorArgs)...);
-
-    addNode(AllocUnit);
-    Head->Used = Size;
-    return new (Head->Buf) T(std::forward<Args>(ConstructorArgs)...);
-  }
-
-private:
-  AllocatorNode *Head = nullptr;
-};
-
 // Storage classes
 enum Qualifiers : uint8_t {
   Q_None = 0,
diff --git a/lib/Demangle/MicrosoftDemangle.cpp b/lib/Demangle/MicrosoftDemangle.cpp
index 59fb7c9ae9f..882e4a57845 100644
--- a/lib/Demangle/MicrosoftDemangle.cpp
+++ b/lib/Demangle/MicrosoftDemangle.cpp
@@ -14,8 +14,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "MicrosoftDemangleNodes.h"
+#include "llvm/Demangle/MicrosoftDemangle.h"
 #include "llvm/Demangle/Demangle.h"
+#include "llvm/Demangle/MicrosoftDemangleNodes.h"
 
 #include "llvm/Demangle/Compiler.h"
 #include "llvm/Demangle/StringView.h"
@@ -33,21 +34,12 @@ static bool startsWithDigit(StringView S) {
   return !S.empty() && std::isdigit(S.front());
 }
 
-enum class QualifierMangleMode { Drop, Mangle, Result };
 
 struct NodeList {
   Node *N = nullptr;
   NodeList *Next = nullptr;
 };
 
-enum class FunctionIdentifierCodeGroup { Basic, Under, DoubleUnder };
-
-enum NameBackrefBehavior : uint8_t {
-  NBB_None = 0,          // don't save any names as backrefs.
-  NBB_Template = 1 << 0, // save template instanations.
-  NBB_Simple = 1 << 1,   // save simple names.
-};
-
 static bool isMemberPointer(StringView MangledName) {
   switch (MangledName.popFront()) {
   case '$':
@@ -246,151 +238,6 @@ demanglePointerCVQualifiers(StringView &MangledName) {
   return std::make_pair(Q_None, PointerAffinity::Pointer);
 }
 
-namespace {
-
-struct BackrefContext {
-  static constexpr size_t Max = 10;
-
-  TypeNode *FunctionParams[Max];
-  size_t FunctionParamCount = 0;
-
-  // The first 10 BackReferences in a mangled name can be back-referenced by
-  // special name @[0-9]. This is a storage for the first 10 BackReferences.
-  NamedIdentifierNode *Names[Max];
-  size_t NamesCount = 0;
-};
-
-// Demangler class takes the main role in demangling symbols.
-// It has a set of functions to parse mangled symbols into Type instances.
-// It also has a set of functions to cnovert Type instances to strings.
-class Demangler {
-public:
-  Demangler() = default;
-  virtual ~Demangler() = default;
-
-  // You are supposed to call parse() first and then check if error is true.  If
-  // it is false, call output() to write the formatted name to the given stream.
-  SymbolNode *parse(StringView &MangledName);
-
-  // True if an error occurred.
-  bool Error = false;
-
-  void dumpBackReferences();
-
-private:
-  SymbolNode *demangleEncodedSymbol(StringView &MangledName,
-                                    QualifiedNameNode *QN);
-
-  VariableSymbolNode *demangleVariableEncoding(StringView &MangledName,
-                                               StorageClass SC);
-  FunctionSymbolNode *demangleFunctionEncoding(StringView &MangledName);
-
-  Qualifiers demanglePointerExtQualifiers(StringView &MangledName);
-
-  // Parser functions. This is a recursive-descent parser.
-  TypeNode *demangleType(StringView &MangledName, QualifierMangleMode QMM);
-  PrimitiveTypeNode *demanglePrimitiveType(StringView &MangledName);
-  CustomTypeNode *demangleCustomType(StringView &MangledName);
-  TagTypeNode *demangleClassType(StringView &MangledName);
-  PointerTypeNode *demanglePointerType(StringView &MangledName);
-  PointerTypeNode *demangleMemberPointerType(StringView &MangledName);
-  FunctionSignatureNode *demangleFunctionType(StringView &MangledName,
-                                              bool HasThisQuals);
-
-  ArrayTypeNode *demangleArrayType(StringView &MangledName);
-
-  NodeArrayNode *demangleTemplateParameterList(StringView &MangledName);
-  NodeArrayNode *demangleFunctionParameterList(StringView &MangledName);
-
-  std::pair<uint64_t, bool> demangleNumber(StringView &MangledName);
-  uint64_t demangleUnsigned(StringView &MangledName);
-  int64_t demangleSigned(StringView &MangledName);
-
-  void memorizeString(StringView s);
-  void memorizeIdentifier(IdentifierNode *Identifier);
-
-  /// Allocate a copy of \p Borrowed into memory that we own.
-  StringView copyString(StringView Borrowed);
-
-  QualifiedNameNode *demangleFullyQualifiedTypeName(StringView &MangledName);
-  QualifiedNameNode *demangleFullyQualifiedSymbolName(StringView &MangledName);
-
-  IdentifierNode *demangleUnqualifiedTypeName(StringView &MangledName,
-                                              bool Memorize);
-  IdentifierNode *demangleUnqualifiedSymbolName(StringView &MangledName,
-                                                NameBackrefBehavior NBB);
-
-  QualifiedNameNode *demangleNameScopeChain(StringView &MangledName,
-                                            IdentifierNode *UnqualifiedName);
-  IdentifierNode *demangleNameScopePiece(StringView &MangledName);
-
-  NamedIdentifierNode *demangleBackRefName(StringView &MangledName);
-  IdentifierNode *demangleTemplateInstantiationName(StringView &MangledName,
-                                                    NameBackrefBehavior NBB);
-  IdentifierNode *demangleFunctionIdentifierCode(StringView &MangledName);
-  IdentifierNode *
-  demangleFunctionIdentifierCode(StringView &MangledName,
-                                 FunctionIdentifierCodeGroup Group);
-  StructorIdentifierNode *demangleStructorIdentifier(StringView &MangledName,
-                                                     bool IsDestructor);
-  ConversionOperatorIdentifierNode *
-  demangleConversionOperatorIdentifier(StringView &MangledName);
-  LiteralOperatorIdentifierNode *
-  demangleLiteralOperatorIdentifier(StringView &MangledName);
-
-  SymbolNode *demangleSpecialIntrinsic(StringView &MangledName);
-  SpecialTableSymbolNode *
-  demangleSpecialTableSymbolNode(StringView &MangledName,
-                                 SpecialIntrinsicKind SIK);
-  LocalStaticGuardVariableNode *
-  demangleLocalStaticGuard(StringView &MangledName);
-  VariableSymbolNode *demangleUntypedVariable(ArenaAllocator &Arena,
-                                              StringView &MangledName,
-                                              StringView VariableName);
-  VariableSymbolNode *
-  demangleRttiBaseClassDescriptorNode(ArenaAllocator &Arena,
-                                      StringView &MangledName);
-  FunctionSymbolNode *demangleInitFiniStub(StringView &MangledName,
-                                           bool IsDestructor);
-
-  NamedIdentifierNode *demangleSimpleName(StringView &MangledName,
-                                          bool Memorize);
-  NamedIdentifierNode *demangleAnonymousNamespaceName(StringView &MangledName);
-  NamedIdentifierNode *demangleLocallyScopedNamePiece(StringView &MangledName);
-  EncodedStringLiteralNode *demangleStringLiteral(StringView &MangledName);
-  FunctionSymbolNode *demangleVcallThunkNode(StringView &MangledName);
-
-  StringView demangleSimpleString(StringView &MangledName, bool Memorize);
-
-  FuncClass demangleFunctionClass(StringView &MangledName);
-  CallingConv demangleCallingConvention(StringView &MangledName);
-  StorageClass demangleVariableStorageClass(StringView &MangledName);
-  void demangleThrowSpecification(StringView &MangledName);
-  wchar_t demangleWcharLiteral(StringView &MangledName);
-  uint8_t demangleCharLiteral(StringView &MangledName);
-
-  std::pair<Qualifiers, bool> demangleQualifiers(StringView &MangledName);
-
-  // Memory allocator.
-  ArenaAllocator Arena;
-
-  // A single type uses one global back-ref table for all function params.
-  // This means back-refs can even go "into" other types.  Examples:
-  //
-  //  // Second int* is a back-ref to first.
-  //  void foo(int *, int*);
-  //
-  //  // Second int* is not a back-ref to first (first is not a function param).
-  //  int* foo(int*);
-  //
-  //  // Second int* is a back-ref to first (ALL function types share the same
-  //  // back-ref map.
-  //  using F = void(*)(int*);
-  //  F G(int *);
-  BackrefContext Backrefs;
-};
-} // namespace
-
 StringView Demangler::copyString(StringView Borrowed) {
   char *Stable = Arena.allocUnalignedBuffer(Borrowed.size() + 1);
   std::strcpy(Stable, Borrowed.begin());
@@ -886,6 +733,16 @@ SymbolNode *Demangler::parse(StringView &MangledName) {
   return Symbol;
 }
 
+TagTypeNode *Demangler::parseTagUniqueName(StringView &MangledName) {
+  if (!MangledName.consumeFront(".?A"))
+    return nullptr;
+  MangledName.consumeFront(".?A");
+  if (MangledName.empty())
+    return nullptr;
+
+  return demangleClassType(MangledName);
+}
+
 // <type-encoding> ::= <storage-class> <variable-type>
 // <storage-class> ::= 0  # private static member
 //                 ::= 1  # protected static member
diff --git a/lib/Demangle/MicrosoftDemangleNodes.cpp b/lib/Demangle/MicrosoftDemangleNodes.cpp
index 93719f89342..d5ee47761bd 100644
--- a/lib/Demangle/MicrosoftDemangleNodes.cpp
+++ b/lib/Demangle/MicrosoftDemangleNodes.cpp
@@ -11,7 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "MicrosoftDemangleNodes.h"
+#include "llvm/Demangle/MicrosoftDemangleNodes.h"
 #include "llvm/Demangle/Compiler.h"
 #include "llvm/Demangle/Utility.h"
 #include <cctype>
-- 
GitLab


From b74f1454d8d25f7dad7fdf92959ac9180a39f0c8 Mon Sep 17 00:00:00 2001
From: Sam Parker <sam.parker@arm.com>
Date: Thu, 1 Nov 2018 15:23:42 +0000
Subject: [PATCH 0859/1116] [ARM][CGP] Negative constant operand handling

While mutating instructions, we sign extended negative constant
operands for binary operators that can safely overflow. This was to
allow instructions, such as add nuw i8 %a, -2, to still be able to
perform a subtraction. However, the code to handle constants doesn't
take into consideration that instructions, such as sub nuw i8 -2, %a,
require the i8 -2 to be converted into i32 254.

This is a relatively simple fix, but I've taken the time to
reorganise the code a bit - mainly that instructions that can be
promoted are cached and splitting up the Mutate function.

Differential Revision: https://reviews.llvm.org/D53972


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345840 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/ARM/ARMCodeGenPrepare.cpp          | 255 +++++++++++++-----
 test/CodeGen/ARM/{ => CGP}/arm-cgp-calls.ll   |   0
 test/CodeGen/ARM/{ => CGP}/arm-cgp-casts.ll   |   0
 test/CodeGen/ARM/{ => CGP}/arm-cgp-icmps.ll   |   0
 .../CodeGen/ARM/{ => CGP}/arm-cgp-overflow.ll |  52 ++++
 .../CodeGen/ARM/{ => CGP}/arm-cgp-phis-ret.ll |   0
 .../CodeGen/ARM/{ => CGP}/arm-cgp-pointers.ll |   0
 .../ARM/{ => CGP}/arm-cgp-signed-icmps.ll     |   0
 test/CodeGen/ARM/{ => CGP}/arm-cgp-signed.ll  |   0
 9 files changed, 238 insertions(+), 69 deletions(-)
 rename test/CodeGen/ARM/{ => CGP}/arm-cgp-calls.ll (100%)
 rename test/CodeGen/ARM/{ => CGP}/arm-cgp-casts.ll (100%)
 rename test/CodeGen/ARM/{ => CGP}/arm-cgp-icmps.ll (100%)
 rename test/CodeGen/ARM/{ => CGP}/arm-cgp-overflow.ll (78%)
 rename test/CodeGen/ARM/{ => CGP}/arm-cgp-phis-ret.ll (100%)
 rename test/CodeGen/ARM/{ => CGP}/arm-cgp-pointers.ll (100%)
 rename test/CodeGen/ARM/{ => CGP}/arm-cgp-signed-icmps.ll (100%)
 rename test/CodeGen/ARM/{ => CGP}/arm-cgp-signed.ll (100%)

diff --git a/lib/Target/ARM/ARMCodeGenPrepare.cpp b/lib/Target/ARM/ARMCodeGenPrepare.cpp
index fb9fad472d9..2403b9e1327 100644
--- a/lib/Target/ARM/ARMCodeGenPrepare.cpp
+++ b/lib/Target/ARM/ARMCodeGenPrepare.cpp
@@ -110,11 +110,26 @@ namespace {
 class IRPromoter {
   SmallPtrSet<Value*, 8> NewInsts;
   SmallVector<Instruction*, 4> InstsToRemove;
+  DenseMap<Value*, Type*> TruncTysMap;
+  SmallPtrSet<Value*, 8> Promoted;
   Module *M = nullptr;
   LLVMContext &Ctx;
+  Type *ExtTy = nullptr;
+  Type *OrigTy = nullptr;
+
+  void PrepareConstants(SmallPtrSetImpl<Value*> &Visited,
+                         SmallPtrSetImpl<Instruction*> &SafeToPromote);
+  void ExtendSources(SmallPtrSetImpl<Value*> &Sources);
+  void PromoteTree(SmallPtrSetImpl<Value*> &Visited,
+                   SmallPtrSetImpl<Value*> &Sources,
+                   SmallPtrSetImpl<Instruction*> &Sinks,
+                   SmallPtrSetImpl<Instruction*> &SafeToPromote);
+  void TruncateSinks(SmallPtrSetImpl<Value*> &Sources,
+                     SmallPtrSetImpl<Instruction*> &Sinks);
 
 public:
-  IRPromoter(Module *M) : M(M), Ctx(M->getContext()) { }
+  IRPromoter(Module *M) : M(M), Ctx(M->getContext()),
+                          ExtTy(Type::getInt32Ty(Ctx)) { }
 
   void Cleanup() {
     for (auto *I : InstsToRemove) {
@@ -129,14 +144,17 @@ public:
   void Mutate(Type *OrigTy,
               SmallPtrSetImpl<Value*> &Visited,
               SmallPtrSetImpl<Value*> &Sources,
-              SmallPtrSetImpl<Instruction*> &Sinks);
+              SmallPtrSetImpl<Instruction*> &Sinks,
+              SmallPtrSetImpl<Instruction*> &SafeToPromote);
 };
 
 class ARMCodeGenPrepare : public FunctionPass {
   const ARMSubtarget *ST = nullptr;
   IRPromoter *Promoter = nullptr;
   std::set<Value*> AllVisited;
+  SmallPtrSet<Instruction*, 8> SafeToPromote;
 
+  bool isSafeOverflow(Instruction *I);
   bool isSupportedValue(Value *V);
   bool isLegalToPromote(Value *V);
   bool TryToPromote(Value *V);
@@ -241,8 +259,8 @@ static bool isSink(Value *V) {
 }
 
 /// Return whether the instruction can be promoted within any modifications to
-/// it's operands or result.
-static bool isSafeOverflow(Instruction *I) {
+/// its operands or result.
+bool ARMCodeGenPrepare::isSafeOverflow(Instruction *I) {
   // FIXME Do we need NSW too?
   if (isa<OverflowingBinaryOperator>(I) && I->hasNoUnsignedWrap())
     return true;
@@ -386,11 +404,13 @@ static bool isPromotedResultSafe(Value *V) {
   // If I is only being used by something that will require its value to be
   // truncated, then we don't care about the promoted result.
   auto *I = cast<Instruction>(V);
-  if (I->hasOneUse() && isSink(*I->use_begin()))
+  if (I->hasOneUse() && isSink(*I->use_begin())) {
+    LLVM_DEBUG(dbgs() << "ARM CGP: Only use is a sink: " << *V << "\n");
     return true;
+  }
 
   if (isa<OverflowingBinaryOperator>(I))
-    return isSafeOverflow(I);
+    return false;
   return true;
 }
 
@@ -414,56 +434,84 @@ static Intrinsic::ID getNarrowIntrinsic(Instruction *I) {
   llvm_unreachable("unhandled opcode for narrow intrinsic");
 }
 
-void IRPromoter::Mutate(Type *OrigTy,
-                        SmallPtrSetImpl<Value*> &Visited,
-                        SmallPtrSetImpl<Value*> &Sources,
-                        SmallPtrSetImpl<Instruction*> &Sinks) {
+static void ReplaceAllUsersOfWith(Value *From, Value *To) {
+  SmallVector<Instruction*, 4> Users;
+  Instruction *InstTo = dyn_cast<Instruction>(To);
+  for (Use &U : From->uses()) {
+    auto *User = cast<Instruction>(U.getUser());
+    if (InstTo && User->isIdenticalTo(InstTo))
+      continue;
+    Users.push_back(User);
+  }
+
+  for (auto *U : Users)
+    U->replaceUsesOfWith(From, To);
+}
+
+void
+IRPromoter::PrepareConstants(SmallPtrSetImpl<Value*> &Visited,
+                             SmallPtrSetImpl<Instruction*> &SafeToPromote) {
   IRBuilder<> Builder{Ctx};
-  Type *ExtTy = Type::getInt32Ty(M->getContext());
-  SmallPtrSet<Value*, 8> Promoted;
-  LLVM_DEBUG(dbgs() << "ARM CGP: Promoting use-def chains to from "
-             << ARMCodeGenPrepare::TypeSize << " to 32-bits\n");
+  // First step is to prepare the instructions for mutation. Most constants
+  // just need to be zero extended into their new type, but complications arise
+  // because:
+  // - For nuw binary operators, negative immediates would need sign extending;
+  //   however, instead we'll change them to positive and zext them. We can do
+  //   this because:
+  //   > The operators that can wrap are: add, sub, mul and shl.
+  //   > shl interprets its second operand as unsigned and if the first operand
+  //     is an immediate, it will need zext to be nuw.
+  //   > I'm assuming mul cannot be nuw while using a negative immediate...
+  //   > Which leaves the nuw add and sub to be handled; as with shl, if an
+  //     immediate is used as operand 0, it will need zext to be nuw.
+  // - We also allow add and sub to safely overflow in certain circumstances
+  //   and only when the value (operand 0) is being decreased.
+  //
+  // For adds and subs, that are either nuw or safely wrap and use a negative
+  // immediate as operand 1, we create an equivalent instruction using a
+  // positive immediate. That positive immediate can then be zext along with
+  // all the other immediates later.
+  for (auto *V : Visited) {
+    if (!isa<Instruction>(V))
+      continue;
 
-  // Cache original types.
-  DenseMap<Value*, Type*> TruncTysMap;
-  for (auto *V : Visited)
-    TruncTysMap[V] = V->getType();
+    auto *I = cast<Instruction>(V);
+    if (SafeToPromote.count(I)) {
 
-  auto ReplaceAllUsersOfWith = [&](Value *From, Value *To) {
-    SmallVector<Instruction*, 4> Users;
-    Instruction *InstTo = dyn_cast<Instruction>(To);
-    for (Use &U : From->uses()) {
-      auto *User = cast<Instruction>(U.getUser());
-      if (InstTo && User->isIdenticalTo(InstTo))
+      if (!isa<OverflowingBinaryOperator>(I))
         continue;
-      Users.push_back(User);
-    }
 
-    for (auto *U : Users)
-      U->replaceUsesOfWith(From, To);
-  };
-
-  auto FixConst = [&](ConstantInt *Const, Instruction *I) {
-    Constant *NewConst = isSafeOverflow(I) && Const->isNegative() ?
-      ConstantExpr::getSExt(Const, ExtTy) :
-      ConstantExpr::getZExt(Const, ExtTy);
-    I->replaceUsesOfWith(Const, NewConst);
-  };
+      if (auto *Const = dyn_cast<ConstantInt>(I->getOperand(1))) {
+        if (!Const->isNegative())
+          break;
+
+        unsigned Opc = I->getOpcode();
+        assert((Opc == Instruction::Add || Opc == Instruction::Sub) &&
+               "expected only an add or sub to use a negative imm");
+
+        LLVM_DEBUG(dbgs() << "ARM CGP: Adjusting " << *I << "\n");
+        auto *NewConst = ConstantInt::get(Ctx, Const->getValue().abs());
+        Builder.SetInsertPoint(I);
+        Value *NewVal = Opc == Instruction::Sub ?
+          Builder.CreateAdd(I->getOperand(0), NewConst) :
+          Builder.CreateSub(I->getOperand(0), NewConst);
+        LLVM_DEBUG(dbgs() << "ARM CGP: New equivalent: " << *NewVal << "\n");
+
+        if (auto *NewInst = dyn_cast<Instruction>(NewVal)) {
+          NewInst->copyIRFlags(I);
+          NewInsts.insert(NewInst);
+        }
+        InstsToRemove.push_back(I);
+        I->replaceAllUsesWith(NewVal);
+      }
+    }
+  }
+  for (auto *I : NewInsts)
+    Visited.insert(I);
+}
 
-  auto InsertDSPIntrinsic = [&](Instruction *I) {
-    LLVM_DEBUG(dbgs() << "ARM CGP: Inserting DSP intrinsic for "
-               << *I << "\n");
-    Function *DSPInst =
-      Intrinsic::getDeclaration(M, getNarrowIntrinsic(I));
-    Builder.SetInsertPoint(I);
-    Builder.SetCurrentDebugLocation(I->getDebugLoc());
-    Value *Args[] = { I->getOperand(0), I->getOperand(1) };
-    CallInst *Call = Builder.CreateCall(DSPInst, Args);
-    ReplaceAllUsersOfWith(I, Call);
-    InstsToRemove.push_back(I);
-    NewInsts.insert(Call);
-    TruncTysMap[Call] = OrigTy;
-  };
+void IRPromoter::ExtendSources(SmallPtrSetImpl<Value*> &Sources) {
+  IRBuilder<> Builder{Ctx};
 
   auto InsertZExt = [&](Value *V, Instruction *InsertPt) {
     LLVM_DEBUG(dbgs() << "ARM CGP: Inserting ZExt for " << *V << "\n");
@@ -480,7 +528,8 @@ void IRPromoter::Mutate(Type *OrigTy,
     TruncTysMap[ZExt] = TruncTysMap[V];
   };
 
-  // First, insert extending instructions between the sources and their users.
+
+  // Now, insert extending instructions between the sources and their users.
   LLVM_DEBUG(dbgs() << "ARM CGP: Promoting sources:\n");
   for (auto V : Sources) {
     LLVM_DEBUG(dbgs() << " - " << *V << "\n");
@@ -494,9 +543,17 @@ void IRPromoter::Mutate(Type *OrigTy,
     }
     Promoted.insert(V);
   }
+}
 
+void IRPromoter::PromoteTree(SmallPtrSetImpl<Value*> &Visited,
+                             SmallPtrSetImpl<Value*> &Sources,
+                             SmallPtrSetImpl<Instruction*> &Sinks,
+                             SmallPtrSetImpl<Instruction*> &SafeToPromote) {
   LLVM_DEBUG(dbgs() << "ARM CGP: Mutating the tree..\n");
-  // Then mutate the types of the instructions within the tree. Here we handle
+
+  IRBuilder<> Builder{Ctx};
+
+  // Mutate the types of the instructions within the tree. Here we handle
   // constant operands.
   for (auto *V : Visited) {
     if (Sources.count(V))
@@ -511,9 +568,10 @@ void IRPromoter::Mutate(Type *OrigTy,
       if ((Op->getType() == ExtTy) || !isa<IntegerType>(Op->getType()))
         continue;
 
-      if (auto *Const = dyn_cast<ConstantInt>(Op))
-        FixConst(Const, I);
-      else if (isa<UndefValue>(Op))
+      if (auto *Const = dyn_cast<ConstantInt>(Op)) {
+        Constant *NewConst = ConstantExpr::getZExt(Const, ExtTy);
+        I->setOperand(i, NewConst);
+      } else if (isa<UndefValue>(Op))
         I->setOperand(i, UndefValue::get(ExtTy));
     }
 
@@ -523,20 +581,42 @@ void IRPromoter::Mutate(Type *OrigTy,
     }
   }
 
-  // Now we need to remove any zexts that have become unnecessary, as well
-  // as insert any intrinsics.
+  // Finally, any instructions that should be promoted but haven't yet been,
+  // need to be handled using intrinsics.
   for (auto *V : Visited) {
-    if (Sources.count(V))
+    auto *I = dyn_cast<Instruction>(V);
+    if (!I)
       continue;
 
-    if (!shouldPromote(V) || isPromotedResultSafe(V))
+    if (Sources.count(I) || Sinks.count(I))
       continue;
 
+    if (!shouldPromote(I) || SafeToPromote.count(I) || NewInsts.count(I))
+      continue;
+  
     assert(EnableDSP && "DSP intrinisc insertion not enabled!");
 
     // Replace unsafe instructions with appropriate intrinsic calls.
-    InsertDSPIntrinsic(cast<Instruction>(V));
+    LLVM_DEBUG(dbgs() << "ARM CGP: Inserting DSP intrinsic for "
+               << *I << "\n");
+    Function *DSPInst =
+      Intrinsic::getDeclaration(M, getNarrowIntrinsic(I));
+    Builder.SetInsertPoint(I);
+    Builder.SetCurrentDebugLocation(I->getDebugLoc());
+    Value *Args[] = { I->getOperand(0), I->getOperand(1) };
+    CallInst *Call = Builder.CreateCall(DSPInst, Args);
+    ReplaceAllUsersOfWith(I, Call);
+    InstsToRemove.push_back(I);
+    NewInsts.insert(Call);
+    TruncTysMap[Call] = OrigTy;
   }
+}
+
+void IRPromoter::TruncateSinks(SmallPtrSetImpl<Value*> &Sources,
+                               SmallPtrSetImpl<Instruction*> &Sinks) {
+  LLVM_DEBUG(dbgs() << "ARM CGP: Fixing up the sinks:\n");
+
+  IRBuilder<> Builder{Ctx};
 
   auto InsertTrunc = [&](Value *V) -> Instruction* {
     if (!isa<Instruction>(V) || !isa<IntegerType>(V->getType()))
@@ -558,7 +638,6 @@ void IRPromoter::Mutate(Type *OrigTy,
     return Trunc;
   };
 
-  LLVM_DEBUG(dbgs() << "ARM CGP: Fixing up the sinks:\n");
   // Fix up any stores or returns that use the results of the promoted
   // chain.
   for (auto I : Sinks) {
@@ -584,6 +663,36 @@ void IRPromoter::Mutate(Type *OrigTy,
       }
     }
   }
+}
+
+void IRPromoter::Mutate(Type *OrigTy,
+                        SmallPtrSetImpl<Value*> &Visited,
+                        SmallPtrSetImpl<Value*> &Sources,
+                        SmallPtrSetImpl<Instruction*> &Sinks,
+                        SmallPtrSetImpl<Instruction*> &SafeToPromote) {
+  LLVM_DEBUG(dbgs() << "ARM CGP: Promoting use-def chains to from "
+             << ARMCodeGenPrepare::TypeSize << " to 32-bits\n");
+  this->OrigTy = OrigTy;
+
+  // Cache original types.
+  for (auto *V : Visited)
+    TruncTysMap[V] = V->getType();
+
+  // Convert adds and subs using negative immediates to equivalent instructions
+  // that use positive constants.
+  PrepareConstants(Visited, SafeToPromote);
+
+  // Insert zext instructions between sources and their users.
+  ExtendSources(Sources);
+
+  // Promote visited instructions, mutating their types in place. Also insert
+  // DSP intrinsics, if enabled, for adds and subs which would be unsafe to
+  // promote.
+  PromoteTree(Visited, Sources, Sinks, SafeToPromote);
+
+  // Finally, insert trunc instructions for use by calls, stores etc...
+  TruncateSinks(Sources, Sinks);
+
   LLVM_DEBUG(dbgs() << "ARM CGP: Mutation complete:\n");
   LLVM_DEBUG(dbgs();
              for (auto *V : Sources)
@@ -651,11 +760,20 @@ bool ARMCodeGenPrepare::isSupportedValue(Value *V) {
 /// smaller than the targeted promoted type. Check that we're not trying to
 /// promote something larger than our base 'TypeSize' type.
 bool ARMCodeGenPrepare::isLegalToPromote(Value *V) {
-  if (isPromotedResultSafe(V))
-    return true;
 
   auto *I = dyn_cast<Instruction>(V);
   if (!I)
+    return true;
+
+  if (SafeToPromote.count(I))
+   return true;
+
+  if (isPromotedResultSafe(V) || isSafeOverflow(I)) {
+    SafeToPromote.insert(I);
+    return true;
+  }
+
+  if (I->getOpcode() != Instruction::Add && I->getOpcode() != Instruction::Sub)
     return false;
 
   // If promotion is not safe, can we use a DSP instruction to natively
@@ -666,9 +784,6 @@ bool ARMCodeGenPrepare::isLegalToPromote(Value *V) {
   if (ST->isThumb() && !ST->hasThumb2())
     return false;
 
-  if (I->getOpcode() != Instruction::Add && I->getOpcode() != Instruction::Sub)
-    return false;
-
   // TODO
   // Would it be profitable? For Thumb code, these parallel DSP instructions
   // are only Thumb-2, so we wouldn't be able to dual issue on Cortex-M33. For
@@ -680,6 +795,7 @@ bool ARMCodeGenPrepare::isLegalToPromote(Value *V) {
         return false;
     }
   }
+  LLVM_DEBUG(dbgs() << "ARM CGP: Will use an intrinsic for: " << *I << "\n");
   return true;
 }
 
@@ -689,6 +805,8 @@ bool ARMCodeGenPrepare::TryToPromote(Value *V) {
   if (TypeSize > 16 || TypeSize < 8)
     return false;
 
+  SafeToPromote.clear();
+
   if (!isSupportedValue(V) || !shouldPromote(V) || !isLegalToPromote(V))
     return false;
 
@@ -698,9 +816,8 @@ bool ARMCodeGenPrepare::TryToPromote(Value *V) {
   SetVector<Value*> WorkList;
   SmallPtrSet<Value*, 8> Sources;
   SmallPtrSet<Instruction*, 4> Sinks;
-  WorkList.insert(V);
   SmallPtrSet<Value*, 16> CurrentVisited;
-  CurrentVisited.clear();
+  WorkList.insert(V);
 
   // Return true if V was added to the worklist as a supported instruction,
   // if it was already visited, or if we don't need to explore it (e.g.
@@ -783,7 +900,7 @@ bool ARMCodeGenPrepare::TryToPromote(Value *V) {
   if (ToPromote < 2)
     return false;
 
-  Promoter->Mutate(OrigTy, CurrentVisited, Sources, Sinks);
+  Promoter->Mutate(OrigTy, CurrentVisited, Sources, Sinks, SafeToPromote);
   return true;
 }
 
diff --git a/test/CodeGen/ARM/arm-cgp-calls.ll b/test/CodeGen/ARM/CGP/arm-cgp-calls.ll
similarity index 100%
rename from test/CodeGen/ARM/arm-cgp-calls.ll
rename to test/CodeGen/ARM/CGP/arm-cgp-calls.ll
diff --git a/test/CodeGen/ARM/arm-cgp-casts.ll b/test/CodeGen/ARM/CGP/arm-cgp-casts.ll
similarity index 100%
rename from test/CodeGen/ARM/arm-cgp-casts.ll
rename to test/CodeGen/ARM/CGP/arm-cgp-casts.ll
diff --git a/test/CodeGen/ARM/arm-cgp-icmps.ll b/test/CodeGen/ARM/CGP/arm-cgp-icmps.ll
similarity index 100%
rename from test/CodeGen/ARM/arm-cgp-icmps.ll
rename to test/CodeGen/ARM/CGP/arm-cgp-icmps.ll
diff --git a/test/CodeGen/ARM/arm-cgp-overflow.ll b/test/CodeGen/ARM/CGP/arm-cgp-overflow.ll
similarity index 78%
rename from test/CodeGen/ARM/arm-cgp-overflow.ll
rename to test/CodeGen/ARM/CGP/arm-cgp-overflow.ll
index d0c191cc542..8e10876c0b1 100644
--- a/test/CodeGen/ARM/arm-cgp-overflow.ll
+++ b/test/CodeGen/ARM/CGP/arm-cgp-overflow.ll
@@ -168,6 +168,7 @@ define i32 @safe_sub_underflow_neg(i8 zeroext %a) {
   ret i32 %res
 }
 
+; CHECK-LABEL: unsafe_sub_underflow_neg
 ; CHECK:  subs r0, #4
 ; CHECK:  uxtb [[EXT:r[0-9]+]], r0
 ; CHECK:  cmp [[EXT]], #253
@@ -178,3 +179,54 @@ define i32 @unsafe_sub_underflow_neg(i8 zeroext %a) {
   %res = select i1 %cmp, i32 8, i32 16
   ret i32 %res
 }
+
+; CHECK:      rsb.w [[RSUB:r[0-9]+]], r0, #248
+; CHECK-NOT:  uxt
+; CHECK:      cmp [[RSUB]], #252
+define i32 @safe_sub_imm_var(i8* %b) {
+entry:
+  %0 = load i8, i8* %b, align 1
+  %sub = sub nuw nsw i8 -8, %0
+  %cmp = icmp ugt i8 %sub, 252
+  %conv4 = zext i1 %cmp to i32
+  ret i32 %conv4
+}
+
+; CHECK-LABEL: safe_sub_var_imm
+; CHECK:      add.w [[ADD:r[0-9]+]], r0, #8
+; CHECK-NOT:  uxt
+; CHECK:      cmp [[ADD]], #252
+define i32 @safe_sub_var_imm(i8* %b) {
+entry:
+  %0 = load i8, i8* %b, align 1
+  %sub = sub nuw nsw i8 %0, -8
+  %cmp = icmp ugt i8 %sub, 252
+  %conv4 = zext i1 %cmp to i32
+  ret i32 %conv4
+}
+
+; CHECK-LABEL: safe_add_imm_var
+; CHECK:      add.w [[ADD:r[0-9]+]], r0, #129
+; CHECK-NOT:  uxt
+; CHECK:      cmp [[ADD]], #127
+define i32 @safe_add_imm_var(i8* %b) {
+entry:
+  %0 = load i8, i8* %b, align 1
+  %add = add nuw nsw i8 -127, %0
+  %cmp = icmp ugt i8 %add, 127
+  %conv4 = zext i1 %cmp to i32
+  ret i32 %conv4
+}
+
+; CHECK-LABEL: safe_add_var_imm
+; CHECK:      sub.w [[SUB:r[0-9]+]], r0, #127
+; CHECK-NOT:  uxt
+; CHECK:      cmp [[SUB]], #127
+define i32 @safe_add_var_imm(i8* %b) {
+entry:
+  %0 = load i8, i8* %b, align 1
+  %add = add nuw nsw i8 %0, -127
+  %cmp = icmp ugt i8 %add, 127
+  %conv4 = zext i1 %cmp to i32
+  ret i32 %conv4
+}
diff --git a/test/CodeGen/ARM/arm-cgp-phis-ret.ll b/test/CodeGen/ARM/CGP/arm-cgp-phis-ret.ll
similarity index 100%
rename from test/CodeGen/ARM/arm-cgp-phis-ret.ll
rename to test/CodeGen/ARM/CGP/arm-cgp-phis-ret.ll
diff --git a/test/CodeGen/ARM/arm-cgp-pointers.ll b/test/CodeGen/ARM/CGP/arm-cgp-pointers.ll
similarity index 100%
rename from test/CodeGen/ARM/arm-cgp-pointers.ll
rename to test/CodeGen/ARM/CGP/arm-cgp-pointers.ll
diff --git a/test/CodeGen/ARM/arm-cgp-signed-icmps.ll b/test/CodeGen/ARM/CGP/arm-cgp-signed-icmps.ll
similarity index 100%
rename from test/CodeGen/ARM/arm-cgp-signed-icmps.ll
rename to test/CodeGen/ARM/CGP/arm-cgp-signed-icmps.ll
diff --git a/test/CodeGen/ARM/arm-cgp-signed.ll b/test/CodeGen/ARM/CGP/arm-cgp-signed.ll
similarity index 100%
rename from test/CodeGen/ARM/arm-cgp-signed.ll
rename to test/CodeGen/ARM/CGP/arm-cgp-signed.ll
-- 
GitLab


From 14550f9fb5e160399f16b64e45712e5a902e316c Mon Sep 17 00:00:00 2001
From: Daniel Sanders <daniel_l_sanders@apple.com>
Date: Thu, 1 Nov 2018 15:41:11 +0000
Subject: [PATCH 0860/1116] [MC] Implement EmitRawText in MCNullStreamer

Summary:
This adds dummy implementation of `EmitRawText` in `MCNullStreamer`.

This fixes the behavior of `AsmPrinter` with `MCNullStreamer` on targets
on which no integrated assembler is used. An attempt to emit inline asm
on such a target would previously lead to a crash, since `AsmPrinter` does not
check for `hasRawTextSupport` in `EmitInlineAsm` and calls `EmitRawText`
anyway if integrated assembler is disabled (the behavior has changed
in D2686).

Error message printed by MCStreamer:

> EmitRawText called on an MCStreamer that doesn't support it, something
> must not be fully mc'ized

Patch by Eugene Sharygin

Reviewers: dsanders, echristo

Reviewed By: dsanders

Subscribers: eraman, llvm-commits

Differential Revision: https://reviews.llvm.org/D53938

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345841 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/MC/MCNullStreamer.cpp                        | 4 ++++
 test/CodeGen/Hexagon/inline-asm-filetype-null.ll | 8 ++++++++
 2 files changed, 12 insertions(+)
 create mode 100644 test/CodeGen/Hexagon/inline-asm-filetype-null.ll

diff --git a/lib/MC/MCNullStreamer.cpp b/lib/MC/MCNullStreamer.cpp
index a96dec18444..4e97e7550bc 100644
--- a/lib/MC/MCNullStreamer.cpp
+++ b/lib/MC/MCNullStreamer.cpp
@@ -7,6 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/StringRef.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
@@ -22,6 +23,9 @@ namespace {
     /// @name MCStreamer Interface
     /// @{
 
+    bool hasRawTextSupport() const override { return true; }
+    void EmitRawTextImpl(StringRef String) override {}
+
     bool EmitSymbolAttribute(MCSymbol *Symbol,
                              MCSymbolAttr Attribute) override {
       return true;
diff --git a/test/CodeGen/Hexagon/inline-asm-filetype-null.ll b/test/CodeGen/Hexagon/inline-asm-filetype-null.ll
new file mode 100644
index 00000000000..9fbbcff71f5
--- /dev/null
+++ b/test/CodeGen/Hexagon/inline-asm-filetype-null.ll
@@ -0,0 +1,8 @@
+; RUN: llc -filetype=null < %s
+
+target triple = "hexagon"
+
+define void @foo() {
+  tail call void asm sideeffect "//", ""()
+  ret void
+}
-- 
GitLab


From 69778b6a71151ab152c3daa8ffc245ef11f559a3 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Thu, 1 Nov 2018 15:41:12 +0000
Subject: [PATCH 0861/1116] [DAGCombiner] make sure we have a whole-number
 extract before trying to narrow a vector op (PR39511)

The test causes a crash because we were trying to extract v4f32 to v3f32, and the
narrowing factor was then 4/3 = 1 producing a bogus narrow type.

This should fix:
https://bugs.llvm.org/show_bug.cgi?id=39511


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345842 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  6 +++++-
 test/CodeGen/X86/vector-narrow-binop.ll  | 18 ++++++++++++++++++
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index e6ea4898717..d0c898f2e97 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16708,10 +16708,14 @@ static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) {
   assert(ExtractIndex % NumElems == 0 &&
          "Extract index is not a multiple of the vector length.");
   EVT SrcVT = Extract->getOperand(0).getValueType();
+
+  // Bail out if this is not a proper multiple width extraction.
   unsigned NumSrcElems = SrcVT.getVectorNumElements();
-  unsigned NarrowingRatio = NumSrcElems / NumElems;
+  if (NumSrcElems % NumElems != 0)
+    return SDValue();
 
   // Bail out if the target does not support a narrower version of the binop.
+  unsigned NarrowingRatio = NumSrcElems / NumElems;
   unsigned BOpcode = BinOp.getOpcode();
   unsigned WideNumElts = WideBVT.getVectorNumElements();
   EVT NarrowBVT = EVT::getVectorVT(*DAG.getContext(), WideBVT.getScalarType(),
diff --git a/test/CodeGen/X86/vector-narrow-binop.ll b/test/CodeGen/X86/vector-narrow-binop.ll
index 9b05ce4485e..c20dc09a6b2 100644
--- a/test/CodeGen/X86/vector-narrow-binop.ll
+++ b/test/CodeGen/X86/vector-narrow-binop.ll
@@ -80,3 +80,21 @@ define <4 x i32> @do_not_use_256bit_op(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c,
   ret <4 x i32> %sub
 }
 
+; When extracting from a vector binop, the source width should be a multiple of the destination width.
+; https://bugs.llvm.org/show_bug.cgi?id=39511
+
+define <3 x float> @PR39511(<4 x float> %t0, <3 x float>* %b) {
+; SSE-LABEL: PR39511:
+; SSE:       # %bb.0:
+; SSE-NEXT:    addps {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: PR39511:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %add = fadd <4 x float> %t0, <float 1.0, float 2.0, float 3.0, float 4.0>
+  %ext = shufflevector <4 x float> %add, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  ret <3 x float> %ext
+}
+
-- 
GitLab


From b530051d91442121708d68a7bbe521283d100f2a Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Thu, 1 Nov 2018 16:02:12 +0000
Subject: [PATCH 0862/1116] [llvm-objcopy] Use proper cases

Reviewers: jhenderson, alexshap, jakehehrlich, espindola, rupprecht

Reviewed By: jhenderson, rupprecht

Subscribers: emaste, arichardson, rupprecht, llvm-commits

Differential Revision: https://reviews.llvm.org/D53971

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345845 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-objcopy/ELF/Object.cpp | 34 +++++++++++++++----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/tools/llvm-objcopy/ELF/Object.cpp b/tools/llvm-objcopy/ELF/Object.cpp
index 5b2138436d5..ef3029b477a 100644
--- a/tools/llvm-objcopy/ELF/Object.cpp
+++ b/tools/llvm-objcopy/ELF/Object.cpp
@@ -615,12 +615,12 @@ void GnuDebugLinkSection::init(StringRef File, StringRef Data) {
   // establish the order that sections should go in. By using the maximum
   // possible offset we cause this section to wind up at the end.
   OriginalOffset = std::numeric_limits<uint64_t>::max();
-  JamCRC crc;
-  crc.update(ArrayRef<char>(Data.data(), Data.size()));
+  JamCRC CRC;
+  CRC.update(ArrayRef<char>(Data.data(), Data.size()));
   // The CRC32 value needs to be complemented because the JamCRC dosn't
   // finalize the CRC32 value. It also dosn't negate the initial CRC32 value
   // but it starts by default at 0xFFFFFFFF which is the complement of zero.
-  CRC32 = ~crc.getCRC();
+  CRC32 = ~CRC.getCRC();
 }
 
 GnuDebugLinkSection::GnuDebugLinkSection(StringRef File) : FileName(File) {
@@ -748,7 +748,7 @@ void BinaryELFBuilder<ELFT>::addData(SymbolTableSection *SymTab) {
 
   std::string SanitizedFilename = MemBuf->getBufferIdentifier().str();
   std::replace_if(std::begin(SanitizedFilename), std::end(SanitizedFilename),
-                  [](char c) { return !isalnum(c); }, '_');
+                  [](char C) { return !isalnum(C); }, '_');
   Twine Prefix = Twine("_binary_") + SanitizedFilename;
 
   SymTab->addSymbol(Prefix + "_start", STB_GLOBAL, STT_NOTYPE, &DataSection,
@@ -1128,20 +1128,20 @@ std::unique_ptr<Object> BinaryReader::create() const {
 
 std::unique_ptr<Object> ELFReader::create() const {
   auto Obj = llvm::make_unique<Object>();
-  if (auto *o = dyn_cast<ELFObjectFile<ELF32LE>>(Bin)) {
-    ELFBuilder<ELF32LE> Builder(*o, *Obj);
+  if (auto *O = dyn_cast<ELFObjectFile<ELF32LE>>(Bin)) {
+    ELFBuilder<ELF32LE> Builder(*O, *Obj);
     Builder.build();
     return Obj;
-  } else if (auto *o = dyn_cast<ELFObjectFile<ELF64LE>>(Bin)) {
-    ELFBuilder<ELF64LE> Builder(*o, *Obj);
+  } else if (auto *O = dyn_cast<ELFObjectFile<ELF64LE>>(Bin)) {
+    ELFBuilder<ELF64LE> Builder(*O, *Obj);
     Builder.build();
     return Obj;
-  } else if (auto *o = dyn_cast<ELFObjectFile<ELF32BE>>(Bin)) {
-    ELFBuilder<ELF32BE> Builder(*o, *Obj);
+  } else if (auto *O = dyn_cast<ELFObjectFile<ELF32BE>>(Bin)) {
+    ELFBuilder<ELF32BE> Builder(*O, *Obj);
     Builder.build();
     return Obj;
-  } else if (auto *o = dyn_cast<ELFObjectFile<ELF64BE>>(Bin)) {
-    ELFBuilder<ELF64BE> Builder(*o, *Obj);
+  } else if (auto *O = dyn_cast<ELFObjectFile<ELF64BE>>(Bin)) {
+    ELFBuilder<ELF64BE> Builder(*O, *Obj);
     Builder.build();
     return Obj;
   }
@@ -1308,7 +1308,7 @@ static uint64_t alignToAddr(uint64_t Offset, uint64_t Addr, uint64_t Align) {
 }
 
 // Orders segments such that if x = y->ParentSegment then y comes before x.
-static void OrderSegments(std::vector<Segment *> &Segments) {
+static void orderSegments(std::vector<Segment *> &Segments) {
   std::stable_sort(std::begin(Segments), std::end(Segments),
                    compareSegmentsByOffset);
 }
@@ -1350,7 +1350,7 @@ static uint64_t LayoutSegments(std::vector<Segment *> &Segments,
 // sections had a ParentSegment or an offset one past the last section if there
 // was a section that didn't have a ParentSegment.
 template <class Range>
-static uint64_t LayoutSections(Range Sections, uint64_t Offset) {
+static uint64_t layoutSections(Range Sections, uint64_t Offset) {
   // Now the offset of every segment has been set we can assign the offsets
   // of each section. For sections that are covered by a segment we should use
   // the segment's original offset and the section's original offset to compute
@@ -1394,13 +1394,13 @@ template <class ELFT> void ELFWriter<ELFT>::assignOffsets() {
     OrderedSegments.push_back(&Segment);
   OrderedSegments.push_back(&Obj.ElfHdrSegment);
   OrderedSegments.push_back(&Obj.ProgramHdrSegment);
-  OrderSegments(OrderedSegments);
+  orderSegments(OrderedSegments);
   // Offset is used as the start offset of the first segment to be laid out.
   // Since the ELF Header (ElfHdrSegment) must be at the start of the file,
   // we start at offset 0.
   uint64_t Offset = 0;
   Offset = LayoutSegments(OrderedSegments, Offset);
-  Offset = LayoutSections(Obj.sections(), Offset);
+  Offset = layoutSections(Obj.sections(), Offset);
   // If we need to write the section header table out then we need to align the
   // Offset so that SHOffset is valid.
   if (WriteSectionHeaders)
@@ -1585,7 +1585,7 @@ void BinaryWriter::finalize() {
       continue;
     AllocatedSections.push_back(&Section);
   }
-  LayoutSections(make_pointee_range(AllocatedSections), Offset);
+  layoutSections(make_pointee_range(AllocatedSections), Offset);
 
   // Now that every section has been laid out we just need to compute the total
   // file size. This might not be the same as the offset returned by
-- 
GitLab


From 0fc1aec92a5b39e690e1edc6108057069fc4a685 Mon Sep 17 00:00:00 2001
From: Zachary Turner <zturner@google.com>
Date: Thu, 1 Nov 2018 16:37:29 +0000
Subject: [PATCH 0863/1116] [NativePDB] Get LLDB types from PDB function types.

This adds basic support for getting function signature types
into LLDB's type system, including into clang's AST.  There are
a few edge cases which are not correctly handled, mostly dealing
with nested classes, but this isn't specific to functions and
apply equally to variable types.  Note that no attempt has been
made yet to deal with member function types, which will happen
in subsequent patches.

Differential Revision: https://reviews.llvm.org/D53951

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345848 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/DebugInfo/CodeView/SymbolDeserializer.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/llvm/DebugInfo/CodeView/SymbolDeserializer.h b/include/llvm/DebugInfo/CodeView/SymbolDeserializer.h
index b5479db97a1..6b5dd2d20d1 100644
--- a/include/llvm/DebugInfo/CodeView/SymbolDeserializer.h
+++ b/include/llvm/DebugInfo/CodeView/SymbolDeserializer.h
@@ -47,7 +47,7 @@ public:
     return Error::success();
   }
   template <typename T> static Expected<T> deserializeAs(CVSymbol Symbol) {
-    T Record(Symbol.kind());
+    T Record(static_cast<SymbolRecordKind>(Symbol.kind()));
     if (auto EC = deserializeAs<T>(Symbol, Record))
       return std::move(EC);
     return Record;
-- 
GitLab


From 376ec2b8fb9af4f3bb398c7b289b7c77ae9e9ce2 Mon Sep 17 00:00:00 2001
From: Sam Parker <sam.parker@arm.com>
Date: Thu, 1 Nov 2018 16:44:45 +0000
Subject: [PATCH 0864/1116] [ARM] Attempt to fix ppc64be buildbot

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345850 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/ARM/ARMCodeGenPrepare.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/lib/Target/ARM/ARMCodeGenPrepare.cpp b/lib/Target/ARM/ARMCodeGenPrepare.cpp
index 2403b9e1327..0a6ea9dc325 100644
--- a/lib/Target/ARM/ARMCodeGenPrepare.cpp
+++ b/lib/Target/ARM/ARMCodeGenPrepare.cpp
@@ -633,8 +633,9 @@ void IRPromoter::TruncateSinks(SmallPtrSetImpl<Value*> &Sources,
     LLVM_DEBUG(dbgs() << "ARM CGP: Creating " << *TruncTy << " Trunc for "
                << *V << "\n");
     Builder.SetInsertPoint(cast<Instruction>(V));
-    auto *Trunc = cast<Instruction>(Builder.CreateTrunc(V, TruncTy));
-    NewInsts.insert(Trunc);
+    auto *Trunc = dyn_cast<Instruction>(Builder.CreateTrunc(V, TruncTy));
+    if (Trunc)
+      NewInsts.insert(Trunc);
     return Trunc;
   };
 
-- 
GitLab


From b5216e928ab9b088a78741dddac2d4eec53e278b Mon Sep 17 00:00:00 2001
From: Cameron McInally <cameron.mcinally@nyu.edu>
Date: Thu, 1 Nov 2018 16:57:52 +0000
Subject: [PATCH 0865/1116] Fix whitespace in test/Assembler/fast-math-flags.ll

Differential Revision: https://reviews.llvm.org/D53981


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345851 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Assembler/fast-math-flags.ll | 138 +++++++++++++++---------------
 1 file changed, 69 insertions(+), 69 deletions(-)

diff --git a/test/Assembler/fast-math-flags.ll b/test/Assembler/fast-math-flags.ll
index 664b1bd271e..edff26e6d68 100644
--- a/test/Assembler/fast-math-flags.ll
+++ b/test/Assembler/fast-math-flags.ll
@@ -1,5 +1,5 @@
-; RUN: llvm-as < %s | llvm-dis | FileCheck %s
-; RUN: opt -S < %s | FileCheck %s
+; RUN: llvm-as < %s | llvm-dis | FileCheck -strict-whitespace %s
+; RUN: opt -S < %s | FileCheck -strict-whitespace %s
 ; RUN: verify-uselistorder %s
 
 @addr   = external global i64
@@ -11,67 +11,67 @@ declare float @foo(float)
 
 define float @none(float %x, float %y) {
 entry:
-; CHECK:  %vec = load  <3 x float>,  <3 x float>* @vec
-  %vec    = load  <3 x float>,  <3 x float>* @vec
+; CHECK:  %vec = load <3 x float>, <3 x float>* @vec
+  %vec    = load <3 x float>, <3 x float>* @vec
 ; CHECK:  %select = load i1, i1* @select
   %select = load i1, i1* @select
-; CHECK:  %arr    = load [3 x float], [3 x float]* @arr
+; CHECK:  %arr = load [3 x float], [3 x float]* @arr
   %arr    = load [3 x float], [3 x float]* @arr
 
-; CHECK:  %a = fadd  float %x, %y
-  %a = fadd  float %x, %y
-; CHECK:  %a_vec = fadd  <3 x float> %vec, %vec
-  %a_vec = fadd  <3 x float> %vec, %vec
-; CHECK:  %b = fsub  float %x, %y
-  %b = fsub  float %x, %y
-; CHECK:  %b_vec = fsub  <3 x float> %vec, %vec
-  %b_vec = fsub  <3 x float> %vec, %vec
-; CHECK:  %c = fmul  float %x, %y
-  %c = fmul  float %x, %y
-; CHECK:  %c_vec = fmul  <3 x float> %vec, %vec
-  %c_vec = fmul  <3 x float> %vec, %vec
-; CHECK:  %d = fdiv  float %x, %y
-  %d = fdiv  float %x, %y
-; CHECK:  %d_vec = fdiv  <3 x float> %vec, %vec
-  %d_vec = fdiv  <3 x float> %vec, %vec
-; CHECK:  %e = frem  float %x, %y
-  %e = frem  float %x, %y
-; CHECK:  %e_vec = frem  <3 x float> %vec, %vec
-  %e_vec = frem  <3 x float> %vec, %vec
-; CHECK:  ret  float %e
-  ret  float %e
+; CHECK:  %a = fadd float %x, %y
+  %a = fadd float %x, %y
+; CHECK:  %a_vec = fadd <3 x float> %vec, %vec
+  %a_vec = fadd <3 x float> %vec, %vec
+; CHECK:  %b = fsub float %x, %y
+  %b = fsub float %x, %y
+; CHECK:  %b_vec = fsub <3 x float> %vec, %vec
+  %b_vec = fsub <3 x float> %vec, %vec
+; CHECK:  %c = fmul float %x, %y
+  %c = fmul float %x, %y
+; CHECK:  %c_vec = fmul <3 x float> %vec, %vec
+  %c_vec = fmul <3 x float> %vec, %vec
+; CHECK:  %d = fdiv float %x, %y
+  %d = fdiv float %x, %y
+; CHECK:  %d_vec = fdiv <3 x float> %vec, %vec
+  %d_vec = fdiv <3 x float> %vec, %vec
+; CHECK:  %e = frem float %x, %y
+  %e = frem float %x, %y
+; CHECK:  %e_vec = frem <3 x float> %vec, %vec
+  %e_vec = frem <3 x float> %vec, %vec
+; CHECK:  ret float %e
+  ret float %e
 }
 
 ; CHECK: no_nan
 define float @no_nan(float %x, float %y) {
 entry:
 ; CHECK:  %vec = load <3 x float>, <3 x float>* @vec
-  %vec    = load  <3 x float>,  <3 x float>* @vec
+  %vec    = load <3 x float>, <3 x float>* @vec
 ; CHECK:  %select = load i1, i1* @select
   %select = load i1, i1* @select
-; CHECK:  %arr = load  [3 x float],  [3 x float]* @arr
-  %arr    = load  [3 x float],  [3 x float]* @arr
+; CHECK:  %arr = load [3 x float], [3 x float]* @arr
+  %arr    = load [3 x float], [3 x float]* @arr
 
-; CHECK:  %a = fadd nnan  float %x, %y
-  %a = fadd nnan  float %x, %y
-; CHECK:  %a_vec = fadd nnan  <3 x float> %vec, %vec
-  %a_vec = fadd nnan  <3 x float> %vec, %vec
-; CHECK:  %b = fsub nnan  float %x, %y
-  %b = fsub nnan  float %x, %y
-; CHECK:  %b_vec = fsub nnan  <3 x float> %vec, %vec
-  %b_vec = fsub nnan  <3 x float> %vec, %vec
-; CHECK:  %c = fmul nnan  float %x, %y
-  %c = fmul nnan  float %x, %y
-; CHECK:  %c_vec = fmul nnan  <3 x float> %vec, %vec
+; CHECK:  %a = fadd nnan float %x, %y
+  %a = fadd nnan float %x, %y
+; CHECK:  %a_vec = fadd nnan <3 x float> %vec, %vec
+  %a_vec = fadd nnan <3 x float> %vec, %vec
+; CHECK:  %b = fsub nnan float %x, %y
+  %b = fsub nnan float %x, %y
+; CHECK:  %b_vec = fsub nnan <3 x float> %vec, %vec
+  %b_vec = fsub nnan <3 x float> %vec, %vec
+; CHECK:  %c = fmul nnan float %x, %y
+  %c = fmul nnan float %x, %y
+; CHECK:  %c_vec = fmul nnan <3 x float> %vec, %vec
   %c_vec = fmul nnan <3 x float> %vec, %vec
-; CHECK:  %d = fdiv nnan  float %x, %y
+; CHECK:  %d = fdiv nnan float %x, %y
   %d = fdiv nnan float %x, %y
-; CHECK:  %d_vec = fdiv nnan  <3 x float> %vec, %vec
+; CHECK:  %d_vec = fdiv nnan <3 x float> %vec, %vec
   %d_vec = fdiv nnan <3 x float> %vec, %vec
-; CHECK:  %e = frem nnan  float %x, %y
-  %e = frem nnan  float %x, %y
-; CHECK:  %e_vec = frem nnan  <3 x float> %vec, %vec
-  %e_vec = frem nnan  <3 x float> %vec, %vec
+; CHECK:  %e = frem nnan float %x, %y
+  %e = frem nnan float %x, %y
+; CHECK:  %e_vec = frem nnan <3 x float> %vec, %vec
+  %e_vec = frem nnan <3 x float> %vec, %vec
 ; CHECK:  ret float %e
   ret float %e
 }
@@ -120,28 +120,28 @@ entry:
 ; CHECK:  %arr = load [3 x float], [3 x float]* @arr
   %arr    = load [3 x float], [3 x float]* @arr
 
-; CHECK:  %a = fadd nnan ninf  float %x, %y
-  %a = fadd ninf nnan  float %x, %y
-; CHECK:  %a_vec = fadd nnan  <3 x float> %vec, %vec
-  %a_vec = fadd nnan  <3 x float> %vec, %vec
-; CHECK:  %b = fsub nnan  float %x, %y
-  %b = fsub nnan  float %x, %y
-; CHECK:  %b_vec = fsub nnan ninf  <3 x float> %vec, %vec
-  %b_vec = fsub ninf nnan  <3 x float> %vec, %vec
-; CHECK:  %c = fmul nnan  float %x, %y
-  %c = fmul nnan  float %x, %y
-; CHECK:  %c_vec = fmul nnan  <3 x float> %vec, %vec
+; CHECK:  %a = fadd nnan ninf float %x, %y
+  %a = fadd ninf nnan float %x, %y
+; CHECK:  %a_vec = fadd nnan <3 x float> %vec, %vec
+  %a_vec = fadd nnan <3 x float> %vec, %vec
+; CHECK:  %b = fsub nnan float %x, %y
+  %b = fsub nnan float %x, %y
+; CHECK:  %b_vec = fsub nnan ninf <3 x float> %vec, %vec
+  %b_vec = fsub ninf nnan <3 x float> %vec, %vec
+; CHECK:  %c = fmul nnan float %x, %y
+  %c = fmul nnan float %x, %y
+; CHECK:  %c_vec = fmul nnan <3 x float> %vec, %vec
   %c_vec = fmul nnan <3 x float> %vec, %vec
-; CHECK:  %d = fdiv nnan ninf  float %x, %y
+; CHECK:  %d = fdiv nnan ninf float %x, %y
   %d = fdiv ninf nnan float %x, %y
-; CHECK:  %d_vec = fdiv nnan  <3 x float> %vec, %vec
+; CHECK:  %d_vec = fdiv nnan <3 x float> %vec, %vec
   %d_vec = fdiv nnan <3 x float> %vec, %vec
-; CHECK:  %e = frem nnan  float %x, %y
-  %e = frem nnan  float %x, %y
-; CHECK:  %e_vec = frem nnan ninf  <3 x float> %vec, %vec
-  %e_vec = frem ninf nnan  <3 x float> %vec, %vec
-; CHECK:  ret  float %e
-  ret  float %e
+; CHECK:  %e = frem nnan float %x, %y
+  %e = frem nnan float %x, %y
+; CHECK:  %e_vec = frem nnan ninf <3 x float> %vec, %vec
+  %e_vec = frem ninf nnan <3 x float> %vec, %vec
+; CHECK:  ret float %e
+  ret float %e
 }
 
 ; CHECK: mixed_flags
@@ -151,7 +151,7 @@ entry:
   %vec    = load <3 x float>, <3 x float>* @vec
 ; CHECK:  %select = load i1, i1* @select
   %select = load i1, i1* @select
-; CHECK:  %arr    = load [3 x float], [3 x float]* @arr
+; CHECK:  %arr = load [3 x float], [3 x float]* @arr
   %arr    = load [3 x float], [3 x float]* @arr
 
 ; CHECK:  %a = fadd nnan ninf afn float %x, %y
@@ -174,6 +174,6 @@ entry:
   %e = frem nnan nsz float %x, %y
 ; CHECK:  %e_vec = frem nnan <3 x float> %vec, %vec
   %e_vec = frem nnan <3 x float> %vec, %vec
-; CHECK:  ret  float %e
-  ret  float %e
+; CHECK:  ret float %e
+  ret float %e
 }
-- 
GitLab


From f4dd4f28a9f4d43f9e4cdf20b7093d6837867e09 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Thu, 1 Nov 2018 16:57:54 +0000
Subject: [PATCH 0866/1116] [InstCombine] add test for ComputeNumSignBits on
 2-input shuffle; NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345852 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Transforms/InstCombine/logical-select.ll | 21 +++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/test/Transforms/InstCombine/logical-select.ll b/test/Transforms/InstCombine/logical-select.ll
index b4260af75b4..888c6a544ea 100644
--- a/test/Transforms/InstCombine/logical-select.ll
+++ b/test/Transforms/InstCombine/logical-select.ll
@@ -616,3 +616,24 @@ define <4 x i32> @computesignbits_through_shuffles(<4 x float> %x, <4 x float> %
   ret <4 x i32> %sel
 }
 
+define <4 x i32> @computesignbits_through_two_input_shuffle(<4 x i32> %x, <4 x i32> %y, <4 x i1> %cond1, <4 x i1> %cond2) {
+; CHECK-LABEL: @computesignbits_through_two_input_shuffle(
+; CHECK-NEXT:    [[SEXT1:%.*]] = sext <4 x i1> [[COND1:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[SEXT2:%.*]] = sext <4 x i1> [[COND2:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[COND:%.*]] = shufflevector <4 x i32> [[SEXT1]], <4 x i32> [[SEXT2]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[NOTCOND:%.*]] = xor <4 x i32> [[COND]], <i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK-NEXT:    [[AND1:%.*]] = and <4 x i32> [[NOTCOND]], [[X:%.*]]
+; CHECK-NEXT:    [[AND2:%.*]] = and <4 x i32> [[COND]], [[Y:%.*]]
+; CHECK-NEXT:    [[SEL:%.*]] = or <4 x i32> [[AND1]], [[AND2]]
+; CHECK-NEXT:    ret <4 x i32> [[SEL]]
+;
+  %sext1 = sext <4 x i1> %cond1 to <4 x i32>
+  %sext2 = sext <4 x i1> %cond2 to <4 x i32>
+  %cond = shufflevector <4 x i32> %sext1, <4 x i32> %sext2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %notcond = xor <4 x i32> %cond, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %and1 = and <4 x i32> %notcond, %x
+  %and2 = and <4 x i32> %cond, %y
+  %sel = or <4 x i32> %and1, %and2
+  ret <4 x i32> %sel
+}
+
-- 
GitLab


From d919ed9caca10f97c1b791f8b6325518e4ac94a4 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Thu, 1 Nov 2018 17:20:40 +0000
Subject: [PATCH 0867/1116] [llvm-objcopy] For multiclass Eq, associate help
 text with --name= , not with --name

Summary:
Before:
% llvm-objcopy -help
...
 --weaken-symbol=symbol  Mark <symbol> as weak
 --weaken-symbol symbol  Mark <symbol> as weak

After:
% llvm-objcopy -help
...
 --weaken-symbol=symbol  Mark <symbol> as weak

Reviewers: jhenderson, rupprecht, alexshap, jakehehrlich

Reviewed By: jhenderson

Subscribers: llvm-commits, kristina

Differential Revision: https://reviews.llvm.org/D53983

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345855 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-objcopy/ObjcopyOpts.td | 126 +++++++++++++-----------------
 tools/llvm-objcopy/StripOpts.td   |  20 +++--
 2 files changed, 65 insertions(+), 81 deletions(-)

diff --git a/tools/llvm-objcopy/ObjcopyOpts.td b/tools/llvm-objcopy/ObjcopyOpts.td
index f6c8a959e8b..8ed6df9a9f6 100644
--- a/tools/llvm-objcopy/ObjcopyOpts.td
+++ b/tools/llvm-objcopy/ObjcopyOpts.td
@@ -1,25 +1,24 @@
 include "llvm/Option/OptParser.td"
 
-multiclass Eq<string name> {
+multiclass Eq<string name, string help> {
   def NAME: Separate<["--", "-"], name>;
-  def NAME # _eq: Joined<["--", "-"], name # "=">, Alias<!cast<Separate>(NAME)>;
+  def NAME # _eq: Joined<["--", "-"], name # "=">, Alias<!cast<Separate>(NAME)>,
+    HelpText<help>;
 }
 
 def help : Flag<["-", "--"], "help">;
-defm binary_architecture : Eq<"binary-architecture">,
-                           HelpText<"Used when transforming an architecture-less format (such as binary) to another format">;
+defm binary_architecture
+    : Eq<"binary-architecture", "Used when transforming an architecture-less "
+                                "format (such as binary) to another format">;
 def B : JoinedOrSeparate<["-"], "B">,
         Alias<binary_architecture>;
-defm target : Eq<"target">,
-              HelpText<"Format of the input and output file">,
+defm target : Eq<"target", "Format of the input and output file">,
               Values<"binary">;
 def F : JoinedOrSeparate<[ "-" ], "F">, Alias<target>;
-defm input_target : Eq<"input-target">,
-                    HelpText<"Format of the input file">,
+defm input_target : Eq<"input-target", "Format of the input file">,
                     Values<"binary">;
 def I : JoinedOrSeparate<[ "-" ], "I">, Alias<input_target>;
-defm output_target : Eq<"output-target">,
-                     HelpText<"Format of the output file">,
+defm output_target : Eq<"output-target", "Format of the output file">,
                      Values<"binary">;
 def compress_debug_sections : Flag<["--", "-"], "compress-debug-sections">;
 def compress_debug_sections_eq : Joined<["--", "-"], "compress-debug-sections=">,
@@ -31,44 +30,40 @@ def decompress_debug_sections : Flag<["-", "--"], "decompress-debug-sections">,
                                 HelpText<"Decompress DWARF debug sections.">;
 def O : JoinedOrSeparate<["-"], "O">,
         Alias<output_target>;
-defm split_dwo : Eq<"split-dwo">,
-                 MetaVarName<"dwo-file">,
-                 HelpText<"Equivalent to extract-dwo on the input file to <dwo-file>, then strip-dwo on the input file">;
+defm split_dwo
+    : Eq<"split-dwo", "Equivalent to extract-dwo on the input file to "
+                      "<dwo-file>, then strip-dwo on the input file">,
+      MetaVarName<"dwo-file">;
 
 def preserve_dates : Flag<[ "-", "--" ], "preserve-dates">,
                      HelpText<"Preserve access and modification timestamps">;
 
 def p : Flag<[ "-" ], "p">, Alias<preserve_dates>;
 
-defm add_gnu_debuglink : Eq<"add-gnu-debuglink">,
-                         MetaVarName<"debug-file">,
-                         HelpText<"Add a .gnu_debuglink for <debug-file>">;
-defm remove_section : Eq<"remove-section">,
-                      MetaVarName<"section">,
-                      HelpText<"Remove <section>">;
+defm add_gnu_debuglink : Eq<"add-gnu-debuglink", "Add a .gnu_debuglink for <debug-file>">,
+                         MetaVarName<"debug-file">;
+defm remove_section : Eq<"remove-section", "Remove <section>">,
+                      MetaVarName<"section">;
 defm rename_section
-    : Eq<"rename-section">,
-      MetaVarName<"old=new[,flag1,...]">,
-      HelpText<
-          "Renames a section from old to new, optionally with specified flags. "
-          "Flags supported for GNU compatibility: alloc, load, noload, "
-          "readonly, debug, code, data, rom, share, contents, merge, strings.">;
-defm redefine_symbol : Eq<"redefine-sym">,
-                       MetaVarName<"old=new">,
-                       HelpText<"Change the name of a symbol old to new">;
+    : Eq<"rename-section",
+         "Renames a section from old to new, optionally with specified flags. "
+         "Flags supported for GNU compatibility: alloc, load, noload, "
+         "readonly, debug, code, data, rom, share, contents, merge, strings.">,
+      MetaVarName<"old=new[,flag1,...]">;
+defm redefine_symbol : Eq<"redefine-sym", "Change the name of a symbol old to new">,
+                       MetaVarName<"old=new">;
 def R : JoinedOrSeparate<["-"], "R">,
         Alias<remove_section>;
-defm keep : Eq<"keep">,
-            MetaVarName<"section">,
-            HelpText<"Keep <section>">;
-defm only_keep : Eq<"only-keep">,
-                 MetaVarName<"section">,
-                 HelpText<"Remove all but <section>">;
+defm keep : Eq<"keep", "Keep <section>">,
+            MetaVarName<"section">;
+defm only_keep : Eq<"only-keep", "Remove all but <section>">,
+                 MetaVarName<"section">;
 def j : JoinedOrSeparate<["-"], "j">,
                       Alias<only_keep>;
-defm add_section : Eq<"add-section">,
-                   MetaVarName<"section=file">,
-                   HelpText<"Make a section named <section> with the contents of <file>.">;
+defm add_section
+    : Eq<"add-section",
+         "Make a section named <section> with the contents of <file>.">,
+      MetaVarName<"section=file">;
 def strip_all : Flag<["-", "--"], "strip-all">,
                 HelpText<"Remove non-allocated sections other than .gnu.warning* sections">;
 def S : Flag<["-"], "S">,
@@ -87,38 +82,33 @@ def extract_dwo : Flag<["-", "--"], "extract-dwo">,
                   HelpText<"Remove all sections that are not DWARF .dwo sections from file">;
 def localize_hidden : Flag<["-", "--"], "localize-hidden">,
                       HelpText<"Mark all symbols that have hidden or internal visibility as local">;
-defm localize_symbol : Eq<"localize-symbol">,
-                       MetaVarName<"symbol">,
-                       HelpText<"Mark <symbol> as local">;
+defm localize_symbol : Eq<"localize-symbol", "Mark <symbol> as local">,
+                       MetaVarName<"symbol">;
 def L : JoinedOrSeparate<["-"], "L">,
         Alias<localize_symbol>;
-defm globalize_symbol : Eq<"globalize-symbol">,
-                       MetaVarName<"symbol">,
-                       HelpText<"Mark <symbol> as global">;
+defm globalize_symbol : Eq<"globalize-symbol", "Mark <symbol> as global">,
+                       MetaVarName<"symbol">;
 
 defm keep_global_symbol
-    : Eq<"keep-global-symbol">,
-      MetaVarName<"symbol">,
-      HelpText<"Convert all symbols except <symbol> to local. May be repeated "
-               "to convert all except a set of symbols to local.">;
+    : Eq<"keep-global-symbol", "Convert all symbols except <symbol> to local. May be repeated "
+               "to convert all except a set of symbols to local.">,
+      MetaVarName<"symbol">;
 def G : JoinedOrSeparate<[ "-" ], "G">, Alias<keep_global_symbol>;
 
 defm keep_global_symbols
-    : Eq<"keep-global-symbols">,
-      MetaVarName<"filename">,
-      HelpText<
-          "Reads a list of symbols from <filename> and runs as if "
-	  "--keep-global-symbol=<symbol> is set for each one. <filename> "
-	  "contains one symbol per line and may contain comments beginning "
-	  "with '#'. Leading and trailing whitespace is stripped from each "
-	  "line. May be repeated to read symbols from many files.">;
+    : Eq<"keep-global-symbols",
+         "Reads a list of symbols from <filename> and runs as if "
+	     "--keep-global-symbol=<symbol> is set for each one. <filename> "
+	     "contains one symbol per line and may contain comments beginning "
+	     "with '#'. Leading and trailing whitespace is stripped from each "
+	     "line. May be repeated to read symbols from many files.">,
+      MetaVarName<"filename">;
 
 def version : Flag<[ "-", "--" ], "version">,
               HelpText<"Print the version and exit.">;
 
-defm weaken_symbol : Eq<"weaken-symbol">,
-                       MetaVarName<"symbol">,
-                       HelpText<"Mark <symbol> as weak">;
+defm weaken_symbol : Eq<"weaken-symbol", "Mark <symbol> as weak">,
+                       MetaVarName<"symbol">;
 def W : JoinedOrSeparate<["-"], "W">,
         Alias<weaken_symbol>;
 def weaken : Flag<["-", "--"], "weaken">,
@@ -127,14 +117,12 @@ def discard_all : Flag<["-", "--"], "discard-all">,
                       HelpText<"Remove all local symbols except file and section symbols">;
 def x : Flag<["-"], "x">,
         Alias<discard_all>;
-defm strip_symbol : Eq<"strip-symbol">,
-                       MetaVarName<"symbol">,
-                       HelpText<"Remove symbol <symbol>">;
+defm strip_symbol : Eq<"strip-symbol", "Remove symbol <symbol>">,
+                       MetaVarName<"symbol">;
 def N : JoinedOrSeparate<["-"], "N">,
         Alias<strip_symbol>;
-defm keep_symbol : Eq<"keep-symbol">,
-                       MetaVarName<"symbol">,
-                       HelpText<"Do not remove symbol <symbol>">;
+defm keep_symbol : Eq<"keep-symbol", "Do not remove symbol <symbol>">,
+                       MetaVarName<"symbol">;
 def K : JoinedOrSeparate<["-"], "K">,
         Alias<keep_symbol>;
 def only_keep_debug : Flag<["-", "--"], "only-keep-debug">,
@@ -143,9 +131,7 @@ def strip_unneeded : Flag<["-", "--"], "strip-unneeded">,
                       HelpText<"Remove all symbols not needed by relocations">;
 def keep_file_symbols : Flag<["-", "--"], "keep-file-symbols">,
                       HelpText<"Do not remove file symbols">;
-defm dump_section : Eq<"dump-section">,
-                   MetaVarName<"section=file">,
-                   HelpText<"Dump contents of section named <section> into file <file>">;
-defm prefix_symbols : Eq<"prefix-symbols">,
-                       MetaVarName<"prefix">,
-                       HelpText<"Add <prefix> to the start of every symbol name">;
+defm dump_section : Eq<"dump-section", "Dump contents of section named <section> into file <file>">,
+                   MetaVarName<"section=file">;
+defm prefix_symbols : Eq<"prefix-symbols", "Add <prefix> to the start of every symbol name">,
+                       MetaVarName<"prefix">;
diff --git a/tools/llvm-objcopy/StripOpts.td b/tools/llvm-objcopy/StripOpts.td
index b155933616d..b224b1296b3 100644
--- a/tools/llvm-objcopy/StripOpts.td
+++ b/tools/llvm-objcopy/StripOpts.td
@@ -1,15 +1,15 @@
 include "llvm/Option/OptParser.td"
 
-multiclass Eq<string name> {
+multiclass Eq<string name, string help> {
   def NAME: Separate<["--", "-"], name>;
-  def NAME # _eq: Joined<["--", "-"], name # "=">, Alias<!cast<Separate>(NAME)>;
+  def NAME # _eq: Joined<["--", "-"], name # "=">, Alias<!cast<Separate>(NAME)>,
+    HelpText<help>;
 }
 
 def help : Flag<["-", "--"], "help">;
 
-defm output : Eq<"o">,
-              MetaVarName<"output">,
-              HelpText<"Write output to <file>">;
+defm output : Eq<"o", "Write output to <file>">,
+              MetaVarName<"output">;
 
 def preserve_dates : Flag<[ "-", "--" ], "preserve-dates">,
                      HelpText<"Preserve access and modification timestamps">;
@@ -34,16 +34,14 @@ def g : Flag<["-"], "g">,
 def S : Flag<["-"], "S">,
         Alias<strip_debug>;
 
-defm remove_section : Eq<"remove-section">,
-                      MetaVarName<"section">,
-                      HelpText<"Remove <section>">;
+defm remove_section : Eq<"remove-section", "Remove <section>">,
+                      MetaVarName<"section">;
 
 def R : JoinedOrSeparate<["-"], "R">,
         Alias<remove_section>;
 
-defm keep_symbol : Eq<"keep-symbol">,
-                   MetaVarName<"symbol">,
-                   HelpText<"Do not remove symbol <symbol>">;
+defm keep_symbol : Eq<"keep-symbol", "Do not remove symbol <symbol>">,
+                   MetaVarName<"symbol">;
 
 def K : JoinedOrSeparate<["-"], "K">,
         Alias<keep_symbol>;
-- 
GitLab


From 1033d5958c91ff2c58b6b8c8879f003d86028b22 Mon Sep 17 00:00:00 2001
From: Jordan Rupprecht <rupprecht@google.com>
Date: Thu, 1 Nov 2018 17:26:36 +0000
Subject: [PATCH 0868/1116] [llvm-objcopy] Don't apply --localize flags to
 common symbols

Summary:
--localize-symbol and --localize-hidden will currently localize common symbols. GNU objcopy will not localize these symbols even when explicitly requested, which seems reasonable; common symbols should always be global so they can be merged during linking.

See PR39461

Reviewers: jakehehrlich, jhenderson, alexshap, MaskRay, espindola

Reviewed By: jakehehrlich, jhenderson, alexshap, MaskRay

Subscribers: emaste, arichardson, alexshap, MaskRay, llvm-commits

Differential Revision: https://reviews.llvm.org/D53782

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345856 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/tools/llvm-objcopy/localize-hidden.test | 17 ++++++++++++++++
 test/tools/llvm-objcopy/localize.test        | 21 +++++++++++++++++++-
 tools/llvm-objcopy/ELF/ELFObjcopy.cpp        |  9 +++++----
 tools/llvm-objcopy/ELF/Object.cpp            |  2 ++
 tools/llvm-objcopy/ELF/Object.h              |  1 +
 5 files changed, 45 insertions(+), 5 deletions(-)

diff --git a/test/tools/llvm-objcopy/localize-hidden.test b/test/tools/llvm-objcopy/localize-hidden.test
index 92577075f07..05d747b800b 100644
--- a/test/tools/llvm-objcopy/localize-hidden.test
+++ b/test/tools/llvm-objcopy/localize-hidden.test
@@ -55,6 +55,12 @@ Symbols:
       Value:    0x2006
       Size:     2
       Visibility: STV_HIDDEN
+    - Name:     hiddenGlobalCommon
+      Type:     STT_OBJECT
+      Index:    SHN_COMMON
+      Value:    0x2006
+      Size:     2
+      Visibility: STV_HIDDEN
     - Name:     undefGlobal
       Type:     STT_FUNC
       Size:     8
@@ -142,6 +148,17 @@ Symbols:
 #CHECK-NEXT:    Section: .text
 #CHECK-NEXT:  }
 #CHECK-NEXT:  Symbol {
+#CHECK-NEXT:    Name: hiddenGlobalCommon
+#CHECK-NEXT:    Value: 0x2006
+#CHECK-NEXT:    Size: 2
+#CHECK-NEXT:    Binding: Global
+#CHECK-NEXT:    Type: Object
+#CHECK-NEXT:    Other [
+#CHECK-NEXT:      STV_HIDDEN
+#CHECK-NEXT:    ]
+#CHECK-NEXT:    Section: Common (0xF
+#CHECK-NEXT:  }
+#CHECK-NEXT:  Symbol {
 #CHECK-NEXT:    Name: undefGlobal
 #CHECK-NEXT:    Value: 0x0
 #CHECK-NEXT:    Size: 8
diff --git a/test/tools/llvm-objcopy/localize.test b/test/tools/llvm-objcopy/localize.test
index d52852ac673..2e2d6ccd6bf 100644
--- a/test/tools/llvm-objcopy/localize.test
+++ b/test/tools/llvm-objcopy/localize.test
@@ -1,5 +1,10 @@
 # RUN: yaml2obj %s > %t
-# RUN: llvm-objcopy --localize-symbol Global -L Local -L Weak %t %t2
+# RUN: llvm-objcopy \
+# RUN:     --localize-symbol Global \
+# RUN:     -L Local \
+# RUN:     -L Weak \
+# RUN:     -L GlobalCommon \
+# RUN:     %t %t2
 # RUN: llvm-readobj -symbols %t2 | FileCheck %s
 
 !ELF
@@ -40,6 +45,11 @@ Symbols:
       Size:     8
       Section:  .text
       Value:    0x1010
+    - Name:     GlobalCommon
+      Type:     STT_OBJECT
+      Index:    SHN_COMMON
+      Value:    0x2006
+      Size:     2
 
 #CHECK: Symbols [
 #CHECK-NEXT:  Symbol {
@@ -78,4 +88,13 @@ Symbols:
 #CHECK-NEXT:    Other: 0
 #CHECK-NEXT:    Section: .text
 #CHECK-NEXT:  }
+#CHECK-NEXT:  Symbol {
+#CHECK-NEXT:    Name: GlobalCommon
+#CHECK-NEXT:    Value: 0x2006
+#CHECK-NEXT:    Size: 2
+#CHECK-NEXT:    Binding: Global
+#CHECK-NEXT:    Type: Object
+#CHECK-NEXT:    Other: 0
+#CHECK-NEXT:    Section: Common (0xF
+#CHECK-NEXT:  }
 #CHECK-NEXT:]
diff --git a/tools/llvm-objcopy/ELF/ELFObjcopy.cpp b/tools/llvm-objcopy/ELF/ELFObjcopy.cpp
index 2bad270cda7..a367a30c467 100644
--- a/tools/llvm-objcopy/ELF/ELFObjcopy.cpp
+++ b/tools/llvm-objcopy/ELF/ELFObjcopy.cpp
@@ -213,10 +213,11 @@ static void handleArgs(const CopyConfig &Config, Object &Obj,
   // them.
   if (Obj.SymbolTable) {
     Obj.SymbolTable->updateSymbols([&](Symbol &Sym) {
-      if ((Config.LocalizeHidden &&
-           (Sym.Visibility == STV_HIDDEN || Sym.Visibility == STV_INTERNAL)) ||
-          (!Config.SymbolsToLocalize.empty() &&
-           is_contained(Config.SymbolsToLocalize, Sym.Name)))
+      if (!Sym.isCommon() &&
+          ((Config.LocalizeHidden &&
+            (Sym.Visibility == STV_HIDDEN || Sym.Visibility == STV_INTERNAL)) ||
+           (!Config.SymbolsToLocalize.empty() &&
+            is_contained(Config.SymbolsToLocalize, Sym.Name))))
         Sym.Binding = STB_LOCAL;
 
       // Note: these two globalize flags have very similar names but different
diff --git a/tools/llvm-objcopy/ELF/Object.cpp b/tools/llvm-objcopy/ELF/Object.cpp
index ef3029b477a..c2af99fc197 100644
--- a/tools/llvm-objcopy/ELF/Object.cpp
+++ b/tools/llvm-objcopy/ELF/Object.cpp
@@ -332,6 +332,8 @@ uint16_t Symbol::getShndx() const {
   llvm_unreachable("Symbol with invalid ShndxType encountered");
 }
 
+bool Symbol::isCommon() const { return getShndx() == SHN_COMMON; }
+
 void SymbolTableSection::assignIndices() {
   uint32_t Index = 0;
   for (auto &Sym : Symbols)
diff --git a/tools/llvm-objcopy/ELF/Object.h b/tools/llvm-objcopy/ELF/Object.h
index 4aa3125f26c..91ff1cddac1 100644
--- a/tools/llvm-objcopy/ELF/Object.h
+++ b/tools/llvm-objcopy/ELF/Object.h
@@ -415,6 +415,7 @@ struct Symbol {
   bool Referenced = false;
 
   uint16_t getShndx() const;
+  bool isCommon() const;
 };
 
 class SectionIndexSection : public SectionBase {
-- 
GitLab


From 167fb187524f4315dd25fecf4db4f5486449f4d9 Mon Sep 17 00:00:00 2001
From: Jordan Rupprecht <rupprecht@google.com>
Date: Thu, 1 Nov 2018 17:36:37 +0000
Subject: [PATCH 0869/1116] [llvm-objcopy] Support
 --{enable,disable}-deterministic-archives

Summary: ar and objcopy/strip all support configuring whether archives are written deterministically (timestamps/UIDs/GIDs/etc zero'd). This has been ported to llvm-ar (the U/D modifiers) but not yet to llvm-objcopy/strip.

Reviewers: jakehehrlich, jhenderson, alexshap

Reviewed By: jhenderson

Subscribers: ruiu, mgrang, llvm-commits

Differential Revision: https://reviews.llvm.org/D53913

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345859 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../llvm-objcopy/deterministic-archive.test   | 65 +++++++++++++++++++
 tools/llvm-objcopy/CopyConfig.cpp             |  8 +++
 tools/llvm-objcopy/CopyConfig.h               |  1 +
 tools/llvm-objcopy/ObjcopyOpts.td             | 16 +++++
 tools/llvm-objcopy/StripOpts.td               | 16 +++++
 tools/llvm-objcopy/llvm-objcopy.cpp           |  8 +--
 6 files changed, 110 insertions(+), 4 deletions(-)
 create mode 100644 test/tools/llvm-objcopy/deterministic-archive.test

diff --git a/test/tools/llvm-objcopy/deterministic-archive.test b/test/tools/llvm-objcopy/deterministic-archive.test
new file mode 100644
index 00000000000..fd520fb9ed7
--- /dev/null
+++ b/test/tools/llvm-objcopy/deterministic-archive.test
@@ -0,0 +1,65 @@
+# RUN: yaml2obj %s > %t.o
+
+# Create an archive, specifying U so that timestamps/etc. are preserved.
+# We only test timestamps as a proxy for full deterministic writing; i.e. we
+# assume UID/GIDs are preserved if timestamps are preserved.
+# RUN: touch -t 199505050555.55 %t.o
+# RUN: rm -f %t.a
+# RUN: llvm-ar crsU %t.a %t.o
+
+# Test short flags.
+# RUN: llvm-objcopy -D %t.a %t.2D.a
+# RUN: env TZ=GMT llvm-ar tv %t.2D.a | FileCheck %s --check-prefix=CHECK-DETERMINISTIC
+# RUN: llvm-objcopy -U %t.a %t.2U.a
+# RUN: env TZ=GMT llvm-ar tv %t.2U.a | FileCheck %s --check-prefix=CHECK-NONDETERMINISTIC
+
+# RUN: llvm-strip -D %t.a -o %t.3D.a
+# RUN: env TZ=GMT llvm-ar tv %t.3D.a | FileCheck %s --check-prefix=CHECK-DETERMINISTIC
+# RUN: llvm-strip -U %t.a -o %t.3U.a
+# RUN: env TZ=GMT llvm-ar tv %t.3U.a | FileCheck %s --check-prefix=CHECK-NONDETERMINISTIC
+
+# Test long flags.
+# RUN: llvm-objcopy --enable-deterministic-archives %t.a %t.4D.a
+# RUN: env TZ=GMT llvm-ar tv %t.4D.a | FileCheck %s --check-prefix=CHECK-DETERMINISTIC
+# RUN: llvm-objcopy --disable-deterministic-archives %t.a %t.4U.a
+# RUN: env TZ=GMT llvm-ar tv %t.4U.a | FileCheck %s --check-prefix=CHECK-NONDETERMINISTIC
+
+# RUN: llvm-strip --enable-deterministic-archives %t.a -o %t.5D.a
+# RUN: env TZ=GMT llvm-ar tv %t.5D.a | FileCheck %s --check-prefix=CHECK-DETERMINISTIC
+# RUN: llvm-strip --disable-deterministic-archives %t.a -o %t.5U.a
+# RUN: env TZ=GMT llvm-ar tv %t.5U.a | FileCheck %s --check-prefix=CHECK-NONDETERMINISTIC
+
+# If unspecified, verify that deterministic is the default.
+# RUN: llvm-objcopy %t.a %t.6.a
+# RUN: env TZ=GMT llvm-ar tv %t.6.a | FileCheck %s --check-prefix=CHECK-DETERMINISTIC
+# RUN: llvm-strip %t.a -o %t.7.a
+# RUN: env TZ=GMT llvm-ar tv %t.7.a | FileCheck %s --check-prefix=CHECK-DETERMINISTIC
+
+# If both are specified, last one wins.
+# RUN: llvm-objcopy -U -D %t.a %t.8.a
+# RUN: env TZ=GMT llvm-ar tv %t.8.a | FileCheck %s --check-prefix=CHECK-DETERMINISTIC
+# RUN: llvm-objcopy -D -U %t.a %t.9.a
+# RUN: env TZ=GMT llvm-ar tv %t.9.a | FileCheck %s --check-prefix=CHECK-NONDETERMINISTIC
+# RUN: llvm-objcopy -D -U -D -U --enable-deterministic-archives %t.a %t.10.a
+# RUN: env TZ=GMT llvm-ar tv %t.10.a | FileCheck %s --check-prefix=CHECK-DETERMINISTIC
+
+# RUN: llvm-strip -U -D %t.a -o %t.11.a
+# RUN: env TZ=GMT llvm-ar tv %t.11.a | FileCheck %s --check-prefix=CHECK-DETERMINISTIC
+# RUN: llvm-strip -D -U %t.a -o %t.12.a
+# RUN: env TZ=GMT llvm-ar tv %t.12.a | FileCheck %s --check-prefix=CHECK-NONDETERMINISTIC
+# RUN: llvm-strip -D -U -D -U --enable-deterministic-archives %t.a -o %t.13.a
+# RUN: env TZ=GMT llvm-ar tv %t.13.a | FileCheck %s --check-prefix=CHECK-DETERMINISTIC
+
+# CHECK-DETERMINISTIC: {{[[:space:]]1970[[:space:]]}}
+# CHECK-NONDETERMINISTIC:  {{[[:space:]]1995[[:space:]]}}
+
+!ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_REL
+  Machine:         EM_X86_64
+Sections:
+  - Name:            .text
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
diff --git a/tools/llvm-objcopy/CopyConfig.cpp b/tools/llvm-objcopy/CopyConfig.cpp
index 24c72de8882..8df93efa785 100644
--- a/tools/llvm-objcopy/CopyConfig.cpp
+++ b/tools/llvm-objcopy/CopyConfig.cpp
@@ -343,6 +343,10 @@ DriverConfig parseObjcopyOptions(ArrayRef<const char *> ArgsArr) {
   for (auto Arg : InputArgs.filtered(OBJCOPY_keep_symbol))
     Config.SymbolsToKeep.push_back(Arg->getValue());
 
+  Config.DeterministicArchives = InputArgs.hasFlag(
+      OBJCOPY_enable_deterministic_archives,
+      OBJCOPY_disable_deterministic_archives, /*default=*/true);
+
   Config.PreserveDates = InputArgs.hasArg(OBJCOPY_preserve_dates);
 
   if (Config.DecompressDebugSections &&
@@ -411,6 +415,10 @@ DriverConfig parseStripOptions(ArrayRef<const char *> ArgsArr) {
   for (auto Arg : InputArgs.filtered(STRIP_keep_symbol))
     Config.SymbolsToKeep.push_back(Arg->getValue());
 
+  Config.DeterministicArchives =
+      InputArgs.hasFlag(STRIP_enable_deterministic_archives,
+                        STRIP_disable_deterministic_archives, /*default=*/true);
+
   Config.PreserveDates = InputArgs.hasArg(STRIP_preserve_dates);
 
   DriverConfig DC;
diff --git a/tools/llvm-objcopy/CopyConfig.h b/tools/llvm-objcopy/CopyConfig.h
index 203432a11a6..7ebe2a072bb 100644
--- a/tools/llvm-objcopy/CopyConfig.h
+++ b/tools/llvm-objcopy/CopyConfig.h
@@ -72,6 +72,7 @@ struct CopyConfig {
   StringMap<StringRef> SymbolsToRename;
 
   // Boolean options
+  bool DeterministicArchives = true;
   bool DiscardAll = false;
   bool ExtractDWO = false;
   bool KeepFileSymbols = false;
diff --git a/tools/llvm-objcopy/ObjcopyOpts.td b/tools/llvm-objcopy/ObjcopyOpts.td
index 8ed6df9a9f6..3b8453beecb 100644
--- a/tools/llvm-objcopy/ObjcopyOpts.td
+++ b/tools/llvm-objcopy/ObjcopyOpts.td
@@ -35,6 +35,22 @@ defm split_dwo
                       "<dwo-file>, then strip-dwo on the input file">,
       MetaVarName<"dwo-file">;
 
+def enable_deterministic_archives
+    : Flag<["-", "--"], "enable-deterministic-archives">,
+      HelpText<"Enable deterministic mode when copying archives (use zero for "
+               "UIDs, GIDs, and timestamps).">;
+def D : Flag<["-"], "D">,
+        Alias<enable_deterministic_archives>,
+        HelpText<"Alias for --enable-deterministic-archives">;
+
+def disable_deterministic_archives
+    : Flag<["-", "--"], "disable-deterministic-archives">,
+      HelpText<"Disable deterministic mode when copying archives (use real "
+               "values for UIDs, GIDs, and timestamps).">;
+def U : Flag<["-"], "U">,
+        Alias<disable_deterministic_archives>,
+        HelpText<"Alias for --disable-deterministic-archives">;
+
 def preserve_dates : Flag<[ "-", "--" ], "preserve-dates">,
                      HelpText<"Preserve access and modification timestamps">;
 
diff --git a/tools/llvm-objcopy/StripOpts.td b/tools/llvm-objcopy/StripOpts.td
index b224b1296b3..3657bdb703d 100644
--- a/tools/llvm-objcopy/StripOpts.td
+++ b/tools/llvm-objcopy/StripOpts.td
@@ -8,6 +8,22 @@ multiclass Eq<string name, string help> {
 
 def help : Flag<["-", "--"], "help">;
 
+def enable_deterministic_archives
+    : Flag<["-", "--"], "enable-deterministic-archives">,
+      HelpText<"Enable deterministic mode when stripping archives (use zero "
+               "for UIDs, GIDs, and timestamps).">;
+def D : Flag<["-"], "D">,
+        Alias<enable_deterministic_archives>,
+        HelpText<"Alias for --enable-deterministic-archives">;
+
+def disable_deterministic_archives
+    : Flag<["-", "--"], "disable-deterministic-archives">,
+      HelpText<"Disable deterministic mode when stripping archives (use real "
+               "values for UIDs, GIDs, and timestamps).">;
+def U : Flag<["-"], "U">,
+        Alias<disable_deterministic_archives>,
+        HelpText<"Alias for --disable-deterministic-archives">;
+
 defm output : Eq<"o", "Write output to <file>">,
               MetaVarName<"output">;
 
diff --git a/tools/llvm-objcopy/llvm-objcopy.cpp b/tools/llvm-objcopy/llvm-objcopy.cpp
index deaea5eff85..a033aaecb98 100644
--- a/tools/llvm-objcopy/llvm-objcopy.cpp
+++ b/tools/llvm-objcopy/llvm-objcopy.cpp
@@ -147,7 +147,7 @@ static void executeObjcopyOnArchive(const CopyConfig &Config,
     executeObjcopyOnBinary(Config, *Bin, MB);
 
     Expected<NewArchiveMember> Member =
-        NewArchiveMember::getOldMember(Child, true);
+        NewArchiveMember::getOldMember(Child, Config.DeterministicArchives);
     if (!Member)
       reportError(Ar.getFileName(), Member.takeError());
     Member->Buf = MB.releaseMemoryBuffer();
@@ -157,9 +157,9 @@ static void executeObjcopyOnArchive(const CopyConfig &Config,
 
   if (Err)
     reportError(Config.InputFilename, std::move(Err));
-  if (Error E =
-          deepWriteArchive(Config.OutputFilename, NewArchiveMembers,
-                           Ar.hasSymbolTable(), Ar.kind(), true, Ar.isThin()))
+  if (Error E = deepWriteArchive(Config.OutputFilename, NewArchiveMembers,
+                                 Ar.hasSymbolTable(), Ar.kind(),
+                                 Config.DeterministicArchives, Ar.isThin()))
     reportError(Config.OutputFilename, std::move(E));
 }
 
-- 
GitLab


From 9a78ad0d49cfe6c93c3ff45f5a9910e9943da099 Mon Sep 17 00:00:00 2001
From: Jordan Rupprecht <rupprecht@google.com>
Date: Thu, 1 Nov 2018 17:48:46 +0000
Subject: [PATCH 0870/1116] [llvm-strip] Support --keep and --strip-all-gnu
 from llvm-objcopy

Summary: Add --keep and --strip-all-gnu from llvm-objcopy into llvm-strip.

Reviewers: jakehehrlich, jhenderson, alexshap

Reviewed By: jhenderson, alexshap

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D53954

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345861 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/tools/llvm-objcopy/basic-keep.test    | 2 ++
 test/tools/llvm-objcopy/strip-all-gnu.test | 2 ++
 tools/llvm-objcopy/CopyConfig.cpp          | 7 ++++++-
 tools/llvm-objcopy/ObjcopyOpts.td          | 4 ++--
 tools/llvm-objcopy/StripOpts.td            | 5 +++++
 5 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/test/tools/llvm-objcopy/basic-keep.test b/test/tools/llvm-objcopy/basic-keep.test
index 2ea4ea35577..8f4acb0c971 100644
--- a/test/tools/llvm-objcopy/basic-keep.test
+++ b/test/tools/llvm-objcopy/basic-keep.test
@@ -1,6 +1,8 @@
 # RUN: yaml2obj %s > %t
 # RUN: llvm-objcopy -strip-non-alloc -keep=.test %t %t2
+# RUN: llvm-strip --strip-all -keep=.test %t -o %t3
 # RUN: llvm-readobj -file-headers -sections %t2 | FileCheck %s
+# RUN: cmp %t2 %t3
 
 !ELF
 FileHeader:
diff --git a/test/tools/llvm-objcopy/strip-all-gnu.test b/test/tools/llvm-objcopy/strip-all-gnu.test
index 15e200525b2..f6dbcc70cf4 100644
--- a/test/tools/llvm-objcopy/strip-all-gnu.test
+++ b/test/tools/llvm-objcopy/strip-all-gnu.test
@@ -1,7 +1,9 @@
 # RUN: yaml2obj %s > %t
 # RUN: cp %t %t1
 # RUN: llvm-objcopy --strip-all-gnu %t %t2
+# RUN: llvm-strip --strip-all-gnu %t -o %t3
 # RUN: llvm-readobj -file-headers -sections %t2 | FileCheck %s
+# RUN: cmp %t2 %t3
 
 !ELF
 FileHeader:
diff --git a/tools/llvm-objcopy/CopyConfig.cpp b/tools/llvm-objcopy/CopyConfig.cpp
index 8df93efa785..67963a22a1c 100644
--- a/tools/llvm-objcopy/CopyConfig.cpp
+++ b/tools/llvm-objcopy/CopyConfig.cpp
@@ -405,10 +405,15 @@ DriverConfig parseStripOptions(ArrayRef<const char *> ArgsArr) {
   Config.DiscardAll = InputArgs.hasArg(STRIP_discard_all);
   Config.StripUnneeded = InputArgs.hasArg(STRIP_strip_unneeded);
   Config.StripAll = InputArgs.hasArg(STRIP_strip_all);
+  Config.StripAllGNU = InputArgs.hasArg(STRIP_strip_all_gnu);
 
-  if (!Config.StripDebug && !Config.StripUnneeded && !Config.DiscardAll)
+  if (!Config.StripDebug && !Config.StripUnneeded && !Config.DiscardAll &&
+      !Config.StripAllGNU)
     Config.StripAll = true;
 
+  for (auto Arg : InputArgs.filtered(STRIP_keep))
+    Config.Keep.push_back(Arg->getValue());
+
   for (auto Arg : InputArgs.filtered(STRIP_remove_section))
     Config.ToRemove.push_back(Arg->getValue());
 
diff --git a/tools/llvm-objcopy/ObjcopyOpts.td b/tools/llvm-objcopy/ObjcopyOpts.td
index 3b8453beecb..effcca89e4d 100644
--- a/tools/llvm-objcopy/ObjcopyOpts.td
+++ b/tools/llvm-objcopy/ObjcopyOpts.td
@@ -85,7 +85,7 @@ def strip_all : Flag<["-", "--"], "strip-all">,
 def S : Flag<["-"], "S">,
         Alias<strip_all>;
 def strip_all_gnu : Flag<["-", "--"], "strip-all-gnu">,
-                    HelpText<"Compaitable with GNU objcopy's --strip-all">;
+                    HelpText<"Compatible with GNU objcopy's --strip-all">;
 def strip_debug : Flag<["-", "--"], "strip-debug">,
                   HelpText<"Remove all debug information">;
 def strip_dwo : Flag<["-", "--"], "strip-dwo">,
@@ -142,7 +142,7 @@ defm keep_symbol : Eq<"keep-symbol", "Do not remove symbol <symbol>">,
 def K : JoinedOrSeparate<["-"], "K">,
         Alias<keep_symbol>;
 def only_keep_debug : Flag<["-", "--"], "only-keep-debug">,
-                          HelpText<"Currently ignored. Only for compaitability with GNU objcopy.">;
+                          HelpText<"Currently ignored. Only for compatibility with GNU objcopy.">;
 def strip_unneeded : Flag<["-", "--"], "strip-unneeded">,
                       HelpText<"Remove all symbols not needed by relocations">;
 def keep_file_symbols : Flag<["-", "--"], "keep-file-symbols">,
diff --git a/tools/llvm-objcopy/StripOpts.td b/tools/llvm-objcopy/StripOpts.td
index 3657bdb703d..99d5d83914b 100644
--- a/tools/llvm-objcopy/StripOpts.td
+++ b/tools/llvm-objcopy/StripOpts.td
@@ -38,6 +38,9 @@ def strip_all : Flag<["-", "--"], "strip-all">,
 def s : Flag<["-"], "s">,
         Alias<strip_all>;
 
+def strip_all_gnu : Flag<["-", "--"], "strip-all-gnu">,
+                    HelpText<"Compatible with GNU strip's --strip-all">;
+
 def strip_debug : Flag<["-", "--"], "strip-debug">,
                   HelpText<"Remove debugging symbols only">;
 
@@ -56,6 +59,8 @@ defm remove_section : Eq<"remove-section", "Remove <section>">,
 def R : JoinedOrSeparate<["-"], "R">,
         Alias<remove_section>;
 
+defm keep : Eq<"keep", "Keep <section>">, MetaVarName<"section">;
+
 defm keep_symbol : Eq<"keep-symbol", "Do not remove symbol <symbol>">,
                    MetaVarName<"symbol">;
 
-- 
GitLab


From fb61af32fa778edc8bec8005c10e3bc3f23c2ba9 Mon Sep 17 00:00:00 2001
From: Mandeep Singh Grang <mgrang@codeaurora.org>
Date: Thu, 1 Nov 2018 17:53:57 +0000
Subject: [PATCH 0871/1116] Revert "[COFF, ARM64] Implement Intrinsic.sponentry
 for AArch64"

This reverts commit 585b6667b4712e3c7f32401e929855b3313b4ff2.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345863 91177308-0d34-0410-b5e6-96231b3b80d8
---
 docs/LangRef.rst                              |  79 +++++--------
 include/llvm/CodeGen/ISDOpcodes.h             |   2 +-
 include/llvm/IR/Intrinsics.td                 |   1 -
 lib/CodeGen/SelectionDAG/LegalizeDAG.cpp      |   1 -
 .../SelectionDAG/SelectionDAGBuilder.cpp      |   4 -
 .../SelectionDAG/SelectionDAGDumper.cpp       |   1 -
 lib/Target/AArch64/AArch64FastISel.cpp        |  16 ---
 lib/Target/AArch64/AArch64ISelLowering.cpp    |  12 --
 lib/Target/AArch64/AArch64ISelLowering.h      |   1 -
 test/CodeGen/AArch64/sponentry.ll             | 104 ------------------
 10 files changed, 30 insertions(+), 191 deletions(-)
 delete mode 100644 test/CodeGen/AArch64/sponentry.ll

diff --git a/docs/LangRef.rst b/docs/LangRef.rst
index d57f79f0039..39134fafd46 100644
--- a/docs/LangRef.rst
+++ b/docs/LangRef.rst
@@ -2926,7 +2926,7 @@ Simple Constants
     hexadecimal notation (see below). The assembler requires the exact
     decimal value of a floating-point constant. For example, the
     assembler accepts 1.25 but rejects 1.3 because 1.3 is a repeating
-    decimal in binary. Floating-point constants must have a
+    decimal in binary. Floating-point constants must have a 
     :ref:`floating-point <t_floating>` type.
 **Null pointer constants**
     The identifier '``null``' is recognized as a null pointer constant
@@ -3331,7 +3331,7 @@ The following is the syntax for constant expressions:
     value won't fit in the integer type, the result is a
     :ref:`poison value <poisonvalues>`.
 ``uitofp (CST to TYPE)``
-    Convert an unsigned integer constant to the corresponding
+    Convert an unsigned integer constant to the corresponding 
     floating-point constant. TYPE must be a scalar or vector floating-point
     type.  CST must be of scalar or vector integer type. Both CST and TYPE must
     be scalars, or vectors of the same number of elements.
@@ -5434,7 +5434,7 @@ Irreducible loop header weights are typically based on profile data.
 '``invariant.group``' Metadata
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-The experimental ``invariant.group`` metadata may be attached to
+The experimental ``invariant.group`` metadata may be attached to 
 ``load``/``store`` instructions referencing a single metadata with no entries.
 The existence of the ``invariant.group`` metadata on the instruction tells
 the optimizer that every ``load`` and ``store`` to the same pointer operand
@@ -6875,7 +6875,7 @@ Arguments:
 """"""""""
 
 The two arguments to the '``fadd``' instruction must be
-:ref:`floating-point <t_floating>` or :ref:`vector <t_vector>` of
+:ref:`floating-point <t_floating>` or :ref:`vector <t_vector>` of 
 floating-point values. Both arguments must have identical types.
 
 Semantics:
@@ -6883,7 +6883,7 @@ Semantics:
 
 The value produced is the floating-point sum of the two operands.
 This instruction is assumed to execute in the default :ref:`floating-point
-environment <floatenv>`.
+environment <floatenv>`. 
 This instruction can also take any number of :ref:`fast-math
 flags <fastmath>`, which are optimization hints to enable otherwise
 unsafe floating-point optimizations:
@@ -6972,7 +6972,7 @@ Arguments:
 """"""""""
 
 The two arguments to the '``fsub``' instruction must be
-:ref:`floating-point <t_floating>` or :ref:`vector <t_vector>` of
+:ref:`floating-point <t_floating>` or :ref:`vector <t_vector>` of 
 floating-point values. Both arguments must have identical types.
 
 Semantics:
@@ -6980,7 +6980,7 @@ Semantics:
 
 The value produced is the floating-point difference of the two operands.
 This instruction is assumed to execute in the default :ref:`floating-point
-environment <floatenv>`.
+environment <floatenv>`. 
 This instruction can also take any number of :ref:`fast-math
 flags <fastmath>`, which are optimization hints to enable otherwise
 unsafe floating-point optimizations:
@@ -7067,7 +7067,7 @@ Arguments:
 """"""""""
 
 The two arguments to the '``fmul``' instruction must be
-:ref:`floating-point <t_floating>` or :ref:`vector <t_vector>` of
+:ref:`floating-point <t_floating>` or :ref:`vector <t_vector>` of 
 floating-point values. Both arguments must have identical types.
 
 Semantics:
@@ -7075,7 +7075,7 @@ Semantics:
 
 The value produced is the floating-point product of the two operands.
 This instruction is assumed to execute in the default :ref:`floating-point
-environment <floatenv>`.
+environment <floatenv>`. 
 This instruction can also take any number of :ref:`fast-math
 flags <fastmath>`, which are optimization hints to enable otherwise
 unsafe floating-point optimizations:
@@ -7201,7 +7201,7 @@ Arguments:
 """"""""""
 
 The two arguments to the '``fdiv``' instruction must be
-:ref:`floating-point <t_floating>` or :ref:`vector <t_vector>` of
+:ref:`floating-point <t_floating>` or :ref:`vector <t_vector>` of 
 floating-point values. Both arguments must have identical types.
 
 Semantics:
@@ -7209,7 +7209,7 @@ Semantics:
 
 The value produced is the floating-point quotient of the two operands.
 This instruction is assumed to execute in the default :ref:`floating-point
-environment <floatenv>`.
+environment <floatenv>`. 
 This instruction can also take any number of :ref:`fast-math
 flags <fastmath>`, which are optimization hints to enable otherwise
 unsafe floating-point optimizations:
@@ -7344,7 +7344,7 @@ Arguments:
 """"""""""
 
 The two arguments to the '``frem``' instruction must be
-:ref:`floating-point <t_floating>` or :ref:`vector <t_vector>` of
+:ref:`floating-point <t_floating>` or :ref:`vector <t_vector>` of 
 floating-point values. Both arguments must have identical types.
 
 Semantics:
@@ -7352,10 +7352,10 @@ Semantics:
 
 The value produced is the floating-point remainder of the two operands.
 This is the same output as a libm '``fmod``' function, but without any
-possibility of setting ``errno``. The remainder has the same sign as the
+possibility of setting ``errno``. The remainder has the same sign as the 
 dividend.
 This instruction is assumed to execute in the default :ref:`floating-point
-environment <floatenv>`.
+environment <floatenv>`. 
 This instruction can also take any number of :ref:`fast-math
 flags <fastmath>`, which are optimization hints to enable otherwise
 unsafe floating-point optimizations:
@@ -8809,7 +8809,7 @@ Semantics:
 
 The '``fptrunc``' instruction casts a ``value`` from a larger
 :ref:`floating-point <t_floating>` type to a smaller :ref:`floating-point
-<t_floating>` type.
+<t_floating>` type.  
 This instruction is assumed to execute in the default :ref:`floating-point
 environment <floatenv>`.
 
@@ -10330,27 +10330,6 @@ of the obvious source-language caller.
 
 This intrinsic is only implemented for x86.
 
-'``llvm.sponentry``' Intrinsic
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Syntax:
-"""""""
-
-::
-
-      declare i8* @llvm.sponentry()
-
-Overview:
-"""""""""
-
-The '``llvm.sponentry``' intrinsic returns the stack pointer value at
-the entry of the current function calling this intrinsic.
-
-Semantics:
-""""""""""
-
-Note this intrinsic is only verified on AArch64.
-
 '``llvm.frameaddress``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -12136,11 +12115,11 @@ Overview:
 
 The '``llvm.fshl``' family of intrinsic functions performs a funnel shift left:
 the first two values are concatenated as { %a : %b } (%a is the most significant
-bits of the wide value), the combined value is shifted left, and the most
-significant bits are extracted to produce a result that is the same size as the
-original arguments. If the first 2 arguments are identical, this is equivalent
-to a rotate left operation. For vector types, the operation occurs for each
-element of the vector. The shift argument is treated as an unsigned amount
+bits of the wide value), the combined value is shifted left, and the most 
+significant bits are extracted to produce a result that is the same size as the 
+original arguments. If the first 2 arguments are identical, this is equivalent 
+to a rotate left operation. For vector types, the operation occurs for each 
+element of the vector. The shift argument is treated as an unsigned amount 
 modulo the element size of the arguments.
 
 Arguments:
@@ -12182,11 +12161,11 @@ Overview:
 
 The '``llvm.fshr``' family of intrinsic functions performs a funnel shift right:
 the first two values are concatenated as { %a : %b } (%a is the most significant
-bits of the wide value), the combined value is shifted right, and the least
-significant bits are extracted to produce a result that is the same size as the
-original arguments. If the first 2 arguments are identical, this is equivalent
-to a rotate right operation. For vector types, the operation occurs for each
-element of the vector. The shift argument is treated as an unsigned amount
+bits of the wide value), the combined value is shifted right, and the least 
+significant bits are extracted to produce a result that is the same size as the 
+original arguments. If the first 2 arguments are identical, this is equivalent 
+to a rotate right operation. For vector types, the operation occurs for each 
+element of the vector. The shift argument is treated as an unsigned amount 
 modulo the element size of the arguments.
 
 Arguments:
@@ -13467,7 +13446,7 @@ The '``llvm.masked.expandload``' intrinsic is designed for reading multiple scal
     %Tmp = call <8 x double> @llvm.masked.expandload.v8f64(double* %Bptr, <8 x i1> %Mask, <8 x double> undef)
     ; Store the result in A
     call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %Tmp, <8 x double>* %Aptr, i32 8, <8 x i1> %Mask)
-
+    
     ; %Bptr should be increased on each iteration according to the number of '1' elements in the Mask.
     %MaskI = bitcast <8 x i1> %Mask to i8
     %MaskIPopcnt = call i8 @llvm.ctpop.i8(i8 %MaskI)
@@ -13524,7 +13503,7 @@ The '``llvm.masked.compressstore``' intrinsic is designed for compressing data i
     %Tmp = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %Aptr, i32 8, <8 x i1> %Mask, <8 x double> undef)
     ; Store all selected elements consecutively in array B
     call <void> @llvm.masked.compressstore.v8f64(<8 x double> %Tmp, double* %Bptr, <8 x i1> %Mask)
-
+    
     ; %Bptr should be increased on each iteration according to the number of '1' elements in the Mask.
     %MaskI = bitcast <8 x i1> %Mask to i8
     %MaskIPopcnt = call i8 @llvm.ctpop.i8(i8 %MaskI)
@@ -14157,7 +14136,7 @@ Overview:
 
 The '``llvm.experimental.constrained.powi``' intrinsic returns the first operand
 raised to the (positive or negative) power specified by the second operand. The
-order of evaluation of multiplications is not defined. When a vector of
+order of evaluation of multiplications is not defined. When a vector of 
 floating-point type is used, the second argument remains a scalar integer value.
 
 
@@ -14483,7 +14462,7 @@ Overview:
 """""""""
 
 The '``llvm.experimental.constrained.nearbyint``' intrinsic returns the first
-operand rounded to the nearest integer. It will not raise an inexact
+operand rounded to the nearest integer. It will not raise an inexact 
 floating-point exception if the operand is not an integer.
 
 
diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h
index da10119f438..a023aa5b3f6 100644
--- a/include/llvm/CodeGen/ISDOpcodes.h
+++ b/include/llvm/CodeGen/ISDOpcodes.h
@@ -70,7 +70,7 @@ namespace ISD {
     /// of the frame or return address to return.  An index of zero corresponds
     /// to the current function's frame or return address, an index of one to
     /// the parent's frame or return address, and so on.
-    FRAMEADDR, RETURNADDR, ADDROFRETURNADDR, SPONENTRY,
+    FRAMEADDR, RETURNADDR, ADDROFRETURNADDR,
 
     /// LOCAL_RECOVER - Represents the llvm.localrecover intrinsic.
     /// Materializes the offset from the local object pointer of another
diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td
index c965140a00b..47a66a27e38 100644
--- a/include/llvm/IR/Intrinsics.td
+++ b/include/llvm/IR/Intrinsics.td
@@ -320,7 +320,6 @@ def int_gcwrite : Intrinsic<[],
 def int_returnaddress : Intrinsic<[llvm_ptr_ty], [llvm_i32_ty], [IntrNoMem]>;
 def int_addressofreturnaddress : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>;
 def int_frameaddress  : Intrinsic<[llvm_ptr_ty], [llvm_i32_ty], [IntrNoMem]>;
-def int_sponentry  : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>;
 def int_read_register  : Intrinsic<[llvm_anyint_ty], [llvm_metadata_ty],
                                    [IntrReadMem], "llvm.read_register">;
 def int_write_register : Intrinsic<[], [llvm_metadata_ty, llvm_anyint_ty],
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index d5fb7a0697d..a96b8628ac8 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1059,7 +1059,6 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
   case ISD::FRAMEADDR:
   case ISD::RETURNADDR:
   case ISD::ADDROFRETURNADDR:
-  case ISD::SPONENTRY:
     // These operations lie about being legal: when they claim to be legal,
     // they should actually be custom-lowered.
     Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index cb650c6fc13..dac99eddec3 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5050,10 +5050,6 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     setValue(&I, DAG.getNode(ISD::ADDROFRETURNADDR, sdl,
                              TLI.getPointerTy(DAG.getDataLayout())));
     return nullptr;
-  case Intrinsic::sponentry:
-    setValue(&I, DAG.getNode(ISD::SPONENTRY, sdl,
-                             TLI.getPointerTy(DAG.getDataLayout())));
-    return nullptr;
   case Intrinsic::frameaddress:
     setValue(&I, DAG.getNode(ISD::FRAMEADDR, sdl,
                              TLI.getPointerTy(DAG.getDataLayout()),
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index c21f2d3b717..5c17a5d295d 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -124,7 +124,6 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   case ISD::RETURNADDR:                 return "RETURNADDR";
   case ISD::ADDROFRETURNADDR:           return "ADDROFRETURNADDR";
   case ISD::FRAMEADDR:                  return "FRAMEADDR";
-  case ISD::SPONENTRY:                  return "SPONENTRY";
   case ISD::LOCAL_RECOVER:              return "LOCAL_RECOVER";
   case ISD::READ_REGISTER:              return "READ_REGISTER";
   case ISD::WRITE_REGISTER:             return "WRITE_REGISTER";
diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp
index 317c3f134db..5e4c5dcf09c 100644
--- a/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/lib/Target/AArch64/AArch64FastISel.cpp
@@ -3450,22 +3450,6 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
     updateValueMap(II, SrcReg);
     return true;
   }
-  case Intrinsic::sponentry: {
-    MachineFrameInfo &MFI = FuncInfo.MF->getFrameInfo();
-
-    // SP = FP + Fixed Object + 16
-    MVT VT = TLI.getPointerTy(DL);
-    int FI = MFI.CreateFixedObject(4, 0, false);
-    unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-            TII.get(AArch64::ADDXri), ResultReg)
-            .addFrameIndex(FI)
-            .addImm(0)
-            .addImm(0);
-
-    updateValueMap(II, ResultReg);
-    return true;
-  }
   case Intrinsic::memcpy:
   case Intrinsic::memmove: {
     const auto *MTI = cast<MemTransferInst>(II);
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index f5652a9f380..3c107016c8b 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2863,8 +2863,6 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
     return LowerFP_EXTEND(Op, DAG);
   case ISD::FRAMEADDR:
     return LowerFRAMEADDR(Op, DAG);
-  case ISD::SPONENTRY:
-    return LowerSPONENTRY(Op, DAG);
   case ISD::RETURNADDR:
     return LowerRETURNADDR(Op, DAG);
   case ISD::INSERT_VECTOR_ELT:
@@ -5173,16 +5171,6 @@ SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
   return FrameAddr;
 }
 
-SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
-                                              SelectionDAG &DAG) const {
-  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
-
-  EVT VT = getPointerTy(DAG.getDataLayout());
-  SDLoc DL(Op);
-  int FI = MFI.CreateFixedObject(4, 0, false);
-  return DAG.getFrameIndex(FI, VT);
-}
-
 // FIXME? Maybe this could be a TableGen attribute on some registers and
 // this table could be generated automatically from RegInfo.
 unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT,
diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h
index 7b4119a21d0..3e89de665a7 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@@ -617,7 +617,6 @@ private:
   SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
diff --git a/test/CodeGen/AArch64/sponentry.ll b/test/CodeGen/AArch64/sponentry.ll
deleted file mode 100644
index 5b3638a1d86..00000000000
--- a/test/CodeGen/AArch64/sponentry.ll
+++ /dev/null
@@ -1,104 +0,0 @@
-; RUN: llc -mtriple=aarch64-windows-msvc -disable-fp-elim %s -o - | FileCheck %s
-; RUN: llc -mtriple=aarch64-windows-msvc -fast-isel -disable-fp-elim %s -o - | FileCheck %s
-; RUN: llc -mtriple=aarch64-windows-msvc %s -o - | FileCheck %s --check-prefix=NOFP
-; RUN: llc -mtriple=aarch64-windows-msvc -fast-isel %s -o - | FileCheck %s --check-prefix=NOFP
-
-@env2 = common dso_local global [24 x i64]* null, align 8
-
-define dso_local void @bar() {
-  %1 = call i8* @llvm.sponentry()
-  %2 = load [24 x i64]*, [24 x i64]** @env2, align 8
-  %3 = getelementptr inbounds [24 x i64], [24 x i64]* %2, i32 0, i32 0
-  %4 = bitcast i64* %3 to i8*
-  %5 = call i32 @_setjmpex(i8* %4, i8* %1) #2
-  ret void
-}
-
-; CHECK: bar:
-; CHECK: mov     x29, sp
-; CHECK: add     x1, x29, #16
-; CEHCK: bl      _setjmpex
-
-; NOFP: str     x30, [sp, #-16]!
-; NOFP: add     x1, sp, #16
-
-define dso_local void @foo([24 x i64]*) {
-  %2 = alloca [24 x i64]*, align 8
-  %3 = alloca i32, align 4
-  %4 = alloca [100 x i32], align 4
-  store [24 x i64]* %0, [24 x i64]** %2, align 8
-  %5 = call i8* @llvm.sponentry()
-  %6 = load [24 x i64]*, [24 x i64]** %2, align 8
-  %7 = getelementptr inbounds [24 x i64], [24 x i64]* %6, i32 0, i32 0
-  %8 = bitcast i64* %7 to i8*
-  %9 = call i32 @_setjmpex(i8* %8, i8* %5)
-  store i32 %9, i32* %3, align 4
-  ret void
-}
-
-; CHECK: foo:
-; CHECK: sub     sp, sp, #448
-; CHECK: add     x29, sp, #432
-; CHECK: add     x1, x29, #16
-; CEHCK: bl      _setjmpex
-
-; NOFP: sub     sp, sp, #432
-; NOFP: add     x1, sp, #432
-
-define dso_local void @var_args(i8*, ...) {
-  %2 = alloca i8*, align 8
-  %3 = alloca i8*, align 8
-  store i8* %0, i8** %2, align 8
-  %4 = bitcast i8** %3 to i8*
-  call void @llvm.va_start(i8* %4)
-  %5 = load i8*, i8** %3, align 8
-  %6 = getelementptr inbounds i8, i8* %5, i64 8
-  store i8* %6, i8** %3, align 8
-  %7 = bitcast i8* %5 to i32*
-  %8 = load i32, i32* %7, align 8
-  %9 = bitcast i8** %3 to i8*
-  call void @llvm.va_end(i8* %9)
-  %10 = call i8* @llvm.sponentry()
-  %11 = load [24 x i64]*, [24 x i64]** @env2, align 8
-  %12 = getelementptr inbounds [24 x i64], [24 x i64]* %11, i32 0, i32 0
-  %13 = bitcast i64* %12 to i8*
-  %14 = call i32 @_setjmpex(i8* %13, i8* %10) #3
-  ret void
-}
-
-; CHECK: var_args:
-; CHECK: sub     sp, sp, #96
-; CHECK: add     x29, sp, #16
-; CHECK: add     x1, x29, #80
-; CEHCK: bl      _setjmpex
-
-; NOFP: sub     sp, sp, #96
-; NOFP: add     x1, sp, #96
-
-define dso_local void @manyargs(i64 %x1, i64 %x2, i64 %x3, i64 %x4, i64 %x5, i64 %x6, i64 %x7, i64 %x8, i64 %x9, i64 %x10) {
-  %1 = call i8* @llvm.sponentry()
-  %2 = load [24 x i64]*, [24 x i64]** @env2, align 8
-  %3 = getelementptr inbounds [24 x i64], [24 x i64]* %2, i32 0, i32 0
-  %4 = bitcast i64* %3 to i8*
-  %5 = call i32 @_setjmpex(i8* %4, i8* %1) #2
-  ret void
-}
-
-; CHECK: manyargs:
-; CHECK: stp     x29, x30, [sp, #-16]!
-; CHECK: add     x1, x29, #16
-
-; NOFP: str     x30, [sp, #-16]!
-; NOFP: add     x1, sp, #16
-
-; Function Attrs: nounwind readnone
-declare i8* @llvm.sponentry()
-
-; Function Attrs: returns_twice
-declare dso_local i32 @_setjmpex(i8*, i8*)
-
-; Function Attrs: nounwind
-declare void @llvm.va_start(i8*) #1
-
-; Function Attrs: nounwind
-declare void @llvm.va_end(i8*) #1
-- 
GitLab


From f8ad7b8535a691b250e5c4f592bb030883b0a455 Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Thu, 1 Nov 2018 18:02:27 +0000
Subject: [PATCH 0872/1116] [AArch64] Fix unintended fallthrough and strengthen
 cast

This was added in r330630. GCC's -Wimplicit-fallthrough seems to not
fire when the previous case contains a switch itself.

This fallthrough was bening because the helper function implementing the
case used dyn_cast to re-check the type of the node in question. After
fixing the fallthrough, we can strengthen the cast.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345864 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64ISelLowering.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 3c107016c8b..e6a036e64d8 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -11036,9 +11036,9 @@ static SDValue performNVCASTCombine(SDNode *N) {
 static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
                                            const AArch64Subtarget *Subtarget,
                                            const TargetMachine &TM) {
-  auto *GN = dyn_cast<GlobalAddressSDNode>(N);
-  if (!GN || Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
-                 AArch64II::MO_NO_FLAG)
+  auto *GN = cast<GlobalAddressSDNode>(N);
+  if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
+      AArch64II::MO_NO_FLAG)
     return SDValue();
 
   uint64_t MinOffset = -1ull;
@@ -11170,6 +11170,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
     default:
       break;
     }
+    break;
   case ISD::GlobalAddress:
     return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
   }
-- 
GitLab


From daf3fe16b9b9206d53404790e98f222467e8fcae Mon Sep 17 00:00:00 2001
From: Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>
Date: Thu, 1 Nov 2018 18:04:39 +0000
Subject: [PATCH 0873/1116] [llvm-mca] Add extra counters for move elimination
 in view RegisterFileStatistics.

This patch teaches view RegisterFileStatistics how to report events for
optimizable register moves.

For each processor register file, view RegisterFileStatistics reports the
following extra information:
 - Number of optimizable register moves
 - Number of register moves eliminated
 - Number of zero moves (i.e. register moves that propagate a zero)
 - Max Number of moves eliminated per cycle.

Differential Revision: https://reviews.llvm.org/D53976


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345865 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../X86/BtVer2/reg-move-elimination-1.s       |  4 +
 .../X86/BtVer2/reg-move-elimination-2.s       |  4 +
 .../X86/BtVer2/reg-move-elimination-3.s       |  4 +
 .../X86/BtVer2/reg-move-elimination-4.s       |  4 +
 .../X86/BtVer2/reg-move-elimination-5.s       |  4 +
 .../llvm-mca/Views/RegisterFileStatistics.cpp | 94 +++++++++++++++----
 tools/llvm-mca/Views/RegisterFileStatistics.h | 19 +++-
 .../include/HardwareUnits/RegisterFile.h      | 14 ++-
 tools/llvm-mca/include/Instruction.h          | 21 ++++-
 tools/llvm-mca/include/Stages/DispatchStage.h |  4 -
 .../lib/HardwareUnits/RegisterFile.cpp        | 44 ++++++---
 tools/llvm-mca/lib/Stages/DispatchStage.cpp   | 14 ++-
 12 files changed, 183 insertions(+), 47 deletions(-)

diff --git a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-1.s b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-1.s
index 3b38173ebca..0c27d2cdac3 100644
--- a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-1.s
+++ b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-1.s
@@ -39,6 +39,10 @@ vaddps %xmm1, %xmm1, %xmm2
 # CHECK-NEXT:    Number of physical registers:     72
 # CHECK-NEXT:    Total number of mappings created: 3
 # CHECK-NEXT:    Max number of mappings used:      3
+# CHECK-NEXT:    Number of optimizable moves:      3
+# CHECK-NEXT:    Number of moves eliminated:       3  (100.0%)
+# CHECK-NEXT:    Number of zero moves:             3  (100.0%)
+# CHECK-NEXT:    Max moves eliminated per cycle:   1
 
 # CHECK:      *  Register File #2 -- JIntegerPRF:
 # CHECK-NEXT:    Number of physical registers:     64
diff --git a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-2.s b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-2.s
index 096fe6c5a8f..08465f907ee 100644
--- a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-2.s
+++ b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-2.s
@@ -49,6 +49,10 @@ movdqu %xmm5, %xmm0
 # CHECK-NEXT:    Number of physical registers:     72
 # CHECK-NEXT:    Total number of mappings created: 0
 # CHECK-NEXT:    Max number of mappings used:      0
+# CHECK-NEXT:    Number of optimizable moves:      21
+# CHECK-NEXT:    Number of moves eliminated:       21  (100.0%)
+# CHECK-NEXT:    Number of zero moves:             21  (100.0%)
+# CHECK-NEXT:    Max moves eliminated per cycle:   2
 
 # CHECK:      *  Register File #2 -- JIntegerPRF:
 # CHECK-NEXT:    Number of physical registers:     64
diff --git a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-3.s b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-3.s
index 3d64bfd0bfd..f3d850fc90a 100644
--- a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-3.s
+++ b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-3.s
@@ -44,6 +44,10 @@ vmovdqu %xmm5, %xmm0
 # CHECK-NEXT:    Number of physical registers:     72
 # CHECK-NEXT:    Total number of mappings created: 0
 # CHECK-NEXT:    Max number of mappings used:      0
+# CHECK-NEXT:    Number of optimizable moves:      18
+# CHECK-NEXT:    Number of moves eliminated:       18  (100.0%)
+# CHECK-NEXT:    Number of zero moves:             18  (100.0%)
+# CHECK-NEXT:    Max moves eliminated per cycle:   2
 
 # CHECK:      *  Register File #2 -- JIntegerPRF:
 # CHECK-NEXT:    Number of physical registers:     64
diff --git a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-4.s b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-4.s
index 223b4c2c239..c2df1baf5c0 100644
--- a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-4.s
+++ b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-4.s
@@ -45,6 +45,10 @@ mov %edx, %eax
 # CHECK-NEXT:    Number of physical registers:     64
 # CHECK-NEXT:    Total number of mappings created: 0
 # CHECK-NEXT:    Max number of mappings used:      0
+# CHECK-NEXT:    Number of optimizable moves:      12
+# CHECK-NEXT:    Number of moves eliminated:       12  (100.0%)
+# CHECK-NEXT:    Number of zero moves:             12  (100.0%)
+# CHECK-NEXT:    Max moves eliminated per cycle:   2
 
 # CHECK:      Resources:
 # CHECK-NEXT: [0]   - JALU0
diff --git a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-5.s b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-5.s
index ab873c7c43f..277293e429b 100644
--- a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-5.s
+++ b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-5.s
@@ -45,6 +45,10 @@ mov %rdx, %rax
 # CHECK-NEXT:    Number of physical registers:     64
 # CHECK-NEXT:    Total number of mappings created: 0
 # CHECK-NEXT:    Max number of mappings used:      0
+# CHECK-NEXT:    Number of optimizable moves:      12
+# CHECK-NEXT:    Number of moves eliminated:       12  (100.0%)
+# CHECK-NEXT:    Number of zero moves:             12  (100.0%)
+# CHECK-NEXT:    Max moves eliminated per cycle:   2
 
 # CHECK:      Resources:
 # CHECK-NEXT: [0]   - JALU0
diff --git a/tools/llvm-mca/Views/RegisterFileStatistics.cpp b/tools/llvm-mca/Views/RegisterFileStatistics.cpp
index bd638d9795a..06202bc4142 100644
--- a/tools/llvm-mca/Views/RegisterFileStatistics.cpp
+++ b/tools/llvm-mca/Views/RegisterFileStatistics.cpp
@@ -21,10 +21,12 @@ namespace mca {
 RegisterFileStatistics::RegisterFileStatistics(const MCSubtargetInfo &sti)
     : STI(sti) {
   const MCSchedModel &SM = STI.getSchedModel();
-  RegisterFileUsage Empty = {0, 0, 0};
+  RegisterFileUsage RFUEmpty = {0, 0, 0};
+  MoveEliminationInfo MEIEmpty = {0, 0, 0, 0, 0};
   if (!SM.hasExtraProcessorInfo()) {
     // Assume a single register file.
-    RegisterFiles.emplace_back(Empty);
+    PRFUsage.emplace_back(RFUEmpty);
+    MoveElimInfo.emplace_back(MEIEmpty);
     return;
   }
 
@@ -35,8 +37,42 @@ RegisterFileStatistics::RegisterFileStatistics(const MCSubtargetInfo &sti)
   // be skipped. If there are no user defined register files, then reserve a
   // single entry for the default register file at index #0.
   unsigned NumRegFiles = std::max(PI.NumRegisterFiles, 1U);
-  RegisterFiles.resize(NumRegFiles);
-  std::fill(RegisterFiles.begin(), RegisterFiles.end(), Empty);
+
+  PRFUsage.resize(NumRegFiles);
+  std::fill(PRFUsage.begin(), PRFUsage.end(), RFUEmpty);
+
+  MoveElimInfo.resize(NumRegFiles);
+  std::fill(MoveElimInfo.begin(), MoveElimInfo.end(), MEIEmpty);
+}
+
+void RegisterFileStatistics::updateRegisterFileUsage(
+    ArrayRef<unsigned> UsedPhysRegs) {
+  for (unsigned I = 0, E = PRFUsage.size(); I < E; ++I) {
+    RegisterFileUsage &RFU = PRFUsage[I];
+    unsigned NumUsedPhysRegs = UsedPhysRegs[I];
+    RFU.CurrentlyUsedMappings += NumUsedPhysRegs;
+    RFU.TotalMappings += NumUsedPhysRegs;
+    RFU.MaxUsedMappings =
+        std::max(RFU.MaxUsedMappings, RFU.CurrentlyUsedMappings);
+  }
+}
+
+void RegisterFileStatistics::updateMoveElimInfo(const Instruction &Inst) {
+  if (!Inst.isOptimizableMove())
+    return;
+
+  assert(Inst.getDefs().size() == 1 && "Expected a single definition!");
+  assert(Inst.getUses().size() == 1 && "Expected a single register use!");
+  const WriteState &WS = Inst.getDefs()[0];
+  const ReadState &RS = Inst.getUses()[0];
+
+  MoveEliminationInfo &Info =
+      MoveElimInfo[Inst.getDefs()[0].getRegisterFileID()];
+  Info.TotalMoveEliminationCandidates++;
+  if (WS.isEliminated())
+    Info.CurrentMovesEliminated++;
+  if (WS.isWriteZero() && RS.isReadZero())
+    Info.TotalMovesThatPropagateZero++;
 }
 
 void RegisterFileStatistics::onEvent(const HWInstructionEvent &Event) {
@@ -45,21 +81,24 @@ void RegisterFileStatistics::onEvent(const HWInstructionEvent &Event) {
     break;
   case HWInstructionEvent::Retired: {
     const auto &RE = static_cast<const HWInstructionRetiredEvent &>(Event);
-    for (unsigned I = 0, E = RegisterFiles.size(); I < E; ++I)
-      RegisterFiles[I].CurrentlyUsedMappings -= RE.FreedPhysRegs[I];
+    for (unsigned I = 0, E = PRFUsage.size(); I < E; ++I)
+      PRFUsage[I].CurrentlyUsedMappings -= RE.FreedPhysRegs[I];
     break;
   }
   case HWInstructionEvent::Dispatched: {
     const auto &DE = static_cast<const HWInstructionDispatchedEvent &>(Event);
-    for (unsigned I = 0, E = RegisterFiles.size(); I < E; ++I) {
-      RegisterFileUsage &RFU = RegisterFiles[I];
-      unsigned NumUsedPhysRegs = DE.UsedPhysRegs[I];
-      RFU.CurrentlyUsedMappings += NumUsedPhysRegs;
-      RFU.TotalMappings += NumUsedPhysRegs;
-      RFU.MaxUsedMappings =
-          std::max(RFU.MaxUsedMappings, RFU.CurrentlyUsedMappings);
-    }
+    updateRegisterFileUsage(DE.UsedPhysRegs);
+    updateMoveElimInfo(*DE.IR.getInstruction());
+  }
   }
+}
+
+void RegisterFileStatistics::onCycleEnd() {
+  for (MoveEliminationInfo &MEI : MoveElimInfo) {
+    unsigned &CurrentMax = MEI.MaxMovesEliminatedPerCycle;
+    CurrentMax = std::max(CurrentMax, MEI.CurrentMovesEliminated);
+    MEI.TotalMovesEliminated += MEI.CurrentMovesEliminated;
+    MEI.CurrentMovesEliminated = 0;
   }
 }
 
@@ -68,14 +107,14 @@ void RegisterFileStatistics::printView(raw_ostream &OS) const {
   raw_string_ostream TempStream(Buffer);
 
   TempStream << "\n\nRegister File statistics:";
-  const RegisterFileUsage &GlobalUsage = RegisterFiles[0];
+  const RegisterFileUsage &GlobalUsage = PRFUsage[0];
   TempStream << "\nTotal number of mappings created:    "
              << GlobalUsage.TotalMappings;
   TempStream << "\nMax number of mappings used:         "
              << GlobalUsage.MaxUsedMappings << '\n';
 
-  for (unsigned I = 1, E = RegisterFiles.size(); I < E; ++I) {
-    const RegisterFileUsage &RFU = RegisterFiles[I];
+  for (unsigned I = 1, E = PRFUsage.size(); I < E; ++I) {
+    const RegisterFileUsage &RFU = PRFUsage[I];
     // Obtain the register file descriptor from the scheduling model.
     assert(STI.getSchedModel().hasExtraProcessorInfo() &&
            "Unable to find register file info!");
@@ -98,6 +137,27 @@ void RegisterFileStatistics::printView(raw_ostream &OS) const {
                << RFU.TotalMappings;
     TempStream << "\n   Max number of mappings used:      "
                << RFU.MaxUsedMappings << '\n';
+    const MoveEliminationInfo &MEI = MoveElimInfo[I];
+
+    if (MEI.TotalMoveEliminationCandidates) {
+      TempStream << "   Number of optimizable moves:      "
+                 << MEI.TotalMoveEliminationCandidates;
+      double EliminatedMovProportion = (double)MEI.TotalMovesEliminated /
+                                       MEI.TotalMoveEliminationCandidates *
+                                       100.0;
+      double ZeroMovProportion = (double)MEI.TotalMovesThatPropagateZero /
+                                 MEI.TotalMoveEliminationCandidates * 100.0;
+      TempStream << "\n   Number of moves eliminated:       "
+                 << MEI.TotalMovesEliminated << "  "
+                 << format("(%.1f%%)",
+                           floor((EliminatedMovProportion * 10) + 0.5) / 10);
+      TempStream << "\n   Number of zero moves:             "
+                 << MEI.TotalMovesThatPropagateZero << "  "
+                 << format("(%.1f%%)",
+                           floor((ZeroMovProportion * 10) + 0.5) / 10);
+      TempStream << "\n   Max moves eliminated per cycle:   "
+                 << MEI.MaxMovesEliminatedPerCycle << '\n';
+    }
   }
 
   TempStream.flush();
diff --git a/tools/llvm-mca/Views/RegisterFileStatistics.h b/tools/llvm-mca/Views/RegisterFileStatistics.h
index 86858d8bba8..a2c52a668da 100644
--- a/tools/llvm-mca/Views/RegisterFileStatistics.h
+++ b/tools/llvm-mca/Views/RegisterFileStatistics.h
@@ -21,6 +21,10 @@
 ///    Number of physical registers:     72
 ///    Total number of mappings created: 0
 ///    Max number of mappings used:      0
+///    Number of optimizable moves:      200
+///    Number of moves eliminated:       200 (100.0%)
+///    Number of zero moves:             200 (100.0%)
+///    Max moves eliminated per cycle:   2
 ///
 /// *  Register File #2 -- IntegerPRF:
 ///    Number of physical registers:     64
@@ -49,12 +53,25 @@ class RegisterFileStatistics : public View {
     unsigned CurrentlyUsedMappings;
   };
 
+  struct MoveEliminationInfo {
+    unsigned TotalMoveEliminationCandidates;
+    unsigned TotalMovesEliminated;
+    unsigned TotalMovesThatPropagateZero;
+    unsigned MaxMovesEliminatedPerCycle;
+    unsigned CurrentMovesEliminated;
+  };
+
   // There is one entry for each register file implemented by the processor.
-  llvm::SmallVector<RegisterFileUsage, 4> RegisterFiles;
+  llvm::SmallVector<RegisterFileUsage, 4> PRFUsage;
+  llvm::SmallVector<MoveEliminationInfo, 4> MoveElimInfo;
+
+  void updateRegisterFileUsage(ArrayRef<unsigned> UsedPhysRegs);
+  void updateMoveElimInfo(const Instruction &Inst);
 
 public:
   RegisterFileStatistics(const llvm::MCSubtargetInfo &sti);
 
+  void onCycleEnd() override;
   void onEvent(const HWInstructionEvent &Event) override;
   void printView(llvm::raw_ostream &OS) const override;
 };
diff --git a/tools/llvm-mca/include/HardwareUnits/RegisterFile.h b/tools/llvm-mca/include/HardwareUnits/RegisterFile.h
index 1cca8b5294d..d9949bf4f6a 100644
--- a/tools/llvm-mca/include/HardwareUnits/RegisterFile.h
+++ b/tools/llvm-mca/include/HardwareUnits/RegisterFile.h
@@ -173,6 +173,11 @@ class RegisterFile : public HardwareUnit {
   void freePhysRegs(const RegisterRenamingInfo &Entry,
                     MutableArrayRef<unsigned> FreedPhysRegs);
 
+  // Collects writes that are in a RAW dependency with RS.
+  // This method is called from `addRegisterRead()`.
+  void collectWrites(const ReadState &RS,
+                     SmallVectorImpl<WriteRef> &Writes) const;
+
   // Create an instance of RegisterMappingTracker for every register file
   // specified by the processor model.
   // If no register file is specified, then this method creates a default
@@ -189,6 +194,10 @@ public:
   // No physical regiser is allocated if this write is from a zero-idiom.
   void addRegisterWrite(WriteRef Write, MutableArrayRef<unsigned> UsedPhysRegs);
 
+  // Collect writes that are in a data dependency with RS, and update RS
+  // internal state.
+  void addRegisterRead(ReadState &RS, SmallVectorImpl<WriteRef> &Writes) const;
+
   // Removes write \param WS from the register mappings.
   // Physical registers may be released to reflect this update.
   // No registers are released if this write is from a zero-idiom.
@@ -200,7 +209,7 @@ public:
   // If RS is a read from a zero register, and WS is eliminated, then
   // `WS.WritesZero` is also set, so that method addRegisterWrite() would not
   // reserve a physical register for it.
-  bool tryEliminateMove(WriteState &WS, const ReadState &RS);
+  bool tryEliminateMove(WriteState &WS, ReadState &RS);
 
   // Checks if there are enough physical registers in the register files.
   // Returns a "response mask" where each bit represents the response from a
@@ -212,7 +221,8 @@ public:
   // Current implementation can simulate up to 32 register files (including the
   // special register file at index #0).
   unsigned isAvailable(ArrayRef<unsigned> Regs) const;
-  void collectWrites(SmallVectorImpl<WriteRef> &Writes, unsigned RegID) const;
+
+  // Returns the number of PRFs implemented by this processor.
   unsigned getNumRegisterFiles() const { return RegisterFiles.size(); }
 
   // Notify each PRF that a new cycle just started.
diff --git a/tools/llvm-mca/include/Instruction.h b/tools/llvm-mca/include/Instruction.h
index f83be1ff4bb..7407283bca2 100644
--- a/tools/llvm-mca/include/Instruction.h
+++ b/tools/llvm-mca/include/Instruction.h
@@ -101,6 +101,9 @@ class WriteState {
   // field RegisterID from WD.
   unsigned RegisterID;
 
+  // Physical register file that serves register RegisterID.
+  unsigned PRFID;
+
   // True if this write implicitly clears the upper portion of RegisterID's
   // super-registers.
   bool ClearsSuperRegs;
@@ -135,7 +138,7 @@ public:
   WriteState(const WriteDescriptor &Desc, unsigned RegID,
              bool clearsSuperRegs = false, bool writesZero = false)
       : WD(&Desc), CyclesLeft(UNKNOWN_CYCLES), RegisterID(RegID),
-        ClearsSuperRegs(clearsSuperRegs), WritesZero(writesZero),
+        PRFID(0), ClearsSuperRegs(clearsSuperRegs), WritesZero(writesZero),
         IsEliminated(false), DependentWrite(nullptr), NumWriteUsers(0U) {}
 
   WriteState(const WriteState &Other) = default;
@@ -144,6 +147,7 @@ public:
   int getCyclesLeft() const { return CyclesLeft; }
   unsigned getWriteResourceID() const { return WD->SClassOrWriteResourceID; }
   unsigned getRegisterID() const { return RegisterID; }
+  unsigned getRegisterFileID() const { return PRFID; }
   unsigned getLatency() const { return WD->Latency; }
 
   void addUser(ReadState *Use, int ReadAdvance);
@@ -168,6 +172,8 @@ public:
     IsEliminated = true;
   }
 
+  void setPRF(unsigned PRF) { PRFID = PRF; }
+
   // On every cycle, update CyclesLeft and notify dependent users.
   void cycleEvent();
   void onInstructionIssued();
@@ -185,6 +191,8 @@ class ReadState {
   const ReadDescriptor *RD;
   // Physical register identified associated to this read.
   unsigned RegisterID;
+  // Physical register file that serves register RegisterID.
+  unsigned PRFID;
   // Number of writes that contribute to the definition of RegisterID.
   // In the absence of partial register updates, the number of DependentWrites
   // cannot be more than one.
@@ -201,18 +209,21 @@ class ReadState {
   // This field is set to true only if there are no dependent writes, and
   // there are no `CyclesLeft' to wait.
   bool IsReady;
+  // True if this is a read from a known zero register.
+  bool IsZero;
   // True if this register read is from a dependency-breaking instruction.
   bool IndependentFromDef;
 
 public:
   ReadState(const ReadDescriptor &Desc, unsigned RegID)
-      : RD(&Desc), RegisterID(RegID), DependentWrites(0),
+      : RD(&Desc), RegisterID(RegID), PRFID(0), DependentWrites(0),
         CyclesLeft(UNKNOWN_CYCLES), TotalCycles(0), IsReady(true),
-        IndependentFromDef(false) {}
+        IsZero(false), IndependentFromDef(false) {}
 
   const ReadDescriptor &getDescriptor() const { return *RD; }
   unsigned getSchedClass() const { return RD->SchedClassID; }
   unsigned getRegisterID() const { return RegisterID; }
+  unsigned getRegisterFileID() const { return PRFID; }
 
   bool isReady() const { return IsReady; }
   bool isImplicitRead() const { return RD->isImplicitRead(); }
@@ -226,6 +237,10 @@ public:
     DependentWrites = Writes;
     IsReady = !Writes;
   }
+
+  bool isReadZero() const { return IsZero; }
+  void setReadZero() { IsZero = true; }
+  void setPRF(unsigned ID) { PRFID = ID; }
 };
 
 /// A sequence of cycles.
diff --git a/tools/llvm-mca/include/Stages/DispatchStage.h b/tools/llvm-mca/include/Stages/DispatchStage.h
index 3595f3122cc..29cace1022e 100644
--- a/tools/llvm-mca/include/Stages/DispatchStage.h
+++ b/tools/llvm-mca/include/Stages/DispatchStage.h
@@ -68,10 +68,6 @@ class DispatchStage final : public Stage {
                                    ArrayRef<unsigned> UsedPhysRegs,
                                    unsigned uOps) const;
 
-  void collectWrites(SmallVectorImpl<WriteRef> &Vec, unsigned RegID) const {
-    return PRF.collectWrites(Vec, RegID);
-  }
-
 public:
   DispatchStage(const MCSubtargetInfo &Subtarget, const MCRegisterInfo &MRI,
                 unsigned MaxDispatchWidth, RetireControlUnit &R,
diff --git a/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp b/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp
index 71aec49ce77..6bc63a0db50 100644
--- a/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp
+++ b/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp
@@ -173,6 +173,7 @@ void RegisterFile::addRegisterWrite(WriteRef Write,
   bool IsEliminated = WS.isEliminated();
   bool ShouldAllocatePhysRegs = !IsWriteZero && !IsEliminated;
   const RegisterRenamingInfo &RRI = RegisterMappings[RegID].second;
+  WS.setPRF(RRI.IndexPlusCost.first);
 
   if (RRI.RenameAs && RRI.RenameAs != RegID) {
     RegID = RRI.RenameAs;
@@ -217,9 +218,9 @@ void RegisterFile::addRegisterWrite(WriteRef Write,
       RegisterMappings[*I].second.AliasRegID = 0U;
     }
 
-    // No physical registers are allocated for instructions that are optimized in
-    // hardware. For example, zero-latency data-dependency breaking instructions
-    // don't consume physical registers.
+    // No physical registers are allocated for instructions that are optimized
+    // in hardware. For example, zero-latency data-dependency breaking
+    // instructions don't consume physical registers.
     if (ShouldAllocatePhysRegs)
       allocatePhysRegs(RegisterMappings[RegID].second, UsedPhysRegs);
   }
@@ -288,7 +289,7 @@ void RegisterFile::removeRegisterWrite(
   }
 }
 
-bool RegisterFile::tryEliminateMove(WriteState &WS, const ReadState &RS) {
+bool RegisterFile::tryEliminateMove(WriteState &WS, ReadState &RS) {
   const RegisterMapping &RMFrom = RegisterMappings[RS.getRegisterID()];
   const RegisterMapping &RMTo = RegisterMappings[WS.getRegisterID()];
 
@@ -349,15 +350,18 @@ bool RegisterFile::tryEliminateMove(WriteState &WS, const ReadState &RS) {
   }
 
   RMT.NumMoveEliminated++;
-  if (IsZeroMove)
+  if (IsZeroMove) {
     WS.setWriteZero();
+    RS.setReadZero();
+  }
   WS.setEliminated();
 
   return true;
 }
 
-void RegisterFile::collectWrites(SmallVectorImpl<WriteRef> &Writes,
-                                 unsigned RegID) const {
+void RegisterFile::collectWrites(const ReadState &RS,
+                                 SmallVectorImpl<WriteRef> &Writes) const {
+  unsigned RegID = RS.getRegisterID();
   assert(RegID && RegID < RegisterMappings.size());
   LLVM_DEBUG(dbgs() << "RegisterFile: collecting writes for register "
                     << MRI.getName(RegID) << '\n');
@@ -379,11 +383,13 @@ void RegisterFile::collectWrites(SmallVectorImpl<WriteRef> &Writes,
   }
 
   // Remove duplicate entries and resize the input vector.
-  sort(Writes, [](const WriteRef &Lhs, const WriteRef &Rhs) {
-    return Lhs.getWriteState() < Rhs.getWriteState();
-  });
-  auto It = std::unique(Writes.begin(), Writes.end());
-  Writes.resize(std::distance(Writes.begin(), It));
+  if (Writes.size() > 1) {
+    sort(Writes, [](const WriteRef &Lhs, const WriteRef &Rhs) {
+      return Lhs.getWriteState() < Rhs.getWriteState();
+    });
+    auto It = std::unique(Writes.begin(), Writes.end());
+    Writes.resize(std::distance(Writes.begin(), It));
+  }
 
   LLVM_DEBUG({
     for (const WriteRef &WR : Writes) {
@@ -395,6 +401,20 @@ void RegisterFile::collectWrites(SmallVectorImpl<WriteRef> &Writes,
   });
 }
 
+void RegisterFile::addRegisterRead(ReadState &RS,
+                                   SmallVectorImpl<WriteRef> &Defs) const {
+  unsigned RegID = RS.getRegisterID();
+  const RegisterRenamingInfo &RRI = RegisterMappings[RegID].second;
+  RS.setPRF(RRI.IndexPlusCost.first);
+  if (RS.isIndependentFromDef())
+    return;
+
+  if (ZeroRegisters[RS.getRegisterID()])
+    RS.setReadZero();
+  collectWrites(RS, Defs);
+  RS.setDependentWrites(Defs.size());
+}
+
 unsigned RegisterFile::isAvailable(ArrayRef<unsigned> Regs) const {
   SmallVector<unsigned, 4> NumPhysRegs(getNumRegisterFiles());
 
diff --git a/tools/llvm-mca/lib/Stages/DispatchStage.cpp b/tools/llvm-mca/lib/Stages/DispatchStage.cpp
index 104446e711e..838dbad22e3 100644
--- a/tools/llvm-mca/lib/Stages/DispatchStage.cpp
+++ b/tools/llvm-mca/lib/Stages/DispatchStage.cpp
@@ -67,8 +67,9 @@ void DispatchStage::updateRAWDependencies(ReadState &RS,
                                           const MCSubtargetInfo &STI) {
   SmallVector<WriteRef, 4> DependentWrites;
 
-  collectWrites(DependentWrites, RS.getRegisterID());
-  RS.setDependentWrites(DependentWrites.size());
+  // Collect all the dependent writes, and update RS internal state.
+  PRF.addRegisterRead(RS, DependentWrites);
+
   // We know that this read depends on all the writes in DependentWrites.
   // For each write, check if we have ReadAdvance information, and use it
   // to figure out in how many cycles this read becomes available.
@@ -116,10 +117,8 @@ Error DispatchStage::dispatch(InstRef IR) {
   // We also don't update data dependencies for instructions that have been
   // eliminated at register renaming stage.
   if (!IsEliminated) {
-    for (ReadState &RS : IS.getUses()) {
-      if (!RS.isIndependentFromDef())
-        updateRAWDependencies(RS, STI);
-    }
+    for (ReadState &RS : IS.getUses())
+      updateRAWDependencies(RS, STI);
   }
 
   // By default, a dependency-breaking zero-idiom is expected to be optimized
@@ -127,8 +126,7 @@ Error DispatchStage::dispatch(InstRef IR) {
   // to the instruction.
   SmallVector<unsigned, 4> RegisterFiles(PRF.getNumRegisterFiles());
   for (WriteState &WS : IS.getDefs())
-    PRF.addRegisterWrite(WriteRef(IR.getSourceIndex(), &WS),
-                         RegisterFiles);
+    PRF.addRegisterWrite(WriteRef(IR.getSourceIndex(), &WS), RegisterFiles);
 
   // Reserve slots in the RCU, and notify the instruction that it has been
   // dispatched to the schedulers for execution.
-- 
GitLab


From 6e439b1d65cc32f4cf488f7a06e473ade49f810d Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Thu, 1 Nov 2018 18:14:45 +0000
Subject: [PATCH 0874/1116] [Hexagon] Fix MO_JumpTable const extender
 conversion

Previously this case fell through to unreachable, so it is clearly not
covered by any test case in LLVM. It may be dynamically unreachable, in
fact. However, if it were to run, this is what it would logically do.
The assert suggests that the intended behavior was not to allow folding
offsets from jump table indices, which makes sense.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345868 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/Hexagon/HexagonConstExtenders.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/Target/Hexagon/HexagonConstExtenders.cpp b/lib/Target/Hexagon/HexagonConstExtenders.cpp
index d096445f144..424be5e4476 100644
--- a/lib/Target/Hexagon/HexagonConstExtenders.cpp
+++ b/lib/Target/Hexagon/HexagonConstExtenders.cpp
@@ -788,6 +788,7 @@ HCE::ExtValue::operator MachineOperand() const {
       return MachineOperand::CreateCPI(V.ImmVal, Offset, TF);
     case MachineOperand::MO_JumpTableIndex:
       assert(Offset == 0);
+      return MachineOperand::CreateJTI(V.ImmVal, TF);
     default:
       llvm_unreachable("Unhandled kind");
  }
-- 
GitLab


From 03e85999496e7309e27fecc78d0222cfffd4eb5d Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 1 Nov 2018 18:22:11 +0000
Subject: [PATCH 0875/1116] [LegalizeDAG] Add generic vector CTPOP expansion
 (PR32655)

This patch adds support for expanding vector CTPOP instructions and removes the x86 'bitmath' lowering which replicates the same expansion.

Differential Revision: https://reviews.llvm.org/D53258

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345869 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../SelectionDAG/LegalizeVectorOps.cpp        | 13 +++++
 lib/CodeGen/SelectionDAG/TargetLowering.cpp   | 15 ++++-
 lib/Target/X86/X86ISelLowering.cpp            | 55 +------------------
 3 files changed, 28 insertions(+), 55 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 1b68f217590..284c4e5b3dd 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -129,6 +129,7 @@ class VectorLegalizer {
   SDValue ExpandFNEG(SDValue Op);
   SDValue ExpandFSUB(SDValue Op);
   SDValue ExpandBITREVERSE(SDValue Op);
+  SDValue ExpandCTPOP(SDValue Op);
   SDValue ExpandCTLZ(SDValue Op);
   SDValue ExpandCTTZ(SDValue Op);
   SDValue ExpandFMINNUM_FMAXNUM(SDValue Op);
@@ -726,6 +727,8 @@ SDValue VectorLegalizer::Expand(SDValue Op) {
     return UnrollVSETCC(Op);
   case ISD::BITREVERSE:
     return ExpandBITREVERSE(Op);
+  case ISD::CTPOP:
+    return ExpandCTPOP(Op);
   case ISD::CTLZ:
   case ISD::CTLZ_ZERO_UNDEF:
     return ExpandCTLZ(Op);
@@ -1104,6 +1107,16 @@ SDValue VectorLegalizer::ExpandFSUB(SDValue Op) {
   return DAG.UnrollVectorOp(Op.getNode());
 }
 
+SDValue VectorLegalizer::ExpandCTPOP(SDValue Op) {
+  // Attempt to expand using TargetLowering.
+  SDValue Result;
+  if (TLI.expandCTPOP(Op.getNode(), Result, DAG))
+    return Result;
+
+  // Otherwise go ahead and unroll.
+  return DAG.UnrollVectorOp(Op.getNode());
+}
+
 SDValue VectorLegalizer::ExpandCTLZ(SDValue Op) {
   // Attempt to expand using TargetLowering.
   SDValue Result;
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index d5665ab67c5..1788c163c5e 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -4295,8 +4295,19 @@ bool TargetLowering::expandCTPOP(SDNode *Node, SDValue &Result,
   EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout());
   SDValue Op = Node->getOperand(0);
   unsigned Len = VT.getScalarSizeInBits();
-  assert(VT.isInteger() && Len <= 128 && Len % 8 == 0 &&
-         "CTPOP not implemented for this type.");
+  assert(VT.isInteger() && "CTPOP not implemented for this type.");
+
+  // TODO: Add support for irregular type lengths.
+  if (!(Len <= 128 && Len % 8 == 0))
+    return false;
+
+  // Only expand vector types if we have the appropriate vector bit operations.
+  if (VT.isVector() && (!isOperationLegalOrCustom(ISD::ADD, VT) ||
+                        !isOperationLegalOrCustom(ISD::SUB, VT) ||
+                        !isOperationLegalOrCustom(ISD::SRL, VT) ||
+                        (Len != 8 && !isOperationLegalOrCustom(ISD::MUL, VT)) ||
+                        !isOperationLegalOrCustomOrPromote(ISD::AND, VT)))
+    return false;
 
   // This is the "best" algorithm from
   // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 498a8e8178a..d95f72035e0 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -25103,57 +25103,6 @@ static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
   return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
 }
 
-static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
-                                       const X86Subtarget &Subtarget,
-                                       SelectionDAG &DAG) {
-  MVT VT = Op.getSimpleValueType();
-  assert(VT == MVT::v16i8 && "Only v16i8 vector CTPOP lowering supported.");
-
-  // This is the vectorized version of the "best" algorithm from
-  // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
-  // with a minor tweak to use a series of adds + shifts instead of vector
-  // multiplications. Implemented for all integer vector types. We only use
-  // this when we don't have SSSE3 which allows a LUT-based lowering that is
-  // much faster, even faster than using native popcnt instructions.
-
-  auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
-    MVT VT = V.getSimpleValueType();
-    SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
-    return DAG.getNode(OpCode, DL, VT, V, ShifterV);
-  };
-  auto GetMask = [&](SDValue V, APInt Mask) {
-    MVT VT = V.getSimpleValueType();
-    SDValue MaskV = DAG.getConstant(Mask, DL, VT);
-    return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
-  };
-
-  // We don't want to incur the implicit masks required to SRL vNi8 vectors on
-  // x86, so set the SRL type to have elements at least i16 wide. This is
-  // correct because all of our SRLs are followed immediately by a mask anyways
-  // that handles any bits that sneak into the high bits of the byte elements.
-  MVT SrlVT = MVT::v8i16;
-  SDValue V = Op;
-
-  // v = v - ((v >> 1) & 0x55555555...)
-  SDValue Srl =
-      DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
-  SDValue And = GetMask(Srl, APInt(8, 0x55));
-  V = DAG.getNode(ISD::SUB, DL, VT, V, And);
-
-  // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
-  SDValue AndLHS = GetMask(V, APInt(8, 0x33));
-  Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
-  SDValue AndRHS = GetMask(Srl, APInt(8, 0x33));
-  V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
-
-  // v = (v + (v >> 4)) & 0x0F0F0F0F...
-  Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
-  SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
-  V = GetMask(Add, APInt(8, 0x0F));
-
-  return V;
-}
-
 // Please ensure that any codegen change from LowerVectorCTPOP is reflected in
 // updated cost models in X86TTIImpl::getIntrinsicInstrCost.
 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
@@ -25193,9 +25142,9 @@ static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
     return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
   }
 
-  // We can't use the fast LUT approach, so fall back on vectorized bitmath.
+  // We can't use the fast LUT approach, so fall back on LegalizeDAG.
   if (!Subtarget.hasSSSE3())
-    return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
+    return SDValue();
 
   return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
 }
-- 
GitLab


From 2ef07e92b85a36c2bd4a72d23cbb2fb64747dffa Mon Sep 17 00:00:00 2001
From: Volkan Keles <vkeles@apple.com>
Date: Thu, 1 Nov 2018 19:01:53 +0000
Subject: [PATCH 0876/1116] [GlobalISel] Fix a bug in
 LegalizeRuleSet::clampMaxNumElements

Summary:
This function was causing a crash when `MaxElements == 1` because
it was trying to create a single element vector type.

Reviewers: dsanders, aemerson, aditya_nandakumar

Reviewed By: dsanders

Subscribers: rovka, kristof.beyls, javed.absar, llvm-commits

Differential Revision: https://reviews.llvm.org/D53734

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345875 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../llvm/CodeGen/GlobalISel/LegalizerInfo.h   |  2 +
 lib/Target/AArch64/AArch64LegalizerInfo.cpp   |  6 ++-
 .../GlobalISel/legalize-load-fewerElts.mir    | 39 +++++++++++++++++++
 .../GlobalISel/legalize-load-v4s32.mir        | 21 ----------
 4 files changed, 45 insertions(+), 23 deletions(-)
 create mode 100644 test/CodeGen/AArch64/GlobalISel/legalize-load-fewerElts.mir
 delete mode 100644 test/CodeGen/AArch64/GlobalISel/legalize-load-v4s32.mir

diff --git a/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
index e0ea5755387..755805de1b0 100644
--- a/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
+++ b/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
@@ -693,6 +693,8 @@ public:
         },
         [=](const LegalityQuery &Query) {
           LLT VecTy = Query.Types[TypeIdx];
+          if (MaxElements == 1)
+            return std::make_pair(TypeIdx, VecTy.getElementType());
           return std::make_pair(
               TypeIdx, LLT::vector(MaxElements, VecTy.getScalarSizeInBits()));
         });
diff --git a/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/lib/Target/AArch64/AArch64LegalizerInfo.cpp
index 474516ff2cc..4b5e10ac4ec 100644
--- a/lib/Target/AArch64/AArch64LegalizerInfo.cpp
+++ b/lib/Target/AArch64/AArch64LegalizerInfo.cpp
@@ -169,7 +169,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
       .lowerIf([=](const LegalityQuery &Query) {
         return Query.Types[0].getSizeInBits() != Query.MMODescrs[0].SizeInBits;
       })
-      .clampNumElements(0, v2s32, v2s32);
+      .clampNumElements(0, v2s32, v2s32)
+      .clampMaxNumElements(0, s64, 1);
 
   getActionDefinitionsBuilder(G_STORE)
       .legalForTypesWithMemSize({{s8, p0, 8},
@@ -187,7 +188,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
         return Query.Types[0].isScalar() &&
                Query.Types[0].getSizeInBits() != Query.MMODescrs[0].SizeInBits;
       })
-      .clampNumElements(0, v2s32, v2s32);
+      .clampNumElements(0, v2s32, v2s32)
+      .clampMaxNumElements(0, s64, 1);
 
   // Constants
   getActionDefinitionsBuilder(G_CONSTANT)
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-load-fewerElts.mir b/test/CodeGen/AArch64/GlobalISel/legalize-load-fewerElts.mir
new file mode 100644
index 00000000000..7f42f6e6c33
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-load-fewerElts.mir
@@ -0,0 +1,39 @@
+# RUN: llc -march=aarch64 -o - -run-pass=legalizer -global-isel-abort=0 -debug-only=legalizer 2>&1 %s | FileCheck %s
+# REQUIRES: asserts
+
+# CHECK: Legalize Machine IR for: load_v4s32
+# CHECK-NEXT: %{{[0-9]+}}:_(<4 x s32>) = G_LOAD %{{[0-9]+}}:_(p0)
+# CHECK-NEXT: Reduce number of elements
+---
+name:            load_v4s32
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.1:
+    liveins: $x0
+
+    %0:_(p0) = COPY $x0
+    %1:_(<4 x s32>) = G_LOAD %0(p0) :: (load 16, align 4)
+    %2:_(s32), %3:_(s32), %4:_(s32), %5:_(s32) = G_UNMERGE_VALUES %1(<4 x s32>)
+    $w0 = COPY %5(s32)
+
+...
+
+# Make sure we are able to scalarize v2s64.
+# CHECK: Legalize Machine IR for: load_v2s64
+# CHECK-NEXT: %{{[0-9]+}}:_(<2 x s64>) = G_LOAD %{{[0-9]+}}:_(p0)
+# CHECK-NEXT: Reduce number of elements
+---
+name:            load_v2s64
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.1:
+    liveins: $x0
+
+    %0:_(p0) = COPY $x0
+    %1:_(<2 x s64>) = G_LOAD %0(p0) :: (load 16)
+    %2:_(s64), %3:_(s64) = G_UNMERGE_VALUES %1(<2 x s64>)
+    $x0 = COPY %3(s64)
+
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-load-v4s32.mir b/test/CodeGen/AArch64/GlobalISel/legalize-load-v4s32.mir
deleted file mode 100644
index 8493bd8292c..00000000000
--- a/test/CodeGen/AArch64/GlobalISel/legalize-load-v4s32.mir
+++ /dev/null
@@ -1,21 +0,0 @@
-# RUN: not llc -march=aarch64 -o - -run-pass=legalizer -debug-only=legalizer 2>&1 %s | FileCheck %s
-# REQUIRES: asserts
-
-# CHECK: Legalize Machine IR for: load_v4s32
-# CHECK-NEXT: %{{[0-9]+}}:_(<4 x s32>) = G_LOAD %{{[0-9]+}}:_(p0)
-# CHECK-NOT: Lower
-# CHECK: unable to legalize instruction
----
-name:            load_v4s32
-legalized:       false
-tracksRegLiveness: true
-body:             |
-  bb.1:
-    liveins: $x0
-
-    %0:_(p0) = COPY $x0
-    %1:_(<4 x s32>) = G_LOAD %0(p0) :: (load 16, align 4)
-    %2:_(s32), %3:_(s32), %4:_(s32), %5:_(s32) = G_UNMERGE_VALUES %1(<4 x s32>)
-    $w0 = COPY %5(s32)
-
-...
-- 
GitLab


From cf5c71234a4fdc146ffa0632cc09c452408c7a09 Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Thu, 1 Nov 2018 19:11:05 +0000
Subject: [PATCH 0877/1116] Remove unnecessary fallthrough annotation after
 unreachable

Clang's -Wimplicit-fallthrough implementation warns on this. I built
clang with GCC 7.3 in +asserts and -asserts mode, and GCC doesn't warn
on this in either configuration. I think it is unnecessary. I separated
it from the large mechanical patch (https://reviews.llvm.org/D53950) in
case I am wrong and it has to be reverted.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345876 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Utils/FunctionComparator.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/lib/Transforms/Utils/FunctionComparator.cpp b/lib/Transforms/Utils/FunctionComparator.cpp
index ef991d715fd..a717d9b7281 100644
--- a/lib/Transforms/Utils/FunctionComparator.cpp
+++ b/lib/Transforms/Utils/FunctionComparator.cpp
@@ -410,8 +410,6 @@ int FunctionComparator::cmpTypes(Type *TyL, Type *TyR) const {
   switch (TyL->getTypeID()) {
   default:
     llvm_unreachable("Unknown type!");
-    // Fall through in Release mode.
-    LLVM_FALLTHROUGH;
   case Type::IntegerTyID:
     return cmpNumbers(cast<IntegerType>(TyL)->getBitWidth(),
                       cast<IntegerType>(TyR)->getBitWidth());
-- 
GitLab


From f31487a7c3cc92d75a5bf45725370b6443c7b3ed Mon Sep 17 00:00:00 2001
From: Florian Hahn <florian.hahn@arm.com>
Date: Thu, 1 Nov 2018 19:25:00 +0000
Subject: [PATCH 0878/1116] [LoopInterchange] Remove support for inner-only
 reductions.

Inner-loop only reductions require additional checks to make sure they
form a load-phi-store cycle across inner and outer loop. Otherwise the
reduction value is not properly preserved. This patch disables
interchanging such loops for now, as it causes miscompiles in some
cases and it seems to apply only for a tiny amount of loops. Across the
test-suite, SPEC2000 and SPEC2006, 61 instead of 62 loops are
interchange with inner loop reduction support disabled. With
-loop-interchange-threshold=-1000, 3256 instead of 3267.

See the discussion and history of D53027 for an outline of how such legality
checks could look like.

Reviewers: efriedma, mcrosier, davide

Reviewed By: efriedma

Differential Revision: https://reviews.llvm.org/D53027


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345877 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/LoopInterchange.cpp     | 125 ++------
 .../LoopInterchange/inner-only-reductions.ll  | 124 ++++++++
 test/Transforms/LoopInterchange/lcssa.ll      |   1 -
 .../LoopInterchange/phi-ordering.ll           |  18 +-
 test/Transforms/LoopInterchange/reductions.ll | 272 ------------------
 5 files changed, 149 insertions(+), 391 deletions(-)
 create mode 100644 test/Transforms/LoopInterchange/inner-only-reductions.ll
 delete mode 100644 test/Transforms/LoopInterchange/reductions.ll

diff --git a/lib/Transforms/Scalar/LoopInterchange.cpp b/lib/Transforms/Scalar/LoopInterchange.cpp
index 21c8512b266..523bac79b69 100644
--- a/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -339,16 +339,10 @@ public:
 
   bool currentLimitations();
 
-  bool hasInnerLoopReduction() { return InnerLoopHasReduction; }
-
 private:
   bool tightlyNested(Loop *Outer, Loop *Inner);
-  bool containsUnsafeInstructionsInHeader(BasicBlock *BB);
-  bool areAllUsesReductions(Instruction *Ins, Loop *L);
-  bool containsUnsafeInstructionsInLatch(BasicBlock *BB);
-  bool findInductionAndReductions(Loop *L,
-                                  SmallVector<PHINode *, 8> &Inductions,
-                                  SmallVector<PHINode *, 8> &Reductions);
+  bool containsUnsafeInstructions(BasicBlock *BB);
+  bool findInductions(Loop *L, SmallVector<PHINode *, 8> &Inductions);
 
   Loop *OuterLoop;
   Loop *InnerLoop;
@@ -358,7 +352,6 @@ private:
   /// Interface to emit optimization remarks.
   OptimizationRemarkEmitter *ORE;
 
-  bool InnerLoopHasReduction = false;
 };
 
 /// LoopInterchangeProfitability checks if it is profitable to interchange the
@@ -391,11 +384,9 @@ class LoopInterchangeTransform {
 public:
   LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
                            LoopInfo *LI, DominatorTree *DT,
-                           BasicBlock *LoopNestExit,
-                           bool InnerLoopContainsReductions)
+                           BasicBlock *LoopNestExit)
       : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT),
-        LoopExit(LoopNestExit),
-        InnerLoopHasReduction(InnerLoopContainsReductions) {}
+        LoopExit(LoopNestExit) {}
 
   /// Interchange OuterLoop and InnerLoop.
   bool transform();
@@ -420,7 +411,6 @@ private:
   LoopInfo *LI;
   DominatorTree *DT;
   BasicBlock *LoopExit;
-  bool InnerLoopHasReduction;
 };
 
 // Main LoopInterchange Pass.
@@ -571,7 +561,7 @@ struct LoopInterchange : public LoopPass {
     });
 
     LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT,
-                                 LoopNestExit, LIL.hasInnerLoopReduction());
+                                 LoopNestExit);
     LIT.transform();
     LLVM_DEBUG(dbgs() << "Loops interchanged.\n");
     LoopsInterchanged++;
@@ -581,42 +571,12 @@ struct LoopInterchange : public LoopPass {
 
 } // end anonymous namespace
 
-bool LoopInterchangeLegality::areAllUsesReductions(Instruction *Ins, Loop *L) {
-  return llvm::none_of(Ins->users(), [=](User *U) -> bool {
-    auto *UserIns = dyn_cast<PHINode>(U);
-    RecurrenceDescriptor RD;
-    return !UserIns || !RecurrenceDescriptor::isReductionPHI(UserIns, L, RD);
+bool LoopInterchangeLegality::containsUnsafeInstructions(BasicBlock *BB) {
+  return any_of(*BB, [](const Instruction &I) {
+    return I.mayHaveSideEffects() || I.mayReadFromMemory();
   });
 }
 
-bool LoopInterchangeLegality::containsUnsafeInstructionsInHeader(
-    BasicBlock *BB) {
-  for (Instruction &I : *BB) {
-    // Load corresponding to reduction PHI's are safe while concluding if
-    // tightly nested.
-    if (LoadInst *L = dyn_cast<LoadInst>(&I)) {
-      if (!areAllUsesReductions(L, InnerLoop))
-        return true;
-    } else if (I.mayHaveSideEffects() || I.mayReadFromMemory())
-      return true;
-  }
-  return false;
-}
-
-bool LoopInterchangeLegality::containsUnsafeInstructionsInLatch(
-    BasicBlock *BB) {
-  for (Instruction &I : *BB) {
-    // Stores corresponding to reductions are safe while concluding if tightly
-    // nested.
-    if (StoreInst *L = dyn_cast<StoreInst>(&I)) {
-      if (!isa<PHINode>(L->getOperand(0)))
-        return true;
-    } else if (I.mayHaveSideEffects() || I.mayReadFromMemory())
-      return true;
-  }
-  return false;
-}
-
 bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
   BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
   BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
@@ -640,8 +600,8 @@ bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
   LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
   // We do not have any basic block in between now make sure the outer header
   // and outer loop latch doesn't contain any unsafe instructions.
-  if (containsUnsafeInstructionsInHeader(OuterLoopHeader) ||
-      containsUnsafeInstructionsInLatch(OuterLoopLatch))
+  if (containsUnsafeInstructions(OuterLoopHeader) ||
+      containsUnsafeInstructions(OuterLoopLatch))
     return false;
 
   LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n");
@@ -673,9 +633,8 @@ bool LoopInterchangeLegality::isLoopStructureUnderstood(
   return true;
 }
 
-bool LoopInterchangeLegality::findInductionAndReductions(
-    Loop *L, SmallVector<PHINode *, 8> &Inductions,
-    SmallVector<PHINode *, 8> &Reductions) {
+bool LoopInterchangeLegality::findInductions(
+    Loop *L, SmallVector<PHINode *, 8> &Inductions) {
   if (!L->getLoopLatch() || !L->getLoopPredecessor())
     return false;
   for (PHINode &PHI : L->getHeader()->phis()) {
@@ -683,11 +642,8 @@ bool LoopInterchangeLegality::findInductionAndReductions(
     InductionDescriptor ID;
     if (InductionDescriptor::isInductionPHI(&PHI, L, SE, ID))
       Inductions.push_back(&PHI);
-    else if (RecurrenceDescriptor::isReductionPHI(&PHI, L, RD))
-      Reductions.push_back(&PHI);
     else {
-      LLVM_DEBUG(
-          dbgs() << "Failed to recognize PHI as an induction or reduction.\n");
+      LLVM_DEBUG(dbgs() << "Failed to recognize PHI as an induction.\n");
       return false;
     }
   }
@@ -737,8 +693,7 @@ bool LoopInterchangeLegality::currentLimitations() {
 
   PHINode *InnerInductionVar;
   SmallVector<PHINode *, 8> Inductions;
-  SmallVector<PHINode *, 8> Reductions;
-  if (!findInductionAndReductions(InnerLoop, Inductions, Reductions)) {
+  if (!findInductions(InnerLoop, Inductions)) {
     LLVM_DEBUG(
         dbgs() << "Only inner loops with induction or reduction PHI nodes "
                << "are supported currently.\n");
@@ -766,12 +721,9 @@ bool LoopInterchangeLegality::currentLimitations() {
     });
     return true;
   }
-  if (Reductions.size() > 0)
-    InnerLoopHasReduction = true;
 
   InnerInductionVar = Inductions.pop_back_val();
-  Reductions.clear();
-  if (!findInductionAndReductions(OuterLoop, Inductions, Reductions)) {
+  if (!findInductions(OuterLoop, Inductions)) {
     LLVM_DEBUG(
         dbgs() << "Only outer loops with induction or reduction PHI nodes "
                << "are supported currently.\n");
@@ -785,20 +737,6 @@ bool LoopInterchangeLegality::currentLimitations() {
     return true;
   }
 
-  // Outer loop cannot have reduction because then loops will not be tightly
-  // nested.
-  if (!Reductions.empty()) {
-    LLVM_DEBUG(dbgs() << "Outer loops with reductions are not supported "
-                      << "currently.\n");
-    ORE->emit([&]() {
-      return OptimizationRemarkMissed(DEBUG_TYPE, "ReductionsOuter",
-                                      OuterLoop->getStartLoc(),
-                                      OuterLoop->getHeader())
-             << "Outer loops with reductions cannot be interchangeed "
-                "currently.";
-    });
-    return true;
-  }
   // TODO: Currently we handle only loops with 1 induction variable.
   if (Inductions.size() != 1) {
     LLVM_DEBUG(dbgs() << "Loops with more than 1 induction variables are not "
@@ -1449,34 +1387,11 @@ bool LoopInterchangeTransform::adjustLoopBranches() {
   // replaced by Inners'.
   updateIncomingBlock(OuterLoopLatchSuccessor, OuterLoopLatch, InnerLoopLatch);
 
-  // Now update the reduction PHIs in the inner and outer loop headers.
-  SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs;
-  for (PHINode &PHI : drop_begin(InnerLoopHeader->phis(), 1))
-    InnerLoopPHIs.push_back(cast<PHINode>(&PHI));
-  for (PHINode &PHI : drop_begin(OuterLoopHeader->phis(), 1))
-    OuterLoopPHIs.push_back(cast<PHINode>(&PHI));
-
-  for (PHINode *PHI : OuterLoopPHIs)
-    PHI->moveBefore(InnerLoopHeader->getFirstNonPHI());
-
-  // Move the PHI nodes from the inner loop header to the outer loop header.
-  // We have to deal with one kind of PHI nodes:
-  //  1) PHI nodes that are part of inner loop-only reductions.
-  // We only have to move the PHI node and update the incoming blocks.
-  for (PHINode *PHI : InnerLoopPHIs) {
-    PHI->moveBefore(OuterLoopHeader->getFirstNonPHI());
-    for (BasicBlock *InBB : PHI->blocks()) {
-      if (InnerLoop->contains(InBB))
-        continue;
-
-      assert(!isa<PHINode>(PHI->getIncomingValueForBlock(InBB)) &&
-             "Unexpected incoming PHI node, reductions in outer loop are not "
-             "supported yet");
-      PHI->replaceAllUsesWith(PHI->getIncomingValueForBlock(InBB));
-      PHI->eraseFromParent();
-      break;
-    }
-  }
+  // Make sure we have no other PHIs.
+  auto InnerPhis = drop_begin(InnerLoopHeader->phis(), 1);
+  auto OuterPhis = drop_begin(OuterLoopHeader->phis(), 1);
+  assert(begin(InnerPhis) == end(InnerPhis) && "Unexpected PHIs in inner loop");
+  assert(begin(OuterPhis) == end(OuterPhis) && "Unexpected PHis in outer loop");
 
   // Update the incoming blocks for moved PHI nodes.
   updateIncomingBlock(OuterLoopHeader, InnerLoopPreHeader, OuterLoopPreHeader);
diff --git a/test/Transforms/LoopInterchange/inner-only-reductions.ll b/test/Transforms/LoopInterchange/inner-only-reductions.ll
new file mode 100644
index 00000000000..74543fb1647
--- /dev/null
+++ b/test/Transforms/LoopInterchange/inner-only-reductions.ll
@@ -0,0 +1,124 @@
+; RUN: opt < %s -basicaa -loop-interchange -pass-remarks-missed='loop-interchange' -pass-remarks-output=%t -S \
+; RUN:     -verify-dom-info -verify-loop-info 2>&1 | FileCheck -check-prefix=IR %s
+; RUN: FileCheck --input-file=%t %s
+
+; Inner loop only reductions are not supported currently. See discussion at
+; D53027 for more information on the required checks.
+
+@A = common global [500 x [500 x i32]] zeroinitializer
+@X = common global i32 0
+@B = common global [500 x [500 x i32]] zeroinitializer
+@Y = common global i32 0
+
+;; global X
+
+;;  for( int i=1;i<N;i++)
+;;    for( int j=1;j<N;j++)
+;;      X+=A[j][i];
+
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            UnsupportedPHI
+; CHECK-NEXT: Function:        reduction_01
+
+; IR-LABEL: @reduction_01(
+; IR-NOT: split
+
+define void @reduction_01(i32 %N) {
+entry:
+  %cmp16 = icmp sgt i32 %N, 1
+  br i1 %cmp16, label %for.body3.lr.ph, label %for.end8
+
+for.body3.lr.ph:                                  ; preds = %for.cond1.for.inc6_crit_edge, %entry
+  %indvars.iv18 = phi i64 [ %indvars.iv.next19, %for.cond1.for.inc6_crit_edge ], [ 1, %entry ]
+  %X.promoted = load i32, i32* @X
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.body3.lr.ph
+  %indvars.iv = phi i64 [ 1, %for.body3.lr.ph ], [ %indvars.iv.next, %for.body3 ]
+  %add15 = phi i32 [ %X.promoted, %for.body3.lr.ph ], [ %add, %for.body3 ]
+  %arrayidx5 = getelementptr inbounds [500 x [500 x i32]], [500 x [500 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv18
+  %0 = load i32, i32* %arrayidx5
+  %add = add nsw i32 %add15, %0
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %N
+  br i1 %exitcond, label %for.cond1.for.inc6_crit_edge, label %for.body3
+
+for.cond1.for.inc6_crit_edge:                     ; preds = %for.body3
+  %add.lcssa = phi i32 [ %add, %for.body3 ]
+  store i32 %add.lcssa, i32* @X
+  %indvars.iv.next19 = add nuw nsw i64 %indvars.iv18, 1
+  %lftr.wideiv20 = trunc i64 %indvars.iv.next19 to i32
+  %exitcond21 = icmp eq i32 %lftr.wideiv20, %N
+  br i1 %exitcond21, label %for.end8, label %for.body3.lr.ph
+
+for.end8:                                         ; preds = %for.cond1.for.inc6_crit_edge, %entry
+  ret void
+}
+
+;; Not tightly nested. Do not interchange.
+;;  for( int i=1;i<N;i++)
+;;    for( int j=1;j<N;j++) {
+;;      for( int k=1;k<N;k++) {
+;;        X+=A[k][j];
+;;      }
+;;      Y+=B[j][i];
+;;    }
+
+;; Not tightly nested. Do not interchange.
+;; Not interchanged hence the phi's in the inner loop will not be split.
+
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            UnsupportedPHIOuter
+; CHECK-NEXT: Function:        reduction_03
+
+; IR-LABEL: @reduction_03(
+; IR-NOT: split
+
+define void @reduction_03(i32 %N) {
+entry:
+  %cmp35 = icmp sgt i32 %N, 1
+  br i1 %cmp35, label %for.cond4.preheader.lr.ph, label %for.end19
+
+for.cond4.preheader.lr.ph:                        ; preds = %for.cond1.for.inc17_crit_edge, %entry
+  %indvars.iv41 = phi i64 [ %indvars.iv.next42, %for.cond1.for.inc17_crit_edge ], [ 1, %entry ]
+  %Y.promoted = load i32, i32* @Y
+  br label %for.body6.lr.ph
+
+for.body6.lr.ph:                                  ; preds = %for.cond4.for.end_crit_edge, %for.cond4.preheader.lr.ph
+  %indvars.iv37 = phi i64 [ 1, %for.cond4.preheader.lr.ph ], [ %indvars.iv.next38, %for.cond4.for.end_crit_edge ]
+  %add1334 = phi i32 [ %Y.promoted, %for.cond4.preheader.lr.ph ], [ %add13, %for.cond4.for.end_crit_edge ]
+  %X.promoted = load i32, i32* @X
+  br label %for.body6
+
+for.body6:                                        ; preds = %for.body6, %for.body6.lr.ph
+  %indvars.iv = phi i64 [ 1, %for.body6.lr.ph ], [ %indvars.iv.next, %for.body6 ]
+  %arrayidx8 = getelementptr inbounds [500 x [500 x i32]], [500 x [500 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv37
+  %0 = load i32, i32* %arrayidx8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %N
+  br i1 %exitcond, label %for.cond4.for.end_crit_edge, label %for.body6
+
+for.cond4.for.end_crit_edge:                      ; preds = %for.body6
+  %arrayidx12 = getelementptr inbounds [500 x [500 x i32]], [500 x [500 x i32]]* @B, i64 0, i64 %indvars.iv37, i64 %indvars.iv41
+  %1 = load i32, i32* %arrayidx12
+  %add13 = add nsw i32 %add1334, %1
+  %indvars.iv.next38 = add nuw nsw i64 %indvars.iv37, 1
+  %lftr.wideiv39 = trunc i64 %indvars.iv.next38 to i32
+  %exitcond40 = icmp eq i32 %lftr.wideiv39, %N
+  br i1 %exitcond40, label %for.cond1.for.inc17_crit_edge, label %for.body6.lr.ph
+
+for.cond1.for.inc17_crit_edge:                    ; preds = %for.cond4.for.end_crit_edge
+  %add13.lcssa = phi i32 [ %add13, %for.cond4.for.end_crit_edge ]
+  store i32 %add13.lcssa, i32* @Y
+  %indvars.iv.next42 = add nuw nsw i64 %indvars.iv41, 1
+  %lftr.wideiv43 = trunc i64 %indvars.iv.next42 to i32
+  %exitcond44 = icmp eq i32 %lftr.wideiv43, %N
+  br i1 %exitcond44, label %for.end19, label %for.cond4.preheader.lr.ph
+
+for.end19:                                        ; preds = %for.cond1.for.inc17_crit_edge, %entry
+  ret void
+}
diff --git a/test/Transforms/LoopInterchange/lcssa.ll b/test/Transforms/LoopInterchange/lcssa.ll
index 8886cf4925f..2bd9ee69c16 100644
--- a/test/Transforms/LoopInterchange/lcssa.ll
+++ b/test/Transforms/LoopInterchange/lcssa.ll
@@ -246,7 +246,6 @@ for.body3:                                        ; preds = %for.body3, %outer.h
 
 outer.inc:                                        ; preds = %for.body3, %outer.header
   %sv = phi i64 [ 0, %outer.header ], [ 1, %for.body3 ]
-  store i64 %sv, i64* %ptr
   %iv.outer.next = add nsw i64 %iv.outer, 1
   %cmp = icmp eq i64 %iv.outer.next, 100
   br i1 %cmp, label %outer.header, label %for.exit
diff --git a/test/Transforms/LoopInterchange/phi-ordering.ll b/test/Transforms/LoopInterchange/phi-ordering.ll
index c7416973758..2854fe19f7a 100644
--- a/test/Transforms/LoopInterchange/phi-ordering.ll
+++ b/test/Transforms/LoopInterchange/phi-ordering.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -loop-interchange -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S 2>&1 | FileCheck %s
+; RUN: opt < %s -loop-interchange -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -loop-interchange-threshold=-1000 -S 2>&1 | FileCheck %s
 ;; Checks the order of the inner phi nodes does not cause havoc.
 ;; The inner loop has a reduction into c. The IV is not the first phi.
 
@@ -23,8 +23,6 @@ define void @test(i32 %T, [90 x i32]* noalias nocapture %C, i16* noalias nocaptu
 ; CHECK-NEXT:    br label [[FOR2_HEADER:%.*]]
 ; CHECK:       for2.header:
 ; CHECK-NEXT:    [[J:%.*]] = phi i32 [ [[INC17:%.*]], [[FOR2_INC16:%.*]] ], [ 0, [[FOR2_HEADER_PREHEADER]] ]
-; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds [90 x i32], [90 x i32]* [[C:%.*]], i32 [[I]], i32 [[J]]
-; CHECK-NEXT:    [[ARRAYIDX14_PROMOTED:%.*]] = load i32, i32* [[ARRAYIDX14]], align 4
 ; CHECK-NEXT:    br label [[FOR3_SPLIT1:%.*]]
 ; CHECK:       for3.preheader:
 ; CHECK-NEXT:    br label [[FOR3:%.*]]
@@ -35,15 +33,14 @@ define void @test(i32 %T, [90 x i32]* noalias nocapture %C, i16* noalias nocaptu
 ; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[K]], [[MUL]]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[A:%.*]], i32 [[ADD]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
-; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP0]] to i32
-; CHECK-NEXT:    [[ADD15:%.*]] = add nsw i32 [[CONV]], [[ARRAYIDX14_PROMOTED]]
+; CHECK-NEXT:    [[ADD15:%.*]] = add nsw i16 [[TMP0]], 1
+; CHECK-NEXT:    store i16 [[ADD15]], i16* [[ARRAYIDX]]
 ; CHECK-NEXT:    br label [[FOR2_INC16]]
 ; CHECK:       for3.split:
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[K]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 90
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR1_LOOPEXIT:%.*]], label [[FOR3]]
 ; CHECK:       for2.inc16:
-; CHECK-NEXT:    store i32 [[ADD15]], i32* [[ARRAYIDX14]], align 4
 ; CHECK-NEXT:    [[INC17]] = add nuw nsw i32 [[J]], 1
 ; CHECK-NEXT:    [[EXITCOND47:%.*]] = icmp eq i32 [[INC17]], 90
 ; CHECK-NEXT:    br i1 [[EXITCOND47]], label [[FOR1_INC19]], label [[FOR2_HEADER]]
@@ -66,25 +63,20 @@ for1.header:                                  ; preds = %entry
 
 for2.header:                                  ; preds = %for2.inc16, %for1.header
   %j = phi i32 [ 0, %for1.header ], [ %inc17, %for2.inc16 ]
-  %arrayidx14 = getelementptr inbounds [90 x i32], [90 x i32]* %C, i32 %i, i32 %j
-  %arrayidx14.promoted = load i32, i32* %arrayidx14, align 4
   br label %for3
 
 for3:                                        ; preds = %for3, %for2.header
-  %add1541 = phi i32 [ %arrayidx14.promoted, %for2.header ], [ %add15, %for3 ]
   %k = phi i32 [ 1, %for2.header ], [ %inc, %for3 ]
   %add = add nsw i32 %k, %mul
   %arrayidx = getelementptr inbounds i16, i16* %A, i32 %add
   %0 = load i16, i16* %arrayidx, align 2
-  %conv = sext i16 %0 to i32
-  %add15 = add nsw i32 %conv, %add1541
+  %add15 = add nsw i16 %0, 1
+  store i16 %add15, i16* %arrayidx
   %inc = add nuw nsw i32 %k, 1
   %exitcond = icmp eq i32 %inc, 90
   br i1 %exitcond, label %for2.inc16, label %for3
 
 for2.inc16:                                        ; preds = %for.body6
-  %add15.lcssa = phi i32 [ %add15, %for3 ]
-  store i32 %add15.lcssa, i32* %arrayidx14, align 4
   %inc17 = add nuw nsw i32 %j, 1
   %exitcond47 = icmp eq i32 %inc17, 90
   br i1 %exitcond47, label %for1.inc19, label %for2.header
diff --git a/test/Transforms/LoopInterchange/reductions.ll b/test/Transforms/LoopInterchange/reductions.ll
deleted file mode 100644
index 28a2d8d6a66..00000000000
--- a/test/Transforms/LoopInterchange/reductions.ll
+++ /dev/null
@@ -1,272 +0,0 @@
-; REQUIRES: asserts
-; RUN: opt < %s -basicaa -loop-interchange -verify-dom-info -verify-loop-info -verify-loop-lcssa -S -debug 2>&1 | FileCheck %s
-
-@A = common global [500 x [500 x i32]] zeroinitializer
-@X = common global i32 0
-@B = common global [500 x [500 x i32]] zeroinitializer
-@Y = common global i32 0
-
-;;  for( int i=1;i<N;i++)
-;;    for( int j=1;j<N;j++)
-;;      X+=A[j][i];
-
-;; Loop is interchanged check that the phi nodes are split and the promoted value is used instead of the reduction phi.
-; CHECK: Loops interchanged.
-
-define void @reduction_01(i32 %N) {
-entry:
-  %cmp16 = icmp sgt i32 %N, 1
-  br i1 %cmp16, label %for.body3.lr.ph, label %for.end8
-
-for.body3.lr.ph:                                  ; preds = %for.cond1.for.inc6_crit_edge, %entry
-  %indvars.iv18 = phi i64 [ %indvars.iv.next19, %for.cond1.for.inc6_crit_edge ], [ 1, %entry ]
-  %X.promoted = load i32, i32* @X
-  br label %for.body3
-
-for.body3:                                        ; preds = %for.body3, %for.body3.lr.ph
-  %indvars.iv = phi i64 [ 1, %for.body3.lr.ph ], [ %indvars.iv.next, %for.body3 ]
-  %add15 = phi i32 [ %X.promoted, %for.body3.lr.ph ], [ %add, %for.body3 ]
-  %arrayidx5 = getelementptr inbounds [500 x [500 x i32]], [500 x [500 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv18
-  %0 = load i32, i32* %arrayidx5
-  %add = add nsw i32 %add15, %0
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %N
-  br i1 %exitcond, label %for.cond1.for.inc6_crit_edge, label %for.body3
-
-for.cond1.for.inc6_crit_edge:                     ; preds = %for.body3
-  %add.lcssa = phi i32 [ %add, %for.body3 ]
-  store i32 %add.lcssa, i32* @X
-  %indvars.iv.next19 = add nuw nsw i64 %indvars.iv18, 1
-  %lftr.wideiv20 = trunc i64 %indvars.iv.next19 to i32
-  %exitcond21 = icmp eq i32 %lftr.wideiv20, %N
-  br i1 %exitcond21, label %for.end8, label %for.body3.lr.ph
-
-for.end8:                                         ; preds = %for.cond1.for.inc6_crit_edge, %entry
-  ret void
-}
-
-;; Test for more than 1 reductions inside a loop.
-;;  for( int i=1;i<N;i++)
-;;    for( int j=1;j<N;j++)
-;;      for( int k=1;k<N;k++) {
-;;        X+=A[k][j];
-;;        Y+=B[k][i];
-;;      }
-
-;; Loop is interchanged check that the phi nodes are split and the promoted value is used instead of the reduction phi.
-; CHECK: Loops interchanged.
-
-define void @reduction_02(i32 %N) {
-entry:
-  %cmp34 = icmp sgt i32 %N, 1
-  br i1 %cmp34, label %for.cond4.preheader.preheader, label %for.end19
-
-for.cond4.preheader.preheader:                    ; preds = %for.inc17, %entry
-  %indvars.iv40 = phi i64 [ %indvars.iv.next41, %for.inc17 ], [ 1, %entry ]
-  br label %for.body6.lr.ph
-
-for.body6.lr.ph:                                  ; preds = %for.cond4.for.inc14_crit_edge, %for.cond4.preheader.preheader
-  %indvars.iv36 = phi i64 [ %indvars.iv.next37, %for.cond4.for.inc14_crit_edge ], [ 1, %for.cond4.preheader.preheader ]
-  %X.promoted = load i32, i32* @X
-  %Y.promoted = load i32, i32* @Y
-  br label %for.body6
-
-for.body6:                                        ; preds = %for.body6, %for.body6.lr.ph
-  %indvars.iv = phi i64 [ 1, %for.body6.lr.ph ], [ %indvars.iv.next, %for.body6 ]
-  %add1331 = phi i32 [ %Y.promoted, %for.body6.lr.ph ], [ %add13, %for.body6 ]
-  %add30 = phi i32 [ %X.promoted, %for.body6.lr.ph ], [ %add, %for.body6 ]
-  %arrayidx8 = getelementptr inbounds [500 x [500 x i32]], [500 x [500 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv36
-  %0 = load i32, i32* %arrayidx8
-  %add = add nsw i32 %add30, %0
-  %arrayidx12 = getelementptr inbounds [500 x [500 x i32]], [500 x [500 x i32]]* @B, i64 0, i64 %indvars.iv, i64 %indvars.iv40
-  %1 = load i32, i32* %arrayidx12
-  %add13 = add nsw i32 %add1331, %1
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %N
-  br i1 %exitcond, label %for.cond4.for.inc14_crit_edge, label %for.body6
-
-for.cond4.for.inc14_crit_edge:                    ; preds = %for.body6
-  %add.lcssa = phi i32 [ %add, %for.body6 ]
-  %add13.lcssa = phi i32 [ %add13, %for.body6 ]
-  store i32 %add.lcssa, i32* @X
-  store i32 %add13.lcssa, i32* @Y
-  %indvars.iv.next37 = add nuw nsw i64 %indvars.iv36, 1
-  %lftr.wideiv38 = trunc i64 %indvars.iv.next37 to i32
-  %exitcond39 = icmp eq i32 %lftr.wideiv38, %N
-  br i1 %exitcond39, label %for.inc17, label %for.body6.lr.ph
-
-for.inc17:                                        ; preds = %for.cond4.for.inc14_crit_edge
-  %add.lcssa.lcssa = phi i32 [ %add.lcssa, %for.cond4.for.inc14_crit_edge ]
-  %indvars.iv.next41 = add nuw nsw i64 %indvars.iv40, 1
-  %lftr.wideiv42 = trunc i64 %indvars.iv.next41 to i32
-  %exitcond43 = icmp eq i32 %lftr.wideiv42, %N
-  br i1 %exitcond43, label %for.end19, label %for.cond4.preheader.preheader
-
-for.end19:                                        ; preds = %for.inc17, %entry
-  %res1 = phi i32 [ 0, %entry ], [ %add.lcssa.lcssa, %for.inc17 ]
-  store i32 %res1, i32* @X
-  ret void
-}
-
-;; Not tightly nested. Do not interchange.
-;;  for( int i=1;i<N;i++)
-;;    for( int j=1;j<N;j++) {
-;;      for( int k=1;k<N;k++) {
-;;        X+=A[k][j];
-;;      }
-;;      Y+=B[j][i];
-;;    }
-
-;; Not tightly nested. Do not interchange.
-;; Not interchanged hence the phi's in the inner loop will not be split.
-; CHECK: Outer loops with reductions are not supported currently.
-
-define void @reduction_03(i32 %N) {
-entry:
-  %cmp35 = icmp sgt i32 %N, 1
-  br i1 %cmp35, label %for.cond4.preheader.lr.ph, label %for.end19
-
-for.cond4.preheader.lr.ph:                        ; preds = %for.cond1.for.inc17_crit_edge, %entry
-  %indvars.iv41 = phi i64 [ %indvars.iv.next42, %for.cond1.for.inc17_crit_edge ], [ 1, %entry ]
-  %Y.promoted = load i32, i32* @Y
-  br label %for.body6.lr.ph
-
-for.body6.lr.ph:                                  ; preds = %for.cond4.for.end_crit_edge, %for.cond4.preheader.lr.ph
-  %indvars.iv37 = phi i64 [ 1, %for.cond4.preheader.lr.ph ], [ %indvars.iv.next38, %for.cond4.for.end_crit_edge ]
-  %add1334 = phi i32 [ %Y.promoted, %for.cond4.preheader.lr.ph ], [ %add13, %for.cond4.for.end_crit_edge ]
-  %X.promoted = load i32, i32* @X
-  br label %for.body6
-
-for.body6:                                        ; preds = %for.body6, %for.body6.lr.ph
-  %indvars.iv = phi i64 [ 1, %for.body6.lr.ph ], [ %indvars.iv.next, %for.body6 ]
-  %add31 = phi i32 [ %X.promoted, %for.body6.lr.ph ], [ %add, %for.body6 ]
-  %arrayidx8 = getelementptr inbounds [500 x [500 x i32]], [500 x [500 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv37
-  %0 = load i32, i32* %arrayidx8
-  %add = add nsw i32 %add31, %0
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %N
-  br i1 %exitcond, label %for.cond4.for.end_crit_edge, label %for.body6
-
-for.cond4.for.end_crit_edge:                      ; preds = %for.body6
-  %add.lcssa = phi i32 [ %add, %for.body6 ]
-  store i32 %add.lcssa, i32* @X
-  %arrayidx12 = getelementptr inbounds [500 x [500 x i32]], [500 x [500 x i32]]* @B, i64 0, i64 %indvars.iv37, i64 %indvars.iv41
-  %1 = load i32, i32* %arrayidx12
-  %add13 = add nsw i32 %add1334, %1
-  %indvars.iv.next38 = add nuw nsw i64 %indvars.iv37, 1
-  %lftr.wideiv39 = trunc i64 %indvars.iv.next38 to i32
-  %exitcond40 = icmp eq i32 %lftr.wideiv39, %N
-  br i1 %exitcond40, label %for.cond1.for.inc17_crit_edge, label %for.body6.lr.ph
-
-for.cond1.for.inc17_crit_edge:                    ; preds = %for.cond4.for.end_crit_edge
-  %add13.lcssa = phi i32 [ %add13, %for.cond4.for.end_crit_edge ]
-  store i32 %add13.lcssa, i32* @Y
-  %indvars.iv.next42 = add nuw nsw i64 %indvars.iv41, 1
-  %lftr.wideiv43 = trunc i64 %indvars.iv.next42 to i32
-  %exitcond44 = icmp eq i32 %lftr.wideiv43, %N
-  br i1 %exitcond44, label %for.end19, label %for.cond4.preheader.lr.ph
-
-for.end19:                                        ; preds = %for.cond1.for.inc17_crit_edge, %entry
-  ret void
-}
-
-;; Multiple use of reduction not safe. Do not interchange.
-;;  for( int i=1;i<N;i++)
-;;    for( int j=1;j<N;j++)
-;;      for( int k=1;k<N;k++) {
-;;        X+=A[k][j];
-;;        Y+=X;
-;;      }
-
-;; Not interchanged hence the phi's in the inner loop will not be split.
-; CHECK: Only inner loops with induction or reduction PHI nodes are supported currently.
-
-define void @reduction_04(i32 %N) {
-entry:
-  %cmp28 = icmp sgt i32 %N, 1
-  br i1 %cmp28, label %for.cond4.preheader.preheader, label %for.end15
-
-for.cond4.preheader.preheader:                    ; preds = %for.inc13, %entry
-  %i.029 = phi i32 [ %inc14, %for.inc13 ], [ 1, %entry ]
-  br label %for.body6.lr.ph
-
-for.body6.lr.ph:                                  ; preds = %for.cond4.for.inc10_crit_edge, %for.cond4.preheader.preheader
-  %indvars.iv30 = phi i64 [ %indvars.iv.next31, %for.cond4.for.inc10_crit_edge ], [ 1, %for.cond4.preheader.preheader ]
-  %X.promoted = load i32, i32* @X
-  %Y.promoted = load i32, i32* @Y
-  br label %for.body6
-
-for.body6:                                        ; preds = %for.body6, %for.body6.lr.ph
-  %indvars.iv = phi i64 [ 1, %for.body6.lr.ph ], [ %indvars.iv.next, %for.body6 ]
-  %add925 = phi i32 [ %Y.promoted, %for.body6.lr.ph ], [ %add9, %for.body6 ]
-  %add24 = phi i32 [ %X.promoted, %for.body6.lr.ph ], [ %add, %for.body6 ]
-  %arrayidx8 = getelementptr inbounds [500 x [500 x i32]], [500 x [500 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv30
-  %0 = load i32, i32* %arrayidx8
-  %add = add nsw i32 %add24, %0
-  %add9 = add nsw i32 %add925, %add
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %N
-  br i1 %exitcond, label %for.cond4.for.inc10_crit_edge, label %for.body6
-
-for.cond4.for.inc10_crit_edge:                    ; preds = %for.body6
-  %add.lcssa = phi i32 [ %add, %for.body6 ]
-  %add9.lcssa = phi i32 [ %add9, %for.body6 ]
-  store i32 %add.lcssa, i32* @X
-  store i32 %add9.lcssa, i32* @Y
-  %indvars.iv.next31 = add nuw nsw i64 %indvars.iv30, 1
-  %lftr.wideiv32 = trunc i64 %indvars.iv.next31 to i32
-  %exitcond33 = icmp eq i32 %lftr.wideiv32, %N
-  br i1 %exitcond33, label %for.inc13, label %for.body6.lr.ph
-
-for.inc13:                                        ; preds = %for.cond4.for.inc10_crit_edge
-  %inc14 = add nuw nsw i32 %i.029, 1
-  %exitcond34 = icmp eq i32 %inc14, %N
-  br i1 %exitcond34, label %for.end15, label %for.cond4.preheader.preheader
-
-for.end15:                                        ; preds = %for.inc13, %entry
-  ret void
-}
-
-;;  for( int i=1;i<N;i++)
-;;    for( int j=1;j<N;j++)
-;;      X+=A[j][i];
-;;  Y = X
-; CHECK: Loops interchanged.
-define void @reduction_05(i32 %N) {
-entry:
-  %cmp16 = icmp sgt i32 %N, 1
-  br i1 %cmp16, label %for.body7.lr.ph, label %for.end8
-
-for.body7.lr.ph:                                  ; preds = %for.cond1.for.inc6_crit_edge, %entry
-  %indvars.iv18 = phi i64 [ %indvars.iv.next19, %for.cond1.for.inc6_crit_edge ], [ 1, %entry ]
-  %X.promoted = load i32, i32* @X
-  br label %for.body7
-
-for.body7:                                        ; preds = %for.body7, %for.body7.lr.ph
-  %indvars.iv = phi i64 [ 1, %for.body7.lr.ph ], [ %indvars.iv.next, %for.body7 ]
-  %add15 = phi i32 [ %X.promoted, %for.body7.lr.ph ], [ %add, %for.body7 ]
-  %arrayidx5 = getelementptr inbounds [500 x [500 x i32]], [500 x [500 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv18
-  %0 = load i32, i32* %arrayidx5
-  %add = add nsw i32 %add15, %0
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %N
-  br i1 %exitcond, label %for.cond1.for.inc6_crit_edge, label %for.body7
-
-for.cond1.for.inc6_crit_edge:                     ; preds = %for.body7
-  %add.lcssa = phi i32 [ %add, %for.body7 ]
-  store i32 %add.lcssa, i32* @X
-  %indvars.iv.next19 = add nuw nsw i64 %indvars.iv18, 1
-  %lftr.wideiv20 = trunc i64 %indvars.iv.next19 to i32
-  %exitcond21 = icmp eq i32 %lftr.wideiv20, %N
-  br i1 %exitcond21, label %for.end8, label %for.body7.lr.ph
-
-for.end8:                                         ; preds = %for.cond1.for.inc6_crit_edge, %entry
-  %add.res = phi i32 [ %add.lcssa, %for.cond1.for.inc6_crit_edge ], [ 0, %entry ]
-  store i32 %add.res, i32* @Y
-  ret void
-}
-- 
GitLab


From a5b7f8b4abdc425dfd2ef141ebc13bb8c3567ffa Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Thu, 1 Nov 2018 19:32:04 +0000
Subject: [PATCH 0879/1116] Annotate possibly unintended fallthroughs in
 Hexagon MC code, NFC

Clang's -Wimplicit-fallthrough check fires on these switch cases. GCC
does not warn when a case body that ends in a switch falls through to a
case label of an outer switch.

It's not clear if these fall throughs are truly intended.  The Hexagon
tests pass regardless of whether these case blocks fall through or
break.

For now, I have applied the intended fallthrough annotation macro with a
FIXME comment to unblock enabling the warning. I will send a follow-up
patch that converts them to breaks to the Hexagon maintainers.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345878 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
index c707dcb0316..bdb15584ffc 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
@@ -127,6 +127,7 @@ unsigned HexagonMCInstrInfo::iClassOfDuplexPair(unsigned Ga, unsigned Gb) {
     case HexagonII::HSIG_A:
       return 0x4;
     }
+    LLVM_FALLTHROUGH; // FIXME: Intentional?
   case HexagonII::HSIG_L2:
     switch (Gb) {
     default:
@@ -138,6 +139,7 @@ unsigned HexagonMCInstrInfo::iClassOfDuplexPair(unsigned Ga, unsigned Gb) {
     case HexagonII::HSIG_A:
       return 0x5;
     }
+    LLVM_FALLTHROUGH; // FIXME: Intentional?
   case HexagonII::HSIG_S1:
     switch (Gb) {
     default:
@@ -151,6 +153,7 @@ unsigned HexagonMCInstrInfo::iClassOfDuplexPair(unsigned Ga, unsigned Gb) {
     case HexagonII::HSIG_A:
       return 0x6;
     }
+    LLVM_FALLTHROUGH; // FIXME: Intentional?
   case HexagonII::HSIG_S2:
     switch (Gb) {
     default:
@@ -166,6 +169,7 @@ unsigned HexagonMCInstrInfo::iClassOfDuplexPair(unsigned Ga, unsigned Gb) {
     case HexagonII::HSIG_A:
       return 0x7;
     }
+    LLVM_FALLTHROUGH; // FIXME: Intentional?
   case HexagonII::HSIG_A:
     switch (Gb) {
     default:
@@ -173,11 +177,13 @@ unsigned HexagonMCInstrInfo::iClassOfDuplexPair(unsigned Ga, unsigned Gb) {
     case HexagonII::HSIG_A:
       return 0x3;
     }
+    LLVM_FALLTHROUGH; // FIXME: Intentional?
   case HexagonII::HSIG_Compound:
     switch (Gb) {
     case HexagonII::HSIG_Compound:
       return 0xFFFFFFFF;
     }
+    break;
   }
   return 0xFFFFFFFF;
 }
-- 
GitLab


From a754e02f2afe50865de5b87a59c37463993ca606 Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Thu, 1 Nov 2018 19:36:29 +0000
Subject: [PATCH 0880/1116] [codeview] Add breaks to fix -Wimplicit-fallthrough

This is a minor bug fix. Previously, if you tried to encode the RSP
register on the x86 platform, that might have succeeded and been encoded
incorrectly. However, no existing producer or consumer passes the x86_64
registers when targeting x86_32.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345879 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/DebugInfo/CodeView/SymbolRecordMapping.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/DebugInfo/CodeView/SymbolRecordMapping.cpp b/lib/DebugInfo/CodeView/SymbolRecordMapping.cpp
index e73c69fff44..2af8205cebc 100644
--- a/lib/DebugInfo/CodeView/SymbolRecordMapping.cpp
+++ b/lib/DebugInfo/CodeView/SymbolRecordMapping.cpp
@@ -529,6 +529,7 @@ EncodedFramePtrReg codeview::encodeFramePtrReg(RegisterId Reg, CPUType CPU) {
     default:
       break;
     }
+    break;
   case CPUType::X64:
     switch (Reg) {
     case RegisterId::RSP:
@@ -540,6 +541,7 @@ EncodedFramePtrReg codeview::encodeFramePtrReg(RegisterId Reg, CPUType CPU) {
     default:
       break;
     }
+    break;
   }
   return EncodedFramePtrReg::None;
 }
-- 
GitLab


From 63131d5e3381ff415291c30204ae55242c7b563f Mon Sep 17 00:00:00 2001
From: Sam Clegg <sbc@chromium.org>
Date: Thu, 1 Nov 2018 19:38:44 +0000
Subject: [PATCH 0881/1116] [WebAssembly] Fixup `main` signature by default

Differential Revision: https://reviews.llvm.org/D53396

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345880 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp | 7 +------
 test/CodeGen/WebAssembly/call.ll                          | 4 ++--
 test/CodeGen/WebAssembly/function-bitcasts-varargs.ll     | 2 +-
 test/CodeGen/WebAssembly/function-bitcasts.ll             | 2 +-
 test/CodeGen/WebAssembly/main-declaration.ll              | 2 +-
 test/CodeGen/WebAssembly/main-no-args.ll                  | 2 +-
 test/CodeGen/WebAssembly/main-with-args.ll                | 2 +-
 7 files changed, 8 insertions(+), 13 deletions(-)

diff --git a/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp b/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
index 0644f1232f6..dffc4d168f5 100644
--- a/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
@@ -36,11 +36,6 @@ using namespace llvm;
 
 #define DEBUG_TYPE "wasm-fix-function-bitcasts"
 
-static cl::opt<bool>
-    TemporaryWorkarounds("wasm-temporary-workarounds",
-                         cl::desc("Apply certain temporary workarounds"),
-                         cl::init(true), cl::Hidden);
-
 namespace {
 class FixFunctionBitcasts final : public ModulePass {
   StringRef getPassName() const override {
@@ -241,7 +236,7 @@ bool FixFunctionBitcasts::runOnModule(Module &M) {
     // "int main(int argc, char *argv[])", create an artificial call with it
     // bitcasted to that type so that we generate a wrapper for it, so that
     // the C runtime can call it.
-    if (!TemporaryWorkarounds && !F.isDeclaration() && F.getName() == "main") {
+    if (!F.isDeclaration() && F.getName() == "main") {
       Main = &F;
       LLVMContext &C = M.getContext();
       Type *MainArgTys[] = {Type::getInt32Ty(C),
diff --git a/test/CodeGen/WebAssembly/call.ll b/test/CodeGen/WebAssembly/call.ll
index eaa583f8a02..3d768de0d58 100644
--- a/test/CodeGen/WebAssembly/call.ll
+++ b/test/CodeGen/WebAssembly/call.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers -wasm-temporary-workarounds=false -mattr=+sign-ext,+simd128 | FileCheck %s
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers -fast-isel -fast-isel-abort=1 -wasm-temporary-workarounds=false -mattr=+sign-ext,+simd128 | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers -mattr=+sign-ext,+simd128 | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers -fast-isel -fast-isel-abort=1 -mattr=+sign-ext,+simd128 | FileCheck %s
 
 ; Test that basic call operations assemble as expected.
 
diff --git a/test/CodeGen/WebAssembly/function-bitcasts-varargs.ll b/test/CodeGen/WebAssembly/function-bitcasts-varargs.ll
index 633871a599b..015de4eb39c 100644
--- a/test/CodeGen/WebAssembly/function-bitcasts-varargs.ll
+++ b/test/CodeGen/WebAssembly/function-bitcasts-varargs.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false -wasm-temporary-workarounds=false -wasm-keep-registers | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -wasm-keep-registers | FileCheck %s
 
 ; Test that function pointer casts casting away varargs are replaced with
 ; wrappers.
diff --git a/test/CodeGen/WebAssembly/function-bitcasts.ll b/test/CodeGen/WebAssembly/function-bitcasts.ll
index 0e7fcd5d570..0853549d1b4 100644
--- a/test/CodeGen/WebAssembly/function-bitcasts.ll
+++ b/test/CodeGen/WebAssembly/function-bitcasts.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false -wasm-disable-explicit-locals -wasm-keep-registers -enable-emscripten-cxx-exceptions -wasm-temporary-workarounds=false | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -wasm-disable-explicit-locals -wasm-keep-registers -enable-emscripten-cxx-exceptions | FileCheck %s
 
 ; Test that function pointer casts are replaced with wrappers.
 
diff --git a/test/CodeGen/WebAssembly/main-declaration.ll b/test/CodeGen/WebAssembly/main-declaration.ll
index 8d9414f326e..23e5887608c 100644
--- a/test/CodeGen/WebAssembly/main-declaration.ll
+++ b/test/CodeGen/WebAssembly/main-declaration.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false -wasm-temporary-workarounds=false | FileCheck %s
+; RUN: llc < %s -asm-verbose=false | FileCheck %s
 
 ; Test main functions with alternate signatures.
 
diff --git a/test/CodeGen/WebAssembly/main-no-args.ll b/test/CodeGen/WebAssembly/main-no-args.ll
index 09a4feaed14..de3f04bebc4 100644
--- a/test/CodeGen/WebAssembly/main-no-args.ll
+++ b/test/CodeGen/WebAssembly/main-no-args.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false -wasm-temporary-workarounds=false | FileCheck %s
+; RUN: llc < %s -asm-verbose=false | FileCheck %s
 
 ; Test main functions with alternate signatures.
 
diff --git a/test/CodeGen/WebAssembly/main-with-args.ll b/test/CodeGen/WebAssembly/main-with-args.ll
index aa085409756..3c057afe306 100644
--- a/test/CodeGen/WebAssembly/main-with-args.ll
+++ b/test/CodeGen/WebAssembly/main-with-args.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false -wasm-temporary-workarounds=false | FileCheck %s
+; RUN: llc < %s -asm-verbose=false | FileCheck %s
 
 ; Test that main function with expected signature is not wrapped
 
-- 
GitLab


From 797cdde77bb739c9a723587d72eeba44a71b768e Mon Sep 17 00:00:00 2001
From: Florian Hahn <florian.hahn@arm.com>
Date: Thu, 1 Nov 2018 19:51:13 +0000
Subject: [PATCH 0882/1116] [LoopInterchange] Fix unused variables in release
 build

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345881 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/LoopInterchange.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/Transforms/Scalar/LoopInterchange.cpp b/lib/Transforms/Scalar/LoopInterchange.cpp
index 523bac79b69..7a4ae2eb303 100644
--- a/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -1390,6 +1390,8 @@ bool LoopInterchangeTransform::adjustLoopBranches() {
   // Make sure we have no other PHIs.
   auto InnerPhis = drop_begin(InnerLoopHeader->phis(), 1);
   auto OuterPhis = drop_begin(OuterLoopHeader->phis(), 1);
+  (void) InnerPhis;
+  (void) OuterPhis;
   assert(begin(InnerPhis) == end(InnerPhis) && "Unexpected PHIs in inner loop");
   assert(begin(OuterPhis) == end(OuterPhis) && "Unexpected PHis in outer loop");
 
-- 
GitLab


From b7d45e1d881ea39593187f4c7c4e32a3397248a1 Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Thu, 1 Nov 2018 19:54:45 +0000
Subject: [PATCH 0883/1116] Fix clang -Wimplicit-fallthrough warnings across
 llvm, NFC

This patch should not introduce any behavior changes. It consists of
mostly one of two changes:
1. Replacing fall through comments with the LLVM_FALLTHROUGH macro
2. Inserting 'break' before falling through into a case block consisting
   of only 'break'.

We were already using this warning with GCC, but its warning behaves
slightly differently. In this patch, the following differences are
relevant:
1. GCC recognizes comments that say "fall through" as annotations, clang
   doesn't
2. GCC doesn't warn on "case N: foo(); default: break;", clang does
3. GCC doesn't warn when the case contains a switch, but falls through
   the outer case.

I will enable the warning separately in a follow-up patch so that it can
be cleanly reverted if necessary.

Reviewers: alexfh, rsmith, lattner, rtrieu, EricWF, bollu

Differential Revision: https://reviews.llvm.org/D53950

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345882 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Demangle/ItaniumDemangle.h                | 1 +
 lib/Analysis/InlineCost.cpp                            | 1 +
 lib/Demangle/MicrosoftDemangleNodes.cpp                | 1 +
 lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp           | 2 +-
 lib/Target/AMDGPU/R600MachineScheduler.cpp             | 1 +
 lib/Target/AMDGPU/SIISelLowering.cpp                   | 3 +--
 lib/Target/BPF/AsmParser/BPFAsmParser.cpp              | 2 +-
 lib/Target/Hexagon/HexagonConstExtenders.cpp           | 1 +
 lib/Target/Hexagon/HexagonConstPropagation.cpp         | 1 +
 lib/Target/Hexagon/HexagonISelDAGToDAG.cpp             | 1 +
 lib/Target/Hexagon/HexagonMachineScheduler.cpp         | 1 +
 lib/Target/Hexagon/HexagonVLIWPacketizer.cpp           | 1 +
 lib/Target/Mips/AsmParser/MipsAsmParser.cpp            | 4 ++--
 lib/Target/Mips/MipsAsmPrinter.cpp                     | 1 +
 lib/Target/Mips/MipsSEISelDAGToDAG.cpp                 | 2 +-
 lib/Target/PowerPC/PPCFastISel.cpp                     | 2 +-
 lib/Target/PowerPC/PPCISelLowering.cpp                 | 4 ++--
 lib/Target/SystemZ/SystemZISelDAGToDAG.cpp             | 4 ++--
 lib/Target/WebAssembly/WebAssemblyFastISel.cpp         | 1 +
 lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp | 2 +-
 20 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/include/llvm/Demangle/ItaniumDemangle.h b/include/llvm/Demangle/ItaniumDemangle.h
index a465921843f..c5619a15bbe 100644
--- a/include/llvm/Demangle/ItaniumDemangle.h
+++ b/include/llvm/Demangle/ItaniumDemangle.h
@@ -2807,6 +2807,7 @@ AbstractManglingParser<Derived, Alloc>::parseCtorDtorName(Node *&SoFar,
       SoFar = make<ExpandedSpecialSubstitution>(SSK);
       if (!SoFar)
         return nullptr;
+      break;
     default:
       break;
     }
diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp
index 1b5150a0d18..923dbe59e86 100644
--- a/lib/Analysis/InlineCost.cpp
+++ b/lib/Analysis/InlineCost.cpp
@@ -720,6 +720,7 @@ bool CallAnalyzer::visitCastInst(CastInst &I) {
   case Instruction::FPToSI:
     if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
       Cost += InlineConstants::CallPenalty;
+    break;
   default:
     break;
   }
diff --git a/lib/Demangle/MicrosoftDemangleNodes.cpp b/lib/Demangle/MicrosoftDemangleNodes.cpp
index d5ee47761bd..af893b9b68e 100644
--- a/lib/Demangle/MicrosoftDemangleNodes.cpp
+++ b/lib/Demangle/MicrosoftDemangleNodes.cpp
@@ -558,6 +558,7 @@ void VariableSymbolNode::output(OutputStream &OS, OutputFlags Flags) const {
   case StorageClass::PublicStatic:
   case StorageClass::ProtectedStatic:
     OS << "static ";
+    break;
   default:
     break;
   }
diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 8314b4a490f..05b714f924b 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -351,7 +351,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   case AMDGPU::G_SHL:
     if (isSALUMapping(MI))
       return getDefaultMappingSOP(MI);
-    // Fall-through
+    LLVM_FALLTHROUGH;
 
   case AMDGPU::G_FADD:
   case AMDGPU::G_FPTOSI:
diff --git a/lib/Target/AMDGPU/R600MachineScheduler.cpp b/lib/Target/AMDGPU/R600MachineScheduler.cpp
index 478a473a51b..7769a35aadc 100644
--- a/lib/Target/AMDGPU/R600MachineScheduler.cpp
+++ b/lib/Target/AMDGPU/R600MachineScheduler.cpp
@@ -236,6 +236,7 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
       // MI will become a KILL, don't considers it in scheduling
       return AluDiscarded;
     }
+    break;
   default:
     break;
   }
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index e41cf6e771b..66eb9bbb84c 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5049,12 +5049,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   case Intrinsic::r600_read_tgid_z:
     return getPreloadedValue(DAG, *MFI, VT,
                              AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
-  case Intrinsic::amdgcn_workitem_id_x: {
+  case Intrinsic::amdgcn_workitem_id_x:
   case Intrinsic::r600_read_tidig_x:
     return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
                           SDLoc(DAG.getEntryNode()),
                           MFI->getArgInfo().WorkItemIDX);
-  }
   case Intrinsic::amdgcn_workitem_id_y:
   case Intrinsic::r600_read_tidig_y:
     return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
diff --git a/lib/Target/BPF/AsmParser/BPFAsmParser.cpp b/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
index 496f2befde5..8890fb8adf4 100644
--- a/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
+++ b/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
@@ -357,8 +357,8 @@ BPFAsmParser::parseOperandAsOperator(OperandVector &Operands) {
   case AsmToken::Plus: {
     if (getLexer().peekTok().is(AsmToken::Integer))
       return MatchOperand_NoMatch;
+    LLVM_FALLTHROUGH;
   }
-  // Fall through.
 
   case AsmToken::Equal:
   case AsmToken::Greater:
diff --git a/lib/Target/Hexagon/HexagonConstExtenders.cpp b/lib/Target/Hexagon/HexagonConstExtenders.cpp
index 424be5e4476..ba9f638796e 100644
--- a/lib/Target/Hexagon/HexagonConstExtenders.cpp
+++ b/lib/Target/Hexagon/HexagonConstExtenders.cpp
@@ -1208,6 +1208,7 @@ void HCE::recordExtender(MachineInstr &MI, unsigned OpNum) {
       case Hexagon::S4_subaddi:       // (__: ## - Rs<<0)
         ED.Expr.Rs = MI.getOperand(OpNum+1);
         ED.Expr.Neg = true;
+        break;
       default:                        // (__: ## + __<<_)
         break;
     }
diff --git a/lib/Target/Hexagon/HexagonConstPropagation.cpp b/lib/Target/Hexagon/HexagonConstPropagation.cpp
index 8f22a71dc1f..fa192391313 100644
--- a/lib/Target/Hexagon/HexagonConstPropagation.cpp
+++ b/lib/Target/Hexagon/HexagonConstPropagation.cpp
@@ -2463,6 +2463,7 @@ APInt HexagonConstEvaluator::getCmpImm(unsigned Opc, unsigned OpX,
     case Hexagon::A4_cmpheqi:    // s8
     case Hexagon::C4_cmpneqi:   // s8
       Signed = true;
+      break;
     case Hexagon::A4_cmpbeqi:    // u8
       break;
     case Hexagon::C2_cmpgtui:      // u9
diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
index 9a66aece579..470b05bda4c 100644
--- a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -1547,6 +1547,7 @@ bool HexagonDAGToDAGISel::keepsLowBits(const SDValue &Val, unsigned NumBits,
         return true;
       }
     }
+    break;
   }
   default:
     break;
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
index ebfe21bd17d..908ce24136c 100644
--- a/lib/Target/Hexagon/HexagonMachineScheduler.cpp
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
@@ -105,6 +105,7 @@ bool VLIWResourceModel::isResourceAvailable(SUnit *SU, bool IsTop) {
   default:
     if (!ResourcesModel->canReserveResources(*SU->getInstr()))
       return false;
+    break;
   case TargetOpcode::EXTRACT_SUBREG:
   case TargetOpcode::INSERT_SUBREG:
   case TargetOpcode::SUBREG_TO_REG:
diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
index 93b5bedbb38..722699907ca 100644
--- a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
+++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -1568,6 +1568,7 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
             if (GlueAllocframeStore)
               continue;
           }
+          break;
         default:
           break;
       }
diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index 2e70d35fc4a..79e0c001a63 100644
--- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -767,13 +767,13 @@ public:
 
   ~MipsOperand() override {
     switch (Kind) {
-    case k_Immediate:
-      break;
     case k_Memory:
       delete Mem.Base;
       break;
     case k_RegList:
       delete RegList.List;
+      break;
+    case k_Immediate:
     case k_RegisterIndex:
     case k_Token:
       break;
diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp
index 2e0c25de2bc..a19c97e2ef0 100644
--- a/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -561,6 +561,7 @@ bool MipsAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
         O << '$' << MipsInstPrinter::getRegisterName(Reg);
         return false;
       }
+      break;
     }
     case 'w':
       // Print MSA registers for the 'f' constraint
diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
index cf2899dd375..f030f83295d 100644
--- a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
@@ -244,7 +244,7 @@ void MipsSEDAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) {
           MI.addOperand(MachineOperand::CreateReg(Mips::SP, false, true));
           break;
         }
-      // fallthrough
+        LLVM_FALLTHROUGH;
       case Mips::BuildPairF64:
       case Mips::ExtractElementF64:
         if (Subtarget->isABI_FPXX() && !Subtarget->hasMTHC1())
diff --git a/lib/Target/PowerPC/PPCFastISel.cpp b/lib/Target/PowerPC/PPCFastISel.cpp
index f212894035d..668169839e7 100644
--- a/lib/Target/PowerPC/PPCFastISel.cpp
+++ b/lib/Target/PowerPC/PPCFastISel.cpp
@@ -903,7 +903,7 @@ bool PPCFastISel::PPCEmitCmp(const Value *SrcValue1, const Value *SrcValue2,
     case MVT::i8:
     case MVT::i16:
       NeedsExt = true;
-      // Intentional fall-through.
+      LLVM_FALLTHROUGH;
     case MVT::i32:
       if (!UseImm)
         CmpOpc = IsZExt ? PPC::CMPLW : PPC::CMPW;
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 4ed110e6663..c6f0212ab40 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -3970,7 +3970,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
 
       assert(ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 &&
              "Invalid QPX parameter type");
-      /* fall through */
+      LLVM_FALLTHROUGH;
 
     case MVT::v4f64:
     case MVT::v4i1:
@@ -6113,7 +6113,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
       assert(Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32 &&
              "Invalid QPX parameter type");
 
-      /* fall through */
+      LLVM_FALLTHROUGH;
     case MVT::v4f64:
     case MVT::v4i1: {
       bool IsF32 = Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32;
diff --git a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
index c8474b15b18..0d2c2389847 100644
--- a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
+++ b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -1308,7 +1308,7 @@ bool SystemZDAGToDAGISel::tryFoldLoadStoreIntoMemOperand(SDNode *Node) {
     return false;
   case SystemZISD::SSUBO:
     NegateOperand = true;
-    /* fall through */
+    LLVM_FALLTHROUGH;
   case SystemZISD::SADDO:
     if (MemVT == MVT::i32)
       NewOpc = SystemZ::ASI;
@@ -1319,7 +1319,7 @@ bool SystemZDAGToDAGISel::tryFoldLoadStoreIntoMemOperand(SDNode *Node) {
     break;
   case SystemZISD::USUBO:
     NegateOperand = true;
-    /* fall through */
+    LLVM_FALLTHROUGH;
   case SystemZISD::UADDO:
     if (MemVT == MVT::i32)
       NewOpc = SystemZ::ALSI;
diff --git a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
index 5611a1b4588..00e37a4af29 100644
--- a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
@@ -447,6 +447,7 @@ unsigned WebAssemblyFastISel::zeroExtendToI32(unsigned Reg, const Value *V,
           (isa<Argument>(V) && cast<Argument>(V)->hasZExtAttr()))
         return copyValue(Reg);
     }
+    break;
   case MVT::i8:
   case MVT::i16:
     break;
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
index b6320bd0612..54d550b6065 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
@@ -1393,7 +1393,7 @@ static int readModRM(struct InternalInstruction* insn) {
       break;
     case 0x1:
       insn->displacementSize = 1;
-      /* FALLTHROUGH */
+      LLVM_FALLTHROUGH;
     case 0x2:
       insn->eaDisplacement = (mod == 0x1 ? EA_DISP_8 : EA_DISP_32);
       switch (rm & 7) {
-- 
GitLab


From 064632dac717d4fafd4aca0cec5c6bb1d70873c8 Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Thu, 1 Nov 2018 19:59:27 +0000
Subject: [PATCH 0884/1116] [Hexagon] Remove unintended fallthrough from MC
 duplex code

I added these annotations in r345878 because I wasn't sure if the
fallthrough was intended. Krzysztof Parzyszek confirmed that they should
be breaks, so that's what this patch does.

Reviewers: kparzysz

Differential Revision: https://reviews.llvm.org/D53991

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345883 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp       | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
index bdb15584ffc..f0654d612b4 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
@@ -127,7 +127,7 @@ unsigned HexagonMCInstrInfo::iClassOfDuplexPair(unsigned Ga, unsigned Gb) {
     case HexagonII::HSIG_A:
       return 0x4;
     }
-    LLVM_FALLTHROUGH; // FIXME: Intentional?
+    break;
   case HexagonII::HSIG_L2:
     switch (Gb) {
     default:
@@ -139,7 +139,7 @@ unsigned HexagonMCInstrInfo::iClassOfDuplexPair(unsigned Ga, unsigned Gb) {
     case HexagonII::HSIG_A:
       return 0x5;
     }
-    LLVM_FALLTHROUGH; // FIXME: Intentional?
+    break;
   case HexagonII::HSIG_S1:
     switch (Gb) {
     default:
@@ -153,7 +153,7 @@ unsigned HexagonMCInstrInfo::iClassOfDuplexPair(unsigned Ga, unsigned Gb) {
     case HexagonII::HSIG_A:
       return 0x6;
     }
-    LLVM_FALLTHROUGH; // FIXME: Intentional?
+    break;
   case HexagonII::HSIG_S2:
     switch (Gb) {
     default:
@@ -169,7 +169,7 @@ unsigned HexagonMCInstrInfo::iClassOfDuplexPair(unsigned Ga, unsigned Gb) {
     case HexagonII::HSIG_A:
       return 0x7;
     }
-    LLVM_FALLTHROUGH; // FIXME: Intentional?
+    break;
   case HexagonII::HSIG_A:
     switch (Gb) {
     default:
@@ -177,7 +177,7 @@ unsigned HexagonMCInstrInfo::iClassOfDuplexPair(unsigned Ga, unsigned Gb) {
     case HexagonII::HSIG_A:
       return 0x3;
     }
-    LLVM_FALLTHROUGH; // FIXME: Intentional?
+    break;
   case HexagonII::HSIG_Compound:
     switch (Gb) {
     case HexagonII::HSIG_Compound:
-- 
GitLab


From 3c3bf73840970d577e51e439b6fd2eaca14d8adc Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Thu, 1 Nov 2018 20:31:44 +0000
Subject: [PATCH 0885/1116] Enable -Wimplicit-fallthrough for clang as well as
 GCC

All instances of this warning should already be fixed across all LLVM
subprojects, at least on Linux.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345887 91177308-0d34-0410-b5e6-96231b3b80d8
---
 cmake/modules/HandleLLVMOptions.cmake | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cmake/modules/HandleLLVMOptions.cmake b/cmake/modules/HandleLLVMOptions.cmake
index 05db1b076a1..b590f768244 100644
--- a/cmake/modules/HandleLLVMOptions.cmake
+++ b/cmake/modules/HandleLLVMOptions.cmake
@@ -580,6 +580,7 @@ if (LLVM_ENABLE_WARNINGS AND (LLVM_COMPILER_IS_GCC_COMPATIBLE OR CLANG_CL))
     append("-Wno-long-long" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
   endif()
 
+  add_flag_if_supported("-Wimplicit-fallthrough" IMPLICIT_FALLTHROUGH_FLAG)
   add_flag_if_supported("-Wcovered-switch-default" COVERED_SWITCH_DEFAULT_FLAG)
   append_if(USE_NO_UNINITIALIZED "-Wno-uninitialized" CMAKE_CXX_FLAGS)
   append_if(USE_NO_MAYBE_UNINITIALIZED "-Wno-maybe-uninitialized" CMAKE_CXX_FLAGS)
-- 
GitLab


From 11545515d0eba3737eb48131236987580eeb8c8c Mon Sep 17 00:00:00 2001
From: Heejin Ahn <aheejin@gmail.com>
Date: Thu, 1 Nov 2018 20:32:15 +0000
Subject: [PATCH 0886/1116] [WebAssembly] Fix signature parsing for 'try' in
 AsmParser

Summary:
Like `block` or `loop`, `try` can take an optional signature which can
be omitted. This patch allows `try`'s signature to be omitted. Also
added some tests for EH instructions.

Reviewers: aardappel

Subscribers: dschuff, sbc100, jgravelle-google, sunfish, llvm-commits

Differential Revision: https://reviews.llvm.org/D53873

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345888 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../AsmParser/WebAssemblyAsmParser.cpp           |  2 +-
 test/MC/WebAssembly/basic-assembly.s             | 16 +++++++++++++++-
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
index 60f82fac5de..c257e98d55d 100644
--- a/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
+++ b/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
@@ -303,7 +303,7 @@ public:
     // assembly, so we add a dummy one explicitly (since we have no control
     // over signature tables here, we assume these will be regenerated when
     // the wasm module is generated).
-    if (BaseName == "block" || BaseName == "loop") {
+    if (BaseName == "block" || BaseName == "loop" || BaseName == "try") {
       Operands.push_back(make_unique<WebAssemblyOperand>(
           WebAssemblyOperand::Integer, NameLoc, NameLoc,
           WebAssemblyOperand::IntOp{-1}));
diff --git a/test/MC/WebAssembly/basic-assembly.s b/test/MC/WebAssembly/basic-assembly.s
index a22fe7962d4..cfffda57bcc 100644
--- a/test/MC/WebAssembly/basic-assembly.s
+++ b/test/MC/WebAssembly/basic-assembly.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc -triple=wasm32-unknown-unknown -mattr=+simd128,+nontrapping-fptoint < %s | FileCheck %s
+# RUN: llvm-mc -triple=wasm32-unknown-unknown -mattr=+simd128,+nontrapping-fptoint,+exception-handling < %s | FileCheck %s
 
     .text
     .type    test0,@function
@@ -46,6 +46,13 @@ test0:
     # TODO: enable once instruction has been added.
     #i32x4.trunc_s/f32x4:sat
     i32.trunc_s/f32
+    try
+.LBB0_3:
+    i32.catch   0
+.LBB0_4:
+    catch_all
+.LBB0_5:
+    end_try
     #i32.trunc_s:sat/f32
     get_global  __stack_pointer@GLOBAL
     end_function
@@ -88,5 +95,12 @@ test0:
 # CHECK-NEXT:      get_local   5
 # CHECK-NEXT:      f32x4.add
 # CHECK-NEXT:      i32.trunc_s/f32
+# CHECK-NEXT:      try
+# CHECK-NEXT:  .LBB0_3:
+# CHECK-NEXT:      i32.catch   0
+# CHECK-NEXT:  .LBB0_4:
+# CHECK-NEXT:      catch_all
+# CHECK-NEXT:  .LBB0_5:
+# CHECK-NEXT:      end_try
 # CHECK-NEXT:      get_global  __stack_pointer@GLOBAL
 # CHECK-NEXT:      end_function
-- 
GitLab


From 3516febde5b3c403ca1971490194e8c941e2b537 Mon Sep 17 00:00:00 2001
From: Mandeep Singh Grang <mgrang@codeaurora.org>
Date: Thu, 1 Nov 2018 21:23:47 +0000
Subject: [PATCH 0887/1116] [COFF, ARM64] Implement llvm.addressofreturnaddress
 intrinsic

Reviewers: rnk, mstorsjo, efriedma, TomTan

Reviewed By: efriedma

Subscribers: javed.absar, kristof.beyls, chrib, llvm-commits

Differential Revision: https://reviews.llvm.org/D53962

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345892 91177308-0d34-0410-b5e6-96231b3b80d8
---
 docs/LangRef.rst                           |  2 +-
 lib/Target/AArch64/AArch64ISelLowering.cpp | 16 +++++++
 test/CodeGen/AArch64/addr-of-ret-addr.ll   | 51 ++++++++++++++++++++++
 3 files changed, 68 insertions(+), 1 deletion(-)
 create mode 100644 test/CodeGen/AArch64/addr-of-ret-addr.ll

diff --git a/docs/LangRef.rst b/docs/LangRef.rst
index 39134fafd46..8ec51786739 100644
--- a/docs/LangRef.rst
+++ b/docs/LangRef.rst
@@ -10328,7 +10328,7 @@ Note that calling this intrinsic does not prevent function inlining or
 other aggressive transformations, so the value returned may not be that
 of the obvious source-language caller.
 
-This intrinsic is only implemented for x86.
+This intrinsic is only implemented for x86 and aarch64.
 
 '``llvm.frameaddress``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index e6a036e64d8..c65af806d51 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2865,6 +2865,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
     return LowerFRAMEADDR(Op, DAG);
   case ISD::RETURNADDR:
     return LowerRETURNADDR(Op, DAG);
+  case ISD::ADDROFRETURNADDR:
+    return LowerADDROFRETURNADDR(Op, DAG);
   case ISD::INSERT_VECTOR_ELT:
     return LowerINSERT_VECTOR_ELT(Op, DAG);
   case ISD::EXTRACT_VECTOR_ELT:
@@ -5221,6 +5223,20 @@ unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT,
                               + StringRef(RegName)  + "\"."));
 }
 
+SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);
+
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+
+  SDValue FrameAddr =
+      DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
+  SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
+
+  return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
+}
+
 SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
                                                SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
diff --git a/test/CodeGen/AArch64/addr-of-ret-addr.ll b/test/CodeGen/AArch64/addr-of-ret-addr.ll
new file mode 100644
index 00000000000..247b2825e15
--- /dev/null
+++ b/test/CodeGen/AArch64/addr-of-ret-addr.ll
@@ -0,0 +1,51 @@
+; RUN: llc < %s -disable-fp-elim -mtriple=arm64-windows | FileCheck %s
+
+; Test generated from C code:
+; #include <stdarg.h>
+; void *foo() {
+;   return _AddressOfReturnAddress();
+; }
+; int bar(int x(va_list, void*), ...) {
+;   va_list y;
+;   va_start(y, x);
+;   return x(y, _AddressOfReturnAddress()) + 1;
+; }
+
+declare void @llvm.va_start(i8*)
+declare i8* @llvm.addressofreturnaddress()
+
+define dso_local i8* @"foo"() {
+entry:
+  %0 = call i8* @llvm.addressofreturnaddress()
+  ret i8* %0
+
+; CHECK-LABEL: foo
+; CHECK: stp x29, x30, [sp, #-16]!
+; CHECK: mov x29, sp
+; CHECK: add x0, x29, #8
+; CHECK: ldp x29, x30, [sp], #16
+}
+
+define dso_local i32 @"bar"(i32 (i8*, i8*)* %x, ...) {
+entry:
+  %x.addr = alloca i32 (i8*, i8*)*, align 8
+  %y = alloca i8*, align 8
+  store i32 (i8*, i8*)* %x, i32 (i8*, i8*)** %x.addr, align 8
+  %y1 = bitcast i8** %y to i8*
+  call void @llvm.va_start(i8* %y1)
+  %0 = load i32 (i8*, i8*)*, i32 (i8*, i8*)** %x.addr, align 8
+  %1 = call i8* @llvm.addressofreturnaddress()
+  %2 = load i8*, i8** %y, align 8
+  %call = call i32 %0(i8* %2, i8* %1)
+  %add = add nsw i32 %call, 1
+  ret i32 %add
+
+; CHECK-LABEL: bar
+; CHECK: sub sp, sp, #96
+; CHECK: stp x29, x30, [sp, #16]
+; CHECK: add x29, sp, #16
+; CHECK: str x1, [x29, #24]
+; CHECK: add x1, x29, #8
+; CHECK: ldp x29, x30, [sp, #16]
+; CHECK: add sp, sp, #96
+}
-- 
GitLab


From 1e74971827b7eaa7d48ba9f778ecc1a7a64c1a48 Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Thu, 1 Nov 2018 21:24:33 +0000
Subject: [PATCH 0888/1116] Silence -Wimplicit-fallthrough in gold plugin

Fatal errors are likely fatal, but in case they aren't, return instead
of printing a second warning.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345894 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/gold/gold-plugin.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/gold/gold-plugin.cpp b/tools/gold/gold-plugin.cpp
index 239460d972d..ba9d3ac9345 100644
--- a/tools/gold/gold-plugin.cpp
+++ b/tools/gold/gold-plugin.cpp
@@ -449,6 +449,7 @@ static void diagnosticHandler(const DiagnosticInfo &DI) {
   case DS_Error:
     message(LDPL_FATAL, "LLVM gold plugin has failed to create LTO module: %s",
             ErrStorage.c_str());
+    return;
   case DS_Warning:
     Level = LDPL_WARNING;
     break;
-- 
GitLab


From 78ba7a6ff0df83fad9dd1cf69840fec22ce1ee10 Mon Sep 17 00:00:00 2001
From: Jordan Rupprecht <rupprecht@google.com>
Date: Thu, 1 Nov 2018 21:38:14 +0000
Subject: [PATCH 0889/1116] [llvm-objcopy/strip] [NFC] Clean up tablegen opts
 (clang-format + reorganizing things).

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345896 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-objcopy/ObjcopyOpts.td | 155 ++++++++++++++++--------------
 tools/llvm-objcopy/StripOpts.td   |  63 +++++-------
 2 files changed, 108 insertions(+), 110 deletions(-)

diff --git a/tools/llvm-objcopy/ObjcopyOpts.td b/tools/llvm-objcopy/ObjcopyOpts.td
index effcca89e4d..285ab9d69db 100644
--- a/tools/llvm-objcopy/ObjcopyOpts.td
+++ b/tools/llvm-objcopy/ObjcopyOpts.td
@@ -1,35 +1,39 @@
 include "llvm/Option/OptParser.td"
 
 multiclass Eq<string name, string help> {
-  def NAME: Separate<["--", "-"], name>;
-  def NAME # _eq: Joined<["--", "-"], name # "=">, Alias<!cast<Separate>(NAME)>,
-    HelpText<help>;
+  def NAME : Separate<["--", "-"], name>;
+  def NAME #_eq : Joined<["--", "-"], name #"=">,
+                  Alias<!cast<Separate>(NAME)>,
+                  HelpText<help>;
 }
 
 def help : Flag<["-", "--"], "help">;
+
 defm binary_architecture
     : Eq<"binary-architecture", "Used when transforming an architecture-less "
                                 "format (such as binary) to another format">;
-def B : JoinedOrSeparate<["-"], "B">,
-        Alias<binary_architecture>;
+def B : JoinedOrSeparate<["-"], "B">, Alias<binary_architecture>;
+
 defm target : Eq<"target", "Format of the input and output file">,
               Values<"binary">;
-def F : JoinedOrSeparate<[ "-" ], "F">, Alias<target>;
+def F : JoinedOrSeparate<["-"], "F">, Alias<target>;
+
 defm input_target : Eq<"input-target", "Format of the input file">,
                     Values<"binary">;
-def I : JoinedOrSeparate<[ "-" ], "I">, Alias<input_target>;
+def I : JoinedOrSeparate<["-"], "I">, Alias<input_target>;
+
 defm output_target : Eq<"output-target", "Format of the output file">,
                      Values<"binary">;
+def O : JoinedOrSeparate<["-"], "O">, Alias<output_target>;
+
 def compress_debug_sections : Flag<["--", "-"], "compress-debug-sections">;
-def compress_debug_sections_eq : Joined<["--", "-"], "compress-debug-sections=">,
-                                 MetaVarName<"[ zlib | zlib-gnu ]">,
-                                 HelpText<"Compress DWARF debug sections using "
-                                          "specified style. Supported styles: "
-                                          "'zlib-gnu' and 'zlib'">;
+def compress_debug_sections_eq
+    : Joined<["--", "-"], "compress-debug-sections=">,
+      MetaVarName<"[ zlib | zlib-gnu ]">,
+      HelpText<"Compress DWARF debug sections using specified style. Supported "
+               "styles: 'zlib-gnu' and 'zlib'">;
 def decompress_debug_sections : Flag<["-", "--"], "decompress-debug-sections">,
                                 HelpText<"Decompress DWARF debug sections.">;
-def O : JoinedOrSeparate<["-"], "O">,
-        Alias<output_target>;
 defm split_dwo
     : Eq<"split-dwo", "Equivalent to extract-dwo on the input file to "
                       "<dwo-file>, then strip-dwo on the input file">,
@@ -51,39 +55,41 @@ def U : Flag<["-"], "U">,
         Alias<disable_deterministic_archives>,
         HelpText<"Alias for --disable-deterministic-archives">;
 
-def preserve_dates : Flag<[ "-", "--" ], "preserve-dates">,
+def preserve_dates : Flag<["-", "--"], "preserve-dates">,
                      HelpText<"Preserve access and modification timestamps">;
+def p : Flag<["-"], "p">, Alias<preserve_dates>;
 
-def p : Flag<[ "-" ], "p">, Alias<preserve_dates>;
+defm add_gnu_debuglink
+    : Eq<"add-gnu-debuglink", "Add a .gnu_debuglink for <debug-file>">,
+      MetaVarName<"debug-file">;
 
-defm add_gnu_debuglink : Eq<"add-gnu-debuglink", "Add a .gnu_debuglink for <debug-file>">,
-                         MetaVarName<"debug-file">;
 defm remove_section : Eq<"remove-section", "Remove <section>">,
                       MetaVarName<"section">;
+def R : JoinedOrSeparate<["-"], "R">, Alias<remove_section>;
+
 defm rename_section
     : Eq<"rename-section",
          "Renames a section from old to new, optionally with specified flags. "
          "Flags supported for GNU compatibility: alloc, load, noload, "
          "readonly, debug, code, data, rom, share, contents, merge, strings.">,
       MetaVarName<"old=new[,flag1,...]">;
-defm redefine_symbol : Eq<"redefine-sym", "Change the name of a symbol old to new">,
-                       MetaVarName<"old=new">;
-def R : JoinedOrSeparate<["-"], "R">,
-        Alias<remove_section>;
-defm keep : Eq<"keep", "Keep <section>">,
-            MetaVarName<"section">;
+defm redefine_symbol
+    : Eq<"redefine-sym", "Change the name of a symbol old to new">,
+      MetaVarName<"old=new">;
+defm keep : Eq<"keep", "Keep <section>">, MetaVarName<"section">;
 defm only_keep : Eq<"only-keep", "Remove all but <section>">,
                  MetaVarName<"section">;
-def j : JoinedOrSeparate<["-"], "j">,
-                      Alias<only_keep>;
+def j : JoinedOrSeparate<["-"], "j">, Alias<only_keep>;
 defm add_section
     : Eq<"add-section",
          "Make a section named <section> with the contents of <file>.">,
       MetaVarName<"section=file">;
-def strip_all : Flag<["-", "--"], "strip-all">,
-                HelpText<"Remove non-allocated sections other than .gnu.warning* sections">;
-def S : Flag<["-"], "S">,
-        Alias<strip_all>;
+
+def strip_all
+    : Flag<["-", "--"], "strip-all">,
+      HelpText<
+          "Remove non-allocated sections other than .gnu.warning* sections">;
+def S : Flag<["-"], "S">, Alias<strip_all>;
 def strip_all_gnu : Flag<["-", "--"], "strip-all-gnu">,
                     HelpText<"Compatible with GNU objcopy's --strip-all">;
 def strip_debug : Flag<["-", "--"], "strip-debug">,
@@ -94,60 +100,67 @@ def strip_sections : Flag<["-", "--"], "strip-sections">,
                      HelpText<"Remove all section headers">;
 def strip_non_alloc : Flag<["-", "--"], "strip-non-alloc">,
                       HelpText<"Remove all non-allocated sections">;
-def extract_dwo : Flag<["-", "--"], "extract-dwo">,
-                  HelpText<"Remove all sections that are not DWARF .dwo sections from file">;
-def localize_hidden : Flag<["-", "--"], "localize-hidden">,
-                      HelpText<"Mark all symbols that have hidden or internal visibility as local">;
+def strip_unneeded : Flag<["-", "--"], "strip-unneeded">,
+                     HelpText<"Remove all symbols not needed by relocations">;
+
+def extract_dwo
+    : Flag<["-", "--"], "extract-dwo">,
+      HelpText<
+          "Remove all sections that are not DWARF .dwo sections from file">;
+
+def localize_hidden
+    : Flag<["-", "--"], "localize-hidden">,
+      HelpText<
+          "Mark all symbols that have hidden or internal visibility as local">;
 defm localize_symbol : Eq<"localize-symbol", "Mark <symbol> as local">,
                        MetaVarName<"symbol">;
-def L : JoinedOrSeparate<["-"], "L">,
-        Alias<localize_symbol>;
-defm globalize_symbol : Eq<"globalize-symbol", "Mark <symbol> as global">,
-                       MetaVarName<"symbol">;
+def L : JoinedOrSeparate<["-"], "L">, Alias<localize_symbol>;
 
+defm globalize_symbol : Eq<"globalize-symbol", "Mark <symbol> as global">,
+                        MetaVarName<"symbol">;
 defm keep_global_symbol
-    : Eq<"keep-global-symbol", "Convert all symbols except <symbol> to local. May be repeated "
-               "to convert all except a set of symbols to local.">,
+    : Eq<"keep-global-symbol",
+         "Convert all symbols except <symbol> to local. May be repeated to "
+         "convert all except a set of symbols to local.">,
       MetaVarName<"symbol">;
-def G : JoinedOrSeparate<[ "-" ], "G">, Alias<keep_global_symbol>;
+def G : JoinedOrSeparate<["-"], "G">, Alias<keep_global_symbol>;
 
 defm keep_global_symbols
     : Eq<"keep-global-symbols",
          "Reads a list of symbols from <filename> and runs as if "
-	     "--keep-global-symbol=<symbol> is set for each one. <filename> "
-	     "contains one symbol per line and may contain comments beginning "
-	     "with '#'. Leading and trailing whitespace is stripped from each "
-	     "line. May be repeated to read symbols from many files.">,
+         "--keep-global-symbol=<symbol> is set for each one. <filename> "
+         "contains one symbol per line and may contain comments beginning with "
+         "'#'. Leading and trailing whitespace is stripped from each line. May "
+         "be repeated to read symbols from many files.">,
       MetaVarName<"filename">;
 
-def version : Flag<[ "-", "--" ], "version">,
-              HelpText<"Print the version and exit.">;
-
 defm weaken_symbol : Eq<"weaken-symbol", "Mark <symbol> as weak">,
-                       MetaVarName<"symbol">;
-def W : JoinedOrSeparate<["-"], "W">,
-        Alias<weaken_symbol>;
+                     MetaVarName<"symbol">;
+def W : JoinedOrSeparate<["-"], "W">, Alias<weaken_symbol>;
 def weaken : Flag<["-", "--"], "weaken">,
-                  HelpText<"Mark all global symbols as weak">;
-def discard_all : Flag<["-", "--"], "discard-all">,
-                      HelpText<"Remove all local symbols except file and section symbols">;
-def x : Flag<["-"], "x">,
-        Alias<discard_all>;
+             HelpText<"Mark all global symbols as weak">;
+def discard_all
+    : Flag<["-", "--"], "discard-all">,
+      HelpText<"Remove all local symbols except file and section symbols">;
+def x : Flag<["-"], "x">, Alias<discard_all>;
 defm strip_symbol : Eq<"strip-symbol", "Remove symbol <symbol>">,
-                       MetaVarName<"symbol">;
-def N : JoinedOrSeparate<["-"], "N">,
-        Alias<strip_symbol>;
+                    MetaVarName<"symbol">;
+def N : JoinedOrSeparate<["-"], "N">, Alias<strip_symbol>;
 defm keep_symbol : Eq<"keep-symbol", "Do not remove symbol <symbol>">,
-                       MetaVarName<"symbol">;
-def K : JoinedOrSeparate<["-"], "K">,
-        Alias<keep_symbol>;
-def only_keep_debug : Flag<["-", "--"], "only-keep-debug">,
-                          HelpText<"Currently ignored. Only for compatibility with GNU objcopy.">;
-def strip_unneeded : Flag<["-", "--"], "strip-unneeded">,
-                      HelpText<"Remove all symbols not needed by relocations">;
+                   MetaVarName<"symbol">;
+def K : JoinedOrSeparate<["-"], "K">, Alias<keep_symbol>;
+def only_keep_debug
+    : Flag<["-", "--"], "only-keep-debug">,
+      HelpText<"Currently ignored. Only for compatibility with GNU objcopy.">;
 def keep_file_symbols : Flag<["-", "--"], "keep-file-symbols">,
-                      HelpText<"Do not remove file symbols">;
-defm dump_section : Eq<"dump-section", "Dump contents of section named <section> into file <file>">,
-                   MetaVarName<"section=file">;
-defm prefix_symbols : Eq<"prefix-symbols", "Add <prefix> to the start of every symbol name">,
-                       MetaVarName<"prefix">;
+                        HelpText<"Do not remove file symbols">;
+defm dump_section
+    : Eq<"dump-section",
+         "Dump contents of section named <section> into file <file>">,
+      MetaVarName<"section=file">;
+defm prefix_symbols
+    : Eq<"prefix-symbols", "Add <prefix> to the start of every symbol name">,
+      MetaVarName<"prefix">;
+
+def version : Flag<["-", "--"], "version">,
+              HelpText<"Print the version and exit.">;
diff --git a/tools/llvm-objcopy/StripOpts.td b/tools/llvm-objcopy/StripOpts.td
index 99d5d83914b..3660148f883 100644
--- a/tools/llvm-objcopy/StripOpts.td
+++ b/tools/llvm-objcopy/StripOpts.td
@@ -1,9 +1,10 @@
 include "llvm/Option/OptParser.td"
 
 multiclass Eq<string name, string help> {
-  def NAME: Separate<["--", "-"], name>;
-  def NAME # _eq: Joined<["--", "-"], name # "=">, Alias<!cast<Separate>(NAME)>,
-    HelpText<help>;
+  def NAME : Separate<["--", "-"], name>;
+  def NAME #_eq : Joined<["--", "-"], name #"=">,
+                  Alias<!cast<Separate>(NAME)>,
+                  HelpText<help>;
 }
 
 def help : Flag<["-", "--"], "help">;
@@ -24,57 +25,41 @@ def U : Flag<["-"], "U">,
         Alias<disable_deterministic_archives>,
         HelpText<"Alias for --disable-deterministic-archives">;
 
-defm output : Eq<"o", "Write output to <file>">,
-              MetaVarName<"output">;
+defm output : Eq<"o", "Write output to <file>">, MetaVarName<"output">;
 
-def preserve_dates : Flag<[ "-", "--" ], "preserve-dates">,
+def preserve_dates : Flag<["-", "--"], "preserve-dates">,
                      HelpText<"Preserve access and modification timestamps">;
+def p : Flag<["-"], "p">, Alias<preserve_dates>;
 
-def p : Flag<[ "-" ], "p">, Alias<preserve_dates>;
-
-def strip_all : Flag<["-", "--"], "strip-all">,
-                HelpText<"Remove non-allocated sections other than .gnu.warning* sections">;
-
-def s : Flag<["-"], "s">,
-        Alias<strip_all>;
+def strip_all
+    : Flag<["-", "--"], "strip-all">,
+      HelpText<
+          "Remove non-allocated sections other than .gnu.warning* sections">;
+def s : Flag<["-"], "s">, Alias<strip_all>;
 
 def strip_all_gnu : Flag<["-", "--"], "strip-all-gnu">,
                     HelpText<"Compatible with GNU strip's --strip-all">;
-
 def strip_debug : Flag<["-", "--"], "strip-debug">,
                   HelpText<"Remove debugging symbols only">;
-
-def d : Flag<["-"], "d">,
-        Alias<strip_debug>;
-
-def g : Flag<["-"], "g">,
-        Alias<strip_debug>;
-
-def S : Flag<["-"], "S">,
-        Alias<strip_debug>;
+def d : Flag<["-"], "d">, Alias<strip_debug>;
+def g : Flag<["-"], "g">, Alias<strip_debug>;
+def S : Flag<["-"], "S">, Alias<strip_debug>;
+def strip_unneeded : Flag<["-", "--"], "strip-unneeded">,
+                     HelpText<"Remove all symbols not needed by relocations">;
 
 defm remove_section : Eq<"remove-section", "Remove <section>">,
                       MetaVarName<"section">;
-
-def R : JoinedOrSeparate<["-"], "R">,
-        Alias<remove_section>;
+def R : JoinedOrSeparate<["-"], "R">, Alias<remove_section>;
 
 defm keep : Eq<"keep", "Keep <section>">, MetaVarName<"section">;
-
 defm keep_symbol : Eq<"keep-symbol", "Do not remove symbol <symbol>">,
                    MetaVarName<"symbol">;
+def K : JoinedOrSeparate<["-"], "K">, Alias<keep_symbol>;
 
-def K : JoinedOrSeparate<["-"], "K">,
-        Alias<keep_symbol>;
-
-def discard_all : Flag<["-", "--"], "discard-all">,
-                  HelpText<"Remove all local symbols except file and section symbols">;
+def discard_all
+    : Flag<["-", "--"], "discard-all">,
+      HelpText<"Remove all local symbols except file and section symbols">;
+def x : Flag<["-"], "x">, Alias<discard_all>;
 
-def version : Flag<[ "-", "--" ], "version">,
+def version : Flag<["-", "--"], "version">,
               HelpText<"Print the version and exit.">;
-
-def x : Flag<["-"], "x">,
-        Alias<discard_all>;
-
-def strip_unneeded : Flag<["-", "--"], "strip-unneeded">,
-                      HelpText<"Remove all symbols not needed by relocations">;
-- 
GitLab


From 212225ec620c585e084110ed3aad7de19acb7ee7 Mon Sep 17 00:00:00 2001
From: Farhana Aleen <farhana.aleen@gmail.com>
Date: Thu, 1 Nov 2018 22:48:19 +0000
Subject: [PATCH 0890/1116] [AMDGPU] Handle the idot8 pattern generated by FE.

Summary: Different variants of idot8 codegen dag patterns are not generated by llvm-tablegen due to a huge
         increase in the compile time. Support the pattern that clang FE generates after reordering the
         additions in integer-dot8 source language pattern.

Author: FarhanaAleen

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D53937

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345902 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/VOP3PInstructions.td |   9 +
 test/CodeGen/AMDGPU/idot8.ll           | 220 +++++++++++++++++++++++++
 2 files changed, 229 insertions(+)

diff --git a/lib/Target/AMDGPU/VOP3PInstructions.td b/lib/Target/AMDGPU/VOP3PInstructions.td
index c91d911a283..2efd28b9cd8 100644
--- a/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -287,6 +287,15 @@ foreach Type = ["U", "I"] in
                       (NonACAdd_oneuse lhs, (!cast<PatFrag>("Mul"#Type#y#"_4bit") i32:$src0, i32:$src1)))),
     (!cast<VOP3PInst>("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;
 
+// Different variants of dot8 code-gen dag patterns are not generated through table-gen due to a huge increase
+// in the compile time. Directly handle the pattern generated by the FE here.
+foreach Type = ["U", "I"] in
+  def : GCNPat <
+    !cast<dag>(!foldl((add_oneuse i32:$src2, (!cast<PatFrag>("Mul"#Type#"0_4bit") i32:$src0, i32:$src1)),
+                      [7, 1, 2, 3, 4, 5, 6], lhs, y,
+                      (NonACAdd_oneuse lhs, (!cast<PatFrag>("Mul"#Type#y#"_4bit") i32:$src0, i32:$src1)))),
+    (!cast<VOP3PInst>("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;
+
 } // End SubtargetPredicate = HasDLInsts
 
 multiclass VOP3P_Real_vi<bits<10> op> {
diff --git a/test/CodeGen/AMDGPU/idot8.ll b/test/CodeGen/AMDGPU/idot8.ll
index 044d2d3b914..e0cd2ad506b 100644
--- a/test/CodeGen/AMDGPU/idot8.ll
+++ b/test/CodeGen/AMDGPU/idot8.ll
@@ -4635,3 +4635,223 @@ entry:
   store i8 %add8, i8 addrspace(1)* %dst, align 4
   ret void
 }
+
+define amdgpu_kernel void @udot8_variant1(i32 addrspace(1)* %v1addr,
+; GFX7-LABEL: udot8_variant1:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_and_b32 s7, s4, 15
+; GFX7-NEXT:    s_and_b32 s8, s5, 15
+; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x40004
+; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x40008
+; GFX7-NEXT:    s_bfe_u32 s13, s4, 0x4000c
+; GFX7-NEXT:    s_bfe_u32 s15, s4, 0x40010
+; GFX7-NEXT:    s_bfe_u32 s17, s4, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s19, s4, 0x40018
+; GFX7-NEXT:    s_lshr_b32 s4, s4, 28
+; GFX7-NEXT:    v_mov_b32_e32 v0, s7
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v0, v1
+; GFX7-NEXT:    s_bfe_u32 s10, s5, 0x40004
+; GFX7-NEXT:    s_bfe_u32 s12, s5, 0x40008
+; GFX7-NEXT:    s_bfe_u32 s14, s5, 0x4000c
+; GFX7-NEXT:    s_bfe_u32 s16, s5, 0x40010
+; GFX7-NEXT:    s_bfe_u32 s18, s5, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s20, s5, 0x40018
+; GFX7-NEXT:    s_lshr_b32 s5, s5, 28
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s9
+; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s11
+; GFX7-NEXT:    v_mad_u32_u24 v0, s12, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s13
+; GFX7-NEXT:    v_mad_u32_u24 v0, s14, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s15
+; GFX7-NEXT:    v_mad_u32_u24 v0, s16, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s17
+; GFX7-NEXT:    v_mad_u32_u24 v0, s18, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s19
+; GFX7-NEXT:    v_mad_u32_u24 v0, s20, v1, v0
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: udot8_variant1:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_and_b32 s0, s2, 15
+; GFX8-NEXT:    s_and_b32 s1, s3, 15
+; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x40004
+; GFX8-NEXT:    s_bfe_u32 s7, s2, 0x40008
+; GFX8-NEXT:    s_bfe_u32 s9, s2, 0x4000c
+; GFX8-NEXT:    s_bfe_u32 s11, s2, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s13, s2, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s15, s2, 0x40018
+; GFX8-NEXT:    s_lshr_b32 s2, s2, 28
+; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    v_mov_b32_e32 v3, s4
+; GFX8-NEXT:    v_mad_u32_u24 v2, s1, v2, v3
+; GFX8-NEXT:    s_bfe_u32 s6, s3, 0x40004
+; GFX8-NEXT:    s_bfe_u32 s8, s3, 0x40008
+; GFX8-NEXT:    s_bfe_u32 s10, s3, 0x4000c
+; GFX8-NEXT:    s_bfe_u32 s12, s3, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s14, s3, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s16, s3, 0x40018
+; GFX8-NEXT:    s_lshr_b32 s3, s3, 28
+; GFX8-NEXT:    v_mov_b32_e32 v3, s2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s3, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s5
+; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s7
+; GFX8-NEXT:    v_mad_u32_u24 v2, s8, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s9
+; GFX8-NEXT:    v_mad_u32_u24 v2, s10, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s11
+; GFX8-NEXT:    v_mad_u32_u24 v2, s12, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s13
+; GFX8-NEXT:    v_mad_u32_u24 v2, s14, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s15
+; GFX8-NEXT:    v_mad_u32_u24 v2, s16, v3, v2
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: udot8_variant1:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX9-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_and_b32 s0, s2, 15
+; GFX9-NEXT:    s_and_b32 s1, s3, 15
+; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x40004
+; GFX9-NEXT:    s_bfe_u32 s7, s2, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s11, s2, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s13, s2, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s15, s2, 0x40018
+; GFX9-NEXT:    s_lshr_b32 s2, s2, 28
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    v_mov_b32_e32 v3, s4
+; GFX9-NEXT:    v_mad_u32_u24 v2, s1, v2, v3
+; GFX9-NEXT:    s_bfe_u32 s6, s3, 0x40004
+; GFX9-NEXT:    s_bfe_u32 s8, s3, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s10, s3, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s12, s3, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s14, s3, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s16, s3, 0x40018
+; GFX9-NEXT:    s_lshr_b32 s3, s3, 28
+; GFX9-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s3, v3, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9-NEXT:    v_mad_u32_u24 v2, s6, v3, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    v_mad_u32_u24 v2, s8, v3, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s9
+; GFX9-NEXT:    v_mad_u32_u24 v2, s10, v3, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    v_mad_u32_u24 v2, s12, v3, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s13
+; GFX9-NEXT:    v_mad_u32_u24 v2, s14, v3, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s15
+; GFX9-NEXT:    v_mad_u32_u24 v2, s16, v3, v2
+; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: udot8_variant1:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s4
+; GFX9-DL-NEXT:    v_dot8_u32_u4 v2, s3, v2, v3
+; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT:    s_endpgm
+                                          i32 addrspace(1)* %v2addr,
+                                          i32 addrspace(1)* %dst) {
+entry:
+  %v1 = load i32, i32 addrspace(1)* %v1addr, align 4
+  %v2 = load i32, i32 addrspace(1)* %v2addr, align 4
+  %and = and i32 %v1, 15
+  %and1 = and i32 %v2, 15
+  %mul1 = mul nuw nsw i32 %and1, %and
+
+  %shr = lshr i32 %v1, 4
+  %and2 = and i32 %shr, 15
+  %shr3 = lshr i32 %v2, 4
+  %and4 = and i32 %shr3, 15
+  %mul2 = mul nuw nsw i32 %and4, %and2
+
+  %shr6 = lshr i32 %v1, 8
+  %and7 = and i32 %shr6, 15
+  %shr8 = lshr i32 %v2, 8
+  %and9 = and i32 %shr8, 15
+  %mul3 = mul nuw nsw i32 %and9, %and7
+
+  %shr12 = lshr i32 %v1, 12
+  %and13 = and i32 %shr12, 15
+  %shr14 = lshr i32 %v2, 12
+  %and15 = and i32 %shr14, 15
+  %mul4 = mul nuw nsw i32 %and15, %and13
+
+  %shr18 = lshr i32 %v1, 16
+  %and19 = and i32 %shr18, 15
+  %shr20 = lshr i32 %v2, 16
+  %and21 = and i32 %shr20, 15
+  %mul5 = mul nuw nsw i32 %and21, %and19
+
+  %shr24 = lshr i32 %v1, 20
+  %and25 = and i32 %shr24, 15
+  %shr26 = lshr i32 %v2, 20
+  %and27 = and i32 %shr26, 15
+  %mul6 = mul nuw nsw i32 %and27, %and25
+
+  %shr30 = lshr i32 %v1, 24
+  %and31 = and i32 %shr30, 15
+  %shr32 = lshr i32 %v2, 24
+  %and33 = and i32 %shr32, 15
+  %mul7 = mul nuw nsw i32 %and33, %and31
+
+  %shr36 = lshr i32 %v1, 28
+  %shr37 = lshr i32 %v2, 28
+  %mul8 = mul nuw nsw i32 %shr37, %shr36
+  %acc = load i32, i32 addrspace(1)* %dst, align 4
+
+  %add1 = add i32 %mul1, %acc
+  %add2 = add i32 %add1, %mul8
+  %add3 = add i32 %add2, %mul2
+  %add4 = add i32 %add3, %mul3
+  %add5 = add i32 %add4, %mul4
+  %add6 = add i32 %add5, %mul5
+  %add7 = add i32 %add6, %mul6
+  %add8 = add i32 %add7, %mul7
+  store i32 %add8, i32 addrspace(1)* %dst, align 4
+  ret void
+}
-- 
GitLab


From c429a87ae9ee694cf270d5758aec17e3d568133c Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Thu, 1 Nov 2018 22:56:15 +0000
Subject: [PATCH 0891/1116] [IR] remove fake binop query for fneg

We want to remove this fneg API because it would silently fail
if we add an actual fneg instruction to IR (as proposed in
D53877 ).

We have a newer 'match' API that makes checking for
these patterns simpler. It also works with vectors
that may include undef elements in constants.

If any out-of-tree users need updating, they can model
their code changes on this commit:
https://reviews.llvm.org/rL345295


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345904 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/IR/InstrTypes.h |  7 -------
 lib/IR/Instructions.cpp      | 19 -------------------
 2 files changed, 26 deletions(-)

diff --git a/include/llvm/IR/InstrTypes.h b/include/llvm/IR/InstrTypes.h
index 4487768e6c6..ec782face6c 100644
--- a/include/llvm/IR/InstrTypes.h
+++ b/include/llvm/IR/InstrTypes.h
@@ -308,13 +308,6 @@ public:
   static BinaryOperator *CreateNot(Value *Op, const Twine &Name,
                                    BasicBlock *InsertAtEnd);
 
-  /// Check if the given Value is an FNeg instruction.
-  static bool isFNeg(const Value *V, bool IgnoreZeroSign=false);
-
-  /// Helper functions to extract the unary argument of an FNeg.
-  static const Value *getFNegArgument(const Value *BinOp);
-  static       Value *getFNegArgument(      Value *BinOp);
-
   BinaryOps getOpcode() const {
     return static_cast<BinaryOps>(Instruction::getOpcode());
   }
diff --git a/lib/IR/Instructions.cpp b/lib/IR/Instructions.cpp
index d92706500bc..3b8d8d0c690 100644
--- a/lib/IR/Instructions.cpp
+++ b/lib/IR/Instructions.cpp
@@ -2109,25 +2109,6 @@ BinaryOperator *BinaryOperator::CreateNot(Value *Op, const Twine &Name,
                             Op->getType(), Name, InsertAtEnd);
 }
 
-bool BinaryOperator::isFNeg(const Value *V, bool IgnoreZeroSign) {
-  if (const BinaryOperator *Bop = dyn_cast<BinaryOperator>(V))
-    if (Bop->getOpcode() == Instruction::FSub)
-      if (Constant *C = dyn_cast<Constant>(Bop->getOperand(0))) {
-        if (!IgnoreZeroSign)
-          IgnoreZeroSign = cast<Instruction>(V)->hasNoSignedZeros();
-        return !IgnoreZeroSign ? C->isNegativeZeroValue() : C->isZeroValue();
-      }
-  return false;
-}
-
-Value *BinaryOperator::getFNegArgument(Value *BinOp) {
-  return cast<BinaryOperator>(BinOp)->getOperand(1);
-}
-
-const Value *BinaryOperator::getFNegArgument(const Value *BinOp) {
-  return getFNegArgument(const_cast<Value*>(BinOp));
-}
-
 // Exchange the two operands to this instruction. This instruction is safe to
 // use on any binary instruction and does not modify the semantics of the
 // instruction. If the instruction is order-dependent (SetLT f.e.), the opcode
-- 
GitLab


From a921b710dfab9ff273e1cac4825c2af37c50c2b4 Mon Sep 17 00:00:00 2001
From: Dean Michael Berris <dberris@google.com>
Date: Thu, 1 Nov 2018 22:57:50 +0000
Subject: [PATCH 0892/1116] [XRay] Fix TSC and atomic custom/typed event
 accounting

Summary:
This is a follow-on change to D53858 which turns out to have had a TSC
accounting bug when writing out function exit records in FDR mode.

This change adds a number of tests to ensure that:

- We are handling the delta between the exit TSC and the last TSC we've
  seen.

- We are writing the custom event and typed event records as a single
  update to the buffer extents.

- We are able to catch boundary conditions when loading FDR logs.

We introduce a TSC matcher to the test helpers, which we use in the
testing/verification of the TSC accounting change.

Reviewers: mboerger

Subscribers: mgorny, hiraditya, jfb, llvm-commits

Differential Revision: https://reviews.llvm.org/D53967

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345905 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/XRay/FDRRecordProducer.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/lib/XRay/FDRRecordProducer.cpp b/lib/XRay/FDRRecordProducer.cpp
index 4b010f1fa62..59b5697cd64 100644
--- a/lib/XRay/FDRRecordProducer.cpp
+++ b/lib/XRay/FDRRecordProducer.cpp
@@ -84,6 +84,12 @@ Expected<std::unique_ptr<Record>> FileBasedRecordProducer::produce() {
   // the rest of the bytes.
   auto PreReadOffset = OffsetPtr;
   uint8_t FirstByte = E.getU8(&OffsetPtr);
+  if (OffsetPtr == PreReadOffset)
+    return createStringError(
+        std::make_error_code(std::errc::executable_format_error),
+        "Failed reading one byte from offset %d.", OffsetPtr);
+
+  // Set up our result record.
   std::unique_ptr<Record> R;
 
   // For metadata records, handle especially here.
-- 
GitLab


From 41e2e9fdff88b753bc54e0e71b04955a76cca031 Mon Sep 17 00:00:00 2001
From: Jessica Paquette <jpaquette@apple.com>
Date: Thu, 1 Nov 2018 23:09:06 +0000
Subject: [PATCH 0893/1116] [MachineOutliner][NFC] Remember when you map
 something illegal across MBBs

Instruction mapping in the outliner uses "illegal numbers" to signify that
something can't ever be part of an outlining candidate. This means that the
number is unique and can't be part of any repeated substring.

Because each of these is unique, we can use a single unique number to represent
a range of things we can't outline.

The outliner tries to leverage this using a flag which is set in an MBB when
the previous instruction we tried to map was "illegal". This patch improves
that logic to work across MBBs. As a bonus, this also simplifies the mapping
logic somewhat.

This also updates the machine-outliner-remarks test, which was impacted by the
order of Candidates on an OutlinedFunction changing. This order isn't
guaranteed, so I added a FIXME to fix that in a follow-up. The order of
Candidates on an OutlinedFunction isn't important, so this still is NFC.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345906 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/MachineOutliner.cpp               | 47 +++++++++++--------
 .../AArch64/machine-outliner-remarks.ll       |  5 +-
 2 files changed, 31 insertions(+), 21 deletions(-)

diff --git a/lib/CodeGen/MachineOutliner.cpp b/lib/CodeGen/MachineOutliner.cpp
index 00856361db0..c12bf52c0e2 100644
--- a/lib/CodeGen/MachineOutliner.cpp
+++ b/lib/CodeGen/MachineOutliner.cpp
@@ -548,6 +548,12 @@ struct InstructionMapper {
   /// at index i in \p UnsignedVec for each index i.
   std::vector<MachineBasicBlock::iterator> InstrList;
 
+  // Set if we added an illegal number in the previous step.
+  // Since each illegal number is unique, we only need one of them between
+  // each range of legal numbers. This lets us make sure we don't add more
+  // than one illegal number per range.
+  bool AddedIllegalLastTime = false;
+
   /// Maps \p *It to a legal integer.
   ///
   /// Updates \p InstrList, \p UnsignedVec, \p InstructionIntegerMap,
@@ -555,6 +561,9 @@ struct InstructionMapper {
   ///
   /// \returns The integer that \p *It was mapped to.
   unsigned mapToLegalUnsigned(MachineBasicBlock::iterator &It) {
+    // We added something legal, so we should unset the AddedLegalLastTime
+    // flag.
+    AddedIllegalLastTime = false;
 
     // Get the integer for this instruction or give it the current
     // LegalInstrNumber.
@@ -593,6 +602,12 @@ struct InstructionMapper {
   ///
   /// \returns The integer that \p *It was mapped to.
   unsigned mapToIllegalUnsigned(MachineBasicBlock::iterator &It) {
+    // Only add one illegal number per range of legal numbers.
+    if (AddedIllegalLastTime)
+      return IllegalInstrNumber;
+
+    // Remember that we added an illegal number last time.
+    AddedIllegalLastTime = true;
     unsigned MINumber = IllegalInstrNumber;
 
     InstrList.push_back(It);
@@ -624,38 +639,28 @@ struct InstructionMapper {
   void convertToUnsignedVec(MachineBasicBlock &MBB,
                             const TargetInstrInfo &TII) {
     unsigned Flags = TII.getMachineOutlinerMBBFlags(MBB);
-
-    // Set to true whenever we map an illegal number.
-    bool AddedIllegalLastTime = false;
-    for (MachineBasicBlock::iterator It = MBB.begin(), Et = MBB.end(); It != Et;
-         It++) {
-
+    MachineBasicBlock::iterator It = MBB.begin();
+    for (MachineBasicBlock::iterator Et = MBB.end(); It != Et; It++) {
       // Keep track of where this instruction is in the module.
       switch (TII.getOutliningType(It, Flags)) {
       case InstrType::Illegal:
-        // If we added an illegal number last time, then don't add more of them.
-        // One number is all that is necessary to prevent matches on illegal
-        // instructions.
-        if (AddedIllegalLastTime)
-          break;
-        AddedIllegalLastTime = true;
         mapToIllegalUnsigned(It);
         break;
 
       case InstrType::Legal:
-        AddedIllegalLastTime = false;
         mapToLegalUnsigned(It);
         break;
 
       case InstrType::LegalTerminator:
         mapToLegalUnsigned(It);
-        InstrList.push_back(It);
-        AddedIllegalLastTime = true;
-        UnsignedVec.push_back(IllegalInstrNumber);
-        IllegalInstrNumber--;
+        // The instruction also acts as a terminator, so we have to record that
+        // in the string.
+        mapToIllegalUnsigned(It);
         break;
 
       case InstrType::Invisible:
+        // Normally this is set by mapTo(Blah)Unsigned, but we just want to
+        // skip this instruction. So, unset the flag here.
         AddedIllegalLastTime = false;
         break;
       }
@@ -665,9 +670,7 @@ struct InstructionMapper {
     // "string". This makes sure we won't match across basic block or function
     // boundaries since the "end" is encoded uniquely and thus appears in no
     // repeated substring.
-    InstrList.push_back(MBB.end());
-    UnsignedVec.push_back(IllegalInstrNumber);
-    IllegalInstrNumber--;
+    mapToIllegalUnsigned(It);
   }
 
   InstructionMapper() {
@@ -854,6 +857,10 @@ INITIALIZE_PASS(MachineOutliner, DEBUG_TYPE, "Machine Function Outliner", false,
 void MachineOutliner::emitNotOutliningCheaperRemark(
     unsigned StringLen, std::vector<Candidate> &CandidatesForRepeatedSeq,
     OutlinedFunction &OF) {
+  // FIXME: Right now, we arbitrarily choose some Candidate from the
+  // OutlinedFunction. This isn't necessarily fixed, nor does it have to be.
+  // We should probably sort these by function name or something to make sure
+  // the remarks are stable.
   Candidate &C = CandidatesForRepeatedSeq.front();
   MachineOptimizationRemarkEmitter MORE(*(C.getMF()), nullptr);
   MORE.emit([&]() {
diff --git a/test/CodeGen/AArch64/machine-outliner-remarks.ll b/test/CodeGen/AArch64/machine-outliner-remarks.ll
index e721b8a648a..29872d9518a 100644
--- a/test/CodeGen/AArch64/machine-outliner-remarks.ll
+++ b/test/CodeGen/AArch64/machine-outliner-remarks.ll
@@ -9,10 +9,13 @@
 ; CHECK-SAME: <UNKNOWN LOCATION>)
 ; RUN: llc %s -enable-machine-outliner -mtriple=aarch64-unknown-unknown -o /dev/null -pass-remarks-missed=machine-outliner -pass-remarks-output=%t.yaml
 ; RUN: cat %t.yaml | FileCheck %s -check-prefix=YAML
+
+; For the YAML case, the function we pick depends on the order of the candidate
+; list.
 ; YAML: --- !Missed
 ; YAML-NEXT: Pass:            machine-outliner
 ; YAML-NEXT: Name:            NotOutliningCheaper
-; YAML-NEXT: Function:        dog
+; YAML-NEXT: Function:
 ; YAML-NEXT: Args:            
 ; YAML-NEXT:   - String:          'Did not outline '
 ; YAML-NEXT:   - Length:          '2'
-- 
GitLab


From 7cdbd1162cba6d0e8212399552baa170262632de Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Thu, 1 Nov 2018 23:21:42 +0000
Subject: [PATCH 0894/1116] [X86] Add test cases for adding vector support to
 isTruncateOf in DAGCombiner::visitZERO_EXTEND

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345907 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/vector-pcmp.ll | 153 ++++++++++++++++++++++++++++++++
 1 file changed, 153 insertions(+)

diff --git a/test/CodeGen/X86/vector-pcmp.ll b/test/CodeGen/X86/vector-pcmp.ll
index c963922addd..916c09a10ee 100644
--- a/test/CodeGen/X86/vector-pcmp.ll
+++ b/test/CodeGen/X86/vector-pcmp.ll
@@ -438,3 +438,156 @@ define <2 x i64> @cmpgt_zext_v2i64(<2 x i64> %a, <2 x i64> %b) {
   %zext = zext <2 x i1> %cmp to <2 x i64>
   ret <2 x i64> %zext
 }
+
+; Test that we optimize a zext of a vector setcc ne zero where all bits but the
+; lsb are known to be zero.
+define <8 x i32> @cmpne_knownzeros_zext_v8i16_v8i32(<8 x i16> %x) {
+; SSE2-LABEL: cmpne_knownzeros_zext_v8i16_v8i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    psrlw $15, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pcmpeqw %xmm0, %xmm2
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1,1,1,1]
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: cmpne_knownzeros_zext_v8i16_v8i32:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    psrlw $15, %xmm0
+; SSE42-NEXT:    pxor %xmm2, %xmm2
+; SSE42-NEXT:    pcmpeqw %xmm0, %xmm2
+; SSE42-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE42-NEXT:    pxor %xmm2, %xmm1
+; SSE42-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE42-NEXT:    movdqa {{.*#+}} xmm2 = [1,1,1,1]
+; SSE42-NEXT:    pand %xmm2, %xmm0
+; SSE42-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE42-NEXT:    pand %xmm2, %xmm1
+; SSE42-NEXT:    retq
+;
+; AVX1-LABEL: cmpne_knownzeros_zext_v8i16_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrlw $15, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: cmpne_knownzeros_zext_v8i16_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrlw $15, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %a = lshr <8 x i16> %x, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
+  %b = icmp ne <8 x i16> %a, zeroinitializer
+  %c = zext <8 x i1> %b to <8 x i32>
+  ret <8 x i32> %c
+}
+
+define <8 x i32> @cmpne_knownzeros_zext_v8i32_v8i32(<8 x i32> %x) {
+; SSE-LABEL: cmpne_knownzeros_zext_v8i32_v8i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psrld $31, %xmm1
+; SSE-NEXT:    psrld $31, %xmm0
+; SSE-NEXT:    pxor %xmm2, %xmm2
+; SSE-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [1,1,1,1]
+; SSE-NEXT:    pandn %xmm3, %xmm0
+; SSE-NEXT:    pcmpeqd %xmm2, %xmm1
+; SSE-NEXT:    pandn %xmm3, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: cmpne_knownzeros_zext_v8i32_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrld $31, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: cmpne_knownzeros_zext_v8i32_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrld $31, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
+; AVX2-NEXT:    vpandn %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %a = lshr <8 x i32> %x, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+  %b = icmp ne <8 x i32> %a, zeroinitializer
+  %c = zext <8 x i1> %b to <8 x i32>
+  ret <8 x i32> %c
+}
+
+define <8 x i16> @cmpne_knownzeros_zext_v8i32_v8i16(<8 x i32> %x) {
+; SSE-LABEL: cmpne_knownzeros_zext_v8i32_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psrld $31, %xmm0
+; SSE-NEXT:    psrld $31, %xmm1
+; SSE-NEXT:    pxor %xmm2, %xmm2
+; SSE-NEXT:    pcmpeqd %xmm2, %xmm1
+; SSE-NEXT:    pcmpeqd %xmm3, %xmm3
+; SSE-NEXT:    pxor %xmm3, %xmm1
+; SSE-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE-NEXT:    pxor %xmm3, %xmm0
+; SSE-NEXT:    packssdw %xmm1, %xmm0
+; SSE-NEXT:    psrlw $15, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: cmpne_knownzeros_zext_v8i32_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrld $31, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpackssdw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpsrlw $15, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: cmpne_knownzeros_zext_v8i32_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrld $31, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrlw $15, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+  %a = lshr <8 x i32> %x, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+  %b = icmp ne <8 x i32> %a, zeroinitializer
+  %c = zext <8 x i1> %b to <8 x i16>
+  ret <8 x i16> %c
+}
-- 
GitLab


From 41672bb1ce075acaa6d5d36f73ec30fb136784cb Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Thu, 1 Nov 2018 23:21:45 +0000
Subject: [PATCH 0895/1116] [DAGCombiner] Make the isTruncateOf call from
 visitZERO_EXTEND work for vectors. Remove FIXME.

I'm having trouble creating a test case for the ISD::TRUNCATE part of this that shows any codegen differences. But I was able to test the setcc path which is what the test changes here cover.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345908 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  29 +++---
 test/CodeGen/X86/vector-pcmp.ll          | 107 ++++++-----------------
 2 files changed, 42 insertions(+), 94 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index d0c898f2e97..03145c5ce5a 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -8696,27 +8696,25 @@ static bool isTruncateOf(SelectionDAG &DAG, SDValue N, SDValue &Op,
     return true;
   }
 
-  if (N->getOpcode() != ISD::SETCC || N->getValueType(0) != MVT::i1 ||
-      cast<CondCodeSDNode>(N->getOperand(2))->get() != ISD::SETNE)
+  if (N.getOpcode() != ISD::SETCC ||
+      N.getValueType().getScalarType() != MVT::i1 ||
+      cast<CondCodeSDNode>(N.getOperand(2))->get() != ISD::SETNE)
     return false;
 
   SDValue Op0 = N->getOperand(0);
   SDValue Op1 = N->getOperand(1);
   assert(Op0.getValueType() == Op1.getValueType());
 
-  if (isNullConstant(Op0))
+  if (isNullConstantOrNullSplatConstant(Op0))
     Op = Op1;
-  else if (isNullConstant(Op1))
+  else if (isNullConstantOrNullSplatConstant(Op1))
     Op = Op0;
   else
     return false;
 
   DAG.computeKnownBits(Op, Known);
 
-  if (!(Known.Zero | 1).isAllOnesValue())
-    return false;
-
-  return true;
+  return (Known.Zero | 1).isAllOnesValue();
 }
 
 SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
@@ -8736,17 +8734,16 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
   // fold (zext (truncate x)) -> (zext x) or
   //      (zext (truncate x)) -> (truncate x)
   // This is valid when the truncated bits of x are already zero.
-  // FIXME: We should extend this to work for vectors too.
   SDValue Op;
   KnownBits Known;
-  if (!VT.isVector() && isTruncateOf(DAG, N0, Op, Known)) {
+  if (isTruncateOf(DAG, N0, Op, Known)) {
     APInt TruncatedBits =
-      (Op.getValueSizeInBits() == N0.getValueSizeInBits()) ?
-      APInt(Op.getValueSizeInBits(), 0) :
-      APInt::getBitsSet(Op.getValueSizeInBits(),
-                        N0.getValueSizeInBits(),
-                        std::min(Op.getValueSizeInBits(),
-                                 VT.getSizeInBits()));
+      (Op.getScalarValueSizeInBits() == N0.getScalarValueSizeInBits()) ?
+      APInt(Op.getScalarValueSizeInBits(), 0) :
+      APInt::getBitsSet(Op.getScalarValueSizeInBits(),
+                        N0.getScalarValueSizeInBits(),
+                        std::min(Op.getScalarValueSizeInBits(),
+                                 VT.getScalarSizeInBits()));
     if (TruncatedBits.isSubsetOf(Known.Zero))
       return DAG.getZExtOrTrunc(Op, SDLoc(N), VT);
   }
diff --git a/test/CodeGen/X86/vector-pcmp.ll b/test/CodeGen/X86/vector-pcmp.ll
index 916c09a10ee..683a56544cb 100644
--- a/test/CodeGen/X86/vector-pcmp.ll
+++ b/test/CodeGen/X86/vector-pcmp.ll
@@ -444,56 +444,36 @@ define <2 x i64> @cmpgt_zext_v2i64(<2 x i64> %a, <2 x i64> %b) {
 define <8 x i32> @cmpne_knownzeros_zext_v8i16_v8i32(<8 x i16> %x) {
 ; SSE2-LABEL: cmpne_knownzeros_zext_v8i16_v8i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    psrlw $15, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $15, %xmm1
 ; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    pcmpeqw %xmm0, %xmm2
-; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1,1,1,1]
-; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; SSE2-NEXT:    retq
 ;
 ; SSE42-LABEL: cmpne_knownzeros_zext_v8i16_v8i32:
 ; SSE42:       # %bb.0:
 ; SSE42-NEXT:    psrlw $15, %xmm0
-; SSE42-NEXT:    pxor %xmm2, %xmm2
-; SSE42-NEXT:    pcmpeqw %xmm0, %xmm2
-; SSE42-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSE42-NEXT:    pxor %xmm2, %xmm1
-; SSE42-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SSE42-NEXT:    movdqa {{.*#+}} xmm2 = [1,1,1,1]
-; SSE42-NEXT:    pand %xmm2, %xmm0
-; SSE42-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE42-NEXT:    pand %xmm2, %xmm1
+; SSE42-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE42-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE42-NEXT:    movdqa %xmm2, %xmm0
 ; SSE42-NEXT:    retq
 ;
 ; AVX1-LABEL: cmpne_knownzeros_zext_v8i16_v8i32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpsrlw $15, %xmm0, %xmm0
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: cmpne_knownzeros_zext_v8i16_v8i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpsrlw $15, %xmm0, %xmm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
   %a = lshr <8 x i16> %x, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
   %b = icmp ne <8 x i16> %a, zeroinitializer
@@ -504,14 +484,8 @@ define <8 x i32> @cmpne_knownzeros_zext_v8i16_v8i32(<8 x i16> %x) {
 define <8 x i32> @cmpne_knownzeros_zext_v8i32_v8i32(<8 x i32> %x) {
 ; SSE-LABEL: cmpne_knownzeros_zext_v8i32_v8i32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    psrld $31, %xmm1
 ; SSE-NEXT:    psrld $31, %xmm0
-; SSE-NEXT:    pxor %xmm2, %xmm2
-; SSE-NEXT:    pcmpeqd %xmm2, %xmm0
-; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [1,1,1,1]
-; SSE-NEXT:    pandn %xmm3, %xmm0
-; SSE-NEXT:    pcmpeqd %xmm2, %xmm1
-; SSE-NEXT:    pandn %xmm3, %xmm1
+; SSE-NEXT:    psrld $31, %xmm1
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: cmpne_knownzeros_zext_v8i32_v8i32:
@@ -519,23 +493,12 @@ define <8 x i32> @cmpne_knownzeros_zext_v8i32_v8i32(<8 x i32> %x) {
 ; AVX1-NEXT:    vpsrld $31, %xmm0, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: cmpne_knownzeros_zext_v8i32_v8i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpsrld $31, %ymm0, %ymm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
-; AVX2-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
   %a = lshr <8 x i32> %x, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
   %b = icmp ne <8 x i32> %a, zeroinitializer
@@ -544,46 +507,34 @@ define <8 x i32> @cmpne_knownzeros_zext_v8i32_v8i32(<8 x i32> %x) {
 }
 
 define <8 x i16> @cmpne_knownzeros_zext_v8i32_v8i16(<8 x i32> %x) {
-; SSE-LABEL: cmpne_knownzeros_zext_v8i32_v8i16:
-; SSE:       # %bb.0:
-; SSE-NEXT:    psrld $31, %xmm0
-; SSE-NEXT:    psrld $31, %xmm1
-; SSE-NEXT:    pxor %xmm2, %xmm2
-; SSE-NEXT:    pcmpeqd %xmm2, %xmm1
-; SSE-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSE-NEXT:    pxor %xmm3, %xmm1
-; SSE-NEXT:    pcmpeqd %xmm2, %xmm0
-; SSE-NEXT:    pxor %xmm3, %xmm0
-; SSE-NEXT:    packssdw %xmm1, %xmm0
-; SSE-NEXT:    psrlw $15, %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: cmpne_knownzeros_zext_v8i32_v8i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    psrld $31, %xmm1
+; SSE2-NEXT:    psrld $31, %xmm0
+; SSE2-NEXT:    packuswb %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: cmpne_knownzeros_zext_v8i32_v8i16:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    psrld $31, %xmm1
+; SSE42-NEXT:    psrld $31, %xmm0
+; SSE42-NEXT:    packusdw %xmm1, %xmm0
+; SSE42-NEXT:    retq
 ;
 ; AVX1-LABEL: cmpne_knownzeros_zext_v8i32_v8i16:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpsrld $31, %xmm0, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpsrld $31, %xmm1, %xmm1
 ; AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpackssdw %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vpsrlw $15, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: cmpne_knownzeros_zext_v8i32_v8i16:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpsrld $31, %ymm0, %ymm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpsrlw $15, %xmm0, %xmm0
+; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
   %a = lshr <8 x i32> %x, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-- 
GitLab


From 08238b7fa9a979fc3fb71e429d7d537f26f571b7 Mon Sep 17 00:00:00 2001
From: Mandeep Singh Grang <mgrang@codeaurora.org>
Date: Thu, 1 Nov 2018 23:22:25 +0000
Subject: [PATCH 0896/1116] [COFF, ARM64] Implement Intrinsic.sponentry for
 AArch64

Summary: This patch adds Intrinsic.sponentry. This intrinsic is required to correctly support setjmp for AArch64 Windows platform.

Patch by: Yin Ma (yinma@codeaurora.org)

Reviewers: mgrang, ssijaric, eli.friedman, TomTan, mstorsjo, rnk, compnerd, efriedma

Reviewed By: efriedma

Subscribers: efriedma, javed.absar, kristof.beyls, chrib, llvm-commits

Differential Revision: https://reviews.llvm.org/D53996

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345909 91177308-0d34-0410-b5e6-96231b3b80d8
---
 docs/LangRef.rst                              |  79 ++++++++-----
 include/llvm/CodeGen/ISDOpcodes.h             |   2 +-
 include/llvm/IR/Intrinsics.td                 |   1 +
 lib/CodeGen/SelectionDAG/LegalizeDAG.cpp      |   1 +
 .../SelectionDAG/SelectionDAGBuilder.cpp      |   4 +
 .../SelectionDAG/SelectionDAGDumper.cpp       |   1 +
 lib/Target/AArch64/AArch64FastISel.cpp        |  15 +++
 lib/Target/AArch64/AArch64ISelLowering.cpp    |  12 ++
 lib/Target/AArch64/AArch64ISelLowering.h      |   1 +
 test/CodeGen/AArch64/sponentry.ll             | 104 ++++++++++++++++++
 10 files changed, 190 insertions(+), 30 deletions(-)
 create mode 100644 test/CodeGen/AArch64/sponentry.ll

diff --git a/docs/LangRef.rst b/docs/LangRef.rst
index 8ec51786739..b98862ecb42 100644
--- a/docs/LangRef.rst
+++ b/docs/LangRef.rst
@@ -2926,7 +2926,7 @@ Simple Constants
     hexadecimal notation (see below). The assembler requires the exact
     decimal value of a floating-point constant. For example, the
     assembler accepts 1.25 but rejects 1.3 because 1.3 is a repeating
-    decimal in binary. Floating-point constants must have a 
+    decimal in binary. Floating-point constants must have a
     :ref:`floating-point <t_floating>` type.
 **Null pointer constants**
     The identifier '``null``' is recognized as a null pointer constant
@@ -3331,7 +3331,7 @@ The following is the syntax for constant expressions:
     value won't fit in the integer type, the result is a
     :ref:`poison value <poisonvalues>`.
 ``uitofp (CST to TYPE)``
-    Convert an unsigned integer constant to the corresponding 
+    Convert an unsigned integer constant to the corresponding
     floating-point constant. TYPE must be a scalar or vector floating-point
     type.  CST must be of scalar or vector integer type. Both CST and TYPE must
     be scalars, or vectors of the same number of elements.
@@ -5434,7 +5434,7 @@ Irreducible loop header weights are typically based on profile data.
 '``invariant.group``' Metadata
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-The experimental ``invariant.group`` metadata may be attached to 
+The experimental ``invariant.group`` metadata may be attached to
 ``load``/``store`` instructions referencing a single metadata with no entries.
 The existence of the ``invariant.group`` metadata on the instruction tells
 the optimizer that every ``load`` and ``store`` to the same pointer operand
@@ -6875,7 +6875,7 @@ Arguments:
 """"""""""
 
 The two arguments to the '``fadd``' instruction must be
-:ref:`floating-point <t_floating>` or :ref:`vector <t_vector>` of 
+:ref:`floating-point <t_floating>` or :ref:`vector <t_vector>` of
 floating-point values. Both arguments must have identical types.
 
 Semantics:
@@ -6883,7 +6883,7 @@ Semantics:
 
 The value produced is the floating-point sum of the two operands.
 This instruction is assumed to execute in the default :ref:`floating-point
-environment <floatenv>`. 
+environment <floatenv>`.
 This instruction can also take any number of :ref:`fast-math
 flags <fastmath>`, which are optimization hints to enable otherwise
 unsafe floating-point optimizations:
@@ -6972,7 +6972,7 @@ Arguments:
 """"""""""
 
 The two arguments to the '``fsub``' instruction must be
-:ref:`floating-point <t_floating>` or :ref:`vector <t_vector>` of 
+:ref:`floating-point <t_floating>` or :ref:`vector <t_vector>` of
 floating-point values. Both arguments must have identical types.
 
 Semantics:
@@ -6980,7 +6980,7 @@ Semantics:
 
 The value produced is the floating-point difference of the two operands.
 This instruction is assumed to execute in the default :ref:`floating-point
-environment <floatenv>`. 
+environment <floatenv>`.
 This instruction can also take any number of :ref:`fast-math
 flags <fastmath>`, which are optimization hints to enable otherwise
 unsafe floating-point optimizations:
@@ -7067,7 +7067,7 @@ Arguments:
 """"""""""
 
 The two arguments to the '``fmul``' instruction must be
-:ref:`floating-point <t_floating>` or :ref:`vector <t_vector>` of 
+:ref:`floating-point <t_floating>` or :ref:`vector <t_vector>` of
 floating-point values. Both arguments must have identical types.
 
 Semantics:
@@ -7075,7 +7075,7 @@ Semantics:
 
 The value produced is the floating-point product of the two operands.
 This instruction is assumed to execute in the default :ref:`floating-point
-environment <floatenv>`. 
+environment <floatenv>`.
 This instruction can also take any number of :ref:`fast-math
 flags <fastmath>`, which are optimization hints to enable otherwise
 unsafe floating-point optimizations:
@@ -7201,7 +7201,7 @@ Arguments:
 """"""""""
 
 The two arguments to the '``fdiv``' instruction must be
-:ref:`floating-point <t_floating>` or :ref:`vector <t_vector>` of 
+:ref:`floating-point <t_floating>` or :ref:`vector <t_vector>` of
 floating-point values. Both arguments must have identical types.
 
 Semantics:
@@ -7209,7 +7209,7 @@ Semantics:
 
 The value produced is the floating-point quotient of the two operands.
 This instruction is assumed to execute in the default :ref:`floating-point
-environment <floatenv>`. 
+environment <floatenv>`.
 This instruction can also take any number of :ref:`fast-math
 flags <fastmath>`, which are optimization hints to enable otherwise
 unsafe floating-point optimizations:
@@ -7344,7 +7344,7 @@ Arguments:
 """"""""""
 
 The two arguments to the '``frem``' instruction must be
-:ref:`floating-point <t_floating>` or :ref:`vector <t_vector>` of 
+:ref:`floating-point <t_floating>` or :ref:`vector <t_vector>` of
 floating-point values. Both arguments must have identical types.
 
 Semantics:
@@ -7352,10 +7352,10 @@ Semantics:
 
 The value produced is the floating-point remainder of the two operands.
 This is the same output as a libm '``fmod``' function, but without any
-possibility of setting ``errno``. The remainder has the same sign as the 
+possibility of setting ``errno``. The remainder has the same sign as the
 dividend.
 This instruction is assumed to execute in the default :ref:`floating-point
-environment <floatenv>`. 
+environment <floatenv>`.
 This instruction can also take any number of :ref:`fast-math
 flags <fastmath>`, which are optimization hints to enable otherwise
 unsafe floating-point optimizations:
@@ -8809,7 +8809,7 @@ Semantics:
 
 The '``fptrunc``' instruction casts a ``value`` from a larger
 :ref:`floating-point <t_floating>` type to a smaller :ref:`floating-point
-<t_floating>` type.  
+<t_floating>` type.
 This instruction is assumed to execute in the default :ref:`floating-point
 environment <floatenv>`.
 
@@ -10330,6 +10330,27 @@ of the obvious source-language caller.
 
 This intrinsic is only implemented for x86 and aarch64.
 
+'``llvm.sponentry``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare i8* @llvm.sponentry()
+
+Overview:
+"""""""""
+
+The '``llvm.sponentry``' intrinsic returns the stack pointer value at
+the entry of the current function calling this intrinsic.
+
+Semantics:
+""""""""""
+
+Note this intrinsic is only verified on AArch64.
+
 '``llvm.frameaddress``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -12115,11 +12136,11 @@ Overview:
 
 The '``llvm.fshl``' family of intrinsic functions performs a funnel shift left:
 the first two values are concatenated as { %a : %b } (%a is the most significant
-bits of the wide value), the combined value is shifted left, and the most 
-significant bits are extracted to produce a result that is the same size as the 
-original arguments. If the first 2 arguments are identical, this is equivalent 
-to a rotate left operation. For vector types, the operation occurs for each 
-element of the vector. The shift argument is treated as an unsigned amount 
+bits of the wide value), the combined value is shifted left, and the most
+significant bits are extracted to produce a result that is the same size as the
+original arguments. If the first 2 arguments are identical, this is equivalent
+to a rotate left operation. For vector types, the operation occurs for each
+element of the vector. The shift argument is treated as an unsigned amount
 modulo the element size of the arguments.
 
 Arguments:
@@ -12161,11 +12182,11 @@ Overview:
 
 The '``llvm.fshr``' family of intrinsic functions performs a funnel shift right:
 the first two values are concatenated as { %a : %b } (%a is the most significant
-bits of the wide value), the combined value is shifted right, and the least 
-significant bits are extracted to produce a result that is the same size as the 
-original arguments. If the first 2 arguments are identical, this is equivalent 
-to a rotate right operation. For vector types, the operation occurs for each 
-element of the vector. The shift argument is treated as an unsigned amount 
+bits of the wide value), the combined value is shifted right, and the least
+significant bits are extracted to produce a result that is the same size as the
+original arguments. If the first 2 arguments are identical, this is equivalent
+to a rotate right operation. For vector types, the operation occurs for each
+element of the vector. The shift argument is treated as an unsigned amount
 modulo the element size of the arguments.
 
 Arguments:
@@ -13446,7 +13467,7 @@ The '``llvm.masked.expandload``' intrinsic is designed for reading multiple scal
     %Tmp = call <8 x double> @llvm.masked.expandload.v8f64(double* %Bptr, <8 x i1> %Mask, <8 x double> undef)
     ; Store the result in A
     call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %Tmp, <8 x double>* %Aptr, i32 8, <8 x i1> %Mask)
-    
+
     ; %Bptr should be increased on each iteration according to the number of '1' elements in the Mask.
     %MaskI = bitcast <8 x i1> %Mask to i8
     %MaskIPopcnt = call i8 @llvm.ctpop.i8(i8 %MaskI)
@@ -13503,7 +13524,7 @@ The '``llvm.masked.compressstore``' intrinsic is designed for compressing data i
     %Tmp = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %Aptr, i32 8, <8 x i1> %Mask, <8 x double> undef)
     ; Store all selected elements consecutively in array B
     call <void> @llvm.masked.compressstore.v8f64(<8 x double> %Tmp, double* %Bptr, <8 x i1> %Mask)
-    
+
     ; %Bptr should be increased on each iteration according to the number of '1' elements in the Mask.
     %MaskI = bitcast <8 x i1> %Mask to i8
     %MaskIPopcnt = call i8 @llvm.ctpop.i8(i8 %MaskI)
@@ -14136,7 +14157,7 @@ Overview:
 
 The '``llvm.experimental.constrained.powi``' intrinsic returns the first operand
 raised to the (positive or negative) power specified by the second operand. The
-order of evaluation of multiplications is not defined. When a vector of 
+order of evaluation of multiplications is not defined. When a vector of
 floating-point type is used, the second argument remains a scalar integer value.
 
 
@@ -14462,7 +14483,7 @@ Overview:
 """""""""
 
 The '``llvm.experimental.constrained.nearbyint``' intrinsic returns the first
-operand rounded to the nearest integer. It will not raise an inexact 
+operand rounded to the nearest integer. It will not raise an inexact
 floating-point exception if the operand is not an integer.
 
 
diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h
index a023aa5b3f6..da10119f438 100644
--- a/include/llvm/CodeGen/ISDOpcodes.h
+++ b/include/llvm/CodeGen/ISDOpcodes.h
@@ -70,7 +70,7 @@ namespace ISD {
     /// of the frame or return address to return.  An index of zero corresponds
     /// to the current function's frame or return address, an index of one to
     /// the parent's frame or return address, and so on.
-    FRAMEADDR, RETURNADDR, ADDROFRETURNADDR,
+    FRAMEADDR, RETURNADDR, ADDROFRETURNADDR, SPONENTRY,
 
     /// LOCAL_RECOVER - Represents the llvm.localrecover intrinsic.
     /// Materializes the offset from the local object pointer of another
diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td
index 47a66a27e38..c965140a00b 100644
--- a/include/llvm/IR/Intrinsics.td
+++ b/include/llvm/IR/Intrinsics.td
@@ -320,6 +320,7 @@ def int_gcwrite : Intrinsic<[],
 def int_returnaddress : Intrinsic<[llvm_ptr_ty], [llvm_i32_ty], [IntrNoMem]>;
 def int_addressofreturnaddress : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>;
 def int_frameaddress  : Intrinsic<[llvm_ptr_ty], [llvm_i32_ty], [IntrNoMem]>;
+def int_sponentry  : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>;
 def int_read_register  : Intrinsic<[llvm_anyint_ty], [llvm_metadata_ty],
                                    [IntrReadMem], "llvm.read_register">;
 def int_write_register : Intrinsic<[], [llvm_metadata_ty, llvm_anyint_ty],
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index a96b8628ac8..d5fb7a0697d 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1059,6 +1059,7 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
   case ISD::FRAMEADDR:
   case ISD::RETURNADDR:
   case ISD::ADDROFRETURNADDR:
+  case ISD::SPONENTRY:
     // These operations lie about being legal: when they claim to be legal,
     // they should actually be custom-lowered.
     Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index dac99eddec3..cb650c6fc13 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5050,6 +5050,10 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     setValue(&I, DAG.getNode(ISD::ADDROFRETURNADDR, sdl,
                              TLI.getPointerTy(DAG.getDataLayout())));
     return nullptr;
+  case Intrinsic::sponentry:
+    setValue(&I, DAG.getNode(ISD::SPONENTRY, sdl,
+                             TLI.getPointerTy(DAG.getDataLayout())));
+    return nullptr;
   case Intrinsic::frameaddress:
     setValue(&I, DAG.getNode(ISD::FRAMEADDR, sdl,
                              TLI.getPointerTy(DAG.getDataLayout()),
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 5c17a5d295d..c21f2d3b717 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -124,6 +124,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   case ISD::RETURNADDR:                 return "RETURNADDR";
   case ISD::ADDROFRETURNADDR:           return "ADDROFRETURNADDR";
   case ISD::FRAMEADDR:                  return "FRAMEADDR";
+  case ISD::SPONENTRY:                  return "SPONENTRY";
   case ISD::LOCAL_RECOVER:              return "LOCAL_RECOVER";
   case ISD::READ_REGISTER:              return "READ_REGISTER";
   case ISD::WRITE_REGISTER:             return "WRITE_REGISTER";
diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp
index 5e4c5dcf09c..dfc08a12f51 100644
--- a/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/lib/Target/AArch64/AArch64FastISel.cpp
@@ -3450,6 +3450,21 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
     updateValueMap(II, SrcReg);
     return true;
   }
+  case Intrinsic::sponentry: {
+    MachineFrameInfo &MFI = FuncInfo.MF->getFrameInfo();
+
+    // SP = FP + Fixed Object + 16
+    int FI = MFI.CreateFixedObject(4, 0, false);
+    unsigned ResultReg = createResultReg(&AArch64::GPR64spRegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(AArch64::ADDXri), ResultReg)
+            .addFrameIndex(FI)
+            .addImm(0)
+            .addImm(0);
+
+    updateValueMap(II, ResultReg);
+    return true;
+  }
   case Intrinsic::memcpy:
   case Intrinsic::memmove: {
     const auto *MTI = cast<MemTransferInst>(II);
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index c65af806d51..a18284f892c 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2863,6 +2863,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
     return LowerFP_EXTEND(Op, DAG);
   case ISD::FRAMEADDR:
     return LowerFRAMEADDR(Op, DAG);
+  case ISD::SPONENTRY:
+    return LowerSPONENTRY(Op, DAG);
   case ISD::RETURNADDR:
     return LowerRETURNADDR(Op, DAG);
   case ISD::ADDROFRETURNADDR:
@@ -5173,6 +5175,16 @@ SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
   return FrameAddr;
 }
 
+SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+
+  EVT VT = getPointerTy(DAG.getDataLayout());
+  SDLoc DL(Op);
+  int FI = MFI.CreateFixedObject(4, 0, false);
+  return DAG.getFrameIndex(FI, VT);
+}
+
 // FIXME? Maybe this could be a TableGen attribute on some registers and
 // this table could be generated automatically from RegInfo.
 unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT,
diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h
index 3e89de665a7..7b4119a21d0 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@@ -617,6 +617,7 @@ private:
   SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
diff --git a/test/CodeGen/AArch64/sponentry.ll b/test/CodeGen/AArch64/sponentry.ll
new file mode 100644
index 00000000000..5b3638a1d86
--- /dev/null
+++ b/test/CodeGen/AArch64/sponentry.ll
@@ -0,0 +1,104 @@
+; RUN: llc -mtriple=aarch64-windows-msvc -disable-fp-elim %s -o - | FileCheck %s
+; RUN: llc -mtriple=aarch64-windows-msvc -fast-isel -disable-fp-elim %s -o - | FileCheck %s
+; RUN: llc -mtriple=aarch64-windows-msvc %s -o - | FileCheck %s --check-prefix=NOFP
+; RUN: llc -mtriple=aarch64-windows-msvc -fast-isel %s -o - | FileCheck %s --check-prefix=NOFP
+
+@env2 = common dso_local global [24 x i64]* null, align 8
+
+define dso_local void @bar() {
+  %1 = call i8* @llvm.sponentry()
+  %2 = load [24 x i64]*, [24 x i64]** @env2, align 8
+  %3 = getelementptr inbounds [24 x i64], [24 x i64]* %2, i32 0, i32 0
+  %4 = bitcast i64* %3 to i8*
+  %5 = call i32 @_setjmpex(i8* %4, i8* %1) #2
+  ret void
+}
+
+; CHECK: bar:
+; CHECK: mov     x29, sp
+; CHECK: add     x1, x29, #16
+; CEHCK: bl      _setjmpex
+
+; NOFP: str     x30, [sp, #-16]!
+; NOFP: add     x1, sp, #16
+
+define dso_local void @foo([24 x i64]*) {
+  %2 = alloca [24 x i64]*, align 8
+  %3 = alloca i32, align 4
+  %4 = alloca [100 x i32], align 4
+  store [24 x i64]* %0, [24 x i64]** %2, align 8
+  %5 = call i8* @llvm.sponentry()
+  %6 = load [24 x i64]*, [24 x i64]** %2, align 8
+  %7 = getelementptr inbounds [24 x i64], [24 x i64]* %6, i32 0, i32 0
+  %8 = bitcast i64* %7 to i8*
+  %9 = call i32 @_setjmpex(i8* %8, i8* %5)
+  store i32 %9, i32* %3, align 4
+  ret void
+}
+
+; CHECK: foo:
+; CHECK: sub     sp, sp, #448
+; CHECK: add     x29, sp, #432
+; CHECK: add     x1, x29, #16
+; CEHCK: bl      _setjmpex
+
+; NOFP: sub     sp, sp, #432
+; NOFP: add     x1, sp, #432
+
+define dso_local void @var_args(i8*, ...) {
+  %2 = alloca i8*, align 8
+  %3 = alloca i8*, align 8
+  store i8* %0, i8** %2, align 8
+  %4 = bitcast i8** %3 to i8*
+  call void @llvm.va_start(i8* %4)
+  %5 = load i8*, i8** %3, align 8
+  %6 = getelementptr inbounds i8, i8* %5, i64 8
+  store i8* %6, i8** %3, align 8
+  %7 = bitcast i8* %5 to i32*
+  %8 = load i32, i32* %7, align 8
+  %9 = bitcast i8** %3 to i8*
+  call void @llvm.va_end(i8* %9)
+  %10 = call i8* @llvm.sponentry()
+  %11 = load [24 x i64]*, [24 x i64]** @env2, align 8
+  %12 = getelementptr inbounds [24 x i64], [24 x i64]* %11, i32 0, i32 0
+  %13 = bitcast i64* %12 to i8*
+  %14 = call i32 @_setjmpex(i8* %13, i8* %10) #3
+  ret void
+}
+
+; CHECK: var_args:
+; CHECK: sub     sp, sp, #96
+; CHECK: add     x29, sp, #16
+; CHECK: add     x1, x29, #80
+; CEHCK: bl      _setjmpex
+
+; NOFP: sub     sp, sp, #96
+; NOFP: add     x1, sp, #96
+
+define dso_local void @manyargs(i64 %x1, i64 %x2, i64 %x3, i64 %x4, i64 %x5, i64 %x6, i64 %x7, i64 %x8, i64 %x9, i64 %x10) {
+  %1 = call i8* @llvm.sponentry()
+  %2 = load [24 x i64]*, [24 x i64]** @env2, align 8
+  %3 = getelementptr inbounds [24 x i64], [24 x i64]* %2, i32 0, i32 0
+  %4 = bitcast i64* %3 to i8*
+  %5 = call i32 @_setjmpex(i8* %4, i8* %1) #2
+  ret void
+}
+
+; CHECK: manyargs:
+; CHECK: stp     x29, x30, [sp, #-16]!
+; CHECK: add     x1, x29, #16
+
+; NOFP: str     x30, [sp, #-16]!
+; NOFP: add     x1, sp, #16
+
+; Function Attrs: nounwind readnone
+declare i8* @llvm.sponentry()
+
+; Function Attrs: returns_twice
+declare dso_local i32 @_setjmpex(i8*, i8*)
+
+; Function Attrs: nounwind
+declare void @llvm.va_start(i8*) #1
+
+; Function Attrs: nounwind
+declare void @llvm.va_end(i8*) #1
-- 
GitLab


From badd06e25e57f049fe9b33a12119d36ef168ccc7 Mon Sep 17 00:00:00 2001
From: Mandeep Singh Grang <mgrang@codeaurora.org>
Date: Thu, 1 Nov 2018 23:34:12 +0000
Subject: [PATCH 0897/1116] [gold-plugin] Fix a bunch of build warnings

Phabricator: https://reviews.llvm.org/D53997

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345910 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/gold/gold-plugin.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tools/gold/gold-plugin.cpp b/tools/gold/gold-plugin.cpp
index ba9d3ac9345..71e5b72a40c 100644
--- a/tools/gold/gold-plugin.cpp
+++ b/tools/gold/gold-plugin.cpp
@@ -447,9 +447,8 @@ static void diagnosticHandler(const DiagnosticInfo &DI) {
   ld_plugin_level Level;
   switch (DI.getSeverity()) {
   case DS_Error:
-    message(LDPL_FATAL, "LLVM gold plugin has failed to create LTO module: %s",
-            ErrStorage.c_str());
-    return;
+    Level = LDPL_FATAL;
+    break;
   case DS_Warning:
     Level = LDPL_WARNING;
     break;
-- 
GitLab


From 782574b28c5fab38643e3b097fd78fe37d68cadc Mon Sep 17 00:00:00 2001
From: Alina Sbirlea <asbirlea@google.com>
Date: Thu, 1 Nov 2018 23:37:51 +0000
Subject: [PATCH 0898/1116] [AliasSetTracker] Misc cleanup (NFCI)

Summary: Remove two redundant checks, add one in the unit test. Remove an unused method. Fix computation of TotalMayAliasSetSize.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345911 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/AliasSetTracker.h    |  4 ----
 lib/Analysis/AliasSetTracker.cpp           | 20 ++++++--------------
 unittests/Analysis/AliasSetTrackerTest.cpp |  2 ++
 3 files changed, 8 insertions(+), 18 deletions(-)

diff --git a/include/llvm/Analysis/AliasSetTracker.h b/include/llvm/Analysis/AliasSetTracker.h
index d24453749fe..7ed5cd5c473 100644
--- a/include/llvm/Analysis/AliasSetTracker.h
+++ b/include/llvm/Analysis/AliasSetTracker.h
@@ -389,10 +389,6 @@ public:
   /// set is returned.
   AliasSet &getAliasSetFor(const MemoryLocation &MemLoc);
 
-  /// Return true if the specified instruction "may" (or must) alias one of the
-  /// members in any of the sets.
-  bool containsUnknown(const Instruction *I) const;
-
   /// Return the underlying alias analysis object used by this tracker.
   AliasAnalysis &getAliasAnalysis() const { return AA; }
 
diff --git a/lib/Analysis/AliasSetTracker.cpp b/lib/Analysis/AliasSetTracker.cpp
index 22c8ae20113..c152b0ddeca 100644
--- a/lib/Analysis/AliasSetTracker.cpp
+++ b/lib/Analysis/AliasSetTracker.cpp
@@ -114,10 +114,9 @@ void AliasSetTracker::removeAliasSet(AliasSet *AS) {
   if (AliasSet *Fwd = AS->Forward) {
     Fwd->dropRef(*this);
     AS->Forward = nullptr;
-  }
-
-  if (AS->Alias == AliasSet::SetMayAlias)
-    TotalMayAliasSetSize -= AS->size();
+  } else // Update TotalMayAliasSetSize only if not forwarding.
+      if (AS->Alias == AliasSet::SetMayAlias)
+        TotalMayAliasSetSize -= AS->size();
 
   AliasSets.erase(AS);
 }
@@ -232,8 +231,8 @@ bool AliasSet::aliasesUnknownInst(const Instruction *Inst,
   if (AliasAny)
     return true;
 
-  if (!Inst->mayReadOrWriteMemory())
-    return false;
+  assert(Inst->mayReadOrWriteMemory() &&
+         "Instruction must either read or write memory.");
 
   for (unsigned i = 0, e = UnknownInsts.size(); i != e; ++i) {
     if (auto *UnknownInst = getUnknownInst(i)) {
@@ -311,13 +310,6 @@ AliasSet *AliasSetTracker::mergeAliasSetsForPointer(const Value *Ptr,
   return FoundSet;
 }
 
-bool AliasSetTracker::containsUnknown(const Instruction *Inst) const {
-  for (const AliasSet &AS : *this)
-    if (!AS.Forward && AS.aliasesUnknownInst(Inst, AA))
-      return true;
-  return false;
-}
-
 AliasSet *AliasSetTracker::findAliasSetForUnknownInst(Instruction *Inst) {
   AliasSet *FoundSet = nullptr;
   for (iterator I = begin(), E = end(); I != E;) {
@@ -326,7 +318,7 @@ AliasSet *AliasSetTracker::findAliasSetForUnknownInst(Instruction *Inst) {
       continue;
     if (!FoundSet)            // If this is the first alias set ptr can go into.
       FoundSet = &*Cur;       // Remember it.
-    else if (!Cur->Forward)   // Otherwise, we must merge the sets.
+    else   // Otherwise, we must merge the sets.
       FoundSet->mergeSetIn(*Cur, *this);     // Merge in contents.
   }
   return FoundSet;
diff --git a/unittests/Analysis/AliasSetTrackerTest.cpp b/unittests/Analysis/AliasSetTrackerTest.cpp
index 886971c4d3a..57d21e2fcb8 100644
--- a/unittests/Analysis/AliasSetTrackerTest.cpp
+++ b/unittests/Analysis/AliasSetTrackerTest.cpp
@@ -78,6 +78,8 @@ TEST(AliasSetTracker, AliasUnknownInst) {
   for (auto &Inst : *Test->begin()) {
     bool FoundAS = false;
     for (AliasSet &AS : AST) {
+      if (!Inst.mayReadOrWriteMemory())
+        continue;
       if (!AS.aliasesUnknownInst(&Inst, AA))
         continue;
       ASSERT_NE(FoundAS, true);
-- 
GitLab


From a00405d87a7be80594eddf7aa859a99b19110cf7 Mon Sep 17 00:00:00 2001
From: Thomas Lively <tlively@google.com>
Date: Fri, 2 Nov 2018 00:06:56 +0000
Subject: [PATCH 0899/1116] [WebAssembly] Expand inserts and extracts with
 variable indices

Reviewers: aheejin, dschuff

Subscribers: sbc100, jgravelle-google, sunfish, llvm-commits

Differential Revision: https://reviews.llvm.org/D53964

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345913 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../WebAssembly/WebAssemblyISelLowering.cpp   |  29 ++
 .../WebAssembly/WebAssemblyISelLowering.h     |   1 +
 test/CodeGen/WebAssembly/simd.ll              | 340 ++++++++++++++++++
 3 files changed, 370 insertions(+)

diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 70720e0c32c..d182bd9f369 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -209,6 +209,20 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
     }
   }
 
+  // Custom lower lane accesses to expand out variable indices
+  if (Subtarget->hasSIMD128()) {
+    for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32}) {
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, T, Custom);
+      setOperationAction(ISD::INSERT_VECTOR_ELT, T, Custom);
+    }
+    if (EnableUnimplementedWasmSIMDInstrs) {
+      for (auto T : {MVT::v2i64, MVT::v2f64}) {
+        setOperationAction(ISD::EXTRACT_VECTOR_ELT, T, Custom);
+        setOperationAction(ISD::INSERT_VECTOR_ELT, T, Custom);
+      }
+    }
+  }
+
   // Trap lowers to wasm unreachable
   setOperationAction(ISD::TRAP, MVT::Other, Legal);
 
@@ -859,6 +873,9 @@ SDValue WebAssemblyTargetLowering::LowerOperation(SDValue Op,
     return LowerCopyToReg(Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN:
     return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::EXTRACT_VECTOR_ELT:
+  case ISD::INSERT_VECTOR_ELT:
+    return LowerAccessVectorElement(Op, DAG);
   case ISD::VECTOR_SHUFFLE:
     return LowerVECTOR_SHUFFLE(Op, DAG);
   case ISD::SHL:
@@ -1050,6 +1067,18 @@ WebAssemblyTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
   return DAG.getNode(WebAssemblyISD::SHUFFLE, DL, Op.getValueType(), Ops);
 }
 
+SDValue
+WebAssemblyTargetLowering::LowerAccessVectorElement(SDValue Op,
+                                                    SelectionDAG &DAG) const {
+  // Allow constant lane indices, expand variable lane indices
+  SDNode *IdxNode = Op.getOperand(Op.getNumOperands() - 1).getNode();
+  if (isa<ConstantSDNode>(IdxNode) || IdxNode->isUndef())
+    return Op;
+  else
+    // Perform default expansion
+    return SDValue();
+}
+
 SDValue WebAssemblyTargetLowering::LowerShift(SDValue Op,
                                               SelectionDAG &DAG) const {
   SDLoc DL(Op);
diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/lib/Target/WebAssembly/WebAssemblyISelLowering.h
index 61e78c71f2e..5182a58efc7 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.h
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.h
@@ -99,6 +99,7 @@ private:
   SDValue LowerCopyToReg(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerAccessVectorElement(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const;
 };
 
diff --git a/test/CodeGen/WebAssembly/simd.ll b/test/CodeGen/WebAssembly/simd.ll
index 1e1feeb35df..55a325b939c 100644
--- a/test/CodeGen/WebAssembly/simd.ll
+++ b/test/CodeGen/WebAssembly/simd.ll
@@ -54,6 +54,26 @@ define i32 @extract_v16i8_s(<16 x i8> %v) {
   ret i32 %a
 }
 
+; CHECK-LABEL: extract_var_v16i8_s:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL
+; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16
+; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]
+; SIMD128-NEXT: tee_local $push[[L3:[0-9]+]]=, $2=, $pop[[L2]]
+; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0
+; SIMD128-NEXT: i32.const $push[[L4:[0-9]+]]=, 15
+; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L4]]
+; SIMD128-NEXT: i32.or $push[[L6:[0-9]+]]=, $2, $pop[[L5]]
+; SIMD128-NEXT: i32.load8_s $push[[R:[0-9]+]]=, 0($pop[[L6]])
+; SIMD128-NEXT: return $pop[[R]]
+define i32 @extract_var_v16i8_s(<16 x i8> %v, i32 %i) {
+  %elem = extractelement <16 x i8> %v, i32 %i
+  %a = sext i8 %elem to i32
+  ret i32 %a
+}
+
 ; CHECK-LABEL: extract_undef_v16i8_s:
 ; NO-SIMD128-NOT: i8x16
 ; SIMD128-NEXT: .param v128{{$}}
@@ -78,6 +98,26 @@ define i32 @extract_v16i8_u(<16 x i8> %v) {
   ret i32 %a
 }
 
+; CHECK-LABEL: extract_var_v16i8_u:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
+; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
+; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: tee_local $push[[L3:[0-9]+]]=, $2=, $pop[[L2]]{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
+; SIMD128-NEXT: i32.const $push[[L4:[0-9]+]]=, 15{{$}}
+; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L4]]{{$}}
+; SIMD128-NEXT: i32.or $push[[L6:[0-9]+]]=, $2, $pop[[L5]]{{$}}
+; SIMD128-NEXT: i32.load8_u $push[[R:[0-9]+]]=, 0($pop[[L6]]){{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define i32 @extract_var_v16i8_u(<16 x i8> %v, i32 %i) {
+  %elem = extractelement <16 x i8> %v, i32 %i
+  %a = zext i8 %elem to i32
+  ret i32 %a
+}
+
 ; CHECK-LABEL: extract_undef_v16i8_u:
 ; NO-SIMD128-NOT: i8x16
 ; SIMD128-NEXT: .param v128{{$}}
@@ -101,6 +141,25 @@ define i8 @extract_v16i8(<16 x i8> %v) {
   ret i8 %elem
 }
 
+; CHECK-LABEL: extract_var_v16i8:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
+; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
+; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: tee_local $push[[L3:[0-9]+]]=, $2=, $pop[[L2]]{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
+; SIMD128-NEXT: i32.const $push[[L4:[0-9]+]]=, 15{{$}}
+; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L4]]{{$}}
+; SIMD128-NEXT: i32.or $push[[L6:[0-9]+]]=, $2, $pop[[L5]]{{$}}
+; SIMD128-NEXT: i32.load8_u $push[[R:[0-9]+]]=, 0($pop[[L6]]){{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define i8 @extract_var_v16i8(<16 x i8> %v, i32 %i) {
+  %elem = extractelement <16 x i8> %v, i32 %i
+  ret i8 %elem
+}
+
 ; CHECK-LABEL: extract_undef_v16i8:
 ; NO-SIMD128-NOT: i8x16
 ; SIMD128-NEXT: .param v128{{$}}
@@ -123,6 +182,26 @@ define <16 x i8> @replace_v16i8(<16 x i8> %v, i8 %x) {
   ret <16 x i8> %res
 }
 
+; CHECK-LABEL: replace_var_v16i8:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128, i32, i32{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
+; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
+; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: tee_local $push[[L3:[0-9]+]]=, $3=, $pop[[L2]]{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
+; SIMD128-NEXT: i32.const $push[[L4:[0-9]+]]=, 15{{$}}
+; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L4]]{{$}}
+; SIMD128-NEXT: i32.or $push[[L6:[0-9]+]]=, $3, $pop[[L5]]{{$}}
+; SIMD128-NEXT: i32.store8 0($pop[[L6]]), $2{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($3){{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @replace_var_v16i8(<16 x i8> %v, i32 %i, i8 %x) {
+  %res = insertelement <16 x i8> %v, i8 %x, i32 %i
+  ret <16 x i8> %res
+}
+
 ; CHECK-LABEL: replace_undef_v16i8:
 ; NO-SIMD128-NOT: i8x16
 ; SIMD128-NEXT: .param v128, i32{{$}}
@@ -252,6 +331,28 @@ define i32 @extract_v8i16_s(<8 x i16> %v) {
   ret i32 %a
 }
 
+; CHECK-LABEL: extract_var_v8i16_s:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
+; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
+; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: tee_local $push[[L3:[0-9]+]]=, $2=, $pop[[L2]]{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
+; SIMD128-NEXT: i32.const $push[[L4:[0-9]+]]=, 7{{$}}
+; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L4]]{{$}}
+; SIMD128-NEXT: i32.const $push[[L6:[0-9]+]]=, 1{{$}}
+; SIMD128-NEXT: i32.shl $push[[L7:[0-9]+]]=, $pop[[L5]], $pop[[L6]]{{$}}
+; SIMD128-NEXT: i32.or $push[[L8:[0-9]+]]=, $2, $pop[[L7]]{{$}}
+; SIMD128-NEXT: i32.load16_s $push[[R:[0-9]+]]=, 0($pop[[L8]]){{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define i32 @extract_var_v8i16_s(<8 x i16> %v, i32 %i) {
+  %elem = extractelement <8 x i16> %v, i32 %i
+  %a = sext i16 %elem to i32
+  ret i32 %a
+}
+
 ; CHECK-LABEL: extract_undef_v8i16_s:
 ; NO-SIMD128-NOT: i16x8
 ; SIMD128-NEXT: .param v128{{$}}
@@ -276,6 +377,28 @@ define i32 @extract_v8i16_u(<8 x i16> %v) {
   ret i32 %a
 }
 
+; CHECK-LABEL: extract_var_v8i16_u:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
+; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
+; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: tee_local $push[[L3:[0-9]+]]=, $2=, $pop[[L2]]{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
+; SIMD128-NEXT: i32.const $push[[L4:[0-9]+]]=, 7{{$}}
+; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L4]]{{$}}
+; SIMD128-NEXT: i32.const $push[[L6:[0-9]+]]=, 1{{$}}
+; SIMD128-NEXT: i32.shl $push[[L7:[0-9]+]]=, $pop[[L5]], $pop[[L6]]{{$}}
+; SIMD128-NEXT: i32.or $push[[L8:[0-9]+]]=, $2, $pop[[L7]]{{$}}
+; SIMD128-NEXT: i32.load16_u $push[[R:[0-9]+]]=, 0($pop[[L8]]){{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define i32 @extract_var_v8i16_u(<8 x i16> %v, i32 %i) {
+  %elem = extractelement <8 x i16> %v, i32 %i
+  %a = zext i16 %elem to i32
+  ret i32 %a
+}
+
 ; CHECK-LABEL: extract_undef_v8i16_u:
 ; NO-SIMD128-NOT: i16x8
 ; SIMD128-NEXT: .param v128{{$}}
@@ -299,6 +422,27 @@ define i16 @extract_v8i16(<8 x i16> %v) {
   ret i16 %elem
 }
 
+; CHECK-LABEL: extract_var_v8i16:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
+; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
+; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: tee_local $push[[L3:[0-9]+]]=, $2=, $pop[[L2]]{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
+; SIMD128-NEXT: i32.const $push[[L4:[0-9]+]]=, 7{{$}}
+; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L4]]{{$}}
+; SIMD128-NEXT: i32.const $push[[L6:[0-9]+]]=, 1{{$}}
+; SIMD128-NEXT: i32.shl $push[[L7:[0-9]+]]=, $pop[[L5]], $pop[[L6]]{{$}}
+; SIMD128-NEXT: i32.or $push[[L8:[0-9]+]]=, $2, $pop[[L7]]{{$}}
+; SIMD128-NEXT: i32.load16_u $push[[R:[0-9]+]]=, 0($pop[[L8]]){{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define i16 @extract_var_v8i16(<8 x i16> %v, i32 %i) {
+  %elem = extractelement <8 x i16> %v, i32 %i
+  ret i16 %elem
+}
+
 ; CHECK-LABEL: extract_undef_v8i16:
 ; NO-SIMD128-NOT: i16x8
 ; SIMD128-NEXT: .param v128{{$}}
@@ -321,6 +465,28 @@ define <8 x i16> @replace_v8i16(<8 x i16> %v, i16 %x) {
   ret <8 x i16> %res
 }
 
+; CHECK-LABEL: replace_var_v8i16:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128, i32, i32{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
+; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
+; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: tee_local $push[[L3:[0-9]+]]=, $3=, $pop[[L2]]{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
+; SIMD128-NEXT: i32.const $push[[L4:[0-9]+]]=, 7{{$}}
+; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L4]]{{$}}
+; SIMD128-NEXT: i32.const $push[[L6:[0-9]+]]=, 1{{$}}
+; SIMD128-NEXT: i32.shl $push[[L7:[0-9]+]]=, $pop[[L5]], $pop[[L6]]{{$}}
+; SIMD128-NEXT: i32.or $push[[L8:[0-9]+]]=, $3, $pop[[L7]]{{$}}
+; SIMD128-NEXT: i32.store16 0($pop[[L8]]), $2{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($3){{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @replace_var_v8i16(<8 x i16> %v, i32 %i, i16 %x) {
+  %res = insertelement <8 x i16> %v, i16 %x, i32 %i
+  ret <8 x i16> %res
+}
+
 ; CHECK-LABEL: replace_undef_v8i16:
 ; NO-SIMD128-NOT: i16x8
 ; SIMD128-NEXT: .param v128, i32{{$}}
@@ -427,6 +593,27 @@ define i32 @extract_v4i32(<4 x i32> %v) {
   ret i32 %elem
 }
 
+; CHECK-LABEL: extract_var_v4i32:
+; NO-SIMD128-NOT: i32x4
+; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
+; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
+; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: tee_local $push[[L3:[0-9]+]]=, $2=, $pop[[L2]]{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
+; SIMD128-NEXT: i32.const $push[[L4:[0-9]+]]=, 3{{$}}
+; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L4]]{{$}}
+; SIMD128-NEXT: i32.const $push[[L6:[0-9]+]]=, 2{{$}}
+; SIMD128-NEXT: i32.shl $push[[L7:[0-9]+]]=, $pop[[L5]], $pop[[L6]]{{$}}
+; SIMD128-NEXT: i32.or $push[[L4:[0-9]+]]=, $2, $pop[[L7]]{{$}}
+; SIMD128-NEXT: i32.load $push[[R:[0-9]+]]=, 0($pop[[L4]]){{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define i32 @extract_var_v4i32(<4 x i32> %v, i32 %i) {
+  %elem = extractelement <4 x i32> %v, i32 %i
+  ret i32 %elem
+}
+
 ; CHECK-LABEL: extract_undef_v4i32:
 ; NO-SIMD128-NOT: i32x4
 ; SIMD128-NEXT: .param v128{{$}}
@@ -449,6 +636,28 @@ define <4 x i32> @replace_v4i32(<4 x i32> %v, i32 %x) {
   ret <4 x i32> %res
 }
 
+; CHECK-LABEL: replace_var_v4i32:
+; NO-SIMD128-NOT: i32x4
+; SIMD128-NEXT: .param v128, i32, i32{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
+; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
+; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: tee_local $push[[L3:[0-9]+]]=, $3=, $pop[[L2]]{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
+; SIMD128-NEXT: i32.const $push[[L4:[0-9]+]]=, 3{{$}}
+; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L4]]{{$}}
+; SIMD128-NEXT: i32.const $push[[L6:[0-9]+]]=, 2{{$}}
+; SIMD128-NEXT: i32.shl $push[[L7:[0-9]+]]=, $pop[[L5]], $pop[[L6]]{{$}}
+; SIMD128-NEXT: i32.or $push[[L4:[0-9]+]]=, $3, $pop[[L7]]{{$}}
+; SIMD128-NEXT: i32.store 0($pop[[L4]]), $2{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($3){{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @replace_var_v4i32(<4 x i32> %v, i32 %i, i32 %x) {
+  %res = insertelement <4 x i32> %v, i32 %x, i32 %i
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: replace_undef_v4i32:
 ; NO-SIMD128-NOT: i32x4
 ; SIMD128-NEXT: .param v128, i32{{$}}
@@ -547,6 +756,27 @@ define i64 @extract_v2i64(<2 x i64> %v) {
   ret i64 %elem
 }
 
+; CHECK-LABEL: extract_var_v2i64:
+; NO-SIMD128-NOT: i64x2
+; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .result i64{{$}}
+; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
+; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
+; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: tee_local $push[[L3:[0-9]+]]=, $2=, $pop[[L2]]{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
+; SIMD128-NEXT: i32.const $push[[L2:[0-9]+]]=, 1{{$}}
+; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L2]]{{$}}
+; SIMD128-NEXT: i32.const $push[[L6:[0-9]+]]=, 3{{$}}
+; SIMD128-NEXT: i32.shl $push[[L7:[0-9]+]]=, $pop[[L5]], $pop[[L6]]{{$}}
+; SIMD128-NEXT: i32.or $push[[L2:[0-9]+]]=, $2, $pop[[L7]]{{$}}
+; SIMD128-NEXT: i64.load $push[[R:[0-9]+]]=, 0($pop[[L2]]){{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define i64 @extract_var_v2i64(<2 x i64> %v, i32 %i) {
+  %elem = extractelement <2 x i64> %v, i32 %i
+  ret i64 %elem
+}
+
 ; CHECK-LABEL: extract_undef_v2i64:
 ; NO-SIMD128-NOT: i64x2
 ; SIMD128-VM-NOT: i64x2
@@ -571,6 +801,29 @@ define <2 x i64> @replace_v2i64(<2 x i64> %v, i64 %x) {
   ret <2 x i64> %res
 }
 
+; CHECK-LABEL: replace_var_v2i64:
+; NO-SIMD128-NOT: i64x2
+; SIMD128-VM-NOT: i64x2
+; SIMD128-NEXT: .param v128, i32, i64{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
+; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
+; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: tee_local $push[[L3:[0-9]+]]=, $3=, $pop[[L2]]{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
+; SIMD128-NEXT: i32.const $push[[L2:[0-9]+]]=, 1{{$}}
+; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L2]]{{$}}
+; SIMD128-NEXT: i32.const $push[[L6:[0-9]+]]=, 3{{$}}
+; SIMD128-NEXT: i32.shl $push[[L7:[0-9]+]]=, $pop[[L5]], $pop[[L6]]{{$}}
+; SIMD128-NEXT: i32.or $push[[L2:[0-9]+]]=, $3, $pop[[L7]]{{$}}
+; SIMD128-NEXT: i64.store 0($pop[[L2]]), $2{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($3){{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @replace_var_v2i64(<2 x i64> %v, i32 %i, i64 %x) {
+  %res = insertelement <2 x i64> %v, i64 %x, i32 %i
+  ret <2 x i64> %res
+}
+
 ; CHECK-LABEL: replace_undef_v2i64:
 ; NO-SIMD128-NOT: i64x2
 ; SIMD128-VM-NOT: i64x2
@@ -666,6 +919,27 @@ define float @extract_v4f32(<4 x float> %v) {
   ret float %elem
 }
 
+; CHECK-LABEL: extract_var_v4f32:
+; NO-SIMD128-NOT: i64x2
+; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .result f32{{$}}
+; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
+; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
+; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: tee_local $push[[L3:[0-9]+]]=, $2=, $pop[[L2]]{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
+; SIMD128-NEXT: i32.const $push[[L2:[0-9]+]]=, 3{{$}}
+; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L2]]{{$}}
+; SIMD128-NEXT: i32.const $push[[L6:[0-9]+]]=, 2{{$}}
+; SIMD128-NEXT: i32.shl $push[[L7:[0-9]+]]=, $pop[[L5]], $pop[[L6]]{{$}}
+; SIMD128-NEXT: i32.or $push[[L2:[0-9]+]]=, $2, $pop[[L7]]{{$}}
+; SIMD128-NEXT: f32.load $push[[R:[0-9]+]]=, 0($pop[[L2]]){{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define float @extract_var_v4f32(<4 x float> %v, i32 %i) {
+  %elem = extractelement <4 x float> %v, i32 %i
+  ret float %elem
+}
+
 ; CHECK-LABEL: extract_undef_v4f32:
 ; NO-SIMD128-NOT: f32x4
 ; SIMD128-NEXT: .param v128{{$}}
@@ -688,6 +962,28 @@ define <4 x float> @replace_v4f32(<4 x float> %v, float %x) {
   ret <4 x float> %res
 }
 
+; CHECK-LABEL: replace_var_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128, i32, f32{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
+; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
+; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: tee_local $push[[L3:[0-9]+]]=, $3=, $pop[[L2]]{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
+; SIMD128-NEXT: i32.const $push[[L2:[0-9]+]]=, 3{{$}}
+; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L2]]{{$}}
+; SIMD128-NEXT: i32.const $push[[L6:[0-9]+]]=, 2{{$}}
+; SIMD128-NEXT: i32.shl $push[[L7:[0-9]+]]=, $pop[[L5]], $pop[[L6]]{{$}}
+; SIMD128-NEXT: i32.or $push[[L2:[0-9]+]]=, $3, $pop[[L7]]{{$}}
+; SIMD128-NEXT: f32.store 0($pop[[L2]]), $2{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($3){{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x float> @replace_var_v4f32(<4 x float> %v, i32 %i, float %x) {
+  %res = insertelement <4 x float> %v, float %x, i32 %i
+  ret <4 x float> %res
+}
+
 ; CHECK-LABEL: replace_undef_v4f32:
 ; NO-SIMD128-NOT: f32x4
 ; SIMD128-NEXT: .param v128, f32{{$}}
@@ -785,6 +1081,27 @@ define double @extract_v2f64(<2 x double> %v) {
   ret double %elem
 }
 
+; CHECK-LABEL: extract_var_v2f64:
+; NO-SIMD128-NOT: i62x2
+; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .result f64{{$}}
+; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
+; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
+; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: tee_local $push[[L3:[0-9]+]]=, $2=, $pop[[L2]]{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
+; SIMD128-NEXT: i32.const $push[[L2:[0-9]+]]=, 1{{$}}
+; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L2]]{{$}}
+; SIMD128-NEXT: i32.const $push[[L6:[0-9]+]]=, 3{{$}}
+; SIMD128-NEXT: i32.shl $push[[L7:[0-9]+]]=, $pop[[L5]], $pop[[L6]]{{$}}
+; SIMD128-NEXT: i32.or $push[[L2:[0-9]+]]=, $2, $pop[[L7]]{{$}}
+; SIMD128-NEXT: f64.load $push[[R:[0-9]+]]=, 0($pop[[L2]]){{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define double @extract_var_v2f64(<2 x double> %v, i32 %i) {
+  %elem = extractelement <2 x double> %v, i32 %i
+  ret double %elem
+}
+
 ; CHECK-LABEL: extract_undef_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
@@ -809,6 +1126,29 @@ define <2 x double> @replace_v2f64(<2 x double> %v, double %x) {
   ret <2 x double> %res
 }
 
+; CHECK-LABEL: replace_var_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-VM-NOT: f64x2
+; SIMD128-NEXT: .param v128, i32, f64{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
+; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
+; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: tee_local $push[[L3:[0-9]+]]=, $3=, $pop[[L2]]{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
+; SIMD128-NEXT: i32.const $push[[L2:[0-9]+]]=, 1{{$}}
+; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L2]]{{$}}
+; SIMD128-NEXT: i32.const $push[[L6:[0-9]+]]=, 3{{$}}
+; SIMD128-NEXT: i32.shl $push[[L7:[0-9]+]]=, $pop[[L5]], $pop[[L6]]{{$}}
+; SIMD128-NEXT: i32.or $push[[L2:[0-9]+]]=, $3, $pop[[L7]]{{$}}
+; SIMD128-NEXT: f64.store 0($pop[[L2]]), $2{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($3){{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x double> @replace_var_v2f64(<2 x double> %v, i32 %i, double %x) {
+  %res = insertelement <2 x double> %v, double %x, i32 %i
+  ret <2 x double> %res
+}
+
 ; CHECK-LABEL: replace_undef_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
-- 
GitLab


From c46ca76214ef8434096ae4d018830efe7aee67af Mon Sep 17 00:00:00 2001
From: Max Kazantsev <max.kazantsev@azul.com>
Date: Fri, 2 Nov 2018 00:21:45 +0000
Subject: [PATCH 0900/1116] [NFC][LICM] Factor out instruction erasing logic

This patch factors out a function that makes all required updates
whenever an instruction gets erased.

Differential Revision: https://reviews.llvm.org/D54011
Reviewed By: apilipenko


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345914 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/LICM.cpp | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index e72342b88b6..d379808cd7f 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -123,6 +123,8 @@ CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
                             const LoopInfo *LI,
                             const LoopSafetyInfo *SafetyInfo);
 
+static void eraseInstruction(Instruction &I, AliasSetTracker *AST);
+
 namespace {
 struct LoopInvariantCodeMotion {
   using ASTrackerMapTy = DenseMap<Loop *, std::unique_ptr<AliasSetTracker>>;
@@ -404,8 +406,7 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
         LLVM_DEBUG(dbgs() << "LICM deleting dead inst: " << I << '\n');
         salvageDebugInfo(I);
         ++II;
-        CurAST->deleteValue(&I);
-        I.eraseFromParent();
+        eraseInstruction(I, CurAST);
         Changed = true;
         continue;
       }
@@ -422,8 +423,7 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
         if (sink(I, LI, DT, CurLoop, SafetyInfo, ORE, FreeInLoop)) {
           if (!FreeInLoop) {
             ++II;
-            CurAST->deleteValue(&I);
-            I.eraseFromParent();
+            eraseInstruction(I, CurAST);
           }
           Changed = true;
         }
@@ -480,10 +480,8 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
                           << '\n');
         CurAST->copyValue(&I, C);
         I.replaceAllUsesWith(C);
-        if (isInstructionTriviallyDead(&I, TLI)) {
-          CurAST->deleteValue(&I);
-          I.eraseFromParent();
-        }
+        if (isInstructionTriviallyDead(&I, TLI))
+          eraseInstruction(I, CurAST);
         Changed = true;
         continue;
       }
@@ -519,7 +517,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
         Product->setFastMathFlags(I.getFastMathFlags());
         Product->insertAfter(&I);
         I.replaceAllUsesWith(Product);
-        I.eraseFromParent();
+        eraseInstruction(I, CurAST);
 
         hoist(*ReciprocalDivisor, DT, CurLoop, SafetyInfo, ORE);
         Changed = true;
@@ -888,6 +886,12 @@ CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
   return New;
 }
 
+static void eraseInstruction(Instruction &I, AliasSetTracker *AST) {
+  if (AST)
+    AST->deleteValue(&I);
+  I.eraseFromParent();
+}
+
 static Instruction *sinkThroughTriviallyReplaceablePHI(
     PHINode *TPN, Instruction *I, LoopInfo *LI,
     SmallDenseMap<BasicBlock *, Instruction *, 32> &SunkCopies,
@@ -1086,7 +1090,7 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
     Instruction *New = sinkThroughTriviallyReplaceablePHI(PN, &I, LI, SunkCopies,
                                                           SafetyInfo, CurLoop);
     PN->replaceAllUsesWith(New);
-    PN->eraseFromParent();
+    eraseInstruction(*PN, nullptr);
     Changed = true;
   }
   return Changed;
@@ -1516,7 +1520,7 @@ bool llvm::promoteLoopAccessesToScalars(
 
   // If the SSAUpdater didn't use the load in the preheader, just zap it now.
   if (PreheaderLoad->use_empty())
-    PreheaderLoad->eraseFromParent();
+    eraseInstruction(*PreheaderLoad, CurAST);
 
   return true;
 }
-- 
GitLab


From 437a9372206c957f6becf3811c07a4924b711d51 Mon Sep 17 00:00:00 2001
From: Thomas Lively <tlively@google.com>
Date: Fri, 2 Nov 2018 00:39:57 +0000
Subject: [PATCH 0901/1116] [WebAssembly] General vector shift lowering

Summary: Adds support for lowering non-splat shifts.

Reviewers: aheejin, dschuff

Subscribers: sbc100, jgravelle-google, sunfish, llvm-commits

Differential Revision: https://reviews.llvm.org/D53625

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345916 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../WebAssembly/WebAssemblyISelLowering.cpp   |  39 ++-
 test/CodeGen/WebAssembly/simd-arith.ll        | 241 ++++++++++++++++++
 2 files changed, 268 insertions(+), 12 deletions(-)

diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index d182bd9f369..578d23570f8 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -146,10 +146,15 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
     }
   }
 
-  // Custom lowering to avoid having to emit a wrap for 2xi64 constant shifts
-  if (Subtarget->hasSIMD128() && EnableUnimplementedWasmSIMDInstrs)
-    for (auto Op : {ISD::SHL, ISD::SRA, ISD::SRL})
-      setOperationAction(Op, MVT::v2i64, Custom);
+  // Custom lowering since wasm shifts must have a scalar shift amount
+  if (Subtarget->hasSIMD128()) {
+    for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
+      for (auto Op : {ISD::SHL, ISD::SRA, ISD::SRL})
+        setOperationAction(Op, T, Custom);
+    if (EnableUnimplementedWasmSIMDInstrs)
+      for (auto Op : {ISD::SHL, ISD::SRA, ISD::SRL})
+        setOperationAction(Op, MVT::v2i64, Custom);
+  }
 
   // There is no select instruction for vectors
   if (Subtarget->hasSIMD128()) {
@@ -1082,13 +1087,23 @@ WebAssemblyTargetLowering::LowerAccessVectorElement(SDValue Op,
 SDValue WebAssemblyTargetLowering::LowerShift(SDValue Op,
                                               SelectionDAG &DAG) const {
   SDLoc DL(Op);
-  auto *ShiftVec = dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
-  APInt SplatValue, SplatUndef;
-  unsigned SplatBitSize;
-  bool HasAnyUndefs;
-  if (!ShiftVec || !ShiftVec->isConstantSplat(SplatValue, SplatUndef,
-                                              SplatBitSize, HasAnyUndefs))
+
+  // Only manually lower vector shifts
+  assert(Op.getSimpleValueType().isVector());
+
+  // Unroll non-splat vector shifts
+  BuildVectorSDNode *ShiftVec;
+  SDValue SplatVal;
+  if (!(ShiftVec = dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode())) ||
+      !(SplatVal = ShiftVec->getSplatValue()))
+    return DAG.UnrollVectorOp(Op.getNode());
+
+  // All splats except i64x2 const splats are handled by patterns
+  ConstantSDNode *SplatConst = dyn_cast<ConstantSDNode>(SplatVal);
+  if (!SplatConst || Op.getSimpleValueType() != MVT::v2i64)
     return Op;
+
+  // i64x2 const splats are custom lowered to avoid unnecessary wraps
   unsigned Opcode;
   switch (Op.getOpcode()) {
   case ISD::SHL:
@@ -1102,10 +1117,10 @@ SDValue WebAssemblyTargetLowering::LowerShift(SDValue Op,
     break;
   default:
     llvm_unreachable("unexpected opcode");
-    return Op;
   }
+  APInt Shift = SplatConst->getAPIntValue().zextOrTrunc(32);
   return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(0),
-                     DAG.getConstant(SplatValue.trunc(32), DL, MVT::i32));
+                     DAG.getConstant(Shift, DL, MVT::i32));
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/test/CodeGen/WebAssembly/simd-arith.ll b/test/CodeGen/WebAssembly/simd-arith.ll
index 573f4fff5ad..e092cd98ecb 100644
--- a/test/CodeGen/WebAssembly/simd-arith.ll
+++ b/test/CodeGen/WebAssembly/simd-arith.ll
@@ -92,6 +92,25 @@ define <16 x i8> @shl_const_v16i8(<16 x i8> %v) {
   ret <16 x i8> %a
 }
 
+; CHECK-LABEL: shl_vec_v16i8:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i8x16.extract_lane_u $push[[L0:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: i8x16.extract_lane_u $push[[L1:[0-9]+]]=, $1, 0{{$}}
+; SIMD128-NEXT: i32.shl $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: i8x16.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
+; Skip 14 lanes
+; SIMD128:      i8x16.extract_lane_u $push[[L4:[0-9]+]]=, $0, 15{{$}}
+; SIMD128-NEXT: i8x16.extract_lane_u $push[[L5:[0-9]+]]=, $1, 15{{$}}
+; SIMD128-NEXT: i32.shl $push[[L6:[0-9]+]]=, $pop[[L4]], $pop[[L5]]{{$}}
+; SIMD128-NEXT: i8x16.replace_lane $push[[R:[0-9]+]]=, $pop[[L7:[0-9]+]], 15, $pop[[L6]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @shl_vec_v16i8(<16 x i8> %v, <16 x i8> %x) {
+  %a = shl <16 x i8> %v, %x
+  ret <16 x i8> %a
+}
+
 ; CHECK-LABEL: shr_s_v16i8:
 ; NO-SIMD128-NOT: i8x16
 ; SIMD128-NEXT: .param v128, i32{{$}}
@@ -107,6 +126,33 @@ define <16 x i8> @shr_s_v16i8(<16 x i8> %v, i8 %x) {
   ret <16 x i8> %a
 }
 
+; CHECK-LABEL: shr_s_vec_v16i8:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i8x16.extract_lane_u $push[[L0:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 24{{$}}
+; SIMD128-NEXT: i32.shl $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: i32.const $push[[L3:[0-9]+]]=, 24{{$}}
+; SIMD128-NEXT: i32.shr_s $push[[L4:[0-9]+]]=, $pop[[L2]], $pop[[L3]]{{$}}
+; SIMD128-NEXT: i8x16.extract_lane_u $push[[L5:[0-9]+]]=, $1, 0{{$}}
+; SIMD128-NEXT: i32.shr_s $push[[L6:[0-9]+]]=, $pop[[L4]], $pop[[L5]]{{$}}
+; SIMD128-NEXT: i8x16.splat $push[[L7:[0-9]+]]=, $pop[[L6]]{{$}}
+; Skip 14 lanes
+; SIMD128:      i8x16.extract_lane_u $push[[L7:[0-9]+]]=, $0, 15{{$}}
+; SIMD128-NEXT: i32.const $push[[L8:[0-9]+]]=, 24{{$}}
+; SIMD128-NEXT: i32.shl $push[[L9:[0-9]+]]=, $pop[[L7]], $pop[[L8]]{{$}}
+; SIMD128-NEXT: i32.const $push[[L10:[0-9]+]]=, 24{{$}}
+; SIMD128-NEXT: i32.shr_s $push[[L11:[0-9]+]]=, $pop[[L9]], $pop[[L10]]{{$}}
+; SIMD128-NEXT: i8x16.extract_lane_u $push[[L12:[0-9]+]]=, $1, 15{{$}}
+; SIMD128-NEXT: i32.shr_s $push[[L13:[0-9]+]]=, $pop[[L11]], $pop[[L12]]{{$}}
+; SIMD128-NEXT: i8x16.replace_lane $push[[R:[0-9]+]]=, $pop[[L14:[0-9]+]], 15, $pop[[L13]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @shr_s_vec_v16i8(<16 x i8> %v, <16 x i8> %x) {
+  %a = ashr <16 x i8> %v, %x
+  ret <16 x i8> %a
+}
+
 ; CHECK-LABEL: shr_u_v16i8:
 ; NO-SIMD128-NOT: i8x16
 ; SIMD128-NEXT: .param v128, i32{{$}}
@@ -122,6 +168,25 @@ define <16 x i8> @shr_u_v16i8(<16 x i8> %v, i8 %x) {
   ret <16 x i8> %a
 }
 
+; CHECK-LABEL: shr_u_vec_v16i8:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i8x16.extract_lane_u $push[[L0:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: i8x16.extract_lane_u $push[[L1:[0-9]+]]=, $1, 0{{$}}
+; SIMD128-NEXT: i32.shr_u $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: i8x16.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
+; Skip 14 lanes
+; SIMD128:      i8x16.extract_lane_u $push[[L4:[0-9]+]]=, $0, 15{{$}}
+; SIMD128-NEXT: i8x16.extract_lane_u $push[[L5:[0-9]+]]=, $1, 15{{$}}
+; SIMD128-NEXT: i32.shr_u $push[[L6:[0-9]+]]=, $pop[[L4]], $pop[[L5]]{{$}}
+; SIMD128-NEXT: i8x16.replace_lane $push[[R:[0-9]+]]=, $pop[[L7:[0-9]+]], 15, $pop[[L6]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @shr_u_vec_v16i8(<16 x i8> %v, <16 x i8> %x) {
+  %a = lshr <16 x i8> %v, %x
+  ret <16 x i8> %a
+}
+
 ; CHECK-LABEL: and_v16i8:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -265,6 +330,25 @@ define <8 x i16> @shl_const_v8i16(<8 x i16> %v) {
   ret <8 x i16> %a
 }
 
+; CHECK-LABEL: shl_vec_v8i16:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i16x8.extract_lane_u $push[[L0:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: i16x8.extract_lane_u $push[[L1:[0-9]+]]=, $1, 0{{$}}
+; SIMD128-NEXT: i32.shl $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: i16x8.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
+; Skip 6 lanes
+; SIMD128:      i16x8.extract_lane_u $push[[L4:[0-9]+]]=, $0, 7{{$}}
+; SIMD128-NEXT: i16x8.extract_lane_u $push[[L5:[0-9]+]]=, $1, 7{{$}}
+; SIMD128-NEXT: i32.shl $push[[L6:[0-9]+]]=, $pop[[L4]], $pop[[L5]]{{$}}
+; SIMD128-NEXT: i16x8.replace_lane $push[[R:[0-9]+]]=, $pop[[L7:[0-9]+]], 7, $pop[[L6]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @shl_vec_v8i16(<8 x i16> %v, <8 x i16> %x) {
+  %a = shl <8 x i16> %v, %x
+  ret <8 x i16> %a
+}
+
 ; CHECK-LABEL: shr_s_v8i16:
 ; NO-SIMD128-NOT: i16x8
 ; SIMD128-NEXT: .param v128, i32{{$}}
@@ -279,6 +363,33 @@ define <8 x i16> @shr_s_v8i16(<8 x i16> %v, i16 %x) {
   ret <8 x i16> %a
 }
 
+; CHECK-LABEL: shr_s_vec_v8i16:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i16x8.extract_lane_u $push[[L0:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
+; SIMD128-NEXT: i32.shl $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: i32.const $push[[L3:[0-9]+]]=, 16{{$}}
+; SIMD128-NEXT: i32.shr_s $push[[L4:[0-9]+]]=, $pop[[L2]], $pop[[L3]]{{$}}
+; SIMD128-NEXT: i16x8.extract_lane_u $push[[L5:[0-9]+]]=, $1, 0{{$}}
+; SIMD128-NEXT: i32.shr_s $push[[L6:[0-9]+]]=, $pop[[L4]], $pop[[L5]]{{$}}
+; SIMD128-NEXT: i16x8.splat $push[[L7:[0-9]+]]=, $pop[[L6]]{{$}}
+; Skip 6 lanes
+; SIMD128:      i16x8.extract_lane_u $push[[L7:[0-9]+]]=, $0, 7{{$}}
+; SIMD128-NEXT: i32.const $push[[L8:[0-9]+]]=, 16{{$}}
+; SIMD128-NEXT: i32.shl $push[[L9:[0-9]+]]=, $pop[[L7]], $pop[[L8]]{{$}}
+; SIMD128-NEXT: i32.const $push[[L10:[0-9]+]]=, 16{{$}}
+; SIMD128-NEXT: i32.shr_s $push[[L11:[0-9]+]]=, $pop[[L9]], $pop[[L10]]{{$}}
+; SIMD128-NEXT: i16x8.extract_lane_u $push[[L12:[0-9]+]]=, $1, 7{{$}}
+; SIMD128-NEXT: i32.shr_s $push[[L13:[0-9]+]]=, $pop[[L11]], $pop[[L12]]{{$}}
+; SIMD128-NEXT: i16x8.replace_lane $push[[R:[0-9]+]]=, $pop[[L14:[0-9]+]], 7, $pop[[L13]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @shr_s_vec_v8i16(<8 x i16> %v, <8 x i16> %x) {
+  %a = ashr <8 x i16> %v, %x
+  ret <8 x i16> %a
+}
+
 ; CHECK-LABEL: shr_u_v8i16:
 ; NO-SIMD128-NOT: i16x8
 ; SIMD128-NEXT: .param v128, i32{{$}}
@@ -293,6 +404,25 @@ define <8 x i16> @shr_u_v8i16(<8 x i16> %v, i16 %x) {
   ret <8 x i16> %a
 }
 
+; CHECK-LABEL: shr_u_vec_v8i16:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i16x8.extract_lane_u $push[[L0:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: i16x8.extract_lane_u $push[[L1:[0-9]+]]=, $1, 0{{$}}
+; SIMD128-NEXT: i32.shr_u $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: i16x8.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
+; Skip 6 lanes
+; SIMD128:      i16x8.extract_lane_u $push[[L4:[0-9]+]]=, $0, 7{{$}}
+; SIMD128-NEXT: i16x8.extract_lane_u $push[[L5:[0-9]+]]=, $1, 7{{$}}
+; SIMD128-NEXT: i32.shr_u $push[[L6:[0-9]+]]=, $pop[[L4]], $pop[[L5]]{{$}}
+; SIMD128-NEXT: i16x8.replace_lane $push[[R:[0-9]+]]=, $pop[[L7:[0-9]+]], 7, $pop[[L6]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @shr_u_vec_v8i16(<8 x i16> %v, <8 x i16> %x) {
+  %a = lshr <8 x i16> %v, %x
+  ret <8 x i16> %a
+}
+
 ; CHECK-LABEL: and_v8i16:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -432,6 +562,25 @@ define <4 x i32> @shl_const_v4i32(<4 x i32> %v) {
   ret <4 x i32> %a
 }
 
+; CHECK-LABEL: shl_vec_v4i32:
+; NO-SIMD128-NOT: i32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32x4.extract_lane $push[[L0:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: i32x4.extract_lane $push[[L1:[0-9]+]]=, $1, 0{{$}}
+; SIMD128-NEXT: i32.shl $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: i32x4.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
+; Skip 2 lanes
+; SIMD128:      i32x4.extract_lane $push[[L4:[0-9]+]]=, $0, 3{{$}}
+; SIMD128-NEXT: i32x4.extract_lane $push[[L5:[0-9]+]]=, $1, 3{{$}}
+; SIMD128-NEXT: i32.shl $push[[L6:[0-9]+]]=, $pop[[L4]], $pop[[L5]]{{$}}
+; SIMD128-NEXT: i32x4.replace_lane $push[[R:[0-9]+]]=, $pop[[L7:[0-9]+]], 3, $pop[[L6]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @shl_vec_v4i32(<4 x i32> %v, <4 x i32> %x) {
+  %a = shl <4 x i32> %v, %x
+  ret <4 x i32> %a
+}
+
 ; CHECK-LABEL: shr_s_v4i32:
 ; NO-SIMD128-NOT: i32x4
 ; SIMD128-NEXT: .param v128, i32{{$}}
@@ -446,6 +595,25 @@ define <4 x i32> @shr_s_v4i32(<4 x i32> %v, i32 %x) {
   ret <4 x i32> %a
 }
 
+; CHECK-LABEL: shr_s_vec_v4i32:
+; NO-SIMD128-NOT: i32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32x4.extract_lane $push[[L0:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: i32x4.extract_lane $push[[L1:[0-9]+]]=, $1, 0{{$}}
+; SIMD128-NEXT: i32.shr_s $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: i32x4.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
+; Skip 2 lanes
+; SIMD128:      i32x4.extract_lane $push[[L4:[0-9]+]]=, $0, 3{{$}}
+; SIMD128-NEXT: i32x4.extract_lane $push[[L5:[0-9]+]]=, $1, 3{{$}}
+; SIMD128-NEXT: i32.shr_s $push[[L6:[0-9]+]]=, $pop[[L4]], $pop[[L5]]{{$}}
+; SIMD128-NEXT: i32x4.replace_lane $push[[R:[0-9]+]]=, $pop[[L7:[0-9]+]], 3, $pop[[L6]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @shr_s_vec_v4i32(<4 x i32> %v, <4 x i32> %x) {
+  %a = ashr <4 x i32> %v, %x
+  ret <4 x i32> %a
+}
+
 ; CHECK-LABEL: shr_u_v4i32:
 ; NO-SIMD128-NOT: i32x4
 ; SIMD128-NEXT: .param v128, i32{{$}}
@@ -460,6 +628,25 @@ define <4 x i32> @shr_u_v4i32(<4 x i32> %v, i32 %x) {
   ret <4 x i32> %a
 }
 
+; CHECK-LABEL: shr_u_vec_v4i32:
+; NO-SIMD128-NOT: i32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32x4.extract_lane $push[[L0:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: i32x4.extract_lane $push[[L1:[0-9]+]]=, $1, 0{{$}}
+; SIMD128-NEXT: i32.shr_u $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: i32x4.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
+; Skip 2 lanes
+; SIMD128:      i32x4.extract_lane $push[[L4:[0-9]+]]=, $0, 3{{$}}
+; SIMD128-NEXT: i32x4.extract_lane $push[[L5:[0-9]+]]=, $1, 3{{$}}
+; SIMD128-NEXT: i32.shr_u $push[[L6:[0-9]+]]=, $pop[[L4]], $pop[[L5]]{{$}}
+; SIMD128-NEXT: i32x4.replace_lane $push[[R:[0-9]+]]=, $pop[[L7:[0-9]+]], 3, $pop[[L6]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @shr_u_vec_v4i32(<4 x i32> %v, <4 x i32> %x) {
+  %a = lshr <4 x i32> %v, %x
+  ret <4 x i32> %a
+}
+
 ; CHECK-LABEL: and_v4i32:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -613,6 +800,24 @@ define <2 x i64> @shl_const_v2i64(<2 x i64> %v) {
   ret <2 x i64> %a
 }
 
+; CHECK-LABEL: shl_vec_v2i64:
+; NO-SIMD128-NOT: i64x2
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i64x2.extract_lane $push[[L0:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: i64x2.extract_lane $push[[L1:[0-9]+]]=, $1, 0{{$}}
+; SIMD128-NEXT: i64.shl $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: i64x2.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
+; SIMD128-NEXT: i64x2.extract_lane $push[[L4:[0-9]+]]=, $0, 1{{$}}
+; SIMD128-NEXT: i64x2.extract_lane $push[[L5:[0-9]+]]=, $1, 1{{$}}
+; SIMD128-NEXT: i64.shl $push[[L6:[0-9]+]]=, $pop[[L4]], $pop[[L5]]{{$}}
+; SIMD128-NEXT: i64x2.replace_lane $push[[R:[0-9]+]]=, $pop[[L3]], 1, $pop[[L6]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @shl_vec_v2i64(<2 x i64> %v, <2 x i64> %x) {
+  %a = shl <2 x i64> %v, %x
+  ret <2 x i64> %a
+}
+
 ; CHECK-LABEL: shr_s_v2i64:
 ; NO-SIMD128-NOT: i64x2
 ; SIMD128-NEXT: .param v128, i32{{$}}
@@ -653,6 +858,24 @@ define <2 x i64> @shr_s_const_v2i64(<2 x i64> %v) {
   ret <2 x i64> %a
 }
 
+; CHECK-LABEL: shr_s_vec_v2i64:
+; NO-SIMD128-NOT: i64x2
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i64x2.extract_lane $push[[L0:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: i64x2.extract_lane $push[[L1:[0-9]+]]=, $1, 0{{$}}
+; SIMD128-NEXT: i64.shr_s $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: i64x2.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
+; SIMD128-NEXT: i64x2.extract_lane $push[[L4:[0-9]+]]=, $0, 1{{$}}
+; SIMD128-NEXT: i64x2.extract_lane $push[[L5:[0-9]+]]=, $1, 1{{$}}
+; SIMD128-NEXT: i64.shr_s $push[[L6:[0-9]+]]=, $pop[[L4]], $pop[[L5]]{{$}}
+; SIMD128-NEXT: i64x2.replace_lane $push[[R:[0-9]+]]=, $pop[[L3]], 1, $pop[[L6]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @shr_s_vec_v2i64(<2 x i64> %v, <2 x i64> %x) {
+  %a = ashr <2 x i64> %v, %x
+  ret <2 x i64> %a
+}
+
 ; CHECK-LABEL: shr_u_v2i64:
 ; NO-SIMD128-NOT: i64x2
 ; SIMD128-NEXT: .param v128, i32{{$}}
@@ -693,6 +916,24 @@ define <2 x i64> @shr_u_const_v2i64(<2 x i64> %v) {
   ret <2 x i64> %a
 }
 
+; CHECK-LABEL: shr_u_vec_v2i64:
+; NO-SIMD128-NOT: i64x2
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i64x2.extract_lane $push[[L0:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: i64x2.extract_lane $push[[L1:[0-9]+]]=, $1, 0{{$}}
+; SIMD128-NEXT: i64.shr_u $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: i64x2.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
+; SIMD128-NEXT: i64x2.extract_lane $push[[L4:[0-9]+]]=, $0, 1{{$}}
+; SIMD128-NEXT: i64x2.extract_lane $push[[L5:[0-9]+]]=, $1, 1{{$}}
+; SIMD128-NEXT: i64.shr_u $push[[L6:[0-9]+]]=, $pop[[L4]], $pop[[L5]]{{$}}
+; SIMD128-NEXT: i64x2.replace_lane $push[[R:[0-9]+]]=, $pop[[L3]], 1, $pop[[L6]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @shr_u_vec_v2i64(<2 x i64> %v, <2 x i64> %x) {
+  %a = lshr <2 x i64> %v, %x
+  ret <2 x i64> %a
+}
+
 ; CHECK-LABEL: and_v2i64:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
-- 
GitLab


From 2495823e188ef054c61eea961caa63216d7d2650 Mon Sep 17 00:00:00 2001
From: Wouter van Oortmerssen <aardappel@gmail.com>
Date: Fri, 2 Nov 2018 00:45:00 +0000
Subject: [PATCH 0902/1116] [WebAssembly] Added a .globaltype directive to .s
 output.

Summary:
Assembly output can use globals like __stack_pointer implicitly,
but has no way of indicating the type of such a global, which makes
it hard for tools processing it (such as the MC Assembler) to
reconstruct this information.

The improved assembler directives parsing (in progress in
https://reviews.llvm.org/D53842) will make use of this information.

Also deleted code for the .import_global directive which was unused.

New test case in userstack.ll

Reviewers: dschuff, sbc100

Subscribers: jgravelle-google, aheejin, sunfish, llvm-commits

Differential Revision: https://reviews.llvm.org/D54012

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345917 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../MCTargetDesc/WebAssemblyTargetStreamer.cpp        | 11 +++++++----
 .../MCTargetDesc/WebAssemblyTargetStreamer.h          |  8 ++++----
 lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp      |  9 +++++++++
 test/CodeGen/WebAssembly/userstack.ll                 |  2 ++
 4 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
index 2158ee3be04..4c4ca4e599c 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
@@ -99,8 +99,11 @@ void WebAssemblyTargetAsmStreamer::emitIndirectFunctionType(
   OS << '\n';
 }
 
-void WebAssemblyTargetAsmStreamer::emitGlobalImport(StringRef name) {
-  OS << "\t.import_global\t" << name << '\n';
+void WebAssemblyTargetAsmStreamer::emitGlobalType(MCSymbolWasm *Sym) {
+  OS << "\t.globaltype\t" << Sym->getName() << ", " <<
+        WebAssembly::TypeToString(
+          static_cast<wasm::ValType>(Sym->getGlobalType().Type)) <<
+        '\n';
 }
 
 void WebAssemblyTargetAsmStreamer::emitImportModule(MCSymbolWasm *Sym,
@@ -152,8 +155,8 @@ void WebAssemblyTargetWasmStreamer::emitIndirectFunctionType(
   Symbol->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
 }
 
-void WebAssemblyTargetWasmStreamer::emitGlobalImport(StringRef name) {
-  llvm_unreachable(".global_import is not needed for direct wasm output");
+void WebAssemblyTargetWasmStreamer::emitGlobalType(MCSymbolWasm *Sym) {
+  // Not needed.
 }
 
 void WebAssemblyTargetWasmStreamer::emitImportModule(MCSymbolWasm *Sym,
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
index 43c422d593a..e60158b5def 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
@@ -43,8 +43,8 @@ public:
   virtual void emitIndirectFunctionType(MCSymbolWasm *Symbol) = 0;
   /// .indidx
   virtual void emitIndIdx(const MCExpr *Value) = 0;
-  /// .import_global
-  virtual void emitGlobalImport(StringRef name) = 0;
+  /// .globaltype
+  virtual void emitGlobalType(MCSymbolWasm *Sym) = 0;
   /// .import_module
   virtual void emitImportModule(MCSymbolWasm *Sym, StringRef ModuleName) = 0;
 
@@ -65,7 +65,7 @@ public:
   void emitEndFunc() override;
   void emitIndirectFunctionType(MCSymbolWasm *Symbol) override;
   void emitIndIdx(const MCExpr *Value) override;
-  void emitGlobalImport(StringRef name) override;
+  void emitGlobalType(MCSymbolWasm *Sym) override;
   void emitImportModule(MCSymbolWasm *Sym, StringRef ModuleName) override;
 };
 
@@ -80,7 +80,7 @@ public:
   void emitEndFunc() override;
   void emitIndirectFunctionType(MCSymbolWasm *Symbol) override;
   void emitIndIdx(const MCExpr *Value) override;
-  void emitGlobalImport(StringRef name) override;
+  void emitGlobalType(MCSymbolWasm *Sym) override;
   void emitImportModule(MCSymbolWasm *Sym, StringRef ModuleName) override;
 };
 
diff --git a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
index b8ac85943eb..1e21ab92b62 100644
--- a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -78,6 +78,14 @@ WebAssemblyTargetStreamer *WebAssemblyAsmPrinter::getTargetStreamer() {
 //===----------------------------------------------------------------------===//
 
 void WebAssemblyAsmPrinter::EmitEndOfAsmFile(Module &M) {
+  for (auto &It : OutContext.getSymbols()) {
+    // Emit a .globaltype declaration.
+    auto Sym = cast<MCSymbolWasm>(It.getValue());
+    if (Sym->getType() == wasm::WASM_SYMBOL_TYPE_GLOBAL) {
+      getTargetStreamer()->emitGlobalType(Sym);
+    }
+  }
+
   for (const auto &F : M) {
     // Emit function type info for all undefined functions
     if (F.isDeclarationForLinker() && !F.isIntrinsic()) {
@@ -105,6 +113,7 @@ void WebAssemblyAsmPrinter::EmitEndOfAsmFile(Module &M) {
       }
     }
   }
+
   for (const auto &G : M.globals()) {
     if (!G.hasInitializer() && G.hasExternalLinkage()) {
       if (G.getValueType()->isSized()) {
diff --git a/test/CodeGen/WebAssembly/userstack.ll b/test/CodeGen/WebAssembly/userstack.ll
index 4a6b48b977f..aa4acae5e07 100644
--- a/test/CodeGen/WebAssembly/userstack.ll
+++ b/test/CodeGen/WebAssembly/userstack.ll
@@ -330,4 +330,6 @@ define void @inline_asm() {
   ret void
 }
 
+; CHECK: .globaltype	__stack_pointer, i32{{$}}
+
 ; TODO: test over-aligned alloca
-- 
GitLab


From 94f7fc2ccd3100b131cff5d9f1471e7c60f334d6 Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze@braunis.de>
Date: Fri, 2 Nov 2018 01:31:50 +0000
Subject: [PATCH 0903/1116] LLVMTargetMachine/TargetPassConfig: Simplify
 handling of start/stop options; NFC

- Make some TargetPassConfig methods that just check whether options have
  been set static.
- Shuffle code in LLVMTargetMachine around so addPassesToGenerateCode
  only deals with TargetPassConfig now (but not with MCContext or the
  creation of MachineModuleInfo)

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345918 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/TargetPassConfig.h | 21 ++++-------
 lib/CodeGen/LLVMTargetMachine.cpp       | 46 +++++++++++--------------
 lib/CodeGen/TargetPassConfig.cpp        |  9 +++--
 3 files changed, 35 insertions(+), 41 deletions(-)

diff --git a/include/llvm/CodeGen/TargetPassConfig.h b/include/llvm/CodeGen/TargetPassConfig.h
index 8f5c9cb8c3f..7fda8751d40 100644
--- a/include/llvm/CodeGen/TargetPassConfig.h
+++ b/include/llvm/CodeGen/TargetPassConfig.h
@@ -145,13 +145,13 @@ public:
 
   CodeGenOpt::Level getOptLevel() const;
 
-  /// Describe the status of the codegen
-  /// pipeline set by this target pass config.
-  /// Having a limited codegen pipeline means that options
-  /// have been used to restrict what codegen is doing.
-  /// In particular, that means that codegen won't emit
-  /// assembly code.
-  bool hasLimitedCodeGenPipeline() const;
+  /// Returns true if one of the `-start-after`, `-start-before`, `-stop-after`
+  /// or `-stop-before` options is set.
+  static bool hasLimitedCodeGenPipeline();
+
+  /// Returns true if none of the `-stop-before` and `-stop-after` options is
+  /// set.
+  static bool willCompleteCodeGenPipeline();
 
   /// If hasLimitedCodeGenPipeline is true, this method
   /// returns a string with the name of the options, separated
@@ -159,13 +159,6 @@ public:
   std::string
   getLimitedCodeGenPipelineReason(const char *Separator = "/") const;
 
-  /// Check if the codegen pipeline is limited in such a way that it
-  /// won't be complete. When the codegen pipeline is not complete,
-  /// this means it may not be possible to generate assembly from it.
-  bool willCompleteCodeGenPipeline() const {
-    return !hasLimitedCodeGenPipeline() || (!StopAfter && !StopBefore);
-  }
-
   void setDisableVerify(bool Disable) { setOpt(DisableVerify, Disable); }
 
   bool getEnableTailMerge() const { return EnableTailMerge; }
diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp
index 90337903008..52e832cc38c 100644
--- a/lib/CodeGen/LLVMTargetMachine.cpp
+++ b/lib/CodeGen/LLVMTargetMachine.cpp
@@ -95,29 +95,22 @@ LLVMTargetMachine::getTargetTransformInfo(const Function &F) {
 }
 
 /// addPassesToX helper drives creation and initialization of TargetPassConfig.
-static MCContext *
-addPassesToGenerateCode(LLVMTargetMachine *TM, PassManagerBase &PM,
-                        bool DisableVerify, bool &WillCompleteCodeGenPipeline,
-                        raw_pwrite_stream &Out, MachineModuleInfo *MMI) {
+static TargetPassConfig *
+addPassesToGenerateCode(LLVMTargetMachine &TM, PassManagerBase &PM,
+                        bool DisableVerify, MachineModuleInfo &MMI) {
   // Targets may override createPassConfig to provide a target-specific
   // subclass.
-  TargetPassConfig *PassConfig = TM->createPassConfig(PM);
+  TargetPassConfig *PassConfig = TM.createPassConfig(PM);
   // Set PassConfig options provided by TargetMachine.
   PassConfig->setDisableVerify(DisableVerify);
-  WillCompleteCodeGenPipeline = PassConfig->willCompleteCodeGenPipeline();
   PM.add(PassConfig);
-  if (!MMI)
-    MMI = new MachineModuleInfo(TM);
-  PM.add(MMI);
+  PM.add(&MMI);
 
   if (PassConfig->addISelPasses())
     return nullptr;
   PassConfig->addMachinePasses();
   PassConfig->setInitialized();
-  if (!WillCompleteCodeGenPipeline)
-    PM.add(createPrintMIRPass(Out));
-
-  return &MMI->getContext();
+  return PassConfig;
 }
 
 bool LLVMTargetMachine::addAsmPrinter(PassManagerBase &PM,
@@ -201,14 +194,16 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
                                             bool DisableVerify,
                                             MachineModuleInfo *MMI) {
   // Add common CodeGen passes.
-  bool WillCompleteCodeGenPipeline = true;
-  MCContext *Context = addPassesToGenerateCode(
-      this, PM, DisableVerify, WillCompleteCodeGenPipeline, Out, MMI);
-  if (!Context)
+  if (!MMI)
+    MMI = new MachineModuleInfo(this);
+  TargetPassConfig *PassConfig =
+      addPassesToGenerateCode(*this, PM, DisableVerify, *MMI);
+  if (!PassConfig)
     return true;
 
-  if (WillCompleteCodeGenPipeline &&
-      addAsmPrinter(PM, Out, DwoOut, FileType, *Context))
+  if (!TargetPassConfig::willCompleteCodeGenPipeline()) {
+    PM.add(createPrintMIRPass(Out));
+  } else if (addAsmPrinter(PM, Out, DwoOut, FileType, MMI->getContext()))
     return true;
 
   PM.add(createFreeMachineFunctionPass());
@@ -224,14 +219,15 @@ bool LLVMTargetMachine::addPassesToEmitMC(PassManagerBase &PM, MCContext *&Ctx,
                                           raw_pwrite_stream &Out,
                                           bool DisableVerify) {
   // Add common CodeGen passes.
-  bool WillCompleteCodeGenPipeline = true;
-  Ctx = addPassesToGenerateCode(this, PM, DisableVerify,
-                                WillCompleteCodeGenPipeline, Out,
-                                /*MachineModuleInfo*/ nullptr);
-  if (!Ctx)
+  MachineModuleInfo *MMI = new MachineModuleInfo(this);
+  TargetPassConfig *PassConfig =
+      addPassesToGenerateCode(*this, PM, DisableVerify, *MMI);
+  if (!PassConfig)
     return true;
-  assert(WillCompleteCodeGenPipeline && "CodeGen pipeline has been altered");
+  assert(TargetPassConfig::willCompleteCodeGenPipeline() &&
+         "Cannot emit MC with limited codegen pipeline");
 
+  Ctx = &MMI->getContext();
   if (Options.MCOptions.MCSaveTempLabels)
     Ctx->setAllowTemporaryLabels(false);
 
diff --git a/lib/CodeGen/TargetPassConfig.cpp b/lib/CodeGen/TargetPassConfig.cpp
index 6a9c3c05f03..9adacd2ed71 100644
--- a/lib/CodeGen/TargetPassConfig.cpp
+++ b/lib/CodeGen/TargetPassConfig.cpp
@@ -419,8 +419,13 @@ TargetPassConfig::TargetPassConfig()
                      "triple set?");
 }
 
-bool TargetPassConfig::hasLimitedCodeGenPipeline() const {
-  return StartBefore || StartAfter || StopBefore || StopAfter;
+bool TargetPassConfig::willCompleteCodeGenPipeline() {
+  return StopBeforeOpt.empty() && StopAfterOpt.empty();
+}
+
+bool TargetPassConfig::hasLimitedCodeGenPipeline() {
+  return !StartBeforeOpt.empty() || !StartAfterOpt.empty() ||
+         !willCompleteCodeGenPipeline();
 }
 
 std::string
-- 
GitLab


From f4ddd6209c233ff644b1cf807842947bcc5e2c25 Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze@braunis.de>
Date: Fri, 2 Nov 2018 01:31:52 +0000
Subject: [PATCH 0904/1116] test/DebugInfo: Convert some tests to MIR

These tests are meant to test dwarf emission (or prolog/epilogue
generation) so we can convert them to .mir and only run the relevant
part of the pipeline.
This way they become independent of changes in earlier passes such as my
planned changes to RegAllocFast.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345919 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/DebugInfo/AArch64/asan-stack-vars.ll     | 324 ---------
 test/DebugInfo/AArch64/asan-stack-vars.mir    | 682 ++++++++++++++++++
 .../compiler-gen-bbs-livedebugvalues.ll       |  66 --
 .../compiler-gen-bbs-livedebugvalues.mir      | 110 +++
 test/DebugInfo/ARM/cfi-eof-prologue.ll        | 114 ---
 test/DebugInfo/ARM/cfi-eof-prologue.mir       | 212 ++++++
 test/DebugInfo/X86/debug-loc-asan.ll          | 190 -----
 test/DebugInfo/X86/debug-loc-asan.mir         | 346 +++++++++
 test/DebugInfo/X86/debug-loc-offset.ll        | 171 -----
 test/DebugInfo/X86/debug-loc-offset.mir       | 276 +++++++
 test/DebugInfo/X86/dw_op_minus.ll             |  80 --
 test/DebugInfo/X86/dw_op_minus.mir            | 119 +++
 test/DebugInfo/X86/pr19307.ll                 | 144 ----
 test/DebugInfo/X86/pr19307.mir                | 224 ++++++
 14 files changed, 1969 insertions(+), 1089 deletions(-)
 delete mode 100644 test/DebugInfo/AArch64/asan-stack-vars.ll
 create mode 100644 test/DebugInfo/AArch64/asan-stack-vars.mir
 delete mode 100644 test/DebugInfo/AArch64/compiler-gen-bbs-livedebugvalues.ll
 create mode 100644 test/DebugInfo/AArch64/compiler-gen-bbs-livedebugvalues.mir
 delete mode 100644 test/DebugInfo/ARM/cfi-eof-prologue.ll
 create mode 100644 test/DebugInfo/ARM/cfi-eof-prologue.mir
 delete mode 100644 test/DebugInfo/X86/debug-loc-asan.ll
 create mode 100644 test/DebugInfo/X86/debug-loc-asan.mir
 delete mode 100644 test/DebugInfo/X86/debug-loc-offset.ll
 create mode 100644 test/DebugInfo/X86/debug-loc-offset.mir
 delete mode 100644 test/DebugInfo/X86/dw_op_minus.ll
 create mode 100644 test/DebugInfo/X86/dw_op_minus.mir
 delete mode 100644 test/DebugInfo/X86/pr19307.ll
 create mode 100644 test/DebugInfo/X86/pr19307.mir

diff --git a/test/DebugInfo/AArch64/asan-stack-vars.ll b/test/DebugInfo/AArch64/asan-stack-vars.ll
deleted file mode 100644
index 5db46065476..00000000000
--- a/test/DebugInfo/AArch64/asan-stack-vars.ll
+++ /dev/null
@@ -1,324 +0,0 @@
-; RUN: llc -O0 -fast-isel -filetype=obj -o - %s | llvm-dwarfdump -v - | FileCheck %s
-;
-; Derived from (clang -O0 -g -fsanitize=address -fobjc-arc)
-;   @protocol NSObject
-;   @end
-;   @interface NSObject<NSObject>{}
-;   + (instancetype)alloc;
-;   @end
-;   struct CGSize {
-;     double width;
-;     double height;
-;   };
-;   typedef struct CGSize CGSize;
-;   @interface Object : NSObject
-;   - (instancetype)initWithSize:(CGSize)size;
-;   - (id)aMessage;
-;   @end
-;   @implementation MyObject
-;   + (id)doWithSize:(CGSize)imageSize andObject:(id)object {
-;     return [object aMessage];
-;   }
-;   @end
-;
-; CHECK: .debug_info contents:
-; CHECK: DW_TAG_subprogram
-; CHECK-NEXT:   DW_AT_low_pc [DW_FORM_addr]     (0x0000000000000000)
-; CHECK-NEXT:   DW_AT_high_pc [DW_FORM_addr]    ([[FN_END:.*]])
-; CHECK: "_cmd"
-; CHECK: DW_TAG_formal_parameter
-; CHECK-NEXT: DW_AT_location
-; CHECK-NEXT:   [0x{{0*}}, 0x{{.*}}):
-; CHECK-NOT:    DW_AT_
-; CHECK:        [0x{{.*}}, [[FN_END]]):
-; CHECK-NEXT: DW_AT_name {{.*}}"imageSize"
-
-; ModuleID = 'm.m'
-source_filename = "m.m"
-target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-target triple = "arm64-apple-ios"
-
-%0 = type opaque
-%struct._class_t = type { %struct._class_t*, %struct._class_t*, %struct._objc_cache*, i8* (i8*, i8*)**, %struct._class_ro_t* }
-%struct._objc_cache = type opaque
-%struct._class_ro_t = type { i32, i32, i32, i8*, i8*, %struct.__method_list_t*, %struct._objc_protocol_list*, %struct._ivar_list_t*, i8*, %struct._prop_list_t* }
-%struct.__method_list_t = type { i32, i32, [0 x %struct._objc_method] }
-%struct._objc_method = type { i8*, i8*, i8* }
-%struct._objc_protocol_list = type { i64, [0 x %struct._protocol_t*] }
-%struct._protocol_t = type { i8*, i8*, %struct._objc_protocol_list*, %struct.__method_list_t*, %struct.__method_list_t*, %struct.__method_list_t*, %struct.__method_list_t*, %struct._prop_list_t*, i32, i32, i8**, i8*, %struct._prop_list_t* }
-%struct._ivar_list_t = type { i32, i32, [0 x %struct._ivar_t] }
-%struct._ivar_t = type { i32*, i8*, i8*, i32, i32 }
-%struct._prop_list_t = type { i32, i32, [0 x %struct._prop_t] }
-%struct._prop_t = type { i8*, i8* }
-%struct.CGSize = type { double, double }
-
-@"OBJC_CLASS_$_Object" = external global %struct._class_t
-@"OBJC_CLASSLIST_REFERENCES_$_" = private global %struct._class_t* @"OBJC_CLASS_$_Object", section "__DATA, __objc_classrefs, regular, no_dead_strip", align 8
-@OBJC_METH_VAR_NAME_ = private unnamed_addr constant [6 x i8] c"alloc\00", section "__TEXT,__objc_methname,cstring_literals", align 1
-@OBJC_SELECTOR_REFERENCES_ = private externally_initialized global i8* getelementptr inbounds ([6 x i8], [6 x i8]* @OBJC_METH_VAR_NAME_, i32 0, i32 0), section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip", align 8
-@OBJC_METH_VAR_NAME_.1 = private unnamed_addr constant [14 x i8] c"initWithSize:\00", section "__TEXT,__objc_methname,cstring_literals", align 1
-@OBJC_SELECTOR_REFERENCES_.2 = private externally_initialized global i8* getelementptr inbounds ([14 x i8], [14 x i8]* @OBJC_METH_VAR_NAME_.1, i32 0, i32 0), section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip", align 8
-@OBJC_METH_VAR_NAME_.3 = private unnamed_addr constant [9 x i8] c"aMessage\00", section "__TEXT,__objc_methname,cstring_literals", align 1
-@OBJC_SELECTOR_REFERENCES_.4 = private externally_initialized global i8* getelementptr inbounds ([9 x i8], [9 x i8]* @OBJC_METH_VAR_NAME_.3, i32 0, i32 0), section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip", align 8
-@_objc_empty_cache = external global %struct._objc_cache
-@"OBJC_CLASS_$_MyObject" = global %struct._class_t { %struct._class_t* @"OBJC_METACLASS_$_MyObject", %struct._class_t* null, %struct._objc_cache* @_objc_empty_cache, i8* (i8*, i8*)** null, %struct._class_ro_t* @"\01l_OBJC_CLASS_RO_$_MyObject" }, section "__DATA, __objc_data", align 8
-@"OBJC_METACLASS_$_MyObject" = global %struct._class_t { %struct._class_t* @"OBJC_METACLASS_$_MyObject", %struct._class_t* @"OBJC_CLASS_$_MyObject", %struct._objc_cache* @_objc_empty_cache, i8* (i8*, i8*)** null, %struct._class_ro_t* @"\01l_OBJC_METACLASS_RO_$_MyObject" }, section "__DATA, __objc_data", align 8
-@OBJC_CLASS_NAME_ = private unnamed_addr constant [9 x i8] c"MyObject\00", section "__TEXT,__objc_classname,cstring_literals", align 1
-@OBJC_METH_VAR_NAME_.5 = private unnamed_addr constant [12 x i8] c"doWithSize:\00", section "__TEXT,__objc_methname,cstring_literals", align 1
-@OBJC_METH_VAR_TYPE_ = private unnamed_addr constant [21 x i8] c"@32@0:8{CGSize=dd}16\00", section "__TEXT,__objc_methtype,cstring_literals", align 1
-@"\01l_OBJC_$_CLASS_METHODS_MyObject" = private global { i32, i32, [1 x %struct._objc_method] } { i32 24, i32 1, [1 x %struct._objc_method] [%struct._objc_method { i8* getelementptr inbounds ([12 x i8], [12 x i8]* @OBJC_METH_VAR_NAME_.5, i32 0, i32 0), i8* getelementptr inbounds ([21 x i8], [21 x i8]* @OBJC_METH_VAR_TYPE_, i32 0, i32 0), i8* bitcast (i8* (i8*, i8*, [2 x double])* @"\01+[MyObject doWithSize:]" to i8*) }] }, section "__DATA, __objc_const", align 8
-@"\01l_OBJC_METACLASS_RO_$_MyObject" = private global %struct._class_ro_t { i32 131, i32 40, i32 40, i8* null, i8* getelementptr inbounds ([9 x i8], [9 x i8]* @OBJC_CLASS_NAME_, i32 0, i32 0), %struct.__method_list_t* bitcast ({ i32, i32, [1 x %struct._objc_method] }* @"\01l_OBJC_$_CLASS_METHODS_MyObject" to %struct.__method_list_t*), %struct._objc_protocol_list* null, %struct._ivar_list_t* null, i8* null, %struct._prop_list_t* null }, section "__DATA, __objc_const", align 8
-@"\01l_OBJC_CLASS_RO_$_MyObject" = private global %struct._class_ro_t { i32 130, i32 0, i32 0, i8* null, i8* getelementptr inbounds ([9 x i8], [9 x i8]* @OBJC_CLASS_NAME_, i32 0, i32 0), %struct.__method_list_t* null, %struct._objc_protocol_list* null, %struct._ivar_list_t* null, i8* null, %struct._prop_list_t* null }, section "__DATA, __objc_const", align 8
-@"OBJC_LABEL_CLASS_$" = private global [1 x i8*] [i8* bitcast (%struct._class_t* @"OBJC_CLASS_$_MyObject" to i8*)], section "__DATA, __objc_classlist, regular, no_dead_strip", align 8
-@llvm.compiler.used = appending global [12 x i8*] [i8* bitcast (%struct._class_t** @"OBJC_CLASSLIST_REFERENCES_$_" to i8*), i8* getelementptr inbounds ([6 x i8], [6 x i8]* @OBJC_METH_VAR_NAME_, i32 0, i32 0), i8* bitcast (i8** @OBJC_SELECTOR_REFERENCES_ to i8*), i8* getelementptr inbounds ([14 x i8], [14 x i8]* @OBJC_METH_VAR_NAME_.1, i32 0, i32 0), i8* bitcast (i8** @OBJC_SELECTOR_REFERENCES_.2 to i8*), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @OBJC_METH_VAR_NAME_.3, i32 0, i32 0), i8* bitcast (i8** @OBJC_SELECTOR_REFERENCES_.4 to i8*), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @OBJC_CLASS_NAME_, i32 0, i32 0), i8* getelementptr inbounds ([12 x i8], [12 x i8]* @OBJC_METH_VAR_NAME_.5, i32 0, i32 0), i8* getelementptr inbounds ([21 x i8], [21 x i8]* @OBJC_METH_VAR_TYPE_, i32 0, i32 0), i8* bitcast ({ i32, i32, [1 x %struct._objc_method] }* @"\01l_OBJC_$_CLASS_METHODS_MyObject" to i8*), i8* bitcast ([1 x i8*]* @"OBJC_LABEL_CLASS_$" to i8*)], section "llvm.metadata"
-@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 1, void ()* @asan.module_ctor, i8* null }]
-@__asan_shadow_memory_dynamic_address = external global i64
-@___asan_gen_ = private unnamed_addr constant [34 x i8] c"2 32 16 9 imageSize 64 8 6 object\00", align 1
-
-; Function Attrs: noinline sanitize_address ssp uwtable
-define internal i8* @"\01+[MyObject doWithSize:]"(i8* %self, i8* %_cmd, [2 x double] %imageSize.coerce) #0 !dbg !14 {
-entry:
-  %0 = load i64, i64* @__asan_shadow_memory_dynamic_address
-  %self.addr = alloca i8*, align 8
-  %_cmd.addr = alloca i8*, align 8
-  %MyAlloca = alloca [96 x i8], align 32, !dbg !35
-  %1 = ptrtoint [96 x i8]* %MyAlloca to i64, !dbg !35
-  %2 = add i64 %1, 32, !dbg !35
-  %3 = inttoptr i64 %2 to %struct.CGSize*, !dbg !35
-  %4 = add i64 %1, 64, !dbg !35
-  %5 = inttoptr i64 %4 to %0**, !dbg !35
-  %6 = inttoptr i64 %1 to i64*, !dbg !35
-  store i64 1102416563, i64* %6, !dbg !35
-  %7 = add i64 %1, 8, !dbg !35
-  %8 = inttoptr i64 %7 to i64*, !dbg !35
-  store i64 ptrtoint ([34 x i8]* @___asan_gen_ to i64), i64* %8, !dbg !35
-  %9 = add i64 %1, 16, !dbg !35
-  %10 = inttoptr i64 %9 to i64*, !dbg !35
-  store i64 ptrtoint (i8* (i8*, i8*, [2 x double])* @"\01+[MyObject doWithSize:]" to i64), i64* %10, !dbg !35
-  %11 = lshr i64 %1, 3, !dbg !35
-  %12 = add i64 %11, %0, !dbg !35
-  %13 = add i64 %12, 0, !dbg !35
-  %14 = inttoptr i64 %13 to i64*, !dbg !35
-  store i64 -940689368107847183, i64* %14, align 1, !dbg !35
-  %15 = add i64 %12, 9, !dbg !35
-  %16 = inttoptr i64 %15 to i16*, !dbg !35
-  store i16 -3085, i16* %16, align 1, !dbg !35
-  %17 = add i64 %12, 11, !dbg !35
-  %18 = inttoptr i64 %17 to i8*, !dbg !35
-  store i8 -13, i8* %18, align 1, !dbg !35
-  call void @llvm.dbg.declare(metadata %struct.CGSize* %3, metadata !36, metadata !37), !dbg !38
-  call void @llvm.dbg.declare(metadata %0** %5, metadata !39, metadata !37), !dbg !45
-  %19 = bitcast %struct.CGSize* %3 to [2 x double]*
-  %20 = ptrtoint [2 x double]* %19 to i64
-  %21 = lshr i64 %20, 3
-  %22 = add i64 %21, %0
-  %23 = inttoptr i64 %22 to i16*
-  %24 = load i16, i16* %23
-  %25 = icmp ne i16 %24, 0
-  br i1 %25, label %26, label %27
-
-; <label>:26:                                     ; preds = %entry
-  call void @__asan_report_store16(i64 %20)
-  call void asm sideeffect "", ""()
-  unreachable
-
-; <label>:27:                                     ; preds = %entry
-  store [2 x double] %imageSize.coerce, [2 x double]* %19, align 8
-  store i8* %self, i8** %self.addr, align 8
-  call void @llvm.dbg.declare(metadata i8** %self.addr, metadata !46, metadata !48), !dbg !49
-  store i8* %_cmd, i8** %_cmd.addr, align 8
-  call void @llvm.dbg.declare(metadata i8** %_cmd.addr, metadata !50, metadata !48), !dbg !49
-  %28 = load %struct._class_t*, %struct._class_t** @"OBJC_CLASSLIST_REFERENCES_$_", align 8, !dbg !52
-  %29 = add i64 lshr (i64 ptrtoint (i8** @OBJC_SELECTOR_REFERENCES_ to i64), i64 3), %0, !dbg !52
-  %30 = inttoptr i64 %29 to i8*, !dbg !52
-  %31 = load i8, i8* %30, !dbg !52
-  %32 = icmp ne i8 %31, 0, !dbg !52
-  br i1 %32, label %33, label %34, !dbg !52
-
-; <label>:33:                                     ; preds = %27
-  call void @__asan_report_load8(i64 ptrtoint (i8** @OBJC_SELECTOR_REFERENCES_ to i64)), !dbg !52
-  call void asm sideeffect "", ""(), !dbg !52
-  unreachable, !dbg !52
-
-; <label>:34:                                     ; preds = %27
-  %35 = load i8*, i8** @OBJC_SELECTOR_REFERENCES_, align 8, !dbg !52, !invariant.load !2
-  %36 = bitcast %struct._class_t* %28 to i8*, !dbg !52
-  %call = call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* %36, i8* %35), !dbg !52
-  %37 = bitcast i8* %call to %0*, !dbg !52
-  %38 = add i64 lshr (i64 ptrtoint (i8** @OBJC_SELECTOR_REFERENCES_.2 to i64), i64 3), %0, !dbg !53
-  %39 = inttoptr i64 %38 to i8*, !dbg !53
-  %40 = load i8, i8* %39, !dbg !53
-  %41 = icmp ne i8 %40, 0, !dbg !53
-  br i1 %41, label %42, label %43, !dbg !53
-
-; <label>:42:                                     ; preds = %34
-  call void @__asan_report_load8(i64 ptrtoint (i8** @OBJC_SELECTOR_REFERENCES_.2 to i64)), !dbg !53
-  call void asm sideeffect "", ""(), !dbg !53
-  unreachable, !dbg !53
-
-; <label>:43:                                     ; preds = %34
-  %44 = load i8*, i8** @OBJC_SELECTOR_REFERENCES_.2, align 8, !dbg !53, !invariant.load !2
-  %45 = bitcast %0* %37 to i8*, !dbg !53
-  %46 = bitcast %struct.CGSize* %3 to [2 x double]*, !dbg !53
-  %47 = ptrtoint [2 x double]* %46 to i64, !dbg !53
-  %48 = lshr i64 %47, 3, !dbg !53
-  %49 = add i64 %48, %0, !dbg !53
-  %50 = inttoptr i64 %49 to i16*, !dbg !53
-  %51 = load i16, i16* %50, !dbg !53
-  %52 = icmp ne i16 %51, 0, !dbg !53
-  br i1 %52, label %53, label %54, !dbg !53
-
-; <label>:53:                                     ; preds = %43
-  call void @__asan_report_load16(i64 %47), !dbg !53
-  call void asm sideeffect "", ""(), !dbg !53
-  unreachable, !dbg !53
-
-; <label>:54:                                     ; preds = %43
-  %55 = load [2 x double], [2 x double]* %46, align 8, !dbg !53
-  %call1 = call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, [2 x double])*)(i8* %45, i8* %44, [2 x double] %55), !dbg !53
-  %56 = bitcast i8* %call1 to %0*, !dbg !53
-  %57 = ptrtoint %0** %5 to i64, !dbg !45
-  %58 = lshr i64 %57, 3, !dbg !45
-  %59 = add i64 %58, %0, !dbg !45
-  %60 = inttoptr i64 %59 to i8*, !dbg !45
-  %61 = load i8, i8* %60, !dbg !45
-  %62 = icmp ne i8 %61, 0, !dbg !45
-  br i1 %62, label %63, label %64, !dbg !45
-
-; <label>:63:                                     ; preds = %54
-  call void @__asan_report_store8(i64 %57), !dbg !45
-  call void asm sideeffect "", ""(), !dbg !45
-  unreachable, !dbg !45
-
-; <label>:64:                                     ; preds = %54
-  store %0* %56, %0** %5, align 8, !dbg !45
-  %65 = load %0*, %0** %5, align 8, !dbg !54
-  %66 = add i64 lshr (i64 ptrtoint (i8** @OBJC_SELECTOR_REFERENCES_.4 to i64), i64 3), %0, !dbg !55
-  %67 = inttoptr i64 %66 to i8*, !dbg !55
-  %68 = load i8, i8* %67, !dbg !55
-  %69 = icmp ne i8 %68, 0, !dbg !55
-  br i1 %69, label %70, label %71, !dbg !55
-
-; <label>:70:                                     ; preds = %64
-  call void @__asan_report_load8(i64 ptrtoint (i8** @OBJC_SELECTOR_REFERENCES_.4 to i64)), !dbg !55
-  call void asm sideeffect "", ""(), !dbg !55
-  unreachable, !dbg !55
-
-; <label>:71:                                     ; preds = %64
-  %72 = load i8*, i8** @OBJC_SELECTOR_REFERENCES_.4, align 8, !dbg !55, !invariant.load !2
-  %73 = bitcast %0* %65 to i8*, !dbg !55
-  %call2 = call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* %73, i8* %72), !dbg !55
-  call void asm sideeffect "mov\09fp, fp\09\09; marker for objc_retainAutoreleaseReturnValue", ""(), !dbg !55
-  %74 = call i8* @objc_retainAutoreleasedReturnValue(i8* %call2) #3, !dbg !55
-  %75 = bitcast %0** %5 to i8**, !dbg !56
-  call void @objc_storeStrong(i8** %75, i8* null) #3, !dbg !56
-  %76 = tail call i8* @objc_autoreleaseReturnValue(i8* %74) #3, !dbg !56
-  store i64 1172321806, i64* %6, !dbg !56
-  %77 = add i64 %12, 0, !dbg !56
-  %78 = inttoptr i64 %77 to i64*, !dbg !56
-  store i64 0, i64* %78, align 1, !dbg !56
-  %79 = add i64 %12, 9, !dbg !56
-  %80 = inttoptr i64 %79 to i16*, !dbg !56
-  store i16 0, i16* %80, align 1, !dbg !56
-  %81 = add i64 %12, 11, !dbg !56
-  %82 = inttoptr i64 %81 to i8*, !dbg !56
-  store i8 0, i8* %82, align 1, !dbg !56
-  ret i8* %76, !dbg !56
-}
-
-; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
-
-; Function Attrs: nonlazybind
-declare i8* @objc_msgSend(i8*, i8*, ...) #2
-
-declare i8* @objc_retainAutoreleasedReturnValue(i8* returned)
-
-declare void @objc_storeStrong(i8**, i8*)
-
-declare i8* @objc_autoreleaseReturnValue(i8* returned)
-
-define internal void @asan.module_ctor() {
-  call void @__asan_init()
-  call void @__asan_version_mismatch_check_v8()
-  ret void
-}
-
-declare void @__asan_init()
-
-declare void @__asan_version_mismatch_check_v8()
-
-declare void @__asan_report_load8(i64)
-
-declare void @__asan_report_load16(i64)
-
-declare void @__asan_report_store8(i64)
-
-declare void @__asan_report_store16(i64)
-
-attributes #0 = { noinline sanitize_address ssp uwtable }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nonlazybind }
-attributes #3 = { nounwind }
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!5, !6, !7, !8, !9, !10, !11, !12}
-!llvm.ident = !{!13}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_ObjC, file: !1, producer: "clang version 5.0.0 (trunk 295779) (llvm/trunk 295777)", isOptimized: false, runtimeVersion: 2, emissionKind: FullDebug, enums: !2, retainedTypes: !3)
-!1 = !DIFile(filename: "m.m", directory: "/")
-!2 = !{}
-!3 = !{!4}
-!4 = !DICompositeType(tag: DW_TAG_structure_type, name: "MyObject", scope: !1, file: !1, line: 15, flags: DIFlagObjcClassComplete, elements: !2, runtimeLang: DW_LANG_ObjC)
-!5 = !{i32 1, !"Objective-C Version", i32 2}
-!6 = !{i32 1, !"Objective-C Image Info Version", i32 0}
-!7 = !{i32 1, !"Objective-C Image Info Section", !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
-!8 = !{i32 4, !"Objective-C Garbage Collection", i32 0}
-!9 = !{i32 1, !"Objective-C Class Properties", i32 64}
-!10 = !{i32 2, !"Dwarf Version", i32 2}
-!11 = !{i32 2, !"Debug Info Version", i32 3}
-!12 = !{i32 1, !"PIC Level", i32 2}
-!13 = !{!"clang version 5.0.0 (trunk 295779) (llvm/trunk 295777)"}
-!14 = distinct !DISubprogram(name: "+[MyObject doWithSize:]", scope: !1, file: !1, line: 16, type: !15, isLocal: true, isDefinition: true, scopeLine: 16, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
-!15 = !DISubroutineType(types: !16)
-!16 = !{!17, !24, !26, !29}
-!17 = !DIDerivedType(tag: DW_TAG_typedef, name: "id", file: !1, baseType: !18)
-!18 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !19, size: 64)
-!19 = !DICompositeType(tag: DW_TAG_structure_type, name: "objc_object", file: !1, elements: !20)
-!20 = !{!21}
-!21 = !DIDerivedType(tag: DW_TAG_member, name: "isa", scope: !19, file: !1, baseType: !22, size: 64)
-!22 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !23, size: 64)
-!23 = !DICompositeType(tag: DW_TAG_structure_type, name: "objc_class", file: !1, flags: DIFlagFwdDecl)
-!24 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !25, flags: DIFlagArtificial | DIFlagObjectPointer)
-!25 = !DIDerivedType(tag: DW_TAG_typedef, name: "Class", file: !1, baseType: !22)
-!26 = !DIDerivedType(tag: DW_TAG_typedef, name: "SEL", file: !1, baseType: !27, flags: DIFlagArtificial)
-!27 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !28, size: 64)
-!28 = !DICompositeType(tag: DW_TAG_structure_type, name: "objc_selector", file: !1, flags: DIFlagFwdDecl)
-!29 = !DIDerivedType(tag: DW_TAG_typedef, name: "CGSize", file: !1, line: 10, baseType: !30)
-!30 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "CGSize", file: !1, line: 6, size: 128, elements: !31)
-!31 = !{!32, !34}
-!32 = !DIDerivedType(tag: DW_TAG_member, name: "width", scope: !30, file: !1, line: 7, baseType: !33, size: 64)
-!33 = !DIBasicType(name: "double", size: 64, encoding: DW_ATE_float)
-!34 = !DIDerivedType(tag: DW_TAG_member, name: "height", scope: !30, file: !1, line: 8, baseType: !33, size: 64, offset: 64)
-!35 = !DILocation(line: 16, scope: !14)
-!36 = !DILocalVariable(name: "imageSize", arg: 3, scope: !14, file: !1, line: 16, type: !29)
-!37 = !DIExpression(DW_OP_deref)
-!38 = !DILocation(line: 16, column: 26, scope: !14)
-!39 = !DILocalVariable(name: "object", scope: !14, file: !1, line: 17, type: !40)
-!40 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !41, size: 64)
-!41 = !DICompositeType(tag: DW_TAG_structure_type, name: "Object", scope: !1, file: !1, line: 11, elements: !42, runtimeLang: DW_LANG_ObjC)
-!42 = !{!43}
-!43 = !DIDerivedType(tag: DW_TAG_inheritance, scope: !41, baseType: !44)
-!44 = !DICompositeType(tag: DW_TAG_structure_type, name: "NSObject", scope: !1, file: !1, line: 3, elements: !2, runtimeLang: DW_LANG_ObjC)
-!45 = !DILocation(line: 17, column: 11, scope: !14)
-!46 = !DILocalVariable(name: "self", arg: 1, scope: !14, type: !47, flags: DIFlagArtificial | DIFlagObjectPointer)
-!47 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !25)
-!48 = !DIExpression()
-!49 = !DILocation(line: 0, scope: !14)
-!50 = !DILocalVariable(name: "_cmd", arg: 2, scope: !14, type: !51, flags: DIFlagArtificial)
-!51 = !DIDerivedType(tag: DW_TAG_typedef, name: "SEL", file: !1, baseType: !27)
-!52 = !DILocation(line: 17, column: 21, scope: !14)
-!53 = !DILocation(line: 17, column: 20, scope: !14)
-!54 = !DILocation(line: 18, column: 11, scope: !14)
-!55 = !DILocation(line: 18, column: 10, scope: !14)
-!56 = !DILocation(line: 19, column: 1, scope: !14)
diff --git a/test/DebugInfo/AArch64/asan-stack-vars.mir b/test/DebugInfo/AArch64/asan-stack-vars.mir
new file mode 100644
index 00000000000..428cef62722
--- /dev/null
+++ b/test/DebugInfo/AArch64/asan-stack-vars.mir
@@ -0,0 +1,682 @@
+# RUN: llc -O0 -start-before=livedebugvalues -filetype=obj -o - %s | llvm-dwarfdump -v - | FileCheck %s
+#
+# Derived from (clang -O0 -g -fsanitize=address -fobjc-arc)
+#   @protocol NSObject
+#   @end
+#   @interface NSObject<NSObject>{}
+#   + (instancetype)alloc;
+#   @end
+#   struct CGSize {
+#     double width;
+#     double height;
+#   };
+#   typedef struct CGSize CGSize;
+#   @interface Object : NSObject
+#   - (instancetype)initWithSize:(CGSize)size;
+#   - (id)aMessage;
+#   @end
+#   @implementation MyObject
+#   + (id)doWithSize:(CGSize)imageSize andObject:(id)object {
+#     return [object aMessage];
+#   }
+#   @end
+#
+# CHECK: .debug_info contents:
+# CHECK: DW_TAG_subprogram
+# CHECK-NEXT:   DW_AT_low_pc [DW_FORM_addr]     (0x0000000000000000)
+# CHECK-NEXT:   DW_AT_high_pc [DW_FORM_addr]    ([[FN_END:.*]])
+# CHECK: "_cmd"
+# CHECK: DW_TAG_formal_parameter
+# CHECK-NEXT: DW_AT_location
+# CHECK-NEXT:   [0x{{0*}}, 0x{{.*}}):
+# CHECK-NOT:    DW_AT_
+# CHECK:        [0x{{.*}}, [[FN_END]]):
+# CHECK-NEXT: DW_AT_name {{.*}}"imageSize"
+--- |
+  ; ModuleID = 'test/DebugInfo/AArch64/asan-stack-vars.ll'
+  source_filename = "m.m"
+  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+  target triple = "arm64-apple-ios"
+  
+  %0 = type opaque
+  %struct._class_t = type { %struct._class_t*, %struct._class_t*, %struct._objc_cache*, i8* (i8*, i8*)**, %struct._class_ro_t* }
+  %struct._objc_cache = type opaque
+  %struct._class_ro_t = type { i32, i32, i32, i8*, i8*, %struct.__method_list_t*, %struct._objc_protocol_list*, %struct._ivar_list_t*, i8*, %struct._prop_list_t* }
+  %struct.__method_list_t = type { i32, i32, [0 x %struct._objc_method] }
+  %struct._objc_method = type { i8*, i8*, i8* }
+  %struct._objc_protocol_list = type { i64, [0 x %struct._protocol_t*] }
+  %struct._protocol_t = type { i8*, i8*, %struct._objc_protocol_list*, %struct.__method_list_t*, %struct.__method_list_t*, %struct.__method_list_t*, %struct.__method_list_t*, %struct._prop_list_t*, i32, i32, i8**, i8*, %struct._prop_list_t* }
+  %struct._ivar_list_t = type { i32, i32, [0 x %struct._ivar_t] }
+  %struct._ivar_t = type { i32*, i8*, i8*, i32, i32 }
+  %struct._prop_list_t = type { i32, i32, [0 x %struct._prop_t] }
+  %struct._prop_t = type { i8*, i8* }
+  %struct.CGSize = type { double, double }
+  
+  @"OBJC_CLASS_$_Object" = external global %struct._class_t
+  @"OBJC_CLASSLIST_REFERENCES_$_" = private global %struct._class_t* @"OBJC_CLASS_$_Object", section "__DATA, __objc_classrefs, regular, no_dead_strip", align 8
+  @OBJC_METH_VAR_NAME_ = private unnamed_addr constant [6 x i8] c"alloc\00", section "__TEXT,__objc_methname,cstring_literals", align 1
+  @OBJC_SELECTOR_REFERENCES_ = private externally_initialized global i8* getelementptr inbounds ([6 x i8], [6 x i8]* @OBJC_METH_VAR_NAME_, i32 0, i32 0), section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip", align 8
+  @OBJC_METH_VAR_NAME_.1 = private unnamed_addr constant [14 x i8] c"initWithSize:\00", section "__TEXT,__objc_methname,cstring_literals", align 1
+  @OBJC_SELECTOR_REFERENCES_.2 = private externally_initialized global i8* getelementptr inbounds ([14 x i8], [14 x i8]* @OBJC_METH_VAR_NAME_.1, i32 0, i32 0), section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip", align 8
+  @OBJC_METH_VAR_NAME_.3 = private unnamed_addr constant [9 x i8] c"aMessage\00", section "__TEXT,__objc_methname,cstring_literals", align 1
+  @OBJC_SELECTOR_REFERENCES_.4 = private externally_initialized global i8* getelementptr inbounds ([9 x i8], [9 x i8]* @OBJC_METH_VAR_NAME_.3, i32 0, i32 0), section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip", align 8
+  @_objc_empty_cache = external global %struct._objc_cache
+  @"OBJC_CLASS_$_MyObject" = global %struct._class_t { %struct._class_t* @"OBJC_METACLASS_$_MyObject", %struct._class_t* null, %struct._objc_cache* @_objc_empty_cache, i8* (i8*, i8*)** null, %struct._class_ro_t* @"\01l_OBJC_CLASS_RO_$_MyObject" }, section "__DATA, __objc_data", align 8
+  @"OBJC_METACLASS_$_MyObject" = global %struct._class_t { %struct._class_t* @"OBJC_METACLASS_$_MyObject", %struct._class_t* @"OBJC_CLASS_$_MyObject", %struct._objc_cache* @_objc_empty_cache, i8* (i8*, i8*)** null, %struct._class_ro_t* @"\01l_OBJC_METACLASS_RO_$_MyObject" }, section "__DATA, __objc_data", align 8
+  @OBJC_CLASS_NAME_ = private unnamed_addr constant [9 x i8] c"MyObject\00", section "__TEXT,__objc_classname,cstring_literals", align 1
+  @OBJC_METH_VAR_NAME_.5 = private unnamed_addr constant [12 x i8] c"doWithSize:\00", section "__TEXT,__objc_methname,cstring_literals", align 1
+  @OBJC_METH_VAR_TYPE_ = private unnamed_addr constant [21 x i8] c"@32@0:8{CGSize=dd}16\00", section "__TEXT,__objc_methtype,cstring_literals", align 1
+  @"\01l_OBJC_$_CLASS_METHODS_MyObject" = private global { i32, i32, [1 x %struct._objc_method] } { i32 24, i32 1, [1 x %struct._objc_method] [%struct._objc_method { i8* getelementptr inbounds ([12 x i8], [12 x i8]* @OBJC_METH_VAR_NAME_.5, i32 0, i32 0), i8* getelementptr inbounds ([21 x i8], [21 x i8]* @OBJC_METH_VAR_TYPE_, i32 0, i32 0), i8* bitcast (i8* (i8*, i8*, [2 x double])* @"\01+[MyObject doWithSize:]" to i8*) }] }, section "__DATA, __objc_const", align 8
+  @"\01l_OBJC_METACLASS_RO_$_MyObject" = private global %struct._class_ro_t { i32 131, i32 40, i32 40, i8* null, i8* getelementptr inbounds ([9 x i8], [9 x i8]* @OBJC_CLASS_NAME_, i32 0, i32 0), %struct.__method_list_t* bitcast ({ i32, i32, [1 x %struct._objc_method] }* @"\01l_OBJC_$_CLASS_METHODS_MyObject" to %struct.__method_list_t*), %struct._objc_protocol_list* null, %struct._ivar_list_t* null, i8* null, %struct._prop_list_t* null }, section "__DATA, __objc_const", align 8
+  @"\01l_OBJC_CLASS_RO_$_MyObject" = private global %struct._class_ro_t { i32 130, i32 0, i32 0, i8* null, i8* getelementptr inbounds ([9 x i8], [9 x i8]* @OBJC_CLASS_NAME_, i32 0, i32 0), %struct.__method_list_t* null, %struct._objc_protocol_list* null, %struct._ivar_list_t* null, i8* null, %struct._prop_list_t* null }, section "__DATA, __objc_const", align 8
+  @"OBJC_LABEL_CLASS_$" = private global [1 x i8*] [i8* bitcast (%struct._class_t* @"OBJC_CLASS_$_MyObject" to i8*)], section "__DATA, __objc_classlist, regular, no_dead_strip", align 8
+  @llvm.compiler.used = appending global [12 x i8*] [i8* bitcast (%struct._class_t** @"OBJC_CLASSLIST_REFERENCES_$_" to i8*), i8* getelementptr inbounds ([6 x i8], [6 x i8]* @OBJC_METH_VAR_NAME_, i32 0, i32 0), i8* bitcast (i8** @OBJC_SELECTOR_REFERENCES_ to i8*), i8* getelementptr inbounds ([14 x i8], [14 x i8]* @OBJC_METH_VAR_NAME_.1, i32 0, i32 0), i8* bitcast (i8** @OBJC_SELECTOR_REFERENCES_.2 to i8*), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @OBJC_METH_VAR_NAME_.3, i32 0, i32 0), i8* bitcast (i8** @OBJC_SELECTOR_REFERENCES_.4 to i8*), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @OBJC_CLASS_NAME_, i32 0, i32 0), i8* getelementptr inbounds ([12 x i8], [12 x i8]* @OBJC_METH_VAR_NAME_.5, i32 0, i32 0), i8* getelementptr inbounds ([21 x i8], [21 x i8]* @OBJC_METH_VAR_TYPE_, i32 0, i32 0), i8* bitcast ({ i32, i32, [1 x %struct._objc_method] }* @"\01l_OBJC_$_CLASS_METHODS_MyObject" to i8*), i8* bitcast ([1 x i8*]* @"OBJC_LABEL_CLASS_$" to i8*)], section "llvm.metadata"
+  @llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 1, void ()* @asan.module_ctor, i8* null }]
+  @__asan_shadow_memory_dynamic_address = external global i64
+  @___asan_gen_ = private unnamed_addr constant [34 x i8] c"2 32 16 9 imageSize 64 8 6 object\00", align 1
+  @__stack_chk_guard = external global i8*
+  
+  ; Function Attrs: noinline sanitize_address ssp uwtable
+  define internal i8* @"\01+[MyObject doWithSize:]"(i8* %self, i8* %_cmd, [2 x double] %imageSize.coerce) #0 !dbg !14 {
+  entry:
+    %StackGuardSlot = alloca i8*
+    %0 = call i8* @llvm.stackguard()
+    call void @llvm.stackprotector(i8* %0, i8** %StackGuardSlot)
+    %1 = load i64, i64* @__asan_shadow_memory_dynamic_address
+    %self.addr = alloca i8*, align 8
+    %_cmd.addr = alloca i8*, align 8
+    %MyAlloca = alloca [96 x i8], align 32, !dbg !35
+    %2 = ptrtoint [96 x i8]* %MyAlloca to i64, !dbg !35
+    %3 = add i64 %2, 32, !dbg !35
+    %4 = inttoptr i64 %3 to %struct.CGSize*, !dbg !35
+    %5 = add i64 %2, 64, !dbg !35
+    %6 = inttoptr i64 %5 to %0**, !dbg !35
+    %7 = inttoptr i64 %2 to i64*, !dbg !35
+    store i64 1102416563, i64* %7, !dbg !35
+    %8 = add i64 %2, 8, !dbg !35
+    %9 = inttoptr i64 %8 to i64*, !dbg !35
+    store i64 ptrtoint ([34 x i8]* @___asan_gen_ to i64), i64* %9, !dbg !35
+    %10 = add i64 %2, 16, !dbg !35
+    %11 = inttoptr i64 %10 to i64*, !dbg !35
+    store i64 ptrtoint (i8* (i8*, i8*, [2 x double])* @"\01+[MyObject doWithSize:]" to i64), i64* %11, !dbg !35
+    %12 = lshr i64 %2, 3, !dbg !35
+    %13 = add i64 %12, %1, !dbg !35
+    %14 = add i64 %13, 0, !dbg !35
+    %15 = inttoptr i64 %14 to i64*, !dbg !35
+    store i64 -940689368107847183, i64* %15, align 1, !dbg !35
+    %16 = add i64 %13, 9, !dbg !35
+    %17 = inttoptr i64 %16 to i16*, !dbg !35
+    store i16 -3085, i16* %17, align 1, !dbg !35
+    %18 = add i64 %13, 11, !dbg !35
+    %19 = inttoptr i64 %18 to i8*, !dbg !35
+    store i8 -13, i8* %19, align 1, !dbg !35
+    call void @llvm.dbg.declare(metadata %struct.CGSize* %4, metadata !36, metadata !DIExpression(DW_OP_deref)), !dbg !37
+    call void @llvm.dbg.declare(metadata %0** %6, metadata !38, metadata !DIExpression(DW_OP_deref)), !dbg !44
+    %20 = bitcast %struct.CGSize* %4 to [2 x double]*
+    %21 = ptrtoint [2 x double]* %20 to i64
+    %22 = lshr i64 %21, 3
+    %23 = add i64 %22, %1
+    %24 = inttoptr i64 %23 to i16*
+    %25 = load i16, i16* %24
+    %26 = icmp ne i16 %25, 0
+    br i1 %26, label %27, label %28
+  
+  ; <label>:27:                                     ; preds = %entry
+    call void @__asan_report_store16(i64 %21)
+    call void asm sideeffect "", ""()
+    unreachable
+  
+  ; <label>:28:                                     ; preds = %entry
+    store [2 x double] %imageSize.coerce, [2 x double]* %20, align 8
+    store i8* %self, i8** %self.addr, align 8
+    call void @llvm.dbg.declare(metadata i8** %self.addr, metadata !45, metadata !DIExpression()), !dbg !47
+    store i8* %_cmd, i8** %_cmd.addr, align 8
+    call void @llvm.dbg.declare(metadata i8** %_cmd.addr, metadata !48, metadata !DIExpression()), !dbg !47
+    %29 = load %struct._class_t*, %struct._class_t** @"OBJC_CLASSLIST_REFERENCES_$_", align 8, !dbg !50
+    %30 = add i64 lshr (i64 ptrtoint (i8** @OBJC_SELECTOR_REFERENCES_ to i64), i64 3), %1, !dbg !50
+    %31 = inttoptr i64 %30 to i8*, !dbg !50
+    %32 = load i8, i8* %31, !dbg !50
+    %33 = icmp ne i8 %32, 0, !dbg !50
+    br i1 %33, label %34, label %35, !dbg !50
+  
+  ; <label>:34:                                     ; preds = %28
+    call void @__asan_report_load8(i64 ptrtoint (i8** @OBJC_SELECTOR_REFERENCES_ to i64)), !dbg !50
+    call void asm sideeffect "", ""(), !dbg !50
+    unreachable, !dbg !50
+  
+  ; <label>:35:                                     ; preds = %28
+    %36 = load i8*, i8** @OBJC_SELECTOR_REFERENCES_, align 8, !dbg !50, !invariant.load !2
+    %37 = bitcast %struct._class_t* %29 to i8*, !dbg !50
+    %call = call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* %37, i8* %36), !dbg !50
+    %38 = bitcast i8* %call to %0*, !dbg !50
+    %39 = add i64 lshr (i64 ptrtoint (i8** @OBJC_SELECTOR_REFERENCES_.2 to i64), i64 3), %1, !dbg !51
+    %40 = inttoptr i64 %39 to i8*, !dbg !51
+    %41 = load i8, i8* %40, !dbg !51
+    %42 = icmp ne i8 %41, 0, !dbg !51
+    br i1 %42, label %43, label %44, !dbg !51
+  
+  ; <label>:43:                                     ; preds = %35
+    call void @__asan_report_load8(i64 ptrtoint (i8** @OBJC_SELECTOR_REFERENCES_.2 to i64)), !dbg !51
+    call void asm sideeffect "", ""(), !dbg !51
+    unreachable, !dbg !51
+  
+  ; <label>:44:                                     ; preds = %35
+    %45 = load i8*, i8** @OBJC_SELECTOR_REFERENCES_.2, align 8, !dbg !51, !invariant.load !2
+    %46 = bitcast %0* %38 to i8*, !dbg !51
+    %47 = bitcast %struct.CGSize* %4 to [2 x double]*, !dbg !51
+    %48 = ptrtoint [2 x double]* %47 to i64, !dbg !51
+    %49 = lshr i64 %48, 3, !dbg !51
+    %50 = add i64 %49, %1, !dbg !51
+    %51 = inttoptr i64 %50 to i16*, !dbg !51
+    %52 = load i16, i16* %51, !dbg !51
+    %53 = icmp ne i16 %52, 0, !dbg !51
+    br i1 %53, label %54, label %55, !dbg !51
+  
+  ; <label>:54:                                     ; preds = %44
+    call void @__asan_report_load16(i64 %48), !dbg !51
+    call void asm sideeffect "", ""(), !dbg !51
+    unreachable, !dbg !51
+  
+  ; <label>:55:                                     ; preds = %44
+    %56 = load [2 x double], [2 x double]* %47, align 8, !dbg !51
+    %call1 = call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, [2 x double])*)(i8* %46, i8* %45, [2 x double] %56), !dbg !51
+    %57 = bitcast i8* %call1 to %0*, !dbg !51
+    %58 = ptrtoint %0** %6 to i64, !dbg !44
+    %59 = lshr i64 %58, 3, !dbg !44
+    %60 = add i64 %59, %1, !dbg !44
+    %61 = inttoptr i64 %60 to i8*, !dbg !44
+    %62 = load i8, i8* %61, !dbg !44
+    %63 = icmp ne i8 %62, 0, !dbg !44
+    br i1 %63, label %64, label %65, !dbg !44
+  
+  ; <label>:64:                                     ; preds = %55
+    call void @__asan_report_store8(i64 %58), !dbg !44
+    call void asm sideeffect "", ""(), !dbg !44
+    unreachable, !dbg !44
+  
+  ; <label>:65:                                     ; preds = %55
+    store %0* %57, %0** %6, align 8, !dbg !44
+    %66 = load %0*, %0** %6, align 8, !dbg !52
+    %67 = add i64 lshr (i64 ptrtoint (i8** @OBJC_SELECTOR_REFERENCES_.4 to i64), i64 3), %1, !dbg !53
+    %68 = inttoptr i64 %67 to i8*, !dbg !53
+    %69 = load i8, i8* %68, !dbg !53
+    %70 = icmp ne i8 %69, 0, !dbg !53
+    br i1 %70, label %71, label %72, !dbg !53
+  
+  ; <label>:71:                                     ; preds = %65
+    call void @__asan_report_load8(i64 ptrtoint (i8** @OBJC_SELECTOR_REFERENCES_.4 to i64)), !dbg !53
+    call void asm sideeffect "", ""(), !dbg !53
+    unreachable, !dbg !53
+  
+  ; <label>:72:                                     ; preds = %65
+    %73 = load i8*, i8** @OBJC_SELECTOR_REFERENCES_.4, align 8, !dbg !53, !invariant.load !2
+    %74 = bitcast %0* %66 to i8*, !dbg !53
+    %call2 = call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* %74, i8* %73), !dbg !53
+    call void asm sideeffect "mov\09fp, fp\09\09; marker for objc_retainAutoreleaseReturnValue", ""(), !dbg !53
+    %75 = call i8* @objc_retainAutoreleasedReturnValue(i8* %call2) #3, !dbg !53
+    %76 = bitcast %0** %6 to i8**, !dbg !54
+    call void @objc_storeStrong(i8** %76, i8* null) #3, !dbg !54
+    %77 = tail call i8* @objc_autoreleaseReturnValue(i8* %75) #3, !dbg !54
+    store i64 1172321806, i64* %7, !dbg !54
+    %78 = add i64 %13, 0, !dbg !54
+    %79 = inttoptr i64 %78 to i64*, !dbg !54
+    store i64 0, i64* %79, align 1, !dbg !54
+    %80 = add i64 %13, 9, !dbg !54
+    %81 = inttoptr i64 %80 to i16*, !dbg !54
+    store i16 0, i16* %81, align 1, !dbg !54
+    %82 = add i64 %13, 11, !dbg !54
+    %83 = inttoptr i64 %82 to i8*, !dbg !54
+    store i8 0, i8* %83, align 1, !dbg !54
+    %84 = call i8* @llvm.stackguard()
+    %85 = load volatile i8*, i8** %StackGuardSlot
+    %86 = icmp eq i8* %84, %85
+    br i1 %86, label %SP_return, label %CallStackCheckFailBlk, !prof !55
+  
+  SP_return:                                        ; preds = %72
+    ret i8* %77, !dbg !54
+  
+  CallStackCheckFailBlk:                            ; preds = %72
+    call void @__stack_chk_fail(), !dbg !47
+    unreachable, !dbg !47
+  }
+  
+  ; Function Attrs: nounwind readnone speculatable
+  declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+  
+  ; Function Attrs: nonlazybind
+  declare i8* @objc_msgSend(i8*, i8*, ...) #2
+  
+  declare i8* @objc_retainAutoreleasedReturnValue(i8* returned)
+  
+  declare void @objc_storeStrong(i8**, i8*)
+  
+  declare i8* @objc_autoreleaseReturnValue(i8* returned)
+  
+  define internal void @asan.module_ctor() {
+    call void @__asan_init()
+    call void @__asan_version_mismatch_check_v8()
+    ret void
+  }
+  
+  declare void @__asan_init()
+  
+  declare void @__asan_version_mismatch_check_v8()
+  
+  declare void @__asan_report_load8(i64)
+  
+  declare void @__asan_report_load16(i64)
+  
+  declare void @__asan_report_store8(i64)
+  
+  declare void @__asan_report_store16(i64)
+  
+  ; Function Attrs: nounwind
+  declare void @llvm.stackprotector(i8*, i8**) #3
+  
+  ; Function Attrs: nounwind
+  declare i8* @llvm.stackguard() #3
+  
+  declare void @__stack_chk_fail()
+  
+  attributes #0 = { noinline sanitize_address ssp uwtable }
+  attributes #1 = { nounwind readnone speculatable }
+  attributes #2 = { nonlazybind }
+  attributes #3 = { nounwind }
+  
+  !llvm.dbg.cu = !{!0}
+  !llvm.module.flags = !{!5, !6, !7, !8, !9, !10, !11, !12}
+  !llvm.ident = !{!13}
+  
+  !0 = distinct !DICompileUnit(language: DW_LANG_ObjC, file: !1, producer: "clang version 5.0.0 (trunk 295779) (llvm/trunk 295777)", isOptimized: false, runtimeVersion: 2, emissionKind: FullDebug, enums: !2, retainedTypes: !3)
+  !1 = !DIFile(filename: "m.m", directory: "/")
+  !2 = !{}
+  !3 = !{!4}
+  !4 = !DICompositeType(tag: DW_TAG_structure_type, name: "MyObject", scope: !1, file: !1, line: 15, flags: DIFlagObjcClassComplete, elements: !2, runtimeLang: DW_LANG_ObjC)
+  !5 = !{i32 1, !"Objective-C Version", i32 2}
+  !6 = !{i32 1, !"Objective-C Image Info Version", i32 0}
+  !7 = !{i32 1, !"Objective-C Image Info Section", !"__DATA,__objc_imageinfo,regular,no_dead_strip"}
+  !8 = !{i32 4, !"Objective-C Garbage Collection", i32 0}
+  !9 = !{i32 1, !"Objective-C Class Properties", i32 64}
+  !10 = !{i32 2, !"Dwarf Version", i32 2}
+  !11 = !{i32 2, !"Debug Info Version", i32 3}
+  !12 = !{i32 7, !"PIC Level", i32 2}
+  !13 = !{!"clang version 5.0.0 (trunk 295779) (llvm/trunk 295777)"}
+  !14 = distinct !DISubprogram(name: "+[MyObject doWithSize:]", scope: !1, file: !1, line: 16, type: !15, isLocal: true, isDefinition: true, scopeLine: 16, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
+  !15 = !DISubroutineType(types: !16)
+  !16 = !{!17, !24, !26, !29}
+  !17 = !DIDerivedType(tag: DW_TAG_typedef, name: "id", file: !1, baseType: !18)
+  !18 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !19, size: 64)
+  !19 = !DICompositeType(tag: DW_TAG_structure_type, name: "objc_object", file: !1, elements: !20)
+  !20 = !{!21}
+  !21 = !DIDerivedType(tag: DW_TAG_member, name: "isa", scope: !19, file: !1, baseType: !22, size: 64)
+  !22 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !23, size: 64)
+  !23 = !DICompositeType(tag: DW_TAG_structure_type, name: "objc_class", file: !1, flags: DIFlagFwdDecl)
+  !24 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !25, flags: DIFlagArtificial | DIFlagObjectPointer)
+  !25 = !DIDerivedType(tag: DW_TAG_typedef, name: "Class", file: !1, baseType: !22)
+  !26 = !DIDerivedType(tag: DW_TAG_typedef, name: "SEL", file: !1, baseType: !27, flags: DIFlagArtificial)
+  !27 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !28, size: 64)
+  !28 = !DICompositeType(tag: DW_TAG_structure_type, name: "objc_selector", file: !1, flags: DIFlagFwdDecl)
+  !29 = !DIDerivedType(tag: DW_TAG_typedef, name: "CGSize", file: !1, line: 10, baseType: !30)
+  !30 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "CGSize", file: !1, line: 6, size: 128, elements: !31)
+  !31 = !{!32, !34}
+  !32 = !DIDerivedType(tag: DW_TAG_member, name: "width", scope: !30, file: !1, line: 7, baseType: !33, size: 64)
+  !33 = !DIBasicType(name: "double", size: 64, encoding: DW_ATE_float)
+  !34 = !DIDerivedType(tag: DW_TAG_member, name: "height", scope: !30, file: !1, line: 8, baseType: !33, size: 64, offset: 64)
+  !35 = !DILocation(line: 16, scope: !14)
+  !36 = !DILocalVariable(name: "imageSize", arg: 3, scope: !14, file: !1, line: 16, type: !29)
+  !37 = !DILocation(line: 16, column: 26, scope: !14)
+  !38 = !DILocalVariable(name: "object", scope: !14, file: !1, line: 17, type: !39)
+  !39 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !40, size: 64)
+  !40 = !DICompositeType(tag: DW_TAG_structure_type, name: "Object", scope: !1, file: !1, line: 11, elements: !41, runtimeLang: DW_LANG_ObjC)
+  !41 = !{!42}
+  !42 = !DIDerivedType(tag: DW_TAG_inheritance, scope: !40, baseType: !43)
+  !43 = !DICompositeType(tag: DW_TAG_structure_type, name: "NSObject", scope: !1, file: !1, line: 3, elements: !2, runtimeLang: DW_LANG_ObjC)
+  !44 = !DILocation(line: 17, column: 11, scope: !14)
+  !45 = !DILocalVariable(name: "self", arg: 1, scope: !14, type: !46, flags: DIFlagArtificial | DIFlagObjectPointer)
+  !46 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !25)
+  !47 = !DILocation(line: 0, scope: !14)
+  !48 = !DILocalVariable(name: "_cmd", arg: 2, scope: !14, type: !49, flags: DIFlagArtificial)
+  !49 = !DIDerivedType(tag: DW_TAG_typedef, name: "SEL", file: !1, baseType: !27)
+  !50 = !DILocation(line: 17, column: 21, scope: !14)
+  !51 = !DILocation(line: 17, column: 20, scope: !14)
+  !52 = !DILocation(line: 18, column: 11, scope: !14)
+  !53 = !DILocation(line: 18, column: 10, scope: !14)
+  !54 = !DILocation(line: 19, column: 1, scope: !14)
+  !55 = !{!"branch_weights", i32 2147481600, i32 2048}
+
+...
+---
+name:            "\x01+[MyObject doWithSize:]"
+alignment:       2
+tracksRegLiveness: true
+liveins:         
+  - { reg: '$x0' }
+  - { reg: '$x1' }
+  - { reg: '$d0' }
+  - { reg: '$d1' }
+frameInfo:       
+  stackSize:       352
+  maxAlignment:    32
+  adjustsStack:    true
+  hasCalls:        true
+  stackProtector:  '%stack.0.StackGuardSlot'
+  maxCallFrameSize: 0
+  localFrameSize:  144
+stack:           
+  - { id: 0, name: StackGuardSlot, offset: -40, size: 8, alignment: 8, 
+      stack-id: 0, local-offset: -8 }
+  - { id: 1, name: self.addr, offset: -168, size: 8, alignment: 8, stack-id: 0, 
+      local-offset: -136, debug-info-variable: '!45', debug-info-expression: '!DIExpression()', 
+      debug-info-location: '!47' }
+  - { id: 2, name: _cmd.addr, offset: -176, size: 8, alignment: 8, stack-id: 0, 
+      local-offset: -144, debug-info-variable: '!48', debug-info-expression: '!DIExpression()', 
+      debug-info-location: '!47' }
+  - { id: 3, name: MyAlloca, offset: -160, size: 96, alignment: 32, stack-id: 0, 
+      local-offset: -128 }
+  - { id: 4, type: spill-slot, offset: -184, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 5, type: spill-slot, offset: -192, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 6, type: spill-slot, offset: -200, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 7, type: spill-slot, offset: -208, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 8, type: spill-slot, offset: -216, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 9, type: spill-slot, offset: -224, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 10, type: spill-slot, offset: -232, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 11, type: spill-slot, offset: -240, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 12, type: spill-slot, offset: -248, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 13, type: spill-slot, offset: -256, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 14, type: spill-slot, offset: -264, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 15, type: spill-slot, offset: -272, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 16, type: spill-slot, offset: -280, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 17, type: spill-slot, offset: -288, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 18, type: spill-slot, offset: -296, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 19, type: spill-slot, offset: -304, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 20, type: spill-slot, offset: -312, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 21, type: spill-slot, offset: -320, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 22, type: spill-slot, offset: -328, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 23, type: spill-slot, offset: -336, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 24, type: spill-slot, offset: -8, size: 8, alignment: 8, stack-id: 0, 
+      callee-saved-register: '$lr' }
+  - { id: 25, type: spill-slot, offset: -16, size: 8, alignment: 8, stack-id: 0, 
+      callee-saved-register: '$fp' }
+  - { id: 26, type: spill-slot, offset: -24, size: 8, alignment: 8, stack-id: 0, 
+      callee-saved-register: '$x27' }
+  - { id: 27, type: spill-slot, offset: -32, size: 8, alignment: 8, stack-id: 0, 
+      callee-saved-register: '$x28' }
+body:             |
+  bb.0.entry:
+    liveins: $x0, $x1, $d0, $d1, $x27, $x28, $lr
+  
+    early-clobber $sp = frame-setup STPXpre killed $x28, killed $x27, $sp, -4 :: (store 8 into %stack.27), (store 8 into %stack.26)
+    frame-setup STPXi killed $fp, killed $lr, $sp, 2 :: (store 8 into %stack.25), (store 8 into %stack.24)
+    $fp = frame-setup ADDXri $sp, 16, 0
+    $x9 = frame-setup SUBXri $sp, 320, 0
+    $sp = ANDXri killed $x9, 7930
+    frame-setup CFI_INSTRUCTION def_cfa $w29, 16
+    frame-setup CFI_INSTRUCTION offset $w30, -8
+    frame-setup CFI_INSTRUCTION offset $w29, -16
+    frame-setup CFI_INSTRUCTION offset $w27, -24
+    frame-setup CFI_INSTRUCTION offset $w28, -32
+    renamable $x8 = ADRP target-flags(aarch64-page) @"\01+[MyObject doWithSize:]"
+    renamable $x8 = ADDXri killed renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @"\01+[MyObject doWithSize:]", 0
+    renamable $x9 = ADRP target-flags(aarch64-page) @___asan_gen_
+    renamable $x9 = ADDXri killed renamable $x9, target-flags(aarch64-pageoff, aarch64-nc) @___asan_gen_, 0
+    $x10 = ADDXri $sp, 192, 0
+    renamable $x11 = ADRP target-flags(aarch64-page, aarch64-got) @__asan_shadow_memory_dynamic_address
+    renamable $x11 = LDRXui killed renamable $x11, target-flags(aarch64-pageoff, aarch64-got, aarch64-nc) @__asan_shadow_memory_dynamic_address
+    $x12 = ADRP target-flags(aarch64-page, aarch64-got) @__stack_chk_guard
+    $x12 = LDRXui $x12, target-flags(aarch64-pageoff, aarch64-got, aarch64-nc) @__stack_chk_guard
+    $x12 = LDRXui killed $x12, 0 :: (dereferenceable invariant load 8 from @__stack_chk_guard)
+    $x13 = ADRP target-flags(aarch64-page, aarch64-got) @__stack_chk_guard
+    $x13 = LDRXui $x13, target-flags(aarch64-pageoff, aarch64-got, aarch64-nc) @__stack_chk_guard
+    $x13 = LDRXui killed $x13, 0 :: (dereferenceable invariant load 8 from @__stack_chk_guard)
+    STRXui killed renamable $x13, $sp, 39 :: (volatile store 8 into %stack.0.StackGuardSlot)
+    renamable $x11 = LDRXui killed renamable $x11, 0 :: (load 8 from @__asan_shadow_memory_dynamic_address)
+    renamable $x13 = ADDXri renamable $x10, 32, 0, debug-location !35
+    renamable $x14 = ADDXri renamable $x10, 64, 0, debug-location !35
+    $x15 = MOVZXi 35507, 0, debug-location !35
+    $x15 = MOVKXi $x15, 16821, 16, debug-location !35
+    STRXui killed renamable $x15, $sp, 24, debug-location !35 :: (store 96 into %stack.3.MyAlloca, align 32)
+    STRXui killed renamable $x9, $sp, 25, debug-location !35 :: (store 96 into %stack.3.MyAlloca + 1, align 32)
+    STRXui killed renamable $x8, $sp, 26, debug-location !35 :: (store 96 into %stack.3.MyAlloca + 2, align 32)
+    renamable $x8 = UBFMXri renamable $x10, 3, 63, debug-location !35
+    renamable $x9 = ADDXrs renamable $x11, renamable $x10, 67, debug-location !35
+    $x15 = MOVZXi 61937, 0, debug-location !35
+    $x15 = MOVKXi $x15, 61937, 16, debug-location !35
+    $x15 = MOVKXi $x15, 62194, 48, debug-location !35
+    STRXroX killed renamable $x15, renamable $x8, renamable $x11, 0, 0, debug-location !35 :: (store 8 into %ir.15, align 1)
+    renamable $x15 = ADDXrs renamable $x8, renamable $x11, 0, debug-location !35
+    $w16 = MOVZWi 62451, 0, debug-location !35
+    STURHHi killed renamable $w16, killed renamable $x15, 9, debug-location !35 :: (store 2 into %ir.17, align 1)
+    renamable $x8 = ADDXrs killed renamable $x8, renamable $x11, 0, debug-location !35
+    $w16 = MOVZWi 243, 0, debug-location !35
+    STRBBui killed renamable $w16, killed renamable $x8, 11, debug-location !35 :: (store 1 into %ir.19)
+    DBG_VALUE renamable $x13, 0, !36, !DIExpression(DW_OP_deref), debug-location !37
+    DBG_VALUE renamable $x14, 0, !38, !DIExpression(DW_OP_deref), debug-location !44
+    $x8 = ORRXrs $xzr, $x13, 0
+    renamable $x15 = UBFMXri renamable $x8, 3, 63
+    renamable $w16 = LDRHHroX killed renamable $x15, renamable $x11, 0, 0 :: (load 2 from %ir.24)
+    renamable $w16 = UBFMWri killed renamable $w16, 0, 15
+    STRXui killed $x1, $sp, 21 :: (store 8 into %stack.4)
+    STRDui killed $d1, $sp, 20 :: (store 8 into %stack.5)
+    STRDui killed $d0, $sp, 19 :: (store 8 into %stack.6)
+    STRXui killed $x0, $sp, 18 :: (store 8 into %stack.7)
+    STRXui killed $x13, $sp, 17 :: (store 8 into %stack.8)
+    DBG_VALUE $sp, 0, !36, !DIExpression(DW_OP_plus_uconst, 136, DW_OP_deref, DW_OP_deref), debug-location !37
+    STRXui killed $x14, $sp, 16 :: (store 8 into %stack.9)
+    DBG_VALUE $sp, 0, !38, !DIExpression(DW_OP_plus_uconst, 128, DW_OP_deref, DW_OP_deref), debug-location !44
+    STRXui killed $x10, $sp, 15 :: (store 8 into %stack.10)
+    STRXui killed $x12, $sp, 14 :: (store 8 into %stack.11)
+    STRXui killed $x11, $sp, 13 :: (store 8 into %stack.12)
+    STRXui killed $x9, $sp, 12 :: (store 8 into %stack.13)
+    STRXui killed $x8, $sp, 11 :: (store 8 into %stack.14)
+    CBZW killed renamable $w16, %bb.2
+  
+  bb.1 (%ir-block.27):
+    successors: 
+  
+    $x0 = LDRXui $sp, 11 :: (load 8 from %stack.14)
+    BL @__asan_report_store16, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit-def $sp
+    INLINEASM &"", 1
+    BRK 1
+  
+  bb.2 (%ir-block.28):
+    renamable $x8 = ADRP target-flags(aarch64-page) @OBJC_SELECTOR_REFERENCES_
+    renamable $x8 = ADDXri killed renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @OBJC_SELECTOR_REFERENCES_, 0
+    renamable $x9 = ORRXri $xzr, 4097
+    renamable $x8 = LSRVXr killed renamable $x8, killed renamable $x9
+    renamable $x9 = ADRP target-flags(aarch64-page) @"OBJC_CLASSLIST_REFERENCES_$_"
+    renamable $x9 = ADDXri killed renamable $x9, target-flags(aarch64-pageoff, aarch64-nc) @"OBJC_CLASSLIST_REFERENCES_$_", 0
+    $d0 = LDRDui $sp, 19 :: (load 8 from %stack.6)
+    $x10 = LDRXui $sp, 11 :: (load 8 from %stack.14)
+    STRDui killed renamable $d0, renamable $x10, 0 :: (store 8 into %ir.20)
+    $d1 = LDRDui $sp, 20 :: (load 8 from %stack.5)
+    STRDui killed renamable $d1, killed renamable $x10, 1 :: (store 8 into %ir.20 + 8)
+    $x11 = LDRXui $sp, 18 :: (load 8 from %stack.7)
+    STRXui killed renamable $x11, $sp, 23 :: (store 8 into %stack.1.self.addr)
+    $x12 = LDRXui $sp, 21 :: (load 8 from %stack.4)
+    STRXui killed renamable $x12, $sp, 22 :: (store 8 into %stack.2._cmd.addr)
+    renamable $x9 = LDRXui killed renamable $x9, 0, debug-location !50 :: (load 8 from @"OBJC_CLASSLIST_REFERENCES_$_")
+    $x13 = LDRXui $sp, 13 :: (load 8 from %stack.12)
+    renamable $w14 = LDRBBroX killed renamable $x8, killed renamable $x13, 0, 0, debug-location !50 :: (load 1 from %ir.31)
+    renamable $w14 = UBFMWri killed renamable $w14, 0, 7, debug-location !50
+    STRXui killed $x9, $sp, 10 :: (store 8 into %stack.15)
+    CBZW killed renamable $w14, %bb.4, debug-location !50
+  
+  bb.3 (%ir-block.34):
+    successors: 
+  
+    $x0 = ADRP target-flags(aarch64-page) @OBJC_SELECTOR_REFERENCES_, debug-location !50
+    renamable $x0 = ADDXri $x0, target-flags(aarch64-pageoff, aarch64-nc) @OBJC_SELECTOR_REFERENCES_, 0, debug-location !50
+    BL @__asan_report_load8, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit-def $sp, debug-location !50
+    INLINEASM &"", 1, debug-location !50
+    BRK 1, debug-location !50
+  
+  bb.4 (%ir-block.35):
+    renamable $x8 = ADRP target-flags(aarch64-page) @OBJC_SELECTOR_REFERENCES_
+    renamable $x8 = ADDXri killed renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @OBJC_SELECTOR_REFERENCES_, 0
+    renamable $x1 = LDRXui killed renamable $x8, 0, debug-location !50 :: (invariant load 8 from @OBJC_SELECTOR_REFERENCES_)
+    $x8 = LDRXui $sp, 10 :: (load 8 from %stack.15)
+    $x0 = ORRXrs $xzr, killed $x8, 0, debug-location !50
+    BL @objc_msgSend, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit killed $x1, implicit-def $x0, debug-location !50
+    renamable $x8 = ADRP target-flags(aarch64-page) @OBJC_SELECTOR_REFERENCES_.2
+    renamable $x8 = ADDXri killed renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @OBJC_SELECTOR_REFERENCES_.2, 0
+    renamable $x1 = ORRXri $xzr, 4097
+    renamable $x8 = LSRVXr killed renamable $x8, killed renamable $x1
+    $x1 = LDRXui $sp, 13 :: (load 8 from %stack.12)
+    renamable $w9 = LDRBBroX killed renamable $x8, killed renamable $x1, 0, 0, debug-location !51 :: (load 1 from %ir.40)
+    renamable $w9 = UBFMWri killed renamable $w9, 0, 7, debug-location !51
+    STRXui killed $x0, $sp, 9 :: (store 8 into %stack.16)
+    CBZW killed renamable $w9, %bb.6, debug-location !51
+  
+  bb.5 (%ir-block.43):
+    successors: 
+  
+    $x0 = ADRP target-flags(aarch64-page) @OBJC_SELECTOR_REFERENCES_.2, debug-location !51
+    renamable $x0 = ADDXri $x0, target-flags(aarch64-pageoff, aarch64-nc) @OBJC_SELECTOR_REFERENCES_.2, 0, debug-location !51
+    BL @__asan_report_load8, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit-def $sp, debug-location !51
+    INLINEASM &"", 1, debug-location !51
+    BRK 1, debug-location !51
+  
+  bb.6 (%ir-block.44):
+    renamable $x8 = ADRP target-flags(aarch64-page) @OBJC_SELECTOR_REFERENCES_.2
+    renamable $x8 = ADDXri killed renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @OBJC_SELECTOR_REFERENCES_.2, 0
+    renamable $x1 = LDRXui killed renamable $x8, 0, debug-location !51 :: (invariant load 8 from @OBJC_SELECTOR_REFERENCES_.2)
+    $x8 = LDRXui $sp, 9 :: (load 8 from %stack.16)
+    $x9 = LDRXui $sp, 17 :: (load 8 from %stack.8)
+    renamable $x10 = UBFMXri renamable $x9, 3, 63, debug-location !51
+    $x11 = LDRXui $sp, 13 :: (load 8 from %stack.12)
+    renamable $w12 = LDRHHroX killed renamable $x10, killed renamable $x11, 0, 0, debug-location !51 :: (load 2 from %ir.51)
+    renamable $w12 = UBFMWri killed renamable $w12, 0, 15, debug-location !51
+    STRXui killed $x1, $sp, 8 :: (store 8 into %stack.17)
+    STRXui killed $x8, $sp, 7 :: (store 8 into %stack.18)
+    STRXui killed $x9, $sp, 6 :: (store 8 into %stack.19)
+    CBZW killed renamable $w12, %bb.8, debug-location !51
+  
+  bb.7 (%ir-block.54):
+    successors: 
+  
+    $x0 = LDRXui $sp, 6 :: (load 8 from %stack.19)
+    BL @__asan_report_load16, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit-def $sp, debug-location !51
+    INLINEASM &"", 1, debug-location !51
+    BRK 1, debug-location !51
+  
+  bb.8 (%ir-block.55):
+    $x8 = LDRXui $sp, 6 :: (load 8 from %stack.19)
+    renamable $d1 = LDRDui renamable $x8, 1, debug-location !51 :: (load 8 from %ir.47 + 8)
+    renamable $d0 = LDRDui killed renamable $x8, 0, debug-location !51 :: (load 8 from %ir.47)
+    $x0 = LDRXui $sp, 7 :: (load 8 from %stack.18)
+    $x1 = LDRXui $sp, 8 :: (load 8 from %stack.17)
+    BL @objc_msgSend, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit killed $x1, implicit killed $d0, implicit killed $d1, implicit-def $sp, implicit-def $x0, debug-location !51
+    $x8 = LDRXui $sp, 16 :: (load 8 from %stack.9)
+    renamable $x1 = UBFMXri killed renamable $x8, 3, 63, debug-location !44
+    $lr = LDRXui $sp, 13 :: (load 8 from %stack.12)
+    renamable $w9 = LDRBBroX killed renamable $x1, killed renamable $lr, 0, 0, debug-location !44 :: (load 1 from %ir.61)
+    renamable $w9 = UBFMWri killed renamable $w9, 0, 7, debug-location !44
+    STRXui killed $x0, $sp, 5 :: (store 8 into %stack.20)
+    CBZW killed renamable $w9, %bb.10, debug-location !44
+  
+  bb.9 (%ir-block.64):
+    successors: 
+  
+    $x0 = LDRXui $sp, 16 :: (load 8 from %stack.9)
+    BL @__asan_report_store8, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit-def $sp, debug-location !44
+    INLINEASM &"", 1, debug-location !44
+    BRK 1, debug-location !44
+  
+  bb.10 (%ir-block.65):
+    renamable $x8 = ADRP target-flags(aarch64-page) @OBJC_SELECTOR_REFERENCES_.4
+    renamable $x8 = ADDXri killed renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @OBJC_SELECTOR_REFERENCES_.4, 0
+    renamable $x9 = ORRXri $xzr, 4097
+    renamable $x8 = LSRVXr killed renamable $x8, killed renamable $x9
+    $x9 = LDRXui $sp, 5 :: (load 8 from %stack.20)
+    $x10 = LDRXui $sp, 16 :: (load 8 from %stack.9)
+    STRXui killed renamable $x9, renamable $x10, 0, debug-location !44 :: (store 8 into %ir.6)
+    renamable $x11 = LDRXui killed renamable $x10, 0, debug-location !52 :: (load 8 from %ir.6)
+    $x12 = LDRXui $sp, 13 :: (load 8 from %stack.12)
+    renamable $w13 = LDRBBroX killed renamable $x8, killed renamable $x12, 0, 0, debug-location !53 :: (load 1 from %ir.68)
+    renamable $w13 = UBFMWri killed renamable $w13, 0, 7, debug-location !53
+    STRXui killed $x11, $sp, 4 :: (store 8 into %stack.21)
+    CBZW killed renamable $w13, %bb.12, debug-location !53
+  
+  bb.11 (%ir-block.71):
+    successors: 
+  
+    $x0 = ADRP target-flags(aarch64-page) @OBJC_SELECTOR_REFERENCES_.4, debug-location !53
+    renamable $x0 = ADDXri $x0, target-flags(aarch64-pageoff, aarch64-nc) @OBJC_SELECTOR_REFERENCES_.4, 0, debug-location !53
+    BL @__asan_report_load8, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit-def $sp, debug-location !53
+    INLINEASM &"", 1, debug-location !53
+    BRK 1, debug-location !53
+  
+  bb.12 (%ir-block.72):
+    renamable $x8 = ADRP target-flags(aarch64-page) @OBJC_SELECTOR_REFERENCES_.4
+    renamable $x8 = ADDXri killed renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @OBJC_SELECTOR_REFERENCES_.4, 0
+    renamable $x1 = LDRXui killed renamable $x8, 0, debug-location !53 :: (invariant load 8 from @OBJC_SELECTOR_REFERENCES_.4)
+    $x8 = LDRXui $sp, 4 :: (load 8 from %stack.21)
+    $x0 = ORRXrs $xzr, killed $x8, 0, debug-location !53
+    BL @objc_msgSend, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit killed $x1, implicit-def $x0, debug-location !53
+    INLINEASM &"mov\09fp, fp\09\09; marker for objc_retainAutoreleaseReturnValue", 1, debug-location !53
+    BL @objc_retainAutoreleasedReturnValue, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit-def $x0, debug-location !53
+    $x8 = LDRXui $sp, 16 :: (load 8 from %stack.9)
+    STRXui killed $x0, $sp, 3 :: (store 8 into %stack.22)
+    $x0 = ORRXrs $xzr, killed $x8, 0, debug-location !54
+    $x8 = ORRXrs $xzr, killed $xzr, 0, debug-location !54
+    $x1 = ORRXrs $xzr, killed $x8, 0, debug-location !54
+    BL @objc_storeStrong, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit killed $x1, debug-location !54
+    $x0 = LDRXui $sp, 3 :: (load 8 from %stack.22)
+    BL @objc_autoreleaseReturnValue, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit-def $x0, debug-location !54
+    $x8 = MOVZXi 13838, 0, debug-location !54
+    $x8 = MOVKXi $x8, 17888, 16, debug-location !54
+    $x1 = LDRXui $sp, 15 :: (load 8 from %stack.10)
+    STRXui killed renamable $x8, killed renamable $x1, 0, debug-location !54 :: (store 8 into %ir.7)
+    $x8 = LDRXui $sp, 12 :: (load 8 from %stack.13)
+    STRXui $xzr, renamable $x8, 0, debug-location !54 :: (store 8 into %ir.79, align 1)
+    STURHHi $wzr, renamable $x8, 9, debug-location !54 :: (store 2 into %ir.81, align 1)
+    STRBBui $wzr, killed renamable $x8, 11, debug-location !54 :: (store 1 into %ir.83)
+    $lr = ADRP target-flags(aarch64-page, aarch64-got) @__stack_chk_guard
+    $lr = LDRXui $lr, target-flags(aarch64-pageoff, aarch64-got, aarch64-nc) @__stack_chk_guard
+    $lr = LDRXui killed $lr, 0 :: (dereferenceable invariant load 8 from @__stack_chk_guard)
+    renamable $x9 = LDRXui $sp, 39 :: (load 8 from %stack.0.StackGuardSlot)
+    $xzr = SUBSXrs killed renamable $lr, killed renamable $x9, 0, implicit-def $nzcv, implicit-def $nzcv
+    STRXui killed $x0, $sp, 2 :: (store 8 into %stack.23)
+    Bcc 1, %bb.14, implicit $nzcv
+  
+  bb.13.SP_return:
+    $x0 = LDRXui $sp, 2 :: (load 8 from %stack.23)
+    $sp = frame-destroy SUBXri $fp, 16, 0, debug-location !54
+    $fp, $lr = frame-destroy LDPXi $sp, 2, debug-location !54 :: (load 8 from %stack.25), (load 8 from %stack.24)
+    early-clobber $sp, $x28, $x27 = frame-destroy LDPXpost $sp, 4, debug-location !54 :: (load 8 from %stack.27), (load 8 from %stack.26)
+    RET undef $lr, implicit killed $x0, debug-location !54
+  
+  bb.14.CallStackCheckFailBlk:
+    BL @__stack_chk_fail, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, debug-location !47
+    BRK 1, debug-location !47
+
+...
+---
+name:            asan.module_ctor
+alignment:       2
+tracksRegLiveness: true
+frameInfo:       
+  stackSize:       16
+  maxAlignment:    8
+  adjustsStack:    true
+  hasCalls:        true
+  maxCallFrameSize: 0
+stack:           
+  - { id: 0, type: spill-slot, offset: -8, size: 8, alignment: 8, stack-id: 0, 
+      callee-saved-register: '$lr' }
+  - { id: 1, type: spill-slot, offset: -16, size: 8, alignment: 8, stack-id: 0, 
+      callee-saved-register: '$fp' }
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: $lr
+  
+    early-clobber $sp = frame-setup STPXpre killed $fp, killed $lr, $sp, -2 :: (store 8 into %stack.1), (store 8 into %stack.0)
+    frame-setup CFI_INSTRUCTION def_cfa_offset 16
+    frame-setup CFI_INSTRUCTION offset $w30, -8
+    frame-setup CFI_INSTRUCTION offset $w29, -16
+    BL @__asan_init, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp
+    BL @__asan_version_mismatch_check_v8, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp
+    early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 :: (load 8 from %stack.1), (load 8 from %stack.0)
+    RET undef $lr
+
+...
diff --git a/test/DebugInfo/AArch64/compiler-gen-bbs-livedebugvalues.ll b/test/DebugInfo/AArch64/compiler-gen-bbs-livedebugvalues.ll
deleted file mode 100644
index 5c2fe8447a6..00000000000
--- a/test/DebugInfo/AArch64/compiler-gen-bbs-livedebugvalues.ll
+++ /dev/null
@@ -1,66 +0,0 @@
-; RUN: llc -O0 -regalloc=fast -stop-after=livedebugvalues -o - < %s | \
-; RUN:   FileCheck %s -implicit-check-not=DBG_VALUE
-
-target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-target triple = "arm64-apple-ios12.1.0"
-
-declare void @use(i32 %x)
-
-define void @f1(i32 %x) !dbg !6 {
-; CHECK-LABEL: name: f1
-entry:
-; CHECK-LABEL: bb.0.entry:
-  %var = add i32 %x, 1, !dbg !12
-  call void @llvm.dbg.value(metadata i32 %var, metadata !9, metadata !DIExpression()), !dbg !12
-; CHECK: DBG_VALUE renamable $w0, $noreg, !9, !DIExpression(), debug-location !12
-; CHECK-NEXT: STRWui killed $w0, $sp, 3 :: (store 4 into %stack.0)
-; CHECK-NEXT: DBG_VALUE $sp, 0, !9, !DIExpression(DW_OP_plus_uconst, 12)
-
-  br label %artificial-bb-1, !dbg !13
-
-artificial-bb-1:                                  ; preds = %entry
-; CHECK-LABEL: bb.1.artificial-bb-1:
-; CHECK: DBG_VALUE $sp, 0, !9, !DIExpression(DW_OP_plus_uconst, 12)
-
-  br label %artificial-bb-2
-
-artificial-bb-2:                                  ; preds = %artificial-bb-1
-; CHECK-LABEL: bb.2.artificial-bb-2:
-; CHECK: DBG_VALUE $sp, 0, !9, !DIExpression(DW_OP_plus_uconst, 12)
-
-  %invisible = add i32 %var, 1
-  br label %return, !dbg !14
-
-return:                                           ; preds = %artificial-bb-2
-; CHECK-LABEL: bb.3.return:
-; CHECK: DBG_VALUE $sp, 0, !9, !DIExpression(DW_OP_plus_uconst, 12)
-
-  call void @use(i32 %var)
-  ret void, !dbg !15
-}
-
-; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #0
-
-attributes #0 = { nounwind readnone speculatable }
-
-!llvm.dbg.cu = !{!0}
-!llvm.debugify = !{!3, !4}
-!llvm.module.flags = !{!5}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
-!1 = !DIFile(filename: "compiler-gen-bbs-livedebugvalues.ll", directory: "/")
-!2 = !{}
-!3 = !{i32 6}
-!4 = !{i32 2}
-!5 = !{i32 2, !"Debug Info Version", i32 3}
-!6 = distinct !DISubprogram(name: "f1", linkageName: "f1", scope: null, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, isOptimized: true, unit: !0, retainedNodes: !8)
-!7 = !DISubroutineType(types: !2)
-!8 = !{!9, !11}
-!9 = !DILocalVariable(name: "1", scope: !6, file: !1, line: 1, type: !10)
-!10 = !DIBasicType(name: "ty32", size: 32, encoding: DW_ATE_unsigned)
-!11 = !DILocalVariable(name: "2", scope: !6, file: !1, line: 4, type: !10)
-!12 = !DILocation(line: 1, column: 1, scope: !6)
-!13 = !DILocation(line: 2, column: 1, scope: !6)
-!14 = !DILocation(line: 0, column: 1, scope: !6)
-!15 = !DILocation(line: 4, column: 1, scope: !6)
diff --git a/test/DebugInfo/AArch64/compiler-gen-bbs-livedebugvalues.mir b/test/DebugInfo/AArch64/compiler-gen-bbs-livedebugvalues.mir
new file mode 100644
index 00000000000..79f2ac77c11
--- /dev/null
+++ b/test/DebugInfo/AArch64/compiler-gen-bbs-livedebugvalues.mir
@@ -0,0 +1,110 @@
+# RUN: llc -o - %s -O0 -regalloc=fast -run-pass=livedebugvalues | \
+# RUN:   FileCheck %s -implicit-check-not=DBG_VALUE
+--- |
+  target triple = "arm64-apple-ios12.1.0"
+  
+  declare void @use(i32)
+  
+  define void @f1(i32 %x) !dbg !6 {
+  entry:
+    %var = add i32 %x, 1, !dbg !12
+    call void @llvm.dbg.value(metadata i32 %var, metadata !9, metadata !DIExpression()), !dbg !12
+    br label %artificial-bb-1, !dbg !13
+  
+  artificial-bb-1:
+    br label %artificial-bb-2
+  
+  artificial-bb-2:
+    %invisible = add i32 %var, 1
+    br label %return, !dbg !14
+  
+  return:
+    call void @use(i32 %var)
+    ret void, !dbg !15
+  }
+  
+  declare void @llvm.dbg.value(metadata, metadata, metadata) #0
+  declare void @llvm.stackprotector(i8*, i8**) #1
+  
+  attributes #0 = { nounwind readnone speculatable }
+  attributes #1 = { nounwind }
+  
+  !llvm.dbg.cu = !{!0}
+  !llvm.debugify = !{!3, !4}
+  !llvm.module.flags = !{!5}
+  
+  !0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+  !1 = !DIFile(filename: "compiler-gen-bbs-livedebugvalues.ll", directory: "/")
+  !2 = !{}
+  !3 = !{i32 6}
+  !4 = !{i32 2}
+  !5 = !{i32 2, !"Debug Info Version", i32 3}
+  !6 = distinct !DISubprogram(name: "f1", linkageName: "f1", scope: null, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, isOptimized: true, unit: !0, retainedNodes: !8)
+  !7 = !DISubroutineType(types: !2)
+  !8 = !{!9, !11}
+  !9 = !DILocalVariable(name: "1", scope: !6, file: !1, line: 1, type: !10)
+  !10 = !DIBasicType(name: "ty32", size: 32, encoding: DW_ATE_unsigned)
+  !11 = !DILocalVariable(name: "2", scope: !6, file: !1, line: 4, type: !10)
+  !12 = !DILocation(line: 1, column: 1, scope: !6)
+  !13 = !DILocation(line: 2, column: 1, scope: !6)
+  !14 = !DILocation(line: 0, column: 1, scope: !6)
+  !15 = !DILocation(line: 4, column: 1, scope: !6)
+
+...
+---
+# CHECK-LABEL: name: f1
+name:            f1
+alignment:       2
+legalized:       true
+regBankSelected: true
+selected:        true
+tracksRegLiveness: true
+frameInfo:       
+  stackSize:       32
+  maxAlignment:    8
+  adjustsStack:    true
+  hasCalls:        true
+  maxCallFrameSize: 0
+stack:           
+  - { id: 0, type: spill-slot, offset: -20, size: 4, alignment: 4, stack-id: 0 }
+  - { id: 1, type: spill-slot, offset: -8, size: 8, alignment: 8, stack-id: 0, 
+      callee-saved-register: '$lr' }
+  - { id: 2, type: spill-slot, offset: -16, size: 8, alignment: 8, stack-id: 0, 
+      callee-saved-register: '$fp' }
+body:             |
+  ; CHECK-LABEL: bb.0.entry:
+  bb.0.entry:
+    liveins: $w0, $lr
+  
+    $sp = frame-setup SUBXri $sp, 32, 0
+    frame-setup STPXi killed $fp, killed $lr, $sp, 2 :: (store 8 into %stack.2), (store 8 into %stack.1)
+    frame-setup CFI_INSTRUCTION def_cfa_offset 32
+    frame-setup CFI_INSTRUCTION offset $w30, -8, debug-location !12
+    frame-setup CFI_INSTRUCTION offset $w29, -16, debug-location !12
+    renamable $w0 = ADDWri killed renamable $w0, 1, 0, debug-location !12
+    DBG_VALUE renamable $w0, $noreg, !9, !DIExpression(), debug-location !12
+    STRWui killed $w0, $sp, 3 :: (store 4 into %stack.0)
+    DBG_VALUE $sp, 0, !9, !DIExpression(DW_OP_plus_uconst, 12), debug-location !12
+
+    ; CHECK: DBG_VALUE renamable $w0, $noreg, !9, !DIExpression(), debug-location !12
+    ; CHECK-NEXT: STRWui killed $w0, $sp, 3 :: (store 4 into %stack.0)
+    ; CHECK-NEXT: DBG_VALUE $sp, 0, !9, !DIExpression(DW_OP_plus_uconst, 12)
+  
+  bb.1.artificial-bb-1:
+    ; CHECK-LABEL: bb.1.artificial-bb-1:
+    ; CHECK: DBG_VALUE $sp, 0, !9, !DIExpression(DW_OP_plus_uconst, 12)
+      
+  bb.2.artificial-bb-2:
+    ; CHECK-LABEL: bb.2.artificial-bb-2:
+    ; CHECK: DBG_VALUE $sp, 0, !9, !DIExpression(DW_OP_plus_uconst, 12)
+  
+  bb.3.return:
+    ; CHECK-LABEL: bb.3.return:
+    ; CHECK: DBG_VALUE $sp, 0, !9, !DIExpression(DW_OP_plus_uconst, 12)
+
+    $w0 = LDRWui $sp, 3 :: (load 4 from %stack.0)
+    BL @use, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit killed $w0
+    $fp, $lr = frame-destroy LDPXi $sp, 2, debug-location !15 :: (load 8 from %stack.2), (load 8 from %stack.1)
+    $sp = frame-destroy ADDXri $sp, 32, 0, debug-location !15
+    RET undef $lr, debug-location !15
+...
diff --git a/test/DebugInfo/ARM/cfi-eof-prologue.ll b/test/DebugInfo/ARM/cfi-eof-prologue.ll
deleted file mode 100644
index f7ee9a23bee..00000000000
--- a/test/DebugInfo/ARM/cfi-eof-prologue.ll
+++ /dev/null
@@ -1,114 +0,0 @@
-; struct A {
-;   A();
-;   virtual ~A();
-; };
-; struct B : A {
-;   B();
-;   virtual ~B();
-; };
-; B::B() {}
-; CHECK: __ZN1BC1Ev:
-; CHECK:     .loc	1 [[@LINE-2]] 0 prologue_end
-; CHECK-NOT: .loc	1 0 0 prologue_end
-
-; The location of the prologue_end marker should not be affected by the presence
-; of CFI instructions.
-
-; RUN: llc -O0 -filetype=asm -mtriple=thumbv7-apple-ios < %s | FileCheck %s
-; RUN: llc -O0 -filetype=asm -mtriple=thumbv6-apple-ios < %s | FileCheck %s
-
-; ModuleID = 'test1.cpp'
-target datalayout = "e-m:o-p:32:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32"
-target triple = "thumbv7-apple-ios"
-
-%struct.B = type { %struct.A }
-%struct.A = type { i32 (...)** }
-
-@_ZTV1B = external unnamed_addr constant [4 x i8*]
-
-; Function Attrs: nounwind
-define %struct.B* @_ZN1BC2Ev(%struct.B* %this) unnamed_addr #0 align 2 !dbg !28 {
-entry:
-  tail call void @llvm.dbg.value(metadata %struct.B* %this, metadata !30, metadata !40), !dbg !41
-  %0 = getelementptr inbounds %struct.B, %struct.B* %this, i32 0, i32 0, !dbg !42
-  %call = tail call %struct.A* @_ZN1AC2Ev(%struct.A* %0) #3, !dbg !42
-  %1 = getelementptr inbounds %struct.B, %struct.B* %this, i32 0, i32 0, i32 0, !dbg !42
-  store i32 (...)** bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTV1B, i32 0, i32 2) to i32 (...)**), i32 (...)*** %1, align 4, !dbg !42, !tbaa !43
-  ret %struct.B* %this, !dbg !42
-}
-
-declare %struct.A* @_ZN1AC2Ev(%struct.A*)
-
-; Function Attrs: nounwind
-define %struct.B* @_ZN1BC1Ev(%struct.B* %this) unnamed_addr #0 align 2 !dbg !32 {
-entry:
-  tail call void @llvm.dbg.value(metadata %struct.B* %this, metadata !34, metadata !40), !dbg !46
-  tail call void @llvm.dbg.value(metadata %struct.B* %this, metadata !47, metadata !40) #3, !dbg !49
-  %0 = getelementptr inbounds %struct.B, %struct.B* %this, i32 0, i32 0, !dbg !50
-  %call.i = tail call %struct.A* @_ZN1AC2Ev(%struct.A* %0) #3, !dbg !50
-  %1 = getelementptr inbounds %struct.B, %struct.B* %this, i32 0, i32 0, i32 0, !dbg !50
-  store i32 (...)** bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTV1B, i32 0, i32 2) to i32 (...)**), i32 (...)*** %1, align 4, !dbg !50, !tbaa !43
-  ret %struct.B* %this, !dbg !48
-}
-
-; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind }
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!35, !36, !37, !38}
-!llvm.ident = !{!39}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.6.0 (trunk 224279) (llvm/trunk 224283)", isOptimized: true, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !3, globals: !2, imports: !2)
-!1 = !DIFile(filename: "<stdin>", directory: "")
-!2 = !{}
-!3 = !{!4, !13}
-!4 = !DICompositeType(tag: DW_TAG_structure_type, name: "B", line: 5, size: 32, align: 32, file: !5, elements: !6, vtableHolder: !13, identifier: "_ZTS1B")
-!5 = !DIFile(filename: "test1.cpp", directory: "")
-!6 = !{!7, !8, !12}
-!7 = !DIDerivedType(tag: DW_TAG_inheritance, scope: !4, baseType: !13)
-!8 = !DISubprogram(name: "B", line: 6, isLocal: false, isDefinition: false, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 6, file: !5, scope: !4, type: !9)
-!9 = !DISubroutineType(types: !10)
-!10 = !{null, !11}
-!11 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 32, align: 32, flags: DIFlagArtificial | DIFlagObjectPointer, baseType: !4)
-!12 = !DISubprogram(name: "~B", line: 7, isLocal: false, isDefinition: false, virtuality: DW_VIRTUALITY_virtual, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 7, file: !5, scope: !4, type: !9, containingType: !4)
-!13 = !DICompositeType(tag: DW_TAG_structure_type, name: "A", line: 1, size: 32, align: 32, file: !5, elements: !14, vtableHolder: !13, identifier: "_ZTS1A")
-!14 = !{!15, !22, !26}
-!15 = !DIDerivedType(tag: DW_TAG_member, name: "_vptr$A", size: 32, flags: DIFlagArtificial, file: !5, scope: !16, baseType: !17)
-!16 = !DIFile(filename: "test1.cpp", directory: "")
-!17 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 32, baseType: !18)
-!18 = !DIDerivedType(tag: DW_TAG_pointer_type, name: "__vtbl_ptr_type", size: 32, baseType: !19)
-!19 = !DISubroutineType(types: !20)
-!20 = !{!21}
-!21 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
-!22 = !DISubprogram(name: "A", line: 2, isLocal: false, isDefinition: false, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 2, file: !5, scope: !13, type: !23)
-!23 = !DISubroutineType(types: !24)
-!24 = !{null, !25}
-!25 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 32, align: 32, flags: DIFlagArtificial | DIFlagObjectPointer, baseType: !13)
-!26 = !DISubprogram(name: "~A", line: 3, isLocal: false, isDefinition: false, virtuality: DW_VIRTUALITY_virtual, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 3, file: !5, scope: !13, type: !23, containingType: !13)
-!28 = distinct !DISubprogram(name: "B", linkageName: "_ZN1BC2Ev", line: 9, isLocal: false, isDefinition: true, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 9, file: !5, scope: !4, type: !9, declaration: !8, retainedNodes: !29)
-!29 = !{!30}
-!30 = !DILocalVariable(name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !28, type: !31)
-!31 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 32, align: 32, baseType: !4)
-!32 = distinct !DISubprogram(name: "B", linkageName: "_ZN1BC1Ev", line: 9, isLocal: false, isDefinition: true, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 9, file: !5, scope: !4, type: !9, declaration: !8, retainedNodes: !33)
-!33 = !{!34}
-!34 = !DILocalVariable(name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !32, type: !31)
-!35 = !{i32 2, !"Dwarf Version", i32 4}
-!36 = !{i32 2, !"Debug Info Version", i32 3}
-!37 = !{i32 1, !"wchar_size", i32 4}
-!38 = !{i32 1, !"min_enum_size", i32 4}
-!39 = !{!"clang version 3.6.0 (trunk 224279) (llvm/trunk 224283)"}
-!40 = !DIExpression()
-!41 = !DILocation(line: 0, scope: !28)
-!42 = !DILocation(line: 9, scope: !28)
-!43 = !{!44, !44, i64 0}
-!44 = !{!"vtable pointer", !45, i64 0}
-!45 = !{!"Simple C/C++ TBAA"}
-!46 = !DILocation(line: 0, scope: !32)
-!47 = !DILocalVariable(name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !28, type: !31)
-!48 = !DILocation(line: 9, scope: !32)
-!49 = !DILocation(line: 0, scope: !28, inlinedAt: !48)
-!50 = !DILocation(line: 9, scope: !28, inlinedAt: !48)
diff --git a/test/DebugInfo/ARM/cfi-eof-prologue.mir b/test/DebugInfo/ARM/cfi-eof-prologue.mir
new file mode 100644
index 00000000000..d0808887770
--- /dev/null
+++ b/test/DebugInfo/ARM/cfi-eof-prologue.mir
@@ -0,0 +1,212 @@
+# RUN: llc -o - %s -mtriple=thumbv7-apple-ios -start-after=patchable-function | FileCheck %s
+# RUN: llc -o - %s -mtriple=thumbv6-apple-ios -start-after=patchable-function | FileCheck %s
+
+# struct A {
+#   A();
+#   virtual ~A();
+# };
+# struct B : A {
+#   B();
+#   virtual ~B();
+# };
+# B::B() {}
+# CHECK: __ZN1BC1Ev:
+# CHECK:     .loc       1 9 0 prologue_end
+# CHECK-NOT: .loc       1 0 0 prologue_end
+#
+# The location of the prologue_end marker should not be affected by the presence
+# of CFI instructions.
+
+--- |
+  %struct.B = type { %struct.A }
+  %struct.A = type { i32 (...)** }
+  
+  @_ZTV1B = external unnamed_addr constant [4 x i8*]
+  
+  ; Function Attrs: nounwind
+  define %struct.B* @_ZN1BC2Ev(%struct.B* %this) unnamed_addr #0 align 2 !dbg !31 {
+  entry:
+    tail call void @llvm.dbg.value(metadata %struct.B* %this, metadata !33, metadata !DIExpression()), !dbg !35
+    %0 = bitcast %struct.B* %this to %struct.A*, !dbg !36
+    %call = tail call %struct.A* @_ZN1AC2Ev(%struct.A* %0) #0, !dbg !36
+    %1 = bitcast %struct.B* %this to i32 (...)***, !dbg !36
+    store i32 (...)** bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTV1B, i32 0, i32 2) to i32 (...)**), i32 (...)*** %1, align 4, !dbg !36, !tbaa !37
+    ret %struct.B* %this, !dbg !36
+  }
+  
+  declare %struct.A* @_ZN1AC2Ev(%struct.A*)
+  
+  ; Function Attrs: nounwind
+  define %struct.B* @_ZN1BC1Ev(%struct.B* %this) unnamed_addr #0 align 2 !dbg !40 {
+  entry:
+    tail call void @llvm.dbg.value(metadata %struct.B* %this, metadata !42, metadata !DIExpression()), !dbg !43
+    tail call void @llvm.dbg.value(metadata %struct.B* %this, metadata !33, metadata !DIExpression()) #0, !dbg !44
+    %0 = bitcast %struct.B* %this to %struct.A*, !dbg !46
+    %call.i = tail call %struct.A* @_ZN1AC2Ev(%struct.A* %0) #0, !dbg !46
+    %1 = bitcast %struct.B* %this to i32 (...)***, !dbg !46
+    store i32 (...)** bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTV1B, i32 0, i32 2) to i32 (...)**), i32 (...)*** %1, align 4, !dbg !46, !tbaa !37
+    ret %struct.B* %this, !dbg !45
+  }
+  
+  ; Function Attrs: nounwind readnone speculatable
+  declare void @llvm.dbg.value(metadata, metadata, metadata) #1
+  
+  declare void @_Unwind_SjLj_Register({ i8*, i32, [4 x i32], i8*, i8*, [5 x i8*] }*)
+  
+  declare void @_Unwind_SjLj_Unregister({ i8*, i32, [4 x i32], i8*, i8*, [5 x i8*] }*)
+  
+  ; Function Attrs: nounwind readnone
+  declare i8* @llvm.frameaddress(i32) #2
+  
+  ; Function Attrs: nounwind
+  declare i8* @llvm.stacksave() #0
+  
+  ; Function Attrs: nounwind
+  declare void @llvm.stackrestore(i8*) #0
+  
+  ; Function Attrs: nounwind
+  declare void @llvm.eh.sjlj.setup.dispatch() #0
+  
+  ; Function Attrs: nounwind readnone
+  declare i8* @llvm.eh.sjlj.lsda() #2
+  
+  ; Function Attrs: nounwind readnone
+  declare void @llvm.eh.sjlj.callsite(i32) #2
+  
+  ; Function Attrs: nounwind
+  declare void @llvm.eh.sjlj.functioncontext(i8*) #0
+  
+  ; Function Attrs: nounwind
+  declare void @llvm.stackprotector(i8*, i8**) #0
+  
+  attributes #0 = { nounwind }
+  attributes #1 = { nounwind readnone speculatable }
+  attributes #2 = { nounwind readnone }
+  
+  !llvm.dbg.cu = !{!0}
+  !llvm.module.flags = !{!26, !27, !28, !29}
+  !llvm.ident = !{!30}
+  
+  !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 3.6.0 (trunk 224279) (llvm/trunk 224283)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3, globals: !2, imports: !2)
+  !1 = !DIFile(filename: "<stdin>", directory: "")
+  !2 = !{}
+  !3 = !{!4, !8}
+  !4 = !DICompositeType(tag: DW_TAG_structure_type, name: "B", file: !5, line: 5, size: 32, align: 32, elements: !6, vtableHolder: !8, identifier: "_ZTS1B")
+  !5 = !DIFile(filename: "test1.cpp", directory: "")
+  !6 = !{!7, !21, !25}
+  !7 = !DIDerivedType(tag: DW_TAG_inheritance, scope: !4, baseType: !8)
+  !8 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "A", file: !5, line: 1, size: 32, align: 32, elements: !9, vtableHolder: !8, identifier: "_ZTS1A")
+  !9 = !{!10, !16, !20}
+  !10 = !DIDerivedType(tag: DW_TAG_member, name: "_vptr$A", scope: !5, file: !5, baseType: !11, size: 32, flags: DIFlagArtificial)
+  !11 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !12, size: 32)
+  !12 = !DIDerivedType(tag: DW_TAG_pointer_type, name: "__vtbl_ptr_type", baseType: !13, size: 32)
+  !13 = !DISubroutineType(types: !14)
+  !14 = !{!15}
+  !15 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+  !16 = !DISubprogram(name: "A", scope: !8, file: !5, line: 2, type: !17, isLocal: false, isDefinition: false, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true)
+  !17 = !DISubroutineType(types: !18)
+  !18 = !{null, !19}
+  !19 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !8, size: 32, align: 32, flags: DIFlagArtificial | DIFlagObjectPointer)
+  !20 = !DISubprogram(name: "~A", scope: !8, file: !5, line: 3, type: !17, isLocal: false, isDefinition: false, scopeLine: 3, containingType: !8, virtuality: DW_VIRTUALITY_virtual, virtualIndex: 0, flags: DIFlagPrototyped, isOptimized: true)
+  !21 = !DISubprogram(name: "B", scope: !4, file: !5, line: 6, type: !22, isLocal: false, isDefinition: false, scopeLine: 6, flags: DIFlagPrototyped, isOptimized: true)
+  !22 = !DISubroutineType(types: !23)
+  !23 = !{null, !24}
+  !24 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !4, size: 32, align: 32, flags: DIFlagArtificial | DIFlagObjectPointer)
+  !25 = !DISubprogram(name: "~B", scope: !4, file: !5, line: 7, type: !22, isLocal: false, isDefinition: false, scopeLine: 7, containingType: !4, virtuality: DW_VIRTUALITY_virtual, virtualIndex: 0, flags: DIFlagPrototyped, isOptimized: true)
+  !26 = !{i32 2, !"Dwarf Version", i32 4}
+  !27 = !{i32 2, !"Debug Info Version", i32 3}
+  !28 = !{i32 1, !"wchar_size", i32 4}
+  !29 = !{i32 1, !"min_enum_size", i32 4}
+  !30 = !{!"clang version 3.6.0 (trunk 224279) (llvm/trunk 224283)"}
+  !31 = distinct !DISubprogram(name: "B", linkageName: "_ZN1BC2Ev", scope: !4, file: !5, line: 9, type: !22, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: true, unit: !0, declaration: !21, retainedNodes: !32)
+  !32 = !{!33}
+  !33 = !DILocalVariable(name: "this", arg: 1, scope: !31, type: !34, flags: DIFlagArtificial | DIFlagObjectPointer)
+  !34 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !4, size: 32, align: 32)
+  !35 = !DILocation(line: 0, scope: !31)
+  !36 = !DILocation(line: 9, scope: !31)
+  !37 = !{!38, !38, i64 0}
+  !38 = !{!"vtable pointer", !39, i64 0}
+  !39 = !{!"Simple C/C++ TBAA"}
+  !40 = distinct !DISubprogram(name: "B", linkageName: "_ZN1BC1Ev", scope: !4, file: !5, line: 9, type: !22, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: true, unit: !0, declaration: !21, retainedNodes: !41)
+  !41 = !{!42}
+  !42 = !DILocalVariable(name: "this", arg: 1, scope: !40, type: !34, flags: DIFlagArtificial | DIFlagObjectPointer)
+  !43 = !DILocation(line: 0, scope: !40)
+  !44 = !DILocation(line: 0, scope: !31, inlinedAt: !45)
+  !45 = !DILocation(line: 9, scope: !40)
+  !46 = !DILocation(line: 9, scope: !31, inlinedAt: !45)
+
+...
+---
+name:            _ZN1BC2Ev
+alignment:       1
+liveins:         
+  - { reg: '$r0' }
+frameInfo:       
+  stackSize:       8
+  maxAlignment:    4
+  adjustsStack:    true
+  hasCalls:        true
+  maxCallFrameSize: 0
+stack:           
+  - { id: 0, type: spill-slot, offset: -4, size: 4, alignment: 4, stack-id: 0, 
+      callee-saved-register: '$lr', callee-saved-restored: false }
+  - { id: 1, type: spill-slot, offset: -8, size: 4, alignment: 4, stack-id: 0, 
+      callee-saved-register: '$r4' }
+body:             |
+  bb.0.entry:
+    frame-setup tPUSH 14, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp
+    frame-setup CFI_INSTRUCTION def_cfa_offset 8
+    frame-setup CFI_INSTRUCTION offset $lr, -4
+    frame-setup CFI_INSTRUCTION offset $r4, -8
+    DBG_VALUE debug-use $r0, debug-use $noreg, !33, !DIExpression(), debug-location !35
+    $r4 = tMOVr $r0, 14, $noreg
+    DBG_VALUE debug-use $r4, debug-use $noreg, !33, !DIExpression(), debug-location !35
+    tBL 14, $noreg, @_ZN1AC2Ev, csr_ios, implicit-def dead $lr, implicit $sp, implicit killed $r0, implicit-def $sp, implicit-def dead $r0, debug-location !36
+    $r0 = t2MOVi16_ga_pcrel target-flags(arm-lo16, arm-nonlazy) @_ZTV1B, 0, debug-location !36
+    $r0 = t2MOVTi16_ga_pcrel killed $r0, target-flags(arm-hi16, arm-nonlazy) @_ZTV1B, 0, debug-location !36
+    $r0 = tPICADD killed $r0, 0, debug-location !36
+    renamable $r0 = tLDRi killed renamable $r0, 0, 14, $noreg, debug-location !36 :: (load 4 from got)
+    renamable $r0, dead $cpsr = nuw tADDi8 killed renamable $r0, 8, 14, $noreg, debug-location !36
+    tSTRi killed renamable $r0, renamable $r4, 0, 14, $noreg, debug-location !36 :: (store 4 into %ir.1, !tbaa !37)
+    $r0 = tMOVr killed $r4, 14, $noreg, debug-location !36
+    tPOP_RET 14, $noreg, def $r4, def $pc, implicit killed $r0, debug-location !36
+
+...
+---
+name:            _ZN1BC1Ev
+alignment:       1
+liveins:         
+  - { reg: '$r0' }
+frameInfo:       
+  stackSize:       8
+  maxAlignment:    4
+  adjustsStack:    true
+  hasCalls:        true
+  maxCallFrameSize: 0
+stack:           
+  - { id: 0, type: spill-slot, offset: -4, size: 4, alignment: 4, stack-id: 0, 
+      callee-saved-register: '$lr', callee-saved-restored: false }
+  - { id: 1, type: spill-slot, offset: -8, size: 4, alignment: 4, stack-id: 0, 
+      callee-saved-register: '$r4' }
+body:             |
+  bb.0.entry:
+    frame-setup tPUSH 14, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp
+    frame-setup CFI_INSTRUCTION def_cfa_offset 8
+    frame-setup CFI_INSTRUCTION offset $lr, -4
+    frame-setup CFI_INSTRUCTION offset $r4, -8
+    DBG_VALUE debug-use $r0, debug-use $noreg, !42, !DIExpression(), debug-location !43
+    DBG_VALUE debug-use $r0, debug-use $noreg, !33, !DIExpression(), debug-location !44
+    $r4 = tMOVr $r0, 14, $noreg
+    DBG_VALUE debug-use $r4, debug-use $noreg, !33, !DIExpression(), debug-location !44
+    DBG_VALUE debug-use $r4, debug-use $noreg, !42, !DIExpression(), debug-location !43
+    tBL 14, $noreg, @_ZN1AC2Ev, csr_ios, implicit-def dead $lr, implicit $sp, implicit killed $r0, implicit-def $sp, implicit-def dead $r0, debug-location !46
+    $r0 = t2MOVi16_ga_pcrel target-flags(arm-lo16, arm-nonlazy) @_ZTV1B, 0, debug-location !46
+    $r0 = t2MOVTi16_ga_pcrel killed $r0, target-flags(arm-hi16, arm-nonlazy) @_ZTV1B, 0, debug-location !46
+    $r0 = tPICADD killed $r0, 0, debug-location !46
+    renamable $r0 = tLDRi killed renamable $r0, 0, 14, $noreg, debug-location !46 :: (load 4 from got)
+    renamable $r0, dead $cpsr = nuw tADDi8 killed renamable $r0, 8, 14, $noreg, debug-location !46
+    tSTRi killed renamable $r0, renamable $r4, 0, 14, $noreg, debug-location !46 :: (store 4 into %ir.1, !tbaa !37)
+    $r0 = tMOVr killed $r4, 14, $noreg, debug-location !45
+    tPOP_RET 14, $noreg, def $r4, def $pc, implicit killed $r0, debug-location !45
+
+...
diff --git a/test/DebugInfo/X86/debug-loc-asan.ll b/test/DebugInfo/X86/debug-loc-asan.ll
deleted file mode 100644
index 3e54035b7d7..00000000000
--- a/test/DebugInfo/X86/debug-loc-asan.ll
+++ /dev/null
@@ -1,190 +0,0 @@
-; RUN: llc -fast-isel-sink-local-values -O0 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
-; RUN: llc -fast-isel-sink-local-values  -O0 -mtriple=x86_64-unknown-linux-gnu -filetype=obj < %s \
-; RUN:   | llvm-dwarfdump -debug-info - | FileCheck %s --check-prefix=DWARF
-
-; Verify that we have correct debug info for local variables in code
-; instrumented with AddressSanitizer.
-
-; Generated from the source file test.cc:
-; int bar(int y) {
-;   return y + 2;
-; }
-; with "clang++ -S -emit-llvm -mllvm -asan-skip-promotable-allocas=0 -fsanitize=address -O0 -g test.cc"
-
-; The address of the (potentially now malloc'ed) alloca ends up
-; in rdi, after which it is spilled to the stack. We record the
-; spill OFFSET on the stack for checking the debug info below.
-; CHECK: #DEBUG_VALUE: bar:y <- [DW_OP_deref] [$rcx+0]
-; CHECK: movq %rcx, [[OFFSET:[0-9]+]](%rsp)
-; CHECK-NEXT: [[START_LABEL:.Ltmp[0-9]+]]
-; CHECK-NEXT: #DEBUG_VALUE: bar:y <- [DW_OP_plus_uconst [[OFFSET]], DW_OP_deref, DW_OP_deref]
-; This location should be valid until the end of the function.
-
-; CHECK:        movq    %rbp, %rsp
-; CHECK-NEXT: [[END_LABEL:.Ltmp[0-9]+]]:
-
-; CHECK: .Ldebug_loc{{[0-9]+}}:
-; We expect two location ranges for the variable.
-
-; First, its address is stored in %rcx:
-; CHECK:      .quad .Lfunc_begin0-.Lfunc_begin0
-; CHECK-NEXT: .quad [[START_LABEL]]-.Lfunc_begin0
-; CHECK: DW_OP_breg2
-; DWARF:       DW_TAG_formal_parameter
-; DWARF:         DW_AT_location
-; DWARF-NEXT:      [{{.*}}, {{.*}}): DW_OP_breg2 RCX+0, DW_OP_deref
-
-; Then it's addressed via %rsp:
-; CHECK:      .quad [[START_LABEL]]-.Lfunc_begin0
-; CHECK-NEXT: .quad [[END_LABEL]]-.Lfunc_begin0
-; CHECK: DW_OP_breg7
-; CHECK-NEXT: [[OFFSET]]
-; CHECK: DW_OP_deref
-; DWARF-NEXT:      [{{.*}}, {{.*}}): DW_OP_breg7 RSP+{{[0-9]+}}, DW_OP_deref, DW_OP_deref)
-
-; ModuleID = 'test.cc'
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-@llvm.global_ctors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 1, void ()* @asan.module_ctor }]
-@__asan_option_detect_stack_use_after_return = external global i32
-@___asan_gen_ = private unnamed_addr constant [16 x i8] c"1 32 4 6 y.addr\00", align 1
-
-; Function Attrs: nounwind sanitize_address uwtable
-define i32 @_Z3bari(i32 %y) #0 !dbg !4 {
-entry:
-  %MyAlloca = alloca [64 x i8], align 32
-  %0 = ptrtoint [64 x i8]* %MyAlloca to i64
-  %1 = load i32, i32* @__asan_option_detect_stack_use_after_return
-  %2 = icmp ne i32 %1, 0
-  br i1 %2, label %3, label %5
-
-; <label>:3                                       ; preds = %entry
-  %4 = call i64 @__asan_stack_malloc_0(i64 64, i64 %0)
-  br label %5
-
-; <label>:5                                       ; preds = %entry, %3
-  %6 = phi i64 [ %0, %entry ], [ %4, %3 ]
-  %7 = add i64 %6, 32
-  %8 = inttoptr i64 %7 to i32*
-  %9 = inttoptr i64 %6 to i64*
-  store i64 1102416563, i64* %9
-  %10 = add i64 %6, 8
-  %11 = inttoptr i64 %10 to i64*
-  store i64 ptrtoint ([16 x i8]* @___asan_gen_ to i64), i64* %11
-  %12 = add i64 %6, 16
-  %13 = inttoptr i64 %12 to i64*
-  store i64 ptrtoint (i32 (i32)* @_Z3bari to i64), i64* %13
-  %14 = lshr i64 %6, 3
-  %15 = add i64 %14, 2147450880
-  %16 = add i64 %15, 0
-  %17 = inttoptr i64 %16 to i64*
-  store i64 -868083100587789839, i64* %17
-  %18 = ptrtoint i32* %8 to i64
-  %19 = lshr i64 %18, 3
-  %20 = add i64 %19, 2147450880
-  %21 = inttoptr i64 %20 to i8*
-  %22 = load i8, i8* %21
-  %23 = icmp ne i8 %22, 0
-  call void @llvm.dbg.declare(metadata i32* %8, metadata !12, metadata !14), !dbg !DILocation(scope: !4)
-  br i1 %23, label %24, label %30
-
-; <label>:24                                      ; preds = %5
-  %25 = and i64 %18, 7
-  %26 = add i64 %25, 3
-  %27 = trunc i64 %26 to i8
-  %28 = icmp sge i8 %27, %22
-  br i1 %28, label %29, label %30
-
-; <label>:29                                      ; preds = %24
-  call void @__asan_report_store4(i64 %18)
-  call void asm sideeffect "", ""()
-  unreachable
-
-; <label>:30                                      ; preds = %24, %5
-  store i32 %y, i32* %8, align 4
-  %31 = ptrtoint i32* %8 to i64, !dbg !13
-  %32 = lshr i64 %31, 3, !dbg !13
-  %33 = add i64 %32, 2147450880, !dbg !13
-  %34 = inttoptr i64 %33 to i8*, !dbg !13
-  %35 = load i8, i8* %34, !dbg !13
-  %36 = icmp ne i8 %35, 0, !dbg !13
-  br i1 %36, label %37, label %43, !dbg !13
-
-; <label>:37                                      ; preds = %30
-  %38 = and i64 %31, 7, !dbg !13
-  %39 = add i64 %38, 3, !dbg !13
-  %40 = trunc i64 %39 to i8, !dbg !13
-  %41 = icmp sge i8 %40, %35, !dbg !13
-  br i1 %41, label %42, label %43
-
-; <label>:42                                      ; preds = %37
-  call void @__asan_report_load4(i64 %31), !dbg !13
-  call void asm sideeffect "", ""()
-  unreachable
-
-; <label>:43                                      ; preds = %37, %30
-  %44 = load i32, i32* %8, align 4, !dbg !13
-  %add = add nsw i32 %44, 2, !dbg !13
-  store i64 1172321806, i64* %9, !dbg !13
-  %45 = icmp ne i64 %6, %0, !dbg !13
-  br i1 %45, label %46, label %53, !dbg !13
-
-; <label>:46                                      ; preds = %43
-  %47 = add i64 %15, 0, !dbg !13
-  %48 = inttoptr i64 %47 to i64*, !dbg !13
-  store i64 -723401728380766731, i64* %48, !dbg !13
-  %49 = add i64 %6, 56, !dbg !13
-  %50 = inttoptr i64 %49 to i64*, !dbg !13
-  %51 = load i64, i64* %50, !dbg !13
-  %52 = inttoptr i64 %51 to i8*, !dbg !13
-  store i8 0, i8* %52, !dbg !13
-  br label %56, !dbg !13
-
-; <label>:53                                      ; preds = %43
-  %54 = add i64 %15, 0, !dbg !13
-  %55 = inttoptr i64 %54 to i64*, !dbg !13
-  store i64 0, i64* %55, !dbg !13
-  br label %56, !dbg !13
-
-; <label>:56                                      ; preds = %53, %46
-  ret i32 %add, !dbg !13
-}
-
-; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
-
-define internal void @asan.module_ctor() {
-  call void @__asan_init_v3()
-  ret void
-}
-
-declare void @__asan_init_v3()
-
-declare void @__asan_report_load4(i64)
-
-declare void @__asan_report_store4(i64)
-
-declare i64 @__asan_stack_malloc_0(i64, i64)
-
-attributes #0 = { nounwind sanitize_address uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!9, !10}
-!llvm.ident = !{!11}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0 (209308)", isOptimized: false, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
-!1 = !DIFile(filename: "test.cc", directory: "/llvm_cmake_gcc")
-!2 = !{}
-!4 = distinct !DISubprogram(name: "bar", linkageName: "_Z3bari", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 1, file: !1, scope: !5, type: !6, retainedNodes: !2)
-!5 = !DIFile(filename: "test.cc", directory: "/llvm_cmake_gcc")
-!6 = !DISubroutineType(types: !7)
-!7 = !{!8, !8}
-!8 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
-!9 = !{i32 2, !"Dwarf Version", i32 4}
-!10 = !{i32 2, !"Debug Info Version", i32 3}
-!11 = !{!"clang version 3.5.0 (209308)"}
-!12 = !DILocalVariable(name: "y", line: 1, arg: 1, scope: !4, file: !5, type: !8)
-!13 = !DILocation(line: 2, scope: !4)
-!14 = !DIExpression(DW_OP_deref)
diff --git a/test/DebugInfo/X86/debug-loc-asan.mir b/test/DebugInfo/X86/debug-loc-asan.mir
new file mode 100644
index 00000000000..e4a6057deef
--- /dev/null
+++ b/test/DebugInfo/X86/debug-loc-asan.mir
@@ -0,0 +1,346 @@
+# RUN: llc -o - %s -start-after=patchable-function -O0 -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+# RUN: llc -o - %s -start-after=patchable-function  -O0 -mtriple=x86_64-unknown-linux-gnu -filetype=obj \
+# RUN:   | llvm-dwarfdump -debug-info - | FileCheck %s --check-prefix=DWARF
+#
+# Verify that we have correct debug info for local variables in code
+# instrumented with AddressSanitizer.
+#
+# Generated from the source file test.cc:
+# int bar(int y) {
+#   return y + 2;
+# }
+# with "clang++ -S -emit-llvm -mllvm -asan-skip-promotable-allocas=0 -fsanitize=address -O0 -g test.cc"
+#
+# The address of the (potentially now malloc'ed) alloca ends up
+# in rdi, after which it is spilled to the stack. We record the
+# spill OFFSET on the stack for checking the debug info below.
+# CHECK: #DEBUG_VALUE: bar:y <- [DW_OP_deref] [$rcx+0]
+# CHECK: movq %rcx, [[OFFSET:[0-9]+]](%rsp)
+# CHECK-NEXT: [[START_LABEL:.Ltmp[0-9]+]]
+# CHECK-NEXT: #DEBUG_VALUE: bar:y <- [DW_OP_plus_uconst [[OFFSET]], DW_OP_deref, DW_OP_deref]
+# This location should be valid until the end of the function.
+#
+# CHECK:        movq    %rbp, %rsp
+# CHECK-NEXT: [[END_LABEL:.Ltmp[0-9]+]]:
+#
+# CHECK: .Ldebug_loc{{[0-9]+}}:
+# We expect two location ranges for the variable.
+#
+# First, its address is stored in %rcx:
+# CHECK:      .quad .Lfunc_begin0-.Lfunc_begin0
+# CHECK-NEXT: .quad [[START_LABEL]]-.Lfunc_begin0
+# CHECK: DW_OP_breg2
+# DWARF:       DW_TAG_formal_parameter
+# DWARF:         DW_AT_location
+# DWARF-NEXT:      [{{.*}}, {{.*}}): DW_OP_breg2 RCX+0, DW_OP_deref
+#
+# Then it's addressed via %rsp:
+# CHECK:      .quad [[START_LABEL]]-.Lfunc_begin0
+# CHECK-NEXT: .quad [[END_LABEL]]-.Lfunc_begin0
+# CHECK: DW_OP_breg7
+# CHECK-NEXT: [[OFFSET]]
+# CHECK: DW_OP_deref
+# DWARF-NEXT:      [{{.*}}, {{.*}}): DW_OP_breg7 RSP+{{[0-9]+}}, DW_OP_deref, DW_OP_deref)
+--- |
+  @__asan_option_detect_stack_use_after_return = external global i32
+  @___asan_gen_ = private unnamed_addr constant [16 x i8] c"1 32 4 6 y.addr\00", align 1
+  
+  ; Function Attrs: nounwind sanitize_address uwtable
+  define i32 @_Z3bari(i32 %y) #0 !dbg !6 {
+  entry:
+    %MyAlloca = alloca [64 x i8], align 32
+    %0 = ptrtoint [64 x i8]* %MyAlloca to i64
+    %1 = load i32, i32* @__asan_option_detect_stack_use_after_return
+    %2 = icmp ne i32 %1, 0
+    br i1 %2, label %3, label %5
+  
+  ; <label>:3:                                      ; preds = %entry
+    %4 = call i64 @__asan_stack_malloc_0(i64 64, i64 %0)
+    br label %5
+  
+  ; <label>:5:                                      ; preds = %3, %entry
+    %6 = phi i64 [ %0, %entry ], [ %4, %3 ]
+    %7 = add i64 %6, 32
+    %8 = inttoptr i64 %7 to i32*
+    %9 = inttoptr i64 %6 to i64*
+    store i64 1102416563, i64* %9
+    %10 = add i64 %6, 8
+    %11 = inttoptr i64 %10 to i64*
+    store i64 ptrtoint ([16 x i8]* @___asan_gen_ to i64), i64* %11
+    %12 = add i64 %6, 16
+    %13 = inttoptr i64 %12 to i64*
+    store i64 ptrtoint (i32 (i32)* @_Z3bari to i64), i64* %13
+    %14 = lshr i64 %6, 3
+    %15 = add i64 %14, 2147450880
+    %16 = add i64 %15, 0
+    %17 = inttoptr i64 %16 to i64*
+    store i64 -868083100587789839, i64* %17
+    %18 = ptrtoint i32* %8 to i64
+    %19 = lshr i64 %18, 3
+    %20 = add i64 %19, 2147450880
+    %21 = inttoptr i64 %20 to i8*
+    %22 = load i8, i8* %21
+    %23 = icmp ne i8 %22, 0
+    call void @llvm.dbg.declare(metadata i32* %8, metadata !10, metadata !DIExpression(DW_OP_deref)), !dbg !11
+    br i1 %23, label %24, label %30
+  
+  ; <label>:24:                                     ; preds = %5
+    %25 = and i64 %18, 7
+    %26 = add i64 %25, 3
+    %27 = trunc i64 %26 to i8
+    %28 = icmp sge i8 %27, %22
+    br i1 %28, label %29, label %30
+  
+  ; <label>:29:                                     ; preds = %24
+    call void @__asan_report_store4(i64 %18)
+    call void asm sideeffect "", ""()
+    unreachable
+  
+  ; <label>:30:                                     ; preds = %24, %5
+    store i32 %y, i32* %8, align 4
+    %31 = ptrtoint i32* %8 to i64, !dbg !12
+    %32 = lshr i64 %31, 3, !dbg !12
+    %33 = add i64 %32, 2147450880, !dbg !12
+    %34 = inttoptr i64 %33 to i8*, !dbg !12
+    %35 = load i8, i8* %34, !dbg !12
+    %36 = icmp ne i8 %35, 0, !dbg !12
+    br i1 %36, label %37, label %43, !dbg !12
+  
+  ; <label>:37:                                     ; preds = %30
+    %38 = and i64 %31, 7, !dbg !12
+    %39 = add i64 %38, 3, !dbg !12
+    %40 = trunc i64 %39 to i8, !dbg !12
+    %41 = icmp sge i8 %40, %35, !dbg !12
+    br i1 %41, label %42, label %43
+  
+  ; <label>:42:                                     ; preds = %37
+    call void @__asan_report_load4(i64 %31), !dbg !12
+    call void asm sideeffect "", ""()
+    unreachable
+  
+  ; <label>:43:                                     ; preds = %37, %30
+    %44 = load i32, i32* %8, align 4, !dbg !12
+    %add = add nsw i32 %44, 2, !dbg !12
+    store i64 1172321806, i64* %9, !dbg !12
+    %45 = icmp ne i64 %6, %0, !dbg !12
+    br i1 %45, label %46, label %53, !dbg !12
+  
+  ; <label>:46:                                     ; preds = %43
+    %47 = add i64 %15, 0, !dbg !12
+    %48 = inttoptr i64 %47 to i64*, !dbg !12
+    store i64 -723401728380766731, i64* %48, !dbg !12
+    %49 = add i64 %6, 56, !dbg !12
+    %50 = inttoptr i64 %49 to i64*, !dbg !12
+    %51 = load i64, i64* %50, !dbg !12
+    %52 = inttoptr i64 %51 to i8*, !dbg !12
+    store i8 0, i8* %52, !dbg !12
+    br label %56, !dbg !12
+  
+  ; <label>:53:                                     ; preds = %43
+    %54 = add i64 %15, 0, !dbg !12
+    %55 = inttoptr i64 %54 to i64*, !dbg !12
+    store i64 0, i64* %55, !dbg !12
+    br label %56, !dbg !12
+  
+  ; <label>:56:                                     ; preds = %53, %46
+    ret i32 %add, !dbg !12
+  }
+  
+  ; Function Attrs: nounwind readnone speculatable
+  declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+  
+  declare void @__asan_init_v3()
+  
+  declare void @__asan_report_load4(i64)
+  
+  declare void @__asan_report_store4(i64)
+  
+  declare i64 @__asan_stack_malloc_0(i64, i64)
+  
+  ; Function Attrs: nounwind
+  declare void @llvm.stackprotector(i8*, i8**) #2
+  
+  attributes #0 = { nounwind sanitize_address uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #1 = { nounwind readnone speculatable }
+  attributes #2 = { nounwind }
+  
+  !llvm.dbg.cu = !{!0}
+  !llvm.module.flags = !{!3, !4}
+  !llvm.ident = !{!5}
+  
+  !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 3.5.0 (209308)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+  !1 = !DIFile(filename: "test.cc", directory: "/llvm_cmake_gcc")
+  !2 = !{}
+  !3 = !{i32 2, !"Dwarf Version", i32 4}
+  !4 = !{i32 2, !"Debug Info Version", i32 3}
+  !5 = !{!"clang version 3.5.0 (209308)"}
+  !6 = distinct !DISubprogram(name: "bar", linkageName: "_Z3bari", scope: !1, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
+  !7 = !DISubroutineType(types: !8)
+  !8 = !{!9, !9}
+  !9 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+  !10 = !DILocalVariable(name: "y", arg: 1, scope: !6, file: !1, line: 1, type: !9)
+  !11 = !DILocation(line: 0, scope: !6)
+  !12 = !DILocation(line: 2, scope: !6)
+
+...
+---
+name:            _Z3bari
+alignment:       4
+tracksRegLiveness: true
+liveins:         
+  - { reg: '$edi' }
+frameInfo:       
+  stackSize:       152
+  offsetAdjustment: -160
+  maxAlignment:    32
+  adjustsStack:    true
+  hasCalls:        true
+  maxCallFrameSize: 0
+fixedStack:      
+  - { id: 0, type: spill-slot, offset: -16, size: 8, alignment: 16, stack-id: 0 }
+stack:           
+  - { id: 0, name: MyAlloca, offset: -96, size: 64, alignment: 32, stack-id: 0 }
+  - { id: 1, type: spill-slot, offset: -100, size: 4, alignment: 4, stack-id: 0 }
+  - { id: 2, type: spill-slot, offset: -112, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 3, type: spill-slot, offset: -120, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 4, type: spill-slot, offset: -128, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 5, type: spill-slot, offset: -136, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 6, type: spill-slot, offset: -144, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 7, type: spill-slot, offset: -145, size: 1, alignment: 1, stack-id: 0 }
+  - { id: 8, type: spill-slot, offset: -146, size: 1, alignment: 1, stack-id: 0 }
+  - { id: 9, type: spill-slot, offset: -152, size: 4, alignment: 4, stack-id: 0 }
+body:             |
+  bb.0.entry:
+    liveins: $edi
+  
+    frame-setup PUSH64r killed $rbp, implicit-def $rsp, implicit $rsp
+    CFI_INSTRUCTION def_cfa_offset 16
+    CFI_INSTRUCTION offset $rbp, -16
+    $rbp = frame-setup MOV64rr $rsp
+    CFI_INSTRUCTION def_cfa_register $rbp
+    $rsp = frame-setup AND64ri8 $rsp, -32, implicit-def dead $eflags
+    $rsp = frame-setup SUB64ri32 $rsp, 160, implicit-def dead $eflags
+    renamable $rax = LEA64r $rsp, 1, $noreg, 64, $noreg
+    CMP32mi8 $noreg, 1, $noreg, @__asan_option_detect_stack_use_after_return, $noreg, 0, implicit-def $eflags :: (load 4 from @__asan_option_detect_stack_use_after_return)
+    $rcx = MOV64rr $rax
+    MOV32mr $rsp, 1, $noreg, 60, $noreg, killed $edi :: (store 4 into %stack.1)
+    MOV64mr $rsp, 1, $noreg, 48, $noreg, killed $rax :: (store 8 into %stack.2)
+    MOV64mr $rsp, 1, $noreg, 40, $noreg, killed $rcx :: (store 8 into %stack.3)
+    JE_1 %bb.2, implicit $eflags
+  
+  bb.1 (%ir-block.3):
+    $edi = MOV32ri 64, implicit-def $rdi
+    $rsi = MOV64rm $rsp, 1, $noreg, 48, $noreg :: (load 8 from %stack.2)
+    CALL64pcrel32 @__asan_stack_malloc_0, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit killed $rsi, implicit-def $rax
+    MOV64mr $rsp, 1, $noreg, 40, $noreg, killed $rax :: (store 8 into %stack.3)
+  
+  bb.2 (%ir-block.5):
+    $rax = MOV64rm $rsp, 1, $noreg, 40, $noreg :: (load 8 from %stack.3)
+    $rcx = MOV64rr $rax
+    renamable $rcx = ADD64ri8 renamable $rcx, 32, implicit-def $eflags
+    MOV64mi32 renamable $rax, 1, $noreg, 0, $noreg, 1102416563 :: (store 8 into %ir.9)
+    renamable $rdx = MOV64ri @___asan_gen_
+    MOV64mr renamable $rax, 1, $noreg, 8, $noreg, killed renamable $rdx :: (store 8 into %ir.11)
+    renamable $rdx = MOV64ri @_Z3bari
+    MOV64mr renamable $rax, 1, $noreg, 16, $noreg, killed renamable $rdx :: (store 8 into %ir.13)
+    $rdx = MOV64rr $rax
+    renamable $rdx = SHR64ri renamable $rdx, 3, implicit-def $eflags
+    $rsi = MOV64rr $rdx
+    renamable $rsi = ADD64ri32 renamable $rsi, 2147450880, implicit-def $eflags
+    renamable $rdi = MOV64ri -868083100587789839
+    MOV64mr killed renamable $rdx, 1, $noreg, 2147450880, $noreg, killed renamable $rdi :: (store 8 into %ir.17)
+    $rdx = MOV64rr $rcx
+    renamable $rdx = SHR64ri renamable $rdx, 3, implicit-def $eflags
+    renamable $r8b = MOV8rm killed renamable $rdx, 1, $noreg, 2147450880, $noreg :: (load 1 from %ir.21)
+    DBG_VALUE renamable $rcx, 0, !10, !DIExpression(DW_OP_deref), debug-location !11
+    CMP8ri renamable $r8b, 0, implicit-def $eflags
+    MOV64mr $rsp, 1, $noreg, 32, $noreg, killed $rax :: (store 8 into %stack.4)
+    MOV64mr $rsp, 1, $noreg, 24, $noreg, killed $rcx :: (store 8 into %stack.5)
+    DBG_VALUE $rsp, 0, !10, !DIExpression(DW_OP_plus_uconst, 24, DW_OP_deref, DW_OP_deref), debug-location !11
+    MOV64mr $rsp, 1, $noreg, 16, $noreg, killed $rsi :: (store 8 into %stack.6)
+    MOV8mr $rsp, 1, $noreg, 15, $noreg, killed $r8b :: (store 1 into %stack.7)
+    JE_1 %bb.5, implicit $eflags
+  
+  bb.3 (%ir-block.24):
+    DBG_VALUE $rsp, 0, !10, !DIExpression(DW_OP_plus_uconst, 24, DW_OP_deref, DW_OP_deref), debug-location !11
+    $rax = MOV64rm $rsp, 1, $noreg, 24, $noreg :: (load 8 from %stack.5)
+    renamable $rax = AND64ri8 renamable $rax, 7, implicit-def $eflags
+    renamable $rax = ADD64ri8 renamable $rax, 3, implicit-def $eflags
+    $cl = MOV8rr $al, implicit killed $rax
+    $dl = MOV8rm $rsp, 1, $noreg, 15, $noreg :: (load 1 from %stack.7)
+    CMP8rr killed renamable $cl, killed renamable $dl, implicit-def $eflags
+    JL_1 %bb.5, implicit $eflags
+  
+  bb.4 (%ir-block.29):
+    successors: 
+  
+    DBG_VALUE $rsp, 0, !10, !DIExpression(DW_OP_plus_uconst, 24, DW_OP_deref, DW_OP_deref), debug-location !11
+    $rdi = MOV64rm $rsp, 1, $noreg, 24, $noreg :: (load 8 from %stack.5)
+    CALL64pcrel32 @__asan_report_store4, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi
+    INLINEASM &"", 1
+  
+  bb.5 (%ir-block.30):
+    DBG_VALUE $rsp, 0, !10, !DIExpression(DW_OP_plus_uconst, 24, DW_OP_deref, DW_OP_deref), debug-location !11
+    $rax = MOV64rm $rsp, 1, $noreg, 24, $noreg :: (load 8 from %stack.5)
+    $ecx = MOV32rm $rsp, 1, $noreg, 60, $noreg :: (load 4 from %stack.1)
+    MOV32mr renamable $rax, 1, $noreg, 0, $noreg, killed renamable $ecx :: (store 4 into %ir.8)
+    renamable $rax = SHR64ri renamable $rax, 3, implicit-def $eflags, debug-location !12
+    renamable $dl = MOV8rm killed renamable $rax, 1, $noreg, 2147450880, $noreg, debug-location !12 :: (load 1 from %ir.34)
+    CMP8ri renamable $dl, 0, implicit-def $eflags, debug-location !12
+    MOV8mr $rsp, 1, $noreg, 14, $noreg, killed $dl :: (store 1 into %stack.8)
+    JE_1 %bb.8, implicit $eflags, debug-location !12
+  
+  bb.6 (%ir-block.37):
+    DBG_VALUE $rsp, 0, !10, !DIExpression(DW_OP_plus_uconst, 24, DW_OP_deref, DW_OP_deref), debug-location !11
+    $rax = MOV64rm $rsp, 1, $noreg, 24, $noreg :: (load 8 from %stack.5)
+    renamable $rax = AND64ri8 renamable $rax, 7, implicit-def $eflags, debug-location !12
+    renamable $rax = ADD64ri8 renamable $rax, 3, implicit-def $eflags, debug-location !12
+    $cl = MOV8rr $al, implicit killed $rax, debug-location !12
+    $dl = MOV8rm $rsp, 1, $noreg, 14, $noreg :: (load 1 from %stack.8)
+    CMP8rr killed renamable $cl, killed renamable $dl, implicit-def $eflags, debug-location !12
+    JL_1 %bb.8, implicit $eflags
+  
+  bb.7 (%ir-block.42):
+    successors: 
+  
+    DBG_VALUE $rsp, 0, !10, !DIExpression(DW_OP_plus_uconst, 24, DW_OP_deref, DW_OP_deref), debug-location !11
+    $rdi = MOV64rm $rsp, 1, $noreg, 24, $noreg :: (load 8 from %stack.5)
+    CALL64pcrel32 @__asan_report_load4, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, debug-location !12
+    INLINEASM &"", 1
+  
+  bb.8 (%ir-block.43):
+    DBG_VALUE $rsp, 0, !10, !DIExpression(DW_OP_plus_uconst, 24, DW_OP_deref, DW_OP_deref), debug-location !11
+    $rax = MOV64rm $rsp, 1, $noreg, 24, $noreg :: (load 8 from %stack.5)
+    renamable $ecx = MOV32rm killed renamable $rax, 1, $noreg, 0, $noreg, debug-location !12 :: (load 4 from %ir.8)
+    renamable $ecx = ADD32ri8 renamable $ecx, 2, implicit-def $eflags, debug-location !12
+    $rdx = MOV64rm $rsp, 1, $noreg, 32, $noreg :: (load 8 from %stack.4)
+    MOV64mi32 renamable $rdx, 1, $noreg, 0, $noreg, 1172321806, debug-location !12 :: (store 8 into %ir.9)
+    $rsi = MOV64rm $rsp, 1, $noreg, 48, $noreg :: (load 8 from %stack.2)
+    CMP64rr killed renamable $rdx, killed renamable $rsi, implicit-def $eflags, debug-location !12
+    MOV32mr $rsp, 1, $noreg, 8, $noreg, killed $ecx :: (store 4 into %stack.9)
+    JE_1 %bb.10, implicit $eflags, debug-location !12
+  
+  bb.9 (%ir-block.46):
+    DBG_VALUE $rsp, 0, !10, !DIExpression(DW_OP_plus_uconst, 24, DW_OP_deref, DW_OP_deref), debug-location !11
+    renamable $rax = MOV64ri -723401728380766731, debug-location !12
+    $rcx = MOV64rm $rsp, 1, $noreg, 16, $noreg :: (load 8 from %stack.6)
+    MOV64mr killed renamable $rcx, 1, $noreg, 0, $noreg, killed renamable $rax, debug-location !12 :: (store 8 into %ir.48)
+    $rax = MOV64rm $rsp, 1, $noreg, 32, $noreg :: (load 8 from %stack.4)
+    renamable $rdx = MOV64rm killed renamable $rax, 1, $noreg, 56, $noreg, debug-location !12 :: (load 8 from %ir.50)
+    MOV8mi killed renamable $rdx, 1, $noreg, 0, $noreg, 0, debug-location !12 :: (store 1 into %ir.52)
+    JMP_1 %bb.11, debug-location !12
+  
+  bb.10 (%ir-block.53):
+    DBG_VALUE $rsp, 0, !10, !DIExpression(DW_OP_plus_uconst, 24, DW_OP_deref, DW_OP_deref), debug-location !11
+    $rax = MOV64rm $rsp, 1, $noreg, 16, $noreg :: (load 8 from %stack.6)
+    MOV64mi32 killed renamable $rax, 1, $noreg, 0, $noreg, 0, debug-location !12 :: (store 8 into %ir.55)
+  
+  bb.11 (%ir-block.56):
+    DBG_VALUE $rsp, 0, !10, !DIExpression(DW_OP_plus_uconst, 24, DW_OP_deref, DW_OP_deref), debug-location !11
+    $eax = MOV32rm $rsp, 1, $noreg, 8, $noreg :: (load 4 from %stack.9)
+    $rsp = MOV64rr $rbp, debug-location !12
+    $rbp = frame-destroy POP64r implicit-def $rsp, implicit $rsp, debug-location !12
+    CFI_INSTRUCTION def_cfa $rsp, 8, debug-location !12
+    RETQ implicit killed $eax, debug-location !12
+
+...
diff --git a/test/DebugInfo/X86/debug-loc-offset.ll b/test/DebugInfo/X86/debug-loc-offset.ll
deleted file mode 100644
index 521282bdcd7..00000000000
--- a/test/DebugInfo/X86/debug-loc-offset.ll
+++ /dev/null
@@ -1,171 +0,0 @@
-; RUN: llc %s -filetype=obj -O0 -mtriple=i386-unknown-linux-gnu -dwarf-version=4 -o %t
-; RUN: llvm-dwarfdump -v %t | FileCheck %s
-
-; From the code:
-
-; debug-loc-offset1.cc
-; int bar (int b) {
-;   return b+4;
-; }
-
-; debug-loc-offset2.cc
-; struct A {
-;   int var;
-;   virtual char foo();
-; };
-
-; void baz(struct A a) {
-;   int z = 2;
-;   if (a.var > 2)
-;     z++;
-;   if (a.foo() == 'a')
-;     z++;
-; }
-
-; Compiled separately for i386-pc-linux-gnu and linked together.
-; This ensures that we have multiple compile units and multiple location lists
-; so that we can verify that
-; debug_loc entries are relative to the low_pc of the CU. The loc entry for
-; the byval argument in foo.cpp is in the second CU and so should have
-; an offset relative to that CU rather than from the beginning of the text
-; section.
-
-; Checking that we have two compile units with two sets of high/lo_pc.
-; CHECK: .debug_info contents
-; CHECK: DW_TAG_compile_unit
-; CHECK: DW_AT_low_pc {{.*}} (0x0000000000000020)
-; CHECK: DW_AT_high_pc
-
-; CHECK: DW_TAG_subprogram
-; CHECK-NOT: DW_TAG
-; CHECK: DW_AT_linkage_name [DW_FORM_strp]{{.*}}"_Z3baz1A"
-; CHECK-NOT: {{DW_TAG|NULL}}
-; CHECK: DW_TAG_formal_parameter
-; CHECK-NOT: DW_TAG
-; CHECK:       DW_AT_location [DW_FORM_sec_offset]   ({{.*}}
-; CHECK-NEXT:    [0x00000020, 0x00000037): DW_OP_breg0 EAX+0, DW_OP_deref
-; CHECK-NEXT:    [0x00000037, 0x00000063): DW_OP_breg5 EBP-8, DW_OP_deref, DW_OP_deref
-; CHECK-NEXT:  DW_AT_name [DW_FORM_strp]{{.*}}"a"
-
-; CHECK: DW_TAG_variable
-; CHECK: DW_AT_location [DW_FORM_exprloc]
-; CHECK-NOT: DW_AT_location
-
-; CHECK: DW_TAG_compile_unit
-; CHECK: DW_AT_low_pc {{.*}} (0x0000000000000000)
-; CHECK: DW_AT_high_pc
-
-; CHECK: DW_TAG_subprogram
-; CHECK-NOT: DW_TAG
-; CHECK: DW_AT_linkage_name [DW_FORM_strp]{{.*}}"_Z3bari"
-; CHECK-NOT: {{DW_TAG|NULL}}
-; CHECK: DW_TAG_formal_parameter
-; CHECK-NOT: DW_TAG
-; CHECK:       DW_AT_location [DW_FORM_sec_offset]   ({{.*}}
-; CHECK-NEXT:    [0x00000000, 0x0000000a): DW_OP_consts +0, DW_OP_stack_value
-; CHECK-NEXT:    [0x0000000a, 0x00000017): DW_OP_consts +1, DW_OP_stack_value)
-; CHECK-NEXT:  DW_AT_name [DW_FORM_strp]{{.*}}"b"
-
-; CHECK: .debug_loc contents:
-; CHECK:       0x00000000:
-; CHECK-NEXT:    [0x00000000, 0x0000000a): DW_OP_consts +0, DW_OP_stack_value
-; CHECK-NEXT:    [0x0000000a, 0x00000017): DW_OP_consts +1, DW_OP_stack_value
-; CHECK:       0x00000022:
-; CHECK-NEXT:    [0x00000000, 0x00000017): DW_OP_breg0 EAX+0, DW_OP_deref
-; CHECK-NEXT:    [0x00000017, 0x00000043): DW_OP_breg5 EBP-8, DW_OP_deref, DW_OP_deref
-
-%struct.A = type { i32 (...)**, i32 }
-
-; Function Attrs: nounwind
-define i32 @_Z3bari(i32 %b) #0 !dbg !4 {
-entry:
-  %b.addr = alloca i32, align 4
-  store i32 %b, i32* %b.addr, align 4
-  call void @llvm.dbg.value(metadata i32 0, metadata !21, metadata !DIExpression()), !dbg !22
-  %0 = load i32, i32* %b.addr, align 4, !dbg !23
-  call void @llvm.dbg.value(metadata i32 1, metadata !21, metadata !DIExpression()), !dbg !22
-  %add = add nsw i32 %0, 4, !dbg !23
-  ret i32 %add, !dbg !23
-}
-
-; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
-
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-define void @_Z3baz1A(%struct.A* %a) #2 !dbg !14 {
-entry:
-  %z = alloca i32, align 4
-  call void @llvm.dbg.declare(metadata %struct.A* %a, metadata !24, metadata !DIExpression(DW_OP_deref)), !dbg !25
-  call void @llvm.dbg.declare(metadata i32* %z, metadata !26, metadata !DIExpression()), !dbg !27
-  store i32 2, i32* %z, align 4, !dbg !27
-  %var = getelementptr inbounds %struct.A, %struct.A* %a, i32 0, i32 1, !dbg !28
-  %0 = load i32, i32* %var, align 4, !dbg !28
-  %cmp = icmp sgt i32 %0, 2, !dbg !28
-  br i1 %cmp, label %if.then, label %if.end, !dbg !28
-
-if.then:                                          ; preds = %entry
-  %1 = load i32, i32* %z, align 4, !dbg !30
-  %inc = add nsw i32 %1, 1, !dbg !30
-  store i32 %inc, i32* %z, align 4, !dbg !30
-  br label %if.end, !dbg !30
-
-if.end:                                           ; preds = %if.then, %entry
-  %call = call signext i8 @_ZN1A3fooEv(%struct.A* %a), !dbg !31
-  %conv = sext i8 %call to i32, !dbg !31
-  %cmp1 = icmp eq i32 %conv, 97, !dbg !31
-  br i1 %cmp1, label %if.then2, label %if.end4, !dbg !31
-
-if.then2:                                         ; preds = %if.end
-  %2 = load i32, i32* %z, align 4, !dbg !33
-  %inc3 = add nsw i32 %2, 1, !dbg !33
-  store i32 %inc3, i32* %z, align 4, !dbg !33
-  br label %if.end4, !dbg !33
-
-if.end4:                                          ; preds = %if.then2, %if.end
-  ret void, !dbg !34
-}
-
-declare signext i8 @_ZN1A3fooEv(%struct.A*) #2
-
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-!llvm.dbg.cu = !{!0, !9}
-!llvm.module.flags = !{!18, !19}
-!llvm.ident = !{!20, !20}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0 (210479)", isOptimized: false, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
-!1 = !DIFile(filename: "debug-loc-offset1.cc", directory: "/llvm_cmake_gcc")
-!2 = !{}
-!4 = distinct !DISubprogram(name: "bar", linkageName: "_Z3bari", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 1, file: !1, scope: !5, type: !6, retainedNodes: !2)
-!5 = !DIFile(filename: "debug-loc-offset1.cc", directory: "/llvm_cmake_gcc")
-!6 = !DISubroutineType(types: !7)
-!7 = !{!8, !8}
-!8 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
-!9 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0 (210479)", isOptimized: false, emissionKind: FullDebug, file: !10, enums: !2, retainedTypes: !11, globals: !2, imports: !2)
-!10 = !DIFile(filename: "debug-loc-offset2.cc", directory: "/llvm_cmake_gcc")
-!11 = !{!12}
-!12 = !DICompositeType(tag: DW_TAG_structure_type, name: "A", line: 1, flags: DIFlagFwdDecl, file: !10, identifier: "_ZTS1A")
-!14 = distinct !DISubprogram(name: "baz", linkageName: "_Z3baz1A", line: 6, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !9, scopeLine: 6, file: !10, scope: !15, type: !16, retainedNodes: !2)
-!15 = !DIFile(filename: "debug-loc-offset2.cc", directory: "/llvm_cmake_gcc")
-!16 = !DISubroutineType(types: !17)
-!17 = !{null, !12}
-!18 = !{i32 2, !"Dwarf Version", i32 4}
-!19 = !{i32 2, !"Debug Info Version", i32 3}
-!20 = !{!"clang version 3.5.0 (210479)"}
-!21 = !DILocalVariable(name: "b", line: 1, arg: 1, scope: !4, file: !5, type: !8)
-!22 = !DILocation(line: 1, scope: !4)
-!23 = !DILocation(line: 2, scope: !4)
-!24 = !DILocalVariable(name: "a", line: 6, arg: 1, scope: !14, file: !15, type: !12)
-!25 = !DILocation(line: 6, scope: !14)
-!26 = !DILocalVariable(name: "z", line: 7, scope: !14, file: !15, type: !8)
-!27 = !DILocation(line: 7, scope: !14)
-!28 = !DILocation(line: 8, scope: !29)
-!29 = distinct !DILexicalBlock(line: 8, column: 0, file: !10, scope: !14)
-!30 = !DILocation(line: 9, scope: !29)
-!31 = !DILocation(line: 10, scope: !32)
-!32 = distinct !DILexicalBlock(line: 10, column: 0, file: !10, scope: !14)
-!33 = !DILocation(line: 11, scope: !32)
-!34 = !DILocation(line: 12, scope: !14)
diff --git a/test/DebugInfo/X86/debug-loc-offset.mir b/test/DebugInfo/X86/debug-loc-offset.mir
new file mode 100644
index 00000000000..c5f80d06297
--- /dev/null
+++ b/test/DebugInfo/X86/debug-loc-offset.mir
@@ -0,0 +1,276 @@
+# RUN: llc -o - %s -start-after=patchable-function -filetype=obj -O0 -mtriple=i386-unknown-linux-gnu -dwarf-version=4 | llvm-dwarfdump -v - | FileCheck %s
+
+# From the code:
+#
+# debug-loc-offset1.cc
+# int bar (int b) {
+#   return b+4;
+# }
+#
+# debug-loc-offset2.cc
+# struct A {
+#   int var;
+#   virtual char foo();
+# };
+#
+# void baz(struct A a) {
+#   int z = 2;
+#   if (a.var > 2)
+#     z++;
+#   if (a.foo() == 'a')
+#     z++;
+# }
+#
+# Compiled separately for i386-pc-linux-gnu and linked together.
+# This ensures that we have multiple compile units and multiple location lists
+# so that we can verify that
+# debug_loc entries are relative to the low_pc of the CU. The loc entry for
+# the byval argument in foo.cpp is in the second CU and so should have
+# an offset relative to that CU rather than from the beginning of the text
+# section.
+#
+# Checking that we have two compile units with two sets of high/lo_pc.
+# CHECK: .debug_info contents
+# CHECK: DW_TAG_compile_unit
+# CHECK: DW_AT_low_pc {{.*}} (0x0000000000000020)
+# CHECK: DW_AT_high_pc
+#
+# CHECK: DW_TAG_subprogram
+# CHECK-NOT: DW_TAG
+# CHECK: DW_AT_linkage_name [DW_FORM_strp]{{.*}}"_Z3baz1A"
+# CHECK-NOT: {{DW_TAG|NULL}}
+# CHECK: DW_TAG_formal_parameter
+# CHECK-NOT: DW_TAG
+# CHECK:       DW_AT_location [DW_FORM_sec_offset]   ({{.*}}
+# CHECK-NEXT:    [0x00000020, 0x00000037): DW_OP_breg0 EAX+0, DW_OP_deref
+# CHECK-NEXT:    [0x00000037, 0x00000063): DW_OP_breg5 EBP-8, DW_OP_deref, DW_OP_deref
+# CHECK-NEXT:  DW_AT_name [DW_FORM_strp]{{.*}}"a"
+#
+# CHECK: DW_TAG_variable
+# CHECK: DW_AT_location [DW_FORM_exprloc]
+# CHECK-NOT: DW_AT_location
+#
+# CHECK: DW_TAG_compile_unit
+# CHECK: DW_AT_low_pc {{.*}} (0x0000000000000000)
+# CHECK: DW_AT_high_pc
+#
+# CHECK: DW_TAG_subprogram
+# CHECK-NOT: DW_TAG
+# CHECK: DW_AT_linkage_name [DW_FORM_strp]{{.*}}"_Z3bari"
+# CHECK-NOT: {{DW_TAG|NULL}}
+# CHECK: DW_TAG_formal_parameter
+# CHECK-NOT: DW_TAG
+# CHECK:       DW_AT_location [DW_FORM_sec_offset]   ({{.*}}
+# CHECK-NEXT:    [0x00000000, 0x0000000a): DW_OP_consts +0, DW_OP_stack_value
+# CHECK-NEXT:    [0x0000000a, 0x00000017): DW_OP_consts +1, DW_OP_stack_value)
+# CHECK-NEXT:  DW_AT_name [DW_FORM_strp]{{.*}}"b"
+#
+# CHECK: .debug_loc contents:
+# CHECK:       0x00000000:
+# CHECK-NEXT:    [0x00000000, 0x0000000a): DW_OP_consts +0, DW_OP_stack_value
+# CHECK-NEXT:    [0x0000000a, 0x00000017): DW_OP_consts +1, DW_OP_stack_value
+# CHECK:       0x00000022:
+# CHECK-NEXT:    [0x00000000, 0x00000017): DW_OP_breg0 EAX+0, DW_OP_deref
+# CHECK-NEXT:    [0x00000017, 0x00000043): DW_OP_breg5 EBP-8, DW_OP_deref, DW_OP_deref
+--- |
+  target triple = "i386-unknown-linux-gnu"
+  
+  %struct.A = type { i32 (...)**, i32 }
+  
+  ; Function Attrs: nounwind
+  define i32 @_Z3bari(i32 %b) #0 !dbg !10 {
+  entry:
+    %b.addr = alloca i32, align 4
+    store i32 %b, i32* %b.addr, align 4
+    call void @llvm.dbg.value(metadata i32 0, metadata !14, metadata !DIExpression()), !dbg !15
+    %0 = load i32, i32* %b.addr, align 4, !dbg !16
+    call void @llvm.dbg.value(metadata i32 1, metadata !14, metadata !DIExpression()), !dbg !15
+    %add = add nsw i32 %0, 4, !dbg !16
+    ret i32 %add, !dbg !16
+  }
+  
+  ; Function Attrs: nounwind readnone speculatable
+  declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+  
+  ; Function Attrs: nounwind readnone speculatable
+  declare void @llvm.dbg.value(metadata, metadata, metadata) #1
+  
+  define void @_Z3baz1A(%struct.A* %a) #2 !dbg !17 {
+  entry:
+    %z = alloca i32, align 4
+    call void @llvm.dbg.declare(metadata %struct.A* %a, metadata !20, metadata !DIExpression(DW_OP_deref)), !dbg !21
+    call void @llvm.dbg.declare(metadata i32* %z, metadata !22, metadata !DIExpression()), !dbg !23
+    store i32 2, i32* %z, align 4, !dbg !23
+    %var = getelementptr inbounds %struct.A, %struct.A* %a, i32 0, i32 1, !dbg !24
+    %0 = load i32, i32* %var, align 4, !dbg !24
+    %cmp = icmp sgt i32 %0, 2, !dbg !24
+    br i1 %cmp, label %if.then, label %if.end, !dbg !24
+  
+  if.then:                                          ; preds = %entry
+    %1 = load i32, i32* %z, align 4, !dbg !26
+    %inc = add nsw i32 %1, 1, !dbg !26
+    store i32 %inc, i32* %z, align 4, !dbg !26
+    br label %if.end, !dbg !26
+  
+  if.end:                                           ; preds = %if.then, %entry
+    %call = call signext i8 @_ZN1A3fooEv(%struct.A* %a), !dbg !27
+    %conv = sext i8 %call to i32, !dbg !27
+    %cmp1 = icmp eq i32 %conv, 97, !dbg !27
+    br i1 %cmp1, label %if.then2, label %if.end4, !dbg !27
+  
+  if.then2:                                         ; preds = %if.end
+    %2 = load i32, i32* %z, align 4, !dbg !29
+    %inc3 = add nsw i32 %2, 1, !dbg !29
+    store i32 %inc3, i32* %z, align 4, !dbg !29
+    br label %if.end4, !dbg !29
+  
+  if.end4:                                          ; preds = %if.then2, %if.end
+    ret void, !dbg !30
+  }
+  
+  declare signext i8 @_ZN1A3fooEv(%struct.A*) #2
+  
+  ; Function Attrs: nounwind
+  declare void @llvm.stackprotector(i8*, i8**) #3
+  
+  attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #1 = { nounwind readnone speculatable }
+  attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #3 = { nounwind }
+  
+  !llvm.dbg.cu = !{!0, !3}
+  !llvm.module.flags = !{!7, !8}
+  !llvm.ident = !{!9, !9}
+  
+  !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 3.5.0 (210479)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+  !1 = !DIFile(filename: "debug-loc-offset1.cc", directory: "/llvm_cmake_gcc")
+  !2 = !{}
+  !3 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !4, producer: "clang version 3.5.0 (210479)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !5, globals: !2, imports: !2)
+  !4 = !DIFile(filename: "debug-loc-offset2.cc", directory: "/llvm_cmake_gcc")
+  !5 = !{!6}
+  !6 = !DICompositeType(tag: DW_TAG_structure_type, name: "A", file: !4, line: 1, flags: DIFlagFwdDecl, identifier: "_ZTS1A")
+  !7 = !{i32 2, !"Dwarf Version", i32 4}
+  !8 = !{i32 2, !"Debug Info Version", i32 3}
+  !9 = !{!"clang version 3.5.0 (210479)"}
+  !10 = distinct !DISubprogram(name: "bar", linkageName: "_Z3bari", scope: !1, file: !1, line: 1, type: !11, isLocal: false, isDefinition: true, scopeLine: 1, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
+  !11 = !DISubroutineType(types: !12)
+  !12 = !{!13, !13}
+  !13 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+  !14 = !DILocalVariable(name: "b", arg: 1, scope: !10, file: !1, line: 1, type: !13)
+  !15 = !DILocation(line: 1, scope: !10)
+  !16 = !DILocation(line: 2, scope: !10)
+  !17 = distinct !DISubprogram(name: "baz", linkageName: "_Z3baz1A", scope: !4, file: !4, line: 6, type: !18, isLocal: false, isDefinition: true, scopeLine: 6, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !3, retainedNodes: !2)
+  !18 = !DISubroutineType(types: !19)
+  !19 = !{null, !6}
+  !20 = !DILocalVariable(name: "a", arg: 1, scope: !17, file: !4, line: 6, type: !6)
+  !21 = !DILocation(line: 6, scope: !17)
+  !22 = !DILocalVariable(name: "z", scope: !17, file: !4, line: 7, type: !13)
+  !23 = !DILocation(line: 7, scope: !17)
+  !24 = !DILocation(line: 8, scope: !25)
+  !25 = distinct !DILexicalBlock(scope: !17, file: !4, line: 8)
+  !26 = !DILocation(line: 9, scope: !25)
+  !27 = !DILocation(line: 10, scope: !28)
+  !28 = distinct !DILexicalBlock(scope: !17, file: !4, line: 10)
+  !29 = !DILocation(line: 11, scope: !28)
+  !30 = !DILocation(line: 12, scope: !17)
+
+...
+---
+name:            _Z3bari
+alignment:       4
+tracksRegLiveness: true
+frameInfo:       
+  stackSize:       8
+  offsetAdjustment: -4
+  maxAlignment:    4
+  maxCallFrameSize: 0
+fixedStack:      
+  - { id: 0, type: spill-slot, offset: -8, size: 4, alignment: 8, stack-id: 0 }
+  - { id: 1, size: 4, alignment: 16, stack-id: 0 }
+stack:           
+  - { id: 0, type: spill-slot, offset: -12, size: 4, alignment: 4, stack-id: 0 }
+body:             |
+  bb.0.entry:
+    frame-setup PUSH32r killed $ebp, implicit-def $esp, implicit $esp
+    CFI_INSTRUCTION def_cfa_offset 8
+    CFI_INSTRUCTION offset $ebp, -8
+    $ebp = frame-setup MOV32rr $esp
+    CFI_INSTRUCTION def_cfa_register $ebp
+    frame-setup PUSH32r undef $eax, implicit-def $esp, implicit $esp
+    renamable $eax = MOV32rm $ebp, 1, $noreg, 8, $noreg :: (load 4 from %fixed-stack.1)
+    DBG_VALUE 0, 0, !14, !DIExpression(), debug-location !15
+    renamable $ecx = MOV32rm $ebp, 1, $noreg, 8, $noreg, debug-location !16 :: (load 4 from %ir.b.addr)
+    DBG_VALUE 1, 0, !14, !DIExpression(), debug-location !15
+    renamable $ecx = ADD32ri8 renamable $ecx, 4, implicit-def $eflags, debug-location !16
+    MOV32mr $ebp, 1, $noreg, -4, $noreg, killed $eax :: (store 4 into %fixed-stack.1)
+    $eax = MOV32rr killed $ecx, debug-location !16
+    $esp = frame-destroy ADD32ri8 $esp, 4, implicit-def dead $eflags, debug-location !16
+    $ebp = frame-destroy POP32r implicit-def $esp, implicit $esp, debug-location !16
+    CFI_INSTRUCTION def_cfa $esp, 4, debug-location !16
+    RETL implicit killed $eax, debug-location !16
+
+...
+---
+name:            _Z3baz1A
+alignment:       4
+tracksRegLiveness: true
+frameInfo:       
+  stackSize:       28
+  offsetAdjustment: -24
+  maxAlignment:    4
+  adjustsStack:    true
+  hasCalls:        true
+  maxCallFrameSize: 4
+fixedStack:      
+  - { id: 0, type: spill-slot, offset: -8, size: 4, alignment: 8, stack-id: 0 }
+  - { id: 1, size: 4, alignment: 16, stack-id: 0, isImmutable: true }
+stack:           
+  - { id: 0, name: z, offset: -12, size: 4, alignment: 4, stack-id: 0, 
+      debug-info-variable: '!22', debug-info-expression: '!DIExpression()', 
+      debug-info-location: '!23' }
+  - { id: 1, type: spill-slot, offset: -16, size: 4, alignment: 4, stack-id: 0 }
+body:             |
+  bb.0.entry:
+    frame-setup PUSH32r killed $ebp, implicit-def $esp, implicit $esp
+    CFI_INSTRUCTION def_cfa_offset 8
+    CFI_INSTRUCTION offset $ebp, -8
+    $ebp = frame-setup MOV32rr $esp
+    CFI_INSTRUCTION def_cfa_register $ebp
+    $esp = frame-setup SUB32ri8 $esp, 24, implicit-def dead $eflags
+    renamable $eax = MOV32rm $ebp, 1, $noreg, 8, $noreg :: (load 4 from %fixed-stack.1)
+    DBG_VALUE renamable $eax, 0, !20, !DIExpression(DW_OP_deref), debug-location !21
+    MOV32mi $ebp, 1, $noreg, -4, $noreg, 2, debug-location !23 :: (store 4 into %ir.z)
+    CMP32mi8 renamable $eax, 1, $noreg, 4, $noreg, 2, implicit-def $eflags, debug-location !24 :: (load 4 from %ir.var)
+    MOV32mr $ebp, 1, $noreg, -8, $noreg, killed $eax :: (store 4 into %stack.1)
+    DBG_VALUE $ebp, 0, !20, !DIExpression(DW_OP_constu, 8, DW_OP_minus, DW_OP_deref, DW_OP_deref), debug-location !21
+    JLE_1 %bb.2, implicit $eflags, debug-location !24
+  
+  bb.1.if.then:
+    DBG_VALUE $ebp, 0, !20, !DIExpression(DW_OP_constu, 8, DW_OP_minus, DW_OP_deref, DW_OP_deref), debug-location !21
+    renamable $eax = MOV32rm $ebp, 1, $noreg, -4, $noreg, debug-location !26 :: (load 4 from %ir.z)
+    renamable $eax = ADD32ri8 renamable $eax, 1, implicit-def $eflags, debug-location !26
+    MOV32mr $ebp, 1, $noreg, -4, $noreg, killed renamable $eax, debug-location !26 :: (store 4 into %ir.z)
+  
+  bb.2.if.end:
+    DBG_VALUE $ebp, 0, !20, !DIExpression(DW_OP_constu, 8, DW_OP_minus, DW_OP_deref, DW_OP_deref), debug-location !21
+    $eax = MOV32rm $ebp, 1, $noreg, -8, $noreg :: (load 4 from %stack.1)
+    MOV32mr $esp, 1, $noreg, 0, $noreg, killed renamable $eax, debug-location !27 :: (store 4 into stack)
+    CALLpcrel32 @_ZN1A3fooEv, csr_32, implicit $esp, implicit $ssp, implicit-def $al, debug-location !27
+    renamable $ecx = MOVSX32rr8 killed renamable $al, debug-location !27
+    CMP32ri8 killed renamable $ecx, 97, implicit-def $eflags, debug-location !27
+    JNE_1 %bb.4, implicit $eflags, debug-location !27
+  
+  bb.3.if.then2:
+    DBG_VALUE $ebp, 0, !20, !DIExpression(DW_OP_constu, 8, DW_OP_minus, DW_OP_deref, DW_OP_deref), debug-location !21
+    renamable $eax = MOV32rm $ebp, 1, $noreg, -4, $noreg, debug-location !29 :: (load 4 from %ir.z)
+    renamable $eax = ADD32ri8 renamable $eax, 1, implicit-def $eflags, debug-location !29
+    MOV32mr $ebp, 1, $noreg, -4, $noreg, killed renamable $eax, debug-location !29 :: (store 4 into %ir.z)
+  
+  bb.4.if.end4:
+    DBG_VALUE $ebp, 0, !20, !DIExpression(DW_OP_constu, 8, DW_OP_minus, DW_OP_deref, DW_OP_deref), debug-location !21
+    $esp = frame-destroy ADD32ri8 $esp, 24, implicit-def dead $eflags, debug-location !30
+    $ebp = frame-destroy POP32r implicit-def $esp, implicit $esp, debug-location !30
+    CFI_INSTRUCTION def_cfa $esp, 4, debug-location !30
+    RETL debug-location !30
+
+...
diff --git a/test/DebugInfo/X86/dw_op_minus.ll b/test/DebugInfo/X86/dw_op_minus.ll
deleted file mode 100644
index 8013c2cd023..00000000000
--- a/test/DebugInfo/X86/dw_op_minus.ll
+++ /dev/null
@@ -1,80 +0,0 @@
-; Test dwarf codegen of DW_OP_minus.
-; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
-
-; This was built by compiling the following source with SafeStack and
-; simplifying the result a little.
-; extern "C" {
-; void Capture(int *);
-; void f() {
-;   int buf[100];
-;   Capture(buf);
-; }
-; }
-; The interesting part is !DIExpression(DW_OP_constu, 400, DW_OP_minus)
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-@__safestack_unsafe_stack_ptr = external thread_local(initialexec) global i8*
-
-define void @f() !dbg !4 {
-entry:
-  %unsafe_stack_ptr = load i8*, i8** @__safestack_unsafe_stack_ptr
-  %unsafe_stack_static_top = getelementptr i8, i8* %unsafe_stack_ptr, i32 -400
-  store i8* %unsafe_stack_static_top, i8** @__safestack_unsafe_stack_ptr
-  %0 = getelementptr i8, i8* %unsafe_stack_ptr, i32 -400
-  %buf = bitcast i8* %0 to [100 x i32]*
-  %1 = bitcast [100 x i32]* %buf to i8*, !dbg !16
-  call void @llvm.dbg.declare(metadata i8* %unsafe_stack_ptr, metadata !8, metadata !17), !dbg !18
-  %arraydecay = getelementptr inbounds [100 x i32], [100 x i32]* %buf, i64 0, i64 0, !dbg !19
-  call void @Capture(i32* %arraydecay), !dbg !20
-  store i8* %unsafe_stack_ptr, i8** @__safestack_unsafe_stack_ptr, !dbg !21
-  ret void, !dbg !21
-}
-
-; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
-
-declare void @Capture(i32*)
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!13, !14}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 3.8.0 (trunk 248518) (llvm/trunk 248512)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
-!1 = !DIFile(filename: "1.cc", directory: "/tmp")
-!2 = !{}
-!4 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 4, type: !5, isLocal: false, isDefinition: true, scopeLine: 4, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !7)
-!5 = !DISubroutineType(types: !6)
-!6 = !{null}
-!7 = !{!8}
-!8 = !DILocalVariable(name: "buf", scope: !4, file: !1, line: 5, type: !9)
-!9 = !DICompositeType(tag: DW_TAG_array_type, baseType: !10, size: 3200, align: 32, elements: !11)
-!10 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
-!11 = !{!12}
-!12 = !DISubrange(count: 100)
-!13 = !{i32 2, !"Dwarf Version", i32 4}
-!14 = !{i32 2, !"Debug Info Version", i32 3}
-!15 = !{!"clang version 3.8.0 (trunk 248518) (llvm/trunk 248512)"}
-!16 = !DILocation(line: 5, column: 3, scope: !4)
-!17 = !DIExpression(DW_OP_constu, 400, DW_OP_minus)
-!18 = !DILocation(line: 5, column: 7, scope: !4)
-!19 = !DILocation(line: 6, column: 11, scope: !4)
-!20 = !DILocation(line: 6, column: 3, scope: !4)
-!21 = !DILocation(line: 7, column: 1, scope: !4)
-
-; RCX - 400
-; CHECK:      .short	3                       # Loc expr size
-; CHECK-NEXT: .byte	114                     # DW_OP_breg2
-; CHECK-NEXT: .byte	240                     # -400
-; CHECK-NEXT: .byte	124
-
-; RCX is clobbered in call @Capture, but there is a spilled copy.
-; *(RSP + 8) - 400
-; CHECK:      .short	7                       # Loc expr size
-; CHECK-NEXT: .byte	119                     # DW_OP_breg7
-; CHECK-NEXT: .byte	8                       # 8
-; CHECK-NEXT: .byte	6                       # DW_OP_deref
-; CHECK-NEXT: .byte	16                      # DW_OP_constu
-; CHECK-NEXT: .byte	144                     # 400
-; CHECK-NEXT: .byte	3                       #
-; CHECK-NEXT: .byte	28                      # DW_OP_minus
diff --git a/test/DebugInfo/X86/dw_op_minus.mir b/test/DebugInfo/X86/dw_op_minus.mir
new file mode 100644
index 00000000000..574e5aed442
--- /dev/null
+++ b/test/DebugInfo/X86/dw_op_minus.mir
@@ -0,0 +1,119 @@
+# RUN: llc -o - %s -start-after=patchable-function -O0 -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+# Test dwarf codegen of DW_OP_minus.
+#
+# This was built by compiling the following source with SafeStack and
+# simplifying the result a little.
+# extern "C" {
+# void Capture(int *);
+# void f() {
+#   int buf[100];
+#   Capture(buf);
+# }
+# }
+# The interesting part is !DIExpression(DW_OP_constu, 400, DW_OP_minus)
+#
+# RCX - 400
+# CHECK:      .short    3                       # Loc expr size
+# CHECK-NEXT: .byte     114                     # DW_OP_breg2
+# CHECK-NEXT: .byte     240                     # -400
+# CHECK-NEXT: .byte     124
+#
+# RCX is clobbered in call @Capture, but there is a spilled copy.
+# *(RSP + 8) - 400
+# CHECK:      .short    7                       # Loc expr size
+# CHECK-NEXT: .byte     119                     # DW_OP_breg7
+# CHECK-NEXT: .byte     8                       # 8
+# CHECK-NEXT: .byte     6                       # DW_OP_deref
+# CHECK-NEXT: .byte     16                      # DW_OP_constu
+# CHECK-NEXT: .byte     144                     # 400
+# CHECK-NEXT: .byte     3                       #
+# CHECK-NEXT: .byte     28                      # DW_OP_minus
+--- |
+  @__safestack_unsafe_stack_ptr = external thread_local(initialexec) global i8*
+  
+  define void @f() !dbg !5 {
+  entry:
+    %unsafe_stack_ptr = load i8*, i8** @__safestack_unsafe_stack_ptr
+    %unsafe_stack_static_top = getelementptr i8, i8* %unsafe_stack_ptr, i32 -400
+    store i8* %unsafe_stack_static_top, i8** @__safestack_unsafe_stack_ptr
+    %0 = getelementptr i8, i8* %unsafe_stack_ptr, i32 -400
+    %buf = bitcast i8* %0 to [100 x i32]*
+    %1 = bitcast [100 x i32]* %buf to i8*, !dbg !14
+    call void @llvm.dbg.declare(metadata i8* %unsafe_stack_ptr, metadata !9, metadata !DIExpression(DW_OP_constu, 400, DW_OP_minus)), !dbg !15
+    %arraydecay = getelementptr inbounds [100 x i32], [100 x i32]* %buf, i64 0, i64 0, !dbg !16
+    call void @Capture(i32* %arraydecay), !dbg !17
+    store i8* %unsafe_stack_ptr, i8** @__safestack_unsafe_stack_ptr, !dbg !18
+    ret void, !dbg !18
+  }
+  
+  ; Function Attrs: nounwind readnone speculatable
+  declare void @llvm.dbg.declare(metadata, metadata, metadata) #0
+  
+  declare void @Capture(i32*)
+  
+  ; Function Attrs: nounwind
+  declare void @llvm.stackprotector(i8*, i8**) #1
+  
+  attributes #0 = { nounwind readnone speculatable }
+  attributes #1 = { nounwind }
+  
+  !llvm.dbg.cu = !{!0}
+  !llvm.module.flags = !{!3, !4}
+  
+  !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 3.8.0 (trunk 248518) (llvm/trunk 248512)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+  !1 = !DIFile(filename: "1.cc", directory: "/tmp")
+  !2 = !{}
+  !3 = !{i32 2, !"Dwarf Version", i32 4}
+  !4 = !{i32 2, !"Debug Info Version", i32 3}
+  !5 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 4, type: !6, isLocal: false, isDefinition: true, scopeLine: 4, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !8)
+  !6 = !DISubroutineType(types: !7)
+  !7 = !{null}
+  !8 = !{!9}
+  !9 = !DILocalVariable(name: "buf", scope: !5, file: !1, line: 5, type: !10)
+  !10 = !DICompositeType(tag: DW_TAG_array_type, baseType: !11, size: 3200, align: 32, elements: !12)
+  !11 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+  !12 = !{!13}
+  !13 = !DISubrange(count: 100)
+  !14 = !DILocation(line: 5, column: 3, scope: !5)
+  !15 = !DILocation(line: 5, column: 7, scope: !5)
+  !16 = !DILocation(line: 6, column: 11, scope: !5)
+  !17 = !DILocation(line: 6, column: 3, scope: !5)
+  !18 = !DILocation(line: 7, column: 1, scope: !5)
+
+...
+---
+name:            f
+alignment:       4
+tracksRegLiveness: true
+frameInfo:       
+  stackSize:       24
+  maxAlignment:    8
+  adjustsStack:    true
+  hasCalls:        true
+  maxCallFrameSize: 0
+stack:           
+  - { id: 0, type: spill-slot, offset: -16, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 1, type: spill-slot, offset: -24, size: 8, alignment: 8, stack-id: 0 }
+body:             |
+  bb.0.entry:
+    $rsp = frame-setup SUB64ri8 $rsp, 24, implicit-def dead $eflags
+    CFI_INSTRUCTION def_cfa_offset 32
+    renamable $rax = MOV64rm $rip, 1, $noreg, target-flags(x86-gottpoff) @__safestack_unsafe_stack_ptr, $noreg :: (load 8 from got)
+    renamable $rcx = MOV64rm renamable $rax, 1, $noreg, 0, $fs :: (dereferenceable load 8 from @__safestack_unsafe_stack_ptr)
+    DBG_VALUE renamable $rcx, 0, !9, !DIExpression(DW_OP_constu, 400, DW_OP_minus), debug-location !15
+    $rdx = MOV64rr $rcx
+    renamable $rdx = ADD64ri32 renamable $rdx, -400, implicit-def dead $eflags
+    MOV64mr renamable $rax, 1, $noreg, 0, $fs, renamable $rdx :: (store 8 into @__safestack_unsafe_stack_ptr)
+    $rdi = MOV64rr killed $rdx, debug-location !17
+    MOV64mr $rsp, 1, $noreg, 16, $noreg, killed $rax :: (store 8 into %stack.0)
+    MOV64mr $rsp, 1, $noreg, 8, $noreg, killed $rcx :: (store 8 into %stack.1)
+    DBG_VALUE $rsp, 0, !9, !DIExpression(DW_OP_plus_uconst, 8, DW_OP_deref, DW_OP_constu, 400, DW_OP_minus), debug-location !15
+    CALL64pcrel32 @Capture, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, debug-location !17
+    $rax = MOV64rm $rsp, 1, $noreg, 16, $noreg :: (load 8 from %stack.0)
+    $rcx = MOV64rm $rsp, 1, $noreg, 8, $noreg :: (load 8 from %stack.1)
+    MOV64mr killed renamable $rax, 1, $noreg, 0, $fs, killed renamable $rcx, debug-location !18 :: (store 8 into @__safestack_unsafe_stack_ptr)
+    $rsp = frame-destroy ADD64ri8 $rsp, 24, implicit-def dead $eflags, debug-location !18
+    CFI_INSTRUCTION def_cfa_offset 8, debug-location !18
+    RETQ debug-location !18
+
+...
diff --git a/test/DebugInfo/X86/pr19307.ll b/test/DebugInfo/X86/pr19307.ll
deleted file mode 100644
index 90bbefaf3a7..00000000000
--- a/test/DebugInfo/X86/pr19307.ll
+++ /dev/null
@@ -1,144 +0,0 @@
-; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
-
-; Generated from the source file pr19307.cc:
-; #include <string>
-; void parse_range(unsigned long long &offset, unsigned long long &limit,
-;                  std::string range) {
-;   if (range.compare(0, 6, "items=") != 0 || range[6] == '-')
-;     offset = 1;
-;   range.erase(0, 6);
-;   limit = 2;
-; }
-; with "clang++ -S -emit-llvm -O0 -g pr19307.cc"
-
-; Location of "range" string is spilled from %rdx to stack and is
-; addressed via %rbp.
-; CHECK: movq %rdx, {{[-0-9]+}}(%rbp)
-; CHECK-NEXT: [[START_LABEL:.Ltmp[0-9]+]]:
-; This location should be valid until the end of the function.
-
-; Verify that we have proper range in debug_loc section:
-; CHECK: .Ldebug_loc{{[0-9]+}}:
-; CHECK: DW_OP_breg1
-; CHECK:      .quad [[START_LABEL]]-.Lfunc_begin0
-; CHECK-NEXT: .quad .Lfunc_end0-.Lfunc_begin0
-; CHECK: DW_OP_breg6
-; CHECK: DW_OP_deref
-
-; ModuleID = 'pr19307.cc'
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-%"class.std::basic_string" = type { %"struct.std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Alloc_hider" }
-%"struct.std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Alloc_hider" = type { i8* }
-
-@.str = private unnamed_addr constant [7 x i8] c"items=\00", align 1
-
-; Function Attrs: uwtable
-define void @_Z11parse_rangeRyS_Ss(i64* %offset, i64* %limit, %"class.std::basic_string"* %range) #0 !dbg !13 {
-entry:
-  %offset.addr = alloca i64*, align 8
-  %limit.addr = alloca i64*, align 8
-  store i64* %offset, i64** %offset.addr, align 8
-  call void @llvm.dbg.declare(metadata i64** %offset.addr, metadata !45, metadata !DIExpression()), !dbg !46
-  store i64* %limit, i64** %limit.addr, align 8
-  call void @llvm.dbg.declare(metadata i64** %limit.addr, metadata !47, metadata !DIExpression()), !dbg !46
-  call void @llvm.dbg.declare(metadata %"class.std::basic_string"* %range, metadata !48, metadata !DIExpression(DW_OP_deref)), !dbg !49
-  %call = call i32 @_ZNKSs7compareEmmPKc(%"class.std::basic_string"* %range, i64 0, i64 6, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str, i32 0, i32 0)), !dbg !50
-  %cmp = icmp ne i32 %call, 0, !dbg !50
-  br i1 %cmp, label %if.then, label %lor.lhs.false, !dbg !50
-
-lor.lhs.false:                                    ; preds = %entry
-  %call1 = call i8* @_ZNSsixEm(%"class.std::basic_string"* %range, i64 6), !dbg !52
-  %0 = load i8, i8* %call1, !dbg !52
-  %conv = sext i8 %0 to i32, !dbg !52
-  %cmp2 = icmp eq i32 %conv, 45, !dbg !52
-  br i1 %cmp2, label %if.then, label %if.end, !dbg !52
-
-if.then:                                          ; preds = %lor.lhs.false, %entry
-  %1 = load i64*, i64** %offset.addr, align 8, !dbg !54
-  store i64 1, i64* %1, align 8, !dbg !54
-  br label %if.end, !dbg !54
-
-if.end:                                           ; preds = %if.then, %lor.lhs.false
-  %call3 = call %"class.std::basic_string"* @_ZNSs5eraseEmm(%"class.std::basic_string"* %range, i64 0, i64 6), !dbg !55
-  %2 = load i64*, i64** %limit.addr, align 8, !dbg !56
-  store i64 2, i64* %2, align 8, !dbg !56
-  ret void, !dbg !57
-}
-
-; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
-
-declare i32 @_ZNKSs7compareEmmPKc(%"class.std::basic_string"*, i64, i64, i8*) #2
-
-declare i8* @_ZNSsixEm(%"class.std::basic_string"*, i64) #2
-
-declare %"class.std::basic_string"* @_ZNSs5eraseEmm(%"class.std::basic_string"*, i64, i64) #2
-
-attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!42, !43}
-!llvm.ident = !{!44}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0 (209308)", isOptimized: false, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !3, globals: !2, imports: !21)
-!1 = !DIFile(filename: "pr19307.cc", directory: "/llvm_cmake_gcc")
-!2 = !{}
-!3 = !{!4, !6, !8}
-!4 = !DICompositeType(tag: DW_TAG_structure_type, line: 83, flags: DIFlagFwdDecl, file: !5, identifier: "_ZTS11__mbstate_t")
-!5 = !DIFile(filename: "/usr/include/wchar.h", directory: "/llvm_cmake_gcc")
-!6 = !DICompositeType(tag: DW_TAG_structure_type, name: "lconv", line: 54, flags: DIFlagFwdDecl, file: !7, identifier: "_ZTS5lconv")
-!7 = !DIFile(filename: "/usr/include/locale.h", directory: "/llvm_cmake_gcc")
-!8 = !DICompositeType(tag: DW_TAG_class_type, name: "basic_string<char, std::char_traits<char>, std::allocator<char> >", line: 1134, flags: DIFlagFwdDecl, file: !9, scope: !10, identifier: "_ZTSSs")
-!9 = !DIFile(filename: "/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/bits/basic_string.tcc", directory: "/llvm_cmake_gcc")
-!10 = !DINamespace(name: "std", scope: null)
-!11 = !DIFile(filename: "/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/x86_64-linux-gnu/bits/c++config.h", directory: "/llvm_cmake_gcc")
-!13 = distinct !DISubprogram(name: "parse_range", linkageName: "_Z11parse_rangeRyS_Ss", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 4, file: !1, scope: !14, type: !15, retainedNodes: !2)
-!14 = !DIFile(filename: "pr19307.cc", directory: "/llvm_cmake_gcc")
-!15 = !DISubroutineType(types: !16)
-!16 = !{null, !17, !17, !19}
-!17 = !DIDerivedType(tag: DW_TAG_reference_type, baseType: !18)
-!18 = !DIBasicType(tag: DW_TAG_base_type, name: "long long unsigned int", size: 64, align: 64, encoding: DW_ATE_unsigned)
-!19 = !DIDerivedType(tag: DW_TAG_typedef, name: "string", line: 65, file: !20, scope: !10, baseType: !8)
-!20 = !DIFile(filename: "/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/bits/stringfwd.h", directory: "/llvm_cmake_gcc")
-!21 = !{!22, !26, !29, !33, !38, !41}
-!22 = !DIImportedEntity(tag: DW_TAG_imported_module, file: !1, line: 57, scope: !23, entity: !25)
-!23 = !DINamespace(name: "__gnu_debug", scope: null)
-!24 = !DIFile(filename: "/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/debug/debug.h", directory: "/llvm_cmake_gcc")
-!25 = !DINamespace(name: "__debug", scope: !10)
-!26 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !1, line: 66, scope: !10, entity: !27)
-!27 = !DIDerivedType(tag: DW_TAG_typedef, name: "mbstate_t", line: 106, file: !5, baseType: !28)
-!28 = !DIDerivedType(tag: DW_TAG_typedef, name: "__mbstate_t", line: 95, file: !5, baseType: !4)
-!29 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !1, line: 141, scope: !10, entity: !30)
-!30 = !DIDerivedType(tag: DW_TAG_typedef, name: "wint_t", line: 141, file: !31, baseType: !32)
-!31 = !DIFile(filename: "/llvm_cmake_gcc/bin/../lib/clang/3.5.0/include/stddef.h", directory: "/llvm_cmake_gcc")
-!32 = !DIBasicType(tag: DW_TAG_base_type, name: "unsigned int", size: 32, align: 32, encoding: DW_ATE_unsigned)
-!33 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !1, line: 42, scope: !34, entity: !36)
-!34 = !DINamespace(name: "__gnu_cxx", scope: null)
-!35 = !DIFile(filename: "/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/bits/cpp_type_traits.h", directory: "/llvm_cmake_gcc")
-!36 = !DIDerivedType(tag: DW_TAG_typedef, name: "size_t", line: 155, file: !11, scope: !10, baseType: !37)
-!37 = !DIBasicType(tag: DW_TAG_base_type, name: "long unsigned int", size: 64, align: 64, encoding: DW_ATE_unsigned)
-!38 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !1, line: 43, scope: !34, entity: !39)
-!39 = !DIDerivedType(tag: DW_TAG_typedef, name: "ptrdiff_t", line: 156, file: !11, scope: !10, baseType: !40)
-!40 = !DIBasicType(tag: DW_TAG_base_type, name: "long int", size: 64, align: 64, encoding: DW_ATE_signed)
-!41 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !1, line: 55, scope: !10, entity: !6)
-!42 = !{i32 2, !"Dwarf Version", i32 4}
-!43 = !{i32 2, !"Debug Info Version", i32 3}
-!44 = !{!"clang version 3.5.0 (209308)"}
-!45 = !DILocalVariable(name: "offset", line: 3, arg: 1, scope: !13, file: !14, type: !17)
-!46 = !DILocation(line: 3, scope: !13)
-!47 = !DILocalVariable(name: "limit", line: 3, arg: 2, scope: !13, file: !14, type: !17)
-!48 = !DILocalVariable(name: "range", line: 4, arg: 3, scope: !13, file: !14, type: !19)
-!49 = !DILocation(line: 4, scope: !13)
-!50 = !DILocation(line: 5, scope: !51)
-!51 = distinct !DILexicalBlock(line: 5, column: 0, file: !1, scope: !13)
-!52 = !DILocation(line: 5, scope: !53)
-!53 = distinct !DILexicalBlock(line: 5, column: 0, file: !1, scope: !51)
-!54 = !DILocation(line: 6, scope: !51)
-!55 = !DILocation(line: 7, scope: !13)
-!56 = !DILocation(line: 8, scope: !13)
-!57 = !DILocation(line: 9, scope: !13)
-
diff --git a/test/DebugInfo/X86/pr19307.mir b/test/DebugInfo/X86/pr19307.mir
new file mode 100644
index 00000000000..b8380b703a9
--- /dev/null
+++ b/test/DebugInfo/X86/pr19307.mir
@@ -0,0 +1,224 @@
+# RUN: llc -o - %s -start-after=patchable-function -O0 | FileCheck %s
+
+# Generated from the source file pr19307.cc:
+# #include <string>
+# void parse_range(unsigned long long &offset, unsigned long long &limit,
+#                  std::string range) {
+#   if (range.compare(0, 6, "items=") != 0 || range[6] == '-')
+#     offset = 1;
+#   range.erase(0, 6);
+#   limit = 2;
+# }
+# with "clang++ -S -emit-llvm -O0 -g pr19307.cc"
+#
+# Location of "range" string is spilled from %rdx to stack and is
+# addressed via %rbp.
+# CHECK: movq %rdx, {{[-0-9]+}}(%rbp)
+# CHECK-NEXT: [[START_LABEL:.Ltmp[0-9]+]]:
+# This location should be valid until the end of the function.
+#
+# Verify that we have proper range in debug_loc section:
+# CHECK: .Ldebug_loc{{[0-9]+}}:
+# CHECK: DW_OP_breg1
+# CHECK:      .quad [[START_LABEL]]-.Lfunc_begin0
+# CHECK-NEXT: .quad .Lfunc_end0-.Lfunc_begin0
+# CHECK: DW_OP_breg6
+# CHECK: DW_OP_deref
+--- |
+  target triple = "x86_64-unknown-linux-gnu"
+  
+  %"class.std::basic_string" = type { %"struct.std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Alloc_hider" }
+  %"struct.std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Alloc_hider" = type { i8* }
+  
+  @.str = private unnamed_addr constant [7 x i8] c"items=\00", align 1
+  
+  ; Function Attrs: uwtable
+  define void @_Z11parse_rangeRyS_Ss(i64* %offset, i64* %limit, %"class.std::basic_string"* %range) #0 !dbg !34 {
+  entry:
+    %offset.addr = alloca i64*, align 8
+    %limit.addr = alloca i64*, align 8
+    store i64* %offset, i64** %offset.addr, align 8
+    call void @llvm.dbg.declare(metadata i64** %offset.addr, metadata !41, metadata !DIExpression()), !dbg !42
+    store i64* %limit, i64** %limit.addr, align 8
+    call void @llvm.dbg.declare(metadata i64** %limit.addr, metadata !43, metadata !DIExpression()), !dbg !42
+    call void @llvm.dbg.declare(metadata %"class.std::basic_string"* %range, metadata !44, metadata !DIExpression(DW_OP_deref)), !dbg !45
+    %call = call i32 @_ZNKSs7compareEmmPKc(%"class.std::basic_string"* %range, i64 0, i64 6, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str, i32 0, i32 0)), !dbg !46
+    %cmp = icmp ne i32 %call, 0, !dbg !46
+    br i1 %cmp, label %if.then, label %lor.lhs.false, !dbg !46
+  
+  lor.lhs.false:                                    ; preds = %entry
+    %call1 = call i8* @_ZNSsixEm(%"class.std::basic_string"* %range, i64 6), !dbg !48
+    %0 = load i8, i8* %call1, !dbg !48
+    %conv = sext i8 %0 to i32, !dbg !48
+    %cmp2 = icmp eq i32 %conv, 45, !dbg !48
+    br i1 %cmp2, label %if.then, label %if.end, !dbg !48
+  
+  if.then:                                          ; preds = %lor.lhs.false, %entry
+    %1 = load i64*, i64** %offset.addr, align 8, !dbg !50
+    store i64 1, i64* %1, align 8, !dbg !50
+    br label %if.end, !dbg !50
+  
+  if.end:                                           ; preds = %if.then, %lor.lhs.false
+    %call3 = call %"class.std::basic_string"* @_ZNSs5eraseEmm(%"class.std::basic_string"* %range, i64 0, i64 6), !dbg !51
+    %2 = load i64*, i64** %limit.addr, align 8, !dbg !52
+    store i64 2, i64* %2, align 8, !dbg !52
+    ret void, !dbg !53
+  }
+  
+  ; Function Attrs: nounwind readnone speculatable
+  declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+  
+  declare i32 @_ZNKSs7compareEmmPKc(%"class.std::basic_string"*, i64, i64, i8*) #2
+  
+  declare i8* @_ZNSsixEm(%"class.std::basic_string"*, i64) #2
+  
+  declare %"class.std::basic_string"* @_ZNSs5eraseEmm(%"class.std::basic_string"*, i64, i64) #2
+  
+  ; Function Attrs: nounwind
+  declare void @llvm.stackprotector(i8*, i8**) #3
+  
+  attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #1 = { nounwind readnone speculatable }
+  attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #3 = { nounwind }
+  
+  !llvm.dbg.cu = !{!0}
+  !llvm.module.flags = !{!31, !32}
+  !llvm.ident = !{!33}
+  
+  !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 3.5.0 (209308)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3, globals: !2, imports: !11)
+  !1 = !DIFile(filename: "pr19307.cc", directory: "/llvm_cmake_gcc")
+  !2 = !{}
+  !3 = !{!4, !6, !8}
+  !4 = !DICompositeType(tag: DW_TAG_structure_type, file: !5, line: 83, flags: DIFlagFwdDecl, identifier: "_ZTS11__mbstate_t")
+  !5 = !DIFile(filename: "/usr/include/wchar.h", directory: "/llvm_cmake_gcc")
+  !6 = !DICompositeType(tag: DW_TAG_structure_type, name: "lconv", file: !7, line: 54, flags: DIFlagFwdDecl, identifier: "_ZTS5lconv")
+  !7 = !DIFile(filename: "/usr/include/locale.h", directory: "/llvm_cmake_gcc")
+  !8 = !DICompositeType(tag: DW_TAG_class_type, name: "basic_string<char, std::char_traits<char>, std::allocator<char> >", scope: !10, file: !9, line: 1134, flags: DIFlagFwdDecl, identifier: "_ZTSSs")
+  !9 = !DIFile(filename: "/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/bits/basic_string.tcc", directory: "/llvm_cmake_gcc")
+  !10 = !DINamespace(name: "std", scope: null)
+  !11 = !{!12, !15, !18, !22, !27, !30}
+  !12 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !13, entity: !14, file: !1, line: 57)
+  !13 = !DINamespace(name: "__gnu_debug", scope: null)
+  !14 = !DINamespace(name: "__debug", scope: !10)
+  !15 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !10, entity: !16, file: !1, line: 66)
+  !16 = !DIDerivedType(tag: DW_TAG_typedef, name: "mbstate_t", file: !5, line: 106, baseType: !17)
+  !17 = !DIDerivedType(tag: DW_TAG_typedef, name: "__mbstate_t", file: !5, line: 95, baseType: !4)
+  !18 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !10, entity: !19, file: !1, line: 141)
+  !19 = !DIDerivedType(tag: DW_TAG_typedef, name: "wint_t", file: !20, line: 141, baseType: !21)
+  !20 = !DIFile(filename: "/llvm_cmake_gcc/bin/../lib/clang/3.5.0/include/stddef.h", directory: "/llvm_cmake_gcc")
+  !21 = !DIBasicType(name: "unsigned int", size: 32, align: 32, encoding: DW_ATE_unsigned)
+  !22 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !23, entity: !24, file: !1, line: 42)
+  !23 = !DINamespace(name: "__gnu_cxx", scope: null)
+  !24 = !DIDerivedType(tag: DW_TAG_typedef, name: "size_t", scope: !10, file: !25, line: 155, baseType: !26)
+  !25 = !DIFile(filename: "/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/x86_64-linux-gnu/bits/c++config.h", directory: "/llvm_cmake_gcc")
+  !26 = !DIBasicType(name: "long unsigned int", size: 64, align: 64, encoding: DW_ATE_unsigned)
+  !27 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !23, entity: !28, file: !1, line: 43)
+  !28 = !DIDerivedType(tag: DW_TAG_typedef, name: "ptrdiff_t", scope: !10, file: !25, line: 156, baseType: !29)
+  !29 = !DIBasicType(name: "long int", size: 64, align: 64, encoding: DW_ATE_signed)
+  !30 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !10, entity: !6, file: !1, line: 55)
+  !31 = !{i32 2, !"Dwarf Version", i32 4}
+  !32 = !{i32 2, !"Debug Info Version", i32 3}
+  !33 = !{!"clang version 3.5.0 (209308)"}
+  !34 = distinct !DISubprogram(name: "parse_range", linkageName: "_Z11parse_rangeRyS_Ss", scope: !1, file: !1, line: 3, type: !35, isLocal: false, isDefinition: true, scopeLine: 4, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
+  !35 = !DISubroutineType(types: !36)
+  !36 = !{null, !37, !37, !39}
+  !37 = !DIDerivedType(tag: DW_TAG_reference_type, baseType: !38)
+  !38 = !DIBasicType(name: "long long unsigned int", size: 64, align: 64, encoding: DW_ATE_unsigned)
+  !39 = !DIDerivedType(tag: DW_TAG_typedef, name: "string", scope: !10, file: !40, line: 65, baseType: !8)
+  !40 = !DIFile(filename: "/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/bits/stringfwd.h", directory: "/llvm_cmake_gcc")
+  !41 = !DILocalVariable(name: "offset", arg: 1, scope: !34, file: !1, line: 3, type: !37)
+  !42 = !DILocation(line: 3, scope: !34)
+  !43 = !DILocalVariable(name: "limit", arg: 2, scope: !34, file: !1, line: 3, type: !37)
+  !44 = !DILocalVariable(name: "range", arg: 3, scope: !34, file: !1, line: 4, type: !39)
+  !45 = !DILocation(line: 4, scope: !34)
+  !46 = !DILocation(line: 5, scope: !47)
+  !47 = distinct !DILexicalBlock(scope: !34, file: !1, line: 5)
+  !48 = !DILocation(line: 5, scope: !49)
+  !49 = distinct !DILexicalBlock(scope: !47, file: !1, line: 5)
+  !50 = !DILocation(line: 6, scope: !47)
+  !51 = !DILocation(line: 7, scope: !34)
+  !52 = !DILocation(line: 8, scope: !34)
+  !53 = !DILocation(line: 9, scope: !34)
+
+...
+---
+name:            _Z11parse_rangeRyS_Ss
+alignment:       4
+tracksRegLiveness: true
+liveins:         
+  - { reg: '$rdi' }
+  - { reg: '$rsi' }
+  - { reg: '$rdx' }
+frameInfo:       
+  stackSize:       40
+  offsetAdjustment: -32
+  maxAlignment:    8
+  adjustsStack:    true
+  hasCalls:        true
+  maxCallFrameSize: 0
+fixedStack:      
+  - { id: 0, type: spill-slot, offset: -16, size: 8, alignment: 16, stack-id: 0 }
+stack:           
+  - { id: 0, name: offset.addr, offset: -24, size: 8, alignment: 8, stack-id: 0, 
+      debug-info-variable: '!41', debug-info-expression: '!DIExpression()', 
+      debug-info-location: '!42' }
+  - { id: 1, name: limit.addr, offset: -32, size: 8, alignment: 8, stack-id: 0, 
+      debug-info-variable: '!43', debug-info-expression: '!DIExpression()', 
+      debug-info-location: '!42' }
+  - { id: 2, type: spill-slot, offset: -40, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 3, type: spill-slot, offset: -48, size: 8, alignment: 8, stack-id: 0 }
+body:             |
+  bb.0.entry:
+    liveins: $rdi, $rsi, $rdx
+  
+    frame-setup PUSH64r killed $rbp, implicit-def $rsp, implicit $rsp
+    CFI_INSTRUCTION def_cfa_offset 16
+    CFI_INSTRUCTION offset $rbp, -16
+    $rbp = frame-setup MOV64rr $rsp
+    CFI_INSTRUCTION def_cfa_register $rbp
+    $rsp = frame-setup SUB64ri8 $rsp, 32, implicit-def dead $eflags
+    $eax = XOR32rr undef $eax, undef $eax, implicit-def $eflags, implicit-def $rax
+    MOV64mr $rbp, 1, $noreg, -8, $noreg, killed renamable $rdi :: (store 8 into %ir.offset.addr)
+    MOV64mr $rbp, 1, $noreg, -16, $noreg, killed renamable $rsi :: (store 8 into %ir.limit.addr)
+    DBG_VALUE renamable $rdx, 0, !44, !DIExpression(DW_OP_deref), debug-location !45
+    $rdi = MOV64rr $rdx, debug-location !46
+    $rsi = MOV64rr killed $rax, debug-location !46
+    $eax = MOV32ri 6, implicit-def $rax, debug-location !46
+    MOV64mr $rbp, 1, $noreg, -24, $noreg, killed $rdx :: (store 8 into %stack.2)
+    DBG_VALUE $rbp, 0, !44, !DIExpression(DW_OP_constu, 24, DW_OP_minus, DW_OP_deref, DW_OP_deref), debug-location !45
+    $rdx = MOV64rr killed $rax, debug-location !46
+    renamable $rcx = MOV64ri @.str, debug-location !46
+    CALL64pcrel32 @_ZNKSs7compareEmmPKc, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit killed $rsi, implicit killed $rdx, implicit killed $rcx, implicit-def $eax, debug-location !46
+    CMP32ri8 killed renamable $eax, 0, implicit-def $eflags, debug-location !46
+    JNE_1 %bb.2, implicit $eflags, debug-location !46
+  
+  bb.1.lor.lhs.false:
+    DBG_VALUE $rbp, 0, !44, !DIExpression(DW_OP_constu, 24, DW_OP_minus, DW_OP_deref, DW_OP_deref), debug-location !45
+    $rdi = MOV64rm $rbp, 1, $noreg, -24, $noreg :: (load 8 from %stack.2)
+    $esi = MOV32ri 6, implicit-def $rsi, debug-location !48
+    CALL64pcrel32 @_ZNSsixEm, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit killed $rsi, implicit-def $rax, debug-location !48
+    renamable $ecx = MOVSX32rm8 killed renamable $rax, 1, $noreg, 0, $noreg, debug-location !48 :: (load 1 from %ir.call1)
+    CMP32ri8 killed renamable $ecx, 45, implicit-def $eflags, debug-location !48
+    JNE_1 %bb.3, implicit $eflags, debug-location !48
+  
+  bb.2.if.then:
+    DBG_VALUE $rbp, 0, !44, !DIExpression(DW_OP_constu, 24, DW_OP_minus, DW_OP_deref, DW_OP_deref), debug-location !45
+    renamable $rax = MOV64rm $rbp, 1, $noreg, -8, $noreg, debug-location !50 :: (load 8 from %ir.offset.addr)
+    MOV64mi32 killed renamable $rax, 1, $noreg, 0, $noreg, 1, debug-location !50 :: (store 8 into %ir.1)
+  
+  bb.3.if.end:
+    DBG_VALUE $rbp, 0, !44, !DIExpression(DW_OP_constu, 24, DW_OP_minus, DW_OP_deref, DW_OP_deref), debug-location !45
+    $esi = XOR32rr undef $esi, undef $esi, implicit-def $eflags, implicit-def $rsi
+    $rdi = MOV64rm $rbp, 1, $noreg, -24, $noreg :: (load 8 from %stack.2)
+    $edx = MOV32ri 6, implicit-def $rdx, debug-location !51
+    CALL64pcrel32 @_ZNSs5eraseEmm, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit killed $rsi, implicit killed $rdx, implicit-def $rax, debug-location !51
+    renamable $rdx = MOV64rm $rbp, 1, $noreg, -16, $noreg, debug-location !52 :: (load 8 from %ir.limit.addr)
+    MOV64mi32 killed renamable $rdx, 1, $noreg, 0, $noreg, 2, debug-location !52 :: (store 8 into %ir.2)
+    MOV64mr $rbp, 1, $noreg, -32, $noreg, killed $rax :: (store 8 into %stack.3)
+    $rsp = frame-destroy ADD64ri8 $rsp, 32, implicit-def dead $eflags, debug-location !53
+    $rbp = frame-destroy POP64r implicit-def $rsp, implicit $rsp, debug-location !53
+    CFI_INSTRUCTION def_cfa $rsp, 8, debug-location !53
+    RETQ debug-location !53
+
+...
-- 
GitLab


From e64378786d0eaf7190d5c9b088c7be3fa2be828b Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Fri, 2 Nov 2018 02:43:55 +0000
Subject: [PATCH 0905/1116] AMDGPU: Fix assertion with bitcast from i64
 constant to v4i16

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345922 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/AMDGPUISelLowering.cpp      |  7 ++--
 .../AMDGPU/bitcast-constant-to-vector.ll      | 38 +++++++++++++++++++
 2 files changed, 42 insertions(+), 3 deletions(-)
 create mode 100644 test/CodeGen/AMDGPU/bitcast-constant-to-vector.ll

diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 9823dd7709d..ad0a9e388af 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3817,9 +3817,10 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
       if (Src.getValueType() == MVT::i64) {
         SDLoc SL(N);
         uint64_t CVal = C->getZExtValue();
-        return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT,
-                           DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
-                           DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
+        SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
+                                 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
+                                 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
       }
     }
 
diff --git a/test/CodeGen/AMDGPU/bitcast-constant-to-vector.ll b/test/CodeGen/AMDGPU/bitcast-constant-to-vector.ll
new file mode 100644
index 00000000000..ea5f01fbda0
--- /dev/null
+++ b/test/CodeGen/AMDGPU/bitcast-constant-to-vector.ll
@@ -0,0 +1,38 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}cast_constant_i64_to_build_vector_v4i16:
+; GCN: global_store_dwordx2
+; GCN: global_store_dword v
+; GCN: global_store_short
+define amdgpu_kernel void @cast_constant_i64_to_build_vector_v4i16(i8 addrspace(1)* nocapture %data) {
+entry:
+  store i8 72, i8 addrspace(1)* %data, align 1
+  %arrayidx1 = getelementptr inbounds i8, i8 addrspace(1)* %data, i64 1
+  store i8 101, i8 addrspace(1)* %arrayidx1, align 1
+  %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %data, i64 2
+  store i8 108, i8 addrspace(1)* %arrayidx2, align 1
+  %arrayidx3 = getelementptr inbounds i8, i8 addrspace(1)* %data, i64 3
+  store i8 108, i8 addrspace(1)* %arrayidx3, align 1
+  %arrayidx4 = getelementptr inbounds i8, i8 addrspace(1)* %data, i64 4
+  store i8 111, i8 addrspace(1)* %arrayidx4, align 1
+  %arrayidx5 = getelementptr inbounds i8, i8 addrspace(1)* %data, i64 5
+  store i8 44, i8 addrspace(1)* %arrayidx5, align 1
+  %arrayidx6 = getelementptr inbounds i8, i8 addrspace(1)* %data, i64 6
+  store i8 32, i8 addrspace(1)* %arrayidx6, align 1
+  %arrayidx7 = getelementptr inbounds i8, i8 addrspace(1)* %data, i64 7
+  store i8 87, i8 addrspace(1)* %arrayidx7, align 1
+  %arrayidx8 = getelementptr inbounds i8, i8 addrspace(1)* %data, i64 8
+  store i8 111, i8 addrspace(1)* %arrayidx8, align 1
+  %arrayidx9 = getelementptr inbounds i8, i8 addrspace(1)* %data, i64 9
+  store i8 114, i8 addrspace(1)* %arrayidx9, align 1
+  %arrayidx10 = getelementptr inbounds i8, i8 addrspace(1)* %data, i64 10
+  store i8 108, i8 addrspace(1)* %arrayidx10, align 1
+  %arrayidx11 = getelementptr inbounds i8, i8 addrspace(1)* %data, i64 11
+  store i8 100, i8 addrspace(1)* %arrayidx11, align 1
+  %arrayidx12 = getelementptr inbounds i8, i8 addrspace(1)* %data, i64 12
+  store i8 33, i8 addrspace(1)* %arrayidx12, align 1
+  %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %data, i64 13
+  store i8 72, i8 addrspace(1)* %arrayidx13, align 1
+  ret void
+}
+
-- 
GitLab


From 8de3dff19239ad6c4eaf77ed6e4e6e4dd57d5b30 Mon Sep 17 00:00:00 2001
From: Dean Michael Berris <dberris@google.com>
Date: Fri, 2 Nov 2018 08:07:38 +0000
Subject: [PATCH 0906/1116] [XRay] Update delta computations in runtime

Summary:
Fix some issues discovered from mostly manual inspection of outputs from
the `llvm-xray fdr-dump` tool.

It turns out we haven't been writing the deltas properly, and have been
writing down zeros for deltas of some records. This change fixes this
oversight born by the recent refactoring.

Reviewers: mboerger

Subscribers: llvm-commits, hiraditya

Differential Revision: https://reviews.llvm.org/D54022

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345954 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/XRay/RecordPrinter.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/XRay/RecordPrinter.cpp b/lib/XRay/RecordPrinter.cpp
index 81d77f67cc1..0d5ee2de83d 100644
--- a/lib/XRay/RecordPrinter.cpp
+++ b/lib/XRay/RecordPrinter.cpp
@@ -66,19 +66,19 @@ Error RecordPrinter::visit(FunctionRecord &R) {
   // FIXME: Support symbolization here?
   switch (R.recordType()) {
   case RecordTypes::ENTER:
-    OS << formatv("<Function Enter: #{0} delta = +{0}>", R.functionId(),
+    OS << formatv("<Function Enter: #{0} delta = +{1}>", R.functionId(),
                   R.delta());
     break;
   case RecordTypes::ENTER_ARG:
-    OS << formatv("<Function Enter With Arg: #{0} delta = +{0}>",
+    OS << formatv("<Function Enter With Arg: #{0} delta = +{1}>",
                   R.functionId(), R.delta());
     break;
   case RecordTypes::EXIT:
-    OS << formatv("<Function Exit: #{0} delta = +{0}>", R.functionId(),
+    OS << formatv("<Function Exit: #{0} delta = +{1}>", R.functionId(),
                   R.delta());
     break;
   case RecordTypes::TAIL_EXIT:
-    OS << formatv("<Function Tail Exit: #{0} delta = +{0}>", R.functionId(),
+    OS << formatv("<Function Tail Exit: #{0} delta = +{1}>", R.functionId(),
                   R.delta());
     break;
   }
-- 
GitLab


From b562d40af4a9fc823b9b5d712471b1b97dcb05da Mon Sep 17 00:00:00 2001
From: Dean Michael Berris <dberris@google.com>
Date: Fri, 2 Nov 2018 08:35:46 +0000
Subject: [PATCH 0907/1116] [XRay] Fix tests with updated fdr-dump

Follow-up to D54022.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345955 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/tools/llvm-xray/X86/fdr-dump-arg1-version-3.txt | 12 ++++++------
 test/tools/llvm-xray/X86/fdr-dump-arg1.txt           |  4 ++--
 unittests/XRay/FDRRecordPrinterTest.cpp              |  8 ++++----
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/test/tools/llvm-xray/X86/fdr-dump-arg1-version-3.txt b/test/tools/llvm-xray/X86/fdr-dump-arg1-version-3.txt
index 35303016af9..ccb8a1b0538 100644
--- a/test/tools/llvm-xray/X86/fdr-dump-arg1-version-3.txt
+++ b/test/tools/llvm-xray/X86/fdr-dump-arg1-version-3.txt
@@ -12,14 +12,14 @@
 ; CHECK-NEXT:  <CPU: id = 6, tsc = 2034042117104344>
 ; CHECK-NEXT:  <TSC Wrap: base = 2034042117104344>
 ; CHECK-EMPTY:
-; CHECK-NEXT: -  <Function Enter: #3 delta = +3>
-; CHECK-NEXT: -  <Function Exit: #3 delta = +3>
-; CHECK-NEXT: -  <Function Enter: #2 delta = +2>
-; CHECK-NEXT: -  <Function Exit: #2 delta = +2>
+; CHECK-NEXT: -  <Function Enter: #3 delta = +0>
+; CHECK-NEXT: -  <Function Exit: #3 delta = +94744>
+; CHECK-NEXT: -  <Function Enter: #2 delta = +1028487290>
+; CHECK-NEXT: -  <Function Exit: #2 delta = +75822>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: Metadata: <TSC Wrap: base = 2034049739853430>
 ; CHECK-EMPTY:
-; CHECK-NEXT: -  <Function Enter: #1 delta = +1>
+; CHECK-NEXT: -  <Function Enter: #1 delta = +0>
 ; CHECK-NEXT:  : <Call Argument: data = 67 (hex = 0x43)>
-; CHECK-NEXT: -  <Function Exit: #1 delta = +1>
+; CHECK-NEXT: -  <Function Exit: #1 delta = +24724>
 
diff --git a/test/tools/llvm-xray/X86/fdr-dump-arg1.txt b/test/tools/llvm-xray/X86/fdr-dump-arg1.txt
index df39f6ddd5f..8fb381a170c 100644
--- a/test/tools/llvm-xray/X86/fdr-dump-arg1.txt
+++ b/test/tools/llvm-xray/X86/fdr-dump-arg1.txt
@@ -9,8 +9,8 @@
 ; CHECK-NEXT:  <CPU: id = 49, tsc = 18828908666540172>
 ; CHECK-NEXT:  <TSC Wrap: base = 18828908666540172>
 ; CHECK-EMPTY:
-; CHECK-NEXT: -  <Function Enter: #1 delta = +1>
+; CHECK-NEXT: -  <Function Enter: #1 delta = +3146>
 ; CHECK-NEXT:  : <Call Argument: data = 1 (hex = 0x1)>
-; CHECK-NEXT: -  <Function Exit: #1 delta = +1>
+; CHECK-NEXT: -  <Function Exit: #1 delta = +52286>
 ; CHECK-NEXT:  *** <End of Buffer>
 
diff --git a/unittests/XRay/FDRRecordPrinterTest.cpp b/unittests/XRay/FDRRecordPrinterTest.cpp
index 321892e7240..a0ec3f22bf5 100644
--- a/unittests/XRay/FDRRecordPrinterTest.cpp
+++ b/unittests/XRay/FDRRecordPrinterTest.cpp
@@ -132,7 +132,7 @@ TEST(FDRRecordPrinterTest, WriteFunctionRecordEnter) {
   FunctionRecord R(RecordTypes::ENTER, 1, 2);
   ASSERT_FALSE(errorToBool(R.apply(P)));
   OS.flush();
-  EXPECT_THAT(Data, Eq("<Function Enter: #1 delta = +1>"));
+  EXPECT_THAT(Data, Eq("<Function Enter: #1 delta = +2>"));
 }
 
 TEST(FDRRecordPrinterTest, WriteFunctionRecordExit) {
@@ -142,7 +142,7 @@ TEST(FDRRecordPrinterTest, WriteFunctionRecordExit) {
   FunctionRecord R(RecordTypes::EXIT, 1, 2);
   ASSERT_FALSE(errorToBool(R.apply(P)));
   OS.flush();
-  EXPECT_THAT(Data, Eq("<Function Exit: #1 delta = +1>"));
+  EXPECT_THAT(Data, Eq("<Function Exit: #1 delta = +2>"));
 }
 
 TEST(FDRRecordPrinterTest, WriteFunctionRecordTailExit) {
@@ -152,7 +152,7 @@ TEST(FDRRecordPrinterTest, WriteFunctionRecordTailExit) {
   FunctionRecord R(RecordTypes::TAIL_EXIT, 1, 2);
   ASSERT_FALSE(errorToBool(R.apply(P)));
   OS.flush();
-  EXPECT_THAT(Data, Eq("<Function Tail Exit: #1 delta = +1>"));
+  EXPECT_THAT(Data, Eq("<Function Tail Exit: #1 delta = +2>"));
 }
 
 TEST(FDRRecordPrinterTest, WriteFunctionRecordEnterArg) {
@@ -162,7 +162,7 @@ TEST(FDRRecordPrinterTest, WriteFunctionRecordEnterArg) {
   FunctionRecord R(RecordTypes::ENTER_ARG, 1, 2);
   ASSERT_FALSE(errorToBool(R.apply(P)));
   OS.flush();
-  EXPECT_THAT(Data, Eq("<Function Enter With Arg: #1 delta = +1>"));
+  EXPECT_THAT(Data, Eq("<Function Enter With Arg: #1 delta = +2>"));
 }
 
 } // namespace
-- 
GitLab


From 6d0de6568285912510ce6dd5c5f19699383c630c Mon Sep 17 00:00:00 2001
From: Ayal Zaks <ayal.zaks@intel.com>
Date: Fri, 2 Nov 2018 09:16:12 +0000
Subject: [PATCH 0908/1116] [LV] Avoid vectorizing loops under opt for size
 that involve SCEV checks

Fix PR39417, PR39497

The loop vectorizer may generate runtime SCEV checks for overflow and stride==1
cases, leading to execution of original scalar loop. The latter is forbidden
when optimizing for size. An assert introduced in r344743 triggered the above
PR's showing it does happen. This patch fixes this behavior by preventing
vectorization in such cases.

Differential Revision: https://reviews.llvm.org/D53612


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345959 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp    | 26 +++++++-
 test/Transforms/LoopVectorize/X86/optsize.ll  | 60 +++++++++++++++++++
 .../pr39417-optsize-scevchecks.ll             | 54 +++++++++++++++++
 3 files changed, 139 insertions(+), 1 deletion(-)
 create mode 100644 test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 23d4a6b2166..c9c70b5c536 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2557,7 +2557,8 @@ void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
     if (C->isZero())
       return;
 
-  assert(!Cost->foldTailByMasking() && "Cannot check stride when folding tail");
+  assert(!Cost->foldTailByMasking() &&
+         "Cannot SCEV check stride or overflow when folding tail");
   // Create a new block containing the stride check.
   BB->setName("vector.scevcheck");
   auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
@@ -4637,6 +4638,29 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
     return None;
   }
 
+  if (!PSE.getUnionPredicate().getPredicates().empty()) {
+    ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
+              << "runtime SCEV checks needed. Enable vectorization of this "
+                 "loop with '#pragma clang loop vectorize(enable)' when "
+                 "compiling with -Os/-Oz");
+    LLVM_DEBUG(
+        dbgs()
+        << "LV: Aborting. Runtime SCEV check is required with -Os/-Oz.\n");
+    return None;
+  }
+
+  // FIXME: Avoid specializing for stride==1 instead of bailing out.
+  if (!Legal->getLAI()->getSymbolicStrides().empty()) {
+    ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
+              << "runtime stride == 1 checks needed. Enable vectorization of "
+                 "this loop with '#pragma clang loop vectorize(enable)' when "
+                 "compiling with -Os/-Oz");
+    LLVM_DEBUG(
+        dbgs()
+        << "LV: Aborting. Runtime stride check is required with -Os/-Oz.\n");
+    return None;
+  }
+
   // If we optimize the program for size, avoid creating the tail loop.
   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
 
diff --git a/test/Transforms/LoopVectorize/X86/optsize.ll b/test/Transforms/LoopVectorize/X86/optsize.ll
index 508823475ea..9fa65534f32 100644
--- a/test/Transforms/LoopVectorize/X86/optsize.ll
+++ b/test/Transforms/LoopVectorize/X86/optsize.ll
@@ -3,6 +3,7 @@
 ; will produce a tail loop with the optimize for size or the minimize size
 ; attributes. This is a target-dependent version of the test.
 ; RUN: opt < %s -loop-vectorize -force-vector-width=64 -S -mtriple=x86_64-unknown-linux -mcpu=skx | FileCheck %s
+; RUN: opt < %s -loop-vectorize -S -mtriple=x86_64-unknown-linux -mcpu=skx | FileCheck %s --check-prefix AUTOVF
 
 target datalayout = "E-m:e-p:32:32-i64:32-f64:32:64-a:0:32-n32-S128"
 
@@ -136,3 +137,62 @@ for.end:                                          ; preds = %for.body
 
 attributes #1 = { minsize }
 
+
+; We can't vectorize this one because we version for stride==1; even having TC
+; a multiple of VF.
+; CHECK-LABEL: @scev4stride1
+; CHECK-NOT: vector.scevcheck
+; CHECK-NOT: vector.body:
+; CHECK-LABEL: for.body:
+; AUTOVF-LABEL: @scev4stride1
+; AUTOVF-NOT: vector.scevcheck
+; AUTOVF-NOT: vector.body:
+; AUTOVF-LABEL: for.body:
+define void @scev4stride1(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %k) #2 {
+for.body.preheader:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %mul = mul nsw i32 %i.07, %k
+  %arrayidx = getelementptr inbounds i32, i32* %b, i32 %mul
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %a, i32 %i.07
+  store i32 %0, i32* %arrayidx1, align 4
+  %inc = add nuw nsw i32 %i.07, 1
+  %exitcond = icmp eq i32 %inc, 256
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  ret void
+}
+
+attributes #2 = { optsize }
+
+
+; PR39497
+; We can't vectorize this one because we version for overflow check and tiny
+; trip count leads to opt-for-size (which otherwise could fold the tail by
+; masking).
+; CHECK-LABEL: @main
+; CHECK-NOT: vector.scevcheck
+; CHECK-NOT: vector.body:
+; CHECK-LABEL: for.cond:
+; AUTOVF-LABEL: @main
+; AUTOVF-NOT: vector.scevcheck
+; AUTOVF-NOT: vector.body:
+; AUTOVF-LABEL: for.cond:
+define i32 @main() local_unnamed_addr {
+while.cond:
+  br label %for.cond
+
+for.cond:
+  %d.0 = phi i32 [ 0, %while.cond ], [ %add, %for.cond ]
+  %conv = and i32 %d.0, 65535
+  %cmp = icmp ult i32 %conv, 4
+  %add = add nuw nsw i32 %conv, 1
+  br i1 %cmp, label %for.cond, label %while.cond.loopexit
+
+while.cond.loopexit:
+  ret i32 0
+}
diff --git a/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll b/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll
new file mode 100644
index 00000000000..6032fb18a38
--- /dev/null
+++ b/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll
@@ -0,0 +1,54 @@
+; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; PR39417
+; Check that the need for overflow check prevents vectorizing a loop with tiny
+; trip count (which implies opt for size).
+; CHECK-LABEL: @func_34
+; CHECK-NOT: vector.scevcheck
+; CHECK-NOT: vector.body:
+; CHECK-LABEL: bb67:
+define void @func_34() {
+bb1:
+  br label %bb67
+
+bb67:
+  %storemerge2 = phi i32 [ 0, %bb1 ], [ %_tmp2300, %bb67 ]
+  %sext = shl i32 %storemerge2, 16
+  %_tmp2299 = ashr exact i32 %sext, 16
+  %_tmp2300 = add nsw i32 %_tmp2299, 1
+  %_tmp2310 = trunc i32 %_tmp2300 to i16
+  %_tmp2312 = icmp slt i16 %_tmp2310, 3
+  br i1 %_tmp2312, label %bb67, label %bb68
+
+bb68:
+  ret void
+}
+
+; Check that the need for stride==1 check prevents vectorizing a loop under opt
+; for size.
+; CHECK-LABEL: @scev4stride1
+; CHECK-NOT: vector.scevcheck
+; CHECK-NOT: vector.body:
+; CHECK-LABEL: for.body:
+define void @scev4stride1(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %k) #0 {
+for.body.preheader:
+  br label %for.body
+
+for.body:
+  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %mul = mul nsw i32 %i.07, %k
+  %arrayidx = getelementptr inbounds i32, i32* %b, i32 %mul
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %a, i32 %i.07
+  store i32 %0, i32* %arrayidx1, align 4
+  %inc = add nuw nsw i32 %i.07, 1
+  %exitcond = icmp eq i32 %inc, 1024
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  ret void
+}
+
+attributes #0 = { optsize }
-- 
GitLab


From 180fb865fc1d1fa99db2516152cf599776e9c4f8 Mon Sep 17 00:00:00 2001
From: Alex Denisov <1101.debian@gmail.com>
Date: Fri, 2 Nov 2018 09:57:24 +0000
Subject: [PATCH 0909/1116] Fix a typo in a function name

Declaration and definition have slightly different names with a typo in the
declaration, which leads to a link error.
See the following bug report for more details: https://bugs.llvm.org/show_bug.cgi?id=39523


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345960 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm-c/ExecutionEngine.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/llvm-c/ExecutionEngine.h b/include/llvm-c/ExecutionEngine.h
index 49ae6fee45f..e8ebef9ab15 100644
--- a/include/llvm-c/ExecutionEngine.h
+++ b/include/llvm-c/ExecutionEngine.h
@@ -186,7 +186,7 @@ void LLVMDisposeMCJITMemoryManager(LLVMMCJITMemoryManagerRef MM);
 
 LLVMJITEventListenerRef LLVMCreateGDBRegistrationListener(void);
 LLVMJITEventListenerRef LLVMCreateIntelJITEventListener(void);
-LLVMJITEventListenerRef LLVMCreateOprofileJITEventListener(void);
+LLVMJITEventListenerRef LLVMCreateOProfileJITEventListener(void);
 LLVMJITEventListenerRef LLVMCreatePerfJITEventListener(void);
 
 /**
-- 
GitLab


From 70c62a14e09f0ade0e2d0a319855f43f8346d7ed Mon Sep 17 00:00:00 2001
From: Neil Henning <neil.henning@amd.com>
Date: Fri, 2 Nov 2018 10:24:57 +0000
Subject: [PATCH 0910/1116] [AMDGPU] UBSan bug fix for r345710

UBSan detected an error in our ISelLowering that is exposed only when
you have a dmask == 0x1. Fix this by adding in an explicit check to
ensure we don't do the UBSan detected shl << 32.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345962 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/SIISelLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 66eb9bbb84c..55b1a872484 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -8799,7 +8799,7 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
 
     // Set which texture component corresponds to the lane.
     unsigned Comp;
-    for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
+    for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
       Comp = countTrailingZeros(Dmask);
       Dmask &= ~(1 << Comp);
     }
-- 
GitLab


From 44ee0056851e1e133e53d05269fe9051f92c7b56 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 2 Nov 2018 11:06:18 +0000
Subject: [PATCH 0911/1116] [DAGCombiner] Remove
 reduceBuildVecConvertToConvertBuildVec and rely on the vectorizers instead
 (PR35732)

reduceBuildVecConvertToConvertBuildVec vectorizes int2float in the DAGCombiner, which means that even if the LV/SLP has decided to keep scalar code using the cost models, this will override this.

While there are cases where vectorization is necessary in the DAG (mainly due to legalization artefacts), I don't think this is the case here, we should assume that the vectorizers know what they are doing.

Differential Revision: https://reviews.llvm.org/D53712

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345964 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp      |  75 ---------
 test/CodeGen/ARM/vdup.ll                      |  37 ++---
 test/CodeGen/Mips/cconv/vector.ll             | 151 +++++++++++++-----
 test/CodeGen/X86/2009-02-26-MachineLICMBug.ll |  22 +--
 test/CodeGen/X86/cvtv2f32.ll                  |  31 ++--
 5 files changed, 155 insertions(+), 161 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 03145c5ce5a..8c2f9e8d1f4 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -464,7 +464,6 @@ namespace {
     SDValue TransformFPLoadStorePair(SDNode *N);
     SDValue convertBuildVecZextToZext(SDNode *N);
     SDValue reduceBuildVecExtToExtBuildVec(SDNode *N);
-    SDValue reduceBuildVecConvertToConvertBuildVec(SDNode *N);
     SDValue reduceBuildVecToShuffle(SDNode *N);
     SDValue createBuildVecShuffle(const SDLoc &DL, SDNode *N,
                                   ArrayRef<int> VectorMask, SDValue VecIn1,
@@ -15854,77 +15853,6 @@ SDValue DAGCombiner::reduceBuildVecExtToExtBuildVec(SDNode *N) {
   return DAG.getBitcast(VT, BV);
 }
 
-SDValue DAGCombiner::reduceBuildVecConvertToConvertBuildVec(SDNode *N) {
-  EVT VT = N->getValueType(0);
-
-  unsigned NumInScalars = N->getNumOperands();
-  SDLoc DL(N);
-
-  EVT SrcVT = MVT::Other;
-  unsigned Opcode = ISD::DELETED_NODE;
-  unsigned NumDefs = 0;
-
-  for (unsigned i = 0; i != NumInScalars; ++i) {
-    SDValue In = N->getOperand(i);
-    unsigned Opc = In.getOpcode();
-
-    if (Opc == ISD::UNDEF)
-      continue;
-
-    // If all scalar values are floats and converted from integers.
-    if (Opcode == ISD::DELETED_NODE &&
-        (Opc == ISD::UINT_TO_FP || Opc == ISD::SINT_TO_FP)) {
-      Opcode = Opc;
-    }
-
-    if (Opc != Opcode)
-      return SDValue();
-
-    EVT InVT = In.getOperand(0).getValueType();
-
-    // If all scalar values are typed differently, bail out. It's chosen to
-    // simplify BUILD_VECTOR of integer types.
-    if (SrcVT == MVT::Other)
-      SrcVT = InVT;
-    if (SrcVT != InVT)
-      return SDValue();
-    NumDefs++;
-  }
-
-  // If the vector has just one element defined, it's not worth to fold it into
-  // a vectorized one.
-  if (NumDefs < 2)
-    return SDValue();
-
-  assert((Opcode == ISD::UINT_TO_FP || Opcode == ISD::SINT_TO_FP)
-         && "Should only handle conversion from integer to float.");
-  assert(SrcVT != MVT::Other && "Cannot determine source type!");
-
-  EVT NVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumInScalars);
-
-  if (!TLI.isOperationLegalOrCustom(Opcode, NVT))
-    return SDValue();
-
-  // Just because the floating-point vector type is legal does not necessarily
-  // mean that the corresponding integer vector type is.
-  if (!isTypeLegal(NVT))
-    return SDValue();
-
-  SmallVector<SDValue, 8> Opnds;
-  for (unsigned i = 0; i != NumInScalars; ++i) {
-    SDValue In = N->getOperand(i);
-
-    if (In.isUndef())
-      Opnds.push_back(DAG.getUNDEF(SrcVT));
-    else
-      Opnds.push_back(In.getOperand(0));
-  }
-  SDValue BV = DAG.getBuildVector(NVT, DL, Opnds);
-  AddToWorklist(BV.getNode());
-
-  return DAG.getNode(Opcode, DL, VT, BV);
-}
-
 SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N,
                                            ArrayRef<int> VectorMask,
                                            SDValue VecIn1, SDValue VecIn2,
@@ -16371,9 +16299,6 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
   if (SDValue V = reduceBuildVecExtToExtBuildVec(N))
     return V;
 
-  if (SDValue V = reduceBuildVecConvertToConvertBuildVec(N))
-    return V;
-
   if (SDValue V = reduceBuildVecToShuffle(N))
     return V;
 
diff --git a/test/CodeGen/ARM/vdup.ll b/test/CodeGen/ARM/vdup.ll
index c16a2a9e3c0..5127dab2656 100644
--- a/test/CodeGen/ARM/vdup.ll
+++ b/test/CodeGen/ARM/vdup.ll
@@ -488,11 +488,12 @@ define <2 x float> @check_spr_splat2(<2 x float> %p, i16 %q) {
 ; CHECK-LABEL: check_spr_splat2:
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    lsl r2, r2, #16
-; CHECK-NEXT:    vmov d17, r0, r1
+; CHECK-NEXT:    vmov d16, r0, r1
 ; CHECK-NEXT:    asr r2, r2, #16
-; CHECK-NEXT:    vdup.32 d16, r2
-; CHECK-NEXT:    vcvt.f32.s32 d16, d16
-; CHECK-NEXT:    vsub.f32 d16, d16, d17
+; CHECK-NEXT:    vmov s0, r2
+; CHECK-NEXT:    vcvt.f32.s32 s0, s0
+; CHECK-NEXT:    vdup.32 d17, d0[0]
+; CHECK-NEXT:    vsub.f32 d16, d17, d16
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    mov pc, lr
   %conv = sitofp i16 %q to float
@@ -505,13 +506,13 @@ define <2 x float> @check_spr_splat2(<2 x float> %p, i16 %q) {
 define <4 x float> @check_spr_splat4(<4 x float> %p, i16 %q) {
 ; CHECK-LABEL: check_spr_splat4:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    mov r12, sp
-; CHECK-NEXT:    vmov d19, r2, r3
-; CHECK-NEXT:    vld1.16 {d16[]}, [r12:16]
-; CHECK-NEXT:    vmov d18, r0, r1
-; CHECK-NEXT:    vmovl.s16 q8, d16
-; CHECK-NEXT:    vcvt.f32.s32 q8, q8
-; CHECK-NEXT:    vsub.f32 q8, q8, q9
+; CHECK-NEXT:    ldrsh r12, [sp]
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vmov s0, r12
+; CHECK-NEXT:    vcvt.f32.s32 s0, s0
+; CHECK-NEXT:    vdup.32 q9, d0[0]
+; CHECK-NEXT:    vsub.f32 q8, q9, q8
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    vmov r2, r3, d17
 ; CHECK-NEXT:    mov pc, lr
@@ -525,13 +526,13 @@ define <4 x float> @check_spr_splat4(<4 x float> %p, i16 %q) {
 define <4 x float> @check_spr_splat4_lane1(<4 x float> %p, i16 %q) {
 ; CHECK-LABEL: check_spr_splat4_lane1:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    mov r12, sp
-; CHECK-NEXT:    vmov d19, r2, r3
-; CHECK-NEXT:    vld1.16 {d16[]}, [r12:16]
-; CHECK-NEXT:    vmov d18, r0, r1
-; CHECK-NEXT:    vmovl.s16 q8, d16
-; CHECK-NEXT:    vcvt.f32.s32 q8, q8
-; CHECK-NEXT:    vsub.f32 q8, q8, q9
+; CHECK-NEXT:    ldrsh r12, [sp]
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vmov s0, r12
+; CHECK-NEXT:    vcvt.f32.s32 s0, s0
+; CHECK-NEXT:    vdup.32 q9, d0[0]
+; CHECK-NEXT:    vsub.f32 q8, q9, q8
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    vmov r2, r3, d17
 ; CHECK-NEXT:    mov pc, lr
diff --git a/test/CodeGen/Mips/cconv/vector.ll b/test/CodeGen/Mips/cconv/vector.ll
index d6e260786d1..9a55285feae 100644
--- a/test/CodeGen/Mips/cconv/vector.ll
+++ b/test/CodeGen/Mips/cconv/vector.ll
@@ -6181,14 +6181,15 @@ define float @mixed_i8(<2 x float> %a, i8 %b, <2 x float> %c) {
 ; MIPS32R5-NEXT:    addiu $1, $zero, -16
 ; MIPS32R5-NEXT:    and $sp, $sp, $1
 ; MIPS32R5-NEXT:    andi $1, $6, 255
-; MIPS32R5-NEXT:    sw $1, 36($sp)
-; MIPS32R5-NEXT:    sw $1, 32($sp)
+; MIPS32R5-NEXT:    mtc1 $1, $f0
+; MIPS32R5-NEXT:    cvt.s.w $f0, $f0
+; MIPS32R5-NEXT:    swc1 $f0, 36($sp)
+; MIPS32R5-NEXT:    swc1 $f0, 32($sp)
 ; MIPS32R5-NEXT:    sw $5, 4($sp)
 ; MIPS32R5-NEXT:    sw $4, 0($sp)
-; MIPS32R5-NEXT:    ld.w $w0, 32($sp)
-; MIPS32R5-NEXT:    ffint_s.w $w0, $w0
-; MIPS32R5-NEXT:    ld.w $w1, 0($sp)
-; MIPS32R5-NEXT:    fadd.w $w0, $w0, $w1
+; MIPS32R5-NEXT:    ld.w $w0, 0($sp)
+; MIPS32R5-NEXT:    ld.w $w1, 32($sp)
+; MIPS32R5-NEXT:    fadd.w $w0, $w1, $w0
 ; MIPS32R5-NEXT:    lw $1, 84($fp)
 ; MIPS32R5-NEXT:    sw $1, 20($sp)
 ; MIPS32R5-NEXT:    lw $1, 80($fp)
@@ -6209,13 +6210,14 @@ define float @mixed_i8(<2 x float> %a, i8 %b, <2 x float> %c) {
 ; MIPS64R5-NEXT:    .cfi_def_cfa_offset 48
 ; MIPS64R5-NEXT:    sll $1, $5, 0
 ; MIPS64R5-NEXT:    andi $1, $1, 255
-; MIPS64R5-NEXT:    sw $1, 36($sp)
-; MIPS64R5-NEXT:    sw $1, 32($sp)
+; MIPS64R5-NEXT:    mtc1 $1, $f0
+; MIPS64R5-NEXT:    cvt.s.w $f0, $f0
+; MIPS64R5-NEXT:    swc1 $f0, 36($sp)
+; MIPS64R5-NEXT:    swc1 $f0, 32($sp)
 ; MIPS64R5-NEXT:    sd $4, 0($sp)
-; MIPS64R5-NEXT:    ld.w $w0, 32($sp)
-; MIPS64R5-NEXT:    ffint_s.w $w0, $w0
-; MIPS64R5-NEXT:    ld.w $w1, 0($sp)
-; MIPS64R5-NEXT:    fadd.w $w0, $w0, $w1
+; MIPS64R5-NEXT:    ld.w $w0, 0($sp)
+; MIPS64R5-NEXT:    ld.w $w1, 32($sp)
+; MIPS64R5-NEXT:    fadd.w $w0, $w1, $w0
 ; MIPS64R5-NEXT:    sd $6, 16($sp)
 ; MIPS64R5-NEXT:    ld.w $w1, 16($sp)
 ; MIPS64R5-NEXT:    fadd.w $w0, $w0, $w1
@@ -6337,36 +6339,59 @@ define <4 x float> @mixed_32(<4 x float> %a, i32 %b) {
 ; MIPS64EB-NEXT:    jr $ra
 ; MIPS64EB-NEXT:    nop
 ;
-; MIPS32R5-LABEL: mixed_32:
-; MIPS32R5:       # %bb.0: # %entry
-; MIPS32R5-NEXT:    ldi.b $w0, 0
-; MIPS32R5-NEXT:    insert.w $w0[0], $6
-; MIPS32R5-NEXT:    insert.w $w0[1], $7
-; MIPS32R5-NEXT:    lw $1, 16($sp)
-; MIPS32R5-NEXT:    insert.w $w0[2], $1
-; MIPS32R5-NEXT:    lw $1, 20($sp)
-; MIPS32R5-NEXT:    insert.w $w0[3], $1
-; MIPS32R5-NEXT:    lw $1, 24($sp)
-; MIPS32R5-NEXT:    fill.w $w1, $1
-; MIPS32R5-NEXT:    ffint_u.w $w1, $w1
-; MIPS32R5-NEXT:    fadd.w $w0, $w1, $w0
-; MIPS32R5-NEXT:    st.w $w0, 0($4)
-; MIPS32R5-NEXT:    jr $ra
-; MIPS32R5-NEXT:    nop
+; MIPS32R5EB-LABEL: mixed_32:
+; MIPS32R5EB:       # %bb.0: # %entry
+; MIPS32R5EB-NEXT:    addiu $sp, $sp, -8
+; MIPS32R5EB-NEXT:    .cfi_def_cfa_offset 8
+; MIPS32R5EB-NEXT:    lui $1, 17200
+; MIPS32R5EB-NEXT:    sw $1, 0($sp)
+; MIPS32R5EB-NEXT:    lw $1, 32($sp)
+; MIPS32R5EB-NEXT:    sw $1, 4($sp)
+; MIPS32R5EB-NEXT:    lui $1, %hi($CPI41_0)
+; MIPS32R5EB-NEXT:    ldc1 $f0, %lo($CPI41_0)($1)
+; MIPS32R5EB-NEXT:    ldc1 $f1, 0($sp)
+; MIPS32R5EB-NEXT:    sub.d $f0, $f1, $f0
+; MIPS32R5EB-NEXT:    cvt.s.d $f0, $f0
+; MIPS32R5EB-NEXT:    ldi.b $w1, 0
+; MIPS32R5EB-NEXT:    splati.w $w0, $w0[0]
+; MIPS32R5EB-NEXT:    insert.w $w1[0], $6
+; MIPS32R5EB-NEXT:    insert.w $w1[1], $7
+; MIPS32R5EB-NEXT:    lw $1, 24($sp)
+; MIPS32R5EB-NEXT:    insert.w $w1[2], $1
+; MIPS32R5EB-NEXT:    lw $1, 28($sp)
+; MIPS32R5EB-NEXT:    insert.w $w1[3], $1
+; MIPS32R5EB-NEXT:    fadd.w $w0, $w0, $w1
+; MIPS32R5EB-NEXT:    st.w $w0, 0($4)
+; MIPS32R5EB-NEXT:    addiu $sp, $sp, 8
+; MIPS32R5EB-NEXT:    jr $ra
+; MIPS32R5EB-NEXT:    nop
 ;
 ; MIPS64R5EB-LABEL: mixed_32:
 ; MIPS64R5EB:       # %bb.0: # %entry
-; MIPS64R5EB-NEXT:    ldi.b $w0, 0
-; MIPS64R5EB-NEXT:    insert.d $w0[0], $4
-; MIPS64R5EB-NEXT:    insert.d $w0[1], $5
-; MIPS64R5EB-NEXT:    shf.w $w0, $w0, 177
-; MIPS64R5EB-NEXT:    sll $1, $6, 0
-; MIPS64R5EB-NEXT:    fill.w $w1, $1
-; MIPS64R5EB-NEXT:    ffint_u.w $w1, $w1
-; MIPS64R5EB-NEXT:    fadd.w $w0, $w1, $w0
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, -16
+; MIPS64R5EB-NEXT:    .cfi_def_cfa_offset 16
+; MIPS64R5EB-NEXT:    lui $1, %hi(%neg(%gp_rel(mixed_32)))
+; MIPS64R5EB-NEXT:    daddu $1, $1, $25
+; MIPS64R5EB-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(mixed_32)))
+; MIPS64R5EB-NEXT:    lui $2, 17200
+; MIPS64R5EB-NEXT:    sw $2, 8($sp)
+; MIPS64R5EB-NEXT:    sll $2, $6, 0
+; MIPS64R5EB-NEXT:    sw $2, 12($sp)
+; MIPS64R5EB-NEXT:    ld $1, %got_page(.LCPI41_0)($1)
+; MIPS64R5EB-NEXT:    ldc1 $f0, %got_ofst(.LCPI41_0)($1)
+; MIPS64R5EB-NEXT:    ldc1 $f1, 8($sp)
+; MIPS64R5EB-NEXT:    sub.d $f0, $f1, $f0
+; MIPS64R5EB-NEXT:    ldi.b $w1, 0
+; MIPS64R5EB-NEXT:    insert.d $w1[0], $4
+; MIPS64R5EB-NEXT:    insert.d $w1[1], $5
+; MIPS64R5EB-NEXT:    shf.w $w1, $w1, 177
+; MIPS64R5EB-NEXT:    cvt.s.d $f0, $f0
+; MIPS64R5EB-NEXT:    splati.w $w0, $w0[0]
+; MIPS64R5EB-NEXT:    fadd.w $w0, $w0, $w1
 ; MIPS64R5EB-NEXT:    shf.w $w0, $w0, 177
 ; MIPS64R5EB-NEXT:    copy_s.d $2, $w0[0]
 ; MIPS64R5EB-NEXT:    copy_s.d $3, $w0[1]
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, 16
 ; MIPS64R5EB-NEXT:    jr $ra
 ; MIPS64R5EB-NEXT:    nop
 ;
@@ -6445,17 +6470,57 @@ define <4 x float> @mixed_32(<4 x float> %a, i32 %b) {
 ; MIPS64EL-NEXT:    jr $ra
 ; MIPS64EL-NEXT:    nop
 ;
+; MIPS32R5EL-LABEL: mixed_32:
+; MIPS32R5EL:       # %bb.0: # %entry
+; MIPS32R5EL-NEXT:    addiu $sp, $sp, -8
+; MIPS32R5EL-NEXT:    .cfi_def_cfa_offset 8
+; MIPS32R5EL-NEXT:    lui $1, 17200
+; MIPS32R5EL-NEXT:    sw $1, 4($sp)
+; MIPS32R5EL-NEXT:    lw $1, 32($sp)
+; MIPS32R5EL-NEXT:    sw $1, 0($sp)
+; MIPS32R5EL-NEXT:    lui $1, %hi($CPI41_0)
+; MIPS32R5EL-NEXT:    ldc1 $f0, %lo($CPI41_0)($1)
+; MIPS32R5EL-NEXT:    ldc1 $f1, 0($sp)
+; MIPS32R5EL-NEXT:    sub.d $f0, $f1, $f0
+; MIPS32R5EL-NEXT:    cvt.s.d $f0, $f0
+; MIPS32R5EL-NEXT:    ldi.b $w1, 0
+; MIPS32R5EL-NEXT:    splati.w $w0, $w0[0]
+; MIPS32R5EL-NEXT:    insert.w $w1[0], $6
+; MIPS32R5EL-NEXT:    insert.w $w1[1], $7
+; MIPS32R5EL-NEXT:    lw $1, 24($sp)
+; MIPS32R5EL-NEXT:    insert.w $w1[2], $1
+; MIPS32R5EL-NEXT:    lw $1, 28($sp)
+; MIPS32R5EL-NEXT:    insert.w $w1[3], $1
+; MIPS32R5EL-NEXT:    fadd.w $w0, $w0, $w1
+; MIPS32R5EL-NEXT:    st.w $w0, 0($4)
+; MIPS32R5EL-NEXT:    addiu $sp, $sp, 8
+; MIPS32R5EL-NEXT:    jr $ra
+; MIPS32R5EL-NEXT:    nop
+;
 ; MIPS64R5EL-LABEL: mixed_32:
 ; MIPS64R5EL:       # %bb.0: # %entry
-; MIPS64R5EL-NEXT:    ldi.b $w0, 0
-; MIPS64R5EL-NEXT:    insert.d $w0[0], $4
-; MIPS64R5EL-NEXT:    insert.d $w0[1], $5
-; MIPS64R5EL-NEXT:    sll $1, $6, 0
-; MIPS64R5EL-NEXT:    fill.w $w1, $1
-; MIPS64R5EL-NEXT:    ffint_u.w $w1, $w1
-; MIPS64R5EL-NEXT:    fadd.w $w0, $w1, $w0
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, -16
+; MIPS64R5EL-NEXT:    .cfi_def_cfa_offset 16
+; MIPS64R5EL-NEXT:    lui $1, %hi(%neg(%gp_rel(mixed_32)))
+; MIPS64R5EL-NEXT:    daddu $1, $1, $25
+; MIPS64R5EL-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(mixed_32)))
+; MIPS64R5EL-NEXT:    lui $2, 17200
+; MIPS64R5EL-NEXT:    sw $2, 12($sp)
+; MIPS64R5EL-NEXT:    sll $2, $6, 0
+; MIPS64R5EL-NEXT:    sw $2, 8($sp)
+; MIPS64R5EL-NEXT:    ld $1, %got_page(.LCPI41_0)($1)
+; MIPS64R5EL-NEXT:    ldc1 $f0, %got_ofst(.LCPI41_0)($1)
+; MIPS64R5EL-NEXT:    ldc1 $f1, 8($sp)
+; MIPS64R5EL-NEXT:    sub.d $f0, $f1, $f0
+; MIPS64R5EL-NEXT:    ldi.b $w1, 0
+; MIPS64R5EL-NEXT:    insert.d $w1[0], $4
+; MIPS64R5EL-NEXT:    insert.d $w1[1], $5
+; MIPS64R5EL-NEXT:    cvt.s.d $f0, $f0
+; MIPS64R5EL-NEXT:    splati.w $w0, $w0[0]
+; MIPS64R5EL-NEXT:    fadd.w $w0, $w0, $w1
 ; MIPS64R5EL-NEXT:    copy_s.d $2, $w0[0]
 ; MIPS64R5EL-NEXT:    copy_s.d $3, $w0[1]
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, 16
 ; MIPS64R5EL-NEXT:    jr $ra
 ; MIPS64R5EL-NEXT:    nop
 entry:
diff --git a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
index 945f47337ba..0cce34fb7bd 100644
--- a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
+++ b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; REQUIRES: asserts
-; RUN: llc < %s -mattr=+sse3,+sse4.1 -mcpu=penryn -stats 2>&1 | grep "9 machinelicm"
+; RUN: llc < %s -mattr=+sse3,+sse4.1 -mcpu=penryn -stats 2>&1 | grep "6 machinelicm"
 ; RUN: llc < %s -mattr=+sse3,+sse4.1 -mcpu=penryn | FileCheck %s
 ; rdar://6627786
 ; rdar://7792037
@@ -24,15 +24,17 @@ define %struct.__vv* @t(%struct.Key* %desc, i64 %p) nounwind ssp {
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  LBB0_3: ## %bb.i
 ; CHECK-NEXT:    ## in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    pinsrd $1, 4, %xmm0
-; CHECK-NEXT:    pinsrd $2, 8, %xmm0
-; CHECK-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],mem[1],xmm1[2],mem[3],xmm1[4],mem[5],xmm1[6],mem[7]
-; CHECK-NEXT:    psrld $16, %xmm0
-; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
-; CHECK-NEXT:    addps {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    addps %xmm1, %xmm0
+; CHECK-NEXT:    movl 0, %eax
+; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:    cvtsi2ssq %rax, %xmm0
+; CHECK-NEXT:    movl 4, %eax
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    cvtsi2ssq %rax, %xmm1
+; CHECK-NEXT:    movl 8, %eax
+; CHECK-NEXT:    xorps %xmm2, %xmm2
+; CHECK-NEXT:    cvtsi2ssq %rax, %xmm2
+; CHECK-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; CHECK-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
 ; CHECK-NEXT:    movaps %xmm0, 0
 ; CHECK-NEXT:  LBB0_1: ## %bb4
 ; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
diff --git a/test/CodeGen/X86/cvtv2f32.ll b/test/CodeGen/X86/cvtv2f32.ll
index cda0047ebee..c755d5f8bd7 100644
--- a/test/CodeGen/X86/cvtv2f32.ll
+++ b/test/CodeGen/X86/cvtv2f32.ll
@@ -8,26 +8,27 @@
 define <2 x float> @uitofp_2i32_cvt_buildvector(i32 %x, i32 %y, <2 x float> %v) {
 ; X32-LABEL: uitofp_2i32_cvt_buildvector:
 ; X32:       # %bb.0:
-; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X32-NEXT:    movdqa {{.*#+}} xmm2 = [1258291200,1258291200,1258291200,1258291200]
-; X32-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; X32-NEXT:    psrld $16, %xmm1
-; X32-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],mem[1],xmm1[2],mem[3],xmm1[4],mem[5],xmm1[6],mem[7]
-; X32-NEXT:    addps {{\.LCPI.*}}, %xmm1
-; X32-NEXT:    addps %xmm2, %xmm1
+; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
+; X32-NEXT:    orpd %xmm2, %xmm1
+; X32-NEXT:    subsd %xmm2, %xmm1
+; X32-NEXT:    cvtsd2ss %xmm1, %xmm1
+; X32-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; X32-NEXT:    orpd %xmm2, %xmm3
+; X32-NEXT:    subsd %xmm2, %xmm3
+; X32-NEXT:    xorps %xmm2, %xmm2
+; X32-NEXT:    cvtsd2ss %xmm3, %xmm2
+; X32-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
 ; X32-NEXT:    mulps %xmm1, %xmm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: uitofp_2i32_cvt_buildvector:
 ; X64:       # %bb.0:
-; X64-NEXT:    movd %edi, %xmm1
-; X64-NEXT:    pinsrd $1, %esi, %xmm1
-; X64-NEXT:    movdqa {{.*#+}} xmm2 = [1258291200,1258291200,1258291200,1258291200]
-; X64-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; X64-NEXT:    psrld $16, %xmm1
-; X64-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],mem[1],xmm1[2],mem[3],xmm1[4],mem[5],xmm1[6],mem[7]
-; X64-NEXT:    addps {{.*}}(%rip), %xmm1
-; X64-NEXT:    addps %xmm2, %xmm1
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    cvtsi2ssq %rax, %xmm1
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    cvtsi2ssq %rax, %xmm2
+; X64-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
 ; X64-NEXT:    mulps %xmm1, %xmm0
 ; X64-NEXT:    retq
   %t1 = uitofp i32 %x to float
-- 
GitLab


From 11ef6e19fe894f726ad4b581262d8852e8d53740 Mon Sep 17 00:00:00 2001
From: David Stenberg <david.stenberg@ericsson.com>
Date: Fri, 2 Nov 2018 11:46:24 +0000
Subject: [PATCH 0912/1116] Allow null-valued function operands in
 getCalledFunction()

Summary:
Change the dynamic cast in CallBase::getCalledFunction() to allow
null-valued function operands.

This patch fixes a crash that occurred when a funtion operand of a
call instruction was dropped, and later on a metadata-carrying
instruction was printed out. When allocating the metadata slot numbers,
getCalledFunction() would be invoked on the call with the dropped
operand, resulting in a failed non-null assertion in isa<>.

This fixes PR38924, in which a printout in DBCE crashed due to this.

This aligns getCalledFunction() with getCalledValue(), as the latter
allows the operand to be null.

Reviewers: vsk, dexonsmith, hfinkel

Reviewed By: dexonsmith

Subscribers: hfinkel, llvm-commits

Differential Revision: https://reviews.llvm.org/D52537

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345966 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/IR/Instructions.h |  2 +-
 unittests/IR/MetadataTest.cpp  | 21 +++++++++++++++++++++
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/include/llvm/IR/Instructions.h b/include/llvm/IR/Instructions.h
index faea2973773..449e6e8dc7a 100644
--- a/include/llvm/IR/Instructions.h
+++ b/include/llvm/IR/Instructions.h
@@ -1523,7 +1523,7 @@ public:
   /// indirect function invocation.
   ///
   Function *getCalledFunction() const {
-    return dyn_cast<Function>(Op<-InstTy::ArgOffset>());
+    return dyn_cast_or_null<Function>(Op<-InstTy::ArgOffset>());
   }
 
   /// Determine whether this call has the given attribute.
diff --git a/unittests/IR/MetadataTest.cpp b/unittests/IR/MetadataTest.cpp
index 83b166a263c..100c4ed5e15 100644
--- a/unittests/IR/MetadataTest.cpp
+++ b/unittests/IR/MetadataTest.cpp
@@ -402,6 +402,27 @@ TEST_F(MDNodeTest, PrintFromMetadataAsValue) {
   EXPECT_PRINTER_EQ("metadata !0", MAV0->printAsOperand(OS, true, MST));
   EXPECT_PRINTER_EQ("metadata !1", MAV1->printAsOperand(OS, true, MST));
 }
+
+TEST_F(MDNodeTest, PrintWithDroppedCallOperand) {
+  Module M("test", Context);
+
+  auto *FTy = FunctionType::get(Type::getVoidTy(Context), false);
+  auto *F0 = Function::Create(FTy, GlobalValue::ExternalLinkage, "F0", &M);
+  auto *F1 = Function::Create(FTy, GlobalValue::ExternalLinkage, "F1", &M);
+  auto *BB0 = BasicBlock::Create(Context, "entry", F0);
+
+  CallInst *CI0 = CallInst::Create(F1, "", BB0);
+  CI0->dropAllReferences();
+
+  auto *R0 = ReturnInst::Create(Context, BB0);
+  auto *N0 = MDNode::getDistinct(Context, None);
+  R0->setMetadata("md", N0);
+
+  // Printing the metadata node would previously result in a failed assertion
+  // due to the call instruction's dropped function operand.
+  ModuleSlotTracker MST(&M);
+  EXPECT_PRINTER_EQ("!0 = distinct !{}", N0->print(OS, MST));
+}
 #undef EXPECT_PRINTER_EQ
 
 TEST_F(MDNodeTest, NullOperand) {
-- 
GitLab


From c5e4cd20b51481c8dad42e29579ecda318a60581 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev@hotmail.com>
Date: Fri, 2 Nov 2018 13:47:47 +0000
Subject: [PATCH 0913/1116] [DEBUGINFO, NVPTX]DO not emit ',debug' option if no
 debug info or only debug directives are requested.

Summary:
If the output of debug directives only is requested, we should drop
emission of ',debug' option from the target directive. Required for
supporting of nvprof profiler.

Reviewers: probinson, echristo, dblaikie

Subscribers: Hahnfeld, jholewinski, llvm-commits, JDevlieghere, aprantl

Differential Revision: https://reviews.llvm.org/D46061

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345972 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../MCTargetDesc/NVPTXTargetStreamer.cpp      | 10 ++--
 .../NVPTX/MCTargetDesc/NVPTXTargetStreamer.h  |  3 ++
 lib/Target/NVPTX/NVPTXAsmPrinter.cpp          | 21 +++++++-
 test/DebugInfo/NVPTX/debug-file-loc-only.ll   | 48 +++++++++++++++++++
 4 files changed, 78 insertions(+), 4 deletions(-)
 create mode 100644 test/DebugInfo/NVPTX/debug-file-loc-only.ll

diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp
index 71ca7a5ca8d..f7b4cf3a0f7 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp
@@ -25,6 +25,12 @@ NVPTXTargetStreamer::NVPTXTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
 
 NVPTXTargetStreamer::~NVPTXTargetStreamer() = default;
 
+void NVPTXTargetStreamer::outputDwarfFileDirectives() {
+  for (const std::string &S : DwarfFiles)
+    getStreamer().EmitRawText(S.data());
+  DwarfFiles.clear();
+}
+
 void NVPTXTargetStreamer::emitDwarfFileDirective(StringRef Directive) {
   DwarfFiles.emplace_back(Directive);
 }
@@ -82,9 +88,7 @@ void NVPTXTargetStreamer::changeSection(const MCSection *CurSection,
     OS << "//\t}\n";
   if (isDwarfSection(FI, Section)) {
     // Emit DWARF .file directives in the outermost scope.
-    for (const std::string &S : DwarfFiles)
-      getStreamer().EmitRawText(S.data());
-    DwarfFiles.clear();
+    outputDwarfFileDirectives();
     OS << "//\t.section";
     Section->PrintSwitchToSection(*getStreamer().getContext().getAsmInfo(),
                                   FI->getTargetTriple(), OS, SubSection);
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.h b/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.h
index 34391a8b9ab..f18e61cdca5 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.h
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.h
@@ -24,6 +24,9 @@ public:
   NVPTXTargetStreamer(MCStreamer &S);
   ~NVPTXTargetStreamer() override;
 
+  /// Outputs the list of the DWARF '.file' directives to the streamer.
+  void outputDwarfFileDirectives();
+
   /// Record DWARF file directives for later output.
   /// According to PTX ISA, CUDA Toolkit documentation, 11.5.3. Debugging
   /// Directives: .file
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index a966b992840..9d9c75aceca 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -16,6 +16,7 @@
 #include "InstPrinter/NVPTXInstPrinter.h"
 #include "MCTargetDesc/NVPTXBaseInfo.h"
 #include "MCTargetDesc/NVPTXMCAsmInfo.h"
+#include "MCTargetDesc/NVPTXTargetStreamer.h"
 #include "NVPTX.h"
 #include "NVPTXMCExpr.h"
 #include "NVPTXMachineFunctionInfo.h"
@@ -880,8 +881,22 @@ void NVPTXAsmPrinter::emitHeader(Module &M, raw_ostream &O,
   if (NTM.getDrvInterface() == NVPTX::NVCL)
     O << ", texmode_independent";
 
+  bool HasFullDebugInfo = false;
+  for (DICompileUnit *CU : M.debug_compile_units()) {
+    switch(CU->getEmissionKind()) {
+    case DICompileUnit::NoDebug:
+    case DICompileUnit::DebugDirectivesOnly:
+      break;
+    case DICompileUnit::LineTablesOnly:
+    case DICompileUnit::FullDebug:
+      HasFullDebugInfo = true;
+      break;
+    }
+    if (HasFullDebugInfo)
+      break;
+  }
   // FIXME: remove comment once debug info is properly supported.
-  if (MMI && MMI->hasDebugInfo())
+  if (MMI && MMI->hasDebugInfo() && HasFullDebugInfo)
     O << "//, debug";
 
   O << "\n";
@@ -938,6 +953,10 @@ bool NVPTXAsmPrinter::doFinalization(Module &M) {
   if (HasDebugInfo)
     OutStreamer->EmitRawText("//\t}");
 
+  // Output last DWARF .file directives, if any.
+  static_cast<NVPTXTargetStreamer *>(OutStreamer->getTargetStreamer())
+      ->outputDwarfFileDirectives();
+
   return ret;
 
   //bool Result = AsmPrinter::doFinalization(M);
diff --git a/test/DebugInfo/NVPTX/debug-file-loc-only.ll b/test/DebugInfo/NVPTX/debug-file-loc-only.ll
new file mode 100644
index 00000000000..389a7c65781
--- /dev/null
+++ b/test/DebugInfo/NVPTX/debug-file-loc-only.ll
@@ -0,0 +1,48 @@
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda | FileCheck %s
+
+; // Bitcode int this test case is reduced version of compiled code below:
+;extern "C" {
+;#line 1 "/source/dir/foo.h"
+;__device__ void foo() {}
+;#line 2 "/source/dir/bar.cu"
+;__device__ void bar() {}
+;}
+
+; CHECK: .target sm_{{[0-9]+$}}
+
+; CHECK: .visible .func foo()
+; CHECK: .loc [[FOO:[0-9]+]] 1 31
+; CHECK:  ret;
+; CHECK: .visible .func bar()
+; CHECK: .loc [[BAR:[0-9]+]] 2 31
+; CHECK:  ret;
+
+define void @foo() !dbg !4 {
+bb:
+  ret void, !dbg !10
+}
+
+define void @bar() !dbg !7 {
+bb:
+  ret void, !dbg !11
+}
+
+; CHECK-DAG: .file [[FOO]] "{{.*}}foo.h"
+; CHECK-DAG: .file [[BAR]] "{{.*}}bar.cu"
+
+; CHECK-NOT: .section .debug{{.*}}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!8, !9}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "", isOptimized: false, runtimeVersion: 0, emissionKind: DebugDirectivesOnly, enums: !2)
+!1 = !DIFile(filename: "bar.cu", directory: "/source/dir")
+!2 = !{}
+!4 = distinct !DISubprogram(name: "foo", scope: !5, file: !5, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
+!5 = !DIFile(filename: "foo.h", directory: "/source/dir")
+!6 = !DISubroutineType(types: !2)
+!7 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 2, type: !6, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
+!8 = !{i32 2, !"Dwarf Version", i32 2}
+!9 = !{i32 2, !"Debug Info Version", i32 3}
+!10 = !DILocation(line: 1, column: 31, scope: !4)
+!11 = !DILocation(line: 2, column: 31, scope: !7)
-- 
GitLab


From b7577abc1104fa0f64a3de4357ddf3f1ef8d2a27 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <kparzysz@codeaurora.org>
Date: Fri, 2 Nov 2018 14:17:47 +0000
Subject: [PATCH 0914/1116] [Hexagon] Do not reduce load size for globals in
 small-data

Small-data (i.e. GP-relative) loads and stores allow 16-bit scaled
offset. For a load of a value of type T, the small-data area is
equivalent to an array "T sdata[65536]". This implies that objects
of smaller sizes need to be closer to the beginning of sdata,
while larger objects may be farther away, or otherwise the offset
may be insufficient to reach it. Similarly, an object of a larger
size should not be accessed via a load of a smaller size.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345975 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/Hexagon/HexagonISelLowering.cpp | 15 +++++++++++++++
 lib/Target/Hexagon/HexagonISelLowering.h   |  3 +++
 test/CodeGen/Hexagon/sdata-load-size.ll    | 19 +++++++++++++++++++
 3 files changed, 37 insertions(+)
 create mode 100644 test/CodeGen/Hexagon/sdata-load-size.ll

diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp
index f2c27e5e39b..7a708a8ac24 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -3080,6 +3080,21 @@ HexagonTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
   return TargetLowering::findRepresentativeClass(TRI, VT);
 }
 
+bool HexagonTargetLowering::shouldReduceLoadWidth(SDNode *Load,
+      ISD::LoadExtType ExtTy, EVT NewVT) const {
+  auto *L = cast<LoadSDNode>(Load);
+  std::pair<SDValue,int> BO = getBaseAndOffset(L->getBasePtr());
+  // Small-data object, do not shrink.
+  if (BO.first.getOpcode() == HexagonISD::CONST32_GP)
+    return false;
+  if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(BO.first)) {
+    auto &HTM = static_cast<const HexagonTargetMachine&>(getTargetMachine());
+    const auto *GO = dyn_cast_or_null<const GlobalObject>(GA->getGlobal());
+    return !GO || !HTM.getObjFileLowering()->isGlobalInSmallSection(GO, HTM);
+  }
+  return true;
+}
+
 Value *HexagonTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
       AtomicOrdering Ord) const {
   BasicBlock *BB = Builder.GetInsertBlock();
diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h
index 8efb3c9cda5..39af19b9b07 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/lib/Target/Hexagon/HexagonISelLowering.h
@@ -304,6 +304,9 @@ namespace HexagonISD {
     SDValue getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG)
                                      const override;
 
+    bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,
+                               EVT NewVT) const override;
+
     // Handling of atomic RMW instructions.
     Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
         AtomicOrdering Ord) const override;
diff --git a/test/CodeGen/Hexagon/sdata-load-size.ll b/test/CodeGen/Hexagon/sdata-load-size.ll
new file mode 100644
index 00000000000..325713f7062
--- /dev/null
+++ b/test/CodeGen/Hexagon/sdata-load-size.ll
@@ -0,0 +1,19 @@
+; RUN: llc -march=hexagon -hexagon-small-data-threshold=8 < %s | FileCheck %s
+; CHECK: = memd(gp+#g0)
+; If an object will be placed in .sdata, do not shrink any references to it.
+; In this case, g0 must be loaded via memd.
+
+target triple = "hexagon"
+
+@g0 = common global i64 0, align 8
+
+define i32 @f0() #0 {
+entry:
+  %v0 = load i64, i64* @g0, align 8
+  %v1 = trunc i64 %v0 to i8
+  %v2 = zext i8 %v1 to i32
+  ret i32 %v2
+}
+
+attributes #0 = { nounwind "target-cpu"="hexagonv60" "target-features"="+small-data" }
+
-- 
GitLab


From 1b82386f5eb67c1a5e54c70aff291c01c4e028da Mon Sep 17 00:00:00 2001
From: Cameron McInally <cameron.mcinally@nyu.edu>
Date: Fri, 2 Nov 2018 15:51:43 +0000
Subject: [PATCH 0915/1116] [NFC] Remove some extra characters from
 docs/LangRef.rst

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345987 91177308-0d34-0410-b5e6-96231b3b80d8
---
 docs/LangRef.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/LangRef.rst b/docs/LangRef.rst
index b98862ecb42..7f93716a504 100644
--- a/docs/LangRef.rst
+++ b/docs/LangRef.rst
@@ -14016,7 +14016,7 @@ value operands and has the same type as the operands.  The remainder has the
 same sign as the dividend.
 
 '``llvm.experimental.constrained.fma``' Intrinsic
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Syntax:
 """""""
@@ -14507,7 +14507,7 @@ mode argument is only intended as information to the compiler.
 
 
 '``llvm.experimental.constrained.maxnum``' Intrinsic
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Syntax:
 """""""
@@ -14544,7 +14544,7 @@ mode argument is only intended as information to the compiler.
 
 
 '``llvm.experimental.constrained.minnum``' Intrinsic
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Syntax:
 """""""
-- 
GitLab


From 6709348e9f65d8fbead4f2d25f9cc174edc075d9 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Fri, 2 Nov 2018 15:51:47 +0000
Subject: [PATCH 0916/1116] [ValueTracking] allow non-canonical shuffles when
 computing signbits

This possibility is noted in D53987 for a different case,
so we need to adjust the existing code.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345988 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Analysis/ValueTracking.cpp           | 18 ++++++++++--------
 unittests/Analysis/ValueTrackingTest.cpp | 20 ++++++++++++++++++++
 2 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp
index 3cef373f324..89a621576ec 100644
--- a/lib/Analysis/ValueTracking.cpp
+++ b/lib/Analysis/ValueTracking.cpp
@@ -2511,27 +2511,29 @@ static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth,
     // extended, shifted, etc).
     return ComputeNumSignBits(U->getOperand(0), Depth + 1, Q);
 
-  case Instruction::ShuffleVector:
+  case Instruction::ShuffleVector: {
     // If the shuffle mask contains any undefined elements, that element of the
     // result is undefined. Propagating information from a source operand may
     // not be correct in that case, so just bail out.
     if (cast<ShuffleVectorInst>(U)->getMask()->containsUndefElement())
       break;
 
-    assert((!isa<UndefValue>(U->getOperand(0)) ||
-            !isa<UndefValue>(U->getOperand(1)))
-           && "Should have simplified shuffle with 2 undef inputs");
+    // If everything is undef, we can't say anything. This should be simplified.
+    Value *Op0 = U->getOperand(0), *Op1 = U->getOperand(1);
+    if (isa<UndefValue>(Op0) && isa<UndefValue>(Op1))
+      break;
 
     // Look through shuffle of 1 source vector.
-    if (isa<UndefValue>(U->getOperand(0)))
-      return ComputeNumSignBits(U->getOperand(1), Depth + 1, Q);
-    if (isa<UndefValue>(U->getOperand(1)))
-      return ComputeNumSignBits(U->getOperand(0), Depth + 1, Q);
+    if (isa<UndefValue>(Op0))
+      return ComputeNumSignBits(Op1, Depth + 1, Q);
+    if (isa<UndefValue>(Op1))
+      return ComputeNumSignBits(Op0, Depth + 1, Q);
 
     // TODO: We can look through shuffles of 2 sources by computing the minimum
     // sign bits for each operand (similar to what we do for binops).
     break;
   }
+  }
 
   // Finally, if we can prove that the top bits of the result are 0's or 1's,
   // use this information.
diff --git a/unittests/Analysis/ValueTrackingTest.cpp b/unittests/Analysis/ValueTrackingTest.cpp
index ccae9d19ebb..f7d715c6447 100644
--- a/unittests/Analysis/ValueTrackingTest.cpp
+++ b/unittests/Analysis/ValueTrackingTest.cpp
@@ -494,6 +494,26 @@ TEST(ValueTracking, ComputeNumSignBits_PR32045) {
   EXPECT_EQ(ComputeNumSignBits(RVal, M->getDataLayout()), 1u);
 }
 
+// No guarantees for canonical IR in this analysis, so this just bails out. 
+TEST(ValueTracking, ComputeNumSignBits_Shuffle) {
+  StringRef Assembly = "define <2 x i32> @f() { "
+                       "  %val = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 0, i32 0> "
+                       "  ret <2 x i32> %val "
+                       "} ";
+
+  LLVMContext Context;
+  SMDiagnostic Error;
+  auto M = parseAssemblyString(Assembly, Error, Context);
+  assert(M && "Bad assembly?");
+
+  auto *F = M->getFunction("f");
+  assert(F && "Bad assembly?");
+
+  auto *RVal =
+      cast<ReturnInst>(F->getEntryBlock().getTerminator())->getOperand(0);
+  EXPECT_EQ(ComputeNumSignBits(RVal, M->getDataLayout()), 1u);
+}
+
 TEST(ValueTracking, ComputeKnownBits) {
   StringRef Assembly = "define i32 @f(i32 %a, i32 %b) { "
                        "  %ash = mul i32 %a, 8 "
-- 
GitLab


From 2033b5aeb4def10f8cf2ea3f8bf90e59a4dbd3b1 Mon Sep 17 00:00:00 2001
From: Jeremy Morse <jeremy.morse.llvm@gmail.com>
Date: Fri, 2 Nov 2018 16:52:48 +0000
Subject: [PATCH 0917/1116] [MachineSink][DebugInfo] Correctly sink DBG_VALUEs

As reported in PR38952, postra-machine-sink relies on DBG_VALUE insns being
adjacent to the def of the register that they reference. This is not always
true, leading to register copies being sunk but not the associated DBG_VALUEs,
which gives the debugger a bad variable location.

This patch collects DBG_VALUEs as we walk through a BB looking for copies to
sink, then passes them down to performSink. Compile-time impact should be
negligable.

Differential Revision: https://reviews.llvm.org/D53992


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345996 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/MachineSink.cpp  |  57 +++++++++++++++----
 test/CodeGen/X86/pr38952.mir | 103 +++++++++++++++++++++++++++++++++++
 2 files changed, 150 insertions(+), 10 deletions(-)
 create mode 100644 test/CodeGen/X86/pr38952.mir

diff --git a/lib/CodeGen/MachineSink.cpp b/lib/CodeGen/MachineSink.cpp
index 1d2e85accbc..d45855407f2 100644
--- a/lib/CodeGen/MachineSink.cpp
+++ b/lib/CodeGen/MachineSink.cpp
@@ -734,12 +734,18 @@ static bool SinkingPreventsImplicitNullCheck(MachineInstr &MI,
          MBP.LHS.getReg() == BaseReg;
 }
 
-/// Sink an instruction and its associated debug instructions.
+/// Sink an instruction and its associated debug instructions. If the debug
+/// instructions to be sunk are already known, they can be provided in DbgVals.
 static void performSink(MachineInstr &MI, MachineBasicBlock &SuccToSinkTo,
-                        MachineBasicBlock::iterator InsertPos) {
-  // Collect matching debug values.
+                        MachineBasicBlock::iterator InsertPos,
+                        SmallVectorImpl<MachineInstr *> *DbgVals = nullptr) {
+  // If debug values are provided use those, otherwise call collectDebugValues.
   SmallVector<MachineInstr *, 2> DbgValuesToSink;
-  MI.collectDebugValues(DbgValuesToSink);
+  if (DbgVals)
+    DbgValuesToSink.insert(DbgValuesToSink.begin(),
+                           DbgVals->begin(), DbgVals->end());
+  else
+    MI.collectDebugValues(DbgValuesToSink);
 
   // If we cannot find a location to use (merge with), then we erase the debug
   // location to prevent debug-info driven tools from potentially reporting
@@ -951,6 +957,9 @@ private:
   /// Track which register units have been modified and used.
   LiveRegUnits ModifiedRegUnits, UsedRegUnits;
 
+  /// Track DBG_VALUEs of (unmodified) register units.
+  DenseMap<unsigned, TinyPtrVector<MachineInstr*>> SeenDbgInstrs;
+
   /// Sink Copy instructions unused in the same block close to their uses in
   /// successors.
   bool tryToSinkCopy(MachineBasicBlock &BB, MachineFunction &MF,
@@ -1105,11 +1114,34 @@ bool PostRAMachineSinking::tryToSinkCopy(MachineBasicBlock &CurBB,
   // block and the current instruction.
   ModifiedRegUnits.clear();
   UsedRegUnits.clear();
+  SeenDbgInstrs.clear();
 
   for (auto I = CurBB.rbegin(), E = CurBB.rend(); I != E;) {
     MachineInstr *MI = &*I;
     ++I;
 
+    // Track the operand index for use in Copy.
+    SmallVector<unsigned, 2> UsedOpsInCopy;
+    // Track the register number defed in Copy.
+    SmallVector<unsigned, 2> DefedRegsInCopy;
+
+    // We must sink this DBG_VALUE if its operand is sunk. To avoid searching
+    // for DBG_VALUEs later, record them when they're encountered.
+    if (MI->isDebugValue()) {
+      auto &MO = MI->getOperand(0);
+      if (MO.isReg() && TRI->isPhysicalRegister(MO.getReg())) {
+        // Bail if we can already tell the sink would be rejected, rather
+        // than needlessly accumulating lots of DBG_VALUEs.
+        if (hasRegisterDependency(MI, UsedOpsInCopy, DefedRegsInCopy,
+                                  ModifiedRegUnits, UsedRegUnits))
+          continue;
+
+        // Record debug use of this register.
+        SeenDbgInstrs[MO.getReg()].push_back(MI);
+      }
+      continue;
+    }
+
     if (MI->isDebugInstr())
       continue;
 
@@ -1123,11 +1155,6 @@ bool PostRAMachineSinking::tryToSinkCopy(MachineBasicBlock &CurBB,
       continue;
     }
 
-    // Track the operand index for use in Copy.
-    SmallVector<unsigned, 2> UsedOpsInCopy;
-    // Track the register number defed in Copy.
-    SmallVector<unsigned, 2> DefedRegsInCopy;
-
     // Don't sink the COPY if it would violate a register dependency.
     if (hasRegisterDependency(MI, UsedOpsInCopy, DefedRegsInCopy,
                               ModifiedRegUnits, UsedRegUnits)) {
@@ -1149,11 +1176,21 @@ bool PostRAMachineSinking::tryToSinkCopy(MachineBasicBlock &CurBB,
     assert((SuccBB->pred_size() == 1 && *SuccBB->pred_begin() == &CurBB) &&
            "Unexpected predecessor");
 
+    // Collect DBG_VALUEs that must sink with this copy.
+    SmallVector<MachineInstr *, 4> DbgValsToSink;
+    for (auto &MO : MI->operands()) {
+      if (!MO.isReg() || !MO.isDef())
+        continue;
+      unsigned reg = MO.getReg();
+      for (auto *MI : SeenDbgInstrs.lookup(reg))
+        DbgValsToSink.push_back(MI);
+    }
+
     // Clear the kill flag if SrcReg is killed between MI and the end of the
     // block.
     clearKillFlags(MI, CurBB, UsedOpsInCopy, UsedRegUnits, TRI);
     MachineBasicBlock::iterator InsertPos = SuccBB->getFirstNonPHI();
-    performSink(*MI, *SuccBB, InsertPos);
+    performSink(*MI, *SuccBB, InsertPos, &DbgValsToSink);
     updateLiveIn(MI, SuccBB, UsedOpsInCopy, DefedRegsInCopy);
 
     Changed = true;
diff --git a/test/CodeGen/X86/pr38952.mir b/test/CodeGen/X86/pr38952.mir
new file mode 100644
index 00000000000..57cdc017f9e
--- /dev/null
+++ b/test/CodeGen/X86/pr38952.mir
@@ -0,0 +1,103 @@
+# RUN: llc %s -run-pass=postra-machine-sink -o - | FileCheck %s
+--- |
+  ; Module stripped of everything, MIR below is what's interesting
+  ; ModuleID = '<stdin>'
+  source_filename = "justacall.cpp"
+  target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+  target triple = "x86_64-unknown-linux-gnu"
+  
+  ; Function Attrs: noinline norecurse nounwind uwtable
+  define dso_local i32 @main(i32 %argc, i8** nocapture readnone %argv) local_unnamed_addr #0 {
+  entry:
+    br label %if.end
+  if.end:
+    br label %return
+  return:
+    ret i32 0
+  }
+
+  !0 = !{!"dummy metadata"}
+  !2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !3, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5, nameTableKind: None)
+  !3 = !DIFile(filename: "justacall.cpp", directory: "/tmp")
+  !4 = !{}
+  !5 = !{!0}
+  !7 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+  !14 = distinct !DISubprogram(name: "main", scope: !3, file: !3, line: 7, type: !15, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !2, retainedNodes: !20)
+  !15 = !DISubroutineType(types: !16)
+  !16 = !{!7, !7}
+  !20 = !{!21}
+  !21 = !DILocalVariable(name: "argc", arg: 1, scope: !14, file: !3, line: 7, type: !7)
+
+...
+---
+name:            main
+alignment:       4
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+registers:
+liveins:
+  - { reg: '$edi', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        true
+  stackProtector:  ''
+  maxCallFrameSize: 4294967295
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+constants:
+body:             |
+  bb.0.entry:
+    successors: %bb.2(0x40000000), %bb.1(0x40000000)
+    liveins: $edi
+  
+  ; Test that the DBG_VALUE on ebx below is sunk with the def of ebx, despite
+  ; not being adjacent to the def, see PR38952
+
+    DBG_VALUE $edi, $noreg, !21, !DIExpression()
+    renamable $ebx = COPY $edi
+    renamable $eax = MOV32r0 implicit-def dead $eflags
+    DBG_VALUE $ebx, $noreg, !21, !DIExpression()
+    CMP32ri $edi, 255, implicit-def $eflags
+    JG_1 %bb.2, implicit killed $eflags
+    JMP_1 %bb.1
+  
+  bb.1.if.end:
+  ; CHECK-LABEL: bb.1.if.end
+    successors: %bb.2(0x80000000)
+    liveins: $ebx
+  
+  ; CHECK: $ebx = COPY $edi
+  ; CHECK-NEXT: DBG_VALUE $ebx
+    renamable $rdx = MOVSX64rr32 renamable $ebx
+    renamable $rdx = nsw SHL64ri killed renamable $rdx, 2, implicit-def dead $eflags
+    ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    $rdi = MOV32ri64 0
+    $esi = MOV32r0 implicit-def dead $eflags
+    CALL64pcrel32 &memset, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit killed $esi, implicit $rdx, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax
+    ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+  
+  bb.2.return:
+    liveins: $eax
+  
+    RET 0, $eax
+
+...
-- 
GitLab


From 7e36a98252d519e4d91215048bf82e98778dfdcc Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulsson@linux.vnet.ibm.com>
Date: Fri, 2 Nov 2018 17:15:36 +0000
Subject: [PATCH 0918/1116] [SystemZ] Rework getInterleavedMemoryOpCost()

Model this function more closely after the BasicTTIImpl version, with
separate handling of loads and stores. For loads, the set of actually loaded
vectors is checked.

This makes it more readable and just slightly more accurate generally.

Review: Ulrich Weigand
https://reviews.llvm.org/D53071

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345998 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../SystemZ/SystemZTargetTransformInfo.cpp    |  64 ++++++--
 .../SystemZ/mem-interleaving-costs-02.ll      | 149 ++++++++++++++++++
 2 files changed, 197 insertions(+), 16 deletions(-)
 create mode 100644 test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-02.ll

diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index e7052e2e469..279a8218b1c 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -979,6 +979,11 @@ int SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
   return  NumOps;
 }
 
+// The generic implementation of getInterleavedMemoryOpCost() is based on
+// adding costs of the memory operations plus all the extracts and inserts
+// needed for using / defining the vector operands. The SystemZ version does
+// roughly the same but bases the computations on vector permutations
+// instead.
 int SystemZTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                                unsigned Factor,
                                                ArrayRef<unsigned> Indices,
@@ -993,22 +998,49 @@ int SystemZTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
   assert(isa<VectorType>(VecTy) &&
          "Expect a vector type for interleaved memory op");
 
-  int NumWideParts = getNumVectorRegs(VecTy);
-
-  // How many source vectors are handled to produce a vectorized operand?
-  int NumElsPerVector = (VecTy->getVectorNumElements() / NumWideParts);
-  int NumSrcParts =
-    ((NumWideParts > NumElsPerVector) ? NumElsPerVector : NumWideParts);
-
-  // A Load group may have gaps.
-  unsigned NumOperands =
-    ((Opcode == Instruction::Load) ? Indices.size() : Factor);
-
-  // Each needed permute takes two vectors as input.
-  if (NumSrcParts > 1)
-    NumSrcParts--;
-  int NumPermutes = NumSrcParts * NumOperands;
+  // Return the ceiling of dividing A by B.
+  auto ceil = [](unsigned A, unsigned B) { return (A + B - 1) / B; };
+
+  unsigned NumElts = VecTy->getVectorNumElements();
+  assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
+  unsigned VF = NumElts / Factor;
+  unsigned NumEltsPerVecReg = (128U / getScalarSizeInBits(VecTy));
+  unsigned NumVectorMemOps = getNumVectorRegs(VecTy);
+  unsigned NumPermutes = 0;
+
+  if (Opcode == Instruction::Load) {
+    // Loading interleave groups may have gaps, which may mean fewer
+    // loads. Find out how many vectors will be loaded in total, and in how
+    // many of them each value will be in.
+    BitVector UsedInsts(NumVectorMemOps, false);
+    std::vector<BitVector> ValueVecs(Factor, BitVector(NumVectorMemOps, false));
+    for (unsigned Index : Indices)
+      for (unsigned Elt = 0; Elt < VF; ++Elt) {
+        unsigned Vec = (Index + Elt * Factor) / NumEltsPerVecReg;
+        UsedInsts.set(Vec);
+        ValueVecs[Index].set(Vec);
+      }
+    NumVectorMemOps = UsedInsts.count();
+
+    for (unsigned Index : Indices) {
+      // Estimate that each loaded source vector containing this Index
+      // requires one operation, except that vperm can handle two input
+      // registers first time for each dst vector.
+      unsigned NumSrcVecs = ValueVecs[Index].count();
+      unsigned NumDstVecs = ceil(VF * getScalarSizeInBits(VecTy), 128U);
+      assert (NumSrcVecs >= NumDstVecs && "Expected at least as many sources");
+      NumPermutes += std::max(1U, NumSrcVecs - NumDstVecs);
+    }
+  } else {
+    // Estimate the permutes for each stored vector as the smaller of the
+    // number of elements and the number of source vectors. Subtract one per
+    // dst vector for vperm (S.A.).
+    unsigned NumSrcVecs = std::min(NumEltsPerVecReg, Factor);
+    unsigned NumDstVecs = NumVectorMemOps;
+    assert (NumSrcVecs > 1 && "Expected at least two source vectors.");
+    NumPermutes += (NumDstVecs * NumSrcVecs) - NumDstVecs;
+  }
 
   // Cost of load/store operations and the permutations needed.
-  return NumWideParts + NumPermutes;
+  return NumVectorMemOps + NumPermutes;
 }
diff --git a/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-02.ll b/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-02.ll
new file mode 100644
index 00000000000..4c992cedd88
--- /dev/null
+++ b/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-02.ll
@@ -0,0 +1,149 @@
+; REQUIRES: asserts
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \
+; RUN:   -debug-only=loop-vectorize,vectorutils -max-interleave-group-factor=64\
+; RUN:   -disable-output < %s 2>&1 | FileCheck %s
+;
+; Check that some cost estimations for interleave groups make sense.
+
+; This loop is loading four i16 values at indices [0, 1, 2, 3], with a stride
+; of 4. At VF=4, memory interleaving means loading 4 * 4 * 16 bits = 2 vector
+; registers. Each of the 4 vector values must then be constructed from the
+; two vector registers using one vperm each, which gives a cost of 2 + 4 = 6.
+;
+; CHECK: LV: Checking a loop in "fun0"
+; CHECK: LV: Found an estimated cost of 6 for VF 4 For instruction:   %ld0 = load i16
+; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %ld1 = load i16
+; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %ld2 = load i16
+; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %ld3 = load i16
+define void @fun0(i16 *%ptr, i16 *%dst) {
+entry:
+  br label %for.body
+
+for.body:
+  %ivptr = phi i16* [ %ptr.next, %for.body ], [ %ptr, %entry ]
+  %iv = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %inc = add i64 %iv, 4
+  %ptr0 = getelementptr inbounds i16, i16* %ivptr, i64 0
+  %ld0 = load i16, i16* %ptr0
+  %ptr1 = getelementptr inbounds i16, i16* %ivptr, i64 1
+  %ld1 = load i16, i16* %ptr1
+  %ptr2 = getelementptr inbounds i16, i16* %ivptr, i64 2
+  %ld2 = load i16, i16* %ptr2
+  %ptr3 = getelementptr inbounds i16, i16* %ivptr, i64 3
+  %ld3 = load i16, i16* %ptr3
+  %a1 = add i16 %ld0, %ld1
+  %a2 = add i16 %a1, %ld2
+  %a3 = add i16 %a2, %ld3
+  %dstptr = getelementptr inbounds i16, i16* %dst, i64 %iv
+  store i16 %a3, i16* %dstptr
+  %ptr.next = getelementptr inbounds i16, i16* %ivptr, i64 4
+  %cmp = icmp eq i64 %inc, 100
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; This loop loads one i8 value in a stride of 3. At VF=16, this means loading
+; 3 vector registers, and then constructing the vector value with two vperms,
+; which gives a cost of 5.
+;
+; CHECK: LV: Checking a loop in "fun1"
+; CHECK: LV: Found an estimated cost of 5 for VF 16 For instruction:   %ld0 = load i8
+define void @fun1(i8 *%ptr, i8 *%dst) {
+entry:
+  br label %for.body
+
+for.body:
+  %ivptr = phi i8* [ %ptr.next, %for.body ], [ %ptr, %entry ]
+  %iv = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %inc = add i64 %iv, 4
+  %ptr0 = getelementptr inbounds i8, i8* %ivptr, i64 0
+  %ld0 = load i8, i8* %ptr0
+  %dstptr = getelementptr inbounds i8, i8* %dst, i64 %iv
+  store i8 %ld0, i8* %dstptr
+  %ptr.next = getelementptr inbounds i8, i8* %ivptr, i64 3
+  %cmp = icmp eq i64 %inc, 100
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; This loop is loading 4 i8 values at indexes [0, 1, 2, 3], with a stride of
+; 32. At VF=2, this means loading 2 vector registers, and using 4 vperms to
+; produce the vector values, which gives a cost of 6.
+;
+; CHECK: LV: Checking a loop in "fun2"
+; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction:   %ld0 = load i8
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld1 = load i8
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld2 = load i8
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld3 = load i8
+define void @fun2(i8 *%ptr, i8 *%dst) {
+entry:
+  br label %for.body
+
+for.body:
+  %ivptr = phi i8* [ %ptr.next, %for.body ], [ %ptr, %entry ]
+  %iv = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %inc = add i64 %iv, 4
+  %ptr0 = getelementptr inbounds i8, i8* %ivptr, i64 0
+  %ld0 = load i8, i8* %ptr0
+  %ptr1 = getelementptr inbounds i8, i8* %ivptr, i64 1
+  %ld1 = load i8, i8* %ptr1
+  %ptr2 = getelementptr inbounds i8, i8* %ivptr, i64 2
+  %ld2 = load i8, i8* %ptr2
+  %ptr3 = getelementptr inbounds i8, i8* %ivptr, i64 3
+  %ld3 = load i8, i8* %ptr3
+  %a1 = add i8 %ld0, %ld1
+  %a2 = add i8 %a1, %ld2
+  %a3 = add i8 %a2, %ld3
+  %dstptr = getelementptr inbounds i8, i8* %dst, i64 %iv
+  store i8 %a3, i8* %dstptr
+  %ptr.next = getelementptr inbounds i8, i8* %ivptr, i64 32
+  %cmp = icmp eq i64 %inc, 100
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; This loop is loading 4 i8 values at indexes [0, 1, 2, 3], with a stride of
+; 30. At VF=2, this means loading 3 vector registers, and using 4 vperms to
+; produce the vector values, which gives a cost of 7. This is the same loop
+; as in fun2, except the stride makes the second iterations values overlap a
+; vector register boundary.
+;
+; CHECK: LV: Checking a loop in "fun3"
+; CHECK: LV: Found an estimated cost of 7 for VF 2 For instruction:   %ld0 = load i8
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld1 = load i8
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld2 = load i8
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld3 = load i8
+define void @fun3(i8 *%ptr, i8 *%dst) {
+entry:
+  br label %for.body
+
+for.body:
+  %ivptr = phi i8* [ %ptr.next, %for.body ], [ %ptr, %entry ]
+  %iv = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %inc = add i64 %iv, 4
+  %ptr0 = getelementptr inbounds i8, i8* %ivptr, i64 0
+  %ld0 = load i8, i8* %ptr0
+  %ptr1 = getelementptr inbounds i8, i8* %ivptr, i64 1
+  %ld1 = load i8, i8* %ptr1
+  %ptr2 = getelementptr inbounds i8, i8* %ivptr, i64 2
+  %ld2 = load i8, i8* %ptr2
+  %ptr3 = getelementptr inbounds i8, i8* %ivptr, i64 3
+  %ld3 = load i8, i8* %ptr3
+  %a1 = add i8 %ld0, %ld1
+  %a2 = add i8 %a1, %ld2
+  %a3 = add i8 %a2, %ld3
+  %dstptr = getelementptr inbounds i8, i8* %dst, i64 %iv
+  store i8 %a3, i8* %dstptr
+  %ptr.next = getelementptr inbounds i8, i8* %ivptr, i64 30
+  %cmp = icmp eq i64 %inc, 100
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
-- 
GitLab


From 062cd21484f5cc70015e168155cdae306c1a0f3a Mon Sep 17 00:00:00 2001
From: Sylvestre Ledru <sylvestre@debian.org>
Date: Fri, 2 Nov 2018 17:25:40 +0000
Subject: [PATCH 0919/1116] Fixed inclusion of M_PI fow MinGW-w64

Patch by KOLANICH


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346000 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/SIISelLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 55b1a872484..88e07b99e72 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -12,7 +12,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifdef _MSC_VER
+#if defined(_MSC_VER) || defined(__MINGW32__)
 // Provide M_PI.
 #define _USE_MATH_DEFINES
 #endif
-- 
GitLab


From 6ed7d3348e3a35bb59c1c3b1027c19baf6f761b6 Mon Sep 17 00:00:00 2001
From: Easwaran Raman <eraman@google.com>
Date: Fri, 2 Nov 2018 17:39:31 +0000
Subject: [PATCH 0920/1116] [ProfileSummary] Add options to override hot and
 cold count thresholds.

Summary:
The hot and cold count thresholds are derived from the summary, but for
debugging purposes it is convenient to provide the actual thresholds.

Reviewers: davidxl

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D54040

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346005 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Analysis/ProfileSummaryInfo.cpp   | 18 ++++++++++++++++++
 test/Analysis/ProfileSummary/basic.ll | 12 ++++++++++++
 2 files changed, 30 insertions(+)

diff --git a/lib/Analysis/ProfileSummaryInfo.cpp b/lib/Analysis/ProfileSummaryInfo.cpp
index aeaa5172b3e..7472b6201c2 100644
--- a/lib/Analysis/ProfileSummaryInfo.cpp
+++ b/lib/Analysis/ProfileSummaryInfo.cpp
@@ -51,6 +51,18 @@ static cl::opt<unsigned> ProfileSummaryHugeWorkingSetSizeThreshold(
              " blocks required to reach the -profile-summary-cutoff-hot"
              " percentile exceeds this count."));
 
+// The next two options override the counts derived from summary computation and
+// are useful for debugging purposes.
+static cl::opt<int> ProfileSummaryHotCount(
+    "profile-summary-hot-count", cl::ReallyHidden, cl::ZeroOrMore,
+    cl::desc("A fixed hot count that overrides the count derived from"
+             " profile-summary-cutoff-hot"));
+
+static cl::opt<int> ProfileSummaryColdCount(
+    "profile-summary-cold-count", cl::ReallyHidden, cl::ZeroOrMore,
+    cl::desc("A fixed cold count that overrides the count derived from"
+             " profile-summary-cutoff-cold"));
+
 // Find the summary entry for a desired percentile of counts.
 static const ProfileSummaryEntry &getEntryForPercentile(SummaryEntryVector &DS,
                                                         uint64_t Percentile) {
@@ -198,9 +210,15 @@ void ProfileSummaryInfo::computeThresholds() {
   auto &HotEntry =
       getEntryForPercentile(DetailedSummary, ProfileSummaryCutoffHot);
   HotCountThreshold = HotEntry.MinCount;
+  if (ProfileSummaryHotCount.getNumOccurrences() > 0)
+    HotCountThreshold = ProfileSummaryHotCount;
   auto &ColdEntry =
       getEntryForPercentile(DetailedSummary, ProfileSummaryCutoffCold);
   ColdCountThreshold = ColdEntry.MinCount;
+  if (ProfileSummaryColdCount.getNumOccurrences() > 0)
+    ColdCountThreshold = ProfileSummaryColdCount;
+  assert(ColdCountThreshold <= HotCountThreshold &&
+         "Cold count threshold cannot exceed hot count threshold!");
   HasHugeWorkingSetSize =
       HotEntry.NumCounts > ProfileSummaryHugeWorkingSetSizeThreshold;
 }
diff --git a/test/Analysis/ProfileSummary/basic.ll b/test/Analysis/ProfileSummary/basic.ll
index e417e459f04..966a1117c47 100644
--- a/test/Analysis/ProfileSummary/basic.ll
+++ b/test/Analysis/ProfileSummary/basic.ll
@@ -1,19 +1,31 @@
 ; RUN: opt < %s -disable-output -passes=print-profile-summary -S 2>&1 | FileCheck %s
+; RUN: opt < %s -disable-output -profile-summary-hot-count=500 -passes=print-profile-summary -S 2>&1 | FileCheck %s -check-prefixes=OVERRIDE-HOT
+; RUN: opt < %s -disable-output -profile-summary-cold-count=0 -passes=print-profile-summary -S 2>&1 | FileCheck %s -check-prefixes=OVERRIDE-COLD
+; RUN: opt < %s -disable-output -profile-summary-cold-count=200 -profile-summary-hot-count=1000 -passes=print-profile-summary -S 2>&1 | FileCheck %s -check-prefixes=OVERRIDE-BOTH
 
 define void @f1() !prof !20 {
 ; CHECK-LABEL: f1 :hot
+; OVERRIDE-HOT-LABEL: f1
+; OVERRIDE-COLD-LABEL: f1 :hot
+; OVERRIDE-BOTH-LABEL: f1
 
   ret void
 }
 
 define void @f2() !prof !21 {
 ; CHECK-LABEL: f2 :cold
+; OVERRIDE-HOT-LABEL: f2 :cold
+; OVERRIDE-COLD-LABEL: f2
+; OVERRIDE-BOTH-LABEL: f2
 
   ret void
 }
 
 define void @f3() !prof !22 {
 ; CHECK-LABEL: f3
+; OVERRIDE-HOT-LABEL: f3
+; OVERRIDE-COLD-LABEL: f3
+; OVERRIDE-BOTH-LABEL: f3
 
   ret void
 }
-- 
GitLab


From 4860c625396ca5694659a16f4300c97d7d449804 Mon Sep 17 00:00:00 2001
From: Zachary Turner <zturner@google.com>
Date: Fri, 2 Nov 2018 17:49:01 +0000
Subject: [PATCH 0921/1116] Refactor the lit configuration files

A year or so ago, I re-wrote most of the lit infrastructure in LLVM so
that it wasn't so boilerplate-y. I added lots of common helper type
stuff, simplifed usage patterns, and made the code more elegant and
maintainable.

We migrated to this in LLVM, clang, and lld's lit files, but not in
LLDBs. This started to bite me recently, as the 4 most recent times I
tried to run the lit test suite in LLDB on a fresh checkout the first
thing that would happen is that python would just start crashing with
unhelpful backtraces and I would have to spend time investigating.

You can reproduce this today by doing a fresh cmake generation, doing
ninja lldb and then python bin/llvm-lit.py -sv ~/lldb/lit/SymbolFile at
which point you'll get a segfault that tells you nothing about what your
problem is.

I started trying to fix the issues with bandaids, but it became clear
that the proper solution was to just bring in the work I did in the rest
of the projects. The side benefit of this is that the lit configuration
files become much cleaner and more understandable as a result.

Differential Revision: https://reviews.llvm.org/D54009

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346008 91177308-0d34-0410-b5e6-96231b3b80d8
---
 utils/lit/lit/llvm/config.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/utils/lit/lit/llvm/config.py b/utils/lit/lit/llvm/config.py
index 0e446da3710..6bb7135f659 100644
--- a/utils/lit/lit/llvm/config.py
+++ b/utils/lit/lit/llvm/config.py
@@ -55,6 +55,8 @@ class LLVMConfig(object):
             features.add('system-windows')
         elif platform.system() == "Linux":
             features.add('system-linux')
+        elif platform.system() in ['FreeBSD']:
+            config.available_features.add('system-freebsd')
 
         # Native compilation: host arch == default triple arch
         # Both of these values should probably be in every site config (e.g. as
-- 
GitLab


From 04550cf8d34909e9b4e6b1b0a97f8ce1e0862123 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulsson@linux.vnet.ibm.com>
Date: Fri, 2 Nov 2018 17:53:31 +0000
Subject: [PATCH 0922/1116] [SystemZ::TTI]  Improve cost handling of uint/sint
 to fp conversions.

Let i8/i16 uint/sint to fp conversions cost 1 if operand is a load.

Since the load already does the extension, there is no extra cost (previously
returned 2).

Review: Ulrich Weigand
https://reviews.llvm.org/D54028

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346009 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../SystemZ/SystemZTargetTransformInfo.cpp    | 10 ++--
 test/Analysis/CostModel/SystemZ/fp-cast.ll    | 46 +++++++++++++++++++
 2 files changed, 52 insertions(+), 4 deletions(-)

diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 279a8218b1c..f296d80dbf5 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -748,10 +748,12 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
   else { // Scalar
     assert (!Dst->isVectorTy());
 
-    if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP)
-      return (SrcScalarBits >= 32
-                ? 1
-                : SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/);
+    if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) {
+      if (SrcScalarBits >= 32 ||
+          (I != nullptr && isa<LoadInst>(I->getOperand(0))))
+        return 1;
+      return SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/;
+    }
 
     if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
         Src->isIntegerTy(1)) {
diff --git a/test/Analysis/CostModel/SystemZ/fp-cast.ll b/test/Analysis/CostModel/SystemZ/fp-cast.ll
index 4ea5a5033d7..20feefb8025 100644
--- a/test/Analysis/CostModel/SystemZ/fp-cast.ll
+++ b/test/Analysis/CostModel/SystemZ/fp-cast.ll
@@ -539,3 +539,49 @@ define void @uitofp() {
 
   ret void;
 }
+
+define void @sitofp_extload(i16 *%src16, i8 *%src8) {
+  %ld16 = load i16, i16 *%src16
+  %v6 = sitofp i16 %ld16 to fp128
+  %v7 = sitofp i16 %ld16 to double
+  %v8 = sitofp i16 %ld16 to float
+
+  %ld8 = load i8, i8 *%src8
+  %v9 = sitofp i8 %ld8 to fp128
+  %v10 = sitofp i8 %ld8 to double
+  %v11 = sitofp i8 %ld8 to float
+
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %ld16 = load i16, i16* %src16
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v6 = sitofp i16 %ld16 to fp128
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v7 = sitofp i16 %ld16 to double
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v8 = sitofp i16 %ld16 to float
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %ld8 = load i8, i8* %src8
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v9 = sitofp i8 %ld8 to fp128
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v10 = sitofp i8 %ld8 to double
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v11 = sitofp i8 %ld8 to float
+
+  ret void;
+}
+
+define void @uitofp_extload(i16 *%src16, i8 *%src8) {
+  %ld16 = load i16, i16 *%src16
+  %v6 = uitofp i16 %ld16 to fp128
+  %v7 = uitofp i16 %ld16 to double
+  %v8 = uitofp i16 %ld16 to float
+
+  %ld8 = load i8, i8 *%src8
+  %v9 = uitofp i8 %ld8 to fp128
+  %v10 = uitofp i8 %ld8 to double
+  %v11 = uitofp i8 %ld8 to float
+
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %ld16 = load i16, i16* %src16
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v6 = uitofp i16 %ld16 to fp128
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v7 = uitofp i16 %ld16 to double
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v8 = uitofp i16 %ld16 to float
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %ld8 = load i8, i8* %src8
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v9 = uitofp i8 %ld8 to fp128
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v10 = uitofp i8 %ld8 to double
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v11 = uitofp i8 %ld8 to float
+
+  ret void;
+}
-- 
GitLab


From 437cbaffffc74a5bb3bac31e7a649a144e14fdd4 Mon Sep 17 00:00:00 2001
From: Leonard Mosescu <mosescu@google.com>
Date: Fri, 2 Nov 2018 18:00:37 +0000
Subject: [PATCH 0923/1116] Fix a few small issues in llvm-pdbutil

Running "llvm-pdbutil dump -all" on linux (using the native PDB reader),
over a few PDBs pulled from the Microsoft public symbol store uncovered
a few small issues:

- stripped PDBs might not have the strings stream (/names)
- stripped PDBs might not have the "module info" stream

Differential Revision: https://reviews.llvm.org/D54006


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346010 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/tools/llvm-pdbdump/Inputs/Stripped.pdb   | Bin 0 -> 69632 bytes
 test/tools/llvm-pdbdump/checksum-string.test  |   1 +
 test/tools/llvm-pdbdump/class-layout.test     |   2 +
 .../complex-padding-graphical.test            |   2 +
 test/tools/llvm-pdbdump/enum-layout.test      |   2 +
 .../llvm-pdbdump/explain-dbi-stream.test      |   2 +
 .../llvm-pdbdump/explain-pdb-stream.test      |   2 +
 test/tools/llvm-pdbdump/explain.test          |   2 +
 test/tools/llvm-pdbdump/injected-sources.test |   2 +
 test/tools/llvm-pdbdump/lit.local.cfg         |   3 +-
 test/tools/llvm-pdbdump/load-address.test     |   2 +
 .../llvm-pdbdump/pretty-func-dumper.test      |   2 +
 test/tools/llvm-pdbdump/regex-filter.test     |   2 +
 .../simple-padding-graphical.test             |   2 +
 test/tools/llvm-pdbdump/stripped.test         | 109 ++++++++++++++++++
 test/tools/llvm-pdbdump/symbol-filters.test   |   2 +
 test/tools/llvm-pdbdump/type-qualifiers.test  |   2 +
 test/tools/llvm-pdbdump/usingnamespace.test   |   2 +
 tools/llvm-pdbutil/DumpOutputStyle.cpp        |  30 ++---
 tools/llvm-pdbutil/InputFile.cpp              |  13 ++-
 tools/llvm-pdbutil/InputFile.h                |   2 +
 21 files changed, 165 insertions(+), 21 deletions(-)
 create mode 100644 test/tools/llvm-pdbdump/Inputs/Stripped.pdb
 create mode 100644 test/tools/llvm-pdbdump/stripped.test

diff --git a/test/tools/llvm-pdbdump/Inputs/Stripped.pdb b/test/tools/llvm-pdbdump/Inputs/Stripped.pdb
new file mode 100644
index 0000000000000000000000000000000000000000..c0988c2c3bb7e963da50f6e6ab1bf0412d717faa
GIT binary patch
literal 69632
zcmeaxOfJeV&QB{*aMpL$)>iNhc2h9dGce%gl5z=VU|?VnU|?WkU|<krU|@&<@gYJ`
z)+jX^0;3@?8UmvsFd71*Aut*OqaiRF0;3@?8UmvsFd71*Autp|py9y(Q7{?;qaiRF
z0;3@?8UmvsFd71*Aut*OqaiRF0;3@?8UmvsFw8^1Lg3#p_u;7bMnhmU1V%$(Gz3ON
zU^E0qLtr!nMnhmU1V%$(Gz3ONU^E1#=ovCDX}=Q<TH}93w2rZA*D3#Hk#o$PrnBGW
zVqjp<_sPu5_RLGmXJBN2>;!<(u+<H$P{y4=V~`Y-H%g6$z-S1JhQMeDjE2By2#kin
zXb6mkz-S1JhQMeDjE2By2n?PO;DPSz_hVpSuwY<d@MeY3AQzs1?)6{5%)oF4G(Z3n
zWME+6V_;xlW?*38Wnf?c>1ShLVBlb2VBln6VBlt8U;vF9K$VVCqaiRF0;3@?8Umvs
zFd71*Aut*OqaiRF0;3@?8UmvsFd71bAOu(#7#KkF|DdUR^!b07^FVSS4C907(b;}b
zD2(FK5Eu=C(GVC7fzc2c4S~@R7!85Z5Eu=C(GVC7fzc2c8X>@a%8(H>?+-fXAB@3s
z`2YWd#s&WW2hH*i4S$Tfax?@+Ltr!nMnhmU1V%$(Gz3ONU^E0qLtr!nMnhmU1V%#u
zcL;z^Kq%)lWCShmXJ_E=<6ub30IlP<VPs$sVP;^cf$jl-nIOi%z#s`e2_U5nCI(s?
zV8X<}06GakLBJpZCJv$h|A)}nNoT8=;-ci3^8BLgn3B}ul9+;&q!=f?{G==fJW>#|
zK=vMh?nh7%2uJ|&Au=G640|1M+Y6Bbnacn^2?4~4XJBAZU|>i9g&l|w!T9afQUD#O
zkXodm#lWDuOYbAh&DhKW$>H)JNbUg+zk}pJjs(RMdboh(6c{1<H?YYW;E{8{Bj>^h
z+5Z8Ge~=xJuz-YB03LmyNCS;BVRK6X9(@gX<R;*eTfoS`0NU$<%{~Dp+~K0Y1ilKB
z0h>NtX&)304or~!Fd%)fFagmZxd0}}ei?K*Sb7p*U|?_%Z~z56NDCN)<Rlmv7(f`C
z9B3>LmmKK+PF!*g(EUFS3=AMAgY5zlAp6cRGBB8d+`?dhA;-eRz#xJ{PJ<b;FUUc_
z0L2fW_y(B`8e?o=FgO4T3lJZK*%;Uuc)<dU44|?JBz91Ufq@e&#sn8LU}a$728%Jn
z#X#mk)Um+DEI{cJtd12fb{@1}3FI*b2Dli=E||MOc7pJ~_y695w1O}wKY_%+7^E12
z*}w)s*kGwqVl)IsLtr!nMnhmU1V%$(Gz3ONU^D~<a|q~_q*jzLFff4H*x>npP-7m{
zkVbMesE?~xl#*DI$iM(<cQY6;Fff2N{ewCaNa{d+2R(=~2CxwhNRmiQ2L^@%dIb=L
zp!PXK0C+%wfk6&Q5x7sm;GkEOnv<W*zyNBqGk``GK<#z~40TSZBBQ*~5Eu=C(GVC7
zfzc2c4S~@R7!85Z5Eu=C(GVC7fzc2c4FTj308L#<aq53W78_-chQMeDjE2By2#kin
zXb6mkz-S1JhQMeDjE2By2#kinXb23i5D;VF6NLGQnIS$cr=+wvH3c*k&nF0)Y6q>Q
zW@NCh%FlOjac~HAb##n~h+;s<gVt{|GT5i(=Q}tVAt{h!fanLUUuR^nw>NMybZ`iC
zb#!z9YiEGTgVwM^<c(0|m7w}T>&h7!?2{6UK=y|_IXb|d05=1)z8vNTLnJo{LCpZI
z`DSFuP0Y+=V32}}f!6ba?R7y|3v&}_u`<F=Oh1Fxi9^hD4hjh_Nh~TUEdYffC@eu?
z23jl5$WTy}nOBkq5{H=!TC>i`V4sT~mN3797FRPe*n_q~7&$mZI{F2;I)+9#I5<Q>
z!W(23Xni*mgS~ycxr>8?nG+~f!K*A_c7WFQL){LNgF7B34_e<3lShv)kollB;>hM3
zIXIx04_cEC^#e+L>p{a6w5A>E2V^e<SQt9EIyw4zI7ESb1BnNaeIN|659C}kCy-^Z
zI0CJ!2gOmmWrTx+ij$**qa!%}Kzcw+PoVmYKmh^L2NK7`pq1?qQP5o#Ablzf4B#a!
zHVh059t;c&Aq))Qr6oBG3=9<v3=Ay{3=9((7#QX-FfgoOU|`t7z`y`X$7dKA7;Z2y
zFg#&kVEDkm!0-pOj+~K!0TgB+eIT(Dj0_Af7#SFN7#SF5FfcIefv&69U}9iMVT7#P
zzr(=5pu@<(;KRtkFolVMp@orw!Gno`p@NBlVFMEb!wN<Q1`bd{f;fAW91Vfd5Eu=C
z(GVC7fzc2c4S~@R7!85Z5Eu;ss)PV&k8TVOlVEEDc7XavOpv|YpuMo5edQq9k^v$X
z38gEc^h_wd7fRoS(x62KAbp_yy&zhb5u)E0N~c2URw%s^N}q(%FQGIi)GeA&+7n7w
zLg}ecdLxtuc@K0yA&3TJM3)fE1#RQ!WMF`GAz?I34ostMIRJ}cCGaSuYL^a&VPMDO
zaRbQBIJAPKpcu6C928=pSONzYNB}BFECtd}j9ReycoY(A4lW}Q=8yLOafQ*S_-F`>
zhQMeDjE2By2#kinXb6mkz-S1JhQLS&0Z<<wbp9Wxr$@^9e+&$x=l@|lvk!El7`}8f
zdj20GbU<kiw0;D1?i%O}JP;oy3u1%T8sL%xt);*v2U@d&OAfRaWc2($%rgZ+qa>iU
zC7?4rkjIBW?9uc8paYn&#0e@iA$ausKUkQI(xV|T8UmvsFd71*Aut*OqaiRF0z)DM
zu-*U1#V~sR-;jvBQAdu3z-S1JhQMeDjE2By2#kinXb6mkz-S1JhQMeDjD`S_AuvVH
zkg==%PBdtq@4x@-q?@zc+@~*BI?lB5K`<8s1A{*3x;oFiw0zKpc?OUw(9A4|hFJi*
z^$*0p6KKo`xz7xw4<s^*M?+vV1V%$(Gz3ONU^E0qLtr!nMnhmU1V%$(Gz3ONU^D~<
zM+kuSM1#&R1FeY$t^f6AWnci&AlIE>fZTb$fSG~e3{saLbhjTL0|Nsu0|Ns)^eRBm
zE^SVR(f&Uu+(z+e2#kinXb6mkz-S1JhQMeDjE2By2#kinXb6mkz-R~zHo!mF95m{l
q(GVC7fzc2c4S~@R7!85Z5Eu=C(GVC7fzc2c4S~@R7!3ichX4RO<`{1P

literal 0
HcmV?d00001

diff --git a/test/tools/llvm-pdbdump/checksum-string.test b/test/tools/llvm-pdbdump/checksum-string.test
index c3ecc265e9b..6925329a590 100644
--- a/test/tools/llvm-pdbdump/checksum-string.test
+++ b/test/tools/llvm-pdbdump/checksum-string.test
@@ -1,3 +1,4 @@
+; REQUIRES: diasdk
 ; RUN: llvm-pdbutil pretty -lines %p/Inputs/PrettyFuncDumperTest.pdb > %t
 
 ; CHECK: ---COMPILANDS---
diff --git a/test/tools/llvm-pdbdump/class-layout.test b/test/tools/llvm-pdbdump/class-layout.test
index 1b7e909dcb7..cb70dab0dc0 100644
--- a/test/tools/llvm-pdbdump/class-layout.test
+++ b/test/tools/llvm-pdbdump/class-layout.test
@@ -1,3 +1,5 @@
+; REQUIRES: diasdk
+
 ; RUN: llvm-pdbutil pretty -all -class-recurse-depth=1 \
 ; RUN:   %p/Inputs/ClassLayoutTest.pdb > %t
 ; RUN: FileCheck -input-file=%t %s -check-prefix=GLOBALS_TEST
diff --git a/test/tools/llvm-pdbdump/complex-padding-graphical.test b/test/tools/llvm-pdbdump/complex-padding-graphical.test
index 9373c1ec6c2..42511db95ff 100644
--- a/test/tools/llvm-pdbdump/complex-padding-graphical.test
+++ b/test/tools/llvm-pdbdump/complex-padding-graphical.test
@@ -1,3 +1,5 @@
+; REQUIRES: diasdk
+
 ; RUN: llvm-pdbutil pretty -classes -class-definitions=layout \
 ; RUN:     -include-types=Test %p/Inputs/ComplexPaddingTest.pdb > %t
 
diff --git a/test/tools/llvm-pdbdump/enum-layout.test b/test/tools/llvm-pdbdump/enum-layout.test
index 5813321f000..57006d182bc 100644
--- a/test/tools/llvm-pdbdump/enum-layout.test
+++ b/test/tools/llvm-pdbdump/enum-layout.test
@@ -1,3 +1,5 @@
+; REQUIRES: diasdk
+
 ; RUN: llvm-pdbutil pretty -types %p/Inputs/ClassLayoutTest.pdb > %t
 ; RUN: FileCheck -input-file=%t %s -check-prefix=GLOBAL_ENUM
 ; RUN: FileCheck -input-file=%t %s -check-prefix=MEMBER_ENUM
diff --git a/test/tools/llvm-pdbdump/explain-dbi-stream.test b/test/tools/llvm-pdbdump/explain-dbi-stream.test
index f393f976caa..030e51f8f05 100644
--- a/test/tools/llvm-pdbdump/explain-dbi-stream.test
+++ b/test/tools/llvm-pdbdump/explain-dbi-stream.test
@@ -1,3 +1,5 @@
+; REQUIRES: diasdk
+
 ; RUN: llvm-pdbutil explain \
 ; RUN: -offset=0xF000 \
 ; RUN: -offset=0xF004 \
diff --git a/test/tools/llvm-pdbdump/explain-pdb-stream.test b/test/tools/llvm-pdbdump/explain-pdb-stream.test
index 10efb5b6459..32ec800f8ca 100644
--- a/test/tools/llvm-pdbdump/explain-pdb-stream.test
+++ b/test/tools/llvm-pdbdump/explain-pdb-stream.test
@@ -1,3 +1,5 @@
+; REQUIRES: diasdk
+
 ; RUN: llvm-pdbutil explain \
 ; RUN: -offset=0x11000 \
 ; RUN: -offset=0x11004 \
diff --git a/test/tools/llvm-pdbdump/explain.test b/test/tools/llvm-pdbdump/explain.test
index d76e86add2d..1179fe5aad7 100644
--- a/test/tools/llvm-pdbdump/explain.test
+++ b/test/tools/llvm-pdbdump/explain.test
@@ -1,3 +1,5 @@
+; REQUIRES: diasdk
+
 ; RUN: llvm-pdbutil explain -offset=0 %p/Inputs/InjectedSource.pdb \
 ; RUN:  | FileCheck --check-prefix=ZERO %s
 ; RUN: llvm-pdbutil explain -offset=40 %p/Inputs/InjectedSource.pdb \
diff --git a/test/tools/llvm-pdbdump/injected-sources.test b/test/tools/llvm-pdbdump/injected-sources.test
index c04422e2a10..9d2d1b91a05 100644
--- a/test/tools/llvm-pdbdump/injected-sources.test
+++ b/test/tools/llvm-pdbdump/injected-sources.test
@@ -1,6 +1,8 @@
 ; The PDB committed to the repo does not seem to be recognized by older
 ; versions of DIA SDK, so we xfail the test temporarily until we can
 ; figure out how to get a PDB that makes all versions of MSVC happy.
+; REQUIRES: diasdk
+
 ; RUN: llvm-pdbutil pretty -injected-sources -injected-source-content \
 ; RUN:   %p/Inputs/InjectedSource.pdb | FileCheck %s
 ; RUN: llvm-pdbutil pretty -injected-sources -injected-source-content \
diff --git a/test/tools/llvm-pdbdump/lit.local.cfg b/test/tools/llvm-pdbdump/lit.local.cfg
index 28a895f5114..5f1f826babb 100644
--- a/test/tools/llvm-pdbdump/lit.local.cfg
+++ b/test/tools/llvm-pdbdump/lit.local.cfg
@@ -1 +1,2 @@
-config.unsupported = not config.have_dia_sdk
+if config.have_dia_sdk:
+  config.available_features.add("diasdk")
diff --git a/test/tools/llvm-pdbdump/load-address.test b/test/tools/llvm-pdbdump/load-address.test
index 4402790d71f..46b3a074e1c 100644
--- a/test/tools/llvm-pdbdump/load-address.test
+++ b/test/tools/llvm-pdbdump/load-address.test
@@ -1,3 +1,5 @@
+; REQUIRES: diasdk
+
 ; RUN: llvm-pdbutil pretty -externals %p/Inputs/LoadAddressTest.pdb \
 ; RUN:    | FileCheck --check-prefix=RVA %s
 ; RUN: llvm-pdbutil pretty -externals -load-address=0x40000000 \
diff --git a/test/tools/llvm-pdbdump/pretty-func-dumper.test b/test/tools/llvm-pdbdump/pretty-func-dumper.test
index 5e4dc8d998b..40bbcda2588 100644
--- a/test/tools/llvm-pdbdump/pretty-func-dumper.test
+++ b/test/tools/llvm-pdbdump/pretty-func-dumper.test
@@ -1,3 +1,5 @@
+; REQUIRES: diasdk
+
 ; RUN: llvm-pdbutil pretty -all -class-recurse-depth=1 \
 ; RUN:   %p/Inputs/PrettyFuncDumperTest.pdb > %t
 ; RUN: FileCheck -input-file=%t %s -check-prefix=GLOBALS_FUNC
diff --git a/test/tools/llvm-pdbdump/regex-filter.test b/test/tools/llvm-pdbdump/regex-filter.test
index 1c49009bf36..7eed0963a5c 100644
--- a/test/tools/llvm-pdbdump/regex-filter.test
+++ b/test/tools/llvm-pdbdump/regex-filter.test
@@ -1,3 +1,5 @@
+; REQUIRES: diasdk
+
 ; RUN: llvm-pdbutil pretty -module-syms -globals -types %p/Inputs/FilterTest.pdb \
 ; RUN:    | FileCheck --check-prefix=NO_FILTER %s
 
diff --git a/test/tools/llvm-pdbdump/simple-padding-graphical.test b/test/tools/llvm-pdbdump/simple-padding-graphical.test
index 91da534ca01..00bae754297 100644
--- a/test/tools/llvm-pdbdump/simple-padding-graphical.test
+++ b/test/tools/llvm-pdbdump/simple-padding-graphical.test
@@ -1,3 +1,5 @@
+; REQUIRES: diasdk
+
 ; RUN: llvm-pdbutil pretty -classes -class-definitions=layout \
 ; RUN:     -include-types=SimplePad %p/Inputs/SimplePaddingTest.pdb > %t
 
diff --git a/test/tools/llvm-pdbdump/stripped.test b/test/tools/llvm-pdbdump/stripped.test
new file mode 100644
index 00000000000..1d12c9ecfa2
--- /dev/null
+++ b/test/tools/llvm-pdbdump/stripped.test
@@ -0,0 +1,109 @@
+; RUN: llvm-pdbutil dump -all %p/Inputs/Stripped.pdb > %t
+; RUN: FileCheck -input-file=%t %s
+
+; CHECK: Summary
+; CHECK-NEXT: ============================================================
+; CHECK-NEXT:  Block Size: 4096
+; CHECK-NEXT:  Number of blocks: 17
+; CHECK-NEXT:  Number of streams: 12
+; CHECK-NEXT:  Signature: 1541179274
+; CHECK-NEXT:  Age: 2
+; CHECK-NEXT:  GUID: {FF4F9B62-D99A-4647-97A7-22C702B1E053}
+; CHECK-NEXT:  Features: 0x1
+; CHECK-NEXT:  Has Debug Info: true
+; CHECK-NEXT:  Has Types: true
+; CHECK-NEXT:  Has IDs: true
+; CHECK-NEXT:  Has Globals: true
+; CHECK-NEXT:  Has Publics: true
+; CHECK-NEXT:  Is incrementally linked: false
+; CHECK-NEXT:  Has conflicting types: false
+; CHECK-NEXT:  Is stripped: true
+
+; CHECK: Streams
+; CHECK-NEXT: ============================================================
+; CHECK-NEXT:  Stream  0 (  88 bytes): [Old MSF Directory]
+; CHECK-NEXT:             Blocks: [4]
+; CHECK-NEXT:  Stream  1 (  78 bytes): [PDB Stream]
+; CHECK-NEXT:             Blocks: [14]
+; CHECK-NEXT:  Stream  2 (  56 bytes): [TPI Stream]
+; CHECK-NEXT:             Blocks: [13]
+; CHECK-NEXT:  Stream  3 (1355 bytes): [DBI Stream]
+; CHECK-NEXT:             Blocks: [7]
+; CHECK-NEXT:  Stream  4 (  56 bytes): [IPI Stream]
+; CHECK-NEXT:             Blocks: [6]
+; CHECK-NEXT:  Stream  5 (   0 bytes): [Named Stream "/LinkInfo"]
+; CHECK-NEXT:             Blocks: []
+; CHECK-NEXT:  Stream  6 ( 200 bytes): [Section Header Data]
+; CHECK-NEXT:             Blocks: [8]
+; CHECK-NEXT:  Stream  7 (  16 bytes): [Global Symbol Hash]
+; CHECK-NEXT:             Blocks: [9]
+; CHECK-NEXT:  Stream  8 ( 928 bytes): [Public Symbol Hash]
+; CHECK-NEXT:             Blocks: [11]
+; CHECK-NEXT:  Stream  9 ( 716 bytes): [Symbol Records]
+; CHECK-NEXT:             Blocks: [10]
+; CHECK-NEXT:  Stream 10 (   0 bytes): [TPI Hash]
+; CHECK-NEXT:             Blocks: []
+; CHECK-NEXT:  Stream 11 (   0 bytes): [IPI Hash]
+; CHECK-NEXT:             Blocks: []
+
+; CHECK: Module Stats
+; CHECK-NEXT: ============================================================
+
+; CHECK: S_UDT Record Stats
+; CHECK-NEXT: ============================================================
+
+; CHECK: String Table
+; CHECK-NEXT: ============================================================
+
+; CHECK: Modules
+; CHECK-NEXT: ============================================================
+
+; CHECK: Files
+; CHECK-NEXT: ============================================================
+
+; CHECK: Lines
+; CHECK-NEXT: ============================================================
+
+; CHECK: Inlinee Lines
+; CHECK-NEXT: ============================================================
+
+; CHECK: Cross Module Imports
+; CHECK-NEXT: ============================================================
+
+; CHECK: Cross Module Exports
+; CHECK-NEXT: ============================================================
+
+; CHECK: Old FPO Data
+; CHECK-NEXT: ============================================================
+
+; CHECK: New FPO Data
+; CHECK-NEXT: ============================================================
+
+; CHECK: Types (TPI Stream)
+; CHECK-NEXT: ============================================================
+
+; CHECK: Types (IPI Stream)
+; CHECK-NEXT: ============================================================
+
+; CHECK: Global Symbols
+; CHECK-NEXT: ============================================================
+
+; CHECK: Public Symbols
+; CHECK-NEXT: ============================================================
+
+; CHECK: Symbols
+; CHECK-NEXT: ============================================================
+
+; CHECK: Section Headers
+; CHECK-NEXT: ============================================================
+
+; CHECK: Original Section Headers
+; CHECK-NEXT: ============================================================
+
+; CHECK: Section Contributions
+; CHECK-NEXT: ============================================================
+
+; CHECK: Section Map
+; CHECK-NEXT: ============================================================
+
+
diff --git a/test/tools/llvm-pdbdump/symbol-filters.test b/test/tools/llvm-pdbdump/symbol-filters.test
index 80c24baf17c..4091d1d65c8 100644
--- a/test/tools/llvm-pdbdump/symbol-filters.test
+++ b/test/tools/llvm-pdbdump/symbol-filters.test
@@ -1,3 +1,5 @@
+; REQUIRES: diasdk
+
 ; RUN: llvm-pdbutil pretty -globals -module-syms -sym-types=data %p/Inputs/FilterTest.pdb \
 ; RUN:    | FileCheck --check-prefix=ONLY_DATA %s
 
diff --git a/test/tools/llvm-pdbdump/type-qualifiers.test b/test/tools/llvm-pdbdump/type-qualifiers.test
index 0969c15873c..9c8827cc5da 100644
--- a/test/tools/llvm-pdbdump/type-qualifiers.test
+++ b/test/tools/llvm-pdbdump/type-qualifiers.test
@@ -1,3 +1,5 @@
+; REQUIRES: diasdk
+
 ; RUN: llvm-pdbutil pretty -all -class-recurse-depth=1 \
 ; RUN:   %p/Inputs/TypeQualifiersTest.pdb > %t
 ; RUN: FileCheck -input-file=%t %s -check-prefix=GLOBALS_FUNC
diff --git a/test/tools/llvm-pdbdump/usingnamespace.test b/test/tools/llvm-pdbdump/usingnamespace.test
index 954ec114010..d44b0cbf9e7 100644
--- a/test/tools/llvm-pdbdump/usingnamespace.test
+++ b/test/tools/llvm-pdbdump/usingnamespace.test
@@ -1,3 +1,5 @@
+; REQUIRES: diasdk
+
 ; RUN: llvm-pdbutil pretty -module-syms %p/Inputs/UsingNamespaceTest.pdb > %t
 ; RUN: FileCheck -input-file=%t %s
 
diff --git a/tools/llvm-pdbutil/DumpOutputStyle.cpp b/tools/llvm-pdbutil/DumpOutputStyle.cpp
index 7f309f48f2a..e4f6aa7f6ec 100644
--- a/tools/llvm-pdbutil/DumpOutputStyle.cpp
+++ b/tools/llvm-pdbutil/DumpOutputStyle.cpp
@@ -250,7 +250,7 @@ Error DumpOutputStyle::dumpFileSummary() {
 static StatCollection getSymbolStats(const SymbolGroup &SG,
                                      StatCollection &CumulativeStats) {
   StatCollection Stats;
-  if (SG.getFile().isPdb()) {
+  if (SG.getFile().isPdb() && SG.hasDebugStream()) {
     // For PDB files, all symbols are packed into one stream.
     for (const auto &S : SG.getPdbModuleStream().symbols(nullptr)) {
       Stats.update(S.kind(), S.length());
@@ -1420,19 +1420,21 @@ Error DumpOutputStyle::dumpTpiStream(uint32_t StreamIdx) {
       P.formatLine("TI: {0}, Offset: {1}", IO.Type, fmtle(IO.Offset));
     }
 
-    P.NewLine();
-    P.formatLine("Hash Adjusters:");
-    auto &Adjusters = Stream.getHashAdjusters();
-    auto &Strings = Err(getPdb().getStringTable());
-    for (const auto &A : Adjusters) {
-      AutoIndent Indent2(P);
-      auto ExpectedStr = Strings.getStringForID(A.first);
-      TypeIndex TI(A.second);
-      if (ExpectedStr)
-        P.formatLine("`{0}` -> {1}", *ExpectedStr, TI);
-      else {
-        P.formatLine("unknown str id ({0}) -> {1}", A.first, TI);
-        consumeError(ExpectedStr.takeError());
+    if (getPdb().hasPDBStringTable()) {
+      P.NewLine();
+      P.formatLine("Hash Adjusters:");
+      auto &Adjusters = Stream.getHashAdjusters();
+      auto &Strings = Err(getPdb().getStringTable());
+      for (const auto &A : Adjusters) {
+        AutoIndent Indent2(P);
+        auto ExpectedStr = Strings.getStringForID(A.first);
+        TypeIndex TI(A.second);
+        if (ExpectedStr)
+          P.formatLine("`{0}` -> {1}", *ExpectedStr, TI);
+        else {
+          P.formatLine("unknown str id ({0}) -> {1}", A.first, TI);
+          consumeError(ExpectedStr.takeError());
+        }
       }
     }
   }
diff --git a/tools/llvm-pdbutil/InputFile.cpp b/tools/llvm-pdbutil/InputFile.cpp
index b2019642b2b..8eb116cf0d8 100644
--- a/tools/llvm-pdbutil/InputFile.cpp
+++ b/tools/llvm-pdbutil/InputFile.cpp
@@ -116,10 +116,6 @@ static std::string formatChecksumKind(FileChecksumKind Kind) {
   return formatUnknownEnum(Kind);
 }
 
-static const DebugStringTableSubsectionRef &extractStringTable(PDBFile &File) {
-  return cantFail(File.getStringTable()).getStringTable();
-}
-
 template <typename... Args>
 static void formatInternal(LinePrinter &Printer, bool Append, Args &&... args) {
   if (Append)
@@ -168,8 +164,13 @@ void SymbolGroup::initializeForPdb(uint32_t Modi) {
 
   // PDB always uses the same string table, but each module has its own
   // checksums.  So we only set the strings if they're not already set.
-  if (!SC.hasStrings())
-    SC.setStrings(extractStringTable(File->pdb()));
+  if (!SC.hasStrings()) {
+    auto StringTable = File->pdb().getStringTable();
+    if (StringTable)
+      SC.setStrings(StringTable->getStringTable());
+    else
+      consumeError(StringTable.takeError());
+  }
 
   SC.resetChecksums();
   auto MDS = getModuleDebugStream(File->pdb(), Name, Modi);
diff --git a/tools/llvm-pdbutil/InputFile.h b/tools/llvm-pdbutil/InputFile.h
index 552f3a3b212..ee4e651c1e9 100644
--- a/tools/llvm-pdbutil/InputFile.h
+++ b/tools/llvm-pdbutil/InputFile.h
@@ -110,6 +110,8 @@ public:
   const InputFile &getFile() const { return *File; }
   InputFile &getFile() { return *File; }
 
+  bool hasDebugStream() const { return DebugStream != nullptr; }
+
 private:
   void initializeForPdb(uint32_t Modi);
   void updatePdbModi(uint32_t Modi);
-- 
GitLab


From 6892afb33a7162281ee8727071649133e905df8d Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Fri, 2 Nov 2018 18:14:24 +0000
Subject: [PATCH 0924/1116] [ValueTracking] add test for non-canonical shuffle;
 NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346025 91177308-0d34-0410-b5e6-96231b3b80d8
---
 unittests/Analysis/ValueTrackingTest.cpp | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/unittests/Analysis/ValueTrackingTest.cpp b/unittests/Analysis/ValueTrackingTest.cpp
index f7d715c6447..e66d8f77fd4 100644
--- a/unittests/Analysis/ValueTrackingTest.cpp
+++ b/unittests/Analysis/ValueTrackingTest.cpp
@@ -514,6 +514,29 @@ TEST(ValueTracking, ComputeNumSignBits_Shuffle) {
   EXPECT_EQ(ComputeNumSignBits(RVal, M->getDataLayout()), 1u);
 }
 
+// FIXME:
+// No guarantees for canonical IR in this analysis, so a shuffle element that
+// references an undef value means this can't return any extra information. 
+TEST(ValueTracking, ComputeNumSignBits_Shuffle2) {
+  StringRef Assembly = "define <2 x i32> @f(<2 x i1> %x) { "
+                       "  %sext = sext <2 x i1> %x to <2 x i32> "
+                       "  %val = shufflevector <2 x i32> %sext, <2 x i32> undef, <2 x i32> <i32 0, i32 2> "
+                       "  ret <2 x i32> %val "
+                       "} ";
+
+  LLVMContext Context;
+  SMDiagnostic Error;
+  auto M = parseAssemblyString(Assembly, Error, Context);
+  assert(M && "Bad assembly?");
+
+  auto *F = M->getFunction("f");
+  assert(F && "Bad assembly?");
+
+  auto *RVal =
+      cast<ReturnInst>(F->getEntryBlock().getTerminator())->getOperand(0);
+  EXPECT_EQ(ComputeNumSignBits(RVal, M->getDataLayout()), 32u);
+}
+
 TEST(ValueTracking, ComputeKnownBits) {
   StringRef Assembly = "define i32 @f(i32 %a, i32 %b) { "
                        "  %ash = mul i32 %a, 8 "
-- 
GitLab


From 7a01cbd4a152d774a29765291e0bc5dbd89a321b Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze@braunis.de>
Date: Fri, 2 Nov 2018 18:22:15 +0000
Subject: [PATCH 0925/1116] ARMExpandPseudoInsts: Fix CMP_SWAP expansion adding
 a kill flag to a def

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346026 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/ARM/ARMExpandPseudoInsts.cpp |  9 +++++----
 test/CodeGen/ARM/cmpxchg.mir            | 24 ++++++++++++++++++++++++
 2 files changed, 29 insertions(+), 4 deletions(-)
 create mode 100644 test/CodeGen/ARM/cmpxchg.mir

diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index b35a16b8a1e..eecd0a10dc7 100644
--- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -1030,10 +1030,10 @@ static void addExclusiveRegPair(MachineInstrBuilder &MIB, MachineOperand &Reg,
   if (IsThumb) {
     unsigned RegLo = TRI->getSubReg(Reg.getReg(), ARM::gsub_0);
     unsigned RegHi = TRI->getSubReg(Reg.getReg(), ARM::gsub_1);
-    MIB.addReg(RegLo, Flags | getKillRegState(Reg.isDead()));
-    MIB.addReg(RegHi, Flags | getKillRegState(Reg.isDead()));
+    MIB.addReg(RegLo, Flags);
+    MIB.addReg(RegHi, Flags);
   } else
-    MIB.addReg(Reg.getReg(), Flags | getKillRegState(Reg.isDead()));
+    MIB.addReg(Reg.getReg(), Flags);
 }
 
 /// Expand a 64-bit CMP_SWAP to an ldrexd/strexd loop.
@@ -1103,7 +1103,8 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB,
   //     bne .Lloadcmp
   unsigned STREXD = IsThumb ? ARM::t2STREXD : ARM::STREXD;
   MIB = BuildMI(StoreBB, DL, TII->get(STREXD), TempReg);
-  addExclusiveRegPair(MIB, New, 0, IsThumb, TRI);
+  unsigned Flags = getKillRegState(New.isDead());
+  addExclusiveRegPair(MIB, New, Flags, IsThumb, TRI);
   MIB.addReg(AddrReg).add(predOps(ARMCC::AL));
 
   unsigned CMPri = IsThumb ? ARM::t2CMPri : ARM::CMPri;
diff --git a/test/CodeGen/ARM/cmpxchg.mir b/test/CodeGen/ARM/cmpxchg.mir
new file mode 100644
index 00000000000..6ae7e637249
--- /dev/null
+++ b/test/CodeGen/ARM/cmpxchg.mir
@@ -0,0 +1,24 @@
+# RUN: llc -o - %s -mtriple=armv7-unknown-linux-gnu -verify-machineinstrs -run-pass=arm-pseudo | FileCheck %s
+---
+# CHECK-LABEL: name: func
+name: func
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $r0_r1, $r4_r5, $r3, $lr
+    dead early-clobber renamable $r0_r1, dead early-clobber renamable $r2 = CMP_SWAP_64 killed renamable $r3, killed renamable $r4_r5, renamable $r4_r5 :: (volatile load store monotonic monotonic 8)
+    ; CHECK: bb.0:
+    ; CHECK:   liveins: $r0_r1, $r4_r5, $r3, $lr
+    ; CHECK: bb.1:
+    ; CHEKC:   liveins: $r4_r5, $r3
+    ; CHECK:   $r0_r1 = LDREXD $r3, 14, $noreg
+    ; CHECK:   CMPrr killed $r0, $r4, 14, $noreg, implicit-def $cpsr
+    ; CHECK:   CMPrr killed $r1, $r5, 0, killed $cpsr, implicit-def $cpsr
+    ; CHECK:   Bcc %bb.3, 1, killed $cpsr
+    ; CHECK: bb.2:
+    ; CHECK:   liveins: $r4_r5, $r3
+    ; CHECK:   early-clobber $r2 = STREXD $r4_r5, $r3, 14, $noreg
+    ; CHECK:   CMPri killed $r2, 0, 14, $noreg, implicit-def $cpsr
+    ; CHECK:   Bcc %bb.1, 1, killed $cpsr
+    ; CHECK: bb.3:
+...
-- 
GitLab


From b11a4e59ccae824b5c73fd09d7e13fd94e392678 Mon Sep 17 00:00:00 2001
From: Jordan Rupprecht <rupprecht@google.com>
Date: Fri, 2 Nov 2018 18:25:41 +0000
Subject: [PATCH 0926/1116] [DebugInfo][InstMerge] Fix -debugify for phi node
 created by -mldst-motion

Summary:
-mldst-motion creates a new phi node without any debug info. Use the merged debug location from the incoming stores to fix this.

Fixes PR38177. The test case here is (somewhat) simplified from:

```
struct S {
  int foo;
  void fn(int bar);
};
void S::fn(int bar) {
  if (bar)
    foo = 1;
  else
    foo = 0;
}
```

Reviewers: dblaikie, gbedwell, aprantl, vsk

Reviewed By: vsk

Subscribers: vsk, JDevlieghere, llvm-commits

Tags: #debug-info

Differential Revision: https://reviews.llvm.org/D54019

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346027 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../Scalar/MergedLoadStoreMotion.cpp          |  1 +
 .../InstMerge/st_sink_check_debug.ll          | 32 +++++++++++++++++++
 2 files changed, 33 insertions(+)
 create mode 100644 test/Transforms/InstMerge/st_sink_check_debug.ll

diff --git a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
index 3464b759280..ee21feca8d2 100644
--- a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
+++ b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
@@ -211,6 +211,7 @@ PHINode *MergedLoadStoreMotion::getPHIOperand(BasicBlock *BB, StoreInst *S0,
 
   auto *NewPN = PHINode::Create(Opd1->getType(), 2, Opd2->getName() + ".sink",
                                 &BB->front());
+  NewPN->applyMergedLocation(S0->getDebugLoc(), S1->getDebugLoc());
   NewPN->addIncoming(Opd1, S0->getParent());
   NewPN->addIncoming(Opd2, S1->getParent());
   return NewPN;
diff --git a/test/Transforms/InstMerge/st_sink_check_debug.ll b/test/Transforms/InstMerge/st_sink_check_debug.ll
new file mode 100644
index 00000000000..94d46a58f4c
--- /dev/null
+++ b/test/Transforms/InstMerge/st_sink_check_debug.ll
@@ -0,0 +1,32 @@
+; RUN: opt < %s -S -debugify -mldst-motion -o - | FileCheck %s
+
+%struct.S = type { i32 }
+
+define dso_local void @foo(%struct.S* %this, i32 %bar) {
+entry:
+  %this.addr = alloca %struct.S*, align 8
+  %bar.addr = alloca i32, align 4
+  store %struct.S* %this, %struct.S** %this.addr, align 8
+  store i32 %bar, i32* %bar.addr, align 4
+  %this1 = load %struct.S*, %struct.S** %this.addr, align 8
+  %0 = load i32, i32* %bar.addr, align 4
+  %tobool = icmp ne i32 %0, 0
+  br i1 %tobool, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  %foo = getelementptr inbounds %struct.S, %struct.S* %this1, i32 0, i32 0
+  store i32 1, i32* %foo, align 4
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %foo2 = getelementptr inbounds %struct.S, %struct.S* %this1, i32 0, i32 0
+  store i32 0, i32* %foo2, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret void
+}
+
+; CHECK:      @foo
+; CHECK:      if.end: ; preds = %if.else, %if.then
+; CHECK-NEXT:   %.sink = phi {{.*}} !dbg
-- 
GitLab


From 088dcea9b165b0e5929d0fc89dfc1200dc3912f1 Mon Sep 17 00:00:00 2001
From: Heejin Ahn <aheejin@gmail.com>
Date: Fri, 2 Nov 2018 18:38:52 +0000
Subject: [PATCH 0927/1116] [WebAssembly] Fix bugs in rethrow depth counting
 and InstPrinter

Summary:
EH stack depth is incremented at `try` and decremented at `catch`. When
there are more than two catch instructions for a try instruction, we
shouldn't count non-first catches when calculating EH stack depths.

This patch fixes two bugs:
- CFGStackify: Exclude `catch_all` in the terminate catch pad when
  calculating EH pad stack, because when we have multiple catches for a
  try we should count only the first catch instruction when calculating
  EH pad stack.
- InstPrinter: The initial intention was also to exclude non-first
  catches, but it didn't account nested try-catches, so it failed on
  this case:
```
try
  try
  catch
  end
catch    <-- (1)
end
```
In the example, when we are at the catch (1), the last seen EH
instruction is not `try` but `end_try`, violating the wrong assumption.

We don't need these after we switch to the second proposal because there
is gonna be only one `catch` instruction. But anyway before then these
bugfixes are necessary for keep trunk in working state.

Reviewers: dschuff

Subscribers: sbc100, jgravelle-google, sunfish, llvm-commits

Differential Revision: https://reviews.llvm.org/D53819

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346029 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstPrinter/WebAssemblyInstPrinter.cpp        |  5 ++---
 lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp | 15 ++++++++++++++-
 test/CodeGen/WebAssembly/exception.ll             |  1 +
 3 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
index 6b97e14364f..e94faa1a214 100644
--- a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
+++ b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
@@ -120,10 +120,9 @@ void WebAssemblyInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
     case WebAssembly::CATCH_I64_S:
     case WebAssembly::CATCH_ALL:
     case WebAssembly::CATCH_ALL_S:
-      assert(LastSeenEHInst != END_TRY);
       // There can be multiple catch instructions for one try instruction, so we
-      // only print 'catch' label when the last seen EH instruction was 'try'.
-      if (LastSeenEHInst == TRY) {
+      // print a label only for the first 'catch' label.
+      if (LastSeenEHInst != CATCH) {
         assert(!EHPadStack.empty() && "try-catch mismatch!");
         printAnnotation(OS, "catch" + utostr(EHPadStack.pop_back_val()) + ':');
       }
diff --git a/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
index a3b3901f019..b1955017c68 100644
--- a/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
@@ -739,7 +739,20 @@ void WebAssemblyCFGStackify::rewriteDepthImmediates(MachineFunction &MF) {
       case WebAssembly::CATCH_I32:
       case WebAssembly::CATCH_I64:
       case WebAssembly::CATCH_ALL:
-        EHPadStack.push_back(&MBB);
+        // Currently the only case there are more than one catch for a try is
+        // for catch terminate pad, in the form of
+        //   try
+        //   catch
+        //     call @__clang_call_terminate
+        //     unreachable
+        //   catch_all
+        //     call @std::terminate
+        //     unreachable
+        //   end
+        // So we shouldn't push the current BB for the second catch_all block
+        // here.
+        if (!WebAssembly::isCatchAllTerminatePad(MBB))
+          EHPadStack.push_back(&MBB);
         break;
 
       case WebAssembly::LOOP:
diff --git a/test/CodeGen/WebAssembly/exception.ll b/test/CodeGen/WebAssembly/exception.ll
index 1714ad6dc40..bd7935c3684 100644
--- a/test/CodeGen/WebAssembly/exception.ll
+++ b/test/CodeGen/WebAssembly/exception.ll
@@ -1,5 +1,6 @@
 ; RUN: not llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers -exception-model=wasm
 ; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers -exception-model=wasm -mattr=+exception-handling | FileCheck -allow-deprecated-dag-overlap %s
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-keep-registers -exception-model=wasm -mattr=+exception-handling
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
 target triple = "wasm32-unknown-unknown"
-- 
GitLab


From 11e8afc67a208dfd1596c00888585c4fa293764f Mon Sep 17 00:00:00 2001
From: Heejin Ahn <aheejin@gmail.com>
Date: Fri, 2 Nov 2018 19:25:09 +0000
Subject: [PATCH 0928/1116] [WebAssembly] Change indices types to unsined int
 (NFC)

Summary:
This changes int types to unsigned int in a few places: function indices
and `wasm::Valtype` (which is unsigend int enum).  Currently these
values cannot have negative values anyway, so this should not be a
functional change for now.

Reviewers: sbc100

Subscribers: dschuff, jgravelle-google, sunfish, llvm-commits

Differential Revision: https://reviews.llvm.org/D54044

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346031 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/MC/WasmObjectWriter.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/MC/WasmObjectWriter.cpp b/lib/MC/WasmObjectWriter.cpp
index f9318ad5801..c1e0b7aa7ab 100644
--- a/lib/MC/WasmObjectWriter.cpp
+++ b/lib/MC/WasmObjectWriter.cpp
@@ -92,9 +92,9 @@ struct WasmFunctionTypeDenseMapInfo {
   static unsigned getHashValue(const WasmFunctionType &FuncTy) {
     uintptr_t Value = FuncTy.State;
     for (wasm::ValType Ret : FuncTy.Returns)
-      Value += DenseMapInfo<int32_t>::getHashValue(int32_t(Ret));
+      Value += DenseMapInfo<uint32_t>::getHashValue(uint32_t(Ret));
     for (wasm::ValType Param : FuncTy.Params)
-      Value += DenseMapInfo<int32_t>::getHashValue(int32_t(Param));
+      Value += DenseMapInfo<uint32_t>::getHashValue(uint32_t(Param));
     return Value;
   }
   static bool isEqual(const WasmFunctionType &LHS,
@@ -118,7 +118,7 @@ struct WasmDataSegment {
 
 // A wasm function to be written into the function section.
 struct WasmFunction {
-  int32_t Type;
+  uint32_t Type;
   const MCSymbolWasm *Sym;
 };
 
@@ -231,7 +231,7 @@ class WasmObjectWriter : public MCObjectWriter {
   // Map from section to defining function symbol.
   DenseMap<const MCSection *, const MCSymbol *> SectionFunctions;
 
-  DenseMap<WasmFunctionType, int32_t, WasmFunctionTypeDenseMapInfo>
+  DenseMap<WasmFunctionType, uint32_t, WasmFunctionTypeDenseMapInfo>
       FunctionTypeIndices;
   SmallVector<WasmFunctionType, 4> FunctionTypes;
   SmallVector<WasmGlobal, 4> Globals;
-- 
GitLab


From f15ca65fe6c7ee9e80247db5a88b24ffe2c79fd6 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 2 Nov 2018 19:39:41 +0000
Subject: [PATCH 0929/1116] [X86][AVX512] Change mask ops on vpermi2var tests
 to not use zeroinitializer.

This is necessary as I'm wanting to remove the 'Constant Pool' shuffle decoding from getTargetShuffleMask - but using getTargetShuffleMaskIndices allows the shuffle combiner to realize that these calls are really broadcasts.....

As with a lot of the X86ISD::VPERMV3 code this causes some vperm2i/vperm2t shuffles to flip depending on optimal commutation.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346032 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/avx512vbmi-intrinsics.ll   | 12 +++++------
 test/CodeGen/X86/avx512vbmivl-intrinsics.ll | 24 +++++++++------------
 2 files changed, 15 insertions(+), 21 deletions(-)

diff --git a/test/CodeGen/X86/avx512vbmi-intrinsics.ll b/test/CodeGen/X86/avx512vbmi-intrinsics.ll
index 80dc7fcd703..ffce664b8d6 100644
--- a/test/CodeGen/X86/avx512vbmi-intrinsics.ll
+++ b/test/CodeGen/X86/avx512vbmi-intrinsics.ll
@@ -77,9 +77,8 @@ define <64 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_512(<64 x i8> %x0, <64 x
 ; X86-NEXT:    vpermt2b %zmm2, %zmm1, %zmm3 # encoding: [0x62,0xf2,0x75,0x48,0x7d,0xda]
 ; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vpermi2b %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x75,0xca]
-; X86-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # encoding: [0xc5,0xd9,0xef,0xe4]
-; X86-NEXT:    vpermi2b %zmm2, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x75,0xe2]
-; X86-NEXT:    vpaddb %zmm3, %zmm4, %zmm0 # encoding: [0x62,0xf1,0x5d,0x48,0xfc,0xc3]
+; X86-NEXT:    vpermt2b %zmm2, %zmm3, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x65,0xc9,0x7d,0xc2]
+; X86-NEXT:    vpaddb %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc3]
 ; X86-NEXT:    vpaddb %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfc,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -89,15 +88,14 @@ define <64 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_512(<64 x i8> %x0, <64 x
 ; X64-NEXT:    vpermt2b %zmm2, %zmm1, %zmm3 # encoding: [0x62,0xf2,0x75,0x48,0x7d,0xda]
 ; X64-NEXT:    kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpermi2b %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x75,0xca]
-; X64-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # encoding: [0xc5,0xd9,0xef,0xe4]
-; X64-NEXT:    vpermi2b %zmm2, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x75,0xe2]
-; X64-NEXT:    vpaddb %zmm3, %zmm4, %zmm0 # encoding: [0x62,0xf1,0x5d,0x48,0xfc,0xc3]
+; X64-NEXT:    vpermt2b %zmm2, %zmm3, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x65,0xc9,0x7d,0xc2]
+; X64-NEXT:    vpaddb %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc3]
 ; X64-NEXT:    vpaddb %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfc,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %1 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2)
   %2 = bitcast i64 %x3 to <64 x i1>
   %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %x1
-  %4 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> zeroinitializer, <64 x i8> %x2)
+  %4 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %1, <64 x i8> %x2)
   %5 = bitcast i64 %x3 to <64 x i1>
   %6 = select <64 x i1> %5, <64 x i8> %4, <64 x i8> zeroinitializer
   %7 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2)
diff --git a/test/CodeGen/X86/avx512vbmivl-intrinsics.ll b/test/CodeGen/X86/avx512vbmivl-intrinsics.ll
index 272ac903a9a..79f32103ddd 100644
--- a/test/CodeGen/X86/avx512vbmivl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512vbmivl-intrinsics.ll
@@ -139,9 +139,8 @@ define <16 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_128(<16 x i8> %x0, <16 x
 ; X86-NEXT:    vpermt2b %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x75,0x08,0x7d,0xda]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vpermi2b %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x75,0xca]
-; X86-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; X86-NEXT:    vpermi2b %xmm2, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x75,0xe2]
-; X86-NEXT:    vpaddb %xmm3, %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xfc,0xc3]
+; X86-NEXT:    vpermt2b %xmm2, %xmm3, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x65,0x89,0x7d,0xc2]
+; X86-NEXT:    vpaddb %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc3]
 ; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -151,15 +150,14 @@ define <16 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_128(<16 x i8> %x0, <16 x
 ; X64-NEXT:    vpermt2b %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x75,0x08,0x7d,0xda]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpermi2b %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x75,0xca]
-; X64-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; X64-NEXT:    vpermi2b %xmm2, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x75,0xe2]
-; X64-NEXT:    vpaddb %xmm3, %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xfc,0xc3]
+; X64-NEXT:    vpermt2b %xmm2, %xmm3, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x65,0x89,0x7d,0xc2]
+; X64-NEXT:    vpaddb %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc3]
 ; X64-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %1 = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2)
   %2 = bitcast i16 %x3 to <16 x i1>
   %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %x1
-  %4 = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> zeroinitializer, <16 x i8> %x2)
+  %4 = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %1, <16 x i8> %x2)
   %5 = bitcast i16 %x3 to <16 x i1>
   %6 = select <16 x i1> %5, <16 x i8> %4, <16 x i8> zeroinitializer
   %7 = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2)
@@ -177,9 +175,8 @@ define <32 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_256(<32 x i8> %x0, <32 x
 ; X86-NEXT:    vpermt2b %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0x75,0x28,0x7d,0xda]
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vpermi2b %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x75,0xca]
-; X86-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; X86-NEXT:    vpermi2b %ymm2, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x75,0xe2]
-; X86-NEXT:    vpaddb %ymm3, %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xdd,0xfc,0xc3]
+; X86-NEXT:    vpermt2b %ymm2, %ymm3, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x65,0xa9,0x7d,0xc2]
+; X86-NEXT:    vpaddb %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc3]
 ; X86-NEXT:    vpaddb %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -189,15 +186,14 @@ define <32 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_256(<32 x i8> %x0, <32 x
 ; X64-NEXT:    vpermt2b %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0x75,0x28,0x7d,0xda]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpermi2b %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x75,0xca]
-; X64-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; X64-NEXT:    vpermi2b %ymm2, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x75,0xe2]
-; X64-NEXT:    vpaddb %ymm3, %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xdd,0xfc,0xc3]
+; X64-NEXT:    vpermt2b %ymm2, %ymm3, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x65,0xa9,0x7d,0xc2]
+; X64-NEXT:    vpaddb %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc3]
 ; X64-NEXT:    vpaddb %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %1 = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2)
   %2 = bitcast i32 %x3 to <32 x i1>
   %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %x1
-  %4 = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> zeroinitializer, <32 x i8> %x2)
+  %4 = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %1, <32 x i8> %x2)
   %5 = bitcast i32 %x3 to <32 x i1>
   %6 = select <32 x i1> %5, <32 x i8> %4, <32 x i8> zeroinitializer
   %7 = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2)
-- 
GitLab


From 584721e76a33c583943548098c001ec3b1f95ef5 Mon Sep 17 00:00:00 2001
From: Alex Bradbury <asb@lowrisc.org>
Date: Fri, 2 Nov 2018 19:50:38 +0000
Subject: [PATCH 0930/1116] [RISCV] Add some missing expansions for
 floating-point intrinsics
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A number of intrinsics, such as llvm.sin.f32, would result in a failure to
select. This patch adds expansions for the relevant selection DAG nodes, as
well as exhaustive testing for all f32 and f64 intrinsics.

The codegen for FMA remains a TODO item, pending support for the various
RISC-V FMA instruction variants.

The llvm.minimum.f32.* and llvm.maximum.* tests are commented-out, pending
upstream support for target-independent expansion, as discussed in
http://lists.llvm.org/pipermail/llvm-dev/2018-November/127408.html.

Differential Revision: https://reviews.llvm.org/D54034
Patch by Luís Marques.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346034 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/RISCV/RISCVISelLowering.cpp  |   9 +
 test/CodeGen/RISCV/double-intrinsics.ll | 398 +++++++++++++++++++++++-
 test/CodeGen/RISCV/float-intrinsics.ll  | 359 +++++++++++++++++++++
 3 files changed, 759 insertions(+), 7 deletions(-)
 create mode 100644 test/CodeGen/RISCV/float-intrinsics.ll

diff --git a/lib/Target/RISCV/RISCVISelLowering.cpp b/lib/Target/RISCV/RISCVISelLowering.cpp
index 5c347ca4684..85758c0cdf8 100644
--- a/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -111,6 +111,11 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
       ISD::SETUGT, ISD::SETUGE, ISD::SETULT, ISD::SETULE, ISD::SETUNE,
       ISD::SETGT,  ISD::SETGE,  ISD::SETNE};
 
+  // TODO: add proper support for the various FMA variants
+  // (FMADD.S, FMSUB.S, FNMSUB.S, FNMADD.S).
+  ISD::NodeType FPOpToExtend[] = {
+      ISD::FSIN, ISD::FCOS, ISD::FSINCOS, ISD::FPOW, ISD::FMA};
+
   if (Subtarget.hasStdExtF()) {
     setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
     setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
@@ -119,6 +124,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
     setOperationAction(ISD::SELECT, MVT::f32, Custom);
     setOperationAction(ISD::BR_CC, MVT::f32, Expand);
+    for (auto Op : FPOpToExtend)
+      setOperationAction(Op, MVT::f32, Expand);
   }
 
   if (Subtarget.hasStdExtD()) {
@@ -131,6 +138,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::BR_CC, MVT::f64, Expand);
     setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
     setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+    for (auto Op : FPOpToExtend)
+      setOperationAction(Op, MVT::f64, Expand);
   }
 
   setOperationAction(ISD::GlobalAddress, XLenVT, Custom);
diff --git a/test/CodeGen/RISCV/double-intrinsics.ll b/test/CodeGen/RISCV/double-intrinsics.ll
index 7d80d2cc8e0..4a5239f4f01 100644
--- a/test/CodeGen/RISCV/double-intrinsics.ll
+++ b/test/CodeGen/RISCV/double-intrinsics.ll
@@ -2,14 +2,323 @@
 ; RUN: llc -mtriple=riscv32 -mattr=+d -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefix=RV32IFD %s
 
-declare double @llvm.floor.f64(double)
+declare double @llvm.sqrt.f64(double)
+
+define double @sqrt_f64(double %a) {
+; RV32IFD-LABEL: sqrt_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw a0, 8(sp)
+; RV32IFD-NEXT:    sw a1, 12(sp)
+; RV32IFD-NEXT:    fld ft0, 8(sp)
+; RV32IFD-NEXT:    fsqrt.d ft0, ft0
+; RV32IFD-NEXT:    fsd ft0, 8(sp)
+; RV32IFD-NEXT:    lw a0, 8(sp)
+; RV32IFD-NEXT:    lw a1, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.sqrt.f64(double %a)
+	ret double %1
+}
+
+declare double @llvm.powi.f64(double, i32)
+
+define double @powi_f64(double %a, i32 %b) {
+; RV32IFD-LABEL: powi_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw ra, 12(sp)
+; RV32IFD-NEXT:    call __powidf2
+; RV32IFD-NEXT:    lw ra, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.powi.f64(double %a, i32 %b)
+	ret double %1
+}
+
+declare double @llvm.sin.f64(double)
+
+define double @sin_f64(double %a) {
+; RV32IFD-LABEL: sin_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw ra, 12(sp)
+; RV32IFD-NEXT:    call sin
+; RV32IFD-NEXT:    lw ra, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.sin.f64(double %a)
+	ret double %1
+}
+
+declare double @llvm.cos.f64(double)
+
+define double @cos_f64(double %a) {
+; RV32IFD-LABEL: cos_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw ra, 12(sp)
+; RV32IFD-NEXT:    call cos
+; RV32IFD-NEXT:    lw ra, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.cos.f64(double %a)
+	ret double %1
+}
+
+; The sin+cos combination results in an FSINCOS SelectionDAG node.
+define double @sincos_f64(double %a) {
+; RV32IFD-LABEL: sincos_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -32
+; RV32IFD-NEXT:    sw ra, 28(sp)
+; RV32IFD-NEXT:    sw s1, 24(sp)
+; RV32IFD-NEXT:    sw s2, 20(sp)
+; RV32IFD-NEXT:    sw s3, 16(sp)
+; RV32IFD-NEXT:    sw s4, 12(sp)
+; RV32IFD-NEXT:    mv s2, a1
+; RV32IFD-NEXT:    mv s1, a0
+; RV32IFD-NEXT:    call sin
+; RV32IFD-NEXT:    mv s3, a0
+; RV32IFD-NEXT:    mv s4, a1
+; RV32IFD-NEXT:    mv a0, s1
+; RV32IFD-NEXT:    mv a1, s2
+; RV32IFD-NEXT:    call cos
+; RV32IFD-NEXT:    sw a0, 0(sp)
+; RV32IFD-NEXT:    sw a1, 4(sp)
+; RV32IFD-NEXT:    fld ft0, 0(sp)
+; RV32IFD-NEXT:    sw s3, 0(sp)
+; RV32IFD-NEXT:    sw s4, 4(sp)
+; RV32IFD-NEXT:    fld ft1, 0(sp)
+; RV32IFD-NEXT:    fadd.d ft0, ft1, ft0
+; RV32IFD-NEXT:    fsd ft0, 0(sp)
+; RV32IFD-NEXT:    lw a0, 0(sp)
+; RV32IFD-NEXT:    lw a1, 4(sp)
+; RV32IFD-NEXT:    lw s4, 12(sp)
+; RV32IFD-NEXT:    lw s3, 16(sp)
+; RV32IFD-NEXT:    lw s2, 20(sp)
+; RV32IFD-NEXT:    lw s1, 24(sp)
+; RV32IFD-NEXT:    lw ra, 28(sp)
+; RV32IFD-NEXT:    addi sp, sp, 32
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.sin.f64(double %a)
+  %2 = call double @llvm.cos.f64(double %a)
+  %3 = fadd double %1, %2
+	ret double %3
+}
+
+declare double @llvm.pow.f64(double, double)
+
+define double @pow_f64(double %a, double %b) {
+; RV32IFD-LABEL: pow_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw ra, 12(sp)
+; RV32IFD-NEXT:    call pow
+; RV32IFD-NEXT:    lw ra, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.pow.f64(double %a, double %b)
+	ret double %1
+}
+
+declare double @llvm.exp.f64(double)
+
+define double @exp_f64(double %a) {
+; RV32IFD-LABEL: exp_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw ra, 12(sp)
+; RV32IFD-NEXT:    call exp
+; RV32IFD-NEXT:    lw ra, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.exp.f64(double %a)
+	ret double %1
+}
+
+declare double @llvm.exp2.f64(double)
+
+define double @exp2_f64(double %a) {
+; RV32IFD-LABEL: exp2_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw ra, 12(sp)
+; RV32IFD-NEXT:    call exp2
+; RV32IFD-NEXT:    lw ra, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.exp2.f64(double %a)
+	ret double %1
+}
+
+declare double @llvm.log.f64(double)
+
+define double @log_f64(double %a) {
+; RV32IFD-LABEL: log_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw ra, 12(sp)
+; RV32IFD-NEXT:    call log
+; RV32IFD-NEXT:    lw ra, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.log.f64(double %a)
+	ret double %1
+}
+
+declare double @llvm.log10.f64(double)
+
+define double @log10_f64(double %a) {
+; RV32IFD-LABEL: log10_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw ra, 12(sp)
+; RV32IFD-NEXT:    call log10
+; RV32IFD-NEXT:    lw ra, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.log10.f64(double %a)
+	ret double %1
+}
+
+declare double @llvm.log2.f64(double)
+
+define double @log2_f64(double %a) {
+; RV32IFD-LABEL: log2_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw ra, 12(sp)
+; RV32IFD-NEXT:    call log2
+; RV32IFD-NEXT:    lw ra, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.log2.f64(double %a)
+	ret double %1
+}
 
-; The call to ffloor is introduced very late, meaning this test case covers
-; aspects of passing f64 on RV32D soft-float that double-calling-conv.ll
-; doesn't.
+declare double @llvm.fma.f64(double, double, double)
 
-define double @foo(double %a) nounwind {
-; RV32IFD-LABEL: foo:
+; TODO: Select RISC-V FMA instruction.
+define double @fma_f64(double %a, double %b, double %c) {
+; RV32IFD-LABEL: fma_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw ra, 12(sp)
+; RV32IFD-NEXT:    call fma
+; RV32IFD-NEXT:    lw ra, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.fma.f64(double %a, double %b, double %c)
+	ret double %1
+}
+
+declare double @llvm.fabs.f64(double)
+
+define double @fabs_f64(double %a) {
+; RV32IFD-LABEL: fabs_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw a0, 8(sp)
+; RV32IFD-NEXT:    sw a1, 12(sp)
+; RV32IFD-NEXT:    fld ft0, 8(sp)
+; RV32IFD-NEXT:    fabs.d ft0, ft0
+; RV32IFD-NEXT:    fsd ft0, 8(sp)
+; RV32IFD-NEXT:    lw a0, 8(sp)
+; RV32IFD-NEXT:    lw a1, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.fabs.f64(double %a)
+	ret double %1
+}
+
+declare double @llvm.minnum.f64(double, double)
+
+define double @minnum_f64(double %a, double %b) nounwind {
+; RV32IFD-LABEL: minnum_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw a2, 8(sp)
+; RV32IFD-NEXT:    sw a3, 12(sp)
+; RV32IFD-NEXT:    fld ft0, 8(sp)
+; RV32IFD-NEXT:    sw a0, 8(sp)
+; RV32IFD-NEXT:    sw a1, 12(sp)
+; RV32IFD-NEXT:    fld ft1, 8(sp)
+; RV32IFD-NEXT:    fmin.d ft0, ft1, ft0
+; RV32IFD-NEXT:    fsd ft0, 8(sp)
+; RV32IFD-NEXT:    lw a0, 8(sp)
+; RV32IFD-NEXT:    lw a1, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.minnum.f64(double %a, double %b)
+  ret double %1
+}
+
+declare double @llvm.maxnum.f64(double, double)
+
+define double @maxnum_f64(double %a, double %b) nounwind {
+; RV32IFD-LABEL: maxnum_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw a2, 8(sp)
+; RV32IFD-NEXT:    sw a3, 12(sp)
+; RV32IFD-NEXT:    fld ft0, 8(sp)
+; RV32IFD-NEXT:    sw a0, 8(sp)
+; RV32IFD-NEXT:    sw a1, 12(sp)
+; RV32IFD-NEXT:    fld ft1, 8(sp)
+; RV32IFD-NEXT:    fmax.d ft0, ft1, ft0
+; RV32IFD-NEXT:    fsd ft0, 8(sp)
+; RV32IFD-NEXT:    lw a0, 8(sp)
+; RV32IFD-NEXT:    lw a1, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.maxnum.f64(double %a, double %b)
+  ret double %1
+}
+
+; TODO: FMINNAN and FMAXNAN aren't handled in
+; SelectionDAGLegalize::ExpandNode.
+
+; declare double @llvm.minimum.f64(double, double)
+
+; define double @fminimum_f64(double %a, double %b) nounwind {
+;   %1 = call double @llvm.minimum.f64(double %a, double %b)
+;   ret double %1
+; }
+
+; declare double @llvm.maximum.f64(double, double)
+
+; define double @fmaximum_f64(double %a, double %b) nounwind {
+;   %1 = call double @llvm.maximum.f64(double %a, double %b)
+;   ret double %1
+; }
+
+declare double @llvm.copysign.f64(double, double)
+
+define double @copysign_f64(double %a, double %b) nounwind {
+; RV32IFD-LABEL: copysign_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw a2, 8(sp)
+; RV32IFD-NEXT:    sw a3, 12(sp)
+; RV32IFD-NEXT:    fld ft0, 8(sp)
+; RV32IFD-NEXT:    sw a0, 8(sp)
+; RV32IFD-NEXT:    sw a1, 12(sp)
+; RV32IFD-NEXT:    fld ft1, 8(sp)
+; RV32IFD-NEXT:    fsgnj.d ft0, ft1, ft0
+; RV32IFD-NEXT:    fsd ft0, 8(sp)
+; RV32IFD-NEXT:    lw a0, 8(sp)
+; RV32IFD-NEXT:    lw a1, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.copysign.f64(double %a, double %b)
+  ret double %1
+}
+
+declare double @llvm.floor.f64(double)
+
+define double @floor_f64(double %a) {
+; RV32IFD-LABEL: floor_f64:
 ; RV32IFD:       # %bb.0:
 ; RV32IFD-NEXT:    addi sp, sp, -16
 ; RV32IFD-NEXT:    sw ra, 12(sp)
@@ -18,5 +327,80 @@ define double @foo(double %a) nounwind {
 ; RV32IFD-NEXT:    addi sp, sp, 16
 ; RV32IFD-NEXT:    ret
   %1 = call double @llvm.floor.f64(double %a)
-  ret double %1
+	ret double %1
+}
+
+declare double @llvm.ceil.f64(double)
+
+define double @ceil_f64(double %a) {
+; RV32IFD-LABEL: ceil_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw ra, 12(sp)
+; RV32IFD-NEXT:    call ceil
+; RV32IFD-NEXT:    lw ra, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.ceil.f64(double %a)
+	ret double %1
+}
+
+declare double @llvm.trunc.f64(double)
+
+define double @trunc_f64(double %a) {
+; RV32IFD-LABEL: trunc_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw ra, 12(sp)
+; RV32IFD-NEXT:    call trunc
+; RV32IFD-NEXT:    lw ra, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.trunc.f64(double %a)
+	ret double %1
+}
+
+declare double @llvm.rint.f64(double)
+
+define double @rint_f64(double %a) {
+; RV32IFD-LABEL: rint_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw ra, 12(sp)
+; RV32IFD-NEXT:    call rint
+; RV32IFD-NEXT:    lw ra, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.rint.f64(double %a)
+	ret double %1
+}
+
+declare double @llvm.nearbyint.f64(double)
+
+define double @nearbyint_f64(double %a) {
+; RV32IFD-LABEL: nearbyint_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw ra, 12(sp)
+; RV32IFD-NEXT:    call nearbyint
+; RV32IFD-NEXT:    lw ra, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.nearbyint.f64(double %a)
+	ret double %1
+}
+
+declare double @llvm.round.f64(double)
+
+define double @round_f64(double %a) {
+; RV32IFD-LABEL: round_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw ra, 12(sp)
+; RV32IFD-NEXT:    call round
+; RV32IFD-NEXT:    lw ra, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.round.f64(double %a)
+	ret double %1
 }
diff --git a/test/CodeGen/RISCV/float-intrinsics.ll b/test/CodeGen/RISCV/float-intrinsics.ll
new file mode 100644
index 00000000000..1da644f5f9e
--- /dev/null
+++ b/test/CodeGen/RISCV/float-intrinsics.ll
@@ -0,0 +1,359 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+f -verify-machineinstrs < %s \
+; RUN:   | FileCheck -check-prefix=RV32IF %s
+; RUN: llc -mtriple=riscv32 -mattr=+d -verify-machineinstrs < %s \
+; RUN:   | FileCheck -check-prefix=RV32IF %s
+
+declare float @llvm.sqrt.f32(float)
+
+define float @sqrt_f32(float %a) {
+; RV32IF-LABEL: sqrt_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    fmv.w.x ft0, a0
+; RV32IF-NEXT:    fsqrt.s ft0, ft0
+; RV32IF-NEXT:    fmv.x.w a0, ft0
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.sqrt.f32(float %a)
+	ret float %1
+}
+
+declare float @llvm.powi.f32(float, i32)
+
+define float @powi_f32(float %a, i32 %b) {
+; RV32IF-LABEL: powi_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    addi sp, sp, -16
+; RV32IF-NEXT:    sw ra, 12(sp)
+; RV32IF-NEXT:    call __powisf2
+; RV32IF-NEXT:    lw ra, 12(sp)
+; RV32IF-NEXT:    addi sp, sp, 16
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.powi.f32(float %a, i32 %b)
+	ret float %1
+}
+
+declare float @llvm.sin.f32(float)
+
+define float @sin_f32(float %a) {
+; RV32IF-LABEL: sin_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    addi sp, sp, -16
+; RV32IF-NEXT:    sw ra, 12(sp)
+; RV32IF-NEXT:    call sinf
+; RV32IF-NEXT:    lw ra, 12(sp)
+; RV32IF-NEXT:    addi sp, sp, 16
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.sin.f32(float %a)
+	ret float %1
+}
+
+declare float @llvm.cos.f32(float)
+
+define float @cos_f32(float %a) {
+; RV32IF-LABEL: cos_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    addi sp, sp, -16
+; RV32IF-NEXT:    sw ra, 12(sp)
+; RV32IF-NEXT:    call cosf
+; RV32IF-NEXT:    lw ra, 12(sp)
+; RV32IF-NEXT:    addi sp, sp, 16
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.cos.f32(float %a)
+	ret float %1
+}
+
+; The sin+cos combination results in an FSINCOS SelectionDAG node.
+define float @sincos_f32(float %a) {
+; RV32IF-LABEL: sincos_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    addi sp, sp, -16
+; RV32IF-NEXT:    sw ra, 12(sp)
+; RV32IF-NEXT:    sw s1, 8(sp)
+; RV32IF-NEXT:    sw s2, 4(sp)
+; RV32IF-NEXT:    mv s1, a0
+; RV32IF-NEXT:    call sinf
+; RV32IF-NEXT:    mv s2, a0
+; RV32IF-NEXT:    mv a0, s1
+; RV32IF-NEXT:    call cosf
+; RV32IF-NEXT:    fmv.w.x ft0, a0
+; RV32IF-NEXT:    fmv.w.x ft1, s2
+; RV32IF-NEXT:    fadd.s ft0, ft1, ft0
+; RV32IF-NEXT:    fmv.x.w a0, ft0
+; RV32IF-NEXT:    lw s2, 4(sp)
+; RV32IF-NEXT:    lw s1, 8(sp)
+; RV32IF-NEXT:    lw ra, 12(sp)
+; RV32IF-NEXT:    addi sp, sp, 16
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.sin.f32(float %a)
+  %2 = call float @llvm.cos.f32(float %a)
+  %3 = fadd float %1, %2
+	ret float %3
+}
+
+declare float @llvm.pow.f32(float, float)
+
+define float @pow_f32(float %a, float %b) {
+; RV32IF-LABEL: pow_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    addi sp, sp, -16
+; RV32IF-NEXT:    sw ra, 12(sp)
+; RV32IF-NEXT:    call powf
+; RV32IF-NEXT:    lw ra, 12(sp)
+; RV32IF-NEXT:    addi sp, sp, 16
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.pow.f32(float %a, float %b)
+	ret float %1
+}
+
+declare float @llvm.exp.f32(float)
+
+define float @exp_f32(float %a) {
+; RV32IF-LABEL: exp_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    addi sp, sp, -16
+; RV32IF-NEXT:    sw ra, 12(sp)
+; RV32IF-NEXT:    call expf
+; RV32IF-NEXT:    lw ra, 12(sp)
+; RV32IF-NEXT:    addi sp, sp, 16
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.exp.f32(float %a)
+	ret float %1
+}
+
+declare float @llvm.exp2.f32(float)
+
+define float @exp2_f32(float %a) {
+; RV32IF-LABEL: exp2_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    addi sp, sp, -16
+; RV32IF-NEXT:    sw ra, 12(sp)
+; RV32IF-NEXT:    call exp2f
+; RV32IF-NEXT:    lw ra, 12(sp)
+; RV32IF-NEXT:    addi sp, sp, 16
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.exp2.f32(float %a)
+	ret float %1
+}
+
+declare float @llvm.log.f32(float)
+
+define float @log_f32(float %a) {
+; RV32IF-LABEL: log_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    addi sp, sp, -16
+; RV32IF-NEXT:    sw ra, 12(sp)
+; RV32IF-NEXT:    call logf
+; RV32IF-NEXT:    lw ra, 12(sp)
+; RV32IF-NEXT:    addi sp, sp, 16
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.log.f32(float %a)
+	ret float %1
+}
+
+declare float @llvm.log10.f32(float)
+
+define float @log10_f32(float %a) {
+; RV32IF-LABEL: log10_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    addi sp, sp, -16
+; RV32IF-NEXT:    sw ra, 12(sp)
+; RV32IF-NEXT:    call log10f
+; RV32IF-NEXT:    lw ra, 12(sp)
+; RV32IF-NEXT:    addi sp, sp, 16
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.log10.f32(float %a)
+	ret float %1
+}
+
+declare float @llvm.log2.f32(float)
+
+define float @log2_f32(float %a) {
+; RV32IF-LABEL: log2_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    addi sp, sp, -16
+; RV32IF-NEXT:    sw ra, 12(sp)
+; RV32IF-NEXT:    call log2f
+; RV32IF-NEXT:    lw ra, 12(sp)
+; RV32IF-NEXT:    addi sp, sp, 16
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.log2.f32(float %a)
+	ret float %1
+}
+
+declare float @llvm.fma.f32(float, float, float)
+
+; TODO: Select RISC-V FMA instruction.
+define float @fma_f32(float %a, float %b, float %c) {
+; RV32IF-LABEL: fma_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    addi sp, sp, -16
+; RV32IF-NEXT:    sw ra, 12(sp)
+; RV32IF-NEXT:    call fmaf
+; RV32IF-NEXT:    lw ra, 12(sp)
+; RV32IF-NEXT:    addi sp, sp, 16
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.fma.f32(float %a, float %b, float %c)
+	ret float %1
+}
+
+declare float @llvm.fabs.f32(float)
+
+define float @fabs_f32(float %a) {
+; RV32IF-LABEL: fabs_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    lui a1, 524288
+; RV32IF-NEXT:    addi a1, a1, -1
+; RV32IF-NEXT:    and a0, a0, a1
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.fabs.f32(float %a)
+	ret float %1
+}
+
+declare float @llvm.minnum.f32(float, float)
+
+define float @minnum_f32(float %a, float %b) nounwind {
+; RV32IF-LABEL: minnum_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    fmv.w.x ft0, a1
+; RV32IF-NEXT:    fmv.w.x ft1, a0
+; RV32IF-NEXT:    fmin.s ft0, ft1, ft0
+; RV32IF-NEXT:    fmv.x.w a0, ft0
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.minnum.f32(float %a, float %b)
+  ret float %1
+}
+
+declare float @llvm.maxnum.f32(float, float)
+
+define float @maxnum_f32(float %a, float %b) nounwind {
+; RV32IF-LABEL: maxnum_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    fmv.w.x ft0, a1
+; RV32IF-NEXT:    fmv.w.x ft1, a0
+; RV32IF-NEXT:    fmax.s ft0, ft1, ft0
+; RV32IF-NEXT:    fmv.x.w a0, ft0
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.maxnum.f32(float %a, float %b)
+  ret float %1
+}
+
+; TODO: FMINNAN and FMAXNAN aren't handled in
+; SelectionDAGLegalize::ExpandNode.
+
+; declare float @llvm.minimum.f32(float, float)
+
+; define float @fminimum_f32(float %a, float %b) nounwind {
+;   %1 = call float @llvm.minimum.f32(float %a, float %b)
+;   ret float %1
+; }
+
+; declare float @llvm.maximum.f32(float, float)
+
+; define float @fmaximum_f32(float %a, float %b) nounwind {
+;   %1 = call float @llvm.maximum.f32(float %a, float %b)
+;   ret float %1
+; }
+
+declare float @llvm.copysign.f32(float, float)
+
+define float @copysign_f32(float %a, float %b) nounwind {
+; RV32IF-LABEL: copysign_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    fmv.w.x ft0, a1
+; RV32IF-NEXT:    fmv.w.x ft1, a0
+; RV32IF-NEXT:    fsgnj.s ft0, ft1, ft0
+; RV32IF-NEXT:    fmv.x.w a0, ft0
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.copysign.f32(float %a, float %b)
+  ret float %1
+}
+
+declare float @llvm.floor.f32(float)
+
+define float @floor_f32(float %a) {
+; RV32IF-LABEL: floor_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    addi sp, sp, -16
+; RV32IF-NEXT:    sw ra, 12(sp)
+; RV32IF-NEXT:    call floorf
+; RV32IF-NEXT:    lw ra, 12(sp)
+; RV32IF-NEXT:    addi sp, sp, 16
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.floor.f32(float %a)
+	ret float %1
+}
+
+declare float @llvm.ceil.f32(float)
+
+define float @ceil_f32(float %a) {
+; RV32IF-LABEL: ceil_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    addi sp, sp, -16
+; RV32IF-NEXT:    sw ra, 12(sp)
+; RV32IF-NEXT:    call ceilf
+; RV32IF-NEXT:    lw ra, 12(sp)
+; RV32IF-NEXT:    addi sp, sp, 16
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.ceil.f32(float %a)
+	ret float %1
+}
+
+declare float @llvm.trunc.f32(float)
+
+define float @trunc_f32(float %a) {
+; RV32IF-LABEL: trunc_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    addi sp, sp, -16
+; RV32IF-NEXT:    sw ra, 12(sp)
+; RV32IF-NEXT:    call truncf
+; RV32IF-NEXT:    lw ra, 12(sp)
+; RV32IF-NEXT:    addi sp, sp, 16
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.trunc.f32(float %a)
+	ret float %1
+}
+
+declare float @llvm.rint.f32(float)
+
+define float @rint_f32(float %a) {
+; RV32IF-LABEL: rint_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    addi sp, sp, -16
+; RV32IF-NEXT:    sw ra, 12(sp)
+; RV32IF-NEXT:    call rintf
+; RV32IF-NEXT:    lw ra, 12(sp)
+; RV32IF-NEXT:    addi sp, sp, 16
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.rint.f32(float %a)
+	ret float %1
+}
+
+declare float @llvm.nearbyint.f32(float)
+
+define float @nearbyint_f32(float %a) {
+; RV32IF-LABEL: nearbyint_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    addi sp, sp, -16
+; RV32IF-NEXT:    sw ra, 12(sp)
+; RV32IF-NEXT:    call nearbyintf
+; RV32IF-NEXT:    lw ra, 12(sp)
+; RV32IF-NEXT:    addi sp, sp, 16
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.nearbyint.f32(float %a)
+	ret float %1
+}
+
+declare float @llvm.round.f32(float)
+
+define float @round_f32(float %a) {
+; RV32IF-LABEL: round_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    addi sp, sp, -16
+; RV32IF-NEXT:    sw ra, 12(sp)
+; RV32IF-NEXT:    call roundf
+; RV32IF-NEXT:    lw ra, 12(sp)
+; RV32IF-NEXT:    addi sp, sp, 16
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.round.f32(float %a)
+	ret float %1
+}
-- 
GitLab


From 349754262574b697785dc1f68339b28fa9d5f0af Mon Sep 17 00:00:00 2001
From: Eli Friedman <efriedma@codeaurora.org>
Date: Fri, 2 Nov 2018 19:59:08 +0000
Subject: [PATCH 0931/1116] [AArch64] [Windows] Misc fixes for llvm-readobj
 -unwind.

Use getImageBase() helper to compute the image base. Fix various
offsets/addresses/masks so they're actually correct.

This allows decoding unwind info from DLLs, and unwind info from object
files containing multiple functions.

Differential Revision: https://reviews.llvm.org/D54015


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346036 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Support/ARMWinEH.h            |  4 +--
 test/tools/llvm-readobj/arm64-win-error1.s |  5 +--
 tools/llvm-readobj/ARMWinEHPrinter.cpp     | 38 +++++++++++-----------
 3 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/include/llvm/Support/ARMWinEH.h b/include/llvm/Support/ARMWinEH.h
index 4f05965ed25..60174503ad4 100644
--- a/include/llvm/Support/ARMWinEH.h
+++ b/include/llvm/Support/ARMWinEH.h
@@ -339,7 +339,7 @@ struct EpilogueScope {
     return ((ES & 0xff000000) >> 24);
   }
 
-  uint8_t EpilogueStartIndexAArch64() const {
+  uint16_t EpilogueStartIndexAArch64() const {
     return ((ES & 0xffc00000) >> 22);
   }
 };
@@ -428,7 +428,7 @@ struct ExceptionDataRecord {
 
 inline size_t HeaderWords(const ExceptionDataRecord &XR) {
   if (XR.isAArch64)
-    return (XR.Data[0] & 0xffc0000) ? 1 : 2;
+    return (XR.Data[0] & 0xffc00000) ? 1 : 2;
   return (XR.Data[0] & 0xff800000) ? 1 : 2;
 }
 }
diff --git a/test/tools/llvm-readobj/arm64-win-error1.s b/test/tools/llvm-readobj/arm64-win-error1.s
index ba59edf3dea..cd449efb550 100644
--- a/test/tools/llvm-readobj/arm64-win-error1.s
+++ b/test/tools/llvm-readobj/arm64-win-error1.s
@@ -7,6 +7,7 @@
 
 // CHECK:     Prologue [
 // CHECK:        0xdf                ; Bad opcode!
+// CHECK:        0xff                ; Bad opcode!
 // CHECK:        0xd600              ; stp x19, lr, [sp, #0]
 // CHECK:        0x01                ; sub sp, #16
 // CHECK:        0xe4                ; end
@@ -48,6 +49,6 @@
 	.long		0x10800012
 	.long 		0x8
 	.long 		0xe
-	.long 		0x100d6df
-	.long 		0xe3e3e3e4
+	.long 		0x00d6ffdf
+	.long 		0xe3e3e401
 
diff --git a/tools/llvm-readobj/ARMWinEHPrinter.cpp b/tools/llvm-readobj/ARMWinEHPrinter.cpp
index 56dd6c0aed4..eb575894db5 100644
--- a/tools/llvm-readobj/ARMWinEHPrinter.cpp
+++ b/tools/llvm-readobj/ARMWinEHPrinter.cpp
@@ -788,7 +788,7 @@ void Decoder::decodeOpcodes(ArrayRef<uint8_t> Opcodes, unsigned Offset,
       if ((isAArch64 && (DI >= array_lengthof(Ring64))) ||
           (!isAArch64 && (DI >= array_lengthof(Ring)))) {
         SW.startLine() << format("0x%02x                ; Bad opcode!\n",
-                                 Opcodes.data()[Offset]);
+                                 Opcodes.data()[OI]);
         ++OI;
         break;
       }
@@ -871,6 +871,8 @@ bool Decoder::dumpXDataRecord(const COFFObjectFile &COFF,
       SW.printNumber("EpilogueStartIndex",
                      isAArch64 ? ES.EpilogueStartIndexAArch64()
                                : ES.EpilogueStartIndexARM());
+      if (ES.ES & ~0xffc3ffff)
+        SW.printNumber("ReservedBits", (ES.ES >> 18) & 0xF);
 
       ListScope Opcodes(SW, "Opcodes");
       decodeOpcodes(XData.UnwindByteCode(),
@@ -887,10 +889,15 @@ bool Decoder::dumpXDataRecord(const COFFObjectFile &COFF,
                                + (XData.E() ? 0 : XData.EpilogueCount())
                                + XData.CodeWords();
 
-    ErrorOr<SymbolRef> Symbol =
-      getRelocatedSymbol(COFF, Section, HandlerOffset * sizeof(uint32_t));
+    ErrorOr<SymbolRef> Symbol = getRelocatedSymbol(
+        COFF, Section, Offset + HandlerOffset * sizeof(uint32_t));
     if (!Symbol)
       Symbol = getSymbol(COFF, Address, /*FunctionOnly=*/true);
+    if (!Symbol) {
+      ListScope EHS(SW, "ExceptionHandler");
+      SW.printString("Routine", "(null)");
+      return true;
+    }
 
     Expected<StringRef> Name = Symbol->getName();
     if (!Name) {
@@ -950,10 +957,7 @@ bool Decoder::dumpUnpackedEntry(const COFFObjectFile &COFF,
     }
     FunctionAddress = *FunctionAddressOrErr;
   } else {
-    const pe32_header *PEHeader;
-    if (COFF.getPE32Header(PEHeader))
-      return false;
-    FunctionAddress = PEHeader->ImageBase + RF.BeginAddress;
+    FunctionAddress = COFF.getImageBase() + RF.BeginAddress;
   }
 
   SW.printString("Function", formatSymbol(FunctionName, FunctionAddress));
@@ -988,22 +992,18 @@ bool Decoder::dumpUnpackedEntry(const COFFObjectFile &COFF,
     }
     section_iterator SI = *SIOrErr;
 
-    return dumpXDataRecord(COFF, *SI, FunctionAddress, Address);
+    // FIXME: Do we need to add an offset from the relocation?
+    return dumpXDataRecord(COFF, *SI, FunctionAddress,
+                           RF.ExceptionInformationRVA());
   } else {
-    const pe32_header *PEHeader;
-    if (COFF.getPE32Header(PEHeader))
-      return false;
-
-    uint64_t Address = PEHeader->ImageBase + RF.ExceptionInformationRVA();
+    uint64_t Address = COFF.getImageBase() + RF.ExceptionInformationRVA();
     SW.printString("ExceptionRecord", formatSymbol("", Address));
 
-    ErrorOr<SectionRef> Section =
-      getSectionContaining(COFF, RF.ExceptionInformationRVA());
+    ErrorOr<SectionRef> Section = getSectionContaining(COFF, Address);
     if (!Section)
       return false;
 
-    return dumpXDataRecord(COFF, *Section, FunctionAddress,
-                           RF.ExceptionInformationRVA());
+    return dumpXDataRecord(COFF, *Section, FunctionAddress, Address);
   }
 }
 
@@ -1073,8 +1073,8 @@ bool Decoder::dumpProcedureDataEntry(const COFFObjectFile &COFF,
   if (Entry.Flag() == RuntimeFunctionFlag::RFF_Unpacked)
     return dumpUnpackedEntry(COFF, Section, Offset, Index, Entry);
   if (isAArch64) {
-    llvm::errs() << "Packed unwind data not yet supported for ARM64\n";
-    return false;
+    SW.startLine() << "Packed unwind data not yet supported for ARM64\n";
+    return true;
   }
   return dumpPackedEntry(COFF, Section, Offset, Index, Entry);
 }
-- 
GitLab


From e29a02ac3f1cde9d83e0317c856c4b095a087325 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Fri, 2 Nov 2018 20:34:40 +0000
Subject: [PATCH 0932/1116] [DWARF] Fix typo, .gnu_index -> .gdb_index

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346039 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/DebugInfo/DWARF/DWARFContext.cpp        | 2 +-
 test/DebugInfo/dwarfdump-dump-gdbindex.test | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp
index a29c9c2f160..00e37d7b7c5 100644
--- a/lib/DebugInfo/DWARF/DWARFContext.cpp
+++ b/lib/DebugInfo/DWARF/DWARFContext.cpp
@@ -581,7 +581,7 @@ void DWARFContext::dump(
                              DObj->getStringDWOSection(), dwo_units(),
                              isLittleEndian(), getMaxDWOVersion());
 
-  if (shouldDump(Explicit, ".gnu_index", DIDT_ID_GdbIndex,
+  if (shouldDump(Explicit, ".gdb_index", DIDT_ID_GdbIndex,
                  DObj->getGdbIndexSection())) {
     getGdbIndex().dump(OS);
   }
diff --git a/test/DebugInfo/dwarfdump-dump-gdbindex.test b/test/DebugInfo/dwarfdump-dump-gdbindex.test
index cd5cd132d5d..2ff13eb4cf0 100644
--- a/test/DebugInfo/dwarfdump-dump-gdbindex.test
+++ b/test/DebugInfo/dwarfdump-dump-gdbindex.test
@@ -10,7 +10,7 @@ RUN: llvm-dwarfdump -gdb-index %p/Inputs/dwarfdump-gdbindex-v7.elf-x86-64 | File
 ; gcc version 5.3.1 20160413, GNU gold (GNU Binutils for Ubuntu 2.26) 1.11
 ; Info about gdb-index: https://sourceware.org/gdb/onlinedocs/gdb/Index-Section-Format.html
 
-; CHECK-LABEL: .gnu_index contents:
+; CHECK-LABEL: .gdb_index contents:
 ; CHECK: Version = 7
 
 ; CHECK:      CU list offset = 0x18, has 2 entries:
-- 
GitLab


From 83e97cbe0d2105582fd8af5f0977f3c34a70a70a Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Fri, 2 Nov 2018 21:09:49 +0000
Subject: [PATCH 0933/1116] [X86] Don't emit *_extend_vector_inreg nodes when
 both the input and output types are legal with AVX1

We already have custom lowering for the AVX case in LegalizeVectorOps. So its better to keep the regular extend op around as long as possible.

I had to qualify one place in DAG combine that created illegal vector extending load operations. This change by itself had no effect on any tests which is why its included here.

I've made a few cleanups to the custom lowering. The sign extend code no longer creates an identity shuffle with undef elements. The zero extend code now emits a zero_extend_vector_inreg instead of an unpckl with a zero vector.

For the high half of the custom lowering of zero_extend/any_extend, we're now using an unpckh with a zero vector or undef. Previously we used used a pshufd to move the upper 64-bits to the lower 64-bits and then used a zero_extend_vector_inreg. I think the zero vector should require less execution resources and be smaller code size.

Differential Revision: https://reviews.llvm.org/D54024

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346043 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp |   2 +-
 lib/Target/X86/X86ISelLowering.cpp       |  45 ++-
 test/CodeGen/X86/avg.ll                  | 332 +++++++++++------------
 test/CodeGen/X86/cast-vsel.ll            |  13 +-
 test/CodeGen/X86/madd.ll                 |   9 +-
 test/CodeGen/X86/psubus.ll               |  96 +++----
 test/CodeGen/X86/shrink_vmul.ll          |  84 +++---
 test/CodeGen/X86/v8i1-masks.ll           |  16 +-
 test/CodeGen/X86/vec_cast2.ll            |  18 +-
 test/CodeGen/X86/vec_int_to_fp.ll        |  18 +-
 test/CodeGen/X86/vector-pcmp.ll          |   6 +-
 test/CodeGen/X86/vector-zext.ll          |  24 +-
 12 files changed, 315 insertions(+), 348 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 8c2f9e8d1f4..f318b7fdb39 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -8391,7 +8391,7 @@ static SDValue tryToFoldExtOfExtload(SelectionDAG &DAG, DAGCombiner &Combiner,
 
   LoadSDNode *LN0 = cast<LoadSDNode>(N0);
   EVT MemVT = LN0->getMemoryVT();
-  if ((LegalOperations || LN0->isVolatile()) &&
+  if ((LegalOperations || LN0->isVolatile() || VT.isVector()) &&
       !TLI.isLoadExtLegal(ExtLoadType, VT, MemVT))
     return {};
 
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index d95f72035e0..57e4cba9078 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -17446,27 +17446,26 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
   // Optimize vectors in AVX mode:
   //
   //   v8i16 -> v8i32
-  //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
+  //   Use vpmovzwd for 4 lower elements  v8i16 -> v4i32.
   //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
   //   Concat upper and lower parts.
   //
   //   v4i32 -> v4i64
-  //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
+  //   Use vpmovzdq for 4 lower elements  v4i32 -> v2i64.
   //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
   //   Concat upper and lower parts.
   //
 
-  SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
+  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
+                                VT.getVectorNumElements() / 2);
+
+  SDValue OpLo = DAG.getZeroExtendVectorInReg(In, dl, HalfVT);
+
+  SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
   SDValue Undef = DAG.getUNDEF(InVT);
   bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
-  SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
   SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
-
-  MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
-                             VT.getVectorNumElements()/2);
-
-  OpLo = DAG.getBitcast(HVT, OpLo);
-  OpHi = DAG.getBitcast(HVT, OpHi);
+  OpHi = DAG.getBitcast(HalfVT, OpHi);
 
   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
 }
@@ -19878,29 +19877,21 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
   //              v4i32 to v4i64
   //
   // Divide input vector into two parts
-  // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
+  // for v4i32 the high shuffle mask will be {2, 3, -1, -1}
   // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
   // concat the vectors to original VT
 
-  unsigned NumElems = InVT.getVectorNumElements();
-  SDValue Undef = DAG.getUNDEF(InVT);
-
-  SmallVector<int,8> ShufMask1(NumElems, -1);
-  for (unsigned i = 0; i != NumElems/2; ++i)
-    ShufMask1[i] = i;
+  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
+                                VT.getVectorNumElements() / 2);
 
-  SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);
+  SDValue OpLo = DAG.getSignExtendVectorInReg(In, dl, HalfVT);
 
-  SmallVector<int,8> ShufMask2(NumElems, -1);
+  unsigned NumElems = InVT.getVectorNumElements();
+  SmallVector<int,8> ShufMask(NumElems, -1);
   for (unsigned i = 0; i != NumElems/2; ++i)
-    ShufMask2[i] = i + NumElems/2;
-
-  SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);
-
-  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
-                                VT.getVectorNumElements() / 2);
+    ShufMask[i] = i + NumElems/2;
 
-  OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT);
+  SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
   OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT);
 
   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
@@ -38323,7 +38314,7 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
 
   // On AVX2+ targets, if the input/output types are both legal then we will be
   // able to use SIGN_EXTEND/ZERO_EXTEND directly.
-  if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+  if (DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
       DAG.getTargetLoweringInfo().isTypeLegal(InVT))
     return SDValue();
 
diff --git a/test/CodeGen/X86/avg.ll b/test/CodeGen/X86/avg.ll
index 84f1296d51c..f090585951b 100644
--- a/test/CodeGen/X86/avg.ll
+++ b/test/CodeGen/X86/avg.ll
@@ -2142,243 +2142,231 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX1-NEXT:    pushq %r12
 ; AVX1-NEXT:    pushq %rbx
 ; AVX1-NEXT:    subq $24, %rsp
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm1[2],xmm5[3],xmm1[3]
+; AVX1-NEXT:    vpextrq $1, %xmm6, %rdi
+; AVX1-NEXT:    vmovq %xmm6, %rbp
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
 ; AVX1-NEXT:    vpextrq $1, %xmm5, %rbx
-; AVX1-NEXT:    vmovq %xmm5, %rbp
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm4, %rsi
-; AVX1-NEXT:    vmovq %xmm4, %rcx
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm4, %r8
-; AVX1-NEXT:    vmovq %xmm4, %r11
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm3, %r13
-; AVX1-NEXT:    vmovq %xmm3, %r12
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm4, %r15
-; AVX1-NEXT:    vmovq %xmm4, %rdi
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX1-NEXT:    vmovq %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX1-NEXT:    vmovq %xmm3, %r10
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm4, %rdx
-; AVX1-NEXT:    addq %rbx, %rdx
-; AVX1-NEXT:    vmovq %xmm4, %r9
-; AVX1-NEXT:    addq %rbp, %r9
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm3, %rax
-; AVX1-NEXT:    addq %rsi, %rax
-; AVX1-NEXT:    movq %rax, %r14
-; AVX1-NEXT:    vmovq %xmm3, %rbp
-; AVX1-NEXT:    addq %rcx, %rbp
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT:    vmovq %xmm5, %rsi
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; AVX1-NEXT:    vpextrq $1, %xmm5, %rdx
+; AVX1-NEXT:    vmovq %xmm5, %rcx
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm1[2],xmm5[3],xmm1[3]
+; AVX1-NEXT:    vpextrq $1, %xmm6, %r13
+; AVX1-NEXT:    vmovq %xmm6, %r12
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
+; AVX1-NEXT:    vpextrq $1, %xmm5, %r11
+; AVX1-NEXT:    vmovq %xmm5, %r14
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm3, %rsi
-; AVX1-NEXT:    addq %r8, %rsi
-; AVX1-NEXT:    vmovq %xmm3, %rax
-; AVX1-NEXT:    addq %r11, %rax
-; AVX1-NEXT:    movq %rax, %r11
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX1-NEXT:    vpextrq $1, %xmm5, %r9
+; AVX1-NEXT:    vmovq %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX1-NEXT:    vpextrq $1, %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX1-NEXT:    vmovq %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX1-NEXT:    addq %r13, %rax
-; AVX1-NEXT:    movq %rax, %rcx
-; AVX1-NEXT:    vmovq %xmm2, %rax
-; AVX1-NEXT:    addq %r12, %rax
-; AVX1-NEXT:    movq %rax, %r8
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm3, %rax
-; AVX1-NEXT:    addq %r15, %rax
-; AVX1-NEXT:    movq %rax, %rbx
-; AVX1-NEXT:    vmovq %xmm3, %rax
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; AVX1-NEXT:    vpextrq $1, %xmm5, %rax
 ; AVX1-NEXT:    addq %rdi, %rax
+; AVX1-NEXT:    movq %rax, %rdi
+; AVX1-NEXT:    vmovq %xmm5, %rax
+; AVX1-NEXT:    addq %rbp, %rax
+; AVX1-NEXT:    movq %rax, %rbp
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX1-NEXT:    vpextrq $1, %xmm4, %r15
+; AVX1-NEXT:    addq %rbx, %r15
+; AVX1-NEXT:    vmovq %xmm4, %r10
+; AVX1-NEXT:    addq %rsi, %r10
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; AVX1-NEXT:    vpextrq $1, %xmm4, %rax
+; AVX1-NEXT:    addq %rdx, %rax
+; AVX1-NEXT:    movq %rax, %rdx
+; AVX1-NEXT:    vmovq %xmm4, %r8
+; AVX1-NEXT:    addq %rcx, %r8
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; AVX1-NEXT:    vpextrq $1, %xmm5, %rcx
+; AVX1-NEXT:    addq %r13, %rcx
+; AVX1-NEXT:    vmovq %xmm5, %rax
+; AVX1-NEXT:    addq %r12, %rax
 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX1-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX1-NEXT:    vpextrq $1, %xmm4, %rax
+; AVX1-NEXT:    addq %r11, %rax
 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    vmovq %xmm2, %rax
-; AVX1-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; AVX1-NEXT:    vmovq %xmm4, %rax
+; AVX1-NEXT:    addq %r14, %rax
 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm2, %rax
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX1-NEXT:    addq %r9, %rax
+; AVX1-NEXT:    movq %rax, %r13
+; AVX1-NEXT:    vmovq %xmm1, %rbx
+; AVX1-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero
+; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
 ; AVX1-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    vmovq %xmm2, %r12
-; AVX1-NEXT:    addq %r10, %r12
-; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm0, %r10
-; AVX1-NEXT:    addq %rax, %r10
 ; AVX1-NEXT:    vmovq %xmm1, %rax
-; AVX1-NEXT:    vmovq %xmm0, %rdi
-; AVX1-NEXT:    addq %rax, %rdi
-; AVX1-NEXT:    addq $-1, %rdx
-; AVX1-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    movl $0, %eax
-; AVX1-NEXT:    adcq $-1, %rax
+; AVX1-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    addq $-1, %r9
-; AVX1-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    vpextrq $1, %xmm2, %rax
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rsi
+; AVX1-NEXT:    addq %rax, %rsi
+; AVX1-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    vmovq %xmm2, %rax
+; AVX1-NEXT:    vmovq %xmm0, %rsi
+; AVX1-NEXT:    addq %rax, %rsi
+; AVX1-NEXT:    addq $-1, %rdi
+; AVX1-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX1-NEXT:    movl $0, %eax
 ; AVX1-NEXT:    adcq $-1, %rax
 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    addq $-1, %r14
-; AVX1-NEXT:    movq %r14, (%rsp) # 8-byte Spill
+; AVX1-NEXT:    addq $-1, %rbp
+; AVX1-NEXT:    movq %rbp, (%rsp) # 8-byte Spill
 ; AVX1-NEXT:    movl $0, %eax
 ; AVX1-NEXT:    adcq $-1, %rax
 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    addq $-1, %rbp
-; AVX1-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    addq $-1, %r15
+; AVX1-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX1-NEXT:    movl $0, %eax
 ; AVX1-NEXT:    adcq $-1, %rax
 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    addq $-1, %rsi
-; AVX1-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    addq $-1, %r10
+; AVX1-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX1-NEXT:    movl $0, %eax
 ; AVX1-NEXT:    adcq $-1, %rax
 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    addq $-1, %r11
-; AVX1-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    addq $-1, %rdx
+; AVX1-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX1-NEXT:    movl $0, %eax
 ; AVX1-NEXT:    adcq $-1, %rax
 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    addq $-1, %rcx
-; AVX1-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    movl $0, %ebp
-; AVX1-NEXT:    adcq $-1, %rbp
 ; AVX1-NEXT:    addq $-1, %r8
 ; AVX1-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    movl $0, %r15d
-; AVX1-NEXT:    adcq $-1, %r15
-; AVX1-NEXT:    addq $-1, %rbx
-; AVX1-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    movl $0, %r12d
+; AVX1-NEXT:    adcq $-1, %r12
+; AVX1-NEXT:    addq $-1, %rcx
+; AVX1-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX1-NEXT:    movl $0, %eax
 ; AVX1-NEXT:    adcq $-1, %rax
-; AVX1-NEXT:    movq %rax, %rsi
+; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX1-NEXT:    addq $-1, %rax
+; AVX1-NEXT:    movl $0, %ecx
+; AVX1-NEXT:    adcq $-1, %rcx
 ; AVX1-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX1-NEXT:    movl $0, %r13d
-; AVX1-NEXT:    adcq $-1, %r13
+; AVX1-NEXT:    movl $0, %edx
+; AVX1-NEXT:    adcq $-1, %rdx
 ; AVX1-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX1-NEXT:    movl $0, %r15d
+; AVX1-NEXT:    adcq $-1, %r15
+; AVX1-NEXT:    addq $-1, %r13
 ; AVX1-NEXT:    movl $0, %r14d
 ; AVX1-NEXT:    adcq $-1, %r14
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX1-NEXT:    addq $-1, %rdx
+; AVX1-NEXT:    addq $-1, %rbx
 ; AVX1-NEXT:    movl $0, %r11d
 ; AVX1-NEXT:    adcq $-1, %r11
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX1-NEXT:    addq $-1, %rax
-; AVX1-NEXT:    movl $0, %ebx
-; AVX1-NEXT:    adcq $-1, %rbx
-; AVX1-NEXT:    addq $-1, %r12
-; AVX1-NEXT:    movl $0, %r9d
-; AVX1-NEXT:    adcq $-1, %r9
-; AVX1-NEXT:    addq $-1, %r10
+; AVX1-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 ; AVX1-NEXT:    movl $0, %r8d
 ; AVX1-NEXT:    adcq $-1, %r8
-; AVX1-NEXT:    addq $-1, %rdi
-; AVX1-NEXT:    movl $0, %ecx
-; AVX1-NEXT:    adcq $-1, %rcx
-; AVX1-NEXT:    shldq $63, %rdi, %rcx
+; AVX1-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX1-NEXT:    movl $0, %edi
+; AVX1-NEXT:    adcq $-1, %rdi
+; AVX1-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX1-NEXT:    movl $0, %r10d
+; AVX1-NEXT:    adcq $-1, %r10
+; AVX1-NEXT:    movq %rsi, %rbp
+; AVX1-NEXT:    addq $-1, %rbp
+; AVX1-NEXT:    movl $0, %r9d
+; AVX1-NEXT:    adcq $-1, %r9
+; AVX1-NEXT:    shldq $63, %rbx, %r11
+; AVX1-NEXT:    shldq $63, %r13, %r14
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX1-NEXT:    shldq $63, %rbx, %r15
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX1-NEXT:    shldq $63, %rbx, %rdx
+; AVX1-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    shldq $63, %rax, %rcx
 ; AVX1-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    shldq $63, %r10, %r8
-; AVX1-NEXT:    shldq $63, %r12, %r9
-; AVX1-NEXT:    shldq $63, %rax, %rbx
-; AVX1-NEXT:    shldq $63, %rdx, %r11
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %rdx, %r14
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %rdx, %r13
 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %rax, %rsi
-; AVX1-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %rax, %r15
+; AVX1-NEXT:    shldq $63, %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %rax, %rbp
+; AVX1-NEXT:    shldq $63, %rax, %r12
 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; AVX1-NEXT:    shldq $63, %rax, %rsi
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX1-NEXT:    shldq $63, %rax, %rdx
 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; AVX1-NEXT:    shldq $63, %rax, %rcx
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %rax, %rdi
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; AVX1-NEXT:    movq (%rsp), %rax # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %rax, %r12
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %rax, %r10
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %rdx, %rax
-; AVX1-NEXT:    vmovq %rax, %xmm8
-; AVX1-NEXT:    vmovq %r10, %xmm0
-; AVX1-NEXT:    vmovq %r12, %xmm1
-; AVX1-NEXT:    vmovq %rdi, %xmm11
-; AVX1-NEXT:    vmovq %rcx, %xmm2
-; AVX1-NEXT:    vmovq %rsi, %xmm13
-; AVX1-NEXT:    vmovq %rbp, %xmm14
-; AVX1-NEXT:    vmovq %r15, %xmm15
-; AVX1-NEXT:    vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 8-byte Folded Reload
+; AVX1-NEXT:    movq (%rsp), %rbx # 8-byte Reload
+; AVX1-NEXT:    shldq $63, %rbx, %rax
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX1-NEXT:    shldq $63, %r13, %rbx
+; AVX1-NEXT:    shldq $63, %rbp, %r9
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; AVX1-NEXT:    shldq $63, %rbp, %r10
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; AVX1-NEXT:    shldq $63, %rbp, %rdi
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; AVX1-NEXT:    shldq $63, %rbp, %r8
+; AVX1-NEXT:    vmovq %rbx, %xmm8
+; AVX1-NEXT:    vmovq %rax, %xmm0
+; AVX1-NEXT:    vmovq %rcx, %xmm1
+; AVX1-NEXT:    vmovq %rdx, %xmm11
+; AVX1-NEXT:    vmovq %rsi, %xmm2
+; AVX1-NEXT:    vmovq %r12, %xmm13
+; AVX1-NEXT:    vmovq %r8, %xmm14
+; AVX1-NEXT:    vmovq %rdi, %xmm15
+; AVX1-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 8-byte Reload
 ; AVX1-NEXT:    # xmm9 = mem[0],zero
-; AVX1-NEXT:    vmovq %r13, %xmm10
-; AVX1-NEXT:    vmovq %r14, %xmm12
-; AVX1-NEXT:    vmovq %r11, %xmm3
-; AVX1-NEXT:    vmovq %rbx, %xmm4
-; AVX1-NEXT:    vmovq %r9, %xmm5
-; AVX1-NEXT:    vmovq %r8, %xmm6
-; AVX1-NEXT:    vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 8-byte Folded Reload
-; AVX1-NEXT:    # xmm7 = mem[0],zero
+; AVX1-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 8-byte Reload
+; AVX1-NEXT:    # xmm10 = mem[0],zero
+; AVX1-NEXT:    vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 8-byte Folded Reload
+; AVX1-NEXT:    # xmm12 = mem[0],zero
+; AVX1-NEXT:    vmovq %r15, %xmm3
+; AVX1-NEXT:    vmovq %r14, %xmm4
+; AVX1-NEXT:    vmovq %r11, %xmm5
+; AVX1-NEXT:    vmovq %r10, %xmm6
+; AVX1-NEXT:    vmovq %r9, %xmm7
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm8 = xmm0[0],xmm8[0]
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm11[0],xmm1[0]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm8 = xmm8[0,2],xmm0[0,2]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm8 = xmm0[0,2],xmm8[0,2]
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm13[0],xmm2[0]
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm15[0],xmm14[0]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm11 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm11 = xmm1[0,2],xmm0[0,2]
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; AVX1-NEXT:    vpshufb %xmm1, %xmm8, %xmm0
 ; AVX1-NEXT:    vpshufb %xmm1, %xmm11, %xmm2
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm10[0],xmm9[0]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX1-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm10[0],xmm9[0]
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm12[0]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm3[0,2],xmm2[0,2]
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm5[0],xmm4[0]
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
 ; AVX1-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm6[0]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm5[0,2]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm3 = xmm5[0,2],xmm3[0,2]
 ; AVX1-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; AVX1-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
diff --git a/test/CodeGen/X86/cast-vsel.ll b/test/CodeGen/X86/cast-vsel.ll
index b1e4243d01a..03efb540216 100644
--- a/test/CodeGen/X86/cast-vsel.ll
+++ b/test/CodeGen/X86/cast-vsel.ll
@@ -93,15 +93,14 @@ define <8 x i32> @zext(<8 x float> %a, <8 x float> %b, <8 x i16> %c, <8 x i16> %
 ; AVX1-LABEL: zext:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vcmpltps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT:    vblendvps %ymm0, %ymm1, %ymm2, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
+; AVX1-NEXT:    vblendvps %ymm0, %ymm2, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: zext:
diff --git a/test/CodeGen/X86/madd.ll b/test/CodeGen/X86/madd.ll
index 4cb6daeec1a..9d29aa08cab 100644
--- a/test/CodeGen/X86/madd.ll
+++ b/test/CodeGen/X86/madd.ll
@@ -2061,12 +2061,11 @@ define <4 x i32> @pmaddwd_negative1(<8 x i16> %A, <8 x i16> %B) {
 ;
 ; AVX1-LABEL: pmaddwd_negative1:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; AVX1-NEXT:    vpmulld %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX1-NEXT:    vpmulld %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
 ; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vphaddd %xmm2, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/psubus.ll b/test/CodeGen/X86/psubus.ll
index 6f4a3812ffa..6e2e97980c7 100644
--- a/test/CodeGen/X86/psubus.ll
+++ b/test/CodeGen/X86/psubus.ll
@@ -547,25 +547,25 @@ define <8 x i16> @test13(<8 x i16> %x, <8 x i32> %y) nounwind {
 ;
 ; AVX1-LABEL: test13:
 ; AVX1:       # %bb.0: # %vector.ph
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT:    vpmaxud %xmm5, %xmm2, %xmm6
-; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm2, %xmm6
-; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm4
-; AVX1-NEXT:    vpackssdw %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpsubd %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpmaxud %xmm3, %xmm2, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm2, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm6
+; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm0, %xmm6
+; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpackssdw %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpsubd %xmm3, %xmm2, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT:    vpandn %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpandn %xmm0, %xmm4, %xmm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
@@ -930,25 +930,25 @@ define <8 x i16> @test15(<8 x i16> %x, <8 x i32> %y) nounwind {
 ;
 ; AVX1-LABEL: test15:
 ; AVX1:       # %bb.0: # %vector.ph
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT:    vpminud %xmm5, %xmm2, %xmm6
-; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm2, %xmm6
-; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm4
-; AVX1-NEXT:    vpackssdw %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpsubd %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpminud %xmm3, %xmm2, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm2, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm6
+; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm0, %xmm6
+; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpackssdw %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpsubd %xmm3, %xmm2, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT:    vpand %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpand %xmm0, %xmm4, %xmm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
@@ -1064,25 +1064,25 @@ define <8 x i16> @test16(<8 x i16> %x, <8 x i32> %y) nounwind {
 ;
 ; AVX1-LABEL: test16:
 ; AVX1:       # %bb.0: # %vector.ph
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vpmaxud %xmm0, %xmm1, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm1, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT:    vpmaxud %xmm2, %xmm5, %xmm6
-; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm5, %xmm6
-; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm4
-; AVX1-NEXT:    vpackssdw %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpsubd %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpmaxud %xmm2, %xmm3, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm3, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpmaxud %xmm0, %xmm1, %xmm6
+; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm1, %xmm6
+; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpackssdw %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpsubd %xmm3, %xmm2, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT:    vpand %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpand %xmm0, %xmm4, %xmm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/shrink_vmul.ll b/test/CodeGen/X86/shrink_vmul.ll
index 729fb2f567e..3c9fa66d047 100644
--- a/test/CodeGen/X86/shrink_vmul.ll
+++ b/test/CodeGen/X86/shrink_vmul.ll
@@ -2316,8 +2316,8 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) {
 ; X86-AVX1-NEXT:    .cfi_def_cfa_offset 16
 ; X86-AVX1-NEXT:    pushl %esi
 ; X86-AVX1-NEXT:    .cfi_def_cfa_offset 20
-; X86-AVX1-NEXT:    subl $16, %esp
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 36
+; X86-AVX1-NEXT:    subl $8, %esp
+; X86-AVX1-NEXT:    .cfi_def_cfa_offset 28
 ; X86-AVX1-NEXT:    .cfi_offset %esi, -20
 ; X86-AVX1-NEXT:    .cfi_offset %edi, -16
 ; X86-AVX1-NEXT:    .cfi_offset %ebx, -12
@@ -2326,8 +2326,8 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) {
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX1-NEXT:    vmovdqa (%eax), %ymm2
 ; X86-AVX1-NEXT:    vmovdqa (%ecx), %ymm1
-; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X86-AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
 ; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
 ; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
@@ -2339,50 +2339,50 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) {
 ; X86-AVX1-NEXT:    vpextrd $3, %xmm3, %eax
 ; X86-AVX1-NEXT:    xorl %edx, %edx
 ; X86-AVX1-NEXT:    divl %ecx
-; X86-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-AVX1-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X86-AVX1-NEXT:    vpextrd $2, %xmm1, %ecx
 ; X86-AVX1-NEXT:    vpextrd $2, %xmm3, %eax
 ; X86-AVX1-NEXT:    xorl %edx, %edx
 ; X86-AVX1-NEXT:    divl %ecx
-; X86-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-AVX1-NEXT:    movl %edx, %edi
 ; X86-AVX1-NEXT:    vpextrd $1, %xmm1, %ecx
 ; X86-AVX1-NEXT:    vpextrd $1, %xmm3, %eax
 ; X86-AVX1-NEXT:    xorl %edx, %edx
 ; X86-AVX1-NEXT:    divl %ecx
-; X86-AVX1-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-AVX1-NEXT:    movl %edx, %ebx
 ; X86-AVX1-NEXT:    vmovd %xmm1, %ecx
 ; X86-AVX1-NEXT:    vmovd %xmm3, %eax
 ; X86-AVX1-NEXT:    xorl %edx, %edx
 ; X86-AVX1-NEXT:    divl %ecx
 ; X86-AVX1-NEXT:    movl %edx, %ebp
+; X86-AVX1-NEXT:    vpextrd $3, %xmm0, %eax
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; X86-AVX1-NEXT:    vpextrd $3, %xmm1, %ecx
 ; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    vpextrd $3, %xmm0, %eax
+; X86-AVX1-NEXT:    vpextrd $3, %xmm1, %ecx
 ; X86-AVX1-NEXT:    divl %ecx
-; X86-AVX1-NEXT:    movl %edx, %ebx
+; X86-AVX1-NEXT:    movl %edx, %ecx
 ; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    vpextrd $2, %xmm1, %esi
 ; X86-AVX1-NEXT:    vpextrd $2, %xmm0, %eax
+; X86-AVX1-NEXT:    vpextrd $2, %xmm1, %esi
 ; X86-AVX1-NEXT:    divl %esi
 ; X86-AVX1-NEXT:    movl %edx, %esi
+; X86-AVX1-NEXT:    vmovd %ebp, %xmm2
 ; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    vpextrd $1, %xmm1, %edi
 ; X86-AVX1-NEXT:    vpextrd $1, %xmm0, %eax
-; X86-AVX1-NEXT:    divl %edi
-; X86-AVX1-NEXT:    movl %edx, %edi
-; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    vmovd %xmm1, %ecx
+; X86-AVX1-NEXT:    vpextrd $1, %xmm1, %ebp
+; X86-AVX1-NEXT:    divl %ebp
+; X86-AVX1-NEXT:    movl %edx, %ebp
+; X86-AVX1-NEXT:    vpinsrd $1, %ebx, %xmm2, %xmm2
 ; X86-AVX1-NEXT:    vmovd %xmm0, %eax
-; X86-AVX1-NEXT:    divl %ecx
-; X86-AVX1-NEXT:    vmovd %edx, %xmm0
-; X86-AVX1-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpinsrd $2, %esi, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpinsrd $3, %ebx, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vmovd %ebp, %xmm1
-; X86-AVX1-NEXT:    vpinsrd $1, (%esp), %xmm1, %xmm1 # 4-byte Folded Reload
-; X86-AVX1-NEXT:    vpinsrd $2, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
-; X86-AVX1-NEXT:    vpinsrd $3, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
+; X86-AVX1-NEXT:    vpinsrd $2, %edi, %xmm2, %xmm0
+; X86-AVX1-NEXT:    vmovd %xmm1, %edi
+; X86-AVX1-NEXT:    vpinsrd $3, (%esp), %xmm0, %xmm0 # 4-byte Folded Reload
+; X86-AVX1-NEXT:    xorl %edx, %edx
+; X86-AVX1-NEXT:    divl %edi
+; X86-AVX1-NEXT:    vmovd %edx, %xmm1
+; X86-AVX1-NEXT:    vpinsrd $1, %ebp, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vpinsrd $2, %esi, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vpinsrd $3, %ecx, %xmm1, %xmm1
 ; X86-AVX1-NEXT:    vmovd {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 4-byte Folded Reload
 ; X86-AVX1-NEXT:    # xmm2 = mem[0],zero,zero,zero
 ; X86-AVX1-NEXT:    movl $8199, %eax # imm = 0x2007
@@ -2390,11 +2390,11 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) {
 ; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [8199,8199,8199,8199]
 ; X86-AVX1-NEXT:    vpmulld %xmm4, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpmulld %xmm4, %xmm1, %xmm1
-; X86-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; X86-AVX1-NEXT:    vpmulld %xmm3, %xmm2, %xmm1
 ; X86-AVX1-NEXT:    vmovd %xmm1, (%eax)
 ; X86-AVX1-NEXT:    vmovaps %ymm0, (%eax)
-; X86-AVX1-NEXT:    addl $16, %esp
+; X86-AVX1-NEXT:    addl $8, %esp
 ; X86-AVX1-NEXT:    .cfi_def_cfa_offset 20
 ; X86-AVX1-NEXT:    popl %esi
 ; X86-AVX1-NEXT:    .cfi_def_cfa_offset 16
@@ -2589,8 +2589,8 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) {
 ; X64-AVX1-NEXT:    .cfi_offset %rbp, -16
 ; X64-AVX1-NEXT:    vmovdqa (%rdi), %ymm2
 ; X64-AVX1-NEXT:    vmovdqa (%rsi), %ymm1
-; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X64-AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
 ; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
 ; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
@@ -2618,38 +2618,38 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) {
 ; X64-AVX1-NEXT:    xorl %edx, %edx
 ; X64-AVX1-NEXT:    divl %ecx
 ; X64-AVX1-NEXT:    movl %edx, %esi
+; X64-AVX1-NEXT:    vpextrd $3, %xmm0, %eax
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; X64-AVX1-NEXT:    vpextrd $3, %xmm1, %ecx
-; X64-AVX1-NEXT:    vpextrd $3, %xmm0, %eax
 ; X64-AVX1-NEXT:    xorl %edx, %edx
 ; X64-AVX1-NEXT:    divl %ecx
 ; X64-AVX1-NEXT:    movl %edx, %edi
-; X64-AVX1-NEXT:    vpextrd $2, %xmm1, %ecx
 ; X64-AVX1-NEXT:    vpextrd $2, %xmm0, %eax
+; X64-AVX1-NEXT:    vpextrd $2, %xmm1, %ecx
 ; X64-AVX1-NEXT:    xorl %edx, %edx
 ; X64-AVX1-NEXT:    divl %ecx
 ; X64-AVX1-NEXT:    movl %edx, %ecx
-; X64-AVX1-NEXT:    vpextrd $1, %xmm1, %ebx
 ; X64-AVX1-NEXT:    vpextrd $1, %xmm0, %eax
+; X64-AVX1-NEXT:    vpextrd $1, %xmm1, %ebx
 ; X64-AVX1-NEXT:    xorl %edx, %edx
 ; X64-AVX1-NEXT:    divl %ebx
 ; X64-AVX1-NEXT:    movl %edx, %ebx
-; X64-AVX1-NEXT:    vmovd %xmm1, %ebp
 ; X64-AVX1-NEXT:    vmovd %xmm0, %eax
+; X64-AVX1-NEXT:    vmovd %xmm1, %ebp
 ; X64-AVX1-NEXT:    xorl %edx, %edx
 ; X64-AVX1-NEXT:    divl %ebp
-; X64-AVX1-NEXT:    vmovd %edx, %xmm0
-; X64-AVX1-NEXT:    vpinsrd $1, %ebx, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpinsrd $3, %edi, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vmovd %esi, %xmm0
+; X64-AVX1-NEXT:    vpinsrd $1, %r11d, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpinsrd $2, %r10d, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpinsrd $3, %r9d, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [8199,8199,8199,8199]
 ; X64-AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vmovd %esi, %xmm2
-; X64-AVX1-NEXT:    vpinsrd $1, %r11d, %xmm2, %xmm2
-; X64-AVX1-NEXT:    vpinsrd $2, %r10d, %xmm2, %xmm2
-; X64-AVX1-NEXT:    vpinsrd $3, %r9d, %xmm2, %xmm2
+; X64-AVX1-NEXT:    vmovd %edx, %xmm2
+; X64-AVX1-NEXT:    vpinsrd $1, %ebx, %xmm2, %xmm2
+; X64-AVX1-NEXT:    vpinsrd $2, %ecx, %xmm2, %xmm2
+; X64-AVX1-NEXT:    vpinsrd $3, %edi, %xmm2, %xmm2
 ; X64-AVX1-NEXT:    vpmulld %xmm1, %xmm2, %xmm1
-; X64-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; X64-AVX1-NEXT:    vmovd %r8d, %xmm1
 ; X64-AVX1-NEXT:    movl $8199, %eax # imm = 0x2007
 ; X64-AVX1-NEXT:    vmovd %eax, %xmm2
diff --git a/test/CodeGen/X86/v8i1-masks.ll b/test/CodeGen/X86/v8i1-masks.ll
index 7f9ae2e8518..de97281d60e 100644
--- a/test/CodeGen/X86/v8i1-masks.ll
+++ b/test/CodeGen/X86/v8i1-masks.ll
@@ -131,12 +131,8 @@ define <8 x i32> @and_mask_constant(<8 x i32> %v0, <8 x i32> %v1) {
 ; X32-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; X32-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; X32-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
-; X32-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
-; X32-NEXT:    vpand LCPI2_0, %xmm0, %xmm0
-; X32-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X32-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-NEXT:    vandps LCPI2_0, %ymm0, %ymm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: and_mask_constant:
@@ -145,12 +141,8 @@ define <8 x i32> @and_mask_constant(<8 x i32> %v0, <8 x i32> %v1) {
 ; X64-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
-; X64-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
-; X64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
-; X64-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X64-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
 ; X64-NEXT:    retq
 ;
 ; X32-AVX2-LABEL: and_mask_constant:
diff --git a/test/CodeGen/X86/vec_cast2.ll b/test/CodeGen/X86/vec_cast2.ll
index 9a5feb83dbb..1bc4b690487 100644
--- a/test/CodeGen/X86/vec_cast2.ll
+++ b/test/CodeGen/X86/vec_cast2.ll
@@ -87,10 +87,10 @@ define <8 x float> @cvt_v8u8_v8f32(<8 x i8> %src) {
 ; CHECK-LABEL: cvt_v8u8_v8f32:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    vpand LCPI4_0, %xmm0, %xmm0
-; CHECK-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; CHECK-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; CHECK-NEXT:    vcvtdq2ps %ymm0, %ymm0
 ; CHECK-NEXT:    retl
 ;
@@ -109,19 +109,19 @@ define <8 x float> @cvt_v8u8_v8f32(<8 x i8> %src) {
 define <8 x float> @cvt_v8u16_v8f32(<8 x i16> %src) {
 ; CHECK-LABEL: cvt_v8u16_v8f32:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; CHECK-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; CHECK-NEXT:    vcvtdq2ps %ymm0, %ymm0
 ; CHECK-NEXT:    retl
 ;
 ; CHECK-WIDE-LABEL: cvt_v8u16_v8f32:
 ; CHECK-WIDE:       ## %bb.0:
-; CHECK-WIDE-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; CHECK-WIDE-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-WIDE-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-WIDE-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; CHECK-WIDE-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; CHECK-WIDE-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-WIDE-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; CHECK-WIDE-NEXT:    vcvtdq2ps %ymm0, %ymm0
 ; CHECK-WIDE-NEXT:    retl
   %res = uitofp <8 x i16> %src to <8 x float>
diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll
index f28fca7b75b..d1091b37cdb 100644
--- a/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/test/CodeGen/X86/vec_int_to_fp.ll
@@ -2413,10 +2413,10 @@ define <4 x float> @uitofp_8i16_to_4f32(<8 x i16> %a) {
 ;
 ; AVX1-LABEL: uitofp_8i16_to_4f32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
 ; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX1-NEXT:    vzeroupper
@@ -2952,10 +2952,10 @@ define <8 x float> @uitofp_8i16_to_8f32(<8 x i16> %a) {
 ;
 ; AVX1-LABEL: uitofp_8i16_to_8f32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -5729,10 +5729,8 @@ define void @aggregate_sitofp_8i16_to_8f32(%Arguments* nocapture readonly %a0) {
 ; AVX1-LABEL: aggregate_sitofp_8i16_to_8f32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    movq 24(%rdi), %rax
-; AVX1-NEXT:    vmovdqu 8(%rdi), %xmm0
-; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxwd 16(%rdi), %xmm0
+; AVX1-NEXT:    vpmovsxwd 8(%rdi), %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
 ; AVX1-NEXT:    vmovaps %ymm0, (%rax)
diff --git a/test/CodeGen/X86/vector-pcmp.ll b/test/CodeGen/X86/vector-pcmp.ll
index 683a56544cb..b75f61f3e6c 100644
--- a/test/CodeGen/X86/vector-pcmp.ll
+++ b/test/CodeGen/X86/vector-pcmp.ll
@@ -464,10 +464,10 @@ define <8 x i32> @cmpne_knownzeros_zext_v8i16_v8i32(<8 x i16> %x) {
 ; AVX1-LABEL: cmpne_knownzeros_zext_v8i16_v8i32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpsrlw $15, %xmm0, %xmm0
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: cmpne_knownzeros_zext_v8i16_v8i32:
diff --git a/test/CodeGen/X86/vector-zext.ll b/test/CodeGen/X86/vector-zext.ll
index ac7c81a8fb6..6de913079a5 100644
--- a/test/CodeGen/X86/vector-zext.ll
+++ b/test/CodeGen/X86/vector-zext.ll
@@ -64,10 +64,10 @@ define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %A) {
 ;
 ; AVX1-LABEL: zext_16i8_to_16i16:
 ; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: zext_16i8_to_16i16:
@@ -526,10 +526,10 @@ define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp
 ;
 ; AVX1-LABEL: zext_8i16_to_8i32:
 ; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: zext_8i16_to_8i32:
@@ -825,10 +825,10 @@ define <4 x i64> @zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp
 ;
 ; AVX1-LABEL: zext_4i32_to_4i64:
 ; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: zext_4i32_to_4i64:
@@ -1540,10 +1540,10 @@ define <8 x i32> @zext_8i8_to_8i32(<8 x i8> %z) {
 ; AVX1-LABEL: zext_8i8_to_8i32:
 ; AVX1:       # %bb.0: # %entry
 ; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: zext_8i8_to_8i32:
-- 
GitLab


From ef5bf36c741d327d2139bb3754c5cb7ec6fdbd4d Mon Sep 17 00:00:00 2001
From: Wouter van Oortmerssen <aardappel@gmail.com>
Date: Fri, 2 Nov 2018 22:04:33 +0000
Subject: [PATCH 0934/1116] [WebAssembly] Parsing missing directives to produce
 valid .o

Summary:
The assembler was able to assemble and then dump back to .s, but
was failing to parse certain directives necessary for valid .o
output:
- .type directives are now recognized to distinguish function symbols
  and others.
- .size is now parsed to provide function size.
- .globaltype (introduced in https://reviews.llvm.org/D54012) is now
  recognized to ensure symbols like __stack_pointer have a proper type
  set for both .s and .o output.

Also added tests for the above.

Reviewers: sbc100, dschuff

Subscribers: jgravelle-google, aheejin, dexonsmith, kristina, llvm-commits, sunfish

Differential Revision: https://reviews.llvm.org/D53842

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346047 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/MC/MCWasmStreamer.cpp                     |   2 +-
 .../AsmParser/WebAssemblyAsmParser.cpp        | 132 +++++++++++++-----
 test/MC/WebAssembly/basic-assembly.s          |   8 +-
 3 files changed, 108 insertions(+), 34 deletions(-)

diff --git a/lib/MC/MCWasmStreamer.cpp b/lib/MC/MCWasmStreamer.cpp
index 7321a30dd94..d2a152058b9 100644
--- a/lib/MC/MCWasmStreamer.cpp
+++ b/lib/MC/MCWasmStreamer.cpp
@@ -61,7 +61,7 @@ void MCWasmStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
 void MCWasmStreamer::ChangeSection(MCSection *Section,
                                    const MCExpr *Subsection) {
   MCAssembler &Asm = getAssembler();
-  auto *SectionWasm = static_cast<const MCSectionWasm *>(Section);
+  auto *SectionWasm = cast<MCSectionWasm>(Section);
   const MCSymbol *Grp = SectionWasm->getGroup();
   if (Grp)
     Asm.registerSymbol(*Grp);
diff --git a/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
index c257e98d55d..efa6793cff2 100644
--- a/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
+++ b/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
@@ -25,6 +25,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSymbolWasm.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/TargetRegistry.h"
 
@@ -131,14 +132,14 @@ struct WebAssemblyOperand : public MCParsedAsmOperand {
 class WebAssemblyAsmParser final : public MCTargetAsmParser {
   MCAsmParser &Parser;
   MCAsmLexer &Lexer;
-  MCSymbol *LastLabel;
+  MCSymbolWasm *LastSymbol;
 
 public:
-  WebAssemblyAsmParser(const MCSubtargetInfo &sti, MCAsmParser &Parser,
-                       const MCInstrInfo &mii, const MCTargetOptions &Options)
-      : MCTargetAsmParser(Options, sti, mii), Parser(Parser),
-        Lexer(Parser.getLexer()), LastLabel(nullptr) {
-    setAvailableFeatures(ComputeAvailableFeatures(sti.getFeatureBits()));
+  WebAssemblyAsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser,
+                       const MCInstrInfo &MII, const MCTargetOptions &Options)
+      : MCTargetAsmParser(Options, STI, MII), Parser(Parser),
+        Lexer(Parser.getLexer()), LastSymbol(nullptr) {
+    setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
   }
 
 #define GET_ASSEMBLER_HEADER
@@ -168,24 +169,26 @@ public:
     return false;
   }
 
-  MVT::SimpleValueType ParseRegType(const StringRef &RegType) {
+
+  std::pair<MVT::SimpleValueType, unsigned>
+  ParseRegType(const StringRef &RegType) {
     // Derive type from .param .local decls, or the instruction itself.
-    return StringSwitch<MVT::SimpleValueType>(RegType)
-        .Case("i32", MVT::i32)
-        .Case("i64", MVT::i64)
-        .Case("f32", MVT::f32)
-        .Case("f64", MVT::f64)
-        .Case("i8x16", MVT::v16i8)
-        .Case("i16x8", MVT::v8i16)
-        .Case("i32x4", MVT::v4i32)
-        .Case("i64x2", MVT::v2i64)
-        .Case("f32x4", MVT::v4f32)
-        .Case("f64x2", MVT::v2f64)
+    return StringSwitch<std::pair<MVT::SimpleValueType, unsigned>>(RegType)
+        .Case("i32", {MVT::i32, wasm::WASM_TYPE_I32})
+        .Case("i64", {MVT::i64, wasm::WASM_TYPE_I64})
+        .Case("f32", {MVT::f32, wasm::WASM_TYPE_F32})
+        .Case("f64", {MVT::f64, wasm::WASM_TYPE_F64})
+        .Case("i8x16", {MVT::v16i8, wasm::WASM_TYPE_V128})
+        .Case("i16x8", {MVT::v8i16, wasm::WASM_TYPE_V128})
+        .Case("i32x4", {MVT::v4i32, wasm::WASM_TYPE_V128})
+        .Case("i64x2", {MVT::v2i64, wasm::WASM_TYPE_V128})
+        .Case("f32x4", {MVT::v4f32, wasm::WASM_TYPE_V128})
+        .Case("f64x2", {MVT::v2f64, wasm::WASM_TYPE_V128})
         // arbitrarily chosen vector type to associate with "v128"
         // FIXME: should these be EVTs to avoid this arbitrary hack? Do we want
         // to accept more specific SIMD register types?
-        .Case("v128", MVT::v16i8)
-        .Default(MVT::INVALID_SIMPLE_VALUE_TYPE);
+        .Case("v128", {MVT::v16i8, wasm::WASM_TYPE_V128})
+        .Default({MVT::INVALID_SIMPLE_VALUE_TYPE, wasm::WASM_TYPE_NORESULT});
   }
 
   void ParseSingleInteger(bool IsNegative, OperandVector &Operands) {
@@ -311,24 +314,84 @@ public:
     return false;
   }
 
-  void onLabelParsed(MCSymbol *Symbol) override { LastLabel = Symbol; }
+  void onLabelParsed(MCSymbol *Symbol) override {
+    LastSymbol = cast<MCSymbolWasm>(Symbol);
+  }
 
   bool ParseDirective(AsmToken DirectiveID) override {
+    // This function has a really weird return value behavior that is different
+    // from all the other parsing functions:
+    // - return true && no tokens consumed -> don't know this directive / let
+    //   the generic parser handle it.
+    // - return true && tokens consumed -> a parsing error occurred.
+    // - return false -> processed this directive successfully.
     assert(DirectiveID.getKind() == AsmToken::Identifier);
     auto &Out = getStreamer();
     auto &TOut =
         reinterpret_cast<WebAssemblyTargetStreamer &>(*Out.getTargetStreamer());
-    // TODO: we're just parsing the subset of directives we're interested in,
-    // and ignoring ones we don't recognise. We should ideally verify
-    // all directives here.
+    // TODO: any time we return an error, at least one token must have been
+    // consumed, otherwise this will not signal an error to the caller.
     if (DirectiveID.getString() == ".type") {
       // This could be the start of a function, check if followed by
       // "label,@function"
-      if (!(IsNext(AsmToken::Identifier) && IsNext(AsmToken::Comma) &&
-            IsNext(AsmToken::At) && Lexer.is(AsmToken::Identifier)))
+      if (!Lexer.is(AsmToken::Identifier))
+        return Error("Expected label after .type directive, got: ",
+                     Lexer.getTok());
+      auto WasmSym = cast<MCSymbolWasm>(
+                       TOut.getStreamer().getContext().getOrCreateSymbol(
+                         Lexer.getTok().getString()));
+      Parser.Lex();
+      if (!(IsNext(AsmToken::Comma) && IsNext(AsmToken::At) &&
+            Lexer.is(AsmToken::Identifier)))
         return Error("Expected label,@type declaration, got: ", Lexer.getTok());
+      auto TypeName = Lexer.getTok().getString();
+      if (TypeName == "function")
+        WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
+      else if (TypeName == "global")
+        WasmSym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL);
+      else
+        return Error("Unknown WASM symbol type: ", Lexer.getTok());
+      Parser.Lex();
+      return Expect(AsmToken::EndOfStatement, "EOL");
+    } else if (DirectiveID.getString() == ".size") {
+      if (!Lexer.is(AsmToken::Identifier))
+        return Error("Expected label after .size directive, got: ",
+                     Lexer.getTok());
+      auto WasmSym = cast<MCSymbolWasm>(
+                       TOut.getStreamer().getContext().getOrCreateSymbol(
+                         Lexer.getTok().getString()));
       Parser.Lex();
-      // Out.EmitSymbolAttribute(??, MCSA_ELF_TypeFunction);
+      if (!IsNext(AsmToken::Comma))
+        return Error("Expected `,`, got: ", Lexer.getTok());
+      const MCExpr *Exp;
+      if (Parser.parseExpression(Exp))
+        return Error("Cannot parse .size expression: ", Lexer.getTok());
+      WasmSym->setSize(Exp);
+      return Expect(AsmToken::EndOfStatement, "EOL");
+    } else if (DirectiveID.getString() == ".globaltype") {
+      if (!Lexer.is(AsmToken::Identifier))
+        return Error("Expected symbol name after .globaltype directive, got: ",
+                     Lexer.getTok());
+      auto Name = Lexer.getTok().getString();
+      Parser.Lex();
+      if (!IsNext(AsmToken::Comma))
+        return Error("Expected `,`, got: ", Lexer.getTok());
+      if (!Lexer.is(AsmToken::Identifier))
+        return Error("Expected type in .globaltype directive, got: ",
+                     Lexer.getTok());
+      auto Type = ParseRegType(Lexer.getTok().getString()).second;
+      if (Type == wasm::WASM_TYPE_NORESULT)
+        return Error("Unknown type in .globaltype directive: ",
+                     Lexer.getTok());
+      Parser.Lex();
+      // Now set this symbol with the correct type.
+      auto WasmSym = cast<MCSymbolWasm>(
+                       TOut.getStreamer().getContext().getOrCreateSymbol(Name));
+      WasmSym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL);
+      WasmSym->setGlobalType(wasm::WasmGlobalType{uint8_t(Type), true});
+      // And emit the directive again.
+      TOut.emitGlobalType(WasmSym);
+      return Expect(AsmToken::EndOfStatement, "EOL");
     } else if (DirectiveID.getString() == ".param" ||
                DirectiveID.getString() == ".local") {
       // Track the number of locals, needed for correct virtual register
@@ -337,7 +400,7 @@ public:
       std::vector<MVT> Params;
       std::vector<MVT> Locals;
       while (Lexer.is(AsmToken::Identifier)) {
-        auto RegType = ParseRegType(Lexer.getTok().getString());
+        auto RegType = ParseRegType(Lexer.getTok().getString()).first;
         if (RegType == MVT::INVALID_SIMPLE_VALUE_TYPE)
           return true;
         if (DirectiveID.getString() == ".param") {
@@ -349,15 +412,20 @@ public:
         if (!IsNext(AsmToken::Comma))
           break;
       }
-      assert(LastLabel);
-      TOut.emitParam(LastLabel, Params);
+      assert(LastSymbol);
+      // TODO: LastSymbol isn't even used by emitParam, so could be removed.
+      TOut.emitParam(LastSymbol, Params);
       TOut.emitLocal(Locals);
+      return Expect(AsmToken::EndOfStatement, "EOL");
     } else {
-      // For now, ignore anydirective we don't recognize:
+      // TODO: remove.
       while (Lexer.isNot(AsmToken::EndOfStatement))
         Parser.Lex();
+      return Expect(AsmToken::EndOfStatement, "EOL");
     }
-    return Expect(AsmToken::EndOfStatement, "EOL");
+    // TODO: current ELF directive parsing is broken, fix this is a followup.
+    //return true;  // We didn't process this directive.
+    return false;
   }
 
   bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned & /*Opcode*/,
diff --git a/test/MC/WebAssembly/basic-assembly.s b/test/MC/WebAssembly/basic-assembly.s
index cfffda57bcc..c2b316c9243 100644
--- a/test/MC/WebAssembly/basic-assembly.s
+++ b/test/MC/WebAssembly/basic-assembly.s
@@ -1,4 +1,6 @@
 # RUN: llvm-mc -triple=wasm32-unknown-unknown -mattr=+simd128,+nontrapping-fptoint,+exception-handling < %s | FileCheck %s
+# this one is just here to see if it converts to .o without errors, but doesn't check any output:
+# RUN: llvm-mc -triple=wasm32-unknown-unknown -filetype=obj -mattr=+simd128,+nontrapping-fptoint,+exception-handling < %s
 
     .text
     .type    test0,@function
@@ -56,7 +58,9 @@ test0:
     #i32.trunc_s:sat/f32
     get_global  __stack_pointer@GLOBAL
     end_function
-
+.Lfunc_end0:
+	.size	test0, .Lfunc_end0-test0
+    .globaltype	__stack_pointer, i32
 
 # CHECK:           .text
 # CHECK-LABEL: test0:
@@ -104,3 +108,5 @@ test0:
 # CHECK-NEXT:      end_try
 # CHECK-NEXT:      get_global  __stack_pointer@GLOBAL
 # CHECK-NEXT:      end_function
+
+# CHECK:           .globaltype	__stack_pointer, i32
-- 
GitLab


From aeef43e1bb2002691342816e0e50f090ae8edbe6 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Fri, 2 Nov 2018 22:48:02 +0000
Subject: [PATCH 0935/1116] [X86] In LowerEXTEND_VECTOR_INREG, emit a vector
 shuffle instead of directly using X86ISD::UNPCKL

The majority of the changes are because the rest of shuffle lowering/combining prefers to replace the undef input with the other operand. Using UNPCKL directly seemed to avoid this and just grabbed a randomish register for the undef which can create false dependencies.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346050 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp |   2 +-
 test/CodeGen/X86/madd.ll           |  36 ++---
 test/CodeGen/X86/vec_int_to_fp.ll  |  16 +--
 test/CodeGen/X86/vector-sext.ll    | 220 ++++++++++++++---------------
 4 files changed, 134 insertions(+), 140 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 57e4cba9078..7bf1cc6875e 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -19820,7 +19820,7 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
   // As SRAI is only available on i16/i32 types, we expand only up to i32
   // and handle i64 separately.
   while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
-    Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
+    Curr = getUnpackl(DAG, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
     MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
     CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
     Curr = DAG.getBitcast(CurrVT, Curr);
diff --git a/test/CodeGen/X86/madd.ll b/test/CodeGen/X86/madd.ll
index 9d29aa08cab..bf46887b074 100644
--- a/test/CodeGen/X86/madd.ll
+++ b/test/CodeGen/X86/madd.ll
@@ -954,38 +954,38 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl
 ; SSE2-NEXT:    .p2align 4, 0x90
 ; SSE2-NEXT:  .LBB7_1: # %vector.body
 ; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
-; SSE2-NEXT:    movq {{.*#+}} xmm5 = mem[0],zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    psraw $8, %xmm5
-; SSE2-NEXT:    movq {{.*#+}} xmm6 = mem[0],zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    psraw $8, %xmm6
 ; SSE2-NEXT:    movq {{.*#+}} xmm7 = mem[0],zero
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    psraw $8, %xmm7
 ; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    psraw $8, %xmm0
-; SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    psraw $8, %xmm2
-; SSE2-NEXT:    pmaddwd %xmm5, %xmm2
-; SSE2-NEXT:    paddd %xmm2, %xmm9
-; SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    psraw $8, %xmm2
-; SSE2-NEXT:    pmaddwd %xmm6, %xmm2
-; SSE2-NEXT:    paddd %xmm2, %xmm4
+; SSE2-NEXT:    movq {{.*#+}} xmm6 = mem[0],zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psraw $8, %xmm6
+; SSE2-NEXT:    movq {{.*#+}} xmm5 = mem[0],zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psraw $8, %xmm5
 ; SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    psraw $8, %xmm2
 ; SSE2-NEXT:    pmaddwd %xmm7, %xmm2
-; SSE2-NEXT:    paddd %xmm2, %xmm1
+; SSE2-NEXT:    paddd %xmm2, %xmm9
 ; SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    psraw $8, %xmm2
 ; SSE2-NEXT:    pmaddwd %xmm0, %xmm2
-; SSE2-NEXT:    paddd %xmm2, %xmm3
+; SSE2-NEXT:    paddd %xmm2, %xmm4
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psraw $8, %xmm0
+; SSE2-NEXT:    pmaddwd %xmm6, %xmm0
+; SSE2-NEXT:    paddd %xmm0, %xmm1
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psraw $8, %xmm0
+; SSE2-NEXT:    pmaddwd %xmm5, %xmm0
+; SSE2-NEXT:    paddd %xmm0, %xmm3
 ; SSE2-NEXT:    addq $32, %rcx
 ; SSE2-NEXT:    cmpq %rcx, %rax
 ; SSE2-NEXT:    jne .LBB7_1
diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll
index d1091b37cdb..9ea75e49351 100644
--- a/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/test/CodeGen/X86/vec_int_to_fp.ll
@@ -1726,13 +1726,11 @@ define <8 x float> @sitofp_8i16_to_8f32(<8 x i16> %a) {
 define <8 x float> @sitofp_8i8_to_8f32(<16 x i8> %a) {
 ; SSE2-LABEL: sitofp_8i8_to_8f32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; SSE2-NEXT:    psrad $24, %xmm1
 ; SSE2-NEXT:    cvtdq2ps %xmm1, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    psrad $24, %xmm0
 ; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm1
 ; SSE2-NEXT:    movaps %xmm2, %xmm0
@@ -1776,13 +1774,11 @@ define <8 x float> @sitofp_8i8_to_8f32(<16 x i8> %a) {
 define <8 x float> @sitofp_16i8_to_8f32(<16 x i8> %a) {
 ; SSE2-LABEL: sitofp_16i8_to_8f32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; SSE2-NEXT:    psrad $24, %xmm1
 ; SSE2-NEXT:    cvtdq2ps %xmm1, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    psrad $24, %xmm0
 ; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm1
 ; SSE2-NEXT:    movaps %xmm2, %xmm0
diff --git a/test/CodeGen/X86/vector-sext.ll b/test/CodeGen/X86/vector-sext.ll
index b5f3b76c00c..9576f6482fd 100644
--- a/test/CodeGen/X86/vector-sext.ll
+++ b/test/CodeGen/X86/vector-sext.ll
@@ -46,20 +46,20 @@ entry:
 define <16 x i16> @sext_16i8_to_16i16(<16 x i8> %A) nounwind uwtable readnone ssp {
 ; SSE2-LABEL: sext_16i8_to_16i16:
 ; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT:    psraw $8, %xmm2
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psraw $8, %xmm0
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSE2-NEXT:    psraw $8, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: sext_16i8_to_16i16:
 ; SSSE3:       # %bb.0: # %entry
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSSE3-NEXT:    psraw $8, %xmm2
-; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    psraw $8, %xmm0
+; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSSE3-NEXT:    psraw $8, %xmm1
-; SSSE3-NEXT:    movdqa %xmm2, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: sext_16i8_to_16i16:
@@ -103,30 +103,32 @@ entry:
 define <32 x i16> @sext_32i8_to_32i16(<32 x i8> %A) nounwind uwtable readnone ssp {
 ; SSE2-LABEL: sext_32i8_to_32i16:
 ; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; SSE2-NEXT:    psraw $8, %xmm4
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
-; SSE2-NEXT:    psraw $8, %xmm5
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psraw $8, %xmm0
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT:    psraw $8, %xmm1
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
 ; SSE2-NEXT:    psraw $8, %xmm2
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSE2-NEXT:    psraw $8, %xmm3
-; SSE2-NEXT:    movdqa %xmm4, %xmm0
-; SSE2-NEXT:    movdqa %xmm5, %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: sext_32i8_to_32i16:
 ; SSSE3:       # %bb.0: # %entry
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; SSSE3-NEXT:    psraw $8, %xmm4
-; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
-; SSSE3-NEXT:    psraw $8, %xmm5
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm3
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    psraw $8, %xmm0
+; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSSE3-NEXT:    psraw $8, %xmm1
+; SSSE3-NEXT:    movdqa %xmm3, %xmm2
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
 ; SSSE3-NEXT:    psraw $8, %xmm2
-; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSSE3-NEXT:    psraw $8, %xmm3
-; SSSE3-NEXT:    movdqa %xmm4, %xmm0
-; SSSE3-NEXT:    movdqa %xmm5, %xmm1
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: sext_32i8_to_32i16:
@@ -230,24 +232,22 @@ entry:
 define <8 x i32> @sext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp {
 ; SSE2-LABEL: sext_16i8_to_8i32:
 ; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-NEXT:    psrad $24, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT:    psrad $24, %xmm2
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; SSE2-NEXT:    psrad $24, %xmm1
 ; SSE2-NEXT:    movdqa %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: sext_16i8_to_8i32:
 ; SSSE3:       # %bb.0: # %entry
-; SSSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT:    psrad $24, %xmm0
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[u,u,u,4,u,u,u,5,u,u,u,6,u,u,u,7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT:    psrad $24, %xmm2
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; SSSE3-NEXT:    psrad $24, %xmm1
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: sext_16i8_to_8i32:
@@ -292,37 +292,34 @@ entry:
 define <16 x i32> @sext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ssp {
 ; SSE2-LABEL: sext_16i8_to_16i32:
 ; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
 ; SSE2-NEXT:    psrad $24, %xmm4
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-NEXT:    psrad $24, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    psrad $24, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT:    psrad $24, %xmm2
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
 ; SSE2-NEXT:    psrad $24, %xmm3
 ; SSE2-NEXT:    movdqa %xmm4, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: sext_16i8_to_16i32:
 ; SSSE3:       # %bb.0: # %entry
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT:    psrad $24, %xmm0
-; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSSE3-NEXT:    psrad $24, %xmm2
-; SSSE3-NEXT:    movdqa %xmm3, %xmm1
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[u,u,u,4,u,u,u,5,u,u,u,6,u,u,u,7]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; SSSE3-NEXT:    psrad $24, %xmm4
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
 ; SSSE3-NEXT:    psrad $24, %xmm1
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm3 = xmm3[u,u,u,12,u,u,u,13,u,u,u,14,u,u,u,15]
+; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT:    psrad $24, %xmm2
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
 ; SSSE3-NEXT:    psrad $24, %xmm3
+; SSSE3-NEXT:    movdqa %xmm4, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: sext_16i8_to_16i32:
@@ -424,14 +421,13 @@ entry:
 define <4 x i64> @sext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp {
 ; SSE2-LABEL: sext_16i8_to_4i64:
 ; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
 ; SSE2-NEXT:    movdqa %xmm2, %xmm1
 ; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    psrad $24, %xmm2
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT:    psrld $16, %xmm0
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    psrad $31, %xmm0
@@ -442,18 +438,19 @@ define <4 x i64> @sext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp
 ;
 ; SSSE3-LABEL: sext_16i8_to_4i64:
 ; SSSE3:       # %bb.0: # %entry
-; SSSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm2
-; SSSE3-NEXT:    psrad $31, %xmm2
-; SSSE3-NEXT:    psrad $24, %xmm0
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[u,u,u,2,u,u,u,3,u,u,u,u,u,u,u,u]
-; SSSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSSE3-NEXT:    psrad $31, %xmm2
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm1
+; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    psrad $24, %xmm2
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    psrad $31, %xmm0
 ; SSSE3-NEXT:    psrad $24, %xmm1
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: sext_16i8_to_4i64:
@@ -498,63 +495,62 @@ entry:
 define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp {
 ; SSE2-LABEL: sext_16i8_to_8i64:
 ; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    psrad $31, %xmm2
-; SSE2-NEXT:    psrad $24, %xmm0
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
-; SSE2-NEXT:    psrld $16, %xmm1
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; SSE2-NEXT:    movdqa %xmm4, %xmm1
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    psrad $24, %xmm4
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
 ; SSE2-NEXT:    psrad $24, %xmm1
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
-; SSE2-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-NEXT:    psrad $31, %xmm4
-; SSE2-NEXT:    psrad $24, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; SSE2-NEXT:    psrld $16, %xmm3
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3]
-; SSE2-NEXT:    movdqa %xmm3, %xmm4
-; SSE2-NEXT:    psrad $31, %xmm4
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    psrad $24, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
 ; SSE2-NEXT:    psrad $24, %xmm3
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSE2-NEXT:    movdqa %xmm4, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: sext_16i8_to_8i64:
 ; SSSE3:       # %bb.0: # %entry
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = <u,u,u,2,u,u,u,3,u,u,u,u,u,u,u,u>
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSSE3-NEXT:    pshufb %xmm2, %xmm1
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; SSSE3-NEXT:    movdqa %xmm4, %xmm1
+; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    psrad $24, %xmm4
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSSE3-NEXT:    psrad $31, %xmm0
 ; SSSE3-NEXT:    psrad $24, %xmm1
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm4
-; SSSE3-NEXT:    psrad $31, %xmm4
-; SSSE3-NEXT:    psrad $24, %xmm0
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; SSSE3-NEXT:    pshufb %xmm2, %xmm3
-; SSSE3-NEXT:    movdqa %xmm3, %xmm2
-; SSSE3-NEXT:    psrad $31, %xmm2
-; SSSE3-NEXT:    psrad $24, %xmm3
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SSSE3-NEXT:    movdqa %xmm2, %xmm4
-; SSSE3-NEXT:    psrad $31, %xmm4
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    psrad $31, %xmm0
 ; SSSE3-NEXT:    psrad $24, %xmm2
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; SSSE3-NEXT:    movdqa %xmm3, %xmm0
+; SSSE3-NEXT:    psrad $31, %xmm0
+; SSSE3-NEXT:    psrad $24, %xmm3
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSSE3-NEXT:    movdqa %xmm4, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: sext_16i8_to_8i64:
@@ -1291,7 +1287,8 @@ define <2 x i64> @load_sext_2i8_to_2i64(<2 x i8> *%ptr) {
 ; SSSE3:       # %bb.0: # %entry
 ; SSSE3-NEXT:    movzwl (%rdi), %eax
 ; SSSE3-NEXT:    movd %eax, %xmm0
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,u,u,u,1,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSSE3-NEXT:    psrad $31, %xmm1
 ; SSSE3-NEXT:    psrad $24, %xmm0
@@ -5064,7 +5061,8 @@ define <2 x i32> @sext_2i8_to_2i32(<2 x i8>* %addr) {
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movzwl (%rdi), %eax
 ; SSSE3-NEXT:    movd %eax, %xmm0
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,u,u,u,1,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
 ; SSSE3-NEXT:    psrad $24, %xmm0
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
 ; SSSE3-NEXT:    paddq %xmm0, %xmm0
-- 
GitLab


From 057bda604a7bdd9088900fbb5e1f314e44d516c1 Mon Sep 17 00:00:00 2001
From: Teresa Johnson <tejohnson@google.com>
Date: Fri, 2 Nov 2018 23:49:21 +0000
Subject: [PATCH 0936/1116] [LTO] Fix a crash caused by accessing an empty
 ValueInfo

ModuleSummaryIndex::exportToDot crashes when linking the Linux kernel
under ThinLTO using LLVMgold.so. This is due to the exportToDot
function trying to get the GUID of an empty ValueInfo. The root cause
related to the fact that we attempt to get the GUID of an aliasee
via its OriginalGUID recorded in the aliasee summary, and that is not
always possible. Specifically, we cannot do this mapping when the value
is internal linkage and there were other internal linkage symbols with
the same name.

There are 2 fixes for the problem included here.

1) In all cases where we can currently print the dot file from the
command line (which is only via save-temps), we have a valid AliaseeGUID
in the AliasSummary. Use that when it is available, so that we can get
the correct aliasee GUID whenever possible.

2) However, if we were to invoke exportToDot from the debugger right
after it is built during the initial analysis step (i.e. the per-module
summary), we won't have the AliaseeGUID field populated. In that case,
we have a fallback fix that will simply print "@"+GUID when we aren't
able to get the GUID from the OriginalGUID. It simply checks if the VI
is valid or not before attempting to get the name. Additionally, since
getAliaseeGUID will assert that the AliaseeGUID is non-zero, guard the
earlier fix #1 by a new function hasAliaseeGUID().

Reviewers: pcc, tmroeder

Subscribers: evgeny777, mehdi_amini, inglorion, dexonsmith, arphaman, llvm-commits

Differential Revision: https://reviews.llvm.org/D53986

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346055 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/IR/ModuleSummaryIndex.h      |  1 +
 lib/IR/ModuleSummaryIndex.cpp             | 40 ++++++++++++++++-------
 test/ThinLTO/X86/Inputs/alias_internal.ll |  8 +++++
 test/ThinLTO/X86/alias_internal.ll        | 21 ++++++++++++
 4 files changed, 58 insertions(+), 12 deletions(-)
 create mode 100644 test/ThinLTO/X86/Inputs/alias_internal.ll
 create mode 100644 test/ThinLTO/X86/alias_internal.ll

diff --git a/include/llvm/IR/ModuleSummaryIndex.h b/include/llvm/IR/ModuleSummaryIndex.h
index 778907b05eb..8510afe60a1 100644
--- a/include/llvm/IR/ModuleSummaryIndex.h
+++ b/include/llvm/IR/ModuleSummaryIndex.h
@@ -408,6 +408,7 @@ public:
     return const_cast<GlobalValueSummary &>(
                          static_cast<const AliasSummary *>(this)->getAliasee());
   }
+  bool hasAliaseeGUID() const { return AliaseeGUID != 0; }
   const GlobalValue::GUID &getAliaseeGUID() const {
     assert(AliaseeGUID && "Unexpected missing aliasee GUID");
     return AliaseeGUID;
diff --git a/lib/IR/ModuleSummaryIndex.cpp b/lib/IR/ModuleSummaryIndex.cpp
index d4368413584..e63407c3e75 100644
--- a/lib/IR/ModuleSummaryIndex.cpp
+++ b/lib/IR/ModuleSummaryIndex.cpp
@@ -198,9 +198,12 @@ static std::string getSummaryAttributes(GlobalValueSummary* GVS) {
          ", ffl: " + fflagsToString(FS->fflags());
 }
 
+static std::string getNodeVisualName(GlobalValue::GUID Id) {
+  return std::string("@") + std::to_string(Id);
+}
+
 static std::string getNodeVisualName(const ValueInfo &VI) {
-  return VI.name().empty() ? std::string("@") + std::to_string(VI.getGUID())
-                           : VI.name().str();
+  return VI.name().empty() ? getNodeVisualName(VI.getGUID()) : VI.name().str();
 }
 
 static std::string getNodeLabel(const ValueInfo &VI, GlobalValueSummary *GVS) {
@@ -221,13 +224,19 @@ static std::string getNodeLabel(const ValueInfo &VI, GlobalValueSummary *GVS) {
 // specific module associated with it. Typically this is function
 // or variable defined in native object or library.
 static void defineExternalNode(raw_ostream &OS, const char *Pfx,
-                               const ValueInfo &VI) {
-  auto StrId = std::to_string(VI.getGUID());
-  OS << "  " << StrId << " [label=\"" << getNodeVisualName(VI)
-     << "\"]; // defined externally\n";
+                               const ValueInfo &VI, GlobalValue::GUID Id) {
+  auto StrId = std::to_string(Id);
+  OS << "  " << StrId << " [label=\"";
+
+  if (VI) {
+    OS << getNodeVisualName(VI);
+  } else {
+    OS << getNodeVisualName(Id);
+  }
+  OS << "\"]; // defined externally\n";
 }
 
-void ModuleSummaryIndex::exportToDot(raw_ostream& OS) const {
+void ModuleSummaryIndex::exportToDot(raw_ostream &OS) const {
   std::vector<Edge> CrossModuleEdges;
   DenseMap<GlobalValue::GUID, std::vector<uint64_t>> NodeMap;
   StringMap<GVSummaryMapTy> ModuleToDefinedGVS;
@@ -311,10 +320,17 @@ void ModuleSummaryIndex::exportToDot(raw_ostream& OS) const {
         Draw(SummaryIt.first, R.getGUID(), -1);
 
       if (auto *AS = dyn_cast_or_null<AliasSummary>(SummaryIt.second)) {
-        auto AliaseeOrigId = AS->getAliasee().getOriginalName();
-        auto AliaseeId = getGUIDFromOriginalID(AliaseeOrigId);
-
-        Draw(SummaryIt.first, AliaseeId ? AliaseeId : AliaseeOrigId, -2);
+        GlobalValue::GUID AliaseeId;
+        if (AS->hasAliaseeGUID())
+          AliaseeId = AS->getAliaseeGUID();
+        else {
+          auto AliaseeOrigId = AS->getAliasee().getOriginalName();
+          AliaseeId = getGUIDFromOriginalID(AliaseeOrigId);
+          if (!AliaseeId)
+            AliaseeId = AliaseeOrigId;
+        }
+
+        Draw(SummaryIt.first, AliaseeId, -2);
         continue;
       }
 
@@ -330,7 +346,7 @@ void ModuleSummaryIndex::exportToDot(raw_ostream& OS) const {
   for (auto &E : CrossModuleEdges) {
     auto &ModList = NodeMap[E.Dst];
     if (ModList.empty()) {
-      defineExternalNode(OS, "  ", getValueInfo(E.Dst));
+      defineExternalNode(OS, "  ", getValueInfo(E.Dst), E.Dst);
       // Add fake module to the list to draw an edge to an external node
       // in the loop below.
       ModList.push_back(-1);
diff --git a/test/ThinLTO/X86/Inputs/alias_internal.ll b/test/ThinLTO/X86/Inputs/alias_internal.ll
new file mode 100644
index 00000000000..e55e40b1d05
--- /dev/null
+++ b/test/ThinLTO/X86/Inputs/alias_internal.ll
@@ -0,0 +1,8 @@
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define internal i32 @f(i8*) unnamed_addr {
+    ret i32 42
+}
+
+@a2 = weak alias i32 (i8*), i32 (i8*)* @f
diff --git a/test/ThinLTO/X86/alias_internal.ll b/test/ThinLTO/X86/alias_internal.ll
new file mode 100644
index 00000000000..d6433f6981d
--- /dev/null
+++ b/test/ThinLTO/X86/alias_internal.ll
@@ -0,0 +1,21 @@
+; Test to make sure dot dumper can correctly handle aliases to multiple
+; different internal aliasees with the same name.
+
+; RUN: opt -module-summary %s -o %t1.bc
+; RUN: opt -module-summary %p/Inputs/alias_internal.ll -o %t2.bc
+; RUN: llvm-lto2 run %t1.bc %t2.bc -o %t.out -save-temps \
+; RUN:   -r %t1.bc,a1,plx \
+; RUN:   -r %t2.bc,a2,plx
+
+; RUN: cat %t.out.index.dot | FileCheck %s
+; CHECK-DAG: M0_12511626713252727690 -> M0_{{.*}} // alias
+; CHECK-DAG: M1_8129049334585965161 -> M1_{{.*}} // alias
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define internal i32 @f(i8*) unnamed_addr {
+    ret i32 42
+}
+
+@a1 = weak alias i32 (i8*), i32 (i8*)* @f
-- 
GitLab


From 974dfd1b517ed3286bb5369740e07b79123abbfe Mon Sep 17 00:00:00 2001
From: Wolfgang Pieb <Wolfgang.Pieb@sony.com>
Date: Sat, 3 Nov 2018 00:27:35 +0000
Subject: [PATCH 0937/1116] [DWARF v5] Verifier: Add checks for DW_FORM_strx*
 forms.

Adding functionality to the DWARF verifier for DWARF v5 strx* forms which
index into the string offsets table.

Differential Revision: https://reviews.llvm.org/D54049


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346061 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/DebugInfo/DWARF/DWARFVerifier.cpp         | 39 ++++++++
 test/DebugInfo/X86/dwarfdump-str-offsets.s    | 31 ++++++-
 .../llvm-dwarfdump/X86/verify_debug_info.s    |  4 +-
 .../tools/llvm-dwarfdump/X86/verify_strings.s | 88 +++++++++++++++++++
 4 files changed, 156 insertions(+), 6 deletions(-)
 create mode 100644 test/tools/llvm-dwarfdump/X86/verify_strings.s

diff --git a/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/lib/DebugInfo/DWARF/DWARFVerifier.cpp
index f3b242c47d7..128bd0651ba 100644
--- a/lib/DebugInfo/DWARF/DWARFVerifier.cpp
+++ b/lib/DebugInfo/DWARF/DWARFVerifier.cpp
@@ -611,6 +611,45 @@ unsigned DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die,
     }
     break;
   }
+  case DW_FORM_strx:
+  case DW_FORM_strx1:
+  case DW_FORM_strx2:
+  case DW_FORM_strx3:
+  case DW_FORM_strx4: {
+    auto Index = AttrValue.Value.getRawUValue();
+    auto DieCU = Die.getDwarfUnit();
+    // Check that we have a valid DWARF v5 string offsets table.
+    if (!DieCU->getStringOffsetsTableContribution()) {
+      ++NumErrors;
+      error() << FormEncodingString(Form)
+              << " used without a valid string offsets table:\n";
+      dump(Die) << '\n';
+      break;
+    }
+    // Check that the index is within the bounds of the section. 
+    unsigned ItemSize = DieCU->getDwarfStringOffsetsByteSize();
+    // Use a 64-bit type to calculate the offset to guard against overflow.
+    uint64_t Offset =
+        (uint64_t)DieCU->getStringOffsetsBase() + Index * ItemSize;
+    if (DObj.getStringOffsetSection().Data.size() < Offset + ItemSize) {
+      ++NumErrors;
+      error() << FormEncodingString(Form) << " uses index "
+              << format("%" PRIu64, Index) << ", which is too large:\n";
+      dump(Die) << '\n';
+      break;
+    }
+    // Check that the string offset is valid.
+    uint64_t StringOffset = *DieCU->getStringOffsetSectionItem(Index);
+    if (StringOffset >= DObj.getStringSection().size()) {
+      ++NumErrors;
+      error() << FormEncodingString(Form) << " uses index "
+              << format("%" PRIu64, Index)
+              << ", but the referenced string"
+                 " offset is beyond .debug_str bounds:\n";
+      dump(Die) << '\n';
+    }
+    break;
+  }
   default:
     break;
   }
diff --git a/test/DebugInfo/X86/dwarfdump-str-offsets.s b/test/DebugInfo/X86/dwarfdump-str-offsets.s
index e68f08b9c7a..2f4215a04ba 100644
--- a/test/DebugInfo/X86/dwarfdump-str-offsets.s
+++ b/test/DebugInfo/X86/dwarfdump-str-offsets.s
@@ -1,5 +1,6 @@
 # RUN: llvm-mc -triple x86_64-unknown-linux %s -filetype=obj -o %t.o
 # RUN: llvm-dwarfdump -v %t.o 2> %t.err | FileCheck --check-prefix=COMMON --check-prefix=SPLIT %s
+# RUN: llvm-dwarfdump -verify %t.o | FileCheck --check-prefix=VERIFY %s
 # 
 # Check that we don't report an error on a non-existent range list table.
 # RUN: FileCheck -allow-empty --check-prefix ERR %s < %t.err
@@ -136,6 +137,8 @@ dwo_str_TU_5_type:
         .byte 0x00  # DW_CHILDREN_no
         .byte 0x03  # DW_AT_name
         .byte 0x26  # DW_FORM_strx2
+        .byte 0x49  # DW_AT_type
+        .byte 0x13  # DW_FORM_ref4
         .byte 0x00  # EOM(1)
         .byte 0x00  # EOM(2)
         .byte 0x06  # Abbrev code
@@ -143,6 +146,8 @@ dwo_str_TU_5_type:
         .byte 0x00  # DW_CHILDREN_no
         .byte 0x03  # DW_AT_name
         .byte 0x27  # DW_FORM_strx3
+        .byte 0x49  # DW_AT_type
+        .byte 0x13  # DW_FORM_ref4
         .byte 0x00  # EOM(1)
         .byte 0x00  # EOM(2)
         .byte 0x07  # Abbrev code
@@ -150,6 +155,15 @@ dwo_str_TU_5_type:
         .byte 0x00  # DW_CHILDREN_no
         .byte 0x03  # DW_AT_name
         .byte 0x28  # DW_FORM_strx4
+        .byte 0x49  # DW_AT_type
+        .byte 0x13  # DW_FORM_ref4
+        .byte 0x00  # EOM(1)
+        .byte 0x00  # EOM(2)
+        .byte 0x08  # Abbrev code
+        .byte 0x24  # DW_TAG_base_type
+        .byte 0x00  # DW_CHILDREN_no
+        .byte 0x3e  # DW_AT_encoding
+        .byte 0x0b  # DW_FORM_data1
         .byte 0x00  # EOM(1)
         .byte 0x00  # EOM(2)
         .byte 0x00  # EOM(3)
@@ -202,17 +216,24 @@ CU1_5_version:
 # A subprogram DIE with DW_AT_name, using DW_FORM_strx1.
         .byte 4                # Abbreviation code
         .byte 3                # Subprogram name string (DW_FORM_strx1)
-# A variable DIE with DW_AT_name, using DW_FORM_strx2.
+# A variable DIE with DW_AT_name, using DW_FORM_strx2, and DW_AT_type.
         .byte 5                # Abbreviation code
         .short 0x0004          # Subprogram name string (DW_FORM_strx2)
-# A variable DIE with DW_AT_name, using DW_FORM_strx3.
+        .long TypeDie-.debug_info
+# A variable DIE with DW_AT_name, using DW_FORM_strx3, and DW_AT_type.
         .byte 6                # Abbreviation code
         .byte 5                # Subprogram name string (DW_FORM_strx3)
         .short 0               # Subprogram name string (DW_FORM_strx3)
-# A variable DIE with DW_AT_name, using DW_FORM_strx4.
+        .long TypeDie-.debug_info
+# A variable DIE with DW_AT_name, using DW_FORM_strx4, and DW_AT_type.
         .byte 7                # Abbreviation code
-        .quad 0x00000006       # Subprogram name string (DW_FORM_strx4)
+        .long 6                # Subprogram name string (DW_FORM_strx4)
+        .long TypeDie-.debug_info
         .byte 0 # NULL
+# A base type DIE with DW_AT_encoding.
+TypeDie:
+        .byte 8                # Abbreviation code
+        .byte 5                # DW_ATE_signed
         .byte 0 # NULL
         .byte 0 # NULL
 CU1_5_end:
@@ -386,4 +407,6 @@ TU_split_5_end:
 # SPLIT-NEXT:  0x00000014: 00000047 "V5_split_type_unit"
 # SPLIT-NEXT:  0x00000018: 0000005a "V5_split_Mystruct"
 
+# VERIFY: No errors.
+
 # ERR-NOT: parsing a range list table:
diff --git a/test/tools/llvm-dwarfdump/X86/verify_debug_info.s b/test/tools/llvm-dwarfdump/X86/verify_debug_info.s
index e5a748b89f9..e3eae9b986f 100644
--- a/test/tools/llvm-dwarfdump/X86/verify_debug_info.s
+++ b/test/tools/llvm-dwarfdump/X86/verify_debug_info.s
@@ -7,7 +7,7 @@
 # CHECK-NEXT: DW_AT_producer [DW_FORM_strp]	( .debug_str[0x00000000] = "clang version 5.0.0 (trunk 308185) (llvm/trunk 308186)")
 # CHECK-NEXT: DW_AT_language [DW_FORM_data2]	(DW_LANG_C99)
 # CHECK-NEXT: DW_AT_name [DW_FORM_strp]	( .debug_str[0x00000037] = "basic.c")
-# CHECK-NEXT: DW_AT_stmt_list [DW_FORM_strx4]	( indexed (00000000) string = )
+# CHECK-NEXT: DW_AT_stmt_list [DW_FORM_block4]
 # CHECK-NEXT: DW_AT_comp_dir [DW_FORM_strp]	( .debug_str[0x0000003f] = "/Users/sgravani/Development/tests")
 # CHECK-NEXT: DW_AT_low_pc [DW_FORM_addr]	(0x0000000000000000)
 # CHECK-NEXT: DW_AT_high_pc [DW_FORM_data4]	(0x00000016){{[[:space:]]}}
@@ -82,7 +82,7 @@ Lsection_abbrev:
 	.byte	3                       ## DW_AT_name
 	.byte	14                      ## DW_FORM_strp
 	.byte	16                      ## DW_AT_stmt_list
-	.byte	40                      ## DW_FORM_sec_offset -- error: DIE has invalid DW_AT_stmt_list encoding:
+	.byte	4                       ## DW_FORM_sec_offset -- error: DIE has invalid DW_AT_stmt_list encoding:
 	.byte	27                      ## DW_AT_comp_dir
 	.byte	14                      ## DW_FORM_strp
 	.byte	17                      ## DW_AT_low_pc
diff --git a/test/tools/llvm-dwarfdump/X86/verify_strings.s b/test/tools/llvm-dwarfdump/X86/verify_strings.s
new file mode 100644
index 00000000000..e09ffd502cb
--- /dev/null
+++ b/test/tools/llvm-dwarfdump/X86/verify_strings.s
@@ -0,0 +1,88 @@
+# RUN: llvm-mc -triple x86_64-unknown-linux %s -filetype=obj -o %t.o
+# RUN: not llvm-dwarfdump -verify %t.o | FileCheck --check-prefix=VERIFY %s
+
+# Check that the verifier correctly diagnoses various error conditions with
+# the usage of string indices and string offsets tables.
+
+        .section .debug_str,"MS",@progbits,1
+str_producer:
+        .asciz "Handmade DWARF producer"
+
+        .section .debug_str_offsets,"",@progbits
+# The string offsets table
+        .long .debug_str_offsets_segment0_end-.debug_str_offsets_base0+4
+        .short 5    # DWARF version
+        .short 0    # Padding
+.debug_str_offsets_base0:
+        .long str_producer
+        .long 1000  # Invalid string address.
+.debug_str_offsets_segment0_end:
+
+# A simple abbrev section with a basic compile unit DIE.
+        .section .debug_abbrev,"",@progbits
+        .byte 0x01  # Abbrev code
+        .byte 0x11  # DW_TAG_compile_unit
+        .byte 0x01  # DW_CHILDREN_no
+        .byte 0x25  # DW_AT_producer
+        .byte 0x1a  # DW_FORM_strx
+        .byte 0x72  # DW_AT_str_offsets_base
+        .byte 0x17  # DW_FORM_sec_offset
+        .byte 0x00  # EOM(1)
+        .byte 0x00  # EOM(2)
+
+        .section .debug_info,"",@progbits
+
+# The first unit's CU DIE has an invalid DW_AT_str_offsets_base which
+# renders any string index unresolvable.
+
+# DWARF v5 CU header.
+        .long  CU1_5_end-CU1_5_version  # Length of Unit
+CU1_5_version:
+        .short 5               # DWARF version number
+        .byte 1                # DWARF Unit Type
+        .byte 8                # Address Size (in bytes)
+        .long .debug_abbrev    # Offset Into Abbrev. Section
+# The compile-unit DIE, which has DW_AT_producer and DW_AT_str_offsets.
+        .byte 1                # Abbreviation code
+        .byte 0                # Index of string for DW_AT_producer.
+        .long 1000             # Bad value for DW_AT_str_offsets_base
+        .byte 0 # NULL
+CU1_5_end:
+
+# The second unit's CU DIE uses an invalid string index.
+
+# DWARF v5 CU header
+        .long  CU2_5_end-CU2_5_version  # Length of Unit
+CU2_5_version:
+        .short 5               # DWARF version number
+        .byte 1                # DWARF Unit Type
+        .byte 8                # Address Size (in bytes)
+        .long .debug_abbrev    # Offset Into Abbrev. Section
+# The compile-unit DIE, which has DW_AT_producer and DW_AT_str_offsets.
+        .byte 1                # Abbreviation code
+        .byte 100              # Invalid string index
+        .long .debug_str_offsets_base0
+        .byte 0 # NULL
+CU2_5_end:
+
+# The third unit's CU DIE uses a valid string index but the entry in the 
+# string offsets table is invalid. 
+
+# DWARF v5 CU header
+        .long  CU3_5_end-CU3_5_version  # Length of Unit
+CU3_5_version:
+        .short 5               # DWARF version number
+        .byte 1                # DWARF Unit Type
+        .byte 8                # Address Size (in bytes)
+        .long .debug_abbrev    # Offset Into Abbrev. Section
+# The compile-unit DIE, which has DW_AT_producer and DW_AT_str_offsets.
+        .byte 1                # Abbreviation code
+        .byte 1                # Index of string for DW_AT_producer.
+        .long .debug_str_offsets_base0
+        .byte 0 # NULL
+CU3_5_end:
+        
+# VERIFY-DAG:      error: DW_FORM_strx used without a valid string offsets table:
+# VERIFY-DAG:      error: DW_FORM_strx uses index 100, which is too large:
+# VERIFY-DAG:      error: DW_FORM_strx uses index 1, but the referenced string offset 
+# VERIFY-DAG-SAME: is beyond .debug_str bounds:
-- 
GitLab


From cc8a1a635b5f04eca98fb6dd415d31bfaa378fbc Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Sat, 3 Nov 2018 00:41:52 +0000
Subject: [PATCH 0938/1116] [codeview] Let the X86 backend tell us the VFRAME
 offset adjustment

Use MachineFrameInfo's OffsetAdjustment field to pass this information
from the target to CodeViewDebug.cpp. The X86 backend doesn't use it for
any other purpose.

This fixes PR38857 in the case where there is a non-aligned quantity of
CSRs and a non-aligned quantity of locals.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346062 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/AsmPrinter/CodeViewDebug.cpp  |  11 +-
 lib/CodeGen/AsmPrinter/CodeViewDebug.h    |   3 +
 lib/Target/X86/X86FrameLowering.cpp       |  18 +--
 test/CodeGen/MIR/X86/diexpr-win32.mir     |   4 +-
 test/DebugInfo/COFF/fpo-realign-vframe.ll |  12 +-
 test/DebugInfo/COFF/vframe-csr.ll         | 179 ++++++++++++++++++++++
 6 files changed, 202 insertions(+), 25 deletions(-)
 create mode 100644 test/DebugInfo/COFF/vframe-csr.ll

diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
index 9b2b3477be7..01d018fdde3 100644
--- a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
@@ -1339,6 +1339,7 @@ void CodeViewDebug::beginFunctionImpl(const MachineFunction *MF) {
   // instruction (AArch64), this will be zero.
   CurFn->CSRSize = MFI.getCVBytesOfCalleeSavedRegisters();
   CurFn->FrameSize = MFI.getStackSize();
+  CurFn->OffsetAdjustment = MFI.getOffsetAdjustment();
   CurFn->HasStackRealignment = TRI->needsStackRealignment(*MF);
 
   // For this function S_FRAMEPROC record, figure out which codeview register
@@ -2599,16 +2600,10 @@ void CodeViewDebug::emitLocalVariable(const FunctionInfo &FI,
 
       // 32-bit x86 call sequences often use PUSH instructions, which disrupt
       // ESP-relative offsets. Use the virtual frame pointer, VFRAME or $T0,
-      // instead. In simple cases, $T0 will be the CFA.
+      // instead. In frames without stack realignment, $T0 will be the CFA.
       if (RegisterId(Reg) == RegisterId::ESP) {
         Reg = unsigned(RegisterId::VFRAME);
-        Offset -= FI.FrameSize;
-
-        // If the frame requires realignment, VFRAME will be ESP after it is
-        // aligned. We have to remove the ESP adjustments made to push CSRs and
-        // EBP. EBP is not included in CSRSize.
-        if (FI.HasStackRealignment)
-          Offset += FI.CSRSize + 4;
+        Offset += FI.OffsetAdjustment;
       }
 
       // If we can use the chosen frame pointer for the frame and this isn't a
diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.h b/lib/CodeGen/AsmPrinter/CodeViewDebug.h
index b6fbdc1373f..ef0f0c3635e 100644
--- a/lib/CodeGen/AsmPrinter/CodeViewDebug.h
+++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.h
@@ -153,6 +153,9 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase {
     /// Number of bytes pushed to save CSRs.
     unsigned CSRSize = 0;
 
+    /// Adjustment to apply on x86 when using the VFRAME frame pointer.
+    int OffsetAdjustment = 0;
+
     /// Two-bit value indicating which register is the designated frame pointer
     /// register for local variables. Included in S_FRAMEPROC.
     codeview::EncodedFramePtrReg EncodedLocalFramePtrReg =
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index 1eb9fa0bc1e..67ec867b562 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -1103,15 +1103,6 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
     if (TRI->needsStackRealignment(MF) && !IsWin64Prologue)
       NumBytes = alignTo(NumBytes, MaxAlign);
 
-    // Get the offset of the stack slot for the EBP register, which is
-    // guaranteed to be the last slot by processFunctionBeforeFrameFinalized.
-    // Update the frame offset adjustment.
-    if (!IsFunclet)
-      MFI.setOffsetAdjustment(-NumBytes);
-    else
-      assert(MFI.getOffsetAdjustment() == -(int)NumBytes &&
-             "should calculate same local variable offset for funclets");
-
     // Save EBP/RBP into the appropriate stack slot.
     BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
       .addReg(MachineFramePtr, RegState::Kill)
@@ -1167,6 +1158,15 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
     NumBytes = StackSize - X86FI->getCalleeSavedFrameSize();
   }
 
+  // Update the offset adjustment, which is mainly used by codeview to translate
+  // from ESP to VFRAME relative local variable offsets.
+  if (!IsFunclet) {
+    if (HasFP && TRI->needsStackRealignment(MF))
+      MFI.setOffsetAdjustment(-NumBytes);
+    else
+      MFI.setOffsetAdjustment(-StackSize);
+  }
+
   // For EH funclets, only allocate enough space for outgoing calls. Save the
   // NumBytes value that we would've used for the parent frame.
   unsigned ParentFrameNumBytes = NumBytes;
diff --git a/test/CodeGen/MIR/X86/diexpr-win32.mir b/test/CodeGen/MIR/X86/diexpr-win32.mir
index 384c6bf57c9..7c0461233bb 100644
--- a/test/CodeGen/MIR/X86/diexpr-win32.mir
+++ b/test/CodeGen/MIR/X86/diexpr-win32.mir
@@ -15,7 +15,7 @@
 # CHECK-NEXT: }
 # CHECK-NEXT: DefRangeFramePointerRelSym {
 # CHECK-NEXT:   Kind: S_DEFRANGE_FRAMEPOINTER_REL (0x1142)
-# CHECK-NEXT:   Offset: 8
+# CHECK-NEXT:   Offset: 12
 # CHECK-NEXT:   LocalVariableAddrRange {
 # CHECK-NEXT:     OffsetStart:
 # CHECK-NEXT:     ISectStart:
@@ -32,7 +32,7 @@
 # CHECK-NEXT: }
 # CHECK-NEXT: DefRangeFramePointerRelSym {
 # CHECK-NEXT:   Kind: S_DEFRANGE_FRAMEPOINTER_REL (0x1142)
-# CHECK-NEXT:   Offset: 4
+# CHECK-NEXT:   Offset: 8
 # CHECK-NEXT:   LocalVariableAddrRange {
 # CHECK-NEXT:     OffsetStart: .text+0x5
 # CHECK-NEXT:     ISectStart: 0x0
diff --git a/test/DebugInfo/COFF/fpo-realign-vframe.ll b/test/DebugInfo/COFF/fpo-realign-vframe.ll
index fded804a531..e5f8d5d34dc 100644
--- a/test/DebugInfo/COFF/fpo-realign-vframe.ll
+++ b/test/DebugInfo/COFF/fpo-realign-vframe.ll
@@ -83,12 +83,12 @@
 ; OBJ:   }
 ; OBJ:   FrameData {
 ; OBJ:     FrameFunc [
-; OBJ:       $T1 $ebp 4 + =
-; OBJ:       $T0 $T1 4 - 8 @ =
-; OBJ:       $eip $T1 ^ =
-; OBJ:       $esp $T1 4 + =
-; OBJ:       $ebp $T1 4 - ^ =
-; OBJ:     ]
+; OBJ-NEXT:   $T1 $ebp 4 + =
+; OBJ-NEXT:   $T0 $T1 4 - 8 @ =
+; OBJ-NEXT:   $eip $T1 ^ =
+; OBJ-NEXT:   $esp $T1 4 + =
+; OBJ-NEXT:   $ebp $T1 4 - ^ =
+; OBJ-NEXT: ]
 ; OBJ:   }
 ; OBJ: ]
 ; OBJ: Subsection [
diff --git a/test/DebugInfo/COFF/vframe-csr.ll b/test/DebugInfo/COFF/vframe-csr.ll
new file mode 100644
index 00000000000..1c1c0cec50e
--- /dev/null
+++ b/test/DebugInfo/COFF/vframe-csr.ll
@@ -0,0 +1,179 @@
+; RUN: llc < %s | FileCheck %s --check-prefix=ASM
+; RUN: llc -filetype=obj < %s | llvm-readobj -codeview | FileCheck %s --check-prefix=OBJ
+
+; PR38857
+
+; This test case is identical to the fpo-realign-vframe.ll test, except it uses
+; two callee-saved registers.
+
+; Match the prologue for the .cv_fpo* directives.
+; ASM-LABEL: _realign_with_csrs:
+; ASM:         .cv_fpo_proc    _realign_with_csrs 0
+; ASM: # %bb.0:                                # %entry
+; ASM:         pushl   %ebp
+; ASM:         .cv_fpo_pushreg %ebp
+; ASM:         movl    %esp, %ebp
+; ASM:         .cv_fpo_setframe        %ebp
+; ASM:         andl    $-8, %esp
+; ASM:         .cv_fpo_stackalign      8
+; FIXME: Why 24 bytes? We only need 12 bytes of data.
+; ASM:         subl    $24, %esp
+; ASM:         .cv_fpo_stackalloc      24
+; ASM:         .cv_fpo_endprologue
+
+; 'x' should be EBP-relative, 'a' and 'force_alignment' ESP relative.
+; ASM:         calll   _getval
+; ASM-DAG:     leal    8(%esp), %[[LEA_DBL:[^ ]*]]
+; ASM-DAG:     leal    4(%esp), %[[LEA_A:[^ ]*]]
+; ASM:         pushl   %[[LEA_DBL]]
+; ASM:         pushl   %[[LEA_A]]
+; ASM:         pushl   %[[LEA_A]]
+; ASM:         calll   _usevals
+; ASM:         addl    $12, %esp
+
+; OBJ: Subsection [
+; OBJ:   SubSectionType: Symbols (0xF1)
+; OBJ: ]
+; OBJ: Subsection [
+; OBJ:   SubSectionType: FrameData (0xF5)
+;   	Really, the only important FrameFunc is the last one.
+; OBJ:   FrameData {
+; OBJ:   }
+; OBJ:   FrameData {
+; OBJ:   }
+; OBJ:   FrameData {
+; OBJ:   }
+; OBJ:   FrameData {
+; OBJ:   }
+; OBJ:   FrameData {
+; OBJ:     FrameFunc [
+; OBJ-NEXT:   $T1 $ebp 4 + =
+; OBJ-NEXT:   $T0 $T1 8 - 8 @ =
+; OBJ-NEXT:   $eip $T1 ^ =
+; OBJ-NEXT:   $esp $T1 4 + =
+; OBJ-NEXT:   $ebp $T1 4 - ^ =
+; OBJ-NEXT:   $esi $T1 8 - ^ =
+; OBJ-NEXT: ]
+; OBJ:   }
+; OBJ: ]
+; OBJ: Subsection [
+; OBJ:   SubSectionType: Symbols (0xF1)
+; OBJ:   GlobalProcIdSym {
+; OBJ:     Kind: S_GPROC32_ID (0x1147)
+; OBJ:     DisplayName: realign_with_csrs
+; OBJ:     LinkageName: _realign_with_csrs
+; OBJ:   }
+; 	The frame register for locals should be VFRAME, and EBP for parameters.
+; OBJ:   FrameProcSym {
+; OBJ:     Kind: S_FRAMEPROC (0x1012)
+; OBJ:     TotalFrameBytes: 0x18
+; OBJ:     LocalFramePtrReg: VFRAME (0x7536)
+; OBJ:     ParamFramePtrReg: EBP (0x16)
+; OBJ:   }
+; 	ESP is VFRAME - 24, ESP offset of 'a' is 4, so -20.
+; OBJ:   LocalSym {
+; OBJ:     Kind: S_LOCAL (0x113E)
+; OBJ:     Type: int (0x74)
+; OBJ:     Flags [ (0x0)
+; OBJ:     ]
+; OBJ:     VarName: a
+; OBJ:   }
+; OBJ:   DefRangeFramePointerRelSym {
+; OBJ:     Kind: S_DEFRANGE_FRAMEPOINTER_REL (0x1142)
+; OBJ:     Offset: -20
+; OBJ:   }
+; 	ESP is VFRAME - 16, ESP offset of 'force_alignment' is 8, so -8.
+; OBJ:   LocalSym {
+; OBJ:     Kind: S_LOCAL (0x113E)
+; OBJ:     Type: double (0x41)
+; OBJ:     Flags [ (0x0)
+; OBJ:     ]
+; OBJ:     VarName: force_alignment
+; OBJ:   }
+; OBJ:   DefRangeFramePointerRelSym {
+; OBJ:     Kind: S_DEFRANGE_FRAMEPOINTER_REL (0x1142)
+; OBJ:     Offset: -16
+; OBJ:   }
+; OBJ:   ProcEnd {
+; OBJ:     Kind: S_PROC_ID_END (0x114F)
+; OBJ:   }
+; OBJ: ]
+
+; ModuleID = 't.c'
+source_filename = "t.c"
+target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
+target triple = "i386-pc-windows-msvc19.14.26433"
+
+; Function Attrs: nounwind
+define dso_local i32 @realign_with_csrs() local_unnamed_addr #0 !dbg !8 {
+entry:
+  %a = alloca i32, align 4
+  %force_alignment = alloca double, align 8
+  %0 = bitcast i32* %a to i8*, !dbg !22
+  call void @llvm.dbg.declare(metadata i32* %a, metadata !14, metadata !DIExpression()), !dbg !22
+  %csr1 = tail call i32 @getval() #4
+  %call = tail call i32 @getval() #4, !dbg !22
+  store i32 %call, i32* %a, align 4, !dbg !22, !tbaa !17
+  %1 = bitcast double* %force_alignment to i8*, !dbg !23
+  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %1) #4, !dbg !23
+  call void @llvm.dbg.declare(metadata double* %force_alignment, metadata !15, metadata !DIExpression()), !dbg !23
+  store double 4.200000e-01, double* %force_alignment, align 8, !dbg !23, !tbaa !24
+  call void @usevals(i32* nonnull %a, i32* nonnull %a, double* nonnull %force_alignment) #4, !dbg !26
+  call void @usecsrs(i32 %csr1, i32 %csr1)
+  ret i32 0
+}
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #2
+
+declare dso_local i32 @getval() local_unnamed_addr #3
+
+declare dso_local void @usevals(i32*, i32*, double*) local_unnamed_addr #3
+
+declare dso_local void @usecsrs(i32, i32) local_unnamed_addr #3
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #2
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone speculatable }
+attributes #2 = { argmemonly nounwind }
+attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5, !6}
+!llvm.ident = !{!7}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 8.0.0 ", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None)
+!1 = !DIFile(filename: "t.c", directory: "C:\5Csrc\5Cllvm-project\5Cbuild", checksumkind: CSK_MD5, checksum: "a646950309d5d01d8087fc10fea33941")
+!2 = !{}
+!3 = !{i32 1, !"NumRegisterParameters", i32 0}
+!4 = !{i32 2, !"CodeView", i32 1}
+!5 = !{i32 2, !"Debug Info Version", i32 3}
+!6 = !{i32 1, !"wchar_size", i32 2}
+!7 = !{!"clang version 8.0.0 "}
+!8 = distinct !DISubprogram(name: "realign_with_csrs", scope: !1, file: !1, line: 3, type: !9, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+!9 = !DISubroutineType(types: !10)
+!10 = !{!11, !11}
+!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!12 = !{!13, !14, !15}
+!13 = !DILocalVariable(name: "x", arg: 1, scope: !8, file: !1, line: 3, type: !11)
+!14 = !DILocalVariable(name: "a", scope: !8, file: !1, line: 4, type: !11)
+!15 = !DILocalVariable(name: "force_alignment", scope: !8, file: !1, line: 5, type: !16, align: 64)
+!16 = !DIBasicType(name: "double", size: 64, encoding: DW_ATE_float)
+!17 = !{!18, !18, i64 0}
+!18 = !{!"int", !19, i64 0}
+!19 = !{!"omnipotent char", !20, i64 0}
+!20 = !{!"Simple C/C++ TBAA"}
+!21 = !DILocation(line: 3, scope: !8)
+!22 = !DILocation(line: 4, scope: !8)
+!23 = !DILocation(line: 5, scope: !8)
+!24 = !{!25, !25, i64 0}
+!25 = !{!"double", !19, i64 0}
+!26 = !DILocation(line: 6, scope: !8)
+!27 = !DILocation(line: 7, scope: !8)
+!28 = !DILocation(line: 8, scope: !8)
-- 
GitLab


From 7fad5fb0d0d32beea4e95e239cc065a850733358 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Sat, 3 Nov 2018 13:18:55 +0000
Subject: [PATCH 0939/1116] [ValueTracking] peek through 2-input shuffles in
 ComputeNumSignBits

This patch gives the IR ComputeNumSignBits the same functionality as the
DAG version (the code is derived from the existing code).

This an extension of the single input shuffle analysis added with D53659.

Differential Revision: https://reviews.llvm.org/D53987


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346071 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Analysis/ValueTracking.cpp                | 53 ++++++++++++-------
 test/Transforms/InstCombine/logical-select.ll |  8 ++-
 unittests/Analysis/ValueTrackingTest.cpp      |  3 +-
 3 files changed, 38 insertions(+), 26 deletions(-)

diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp
index 89a621576ec..6e08272c64e 100644
--- a/lib/Analysis/ValueTracking.cpp
+++ b/lib/Analysis/ValueTracking.cpp
@@ -2512,26 +2512,41 @@ static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth,
     return ComputeNumSignBits(U->getOperand(0), Depth + 1, Q);
 
   case Instruction::ShuffleVector: {
-    // If the shuffle mask contains any undefined elements, that element of the
-    // result is undefined. Propagating information from a source operand may
-    // not be correct in that case, so just bail out.
-    if (cast<ShuffleVectorInst>(U)->getMask()->containsUndefElement())
-      break;
-
-    // If everything is undef, we can't say anything. This should be simplified.
-    Value *Op0 = U->getOperand(0), *Op1 = U->getOperand(1);
-    if (isa<UndefValue>(Op0) && isa<UndefValue>(Op1))
+    // TODO: This is copied almost directly from the SelectionDAG version of
+    //       ComputeNumSignBits. It would be better if we could share common
+    //       code. If not, make sure that changes are translated to the DAG.
+
+    // Collect the minimum number of sign bits that are shared by every vector
+    // element referenced by the shuffle.
+    auto *Shuf = cast<ShuffleVectorInst>(U);
+    int NumElts = Shuf->getOperand(0)->getType()->getVectorNumElements();
+    int NumMaskElts = Shuf->getMask()->getType()->getVectorNumElements();
+    APInt DemandedLHS(NumElts, 0), DemandedRHS(NumElts, 0);
+    for (int i = 0; i != NumMaskElts; ++i) {
+      int M = Shuf->getMaskValue(i);
+      assert(M < NumElts * 2 && "Invalid shuffle mask constant");
+      // For undef elements, we don't know anything about the common state of
+      // the shuffle result.
+      if (M == -1)
+        return 1;
+      if (M < NumElts)
+        DemandedLHS.setBit(M % NumElts);
+      else
+        DemandedRHS.setBit(M % NumElts);
+    }
+    Tmp = std::numeric_limits<unsigned>::max();
+    if (!!DemandedLHS)
+      Tmp = ComputeNumSignBits(Shuf->getOperand(0), Depth + 1, Q);
+    if (!!DemandedRHS) {
+      Tmp2 = ComputeNumSignBits(Shuf->getOperand(1), Depth + 1, Q);
+      Tmp = std::min(Tmp, Tmp2);
+    }
+    // If we don't know anything, early out and try computeKnownBits fall-back.
+    if (Tmp == 1)
       break;
-
-    // Look through shuffle of 1 source vector.
-    if (isa<UndefValue>(Op0))
-      return ComputeNumSignBits(Op1, Depth + 1, Q);
-    if (isa<UndefValue>(Op1))
-      return ComputeNumSignBits(Op0, Depth + 1, Q);
-
-    // TODO: We can look through shuffles of 2 sources by computing the minimum
-    // sign bits for each operand (similar to what we do for binops).
-    break;
+    assert(Tmp <= V->getType()->getScalarSizeInBits() &&
+           "Failed to determine minimum sign bits");
+    return Tmp;
   }
   }
 
diff --git a/test/Transforms/InstCombine/logical-select.ll b/test/Transforms/InstCombine/logical-select.ll
index 888c6a544ea..999e4512723 100644
--- a/test/Transforms/InstCombine/logical-select.ll
+++ b/test/Transforms/InstCombine/logical-select.ll
@@ -621,11 +621,9 @@ define <4 x i32> @computesignbits_through_two_input_shuffle(<4 x i32> %x, <4 x i
 ; CHECK-NEXT:    [[SEXT1:%.*]] = sext <4 x i1> [[COND1:%.*]] to <4 x i32>
 ; CHECK-NEXT:    [[SEXT2:%.*]] = sext <4 x i1> [[COND2:%.*]] to <4 x i32>
 ; CHECK-NEXT:    [[COND:%.*]] = shufflevector <4 x i32> [[SEXT1]], <4 x i32> [[SEXT2]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[NOTCOND:%.*]] = xor <4 x i32> [[COND]], <i32 -1, i32 -1, i32 -1, i32 -1>
-; CHECK-NEXT:    [[AND1:%.*]] = and <4 x i32> [[NOTCOND]], [[X:%.*]]
-; CHECK-NEXT:    [[AND2:%.*]] = and <4 x i32> [[COND]], [[Y:%.*]]
-; CHECK-NEXT:    [[SEL:%.*]] = or <4 x i32> [[AND1]], [[AND2]]
-; CHECK-NEXT:    ret <4 x i32> [[SEL]]
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <4 x i32> [[COND]] to <4 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[Y:%.*]], <4 x i32> [[X:%.*]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 ;
   %sext1 = sext <4 x i1> %cond1 to <4 x i32>
   %sext2 = sext <4 x i1> %cond2 to <4 x i32>
diff --git a/unittests/Analysis/ValueTrackingTest.cpp b/unittests/Analysis/ValueTrackingTest.cpp
index e66d8f77fd4..b13948adad7 100644
--- a/unittests/Analysis/ValueTrackingTest.cpp
+++ b/unittests/Analysis/ValueTrackingTest.cpp
@@ -514,7 +514,6 @@ TEST(ValueTracking, ComputeNumSignBits_Shuffle) {
   EXPECT_EQ(ComputeNumSignBits(RVal, M->getDataLayout()), 1u);
 }
 
-// FIXME:
 // No guarantees for canonical IR in this analysis, so a shuffle element that
 // references an undef value means this can't return any extra information. 
 TEST(ValueTracking, ComputeNumSignBits_Shuffle2) {
@@ -534,7 +533,7 @@ TEST(ValueTracking, ComputeNumSignBits_Shuffle2) {
 
   auto *RVal =
       cast<ReturnInst>(F->getEntryBlock().getTerminator())->getOperand(0);
-  EXPECT_EQ(ComputeNumSignBits(RVal, M->getDataLayout()), 32u);
+  EXPECT_EQ(ComputeNumSignBits(RVal, M->getDataLayout()), 1u);
 }
 
 TEST(ValueTracking, ComputeKnownBits) {
-- 
GitLab


From ad3c2dda979bc6d822b0db03845ff27b54480a12 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Sat, 3 Nov 2018 19:49:13 +0000
Subject: [PATCH 0940/1116] [X86] Update comment I forgot to change in r346043.
 NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346073 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 7bf1cc6875e..0a97bf39641 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -38312,8 +38312,8 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
   if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
     return SDValue();
 
-  // On AVX2+ targets, if the input/output types are both legal then we will be
-  // able to use SIGN_EXTEND/ZERO_EXTEND directly.
+  // If the input/output types are both legal then we have at least AVX1 and
+  // we will be able to use SIGN_EXTEND/ZERO_EXTEND directly.
   if (DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
       DAG.getTargetLoweringInfo().isTypeLegal(InVT))
     return SDValue();
-- 
GitLab


From 3ba729d27046f3f82af374ceebb5e8de3ee6a8e4 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Sun, 4 Nov 2018 02:10:18 +0000
Subject: [PATCH 0941/1116] [SelectionDAG] Remove special methods for creating
 *_EXTEND_VECTOR_INREG nodes. Move asserts into getNode.

These methods were just wrappers around getNode with additional asserts (identical and repeated 3 times). But getNode already has a switch that can be used to hold these asserts that allows them to be shared for all 3 opcodes. This also enables checking on the places that create these nodes without using the wrappers.

The rest of the patch is just changing all callers to use getNode directly.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346087 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/SelectionDAG.h           | 18 --------
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp      |  6 ++-
 .../SelectionDAG/LegalizeVectorOps.cpp        |  2 +-
 .../SelectionDAG/LegalizeVectorTypes.cpp      | 14 +++---
 lib/CodeGen/SelectionDAG/SelectionDAG.cpp     | 44 +++++--------------
 lib/Target/Hexagon/HexagonISelLoweringHVX.cpp |  3 +-
 lib/Target/X86/X86ISelLowering.cpp            | 33 ++++++++------
 unittests/CodeGen/AArch64SelectionDAGTest.cpp |  4 +-
 8 files changed, 45 insertions(+), 79 deletions(-)

diff --git a/include/llvm/CodeGen/SelectionDAG.h b/include/llvm/CodeGen/SelectionDAG.h
index 973a3ddb1ba..3b144b92e2a 100644
--- a/include/llvm/CodeGen/SelectionDAG.h
+++ b/include/llvm/CodeGen/SelectionDAG.h
@@ -786,24 +786,6 @@ public:
   /// value assuming it was the smaller SrcTy value.
   SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT);
 
-  /// Return an operation which will any-extend the low lanes of the operand
-  /// into the specified vector type. For example,
-  /// this can convert a v16i8 into a v4i32 by any-extending the low four
-  /// lanes of the operand from i8 to i32.
-  SDValue getAnyExtendVectorInReg(SDValue Op, const SDLoc &DL, EVT VT);
-
-  /// Return an operation which will sign extend the low lanes of the operand
-  /// into the specified vector type. For example,
-  /// this can convert a v16i8 into a v4i32 by sign extending the low four
-  /// lanes of the operand from i8 to i32.
-  SDValue getSignExtendVectorInReg(SDValue Op, const SDLoc &DL, EVT VT);
-
-  /// Return an operation which will zero extend the low lanes of the operand
-  /// into the specified vector type. For example,
-  /// this can convert a v16i8 into a v4i32 by zero extending the low four
-  /// lanes of the operand from i8 to i32.
-  SDValue getZeroExtendVectorInReg(SDValue Op, const SDLoc &DL, EVT VT);
-
   /// Convert Op, which must be of integer type, to the integer type VT,
   /// by using an extension appropriate for the target's
   /// BooleanContent for type OpVT or truncating it.
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index f318b7fdb39..83e9f2c23ca 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -9402,7 +9402,8 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
       N0.getOperand(0).getScalarValueSizeInBits() == EVTBits) {
     if (!LegalOperations ||
         TLI.isOperationLegal(ISD::SIGN_EXTEND_VECTOR_INREG, VT))
-      return DAG.getSignExtendVectorInReg(N0.getOperand(0), SDLoc(N), VT);
+      return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, SDLoc(N), VT,
+                         N0.getOperand(0));
   }
 
   // fold (sext_in_reg (zext x)) -> (sext x)
@@ -17049,7 +17050,8 @@ static SDValue combineShuffleToVectorExtend(ShuffleVectorSDNode *SVN,
       if (!LegalOperations ||
           TLI.isOperationLegalOrCustom(ISD::ANY_EXTEND_VECTOR_INREG, OutVT))
         return DAG.getBitcast(VT,
-                            DAG.getAnyExtendVectorInReg(N0, SDLoc(SVN), OutVT));
+                              DAG.getNode(ISD::ANY_EXTEND_VECTOR_INREG,
+                                          SDLoc(SVN), OutVT, N0));
   }
 
   return SDValue();
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 284c4e5b3dd..bfc00ea28ef 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -872,7 +872,7 @@ SDValue VectorLegalizer::ExpandSIGN_EXTEND_VECTOR_INREG(SDValue Op) {
 
   // First build an any-extend node which can be legalized above when we
   // recurse through it.
-  Op = DAG.getAnyExtendVectorInReg(Src, DL, VT);
+  Op = DAG.getNode(ISD::ANY_EXTEND_VECTOR_INREG, DL, VT, Src);
 
   // Now we need sign extend. Do this by shifting the elements. Even if these
   // aren't legal operations, they have a better chance of being legalized
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 59bd751f4ec..6b52b374cd0 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -2811,9 +2811,9 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
       // operations should be done with SIGN/ZERO_EXTEND_VECTOR_INREG, which
       // accepts fewer elements in the result than in the input.
       if (Opcode == ISD::SIGN_EXTEND)
-        return DAG.getSignExtendVectorInReg(InOp, DL, WidenVT);
+        return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, WidenVT, InOp);
       if (Opcode == ISD::ZERO_EXTEND)
-        return DAG.getZeroExtendVectorInReg(InOp, DL, WidenVT);
+        return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, WidenVT, InOp);
     }
   }
 
@@ -2883,11 +2883,9 @@ SDValue DAGTypeLegalizer::WidenVecRes_EXTEND_VECTOR_INREG(SDNode *N) {
     if (InVT.getSizeInBits() == WidenVT.getSizeInBits()) {
       switch (Opcode) {
       case ISD::ANY_EXTEND_VECTOR_INREG:
-        return DAG.getAnyExtendVectorInReg(InOp, DL, WidenVT);
       case ISD::SIGN_EXTEND_VECTOR_INREG:
-        return DAG.getSignExtendVectorInReg(InOp, DL, WidenVT);
       case ISD::ZERO_EXTEND_VECTOR_INREG:
-        return DAG.getZeroExtendVectorInReg(InOp, DL, WidenVT);
+        return DAG.getNode(Opcode, DL, WidenVT, InOp);
       }
     }
   }
@@ -3722,11 +3720,11 @@ SDValue DAGTypeLegalizer::WidenVecOp_EXTEND(SDNode *N) {
   default:
     llvm_unreachable("Extend legalization on extend operation!");
   case ISD::ANY_EXTEND:
-    return DAG.getAnyExtendVectorInReg(InOp, DL, VT);
+    return DAG.getNode(ISD::ANY_EXTEND_VECTOR_INREG, DL, VT, InOp);
   case ISD::SIGN_EXTEND:
-    return DAG.getSignExtendVectorInReg(InOp, DL, VT);
+    return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, VT, InOp);
   case ISD::ZERO_EXTEND:
-    return DAG.getZeroExtendVectorInReg(InOp, DL, VT);
+    return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, VT, InOp);
   }
 }
 
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 4d509c99c2e..66121c10a35 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -1118,39 +1118,6 @@ SDValue SelectionDAG::getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT) {
                  getConstant(Imm, DL, Op.getValueType()));
 }
 
-SDValue SelectionDAG::getAnyExtendVectorInReg(SDValue Op, const SDLoc &DL,
-                                              EVT VT) {
-  assert(VT.isVector() && "This DAG node is restricted to vector types.");
-  assert(VT.getSizeInBits() == Op.getValueSizeInBits() &&
-         "The sizes of the input and result must match in order to perform the "
-         "extend in-register.");
-  assert(VT.getVectorNumElements() < Op.getValueType().getVectorNumElements() &&
-         "The destination vector type must have fewer lanes than the input.");
-  return getNode(ISD::ANY_EXTEND_VECTOR_INREG, DL, VT, Op);
-}
-
-SDValue SelectionDAG::getSignExtendVectorInReg(SDValue Op, const SDLoc &DL,
-                                               EVT VT) {
-  assert(VT.isVector() && "This DAG node is restricted to vector types.");
-  assert(VT.getSizeInBits() == Op.getValueSizeInBits() &&
-         "The sizes of the input and result must match in order to perform the "
-         "extend in-register.");
-  assert(VT.getVectorNumElements() < Op.getValueType().getVectorNumElements() &&
-         "The destination vector type must have fewer lanes than the input.");
-  return getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, VT, Op);
-}
-
-SDValue SelectionDAG::getZeroExtendVectorInReg(SDValue Op, const SDLoc &DL,
-                                               EVT VT) {
-  assert(VT.isVector() && "This DAG node is restricted to vector types.");
-  assert(VT.getSizeInBits() == Op.getValueSizeInBits() &&
-         "The sizes of the input and result must match in order to perform the "
-         "extend in-register.");
-  assert(VT.getVectorNumElements() < Op.getValueType().getVectorNumElements() &&
-         "The destination vector type must have fewer lanes than the input.");
-  return getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, VT, Op);
-}
-
 /// getNOT - Create a bitwise NOT operation as (XOR Val, -1).
 SDValue SelectionDAG::getNOT(const SDLoc &DL, SDValue Val, EVT VT) {
   EVT EltVT = VT.getScalarType();
@@ -4196,6 +4163,17 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
     if (OpOpcode == ISD::UNDEF)
       return getUNDEF(VT);
     break;
+  case ISD::ANY_EXTEND_VECTOR_INREG:
+  case ISD::ZERO_EXTEND_VECTOR_INREG:
+  case ISD::SIGN_EXTEND_VECTOR_INREG:
+    assert(VT.isVector() && "This DAG node is restricted to vector types.");
+    assert(VT.getSizeInBits() == Operand.getValueSizeInBits() &&
+           "The sizes of the input and result must match in order to perform the "
+           "extend in-register.");
+    assert(VT.getVectorNumElements() <
+             Operand.getValueType().getVectorNumElements() &&
+           "The destination vector type must have fewer lanes than the input.");
+    break;
   case ISD::ABS:
     assert(VT.isInteger() && VT == Operand.getValueType() &&
            "Invalid ABS!");
diff --git a/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
index b931f606ee5..a6400b5d826 100644
--- a/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
+++ b/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
@@ -1426,7 +1426,8 @@ SDValue
 HexagonTargetLowering::LowerHvxExtend(SDValue Op, SelectionDAG &DAG) const {
   // Sign- and zero-extends are legal.
   assert(Op.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG);
-  return DAG.getZeroExtendVectorInReg(Op.getOperand(0), SDLoc(Op), ty(Op));
+  return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(Op), ty(Op),
+                     Op.getOperand(0));
 }
 
 SDValue
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 0a97bf39641..905b99590a6 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -5453,8 +5453,9 @@ static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In,
   assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode");
 
   if (VT.is128BitVector() && InVT.is128BitVector())
-    return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT)
-                                : DAG.getZeroExtendVectorInReg(In, DL, VT);
+    return DAG.getNode(X86ISD::VSEXT == Opc ? ISD::SIGN_EXTEND_VECTOR_INREG
+                                            : ISD::ZERO_EXTEND_VECTOR_INREG,
+                       DL, VT, In);
 
   // For 256-bit vectors, we only need the lower (128-bit) input half.
   // For 512-bit vectors, we only need the lower input half or quarter.
@@ -17459,7 +17460,7 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
   MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
                                 VT.getVectorNumElements() / 2);
 
-  SDValue OpLo = DAG.getZeroExtendVectorInReg(In, dl, HalfVT);
+  SDValue OpLo = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, dl, HalfVT, In);
 
   SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
   SDValue Undef = DAG.getUNDEF(InVT);
@@ -19884,7 +19885,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
   MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
                                 VT.getVectorNumElements() / 2);
 
-  SDValue OpLo = DAG.getSignExtendVectorInReg(In, dl, HalfVT);
+  SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
 
   unsigned NumElems = InVT.getVectorNumElements();
   SmallVector<int,8> ShufMask(NumElems, -1);
@@ -19892,7 +19893,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
     ShufMask[i] = i + NumElems/2;
 
   SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
-  OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT);
+  OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
 
   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
 }
@@ -20138,7 +20139,8 @@ static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
     assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
            "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
 
-    SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
+    SDValue Shuff = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, RegVT,
+                                SlicedVec);
     return DAG.getMergeValues({Shuff, TF}, dl);
   }
 
@@ -20823,7 +20825,8 @@ static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
     MVT AmtTy = ShAmt.getSimpleValueType() == MVT::i8 ? MVT::v16i8 : MVT::v8i16;
     ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), AmtTy, ShAmt);
     if (Subtarget.hasSSE41())
-      ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
+      ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
+                          MVT::v2i64, ShAmt);
     else {
       SDValue ByteShift = DAG.getConstant(
           (128 - AmtTy.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
@@ -20836,7 +20839,8 @@ static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
   } else if (Subtarget.hasSSE41() &&
              ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
     ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
-    ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
+    ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
+                        MVT::v2i64, ShAmt);
   } else {
     SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT), DAG.getUNDEF(SVT),
                         DAG.getUNDEF(SVT)};
@@ -38349,9 +38353,9 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
       (VT.is256BitVector() && Subtarget.hasAVX()) ||
       (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
     SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
-    return Opcode == ISD::SIGN_EXTEND
-               ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
-               : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
+    Opcode = Opcode == ISD::SIGN_EXTEND ? ISD::SIGN_EXTEND_VECTOR_INREG
+                                        : ISD::ZERO_EXTEND_VECTOR_INREG;
+    return DAG.getNode(Opcode, DL, VT, ExOp);
   }
 
   auto SplitAndExtendInReg = [&](unsigned SplitSize) {
@@ -38360,14 +38364,15 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
     EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
     EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
 
+    unsigned IROpc = Opcode == ISD::SIGN_EXTEND ? ISD::SIGN_EXTEND_VECTOR_INREG
+                                                : ISD::ZERO_EXTEND_VECTOR_INREG;
+
     SmallVector<SDValue, 8> Opnds;
     for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
       SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
                                    DAG.getIntPtrConstant(Offset, DL));
       SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
-      SrcVec = Opcode == ISD::SIGN_EXTEND
-                   ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
-                   : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
+      SrcVec = DAG.getNode(IROpc, DL, SubVT, SrcVec);
       Opnds.push_back(SrcVec);
     }
     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
diff --git a/unittests/CodeGen/AArch64SelectionDAGTest.cpp b/unittests/CodeGen/AArch64SelectionDAGTest.cpp
index 620dfc8d234..dc2d1f9a357 100644
--- a/unittests/CodeGen/AArch64SelectionDAGTest.cpp
+++ b/unittests/CodeGen/AArch64SelectionDAGTest.cpp
@@ -86,7 +86,7 @@ TEST_F(AArch64SelectionDAGTest, computeKnownBits_ZERO_EXTEND_VECTOR_INREG) {
   auto InVecVT = EVT::getVectorVT(Context, Int8VT, 4);
   auto OutVecVT = EVT::getVectorVT(Context, Int16VT, 2);
   auto InVec = DAG->getConstant(0, Loc, InVecVT);
-  auto Op = DAG->getZeroExtendVectorInReg(InVec, Loc, OutVecVT);
+  auto Op = DAG->getNode(ISD::ZERO_EXTEND_VECTOR_INREG, Loc, OutVecVT, InVec);
   auto DemandedElts = APInt(4, 15);
   KnownBits Known;
   DAG->computeKnownBits(Op, Known, DemandedElts);
@@ -118,7 +118,7 @@ TEST_F(AArch64SelectionDAGTest, ComputeNumSignBits_SIGN_EXTEND_VECTOR_INREG) {
   auto InVecVT = EVT::getVectorVT(Context, Int8VT, 4);
   auto OutVecVT = EVT::getVectorVT(Context, Int16VT, 2);
   auto InVec = DAG->getConstant(1, Loc, InVecVT);
-  auto Op = DAG->getSignExtendVectorInReg(InVec, Loc, OutVecVT);
+  auto Op = DAG->getNode(ISD::SIGN_EXTEND_VECTOR_INREG, Loc, OutVecVT, InVec);
   auto DemandedElts = APInt(4, 15);
   EXPECT_EQ(DAG->ComputeNumSignBits(Op, DemandedElts), 15u);
 }
-- 
GitLab


From 88f96230a8449abcfd8d6e403071ae01374e2b66 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Sun, 4 Nov 2018 06:56:32 +0000
Subject: [PATCH 0942/1116] [DAGCombiner] Remove 'else' after return. NFC

This makes this code consistent with the nearly identical code in visitZERO_EXTEND.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346090 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 83e9f2c23ca..1eac79f28f2 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -9087,17 +9087,16 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
         return DAG.getSetCC(SDLoc(N), VT, N0.getOperand(0),
                              N0.getOperand(1),
                              cast<CondCodeSDNode>(N0.getOperand(2))->get());
+
       // If the desired elements are smaller or larger than the source
       // elements we can use a matching integer vector type and then
       // truncate/any extend
-      else {
-        EVT MatchingVectorType = N00VT.changeVectorElementTypeToInteger();
-        SDValue VsetCC =
-          DAG.getSetCC(SDLoc(N), MatchingVectorType, N0.getOperand(0),
-                        N0.getOperand(1),
-                        cast<CondCodeSDNode>(N0.getOperand(2))->get());
-        return DAG.getAnyExtOrTrunc(VsetCC, SDLoc(N), VT);
-      }
+      EVT MatchingVectorType = N00VT.changeVectorElementTypeToInteger();
+      SDValue VsetCC =
+        DAG.getSetCC(SDLoc(N), MatchingVectorType, N0.getOperand(0),
+                      N0.getOperand(1),
+                      cast<CondCodeSDNode>(N0.getOperand(2))->get());
+      return DAG.getAnyExtOrTrunc(VsetCC, SDLoc(N), VT);
     }
 
     // aext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc
-- 
GitLab


From fd7c7ddb55634c12a62ee5805213d51ef625c920 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Sun, 4 Nov 2018 14:28:48 +0000
Subject: [PATCH 0943/1116] [ValueTracking] determine sign of 0.0 from select
 when matching min/max FP

In PR39475:
https://bugs.llvm.org/show_bug.cgi?id=39475
..we may fail to recognize/simplify fabs() in some cases because we do not
canonicalize fcmp with a -0.0 operand.

Adding that canonicalization can cause regressions on min/max FP tests, so
that's this patch: for the purpose of determining whether something is min/max,
let the value returned by the select determine how we treat a 0.0 operand in the fcmp.

This patch doesn't actually change the -0.0 to +0.0. It just changes the analysis, so
we don't fail to recognize equivalent min/max patterns that only differ in the
signbit of 0.0.

Differential Revision: https://reviews.llvm.org/D54001


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346097 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Analysis/ValueTracking.cpp           | 21 ++++++
 test/Transforms/InstCombine/minmax-fp.ll | 38 +++++------
 unittests/Analysis/ValueTrackingTest.cpp | 84 +++++++++++++++---------
 3 files changed, 89 insertions(+), 54 deletions(-)

diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp
index 6e08272c64e..ed17441d1e4 100644
--- a/lib/Analysis/ValueTracking.cpp
+++ b/lib/Analysis/ValueTracking.cpp
@@ -4760,6 +4760,27 @@ static SelectPatternResult matchSelectPattern(CmpInst::Predicate Pred,
                                               Value *TrueVal, Value *FalseVal,
                                               Value *&LHS, Value *&RHS,
                                               unsigned Depth) {
+  if (CmpInst::isFPPredicate(Pred)) {
+    // IEEE-754 ignores the sign of 0.0 in comparisons. So if the select has one
+    // 0.0 operand, set the compare's 0.0 operands to that same value for the
+    // purpose of identifying min/max. Disregard vector constants with undefined
+    // elements because those can not be back-propagated for analysis.
+    Value *OutputZeroVal = nullptr;
+    if (match(TrueVal, m_AnyZeroFP()) && !match(FalseVal, m_AnyZeroFP()) &&
+        !cast<Constant>(TrueVal)->containsUndefElement())
+      OutputZeroVal = TrueVal;
+    else if (match(FalseVal, m_AnyZeroFP()) && !match(TrueVal, m_AnyZeroFP()) &&
+             !cast<Constant>(FalseVal)->containsUndefElement())
+      OutputZeroVal = FalseVal;
+
+    if (OutputZeroVal) {
+      if (match(CmpLHS, m_AnyZeroFP()))
+        CmpLHS = OutputZeroVal;
+      if (match(CmpRHS, m_AnyZeroFP()))
+        CmpRHS = OutputZeroVal;
+    }
+  }
+
   LHS = CmpLHS;
   RHS = CmpRHS;
 
diff --git a/test/Transforms/InstCombine/minmax-fp.ll b/test/Transforms/InstCombine/minmax-fp.ll
index 292e50eb1f9..7bf8f57d4e8 100644
--- a/test/Transforms/InstCombine/minmax-fp.ll
+++ b/test/Transforms/InstCombine/minmax-fp.ll
@@ -57,16 +57,15 @@ define double @t5(float %a) {
   ret double %3
 }
 
-; TODO:
-; From IEEE754: "Comparisons shall ignore the sign of zero (so +0 = −0)."
+; From IEEE754: "Comparisons shall ignore the sign of zero (so +0 = -0)."
 ; So the compare constant may be treated as +0.0, and we sink the fpext.
 
 define double @t6(float %a) {
 ; CHECK-LABEL: @t6(
-; CHECK-NEXT:    [[TMP1:%.*]] = fcmp ult float [[A:%.*]], -0.000000e+00
-; CHECK-NEXT:    [[TMP2:%.*]] = fpext float [[A]] to double
-; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[TMP1]], double [[TMP2]], double 0.000000e+00
-; CHECK-NEXT:    ret double [[TMP3]]
+; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp oge float [[A:%.*]], 0.000000e+00
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[DOTINV]], float 0.000000e+00, float [[A]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fpext float [[TMP1]] to double
+; CHECK-NEXT:    ret double [[TMP2]]
 ;
   %1 = fcmp ult float %a, -0.0
   %2 = fpext float %a to double
@@ -74,16 +73,15 @@ define double @t6(float %a) {
   ret double %3
 }
 
-; TODO:
-; From IEEE754: "Comparisons shall ignore the sign of zero (so +0 = −0)."
+; From IEEE754: "Comparisons shall ignore the sign of zero (so +0 = -0)."
 ; So the compare constant may be treated as -0.0, and we sink the fpext.
 
 define double @t7(float %a) {
 ; CHECK-LABEL: @t7(
-; CHECK-NEXT:    [[TMP1:%.*]] = fcmp ult float [[A:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[TMP2:%.*]] = fpext float [[A]] to double
-; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[TMP1]], double [[TMP2]], double -0.000000e+00
-; CHECK-NEXT:    ret double [[TMP3]]
+; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp oge float [[A:%.*]], -0.000000e+00
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[DOTINV]], float -0.000000e+00, float [[A]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fpext float [[TMP1]] to double
+; CHECK-NEXT:    ret double [[TMP2]]
 ;
   %1 = fcmp ult float %a, 0.0
   %2 = fpext float %a to double
@@ -91,15 +89,12 @@ define double @t7(float %a) {
   ret double %3
 }
 
-; TODO:
 ; min(min(x, 0.0), 0.0) --> min(x, 0.0)
 
 define float @fmin_fmin_zero_mismatch(float %x) {
 ; CHECK-LABEL: @fmin_fmin_zero_mismatch(
-; CHECK-NEXT:    [[CMP1:%.*]] = fcmp olt float [[X:%.*]], -0.000000e+00
-; CHECK-NEXT:    [[MIN1:%.*]] = select i1 [[CMP1]], float [[X]], float 0.000000e+00
-; CHECK-NEXT:    [[CMP2:%.*]] = fcmp olt float [[MIN1]], 0.000000e+00
-; CHECK-NEXT:    [[MIN2:%.*]] = select i1 [[CMP2]], float [[MIN1]], float 0.000000e+00
+; CHECK-NEXT:    [[TMP1:%.*]] = fcmp olt float [[X:%.*]], 0.000000e+00
+; CHECK-NEXT:    [[MIN2:%.*]] = select i1 [[TMP1]], float [[X]], float 0.000000e+00
 ; CHECK-NEXT:    ret float [[MIN2]]
 ;
   %cmp1 = fcmp olt float %x, -0.0
@@ -109,16 +104,13 @@ define float @fmin_fmin_zero_mismatch(float %x) {
   ret float %min2
 }
 
-; TODO:
 ; max(max(x, -0.0), -0.0) --> max(x, -0.0)
 
 define float @fmax_fmax_zero_mismatch(float %x) {
 ; CHECK-LABEL: @fmax_fmax_zero_mismatch(
-; CHECK-NEXT:    [[CMP1:%.*]] = fcmp ogt float [[X:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[MAX1:%.*]] = select i1 [[CMP1]], float [[X]], float -0.000000e+00
-; CHECK-NEXT:    [[CMP2:%.*]] = fcmp olt float [[MAX1]], 0.000000e+00
-; CHECK-NEXT:    [[MAX2:%.*]] = select i1 [[CMP2]], float -0.000000e+00, float [[MAX1]]
-; CHECK-NEXT:    ret float [[MAX2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fcmp ogt float [[X:%.*]], -0.000000e+00
+; CHECK-NEXT:    [[MAX11:%.*]] = select i1 [[TMP1]], float [[X]], float -0.000000e+00
+; CHECK-NEXT:    ret float [[MAX11]]
 ;
   %cmp1 = fcmp ogt float %x, 0.0
   %max1 = select i1 %cmp1, float %x, float -0.0
diff --git a/unittests/Analysis/ValueTrackingTest.cpp b/unittests/Analysis/ValueTrackingTest.cpp
index b13948adad7..c4adde4abe3 100644
--- a/unittests/Analysis/ValueTrackingTest.cpp
+++ b/unittests/Analysis/ValueTrackingTest.cpp
@@ -156,8 +156,8 @@ TEST_F(MatchSelectPatternTest, FMinMismatchConstantZero1) {
       "  %A = select i1 %1, float 0.0, float %a\n"
       "  ret float %A\n"
       "}\n");
-  // FIXME: The sign of zero doesn't matter in fcmp.
-  expectPattern({SPF_UNKNOWN, SPNB_NA, false});
+  // The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_FMINNUM, SPNB_RETURNS_NAN, true});
 }
 
 TEST_F(MatchSelectPatternTest, FMinMismatchConstantZero2) {
@@ -167,8 +167,8 @@ TEST_F(MatchSelectPatternTest, FMinMismatchConstantZero2) {
       "  %A = select i1 %1, float 0.0, float %a\n"
       "  ret float %A\n"
       "}\n");
-  // FIXME: The sign of zero doesn't matter in fcmp.
-  expectPattern({SPF_UNKNOWN, SPNB_NA, false});
+  // The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_FMINNUM, SPNB_RETURNS_NAN, false});
 }
 
 TEST_F(MatchSelectPatternTest, FMinMismatchConstantZero3) {
@@ -178,8 +178,8 @@ TEST_F(MatchSelectPatternTest, FMinMismatchConstantZero3) {
       "  %A = select i1 %1, float -0.0, float %a\n"
       "  ret float %A\n"
       "}\n");
-  // FIXME: The sign of zero doesn't matter in fcmp.
-  expectPattern({SPF_UNKNOWN, SPNB_NA, false});
+  // The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_FMINNUM, SPNB_RETURNS_NAN, true});
 }
 
 TEST_F(MatchSelectPatternTest, FMinMismatchConstantZero4) {
@@ -189,8 +189,8 @@ TEST_F(MatchSelectPatternTest, FMinMismatchConstantZero4) {
       "  %A = select i1 %1, float -0.0, float %a\n"
       "  ret float %A\n"
       "}\n");
-  // FIXME: The sign of zero doesn't matter in fcmp.
-  expectPattern({SPF_UNKNOWN, SPNB_NA, false});
+  // The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_FMINNUM, SPNB_RETURNS_NAN, false});
 }
 
 TEST_F(MatchSelectPatternTest, FMinMismatchConstantZero5) {
@@ -200,8 +200,8 @@ TEST_F(MatchSelectPatternTest, FMinMismatchConstantZero5) {
       "  %A = select i1 %1, float %a, float 0.0\n"
       "  ret float %A\n"
       "}\n");
-  // FIXME: The sign of zero doesn't matter in fcmp.
-  expectPattern({SPF_UNKNOWN, SPNB_NA, false});
+  // The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_FMINNUM, SPNB_RETURNS_OTHER, false});
 }
 
 TEST_F(MatchSelectPatternTest, FMinMismatchConstantZero6) {
@@ -211,8 +211,8 @@ TEST_F(MatchSelectPatternTest, FMinMismatchConstantZero6) {
       "  %A = select i1 %1, float %a, float 0.0\n"
       "  ret float %A\n"
       "}\n");
-  // FIXME: The sign of zero doesn't matter in fcmp.
-  expectPattern({SPF_UNKNOWN, SPNB_NA, false});
+  // The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_FMINNUM, SPNB_RETURNS_OTHER, true});
 }
 
 TEST_F(MatchSelectPatternTest, FMinMismatchConstantZero7) {
@@ -222,8 +222,8 @@ TEST_F(MatchSelectPatternTest, FMinMismatchConstantZero7) {
       "  %A = select i1 %1, float %a, float -0.0\n"
       "  ret float %A\n"
       "}\n");
-  // FIXME: The sign of zero doesn't matter in fcmp.
-  expectPattern({SPF_UNKNOWN, SPNB_NA, false});
+  // The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_FMINNUM, SPNB_RETURNS_OTHER, false});
 }
 
 TEST_F(MatchSelectPatternTest, FMinMismatchConstantZero8) {
@@ -233,8 +233,8 @@ TEST_F(MatchSelectPatternTest, FMinMismatchConstantZero8) {
       "  %A = select i1 %1, float %a, float -0.0\n"
       "  ret float %A\n"
       "}\n");
-  // FIXME: The sign of zero doesn't matter in fcmp.
-  expectPattern({SPF_UNKNOWN, SPNB_NA, false});
+  // The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_FMINNUM, SPNB_RETURNS_OTHER, true});
 }
 
 TEST_F(MatchSelectPatternTest, FMaxMismatchConstantZero1) {
@@ -244,8 +244,8 @@ TEST_F(MatchSelectPatternTest, FMaxMismatchConstantZero1) {
       "  %A = select i1 %1, float 0.0, float %a\n"
       "  ret float %A\n"
       "}\n");
-  // FIXME: The sign of zero doesn't matter in fcmp.
-  expectPattern({SPF_UNKNOWN, SPNB_NA, false});
+  // The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_FMAXNUM, SPNB_RETURNS_NAN, true});
 }
 
 TEST_F(MatchSelectPatternTest, FMaxMismatchConstantZero2) {
@@ -255,8 +255,8 @@ TEST_F(MatchSelectPatternTest, FMaxMismatchConstantZero2) {
       "  %A = select i1 %1, float 0.0, float %a\n"
       "  ret float %A\n"
       "}\n");
-  // FIXME: The sign of zero doesn't matter in fcmp.
-  expectPattern({SPF_UNKNOWN, SPNB_NA, false});
+  // The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_FMAXNUM, SPNB_RETURNS_NAN, false});
 }
 
 TEST_F(MatchSelectPatternTest, FMaxMismatchConstantZero3) {
@@ -266,8 +266,8 @@ TEST_F(MatchSelectPatternTest, FMaxMismatchConstantZero3) {
       "  %A = select i1 %1, float -0.0, float %a\n"
       "  ret float %A\n"
       "}\n");
-  // FIXME: The sign of zero doesn't matter in fcmp.
-  expectPattern({SPF_UNKNOWN, SPNB_NA, false});
+  // The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_FMAXNUM, SPNB_RETURNS_NAN, true});
 }
 
 TEST_F(MatchSelectPatternTest, FMaxMismatchConstantZero4) {
@@ -277,8 +277,8 @@ TEST_F(MatchSelectPatternTest, FMaxMismatchConstantZero4) {
       "  %A = select i1 %1, float -0.0, float %a\n"
       "  ret float %A\n"
       "}\n");
-  // FIXME: The sign of zero doesn't matter in fcmp.
-  expectPattern({SPF_UNKNOWN, SPNB_NA, false});
+  // The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_FMAXNUM, SPNB_RETURNS_NAN, false});
 }
 
 TEST_F(MatchSelectPatternTest, FMaxMismatchConstantZero5) {
@@ -288,8 +288,8 @@ TEST_F(MatchSelectPatternTest, FMaxMismatchConstantZero5) {
       "  %A = select i1 %1, float %a, float 0.0\n"
       "  ret float %A\n"
       "}\n");
-  // FIXME: The sign of zero doesn't matter in fcmp.
-  expectPattern({SPF_UNKNOWN, SPNB_NA, false});
+  // The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_FMAXNUM, SPNB_RETURNS_OTHER, false});
 }
 
 TEST_F(MatchSelectPatternTest, FMaxMismatchConstantZero6) {
@@ -299,8 +299,8 @@ TEST_F(MatchSelectPatternTest, FMaxMismatchConstantZero6) {
       "  %A = select i1 %1, float %a, float 0.0\n"
       "  ret float %A\n"
       "}\n");
-  // FIXME: The sign of zero doesn't matter in fcmp.
-  expectPattern({SPF_UNKNOWN, SPNB_NA, false});
+  // The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_FMAXNUM, SPNB_RETURNS_OTHER, true});
 }
 
 TEST_F(MatchSelectPatternTest, FMaxMismatchConstantZero7) {
@@ -310,8 +310,8 @@ TEST_F(MatchSelectPatternTest, FMaxMismatchConstantZero7) {
       "  %A = select i1 %1, float %a, float -0.0\n"
       "  ret float %A\n"
       "}\n");
-  // FIXME: The sign of zero doesn't matter in fcmp.
-  expectPattern({SPF_UNKNOWN, SPNB_NA, false});
+  // The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_FMAXNUM, SPNB_RETURNS_OTHER, false});
 }
 
 TEST_F(MatchSelectPatternTest, FMaxMismatchConstantZero8) {
@@ -321,7 +321,29 @@ TEST_F(MatchSelectPatternTest, FMaxMismatchConstantZero8) {
       "  %A = select i1 %1, float %a, float -0.0\n"
       "  ret float %A\n"
       "}\n");
-  // FIXME: The sign of zero doesn't matter in fcmp.
+  // The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_FMAXNUM, SPNB_RETURNS_OTHER, true});
+}
+
+TEST_F(MatchSelectPatternTest, FMinMismatchConstantZeroVecUndef) {
+  parseAssembly(
+      "define <2 x float> @test(<2 x float> %a) {\n"
+      "  %1 = fcmp ogt <2 x float> %a, <float -0.0, float -0.0>\n"
+      "  %A = select <2 x i1> %1, <2 x float> <float undef, float 0.0>, <2 x float> %a\n"
+      "  ret <2 x float> %A\n"
+      "}\n");
+  // An undef in a vector constant can not be back-propagated for this analysis.
+  expectPattern({SPF_UNKNOWN, SPNB_NA, false});
+}
+
+TEST_F(MatchSelectPatternTest, FMaxMismatchConstantZeroVecUndef) {
+  parseAssembly(
+      "define <2 x float> @test(<2 x float> %a) {\n"
+      "  %1 = fcmp ogt <2 x float> %a, zeroinitializer\n"
+      "  %A = select <2 x i1> %1, <2 x float> %a, <2 x float> <float -0.0, float undef>\n"
+      "  ret <2 x float> %A\n"
+      "}\n");
+  // An undef in a vector constant can not be back-propagated for this analysis.
   expectPattern({SPF_UNKNOWN, SPNB_NA, false});
 }
 
-- 
GitLab


From 0547a961bb297d0aff7b83fe876aa95fac925370 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Sun, 4 Nov 2018 17:31:27 +0000
Subject: [PATCH 0944/1116] [X86] Add vector shift by immediate to
 SimplifyDemandedBitsForTargetNode.

Summary: This also enables some constant folding from KnownBits propagation. This helps on some cases vXi64 case in 32-bit mode where constant vectors appear as vXi32 and a bitcast. This can prevent getNode from constant folding sra/shl/srl.

Reviewers: RKSimon, spatel

Reviewed By: spatel

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D54069

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346102 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp        |  42 ++++++++
 test/CodeGen/X86/combine-srl.ll           |  12 +--
 test/CodeGen/X86/combine-udiv.ll          |  42 +++-----
 test/CodeGen/X86/known-signbits-vector.ll |  61 ++++-------
 test/CodeGen/X86/pr35918.ll               |  12 +--
 test/CodeGen/X86/vector-shift-ashr-128.ll |  12 +--
 test/CodeGen/X86/vector-shift-ashr-256.ll |  29 +++--
 test/CodeGen/X86/vector-trunc-usat.ll     | 122 +++++++++++-----------
 8 files changed, 165 insertions(+), 167 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 905b99590a6..891f4a4cbdf 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -31817,6 +31817,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
 bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
     SDValue Op, const APInt &OriginalDemandedBits, KnownBits &Known,
     TargetLoweringOpt &TLO, unsigned Depth) const {
+  unsigned BitWidth = OriginalDemandedBits.getBitWidth();
   unsigned Opc = Op.getOpcode();
   switch(Opc) {
   case X86ISD::PMULDQ:
@@ -31833,6 +31834,42 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
       return true;
     break;
   }
+  case X86ISD::VSHLI: {
+    if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+      if (ShiftImm->getAPIntValue().uge(BitWidth))
+        break;
+
+      KnownBits KnownOp;
+      unsigned ShAmt = ShiftImm->getZExtValue();
+      APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
+      if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask, KnownOp, TLO,
+                               Depth + 1))
+        return true;
+    }
+    break;
+  }
+  case X86ISD::VSRAI:
+  case X86ISD::VSRLI: {
+    if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+      if (ShiftImm->getAPIntValue().uge(BitWidth))
+        break;
+
+      KnownBits KnownOp;
+      unsigned ShAmt = ShiftImm->getZExtValue();
+      APInt DemandedMask = OriginalDemandedBits << ShAmt;
+
+      // If any of the demanded bits are produced by the sign extension, we also
+      // demand the input sign bit.
+      if (Opc == X86ISD::VSRAI &&
+          OriginalDemandedBits.countLeadingZeros() < ShAmt)
+        DemandedMask.setSignBit();
+
+      if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask, KnownOp, TLO,
+                               Depth + 1))
+        return true;
+    }
+    break;
+  }
   }
 
   return TargetLowering::SimplifyDemandedBitsForTargetNode(
@@ -34861,6 +34898,11 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
     return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
   }
 
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (TLI.SimplifyDemandedBits(SDValue(N, 0),
+                               APInt::getAllOnesValue(NumBitsPerElt), DCI))
+    return SDValue(N, 0);
+
   return SDValue();
 }
 
diff --git a/test/CodeGen/X86/combine-srl.ll b/test/CodeGen/X86/combine-srl.ll
index e0692166171..80dcb29209b 100644
--- a/test/CodeGen/X86/combine-srl.ll
+++ b/test/CodeGen/X86/combine-srl.ll
@@ -63,17 +63,7 @@ define <4 x i32> @combine_vec_lshr_known_zero0(<4 x i32> %x) {
 define <4 x i32> @combine_vec_lshr_known_zero1(<4 x i32> %x) {
 ; SSE-LABEL: combine_vec_lshr_known_zero1:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrld $11, %xmm1
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    psrld $9, %xmm2
-; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrld $10, %xmm1
-; SSE-NEXT:    psrld $8, %xmm0
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE-NEXT:    xorps %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_lshr_known_zero1:
diff --git a/test/CodeGen/X86/combine-udiv.ll b/test/CodeGen/X86/combine-udiv.ll
index 34f64e52437..346e5447627 100644
--- a/test/CodeGen/X86/combine-udiv.ll
+++ b/test/CodeGen/X86/combine-udiv.ll
@@ -669,20 +669,15 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; SSE41-NEXT:    pmullw %xmm0, %xmm2
 ; SSE41-NEXT:    psrlw $8, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
-; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT:    pmullw %xmm3, %xmm0
-; SSE41-NEXT:    psrlw $8, %xmm0
-; SSE41-NEXT:    movdqa %xmm2, %xmm3
-; SSE41-NEXT:    packuswb %xmm0, %xmm3
-; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; SSE41-NEXT:    psllw $1, %xmm3
-; SSE41-NEXT:    psllw $8, %xmm2
-; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    psllw $8, %xmm0
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    packuswb %xmm3, %xmm2
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; SSE41-NEXT:    psllw $1, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
 ; SSE41-NEXT:    psrlw $8, %xmm2
-; SSE41-NEXT:    packuswb %xmm0, %xmm2
+; SSE41-NEXT:    packuswb %xmm3, %xmm2
 ; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
@@ -693,21 +688,16 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
 ; AVX1-NEXT:    movl $171, %eax
 ; AVX1-NEXT:    vmovd %eax, %xmm1
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vpmullw %xmm1, %xmm2, %xmm2
-; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpmullw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT:    vpsllw $8, %xmm1, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT:    vpmullw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpsllw $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
 ; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm3
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; AVX1-NEXT:    vpsllw $1, %xmm3, %xmm3
-; AVX1-NEXT:    vpsllw $8, %xmm2, %xmm2
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7]
-; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX1-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    retq
diff --git a/test/CodeGen/X86/known-signbits-vector.ll b/test/CodeGen/X86/known-signbits-vector.ll
index 169342a3da5..64bca733068 100644
--- a/test/CodeGen/X86/known-signbits-vector.ll
+++ b/test/CodeGen/X86/known-signbits-vector.ll
@@ -91,17 +91,14 @@ define float @signbits_ashr_extract_sitofp_1(<2 x i64> %a0) nounwind {
 ; X32-LABEL: signbits_ashr_extract_sitofp_1:
 ; X32:       # %bb.0:
 ; X32-NEXT:    pushl %eax
-; X32-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,2147483648,0,2147483648]
-; X32-NEXT:    vpsrlq $63, %xmm1, %xmm2
-; X32-NEXT:    vpsrlq $32, %xmm1, %xmm1
-; X32-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
-; X32-NEXT:    vpsrlq $63, %xmm0, %xmm2
+; X32-NEXT:    vpsrlq $63, %xmm0, %xmm1
 ; X32-NEXT:    vpsrlq $32, %xmm0, %xmm0
-; X32-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; X32-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; X32-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,32768,0,0,1,0,0,0]
 ; X32-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X32-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
 ; X32-NEXT:    vmovd %xmm0, %eax
-; X32-NEXT:    vcvtsi2ssl %eax, %xmm3, %xmm0
+; X32-NEXT:    vcvtsi2ssl %eax, %xmm2, %xmm0
 ; X32-NEXT:    vmovss %xmm0, (%esp)
 ; X32-NEXT:    flds (%esp)
 ; X32-NEXT:    popl %eax
@@ -128,18 +125,15 @@ define float @signbits_ashr_shl_extract_sitofp(<2 x i64> %a0) nounwind {
 ; X32-LABEL: signbits_ashr_shl_extract_sitofp:
 ; X32:       # %bb.0:
 ; X32-NEXT:    pushl %eax
-; X32-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,2147483648,0,2147483648]
-; X32-NEXT:    vpsrlq $60, %xmm1, %xmm2
-; X32-NEXT:    vpsrlq $61, %xmm1, %xmm1
-; X32-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
-; X32-NEXT:    vpsrlq $60, %xmm0, %xmm2
+; X32-NEXT:    vpsrlq $60, %xmm0, %xmm1
 ; X32-NEXT:    vpsrlq $61, %xmm0, %xmm0
-; X32-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; X32-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; X32-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,0,0,0,8,0,0,0]
 ; X32-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X32-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
 ; X32-NEXT:    vpsllq $20, %xmm0, %xmm0
 ; X32-NEXT:    vmovd %xmm0, %eax
-; X32-NEXT:    vcvtsi2ssl %eax, %xmm3, %xmm0
+; X32-NEXT:    vcvtsi2ssl %eax, %xmm2, %xmm0
 ; X32-NEXT:    vmovss %xmm0, (%esp)
 ; X32-NEXT:    flds (%esp)
 ; X32-NEXT:    popl %eax
@@ -263,13 +257,10 @@ define float @signbits_ashr_sext_sextinreg_and_extract_sitofp(<2 x i64> %a0, <2
 ; X32:       # %bb.0:
 ; X32-NEXT:    pushl %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
-; X32-NEXT:    vpsrlq $60, %xmm2, %xmm3
-; X32-NEXT:    vpsrlq $61, %xmm2, %xmm2
-; X32-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
-; X32-NEXT:    vpsrlq $60, %xmm0, %xmm3
+; X32-NEXT:    vpsrlq $60, %xmm0, %xmm2
 ; X32-NEXT:    vpsrlq $61, %xmm0, %xmm0
-; X32-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; X32-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; X32-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,0,0,0,8,0,0,0]
 ; X32-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; X32-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
 ; X32-NEXT:    vpinsrd $0, %eax, %xmm1, %xmm1
@@ -281,7 +272,7 @@ define float @signbits_ashr_sext_sextinreg_and_extract_sitofp(<2 x i64> %a0, <2
 ; X32-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
 ; X32-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; X32-NEXT:    vmovd %xmm0, %eax
-; X32-NEXT:    vcvtsi2ssl %eax, %xmm4, %xmm0
+; X32-NEXT:    vcvtsi2ssl %eax, %xmm3, %xmm0
 ; X32-NEXT:    vmovss %xmm0, (%esp)
 ; X32-NEXT:    flds (%esp)
 ; X32-NEXT:    popl %eax
@@ -320,13 +311,10 @@ define float @signbits_ashr_sextvecinreg_bitops_extract_sitofp(<2 x i64> %a0, <4
 ; X32-LABEL: signbits_ashr_sextvecinreg_bitops_extract_sitofp:
 ; X32:       # %bb.0:
 ; X32-NEXT:    pushl %eax
-; X32-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
-; X32-NEXT:    vpsrlq $60, %xmm2, %xmm3
-; X32-NEXT:    vpsrlq $61, %xmm2, %xmm2
-; X32-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
-; X32-NEXT:    vpsrlq $60, %xmm0, %xmm3
+; X32-NEXT:    vpsrlq $60, %xmm0, %xmm2
 ; X32-NEXT:    vpsrlq $61, %xmm0, %xmm0
-; X32-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; X32-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; X32-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,0,0,0,8,0,0,0]
 ; X32-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; X32-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
 ; X32-NEXT:    vpmovsxdq %xmm1, %xmm1
@@ -334,7 +322,7 @@ define float @signbits_ashr_sextvecinreg_bitops_extract_sitofp(<2 x i64> %a0, <4
 ; X32-NEXT:    vpor %xmm1, %xmm2, %xmm1
 ; X32-NEXT:    vpxor %xmm0, %xmm1, %xmm0
 ; X32-NEXT:    vmovd %xmm0, %eax
-; X32-NEXT:    vcvtsi2ssl %eax, %xmm4, %xmm0
+; X32-NEXT:    vcvtsi2ssl %eax, %xmm3, %xmm0
 ; X32-NEXT:    vmovss %xmm0, (%esp)
 ; X32-NEXT:    flds (%esp)
 ; X32-NEXT:    popl %eax
@@ -375,22 +363,19 @@ define <4 x float> @signbits_ashr_sext_select_shuffle_sitofp(<4 x i64> %a0, <4 x
 ; X32-NEXT:    subl $16, %esp
 ; X32-NEXT:    vpmovsxdq 16(%ebp), %xmm3
 ; X32-NEXT:    vpmovsxdq 8(%ebp), %xmm4
-; X32-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,2147483648,0,2147483648]
+; X32-NEXT:    vextractf128 $1, %ymm2, %xmm5
 ; X32-NEXT:    vpsrlq $63, %xmm5, %xmm6
 ; X32-NEXT:    vpsrlq $33, %xmm5, %xmm5
 ; X32-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7]
-; X32-NEXT:    vextractf128 $1, %ymm2, %xmm6
-; X32-NEXT:    vpsrlq $63, %xmm6, %xmm7
-; X32-NEXT:    vpsrlq $33, %xmm6, %xmm6
-; X32-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4,5,6,7]
-; X32-NEXT:    vpxor %xmm5, %xmm6, %xmm6
-; X32-NEXT:    vpsubq %xmm5, %xmm6, %xmm6
+; X32-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,16384,0,0,1,0,0,0]
+; X32-NEXT:    vpxor %xmm6, %xmm5, %xmm5
+; X32-NEXT:    vpsubq %xmm6, %xmm5, %xmm5
 ; X32-NEXT:    vpsrlq $63, %xmm2, %xmm7
 ; X32-NEXT:    vpsrlq $33, %xmm2, %xmm2
 ; X32-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm7[4,5,6,7]
-; X32-NEXT:    vpxor %xmm5, %xmm2, %xmm2
-; X32-NEXT:    vpsubq %xmm5, %xmm2, %xmm2
-; X32-NEXT:    vinsertf128 $1, %xmm6, %ymm2, %ymm2
+; X32-NEXT:    vpxor %xmm6, %xmm2, %xmm2
+; X32-NEXT:    vpsubq %xmm6, %xmm2, %xmm2
+; X32-NEXT:    vinsertf128 $1, %xmm5, %ymm2, %ymm2
 ; X32-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
 ; X32-NEXT:    vextractf128 $1, %ymm1, %xmm4
 ; X32-NEXT:    vextractf128 $1, %ymm0, %xmm5
diff --git a/test/CodeGen/X86/pr35918.ll b/test/CodeGen/X86/pr35918.ll
index f53bb86ee48..5c84bd946fd 100644
--- a/test/CodeGen/X86/pr35918.ll
+++ b/test/CodeGen/X86/pr35918.ll
@@ -11,9 +11,9 @@ define void @fetch_r16g16_snorm_unorm8(<4 x i8>*, i8*, i32, i32, { [2048 x i32],
 ; X86-SKYLAKE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SKYLAKE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SKYLAKE-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SKYLAKE-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; X86-SKYLAKE-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X86-SKYLAKE-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
 ; X86-SKYLAKE-NEXT:    vpsrad $16, %xmm0, %xmm0
+; X86-SKYLAKE-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X86-SKYLAKE-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
 ; X86-SKYLAKE-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
 ; X86-SKYLAKE-NEXT:    vpsrld $7, %xmm0, %xmm0
@@ -29,7 +29,7 @@ define void @fetch_r16g16_snorm_unorm8(<4 x i8>*, i8*, i32, i32, { [2048 x i32],
 ; X86-SKX-NEXT:    subl $8, %esp
 ; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SKX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SKX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0,1],zero,zero,xmm0[2,3],zero,zero,xmm0[u,u],zero,zero,xmm0[u,u]
+; X86-SKX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,0,1,u,u,2,3,u,u,u,u,u,u,u,u]
 ; X86-SKX-NEXT:    vpsrad $16, %xmm0, %xmm0
 ; X86-SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X86-SKX-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
@@ -50,9 +50,9 @@ define void @fetch_r16g16_snorm_unorm8(<4 x i8>*, i8*, i32, i32, { [2048 x i32],
 ; X64-SKYLAKE-LABEL: fetch_r16g16_snorm_unorm8:
 ; X64-SKYLAKE:       # %bb.0: # %entry
 ; X64-SKYLAKE-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-SKYLAKE-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; X64-SKYLAKE-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X64-SKYLAKE-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
 ; X64-SKYLAKE-NEXT:    vpsrad $16, %xmm0, %xmm0
+; X64-SKYLAKE-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X64-SKYLAKE-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
 ; X64-SKYLAKE-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
 ; X64-SKYLAKE-NEXT:    vpsrld $7, %xmm0, %xmm0
@@ -65,7 +65,7 @@ define void @fetch_r16g16_snorm_unorm8(<4 x i8>*, i8*, i32, i32, { [2048 x i32],
 ; X64-SKX-LABEL: fetch_r16g16_snorm_unorm8:
 ; X64-SKX:       # %bb.0: # %entry
 ; X64-SKX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-SKX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0,1],zero,zero,xmm0[2,3],zero,zero,xmm0[u,u],zero,zero,xmm0[u,u]
+; X64-SKX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,0,1,u,u,2,3,u,u,u,u,u,u,u,u]
 ; X64-SKX-NEXT:    vpsrad $16, %xmm0, %xmm0
 ; X64-SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X64-SKX-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/vector-shift-ashr-128.ll b/test/CodeGen/X86/vector-shift-ashr-128.ll
index c944902d0a3..584a54e68e8 100644
--- a/test/CodeGen/X86/vector-shift-ashr-128.ll
+++ b/test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -990,15 +990,11 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
 ;
 ; X32-SSE-LABEL: constant_shift_v2i64:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,2147483648,0,2147483648]
-; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
-; X32-SSE-NEXT:    psrlq $1, %xmm2
-; X32-SSE-NEXT:    psrlq $7, %xmm1
-; X32-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
-; X32-SSE-NEXT:    psrlq $1, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psrlq $1, %xmm1
 ; X32-SSE-NEXT:    psrlq $7, %xmm0
-; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-SSE-NEXT:    movapd {{.*#+}} xmm1 = [2.0E+0,7.2911220195563975E-304]
 ; X32-SSE-NEXT:    xorpd %xmm1, %xmm0
 ; X32-SSE-NEXT:    psubq %xmm1, %xmm0
 ; X32-SSE-NEXT:    retl
diff --git a/test/CodeGen/X86/vector-shift-ashr-256.ll b/test/CodeGen/X86/vector-shift-ashr-256.ll
index 7f120166a5d..6d79996164f 100644
--- a/test/CodeGen/X86/vector-shift-ashr-256.ll
+++ b/test/CodeGen/X86/vector-shift-ashr-256.ll
@@ -1066,25 +1066,20 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
 ;
 ; X32-AVX1-LABEL: constant_shift_v4i64:
 ; X32-AVX1:       # %bb.0:
-; X32-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,2147483648,0,2147483648]
+; X32-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X32-AVX1-NEXT:    vpsrlq $62, %xmm1, %xmm2
-; X32-AVX1-NEXT:    vpsrlq $31, %xmm1, %xmm3
-; X32-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; X32-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; X32-AVX1-NEXT:    vpsrlq $62, %xmm3, %xmm4
-; X32-AVX1-NEXT:    vpsrlq $31, %xmm3, %xmm3
-; X32-AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
-; X32-AVX1-NEXT:    vpxor %xmm2, %xmm3, %xmm3
-; X32-AVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
-; X32-AVX1-NEXT:    vpsrlq $7, %xmm1, %xmm3
-; X32-AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm1
-; X32-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
-; X32-AVX1-NEXT:    vpsrlq $7, %xmm0, %xmm3
+; X32-AVX1-NEXT:    vpsrlq $31, %xmm1, %xmm1
+; X32-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; X32-AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,1,0,2,0,0,0]
+; X32-AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm1
+; X32-AVX1-NEXT:    vpsubq %xmm2, %xmm1, %xmm1
+; X32-AVX1-NEXT:    vpsrlq $7, %xmm0, %xmm2
 ; X32-AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
-; X32-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; X32-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; X32-AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
-; X32-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X32-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; X32-AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,0,16384,0,0,0,256]
+; X32-AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; X32-AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
+; X32-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; X32-AVX1-NEXT:    retl
 ;
 ; X32-AVX2-LABEL: constant_shift_v4i64:
diff --git a/test/CodeGen/X86/vector-trunc-usat.ll b/test/CodeGen/X86/vector-trunc-usat.ll
index 5b00ab58495..0c3766ac978 100644
--- a/test/CodeGen/X86/vector-trunc-usat.ll
+++ b/test/CodeGen/X86/vector-trunc-usat.ll
@@ -716,26 +716,26 @@ define <8 x i16> @trunc_usat_v8i64_v8i16(<8 x i64> %a0) {
 define <8 x i16> @trunc_usat_v8i32_v8i16(<8 x i32> %a0) {
 ; SSE2-LABEL: trunc_usat_v8i32_v8i16:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NEXT:    pxor %xmm3, %xmm4
-; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
-; SSE2-NEXT:    movdqa %xmm5, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm6
-; SSE2-NEXT:    pand %xmm6, %xmm0
-; SSE2-NEXT:    pandn %xmm2, %xmm6
-; SSE2-NEXT:    por %xmm6, %xmm0
-; SSE2-NEXT:    pxor %xmm1, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183]
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
 ; SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
-; SSE2-NEXT:    pand %xmm5, %xmm1
-; SSE2-NEXT:    pandn %xmm2, %xmm5
-; SSE2-NEXT:    por %xmm1, %xmm5
-; SSE2-NEXT:    pslld $16, %xmm5
-; SSE2-NEXT:    psrad $16, %xmm5
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
+; SSE2-NEXT:    pand %xmm5, %xmm0
+; SSE2-NEXT:    pxor %xmm3, %xmm5
+; SSE2-NEXT:    por %xmm5, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT:    pxor %xmm4, %xmm3
+; SSE2-NEXT:    pand %xmm1, %xmm4
+; SSE2-NEXT:    por %xmm3, %xmm4
+; SSE2-NEXT:    pslld $16, %xmm4
+; SSE2-NEXT:    psrad $16, %xmm4
 ; SSE2-NEXT:    pslld $16, %xmm0
 ; SSE2-NEXT:    psrad $16, %xmm0
-; SSE2-NEXT:    packssdw %xmm5, %xmm0
+; SSE2-NEXT:    packssdw %xmm4, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: trunc_usat_v8i32_v8i16:
@@ -826,36 +826,36 @@ define <8 x i16> @trunc_usat_v8i32_v8i16(<8 x i32> %a0) {
 define <16 x i16> @trunc_usat_v16i32_v16i16(<16 x i32> %a0) {
 ; SSE2-LABEL: trunc_usat_v16i32_v16i16:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm1, %xmm4
-; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    movdqa %xmm2, %xmm6
-; SSE2-NEXT:    pxor %xmm7, %xmm6
+; SSE2-NEXT:    movdqa %xmm1, %xmm8
+; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm2, %xmm7
+; SSE2-NEXT:    pxor %xmm6, %xmm7
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
 ; SSE2-NEXT:    movdqa %xmm5, %xmm1
-; SSE2-NEXT:    pcmpgtd %xmm6, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm7, %xmm1
+; SSE2-NEXT:    pcmpeqd %xmm7, %xmm7
 ; SSE2-NEXT:    pand %xmm1, %xmm2
-; SSE2-NEXT:    pandn %xmm8, %xmm1
+; SSE2-NEXT:    pxor %xmm7, %xmm1
 ; SSE2-NEXT:    por %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm3, %xmm6
-; SSE2-NEXT:    pxor %xmm7, %xmm6
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pxor %xmm6, %xmm4
 ; SSE2-NEXT:    movdqa %xmm5, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm6, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm2
 ; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pandn %xmm8, %xmm2
+; SSE2-NEXT:    pxor %xmm7, %xmm2
 ; SSE2-NEXT:    por %xmm3, %xmm2
 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    pxor %xmm7, %xmm3
-; SSE2-NEXT:    movdqa %xmm5, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm6
-; SSE2-NEXT:    pand %xmm6, %xmm0
-; SSE2-NEXT:    pandn %xmm8, %xmm6
-; SSE2-NEXT:    por %xmm6, %xmm0
-; SSE2-NEXT:    pxor %xmm4, %xmm7
-; SSE2-NEXT:    pcmpgtd %xmm7, %xmm5
-; SSE2-NEXT:    pand %xmm5, %xmm4
-; SSE2-NEXT:    pandn %xmm8, %xmm5
-; SSE2-NEXT:    por %xmm4, %xmm5
+; SSE2-NEXT:    pxor %xmm6, %xmm3
+; SSE2-NEXT:    movdqa %xmm5, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    pxor %xmm7, %xmm4
+; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm5
+; SSE2-NEXT:    pxor %xmm5, %xmm7
+; SSE2-NEXT:    pand %xmm8, %xmm5
+; SSE2-NEXT:    por %xmm7, %xmm5
 ; SSE2-NEXT:    pslld $16, %xmm5
 ; SSE2-NEXT:    psrad $16, %xmm5
 ; SSE2-NEXT:    pslld $16, %xmm0
@@ -870,36 +870,36 @@ define <16 x i16> @trunc_usat_v16i32_v16i16(<16 x i32> %a0) {
 ;
 ; SSSE3-LABEL: trunc_usat_v16i32_v16i16:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa %xmm1, %xmm4
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535]
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT:    movdqa %xmm2, %xmm6
-; SSSE3-NEXT:    pxor %xmm7, %xmm6
+; SSSE3-NEXT:    movdqa %xmm1, %xmm8
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm7
+; SSSE3-NEXT:    pxor %xmm6, %xmm7
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
 ; SSSE3-NEXT:    movdqa %xmm5, %xmm1
-; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm7, %xmm1
+; SSSE3-NEXT:    pcmpeqd %xmm7, %xmm7
 ; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pandn %xmm8, %xmm1
+; SSSE3-NEXT:    pxor %xmm7, %xmm1
 ; SSSE3-NEXT:    por %xmm2, %xmm1
-; SSSE3-NEXT:    movdqa %xmm3, %xmm6
-; SSSE3-NEXT:    pxor %xmm7, %xmm6
+; SSSE3-NEXT:    movdqa %xmm3, %xmm4
+; SSSE3-NEXT:    pxor %xmm6, %xmm4
 ; SSSE3-NEXT:    movdqa %xmm5, %xmm2
-; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm2
 ; SSSE3-NEXT:    pand %xmm2, %xmm3
-; SSSE3-NEXT:    pandn %xmm8, %xmm2
+; SSSE3-NEXT:    pxor %xmm7, %xmm2
 ; SSSE3-NEXT:    por %xmm3, %xmm2
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSSE3-NEXT:    pxor %xmm7, %xmm3
-; SSSE3-NEXT:    movdqa %xmm5, %xmm6
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm6
-; SSSE3-NEXT:    pand %xmm6, %xmm0
-; SSSE3-NEXT:    pandn %xmm8, %xmm6
-; SSSE3-NEXT:    por %xmm6, %xmm0
-; SSSE3-NEXT:    pxor %xmm4, %xmm7
-; SSSE3-NEXT:    pcmpgtd %xmm7, %xmm5
-; SSSE3-NEXT:    pand %xmm5, %xmm4
-; SSSE3-NEXT:    pandn %xmm8, %xmm5
-; SSSE3-NEXT:    por %xmm4, %xmm5
+; SSSE3-NEXT:    pxor %xmm6, %xmm3
+; SSSE3-NEXT:    movdqa %xmm5, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSSE3-NEXT:    pand %xmm4, %xmm0
+; SSSE3-NEXT:    pxor %xmm7, %xmm4
+; SSSE3-NEXT:    por %xmm4, %xmm0
+; SSSE3-NEXT:    pxor %xmm8, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm5
+; SSSE3-NEXT:    pxor %xmm5, %xmm7
+; SSSE3-NEXT:    pand %xmm8, %xmm5
+; SSSE3-NEXT:    por %xmm7, %xmm5
 ; SSSE3-NEXT:    pslld $16, %xmm5
 ; SSSE3-NEXT:    psrad $16, %xmm5
 ; SSSE3-NEXT:    pslld $16, %xmm0
-- 
GitLab


From 42f23f528396bae1af925a5e18145771c438b123 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Sun, 4 Nov 2018 21:37:43 +0000
Subject: [PATCH 0945/1116] [X86] Regenerate test checks to merge 32 and 64
 bit. Remove stale check prefixes. NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346105 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../X86/avx2-intrinsics-x86-upgrade.ll        | 784 +++++-------------
 1 file changed, 229 insertions(+), 555 deletions(-)

diff --git a/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
index d19c58eed73..81e10a5d242 100644
--- a/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
@@ -5,15 +5,10 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=X64 --check-prefix=X64-AVX512
 
 define <16 x i16> @test_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-LABEL: test_x86_avx2_pblendw:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pblendw:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pblendw:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i32 7) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -21,15 +16,10 @@ declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i32) nounwind
 
 
 define <4 x i32> @test_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) {
-; X86-LABEL: test_x86_avx2_pblendd_128:
-; X86:       ## %bb.0:
-; X86-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pblendd_128:
-; X64:       ## %bb.0:
-; X64-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pblendd_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i32 7) ; <<4 x i32>> [#uses=1]
   ret <4 x i32> %res
 }
@@ -37,15 +27,10 @@ declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i32) nounwind
 
 
 define <8 x i32> @test_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-LABEL: test_x86_avx2_pblendd_256:
-; X86:       ## %bb.0:
-; X86-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pblendd_256:
-; X64:       ## %bb.0:
-; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pblendd_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i32 7) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -70,15 +55,10 @@ declare <4 x i64> @llvm.x86.avx2.movntdqa(i8*) nounwind readonly
 
 
 define <16 x i16> @test_x86_avx2_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) {
-; X86-LABEL: test_x86_avx2_mpsadbw:
-; X86:       ## %bb.0:
-; X86-NEXT:    vmpsadbw $7, %ymm1, %ymm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_mpsadbw:
-; X64:       ## %bb.0:
-; X64-NEXT:    vmpsadbw $7, %ymm1, %ymm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_mpsadbw:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vmpsadbw $7, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i32 7) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -86,15 +66,10 @@ declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i32) nounwind re
 
 
 define <4 x i64> @test_x86_avx2_psll_dq_bs(<4 x i64> %a0) {
-; X86-LABEL: test_x86_avx2_psll_dq_bs:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8],zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24]
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_psll_dq_bs:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8],zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24]
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_psll_dq_bs:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8],zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24]
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
   ret <4 x i64> %res
 }
@@ -102,15 +77,10 @@ declare <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64>, i32) nounwind readnone
 
 
 define <4 x i64> @test_x86_avx2_psrl_dq_bs(<4 x i64> %a0) {
-; X86-LABEL: test_x86_avx2_psrl_dq_bs:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,ymm0[23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_psrl_dq_bs:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,ymm0[23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_psrl_dq_bs:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,ymm0[23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
   ret <4 x i64> %res
 }
@@ -118,15 +88,10 @@ declare <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64>, i32) nounwind readnone
 
 
 define <4 x i64> @test_x86_avx2_psll_dq(<4 x i64> %a0) {
-; X86-LABEL: test_x86_avx2_psll_dq:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpslldq {{.*#+}} ymm0 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_psll_dq:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpslldq {{.*#+}} ymm0 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_psll_dq:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpslldq {{.*#+}} ymm0 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64> %a0, i32 8) ; <<4 x i64>> [#uses=1]
   ret <4 x i64> %res
 }
@@ -134,15 +99,10 @@ declare <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64>, i32) nounwind readnone
 
 
 define <4 x i64> @test_x86_avx2_psrl_dq(<4 x i64> %a0) {
-; X86-LABEL: test_x86_avx2_psrl_dq:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_psrl_dq:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_psrl_dq:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64> %a0, i32 8) ; <<4 x i64>> [#uses=1]
   ret <4 x i64> %res
 }
@@ -150,17 +110,11 @@ declare <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64>, i32) nounwind readnone
 
 
 define <2 x i64> @test_x86_avx2_vextracti128(<4 x i64> %a0) {
-; X86-LABEL: test_x86_avx2_vextracti128:
-; X86:       ## %bb.0:
-; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; X86-NEXT:    vzeroupper
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_vextracti128:
-; X64:       ## %bb.0:
-; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; X64-NEXT:    vzeroupper
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_vextracti128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <2 x i64> @llvm.x86.avx2.vextracti128(<4 x i64> %a0, i8 7)
   ret <2 x i64> %res
 }
@@ -168,15 +122,10 @@ declare <2 x i64> @llvm.x86.avx2.vextracti128(<4 x i64>, i8) nounwind readnone
 
 
 define <4 x i64> @test_x86_avx2_vinserti128(<4 x i64> %a0, <2 x i64> %a1) {
-; X86-LABEL: test_x86_avx2_vinserti128:
-; X86:       ## %bb.0:
-; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_vinserti128:
-; X64:       ## %bb.0:
-; X64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_vinserti128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <4 x i64> @llvm.x86.avx2.vinserti128(<4 x i64> %a0, <2 x i64> %a1, i8 7)
   ret <4 x i64> %res
 }
@@ -184,15 +133,10 @@ declare <4 x i64> @llvm.x86.avx2.vinserti128(<4 x i64>, <2 x i64>, i8) nounwind
 
 
 define <4 x double> @test_x86_avx2_vbroadcast_sd_pd_256(<2 x double> %a0) {
-; X86-LABEL: test_x86_avx2_vbroadcast_sd_pd_256:
-; X86:       ## %bb.0:
-; X86-NEXT:    vbroadcastsd %xmm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_vbroadcast_sd_pd_256:
-; X64:       ## %bb.0:
-; X64-NEXT:    vbroadcastsd %xmm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_vbroadcast_sd_pd_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <4 x double> @llvm.x86.avx2.vbroadcast.sd.pd.256(<2 x double> %a0)
   ret <4 x double> %res
 }
@@ -200,15 +144,10 @@ declare <4 x double> @llvm.x86.avx2.vbroadcast.sd.pd.256(<2 x double>) nounwind
 
 
 define <4 x float> @test_x86_avx2_vbroadcast_ss_ps(<4 x float> %a0) {
-; X86-LABEL: test_x86_avx2_vbroadcast_ss_ps:
-; X86:       ## %bb.0:
-; X86-NEXT:    vbroadcastss %xmm0, %xmm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_vbroadcast_ss_ps:
-; X64:       ## %bb.0:
-; X64-NEXT:    vbroadcastss %xmm0, %xmm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_vbroadcast_ss_ps:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <4 x float> @llvm.x86.avx2.vbroadcast.ss.ps(<4 x float> %a0)
   ret <4 x float> %res
 }
@@ -216,15 +155,10 @@ declare <4 x float> @llvm.x86.avx2.vbroadcast.ss.ps(<4 x float>) nounwind readon
 
 
 define <8 x float> @test_x86_avx2_vbroadcast_ss_ps_256(<4 x float> %a0) {
-; X86-LABEL: test_x86_avx2_vbroadcast_ss_ps_256:
-; X86:       ## %bb.0:
-; X86-NEXT:    vbroadcastss %xmm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_vbroadcast_ss_ps_256:
-; X64:       ## %bb.0:
-; X64-NEXT:    vbroadcastss %xmm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_vbroadcast_ss_ps_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <8 x float> @llvm.x86.avx2.vbroadcast.ss.ps.256(<4 x float> %a0)
   ret <8 x float> %res
 }
@@ -232,15 +166,10 @@ declare <8 x float> @llvm.x86.avx2.vbroadcast.ss.ps.256(<4 x float>) nounwind re
 
 
 define <16 x i8> @test_x86_avx2_pbroadcastb_128(<16 x i8> %a0) {
-; X86-LABEL: test_x86_avx2_pbroadcastb_128:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpbroadcastb %xmm0, %xmm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pbroadcastb_128:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpbroadcastb %xmm0, %xmm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pbroadcastb_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpbroadcastb %xmm0, %xmm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <16 x i8> @llvm.x86.avx2.pbroadcastb.128(<16 x i8> %a0)
   ret <16 x i8> %res
 }
@@ -248,15 +177,10 @@ declare <16 x i8> @llvm.x86.avx2.pbroadcastb.128(<16 x i8>) nounwind readonly
 
 
 define <32 x i8> @test_x86_avx2_pbroadcastb_256(<16 x i8> %a0) {
-; X86-LABEL: test_x86_avx2_pbroadcastb_256:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpbroadcastb %xmm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pbroadcastb_256:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpbroadcastb %xmm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pbroadcastb_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpbroadcastb %xmm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <32 x i8> @llvm.x86.avx2.pbroadcastb.256(<16 x i8> %a0)
   ret <32 x i8> %res
 }
@@ -264,15 +188,10 @@ declare <32 x i8> @llvm.x86.avx2.pbroadcastb.256(<16 x i8>) nounwind readonly
 
 
 define <8 x i16> @test_x86_avx2_pbroadcastw_128(<8 x i16> %a0) {
-; X86-LABEL: test_x86_avx2_pbroadcastw_128:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pbroadcastw_128:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pbroadcastw_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <8 x i16> @llvm.x86.avx2.pbroadcastw.128(<8 x i16> %a0)
   ret <8 x i16> %res
 }
@@ -280,15 +199,10 @@ declare <8 x i16> @llvm.x86.avx2.pbroadcastw.128(<8 x i16>) nounwind readonly
 
 
 define <16 x i16> @test_x86_avx2_pbroadcastw_256(<8 x i16> %a0) {
-; X86-LABEL: test_x86_avx2_pbroadcastw_256:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpbroadcastw %xmm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pbroadcastw_256:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpbroadcastw %xmm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pbroadcastw_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <16 x i16> @llvm.x86.avx2.pbroadcastw.256(<8 x i16> %a0)
   ret <16 x i16> %res
 }
@@ -296,15 +210,10 @@ declare <16 x i16> @llvm.x86.avx2.pbroadcastw.256(<8 x i16>) nounwind readonly
 
 
 define <4 x i32> @test_x86_avx2_pbroadcastd_128(<4 x i32> %a0) {
-; X86-LABEL: test_x86_avx2_pbroadcastd_128:
-; X86:       ## %bb.0:
-; X86-NEXT:    vbroadcastss %xmm0, %xmm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pbroadcastd_128:
-; X64:       ## %bb.0:
-; X64-NEXT:    vbroadcastss %xmm0, %xmm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pbroadcastd_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <4 x i32> @llvm.x86.avx2.pbroadcastd.128(<4 x i32> %a0)
   ret <4 x i32> %res
 }
@@ -312,15 +221,10 @@ declare <4 x i32> @llvm.x86.avx2.pbroadcastd.128(<4 x i32>) nounwind readonly
 
 
 define <8 x i32> @test_x86_avx2_pbroadcastd_256(<4 x i32> %a0) {
-; X86-LABEL: test_x86_avx2_pbroadcastd_256:
-; X86:       ## %bb.0:
-; X86-NEXT:    vbroadcastss %xmm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pbroadcastd_256:
-; X64:       ## %bb.0:
-; X64-NEXT:    vbroadcastss %xmm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pbroadcastd_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <8 x i32> @llvm.x86.avx2.pbroadcastd.256(<4 x i32> %a0)
   ret <8 x i32> %res
 }
@@ -328,15 +232,10 @@ declare <8 x i32> @llvm.x86.avx2.pbroadcastd.256(<4 x i32>) nounwind readonly
 
 
 define <2 x i64> @test_x86_avx2_pbroadcastq_128(<2 x i64> %a0) {
-; X86-LABEL: test_x86_avx2_pbroadcastq_128:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpbroadcastq %xmm0, %xmm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pbroadcastq_128:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpbroadcastq %xmm0, %xmm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pbroadcastq_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpbroadcastq %xmm0, %xmm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <2 x i64> @llvm.x86.avx2.pbroadcastq.128(<2 x i64> %a0)
   ret <2 x i64> %res
 }
@@ -344,15 +243,10 @@ declare <2 x i64> @llvm.x86.avx2.pbroadcastq.128(<2 x i64>) nounwind readonly
 
 
 define <4 x i64> @test_x86_avx2_pbroadcastq_256(<2 x i64> %a0) {
-; X86-LABEL: test_x86_avx2_pbroadcastq_256:
-; X86:       ## %bb.0:
-; X86-NEXT:    vbroadcastsd %xmm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pbroadcastq_256:
-; X64:       ## %bb.0:
-; X64-NEXT:    vbroadcastsd %xmm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pbroadcastq_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <4 x i64> @llvm.x86.avx2.pbroadcastq.256(<2 x i64> %a0)
   ret <4 x i64> %res
 }
@@ -360,15 +254,10 @@ declare <4 x i64> @llvm.x86.avx2.pbroadcastq.256(<2 x i64>) nounwind readonly
 
 
 define <8 x i32> @test_x86_avx2_pmovsxbd(<16 x i8> %a0) {
-; X86-LABEL: test_x86_avx2_pmovsxbd:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpmovsxbd %xmm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pmovsxbd:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpmovsxbd %xmm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pmovsxbd:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpmovsxbd %xmm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8> %a0) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -376,15 +265,10 @@ declare <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8>) nounwind readnone
 
 
 define <4 x i64> @test_x86_avx2_pmovsxbq(<16 x i8> %a0) {
-; X86-LABEL: test_x86_avx2_pmovsxbq:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpmovsxbq %xmm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pmovsxbq:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpmovsxbq %xmm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pmovsxbq:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpmovsxbq %xmm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8> %a0) ; <<4 x i64>> [#uses=1]
   ret <4 x i64> %res
 }
@@ -392,15 +276,10 @@ declare <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8>) nounwind readnone
 
 
 define <16 x i16> @test_x86_avx2_pmovsxbw(<16 x i8> %a0) {
-; X86-LABEL: test_x86_avx2_pmovsxbw:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpmovsxbw %xmm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pmovsxbw:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpmovsxbw %xmm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pmovsxbw:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpmovsxbw %xmm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -408,15 +287,10 @@ declare <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8>) nounwind readnone
 
 
 define <4 x i64> @test_x86_avx2_pmovsxdq(<4 x i32> %a0) {
-; X86-LABEL: test_x86_avx2_pmovsxdq:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpmovsxdq %xmm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pmovsxdq:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpmovsxdq %xmm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pmovsxdq:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpmovsxdq %xmm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32> %a0) ; <<4 x i64>> [#uses=1]
   ret <4 x i64> %res
 }
@@ -424,15 +298,10 @@ declare <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32>) nounwind readnone
 
 
 define <8 x i32> @test_x86_avx2_pmovsxwd(<8 x i16> %a0) {
-; X86-LABEL: test_x86_avx2_pmovsxwd:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpmovsxwd %xmm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pmovsxwd:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpmovsxwd %xmm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pmovsxwd:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpmovsxwd %xmm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16> %a0) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -440,15 +309,10 @@ declare <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16>) nounwind readnone
 
 
 define <4 x i64> @test_x86_avx2_pmovsxwq(<8 x i16> %a0) {
-; X86-LABEL: test_x86_avx2_pmovsxwq:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpmovsxwq %xmm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pmovsxwq:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpmovsxwq %xmm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pmovsxwq:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpmovsxwq %xmm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16> %a0) ; <<4 x i64>> [#uses=1]
   ret <4 x i64> %res
 }
@@ -456,15 +320,10 @@ declare <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16>) nounwind readnone
 
 
 define <8 x i32> @test_x86_avx2_pmovzxbd(<16 x i8> %a0) {
-; X86-LABEL: test_x86_avx2_pmovzxbd:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pmovzxbd:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pmovzxbd:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8> %a0) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -472,15 +331,10 @@ declare <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8>) nounwind readnone
 
 
 define <4 x i64> @test_x86_avx2_pmovzxbq(<16 x i8> %a0) {
-; X86-LABEL: test_x86_avx2_pmovzxbq:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pmovzxbq:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pmovzxbq:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8> %a0) ; <<4 x i64>> [#uses=1]
   ret <4 x i64> %res
 }
@@ -488,15 +342,10 @@ declare <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8>) nounwind readnone
 
 
 define <16 x i16> @test_x86_avx2_pmovzxbw(<16 x i8> %a0) {
-; X86-LABEL: test_x86_avx2_pmovzxbw:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pmovzxbw:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pmovzxbw:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8> %a0) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -504,15 +353,10 @@ declare <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8>) nounwind readnone
 
 
 define <4 x i64> @test_x86_avx2_pmovzxdq(<4 x i32> %a0) {
-; X86-LABEL: test_x86_avx2_pmovzxdq:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pmovzxdq:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pmovzxdq:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32> %a0) ; <<4 x i64>> [#uses=1]
   ret <4 x i64> %res
 }
@@ -520,15 +364,10 @@ declare <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32>) nounwind readnone
 
 
 define <8 x i32> @test_x86_avx2_pmovzxwd(<8 x i16> %a0) {
-; X86-LABEL: test_x86_avx2_pmovzxwd:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pmovzxwd:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pmovzxwd:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16> %a0) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -536,15 +375,10 @@ declare <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16>) nounwind readnone
 
 
 define <4 x i64> @test_x86_avx2_pmovzxwq(<8 x i16> %a0) {
-; X86-LABEL: test_x86_avx2_pmovzxwq:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pmovzxwq:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pmovzxwq:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16> %a0) ; <<4 x i64>> [#uses=1]
   ret <4 x i64> %res
 }
@@ -576,240 +410,160 @@ define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) {
 declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind
 
 define <32 x i8> @mm256_max_epi8(<32 x i8> %a0, <32 x i8> %a1) {
-; X86-LABEL: mm256_max_epi8:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: mm256_max_epi8:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: mm256_max_epi8:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8> %a0, <32 x i8> %a1)
   ret <32 x i8> %res
 }
 declare <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8>, <32 x i8>) nounwind readnone
 
 define <16 x i16> @mm256_max_epi16(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-LABEL: mm256_max_epi16:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: mm256_max_epi16:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: mm256_max_epi16:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16> %a0, <16 x i16> %a1)
   ret <16 x i16> %res
 }
 declare <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16>, <16 x i16>) nounwind readnone
 
 define <8 x i32> @mm256_max_epi32(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-LABEL: mm256_max_epi32:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: mm256_max_epi32:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: mm256_max_epi32:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32> %a0, <8 x i32> %a1)
   ret <8 x i32> %res
 }
 declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readnone
 
 define <32 x i8> @mm256_max_epu8(<32 x i8> %a0, <32 x i8> %a1) {
-; X86-LABEL: mm256_max_epu8:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: mm256_max_epu8:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: mm256_max_epu8:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8> %a0, <32 x i8> %a1)
   ret <32 x i8> %res
 }
 declare <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8>, <32 x i8>) nounwind readnone
 
 define <16 x i16> @mm256_max_epu16(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-LABEL: mm256_max_epu16:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: mm256_max_epu16:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: mm256_max_epu16:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16> %a0, <16 x i16> %a1)
   ret <16 x i16> %res
 }
 declare <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16>, <16 x i16>) nounwind readnone
 
 define <8 x i32> @mm256_max_epu32(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-LABEL: mm256_max_epu32:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: mm256_max_epu32:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: mm256_max_epu32:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32> %a0, <8 x i32> %a1)
   ret <8 x i32> %res
 }
 declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readnone
 
 define <32 x i8> @mm256_min_epi8(<32 x i8> %a0, <32 x i8> %a1) {
-; X86-LABEL: mm256_min_epi8:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: mm256_min_epi8:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: mm256_min_epi8:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8> %a0, <32 x i8> %a1)
   ret <32 x i8> %res
 }
 declare <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8>, <32 x i8>) nounwind readnone
 
 define <16 x i16> @mm256_min_epi16(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-LABEL: mm256_min_epi16:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: mm256_min_epi16:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: mm256_min_epi16:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16> %a0, <16 x i16> %a1)
   ret <16 x i16> %res
 }
 declare <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16>, <16 x i16>) nounwind readnone
 
 define <8 x i32> @mm256_min_epi32(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-LABEL: mm256_min_epi32:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: mm256_min_epi32:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: mm256_min_epi32:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32> %a0, <8 x i32> %a1)
   ret <8 x i32> %res
 }
 declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readnone
 
 define <32 x i8> @mm256_min_epu8(<32 x i8> %a0, <32 x i8> %a1) {
-; X86-LABEL: mm256_min_epu8:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpminub %ymm1, %ymm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: mm256_min_epu8:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpminub %ymm1, %ymm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: mm256_min_epu8:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8> %a0, <32 x i8> %a1)
   ret <32 x i8> %res
 }
 declare <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8>, <32 x i8>) nounwind readnone
 
 define <16 x i16> @mm256_min_epu16(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-LABEL: mm256_min_epu16:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: mm256_min_epu16:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: mm256_min_epu16:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16> %a0, <16 x i16> %a1)
   ret <16 x i16> %res
 }
 declare <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16>, <16 x i16>) nounwind readnone
 
 define <8 x i32> @mm256_min_epu32(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-LABEL: mm256_min_epu32:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpminud %ymm1, %ymm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: mm256_min_epu32:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpminud %ymm1, %ymm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: mm256_min_epu32:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32> %a0, <8 x i32> %a1)
   ret <8 x i32> %res
 }
 declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readnone
 
 define <32 x i8> @mm256_avg_epu8(<32 x i8> %a0, <32 x i8> %a1) {
-; X86-LABEL: mm256_avg_epu8:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: mm256_avg_epu8:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: mm256_avg_epu8:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
   ret <32 x i8> %res
 }
 declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone
 
 define <16 x i16> @mm256_avg_epu16(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-LABEL: mm256_avg_epu16:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: mm256_avg_epu16:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: mm256_avg_epu16:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
 declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone
 
 define <32 x i8> @test_x86_avx2_pabs_b(<32 x i8> %a0) {
-; X86-LABEL: test_x86_avx2_pabs_b:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpabsb %ymm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pabs_b:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpabsb %ymm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pabs_b:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpabsb %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8> %a0) ; <<32 x i8>> [#uses=1]
   ret <32 x i8> %res
 }
 declare <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8>) nounwind readnone
 
 define <8 x i32> @test_x86_avx2_pabs_d(<8 x i32> %a0) {
-; X86-LABEL: test_x86_avx2_pabs_d:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpabsd %ymm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pabs_d:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpabsd %ymm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pabs_d:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpabsd %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32> %a0) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -817,15 +571,10 @@ declare <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32>) nounwind readnone
 
 
 define <16 x i16> @test_x86_avx2_pabs_w(<16 x i16> %a0) {
-; X86-LABEL: test_x86_avx2_pabs_w:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpabsw %ymm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pabs_w:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpabsw %ymm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pabs_w:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpabsw %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16> %a0) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -833,15 +582,10 @@ declare <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16>) nounwind readnone
 
 
 define <4 x i64> @test_x86_avx2_vperm2i128(<4 x i64> %a0, <4 x i64> %a1) {
-; X86-LABEL: test_x86_avx2_vperm2i128:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_vperm2i128:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_vperm2i128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %a0, <4 x i64> %a1, i8 1) ; <<4 x i64>> [#uses=1]
   ret <4 x i64> %res
 }
@@ -849,15 +593,10 @@ declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind r
 
 
 define <4 x i64> @test_x86_avx2_pmulu_dq(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-LABEL: test_x86_avx2_pmulu_dq:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pmulu_dq:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pmulu_dq:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> %a0, <8 x i32> %a1) ; <<4 x i64>> [#uses=1]
   ret <4 x i64> %res
 }
@@ -865,15 +604,10 @@ declare <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32>, <8 x i32>) nounwind readnon
 
 
 define <4 x i64> @test_x86_avx2_pmul_dq(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-LABEL: test_x86_avx2_pmul_dq:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pmul_dq:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pmul_dq:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> %a0, <8 x i32> %a1) ; <<4 x i64>> [#uses=1]
   ret <4 x i64> %res
 }
@@ -881,25 +615,10 @@ declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone
 
 
 define <32 x i8> @test_x86_avx2_paddus_b(<32 x i8> %a0, <32 x i8> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_paddus_b:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdc,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_paddus_b:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_paddus_b:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdc,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_paddus_b:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_paddus_b:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
   ret <32 x i8> %res
 }
@@ -907,25 +626,10 @@ declare <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8>, <32 x i8>) nounwind readnon
 
 
 define <16 x i16> @test_x86_avx2_paddus_w(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_paddus_w:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdd,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_paddus_w:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_paddus_w:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdd,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_paddus_w:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_paddus_w:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -933,25 +637,10 @@ declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>) nounwind read
 
 
 define <32 x i8> @test_x86_avx2_psubus_b(<32 x i8> %a0, <32 x i8> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psubus_b:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd8,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psubus_b:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psubus_b:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd8,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psubus_b:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_psubus_b:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
   ret <32 x i8> %res
 }
@@ -959,25 +648,10 @@ declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind readnon
 
 
 define <16 x i16> @test_x86_avx2_psubus_w(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psubus_w:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd9,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psubus_w:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psubus_w:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd9,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psubus_w:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_psubus_w:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
-- 
GitLab


From ad684cff05250ba330955b7eae9a8e7f965f5abf Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Sun, 4 Nov 2018 21:37:45 +0000
Subject: [PATCH 0946/1116] [X86] Add nounwind to some tests to remove cfi
 directives from checks. NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346106 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/shrink_vmul.ll | 150 +++-----------------------------
 1 file changed, 14 insertions(+), 136 deletions(-)

diff --git a/test/CodeGen/X86/shrink_vmul.ll b/test/CodeGen/X86/shrink_vmul.ll
index 3c9fa66d047..f190a417419 100644
--- a/test/CodeGen/X86/shrink_vmul.ll
+++ b/test/CodeGen/X86/shrink_vmul.ll
@@ -14,12 +14,10 @@
 ; %op2 = zext<2 x i32> %val2
 ; %rst = mul <2 x i32> %op1, %op2
 ;
-define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
 ; X86-SSE-LABEL: mul_2xi8:
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE-NEXT:    .cfi_offset %esi, -8
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -35,14 +33,11 @@ define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
 ; X86-SSE-NEXT:    movq %xmm1, (%esi,%ecx,4)
 ; X86-SSE-NEXT:    popl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: mul_2xi8:
 ; X86-AVX:       # %bb.0: # %entry
 ; X86-AVX-NEXT:    pushl %esi
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX-NEXT:    .cfi_offset %esi, -8
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -53,7 +48,6 @@ define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
 ; X86-AVX-NEXT:    popl %esi
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 4
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: mul_2xi8:
@@ -103,12 +97,10 @@ entry:
 ; %op2 = zext<4 x i32> %val2
 ; %rst = mul <4 x i32> %op1, %op2
 ;
-define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
 ; X86-SSE-LABEL: mul_4xi8:
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE-NEXT:    .cfi_offset %esi, -8
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -123,14 +115,11 @@ define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-SSE-NEXT:    pmaddwd %xmm0, %xmm2
 ; X86-SSE-NEXT:    movdqu %xmm2, (%esi,%ecx,4)
 ; X86-SSE-NEXT:    popl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: mul_4xi8:
 ; X86-AVX:       # %bb.0: # %entry
 ; X86-AVX-NEXT:    pushl %esi
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX-NEXT:    .cfi_offset %esi, -8
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -140,7 +129,6 @@ define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-AVX-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
 ; X86-AVX-NEXT:    vmovdqu %xmm0, (%esi,%ecx,4)
 ; X86-AVX-NEXT:    popl %esi
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 4
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: mul_4xi8:
@@ -188,12 +176,10 @@ entry:
 ; %op2 = zext<8 x i32> %val2
 ; %rst = mul <8 x i32> %op1, %op2
 ;
-define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
 ; X86-SSE-LABEL: mul_8xi8:
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE-NEXT:    .cfi_offset %esi, -8
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -210,14 +196,11 @@ define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-SSE-NEXT:    movdqu %xmm1, 16(%esi,%ecx,4)
 ; X86-SSE-NEXT:    movdqu %xmm0, (%esi,%ecx,4)
 ; X86-SSE-NEXT:    popl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: mul_8xi8:
 ; X86-AVX1:       # %bb.0: # %entry
 ; X86-AVX1-NEXT:    pushl %esi
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX1-NEXT:    .cfi_offset %esi, -8
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -231,15 +214,12 @@ define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; X86-AVX1-NEXT:    vmovups %ymm0, (%esi,%ecx,4)
 ; X86-AVX1-NEXT:    popl %esi
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 4
 ; X86-AVX1-NEXT:    vzeroupper
 ; X86-AVX1-NEXT:    retl
 ;
 ; X86-AVX2-LABEL: mul_8xi8:
 ; X86-AVX2:       # %bb.0: # %entry
 ; X86-AVX2-NEXT:    pushl %esi
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX2-NEXT:    .cfi_offset %esi, -8
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -249,7 +229,6 @@ define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-AVX2-NEXT:    vpmaddwd %ymm0, %ymm1, %ymm0
 ; X86-AVX2-NEXT:    vmovdqu %ymm0, (%esi,%ecx,4)
 ; X86-AVX2-NEXT:    popl %esi
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 4
 ; X86-AVX2-NEXT:    vzeroupper
 ; X86-AVX2-NEXT:    retl
 ;
@@ -315,12 +294,10 @@ entry:
 ; %op2 = zext<16 x i32> %val2
 ; %rst = mul <16 x i32> %op1, %op2
 ;
-define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
 ; X86-SSE-LABEL: mul_16xi8:
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE-NEXT:    .cfi_offset %esi, -8
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -347,14 +324,11 @@ define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-SSE-NEXT:    movdqu %xmm4, 16(%esi,%ecx,4)
 ; X86-SSE-NEXT:    movdqu %xmm3, (%esi,%ecx,4)
 ; X86-SSE-NEXT:    popl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: mul_16xi8:
 ; X86-AVX1:       # %bb.0: # %entry
 ; X86-AVX1-NEXT:    pushl %esi
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX1-NEXT:    .cfi_offset %esi, -8
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -376,15 +350,12 @@ define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-AVX1-NEXT:    vmovups %ymm0, 32(%esi,%ecx,4)
 ; X86-AVX1-NEXT:    vmovups %ymm2, (%esi,%ecx,4)
 ; X86-AVX1-NEXT:    popl %esi
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 4
 ; X86-AVX1-NEXT:    vzeroupper
 ; X86-AVX1-NEXT:    retl
 ;
 ; X86-AVX2-LABEL: mul_16xi8:
 ; X86-AVX2:       # %bb.0: # %entry
 ; X86-AVX2-NEXT:    pushl %esi
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX2-NEXT:    .cfi_offset %esi, -8
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -398,7 +369,6 @@ define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-AVX2-NEXT:    vmovdqu %ymm0, 32(%esi,%ecx,4)
 ; X86-AVX2-NEXT:    vmovdqu %ymm1, (%esi,%ecx,4)
 ; X86-AVX2-NEXT:    popl %esi
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 4
 ; X86-AVX2-NEXT:    vzeroupper
 ; X86-AVX2-NEXT:    retl
 ;
@@ -486,12 +456,10 @@ entry:
 ; %op2 = zext<2 x i32> %val2
 ; %rst = mul <2 x i32> %op1, %op2
 ;
-define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
 ; X86-SSE-LABEL: mul_2xi16:
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE-NEXT:    .cfi_offset %esi, -8
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -504,14 +472,11 @@ define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
 ; X86-SSE-NEXT:    movq %xmm1, (%esi,%ecx,4)
 ; X86-SSE-NEXT:    popl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: mul_2xi16:
 ; X86-AVX:       # %bb.0: # %entry
 ; X86-AVX-NEXT:    pushl %esi
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX-NEXT:    .cfi_offset %esi, -8
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -523,7 +488,6 @@ define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
 ; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
 ; X86-AVX-NEXT:    popl %esi
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 4
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: mul_2xi16:
@@ -571,12 +535,10 @@ entry:
 ; %op2 = zext<4 x i32> %val2
 ; %rst = mul <4 x i32> %op1, %op2
 ;
-define void @mul_4xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+define void @mul_4xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
 ; X86-SSE-LABEL: mul_4xi16:
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE-NEXT:    .cfi_offset %esi, -8
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -589,14 +551,11 @@ define void @mul_4xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
 ; X86-SSE-NEXT:    movdqu %xmm1, (%esi,%ecx,4)
 ; X86-SSE-NEXT:    popl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: mul_4xi16:
 ; X86-AVX:       # %bb.0: # %entry
 ; X86-AVX-NEXT:    pushl %esi
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX-NEXT:    .cfi_offset %esi, -8
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -606,7 +565,6 @@ define void @mul_4xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
 ; X86-AVX-NEXT:    vmovdqu %xmm0, (%esi,%ecx,4)
 ; X86-AVX-NEXT:    popl %esi
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 4
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: mul_4xi16:
@@ -652,12 +610,10 @@ entry:
 ; %op2 = zext<8 x i32> %val2
 ; %rst = mul <8 x i32> %op1, %op2
 ;
-define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
 ; X86-SSE-LABEL: mul_8xi16:
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE-NEXT:    .cfi_offset %esi, -8
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -673,14 +629,11 @@ define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-SSE-NEXT:    movdqu %xmm1, 16(%esi,%ecx,4)
 ; X86-SSE-NEXT:    movdqu %xmm0, (%esi,%ecx,4)
 ; X86-SSE-NEXT:    popl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: mul_8xi16:
 ; X86-AVX1:       # %bb.0: # %entry
 ; X86-AVX1-NEXT:    pushl %esi
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX1-NEXT:    .cfi_offset %esi, -8
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -694,15 +647,12 @@ define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; X86-AVX1-NEXT:    vmovups %ymm0, (%esi,%ecx,4)
 ; X86-AVX1-NEXT:    popl %esi
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 4
 ; X86-AVX1-NEXT:    vzeroupper
 ; X86-AVX1-NEXT:    retl
 ;
 ; X86-AVX2-LABEL: mul_8xi16:
 ; X86-AVX2:       # %bb.0: # %entry
 ; X86-AVX2-NEXT:    pushl %esi
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX2-NEXT:    .cfi_offset %esi, -8
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -712,7 +662,6 @@ define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-AVX2-NEXT:    vpmulld %ymm0, %ymm1, %ymm0
 ; X86-AVX2-NEXT:    vmovdqu %ymm0, (%esi,%ecx,4)
 ; X86-AVX2-NEXT:    popl %esi
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 4
 ; X86-AVX2-NEXT:    vzeroupper
 ; X86-AVX2-NEXT:    retl
 ;
@@ -777,12 +726,10 @@ entry:
 ; %op2 = zext<16 x i32> %val2
 ; %rst = mul <16 x i32> %op1, %op2
 ;
-define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
 ; X86-SSE-LABEL: mul_16xi16:
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE-NEXT:    .cfi_offset %esi, -8
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -808,14 +755,11 @@ define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i6
 ; X86-SSE-NEXT:    movdqu %xmm2, 16(%esi,%ecx,4)
 ; X86-SSE-NEXT:    movdqu %xmm0, (%esi,%ecx,4)
 ; X86-SSE-NEXT:    popl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: mul_16xi16:
 ; X86-AVX1:       # %bb.0: # %entry
 ; X86-AVX1-NEXT:    pushl %esi
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX1-NEXT:    .cfi_offset %esi, -8
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -837,15 +781,12 @@ define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i6
 ; X86-AVX1-NEXT:    vmovups %ymm0, 32(%esi,%ecx,4)
 ; X86-AVX1-NEXT:    vmovups %ymm2, (%esi,%ecx,4)
 ; X86-AVX1-NEXT:    popl %esi
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 4
 ; X86-AVX1-NEXT:    vzeroupper
 ; X86-AVX1-NEXT:    retl
 ;
 ; X86-AVX2-LABEL: mul_16xi16:
 ; X86-AVX2:       # %bb.0: # %entry
 ; X86-AVX2-NEXT:    pushl %esi
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX2-NEXT:    .cfi_offset %esi, -8
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -859,7 +800,6 @@ define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i6
 ; X86-AVX2-NEXT:    vmovdqu %ymm0, 32(%esi,%ecx,4)
 ; X86-AVX2-NEXT:    vmovdqu %ymm1, (%esi,%ecx,4)
 ; X86-AVX2-NEXT:    popl %esi
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 4
 ; X86-AVX2-NEXT:    vzeroupper
 ; X86-AVX2-NEXT:    retl
 ;
@@ -946,12 +886,10 @@ entry:
 ; %op2 = sext<2 x i32> %val2
 ; %rst = mul <2 x i32> %op1, %op2
 ;
-define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
 ; X86-SSE-LABEL: mul_2xi8_sext:
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE-NEXT:    .cfi_offset %esi, -8
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -969,14 +907,11 @@ define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b,
 ; X86-SSE-NEXT:    psrad $16, %xmm0
 ; X86-SSE-NEXT:    movq %xmm0, (%esi,%ecx,4)
 ; X86-SSE-NEXT:    popl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: mul_2xi8_sext:
 ; X86-AVX:       # %bb.0: # %entry
 ; X86-AVX-NEXT:    pushl %esi
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX-NEXT:    .cfi_offset %esi, -8
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -987,7 +922,6 @@ define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b,
 ; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
 ; X86-AVX-NEXT:    popl %esi
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 4
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: mul_2xi8_sext:
@@ -1039,12 +973,10 @@ entry:
 ; %op2 = zext<2 x i32> %val2
 ; %rst = mul <2 x i32> %op1, %op2
 ;
-define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
 ; X86-SSE-LABEL: mul_2xi8_sext_zext:
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE-NEXT:    .cfi_offset %esi, -8
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -1063,14 +995,11 @@ define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonl
 ; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
 ; X86-SSE-NEXT:    movq %xmm0, (%esi,%ecx,4)
 ; X86-SSE-NEXT:    popl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: mul_2xi8_sext_zext:
 ; X86-AVX:       # %bb.0: # %entry
 ; X86-AVX-NEXT:    pushl %esi
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX-NEXT:    .cfi_offset %esi, -8
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -1081,7 +1010,6 @@ define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonl
 ; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
 ; X86-AVX-NEXT:    popl %esi
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 4
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: mul_2xi8_sext_zext:
@@ -1134,12 +1062,10 @@ entry:
 ; %op2 = sext<2 x i32> %val2
 ; %rst = mul <2 x i32> %op1, %op2
 ;
-define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
 ; X86-SSE-LABEL: mul_2xi16_sext:
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE-NEXT:    .cfi_offset %esi, -8
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -1152,14 +1078,11 @@ define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b
 ; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
 ; X86-SSE-NEXT:    movq %xmm1, (%esi,%ecx,4)
 ; X86-SSE-NEXT:    popl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: mul_2xi16_sext:
 ; X86-AVX:       # %bb.0: # %entry
 ; X86-AVX-NEXT:    pushl %esi
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX-NEXT:    .cfi_offset %esi, -8
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -1170,7 +1093,6 @@ define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b
 ; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
 ; X86-AVX-NEXT:    popl %esi
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 4
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: mul_2xi16_sext:
@@ -1217,12 +1139,10 @@ entry:
 ; %op2 = zext<2 x i32> %val2
 ; %rst = mul <2 x i32> %op1, %op2
 ;
-define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
 ; X86-SSE-LABEL: mul_2xi16_sext_zext:
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE-NEXT:    .cfi_offset %esi, -8
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -1242,14 +1162,11 @@ define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readon
 ; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
 ; X86-SSE-NEXT:    movq %xmm0, (%esi,%ecx,4)
 ; X86-SSE-NEXT:    popl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: mul_2xi16_sext_zext:
 ; X86-AVX:       # %bb.0: # %entry
 ; X86-AVX-NEXT:    pushl %esi
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX-NEXT:    .cfi_offset %esi, -8
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -1262,7 +1179,6 @@ define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readon
 ; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
 ; X86-AVX-NEXT:    popl %esi
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 4
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: mul_2xi16_sext_zext:
@@ -1318,12 +1234,10 @@ entry:
 ; %op2 = sext<16 x i32> %val2
 ; %rst = mul <16 x i32> %op1, %op2
 ;
-define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
 ; X86-SSE-LABEL: mul_16xi16_sext:
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE-NEXT:    .cfi_offset %esi, -8
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -1349,14 +1263,11 @@ define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %
 ; X86-SSE-NEXT:    movdqu %xmm2, 16(%esi,%ecx,4)
 ; X86-SSE-NEXT:    movdqu %xmm0, (%esi,%ecx,4)
 ; X86-SSE-NEXT:    popl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: mul_16xi16_sext:
 ; X86-AVX1:       # %bb.0: # %entry
 ; X86-AVX1-NEXT:    pushl %esi
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX1-NEXT:    .cfi_offset %esi, -8
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -1378,15 +1289,12 @@ define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %
 ; X86-AVX1-NEXT:    vmovups %ymm0, 32(%esi,%ecx,4)
 ; X86-AVX1-NEXT:    vmovups %ymm2, (%esi,%ecx,4)
 ; X86-AVX1-NEXT:    popl %esi
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 4
 ; X86-AVX1-NEXT:    vzeroupper
 ; X86-AVX1-NEXT:    retl
 ;
 ; X86-AVX2-LABEL: mul_16xi16_sext:
 ; X86-AVX2:       # %bb.0: # %entry
 ; X86-AVX2-NEXT:    pushl %esi
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX2-NEXT:    .cfi_offset %esi, -8
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -1400,7 +1308,6 @@ define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %
 ; X86-AVX2-NEXT:    vmovdqu %ymm0, 32(%esi,%ecx,4)
 ; X86-AVX2-NEXT:    vmovdqu %ymm1, (%esi,%ecx,4)
 ; X86-AVX2-NEXT:    popl %esi
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 4
 ; X86-AVX2-NEXT:    vzeroupper
 ; X86-AVX2-NEXT:    retl
 ;
@@ -2204,12 +2111,10 @@ entry:
 ; Illegal Types
 ;
 
-define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) {
+define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind {
 ; X86-SSE-LABEL: PR34947:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE-NEXT:    .cfi_offset %esi, -8
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movdqa (%eax), %xmm5
@@ -2303,25 +2208,15 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) {
 ; X86-SSE-NEXT:    movdqa %xmm1, (%eax)
 ; X86-SSE-NEXT:    movdqa %xmm4, (%eax)
 ; X86-SSE-NEXT:    popl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: PR34947:
 ; X86-AVX1:       # %bb.0:
 ; X86-AVX1-NEXT:    pushl %ebp
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 8
 ; X86-AVX1-NEXT:    pushl %ebx
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 12
 ; X86-AVX1-NEXT:    pushl %edi
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 16
 ; X86-AVX1-NEXT:    pushl %esi
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 20
 ; X86-AVX1-NEXT:    subl $8, %esp
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 28
-; X86-AVX1-NEXT:    .cfi_offset %esi, -20
-; X86-AVX1-NEXT:    .cfi_offset %edi, -16
-; X86-AVX1-NEXT:    .cfi_offset %ebx, -12
-; X86-AVX1-NEXT:    .cfi_offset %ebp, -8
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX1-NEXT:    vmovdqa (%eax), %ymm2
@@ -2395,26 +2290,17 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) {
 ; X86-AVX1-NEXT:    vmovd %xmm1, (%eax)
 ; X86-AVX1-NEXT:    vmovaps %ymm0, (%eax)
 ; X86-AVX1-NEXT:    addl $8, %esp
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 20
 ; X86-AVX1-NEXT:    popl %esi
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 16
 ; X86-AVX1-NEXT:    popl %edi
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 12
 ; X86-AVX1-NEXT:    popl %ebx
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 8
 ; X86-AVX1-NEXT:    popl %ebp
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 4
 ; X86-AVX1-NEXT:    vzeroupper
 ; X86-AVX1-NEXT:    retl
 ;
 ; X86-AVX2-LABEL: PR34947:
 ; X86-AVX2:       # %bb.0:
 ; X86-AVX2-NEXT:    pushl %edi
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 8
 ; X86-AVX2-NEXT:    pushl %esi
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 12
-; X86-AVX2-NEXT:    .cfi_offset %esi, -12
-; X86-AVX2-NEXT:    .cfi_offset %edi, -8
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX2-NEXT:    vmovdqa (%eax), %ymm2
@@ -2479,9 +2365,7 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) {
 ; X86-AVX2-NEXT:    vmovd %xmm0, (%eax)
 ; X86-AVX2-NEXT:    vmovdqa %ymm1, (%eax)
 ; X86-AVX2-NEXT:    popl %esi
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 8
 ; X86-AVX2-NEXT:    popl %edi
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 4
 ; X86-AVX2-NEXT:    vzeroupper
 ; X86-AVX2-NEXT:    retl
 ;
@@ -2582,11 +2466,7 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) {
 ; X64-AVX1-LABEL: PR34947:
 ; X64-AVX1:       # %bb.0:
 ; X64-AVX1-NEXT:    pushq %rbp
-; X64-AVX1-NEXT:    .cfi_def_cfa_offset 16
 ; X64-AVX1-NEXT:    pushq %rbx
-; X64-AVX1-NEXT:    .cfi_def_cfa_offset 24
-; X64-AVX1-NEXT:    .cfi_offset %rbx, -24
-; X64-AVX1-NEXT:    .cfi_offset %rbp, -16
 ; X64-AVX1-NEXT:    vmovdqa (%rdi), %ymm2
 ; X64-AVX1-NEXT:    vmovdqa (%rsi), %ymm1
 ; X64-AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
@@ -2657,9 +2537,7 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) {
 ; X64-AVX1-NEXT:    vmovd %xmm1, (%rax)
 ; X64-AVX1-NEXT:    vmovaps %ymm0, (%rax)
 ; X64-AVX1-NEXT:    popq %rbx
-; X64-AVX1-NEXT:    .cfi_def_cfa_offset 16
 ; X64-AVX1-NEXT:    popq %rbp
-; X64-AVX1-NEXT:    .cfi_def_cfa_offset 8
 ; X64-AVX1-NEXT:    vzeroupper
 ; X64-AVX1-NEXT:    retq
 ;
-- 
GitLab


From fd24147338c1a44ed39fe2cd11391f49ddd94695 Mon Sep 17 00:00:00 2001
From: Vedant Kumar <vsk@apple.com>
Date: Sun, 4 Nov 2018 23:11:57 +0000
Subject: [PATCH 0947/1116] [HotColdSplitting] Use TTI to inform outlining
 threshold

Using TargetTransformInfo allows the splitting pass to factor in the
code size cost of instructions as it decides whether or not outlining is
profitable.

This did not regress the overall amount of outlining seen on the handful
of internal frameworks I tested.

Thanks to Jun Bum Lim for suggesting this!

Differential Revision: https://reviews.llvm.org/D53835

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346108 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/IPO/HotColdSplitting.cpp       | 44 +++++++++++--------
 .../Transforms/HotColdSplit/X86/lit.local.cfg |  3 ++
 .../HotColdSplit/X86/outline-expensive.ll     | 25 +++++++++++
 test/Transforms/HotColdSplit/do-not-split.ll  |  3 ++
 test/Transforms/HotColdSplit/minsize.ll       |  4 +-
 .../HotColdSplit/split-out-dbg-val-of-arg.ll  |  3 ++
 6 files changed, 63 insertions(+), 19 deletions(-)
 create mode 100644 test/Transforms/HotColdSplit/X86/lit.local.cfg
 create mode 100644 test/Transforms/HotColdSplit/X86/outline-expensive.ll

diff --git a/lib/Transforms/IPO/HotColdSplitting.cpp b/lib/Transforms/IPO/HotColdSplitting.cpp
index ce8a5060a3a..621ac7dc8ab 100644
--- a/lib/Transforms/IPO/HotColdSplitting.cpp
+++ b/lib/Transforms/IPO/HotColdSplitting.cpp
@@ -66,10 +66,10 @@ using namespace llvm;
 static cl::opt<bool> EnableStaticAnalyis("hot-cold-static-analysis",
                               cl::init(true), cl::Hidden);
 
-static cl::opt<unsigned> MinOutliningInstCount(
-    "min-outlining-inst-count", cl::init(3), cl::Hidden,
-    cl::desc("Minimum number of instructions needed for a single-block region "
-             "to be an outlining candidate"));
+static cl::opt<int>
+    MinOutliningThreshold("min-outlining-thresh", cl::init(3), cl::Hidden,
+                          cl::desc("Code size threshold for outlining within a "
+                                   "single BB (as a multiple of TCC_Basic)"));
 
 namespace {
 
@@ -135,14 +135,18 @@ static bool mayExtractBlock(const BasicBlock &BB) {
   return !BB.hasAddressTaken();
 }
 
-/// Check whether \p BB has at least \p Min non-debug, non-terminator
-/// instructions.
-static bool hasMinimumInstCount(const BasicBlock &BB, unsigned Min) {
-  unsigned Count = 0;
+/// Check whether \p BB is profitable to outline (i.e. its code size cost meets
+/// the threshold set in \p MinOutliningThreshold).
+static bool isProfitableToOutline(const BasicBlock &BB,
+                                  TargetTransformInfo &TTI) {
+  int Cost = 0;
   for (const Instruction &I : BB) {
     if (isa<DbgInfoIntrinsic>(&I) || &I == BB.getTerminator())
       continue;
-    if (++Count >= Min)
+
+    Cost += TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
+
+    if (Cost >= (MinOutliningThreshold * TargetTransformInfo::TCC_Basic))
       return true;
   }
   return false;
@@ -156,8 +160,10 @@ static bool hasMinimumInstCount(const BasicBlock &BB, unsigned Min) {
 ///
 /// Return an empty sequence if the cold region is too small to outline, or if
 /// the cold region has no warm predecessors.
-static BlockSequence
-findMaximalColdRegion(BasicBlock &SinkBB, DominatorTree &DT, PostDomTree &PDT) {
+static BlockSequence findMaximalColdRegion(BasicBlock &SinkBB,
+                                           TargetTransformInfo &TTI,
+                                           DominatorTree &DT,
+                                           PostDomTree &PDT) {
   // The maximal cold region.
   BlockSequence ColdRegion = {};
 
@@ -241,8 +247,7 @@ findMaximalColdRegion(BasicBlock &SinkBB, DominatorTree &DT, PostDomTree &PDT) {
     ++SuccIt;
   }
 
-  if (ColdRegion.size() == 1 &&
-      !hasMinimumInstCount(*ColdRegion[0], MinOutliningInstCount))
+  if (ColdRegion.size() == 1 && !isProfitableToOutline(*ColdRegion[0], TTI))
     return {};
 
   return ColdRegion;
@@ -251,6 +256,7 @@ findMaximalColdRegion(BasicBlock &SinkBB, DominatorTree &DT, PostDomTree &PDT) {
 /// Get the largest cold region in \p F.
 static BlockSequence getLargestColdRegion(Function &F, ProfileSummaryInfo &PSI,
                                           BlockFrequencyInfo *BFI,
+                                          TargetTransformInfo &TTI,
                                           DominatorTree &DT, PostDomTree &PDT) {
   // Keep track of the largest cold region.
   BlockSequence LargestColdRegion = {};
@@ -270,7 +276,7 @@ static BlockSequence getLargestColdRegion(Function &F, ProfileSummaryInfo &PSI,
     });
 
     // Find a maximal cold region we can outline.
-    BlockSequence ColdRegion = findMaximalColdRegion(BB, DT, PDT);
+    BlockSequence ColdRegion = findMaximalColdRegion(BB, TTI, DT, PDT);
     if (ColdRegion.empty()) {
       LLVM_DEBUG(dbgs() << "  Skipping (block not profitable to extract)\n");
       continue;
@@ -305,7 +311,7 @@ public:
 private:
   bool shouldOutlineFrom(const Function &F) const;
   Function *extractColdRegion(const BlockSequence &Region, DominatorTree &DT,
-                              BlockFrequencyInfo *BFI,
+                              BlockFrequencyInfo *BFI, TargetTransformInfo &TTI,
                               OptimizationRemarkEmitter &ORE, unsigned Count);
   SmallPtrSet<const Function *, 2> OutlinedFunctions;
   ProfileSummaryInfo *PSI;
@@ -365,6 +371,7 @@ bool HotColdSplitting::shouldOutlineFrom(const Function &F) const {
 Function *HotColdSplitting::extractColdRegion(const BlockSequence &Region,
                                               DominatorTree &DT,
                                               BlockFrequencyInfo *BFI,
+                                              TargetTransformInfo &TTI,
                                               OptimizationRemarkEmitter &ORE,
                                               unsigned Count) {
   assert(!Region.empty());
@@ -393,7 +400,7 @@ Function *HotColdSplitting::extractColdRegion(const BlockSequence &Region,
     CallInst *CI = cast<CallInst>(U);
     CallSite CS(CI);
     NumColdRegionsOutlined++;
-    if (GetTTI(*OutF).useColdCCForColdCall(*OutF)) {
+    if (TTI.useColdCCForColdCall(*OutF)) {
       OutF->setCallingConv(CallingConv::Cold);
       CS.setCallingConv(CallingConv::Cold);
     }
@@ -437,14 +444,15 @@ bool HotColdSplitting::run(Module &M) {
     PostDomTree PDT(F);
     PDT.recalculate(F);
     BlockFrequencyInfo *BFI = GetBFI(F);
+    TargetTransformInfo &TTI = GetTTI(F);
 
-    BlockSequence ColdRegion = getLargestColdRegion(F, *PSI, BFI, DT, PDT);
+    BlockSequence ColdRegion = getLargestColdRegion(F, *PSI, BFI, TTI, DT, PDT);
     if (ColdRegion.empty())
       continue;
 
     OptimizationRemarkEmitter &ORE = (*GetORE)(F);
     Function *Outlined =
-        extractColdRegion(ColdRegion, DT, BFI, ORE, /*Count=*/1);
+        extractColdRegion(ColdRegion, DT, BFI, TTI, ORE, /*Count=*/1);
     if (Outlined) {
       OutlinedFunctions.insert(Outlined);
       Changed = true;
diff --git a/test/Transforms/HotColdSplit/X86/lit.local.cfg b/test/Transforms/HotColdSplit/X86/lit.local.cfg
new file mode 100644
index 00000000000..e71f3cc4c41
--- /dev/null
+++ b/test/Transforms/HotColdSplit/X86/lit.local.cfg
@@ -0,0 +1,3 @@
+if not 'X86' in config.root.targets:
+    config.unsupported = True
+
diff --git a/test/Transforms/HotColdSplit/X86/outline-expensive.ll b/test/Transforms/HotColdSplit/X86/outline-expensive.ll
new file mode 100644
index 00000000000..5b0cceae2af
--- /dev/null
+++ b/test/Transforms/HotColdSplit/X86/outline-expensive.ll
@@ -0,0 +1,25 @@
+; The magic number 6 comes from (1 * TCC_Expensive) + (1 * CostOfCallX86).
+; RUN: opt -hotcoldsplit -min-outlining-thresh=6 -S < %s | FileCheck %s
+
+; Test that we outline even though there are only two cold instructions. TTI
+; should determine that they are expensive in terms of code size.
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+; CHECK-LABEL: @fun
+; CHECK: call void @fun.cold.1
+define void @fun(i32 %x) {
+entry:
+  br i1 undef, label %if.then, label %if.else
+
+if.then:
+  ret void
+
+if.else:
+  %y = sdiv i32 %x, 111
+  call void @sink(i32 %y)
+  ret void
+}
+
+declare void @sink(i32 %x) cold
diff --git a/test/Transforms/HotColdSplit/do-not-split.ll b/test/Transforms/HotColdSplit/do-not-split.ll
index 213681383ea..d5a8c44cc04 100644
--- a/test/Transforms/HotColdSplit/do-not-split.ll
+++ b/test/Transforms/HotColdSplit/do-not-split.ll
@@ -1,6 +1,9 @@
 ; RUN: opt -hotcoldsplit -S < %s | FileCheck %s
 ; RUN: opt -passes=hotcoldsplit -S < %s | FileCheck %s
 
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
 ; Check that these functions are not split. Outlined functions are called from a
 ; basic block named codeRepl.
 
diff --git a/test/Transforms/HotColdSplit/minsize.ll b/test/Transforms/HotColdSplit/minsize.ll
index eb42ad14af2..69cd0979b94 100644
--- a/test/Transforms/HotColdSplit/minsize.ll
+++ b/test/Transforms/HotColdSplit/minsize.ll
@@ -1,8 +1,10 @@
 ; RUN: opt -hotcoldsplit -S < %s | FileCheck %s
 
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
 ; CHECK-LABEL: @fun
 ; CHECK: call void @fun.cold.1
-
 define void @fun() {
 entry:
   br i1 undef, label %if.then, label %if.else
diff --git a/test/Transforms/HotColdSplit/split-out-dbg-val-of-arg.ll b/test/Transforms/HotColdSplit/split-out-dbg-val-of-arg.ll
index b77201fe0d3..becfaf8e63d 100644
--- a/test/Transforms/HotColdSplit/split-out-dbg-val-of-arg.ll
+++ b/test/Transforms/HotColdSplit/split-out-dbg-val-of-arg.ll
@@ -1,5 +1,8 @@
 ; RUN: opt -hotcoldsplit -S < %s | FileCheck %s
 
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
 ; CHECK-LABEL: define {{.*}}@foo.cold
 ; CHECK-NOT: llvm.dbg.value
 
-- 
GitLab


From 32afb0f40d55c93debb8bdf39f3a492aa4c96d2b Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Mon, 5 Nov 2018 01:21:52 +0000
Subject: [PATCH 0948/1116] [X86] Fix typo in test comment. NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346110 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/combine-64bit-vec-binop.ll | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/CodeGen/X86/combine-64bit-vec-binop.ll b/test/CodeGen/X86/combine-64bit-vec-binop.ll
index e434bfc11c4..4bd1ebbc93e 100644
--- a/test/CodeGen/X86/combine-64bit-vec-binop.ll
+++ b/test/CodeGen/X86/combine-64bit-vec-binop.ll
@@ -97,7 +97,7 @@ define double @test2_mul(double %A, double %B) {
   ret double %3
 }
 
-; There is no legal ISD::MUL with type MVT::v8i16.
+; There is no legal ISD::MUL with type MVT::v16i8.
 define double @test3_mul(double %A, double %B) {
 ; SSE41-LABEL: test3_mul:
 ; SSE41:       # %bb.0:
-- 
GitLab


From 9de46729ce2fb4bf4b2a19b7b387898f256d827f Mon Sep 17 00:00:00 2001
From: Dylan McKay <me@dylanmckay.io>
Date: Mon, 5 Nov 2018 05:00:44 +0000
Subject: [PATCH 0949/1116] [AVR] Disallow the LDDWRdPtrQ instruction with Z as
 the destination

This is an AVR-specific workaround for a limitation of the register
allocator that only exposes itself on targets with high register
contention like AVR, which only has three pointer registers.

The three pointer registers are X, Y, and Z.
In most nontrivial functions, Y is reserved for the frame pointer,
as per the calling convention. This leaves X and Z. Some instructions,
such as LPM ("load program memory"), are only defined for the Z
register. Sometimes this just leaves X.

When the backend generates a LDDWRdPtrQ instruction with Z as the
destination pointer, it usually trips up the register allocator
with this error message:

  LLVM ERROR: ran out of registers during register allocation

This patch is a hacky workaround. We ban the LDDWRdPtrQ instruction
from ever using the Z register as an operand. This gives the
register allocator a bit more space to allocate, fixing the
regalloc exhaustion error.

Here is a description from the patch author Peter Nimmervoll

  As far as I understand the problem occurs when LDDWRdPtrQ uses
  the ptrdispregs register class as target register. This should work, but
  the allocator can't deal with this for some reason. So from my testing,
  it seams like (and I might be totally wrong on this) the allocator reserves
  the Z register for the ICALL instruction and then the register class
  ptrdispregs only has 1 register left and we can't use Y for source and
  destination. Removing the Z register from DREGS fixes the problem but
  removing Y register does not.

More information about the bug can be found on the avr-rust issue
tracker at https://github.com/avr-rust/rust/issues/37.

A bug has raised to track the removal of this workaround and a proper
fix; PR39553 at https://bugs.llvm.org/show_bug.cgi?id=39553.

Patch by Peter Nimmervoll

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346114 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AVR/AVRInstrInfo.td                |  2 +-
 lib/Target/AVR/AVRRegisterInfo.td             | 20 ++++++++++
 .../AVR/pseudo/LDDWRdPtrQ-same-src-dst.mir    | 10 ++---
 test/CodeGen/AVR/pseudo/LDDWRdPtrQ.mir        |  6 +--
 test/CodeGen/AVR/rust-avr-bug-37.ll           | 25 +++++++++++++
 test/CodeGen/AVR/rust-avr-bug-95.ll           | 37 +++++++++++++++++++
 6 files changed, 91 insertions(+), 9 deletions(-)
 create mode 100644 test/CodeGen/AVR/rust-avr-bug-37.ll
 create mode 100644 test/CodeGen/AVR/rust-avr-bug-95.ll

diff --git a/lib/Target/AVR/AVRInstrInfo.td b/lib/Target/AVR/AVRInstrInfo.td
index ec4b6c9a777..5720af7d8df 100644
--- a/lib/Target/AVR/AVRInstrInfo.td
+++ b/lib/Target/AVR/AVRInstrInfo.td
@@ -1222,7 +1222,7 @@ isReMaterializable = 1 in
   // ldd Rd,   P+q
   // ldd Rd+1, P+q+1
   let Constraints = "@earlyclobber $dst" in
-  def LDDWRdPtrQ : Pseudo<(outs DREGS:$dst),
+  def LDDWRdPtrQ : Pseudo<(outs DREGS_WITHOUT_Z_WORKAROUND:$dst),
                           (ins memri:$memri),
                           "lddw\t$dst, $memri",
                           [(set i16:$dst, (load addr:$memri))]>,
diff --git a/lib/Target/AVR/AVRRegisterInfo.td b/lib/Target/AVR/AVRRegisterInfo.td
index 8162f12052b..d55252bcac4 100644
--- a/lib/Target/AVR/AVRRegisterInfo.td
+++ b/lib/Target/AVR/AVRRegisterInfo.td
@@ -157,6 +157,26 @@ def DREGS : RegisterClass<"AVR", [i16], 8,
     R9R8, R7R6, R5R4, R3R2, R1R0
   )>;
 
+// The 16-bit DREGS register class, excluding the Z pointer register.
+//
+// This is used by instructions which cause high pointer register
+// contention which leads to an assertion in the register allocator.
+//
+// There is no technical reason why instructions that use this class
+// cannot use Z; it's simply a workaround a regalloc bug.
+//
+// More information can be found in PR39553.
+def DREGS_WITHOUT_Z_WORKAROUND : RegisterClass<"AVR", [i16], 8,
+  (
+    // Return value and arguments.
+    add R25R24, R19R18, R21R20, R23R22,
+    // Scratch registers.
+    R27R26,
+    // Callee saved registers.
+    R29R28, R17R16, R15R14, R13R12, R11R10,
+    R9R8, R7R6, R5R4, R3R2, R1R0
+  )>;
+
 // 16-bit register class for immediate instructions.
 def DLDREGS : RegisterClass<"AVR", [i16], 8,
   (
diff --git a/test/CodeGen/AVR/pseudo/LDDWRdPtrQ-same-src-dst.mir b/test/CodeGen/AVR/pseudo/LDDWRdPtrQ-same-src-dst.mir
index df69f5fffa5..72b20d39d68 100644
--- a/test/CodeGen/AVR/pseudo/LDDWRdPtrQ-same-src-dst.mir
+++ b/test/CodeGen/AVR/pseudo/LDDWRdPtrQ-same-src-dst.mir
@@ -25,11 +25,11 @@ body: |
 
     ; CHECK-LABEL: test_lddwrdptrq
 
-    ; CHECK:      ldd [[SCRATCH:r[0-9]+]], Z+10
+    ; CHECK:      ldd [[SCRATCH:r[0-9]+]], Y+10
     ; CHECK-NEXT: push [[SCRATCH]]
-    ; CHECK-NEXT: ldd [[SCRATCH]], Z+11
-    ; CHECK-NEXT: mov r31, [[SCRATCH]]
-    ; CHECK-NEXT: pop r30
+    ; CHECK-NEXT: ldd [[SCRATCH]], Y+11
+    ; CHECK-NEXT: mov r29, [[SCRATCH]]
+    ; CHECK-NEXT: pop r28
 
-    early-clobber $r31r30 = LDDWRdPtrQ undef $r31r30, 10
+    early-clobber $r29r28 = LDDWRdPtrQ undef $r29r28, 10
 ...
diff --git a/test/CodeGen/AVR/pseudo/LDDWRdPtrQ.mir b/test/CodeGen/AVR/pseudo/LDDWRdPtrQ.mir
index 59b3ce8b602..96d3809ed2d 100644
--- a/test/CodeGen/AVR/pseudo/LDDWRdPtrQ.mir
+++ b/test/CodeGen/AVR/pseudo/LDDWRdPtrQ.mir
@@ -18,8 +18,8 @@ body: |
 
     ; CHECK-LABEL: test_lddwrdptrq
 
-    ; CHECK:      ldd     r30, Y+10
-    ; CHECK-NEXT: ldd     r31, Y+11
+    ; CHECK:      ldd     r28, Z+10
+    ; CHECK-NEXT: ldd     r29, Z+11
 
-    early-clobber $r31r30 = LDDWRdPtrQ undef $r29r28, 10
+    early-clobber $r29r28 = LDDWRdPtrQ undef $r31r30, 10
 ...
diff --git a/test/CodeGen/AVR/rust-avr-bug-37.ll b/test/CodeGen/AVR/rust-avr-bug-37.ll
new file mode 100644
index 00000000000..9c269d3dab1
--- /dev/null
+++ b/test/CodeGen/AVR/rust-avr-bug-37.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -march=avr | FileCheck %s
+
+%"fmt::Formatter" = type { i32, { i8*, void (i8*)** } }
+
+@str.1b = external constant [0 x i8]
+
+define void @"TryFromIntError::Debug"(%"fmt::Formatter"* dereferenceable(32)) unnamed_addr #0 personality i32 (...)* @rust_eh_personality {
+; CHECK-LABEL: "TryFromIntError::Debug"
+start:
+  %builder = alloca i8, align 8
+  %1 = getelementptr inbounds %"fmt::Formatter", %"fmt::Formatter"* %0, i16 0, i32 1
+  %2 = bitcast { i8*, void (i8*)** }* %1 to {}**
+  %3 = load {}*, {}** %2, align 2
+  %4 = getelementptr inbounds %"fmt::Formatter", %"fmt::Formatter"* %0, i16 0, i32 1, i32 1
+  %5 = load void (i8*)**, void (i8*)*** %4, align 2
+  %6 = getelementptr inbounds void (i8*)*, void (i8*)** %5, i16 3
+  %7 = bitcast void (i8*)** %6 to i8 ({}*, i8*, i16)**
+  %8 = load i8 ({}*, i8*, i16)*, i8 ({}*, i8*, i16)** %7, align 2
+  %9 = tail call i8 %8({}* nonnull %3, i8* noalias nonnull readonly getelementptr inbounds ([0 x i8], [0 x i8]* @str.1b, i16 0, i16 0), i16 15)
+  unreachable
+}
+
+declare i32 @rust_eh_personality(...) unnamed_addr
+
+attributes #0 = { uwtable }
\ No newline at end of file
diff --git a/test/CodeGen/AVR/rust-avr-bug-95.ll b/test/CodeGen/AVR/rust-avr-bug-95.ll
new file mode 100644
index 00000000000..9534ceb26e7
--- /dev/null
+++ b/test/CodeGen/AVR/rust-avr-bug-95.ll
@@ -0,0 +1,37 @@
+; RUN: llc < %s -march=avr | FileCheck %s
+
+%"fmt::Formatter.1.77.153.229.305.381.1673" = type { [0 x i8], i32, [0 x i8], i32, [0 x i8], i8, [0 x i8], %"option::Option<usize>.0.76.152.228.304.380.1672", [0 x i8], %"option::Option<usize>.0.76.152.228.304.380.1672", [0 x i8], { {}*, {}* }, [0 x i8], { i8*, i8* }, [0 x i8], { [0 x { i8*, i8* }]*, i16 }, [0 x i8] }
+%"option::Option<usize>.0.76.152.228.304.380.1672" = type { [0 x i8], i8, [2 x i8] }
+
+@str.4S = external constant [5 x i8]
+
+; Function Attrs: uwtable
+define void @"_ZN65_$LT$lib..str..Chars$LT$$u27$a$GT$$u20$as$u20$lib..fmt..Debug$GT$3fmt17h76a537e22649f739E"(%"fmt::Formatter.1.77.153.229.305.381.1673"* dereferenceable(27) %__arg_0) unnamed_addr #0 personality i32 (...)* @rust_eh_personality {
+; CHECK-LABEL: "_ZN65_$LT$lib..str..Chars$LT$$u27$a$GT$$u20$as$u20$lib..fmt..Debug$GT$3fmt17h76a537e22649f739E"
+start:
+  %0 = getelementptr inbounds %"fmt::Formatter.1.77.153.229.305.381.1673", %"fmt::Formatter.1.77.153.229.305.381.1673"* %__arg_0, i16 0, i32 11, i32 0
+  %1 = load {}*, {}** %0, align 1, !noalias !0, !nonnull !9
+  %2 = getelementptr inbounds %"fmt::Formatter.1.77.153.229.305.381.1673", %"fmt::Formatter.1.77.153.229.305.381.1673"* %__arg_0, i16 0, i32 11, i32 1
+  %3 = bitcast {}** %2 to i1 ({}*, [0 x i8]*, i16)***
+  %4 = load i1 ({}*, [0 x i8]*, i16)**, i1 ({}*, [0 x i8]*, i16)*** %3, align 1, !noalias !0, !nonnull !9
+  %5 = getelementptr inbounds i1 ({}*, [0 x i8]*, i16)*, i1 ({}*, [0 x i8]*, i16)** %4, i16 3
+  %6 = load i1 ({}*, [0 x i8]*, i16)*, i1 ({}*, [0 x i8]*, i16)** %5, align 1, !invariant.load !9, !noalias !0, !nonnull !9
+  %7 = tail call zeroext i1 %6({}* nonnull %1, [0 x i8]* noalias nonnull readonly bitcast ([5 x i8]* @str.4S to [0 x i8]*), i16 5), !noalias !10
+  unreachable
+}
+
+declare i32 @rust_eh_personality(...) unnamed_addr
+
+attributes #0 = { uwtable }
+
+!0 = !{!1, !3, !5, !6, !8}
+!1 = distinct !{!1, !2, !"_ZN3lib3fmt9Formatter9write_str17ha1a9656fc66ccbe5E: %data.0"}
+!2 = distinct !{!2, !"_ZN3lib3fmt9Formatter9write_str17ha1a9656fc66ccbe5E"}
+!3 = distinct !{!3, !4, !"_ZN3lib3fmt8builders16debug_struct_new17h352a1de8f89c2bc3E: argument 0"}
+!4 = distinct !{!4, !"_ZN3lib3fmt8builders16debug_struct_new17h352a1de8f89c2bc3E"}
+!5 = distinct !{!5, !4, !"_ZN3lib3fmt8builders16debug_struct_new17h352a1de8f89c2bc3E: %name.0"}
+!6 = distinct !{!6, !7, !"_ZN3lib3fmt9Formatter12debug_struct17ha1ff79f633171b68E: argument 0"}
+!7 = distinct !{!7, !"_ZN3lib3fmt9Formatter12debug_struct17ha1ff79f633171b68E"}
+!8 = distinct !{!8, !7, !"_ZN3lib3fmt9Formatter12debug_struct17ha1ff79f633171b68E: %name.0"}
+!9 = !{}
+!10 = !{!3, !6}
\ No newline at end of file
-- 
GitLab


From e1bfe1a5fa50176aec4b34c24515705cd58f6437 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Mon, 5 Nov 2018 05:02:12 +0000
Subject: [PATCH 0950/1116] [X86] Custom type legalize v2i8/v2i16/v2i32 mul to
 use to pmuludq.

v2i8/v2i16/v2i32 are promoted to v2i64. pmuludq takes a v2i64 input and produces a v2i64 output. Since we don't about the upper bits of the type legalized multiply we can use the pmuludq to produce the multiply result for the bits we do care about.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346115 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 42 ++++++++++++++++++++++++++++++
 test/CodeGen/X86/mmx-arith.ll      | 42 ++++++++++++------------------
 test/CodeGen/X86/mulvi32.ll        | 16 ------------
 3 files changed, 58 insertions(+), 42 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 891f4a4cbdf..99893be4e60 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -791,6 +791,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::UDIV, MVT::v2i32, Custom);
     setOperationAction(ISD::UREM, MVT::v2i32, Custom);
 
+    setOperationAction(ISD::MUL,                MVT::v2i8,  Custom);
+    setOperationAction(ISD::MUL,                MVT::v2i16, Custom);
+    setOperationAction(ISD::MUL,                MVT::v2i32, Custom);
+
     setOperationAction(ISD::MUL,                MVT::v16i8, Custom);
     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
@@ -25911,6 +25915,24 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
   switch (N->getOpcode()) {
   default:
     llvm_unreachable("Do not know how to custom type legalize this operation!");
+  case ISD::MUL: {
+    EVT VT = N->getValueType(0);
+    assert(VT.isVector() && VT.getVectorNumElements() == 2 && "Unexpected VT");
+    if (getTypeAction(*DAG.getContext(), VT) == TypePromoteInteger) {
+      // Promote to a pattern that will be turned into PMULUDQ.
+      SDValue N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v2i64,
+                               N->getOperand(0));
+      N0 = DAG.getNode(ISD::AND, dl, MVT::v2i64, N0,
+                       DAG.getConstant(0xffffffff, dl, MVT::v2i64));
+      SDValue N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v2i64,
+                               N->getOperand(1));
+      N1 = DAG.getNode(ISD::AND, dl, MVT::v2i64, N1,
+                       DAG.getConstant(0xffffffff, dl, MVT::v2i64));
+      SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v2i64, N0, N1);
+      Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, VT, Mul));
+    }
+    return;
+  }
   case X86ISD::ADDUS:
   case X86ISD::SUBUS:
   case X86ISD::AVG: {
@@ -34422,6 +34444,26 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
                           const X86Subtarget &Subtarget) {
   EVT VT = N->getValueType(0);
 
+  // Look for multiply of 2 identical shuffles with a zero vector. Shuffle the
+  // result and insert the zero there instead. This can occur due to
+  // type legalization of v2i32 multiply to a PMULUDQ pattern.
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  if (!DCI.isBeforeLegalize() && isa<ShuffleVectorSDNode>(LHS) &&
+      isa<ShuffleVectorSDNode>(RHS) && LHS.hasOneUse() && RHS.hasOneUse() &&
+      LHS.getOperand(1) == RHS.getOperand(1) &&
+      ISD::isBuildVectorAllZeros(LHS.getOperand(1).getNode())) {
+    ShuffleVectorSDNode *SVN0 = cast<ShuffleVectorSDNode>(LHS);
+    ShuffleVectorSDNode *SVN1 = cast<ShuffleVectorSDNode>(RHS);
+    if (SVN0->getMask().equals(SVN1->getMask())) {
+      SDLoc dl(N);
+      SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, LHS.getOperand(0),
+                                RHS.getOperand(0));
+      return DAG.getVectorShuffle(VT, dl, Mul, DAG.getConstant(0, dl, VT),
+                                  SVN0->getMask());
+    }
+  }
+
   if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
     return V;
 
diff --git a/test/CodeGen/X86/mmx-arith.ll b/test/CodeGen/X86/mmx-arith.ll
index 2d24cb8df35..60735fba4cd 100644
--- a/test/CodeGen/X86/mmx-arith.ll
+++ b/test/CodeGen/X86/mmx-arith.ll
@@ -213,29 +213,24 @@ define void @test1(x86_mmx* %A, x86_mmx* %B) {
 ; X32-NEXT:    movq %xmm0, (%eax)
 ; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X32-NEXT:    movdqa %xmm1, %xmm2
-; X32-NEXT:    pmuludq %xmm0, %xmm2
-; X32-NEXT:    psrlq $32, %xmm1
-; X32-NEXT:    pmuludq %xmm0, %xmm1
-; X32-NEXT:    psllq $32, %xmm1
-; X32-NEXT:    paddq %xmm2, %xmm1
+; X32-NEXT:    pmuludq %xmm1, %xmm0
+; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; X32-NEXT:    movq %xmm1, (%eax)
+; X32-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; X32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1,1,3]
+; X32-NEXT:    andps %xmm0, %xmm1
 ; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; X32-NEXT:    movq %xmm0, (%eax)
 ; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X32-NEXT:    andps %xmm1, %xmm0
+; X32-NEXT:    orps %xmm1, %xmm0
 ; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
 ; X32-NEXT:    movq %xmm1, (%eax)
 ; X32-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
 ; X32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1,1,3]
-; X32-NEXT:    orps %xmm0, %xmm1
+; X32-NEXT:    xorps %xmm0, %xmm1
 ; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; X32-NEXT:    movq %xmm0, (%eax)
-; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X32-NEXT:    xorps %xmm1, %xmm0
-; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X32-NEXT:    movq %xmm0, (%eax)
 ; X32-NEXT:    emms
 ; X32-NEXT:    retl
 ;
@@ -250,29 +245,24 @@ define void @test1(x86_mmx* %A, x86_mmx* %B) {
 ; X64-NEXT:    movq %xmm0, (%rdi)
 ; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X64-NEXT:    movdqa %xmm1, %xmm2
-; X64-NEXT:    pmuludq %xmm0, %xmm2
-; X64-NEXT:    psrlq $32, %xmm1
-; X64-NEXT:    pmuludq %xmm0, %xmm1
-; X64-NEXT:    psllq $32, %xmm1
-; X64-NEXT:    paddq %xmm2, %xmm1
+; X64-NEXT:    pmuludq %xmm1, %xmm0
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; X64-NEXT:    movq %xmm1, (%rdi)
+; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
+; X64-NEXT:    pand %xmm0, %xmm1
 ; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; X64-NEXT:    movq %xmm0, (%rdi)
 ; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X64-NEXT:    pand %xmm1, %xmm0
+; X64-NEXT:    por %xmm1, %xmm0
 ; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
 ; X64-NEXT:    movq %xmm1, (%rdi)
 ; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
-; X64-NEXT:    por %xmm0, %xmm1
+; X64-NEXT:    pxor %xmm0, %xmm1
 ; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; X64-NEXT:    movq %xmm0, (%rdi)
-; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X64-NEXT:    pxor %xmm1, %xmm0
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-NEXT:    movq %xmm0, (%rdi)
 ; X64-NEXT:    emms
 ; X64-NEXT:    retq
 entry:
diff --git a/test/CodeGen/X86/mulvi32.ll b/test/CodeGen/X86/mulvi32.ll
index 86bd96f88db..6c6737a614b 100644
--- a/test/CodeGen/X86/mulvi32.ll
+++ b/test/CodeGen/X86/mulvi32.ll
@@ -9,28 +9,12 @@
 define <2 x i32> @_mul2xi32a(<2 x i32>, <2 x i32>) {
 ; SSE-LABEL: _mul2xi32a:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    psrlq $32, %xmm2
-; SSE-NEXT:    pmuludq %xmm1, %xmm2
-; SSE-NEXT:    movdqa %xmm1, %xmm3
-; SSE-NEXT:    psrlq $32, %xmm3
-; SSE-NEXT:    pmuludq %xmm0, %xmm3
-; SSE-NEXT:    paddq %xmm2, %xmm3
-; SSE-NEXT:    psllq $32, %xmm3
 ; SSE-NEXT:    pmuludq %xmm1, %xmm0
-; SSE-NEXT:    paddq %xmm3, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: _mul2xi32a:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpsrlq $32, %xmm0, %xmm2
-; AVX-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
-; AVX-NEXT:    vpsrlq $32, %xmm1, %xmm3
-; AVX-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
-; AVX-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
-; AVX-NEXT:    vpsllq $32, %xmm2, %xmm2
 ; AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %r = mul <2 x i32> %0, %1
   ret <2 x i32> %r
-- 
GitLab


From 2674d3c5ccb3b57fac1dd38b396bf9d1245bbf60 Mon Sep 17 00:00:00 2001
From: Dylan McKay <me@dylanmckay.io>
Date: Mon, 5 Nov 2018 05:49:04 +0000
Subject: [PATCH 0951/1116] [AVR] Fix a backend bug that left extraneous
 operands after expansion

This patch fixes a bug in the AVR FRMIDX expansion logic.

The expansion would leave a leftover operand from the original FRMIDX,
but now attached to a MOVWRdRr instruction. The MOVWRdRr instruction
did not expect this operand and so LLVM rejected the machine
instruction.

This would trigger an assertion:

    Assertion failed: ((isImpReg || Op.isRegMask() || MCID->isVariadic() ||
                        OpNo < MCID->getNumOperands() || isMetaDataOp) &&
                        "Trying to add an operand to a machine instr that is already done!"),
    function addOperand, file llvm/lib/CodeGen/MachineInstr.cpp

Tim fixed this so that now the FRMIDX is expanded correctly into
a well-formed MOVWRdRr.

Patch by Tim Neumann

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346117 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AVR/AVRRegisterInfo.cpp   |  1 +
 test/CodeGen/AVR/rust-avr-bug-112.ll | 48 ++++++++++++++++++++++++++++
 2 files changed, 49 insertions(+)
 create mode 100644 test/CodeGen/AVR/rust-avr-bug-112.ll

diff --git a/lib/Target/AVR/AVRRegisterInfo.cpp b/lib/Target/AVR/AVRRegisterInfo.cpp
index d171a620760..808a85e459c 100644
--- a/lib/Target/AVR/AVRRegisterInfo.cpp
+++ b/lib/Target/AVR/AVRRegisterInfo.cpp
@@ -152,6 +152,7 @@ void AVRRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   if (MI.getOpcode() == AVR::FRMIDX) {
     MI.setDesc(TII.get(AVR::MOVWRdRr));
     MI.getOperand(FIOperandNum).ChangeToRegister(AVR::R29R28, false);
+    MI.RemoveOperand(2);
 
     assert(Offset > 0 && "Invalid offset");
 
diff --git a/test/CodeGen/AVR/rust-avr-bug-112.ll b/test/CodeGen/AVR/rust-avr-bug-112.ll
new file mode 100644
index 00000000000..7cf14330cdc
--- /dev/null
+++ b/test/CodeGen/AVR/rust-avr-bug-112.ll
@@ -0,0 +1,48 @@
+; RUN: llc < %s -march=avr | FileCheck %s
+
+; The avr-rust bug can be found here:
+; https://github.com/avr-rust/rust/issues/112
+;
+; In this test, the codegen stage generates a FRMIDX
+; instruction. Later in the pipeline, the frame index
+; gets expanded into a 16-bit MOVWRdRr instruction.
+;
+; There was a bug in the FRMIDX->MOVWRdRr expansion logic
+; that could leave the MOVW instruction with an extraneous
+; operand, left over from the original FRMIDX.
+;
+; This would trigger an assertion:
+;
+;   Assertion failed: ((isImpReg || Op.isRegMask() || MCID->isVariadic() ||
+;                       OpNo < MCID->getNumOperands() || isMetaDataOp) &&
+;                       "Trying to add an operand to a machine instr that is already done!"),
+;   function addOperand, file llvm/lib/CodeGen/MachineInstr.cpp
+;
+; The logic has since been fixed.
+
+; CHECK-LABEL: "core::str::slice_error_fail"
+define void @"core::str::slice_error_fail"(i16 %arg) personality i32 (...) addrspace(1)* @rust_eh_personality {
+start:
+  %char_range = alloca { i16, i16 }, align 1
+  br i1 undef, label %"<core::option::Option<T>>::unwrap.exit.thread", label %bb11.i.i
+
+"<core::option::Option<T>>::unwrap.exit.thread":
+  br label %"core::char::methods::<impl char>::len_utf8.exit"
+
+bb11.i.i:
+  %tmp = bitcast { i16, i16 }* %char_range to i8*
+  %tmp1 = icmp ult i32 undef, 65536
+  %..i = select i1 %tmp1, i16 3, i16 4
+  br label %"core::char::methods::<impl char>::len_utf8.exit"
+
+"core::char::methods::<impl char>::len_utf8.exit":
+  %tmp2 = phi i8* [ %tmp, %bb11.i.i ], [ undef, %"<core::option::Option<T>>::unwrap.exit.thread" ]
+  %_0.0.i12 = phi i16 [ %..i, %bb11.i.i ], [ 1, %"<core::option::Option<T>>::unwrap.exit.thread" ]
+  %tmp3 = add i16 %_0.0.i12, %arg
+  store i16 %tmp3, i16* undef, align 1
+  store i8* %tmp2, i8** undef, align 1
+  unreachable
+}
+
+declare i32 @rust_eh_personality(...) addrspace(1)
+
-- 
GitLab


From 8fe35cc92baff720a82ab7cf3baad92f08ab692d Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Mon, 5 Nov 2018 05:53:03 +0000
Subject: [PATCH 0952/1116] [DAGCombiner] Remove an unused argument from
 tryFoldToZero. NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346118 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 1eac79f28f2..395d855a18f 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -2540,8 +2540,7 @@ SDValue DAGCombiner::visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn,
 // Since it may not be valid to emit a fold to zero for vector initializers
 // check if we can before folding.
 static SDValue tryFoldToZero(const SDLoc &DL, const TargetLowering &TLI, EVT VT,
-                             SelectionDAG &DAG, bool LegalOperations,
-                             bool LegalTypes) {
+                             SelectionDAG &DAG, bool LegalOperations) {
   if (!VT.isVector())
     return DAG.getConstant(0, DL, VT);
   if (!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, VT))
@@ -2568,7 +2567,7 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
   // fold (sub x, x) -> 0
   // FIXME: Refactor this and xor and other similar operations together.
   if (N0 == N1)
-    return tryFoldToZero(DL, TLI, VT, DAG, LegalOperations, LegalTypes);
+    return tryFoldToZero(DL, TLI, VT, DAG, LegalOperations);
   if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
       DAG.isConstantIntBuildVectorOrConstantInt(N1)) {
     // fold (sub c1, c2) -> c1-c2
@@ -6154,7 +6153,7 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
 
   // fold (xor x, x) -> 0
   if (N0 == N1)
-    return tryFoldToZero(SDLoc(N), TLI, VT, DAG, LegalOperations, LegalTypes);
+    return tryFoldToZero(SDLoc(N), TLI, VT, DAG, LegalOperations);
 
   // fold (xor (shl 1, x), -1) -> (rotl ~1, x)
   // Here is a concrete example of this equivalence:
-- 
GitLab


From 29dd59de7e11cf150a5a2870160f686fbec742f5 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Mon, 5 Nov 2018 05:53:06 +0000
Subject: [PATCH 0953/1116] [DAGCombiner] Use tryFoldToZero to simplify some
 code and make it work correctly between LegalTypes and LegalOperations.

The original code avoided creating a zero vector after type legalization, but if we're after type legalization the type we have is legal. The real hazard we need to avoid is creating a build vector after op legalization. tryFoldToZero takes care of checking for this.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346119 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 395d855a18f..fc0e8efebdc 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3839,10 +3839,7 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) {
       // Don't try to fold this node if it requires introducing a
       // build vector of all zeros that might be illegal at this stage.
       if (N->getOpcode() == ISD::XOR && !ShOp.isUndef()) {
-        if (!LegalTypes)
-          ShOp = DAG.getConstant(0, SDLoc(N), VT);
-        else
-          ShOp = SDValue();
+        ShOp = tryFoldToZero(SDLoc(N), TLI, VT, DAG, LegalOperations);
       }
 
       // (AND (shuf (A, C), shuf (B, C))) -> shuf (AND (A, B), C)
@@ -3860,10 +3857,7 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) {
       // build vector of all zeros that might be illegal at this stage.
       ShOp = N0->getOperand(0);
       if (N->getOpcode() == ISD::XOR && !ShOp.isUndef()) {
-        if (!LegalTypes)
-          ShOp = DAG.getConstant(0, SDLoc(N), VT);
-        else
-          ShOp = SDValue();
+        ShOp = tryFoldToZero(SDLoc(N), TLI, VT, DAG, LegalOperations);
       }
 
       // (AND (shuf (C, A), shuf (C, B))) -> shuf (C, AND (A, B))
-- 
GitLab


From 98b26220ebd17bac45eb0e62d6784c8d7a333eba Mon Sep 17 00:00:00 2001
From: Roman Lebedev <lebedev.ri@gmail.com>
Date: Mon, 5 Nov 2018 09:20:08 +0000
Subject: [PATCH 0954/1116] [NFC][x86][AArch64] extract-bits.ll: add test with
 'ashr'.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346121 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/AArch64/extract-bits.ll |  32 +
 test/CodeGen/X86/extract-bits.ll     | 885 +++++++++++++++++----------
 2 files changed, 581 insertions(+), 336 deletions(-)

diff --git a/test/CodeGen/AArch64/extract-bits.ll b/test/CodeGen/AArch64/extract-bits.ll
index 21bebc67969..5dbb71939bb 100644
--- a/test/CodeGen/AArch64/extract-bits.ll
+++ b/test/CodeGen/AArch64/extract-bits.ll
@@ -34,6 +34,22 @@ define i32 @bextr32_a0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
   ret i32 %masked
 }
 
+define i32 @bextr32_a0_arithmetic(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
+; CHECK-LABEL: bextr32_a0_arithmetic:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    orr w9, wzr, #0x1
+; CHECK-NEXT:    lsl w9, w9, w2
+; CHECK-NEXT:    asr w8, w0, w1
+; CHECK-NEXT:    sub w9, w9, #1 // =1
+; CHECK-NEXT:    and w0, w9, w8
+; CHECK-NEXT:    ret
+  %shifted = ashr i32 %val, %numskipbits
+  %onebit = shl i32 1, %numlowbits
+  %mask = add nsw i32 %onebit, -1
+  %masked = and i32 %mask, %shifted
+  ret i32 %masked
+}
+
 define i32 @bextr32_a1_indexzext(i32 %val, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
 ; CHECK-LABEL: bextr32_a1_indexzext:
 ; CHECK:       // %bb.0:
@@ -124,6 +140,22 @@ define i64 @bextr64_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
   ret i64 %masked
 }
 
+define i64 @bextr64_a0_arithmetic(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
+; CHECK-LABEL: bextr64_a0_arithmetic:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    orr w9, wzr, #0x1
+; CHECK-NEXT:    lsl x9, x9, x2
+; CHECK-NEXT:    asr x8, x0, x1
+; CHECK-NEXT:    sub x9, x9, #1 // =1
+; CHECK-NEXT:    and x0, x9, x8
+; CHECK-NEXT:    ret
+  %shifted = ashr i64 %val, %numskipbits
+  %onebit = shl i64 1, %numlowbits
+  %mask = add nsw i64 %onebit, -1
+  %masked = and i64 %mask, %shifted
+  ret i64 %masked
+}
+
 define i64 @bextr64_a1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_a1_indexzext:
 ; CHECK:       // %bb.0:
diff --git a/test/CodeGen/X86/extract-bits.ll b/test/CodeGen/X86/extract-bits.ll
index 4cf3077b6d1..cfe6ba571df 100644
--- a/test/CodeGen/X86/extract-bits.ll
+++ b/test/CodeGen/X86/extract-bits.ll
@@ -97,6 +97,73 @@ define i32 @bextr32_a0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
   ret i32 %masked
 }
 
+define i32 @bextr32_a0_arithmetic(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
+; X86-NOBMI-LABEL: bextr32_a0_arithmetic:
+; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT:    sarl %cl, %esi
+; X86-NOBMI-NEXT:    movl $1, %eax
+; X86-NOBMI-NEXT:    movl %edx, %ecx
+; X86-NOBMI-NEXT:    shll %cl, %eax
+; X86-NOBMI-NEXT:    decl %eax
+; X86-NOBMI-NEXT:    andl %esi, %eax
+; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    retl
+;
+; X86-BMI1NOTBM-LABEL: bextr32_a0_arithmetic:
+; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    sarl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
+; X86-BMI1NOTBM-NEXT:    retl
+;
+; X86-BMI1BMI2-LABEL: bextr32_a0_arithmetic:
+; X86-BMI1BMI2:       # %bb.0:
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1BMI2-NEXT:    sarxl %ecx, {{[0-9]+}}(%esp), %ecx
+; X86-BMI1BMI2-NEXT:    bzhil %eax, %ecx, %eax
+; X86-BMI1BMI2-NEXT:    retl
+;
+; X64-NOBMI-LABEL: bextr32_a0_arithmetic:
+; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    movl %esi, %ecx
+; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NOBMI-NEXT:    sarl %cl, %edi
+; X64-NOBMI-NEXT:    movl $1, %eax
+; X64-NOBMI-NEXT:    movl %edx, %ecx
+; X64-NOBMI-NEXT:    shll %cl, %eax
+; X64-NOBMI-NEXT:    decl %eax
+; X64-NOBMI-NEXT:    andl %edi, %eax
+; X64-NOBMI-NEXT:    retq
+;
+; X64-BMI1NOTBM-LABEL: bextr32_a0_arithmetic:
+; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
+; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-BMI1NOTBM-NEXT:    sarl %cl, %edi
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %eax
+; X64-BMI1NOTBM-NEXT:    retq
+;
+; X64-BMI1BMI2-LABEL: bextr32_a0_arithmetic:
+; X64-BMI1BMI2:       # %bb.0:
+; X64-BMI1BMI2-NEXT:    sarxl %esi, %edi, %eax
+; X64-BMI1BMI2-NEXT:    bzhil %edx, %eax, %eax
+; X64-BMI1BMI2-NEXT:    retq
+  %shifted = ashr i32 %val, %numskipbits
+  %onebit = shl i32 1, %numlowbits
+  %mask = add nsw i32 %onebit, -1
+  %masked = and i32 %mask, %shifted
+  ret i32 %masked
+}
+
 define i32 @bextr32_a1_indexzext(i32 %val, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr32_a1_indexzext:
 ; X86-NOBMI:       # %bb.0:
@@ -499,22 +566,22 @@ define i64 @bextr64_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB6_2
+; X86-NOBMI-NEXT:    je .LBB7_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
-; X86-NOBMI-NEXT:  .LBB6_2:
+; X86-NOBMI-NEXT:  .LBB7_2:
 ; X86-NOBMI-NEXT:    movl $1, %eax
 ; X86-NOBMI-NEXT:    xorl %edx, %edx
 ; X86-NOBMI-NEXT:    movb %ch, %cl
 ; X86-NOBMI-NEXT:    shldl %cl, %eax, %edx
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    testb $32, %ch
-; X86-NOBMI-NEXT:    je .LBB6_4
+; X86-NOBMI-NEXT:    je .LBB7_4
 ; X86-NOBMI-NEXT:  # %bb.3:
 ; X86-NOBMI-NEXT:    movl %eax, %edx
 ; X86-NOBMI-NEXT:    xorl %eax, %eax
-; X86-NOBMI-NEXT:  .LBB6_4:
+; X86-NOBMI-NEXT:  .LBB7_4:
 ; X86-NOBMI-NEXT:    addl $-1, %eax
 ; X86-NOBMI-NEXT:    adcl $-1, %edx
 ; X86-NOBMI-NEXT:    andl %esi, %eax
@@ -535,22 +602,22 @@ define i64 @bextr64_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB6_2
+; X86-BMI1NOTBM-NEXT:    je .LBB7_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
 ; X86-BMI1NOTBM-NEXT:    xorl %edi, %edi
-; X86-BMI1NOTBM-NEXT:  .LBB6_2:
+; X86-BMI1NOTBM-NEXT:  .LBB7_2:
 ; X86-BMI1NOTBM-NEXT:    movl $1, %eax
 ; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
 ; X86-BMI1NOTBM-NEXT:    movb %ch, %cl
 ; X86-BMI1NOTBM-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
 ; X86-BMI1NOTBM-NEXT:    testb $32, %ch
-; X86-BMI1NOTBM-NEXT:    je .LBB6_4
+; X86-BMI1NOTBM-NEXT:    je .LBB7_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %edx
 ; X86-BMI1NOTBM-NEXT:    xorl %eax, %eax
-; X86-BMI1NOTBM-NEXT:  .LBB6_4:
+; X86-BMI1NOTBM-NEXT:  .LBB7_4:
 ; X86-BMI1NOTBM-NEXT:    addl $-1, %eax
 ; X86-BMI1NOTBM-NEXT:    adcl $-1, %edx
 ; X86-BMI1NOTBM-NEXT:    andl %esi, %eax
@@ -571,22 +638,22 @@ define i64 @bextr64_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %esi
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %edi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB6_2
+; X86-BMI1BMI2-NEXT:    je .LBB7_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
 ; X86-BMI1BMI2-NEXT:    movl %edi, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
-; X86-BMI1BMI2-NEXT:  .LBB6_2:
+; X86-BMI1BMI2-NEXT:  .LBB7_2:
 ; X86-BMI1BMI2-NEXT:    movl $1, %eax
 ; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
 ; X86-BMI1BMI2-NEXT:    movl %ebx, %ecx
 ; X86-BMI1BMI2-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI1BMI2-NEXT:    shlxl %ebx, %eax, %eax
 ; X86-BMI1BMI2-NEXT:    testb $32, %bl
-; X86-BMI1BMI2-NEXT:    je .LBB6_4
+; X86-BMI1BMI2-NEXT:    je .LBB7_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
 ; X86-BMI1BMI2-NEXT:    movl %eax, %edx
 ; X86-BMI1BMI2-NEXT:    xorl %eax, %eax
-; X86-BMI1BMI2-NEXT:  .LBB6_4:
+; X86-BMI1BMI2-NEXT:  .LBB7_4:
 ; X86-BMI1BMI2-NEXT:    addl $-1, %eax
 ; X86-BMI1BMI2-NEXT:    adcl $-1, %edx
 ; X86-BMI1BMI2-NEXT:    andl %esi, %eax
@@ -629,6 +696,152 @@ define i64 @bextr64_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
   ret i64 %masked
 }
 
+define i64 @bextr64_a0_arithmetic(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
+; X86-NOBMI-LABEL: bextr64_a0_arithmetic:
+; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %edi
+; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    movl %eax, %esi
+; X86-NOBMI-NEXT:    sarl %cl, %esi
+; X86-NOBMI-NEXT:    shrdl %cl, %eax, %edi
+; X86-NOBMI-NEXT:    testb $32, %cl
+; X86-NOBMI-NEXT:    je .LBB8_2
+; X86-NOBMI-NEXT:  # %bb.1:
+; X86-NOBMI-NEXT:    sarl $31, %eax
+; X86-NOBMI-NEXT:    movl %esi, %edi
+; X86-NOBMI-NEXT:    movl %eax, %esi
+; X86-NOBMI-NEXT:  .LBB8_2:
+; X86-NOBMI-NEXT:    movl $1, %eax
+; X86-NOBMI-NEXT:    xorl %edx, %edx
+; X86-NOBMI-NEXT:    movb %ch, %cl
+; X86-NOBMI-NEXT:    shldl %cl, %eax, %edx
+; X86-NOBMI-NEXT:    shll %cl, %eax
+; X86-NOBMI-NEXT:    testb $32, %ch
+; X86-NOBMI-NEXT:    je .LBB8_4
+; X86-NOBMI-NEXT:  # %bb.3:
+; X86-NOBMI-NEXT:    movl %eax, %edx
+; X86-NOBMI-NEXT:    xorl %eax, %eax
+; X86-NOBMI-NEXT:  .LBB8_4:
+; X86-NOBMI-NEXT:    addl $-1, %eax
+; X86-NOBMI-NEXT:    adcl $-1, %edx
+; X86-NOBMI-NEXT:    andl %edi, %eax
+; X86-NOBMI-NEXT:    andl %esi, %edx
+; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    popl %edi
+; X86-NOBMI-NEXT:    retl
+;
+; X86-BMI1NOTBM-LABEL: bextr64_a0_arithmetic:
+; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    pushl %edi
+; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    movl %eax, %esi
+; X86-BMI1NOTBM-NEXT:    sarl %cl, %esi
+; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %edi
+; X86-BMI1NOTBM-NEXT:    testb $32, %cl
+; X86-BMI1NOTBM-NEXT:    je .LBB8_2
+; X86-BMI1NOTBM-NEXT:  # %bb.1:
+; X86-BMI1NOTBM-NEXT:    sarl $31, %eax
+; X86-BMI1NOTBM-NEXT:    movl %esi, %edi
+; X86-BMI1NOTBM-NEXT:    movl %eax, %esi
+; X86-BMI1NOTBM-NEXT:  .LBB8_2:
+; X86-BMI1NOTBM-NEXT:    movl $1, %eax
+; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
+; X86-BMI1NOTBM-NEXT:    movb %ch, %cl
+; X86-BMI1NOTBM-NEXT:    shldl %cl, %eax, %edx
+; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X86-BMI1NOTBM-NEXT:    testb $32, %ch
+; X86-BMI1NOTBM-NEXT:    je .LBB8_4
+; X86-BMI1NOTBM-NEXT:  # %bb.3:
+; X86-BMI1NOTBM-NEXT:    movl %eax, %edx
+; X86-BMI1NOTBM-NEXT:    xorl %eax, %eax
+; X86-BMI1NOTBM-NEXT:  .LBB8_4:
+; X86-BMI1NOTBM-NEXT:    addl $-1, %eax
+; X86-BMI1NOTBM-NEXT:    adcl $-1, %edx
+; X86-BMI1NOTBM-NEXT:    andl %edi, %eax
+; X86-BMI1NOTBM-NEXT:    andl %esi, %edx
+; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    popl %edi
+; X86-BMI1NOTBM-NEXT:    retl
+;
+; X86-BMI1BMI2-LABEL: bextr64_a0_arithmetic:
+; X86-BMI1BMI2:       # %bb.0:
+; X86-BMI1BMI2-NEXT:    pushl %ebx
+; X86-BMI1BMI2-NEXT:    pushl %edi
+; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %bl
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %esi
+; X86-BMI1BMI2-NEXT:    sarxl %ecx, %eax, %edi
+; X86-BMI1BMI2-NEXT:    testb $32, %cl
+; X86-BMI1BMI2-NEXT:    je .LBB8_2
+; X86-BMI1BMI2-NEXT:  # %bb.1:
+; X86-BMI1BMI2-NEXT:    sarl $31, %eax
+; X86-BMI1BMI2-NEXT:    movl %edi, %esi
+; X86-BMI1BMI2-NEXT:    movl %eax, %edi
+; X86-BMI1BMI2-NEXT:  .LBB8_2:
+; X86-BMI1BMI2-NEXT:    movl $1, %eax
+; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI1BMI2-NEXT:    movl %ebx, %ecx
+; X86-BMI1BMI2-NEXT:    shldl %cl, %eax, %edx
+; X86-BMI1BMI2-NEXT:    shlxl %ebx, %eax, %eax
+; X86-BMI1BMI2-NEXT:    testb $32, %bl
+; X86-BMI1BMI2-NEXT:    je .LBB8_4
+; X86-BMI1BMI2-NEXT:  # %bb.3:
+; X86-BMI1BMI2-NEXT:    movl %eax, %edx
+; X86-BMI1BMI2-NEXT:    xorl %eax, %eax
+; X86-BMI1BMI2-NEXT:  .LBB8_4:
+; X86-BMI1BMI2-NEXT:    addl $-1, %eax
+; X86-BMI1BMI2-NEXT:    adcl $-1, %edx
+; X86-BMI1BMI2-NEXT:    andl %esi, %eax
+; X86-BMI1BMI2-NEXT:    andl %edi, %edx
+; X86-BMI1BMI2-NEXT:    popl %esi
+; X86-BMI1BMI2-NEXT:    popl %edi
+; X86-BMI1BMI2-NEXT:    popl %ebx
+; X86-BMI1BMI2-NEXT:    retl
+;
+; X64-NOBMI-LABEL: bextr64_a0_arithmetic:
+; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    movq %rsi, %rcx
+; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-NOBMI-NEXT:    sarq %cl, %rdi
+; X64-NOBMI-NEXT:    movl $1, %eax
+; X64-NOBMI-NEXT:    movl %edx, %ecx
+; X64-NOBMI-NEXT:    shlq %cl, %rax
+; X64-NOBMI-NEXT:    decq %rax
+; X64-NOBMI-NEXT:    andq %rdi, %rax
+; X64-NOBMI-NEXT:    retq
+;
+; X64-BMI1NOTBM-LABEL: bextr64_a0_arithmetic:
+; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
+; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-BMI1NOTBM-NEXT:    sarq %cl, %rdi
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    retq
+;
+; X64-BMI1BMI2-LABEL: bextr64_a0_arithmetic:
+; X64-BMI1BMI2:       # %bb.0:
+; X64-BMI1BMI2-NEXT:    sarxq %rsi, %rdi, %rax
+; X64-BMI1BMI2-NEXT:    bzhiq %rdx, %rax, %rax
+; X64-BMI1BMI2-NEXT:    retq
+  %shifted = ashr i64 %val, %numskipbits
+  %onebit = shl i64 1, %numlowbits
+  %mask = add nsw i64 %onebit, -1
+  %masked = and i64 %mask, %shifted
+  ret i64 %masked
+}
+
 define i64 @bextr64_a1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr64_a1_indexzext:
 ; X86-NOBMI:       # %bb.0:
@@ -642,22 +855,22 @@ define i64 @bextr64_a1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB7_2
+; X86-NOBMI-NEXT:    je .LBB9_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
-; X86-NOBMI-NEXT:  .LBB7_2:
+; X86-NOBMI-NEXT:  .LBB9_2:
 ; X86-NOBMI-NEXT:    movl $1, %eax
 ; X86-NOBMI-NEXT:    xorl %edx, %edx
 ; X86-NOBMI-NEXT:    movb %ch, %cl
 ; X86-NOBMI-NEXT:    shldl %cl, %eax, %edx
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    testb $32, %ch
-; X86-NOBMI-NEXT:    je .LBB7_4
+; X86-NOBMI-NEXT:    je .LBB9_4
 ; X86-NOBMI-NEXT:  # %bb.3:
 ; X86-NOBMI-NEXT:    movl %eax, %edx
 ; X86-NOBMI-NEXT:    xorl %eax, %eax
-; X86-NOBMI-NEXT:  .LBB7_4:
+; X86-NOBMI-NEXT:  .LBB9_4:
 ; X86-NOBMI-NEXT:    addl $-1, %eax
 ; X86-NOBMI-NEXT:    adcl $-1, %edx
 ; X86-NOBMI-NEXT:    andl %esi, %eax
@@ -678,22 +891,22 @@ define i64 @bextr64_a1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB7_2
+; X86-BMI1NOTBM-NEXT:    je .LBB9_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
 ; X86-BMI1NOTBM-NEXT:    xorl %edi, %edi
-; X86-BMI1NOTBM-NEXT:  .LBB7_2:
+; X86-BMI1NOTBM-NEXT:  .LBB9_2:
 ; X86-BMI1NOTBM-NEXT:    movl $1, %eax
 ; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
 ; X86-BMI1NOTBM-NEXT:    movb %ch, %cl
 ; X86-BMI1NOTBM-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
 ; X86-BMI1NOTBM-NEXT:    testb $32, %ch
-; X86-BMI1NOTBM-NEXT:    je .LBB7_4
+; X86-BMI1NOTBM-NEXT:    je .LBB9_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %edx
 ; X86-BMI1NOTBM-NEXT:    xorl %eax, %eax
-; X86-BMI1NOTBM-NEXT:  .LBB7_4:
+; X86-BMI1NOTBM-NEXT:  .LBB9_4:
 ; X86-BMI1NOTBM-NEXT:    addl $-1, %eax
 ; X86-BMI1NOTBM-NEXT:    adcl $-1, %edx
 ; X86-BMI1NOTBM-NEXT:    andl %esi, %eax
@@ -714,22 +927,22 @@ define i64 @bextr64_a1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %esi
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %edi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB7_2
+; X86-BMI1BMI2-NEXT:    je .LBB9_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
 ; X86-BMI1BMI2-NEXT:    movl %edi, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
-; X86-BMI1BMI2-NEXT:  .LBB7_2:
+; X86-BMI1BMI2-NEXT:  .LBB9_2:
 ; X86-BMI1BMI2-NEXT:    movl $1, %eax
 ; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
 ; X86-BMI1BMI2-NEXT:    movl %ebx, %ecx
 ; X86-BMI1BMI2-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI1BMI2-NEXT:    shlxl %ebx, %eax, %eax
 ; X86-BMI1BMI2-NEXT:    testb $32, %bl
-; X86-BMI1BMI2-NEXT:    je .LBB7_4
+; X86-BMI1BMI2-NEXT:    je .LBB9_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
 ; X86-BMI1BMI2-NEXT:    movl %eax, %edx
 ; X86-BMI1BMI2-NEXT:    xorl %eax, %eax
-; X86-BMI1BMI2-NEXT:  .LBB7_4:
+; X86-BMI1BMI2-NEXT:  .LBB9_4:
 ; X86-BMI1BMI2-NEXT:    addl $-1, %eax
 ; X86-BMI1BMI2-NEXT:    adcl $-1, %edx
 ; X86-BMI1BMI2-NEXT:    andl %esi, %eax
@@ -791,22 +1004,22 @@ define i64 @bextr64_a2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB8_2
+; X86-NOBMI-NEXT:    je .LBB10_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
-; X86-NOBMI-NEXT:  .LBB8_2:
+; X86-NOBMI-NEXT:  .LBB10_2:
 ; X86-NOBMI-NEXT:    movl $1, %eax
 ; X86-NOBMI-NEXT:    xorl %edx, %edx
 ; X86-NOBMI-NEXT:    movb %ch, %cl
 ; X86-NOBMI-NEXT:    shldl %cl, %eax, %edx
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    testb $32, %ch
-; X86-NOBMI-NEXT:    je .LBB8_4
+; X86-NOBMI-NEXT:    je .LBB10_4
 ; X86-NOBMI-NEXT:  # %bb.3:
 ; X86-NOBMI-NEXT:    movl %eax, %edx
 ; X86-NOBMI-NEXT:    xorl %eax, %eax
-; X86-NOBMI-NEXT:  .LBB8_4:
+; X86-NOBMI-NEXT:  .LBB10_4:
 ; X86-NOBMI-NEXT:    addl $-1, %eax
 ; X86-NOBMI-NEXT:    adcl $-1, %edx
 ; X86-NOBMI-NEXT:    andl %esi, %eax
@@ -828,22 +1041,22 @@ define i64 @bextr64_a2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB8_2
+; X86-BMI1NOTBM-NEXT:    je .LBB10_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
 ; X86-BMI1NOTBM-NEXT:    xorl %edi, %edi
-; X86-BMI1NOTBM-NEXT:  .LBB8_2:
+; X86-BMI1NOTBM-NEXT:  .LBB10_2:
 ; X86-BMI1NOTBM-NEXT:    movl $1, %eax
 ; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
 ; X86-BMI1NOTBM-NEXT:    movb %ch, %cl
 ; X86-BMI1NOTBM-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
 ; X86-BMI1NOTBM-NEXT:    testb $32, %ch
-; X86-BMI1NOTBM-NEXT:    je .LBB8_4
+; X86-BMI1NOTBM-NEXT:    je .LBB10_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %edx
 ; X86-BMI1NOTBM-NEXT:    xorl %eax, %eax
-; X86-BMI1NOTBM-NEXT:  .LBB8_4:
+; X86-BMI1NOTBM-NEXT:  .LBB10_4:
 ; X86-BMI1NOTBM-NEXT:    addl $-1, %eax
 ; X86-BMI1NOTBM-NEXT:    adcl $-1, %edx
 ; X86-BMI1NOTBM-NEXT:    andl %esi, %eax
@@ -865,22 +1078,22 @@ define i64 @bextr64_a2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %edi
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %esi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB8_2
+; X86-BMI1BMI2-NEXT:    je .LBB10_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
 ; X86-BMI1BMI2-NEXT:    movl %edi, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
-; X86-BMI1BMI2-NEXT:  .LBB8_2:
+; X86-BMI1BMI2-NEXT:  .LBB10_2:
 ; X86-BMI1BMI2-NEXT:    movl $1, %eax
 ; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
 ; X86-BMI1BMI2-NEXT:    movl %ebx, %ecx
 ; X86-BMI1BMI2-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI1BMI2-NEXT:    shlxl %ebx, %eax, %eax
 ; X86-BMI1BMI2-NEXT:    testb $32, %bl
-; X86-BMI1BMI2-NEXT:    je .LBB8_4
+; X86-BMI1BMI2-NEXT:    je .LBB10_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
 ; X86-BMI1BMI2-NEXT:    movl %eax, %edx
 ; X86-BMI1BMI2-NEXT:    xorl %eax, %eax
-; X86-BMI1BMI2-NEXT:  .LBB8_4:
+; X86-BMI1BMI2-NEXT:  .LBB10_4:
 ; X86-BMI1BMI2-NEXT:    addl $-1, %eax
 ; X86-BMI1BMI2-NEXT:    adcl $-1, %edx
 ; X86-BMI1BMI2-NEXT:    andl %esi, %eax
@@ -940,22 +1153,22 @@ define i64 @bextr64_a3_load_indexzext(i64* %w, i8 zeroext %numskipbits, i8 zeroe
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB9_2
+; X86-NOBMI-NEXT:    je .LBB11_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
-; X86-NOBMI-NEXT:  .LBB9_2:
+; X86-NOBMI-NEXT:  .LBB11_2:
 ; X86-NOBMI-NEXT:    movl $1, %eax
 ; X86-NOBMI-NEXT:    xorl %edx, %edx
 ; X86-NOBMI-NEXT:    movb %ch, %cl
 ; X86-NOBMI-NEXT:    shldl %cl, %eax, %edx
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    testb $32, %ch
-; X86-NOBMI-NEXT:    je .LBB9_4
+; X86-NOBMI-NEXT:    je .LBB11_4
 ; X86-NOBMI-NEXT:  # %bb.3:
 ; X86-NOBMI-NEXT:    movl %eax, %edx
 ; X86-NOBMI-NEXT:    xorl %eax, %eax
-; X86-NOBMI-NEXT:  .LBB9_4:
+; X86-NOBMI-NEXT:  .LBB11_4:
 ; X86-NOBMI-NEXT:    addl $-1, %eax
 ; X86-NOBMI-NEXT:    adcl $-1, %edx
 ; X86-NOBMI-NEXT:    andl %esi, %eax
@@ -977,22 +1190,22 @@ define i64 @bextr64_a3_load_indexzext(i64* %w, i8 zeroext %numskipbits, i8 zeroe
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB9_2
+; X86-BMI1NOTBM-NEXT:    je .LBB11_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
 ; X86-BMI1NOTBM-NEXT:    xorl %edi, %edi
-; X86-BMI1NOTBM-NEXT:  .LBB9_2:
+; X86-BMI1NOTBM-NEXT:  .LBB11_2:
 ; X86-BMI1NOTBM-NEXT:    movl $1, %eax
 ; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
 ; X86-BMI1NOTBM-NEXT:    movb %ch, %cl
 ; X86-BMI1NOTBM-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
 ; X86-BMI1NOTBM-NEXT:    testb $32, %ch
-; X86-BMI1NOTBM-NEXT:    je .LBB9_4
+; X86-BMI1NOTBM-NEXT:    je .LBB11_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %edx
 ; X86-BMI1NOTBM-NEXT:    xorl %eax, %eax
-; X86-BMI1NOTBM-NEXT:  .LBB9_4:
+; X86-BMI1NOTBM-NEXT:  .LBB11_4:
 ; X86-BMI1NOTBM-NEXT:    addl $-1, %eax
 ; X86-BMI1NOTBM-NEXT:    adcl $-1, %edx
 ; X86-BMI1NOTBM-NEXT:    andl %esi, %eax
@@ -1014,22 +1227,22 @@ define i64 @bextr64_a3_load_indexzext(i64* %w, i8 zeroext %numskipbits, i8 zeroe
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %edi
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %esi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB9_2
+; X86-BMI1BMI2-NEXT:    je .LBB11_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
 ; X86-BMI1BMI2-NEXT:    movl %edi, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
-; X86-BMI1BMI2-NEXT:  .LBB9_2:
+; X86-BMI1BMI2-NEXT:  .LBB11_2:
 ; X86-BMI1BMI2-NEXT:    movl $1, %eax
 ; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
 ; X86-BMI1BMI2-NEXT:    movl %ebx, %ecx
 ; X86-BMI1BMI2-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI1BMI2-NEXT:    shlxl %ebx, %eax, %eax
 ; X86-BMI1BMI2-NEXT:    testb $32, %bl
-; X86-BMI1BMI2-NEXT:    je .LBB9_4
+; X86-BMI1BMI2-NEXT:    je .LBB11_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
 ; X86-BMI1BMI2-NEXT:    movl %eax, %edx
 ; X86-BMI1BMI2-NEXT:    xorl %eax, %eax
-; X86-BMI1BMI2-NEXT:  .LBB9_4:
+; X86-BMI1BMI2-NEXT:  .LBB11_4:
 ; X86-BMI1BMI2-NEXT:    addl $-1, %eax
 ; X86-BMI1BMI2-NEXT:    adcl $-1, %edx
 ; X86-BMI1BMI2-NEXT:    andl %esi, %eax
@@ -1093,22 +1306,22 @@ define i64 @bextr64_a4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; X86-NOBMI-NEXT:    shrl %cl, %edx
 ; X86-NOBMI-NEXT:    shrdl %cl, %esi, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB10_2
+; X86-NOBMI-NEXT:    je .LBB12_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %edx, %eax
 ; X86-NOBMI-NEXT:    xorl %edx, %edx
-; X86-NOBMI-NEXT:  .LBB10_2:
+; X86-NOBMI-NEXT:  .LBB12_2:
 ; X86-NOBMI-NEXT:    movl $1, %esi
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
 ; X86-NOBMI-NEXT:    movb %ch, %cl
 ; X86-NOBMI-NEXT:    shldl %cl, %esi, %edi
 ; X86-NOBMI-NEXT:    shll %cl, %esi
 ; X86-NOBMI-NEXT:    testb $32, %ch
-; X86-NOBMI-NEXT:    je .LBB10_4
+; X86-NOBMI-NEXT:    je .LBB12_4
 ; X86-NOBMI-NEXT:  # %bb.3:
 ; X86-NOBMI-NEXT:    movl %esi, %edi
 ; X86-NOBMI-NEXT:    xorl %esi, %esi
-; X86-NOBMI-NEXT:  .LBB10_4:
+; X86-NOBMI-NEXT:  .LBB12_4:
 ; X86-NOBMI-NEXT:    addl $-1, %esi
 ; X86-NOBMI-NEXT:    adcl $-1, %edi
 ; X86-NOBMI-NEXT:    andl %esi, %eax
@@ -1129,22 +1342,22 @@ define i64 @bextr64_a4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %esi, %eax
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB10_2
+; X86-BMI1NOTBM-NEXT:    je .LBB12_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
-; X86-BMI1NOTBM-NEXT:  .LBB10_2:
+; X86-BMI1NOTBM-NEXT:  .LBB12_2:
 ; X86-BMI1NOTBM-NEXT:    movl $1, %esi
 ; X86-BMI1NOTBM-NEXT:    xorl %edi, %edi
 ; X86-BMI1NOTBM-NEXT:    movb %ch, %cl
 ; X86-BMI1NOTBM-NEXT:    shldl %cl, %esi, %edi
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %ch
-; X86-BMI1NOTBM-NEXT:    je .LBB10_4
+; X86-BMI1NOTBM-NEXT:    je .LBB12_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
 ; X86-BMI1NOTBM-NEXT:    movl %esi, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %esi, %esi
-; X86-BMI1NOTBM-NEXT:  .LBB10_4:
+; X86-BMI1NOTBM-NEXT:  .LBB12_4:
 ; X86-BMI1NOTBM-NEXT:    addl $-1, %esi
 ; X86-BMI1NOTBM-NEXT:    adcl $-1, %edi
 ; X86-BMI1NOTBM-NEXT:    andl %esi, %eax
@@ -1165,22 +1378,22 @@ define i64 @bextr64_a4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %edx, %eax
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %edx, %edx
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB10_2
+; X86-BMI1BMI2-NEXT:    je .LBB12_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
 ; X86-BMI1BMI2-NEXT:    movl %edx, %eax
 ; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
-; X86-BMI1BMI2-NEXT:  .LBB10_2:
+; X86-BMI1BMI2-NEXT:  .LBB12_2:
 ; X86-BMI1BMI2-NEXT:    movl $1, %edi
 ; X86-BMI1BMI2-NEXT:    xorl %esi, %esi
 ; X86-BMI1BMI2-NEXT:    movl %ebx, %ecx
 ; X86-BMI1BMI2-NEXT:    shldl %cl, %edi, %esi
 ; X86-BMI1BMI2-NEXT:    shlxl %ebx, %edi, %ecx
 ; X86-BMI1BMI2-NEXT:    testb $32, %bl
-; X86-BMI1BMI2-NEXT:    je .LBB10_4
+; X86-BMI1BMI2-NEXT:    je .LBB12_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
 ; X86-BMI1BMI2-NEXT:    movl %ecx, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %ecx, %ecx
-; X86-BMI1BMI2-NEXT:  .LBB10_4:
+; X86-BMI1BMI2-NEXT:  .LBB12_4:
 ; X86-BMI1BMI2-NEXT:    addl $-1, %ecx
 ; X86-BMI1BMI2-NEXT:    adcl $-1, %esi
 ; X86-BMI1BMI2-NEXT:    andl %ecx, %eax
@@ -1240,22 +1453,22 @@ define i64 @bextr64_a5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-NOBMI-NEXT:    shrl %cl, %ebp
 ; X86-NOBMI-NEXT:    shrdl %cl, %esi, %ebx
 ; X86-NOBMI-NEXT:    testb $32, %al
-; X86-NOBMI-NEXT:    je .LBB11_2
+; X86-NOBMI-NEXT:    je .LBB13_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %ebp, %ebx
 ; X86-NOBMI-NEXT:    xorl %ebp, %ebp
-; X86-NOBMI-NEXT:  .LBB11_2:
+; X86-NOBMI-NEXT:  .LBB13_2:
 ; X86-NOBMI-NEXT:    movl $1, %esi
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
 ; X86-NOBMI-NEXT:    movl %edx, %ecx
 ; X86-NOBMI-NEXT:    shldl %cl, %esi, %edi
 ; X86-NOBMI-NEXT:    shll %cl, %esi
 ; X86-NOBMI-NEXT:    testb $32, %dl
-; X86-NOBMI-NEXT:    je .LBB11_4
+; X86-NOBMI-NEXT:    je .LBB13_4
 ; X86-NOBMI-NEXT:  # %bb.3:
 ; X86-NOBMI-NEXT:    movl %esi, %edi
 ; X86-NOBMI-NEXT:    xorl %esi, %esi
-; X86-NOBMI-NEXT:  .LBB11_4:
+; X86-NOBMI-NEXT:  .LBB13_4:
 ; X86-NOBMI-NEXT:    addl $-1, %esi
 ; X86-NOBMI-NEXT:    adcl $-1, %edi
 ; X86-NOBMI-NEXT:    andl %ebx, %esi
@@ -1290,22 +1503,22 @@ define i64 @bextr64_a5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %ebp
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %esi, %ebx
 ; X86-BMI1NOTBM-NEXT:    testb $32, %al
-; X86-BMI1NOTBM-NEXT:    je .LBB11_2
+; X86-BMI1NOTBM-NEXT:    je .LBB13_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %ebp, %ebx
 ; X86-BMI1NOTBM-NEXT:    xorl %ebp, %ebp
-; X86-BMI1NOTBM-NEXT:  .LBB11_2:
+; X86-BMI1NOTBM-NEXT:  .LBB13_2:
 ; X86-BMI1NOTBM-NEXT:    movl $1, %esi
 ; X86-BMI1NOTBM-NEXT:    xorl %edi, %edi
 ; X86-BMI1NOTBM-NEXT:    movl %edx, %ecx
 ; X86-BMI1NOTBM-NEXT:    shldl %cl, %esi, %edi
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %dl
-; X86-BMI1NOTBM-NEXT:    je .LBB11_4
+; X86-BMI1NOTBM-NEXT:    je .LBB13_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
 ; X86-BMI1NOTBM-NEXT:    movl %esi, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %esi, %esi
-; X86-BMI1NOTBM-NEXT:  .LBB11_4:
+; X86-BMI1NOTBM-NEXT:  .LBB13_4:
 ; X86-BMI1NOTBM-NEXT:    addl $-1, %esi
 ; X86-BMI1NOTBM-NEXT:    adcl $-1, %edi
 ; X86-BMI1NOTBM-NEXT:    andl %ebx, %esi
@@ -1339,22 +1552,22 @@ define i64 @bextr64_a5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %esi, %ebx
 ; X86-BMI1BMI2-NEXT:    shrxl %eax, %esi, %ebp
 ; X86-BMI1BMI2-NEXT:    testb $32, %al
-; X86-BMI1BMI2-NEXT:    je .LBB11_2
+; X86-BMI1BMI2-NEXT:    je .LBB13_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
 ; X86-BMI1BMI2-NEXT:    movl %ebp, %ebx
 ; X86-BMI1BMI2-NEXT:    xorl %ebp, %ebp
-; X86-BMI1BMI2-NEXT:  .LBB11_2:
+; X86-BMI1BMI2-NEXT:  .LBB13_2:
 ; X86-BMI1BMI2-NEXT:    movl $1, %edi
 ; X86-BMI1BMI2-NEXT:    xorl %esi, %esi
 ; X86-BMI1BMI2-NEXT:    movl %edx, %ecx
 ; X86-BMI1BMI2-NEXT:    shldl %cl, %edi, %esi
 ; X86-BMI1BMI2-NEXT:    shlxl %edx, %edi, %edi
 ; X86-BMI1BMI2-NEXT:    testb $32, %dl
-; X86-BMI1BMI2-NEXT:    je .LBB11_4
+; X86-BMI1BMI2-NEXT:    je .LBB13_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
 ; X86-BMI1BMI2-NEXT:    movl %edi, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
-; X86-BMI1BMI2-NEXT:  .LBB11_4:
+; X86-BMI1BMI2-NEXT:  .LBB13_4:
 ; X86-BMI1BMI2-NEXT:    addl $-1, %edi
 ; X86-BMI1BMI2-NEXT:    adcl $-1, %esi
 ; X86-BMI1BMI2-NEXT:    andl %ebx, %edi
@@ -1893,22 +2106,22 @@ define i64 @bextr64_b0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB18_2
+; X86-NOBMI-NEXT:    je .LBB20_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
-; X86-NOBMI-NEXT:  .LBB18_2:
+; X86-NOBMI-NEXT:  .LBB20_2:
 ; X86-NOBMI-NEXT:    movl $-1, %edx
 ; X86-NOBMI-NEXT:    movl $-1, %eax
 ; X86-NOBMI-NEXT:    movb %ch, %cl
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    shldl %cl, %edx, %edx
 ; X86-NOBMI-NEXT:    testb $32, %ch
-; X86-NOBMI-NEXT:    je .LBB18_4
+; X86-NOBMI-NEXT:    je .LBB20_4
 ; X86-NOBMI-NEXT:  # %bb.3:
 ; X86-NOBMI-NEXT:    movl %eax, %edx
 ; X86-NOBMI-NEXT:    xorl %eax, %eax
-; X86-NOBMI-NEXT:  .LBB18_4:
+; X86-NOBMI-NEXT:  .LBB20_4:
 ; X86-NOBMI-NEXT:    notl %edx
 ; X86-NOBMI-NEXT:    andl %edi, %edx
 ; X86-NOBMI-NEXT:    notl %eax
@@ -1930,22 +2143,22 @@ define i64 @bextr64_b0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %edi, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB18_2
+; X86-BMI1NOTBM-NEXT:    je .LBB20_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %edx, %esi
 ; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
-; X86-BMI1NOTBM-NEXT:  .LBB18_2:
+; X86-BMI1NOTBM-NEXT:  .LBB20_2:
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %edi
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %ecx
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %ebx
 ; X86-BMI1NOTBM-NEXT:    shldl %cl, %edi, %edi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %al
-; X86-BMI1NOTBM-NEXT:    je .LBB18_4
+; X86-BMI1NOTBM-NEXT:    je .LBB20_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
 ; X86-BMI1NOTBM-NEXT:    movl %ebx, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %ebx, %ebx
-; X86-BMI1NOTBM-NEXT:  .LBB18_4:
+; X86-BMI1NOTBM-NEXT:  .LBB20_4:
 ; X86-BMI1NOTBM-NEXT:    andnl %edx, %edi, %edx
 ; X86-BMI1NOTBM-NEXT:    andnl %esi, %ebx, %eax
 ; X86-BMI1NOTBM-NEXT:    popl %esi
@@ -1965,21 +2178,21 @@ define i64 @bextr64_b0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %edx, %esi
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %edx, %edx
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB18_2
+; X86-BMI1BMI2-NEXT:    je .LBB20_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
 ; X86-BMI1BMI2-NEXT:    movl %edx, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
-; X86-BMI1BMI2-NEXT:  .LBB18_2:
+; X86-BMI1BMI2-NEXT:  .LBB20_2:
 ; X86-BMI1BMI2-NEXT:    movl $-1, %edi
 ; X86-BMI1BMI2-NEXT:    shlxl %eax, %edi, %ebx
 ; X86-BMI1BMI2-NEXT:    movl %eax, %ecx
 ; X86-BMI1BMI2-NEXT:    shldl %cl, %edi, %edi
 ; X86-BMI1BMI2-NEXT:    testb $32, %al
-; X86-BMI1BMI2-NEXT:    je .LBB18_4
+; X86-BMI1BMI2-NEXT:    je .LBB20_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
 ; X86-BMI1BMI2-NEXT:    movl %ebx, %edi
 ; X86-BMI1BMI2-NEXT:    xorl %ebx, %ebx
-; X86-BMI1BMI2-NEXT:  .LBB18_4:
+; X86-BMI1BMI2-NEXT:  .LBB20_4:
 ; X86-BMI1BMI2-NEXT:    andnl %edx, %edi, %edx
 ; X86-BMI1BMI2-NEXT:    andnl %esi, %ebx, %eax
 ; X86-BMI1BMI2-NEXT:    popl %esi
@@ -2033,22 +2246,22 @@ define i64 @bextr64_b1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB19_2
+; X86-NOBMI-NEXT:    je .LBB21_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
-; X86-NOBMI-NEXT:  .LBB19_2:
+; X86-NOBMI-NEXT:  .LBB21_2:
 ; X86-NOBMI-NEXT:    movl $-1, %edx
 ; X86-NOBMI-NEXT:    movl $-1, %eax
 ; X86-NOBMI-NEXT:    movb %ch, %cl
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    shldl %cl, %edx, %edx
 ; X86-NOBMI-NEXT:    testb $32, %ch
-; X86-NOBMI-NEXT:    je .LBB19_4
+; X86-NOBMI-NEXT:    je .LBB21_4
 ; X86-NOBMI-NEXT:  # %bb.3:
 ; X86-NOBMI-NEXT:    movl %eax, %edx
 ; X86-NOBMI-NEXT:    xorl %eax, %eax
-; X86-NOBMI-NEXT:  .LBB19_4:
+; X86-NOBMI-NEXT:  .LBB21_4:
 ; X86-NOBMI-NEXT:    notl %edx
 ; X86-NOBMI-NEXT:    andl %edi, %edx
 ; X86-NOBMI-NEXT:    notl %eax
@@ -2070,22 +2283,22 @@ define i64 @bextr64_b1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %edi, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB19_2
+; X86-BMI1NOTBM-NEXT:    je .LBB21_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %edx, %esi
 ; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
-; X86-BMI1NOTBM-NEXT:  .LBB19_2:
+; X86-BMI1NOTBM-NEXT:  .LBB21_2:
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %edi
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %ecx
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %ebx
 ; X86-BMI1NOTBM-NEXT:    shldl %cl, %edi, %edi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %al
-; X86-BMI1NOTBM-NEXT:    je .LBB19_4
+; X86-BMI1NOTBM-NEXT:    je .LBB21_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
 ; X86-BMI1NOTBM-NEXT:    movl %ebx, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %ebx, %ebx
-; X86-BMI1NOTBM-NEXT:  .LBB19_4:
+; X86-BMI1NOTBM-NEXT:  .LBB21_4:
 ; X86-BMI1NOTBM-NEXT:    andnl %edx, %edi, %edx
 ; X86-BMI1NOTBM-NEXT:    andnl %esi, %ebx, %eax
 ; X86-BMI1NOTBM-NEXT:    popl %esi
@@ -2105,21 +2318,21 @@ define i64 @bextr64_b1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %edx, %esi
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %edx, %edx
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB19_2
+; X86-BMI1BMI2-NEXT:    je .LBB21_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
 ; X86-BMI1BMI2-NEXT:    movl %edx, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
-; X86-BMI1BMI2-NEXT:  .LBB19_2:
+; X86-BMI1BMI2-NEXT:  .LBB21_2:
 ; X86-BMI1BMI2-NEXT:    movl $-1, %edi
 ; X86-BMI1BMI2-NEXT:    shlxl %eax, %edi, %ebx
 ; X86-BMI1BMI2-NEXT:    movl %eax, %ecx
 ; X86-BMI1BMI2-NEXT:    shldl %cl, %edi, %edi
 ; X86-BMI1BMI2-NEXT:    testb $32, %al
-; X86-BMI1BMI2-NEXT:    je .LBB19_4
+; X86-BMI1BMI2-NEXT:    je .LBB21_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
 ; X86-BMI1BMI2-NEXT:    movl %ebx, %edi
 ; X86-BMI1BMI2-NEXT:    xorl %ebx, %ebx
-; X86-BMI1BMI2-NEXT:  .LBB19_4:
+; X86-BMI1BMI2-NEXT:  .LBB21_4:
 ; X86-BMI1BMI2-NEXT:    andnl %edx, %edi, %edx
 ; X86-BMI1BMI2-NEXT:    andnl %esi, %ebx, %eax
 ; X86-BMI1BMI2-NEXT:    popl %esi
@@ -2179,22 +2392,22 @@ define i64 @bextr64_b2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB20_2
+; X86-NOBMI-NEXT:    je .LBB22_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
-; X86-NOBMI-NEXT:  .LBB20_2:
+; X86-NOBMI-NEXT:  .LBB22_2:
 ; X86-NOBMI-NEXT:    movl $-1, %edx
 ; X86-NOBMI-NEXT:    movl $-1, %eax
 ; X86-NOBMI-NEXT:    movb %ch, %cl
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    shldl %cl, %edx, %edx
 ; X86-NOBMI-NEXT:    testb $32, %ch
-; X86-NOBMI-NEXT:    je .LBB20_4
+; X86-NOBMI-NEXT:    je .LBB22_4
 ; X86-NOBMI-NEXT:  # %bb.3:
 ; X86-NOBMI-NEXT:    movl %eax, %edx
 ; X86-NOBMI-NEXT:    xorl %eax, %eax
-; X86-NOBMI-NEXT:  .LBB20_4:
+; X86-NOBMI-NEXT:  .LBB22_4:
 ; X86-NOBMI-NEXT:    notl %edx
 ; X86-NOBMI-NEXT:    andl %edi, %edx
 ; X86-NOBMI-NEXT:    notl %eax
@@ -2217,22 +2430,22 @@ define i64 @bextr64_b2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %edi, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB20_2
+; X86-BMI1NOTBM-NEXT:    je .LBB22_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %edx, %esi
 ; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
-; X86-BMI1NOTBM-NEXT:  .LBB20_2:
+; X86-BMI1NOTBM-NEXT:  .LBB22_2:
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %edi
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %ecx
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %ebx
 ; X86-BMI1NOTBM-NEXT:    shldl %cl, %edi, %edi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %al
-; X86-BMI1NOTBM-NEXT:    je .LBB20_4
+; X86-BMI1NOTBM-NEXT:    je .LBB22_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
 ; X86-BMI1NOTBM-NEXT:    movl %ebx, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %ebx, %ebx
-; X86-BMI1NOTBM-NEXT:  .LBB20_4:
+; X86-BMI1NOTBM-NEXT:  .LBB22_4:
 ; X86-BMI1NOTBM-NEXT:    andnl %edx, %edi, %edx
 ; X86-BMI1NOTBM-NEXT:    andnl %esi, %ebx, %eax
 ; X86-BMI1NOTBM-NEXT:    popl %esi
@@ -2253,21 +2466,21 @@ define i64 @bextr64_b2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %edi, %edx
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %edi, %esi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB20_2
+; X86-BMI1BMI2-NEXT:    je .LBB22_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
 ; X86-BMI1BMI2-NEXT:    movl %edx, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
-; X86-BMI1BMI2-NEXT:  .LBB20_2:
+; X86-BMI1BMI2-NEXT:  .LBB22_2:
 ; X86-BMI1BMI2-NEXT:    movl $-1, %edi
 ; X86-BMI1BMI2-NEXT:    shlxl %eax, %edi, %ebx
 ; X86-BMI1BMI2-NEXT:    movl %eax, %ecx
 ; X86-BMI1BMI2-NEXT:    shldl %cl, %edi, %edi
 ; X86-BMI1BMI2-NEXT:    testb $32, %al
-; X86-BMI1BMI2-NEXT:    je .LBB20_4
+; X86-BMI1BMI2-NEXT:    je .LBB22_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
 ; X86-BMI1BMI2-NEXT:    movl %ebx, %edi
 ; X86-BMI1BMI2-NEXT:    xorl %ebx, %ebx
-; X86-BMI1BMI2-NEXT:  .LBB20_4:
+; X86-BMI1BMI2-NEXT:  .LBB22_4:
 ; X86-BMI1BMI2-NEXT:    andnl %edx, %edi, %edx
 ; X86-BMI1BMI2-NEXT:    andnl %esi, %ebx, %eax
 ; X86-BMI1BMI2-NEXT:    popl %esi
@@ -2325,22 +2538,22 @@ define i64 @bextr64_b3_load_indexzext(i64* %w, i8 zeroext %numskipbits, i8 zeroe
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB21_2
+; X86-NOBMI-NEXT:    je .LBB23_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
-; X86-NOBMI-NEXT:  .LBB21_2:
+; X86-NOBMI-NEXT:  .LBB23_2:
 ; X86-NOBMI-NEXT:    movl $-1, %edx
 ; X86-NOBMI-NEXT:    movl $-1, %eax
 ; X86-NOBMI-NEXT:    movb %ch, %cl
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    shldl %cl, %edx, %edx
 ; X86-NOBMI-NEXT:    testb $32, %ch
-; X86-NOBMI-NEXT:    je .LBB21_4
+; X86-NOBMI-NEXT:    je .LBB23_4
 ; X86-NOBMI-NEXT:  # %bb.3:
 ; X86-NOBMI-NEXT:    movl %eax, %edx
 ; X86-NOBMI-NEXT:    xorl %eax, %eax
-; X86-NOBMI-NEXT:  .LBB21_4:
+; X86-NOBMI-NEXT:  .LBB23_4:
 ; X86-NOBMI-NEXT:    notl %edx
 ; X86-NOBMI-NEXT:    andl %edi, %edx
 ; X86-NOBMI-NEXT:    notl %eax
@@ -2363,22 +2576,22 @@ define i64 @bextr64_b3_load_indexzext(i64* %w, i8 zeroext %numskipbits, i8 zeroe
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %edi, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB21_2
+; X86-BMI1NOTBM-NEXT:    je .LBB23_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %edx, %esi
 ; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
-; X86-BMI1NOTBM-NEXT:  .LBB21_2:
+; X86-BMI1NOTBM-NEXT:  .LBB23_2:
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %edi
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %ecx
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %ebx
 ; X86-BMI1NOTBM-NEXT:    shldl %cl, %edi, %edi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %al
-; X86-BMI1NOTBM-NEXT:    je .LBB21_4
+; X86-BMI1NOTBM-NEXT:    je .LBB23_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
 ; X86-BMI1NOTBM-NEXT:    movl %ebx, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %ebx, %ebx
-; X86-BMI1NOTBM-NEXT:  .LBB21_4:
+; X86-BMI1NOTBM-NEXT:  .LBB23_4:
 ; X86-BMI1NOTBM-NEXT:    andnl %edx, %edi, %edx
 ; X86-BMI1NOTBM-NEXT:    andnl %esi, %ebx, %eax
 ; X86-BMI1NOTBM-NEXT:    popl %esi
@@ -2399,21 +2612,21 @@ define i64 @bextr64_b3_load_indexzext(i64* %w, i8 zeroext %numskipbits, i8 zeroe
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %edi, %edx
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %edi, %esi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB21_2
+; X86-BMI1BMI2-NEXT:    je .LBB23_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
 ; X86-BMI1BMI2-NEXT:    movl %edx, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
-; X86-BMI1BMI2-NEXT:  .LBB21_2:
+; X86-BMI1BMI2-NEXT:  .LBB23_2:
 ; X86-BMI1BMI2-NEXT:    movl $-1, %edi
 ; X86-BMI1BMI2-NEXT:    shlxl %eax, %edi, %ebx
 ; X86-BMI1BMI2-NEXT:    movl %eax, %ecx
 ; X86-BMI1BMI2-NEXT:    shldl %cl, %edi, %edi
 ; X86-BMI1BMI2-NEXT:    testb $32, %al
-; X86-BMI1BMI2-NEXT:    je .LBB21_4
+; X86-BMI1BMI2-NEXT:    je .LBB23_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
 ; X86-BMI1BMI2-NEXT:    movl %ebx, %edi
 ; X86-BMI1BMI2-NEXT:    xorl %ebx, %ebx
-; X86-BMI1BMI2-NEXT:  .LBB21_4:
+; X86-BMI1BMI2-NEXT:  .LBB23_4:
 ; X86-BMI1BMI2-NEXT:    andnl %edx, %edi, %edx
 ; X86-BMI1BMI2-NEXT:    andnl %esi, %ebx, %eax
 ; X86-BMI1BMI2-NEXT:    popl %esi
@@ -2475,22 +2688,22 @@ define i64 @bextr64_b4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; X86-NOBMI-NEXT:    shrl %cl, %edx
 ; X86-NOBMI-NEXT:    shrdl %cl, %esi, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB22_2
+; X86-NOBMI-NEXT:    je .LBB24_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %edx, %eax
 ; X86-NOBMI-NEXT:    xorl %edx, %edx
-; X86-NOBMI-NEXT:  .LBB22_2:
+; X86-NOBMI-NEXT:  .LBB24_2:
 ; X86-NOBMI-NEXT:    movl $-1, %edi
 ; X86-NOBMI-NEXT:    movl $-1, %esi
 ; X86-NOBMI-NEXT:    movb %ch, %cl
 ; X86-NOBMI-NEXT:    shll %cl, %esi
 ; X86-NOBMI-NEXT:    shldl %cl, %edi, %edi
 ; X86-NOBMI-NEXT:    testb $32, %ch
-; X86-NOBMI-NEXT:    je .LBB22_4
+; X86-NOBMI-NEXT:    je .LBB24_4
 ; X86-NOBMI-NEXT:  # %bb.3:
 ; X86-NOBMI-NEXT:    movl %esi, %edi
 ; X86-NOBMI-NEXT:    xorl %esi, %esi
-; X86-NOBMI-NEXT:  .LBB22_4:
+; X86-NOBMI-NEXT:  .LBB24_4:
 ; X86-NOBMI-NEXT:    notl %edi
 ; X86-NOBMI-NEXT:    andl %edi, %edx
 ; X86-NOBMI-NEXT:    notl %esi
@@ -2512,22 +2725,22 @@ define i64 @bextr64_b4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %edi, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB22_2
+; X86-BMI1NOTBM-NEXT:    je .LBB24_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %edx, %esi
 ; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
-; X86-BMI1NOTBM-NEXT:  .LBB22_2:
+; X86-BMI1NOTBM-NEXT:  .LBB24_2:
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %edi
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %ecx
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %ebx
 ; X86-BMI1NOTBM-NEXT:    shldl %cl, %edi, %edi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %al
-; X86-BMI1NOTBM-NEXT:    je .LBB22_4
+; X86-BMI1NOTBM-NEXT:    je .LBB24_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
 ; X86-BMI1NOTBM-NEXT:    movl %ebx, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %ebx, %ebx
-; X86-BMI1NOTBM-NEXT:  .LBB22_4:
+; X86-BMI1NOTBM-NEXT:  .LBB24_4:
 ; X86-BMI1NOTBM-NEXT:    andnl %edx, %edi, %edx
 ; X86-BMI1NOTBM-NEXT:    andnl %esi, %ebx, %eax
 ; X86-BMI1NOTBM-NEXT:    popl %esi
@@ -2547,21 +2760,21 @@ define i64 @bextr64_b4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %edx, %esi
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %edx, %edx
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB22_2
+; X86-BMI1BMI2-NEXT:    je .LBB24_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
 ; X86-BMI1BMI2-NEXT:    movl %edx, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
-; X86-BMI1BMI2-NEXT:  .LBB22_2:
+; X86-BMI1BMI2-NEXT:  .LBB24_2:
 ; X86-BMI1BMI2-NEXT:    movl $-1, %edi
 ; X86-BMI1BMI2-NEXT:    shlxl %eax, %edi, %ebx
 ; X86-BMI1BMI2-NEXT:    movl %eax, %ecx
 ; X86-BMI1BMI2-NEXT:    shldl %cl, %edi, %edi
 ; X86-BMI1BMI2-NEXT:    testb $32, %al
-; X86-BMI1BMI2-NEXT:    je .LBB22_4
+; X86-BMI1BMI2-NEXT:    je .LBB24_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
 ; X86-BMI1BMI2-NEXT:    movl %ebx, %edi
 ; X86-BMI1BMI2-NEXT:    xorl %ebx, %ebx
-; X86-BMI1BMI2-NEXT:  .LBB22_4:
+; X86-BMI1BMI2-NEXT:  .LBB24_4:
 ; X86-BMI1BMI2-NEXT:    andnl %edx, %edi, %edx
 ; X86-BMI1BMI2-NEXT:    andnl %esi, %ebx, %eax
 ; X86-BMI1BMI2-NEXT:    popl %esi
@@ -2619,22 +2832,22 @@ define i64 @bextr64_b5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-NOBMI-NEXT:    shrl %cl, %ebp
 ; X86-NOBMI-NEXT:    shrdl %cl, %esi, %ebx
 ; X86-NOBMI-NEXT:    testb $32, %al
-; X86-NOBMI-NEXT:    je .LBB23_2
+; X86-NOBMI-NEXT:    je .LBB25_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %ebp, %ebx
 ; X86-NOBMI-NEXT:    xorl %ebp, %ebp
-; X86-NOBMI-NEXT:  .LBB23_2:
+; X86-NOBMI-NEXT:  .LBB25_2:
 ; X86-NOBMI-NEXT:    movl $-1, %esi
 ; X86-NOBMI-NEXT:    movl $-1, %edi
 ; X86-NOBMI-NEXT:    movl %edx, %ecx
 ; X86-NOBMI-NEXT:    shll %cl, %edi
 ; X86-NOBMI-NEXT:    shldl %cl, %esi, %esi
 ; X86-NOBMI-NEXT:    testb $32, %dl
-; X86-NOBMI-NEXT:    je .LBB23_4
+; X86-NOBMI-NEXT:    je .LBB25_4
 ; X86-NOBMI-NEXT:  # %bb.3:
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
-; X86-NOBMI-NEXT:  .LBB23_4:
+; X86-NOBMI-NEXT:  .LBB25_4:
 ; X86-NOBMI-NEXT:    notl %esi
 ; X86-NOBMI-NEXT:    andl %ebp, %esi
 ; X86-NOBMI-NEXT:    notl %edi
@@ -2669,22 +2882,22 @@ define i64 @bextr64_b5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %ebx, %edi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %al
-; X86-BMI1NOTBM-NEXT:    je .LBB23_2
+; X86-BMI1NOTBM-NEXT:    je .LBB25_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %esi, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %esi, %esi
-; X86-BMI1NOTBM-NEXT:  .LBB23_2:
+; X86-BMI1NOTBM-NEXT:  .LBB25_2:
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebp
 ; X86-BMI1NOTBM-NEXT:    movl %edx, %ecx
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %ebp
 ; X86-BMI1NOTBM-NEXT:    shldl %cl, %ebx, %ebx
 ; X86-BMI1NOTBM-NEXT:    testb $32, %dl
-; X86-BMI1NOTBM-NEXT:    je .LBB23_4
+; X86-BMI1NOTBM-NEXT:    je .LBB25_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
 ; X86-BMI1NOTBM-NEXT:    movl %ebp, %ebx
 ; X86-BMI1NOTBM-NEXT:    xorl %ebp, %ebp
-; X86-BMI1NOTBM-NEXT:  .LBB23_4:
+; X86-BMI1NOTBM-NEXT:  .LBB25_4:
 ; X86-BMI1NOTBM-NEXT:    andnl %esi, %ebx, %esi
 ; X86-BMI1NOTBM-NEXT:    andnl %edi, %ebp, %edi
 ; X86-BMI1NOTBM-NEXT:    subl $8, %esp
@@ -2716,21 +2929,21 @@ define i64 @bextr64_b5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %esi, %edi
 ; X86-BMI1BMI2-NEXT:    shrxl %eax, %esi, %esi
 ; X86-BMI1BMI2-NEXT:    testb $32, %al
-; X86-BMI1BMI2-NEXT:    je .LBB23_2
+; X86-BMI1BMI2-NEXT:    je .LBB25_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
 ; X86-BMI1BMI2-NEXT:    movl %esi, %edi
 ; X86-BMI1BMI2-NEXT:    xorl %esi, %esi
-; X86-BMI1BMI2-NEXT:  .LBB23_2:
+; X86-BMI1BMI2-NEXT:  .LBB25_2:
 ; X86-BMI1BMI2-NEXT:    movl $-1, %ebp
 ; X86-BMI1BMI2-NEXT:    shlxl %edx, %ebp, %ebx
 ; X86-BMI1BMI2-NEXT:    movl %edx, %ecx
 ; X86-BMI1BMI2-NEXT:    shldl %cl, %ebp, %ebp
 ; X86-BMI1BMI2-NEXT:    testb $32, %dl
-; X86-BMI1BMI2-NEXT:    je .LBB23_4
+; X86-BMI1BMI2-NEXT:    je .LBB25_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
 ; X86-BMI1BMI2-NEXT:    movl %ebx, %ebp
 ; X86-BMI1BMI2-NEXT:    xorl %ebx, %ebx
-; X86-BMI1BMI2-NEXT:  .LBB23_4:
+; X86-BMI1BMI2-NEXT:  .LBB25_4:
 ; X86-BMI1BMI2-NEXT:    andnl %esi, %ebp, %esi
 ; X86-BMI1BMI2-NEXT:    andnl %edi, %ebx, %edi
 ; X86-BMI1BMI2-NEXT:    subl $8, %esp
@@ -3655,11 +3868,11 @@ define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB30_2
+; X86-NOBMI-NEXT:    je .LBB32_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
-; X86-NOBMI-NEXT:  .LBB30_2:
+; X86-NOBMI-NEXT:  .LBB32_2:
 ; X86-NOBMI-NEXT:    movl $64, %ecx
 ; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $-1, %ebp
@@ -3667,11 +3880,11 @@ define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT:    shrl %cl, %ebx
 ; X86-NOBMI-NEXT:    shrdl %cl, %ebp, %ebp
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB30_4
+; X86-NOBMI-NEXT:    je .LBB32_4
 ; X86-NOBMI-NEXT:  # %bb.3:
 ; X86-NOBMI-NEXT:    movl %ebx, %ebp
 ; X86-NOBMI-NEXT:    xorl %ebx, %ebx
-; X86-NOBMI-NEXT:  .LBB30_4:
+; X86-NOBMI-NEXT:  .LBB32_4:
 ; X86-NOBMI-NEXT:    subl $8, %esp
 ; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %ebp
@@ -3702,11 +3915,11 @@ define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB30_2
+; X86-BMI1NOTBM-NEXT:    je .LBB32_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
 ; X86-BMI1NOTBM-NEXT:    xorl %edi, %edi
-; X86-BMI1NOTBM-NEXT:  .LBB30_2:
+; X86-BMI1NOTBM-NEXT:  .LBB32_2:
 ; X86-BMI1NOTBM-NEXT:    movl $64, %ecx
 ; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebp
@@ -3714,11 +3927,11 @@ define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %ebx
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %ebp, %ebp
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB30_4
+; X86-BMI1NOTBM-NEXT:    je .LBB32_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
 ; X86-BMI1NOTBM-NEXT:    movl %ebx, %ebp
 ; X86-BMI1NOTBM-NEXT:    xorl %ebx, %ebx
-; X86-BMI1NOTBM-NEXT:  .LBB30_4:
+; X86-BMI1NOTBM-NEXT:  .LBB32_4:
 ; X86-BMI1NOTBM-NEXT:    subl $8, %esp
 ; X86-BMI1NOTBM-NEXT:    pushl %ebx
 ; X86-BMI1NOTBM-NEXT:    pushl %ebp
@@ -3748,22 +3961,22 @@ define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %esi
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %edi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB30_2
+; X86-BMI1BMI2-NEXT:    je .LBB32_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
 ; X86-BMI1BMI2-NEXT:    movl %edi, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
-; X86-BMI1BMI2-NEXT:  .LBB30_2:
+; X86-BMI1BMI2-NEXT:  .LBB32_2:
 ; X86-BMI1BMI2-NEXT:    movl $64, %ecx
 ; X86-BMI1BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI1BMI2-NEXT:    movl $-1, %ebx
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %ebx, %ebp
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %ebx, %ebx
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB30_4
+; X86-BMI1BMI2-NEXT:    je .LBB32_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
 ; X86-BMI1BMI2-NEXT:    movl %ebp, %ebx
 ; X86-BMI1BMI2-NEXT:    xorl %ebp, %ebp
-; X86-BMI1BMI2-NEXT:  .LBB30_4:
+; X86-BMI1BMI2-NEXT:  .LBB32_4:
 ; X86-BMI1BMI2-NEXT:    subl $8, %esp
 ; X86-BMI1BMI2-NEXT:    pushl %ebp
 ; X86-BMI1BMI2-NEXT:    pushl %ebx
@@ -3864,11 +4077,11 @@ define i64 @bextr64_c1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB31_2
+; X86-NOBMI-NEXT:    je .LBB33_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
-; X86-NOBMI-NEXT:  .LBB31_2:
+; X86-NOBMI-NEXT:  .LBB33_2:
 ; X86-NOBMI-NEXT:    movb $64, %cl
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl $-1, %ebp
@@ -3876,11 +4089,11 @@ define i64 @bextr64_c1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun
 ; X86-NOBMI-NEXT:    shrl %cl, %ebx
 ; X86-NOBMI-NEXT:    shrdl %cl, %ebp, %ebp
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB31_4
+; X86-NOBMI-NEXT:    je .LBB33_4
 ; X86-NOBMI-NEXT:  # %bb.3:
 ; X86-NOBMI-NEXT:    movl %ebx, %ebp
 ; X86-NOBMI-NEXT:    xorl %ebx, %ebx
-; X86-NOBMI-NEXT:  .LBB31_4:
+; X86-NOBMI-NEXT:  .LBB33_4:
 ; X86-NOBMI-NEXT:    subl $8, %esp
 ; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %ebp
@@ -3911,11 +4124,11 @@ define i64 @bextr64_c1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB31_2
+; X86-BMI1NOTBM-NEXT:    je .LBB33_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
 ; X86-BMI1NOTBM-NEXT:    xorl %edi, %edi
-; X86-BMI1NOTBM-NEXT:  .LBB31_2:
+; X86-BMI1NOTBM-NEXT:  .LBB33_2:
 ; X86-BMI1NOTBM-NEXT:    movb $64, %cl
 ; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebp
@@ -3923,11 +4136,11 @@ define i64 @bextr64_c1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %ebx
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %ebp, %ebp
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB31_4
+; X86-BMI1NOTBM-NEXT:    je .LBB33_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
 ; X86-BMI1NOTBM-NEXT:    movl %ebx, %ebp
 ; X86-BMI1NOTBM-NEXT:    xorl %ebx, %ebx
-; X86-BMI1NOTBM-NEXT:  .LBB31_4:
+; X86-BMI1NOTBM-NEXT:  .LBB33_4:
 ; X86-BMI1NOTBM-NEXT:    subl $8, %esp
 ; X86-BMI1NOTBM-NEXT:    pushl %ebx
 ; X86-BMI1NOTBM-NEXT:    pushl %ebp
@@ -3957,22 +4170,22 @@ define i64 @bextr64_c1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %esi
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %edi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB31_2
+; X86-BMI1BMI2-NEXT:    je .LBB33_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
 ; X86-BMI1BMI2-NEXT:    movl %edi, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
-; X86-BMI1BMI2-NEXT:  .LBB31_2:
+; X86-BMI1BMI2-NEXT:  .LBB33_2:
 ; X86-BMI1BMI2-NEXT:    movb $64, %cl
 ; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1BMI2-NEXT:    movl $-1, %ebx
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %ebx, %ebp
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %ebx, %ebx
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB31_4
+; X86-BMI1BMI2-NEXT:    je .LBB33_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
 ; X86-BMI1BMI2-NEXT:    movl %ebp, %ebx
 ; X86-BMI1BMI2-NEXT:    xorl %ebp, %ebp
-; X86-BMI1BMI2-NEXT:  .LBB31_4:
+; X86-BMI1BMI2-NEXT:  .LBB33_4:
 ; X86-BMI1BMI2-NEXT:    subl $8, %esp
 ; X86-BMI1BMI2-NEXT:    pushl %ebp
 ; X86-BMI1BMI2-NEXT:    pushl %ebx
@@ -4077,11 +4290,11 @@ define i64 @bextr64_c2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB32_2
+; X86-NOBMI-NEXT:    je .LBB34_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
-; X86-NOBMI-NEXT:  .LBB32_2:
+; X86-NOBMI-NEXT:  .LBB34_2:
 ; X86-NOBMI-NEXT:    movl $64, %ecx
 ; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $-1, %ebp
@@ -4089,11 +4302,11 @@ define i64 @bextr64_c2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-NOBMI-NEXT:    shrl %cl, %ebx
 ; X86-NOBMI-NEXT:    shrdl %cl, %ebp, %ebp
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB32_4
+; X86-NOBMI-NEXT:    je .LBB34_4
 ; X86-NOBMI-NEXT:  # %bb.3:
 ; X86-NOBMI-NEXT:    movl %ebx, %ebp
 ; X86-NOBMI-NEXT:    xorl %ebx, %ebx
-; X86-NOBMI-NEXT:  .LBB32_4:
+; X86-NOBMI-NEXT:  .LBB34_4:
 ; X86-NOBMI-NEXT:    subl $8, %esp
 ; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %ebp
@@ -4125,11 +4338,11 @@ define i64 @bextr64_c2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB32_2
+; X86-BMI1NOTBM-NEXT:    je .LBB34_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
 ; X86-BMI1NOTBM-NEXT:    xorl %edi, %edi
-; X86-BMI1NOTBM-NEXT:  .LBB32_2:
+; X86-BMI1NOTBM-NEXT:  .LBB34_2:
 ; X86-BMI1NOTBM-NEXT:    movl $64, %ecx
 ; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebp
@@ -4137,11 +4350,11 @@ define i64 @bextr64_c2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %ebx
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %ebp, %ebp
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB32_4
+; X86-BMI1NOTBM-NEXT:    je .LBB34_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
 ; X86-BMI1NOTBM-NEXT:    movl %ebx, %ebp
 ; X86-BMI1NOTBM-NEXT:    xorl %ebx, %ebx
-; X86-BMI1NOTBM-NEXT:  .LBB32_4:
+; X86-BMI1NOTBM-NEXT:  .LBB34_4:
 ; X86-BMI1NOTBM-NEXT:    subl $8, %esp
 ; X86-BMI1NOTBM-NEXT:    pushl %ebx
 ; X86-BMI1NOTBM-NEXT:    pushl %ebp
@@ -4172,22 +4385,22 @@ define i64 @bextr64_c2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %edi
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %esi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB32_2
+; X86-BMI1BMI2-NEXT:    je .LBB34_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
 ; X86-BMI1BMI2-NEXT:    movl %edi, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
-; X86-BMI1BMI2-NEXT:  .LBB32_2:
+; X86-BMI1BMI2-NEXT:  .LBB34_2:
 ; X86-BMI1BMI2-NEXT:    movl $64, %ecx
 ; X86-BMI1BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI1BMI2-NEXT:    movl $-1, %ebx
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %ebx, %ebp
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %ebx, %ebx
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB32_4
+; X86-BMI1BMI2-NEXT:    je .LBB34_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
 ; X86-BMI1BMI2-NEXT:    movl %ebp, %ebx
 ; X86-BMI1BMI2-NEXT:    xorl %ebp, %ebp
-; X86-BMI1BMI2-NEXT:  .LBB32_4:
+; X86-BMI1BMI2-NEXT:  .LBB34_4:
 ; X86-BMI1BMI2-NEXT:    subl $8, %esp
 ; X86-BMI1BMI2-NEXT:    pushl %ebp
 ; X86-BMI1BMI2-NEXT:    pushl %ebx
@@ -4290,11 +4503,11 @@ define i64 @bextr64_c3_load_indexzext(i64* %w, i8 %numskipbits, i8 %numlowbits)
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB33_2
+; X86-NOBMI-NEXT:    je .LBB35_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
-; X86-NOBMI-NEXT:  .LBB33_2:
+; X86-NOBMI-NEXT:  .LBB35_2:
 ; X86-NOBMI-NEXT:    movb $64, %cl
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl $-1, %ebp
@@ -4302,11 +4515,11 @@ define i64 @bextr64_c3_load_indexzext(i64* %w, i8 %numskipbits, i8 %numlowbits)
 ; X86-NOBMI-NEXT:    shrl %cl, %ebx
 ; X86-NOBMI-NEXT:    shrdl %cl, %ebp, %ebp
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB33_4
+; X86-NOBMI-NEXT:    je .LBB35_4
 ; X86-NOBMI-NEXT:  # %bb.3:
 ; X86-NOBMI-NEXT:    movl %ebx, %ebp
 ; X86-NOBMI-NEXT:    xorl %ebx, %ebx
-; X86-NOBMI-NEXT:  .LBB33_4:
+; X86-NOBMI-NEXT:  .LBB35_4:
 ; X86-NOBMI-NEXT:    subl $8, %esp
 ; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %ebp
@@ -4338,11 +4551,11 @@ define i64 @bextr64_c3_load_indexzext(i64* %w, i8 %numskipbits, i8 %numlowbits)
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB33_2
+; X86-BMI1NOTBM-NEXT:    je .LBB35_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
 ; X86-BMI1NOTBM-NEXT:    xorl %edi, %edi
-; X86-BMI1NOTBM-NEXT:  .LBB33_2:
+; X86-BMI1NOTBM-NEXT:  .LBB35_2:
 ; X86-BMI1NOTBM-NEXT:    movb $64, %cl
 ; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebp
@@ -4350,11 +4563,11 @@ define i64 @bextr64_c3_load_indexzext(i64* %w, i8 %numskipbits, i8 %numlowbits)
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %ebx
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %ebp, %ebp
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB33_4
+; X86-BMI1NOTBM-NEXT:    je .LBB35_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
 ; X86-BMI1NOTBM-NEXT:    movl %ebx, %ebp
 ; X86-BMI1NOTBM-NEXT:    xorl %ebx, %ebx
-; X86-BMI1NOTBM-NEXT:  .LBB33_4:
+; X86-BMI1NOTBM-NEXT:  .LBB35_4:
 ; X86-BMI1NOTBM-NEXT:    subl $8, %esp
 ; X86-BMI1NOTBM-NEXT:    pushl %ebx
 ; X86-BMI1NOTBM-NEXT:    pushl %ebp
@@ -4385,22 +4598,22 @@ define i64 @bextr64_c3_load_indexzext(i64* %w, i8 %numskipbits, i8 %numlowbits)
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %edi
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %esi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB33_2
+; X86-BMI1BMI2-NEXT:    je .LBB35_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
 ; X86-BMI1BMI2-NEXT:    movl %edi, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
-; X86-BMI1BMI2-NEXT:  .LBB33_2:
+; X86-BMI1BMI2-NEXT:  .LBB35_2:
 ; X86-BMI1BMI2-NEXT:    movb $64, %cl
 ; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1BMI2-NEXT:    movl $-1, %ebx
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %ebx, %ebp
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %ebx, %ebx
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB33_4
+; X86-BMI1BMI2-NEXT:    je .LBB35_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
 ; X86-BMI1BMI2-NEXT:    movl %ebp, %ebx
 ; X86-BMI1BMI2-NEXT:    xorl %ebp, %ebp
-; X86-BMI1BMI2-NEXT:  .LBB33_4:
+; X86-BMI1BMI2-NEXT:  .LBB35_4:
 ; X86-BMI1BMI2-NEXT:    subl $8, %esp
 ; X86-BMI1BMI2-NEXT:    pushl %ebp
 ; X86-BMI1BMI2-NEXT:    pushl %ebx
@@ -4505,11 +4718,11 @@ define i64 @bextr64_c4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB34_2
+; X86-NOBMI-NEXT:    je .LBB36_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
-; X86-NOBMI-NEXT:  .LBB34_2:
+; X86-NOBMI-NEXT:  .LBB36_2:
 ; X86-NOBMI-NEXT:    movl $64, %ecx
 ; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $-1, %ebp
@@ -4517,11 +4730,11 @@ define i64 @bextr64_c4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; X86-NOBMI-NEXT:    shrl %cl, %ebx
 ; X86-NOBMI-NEXT:    shrdl %cl, %ebp, %ebp
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB34_4
+; X86-NOBMI-NEXT:    je .LBB36_4
 ; X86-NOBMI-NEXT:  # %bb.3:
 ; X86-NOBMI-NEXT:    movl %ebx, %ebp
 ; X86-NOBMI-NEXT:    xorl %ebx, %ebx
-; X86-NOBMI-NEXT:  .LBB34_4:
+; X86-NOBMI-NEXT:  .LBB36_4:
 ; X86-NOBMI-NEXT:    subl $8, %esp
 ; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %ebp
@@ -4552,11 +4765,11 @@ define i64 @bextr64_c4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB34_2
+; X86-BMI1NOTBM-NEXT:    je .LBB36_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
 ; X86-BMI1NOTBM-NEXT:    xorl %edi, %edi
-; X86-BMI1NOTBM-NEXT:  .LBB34_2:
+; X86-BMI1NOTBM-NEXT:  .LBB36_2:
 ; X86-BMI1NOTBM-NEXT:    movl $64, %ecx
 ; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebp
@@ -4564,11 +4777,11 @@ define i64 @bextr64_c4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %ebx
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %ebp, %ebp
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB34_4
+; X86-BMI1NOTBM-NEXT:    je .LBB36_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
 ; X86-BMI1NOTBM-NEXT:    movl %ebx, %ebp
 ; X86-BMI1NOTBM-NEXT:    xorl %ebx, %ebx
-; X86-BMI1NOTBM-NEXT:  .LBB34_4:
+; X86-BMI1NOTBM-NEXT:  .LBB36_4:
 ; X86-BMI1NOTBM-NEXT:    subl $8, %esp
 ; X86-BMI1NOTBM-NEXT:    pushl %ebx
 ; X86-BMI1NOTBM-NEXT:    pushl %ebp
@@ -4598,22 +4811,22 @@ define i64 @bextr64_c4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %esi
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %edi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB34_2
+; X86-BMI1BMI2-NEXT:    je .LBB36_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
 ; X86-BMI1BMI2-NEXT:    movl %edi, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
-; X86-BMI1BMI2-NEXT:  .LBB34_2:
+; X86-BMI1BMI2-NEXT:  .LBB36_2:
 ; X86-BMI1BMI2-NEXT:    movl $64, %ecx
 ; X86-BMI1BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI1BMI2-NEXT:    movl $-1, %ebx
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %ebx, %ebp
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %ebx, %ebx
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB34_4
+; X86-BMI1BMI2-NEXT:    je .LBB36_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
 ; X86-BMI1BMI2-NEXT:    movl %ebp, %ebx
 ; X86-BMI1BMI2-NEXT:    xorl %ebp, %ebp
-; X86-BMI1BMI2-NEXT:  .LBB34_4:
+; X86-BMI1BMI2-NEXT:  .LBB36_4:
 ; X86-BMI1BMI2-NEXT:    subl $8, %esp
 ; X86-BMI1BMI2-NEXT:    pushl %ebp
 ; X86-BMI1BMI2-NEXT:    pushl %ebx
@@ -4714,11 +4927,11 @@ define i64 @bextr64_c5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB35_2
+; X86-NOBMI-NEXT:    je .LBB37_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
-; X86-NOBMI-NEXT:  .LBB35_2:
+; X86-NOBMI-NEXT:  .LBB37_2:
 ; X86-NOBMI-NEXT:    movl $64, %ecx
 ; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $-1, %ebx
@@ -4726,11 +4939,11 @@ define i64 @bextr64_c5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-NOBMI-NEXT:    shrl %cl, %ebp
 ; X86-NOBMI-NEXT:    shrdl %cl, %ebx, %ebx
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB35_4
+; X86-NOBMI-NEXT:    je .LBB37_4
 ; X86-NOBMI-NEXT:  # %bb.3:
 ; X86-NOBMI-NEXT:    movl %ebp, %ebx
 ; X86-NOBMI-NEXT:    xorl %ebp, %ebp
-; X86-NOBMI-NEXT:  .LBB35_4:
+; X86-NOBMI-NEXT:  .LBB37_4:
 ; X86-NOBMI-NEXT:    subl $8, %esp
 ; X86-NOBMI-NEXT:    pushl %ebp
 ; X86-NOBMI-NEXT:    pushl %ebx
@@ -4766,11 +4979,11 @@ define i64 @bextr64_c5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB35_2
+; X86-BMI1NOTBM-NEXT:    je .LBB37_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
 ; X86-BMI1NOTBM-NEXT:    xorl %edi, %edi
-; X86-BMI1NOTBM-NEXT:  .LBB35_2:
+; X86-BMI1NOTBM-NEXT:  .LBB37_2:
 ; X86-BMI1NOTBM-NEXT:    movl $64, %ecx
 ; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
@@ -4778,11 +4991,11 @@ define i64 @bextr64_c5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %ebp
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %ebx, %ebx
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB35_4
+; X86-BMI1NOTBM-NEXT:    je .LBB37_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
 ; X86-BMI1NOTBM-NEXT:    movl %ebp, %ebx
 ; X86-BMI1NOTBM-NEXT:    xorl %ebp, %ebp
-; X86-BMI1NOTBM-NEXT:  .LBB35_4:
+; X86-BMI1NOTBM-NEXT:  .LBB37_4:
 ; X86-BMI1NOTBM-NEXT:    subl $8, %esp
 ; X86-BMI1NOTBM-NEXT:    pushl %ebp
 ; X86-BMI1NOTBM-NEXT:    pushl %ebx
@@ -4817,22 +5030,22 @@ define i64 @bextr64_c5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %esi
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %edi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB35_2
+; X86-BMI1BMI2-NEXT:    je .LBB37_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
 ; X86-BMI1BMI2-NEXT:    movl %edi, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
-; X86-BMI1BMI2-NEXT:  .LBB35_2:
+; X86-BMI1BMI2-NEXT:  .LBB37_2:
 ; X86-BMI1BMI2-NEXT:    movl $64, %ecx
 ; X86-BMI1BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI1BMI2-NEXT:    movl $-1, %ebp
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %ebp, %ebx
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %ebp, %ebp
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB35_4
+; X86-BMI1BMI2-NEXT:    je .LBB37_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
 ; X86-BMI1BMI2-NEXT:    movl %ebx, %ebp
 ; X86-BMI1BMI2-NEXT:    xorl %ebx, %ebx
-; X86-BMI1BMI2-NEXT:  .LBB35_4:
+; X86-BMI1BMI2-NEXT:  .LBB37_4:
 ; X86-BMI1BMI2-NEXT:    subl $8, %esp
 ; X86-BMI1BMI2-NEXT:    pushl %ebx
 ; X86-BMI1BMI2-NEXT:    pushl %ebp
@@ -5322,36 +5535,36 @@ define i64 @bextr64_d0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT:    shrdl %cl, %edx, %edi
 ; X86-NOBMI-NEXT:    xorl %esi, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB41_2
+; X86-NOBMI-NEXT:    je .LBB43_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %eax, %edi
 ; X86-NOBMI-NEXT:    xorl %eax, %eax
-; X86-NOBMI-NEXT:  .LBB41_2:
+; X86-NOBMI-NEXT:  .LBB43_2:
 ; X86-NOBMI-NEXT:    movl $64, %ecx
 ; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    shldl %cl, %edi, %eax
 ; X86-NOBMI-NEXT:    shll %cl, %edi
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    movl %edi, %ebx
-; X86-NOBMI-NEXT:    jne .LBB41_4
+; X86-NOBMI-NEXT:    jne .LBB43_4
 ; X86-NOBMI-NEXT:  # %bb.3:
 ; X86-NOBMI-NEXT:    movl %eax, %ebx
-; X86-NOBMI-NEXT:  .LBB41_4:
+; X86-NOBMI-NEXT:  .LBB43_4:
 ; X86-NOBMI-NEXT:    movl %ebx, %eax
 ; X86-NOBMI-NEXT:    shrl %cl, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    movl $0, %edx
-; X86-NOBMI-NEXT:    jne .LBB41_6
+; X86-NOBMI-NEXT:    jne .LBB43_6
 ; X86-NOBMI-NEXT:  # %bb.5:
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    movl %eax, %edx
-; X86-NOBMI-NEXT:  .LBB41_6:
+; X86-NOBMI-NEXT:  .LBB43_6:
 ; X86-NOBMI-NEXT:    shrdl %cl, %ebx, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    jne .LBB41_8
+; X86-NOBMI-NEXT:    jne .LBB43_8
 ; X86-NOBMI-NEXT:  # %bb.7:
 ; X86-NOBMI-NEXT:    movl %esi, %eax
-; X86-NOBMI-NEXT:  .LBB41_8:
+; X86-NOBMI-NEXT:  .LBB43_8:
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    popl %ebx
@@ -5370,36 +5583,36 @@ define i64 @bextr64_d0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %edx, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %esi, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB41_2
+; X86-BMI1NOTBM-NEXT:    je .LBB43_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %eax, %eax
-; X86-BMI1NOTBM-NEXT:  .LBB41_2:
+; X86-BMI1NOTBM-NEXT:  .LBB43_2:
 ; X86-BMI1NOTBM-NEXT:    movl $64, %ecx
 ; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI1NOTBM-NEXT:    shldl %cl, %edi, %eax
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %edi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    movl %edi, %ebx
-; X86-BMI1NOTBM-NEXT:    jne .LBB41_4
+; X86-BMI1NOTBM-NEXT:    jne .LBB43_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %ebx
-; X86-BMI1NOTBM-NEXT:  .LBB41_4:
+; X86-BMI1NOTBM-NEXT:  .LBB43_4:
 ; X86-BMI1NOTBM-NEXT:    movl %ebx, %eax
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    movl $0, %edx
-; X86-BMI1NOTBM-NEXT:    jne .LBB41_6
+; X86-BMI1NOTBM-NEXT:    jne .LBB43_6
 ; X86-BMI1NOTBM-NEXT:  # %bb.5:
 ; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %edx
-; X86-BMI1NOTBM-NEXT:  .LBB41_6:
+; X86-BMI1NOTBM-NEXT:  .LBB43_6:
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %ebx, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    jne .LBB41_8
+; X86-BMI1NOTBM-NEXT:    jne .LBB43_8
 ; X86-BMI1NOTBM-NEXT:  # %bb.7:
 ; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
-; X86-BMI1NOTBM-NEXT:  .LBB41_8:
+; X86-BMI1NOTBM-NEXT:  .LBB43_8:
 ; X86-BMI1NOTBM-NEXT:    popl %esi
 ; X86-BMI1NOTBM-NEXT:    popl %edi
 ; X86-BMI1NOTBM-NEXT:    popl %ebx
@@ -5416,32 +5629,32 @@ define i64 @bextr64_d0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %edx, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB41_2
+; X86-BMI1BMI2-NEXT:    je .LBB43_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
 ; X86-BMI1BMI2-NEXT:    movl %esi, %eax
 ; X86-BMI1BMI2-NEXT:    xorl %esi, %esi
-; X86-BMI1BMI2-NEXT:  .LBB41_2:
+; X86-BMI1BMI2-NEXT:  .LBB43_2:
 ; X86-BMI1BMI2-NEXT:    movl $64, %ecx
 ; X86-BMI1BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI1BMI2-NEXT:    shldl %cl, %eax, %esi
 ; X86-BMI1BMI2-NEXT:    shlxl %ecx, %eax, %edi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB41_4
+; X86-BMI1BMI2-NEXT:    je .LBB43_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
 ; X86-BMI1BMI2-NEXT:    movl %edi, %esi
 ; X86-BMI1BMI2-NEXT:    movl $0, %edi
-; X86-BMI1BMI2-NEXT:  .LBB41_4:
+; X86-BMI1BMI2-NEXT:  .LBB43_4:
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %esi, %eax
-; X86-BMI1BMI2-NEXT:    jne .LBB41_6
+; X86-BMI1BMI2-NEXT:    jne .LBB43_6
 ; X86-BMI1BMI2-NEXT:  # %bb.5:
 ; X86-BMI1BMI2-NEXT:    movl %eax, %edx
-; X86-BMI1BMI2-NEXT:  .LBB41_6:
+; X86-BMI1BMI2-NEXT:  .LBB43_6:
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %esi, %edi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    jne .LBB41_8
+; X86-BMI1BMI2-NEXT:    jne .LBB43_8
 ; X86-BMI1BMI2-NEXT:  # %bb.7:
 ; X86-BMI1BMI2-NEXT:    movl %edi, %eax
-; X86-BMI1BMI2-NEXT:  .LBB41_8:
+; X86-BMI1BMI2-NEXT:  .LBB43_8:
 ; X86-BMI1BMI2-NEXT:    popl %esi
 ; X86-BMI1BMI2-NEXT:    popl %edi
 ; X86-BMI1BMI2-NEXT:    retl
@@ -5493,36 +5706,36 @@ define i64 @bextr64_d1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun
 ; X86-NOBMI-NEXT:    shrdl %cl, %edx, %edi
 ; X86-NOBMI-NEXT:    xorl %esi, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB42_2
+; X86-NOBMI-NEXT:    je .LBB44_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %eax, %edi
 ; X86-NOBMI-NEXT:    xorl %eax, %eax
-; X86-NOBMI-NEXT:  .LBB42_2:
+; X86-NOBMI-NEXT:  .LBB44_2:
 ; X86-NOBMI-NEXT:    movb $64, %cl
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    shldl %cl, %edi, %eax
 ; X86-NOBMI-NEXT:    shll %cl, %edi
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    movl %edi, %ebx
-; X86-NOBMI-NEXT:    jne .LBB42_4
+; X86-NOBMI-NEXT:    jne .LBB44_4
 ; X86-NOBMI-NEXT:  # %bb.3:
 ; X86-NOBMI-NEXT:    movl %eax, %ebx
-; X86-NOBMI-NEXT:  .LBB42_4:
+; X86-NOBMI-NEXT:  .LBB44_4:
 ; X86-NOBMI-NEXT:    movl %ebx, %eax
 ; X86-NOBMI-NEXT:    shrl %cl, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    movl $0, %edx
-; X86-NOBMI-NEXT:    jne .LBB42_6
+; X86-NOBMI-NEXT:    jne .LBB44_6
 ; X86-NOBMI-NEXT:  # %bb.5:
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    movl %eax, %edx
-; X86-NOBMI-NEXT:  .LBB42_6:
+; X86-NOBMI-NEXT:  .LBB44_6:
 ; X86-NOBMI-NEXT:    shrdl %cl, %ebx, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    jne .LBB42_8
+; X86-NOBMI-NEXT:    jne .LBB44_8
 ; X86-NOBMI-NEXT:  # %bb.7:
 ; X86-NOBMI-NEXT:    movl %esi, %eax
-; X86-NOBMI-NEXT:  .LBB42_8:
+; X86-NOBMI-NEXT:  .LBB44_8:
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    popl %ebx
@@ -5541,36 +5754,36 @@ define i64 @bextr64_d1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %edx, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %esi, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB42_2
+; X86-BMI1NOTBM-NEXT:    je .LBB44_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %eax, %eax
-; X86-BMI1NOTBM-NEXT:  .LBB42_2:
+; X86-BMI1NOTBM-NEXT:  .LBB44_2:
 ; X86-BMI1NOTBM-NEXT:    movb $64, %cl
 ; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    shldl %cl, %edi, %eax
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %edi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    movl %edi, %ebx
-; X86-BMI1NOTBM-NEXT:    jne .LBB42_4
+; X86-BMI1NOTBM-NEXT:    jne .LBB44_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %ebx
-; X86-BMI1NOTBM-NEXT:  .LBB42_4:
+; X86-BMI1NOTBM-NEXT:  .LBB44_4:
 ; X86-BMI1NOTBM-NEXT:    movl %ebx, %eax
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    movl $0, %edx
-; X86-BMI1NOTBM-NEXT:    jne .LBB42_6
+; X86-BMI1NOTBM-NEXT:    jne .LBB44_6
 ; X86-BMI1NOTBM-NEXT:  # %bb.5:
 ; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %edx
-; X86-BMI1NOTBM-NEXT:  .LBB42_6:
+; X86-BMI1NOTBM-NEXT:  .LBB44_6:
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %ebx, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    jne .LBB42_8
+; X86-BMI1NOTBM-NEXT:    jne .LBB44_8
 ; X86-BMI1NOTBM-NEXT:  # %bb.7:
 ; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
-; X86-BMI1NOTBM-NEXT:  .LBB42_8:
+; X86-BMI1NOTBM-NEXT:  .LBB44_8:
 ; X86-BMI1NOTBM-NEXT:    popl %esi
 ; X86-BMI1NOTBM-NEXT:    popl %edi
 ; X86-BMI1NOTBM-NEXT:    popl %ebx
@@ -5587,32 +5800,32 @@ define i64 @bextr64_d1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %edx, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB42_2
+; X86-BMI1BMI2-NEXT:    je .LBB44_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
 ; X86-BMI1BMI2-NEXT:    movl %esi, %eax
 ; X86-BMI1BMI2-NEXT:    xorl %esi, %esi
-; X86-BMI1BMI2-NEXT:  .LBB42_2:
+; X86-BMI1BMI2-NEXT:  .LBB44_2:
 ; X86-BMI1BMI2-NEXT:    movb $64, %cl
 ; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1BMI2-NEXT:    shldl %cl, %eax, %esi
 ; X86-BMI1BMI2-NEXT:    shlxl %ecx, %eax, %edi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB42_4
+; X86-BMI1BMI2-NEXT:    je .LBB44_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
 ; X86-BMI1BMI2-NEXT:    movl %edi, %esi
 ; X86-BMI1BMI2-NEXT:    movl $0, %edi
-; X86-BMI1BMI2-NEXT:  .LBB42_4:
+; X86-BMI1BMI2-NEXT:  .LBB44_4:
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %esi, %eax
-; X86-BMI1BMI2-NEXT:    jne .LBB42_6
+; X86-BMI1BMI2-NEXT:    jne .LBB44_6
 ; X86-BMI1BMI2-NEXT:  # %bb.5:
 ; X86-BMI1BMI2-NEXT:    movl %eax, %edx
-; X86-BMI1BMI2-NEXT:  .LBB42_6:
+; X86-BMI1BMI2-NEXT:  .LBB44_6:
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %esi, %edi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    jne .LBB42_8
+; X86-BMI1BMI2-NEXT:    jne .LBB44_8
 ; X86-BMI1BMI2-NEXT:  # %bb.7:
 ; X86-BMI1BMI2-NEXT:    movl %edi, %eax
-; X86-BMI1BMI2-NEXT:  .LBB42_8:
+; X86-BMI1BMI2-NEXT:  .LBB44_8:
 ; X86-BMI1BMI2-NEXT:    popl %esi
 ; X86-BMI1BMI2-NEXT:    popl %edi
 ; X86-BMI1BMI2-NEXT:    retl
@@ -5670,36 +5883,36 @@ define i64 @bextr64_d2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-NOBMI-NEXT:    shrdl %cl, %edx, %edi
 ; X86-NOBMI-NEXT:    xorl %esi, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB43_2
+; X86-NOBMI-NEXT:    je .LBB45_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %eax, %edi
 ; X86-NOBMI-NEXT:    xorl %eax, %eax
-; X86-NOBMI-NEXT:  .LBB43_2:
+; X86-NOBMI-NEXT:  .LBB45_2:
 ; X86-NOBMI-NEXT:    movl $64, %ecx
 ; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    shldl %cl, %edi, %eax
 ; X86-NOBMI-NEXT:    shll %cl, %edi
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    movl %edi, %ebx
-; X86-NOBMI-NEXT:    jne .LBB43_4
+; X86-NOBMI-NEXT:    jne .LBB45_4
 ; X86-NOBMI-NEXT:  # %bb.3:
 ; X86-NOBMI-NEXT:    movl %eax, %ebx
-; X86-NOBMI-NEXT:  .LBB43_4:
+; X86-NOBMI-NEXT:  .LBB45_4:
 ; X86-NOBMI-NEXT:    movl %ebx, %eax
 ; X86-NOBMI-NEXT:    shrl %cl, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    movl $0, %edx
-; X86-NOBMI-NEXT:    jne .LBB43_6
+; X86-NOBMI-NEXT:    jne .LBB45_6
 ; X86-NOBMI-NEXT:  # %bb.5:
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    movl %eax, %edx
-; X86-NOBMI-NEXT:  .LBB43_6:
+; X86-NOBMI-NEXT:  .LBB45_6:
 ; X86-NOBMI-NEXT:    shrdl %cl, %ebx, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    jne .LBB43_8
+; X86-NOBMI-NEXT:    jne .LBB45_8
 ; X86-NOBMI-NEXT:  # %bb.7:
 ; X86-NOBMI-NEXT:    movl %esi, %eax
-; X86-NOBMI-NEXT:  .LBB43_8:
+; X86-NOBMI-NEXT:  .LBB45_8:
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    popl %ebx
@@ -5719,36 +5932,36 @@ define i64 @bextr64_d2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %edx, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %esi, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB43_2
+; X86-BMI1NOTBM-NEXT:    je .LBB45_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %eax, %eax
-; X86-BMI1NOTBM-NEXT:  .LBB43_2:
+; X86-BMI1NOTBM-NEXT:  .LBB45_2:
 ; X86-BMI1NOTBM-NEXT:    movl $64, %ecx
 ; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI1NOTBM-NEXT:    shldl %cl, %edi, %eax
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %edi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    movl %edi, %ebx
-; X86-BMI1NOTBM-NEXT:    jne .LBB43_4
+; X86-BMI1NOTBM-NEXT:    jne .LBB45_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %ebx
-; X86-BMI1NOTBM-NEXT:  .LBB43_4:
+; X86-BMI1NOTBM-NEXT:  .LBB45_4:
 ; X86-BMI1NOTBM-NEXT:    movl %ebx, %eax
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    movl $0, %edx
-; X86-BMI1NOTBM-NEXT:    jne .LBB43_6
+; X86-BMI1NOTBM-NEXT:    jne .LBB45_6
 ; X86-BMI1NOTBM-NEXT:  # %bb.5:
 ; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %edx
-; X86-BMI1NOTBM-NEXT:  .LBB43_6:
+; X86-BMI1NOTBM-NEXT:  .LBB45_6:
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %ebx, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    jne .LBB43_8
+; X86-BMI1NOTBM-NEXT:    jne .LBB45_8
 ; X86-BMI1NOTBM-NEXT:  # %bb.7:
 ; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
-; X86-BMI1NOTBM-NEXT:  .LBB43_8:
+; X86-BMI1NOTBM-NEXT:  .LBB45_8:
 ; X86-BMI1NOTBM-NEXT:    popl %esi
 ; X86-BMI1NOTBM-NEXT:    popl %edi
 ; X86-BMI1NOTBM-NEXT:    popl %ebx
@@ -5766,32 +5979,32 @@ define i64 @bextr64_d2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %edx, %eax
 ; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB43_2
+; X86-BMI1BMI2-NEXT:    je .LBB45_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
 ; X86-BMI1BMI2-NEXT:    movl %esi, %eax
 ; X86-BMI1BMI2-NEXT:    xorl %esi, %esi
-; X86-BMI1BMI2-NEXT:  .LBB43_2:
+; X86-BMI1BMI2-NEXT:  .LBB45_2:
 ; X86-BMI1BMI2-NEXT:    movl $64, %ecx
 ; X86-BMI1BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI1BMI2-NEXT:    shldl %cl, %eax, %esi
 ; X86-BMI1BMI2-NEXT:    shlxl %ecx, %eax, %edi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB43_4
+; X86-BMI1BMI2-NEXT:    je .LBB45_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
 ; X86-BMI1BMI2-NEXT:    movl %edi, %esi
 ; X86-BMI1BMI2-NEXT:    movl $0, %edi
-; X86-BMI1BMI2-NEXT:  .LBB43_4:
+; X86-BMI1BMI2-NEXT:  .LBB45_4:
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %esi, %eax
-; X86-BMI1BMI2-NEXT:    jne .LBB43_6
+; X86-BMI1BMI2-NEXT:    jne .LBB45_6
 ; X86-BMI1BMI2-NEXT:  # %bb.5:
 ; X86-BMI1BMI2-NEXT:    movl %eax, %edx
-; X86-BMI1BMI2-NEXT:  .LBB43_6:
+; X86-BMI1BMI2-NEXT:  .LBB45_6:
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %esi, %edi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    jne .LBB43_8
+; X86-BMI1BMI2-NEXT:    jne .LBB45_8
 ; X86-BMI1BMI2-NEXT:  # %bb.7:
 ; X86-BMI1BMI2-NEXT:    movl %edi, %eax
-; X86-BMI1BMI2-NEXT:  .LBB43_8:
+; X86-BMI1BMI2-NEXT:  .LBB45_8:
 ; X86-BMI1BMI2-NEXT:    popl %esi
 ; X86-BMI1BMI2-NEXT:    popl %edi
 ; X86-BMI1BMI2-NEXT:    retl
@@ -5846,36 +6059,36 @@ define i64 @bextr64_d3_load_indexzext(i64* %w, i8 %numskipbits, i8 %numlowbits)
 ; X86-NOBMI-NEXT:    shrdl %cl, %edx, %edi
 ; X86-NOBMI-NEXT:    xorl %esi, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB44_2
+; X86-NOBMI-NEXT:    je .LBB46_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %eax, %edi
 ; X86-NOBMI-NEXT:    xorl %eax, %eax
-; X86-NOBMI-NEXT:  .LBB44_2:
+; X86-NOBMI-NEXT:  .LBB46_2:
 ; X86-NOBMI-NEXT:    movb $64, %cl
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    shldl %cl, %edi, %eax
 ; X86-NOBMI-NEXT:    shll %cl, %edi
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    movl %edi, %ebx
-; X86-NOBMI-NEXT:    jne .LBB44_4
+; X86-NOBMI-NEXT:    jne .LBB46_4
 ; X86-NOBMI-NEXT:  # %bb.3:
 ; X86-NOBMI-NEXT:    movl %eax, %ebx
-; X86-NOBMI-NEXT:  .LBB44_4:
+; X86-NOBMI-NEXT:  .LBB46_4:
 ; X86-NOBMI-NEXT:    movl %ebx, %eax
 ; X86-NOBMI-NEXT:    shrl %cl, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    movl $0, %edx
-; X86-NOBMI-NEXT:    jne .LBB44_6
+; X86-NOBMI-NEXT:    jne .LBB46_6
 ; X86-NOBMI-NEXT:  # %bb.5:
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    movl %eax, %edx
-; X86-NOBMI-NEXT:  .LBB44_6:
+; X86-NOBMI-NEXT:  .LBB46_6:
 ; X86-NOBMI-NEXT:    shrdl %cl, %ebx, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    jne .LBB44_8
+; X86-NOBMI-NEXT:    jne .LBB46_8
 ; X86-NOBMI-NEXT:  # %bb.7:
 ; X86-NOBMI-NEXT:    movl %esi, %eax
-; X86-NOBMI-NEXT:  .LBB44_8:
+; X86-NOBMI-NEXT:  .LBB46_8:
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    popl %ebx
@@ -5895,36 +6108,36 @@ define i64 @bextr64_d3_load_indexzext(i64* %w, i8 %numskipbits, i8 %numlowbits)
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %edx, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %esi, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB44_2
+; X86-BMI1NOTBM-NEXT:    je .LBB46_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %eax, %eax
-; X86-BMI1NOTBM-NEXT:  .LBB44_2:
+; X86-BMI1NOTBM-NEXT:  .LBB46_2:
 ; X86-BMI1NOTBM-NEXT:    movb $64, %cl
 ; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    shldl %cl, %edi, %eax
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %edi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    movl %edi, %ebx
-; X86-BMI1NOTBM-NEXT:    jne .LBB44_4
+; X86-BMI1NOTBM-NEXT:    jne .LBB46_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %ebx
-; X86-BMI1NOTBM-NEXT:  .LBB44_4:
+; X86-BMI1NOTBM-NEXT:  .LBB46_4:
 ; X86-BMI1NOTBM-NEXT:    movl %ebx, %eax
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    movl $0, %edx
-; X86-BMI1NOTBM-NEXT:    jne .LBB44_6
+; X86-BMI1NOTBM-NEXT:    jne .LBB46_6
 ; X86-BMI1NOTBM-NEXT:  # %bb.5:
 ; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %edx
-; X86-BMI1NOTBM-NEXT:  .LBB44_6:
+; X86-BMI1NOTBM-NEXT:  .LBB46_6:
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %ebx, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    jne .LBB44_8
+; X86-BMI1NOTBM-NEXT:    jne .LBB46_8
 ; X86-BMI1NOTBM-NEXT:  # %bb.7:
 ; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
-; X86-BMI1NOTBM-NEXT:  .LBB44_8:
+; X86-BMI1NOTBM-NEXT:  .LBB46_8:
 ; X86-BMI1NOTBM-NEXT:    popl %esi
 ; X86-BMI1NOTBM-NEXT:    popl %edi
 ; X86-BMI1NOTBM-NEXT:    popl %ebx
@@ -5942,32 +6155,32 @@ define i64 @bextr64_d3_load_indexzext(i64* %w, i8 %numskipbits, i8 %numlowbits)
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %edx, %eax
 ; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB44_2
+; X86-BMI1BMI2-NEXT:    je .LBB46_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
 ; X86-BMI1BMI2-NEXT:    movl %esi, %eax
 ; X86-BMI1BMI2-NEXT:    xorl %esi, %esi
-; X86-BMI1BMI2-NEXT:  .LBB44_2:
+; X86-BMI1BMI2-NEXT:  .LBB46_2:
 ; X86-BMI1BMI2-NEXT:    movb $64, %cl
 ; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1BMI2-NEXT:    shldl %cl, %eax, %esi
 ; X86-BMI1BMI2-NEXT:    shlxl %ecx, %eax, %edi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB44_4
+; X86-BMI1BMI2-NEXT:    je .LBB46_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
 ; X86-BMI1BMI2-NEXT:    movl %edi, %esi
 ; X86-BMI1BMI2-NEXT:    movl $0, %edi
-; X86-BMI1BMI2-NEXT:  .LBB44_4:
+; X86-BMI1BMI2-NEXT:  .LBB46_4:
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %esi, %eax
-; X86-BMI1BMI2-NEXT:    jne .LBB44_6
+; X86-BMI1BMI2-NEXT:    jne .LBB46_6
 ; X86-BMI1BMI2-NEXT:  # %bb.5:
 ; X86-BMI1BMI2-NEXT:    movl %eax, %edx
-; X86-BMI1BMI2-NEXT:  .LBB44_6:
+; X86-BMI1BMI2-NEXT:  .LBB46_6:
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %esi, %edi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    jne .LBB44_8
+; X86-BMI1BMI2-NEXT:    jne .LBB46_8
 ; X86-BMI1BMI2-NEXT:  # %bb.7:
 ; X86-BMI1BMI2-NEXT:    movl %edi, %eax
-; X86-BMI1BMI2-NEXT:  .LBB44_8:
+; X86-BMI1BMI2-NEXT:  .LBB46_8:
 ; X86-BMI1BMI2-NEXT:    popl %esi
 ; X86-BMI1BMI2-NEXT:    popl %edi
 ; X86-BMI1BMI2-NEXT:    retl
@@ -6029,37 +6242,37 @@ define i64 @bextr64_d5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-NOBMI-NEXT:    shrdl %cl, %edx, %ebx
 ; X86-NOBMI-NEXT:    xorl %edx, %edx
 ; X86-NOBMI-NEXT:    testb $32, %al
-; X86-NOBMI-NEXT:    je .LBB45_2
+; X86-NOBMI-NEXT:    je .LBB47_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %esi, %ebx
 ; X86-NOBMI-NEXT:    xorl %esi, %esi
-; X86-NOBMI-NEXT:  .LBB45_2:
+; X86-NOBMI-NEXT:  .LBB47_2:
 ; X86-NOBMI-NEXT:    movl $64, %ecx
 ; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    shldl %cl, %ebx, %esi
 ; X86-NOBMI-NEXT:    shll %cl, %ebx
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    movl %ebx, %ebp
-; X86-NOBMI-NEXT:    jne .LBB45_4
+; X86-NOBMI-NEXT:    jne .LBB47_4
 ; X86-NOBMI-NEXT:  # %bb.3:
 ; X86-NOBMI-NEXT:    movl %esi, %ebp
-; X86-NOBMI-NEXT:  .LBB45_4:
+; X86-NOBMI-NEXT:  .LBB47_4:
 ; X86-NOBMI-NEXT:    movl %ebp, %esi
 ; X86-NOBMI-NEXT:    shrl %cl, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    movl $0, %edi
-; X86-NOBMI-NEXT:    jne .LBB45_6
+; X86-NOBMI-NEXT:    jne .LBB47_6
 ; X86-NOBMI-NEXT:  # %bb.5:
 ; X86-NOBMI-NEXT:    movl %ebx, %edx
 ; X86-NOBMI-NEXT:    movl %esi, %edi
-; X86-NOBMI-NEXT:  .LBB45_6:
+; X86-NOBMI-NEXT:  .LBB47_6:
 ; X86-NOBMI-NEXT:    shrdl %cl, %ebp, %edx
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    jne .LBB45_8
+; X86-NOBMI-NEXT:    jne .LBB47_8
 ; X86-NOBMI-NEXT:  # %bb.7:
 ; X86-NOBMI-NEXT:    movl %edx, %esi
-; X86-NOBMI-NEXT:  .LBB45_8:
+; X86-NOBMI-NEXT:  .LBB47_8:
 ; X86-NOBMI-NEXT:    subl $8, %esp
 ; X86-NOBMI-NEXT:    pushl %ecx
 ; X86-NOBMI-NEXT:    pushl %eax
@@ -6090,37 +6303,37 @@ define i64 @bextr64_d5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %edx, %ebx
 ; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
 ; X86-BMI1NOTBM-NEXT:    testb $32, %al
-; X86-BMI1NOTBM-NEXT:    je .LBB45_2
+; X86-BMI1NOTBM-NEXT:    je .LBB47_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %esi, %ebx
 ; X86-BMI1NOTBM-NEXT:    xorl %esi, %esi
-; X86-BMI1NOTBM-NEXT:  .LBB45_2:
+; X86-BMI1NOTBM-NEXT:  .LBB47_2:
 ; X86-BMI1NOTBM-NEXT:    movl $64, %ecx
 ; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI1NOTBM-NEXT:    shldl %cl, %ebx, %esi
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %ebx
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    movl %ebx, %ebp
-; X86-BMI1NOTBM-NEXT:    jne .LBB45_4
+; X86-BMI1NOTBM-NEXT:    jne .LBB47_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
 ; X86-BMI1NOTBM-NEXT:    movl %esi, %ebp
-; X86-BMI1NOTBM-NEXT:  .LBB45_4:
+; X86-BMI1NOTBM-NEXT:  .LBB47_4:
 ; X86-BMI1NOTBM-NEXT:    movl %ebp, %esi
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    movl $0, %edi
-; X86-BMI1NOTBM-NEXT:    jne .LBB45_6
+; X86-BMI1NOTBM-NEXT:    jne .LBB47_6
 ; X86-BMI1NOTBM-NEXT:  # %bb.5:
 ; X86-BMI1NOTBM-NEXT:    movl %ebx, %edx
 ; X86-BMI1NOTBM-NEXT:    movl %esi, %edi
-; X86-BMI1NOTBM-NEXT:  .LBB45_6:
+; X86-BMI1NOTBM-NEXT:  .LBB47_6:
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %ebp, %edx
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    jne .LBB45_8
+; X86-BMI1NOTBM-NEXT:    jne .LBB47_8
 ; X86-BMI1NOTBM-NEXT:  # %bb.7:
 ; X86-BMI1NOTBM-NEXT:    movl %edx, %esi
-; X86-BMI1NOTBM-NEXT:  .LBB45_8:
+; X86-BMI1NOTBM-NEXT:  .LBB47_8:
 ; X86-BMI1NOTBM-NEXT:    subl $8, %esp
 ; X86-BMI1NOTBM-NEXT:    pushl %ecx
 ; X86-BMI1NOTBM-NEXT:    pushl %eax
@@ -6148,33 +6361,33 @@ define i64 @bextr64_d5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-BMI1BMI2-NEXT:    shrxl %eax, %edx, %edx
 ; X86-BMI1BMI2-NEXT:    xorl %esi, %esi
 ; X86-BMI1BMI2-NEXT:    testb $32, %al
-; X86-BMI1BMI2-NEXT:    je .LBB45_2
+; X86-BMI1BMI2-NEXT:    je .LBB47_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
 ; X86-BMI1BMI2-NEXT:    movl %edx, %edi
 ; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
-; X86-BMI1BMI2-NEXT:  .LBB45_2:
+; X86-BMI1BMI2-NEXT:  .LBB47_2:
 ; X86-BMI1BMI2-NEXT:    movl $64, %ecx
 ; X86-BMI1BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI1BMI2-NEXT:    shldl %cl, %edi, %edx
 ; X86-BMI1BMI2-NEXT:    shlxl %ecx, %edi, %ebx
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB45_4
+; X86-BMI1BMI2-NEXT:    je .LBB47_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
 ; X86-BMI1BMI2-NEXT:    movl %ebx, %edx
 ; X86-BMI1BMI2-NEXT:    movl $0, %ebx
-; X86-BMI1BMI2-NEXT:  .LBB45_4:
+; X86-BMI1BMI2-NEXT:  .LBB47_4:
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %edx, %edi
-; X86-BMI1BMI2-NEXT:    jne .LBB45_6
+; X86-BMI1BMI2-NEXT:    jne .LBB47_6
 ; X86-BMI1BMI2-NEXT:  # %bb.5:
 ; X86-BMI1BMI2-NEXT:    movl %edi, %esi
-; X86-BMI1BMI2-NEXT:  .LBB45_6:
+; X86-BMI1BMI2-NEXT:  .LBB47_6:
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %edx, %ebx
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1BMI2-NEXT:    jne .LBB45_8
+; X86-BMI1BMI2-NEXT:    jne .LBB47_8
 ; X86-BMI1BMI2-NEXT:  # %bb.7:
 ; X86-BMI1BMI2-NEXT:    movl %ebx, %edi
-; X86-BMI1BMI2-NEXT:  .LBB45_8:
+; X86-BMI1BMI2-NEXT:  .LBB47_8:
 ; X86-BMI1BMI2-NEXT:    subl $8, %esp
 ; X86-BMI1BMI2-NEXT:    pushl %ecx
 ; X86-BMI1BMI2-NEXT:    pushl %eax
-- 
GitLab


From 13d535a9c98a2586c48cbdd2842e5fa02b2ba839 Mon Sep 17 00:00:00 2001
From: Hans Wennborg <hans@hanshq.net>
Date: Mon, 5 Nov 2018 09:31:43 +0000
Subject: [PATCH 0955/1116] Exclude wasm target from Windows packaging due to
 PR39448

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346122 91177308-0d34-0410-b5e6-96231b3b80d8
---
 utils/release/build_llvm_package.bat | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/utils/release/build_llvm_package.bat b/utils/release/build_llvm_package.bat
index 30767f6f350..1dac7878ff3 100755
--- a/utils/release/build_llvm_package.bat
+++ b/utils/release/build_llvm_package.bat
@@ -44,8 +44,8 @@ svn.exe export -r %revision% http://llvm.org/svn/llvm-project/openmp/%branch% ll
 
 
 REM Setting CMAKE_CL_SHOWINCLUDES_PREFIX to work around PR27226.
-set cmake_flags=-DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=ON -DLLVM_USE_CRT_RELEASE=MT -DLLVM_INSTALL_TOOLCHAIN_ONLY=ON -DCLANG_FORMAT_VS_VERSION=%clang_format_vs_version% -DPACKAGE_VERSION=%package_version% -DCMAKE_CL_SHOWINCLUDES_PREFIX="Note: including file: "
-
+REM Excluding wasm target to work around PR39448.
+set cmake_flags=-DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=ON -DLLVM_USE_CRT_RELEASE=MT -DLLVM_INSTALL_TOOLCHAIN_ONLY=ON -DCLANG_FORMAT_VS_VERSION=%clang_format_vs_version% -DPACKAGE_VERSION=%package_version% -DCMAKE_CL_SHOWINCLUDES_PREFIX="Note: including file: " -DLLVM_TARGETS_TO_BUILD="AArch64;AMDGPU;ARM;BPF;Hexagon;Lanai;Mips;MSP430;NVPTX;PowerPC;Sparc;SystemZ;X86;XCore"
 REM TODO: Run all tests, including lld and compiler-rt.
 
 set "VSCMD_START_DIR=%CD%"
-- 
GitLab


From 76d01007eee4561a07d5eb608c0781316383e486 Mon Sep 17 00:00:00 2001
From: Sam Parker <sam.parker@arm.com>
Date: Mon, 5 Nov 2018 10:58:37 +0000
Subject: [PATCH 0956/1116] [ARM][ARMCGP] Remove unecessary zexts and truncs

r345840 slightly changed the way promotion happens which could
result in zext and truncs having the same source and destination
types. This fixes that issue.

We can now also remove the zext and trunc in the following case:
(zext (trunc (promoted op)), i32)

This means that we can no longer treat a value, that is only used by
a sink, to be safe to promote.

I've also added in some extra asserts and replaced a cast for a
dyn_cast.

Differential Revision: https://reviews.llvm.org/D54032


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346125 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/ARM/ARMCodeGenPrepare.cpp  | 101 +++++++++++++++++---------
 test/CodeGen/ARM/CGP/arm-cgp-calls.ll |   3 +-
 test/CodeGen/ARM/CGP/arm-cgp-casts.ll |  31 +++++++-
 3 files changed, 97 insertions(+), 38 deletions(-)

diff --git a/lib/Target/ARM/ARMCodeGenPrepare.cpp b/lib/Target/ARM/ARMCodeGenPrepare.cpp
index 0a6ea9dc325..8a7555bb95c 100644
--- a/lib/Target/ARM/ARMCodeGenPrepare.cpp
+++ b/lib/Target/ARM/ARMCodeGenPrepare.cpp
@@ -114,8 +114,8 @@ class IRPromoter {
   SmallPtrSet<Value*, 8> Promoted;
   Module *M = nullptr;
   LLVMContext &Ctx;
-  Type *ExtTy = nullptr;
-  Type *OrigTy = nullptr;
+  IntegerType *ExtTy = nullptr;
+  IntegerType *OrigTy = nullptr;
 
   void PrepareConstants(SmallPtrSetImpl<Value*> &Visited,
                          SmallPtrSetImpl<Instruction*> &SafeToPromote);
@@ -126,20 +126,12 @@ class IRPromoter {
                    SmallPtrSetImpl<Instruction*> &SafeToPromote);
   void TruncateSinks(SmallPtrSetImpl<Value*> &Sources,
                      SmallPtrSetImpl<Instruction*> &Sinks);
+  void Cleanup(SmallPtrSetImpl<Instruction*> &Sinks);
 
 public:
   IRPromoter(Module *M) : M(M), Ctx(M->getContext()),
                           ExtTy(Type::getInt32Ty(Ctx)) { }
 
-  void Cleanup() {
-    for (auto *I : InstsToRemove) {
-      LLVM_DEBUG(dbgs() << "ARM CGP: Removing " << *I << "\n");
-      I->dropAllReferences();
-      I->eraseFromParent();
-    }
-    InstsToRemove.clear();
-    NewInsts.clear();
-  }
 
   void Mutate(Type *OrigTy,
               SmallPtrSetImpl<Value*> &Visited,
@@ -401,17 +393,7 @@ static bool isPromotedResultSafe(Value *V) {
   if (generateSignBits(V))
     return false;
 
-  // If I is only being used by something that will require its value to be
-  // truncated, then we don't care about the promoted result.
-  auto *I = cast<Instruction>(V);
-  if (I->hasOneUse() && isSink(*I->use_begin())) {
-    LLVM_DEBUG(dbgs() << "ARM CGP: Only use is a sink: " << *V << "\n");
-    return true;
-  }
-
-  if (isa<OverflowingBinaryOperator>(I))
-    return false;
-  return true;
+  return !isa<OverflowingBinaryOperator>(V);
 }
 
 /// Return the intrinsic for the instruction that can perform the same
@@ -514,21 +496,24 @@ void IRPromoter::ExtendSources(SmallPtrSetImpl<Value*> &Sources) {
   IRBuilder<> Builder{Ctx};
 
   auto InsertZExt = [&](Value *V, Instruction *InsertPt) {
+    assert(V->getType() != ExtTy && "zext already extends to i32");
     LLVM_DEBUG(dbgs() << "ARM CGP: Inserting ZExt for " << *V << "\n");
     Builder.SetInsertPoint(InsertPt);
     if (auto *I = dyn_cast<Instruction>(V))
       Builder.SetCurrentDebugLocation(I->getDebugLoc());
-    auto *ZExt = cast<Instruction>(Builder.CreateZExt(V, ExtTy));
-    if (isa<Argument>(V))
-      ZExt->moveBefore(InsertPt);
-    else
-      ZExt->moveAfter(InsertPt);
+
+    Value *ZExt = Builder.CreateZExt(V, ExtTy);
+    if (auto *I = dyn_cast<Instruction>(ZExt)) {
+      if (isa<Argument>(V))
+        I->moveBefore(InsertPt);
+      else
+        I->moveAfter(InsertPt);
+      NewInsts.insert(I);
+    }
     ReplaceAllUsersOfWith(V, ZExt);
-    NewInsts.insert(ZExt);
     TruncTysMap[ZExt] = TruncTysMap[V];
   };
 
-
   // Now, insert extending instructions between the sources and their users.
   LLVM_DEBUG(dbgs() << "ARM CGP: Promoting sources:\n");
   for (auto V : Sources) {
@@ -664,6 +649,49 @@ void IRPromoter::TruncateSinks(SmallPtrSetImpl<Value*> &Sources,
       }
     }
   }
+
+}
+
+void IRPromoter::Cleanup(SmallPtrSetImpl<Instruction*> &Sinks) {
+  // Some zext sinks will now have become redundant, along with their trunc
+  // operands, so remove them.
+  for (auto I : Sinks) {
+    if (auto *ZExt = dyn_cast<ZExtInst>(I)) {
+      if (ZExt->getDestTy() != ExtTy)
+        continue;
+
+      Value *Src = ZExt->getOperand(0);
+      if (ZExt->getSrcTy() == ZExt->getDestTy()) {
+        LLVM_DEBUG(dbgs() << "ARM CGP: Removing unnecessary zext\n");
+        ReplaceAllUsersOfWith(ZExt, Src);
+        InstsToRemove.push_back(ZExt);
+        continue;
+      }
+
+      // For any truncs that we insert to handle zexts, we can replace the
+      // result of the zext with the input to the trunc.
+      if (NewInsts.count(Src) && isa<TruncInst>(Src)) {
+        auto *Trunc = cast<TruncInst>(Src);
+        assert(Trunc->getOperand(0)->getType() == ExtTy &&
+               "expected inserted trunc to be operating on i32");
+        LLVM_DEBUG(dbgs() << "ARM CGP: Replacing zext with trunc operand: "
+                   << *Trunc->getOperand(0));
+        ReplaceAllUsersOfWith(ZExt, Trunc->getOperand(0));
+        InstsToRemove.push_back(ZExt);
+      }
+    }
+  }
+
+  for (auto *I : InstsToRemove) {
+    LLVM_DEBUG(dbgs() << "ARM CGP: Removing " << *I << "\n");
+    I->dropAllReferences();
+    I->eraseFromParent();
+  }
+
+  InstsToRemove.clear();
+  NewInsts.clear();
+  TruncTysMap.clear();
+  Promoted.clear();
 }
 
 void IRPromoter::Mutate(Type *OrigTy,
@@ -673,7 +701,11 @@ void IRPromoter::Mutate(Type *OrigTy,
                         SmallPtrSetImpl<Instruction*> &SafeToPromote) {
   LLVM_DEBUG(dbgs() << "ARM CGP: Promoting use-def chains to from "
              << ARMCodeGenPrepare::TypeSize << " to 32-bits\n");
-  this->OrigTy = OrigTy;
+
+  assert(isa<IntegerType>(OrigTy) && "expected integer type");
+  this->OrigTy = cast<IntegerType>(OrigTy);
+  assert(OrigTy->getPrimitiveSizeInBits() < ExtTy->getPrimitiveSizeInBits() &&
+         "original type not smaller than extended type");
 
   // Cache original types.
   for (auto *V : Visited)
@@ -691,9 +723,13 @@ void IRPromoter::Mutate(Type *OrigTy,
   // promote.
   PromoteTree(Visited, Sources, Sinks, SafeToPromote);
 
-  // Finally, insert trunc instructions for use by calls, stores etc...
+  // Insert trunc instructions for use by calls, stores etc...
   TruncateSinks(Sources, Sinks);
 
+  // Finally, remove unecessary zexts and truncs, delete old instructions and
+  // clear the data structures.
+  Cleanup(Sinks);
+
   LLVM_DEBUG(dbgs() << "ARM CGP: Mutation complete:\n");
   LLVM_DEBUG(dbgs();
              for (auto *V : Sources)
@@ -943,9 +979,8 @@ bool ARMCodeGenPrepare::runOnFunction(Function &F) {
         }
       }
     }
-    Promoter->Cleanup();
     LLVM_DEBUG(if (verifyFunction(F, &dbgs())) {
-                dbgs();
+                dbgs() << F;
                 report_fatal_error("Broken function after type promotion");
                });
   }
diff --git a/test/CodeGen/ARM/CGP/arm-cgp-calls.ll b/test/CodeGen/ARM/CGP/arm-cgp-calls.ll
index 5972980b8d6..244c6bdbf30 100644
--- a/test/CodeGen/ARM/CGP/arm-cgp-calls.ll
+++ b/test/CodeGen/ARM/CGP/arm-cgp-calls.ll
@@ -144,10 +144,9 @@ if.then:                                          ; preds = %for.cond
   br label %for.cond.backedge
 }
 
-; Transform will bail because of the zext
 ; Check that d.sroa.0.0.be is promoted passed directly into the tail call.
 ; CHECK-LABEL: check_zext_phi_call_arg
-; CHECK: uxt
+; CHECK-NOT: uxt
 define i32 @check_zext_phi_call_arg() {
 entry:
   br label %for.cond
diff --git a/test/CodeGen/ARM/CGP/arm-cgp-casts.ll b/test/CodeGen/ARM/CGP/arm-cgp-casts.ll
index 23467c9a20f..431846482c6 100644
--- a/test/CodeGen/ARM/CGP/arm-cgp-casts.ll
+++ b/test/CodeGen/ARM/CGP/arm-cgp-casts.ll
@@ -232,9 +232,10 @@ exit:
 ; promote %1 for the call - unless we can generate a uadd16.
 ; CHECK-COMMON-LABEL: zext_load_sink_call:
 ; CHECK-COMMON: uxt
-; uadd16
-; cmp
-; CHECK-COMMON: uxt
+; CHECK-DSP-IMM: uadd16
+; CHECK-COMMON: cmp
+; CHECK-NODSP: uxt
+; CHECK-DSP-IMM-NOT: uxt
 define i32 @zext_load_sink_call(i16* %ptr, i16 %exp) {
 entry:
   %0 = load i16, i16* %ptr, align 4
@@ -338,3 +339,27 @@ declare i32 @dummy(i32, i32)
 @d_uch = hidden local_unnamed_addr global [16 x i8] zeroinitializer, align 1
 @sh1 = hidden local_unnamed_addr global i16 0, align 2
 @d_sh = hidden local_unnamed_addr global [16 x i16] zeroinitializer, align 2
+
+; CHECK-LABEL: two_stage_zext_trunc_mix
+; CHECK-NOT: uxt
+define i8* @two_stage_zext_trunc_mix(i32* %this, i32 %__pos1, i32 %__n1, i32** %__str, i32 %__pos2, i32 %__n2) {
+entry:
+  %__size_.i.i.i.i = bitcast i32** %__str to i8*
+  %0 = load i8, i8* %__size_.i.i.i.i, align 4
+  %1 = and i8 %0, 1
+  %tobool.i.i.i.i = icmp eq i8 %1, 0
+  %__size_.i5.i.i = getelementptr inbounds i32*, i32** %__str, i32 %__n1
+  %cast = bitcast i32** %__size_.i5.i.i to i32*
+  %2 = load i32, i32* %cast, align 4
+  %3 = lshr i8 %0, 1
+  %4 = zext i8 %3 to i32
+  %cond.i.i = select i1 %tobool.i.i.i.i, i32 %4, i32 %2
+  %__size_.i.i.i.i.i = bitcast i32* %this to i8*
+  %5 = load i8, i8* %__size_.i.i.i.i.i, align 4
+  %6 = and i8 %5, 1
+  %tobool.i.i.i.i.i = icmp eq i8 %6, 0
+  %7 = getelementptr inbounds i8, i8* %__size_.i.i.i.i, i32 %__pos1
+  %8 = getelementptr inbounds i8, i8* %__size_.i.i.i.i, i32 %__pos2
+  %res = select i1 %tobool.i.i.i.i.i,  i8* %7, i8* %8
+  ret i8* %res
+}
-- 
GitLab


From 4bf0a8ee3f8e9d0f069a650dc6ddbb9abcd9326b Mon Sep 17 00:00:00 2001
From: Sam Parker <sam.parker@arm.com>
Date: Mon, 5 Nov 2018 11:26:04 +0000
Subject: [PATCH 0957/1116] [ARM] Turn assert into condition in ARMCGP

Turn the assert in PrepareConstants into a conditon so that we can
handle mul instructions with negative immediates.

Differential Revision: https://reviews.llvm.org/D54094


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346126 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/ARM/ARMCodeGenPrepare.cpp  |  6 +++---
 test/CodeGen/ARM/CGP/arm-cgp-icmps.ll | 20 ++++++++++++++++++++
 2 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/lib/Target/ARM/ARMCodeGenPrepare.cpp b/lib/Target/ARM/ARMCodeGenPrepare.cpp
index 8a7555bb95c..0bd1f9ca639 100644
--- a/lib/Target/ARM/ARMCodeGenPrepare.cpp
+++ b/lib/Target/ARM/ARMCodeGenPrepare.cpp
@@ -443,7 +443,7 @@ IRPromoter::PrepareConstants(SmallPtrSetImpl<Value*> &Visited,
   //   > The operators that can wrap are: add, sub, mul and shl.
   //   > shl interprets its second operand as unsigned and if the first operand
   //     is an immediate, it will need zext to be nuw.
-  //   > I'm assuming mul cannot be nuw while using a negative immediate...
+  //   > I'm assuming mul has to interpret immediates as unsigned for nuw.
   //   > Which leaves the nuw add and sub to be handled; as with shl, if an
   //     immediate is used as operand 0, it will need zext to be nuw.
   // - We also allow add and sub to safely overflow in certain circumstances
@@ -468,8 +468,8 @@ IRPromoter::PrepareConstants(SmallPtrSetImpl<Value*> &Visited,
           break;
 
         unsigned Opc = I->getOpcode();
-        assert((Opc == Instruction::Add || Opc == Instruction::Sub) &&
-               "expected only an add or sub to use a negative imm");
+        if (Opc != Instruction::Add && Opc != Instruction::Sub)
+          continue;
 
         LLVM_DEBUG(dbgs() << "ARM CGP: Adjusting " << *I << "\n");
         auto *NewConst = ConstantInt::get(Ctx, Const->getValue().abs());
diff --git a/test/CodeGen/ARM/CGP/arm-cgp-icmps.ll b/test/CodeGen/ARM/CGP/arm-cgp-icmps.ll
index fca0be6da1f..8ff7db51e65 100644
--- a/test/CodeGen/ARM/CGP/arm-cgp-icmps.ll
+++ b/test/CodeGen/ARM/CGP/arm-cgp-icmps.ll
@@ -310,3 +310,23 @@ entry:
   ret i32 %conv1
 }
 
+; CHECK-COMMON-LABEL: mul_with_neg_imm
+; CHECK-COMMON-NOT: uxtb
+; CHECK-COMMON:     and [[BIT0:r[0-9]+]], r0, #1
+; CHECK-COMMON:     add.w [[MUL32:r[0-9]+]], [[BIT0]], [[BIT0]], lsl #5
+; CHECK-COMMON:     cmp.w r0, [[MUL32]], lsl #2
+define void @mul_with_neg_imm(i32, i32* %b) {
+entry:
+  %1 = trunc i32 %0 to i8
+  %2 = and i8 %1, 1
+  %conv.i = mul nuw i8 %2, -124
+  %tobool = icmp eq i8 %conv.i, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:
+  store i32 0, i32* %b, align 4
+  br label %if.end
+
+if.end:
+  ret void
+}
-- 
GitLab


From 040e4d75d3bfa4f06f77f9f93ddbd44f18d50f15 Mon Sep 17 00:00:00 2001
From: Francis Visoiu Mistrih <francisvm@yahoo.com>
Date: Mon, 5 Nov 2018 11:57:44 +0000
Subject: [PATCH 0958/1116] [CMake] Expose opt-remark tooling through
 libOptRemarks.dylib

* Create an install target for it
* Add it under tools/opt-remarks
* Add an export file for the dylib
* Install the llvm-c/OptRemarks.h header
* Add an API to query its version

rdar://45458839

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346127 91177308-0d34-0410-b5e6-96231b3b80d8
---
 cmake/modules/AddLLVM.cmake          |  1 +
 include/llvm-c/OptRemarks.h          |  7 +++++++
 tools/opt-remarks/CMakeLists.txt     | 22 ++++++++++++++++++++++
 tools/opt-remarks/OptRemarks.exports |  6 ++++++
 tools/opt-remarks/liboptremarks.cpp  | 18 ++++++++++++++++++
 5 files changed, 54 insertions(+)
 create mode 100644 tools/opt-remarks/CMakeLists.txt
 create mode 100644 tools/opt-remarks/OptRemarks.exports
 create mode 100644 tools/opt-remarks/liboptremarks.cpp

diff --git a/cmake/modules/AddLLVM.cmake b/cmake/modules/AddLLVM.cmake
index 410308d46d6..18997165558 100644
--- a/cmake/modules/AddLLVM.cmake
+++ b/cmake/modules/AddLLVM.cmake
@@ -635,6 +635,7 @@ macro(add_llvm_library name)
     set_property(GLOBAL APPEND PROPERTY LLVM_EXPORTS_BUILDTREE_ONLY ${name})
   else()
     if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY OR ${name} STREQUAL "LTO" OR
+        ${name} STREQUAL "OptRemarks" OR
         (LLVM_LINK_LLVM_DYLIB AND ${name} STREQUAL "LLVM"))
       set(install_dir lib${LLVM_LIBDIR_SUFFIX})
       if(ARG_SHARED OR BUILD_SHARED_LIBS)
diff --git a/include/llvm-c/OptRemarks.h b/include/llvm-c/OptRemarks.h
index f3449cc1b8c..6a90394e711 100644
--- a/include/llvm-c/OptRemarks.h
+++ b/include/llvm-c/OptRemarks.h
@@ -186,6 +186,13 @@ LLVMOptRemarkParserGetErrorMessage(LLVMOptRemarkParserRef Parser);
  */
 extern void LLVMOptRemarkParserDispose(LLVMOptRemarkParserRef Parser);
 
+/**
+ * Returns the version of the opt-remarks dylib.
+ *
+ * \since OPT_REMARKS_API_VERSION=0
+ */
+extern uint32_t LLVMOptRemarkVersion(void);
+
 /**
  * @} // endgoup LLVMCOPTREMARKS
  */
diff --git a/tools/opt-remarks/CMakeLists.txt b/tools/opt-remarks/CMakeLists.txt
new file mode 100644
index 00000000000..a87beae1e89
--- /dev/null
+++ b/tools/opt-remarks/CMakeLists.txt
@@ -0,0 +1,22 @@
+set(LLVM_LINK_COMPONENTS
+  OptRemarks
+  )
+
+set(SOURCES
+  liboptremarks.cpp
+  )
+
+set(LLVM_EXPORTED_SYMBOL_FILE ${CMAKE_CURRENT_SOURCE_DIR}/OptRemarks.exports)
+
+add_llvm_library(OptRemarks SHARED ${SOURCES})
+
+install(FILES ${LLVM_MAIN_INCLUDE_DIR}/llvm-c/OptRemarks.h
+  DESTINATION include/llvm-c
+  COMPONENT OptRemarks)
+
+if (APPLE)
+  set(OPTREMARKS_VERSION ${LLVM_VERSION_MAJOR})
+  set_property(TARGET OptRemarks APPEND_STRING PROPERTY
+              LINK_FLAGS
+              " -compatibility_version 1 -current_version ${OPTREMARKS_VERSION}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}")
+endif()
diff --git a/tools/opt-remarks/OptRemarks.exports b/tools/opt-remarks/OptRemarks.exports
new file mode 100644
index 00000000000..c3f678d754f
--- /dev/null
+++ b/tools/opt-remarks/OptRemarks.exports
@@ -0,0 +1,6 @@
+LLVMOptRemarkParserCreate
+LLVMOptRemarkParserGetNext
+LLVMOptRemarkParserHasError
+LLVMOptRemarkParserGetErrorMessage
+LLVMOptRemarkParserDispose
+LLVMOptRemarkVersion
diff --git a/tools/opt-remarks/liboptremarks.cpp b/tools/opt-remarks/liboptremarks.cpp
new file mode 100644
index 00000000000..13acada06ac
--- /dev/null
+++ b/tools/opt-remarks/liboptremarks.cpp
@@ -0,0 +1,18 @@
+//===-liboptremarks.cpp - LLVM Opt-Remarks Shared Library -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Provide a library to work with optimization remarks.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-c/OptRemarks.h"
+
+extern uint32_t LLVMOptRemarkVersion(void) {
+  return OPT_REMARKS_API_VERSION;
+}
-- 
GitLab


From 5efc1996ceec7395cc33325d8fef796f5b599205 Mon Sep 17 00:00:00 2001
From: Neil Henning <neil.henning@amd.com>
Date: Mon, 5 Nov 2018 12:04:48 +0000
Subject: [PATCH 0959/1116] [AMDGPU] Fix the new atomic optimizer in pixel
 shaders.

The new atomic optimizer I previously added in D51969 did not work
correctly when a pixel shader was using derivatives, and had helper
lanes active.

To fix this we add an llvm.amdgcn.ps.live call that guards a branch
around the entire atomic operation - ensuring that all helper lanes are
inactive within the wavefront when we compute our atomic results.

I've added a test case that can cause derivatives, and exposes the
problem.

Differential Revision: https://reviews.llvm.org/D53930

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346128 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp   | 41 ++++++++++++-
 .../atomic_optimizations_pixelshader.ll       | 59 +++++++++++++++++++
 2 files changed, 98 insertions(+), 2 deletions(-)
 create mode 100644 test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll

diff --git a/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index 7af13f83401..644e4fd558b 100644
--- a/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -53,6 +53,7 @@ private:
   const DataLayout *DL;
   DominatorTree *DT;
   bool HasDPP;
+  bool IsPixelShader;
 
   void optimizeAtomic(Instruction &I, Instruction::BinaryOps Op,
                       unsigned ValIdx, bool ValDivergent) const;
@@ -96,6 +97,7 @@ bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
   const TargetMachine &TM = TPC.getTM<TargetMachine>();
   const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
   HasDPP = ST.hasDPP();
+  IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;
 
   visit(F);
 
@@ -215,6 +217,31 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
   // Start building just before the instruction.
   IRBuilder<> B(&I);
 
+  // If we are in a pixel shader, because of how we have to mask out helper
+  // lane invocations, we need to record the entry and exit BB's.
+  BasicBlock *PixelEntryBB = nullptr;
+  BasicBlock *PixelExitBB = nullptr;
+
+  // If we're optimizing an atomic within a pixel shader, we need to wrap the
+  // entire atomic operation in a helper-lane check. We do not want any helper
+  // lanes that are around only for the purposes of derivatives to take part
+  // in any cross-lane communication, and we use a branch on whether the lane is
+  // live to do this.
+  if (IsPixelShader) {
+    // Record I's original position as the entry block.
+    PixelEntryBB = I.getParent();
+
+    Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {});
+    Instruction *const NonHelperTerminator =
+        SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);
+
+    // Record I's new position as the exit block.
+    PixelExitBB = I.getParent();
+
+    I.moveBefore(NonHelperTerminator);
+    B.SetInsertPoint(&I);
+  }
+
   Type *const Ty = I.getType();
   const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
   Type *const VecTy = VectorType::get(B.getInt32Ty(), 2);
@@ -398,8 +425,18 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
   // first lane, to get our lane's index into the atomic result.
   Value *const Result = B.CreateBinOp(Op, BroadcastI, LaneOffset);
 
-  // Replace the original atomic instruction with the new one.
-  I.replaceAllUsesWith(Result);
+  if (IsPixelShader) {
+    // Need a final PHI to reconverge to above the helper lane branch mask.
+    B.SetInsertPoint(PixelExitBB->getFirstNonPHI());
+
+    PHINode *const PHI = B.CreatePHI(Ty, 2);
+    PHI->addIncoming(UndefValue::get(Ty), PixelEntryBB);
+    PHI->addIncoming(Result, I.getParent());
+    I.replaceAllUsesWith(PHI);
+  } else {
+    // Replace the original atomic instruction with the new one.
+    I.replaceAllUsesWith(Result);
+  }
 
   // And delete the original.
   I.eraseFromParent();
diff --git a/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
new file mode 100644
index 00000000000..4b4b268df1f
--- /dev/null
+++ b/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
@@ -0,0 +1,59 @@
+; RUN: llc -mtriple=amdgcn-- -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s
+; RUN: llc  -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
+
+declare i1 @llvm.amdgcn.wqm.vote(i1)
+declare i32 @llvm.amdgcn.buffer.atomic.add(i32, <4 x i32>, i32, i32, i1)
+declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1)
+
+; Show that what the atomic optimization pass will do for raw buffers.
+
+; GCN-LABEL: add_i32_constant:
+; GCN-LABEL: BB0_1:
+; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
+; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
+; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
+; GCN: buffer_atomic_add v[[value]]
+; GCN: v_readfirstlane_b32 s{{[0-9]+}}, v[[value]]
+define amdgpu_ps void @add_i32_constant(<4 x i32> inreg %out, <4 x i32> inreg %inout) {
+entry:
+  %cond1 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
+  %old = call i32 @llvm.amdgcn.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i1 0)
+  %cond2 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
+  %cond = and i1 %cond1, %cond2
+  br i1 %cond, label %if, label %else
+if:
+  %bitcast = bitcast i32 %old to float
+  call void @llvm.amdgcn.buffer.store.f32(float %bitcast, <4 x i32> %out, i32 0, i32 0, i1 0, i1 0)
+  ret void
+else:
+  ret void
+}
+
+; GCN-LABEL: add_i32_varying:
+; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
+; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
+; GFX8MORE: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], exec_lo, 0
+; GFX8MORE: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], exec_hi, v[[mbcnt_lo]]
+; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
+; GFX8MORE: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
+; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
+; GFX8MORE: buffer_atomic_add v[[value]]
+; GFX8MORE: v_readfirstlane_b32 s{{[0-9]+}}, v[[value]]
+define amdgpu_ps void @add_i32_varying(<4 x i32> inreg %out, <4 x i32> inreg %inout, i32 %val) {
+entry:
+  %cond1 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
+  %old = call i32 @llvm.amdgcn.buffer.atomic.add(i32 %val, <4 x i32> %inout, i32 0, i32 0, i1 0)
+  %cond2 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
+  %cond = and i1 %cond1, %cond2
+  br i1 %cond, label %if, label %else
+if:
+  %bitcast = bitcast i32 %old to float
+  call void @llvm.amdgcn.buffer.store.f32(float %bitcast, <4 x i32> %out, i32 0, i32 0, i1 0, i1 0)
+  ret void
+else:
+  ret void
+}
-- 
GitLab


From 7051a2ef11c10809ad8e28dbc70e36c8147ac8eb Mon Sep 17 00:00:00 2001
From: Sam Parker <sam.parker@arm.com>
Date: Mon, 5 Nov 2018 14:17:27 +0000
Subject: [PATCH 0960/1116] [NFC][ARM] Adding extra test for ARM CGP

Added a reproducer that I received a while ago.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346132 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/ARM/CGP/arm-cgp-calls.ll | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/test/CodeGen/ARM/CGP/arm-cgp-calls.ll b/test/CodeGen/ARM/CGP/arm-cgp-calls.ll
index 244c6bdbf30..10cd6671ffc 100644
--- a/test/CodeGen/ARM/CGP/arm-cgp-calls.ll
+++ b/test/CodeGen/ARM/CGP/arm-cgp-calls.ll
@@ -166,6 +166,19 @@ if.then:                                          ; preds = %for.cond
   br label %for.cond.backedge
 }
 
+%struct.atomic_flag = type { i8 }
+
+; CHECK-LABEL: atomic_flag_test_and_set
+; CHECK-NOT: uxt
+define zeroext i1 @atomic_flag_test_and_set(%struct.atomic_flag* %object) {
+entry:
+  %_Value = getelementptr inbounds %struct.atomic_flag, %struct.atomic_flag* %object, i32 0, i32 0
+  %call = tail call arm_aapcscc zeroext i8 @__atomic_exchange_1(i8* %_Value, i8 zeroext 1, i32 5) #1
+  %0 = and i8 %call, 1
+  %tobool = icmp ne i8 %0, 0
+  ret i1 %tobool
+}
+
 declare i32 @assert(...)
 declare i8 @dummy_i8(i8)
 declare i8 @dummy2(i8*, i8, i8)
@@ -173,6 +186,7 @@ declare i16 @dummy3(i16)
 
 declare dso_local i32 @e(...) local_unnamed_addr #1
 declare dso_local zeroext i16 @f(...) local_unnamed_addr #1
+declare dso_local arm_aapcscc i8 @__atomic_exchange_1(i8*, i8, i32) local_unnamed_addr
 
 declare noalias i16** @func_62(i8 zeroext %p_63, i32 %p_64, i16 signext %p_65, i32* nocapture readnone %p_66)
 declare fastcc signext i16 @safe_sub_func_int16_t_s_s(i16 signext %si2)
-- 
GitLab


From a6a81aee0be7bd13e95d9a0f9a77fe36af232bbd Mon Sep 17 00:00:00 2001
From: Stefan Maksimovic <stefan.maksimovic@mips.com>
Date: Mon, 5 Nov 2018 14:37:41 +0000
Subject: [PATCH 0961/1116] [Mips] Supplement long branch pseudo instructions

Expand on LONG_BRANCH_LUi and LONG_BRANCH_(D)ADDiu pseudo
instructions by creating variants which support
less operands/accept GPR64Opnds as their operand in order
to appease the machine verifier pass.

Differential Revision: https://reviews.llvm.org/D53977


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346133 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/Mips/Mips64InstrInfo.td               |  7 +++++++
 lib/Target/Mips/MipsAsmPrinter.cpp               |  6 +++++-
 lib/Target/Mips/MipsBranchExpansion.cpp          | 16 ++++++++--------
 lib/Target/Mips/MipsInstrInfo.td                 | 10 ++++++++--
 lib/Target/Mips/MipsMCInstLower.cpp              |  4 ++++
 .../Mips/longbranch/long-branch-expansion-3.ll   | 16 ++++++++--------
 6 files changed, 40 insertions(+), 19 deletions(-)

diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td
index b5317bec70c..5729182deaf 100644
--- a/lib/Target/Mips/Mips64InstrInfo.td
+++ b/lib/Target/Mips/Mips64InstrInfo.td
@@ -416,6 +416,13 @@ let isCodeGenOnly = 1, rs = 0, shamt = 0 in {
 // long branches.  See the comment in file MipsLongBranch.cpp for detailed
 // explanation.
 
+// Expands to: lui $dst, %highest/%higher/%hi/%lo($tgt)
+def LONG_BRANCH_LUi2Op_64 : PseudoSE<(outs GPR64Opnd:$dst),
+  (ins brtarget:$tgt), []>, GPR_64;
+// Expands to: addiu $dst, %highest/%higher/%hi/%lo($tgt)
+def LONG_BRANCH_DADDiu2Op : PseudoSE<(outs GPR64Opnd:$dst),
+  (ins GPR64Opnd:$src, brtarget:$tgt), []>, GPR_64;
+
 // Expands to: daddiu $dst, $src, %PART($tgt - $baltgt)
 // where %PART may be %hi or %lo, depending on the relocation kind
 // that $tgt is annotated with.
diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp
index a19c97e2ef0..16a2481a00d 100644
--- a/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -1241,8 +1241,12 @@ void MipsAsmPrinter::NaClAlignIndirectJumpTargets(MachineFunction &MF) {
 
 bool MipsAsmPrinter::isLongBranchPseudo(int Opcode) const {
   return (Opcode == Mips::LONG_BRANCH_LUi
+          || Opcode == Mips::LONG_BRANCH_LUi2Op
+          || Opcode == Mips::LONG_BRANCH_LUi2Op_64
           || Opcode == Mips::LONG_BRANCH_ADDiu
-          || Opcode == Mips::LONG_BRANCH_DADDiu);
+          || Opcode == Mips::LONG_BRANCH_ADDiu2Op
+          || Opcode == Mips::LONG_BRANCH_DADDiu
+          || Opcode == Mips::LONG_BRANCH_DADDiu2Op);
 }
 
 // Force static initialization.
diff --git a/lib/Target/Mips/MipsBranchExpansion.cpp b/lib/Target/Mips/MipsBranchExpansion.cpp
index a8aca905965..e59267c4fd9 100644
--- a/lib/Target/Mips/MipsBranchExpansion.cpp
+++ b/lib/Target/Mips/MipsBranchExpansion.cpp
@@ -674,32 +674,32 @@ void MipsBranchExpansion::expandToLongBranch(MBBInfo &I) {
       // instructions, where we first load the offset into register, and then we
       // do branch register.
       if (ABI.IsN64()) {
-        BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_LUi))
-            .addReg(Mips::AT_64)
+        BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_LUi2Op_64),
+                Mips::AT_64)
             .addMBB(TgtMBB, MipsII::MO_HIGHEST);
-        BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_DADDiu),
+        BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_DADDiu2Op),
                 Mips::AT_64)
             .addReg(Mips::AT_64)
             .addMBB(TgtMBB, MipsII::MO_HIGHER);
         BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DSLL), Mips::AT_64)
             .addReg(Mips::AT_64)
             .addImm(16);
-        BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_DADDiu),
+        BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_DADDiu2Op),
                 Mips::AT_64)
             .addReg(Mips::AT_64)
             .addMBB(TgtMBB, MipsII::MO_ABS_HI);
         BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DSLL), Mips::AT_64)
             .addReg(Mips::AT_64)
             .addImm(16);
-        BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_DADDiu),
+        BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_DADDiu2Op),
                 Mips::AT_64)
             .addReg(Mips::AT_64)
             .addMBB(TgtMBB, MipsII::MO_ABS_LO);
       } else {
-        BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_LUi))
-            .addReg(Mips::AT)
+        BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_LUi2Op),
+                Mips::AT)
             .addMBB(TgtMBB, MipsII::MO_ABS_HI);
-        BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_ADDiu),
+        BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_ADDiu2Op),
                 Mips::AT)
             .addReg(Mips::AT)
             .addMBB(TgtMBB, MipsII::MO_ABS_LO);
diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td
index 0faa13d4d63..d9398b7d602 100644
--- a/lib/Target/Mips/MipsInstrInfo.td
+++ b/lib/Target/Mips/MipsInstrInfo.td
@@ -2002,13 +2002,19 @@ let isPseudo = 1, isCodeGenOnly = 1, hasNoSchedulingInfo = 1 in {
 // branches.  See the comment in file MipsLongBranch.cpp for detailed
 // explanation.
 
-// Expands to: lui $dst, %hi($tgt - $baltgt)
+// Expands to: lui $dst, %highest/%higher/%hi/%lo($tgt - $baltgt)
 def LONG_BRANCH_LUi : PseudoSE<(outs GPR32Opnd:$dst),
   (ins brtarget:$tgt, brtarget:$baltgt), []>;
+// Expands to: lui $dst, highest/%higher/%hi/%lo($tgt)
+def LONG_BRANCH_LUi2Op : PseudoSE<(outs GPR32Opnd:$dst),
+  (ins brtarget:$tgt), []>;
 
-// Expands to: addiu $dst, $src, %lo($tgt - $baltgt)
+// Expands to: addiu $dst, $src, %highest/%higher/%hi/%lo($tgt - $baltgt)
 def LONG_BRANCH_ADDiu : PseudoSE<(outs GPR32Opnd:$dst),
   (ins GPR32Opnd:$src, brtarget:$tgt, brtarget:$baltgt), []>;
+// Expands to: addiu $dst, $src, %highest/%higher/%hi/%lo($tgt)
+def LONG_BRANCH_ADDiu2Op : PseudoSE<(outs GPR32Opnd:$dst),
+  (ins GPR32Opnd:$src, brtarget:$tgt), []>;
 
 //===----------------------------------------------------------------------===//
 // Instruction definition
diff --git a/lib/Target/Mips/MipsMCInstLower.cpp b/lib/Target/Mips/MipsMCInstLower.cpp
index 2b7f6409992..46b37ceae39 100644
--- a/lib/Target/Mips/MipsMCInstLower.cpp
+++ b/lib/Target/Mips/MipsMCInstLower.cpp
@@ -298,12 +298,16 @@ bool MipsMCInstLower::lowerLongBranch(const MachineInstr *MI,
   default:
     return false;
   case Mips::LONG_BRANCH_LUi:
+  case Mips::LONG_BRANCH_LUi2Op:
+  case Mips::LONG_BRANCH_LUi2Op_64:
     lowerLongBranchLUi(MI, OutMI);
     return true;
   case Mips::LONG_BRANCH_ADDiu:
+  case Mips::LONG_BRANCH_ADDiu2Op:
     lowerLongBranchADDiu(MI, OutMI, Mips::ADDiu);
     return true;
   case Mips::LONG_BRANCH_DADDiu:
+  case Mips::LONG_BRANCH_DADDiu2Op:
     lowerLongBranchADDiu(MI, OutMI, Mips::DADDiu);
     return true;
   }
diff --git a/test/CodeGen/Mips/longbranch/long-branch-expansion-3.ll b/test/CodeGen/Mips/longbranch/long-branch-expansion-3.ll
index 6fa4d4d072a..1fa78942af4 100644
--- a/test/CodeGen/Mips/longbranch/long-branch-expansion-3.ll
+++ b/test/CodeGen/Mips/longbranch/long-branch-expansion-3.ll
@@ -1,12 +1,12 @@
-; RUN: llc -O0 -mtriple=mips-img-linux-gnu -mcpu=mips32r2 < %s -o - | FileCheck %s --check-prefixes=CHECK32R2
-; RUN: llc -O0 -mtriple=mips-img-linux-gnu -mcpu=mips32r6 < %s -o - | FileCheck %s --check-prefixes=CHECK32R6
-; RUN: llc -O0 -mtriple=mips-img-linux-gnu -mcpu=mips32r2 -mattr=+use-indirect-jump-hazard < %s -o - | FileCheck %s --check-prefixes=CHECK32-IJH
-; RUN: llc -O0 -mtriple=mips-img-linux-gnu -mcpu=mips32r6 -mattr=+use-indirect-jump-hazard < %s -o - | FileCheck %s --check-prefixes=CHECK32-IJH
+; RUN: llc -O0 -mtriple=mips-img-linux-gnu -mcpu=mips32r2 -verify-machineinstrs < %s -o - | FileCheck %s --check-prefixes=CHECK32R2
+; RUN: llc -O0 -mtriple=mips-img-linux-gnu -mcpu=mips32r6 -verify-machineinstrs < %s -o - | FileCheck %s --check-prefixes=CHECK32R6
+; RUN: llc -O0 -mtriple=mips-img-linux-gnu -mcpu=mips32r2 -verify-machineinstrs -mattr=+use-indirect-jump-hazard < %s -o - | FileCheck %s --check-prefixes=CHECK32-IJH
+; RUN: llc -O0 -mtriple=mips-img-linux-gnu -mcpu=mips32r6 -verify-machineinstrs -mattr=+use-indirect-jump-hazard < %s -o - | FileCheck %s --check-prefixes=CHECK32-IJH
 
-; RUN: llc -O0 -mtriple=mips64-img-linux-gnu -mcpu=mips64r2 < %s -o - | FileCheck %s --check-prefixes=CHECK64R2
-; RUN: llc -O0 -mtriple=mips64-img-linux-gnu -mcpu=mips64r6 < %s -o - | FileCheck %s --check-prefixes=CHECK64R6
-; RUN: llc -O0 -mtriple=mips64-img-linux-gnu -mcpu=mips64r2 -mattr=+use-indirect-jump-hazard < %s -o - | FileCheck %s --check-prefixes=CHECK64-IJH
-; RUN: llc -O0 -mtriple=mips64-img-linux-gnu -mcpu=mips64r6 -mattr=+use-indirect-jump-hazard < %s -o - | FileCheck %s --check-prefixes=CHECK64-IJH
+; RUN: llc -O0 -mtriple=mips64-img-linux-gnu -mcpu=mips64r2 -verify-machineinstrs < %s -o - | FileCheck %s --check-prefixes=CHECK64R2
+; RUN: llc -O0 -mtriple=mips64-img-linux-gnu -mcpu=mips64r6 -verify-machineinstrs < %s -o - | FileCheck %s --check-prefixes=CHECK64R6
+; RUN: llc -O0 -mtriple=mips64-img-linux-gnu -mcpu=mips64r2 -verify-machineinstrs -mattr=+use-indirect-jump-hazard < %s -o - | FileCheck %s --check-prefixes=CHECK64-IJH
+; RUN: llc -O0 -mtriple=mips64-img-linux-gnu -mcpu=mips64r6 -verify-machineinstrs -mattr=+use-indirect-jump-hazard < %s -o - | FileCheck %s --check-prefixes=CHECK64-IJH
 
 declare i32 @foo(...)
 
-- 
GitLab


From 5cc99f0569c2d6ce457d7cfef3e119f83c957db6 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Mon, 5 Nov 2018 14:54:34 +0000
Subject: [PATCH 0962/1116] [Inliner] Penalise inlining of calls with loops at
 Oz

We currently seem to underestimate the size of functions with loops in them,
both in terms of absolute code size and in the difficulties of dealing with
such code. (Calls, for example, can be tail merged to further reduce
codesize). At -Oz, we can then increase code size by inlining small loops
multiple times.

This attempts to penalise functions with loops at -Oz by adding a CallPenalty
for each top level loop in the function. It uses LI (and hence DT) to calculate
the number of loops. As we are dealing with minsize, the inline threshold is
small and functions at this point should be relatively small, making the
construction of these cheap.

Differential Revision: https://reviews.llvm.org/D52716


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346134 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Analysis/InlineCost.cpp                 | 20 +++++
 test/Transforms/Inline/ARM/loop-add.ll      | 95 +++++++++++++++++++++
 test/Transforms/Inline/ARM/loop-memcpy.ll   | 87 +++++++++++++++++++
 test/Transforms/Inline/ARM/loop-noinline.ll | 49 +++++++++++
 4 files changed, 251 insertions(+)
 create mode 100644 test/Transforms/Inline/ARM/loop-add.ll
 create mode 100644 test/Transforms/Inline/ARM/loop-memcpy.ll
 create mode 100644 test/Transforms/Inline/ARM/loop-noinline.ll

diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp
index 923dbe59e86..a3347dbcb93 100644
--- a/lib/Analysis/InlineCost.cpp
+++ b/lib/Analysis/InlineCost.cpp
@@ -23,6 +23,7 @@
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
@@ -30,6 +31,7 @@
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/GetElementPtrTypeIterator.h"
 #include "llvm/IR/GlobalAlias.h"
 #include "llvm/IR/InstVisitor.h"
@@ -1885,6 +1887,24 @@ InlineResult CallAnalyzer::analyzeCall(CallSite CS) {
   if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
     return "noduplicate";
 
+  // Loops generally act a lot like calls in that they act like barriers to
+  // movement, require a certain amount of setup, etc. So when optimising for
+  // size, we penalise any call sites that perform loops. We do this after all
+  // other costs here, so will likely only be dealing with relatively small
+  // functions (and hence DT and LI will hopefully be cheap).
+  if (Caller->optForMinSize()) {
+    DominatorTree DT(F);
+    LoopInfo LI(DT);
+    int NumLoops = 0;
+    for (Loop *L : LI) {
+      // Ignore loops that will not be executed
+      if (DeadBlocks.count(L->getHeader()))
+        continue;
+      NumLoops++;
+    }
+    Cost += NumLoops * InlineConstants::CallPenalty;
+  }
+
   // We applied the maximum possible vector bonus at the beginning. Now,
   // subtract the excess bonus, if any, from the Threshold before
   // comparing against Cost.
diff --git a/test/Transforms/Inline/ARM/loop-add.ll b/test/Transforms/Inline/ARM/loop-add.ll
new file mode 100644
index 00000000000..a4717bc95b7
--- /dev/null
+++ b/test/Transforms/Inline/ARM/loop-add.ll
@@ -0,0 +1,95 @@
+; RUN: opt -inline %s -S | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv7m-arm-none-eabi"
+
+; CHECK-LABEL: void @doCalls
+define void @doCalls(i8* nocapture %p1, i8* nocapture %p2, i32 %n) #0 {
+entry:
+  %div = lshr i32 %n, 1
+; CHECK: call void @LoopCall
+  tail call void @LoopCall(i8* %p1, i8* %p2, i32 %div) #0
+
+  %div2 = lshr i32 %n, 2
+; CHECK: call void @LoopCall
+  tail call void @LoopCall(i8* %p1, i8* %p2, i32 %div2) #0
+
+; CHECK-NOT: call void @LoopCall
+  tail call void @LoopCall(i8* %p2, i8* %p1, i32 0) #0
+
+; CHECK-NOT: call void @LoopCall_internal
+  tail call void @LoopCall_internal(i8* %p1, i8* %p2, i32 %div2) #0
+
+  %div3 = lshr i32 %n, 4
+; CHECK-NOT: call void @SimpleCall
+  tail call void @SimpleCall(i8* %p2, i8* %p1, i32 %div3) #0
+  ret void
+}
+
+; CHECK-LABEL: define void @LoopCall
+define void @LoopCall(i8* nocapture %dest, i8* nocapture readonly %source, i32 %num) #0 {
+entry:
+  %c = icmp ne i32 %num, 0
+  br i1 %c, label %while.cond, label %while.end
+
+while.cond:                                       ; preds = %while.body, %entry
+  %num.addr.0 = phi i32 [ %num, %entry ], [ %dec, %while.body ]
+  %p_dest.0 = phi i8* [ %dest, %entry ], [ %incdec.ptr2, %while.body ]
+  %p_source.0 = phi i8* [ %source, %entry ], [ %incdec.ptr, %while.body ]
+  %cmp = icmp eq i32 %num.addr.0, 0
+  br i1 %cmp, label %while.end, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %incdec.ptr = getelementptr inbounds i8, i8* %p_source.0, i32 1
+  %0 = load i8, i8* %p_source.0, align 1
+  %1 = trunc i32 %num.addr.0 to i8
+  %conv1 = add i8 %0, %1
+  %incdec.ptr2 = getelementptr inbounds i8, i8* %p_dest.0, i32 1
+  store i8 %conv1, i8* %p_dest.0, align 1
+  %dec = add i32 %num.addr.0, -1
+  br label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  ret void
+}
+
+; CHECK-LABEL-NOT: define void @LoopCall_internal
+define internal void @LoopCall_internal(i8* nocapture %dest, i8* nocapture readonly %source, i32 %num) #0 {
+entry:
+  %c = icmp ne i32 %num, 0
+  br i1 %c, label %while.cond, label %while.end
+
+while.cond:                                       ; preds = %while.body, %entry
+  %num.addr.0 = phi i32 [ %num, %entry ], [ %dec, %while.body ]
+  %p_dest.0 = phi i8* [ %dest, %entry ], [ %incdec.ptr2, %while.body ]
+  %p_source.0 = phi i8* [ %source, %entry ], [ %incdec.ptr, %while.body ]
+  %cmp = icmp eq i32 %num.addr.0, 0
+  br i1 %cmp, label %while.end, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %incdec.ptr = getelementptr inbounds i8, i8* %p_source.0, i32 1
+  %0 = load i8, i8* %p_source.0, align 1
+  %1 = trunc i32 %num.addr.0 to i8
+  %conv1 = add i8 %0, %1
+  %incdec.ptr2 = getelementptr inbounds i8, i8* %p_dest.0, i32 1
+  store i8 %conv1, i8* %p_dest.0, align 1
+  %dec = add i32 %num.addr.0, -1
+  br label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  ret void
+}
+
+; CHECK-LABEL: define void @SimpleCall
+define void @SimpleCall(i8* nocapture %dest, i8* nocapture readonly %source, i32 %num) #0 {
+entry:
+  %arrayidx = getelementptr inbounds i8, i8* %source, i32 %num
+  %0 = load i8, i8* %arrayidx, align 1
+  %1 = xor i8 %0, 127
+  %arrayidx2 = getelementptr inbounds i8, i8* %dest, i32 %num
+  store i8 %1, i8* %arrayidx2, align 1
+  ret void
+}
+
+attributes #0 = { minsize optsize }
+
diff --git a/test/Transforms/Inline/ARM/loop-memcpy.ll b/test/Transforms/Inline/ARM/loop-memcpy.ll
new file mode 100644
index 00000000000..3b3625c6027
--- /dev/null
+++ b/test/Transforms/Inline/ARM/loop-memcpy.ll
@@ -0,0 +1,87 @@
+; RUN: opt -inline %s -S | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv7m-arm-none-eabi"
+
+; CHECK-LABEL: define void @matcpy
+define void @matcpy(i8* %dest, i8* %source, i32 %num) #0 {
+entry:
+  %0 = ptrtoint i8* %dest to i32
+  %1 = ptrtoint i8* %source to i32
+  %2 = xor i32 %0, %1
+  %3 = and i32 %2, 3
+  %cmp = icmp eq i32 %3, 0
+  br i1 %cmp, label %if.then, label %if.else20
+
+if.then:                                          ; preds = %entry
+  %sub = sub i32 0, %0
+  %and2 = and i32 %sub, 3
+  %add = or i32 %and2, 4
+  %cmp3 = icmp ugt i32 %add, %num
+  br i1 %cmp3, label %if.else, label %if.then4
+
+if.then4:                                         ; preds = %if.then
+  %sub5 = sub i32 %num, %and2
+  %shr = and i32 %sub5, -4
+  %sub7 = sub i32 %sub5, %shr
+  %tobool = icmp eq i32 %and2, 0
+  br i1 %tobool, label %if.end, label %if.then8
+
+if.then8:                                         ; preds = %if.then4
+; CHECK: call fastcc void @memcpy
+  call fastcc void @memcpy(i8* %dest, i8* %source, i32 %and2) #0
+  %add.ptr = getelementptr inbounds i8, i8* %dest, i32 %and2
+  %add.ptr9 = getelementptr inbounds i8, i8* %source, i32 %and2
+  br label %if.end
+
+if.end:                                           ; preds = %if.then4, %if.then8
+  %p_dest.0 = phi i8* [ %add.ptr, %if.then8 ], [ %dest, %if.then4 ]
+  %p_source.0 = phi i8* [ %add.ptr9, %if.then8 ], [ %source, %if.then4 ]
+  %tobool14 = icmp eq i32 %sub7, 0
+  br i1 %tobool14, label %if.end22, label %if.then15
+
+if.then15:                                        ; preds = %if.end
+  %add.ptr13 = getelementptr inbounds i8, i8* %p_source.0, i32 %shr
+  %add.ptr11 = getelementptr inbounds i8, i8* %p_dest.0, i32 %shr
+; CHECK: call fastcc void @memcpy
+  call fastcc void @memcpy(i8* %add.ptr11, i8* %add.ptr13, i32 %sub7) #0
+  br label %if.end22
+
+if.else:                                          ; preds = %if.then
+  call fastcc void @memcpy(i8* %dest, i8* %source, i32 %num) #0
+  br label %if.end22
+
+if.else20:                                        ; preds = %entry
+  call fastcc void @memcpy(i8* %dest, i8* %source, i32 %num) #0
+  br label %if.end22
+
+if.end22:                                         ; preds = %if.then15, %if.end, %if.else, %if.else20
+  ret void
+}
+
+; CHECK-LABEL: define internal void @memcpy
+define internal void @memcpy(i8* nocapture %dest, i8* nocapture readonly %source, i32 %num) #0 {
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %num.addr.0 = phi i32 [ %num, %entry ], [ %dec, %while.body ]
+  %p_dest.0 = phi i8* [ %dest, %entry ], [ %incdec.ptr1, %while.body ]
+  %p_source.0 = phi i8* [ %source, %entry ], [ %incdec.ptr, %while.body ]
+  %cmp = icmp eq i32 %num.addr.0, 0
+  br i1 %cmp, label %while.end, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %incdec.ptr = getelementptr inbounds i8, i8* %p_source.0, i32 1
+  %0 = load i8, i8* %p_source.0, align 1
+  %incdec.ptr1 = getelementptr inbounds i8, i8* %p_dest.0, i32 1
+  store i8 %0, i8* %p_dest.0, align 1
+  %dec = add i32 %num.addr.0, -1
+  br label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  ret void
+}
+
+attributes #0 = { minsize optsize }
+
diff --git a/test/Transforms/Inline/ARM/loop-noinline.ll b/test/Transforms/Inline/ARM/loop-noinline.ll
new file mode 100644
index 00000000000..8438d16b03e
--- /dev/null
+++ b/test/Transforms/Inline/ARM/loop-noinline.ll
@@ -0,0 +1,49 @@
+; RUN: opt -inline %s -S | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv7m-arm-none-eabi"
+
+; Check we don't inline loops at -Oz. They tend to be larger than we
+; expect.
+
+; CHECK: define i8* @H
+@digits = constant [16 x i8] c"0123456789ABCDEF", align 1
+define i8* @H(i8* %p, i32 %val, i32 %num) #0 {
+entry:
+  br label %do.body
+
+do.body:                                          ; preds = %do.body, %entry
+  %p.addr.0 = phi i8* [ %p, %entry ], [ %incdec.ptr, %do.body ]
+  %val.addr.0 = phi i32 [ %val, %entry ], [ %shl, %do.body ]
+  %num.addr.0 = phi i32 [ %num, %entry ], [ %dec, %do.body ]
+  %shr = lshr i32 %val.addr.0, 28
+  %arrayidx = getelementptr inbounds [16 x i8], [16 x i8]* @digits, i32 0, i32 %shr
+  %0 = load i8, i8* %arrayidx, align 1
+  %incdec.ptr = getelementptr inbounds i8, i8* %p.addr.0, i32 1
+  store i8 %0, i8* %p.addr.0, align 1
+  %shl = shl i32 %val.addr.0, 4
+  %dec = add i32 %num.addr.0, -1
+  %tobool = icmp eq i32 %dec, 0
+  br i1 %tobool, label %do.end, label %do.body
+
+do.end:                                           ; preds = %do.body
+  %scevgep = getelementptr i8, i8* %p, i32 %num
+  ret i8* %scevgep
+}
+
+define nonnull i8* @call1(i8* %p, i32 %val, i32 %num) #0 {
+entry:
+; CHECK: tail call i8* @H
+  %call = tail call i8* @H(i8* %p, i32 %val, i32 %num) #0
+  ret i8* %call
+}
+
+define nonnull i8* @call2(i8* %p, i32 %val) #0 {
+entry:
+; CHECK: tail call i8* @H
+  %call = tail call i8* @H(i8* %p, i32 %val, i32 32) #0
+  ret i8* %call
+}
+
+attributes #0 = { minsize optsize }
+
-- 
GitLab


From 74ece931185f4f4793320cbd8d8b568ae18afa3b Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Mon, 5 Nov 2018 15:08:36 +0000
Subject: [PATCH 0963/1116] [InstCombine] add tests for select with FP identity
 op; NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346136 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstCombine/select-binop-cmp.ll           | 64 +++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/test/Transforms/InstCombine/select-binop-cmp.ll b/test/Transforms/InstCombine/select-binop-cmp.ll
index 5609643235d..edbe310269a 100644
--- a/test/Transforms/InstCombine/select-binop-cmp.ll
+++ b/test/Transforms/InstCombine/select-binop-cmp.ll
@@ -166,6 +166,23 @@ define float @select_fadd_fcmp_2(float %x, float %y, float %v) {
   ret float %C
 }
 
+; TODO: This is logically equivalent to the previous test - fcmp ignores the sign of 0.0.
+
+define float @select_fadd_fcmp_2_poszero(float %x, float %y, float %v) {
+; CHECK-LABEL: @select_fadd_fcmp_2_poszero(
+; CHECK-NEXT:    [[A:%.*]] = fcmp une float [[X:%.*]], 0.000000e+00
+; CHECK-NEXT:    [[Z:%.*]] = fadd float [[V:%.*]], 0.000000e+00
+; CHECK-NEXT:    [[B:%.*]] = fadd float [[Z]], [[X]]
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Y:%.*]], float [[B]]
+; CHECK-NEXT:    ret float [[C]]
+;
+  %A = fcmp une float %x, 0.0
+  %z = fadd float %v, 0.0 ; cannot produce -0.0
+  %B = fadd float %z, %x
+  %C = select i1 %A, float %y, float %B
+  ret float %C
+}
+
 define float @select_fadd_fcmp_3(float %x, float %y) {
 ; CHECK-LABEL: @select_fadd_fcmp_3(
 ; CHECK-NEXT:    [[A:%.*]] = fcmp une float [[X:%.*]], -0.000000e+00
@@ -178,6 +195,21 @@ define float @select_fadd_fcmp_3(float %x, float %y) {
   ret float %C
 }
 
+; TODO: This is logically equivalent to the previous test - fcmp ignores the sign of 0.0.
+
+define float @select_fadd_fcmp_3_poszero(float %x, float %y) {
+; CHECK-LABEL: @select_fadd_fcmp_3_poszero(
+; CHECK-NEXT:    [[A:%.*]] = fcmp une float [[X:%.*]], 0.000000e+00
+; CHECK-NEXT:    [[B:%.*]] = fadd float [[X]], 6.000000e+00
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Y:%.*]], float [[B]]
+; CHECK-NEXT:    ret float [[C]]
+;
+  %A = fcmp une float %x, 0.0
+  %B = fadd float 6.0, %x
+  %C = select i1 %A, float %y, float %B
+  ret float %C
+}
+
 define float @select_fadd_fcmp_4(float %x, float %y, float %z) {
 ; CHECK-LABEL: @select_fadd_fcmp_4(
 ; CHECK-NEXT:    [[A:%.*]] = fcmp une float [[X:%.*]], -0.000000e+00
@@ -204,6 +236,23 @@ define float @select_fadd_fcmp_5(float %x, float %y, float %v) {
   ret float %C
 }
 
+; TODO: This is logically equivalent to the previous test - fcmp ignores the sign of 0.0.
+
+define float @select_fadd_fcmp_5_poszero(float %x, float %y, float %v) {
+; CHECK-LABEL: @select_fadd_fcmp_5_poszero(
+; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
+; CHECK-NEXT:    [[Z:%.*]] = fadd float [[V:%.*]], 0.000000e+00
+; CHECK-NEXT:    [[B:%.*]] = fadd float [[Z]], [[X]]
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[B]], float [[Y:%.*]]
+; CHECK-NEXT:    ret float [[C]]
+;
+  %A = fcmp oeq float %x, 0.0
+  %z = fadd float %v, 0.0 ; cannot produce -0.0
+  %B = fadd float %z, %x
+  %C = select i1 %A, float %B, float %y
+  ret float %C
+}
+
 define float @select_fadd_fcmp_6(float %x, float %y, float %z) {
 ; CHECK-LABEL: @select_fadd_fcmp_6(
 ; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], -0.000000e+00
@@ -216,6 +265,21 @@ define float @select_fadd_fcmp_6(float %x, float %y, float %z) {
   ret float %C
 }
 
+; TODO: This is logically equivalent to the previous test - fcmp ignores the sign of 0.0.
+
+define float @select_fadd_fcmp_6_poszero(float %x, float %y, float %z) {
+; CHECK-LABEL: @select_fadd_fcmp_6_poszero(
+; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
+; CHECK-NEXT:    [[B:%.*]] = fadd float [[X]], 6.000000e+00
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[B]], float [[Y:%.*]]
+; CHECK-NEXT:    ret float [[C]]
+;
+  %A = fcmp oeq float %x, 0.0
+  %B = fadd float %x, 6.0
+  %C = select i1 %A, float %B, float %y
+  ret float %C
+}
+
 define float @select_fmul_fcmp(float %x, float %y, float %z) {
 ; CHECK-LABEL: @select_fmul_fcmp(
 ; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], 1.000000e+00
-- 
GitLab


From 570bf673b0953f5809cec38e521969ad7b1bb90b Mon Sep 17 00:00:00 2001
From: Cameron McInally <cameron.mcinally@nyu.edu>
Date: Mon, 5 Nov 2018 15:28:10 +0000
Subject: [PATCH 0964/1116] [NFCI][FPEnv] Split constrained intrinsic tests

The constrained intrinsic tests have grown in number. Split off
the FMA tests into their own file to reduce double coverage.

Differential Revision: https://reviews.llvm.org/D53932


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346137 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../vector-constrained-fp-intrinsics-fma.ll   |  151 +
 .../X86/vector-constrained-fp-intrinsics.ll   | 4759 ++++++-----------
 2 files changed, 1650 insertions(+), 3260 deletions(-)
 create mode 100644 test/CodeGen/X86/vector-constrained-fp-intrinsics-fma.ll

diff --git a/test/CodeGen/X86/vector-constrained-fp-intrinsics-fma.ll b/test/CodeGen/X86/vector-constrained-fp-intrinsics-fma.ll
new file mode 100644
index 00000000000..e35e76d2f38
--- /dev/null
+++ b/test/CodeGen/X86/vector-constrained-fp-intrinsics-fma.ll
@@ -0,0 +1,151 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+fma < %s | FileCheck %s
+
+define <1 x float> @constrained_vector_fma_v1f32() {
+; CHECK-LABEL: constrained_vector_fma_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + mem
+; CHECK-NEXT:    retq
+entry:
+  %fma = call <1 x float> @llvm.experimental.constrained.fma.v1f32(
+           <1 x float> <float 0.5>,
+           <1 x float> <float 2.5>,
+           <1 x float> <float 4.5>,
+           metadata !"round.dynamic",
+           metadata !"fpexcept.strict")
+  ret <1 x float> %fma
+}
+
+define <2 x double> @constrained_vector_fma_v2f64() {
+; CHECK-LABEL: constrained_vector_fma_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmovapd {{.*#+}} xmm1 = [1.5E+0,5.0E-1]
+; CHECK-NEXT:    vmovapd {{.*#+}} xmm0 = [3.5E+0,2.5E+0]
+; CHECK-NEXT:    vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + mem
+; CHECK-NEXT:    retq
+entry:
+  %fma = call <2 x double> @llvm.experimental.constrained.fma.v2f64(
+           <2 x double> <double 1.5, double 0.5>,
+           <2 x double> <double 3.5, double 2.5>,
+           <2 x double> <double 5.5, double 4.5>,
+           metadata !"round.dynamic",
+           metadata !"fpexcept.strict")
+  ret <2 x double> %fma
+}
+
+define <3 x float> @constrained_vector_fma_v3f32() {
+; CHECK-LABEL: constrained_vector_fma_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vfmadd213ss {{.*#+}} xmm1 = (xmm0 * xmm1) + mem
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vfmadd213ss {{.*#+}} xmm2 = (xmm0 * xmm2) + mem
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vfmadd213ss {{.*#+}} xmm3 = (xmm0 * xmm3) + mem
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0],xmm3[0],xmm2[2,3]
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; CHECK-NEXT:    retq
+entry:
+  %fma = call <3 x float> @llvm.experimental.constrained.fma.v3f32(
+           <3 x float> <float 2.5, float 1.5, float 0.5>,
+           <3 x float> <float 5.5, float 4.5, float 3.5>,
+           <3 x float> <float 8.5, float 7.5, float 6.5>,
+           metadata !"round.dynamic",
+           metadata !"fpexcept.strict")
+  ret <3 x float> %fma
+}
+
+define <3 x double> @constrained_vector_fma_v3f64() {
+; CHECK-LABEL: constrained_vector_fma_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    vfmadd213sd {{.*#+}} xmm1 = (xmm0 * xmm1) + mem
+; CHECK-NEXT:    vmovapd {{.*#+}} xmm0 = [2.5E+0,1.5E+0]
+; CHECK-NEXT:    vmovapd {{.*#+}} xmm2 = [5.5E+0,4.5E+0]
+; CHECK-NEXT:    vfmadd213pd {{.*#+}} xmm2 = (xmm0 * xmm2) + mem
+; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm0
+; CHECK-NEXT:    retq
+entry:
+  %fma = call <3 x double> @llvm.experimental.constrained.fma.v3f64(
+           <3 x double> <double 2.5, double 1.5, double 0.5>,
+           <3 x double> <double 5.5, double 4.5, double 3.5>,
+           <3 x double> <double 8.5, double 7.5, double 6.5>,
+           metadata !"round.dynamic",
+           metadata !"fpexcept.strict")
+  ret <3 x double> %fma
+}
+
+define <4 x double> @constrained_vector_fma_v4f64() {
+; CHECK-LABEL: constrained_vector_fma_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmovapd {{.*#+}} ymm1 = [3.5E+0,2.5E+0,1.5E+0,5.0E-1]
+; CHECK-NEXT:    vmovapd {{.*#+}} ymm0 = [7.5E+0,6.5E+0,5.5E+0,4.5E+0]
+; CHECK-NEXT:    vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + mem
+; CHECK-NEXT:    retq
+entry:
+  %fma = call <4 x double> @llvm.experimental.constrained.fma.v4f64(
+           <4 x double> <double 3.5, double 2.5, double 1.5, double 0.5>,
+           <4 x double> <double 7.5, double 6.5, double 5.5, double 4.5>,
+           <4 x double> <double 11.5, double 10.5, double 9.5, double 8.5>,
+           metadata !"round.dynamic",
+           metadata !"fpexcept.strict")
+  ret <4 x double> %fma
+}
+
+define <4 x float> @constrained_vector_fma_v4f32() {
+; CHECK-LABEL: constrained_vector_fma_v4f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmovaps {{.*#+}} xmm1 = [3.5E+0,2.5E+0,1.5E+0,5.0E-1]
+; CHECK-NEXT:    vmovaps {{.*#+}} xmm0 = [7.5E+0,6.5E+0,5.5E+0,4.5E+0]
+; CHECK-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + mem
+; CHECK-NEXT:    retq
+entry:
+  %fma = call <4 x float> @llvm.experimental.constrained.fma.v4f32(
+           <4 x float> <float 3.5, float 2.5, float 1.5, float 0.5>,
+           <4 x float> <float 7.5, float 6.5, float 5.5, float 4.5>,
+           <4 x float> <float 11.5, float 10.5, float 9.5, float 8.5>,
+           metadata !"round.dynamic",
+           metadata !"fpexcept.strict")
+  ret <4 x float> %fma
+}
+
+define <8 x float> @constrained_vector_fma_v8f32() {
+; CHECK-LABEL: constrained_vector_fma_v8f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [3.5E+0,2.5E+0,1.5E+0,5.0E-1,7.5E+0,6.5E+0,5.5E+0,4.5E+0]
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [7.5E+0,6.5E+0,5.5E+0,4.5E+0,1.15E+1,1.05E+1,9.5E+0,8.5E+0]
+; CHECK-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + mem
+; CHECK-NEXT:    retq
+entry:
+  %fma = call <8 x float> @llvm.experimental.constrained.fma.v8f32(
+           <8 x float> <float 3.5, float 2.5, float 1.5, float 0.5,
+                        float 7.5, float 6.5, float 5.5, float 4.5>,
+           <8 x float> <float 7.5, float 6.5, float 5.5, float 4.5,
+                        float 11.5, float 10.5, float 9.5, float 8.5>,
+           <8 x float> <float 11.5, float 10.5, float 9.5, float 8.5,
+                        float 15.5, float 14.5, float 13.5, float 12.5>,
+           metadata !"round.dynamic",
+           metadata !"fpexcept.strict")
+  ret <8 x float> %fma
+}
+
+; Single width declarations
+declare <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double>, <2 x double>, <2 x double>, metadata, metadata)
+declare <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, metadata, metadata)
+
+; Scalar width declarations
+declare <1 x float> @llvm.experimental.constrained.fma.v1f32(<1 x float>, <1 x float>, <1 x float>, metadata, metadata)
+
+; Illegal width declarations
+declare <3 x float> @llvm.experimental.constrained.fma.v3f32(<3 x float>, <3 x float>, <3 x float>, metadata, metadata)
+declare <3 x double> @llvm.experimental.constrained.fma.v3f64(<3 x double>, <3 x double>, <3 x double>, metadata, metadata)
+
+; Double width declarations
+declare <4 x double> @llvm.experimental.constrained.fma.v4f64(<4 x double>, <4 x double>, <4 x double>, metadata, metadata)
+declare <8 x float> @llvm.experimental.constrained.fma.v8f32(<8 x float>, <8 x float>, <8 x float>, metadata, metadata)
diff --git a/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
index ad07e9ba12c..59a1729cc05 100644
--- a/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
+++ b/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
@@ -1,19 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -O3 -mtriple=x86_64-pc-linux < %s | FileCheck --check-prefix=COMMON --check-prefix=NO-FMA --check-prefix=FMACALL64 --check-prefix=FMACALL32 %s
-; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+fma < %s | FileCheck -check-prefix=COMMON --check-prefix=HAS-FMA --check-prefix=FMA64 --check-prefix=FMA32 %s
+; RUN: llc -O3 -mtriple=x86_64-pc-linux < %s | FileCheck %s
 
 define <1 x float> @constrained_vector_fdiv_v1f32() {
-; NO-FMA-LABEL: constrained_vector_fdiv_v1f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    divss {{.*}}(%rip), %xmm0
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fdiv_v1f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vdivss {{.*}}(%rip), %xmm0, %xmm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_fdiv_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    divss {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    retq
 entry:
   %div = call <1 x float> @llvm.experimental.constrained.fdiv.v1f32(
            <1 x float> <float 1.000000e+00>,
@@ -24,17 +17,11 @@ entry:
 }
 
 define <2 x double> @constrained_vector_fdiv_v2f64() {
-; NO-FMA-LABEL: constrained_vector_fdiv_v2f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    movapd {{.*#+}} xmm0 = [1.0E+0,2.0E+0]
-; NO-FMA-NEXT:    divpd {{.*}}(%rip), %xmm0
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fdiv_v2f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} xmm0 = [1.0E+0,2.0E+0]
-; HAS-FMA-NEXT:    vdivpd {{.*}}(%rip), %xmm0, %xmm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_fdiv_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movapd {{.*#+}} xmm0 = [1.0E+0,2.0E+0]
+; CHECK-NEXT:    divpd {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    retq
 entry:
   %div = call <2 x double> @llvm.experimental.constrained.fdiv.v2f64(
            <2 x double> <double 1.000000e+00, double 2.000000e+00>,
@@ -45,31 +32,18 @@ entry:
 }
 
 define <3 x float> @constrained_vector_fdiv_v3f32() {
-; NO-FMA-LABEL: constrained_vector_fdiv_v3f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    divss %xmm1, %xmm2
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    divss %xmm1, %xmm0
-; NO-FMA-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    divss %xmm1, %xmm3
-; NO-FMA-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; NO-FMA-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fdiv_v3f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vdivss %xmm0, %xmm1, %xmm1
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vdivss %xmm0, %xmm2, %xmm2
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vdivss %xmm0, %xmm3, %xmm0
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[2,3]
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_fdiv_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT:    divss %xmm1, %xmm2
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    divss %xmm1, %xmm0
+; CHECK-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; CHECK-NEXT:    divss %xmm1, %xmm3
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; CHECK-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; CHECK-NEXT:    retq
 entry:
   %div = call <3 x float> @llvm.experimental.constrained.fdiv.v3f32(
            <3 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>,
@@ -80,26 +54,17 @@ entry:
 }
 
 define <3 x double> @constrained_vector_fdiv_v3f64() {
-; NO-FMA-LABEL: constrained_vector_fdiv_v3f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    movapd {{.*#+}} xmm0 = [1.0E+0,2.0E+0]
-; NO-FMA-NEXT:    divpd {{.*}}(%rip), %xmm0
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    divsd {{.*}}(%rip), %xmm1
-; NO-FMA-NEXT:    movsd %xmm1, -{{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    movapd %xmm0, %xmm1
-; NO-FMA-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; NO-FMA-NEXT:    fldl -{{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fdiv_v3f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vdivsd {{.*}}(%rip), %xmm0, %xmm0
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} xmm1 = [1.0E+0,2.0E+0]
-; HAS-FMA-NEXT:    vdivpd {{.*}}(%rip), %xmm1, %xmm1
-; HAS-FMA-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_fdiv_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movapd {{.*#+}} xmm0 = [1.0E+0,2.0E+0]
+; CHECK-NEXT:    divpd {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    divsd {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    movsd %xmm1, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movapd %xmm0, %xmm1
+; CHECK-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; CHECK-NEXT:    fldl -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    retq
 entry:
   %div = call <3 x double> @llvm.experimental.constrained.fdiv.v3f64(
            <3 x double> <double 1.000000e+00, double 2.000000e+00, double 3.000000e+00>,
@@ -110,20 +75,14 @@ entry:
 }
 
 define <4 x double> @constrained_vector_fdiv_v4f64() {
-; NO-FMA-LABEL: constrained_vector_fdiv_v4f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    movapd {{.*#+}} xmm2 = [1.0E+1,1.0E+1]
-; NO-FMA-NEXT:    movapd {{.*#+}} xmm0 = [1.0E+0,2.0E+0]
-; NO-FMA-NEXT:    divpd %xmm2, %xmm0
-; NO-FMA-NEXT:    movapd {{.*#+}} xmm1 = [3.0E+0,4.0E+0]
-; NO-FMA-NEXT:    divpd %xmm2, %xmm1
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fdiv_v4f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} ymm0 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
-; HAS-FMA-NEXT:    vdivpd {{.*}}(%rip), %ymm0, %ymm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_fdiv_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movapd {{.*#+}} xmm2 = [1.0E+1,1.0E+1]
+; CHECK-NEXT:    movapd {{.*#+}} xmm0 = [1.0E+0,2.0E+0]
+; CHECK-NEXT:    divpd %xmm2, %xmm0
+; CHECK-NEXT:    movapd {{.*#+}} xmm1 = [3.0E+0,4.0E+0]
+; CHECK-NEXT:    divpd %xmm2, %xmm1
+; CHECK-NEXT:    retq
 entry:
   %div = call <4 x double> @llvm.experimental.constrained.fdiv.v4f64(
            <4 x double> <double 1.000000e+00, double 2.000000e+00,
@@ -136,27 +95,16 @@ entry:
 }
 
 define <1 x float> @constrained_vector_frem_v1f32() {
-; NO-FMA-LABEL: constrained_vector_frem_v1f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    pushq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 16
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmodf
-; NO-FMA-NEXT:    popq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_frem_v1f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    pushq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 16
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq fmodf
-; HAS-FMA-NEXT:    popq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_frem_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq fmodf
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %rem = call <1 x float> @llvm.experimental.constrained.frem.v1f32(
            <1 x float> <float 1.000000e+00>,
@@ -167,39 +115,22 @@ entry:
 }
 
 define <2 x double> @constrained_vector_frem_v2f64() {
-; NO-FMA-LABEL: constrained_vector_frem_v2f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq fmod
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq fmod
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_frem_v2f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 32
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq fmod
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq fmod
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    addq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_frem_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmod
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmod
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %rem = call <2 x double> @llvm.experimental.constrained.frem.v2f64(
            <2 x double> <double 1.000000e+00, double 2.000000e+00>,
@@ -210,52 +141,29 @@ entry:
 }
 
 define <3 x float> @constrained_vector_frem_v3f32() {
-; NO-FMA-LABEL: constrained_vector_frem_v3f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmodf
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmodf
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmodf
-; NO-FMA-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm1, %xmm0
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_frem_v3f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq fmodf
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq fmodf
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq fmodf
-; HAS-FMA-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; HAS-FMA-NEXT:    vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_frem_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq fmodf
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq fmodf
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq fmodf
+; CHECK-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %rem = call <3 x float> @llvm.experimental.constrained.frem.v3f32(
            <3 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>,
@@ -266,54 +174,30 @@ entry:
 }
 
 define <3 x double> @constrained_vector_frem_v3f64() {
-; NO-FMA-LABEL: constrained_vector_frem_v3f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq fmod
-; NO-FMA-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq fmod
-; NO-FMA-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq fmod
-; NO-FMA-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm1 = mem[0],zero
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_frem_v3f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 64
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq fmod
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq fmod
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    vzeroupper
-; HAS-FMA-NEXT:    callq fmod
-; HAS-FMA-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
-; HAS-FMA-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; HAS-FMA-NEXT:    addq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_frem_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmod
+; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmod
+; CHECK-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmod
+; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    # xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %rem = call <3 x double> @llvm.experimental.constrained.frem.v3f64(
            <3 x double> <double 1.000000e+00, double 2.000000e+00, double 3.000000e+00>,
@@ -324,62 +208,34 @@ entry:
 }
 
 define <4 x double> @constrained_vector_frem_v4f64() {
-; NO-FMA-LABEL: constrained_vector_frem_v4f64:
-; NO-FMA:       # %bb.0:
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq fmod
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq fmod
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq fmod
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq fmod
-; NO-FMA-NEXT:    movaps %xmm0, %xmm1
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_frem_v4f64:
-; HAS-FMA:       # %bb.0:
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq fmod
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq fmod
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq fmod
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq fmod
-; HAS-FMA-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_frem_v4f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmod
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmod
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmod
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmod
+; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
   %rem = call <4 x double> @llvm.experimental.constrained.frem.v4f64(
            <4 x double> <double 1.000000e+00, double 2.000000e+00,
                          double 3.000000e+00, double 4.000000e+00>,
@@ -391,17 +247,11 @@ define <4 x double> @constrained_vector_frem_v4f64() {
 }
 
 define <1 x float> @constrained_vector_fmul_v1f32() {
-; NO-FMA-LABEL: constrained_vector_fmul_v1f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    mulss {{.*}}(%rip), %xmm0
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fmul_v1f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_fmul_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    mulss {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    retq
 entry:
   %mul = call <1 x float> @llvm.experimental.constrained.fmul.v1f32(
            <1 x float> <float 0x7FF0000000000000>,
@@ -412,17 +262,11 @@ entry:
 }
 
 define <2 x double> @constrained_vector_fmul_v2f64() {
-; NO-FMA-LABEL: constrained_vector_fmul_v2f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    movapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308]
-; NO-FMA-NEXT:    mulpd {{.*}}(%rip), %xmm0
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fmul_v2f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308]
-; HAS-FMA-NEXT:    vmulpd {{.*}}(%rip), %xmm0, %xmm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_fmul_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308]
+; CHECK-NEXT:    mulpd {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    retq
 entry:
   %mul = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(
            <2 x double> <double 0x7FEFFFFFFFFFFFFF, double 0x7FEFFFFFFFFFFFFF>,
@@ -433,27 +277,17 @@ entry:
 }
 
 define <3 x float> @constrained_vector_fmul_v3f32() {
-; NO-FMA-LABEL: constrained_vector_fmul_v3f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    mulss %xmm1, %xmm2
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    mulss %xmm1, %xmm0
-; NO-FMA-NEXT:    mulss {{.*}}(%rip), %xmm1
-; NO-FMA-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; NO-FMA-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fmul_v3f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1
-; HAS-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm2
-; HAS-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[2,3]
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_fmul_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT:    mulss %xmm1, %xmm2
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    mulss %xmm1, %xmm0
+; CHECK-NEXT:    mulss {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; CHECK-NEXT:    retq
 entry:
   %mul = call <3 x float> @llvm.experimental.constrained.fmul.v3f32(
            <3 x float> <float 0x7FF0000000000000, float 0x7FF0000000000000,
@@ -465,26 +299,17 @@ entry:
 }
 
 define <3 x double> @constrained_vector_fmul_v3f64() {
-; NO-FMA-LABEL: constrained_vector_fmul_v3f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    movapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308]
-; NO-FMA-NEXT:    mulpd {{.*}}(%rip), %xmm0
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    mulsd {{.*}}(%rip), %xmm1
-; NO-FMA-NEXT:    movsd %xmm1, -{{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    movapd %xmm0, %xmm1
-; NO-FMA-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; NO-FMA-NEXT:    fldl -{{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fmul_v3f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmulsd {{.*}}(%rip), %xmm0, %xmm0
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} xmm1 = [1.7976931348623157E+308,1.7976931348623157E+308]
-; HAS-FMA-NEXT:    vmulpd {{.*}}(%rip), %xmm1, %xmm1
-; HAS-FMA-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_fmul_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308]
+; CHECK-NEXT:    mulpd {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    mulsd {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    movsd %xmm1, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movapd %xmm0, %xmm1
+; CHECK-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; CHECK-NEXT:    fldl -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    retq
 entry:
   %mul = call <3 x double> @llvm.experimental.constrained.fmul.v3f64(
            <3 x double> <double 0x7FEFFFFFFFFFFFFF, double 0x7FEFFFFFFFFFFFFF,
@@ -496,19 +321,13 @@ entry:
 }
 
 define <4 x double> @constrained_vector_fmul_v4f64() {
-; NO-FMA-LABEL: constrained_vector_fmul_v4f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    movapd {{.*#+}} xmm1 = [1.7976931348623157E+308,1.7976931348623157E+308]
-; NO-FMA-NEXT:    movapd {{.*#+}} xmm0 = [2.0E+0,3.0E+0]
-; NO-FMA-NEXT:    mulpd %xmm1, %xmm0
-; NO-FMA-NEXT:    mulpd {{.*}}(%rip), %xmm1
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fmul_v4f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} ymm0 = [1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308]
-; HAS-FMA-NEXT:    vmulpd {{.*}}(%rip), %ymm0, %ymm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_fmul_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movapd {{.*#+}} xmm1 = [1.7976931348623157E+308,1.7976931348623157E+308]
+; CHECK-NEXT:    movapd {{.*#+}} xmm0 = [2.0E+0,3.0E+0]
+; CHECK-NEXT:    mulpd %xmm1, %xmm0
+; CHECK-NEXT:    mulpd {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    retq
 entry:
   %mul = call <4 x double> @llvm.experimental.constrained.fmul.v4f64(
            <4 x double> <double 0x7FEFFFFFFFFFFFFF, double 0x7FEFFFFFFFFFFFFF,
@@ -521,17 +340,11 @@ entry:
 }
 
 define <1 x float> @constrained_vector_fadd_v1f32() {
-; NO-FMA-LABEL: constrained_vector_fadd_v1f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    addss {{.*}}(%rip), %xmm0
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fadd_v1f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vaddss {{.*}}(%rip), %xmm0, %xmm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_fadd_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    addss {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    retq
 entry:
   %add = call <1 x float> @llvm.experimental.constrained.fadd.v1f32(
            <1 x float> <float 0x7FF0000000000000>,
@@ -542,17 +355,11 @@ entry:
 }
 
 define <2 x double> @constrained_vector_fadd_v2f64() {
-; NO-FMA-LABEL: constrained_vector_fadd_v2f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    movapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308]
-; NO-FMA-NEXT:    addpd {{.*}}(%rip), %xmm0
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fadd_v2f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308]
-; HAS-FMA-NEXT:    vaddpd {{.*}}(%rip), %xmm0, %xmm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_fadd_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308]
+; CHECK-NEXT:    addpd {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    retq
 entry:
   %add = call <2 x double> @llvm.experimental.constrained.fadd.v2f64(
            <2 x double> <double 0x7FEFFFFFFFFFFFFF, double 0x7FEFFFFFFFFFFFFF>,
@@ -563,28 +370,17 @@ entry:
 }
 
 define <3 x float> @constrained_vector_fadd_v3f32() {
-; NO-FMA-LABEL: constrained_vector_fadd_v3f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    xorps %xmm1, %xmm1
-; NO-FMA-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    addss %xmm2, %xmm1
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    addss %xmm2, %xmm0
-; NO-FMA-NEXT:    addss {{.*}}(%rip), %xmm2
-; NO-FMA-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; NO-FMA-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fadd_v3f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0
-; HAS-FMA-NEXT:    vaddss {{.*}}(%rip), %xmm1, %xmm2
-; HAS-FMA-NEXT:    vaddss {{.*}}(%rip), %xmm1, %xmm1
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_fadd_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT:    addss %xmm2, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    addss %xmm2, %xmm0
+; CHECK-NEXT:    addss {{.*}}(%rip), %xmm2
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; CHECK-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    retq
 entry:
   %add = call <3 x float> @llvm.experimental.constrained.fadd.v3f32(
            <3 x float> <float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000,
@@ -596,26 +392,17 @@ entry:
 }
 
 define <3 x double> @constrained_vector_fadd_v3f64() {
-; NO-FMA-LABEL: constrained_vector_fadd_v3f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    movapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308]
-; NO-FMA-NEXT:    addpd {{.*}}(%rip), %xmm0
-; NO-FMA-NEXT:    xorpd %xmm1, %xmm1
-; NO-FMA-NEXT:    addsd {{.*}}(%rip), %xmm1
-; NO-FMA-NEXT:    movsd %xmm1, -{{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    movapd %xmm0, %xmm1
-; NO-FMA-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; NO-FMA-NEXT:    fldl -{{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fadd_v3f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; HAS-FMA-NEXT:    vaddsd {{.*}}(%rip), %xmm0, %xmm0
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} xmm1 = [1.7976931348623157E+308,1.7976931348623157E+308]
-; HAS-FMA-NEXT:    vaddpd {{.*}}(%rip), %xmm1, %xmm1
-; HAS-FMA-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_fadd_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308]
+; CHECK-NEXT:    addpd {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    xorpd %xmm1, %xmm1
+; CHECK-NEXT:    addsd {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    movsd %xmm1, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movapd %xmm0, %xmm1
+; CHECK-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; CHECK-NEXT:    fldl -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    retq
 entry:
   %add = call <3 x double> @llvm.experimental.constrained.fadd.v3f64(
            <3 x double> <double 0x7FEFFFFFFFFFFFFF, double 0x7FEFFFFFFFFFFFFF,
@@ -627,19 +414,13 @@ entry:
 }
 
 define <4 x double> @constrained_vector_fadd_v4f64() {
-; NO-FMA-LABEL: constrained_vector_fadd_v4f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    movapd {{.*#+}} xmm1 = [1.7976931348623157E+308,1.7976931348623157E+308]
-; NO-FMA-NEXT:    movapd {{.*#+}} xmm0 = [1.0E+0,1.0000000000000001E-1]
-; NO-FMA-NEXT:    addpd %xmm1, %xmm0
-; NO-FMA-NEXT:    addpd {{.*}}(%rip), %xmm1
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fadd_v4f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} ymm0 = [1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308]
-; HAS-FMA-NEXT:    vaddpd {{.*}}(%rip), %ymm0, %ymm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_fadd_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movapd {{.*#+}} xmm1 = [1.7976931348623157E+308,1.7976931348623157E+308]
+; CHECK-NEXT:    movapd {{.*#+}} xmm0 = [1.0E+0,1.0000000000000001E-1]
+; CHECK-NEXT:    addpd %xmm1, %xmm0
+; CHECK-NEXT:    addpd {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    retq
 entry:
   %add = call <4 x double> @llvm.experimental.constrained.fadd.v4f64(
            <4 x double> <double 0x7FEFFFFFFFFFFFFF, double 0x7FEFFFFFFFFFFFFF,
@@ -652,17 +433,11 @@ entry:
 }
 
 define <1 x float> @constrained_vector_fsub_v1f32() {
-; NO-FMA-LABEL: constrained_vector_fsub_v1f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    subss {{.*}}(%rip), %xmm0
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fsub_v1f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vsubss {{.*}}(%rip), %xmm0, %xmm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_fsub_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    subss {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    retq
 entry:
   %sub = call <1 x float> @llvm.experimental.constrained.fsub.v1f32(
            <1 x float> <float 0x7FF0000000000000>,
@@ -673,17 +448,11 @@ entry:
 }
 
 define <2 x double> @constrained_vector_fsub_v2f64() {
-; NO-FMA-LABEL: constrained_vector_fsub_v2f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    movapd {{.*#+}} xmm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308]
-; NO-FMA-NEXT:    subpd {{.*}}(%rip), %xmm0
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fsub_v2f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} xmm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308]
-; HAS-FMA-NEXT:    vsubpd {{.*}}(%rip), %xmm0, %xmm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_fsub_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movapd {{.*#+}} xmm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308]
+; CHECK-NEXT:    subpd {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    retq
 entry:
   %sub = call <2 x double> @llvm.experimental.constrained.fsub.v2f64(
            <2 x double> <double 0xFFEFFFFFFFFFFFFF, double 0xFFEFFFFFFFFFFFFF>,
@@ -694,29 +463,18 @@ entry:
 }
 
 define <3 x float> @constrained_vector_fsub_v3f32() {
-; NO-FMA-LABEL: constrained_vector_fsub_v3f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    xorps %xmm0, %xmm0
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movaps %xmm1, %xmm2
-; NO-FMA-NEXT:    subss %xmm0, %xmm2
-; NO-FMA-NEXT:    movaps %xmm1, %xmm0
-; NO-FMA-NEXT:    subss {{.*}}(%rip), %xmm0
-; NO-FMA-NEXT:    subss {{.*}}(%rip), %xmm1
-; NO-FMA-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; NO-FMA-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fsub_v3f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vsubss %xmm0, %xmm1, %xmm0
-; HAS-FMA-NEXT:    vsubss {{.*}}(%rip), %xmm1, %xmm2
-; HAS-FMA-NEXT:    vsubss {{.*}}(%rip), %xmm1, %xmm1
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_fsub_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movaps %xmm1, %xmm2
+; CHECK-NEXT:    subss %xmm0, %xmm2
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    subss {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    subss {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; CHECK-NEXT:    retq
 entry:
   %sub = call <3 x float> @llvm.experimental.constrained.fsub.v3f32(
            <3 x float> <float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000,
@@ -728,28 +486,18 @@ entry:
 }
 
 define <3 x double> @constrained_vector_fsub_v3f64() {
-; NO-FMA-LABEL: constrained_vector_fsub_v3f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    xorpd %xmm0, %xmm0
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    subsd %xmm0, %xmm1
-; NO-FMA-NEXT:    movapd {{.*#+}} xmm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308]
-; NO-FMA-NEXT:    subpd {{.*}}(%rip), %xmm0
-; NO-FMA-NEXT:    movsd %xmm1, -{{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    movapd %xmm0, %xmm1
-; NO-FMA-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; NO-FMA-NEXT:    fldl -{{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fsub_v3f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} xmm1 = [-1.7976931348623157E+308,-1.7976931348623157E+308]
-; HAS-FMA-NEXT:    vsubpd {{.*}}(%rip), %xmm1, %xmm1
-; HAS-FMA-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_fsub_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xorpd %xmm0, %xmm0
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    subsd %xmm0, %xmm1
+; CHECK-NEXT:    movapd {{.*#+}} xmm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308]
+; CHECK-NEXT:    subpd {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    movsd %xmm1, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movapd %xmm0, %xmm1
+; CHECK-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; CHECK-NEXT:    fldl -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    retq
 entry:
   %sub = call <3 x double> @llvm.experimental.constrained.fsub.v3f64(
            <3 x double> <double 0xFFEFFFFFFFFFFFFF, double 0xFFEFFFFFFFFFFFFF,
@@ -761,19 +509,13 @@ entry:
 }
 
 define <4 x double> @constrained_vector_fsub_v4f64() {
-; NO-FMA-LABEL: constrained_vector_fsub_v4f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    movapd {{.*#+}} xmm1 = [-1.7976931348623157E+308,-1.7976931348623157E+308]
-; NO-FMA-NEXT:    movapd %xmm1, %xmm0
-; NO-FMA-NEXT:    subpd {{.*}}(%rip), %xmm0
-; NO-FMA-NEXT:    subpd {{.*}}(%rip), %xmm1
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fsub_v4f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} ymm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308,-1.7976931348623157E+308,-1.7976931348623157E+308]
-; HAS-FMA-NEXT:    vsubpd {{.*}}(%rip), %ymm0, %ymm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_fsub_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movapd {{.*#+}} xmm1 = [-1.7976931348623157E+308,-1.7976931348623157E+308]
+; CHECK-NEXT:    movapd %xmm1, %xmm0
+; CHECK-NEXT:    subpd {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    subpd {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    retq
 entry:
   %sub = call <4 x double> @llvm.experimental.constrained.fsub.v4f64(
            <4 x double> <double 0xFFEFFFFFFFFFFFFF, double 0xFFEFFFFFFFFFFFFF,
@@ -785,365 +527,12 @@ entry:
   ret <4 x double> %sub
 }
 
-define <1 x float> @constrained_vector_fma_v1f32() {
-; NO-FMA-LABEL: constrained_vector_fma_v1f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    pushq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 16
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmaf
-; NO-FMA-NEXT:    popq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fma_v1f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + mem
-; HAS-FMA-NEXT:    retq
-entry:
-  %fma = call <1 x float> @llvm.experimental.constrained.fma.v1f32(
-           <1 x float> <float 0.5>,
-           <1 x float> <float 2.5>,
-           <1 x float> <float 4.5>,
-           metadata !"round.dynamic",
-           metadata !"fpexcept.strict")
-  ret <1 x float> %fma
-}
-
-define <2 x double> @constrained_vector_fma_v2f64() {
-; NO-FMA-LABEL: constrained_vector_fma_v2f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; NO-FMA-NEXT:    callq fma
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; NO-FMA-NEXT:    callq fma
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fma_v2f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} xmm1 = [1.5E+0,5.0E-1]
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} xmm0 = [3.5E+0,2.5E+0]
-; HAS-FMA-NEXT:    vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + mem
-; HAS-FMA-NEXT:    retq
-entry:
-  %fma = call <2 x double> @llvm.experimental.constrained.fma.v2f64(
-           <2 x double> <double 1.5, double 0.5>,
-           <2 x double> <double 3.5, double 2.5>,
-           <2 x double> <double 5.5, double 4.5>,
-           metadata !"round.dynamic",
-           metadata !"fpexcept.strict")
-  ret <2 x double> %fma
-}
-
-define <3 x float> @constrained_vector_fma_v3f32() {
-; NO-FMA-LABEL: constrained_vector_fma_v3f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmaf
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmaf
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmaf
-; NO-FMA-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm1, %xmm0
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fma_v3f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vfmadd213ss {{.*#+}} xmm1 = (xmm0 * xmm1) + mem
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vfmadd213ss {{.*#+}} xmm2 = (xmm0 * xmm2) + mem
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vfmadd213ss {{.*#+}} xmm3 = (xmm0 * xmm3) + mem
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0],xmm3[0],xmm2[2,3]
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; HAS-FMA-NEXT:    retq
-entry:
-  %fma = call <3 x float> @llvm.experimental.constrained.fma.v3f32(
-           <3 x float> <float 2.5, float 1.5, float 0.5>,
-           <3 x float> <float 5.5, float 4.5, float 3.5>,
-           <3 x float> <float 8.5, float 7.5, float 6.5>,
-           metadata !"round.dynamic",
-           metadata !"fpexcept.strict")
-  ret <3 x float> %fma
-}
-
-define <3 x double> @constrained_vector_fma_v3f64() {
-; NO-FMA-LABEL: constrained_vector_fma_v3f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; NO-FMA-NEXT:    callq fma
-; NO-FMA-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; NO-FMA-NEXT:    callq fma
-; NO-FMA-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; NO-FMA-NEXT:    callq fma
-; NO-FMA-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm1 = mem[0],zero
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fma_v3f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    vfmadd213sd {{.*#+}} xmm1 = (xmm0 * xmm1) + mem
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} xmm0 = [2.5E+0,1.5E+0]
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} xmm2 = [5.5E+0,4.5E+0]
-; HAS-FMA-NEXT:    vfmadd213pd {{.*#+}} xmm2 = (xmm0 * xmm2) + mem
-; HAS-FMA-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm0
-; HAS-FMA-NEXT:    retq
-entry:
-  %fma = call <3 x double> @llvm.experimental.constrained.fma.v3f64(
-           <3 x double> <double 2.5, double 1.5, double 0.5>,
-           <3 x double> <double 5.5, double 4.5, double 3.5>,
-           <3 x double> <double 8.5, double 7.5, double 6.5>,
-           metadata !"round.dynamic",
-           metadata !"fpexcept.strict")
-  ret <3 x double> %fma
-}
-
-define <4 x double> @constrained_vector_fma_v4f64() {
-; NO-FMA-LABEL: constrained_vector_fma_v4f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; NO-FMA-NEXT:    callq fma
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; NO-FMA-NEXT:    callq fma
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; NO-FMA-NEXT:    callq fma
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; NO-FMA-NEXT:    callq fma
-; NO-FMA-NEXT:    movaps %xmm0, %xmm1
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fma_v4f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} ymm1 = [3.5E+0,2.5E+0,1.5E+0,5.0E-1]
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} ymm0 = [7.5E+0,6.5E+0,5.5E+0,4.5E+0]
-; HAS-FMA-NEXT:    vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + mem
-; HAS-FMA-NEXT:    retq
-entry:
-  %fma = call <4 x double> @llvm.experimental.constrained.fma.v4f64(
-           <4 x double> <double 3.5, double 2.5, double 1.5, double 0.5>,
-           <4 x double> <double 7.5, double 6.5, double 5.5, double 4.5>,
-           <4 x double> <double 11.5, double 10.5, double 9.5, double 8.5>,
-           metadata !"round.dynamic",
-           metadata !"fpexcept.strict")
-  ret <4 x double> %fma
-}
-
-define <4 x float> @constrained_vector_fma_v4f32() {
-; NO-FMA-LABEL: constrained_vector_fma_v4f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmaf
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmaf
-; NO-FMA-NEXT:    unpcklps (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmaf
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmaf
-; NO-FMA-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fma_v4f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovaps {{.*#+}} xmm1 = [3.5E+0,2.5E+0,1.5E+0,5.0E-1]
-; HAS-FMA-NEXT:    vmovaps {{.*#+}} xmm0 = [7.5E+0,6.5E+0,5.5E+0,4.5E+0]
-; HAS-FMA-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + mem
-; HAS-FMA-NEXT:    retq
-entry:
-  %fma = call <4 x float> @llvm.experimental.constrained.fma.v4f32(
-           <4 x float> <float 3.5, float 2.5, float 1.5, float 0.5>,
-           <4 x float> <float 7.5, float 6.5, float 5.5, float 4.5>,
-           <4 x float> <float 11.5, float 10.5, float 9.5, float 8.5>,
-           metadata !"round.dynamic",
-           metadata !"fpexcept.strict")
-  ret <4 x float> %fma
-}
-
-define <8 x float> @constrained_vector_fma_v8f32() {
-; NO-FMA-LABEL: constrained_vector_fma_v8f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $56, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 64
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmaf
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmaf
-; NO-FMA-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmaf
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmaf
-; NO-FMA-NEXT:    unpcklps (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmaf
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmaf
-; NO-FMA-NEXT:    unpcklps (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmaf
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmaf
-; NO-FMA-NEXT:    movaps %xmm0, %xmm1
-; NO-FMA-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT:    addq $56, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fma_v8f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovaps {{.*#+}} ymm1 = [3.5E+0,2.5E+0,1.5E+0,5.0E-1,7.5E+0,6.5E+0,5.5E+0,4.5E+0]
-; HAS-FMA-NEXT:    vmovaps {{.*#+}} ymm0 = [7.5E+0,6.5E+0,5.5E+0,4.5E+0,1.15E+1,1.05E+1,9.5E+0,8.5E+0]
-; HAS-FMA-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + mem
-; HAS-FMA-NEXT:    retq
-entry:
-  %fma = call <8 x float> @llvm.experimental.constrained.fma.v8f32(
-           <8 x float> <float 3.5, float 2.5, float 1.5, float 0.5,
-                        float 7.5, float 6.5, float 5.5, float 4.5>,
-           <8 x float> <float 7.5, float 6.5, float 5.5, float 4.5,
-                        float 11.5, float 10.5, float 9.5, float 8.5>,
-           <8 x float> <float 11.5, float 10.5, float 9.5, float 8.5,
-                        float 15.5, float 14.5, float 13.5, float 12.5>,
-           metadata !"round.dynamic",
-           metadata !"fpexcept.strict")
-  ret <8 x float> %fma
-}
-
 define <1 x float> @constrained_vector_sqrt_v1f32() {
-; NO-FMA-LABEL: constrained_vector_sqrt_v1f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    sqrtss %xmm0, %xmm0
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_sqrt_v1f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_sqrt_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    sqrtss %xmm0, %xmm0
+; CHECK-NEXT:    retq
 entry:
   %sqrt = call <1 x float> @llvm.experimental.constrained.sqrt.v1f32(
                               <1 x float> <float 42.0>,
@@ -1153,15 +542,10 @@ entry:
 }
 
 define <2 x double> @constrained_vector_sqrt_v2f64() {
-; NO-FMA-LABEL: constrained_vector_sqrt_v2f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    sqrtpd {{.*}}(%rip), %xmm0
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_sqrt_v2f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vsqrtpd {{.*}}(%rip), %xmm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_sqrt_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    sqrtpd {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    retq
 entry:
   %sqrt = call <2 x double> @llvm.experimental.constrained.sqrt.v2f64(
                               <2 x double> <double 42.0, double 42.1>,
@@ -1171,29 +555,17 @@ entry:
 }
 
 define <3 x float> @constrained_vector_sqrt_v3f32() {
-; NO-FMA-LABEL: constrained_vector_sqrt_v3f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    sqrtss %xmm0, %xmm1
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    sqrtss %xmm0, %xmm0
-; NO-FMA-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    sqrtss %xmm2, %xmm2
-; NO-FMA-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; NO-FMA-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_sqrt_v3f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vsqrtss %xmm1, %xmm1, %xmm1
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vsqrtss %xmm2, %xmm2, %xmm2
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_sqrt_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    sqrtss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    sqrtss %xmm0, %xmm0
+; CHECK-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT:    sqrtss %xmm2, %xmm2
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; CHECK-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    retq
 entry:
   %sqrt = call <3 x float> @llvm.experimental.constrained.sqrt.v3f32(
                               <3 x float> <float 42.0, float 43.0, float 44.0>,
@@ -1203,24 +575,16 @@ entry:
 }
 
 define <3 x double> @constrained_vector_sqrt_v3f64() {
-; NO-FMA-LABEL: constrained_vector_sqrt_v3f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    sqrtsd %xmm0, %xmm1
-; NO-FMA-NEXT:    sqrtpd {{.*}}(%rip), %xmm0
-; NO-FMA-NEXT:    movsd %xmm1, -{{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    movapd %xmm0, %xmm1
-; NO-FMA-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; NO-FMA-NEXT:    fldl -{{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_sqrt_v3f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
-; HAS-FMA-NEXT:    vsqrtpd {{.*}}(%rip), %xmm1
-; HAS-FMA-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_sqrt_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    sqrtsd %xmm0, %xmm1
+; CHECK-NEXT:    sqrtpd {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    movsd %xmm1, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movapd %xmm0, %xmm1
+; CHECK-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; CHECK-NEXT:    fldl -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    retq
 entry:
   %sqrt = call <3 x double> @llvm.experimental.constrained.sqrt.v3f64(
                           <3 x double> <double 42.0, double 42.1, double 42.2>,
@@ -1230,17 +594,12 @@ entry:
 }
 
 define <4 x double> @constrained_vector_sqrt_v4f64() {
-; NO-FMA-LABEL: constrained_vector_sqrt_v4f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    sqrtpd {{.*}}(%rip), %xmm0
-; NO-FMA-NEXT:    sqrtpd {{.*}}(%rip), %xmm1
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_sqrt_v4f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vsqrtpd {{.*}}(%rip), %ymm0
-; HAS-FMA-NEXT:    retq
-entry:
+; CHECK-LABEL: constrained_vector_sqrt_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    sqrtpd {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    sqrtpd {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    retq
+ entry:
   %sqrt = call <4 x double> @llvm.experimental.constrained.sqrt.v4f64(
                               <4 x double> <double 42.0, double 42.1,
                                             double 42.2, double 42.3>,
@@ -1250,27 +609,16 @@ entry:
 }
 
 define <1 x float> @constrained_vector_pow_v1f32() {
-; NO-FMA-LABEL: constrained_vector_pow_v1f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    pushq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 16
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq powf
-; NO-FMA-NEXT:    popq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_pow_v1f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    pushq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 16
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq powf
-; HAS-FMA-NEXT:    popq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_pow_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq powf
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %pow = call <1 x float> @llvm.experimental.constrained.pow.v1f32(
                              <1 x float> <float 42.0>,
@@ -1281,39 +629,22 @@ entry:
 }
 
 define <2 x double> @constrained_vector_pow_v2f64() {
-; NO-FMA-LABEL: constrained_vector_pow_v2f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq pow
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq pow
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_pow_v2f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 32
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq pow
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq pow
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    addq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_pow_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq pow
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq pow
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %pow = call <2 x double> @llvm.experimental.constrained.pow.v2f64(
                              <2 x double> <double 42.1, double 42.2>,
@@ -1324,52 +655,29 @@ entry:
 }
 
 define <3 x float> @constrained_vector_pow_v3f32() {
-; NO-FMA-LABEL: constrained_vector_pow_v3f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq powf
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq powf
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq powf
-; NO-FMA-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm1, %xmm0
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_pow_v3f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq powf
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq powf
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq powf
-; HAS-FMA-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; HAS-FMA-NEXT:    vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_pow_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq powf
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq powf
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq powf
+; CHECK-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %pow = call <3 x float> @llvm.experimental.constrained.pow.v3f32(
                              <3 x float> <float 42.0, float 43.0, float 44.0>,
@@ -1380,54 +688,30 @@ entry:
 }
 
 define <3 x double> @constrained_vector_pow_v3f64() {
-; NO-FMA-LABEL: constrained_vector_pow_v3f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq pow
-; NO-FMA-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq pow
-; NO-FMA-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq pow
-; NO-FMA-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm1 = mem[0],zero
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_pow_v3f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 64
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq pow
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq pow
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    vzeroupper
-; HAS-FMA-NEXT:    callq pow
-; HAS-FMA-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
-; HAS-FMA-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; HAS-FMA-NEXT:    addq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_pow_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq pow
+; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq pow
+; CHECK-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq pow
+; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    # xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %pow = call <3 x double> @llvm.experimental.constrained.pow.v3f64(
                           <3 x double> <double 42.0, double 42.1, double 42.2>,
@@ -1438,62 +722,34 @@ entry:
 }
 
 define <4 x double> @constrained_vector_pow_v4f64() {
-; NO-FMA-LABEL: constrained_vector_pow_v4f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq pow
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq pow
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq pow
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq pow
-; NO-FMA-NEXT:    movaps %xmm0, %xmm1
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_pow_v4f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq pow
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq pow
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq pow
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq pow
-; HAS-FMA-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_pow_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq pow
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq pow
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq pow
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq pow
+; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %pow = call <4 x double> @llvm.experimental.constrained.pow.v4f64(
                              <4 x double> <double 42.1, double 42.2,
@@ -1506,27 +762,16 @@ entry:
 }
 
 define <1 x float> @constrained_vector_powi_v1f32() {
-; NO-FMA-LABEL: constrained_vector_powi_v1f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    pushq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 16
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movl $3, %edi
-; NO-FMA-NEXT:    callq __powisf2
-; NO-FMA-NEXT:    popq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_powi_v1f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    pushq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 16
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    movl $3, %edi
-; HAS-FMA-NEXT:    callq __powisf2
-; HAS-FMA-NEXT:    popq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_powi_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movl $3, %edi
+; CHECK-NEXT:    callq __powisf2
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %powi = call <1 x float> @llvm.experimental.constrained.powi.v1f32(
                               <1 x float> <float 42.0>,
@@ -1537,39 +782,22 @@ entry:
 }
 
 define <2 x double> @constrained_vector_powi_v2f64() {
-; NO-FMA-LABEL: constrained_vector_powi_v2f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movl $3, %edi
-; NO-FMA-NEXT:    callq __powidf2
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movl $3, %edi
-; NO-FMA-NEXT:    callq __powidf2
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_powi_v2f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 32
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    movl $3, %edi
-; HAS-FMA-NEXT:    callq __powidf2
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    movl $3, %edi
-; HAS-FMA-NEXT:    callq __powidf2
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    addq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_powi_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movl $3, %edi
+; CHECK-NEXT:    callq __powidf2
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movl $3, %edi
+; CHECK-NEXT:    callq __powidf2
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %powi = call <2 x double> @llvm.experimental.constrained.powi.v2f64(
                               <2 x double> <double 42.1, double 42.2>,
@@ -1580,52 +808,29 @@ entry:
 }
 
 define <3 x float> @constrained_vector_powi_v3f32() {
-; NO-FMA-LABEL: constrained_vector_powi_v3f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movl $3, %edi
-; NO-FMA-NEXT:    callq __powisf2
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movl $3, %edi
-; NO-FMA-NEXT:    callq __powisf2
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movl $3, %edi
-; NO-FMA-NEXT:    callq __powisf2
-; NO-FMA-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm1, %xmm0
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_powi_v3f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    movl $3, %edi
-; HAS-FMA-NEXT:    callq __powisf2
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    movl $3, %edi
-; HAS-FMA-NEXT:    callq __powisf2
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    movl $3, %edi
-; HAS-FMA-NEXT:    callq __powisf2
-; HAS-FMA-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; HAS-FMA-NEXT:    vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_powi_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movl $3, %edi
+; CHECK-NEXT:    callq __powisf2
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movl $3, %edi
+; CHECK-NEXT:    callq __powisf2
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movl $3, %edi
+; CHECK-NEXT:    callq __powisf2
+; CHECK-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %powi = call <3 x float> @llvm.experimental.constrained.powi.v3f32(
                               <3 x float> <float 42.0, float 43.0, float 44.0>,
@@ -1636,54 +841,30 @@ entry:
 }
 
 define <3 x double> @constrained_vector_powi_v3f64() {
-; NO-FMA-LABEL: constrained_vector_powi_v3f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movl $3, %edi
-; NO-FMA-NEXT:    callq __powidf2
-; NO-FMA-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movl $3, %edi
-; NO-FMA-NEXT:    callq __powidf2
-; NO-FMA-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movl $3, %edi
-; NO-FMA-NEXT:    callq __powidf2
-; NO-FMA-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm1 = mem[0],zero
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_powi_v3f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 64
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    movl $3, %edi
-; HAS-FMA-NEXT:    callq __powidf2
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    movl $3, %edi
-; HAS-FMA-NEXT:    callq __powidf2
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    movl $3, %edi
-; HAS-FMA-NEXT:    vzeroupper
-; HAS-FMA-NEXT:    callq __powidf2
-; HAS-FMA-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
-; HAS-FMA-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; HAS-FMA-NEXT:    addq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_powi_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movl $3, %edi
+; CHECK-NEXT:    callq __powidf2
+; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movl $3, %edi
+; CHECK-NEXT:    callq __powidf2
+; CHECK-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movl $3, %edi
+; CHECK-NEXT:    callq __powidf2
+; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    # xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %powi = call <3 x double> @llvm.experimental.constrained.powi.v3f64(
                           <3 x double> <double 42.0, double 42.1, double 42.2>,
@@ -1694,62 +875,34 @@ entry:
 }
 
 define <4 x double> @constrained_vector_powi_v4f64() {
-; NO-FMA-LABEL: constrained_vector_powi_v4f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movl $3, %edi
-; NO-FMA-NEXT:    callq __powidf2
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movl $3, %edi
-; NO-FMA-NEXT:    callq __powidf2
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movl $3, %edi
-; NO-FMA-NEXT:    callq __powidf2
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movl $3, %edi
-; NO-FMA-NEXT:    callq __powidf2
-; NO-FMA-NEXT:    movaps %xmm0, %xmm1
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_powi_v4f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    movl $3, %edi
-; HAS-FMA-NEXT:    callq __powidf2
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    movl $3, %edi
-; HAS-FMA-NEXT:    callq __powidf2
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    movl $3, %edi
-; HAS-FMA-NEXT:    callq __powidf2
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    movl $3, %edi
-; HAS-FMA-NEXT:    callq __powidf2
-; HAS-FMA-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_powi_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movl $3, %edi
+; CHECK-NEXT:    callq __powidf2
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movl $3, %edi
+; CHECK-NEXT:    callq __powidf2
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movl $3, %edi
+; CHECK-NEXT:    callq __powidf2
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movl $3, %edi
+; CHECK-NEXT:    callq __powidf2
+; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %powi = call <4 x double> @llvm.experimental.constrained.powi.v4f64(
                               <4 x double> <double 42.1, double 42.2,
@@ -1761,25 +914,15 @@ entry:
 }
 
 define <1 x float> @constrained_vector_sin_v1f32() {
-; NO-FMA-LABEL: constrained_vector_sin_v1f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    pushq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 16
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq sinf
-; NO-FMA-NEXT:    popq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_sin_v1f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    pushq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 16
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq sinf
-; HAS-FMA-NEXT:    popq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_sin_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq sinf
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %sin = call <1 x float> @llvm.experimental.constrained.sin.v1f32(
                              <1 x float> <float 42.0>,
@@ -1789,35 +932,20 @@ entry:
 }
 
 define <2 x double> @constrained_vector_sin_v2f64() {
-; NO-FMA-LABEL: constrained_vector_sin_v2f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq sin
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq sin
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_sin_v2f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 32
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq sin
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq sin
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    addq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_sin_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq sin
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq sin
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %sin = call <2 x double> @llvm.experimental.constrained.sin.v2f64(
                              <2 x double> <double 42.0, double 42.1>,
@@ -1827,46 +955,26 @@ entry:
 }
 
 define <3 x float> @constrained_vector_sin_v3f32() {
-; NO-FMA-LABEL: constrained_vector_sin_v3f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq sinf
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq sinf
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq sinf
-; NO-FMA-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm1, %xmm0
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_sin_v3f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq sinf
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq sinf
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq sinf
-; HAS-FMA-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; HAS-FMA-NEXT:    vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_sin_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq sinf
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq sinf
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq sinf
+; CHECK-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %sin = call <3 x float> @llvm.experimental.constrained.sin.v3f32(
                               <3 x float> <float 42.0, float 43.0, float 44.0>,
@@ -1876,48 +984,27 @@ entry:
 }
 
 define <3 x double> @constrained_vector_sin_v3f64() {
-; NO-FMA-LABEL: constrained_vector_sin_v3f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq sin
-; NO-FMA-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq sin
-; NO-FMA-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq sin
-; NO-FMA-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm1 = mem[0],zero
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_sin_v3f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 64
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq sin
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq sin
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vzeroupper
-; HAS-FMA-NEXT:    callq sin
-; HAS-FMA-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
-; HAS-FMA-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; HAS-FMA-NEXT:    addq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_sin_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq sin
+; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq sin
+; CHECK-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq sin
+; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    # xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %sin = call <3 x double> @llvm.experimental.constrained.sin.v3f64(
                           <3 x double> <double 42.0, double 42.1, double 42.2>,
@@ -1927,54 +1014,30 @@ entry:
 }
 
 define <4 x double> @constrained_vector_sin_v4f64() {
-; NO-FMA-LABEL: constrained_vector_sin_v4f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq sin
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq sin
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq sin
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq sin
-; NO-FMA-NEXT:    movaps %xmm0, %xmm1
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_sin_v4f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq sin
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq sin
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq sin
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq sin
-; HAS-FMA-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_sin_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq sin
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq sin
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq sin
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq sin
+; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %sin = call <4 x double> @llvm.experimental.constrained.sin.v4f64(
                              <4 x double> <double 42.0, double 42.1,
@@ -1985,25 +1048,15 @@ entry:
 }
 
 define <1 x float> @constrained_vector_cos_v1f32() {
-; NO-FMA-LABEL: constrained_vector_cos_v1f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    pushq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 16
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq cosf
-; NO-FMA-NEXT:    popq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_cos_v1f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    pushq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 16
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq cosf
-; HAS-FMA-NEXT:    popq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_cos_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq cosf
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %cos = call <1 x float> @llvm.experimental.constrained.cos.v1f32(
                              <1 x float> <float 42.0>,
@@ -2013,35 +1066,20 @@ entry:
 }
 
 define <2 x double> @constrained_vector_cos_v2f64() {
-; NO-FMA-LABEL: constrained_vector_cos_v2f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq cos
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq cos
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_cos_v2f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 32
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq cos
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq cos
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    addq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_cos_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq cos
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq cos
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %cos = call <2 x double> @llvm.experimental.constrained.cos.v2f64(
                              <2 x double> <double 42.0, double 42.1>,
@@ -2051,46 +1089,26 @@ entry:
 }
 
 define <3 x float> @constrained_vector_cos_v3f32() {
-; NO-FMA-LABEL: constrained_vector_cos_v3f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq cosf
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq cosf
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq cosf
-; NO-FMA-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm1, %xmm0
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_cos_v3f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq cosf
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq cosf
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq cosf
-; HAS-FMA-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; HAS-FMA-NEXT:    vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_cos_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq cosf
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq cosf
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq cosf
+; CHECK-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %cos = call <3 x float> @llvm.experimental.constrained.cos.v3f32(
                               <3 x float> <float 42.0, float 43.0, float 44.0>,
@@ -2100,48 +1118,27 @@ entry:
 }
 
 define <3 x double> @constrained_vector_cos_v3f64() {
-; NO-FMA-LABEL: constrained_vector_cos_v3f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq cos
-; NO-FMA-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq cos
-; NO-FMA-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq cos
-; NO-FMA-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm1 = mem[0],zero
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_cos_v3f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 64
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq cos
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq cos
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vzeroupper
-; HAS-FMA-NEXT:    callq cos
-; HAS-FMA-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
-; HAS-FMA-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; HAS-FMA-NEXT:    addq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_cos_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq cos
+; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq cos
+; CHECK-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq cos
+; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    # xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %cos = call <3 x double> @llvm.experimental.constrained.cos.v3f64(
                           <3 x double> <double 42.0, double 42.1, double 42.2>,
@@ -2151,54 +1148,30 @@ entry:
 }
 
 define <4 x double> @constrained_vector_cos_v4f64() {
-; NO-FMA-LABEL: constrained_vector_cos_v4f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq cos
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq cos
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq cos
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq cos
-; NO-FMA-NEXT:    movaps %xmm0, %xmm1
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_cos_v4f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq cos
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq cos
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq cos
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq cos
-; HAS-FMA-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_cos_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq cos
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq cos
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq cos
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq cos
+; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %cos = call <4 x double> @llvm.experimental.constrained.cos.v4f64(
                              <4 x double> <double 42.0, double 42.1,
@@ -2209,25 +1182,15 @@ entry:
 }
 
 define <1 x float> @constrained_vector_exp_v1f32() {
-; NO-FMA-LABEL: constrained_vector_exp_v1f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    pushq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 16
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq expf
-; NO-FMA-NEXT:    popq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_exp_v1f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    pushq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 16
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq expf
-; HAS-FMA-NEXT:    popq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_exp_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq expf
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %exp = call <1 x float> @llvm.experimental.constrained.exp.v1f32(
                              <1 x float> <float 42.0>,
@@ -2237,35 +1200,20 @@ entry:
 }
 
 define <2 x double> @constrained_vector_exp_v2f64() {
-; NO-FMA-LABEL: constrained_vector_exp_v2f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq exp
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq exp
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_exp_v2f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 32
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq exp
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq exp
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    addq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_exp_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq exp
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq exp
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %exp = call <2 x double> @llvm.experimental.constrained.exp.v2f64(
                              <2 x double> <double 42.0, double 42.1>,
@@ -2275,46 +1223,26 @@ entry:
 }
 
 define <3 x float> @constrained_vector_exp_v3f32() {
-; NO-FMA-LABEL: constrained_vector_exp_v3f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq expf
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq expf
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq expf
-; NO-FMA-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm1, %xmm0
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_exp_v3f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq expf
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq expf
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq expf
-; HAS-FMA-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; HAS-FMA-NEXT:    vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_exp_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq expf
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq expf
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq expf
+; CHECK-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %exp = call <3 x float> @llvm.experimental.constrained.exp.v3f32(
                               <3 x float> <float 42.0, float 43.0, float 44.0>,
@@ -2324,48 +1252,27 @@ entry:
 }
 
 define <3 x double> @constrained_vector_exp_v3f64() {
-; NO-FMA-LABEL: constrained_vector_exp_v3f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq exp
-; NO-FMA-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq exp
-; NO-FMA-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq exp
-; NO-FMA-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm1 = mem[0],zero
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_exp_v3f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 64
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq exp
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq exp
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vzeroupper
-; HAS-FMA-NEXT:    callq exp
-; HAS-FMA-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
-; HAS-FMA-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; HAS-FMA-NEXT:    addq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_exp_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq exp
+; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq exp
+; CHECK-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq exp
+; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    # xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %exp = call <3 x double> @llvm.experimental.constrained.exp.v3f64(
                           <3 x double> <double 42.0, double 42.1, double 42.2>,
@@ -2375,54 +1282,30 @@ entry:
 }
 
 define <4 x double> @constrained_vector_exp_v4f64() {
-; NO-FMA-LABEL: constrained_vector_exp_v4f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq exp
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq exp
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq exp
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq exp
-; NO-FMA-NEXT:    movaps %xmm0, %xmm1
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_exp_v4f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq exp
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq exp
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq exp
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq exp
-; HAS-FMA-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_exp_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq exp
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq exp
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq exp
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq exp
+; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %exp = call <4 x double> @llvm.experimental.constrained.exp.v4f64(
                              <4 x double> <double 42.0, double 42.1,
@@ -2433,25 +1316,15 @@ entry:
 }
 
 define <1 x float> @constrained_vector_exp2_v1f32() {
-; NO-FMA-LABEL: constrained_vector_exp2_v1f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    pushq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 16
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq exp2f
-; NO-FMA-NEXT:    popq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_exp2_v1f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    pushq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 16
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq exp2f
-; HAS-FMA-NEXT:    popq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_exp2_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq exp2f
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %exp2 = call <1 x float> @llvm.experimental.constrained.exp2.v1f32(
                              <1 x float> <float 42.0>,
@@ -2461,35 +1334,20 @@ entry:
 }
 
 define <2 x double> @constrained_vector_exp2_v2f64() {
-; NO-FMA-LABEL: constrained_vector_exp2_v2f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq exp2
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq exp2
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_exp2_v2f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 32
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq exp2
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq exp2
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    addq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_exp2_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq exp2
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq exp2
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %exp2 = call <2 x double> @llvm.experimental.constrained.exp2.v2f64(
                               <2 x double> <double 42.1, double 42.0>,
@@ -2499,46 +1357,26 @@ entry:
 }
 
 define <3 x float> @constrained_vector_exp2_v3f32() {
-; NO-FMA-LABEL: constrained_vector_exp2_v3f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq exp2f
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq exp2f
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq exp2f
-; NO-FMA-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm1, %xmm0
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_exp2_v3f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq exp2f
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq exp2f
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq exp2f
-; HAS-FMA-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; HAS-FMA-NEXT:    vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_exp2_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq exp2f
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq exp2f
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq exp2f
+; CHECK-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %exp2 = call <3 x float> @llvm.experimental.constrained.exp2.v3f32(
                               <3 x float> <float 42.0, float 43.0, float 44.0>,
@@ -2548,48 +1386,27 @@ entry:
 }
 
 define <3 x double> @constrained_vector_exp2_v3f64() {
-; NO-FMA-LABEL: constrained_vector_exp2_v3f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq exp2
-; NO-FMA-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq exp2
-; NO-FMA-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq exp2
-; NO-FMA-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm1 = mem[0],zero
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_exp2_v3f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 64
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq exp2
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq exp2
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vzeroupper
-; HAS-FMA-NEXT:    callq exp2
-; HAS-FMA-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
-; HAS-FMA-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; HAS-FMA-NEXT:    addq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_exp2_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq exp2
+; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq exp2
+; CHECK-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq exp2
+; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    # xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %exp2 = call <3 x double> @llvm.experimental.constrained.exp2.v3f64(
                           <3 x double> <double 42.0, double 42.1, double 42.2>,
@@ -2599,54 +1416,30 @@ entry:
 }
 
 define <4 x double> @constrained_vector_exp2_v4f64() {
-; NO-FMA-LABEL: constrained_vector_exp2_v4f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq exp2
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq exp2
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq exp2
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq exp2
-; NO-FMA-NEXT:    movaps %xmm0, %xmm1
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_exp2_v4f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq exp2
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq exp2
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq exp2
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq exp2
-; HAS-FMA-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_exp2_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq exp2
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq exp2
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq exp2
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq exp2
+; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %exp2 = call <4 x double> @llvm.experimental.constrained.exp2.v4f64(
                               <4 x double> <double 42.1, double 42.2,
@@ -2657,25 +1450,15 @@ entry:
 }
 
 define <1 x float> @constrained_vector_log_v1f32() {
-; NO-FMA-LABEL: constrained_vector_log_v1f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    pushq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 16
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq logf
-; NO-FMA-NEXT:    popq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_log_v1f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    pushq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 16
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq logf
-; HAS-FMA-NEXT:    popq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_log_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq logf
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %log = call <1 x float> @llvm.experimental.constrained.log.v1f32(
                              <1 x float> <float 42.0>,
@@ -2685,35 +1468,20 @@ entry:
 }
 
 define <2 x double> @constrained_vector_log_v2f64() {
-; NO-FMA-LABEL: constrained_vector_log_v2f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_log_v2f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 32
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    addq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_log_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %log = call <2 x double> @llvm.experimental.constrained.log.v2f64(
                              <2 x double> <double 42.0, double 42.1>,
@@ -2723,46 +1491,26 @@ entry:
 }
 
 define <3 x float> @constrained_vector_log_v3f32() {
-; NO-FMA-LABEL: constrained_vector_log_v3f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq logf
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq logf
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq logf
-; NO-FMA-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm1, %xmm0
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_log_v3f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq logf
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq logf
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq logf
-; HAS-FMA-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; HAS-FMA-NEXT:    vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_log_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq logf
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq logf
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq logf
+; CHECK-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %log = call <3 x float> @llvm.experimental.constrained.log.v3f32(
                               <3 x float> <float 42.0, float 43.0, float 44.0>,
@@ -2772,48 +1520,27 @@ entry:
 }
 
 define <3 x double> @constrained_vector_log_v3f64() {
-; NO-FMA-LABEL: constrained_vector_log_v3f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log
-; NO-FMA-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log
-; NO-FMA-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log
-; NO-FMA-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm1 = mem[0],zero
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_log_v3f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 64
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vzeroupper
-; HAS-FMA-NEXT:    callq log
-; HAS-FMA-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
-; HAS-FMA-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; HAS-FMA-NEXT:    addq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_log_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log
+; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log
+; CHECK-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log
+; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    # xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %log = call <3 x double> @llvm.experimental.constrained.log.v3f64(
                           <3 x double> <double 42.0, double 42.1, double 42.2>,
@@ -2823,54 +1550,30 @@ entry:
 }
 
 define <4 x double> @constrained_vector_log_v4f64() {
-; NO-FMA-LABEL: constrained_vector_log_v4f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log
-; NO-FMA-NEXT:    movaps %xmm0, %xmm1
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_log_v4f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log
-; HAS-FMA-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_log_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log
+; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %log = call <4 x double> @llvm.experimental.constrained.log.v4f64(
                              <4 x double> <double 42.0, double 42.1,
@@ -2881,25 +1584,15 @@ entry:
 }
 
 define <1 x float> @constrained_vector_log10_v1f32() {
-; NO-FMA-LABEL: constrained_vector_log10_v1f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    pushq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 16
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq log10f
-; NO-FMA-NEXT:    popq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_log10_v1f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    pushq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 16
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq log10f
-; HAS-FMA-NEXT:    popq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_log10_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq log10f
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %log10 = call <1 x float> @llvm.experimental.constrained.log10.v1f32(
                              <1 x float> <float 42.0>,
@@ -2909,35 +1602,20 @@ entry:
 }
 
 define <2 x double> @constrained_vector_log10_v2f64() {
-; NO-FMA-LABEL: constrained_vector_log10_v2f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log10
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log10
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_log10_v2f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 32
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log10
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log10
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    addq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_log10_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log10
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log10
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %log10 = call <2 x double> @llvm.experimental.constrained.log10.v2f64(
                                <2 x double> <double 42.0, double 42.1>,
@@ -2947,46 +1625,26 @@ entry:
 }
 
 define <3 x float> @constrained_vector_log10_v3f32() {
-; NO-FMA-LABEL: constrained_vector_log10_v3f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq log10f
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq log10f
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq log10f
-; NO-FMA-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm1, %xmm0
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_log10_v3f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq log10f
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq log10f
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq log10f
-; HAS-FMA-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; HAS-FMA-NEXT:    vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_log10_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq log10f
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq log10f
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq log10f
+; CHECK-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %log10 = call <3 x float> @llvm.experimental.constrained.log10.v3f32(
                               <3 x float> <float 42.0, float 43.0, float 44.0>,
@@ -2996,48 +1654,27 @@ entry:
 }
 
 define <3 x double> @constrained_vector_log10_v3f64() {
-; NO-FMA-LABEL: constrained_vector_log10_v3f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log10
-; NO-FMA-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log10
-; NO-FMA-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log10
-; NO-FMA-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm1 = mem[0],zero
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_log10_v3f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 64
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log10
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log10
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vzeroupper
-; HAS-FMA-NEXT:    callq log10
-; HAS-FMA-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
-; HAS-FMA-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; HAS-FMA-NEXT:    addq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_log10_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log10
+; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log10
+; CHECK-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log10
+; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    # xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %log10 = call <3 x double> @llvm.experimental.constrained.log10.v3f64(
                           <3 x double> <double 42.0, double 42.1, double 42.2>,
@@ -3047,54 +1684,30 @@ entry:
 }
 
 define <4 x double> @constrained_vector_log10_v4f64() {
-; NO-FMA-LABEL: constrained_vector_log10_v4f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log10
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log10
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log10
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log10
-; NO-FMA-NEXT:    movaps %xmm0, %xmm1
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_log10_v4f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log10
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log10
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log10
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log10
-; HAS-FMA-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_log10_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log10
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log10
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log10
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log10
+; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %log10 = call <4 x double> @llvm.experimental.constrained.log10.v4f64(
                                <4 x double> <double 42.0, double 42.1,
@@ -3105,25 +1718,15 @@ entry:
 }
 
 define <1 x float> @constrained_vector_log2_v1f32() {
-; NO-FMA-LABEL: constrained_vector_log2_v1f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    pushq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 16
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq log2f
-; NO-FMA-NEXT:    popq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_log2_v1f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    pushq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 16
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq log2f
-; HAS-FMA-NEXT:    popq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_log2_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq log2f
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %log2 = call <1 x float> @llvm.experimental.constrained.log2.v1f32(
                              <1 x float> <float 42.0>,
@@ -3133,35 +1736,20 @@ entry:
 }
 
 define <2 x double> @constrained_vector_log2_v2f64() {
-; NO-FMA-LABEL: constrained_vector_log2_v2f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log2
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log2
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_log2_v2f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 32
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log2
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log2
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    addq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_log2_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log2
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log2
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %log2 = call <2 x double> @llvm.experimental.constrained.log2.v2f64(
                               <2 x double> <double 42.0, double 42.1>,
@@ -3171,46 +1759,26 @@ entry:
 }
 
 define <3 x float> @constrained_vector_log2_v3f32() {
-; NO-FMA-LABEL: constrained_vector_log2_v3f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq log2f
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq log2f
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq log2f
-; NO-FMA-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm1, %xmm0
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_log2_v3f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq log2f
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq log2f
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq log2f
-; HAS-FMA-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; HAS-FMA-NEXT:    vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_log2_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq log2f
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq log2f
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq log2f
+; CHECK-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %log2 = call <3 x float> @llvm.experimental.constrained.log2.v3f32(
                               <3 x float> <float 42.0, float 43.0, float 44.0>,
@@ -3220,48 +1788,27 @@ entry:
 }
 
 define <3 x double> @constrained_vector_log2_v3f64() {
-; NO-FMA-LABEL: constrained_vector_log2_v3f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log2
-; NO-FMA-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log2
-; NO-FMA-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log2
-; NO-FMA-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm1 = mem[0],zero
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_log2_v3f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 64
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log2
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log2
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vzeroupper
-; HAS-FMA-NEXT:    callq log2
-; HAS-FMA-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
-; HAS-FMA-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; HAS-FMA-NEXT:    addq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_log2_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log2
+; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log2
+; CHECK-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log2
+; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    # xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %log2 = call <3 x double> @llvm.experimental.constrained.log2.v3f64(
                           <3 x double> <double 42.0, double 42.1, double 42.2>,
@@ -3271,54 +1818,30 @@ entry:
 }
 
 define <4 x double> @constrained_vector_log2_v4f64() {
-; NO-FMA-LABEL: constrained_vector_log2_v4f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log2
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log2
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log2
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log2
-; NO-FMA-NEXT:    movaps %xmm0, %xmm1
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_log2_v4f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log2
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log2
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log2
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log2
-; HAS-FMA-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_log2_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log2
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log2
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log2
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log2
+; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %log2 = call <4 x double> @llvm.experimental.constrained.log2.v4f64(
                               <4 x double> <double 42.0, double 42.1,
@@ -3329,21 +1852,15 @@ entry:
 }
 
 define <1 x float> @constrained_vector_rint_v1f32() {
-; NO-FMA-LABEL: constrained_vector_rint_v1f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    pushq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 16
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq rintf
-; NO-FMA-NEXT:    popq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_rint_v1f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vroundss $4, %xmm0, %xmm0, %xmm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_rint_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq rintf
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %rint = call <1 x float> @llvm.experimental.constrained.rint.v1f32(
                              <1 x float> <float 42.0>,
@@ -3353,25 +1870,20 @@ entry:
 }
 
 define <2 x double> @constrained_vector_rint_v2f64() {
-; NO-FMA-LABEL: constrained_vector_rint_v2f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq rint
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq rint
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_rint_v2f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vroundpd $4, {{.*}}(%rip), %xmm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_rint_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq rint
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq rint
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %rint = call <2 x double> @llvm.experimental.constrained.rint.v2f64(
                         <2 x double> <double 42.1, double 42.0>,
@@ -3381,39 +1893,27 @@ entry:
 }
 
 define <3 x float> @constrained_vector_rint_v3f32() {
-; NO-FMA-LABEL: constrained_vector_rint_v3f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq rintf
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq rintf
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq rintf
-; NO-FMA-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm1, %xmm0
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_rint_v3f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vroundss $4, %xmm0, %xmm0, %xmm0
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vroundss $4, %xmm1, %xmm1, %xmm1
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vroundss $4, %xmm2, %xmm2, %xmm2
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
-; HAS-FMA-NEXT:    retq
-entry:
+; CHECK-LABEL: constrained_vector_rint_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq rintf
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq rintf
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq rintf
+; CHECK-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+ entry:
   %rint = call <3 x float> @llvm.experimental.constrained.rint.v3f32(
                               <3 x float> <float 42.0, float 43.0, float 44.0>,
                               metadata !"round.dynamic",
@@ -3422,35 +1922,27 @@ entry:
 }
 
 define <3 x double> @constrained_vector_rint_v3f64() {
-; NO-FMA-LABEL: constrained_vector_rint_v3f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq rint
-; NO-FMA-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq rint
-; NO-FMA-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq rint
-; NO-FMA-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm1 = mem[0],zero
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_rint_v3f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vroundsd $4, %xmm0, %xmm0, %xmm0
-; HAS-FMA-NEXT:    vroundpd $4, {{.*}}(%rip), %xmm1
-; HAS-FMA-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_rint_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq rint
+; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq rint
+; CHECK-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq rint
+; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    # xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %rint = call <3 x double> @llvm.experimental.constrained.rint.v3f64(
                           <3 x double> <double 42.0, double 42.1, double 42.2>,
@@ -3460,35 +1952,30 @@ entry:
 }
 
 define <4 x double> @constrained_vector_rint_v4f64() {
-; NO-FMA-LABEL: constrained_vector_rint_v4f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq rint
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq rint
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq rint
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq rint
-; NO-FMA-NEXT:    movaps %xmm0, %xmm1
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_rint_v4f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vroundpd $4, {{.*}}(%rip), %ymm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_rint_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq rint
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq rint
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq rint
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq rint
+; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %rint = call <4 x double> @llvm.experimental.constrained.rint.v4f64(
                         <4 x double> <double 42.1, double 42.2,
@@ -3499,21 +1986,15 @@ entry:
 }
 
 define <1 x float> @constrained_vector_nearbyint_v1f32() {
-; NO-FMA-LABEL: constrained_vector_nearbyint_v1f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    pushq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 16
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq nearbyintf
-; NO-FMA-NEXT:    popq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_nearbyint_v1f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vroundss $12, %xmm0, %xmm0, %xmm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_nearbyint_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq nearbyintf
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %nearby = call <1 x float> @llvm.experimental.constrained.nearbyint.v1f32(
                                <1 x float> <float 42.0>,
@@ -3523,25 +2004,20 @@ entry:
 }
 
 define <2 x double> @constrained_vector_nearbyint_v2f64() {
-; NO-FMA-LABEL: constrained_vector_nearbyint_v2f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq nearbyint
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq nearbyint
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_nearbyint_v2f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vroundpd $12, {{.*}}(%rip), %xmm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_nearbyint_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq nearbyint
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq nearbyint
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %nearby = call <2 x double> @llvm.experimental.constrained.nearbyint.v2f64(
                                 <2 x double> <double 42.1, double 42.0>,
@@ -3551,38 +2027,26 @@ entry:
 }
 
 define <3 x float> @constrained_vector_nearbyint_v3f32() {
-; NO-FMA-LABEL: constrained_vector_nearbyint_v3f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq nearbyintf
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq nearbyintf
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq nearbyintf
-; NO-FMA-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm1, %xmm0
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_nearbyint_v3f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vroundss $12, %xmm0, %xmm0, %xmm0
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vroundss $12, %xmm1, %xmm1, %xmm1
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vroundss $12, %xmm2, %xmm2, %xmm2
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_nearbyint_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq nearbyintf
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq nearbyintf
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq nearbyintf
+; CHECK-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %nearby = call <3 x float> @llvm.experimental.constrained.nearbyint.v3f32(
                               <3 x float> <float 42.0, float 43.0, float 44.0>,
@@ -3592,35 +2056,27 @@ entry:
 }
 
 define <3 x double> @constrained_vector_nearby_v3f64() {
-; NO-FMA-LABEL: constrained_vector_nearby_v3f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq nearbyint
-; NO-FMA-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq nearbyint
-; NO-FMA-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq nearbyint
-; NO-FMA-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm1 = mem[0],zero
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_nearby_v3f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vroundsd $12, %xmm0, %xmm0, %xmm0
-; HAS-FMA-NEXT:    vroundpd $12, {{.*}}(%rip), %xmm1
-; HAS-FMA-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_nearby_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq nearbyint
+; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq nearbyint
+; CHECK-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq nearbyint
+; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    # xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %nearby = call <3 x double> @llvm.experimental.constrained.nearbyint.v3f64(
                           <3 x double> <double 42.0, double 42.1, double 42.2>,
@@ -3630,35 +2086,30 @@ entry:
 }
 
 define <4 x double> @constrained_vector_nearbyint_v4f64() {
-; NO-FMA-LABEL: constrained_vector_nearbyint_v4f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq nearbyint
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq nearbyint
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq nearbyint
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq nearbyint
-; NO-FMA-NEXT:    movaps %xmm0, %xmm1
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_nearbyint_v4f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vroundpd $12, {{.*}}(%rip), %ymm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_nearbyint_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq nearbyint
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq nearbyint
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq nearbyint
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq nearbyint
+; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %nearby = call <4 x double> @llvm.experimental.constrained.nearbyint.v4f64(
                                 <4 x double> <double 42.1, double 42.2,
@@ -3669,27 +2120,16 @@ entry:
 }
 
 define <1 x float> @constrained_vector_maxnum_v1f32() {
-; NO-FMA-LABEL: constrained_vector_maxnum_v1f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    pushq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 16
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmaxf
-; NO-FMA-NEXT:    popq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_maxnum_v1f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    pushq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 16
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq fmaxf
-; HAS-FMA-NEXT:    popq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_maxnum_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq fmaxf
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %max = call <1 x float> @llvm.experimental.constrained.maxnum.v1f32(
                                <1 x float> <float 42.0>, <1 x float> <float 41.0>,
@@ -3699,39 +2139,22 @@ entry:
 }
 
 define <2 x double> @constrained_vector_maxnum_v2f64() {
-; NO-FMA-LABEL: constrained_vector_maxnum_v2f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq fmax
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq fmax
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_maxnum_v2f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 32
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq fmax
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq fmax
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    addq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_maxnum_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmax
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmax
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %max = call <2 x double> @llvm.experimental.constrained.maxnum.v2f64(
                                 <2 x double> <double 43.0, double 42.0>,
@@ -3742,52 +2165,29 @@ entry:
 }
 
 define <3 x float> @constrained_vector_maxnum_v3f32() {
-; NO-FMA-LABEL: constrained_vector_maxnum_v3f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmaxf
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmaxf
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmaxf
-; NO-FMA-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm1, %xmm0
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_maxnum_v3f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq fmaxf
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq fmaxf
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq fmaxf
-; HAS-FMA-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; HAS-FMA-NEXT:    vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_maxnum_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq fmaxf
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq fmaxf
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq fmaxf
+; CHECK-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %max = call <3 x float> @llvm.experimental.constrained.maxnum.v3f32(
                               <3 x float> <float 43.0, float 44.0, float 45.0>,
@@ -3798,54 +2198,30 @@ entry:
 }
 
 define <3 x double> @constrained_vector_max_v3f64() {
-; NO-FMA-LABEL: constrained_vector_max_v3f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq fmax
-; NO-FMA-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq fmax
-; NO-FMA-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq fmax
-; NO-FMA-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm1 = mem[0],zero
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_max_v3f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 64
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq fmax
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq fmax
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    vzeroupper
-; HAS-FMA-NEXT:    callq fmax
-; HAS-FMA-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
-; HAS-FMA-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; HAS-FMA-NEXT:    addq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_max_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmax
+; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmax
+; CHECK-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmax
+; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    # xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %max = call <3 x double> @llvm.experimental.constrained.maxnum.v3f64(
                           <3 x double> <double 43.0, double 44.0, double 45.0>,
@@ -3856,62 +2232,34 @@ entry:
 }
 
 define <4 x double> @constrained_vector_maxnum_v4f64() {
-; NO-FMA-LABEL: constrained_vector_maxnum_v4f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq fmax
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq fmax
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq fmax
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq fmax
-; NO-FMA-NEXT:    movaps %xmm0, %xmm1
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_maxnum_v4f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq fmax
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq fmax
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq fmax
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq fmax
-; HAS-FMA-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_maxnum_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmax
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmax
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmax
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmax
+; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %max = call <4 x double> @llvm.experimental.constrained.maxnum.v4f64(
                                 <4 x double> <double 44.0, double 45.0,
@@ -3924,28 +2272,17 @@ entry:
 }
 
 define <1 x float> @constrained_vector_minnum_v1f32() {
-; NO-FMA-LABEL: constrained_vector_minnum_v1f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    pushq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 16
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fminf
-; NO-FMA-NEXT:    popq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_minnum_v1f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    pushq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 16
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq fminf
-; HAS-FMA-NEXT:    popq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
-entry:
+; CHECK-LABEL: constrained_vector_minnum_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq fminf
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+ entry:
   %min = call <1 x float> @llvm.experimental.constrained.minnum.v1f32(
                                <1 x float> <float 42.0>, <1 x float> <float 41.0>,
                                metadata !"round.dynamic",
@@ -3954,39 +2291,22 @@ entry:
 }
 
 define <2 x double> @constrained_vector_minnum_v2f64() {
-; NO-FMA-LABEL: constrained_vector_minnum_v2f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq fmin
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq fmin
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_minnum_v2f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 32
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq fmin
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq fmin
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    addq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_minnum_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmin
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmin
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %min = call <2 x double> @llvm.experimental.constrained.minnum.v2f64(
                                 <2 x double> <double 43.0, double 42.0>,
@@ -3997,52 +2317,29 @@ entry:
 }
 
 define <3 x float> @constrained_vector_minnum_v3f32() {
-; NO-FMA-LABEL: constrained_vector_minnum_v3f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fminf
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fminf
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fminf
-; NO-FMA-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm1, %xmm0
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_minnum_v3f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq fminf
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq fminf
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq fminf
-; HAS-FMA-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; HAS-FMA-NEXT:    vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_minnum_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq fminf
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq fminf
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq fminf
+; CHECK-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %min = call <3 x float> @llvm.experimental.constrained.minnum.v3f32(
                               <3 x float> <float 43.0, float 44.0, float 45.0>,
@@ -4052,56 +2349,33 @@ entry:
   ret <3 x float> %min
 }
 
-define <3 x double> @constrained_vector_min_v3f64() {entry:
-; NO-FMA-LABEL: constrained_vector_min_v3f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq fmin
-; NO-FMA-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq fmin
-; NO-FMA-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq fmin
-; NO-FMA-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm1 = mem[0],zero
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_min_v3f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 64
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq fmin
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq fmin
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    vzeroupper
-; HAS-FMA-NEXT:    callq fmin
-; HAS-FMA-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
-; HAS-FMA-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; HAS-FMA-NEXT:    addq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
-  %min = call <3 x double> @llvm.experimental.constrained.minnum.v3f64(
+define <3 x double> @constrained_vector_min_v3f64() {
+; CHECK-LABEL: constrained_vector_min_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmin
+; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmin
+; CHECK-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmin
+; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    # xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+ %min = call <3 x double> @llvm.experimental.constrained.minnum.v3f64(
                           <3 x double> <double 43.0, double 44.0, double 45.0>,
                           <3 x double> <double 40.0, double 41.0, double 42.0>,
                           metadata !"round.dynamic",
@@ -4110,62 +2384,34 @@ define <3 x double> @constrained_vector_min_v3f64() {entry:
 }
 
 define <4 x double> @constrained_vector_minnum_v4f64() {
-; NO-FMA-LABEL: constrained_vector_minnum_v4f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq fmin
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq fmin
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq fmin
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq fmin
-; NO-FMA-NEXT:    movaps %xmm0, %xmm1
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_minnum_v4f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq fmin
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq fmin
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq fmin
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq fmin
-; HAS-FMA-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_minnum_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmin
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmin
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmin
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmin
+; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %min = call <4 x double> @llvm.experimental.constrained.minnum.v4f64(
                                 <4 x double> <double 44.0, double 45.0,
@@ -4183,8 +2429,6 @@ declare <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double>, <2
 declare <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double>, <2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.fdiv.v2f64(<2 x double>, <2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.frem.v2f64(<2 x double>, <2 x double>, metadata, metadata)
-declare <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double>, <2 x double>, <2 x double>, metadata, metadata)
-declare <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.sqrt.v2f64(<2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.pow.v2f64(<2 x double>, <2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.powi.v2f64(<2 x double>, i32, metadata, metadata)
@@ -4206,7 +2450,6 @@ declare <1 x float> @llvm.experimental.constrained.fsub.v1f32(<1 x float>, <1 x
 declare <1 x float> @llvm.experimental.constrained.fmul.v1f32(<1 x float>, <1 x float>, metadata, metadata)
 declare <1 x float> @llvm.experimental.constrained.fdiv.v1f32(<1 x float>, <1 x float>, metadata, metadata)
 declare <1 x float> @llvm.experimental.constrained.frem.v1f32(<1 x float>, <1 x float>, metadata, metadata)
-declare <1 x float> @llvm.experimental.constrained.fma.v1f32(<1 x float>, <1 x float>, <1 x float>, metadata, metadata)
 declare <1 x float> @llvm.experimental.constrained.sqrt.v1f32(<1 x float>, metadata, metadata)
 declare <1 x float> @llvm.experimental.constrained.pow.v1f32(<1 x float>, <1 x float>, metadata, metadata)
 declare <1 x float> @llvm.experimental.constrained.powi.v1f32(<1 x float>, i32, metadata, metadata)
@@ -4233,8 +2476,6 @@ declare <3 x float> @llvm.experimental.constrained.fdiv.v3f32(<3 x float>, <3 x
 declare <3 x double> @llvm.experimental.constrained.fdiv.v3f64(<3 x double>, <3 x double>, metadata, metadata)
 declare <3 x float> @llvm.experimental.constrained.frem.v3f32(<3 x float>, <3 x float>, metadata, metadata)
 declare <3 x double> @llvm.experimental.constrained.frem.v3f64(<3 x double>, <3 x double>, metadata, metadata)
-declare <3 x float> @llvm.experimental.constrained.fma.v3f32(<3 x float>, <3 x float>, <3 x float>, metadata, metadata)
-declare <3 x double> @llvm.experimental.constrained.fma.v3f64(<3 x double>, <3 x double>, <3 x double>, metadata, metadata)
 declare <3 x float> @llvm.experimental.constrained.sqrt.v3f32(<3 x float>, metadata, metadata)
 declare <3 x double> @llvm.experimental.constrained.sqrt.v3f64(<3 x double>, metadata, metadata)
 declare <3 x float> @llvm.experimental.constrained.pow.v3f32(<3 x float>, <3 x float>, metadata, metadata)
@@ -4270,8 +2511,6 @@ declare <4 x double> @llvm.experimental.constrained.fsub.v4f64(<4 x double>, <4
 declare <4 x double> @llvm.experimental.constrained.fmul.v4f64(<4 x double>, <4 x double>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.fdiv.v4f64(<4 x double>, <4 x double>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.frem.v4f64(<4 x double>, <4 x double>, metadata, metadata)
-declare <4 x double> @llvm.experimental.constrained.fma.v4f64(<4 x double>, <4 x double>, <4 x double>, metadata, metadata)
-declare <8 x float> @llvm.experimental.constrained.fma.v8f32(<8 x float>, <8 x float>, <8 x float>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.sqrt.v4f64(<4 x double>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.pow.v4f64(<4 x double>, <4 x double>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.powi.v4f64(<4 x double>, i32, metadata, metadata)
-- 
GitLab


From 28d03091b9bb01492def4f8a80dde4708cbbf641 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Mon, 5 Nov 2018 15:45:01 +0000
Subject: [PATCH 0965/1116] [InstCombine] add/adjust tests for select with fsub
 identity op; NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346138 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstCombine/select-binop-cmp.ll           | 26 ++++++++++++++++---
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/test/Transforms/InstCombine/select-binop-cmp.ll b/test/Transforms/InstCombine/select-binop-cmp.ll
index edbe310269a..1604522293c 100644
--- a/test/Transforms/InstCombine/select-binop-cmp.ll
+++ b/test/Transforms/InstCombine/select-binop-cmp.ll
@@ -304,6 +304,21 @@ define float @select_fsub_fcmp(float %x, float %y, float %z) {
   ret float %C
 }
 
+; TODO: This is logically equivalent to the previous test - fcmp ignores the sign of 0.0.
+
+define float @select_fsub_fcmp_negzero(float %x, float %y, float %z) {
+; CHECK-LABEL: @select_fsub_fcmp_negzero(
+; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], -0.000000e+00
+; CHECK-NEXT:    [[B:%.*]] = fsub nsz float [[Z:%.*]], [[X]]
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[B]], float [[Y:%.*]]
+; CHECK-NEXT:    ret float [[C]]
+;
+  %A = fcmp oeq float %x, -0.0
+  %B = fsub nsz float %z, %x
+  %C = select i1 %A, float %B, float %y
+  ret float %C
+}
+
 define float @select_fdiv_fcmp(float %x, float %y, float %z) {
 ; CHECK-LABEL: @select_fdiv_fcmp(
 ; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], 1.000000e+00
@@ -899,15 +914,18 @@ define float @select_fdiv_fcmp_bad_2(float %x, float %y, float %z) {
   ret float %C
 }
 
+; The transform is not valid when x = -0.0 and z = -0.0 
+; (optimized code would return -0.0, but this returns +0.0). 
+
 define float @select_fsub_fcmp_bad(float %x, float %y, float %z) {
 ; CHECK-LABEL: @select_fsub_fcmp_bad(
 ; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[B:%.*]] = fsub float [[X]], [[Z:%.*]]
+; CHECK-NEXT:    [[B:%.*]] = fsub float [[Z:%.*]], [[X]]
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[B]], float [[Y:%.*]]
 ; CHECK-NEXT:    ret float [[C]]
 ;
   %A = fcmp oeq float %x, 0.0
-  %B = fsub float %x, %z
+  %B = fsub float %z, %x
   %C = select i1 %A, float %B, float %y
   ret float %C
 }
@@ -915,12 +933,12 @@ define float @select_fsub_fcmp_bad(float %x, float %y, float %z) {
 define float @select_fsub_fcmp_bad_2(float %x, float %y, float %z) {
 ; CHECK-LABEL: @select_fsub_fcmp_bad_2(
 ; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], 1.000000e+00
-; CHECK-NEXT:    [[B:%.*]] = fsub nsz float [[X]], [[Z:%.*]]
+; CHECK-NEXT:    [[B:%.*]] = fsub nsz float [[Z:%.*]], [[X]]
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[B]], float [[Y:%.*]]
 ; CHECK-NEXT:    ret float [[C]]
 ;
   %A = fcmp oeq float %x, 1.0
-  %B = fsub nsz float %x, %z
+  %B = fsub nsz float %z, %x
   %C = select i1 %A, float %B, float %y
   ret float %C
 }
-- 
GitLab


From e3b515280e32f83113f49032fc6e7ccbaa96d35f Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 5 Nov 2018 15:49:09 +0000
Subject: [PATCH 0966/1116] [TargetLowering] Begin generalizing
 TargetLowering::expandFP_TO_SINT support. NFCI.

Prior to initial work to add vector expansion support, remove assumptions that we're working on scalar types.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346139 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/TargetLowering.cpp | 52 ++++++++++-----------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 1788c163c5e..2bc9090428b 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -4077,64 +4077,64 @@ bool TargetLowering::expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT,
 
 bool TargetLowering::expandFP_TO_SINT(SDNode *Node, SDValue &Result,
                                SelectionDAG &DAG) const {
-  EVT VT = Node->getOperand(0).getValueType();
-  EVT NVT = Node->getValueType(0);
+  SDValue Src = Node->getOperand(0);
+  EVT SrcVT = Src.getValueType();
+  EVT DstVT = Node->getValueType(0);
   SDLoc dl(SDValue(Node, 0));
 
   // FIXME: Only f32 to i64 conversions are supported.
-  if (VT != MVT::f32 || NVT != MVT::i64)
+  if (SrcVT != MVT::f32 || DstVT != MVT::i64)
     return false;
 
   // Expand f32 -> i64 conversion
   // This algorithm comes from compiler-rt's implementation of fixsfdi:
   // https://github.com/llvm-mirror/compiler-rt/blob/master/lib/builtins/fixsfdi.c
-  EVT IntVT = EVT::getIntegerVT(*DAG.getContext(),
-                                VT.getSizeInBits());
+  unsigned SrcEltBits = SrcVT.getScalarSizeInBits();
+  EVT IntVT = SrcVT.changeTypeToInteger();
+  EVT IntShVT = getShiftAmountTy(IntVT, DAG.getDataLayout());
+
   SDValue ExponentMask = DAG.getConstant(0x7F800000, dl, IntVT);
   SDValue ExponentLoBit = DAG.getConstant(23, dl, IntVT);
   SDValue Bias = DAG.getConstant(127, dl, IntVT);
-  SDValue SignMask = DAG.getConstant(APInt::getSignMask(VT.getSizeInBits()), dl,
-                                     IntVT);
-  SDValue SignLowBit = DAG.getConstant(VT.getSizeInBits() - 1, dl, IntVT);
+  SDValue SignMask = DAG.getConstant(APInt::getSignMask(SrcEltBits), dl, IntVT);
+  SDValue SignLowBit = DAG.getConstant(SrcEltBits - 1, dl, IntVT);
   SDValue MantissaMask = DAG.getConstant(0x007FFFFF, dl, IntVT);
 
-  SDValue Bits = DAG.getNode(ISD::BITCAST, dl, IntVT, Node->getOperand(0));
+  SDValue Bits = DAG.getNode(ISD::BITCAST, dl, IntVT, Src);
 
-  auto &DL = DAG.getDataLayout();
   SDValue ExponentBits = DAG.getNode(
       ISD::SRL, dl, IntVT, DAG.getNode(ISD::AND, dl, IntVT, Bits, ExponentMask),
-      DAG.getZExtOrTrunc(ExponentLoBit, dl, getShiftAmountTy(IntVT, DL)));
+      DAG.getZExtOrTrunc(ExponentLoBit, dl, IntShVT));
   SDValue Exponent = DAG.getNode(ISD::SUB, dl, IntVT, ExponentBits, Bias);
 
-  SDValue Sign = DAG.getNode(
-      ISD::SRA, dl, IntVT, DAG.getNode(ISD::AND, dl, IntVT, Bits, SignMask),
-      DAG.getZExtOrTrunc(SignLowBit, dl, getShiftAmountTy(IntVT, DL)));
-  Sign = DAG.getSExtOrTrunc(Sign, dl, NVT);
+  SDValue Sign = DAG.getNode(ISD::SRA, dl, IntVT,
+                             DAG.getNode(ISD::AND, dl, IntVT, Bits, SignMask),
+                             DAG.getZExtOrTrunc(SignLowBit, dl, IntShVT));
+  Sign = DAG.getSExtOrTrunc(Sign, dl, DstVT);
 
   SDValue R = DAG.getNode(ISD::OR, dl, IntVT,
-      DAG.getNode(ISD::AND, dl, IntVT, Bits, MantissaMask),
-      DAG.getConstant(0x00800000, dl, IntVT));
+                          DAG.getNode(ISD::AND, dl, IntVT, Bits, MantissaMask),
+                          DAG.getConstant(0x00800000, dl, IntVT));
 
-  R = DAG.getZExtOrTrunc(R, dl, NVT);
+  R = DAG.getZExtOrTrunc(R, dl, DstVT);
 
   R = DAG.getSelectCC(
       dl, Exponent, ExponentLoBit,
-      DAG.getNode(ISD::SHL, dl, NVT, R,
+      DAG.getNode(ISD::SHL, dl, DstVT, R,
                   DAG.getZExtOrTrunc(
                       DAG.getNode(ISD::SUB, dl, IntVT, Exponent, ExponentLoBit),
-                      dl, getShiftAmountTy(IntVT, DL))),
-      DAG.getNode(ISD::SRL, dl, NVT, R,
+                      dl, IntShVT)),
+      DAG.getNode(ISD::SRL, dl, DstVT, R,
                   DAG.getZExtOrTrunc(
                       DAG.getNode(ISD::SUB, dl, IntVT, ExponentLoBit, Exponent),
-                      dl, getShiftAmountTy(IntVT, DL))),
+                      dl, IntShVT)),
       ISD::SETGT);
 
-  SDValue Ret = DAG.getNode(ISD::SUB, dl, NVT,
-      DAG.getNode(ISD::XOR, dl, NVT, R, Sign),
-      Sign);
+  SDValue Ret = DAG.getNode(ISD::SUB, dl, DstVT,
+                            DAG.getNode(ISD::XOR, dl, DstVT, R, Sign), Sign);
 
   Result = DAG.getSelectCC(dl, Exponent, DAG.getConstant(0, dl, IntVT),
-      DAG.getConstant(0, dl, NVT), Ret, ISD::SETLT);
+                           DAG.getConstant(0, dl, DstVT), Ret, ISD::SETLT);
   return true;
 }
 
-- 
GitLab


From fbe10782a95bba2035a9bae0140947a040fd907b Mon Sep 17 00:00:00 2001
From: Xin Tong <trent.xin.tong@gmail.com>
Date: Mon, 5 Nov 2018 15:49:46 +0000
Subject: [PATCH 0967/1116] [ThinLTO] Add an option to disable (thin)lto
 internalization.

Summary:
LTO and ThinLTO optimizes the IR differently.

One source of differences is the amount of internalizations that
can happen.

Add an option to enable/disable internalization so that other
differences can be studied in isolation. e.g. inlining.

There are other things lto and thinlto do differently, I will add
flags to enable/disable them as needed.

Reviewers: tejohnson, pcc, steven_wu

Subscribers: mehdi_amini, inglorion, steven_wu, dexonsmith, dang, llvm-commits

Differential Revision: https://reviews.llvm.org/D53294

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346140 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/LTO/legacy/LTOCodeGenerator.h |  5 ++-
 lib/LTO/LTO.cpp                            | 10 ++++--
 test/LTO/X86/internalize.ll                | 42 ++++++++++++++++++++++
 test/ThinLTO/X86/internalize.ll            | 21 +++++++++++
 4 files changed, 75 insertions(+), 3 deletions(-)
 create mode 100644 test/LTO/X86/internalize.ll

diff --git a/include/llvm/LTO/legacy/LTOCodeGenerator.h b/include/llvm/LTO/legacy/LTOCodeGenerator.h
index f48ab02863a..8f23b7cb457 100644
--- a/include/llvm/LTO/legacy/LTOCodeGenerator.h
+++ b/include/llvm/LTO/legacy/LTOCodeGenerator.h
@@ -48,6 +48,9 @@
 #include <string>
 #include <vector>
 
+/// Enable global value internalization in LTO.
+extern llvm::cl::opt<bool> EnableLTOInternalization;
+
 namespace llvm {
 template <typename T> class ArrayRef;
   class LLVMContext;
@@ -233,7 +236,7 @@ private:
   unsigned OptLevel = 2;
   lto_diagnostic_handler_t DiagHandler = nullptr;
   void *DiagContext = nullptr;
-  bool ShouldInternalize = true;
+  bool ShouldInternalize = EnableLTOInternalization;
   bool ShouldEmbedUselists = false;
   bool ShouldRestoreGlobalsLinkage = false;
   TargetMachine::CodeGenFileType FileType = TargetMachine::CGFT_ObjectFile;
diff --git a/lib/LTO/LTO.cpp b/lib/LTO/LTO.cpp
index 6942cb28af2..2726b6785ed 100644
--- a/lib/LTO/LTO.cpp
+++ b/lib/LTO/LTO.cpp
@@ -56,6 +56,11 @@ static cl::opt<bool>
     DumpThinCGSCCs("dump-thin-cg-sccs", cl::init(false), cl::Hidden,
                    cl::desc("Dump the SCCs in the ThinLTO index's callgraph"));
 
+/// Enable global value internalization in LTO.
+cl::opt<bool> EnableLTOInternalization(
+    "enable-lto-internalization", cl::init(true), cl::Hidden,
+    cl::desc("Enable global value internalization in LTO"));
+
 // Returns a unique hash for the Module considering the current list of
 // export/import and other global analysis results.
 // The hash is produced in \p Key.
@@ -344,7 +349,8 @@ static void thinLTOInternalizeAndPromoteGUID(
     if (isExported(S->modulePath(), GUID)) {
       if (GlobalValue::isLocalLinkage(S->linkage()))
         S->setLinkage(GlobalValue::ExternalLinkage);
-    } else if (!GlobalValue::isLocalLinkage(S->linkage()))
+    } else if (EnableLTOInternalization &&
+               !GlobalValue::isLocalLinkage(S->linkage()))
       S->setLinkage(GlobalValue::InternalLinkage);
   }
 }
@@ -876,7 +882,7 @@ Error LTO::runRegularLTO(AddStreamFn AddStream) {
         continue;
       GV->setUnnamedAddr(R.second.UnnamedAddr ? GlobalValue::UnnamedAddr::Global
                                               : GlobalValue::UnnamedAddr::None);
-      if (R.second.Partition == 0)
+      if (EnableLTOInternalization && R.second.Partition == 0)
         GV->setLinkage(GlobalValue::InternalLinkage);
     }
 
diff --git a/test/LTO/X86/internalize.ll b/test/LTO/X86/internalize.ll
new file mode 100644
index 00000000000..6b18aa7ded0
--- /dev/null
+++ b/test/LTO/X86/internalize.ll
@@ -0,0 +1,42 @@
+; RUN: opt %s -o %t1.bc
+
+; RUN: llvm-lto %t1.bc -o %t1.save.opt  --exported-symbol=_foo -save-merged-module -O0
+; RUN: llvm-dis < %t1.save.opt.merged.bc | FileCheck %s --check-prefix=INTERNALIZE
+
+; Test the enable-lto-internalization option by setting it to false.
+; This makes sure internalization does not happen.
+; RUN: llvm-lto %t1.bc -enable-lto-internalization=false -o %t1.save.opt  \
+; RUN:                 --exported-symbol=_foo -save-merged-module -O0
+; RUN: llvm-dis < %t1.save.opt.merged.bc | FileCheck %s --check-prefix=INTERNALIZE-OPTION-DISABLE
+
+; RUN: llvm-lto2 run %t1.bc -o %t.o -save-temps \
+; RUN:     -r=%t1.bc,_foo,pxl \
+; RUN:     -r=%t1.bc,_bar,pl
+; RUN: llvm-dis < %t.o.0.2.internalize.bc | FileCheck  %s --check-prefix=INTERNALIZE2
+
+; Test the enable-lto-internalization option by setting it to false.
+; This makes sure internalization does not happen in runRegularLTO().
+; RUN: llvm-lto2 run %t1.bc -o %t.o -save-temps -enable-lto-internalization=false \
+; RUN:     -r=%t1.bc,_foo,pxl \
+; RUN:     -r=%t1.bc,_bar,pl
+; RUN: llvm-dis < %t.o.0.2.internalize.bc | FileCheck  %s --check-prefix=INTERNALIZE2-OPTION-DISABLE
+
+; INTERNALIZE: define void @foo
+; INTERNALIZE: define internal void @bar
+; INTERNALIZE-OPTION-DISABLE: define void @foo
+; INTERNALIZE-OPTION-DISABLE: define void @bar
+; INTERNALIZE2: define dso_local void @foo
+; INTERNALIZE2: define internal void @bar
+; INTERNALIZE2-OPTION-DISABLE: define dso_local void @foo
+; INTERNALIZE2-OPTION-DISABLE: define dso_local void @bar
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.11.0"
+
+define void @foo() {
+    call void @bar()
+    ret void
+}
+define void @bar() {
+    ret void
+}
diff --git a/test/ThinLTO/X86/internalize.ll b/test/ThinLTO/X86/internalize.ll
index 433cfe40894..70b28469b48 100644
--- a/test/ThinLTO/X86/internalize.ll
+++ b/test/ThinLTO/X86/internalize.ll
@@ -3,12 +3,27 @@
 ; RUN: llvm-lto -thinlto-action=internalize -thinlto-index %t.index.bc %t1.bc -o - | llvm-dis -o - | FileCheck %s --check-prefix=REGULAR
 ; RUN: llvm-lto -thinlto-action=internalize -thinlto-index %t.index.bc %t1.bc -o -  --exported-symbol=foo | llvm-dis -o - | FileCheck %s --check-prefix=INTERNALIZE
 
+; Test the enable-lto-internalization option by setting it to false.
+; This makes sure indices are not marked as internallinkage and therefore
+; internalization does not happen.
+; RUN: llvm-lto -thinlto-action=internalize -thinlto-index %t.index.bc %t1.bc \
+; RUN:          -enable-lto-internalization=false --exported-symbol=foo
+; RUN: llvm-dis < %t1.bc.thinlto.internalized.bc | FileCheck %s --check-prefix=INTERNALIZE-OPTION-DISABLE
+
 ; RUN: llvm-lto2 run %t1.bc -o %t.o -save-temps \
 ; RUN:     -r=%t1.bc,_foo,pxl \
 ; RUN:     -r=%t1.bc,_bar,pl \
 ; RUN:     -r=%t1.bc,_linkonce_func,pl
 ; RUN: llvm-dis < %t.o.1.2.internalize.bc | FileCheck  %s --check-prefix=INTERNALIZE2
 
+; Test the enable-lto-internalization option by setting it to false.
+; This makes sure indices are not marked as internallinkage and therefore
+; internalization does not happen.
+; RUN: llvm-lto2 run %t1.bc -o %t.o -save-temps -enable-lto-internalization=false \
+; RUN:     -r=%t1.bc,_foo,pxl \
+; RUN:     -r=%t1.bc,_bar,pl \
+; RUN:     -r=%t1.bc,_linkonce_func,pl
+; RUN: llvm-dis < %t.o.1.2.internalize.bc | FileCheck  %s --check-prefix=INTERNALIZE2-OPTION-DISABLE
 
 ; REGULAR: define void @foo
 ; REGULAR: define void @bar
@@ -16,9 +31,15 @@
 ; INTERNALIZE: define void @foo
 ; INTERNALIZE: define internal void @bar
 ; INTERNALIZE: define internal void @linkonce_func()
+; INTERNALIZE-OPTION-DISABLE: define void @foo
+; INTERNALIZE-OPTION-DISABLE: define void @bar
+; INTERNALIZE-OPTION-DISABLE: define linkonce void @linkonce_func()
 ; INTERNALIZE2: define dso_local void @foo
 ; INTERNALIZE2: define internal void @bar
 ; INTERNALIZE2: define internal void @linkonce_func()
+; INTERNALIZE2-OPTION-DISABLE: define dso_local void @foo
+; INTERNALIZE2-OPTION-DISABLE: define dso_local void @bar
+; INTERNALIZE2-OPTION-DISABLE: define weak dso_local void @linkonce_func()
 
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.11.0"
-- 
GitLab


From 7c442634fab933f1b23576aef307c19db49f1b9a Mon Sep 17 00:00:00 2001
From: Cameron McInally <cameron.mcinally@nyu.edu>
Date: Mon, 5 Nov 2018 15:59:49 +0000
Subject: [PATCH 0968/1116] [FPEnv] Add constrained CEIL/FLOOR/ROUND/TRUNC
 intrinsics

Differential Revision: https://reviews.llvm.org/D53411


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346141 91177308-0d34-0410-b5e6-96231b3b80d8
---
 docs/LangRef.rst                              | 145 ++++++
 include/llvm/CodeGen/ISDOpcodes.h             |   1 +
 include/llvm/CodeGen/SelectionDAGNodes.h      |   4 +
 include/llvm/CodeGen/TargetLowering.h         |   4 +
 include/llvm/IR/IntrinsicInst.h               |   4 +
 include/llvm/IR/Intrinsics.td                 |  18 +-
 lib/CodeGen/SelectionDAG/LegalizeDAG.cpp      |   8 +
 .../SelectionDAG/LegalizeVectorOps.cpp        |   8 +
 .../SelectionDAG/LegalizeVectorTypes.cpp      |  12 +
 lib/CodeGen/SelectionDAG/SelectionDAG.cpp     |   4 +
 .../SelectionDAG/SelectionDAGBuilder.cpp      |  16 +
 .../SelectionDAG/SelectionDAGDumper.cpp       |   4 +
 lib/IR/IntrinsicInst.cpp                      |   4 +
 lib/IR/Verifier.cpp                           |   4 +
 .../X86/vector-constrained-fp-intrinsics.ll   | 424 ++++++++++++++++++
 15 files changed, 659 insertions(+), 1 deletion(-)

diff --git a/docs/LangRef.rst b/docs/LangRef.rst
index 7f93716a504..0a22ced9850 100644
--- a/docs/LangRef.rst
+++ b/docs/LangRef.rst
@@ -14580,6 +14580,151 @@ mode is determined by the runtime floating-point environment. The rounding
 mode argument is only intended as information to the compiler.
 
 
+'``llvm.experimental.constrained.ceil``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <type>
+      @llvm.experimental.constrained.ceil(<type> <op1>,
+                                          metadata <rounding mode>,
+                                          metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.ceil``' intrinsic returns the ceiling of the 
+first operand.
+
+Arguments:
+""""""""""
+
+The first argument and the return value are floating-point numbers of the same
+type.
+
+The second and third arguments specify the rounding mode and exception
+behavior as described above. The rounding mode is currently unused for this
+intrinsic.
+
+Semantics:
+""""""""""
+
+This function returns the same values as the libm ``ceil`` functions
+would and handles error conditions in the same way.
+
+
+'``llvm.experimental.constrained.floor``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <type>
+      @llvm.experimental.constrained.floor(<type> <op1>,
+                                           metadata <rounding mode>,
+                                           metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.floor``' intrinsic returns the floor of the 
+first operand.
+
+Arguments:
+""""""""""
+
+The first argument and the return value are floating-point numbers of the same
+type.
+
+The second and third arguments specify the rounding mode and exception
+behavior as described above. The rounding mode is currently unused for this
+intrinsic.
+
+Semantics:
+""""""""""
+
+This function returns the same values as the libm ``floor`` functions
+would and handles error conditions in the same way. 
+
+
+'``llvm.experimental.constrained.round``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <type>
+      @llvm.experimental.constrained.round(<type> <op1>,
+                                           metadata <rounding mode>,
+                                           metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.round``' intrinsic returns the first 
+operand rounded to the nearest integer.
+
+Arguments:
+""""""""""
+
+The first argument and the return value are floating-point numbers of the same
+type.
+
+The second and third arguments specify the rounding mode and exception
+behavior as described above. The rounding mode is currently unused for this
+intrinsic.
+
+Semantics:
+""""""""""
+
+This function returns the same values as the libm ``round`` functions
+would and handles error conditions in the same way.
+
+
+'``llvm.experimental.constrained.trunc``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <type>
+      @llvm.experimental.constrained.trunc(<type> <op1>,
+                                           metadata <truncing mode>,
+                                           metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.trunc``' intrinsic returns the first 
+operand rounded to the nearest integer not larger in magnitude than the 
+operand.
+
+Arguments:
+""""""""""
+
+The first argument and the return value are floating-point numbers of the same
+type.
+
+The second and third arguments specify the truncing mode and exception
+behavior as described above. The truncing mode is currently unused for this
+intrinsic.
+
+Semantics:
+""""""""""
+
+This function returns the same values as the libm ``trunc`` functions
+would and handles error conditions in the same way.
+
+
 General Intrinsics
 ------------------
 
diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h
index da10119f438..ac620e4b69c 100644
--- a/include/llvm/CodeGen/ISDOpcodes.h
+++ b/include/llvm/CodeGen/ISDOpcodes.h
@@ -289,6 +289,7 @@ namespace ISD {
     STRICT_FSQRT, STRICT_FPOW, STRICT_FPOWI, STRICT_FSIN, STRICT_FCOS,
     STRICT_FEXP, STRICT_FEXP2, STRICT_FLOG, STRICT_FLOG10, STRICT_FLOG2,
     STRICT_FRINT, STRICT_FNEARBYINT, STRICT_FMAXNUM, STRICT_FMINNUM,
+    STRICT_FCEIL, STRICT_FFLOOR, STRICT_FROUND, STRICT_FTRUNC,
 
     /// FMA - Perform a * b + c with no intermediate rounding step.
     FMA,
diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h
index 262c7b7b58c..d125e888a57 100644
--- a/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -674,6 +674,10 @@ public:
       case ISD::STRICT_FNEARBYINT:
       case ISD::STRICT_FMAXNUM:
       case ISD::STRICT_FMINNUM:
+      case ISD::STRICT_FCEIL:
+      case ISD::STRICT_FFLOOR:
+      case ISD::STRICT_FROUND:
+      case ISD::STRICT_FTRUNC:
         return true;
     }
   }
diff --git a/include/llvm/CodeGen/TargetLowering.h b/include/llvm/CodeGen/TargetLowering.h
index 7a779f0b433..96a52abd453 100644
--- a/include/llvm/CodeGen/TargetLowering.h
+++ b/include/llvm/CodeGen/TargetLowering.h
@@ -821,6 +821,10 @@ public:
       case ISD::STRICT_FNEARBYINT: EqOpc = ISD::FNEARBYINT; break;
       case ISD::STRICT_FMAXNUM: EqOpc = ISD::FMAXNUM; break;
       case ISD::STRICT_FMINNUM: EqOpc = ISD::FMINNUM; break;
+      case ISD::STRICT_FCEIL: EqOpc = ISD::FCEIL; break;
+      case ISD::STRICT_FFLOOR: EqOpc = ISD::FFLOOR; break;
+      case ISD::STRICT_FROUND: EqOpc = ISD::FROUND; break;
+      case ISD::STRICT_FTRUNC: EqOpc = ISD::FTRUNC; break;
     }
 
     auto Action = getOperationAction(EqOpc, VT);
diff --git a/include/llvm/IR/IntrinsicInst.h b/include/llvm/IR/IntrinsicInst.h
index 54e344d829a..80a7a705257 100644
--- a/include/llvm/IR/IntrinsicInst.h
+++ b/include/llvm/IR/IntrinsicInst.h
@@ -253,6 +253,10 @@ namespace llvm {
       case Intrinsic::experimental_constrained_nearbyint:
       case Intrinsic::experimental_constrained_maxnum:
       case Intrinsic::experimental_constrained_minnum:
+      case Intrinsic::experimental_constrained_ceil:
+      case Intrinsic::experimental_constrained_floor:
+      case Intrinsic::experimental_constrained_round:
+      case Intrinsic::experimental_constrained_trunc:
         return true;
       default: return false;
       }
diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td
index c965140a00b..1c6f81c07db 100644
--- a/include/llvm/IR/Intrinsics.td
+++ b/include/llvm/IR/Intrinsics.td
@@ -576,9 +576,25 @@ let IntrProperties = [IntrInaccessibleMemOnly] in {
                                                         LLVMMatchType<0>,
                                                         llvm_metadata_ty,
                                                         llvm_metadata_ty ]>;
+  def int_experimental_constrained_ceil : Intrinsic<[ llvm_anyfloat_ty ],
+                                                    [ LLVMMatchType<0>,
+                                                      llvm_metadata_ty,
+                                                      llvm_metadata_ty ]>;
+  def int_experimental_constrained_floor : Intrinsic<[ llvm_anyfloat_ty ],
+                                                     [ LLVMMatchType<0>,
+                                                       llvm_metadata_ty,
+                                                       llvm_metadata_ty ]>;
+  def int_experimental_constrained_round : Intrinsic<[ llvm_anyfloat_ty ],
+                                                     [ LLVMMatchType<0>,
+                                                      llvm_metadata_ty,
+                                                      llvm_metadata_ty ]>;
+  def int_experimental_constrained_trunc : Intrinsic<[ llvm_anyfloat_ty ],
+                                                     [ LLVMMatchType<0>,
+                                                       llvm_metadata_ty,
+                                                       llvm_metadata_ty ]>;
 }
 // FIXME: Add intrinsics for fcmp, fptrunc, fpext, fptoui and fptosi.
-// FIXME: Add intrinsics for fabs, copysign, floor, ceil, trunc and round?
+// FIXME: Add intrinsics for fabs and copysign? 
 
 
 //===------------------------- Expect Intrinsics --------------------------===//
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index d5fb7a0697d..dcb479e4ce1 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1110,6 +1110,10 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
   case ISD::STRICT_FNEARBYINT:
   case ISD::STRICT_FMAXNUM:
   case ISD::STRICT_FMINNUM:
+  case ISD::STRICT_FCEIL:
+  case ISD::STRICT_FFLOOR:
+  case ISD::STRICT_FROUND:
+  case ISD::STRICT_FTRUNC:
     // These pseudo-ops get legalized as if they were their non-strict
     // equivalent.  For instance, if ISD::FSQRT is legal then ISD::STRICT_FSQRT
     // is also legal, but if ISD::FSQRT requires expansion then so does
@@ -3940,16 +3944,19 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
                                         RTLIB::EXP2_PPCF128));
     break;
   case ISD::FTRUNC:
+  case ISD::STRICT_FTRUNC:
     Results.push_back(ExpandFPLibCall(Node, RTLIB::TRUNC_F32, RTLIB::TRUNC_F64,
                                       RTLIB::TRUNC_F80, RTLIB::TRUNC_F128,
                                       RTLIB::TRUNC_PPCF128));
     break;
   case ISD::FFLOOR:
+  case ISD::STRICT_FFLOOR:
     Results.push_back(ExpandFPLibCall(Node, RTLIB::FLOOR_F32, RTLIB::FLOOR_F64,
                                       RTLIB::FLOOR_F80, RTLIB::FLOOR_F128,
                                       RTLIB::FLOOR_PPCF128));
     break;
   case ISD::FCEIL:
+  case ISD::STRICT_FCEIL:
     Results.push_back(ExpandFPLibCall(Node, RTLIB::CEIL_F32, RTLIB::CEIL_F64,
                                       RTLIB::CEIL_F80, RTLIB::CEIL_F128,
                                       RTLIB::CEIL_PPCF128));
@@ -3969,6 +3976,7 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
                                       RTLIB::NEARBYINT_PPCF128));
     break;
   case ISD::FROUND:
+  case ISD::STRICT_FROUND:
     Results.push_back(ExpandFPLibCall(Node, RTLIB::ROUND_F32,
                                       RTLIB::ROUND_F64,
                                       RTLIB::ROUND_F80,
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index bfc00ea28ef..17f05c3ba97 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -308,6 +308,10 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
   case ISD::STRICT_FNEARBYINT:
   case ISD::STRICT_FMAXNUM:
   case ISD::STRICT_FMINNUM:
+  case ISD::STRICT_FCEIL:
+  case ISD::STRICT_FFLOOR:
+  case ISD::STRICT_FROUND:
+  case ISD::STRICT_FTRUNC:
     // These pseudo-ops get legalized as if they were their non-strict
     // equivalent.  For instance, if ISD::FSQRT is legal then ISD::STRICT_FSQRT
     // is also legal, but if ISD::FSQRT requires expansion then so does
@@ -758,6 +762,10 @@ SDValue VectorLegalizer::Expand(SDValue Op) {
   case ISD::STRICT_FNEARBYINT:
   case ISD::STRICT_FMAXNUM:
   case ISD::STRICT_FMINNUM:
+  case ISD::STRICT_FCEIL:
+  case ISD::STRICT_FFLOOR:
+  case ISD::STRICT_FROUND:
+  case ISD::STRICT_FTRUNC:
     return ExpandStrictFPOp(Op);
   default:
     return DAG.UnrollVectorOp(Op.getNode());
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 6b52b374cd0..88abd84366a 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -166,6 +166,10 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::STRICT_FNEARBYINT:
   case ISD::STRICT_FMAXNUM:
   case ISD::STRICT_FMINNUM:
+  case ISD::STRICT_FCEIL:
+  case ISD::STRICT_FFLOOR:
+  case ISD::STRICT_FROUND:
+  case ISD::STRICT_FTRUNC:
     R = ScalarizeVecRes_StrictFPOp(N);
     break;
   }
@@ -838,6 +842,10 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::STRICT_FNEARBYINT:
   case ISD::STRICT_FMAXNUM:
   case ISD::STRICT_FMINNUM:
+  case ISD::STRICT_FCEIL:
+  case ISD::STRICT_FFLOOR:
+  case ISD::STRICT_FROUND:
+  case ISD::STRICT_FTRUNC:
     SplitVecRes_StrictFPOp(N, Lo, Hi);
     break;
   }
@@ -2406,6 +2414,10 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::STRICT_FNEARBYINT:
   case ISD::STRICT_FMAXNUM:
   case ISD::STRICT_FMINNUM:
+  case ISD::STRICT_FCEIL:
+  case ISD::STRICT_FFLOOR:
+  case ISD::STRICT_FROUND:
+  case ISD::STRICT_FTRUNC:
     Res = WidenVecRes_StrictFP(N);
     break;
 
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 66121c10a35..fce14d53c22 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -7384,6 +7384,10 @@ SDNode* SelectionDAG::mutateStrictFPToFP(SDNode *Node) {
     break;
   case ISD::STRICT_FMAXNUM: NewOpc = ISD::FMAXNUM; break;
   case ISD::STRICT_FMINNUM: NewOpc = ISD::FMINNUM; break;
+  case ISD::STRICT_FCEIL: NewOpc = ISD::FCEIL; IsUnary = true; break;
+  case ISD::STRICT_FFLOOR: NewOpc = ISD::FFLOOR; IsUnary = true; break;
+  case ISD::STRICT_FROUND: NewOpc = ISD::FROUND; IsUnary = true; break;
+  case ISD::STRICT_FTRUNC: NewOpc = ISD::FTRUNC; IsUnary = true; break;
   }
 
   // We're taking this node out of the chain, so we need to re-link things.
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index cb650c6fc13..55ca5eb7c4e 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5633,6 +5633,10 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
   case Intrinsic::experimental_constrained_nearbyint:
   case Intrinsic::experimental_constrained_maxnum:
   case Intrinsic::experimental_constrained_minnum:
+  case Intrinsic::experimental_constrained_ceil:
+  case Intrinsic::experimental_constrained_floor:
+  case Intrinsic::experimental_constrained_round:
+  case Intrinsic::experimental_constrained_trunc:
     visitConstrainedFPIntrinsic(cast<ConstrainedFPIntrinsic>(I));
     return nullptr;
   case Intrinsic::fmuladd: {
@@ -6386,6 +6390,18 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic(
   case Intrinsic::experimental_constrained_minnum:
     Opcode = ISD::STRICT_FMINNUM;
     break;
+  case Intrinsic::experimental_constrained_ceil:
+    Opcode = ISD::STRICT_FCEIL;
+    break;
+  case Intrinsic::experimental_constrained_floor:
+    Opcode = ISD::STRICT_FFLOOR;
+    break;
+  case Intrinsic::experimental_constrained_round:
+    Opcode = ISD::STRICT_FROUND;
+    break;
+  case Intrinsic::experimental_constrained_trunc:
+    Opcode = ISD::STRICT_FTRUNC;
+    break;
   }
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SDValue Chain = getRoot();
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index c21f2d3b717..02d45df5864 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -193,13 +193,17 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   case ISD::STRICT_FCOS:                return "strict_fcos";
   case ISD::FSINCOS:                    return "fsincos";
   case ISD::FTRUNC:                     return "ftrunc";
+  case ISD::STRICT_FTRUNC:              return "strict_ftrunc";
   case ISD::FFLOOR:                     return "ffloor";
+  case ISD::STRICT_FFLOOR:              return "strict_ffloor";
   case ISD::FCEIL:                      return "fceil";
+  case ISD::STRICT_FCEIL:               return "strict_fceil";
   case ISD::FRINT:                      return "frint";
   case ISD::STRICT_FRINT:               return "strict_frint";
   case ISD::FNEARBYINT:                 return "fnearbyint";
   case ISD::STRICT_FNEARBYINT:          return "strict_fnearbyint";
   case ISD::FROUND:                     return "fround";
+  case ISD::STRICT_FROUND:              return "strict_fround";
   case ISD::FEXP:                       return "fexp";
   case ISD::STRICT_FEXP:                return "strict_fexp";
   case ISD::FEXP2:                      return "fexp2";
diff --git a/lib/IR/IntrinsicInst.cpp b/lib/IR/IntrinsicInst.cpp
index 43a93890a61..df3a38ac147 100644
--- a/lib/IR/IntrinsicInst.cpp
+++ b/lib/IR/IntrinsicInst.cpp
@@ -152,6 +152,10 @@ bool ConstrainedFPIntrinsic::isUnaryOp() const {
     case Intrinsic::experimental_constrained_log2:
     case Intrinsic::experimental_constrained_rint:
     case Intrinsic::experimental_constrained_nearbyint:
+    case Intrinsic::experimental_constrained_ceil:
+    case Intrinsic::experimental_constrained_floor:
+    case Intrinsic::experimental_constrained_round:
+    case Intrinsic::experimental_constrained_trunc:
       return true;
   }
 }
diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp
index 7c0381a7222..4d0135d8338 100644
--- a/lib/IR/Verifier.cpp
+++ b/lib/IR/Verifier.cpp
@@ -4106,6 +4106,10 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) {
   case Intrinsic::experimental_constrained_nearbyint:
   case Intrinsic::experimental_constrained_maxnum:
   case Intrinsic::experimental_constrained_minnum:
+  case Intrinsic::experimental_constrained_ceil:
+  case Intrinsic::experimental_constrained_floor:
+  case Intrinsic::experimental_constrained_round:
+  case Intrinsic::experimental_constrained_trunc:
     visitConstrainedFPIntrinsic(
         cast<ConstrainedFPIntrinsic>(*CS.getInstruction()));
     break;
diff --git a/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
index 59a1729cc05..55f5bc6bf36 100644
--- a/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
+++ b/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
@@ -2423,6 +2423,409 @@ entry:
   ret <4 x double> %min
 }
 
+define <1 x float> @constrained_vector_ceil_v1f32() {
+; CHECK-LABEL: constrained_vector_ceil_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq ceilf
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %ceil = call <1 x float> @llvm.experimental.constrained.ceil.v1f32(
+                               <1 x float> <float 1.5>,
+                               metadata !"round.dynamic",
+                               metadata !"fpexcept.strict")
+  ret <1 x float> %ceil
+}
+
+define <2 x double> @constrained_vector_ceil_v2f64() {
+; CHECK-LABEL: constrained_vector_ceil_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq ceil
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq ceil
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %ceil = call <2 x double> @llvm.experimental.constrained.ceil.v2f64(
+                                <2 x double> <double 1.1, double 1.9>,
+                                metadata !"round.dynamic",
+                                metadata !"fpexcept.strict")
+  ret <2 x double> %ceil
+}
+
+define <3 x float> @constrained_vector_ceil_v3f32() {
+; CHECK-LABEL: constrained_vector_ceil_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq ceilf
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq ceilf
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq ceilf
+; CHECK-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %ceil = call <3 x float> @llvm.experimental.constrained.ceil.v3f32(
+                              <3 x float> <float 1.5, float 2.5, float 3.5>,
+                              metadata !"round.dynamic",
+                              metadata !"fpexcept.strict")
+  ret <3 x float> %ceil
+}
+
+define <3 x double> @constrained_vector_ceil_v3f64() {
+; CHECK-LABEL: constrained_vector_ceil_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq ceil
+; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq ceil
+; CHECK-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq ceil
+; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    # xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %ceil = call <3 x double> @llvm.experimental.constrained.ceil.v3f64(
+                          <3 x double> <double 1.1, double 1.9, double 1.5>,
+                          metadata !"round.dynamic",
+                          metadata !"fpexcept.strict")
+  ret <3 x double> %ceil
+}
+
+define <1 x float> @constrained_vector_floor_v1f32() {
+; CHECK-LABEL: constrained_vector_floor_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq floorf
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %floor = call <1 x float> @llvm.experimental.constrained.floor.v1f32(
+                               <1 x float> <float 1.5>,
+                               metadata !"round.dynamic",
+                               metadata !"fpexcept.strict")
+  ret <1 x float> %floor
+}
+
+
+define <2 x double> @constrained_vector_floor_v2f64() {
+; CHECK-LABEL: constrained_vector_floor_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq floor
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq floor
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %floor = call <2 x double> @llvm.experimental.constrained.floor.v2f64(
+                                <2 x double> <double 1.1, double 1.9>,
+                                metadata !"round.dynamic",
+                                metadata !"fpexcept.strict")
+  ret <2 x double> %floor
+}
+
+define <3 x float> @constrained_vector_floor_v3f32() {
+; CHECK-LABEL: constrained_vector_floor_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq floorf
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq floorf
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq floorf
+; CHECK-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %floor = call <3 x float> @llvm.experimental.constrained.floor.v3f32(
+                              <3 x float> <float 1.5, float 2.5, float 3.5>,
+                              metadata !"round.dynamic",
+                              metadata !"fpexcept.strict")
+  ret <3 x float> %floor
+}
+
+define <3 x double> @constrained_vector_floor_v3f64() {
+; CHECK-LABEL: constrained_vector_floor_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq floor
+; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq floor
+; CHECK-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq floor
+; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    # xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %floor = call <3 x double> @llvm.experimental.constrained.floor.v3f64(
+                          <3 x double> <double 1.1, double 1.9, double 1.5>,
+                          metadata !"round.dynamic",
+                          metadata !"fpexcept.strict")
+  ret <3 x double> %floor
+}
+
+define <1 x float> @constrained_vector_round_v1f32() {
+; CHECK-LABEL: constrained_vector_round_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq roundf
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %round = call <1 x float> @llvm.experimental.constrained.round.v1f32(
+                               <1 x float> <float 1.5>,
+                               metadata !"round.dynamic",
+                               metadata !"fpexcept.strict")
+  ret <1 x float> %round
+}
+
+define <2 x double> @constrained_vector_round_v2f64() {
+; CHECK-LABEL: constrained_vector_round_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq round
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq round
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %round = call <2 x double> @llvm.experimental.constrained.round.v2f64(
+                                <2 x double> <double 1.1, double 1.9>,
+                                metadata !"round.dynamic",
+                                metadata !"fpexcept.strict")
+  ret <2 x double> %round
+}
+
+define <3 x float> @constrained_vector_round_v3f32() {
+; CHECK-LABEL: constrained_vector_round_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq roundf
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq roundf
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq roundf
+; CHECK-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %round = call <3 x float> @llvm.experimental.constrained.round.v3f32(
+                              <3 x float> <float 1.5, float 2.5, float 3.5>,
+                              metadata !"round.dynamic",
+                              metadata !"fpexcept.strict")
+  ret <3 x float> %round
+}
+
+
+define <3 x double> @constrained_vector_round_v3f64() {
+; CHECK-LABEL: constrained_vector_round_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq round
+; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq round
+; CHECK-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq round
+; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    # xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %round = call <3 x double> @llvm.experimental.constrained.round.v3f64(
+                          <3 x double> <double 1.1, double 1.9, double 1.5>,
+                          metadata !"round.dynamic",
+                          metadata !"fpexcept.strict")
+  ret <3 x double> %round
+}
+
+define <1 x float> @constrained_vector_trunc_v1f32() {
+; CHECK-LABEL: constrained_vector_trunc_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq truncf
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %trunc = call <1 x float> @llvm.experimental.constrained.trunc.v1f32(
+                               <1 x float> <float 1.5>,
+                               metadata !"round.dynamic",
+                               metadata !"fpexcept.strict")
+  ret <1 x float> %trunc
+}
+
+define <2 x double> @constrained_vector_trunc_v2f64() {
+; CHECK-LABEL: constrained_vector_trunc_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq trunc
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq trunc
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %trunc = call <2 x double> @llvm.experimental.constrained.trunc.v2f64(
+                                <2 x double> <double 1.1, double 1.9>,
+                                metadata !"round.dynamic",
+                                metadata !"fpexcept.strict")
+  ret <2 x double> %trunc
+}
+
+define <3 x float> @constrained_vector_trunc_v3f32() {
+; CHECK-LABEL: constrained_vector_trunc_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq truncf
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq truncf
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq truncf
+; CHECK-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %trunc = call <3 x float> @llvm.experimental.constrained.trunc.v3f32(
+                              <3 x float> <float 1.5, float 2.5, float 3.5>,
+                              metadata !"round.dynamic",
+                              metadata !"fpexcept.strict")
+  ret <3 x float> %trunc
+}
+
+define <3 x double> @constrained_vector_trunc_v3f64() {
+; CHECK-LABEL: constrained_vector_trunc_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq trunc
+; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq trunc
+; CHECK-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq trunc
+; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    # xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %trunc = call <3 x double> @llvm.experimental.constrained.trunc.v3f64(
+                          <3 x double> <double 1.1, double 1.9, double 1.5>,
+                          metadata !"round.dynamic",
+                          metadata !"fpexcept.strict")
+  ret <3 x double> %trunc
+}
+
+
 ; Single width declarations
 declare <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double>, <2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double>, <2 x double>, metadata, metadata)
@@ -2443,6 +2846,10 @@ declare <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double>, met
 declare <2 x double> @llvm.experimental.constrained.nearbyint.v2f64(<2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.maxnum.v2f64(<2 x double>, <2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.minnum.v2f64(<2 x double>, <2 x double>, metadata, metadata)
+declare <2 x double> @llvm.experimental.constrained.ceil.v2f64(<2 x double>, metadata, metadata)
+declare <2 x double> @llvm.experimental.constrained.floor.v2f64(<2 x double>, metadata, metadata)
+declare <2 x double> @llvm.experimental.constrained.round.v2f64(<2 x double>, metadata, metadata)
+declare <2 x double> @llvm.experimental.constrained.trunc.v2f64(<2 x double>, metadata, metadata)
 
 ; Scalar width declarations
 declare <1 x float> @llvm.experimental.constrained.fadd.v1f32(<1 x float>, <1 x float>, metadata, metadata)
@@ -2464,6 +2871,10 @@ declare <1 x float> @llvm.experimental.constrained.rint.v1f32(<1 x float>, metad
 declare <1 x float> @llvm.experimental.constrained.nearbyint.v1f32(<1 x float>, metadata, metadata)
 declare <1 x float> @llvm.experimental.constrained.maxnum.v1f32(<1 x float>, <1 x float>, metadata, metadata)
 declare <1 x float> @llvm.experimental.constrained.minnum.v1f32(<1 x float>, <1 x float>, metadata, metadata)
+declare <1 x float> @llvm.experimental.constrained.ceil.v1f32(<1 x float>, metadata, metadata)
+declare <1 x float> @llvm.experimental.constrained.floor.v1f32(<1 x float>, metadata, metadata)
+declare <1 x float> @llvm.experimental.constrained.round.v1f32(<1 x float>, metadata, metadata)
+declare <1 x float> @llvm.experimental.constrained.trunc.v1f32(<1 x float>, metadata, metadata)
 
 ; Illegal width declarations
 declare <3 x float> @llvm.experimental.constrained.fadd.v3f32(<3 x float>, <3 x float>, metadata, metadata)
@@ -2504,6 +2915,14 @@ declare <3 x float> @llvm.experimental.constrained.maxnum.v3f32(<3 x float>, <3
 declare <3 x double> @llvm.experimental.constrained.maxnum.v3f64(<3 x double>, <3 x double>, metadata, metadata)
 declare <3 x float> @llvm.experimental.constrained.minnum.v3f32(<3 x float>, <3 x float>, metadata, metadata)
 declare <3 x double> @llvm.experimental.constrained.minnum.v3f64(<3 x double>, <3 x double>, metadata, metadata)
+declare <3 x float> @llvm.experimental.constrained.ceil.v3f32(<3 x float>, metadata, metadata)
+declare <3 x double> @llvm.experimental.constrained.ceil.v3f64(<3 x double>, metadata, metadata)
+declare <3 x float> @llvm.experimental.constrained.floor.v3f32(<3 x float>, metadata, metadata)
+declare <3 x double> @llvm.experimental.constrained.floor.v3f64(<3 x double>, metadata, metadata)
+declare <3 x float> @llvm.experimental.constrained.round.v3f32(<3 x float>, metadata, metadata)
+declare <3 x double> @llvm.experimental.constrained.round.v3f64(<3 x double>, metadata, metadata)
+declare <3 x float> @llvm.experimental.constrained.trunc.v3f32(<3 x float>, metadata, metadata)
+declare <3 x double> @llvm.experimental.constrained.trunc.v3f64(<3 x double>, metadata, metadata)
 
 ; Double width declarations
 declare <4 x double> @llvm.experimental.constrained.fadd.v4f64(<4 x double>, <4 x double>, metadata, metadata)
@@ -2525,3 +2944,8 @@ declare <4 x double> @llvm.experimental.constrained.rint.v4f64(<4 x double>, met
 declare <4 x double> @llvm.experimental.constrained.nearbyint.v4f64(<4 x double>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.maxnum.v4f64(<4 x double>, <4 x double>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.minnum.v4f64(<4 x double>, <4 x double>, metadata, metadata)
+declare <4 x double> @llvm.experimental.constrained.ceil.v4f64(<4 x double>, metadata, metadata)
+declare <4 x double> @llvm.experimental.constrained.floor.v4f64(<4 x double>, metadata, metadata)
+declare <4 x double> @llvm.experimental.constrained.round.v4f64(<4 x double>, metadata, metadata)
+declare <4 x double> @llvm.experimental.constrained.trunc.v4f64(<4 x double>, metadata, metadata)
+
-- 
GitLab


From 36d0612088f8df1665f799e0c5c4f9c0b8913bcc Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Mon, 5 Nov 2018 16:27:03 +0000
Subject: [PATCH 0969/1116] [InstCombine] adjust tests for select with FP
 identity op; NFC

These are mislabeled as negative tests.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346142 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstCombine/select-binop-cmp.ll           | 62 ++++++++++---------
 1 file changed, 32 insertions(+), 30 deletions(-)

diff --git a/test/Transforms/InstCombine/select-binop-cmp.ll b/test/Transforms/InstCombine/select-binop-cmp.ll
index 1604522293c..a2ddfb40f37 100644
--- a/test/Transforms/InstCombine/select-binop-cmp.ll
+++ b/test/Transforms/InstCombine/select-binop-cmp.ll
@@ -152,6 +152,21 @@ define float @select_fadd_fcmp(float %x, float %y, float %z) {
   ret float %C
 }
 
+; TODO: This is logically equivalent to the previous test - fcmp ignores the sign of 0.0.
+
+define float @select_fadd_fcmp_poszero(float %x, float %y, float %z) {
+; CHECK-LABEL: @select_fadd_fcmp_poszero(
+; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
+; CHECK-NEXT:    [[B:%.*]] = fadd nsz float [[Z:%.*]], [[X]]
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[B]], float [[Y:%.*]]
+; CHECK-NEXT:    ret float [[C]]
+;
+  %A = fcmp oeq float %x, 0.0
+  %B = fadd nsz float %z, %x
+  %C = select i1 %A, float %B, float %y
+  ret float %C
+}
+
 define float @select_fadd_fcmp_2(float %x, float %y, float %v) {
 ; CHECK-LABEL: @select_fadd_fcmp_2(
 ; CHECK-NEXT:    [[A:%.*]] = fcmp une float [[X:%.*]], -0.000000e+00
@@ -222,6 +237,21 @@ define float @select_fadd_fcmp_4(float %x, float %y, float %z) {
   ret float %C
 }
 
+; TODO: This is logically equivalent to the previous test - fcmp ignores the sign of 0.0.
+
+define float @select_fadd_fcmp_4_poszero(float %x, float %y, float %z) {
+; CHECK-LABEL: @select_fadd_fcmp_4_poszero(
+; CHECK-NEXT:    [[A:%.*]] = fcmp une float [[X:%.*]], 0.000000e+00
+; CHECK-NEXT:    [[B:%.*]] = fadd nsz float [[Z:%.*]], [[X]]
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Y:%.*]], float [[B]]
+; CHECK-NEXT:    ret float [[C]]
+;
+  %A = fcmp une float %x, 0.0
+  %B = fadd nsz float %z, %x
+  %C = select i1 %A, float %y, float %B
+  ret float %C
+}
+
 define float @select_fadd_fcmp_5(float %x, float %y, float %v) {
 ; CHECK-LABEL: @select_fadd_fcmp_5(
 ; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], -0.000000e+00
@@ -808,34 +838,6 @@ define float @select_fadd_fcmp_bad_14(float %x, float %y, float %z) {
   ret float %C
 }
 
-; Invalid identity constant for FP op
-define float @select_fadd_fcmp_bad_15(float %x, float %y, float %z) {
-; CHECK-LABEL: @select_fadd_fcmp_bad_15(
-; CHECK-NEXT:    [[A:%.*]] = fcmp une float [[X:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[B:%.*]] = fadd nsz float [[Z:%.*]], [[X]]
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Y:%.*]], float [[B]]
-; CHECK-NEXT:    ret float [[C]]
-;
-  %A = fcmp une float %x, 0.0
-  %B = fadd nsz float %z, %x
-  %C = select i1 %A, float %y, float %B
-  ret float %C
-}
-
-; Invalid identity constant for FP op
-define float @select_fadd_fcmp_bad_16(float %x, float %y, float %z) {
-; CHECK-LABEL: @select_fadd_fcmp_bad_16(
-; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[B:%.*]] = fadd nsz float [[Z:%.*]], [[X]]
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[B]], float [[Y:%.*]]
-; CHECK-NEXT:    ret float [[C]]
-;
-  %A = fcmp oeq float %x, 0.0
-  %B = fadd nsz float %z, %x
-  %C = select i1 %A, float %B, float %y
-  ret float %C
-}
-
 define float @select_fmul_fcmp_bad(float %x, float %y, float %z) {
 ; CHECK-LABEL: @select_fmul_fcmp_bad(
 ; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], 3.000000e+00
@@ -914,8 +916,8 @@ define float @select_fdiv_fcmp_bad_2(float %x, float %y, float %z) {
   ret float %C
 }
 
-; The transform is not valid when x = -0.0 and z = -0.0 
-; (optimized code would return -0.0, but this returns +0.0). 
+; The transform is not valid when x = -0.0 and z = -0.0
+; (optimized code would return -0.0, but this returns +0.0).
 
 define float @select_fsub_fcmp_bad(float %x, float %y, float %z) {
 ; CHECK-LABEL: @select_fsub_fcmp_bad(
-- 
GitLab


From 60a9b3360daa6081d41a6a83bbf2b9b2e345c135 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Mon, 5 Nov 2018 16:50:44 +0000
Subject: [PATCH 0970/1116] [InstCombine] loosen FP 0.0 constraint for
 fcmp+select substitution

It looks like we correctly removed edge cases with 0.0 from D50714,
but we were a bit conservative because getBinOpIdentity() doesn't
distinguish between +0.0 and -0.0 and 'nsz' is effectively always
true for fcmp (see discussion in:
https://bugs.llvm.org/show_bug.cgi?id=38086

Without this change, we would get regressions by canonicalizing
to +0.0 in all fcmp, and that's a step towards solving:
https://bugs.llvm.org/show_bug.cgi?id=39475


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346143 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstCombine/InstCombineSelect.cpp         | 17 ++++++---
 .../InstCombine/select-binop-cmp.ll           | 35 ++++++++-----------
 2 files changed, 27 insertions(+), 25 deletions(-)

diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 00dcacccb40..724662f0128 100644
--- a/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -75,13 +75,22 @@ static Instruction *foldSelectBinOpIdentity(SelectInst &Sel,
   else
     return nullptr;
 
-  // A select operand must be a binop, and the compare constant must be the
-  // identity constant for that binop.
+  // A select operand must be a binop.
   BinaryOperator *BO;
-  if (!match(Sel.getOperand(IsEq ? 1 : 2), m_BinOp(BO)) ||
-      ConstantExpr::getBinOpIdentity(BO->getOpcode(), BO->getType(), true) != C)
+  if (!match(Sel.getOperand(IsEq ? 1 : 2), m_BinOp(BO)))
     return nullptr;
 
+  // The compare constant must be the identity constant for that binop.
+  // If this a floating-point compare with 0.0, any zero constant will do.
+  Type *Ty = BO->getType();
+  Constant *IdC = ConstantExpr::getBinOpIdentity(BO->getOpcode(), Ty, true);
+  if (IdC != C) {
+    if (!IdC || !CmpInst::isFPPredicate(Pred))
+      return nullptr;
+    if (!match(IdC, m_AnyZeroFP()) || !match(C, m_AnyZeroFP()))
+      return nullptr;
+  }
+
   // Last, match the compare variable operand with a binop operand.
   Value *Y;
   if (!BO->isCommutative() && !match(BO, m_BinOp(m_Value(Y), m_Specific(X))))
diff --git a/test/Transforms/InstCombine/select-binop-cmp.ll b/test/Transforms/InstCombine/select-binop-cmp.ll
index a2ddfb40f37..c7361fa040e 100644
--- a/test/Transforms/InstCombine/select-binop-cmp.ll
+++ b/test/Transforms/InstCombine/select-binop-cmp.ll
@@ -152,13 +152,12 @@ define float @select_fadd_fcmp(float %x, float %y, float %z) {
   ret float %C
 }
 
-; TODO: This is logically equivalent to the previous test - fcmp ignores the sign of 0.0.
+; This is logically equivalent to the previous test - fcmp ignores the sign of 0.0.
 
 define float @select_fadd_fcmp_poszero(float %x, float %y, float %z) {
 ; CHECK-LABEL: @select_fadd_fcmp_poszero(
 ; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[B:%.*]] = fadd nsz float [[Z:%.*]], [[X]]
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[B]], float [[Y:%.*]]
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Z:%.*]], float [[Y:%.*]]
 ; CHECK-NEXT:    ret float [[C]]
 ;
   %A = fcmp oeq float %x, 0.0
@@ -181,14 +180,13 @@ define float @select_fadd_fcmp_2(float %x, float %y, float %v) {
   ret float %C
 }
 
-; TODO: This is logically equivalent to the previous test - fcmp ignores the sign of 0.0.
+; This is logically equivalent to the previous test - fcmp ignores the sign of 0.0.
 
 define float @select_fadd_fcmp_2_poszero(float %x, float %y, float %v) {
 ; CHECK-LABEL: @select_fadd_fcmp_2_poszero(
 ; CHECK-NEXT:    [[A:%.*]] = fcmp une float [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[Z:%.*]] = fadd float [[V:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[B:%.*]] = fadd float [[Z]], [[X]]
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Y:%.*]], float [[B]]
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Y:%.*]], float [[Z]]
 ; CHECK-NEXT:    ret float [[C]]
 ;
   %A = fcmp une float %x, 0.0
@@ -210,13 +208,12 @@ define float @select_fadd_fcmp_3(float %x, float %y) {
   ret float %C
 }
 
-; TODO: This is logically equivalent to the previous test - fcmp ignores the sign of 0.0.
+; This is logically equivalent to the previous test - fcmp ignores the sign of 0.0.
 
 define float @select_fadd_fcmp_3_poszero(float %x, float %y) {
 ; CHECK-LABEL: @select_fadd_fcmp_3_poszero(
 ; CHECK-NEXT:    [[A:%.*]] = fcmp une float [[X:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[B:%.*]] = fadd float [[X]], 6.000000e+00
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Y:%.*]], float [[B]]
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Y:%.*]], float 6.000000e+00
 ; CHECK-NEXT:    ret float [[C]]
 ;
   %A = fcmp une float %x, 0.0
@@ -237,13 +234,12 @@ define float @select_fadd_fcmp_4(float %x, float %y, float %z) {
   ret float %C
 }
 
-; TODO: This is logically equivalent to the previous test - fcmp ignores the sign of 0.0.
+; This is logically equivalent to the previous test - fcmp ignores the sign of 0.0.
 
 define float @select_fadd_fcmp_4_poszero(float %x, float %y, float %z) {
 ; CHECK-LABEL: @select_fadd_fcmp_4_poszero(
 ; CHECK-NEXT:    [[A:%.*]] = fcmp une float [[X:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[B:%.*]] = fadd nsz float [[Z:%.*]], [[X]]
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Y:%.*]], float [[B]]
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Y:%.*]], float [[Z:%.*]]
 ; CHECK-NEXT:    ret float [[C]]
 ;
   %A = fcmp une float %x, 0.0
@@ -266,14 +262,13 @@ define float @select_fadd_fcmp_5(float %x, float %y, float %v) {
   ret float %C
 }
 
-; TODO: This is logically equivalent to the previous test - fcmp ignores the sign of 0.0.
+; This is logically equivalent to the previous test - fcmp ignores the sign of 0.0.
 
 define float @select_fadd_fcmp_5_poszero(float %x, float %y, float %v) {
 ; CHECK-LABEL: @select_fadd_fcmp_5_poszero(
 ; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[Z:%.*]] = fadd float [[V:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[B:%.*]] = fadd float [[Z]], [[X]]
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[B]], float [[Y:%.*]]
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Z]], float [[Y:%.*]]
 ; CHECK-NEXT:    ret float [[C]]
 ;
   %A = fcmp oeq float %x, 0.0
@@ -295,13 +290,12 @@ define float @select_fadd_fcmp_6(float %x, float %y, float %z) {
   ret float %C
 }
 
-; TODO: This is logically equivalent to the previous test - fcmp ignores the sign of 0.0.
+; This is logically equivalent to the previous test - fcmp ignores the sign of 0.0.
 
 define float @select_fadd_fcmp_6_poszero(float %x, float %y, float %z) {
 ; CHECK-LABEL: @select_fadd_fcmp_6_poszero(
 ; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[B:%.*]] = fadd float [[X]], 6.000000e+00
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[B]], float [[Y:%.*]]
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float 6.000000e+00, float [[Y:%.*]]
 ; CHECK-NEXT:    ret float [[C]]
 ;
   %A = fcmp oeq float %x, 0.0
@@ -334,13 +328,12 @@ define float @select_fsub_fcmp(float %x, float %y, float %z) {
   ret float %C
 }
 
-; TODO: This is logically equivalent to the previous test - fcmp ignores the sign of 0.0.
+; This is logically equivalent to the previous test - fcmp ignores the sign of 0.0.
 
 define float @select_fsub_fcmp_negzero(float %x, float %y, float %z) {
 ; CHECK-LABEL: @select_fsub_fcmp_negzero(
 ; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], -0.000000e+00
-; CHECK-NEXT:    [[B:%.*]] = fsub nsz float [[Z:%.*]], [[X]]
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[B]], float [[Y:%.*]]
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Z:%.*]], float [[Y:%.*]]
 ; CHECK-NEXT:    ret float [[C]]
 ;
   %A = fcmp oeq float %x, -0.0
-- 
GitLab


From ab049d88fa9a70f1f4498f07594803b3c5d05d44 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Mon, 5 Nov 2018 17:26:42 +0000
Subject: [PATCH 0971/1116] [InstCombine] canonicalize -0.0 to +0.0 in fcmp

As stated in IEEE-754 and discussed in:
https://bugs.llvm.org/show_bug.cgi?id=38086
...the sign of zero does not affect any FP compare predicate.

Known regressions were fixed with:
rL346097 (D54001)
rL346143

The transform will help reduce pattern-matching complexity to solve:
https://bugs.llvm.org/show_bug.cgi?id=39475
...as well as improve CSE and codegen (a zero constant is almost always
easier to produce than 0x80..00).


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346147 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstCombine/InstCombineCompares.cpp       |  7 ++++
 test/Transforms/InstCombine/fcmp-special.ll   | 12 +++----
 test/Transforms/InstCombine/fcmp.ll           |  2 +-
 test/Transforms/InstCombine/minmax-fp.ll      |  2 +-
 .../InstCombine/select-binop-cmp.ll           | 34 +++++++++----------
 5 files changed, 32 insertions(+), 25 deletions(-)

diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 9155ad12598..059f7523ff9 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -5397,6 +5397,13 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
         return nullptr;
     }
 
+  // The sign of 0.0 is ignored by fcmp, so canonicalize to +0.0:
+  // fcmp Pred X, -0.0 --> fcmp Pred X, 0.0
+  if (match(Op1, m_AnyZeroFP()) && !match(Op1, m_PosZeroFP())) {
+    I.setOperand(1, ConstantFP::getNullValue(Op1->getType()));
+    return &I;
+  }
+
   // Handle fcmp with instruction LHS and constant RHS.
   Instruction *LHSI;
   Constant *RHSC;
diff --git a/test/Transforms/InstCombine/fcmp-special.ll b/test/Transforms/InstCombine/fcmp-special.ll
index 5d4cc9a8616..490dab5f24d 100644
--- a/test/Transforms/InstCombine/fcmp-special.ll
+++ b/test/Transforms/InstCombine/fcmp-special.ll
@@ -190,7 +190,7 @@ define i1 @nnan_ops_to_fcmp_uno(float %x, float %y) {
 
 define i1 @negative_zero_oeq(float %x) {
 ; CHECK-LABEL: @negative_zero_oeq(
-; CHECK-NEXT:    [[R:%.*]] = fcmp oeq float [[X:%.*]], -0.000000e+00
+; CHECK-NEXT:    [[R:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %r = fcmp oeq float %x, -0.0
@@ -199,7 +199,7 @@ define i1 @negative_zero_oeq(float %x) {
 
 define i1 @negative_zero_oge(double %x) {
 ; CHECK-LABEL: @negative_zero_oge(
-; CHECK-NEXT:    [[R:%.*]] = fcmp nnan oge double [[X:%.*]], -0.000000e+00
+; CHECK-NEXT:    [[R:%.*]] = fcmp nnan oge double [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %r = fcmp nnan oge double %x, -0.0
@@ -208,7 +208,7 @@ define i1 @negative_zero_oge(double %x) {
 
 define i1 @negative_zero_uge(half %x) {
 ; CHECK-LABEL: @negative_zero_uge(
-; CHECK-NEXT:    [[R:%.*]] = fcmp fast uge half [[X:%.*]], 0xH8000
+; CHECK-NEXT:    [[R:%.*]] = fcmp fast uge half [[X:%.*]], 0xH0000
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %r = fcmp fast uge half %x, -0.0
@@ -217,7 +217,7 @@ define i1 @negative_zero_uge(half %x) {
 
 define <2 x i1> @negative_zero_olt_vec(<2 x float> %x) {
 ; CHECK-LABEL: @negative_zero_olt_vec(
-; CHECK-NEXT:    [[R:%.*]] = fcmp reassoc ninf olt <2 x float> [[X:%.*]], <float -0.000000e+00, float -0.000000e+00>
+; CHECK-NEXT:    [[R:%.*]] = fcmp reassoc ninf olt <2 x float> [[X:%.*]], zeroinitializer
 ; CHECK-NEXT:    ret <2 x i1> [[R]]
 ;
   %r = fcmp reassoc ninf olt <2 x float> %x, <float -0.0, float -0.0>
@@ -226,7 +226,7 @@ define <2 x i1> @negative_zero_olt_vec(<2 x float> %x) {
 
 define <2 x i1> @negative_zero_une_vec_undef(<2 x double> %x) {
 ; CHECK-LABEL: @negative_zero_une_vec_undef(
-; CHECK-NEXT:    [[R:%.*]] = fcmp nnan une <2 x double> [[X:%.*]], <double -0.000000e+00, double undef>
+; CHECK-NEXT:    [[R:%.*]] = fcmp nnan une <2 x double> [[X:%.*]], zeroinitializer
 ; CHECK-NEXT:    ret <2 x i1> [[R]]
 ;
   %r = fcmp nnan une <2 x double> %x, <double -0.0, double undef>
@@ -235,7 +235,7 @@ define <2 x i1> @negative_zero_une_vec_undef(<2 x double> %x) {
 
 define <2 x i1> @negative_zero_ule_vec_mixed(<2 x float> %x) {
 ; CHECK-LABEL: @negative_zero_ule_vec_mixed(
-; CHECK-NEXT:    [[R:%.*]] = fcmp ule <2 x float> [[X:%.*]], <float 0.000000e+00, float -0.000000e+00>
+; CHECK-NEXT:    [[R:%.*]] = fcmp ule <2 x float> [[X:%.*]], zeroinitializer
 ; CHECK-NEXT:    ret <2 x i1> [[R]]
 ;
   %r = fcmp ule <2 x float> %x, <float 0.0, float -0.0>
diff --git a/test/Transforms/InstCombine/fcmp.ll b/test/Transforms/InstCombine/fcmp.ll
index c19aae4c03b..48c12a300a3 100644
--- a/test/Transforms/InstCombine/fcmp.ll
+++ b/test/Transforms/InstCombine/fcmp.ll
@@ -360,7 +360,7 @@ define i1 @test26_recipX_unorderd(float %X) {
 ; Fold <-1.0, -1.0> / X > <-0.0, -0.0>
 define <2 x i1> @test27_recipX_gt_vecsplat(<2 x float> %X) {
 ; CHECK-LABEL: @test27_recipX_gt_vecsplat(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ninf olt <2 x float> [[X:%.*]], <float -0.000000e+00, float -0.000000e+00>
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ninf olt <2 x float> [[X:%.*]], zeroinitializer
 ; CHECK-NEXT:    ret <2 x i1> [[CMP]]
 ;
   %div = fdiv ninf <2 x float> <float -1.0, float -1.0>, %X
diff --git a/test/Transforms/InstCombine/minmax-fp.ll b/test/Transforms/InstCombine/minmax-fp.ll
index 7bf8f57d4e8..11418156a48 100644
--- a/test/Transforms/InstCombine/minmax-fp.ll
+++ b/test/Transforms/InstCombine/minmax-fp.ll
@@ -78,7 +78,7 @@ define double @t6(float %a) {
 
 define double @t7(float %a) {
 ; CHECK-LABEL: @t7(
-; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp oge float [[A:%.*]], -0.000000e+00
+; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp oge float [[A:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[DOTINV]], float -0.000000e+00, float [[A]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = fpext float [[TMP1]] to double
 ; CHECK-NEXT:    ret double [[TMP2]]
diff --git a/test/Transforms/InstCombine/select-binop-cmp.ll b/test/Transforms/InstCombine/select-binop-cmp.ll
index c7361fa040e..a473acd7304 100644
--- a/test/Transforms/InstCombine/select-binop-cmp.ll
+++ b/test/Transforms/InstCombine/select-binop-cmp.ll
@@ -142,7 +142,7 @@ define i32 @select_xor_inv_icmp2(i32 %x, i32 %y, i32 %z) {
 
 define float @select_fadd_fcmp(float %x, float %y, float %z) {
 ; CHECK-LABEL: @select_fadd_fcmp(
-; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], -0.000000e+00
+; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Z:%.*]], float [[Y:%.*]]
 ; CHECK-NEXT:    ret float [[C]]
 ;
@@ -168,7 +168,7 @@ define float @select_fadd_fcmp_poszero(float %x, float %y, float %z) {
 
 define float @select_fadd_fcmp_2(float %x, float %y, float %v) {
 ; CHECK-LABEL: @select_fadd_fcmp_2(
-; CHECK-NEXT:    [[A:%.*]] = fcmp une float [[X:%.*]], -0.000000e+00
+; CHECK-NEXT:    [[A:%.*]] = fcmp une float [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[Z:%.*]] = fadd float [[V:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Y:%.*]], float [[Z]]
 ; CHECK-NEXT:    ret float [[C]]
@@ -198,7 +198,7 @@ define float @select_fadd_fcmp_2_poszero(float %x, float %y, float %v) {
 
 define float @select_fadd_fcmp_3(float %x, float %y) {
 ; CHECK-LABEL: @select_fadd_fcmp_3(
-; CHECK-NEXT:    [[A:%.*]] = fcmp une float [[X:%.*]], -0.000000e+00
+; CHECK-NEXT:    [[A:%.*]] = fcmp une float [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Y:%.*]], float 6.000000e+00
 ; CHECK-NEXT:    ret float [[C]]
 ;
@@ -224,7 +224,7 @@ define float @select_fadd_fcmp_3_poszero(float %x, float %y) {
 
 define float @select_fadd_fcmp_4(float %x, float %y, float %z) {
 ; CHECK-LABEL: @select_fadd_fcmp_4(
-; CHECK-NEXT:    [[A:%.*]] = fcmp une float [[X:%.*]], -0.000000e+00
+; CHECK-NEXT:    [[A:%.*]] = fcmp une float [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Y:%.*]], float [[Z:%.*]]
 ; CHECK-NEXT:    ret float [[C]]
 ;
@@ -250,7 +250,7 @@ define float @select_fadd_fcmp_4_poszero(float %x, float %y, float %z) {
 
 define float @select_fadd_fcmp_5(float %x, float %y, float %v) {
 ; CHECK-LABEL: @select_fadd_fcmp_5(
-; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], -0.000000e+00
+; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[Z:%.*]] = fadd float [[V:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Z]], float [[Y:%.*]]
 ; CHECK-NEXT:    ret float [[C]]
@@ -280,7 +280,7 @@ define float @select_fadd_fcmp_5_poszero(float %x, float %y, float %v) {
 
 define float @select_fadd_fcmp_6(float %x, float %y, float %z) {
 ; CHECK-LABEL: @select_fadd_fcmp_6(
-; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], -0.000000e+00
+; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float 6.000000e+00, float [[Y:%.*]]
 ; CHECK-NEXT:    ret float [[C]]
 ;
@@ -332,7 +332,7 @@ define float @select_fsub_fcmp(float %x, float %y, float %z) {
 
 define float @select_fsub_fcmp_negzero(float %x, float %y, float %z) {
 ; CHECK-LABEL: @select_fsub_fcmp_negzero(
-; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], -0.000000e+00
+; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Z:%.*]], float [[Y:%.*]]
 ; CHECK-NEXT:    ret float [[C]]
 ;
@@ -675,7 +675,7 @@ define float @select_fadd_fcmp_bad_3(float %x, float %y, float %z, float %k) {
 ; Invalid order of operands of select
 define float @select_fadd_fcmp_bad_4(float %x, float %y, float %z) {
 ; CHECK-LABEL: @select_fadd_fcmp_bad_4(
-; CHECK-NEXT:    [[A:%.*]] = fcmp une float [[X:%.*]], -0.000000e+00
+; CHECK-NEXT:    [[A:%.*]] = fcmp une float [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[B:%.*]] = fadd float [[X]], [[Z:%.*]]
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[B]], float [[Y:%.*]]
 ; CHECK-NEXT:    ret float [[C]]
@@ -689,7 +689,7 @@ define float @select_fadd_fcmp_bad_4(float %x, float %y, float %z) {
 ; Invalid comparison type
 define float @select_fadd_fcmp_bad_5(float %x, float %y, float %z) {
 ; CHECK-LABEL: @select_fadd_fcmp_bad_5(
-; CHECK-NEXT:    [[A:%.*]] = fcmp one float [[X:%.*]], -0.000000e+00
+; CHECK-NEXT:    [[A:%.*]] = fcmp one float [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[B:%.*]] = fadd nsz float [[Z:%.*]], [[X]]
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Y:%.*]], float [[B]]
 ; CHECK-NEXT:    ret float [[C]]
@@ -703,7 +703,7 @@ define float @select_fadd_fcmp_bad_5(float %x, float %y, float %z) {
 ; Invalid order of operands of select
 define float @select_fadd_fcmp_bad_6(float %x, float %y, float %z) {
 ; CHECK-LABEL: @select_fadd_fcmp_bad_6(
-; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], -0.000000e+00
+; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[B:%.*]] = fadd nsz float [[Z:%.*]], [[X]]
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Y:%.*]], float [[B]]
 ; CHECK-NEXT:    ret float [[C]]
@@ -717,7 +717,7 @@ define float @select_fadd_fcmp_bad_6(float %x, float %y, float %z) {
 ; Do not transform if we have signed zeros and if Z is possibly negative zero
 define float @select_fadd_fcmp_bad_7(float %x, float %y, float %z) {
 ; CHECK-LABEL: @select_fadd_fcmp_bad_7(
-; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], -0.000000e+00
+; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[B:%.*]] = fadd float [[X]], [[Z:%.*]]
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[B]], float [[Y:%.*]]
 ; CHECK-NEXT:    ret float [[C]]
@@ -731,7 +731,7 @@ define float @select_fadd_fcmp_bad_7(float %x, float %y, float %z) {
 ; Invalid comparison type
 define float @select_fadd_fcmp_bad_8(float %x, float %y, float %v) {
 ; CHECK-LABEL: @select_fadd_fcmp_bad_8(
-; CHECK-NEXT:    [[A:%.*]] = fcmp one float [[X:%.*]], -0.000000e+00
+; CHECK-NEXT:    [[A:%.*]] = fcmp one float [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[Z:%.*]] = fadd float [[V:%.*]], -1.000000e+00
 ; CHECK-NEXT:    [[B:%.*]] = fadd float [[Z]], [[X]]
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Y:%.*]], float [[B]]
@@ -747,7 +747,7 @@ define float @select_fadd_fcmp_bad_8(float %x, float %y, float %v) {
 ; Invalid comparison type
 define float @select_fadd_fcmp_bad_9(float %x, float %y, float %z) {
 ; CHECK-LABEL: @select_fadd_fcmp_bad_9(
-; CHECK-NEXT:    [[A:%.*]] = fcmp one float [[X:%.*]], -0.000000e+00
+; CHECK-NEXT:    [[A:%.*]] = fcmp one float [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[B:%.*]] = fadd nsz float [[Z:%.*]], [[X]]
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Y:%.*]], float [[B]]
 ; CHECK-NEXT:    ret float [[C]]
@@ -761,7 +761,7 @@ define float @select_fadd_fcmp_bad_9(float %x, float %y, float %z) {
 ; Invalid comparison type
 define float @select_fadd_fcmp_bad_10(float %x, float %y, float %v) {
 ; CHECK-LABEL: @select_fadd_fcmp_bad_10(
-; CHECK-NEXT:    [[A:%.*]] = fcmp one float [[X:%.*]], -0.000000e+00
+; CHECK-NEXT:    [[A:%.*]] = fcmp one float [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[Z:%.*]] = fadd float [[V:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[B:%.*]] = fadd float [[Z]], [[X]]
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Y:%.*]], float [[B]]
@@ -777,7 +777,7 @@ define float @select_fadd_fcmp_bad_10(float %x, float %y, float %v) {
 ; Do not transform if Z is possibly negative zero
 define float @select_fadd_fcmp_bad_11(float %x, float %y, float %v) {
 ; CHECK-LABEL: @select_fadd_fcmp_bad_11(
-; CHECK-NEXT:    [[A:%.*]] = fcmp une float [[X:%.*]], -0.000000e+00
+; CHECK-NEXT:    [[A:%.*]] = fcmp une float [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[Z:%.*]] = fadd float [[V:%.*]], -1.000000e+00
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Y:%.*]], float [[Z]]
 ; CHECK-NEXT:    ret float [[C]]
@@ -792,7 +792,7 @@ define float @select_fadd_fcmp_bad_11(float %x, float %y, float %v) {
 ; Do not transform if we have signed zeros and if Z is possibly negative zero
 define float @select_fadd_fcmp_bad_12(float %x, float %y, float %z) {
 ; CHECK-LABEL: @select_fadd_fcmp_bad_12(
-; CHECK-NEXT:    [[A:%.*]] = fcmp une float [[X:%.*]], -0.000000e+00
+; CHECK-NEXT:    [[A:%.*]] = fcmp une float [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[B:%.*]] = fadd float [[Z:%.*]], [[X]]
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Y:%.*]], float [[B]]
 ; CHECK-NEXT:    ret float [[C]]
@@ -806,7 +806,7 @@ define float @select_fadd_fcmp_bad_12(float %x, float %y, float %z) {
 ; Invalid order of operands of select
 define float @select_fadd_fcmp_bad_13(float %x, float %y, float %z) {
 ; CHECK-LABEL: @select_fadd_fcmp_bad_13(
-; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], -0.000000e+00
+; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[B:%.*]] = fadd nsz float [[X]], [[Z:%.*]]
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Y:%.*]], float [[B]]
 ; CHECK-NEXT:    ret float [[C]]
-- 
GitLab


From 26835b9b0c8d6e7e6007fd374e1edb046c329bd3 Mon Sep 17 00:00:00 2001
From: Zaara Syeda <syzaara@ca.ibm.com>
Date: Mon, 5 Nov 2018 17:31:26 +0000
Subject: [PATCH 0972/1116] [Power9] Add support for stxvw4x.be and stxvd2x.be
 intrinsics

On Power9, we don't have patterns to select the following intrinsics:
llvm.ppc.vsx.stxvw4x.be
llvm.ppc.vsx.stxvd2x.be

This patch adds support for these.

Differential Revision: https://reviews.llvm.org/D53581

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346148 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/PowerPC/PPCInstrVSX.td    |  8 ++--
 test/CodeGen/PowerPC/vsx.ll          | 48 ------------------------
 test/CodeGen/PowerPC/vsx_builtins.ll | 56 ++++++++++++++++++++++++++++
 3 files changed, 60 insertions(+), 52 deletions(-)
 create mode 100644 test/CodeGen/PowerPC/vsx_builtins.ll

diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td
index 6a4586002b2..9d462df6fef 100644
--- a/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/lib/Target/PowerPC/PPCInstrVSX.td
@@ -1066,10 +1066,6 @@ let Predicates = [HasVSX, HasOnlySwappingMemOps] in {
   // Stores.
   def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst),
             (STXVD2X $rS, xoaddr:$dst)>;
-  def : Pat<(int_ppc_vsx_stxvd2x_be v2f64:$rS, xoaddr:$dst),
-            (STXVD2X $rS, xoaddr:$dst)>;
-  def : Pat<(int_ppc_vsx_stxvw4x_be v4i32:$rS, xoaddr:$dst),
-            (STXVW4X $rS, xoaddr:$dst)>;
   def : Pat<(PPCstxvd2x v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
 }
 let Predicates = [IsBigEndian, HasVSX, HasOnlySwappingMemOps] in {
@@ -1990,6 +1986,10 @@ let Predicates = [IsLittleEndian, HasVSX] in
   def : Pat<(f64 (vector_extract v2f64:$S, i64:$Idx)),
             (f64 VectorExtractions.LE_VARIABLE_DOUBLE)>;
 
+def : Pat<(int_ppc_vsx_stxvd2x_be v2f64:$rS, xoaddr:$dst),
+            (STXVD2X $rS, xoaddr:$dst)>;
+def : Pat<(int_ppc_vsx_stxvw4x_be v4i32:$rS, xoaddr:$dst),
+            (STXVW4X $rS, xoaddr:$dst)>;
 def : Pat<(v4i32 (int_ppc_vsx_lxvw4x_be xoaddr:$src)), (LXVW4X xoaddr:$src)>;
 def : Pat<(v2f64 (int_ppc_vsx_lxvd2x_be xoaddr:$src)), (LXVD2X xoaddr:$src)>;
 
diff --git a/test/CodeGen/PowerPC/vsx.ll b/test/CodeGen/PowerPC/vsx.ll
index 3df501db41f..d6a5ed37040 100644
--- a/test/CodeGen/PowerPC/vsx.ll
+++ b/test/CodeGen/PowerPC/vsx.ll
@@ -1211,51 +1211,3 @@ entry:
 ; CHECK-LE: xscmpudp cr0, f3, f4
 ; CHECK-LE: beqlr cr0
 }
-
-; Function Attrs: nounwind readnone
-define <4 x i32> @test83(i8* %a) {
-  entry:
-    %0 = tail call <4 x i32> @llvm.ppc.vsx.lxvw4x.be(i8* %a)
-      ret <4 x i32> %0
-; CHECK-LABEL: test83
-; CHECK: lxvw4x v2, 0, r3
-; CHECK: blr
-}
-; Function Attrs: nounwind readnone
-declare <4 x i32> @llvm.ppc.vsx.lxvw4x.be(i8*)
-
-; Function Attrs: nounwind readnone
-define <2 x double> @test84(i8* %a) {
-  entry:
-    %0 = tail call <2 x double> @llvm.ppc.vsx.lxvd2x.be(i8* %a)
-      ret <2 x double> %0
-; CHECK-LABEL: test84
-; CHECK: lxvd2x v2, 0, r3
-; CHECK: blr
-}
-; Function Attrs: nounwind readnone
-declare <2 x double> @llvm.ppc.vsx.lxvd2x.be(i8*)
-
-; Function Attrs: nounwind readnone
-define void @test85(<4 x i32> %a, i8* %b) {
-  entry:
-    tail call void @llvm.ppc.vsx.stxvw4x.be(<4 x i32> %a, i8* %b)
-    ret void
-; CHECK-LABEL: test85
-; CHECK: stxvw4x v2, 0, r5
-; CHECK: blr
-}
-; Function Attrs: nounwind readnone
-declare void @llvm.ppc.vsx.stxvw4x.be(<4 x i32>, i8*)
-
-; Function Attrs: nounwind readnone
-define void @test86(<2 x double> %a, i8* %b) {
-  entry:
-    tail call void @llvm.ppc.vsx.stxvd2x.be(<2 x double> %a, i8* %b)
-    ret void
-; CHECK-LABEL: test86
-; CHECK: stxvd2x v2, 0, r5
-; CHECK: blr
-}
-; Function Attrs: nounwind readnone
-declare void @llvm.ppc.vsx.stxvd2x.be(<2 x double>, i8*)
diff --git a/test/CodeGen/PowerPC/vsx_builtins.ll b/test/CodeGen/PowerPC/vsx_builtins.ll
new file mode 100644
index 00000000000..b386565500f
--- /dev/null
+++ b/test/CodeGen/PowerPC/vsx_builtins.ll
@@ -0,0 +1,56 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -relocation-model=static -verify-machineinstrs -mcpu=pwr9 \
+; RUN:     -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names < %s | FileCheck %s
+
+; Function Attrs: nounwind readnone
+define <4 x i32> @test1(i8* %a) {
+; CHECK-LABEL: test1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lxvw4x v2, 0, r3
+; CHECK-NEXT:    blr
+  entry:
+    %0 = tail call <4 x i32> @llvm.ppc.vsx.lxvw4x.be(i8* %a)
+      ret <4 x i32> %0
+}
+; Function Attrs: nounwind readnone
+declare <4 x i32> @llvm.ppc.vsx.lxvw4x.be(i8*)
+
+; Function Attrs: nounwind readnone
+define <2 x double> @test2(i8* %a) {
+; CHECK-LABEL: test2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lxvd2x v2, 0, r3
+; CHECK-NEXT:    blr
+  entry:
+    %0 = tail call <2 x double> @llvm.ppc.vsx.lxvd2x.be(i8* %a)
+      ret <2 x double> %0
+}
+; Function Attrs: nounwind readnone
+declare <2 x double> @llvm.ppc.vsx.lxvd2x.be(i8*)
+
+; Function Attrs: nounwind readnone
+define void @test3(<4 x i32> %a, i8* %b) {
+; CHECK-LABEL: test3:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    stxvw4x v2, 0, r5
+; CHECK-NEXT:    blr
+  entry:
+    tail call void @llvm.ppc.vsx.stxvw4x.be(<4 x i32> %a, i8* %b)
+    ret void
+}
+; Function Attrs: nounwind readnone
+declare void @llvm.ppc.vsx.stxvw4x.be(<4 x i32>, i8*)
+
+; Function Attrs: nounwind readnone
+define void @test4(<2 x double> %a, i8* %b) {
+; CHECK-LABEL: test4:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    stxvd2x v2, 0, r5
+; CHECK-NEXT:    blr
+  entry:
+    tail call void @llvm.ppc.vsx.stxvd2x.be(<2 x double> %a, i8* %b)
+    ret void
+}
+; Function Attrs: nounwind readnone
+declare void @llvm.ppc.vsx.stxvd2x.be(<2 x double>, i8*)
-- 
GitLab


From 513520e5af92af4db74e5711b62910a90e6eed18 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Mon, 5 Nov 2018 18:09:10 +0000
Subject: [PATCH 0973/1116] [InstCombine] add/adjust tests for fcmp+select
 substitution; NFC

There was no coverage for at least 2 out of the 4 patterns because
of fcmp canonicalization. The tests and code should be moved to
InstSimplify in a follow-up because this doesn't create any new values.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346150 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Transforms/InstCombine/fcmp-select.ll | 119 ++++++++++++++++-----
 1 file changed, 91 insertions(+), 28 deletions(-)

diff --git a/test/Transforms/InstCombine/fcmp-select.ll b/test/Transforms/InstCombine/fcmp-select.ll
index e04ab3e8923..7fc59bbcb7d 100644
--- a/test/Transforms/InstCombine/fcmp-select.ll
+++ b/test/Transforms/InstCombine/fcmp-select.ll
@@ -1,53 +1,116 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -instcombine -S | FileCheck %s
 
+declare void @use(i1)
+
+; X == 42.0 ? X : 42.0 --> 42.0
+
+define double @oeq(double %x) {
+; CHECK-LABEL: @oeq(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq double [[X:%.*]], 4.200000e+01
+; CHECK-NEXT:    call void @use(i1 [[CMP]])
+; CHECK-NEXT:    ret double 4.200000e+01
+;
+  %cmp = fcmp oeq double %x, 42.0
+  call void @use(i1 %cmp)      ; extra use to thwart predicate canonicalization
+  %cond = select i1 %cmp, double %x, double 42.0
+  ret double %cond
+}
+
+; X == 42.0 ? 42.0 : X --> X
+
+define float @oeq_swapped(float %x) {
+; CHECK-LABEL: @oeq_swapped(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq float [[X:%.*]], 4.200000e+01
+; CHECK-NEXT:    call void @use(i1 [[CMP]])
+; CHECK-NEXT:    ret float [[X]]
+;
+  %cmp = fcmp oeq float %x, 42.0
+  call void @use(i1 %cmp)      ; extra use to thwart predicate canonicalization
+  %cond = select i1 %cmp, float 42.0, float %x
+  ret float %cond
+}
+
 ; x != y ? x : y -> x if it's the right kind of != and at least
 ; one of x and y is not negative zero.
 
-; CHECK: f0
-; CHECK: ret double %x
-define double @f0(double %x) nounwind readnone {
-entry:
-  %cmp = fcmp une double %x, -1.0
-  %cond = select i1 %cmp, double %x, double -1.0
+; X != 42.0 ? X : 42.0 --> X
+
+define double @une(double %x) {
+; CHECK-LABEL: @une(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp une double [[X:%.*]], 4.200000e+01
+; CHECK-NEXT:    call void @use(i1 [[CMP]])
+; CHECK-NEXT:    ret double [[X]]
+;
+  %cmp = fcmp une double %x, 42.0
+  call void @use(i1 %cmp)      ; extra use to thwart predicate canonicalization
+  %cond = select i1 %cmp, double %x, double 42.0
   ret double %cond
 }
-; CHECK: f1
-; CHECK: ret double -1.000000e+00
-define double @f1(double %x) nounwind readnone {
-entry:
-  %cmp = fcmp une double %x, -1.0
-  %cond = select i1 %cmp, double -1.0, double %x
+
+; X != 42.0 ? 42.0 : X --> 42.0
+
+define double @une_swapped(double %x) {
+; CHECK-LABEL: @une_swapped(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp une double [[X:%.*]], 4.200000e+01
+; CHECK-NEXT:    call void @use(i1 [[CMP]])
+; CHECK-NEXT:    ret double 4.200000e+01
+;
+  %cmp = fcmp une double %x, 42.0
+  call void @use(i1 %cmp)      ; extra use to thwart predicate canonicalization
+  %cond = select i1 %cmp, double 42.0, double %x
   ret double %cond
 }
-; CHECK: f2
-; CHECK: ret double %cond
-define double @f2(double %x, double %y) nounwind readnone {
-entry:
+
+define double @une_could_be_negzero(double %x, double %y) {
+; CHECK-LABEL: @une_could_be_negzero(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp une double [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    call void @use(i1 [[CMP]])
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], double [[X]], double [[Y]]
+; CHECK-NEXT:    ret double [[COND]]
+;
   %cmp = fcmp une double %x, %y
+  call void @use(i1 %cmp)      ; extra use to thwart predicate canonicalization
   %cond = select i1 %cmp, double %x, double %y
   ret double %cond
 }
-; CHECK: f3
-; CHECK: ret double %cond
-define double @f3(double %x, double %y) nounwind readnone {
-entry:
+
+define double @une_swapped_could_be_negzero(double %x, double %y) {
+; CHECK-LABEL: @une_swapped_could_be_negzero(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp une double [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    call void @use(i1 [[CMP]])
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], double [[Y]], double [[X]]
+; CHECK-NEXT:    ret double [[COND]]
+;
   %cmp = fcmp une double %x, %y
+  call void @use(i1 %cmp)      ; extra use to thwart predicate canonicalization
   %cond = select i1 %cmp, double %y, double %x
   ret double %cond
 }
-; CHECK: f4
-; CHECK: ret double %cond
-define double @f4(double %x) nounwind readnone {
-entry:
+
+define double @one(double %x) {
+; CHECK-LABEL: @one(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp one double [[X:%.*]], -1.000000e+00
+; CHECK-NEXT:    call void @use(i1 [[CMP]])
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], double [[X]], double -1.000000e+00
+; CHECK-NEXT:    ret double [[COND]]
+;
   %cmp = fcmp one double %x, -1.0
+  call void @use(i1 %cmp)      ; extra use to thwart predicate canonicalization
   %cond = select i1 %cmp, double %x, double -1.0
   ret double %cond
 }
-; CHECK: f5
-; CHECK: ret double %cond
-define double @f5(double %x) nounwind readnone {
-entry:
+
+define double @one_swapped(double %x) {
+; CHECK-LABEL: @one_swapped(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp one double [[X:%.*]], -1.000000e+00
+; CHECK-NEXT:    call void @use(i1 [[CMP]])
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], double -1.000000e+00, double [[X]]
+; CHECK-NEXT:    ret double [[COND]]
+;
   %cmp = fcmp one double %x, -1.0
+  call void @use(i1 %cmp)      ; extra use to thwart predicate canonicalization
   %cond = select i1 %cmp, double -1.0, double %x
   ret double %cond
 }
+
-- 
GitLab


From 960c5993194b1be1a19c36a42f3e1a415075d673 Mon Sep 17 00:00:00 2001
From: Taewook Oh <twoh@fb.com>
Date: Mon, 5 Nov 2018 18:16:32 +0000
Subject: [PATCH 0974/1116] [MergeICmps] Do not perform the transformation if
 GEP is used outside of block

Summary:
This patch prevents MergeICmps to performn the transformation if the address operand GEP of the load instruction has a use outside of the load's parent block. Without this patch, compiler crashes with the given test case because the use of `%first.i` is still around when the basic block is erased from https://github.com/llvm-mirror/llvm/blob/master/lib/Transforms/Scalar/MergeICmps.cpp#L620. I think checking `isUsedOutsideOfBlock` with `GEP` is the original intention of the code, as the checking for `LoadI` is already performed in the same function.

This patch is incomplete though, as this makes the pass overly conservative and fails the test `tuple-four-int8.ll`. I believe what needs to be done is checking if GEP has a use outside of block that is not the part of "Comparisons" chain. Submit the patch as of now to prevent compiler crash.

Reviewers: courbet, trentxintong

Reviewed By: courbet

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D54089

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346151 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/MergeICmps.cpp          |  2 +-
 .../MergeICmps/X86/gep-used-outside.ll        | 36 +++++++++++++++++++
 .../MergeICmps/X86/tuple-four-int8.ll         |  1 +
 3 files changed, 38 insertions(+), 1 deletion(-)
 create mode 100644 test/Transforms/MergeICmps/X86/gep-used-outside.ll

diff --git a/lib/Transforms/Scalar/MergeICmps.cpp b/lib/Transforms/Scalar/MergeICmps.cpp
index 3633485d5d5..69fd8b163a0 100644
--- a/lib/Transforms/Scalar/MergeICmps.cpp
+++ b/lib/Transforms/Scalar/MergeICmps.cpp
@@ -98,7 +98,7 @@ BCEAtom visitICmpLoadOperand(Value *const Val) {
     Value *const Addr = LoadI->getOperand(0);
     if (auto *const GEP = dyn_cast<GetElementPtrInst>(Addr)) {
       LLVM_DEBUG(dbgs() << "GEP\n");
-      if (LoadI->isUsedOutsideOfBlock(LoadI->getParent())) {
+      if (GEP->isUsedOutsideOfBlock(LoadI->getParent())) {
         LLVM_DEBUG(dbgs() << "used outside of block\n");
         return {};
       }
diff --git a/test/Transforms/MergeICmps/X86/gep-used-outside.ll b/test/Transforms/MergeICmps/X86/gep-used-outside.ll
new file mode 100644
index 00000000000..9c944d52500
--- /dev/null
+++ b/test/Transforms/MergeICmps/X86/gep-used-outside.ll
@@ -0,0 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mergeicmps -mtriple=x86_64-unknown-unknown -S | FileCheck %s
+
+%"struct.std::pair" = type { i32, i32 }
+
+; Check that the transformation is avoided when GEP has a use outside of the
+; parant block of the load instruction.
+
+define zeroext i32 @opeq1(
+; CHECK-LABEL: @opeq1(
+; CHECK-NOT:    [[MEMCMP:%.*]] = call i32 @memcmp
+
+  %"struct.std::pair"* nocapture readonly dereferenceable(16) %a,
+  %"struct.std::pair"* nocapture readonly dereferenceable(16) %b) local_unnamed_addr #0 {
+entry:
+  %first.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 1 
+  %0 = load i32, i32* %first.i, align 4
+  %first1.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 1 
+  %1 = load i32, i32* %first1.i, align 4
+  %cmp.i = icmp eq i32 %0, %1
+  br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit
+
+land.rhs.i:
+  %second.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 0
+  %2 = load i32, i32* %second.i, align 4
+  %second2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 0
+  %3 = load i32, i32* %second2.i, align 4
+  %cmp3.i = icmp eq i32 %2, %3
+  br label %opeq1.exit
+
+opeq1.exit:
+  %4 = phi i1 [ false, %entry ], [ %cmp3.i,  %land.rhs.i]
+  %5 = load i32, i32* %first.i, align 4
+  %6 = select i1 %4, i32 %5, i32 0
+  ret i32 %6
+}
diff --git a/test/Transforms/MergeICmps/X86/tuple-four-int8.ll b/test/Transforms/MergeICmps/X86/tuple-four-int8.ll
index c7f2d45257d..097a1c232fc 100644
--- a/test/Transforms/MergeICmps/X86/tuple-four-int8.ll
+++ b/test/Transforms/MergeICmps/X86/tuple-four-int8.ll
@@ -1,3 +1,4 @@
+; XFAIL: *
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -mergeicmps -mtriple=x86_64-unknown-unknown -S | FileCheck %s
 
-- 
GitLab


From c0bf8d6c82c97d48d6a4daec180cf21b06050504 Mon Sep 17 00:00:00 2001
From: Alexandre Ganea <alexandre.ganea@ubisoft.com>
Date: Mon, 5 Nov 2018 19:14:10 +0000
Subject: [PATCH 0975/1116] Only call FlushFileBuffers() when writing
 executables on Windows

This is a follow-up for "r325274: Call FlushFileBuffers on output files."

Previously, FlushFileBuffers() was called in all cases when writing a file. The objective was to go around a bug in the Windows kernel (as described here: https://randomascii.wordpress.com/2018/02/25/compiler-bug-linker-bug-windows-kernel-bug/). However that is required only when writing EXEs, any other file type doesn't need flushing.

This patch calls FlushFileBuffers() only for EXEs. In addition, we completly disable FlushFileBuffers() for known Windows 10 versions that do not exhibit the original kernel bug.

Differential Revision: https://reviews.llvm.org/D53727

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346152 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Support/Windows/Path.inc         | 23 +++++++++++++++++-
 lib/Support/Windows/WindowsSupport.h | 35 ++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+), 1 deletion(-)

diff --git a/lib/Support/Windows/Path.inc b/lib/Support/Windows/Path.inc
index e719d3c7b72..45d73ae3dfe 100644
--- a/lib/Support/Windows/Path.inc
+++ b/lib/Support/Windows/Path.inc
@@ -854,16 +854,37 @@ mapped_file_region::mapped_file_region(int fd, mapmode mode, size_t length,
     Mapping = 0;
 }
 
+static bool hasFlushBufferKernelBug() {
+  static bool Ret{GetWindowsOSVersion() < llvm::VersionTuple(10, 0, 0, 17763)};
+  return Ret;
+}
+
+static bool isEXE(StringRef Magic) {
+  static const char PEMagic[] = {'P', 'E', '\0', '\0'};
+  if (Magic.startswith(StringRef("MZ")) && Magic.size() >= 0x3c + 4) {
+    uint32_t off = read32le(Magic.data() + 0x3c);
+    // PE/COFF file, either EXE or DLL.
+    if (Magic.substr(off).startswith(StringRef(PEMagic, sizeof(PEMagic))))
+      return true;
+  }
+  return false;
+}
+
 mapped_file_region::~mapped_file_region() {
   if (Mapping) {
+
+    bool Exe = isEXE(StringRef((char *)Mapping, Size));
+
     ::UnmapViewOfFile(Mapping);
 
-    if (Mode == mapmode::readwrite) {
+    if (Mode == mapmode::readwrite && Exe && hasFlushBufferKernelBug()) {
       // There is a Windows kernel bug, the exact trigger conditions of which
       // are not well understood.  When triggered, dirty pages are not properly
       // flushed and subsequent process's attempts to read a file can return
       // invalid data.  Calling FlushFileBuffers on the write handle is
       // sufficient to ensure that this bug is not triggered.
+      // The bug only occurs when writing an executable and executing it right
+      // after, under high I/O pressure.
       ::FlushFileBuffers(FileHandle);
     }
 
diff --git a/lib/Support/Windows/WindowsSupport.h b/lib/Support/Windows/WindowsSupport.h
index c2fd6bb982d..5adfa859c96 100644
--- a/lib/Support/Windows/WindowsSupport.h
+++ b/lib/Support/Windows/WindowsSupport.h
@@ -41,6 +41,7 @@
 #include "llvm/Config/config.h" // Get build system configuration settings
 #include "llvm/Support/Chrono.h"
 #include "llvm/Support/Compiler.h"
+#include "llvm/Support/VersionTuple.h"
 #include <cassert>
 #include <string>
 #include <system_error>
@@ -71,6 +72,40 @@ inline bool RunningWindows8OrGreater() {
                             Mask) != FALSE;
 }
 
+typedef NTSTATUS(WINAPI* RtlGetVersionPtr)(PRTL_OSVERSIONINFOW);
+#define STATUS_SUCCESS ((NTSTATUS)0x00000000L)
+
+inline llvm::VersionTuple GetWindowsOSVersion() {
+  HMODULE hMod = ::GetModuleHandleW(L"ntdll.dll");
+  if (hMod) {
+    auto getVer = (RtlGetVersionPtr)::GetProcAddress(hMod, "RtlGetVersion");
+    if (getVer) {
+      RTL_OSVERSIONINFOEXW info{};
+      info.dwOSVersionInfoSize = sizeof(info);
+      if (getVer((PRTL_OSVERSIONINFOW)&info) == STATUS_SUCCESS) {
+        return llvm::VersionTuple(info.dwMajorVersion, info.dwMinorVersion, 0,
+                                  info.dwBuildNumber);
+      }
+    }
+  }
+
+  OSVERSIONINFOEX info;
+  ZeroMemory(&info, sizeof(OSVERSIONINFOEX));
+  info.dwOSVersionInfoSize = sizeof(OSVERSIONINFOEX);
+#pragma warning(push)
+#pragma warning(disable : 4996)
+  // Starting with Microsoft SDK for Windows 8.1, this function is deprecated
+  // in favor of the new Windows Version Helper APIs.  Since we don't specify a
+  // minimum SDK version, it's easier to simply disable the warning rather than
+  // try to support both APIs.
+  if (GetVersionEx((LPOSVERSIONINFO)&info) == 0)
+    return llvm::VersionTuple();
+#pragma warning(pop)
+
+  return llvm::VersionTuple(info.dwMajorVersion, info.dwMinorVersion, 0,
+                            info.dwBuildNumber);
+}
+
 inline bool MakeErrMsg(std::string *ErrMsg, const std::string &prefix) {
   if (!ErrMsg)
     return true;
-- 
GitLab


From 4a52ea2ea292ddeafd2abdd48f4cf16cdd535848 Mon Sep 17 00:00:00 2001
From: Alexandre Ganea <alexandre.ganea@ubisoft.com>
Date: Mon, 5 Nov 2018 19:20:47 +0000
Subject: [PATCH 0976/1116] [COFF][LLD] Add link support for Microsoft
 precompiled headers OBJs

This change allows for link-time merging of debugging information from
Microsoft precompiled types OBJs compiled with cl.exe /Z7 /Yc and /Yu.

This fixes llvm.org/PR34278

Differential Revision: https://reviews.llvm.org/D45213


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346154 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../DebugInfo/CodeView/TypeStreamMerger.h     |   9 +-
 include/llvm/DebugInfo/PDB/GenericError.h     |  15 ++-
 include/llvm/Support/BinaryStreamArray.h      |   2 +
 lib/DebugInfo/CodeView/CodeViewError.cpp      |   4 +-
 lib/DebugInfo/CodeView/TypeStreamMerger.cpp   | 113 ++++++++++++------
 lib/DebugInfo/PDB/GenericError.cpp            |   2 -
 tools/llvm-readobj/COFFDumper.cpp             |   4 +-
 7 files changed, 100 insertions(+), 49 deletions(-)

diff --git a/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h b/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h
index 583740d2eb4..a84f074237d 100644
--- a/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h
+++ b/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h
@@ -83,18 +83,21 @@ Error mergeIdRecords(MergingTypeTableBuilder &Dest, ArrayRef<TypeIndex> Types,
 Error mergeTypeAndIdRecords(MergingTypeTableBuilder &DestIds,
                             MergingTypeTableBuilder &DestTypes,
                             SmallVectorImpl<TypeIndex> &SourceToDest,
-                            const CVTypeArray &IdsAndTypes);
+                            const CVTypeArray &IdsAndTypes,
+                            Optional<EndPrecompRecord> &EndPrecomp);
 
 Error mergeTypeAndIdRecords(GlobalTypeTableBuilder &DestIds,
                             GlobalTypeTableBuilder &DestTypes,
                             SmallVectorImpl<TypeIndex> &SourceToDest,
                             const CVTypeArray &IdsAndTypes,
-                            ArrayRef<GloballyHashedType> Hashes);
+                            ArrayRef<GloballyHashedType> Hashes,
+                            Optional<EndPrecompRecord> &EndPrecomp);
 
 Error mergeTypeRecords(GlobalTypeTableBuilder &Dest,
                        SmallVectorImpl<TypeIndex> &SourceToDest,
                        const CVTypeArray &Types,
-                       ArrayRef<GloballyHashedType> Hashes);
+                       ArrayRef<GloballyHashedType> Hashes,
+                       Optional<EndPrecompRecord> &EndPrecomp);
 
 Error mergeIdRecords(GlobalTypeTableBuilder &Dest, ArrayRef<TypeIndex> Types,
                      SmallVectorImpl<TypeIndex> &SourceToDest,
diff --git a/include/llvm/DebugInfo/PDB/GenericError.h b/include/llvm/DebugInfo/PDB/GenericError.h
index 4e2e8b163b5..7b5a8529596 100644
--- a/include/llvm/DebugInfo/PDB/GenericError.h
+++ b/include/llvm/DebugInfo/PDB/GenericError.h
@@ -21,24 +21,23 @@ enum class pdb_error_code {
   dia_sdk_not_present,
   dia_failed_loading,
   signature_out_of_date,
-  type_server_not_found,
   unspecified,
 };
-} // namespace codeview
+} // namespace pdb
 } // namespace llvm
 
 namespace std {
-    template <>
-    struct is_error_code_enum<llvm::pdb::pdb_error_code> : std::true_type {};
+template <>
+struct is_error_code_enum<llvm::pdb::pdb_error_code> : std::true_type {};
 } // namespace std
 
 namespace llvm {
 namespace pdb {
-    const std::error_category &PDBErrCategory();
+const std::error_category &PDBErrCategory();
 
-    inline std::error_code make_error_code(pdb_error_code E) {
-        return std::error_code(static_cast<int>(E), PDBErrCategory());
-    }
+inline std::error_code make_error_code(pdb_error_code E) {
+  return std::error_code(static_cast<int>(E), PDBErrCategory());
+}
 
 /// Base class for errors originating when parsing raw PDB files
 class PDBError : public ErrorInfo<PDBError, StringError> {
diff --git a/include/llvm/Support/BinaryStreamArray.h b/include/llvm/Support/BinaryStreamArray.h
index d1571cb37fc..7b8a95b4573 100644
--- a/include/llvm/Support/BinaryStreamArray.h
+++ b/include/llvm/Support/BinaryStreamArray.h
@@ -125,6 +125,8 @@ public:
   BinaryStreamRef getUnderlyingStream() const { return Stream; }
   void setUnderlyingStream(BinaryStreamRef S) { Stream = S; }
 
+  void drop_front() { Stream = Stream.drop_front(begin()->length()); }
+
 private:
   BinaryStreamRef Stream;
   Extractor E;
diff --git a/lib/DebugInfo/CodeView/CodeViewError.cpp b/lib/DebugInfo/CodeView/CodeViewError.cpp
index 914157ef0c1..2a9753add31 100644
--- a/lib/DebugInfo/CodeView/CodeViewError.cpp
+++ b/lib/DebugInfo/CodeView/CodeViewError.cpp
@@ -41,6 +41,8 @@ public:
 };
 
 static llvm::ManagedStatic<CodeViewErrorCategory> CodeViewErrCategory;
-const std::error_category &llvm::codeview::CVErrorCategory() { return *CodeViewErrCategory; }
+const std::error_category &llvm::codeview::CVErrorCategory() {
+  return *CodeViewErrCategory;
+}
 
 char CodeViewError::ID;
diff --git a/lib/DebugInfo/CodeView/TypeStreamMerger.cpp b/lib/DebugInfo/CodeView/TypeStreamMerger.cpp
index 1a1d516ff3c..803818226e5 100644
--- a/lib/DebugInfo/CodeView/TypeStreamMerger.cpp
+++ b/lib/DebugInfo/CodeView/TypeStreamMerger.cpp
@@ -12,6 +12,7 @@
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/DebugInfo/CodeView/GlobalTypeTableBuilder.h"
 #include "llvm/DebugInfo/CodeView/MergingTypeTableBuilder.h"
+#include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
 #include "llvm/DebugInfo/CodeView/TypeIndex.h"
 #include "llvm/DebugInfo/CodeView/TypeIndexDiscovery.h"
 #include "llvm/DebugInfo/CodeView/TypeRecord.h"
@@ -63,7 +64,12 @@ class TypeStreamMerger {
 public:
   explicit TypeStreamMerger(SmallVectorImpl<TypeIndex> &SourceToDest)
       : IndexMap(SourceToDest) {
-    SourceToDest.clear();
+    // When dealing with precompiled headers objects, all data in SourceToDest
+    // belongs to the precompiled headers object, and is assumed to be already
+    // remapped to the target PDB. Any forthcoming type that will be merged in
+    // might potentially back-reference this data. We also don't want to resolve
+    // twice the types in the precompiled object.
+    CurIndex += SourceToDest.size();
   }
 
   static const TypeIndex Untranslated;
@@ -71,7 +77,8 @@ public:
   // Local hashing entry points
   Error mergeTypesAndIds(MergingTypeTableBuilder &DestIds,
                          MergingTypeTableBuilder &DestTypes,
-                         const CVTypeArray &IdsAndTypes);
+                         const CVTypeArray &IdsAndTypes,
+                         Optional<EndPrecompRecord> &EP);
   Error mergeIdRecords(MergingTypeTableBuilder &Dest,
                        ArrayRef<TypeIndex> TypeSourceToDest,
                        const CVTypeArray &Ids);
@@ -82,13 +89,15 @@ public:
   Error mergeTypesAndIds(GlobalTypeTableBuilder &DestIds,
                          GlobalTypeTableBuilder &DestTypes,
                          const CVTypeArray &IdsAndTypes,
-                         ArrayRef<GloballyHashedType> Hashes);
+                         ArrayRef<GloballyHashedType> Hashes,
+                         Optional<EndPrecompRecord> &EP);
   Error mergeIdRecords(GlobalTypeTableBuilder &Dest,
                        ArrayRef<TypeIndex> TypeSourceToDest,
                        const CVTypeArray &Ids,
                        ArrayRef<GloballyHashedType> Hashes);
   Error mergeTypeRecords(GlobalTypeTableBuilder &Dest, const CVTypeArray &Types,
-                         ArrayRef<GloballyHashedType> Hashes);
+                         ArrayRef<GloballyHashedType> Hashes,
+                         Optional<EndPrecompRecord> &EP);
 
 private:
   Error doit(const CVTypeArray &Types);
@@ -156,6 +165,8 @@ private:
     return llvm::make_error<CodeViewError>(cv_error_code::corrupt_record);
   }
 
+  Expected<bool> shouldRemapType(const CVType &Type);
+
   Optional<Error> LastError;
 
   bool UseGlobalHashes = false;
@@ -185,6 +196,8 @@ private:
   /// Temporary storage that we use to copy a record's data while re-writing
   /// its type indices.
   SmallVector<uint8_t, 256> RemapStorage;
+
+  Optional<EndPrecompRecord> EndPrecomp; 
 };
 
 } // end anonymous namespace
@@ -261,22 +274,27 @@ Error TypeStreamMerger::mergeIdRecords(MergingTypeTableBuilder &Dest,
 
 Error TypeStreamMerger::mergeTypesAndIds(MergingTypeTableBuilder &DestIds,
                                          MergingTypeTableBuilder &DestTypes,
-                                         const CVTypeArray &IdsAndTypes) {
+                                         const CVTypeArray &IdsAndTypes,
+                                         Optional<EndPrecompRecord> &EP) {
   DestIdStream = &DestIds;
   DestTypeStream = &DestTypes;
   UseGlobalHashes = false;
-  return doit(IdsAndTypes);
+  auto Err = doit(IdsAndTypes);
+  EP = EndPrecomp;
+  return Err;
 }
 
 // Global hashing entry points
 Error TypeStreamMerger::mergeTypeRecords(GlobalTypeTableBuilder &Dest,
                                          const CVTypeArray &Types,
-                                         ArrayRef<GloballyHashedType> Hashes) {
+                                         ArrayRef<GloballyHashedType> Hashes,
+                                         Optional<EndPrecompRecord> &EP) {
   DestGlobalTypeStream = &Dest;
   UseGlobalHashes = true;
   GlobalHashes = Hashes;
-
-  return doit(Types);
+  auto Err = doit(Types);
+  EP = EndPrecomp;
+  return Err;
 }
 
 Error TypeStreamMerger::mergeIdRecords(GlobalTypeTableBuilder &Dest,
@@ -294,12 +312,15 @@ Error TypeStreamMerger::mergeIdRecords(GlobalTypeTableBuilder &Dest,
 Error TypeStreamMerger::mergeTypesAndIds(GlobalTypeTableBuilder &DestIds,
                                          GlobalTypeTableBuilder &DestTypes,
                                          const CVTypeArray &IdsAndTypes,
-                                         ArrayRef<GloballyHashedType> Hashes) {
+                                         ArrayRef<GloballyHashedType> Hashes,
+                                         Optional<EndPrecompRecord> &EP) {
   DestGlobalIdStream = &DestIds;
   DestGlobalTypeStream = &DestTypes;
   UseGlobalHashes = true;
   GlobalHashes = Hashes;
-  return doit(IdsAndTypes);
+  auto Err = doit(IdsAndTypes);
+  EP = EndPrecomp;
+  return Err;
 }
 
 Error TypeStreamMerger::doit(const CVTypeArray &Types) {
@@ -345,25 +366,30 @@ Error TypeStreamMerger::remapAllTypes(const CVTypeArray &Types) {
 }
 
 Error TypeStreamMerger::remapType(const CVType &Type) {
-  auto DoSerialize =
-      [this, Type](MutableArrayRef<uint8_t> Storage) -> ArrayRef<uint8_t> {
-    return remapIndices(Type, Storage);
-  };
+  auto R = shouldRemapType(Type);
+  if (!R)
+    return R.takeError();
 
   TypeIndex DestIdx = Untranslated;
-  if (LLVM_LIKELY(UseGlobalHashes)) {
-    GlobalTypeTableBuilder &Dest =
-        isIdRecord(Type.kind()) ? *DestGlobalIdStream : *DestGlobalTypeStream;
-    GloballyHashedType H = GlobalHashes[CurIndex.toArrayIndex()];
-    DestIdx = Dest.insertRecordAs(H, Type.RecordData.size(), DoSerialize);
-  } else {
-    MergingTypeTableBuilder &Dest =
-        isIdRecord(Type.kind()) ? *DestIdStream : *DestTypeStream;
-
-    RemapStorage.resize(Type.RecordData.size());
-    ArrayRef<uint8_t> Result = DoSerialize(RemapStorage);
-    if (!Result.empty())
-      DestIdx = Dest.insertRecordBytes(Result);
+  if (*R) {
+    auto DoSerialize =
+        [this, Type](MutableArrayRef<uint8_t> Storage) -> ArrayRef<uint8_t> {
+      return remapIndices(Type, Storage);
+    };
+    if (LLVM_LIKELY(UseGlobalHashes)) {
+      GlobalTypeTableBuilder &Dest =
+          isIdRecord(Type.kind()) ? *DestGlobalIdStream : *DestGlobalTypeStream;
+      GloballyHashedType H = GlobalHashes[CurIndex.toArrayIndex()];
+      DestIdx = Dest.insertRecordAs(H, Type.RecordData.size(), DoSerialize);
+    } else {
+      MergingTypeTableBuilder &Dest =
+          isIdRecord(Type.kind()) ? *DestIdStream : *DestTypeStream;
+
+      RemapStorage.resize(Type.RecordData.size());
+      ArrayRef<uint8_t> Result = DoSerialize(RemapStorage);
+      if (!Result.empty())
+        DestIdx = Dest.insertRecordBytes(Result);
+    }
   }
   addMapping(DestIdx);
 
@@ -418,25 +444,29 @@ Error llvm::codeview::mergeIdRecords(MergingTypeTableBuilder &Dest,
 
 Error llvm::codeview::mergeTypeAndIdRecords(
     MergingTypeTableBuilder &DestIds, MergingTypeTableBuilder &DestTypes,
-    SmallVectorImpl<TypeIndex> &SourceToDest, const CVTypeArray &IdsAndTypes) {
+    SmallVectorImpl<TypeIndex> &SourceToDest, const CVTypeArray &IdsAndTypes,
+    Optional<EndPrecompRecord> &EndPrecomp) {
   TypeStreamMerger M(SourceToDest);
-  return M.mergeTypesAndIds(DestIds, DestTypes, IdsAndTypes);
+  return M.mergeTypesAndIds(DestIds, DestTypes, IdsAndTypes, EndPrecomp);
 }
 
 Error llvm::codeview::mergeTypeAndIdRecords(
     GlobalTypeTableBuilder &DestIds, GlobalTypeTableBuilder &DestTypes,
     SmallVectorImpl<TypeIndex> &SourceToDest, const CVTypeArray &IdsAndTypes,
-    ArrayRef<GloballyHashedType> Hashes) {
+    ArrayRef<GloballyHashedType> Hashes,
+    Optional<EndPrecompRecord> &EndPrecomp) {
   TypeStreamMerger M(SourceToDest);
-  return M.mergeTypesAndIds(DestIds, DestTypes, IdsAndTypes, Hashes);
+  return M.mergeTypesAndIds(DestIds, DestTypes, IdsAndTypes, Hashes,
+                            EndPrecomp);
 }
 
 Error llvm::codeview::mergeTypeRecords(GlobalTypeTableBuilder &Dest,
                                        SmallVectorImpl<TypeIndex> &SourceToDest,
                                        const CVTypeArray &Types,
-                                       ArrayRef<GloballyHashedType> Hashes) {
+                                       ArrayRef<GloballyHashedType> Hashes,
+                                       Optional<EndPrecompRecord> &EndPrecomp) {
   TypeStreamMerger M(SourceToDest);
-  return M.mergeTypeRecords(Dest, Types, Hashes);
+  return M.mergeTypeRecords(Dest, Types, Hashes, EndPrecomp);
 }
 
 Error llvm::codeview::mergeIdRecords(GlobalTypeTableBuilder &Dest,
@@ -447,3 +477,18 @@ Error llvm::codeview::mergeIdRecords(GlobalTypeTableBuilder &Dest,
   TypeStreamMerger M(SourceToDest);
   return M.mergeIdRecords(Dest, Types, Ids, Hashes);
 }
+
+Expected<bool> TypeStreamMerger::shouldRemapType(const CVType &Type) {
+  // For object files containing precompiled types, we need to extract the
+  // signature, through EndPrecompRecord. This is done here for performance
+  // reasons, to avoid re-parsing the Types stream.
+  if (Type.kind() == LF_ENDPRECOMP) {
+    assert(!EndPrecomp);
+    EndPrecomp.emplace();
+    if (auto EC = TypeDeserializer::deserializeAs(const_cast<CVType &>(Type),
+                                                  EndPrecomp.getValue()))
+      return joinErrors(std::move(EC), errorCorruptRecord());
+    return false;
+  }
+  return true;
+}
diff --git a/lib/DebugInfo/PDB/GenericError.cpp b/lib/DebugInfo/PDB/GenericError.cpp
index 95f6c15cd30..5f5ff69fe3f 100644
--- a/lib/DebugInfo/PDB/GenericError.cpp
+++ b/lib/DebugInfo/PDB/GenericError.cpp
@@ -24,8 +24,6 @@ public:
     switch (static_cast<pdb_error_code>(Condition)) {
     case pdb_error_code::unspecified:
       return "An unknown error has occurred.";
-    case pdb_error_code::type_server_not_found:
-        return "Type server PDB was not found.";
     case pdb_error_code::dia_sdk_not_present:
       return "LLVM was not compiled with support for DIA. This usually means "
              "that you are not using MSVC, or your Visual Studio "
diff --git a/tools/llvm-readobj/COFFDumper.cpp b/tools/llvm-readobj/COFFDumper.cpp
index 26fe1aa622f..7f590713993 100644
--- a/tools/llvm-readobj/COFFDumper.cpp
+++ b/tools/llvm-readobj/COFFDumper.cpp
@@ -1248,7 +1248,9 @@ void COFFDumper::mergeCodeViewTypes(MergingTypeTableBuilder &CVIDs,
         error(object_error::parse_failed);
       }
       SmallVector<TypeIndex, 128> SourceToDest;
-      if (auto EC = mergeTypeAndIdRecords(CVIDs, CVTypes, SourceToDest, Types))
+      Optional<EndPrecompRecord> EndPrecomp;
+      if (auto EC = mergeTypeAndIdRecords(CVIDs, CVTypes, SourceToDest, Types,
+                                          EndPrecomp))
         return error(std::move(EC));
     }
   }
-- 
GitLab


From 24e02bb6e1f96240d7340bc8aff5fb28e6b4c51f Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Mon, 5 Nov 2018 19:45:37 +0000
Subject: [PATCH 0977/1116] [X86] Regenerate test checks in preparation for a
 patch. NFC

I'm preparing a patch to avoid creating critical edges in cmov expansion. Updating these tests to make the changes by the next patch easier to see.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346161 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/MachineSink-eflags.ll        |  33 ++-
 test/CodeGen/X86/atomic32.ll                  | 250 +++++++++---------
 test/CodeGen/X86/fdiv-combine.ll              |  18 +-
 test/CodeGen/X86/fp128-compare.ll             | 133 ++++++----
 .../X86/machine-trace-metrics-crash.ll        |  39 ++-
 test/CodeGen/X86/pr5145.ll                    |  85 ++++--
 test/CodeGen/X86/pseudo_cmov_lower2.ll        | 133 +++++++---
 7 files changed, 456 insertions(+), 235 deletions(-)

diff --git a/test/CodeGen/X86/MachineSink-eflags.ll b/test/CodeGen/X86/MachineSink-eflags.ll
index 4e52c8c5f7d..6302b3be671 100644
--- a/test/CodeGen/X86/MachineSink-eflags.ll
+++ b/test/CodeGen/X86/MachineSink-eflags.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-pc-linux"
@@ -11,6 +12,36 @@ target triple = "x86_64-pc-linux"
 %5 = type <{ void (i32)*, i8*, i32 (i8*, ...)* }>
 
 define void @foo(i8* nocapture %_stubArgs) nounwind {
+; CHECK-LABEL: foo:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movq 48(%rdi), %rax
+; CHECK-NEXT:    movl 64(%rdi), %edx
+; CHECK-NEXT:    movl $200, %esi
+; CHECK-NEXT:    addl 68(%rdi), %esi
+; CHECK-NEXT:    imull $46, %edx, %ecx
+; CHECK-NEXT:    addq %rsi, %rcx
+; CHECK-NEXT:    shlq $4, %rcx
+; CHECK-NEXT:    imull $47, %edx, %edx
+; CHECK-NEXT:    addq %rsi, %rdx
+; CHECK-NEXT:    shlq $4, %rdx
+; CHECK-NEXT:    movaps (%rax,%rdx), %xmm0
+; CHECK-NEXT:    cmpl $0, (%rdi)
+; CHECK-NEXT:    jne .LBB0_1
+; CHECK-NEXT:  # %bb.2: # %entry
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    jmp .LBB0_3
+; CHECK-NEXT:  .LBB0_1:
+; CHECK-NEXT:    movaps (%rax,%rcx), %xmm1
+; CHECK-NEXT:  .LBB0_3: # %entry
+; CHECK-NEXT:    leaq -{{[0-9]+}}(%rsp), %rsp
+; CHECK-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    jne .LBB0_5
+; CHECK-NEXT:  # %bb.4: # %entry
+; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:  .LBB0_5: # %entry
+; CHECK-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    addq $152, %rsp
+; CHECK-NEXT:    retq
 entry:
  %i0 = alloca i8*, align 8
  %i2 = alloca i8*, align 8
@@ -60,8 +91,6 @@ entry:
  %cmp432.i = icmp ult i32 %tmp156.i, %tmp1
 
 ; %shl.i should not be sinked below the compare.
-; CHECK: cmpl
-; CHECK-NOT: shlq
 
  %cond.i = select i1 %cmp432.i, <2 x double> %tmp162.i, <2 x double> zeroinitializer
  store <2 x double> %cond.i, <2 x double>* %ptr4438.i, align 16
diff --git a/test/CodeGen/X86/atomic32.ll b/test/CodeGen/X86/atomic32.ll
index 7e2bff4116c..5e78444eea7 100644
--- a/test/CodeGen/X86/atomic32.ll
+++ b/test/CodeGen/X86/atomic32.ll
@@ -61,22 +61,22 @@ define void @atomic_fetch_and32() nounwind {
 ; X64:       # %bb.0:
 ; X64-NEXT:    lock andl $3, {{.*}}(%rip)
 ; X64-NEXT:    movl sc32, %eax
-; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:  .LBB2_1: # %atomicrmw.start
 ; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; X64-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
 ; X64-NEXT:    movl %eax, %ecx
 ; X64-NEXT:    andl $5, %ecx
 ; X64-NEXT:    lock cmpxchgl %ecx, {{.*}}(%rip)
 ; X64-NEXT:    sete %dl
 ; X64-NEXT:    testb $1, %dl
 ; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill
-; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    jne .LBB2_2
 ; X64-NEXT:    jmp .LBB2_1
 ; X64-NEXT:  .LBB2_2: # %atomicrmw.end
-; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; X64-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
 ; X64-NEXT:    lock andl %eax, {{.*}}(%rip)
 ; X64-NEXT:    retq
 ;
@@ -85,10 +85,10 @@ define void @atomic_fetch_and32() nounwind {
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    lock andl $3, sc32
 ; X86-NEXT:    movl sc32, %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:  .LBB2_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $5, %ecx
 ; X86-NEXT:    lock cmpxchgl %ecx, sc32
@@ -96,7 +96,7 @@ define void @atomic_fetch_and32() nounwind {
 ; X86-NEXT:    testb $1, %dl
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    jne .LBB2_2
 ; X86-NEXT:    jmp .LBB2_1
 ; X86-NEXT:  .LBB2_2: # %atomicrmw.end
@@ -115,22 +115,22 @@ define void @atomic_fetch_or32() nounwind {
 ; X64:       # %bb.0:
 ; X64-NEXT:    lock orl $3, {{.*}}(%rip)
 ; X64-NEXT:    movl sc32, %eax
-; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:  .LBB3_1: # %atomicrmw.start
 ; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; X64-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
 ; X64-NEXT:    movl %eax, %ecx
 ; X64-NEXT:    orl $5, %ecx
 ; X64-NEXT:    lock cmpxchgl %ecx, {{.*}}(%rip)
 ; X64-NEXT:    sete %dl
 ; X64-NEXT:    testb $1, %dl
 ; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill
-; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    jne .LBB3_2
 ; X64-NEXT:    jmp .LBB3_1
 ; X64-NEXT:  .LBB3_2: # %atomicrmw.end
-; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; X64-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
 ; X64-NEXT:    lock orl %eax, {{.*}}(%rip)
 ; X64-NEXT:    retq
 ;
@@ -139,10 +139,10 @@ define void @atomic_fetch_or32() nounwind {
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    lock orl $3, sc32
 ; X86-NEXT:    movl sc32, %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:  .LBB3_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    orl $5, %ecx
 ; X86-NEXT:    lock cmpxchgl %ecx, sc32
@@ -150,7 +150,7 @@ define void @atomic_fetch_or32() nounwind {
 ; X86-NEXT:    testb $1, %dl
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    jne .LBB3_2
 ; X86-NEXT:    jmp .LBB3_1
 ; X86-NEXT:  .LBB3_2: # %atomicrmw.end
@@ -169,22 +169,22 @@ define void @atomic_fetch_xor32() nounwind {
 ; X64:       # %bb.0:
 ; X64-NEXT:    lock xorl $3, {{.*}}(%rip)
 ; X64-NEXT:    movl sc32, %eax
-; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:  .LBB4_1: # %atomicrmw.start
 ; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; X64-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
 ; X64-NEXT:    movl %eax, %ecx
 ; X64-NEXT:    xorl $5, %ecx
 ; X64-NEXT:    lock cmpxchgl %ecx, {{.*}}(%rip)
 ; X64-NEXT:    sete %dl
 ; X64-NEXT:    testb $1, %dl
 ; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill
-; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    jne .LBB4_2
 ; X64-NEXT:    jmp .LBB4_1
 ; X64-NEXT:  .LBB4_2: # %atomicrmw.end
-; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; X64-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
 ; X64-NEXT:    lock xorl %eax, {{.*}}(%rip)
 ; X64-NEXT:    retq
 ;
@@ -193,10 +193,10 @@ define void @atomic_fetch_xor32() nounwind {
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    lock xorl $3, sc32
 ; X86-NEXT:    movl sc32, %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:  .LBB4_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    xorl $5, %ecx
 ; X86-NEXT:    lock cmpxchgl %ecx, sc32
@@ -204,7 +204,7 @@ define void @atomic_fetch_xor32() nounwind {
 ; X86-NEXT:    testb $1, %dl
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    jne .LBB4_2
 ; X86-NEXT:    jmp .LBB4_1
 ; X86-NEXT:  .LBB4_2: # %atomicrmw.end
@@ -222,19 +222,19 @@ define void @atomic_fetch_nand32(i32 %x) nounwind {
 ; X64-LABEL: atomic_fetch_nand32:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl sc32, %eax
-; X64-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # 4-byte Spill
-; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; X64-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:  .LBB5_1: # %atomicrmw.start
 ; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; X64-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
 ; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %edx # 4-byte Reload
+; X64-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
 ; X64-NEXT:    andl %edx, %ecx
 ; X64-NEXT:    notl %ecx
 ; X64-NEXT:    lock cmpxchgl %ecx, {{.*}}(%rip)
 ; X64-NEXT:    sete %sil
 ; X64-NEXT:    testb $1, %sil
-; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    jne .LBB5_2
 ; X64-NEXT:    jmp .LBB5_1
 ; X64-NEXT:  .LBB5_2: # %atomicrmw.end
@@ -246,13 +246,13 @@ define void @atomic_fetch_nand32(i32 %x) nounwind {
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl sc32, %ecx
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X86-NEXT:  .LBB5_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    andl %edx, %ecx
 ; X86-NEXT:    notl %ecx
 ; X86-NEXT:    lock cmpxchgl %ecx, sc32
@@ -273,20 +273,20 @@ define void @atomic_fetch_max32(i32 %x) nounwind {
 ; X64-LABEL: atomic_fetch_max32:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl sc32, %eax
-; X64-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # 4-byte Spill
-; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; X64-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:  .LBB6_1: # %atomicrmw.start
 ; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; X64-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
 ; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %edx # 4-byte Reload
+; X64-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
 ; X64-NEXT:    subl %edx, %ecx
 ; X64-NEXT:    cmovgel %eax, %edx
 ; X64-NEXT:    lock cmpxchgl %edx, {{.*}}(%rip)
 ; X64-NEXT:    sete %sil
 ; X64-NEXT:    testb $1, %sil
-; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
-; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; X64-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    jne .LBB6_2
 ; X64-NEXT:    jmp .LBB6_1
 ; X64-NEXT:  .LBB6_2: # %atomicrmw.end
@@ -298,20 +298,20 @@ define void @atomic_fetch_max32(i32 %x) nounwind {
 ; X86-CMOV-NEXT:    subl $12, %esp
 ; X86-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-CMOV-NEXT:    movl sc32, %ecx
-; X86-CMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-CMOV-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-CMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-CMOV-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-CMOV-NEXT:  .LBB6_1: # %atomicrmw.start
 ; X86-CMOV-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X86-CMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-CMOV-NEXT:    movl %eax, %ecx
-; X86-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X86-CMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-CMOV-NEXT:    subl %edx, %ecx
 ; X86-CMOV-NEXT:    cmovgel %eax, %edx
 ; X86-CMOV-NEXT:    lock cmpxchgl %edx, sc32
 ; X86-CMOV-NEXT:    sete %bl
 ; X86-CMOV-NEXT:    testb $1, %bl
 ; X86-CMOV-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-CMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-CMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-CMOV-NEXT:    jne .LBB6_2
 ; X86-CMOV-NEXT:    jmp .LBB6_1
 ; X86-CMOV-NEXT:  .LBB6_2: # %atomicrmw.end
@@ -326,34 +326,34 @@ define void @atomic_fetch_max32(i32 %x) nounwind {
 ; X86-NOCMOV-NEXT:    subl $24, %esp
 ; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOCMOV-NEXT:    movl sc32, %ecx
-; X86-NOCMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-NOCMOV-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOCMOV-NEXT:  .LBB6_1: # %atomicrmw.start
 ; X86-NOCMOV-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NOCMOV-NEXT:    movl %eax, %ecx
-; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NOCMOV-NEXT:    subl %edx, %ecx
 ; X86-NOCMOV-NEXT:    movl %eax, %esi
-; X86-NOCMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-NOCMOV-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-NOCMOV-NEXT:    movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOCMOV-NEXT:    jge .LBB6_4
 ; X86-NOCMOV-NEXT:  # %bb.3: # %atomicrmw.start
 ; X86-NOCMOV-NEXT:    # in Loop: Header=BB6_1 Depth=1
-; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X86-NOCMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NOCMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOCMOV-NEXT:  .LBB6_4: # %atomicrmw.start
 ; X86-NOCMOV-NEXT:    # in Loop: Header=BB6_1 Depth=1
-; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NOCMOV-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NOCMOV-NEXT:    movl %ecx, %eax
 ; X86-NOCMOV-NEXT:    movl (%esp), %edx # 4-byte Reload
 ; X86-NOCMOV-NEXT:    lock cmpxchgl %edx, sc32
 ; X86-NOCMOV-NEXT:    sete %bl
 ; X86-NOCMOV-NEXT:    testb $1, %bl
-; X86-NOCMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOCMOV-NEXT:    jne .LBB6_2
 ; X86-NOCMOV-NEXT:    jmp .LBB6_1
 ; X86-NOCMOV-NEXT:  .LBB6_2: # %atomicrmw.end
@@ -369,20 +369,20 @@ define void @atomic_fetch_min32(i32 %x) nounwind {
 ; X64-LABEL: atomic_fetch_min32:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl sc32, %eax
-; X64-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # 4-byte Spill
-; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; X64-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:  .LBB7_1: # %atomicrmw.start
 ; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; X64-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
 ; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %edx # 4-byte Reload
+; X64-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
 ; X64-NEXT:    subl %edx, %ecx
 ; X64-NEXT:    cmovlel %eax, %edx
 ; X64-NEXT:    lock cmpxchgl %edx, {{.*}}(%rip)
 ; X64-NEXT:    sete %sil
 ; X64-NEXT:    testb $1, %sil
-; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
-; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; X64-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    jne .LBB7_2
 ; X64-NEXT:    jmp .LBB7_1
 ; X64-NEXT:  .LBB7_2: # %atomicrmw.end
@@ -394,20 +394,20 @@ define void @atomic_fetch_min32(i32 %x) nounwind {
 ; X86-CMOV-NEXT:    subl $12, %esp
 ; X86-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-CMOV-NEXT:    movl sc32, %ecx
-; X86-CMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-CMOV-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-CMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-CMOV-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-CMOV-NEXT:  .LBB7_1: # %atomicrmw.start
 ; X86-CMOV-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X86-CMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-CMOV-NEXT:    movl %eax, %ecx
-; X86-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X86-CMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-CMOV-NEXT:    subl %edx, %ecx
 ; X86-CMOV-NEXT:    cmovlel %eax, %edx
 ; X86-CMOV-NEXT:    lock cmpxchgl %edx, sc32
 ; X86-CMOV-NEXT:    sete %bl
 ; X86-CMOV-NEXT:    testb $1, %bl
 ; X86-CMOV-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-CMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-CMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-CMOV-NEXT:    jne .LBB7_2
 ; X86-CMOV-NEXT:    jmp .LBB7_1
 ; X86-CMOV-NEXT:  .LBB7_2: # %atomicrmw.end
@@ -422,34 +422,34 @@ define void @atomic_fetch_min32(i32 %x) nounwind {
 ; X86-NOCMOV-NEXT:    subl $24, %esp
 ; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOCMOV-NEXT:    movl sc32, %ecx
-; X86-NOCMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-NOCMOV-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOCMOV-NEXT:  .LBB7_1: # %atomicrmw.start
 ; X86-NOCMOV-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NOCMOV-NEXT:    movl %eax, %ecx
-; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NOCMOV-NEXT:    subl %edx, %ecx
 ; X86-NOCMOV-NEXT:    movl %eax, %esi
-; X86-NOCMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-NOCMOV-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-NOCMOV-NEXT:    movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOCMOV-NEXT:    jle .LBB7_4
 ; X86-NOCMOV-NEXT:  # %bb.3: # %atomicrmw.start
 ; X86-NOCMOV-NEXT:    # in Loop: Header=BB7_1 Depth=1
-; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X86-NOCMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NOCMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOCMOV-NEXT:  .LBB7_4: # %atomicrmw.start
 ; X86-NOCMOV-NEXT:    # in Loop: Header=BB7_1 Depth=1
-; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NOCMOV-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NOCMOV-NEXT:    movl %ecx, %eax
 ; X86-NOCMOV-NEXT:    movl (%esp), %edx # 4-byte Reload
 ; X86-NOCMOV-NEXT:    lock cmpxchgl %edx, sc32
 ; X86-NOCMOV-NEXT:    sete %bl
 ; X86-NOCMOV-NEXT:    testb $1, %bl
-; X86-NOCMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOCMOV-NEXT:    jne .LBB7_2
 ; X86-NOCMOV-NEXT:    jmp .LBB7_1
 ; X86-NOCMOV-NEXT:  .LBB7_2: # %atomicrmw.end
@@ -465,20 +465,20 @@ define void @atomic_fetch_umax32(i32 %x) nounwind {
 ; X64-LABEL: atomic_fetch_umax32:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl sc32, %eax
-; X64-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # 4-byte Spill
-; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; X64-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:  .LBB8_1: # %atomicrmw.start
 ; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; X64-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
 ; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %edx # 4-byte Reload
+; X64-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
 ; X64-NEXT:    subl %edx, %ecx
 ; X64-NEXT:    cmoval %eax, %edx
 ; X64-NEXT:    lock cmpxchgl %edx, {{.*}}(%rip)
 ; X64-NEXT:    sete %sil
 ; X64-NEXT:    testb $1, %sil
-; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
-; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; X64-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    jne .LBB8_2
 ; X64-NEXT:    jmp .LBB8_1
 ; X64-NEXT:  .LBB8_2: # %atomicrmw.end
@@ -490,20 +490,20 @@ define void @atomic_fetch_umax32(i32 %x) nounwind {
 ; X86-CMOV-NEXT:    subl $12, %esp
 ; X86-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-CMOV-NEXT:    movl sc32, %ecx
-; X86-CMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-CMOV-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-CMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-CMOV-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-CMOV-NEXT:  .LBB8_1: # %atomicrmw.start
 ; X86-CMOV-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X86-CMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-CMOV-NEXT:    movl %eax, %ecx
-; X86-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X86-CMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-CMOV-NEXT:    subl %edx, %ecx
 ; X86-CMOV-NEXT:    cmoval %eax, %edx
 ; X86-CMOV-NEXT:    lock cmpxchgl %edx, sc32
 ; X86-CMOV-NEXT:    sete %bl
 ; X86-CMOV-NEXT:    testb $1, %bl
 ; X86-CMOV-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-CMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-CMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-CMOV-NEXT:    jne .LBB8_2
 ; X86-CMOV-NEXT:    jmp .LBB8_1
 ; X86-CMOV-NEXT:  .LBB8_2: # %atomicrmw.end
@@ -518,34 +518,34 @@ define void @atomic_fetch_umax32(i32 %x) nounwind {
 ; X86-NOCMOV-NEXT:    subl $24, %esp
 ; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOCMOV-NEXT:    movl sc32, %ecx
-; X86-NOCMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-NOCMOV-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOCMOV-NEXT:  .LBB8_1: # %atomicrmw.start
 ; X86-NOCMOV-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NOCMOV-NEXT:    movl %eax, %ecx
-; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NOCMOV-NEXT:    subl %edx, %ecx
 ; X86-NOCMOV-NEXT:    movl %eax, %esi
-; X86-NOCMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-NOCMOV-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-NOCMOV-NEXT:    movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOCMOV-NEXT:    ja .LBB8_4
 ; X86-NOCMOV-NEXT:  # %bb.3: # %atomicrmw.start
 ; X86-NOCMOV-NEXT:    # in Loop: Header=BB8_1 Depth=1
-; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X86-NOCMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NOCMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOCMOV-NEXT:  .LBB8_4: # %atomicrmw.start
 ; X86-NOCMOV-NEXT:    # in Loop: Header=BB8_1 Depth=1
-; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NOCMOV-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NOCMOV-NEXT:    movl %ecx, %eax
 ; X86-NOCMOV-NEXT:    movl (%esp), %edx # 4-byte Reload
 ; X86-NOCMOV-NEXT:    lock cmpxchgl %edx, sc32
 ; X86-NOCMOV-NEXT:    sete %bl
 ; X86-NOCMOV-NEXT:    testb $1, %bl
-; X86-NOCMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOCMOV-NEXT:    jne .LBB8_2
 ; X86-NOCMOV-NEXT:    jmp .LBB8_1
 ; X86-NOCMOV-NEXT:  .LBB8_2: # %atomicrmw.end
@@ -561,20 +561,20 @@ define void @atomic_fetch_umin32(i32 %x) nounwind {
 ; X64-LABEL: atomic_fetch_umin32:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl sc32, %eax
-; X64-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # 4-byte Spill
-; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; X64-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:  .LBB9_1: # %atomicrmw.start
 ; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; X64-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
 ; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %edx # 4-byte Reload
+; X64-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
 ; X64-NEXT:    subl %edx, %ecx
 ; X64-NEXT:    cmovbel %eax, %edx
 ; X64-NEXT:    lock cmpxchgl %edx, {{.*}}(%rip)
 ; X64-NEXT:    sete %sil
 ; X64-NEXT:    testb $1, %sil
-; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
-; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; X64-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    jne .LBB9_2
 ; X64-NEXT:    jmp .LBB9_1
 ; X64-NEXT:  .LBB9_2: # %atomicrmw.end
@@ -586,20 +586,20 @@ define void @atomic_fetch_umin32(i32 %x) nounwind {
 ; X86-CMOV-NEXT:    subl $12, %esp
 ; X86-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-CMOV-NEXT:    movl sc32, %ecx
-; X86-CMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-CMOV-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-CMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-CMOV-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-CMOV-NEXT:  .LBB9_1: # %atomicrmw.start
 ; X86-CMOV-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X86-CMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-CMOV-NEXT:    movl %eax, %ecx
-; X86-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X86-CMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-CMOV-NEXT:    subl %edx, %ecx
 ; X86-CMOV-NEXT:    cmovbel %eax, %edx
 ; X86-CMOV-NEXT:    lock cmpxchgl %edx, sc32
 ; X86-CMOV-NEXT:    sete %bl
 ; X86-CMOV-NEXT:    testb $1, %bl
 ; X86-CMOV-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-CMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-CMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-CMOV-NEXT:    jne .LBB9_2
 ; X86-CMOV-NEXT:    jmp .LBB9_1
 ; X86-CMOV-NEXT:  .LBB9_2: # %atomicrmw.end
@@ -614,34 +614,34 @@ define void @atomic_fetch_umin32(i32 %x) nounwind {
 ; X86-NOCMOV-NEXT:    subl $24, %esp
 ; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOCMOV-NEXT:    movl sc32, %ecx
-; X86-NOCMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-NOCMOV-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOCMOV-NEXT:  .LBB9_1: # %atomicrmw.start
 ; X86-NOCMOV-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NOCMOV-NEXT:    movl %eax, %ecx
-; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NOCMOV-NEXT:    subl %edx, %ecx
 ; X86-NOCMOV-NEXT:    movl %eax, %esi
-; X86-NOCMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-NOCMOV-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-NOCMOV-NEXT:    movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOCMOV-NEXT:    jbe .LBB9_4
 ; X86-NOCMOV-NEXT:  # %bb.3: # %atomicrmw.start
 ; X86-NOCMOV-NEXT:    # in Loop: Header=BB9_1 Depth=1
-; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X86-NOCMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NOCMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOCMOV-NEXT:  .LBB9_4: # %atomicrmw.start
 ; X86-NOCMOV-NEXT:    # in Loop: Header=BB9_1 Depth=1
-; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NOCMOV-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NOCMOV-NEXT:    movl %ecx, %eax
 ; X86-NOCMOV-NEXT:    movl (%esp), %edx # 4-byte Reload
 ; X86-NOCMOV-NEXT:    lock cmpxchgl %edx, sc32
 ; X86-NOCMOV-NEXT:    sete %bl
 ; X86-NOCMOV-NEXT:    testb $1, %bl
-; X86-NOCMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOCMOV-NEXT:    jne .LBB9_2
 ; X86-NOCMOV-NEXT:    jmp .LBB9_1
 ; X86-NOCMOV-NEXT:  .LBB9_2: # %atomicrmw.end
@@ -659,7 +659,7 @@ define void @atomic_fetch_cmpxchg32() nounwind {
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    movl $1, %ecx
 ; X64-NEXT:    lock cmpxchgl %ecx, {{.*}}(%rip)
-; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: atomic_fetch_cmpxchg32:
@@ -694,7 +694,7 @@ define void @atomic_fetch_swap32(i32 %x) nounwind {
 ; X64-LABEL: atomic_fetch_swap32:
 ; X64:       # %bb.0:
 ; X64-NEXT:    xchgl %edi, {{.*}}(%rip)
-; X64-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; X64-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: atomic_fetch_swap32:
diff --git a/test/CodeGen/X86/fdiv-combine.ll b/test/CodeGen/X86/fdiv-combine.ll
index 62e86e3ad2c..c0c5baa2c8b 100644
--- a/test/CodeGen/X86/fdiv-combine.ll
+++ b/test/CodeGen/X86/fdiv-combine.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s
 
 ; More than one 'arcp' division using a single divisor operand
@@ -79,12 +80,12 @@ define float @div2_arcp_partial3(float %x, float %y, float %z) {
 }
 
 ; If the reciprocal is already calculated, we should not
-; generate an extra multiplication by 1.0. 
+; generate an extra multiplication by 1.0.
 
 define double @div3_arcp(double %x, double %y, double %z) {
 ; CHECK-LABEL: div3_arcp:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movsd{{.*#+}} xmm2 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
 ; CHECK-NEXT:    divsd %xmm1, %xmm2
 ; CHECK-NEXT:    mulsd %xmm2, %xmm0
 ; CHECK-NEXT:    addsd %xmm2, %xmm0
@@ -132,9 +133,16 @@ define float @div_select_constant_fold_zero(i1 zeroext %arg) {
 
 define void @PR24141() {
 ; CHECK-LABEL: PR24141:
-; CHECK:	callq
-; CHECK-NEXT:	divsd
-; CHECK-NEXT:	jmp
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    # implicit-def: $xmm0
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB8_1: # %while.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    callq g
+; CHECK-NEXT:    divsd %xmm1, %xmm0
+; CHECK-NEXT:    jmp .LBB8_1
 entry:
   br label %while.body
 
diff --git a/test/CodeGen/X86/fp128-compare.ll b/test/CodeGen/X86/fp128-compare.ll
index 7ee2e90657c..6f2b0c514a8 100644
--- a/test/CodeGen/X86/fp128-compare.ll
+++ b/test/CodeGen/X86/fp128-compare.ll
@@ -1,103 +1,142 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+mmx \
 ; RUN:     -enable-legalize-types-checking | FileCheck %s
 ; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu -mattr=+mmx \
 ; RUN:     -enable-legalize-types-checking | FileCheck %s
 
 define i32 @TestComp128GT(fp128 %d1, fp128 %d2) {
+; CHECK-LABEL: TestComp128GT:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    callq __gttf2
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    testl %eax, %eax
+; CHECK-NEXT:    setg %cl
+; CHECK-NEXT:    movl %ecx, %eax
+; CHECK-NEXT:    popq %rcx
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %cmp = fcmp ogt fp128 %d1, %d2
   %conv = zext i1 %cmp to i32
   ret i32 %conv
-; CHECK-LABEL: TestComp128GT:
-; CHECK:       callq __gttf2
-; CHECK:       xorl  %ecx, %ecx
-; CHECK:       setg  %cl
-; CHECK:       movl  %ecx, %eax
-; CHECK:       retq
 }
 
 define i32 @TestComp128GE(fp128 %d1, fp128 %d2) {
+; CHECK-LABEL: TestComp128GE:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    callq __getf2
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    testl %eax, %eax
+; CHECK-NEXT:    setns %cl
+; CHECK-NEXT:    movl %ecx, %eax
+; CHECK-NEXT:    popq %rcx
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %cmp = fcmp oge fp128 %d1, %d2
   %conv = zext i1 %cmp to i32
   ret i32 %conv
-; CHECK-LABEL: TestComp128GE:
-; CHECK:       callq __getf2
-; CHECK:       xorl  %ecx, %ecx
-; CHECK:       testl %eax, %eax
-; CHECK:       setns %cl
-; CHECK:       movl  %ecx, %eax
-; CHECK:       retq
 }
 
 define i32 @TestComp128LT(fp128 %d1, fp128 %d2) {
+; CHECK-LABEL: TestComp128LT:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    callq __lttf2
+; CHECK-NEXT:    shrl $31, %eax
+; CHECK-NEXT:    popq %rcx
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %cmp = fcmp olt fp128 %d1, %d2
   %conv = zext i1 %cmp to i32
   ret i32 %conv
-; CHECK-LABEL: TestComp128LT:
-; CHECK:       callq __lttf2
-; CHECK-NEXT:  shrl $31, %eax
-; CHECK:       retq
-;
 ; The 'shrl' is a special optimization in llvm to combine
 ; the effect of 'fcmp olt' and 'zext'. The main purpose is
 ; to test soften call to __lttf2.
 }
 
 define i32 @TestComp128LE(fp128 %d1, fp128 %d2) {
+; CHECK-LABEL: TestComp128LE:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    callq __letf2
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    testl %eax, %eax
+; CHECK-NEXT:    setle %cl
+; CHECK-NEXT:    movl %ecx, %eax
+; CHECK-NEXT:    popq %rcx
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %cmp = fcmp ole fp128 %d1, %d2
   %conv = zext i1 %cmp to i32
   ret i32 %conv
-; CHECK-LABEL: TestComp128LE:
-; CHECK:       callq __letf2
-; CHECK:       xorl  %ecx, %ecx
-; CHECK:       testl %eax, %eax
-; CHECK:       setle %cl
-; CHECK:       movl  %ecx, %eax
-; CHECK:       retq
 }
 
 define i32 @TestComp128EQ(fp128 %d1, fp128 %d2) {
+; CHECK-LABEL: TestComp128EQ:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    callq __eqtf2
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    testl %eax, %eax
+; CHECK-NEXT:    sete %cl
+; CHECK-NEXT:    movl %ecx, %eax
+; CHECK-NEXT:    popq %rcx
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %cmp = fcmp oeq fp128 %d1, %d2
   %conv = zext i1 %cmp to i32
   ret i32 %conv
-; CHECK-LABEL: TestComp128EQ:
-; CHECK:       callq __eqtf2
-; CHECK:       xorl  %ecx, %ecx
-; CHECK:       testl %eax, %eax
-; CHECK:       sete  %cl
-; CHECK:       movl  %ecx, %eax
-; CHECK:       retq
 }
 
 define i32 @TestComp128NE(fp128 %d1, fp128 %d2) {
+; CHECK-LABEL: TestComp128NE:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    callq __netf2
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    testl %eax, %eax
+; CHECK-NEXT:    setne %cl
+; CHECK-NEXT:    movl %ecx, %eax
+; CHECK-NEXT:    popq %rcx
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %cmp = fcmp une fp128 %d1, %d2
   %conv = zext i1 %cmp to i32
   ret i32 %conv
-; CHECK-LABEL: TestComp128NE:
-; CHECK:       callq __netf2
-; CHECK:       xorl  %ecx, %ecx
-; CHECK:       testl %eax, %eax
-; CHECK:       setne %cl
-; CHECK:       movl  %ecx, %eax
-; CHECK:       retq
 }
 
 define fp128 @TestMax(fp128 %x, fp128 %y) {
+; CHECK-LABEL: TestMax:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movaps %xmm1, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    callq __gttf2
+; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    testl %eax, %eax
+; CHECK-NEXT:    jg .LBB6_2
+; CHECK-NEXT:  # %bb.1: # %entry
+; CHECK-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:  .LBB6_2: # %entry
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %cmp = fcmp ogt fp128 %x, %y
   %cond = select i1 %cmp, fp128 %x, fp128 %y
   ret fp128 %cond
-; CHECK-LABEL: TestMax:
-; CHECK: movaps %xmm0
-; CHECK: movaps %xmm1
-; CHECK: callq __gttf2
-; CHECK: movaps {{.*}}, %xmm0
-; CHECK: testl %eax, %eax
-; CHECK: movaps {{.*}}, %xmm0
-; CHECK: retq
 }
diff --git a/test/CodeGen/X86/machine-trace-metrics-crash.ll b/test/CodeGen/X86/machine-trace-metrics-crash.ll
index 6369ee4eb0e..c9e8c636186 100644
--- a/test/CodeGen/X86/machine-trace-metrics-crash.ll
+++ b/test/CodeGen/X86/machine-trace-metrics-crash.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse -enable-unsafe-fp-math < %s | FileCheck %s
 
 ; The debug info in this test case was causing a crash because machine trace metrics
@@ -6,9 +7,41 @@
 ; used machine trace metrics.
 
 define void @PR24199() {
-; CHECK-LABEL:	PR24199:
-; CHECK:	addss	%xmm1, %xmm0
-; CHECK:	addss	%xmm2, %xmm0
+; CHECK-LABEL: PR24199:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    je .LBB0_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    jmp .LBB0_3
+; CHECK-NEXT:  .LBB0_2: # %if.then
+; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:  .LBB0_3: # %if.end
+; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    callq foo
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload
+; CHECK-NEXT:    # xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT:    mulss %xmm0, %xmm2
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    addss %xmm1, %xmm0
+; CHECK-NEXT:    addss %xmm2, %xmm0
+; CHECK-NEXT:    movss %xmm0, (%rax)
+; CHECK-NEXT:    testl %eax, %eax
+; CHECK-NEXT:    jne .LBB0_5
+; CHECK-NEXT:  # %bb.4: # %if.end
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:  .LBB0_5: # %if.end
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    addss %xmm0, %xmm0
+; CHECK-NEXT:    addss %xmm1, %xmm0
+; CHECK-NEXT:    callq bar
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 
 entry:
   %i = alloca %struct.A, align 8
diff --git a/test/CodeGen/X86/pr5145.ll b/test/CodeGen/X86/pr5145.ll
index 7da7c299791..02e9b4c1593 100644
--- a/test/CodeGen/X86/pr5145.ll
+++ b/test/CodeGen/X86/pr5145.ll
@@ -1,31 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=x86_64-- < %s | FileCheck %s
 @sc8 = external global i8
 
 define void @atomic_maxmin_i8() {
-; CHECK: atomic_maxmin_i8
+; CHECK-LABEL: atomic_maxmin_i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movb {{.*}}(%rip), %al
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB0_1: # %atomicrmw.start
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    cmpb $4, %al
+; CHECK-NEXT:    movl %eax, %ecx
+; CHECK-NEXT:    jg .LBB0_3
+; CHECK-NEXT:  # %bb.2: # %atomicrmw.start
+; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    movb $5, %cl
+; CHECK-NEXT:  .LBB0_3: # %atomicrmw.start
+; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    lock cmpxchgb %cl, {{.*}}(%rip)
+; CHECK-NEXT:    jne .LBB0_1
+; CHECK-NEXT:  # %bb.4: # %atomicrmw.end
+; CHECK-NEXT:    movb {{.*}}(%rip), %al
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB0_5: # %atomicrmw.start2
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    cmpb $7, %al
+; CHECK-NEXT:    movl %eax, %ecx
+; CHECK-NEXT:    jl .LBB0_7
+; CHECK-NEXT:  # %bb.6: # %atomicrmw.start2
+; CHECK-NEXT:    # in Loop: Header=BB0_5 Depth=1
+; CHECK-NEXT:    movb $6, %cl
+; CHECK-NEXT:  .LBB0_7: # %atomicrmw.start2
+; CHECK-NEXT:    # in Loop: Header=BB0_5 Depth=1
+; CHECK-NEXT:    lock cmpxchgb %cl, {{.*}}(%rip)
+; CHECK-NEXT:    jne .LBB0_5
+; CHECK-NEXT:  # %bb.8: # %atomicrmw.end1
+; CHECK-NEXT:    movb {{.*}}(%rip), %al
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB0_9: # %atomicrmw.start8
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    cmpb $7, %al
+; CHECK-NEXT:    movl %eax, %ecx
+; CHECK-NEXT:    ja .LBB0_11
+; CHECK-NEXT:  # %bb.10: # %atomicrmw.start8
+; CHECK-NEXT:    # in Loop: Header=BB0_9 Depth=1
+; CHECK-NEXT:    movb $7, %cl
+; CHECK-NEXT:  .LBB0_11: # %atomicrmw.start8
+; CHECK-NEXT:    # in Loop: Header=BB0_9 Depth=1
+; CHECK-NEXT:    lock cmpxchgb %cl, {{.*}}(%rip)
+; CHECK-NEXT:    jne .LBB0_9
+; CHECK-NEXT:  # %bb.12: # %atomicrmw.end7
+; CHECK-NEXT:    movb {{.*}}(%rip), %al
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB0_13: # %atomicrmw.start14
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    cmpb $9, %al
+; CHECK-NEXT:    movl %eax, %ecx
+; CHECK-NEXT:    jb .LBB0_15
+; CHECK-NEXT:  # %bb.14: # %atomicrmw.start14
+; CHECK-NEXT:    # in Loop: Header=BB0_13 Depth=1
+; CHECK-NEXT:    movb $8, %cl
+; CHECK-NEXT:  .LBB0_15: # %atomicrmw.start14
+; CHECK-NEXT:    # in Loop: Header=BB0_13 Depth=1
+; CHECK-NEXT:    lock cmpxchgb %cl, {{.*}}(%rip)
+; CHECK-NEXT:    jne .LBB0_13
+; CHECK-NEXT:  # %bb.16: # %atomicrmw.end13
+; CHECK-NEXT:    retq
   %1 = atomicrmw max  i8* @sc8, i8 5 acquire
-; CHECK: [[LABEL1:\.?LBB[0-9]+_[0-9]+]]:
-; CHECK: cmpb
-; CHECK: jg
-; CHECK: lock cmpxchgb
-; CHECK: jne [[LABEL1]]
   %2 = atomicrmw min  i8* @sc8, i8 6 acquire
-; CHECK: [[LABEL3:\.?LBB[0-9]+_[0-9]+]]:
-; CHECK: cmpb
-; CHECK: jl
-; CHECK: lock cmpxchgb
-; CHECK: jne [[LABEL3]]
   %3 = atomicrmw umax i8* @sc8, i8 7 acquire
-; CHECK: [[LABEL5:\.?LBB[0-9]+_[0-9]+]]:
-; CHECK: cmpb
-; CHECK: ja
-; CHECK: lock cmpxchgb
-; CHECK: jne [[LABEL5]]
   %4 = atomicrmw umin i8* @sc8, i8 8 acquire
-; CHECK: [[LABEL7:\.?LBB[0-9]+_[0-9]+]]:
-; CHECK: cmpb
-; CHECK: jb
-; CHECK: lock cmpxchgb
-; CHECK: jne [[LABEL7]]
   ret void
 }
diff --git a/test/CodeGen/X86/pseudo_cmov_lower2.ll b/test/CodeGen/X86/pseudo_cmov_lower2.ll
index 1a61b0b9700..5218e1f0cee 100644
--- a/test/CodeGen/X86/pseudo_cmov_lower2.ll
+++ b/test/CodeGen/X86/pseudo_cmov_lower2.ll
@@ -1,14 +1,29 @@
-; RUN: llc < %s -mtriple=x86_64-linux-gnu -o - | FileCheck %s 
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -o - | FileCheck %s
 
 ; This test checks that only a single jae gets generated in the final code
 ; for lowering the CMOV pseudos that get created for this IR.  The tricky part
 ; of this test is that it tests the special PHI operand rewriting code in
 ; X86TargetLowering::EmitLoweredSelect.
 ;
-; CHECK-LABEL: foo1:
-; CHECK: jae
-; CHECK-NOT: jae
 define double @foo1(float %p1, double %p2, double %p3) nounwind {
+; CHECK-LABEL: foo1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xorps %xmm3, %xmm3
+; CHECK-NEXT:    ucomiss %xmm3, %xmm0
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    jae .LBB0_1
+; CHECK-NEXT:  # %bb.2: # %entry
+; CHECK-NEXT:    addsd %xmm2, %xmm0
+; CHECK-NEXT:    jmp .LBB0_3
+; CHECK-NEXT:  .LBB0_1:
+; CHECK-NEXT:    addsd %xmm0, %xmm1
+; CHECK-NEXT:    movapd %xmm1, %xmm0
+; CHECK-NEXT:    movapd %xmm1, %xmm2
+; CHECK-NEXT:  .LBB0_3: # %entry
+; CHECK-NEXT:    subsd %xmm1, %xmm0
+; CHECK-NEXT:    addsd %xmm2, %xmm0
+; CHECK-NEXT:    retq
 entry:
   %c1 = fcmp oge float %p1, 0.000000e+00
   %d0 = fadd double %p2, 1.25e0
@@ -26,10 +41,24 @@ entry:
 ; of this test is that it tests the special PHI operand rewriting code in
 ; X86TargetLowering::EmitLoweredSelect.
 ;
-; CHECK-LABEL: foo2:
-; CHECK: jae
-; CHECK-NOT: jae
 define double @foo2(float %p1, double %p2, double %p3) nounwind {
+; CHECK-LABEL: foo2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xorps %xmm3, %xmm3
+; CHECK-NEXT:    ucomiss %xmm3, %xmm0
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    jae .LBB1_1
+; CHECK-NEXT:  # %bb.2: # %entry
+; CHECK-NEXT:    addsd %xmm0, %xmm2
+; CHECK-NEXT:    movapd %xmm2, %xmm0
+; CHECK-NEXT:    movapd %xmm2, %xmm1
+; CHECK-NEXT:    jmp .LBB1_3
+; CHECK-NEXT:  .LBB1_1:
+; CHECK-NEXT:    addsd %xmm1, %xmm0
+; CHECK-NEXT:  .LBB1_3: # %entry
+; CHECK-NEXT:    subsd %xmm1, %xmm0
+; CHECK-NEXT:    addsd %xmm2, %xmm0
+; CHECK-NEXT:    retq
 entry:
   %c1 = fcmp oge float %p1, 0.000000e+00
   %d0 = fadd double %p2, 1.25e0
@@ -48,16 +77,17 @@ entry:
 ; X86TargetLowering::EmitLoweredSelect.  It also tests to make sure all
 ; the operands of the resulting instructions are from the proper places.
 ;
-; CHECK-LABEL: foo3:
-; CHECK:          js
-; CHECK-NOT: js
-; CHECK-LABEL: # %bb.1:
-; CHECK-DAG:      movapd  %xmm2, %xmm1
-; CHECK-DAG:      movapd  %xmm2, %xmm0
-; CHECK-LABEL:.LBB2_2:
-; CHECK:          divsd   %xmm1, %xmm0
-; CHECK:          ret
 define double @foo3(i32 %p1, double %p2, double %p3,
+; CHECK-LABEL: foo3:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    js .LBB2_2
+; CHECK-NEXT:  # %bb.1: # %entry
+; CHECK-NEXT:    movapd %xmm2, %xmm1
+; CHECK-NEXT:    movapd %xmm2, %xmm0
+; CHECK-NEXT:  .LBB2_2: # %entry
+; CHECK-NEXT:    divsd %xmm1, %xmm0
+; CHECK-NEXT:    retq
                              double %p4, double %p5) nounwind {
 entry:
   %c1 = icmp slt i32 %p1, 0
@@ -78,16 +108,17 @@ entry:
 ; condition code in the second two selects, but we also swap the operands
 ; of the selects to give the same actual computation.
 ;
-; CHECK-LABEL: foo4:
-; CHECK:          js
-; CHECK-NOT: js
-; CHECK-LABEL: # %bb.1:
-; CHECK-DAG:      movapd  %xmm2, %xmm1
-; CHECK-DAG:      movapd  %xmm2, %xmm0
-; CHECK-LABEL:.LBB3_2:
-; CHECK:          divsd   %xmm1, %xmm0
-; CHECK:          ret
 define double @foo4(i32 %p1, double %p2, double %p3,
+; CHECK-LABEL: foo4:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    js .LBB3_2
+; CHECK-NEXT:  # %bb.1: # %entry
+; CHECK-NEXT:    movapd %xmm2, %xmm1
+; CHECK-NEXT:    movapd %xmm2, %xmm0
+; CHECK-NEXT:  .LBB3_2: # %entry
+; CHECK-NEXT:    divsd %xmm1, %xmm0
+; CHECK-NEXT:    retq
                              double %p4, double %p5) nounwind {
 entry:
   %c1 = icmp slt i32 %p1, 0
@@ -103,10 +134,24 @@ entry:
 ; for lowering the CMOV pseudos that get created for this IR.  The tricky part
 ; of this test is that it tests the special code in CodeGenPrepare.
 ;
-; CHECK-LABEL: foo5:
-; CHECK: jae
-; CHECK-NOT: jae
 define double @foo5(float %p1, double %p2, double %p3) nounwind {
+; CHECK-LABEL: foo5:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xorps %xmm3, %xmm3
+; CHECK-NEXT:    ucomiss %xmm3, %xmm0
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    jae .LBB4_1
+; CHECK-NEXT:  # %bb.2: # %select.false
+; CHECK-NEXT:    addsd %xmm2, %xmm0
+; CHECK-NEXT:  .LBB4_3: # %select.end
+; CHECK-NEXT:    subsd %xmm1, %xmm0
+; CHECK-NEXT:    addsd %xmm2, %xmm0
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB4_1:
+; CHECK-NEXT:    addsd %xmm0, %xmm1
+; CHECK-NEXT:    movapd %xmm1, %xmm0
+; CHECK-NEXT:    movapd %xmm1, %xmm2
+; CHECK-NEXT:    jmp .LBB4_3
 entry:
   %c1 = fcmp oge float %p1, 0.000000e+00
   %d0 = fadd double %p2, 1.25e0
@@ -122,11 +167,35 @@ entry:
 ; We should expand select instructions into 3 conditional branches as their
 ; condtions are different.
 ;
-; CHECK-LABEL: foo6:
-; CHECK: jae
-; CHECK: jae
-; CHECK: jae
 define double @foo6(float %p1, double %p2, double %p3) nounwind {
+; CHECK-LABEL: foo6:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movaps %xmm0, %xmm3
+; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:    ucomiss %xmm0, %xmm3
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    jae .LBB5_1
+; CHECK-NEXT:  # %bb.2: # %select.false
+; CHECK-NEXT:    addsd %xmm2, %xmm0
+; CHECK-NEXT:  .LBB5_3: # %select.end
+; CHECK-NEXT:    ucomiss {{.*}}(%rip), %xmm3
+; CHECK-NEXT:    movapd %xmm0, %xmm4
+; CHECK-NEXT:    jae .LBB5_5
+; CHECK-NEXT:  # %bb.4: # %select.false2
+; CHECK-NEXT:    movapd %xmm1, %xmm4
+; CHECK-NEXT:  .LBB5_5: # %select.end1
+; CHECK-NEXT:    ucomiss {{.*}}(%rip), %xmm3
+; CHECK-NEXT:    movapd %xmm4, %xmm1
+; CHECK-NEXT:    jae .LBB5_7
+; CHECK-NEXT:  # %bb.6: # %select.false4
+; CHECK-NEXT:    movapd %xmm2, %xmm1
+; CHECK-NEXT:  .LBB5_7: # %select.end3
+; CHECK-NEXT:    subsd %xmm4, %xmm0
+; CHECK-NEXT:    addsd %xmm1, %xmm0
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB5_1:
+; CHECK-NEXT:    addsd %xmm1, %xmm0
+; CHECK-NEXT:    jmp .LBB5_3
 entry:
   %c1 = fcmp oge float %p1, 0.000000e+00
   %c2 = fcmp oge float %p1, 1.000000e+00
-- 
GitLab


From 4d360e23f7df2818688c3973d8ddd20e884bb944 Mon Sep 17 00:00:00 2001
From: Volkan Keles <vkeles@apple.com>
Date: Mon, 5 Nov 2018 20:51:13 +0000
Subject: [PATCH 0978/1116] [GlobalISel] Refactor the artifact combiner a bit
 by using MIPatternMatch

Reviewers: aditya_nandakumar

Reviewed By: aditya_nandakumar

Subscribers: rovka, kristof.beyls, llvm-commits

Differential Revision: https://reviews.llvm.org/D54116

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346166 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../GlobalISel/LegalizationArtifactCombiner.h | 90 +++++++++++--------
 1 file changed, 54 insertions(+), 36 deletions(-)

diff --git a/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h b/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
index 256f1ccbee7..d2389ddd4f9 100644
--- a/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
+++ b/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
@@ -14,12 +14,14 @@
 
 #include "llvm/CodeGen/GlobalISel/Legalizer.h"
 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
 #include "llvm/CodeGen/GlobalISel/Utils.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/Support/Debug.h"
 
 #define DEBUG_TYPE "legalizer"
+using namespace llvm::MIPatternMatch;
 
 namespace llvm {
 class LegalizationArtifactCombiner {
@@ -36,15 +38,21 @@ public:
                         SmallVectorImpl<MachineInstr *> &DeadInsts) {
     if (MI.getOpcode() != TargetOpcode::G_ANYEXT)
       return false;
-    if (MachineInstr *DefMI = getOpcodeDef(TargetOpcode::G_TRUNC,
-                                           MI.getOperand(1).getReg(), MRI)) {
+
+    Builder.setInstr(MI);
+    unsigned DstReg = MI.getOperand(0).getReg();
+    unsigned SrcReg = MI.getOperand(1).getReg();
+
+    // Look through copy instructions.
+    while (mi_match(SrcReg, MRI, m_Copy(m_Reg(SrcReg))))
+      ;
+
+    // aext(trunc x) - > aext/copy/trunc x
+    unsigned TruncSrc;
+    if (mi_match(SrcReg, MRI, m_GTrunc(m_Reg(TruncSrc)))) {
       LLVM_DEBUG(dbgs() << ".. Combine MI: " << MI;);
-      unsigned DstReg = MI.getOperand(0).getReg();
-      unsigned SrcReg = DefMI->getOperand(1).getReg();
-      Builder.setInstr(MI);
-      // We get a copy/trunc/extend depending on the sizes
-      Builder.buildAnyExtOrTrunc(DstReg, SrcReg);
-      markInstAndDefDead(MI, *DefMI, DeadInsts);
+      Builder.buildAnyExtOrTrunc(DstReg, TruncSrc);
+      markInstAndDefDead(MI, *MRI.getVRegDef(SrcReg), DeadInsts);
       return true;
     }
     return tryFoldImplicitDef(MI, DeadInsts);
@@ -55,24 +63,29 @@ public:
 
     if (MI.getOpcode() != TargetOpcode::G_ZEXT)
       return false;
-    if (MachineInstr *DefMI = getOpcodeDef(TargetOpcode::G_TRUNC,
-                                           MI.getOperand(1).getReg(), MRI)) {
-      unsigned DstReg = MI.getOperand(0).getReg();
+
+    Builder.setInstr(MI);
+    unsigned DstReg = MI.getOperand(0).getReg();
+    unsigned SrcReg = MI.getOperand(1).getReg();
+
+    // Look through copy instructions.
+    while (mi_match(SrcReg, MRI, m_Copy(m_Reg(SrcReg))))
+      ;
+
+    // zext(trunc x) - > and (aext/copy/trunc x), mask
+    unsigned TruncSrc;
+    if (mi_match(SrcReg, MRI, m_GTrunc(m_Reg(TruncSrc)))) {
       LLT DstTy = MRI.getType(DstReg);
       if (isInstUnsupported({TargetOpcode::G_AND, {DstTy}}) ||
           isInstUnsupported({TargetOpcode::G_CONSTANT, {DstTy}}))
         return false;
       LLVM_DEBUG(dbgs() << ".. Combine MI: " << MI;);
-      Builder.setInstr(MI);
-      unsigned ZExtSrc = MI.getOperand(1).getReg();
-      LLT ZExtSrcTy = MRI.getType(ZExtSrc);
-      APInt Mask = APInt::getAllOnesValue(ZExtSrcTy.getSizeInBits());
-      auto MaskCstMIB = Builder.buildConstant(DstTy, Mask.getZExtValue());
-      unsigned TruncSrc = DefMI->getOperand(1).getReg();
-      // We get a copy/trunc/extend depending on the sizes
-      auto SrcCopyOrTrunc = Builder.buildAnyExtOrTrunc(DstTy, TruncSrc);
-      Builder.buildAnd(DstReg, SrcCopyOrTrunc, MaskCstMIB);
-      markInstAndDefDead(MI, *DefMI, DeadInsts);
+      LLT SrcTy = MRI.getType(SrcReg);
+      APInt Mask = APInt::getAllOnesValue(SrcTy.getSizeInBits());
+      auto MIBMask = Builder.buildConstant(DstTy, Mask.getZExtValue());
+      Builder.buildAnd(DstReg, Builder.buildAnyExtOrTrunc(DstTy, TruncSrc),
+                       MIBMask);
+      markInstAndDefDead(MI, *MRI.getVRegDef(SrcReg), DeadInsts);
       return true;
     }
     return tryFoldImplicitDef(MI, DeadInsts);
@@ -83,27 +96,32 @@ public:
 
     if (MI.getOpcode() != TargetOpcode::G_SEXT)
       return false;
-    if (MachineInstr *DefMI = getOpcodeDef(TargetOpcode::G_TRUNC,
-                                           MI.getOperand(1).getReg(), MRI)) {
-      unsigned DstReg = MI.getOperand(0).getReg();
+
+    Builder.setInstr(MI);
+    unsigned DstReg = MI.getOperand(0).getReg();
+    unsigned SrcReg = MI.getOperand(1).getReg();
+
+    // Look through copy instructions.
+    while (mi_match(SrcReg, MRI, m_Copy(m_Reg(SrcReg))))
+      ;
+
+    // sext(trunc x) - > ashr (shl (aext/copy/trunc x), c), c
+    unsigned TruncSrc;
+    if (mi_match(SrcReg, MRI, m_GTrunc(m_Reg(TruncSrc)))) {
       LLT DstTy = MRI.getType(DstReg);
       if (isInstUnsupported({TargetOpcode::G_SHL, {DstTy}}) ||
           isInstUnsupported({TargetOpcode::G_ASHR, {DstTy}}) ||
           isInstUnsupported({TargetOpcode::G_CONSTANT, {DstTy}}))
         return false;
       LLVM_DEBUG(dbgs() << ".. Combine MI: " << MI;);
-      Builder.setInstr(MI);
-      unsigned SExtSrc = MI.getOperand(1).getReg();
-      LLT SExtSrcTy = MRI.getType(SExtSrc);
-      unsigned SizeDiff = DstTy.getSizeInBits() - SExtSrcTy.getSizeInBits();
-      auto SizeDiffMIB = Builder.buildConstant(DstTy, SizeDiff);
-      unsigned TruncSrcReg = DefMI->getOperand(1).getReg();
-      // We get a copy/trunc/extend depending on the sizes
-      auto SrcCopyExtOrTrunc = Builder.buildAnyExtOrTrunc(DstTy, TruncSrcReg);
-      auto ShlMIB = Builder.buildInstr(TargetOpcode::G_SHL, DstTy,
-                                       SrcCopyExtOrTrunc, SizeDiffMIB);
-      Builder.buildInstr(TargetOpcode::G_ASHR, DstReg, ShlMIB, SizeDiffMIB);
-      markInstAndDefDead(MI, *DefMI, DeadInsts);
+      LLT SrcTy = MRI.getType(SrcReg);
+      unsigned ShAmt = DstTy.getSizeInBits() - SrcTy.getSizeInBits();
+      auto MIBShAmt = Builder.buildConstant(DstTy, ShAmt);
+      auto MIBShl = Builder.buildInstr(
+          TargetOpcode::G_SHL, DstTy,
+          Builder.buildAnyExtOrTrunc(DstTy, TruncSrc), MIBShAmt);
+      Builder.buildInstr(TargetOpcode::G_ASHR, DstReg, MIBShl, MIBShAmt);
+      markInstAndDefDead(MI, *MRI.getVRegDef(SrcReg), DeadInsts);
       return true;
     }
     return tryFoldImplicitDef(MI, DeadInsts);
-- 
GitLab


From 80ce0ad512376158bbd7b8d2cb164d31709dcbd8 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Mon, 5 Nov 2018 21:42:01 +0000
Subject: [PATCH 0979/1116] [InstSimplify] add tests for select+fcmp; NFC

These are translated from InstCombine's test file with the same name.
We should move the transform from InstCombine to InstSimplify.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346168 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Transforms/InstSimplify/fcmp-select.ll | 102 ++++++++++++++++++++
 1 file changed, 102 insertions(+)
 create mode 100644 test/Transforms/InstSimplify/fcmp-select.ll

diff --git a/test/Transforms/InstSimplify/fcmp-select.ll b/test/Transforms/InstSimplify/fcmp-select.ll
new file mode 100644
index 00000000000..00f49b0edfc
--- /dev/null
+++ b/test/Transforms/InstSimplify/fcmp-select.ll
@@ -0,0 +1,102 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instsimplify -S | FileCheck %s
+
+; X == 42.0 ? X : 42.0 --> 42.0
+
+define double @oeq(double %x) {
+; CHECK-LABEL: @oeq(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq double [[X:%.*]], 4.200000e+01
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], double [[X]], double 4.200000e+01
+; CHECK-NEXT:    ret double [[COND]]
+;
+  %cmp = fcmp oeq double %x, 42.0
+  %cond = select i1 %cmp, double %x, double 42.0
+  ret double %cond
+}
+
+; X == 42.0 ? 42.0 : X --> X
+
+define float @oeq_swapped(float %x) {
+; CHECK-LABEL: @oeq_swapped(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq float [[X:%.*]], 4.200000e+01
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], float 4.200000e+01, float [[X]]
+; CHECK-NEXT:    ret float [[COND]]
+;
+  %cmp = fcmp oeq float %x, 42.0
+  %cond = select i1 %cmp, float 42.0, float %x
+  ret float %cond
+}
+
+; x != y ? x : y -> x if it's the right kind of != and at least
+; one of x and y is not negative zero.
+
+; X != 42.0 ? X : 42.0 --> X
+
+define double @une(double %x) {
+; CHECK-LABEL: @une(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp une double [[X:%.*]], 4.200000e+01
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], double [[X]], double 4.200000e+01
+; CHECK-NEXT:    ret double [[COND]]
+;
+  %cmp = fcmp une double %x, 42.0
+  %cond = select i1 %cmp, double %x, double 42.0
+  ret double %cond
+}
+
+; X != 42.0 ? 42.0 : X --> 42.0
+
+define double @une_swapped(double %x) {
+; CHECK-LABEL: @une_swapped(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp une double [[X:%.*]], 4.200000e+01
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], double 4.200000e+01, double [[X]]
+; CHECK-NEXT:    ret double [[COND]]
+;
+  %cmp = fcmp une double %x, 42.0
+  %cond = select i1 %cmp, double 42.0, double %x
+  ret double %cond
+}
+
+define double @une_could_be_negzero(double %x, double %y) {
+; CHECK-LABEL: @une_could_be_negzero(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp une double [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], double [[X]], double [[Y]]
+; CHECK-NEXT:    ret double [[COND]]
+;
+  %cmp = fcmp une double %x, %y
+  %cond = select i1 %cmp, double %x, double %y
+  ret double %cond
+}
+
+define double @une_swapped_could_be_negzero(double %x, double %y) {
+; CHECK-LABEL: @une_swapped_could_be_negzero(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp une double [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], double [[Y]], double [[X]]
+; CHECK-NEXT:    ret double [[COND]]
+;
+  %cmp = fcmp une double %x, %y
+  %cond = select i1 %cmp, double %y, double %x
+  ret double %cond
+}
+
+define double @one(double %x) {
+; CHECK-LABEL: @one(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp one double [[X:%.*]], -1.000000e+00
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], double [[X]], double -1.000000e+00
+; CHECK-NEXT:    ret double [[COND]]
+;
+  %cmp = fcmp one double %x, -1.0
+  %cond = select i1 %cmp, double %x, double -1.0
+  ret double %cond
+}
+
+define double @one_swapped(double %x) {
+; CHECK-LABEL: @one_swapped(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp one double [[X:%.*]], -1.000000e+00
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], double -1.000000e+00, double [[X]]
+; CHECK-NEXT:    ret double [[COND]]
+;
+  %cmp = fcmp one double %x, -1.0
+  %cond = select i1 %cmp, double -1.0, double %x
+  ret double %cond
+}
+
-- 
GitLab


From e357c8b87980be1721791394b173f17ad94e95e9 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Mon, 5 Nov 2018 21:51:39 +0000
Subject: [PATCH 0980/1116] [InstSimplify] fold select (fcmp X, Y), X, Y

This is NFCI for InstCombine because it calls InstSimplify,
so I left the tests for this transform there. As noted in
the code comment, we can allow this fold more often by using
FMF and/or value tracking.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346169 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Analysis/InstructionSimplify.cpp          | 31 ++++++++++++
 .../InstCombine/InstCombineSelect.cpp         | 50 -------------------
 test/Transforms/InstSimplify/fcmp-select.ll   | 16 ++----
 3 files changed, 35 insertions(+), 62 deletions(-)

diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp
index db929aa7059..fd6f4ba476e 100644
--- a/lib/Analysis/InstructionSimplify.cpp
+++ b/lib/Analysis/InstructionSimplify.cpp
@@ -3874,6 +3874,34 @@ static Value *simplifySelectWithICmpCond(Value *CondVal, Value *TrueVal,
   return nullptr;
 }
 
+/// Try to simplify a select instruction when its condition operand is a
+/// floating-point comparison.
+static Value *simplifySelectWithFCmp(Value *Cond, Value *T, Value *F) {
+  FCmpInst::Predicate Pred;
+  if (!match(Cond, m_FCmp(Pred, m_Specific(T), m_Specific(F))) &&
+      !match(Cond, m_FCmp(Pred, m_Specific(F), m_Specific(T))))
+    return nullptr;
+
+  // TODO: The transform may not be valid with -0.0. An incomplete way of
+  // testing for that possibility is to check if at least one operand is a
+  // non-zero constant.
+  const APFloat *C;
+  if ((match(T, m_APFloat(C)) && C->isNonZero()) ||
+      (match(F, m_APFloat(C)) && C->isNonZero())) {
+    // (T == F) ? T : F --> F
+    // (F == T) ? T : F --> F
+    if (Pred == FCmpInst::FCMP_OEQ)
+      return F;
+
+    // (T != F) ? T : F --> T
+    // (F != T) ? T : F --> T
+    if (Pred == FCmpInst::FCMP_UNE)
+      return T;
+  }
+
+  return nullptr;
+}
+
 /// Given operands for a SelectInst, see if we can fold the result.
 /// If not, this returns null.
 static Value *SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal,
@@ -3910,6 +3938,9 @@ static Value *SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal,
           simplifySelectWithICmpCond(Cond, TrueVal, FalseVal, Q, MaxRecurse))
     return V;
 
+  if (Value *V = simplifySelectWithFCmp(Cond, TrueVal, FalseVal))
+    return V;
+
   if (Value *V = foldSelectWithBinaryOp(Cond, TrueVal, FalseVal))
     return V;
 
diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 724662f0128..88a72bb8eb5 100644
--- a/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -1660,31 +1660,6 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
   // See if we are selecting two values based on a comparison of the two values.
   if (FCmpInst *FCI = dyn_cast<FCmpInst>(CondVal)) {
     if (FCI->getOperand(0) == TrueVal && FCI->getOperand(1) == FalseVal) {
-      // Transform (X == Y) ? X : Y  -> Y
-      if (FCI->getPredicate() == FCmpInst::FCMP_OEQ) {
-        // This is not safe in general for floating point:
-        // consider X== -0, Y== +0.
-        // It becomes safe if either operand is a nonzero constant.
-        ConstantFP *CFPt, *CFPf;
-        if (((CFPt = dyn_cast<ConstantFP>(TrueVal)) &&
-              !CFPt->getValueAPF().isZero()) ||
-            ((CFPf = dyn_cast<ConstantFP>(FalseVal)) &&
-             !CFPf->getValueAPF().isZero()))
-        return replaceInstUsesWith(SI, FalseVal);
-      }
-      // Transform (X une Y) ? X : Y  -> X
-      if (FCI->getPredicate() == FCmpInst::FCMP_UNE) {
-        // This is not safe in general for floating point:
-        // consider X== -0, Y== +0.
-        // It becomes safe if either operand is a nonzero constant.
-        ConstantFP *CFPt, *CFPf;
-        if (((CFPt = dyn_cast<ConstantFP>(TrueVal)) &&
-              !CFPt->getValueAPF().isZero()) ||
-            ((CFPf = dyn_cast<ConstantFP>(FalseVal)) &&
-             !CFPf->getValueAPF().isZero()))
-        return replaceInstUsesWith(SI, TrueVal);
-      }
-
       // Canonicalize to use ordered comparisons by swapping the select
       // operands.
       //
@@ -1703,31 +1678,6 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
 
       // NOTE: if we wanted to, this is where to detect MIN/MAX
     } else if (FCI->getOperand(0) == FalseVal && FCI->getOperand(1) == TrueVal){
-      // Transform (X == Y) ? Y : X  -> X
-      if (FCI->getPredicate() == FCmpInst::FCMP_OEQ) {
-        // This is not safe in general for floating point:
-        // consider X== -0, Y== +0.
-        // It becomes safe if either operand is a nonzero constant.
-        ConstantFP *CFPt, *CFPf;
-        if (((CFPt = dyn_cast<ConstantFP>(TrueVal)) &&
-              !CFPt->getValueAPF().isZero()) ||
-            ((CFPf = dyn_cast<ConstantFP>(FalseVal)) &&
-             !CFPf->getValueAPF().isZero()))
-          return replaceInstUsesWith(SI, FalseVal);
-      }
-      // Transform (X une Y) ? Y : X  -> Y
-      if (FCI->getPredicate() == FCmpInst::FCMP_UNE) {
-        // This is not safe in general for floating point:
-        // consider X== -0, Y== +0.
-        // It becomes safe if either operand is a nonzero constant.
-        ConstantFP *CFPt, *CFPf;
-        if (((CFPt = dyn_cast<ConstantFP>(TrueVal)) &&
-              !CFPt->getValueAPF().isZero()) ||
-            ((CFPf = dyn_cast<ConstantFP>(FalseVal)) &&
-             !CFPf->getValueAPF().isZero()))
-          return replaceInstUsesWith(SI, TrueVal);
-      }
-
       // Canonicalize to use ordered comparisons by swapping the select
       // operands.
       //
diff --git a/test/Transforms/InstSimplify/fcmp-select.ll b/test/Transforms/InstSimplify/fcmp-select.ll
index 00f49b0edfc..eae885c8471 100644
--- a/test/Transforms/InstSimplify/fcmp-select.ll
+++ b/test/Transforms/InstSimplify/fcmp-select.ll
@@ -5,9 +5,7 @@
 
 define double @oeq(double %x) {
 ; CHECK-LABEL: @oeq(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq double [[X:%.*]], 4.200000e+01
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], double [[X]], double 4.200000e+01
-; CHECK-NEXT:    ret double [[COND]]
+; CHECK-NEXT:    ret double 4.200000e+01
 ;
   %cmp = fcmp oeq double %x, 42.0
   %cond = select i1 %cmp, double %x, double 42.0
@@ -18,9 +16,7 @@ define double @oeq(double %x) {
 
 define float @oeq_swapped(float %x) {
 ; CHECK-LABEL: @oeq_swapped(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq float [[X:%.*]], 4.200000e+01
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], float 4.200000e+01, float [[X]]
-; CHECK-NEXT:    ret float [[COND]]
+; CHECK-NEXT:    ret float [[X:%.*]]
 ;
   %cmp = fcmp oeq float %x, 42.0
   %cond = select i1 %cmp, float 42.0, float %x
@@ -34,9 +30,7 @@ define float @oeq_swapped(float %x) {
 
 define double @une(double %x) {
 ; CHECK-LABEL: @une(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp une double [[X:%.*]], 4.200000e+01
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], double [[X]], double 4.200000e+01
-; CHECK-NEXT:    ret double [[COND]]
+; CHECK-NEXT:    ret double [[X:%.*]]
 ;
   %cmp = fcmp une double %x, 42.0
   %cond = select i1 %cmp, double %x, double 42.0
@@ -47,9 +41,7 @@ define double @une(double %x) {
 
 define double @une_swapped(double %x) {
 ; CHECK-LABEL: @une_swapped(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp une double [[X:%.*]], 4.200000e+01
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], double 4.200000e+01, double [[X]]
-; CHECK-NEXT:    ret double [[COND]]
+; CHECK-NEXT:    ret double 4.200000e+01
 ;
   %cmp = fcmp une double %x, 42.0
   %cond = select i1 %cmp, double 42.0, double %x
-- 
GitLab


From 94a5ca407cfe5261d68fc48d4eb72bcf5ec56c0e Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Mon, 5 Nov 2018 22:08:17 +0000
Subject: [PATCH 0981/1116] [X86] Don't turn any_extend from a mask register
 into a sign_extend during lowering. Add patterns to match any_extend during
 isel instead.

SimplifyDemandedBits can turn a sign_extend back into an any_extend and trigger an infinite loop. So instead legalize it the same way as a sign_extend, but preserve the opcode. Then just pattern match it the same as sign_extend during isel.

I don't have a reduced test case for such an infinite loop yet.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346170 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp |  4 ++--
 lib/Target/X86/X86InstrAVX512.td   | 12 ++++++++++++
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 99893be4e60..9e7a41c752a 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -19697,7 +19697,7 @@ static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
   if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
     // If v16i32 is to be avoided, we'll need to split and concatenate.
     if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
-      return SplitAndExtendv16i1(ISD::SIGN_EXTEND, VT, In, dl, DAG);
+      return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
 
     ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
   }
@@ -19716,7 +19716,7 @@ static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
   MVT WideEltVT = WideVT.getVectorElementType();
   if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
       (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
-    V = DAG.getNode(ISD::SIGN_EXTEND, dl, WideVT, In);
+    V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
   } else {
     SDValue NegOne = getOnesVector(WideVT, DAG, dl);
     SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, dl);
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index ec314f329fd..f8ade37f8df 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -9958,6 +9958,10 @@ def rr : AVX512XS8I<opc, MRMSrcReg, (outs Vec.RC:$dst), (ins Vec.KRC:$src),
                   !strconcat(OpcodeStr##Vec.Suffix, "\t{$src, $dst|$dst, $src}"),
                   [(set Vec.RC:$dst, (Vec.VT (sext Vec.KRC:$src)))]>,
                   EVEX, Sched<[WriteMove]>; // TODO - WriteVecTrunc?
+
+// Also need a pattern for anyextend.
+def : Pat<(Vec.VT (anyext Vec.KRC:$src)),
+          (!cast<Instruction>(NAME#"rr") Vec.KRC:$src)>;
 }
 
 multiclass cvt_mask_by_elt_width<bits<8> opc, AVX512VLVectorVTInfo VTInfo,
@@ -10031,11 +10035,19 @@ let Predicates = [HasDQI, NoBWI] in {
             (VPMOVDBZrr (v16i32 (VPMOVM2DZrr VK16:$src)))>;
   def : Pat<(v16i16 (sext (v16i1 VK16:$src))),
             (VPMOVDWZrr (v16i32 (VPMOVM2DZrr VK16:$src)))>;
+
+  def : Pat<(v16i8 (anyext (v16i1 VK16:$src))),
+            (VPMOVDBZrr (v16i32 (VPMOVM2DZrr VK16:$src)))>;
+  def : Pat<(v16i16 (anyext (v16i1 VK16:$src))),
+            (VPMOVDWZrr (v16i32 (VPMOVM2DZrr VK16:$src)))>;
 }
 
 let Predicates = [HasDQI, NoBWI, HasVLX] in {
   def : Pat<(v8i16 (sext (v8i1 VK8:$src))),
             (VPMOVDWZ256rr (v8i32 (VPMOVM2DZ256rr VK8:$src)))>;
+
+  def : Pat<(v8i16 (anyext (v8i1 VK8:$src))),
+            (VPMOVDWZ256rr (v8i32 (VPMOVM2DZ256rr VK8:$src)))>;
 }
 
 //===----------------------------------------------------------------------===//
-- 
GitLab


From 351cca42bd1fd38c444a516b862cd81c00734f13 Mon Sep 17 00:00:00 2001
From: Volkan Keles <vkeles@apple.com>
Date: Mon, 5 Nov 2018 22:25:01 +0000
Subject: [PATCH 0982/1116] Revert "[GlobalISel] Refactor the artifact combiner
 a bit by using MIPatternMatch"

This reverts r346166 as it breaks
test-suite-verify-machineinstrs-aarch64-globalisel-O0-g.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346175 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../GlobalISel/LegalizationArtifactCombiner.h | 90 ++++++++-----------
 1 file changed, 36 insertions(+), 54 deletions(-)

diff --git a/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h b/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
index d2389ddd4f9..256f1ccbee7 100644
--- a/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
+++ b/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
@@ -14,14 +14,12 @@
 
 #include "llvm/CodeGen/GlobalISel/Legalizer.h"
 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
-#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
 #include "llvm/CodeGen/GlobalISel/Utils.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/Support/Debug.h"
 
 #define DEBUG_TYPE "legalizer"
-using namespace llvm::MIPatternMatch;
 
 namespace llvm {
 class LegalizationArtifactCombiner {
@@ -38,21 +36,15 @@ public:
                         SmallVectorImpl<MachineInstr *> &DeadInsts) {
     if (MI.getOpcode() != TargetOpcode::G_ANYEXT)
       return false;
-
-    Builder.setInstr(MI);
-    unsigned DstReg = MI.getOperand(0).getReg();
-    unsigned SrcReg = MI.getOperand(1).getReg();
-
-    // Look through copy instructions.
-    while (mi_match(SrcReg, MRI, m_Copy(m_Reg(SrcReg))))
-      ;
-
-    // aext(trunc x) - > aext/copy/trunc x
-    unsigned TruncSrc;
-    if (mi_match(SrcReg, MRI, m_GTrunc(m_Reg(TruncSrc)))) {
+    if (MachineInstr *DefMI = getOpcodeDef(TargetOpcode::G_TRUNC,
+                                           MI.getOperand(1).getReg(), MRI)) {
       LLVM_DEBUG(dbgs() << ".. Combine MI: " << MI;);
-      Builder.buildAnyExtOrTrunc(DstReg, TruncSrc);
-      markInstAndDefDead(MI, *MRI.getVRegDef(SrcReg), DeadInsts);
+      unsigned DstReg = MI.getOperand(0).getReg();
+      unsigned SrcReg = DefMI->getOperand(1).getReg();
+      Builder.setInstr(MI);
+      // We get a copy/trunc/extend depending on the sizes
+      Builder.buildAnyExtOrTrunc(DstReg, SrcReg);
+      markInstAndDefDead(MI, *DefMI, DeadInsts);
       return true;
     }
     return tryFoldImplicitDef(MI, DeadInsts);
@@ -63,29 +55,24 @@ public:
 
     if (MI.getOpcode() != TargetOpcode::G_ZEXT)
       return false;
-
-    Builder.setInstr(MI);
-    unsigned DstReg = MI.getOperand(0).getReg();
-    unsigned SrcReg = MI.getOperand(1).getReg();
-
-    // Look through copy instructions.
-    while (mi_match(SrcReg, MRI, m_Copy(m_Reg(SrcReg))))
-      ;
-
-    // zext(trunc x) - > and (aext/copy/trunc x), mask
-    unsigned TruncSrc;
-    if (mi_match(SrcReg, MRI, m_GTrunc(m_Reg(TruncSrc)))) {
+    if (MachineInstr *DefMI = getOpcodeDef(TargetOpcode::G_TRUNC,
+                                           MI.getOperand(1).getReg(), MRI)) {
+      unsigned DstReg = MI.getOperand(0).getReg();
       LLT DstTy = MRI.getType(DstReg);
       if (isInstUnsupported({TargetOpcode::G_AND, {DstTy}}) ||
           isInstUnsupported({TargetOpcode::G_CONSTANT, {DstTy}}))
         return false;
       LLVM_DEBUG(dbgs() << ".. Combine MI: " << MI;);
-      LLT SrcTy = MRI.getType(SrcReg);
-      APInt Mask = APInt::getAllOnesValue(SrcTy.getSizeInBits());
-      auto MIBMask = Builder.buildConstant(DstTy, Mask.getZExtValue());
-      Builder.buildAnd(DstReg, Builder.buildAnyExtOrTrunc(DstTy, TruncSrc),
-                       MIBMask);
-      markInstAndDefDead(MI, *MRI.getVRegDef(SrcReg), DeadInsts);
+      Builder.setInstr(MI);
+      unsigned ZExtSrc = MI.getOperand(1).getReg();
+      LLT ZExtSrcTy = MRI.getType(ZExtSrc);
+      APInt Mask = APInt::getAllOnesValue(ZExtSrcTy.getSizeInBits());
+      auto MaskCstMIB = Builder.buildConstant(DstTy, Mask.getZExtValue());
+      unsigned TruncSrc = DefMI->getOperand(1).getReg();
+      // We get a copy/trunc/extend depending on the sizes
+      auto SrcCopyOrTrunc = Builder.buildAnyExtOrTrunc(DstTy, TruncSrc);
+      Builder.buildAnd(DstReg, SrcCopyOrTrunc, MaskCstMIB);
+      markInstAndDefDead(MI, *DefMI, DeadInsts);
       return true;
     }
     return tryFoldImplicitDef(MI, DeadInsts);
@@ -96,32 +83,27 @@ public:
 
     if (MI.getOpcode() != TargetOpcode::G_SEXT)
       return false;
-
-    Builder.setInstr(MI);
-    unsigned DstReg = MI.getOperand(0).getReg();
-    unsigned SrcReg = MI.getOperand(1).getReg();
-
-    // Look through copy instructions.
-    while (mi_match(SrcReg, MRI, m_Copy(m_Reg(SrcReg))))
-      ;
-
-    // sext(trunc x) - > ashr (shl (aext/copy/trunc x), c), c
-    unsigned TruncSrc;
-    if (mi_match(SrcReg, MRI, m_GTrunc(m_Reg(TruncSrc)))) {
+    if (MachineInstr *DefMI = getOpcodeDef(TargetOpcode::G_TRUNC,
+                                           MI.getOperand(1).getReg(), MRI)) {
+      unsigned DstReg = MI.getOperand(0).getReg();
       LLT DstTy = MRI.getType(DstReg);
       if (isInstUnsupported({TargetOpcode::G_SHL, {DstTy}}) ||
           isInstUnsupported({TargetOpcode::G_ASHR, {DstTy}}) ||
           isInstUnsupported({TargetOpcode::G_CONSTANT, {DstTy}}))
         return false;
       LLVM_DEBUG(dbgs() << ".. Combine MI: " << MI;);
-      LLT SrcTy = MRI.getType(SrcReg);
-      unsigned ShAmt = DstTy.getSizeInBits() - SrcTy.getSizeInBits();
-      auto MIBShAmt = Builder.buildConstant(DstTy, ShAmt);
-      auto MIBShl = Builder.buildInstr(
-          TargetOpcode::G_SHL, DstTy,
-          Builder.buildAnyExtOrTrunc(DstTy, TruncSrc), MIBShAmt);
-      Builder.buildInstr(TargetOpcode::G_ASHR, DstReg, MIBShl, MIBShAmt);
-      markInstAndDefDead(MI, *MRI.getVRegDef(SrcReg), DeadInsts);
+      Builder.setInstr(MI);
+      unsigned SExtSrc = MI.getOperand(1).getReg();
+      LLT SExtSrcTy = MRI.getType(SExtSrc);
+      unsigned SizeDiff = DstTy.getSizeInBits() - SExtSrcTy.getSizeInBits();
+      auto SizeDiffMIB = Builder.buildConstant(DstTy, SizeDiff);
+      unsigned TruncSrcReg = DefMI->getOperand(1).getReg();
+      // We get a copy/trunc/extend depending on the sizes
+      auto SrcCopyExtOrTrunc = Builder.buildAnyExtOrTrunc(DstTy, TruncSrcReg);
+      auto ShlMIB = Builder.buildInstr(TargetOpcode::G_SHL, DstTy,
+                                       SrcCopyExtOrTrunc, SizeDiffMIB);
+      Builder.buildInstr(TargetOpcode::G_ASHR, DstReg, ShlMIB, SizeDiffMIB);
+      markInstAndDefDead(MI, *DefMI, DeadInsts);
       return true;
     }
     return tryFoldImplicitDef(MI, DeadInsts);
-- 
GitLab


From 7829a6dfd5fa986cb4e620eb4063133ba1d5a66f Mon Sep 17 00:00:00 2001
From: Konstantin Zhuravlyov <kzhuravl_dev@outlook.com>
Date: Mon, 5 Nov 2018 22:44:19 +0000
Subject: [PATCH 0983/1116] AMDGPU: Add sram-ecc feature

Differential Revision: https://reviews.llvm.org/D53222


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346177 91177308-0d34-0410-b5e6-96231b3b80d8
---
 docs/AMDGPUUsage.rst                          | 51 ++++++++++------
 include/llvm/BinaryFormat/ELF.h               |  7 ++-
 lib/ObjectYAML/ELFYAML.cpp                    |  1 +
 lib/Target/AMDGPU/AMDGPU.td                   | 29 ++++-----
 lib/Target/AMDGPU/AMDGPUSubtarget.cpp         |  2 +-
 lib/Target/AMDGPU/AMDGPUSubtarget.h           |  6 +-
 .../MCTargetDesc/AMDGPUTargetStreamer.cpp     |  4 ++
 lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    |  6 ++
 lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h      |  1 +
 .../CodeGen/AMDGPU/directive-amdgcn-target.ll | 16 ++++-
 test/CodeGen/AMDGPU/elf-header-flags-mach.ll  |  1 +
 .../AMDGPU/elf-header-flags-sram-ecc.ll       | 38 ++++++++++++
 .../AMDGPU/elf-header-flags-sram-ecc.yaml     | 61 +++++++++++++++++++
 tools/llvm-readobj/ELFDumper.cpp              |  3 +-
 14 files changed, 182 insertions(+), 44 deletions(-)
 create mode 100644 test/CodeGen/AMDGPU/elf-header-flags-sram-ecc.ll
 create mode 100644 test/Object/AMDGPU/elf-header-flags-sram-ecc.yaml

diff --git a/docs/AMDGPUUsage.rst b/docs/AMDGPUUsage.rst
index 2692078d28b..bc3caf4448c 100644
--- a/docs/AMDGPUUsage.rst
+++ b/docs/AMDGPUUsage.rst
@@ -207,6 +207,8 @@ names from both the *Processor* and *Alternative Processor* can be used.
                                                                          names.
      ``gfx906``                  ``amdgcn``   dGPU  - xnack           *TBA*
                                                       [off]
+                                                      sram-ecc
+                                                      [on]
                                                                       .. TODO
                                                                          Add product
                                                                          names.
@@ -246,24 +248,26 @@ For example:
   .. table:: AMDGPU Target Features
      :name: amdgpu-target-feature-table
 
-     ============== ==================================================
-     Target Feature Description
-     ============== ==================================================
-     -m[no-]xnack   Enable/disable generating code that has
-                    memory clauses that are compatible with
-                    having XNACK replay enabled.
-
-                    This is used for demand paging and page
-                    migration. If XNACK replay is enabled in
-                    the device, then if a page fault occurs
-                    the code may execute incorrectly if the
-                    ``xnack`` feature is not enabled. Executing
-                    code that has the feature enabled on a
-                    device that does not have XNACK replay
-                    enabled will execute correctly, but may
-                    be less performant than code with the
-                    feature disabled.
-     ============== ==================================================
+     =============== ==================================================
+     Target Feature  Description
+     =============== ==================================================
+     -m[no-]xnack    Enable/disable generating code that has
+                     memory clauses that are compatible with
+                     having XNACK replay enabled.
+
+                     This is used for demand paging and page
+                     migration. If XNACK replay is enabled in
+                     the device, then if a page fault occurs
+                     the code may execute incorrectly if the
+                     ``xnack`` feature is not enabled. Executing
+                     code that has the feature enabled on a
+                     device that does not have XNACK replay
+                     enabled will execute correctly, but may
+                     be less performant than code with the
+                     feature disabled.
+     -m[no-]sram-ecc Enable/disable generating code that assumes SRAM
+                     ECC is enabled/disabled.
+     =============== ==================================================
 
 .. _amdgpu-address-spaces:
 
@@ -549,6 +553,17 @@ The AMDGPU backend uses the following ELF header:
                                                   be 0.
                                                   See
                                                   :ref:`amdgpu-target-features`.
+     ``EF_AMDGPU_SRAM_ECC``            0x00000200 Indicates if the ``sram-ecc``
+                                                  target feature is
+                                                  enabled for all code
+                                                  contained in the code object.
+                                                  If the processor
+                                                  does not support the
+                                                  ``sram-ecc`` target
+                                                  feature then must
+                                                  be 0.
+                                                  See
+                                                  :ref:`amdgpu-target-features`.
      ================================= ========== =============================
 
   .. table:: AMDGPU ``EF_AMDGPU_MACH`` Values
diff --git a/include/llvm/BinaryFormat/ELF.h b/include/llvm/BinaryFormat/ELF.h
index 26f65be9f1d..a42186bc270 100644
--- a/include/llvm/BinaryFormat/ELF.h
+++ b/include/llvm/BinaryFormat/ELF.h
@@ -711,9 +711,12 @@ enum : unsigned {
   EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600,
   EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX909,
 
-  // Indicates if the xnack target feature is enabled for all code contained in
-  // the object.
+  // Indicates if the "xnack" target feature is enabled for all code contained
+  // in the object.
   EF_AMDGPU_XNACK = 0x100,
+  // Indicates if the "sram-ecc" target feature is enabled for all code
+  // contained in the object.
+  EF_AMDGPU_SRAM_ECC = 0x200,
 };
 
 // ELF Relocation types for AMDGPU
diff --git a/lib/ObjectYAML/ELFYAML.cpp b/lib/ObjectYAML/ELFYAML.cpp
index 2c69f115bca..189d71782bd 100644
--- a/lib/ObjectYAML/ELFYAML.cpp
+++ b/lib/ObjectYAML/ELFYAML.cpp
@@ -404,6 +404,7 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO,
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX906, EF_AMDGPU_MACH);
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX909, EF_AMDGPU_MACH);
     BCase(EF_AMDGPU_XNACK);
+    BCase(EF_AMDGPU_SRAM_ECC);
     break;
   case ELF::EM_X86_64:
     break;
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index ec351356f79..96a8029773d 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -266,13 +266,10 @@ def FeatureDLInsts : SubtargetFeature<"dl-insts",
   "Has deep learning instructions"
 >;
 
-def FeatureD16PreservesUnusedBits : SubtargetFeature<
-  "d16-preserves-unused-bits",
-  "D16PreservesUnusedBits",
+def FeatureSRAMECC : SubtargetFeature<"sram-ecc",
+  "EnableSRAMECC",
   "true",
-  "If present, then instructions defined by HasD16LoadStore predicate preserve "
-  "unused bits. Otherwise instructions defined by HasD16LoadStore predicate "
-  "zero unused bits."
+  "Enable SRAM ECC"
 >;
 
 //===------------------------------------------------------------===//
@@ -524,35 +521,32 @@ def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0,
 def FeatureISAVersion9_0_0 : SubtargetFeatureISAVersion <9,0,0,
   [FeatureGFX9,
    FeatureMadMixInsts,
-   FeatureLDSBankCount32,
-   FeatureD16PreservesUnusedBits]>;
+   FeatureLDSBankCount32]>;
 
 def FeatureISAVersion9_0_2 : SubtargetFeatureISAVersion <9,0,2,
   [FeatureGFX9,
    FeatureMadMixInsts,
    FeatureLDSBankCount32,
-   FeatureXNACK,
-   FeatureD16PreservesUnusedBits]>;
+   FeatureXNACK]>;
 
 def FeatureISAVersion9_0_4 : SubtargetFeatureISAVersion <9,0,4,
   [FeatureGFX9,
    FeatureLDSBankCount32,
-   FeatureFmaMixInsts,
-   FeatureD16PreservesUnusedBits]>;
+   FeatureFmaMixInsts]>;
 
 def FeatureISAVersion9_0_6 : SubtargetFeatureISAVersion <9,0,6,
   [FeatureGFX9,
    HalfRate64Ops,
    FeatureFmaMixInsts,
    FeatureLDSBankCount32,
-   FeatureDLInsts]>;
+   FeatureDLInsts,
+   FeatureSRAMECC]>;
 
 def FeatureISAVersion9_0_9 : SubtargetFeatureISAVersion <9,0,9,
   [FeatureGFX9,
    FeatureMadMixInsts,
    FeatureLDSBankCount32,
-   FeatureXNACK,
-   FeatureD16PreservesUnusedBits]>;
+   FeatureXNACK]>;
 
 //===----------------------------------------------------------------------===//
 // Debugger related subtarget features.
@@ -684,8 +678,9 @@ def HasUnpackedD16VMem : Predicate<"Subtarget->hasUnpackedD16VMem()">,
 def HasPackedD16VMem : Predicate<"!Subtarget->hasUnpackedD16VMem()">,
   AssemblerPredicate<"!FeatureUnpackedD16VMem">;
 
-def D16PreservesUnusedBits : Predicate<"Subtarget->d16PreservesUnusedBits()">,
-  AssemblerPredicate<"FeatureD16PreservesUnusedBits">;
+def D16PreservesUnusedBits :
+  Predicate<"Subtarget->hasD16LoadStore() && !Subtarget->isSRAMECCEnabled()">,
+  AssemblerPredicate<"FeatureGFX9Insts,!FeatureSRAMECC">;
 
 def LDSRequiresM0Init : Predicate<"Subtarget->ldsRequiresM0Init()">;
 def NotLDSRequiresM0Init : Predicate<"!Subtarget->ldsRequiresM0Init()">;
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 9a7e6918d41..f1acd72b03a 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -198,7 +198,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
     HasDPP(false),
     HasR128A16(false),
     HasDLInsts(false),
-    D16PreservesUnusedBits(false),
+    EnableSRAMECC(false),
     FlatAddressSpace(false),
     FlatInstOffsets(false),
     FlatGlobalInsts(false),
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 162305ddee2..8b1cb23c672 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -353,7 +353,7 @@ protected:
   bool HasDPP;
   bool HasR128A16;
   bool HasDLInsts;
-  bool D16PreservesUnusedBits;
+  bool EnableSRAMECC;
   bool FlatAddressSpace;
   bool FlatInstOffsets;
   bool FlatGlobalInsts;
@@ -679,8 +679,8 @@ public:
     return HasDLInsts;
   }
 
-  bool d16PreservesUnusedBits() const {
-    return D16PreservesUnusedBits;
+  bool isSRAMECCEnabled() const {
+    return EnableSRAMECC;
   }
 
   // Scratch is allocated in 256 dword per wave blocks for the entire
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index a7b8c11288f..225bf5b7816 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -347,6 +347,10 @@ AMDGPUTargetELFStreamer::AMDGPUTargetELFStreamer(
   if (AMDGPU::hasXNACK(STI))
     EFlags |= ELF::EF_AMDGPU_XNACK;
 
+  EFlags &= ~ELF::EF_AMDGPU_SRAM_ECC;
+  if (AMDGPU::hasSRAMECC(STI))
+    EFlags |= ELF::EF_AMDGPU_SRAM_ECC;
+
   MCA.setELFHeaderEFlags(EFlags);
 }
 
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 00e9ff7abfd..9d567579d71 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -152,6 +152,8 @@ void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream) {
 
   if (hasXNACK(*STI))
     Stream << "+xnack";
+  if (hasSRAMECC(*STI))
+    Stream << "+sram-ecc";
 
   Stream.flush();
 }
@@ -593,6 +595,10 @@ bool hasXNACK(const MCSubtargetInfo &STI) {
   return STI.getFeatureBits()[AMDGPU::FeatureXNACK];
 }
 
+bool hasSRAMECC(const MCSubtargetInfo &STI) {
+  return STI.getFeatureBits()[AMDGPU::FeatureSRAMECC];
+}
+
 bool hasMIMG_R128(const MCSubtargetInfo &STI) {
   return STI.getFeatureBits()[AMDGPU::FeatureMIMG_R128];
 }
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 699b17061d7..af5ab9bf269 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -342,6 +342,7 @@ inline bool isKernel(CallingConv::ID CC) {
 }
 
 bool hasXNACK(const MCSubtargetInfo &STI);
+bool hasSRAMECC(const MCSubtargetInfo &STI);
 bool hasMIMG_R128(const MCSubtargetInfo &STI);
 bool hasPackedD16(const MCSubtargetInfo &STI);
 
diff --git a/test/CodeGen/AMDGPU/directive-amdgcn-target.ll b/test/CodeGen/AMDGPU/directive-amdgcn-target.ll
index 757da908af9..4218cee9f1e 100644
--- a/test/CodeGen/AMDGPU/directive-amdgcn-target.ll
+++ b/test/CodeGen/AMDGPU/directive-amdgcn-target.ll
@@ -34,6 +34,12 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+code-object-v3,+xnack < %s | FileCheck --check-prefixes=XNACK-GFX900 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx902 -mattr=+code-object-v3,-xnack < %s | FileCheck --check-prefixes=NO-XNACK-GFX902 %s
 
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx904 -mattr=+code-object-v3,+sram-ecc < %s | FileCheck --check-prefixes=SRAM-ECC-GFX904 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=+code-object-v3,-sram-ecc < %s | FileCheck --check-prefixes=NO-SRAM-ECC-GFX906 %s
+
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx904 -mattr=+code-object-v3,+sram-ecc,+xnack < %s | FileCheck --check-prefixes=SRAM-ECC-XNACK-GFX904 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=+code-object-v3,+xnack < %s | FileCheck --check-prefixes=XNACK-GFX906 %s
+
 ; GFX600: .amdgcn_target "amdgcn-amd-amdhsa--gfx600"
 ; GFX601: .amdgcn_target "amdgcn-amd-amdhsa--gfx601"
 ; GFX700: .amdgcn_target "amdgcn-amd-amdhsa--gfx700"
@@ -48,10 +54,16 @@
 ; GFX900: .amdgcn_target "amdgcn-amd-amdhsa--gfx900"
 ; GFX902: .amdgcn_target "amdgcn-amd-amdhsa--gfx902+xnack"
 ; GFX904: .amdgcn_target "amdgcn-amd-amdhsa--gfx904"
-; GFX906: .amdgcn_target "amdgcn-amd-amdhsa--gfx906"
+; GFX906: .amdgcn_target "amdgcn-amd-amdhsa--gfx906+sram-ecc"
 
 ; XNACK-GFX900: .amdgcn_target "amdgcn-amd-amdhsa--gfx900+xnack"
-; NO-XNACK-GFX902: .amdgcn_target "amdgcn-amd-amdhsa--gfx902
+; NO-XNACK-GFX902: .amdgcn_target "amdgcn-amd-amdhsa--gfx902"
+
+; SRAM-ECC-GFX904: .amdgcn_target "amdgcn-amd-amdhsa--gfx904+sram-ecc"
+; NO-SRAM-ECC-GFX906: "amdgcn-amd-amdhsa--gfx906"
+
+; SRAM-ECC-XNACK-GFX904: .amdgcn_target "amdgcn-amd-amdhsa--gfx904+xnack+sram-ecc"
+; XNACK-GFX906: .amdgcn_target "amdgcn-amd-amdhsa--gfx906+xnack+sram-ecc"
 
 define amdgpu_kernel void @directive_amdgcn_target() {
   ret void
diff --git a/test/CodeGen/AMDGPU/elf-header-flags-mach.ll b/test/CodeGen/AMDGPU/elf-header-flags-mach.ll
index 5887951b4e6..b64e077a59c 100644
--- a/test/CodeGen/AMDGPU/elf-header-flags-mach.ll
+++ b/test/CodeGen/AMDGPU/elf-header-flags-mach.ll
@@ -86,6 +86,7 @@
 ; GFX902-NEXT:   EF_AMDGPU_XNACK              (0x100)
 ; GFX904:        EF_AMDGPU_MACH_AMDGCN_GFX904 (0x2E)
 ; GFX906:        EF_AMDGPU_MACH_AMDGCN_GFX906 (0x2F)
+; GFX906-NEXT:   EF_AMDGPU_SRAM_ECC           (0x200)
 ; GFX909:        EF_AMDGPU_MACH_AMDGCN_GFX909 (0x31)
 ; ALL:         ]
 
diff --git a/test/CodeGen/AMDGPU/elf-header-flags-sram-ecc.ll b/test/CodeGen/AMDGPU/elf-header-flags-sram-ecc.ll
new file mode 100644
index 00000000000..b33b29b59ec
--- /dev/null
+++ b/test/CodeGen/AMDGPU/elf-header-flags-sram-ecc.ll
@@ -0,0 +1,38 @@
+; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx902 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=NO-SRAM-ECC-GFX902 %s
+; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx902 -mattr=-sram-ecc < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=NO-SRAM-ECC-GFX902 %s
+; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx902 -mattr=+sram-ecc < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=SRAM-ECC-GFX902 %s
+
+; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=SRAM-ECC-GFX906 %s
+; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 -mattr=-sram-ecc < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=NO-SRAM-ECC-GFX906 %s
+; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 -mattr=+sram-ecc < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=SRAM-ECC-GFX906 %s
+; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 -mattr=+sram-ecc,+xnack < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=SRAM-ECC-XNACK-GFX906 %s
+
+; NO-SRAM-ECC-GFX902:      Flags [
+; NO-SRAM-ECC-GFX902-NEXT:   EF_AMDGPU_MACH_AMDGCN_GFX902 (0x2D)
+; NO-SRAM-ECC-GFX902-NEXT:   EF_AMDGPU_XNACK              (0x100)
+; NO-SRAM-ECC-GFX902-NEXT: ]
+
+; SRAM-ECC-GFX902:      Flags [
+; SRAM-ECC-GFX902-NEXT:   EF_AMDGPU_MACH_AMDGCN_GFX902 (0x2D)
+; SRAM-ECC-GFX902-NEXT:   EF_AMDGPU_SRAM_ECC           (0x200)
+; SRAM-ECC-GFX902-NEXT:   EF_AMDGPU_XNACK              (0x100)
+; SRAM-ECC-GFX902-NEXT: ]
+
+; NO-SRAM-ECC-GFX906:      Flags [
+; NO-SRAM-ECC-GFX906-NEXT:   EF_AMDGPU_MACH_AMDGCN_GFX906 (0x2F)
+; NO-SRAM-ECC-GFX906-NEXT: ]
+
+; SRAM-ECC-GFX906:      Flags [
+; SRAM-ECC-GFX906-NEXT:   EF_AMDGPU_MACH_AMDGCN_GFX906 (0x2F)
+; SRAM-ECC-GFX906-NEXT:   EF_AMDGPU_SRAM_ECC           (0x200)
+; SRAM-ECC-GFX906-NEXT: ]
+
+; SRAM-ECC-XNACK-GFX906:      Flags [
+; SRAM-ECC-XNACK-GFX906-NEXT:   EF_AMDGPU_MACH_AMDGCN_GFX906 (0x2F)
+; SRAM-ECC-XNACK-GFX906-NEXT:   EF_AMDGPU_SRAM_ECC           (0x200)
+; SRAM-ECC-XNACK-GFX906-NEXT:   EF_AMDGPU_XNACK              (0x100)
+; SRAM-ECC-XNACK-GFX906-NEXT: ]
+
+define amdgpu_kernel void @elf_header() {
+  ret void
+}
diff --git a/test/Object/AMDGPU/elf-header-flags-sram-ecc.yaml b/test/Object/AMDGPU/elf-header-flags-sram-ecc.yaml
new file mode 100644
index 00000000000..78b2913be04
--- /dev/null
+++ b/test/Object/AMDGPU/elf-header-flags-sram-ecc.yaml
@@ -0,0 +1,61 @@
+# RUN: yaml2obj -docnum=1 %s > %t.o.1
+# RUN: llvm-readobj -s -file-headers %t.o.1 | FileCheck --check-prefixes=ELF-ALL,ELF-SRAM-ECC-NONE %s
+# RUN: obj2yaml %t.o.1 | FileCheck --check-prefixes=YAML-SRAM-ECC-NONE %s
+# RUN: yaml2obj -docnum=2 %s > %t.o.2
+# RUN: llvm-readobj -s -file-headers %t.o.2 | FileCheck --check-prefixes=ELF-ALL,ELF-SRAM-ECC-GFX900 %s
+# RUN: obj2yaml %t.o.2 | FileCheck --check-prefixes=YAML-SRAM-ECC-GFX900 %s
+# RUN: yaml2obj -docnum=3 %s > %t.o.3
+# RUN: llvm-readobj -s -file-headers %t.o.3 | FileCheck --check-prefixes=ELF-ALL,ELF-SRAM-ECC-XNACK-GFX900 %s
+# RUN: obj2yaml %t.o.3 | FileCheck --check-prefixes=YAML-SRAM-ECC-XNACK-GFX900 %s
+
+# ELF-SRAM-ECC-NONE:      Flags [
+# ELF-SRAM-ECC-NONE-NEXT:   EF_AMDGPU_SRAM_ECC (0x200)
+# ELF-SRAM-ECC-NONE-NEXT: ]
+
+# ELF-SRAM-ECC-GFX900:      Flags [
+# ELF-SRAM-ECC-GFX900-NEXT:   EF_AMDGPU_MACH_AMDGCN_GFX900 (0x2C)
+# ELF-SRAM-ECC-GFX900-NEXT:   EF_AMDGPU_SRAM_ECC           (0x200)
+# ELF-SRAM-ECC-GFX900-NEXT: ]
+
+# ELF-SRAM-ECC-XNACK-GFX900:      Flags [
+# ELF-SRAM-ECC-XNACK-GFX900-NEXT:   EF_AMDGPU_MACH_AMDGCN_GFX900 (0x2C)
+# ELF-SRAM-ECC-XNACK-GFX900-NEXT:   EF_AMDGPU_SRAM_ECC           (0x200)
+# ELF-SRAM-ECC-XNACK-GFX900-NEXT:   EF_AMDGPU_XNACK              (0x100)
+# ELF-SRAM-ECC-XNACK-GFX900-NEXT: ]
+
+# YAML-SRAM-ECC-NONE:         Flags: [ EF_AMDGPU_MACH_NONE, EF_AMDGPU_SRAM_ECC ]
+# YAML-SRAM-ECC-GFX900:       Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX900, EF_AMDGPU_SRAM_ECC ]
+# YAML-SRAM-ECC-XNACK-GFX900: Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX900, EF_AMDGPU_XNACK, EF_AMDGPU_SRAM_ECC ]
+
+# Doc1
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  OSABI:   ELFOSABI_NONE
+  Type:    ET_REL
+  Machine: EM_AMDGPU
+  Flags:   [ EF_AMDGPU_SRAM_ECC ]
+...
+
+# Doc2
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  OSABI:   ELFOSABI_NONE
+  Type:    ET_REL
+  Machine: EM_AMDGPU
+  Flags:   [ EF_AMDGPU_MACH_AMDGCN_GFX900, EF_AMDGPU_SRAM_ECC ]
+...
+
+# Doc3
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  OSABI:   ELFOSABI_NONE
+  Type:    ET_REL
+  Machine: EM_AMDGPU
+  Flags:   [ EF_AMDGPU_MACH_AMDGCN_GFX900, EF_AMDGPU_XNACK, EF_AMDGPU_SRAM_ECC ]
+...
diff --git a/tools/llvm-readobj/ELFDumper.cpp b/tools/llvm-readobj/ELFDumper.cpp
index a1cf0aef1b4..ae9da9ace22 100644
--- a/tools/llvm-readobj/ELFDumper.cpp
+++ b/tools/llvm-readobj/ELFDumper.cpp
@@ -1355,7 +1355,8 @@ static const EnumEntry<unsigned> ElfHeaderAMDGPUFlags[] = {
   LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX904),
   LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX906),
   LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX909),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_XNACK)
+  LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_XNACK),
+  LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_SRAM_ECC)
 };
 
 static const EnumEntry<unsigned> ElfHeaderRISCVFlags[] = {
-- 
GitLab


From 40f2fec2547cb5509e2f7d82f588f0cd32af6f47 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Mon, 5 Nov 2018 23:26:13 +0000
Subject: [PATCH 0984/1116] [TargetLowering] Change
 TargetLoweringBase::getPreferredVectorAction to take an MVT instead of an
 EVT. NFC

The main caller of this already has an MVT and several targets called getSimpleVT inside without checking isSimple. This makes the simpleness explicit.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346180 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/TargetLowering.h             | 2 +-
 lib/Target/AArch64/AArch64ISelLowering.cpp        | 7 +++----
 lib/Target/AArch64/AArch64ISelLowering.h          | 2 +-
 lib/Target/AMDGPU/SIISelLowering.cpp              | 2 +-
 lib/Target/AMDGPU/SIISelLowering.h                | 2 +-
 lib/Target/Hexagon/HexagonISelLowering.cpp        | 4 ++--
 lib/Target/Hexagon/HexagonISelLowering.h          | 2 +-
 lib/Target/Hexagon/HexagonTargetTransformInfo.cpp | 2 +-
 lib/Target/NVPTX/NVPTXISelLowering.cpp            | 2 +-
 lib/Target/NVPTX/NVPTXISelLowering.h              | 2 +-
 lib/Target/PowerPC/PPCISelLowering.h              | 2 +-
 lib/Target/SystemZ/SystemZISelLowering.h          | 2 +-
 lib/Target/X86/X86ISelLowering.cpp                | 4 ++--
 lib/Target/X86/X86ISelLowering.h                  | 2 +-
 14 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/include/llvm/CodeGen/TargetLowering.h b/include/llvm/CodeGen/TargetLowering.h
index 96a52abd453..38e575b1360 100644
--- a/include/llvm/CodeGen/TargetLowering.h
+++ b/include/llvm/CodeGen/TargetLowering.h
@@ -279,7 +279,7 @@ public:
 
   /// Return the preferred vector type legalization action.
   virtual TargetLoweringBase::LegalizeTypeAction
-  getPreferredVectorAction(EVT VT) const {
+  getPreferredVectorAction(MVT VT) const {
     // The default action for one element vectors is to scalarize
     if (VT.getVectorNumElements() == 1)
       return TypeScalarizeVector;
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index a18284f892c..101e20c8f20 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -11506,12 +11506,11 @@ unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
 }
 
 TargetLoweringBase::LegalizeTypeAction
-AArch64TargetLowering::getPreferredVectorAction(EVT VT) const {
-  MVT SVT = VT.getSimpleVT();
+AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
   // During type legalization, we prefer to widen v1i8, v1i16, v1i32  to v8i8,
   // v4i16, v2i32 instead of to promote.
-  if (SVT == MVT::v1i8 || SVT == MVT::v1i16 || SVT == MVT::v1i32
-      || SVT == MVT::v1f32)
+  if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
+      VT == MVT::v1f32)
     return TypeWidenVector;
 
   return TargetLoweringBase::getPreferredVectorAction(VT);
diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h
index 7b4119a21d0..7ee3b82a4ac 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@@ -395,7 +395,7 @@ public:
 
   bool useLoadStackGuardNode() const override;
   TargetLoweringBase::LegalizeTypeAction
-  getPreferredVectorAction(EVT VT) const override;
+  getPreferredVectorAction(MVT VT) const override;
 
   /// If the target has a standard location for the stack protector cookie,
   /// returns the address of that location. Otherwise, returns nullptr.
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 88e07b99e72..672784a9873 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1185,7 +1185,7 @@ bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
 }
 
 TargetLoweringBase::LegalizeTypeAction
-SITargetLowering::getPreferredVectorAction(EVT VT) const {
+SITargetLowering::getPreferredVectorAction(MVT VT) const {
   if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
     return TypeSplitVector;
 
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index 09e0a12cce8..d12c3ae4dba 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -234,7 +234,7 @@ public:
   bool isCheapAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
 
   TargetLoweringBase::LegalizeTypeAction
-  getPreferredVectorAction(EVT VT) const override;
+  getPreferredVectorAction(MVT VT) const override;
 
   bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                         Type *Ty) const override;
diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp
index 7a708a8ac24..755a8539be7 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -1834,12 +1834,12 @@ bool HexagonTargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask,
 }
 
 TargetLoweringBase::LegalizeTypeAction
-HexagonTargetLowering::getPreferredVectorAction(EVT VT) const {
+HexagonTargetLowering::getPreferredVectorAction(MVT VT) const {
   if (VT.getVectorNumElements() == 1)
     return TargetLoweringBase::TypeScalarizeVector;
 
   // Always widen vectors of i1.
-  MVT ElemTy = VT.getSimpleVT().getVectorElementType();
+  MVT ElemTy = VT.getVectorElementType();
   if (ElemTy == MVT::i1)
     return TargetLoweringBase::TypeWidenVector;
 
diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h
index 39af19b9b07..265c37e6ae6 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/lib/Target/Hexagon/HexagonISelLowering.h
@@ -141,7 +141,7 @@ namespace HexagonISD {
         unsigned DefinedValues) const override;
 
     bool isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const override;
-    TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT)
+    TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT)
         const override;
 
     SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
diff --git a/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
index 4c671460c90..c942f645aa8 100644
--- a/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -54,7 +54,7 @@ bool HexagonTTIImpl::isTypeForHVX(Type *VecTy) const {
     return false;
   if (ST.isHVXVectorType(VecVT.getSimpleVT()))
     return true;
-  auto Action = TLI.getPreferredVectorAction(VecVT);
+  auto Action = TLI.getPreferredVectorAction(VecVT.getSimpleVT());
   return Action == TargetLoweringBase::TypeWidenVector;
 }
 
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 1f323b63034..c352b9b9c9d 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -1170,7 +1170,7 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
 }
 
 TargetLoweringBase::LegalizeTypeAction
-NVPTXTargetLowering::getPreferredVectorAction(EVT VT) const {
+NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const {
   if (VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1)
     return TypeSplitVector;
   if (VT == MVT::v2f16)
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.h b/lib/Target/NVPTX/NVPTXISelLowering.h
index ef04a8573d4..3e109f75b66 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -511,7 +511,7 @@ public:
   }
 
   TargetLoweringBase::LegalizeTypeAction
-  getPreferredVectorAction(EVT VT) const override;
+  getPreferredVectorAction(MVT VT) const override;
 
   // Get the degree of precision we want from 32-bit floating point division
   // operations.
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index d597e9348a1..1020cab48c8 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -569,7 +569,7 @@ namespace llvm {
     /// of v4i8's and shuffle them. This will turn into a mess of 8 extending
     /// loads, moves back into VSR's (or memory ops if we don't have moves) and
     /// then the VPERM for the shuffle. All in all a very slow sequence.
-    TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT)
+    TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT)
       const override {
       if (VT.getScalarSizeInBits() % 8 == 0)
         return TypeWidenVector;
diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h
index 6a29ed62065..9bf94407947 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/lib/Target/SystemZ/SystemZISelLowering.h
@@ -379,7 +379,7 @@ public:
     // want to clobber the upper 32 bits of a GPR unnecessarily.
     return MVT::i32;
   }
-  TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT)
+  TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT)
     const override {
     // Widen subvectors to the full width rather than promoting integer
     // elements.  This is better because:
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 9e7a41c752a..21b835ec5ba 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1787,13 +1787,13 @@ SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
 }
 
 TargetLoweringBase::LegalizeTypeAction
-X86TargetLowering::getPreferredVectorAction(EVT VT) const {
+X86TargetLowering::getPreferredVectorAction(MVT VT) const {
   if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
     return TypeSplitVector;
 
   if (ExperimentalVectorWideningLegalization &&
       VT.getVectorNumElements() != 1 &&
-      VT.getVectorElementType().getSimpleVT() != MVT::i1)
+      VT.getVectorElementType() != MVT::i1)
     return TypeWidenVector;
 
   return TargetLoweringBase::getPreferredVectorAction(VT);
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index fea7ecbdbb4..7cda0259bf2 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -1113,7 +1113,7 @@ namespace llvm {
     bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
 
     /// Customize the preferred legalization strategy for certain types.
-    LegalizeTypeAction getPreferredVectorAction(EVT VT) const override;
+    LegalizeTypeAction getPreferredVectorAction(MVT VT) const override;
 
     MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
                                       EVT VT) const override;
-- 
GitLab


From 1d019aeea899b139302bf32a02e46d50e89c52a9 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Mon, 5 Nov 2018 23:27:53 +0000
Subject: [PATCH 0985/1116] [DWARF] Support types CU list in .gdb_index dumping

Some executables have non-empty types CU list and -gdb-index would report "<error reporting>" before.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346181 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/DebugInfo/DWARF/DWARFGdbIndex.h |  9 ++++++++
 lib/DebugInfo/DWARF/DWARFGdbIndex.cpp        | 23 +++++++++++++++++---
 2 files changed, 29 insertions(+), 3 deletions(-)

diff --git a/include/llvm/DebugInfo/DWARF/DWARFGdbIndex.h b/include/llvm/DebugInfo/DWARF/DWARFGdbIndex.h
index 8d1ac5c83c2..073e02903c3 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFGdbIndex.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFGdbIndex.h
@@ -24,6 +24,7 @@ class DWARFGdbIndex {
   uint32_t Version;
 
   uint32_t CuListOffset;
+  uint32_t TuListOffset;
   uint32_t AddressAreaOffset;
   uint32_t SymbolTableOffset;
   uint32_t ConstantPoolOffset;
@@ -34,6 +35,13 @@ class DWARFGdbIndex {
   };
   SmallVector<CompUnitEntry, 0> CuList;
 
+  struct TypeUnitEntry {
+    uint64_t Offset;
+    uint64_t TypeOffset;
+    uint64_t TypeSignature;
+  };
+  SmallVector<TypeUnitEntry, 0> TuList;
+
   struct AddressEntry {
     uint64_t LowAddress;  /// The low address.
     uint64_t HighAddress; /// The high address.
@@ -55,6 +63,7 @@ class DWARFGdbIndex {
   uint32_t StringPoolOffset;
 
   void dumpCUList(raw_ostream &OS) const;
+  void dumpTUList(raw_ostream &OS) const;
   void dumpAddressArea(raw_ostream &OS) const;
   void dumpSymbolTable(raw_ostream &OS) const;
   void dumpConstantPool(raw_ostream &OS) const;
diff --git a/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp b/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp
index ebd6104ab87..1abd931e3b8 100644
--- a/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp
+++ b/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp
@@ -11,6 +11,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Format.h"
+#include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 #include <cassert>
@@ -33,6 +34,16 @@ void DWARFGdbIndex::dumpCUList(raw_ostream &OS) const {
                  CU.Length);
 }
 
+void DWARFGdbIndex::dumpTUList(raw_ostream &OS) const {
+  OS << formatv("\n  Types CU list offset = {0:x}, has {1} entries:\n",
+                TuListOffset, TuList.size());
+  uint32_t I = 0;
+  for (const TypeUnitEntry &TU : TuList)
+    OS << formatv("    {0}: offset = {1:x8}, type_offset = {2:x8}, "
+                  "type_signature = {3:x16}\n",
+                  I++, TU.Offset, TU.TypeOffset, TU.TypeSignature);
+}
+
 void DWARFGdbIndex::dumpAddressArea(raw_ostream &OS) const {
   OS << format("\n  Address area offset = 0x%x, has %" PRId64 " entries:",
                AddressAreaOffset, (uint64_t)AddressArea.size())
@@ -94,6 +105,7 @@ void DWARFGdbIndex::dump(raw_ostream &OS) {
   if (HasContent) {
     OS << "  Version = " << Version << '\n';
     dumpCUList(OS);
+    dumpTUList(OS);
     dumpAddressArea(OS);
     dumpSymbolTable(OS);
     dumpConstantPool(OS);
@@ -127,9 +139,14 @@ bool DWARFGdbIndex::parseImpl(DataExtractor Data) {
 
   // CU Types are no longer needed as DWARF skeleton type units never made it
   // into the standard.
-  uint32_t CuTypesListSize = (AddressAreaOffset - CuTypesOffset) / 24;
-  if (CuTypesListSize != 0)
-    return false;
+  uint32_t TuListSize = (AddressAreaOffset - CuTypesOffset) / 24;
+  TuList.resize(TuListSize);
+  for (uint32_t I = 0; I < TuListSize; ++I) {
+    uint64_t CuOffset = Data.getU64(&Offset);
+    uint64_t TypeOffset = Data.getU64(&Offset);
+    uint64_t Signature = Data.getU64(&Offset);
+    TuList[I] = {CuOffset, TypeOffset, Signature};
+  }
 
   uint32_t AddressAreaSize = (SymbolTableOffset - AddressAreaOffset) / 20;
   AddressArea.reserve(AddressAreaSize);
-- 
GitLab


From 51c2c7a40e3e2dbc657da21de955e232abb8d931 Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze@braunis.de>
Date: Mon, 5 Nov 2018 23:49:13 +0000
Subject: [PATCH 0986/1116] MachineModuleInfo: Store more specific reference to
 LLVMTargetMachine; NFC

MachineModuleInfo can only be used in code using lib/CodeGen, hence we
can keep a more specific reference to LLVMTargetMachine rather than just
TargetMachine around.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346182 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/MachineModuleInfo.h      |  8 +++----
 lib/CodeGen/MachineModuleInfo.cpp             |  2 +-
 tools/llvm-exegesis/lib/Assembler.cpp         |  4 +++-
 unittests/CodeGen/AArch64SelectionDAGTest.cpp |  7 ++++---
 .../CodeGen/GlobalISel/LegalizerHelperTest.h  | 11 +++++-----
 .../CodeGen/GlobalISel/PatternMatchTest.cpp   | 21 ++++++++++---------
 unittests/MI/LiveIntervalTest.cpp             | 11 +++++-----
 unittests/Target/AArch64/InstSizes.cpp        | 15 ++++++-------
 .../WebAssemblyExceptionInfoTest.cpp          | 13 ++++++------
 9 files changed, 50 insertions(+), 42 deletions(-)

diff --git a/include/llvm/CodeGen/MachineModuleInfo.h b/include/llvm/CodeGen/MachineModuleInfo.h
index 554e89019b7..4371420bc7a 100644
--- a/include/llvm/CodeGen/MachineModuleInfo.h
+++ b/include/llvm/CodeGen/MachineModuleInfo.h
@@ -46,10 +46,10 @@ namespace llvm {
 class BasicBlock;
 class CallInst;
 class Function;
-class MachineFunction;
+class LLVMTargetMachine;
 class MMIAddrLabelMap;
+class MachineFunction;
 class Module;
-class TargetMachine;
 
 //===----------------------------------------------------------------------===//
 /// This class can be derived from and used by targets to hold private
@@ -76,7 +76,7 @@ protected:
 /// for specific use.
 ///
 class MachineModuleInfo : public ImmutablePass {
-  const TargetMachine &TM;
+  const LLVMTargetMachine &TM;
 
   /// This is the MCContext used for the entire code generator.
   MCContext Context;
@@ -145,7 +145,7 @@ class MachineModuleInfo : public ImmutablePass {
 public:
   static char ID; // Pass identification, replacement for typeid
 
-  explicit MachineModuleInfo(const TargetMachine *TM = nullptr);
+  explicit MachineModuleInfo(const LLVMTargetMachine *TM = nullptr);
   ~MachineModuleInfo() override;
 
   // Initialization and Finalization
diff --git a/lib/CodeGen/MachineModuleInfo.cpp b/lib/CodeGen/MachineModuleInfo.cpp
index ce556903dc0..6ef8de88f8b 100644
--- a/lib/CodeGen/MachineModuleInfo.cpp
+++ b/lib/CodeGen/MachineModuleInfo.cpp
@@ -194,7 +194,7 @@ void MMIAddrLabelMapCallbackPtr::allUsesReplacedWith(Value *V2) {
   Map->UpdateForRAUWBlock(cast<BasicBlock>(getValPtr()), cast<BasicBlock>(V2));
 }
 
-MachineModuleInfo::MachineModuleInfo(const TargetMachine *TM)
+MachineModuleInfo::MachineModuleInfo(const LLVMTargetMachine *TM)
   : ImmutablePass(ID), TM(*TM),
     Context(TM->getMCAsmInfo(), TM->getMCRegisterInfo(),
             TM->getObjFileLowering(), nullptr, false) {
diff --git a/tools/llvm-exegesis/lib/Assembler.cpp b/tools/llvm-exegesis/lib/Assembler.cpp
index 771a6e9ad24..2e3712ce7dc 100644
--- a/tools/llvm-exegesis/lib/Assembler.cpp
+++ b/tools/llvm-exegesis/lib/Assembler.cpp
@@ -142,8 +142,10 @@ llvm::BitVector getFunctionReservedRegs(const llvm::TargetMachine &TM) {
       llvm::make_unique<llvm::LLVMContext>();
   std::unique_ptr<llvm::Module> Module =
       createModule(Context, TM.createDataLayout());
+  // TODO: This only works for targets implementing LLVMTargetMachine.
+  const LLVMTargetMachine &LLVMTM = static_cast<const LLVMTargetMachine&>(TM);
   std::unique_ptr<llvm::MachineModuleInfo> MMI =
-      llvm::make_unique<llvm::MachineModuleInfo>(&TM);
+      llvm::make_unique<llvm::MachineModuleInfo>(&LLVMTM);
   llvm::MachineFunction &MF =
       createVoidVoidPtrMachineFunction(FunctionID, Module.get(), MMI.get());
   // Saving reserved registers for client.
diff --git a/unittests/CodeGen/AArch64SelectionDAGTest.cpp b/unittests/CodeGen/AArch64SelectionDAGTest.cpp
index dc2d1f9a357..0c184d37187 100644
--- a/unittests/CodeGen/AArch64SelectionDAGTest.cpp
+++ b/unittests/CodeGen/AArch64SelectionDAGTest.cpp
@@ -42,8 +42,9 @@ protected:
       return;
 
     TargetOptions Options;
-    TM = std::unique_ptr<TargetMachine>(T->createTargetMachine(
-        "AArch64", "", "", Options, None, None, CodeGenOpt::Aggressive));
+    TM = std::unique_ptr<LLVMTargetMachine>(static_cast<LLVMTargetMachine*>(
+        T->createTargetMachine("AArch64", "", "", Options, None, None,
+                               CodeGenOpt::Aggressive)));
     if (!TM)
       return;
 
@@ -70,7 +71,7 @@ protected:
   }
 
   LLVMContext Context;
-  std::unique_ptr<TargetMachine> TM = nullptr;
+  std::unique_ptr<LLVMTargetMachine> TM;
   std::unique_ptr<Module> M;
   Function *F;
   std::unique_ptr<MachineFunction> MF;
diff --git a/unittests/CodeGen/GlobalISel/LegalizerHelperTest.h b/unittests/CodeGen/GlobalISel/LegalizerHelperTest.h
index ca1aed544d2..28af811e1f1 100644
--- a/unittests/CodeGen/GlobalISel/LegalizerHelperTest.h
+++ b/unittests/CodeGen/GlobalISel/LegalizerHelperTest.h
@@ -44,7 +44,7 @@ void initLLVM() {
 
 /// Create a TargetMachine. As we lack a dedicated always available target for
 /// unittests, we go for "AArch64".
-std::unique_ptr<TargetMachine> createTargetMachine() {
+std::unique_ptr<LLVMTargetMachine> createTargetMachine() {
   Triple TargetTriple("aarch64--");
   std::string Error;
   const Target *T = TargetRegistry::lookupTarget("", TargetTriple, Error);
@@ -52,8 +52,9 @@ std::unique_ptr<TargetMachine> createTargetMachine() {
     return nullptr;
 
   TargetOptions Options;
-  return std::unique_ptr<TargetMachine>(T->createTargetMachine(
-      "AArch64", "", "", Options, None, None, CodeGenOpt::Aggressive));
+  return std::unique_ptr<LLVMTargetMachine>(static_cast<LLVMTargetMachine*>(
+      T->createTargetMachine("AArch64", "", "", Options, None, None,
+                             CodeGenOpt::Aggressive)));
 }
 
 std::unique_ptr<Module> parseMIR(LLVMContext &Context,
@@ -79,7 +80,7 @@ std::unique_ptr<Module> parseMIR(LLVMContext &Context,
 }
 
 std::pair<std::unique_ptr<Module>, std::unique_ptr<MachineModuleInfo>>
-createDummyModule(LLVMContext &Context, const TargetMachine &TM,
+createDummyModule(LLVMContext &Context, const LLVMTargetMachine &TM,
                   StringRef MIRFunc) {
   SmallString<512> S;
   StringRef MIRString = (Twine(R"MIR(
@@ -136,7 +137,7 @@ protected:
     B.setInsertPt(*EntryMBB, EntryMBB->end());
   }
   LLVMContext Context;
-  std::unique_ptr<TargetMachine> TM;
+  std::unique_ptr<LLVMTargetMachine> TM;
   MachineFunction *MF;
   std::pair<std::unique_ptr<Module>, std::unique_ptr<MachineModuleInfo>>
       ModuleMMIPair;
diff --git a/unittests/CodeGen/GlobalISel/PatternMatchTest.cpp b/unittests/CodeGen/GlobalISel/PatternMatchTest.cpp
index 8f17b1991df..1f3a690ad01 100644
--- a/unittests/CodeGen/GlobalISel/PatternMatchTest.cpp
+++ b/unittests/CodeGen/GlobalISel/PatternMatchTest.cpp
@@ -43,7 +43,7 @@ void initLLVM() {
 
 /// Create a TargetMachine. As we lack a dedicated always available target for
 /// unittests, we go for "AArch64".
-std::unique_ptr<TargetMachine> createTargetMachine() {
+std::unique_ptr<LLVMTargetMachine> createTargetMachine() {
   Triple TargetTriple("aarch64--");
   std::string Error;
   const Target *T = TargetRegistry::lookupTarget("", TargetTriple, Error);
@@ -51,8 +51,9 @@ std::unique_ptr<TargetMachine> createTargetMachine() {
     return nullptr;
 
   TargetOptions Options;
-  return std::unique_ptr<TargetMachine>(T->createTargetMachine(
-      "AArch64", "", "", Options, None, None, CodeGenOpt::Aggressive));
+  return std::unique_ptr<LLVMTargetMachine>(static_cast<LLVMTargetMachine*>(
+      T->createTargetMachine("AArch64", "", "", Options, None, None,
+                             CodeGenOpt::Aggressive)));
 }
 
 std::unique_ptr<Module> parseMIR(LLVMContext &Context,
@@ -78,7 +79,7 @@ std::unique_ptr<Module> parseMIR(LLVMContext &Context,
 }
 
 std::pair<std::unique_ptr<Module>, std::unique_ptr<MachineModuleInfo>>
-createDummyModule(LLVMContext &Context, const TargetMachine &TM,
+createDummyModule(LLVMContext &Context, const LLVMTargetMachine &TM,
                   StringRef MIRFunc) {
   SmallString<512> S;
   StringRef MIRString = (Twine(R"MIR(
@@ -122,7 +123,7 @@ static void collectCopies(SmallVectorImpl<unsigned> &Copies,
 
 TEST(PatternMatchInstr, MatchIntConstant) {
   LLVMContext Context;
-  std::unique_ptr<TargetMachine> TM = createTargetMachine();
+  std::unique_ptr<LLVMTargetMachine> TM = createTargetMachine();
   if (!TM)
     return;
   auto ModuleMMIPair = createDummyModule(Context, *TM, "");
@@ -143,7 +144,7 @@ TEST(PatternMatchInstr, MatchIntConstant) {
 
 TEST(PatternMatchInstr, MatchBinaryOp) {
   LLVMContext Context;
-  std::unique_ptr<TargetMachine> TM = createTargetMachine();
+  std::unique_ptr<LLVMTargetMachine> TM = createTargetMachine();
   if (!TM)
     return;
   auto ModuleMMIPair = createDummyModule(Context, *TM, "");
@@ -270,7 +271,7 @@ TEST(PatternMatchInstr, MatchBinaryOp) {
 
 TEST(PatternMatchInstr, MatchFPUnaryOp) {
   LLVMContext Context;
-  std::unique_ptr<TargetMachine> TM = createTargetMachine();
+  std::unique_ptr<LLVMTargetMachine> TM = createTargetMachine();
   if (!TM)
     return;
   auto ModuleMMIPair = createDummyModule(Context, *TM, "");
@@ -341,7 +342,7 @@ TEST(PatternMatchInstr, MatchFPUnaryOp) {
 
 TEST(PatternMatchInstr, MatchExtendsTrunc) {
   LLVMContext Context;
-  std::unique_ptr<TargetMachine> TM = createTargetMachine();
+  std::unique_ptr<LLVMTargetMachine> TM = createTargetMachine();
   if (!TM)
     return;
   auto ModuleMMIPair = createDummyModule(Context, *TM, "");
@@ -397,7 +398,7 @@ TEST(PatternMatchInstr, MatchExtendsTrunc) {
 
 TEST(PatternMatchInstr, MatchSpecificType) {
   LLVMContext Context;
-  std::unique_ptr<TargetMachine> TM = createTargetMachine();
+  std::unique_ptr<LLVMTargetMachine> TM = createTargetMachine();
   if (!TM)
     return;
   auto ModuleMMIPair = createDummyModule(Context, *TM, "");
@@ -444,7 +445,7 @@ TEST(PatternMatchInstr, MatchSpecificType) {
 
 TEST(PatternMatchInstr, MatchCombinators) {
   LLVMContext Context;
-  std::unique_ptr<TargetMachine> TM = createTargetMachine();
+  std::unique_ptr<LLVMTargetMachine> TM = createTargetMachine();
   if (!TM)
     return;
   auto ModuleMMIPair = createDummyModule(Context, *TM, "");
diff --git a/unittests/MI/LiveIntervalTest.cpp b/unittests/MI/LiveIntervalTest.cpp
index a39fd7f73cf..5ee9d13dbd9 100644
--- a/unittests/MI/LiveIntervalTest.cpp
+++ b/unittests/MI/LiveIntervalTest.cpp
@@ -35,7 +35,7 @@ void initLLVM() {
 /// Create a TargetMachine. As we lack a dedicated always available target for
 /// unittests, we go for "AMDGPU" to be able to test normal and subregister
 /// liveranges.
-std::unique_ptr<TargetMachine> createTargetMachine() {
+std::unique_ptr<LLVMTargetMachine> createTargetMachine() {
   Triple TargetTriple("amdgcn--");
   std::string Error;
   const Target *T = TargetRegistry::lookupTarget("", TargetTriple, Error);
@@ -43,13 +43,14 @@ std::unique_ptr<TargetMachine> createTargetMachine() {
     return nullptr;
 
   TargetOptions Options;
-  return std::unique_ptr<TargetMachine>(T->createTargetMachine(
-      "AMDGPU", "", "", Options, None, None, CodeGenOpt::Aggressive));
+  return std::unique_ptr<LLVMTargetMachine>(static_cast<LLVMTargetMachine*>(
+      T->createTargetMachine("AMDGPU", "", "", Options, None, None,
+                             CodeGenOpt::Aggressive)));
 }
 
 std::unique_ptr<Module> parseMIR(LLVMContext &Context,
     legacy::PassManagerBase &PM, std::unique_ptr<MIRParser> &MIR,
-    const TargetMachine &TM, StringRef MIRCode, const char *FuncName) {
+    const LLVMTargetMachine &TM, StringRef MIRCode, const char *FuncName) {
   SMDiagnostic Diagnostic;
   std::unique_ptr<MemoryBuffer> MBuffer = MemoryBuffer::getMemBuffer(MIRCode);
   MIR = createMIRParser(std::move(MBuffer), Context);
@@ -128,7 +129,7 @@ static void testHandleMove(MachineFunction &MF, LiveIntervals &LIS,
 
 static void liveIntervalTest(StringRef MIRFunc, LiveIntervalTest T) {
   LLVMContext Context;
-  std::unique_ptr<TargetMachine> TM = createTargetMachine();
+  std::unique_ptr<LLVMTargetMachine> TM = createTargetMachine();
   // This test is designed for the X86 backend; stop if it is not available.
   if (!TM)
     return;
diff --git a/unittests/Target/AArch64/InstSizes.cpp b/unittests/Target/AArch64/InstSizes.cpp
index e58df0a45cc..a70f43c4379 100644
--- a/unittests/Target/AArch64/InstSizes.cpp
+++ b/unittests/Target/AArch64/InstSizes.cpp
@@ -10,7 +10,7 @@
 using namespace llvm;
 
 namespace {
-std::unique_ptr<TargetMachine> createTargetMachine() {
+std::unique_ptr<LLVMTargetMachine> createTargetMachine() {
   auto TT(Triple::normalize("aarch64--"));
   std::string CPU("generic");
   std::string FS("");
@@ -22,8 +22,9 @@ std::unique_ptr<TargetMachine> createTargetMachine() {
   std::string Error;
   const Target *TheTarget = TargetRegistry::lookupTarget(TT, Error);
 
-  return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
-      TT, CPU, FS, TargetOptions(), None, None, CodeGenOpt::Default));
+  return std::unique_ptr<LLVMTargetMachine>(static_cast<LLVMTargetMachine*>(
+      TheTarget->createTargetMachine(TT, CPU, FS, TargetOptions(), None, None,
+                                     CodeGenOpt::Default)));
 }
 
 std::unique_ptr<AArch64InstrInfo> createInstrInfo(TargetMachine *TM) {
@@ -37,7 +38,7 @@ std::unique_ptr<AArch64InstrInfo> createInstrInfo(TargetMachine *TM) {
 /// TODO: Some of this might be useful for other architectures as well - extract
 ///       the platform-independent parts somewhere they can be reused.
 void runChecks(
-    TargetMachine *TM, AArch64InstrInfo *II, const StringRef InputIRSnippet,
+    LLVMTargetMachine *TM, AArch64InstrInfo *II, const StringRef InputIRSnippet,
     const StringRef InputMIRSnippet,
     std::function<void(AArch64InstrInfo &, MachineFunction &)> Checks) {
   LLVMContext Context;
@@ -78,7 +79,7 @@ void runChecks(
 } // anonymous namespace
 
 TEST(InstSizes, STACKMAP) {
-  std::unique_ptr<TargetMachine> TM = createTargetMachine();
+  std::unique_ptr<LLVMTargetMachine> TM = createTargetMachine();
   ASSERT_TRUE(TM);
   std::unique_ptr<AArch64InstrInfo> II = createInstrInfo(TM.get());
 
@@ -93,7 +94,7 @@ TEST(InstSizes, STACKMAP) {
 }
 
 TEST(InstSizes, PATCHPOINT) {
-  std::unique_ptr<TargetMachine> TM = createTargetMachine();
+  std::unique_ptr<LLVMTargetMachine> TM = createTargetMachine();
   std::unique_ptr<AArch64InstrInfo> II = createInstrInfo(TM.get());
 
   runChecks(TM.get(), II.get(), "",
@@ -108,7 +109,7 @@ TEST(InstSizes, PATCHPOINT) {
 }
 
 TEST(InstSizes, TLSDESC_CALLSEQ) {
-  std::unique_ptr<TargetMachine> TM = createTargetMachine();
+  std::unique_ptr<LLVMTargetMachine> TM = createTargetMachine();
   std::unique_ptr<AArch64InstrInfo> II = createInstrInfo(TM.get());
 
   runChecks(
diff --git a/unittests/Target/WebAssembly/WebAssemblyExceptionInfoTest.cpp b/unittests/Target/WebAssembly/WebAssemblyExceptionInfoTest.cpp
index 599f2e7f10f..095ee0665e1 100644
--- a/unittests/Target/WebAssembly/WebAssemblyExceptionInfoTest.cpp
+++ b/unittests/Target/WebAssembly/WebAssemblyExceptionInfoTest.cpp
@@ -22,7 +22,7 @@ using namespace llvm;
 
 namespace {
 
-std::unique_ptr<TargetMachine> createTargetMachine() {
+std::unique_ptr<LLVMTargetMachine> createTargetMachine() {
   auto TT(Triple::normalize("wasm32-unknown-unknown"));
   std::string CPU("");
   std::string FS("");
@@ -35,8 +35,9 @@ std::unique_ptr<TargetMachine> createTargetMachine() {
   const Target *TheTarget = TargetRegistry::lookupTarget(TT, Error);
   assert(TheTarget);
 
-  return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
-      TT, CPU, FS, TargetOptions(), None, None, CodeGenOpt::Default));
+  return std::unique_ptr<LLVMTargetMachine>(static_cast<LLVMTargetMachine*>(
+      TheTarget->createTargetMachine(TT, CPU, FS, TargetOptions(), None, None,
+                                     CodeGenOpt::Default)));
 }
 
 std::unique_ptr<Module> parseMIR(LLVMContext &Context,
@@ -64,7 +65,7 @@ std::unique_ptr<Module> parseMIR(LLVMContext &Context,
 } // namespace
 
 TEST(WebAssemblyExceptionInfoTest, TEST0) {
-  std::unique_ptr<TargetMachine> TM = createTargetMachine();
+  std::unique_ptr<LLVMTargetMachine> TM = createTargetMachine();
   ASSERT_TRUE(TM);
 
   StringRef MIRString = R"MIR(
@@ -227,7 +228,7 @@ body: |
 }
 
 TEST(WebAssemblyExceptionInfoTest, TEST1) {
-  std::unique_ptr<TargetMachine> TM = createTargetMachine();
+  std::unique_ptr<LLVMTargetMachine> TM = createTargetMachine();
   ASSERT_TRUE(TM);
 
   StringRef MIRString = R"MIR(
@@ -418,7 +419,7 @@ body: |
 
 // Terminate pad test
 TEST(WebAssemblyExceptionInfoTest, TEST2) {
-  std::unique_ptr<TargetMachine> TM = createTargetMachine();
+  std::unique_ptr<LLVMTargetMachine> TM = createTargetMachine();
   ASSERT_TRUE(TM);
 
   StringRef MIRString = R"MIR(
-- 
GitLab


From dad0df638d35bab2e4bf3ceac1502c00f14cc087 Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze@braunis.de>
Date: Mon, 5 Nov 2018 23:49:14 +0000
Subject: [PATCH 0987/1116] MachineFunction: Store more specific reference to
 LLVMTargetMachine; NFC

MachineFunction can only be used in code using lib/CodeGen, hence we
can keep a more specific reference to LLVMTargetMachine rather than just
TargetMachine around.

Do the same for references in ScheduleDAG and RegUsageInfoCollector.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346183 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/MachineFunction.h   | 8 ++++----
 include/llvm/CodeGen/RegisterUsageInfo.h | 6 +++---
 include/llvm/CodeGen/ScheduleDAG.h       | 6 +++---
 lib/CodeGen/MachineFunction.cpp          | 3 ++-
 lib/CodeGen/RegUsageInfoCollector.cpp    | 2 +-
 lib/CodeGen/RegisterUsageInfo.cpp        | 2 +-
 lib/Target/NVPTX/NVPTXAsmPrinter.cpp     | 2 +-
 7 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/include/llvm/CodeGen/MachineFunction.h b/include/llvm/CodeGen/MachineFunction.h
index bc81e485a80..35305bd53b2 100644
--- a/include/llvm/CodeGen/MachineFunction.h
+++ b/include/llvm/CodeGen/MachineFunction.h
@@ -58,6 +58,7 @@ class DILocalVariable;
 class DILocation;
 class Function;
 class GlobalValue;
+class LLVMTargetMachine;
 class MachineConstantPool;
 class MachineFrameInfo;
 class MachineFunction;
@@ -70,7 +71,6 @@ class Pass;
 class PseudoSourceValueManager;
 class raw_ostream;
 class SlotIndexes;
-class TargetMachine;
 class TargetRegisterClass;
 class TargetSubtargetInfo;
 struct WasmEHFuncInfo;
@@ -225,7 +225,7 @@ struct LandingPadInfo {
 
 class MachineFunction {
   const Function &F;
-  const TargetMachine &Target;
+  const LLVMTargetMachine &Target;
   const TargetSubtargetInfo *STI;
   MCContext &Ctx;
   MachineModuleInfo &MMI;
@@ -388,7 +388,7 @@ public:
   using VariableDbgInfoMapTy = SmallVector<VariableDbgInfo, 4>;
   VariableDbgInfoMapTy VariableDbgInfos;
 
-  MachineFunction(const Function &F, const TargetMachine &Target,
+  MachineFunction(const Function &F, const LLVMTargetMachine &Target,
                   const TargetSubtargetInfo &STI, unsigned FunctionNum,
                   MachineModuleInfo &MMI);
   MachineFunction(const MachineFunction &) = delete;
@@ -436,7 +436,7 @@ public:
   unsigned getFunctionNumber() const { return FunctionNumber; }
 
   /// getTarget - Return the target machine this machine code is compiled with
-  const TargetMachine &getTarget() const { return Target; }
+  const LLVMTargetMachine &getTarget() const { return Target; }
 
   /// getSubtarget - Return the subtarget for which this machine code is being
   /// compiled.
diff --git a/include/llvm/CodeGen/RegisterUsageInfo.h b/include/llvm/CodeGen/RegisterUsageInfo.h
index efd175eeed3..efecc61d9c3 100644
--- a/include/llvm/CodeGen/RegisterUsageInfo.h
+++ b/include/llvm/CodeGen/RegisterUsageInfo.h
@@ -29,7 +29,7 @@
 namespace llvm {
 
 class Function;
-class TargetMachine;
+class LLVMTargetMachine;
 
 class PhysicalRegisterUsageInfo : public ImmutablePass {
 public:
@@ -41,7 +41,7 @@ public:
   }
 
   /// Set TargetMachine which is used to print analysis.
-  void setTargetMachine(const TargetMachine &TM);
+  void setTargetMachine(const LLVMTargetMachine &TM);
 
   bool doInitialization(Module &M) override;
 
@@ -63,7 +63,7 @@ private:
   /// and 1 means content of register will be preserved around function call.
   DenseMap<const Function *, std::vector<uint32_t>> RegMasks;
 
-  const TargetMachine *TM;
+  const LLVMTargetMachine *TM;
 };
 
 } // end namespace llvm
diff --git a/include/llvm/CodeGen/ScheduleDAG.h b/include/llvm/CodeGen/ScheduleDAG.h
index f2b072768b2..0870d67db39 100644
--- a/include/llvm/CodeGen/ScheduleDAG.h
+++ b/include/llvm/CodeGen/ScheduleDAG.h
@@ -33,15 +33,15 @@
 namespace llvm {
 
 template<class Graph> class GraphWriter;
+class LLVMTargetMachine;
 class MachineFunction;
 class MachineRegisterInfo;
 class MCInstrDesc;
 struct MCSchedClassDesc;
-class ScheduleDAG;
 class SDNode;
 class SUnit;
+class ScheduleDAG;
 class TargetInstrInfo;
-class TargetMachine;
 class TargetRegisterClass;
 class TargetRegisterInfo;
 
@@ -558,7 +558,7 @@ class TargetRegisterInfo;
 
   class ScheduleDAG {
   public:
-    const TargetMachine &TM;            ///< Target processor
+    const LLVMTargetMachine &TM;        ///< Target processor
     const TargetInstrInfo *TII;         ///< Target instruction information
     const TargetRegisterInfo *TRI;      ///< Target processor register info
     MachineFunction &MF;                ///< Machine function
diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp
index 9e4963c4bdb..488481cec37 100644
--- a/lib/CodeGen/MachineFunction.cpp
+++ b/lib/CodeGen/MachineFunction.cpp
@@ -130,7 +130,8 @@ static inline unsigned getFnStackAlignment(const TargetSubtargetInfo *STI,
   return STI->getFrameLowering()->getStackAlignment();
 }
 
-MachineFunction::MachineFunction(const Function &F, const TargetMachine &Target,
+MachineFunction::MachineFunction(const Function &F,
+                                 const LLVMTargetMachine &Target,
                                  const TargetSubtargetInfo &STI,
                                  unsigned FunctionNum, MachineModuleInfo &mmi)
     : F(F), Target(Target), STI(&STI), Ctx(mmi.getContext()), MMI(mmi) {
diff --git a/lib/CodeGen/RegUsageInfoCollector.cpp b/lib/CodeGen/RegUsageInfoCollector.cpp
index 9db2af9f962..66c7c5cd7db 100644
--- a/lib/CodeGen/RegUsageInfoCollector.cpp
+++ b/lib/CodeGen/RegUsageInfoCollector.cpp
@@ -81,7 +81,7 @@ FunctionPass *llvm::createRegUsageInfoCollector() {
 bool RegUsageInfoCollector::runOnMachineFunction(MachineFunction &MF) {
   MachineRegisterInfo *MRI = &MF.getRegInfo();
   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
-  const TargetMachine &TM = MF.getTarget();
+  const LLVMTargetMachine &TM = MF.getTarget();
 
   LLVM_DEBUG(dbgs() << " -------------------- " << getPassName()
                     << " -------------------- \n");
diff --git a/lib/CodeGen/RegisterUsageInfo.cpp b/lib/CodeGen/RegisterUsageInfo.cpp
index 1b3fbc25b6e..6b9880a8913 100644
--- a/lib/CodeGen/RegisterUsageInfo.cpp
+++ b/lib/CodeGen/RegisterUsageInfo.cpp
@@ -40,7 +40,7 @@ INITIALIZE_PASS(PhysicalRegisterUsageInfo, "reg-usage-info",
 
 char PhysicalRegisterUsageInfo::ID = 0;
 
-void PhysicalRegisterUsageInfo::setTargetMachine(const TargetMachine &TM) {
+void PhysicalRegisterUsageInfo::setTargetMachine(const LLVMTargetMachine &TM) {
   this->TM = &TM;
 }
 
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 9d9c75aceca..aec0d7db81a 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -200,7 +200,7 @@ bool NVPTXAsmPrinter::lowerImageHandleOperand(const MachineInstr *MI,
 
 void NVPTXAsmPrinter::lowerImageHandleSymbol(unsigned Index, MCOperand &MCOp) {
   // Ewwww
-  TargetMachine &TM = const_cast<TargetMachine&>(MF->getTarget());
+  LLVMTargetMachine &TM = const_cast<LLVMTargetMachine&>(MF->getTarget());
   NVPTXTargetMachine &nvTM = static_cast<NVPTXTargetMachine&>(TM);
   const NVPTXMachineFunctionInfo *MFI = MF->getInfo<NVPTXMachineFunctionInfo>();
   const char *Sym = MFI->getImageHandleSymbol(Index);
-- 
GitLab


From 5617e05f341c24230ce190f30c6752ec2ad3fc73 Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze@braunis.de>
Date: Mon, 5 Nov 2018 23:49:15 +0000
Subject: [PATCH 0988/1116] TargetMachine: Move lib/CodeGen specific callbacks
 to LLVMTargetMachine; NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346184 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Target/TargetMachine.h | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/include/llvm/Target/TargetMachine.h b/include/llvm/Target/TargetMachine.h
index e743e9faa7e..f968fa80d50 100644
--- a/include/llvm/Target/TargetMachine.h
+++ b/include/llvm/Target/TargetMachine.h
@@ -284,18 +284,6 @@ public:
   void getNameWithPrefix(SmallVectorImpl<char> &Name, const GlobalValue *GV,
                          Mangler &Mang, bool MayAlwaysUsePrivate = false) const;
   MCSymbol *getSymbol(const GlobalValue *GV) const;
-
-  /// True if the target uses physical regs at Prolog/Epilog insertion
-  /// time. If true (most machines), all vregs must be allocated before
-  /// PEI. If false (virtual-register machines), then callee-save register
-  /// spilling and scavenging are not needed or used.
-  virtual bool usesPhysRegsForPEI() const { return true; }
-
-  /// True if the target wants to use interprocedural register allocation by
-  /// default. The -enable-ipra flag can be used to override this.
-  virtual bool useIPRA() const {
-    return false;
-  }
 };
 
 /// This class describes a target machine that is implemented with the LLVM
@@ -349,6 +337,18 @@ public:
   bool addAsmPrinter(PassManagerBase &PM, raw_pwrite_stream &Out,
                      raw_pwrite_stream *DwoOut, CodeGenFileType FileTYpe,
                      MCContext &Context);
+
+  /// True if the target uses physical regs at Prolog/Epilog insertion
+  /// time. If true (most machines), all vregs must be allocated before
+  /// PEI. If false (virtual-register machines), then callee-save register
+  /// spilling and scavenging are not needed or used.
+  virtual bool usesPhysRegsForPEI() const { return true; }
+
+  /// True if the target wants to use interprocedural register allocation by
+  /// default. The -enable-ipra flag can be used to override this.
+  virtual bool useIPRA() const {
+    return false;
+  }
 };
 
 } // end namespace llvm
-- 
GitLab


From 4579aeefa621a70cc0d73a267a4f173e8efef7a2 Mon Sep 17 00:00:00 2001
From: Justin Bogner <mail@justinbogner.com>
Date: Tue, 6 Nov 2018 00:16:32 +0000
Subject: [PATCH 0989/1116] Specify REQUIRES: default_triple in two debuginfo
 tests

These were failing when specifying LLVM_DEFAULT_TARGET_TRIPLE=''

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346185 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/DebugInfo/cross-cu-scope.ll     | 1 +
 test/DebugInfo/debuglineinfo-path.ll | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/test/DebugInfo/cross-cu-scope.ll b/test/DebugInfo/cross-cu-scope.ll
index dffd44885f0..7f663349e37 100644
--- a/test/DebugInfo/cross-cu-scope.ll
+++ b/test/DebugInfo/cross-cu-scope.ll
@@ -1,5 +1,6 @@
 ; RUN: %llc_dwarf %s -filetype=obj -o %t
 ; RUN: llvm-dwarfdump -debug-info %t | FileCheck %s
+; REQUIRES: default_triple
 
 ; Reduced test case from PR35212. Two DISubprogram belong to a different CU but
 ; share a scope. Both are declarations and end up in the scope's CU. We want to
diff --git a/test/DebugInfo/debuglineinfo-path.ll b/test/DebugInfo/debuglineinfo-path.ll
index d92e1facad1..88dd4824798 100644
--- a/test/DebugInfo/debuglineinfo-path.ll
+++ b/test/DebugInfo/debuglineinfo-path.ll
@@ -1,5 +1,6 @@
 ; Make sure that absolute source dir is detected correctly regardless of the platform.
-; REQUIRES: object-emission
+; REQUIRES: object-emission, default_triple
+
 ; On powerpc llvm-nm describes win_func as a global variable, not a function. It breaks the test.
 ; It is not essential to DWARF path handling code we're testing here.
 ; UNSUPPORTED: powerpc
-- 
GitLab


From fd192da8689bef637945fa5eade20abfd25539b5 Mon Sep 17 00:00:00 2001
From: Sam Clegg <sbc@chromium.org>
Date: Tue, 6 Nov 2018 00:31:02 +0000
Subject: [PATCH 0990/1116] Revert "[WebAssembly] Fixup `main` signature by
 default"

This reverts rL345880.  It caused some test failures on the
webassembly waterfall.  e.g. binaryen2.test_mainenv fails due
the fact that `envp` ends up being undef rather than 0.

Differential Revision: https://reviews.llvm.org/D54117

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346187 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp | 7 ++++++-
 test/CodeGen/WebAssembly/call.ll                          | 4 ++--
 test/CodeGen/WebAssembly/function-bitcasts-varargs.ll     | 2 +-
 test/CodeGen/WebAssembly/function-bitcasts.ll             | 2 +-
 test/CodeGen/WebAssembly/main-declaration.ll              | 2 +-
 test/CodeGen/WebAssembly/main-no-args.ll                  | 2 +-
 test/CodeGen/WebAssembly/main-with-args.ll                | 2 +-
 7 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp b/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
index dffc4d168f5..0644f1232f6 100644
--- a/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
@@ -36,6 +36,11 @@ using namespace llvm;
 
 #define DEBUG_TYPE "wasm-fix-function-bitcasts"
 
+static cl::opt<bool>
+    TemporaryWorkarounds("wasm-temporary-workarounds",
+                         cl::desc("Apply certain temporary workarounds"),
+                         cl::init(true), cl::Hidden);
+
 namespace {
 class FixFunctionBitcasts final : public ModulePass {
   StringRef getPassName() const override {
@@ -236,7 +241,7 @@ bool FixFunctionBitcasts::runOnModule(Module &M) {
     // "int main(int argc, char *argv[])", create an artificial call with it
     // bitcasted to that type so that we generate a wrapper for it, so that
     // the C runtime can call it.
-    if (!F.isDeclaration() && F.getName() == "main") {
+    if (!TemporaryWorkarounds && !F.isDeclaration() && F.getName() == "main") {
       Main = &F;
       LLVMContext &C = M.getContext();
       Type *MainArgTys[] = {Type::getInt32Ty(C),
diff --git a/test/CodeGen/WebAssembly/call.ll b/test/CodeGen/WebAssembly/call.ll
index 3d768de0d58..eaa583f8a02 100644
--- a/test/CodeGen/WebAssembly/call.ll
+++ b/test/CodeGen/WebAssembly/call.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers -mattr=+sign-ext,+simd128 | FileCheck %s
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers -fast-isel -fast-isel-abort=1 -mattr=+sign-ext,+simd128 | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers -wasm-temporary-workarounds=false -mattr=+sign-ext,+simd128 | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers -fast-isel -fast-isel-abort=1 -wasm-temporary-workarounds=false -mattr=+sign-ext,+simd128 | FileCheck %s
 
 ; Test that basic call operations assemble as expected.
 
diff --git a/test/CodeGen/WebAssembly/function-bitcasts-varargs.ll b/test/CodeGen/WebAssembly/function-bitcasts-varargs.ll
index 015de4eb39c..633871a599b 100644
--- a/test/CodeGen/WebAssembly/function-bitcasts-varargs.ll
+++ b/test/CodeGen/WebAssembly/function-bitcasts-varargs.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false -wasm-keep-registers | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -wasm-temporary-workarounds=false -wasm-keep-registers | FileCheck %s
 
 ; Test that function pointer casts casting away varargs are replaced with
 ; wrappers.
diff --git a/test/CodeGen/WebAssembly/function-bitcasts.ll b/test/CodeGen/WebAssembly/function-bitcasts.ll
index 0853549d1b4..0e7fcd5d570 100644
--- a/test/CodeGen/WebAssembly/function-bitcasts.ll
+++ b/test/CodeGen/WebAssembly/function-bitcasts.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false -wasm-disable-explicit-locals -wasm-keep-registers -enable-emscripten-cxx-exceptions | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -wasm-disable-explicit-locals -wasm-keep-registers -enable-emscripten-cxx-exceptions -wasm-temporary-workarounds=false | FileCheck %s
 
 ; Test that function pointer casts are replaced with wrappers.
 
diff --git a/test/CodeGen/WebAssembly/main-declaration.ll b/test/CodeGen/WebAssembly/main-declaration.ll
index 23e5887608c..8d9414f326e 100644
--- a/test/CodeGen/WebAssembly/main-declaration.ll
+++ b/test/CodeGen/WebAssembly/main-declaration.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -wasm-temporary-workarounds=false | FileCheck %s
 
 ; Test main functions with alternate signatures.
 
diff --git a/test/CodeGen/WebAssembly/main-no-args.ll b/test/CodeGen/WebAssembly/main-no-args.ll
index de3f04bebc4..09a4feaed14 100644
--- a/test/CodeGen/WebAssembly/main-no-args.ll
+++ b/test/CodeGen/WebAssembly/main-no-args.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -wasm-temporary-workarounds=false | FileCheck %s
 
 ; Test main functions with alternate signatures.
 
diff --git a/test/CodeGen/WebAssembly/main-with-args.ll b/test/CodeGen/WebAssembly/main-with-args.ll
index 3c057afe306..aa085409756 100644
--- a/test/CodeGen/WebAssembly/main-with-args.ll
+++ b/test/CodeGen/WebAssembly/main-with-args.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -wasm-temporary-workarounds=false | FileCheck %s
 
 ; Test that main function with expected signature is not wrapped
 
-- 
GitLab


From a8f890f4777f53bd9243bfdd9ed2b5d171cc8c03 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Tue, 6 Nov 2018 00:31:27 +0000
Subject: [PATCH 0991/1116] [X86] Autogenerate complete checks. NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346188 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/2009-06-05-VZextByteShort.ll | 35 +++++++++++++------
 test/CodeGen/X86/2011-10-19-LegelizeLoad.ll   | 23 +++++++++---
 .../CodeGen/X86/2012-03-15-build_vector_wl.ll |  8 +++--
 test/CodeGen/X86/4char-promote.ll             | 11 +++---
 test/CodeGen/X86/avx-fp2int.ll                | 18 ++++++----
 test/CodeGen/X86/extract-concat.ll            | 14 +++++---
 6 files changed, 74 insertions(+), 35 deletions(-)

diff --git a/test/CodeGen/X86/2009-06-05-VZextByteShort.ll b/test/CodeGen/X86/2009-06-05-VZextByteShort.ll
index d1d05a19001..be8563a9f2f 100644
--- a/test/CodeGen/X86/2009-06-05-VZextByteShort.ll
+++ b/test/CodeGen/X86/2009-06-05-VZextByteShort.ll
@@ -1,9 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-- -mcpu=core2 | FileCheck %s
 
 define <4 x i16> @a(i32* %x1) nounwind {
 ; CHECK-LABEL: a:
-; CHECK:         shrl %[[R:[^,]+]]
-; CHECK-NEXT:    movd %[[R]], %xmm0
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl (%eax), %eax
+; CHECK-NEXT:    shrl %eax
+; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    retl
 
   %x2 = load i32, i32* %x1
@@ -15,9 +19,12 @@ define <4 x i16> @a(i32* %x1) nounwind {
 
 define <8 x i16> @b(i32* %x1) nounwind {
 ; CHECK-LABEL: b:
-; CHECK:         shrl %e[[R:.]]x
-; CHECK-NEXT:    movzwl %[[R]]x, %e[[R]]x
-; CHECK-NEXT:    movd %e[[R]]x, %xmm0
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl (%eax), %eax
+; CHECK-NEXT:    shrl %eax
+; CHECK-NEXT:    movzwl %ax, %eax
+; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    retl
 
   %x2 = load i32, i32* %x1
@@ -29,9 +36,12 @@ define <8 x i16> @b(i32* %x1) nounwind {
 
 define <8 x i8> @c(i32* %x1) nounwind {
 ; CHECK-LABEL: c:
-; CHECK:         shrl %e[[R:.]]x
-; CHECK-NEXT:    movzwl %[[R]]x, %e[[R]]x
-; CHECK-NEXT:    movd %e[[R]]x, %xmm0
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl (%eax), %eax
+; CHECK-NEXT:    shrl %eax
+; CHECK-NEXT:    movzwl %ax, %eax
+; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    retl
 
   %x2 = load i32, i32* %x1
@@ -43,9 +53,12 @@ define <8 x i8> @c(i32* %x1) nounwind {
 
 define <16 x i8> @d(i32* %x1) nounwind {
 ; CHECK-LABEL: d:
-; CHECK:         shrl %e[[R:.]]x
-; CHECK-NEXT:    movzbl %[[R]]l, %e[[R]]x
-; CHECK-NEXT:    movd %e[[R]]x, %xmm0
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl (%eax), %eax
+; CHECK-NEXT:    shrl %eax
+; CHECK-NEXT:    movzbl %al, %eax
+; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    retl
 
   %x2 = load i32, i32* %x1
diff --git a/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll b/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll
index 2ae3d389d05..d8a6823f7b8 100644
--- a/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll
+++ b/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mcpu=corei7 | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i8:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@@ -13,15 +14,29 @@ target triple = "x86_64-unknown-linux-gnu"
 ; Also make sure that we sign-extend it.
 ; Based on /gcc-4_2-testsuite/src/gcc.c-torture/execute/pr23135.c
 
-; CHECK: main
 define i32 @main() nounwind uwtable {
+; CHECK-LABEL: main:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pmovsxbq {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    pmovsxbq {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    pextrq $1, %xmm1, %rax
+; CHECK-NEXT:    pextrq $1, %xmm0, %rcx
+; CHECK-NEXT:    cqto
+; CHECK-NEXT:    idivq %rcx
+; CHECK-NEXT:    movq %rax, %xmm2
+; CHECK-NEXT:    movq %xmm1, %rax
+; CHECK-NEXT:    movq %xmm0, %rcx
+; CHECK-NEXT:    cqto
+; CHECK-NEXT:    idivq %rcx
+; CHECK-NEXT:    movq %rax, %xmm0
+; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; CHECK-NEXT:    pextrw $0, %xmm0, {{.*}}(%rip)
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    retq
 entry:
-; CHECK: pmovsxbq  i(%rip), %
-; CHECK: pmovsxbq  j(%rip), %
   %0 = load <2 x i8>, <2 x i8>* @i, align 8
   %1 = load <2 x i8>, <2 x i8>* @j, align 8
   %div = sdiv <2 x i8> %1, %0
   store <2 x i8> %div, <2 x i8>* getelementptr inbounds (%union.anon, %union.anon* @res, i32 0, i32 0), align 8
   ret i32 0
-; CHECK: ret
 }
diff --git a/test/CodeGen/X86/2012-03-15-build_vector_wl.ll b/test/CodeGen/X86/2012-03-15-build_vector_wl.ll
index c4b307e5a5d..95d78e47479 100644
--- a/test/CodeGen/X86/2012-03-15-build_vector_wl.ll
+++ b/test/CodeGen/X86/2012-03-15-build_vector_wl.ll
@@ -1,10 +1,12 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
-; CHECK: build_vector_again
 define <4 x i8> @build_vector_again(<16 x i8> %in) nounwind readnone {
+; CHECK-LABEL: build_vector_again:
+; CHECK:       ## %bb.0: ## %entry
+; CHECK-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; CHECK-NEXT:    retq
 entry:
   %out = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK: pmovzxbd
   ret <4 x i8> %out
-; CHECK: ret
 }
diff --git a/test/CodeGen/X86/4char-promote.ll b/test/CodeGen/X86/4char-promote.ll
index bfe025eaa91..27778be3b71 100644
--- a/test/CodeGen/X86/4char-promote.ll
+++ b/test/CodeGen/X86/4char-promote.ll
@@ -1,14 +1,15 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; A test for checking PR 9623
 ; RUN: llc -mcpu=corei7 < %s | FileCheck %s
 
 target triple = "x86_64-apple-darwin"
 
-; CHECK:  pmulld
-; CHECK:  paddd
-; CHECK-NOT:  movdqa
-; CHECK:  ret
-
 define <4 x i8> @foo(<4 x i8> %x, <4 x i8> %y) {
+; CHECK-LABEL: foo:
+; CHECK:       ## %bb.0: ## %entry
+; CHECK-NEXT:    pmulld %xmm0, %xmm1
+; CHECK-NEXT:    paddd %xmm1, %xmm0
+; CHECK-NEXT:    retq
 entry:
  %binop = mul <4 x i8> %x, %y
  %binop6 = add <4 x i8> %binop, %x
diff --git a/test/CodeGen/X86/avx-fp2int.ll b/test/CodeGen/X86/avx-fp2int.ll
index f06564b0f58..d1aa1f281fd 100644
--- a/test/CodeGen/X86/avx-fp2int.ll
+++ b/test/CodeGen/X86/avx-fp2int.ll
@@ -1,19 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=corei7-avx -mattr=+avx | FileCheck %s
 
 ;; Check that FP_TO_SINT and FP_TO_UINT generate convert with truncate
 
-; CHECK-LABEL: test1:
-; CHECK: vcvttpd2dq
-; CHECK: ret
-; CHECK-LABEL: test2:
-; CHECK: vcvttpd2dq
-; CHECK: ret
-
 define <4 x i8> @test1(<4 x double> %d) {
+; CHECK-LABEL: test1:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vcvttpd2dq %ymm0, %xmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retl
   %c = fptoui <4 x double> %d to <4 x i8>
   ret <4 x i8> %c
 }
 define <4 x i8> @test2(<4 x double> %d) {
+; CHECK-LABEL: test2:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vcvttpd2dq %ymm0, %xmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retl
   %c = fptosi <4 x double> %d to <4 x i8>
   ret <4 x i8> %c
 }
diff --git a/test/CodeGen/X86/extract-concat.ll b/test/CodeGen/X86/extract-concat.ll
index 704309eb650..029c69a34cb 100644
--- a/test/CodeGen/X86/extract-concat.ll
+++ b/test/CodeGen/X86/extract-concat.ll
@@ -1,6 +1,15 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mcpu=corei7 -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
 
 define void @foo(<4 x float> %in, <4 x i8>* %out) {
+; CHECK-LABEL: foo:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cvttps2dq %xmm0, %xmm0
+; CHECK-NEXT:    movl $255, %eax
+; CHECK-NEXT:    pinsrd $3, %eax, %xmm0
+; CHECK-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; CHECK-NEXT:    movd %xmm0, (%rdi)
+; CHECK-NEXT:    retq
   %t0 = fptosi <4 x float> %in to <4 x i32>
   %t1 = trunc <4 x i32> %t0 to <4 x i16>
   %t2 = shufflevector <4 x i16> %t1, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -9,9 +18,4 @@ define void @foo(<4 x float> %in, <4 x i8>* %out) {
   %t5 = insertelement <4 x i8> %t4, i8 -1, i32 3
   store <4 x i8> %t5, <4 x i8>* %out
   ret void
-; CHECK: foo
-; CHECK: cvttps2dq
-; CHECK-NOT: pextrd
-; CHECK: pshufb
-; CHECK: ret
 }
-- 
GitLab


From 4fefaaea5a07a09d1bc08678a246b9db064510dc Mon Sep 17 00:00:00 2001
From: Robert Widmann <devteam.codafi@gmail.com>
Date: Tue, 6 Nov 2018 01:38:14 +0000
Subject: [PATCH 0992/1116] [LLVM-C] Improve Intrinsics Bindings

Summary:
Improve the intrinsic bindings with operations for

- Retrieving and automatically inserting the declaration of an intrinsic by ID
- Retrieving the name of a non-overloaded intrinsic by ID
- Retrieving the name of an overloaded intrinsic by ID and overloaded parameter types

Improve the echo test to copy non-overloaded intrinsics by ID.

Reviewers: whitequark, deadalnix

Reviewed By: whitequark

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D53626

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346195 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm-c/Core.h        | 48 ++++++++++++++++++++++++++++++++++++
 lib/IR/Core.cpp              | 44 +++++++++++++++++++++++++++++++++
 test/Bindings/llvm-c/echo.ll | 16 ++++++++++++
 tools/llvm-c-test/echo.cpp   | 12 ++++++++-
 4 files changed, 119 insertions(+), 1 deletion(-)

diff --git a/include/llvm-c/Core.h b/include/llvm-c/Core.h
index c905cfbb08d..c093c0906ce 100644
--- a/include/llvm-c/Core.h
+++ b/include/llvm-c/Core.h
@@ -2381,6 +2381,54 @@ void LLVMSetPersonalityFn(LLVMValueRef Fn, LLVMValueRef PersonalityFn);
  */
 unsigned LLVMGetIntrinsicID(LLVMValueRef Fn);
 
+/**
+ * Create or insert the declaration of an intrinsic.  For overloaded intrinsics,
+ * parameter types must be provided to uniquely identify an overload.
+ *
+ * @see llvm::Intrinsic::getDeclaration()
+ */
+LLVMValueRef LLVMGetIntrinsicDeclaration(LLVMModuleRef Mod,
+                                         unsigned ID,
+                                         LLVMTypeRef *ParamTypes,
+                                         size_t ParamCount);
+
+/**
+ * Retrieves the type of an intrinsic.  For overloaded intrinsics, parameter
+ * types must be provided to uniquely identify an overload.
+ *
+ * @see llvm::Intrinsic::getType()
+ */
+LLVMTypeRef LLVMIntrinsicGetType(LLVMContextRef Ctx, unsigned ID,
+                                 LLVMTypeRef *ParamTypes, size_t ParamCount);
+
+/**
+ * Retrieves the name of an intrinsic.
+ *
+ * @see llvm::Intrinsic::getName()
+ */
+const char *LLVMIntrinsicGetName(unsigned ID, size_t *NameLength);
+
+/**
+ * Copies the name of an overloaded intrinsic identified by a given list of
+ * parameter types.
+ *
+ * Unlike LLVMIntrinsicGetName, the caller is responsible for freeing the
+ * returned string.
+ *
+ * @see llvm::Intrinsic::getName()
+ */
+const char *LLVMIntrinsicCopyOverloadedName(unsigned ID,
+                                            LLVMTypeRef *ParamTypes,
+                                            size_t ParamCount,
+                                            size_t *NameLength);
+
+/**
+ * Obtain if the intrinsic identified by the given ID is overloaded.
+ *
+ * @see llvm::Intrinsic::isOverloaded()
+ */
+LLVMBool LLVMIntrinsicIsOverloaded(unsigned ID);
+
 /**
  * Obtain the calling function of a function.
  *
diff --git a/lib/IR/Core.cpp b/lib/IR/Core.cpp
index 028f9e6199d..f94b8a04738 100644
--- a/lib/IR/Core.cpp
+++ b/lib/IR/Core.cpp
@@ -2280,6 +2280,50 @@ unsigned LLVMGetIntrinsicID(LLVMValueRef Fn) {
   return 0;
 }
 
+static Intrinsic::ID llvm_map_to_intrinsic_id(unsigned ID) {
+  assert(ID < llvm::Intrinsic::num_intrinsics && "Intrinsic ID out of range");
+  return llvm::Intrinsic::ID(ID);
+}
+
+LLVMValueRef LLVMGetIntrinsicDeclaration(LLVMModuleRef Mod,
+                                         unsigned ID,
+                                         LLVMTypeRef *ParamTypes,
+                                         size_t ParamCount) {
+  ArrayRef<Type*> Tys(unwrap(ParamTypes), ParamCount);
+  auto IID = llvm_map_to_intrinsic_id(ID);
+  return wrap(llvm::Intrinsic::getDeclaration(unwrap(Mod), IID, Tys));
+}
+
+const char *LLVMIntrinsicGetName(unsigned ID, size_t *NameLength) {
+  auto IID = llvm_map_to_intrinsic_id(ID);
+  auto Str = llvm::Intrinsic::getName(IID);
+  *NameLength = Str.size();
+  return Str.data();
+}
+
+LLVMTypeRef LLVMIntrinsicGetType(LLVMContextRef Ctx, unsigned ID,
+                                 LLVMTypeRef *ParamTypes, size_t ParamCount) {
+  auto IID = llvm_map_to_intrinsic_id(ID);
+  ArrayRef<Type*> Tys(unwrap(ParamTypes), ParamCount);
+  return wrap(llvm::Intrinsic::getType(*unwrap(Ctx), IID, Tys));
+}
+
+const char *LLVMIntrinsicCopyOverloadedName(unsigned ID,
+                                            LLVMTypeRef *ParamTypes,
+                                            size_t ParamCount,
+                                            size_t *NameLength) {
+  auto IID = llvm_map_to_intrinsic_id(ID);
+  ArrayRef<Type*> Tys(unwrap(ParamTypes), ParamCount);
+  auto Str = llvm::Intrinsic::getName(IID, Tys);
+  *NameLength = Str.length();
+  return strndup(Str.c_str(), Str.length());
+}
+
+LLVMBool LLVMIntrinsicIsOverloaded(unsigned ID) {
+  auto IID = llvm_map_to_intrinsic_id(ID);
+  return llvm::Intrinsic::isOverloaded(IID);
+}
+
 unsigned LLVMGetFunctionCallConv(LLVMValueRef Fn) {
   return unwrap<Function>(Fn)->getCallingConv();
 }
diff --git a/test/Bindings/llvm-c/echo.ll b/test/Bindings/llvm-c/echo.ll
index 580293b3d04..118f822e432 100644
--- a/test/Bindings/llvm-c/echo.ll
+++ b/test/Bindings/llvm-c/echo.ll
@@ -170,6 +170,22 @@ define void @with_debuginfo() !dbg !4 {
   ret void, !dbg !7
 }
 
+declare i8* @llvm.stacksave()
+declare void @llvm.stackrestore(i8*)
+declare void @llvm.lifetime.start.p0i8(i64, i8*)
+declare void @llvm.lifetime.end.p0i8(i64, i8*)
+
+define void @test_intrinsics() {
+entry:
+  %sp = call i8* @llvm.stacksave()
+  %x = alloca i32
+  %0 = bitcast i32* %x to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %0)
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %0)
+  call void @llvm.stackrestore(i8* %sp)
+  ret void
+}
+
 !llvm.dbg.cu = !{!0, !2}
 !llvm.module.flags = !{!3}
 
diff --git a/tools/llvm-c-test/echo.cpp b/tools/llvm-c-test/echo.cpp
index d4c61e2d13c..db926e8acea 100644
--- a/tools/llvm-c-test/echo.cpp
+++ b/tools/llvm-c-test/echo.cpp
@@ -240,7 +240,17 @@ static LLVMValueRef clone_constant_impl(LLVMValueRef Cst, LLVMModuleRef M) {
     // Try function
     if (LLVMIsAFunction(Cst)) {
       check_value_kind(Cst, LLVMFunctionValueKind);
-      LLVMValueRef Dst = LLVMGetNamedFunction(M, Name);
+
+      LLVMValueRef Dst = nullptr;
+      // Try an intrinsic
+      unsigned ID = LLVMGetIntrinsicID(Cst);
+      if (ID > 0 && !LLVMIntrinsicIsOverloaded(ID)) {
+        Dst = LLVMGetIntrinsicDeclaration(M, ID, nullptr, 0);
+      } else {
+        // Try a normal function
+        Dst = LLVMGetNamedFunction(M, Name);
+      }
+
       if (Dst)
         return Dst;
       report_fatal_error("Could not find function");
-- 
GitLab


From 46a96495e52ab88be80c366b844e5253c34adcc5 Mon Sep 17 00:00:00 2001
From: Robert Widmann <devteam.codafi@gmail.com>
Date: Tue, 6 Nov 2018 01:54:12 +0000
Subject: [PATCH 0993/1116] [LLVM-C] Fix Windows Build of Core

strndup doesn't exist outside of GNU-land and modern macOSes.  Use
strdup instead as c_str() is guaranteed to be NUL-terminated.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346197 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/IR/Core.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/IR/Core.cpp b/lib/IR/Core.cpp
index f94b8a04738..a3065733c81 100644
--- a/lib/IR/Core.cpp
+++ b/lib/IR/Core.cpp
@@ -2316,7 +2316,7 @@ const char *LLVMIntrinsicCopyOverloadedName(unsigned ID,
   ArrayRef<Type*> Tys(unwrap(ParamTypes), ParamCount);
   auto Str = llvm::Intrinsic::getName(IID, Tys);
   *NameLength = Str.length();
-  return strndup(Str.c_str(), Str.length());
+  return strdup(Str.c_str());
 }
 
 LLVMBool LLVMIntrinsicIsOverloaded(unsigned ID) {
-- 
GitLab


From a61bdda55771a8e483745ec5342bcd4a6ff9c411 Mon Sep 17 00:00:00 2001
From: Max Kazantsev <max.kazantsev@azul.com>
Date: Tue, 6 Nov 2018 02:02:05 +0000
Subject: [PATCH 0994/1116] Revert "[IndVars] Smart hard uses detection"

This reverts commit 2f425e9c7946b9d74e64ebbfa33c1caa36914402.

It seems that the check that we still should do the transform if we
know the result is constant is missing in this code. So the logic that
has been deleted by this change is still sometimes accidentally useful.
I revert the change to see what can be done about it. The motivating
case is the following:

@Y = global [400 x i16] zeroinitializer, align 1

define i16 @foo() {
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %i = phi i16 [ 0, %entry ], [ %inc, %for.body ]

  %arrayidx = getelementptr inbounds [400 x i16], [400 x i16]* @Y, i16 0, i16 %i
  store i16 0, i16* %arrayidx, align 1
  %inc = add nuw nsw i16 %i, 1
  %cmp = icmp ult i16 %inc, 400
  br i1 %cmp, label %for.body, label %for.end

for.end:                                          ; preds = %for.body
  %inc.lcssa = phi i16 [ %inc, %for.body ]
  ret i16 %inc.lcssa
}

We should be able to figure out that the result is constant, but the patch
breaks it.

Differential Revision: https://reviews.llvm.org/D51584


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346198 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/IndVarSimplify.cpp      | 39 +++++---------
 test/Analysis/ScalarEvolution/pr28705.ll      |  6 +--
 .../IndVarSimplify/dont-recompute.ll          | 51 -------------------
 .../IndVarSimplify/lrev-existing-umin.ll      | 38 --------------
 4 files changed, 16 insertions(+), 118 deletions(-)

diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index 3e4e0f46ca3..ec51ad71abc 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -145,7 +145,6 @@ class IndVarSimplify {
   bool canLoopBeDeleted(Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet);
   bool rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter);
   bool rewriteFirstIterationLoopExitValues(Loop *L);
-  bool hasHardUserWithinLoop(const Loop *L, const Instruction *I) const;
 
   bool linearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount,
                                  PHINode *IndVar, SCEVExpander &Rewriter);
@@ -525,29 +524,6 @@ struct RewritePhi {
 // As a side effect, reduces the amount of IV processing within the loop.
 //===----------------------------------------------------------------------===//
 
-bool IndVarSimplify::hasHardUserWithinLoop(const Loop *L, const Instruction *I) const {
-  SmallPtrSet<const Instruction *, 8> Visited;
-  SmallVector<const Instruction *, 8> WorkList;
-  Visited.insert(I);
-  WorkList.push_back(I);
-  while (!WorkList.empty()) {
-    const Instruction *Curr = WorkList.pop_back_val();
-    // This use is outside the loop, nothing to do.
-    if (!L->contains(Curr))
-      continue;
-    // Do we assume it is a "hard" use which will not be eliminated easily?
-    if (Curr->mayHaveSideEffects())
-      return true;
-    // Otherwise, add all its users to worklist.
-    for (auto U : Curr->users()) {
-      auto *UI = cast<Instruction>(U);
-      if (Visited.insert(UI).second)
-        WorkList.push_back(UI);
-    }
-  }
-  return false;
-}
-
 /// Check to see if this loop has a computable loop-invariant execution count.
 /// If so, this means that we can compute the final value of any expressions
 /// that are recurrent in the loop, and substitute the exit values from the loop
@@ -622,8 +598,19 @@ bool IndVarSimplify::rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {
         // Computing the value outside of the loop brings no benefit if it is
         // definitely used inside the loop in a way which can not be optimized
         // away.
-        if (hasHardUserWithinLoop(L, Inst))
-          continue;
+        if (ExitValue->getSCEVType()>=scMulExpr) {
+          bool HasHardInternalUses = false;
+          for (auto *IB : Inst->users()) {
+            Instruction *UseInstr = cast<Instruction>(IB);
+            unsigned Opc = UseInstr->getOpcode();
+            if (L->contains(UseInstr) && Opc == Instruction::Call) {
+              HasHardInternalUses = true;
+              break;
+            }
+          }
+          if (HasHardInternalUses)
+            continue;
+        }
 
         bool HighCost = Rewriter.isHighCostExpansion(ExitValue, L, Inst);
         Value *ExitVal = Rewriter.expandCodeFor(ExitValue, PN->getType(), Inst);
diff --git a/test/Analysis/ScalarEvolution/pr28705.ll b/test/Analysis/ScalarEvolution/pr28705.ll
index 9a8487a6c66..8fbc08e3ca6 100644
--- a/test/Analysis/ScalarEvolution/pr28705.ll
+++ b/test/Analysis/ScalarEvolution/pr28705.ll
@@ -1,11 +1,11 @@
 ; PR28705
 ; RUN: opt < %s -indvars -S | FileCheck %s
 
-; Check IndVarSimplify doesn't replace external use of the induction var
-; "%inc.i.i" with "%.sroa.speculated + 1" because it is not profitable.
+; Check IndVarSimplify replaces the exitval use of the induction var "%inc.i.i"
+; with "%.sroa.speculated + 1".
 ;
 ; CHECK-LABEL: @foo(
-; CHECK: %[[EXIT:.+]] = phi i32 [ %inc.i.i, %for.body650 ]
+; CHECK: %[[EXIT:.+]] = sub i32 %.sroa.speculated, -1
 ; CHECK: %DB.sroa.9.0.lcssa = phi i32 [ 1, %entry ], [ %[[EXIT]], %loopexit ]
 ;
 define void @foo(i32 %sub.ptr.div.i, i8* %ref.i1174) local_unnamed_addr {
diff --git a/test/Transforms/IndVarSimplify/dont-recompute.ll b/test/Transforms/IndVarSimplify/dont-recompute.ll
index 22087710a9c..c87cd6596c6 100644
--- a/test/Transforms/IndVarSimplify/dont-recompute.ll
+++ b/test/Transforms/IndVarSimplify/dont-recompute.ll
@@ -123,54 +123,3 @@ for.end:                                          ; preds = %for.body
   tail call void @func(i32 %soft_use)
   ret void
 }
-
-; CHECK-LABEL: @test5(
-define void @test5(i32 %m) nounwind uwtable {
-entry:
-  br label %for.body
-
-for.body:                                         ; preds = %for.body, %entry
-  %i.06 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
-  %a.05 = phi i32 [ 0, %entry ], [ %add, %for.body ]
-  %add = add i32 %a.05, %m
-  %soft_use = add i32 %add, 123
-; CHECK: tail call void @func(i32 %soft_use)
-  tail call void @func(i32 %soft_use)
-  %inc = add nsw i32 %i.06, 1
-  %exitcond = icmp eq i32 %inc, 186
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body
-; CHECK: for.end:
-; CHECK-NOT: mul i32 %m, 186
-; CHECK:%add.lcssa = phi i32 [ %add, %for.body ]
-; CHECK-NEXT: tail call void @func(i32 %add.lcssa)
-  tail call void @func(i32 %add)
-  ret void
-}
-
-; CHECK-LABEL: @test6(
-define void @test6(i32 %m, i32* %p) nounwind uwtable {
-entry:
-  br label %for.body
-
-for.body:                                         ; preds = %for.body, %entry
-  %i.06 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
-  %a.05 = phi i32 [ 0, %entry ], [ %add, %for.body ]
-  %add = add i32 %a.05, %m
-  %soft_use = add i32 %add, 123
-; CHECK: store i32 %soft_use, i32* %pidx
-  %pidx = getelementptr i32, i32* %p, i32 %add
-  store i32 %soft_use, i32* %pidx
-  %inc = add nsw i32 %i.06, 1
-  %exitcond = icmp eq i32 %inc, 186
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body
-; CHECK: for.end:
-; CHECK-NOT: mul i32 %m, 186
-; CHECK:%add.lcssa = phi i32 [ %add, %for.body ]
-; CHECK-NEXT: tail call void @func(i32 %add.lcssa)
-  tail call void @func(i32 %add)
-  ret void
-}
diff --git a/test/Transforms/IndVarSimplify/lrev-existing-umin.ll b/test/Transforms/IndVarSimplify/lrev-existing-umin.ll
index fff76675f17..961c9fd944d 100644
--- a/test/Transforms/IndVarSimplify/lrev-existing-umin.ll
+++ b/test/Transforms/IndVarSimplify/lrev-existing-umin.ll
@@ -1,7 +1,5 @@
 ; RUN: opt -S -indvars < %s | FileCheck %s
 
-; Do not rewrite the user outside the loop because we must keep the instruction
-; inside the loop due to store. Rewrite doesn't give us any profit.
 define void @f(i32 %length.i.88, i32 %length.i, i8* %tmp12, i32 %tmp10, i8* %tmp8) {
 ; CHECK-LABEL: @f(
 not_zero11.preheader:
@@ -24,42 +22,6 @@ not_zero11:
   %tmp23 = icmp slt i32 %tmp22, %tmp14
   br i1 %tmp23, label %not_zero11, label %main.exit.selector
 
-main.exit.selector:
-; CHECK-LABEL: main.exit.selector:
-; CHECK:   %tmp22.lcssa = phi i32 [ %tmp22, %not_zero11 ]
-; CHECK:   %tmp24 = icmp slt i32 %tmp22.lcssa, %length.
-  %tmp24 = icmp slt i32 %tmp22, %length.i
-  br i1 %tmp24, label %not_zero11.postloop, label %leave
-
-leave:
-  ret void
-
-not_zero11.postloop:
-  ret void
-}
-
-; Rewrite the user outside the loop because there is no hard users inside the loop.
-define void @f1(i32 %length.i.88, i32 %length.i, i8* %tmp12, i32 %tmp10, i8* %tmp8) {
-; CHECK-LABEL: @f1(
-not_zero11.preheader:
-  %tmp13 = icmp ugt i32 %length.i, %length.i.88
-  %tmp14 = select i1 %tmp13, i32 %length.i.88, i32 %length.i
-  %tmp15 = icmp sgt i32 %tmp14, 0
-  br i1 %tmp15, label %not_zero11, label %not_zero11.postloop
-
-not_zero11:
-  %v_1 = phi i32 [ %tmp22, %not_zero11 ], [ 0, %not_zero11.preheader ]
-  %tmp16 = zext i32 %v_1 to i64
-  %tmp17 = getelementptr inbounds i8, i8* %tmp8, i64 %tmp16
-  %tmp18 = load i8, i8* %tmp17, align 1
-  %tmp19 = zext i8 %tmp18 to i32
-  %tmp20 = or i32 %tmp19, %tmp10
-  %tmp21 = trunc i32 %tmp20 to i8
-  %addr22 = getelementptr inbounds i8, i8* %tmp12, i64 %tmp16
-  %tmp22 = add nuw nsw i32 %v_1, 1
-  %tmp23 = icmp slt i32 %tmp22, %tmp14
-  br i1 %tmp23, label %not_zero11, label %main.exit.selector
-
 main.exit.selector:
 ; CHECK-LABEL: main.exit.selector:
 ; CHECK: %tmp24 = icmp slt i32 %tmp14, %length.i
-- 
GitLab


From 7c8bf013119fe1f53067a03fa8a5714efe9c8701 Mon Sep 17 00:00:00 2001
From: Max Kazantsev <max.kazantsev@azul.com>
Date: Tue, 6 Nov 2018 02:12:44 +0000
Subject: [PATCH 0995/1116] [NFC] Add motivating test case for revert in
 rL346198

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346199 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../IndVarSimplify/constant_result.ll         | 35 +++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100644 test/Transforms/IndVarSimplify/constant_result.ll

diff --git a/test/Transforms/IndVarSimplify/constant_result.ll b/test/Transforms/IndVarSimplify/constant_result.ll
new file mode 100644
index 00000000000..749c4af07ae
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/constant_result.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -indvars -S | FileCheck %s
+
+@Y = global [400 x i16] zeroinitializer, align 1
+
+define i16 @foo() {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [400 x i16], [400 x i16]* @Y, i16 0, i16 [[I]]
+; CHECK-NEXT:    store i16 0, i16* [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[INC]] = add nuw nsw i16 [[I]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i16 [[INC]], 400
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret i16 400
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i = phi i16 [ 0, %entry ], [ %inc, %for.body ]
+
+  %arrayidx = getelementptr inbounds [400 x i16], [400 x i16]* @Y, i16 0, i16 %i
+  store i16 0, i16* %arrayidx, align 1
+  %inc = add nuw nsw i16 %i, 1
+  %cmp = icmp ult i16 %inc, 400
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %inc.lcssa = phi i16 [ %inc, %for.body ]
+  ret i16 %inc.lcssa
+}
-- 
GitLab


From 5927508e1daed8db31a2f6a08fdac56269b121eb Mon Sep 17 00:00:00 2001
From: Max Kazantsev <max.kazantsev@azul.com>
Date: Tue, 6 Nov 2018 02:44:49 +0000
Subject: [PATCH 0996/1116] [LICM] Use ICFLoopSafetyInfo in LICM

This patch makes LICM use `ICFLoopSafetyInfo` that is a smarter version
of LoopSafetyInfo that leverages power of Implicit Control Flow Tracking
to keep track of throwing instructions and give less pessimistic answers
to queries related to throws.

The ICFLoopSafetyInfo itself has been introduced in rL344601. This patch
enables it in LICM only.

Differential Revision: https://reviews.llvm.org/D50377
Reviewed By: apilipenko


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346201 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Transforms/Utils/LoopUtils.h     |   7 +-
 lib/Transforms/Scalar/LICM.cpp                |  53 ++++---
 .../AMDGPU/build-vector-insert-elt-infloop.ll |   2 +-
 test/Transforms/LICM/guards.ll                |   6 +-
 test/Transforms/LICM/hoist-mustexec.ll        | 147 ++++++++++++++++++
 test/Transforms/LICM/hoist-nounwind.ll        |  29 +++-
 test/Transforms/LICM/preheader-safe.ll        |  21 +++
 7 files changed, 236 insertions(+), 29 deletions(-)

diff --git a/include/llvm/Transforms/Utils/LoopUtils.h b/include/llvm/Transforms/Utils/LoopUtils.h
index c75a1de1137..f642852275c 100644
--- a/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/include/llvm/Transforms/Utils/LoopUtils.h
@@ -109,7 +109,7 @@ bool formLCSSARecursively(Loop &L, DominatorTree &DT, LoopInfo *LI,
 /// arguments. Diagnostics is emitted via \p ORE. It returns changed status.
 bool sinkRegion(DomTreeNode *, AliasAnalysis *, LoopInfo *, DominatorTree *,
                 TargetLibraryInfo *, TargetTransformInfo *, Loop *,
-                AliasSetTracker *, LoopSafetyInfo *,
+                AliasSetTracker *, ICFLoopSafetyInfo *,
                 OptimizationRemarkEmitter *ORE);
 
 /// Walk the specified region of the CFG (defined by all blocks
@@ -122,7 +122,7 @@ bool sinkRegion(DomTreeNode *, AliasAnalysis *, LoopInfo *, DominatorTree *,
 /// ORE. It returns changed status.
 bool hoistRegion(DomTreeNode *, AliasAnalysis *, LoopInfo *, DominatorTree *,
                  TargetLibraryInfo *, Loop *, AliasSetTracker *,
-                 LoopSafetyInfo *, OptimizationRemarkEmitter *ORE);
+                 ICFLoopSafetyInfo *, OptimizationRemarkEmitter *ORE);
 
 /// This function deletes dead loops. The caller of this function needs to
 /// guarantee that the loop is infact dead.
@@ -151,7 +151,8 @@ bool promoteLoopAccessesToScalars(const SmallSetVector<Value *, 8> &,
                                   SmallVectorImpl<Instruction *> &,
                                   PredIteratorCache &, LoopInfo *,
                                   DominatorTree *, const TargetLibraryInfo *,
-                                  Loop *, AliasSetTracker *, LoopSafetyInfo *,
+                                  Loop *, AliasSetTracker *,
+                                  ICFLoopSafetyInfo *,
                                   OptimizationRemarkEmitter *);
 
 /// Does a BFS from a given node to all of its children inside a given loop.
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index d379808cd7f..d6dfdc7efed 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -103,10 +103,10 @@ static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop,
                                   const LoopSafetyInfo *SafetyInfo,
                                   TargetTransformInfo *TTI, bool &FreeInLoop);
 static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
-                  LoopSafetyInfo *SafetyInfo,
+                  ICFLoopSafetyInfo *SafetyInfo,
                   OptimizationRemarkEmitter *ORE);
 static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
-                 const Loop *CurLoop, LoopSafetyInfo *SafetyInfo,
+                 const Loop *CurLoop, ICFLoopSafetyInfo *SafetyInfo,
                  OptimizationRemarkEmitter *ORE, bool FreeInLoop);
 static bool isSafeToExecuteUnconditionally(Instruction &Inst,
                                            const DominatorTree *DT,
@@ -123,7 +123,8 @@ CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
                             const LoopInfo *LI,
                             const LoopSafetyInfo *SafetyInfo);
 
-static void eraseInstruction(Instruction &I, AliasSetTracker *AST);
+static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo,
+                             AliasSetTracker *AST);
 
 namespace {
 struct LoopInvariantCodeMotion {
@@ -269,7 +270,7 @@ bool LoopInvariantCodeMotion::runOnLoop(
   BasicBlock *Preheader = L->getLoopPreheader();
 
   // Compute loop safety information.
-  SimpleLoopSafetyInfo SafetyInfo;
+  ICFLoopSafetyInfo SafetyInfo(DT);
   SafetyInfo.computeLoopSafetyInfo(L);
 
   // We want to visit all of the instructions in this loop... that are not parts
@@ -376,7 +377,7 @@ bool LoopInvariantCodeMotion::runOnLoop(
 bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
                       DominatorTree *DT, TargetLibraryInfo *TLI,
                       TargetTransformInfo *TTI, Loop *CurLoop,
-                      AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo,
+                      AliasSetTracker *CurAST, ICFLoopSafetyInfo *SafetyInfo,
                       OptimizationRemarkEmitter *ORE) {
 
   // Verify inputs.
@@ -406,7 +407,7 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
         LLVM_DEBUG(dbgs() << "LICM deleting dead inst: " << I << '\n');
         salvageDebugInfo(I);
         ++II;
-        eraseInstruction(I, CurAST);
+        eraseInstruction(I, *SafetyInfo, CurAST);
         Changed = true;
         continue;
       }
@@ -423,7 +424,7 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
         if (sink(I, LI, DT, CurLoop, SafetyInfo, ORE, FreeInLoop)) {
           if (!FreeInLoop) {
             ++II;
-            eraseInstruction(I, CurAST);
+            eraseInstruction(I, *SafetyInfo, CurAST);
           }
           Changed = true;
         }
@@ -440,7 +441,7 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
 ///
 bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
                        DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop,
-                       AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo,
+                       AliasSetTracker *CurAST, ICFLoopSafetyInfo *SafetyInfo,
                        OptimizationRemarkEmitter *ORE) {
   // Verify inputs.
   assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr &&
@@ -481,7 +482,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
         CurAST->copyValue(&I, C);
         I.replaceAllUsesWith(C);
         if (isInstructionTriviallyDead(&I, TLI))
-          eraseInstruction(I, CurAST);
+          eraseInstruction(I, *SafetyInfo, CurAST);
         Changed = true;
         continue;
       }
@@ -510,14 +511,16 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
         auto One = llvm::ConstantFP::get(Divisor->getType(), 1.0);
         auto ReciprocalDivisor = BinaryOperator::CreateFDiv(One, Divisor);
         ReciprocalDivisor->setFastMathFlags(I.getFastMathFlags());
+        SafetyInfo->insertInstructionTo(I.getParent());
         ReciprocalDivisor->insertBefore(&I);
 
         auto Product =
             BinaryOperator::CreateFMul(I.getOperand(0), ReciprocalDivisor);
         Product->setFastMathFlags(I.getFastMathFlags());
+        SafetyInfo->insertInstructionTo(I.getParent());
         Product->insertAfter(&I);
         I.replaceAllUsesWith(Product);
-        eraseInstruction(I, CurAST);
+        eraseInstruction(I, *SafetyInfo, CurAST);
 
         hoist(*ReciprocalDivisor, DT, CurLoop, SafetyInfo, ORE);
         Changed = true;
@@ -886,9 +889,11 @@ CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
   return New;
 }
 
-static void eraseInstruction(Instruction &I, AliasSetTracker *AST) {
+static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo,
+                             AliasSetTracker *AST) {
   if (AST)
     AST->deleteValue(&I);
+  SafetyInfo.removeInstruction(&I);
   I.eraseFromParent();
 }
 
@@ -999,7 +1004,7 @@ static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT,
 /// position, and may either delete it or move it to outside of the loop.
 ///
 static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
-                 const Loop *CurLoop, LoopSafetyInfo *SafetyInfo,
+                 const Loop *CurLoop, ICFLoopSafetyInfo *SafetyInfo,
                  OptimizationRemarkEmitter *ORE, bool FreeInLoop) {
   LLVM_DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n");
   ORE->emit([&]() {
@@ -1090,7 +1095,7 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
     Instruction *New = sinkThroughTriviallyReplaceablePHI(PN, &I, LI, SunkCopies,
                                                           SafetyInfo, CurLoop);
     PN->replaceAllUsesWith(New);
-    eraseInstruction(*PN, nullptr);
+    eraseInstruction(*PN, *SafetyInfo, nullptr);
     Changed = true;
   }
   return Changed;
@@ -1100,7 +1105,7 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
 /// is safe to hoist, this instruction is called to do the dirty work.
 ///
 static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
-                  LoopSafetyInfo *SafetyInfo, OptimizationRemarkEmitter *ORE) {
+                  ICFLoopSafetyInfo *SafetyInfo, OptimizationRemarkEmitter *ORE) {
   auto *Preheader = CurLoop->getLoopPreheader();
   LLVM_DEBUG(dbgs() << "LICM hoisting to " << Preheader->getName() << ": " << I
                     << "\n");
@@ -1120,6 +1125,8 @@ static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
       !SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop))
     I.dropUnknownNonDebugMetadata();
 
+  SafetyInfo->removeInstruction(&I);
+  SafetyInfo->insertInstructionTo(Preheader);
   // Move the new node to the Preheader, before its terminator.
   I.moveBefore(Preheader->getTerminator());
 
@@ -1180,6 +1187,7 @@ class LoopPromoter : public LoadAndStorePromoter {
   int Alignment;
   bool UnorderedAtomic;
   AAMDNodes AATags;
+  ICFLoopSafetyInfo &SafetyInfo;
 
   Value *maybeInsertLCSSAPHI(Value *V, BasicBlock *BB) const {
     if (Instruction *I = dyn_cast<Instruction>(V))
@@ -1202,11 +1210,13 @@ public:
                SmallVectorImpl<BasicBlock *> &LEB,
                SmallVectorImpl<Instruction *> &LIP, PredIteratorCache &PIC,
                AliasSetTracker &ast, LoopInfo &li, DebugLoc dl, int alignment,
-               bool UnorderedAtomic, const AAMDNodes &AATags)
+               bool UnorderedAtomic, const AAMDNodes &AATags,
+               ICFLoopSafetyInfo &SafetyInfo)
       : LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA),
         LoopExitBlocks(LEB), LoopInsertPts(LIP), PredCache(PIC), AST(ast),
         LI(li), DL(std::move(dl)), Alignment(alignment),
-        UnorderedAtomic(UnorderedAtomic), AATags(AATags) {}
+        UnorderedAtomic(UnorderedAtomic), AATags(AATags), SafetyInfo(SafetyInfo)
+      {}
 
   bool isInstInList(Instruction *I,
                     const SmallVectorImpl<Instruction *> &) const override {
@@ -1243,7 +1253,10 @@ public:
     // Update alias analysis.
     AST.copyValue(LI, V);
   }
-  void instructionDeleted(Instruction *I) const override { AST.deleteValue(I); }
+  void instructionDeleted(Instruction *I) const override {
+    SafetyInfo.removeInstruction(I);
+    AST.deleteValue(I);
+  }
 };
 
 
@@ -1281,7 +1294,7 @@ bool llvm::promoteLoopAccessesToScalars(
     SmallVectorImpl<BasicBlock *> &ExitBlocks,
     SmallVectorImpl<Instruction *> &InsertPts, PredIteratorCache &PIC,
     LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI,
-    Loop *CurLoop, AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo,
+    Loop *CurLoop, AliasSetTracker *CurAST, ICFLoopSafetyInfo *SafetyInfo,
     OptimizationRemarkEmitter *ORE) {
   // Verify inputs.
   assert(LI != nullptr && DT != nullptr && CurLoop != nullptr &&
@@ -1500,7 +1513,7 @@ bool llvm::promoteLoopAccessesToScalars(
   SSAUpdater SSA(&NewPHIs);
   LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks,
                         InsertPts, PIC, *CurAST, *LI, DL, Alignment,
-                        SawUnorderedAtomic, AATags);
+                        SawUnorderedAtomic, AATags, *SafetyInfo);
 
   // Set up the preheader to have a definition of the value.  It is the live-out
   // value from the preheader that uses in the loop will use.
@@ -1520,7 +1533,7 @@ bool llvm::promoteLoopAccessesToScalars(
 
   // If the SSAUpdater didn't use the load in the preheader, just zap it now.
   if (PreheaderLoad->use_empty())
-    eraseInstruction(*PreheaderLoad, CurAST);
+    eraseInstruction(*PreheaderLoad, *SafetyInfo, CurAST);
 
   return true;
 }
diff --git a/test/CodeGen/AMDGPU/build-vector-insert-elt-infloop.ll b/test/CodeGen/AMDGPU/build-vector-insert-elt-infloop.ll
index 865dccb2791..fd81c0438d6 100644
--- a/test/CodeGen/AMDGPU/build-vector-insert-elt-infloop.ll
+++ b/test/CodeGen/AMDGPU/build-vector-insert-elt-infloop.ll
@@ -18,7 +18,7 @@ bb1:
   %tmp4 = bitcast half %tmp3 to i16
   %tmp5 = insertelement <2 x i16> <i16 0, i16 undef>, i16 %tmp4, i32 1
   %tmp6 = bitcast i16* %arg to half*
-  store half %tmp2, half* %tmp6, align 2
+  store volatile half %tmp2, half* %tmp6, align 2
   %tmp7 = bitcast <2 x i16> %tmp to <2 x half>
   %tmp8 = extractelement <2 x half> %tmp7, i32 0
   br label %bb1
diff --git a/test/Transforms/LICM/guards.ll b/test/Transforms/LICM/guards.ll
index b2f672104f8..b37c4189284 100644
--- a/test/Transforms/LICM/guards.ll
+++ b/test/Transforms/LICM/guards.ll
@@ -85,15 +85,15 @@ loop:
 }
 
 
-; TODO: We can also hoist this load and guard from mustexec non-header block.
+; TODO: We can also hoist this guard from mustexec non-header block.
 define void @test4(i1 %c, i32* %p) {
 
 ; CHECK-LABEL: @test4(
 ; CHECK-LABEL: entry:
-; CHECK-LABEL: loop:
-; CHECK-LABEL: backedge:
 ; CHECK:       %a = load i32, i32* %p
 ; CHECK:       %invariant_cond = icmp ne i32 %a, 100
+; CHECK-LABEL: loop:
+; CHECK-LABEL: backedge:
 ; CHECK:       call void (i1, ...) @llvm.experimental.guard(i1 %invariant_cond)
 
 entry:
diff --git a/test/Transforms/LICM/hoist-mustexec.ll b/test/Transforms/LICM/hoist-mustexec.ll
index 5bce1fbce1c..53f78e88f72 100644
--- a/test/Transforms/LICM/hoist-mustexec.ll
+++ b/test/Transforms/LICM/hoist-mustexec.ll
@@ -456,3 +456,150 @@ backedge:
 exit:
   ret void
 }
+
+; Check that we can hoist a mustexecute load from backedge even if something
+; throws after it.
+define void @test_hoist_from_backedge_01(i32* %p, i32 %n) {
+
+; CHECK-LABEL: @test_hoist_from_backedge_01(
+; CHECK:       entry:
+; CHECK-NEXT:  %load = load i32, i32* %p
+; CHECK-NOT:   load i32
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+  %dummy = phi i32 [ 0, %entry ], [ %merge, %backedge ]
+  %cond = icmp slt i32 %iv, %n
+  br i1 %cond, label %if.true, label %if.false
+
+if.true:
+  %a = add i32 %iv, %iv
+  br label %backedge
+
+if.false:
+  %b = mul i32 %iv, %iv
+  br label %backedge
+
+backedge:
+  %merge = phi i32 [ %a, %if.true ], [ %b, %if.false ]
+  %iv.next = add i32 %iv, %merge
+  %load = load i32, i32* %p
+  call void @may_throw()
+  %loop.cond = icmp ult i32 %iv.next, %load
+  br i1 %loop.cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; Check that we don't hoist the load if something before it can throw.
+define void @test_hoist_from_backedge_02(i32* %p, i32 %n) {
+
+; CHECK-LABEL: @test_hoist_from_backedge_02(
+; CHECK:       entry:
+; CHECK:       loop:
+; CHECK:       %load = load i32, i32* %p
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+  %dummy = phi i32 [ 0, %entry ], [ %merge, %backedge ]
+  %cond = icmp slt i32 %iv, %n
+  br i1 %cond, label %if.true, label %if.false
+
+if.true:
+  %a = add i32 %iv, %iv
+  br label %backedge
+
+if.false:
+  %b = mul i32 %iv, %iv
+  br label %backedge
+
+backedge:
+  %merge = phi i32 [ %a, %if.true ], [ %b, %if.false ]
+  %iv.next = add i32 %iv, %merge
+  call void @may_throw()
+  %load = load i32, i32* %p
+  %loop.cond = icmp ult i32 %iv.next, %load
+  br i1 %loop.cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @test_hoist_from_backedge_03(i32* %p, i32 %n) {
+
+; CHECK-LABEL: @test_hoist_from_backedge_03(
+; CHECK:       entry:
+; CHECK:       loop:
+; CHECK:       %load = load i32, i32* %p
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+  %dummy = phi i32 [ 0, %entry ], [ %merge, %backedge ]
+  %cond = icmp slt i32 %iv, %n
+  br i1 %cond, label %if.true, label %if.false
+
+if.true:
+  %a = add i32 %iv, %iv
+  br label %backedge
+
+if.false:
+  %b = mul i32 %iv, %iv
+  call void @may_throw()
+  br label %backedge
+
+backedge:
+  %merge = phi i32 [ %a, %if.true ], [ %b, %if.false ]
+  %iv.next = add i32 %iv, %merge
+  %load = load i32, i32* %p
+  %loop.cond = icmp ult i32 %iv.next, %load
+  br i1 %loop.cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @test_hoist_from_backedge_04(i32* %p, i32 %n) {
+
+; CHECK-LABEL: @test_hoist_from_backedge_04(
+; CHECK:       entry:
+; CHECK:       loop:
+; CHECK:       %load = load i32, i32* %p
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+  %dummy = phi i32 [ 0, %entry ], [ %merge, %backedge ]
+  call void @may_throw()
+  %cond = icmp slt i32 %iv, %n
+  br i1 %cond, label %if.true, label %if.false
+
+if.true:
+  %a = add i32 %iv, %iv
+  br label %backedge
+
+if.false:
+  %b = mul i32 %iv, %iv
+  br label %backedge
+
+backedge:
+  %merge = phi i32 [ %a, %if.true ], [ %b, %if.false ]
+  %iv.next = add i32 %iv, %merge
+  %load = load i32, i32* %p
+  %loop.cond = icmp ult i32 %iv.next, %load
+  br i1 %loop.cond, label %loop, label %exit
+
+exit:
+  ret void
+}
diff --git a/test/Transforms/LICM/hoist-nounwind.ll b/test/Transforms/LICM/hoist-nounwind.ll
index 9fc4903b830..d53e4043af1 100644
--- a/test/Transforms/LICM/hoist-nounwind.ll
+++ b/test/Transforms/LICM/hoist-nounwind.ll
@@ -49,14 +49,16 @@ for.cond.cleanup:
   ret i32 0
 }
 
-; Don't hoist load past volatile load.
+; Hoist a non-volatile load past volatile load.
 define i32 @test3(i32* noalias nocapture readonly %a, i32* %v) nounwind uwtable {
 ; CHECK-LABEL: @test3(
 entry:
   br label %for.body
 
+; CHECK: load i32
+; CHECK: for.body:
 ; CHECK: load volatile i32
-; CHECK-NEXT: load i32
+; CHECK-NOT: load
 for.body:
   %i.06 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
   %x.05 = phi i32 [ 0, %entry ], [ %add, %for.body ]
@@ -70,3 +72,26 @@ for.body:
 for.cond.cleanup:
   ret i32 %add
 }
+
+; Don't a volatile load past volatile load.
+define i32 @test4(i32* noalias nocapture readonly %a, i32* %v) nounwind uwtable {
+; CHECK-LABEL: @test4(
+entry:
+  br label %for.body
+
+; CHECK: for.body:
+; CHECK: load volatile i32
+; CHECK-NEXT: load volatile i32
+for.body:
+  %i.06 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %x.05 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %xxx = load volatile i32, i32* %v, align 4
+  %i1 = load volatile i32, i32* %a, align 4
+  %add = add nsw i32 %i1, %x.05
+  %inc = add nuw nsw i32 %i.06, 1
+  %exitcond = icmp eq i32 %inc, 1000
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret i32 %add
+}
\ No newline at end of file
diff --git a/test/Transforms/LICM/preheader-safe.ll b/test/Transforms/LICM/preheader-safe.ll
index 0bfe123862c..03a7258df11 100644
--- a/test/Transforms/LICM/preheader-safe.ll
+++ b/test/Transforms/LICM/preheader-safe.ll
@@ -112,11 +112,31 @@ loop-if:
 exit:
   ret void
 }
+
+; Positive test - can hoist something that happens before thrower.
+define void @nothrow_header_pos(i64 %x, i64 %y, i1 %cond) {
+; CHECK-LABEL: nothrow_header_pos
+; CHECK-LABEL: entry
+; CHECK: %div = udiv i64 %x, %y
+; CHECK-LABEL: loop
+; CHECK: call void @use(i64 %div)
+entry:
+  br label %loop
+loop:                                         ; preds = %entry, %for.inc
+  br label %loop-if
+loop-if:
+  %div = udiv i64 %x, %y
+  call void @use(i64 %div)
+  br label %loop
+}
+
+
 ; Negative test - can't move out of throwing block
 define void @nothrow_header_neg(i64 %x, i64 %y, i1 %cond) {
 ; CHECK-LABEL: nothrow_header_neg
 ; CHECK-LABEL: entry
 ; CHECK-LABEL: loop
+; CHECK: call void @maythrow()
 ; CHECK: %div = udiv i64 %x, %y
 ; CHECK: call void @use(i64 %div)
 entry:
@@ -124,6 +144,7 @@ entry:
 loop:                                         ; preds = %entry, %for.inc
   br label %loop-if
 loop-if:
+  call void @maythrow()
   %div = udiv i64 %x, %y
   call void @use(i64 %div)
   br label %loop
-- 
GitLab


From eb4aa4bedfad6c650dd782f367531f7eedfedb19 Mon Sep 17 00:00:00 2001
From: Zi Xuan Wu <wuzish@cn.ibm.com>
Date: Tue, 6 Nov 2018 03:07:03 +0000
Subject: [PATCH 0997/1116] It's a test commit, which is my first commit and
 also add my name to CREDITS.TXT

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346202 91177308-0d34-0410-b5e6-96231b3b80d8
---
 CREDITS.TXT | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CREDITS.TXT b/CREDITS.TXT
index cde8a441cac..e279701f57d 100644
--- a/CREDITS.TXT
+++ b/CREDITS.TXT
@@ -506,3 +506,7 @@ D: PowerPC Backend Developer
 N: Li Jia He
 E: hljhehlj@cn.ibm.com
 D: PowerPC Backend Developer
+
+N: Zixuan Wu
+E: wuzish@cn.ibm.com
+D: PowerPC Backend Developer
-- 
GitLab


From 326c246381338701159cc3a4ddccd5515e0d3833 Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze@braunis.de>
Date: Tue, 6 Nov 2018 03:15:22 +0000
Subject: [PATCH 0998/1116] AArch64: Cleanup CCMP code; NFC

Cleanup CCMP pattern matching code in preparation for review/bugfix:
- Rename `isConjunctionDisjunctionTree()` to `canEmitConjunction()`
  (it won't accept arbitrary disjunctions and is really about whether we
   can transform the subtree into a conjunction that we can emit).
- Rename `emitConjunctionDisjunctionTree()` to `emitConjunction()`

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346203 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64ISelLowering.cpp | 59 +++++++++++-----------
 1 file changed, 30 insertions(+), 29 deletions(-)

diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 101e20c8f20..d5d6d5ca23e 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1521,7 +1521,7 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
 /// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
 /// a comparison. They set the NZCV flags to a predefined value if their
 /// predicate is false. This allows to express arbitrary conjunctions, for
-/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B))))"
+/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
 /// expressed as:
 ///   cmp A
 ///   ccmp B, inv(CB), CA
@@ -1591,14 +1591,12 @@ static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
   return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
 }
 
-/// Returns true if @p Val is a tree of AND/OR/SETCC operations.
-/// CanPushNegate is set to true if we can push a negate operation through
-/// the tree in a was that we are left with AND operations and negate operations
-/// at the leafs only. i.e. "not (or (or x y) z)" can be changed to
-/// "and (and (not x) (not y)) (not z)"; "not (or (and x y) z)" cannot be
-/// brought into such a form.
-static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanNegate,
-                                         unsigned Depth = 0) {
+/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
+/// expressed as a conjunction. See \ref AArch64CCMP.
+/// \param CanNegate        Set to true if we can also emit the negation of the
+///                         tree as a conjunction.
+static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
+                               unsigned Depth = 0) {
   if (!Val.hasOneUse())
     return false;
   unsigned Opcode = Val->getOpcode();
@@ -1615,10 +1613,10 @@ static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanNegate,
     SDValue O0 = Val->getOperand(0);
     SDValue O1 = Val->getOperand(1);
     bool CanNegateL;
-    if (!isConjunctionDisjunctionTree(O0, CanNegateL, Depth+1))
+    if (!canEmitConjunction(O0, CanNegateL, Depth+1))
       return false;
     bool CanNegateR;
-    if (!isConjunctionDisjunctionTree(O1, CanNegateR, Depth+1))
+    if (!canEmitConjunction(O1, CanNegateR, Depth+1))
       return false;
 
     if (Opcode == ISD::OR) {
@@ -1626,8 +1624,11 @@ static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanNegate,
       // we cannot do the transformation at all.
       if (!CanNegateL && !CanNegateR)
         return false;
-      // We can however change a (not (or x y)) to (and (not x) (not y)) if we
-      // can negate the x and y subtrees.
+      // However if we can negate x and y, then we can change
+      // (not (or x y))
+      // into
+      // (and (not x) (not y))
+      // to eliminate the outer negation.
       CanNegate = CanNegateL && CanNegateR;
     } else {
       // If the operands are OR expressions then we finally need to negate their
@@ -1637,7 +1638,7 @@ static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanNegate,
       bool NeedsNegOutR = O1->getOpcode() == ISD::OR;
       if (NeedsNegOutL && NeedsNegOutR)
         return false;
-      // We cannot negate an AND operation (it would become an OR),
+      // We cannot negate an AND operation.
       CanNegate = false;
     }
     return true;
@@ -1655,7 +1656,7 @@ static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanNegate,
 /// effects pushed to the tree leafs; @p Predicate is an NZCV flag predicate
 /// for the comparisons in the current subtree; @p Depth limits the search
 /// depth to avoid stack overflow.
-static SDValue emitConjunctionDisjunctionTreeRec(SelectionDAG &DAG, SDValue Val,
+static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
     AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
     AArch64CC::CondCode Predicate) {
   // We're at a tree leaf, produce a conditional comparison operation.
@@ -1712,13 +1713,13 @@ static SDValue emitConjunctionDisjunctionTreeRec(SelectionDAG &DAG, SDValue Val,
   if (NegateOpsAndResult) {
     // See which side we can negate.
     bool CanNegateL;
-    bool isValidL = isConjunctionDisjunctionTree(LHS, CanNegateL);
+    bool isValidL = canEmitConjunction(LHS, CanNegateL);
     assert(isValidL && "Valid conjunction/disjunction tree");
     (void)isValidL;
 
 #ifndef NDEBUG
     bool CanNegateR;
-    bool isValidR = isConjunctionDisjunctionTree(RHS, CanNegateR);
+    bool isValidR = canEmitConjunction(RHS, CanNegateR);
     assert(isValidR && "Valid conjunction/disjunction tree");
     assert((CanNegateL || CanNegateR) && "Valid conjunction/disjunction tree");
 #endif
@@ -1740,12 +1741,12 @@ static SDValue emitConjunctionDisjunctionTreeRec(SelectionDAG &DAG, SDValue Val,
   // through if we are already in a PushNegate case, otherwise we can negate
   // the "flags to test" afterwards.
   AArch64CC::CondCode RHSCC;
-  SDValue CmpR = emitConjunctionDisjunctionTreeRec(DAG, RHS, RHSCC, Negate,
+  SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, Negate,
                                                    CCOp, Predicate);
   if (NegateOpsAndResult && !Negate)
     RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
   // Emit LHS. We may need to negate it.
-  SDValue CmpL = emitConjunctionDisjunctionTreeRec(DAG, LHS, OutCC,
+  SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC,
                                                    NegateOpsAndResult, CmpR,
                                                    RHSCC);
   // If we transformed an OR to and AND then we have to negate the result
@@ -1755,17 +1756,17 @@ static SDValue emitConjunctionDisjunctionTreeRec(SelectionDAG &DAG, SDValue Val,
   return CmpL;
 }
 
-/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
-/// of CCMP/CFCMP ops. See @ref AArch64CCMP.
-/// \see emitConjunctionDisjunctionTreeRec().
-static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val,
-                                              AArch64CC::CondCode &OutCC) {
-  bool CanNegate;
-  if (!isConjunctionDisjunctionTree(Val, CanNegate))
+/// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
+/// In some cases this is even possible with OR operations in the expression.
+/// See \ref AArch64CCMP.
+/// \see emitConjunctionRec().
+static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
+                               AArch64CC::CondCode &OutCC) {
+  bool DummyCanNegate;
+  if (!canEmitConjunction(Val, DummyCanNegate))
     return SDValue();
 
-  return emitConjunctionDisjunctionTreeRec(DAG, Val, OutCC, false, SDValue(),
-                                           AArch64CC::AL);
+  return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
 }
 
 /// @}
@@ -1922,7 +1923,7 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
     }
 
     if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) {
-      if ((Cmp = emitConjunctionDisjunctionTree(DAG, LHS, AArch64CC))) {
+      if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
         if ((CC == ISD::SETNE) ^ RHSC->isNullValue())
           AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
       }
-- 
GitLab


From bd5ea2b6c23caea7144b59163ea74121c4aeab3f Mon Sep 17 00:00:00 2001
From: Max Kazantsev <max.kazantsev@azul.com>
Date: Tue, 6 Nov 2018 04:17:40 +0000
Subject: [PATCH 0999/1116] [LICM] Remove too conservative IsMustExecute
 variable

LICM relies on variable `MustExecute` which is conservatively set to `false`
in all non-headers. It is used when we decide whether or not we want to hoist
an instruction or a guard.

For the guards, it might be too conservative to use this variable, we can
instead use a more precise logic from LoopSafetyInfo. Currently it is only NFC
because `IsMemoryNotModified` is also conservatively set to `false` for all
non-headers, and we cannot hoist guards from non-header blocks. However once we
give up using `IsMemoryNotModified` and use a smarter check instead, this will
allow us to hoist guards from all mustexecute non-header blocks.

Differential Revision: https://reviews.llvm.org/D50888
Reveiwed By: fedor.sergeev


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346204 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/LICM.cpp | 23 ++++++++---------------
 1 file changed, 8 insertions(+), 15 deletions(-)

diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index d6dfdc7efed..7789cb92345 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -460,14 +460,10 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
     if (inSubLoop(BB, CurLoop, LI))
       continue;
 
-    // Keep track of whether the prefix of instructions visited so far are such
-    // that the next instruction visited is guaranteed to execute if the loop
-    // is entered.
-    bool IsMustExecute = CurLoop->getHeader() == BB;
     // Keep track of whether the prefix instructions could have written memory.
-    // TODO: This and IsMustExecute may be done smarter if we keep track of all
-    // throwing and mem-writing operations in every block, e.g. using something
-    // similar to isGuaranteedToExecute.
+    // TODO: This may be done smarter if we keep track of all throwing and
+    // mem-writing operations in every block, e.g. using something similar to
+    // isGuaranteedToExecute.
     bool IsMemoryNotModified = CurLoop->getHeader() == BB;
 
     for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) {
@@ -493,10 +489,9 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
       //
       if (CurLoop->hasLoopInvariantOperands(&I) &&
           canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, true, ORE) &&
-          (IsMustExecute ||
-           isSafeToExecuteUnconditionally(
-               I, DT, CurLoop, SafetyInfo, ORE,
-               CurLoop->getLoopPreheader()->getTerminator()))) {
+          isSafeToExecuteUnconditionally(
+              I, DT, CurLoop, SafetyInfo, ORE,
+              CurLoop->getLoopPreheader()->getTerminator())) {
         hoist(I, DT, CurLoop, SafetyInfo, ORE);
         Changed = true;
         continue;
@@ -531,15 +526,13 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
       if (((I.use_empty() &&
             match(&I, m_Intrinsic<Intrinsic::invariant_start>())) ||
            isGuard(&I)) &&
-          IsMustExecute && IsMemoryNotModified &&
-          CurLoop->hasLoopInvariantOperands(&I)) {
+          IsMemoryNotModified && CurLoop->hasLoopInvariantOperands(&I) &&
+          SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop)) {
         hoist(I, DT, CurLoop, SafetyInfo, ORE);
         Changed = true;
         continue;
       }
 
-      if (IsMustExecute)
-        IsMustExecute = isGuaranteedToTransferExecutionToSuccessor(&I);
       if (IsMemoryNotModified)
         IsMemoryNotModified = !I.mayWriteToMemory();
     }
-- 
GitLab


From b92b0b860c768cc082eac0c6bcf89edd5a3a8913 Mon Sep 17 00:00:00 2001
From: Dean Michael Berris <dberris@google.com>
Date: Tue, 6 Nov 2018 08:51:37 +0000
Subject: [PATCH 1000/1116] [XRay] Update XRayRecord to support Custom/Typed
 Events

Summary:
This change cuts across LLVM and compiler-rt to add support for
rendering custom events in the XRayRecord type, to allow for including
user-provided annotations in the output YAML (as raw bytes).

This work enables us to add custom event and typed event records into
the `llvm::xray::Trace` type for user-provided events. This can then be
programmatically handled through the C++ API and can be included in some
of the tooling as well. For now we support printing the raw data we
encounter in the custom events in the converted output.

Future work will allow us to start interpreting these custom and typed
events through a yet-to-be-defined API for extending the trace analysis
library.

Reviewers: mboerger

Subscribers: hiraditya, llvm-commits

Differential Revision: https://reviews.llvm.org/D54139

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346214 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/XRay/FDRTraceExpander.h          |  4 +--
 include/llvm/XRay/XRayRecord.h                | 23 ++++++++++--
 include/llvm/XRay/YAMLXRayRecord.h            |  9 +++--
 lib/XRay/FDRTraceExpander.cpp                 | 20 ++++++++---
 lib/XRay/Profile.cpp                          |  6 ++++
 lib/XRay/RecordPrinter.cpp                    |  4 +++
 lib/XRay/Trace.cpp                            |  5 +--
 .../X86/convert-basic-arg1-to-yaml.txt        |  8 ++---
 ...onvert-basic-log-arg1-version3-to-yaml.txt | 12 +++----
 .../convert-basic-log-version3-to-yaml.txt    | 16 ++++-----
 .../X86/convert-fdr-arg1-to-yaml.txt          |  4 +--
 .../convert-fdr-log-arg1-version3-to-yaml.txt | 12 +++----
 .../X86/convert-fdr-log-version3-to-yaml.txt  | 12 +++----
 .../llvm-xray/X86/convert-fdr-to-yaml.txt     | 26 +++++++-------
 .../llvm-xray/X86/convert-roundtrip.yaml      |  4 +--
 test/tools/llvm-xray/X86/convert-to-yaml.txt  | 12 +++----
 .../llvm-xray/X86/convert-with-debug-syms.txt | 12 +++----
 .../X86/convert-with-standalone-instrmap.txt  | 12 +++----
 .../X86/convert-with-yaml-instrmap.txt        | 12 +++----
 tools/llvm-xray/xray-account.cpp              | 36 ++++++++++++-------
 tools/llvm-xray/xray-converter.cpp            | 21 ++++++++---
 tools/llvm-xray/xray-graph.cpp                |  4 +++
 tools/llvm-xray/xray-stacks.cpp               |  3 ++
 23 files changed, 173 insertions(+), 104 deletions(-)

diff --git a/include/llvm/XRay/FDRTraceExpander.h b/include/llvm/XRay/FDRTraceExpander.h
index 7f8236b82b5..64c459930b2 100644
--- a/include/llvm/XRay/FDRTraceExpander.h
+++ b/include/llvm/XRay/FDRTraceExpander.h
@@ -27,10 +27,10 @@ class TraceExpander : public RecordVisitor {
   int32_t PID = 0;
   int32_t TID = 0;
   uint64_t BaseTSC = 0;
-  XRayRecord CurrentRecord{0, 0, RecordTypes::ENTER, 0, 0, 0, 0, {}};
+  XRayRecord CurrentRecord{0, 0, RecordTypes::ENTER, 0, 0, 0, 0, {}, {}};
   uint16_t CPUId = 0;
   uint16_t LogVersion = 0;
-  bool BuildingFunction = false;
+  bool BuildingRecord = false;
   bool IgnoringRecords = false;
 
   void resetCurrentRecord();
diff --git a/include/llvm/XRay/XRayRecord.h b/include/llvm/XRay/XRayRecord.h
index 76873447f17..7685ec95838 100644
--- a/include/llvm/XRay/XRayRecord.h
+++ b/include/llvm/XRay/XRayRecord.h
@@ -17,6 +17,7 @@
 
 #include <cstdint>
 #include <vector>
+#include <string>
 
 namespace llvm {
 namespace xray {
@@ -54,10 +55,23 @@ struct XRayFileHeader {
 /// This may or may not correspond to actual record types in the raw trace (as
 /// the loader implementation may synthesize this information in the process of
 /// of loading).
-enum class RecordTypes { ENTER, EXIT, TAIL_EXIT, ENTER_ARG };
+enum class RecordTypes {
+  ENTER,
+  EXIT,
+  TAIL_EXIT,
+  ENTER_ARG,
+  CUSTOM_EVENT,
+  TYPED_EVENT
+};
 
+/// An XRayRecord is the denormalized view of data associated in a trace. These
+/// records may not correspond to actual entries in the raw traces, but they are
+/// the logical representation of records in a higher-level event log.
 struct XRayRecord {
-  /// The type of record.
+  /// RecordType values are used as "sub-types" which have meaning in the
+  /// context of the `Type` below. For function call and custom event records,
+  /// the RecordType is always 0, while for typed events we store the type in
+  /// the RecordType field.
   uint16_t RecordType;
 
   /// The CPU where the thread is running. We assume number of CPUs <= 65536.
@@ -66,7 +80,7 @@ struct XRayRecord {
   /// Identifies the type of record.
   RecordTypes Type;
 
-  /// The function ID for the record.
+  /// The function ID for the record, if this is a function call record.
   int32_t FuncId;
 
   /// Get the full 8 bytes of the TSC when we get the log record.
@@ -80,6 +94,9 @@ struct XRayRecord {
 
   /// The function call arguments.
   std::vector<uint64_t> CallArgs;
+
+  /// For custom and typed events, we provide the raw data from the trace.
+  std::string Data;
 };
 
 } // namespace xray
diff --git a/include/llvm/XRay/YAMLXRayRecord.h b/include/llvm/XRay/YAMLXRayRecord.h
index 0de9ea0968e..6150196ed98 100644
--- a/include/llvm/XRay/YAMLXRayRecord.h
+++ b/include/llvm/XRay/YAMLXRayRecord.h
@@ -39,6 +39,7 @@ struct YAMLXRayRecord {
   uint32_t TId;
   uint32_t PId;
   std::vector<uint64_t> CallArgs;
+  std::string Data;
 };
 
 struct YAMLXRayTrace {
@@ -58,6 +59,8 @@ template <> struct ScalarEnumerationTraits<xray::RecordTypes> {
     IO.enumCase(Type, "function-exit", xray::RecordTypes::EXIT);
     IO.enumCase(Type, "function-tail-exit", xray::RecordTypes::TAIL_EXIT);
     IO.enumCase(Type, "function-enter-arg", xray::RecordTypes::ENTER_ARG);
+    IO.enumCase(Type, "custom-event", xray::RecordTypes::CUSTOM_EVENT);
+    IO.enumCase(Type, "typed-event", xray::RecordTypes::TYPED_EVENT);
   }
 };
 
@@ -73,16 +76,16 @@ template <> struct MappingTraits<xray::YAMLXRayFileHeader> {
 
 template <> struct MappingTraits<xray::YAMLXRayRecord> {
   static void mapping(IO &IO, xray::YAMLXRayRecord &Record) {
-    // FIXME: Make this type actually be descriptive
     IO.mapRequired("type", Record.RecordType);
-    IO.mapRequired("func-id", Record.FuncId);
+    IO.mapOptional("func-id", Record.FuncId);
     IO.mapOptional("function", Record.Function);
     IO.mapOptional("args", Record.CallArgs);
     IO.mapRequired("cpu", Record.CPU);
-    IO.mapRequired("thread", Record.TId);
+    IO.mapOptional("thread", Record.TId, 0U);
     IO.mapOptional("process", Record.PId, 0U);
     IO.mapRequired("kind", Record.Type);
     IO.mapRequired("tsc", Record.TSC);
+    IO.mapOptional("data", Record.Data);
   }
 
   static constexpr bool flow = true;
diff --git a/lib/XRay/FDRTraceExpander.cpp b/lib/XRay/FDRTraceExpander.cpp
index 8e15db52ce6..adddb550ecd 100644
--- a/lib/XRay/FDRTraceExpander.cpp
+++ b/lib/XRay/FDRTraceExpander.cpp
@@ -12,10 +12,11 @@ namespace llvm {
 namespace xray {
 
 void TraceExpander::resetCurrentRecord() {
-  if (BuildingFunction)
+  if (BuildingRecord)
     C(CurrentRecord);
-  BuildingFunction = false;
+  BuildingRecord = false;
   CurrentRecord.CallArgs.clear();
+  CurrentRecord.Data.clear();
 }
 
 Error TraceExpander::visit(BufferExtents &) {
@@ -36,9 +37,18 @@ Error TraceExpander::visit(TSCWrapRecord &R) {
   return Error::success();
 }
 
-Error TraceExpander::visit(CustomEventRecord &) {
-  // TODO: Support custom event records in the future.
+Error TraceExpander::visit(CustomEventRecord &R) {
   resetCurrentRecord();
+  if (!IgnoringRecords) {
+    CurrentRecord.TSC = R.tsc();
+    CurrentRecord.CPU = R.cpu();
+    CurrentRecord.PId = PID;
+    CurrentRecord.TId = TID;
+    CurrentRecord.Type = RecordTypes::CUSTOM_EVENT;
+    std::copy(R.data().begin(), R.data().end(),
+              std::back_inserter(CurrentRecord.Data));
+    BuildingRecord = true;
+  }
   return Error::success();
 }
 
@@ -78,7 +88,7 @@ Error TraceExpander::visit(FunctionRecord &R) {
     CurrentRecord.PId = PID;
     CurrentRecord.TId = TID;
     CurrentRecord.CPU = CPUId;
-    BuildingFunction = true;
+    BuildingRecord = true;
   }
   return Error::success();
 }
diff --git a/lib/XRay/Profile.cpp b/lib/XRay/Profile.cpp
index fdd1953ab0f..e8a082884d6 100644
--- a/lib/XRay/Profile.cpp
+++ b/lib/XRay/Profile.cpp
@@ -374,6 +374,12 @@ Expected<Profile> profileFromTrace(const Trace &T) {
       }
 
       break;
+
+    case RecordTypes::CUSTOM_EVENT:
+    case RecordTypes::TYPED_EVENT:
+      // TODO: Support an extension point to allow handling of custom and typed
+      // events in profiles.
+      break;
     }
   }
 
diff --git a/lib/XRay/RecordPrinter.cpp b/lib/XRay/RecordPrinter.cpp
index 0d5ee2de83d..61a292cef85 100644
--- a/lib/XRay/RecordPrinter.cpp
+++ b/lib/XRay/RecordPrinter.cpp
@@ -81,6 +81,10 @@ Error RecordPrinter::visit(FunctionRecord &R) {
     OS << formatv("<Function Tail Exit: #{0} delta = +{1}>", R.functionId(),
                   R.delta());
     break;
+  case RecordTypes::CUSTOM_EVENT:
+  case RecordTypes::TYPED_EVENT:
+    // TODO: Flag as a bug?
+    break;
   }
   OS << Delim;
   return Error::success();
diff --git a/lib/XRay/Trace.cpp b/lib/XRay/Trace.cpp
index e7b878cb83f..37cd147078e 100644
--- a/lib/XRay/Trace.cpp
+++ b/lib/XRay/Trace.cpp
@@ -352,8 +352,9 @@ Error loadYAMLLog(StringRef Data, XRayFileHeader &FileHeader,
   Records.clear();
   std::transform(Trace.Records.begin(), Trace.Records.end(),
                  std::back_inserter(Records), [&](const YAMLXRayRecord &R) {
-                   return XRayRecord{R.RecordType, R.CPU, R.Type, R.FuncId,
-                                     R.TSC,        R.TId, R.PId,  R.CallArgs};
+                   return XRayRecord{R.RecordType, R.CPU,      R.Type,
+                                     R.FuncId,     R.TSC,      R.TId,
+                                     R.PId,        R.CallArgs, R.Data};
                  });
   return Error::success();
 }
diff --git a/test/tools/llvm-xray/X86/convert-basic-arg1-to-yaml.txt b/test/tools/llvm-xray/X86/convert-basic-arg1-to-yaml.txt
index 88a9dc2e58c..52ec12550a3 100644
--- a/test/tools/llvm-xray/X86/convert-basic-arg1-to-yaml.txt
+++ b/test/tools/llvm-xray/X86/convert-basic-arg1-to-yaml.txt
@@ -8,8 +8,8 @@
 ; CHECK-NEXT:   nonstop-tsc:     true
 ; CHECK-NEXT:   cycle-frequency: 3500000000
 ; CHECK-NEXT: records:
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 17, thread: 8715, kind: function-enter, tsc: 22555670288232728 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 17, thread: 8715, kind: function-exit, tsc: 22555670288334784 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', args: [ 1 ], cpu: 17, thread: 8715, kind: function-enter-arg, tsc: 22555670288335768 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 17, thread: 8715, kind: function-exit, tsc: 22555670288365224 }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 17, thread: 8715, kind: function-enter, tsc: 22555670288232728, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 17, thread: 8715, kind: function-exit, tsc: 22555670288334784, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', args: [ 1 ], cpu: 17, thread: 8715, kind: function-enter-arg, tsc: 22555670288335768, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 17, thread: 8715, kind: function-exit, tsc: 22555670288365224, data: '' }
 ; CHECK-NEXT: ...
diff --git a/test/tools/llvm-xray/X86/convert-basic-log-arg1-version3-to-yaml.txt b/test/tools/llvm-xray/X86/convert-basic-log-arg1-version3-to-yaml.txt
index 65232b79ba4..84c757c2b26 100644
--- a/test/tools/llvm-xray/X86/convert-basic-log-arg1-version3-to-yaml.txt
+++ b/test/tools/llvm-xray/X86/convert-basic-log-arg1-version3-to-yaml.txt
@@ -8,10 +8,10 @@
 ; CHECK-NEXT:   nonstop-tsc:     true
 ; CHECK-NEXT:   cycle-frequency: 3900000000
 ; CHECK-NEXT: records:
-; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 0, thread: 2590, process: 2590, kind: function-enter, tsc: 2033303630902004 }
-; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 0, thread: 2590, process: 2590, kind: function-exit, tsc: 2033403115246844 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 0, thread: 2590, process: 2590, kind: function-enter, tsc: 2033490200702516 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 0, thread: 2590, process: 2590, kind: function-exit, tsc: 2033504122687120 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', args: [ 67 ], cpu: 0, thread: 2590, process: 2590, kind: function-enter-arg, tsc: 2033505343905936 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 0, thread: 2590, process: 2590, kind: function-exit, tsc: 2033505343936752 }
+; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 0, thread: 2590, process: 2590, kind: function-enter, tsc: 2033303630902004, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 0, thread: 2590, process: 2590, kind: function-exit, tsc: 2033403115246844, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 0, thread: 2590, process: 2590, kind: function-enter, tsc: 2033490200702516, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 0, thread: 2590, process: 2590, kind: function-exit, tsc: 2033504122687120, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', args: [ 67 ], cpu: 0, thread: 2590, process: 2590, kind: function-enter-arg, tsc: 2033505343905936, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 0, thread: 2590, process: 2590, kind: function-exit, tsc: 2033505343936752, data: '' }
 ; CHECK-NEXT: ...
diff --git a/test/tools/llvm-xray/X86/convert-basic-log-version3-to-yaml.txt b/test/tools/llvm-xray/X86/convert-basic-log-version3-to-yaml.txt
index 21a3b7e4a0f..d2af2fc09c2 100644
--- a/test/tools/llvm-xray/X86/convert-basic-log-version3-to-yaml.txt
+++ b/test/tools/llvm-xray/X86/convert-basic-log-version3-to-yaml.txt
@@ -8,12 +8,12 @@
 ; CHECK-NEXT:   nonstop-tsc:     true
 ; CHECK-NEXT:   cycle-frequency: 3900000000
 ; CHECK-NEXT: records:
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 7, thread: 25518, process: 25518, kind: function-enter, tsc: 2070767347414784 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 7, thread: 25518, process: 25518, kind: function-exit, tsc: 2070767347496472 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 7, thread: 25518, process: 25518, kind: function-enter, tsc: 2070768324320264 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 7, thread: 25518, process: 25518, kind: function-exit, tsc: 2070768324344100 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 7, thread: 25518, process: 25518, kind: function-enter, tsc: 2070768921602152 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 7, thread: 25518, process: 25518, kind: function-exit, tsc: 2070768921625968 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 7, thread: 25518, process: 25518, kind: function-enter, tsc: 2070769627174140 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 7, thread: 25518, process: 25518, kind: function-exit, tsc: 2070769627197624 }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 7, thread: 25518, process: 25518, kind: function-enter, tsc: 2070767347414784, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 7, thread: 25518, process: 25518, kind: function-exit, tsc: 2070767347496472, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 7, thread: 25518, process: 25518, kind: function-enter, tsc: 2070768324320264, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 7, thread: 25518, process: 25518, kind: function-exit, tsc: 2070768324344100, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 7, thread: 25518, process: 25518, kind: function-enter, tsc: 2070768921602152, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 7, thread: 25518, process: 25518, kind: function-exit, tsc: 2070768921625968, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 7, thread: 25518, process: 25518, kind: function-enter, tsc: 2070769627174140, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 7, thread: 25518, process: 25518, kind: function-exit, tsc: 2070769627197624, data: '' }
 ; CHECK-NEXT: ...
diff --git a/test/tools/llvm-xray/X86/convert-fdr-arg1-to-yaml.txt b/test/tools/llvm-xray/X86/convert-fdr-arg1-to-yaml.txt
index 06b5eb8904e..592796434bd 100644
--- a/test/tools/llvm-xray/X86/convert-fdr-arg1-to-yaml.txt
+++ b/test/tools/llvm-xray/X86/convert-fdr-arg1-to-yaml.txt
@@ -8,6 +8,6 @@
 ; CHECK-NEXT:   nonstop-tsc:     true
 ; CHECK-NEXT:   cycle-frequency: 3500000000
 ; CHECK-NEXT: records:
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', args: [ 1 ], cpu: 49, thread: 14648, kind: function-enter-arg, tsc: 18828908666543318 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 49, thread: 14648, kind: function-exit, tsc: 18828908666595604 }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', args: [ 1 ], cpu: 49, thread: 14648, kind: function-enter-arg, tsc: 18828908666543318, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 49, thread: 14648, kind: function-exit, tsc: 18828908666595604, data: '' }
 ; CHECK-NEXT: ...
diff --git a/test/tools/llvm-xray/X86/convert-fdr-log-arg1-version3-to-yaml.txt b/test/tools/llvm-xray/X86/convert-fdr-log-arg1-version3-to-yaml.txt
index a3a3ed6d22b..afeac68fa3d 100644
--- a/test/tools/llvm-xray/X86/convert-fdr-log-arg1-version3-to-yaml.txt
+++ b/test/tools/llvm-xray/X86/convert-fdr-log-arg1-version3-to-yaml.txt
@@ -8,10 +8,10 @@
 ; CHECK-NEXT:   nonstop-tsc:     true
 ; CHECK-NEXT:   cycle-frequency: 3900000000
 ; CHECK-NEXT: records:
-; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 6, thread: 2631, process: 2631, kind: function-enter, tsc: 2034042117104344 }
-; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 6, thread: 2631, process: 2631, kind: function-exit, tsc: 2034042117199088 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 6, thread: 2631, process: 2631, kind: function-enter, tsc: 2034043145686378 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 6, thread: 2631, process: 2631, kind: function-exit, tsc: 2034043145762200 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', args: [ 67 ], cpu: 6, thread: 2631, process: 2631, kind: function-enter-arg, tsc: 2034049739853430 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 6, thread: 2631, process: 2631, kind: function-exit, tsc: 2034049739878154 }
+; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 6, thread: 2631, process: 2631, kind: function-enter, tsc: 2034042117104344, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 6, thread: 2631, process: 2631, kind: function-exit, tsc: 2034042117199088, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 6, thread: 2631, process: 2631, kind: function-enter, tsc: 2034043145686378, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 6, thread: 2631, process: 2631, kind: function-exit, tsc: 2034043145762200, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', args: [ 67 ], cpu: 6, thread: 2631, process: 2631, kind: function-enter-arg, tsc: 2034049739853430, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 6, thread: 2631, process: 2631, kind: function-exit, tsc: 2034049739878154, data: '' }
 ; CHECK-NEXT: ...
diff --git a/test/tools/llvm-xray/X86/convert-fdr-log-version3-to-yaml.txt b/test/tools/llvm-xray/X86/convert-fdr-log-version3-to-yaml.txt
index 46287b2572c..fc70015c41e 100644
--- a/test/tools/llvm-xray/X86/convert-fdr-log-version3-to-yaml.txt
+++ b/test/tools/llvm-xray/X86/convert-fdr-log-version3-to-yaml.txt
@@ -8,10 +8,10 @@
 ; CHECK-NEXT:   nonstop-tsc:     true
 ; CHECK-NEXT:   cycle-frequency: 3900000000
 ; CHECK-NEXT: records:
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 4, thread: 25190, process: 25190, kind: function-enter, tsc: 2069294857657498 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 4, thread: 25190, process: 25190, kind: function-exit, tsc: 2069294857707502 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 4, thread: 25190, process: 25190, kind: function-enter, tsc: 2069295590705912 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 4, thread: 25190, process: 25190, kind: function-exit, tsc: 2069295590734308 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 4, thread: 25190, process: 25190, kind: function-enter, tsc: 2069296377598128 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 4, thread: 25190, process: 25190, kind: function-exit, tsc: 2069296377627032 }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 4, thread: 25190, process: 25190, kind: function-enter, tsc: 2069294857657498, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 4, thread: 25190, process: 25190, kind: function-exit, tsc: 2069294857707502, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 4, thread: 25190, process: 25190, kind: function-enter, tsc: 2069295590705912, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 4, thread: 25190, process: 25190, kind: function-exit, tsc: 2069295590734308, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 4, thread: 25190, process: 25190, kind: function-enter, tsc: 2069296377598128, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 4, thread: 25190, process: 25190, kind: function-exit, tsc: 2069296377627032, data: '' }
 ; CHECK-NEXT: ...
diff --git a/test/tools/llvm-xray/X86/convert-fdr-to-yaml.txt b/test/tools/llvm-xray/X86/convert-fdr-to-yaml.txt
index 731ab3083d2..99bc7e11b97 100644
--- a/test/tools/llvm-xray/X86/convert-fdr-to-yaml.txt
+++ b/test/tools/llvm-xray/X86/convert-fdr-to-yaml.txt
@@ -8,17 +8,17 @@
 ; CHECK-NEXT:   nonstop-tsc:     true
 ; CHECK-NEXT:   cycle-frequency: 5678
 ; CHECK-NEXT: records:
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 5, thread: 5, kind: function-enter, tsc: 7238225556407340 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 5, thread: 5, kind: function-exit, tsc: 7238225556407346 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 5, thread: 5, kind: function-enter, tsc: 7238225556407347 }
-; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 5, thread: 5, kind: function-enter, tsc: 7238225556407387 }
-; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 5, thread: 5, kind: function-exit, tsc: 7238225556407437 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 5, thread: 5, kind: function-exit, tsc: 7238225556407467 }
-; CHECK-NEXT:   - { type: 0, func-id: 4, function: '4', cpu: 5, thread: 5, kind: function-enter, tsc: 7238225556407492 }
-; CHECK-NEXT:   - { type: 0, func-id: 5, function: '5', cpu: 5, thread: 5, kind: function-enter, tsc: 7238225556407517 }
-; CHECK-NEXT:   - { type: 0, func-id: 5, function: '5', cpu: 5, thread: 5, kind: function-tail-exit, tsc: 7238225556407542 }
-; CHECK-NEXT:   - { type: 0, func-id: 268435455, function: '268435455', cpu: 5, thread: 5, kind: function-enter, tsc: 7238225556407552 }
-; CHECK-NEXT:   - { type: 0, func-id: 268435455, function: '268435455', cpu: 5, thread: 5, kind: function-exit, tsc: 7238225556407562 }
-; CHECK-NEXT:   - { type: 0, func-id: 6, function: '6', cpu: 6, thread: 5, kind: function-enter, tsc: 7238225556407682 }
-; CHECK-NEXT:   - { type: 0, func-id: 6, function: '6', cpu: 6, thread: 5, kind: function-exit, tsc: 7238225556407755 }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 5, thread: 5, kind: function-enter, tsc: 7238225556407340, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 5, thread: 5, kind: function-exit, tsc: 7238225556407346, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 5, thread: 5, kind: function-enter, tsc: 7238225556407347, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 5, thread: 5, kind: function-enter, tsc: 7238225556407387, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 5, thread: 5, kind: function-exit, tsc: 7238225556407437, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 5, thread: 5, kind: function-exit, tsc: 7238225556407467, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 4, function: '4', cpu: 5, thread: 5, kind: function-enter, tsc: 7238225556407492, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 5, function: '5', cpu: 5, thread: 5, kind: function-enter, tsc: 7238225556407517, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 5, function: '5', cpu: 5, thread: 5, kind: function-tail-exit, tsc: 7238225556407542, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 268435455, function: '268435455', cpu: 5, thread: 5, kind: function-enter, tsc: 7238225556407552, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 268435455, function: '268435455', cpu: 5, thread: 5, kind: function-exit, tsc: 7238225556407562, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 6, function: '6', cpu: 6, thread: 5, kind: function-enter, tsc: 7238225556407682, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 6, function: '6', cpu: 6, thread: 5, kind: function-exit, tsc: 7238225556407755, data: '' }
 ; CHECK-NEXT: ...
diff --git a/test/tools/llvm-xray/X86/convert-roundtrip.yaml b/test/tools/llvm-xray/X86/convert-roundtrip.yaml
index 4c5dfd18148..bbebd67e576 100644
--- a/test/tools/llvm-xray/X86/convert-roundtrip.yaml
+++ b/test/tools/llvm-xray/X86/convert-roundtrip.yaml
@@ -19,6 +19,6 @@ records:
 #CHECK-NEXT:    nonstop-tsc: true
 #CHECK-NEXT:    cycle-frequency: 2601000000
 #CHECK-NEXT:  records:
-#CHECK-NEXT:    - { type: 0, func-id: 1, function: '1', cpu: 1, thread: 111, kind: function-enter, tsc: 10001 }
-#CHECK-NEXT:    - { type: 0, func-id: 1, function: '1', cpu: 1, thread: 111, kind: function-exit, tsc: 10100 }
+#CHECK-NEXT:    - { type: 0, func-id: 1, function: '1', cpu: 1, thread: 111, kind: function-enter, tsc: 10001, data: '' }
+#CHECK-NEXT:    - { type: 0, func-id: 1, function: '1', cpu: 1, thread: 111, kind: function-exit, tsc: 10100, data: '' }
 #CHECK-NEXT:  ...
diff --git a/test/tools/llvm-xray/X86/convert-to-yaml.txt b/test/tools/llvm-xray/X86/convert-to-yaml.txt
index 66a5618e12f..f807fae3a64 100644
--- a/test/tools/llvm-xray/X86/convert-to-yaml.txt
+++ b/test/tools/llvm-xray/X86/convert-to-yaml.txt
@@ -8,10 +8,10 @@
 ; CHECK-NEXT:   nonstop-tsc:     true
 ; CHECK-NEXT:   cycle-frequency: 2601000000
 ; CHECK-NEXT: records:
-; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841453914 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841454542 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841454670 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841454762 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841454802 }
-; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841494828 }
+; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841453914, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841454542, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841454670, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841454762, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841454802, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841494828, data: '' }
 ; CHECK-NEXT: ...
diff --git a/test/tools/llvm-xray/X86/convert-with-debug-syms.txt b/test/tools/llvm-xray/X86/convert-with-debug-syms.txt
index 76cee99d4b5..dbb98e3d3cf 100644
--- a/test/tools/llvm-xray/X86/convert-with-debug-syms.txt
+++ b/test/tools/llvm-xray/X86/convert-with-debug-syms.txt
@@ -8,10 +8,10 @@
 ; CHECK-NEXT:   nonstop-tsc:     true
 ; CHECK-NEXT:   cycle-frequency: 2601000000
 ; CHECK-NEXT: records:
-; CHECK-NEXT:   - { type: 0, func-id: 3, function: main, cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841453914 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: {{.*foo.*}}, cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841454542 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: {{.*foo.*}}, cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841454670 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: {{.*bar.*}}, cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841454762 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: {{.*bar.*}}, cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841454802 }
-; CHECK-NEXT:   - { type: 0, func-id: 3, function: main, cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841494828 }
+; CHECK-NEXT:   - { type: 0, func-id: 3, function: main, cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841453914, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: {{.*foo.*}}, cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841454542, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: {{.*foo.*}}, cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841454670, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: {{.*bar.*}}, cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841454762, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: {{.*bar.*}}, cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841454802, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 3, function: main, cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841494828, data: '' }
 ; CHECK-NEXT: ...
diff --git a/test/tools/llvm-xray/X86/convert-with-standalone-instrmap.txt b/test/tools/llvm-xray/X86/convert-with-standalone-instrmap.txt
index 700fa38ed38..9a121825656 100644
--- a/test/tools/llvm-xray/X86/convert-with-standalone-instrmap.txt
+++ b/test/tools/llvm-xray/X86/convert-with-standalone-instrmap.txt
@@ -8,10 +8,10 @@
 ; CHECK-NEXT:   nonstop-tsc:     true
 ; CHECK-NEXT:   cycle-frequency: 2601000000
 ; CHECK-NEXT: records:
-; CHECK-NEXT:   - { type: 0, func-id: 3, function: '@(41caa0)', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841453914 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '@(41ca70)', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841454542 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '@(41ca70)', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841454670 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '@(41ca40)', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841454762 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '@(41ca40)', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841454802 }
-; CHECK-NEXT:   - { type: 0, func-id: 3, function: '@(41caa0)', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841494828 }
+; CHECK-NEXT:   - { type: 0, func-id: 3, function: '@(41caa0)', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841453914, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '@(41ca70)', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841454542, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '@(41ca70)', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841454670, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '@(41ca40)', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841454762, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '@(41ca40)', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841454802, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 3, function: '@(41caa0)', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841494828, data: '' }
 ; CHECK-NEXT: ...
diff --git a/test/tools/llvm-xray/X86/convert-with-yaml-instrmap.txt b/test/tools/llvm-xray/X86/convert-with-yaml-instrmap.txt
index 6837072a1fc..1efcb3572ba 100644
--- a/test/tools/llvm-xray/X86/convert-with-yaml-instrmap.txt
+++ b/test/tools/llvm-xray/X86/convert-with-yaml-instrmap.txt
@@ -8,10 +8,10 @@
 ; CHECK-NEXT:   nonstop-tsc:     true
 ; CHECK-NEXT:   cycle-frequency: 2601000000
 ; CHECK-NEXT: records:
-; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841453914 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841454542 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841454670 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841454762 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841454802 }
-; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841494828 }
+; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841453914, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841454542, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841454670, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841454762, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841454802, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841494828, data: '' }
 ; CHECK-NEXT: ...
diff --git a/tools/llvm-xray/xray-account.cpp b/tools/llvm-xray/xray-account.cpp
index 93bb271b328..3f01605fd85 100644
--- a/tools/llvm-xray/xray-account.cpp
+++ b/tools/llvm-xray/xray-account.cpp
@@ -146,6 +146,10 @@ bool LatencyAccountant::accountRecord(const XRayRecord &Record) {
 
   auto &ThreadStack = PerThreadFunctionStack[Record.TId];
   switch (Record.Type) {
+  case RecordTypes::CUSTOM_EVENT:
+  case RecordTypes::TYPED_EVENT:
+    // TODO: Support custom and typed event accounting in the future.
+    return true;
   case RecordTypes::ENTER:
   case RecordTypes::ENTER_ARG: {
     ThreadStack.emplace_back(Record.FuncId, Record.TSC);
@@ -417,19 +421,25 @@ namespace llvm {
 template <> struct format_provider<llvm::xray::RecordTypes> {
   static void format(const llvm::xray::RecordTypes &T, raw_ostream &Stream,
                      StringRef Style) {
-    switch(T) {
-      case RecordTypes::ENTER:
-        Stream << "enter";
-        break;
-      case RecordTypes::ENTER_ARG:
-        Stream << "enter-arg";
-        break;
-      case RecordTypes::EXIT:
-        Stream << "exit";
-        break;
-      case RecordTypes::TAIL_EXIT:
-        Stream << "tail-exit";
-        break;
+    switch (T) {
+    case RecordTypes::ENTER:
+      Stream << "enter";
+      break;
+    case RecordTypes::ENTER_ARG:
+      Stream << "enter-arg";
+      break;
+    case RecordTypes::EXIT:
+      Stream << "exit";
+      break;
+    case RecordTypes::TAIL_EXIT:
+      Stream << "tail-exit";
+      break;
+    case RecordTypes::CUSTOM_EVENT:
+      Stream << "custom-event";
+      break;
+    case RecordTypes::TYPED_EVENT:
+      Stream << "typed-event";
+      break;
     }
   }
 };
diff --git a/tools/llvm-xray/xray-converter.cpp b/tools/llvm-xray/xray-converter.cpp
index 1faa49cf431..3f153b99bc9 100644
--- a/tools/llvm-xray/xray-converter.cpp
+++ b/tools/llvm-xray/xray-converter.cpp
@@ -92,9 +92,10 @@ void TraceConverter::exportAsYAML(const Trace &Records, raw_ostream &OS) {
     Trace.Records.push_back({R.RecordType, R.CPU, R.Type, R.FuncId,
                              Symbolize ? FuncIdHelper.SymbolOrNumber(R.FuncId)
                                        : llvm::to_string(R.FuncId),
-                             R.TSC, R.TId, R.PId, R.CallArgs});
+                             R.TSC, R.TId, R.PId, R.CallArgs, R.Data});
   }
   Output Out(OS, nullptr, 0);
+  Out.setWriteDefaultValues(false);
   Out << Trace;
 }
 
@@ -123,21 +124,27 @@ void TraceConverter::exportAsRAWv1(const Trace &Records, raw_ostream &OS) {
   // Then write out the rest of the records, still in an endian-appropriate
   // format.
   for (const auto &R : Records) {
-    Writer.write(R.RecordType);
-    // The on disk naive raw format uses 8 bit CPUs, but the record has 16.
-    // There's no choice but truncation.
-    Writer.write(static_cast<uint8_t>(R.CPU));
     switch (R.Type) {
     case RecordTypes::ENTER:
     case RecordTypes::ENTER_ARG:
+      Writer.write(R.RecordType);
+      Writer.write(static_cast<uint8_t>(R.CPU));
       Writer.write(uint8_t{0});
       break;
     case RecordTypes::EXIT:
+      Writer.write(R.RecordType);
+      Writer.write(static_cast<uint8_t>(R.CPU));
       Writer.write(uint8_t{1});
       break;
     case RecordTypes::TAIL_EXIT:
+      Writer.write(R.RecordType);
+      Writer.write(static_cast<uint8_t>(R.CPU));
       Writer.write(uint8_t{2});
       break;
+    case RecordTypes::CUSTOM_EVENT:
+    case RecordTypes::TYPED_EVENT:
+      // Skip custom and typed event records for v1 logs.
+      continue;
     }
     Writer.write(R.FuncId);
     Writer.write(R.TSC);
@@ -264,6 +271,10 @@ void TraceConverter::exportAsChromeTraceEventFormat(const Trace &Records,
     double EventTimestampUs = double(1000000) / CycleFreq * double(R.TSC);
     StackTrieNode *&StackCursor = StackCursorByThreadId[R.TId];
     switch (R.Type) {
+    case RecordTypes::CUSTOM_EVENT:
+    case RecordTypes::TYPED_EVENT:
+      // TODO: Support typed and custom event rendering on Chrome Trace Viewer.
+      break;
     case RecordTypes::ENTER:
     case RecordTypes::ENTER_ARG:
       StackCursor = findOrCreateStackNode(StackCursor, R.FuncId, R.TId,
diff --git a/tools/llvm-xray/xray-graph.cpp b/tools/llvm-xray/xray-graph.cpp
index c619bf86299..fe49cca20d5 100644
--- a/tools/llvm-xray/xray-graph.cpp
+++ b/tools/llvm-xray/xray-graph.cpp
@@ -246,6 +246,10 @@ Error GraphRenderer::accountRecord(const XRayRecord &Record) {
     updateStat(G[Record.FuncId].S, D);
     break;
   }
+  case RecordTypes::CUSTOM_EVENT:
+  case RecordTypes::TYPED_EVENT:
+    // TODO: Support custom and typed events in the graph processing?
+    break;
   }
 
   return Error::success();
diff --git a/tools/llvm-xray/xray-stacks.cpp b/tools/llvm-xray/xray-stacks.cpp
index 1a6069780a3..059940b7756 100644
--- a/tools/llvm-xray/xray-stacks.cpp
+++ b/tools/llvm-xray/xray-stacks.cpp
@@ -366,6 +366,9 @@ public:
                                     AccountRecordState *state) {
     auto &TS = ThreadStackMap[R.TId];
     switch (R.Type) {
+    case RecordTypes::CUSTOM_EVENT:
+    case RecordTypes::TYPED_EVENT:
+      return AccountRecordStatus::OK;
     case RecordTypes::ENTER:
     case RecordTypes::ENTER_ARG: {
       state->wasLastRecordExit = false;
-- 
GitLab


From d16e7d93840552959d89afecba419a78972c0125 Mon Sep 17 00:00:00 2001
From: Max Kazantsev <max.kazantsev@azul.com>
Date: Tue, 6 Nov 2018 09:07:03 +0000
Subject: [PATCH 1001/1116] [NFC] Turn collectTransitivePredecessors into a
 static function

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346217 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/MustExecute.h | 7 -------
 lib/Analysis/MustExecute.cpp        | 7 +++++--
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/include/llvm/Analysis/MustExecute.h b/include/llvm/Analysis/MustExecute.h
index c4005a9af95..05c28d13988 100644
--- a/include/llvm/Analysis/MustExecute.h
+++ b/include/llvm/Analysis/MustExecute.h
@@ -49,13 +49,6 @@ class LoopSafetyInfo {
   // Used to update funclet bundle operands.
   DenseMap<BasicBlock *, ColorVector> BlockColors;
 
-  /// Collect all blocks from \p CurLoop which lie on all possible paths from
-  /// the header of \p CurLoop (inclusive) to BB (exclusive) into the set
-  /// \p Predecessors. If \p BB is the header, \p Predecessors will be empty.
-  void collectTransitivePredecessors(
-      const Loop *CurLoop, const BasicBlock *BB,
-      SmallPtrSetImpl<const BasicBlock *> &Predecessors) const;
-
 protected:
   /// Computes block colors.
   void computeBlockColors(const Loop *CurLoop);
diff --git a/lib/Analysis/MustExecute.cpp b/lib/Analysis/MustExecute.cpp
index 7507aebb527..23e012626e2 100644
--- a/lib/Analysis/MustExecute.cpp
+++ b/lib/Analysis/MustExecute.cpp
@@ -145,9 +145,12 @@ static bool CanProveNotTakenFirstIteration(const BasicBlock *ExitBlock,
   return SimpleCst->isAllOnesValue();
 }
 
-void LoopSafetyInfo::collectTransitivePredecessors(
+/// Collect all blocks from \p CurLoop which lie on all possible paths from
+/// the header of \p CurLoop (inclusive) to BB (exclusive) into the set
+/// \p Predecessors. If \p BB is the header, \p Predecessors will be empty.
+static void collectTransitivePredecessors(
     const Loop *CurLoop, const BasicBlock *BB,
-    SmallPtrSetImpl<const BasicBlock *> &Predecessors) const {
+    SmallPtrSetImpl<const BasicBlock *> &Predecessors) {
   assert(Predecessors.empty() && "Garbage in predecessors set?");
   assert(CurLoop->contains(BB) && "Should only be called for loop blocks!");
   if (BB == CurLoop->getHeader())
-- 
GitLab


From 5bc1446f3bee779c5a15a0256169bc7623682121 Mon Sep 17 00:00:00 2001
From: Martin Storsjo <martin@martin.st>
Date: Tue, 6 Nov 2018 09:08:20 +0000
Subject: [PATCH 1002/1116] [Support] Fix `warning: unknown pragma ignored` for
 mingw target

Differential Revision: https://reviews.llvm.org/D54133

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346218 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Support/Windows/WindowsSupport.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/lib/Support/Windows/WindowsSupport.h b/lib/Support/Windows/WindowsSupport.h
index 5adfa859c96..e05c3a73f2f 100644
--- a/lib/Support/Windows/WindowsSupport.h
+++ b/lib/Support/Windows/WindowsSupport.h
@@ -92,15 +92,19 @@ inline llvm::VersionTuple GetWindowsOSVersion() {
   OSVERSIONINFOEX info;
   ZeroMemory(&info, sizeof(OSVERSIONINFOEX));
   info.dwOSVersionInfoSize = sizeof(OSVERSIONINFOEX);
+#if defined(_MSC_VER)
 #pragma warning(push)
 #pragma warning(disable : 4996)
+#endif // _MSC_VER
   // Starting with Microsoft SDK for Windows 8.1, this function is deprecated
   // in favor of the new Windows Version Helper APIs.  Since we don't specify a
   // minimum SDK version, it's easier to simply disable the warning rather than
   // try to support both APIs.
   if (GetVersionEx((LPOSVERSIONINFO)&info) == 0)
     return llvm::VersionTuple();
+#if defined(_MSC_VER)
 #pragma warning(pop)
+#endif // _MSC_VER
 
   return llvm::VersionTuple(info.dwMajorVersion, info.dwMinorVersion, 0,
                             info.dwBuildNumber);
-- 
GitLab


From 241a3bcfa826aa4f9086a07e548ca89154b70acf Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 6 Nov 2018 11:28:22 +0000
Subject: [PATCH 1003/1116] [InstCombine] Ensure nested shifts are in range
 (OSS-Fuzz #9880)

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346225 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstCombine/InstCombineShifts.cpp         | 11 ++++++-----
 test/Transforms/InstCombine/apint-shift.ll    | 19 +++++++++++++++++++
 2 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp
index 045ce423ef6..c562d45a9e2 100644
--- a/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -725,9 +725,9 @@ Instruction *InstCombiner::visitLShr(BinaryOperator &I) {
 
     Value *X;
     const APInt *ShOp1;
-    if (match(Op0, m_Shl(m_Value(X), m_APInt(ShOp1)))) {
-      unsigned ShlAmt = ShOp1->getZExtValue();
-      if (ShlAmt < ShAmt) {
+    if (match(Op0, m_Shl(m_Value(X), m_APInt(ShOp1))) && ShOp1->ult(BitWidth)) {
+      if (ShOp1->ult(ShAmt)) {
+        unsigned ShlAmt = ShOp1->getZExtValue();
         Constant *ShiftDiff = ConstantInt::get(Ty, ShAmt - ShlAmt);
         if (cast<BinaryOperator>(Op0)->hasNoUnsignedWrap()) {
           // (X <<nuw C1) >>u C2 --> X >>u (C2 - C1)
@@ -740,7 +740,8 @@ Instruction *InstCombiner::visitLShr(BinaryOperator &I) {
         APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt));
         return BinaryOperator::CreateAnd(NewLShr, ConstantInt::get(Ty, Mask));
       }
-      if (ShlAmt > ShAmt) {
+      if (ShOp1->ugt(ShAmt)) {
+        unsigned ShlAmt = ShOp1->getZExtValue();
         Constant *ShiftDiff = ConstantInt::get(Ty, ShlAmt - ShAmt);
         if (cast<BinaryOperator>(Op0)->hasNoUnsignedWrap()) {
           // (X <<nuw C1) >>u C2 --> X <<nuw (C1 - C2)
@@ -753,7 +754,7 @@ Instruction *InstCombiner::visitLShr(BinaryOperator &I) {
         APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt));
         return BinaryOperator::CreateAnd(NewShl, ConstantInt::get(Ty, Mask));
       }
-      assert(ShlAmt == ShAmt);
+      assert(*ShOp1 == ShAmt);
       // (X << C) >>u C --> X & (-1 >>u C)
       APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt));
       return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, Mask));
diff --git a/test/Transforms/InstCombine/apint-shift.ll b/test/Transforms/InstCombine/apint-shift.ll
index 3266fa6e443..efc088637c4 100644
--- a/test/Transforms/InstCombine/apint-shift.ll
+++ b/test/Transforms/InstCombine/apint-shift.ll
@@ -526,3 +526,22 @@ define i40 @test26(i40 %A) {
   %D = shl i40 %C, 1
   ret i40 %D
 }
+
+; OSS-Fuzz #9880
+; https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=9880
+define i177 @ossfuzz_9880(i177 %X) {
+; CHECK-LABEL: @ossfuzz_9880(
+; CHECK-NEXT:    ret i177 1
+;
+  %A = alloca i177
+  %L1 = load i177, i177* %A
+  %B = or i177 0, -1
+  %B5 = udiv i177 %L1, %B
+  %B4 = add i177 %B5, %B
+  %B2 = add i177 %B, %B4
+  %B6 = mul i177 %B5, %B2
+  %B20 = shl i177 %L1, %B6
+  %B14 = sub i177 %B20, %B5
+  %B1 = udiv i177 %B14, %B6
+  ret i177 %B1
+}
-- 
GitLab


From bfcda1dbe3ada9c60dd46c1fc7a4ae6355f1898f Mon Sep 17 00:00:00 2001
From: Clement Courbet <courbet@google.com>
Date: Tue, 6 Nov 2018 13:48:56 +0000
Subject: [PATCH 1004/1116] [X86][NFC] Fix comment.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346226 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/MCTargetDesc/X86BaseInfo.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index aef0df480bb..c85ce9bbd5a 100644
--- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -266,12 +266,12 @@ namespace X86II {
     RawFrmSrc      = 4,
 
     /// RawFrmDst - This form is for instructions that use the destination index
-    /// register DI/EDI/ESI.
+    /// register DI/EDI/RDI.
     RawFrmDst      = 5,
 
-    /// RawFrmSrc - This form is for instructions that use the source index
-    /// register SI/ESI/ERI with a possible segment override, and also the
-    /// destination index register DI/ESI/RDI.
+    /// RawFrmDstSrc - This form is for instructions that use the source index
+    /// register SI/ESI/RSI with a possible segment override, and also the
+    /// destination index register DI/EDI/RDI.
     RawFrmDstSrc   = 6,
 
     /// RawFrmImm8 - This is used for the ENTER instruction, which has two
-- 
GitLab


From 302c74f88b090be491ce3f53e177f52855c92dc5 Mon Sep 17 00:00:00 2001
From: Clement Courbet <courbet@google.com>
Date: Tue, 6 Nov 2018 14:11:58 +0000
Subject: [PATCH 1005/1116] [llvm-exegesis] Ignore X86 pseudo instructions.

Summary: They do not lower to actual MCInsts and have no scheduling info.

Reviewers: gchatelet

Subscribers: llvm-commits, tschuett

Differential Revision: https://reviews.llvm.org/D54147

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346227 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-exegesis/lib/X86/Target.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/llvm-exegesis/lib/X86/Target.cpp b/tools/llvm-exegesis/lib/X86/Target.cpp
index 6ae228e1124..b74d5dcde9f 100644
--- a/tools/llvm-exegesis/lib/X86/Target.cpp
+++ b/tools/llvm-exegesis/lib/X86/Target.cpp
@@ -75,6 +75,9 @@ getMemoryOperandRanges(llvm::ArrayRef<Operand> Operands) {
 
 static llvm::Error IsInvalidOpcode(const Instruction &Instr) {
   const auto OpcodeName = Instr.Name;
+  if ((Instr.Description->TSFlags & X86II::FormMask) == X86II::Pseudo)
+    return llvm::make_error<BenchmarkFailure>(
+        "unsupported opcode: pseudo instruction");
   if (OpcodeName.startswith("POPF") || OpcodeName.startswith("PUSHF") ||
       OpcodeName.startswith("ADJCALLSTACK"))
     return llvm::make_error<BenchmarkFailure>(
-- 
GitLab


From b9e341e90c167b92136893c9085343aa254ddd37 Mon Sep 17 00:00:00 2001
From: Simon Atanasyan <simon@atanasyan.com>
Date: Tue, 6 Nov 2018 14:37:24 +0000
Subject: [PATCH 1006/1116] [mips] Support sigrie instruction

The `sigrie` instruction signals a Reserved Instruction Exception.
This patch adds support for assembling / disassembling the instruction.

Differential Revision: http://reviews.llvm.org/D53861

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346230 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/Mips/MicroMips32r6InstrInfo.td | 11 +++++++++++
 lib/Target/Mips/MicroMipsInstrFormats.td  | 11 +++++++++++
 lib/Target/Mips/Mips32r6InstrFormats.td   | 10 ++++++++++
 lib/Target/Mips/Mips32r6InstrInfo.td      | 12 ++++++++++++
 lib/Target/Mips/MipsSchedule.td           |  2 ++
 lib/Target/Mips/MipsScheduleGeneric.td    |  2 +-
 test/MC/Mips/micromips32r6/valid.s        |  4 ++++
 test/MC/Mips/mips32r6/valid.s             |  4 ++++
 test/MC/Mips/mips64r6/valid.s             |  4 ++++
 9 files changed, 59 insertions(+), 1 deletion(-)

diff --git a/lib/Target/Mips/MicroMips32r6InstrInfo.td b/lib/Target/Mips/MicroMips32r6InstrInfo.td
index b5896060a71..814918d25e7 100644
--- a/lib/Target/Mips/MicroMips32r6InstrInfo.td
+++ b/lib/Target/Mips/MicroMips32r6InstrInfo.td
@@ -159,6 +159,7 @@ class SYNC_MMR6_ENC : POOL32A_SYNC_FM_MMR6;
 class SYNCI_MMR6_ENC : POOL32I_SYNCI_FM_MMR6, MMR6Arch<"synci">;
 class RDPGPR_MMR6_ENC : POOL32A_RDPGPR_FM_MMR6<0b1110000101>;
 class SDBBP_MMR6_ENC : SDBBP_FM_MM, MMR6Arch<"sdbbp">;
+class SIGRIE_MMR6_ENC : SIGRIE_FM_MM, MMR6Arch<"sigrie">;
 class XOR_MMR6_ENC : ARITH_FM_MMR6<"xor", 0x310>;
 class XORI_MMR6_ENC : ADDI_FM_MMR6<"xori", 0x1c>;
 class ABS_S_MMR6_ENC : POOL32F_ABS_FM_MMR6<"abs.s", 0, 0b0001101>;
@@ -1162,6 +1163,14 @@ class SDBBP_MMR6_DESC : MipsR6Inst {
   InstrItinClass Itinerary = II_SDBBP;
 }
 
+class SIGRIE_MMR6_DESC : MipsR6Inst {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins uimm16:$code_);
+  string AsmString = !strconcat("sigrie", "\t$code_");
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = II_SIGRIE;
+}
+
 class LWM16_MMR6_DESC
     : MicroMipsInst16<(outs reglist16:$rt), (ins mem_mm_4sp:$addr),
                       !strconcat("lwm16", "\t$rt, $addr"), [],
@@ -1427,6 +1436,7 @@ def SYNCI_MMR6 : StdMMR6Rel, SYNCI_MMR6_DESC, SYNCI_MMR6_ENC, ISA_MICROMIPS32R6;
 def RDPGPR_MMR6 : R6MMR6Rel, RDPGPR_MMR6_DESC, RDPGPR_MMR6_ENC,
                   ISA_MICROMIPS32R6;
 def SDBBP_MMR6 : R6MMR6Rel, SDBBP_MMR6_DESC, SDBBP_MMR6_ENC, ISA_MICROMIPS32R6;
+def SIGRIE_MMR6 : R6MMR6Rel, SIGRIE_MMR6_DESC, SIGRIE_MMR6_ENC, ISA_MICROMIPS32R6;
 def XOR_MMR6 : StdMMR6Rel, XOR_MMR6_DESC, XOR_MMR6_ENC, ISA_MICROMIPS32R6;
 def XORI_MMR6 : StdMMR6Rel, XORI_MMR6_DESC, XORI_MMR6_ENC, ISA_MICROMIPS32R6;
 let DecoderMethod = "DecodeMemMMImm16" in {
@@ -1635,6 +1645,7 @@ def B_MMR6_Pseudo : MipsAsmPseudoInst<(outs), (ins brtarget_mm:$offset),
 }
 def : MipsInstAlias<"sync", (SYNC_MMR6 0), 1>, ISA_MICROMIPS32R6;
 def : MipsInstAlias<"sdbbp", (SDBBP_MMR6 0), 1>, ISA_MICROMIPS32R6;
+def : MipsInstAlias<"sigrie", (SIGRIE_MMR6 0), 1>, ISA_MICROMIPS32R6;
 def : MipsInstAlias<"rdhwr $rt, $rs",
                     (RDHWR_MMR6 GPR32Opnd:$rt, HWRegsOpnd:$rs, 0), 1>,
                     ISA_MICROMIPS32R6;
diff --git a/lib/Target/Mips/MicroMipsInstrFormats.td b/lib/Target/Mips/MicroMipsInstrFormats.td
index a9c53e08b81..2a4cc279ef0 100644
--- a/lib/Target/Mips/MicroMipsInstrFormats.td
+++ b/lib/Target/Mips/MicroMipsInstrFormats.td
@@ -933,6 +933,17 @@ class SDBBP_FM_MM : MMArch {
   let Inst{5-0}   = 0x3c;
 }
 
+class SIGRIE_FM_MM : MMArch {
+  bits<16> code_;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x0;
+  let Inst{25-22} = 0x0;
+  let Inst{21-6} = code_;
+  let Inst{5-0} = 0b111111;
+}
+
 class RDHWR_FM_MM : MMArch {
   bits<5> rt;
   bits<5> rd;
diff --git a/lib/Target/Mips/Mips32r6InstrFormats.td b/lib/Target/Mips/Mips32r6InstrFormats.td
index e1d08cad88b..623af570a5e 100644
--- a/lib/Target/Mips/Mips32r6InstrFormats.td
+++ b/lib/Target/Mips/Mips32r6InstrFormats.td
@@ -87,6 +87,7 @@ def OPCODE5_BC1NEZ : OPCODE5<0b01101>;
 def OPCODE5_BC2EQZ : OPCODE5<0b01001>;
 def OPCODE5_BC2NEZ : OPCODE5<0b01101>;
 def OPCODE5_BGEZAL : OPCODE5<0b10001>;
+def OPCODE5_SIGRIE : OPCODE5<0b10111>;
 // The next four constants are unnamed in the spec. These names are taken from
 // the OPGROUP names they are used with.
 def OPCODE5_LDC2   : OPCODE5<0b01110>;
@@ -602,3 +603,12 @@ class SPECIAL3_GINV<bits<2> ginv> : MipsR6Inst {
   let Inst{7-6}   = ginv;
   let Inst{5-0}   = 0b111101;
 }
+
+class SIGRIE_FM : MipsR6Inst {
+  bits<16> code_;
+
+  let Inst{31-26} = OPGROUP_REGIMM.Value;
+  let Inst{25-21} = 0;
+  let Inst{20-16} = OPCODE5_SIGRIE.Value;
+  let Inst{15-0} = code_;
+}
diff --git a/lib/Target/Mips/Mips32r6InstrInfo.td b/lib/Target/Mips/Mips32r6InstrInfo.td
index d86fc3f658a..2bd0cf2d59a 100644
--- a/lib/Target/Mips/Mips32r6InstrInfo.td
+++ b/lib/Target/Mips/Mips32r6InstrInfo.td
@@ -200,6 +200,8 @@ class CRC32CW_ENC : SPECIAL3_2R_SZ_CRC<2,1>;
 class GINVI_ENC : SPECIAL3_GINV<0>;
 class GINVT_ENC : SPECIAL3_GINV<2>;
 
+class SIGRIE_ENC : SIGRIE_FM;
+
 //===----------------------------------------------------------------------===//
 //
 // Instruction Multiclasses
@@ -846,6 +848,14 @@ class GINVI_DESC : GINV_DESC_BASE<"ginvi", GPR32Opnd, II_GINVI> {
 }
 class GINVT_DESC : GINV_DESC_BASE<"ginvt", GPR32Opnd, II_GINVT>;
 
+class SIGRIE_DESC {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins uimm16:$code_);
+  string AsmString = "sigrie\t$code_";
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = II_SIGRIE;
+}
+
 //===----------------------------------------------------------------------===//
 //
 // Instruction Definitions
@@ -961,6 +971,7 @@ let AdditionalPredicates = [NotInMicroMips] in {
   def SEL_S : R6MMR6Rel, SEL_S_ENC, SEL_S_DESC, ISA_MIPS32R6, HARDFLOAT;
   def SDC2_R6 : SDC2_R6_ENC, SDC2_R6_DESC, ISA_MIPS32R6;
   def SWC2_R6 : SWC2_R6_ENC, SWC2_R6_DESC, ISA_MIPS32R6;
+  def SIGRIE : SIGRIE_ENC, SIGRIE_DESC, ISA_MIPS32R6;
 }
 
 let AdditionalPredicates = [NotInMicroMips] in {
@@ -988,6 +999,7 @@ def : MipsInstAlias<"evp", (EVP ZERO), 0>, ISA_MIPS32R6;
 
 let AdditionalPredicates = [NotInMicroMips] in {
 def : MipsInstAlias<"sdbbp", (SDBBP_R6 0)>, ISA_MIPS32R6;
+def : MipsInstAlias<"sigrie", (SIGRIE 0)>, ISA_MIPS32R6;
 def : MipsInstAlias<"jr $rs", (JALR ZERO, GPR32Opnd:$rs), 1>, ISA_MIPS32R6, GPR_32;
 }
 
diff --git a/lib/Target/Mips/MipsSchedule.td b/lib/Target/Mips/MipsSchedule.td
index 64db815a0f4..410fa655a22 100644
--- a/lib/Target/Mips/MipsSchedule.td
+++ b/lib/Target/Mips/MipsSchedule.td
@@ -154,6 +154,7 @@ def II_DERET            : InstrItinClass;
 def II_ERETNC           : InstrItinClass;
 def II_EHB              : InstrItinClass;
 def II_SDBBP            : InstrItinClass;
+def II_SIGRIE           : InstrItinClass;
 def II_SSNOP            : InstrItinClass;
 def II_SYSCALL          : InstrItinClass;
 def II_PAUSE            : InstrItinClass;
@@ -546,6 +547,7 @@ def MipsGenericItineraries : ProcessorItineraries<[ALU, IMULDIV], [], [
   InstrItinData<II_ERETNC          , [InstrStage<1,  [ALU]>]>,
   InstrItinData<II_EHB             , [InstrStage<1,  [ALU]>]>,
   InstrItinData<II_SDBBP           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SIGRIE          , [InstrStage<1,  [ALU]>]>,
   InstrItinData<II_SSNOP           , [InstrStage<1,  [ALU]>]>,
   InstrItinData<II_SYSCALL         , [InstrStage<1,  [ALU]>]>,
   InstrItinData<II_PAUSE           , [InstrStage<1,  [ALU]>]>,
diff --git a/lib/Target/Mips/MipsScheduleGeneric.td b/lib/Target/Mips/MipsScheduleGeneric.td
index 79c55dbb9e0..80ffe7ada7c 100644
--- a/lib/Target/Mips/MipsScheduleGeneric.td
+++ b/lib/Target/Mips/MipsScheduleGeneric.td
@@ -179,7 +179,7 @@ def GenericWriteTrap : SchedWriteRes<[GenericIssueCTISTD]>;
 def : ItinRW<[GenericWriteTrap], [II_BREAK, II_SYSCALL, II_TEQ, II_TEQI,
                                   II_TGE, II_TGEI, II_TGEIU, II_TGEU, II_TNE,
                                   II_TNEI, II_TLT, II_TLTI, II_TLTU, II_TTLTIU,
-                                  II_TRAP, II_SDBBP]>;
+                                  II_TRAP, II_SDBBP, II_SIGRIE]>;
 
 // COP0 Pipeline
 // =============
diff --git a/test/MC/Mips/micromips32r6/valid.s b/test/MC/Mips/micromips32r6/valid.s
index 6b2aec9d5aa..b6af2b951c7 100644
--- a/test/MC/Mips/micromips32r6/valid.s
+++ b/test/MC/Mips/micromips32r6/valid.s
@@ -169,6 +169,10 @@
   rdpgpr $3, $9            # CHECK: $3, $9              # encoding: [0x00,0x69,0xe1,0x7c]
   sdbbp                    # CHECK: sdbbp               # encoding: [0x00,0x00,0xdb,0x7c]
   sdbbp 34                 # CHECK: sdbbp 34            # encoding: [0x00,0x22,0xdb,0x7c]
+  sigrie                   # CHECK: sigrie              # encoding: [0x00,0x00,0x00,0x3f]
+                           # CHECK-NEXT:                # <MCInst #{{[0-9]+}} SIGRIE_MM
+  sigrie    257            # CHECK: sigrie 257          # encoding: [0x00,0x00,0x40,0x7f]
+                           # CHECK-NEXT:                # <MCInst #{{[0-9]+}} SIGRIE_MM
   xor $3, $4, $5           # CHECK: xor $3, $4, $5      # encoding: [0x00,0xa4,0x1b,0x10]
   xori $3, $4, 1234        # CHECK: xori $3, $4, 1234   # encoding: [0x70,0x64,0x04,0xd2]
   sw $5, 4($6)             # CHECK: sw $5, 4($6)        # encoding: [0xf8,0xa6,0x00,0x04]
diff --git a/test/MC/Mips/mips32r6/valid.s b/test/MC/Mips/mips32r6/valid.s
index e60b5fad371..6c023d38573 100644
--- a/test/MC/Mips/mips32r6/valid.s
+++ b/test/MC/Mips/mips32r6/valid.s
@@ -281,6 +281,10 @@ a:
         sdbbp     34             # CHECK: sdbbp 34               # encoding: [0x00,0x00,0x08,0x8e]
                                  # CHECK-NEXT:                   # <MCInst #{{[0-9]+}} SDBBP
                                  # CHECK-NOT:                    # <MCInst #{{[0-9]+}} SDBBP_MM
+        sigrie                   # CHECK: sigrie                 # encoding: [0x04,0x17,0x00,0x00]
+                                 # CHECK-NEXT:                   # <MCInst #{{[0-9]+}} SIGRIE
+        sigrie    257            # CHECK: sigrie 257             # encoding: [0x04,0x17,0x01,0x01]
+                                 # CHECK-NEXT:                   # <MCInst #{{[0-9]+}} SIGRIE
         sync                     # CHECK: sync                   # encoding: [0x00,0x00,0x00,0x0f]
                                  # CHECK-NEXT:                   # <MCInst #{{[0-9]+}} SYNC
         sync    1                # CHECK: sync 1                 # encoding: [0x00,0x00,0x00,0x4f]
diff --git a/test/MC/Mips/mips64r6/valid.s b/test/MC/Mips/mips64r6/valid.s
index c79077ba400..c810a40e252 100644
--- a/test/MC/Mips/mips64r6/valid.s
+++ b/test/MC/Mips/mips64r6/valid.s
@@ -242,6 +242,10 @@ a:
         sdbbp     34             # CHECK: sdbbp 34               # encoding: [0x00,0x00,0x08,0x8e]
                                  # CHECK-NEXT:                   # <MCInst #{{[0-9]+}} SDBBP
                                  # CHECK-NOT:                    # <MCInst #{{[0-9]+}} SDBBP_MM
+        sigrie                   # CHECK: sigrie                 # encoding: [0x04,0x17,0x00,0x00]
+                                 # CHECK-NEXT:                   # <MCInst #{{[0-9]+}} SIGRIE
+        sigrie    257            # CHECK: sigrie 257             # encoding: [0x04,0x17,0x01,0x01]
+                                 # CHECK-NEXT:                   # <MCInst #{{[0-9]+}} SIGRIE
         sdc2    $20,629($s2)     # CHECK: sdc2 $20, 629($18)     # encoding: [0x49,0xf4,0x92,0x75]
         sel.d   $f0,$f1,$f2      # CHECK: sel.d $f0, $f1, $f2 # encoding: [0x46,0x22,0x08,0x10]
         sel.s   $f0,$f1,$f2      # CHECK: sel.s $f0, $f1, $f2 # encoding: [0x46,0x02,0x08,0x10]
-- 
GitLab


From 07c5c7fbf9f5cbff886c655d4fd4dc401017f5d4 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 6 Nov 2018 15:21:44 +0000
Subject: [PATCH 1007/1116] [InstCombine] add tests for FMF propagation
 failure; NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346232 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Transforms/InstCombine/fcmp.ll | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/test/Transforms/InstCombine/fcmp.ll b/test/Transforms/InstCombine/fcmp.ll
index 48c12a300a3..05b8e20d3a4 100644
--- a/test/Transforms/InstCombine/fcmp.ll
+++ b/test/Transforms/InstCombine/fcmp.ll
@@ -78,6 +78,30 @@ define <2 x i1> @fneg_constant_swap_pred_vec_undef(<2 x float> %x) {
   ret <2 x i1> %cmp
 }
 
+; FIXME: The new fcmp should have the same FMF as the original.
+
+define i1 @fneg_fmf(float %x) {
+; CHECK-LABEL: @fneg_fmf(
+; CHECK-NEXT:    [[R:%.*]] = fcmp oeq float [[X:%.*]], -4.200000e+01
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %n = fsub fast float -0.0, %x
+  %r = fcmp fast oeq float %n, 42.0
+  ret i1 %r
+}
+
+; FIXME: The new fcmp should have the same FMF as the original, vector edition.
+
+define <2 x i1> @fcmp_fneg_fmf_vec(<2 x float> %x) {
+; CHECK-LABEL: @fcmp_fneg_fmf_vec(
+; CHECK-NEXT:    [[R:%.*]] = fcmp ule <2 x float> [[X:%.*]], <float -4.200000e+01, float 1.900000e+01>
+; CHECK-NEXT:    ret <2 x i1> [[R]]
+;
+  %n = fsub nsz <2 x float> zeroinitializer, %x
+  %r = fcmp nnan reassoc uge <2 x float> %n, <float 42.0, float -19.0>
+  ret <2 x i1> %r
+}
+
 define i1 @fneg_fneg_swap_pred(float %x, float %y) {
 ; CHECK-LABEL: @fneg_fneg_swap_pred(
 ; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt float [[X:%.*]], [[Y:%.*]]
-- 
GitLab


From bd1a44f2b85c9bfde85bc83302d68f23bc3f4673 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 6 Nov 2018 15:49:45 +0000
Subject: [PATCH 1008/1116] [InstCombine] propagate fast-math-flags when
 folding fcmp+fneg

This is another part of solving PR39475:
https://bugs.llvm.org/show_bug.cgi?id=39475

This might be enough to fix that particular issue, but as noted
with the FIXME, we're still dropping FMF on other folds around here.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346234 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstCombine/InstCombineCompares.cpp       | 27 +++++++++++--------
 test/Transforms/InstCombine/fcmp.ll           |  4 +--
 2 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 059f7523ff9..1946f8903b1 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -5445,14 +5445,6 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
       if (Instruction *NV = foldFCmpIntToFPConst(I, LHSI, RHSC))
         return NV;
       break;
-    case Instruction::FSub: {
-      // fcmp pred (fneg x), C -> fcmp swap(pred) x, -C
-      Value *Op;
-      if (match(LHSI, m_FNeg(m_Value(Op))))
-        return new FCmpInst(I.getSwappedPredicate(), Op,
-                            ConstantExpr::getFNeg(RHSC));
-      break;
-    }
     case Instruction::FDiv:
       if (Instruction *NV = foldFCmpReciprocalAndZero(I, LHSI, RHSC))
         return NV;
@@ -5472,10 +5464,23 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
   }
   }
 
-  // fcmp pred (fneg x), (fneg y) -> fcmp swap(pred) x, y
   Value *X, *Y;
-  if (match(Op0, m_FNeg(m_Value(X))) && match(Op1, m_FNeg(m_Value(Y))))
-    return new FCmpInst(I.getSwappedPredicate(), X, Y);
+  if (match(Op0, m_FNeg(m_Value(X)))) {
+    if (match(Op1, m_FNeg(m_Value(Y)))) {
+      // FIXME: Drops FMF.
+      // fcmp pred (fneg X), (fneg Y) -> fcmp swap(pred) X, Y
+      return new FCmpInst(I.getSwappedPredicate(), X, Y);
+    }
+
+    Constant *C;
+    if (match(Op1, m_Constant(C))) {
+      // fcmp pred (fneg X), C --> fcmp swap(pred) X, -C
+      Constant *NegC = ConstantExpr::getFNeg(C);
+      Instruction *NewFCmp = new FCmpInst(I.getSwappedPredicate(), X, NegC);
+      NewFCmp->copyFastMathFlags(&I);
+      return NewFCmp;
+    }
+  }
 
   // fcmp (fpext x), (fpext y) -> fcmp x, y
   if (FPExtInst *LHSExt = dyn_cast<FPExtInst>(Op0))
diff --git a/test/Transforms/InstCombine/fcmp.ll b/test/Transforms/InstCombine/fcmp.ll
index 05b8e20d3a4..763ce9ec170 100644
--- a/test/Transforms/InstCombine/fcmp.ll
+++ b/test/Transforms/InstCombine/fcmp.ll
@@ -82,7 +82,7 @@ define <2 x i1> @fneg_constant_swap_pred_vec_undef(<2 x float> %x) {
 
 define i1 @fneg_fmf(float %x) {
 ; CHECK-LABEL: @fneg_fmf(
-; CHECK-NEXT:    [[R:%.*]] = fcmp oeq float [[X:%.*]], -4.200000e+01
+; CHECK-NEXT:    [[R:%.*]] = fcmp fast oeq float [[X:%.*]], -4.200000e+01
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %n = fsub fast float -0.0, %x
@@ -94,7 +94,7 @@ define i1 @fneg_fmf(float %x) {
 
 define <2 x i1> @fcmp_fneg_fmf_vec(<2 x float> %x) {
 ; CHECK-LABEL: @fcmp_fneg_fmf_vec(
-; CHECK-NEXT:    [[R:%.*]] = fcmp ule <2 x float> [[X:%.*]], <float -4.200000e+01, float 1.900000e+01>
+; CHECK-NEXT:    [[R:%.*]] = fcmp reassoc nnan ule <2 x float> [[X:%.*]], <float -4.200000e+01, float 1.900000e+01>
 ; CHECK-NEXT:    ret <2 x i1> [[R]]
 ;
   %n = fsub nsz <2 x float> zeroinitializer, %x
-- 
GitLab


From c52594aa970b7e4eb0789b226d85cbf8eabf5d22 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 6 Nov 2018 15:53:58 +0000
Subject: [PATCH 1009/1116] [InstCombine] reduce code; NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346235 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/InstCombine/InstCombineCompares.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 1946f8903b1..975cb83b8f3 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -5284,7 +5284,7 @@ static Instruction *foldFCmpReciprocalAndZero(FCmpInst &I, Instruction *LHSI,
   // Finally emit the new fcmp.
   Value *X = LHSI->getOperand(1);
   FCmpInst *NewFCI = new FCmpInst(Pred, X, RHSC);
-  NewFCI->setFastMathFlags(I.getFastMathFlags());
+  NewFCI->copyFastMathFlags(&I);
   return NewFCI;
 }
 
-- 
GitLab


From 9063923ffc9002e919c009e4dd2b1cf66115afb6 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 6 Nov 2018 15:57:52 +0000
Subject: [PATCH 1010/1116] [InstCombine] adjust tests to show dropping FMF;
 NFC

Also, remove some stale FIXME comments ( rL346234 ).


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346236 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Transforms/InstCombine/fcmp.ll | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/Transforms/InstCombine/fcmp.ll b/test/Transforms/InstCombine/fcmp.ll
index 763ce9ec170..43e16793cb6 100644
--- a/test/Transforms/InstCombine/fcmp.ll
+++ b/test/Transforms/InstCombine/fcmp.ll
@@ -78,7 +78,7 @@ define <2 x i1> @fneg_constant_swap_pred_vec_undef(<2 x float> %x) {
   ret <2 x i1> %cmp
 }
 
-; FIXME: The new fcmp should have the same FMF as the original.
+; The new fcmp should have the same FMF as the original.
 
 define i1 @fneg_fmf(float %x) {
 ; CHECK-LABEL: @fneg_fmf(
@@ -90,7 +90,7 @@ define i1 @fneg_fmf(float %x) {
   ret i1 %r
 }
 
-; FIXME: The new fcmp should have the same FMF as the original, vector edition.
+; The new fcmp should have the same FMF as the original, vector edition.
 
 define <2 x i1> @fcmp_fneg_fmf_vec(<2 x float> %x) {
 ; CHECK-LABEL: @fcmp_fneg_fmf_vec(
@@ -109,7 +109,7 @@ define i1 @fneg_fneg_swap_pred(float %x, float %y) {
 ;
   %neg1 = fsub float -0.0, %x
   %neg2 = fsub float -0.0, %y
-  %cmp = fcmp olt float %neg1, %neg2
+  %cmp = fcmp nnan olt float %neg1, %neg2
   ret i1 %cmp
 }
 
@@ -120,7 +120,7 @@ define <2 x i1> @fneg_fneg_swap_pred_vec(<2 x float> %x, <2 x float> %y) {
 ;
   %neg1 = fsub <2 x float> <float -0.0, float -0.0>, %x
   %neg2 = fsub <2 x float> <float -0.0, float -0.0>, %y
-  %cmp = fcmp olt <2 x float> %neg1, %neg2
+  %cmp = fcmp ninf olt <2 x float> %neg1, %neg2
   ret <2 x i1> %cmp
 }
 
-- 
GitLab


From 458830edd57a4460792139a0ad9d9179c1bd879a Mon Sep 17 00:00:00 2001
From: Elizabeth Andrews <elizabeth.andrews@intel.com>
Date: Tue, 6 Nov 2018 15:57:59 +0000
Subject: [PATCH 1011/1116] [benchmark] Disable exceptions in Microsoft STL

This patch disables exceptions in Microsoft STL when exception
handling is not enabled in Benchmark project. It fixes Windows
builds that were failing due to C4530 warnings thrown by MS STL.

Differential Revision: https://reviews.llvm.org/D52998


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346237 91177308-0d34-0410-b5e6-96231b3b80d8
---
 utils/benchmark/CMakeLists.txt | 1 +
 utils/benchmark/README.LLVM    | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/utils/benchmark/CMakeLists.txt b/utils/benchmark/CMakeLists.txt
index 6522ecf9d6b..686846bf1e0 100644
--- a/utils/benchmark/CMakeLists.txt
+++ b/utils/benchmark/CMakeLists.txt
@@ -99,6 +99,7 @@ if (MSVC)
   if (NOT BENCHMARK_ENABLE_EXCEPTIONS)
     add_cxx_compiler_flag(-EHs-)
     add_cxx_compiler_flag(-EHa-)
+    add_definitions(-D_HAS_EXCEPTIONS=0)
   endif()
   # Link time optimisation
   if (BENCHMARK_ENABLE_LTO)
diff --git a/utils/benchmark/README.LLVM b/utils/benchmark/README.LLVM
index 0121b145ded..5a20ec665ad 100644
--- a/utils/benchmark/README.LLVM
+++ b/utils/benchmark/README.LLVM
@@ -19,3 +19,5 @@ Changes:
   is applied to fix cross compilation with MinGW headers
 * https://github.com/google/benchmark/commit/439d6b1c2a6da5cb6adc4c4dfc555af235722396
   is applied to fix building with MinGW headers for ARM
+* https://github.com/google/benchmark/commit/a9b31c51b1ee7ec7b31438c647123c2cbac5d956
+  is applied to disable exceptions in Microsoft STL when exceptions are disabled
-- 
GitLab


From d174746db8edd440ca8481f3b2e6d2e41678e9ae Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 6 Nov 2018 15:58:57 +0000
Subject: [PATCH 1012/1116] [InstCombine] propagate fast-math-flags when
 folding fcmp+fneg, part 2

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346238 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/InstCombine/InstCombineCompares.cpp | 5 +++--
 test/Transforms/InstCombine/fcmp.ll                | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 975cb83b8f3..c1c904b331a 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -5467,9 +5467,10 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
   Value *X, *Y;
   if (match(Op0, m_FNeg(m_Value(X)))) {
     if (match(Op1, m_FNeg(m_Value(Y)))) {
-      // FIXME: Drops FMF.
       // fcmp pred (fneg X), (fneg Y) -> fcmp swap(pred) X, Y
-      return new FCmpInst(I.getSwappedPredicate(), X, Y);
+      Instruction *NewFCmp = new FCmpInst(I.getSwappedPredicate(), X, Y);
+      NewFCmp->copyFastMathFlags(&I);
+      return NewFCmp;
     }
 
     Constant *C;
diff --git a/test/Transforms/InstCombine/fcmp.ll b/test/Transforms/InstCombine/fcmp.ll
index 43e16793cb6..b392c57d1f3 100644
--- a/test/Transforms/InstCombine/fcmp.ll
+++ b/test/Transforms/InstCombine/fcmp.ll
@@ -104,7 +104,7 @@ define <2 x i1> @fcmp_fneg_fmf_vec(<2 x float> %x) {
 
 define i1 @fneg_fneg_swap_pred(float %x, float %y) {
 ; CHECK-LABEL: @fneg_fneg_swap_pred(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp nnan ogt float [[X:%.*]], [[Y:%.*]]
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %neg1 = fsub float -0.0, %x
@@ -115,7 +115,7 @@ define i1 @fneg_fneg_swap_pred(float %x, float %y) {
 
 define <2 x i1> @fneg_fneg_swap_pred_vec(<2 x float> %x, <2 x float> %y) {
 ; CHECK-LABEL: @fneg_fneg_swap_pred_vec(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt <2 x float> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ninf ogt <2 x float> [[X:%.*]], [[Y:%.*]]
 ; CHECK-NEXT:    ret <2 x i1> [[CMP]]
 ;
   %neg1 = fsub <2 x float> <float -0.0, float -0.0>, %x
-- 
GitLab


From be4139f93422bd6d42d0d497957c5a1f6226883c Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 6 Nov 2018 16:07:39 +0000
Subject: [PATCH 1013/1116] [InstCombine] adjust tests to show dropping FMF;
 NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346239 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Transforms/InstCombine/fcmp.ll | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/Transforms/InstCombine/fcmp.ll b/test/Transforms/InstCombine/fcmp.ll
index b392c57d1f3..d48d75a7fca 100644
--- a/test/Transforms/InstCombine/fcmp.ll
+++ b/test/Transforms/InstCombine/fcmp.ll
@@ -12,7 +12,7 @@ define i1 @test1(float %x, float %y) {
 ;
   %ext1 = fpext float %x to double
   %ext2 = fpext float %y to double
-  %cmp = fcmp ogt double %ext1, %ext2
+  %cmp = fcmp nnan ogt double %ext1, %ext2
   ret i1 %cmp
 }
 
@@ -22,7 +22,7 @@ define i1 @test2(float %a) {
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %ext = fpext float %a to double
-  %cmp = fcmp ogt double %ext, 1.000000e+00
+  %cmp = fcmp ninf ogt double %ext, 1.000000e+00
   ret i1 %cmp
 }
 
-- 
GitLab


From 6052aa37059967c72494528e4bcebd6267ea1037 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 6 Nov 2018 16:23:03 +0000
Subject: [PATCH 1014/1116] [InstCombine] propagate fast-math-flags when
 folding fcmp+fpext

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346240 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/InstCombine/InstCombineCompares.cpp | 12 +++++++-----
 test/Transforms/InstCombine/fcmp.ll                |  2 +-
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index c1c904b331a..2ff10f4fc40 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -5483,11 +5483,13 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
     }
   }
 
-  // fcmp (fpext x), (fpext y) -> fcmp x, y
-  if (FPExtInst *LHSExt = dyn_cast<FPExtInst>(Op0))
-    if (FPExtInst *RHSExt = dyn_cast<FPExtInst>(Op1))
-      if (LHSExt->getSrcTy() == RHSExt->getSrcTy())
-        return new FCmpInst(Pred, LHSExt->getOperand(0), RHSExt->getOperand(0));
+  // fcmp (fpext X), (fpext Y) -> fcmp X, Y
+  if (match(Op0, m_FPExt(m_Value(X))) && match(Op1, m_FPExt(m_Value(Y))) &&
+      X->getType() == Y->getType()) {
+    Instruction *NewFCmp = new FCmpInst(Pred, X, Y);
+    NewFCmp->copyFastMathFlags(&I);
+    return NewFCmp;
+  }
 
   if (I.getType()->isVectorTy())
     if (Instruction *Res = foldVectorCmp(I, Builder))
diff --git a/test/Transforms/InstCombine/fcmp.ll b/test/Transforms/InstCombine/fcmp.ll
index d48d75a7fca..b72adc19a8e 100644
--- a/test/Transforms/InstCombine/fcmp.ll
+++ b/test/Transforms/InstCombine/fcmp.ll
@@ -7,7 +7,7 @@ declare <2 x float> @llvm.fabs.v2f32(<2 x float>)
 
 define i1 @test1(float %x, float %y) {
 ; CHECK-LABEL: @test1(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp nnan ogt float [[X:%.*]], [[Y:%.*]]
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %ext1 = fpext float %x to double
-- 
GitLab


From ac037d267ff3aba98b17ba36c477926c0a7b3158 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 6 Nov 2018 16:37:35 +0000
Subject: [PATCH 1015/1116] [InstCombine] rearrange code for fcmp+fpext; NFCI

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346241 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstCombine/InstCombineCompares.cpp       | 56 +++++++++----------
 1 file changed, 27 insertions(+), 29 deletions(-)

diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 2ff10f4fc40..e8e78d2e652 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -5409,29 +5409,6 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
   Constant *RHSC;
   if (match(Op0, m_Instruction(LHSI)) && match(Op1, m_Constant(RHSC))) {
     switch (LHSI->getOpcode()) {
-    case Instruction::FPExt: {
-      // fcmp (fpext x), C -> fcmp x, (fptrunc C) if fptrunc is lossless
-      FPExtInst *LHSExt = cast<FPExtInst>(LHSI);
-      ConstantFP *RHSF = dyn_cast<ConstantFP>(RHSC);
-      if (!RHSF)
-        break;
-
-      const fltSemantics &FPSem = LHSExt->getSrcTy()->getFltSemantics();
-      bool Lossy;
-      APFloat F = RHSF->getValueAPF();
-      F.convert(FPSem, APFloat::rmNearestTiesToEven, &Lossy);
-
-      // Avoid lossy conversions and denormals.
-      // Zero is a special case that's OK to convert.
-      APFloat Fabs = F;
-      Fabs.clearSign();
-      if (!Lossy &&
-          ((Fabs.compare(APFloat::getSmallestNormalized(FPSem)) !=
-                APFloat::cmpLessThan) || Fabs.isZero()))
-        return new FCmpInst(Pred, LHSExt->getOperand(0),
-                            ConstantFP::get(RHSC->getContext(), F));
-      break;
-    }
     case Instruction::PHI:
       // Only fold fcmp into the PHI if the phi and fcmp are in the same
       // block.  If in the same block, we're encouraging jump threading.  If
@@ -5483,12 +5460,33 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
     }
   }
 
-  // fcmp (fpext X), (fpext Y) -> fcmp X, Y
-  if (match(Op0, m_FPExt(m_Value(X))) && match(Op1, m_FPExt(m_Value(Y))) &&
-      X->getType() == Y->getType()) {
-    Instruction *NewFCmp = new FCmpInst(Pred, X, Y);
-    NewFCmp->copyFastMathFlags(&I);
-    return NewFCmp;
+  if (match(Op0, m_FPExt(m_Value(X)))) {
+    if (match(Op1, m_FPExt(m_Value(Y))) && X->getType() == Y->getType()) {
+      // fcmp (fpext X), (fpext Y) -> fcmp X, Y
+      Instruction *NewFCmp = new FCmpInst(Pred, X, Y);
+      NewFCmp->copyFastMathFlags(&I);
+      return NewFCmp;
+    }
+
+    // TODO: Use m_APFloat to handle vector splats.
+    ConstantFP *C;
+    if (match(Op1, m_ConstantFP(C))) {
+      // fcmp (fpext X), C -> fcmp X, (fptrunc C) if fptrunc is lossless
+      const fltSemantics &FPSem = X->getType()->getFltSemantics();
+      bool Lossy;
+      APFloat F = C->getValueAPF();
+      F.convert(FPSem, APFloat::rmNearestTiesToEven, &Lossy);
+
+      // Avoid lossy conversions and denormals.
+      // Zero is a special case that's OK to convert.
+      APFloat Fabs = F;
+      Fabs.clearSign();
+      if (!Lossy &&
+          ((Fabs.compare(APFloat::getSmallestNormalized(FPSem)) !=
+            APFloat::cmpLessThan) || Fabs.isZero()))
+        // TODO: Propagate FMF.
+        return new FCmpInst(Pred, X, ConstantFP::get(C->getContext(), F));
+    }
   }
 
   if (I.getType()->isVectorTy())
-- 
GitLab


From 031587fccfe2a1f2afd06bde0820229d46d465a8 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 6 Nov 2018 16:45:27 +0000
Subject: [PATCH 1016/1116] [InstCombine] propagate fast-math-flags when
 folding fcmp+fpext, part 2

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346242 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/InstCombine/InstCombineCompares.cpp | 9 ++++++---
 test/Transforms/InstCombine/fcmp.ll                | 2 +-
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index e8e78d2e652..62493decb4a 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -5483,9 +5483,12 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
       Fabs.clearSign();
       if (!Lossy &&
           ((Fabs.compare(APFloat::getSmallestNormalized(FPSem)) !=
-            APFloat::cmpLessThan) || Fabs.isZero()))
-        // TODO: Propagate FMF.
-        return new FCmpInst(Pred, X, ConstantFP::get(C->getContext(), F));
+            APFloat::cmpLessThan) || Fabs.isZero())) {
+        Instruction *NewFCmp =
+            new FCmpInst(Pred, X, ConstantFP::get(C->getContext(), F));
+        NewFCmp->copyFastMathFlags(&I);
+        return NewFCmp;
+      }
     }
   }
 
diff --git a/test/Transforms/InstCombine/fcmp.ll b/test/Transforms/InstCombine/fcmp.ll
index b72adc19a8e..8ba4dd25157 100644
--- a/test/Transforms/InstCombine/fcmp.ll
+++ b/test/Transforms/InstCombine/fcmp.ll
@@ -18,7 +18,7 @@ define i1 @test1(float %x, float %y) {
 
 define i1 @test2(float %a) {
 ; CHECK-LABEL: @test2(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt float [[A:%.*]], 1.000000e+00
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ninf ogt float [[A:%.*]], 1.000000e+00
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %ext = fpext float %a to double
-- 
GitLab


From f49d01a74b4cef93c3bfa33cd988dd8feb7317aa Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 6 Nov 2018 17:06:58 +0000
Subject: [PATCH 1017/1116] [InstCombine] add vector test for fcmp+fpext; NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346243 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Transforms/InstCombine/fcmp.ll | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/test/Transforms/InstCombine/fcmp.ll b/test/Transforms/InstCombine/fcmp.ll
index 8ba4dd25157..a24449a905c 100644
--- a/test/Transforms/InstCombine/fcmp.ll
+++ b/test/Transforms/InstCombine/fcmp.ll
@@ -5,8 +5,8 @@ declare half @llvm.fabs.f16(half)
 declare double @llvm.fabs.f64(double)
 declare <2 x float> @llvm.fabs.v2f32(<2 x float>)
 
-define i1 @test1(float %x, float %y) {
-; CHECK-LABEL: @test1(
+define i1 @fpext_fpext(float %x, float %y) {
+; CHECK-LABEL: @fpext_fpext(
 ; CHECK-NEXT:    [[CMP:%.*]] = fcmp nnan ogt float [[X:%.*]], [[Y:%.*]]
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
@@ -16,8 +16,8 @@ define i1 @test1(float %x, float %y) {
   ret i1 %cmp
 }
 
-define i1 @test2(float %a) {
-; CHECK-LABEL: @test2(
+define i1 @fpext_constant(float %a) {
+; CHECK-LABEL: @fpext_constant(
 ; CHECK-NEXT:    [[CMP:%.*]] = fcmp ninf ogt float [[A:%.*]], 1.000000e+00
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
@@ -26,8 +26,19 @@ define i1 @test2(float %a) {
   ret i1 %cmp
 }
 
-define i1 @test3(float %a) {
-; CHECK-LABEL: @test3(
+define <2 x i1> @fpext_constant_vec_splat(<2 x half> %a) {
+; CHECK-LABEL: @fpext_constant_vec_splat(
+; CHECK-NEXT:    [[EXT:%.*]] = fpext <2 x half> [[A:%.*]] to <2 x double>
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp nnan ole <2 x double> [[EXT]], <double 4.200000e+01, double 4.200000e+01>
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
+;
+  %ext = fpext <2 x half> %a to <2 x double>
+  %cmp = fcmp nnan ole <2 x double> %ext, <double 42.0, double 42.0>
+  ret <2 x i1> %cmp
+}
+
+define i1 @fpext_constant_lossy(float %a) {
+; CHECK-LABEL: @fpext_constant_lossy(
 ; CHECK-NEXT:    [[EXT:%.*]] = fpext float [[A:%.*]] to double
 ; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt double [[EXT]], 0x3FF0000000000001
 ; CHECK-NEXT:    ret i1 [[CMP]]
@@ -37,8 +48,8 @@ define i1 @test3(float %a) {
   ret i1 %cmp
 }
 
-define i1 @test4(float %a) {
-; CHECK-LABEL: @test4(
+define i1 @fpext_constant_denorm(float %a) {
+; CHECK-LABEL: @fpext_constant_denorm(
 ; CHECK-NEXT:    [[EXT:%.*]] = fpext float [[A:%.*]] to double
 ; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt double [[EXT]], 0x36A0000000000000
 ; CHECK-NEXT:    ret i1 [[CMP]]
-- 
GitLab


From 09d77c0de9e2f7fcebe292a669faa5d8911dc4d6 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 6 Nov 2018 17:20:20 +0000
Subject: [PATCH 1018/1116] [InstCombine] allow vector types for fcmp+fpext
 fold

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346245 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstCombine/InstCombineCompares.cpp        | 18 +++++++++---------
 test/Transforms/InstCombine/fcmp.ll            |  3 +--
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 62493decb4a..2381e26a1d8 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -5468,24 +5468,24 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
       return NewFCmp;
     }
 
-    // TODO: Use m_APFloat to handle vector splats.
-    ConstantFP *C;
-    if (match(Op1, m_ConstantFP(C))) {
+    const APFloat *C;
+    if (match(Op1, m_APFloat(C))) {
       // fcmp (fpext X), C -> fcmp X, (fptrunc C) if fptrunc is lossless
-      const fltSemantics &FPSem = X->getType()->getFltSemantics();
+      const fltSemantics &FPSem =
+          X->getType()->getScalarType()->getFltSemantics();
       bool Lossy;
-      APFloat F = C->getValueAPF();
-      F.convert(FPSem, APFloat::rmNearestTiesToEven, &Lossy);
+      APFloat TruncC = *C;
+      TruncC.convert(FPSem, APFloat::rmNearestTiesToEven, &Lossy);
 
       // Avoid lossy conversions and denormals.
       // Zero is a special case that's OK to convert.
-      APFloat Fabs = F;
+      APFloat Fabs = TruncC;
       Fabs.clearSign();
       if (!Lossy &&
           ((Fabs.compare(APFloat::getSmallestNormalized(FPSem)) !=
             APFloat::cmpLessThan) || Fabs.isZero())) {
-        Instruction *NewFCmp =
-            new FCmpInst(Pred, X, ConstantFP::get(C->getContext(), F));
+        Constant *NewC = ConstantFP::get(X->getType(), TruncC);
+        Instruction *NewFCmp = new FCmpInst(Pred, X, NewC);
         NewFCmp->copyFastMathFlags(&I);
         return NewFCmp;
       }
diff --git a/test/Transforms/InstCombine/fcmp.ll b/test/Transforms/InstCombine/fcmp.ll
index a24449a905c..919659e45f4 100644
--- a/test/Transforms/InstCombine/fcmp.ll
+++ b/test/Transforms/InstCombine/fcmp.ll
@@ -28,8 +28,7 @@ define i1 @fpext_constant(float %a) {
 
 define <2 x i1> @fpext_constant_vec_splat(<2 x half> %a) {
 ; CHECK-LABEL: @fpext_constant_vec_splat(
-; CHECK-NEXT:    [[EXT:%.*]] = fpext <2 x half> [[A:%.*]] to <2 x double>
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp nnan ole <2 x double> [[EXT]], <double 4.200000e+01, double 4.200000e+01>
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp nnan ole <2 x half> [[A:%.*]], <half 0xH5140, half 0xH5140>
 ; CHECK-NEXT:    ret <2 x i1> [[CMP]]
 ;
   %ext = fpext <2 x half> %a to <2 x double>
-- 
GitLab


From 992cf17563f09a41222f28b64ddf554f5c81ff3c Mon Sep 17 00:00:00 2001
From: Derek Schuff <dschuff@google.com>
Date: Tue, 6 Nov 2018 17:27:25 +0000
Subject: [PATCH 1019/1116] [WebAssembly] Add shared memory support to limits
 field

Support the IS_SHARED bit in the memory limits flag word.
The compiler does not create object files with memory definitions,
but the field is used by the linker.

Differential Revision: https://reviews.llvm.org/D54131

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346246 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/BinaryFormat/Wasm.h              |  1 +
 lib/Object/WasmObjectFile.cpp                 |  2 +-
 lib/ObjectYAML/WasmYAML.cpp                   |  1 +
 .../ObjectYAML/wasm/import_memory_shared.yaml | 36 +++++++++++++++++++
 4 files changed, 39 insertions(+), 1 deletion(-)
 create mode 100644 test/ObjectYAML/wasm/import_memory_shared.yaml

diff --git a/include/llvm/BinaryFormat/Wasm.h b/include/llvm/BinaryFormat/Wasm.h
index 44dd92ea901..3d25c9d15e4 100644
--- a/include/llvm/BinaryFormat/Wasm.h
+++ b/include/llvm/BinaryFormat/Wasm.h
@@ -214,6 +214,7 @@ enum : unsigned {
 
 enum : unsigned {
   WASM_LIMITS_FLAG_HAS_MAX = 0x1,
+  WASM_LIMITS_FLAG_IS_SHARED = 0x2,
 };
 
 // Kind codes used in the custom "name" section
diff --git a/lib/Object/WasmObjectFile.cpp b/lib/Object/WasmObjectFile.cpp
index 75925a5ea10..3bd66f9375f 100644
--- a/lib/Object/WasmObjectFile.cpp
+++ b/lib/Object/WasmObjectFile.cpp
@@ -193,7 +193,7 @@ static Error readInitExpr(wasm::WasmInitExpr &Expr,
 
 static wasm::WasmLimits readLimits(WasmObjectFile::ReadContext &Ctx) {
   wasm::WasmLimits Result;
-  Result.Flags = readVaruint1(Ctx);
+  Result.Flags = readVaruint32(Ctx);
   Result.Initial = readVaruint32(Ctx);
   if (Result.Flags & wasm::WASM_LIMITS_FLAG_HAS_MAX)
     Result.Maximum = readVaruint32(Ctx);
diff --git a/lib/ObjectYAML/WasmYAML.cpp b/lib/ObjectYAML/WasmYAML.cpp
index 2e7a1d6f653..dba950af589 100644
--- a/lib/ObjectYAML/WasmYAML.cpp
+++ b/lib/ObjectYAML/WasmYAML.cpp
@@ -416,6 +416,7 @@ void ScalarBitSetTraits<WasmYAML::LimitFlags>::bitset(
     IO &IO, WasmYAML::LimitFlags &Value) {
 #define BCase(X) IO.bitSetCase(Value, #X, wasm::WASM_LIMITS_FLAG_##X)
   BCase(HAS_MAX);
+  BCase(IS_SHARED);
 #undef BCase
 }
 
diff --git a/test/ObjectYAML/wasm/import_memory_shared.yaml b/test/ObjectYAML/wasm/import_memory_shared.yaml
new file mode 100644
index 00000000000..849bdc5314d
--- /dev/null
+++ b/test/ObjectYAML/wasm/import_memory_shared.yaml
@@ -0,0 +1,36 @@
+# RUN: yaml2obj %s | obj2yaml | FileCheck %s
+--- !WASM
+FileHeader:
+  Version:         0x00000001
+Sections:
+  - Type:            TYPE
+    Signatures:
+      - Index:           0
+        ReturnType:      I32
+        ParamTypes:
+          - I32
+  - Type:            IMPORT
+    Imports:
+      - Module:          foo
+        Field:           imported_memory
+        Kind:            MEMORY
+        Memory:
+          Flags:           [ HAS_MAX, IS_SHARED ]
+          Initial:         0x00000010
+          Maximum:         0x00000011
+
+...
+# CHECK: --- !WASM
+# CHECK: FileHeader:
+# CHECK:   Version:           0x00000001
+# CHECK: Sections:
+# CHECK:   - Type:            IMPORT
+# CHECK:     Imports:         
+# CHECK:       - Module:          foo
+# CHECK:         Field:           imported_memory
+# CHECK:         Kind:            MEMORY
+# CHECK:         Memory:
+# CHECK:           Flags:           [ HAS_MAX, IS_SHARED ]
+# CHECK:           Initial:         0x00000010
+# CHECK:           Maximum:         0x00000011
+# CHECK: ...
-- 
GitLab


From d9f217dc39749ca94b33cfe721c823a00945b6f7 Mon Sep 17 00:00:00 2001
From: Eli Friedman <efriedma@codeaurora.org>
Date: Tue, 6 Nov 2018 18:23:32 +0000
Subject: [PATCH 1020/1116] Disable calls to *_finite and other glibc-only
 functions on Musl.

Non-GNU environments don't have __finite_*, so treat them as
unavailable.

Differential Revision: https://reviews.llvm.org/D51282


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346250 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Analysis/TargetLibraryInfo.cpp            | 10 ++---
 .../Transforms/ConstProp/calls-math-finite.ll | 43 +++++++++++++++++++
 .../Transforms/InferFunctionAttrs/annotate.ll |  2 +-
 3 files changed, 49 insertions(+), 6 deletions(-)

diff --git a/lib/Analysis/TargetLibraryInfo.cpp b/lib/Analysis/TargetLibraryInfo.cpp
index fb678febe23..b3cd40e098e 100644
--- a/lib/Analysis/TargetLibraryInfo.cpp
+++ b/lib/Analysis/TargetLibraryInfo.cpp
@@ -413,17 +413,17 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
     TLI.setUnavailable(LibFunc_flsll);
   }
 
-  // The following functions are available on Linux,
-  // but Android uses bionic instead of glibc.
-  if (!T.isOSLinux() || T.isAndroid()) {
+  // The following functions are only available on GNU/Linux (using glibc).
+  // Linux variants without glibc (eg: bionic, musl) may have some subset.
+  if (!T.isOSLinux() || !T.isGNUEnvironment()) {
     TLI.setUnavailable(LibFunc_dunder_strdup);
     TLI.setUnavailable(LibFunc_dunder_strtok_r);
     TLI.setUnavailable(LibFunc_dunder_isoc99_scanf);
     TLI.setUnavailable(LibFunc_dunder_isoc99_sscanf);
     TLI.setUnavailable(LibFunc_under_IO_getc);
     TLI.setUnavailable(LibFunc_under_IO_putc);
-    // But, Android has memalign.
-    if (!T.isAndroid())
+    // But, Android and musl have memalign.
+    if (!T.isAndroid() && !T.isMusl())
       TLI.setUnavailable(LibFunc_memalign);
     TLI.setUnavailable(LibFunc_fopen64);
     TLI.setUnavailable(LibFunc_fseeko64);
diff --git a/test/Transforms/ConstProp/calls-math-finite.ll b/test/Transforms/ConstProp/calls-math-finite.ll
index 93741612fc5..d13b798bde2 100644
--- a/test/Transforms/ConstProp/calls-math-finite.ll
+++ b/test/Transforms/ConstProp/calls-math-finite.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -constprop -S | FileCheck %s
+; RUN: opt < %s -constprop -S -mtriple=unknown-unknown-linux-musl | FileCheck -check-prefix=MUSL %s
 
 ; Test to verify constant folding can occur when math routines are mapped
 ; to the __<func>_finite versions of functions due to __FINITE_MATH_ONLY__
@@ -57,6 +58,48 @@ define void @T() {
 ; CHECK-NEXT:    store float 0x40240926E0000000, float* [[SLOTF]]
 ; CHECK-NEXT:    ret void
 ;
+; MUSL-LABEL: @T(
+; MUSL-NEXT:    [[SLOT:%.*]] = alloca double
+; MUSL-NEXT:    [[SLOTF:%.*]] = alloca float
+; MUSL-NEXT:    call
+; MUSL-NEXT:    store
+; MUSL-NEXT:    call
+; MUSL-NEXT:    store
+; MUSL-NEXT:    call
+; MUSL-NEXT:    store
+; MUSL-NEXT:    call
+; MUSL-NEXT:    store
+; MUSL-NEXT:    call
+; MUSL-NEXT:    store
+; MUSL-NEXT:    call
+; MUSL-NEXT:    store
+; MUSL-NEXT:    call
+; MUSL-NEXT:    store
+; MUSL-NEXT:    call
+; MUSL-NEXT:    store
+; MUSL-NEXT:    call
+; MUSL-NEXT:    store
+; MUSL-NEXT:    call
+; MUSL-NEXT:    store
+; MUSL-NEXT:    call
+; MUSL-NEXT:    store
+; MUSL-NEXT:    call
+; MUSL-NEXT:    store
+; MUSL-NEXT:    call
+; MUSL-NEXT:    store
+; MUSL-NEXT:    call
+; MUSL-NEXT:    store
+; MUSL-NEXT:    call
+; MUSL-NEXT:    store
+; MUSL-NEXT:    call
+; MUSL-NEXT:    store
+; MUSL-NEXT:    call
+; MUSL-NEXT:    store
+; MUSL-NEXT:    call
+; MUSL-NEXT:    store
+; MUSL-NEXT:    call
+; MUSL-NEXT:    store
+
   %slot = alloca double
   %slotf = alloca float
 
diff --git a/test/Transforms/InferFunctionAttrs/annotate.ll b/test/Transforms/InferFunctionAttrs/annotate.ll
index 37dfe41cfcb..161873be56e 100644
--- a/test/Transforms/InferFunctionAttrs/annotate.ll
+++ b/test/Transforms/InferFunctionAttrs/annotate.ll
@@ -1,7 +1,7 @@
 ; RUN: opt < %s -mtriple=x86_64-- -inferattrs -S | FileCheck %s
 ; RUN: opt < %s -mtriple=x86_64-- -passes=inferattrs -S | FileCheck %s
 ; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -inferattrs -S | FileCheck -check-prefix=CHECK -check-prefix=CHECK-DARWIN %s
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -inferattrs -S | FileCheck -check-prefix=CHECK -check-prefix=CHECK-LINUX %s
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -inferattrs -S | FileCheck -check-prefix=CHECK -check-prefix=CHECK-LINUX %s
 ; RUN: opt < %s -mtriple=nvptx -inferattrs -S | FileCheck -check-prefix=CHECK-NVPTX %s
 
 ; operator new routines
-- 
GitLab


From 0706b913a32e7ebf46a6860193511dd03f719fdd Mon Sep 17 00:00:00 2001
From: Volkan Keles <vkeles@apple.com>
Date: Tue, 6 Nov 2018 18:31:25 +0000
Subject: [PATCH 1021/1116] Reland r346166: [GlobalISel] Refactor the artifact
 combiner a bit by using MIPatternMatch

It was causing a crash because we were trying to get the definition
of a target register. Fixed the issue by adding a check and added
a test case for that.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346251 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../GlobalISel/LegalizationArtifactCombiner.h | 91 +++++++++++--------
 .../AArch64/GlobalISel/legalize-sext-copy.mir | 21 +++++
 2 files changed, 76 insertions(+), 36 deletions(-)
 create mode 100644 test/CodeGen/AArch64/GlobalISel/legalize-sext-copy.mir

diff --git a/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h b/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
index 256f1ccbee7..e1132ac59c8 100644
--- a/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
+++ b/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
@@ -14,12 +14,14 @@
 
 #include "llvm/CodeGen/GlobalISel/Legalizer.h"
 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
 #include "llvm/CodeGen/GlobalISel/Utils.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/Support/Debug.h"
 
 #define DEBUG_TYPE "legalizer"
+using namespace llvm::MIPatternMatch;
 
 namespace llvm {
 class LegalizationArtifactCombiner {
@@ -36,15 +38,17 @@ public:
                         SmallVectorImpl<MachineInstr *> &DeadInsts) {
     if (MI.getOpcode() != TargetOpcode::G_ANYEXT)
       return false;
-    if (MachineInstr *DefMI = getOpcodeDef(TargetOpcode::G_TRUNC,
-                                           MI.getOperand(1).getReg(), MRI)) {
+
+    Builder.setInstr(MI);
+    unsigned DstReg = MI.getOperand(0).getReg();
+    unsigned SrcReg = lookThroughCopyInstrs(MI.getOperand(1).getReg());
+
+    // aext(trunc x) - > aext/copy/trunc x
+    unsigned TruncSrc;
+    if (mi_match(SrcReg, MRI, m_GTrunc(m_Reg(TruncSrc)))) {
       LLVM_DEBUG(dbgs() << ".. Combine MI: " << MI;);
-      unsigned DstReg = MI.getOperand(0).getReg();
-      unsigned SrcReg = DefMI->getOperand(1).getReg();
-      Builder.setInstr(MI);
-      // We get a copy/trunc/extend depending on the sizes
-      Builder.buildAnyExtOrTrunc(DstReg, SrcReg);
-      markInstAndDefDead(MI, *DefMI, DeadInsts);
+      Builder.buildAnyExtOrTrunc(DstReg, TruncSrc);
+      markInstAndDefDead(MI, *MRI.getVRegDef(SrcReg), DeadInsts);
       return true;
     }
     return tryFoldImplicitDef(MI, DeadInsts);
@@ -55,24 +59,25 @@ public:
 
     if (MI.getOpcode() != TargetOpcode::G_ZEXT)
       return false;
-    if (MachineInstr *DefMI = getOpcodeDef(TargetOpcode::G_TRUNC,
-                                           MI.getOperand(1).getReg(), MRI)) {
-      unsigned DstReg = MI.getOperand(0).getReg();
+
+    Builder.setInstr(MI);
+    unsigned DstReg = MI.getOperand(0).getReg();
+    unsigned SrcReg = lookThroughCopyInstrs(MI.getOperand(1).getReg());
+
+    // zext(trunc x) - > and (aext/copy/trunc x), mask
+    unsigned TruncSrc;
+    if (mi_match(SrcReg, MRI, m_GTrunc(m_Reg(TruncSrc)))) {
       LLT DstTy = MRI.getType(DstReg);
       if (isInstUnsupported({TargetOpcode::G_AND, {DstTy}}) ||
           isInstUnsupported({TargetOpcode::G_CONSTANT, {DstTy}}))
         return false;
       LLVM_DEBUG(dbgs() << ".. Combine MI: " << MI;);
-      Builder.setInstr(MI);
-      unsigned ZExtSrc = MI.getOperand(1).getReg();
-      LLT ZExtSrcTy = MRI.getType(ZExtSrc);
-      APInt Mask = APInt::getAllOnesValue(ZExtSrcTy.getSizeInBits());
-      auto MaskCstMIB = Builder.buildConstant(DstTy, Mask.getZExtValue());
-      unsigned TruncSrc = DefMI->getOperand(1).getReg();
-      // We get a copy/trunc/extend depending on the sizes
-      auto SrcCopyOrTrunc = Builder.buildAnyExtOrTrunc(DstTy, TruncSrc);
-      Builder.buildAnd(DstReg, SrcCopyOrTrunc, MaskCstMIB);
-      markInstAndDefDead(MI, *DefMI, DeadInsts);
+      LLT SrcTy = MRI.getType(SrcReg);
+      APInt Mask = APInt::getAllOnesValue(SrcTy.getSizeInBits());
+      auto MIBMask = Builder.buildConstant(DstTy, Mask.getZExtValue());
+      Builder.buildAnd(DstReg, Builder.buildAnyExtOrTrunc(DstTy, TruncSrc),
+                       MIBMask);
+      markInstAndDefDead(MI, *MRI.getVRegDef(SrcReg), DeadInsts);
       return true;
     }
     return tryFoldImplicitDef(MI, DeadInsts);
@@ -83,27 +88,28 @@ public:
 
     if (MI.getOpcode() != TargetOpcode::G_SEXT)
       return false;
-    if (MachineInstr *DefMI = getOpcodeDef(TargetOpcode::G_TRUNC,
-                                           MI.getOperand(1).getReg(), MRI)) {
-      unsigned DstReg = MI.getOperand(0).getReg();
+
+    Builder.setInstr(MI);
+    unsigned DstReg = MI.getOperand(0).getReg();
+    unsigned SrcReg = lookThroughCopyInstrs(MI.getOperand(1).getReg());
+
+    // sext(trunc x) - > ashr (shl (aext/copy/trunc x), c), c
+    unsigned TruncSrc;
+    if (mi_match(SrcReg, MRI, m_GTrunc(m_Reg(TruncSrc)))) {
       LLT DstTy = MRI.getType(DstReg);
       if (isInstUnsupported({TargetOpcode::G_SHL, {DstTy}}) ||
           isInstUnsupported({TargetOpcode::G_ASHR, {DstTy}}) ||
           isInstUnsupported({TargetOpcode::G_CONSTANT, {DstTy}}))
         return false;
       LLVM_DEBUG(dbgs() << ".. Combine MI: " << MI;);
-      Builder.setInstr(MI);
-      unsigned SExtSrc = MI.getOperand(1).getReg();
-      LLT SExtSrcTy = MRI.getType(SExtSrc);
-      unsigned SizeDiff = DstTy.getSizeInBits() - SExtSrcTy.getSizeInBits();
-      auto SizeDiffMIB = Builder.buildConstant(DstTy, SizeDiff);
-      unsigned TruncSrcReg = DefMI->getOperand(1).getReg();
-      // We get a copy/trunc/extend depending on the sizes
-      auto SrcCopyExtOrTrunc = Builder.buildAnyExtOrTrunc(DstTy, TruncSrcReg);
-      auto ShlMIB = Builder.buildInstr(TargetOpcode::G_SHL, DstTy,
-                                       SrcCopyExtOrTrunc, SizeDiffMIB);
-      Builder.buildInstr(TargetOpcode::G_ASHR, DstReg, ShlMIB, SizeDiffMIB);
-      markInstAndDefDead(MI, *DefMI, DeadInsts);
+      LLT SrcTy = MRI.getType(SrcReg);
+      unsigned ShAmt = DstTy.getSizeInBits() - SrcTy.getSizeInBits();
+      auto MIBShAmt = Builder.buildConstant(DstTy, ShAmt);
+      auto MIBShl = Builder.buildInstr(
+          TargetOpcode::G_SHL, DstTy,
+          Builder.buildAnyExtOrTrunc(DstTy, TruncSrc), MIBShAmt);
+      Builder.buildInstr(TargetOpcode::G_ASHR, DstReg, MIBShl, MIBShAmt);
+      markInstAndDefDead(MI, *MRI.getVRegDef(SrcReg), DeadInsts);
       return true;
     }
     return tryFoldImplicitDef(MI, DeadInsts);
@@ -289,6 +295,19 @@ private:
     auto Step = LI.getAction(Query);
     return Step.Action == Unsupported || Step.Action == NotFound;
   }
+
+  /// Looks through copy instructions and returns the actual
+  /// source register.
+  unsigned lookThroughCopyInstrs(unsigned Reg) {
+    unsigned TmpReg;
+    while (mi_match(Reg, MRI, m_Copy(m_Reg(TmpReg)))) {
+      if (MRI.getType(TmpReg).isValid())
+        Reg = TmpReg;
+      else
+        break;
+    }
+    return Reg;
+  }
 };
 
 } // namespace llvm
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-sext-copy.mir b/test/CodeGen/AArch64/GlobalISel/legalize-sext-copy.mir
new file mode 100644
index 00000000000..e84dae37a5e
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-sext-copy.mir
@@ -0,0 +1,21 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=aarch64 -run-pass=legalizer %s -o - | FileCheck %s
+---
+name:            test_sext_copy
+body: |
+  bb.0.entry:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: test_sext_copy
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w1
+    ; CHECK: $w0 = COPY [[COPY]](s32)
+    ; CHECK: $w0 = COPY [[COPY]](s32)
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[COPY1]](s32)
+    ; CHECK: $x0 = COPY [[SEXT]](s64)
+    %0:_(s32) = COPY $w1
+    $w0 = COPY %0(s32)
+    $w0 = COPY %0(s32)
+    %1:_(s32) = COPY $w0
+    %2:_(s64) = G_SEXT %1(s32)
+    $x0 = COPY %2(s64)
+...
-- 
GitLab


From 7bad2acd44e919becf42c3129cef7c38f5715f7e Mon Sep 17 00:00:00 2001
From: Justin Bogner <mail@justinbogner.com>
Date: Tue, 6 Nov 2018 18:52:30 +0000
Subject: [PATCH 1022/1116] [utils] Update SmallVector lldb formatter for
 r337514

SmallVector was changed to store a begin and a size rather than a
begin and an end a while back. Update the formatter to look at the
correct members.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346252 91177308-0d34-0410-b5e6-96231b3b80d8
---
 utils/lldbDataFormatters.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/utils/lldbDataFormatters.py b/utils/lldbDataFormatters.py
index db1e22af792..fcb381cc54d 100644
--- a/utils/lldbDataFormatters.py
+++ b/utils/lldbDataFormatters.py
@@ -26,9 +26,7 @@ class SmallVectorSynthProvider:
         self.update() # initialize this provider
 
     def num_children(self):
-        begin = self.begin.GetValueAsUnsigned(0)
-        end = self.end.GetValueAsUnsigned(0)
-        return (end - begin)/self.type_size
+        return self.size.GetValueAsUnsigned(0)
 
     def get_child_index(self, name):
         try:
@@ -49,7 +47,7 @@ class SmallVectorSynthProvider:
 
     def update(self):
         self.begin = self.valobj.GetChildMemberWithName('BeginX')
-        self.end = self.valobj.GetChildMemberWithName('EndX')
+        self.size = self.valobj.GetChildMemberWithName('Size')
         the_type = self.valobj.GetType()
         # If this is a reference type we have to dereference it to get to the
         # template parameter.
-- 
GitLab


From 68241312ac2ab0648a881fa7d5e90e75f8725e56 Mon Sep 17 00:00:00 2001
From: Volkan Keles <vkeles@apple.com>
Date: Tue, 6 Nov 2018 18:59:18 +0000
Subject: [PATCH 1023/1116] [AArch64][GlobalISel] Simplify and autogenerate the
 legalizer tests

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346253 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../AArch64/GlobalISel/legalize-add.mir       | 192 +++++++-----------
 .../AArch64/GlobalISel/legalize-and.mir       |  43 ++--
 .../AArch64/GlobalISel/legalize-cmp.mir       |  62 ++----
 .../AArch64/GlobalISel/legalize-div.mir       |  46 ++---
 .../AArch64/GlobalISel/legalize-ext.mir       | 127 ++++--------
 .../AArch64/GlobalISel/legalize-extload.mir   |  21 +-
 .../AArch64/GlobalISel/legalize-fcmp.mir      |  42 +---
 .../AArch64/GlobalISel/legalize-gep.mir       |  32 +--
 .../AArch64/GlobalISel/legalize-mul.mir       |  68 ++-----
 .../AArch64/GlobalISel/legalize-pow.mir       |  51 +++--
 .../AArch64/GlobalISel/legalize-rem.mir       | 117 +++--------
 .../AArch64/GlobalISel/legalize-sextload.mir  |  25 +--
 .../AArch64/GlobalISel/legalize-shift.mir     |  52 ++---
 .../AArch64/GlobalISel/legalize-simple.mir    | 183 ++++++-----------
 .../AArch64/GlobalISel/legalize-sub.mir       |  38 +---
 .../AArch64/GlobalISel/legalize-undef.mir     |  10 +-
 .../AArch64/GlobalISel/legalize-xor.mir       |  38 +---
 .../AArch64/GlobalISel/legalize-zextload.mir  |  25 +--
 18 files changed, 365 insertions(+), 807 deletions(-)

diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-add.mir b/test/CodeGen/AArch64/GlobalISel/legalize-add.mir
index 4b2d54bcd0d..fe6079c0db4 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-add.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-add.mir
@@ -1,37 +1,9 @@
-# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-# RUN: llc -O0 -run-pass=legalizer %s -o - | FileCheck %s
-
---- |
-  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-  target triple = "aarch64--"
-  define void @test_scalar_add_big() {
-  entry:
-    ret void
-  }
-  define void @test_scalar_add_big_nonpow2() {
-  entry:
-    ret void
-  }
-  define void @test_scalar_add_small() {
-  entry:
-    ret void
-  }
-  define void @test_vector_add() {
-  entry:
-    ret void
-  }
-  define void @test_vector_add_nonpow2() {
-  entry:
-    ret void
-  }
-...
-
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=aarch64 -run-pass=legalizer %s -o - | FileCheck %s
 ---
 name:            test_scalar_add_big
-body: |
+body:             |
   bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3
-
     ; CHECK-LABEL: name: test_scalar_add_big
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
@@ -47,63 +19,48 @@ body: |
     %1:_(s64) = COPY $x1
     %2:_(s64) = COPY $x2
     %3:_(s64) = COPY $x3
-    %4:_(s128) = G_MERGE_VALUES %0, %1
-    %5:_(s128) = G_MERGE_VALUES %2, %3
+    %4:_(s128) = G_MERGE_VALUES %0(s64), %1(s64)
+    %5:_(s128) = G_MERGE_VALUES %2(s64), %3(s64)
     %6:_(s128) = G_ADD %4, %5
-    %7:_(s64), %8:_(s64) = G_UNMERGE_VALUES %6
-    $x0 = COPY %7
-    $x1 = COPY %8
-...
+    %7:_(s64), %8:_(s64) = G_UNMERGE_VALUES %6(s128)
+    $x0 = COPY %7(s64)
+    $x1 = COPY %8(s64)
 
+...
 ---
 name:            test_scalar_add_big_nonpow2
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-  - { id: 6, class: _ }
-  - { id: 7, class: _ }
-  - { id: 8, class: _ }
-  - { id: 9, class: _ }
-body: |
+body:             |
   bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3
     ; CHECK-LABEL: name: test_scalar_add_big_nonpow2
-    ; CHECK-NOT: G_MERGE_VALUES
-    ; CHECK-NOT: G_UNMERGE_VALUES
-    ; CHECK-DAG: [[CARRY0_32:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-    ; CHECK-DAG: [[CARRY0:%[0-9]+]]:_(s1) = G_TRUNC [[CARRY0_32]]
-    ; CHECK: [[RES_LO:%[0-9]+]]:_(s64), [[CARRY1:%[0-9]+]]:_(s1) = G_UADDE %0, %1, [[CARRY0]]
-    ; CHECK: [[RES_MI:%[0-9]+]]:_(s64), [[CARRY2:%[0-9]+]]:_(s1) = G_UADDE %1, %2, [[CARRY1]]
-    ; CHECK: [[RES_HI:%[0-9]+]]:_(s64), {{%.*}}(s1) = G_UADDE %2, %3, [[CARRY2]]
-    ; CHECK-NOT: G_MERGE_VALUES
-    ; CHECK-NOT: G_UNMERGE_VALUES
-    ; CHECK: $x0 = COPY [[RES_LO]]
-    ; CHECK: $x1 = COPY [[RES_MI]]
-    ; CHECK: $x2 = COPY [[RES_HI]]
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s64) = COPY $x2
+    ; CHECK: [[COPY3:%[0-9]+]]:_(s64) = COPY $x3
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s32)
+    ; CHECK: [[UADDE:%[0-9]+]]:_(s64), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[COPY]], [[COPY1]], [[TRUNC]]
+    ; CHECK: [[UADDE2:%[0-9]+]]:_(s64), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[COPY1]], [[COPY2]], [[UADDE1]]
+    ; CHECK: [[UADDE4:%[0-9]+]]:_(s64), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[COPY2]], [[COPY3]], [[UADDE3]]
+    ; CHECK: $x0 = COPY [[UADDE]](s64)
+    ; CHECK: $x1 = COPY [[UADDE2]](s64)
+    ; CHECK: $x2 = COPY [[UADDE4]](s64)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s64) = COPY $x2
+    %3:_(s64) = COPY $x3
+    %4:_(s192) = G_MERGE_VALUES %0(s64), %1(s64), %2(s64)
+    %5:_(s192) = G_MERGE_VALUES %1(s64), %2(s64), %3(s64)
+    %6:_(s192) = G_ADD %4, %5
+    %7:_(s64), %8:_(s64), %9:_(s64) = G_UNMERGE_VALUES %6(s192)
+    $x0 = COPY %7(s64)
+    $x1 = COPY %8(s64)
+    $x2 = COPY %9(s64)
 
-    %0(s64) = COPY $x0
-    %1(s64) = COPY $x1
-    %2(s64) = COPY $x2
-    %3(s64) = COPY $x3
-    %4(s192) = G_MERGE_VALUES %0, %1, %2
-    %5(s192) = G_MERGE_VALUES %1, %2, %3
-    %6(s192) = G_ADD %4, %5
-    %7(s64), %8(s64), %9(s64) = G_UNMERGE_VALUES %6
-    $x0 = COPY %7
-    $x1 = COPY %8
-    $x2 = COPY %9
 ...
-
 ---
 name:            test_scalar_add_small
-body: |
+body:             |
   bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3
-
     ; CHECK-LABEL: name: test_scalar_add_small
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
@@ -114,19 +71,17 @@ body: |
     ; CHECK: $x0 = COPY [[ANYEXT]](s64)
     %0:_(s64) = COPY $x0
     %1:_(s64) = COPY $x1
-    %2:_(s8) = G_TRUNC %0
-    %3:_(s8) = G_TRUNC %1
+    %2:_(s8) = G_TRUNC %0(s64)
+    %3:_(s8) = G_TRUNC %1(s64)
     %4:_(s8) = G_ADD %2, %3
-    %5:_(s64) = G_ANYEXT %4
-    $x0 = COPY %5
-...
+    %5:_(s64) = G_ANYEXT %4(s8)
+    $x0 = COPY %5(s64)
 
+...
 ---
 name:            test_vector_add
-body: |
+body:             |
   bb.0.entry:
-    liveins: $q0, $q1, $q2, $q3
-
     ; CHECK-LABEL: name: test_vector_add
     ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
     ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1
@@ -140,50 +95,39 @@ body: |
     %1:_(<2 x s64>) = COPY $q1
     %2:_(<2 x s64>) = COPY $q2
     %3:_(<2 x s64>) = COPY $q3
-    %4:_(<4 x s64>) = G_MERGE_VALUES %0, %1
-    %5:_(<4 x s64>) = G_MERGE_VALUES %2, %3
+    %4:_(<4 x s64>) = G_MERGE_VALUES %0(<2 x s64>), %1(<2 x s64>)
+    %5:_(<4 x s64>) = G_MERGE_VALUES %2(<2 x s64>), %3(<2 x s64>)
     %6:_(<4 x s64>) = G_ADD %4, %5
-    %7:_(<2 x s64>), %8:_(<2 x s64>) = G_UNMERGE_VALUES %6
-    $q0 = COPY %7
-    $q1 = COPY %8
+    %7:_(<2 x s64>), %8:_(<2 x s64>) = G_UNMERGE_VALUES %6(<4 x s64>)
+    $q0 = COPY %7(<2 x s64>)
+    $q1 = COPY %8(<2 x s64>)
+
 ...
 ---
 name:            test_vector_add_nonpow2
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-  - { id: 6, class: _ }
-  - { id: 7, class: _ }
-  - { id: 8, class: _ }
-  - { id: 9, class: _ }
-body: |
+body:             |
   bb.0.entry:
-    liveins: $q0, $q1, $q2, $q3
     ; CHECK-LABEL: name: test_vector_add_nonpow2
-    ; CHECK-NOT: G_EXTRACT
-    ; CHECK-NOT: G_SEQUENCE
-    ; CHECK: [[RES_LO:%[0-9]+]]:_(<2 x s64>) = G_ADD %0, %1
-    ; CHECK: [[RES_MI:%[0-9]+]]:_(<2 x s64>) = G_ADD %1, %2
-    ; CHECK: [[RES_HI:%[0-9]+]]:_(<2 x s64>) = G_ADD %2, %3
-    ; CHECK-NOT: G_EXTRACT
-    ; CHECK-NOT: G_SEQUENCE
-    ; CHECK: $q0 = COPY [[RES_LO]]
-    ; CHECK: $q1 = COPY [[RES_MI]]
-    ; CHECK: $q2 = COPY [[RES_HI]]
+    ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1
+    ; CHECK: [[COPY2:%[0-9]+]]:_(<2 x s64>) = COPY $q2
+    ; CHECK: [[COPY3:%[0-9]+]]:_(<2 x s64>) = COPY $q3
+    ; CHECK: [[ADD:%[0-9]+]]:_(<2 x s64>) = G_ADD [[COPY]], [[COPY1]]
+    ; CHECK: [[ADD1:%[0-9]+]]:_(<2 x s64>) = G_ADD [[COPY1]], [[COPY2]]
+    ; CHECK: [[ADD2:%[0-9]+]]:_(<2 x s64>) = G_ADD [[COPY2]], [[COPY3]]
+    ; CHECK: $q0 = COPY [[ADD]](<2 x s64>)
+    ; CHECK: $q1 = COPY [[ADD1]](<2 x s64>)
+    ; CHECK: $q2 = COPY [[ADD2]](<2 x s64>)
+    %0:_(<2 x s64>) = COPY $q0
+    %1:_(<2 x s64>) = COPY $q1
+    %2:_(<2 x s64>) = COPY $q2
+    %3:_(<2 x s64>) = COPY $q3
+    %4:_(<6 x s64>) = G_MERGE_VALUES %0(<2 x s64>), %1(<2 x s64>), %2(<2 x s64>)
+    %5:_(<6 x s64>) = G_MERGE_VALUES %1(<2 x s64>), %2(<2 x s64>), %3(<2 x s64>)
+    %6:_(<6 x s64>) = G_ADD %4, %5
+    %7:_(<2 x s64>), %8:_(<2 x s64>), %9:_(<2 x s64>) = G_UNMERGE_VALUES %6(<6 x s64>)
+    $q0 = COPY %7(<2 x s64>)
+    $q1 = COPY %8(<2 x s64>)
+    $q2 = COPY %9(<2 x s64>)
 
-    %0(<2 x s64>) = COPY $q0
-    %1(<2 x s64>) = COPY $q1
-    %2(<2 x s64>) = COPY $q2
-    %3(<2 x s64>) = COPY $q3
-    %4(<6 x s64>) = G_MERGE_VALUES %0, %1, %2
-    %5(<6 x s64>) = G_MERGE_VALUES %1, %2, %3
-    %6(<6 x s64>) = G_ADD %4, %5
-    %7(<2 x s64>), %8(<2 x s64>), %9(<2 x s64>) = G_UNMERGE_VALUES %6
-    $q0 = COPY %7
-    $q1 = COPY %8
-    $q2 = COPY %9
 ...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-and.mir b/test/CodeGen/AArch64/GlobalISel/legalize-and.mir
index fdcf79e5536..af683e302f4 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-and.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-and.mir
@@ -1,29 +1,9 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O0 -run-pass=legalizer %s -o - | FileCheck %s
-
---- |
-  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-  target triple = "aarch64--"
-  define void @test_scalar_and_small() {
-  entry:
-    ret void
-  }
-...
-
+# RUN: llc -march=aarch64 -run-pass=legalizer %s -o - | FileCheck %s
 ---
 name:            test_scalar_and_small
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-  - { id: 6, class: _ }
-body: |
+body:             |
   bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3
-
     ; CHECK-LABEL: name: test_scalar_and_small
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
@@ -34,13 +14,14 @@ body: |
     ; CHECK: $w0 = COPY [[COPY2]](s32)
     ; CHECK: [[COPY3:%[0-9]+]]:_(s64) = COPY [[COPY]](s64)
     ; CHECK: $x0 = COPY [[COPY3]](s64)
-    %0(s64) = COPY $x0
-    %1(s64) = COPY $x1
-    %2(s8) = G_TRUNC %0
-    %3(s8) = G_TRUNC %1
-    %4(s8) = G_AND %2, %3
-    %6(s32) = G_ANYEXT %4
-    $w0 = COPY %6
-    %5(s64) = G_ANYEXT %2
-    $x0 = COPY %5
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s8) = G_TRUNC %0(s64)
+    %3:_(s8) = G_TRUNC %1(s64)
+    %4:_(s8) = G_AND %2, %3
+    %6:_(s32) = G_ANYEXT %4(s8)
+    $w0 = COPY %6(s32)
+    %5:_(s64) = G_ANYEXT %2(s8)
+    $x0 = COPY %5(s64)
+
 ...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-cmp.mir b/test/CodeGen/AArch64/GlobalISel/legalize-cmp.mir
index ca5646a1c7b..ef86df6a5c1 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-cmp.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-cmp.mir
@@ -1,36 +1,9 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O0 -run-pass=legalizer %s -o - | FileCheck %s
-
---- |
-  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-  target triple = "aarch64--"
-  define void @test_icmp() {
-  entry:
-    ret void
-  }
-...
-
+# RUN: llc -march=aarch64 -run-pass=legalizer %s -o - | FileCheck %s
 ---
 name:            test_icmp
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-  - { id: 6, class: _ }
-  - { id: 7, class: _ }
-  - { id: 8, class: _ }
-  - { id: 9, class: _ }
-  - { id: 10, class: _ }
-  - { id: 11, class: _ }
-  - { id: 12, class: _ }
-  - { id: 13, class: _ }
-  - { id: 14, class: _ }
-body: |
+body:             |
   bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3
     ; CHECK-LABEL: name: test_icmp
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x0
@@ -50,22 +23,19 @@ body: |
     ; CHECK: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[INTTOPTR]](p0), [[INTTOPTR]]
     ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[ICMP2]](s32)
     ; CHECK: $w0 = COPY [[COPY4]](s32)
-    %0(s64) = COPY $x0
-    %1(s64) = COPY $x0
-
-    %2(s8) = G_TRUNC %0
-    %3(s8) = G_TRUNC %1
-
-    %4(s1) = G_ICMP intpred(sge), %0, %1
-    %11(s32) = G_ANYEXT %4
-    $w0 = COPY %11
-
-    %8(s1) = G_ICMP intpred(ult), %2, %3
-    %12(s32) = G_ANYEXT %8
-    $w0 = COPY %12
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x0
+    %2:_(s8) = G_TRUNC %0(s64)
+    %3:_(s8) = G_TRUNC %1(s64)
+    %4:_(s1) = G_ICMP intpred(sge), %0(s64), %1
+    %11:_(s32) = G_ANYEXT %4(s1)
+    $w0 = COPY %11(s32)
+    %8:_(s1) = G_ICMP intpred(ult), %2(s8), %3
+    %12:_(s32) = G_ANYEXT %8(s1)
+    $w0 = COPY %12(s32)
+    %9:_(p0) = G_INTTOPTR %0(s64)
+    %10:_(s1) = G_ICMP intpred(eq), %9(p0), %9
+    %14:_(s32) = G_ANYEXT %10(s1)
+    $w0 = COPY %14(s32)
 
-    %9(p0) = G_INTTOPTR %0(s64)
-    %10(s1) = G_ICMP intpred(eq), %9(p0), %9(p0)
-    %14(s32) = G_ANYEXT %10
-    $w0 = COPY %14
 ...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-div.mir b/test/CodeGen/AArch64/GlobalISel/legalize-div.mir
index a21b83bb5ca..4753e17ca1c 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-div.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-div.mir
@@ -1,27 +1,9 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O0 -run-pass=legalizer %s -o - | FileCheck %s
-
---- |
-  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-  target triple = "aarch64--"
-  define void @test_div() {
-  entry:
-    ret void
-  }
-...
-
+# RUN: llc -march=aarch64 -run-pass=legalizer %s -o - | FileCheck %s
 ---
 name:            test_div
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-body: |
+body:             |
   bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3
     ; CHECK-LABEL: name: test_div
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
@@ -45,19 +27,15 @@ body: |
     ; CHECK: [[UDIV:%[0-9]+]]:_(s32) = G_UDIV [[AND]], [[AND1]]
     ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[UDIV]](s32)
     ; CHECK: $w0 = COPY [[COPY3]](s32)
-    %0(s64) = COPY $x0
-    %1(s64) = COPY $x1
-    %2(s8) = G_TRUNC %0
-    %3(s8) = G_TRUNC %1
-
-
-    %4(s8) = G_SDIV %2, %3
-    %6:_(s32) = G_ANYEXT %4
-    $w0 = COPY %6
-
-
-    %5(s8) = G_UDIV %2, %3
-    %7:_(s32) = G_ANYEXT %5
-    $w0 = COPY %7
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s8) = G_TRUNC %0(s64)
+    %3:_(s8) = G_TRUNC %1(s64)
+    %4:_(s8) = G_SDIV %2, %3
+    %6:_(s32) = G_ANYEXT %4(s8)
+    $w0 = COPY %6(s32)
+    %5:_(s8) = G_UDIV %2, %3
+    %7:_(s32) = G_ANYEXT %5(s8)
+    $w0 = COPY %7(s32)
 
 ...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-ext.mir b/test/CodeGen/AArch64/GlobalISel/legalize-ext.mir
index c4bcbb683c1..b1be33cbeb5 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-ext.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-ext.mir
@@ -1,40 +1,9 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O0 -run-pass=legalizer %s -o - | FileCheck %s
-
---- |
-  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-  target triple = "aarch64--"
-  define void @test_ext() {
-  entry:
-    ret void
-  }
-...
-
+# RUN: llc -march=aarch64 -run-pass=legalizer %s -o - | FileCheck %s
 ---
 name:            test_ext
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-  - { id: 6, class: _ }
-  - { id: 7, class: _ }
-  - { id: 8, class: _ }
-  - { id: 9, class: _ }
-  - { id: 10, class: _ }
-  - { id: 11, class: _ }
-  - { id: 12, class: _ }
-  - { id: 13, class: _ }
-  - { id: 14, class: _ }
-  - { id: 15, class: _ }
-  - { id: 16, class: _ }
-  - { id: 17, class: _ }
-  - { id: 18, class: _ }
-body: |
+body:             |
   bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3
     ; CHECK-LABEL: name: test_ext
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
@@ -98,55 +67,48 @@ body: |
     ; CHECK: $w0 = COPY [[C8]](s32)
     ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
     ; CHECK: $w0 = COPY [[DEF]](s32)
-    %0(s64) = COPY $x0
-
-    %1(s1) = G_TRUNC %0
-    %19:_(s32) = G_ANYEXT %1
-    $w0 = COPY %19
-    %2(s8) = G_TRUNC %0
-    %20:_(s32) = G_ANYEXT %2
-    $w0 = COPY %20
-    %3(s16) = G_TRUNC %0
-    %21:_(s32) = G_ANYEXT %3
-    $w0 = COPY %21
-    %4(s32) = G_TRUNC %0
-    $w0 = COPY %4
-
-    %5(s64) = G_ANYEXT %1
-    $x0 = COPY %5
-    %6(s64) = G_ZEXT %2
-    $x0 = COPY %6
-    %7(s64) = G_ANYEXT %3
-    $x0 = COPY %7
-    %8(s64) = G_SEXT %4
-    $x0 = COPY %8
-
-    %9(s32) = G_SEXT %1
-    $w0 = COPY %9
-    %10(s32) = G_ZEXT %2
-    $w0 = COPY %10
-    %11(s32) = G_ANYEXT %3
-    $w0 = COPY %11
-
-    %12(s32) = G_ZEXT %1
-    $w0 = COPY %12
-    %13(s32) = G_ANYEXT %2
-    $w0 = COPY %13
-    %14(s32) = G_SEXT %3
-    $w0 = COPY %14
-
-    %15(s8) = G_ZEXT %1
-    %22:_(s32) = G_ANYEXT %15
-    $w0 = COPY %22
-    %16(s16) = G_ANYEXT %2
-    %23:_(s32) = G_ANYEXT %16
-    $w0 = COPY %23
-
-    %17(s32) = G_TRUNC  %0
-    $w0 = COPY %17
-    %18(s64) = G_FPEXT %17
-    $x0 = COPY %18
-
+    %0:_(s64) = COPY $x0
+    %1:_(s1) = G_TRUNC %0(s64)
+    %19:_(s32) = G_ANYEXT %1(s1)
+    $w0 = COPY %19(s32)
+    %2:_(s8) = G_TRUNC %0(s64)
+    %20:_(s32) = G_ANYEXT %2(s8)
+    $w0 = COPY %20(s32)
+    %3:_(s16) = G_TRUNC %0(s64)
+    %21:_(s32) = G_ANYEXT %3(s16)
+    $w0 = COPY %21(s32)
+    %4:_(s32) = G_TRUNC %0(s64)
+    $w0 = COPY %4(s32)
+    %5:_(s64) = G_ANYEXT %1(s1)
+    $x0 = COPY %5(s64)
+    %6:_(s64) = G_ZEXT %2(s8)
+    $x0 = COPY %6(s64)
+    %7:_(s64) = G_ANYEXT %3(s16)
+    $x0 = COPY %7(s64)
+    %8:_(s64) = G_SEXT %4(s32)
+    $x0 = COPY %8(s64)
+    %9:_(s32) = G_SEXT %1(s1)
+    $w0 = COPY %9(s32)
+    %10:_(s32) = G_ZEXT %2(s8)
+    $w0 = COPY %10(s32)
+    %11:_(s32) = G_ANYEXT %3(s16)
+    $w0 = COPY %11(s32)
+    %12:_(s32) = G_ZEXT %1(s1)
+    $w0 = COPY %12(s32)
+    %13:_(s32) = G_ANYEXT %2(s8)
+    $w0 = COPY %13(s32)
+    %14:_(s32) = G_SEXT %3(s16)
+    $w0 = COPY %14(s32)
+    %15:_(s8) = G_ZEXT %1(s1)
+    %22:_(s32) = G_ANYEXT %15(s8)
+    $w0 = COPY %22(s32)
+    %16:_(s16) = G_ANYEXT %2(s8)
+    %23:_(s32) = G_ANYEXT %16(s16)
+    $w0 = COPY %23(s32)
+    %17:_(s32) = G_TRUNC %0(s64)
+    $w0 = COPY %17(s32)
+    %18:_(s64) = G_FPEXT %17(s32)
+    $x0 = COPY %18(s64)
     %24:_(s16) = G_IMPLICIT_DEF
     %25:_(s32) = G_ZEXT %24(s16)
     $w0 = COPY %25(s32)
@@ -154,4 +116,5 @@ body: |
     $w0 = COPY %26(s32)
     %27:_(s32) = G_ANYEXT %24(s16)
     $w0 = COPY %27(s32)
+
 ...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-extload.mir b/test/CodeGen/AArch64/GlobalISel/legalize-extload.mir
index 816484108d2..a26704497c3 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-extload.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-extload.mir
@@ -1,24 +1,15 @@
-# RUN: llc -O0 -run-pass=legalizer %s -o - -verify-machineinstrs | FileCheck %s
-
---- |
-  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-  target triple = "aarch64--"
-  define void @test_extload(i8* %addr) {
-  entry:
-    ret void
-  }
-...
-
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=aarch64 -run-pass=legalizer %s -o - -verify-machineinstrs | FileCheck %s
 ---
 name:            test_extload
 body: |
   bb.0.entry:
     liveins: $x0
     ; CHECK-LABEL: name: test_extload
-    ; CHECK: [[T0:%[0-9]+]]:_(p0) = COPY $x0
-    ; CHECK: [[T1:%[0-9]+]]:_(s32) = G_LOAD [[T0]](p0) :: (load 1 from %ir.addr)
-    ; CHECK: $w0 = COPY [[T1]](s32)
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 1)
+    ; CHECK: $w0 = COPY [[LOAD]](s32)
     %0:_(p0) = COPY $x0
-    %1:_(s32) = G_LOAD %0 :: (load 1 from %ir.addr)
+    %1:_(s32) = G_LOAD %0 :: (load 1)
     $w0 = COPY %1
 ...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-fcmp.mir b/test/CodeGen/AArch64/GlobalISel/legalize-fcmp.mir
index 7a688e7eb93..2176bb021f7 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-fcmp.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-fcmp.mir
@@ -1,29 +1,9 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O0 -run-pass=legalizer %s -o - | FileCheck %s
-
---- |
-  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-  target triple = "aarch64--"
-  define void @test_icmp() {
-  entry:
-    ret void
-  }
-...
-
+# RUN: llc -march=aarch64 -run-pass=legalizer %s -o - | FileCheck %s
 ---
 name:            test_icmp
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-  - { id: 6, class: _ }
-  - { id: 7, class: _ }
-body: |
+body:             |
   bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3
     ; CHECK-LABEL: name: test_icmp
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x0
@@ -33,15 +13,13 @@ body: |
     ; CHECK: $w0 = COPY [[FCMP]](s32)
     ; CHECK: [[FCMP1:%[0-9]+]]:_(s32) = G_FCMP floatpred(uno), [[TRUNC]](s32), [[TRUNC1]]
     ; CHECK: $w0 = COPY [[FCMP1]](s32)
-    %0(s64) = COPY $x0
-    %1(s64) = COPY $x0
-
-    %2(s32) = G_TRUNC %0
-    %3(s32) = G_TRUNC %1
-
-    %4(s32) = G_FCMP floatpred(oge), %0, %1
-    $w0 = COPY %4
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x0
+    %2:_(s32) = G_TRUNC %0(s64)
+    %3:_(s32) = G_TRUNC %1(s64)
+    %4:_(s32) = G_FCMP floatpred(oge), %0(s64), %1
+    $w0 = COPY %4(s32)
+    %5:_(s32) = G_FCMP floatpred(uno), %2(s32), %3
+    $w0 = COPY %5(s32)
 
-    %5(s32) = G_FCMP floatpred(uno), %2, %3
-    $w0 = COPY %5
 ...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-gep.mir b/test/CodeGen/AArch64/GlobalISel/legalize-gep.mir
index f7d77c72a38..373a1db41ed 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-gep.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-gep.mir
@@ -1,26 +1,9 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O0 -run-pass=legalizer %s -o - | FileCheck %s
-
---- |
-  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-  target triple = "aarch64--"
-  define void @test_gep_small() {
-  entry:
-    ret void
-  }
-...
-
+# RUN: llc -march=aarch64 -run-pass=legalizer %s -o - | FileCheck %s
 ---
 name:            test_gep_small
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-body: |
+body:             |
   bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3
-
     ; CHECK-LABEL: name: test_gep_small
     ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
@@ -30,9 +13,10 @@ body: |
     ; CHECK: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[SHL]], [[C]]
     ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[ASHR]](s64)
     ; CHECK: $x0 = COPY [[GEP]](p0)
-    %0(p0) = COPY $x0
-    %1(s64) = COPY $x1
-    %2(s8) = G_TRUNC %1
-    %3(p0) = G_GEP %0, %2(s8)
-    $x0 = COPY %3
+    %0:_(p0) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s8) = G_TRUNC %1(s64)
+    %3:_(p0) = G_GEP %0, %2(s8)
+    $x0 = COPY %3(p0)
+
 ...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-mul.mir b/test/CodeGen/AArch64/GlobalISel/legalize-mul.mir
index e6e6ab7825f..3260eb6ca6f 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-mul.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-mul.mir
@@ -1,34 +1,9 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O0 -run-pass=legalizer %s -o - | FileCheck %s
-
---- |
-  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-  target triple = "aarch64--"
-  define void @test_scalar_mul_small() {
-  entry:
-    ret void
-  }
-  define void @test_smul_overflow() {
-    ret void
-  }
-  define void @test_umul_overflow() {
-    ret void
-  }
-...
-
+# RUN: llc -march=aarch64 -run-pass=legalizer %s -o - | FileCheck %s
 ---
 name:            test_scalar_mul_small
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-body: |
+body:             |
   bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3
-
     ; CHECK-LABEL: name: test_scalar_mul_small
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
@@ -37,22 +12,19 @@ body: |
     ; CHECK: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[TRUNC]], [[TRUNC1]]
     ; CHECK: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[MUL]](s32)
     ; CHECK: $x0 = COPY [[ANYEXT]](s64)
-    %0(s64) = COPY $x0
-    %1(s64) = COPY $x1
-    %2(s8) = G_TRUNC %0
-    %3(s8) = G_TRUNC %1
-    %4(s8) = G_MUL %2, %3
-    %5(s64) = G_ANYEXT %4
-    $x0 = COPY %5
-...
-
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s8) = G_TRUNC %0(s64)
+    %3:_(s8) = G_TRUNC %1(s64)
+    %4:_(s8) = G_MUL %2, %3
+    %5:_(s64) = G_ANYEXT %4(s8)
+    $x0 = COPY %5(s64)
 
+...
 ---
 name:            test_smul_overflow
-body: |
+body:             |
   bb.0:
-    liveins: $x0, $x1, $w2, $w3
-
     ; CHECK-LABEL: name: test_smul_overflow
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
@@ -67,19 +39,15 @@ body: |
     %0:_(s64) = COPY $x0
     %1:_(s64) = COPY $x1
     %2:_(s64), %3:_(s1) = G_SMULO %0, %1
-    $x0 = COPY %2
-    %4:_(s32) = G_ANYEXT %3
-    $w0 = COPY %4
+    $x0 = COPY %2(s64)
+    %4:_(s32) = G_ANYEXT %3(s1)
+    $w0 = COPY %4(s32)
 
 ...
-
-
 ---
 name:            test_umul_overflow
-body: |
+body:             |
   bb.0:
-    liveins: $x0, $x1, $w2, $w3
-
     ; CHECK-LABEL: name: test_umul_overflow
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
@@ -93,8 +61,8 @@ body: |
     %0:_(s64) = COPY $x0
     %1:_(s64) = COPY $x1
     %2:_(s64), %3:_(s1) = G_UMULO %0, %1
-    $x0 = COPY %2
-    %4:_(s32) = G_ANYEXT %3
-    $w0 = COPY %4
+    $x0 = COPY %2(s64)
+    %4:_(s32) = G_ANYEXT %3(s1)
+    $w0 = COPY %4(s32)
 
 ...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-pow.mir b/test/CodeGen/AArch64/GlobalISel/legalize-pow.mir
index 0b328b6345e..3b301798bff 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-pow.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-pow.mir
@@ -1,40 +1,35 @@
-# RUN: llc -O0 -run-pass=legalizer %s -o - | FileCheck %s
-
---- |
-  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-  target triple = "aarch64--"
-  define void @test_pow() {
-  entry:
-    ret void
-  }
-...
-
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=aarch64 -run-pass=legalizer %s -o - | FileCheck %s
 ---
 name:            test_pow
-body: |
+body:             |
   bb.0.entry:
-    liveins: $d0, $d1, $s2, $s3
-
     ; CHECK-LABEL: name: test_pow
-    ; CHECK: hasCalls: true
-
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $d0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $d1
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $s2
+    ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY $s3
+    ; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
+    ; CHECK: $d0 = COPY [[COPY]](s64)
+    ; CHECK: $d1 = COPY [[COPY1]](s64)
+    ; CHECK: BL &pow, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $d0, implicit $d1, implicit-def $d0
+    ; CHECK: [[COPY4:%[0-9]+]]:_(s64) = COPY $d0
+    ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
+    ; CHECK: $x0 = COPY [[COPY4]](s64)
+    ; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
+    ; CHECK: $s0 = COPY [[COPY2]](s32)
+    ; CHECK: $s1 = COPY [[COPY3]](s32)
+    ; CHECK: BL &powf, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $s0, implicit $s1, implicit-def $s0
+    ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
+    ; CHECK: $w0 = COPY [[COPY5]](s32)
     %0:_(s64) = COPY $d0
     %1:_(s64) = COPY $d1
     %2:_(s32) = COPY $s2
     %3:_(s32) = COPY $s3
-
-    ; CHECK: $d0 = COPY %0
-    ; CHECK: $d1 = COPY %1
-    ; CHECK: BL &pow, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $d0, implicit $d1, implicit-def $d0
-    ; CHECK: %4:_(s64) = COPY $d0
     %4:_(s64) = G_FPOW %0, %1
-    $x0 = COPY %4
-
-    ; CHECK: $s0 = COPY %2
-    ; CHECK: $s1 = COPY %3
-    ; CHECK: BL &powf, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $s0, implicit $s1, implicit-def $s0
-    ; CHECK: %5:_(s32) = COPY $s0
+    $x0 = COPY %4(s64)
     %5:_(s32) = G_FPOW %2, %3
-    $w0 = COPY %5
+    $w0 = COPY %5(s32)
 
 ...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-rem.mir b/test/CodeGen/AArch64/GlobalISel/legalize-rem.mir
index 35e71d61556..69d1b6d761d 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-rem.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-rem.mir
@@ -1,37 +1,9 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O0 -run-pass=legalizer %s -o - | FileCheck %s
-
---- |
-  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-  target triple = "aarch64--"
-  define void @test_urem_64() {
-  entry:
-    ret void
-  }
-  define void @test_srem_32() {
-  entry:
-    ret void
-  }
-  define void @test_srem_8() {
-  entry:
-    ret void
-  }
-  define void @test_frem() {
-  entry:
-    ret void
-  }
-...
-
+# RUN: llc -march=aarch64 -run-pass=legalizer %s -o - | FileCheck %s
 ---
 name:            test_urem_64
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-body: |
+body:             |
   bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3
-
     ; CHECK-LABEL: name: test_urem_64
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
@@ -39,25 +11,16 @@ body: |
     ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[UDIV]], [[COPY1]]
     ; CHECK: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[COPY]], [[MUL]]
     ; CHECK: $x0 = COPY [[SUB]](s64)
-    %0(s64) = COPY $x0
-    %1(s64) = COPY $x1
-    %2(s64) = G_UREM %0, %1
-    $x0 = COPY %2
-
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s64) = G_UREM %0, %1
+    $x0 = COPY %2(s64)
 
 ...
 ---
 name:            test_srem_32
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-body: |
+body:             |
   bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3
-
     ; CHECK-LABEL: name: test_srem_32
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
@@ -67,27 +30,18 @@ body: |
     ; CHECK: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[SDIV]], [[TRUNC1]]
     ; CHECK: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[TRUNC]], [[MUL]]
     ; CHECK: $w0 = COPY [[SUB]](s32)
-    %0(s64) = COPY $x0
-    %1(s64) = COPY $x1
-    %3(s32) = G_TRUNC %0
-    %4(s32) = G_TRUNC %1
-    %5(s32) = G_SREM %3, %4
-    $w0 = COPY %5
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s32) = G_TRUNC %0(s64)
+    %3:_(s32) = G_TRUNC %1(s64)
+    %4:_(s32) = G_SREM %2, %3
+    $w0 = COPY %4(s32)
 
 ...
 ---
 name:            test_srem_8
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 6, class: _ }
-  - { id: 7, class: _ }
-  - { id: 8, class: _ }
-body: |
+body:             |
   bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3
-
-
     ; CHECK-LABEL: name: test_srem_8
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
@@ -108,27 +62,19 @@ body: |
     ; CHECK: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[TRUNC3]], [[COPY3]]
     ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SUB]](s32)
     ; CHECK: $w0 = COPY [[COPY4]](s32)
-    %0(s64) = COPY $x0
-    %1(s64) = COPY $x1
-    %6(s8) = G_TRUNC %0
-    %7(s8) = G_TRUNC %1
-    %8(s8) = G_SREM %6, %7
-    %9:_(s32) = G_ANYEXT %8
-    $w0 = COPY %9
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s8) = G_TRUNC %0(s64)
+    %3:_(s8) = G_TRUNC %1(s64)
+    %4:_(s8) = G_SREM %2, %3
+    %5:_(s32) = G_ANYEXT %4(s8)
+    $w0 = COPY %5(s32)
+
 ...
 ---
 name:            test_frem
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-body: |
+body:             |
   bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3
-
     ; CHECK-LABEL: name: test_frem
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
@@ -148,12 +94,13 @@ body: |
     ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY $s0
     ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
     ; CHECK: $w0 = COPY [[COPY3]](s32)
-    %0(s64) = COPY $x0
-    %1(s64) = COPY $x1
-    %2(s64) = G_FREM %0, %1
-    $x0 = COPY %2
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s64) = G_FREM %0, %1
+    $x0 = COPY %2(s64)
+    %3:_(s32) = G_TRUNC %0(s64)
+    %4:_(s32) = G_TRUNC %1(s64)
+    %5:_(s32) = G_FREM %3, %4
+    $w0 = COPY %5(s32)
 
-    %3(s32) = G_TRUNC %0
-    %4(s32) = G_TRUNC %1
-    %5(s32) = G_FREM %3, %4
-    $w0 = COPY %5
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-sextload.mir b/test/CodeGen/AArch64/GlobalISel/legalize-sextload.mir
index cfd1550303f..7f568f8d617 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-sextload.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-sextload.mir
@@ -1,24 +1,15 @@
-# RUN: llc -O0 -run-pass=legalizer %s -o - -verify-machineinstrs | FileCheck %s
-
---- |
-  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-  target triple = "aarch64--"
-  define void @test_zextload(i8* %addr) {
-  entry:
-    ret void
-  }
-...
-
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=aarch64 -run-pass=legalizer %s -o - -verify-machineinstrs | FileCheck %s
 ---
-name:            test_zextload
+name:            test_sextload
 body: |
   bb.0.entry:
     liveins: $x0
-    ; CHECK-LABEL: name: test_zextload
-    ; CHECK: [[T0:%[0-9]+]]:_(p0) = COPY $x0
-    ; CHECK: [[T1:%[0-9]+]]:_(s32) = G_SEXTLOAD [[T0]](p0) :: (load 1 from %ir.addr)
-    ; CHECK: $w0 = COPY [[T1]](s32)
+    ; CHECK-LABEL: name: test_sextload
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[COPY]](p0) :: (load 1)
+    ; CHECK: $w0 = COPY [[SEXTLOAD]](s32)
     %0:_(p0) = COPY $x0
-    %1:_(s32) = G_SEXTLOAD %0 :: (load 1 from %ir.addr)
+    %1:_(s32) = G_SEXTLOAD %0 :: (load 1)
     $w0 = COPY %1
 ...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-shift.mir b/test/CodeGen/AArch64/GlobalISel/legalize-shift.mir
index 781b5d8cde8..ad1f431c160 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-shift.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-shift.mir
@@ -1,28 +1,9 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O0 -run-pass=legalizer %s -o - | FileCheck %s
-
---- |
-  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-  target triple = "aarch64--"
-  define void @test_shift() {
-  entry:
-    ret void
-  }
-...
-
+# RUN: llc -march=aarch64 -run-pass=legalizer %s -o - | FileCheck %s
 ---
 name:            test_shift
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-  - { id: 6, class: _ }
-body: |
+body:             |
   bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3
     ; CHECK-LABEL: name: test_shift
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
@@ -52,21 +33,18 @@ body: |
     ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[TRUNC4]], [[AND3]]
     ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SHL1]](s32)
     ; CHECK: $w0 = COPY [[COPY4]](s32)
-    %0(s64) = COPY $x0
-    %1(s64) = COPY $x1
-    %2(s8) = G_TRUNC %0
-    %3(s8) = G_TRUNC %1
-
-    %4(s8) = G_ASHR %2, %3
-    %7:_(s32) = G_ANYEXT %4
-    $w0 = COPY %7
-
-
-    %5(s8) = G_LSHR %2, %3
-    %8:_(s32) = G_ANYEXT %5
-    $w0 = COPY %8
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s8) = G_TRUNC %0(s64)
+    %3:_(s8) = G_TRUNC %1(s64)
+    %4:_(s8) = G_ASHR %2, %3
+    %7:_(s32) = G_ANYEXT %4(s8)
+    $w0 = COPY %7(s32)
+    %5:_(s8) = G_LSHR %2, %3
+    %8:_(s32) = G_ANYEXT %5(s8)
+    $w0 = COPY %8(s32)
+    %6:_(s8) = G_SHL %2, %3
+    %9:_(s32) = G_ANYEXT %6(s8)
+    $w0 = COPY %9(s32)
 
-    %6(s8) = G_SHL %2, %3
-    %9:_(s32) = G_ANYEXT %6
-    $w0 = COPY %9
 ...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-simple.mir b/test/CodeGen/AArch64/GlobalISel/legalize-simple.mir
index 3da689d4265..51cda7d793e 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-simple.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-simple.mir
@@ -1,49 +1,10 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O0 -run-pass=legalizer %s -o - | FileCheck %s
-
---- |
-  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-  target triple = "aarch64--"
-  define void @test_simple() {
-  entry:
-    ret void
-  next:
-    ret void
-  }
-  define void @bitcast128() {
-    ret void
-  }
-  define void @testExtOfCopyOfTrunc() {
-    ret void
-  }
-  define void @testExtOf2CopyOfTrunc() {
-    ret void
-  }
-...
-
+# RUN: llc -march=aarch64 -run-pass=legalizer %s -o - | FileCheck %s
 ---
 name:            test_simple
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-  - { id: 6, class: _ }
-  - { id: 7, class: _ }
-  - { id: 8, class: _ }
-  - { id: 9, class: _ }
-  - { id: 10, class: _ }
-  - { id: 11, class: _ }
-  - { id: 12, class: _ }
-  - { id: 13, class: _ }
-  - { id: 14, class: _ }
-  - { id: 15, class: _ }
-  - { id: 16, class: _ }
-body: |
+body:             |
   ; CHECK-LABEL: name: test_simple
-  ; CHECK: bb.0.{{[a-zA-Z0-9]+}}:
+  ; CHECK: bb.0.entry:
   ; CHECK:   successors: %bb.1(0x80000000)
   ; CHECK:   [[COPY:%[0-9]+]]:_(s64) = COPY $x0
   ; CHECK:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
@@ -52,7 +13,7 @@ body: |
   ; CHECK:   [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[INTTOPTR]](p0)
   ; CHECK:   $x0 = COPY [[PTRTOINT]](s64)
   ; CHECK:   G_BRCOND [[TRUNC]](s1), %bb.1
-  ; CHECK: bb.1.{{[a-zA-Z0-9]+}}:
+  ; CHECK: bb.1:
   ; CHECK:   [[TRUNC2:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
   ; CHECK:   [[TRUNC3:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
   ; CHECK:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[TRUNC]](s1), [[TRUNC2]], [[TRUNC3]]
@@ -83,127 +44,101 @@ body: |
   ; CHECK:   [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[BITCAST5]](<2 x s16>)
   ; CHECK:   $w0 = COPY [[BITCAST6]](s32)
   bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3
-    %0(s64) = COPY $x0
-
-    %1(s1) = G_TRUNC %0
-    %2(s8) = G_TRUNC %0
-    %3(s16) = G_TRUNC %0
-    %4(s32) = G_TRUNC %0
-
-    %5(p0) = G_INTTOPTR %0
-    %6(s64) = G_PTRTOINT %5
-    $x0 = COPY %6
-
-    G_BRCOND %1, %bb.1
-
-  bb.1.next:
-
-    %7(s1) = G_SELECT %1, %1, %1
-    %21:_(s32) = G_ANYEXT %7
-    $w0 = COPY %21
+    successors: %bb.1(0x80000000)
 
-    %8(s8) = G_SELECT %1, %2, %2
-    %20:_(s32) = G_ANYEXT %8
-    $w0 = COPY %20
+    %0:_(s64) = COPY $x0
+    %1:_(s1) = G_TRUNC %0(s64)
+    %2:_(s8) = G_TRUNC %0(s64)
+    %3:_(s16) = G_TRUNC %0(s64)
+    %4:_(s32) = G_TRUNC %0(s64)
+    %5:_(p0) = G_INTTOPTR %0(s64)
+    %6:_(s64) = G_PTRTOINT %5(p0)
+    $x0 = COPY %6(s64)
+    G_BRCOND %1(s1), %bb.1
 
-    %9(s16) = G_SELECT %1, %3, %3
-    %19:_(s32) = G_ANYEXT %9
-    $w0 = COPY %19
-
-    %10(s32) = G_SELECT %1, %4, %4
-    %11(s64) = G_SELECT %1, %0, %0
-    $x0 = COPY %11
+  bb.1:
+    %7:_(s1) = G_SELECT %1(s1), %1, %1
+    %17:_(s32) = G_ANYEXT %7(s1)
+    $w0 = COPY %17(s32)
+    %8:_(s8) = G_SELECT %1(s1), %2, %2
+    %18:_(s32) = G_ANYEXT %8(s8)
+    $w0 = COPY %18(s32)
+    %9:_(s16) = G_SELECT %1(s1), %3, %3
+    %19:_(s32) = G_ANYEXT %9(s16)
+    $w0 = COPY %19(s32)
+    %10:_(s32) = G_SELECT %1(s1), %4, %4
+    %11:_(s64) = G_SELECT %1(s1), %0, %0
+    $x0 = COPY %11(s64)
+    %12:_(<2 x s32>) = G_BITCAST %0(s64)
+    %13:_(s64) = G_BITCAST %12(<2 x s32>)
+    $x0 = COPY %13(s64)
+    %14:_(s32) = G_BITCAST %10(s32)
+    $w0 = COPY %14(s32)
+    %15:_(<4 x s8>) = G_BITCAST %0(s64)
+    %20:_(s32) = G_BITCAST %15(<4 x s8>)
+    $w0 = COPY %20(s32)
+    %16:_(<2 x s16>) = G_BITCAST %0(s64)
+    %21:_(s32) = G_BITCAST %16(<2 x s16>)
+    $w0 = COPY %21(s32)
 
-    %12(<2 x s32>) = G_BITCAST %0
-    %13(s64) = G_BITCAST %12
-    $x0 = COPY %13
-    %14(s32) = G_BITCAST %10
-    $w0 = COPY %14
-    %15(<4 x s8>) = G_BITCAST %0
-    %17:_(s32) = G_BITCAST %15
-    $w0 = COPY %17
-    %16(<2 x s16>) = G_BITCAST %0
-    %18:_(s32) = G_BITCAST %16
-    $w0 = COPY %18
 ...
-
 ---
 name:            bitcast128
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: _}
-  - { id: 1, class: _}
-  - { id: 2, class: _}
-  - { id: 3, class: _}
 body:             |
-  bb.1:
+  bb.0:
     liveins: $x0, $x1
-    ; This is legal and shouldn't be changed.
+
     ; CHECK-LABEL: name: bitcast128
-    ; CHECK: liveins: $x0, $x1
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
     ; CHECK: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[COPY]](s64), [[COPY1]](s64)
     ; CHECK: [[BITCAST:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[MV]](s128)
     ; CHECK: $q0 = COPY [[BITCAST]](<2 x s64>)
     ; CHECK: RET_ReallyLR implicit $q0
-    %0(s64) = COPY $x0
-    %1(s64) = COPY $x1
-    %3(s128) = G_MERGE_VALUES %0(s64), %1(s64)
-    %2(<2 x s64>) = G_BITCAST %3(s128)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %3:_(s128) = G_MERGE_VALUES %0(s64), %1(s64)
+    %2:_(<2 x s64>) = G_BITCAST %3(s128)
     $q0 = COPY %2(<2 x s64>)
     RET_ReallyLR implicit $q0
 
 ...
 ---
 name:            testExtOfCopyOfTrunc
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: _}
-  - { id: 1, class: _}
-  - { id: 2, class: _}
-  - { id: 3, class: _}
 body:             |
-  bb.1:
+  bb.0:
     liveins: $x0
+
     ; CHECK-LABEL: name: testExtOfCopyOfTrunc
-    ; CHECK: liveins: $x0
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY [[COPY]](s64)
     ; CHECK: $x0 = COPY [[COPY1]](s64)
     ; CHECK: RET_ReallyLR implicit $x0
-    %0(s64) = COPY $x0
-    %1(s1) = G_TRUNC %0
-    %2(s1) = COPY %1
-    %3(s64) = G_ANYEXT %2
-    $x0 = COPY %3
+    %0:_(s64) = COPY $x0
+    %1:_(s1) = G_TRUNC %0(s64)
+    %2:_(s1) = COPY %1(s1)
+    %3:_(s64) = G_ANYEXT %2(s1)
+    $x0 = COPY %3(s64)
     RET_ReallyLR implicit $x0
 
 ...
 ---
 name:            testExtOf2CopyOfTrunc
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: _}
-  - { id: 1, class: _}
-  - { id: 2, class: _}
-  - { id: 3, class: _}
 body:             |
-  bb.1:
+  bb.0:
     liveins: $x0
+
     ; CHECK-LABEL: name: testExtOf2CopyOfTrunc
-    ; CHECK: liveins: $x0
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY [[COPY]](s64)
     ; CHECK: $x0 = COPY [[COPY1]](s64)
     ; CHECK: RET_ReallyLR implicit $x0
-    %0(s64) = COPY $x0
-    %1(s1) = G_TRUNC %0
-    %2(s1) = COPY %1
-    %4:_(s1) = COPY %2
-    %3(s64) = G_ANYEXT %4
-    $x0 = COPY %3
+    %0:_(s64) = COPY $x0
+    %1:_(s1) = G_TRUNC %0(s64)
+    %2:_(s1) = COPY %1(s1)
+    %4:_(s1) = COPY %2(s1)
+    %3:_(s64) = G_ANYEXT %4(s1)
+    $x0 = COPY %3(s64)
     RET_ReallyLR implicit $x0
 
 ...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-sub.mir b/test/CodeGen/AArch64/GlobalISel/legalize-sub.mir
index 5f50ce047ce..32796e0948c 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-sub.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-sub.mir
@@ -1,28 +1,9 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O0 -run-pass=legalizer %s -o - | FileCheck %s
-
---- |
-  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-  target triple = "aarch64--"
-  define void @test_scalar_sub_small() {
-  entry:
-    ret void
-  }
-...
-
+# RUN: llc -march=aarch64 -run-pass=legalizer %s -o - | FileCheck %s
 ---
 name:            test_scalar_sub_small
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-body: |
+body:             |
   bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3
-
     ; CHECK-LABEL: name: test_scalar_sub_small
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
@@ -31,11 +12,12 @@ body: |
     ; CHECK: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[TRUNC]], [[TRUNC1]]
     ; CHECK: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[SUB]](s32)
     ; CHECK: $x0 = COPY [[ANYEXT]](s64)
-    %0(s64) = COPY $x0
-    %1(s64) = COPY $x1
-    %2(s8) = G_TRUNC %0
-    %3(s8) = G_TRUNC %1
-    %4(s8) = G_SUB %2, %3
-    %5(s64) = G_ANYEXT %4
-    $x0 = COPY %5
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s8) = G_TRUNC %0(s64)
+    %3:_(s8) = G_TRUNC %1(s64)
+    %4:_(s8) = G_SUB %2, %3
+    %5:_(s64) = G_ANYEXT %4(s8)
+    $x0 = COPY %5(s64)
+
 ...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-undef.mir b/test/CodeGen/AArch64/GlobalISel/legalize-undef.mir
index 022fd13d178..e46c9ad79c6 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-undef.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-undef.mir
@@ -1,9 +1,7 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=aarch64-linux-gnu -O0 -run-pass=legalizer %s -o - | FileCheck %s
-
+# RUN: llc -march=aarch64 -run-pass=legalizer %s -o - | FileCheck %s
 ---
 name:            test_implicit_def
-registers:
 body: |
   bb.0.entry:
     liveins:
@@ -12,7 +10,9 @@ body: |
     ; CHECK: [[DEF:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF
     ; CHECK: [[DEF1:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF
     ; CHECK: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[DEF]](s64), [[DEF1]](s64)
+    ; CHECK: [[TRUNC:%[0-9]+]]:_(s64) = G_TRUNC [[MV]](s128)
+    ; CHECK: $x0 = COPY [[TRUNC]](s64)
     %0:_(s128) = G_IMPLICIT_DEF
-    %1:_(s64) = G_TRUNC %0
-    $x0 = COPY %1
+    %1:_(s64) = G_TRUNC %0(s128)
+    $x0 = COPY %1(s64)
 ...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-xor.mir b/test/CodeGen/AArch64/GlobalISel/legalize-xor.mir
index 6958d30d365..3305c4baef4 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-xor.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-xor.mir
@@ -1,28 +1,9 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O0 -run-pass=legalizer %s -o - | FileCheck %s
-
---- |
-  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-  target triple = "aarch64--"
-  define void @test_scalar_xor_small() {
-  entry:
-    ret void
-  }
-...
-
+# RUN: llc -march=aarch64 -run-pass=legalizer %s -o - | FileCheck %s
 ---
 name:            test_scalar_xor_small
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-body: |
+body:             |
   bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3
-
     ; CHECK-LABEL: name: test_scalar_xor_small
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
@@ -31,11 +12,12 @@ body: |
     ; CHECK: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[TRUNC]], [[TRUNC1]]
     ; CHECK: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[XOR]](s32)
     ; CHECK: $x0 = COPY [[ANYEXT]](s64)
-    %0(s64) = COPY $x0
-    %1(s64) = COPY $x1
-    %2(s8) = G_TRUNC %0
-    %3(s8) = G_TRUNC %1
-    %4(s8) = G_XOR %2, %3
-    %5(s64) = G_ANYEXT %4
-    $x0 = COPY %5
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s8) = G_TRUNC %0(s64)
+    %3:_(s8) = G_TRUNC %1(s64)
+    %4:_(s8) = G_XOR %2, %3
+    %5:_(s64) = G_ANYEXT %4(s8)
+    $x0 = COPY %5(s64)
+
 ...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-zextload.mir b/test/CodeGen/AArch64/GlobalISel/legalize-zextload.mir
index 66c3f257736..ad3603d1d13 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-zextload.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-zextload.mir
@@ -1,24 +1,15 @@
-# RUN: llc -O0 -run-pass=legalizer %s -o - -verify-machineinstrs | FileCheck %s
-
---- |
-  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-  target triple = "aarch64--"
-  define void @test_sextload(i8* %addr) {
-  entry:
-    ret void
-  }
-...
-
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=aarch64 -run-pass=legalizer %s -o - -verify-machineinstrs | FileCheck %s
 ---
-name:            test_sextload
+name:            test_zextload
 body: |
   bb.0.entry:
     liveins: $x0
-    ; CHECK-LABEL: name: test_sextload
-    ; CHECK: [[T0:%[0-9]+]]:_(p0) = COPY $x0
-    ; CHECK: [[T1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[T0]](p0) :: (load 1 from %ir.addr)
-    ; CHECK: $w0 = COPY [[T1]](s32)
+    ; CHECK-LABEL: name: test_zextload
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load 1)
+    ; CHECK: $w0 = COPY [[ZEXTLOAD]](s32)
     %0:_(p0) = COPY $x0
-    %1:_(s32) = G_ZEXTLOAD %0 :: (load 1 from %ir.addr)
+    %1:_(s32) = G_ZEXTLOAD %0 :: (load 1)
     $w0 = COPY %1
 ...
-- 
GitLab


From 27ad7c20cc1fc02faa4cea18fef8361c0eb3b417 Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze@braunis.de>
Date: Tue, 6 Nov 2018 19:00:11 +0000
Subject: [PATCH 1024/1116] LivePhysRegs/IfConversion: Change some types from
 unsigned to MCPhysReg; NFC

Change the type in a couple of lists and sets that only store physical
registers from unsigned to MCPhysRegs. The later is only 16bits and
saves us a bit of memory.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346254 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/LivePhysRegs.h         | 17 +++++++++--------
 include/llvm/CodeGen/LiveRegUnits.h         |  8 ++++----
 lib/CodeGen/IfConversion.cpp                | 20 ++++++++++----------
 lib/CodeGen/LivePhysRegs.cpp                | 10 +++++-----
 lib/Target/Hexagon/HexagonFrameLowering.cpp |  2 +-
 lib/Target/PowerPC/PPCExpandISEL.cpp        |  2 +-
 lib/Target/X86/X86InstrInfo.cpp             |  2 +-
 7 files changed, 31 insertions(+), 30 deletions(-)

diff --git a/include/llvm/CodeGen/LivePhysRegs.h b/include/llvm/CodeGen/LivePhysRegs.h
index 301a45066b4..7312902e21b 100644
--- a/include/llvm/CodeGen/LivePhysRegs.h
+++ b/include/llvm/CodeGen/LivePhysRegs.h
@@ -48,7 +48,8 @@ class raw_ostream;
 /// when walking backward/forward through a basic block.
 class LivePhysRegs {
   const TargetRegisterInfo *TRI = nullptr;
-  SparseSet<unsigned> LiveRegs;
+  using RegisterSet = SparseSet<MCPhysReg, identity<MCPhysReg>>;
+  RegisterSet LiveRegs;
 
 public:
   /// Constructs an unitialized set. init() needs to be called to initialize it.
@@ -76,7 +77,7 @@ public:
   bool empty() const { return LiveRegs.empty(); }
 
   /// Adds a physical register and all its sub-registers to the set.
-  void addReg(unsigned Reg) {
+  void addReg(MCPhysReg Reg) {
     assert(TRI && "LivePhysRegs is not initialized.");
     assert(Reg <= TRI->getNumRegs() && "Expected a physical register.");
     for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
@@ -86,7 +87,7 @@ public:
 
   /// Removes a physical register, all its sub-registers, and all its
   /// super-registers from the set.
-  void removeReg(unsigned Reg) {
+  void removeReg(MCPhysReg Reg) {
     assert(TRI && "LivePhysRegs is not initialized.");
     assert(Reg <= TRI->getNumRegs() && "Expected a physical register.");
     for (MCRegAliasIterator R(Reg, TRI, true); R.isValid(); ++R)
@@ -95,7 +96,7 @@ public:
 
   /// Removes physical registers clobbered by the regmask operand \p MO.
   void removeRegsInMask(const MachineOperand &MO,
-        SmallVectorImpl<std::pair<unsigned, const MachineOperand*>> *Clobbers =
+        SmallVectorImpl<std::pair<MCPhysReg, const MachineOperand*>> *Clobbers =
         nullptr);
 
   /// Returns true if register \p Reg is contained in the set. This also
@@ -103,10 +104,10 @@ public:
   /// addReg() always adds all sub-registers to the set as well.
   /// Note: Returns false if just some sub registers are live, use available()
   /// when searching a free register.
-  bool contains(unsigned Reg) const { return LiveRegs.count(Reg); }
+  bool contains(MCPhysReg Reg) const { return LiveRegs.count(Reg); }
 
   /// Returns true if register \p Reg and no aliasing register is in the set.
-  bool available(const MachineRegisterInfo &MRI, unsigned Reg) const;
+  bool available(const MachineRegisterInfo &MRI, MCPhysReg Reg) const;
 
   /// Remove defined registers and regmask kills from the set.
   void removeDefs(const MachineInstr &MI);
@@ -126,7 +127,7 @@ public:
   /// defined or clobbered by a regmask.  The operand will identify whether this
   /// is a regmask or register operand.
   void stepForward(const MachineInstr &MI,
-        SmallVectorImpl<std::pair<unsigned, const MachineOperand*>> &Clobbers);
+        SmallVectorImpl<std::pair<MCPhysReg, const MachineOperand*>> &Clobbers);
 
   /// Adds all live-in registers of basic block \p MBB.
   /// Live in registers are the registers in the blocks live-in list and the
@@ -143,7 +144,7 @@ public:
   /// registers.
   void addLiveOutsNoPristines(const MachineBasicBlock &MBB);
 
-  using const_iterator = SparseSet<unsigned>::const_iterator;
+  using const_iterator = RegisterSet::const_iterator;
 
   const_iterator begin() const { return LiveRegs.begin(); }
   const_iterator end() const { return LiveRegs.end(); }
diff --git a/include/llvm/CodeGen/LiveRegUnits.h b/include/llvm/CodeGen/LiveRegUnits.h
index 249545906e0..5e9dd8b3cdf 100644
--- a/include/llvm/CodeGen/LiveRegUnits.h
+++ b/include/llvm/CodeGen/LiveRegUnits.h
@@ -85,14 +85,14 @@ public:
   bool empty() const { return Units.none(); }
 
   /// Adds register units covered by physical register \p Reg.
-  void addReg(unsigned Reg) {
+  void addReg(MCPhysReg Reg) {
     for (MCRegUnitIterator Unit(Reg, TRI); Unit.isValid(); ++Unit)
       Units.set(*Unit);
   }
 
   /// Adds register units covered by physical register \p Reg that are
   /// part of the lanemask \p Mask.
-  void addRegMasked(unsigned Reg, LaneBitmask Mask) {
+  void addRegMasked(MCPhysReg Reg, LaneBitmask Mask) {
     for (MCRegUnitMaskIterator Unit(Reg, TRI); Unit.isValid(); ++Unit) {
       LaneBitmask UnitMask = (*Unit).second;
       if (UnitMask.none() || (UnitMask & Mask).any())
@@ -101,7 +101,7 @@ public:
   }
 
   /// Removes all register units covered by physical register \p Reg.
-  void removeReg(unsigned Reg) {
+  void removeReg(MCPhysReg Reg) {
     for (MCRegUnitIterator Unit(Reg, TRI); Unit.isValid(); ++Unit)
       Units.reset(*Unit);
   }
@@ -115,7 +115,7 @@ public:
   void addRegsInMask(const uint32_t *RegMask);
 
   /// Returns true if no part of physical register \p Reg is live.
-  bool available(unsigned Reg) const {
+  bool available(MCPhysReg Reg) const {
     for (MCRegUnitIterator Unit(Reg, TRI); Unit.isValid(); ++Unit) {
       if (Units.test(*Unit))
         return false;
diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp
index f12d00071b2..5666626ab31 100644
--- a/lib/CodeGen/IfConversion.cpp
+++ b/lib/CodeGen/IfConversion.cpp
@@ -273,7 +273,7 @@ namespace {
     void PredicateBlock(BBInfo &BBI,
                         MachineBasicBlock::iterator E,
                         SmallVectorImpl<MachineOperand> &Cond,
-                        SmallSet<unsigned, 4> *LaterRedefs = nullptr);
+                        SmallSet<MCPhysReg, 4> *LaterRedefs = nullptr);
     void CopyAndPredicateBlock(BBInfo &ToBBI, BBInfo &FromBBI,
                                SmallVectorImpl<MachineOperand> &Cond,
                                bool IgnoreBr = false);
@@ -1366,12 +1366,12 @@ static void UpdatePredRedefs(MachineInstr &MI, LivePhysRegs &Redefs) {
   // Before stepping forward past MI, remember which regs were live
   // before MI. This is needed to set the Undef flag only when reg is
   // dead.
-  SparseSet<unsigned> LiveBeforeMI;
+  SparseSet<MCPhysReg, identity<MCPhysReg>> LiveBeforeMI;
   LiveBeforeMI.setUniverse(TRI->getNumRegs());
   for (unsigned Reg : Redefs)
     LiveBeforeMI.insert(Reg);
 
-  SmallVector<std::pair<unsigned, const MachineOperand*>, 4> Clobbers;
+  SmallVector<std::pair<MCPhysReg, const MachineOperand*>, 4> Clobbers;
   Redefs.stepForward(MI, Clobbers);
 
   // Now add the implicit uses for each of the clobbered values.
@@ -1740,7 +1740,7 @@ bool IfConverter::IfConvertDiamondCommon(
 
   if (MRI->tracksLiveness()) {
     for (const MachineInstr &MI : make_range(MBB1.begin(), DI1)) {
-      SmallVector<std::pair<unsigned, const MachineOperand*>, 4> Dummy;
+      SmallVector<std::pair<MCPhysReg, const MachineOperand*>, 4> Dummy;
       Redefs.stepForward(MI, Dummy);
     }
   }
@@ -1806,13 +1806,13 @@ bool IfConverter::IfConvertDiamondCommon(
   // generate:
   //   sub    r0, r1, #1
   //   addne  r0, r1, #1
-  SmallSet<unsigned, 4> RedefsByFalse;
-  SmallSet<unsigned, 4> ExtUses;
+  SmallSet<MCPhysReg, 4> RedefsByFalse;
+  SmallSet<MCPhysReg, 4> ExtUses;
   if (TII->isProfitableToUnpredicate(MBB1, MBB2)) {
     for (const MachineInstr &FI : make_range(MBB2.begin(), DI2)) {
       if (FI.isDebugInstr())
         continue;
-      SmallVector<unsigned, 4> Defs;
+      SmallVector<MCPhysReg, 4> Defs;
       for (const MachineOperand &MO : FI.operands()) {
         if (!MO.isReg())
           continue;
@@ -1830,7 +1830,7 @@ bool IfConverter::IfConvertDiamondCommon(
         }
       }
 
-      for (unsigned Reg : Defs) {
+      for (MCPhysReg Reg : Defs) {
         if (!ExtUses.count(Reg)) {
           for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
                SubRegs.isValid(); ++SubRegs)
@@ -1976,7 +1976,7 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind,
 }
 
 static bool MaySpeculate(const MachineInstr &MI,
-                         SmallSet<unsigned, 4> &LaterRedefs) {
+                         SmallSet<MCPhysReg, 4> &LaterRedefs) {
   bool SawStore = true;
   if (!MI.isSafeToMove(nullptr, SawStore))
     return false;
@@ -1999,7 +1999,7 @@ static bool MaySpeculate(const MachineInstr &MI,
 void IfConverter::PredicateBlock(BBInfo &BBI,
                                  MachineBasicBlock::iterator E,
                                  SmallVectorImpl<MachineOperand> &Cond,
-                                 SmallSet<unsigned, 4> *LaterRedefs) {
+                                 SmallSet<MCPhysReg, 4> *LaterRedefs) {
   bool AnyUnpred = false;
   bool MaySpec = LaterRedefs != nullptr;
   for (MachineInstr &I : make_range(BBI.BB->begin(), E)) {
diff --git a/lib/CodeGen/LivePhysRegs.cpp b/lib/CodeGen/LivePhysRegs.cpp
index 86c6c8e29f9..619643acb6d 100644
--- a/lib/CodeGen/LivePhysRegs.cpp
+++ b/lib/CodeGen/LivePhysRegs.cpp
@@ -29,8 +29,8 @@ using namespace llvm;
 /// The clobbers set will be the list of live registers clobbered
 /// by the regmask.
 void LivePhysRegs::removeRegsInMask(const MachineOperand &MO,
-        SmallVectorImpl<std::pair<unsigned, const MachineOperand*>> *Clobbers) {
-  SparseSet<unsigned>::iterator LRI = LiveRegs.begin();
+    SmallVectorImpl<std::pair<MCPhysReg, const MachineOperand*>> *Clobbers) {
+  RegisterSet::iterator LRI = LiveRegs.begin();
   while (LRI != LiveRegs.end()) {
     if (MO.clobbersPhysReg(*LRI)) {
       if (Clobbers)
@@ -83,7 +83,7 @@ void LivePhysRegs::stepBackward(const MachineInstr &MI) {
 /// on accurate kill flags. If possible use stepBackward() instead of this
 /// function.
 void LivePhysRegs::stepForward(const MachineInstr &MI,
-        SmallVectorImpl<std::pair<unsigned, const MachineOperand*>> &Clobbers) {
+    SmallVectorImpl<std::pair<MCPhysReg, const MachineOperand*>> &Clobbers) {
   // Remove killed registers from the set.
   for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
     if (O->isReg() && !O->isDebug()) {
@@ -142,7 +142,7 @@ LLVM_DUMP_METHOD void LivePhysRegs::dump() const {
 #endif
 
 bool LivePhysRegs::available(const MachineRegisterInfo &MRI,
-                             unsigned Reg) const {
+                             MCPhysReg Reg) const {
   if (LiveRegs.count(Reg))
     return false;
   if (MRI.isReserved(Reg))
@@ -157,7 +157,7 @@ bool LivePhysRegs::available(const MachineRegisterInfo &MRI,
 /// Add live-in registers of basic block \p MBB to \p LiveRegs.
 void LivePhysRegs::addBlockLiveIns(const MachineBasicBlock &MBB) {
   for (const auto &LI : MBB.liveins()) {
-    unsigned Reg = LI.PhysReg;
+    MCPhysReg Reg = LI.PhysReg;
     LaneBitmask Mask = LI.LaneMask;
     MCSubRegIndexIterator S(Reg, TRI);
     assert(Mask.any() && "Invalid livein mask");
diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp
index a2598244dab..2f3e18c99c5 100644
--- a/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -1708,7 +1708,7 @@ bool HexagonFrameLowering::expandStoreVec2(MachineBasicBlock &B,
   // register that is entirely undefined.
   LivePhysRegs LPR(HRI);
   LPR.addLiveIns(B);
-  SmallVector<std::pair<unsigned, const MachineOperand*>,2> Clobbers;
+  SmallVector<std::pair<MCPhysReg, const MachineOperand*>,2> Clobbers;
   for (auto R = B.begin(); R != It; ++R) {
     Clobbers.clear();
     LPR.stepForward(*R, Clobbers);
diff --git a/lib/Target/PowerPC/PPCExpandISEL.cpp b/lib/Target/PowerPC/PPCExpandISEL.cpp
index fe41e1b36a5..a03e691ef5b 100644
--- a/lib/Target/PowerPC/PPCExpandISEL.cpp
+++ b/lib/Target/PowerPC/PPCExpandISEL.cpp
@@ -392,7 +392,7 @@ void PPCExpandISEL::reorganizeBlockLayout(BlockISELList &BIL,
     // liveness state at the end of MBB (liveOut of MBB) as the liveIn for
     // NewSuccessor. Otherwise, will cause cyclic dependence.
     LivePhysRegs LPR(*MF->getSubtarget<PPCSubtarget>().getRegisterInfo());
-    SmallVector<std::pair<unsigned, const MachineOperand *>, 2> Clobbers;
+    SmallVector<std::pair<MCPhysReg, const MachineOperand *>, 2> Clobbers;
     for (MachineInstr &MI : *MBB)
       LPR.stepForward(MI, Clobbers);
     for (auto &LI : LPR)
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index ae45301f04b..fe26389050c 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -2550,7 +2550,7 @@ void X86InstrInfo::replaceBranchWithTailCall(
   // call. This way they still appear live across the call.
   LivePhysRegs LiveRegs(getRegisterInfo());
   LiveRegs.addLiveOuts(MBB);
-  SmallVector<std::pair<unsigned, const MachineOperand *>, 8> Clobbers;
+  SmallVector<std::pair<MCPhysReg, const MachineOperand *>, 8> Clobbers;
   LiveRegs.stepForward(*MIB, Clobbers);
   for (const auto &C : Clobbers) {
     MIB.addReg(C.first, RegState::Implicit);
-- 
GitLab


From b109c4206afa553e8d25a56d6d851f612e57e3e7 Mon Sep 17 00:00:00 2001
From: Vedant Kumar <vsk@apple.com>
Date: Tue, 6 Nov 2018 19:05:53 +0000
Subject: [PATCH 1025/1116] [CodeExtractor] Erase use-without-def debug
 intrinsics in parent func

When CodeExtractor moves instructions to a new function, debug
intrinsics referring to those instructions within the parent function
become invalid.

This results in the same verifier failure which motivated r344545, about
function-local metadata being used in the wrong function.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346255 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Utils/CodeExtractor.cpp        |  9 ++++
 .../delete-use-without-def-dbg-val.ll         | 53 +++++++++++++++++++
 2 files changed, 62 insertions(+)
 create mode 100644 test/Transforms/HotColdSplit/delete-use-without-def-dbg-val.ll

diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp
index 462dc588cd5..4e48910b03c 100644
--- a/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/lib/Transforms/Utils/CodeExtractor.cpp
@@ -57,6 +57,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include <cassert>
 #include <cstdint>
 #include <iterator>
@@ -1305,12 +1306,20 @@ Function *CodeExtractor::extractCodeRegion() {
   // for the new function.
   for (BasicBlock &BB : *newFunction) {
     auto BlockIt = BB.begin();
+    // Remove debug info intrinsics from the new function.
     while (BlockIt != BB.end()) {
       Instruction *Inst = &*BlockIt;
       ++BlockIt;
       if (isa<DbgInfoIntrinsic>(Inst))
         Inst->eraseFromParent();
     }
+    // Remove debug info intrinsics which refer to values in the new function
+    // from the old function.
+    SmallVector<DbgVariableIntrinsic *, 4> DbgUsers;
+    for (Instruction &I : BB)
+      findDbgUsers(DbgUsers, &I);
+    for (DbgVariableIntrinsic *DVI : DbgUsers)
+      DVI->eraseFromParent();
   }
 
   LLVM_DEBUG(if (verifyFunction(*newFunction))
diff --git a/test/Transforms/HotColdSplit/delete-use-without-def-dbg-val.ll b/test/Transforms/HotColdSplit/delete-use-without-def-dbg-val.ll
new file mode 100644
index 00000000000..878db486380
--- /dev/null
+++ b/test/Transforms/HotColdSplit/delete-use-without-def-dbg-val.ll
@@ -0,0 +1,53 @@
+; RUN: opt -hotcoldsplit -S < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+; CHECK-LABEL: define {{.*}}@foo(
+; CHECK-NOT: call {{.*}}llvm.dbg.value
+
+; CHECK-LABEL: define {{.*}}@foo.cold
+; CHECK-NOT: call {{.*}}llvm.dbg.value
+
+define void @foo() !dbg !6 {
+entry:
+  br i1 undef, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  br label %cleanup
+
+if.end:                                           ; preds = %entry
+  ; We expect this block to be outlined. That kills the definition of %var.
+  %var = add i32 0, 0, !dbg !11
+  call void @sink()
+  call void @sink()
+  call void @sink()
+  br label %cleanup
+
+cleanup:
+  ; This dbg.value should be deleted after outlining, otherwise the verifier
+  ; complains about function-local metadata being used outside of a function.
+  call void @llvm.dbg.value(metadata i32 %var, metadata !9, metadata !DIExpression()), !dbg !11
+  ret void
+}
+
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+declare void @sink() cold
+
+!llvm.dbg.cu = !{!0}
+!llvm.debugify = !{!3, !4}
+!llvm.module.flags = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "<stdin>", directory: "/")
+!2 = !{}
+!3 = !{i32 7}
+!4 = !{i32 1}
+!5 = !{i32 2, !"Debug Info Version", i32 3}
+!6 = distinct !DISubprogram(name: "foo", linkageName: "foo", scope: null, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, isOptimized: true, unit: !0, retainedNodes: !8)
+!7 = !DISubroutineType(types: !2)
+!8 = !{!9}
+!9 = !DILocalVariable(name: "1", scope: !6, file: !1, line: 1, type: !10)
+!10 = !DIBasicType(name: "ty32", size: 32, encoding: DW_ATE_unsigned)
+!11 = !DILocation(line: 1, column: 1, scope: !6)
-- 
GitLab


From 2b755762149521d3d0adb21c6a8a59c91238f6d7 Mon Sep 17 00:00:00 2001
From: Vedant Kumar <vsk@apple.com>
Date: Tue, 6 Nov 2018 19:06:08 +0000
Subject: [PATCH 1026/1116] [CodeExtractor] Do not extract calls to
 eh_typeid_for (PR39545)

The lowering for a call to eh_typeid_for changes when it's moved from
one function to another.

There are several proposals for fixing this issue in llvm.org/PR39545.
Until some solution is in place, do not allow CodeExtractor to extract
calls to eh_typeid_for, as that results in serious miscompilations.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346256 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Utils/CodeExtractor.cpp        | 14 +++++++---
 test/Transforms/HotColdSplit/eh-typeid-for.ll | 26 +++++++++++++++++++
 2 files changed, 37 insertions(+), 3 deletions(-)
 create mode 100644 test/Transforms/HotColdSplit/eh-typeid-for.ll

diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp
index 4e48910b03c..419e1db08bf 100644
--- a/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/lib/Transforms/Utils/CodeExtractor.cpp
@@ -168,14 +168,22 @@ static bool isBlockValidForExtraction(const BasicBlock &BB,
       continue;
     }
 
-    if (const CallInst *CI = dyn_cast<CallInst>(I))
-      if (const Function *F = CI->getCalledFunction())
-        if (F->getIntrinsicID() == Intrinsic::vastart) {
+    if (const CallInst *CI = dyn_cast<CallInst>(I)) {
+      if (const Function *F = CI->getCalledFunction()) {
+        auto IID = F->getIntrinsicID();
+        if (IID == Intrinsic::vastart) {
           if (AllowVarArgs)
             continue;
           else
             return false;
         }
+
+        // Currently, we miscompile outlined copies of eh_typid_for. There are
+        // proposals for fixing this in llvm.org/PR39545.
+        if (IID == Intrinsic::eh_typeid_for)
+          return false;
+      }
+    }
   }
 
   return true;
diff --git a/test/Transforms/HotColdSplit/eh-typeid-for.ll b/test/Transforms/HotColdSplit/eh-typeid-for.ll
new file mode 100644
index 00000000000..75f9e672332
--- /dev/null
+++ b/test/Transforms/HotColdSplit/eh-typeid-for.ll
@@ -0,0 +1,26 @@
+; RUN: opt -hotcoldsplit -S < %s | FileCheck %s
+
+; Do not outline calls to @llvm.eh.typeid.for. See llvm.org/PR39545.
+
+@_ZTIi = external constant i8*
+
+; CHECK-LABEL: @fun
+; CHECK-NOT: call {{.*}}@fun.cold.1
+define void @fun() {
+entry:
+  br i1 undef, label %if.then, label %if.else
+
+if.then:
+  ret void
+
+if.else:
+  %t = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*))
+  call void @sink()
+  call void @sink()
+  call void @sink()
+  ret void
+}
+
+declare void @sink() cold
+
+declare i32 @llvm.eh.typeid.for(i8*)
-- 
GitLab


From d1583a44982955b8b214accf8f63359669c6b17d Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Tue, 6 Nov 2018 19:24:21 +0000
Subject: [PATCH 1027/1116] [X86] Add custom promotion of v2i8/v2i16 fp_to_sint
 to avoid over promotion to v2i64 which would force scalarization.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346259 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp |  25 +-
 test/CodeGen/X86/vec_cast3.ll      |  96 +------
 test/CodeGen/X86/vec_fp_to_int.ll  | 442 +++++++++++++++++++++++++++++
 3 files changed, 474 insertions(+), 89 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 21b835ec5ba..38d3a30cb19 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -882,6 +882,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 
     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
     setOperationAction(ISD::FP_TO_SINT,         MVT::v2i32, Custom);
+    setOperationAction(ISD::FP_TO_SINT,         MVT::v2i16, Custom);
+    // Custom legalize these to avoid over promotion.
+    setOperationAction(ISD::FP_TO_SINT,         MVT::v2i8,  Custom);
+    setOperationAction(ISD::FP_TO_UINT,         MVT::v2i16, Custom);
+    setOperationAction(ISD::FP_TO_UINT,         MVT::v2i8,  Custom);
 
     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
     setOperationAction(ISD::SINT_TO_FP,         MVT::v2i32, Custom);
@@ -26025,6 +26030,24 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     SDValue Src = N->getOperand(0);
     EVT SrcVT = Src.getValueType();
 
+    // Promote these manually to avoid over promotion to v2i64. Type
+    // legalization will revisit the v2i32 operation for more cleanup.
+    if ((VT == MVT::v2i8 || VT == MVT::v2i16) &&
+        getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) {
+      // AVX512DQ provides instructions that produce a v2i64 result.
+      if (Subtarget.hasDQI())
+        return;
+
+      SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v2i32, Src);
+      Res = DAG.getNode(N->getOpcode() == ISD::FP_TO_UINT ? ISD::AssertZext
+                                                          : ISD::AssertSext,
+                        dl, MVT::v2i32, Res,
+                        DAG.getValueType(VT.getVectorElementType()));
+      Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+      Results.push_back(Res);
+      return;
+    }
+
     if (VT == MVT::v2i32) {
       assert((IsSigned || Subtarget.hasAVX512()) &&
              "Can only handle signed conversion without AVX512");
@@ -26051,7 +26074,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
         return;
       }
       if (SrcVT == MVT::v2f32 &&
-          getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector) {
+          getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) {
         SDValue Idx = DAG.getIntPtrConstant(0, dl);
         SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
                                   DAG.getUNDEF(MVT::v2f32));
diff --git a/test/CodeGen/X86/vec_cast3.ll b/test/CodeGen/X86/vec_cast3.ll
index e0cc4f3e396..e8662b8cc34 100644
--- a/test/CodeGen/X86/vec_cast3.ll
+++ b/test/CodeGen/X86/vec_cast3.ll
@@ -111,19 +111,8 @@ define <2 x float> @cvt_v2u32_v2f32(<2 x i32> %src) {
 define <2 x i8> @cvt_v2f32_v2i8(<2 x float> %src) {
 ; CHECK-LABEL: cvt_v2f32_v2i8:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    subl $68, %esp
-; CHECK-NEXT:    .cfi_def_cfa_offset 72
-; CHECK-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
-; CHECK-NEXT:    fisttpll {{[0-9]+}}(%esp)
-; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
-; CHECK-NEXT:    fisttpll (%esp)
-; CHECK-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; CHECK-NEXT:    vpinsrd $2, (%esp), %xmm0, %xmm0
-; CHECK-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; CHECK-NEXT:    addl $68, %esp
+; CHECK-NEXT:    vcvttps2dq %xmm0, %xmm0
+; CHECK-NEXT:    vpmovsxdq %xmm0, %xmm0
 ; CHECK-NEXT:    retl
 ;
 ; CHECK-WIDE-LABEL: cvt_v2f32_v2i8:
@@ -141,19 +130,8 @@ define <2 x i8> @cvt_v2f32_v2i8(<2 x float> %src) {
 define <2 x i16> @cvt_v2f32_v2i16(<2 x float> %src) {
 ; CHECK-LABEL: cvt_v2f32_v2i16:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    subl $68, %esp
-; CHECK-NEXT:    .cfi_def_cfa_offset 72
-; CHECK-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
-; CHECK-NEXT:    fisttpll {{[0-9]+}}(%esp)
-; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
-; CHECK-NEXT:    fisttpll (%esp)
-; CHECK-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; CHECK-NEXT:    vpinsrd $2, (%esp), %xmm0, %xmm0
-; CHECK-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; CHECK-NEXT:    addl $68, %esp
+; CHECK-NEXT:    vcvttps2dq %xmm0, %xmm0
+; CHECK-NEXT:    vpmovsxdq %xmm0, %xmm0
 ; CHECK-NEXT:    retl
 ;
 ; CHECK-WIDE-LABEL: cvt_v2f32_v2i16:
@@ -186,37 +164,8 @@ define <2 x i32> @cvt_v2f32_v2i32(<2 x float> %src) {
 define <2 x i8> @cvt_v2f32_v2u8(<2 x float> %src) {
 ; CHECK-LABEL: cvt_v2f32_v2u8:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    subl $68, %esp
-; CHECK-NEXT:    .cfi_def_cfa_offset 72
-; CHECK-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vcmpltss %xmm2, %xmm1, %xmm3
-; CHECK-NEXT:    vsubss %xmm2, %xmm1, %xmm4
-; CHECK-NEXT:    vblendvps %xmm3, %xmm1, %xmm4, %xmm3
-; CHECK-NEXT:    vmovss %xmm3, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    vcmpltss %xmm2, %xmm0, %xmm3
-; CHECK-NEXT:    vsubss %xmm2, %xmm0, %xmm4
-; CHECK-NEXT:    vblendvps %xmm3, %xmm0, %xmm4, %xmm3
-; CHECK-NEXT:    vmovss %xmm3, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
-; CHECK-NEXT:    fisttpll (%esp)
-; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
-; CHECK-NEXT:    fisttpll {{[0-9]+}}(%esp)
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    vucomiss %xmm2, %xmm1
-; CHECK-NEXT:    setae %al
-; CHECK-NEXT:    shll $31, %eax
-; CHECK-NEXT:    xorl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    xorl %ecx, %ecx
-; CHECK-NEXT:    vucomiss %xmm2, %xmm0
-; CHECK-NEXT:    setae %cl
-; CHECK-NEXT:    shll $31, %ecx
-; CHECK-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
-; CHECK-NEXT:    vpinsrd $2, (%esp), %xmm0, %xmm0
-; CHECK-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    addl $68, %esp
+; CHECK-NEXT:    vcvttps2dq %xmm0, %xmm0
+; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; CHECK-NEXT:    retl
 ;
 ; CHECK-WIDE-LABEL: cvt_v2f32_v2u8:
@@ -234,37 +183,8 @@ define <2 x i8> @cvt_v2f32_v2u8(<2 x float> %src) {
 define <2 x i16> @cvt_v2f32_v2u16(<2 x float> %src) {
 ; CHECK-LABEL: cvt_v2f32_v2u16:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    subl $68, %esp
-; CHECK-NEXT:    .cfi_def_cfa_offset 72
-; CHECK-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vcmpltss %xmm2, %xmm1, %xmm3
-; CHECK-NEXT:    vsubss %xmm2, %xmm1, %xmm4
-; CHECK-NEXT:    vblendvps %xmm3, %xmm1, %xmm4, %xmm3
-; CHECK-NEXT:    vmovss %xmm3, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    vcmpltss %xmm2, %xmm0, %xmm3
-; CHECK-NEXT:    vsubss %xmm2, %xmm0, %xmm4
-; CHECK-NEXT:    vblendvps %xmm3, %xmm0, %xmm4, %xmm3
-; CHECK-NEXT:    vmovss %xmm3, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
-; CHECK-NEXT:    fisttpll (%esp)
-; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
-; CHECK-NEXT:    fisttpll {{[0-9]+}}(%esp)
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    vucomiss %xmm2, %xmm1
-; CHECK-NEXT:    setae %al
-; CHECK-NEXT:    shll $31, %eax
-; CHECK-NEXT:    xorl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    xorl %ecx, %ecx
-; CHECK-NEXT:    vucomiss %xmm2, %xmm0
-; CHECK-NEXT:    setae %cl
-; CHECK-NEXT:    shll $31, %ecx
-; CHECK-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
-; CHECK-NEXT:    vpinsrd $2, (%esp), %xmm0, %xmm0
-; CHECK-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    addl $68, %esp
+; CHECK-NEXT:    vcvttps2dq %xmm0, %xmm0
+; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; CHECK-NEXT:    retl
 ;
 ; CHECK-WIDE-LABEL: cvt_v2f32_v2u16:
diff --git a/test/CodeGen/X86/vec_fp_to_int.ll b/test/CodeGen/X86/vec_fp_to_int.ll
index e80abc91cd1..651c0e65aa0 100644
--- a/test/CodeGen/X86/vec_fp_to_int.ll
+++ b/test/CodeGen/X86/vec_fp_to_int.ll
@@ -2866,3 +2866,445 @@ define <4 x i32> @fptosi_2f128_to_4i32(<2 x fp128> %a) nounwind {
   %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x i32> %ext
 }
+
+define <2 x i8> @fptosi_2f32_to_2i8(<2 x float> %a) {
+; SSE-LABEL: fptosi_2f32_to_2i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrad $31, %xmm1
+; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT:    retq
+;
+; VEX-LABEL: fptosi_2f32_to_2i8:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vcvttps2dq %xmm0, %xmm0
+; VEX-NEXT:    vpmovsxdq %xmm0, %xmm0
+; VEX-NEXT:    retq
+;
+; AVX512F-LABEL: fptosi_2f32_to_2i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vcvttps2dq %xmm0, %xmm0
+; AVX512F-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: fptosi_2f32_to_2i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vcvttps2dq %xmm0, %xmm0
+; AVX512VL-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptosi_2f32_to_2i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512DQ-NEXT:    vcvttps2qq %ymm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: fptosi_2f32_to_2i8:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvttps2qq %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    retq
+;
+; WIDEN-LABEL: fptosi_2f32_to_2i8:
+; WIDEN:       # %bb.0:
+; WIDEN-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; WIDEN-NEXT:    vcvttps2dq %zmm0, %zmm0
+; WIDEN-NEXT:    vpmovdb %zmm0, %xmm0
+; WIDEN-NEXT:    vzeroupper
+; WIDEN-NEXT:    retq
+  %cvt = fptosi <2 x float> %a to <2 x i8>
+  ret <2 x i8> %cvt
+}
+
+define <2 x i16> @fptosi_2f32_to_2i16(<2 x float> %a) {
+; SSE-LABEL: fptosi_2f32_to_2i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrad $31, %xmm1
+; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT:    retq
+;
+; VEX-LABEL: fptosi_2f32_to_2i16:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vcvttps2dq %xmm0, %xmm0
+; VEX-NEXT:    vpmovsxdq %xmm0, %xmm0
+; VEX-NEXT:    retq
+;
+; AVX512F-LABEL: fptosi_2f32_to_2i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vcvttps2dq %xmm0, %xmm0
+; AVX512F-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: fptosi_2f32_to_2i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vcvttps2dq %xmm0, %xmm0
+; AVX512VL-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptosi_2f32_to_2i16:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512DQ-NEXT:    vcvttps2qq %ymm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: fptosi_2f32_to_2i16:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvttps2qq %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    retq
+;
+; WIDEN_SKX-LABEL: fptosi_2f32_to_2i16:
+; WIDEN_SKX:       # %bb.0:
+; WIDEN_SKX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; WIDEN_SKX-NEXT:    vcvttps2dq %ymm0, %ymm0
+; WIDEN_SKX-NEXT:    vpmovdw %ymm0, %xmm0
+; WIDEN_SKX-NEXT:    vzeroupper
+; WIDEN_SKX-NEXT:    retq
+;
+; WIDEN_KNL-LABEL: fptosi_2f32_to_2i16:
+; WIDEN_KNL:       # %bb.0:
+; WIDEN_KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; WIDEN_KNL-NEXT:    vcvttps2dq %ymm0, %ymm0
+; WIDEN_KNL-NEXT:    vpmovdw %zmm0, %ymm0
+; WIDEN_KNL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; WIDEN_KNL-NEXT:    vzeroupper
+; WIDEN_KNL-NEXT:    retq
+  %cvt = fptosi <2 x float> %a to <2 x i16>
+  ret <2 x i16> %cvt
+}
+
+define <2 x i8> @fptoui_2f32_to_2i8(<2 x float> %a) {
+; SSE-LABEL: fptoui_2f32_to_2i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
+; SSE-NEXT:    xorps %xmm1, %xmm1
+; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT:    retq
+;
+; VEX-LABEL: fptoui_2f32_to_2i8:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vcvttps2dq %xmm0, %xmm0
+; VEX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; VEX-NEXT:    retq
+;
+; AVX512F-LABEL: fptoui_2f32_to_2i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vcvttps2dq %xmm0, %xmm0
+; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: fptoui_2f32_to_2i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vcvttps2dq %xmm0, %xmm0
+; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptoui_2f32_to_2i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512DQ-NEXT:    vcvttps2uqq %ymm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: fptoui_2f32_to_2i8:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvttps2uqq %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    retq
+;
+; WIDEN-LABEL: fptoui_2f32_to_2i8:
+; WIDEN:       # %bb.0:
+; WIDEN-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; WIDEN-NEXT:    vcvttps2dq %zmm0, %zmm0
+; WIDEN-NEXT:    vpmovdb %zmm0, %xmm0
+; WIDEN-NEXT:    vzeroupper
+; WIDEN-NEXT:    retq
+  %cvt = fptoui <2 x float> %a to <2 x i8>
+  ret <2 x i8> %cvt
+}
+
+define <2 x i16> @fptoui_2f32_to_2i16(<2 x float> %a) {
+; SSE-LABEL: fptoui_2f32_to_2i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
+; SSE-NEXT:    xorps %xmm1, %xmm1
+; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT:    retq
+;
+; VEX-LABEL: fptoui_2f32_to_2i16:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vcvttps2dq %xmm0, %xmm0
+; VEX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; VEX-NEXT:    retq
+;
+; AVX512F-LABEL: fptoui_2f32_to_2i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vcvttps2dq %xmm0, %xmm0
+; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: fptoui_2f32_to_2i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vcvttps2dq %xmm0, %xmm0
+; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptoui_2f32_to_2i16:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512DQ-NEXT:    vcvttps2uqq %ymm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: fptoui_2f32_to_2i16:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvttps2uqq %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    retq
+;
+; WIDEN_SKX-LABEL: fptoui_2f32_to_2i16:
+; WIDEN_SKX:       # %bb.0:
+; WIDEN_SKX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; WIDEN_SKX-NEXT:    vcvttps2dq %ymm0, %ymm0
+; WIDEN_SKX-NEXT:    vpmovdw %ymm0, %xmm0
+; WIDEN_SKX-NEXT:    vzeroupper
+; WIDEN_SKX-NEXT:    retq
+;
+; WIDEN_KNL-LABEL: fptoui_2f32_to_2i16:
+; WIDEN_KNL:       # %bb.0:
+; WIDEN_KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; WIDEN_KNL-NEXT:    vcvttps2dq %ymm0, %ymm0
+; WIDEN_KNL-NEXT:    vpmovdw %zmm0, %ymm0
+; WIDEN_KNL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; WIDEN_KNL-NEXT:    vzeroupper
+; WIDEN_KNL-NEXT:    retq
+  %cvt = fptoui <2 x float> %a to <2 x i16>
+  ret <2 x i16> %cvt
+}
+
+define <2 x i8> @fptosi_2f64_to_2i8(<2 x double> %a) {
+; SSE-LABEL: fptosi_2f64_to_2i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttpd2dq %xmm0, %xmm0
+; SSE-NEXT:    movapd %xmm0, %xmm1
+; SSE-NEXT:    psrad $31, %xmm1
+; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT:    retq
+;
+; VEX-LABEL: fptosi_2f64_to_2i8:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vcvttpd2dq %xmm0, %xmm0
+; VEX-NEXT:    vpmovsxdq %xmm0, %xmm0
+; VEX-NEXT:    retq
+;
+; AVX512F-LABEL: fptosi_2f64_to_2i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vcvttpd2dq %xmm0, %xmm0
+; AVX512F-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: fptosi_2f64_to_2i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vcvttpd2dq %xmm0, %xmm0
+; AVX512VL-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptosi_2f64_to_2i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512DQ-NEXT:    vcvttpd2qq %zmm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: fptosi_2f64_to_2i8:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvttpd2qq %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    retq
+;
+; WIDEN-LABEL: fptosi_2f64_to_2i8:
+; WIDEN:       # %bb.0:
+; WIDEN-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; WIDEN-NEXT:    vcvttsd2si %xmm1, %eax
+; WIDEN-NEXT:    vcvttsd2si %xmm0, %ecx
+; WIDEN-NEXT:    vmovd %ecx, %xmm0
+; WIDEN-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
+; WIDEN-NEXT:    retq
+  %cvt = fptosi <2 x double> %a to <2 x i8>
+  ret <2 x i8> %cvt
+}
+
+define <2 x i16> @fptosi_2f64_to_2i16(<2 x double> %a) {
+; SSE-LABEL: fptosi_2f64_to_2i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttpd2dq %xmm0, %xmm0
+; SSE-NEXT:    movapd %xmm0, %xmm1
+; SSE-NEXT:    psrad $31, %xmm1
+; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT:    retq
+;
+; VEX-LABEL: fptosi_2f64_to_2i16:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vcvttpd2dq %xmm0, %xmm0
+; VEX-NEXT:    vpmovsxdq %xmm0, %xmm0
+; VEX-NEXT:    retq
+;
+; AVX512F-LABEL: fptosi_2f64_to_2i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vcvttpd2dq %xmm0, %xmm0
+; AVX512F-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: fptosi_2f64_to_2i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vcvttpd2dq %xmm0, %xmm0
+; AVX512VL-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptosi_2f64_to_2i16:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512DQ-NEXT:    vcvttpd2qq %zmm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: fptosi_2f64_to_2i16:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvttpd2qq %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    retq
+;
+; WIDEN_SKX-LABEL: fptosi_2f64_to_2i16:
+; WIDEN_SKX:       # %bb.0:
+; WIDEN_SKX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; WIDEN_SKX-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; WIDEN_SKX-NEXT:    vpmovdw %ymm0, %xmm0
+; WIDEN_SKX-NEXT:    vzeroupper
+; WIDEN_SKX-NEXT:    retq
+;
+; WIDEN_KNL-LABEL: fptosi_2f64_to_2i16:
+; WIDEN_KNL:       # %bb.0:
+; WIDEN_KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; WIDEN_KNL-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; WIDEN_KNL-NEXT:    vpmovdw %zmm0, %ymm0
+; WIDEN_KNL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; WIDEN_KNL-NEXT:    vzeroupper
+; WIDEN_KNL-NEXT:    retq
+  %cvt = fptosi <2 x double> %a to <2 x i16>
+  ret <2 x i16> %cvt
+}
+
+define <2 x i8> @fptoui_2f64_to_2i8(<2 x double> %a) {
+; SSE-LABEL: fptoui_2f64_to_2i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttpd2dq %xmm0, %xmm0
+; SSE-NEXT:    xorpd %xmm1, %xmm1
+; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT:    retq
+;
+; VEX-LABEL: fptoui_2f64_to_2i8:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vcvttpd2dq %xmm0, %xmm0
+; VEX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; VEX-NEXT:    retq
+;
+; AVX512F-LABEL: fptoui_2f64_to_2i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vcvttpd2dq %xmm0, %xmm0
+; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: fptoui_2f64_to_2i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vcvttpd2dq %xmm0, %xmm0
+; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptoui_2f64_to_2i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512DQ-NEXT:    vcvttpd2uqq %zmm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: fptoui_2f64_to_2i8:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvttpd2uqq %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    retq
+;
+; WIDEN-LABEL: fptoui_2f64_to_2i8:
+; WIDEN:       # %bb.0:
+; WIDEN-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; WIDEN-NEXT:    vcvttsd2si %xmm1, %eax
+; WIDEN-NEXT:    vcvttsd2si %xmm0, %ecx
+; WIDEN-NEXT:    vmovd %ecx, %xmm0
+; WIDEN-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
+; WIDEN-NEXT:    retq
+  %cvt = fptoui <2 x double> %a to <2 x i8>
+  ret <2 x i8> %cvt
+}
+
+define <2 x i16> @fptoui_2f64_to_2i16(<2 x double> %a) {
+; SSE-LABEL: fptoui_2f64_to_2i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttpd2dq %xmm0, %xmm0
+; SSE-NEXT:    xorpd %xmm1, %xmm1
+; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT:    retq
+;
+; VEX-LABEL: fptoui_2f64_to_2i16:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vcvttpd2dq %xmm0, %xmm0
+; VEX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; VEX-NEXT:    retq
+;
+; AVX512F-LABEL: fptoui_2f64_to_2i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vcvttpd2dq %xmm0, %xmm0
+; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: fptoui_2f64_to_2i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vcvttpd2dq %xmm0, %xmm0
+; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptoui_2f64_to_2i16:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512DQ-NEXT:    vcvttpd2uqq %zmm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: fptoui_2f64_to_2i16:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvttpd2uqq %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    retq
+;
+; WIDEN_SKX-LABEL: fptoui_2f64_to_2i16:
+; WIDEN_SKX:       # %bb.0:
+; WIDEN_SKX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; WIDEN_SKX-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; WIDEN_SKX-NEXT:    vpmovdw %ymm0, %xmm0
+; WIDEN_SKX-NEXT:    vzeroupper
+; WIDEN_SKX-NEXT:    retq
+;
+; WIDEN_KNL-LABEL: fptoui_2f64_to_2i16:
+; WIDEN_KNL:       # %bb.0:
+; WIDEN_KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; WIDEN_KNL-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; WIDEN_KNL-NEXT:    vpmovdw %zmm0, %ymm0
+; WIDEN_KNL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; WIDEN_KNL-NEXT:    vzeroupper
+; WIDEN_KNL-NEXT:    retq
+  %cvt = fptoui <2 x double> %a to <2 x i16>
+  ret <2 x i16> %cvt
+}
-- 
GitLab


From 645cd31982fe7e5eab26700ce4a66696083dc7e7 Mon Sep 17 00:00:00 2001
From: Teresa Johnson <tejohnson@google.com>
Date: Tue, 6 Nov 2018 19:41:35 +0000
Subject: [PATCH 1028/1116] [ThinLTO] Split NotEligibleToImport into legality
 and inlinability flags

Summary:
The NotEligibleToImport flag on the GlobalValueSummary was set if it
isn't legal to import (e.g. because it references unpromotable locals)
and when it can't be inlined (in which case importing is pointless).

I split out the inlinable piece into a separate flag on the
FunctionSummary (doesn't make sense for aliases or global variables),
because in the future we may want to import for reasons other than
inlining.

Reviewers: davidxl

Subscribers: mehdi_amini, inglorion, eraman, steven_wu, dexonsmith, arphaman, llvm-commits

Differential Revision: https://reviews.llvm.org/D53345

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346261 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/IR/ModuleSummaryIndex.h         | 13 ++++++++-----
 include/llvm/Transforms/IPO/FunctionImport.h |  8 ++++++--
 lib/Analysis/ModuleSummaryAnalysis.cpp       | 19 +++++++++----------
 lib/AsmParser/LLLexer.cpp                    |  1 +
 lib/AsmParser/LLParser.cpp                   |  7 +++++++
 lib/AsmParser/LLToken.h                      |  1 +
 lib/Bitcode/Reader/BitcodeReader.cpp         |  1 +
 lib/Bitcode/Writer/BitcodeWriter.cpp         |  1 +
 lib/IR/AsmWriter.cpp                         |  1 +
 lib/IR/ModuleSummaryIndex.cpp                |  5 +++--
 lib/Transforms/IPO/FunctionImport.cpp        | 10 ++++++++++
 test/Assembler/thinlto-summary.ll            |  4 ++--
 test/Bitcode/thinlto-function-summary.ll     |  2 +-
 test/ThinLTO/X86/dot-dumper.ll               |  2 +-
 14 files changed, 52 insertions(+), 23 deletions(-)

diff --git a/include/llvm/IR/ModuleSummaryIndex.h b/include/llvm/IR/ModuleSummaryIndex.h
index 8510afe60a1..9a456acf966 100644
--- a/include/llvm/IR/ModuleSummaryIndex.h
+++ b/include/llvm/IR/ModuleSummaryIndex.h
@@ -478,13 +478,17 @@ public:
         TypeCheckedLoadConstVCalls;
   };
 
-  /// Function attribute flags. Used to track if a function accesses memory,
-  /// recurses or aliases.
+  /// Flags specific to function summaries.
   struct FFlags {
+    // Function attribute flags. Used to track if a function accesses memory,
+    // recurses or aliases.
     unsigned ReadNone : 1;
     unsigned ReadOnly : 1;
     unsigned NoRecurse : 1;
     unsigned ReturnDoesNotAlias : 1;
+
+    // Indicate if the global value cannot be inlined.
+    unsigned NoInline : 1;
   };
 
   /// Create an empty FunctionSummary (with specified call edges).
@@ -511,8 +515,7 @@ private:
   /// during the initial compile step when the summary index is first built.
   unsigned InstCount;
 
-  /// Function attribute flags. Used to track if a function accesses memory,
-  /// recurses or aliases.
+  /// Function summary specific flags.
   FFlags FunFlags;
 
   /// List of <CalleeValueInfo, CalleeInfo> call edge pairs from this function.
@@ -546,7 +549,7 @@ public:
     return GVS->getSummaryKind() == FunctionKind;
   }
 
-  /// Get function attribute flags.
+  /// Get function summary flags.
   FFlags fflags() const { return FunFlags; }
 
   /// Get the instruction count recorded for this function.
diff --git a/include/llvm/Transforms/IPO/FunctionImport.h b/include/llvm/Transforms/IPO/FunctionImport.h
index d427cb809bf..5ad880574ef 100644
--- a/include/llvm/Transforms/IPO/FunctionImport.h
+++ b/include/llvm/Transforms/IPO/FunctionImport.h
@@ -56,10 +56,14 @@ public:
     // to find at least one summary for the GUID that is global or a local
     // in the referenced module for direct calls.
     LocalLinkageNotInModule,
-    // This corresponse to the NotEligibleToImport being set on the summary,
+    // This corresponds to the NotEligibleToImport being set on the summary,
     // which can happen in a few different cases (e.g. local that can't be
     // renamed or promoted because it is referenced on a llvm*.used variable).
-    NotEligible
+    NotEligible,
+    // This corresponds to NoInline being set on the function summary,
+    // which will happen if it is known that the inliner will not be able
+    // to inline the function (e.g. it is marked with a NoInline attribute).
+    NoInline
   };
 
   /// Information optionally tracked for candidates the importer decided
diff --git a/lib/Analysis/ModuleSummaryAnalysis.cpp b/lib/Analysis/ModuleSummaryAnalysis.cpp
index 3eb150becfa..29b96ac746b 100644
--- a/lib/Analysis/ModuleSummaryAnalysis.cpp
+++ b/lib/Analysis/ModuleSummaryAnalysis.cpp
@@ -350,20 +350,18 @@ static void computeFunctionSummary(
 
   bool NonRenamableLocal = isNonRenamableLocal(F);
   bool NotEligibleForImport =
-      NonRenamableLocal || HasInlineAsmMaybeReferencingInternal ||
-      // Inliner doesn't handle variadic functions.
-      // FIXME: refactor this to use the same code that inliner is using.
-      F.isVarArg() ||
-      // Don't try to import functions with noinline attribute.
-      F.getAttributes().hasFnAttribute(Attribute::NoInline);
+      NonRenamableLocal || HasInlineAsmMaybeReferencingInternal;
   GlobalValueSummary::GVFlags Flags(F.getLinkage(), NotEligibleForImport,
                                     /* Live = */ false, F.isDSOLocal());
   FunctionSummary::FFlags FunFlags{
       F.hasFnAttribute(Attribute::ReadNone),
       F.hasFnAttribute(Attribute::ReadOnly),
-      F.hasFnAttribute(Attribute::NoRecurse),
-      F.returnDoesNotAlias(),
-  };
+      F.hasFnAttribute(Attribute::NoRecurse), F.returnDoesNotAlias(),
+      // Inliner doesn't handle variadic functions.
+      // FIXME: refactor this to use the same code that inliner is using.
+      F.isVarArg() ||
+          // Don't try to import functions with noinline attribute.
+          F.getAttributes().hasFnAttribute(Attribute::NoInline)};
   auto FuncSummary = llvm::make_unique<FunctionSummary>(
       Flags, NumInsts, FunFlags, RefEdges.takeVector(),
       CallGraphEdges.takeVector(), TypeTests.takeVector(),
@@ -478,7 +476,8 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex(
                         F->hasFnAttribute(Attribute::ReadNone),
                         F->hasFnAttribute(Attribute::ReadOnly),
                         F->hasFnAttribute(Attribute::NoRecurse),
-                        F->returnDoesNotAlias()},
+                        F->returnDoesNotAlias(),
+                        /* NoInline = */ false},
                     ArrayRef<ValueInfo>{}, ArrayRef<FunctionSummary::EdgeTy>{},
                     ArrayRef<GlobalValue::GUID>{},
                     ArrayRef<FunctionSummary::VFuncId>{},
diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp
index 53787b25d0b..af4f43986ef 100644
--- a/lib/AsmParser/LLLexer.cpp
+++ b/lib/AsmParser/LLLexer.cpp
@@ -740,6 +740,7 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(readOnly);
   KEYWORD(noRecurse);
   KEYWORD(returnDoesNotAlias);
+  KEYWORD(noInline);
   KEYWORD(calls);
   KEYWORD(callee);
   KEYWORD(hotness);
diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp
index e83b9f80592..5fe1e125d48 100644
--- a/lib/AsmParser/LLParser.cpp
+++ b/lib/AsmParser/LLParser.cpp
@@ -7714,6 +7714,7 @@ bool LLParser::ParseFlag(unsigned &Val) {
 ///   := 'funcFlags' ':' '(' ['readNone' ':' Flag]?
 ///        [',' 'readOnly' ':' Flag]? [',' 'noRecurse' ':' Flag]?
 ///        [',' 'returnDoesNotAlias' ':' Flag]? ')'
+///        [',' 'noInline' ':' Flag]? ')'
 bool LLParser::ParseOptionalFFlags(FunctionSummary::FFlags &FFlags) {
   assert(Lex.getKind() == lltok::kw_funcFlags);
   Lex.Lex();
@@ -7749,6 +7750,12 @@ bool LLParser::ParseOptionalFFlags(FunctionSummary::FFlags &FFlags) {
         return true;
       FFlags.ReturnDoesNotAlias = Val;
       break;
+    case lltok::kw_noInline:
+      Lex.Lex();
+      if (ParseToken(lltok::colon, "expected ':'") || ParseFlag(Val))
+        return true;
+      FFlags.NoInline = Val;
+      break;
     default:
       return Error(Lex.getLoc(), "expected function flag type");
     }
diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h
index a3a9930f9e3..f8f5955a16c 100644
--- a/lib/AsmParser/LLToken.h
+++ b/lib/AsmParser/LLToken.h
@@ -369,6 +369,7 @@ enum Kind {
   kw_readOnly,
   kw_noRecurse,
   kw_returnDoesNotAlias,
+  kw_noInline,
   kw_calls,
   kw_callee,
   kw_hotness,
diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp
index aa83955e646..56e05f8f085 100644
--- a/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -876,6 +876,7 @@ static FunctionSummary::FFlags getDecodedFFlags(uint64_t RawFlags) {
   Flags.ReadOnly = (RawFlags >> 1) & 0x1;
   Flags.NoRecurse = (RawFlags >> 2) & 0x1;
   Flags.ReturnDoesNotAlias = (RawFlags >> 3) & 0x1;
+  Flags.NoInline = (RawFlags >> 4) & 0x1;
   return Flags;
 }
 
diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp
index d59da255be4..f4634c9d3f4 100644
--- a/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -971,6 +971,7 @@ static uint64_t getEncodedFFlags(FunctionSummary::FFlags Flags) {
   RawFlags |= (Flags.ReadOnly << 1);
   RawFlags |= (Flags.NoRecurse << 2);
   RawFlags |= (Flags.ReturnDoesNotAlias << 3);
+  RawFlags |= (Flags.NoInline << 4);
   return RawFlags;
 }
 
diff --git a/lib/IR/AsmWriter.cpp b/lib/IR/AsmWriter.cpp
index eb2311da63b..3b575739263 100644
--- a/lib/IR/AsmWriter.cpp
+++ b/lib/IR/AsmWriter.cpp
@@ -2871,6 +2871,7 @@ void AssemblyWriter::printFunctionSummary(const FunctionSummary *FS) {
     Out << ", readOnly: " << FFlags.ReadOnly;
     Out << ", noRecurse: " << FFlags.NoRecurse;
     Out << ", returnDoesNotAlias: " << FFlags.ReturnDoesNotAlias;
+    Out << ", noInline: " << FFlags.NoInline;
     Out << ")";
   }
   if (!FS->calls().empty()) {
diff --git a/lib/IR/ModuleSummaryIndex.cpp b/lib/IR/ModuleSummaryIndex.cpp
index e63407c3e75..8d85f7901b0 100644
--- a/lib/IR/ModuleSummaryIndex.cpp
+++ b/lib/IR/ModuleSummaryIndex.cpp
@@ -182,8 +182,9 @@ static std::string linkageToString(GlobalValue::LinkageTypes LT) {
 
 static std::string fflagsToString(FunctionSummary::FFlags F) {
   auto FlagValue = [](unsigned V) { return V ? '1' : '0'; };
-  char FlagRep[] = {FlagValue(F.ReadNone), FlagValue(F.ReadOnly),
-                    FlagValue(F.NoRecurse), FlagValue(F.ReturnDoesNotAlias), 0};
+  char FlagRep[] = {FlagValue(F.ReadNone),     FlagValue(F.ReadOnly),
+                    FlagValue(F.NoRecurse),    FlagValue(F.ReturnDoesNotAlias),
+                    FlagValue(F.NoInline), 0};
 
   return FlagRep;
 }
diff --git a/lib/Transforms/IPO/FunctionImport.cpp b/lib/Transforms/IPO/FunctionImport.cpp
index 16a3d112b29..31531beea5e 100644
--- a/lib/Transforms/IPO/FunctionImport.cpp
+++ b/lib/Transforms/IPO/FunctionImport.cpp
@@ -237,11 +237,19 @@ selectCallee(const ModuleSummaryIndex &Index,
           return false;
         }
 
+        // Skip if it isn't legal to import (e.g. may reference unpromotable
+        // locals).
         if (Summary->notEligibleToImport()) {
           Reason = FunctionImporter::ImportFailureReason::NotEligible;
           return false;
         }
 
+        // Don't bother importing if we can't inline it anyway.
+        if (Summary->fflags().NoInline) {
+          Reason = FunctionImporter::ImportFailureReason::NoInline;
+          return false;
+        }
+
         return true;
       });
   if (It == CalleeSummaryList.end())
@@ -318,6 +326,8 @@ getFailureName(FunctionImporter::ImportFailureReason Reason) {
     return "LocalLinkageNotInModule";
   case FunctionImporter::ImportFailureReason::NotEligible:
     return "NotEligible";
+  case FunctionImporter::ImportFailureReason::NoInline:
+    return "NoInline";
   }
   llvm_unreachable("invalid reason");
 }
diff --git a/test/Assembler/thinlto-summary.ll b/test/Assembler/thinlto-summary.ll
index 01bf3a8c810..64af835ae2b 100644
--- a/test/Assembler/thinlto-summary.ll
+++ b/test/Assembler/thinlto-summary.ll
@@ -81,8 +81,8 @@
 ; CHECK: ^13 = gv: (guid: 12, summaries: (variable: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0))))
 ; CHECK: ^14 = gv: (guid: 13, summaries: (variable: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 1))))
 ; CHECK: ^15 = gv: (guid: 14, summaries: (function: (module: ^1, flags: (linkage: external, notEligibleToImport: 1, live: 1, dsoLocal: 0), insts: 1)))
-; CHECK: ^16 = gv: (guid: 15, summaries: (function: (module: ^1, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 1, funcFlags: (readNone: 1, readOnly: 0, noRecurse: 1, returnDoesNotAlias: 0))))
-; CHECK: ^17 = gv: (guid: 16, summaries: (function: (module: ^1, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 1, funcFlags: (readNone: 0, readOnly: 1, noRecurse: 0, returnDoesNotAlias: 1), calls: ((callee: ^15)))))
+; CHECK: ^16 = gv: (guid: 15, summaries: (function: (module: ^1, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 1, funcFlags: (readNone: 1, readOnly: 0, noRecurse: 1, returnDoesNotAlias: 0, noInline: 0))))
+; CHECK: ^17 = gv: (guid: 16, summaries: (function: (module: ^1, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 1, funcFlags: (readNone: 0, readOnly: 1, noRecurse: 0, returnDoesNotAlias: 1, noInline: 0), calls: ((callee: ^15)))))
 ; CHECK: ^18 = gv: (guid: 17, summaries: (alias: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 1), aliasee: ^14)))
 ; CHECK: ^19 = gv: (guid: 18, summaries: (function: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 4, typeIdInfo: (typeTests: (^24, ^26)))))
 ; CHECK: ^20 = gv: (guid: 19, summaries: (function: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 8, typeIdInfo: (typeTestAssumeVCalls: (vFuncId: (^27, offset: 16))))))
diff --git a/test/Bitcode/thinlto-function-summary.ll b/test/Bitcode/thinlto-function-summary.ll
index 5922a8b3c4d..7f59eeabd9c 100644
--- a/test/Bitcode/thinlto-function-summary.ll
+++ b/test/Bitcode/thinlto-function-summary.ll
@@ -20,7 +20,7 @@
 ; BC-NEXT: <PERMODULE {{.*}} op0=1 op1=0
 ; BC-NEXT: <PERMODULE {{.*}} op0=2 op1=0
 ; BC-NEXT: <PERMODULE {{.*}} op0=3 op1=7
-; BC-NEXT: <PERMODULE {{.*}} op0=4 op1=16
+; BC-NEXT: <PERMODULE {{.*}} op0=4 op1=0 op2=1 op3=16
 ; BC-NEXT: <ALIAS {{.*}} op0=5 op1=0 op2=3
 ; BC-NEXT: </GLOBALVAL_SUMMARY_BLOCK
 ; BC: <STRTAB_BLOCK
diff --git a/test/ThinLTO/X86/dot-dumper.ll b/test/ThinLTO/X86/dot-dumper.ll
index 25cd0ed617f..72175a1ea4d 100644
--- a/test/ThinLTO/X86/dot-dumper.ll
+++ b/test/ThinLTO/X86/dot-dumper.ll
@@ -34,7 +34,7 @@
 ; CLUSTER1:         // Module: {{.*}}2.bc
 ; CLUSTER1-NEXT:    subgraph cluster_1 {
 ; CLUSTER1-DAG:       M1_[[A:[0-9]+]] [{{.*}}A|extern{{.*}}]; // variable
-; CLUSTER1-DAG:       M1_[[FOO:[0-9]+]] [{{.*}}foo|extern{{.*}}]; // function, not eligible to import
+; CLUSTER1-DAG:       M1_[[FOO:[0-9]+]] [{{.*}}foo|extern{{.*}} ffl: 00001{{.*}}]; // function
 ; CLUSTER1-DAG:       M1_[[B:[0-9]+]] [{{.*}}B|extern{{.*}}]; // variable
 ; CLUSTER1-DAG:       M1_[[BAR:[0-9]+]] [{{.*}}bar|extern{{.*}}]; // function, dead
 ; CLUSTER1-NEXT:      // Edges:
-- 
GitLab


From ffb90c46415a9bc3a2fca57b809b3317fe598bc5 Mon Sep 17 00:00:00 2001
From: Konstantin Zhuravlyov <kzhuravl_dev@outlook.com>
Date: Tue, 6 Nov 2018 20:23:53 +0000
Subject: [PATCH 1029/1116] AMDGPU/Docs: Fix the processor table

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346263 91177308-0d34-0410-b5e6-96231b3b80d8
---
 docs/AMDGPUUsage.rst | 202 +++++++++++++++++++++----------------------
 1 file changed, 101 insertions(+), 101 deletions(-)

diff --git a/docs/AMDGPUUsage.rst b/docs/AMDGPUUsage.rst
index bc3caf4448c..b4cab62ccf4 100644
--- a/docs/AMDGPUUsage.rst
+++ b/docs/AMDGPUUsage.rst
@@ -78,146 +78,146 @@ names from both the *Processor* and *Alternative Processor* can be used.
   .. table:: AMDGPU Processors
      :name: amdgpu-processor-table
 
-     =========== =============== ============ ===== ========= ======= ==================
-     Processor   Alternative     Target       dGPU/ Target    ROCm    Example
-                 Processor       Triple       APU   Features  Support Products
+     =========== =============== ============ ===== ========== ======= ======================
+     Processor   Alternative     Target       dGPU/ Target     ROCm    Example
+                 Processor       Triple       APU   Features   Support Products
                                  Architecture       Supported
                                                     [Default]
-     =========== =============== ============ ===== ========= ======= ==================
+     =========== =============== ============ ===== ========== ======= ======================
      **Radeon HD 2000/3000 Series (R600)** [AMD-RADEON-HD-2000-3000]_
-     -----------------------------------------------------------------------------------
+     ----------------------------------------------------------------------------------------
      ``r600``                    ``r600``     dGPU
      ``r630``                    ``r600``     dGPU
      ``rs880``                   ``r600``     dGPU
      ``rv670``                   ``r600``     dGPU
      **Radeon HD 4000 Series (R700)** [AMD-RADEON-HD-4000]_
-     -----------------------------------------------------------------------------------
+     ----------------------------------------------------------------------------------------
      ``rv710``                   ``r600``     dGPU
      ``rv730``                   ``r600``     dGPU
      ``rv770``                   ``r600``     dGPU
      **Radeon HD 5000 Series (Evergreen)** [AMD-RADEON-HD-5000]_
-     -----------------------------------------------------------------------------------
+     ----------------------------------------------------------------------------------------
      ``cedar``                   ``r600``     dGPU
      ``cypress``                 ``r600``     dGPU
      ``juniper``                 ``r600``     dGPU
      ``redwood``                 ``r600``     dGPU
      ``sumo``                    ``r600``     dGPU
      **Radeon HD 6000 Series (Northern Islands)** [AMD-RADEON-HD-6000]_
-     -----------------------------------------------------------------------------------
+     ----------------------------------------------------------------------------------------
      ``barts``                   ``r600``     dGPU
      ``caicos``                  ``r600``     dGPU
      ``cayman``                  ``r600``     dGPU
      ``turks``                   ``r600``     dGPU
      **GCN GFX6 (Southern Islands (SI))** [AMD-GCN-GFX6]_
-     -----------------------------------------------------------------------------------
+     ----------------------------------------------------------------------------------------
      ``gfx600``  - ``tahiti``    ``amdgcn``   dGPU
      ``gfx601``  - ``hainan``    ``amdgcn``   dGPU
                  - ``oland``
                  - ``pitcairn``
                  - ``verde``
      **GCN GFX7 (Sea Islands (CI))** [AMD-GCN-GFX7]_
-     -----------------------------------------------------------------------------------
-     ``gfx700``  - ``kaveri``    ``amdgcn``   APU                     - A6-7000
-                                                                      - A6 Pro-7050B
-                                                                      - A8-7100
-                                                                      - A8 Pro-7150B
-                                                                      - A10-7300
-                                                                      - A10 Pro-7350B
-                                                                      - FX-7500
-                                                                      - A8-7200P
-                                                                      - A10-7400P
-                                                                      - FX-7600P
-     ``gfx701``  - ``hawaii``    ``amdgcn``   dGPU            ROCm    - FirePro W8100
-                                                                      - FirePro W9100
-                                                                      - FirePro S9150
-                                                                      - FirePro S9170
-     ``gfx702``                  ``amdgcn``   dGPU            ROCm    - Radeon R9 290
-                                                                      - Radeon R9 290x
-                                                                      - Radeon R390
-                                                                      - Radeon R390x
-     ``gfx703``  - ``kabini``    ``amdgcn``   APU                     - E1-2100
-                 - ``mullins``                                        - E1-2200
-                                                                      - E1-2500
-                                                                      - E2-3000
-                                                                      - E2-3800
-                                                                      - A4-5000
-                                                                      - A4-5100
-                                                                      - A6-5200
-                                                                      - A4 Pro-3340B
-     ``gfx704``  - ``bonaire``   ``amdgcn``   dGPU                    - Radeon HD 7790
-                                                                      - Radeon HD 8770
-                                                                      - R7 260
-                                                                      - R7 260X
+     ----------------------------------------------------------------------------------------
+     ``gfx700``  - ``kaveri``    ``amdgcn``   APU                      - A6-7000
+                                                                       - A6 Pro-7050B
+                                                                       - A8-7100
+                                                                       - A8 Pro-7150B
+                                                                       - A10-7300
+                                                                       - A10 Pro-7350B
+                                                                       - FX-7500
+                                                                       - A8-7200P
+                                                                       - A10-7400P
+                                                                       - FX-7600P
+     ``gfx701``  - ``hawaii``    ``amdgcn``   dGPU             ROCm    - FirePro W8100
+                                                                       - FirePro W9100
+                                                                       - FirePro S9150
+                                                                       - FirePro S9170
+     ``gfx702``                  ``amdgcn``   dGPU             ROCm    - Radeon R9 290
+                                                                       - Radeon R9 290x
+                                                                       - Radeon R390
+                                                                       - Radeon R390x
+     ``gfx703``  - ``kabini``    ``amdgcn``   APU                      - E1-2100
+                 - ``mullins``                                         - E1-2200
+                                                                       - E1-2500
+                                                                       - E2-3000
+                                                                       - E2-3800
+                                                                       - A4-5000
+                                                                       - A4-5100
+                                                                       - A6-5200
+                                                                       - A4 Pro-3340B
+     ``gfx704``  - ``bonaire``   ``amdgcn``   dGPU                     - Radeon HD 7790
+                                                                       - Radeon HD 8770
+                                                                       - R7 260
+                                                                       - R7 260X
      **GCN GFX8 (Volcanic Islands (VI))** [AMD-GCN-GFX8]_
-     -----------------------------------------------------------------------------------
-     ``gfx801``  - ``carrizo``   ``amdgcn``   APU   - xnack           - A6-8500P
-                                                      [on]            - Pro A6-8500B
-                                                                      - A8-8600P
-                                                                      - Pro A8-8600B
-                                                                      - FX-8800P
-                                                                      - Pro A12-8800B
-     \                           ``amdgcn``   APU   - xnack   ROCm    - A10-8700P
-                                                      [on]            - Pro A10-8700B
-                                                                      - A10-8780P
-     \                           ``amdgcn``   APU   - xnack           - A10-9600P
-                                                      [on]            - A10-9630P
-                                                                      - A12-9700P
-                                                                      - A12-9730P
-                                                                      - FX-9800P
-                                                                      - FX-9830P
-     \                           ``amdgcn``   APU   - xnack           - E2-9010
-                                                      [on]            - A6-9210
-                                                                      - A9-9410
-     ``gfx802``  - ``iceland``   ``amdgcn``   dGPU  - xnack   ROCm    - FirePro S7150
-                 - ``tonga``                          [off]           - FirePro S7100
-                                                                      - FirePro W7100
-                                                                      - Radeon R285
-                                                                      - Radeon R9 380
-                                                                      - Radeon R9 385
-                                                                      - Mobile FirePro
-                                                                        M7170
-     ``gfx803``  - ``fiji``      ``amdgcn``   dGPU  - xnack   ROCm    - Radeon R9 Nano
-                                                      [off]           - Radeon R9 Fury
-                                                                      - Radeon R9 FuryX
-                                                                      - Radeon Pro Duo
-                                                                      - FirePro S9300x2
-                                                                      - Radeon Instinct MI8
-     \           - ``polaris10`` ``amdgcn``   dGPU  - xnack   ROCm    - Radeon RX 470
-                                                      [off]           - Radeon RX 480
-                                                                      - Radeon Instinct MI6
-     \           - ``polaris11`` ``amdgcn``   dGPU  - xnack   ROCm    - Radeon RX 460
+     ----------------------------------------------------------------------------------------
+     ``gfx801``  - ``carrizo``   ``amdgcn``   APU   - xnack            - A6-8500P
+                                                      [on]             - Pro A6-8500B
+                                                                       - A8-8600P
+                                                                       - Pro A8-8600B
+                                                                       - FX-8800P
+                                                                       - Pro A12-8800B
+     \                           ``amdgcn``   APU   - xnack    ROCm    - A10-8700P
+                                                      [on]             - Pro A10-8700B
+                                                                       - A10-8780P
+     \                           ``amdgcn``   APU   - xnack            - A10-9600P
+                                                      [on]             - A10-9630P
+                                                                       - A12-9700P
+                                                                       - A12-9730P
+                                                                       - FX-9800P
+                                                                       - FX-9830P
+     \                           ``amdgcn``   APU   - xnack            - E2-9010
+                                                      [on]             - A6-9210
+                                                                       - A9-9410
+     ``gfx802``  - ``iceland``   ``amdgcn``   dGPU  - xnack    ROCm    - FirePro S7150
+                 - ``tonga``                          [off]            - FirePro S7100
+                                                                       - FirePro W7100
+                                                                       - Radeon R285
+                                                                       - Radeon R9 380
+                                                                       - Radeon R9 385
+                                                                       - Mobile FirePro
+                                                                         M7170
+     ``gfx803``  - ``fiji``      ``amdgcn``   dGPU  - xnack    ROCm    - Radeon R9 Nano
+                                                      [off]            - Radeon R9 Fury
+                                                                       - Radeon R9 FuryX
+                                                                       - Radeon Pro Duo
+                                                                       - FirePro S9300x2
+                                                                       - Radeon Instinct MI8
+     \           - ``polaris10`` ``amdgcn``   dGPU  - xnack    ROCm    - Radeon RX 470
+                                                      [off]            - Radeon RX 480
+                                                                       - Radeon Instinct MI6
+     \           - ``polaris11`` ``amdgcn``   dGPU  - xnack    ROCm    - Radeon RX 460
                                                       [off]
      ``gfx810``  - ``stoney``    ``amdgcn``   APU   - xnack
                                                       [on]
      **GCN GFX9** [AMD-GCN-GFX9]_
-     -----------------------------------------------------------------------------------
-     ``gfx900``                  ``amdgcn``   dGPU  - xnack   ROCm    - Radeon Vega
-                                                      [off]             Frontier Edition
-                                                                      - Radeon RX Vega 56
-                                                                      - Radeon RX Vega 64
-                                                                      - Radeon RX Vega 64
-                                                                        Liquid
-                                                                      - Radeon Instinct MI25
-     ``gfx902``                  ``amdgcn``   APU   - xnack           - Ryzen 3 2200G
-                                                      [on]            - Ryzen 5 2400G
-     ``gfx904``                  ``amdgcn``   dGPU  - xnack           *TBA*
+     ----------------------------------------------------------------------------------------
+     ``gfx900``                  ``amdgcn``   dGPU  - xnack    ROCm    - Radeon Vega
+                                                      [off]              Frontier Edition
+                                                                       - Radeon RX Vega 56
+                                                                       - Radeon RX Vega 64
+                                                                       - Radeon RX Vega 64
+                                                                         Liquid
+                                                                       - Radeon Instinct MI25
+     ``gfx902``                  ``amdgcn``   APU   - xnack            - Ryzen 3 2200G
+                                                      [on]             - Ryzen 5 2400G
+     ``gfx904``                  ``amdgcn``   dGPU  - xnack            *TBA*
                                                       [off]
-                                                                      .. TODO
-                                                                         Add product
-                                                                         names.
-     ``gfx906``                  ``amdgcn``   dGPU  - xnack           *TBA*
+                                                                       .. TODO
+                                                                          Add product
+                                                                          names.
+     ``gfx906``                  ``amdgcn``   dGPU  - xnack            *TBA*
                                                       [off]
                                                       sram-ecc
                                                       [on]
-                                                                      .. TODO
-                                                                         Add product
-                                                                         names.
-     ``gfx909``                  ``amdgcn``   APU   - xnack           *TBA* (Raven Ridge 2)
+                                                                       .. TODO
+                                                                          Add product
+                                                                          names.
+     ``gfx909``                  ``amdgcn``   APU   - xnack            *TBA* (Raven Ridge 2)
                                                       [on]
-                                                                      .. TODO
-                                                                         Add product
-                                                                         names.
-     =========== =============== ============ ===== ========= ======= ==================
+                                                                       .. TODO
+                                                                          Add product
+                                                                          names.
+     =========== =============== ============ ===== ========== ======= ======================
 
 .. _amdgpu-target-features:
 
-- 
GitLab


From 01ba68f9b5470cdfdf1451a620078d527c65f55b Mon Sep 17 00:00:00 2001
From: Yaxun Liu <Yaxun.Liu@amd.com>
Date: Tue, 6 Nov 2018 21:28:17 +0000
Subject: [PATCH 1030/1116] AMDGPU: Add an option
 -disable-promote-alloca-to-lds

Add this option for debugging and providing workaround.

By default it is off so no behavior change in backend.

Differential Revision: https://reviews.llvm.org/D54158


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346267 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp         | 8 ++++++++
 test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll | 3 +++
 2 files changed, 11 insertions(+)

diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index fe9e4ca0ca4..ec7ea2baec0 100644
--- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -70,6 +70,11 @@ static cl::opt<bool> DisablePromoteAllocaToVector(
   cl::desc("Disable promote alloca to vector"),
   cl::init(false));
 
+static cl::opt<bool> DisablePromoteAllocaToLDS(
+  "disable-promote-alloca-to-lds",
+  cl::desc("Disable promote alloca to LDS"),
+  cl::init(false));
+
 // FIXME: This can create globals so should be a module pass.
 class AMDGPUPromoteAlloca : public FunctionPass {
 private:
@@ -706,6 +711,9 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
   if (tryPromoteAllocaToVector(&I))
     return true; // Promoted to vector.
 
+  if (DisablePromoteAllocaToLDS)
+    return false;
+
   const Function &ContainingFunction = *I.getParent()->getParent();
   CallingConv::ID CC = ContainingFunction.getCallingConv();
 
diff --git a/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll b/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
index ebef6122990..8d12a725594 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
@@ -1,8 +1,11 @@
 ; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca -disable-promote-alloca-to-lds< %s | FileCheck -check-prefix=NOLDS %s
 
 ; This normally would be fixed by instcombine to be compare to the GEP
 ; indices
 
+; NOLDS-NOT: addrspace(3)
+
 ; CHECK-LABEL: @lds_promoted_alloca_icmp_same_derived_pointer(
 ; CHECK: [[ARRAYGEP:%[0-9]+]] = getelementptr inbounds [256 x [16 x i32]], [256 x [16 x i32]] addrspace(3)* @lds_promoted_alloca_icmp_same_derived_pointer.alloca, i32 0, i32 %{{[0-9]+}}
 ; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %a
-- 
GitLab


From 0bbaef20df4fe4c420a562a36460ed4cff4a1ee4 Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Tue, 6 Nov 2018 21:40:32 +0000
Subject: [PATCH 1031/1116] Silence deprecation warning for GetVersionEx with
 clang-cl

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346268 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Support/Windows/WindowsSupport.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/lib/Support/Windows/WindowsSupport.h b/lib/Support/Windows/WindowsSupport.h
index e05c3a73f2f..cfb5c0dd470 100644
--- a/lib/Support/Windows/WindowsSupport.h
+++ b/lib/Support/Windows/WindowsSupport.h
@@ -96,12 +96,19 @@ inline llvm::VersionTuple GetWindowsOSVersion() {
 #pragma warning(push)
 #pragma warning(disable : 4996)
 #endif // _MSC_VER
+#if defined(__clang__)
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wdeprecated"
+#endif // __clang__
   // Starting with Microsoft SDK for Windows 8.1, this function is deprecated
   // in favor of the new Windows Version Helper APIs.  Since we don't specify a
   // minimum SDK version, it's easier to simply disable the warning rather than
   // try to support both APIs.
   if (GetVersionEx((LPOSVERSIONINFO)&info) == 0)
     return llvm::VersionTuple();
+#if defined(__clang__)
+#pragma clang diagnostic pop
+#endif // __clang__
 #if defined(_MSC_VER)
 #pragma warning(pop)
 #endif // _MSC_VER
-- 
GitLab


From 3d04968f3095d7ef368c511c83dffd0ac7f83011 Mon Sep 17 00:00:00 2001
From: Jessica Paquette <jpaquette@apple.com>
Date: Tue, 6 Nov 2018 21:46:41 +0000
Subject: [PATCH 1032/1116] [MachineOutliner][NFC] Add findRepeatedSubstrings
 to SuffixTree, kill LeafVector

Instead of iterating over the leaves to find repeated substrings, and walking
collecting leaf children when we don't necessarily need them, let's just
calculate what we need and iterate over that.

By doing this, we don't have to save every leaf. It's easier to read the code
too and understand what's going on.

The goal here, at the end of the day, is to set up to allow us to do something
like

for (RepeatedSubstring &RS : ST) {
 ... do stuff with RS ...
}

Which would let us perform the cost model stuff and the repeated substring
query at the same time.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346269 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/MachineOutliner.cpp | 193 ++++++++++++++++++--------------
 1 file changed, 106 insertions(+), 87 deletions(-)

diff --git a/lib/CodeGen/MachineOutliner.cpp b/lib/CodeGen/MachineOutliner.cpp
index c12bf52c0e2..936a8106224 100644
--- a/lib/CodeGen/MachineOutliner.cpp
+++ b/lib/CodeGen/MachineOutliner.cpp
@@ -231,14 +231,18 @@ struct SuffixTreeNode {
 /// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf
 class SuffixTree {
 public:
-  /// Stores each leaf node in the tree.
-  ///
-  /// This is used for finding outlining candidates.
-  std::vector<SuffixTreeNode *> LeafVector;
-
   /// Each element is an integer representing an instruction in the module.
   ArrayRef<unsigned> Str;
 
+  /// A repeated substring in the tree.
+  struct RepeatedSubstring {
+    /// The length of the string.
+    unsigned Length;
+
+    /// The start indices of each occurrence.
+    std::vector<unsigned> StartIndices;
+  };
+
 private:
   /// Maintains each node in the tree.
   SpecificBumpPtrAllocator<SuffixTreeNode> NodeAllocator;
@@ -322,8 +326,7 @@ private:
   }
 
   /// Set the suffix indices of the leaves to the start indices of their
-  /// respective suffixes. Also stores each leaf in \p LeafVector at its
-  /// respective suffix index.
+  /// respective suffixes.
   ///
   /// \param[in] CurrNode The node currently being visited.
   /// \param CurrIdx The current index of the string being visited.
@@ -353,9 +356,6 @@ private:
       CurrNode.SuffixIdx = Str.size() - CurrIdx;
       assert(CurrNode.Parent && "CurrNode had no parent!");
       CurrNode.Parent->OccurrenceCount++;
-
-      // Store the leaf in the leaf vector for pruning later.
-      LeafVector[CurrNode.SuffixIdx] = &CurrNode;
     }
   }
 
@@ -489,6 +489,44 @@ private:
     return SuffixesToAdd;
   }
 
+  /// Helper function for findRepeatedSubstrings.
+  /// Traverses the suffix tree that finds all nodes associated with a repeated
+  /// substring. That is, all internal non-root nodes. If the given node has
+  /// more than one leaf child, store the repeated strings in Substrings.
+  void
+  findRepeatedSubstringsHelper(SuffixTreeNode &Curr,
+                               std::vector<RepeatedSubstring> &Substrings,
+                               const unsigned MinLength = 1) {
+  assert(!Curr.isLeaf() && "Visited a leaf?");
+  std::vector<SuffixTreeNode *> LeafChildren;
+  unsigned Length = Curr.ConcatLen;
+
+  for (auto &ChildPair : Curr.Children) {
+    if (!ChildPair.second->isLeaf())
+      findRepeatedSubstringsHelper(*ChildPair.second, Substrings, MinLength);
+    else if (Length >= MinLength)
+      LeafChildren.push_back(ChildPair.second);
+  }
+
+  // The root node never has repeats. Quit here.
+  if (Curr.isRoot())
+    return;
+
+  // If there are no occurrences of the minimum length, then quit.
+  if (LeafChildren.empty() || LeafChildren.size() < 2)
+    return;
+
+  // We have a node associated with a repeated substring. Store that in
+  // Substrings and move on.
+  RepeatedSubstring RS;
+  RS.Length = Length;
+
+  // Each occurrence starts at a suffix given by a leaf child.
+  for (SuffixTreeNode *Leaf : LeafChildren)
+    RS.StartIndices.push_back(Leaf->SuffixIdx);
+  Substrings.push_back(RS);
+}
+
 public:
   /// Construct a suffix tree from a sequence of unsigned integers.
   ///
@@ -497,7 +535,6 @@ public:
     Root = insertInternalNode(nullptr, EmptyIdx, EmptyIdx, 0);
     Root->IsInTree = true;
     Active.Node = Root;
-    LeafVector = std::vector<SuffixTreeNode *>(Str.size());
 
     // Keep track of the number of suffixes we have to add of the current
     // prefix.
@@ -518,6 +555,15 @@ public:
     assert(Root && "Root node can't be nullptr!");
     setSuffixIndices(*Root, 0);
   }
+
+  /// Finds all repeated substrings with an optionally-provided minimum length
+  /// and stores them in \p Substrings.
+  /// If \p MinLength is provided, only return those with a given minimum
+  /// length.
+  void findRepeatedSubstrings(std::vector<RepeatedSubstring> &Substrings,
+                              const unsigned MinLength = 1) {
+    findRepeatedSubstringsHelper(*Root, Substrings, MinLength);
+  }
 };
 
 /// Maps \p MachineInstrs to unsigned integers and stores the mappings.
@@ -925,80 +971,55 @@ unsigned MachineOutliner::findCandidates(
   FunctionList.clear();
   unsigned MaxLen = 0;
 
-  // FIXME: Visit internal nodes instead of leaves.
-  for (SuffixTreeNode *Leaf : ST.LeafVector) {
-    assert(Leaf && "Leaves in LeafVector cannot be null!");
-    if (!Leaf->IsInTree)
-      continue;
-
-    assert(Leaf->Parent && "All leaves must have parents!");
-    SuffixTreeNode &Parent = *(Leaf->Parent);
-
-    // If it doesn't appear enough, or we already outlined from it, skip it.
-    if (Parent.OccurrenceCount < 2 || Parent.isRoot() || !Parent.IsInTree)
-      continue;
+  // First, find dall of the repeated substrings in the tree of minimum length
+  // 2.
+  // FIXME: 2 is an approximation which isn't necessarily true for, say, X86.
+  // If we factor in instruction lengths, we need more information than this.
+  // FIXME: It'd be nice if we could just have a repeated substring iterator.
+  std::vector<SuffixTree::RepeatedSubstring> RepeatedSubstrings;
+  ST.findRepeatedSubstrings(RepeatedSubstrings, 2);
 
-    // Figure out if this candidate is beneficial.
-    unsigned StringLen = Leaf->ConcatLen - (unsigned)Leaf->size();
-
-    // Too short to be beneficial; skip it.
-    // FIXME: This isn't necessarily true for, say, X86. If we factor in
-    // instruction lengths we need more information than this.
-    if (StringLen < 2)
-      continue;
-
-    // If this is a beneficial class of candidate, then every one is stored in
-    // this vector.
+  for (SuffixTree::RepeatedSubstring &RS : RepeatedSubstrings) {
     std::vector<Candidate> CandidatesForRepeatedSeq;
-
-    // Figure out the call overhead for each instance of the sequence.
-    for (auto &ChildPair : Parent.Children) {
-      SuffixTreeNode *M = ChildPair.second;
-
-      if (M && M->IsInTree && M->isLeaf()) {
-        // Never visit this leaf again.
-        M->IsInTree = false;
-        unsigned StartIdx = M->SuffixIdx;
-        unsigned EndIdx = StartIdx + StringLen - 1;
-
-        // Trick: Discard some candidates that would be incompatible with the
-        // ones we've already found for this sequence. This will save us some
-        // work in candidate selection.
-        //
-        // If two candidates overlap, then we can't outline them both. This
-        // happens when we have candidates that look like, say
-        //
-        // AA (where each "A" is an instruction).
-        //
-        // We might have some portion of the module that looks like this:
-        // AAAAAA (6 A's)
-        //
-        // In this case, there are 5 different copies of "AA" in this range, but
-        // at most 3 can be outlined. If only outlining 3 of these is going to
-        // be unbeneficial, then we ought to not bother.
-        //
-        // Note that two things DON'T overlap when they look like this:
-        // start1...end1 .... start2...end2
-        // That is, one must either
-        // * End before the other starts
-        // * Start after the other ends
-        if (std::all_of(CandidatesForRepeatedSeq.begin(),
-                        CandidatesForRepeatedSeq.end(),
-                        [&StartIdx, &EndIdx](const Candidate &C) {
-                          return (EndIdx < C.getStartIdx() ||
-                                  StartIdx > C.getEndIdx());
-                        })) {
-          // It doesn't overlap with anything, so we can outline it.
-          // Each sequence is over [StartIt, EndIt].
-          // Save the candidate and its location.
-
-          MachineBasicBlock::iterator StartIt = Mapper.InstrList[StartIdx];
-          MachineBasicBlock::iterator EndIt = Mapper.InstrList[EndIdx];
-
-          CandidatesForRepeatedSeq.emplace_back(StartIdx, StringLen, StartIt,
-                                                EndIt, StartIt->getParent(),
-                                                FunctionList.size());
-        }
+    unsigned StringLen = RS.Length;
+    for (const unsigned &StartIdx : RS.StartIndices) {
+      unsigned EndIdx = StartIdx + StringLen - 1;
+      // Trick: Discard some candidates that would be incompatible with the
+      // ones we've already found for this sequence. This will save us some
+      // work in candidate selection.
+      //
+      // If two candidates overlap, then we can't outline them both. This
+      // happens when we have candidates that look like, say
+      //
+      // AA (where each "A" is an instruction).
+      //
+      // We might have some portion of the module that looks like this:
+      // AAAAAA (6 A's)
+      //
+      // In this case, there are 5 different copies of "AA" in this range, but
+      // at most 3 can be outlined. If only outlining 3 of these is going to
+      // be unbeneficial, then we ought to not bother.
+      //
+      // Note that two things DON'T overlap when they look like this:
+      // start1...end1 .... start2...end2
+      // That is, one must either
+      // * End before the other starts
+      // * Start after the other ends
+      if (std::all_of(
+              CandidatesForRepeatedSeq.begin(), CandidatesForRepeatedSeq.end(),
+              [&StartIdx, &EndIdx](const Candidate &C) {
+                return (EndIdx < C.getStartIdx() || StartIdx > C.getEndIdx());
+              })) {
+        // It doesn't overlap with anything, so we can outline it.
+        // Each sequence is over [StartIt, EndIt].
+        // Save the candidate and its location.
+
+        MachineBasicBlock::iterator StartIt = Mapper.InstrList[StartIdx];
+        MachineBasicBlock::iterator EndIt = Mapper.InstrList[EndIdx];
+
+        CandidatesForRepeatedSeq.emplace_back(StartIdx, StringLen, StartIt,
+                                              EndIt, StartIt->getParent(),
+                                              FunctionList.size());
       }
     }
 
@@ -1021,7 +1042,8 @@ unsigned MachineOutliner::findCandidates(
       continue;
 
     std::vector<unsigned> Seq;
-    for (unsigned i = Leaf->SuffixIdx; i < Leaf->SuffixIdx + StringLen; i++)
+    unsigned StartIdx = RS.StartIndices[0]; // Grab any start index.
+    for (unsigned i = StartIdx; i < StartIdx + StringLen; i++)
       Seq.push_back(ST.Str[i]);
     OF.Sequence = Seq;
     OF.Name = FunctionList.size();
@@ -1040,9 +1062,6 @@ unsigned MachineOutliner::findCandidates(
     for (std::shared_ptr<Candidate> &C : OF.Candidates)
       CandidateList.push_back(C);
     FunctionList.push_back(OF);
-
-    // Move to the next function.
-    Parent.IsInTree = false;
   }
 
   return MaxLen;
-- 
GitLab


From e86187410999c8d6b0f57f9a6e911d8759df8fba Mon Sep 17 00:00:00 2001
From: "Joel E. Denny" <jdenny.ornl@gmail.com>
Date: Tue, 6 Nov 2018 22:07:03 +0000
Subject: [PATCH 1033/1116] [FileCheck] Parse command-line options from
 FILECHECK_OPTS

This feature makes it easy to tune FileCheck diagnostic output when
running the test suite via ninja, a bot, or an IDE.  For example:

```
$ FILECHECK_OPTS='-color -v -dump-input-on-failure' \
  LIT_FILTER='OpenMP/for_codegen.cpp' ninja check-clang \
  | less -R
```

Reviewed By: probinson

Differential Revision: https://reviews.llvm.org/D53517

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346272 91177308-0d34-0410-b5e6-96231b3b80d8
---
 docs/CommandGuide/FileCheck.rst    |  3 +++
 include/llvm/Support/CommandLine.h | 11 ++++++++++-
 lib/Support/CommandLine.cpp        | 23 +++++++++++++++++++++--
 test/FileCheck/envvar-opts.txt     | 15 +++++++++++++++
 utils/FileCheck/FileCheck.cpp      |  3 ++-
 utils/lit/lit/TestingConfig.py     |  2 +-
 6 files changed, 52 insertions(+), 5 deletions(-)
 create mode 100644 test/FileCheck/envvar-opts.txt

diff --git a/docs/CommandGuide/FileCheck.rst b/docs/CommandGuide/FileCheck.rst
index 830b1e00d4e..6581b33ba1c 100644
--- a/docs/CommandGuide/FileCheck.rst
+++ b/docs/CommandGuide/FileCheck.rst
@@ -24,6 +24,9 @@ match.  The file to verify is read from standard input unless the
 OPTIONS
 -------
 
+Options are parsed from the environment variable ``FILECHECK_OPTS``
+and from the command line.
+
 .. option:: -help
 
  Print a summary of command line options.
diff --git a/include/llvm/Support/CommandLine.h b/include/llvm/Support/CommandLine.h
index 799b41fbf8b..cd3543c130e 100644
--- a/include/llvm/Support/CommandLine.h
+++ b/include/llvm/Support/CommandLine.h
@@ -56,9 +56,18 @@ namespace cl {
 // Returns true on success. Otherwise, this will print the error message to
 // stderr and exit if \p Errs is not set (nullptr by default), or print the
 // error message to \p Errs and return false if \p Errs is provided.
+//
+// If EnvVar is not nullptr, command-line options are also parsed from the
+// environment variable named by EnvVar.  Precedence is given to occurrences
+// from argv.  This precedence is currently implemented by parsing argv after
+// the environment variable, so it is only implemented correctly for options
+// that give precedence to later occurrences.  If your program supports options
+// that give precedence to earlier occurrences, you will need to extend this
+// function to support it correctly.
 bool ParseCommandLineOptions(int argc, const char *const *argv,
                              StringRef Overview = "",
-                             raw_ostream *Errs = nullptr);
+                             raw_ostream *Errs = nullptr,
+                             const char *EnvVar = nullptr);
 
 //===----------------------------------------------------------------------===//
 // ParseEnvironmentOptions - Environment variable option processing alternate
diff --git a/lib/Support/CommandLine.cpp b/lib/Support/CommandLine.cpp
index b169bb60964..cb2a2e557fa 100644
--- a/lib/Support/CommandLine.cpp
+++ b/lib/Support/CommandLine.cpp
@@ -1061,8 +1061,27 @@ void cl::ParseEnvironmentOptions(const char *progName, const char *envVar,
 }
 
 bool cl::ParseCommandLineOptions(int argc, const char *const *argv,
-                                 StringRef Overview, raw_ostream *Errs) {
-  return GlobalParser->ParseCommandLineOptions(argc, argv, Overview,
+                                 StringRef Overview, raw_ostream *Errs,
+                                 const char *EnvVar) {
+  SmallVector<const char *, 20> NewArgv;
+  BumpPtrAllocator A;
+  StringSaver Saver(A);
+  NewArgv.push_back(argv[0]);
+
+  // Parse options from environment variable.
+  if (EnvVar) {
+    if (llvm::Optional<std::string> EnvValue =
+            sys::Process::GetEnv(StringRef(EnvVar)))
+      TokenizeGNUCommandLine(*EnvValue, Saver, NewArgv);
+  }
+
+  // Append options from command line.
+  for (int I = 1; I < argc; ++I)
+    NewArgv.push_back(argv[I]);
+  int NewArgc = static_cast<int>(NewArgv.size());
+
+  // Parse all options.
+  return GlobalParser->ParseCommandLineOptions(NewArgc, &NewArgv[0], Overview,
                                                Errs);
 }
 
diff --git a/test/FileCheck/envvar-opts.txt b/test/FileCheck/envvar-opts.txt
new file mode 100644
index 00000000000..bc52d88e0ab
--- /dev/null
+++ b/test/FileCheck/envvar-opts.txt
@@ -0,0 +1,15 @@
+; Create a case that produces a simple diagnostic.
+; RUN: echo foo > %t.in
+; CHECK: foo
+; CHECK: bar
+
+; RUN: FILECHECK_OPTS= \
+; RUN: not FileCheck %s -input-file %t.in 2>&1 \
+; RUN: | FileCheck -check-prefix QUIET %s
+
+; RUN: FILECHECK_OPTS=-v \
+; RUN: not FileCheck %s -input-file %t.in 2>&1 \
+; RUN: | FileCheck -check-prefix VERB %s
+
+; QUIET-NOT: remark: {{CHECK}}: expected string found in input
+; VERB:      remark: {{CHECK}}: expected string found in input
diff --git a/utils/FileCheck/FileCheck.cpp b/utils/FileCheck/FileCheck.cpp
index bf3c3983cfa..967d22f12b6 100644
--- a/utils/FileCheck/FileCheck.cpp
+++ b/utils/FileCheck/FileCheck.cpp
@@ -114,7 +114,8 @@ int main(int argc, char **argv) {
   llvm::sys::Process::UseANSIEscapeCodes(true);
 
   InitLLVM X(argc, argv);
-  cl::ParseCommandLineOptions(argc, argv);
+  cl::ParseCommandLineOptions(argc, argv, /*Overview*/ "", /*Errs*/ nullptr,
+                              "FILECHECK_OPTS");
 
   FileCheckRequest Req;
   for (auto Prefix : CheckPrefixes)
diff --git a/utils/lit/lit/TestingConfig.py b/utils/lit/lit/TestingConfig.py
index e2ac73b0b42..d5adb535775 100644
--- a/utils/lit/lit/TestingConfig.py
+++ b/utils/lit/lit/TestingConfig.py
@@ -26,7 +26,7 @@ class TestingConfig:
                      'LSAN_OPTIONS', 'ADB', 'ANDROID_SERIAL',
                      'SANITIZER_IGNORE_CVE_2016_2143', 'TMPDIR', 'TMP', 'TEMP',
                      'TEMPDIR', 'AVRLIT_BOARD', 'AVRLIT_PORT',
-                     'FILECHECK_DUMP_INPUT_ON_FAILURE']
+                     'FILECHECK_DUMP_INPUT_ON_FAILURE', 'FILECHECK_OPTS']
         for var in pass_vars:
             val = os.environ.get(var, '')
             # Check for empty string as some variables such as LD_PRELOAD cannot be empty
-- 
GitLab


From 86faa6e488ca5ae3b94a7ddcf9955ec01e1b31bf Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.menezes@samsung.com>
Date: Tue, 6 Nov 2018 22:17:14 +0000
Subject: [PATCH 1034/1116] [PATCH] [AArch64] Refactor helper functions (NFC)

Refactor helper functions in AArch64InstrInfo to be static methods.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346273 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64InstrInfo.cpp | 8 ++++----
 lib/Target/AArch64/AArch64InstrInfo.h   | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index c168184beb9..7b4e0512805 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -758,7 +758,7 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
   llvm_unreachable("Unknown opcode to check as cheap as a move!");
 }
 
-bool AArch64InstrInfo::isExynosResetFast(const MachineInstr &MI) const {
+bool AArch64InstrInfo::isExynosResetFast(const MachineInstr &MI) {
   unsigned Reg, Imm, Shift;
 
   switch (MI.getOpcode()) {
@@ -829,7 +829,7 @@ bool AArch64InstrInfo::isExynosResetFast(const MachineInstr &MI) const {
   }
 }
 
-bool AArch64InstrInfo::isExynosLdStExtFast(const MachineInstr &MI) const {
+bool AArch64InstrInfo::isExynosLdStExtFast(const MachineInstr &MI) {
   unsigned Imm;
   AArch64_AM::ShiftExtendType Ext;
 
@@ -894,7 +894,7 @@ bool AArch64InstrInfo::isExynosLdStExtFast(const MachineInstr &MI) const {
   }
 }
 
-bool AArch64InstrInfo::isExynosShiftExtFast(const MachineInstr &MI) const {
+bool AArch64InstrInfo::isExynosShiftExtFast(const MachineInstr &MI) {
   unsigned Imm, Shift;
   AArch64_AM::ShiftExtendType Ext;
 
@@ -963,7 +963,7 @@ bool AArch64InstrInfo::isExynosShiftExtFast(const MachineInstr &MI) const {
   }
 }
 
-bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) const {
+bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
   switch (MI.getOpcode()) {
   default:
     return false;
diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h
index c156df57127..43011dd4c3e 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/lib/Target/AArch64/AArch64InstrInfo.h
@@ -256,16 +256,16 @@ public:
   bool shouldOutlineFromFunctionByDefault(MachineFunction &MF) const override;
   /// Returns true if the instruction sets a constant value that can be
   /// executed more efficiently.
-  bool isExynosResetFast(const MachineInstr &MI) const;
+  static bool isExynosResetFast(const MachineInstr &MI);
   /// Returns true if the load or store has an extension that can be executed
   /// more efficiently.
-  bool isExynosLdStExtFast(const MachineInstr &MI) const;
+  static bool isExynosLdStExtFast(const MachineInstr &MI);
   /// Returns true if the instruction has a constant shift left or extension
   /// that can be executed more efficiently.
-  bool isExynosShiftExtFast(const MachineInstr &MI) const;
+  static bool isExynosShiftExtFast(const MachineInstr &MI);
   /// Returns true if the instruction has a shift by immediate that can be
   /// executed in one cycle less.
-  bool isFalkorShiftExtFast(const MachineInstr &MI) const;
+  static bool isFalkorShiftExtFast(const MachineInstr &MI);
   /// Return true if the instructions is a SEH instruciton used for unwinding
   /// on Windows.
   static bool isSEHInstruction(const MachineInstr &MI);
-- 
GitLab


From d2bfbec96c790cdad7e55effaf4dc68de01d6715 Mon Sep 17 00:00:00 2001
From: Jessica Paquette <jpaquette@apple.com>
Date: Tue, 6 Nov 2018 22:21:11 +0000
Subject: [PATCH 1035/1116] [MachineOutliner][NFC] Remove IsInTree from
 SuffixTreeNode

After changing the way we find repeated substrings in r346269, this
field is no longer used by anything, so it can be removed.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346274 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/MachineOutliner.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/lib/CodeGen/MachineOutliner.cpp b/lib/CodeGen/MachineOutliner.cpp
index 936a8106224..021e6c3acde 100644
--- a/lib/CodeGen/MachineOutliner.cpp
+++ b/lib/CodeGen/MachineOutliner.cpp
@@ -128,9 +128,6 @@ struct SuffixTreeNode {
   /// mapping by tacking that character on the end of the current string.
   DenseMap<unsigned, SuffixTreeNode *> Children;
 
-  /// A flag set to false if the node has been pruned from the tree.
-  bool IsInTree = true;
-
   /// The start index of this node's substring in the main string.
   unsigned StartIdx = EmptyIdx;
 
@@ -533,7 +530,6 @@ public:
   /// \param Str The string to construct the suffix tree for.
   SuffixTree(const std::vector<unsigned> &Str) : Str(Str) {
     Root = insertInternalNode(nullptr, EmptyIdx, EmptyIdx, 0);
-    Root->IsInTree = true;
     Active.Node = Root;
 
     // Keep track of the number of suffixes we have to add of the current
-- 
GitLab


From 4e5e34ac8f6a8efb32e3035852bfe5835b309371 Mon Sep 17 00:00:00 2001
From: Jessica Paquette <jpaquette@apple.com>
Date: Tue, 6 Nov 2018 22:23:13 +0000
Subject: [PATCH 1036/1116] [MachineOutliner][NFC] Remove OccurrenceCount from
 SuffixTreeNode

After changing the way we find candidates in r346269, this is no longer used.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346275 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/MachineOutliner.cpp | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/lib/CodeGen/MachineOutliner.cpp b/lib/CodeGen/MachineOutliner.cpp
index 021e6c3acde..bccf9523312 100644
--- a/lib/CodeGen/MachineOutliner.cpp
+++ b/lib/CodeGen/MachineOutliner.cpp
@@ -167,12 +167,6 @@ struct SuffixTreeNode {
   /// The parent of this node. Every node except for the root has a parent.
   SuffixTreeNode *Parent = nullptr;
 
-  /// The number of times this node's string appears in the tree.
-  ///
-  /// This is equal to the number of leaf children of the string. It represents
-  /// the number of suffixes that the node's string is a prefix of.
-  unsigned OccurrenceCount = 0;
-
   /// The length of the string formed by concatenating the edge labels from the
   /// root to this node.
   unsigned ConcatLen = 0;
@@ -352,7 +346,6 @@ private:
       // If yes, give it a suffix index and bump its parent's occurrence count.
       CurrNode.SuffixIdx = Str.size() - CurrIdx;
       assert(CurrNode.Parent && "CurrNode had no parent!");
-      CurrNode.Parent->OccurrenceCount++;
     }
   }
 
-- 
GitLab


From 6b969961cbebfe379e2cca4d829d681fb93630a9 Mon Sep 17 00:00:00 2001
From: "Joel E. Denny" <jdenny.ornl@gmail.com>
Date: Tue, 6 Nov 2018 22:42:10 +0000
Subject: [PATCH 1037/1116] [FileCheck] Try to fix windows bots broken by
 r346272

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346277 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/FileCheck/envvar-opts.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/FileCheck/envvar-opts.txt b/test/FileCheck/envvar-opts.txt
index bc52d88e0ab..b25ecb89a31 100644
--- a/test/FileCheck/envvar-opts.txt
+++ b/test/FileCheck/envvar-opts.txt
@@ -3,11 +3,11 @@
 ; CHECK: foo
 ; CHECK: bar
 
-; RUN: FILECHECK_OPTS= \
+; RUN: env FILECHECK_OPTS= \
 ; RUN: not FileCheck %s -input-file %t.in 2>&1 \
 ; RUN: | FileCheck -check-prefix QUIET %s
 
-; RUN: FILECHECK_OPTS=-v \
+; RUN: env FILECHECK_OPTS=-v \
 ; RUN: not FileCheck %s -input-file %t.in 2>&1 \
 ; RUN: | FileCheck -check-prefix VERB %s
 
-- 
GitLab


From db36e6f42155c16b8c016107e39d84fc5955c9fd Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Tue, 6 Nov 2018 23:39:59 +0000
Subject: [PATCH 1038/1116] [Windows] Simplify WindowsSupport.h

Sink Windows version detection code from WindowsSupport.h to Path.inc.
These functions don't need to be inlined. I randomly picked Process.inc
for the Windows version helpers, since that's the most related file.

Sink MakeErrMsg to Program.inc since it's the main client.

Move those functions into the llvm namespace, and delete the scoped
handle copy and assignment operators.

Reviewers: zturner, aganea

Differential Revision: https://reviews.llvm.org/D54182

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346280 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Support/Windows/Process.inc      | 24 +++++++
 lib/Support/Windows/Program.inc      | 19 ++++++
 lib/Support/Windows/WindowsSupport.h | 93 ++++------------------------
 3 files changed, 54 insertions(+), 82 deletions(-)

diff --git a/lib/Support/Windows/Process.inc b/lib/Support/Windows/Process.inc
index ce646d63609..2b2d7923143 100644
--- a/lib/Support/Windows/Process.inc
+++ b/lib/Support/Windows/Process.inc
@@ -460,3 +460,27 @@ unsigned Process::GetRandomNumber() {
     ReportLastErrorFatal("Could not generate a random number");
   return Ret;
 }
+
+typedef NTSTATUS(WINAPI* RtlGetVersionPtr)(PRTL_OSVERSIONINFOW);
+#define STATUS_SUCCESS ((NTSTATUS)0x00000000L)
+
+llvm::VersionTuple llvm::GetWindowsOSVersion() {
+  HMODULE hMod = ::GetModuleHandleW(L"ntdll.dll");
+  if (hMod) {
+    auto getVer = (RtlGetVersionPtr)::GetProcAddress(hMod, "RtlGetVersion");
+    if (getVer) {
+      RTL_OSVERSIONINFOEXW info{};
+      info.dwOSVersionInfoSize = sizeof(info);
+      if (getVer((PRTL_OSVERSIONINFOW)&info) == STATUS_SUCCESS) {
+        return llvm::VersionTuple(info.dwMajorVersion, info.dwMinorVersion, 0,
+                                  info.dwBuildNumber);
+      }
+    }
+  }
+  return llvm::VersionTuple(0, 0, 0, 0);
+}
+
+bool llvm::RunningWindows8OrGreater() {
+  // Windows 8 is version 6.2, service pack 0.
+  return GetWindowsOSVersion() >= llvm::VersionTuple(6, 2, 0, 0);
+}
diff --git a/lib/Support/Windows/Program.inc b/lib/Support/Windows/Program.inc
index 88c56bc173b..c037956603f 100644
--- a/lib/Support/Windows/Program.inc
+++ b/lib/Support/Windows/Program.inc
@@ -105,6 +105,25 @@ ErrorOr<std::string> sys::findProgramByName(StringRef Name,
   return std::string(U8Result.begin(), U8Result.end());
 }
 
+bool MakeErrMsg(std::string *ErrMsg, const std::string &prefix) {
+  if (!ErrMsg)
+    return true;
+  char *buffer = NULL;
+  DWORD LastError = GetLastError();
+  DWORD R = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER |
+                               FORMAT_MESSAGE_FROM_SYSTEM |
+                               FORMAT_MESSAGE_MAX_WIDTH_MASK,
+                           NULL, LastError, 0, (LPSTR)&buffer, 1, NULL);
+  if (R)
+    *ErrMsg = prefix + ": " + buffer;
+  else
+    *ErrMsg = prefix + ": Unknown error";
+  *ErrMsg += " (0x" + llvm::utohexstr(LastError) + ")";
+
+  LocalFree(buffer);
+  return R != 0;
+}
+
 static HANDLE RedirectIO(Optional<StringRef> Path, int fd,
                          std::string *ErrMsg) {
   HANDLE h;
diff --git a/lib/Support/Windows/WindowsSupport.h b/lib/Support/Windows/WindowsSupport.h
index cfb5c0dd470..979cc5d0139 100644
--- a/lib/Support/Windows/WindowsSupport.h
+++ b/lib/Support/Windows/WindowsSupport.h
@@ -50,99 +50,29 @@
 // Must be included after windows.h
 #include <wincrypt.h>
 
+namespace llvm {
+
 /// Determines if the program is running on Windows 8 or newer. This
 /// reimplements one of the helpers in the Windows 8.1 SDK, which are intended
 /// to supercede raw calls to GetVersionEx. Old SDKs, Cygwin, and MinGW don't
 /// yet have VersionHelpers.h, so we have our own helper.
-inline bool RunningWindows8OrGreater() {
-  // Windows 8 is version 6.2, service pack 0.
-  OSVERSIONINFOEXW osvi = {};
-  osvi.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
-  osvi.dwMajorVersion = 6;
-  osvi.dwMinorVersion = 2;
-  osvi.wServicePackMajor = 0;
-
-  DWORDLONG Mask = 0;
-  Mask = VerSetConditionMask(Mask, VER_MAJORVERSION, VER_GREATER_EQUAL);
-  Mask = VerSetConditionMask(Mask, VER_MINORVERSION, VER_GREATER_EQUAL);
-  Mask = VerSetConditionMask(Mask, VER_SERVICEPACKMAJOR, VER_GREATER_EQUAL);
-
-  return VerifyVersionInfoW(&osvi, VER_MAJORVERSION | VER_MINORVERSION |
-                                       VER_SERVICEPACKMAJOR,
-                            Mask) != FALSE;
-}
-
-typedef NTSTATUS(WINAPI* RtlGetVersionPtr)(PRTL_OSVERSIONINFOW);
-#define STATUS_SUCCESS ((NTSTATUS)0x00000000L)
-
-inline llvm::VersionTuple GetWindowsOSVersion() {
-  HMODULE hMod = ::GetModuleHandleW(L"ntdll.dll");
-  if (hMod) {
-    auto getVer = (RtlGetVersionPtr)::GetProcAddress(hMod, "RtlGetVersion");
-    if (getVer) {
-      RTL_OSVERSIONINFOEXW info{};
-      info.dwOSVersionInfoSize = sizeof(info);
-      if (getVer((PRTL_OSVERSIONINFOW)&info) == STATUS_SUCCESS) {
-        return llvm::VersionTuple(info.dwMajorVersion, info.dwMinorVersion, 0,
-                                  info.dwBuildNumber);
-      }
-    }
-  }
+bool RunningWindows8OrGreater();
 
-  OSVERSIONINFOEX info;
-  ZeroMemory(&info, sizeof(OSVERSIONINFOEX));
-  info.dwOSVersionInfoSize = sizeof(OSVERSIONINFOEX);
-#if defined(_MSC_VER)
-#pragma warning(push)
-#pragma warning(disable : 4996)
-#endif // _MSC_VER
-#if defined(__clang__)
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wdeprecated"
-#endif // __clang__
-  // Starting with Microsoft SDK for Windows 8.1, this function is deprecated
-  // in favor of the new Windows Version Helper APIs.  Since we don't specify a
-  // minimum SDK version, it's easier to simply disable the warning rather than
-  // try to support both APIs.
-  if (GetVersionEx((LPOSVERSIONINFO)&info) == 0)
-    return llvm::VersionTuple();
-#if defined(__clang__)
-#pragma clang diagnostic pop
-#endif // __clang__
-#if defined(_MSC_VER)
-#pragma warning(pop)
-#endif // _MSC_VER
-
-  return llvm::VersionTuple(info.dwMajorVersion, info.dwMinorVersion, 0,
-                            info.dwBuildNumber);
-}
+/// Returns the Windows version as Major.Minor.0.BuildNumber. Uses
+/// RtlGetVersion or GetVersionEx under the hood depending on what is available.
+/// GetVersionEx is deprecated, but this API exposes the build number which can
+/// be useful for working around certain kernel bugs.
+llvm::VersionTuple GetWindowsOSVersion();
 
-inline bool MakeErrMsg(std::string *ErrMsg, const std::string &prefix) {
-  if (!ErrMsg)
-    return true;
-  char *buffer = NULL;
-  DWORD LastError = GetLastError();
-  DWORD R = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER |
-                               FORMAT_MESSAGE_FROM_SYSTEM |
-                               FORMAT_MESSAGE_MAX_WIDTH_MASK,
-                           NULL, LastError, 0, (LPSTR)&buffer, 1, NULL);
-  if (R)
-    *ErrMsg = prefix + ": " + buffer;
-  else
-    *ErrMsg = prefix + ": Unknown error";
-  *ErrMsg += " (0x" + llvm::utohexstr(LastError) + ")";
-
-  LocalFree(buffer);
-  return R != 0;
-}
+bool MakeErrMsg(std::string *ErrMsg, const std::string &prefix);
 
 template <typename HandleTraits>
 class ScopedHandle {
   typedef typename HandleTraits::handle_type handle_type;
   handle_type Handle;
 
-  ScopedHandle(const ScopedHandle &other); // = delete;
-  void operator=(const ScopedHandle &other); // = delete;
+  ScopedHandle(const ScopedHandle &other) = delete;
+  void operator=(const ScopedHandle &other) = delete;
 public:
   ScopedHandle()
     : Handle(HandleTraits::GetInvalid()) {}
@@ -247,7 +177,6 @@ typedef ScopedHandle<RegTraits>          ScopedRegHandle;
 typedef ScopedHandle<FindHandleTraits>   ScopedFindHandle;
 typedef ScopedHandle<JobHandleTraits>    ScopedJobHandle;
 
-namespace llvm {
 template <class T>
 class SmallVectorImpl;
 
-- 
GitLab


From be9ce13f15d774caf48d3bf11e7b435f88adf000 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Wed, 7 Nov 2018 00:00:42 +0000
Subject: [PATCH 1039/1116] [IR] add optional parameter for copying IR flags to
 compare instructions

As shown, this is used to eliminate redundant code in InstCombine,
and there are more cases where we should be using this pattern, but
we're currently unintentionally dropping flags.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346282 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/IR/InstrTypes.h                  |  3 +-
 include/llvm/IR/Instructions.h                |  9 ++--
 lib/IR/Instructions.cpp                       |  9 ++--
 .../InstCombine/InstCombineCompares.cpp       | 43 ++++++-------------
 4 files changed, 27 insertions(+), 37 deletions(-)

diff --git a/include/llvm/IR/InstrTypes.h b/include/llvm/IR/InstrTypes.h
index ec782face6c..f8d23c7f614 100644
--- a/include/llvm/IR/InstrTypes.h
+++ b/include/llvm/IR/InstrTypes.h
@@ -677,7 +677,8 @@ public:
 protected:
   CmpInst(Type *ty, Instruction::OtherOps op, Predicate pred,
           Value *LHS, Value *RHS, const Twine &Name = "",
-          Instruction *InsertBefore = nullptr);
+          Instruction *InsertBefore = nullptr,
+          Instruction *FlagsSource = nullptr);
 
   CmpInst(Type *ty, Instruction::OtherOps op, Predicate pred,
           Value *LHS, Value *RHS, const Twine &Name,
diff --git a/include/llvm/IR/Instructions.h b/include/llvm/IR/Instructions.h
index 449e6e8dc7a..7b2c13c5328 100644
--- a/include/llvm/IR/Instructions.h
+++ b/include/llvm/IR/Instructions.h
@@ -1299,12 +1299,13 @@ public:
 
   /// Constructor with no-insertion semantics
   FCmpInst(
-    Predicate pred, ///< The predicate to use for the comparison
+    Predicate Pred, ///< The predicate to use for the comparison
     Value *LHS,     ///< The left-hand-side of the expression
     Value *RHS,     ///< The right-hand-side of the expression
-    const Twine &NameStr = "" ///< Name of the instruction
-  ) : CmpInst(makeCmpResultType(LHS->getType()),
-              Instruction::FCmp, pred, LHS, RHS, NameStr) {
+    const Twine &NameStr = "", ///< Name of the instruction
+    Instruction *FlagsSource = nullptr
+  ) : CmpInst(makeCmpResultType(LHS->getType()), Instruction::FCmp, Pred, LHS,
+              RHS, NameStr, nullptr, FlagsSource) {
     AssertOK();
   }
 
diff --git a/lib/IR/Instructions.cpp b/lib/IR/Instructions.cpp
index 3b8d8d0c690..7d4b6df18d9 100644
--- a/lib/IR/Instructions.cpp
+++ b/lib/IR/Instructions.cpp
@@ -3149,15 +3149,18 @@ AddrSpaceCastInst::AddrSpaceCastInst(
 //===----------------------------------------------------------------------===//
 
 CmpInst::CmpInst(Type *ty, OtherOps op, Predicate predicate, Value *LHS,
-                 Value *RHS, const Twine &Name, Instruction *InsertBefore)
+                 Value *RHS, const Twine &Name, Instruction *InsertBefore,
+                 Instruction *FlagsSource)
   : Instruction(ty, op,
                 OperandTraits<CmpInst>::op_begin(this),
                 OperandTraits<CmpInst>::operands(this),
                 InsertBefore) {
-    Op<0>() = LHS;
-    Op<1>() = RHS;
+  Op<0>() = LHS;
+  Op<1>() = RHS;
   setPredicate((Predicate)predicate);
   setName(Name);
+  if (FlagsSource)
+    copyIRFlags(FlagsSource);
 }
 
 CmpInst::CmpInst(Type *ty, OtherOps op, Predicate predicate, Value *LHS,
diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 2381e26a1d8..c6dbfd92844 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -5281,11 +5281,7 @@ static Instruction *foldFCmpReciprocalAndZero(FCmpInst &I, Instruction *LHSI,
   if (C->isNegative())
     Pred = I.getSwappedPredicate();
 
-  // Finally emit the new fcmp.
-  Value *X = LHSI->getOperand(1);
-  FCmpInst *NewFCI = new FCmpInst(Pred, X, RHSC);
-  NewFCI->copyFastMathFlags(&I);
-  return NewFCI;
+  return new FCmpInst(Pred, LHSI->getOperand(1), RHSC, "", &I);
 }
 
 /// Optimize fabs(X) compared with zero.
@@ -5434,43 +5430,34 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
             if (Instruction *Res = foldCmpLoadFromIndexedGlobal(GEP, GV, I))
               return Res;
       break;
-    case Instruction::Call:
-      if (Instruction *X = foldFabsWithFcmpZero(I))
-        return X;
-      break;
   }
   }
 
+  if (Instruction *R = foldFabsWithFcmpZero(I))
+    return R;
+
   Value *X, *Y;
   if (match(Op0, m_FNeg(m_Value(X)))) {
-    if (match(Op1, m_FNeg(m_Value(Y)))) {
-      // fcmp pred (fneg X), (fneg Y) -> fcmp swap(pred) X, Y
-      Instruction *NewFCmp = new FCmpInst(I.getSwappedPredicate(), X, Y);
-      NewFCmp->copyFastMathFlags(&I);
-      return NewFCmp;
-    }
+    // fcmp pred (fneg X), (fneg Y) -> fcmp swap(pred) X, Y
+    if (match(Op1, m_FNeg(m_Value(Y))))
+      return new FCmpInst(I.getSwappedPredicate(), X, Y, "", &I);
 
+    // fcmp pred (fneg X), C --> fcmp swap(pred) X, -C
     Constant *C;
     if (match(Op1, m_Constant(C))) {
-      // fcmp pred (fneg X), C --> fcmp swap(pred) X, -C
       Constant *NegC = ConstantExpr::getFNeg(C);
-      Instruction *NewFCmp = new FCmpInst(I.getSwappedPredicate(), X, NegC);
-      NewFCmp->copyFastMathFlags(&I);
-      return NewFCmp;
+      return new FCmpInst(I.getSwappedPredicate(), X, NegC, "", &I);
     }
   }
 
   if (match(Op0, m_FPExt(m_Value(X)))) {
-    if (match(Op1, m_FPExt(m_Value(Y))) && X->getType() == Y->getType()) {
-      // fcmp (fpext X), (fpext Y) -> fcmp X, Y
-      Instruction *NewFCmp = new FCmpInst(Pred, X, Y);
-      NewFCmp->copyFastMathFlags(&I);
-      return NewFCmp;
-    }
+    // fcmp (fpext X), (fpext Y) -> fcmp X, Y
+    if (match(Op1, m_FPExt(m_Value(Y))) && X->getType() == Y->getType())
+      return new FCmpInst(Pred, X, Y, "", &I);
 
+    // fcmp (fpext X), C -> fcmp X, (fptrunc C) if fptrunc is lossless
     const APFloat *C;
     if (match(Op1, m_APFloat(C))) {
-      // fcmp (fpext X), C -> fcmp X, (fptrunc C) if fptrunc is lossless
       const fltSemantics &FPSem =
           X->getType()->getScalarType()->getFltSemantics();
       bool Lossy;
@@ -5485,9 +5472,7 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
           ((Fabs.compare(APFloat::getSmallestNormalized(FPSem)) !=
             APFloat::cmpLessThan) || Fabs.isZero())) {
         Constant *NewC = ConstantFP::get(X->getType(), TruncC);
-        Instruction *NewFCmp = new FCmpInst(Pred, X, NewC);
-        NewFCmp->copyFastMathFlags(&I);
-        return NewFCmp;
+        return new FCmpInst(Pred, X, NewC, "", &I);
       }
     }
   }
-- 
GitLab


From 0f48414c8c30a7056d74e17e102133644d894784 Mon Sep 17 00:00:00 2001
From: Heejin Ahn <aheejin@gmail.com>
Date: Wed, 7 Nov 2018 01:58:50 +0000
Subject: [PATCH 1040/1116] [WebAssembly] Update test cases after
 FixFunctionBitcasts

Summary:
This updates generated binaries and corresponding test cases up to date
after applying FixFunctionBitcasts pass.

Reviewers: sbc100

Subscribers: dschuff, jgravelle-google, sunfish, llvm-commits

Differential Revision: https://reviews.llvm.org/D54070

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346286 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Object/Inputs/trivial-object-test.wasm   | Bin 324 -> 370 bytes
 test/Object/nm-trivial-object.test            |   1 +
 .../llvm-objdump/Inputs/trivial.obj.wasm      | Bin 324 -> 370 bytes
 .../WebAssembly/symbol-table.test             |   1 +
 test/tools/llvm-objdump/wasm.txt              |  15 ++++++------
 .../llvm-readobj/Inputs/trivial.obj.wasm      | Bin 295 -> 341 bytes
 test/tools/llvm-readobj/print-hex.test        |   3 ++-
 test/tools/llvm-readobj/relocations.test      |   5 ++++
 test/tools/llvm-readobj/sections.test         |  22 +++++++++---------
 test/tools/llvm-readobj/symbols.test          |   5 ++++
 10 files changed, 33 insertions(+), 19 deletions(-)

diff --git a/test/Object/Inputs/trivial-object-test.wasm b/test/Object/Inputs/trivial-object-test.wasm
index 2aa042d54dc5bf56ae1abfd89144ab15ef5b832e..8652d67f69222ab57610df1b13d3972e031460a1 100644
GIT binary patch
delta 180
zcmX@Y^odEDA+b1@k%57MabiP50|QF}17m#xV?ASi0t3TDi8Lc-=4OyM69W_15->xY
z!Lb3v5CD@>0$^!J25v3}M+Sk$hK7cTOH@U_f>f~QWaedO=A|=o90qaOCO(vqV`7%p
z^9jz+P4zFyNG)<J%}XxH%+HHY$}CAvEH0V&!$5u(NG(@UYEFK#p0mG;D>DlV3nK#q
M7ZZaZGlLup07v^X#Q*>R

delta 135
zcmeywbc9KQA+b1@k%57Mv9F<_fq^-Jfw4Y;v7WJhqI8-X6LS+tkdc9FCYTmyaBKiE
z1i++}KqH9bIB~P8#7z*3Jts3SJ2Nkxk&%&sV+lxvb>drz$tsNcGSfjkuA<bO{A4|6
Te-~F~W)>Dk1_mxB20>;3IL##I

diff --git a/test/Object/nm-trivial-object.test b/test/Object/nm-trivial-object.test
index 621a1779166..ca9c2782611 100644
--- a/test/Object/nm-trivial-object.test
+++ b/test/Object/nm-trivial-object.test
@@ -61,6 +61,7 @@ COFF32-NEXT:          U _puts
 
 
 WASM:      00000000 d .L.str
+WASM-NEXT: 00000003 t .LSomeOtherFunction_bitcast
 WASM-NEXT:          U SomeOtherFunction
 WASM-NEXT: 00000002 T main
 WASM-NEXT:          U puts
diff --git a/test/tools/llvm-objdump/Inputs/trivial.obj.wasm b/test/tools/llvm-objdump/Inputs/trivial.obj.wasm
index 2aa042d54dc5bf56ae1abfd89144ab15ef5b832e..8652d67f69222ab57610df1b13d3972e031460a1 100644
GIT binary patch
delta 180
zcmX@Y^odEDA+b1@k%57MabiP50|QF}17m#xV?ASi0t3TDi8Lc-=4OyM69W_15->xY
z!Lb3v5CD@>0$^!J25v3}M+Sk$hK7cTOH@U_f>f~QWaedO=A|=o90qaOCO(vqV`7%p
z^9jz+P4zFyNG)<J%}XxH%+HHY$}CAvEH0V&!$5u(NG(@UYEFK#p0mG;D>DlV3nK#q
M7ZZaZGlLup07v^X#Q*>R

delta 135
zcmeywbc9KQA+b1@k%57Mv9F<_fq^-Jfw4Y;v7WJhqI8-X6LS+tkdc9FCYTmyaBKiE
z1i++}KqH9bIB~P8#7z*3Jts3SJ2Nkxk&%&sV+lxvb>drz$tsNcGSfjkuA<bO{A4|6
Te-~F~W)>Dk1_mxB20>;3IL##I

diff --git a/test/tools/llvm-objdump/WebAssembly/symbol-table.test b/test/tools/llvm-objdump/WebAssembly/symbol-table.test
index 43c52873c9f..fff4c9fe52c 100644
--- a/test/tools/llvm-objdump/WebAssembly/symbol-table.test
+++ b/test/tools/llvm-objdump/WebAssembly/symbol-table.test
@@ -4,5 +4,6 @@ CHECK:      SYMBOL TABLE:
 CHECK-NEXT: 00000002 g     F CODE	main
 CHECK-NEXT: 00000000 l       DATA	.L.str
 CHECK-NEXT: 00000000 g     F *UND*	puts
+CHECK-NEXT: 00000003 l     F CODE	.LSomeOtherFunction_bitcast
 CHECK-NEXT: 00000000 g     F *UND*	SomeOtherFunction
 CHECK-NEXT: 00000010 g       DATA	var
diff --git a/test/tools/llvm-objdump/wasm.txt b/test/tools/llvm-objdump/wasm.txt
index d24db89188e..93517fed6d4 100644
--- a/test/tools/llvm-objdump/wasm.txt
+++ b/test/tools/llvm-objdump/wasm.txt
@@ -2,13 +2,13 @@
 
 # CHECK:      Sections:
 # CHECK-NEXT: Idx Name          Size      Address          Type
-# CHECK-NEXT:  0 TYPE          0000000e 0000000000000000
+# CHECK-NEXT:  0 TYPE          00000011 0000000000000000
 # CHECK-NEXT:  1 IMPORT        0000005d 0000000000000000
-# CHECK-NEXT:  2 FUNCTION      00000002 0000000000000000
-# CHECK-NEXT:  3 CODE          00000019 0000000000000000 TEXT
+# CHECK-NEXT:  2 FUNCTION      00000003 0000000000000000
+# CHECK-NEXT:  3 CODE          00000024 0000000000000000 TEXT
 # CHECK-NEXT:  4 DATA          0000001c 0000000000000000 DATA
-# CHECK-NEXT:  5 linking       00000051 0000000000000000
-# CHECK-NEXT:  6 reloc.CODE    0000000c 0000000000000000
+# CHECK-NEXT:  5 linking       0000006d 0000000000000000
+# CHECK-NEXT:  6 reloc.CODE    0000000f 0000000000000000
 
 # RUN: llvm-objdump -p %p/Inputs/trivial.obj.wasm | FileCheck %s -check-prefix CHECK-HEADER
 
@@ -18,5 +18,6 @@
 # RUN: llvm-objdump -s --section=CODE %p/Inputs/trivial.obj.wasm | FileCheck %s -check-prefix CHECK-SECTIONS
 
 # CHECK-SECTIONS: Contents of section CODE:
-# CHECK-SECTIONS: 0000 01170041 80808080 00108080 8080001a  ...A............
-# CHECK-SECTIONS: 0010 10818080 80004100 0b                 ......A..
+# CHECK-SECTIONS: 0000 02170041 80808080 00108080 8080001a  ...A............
+# CHECK-SECTIONS: 0010 10838080 80004100 0b0a0041 00108180  ......A....A....
+# CHECK-SECTIONS: 0020 8080000b                             ....
diff --git a/test/tools/llvm-readobj/Inputs/trivial.obj.wasm b/test/tools/llvm-readobj/Inputs/trivial.obj.wasm
index 0e3efb66a7feecfc91fb60c605285947ae0f7244..2f99d3446123f06927724b5d2c733d32cdef044d 100644
GIT binary patch
delta 225
zcmZ3^bd^b(A+b1@k%57MabiP50|QF}17m#xV?ASi0t3TDiL`oV=4OyM69W_15->xY
z!Lb3v5CD@>0$^!J25v3}M+SjLkPJ8XWDv#3;K;zu>yet1ldlk-UzC%=#lY|oB*C7O
znU|fJm(Iws3&dq*U|?d&P0Y+=WMb3v(JL+~VqoB95MW?nVwTpE@(Iq*P4zFyNG)<J
z%}XxH%+HHY$}CAvEH0V&UPgWv$N;XQ)SUceJ!gLxS7sI#7DfgJE+z&+W(GMH09YS8
A8vp<R

delta 193
zcmcc0w46zTA+b1@k%57Mv9F<_fq^-Jfw4Y;v7WJhqI6n46LS+tkdc9FCYTmyaBKiE
z1i++}KqH9b$iU4#8N^^@aAe@-^+?Uh$yW%^FUm>bVqiE5l3>rt%*)QqOJ`(cWZ;+$
z5@BIrU}DKl%*<nCV$<`{D=sNwVBlpCU|<knWSs<3z{sapls{R6kxynih|5)!nv<Wb
V=j`v|%FN8d!pOkD#l#@U3;;%MFLeL_

diff --git a/test/tools/llvm-readobj/print-hex.test b/test/tools/llvm-readobj/print-hex.test
index c220eb3739d..71e561e479e 100644
--- a/test/tools/llvm-readobj/print-hex.test
+++ b/test/tools/llvm-readobj/print-hex.test
@@ -22,4 +22,5 @@ MACHO: 0x00000010 000031c0 5ac3                       ..1.Z.
 RUN: llvm-readobj -x 1 %p/Inputs/trivial.obj.wasm \
 RUN:     | FileCheck %s --check-prefix WASM
 
-WASM: 0x00000000 03600001 7f60017f 017f6001 7f00 .`...`....`...
+WASM: 0x00000000 04600001 7f60017f 017f6000 0060017f .`...`....`..`..
+WASM: 0x00000010 00                                  .
diff --git a/test/tools/llvm-readobj/relocations.test b/test/tools/llvm-readobj/relocations.test
index be298130408..4a7dfa5eba0 100644
--- a/test/tools/llvm-readobj/relocations.test
+++ b/test/tools/llvm-readobj/relocations.test
@@ -302,6 +302,11 @@ WASM-NEXT:     }
 WASM-NEXT:     Relocation {
 WASM-NEXT:       Type: R_WEBASSEMBLY_FUNCTION_INDEX_LEB (0)
 WASM-NEXT:       Offset: 0x11
+WASM-NEXT:       Symbol: .LSomeOtherFunction_bitcast
+WASM-NEXT:     }
+WASM-NEXT:     Relocation {
+WASM-NEXT:       Type: R_WEBASSEMBLY_FUNCTION_INDEX_LEB (0)
+WASM-NEXT:       Offset: 0x1E
 WASM-NEXT:       Symbol: SomeOtherFunction
 WASM-NEXT:     }
 WASM-NEXT:   }
diff --git a/test/tools/llvm-readobj/sections.test b/test/tools/llvm-readobj/sections.test
index 4900c4f57b6..c371f4bb644 100644
--- a/test/tools/llvm-readobj/sections.test
+++ b/test/tools/llvm-readobj/sections.test
@@ -496,28 +496,28 @@ MACHO-ARM-NEXT:]
 WASM:      Sections [
 WASM-NEXT:   Section {
 WASM-NEXT:     Type: TYPE (0x1)
-WASM-NEXT:     Size: 14
+WASM-NEXT:     Size: 17
 WASM-NEXT:     Offset: 8
 WASM-NEXT:   }
 WASM-NEXT:   Section {
 WASM-NEXT:     Type: IMPORT (0x2)
 WASM-NEXT:     Size: 93
-WASM-NEXT:     Offset: 28
+WASM-NEXT:     Offset: 31
 WASM-NEXT:   }
 WASM-NEXT:   Section {
 WASM-NEXT:     Type: FUNCTION (0x3)
-WASM-NEXT:     Size: 2
-WASM-NEXT:     Offset: 127
+WASM-NEXT:     Size: 3
+WASM-NEXT:     Offset: 130
 WASM-NEXT:   }
 WASM-NEXT:   Section {
 WASM-NEXT:     Type: CODE (0xA)
-WASM-NEXT:     Size: 25
-WASM-NEXT:     Offset: 135
+WASM-NEXT:     Size: 36
+WASM-NEXT:     Offset: 139
 WASM-NEXT:   }
 WASM-NEXT:   Section {
 WASM-NEXT:     Type: DATA (0xB)
 WASM-NEXT:     Size: 19
-WASM-NEXT:     Offset: 166
+WASM-NEXT:     Offset: 181
 WASM-NEXT:     Segments [
 WASM-NEXT:       Segment {
 WASM-NEXT:         Name: .rodata..L.str
@@ -528,14 +528,14 @@ WASM-NEXT:     ]
 WASM-NEXT:   }
 WASM-NEXT:   Section {
 WASM-NEXT:     Type: CUSTOM (0x0)
-WASM-NEXT:     Size: 61
-WASM-NEXT:     Offset: 191
+WASM-NEXT:     Size: 89
+WASM-NEXT:     Offset: 206
 WASM-NEXT:     Name: linking
 WASM-NEXT:   }
 WASM-NEXT:   Section {
 WASM-NEXT:     Type: CUSTOM (0x0)
-WASM-NEXT:     Size: 12
-WASM-NEXT:     Offset: 266
+WASM-NEXT:     Size: 15
+WASM-NEXT:     Offset: 309
 WASM-NEXT:     Name: reloc.CODE
 WASM-NEXT:   }
 WASM-NEXT: ]
diff --git a/test/tools/llvm-readobj/symbols.test b/test/tools/llvm-readobj/symbols.test
index d6bb870942d..1a0cacdeccd 100644
--- a/test/tools/llvm-readobj/symbols.test
+++ b/test/tools/llvm-readobj/symbols.test
@@ -88,6 +88,11 @@ WASM-NEXT:     Type: FUNCTION (0x0)
 WASM-NEXT:     Flags: 0x10
 WASM-NEXT:   }
 WASM-NEXT:   Symbol {
+WASM-NEXT:     Name: .LSomeOtherFunction_bitcast
+WASM-NEXT:     Type: FUNCTION (0x0)
+WASM-NEXT:     Flags: 0x2
+WASM-NEXT:   }
+WASM-NEXT:   Symbol {
 WASM-NEXT:     Name: SomeOtherFunction
 WASM-NEXT:     Type: FUNCTION (0x0)
 WASM-NEXT:     Flags: 0x10
-- 
GitLab


From f41ebcbed2d8a877ffb2b81641b0104f22dcc746 Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze@braunis.de>
Date: Wed, 7 Nov 2018 02:04:07 +0000
Subject: [PATCH 1041/1116] RegAllocFast: Rename statistic from NumCopies to
 NumCoalesced

The metric does not return the number of remaining (or inserted) copies
but the number of copies that were coalesced. Pick a more descriptive
name.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346287 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/RegAllocFast.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp
index 7b57c6cbcdb..a388cd608d1 100644
--- a/lib/CodeGen/RegAllocFast.cpp
+++ b/lib/CodeGen/RegAllocFast.cpp
@@ -54,7 +54,7 @@ using namespace llvm;
 
 STATISTIC(NumStores, "Number of stores added");
 STATISTIC(NumLoads , "Number of loads added");
-STATISTIC(NumCopies, "Number of copies coalesced");
+STATISTIC(NumCoalesced, "Number of copies coalesced");
 
 static RegisterRegAlloc
   fastRegAlloc("fast", "fast register allocator", createFastRegisterAllocator);
@@ -1079,7 +1079,7 @@ void RegAllocFast::allocateBasicBlock(MachineBasicBlock &MBB) {
   // LiveVirtRegs might refer to the instrs.
   for (MachineInstr *MI : Coalesced)
     MBB.erase(MI);
-  NumCopies += Coalesced.size();
+  NumCoalesced += Coalesced.size();
 
   LLVM_DEBUG(MBB.dump());
 }
-- 
GitLab


From a007c003b5a11e98e89f3c55a2bba9bb417d1813 Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze@braunis.de>
Date: Wed, 7 Nov 2018 02:04:11 +0000
Subject: [PATCH 1042/1116] RegAllocFast: Cleanups; NFC

This is in preparation of https://reviews.llvm.org/D52010.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346288 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/RegAllocFast.cpp | 29 +++++++++++++----------------
 1 file changed, 13 insertions(+), 16 deletions(-)

diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp
index a388cd608d1..1e3e35af56d 100644
--- a/lib/CodeGen/RegAllocFast.cpp
+++ b/lib/CodeGen/RegAllocFast.cpp
@@ -88,7 +88,7 @@ namespace {
       unsigned short LastOpNum = 0;    ///< OpNum on LastUse.
       bool Dirty = false;              ///< Register needs spill.
 
-      explicit LiveReg(unsigned v) : VirtReg(v) {}
+      explicit LiveReg(unsigned VirtReg) : VirtReg(VirtReg) {}
 
       unsigned getSparseSetIndex() const {
         return TargetRegisterInfo::virtReg2Index(VirtReg);
@@ -96,14 +96,13 @@ namespace {
     };
 
     using LiveRegMap = SparseSet<LiveReg>;
-
     /// This map contains entries for each virtual register that is currently
     /// available in a physical register.
     LiveRegMap LiveVirtRegs;
 
-    DenseMap<unsigned, SmallVector<MachineInstr *, 4>> LiveDbgValueMap;
+    DenseMap<unsigned, SmallVector<MachineInstr *, 2>> LiveDbgValueMap;
 
-    /// Track the state of a physical register.
+    /// State of a physical register.
     enum RegState {
       /// A disabled register is not available for allocation, but an alias may
       /// be in use. A register can only be moved out of the disabled state if
@@ -123,18 +122,16 @@ namespace {
       /// register. In that case, LiveVirtRegs contains the inverse mapping.
     };
 
-    /// One of the RegState enums, or a virtreg.
+    /// Maps each physical register to a RegState enum or a virtual register.
     std::vector<unsigned> PhysRegState;
 
     SmallVector<unsigned, 16> VirtDead;
     SmallVector<MachineInstr *, 32> Coalesced;
 
-    /// Set of register units.
-    using UsedInInstrSet = SparseSet<unsigned>;
-
+    using RegUnitSet = SparseSet<uint16_t, identity<uint16_t>>;
     /// Set of register units that are used in the current instruction, and so
     /// cannot be allocated.
-    UsedInInstrSet UsedInInstr;
+    RegUnitSet UsedInInstr;
 
     /// Mark a physreg as used in this instruction.
     void markRegUsedInInstr(MCPhysReg PhysReg) {
@@ -155,7 +152,7 @@ namespace {
     bool isBulkSpilling = false;
 
     enum : unsigned {
-      spillClean = 1,
+      spillClean = 50,
       spillDirty = 100,
       spillImpossible = ~0u
     };
@@ -180,10 +177,11 @@ namespace {
 
   private:
     bool runOnMachineFunction(MachineFunction &MF) override;
+
     void allocateBasicBlock(MachineBasicBlock &MBB);
     void handleThroughOperands(MachineInstr &MI,
                                SmallVectorImpl<unsigned> &VirtDead);
-    int getStackSpaceFor(unsigned VirtReg, const TargetRegisterClass &RC);
+    int getStackSpaceFor(unsigned VirtReg);
     bool isLastUseOfLocalReg(const MachineOperand &MO) const;
 
     void addKillFlag(const LiveReg &LRI);
@@ -228,8 +226,7 @@ INITIALIZE_PASS(RegAllocFast, "regallocfast", "Fast Register Allocator", false,
 
 /// This allocates space for the specified virtual register to be held on the
 /// stack.
-int RegAllocFast::getStackSpaceFor(unsigned VirtReg,
-                                   const TargetRegisterClass &RC) {
+int RegAllocFast::getStackSpaceFor(unsigned VirtReg) {
   // Find the location Reg would belong...
   int SS = StackSlotForVirtReg[VirtReg];
   // Already has space allocated?
@@ -237,6 +234,7 @@ int RegAllocFast::getStackSpaceFor(unsigned VirtReg,
     return SS;
 
   // Allocate a new stack object for this spill location...
+  const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
   unsigned Size = TRI->getSpillSize(RC);
   unsigned Align = TRI->getSpillAlignment(RC);
   int FrameIdx = MFI->CreateSpillStackObject(Size, Align);
@@ -325,7 +323,7 @@ void RegAllocFast::spillVirtReg(MachineBasicBlock::iterator MI,
     LLVM_DEBUG(dbgs() << "Spilling " << printReg(LRI->VirtReg, TRI) << " in "
                       << printReg(LR.PhysReg, TRI));
     const TargetRegisterClass &RC = *MRI->getRegClass(LRI->VirtReg);
-    int FI = getStackSpaceFor(LRI->VirtReg, RC);
+    int FI = getStackSpaceFor(LRI->VirtReg);
     LLVM_DEBUG(dbgs() << " to stack slot #" << FI << "\n");
     TII->storeRegToStackSlot(*MBB, MI, LR.PhysReg, SpillKill, FI, &RC, TRI);
     ++NumStores;   // Update statistics
@@ -656,7 +654,7 @@ RegAllocFast::LiveRegMap::iterator RegAllocFast::reloadVirtReg(MachineInstr &MI,
   if (New) {
     LRI = allocVirtReg(MI, LRI, Hint);
     const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
-    int FrameIndex = getStackSpaceFor(VirtReg, RC);
+    int FrameIndex = getStackSpaceFor(VirtReg);
     LLVM_DEBUG(dbgs() << "Reloading " << printReg(VirtReg, TRI) << " into "
                       << printReg(LRI->PhysReg, TRI) << "\n");
     TII->loadRegFromStackSlot(*MBB, MI, LRI->PhysReg, FrameIndex, &RC, TRI);
@@ -1084,7 +1082,6 @@ void RegAllocFast::allocateBasicBlock(MachineBasicBlock &MBB) {
   LLVM_DEBUG(MBB.dump());
 }
 
-/// Allocates registers for a function.
 bool RegAllocFast::runOnMachineFunction(MachineFunction &MF) {
   LLVM_DEBUG(dbgs() << "********** FAST REGISTER ALLOCATION **********\n"
                     << "********** Function: " << MF.getName() << '\n');
-- 
GitLab


From a2606fd7a7608f334b96bf3434494e9682ec0d4d Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze@braunis.de>
Date: Wed, 7 Nov 2018 02:04:12 +0000
Subject: [PATCH 1043/1116] RegAllocFast: Factor spill/reload creation into
 their own functions; NFC

This is in preparation of https://reviews.llvm.org/D52010.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346289 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/RegAllocFast.cpp | 82 ++++++++++++++++++++++--------------
 1 file changed, 50 insertions(+), 32 deletions(-)

diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp
index 1e3e35af56d..242e952bb24 100644
--- a/lib/CodeGen/RegAllocFast.cpp
+++ b/lib/CodeGen/RegAllocFast.cpp
@@ -181,7 +181,6 @@ namespace {
     void allocateBasicBlock(MachineBasicBlock &MBB);
     void handleThroughOperands(MachineInstr &MI,
                                SmallVectorImpl<unsigned> &VirtDead);
-    int getStackSpaceFor(unsigned VirtReg);
     bool isLastUseOfLocalReg(const MachineOperand &MO) const;
 
     void addKillFlag(const LiveReg &LRI);
@@ -214,6 +213,12 @@ namespace {
     void spillAll(MachineBasicBlock::iterator MI);
     bool setPhysReg(MachineInstr &MI, unsigned OpNum, MCPhysReg PhysReg);
 
+    int getStackSpaceFor(unsigned VirtReg);
+    void spill(MachineBasicBlock::iterator Before, unsigned VirtReg,
+               MCPhysReg AssignedReg, bool Kill);
+    void reload(MachineBasicBlock::iterator Before, unsigned VirtReg,
+                MCPhysReg PhysReg);
+
     void dumpState();
   };
 
@@ -244,6 +249,46 @@ int RegAllocFast::getStackSpaceFor(unsigned VirtReg) {
   return FrameIdx;
 }
 
+/// Insert spill instruction for \p AssignedReg before \p Before. Update
+/// DBG_VALUEs with \p VirtReg operands with the stack slot.
+void RegAllocFast::spill(MachineBasicBlock::iterator Before, unsigned VirtReg,
+                         MCPhysReg AssignedReg, bool Kill) {
+  LLVM_DEBUG(dbgs() << "Spilling " << printReg(VirtReg, TRI)
+                    << " in " << printReg(AssignedReg, TRI));
+  int FI = getStackSpaceFor(VirtReg);
+  LLVM_DEBUG(dbgs() << " to stack slot #" << FI << "\n");
+
+  const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
+  TII->storeRegToStackSlot(*MBB, Before, AssignedReg, Kill, FI, &RC, TRI);
+  ++NumStores;
+
+  // If this register is used by DBG_VALUE then insert new DBG_VALUE to
+  // identify spilled location as the place to find corresponding variable's
+  // value.
+  SmallVectorImpl<MachineInstr *> &LRIDbgValues = LiveDbgValueMap[VirtReg];
+  for (MachineInstr *DBG : LRIDbgValues) {
+    MachineInstr *NewDV = buildDbgValueForSpill(*MBB, Before, *DBG, FI);
+    assert(NewDV->getParent() == MBB && "dangling parent pointer");
+    (void)NewDV;
+    LLVM_DEBUG(dbgs() << "Inserting debug info due to spill:\n" << *NewDV);
+  }
+  // Now this register is spilled there is should not be any DBG_VALUE
+  // pointing to this register because they are all pointing to spilled value
+  // now.
+  LRIDbgValues.clear();
+}
+
+/// Insert reload instruction for \p PhysReg before \p Before.
+void RegAllocFast::reload(MachineBasicBlock::iterator Before, unsigned VirtReg,
+                          MCPhysReg PhysReg) {
+  LLVM_DEBUG(dbgs() << "Reloading " << printReg(VirtReg, TRI) << " into "
+                    << printReg(PhysReg, TRI) << "\n");
+  int FI = getStackSpaceFor(VirtReg);
+  const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
+  TII->loadRegFromStackSlot(*MBB, Before, PhysReg, FI, &RC, TRI);
+  ++NumLoads;
+}
+
 /// Return true if MO is the only remaining reference to its virtual register,
 /// and it is guaranteed to be a block-local register.
 bool RegAllocFast::isLastUseOfLocalReg(const MachineOperand &MO) const {
@@ -320,31 +365,9 @@ void RegAllocFast::spillVirtReg(MachineBasicBlock::iterator MI,
     // instruction, not on the spill.
     bool SpillKill = MachineBasicBlock::iterator(LR.LastUse) != MI;
     LR.Dirty = false;
-    LLVM_DEBUG(dbgs() << "Spilling " << printReg(LRI->VirtReg, TRI) << " in "
-                      << printReg(LR.PhysReg, TRI));
-    const TargetRegisterClass &RC = *MRI->getRegClass(LRI->VirtReg);
-    int FI = getStackSpaceFor(LRI->VirtReg);
-    LLVM_DEBUG(dbgs() << " to stack slot #" << FI << "\n");
-    TII->storeRegToStackSlot(*MBB, MI, LR.PhysReg, SpillKill, FI, &RC, TRI);
-    ++NumStores;   // Update statistics
-
-    // If this register is used by DBG_VALUE then insert new DBG_VALUE to
-    // identify spilled location as the place to find corresponding variable's
-    // value.
-    SmallVectorImpl<MachineInstr *> &LRIDbgValues =
-      LiveDbgValueMap[LRI->VirtReg];
-    for (MachineInstr *DBG : LRIDbgValues) {
-      MachineInstr *NewDV = buildDbgValueForSpill(*MBB, MI, *DBG, FI);
-      assert(NewDV->getParent() == MBB && "dangling parent pointer");
-      (void)NewDV;
-      LLVM_DEBUG(dbgs() << "Inserting debug info due to spill:"
-                        << "\n"
-                        << *NewDV);
-    }
-    // Now this register is spilled there is should not be any DBG_VALUE
-    // pointing to this register because they are all pointing to spilled value
-    // now.
-    LRIDbgValues.clear();
+
+    spill(MI, LRI->VirtReg, LR.PhysReg, SpillKill);
+
     if (SpillKill)
       LR.LastUse = nullptr; // Don't kill register again
   }
@@ -653,12 +676,7 @@ RegAllocFast::LiveRegMap::iterator RegAllocFast::reloadVirtReg(MachineInstr &MI,
   MachineOperand &MO = MI.getOperand(OpNum);
   if (New) {
     LRI = allocVirtReg(MI, LRI, Hint);
-    const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
-    int FrameIndex = getStackSpaceFor(VirtReg);
-    LLVM_DEBUG(dbgs() << "Reloading " << printReg(VirtReg, TRI) << " into "
-                      << printReg(LRI->PhysReg, TRI) << "\n");
-    TII->loadRegFromStackSlot(*MBB, MI, LRI->PhysReg, FrameIndex, &RC, TRI);
-    ++NumLoads;
+    reload(MI, VirtReg, LRI->PhysReg);
   } else if (LRI->Dirty) {
     if (isLastUseOfLocalReg(MO)) {
       LLVM_DEBUG(dbgs() << "Killing last use: " << MO << "\n");
-- 
GitLab


From a1b1c95f7003d575c09a6348f30cb44696286b3d Mon Sep 17 00:00:00 2001
From: Shoaib Meenai <smeenai@fb.com>
Date: Wed, 7 Nov 2018 02:22:59 +0000
Subject: [PATCH 1044/1116] [cmake] Fix typo. NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346290 91177308-0d34-0410-b5e6-96231b3b80d8
---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c189bd875b4..6d767208edb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -855,7 +855,7 @@ if( LLVM_INCLUDE_UTILS )
 else()
   if ( LLVM_INCLUDE_TESTS )
     message(FATAL_ERROR "Including tests when not building utils will not work.
-    Either set LLVM_INCLUDE_UTILS to On, or set LLVM_INCLDE_TESTS to Off.")
+    Either set LLVM_INCLUDE_UTILS to On, or set LLVM_INCLUDE_TESTS to Off.")
   endif()
 endif()
 
-- 
GitLab


From 7fe05795840708d135416fb9d9a5b58a27e84c4b Mon Sep 17 00:00:00 2001
From: Heejin Ahn <aheejin@gmail.com>
Date: Wed, 7 Nov 2018 02:26:03 +0000
Subject: [PATCH 1045/1116] [WebAssembly] Update more test cases after
 FixFunctionBitcasts

These test updates were missing from rL346286.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346291 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Object/obj2yaml.test            | 4 +++-
 test/Object/objdump-relocations.test | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/test/Object/obj2yaml.test b/test/Object/obj2yaml.test
index d9540d4422f..46a0d773636 100644
--- a/test/Object/obj2yaml.test
+++ b/test/Object/obj2yaml.test
@@ -667,8 +667,10 @@ WASM-NEXT:         Size:            13
 WASM-NEXT:       - Index:           2
 WASM:              Name:            puts
 WASM:            - Index:           3
-WASM:              Name:            SomeOtherFunction
+WASM:              Name:            .LSomeOtherFunction_bitcast
 WASM:            - Index:           4
+WASM:              Name:            SomeOtherFunction
+WASM:            - Index:           5
 WASM:              Name:            var
 WASM:          SegmentInfo:
 WASM-NEXT:       - Index:           0
diff --git a/test/Object/objdump-relocations.test b/test/Object/objdump-relocations.test
index 24fd4a32efa..3a1793e3daf 100644
--- a/test/Object/objdump-relocations.test
+++ b/test/Object/objdump-relocations.test
@@ -62,6 +62,7 @@ ELF-MIPSEL: R_MIPS_CALL16 SomeOtherFunction
 WASM:      CODE
 WASM-NEXT: R_WEBASSEMBLY_MEMORY_ADDR_SLEB .L.str
 WASM-NEXT: R_WEBASSEMBLY_FUNCTION_INDEX_LEB puts
+WASM-NEXT: R_WEBASSEMBLY_FUNCTION_INDEX_LEB .LSomeOtherFunction_bitcast
 WASM-NEXT: R_WEBASSEMBLY_FUNCTION_INDEX_LEB SomeOtherFunction
 
 ELF-complex-x86-64: .text
-- 
GitLab


From 63cd3e5cd361ce074783fc0df241ecc6dc547dca Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Wed, 7 Nov 2018 03:02:11 +0000
Subject: [PATCH 1046/1116] [llvm-strip] Check "strip" with StringRef::contains
 instead of ends_with

Summary: If argv[0] is version suffixed, e.g. llvm-strip-7, this will still work.

Reviewers: rupprecht, jhenderson, alexshap, jakehehrlich

Reviewed By: rupprecht

Subscribers: alexshap, jakehehrlich, llvm-commits

Differential Revision: https://reviews.llvm.org/D54193

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346292 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-objcopy/llvm-objcopy.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/llvm-objcopy/llvm-objcopy.cpp b/tools/llvm-objcopy/llvm-objcopy.cpp
index a033aaecb98..755c786cee9 100644
--- a/tools/llvm-objcopy/llvm-objcopy.cpp
+++ b/tools/llvm-objcopy/llvm-objcopy.cpp
@@ -219,7 +219,7 @@ int main(int argc, char **argv) {
   InitLLVM X(argc, argv);
   ToolName = argv[0];
   DriverConfig DriverConfig;
-  if (sys::path::stem(ToolName).endswith_lower("strip"))
+  if (sys::path::stem(ToolName).contains("strip"))
     DriverConfig = parseStripOptions(makeArrayRef(argv + 1, argc));
   else
     DriverConfig = parseObjcopyOptions(makeArrayRef(argv + 1, argc));
-- 
GitLab


From 8fa10fd67fbc6912188ab1a6453348b7cbe7289b Mon Sep 17 00:00:00 2001
From: Dean Michael Berris <dberris@google.com>
Date: Wed, 7 Nov 2018 04:37:42 +0000
Subject: [PATCH 1047/1116] [XRay] Use TSC delta encoding for custom/typed
 events

Summary:
This change updates the version number for FDR logs to 5, and update the
trace processing to support changes in the custom event records.

In the runtime, since we're already writing down the record preamble to
handle CPU migrations and TSC wraparound, we can use the same TSC delta
encoding in the custom event and typed event records that we use in
function event records. We do the same change to typed events (which
were unsupported before this change in the trace processing) which now
show up in the trace.

Future changes should increase our testing coverage to make custom and
typed events as first class entities in the FDR mode log processing
tools.

This change is also a good example of how we end up supporting new
record types in the FDR mode implementation. This shows the places where
new record types are added and supported.

Depends on D54139.

Reviewers: mboerger

Subscribers: hiraditya, arphaman, jfb, llvm-commits

Differential Revision: https://reviews.llvm.org/D54140

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346293 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/XRay/BlockIndexer.h     |  2 +
 include/llvm/XRay/BlockPrinter.h     |  2 +
 include/llvm/XRay/BlockVerifier.h    |  3 +
 include/llvm/XRay/FDRRecords.h       | 53 ++++++++++++++-
 include/llvm/XRay/FDRTraceExpander.h |  2 +
 include/llvm/XRay/FDRTraceWriter.h   |  2 +
 include/llvm/XRay/RecordPrinter.h    |  2 +
 lib/XRay/BlockIndexer.cpp            | 10 +++
 lib/XRay/BlockPrinter.cpp            | 18 +++++
 lib/XRay/BlockVerifier.cpp           | 28 ++++++--
 lib/XRay/FDRRecordProducer.cpp       |  5 +-
 lib/XRay/FDRRecords.cpp              |  2 +
 lib/XRay/FDRTraceExpander.cpp        | 33 ++++++++++
 lib/XRay/FDRTraceWriter.cpp          | 18 +++++
 lib/XRay/FileHeaderReader.cpp        |  4 --
 lib/XRay/RecordInitializer.cpp       | 99 ++++++++++++++++++++++++++++
 lib/XRay/RecordPrinter.cpp           | 15 +++++
 lib/XRay/Trace.cpp                   | 13 +++-
 unittests/XRay/FDRRecordsTest.cpp    | 12 ++++
 19 files changed, 310 insertions(+), 13 deletions(-)

diff --git a/include/llvm/XRay/BlockIndexer.h b/include/llvm/XRay/BlockIndexer.h
index 46a7243685f..b42fa17f3fb 100644
--- a/include/llvm/XRay/BlockIndexer.h
+++ b/include/llvm/XRay/BlockIndexer.h
@@ -54,6 +54,8 @@ public:
   Error visit(NewBufferRecord &) override;
   Error visit(EndBufferRecord &) override;
   Error visit(FunctionRecord &) override;
+  Error visit(CustomEventRecordV5 &) override;
+  Error visit(TypedEventRecord &) override;
 
   /// The flush() function will clear out the current state of the visitor, to
   /// allow for explicitly flushing a block's records to the currently
diff --git a/include/llvm/XRay/BlockPrinter.h b/include/llvm/XRay/BlockPrinter.h
index 3a8f6e0d35e..bfb21e23951 100644
--- a/include/llvm/XRay/BlockPrinter.h
+++ b/include/llvm/XRay/BlockPrinter.h
@@ -50,6 +50,8 @@ public:
   Error visit(NewBufferRecord &) override;
   Error visit(EndBufferRecord &) override;
   Error visit(FunctionRecord &) override;
+  Error visit(CustomEventRecordV5 &) override;
+  Error visit(TypedEventRecord &) override;
 
   void reset() { CurrentState = State::Start; }
 };
diff --git a/include/llvm/XRay/BlockVerifier.h b/include/llvm/XRay/BlockVerifier.h
index b43a435e93b..46371c13891 100644
--- a/include/llvm/XRay/BlockVerifier.h
+++ b/include/llvm/XRay/BlockVerifier.h
@@ -33,6 +33,7 @@ public:
     NewCPUId,
     TSCWrap,
     CustomEvent,
+    TypedEvent,
     Function,
     CallArg,
     EndOfBuffer,
@@ -58,6 +59,8 @@ public:
   Error visit(NewBufferRecord &) override;
   Error visit(EndBufferRecord &) override;
   Error visit(FunctionRecord &) override;
+  Error visit(CustomEventRecordV5 &) override;
+  Error visit(TypedEventRecord &) override;
 
   Error verify();
   void reset();
diff --git a/include/llvm/XRay/FDRRecords.h b/include/llvm/XRay/FDRRecords.h
index 2d47ab3cfe5..9d48332d508 100644
--- a/include/llvm/XRay/FDRRecords.h
+++ b/include/llvm/XRay/FDRRecords.h
@@ -66,6 +66,7 @@ public:
     PIDEntry,
     NewBuffer,
     EndOfBuffer,
+    TypedEvent,
   };
 
   Type type() const override { return Type::Metadata; }
@@ -174,6 +175,52 @@ public:
   Error apply(RecordVisitor &V) override;
 };
 
+class CustomEventRecordV5 : public MetadataRecord {
+  int32_t Size = 0;
+  int32_t Delta = 0;
+  std::string Data{};
+  friend class RecordInitializer;
+
+public:
+  CustomEventRecordV5() = default;
+  explicit CustomEventRecordV5(int32_t S, int32_t D, std::string P)
+      : MetadataRecord(), Size(S), Delta(D), Data(std::move(P)) {}
+
+  MetadataType metadataType() const override {
+    return MetadataType::CustomEvent;
+  }
+
+  int32_t size() const { return Size; }
+  int32_t delta() const { return Delta; }
+  StringRef data() const { return Data; }
+
+  Error apply(RecordVisitor &V) override;
+};
+
+class TypedEventRecord : public MetadataRecord {
+  int32_t Size = 0;
+  int32_t Delta = 0;
+  uint16_t EventType = 0;
+  std::string Data{};
+  friend class RecordInitializer;
+
+public:
+  TypedEventRecord() = default;
+  explicit TypedEventRecord(int32_t S, int32_t D, uint16_t E, std::string P)
+      : MetadataRecord(), Size(S), Delta(D), Data(std::move(P)) {}
+
+  MetadataType metadataType() const override {
+    return MetadataType::TypedEvent;
+  }
+
+  int32_t size() const { return Size; }
+  int32_t delta() const { return Delta; }
+  uint16_t eventType() const { return EventType; }
+  StringRef data() const { return Data; }
+
+  Error apply(RecordVisitor &V) override;
+};
+
 class CallArgRecord : public MetadataRecord {
   uint64_t Arg;
   friend class RecordInitializer;
@@ -269,6 +316,8 @@ public:
   virtual Error visit(NewBufferRecord &) = 0;
   virtual Error visit(EndBufferRecord &) = 0;
   virtual Error visit(FunctionRecord &) = 0;
+  virtual Error visit(CustomEventRecordV5 &) = 0;
+  virtual Error visit(TypedEventRecord &) = 0;
 };
 
 class RecordInitializer : public RecordVisitor {
@@ -277,7 +326,7 @@ class RecordInitializer : public RecordVisitor {
   uint16_t Version;
 
 public:
-  static constexpr uint16_t DefaultVersion = 4u;
+  static constexpr uint16_t DefaultVersion = 5u;
 
   explicit RecordInitializer(DataExtractor &DE, uint32_t &OP, uint16_t V)
       : RecordVisitor(), E(DE), OffsetPtr(OP), Version(V) {}
@@ -295,6 +344,8 @@ public:
   Error visit(NewBufferRecord &) override;
   Error visit(EndBufferRecord &) override;
   Error visit(FunctionRecord &) override;
+  Error visit(CustomEventRecordV5 &) override;
+  Error visit(TypedEventRecord &) override;
 };
 
 } // namespace xray
diff --git a/include/llvm/XRay/FDRTraceExpander.h b/include/llvm/XRay/FDRTraceExpander.h
index 64c459930b2..02a21bed5ce 100644
--- a/include/llvm/XRay/FDRTraceExpander.h
+++ b/include/llvm/XRay/FDRTraceExpander.h
@@ -49,6 +49,8 @@ public:
   Error visit(NewBufferRecord &) override;
   Error visit(EndBufferRecord &) override;
   Error visit(FunctionRecord &) override;
+  Error visit(CustomEventRecordV5 &) override;
+  Error visit(TypedEventRecord &) override;
 
   // Must be called after all the records have been processed, to handle the
   // most recent record generated.
diff --git a/include/llvm/XRay/FDRTraceWriter.h b/include/llvm/XRay/FDRTraceWriter.h
index 91488f89ecc..7b3b5fa25ef 100644
--- a/include/llvm/XRay/FDRTraceWriter.h
+++ b/include/llvm/XRay/FDRTraceWriter.h
@@ -43,6 +43,8 @@ public:
   Error visit(NewBufferRecord &) override;
   Error visit(EndBufferRecord &) override;
   Error visit(FunctionRecord &) override;
+  Error visit(CustomEventRecordV5 &) override;
+  Error visit(TypedEventRecord &) override;
 
 private:
   support::endian::Writer OS;
diff --git a/include/llvm/XRay/RecordPrinter.h b/include/llvm/XRay/RecordPrinter.h
index bad1a5742b4..649c64ab6f5 100644
--- a/include/llvm/XRay/RecordPrinter.h
+++ b/include/llvm/XRay/RecordPrinter.h
@@ -40,6 +40,8 @@ public:
   Error visit(NewBufferRecord &) override;
   Error visit(EndBufferRecord &) override;
   Error visit(FunctionRecord &) override;
+  Error visit(CustomEventRecordV5 &) override;
+  Error visit(TypedEventRecord &) override;
 };
 
 } // namespace xray
diff --git a/lib/XRay/BlockIndexer.cpp b/lib/XRay/BlockIndexer.cpp
index 98e91f7de54..4dbe2d2717a 100644
--- a/lib/XRay/BlockIndexer.cpp
+++ b/lib/XRay/BlockIndexer.cpp
@@ -39,6 +39,16 @@ Error BlockIndexer::visit(CustomEventRecord &R) {
   return Error::success();
 }
 
+Error BlockIndexer::visit(CustomEventRecordV5 &R) {
+  CurrentBlock.Records.push_back(&R);
+  return Error::success();
+}
+
+Error BlockIndexer::visit(TypedEventRecord &R) {
+  CurrentBlock.Records.push_back(&R);
+  return Error::success();
+}
+
 Error BlockIndexer::visit(CallArgRecord &R) {
   CurrentBlock.Records.push_back(&R);
   return Error::success();
diff --git a/lib/XRay/BlockPrinter.cpp b/lib/XRay/BlockPrinter.cpp
index c8b65fc12d7..0acebee0cbd 100644
--- a/lib/XRay/BlockPrinter.cpp
+++ b/lib/XRay/BlockPrinter.cpp
@@ -68,6 +68,24 @@ Error BlockPrinter::visit(CustomEventRecord &R) {
   return E;
 }
 
+Error BlockPrinter::visit(CustomEventRecordV5 &R) {
+  if (CurrentState == State::Metadata)
+    OS << "\n";
+  CurrentState = State::CustomEvent;
+  OS << "*  ";
+  auto E = RP.visit(R);
+  return E;
+}
+
+Error BlockPrinter::visit(TypedEventRecord &R) {
+  if (CurrentState == State::Metadata)
+    OS << "\n";
+  CurrentState = State::CustomEvent;
+  OS << "*  ";
+  auto E = RP.visit(R);
+  return E;
+}
+
 // Function call printing.
 Error BlockPrinter::visit(FunctionRecord &R) {
   if (CurrentState == State::Metadata)
diff --git a/lib/XRay/BlockVerifier.cpp b/lib/XRay/BlockVerifier.cpp
index 62be1a87ab5..5e949ec4e46 100644
--- a/lib/XRay/BlockVerifier.cpp
+++ b/lib/XRay/BlockVerifier.cpp
@@ -43,6 +43,8 @@ StringRef recordToString(BlockVerifier::State R) {
     return "CallArg";
   case BlockVerifier::State::EndOfBuffer:
     return "EndOfBuffer";
+  case BlockVerifier::State::TypedEvent:
+    return "TypedEvent";
   case BlockVerifier::State::StateMax:
   case BlockVerifier::State::Unknown:
     return "Unknown";
@@ -75,27 +77,34 @@ Error BlockVerifier::transition(State To) {
                        {State::NewCPUId,
                         {mask(State::NewCPUId) | mask(State::TSCWrap) |
                          mask(State::CustomEvent) | mask(State::Function) |
-                         mask(State::EndOfBuffer)}},
+                         mask(State::EndOfBuffer) | mask(State::TypedEvent)}},
 
                        {State::TSCWrap,
                         {mask(State::TSCWrap) | mask(State::NewCPUId) |
                          mask(State::CustomEvent) | mask(State::Function) |
-                         mask(State::EndOfBuffer)}},
+                         mask(State::EndOfBuffer) | mask(State::TypedEvent)}},
 
                        {State::CustomEvent,
                         {mask(State::CustomEvent) | mask(State::TSCWrap) |
                          mask(State::NewCPUId) | mask(State::Function) |
-                         mask(State::EndOfBuffer)}},
+                         mask(State::EndOfBuffer) | mask(State::TypedEvent)}},
+
+                       {State::TypedEvent,
+                        {mask(State::TypedEvent) | mask(State::TSCWrap) |
+                         mask(State::NewCPUId) | mask(State::Function) |
+                         mask(State::EndOfBuffer) | mask(State::CustomEvent)}},
 
                        {State::Function,
                         {mask(State::Function) | mask(State::TSCWrap) |
                          mask(State::NewCPUId) | mask(State::CustomEvent) |
-                         mask(State::CallArg) | mask(State::EndOfBuffer)}},
+                         mask(State::CallArg) | mask(State::EndOfBuffer) |
+                         mask(State::TypedEvent)}},
 
                        {State::CallArg,
                         {mask(State::CallArg) | mask(State::Function) |
                          mask(State::TSCWrap) | mask(State::NewCPUId) |
-                         mask(State::CustomEvent) | mask(State::EndOfBuffer)}},
+                         mask(State::CustomEvent) | mask(State::EndOfBuffer) |
+                         mask(State::TypedEvent)}},
 
                        {State::EndOfBuffer, {}}}};
 
@@ -145,6 +154,14 @@ Error BlockVerifier::visit(CustomEventRecord &) {
   return transition(State::CustomEvent);
 }
 
+Error BlockVerifier::visit(CustomEventRecordV5 &) {
+  return transition(State::CustomEvent);
+}
+
+Error BlockVerifier::visit(TypedEventRecord &) {
+  return transition(State::TypedEvent);
+}
+
 Error BlockVerifier::visit(CallArgRecord &) {
   return transition(State::CallArg);
 }
@@ -169,6 +186,7 @@ Error BlockVerifier::verify() {
   case State::EndOfBuffer:
   case State::NewCPUId:
   case State::CustomEvent:
+  case State::TypedEvent:
   case State::Function:
   case State::CallArg:
   case State::TSCWrap:
diff --git a/lib/XRay/FDRRecordProducer.cpp b/lib/XRay/FDRRecordProducer.cpp
index 59b5697cd64..122578010c4 100644
--- a/lib/XRay/FDRRecordProducer.cpp
+++ b/lib/XRay/FDRRecordProducer.cpp
@@ -53,14 +53,15 @@ metadataRecordType(const XRayFileHeader &Header, uint8_t T) {
   case MetadataRecordKinds::WalltimeMarkerKind:
     return make_unique<WallclockRecord>();
   case MetadataRecordKinds::CustomEventMarkerKind:
+    if (Header.Version >= 5)
+      return make_unique<CustomEventRecordV5>();
     return make_unique<CustomEventRecord>();
   case MetadataRecordKinds::CallArgumentKind:
     return make_unique<CallArgRecord>();
   case MetadataRecordKinds::BufferExtentsKind:
     return make_unique<BufferExtents>();
   case MetadataRecordKinds::TypedEventMarkerKind:
-    return createStringError(std::make_error_code(std::errc::invalid_argument),
-                             "Encountered an unsupported TypedEventMarker.");
+    return make_unique<TypedEventRecord>();
   case MetadataRecordKinds::PidKind:
     return make_unique<PIDRecord>();
   case MetadataRecordKinds::EnumEndMarker:
diff --git a/lib/XRay/FDRRecords.cpp b/lib/XRay/FDRRecords.cpp
index 66d17ffcb53..2b68a73686f 100644
--- a/lib/XRay/FDRRecords.cpp
+++ b/lib/XRay/FDRRecords.cpp
@@ -26,6 +26,8 @@ Error PIDRecord::apply(RecordVisitor &V) { return V.visit(*this); }
 Error NewBufferRecord::apply(RecordVisitor &V) { return V.visit(*this); }
 Error EndBufferRecord::apply(RecordVisitor &V) { return V.visit(*this); }
 Error FunctionRecord::apply(RecordVisitor &V) { return V.visit(*this); }
+Error CustomEventRecordV5::apply(RecordVisitor &V) { return V.visit(*this); }
+Error TypedEventRecord::apply(RecordVisitor &V) { return V.visit(*this); }
 
 } // namespace xray
 } // namespace llvm
diff --git a/lib/XRay/FDRTraceExpander.cpp b/lib/XRay/FDRTraceExpander.cpp
index adddb550ecd..e67f4b5d89f 100644
--- a/lib/XRay/FDRTraceExpander.cpp
+++ b/lib/XRay/FDRTraceExpander.cpp
@@ -52,6 +52,39 @@ Error TraceExpander::visit(CustomEventRecord &R) {
   return Error::success();
 }
 
+Error TraceExpander::visit(CustomEventRecordV5 &R) {
+  resetCurrentRecord();
+  if (!IgnoringRecords) {
+    BaseTSC += R.delta();
+    CurrentRecord.TSC = BaseTSC;
+    CurrentRecord.CPU = CPUId;
+    CurrentRecord.PId = PID;
+    CurrentRecord.TId = TID;
+    CurrentRecord.Type = RecordTypes::CUSTOM_EVENT;
+    std::copy(R.data().begin(), R.data().end(),
+              std::back_inserter(CurrentRecord.Data));
+    BuildingRecord = true;
+  }
+  return Error::success();
+}
+
+Error TraceExpander::visit(TypedEventRecord &R) {
+  resetCurrentRecord();
+  if (!IgnoringRecords) {
+    BaseTSC += R.delta();
+    CurrentRecord.TSC = BaseTSC;
+    CurrentRecord.CPU = CPUId;
+    CurrentRecord.PId = PID;
+    CurrentRecord.TId = TID;
+    CurrentRecord.RecordType = R.eventType();
+    CurrentRecord.Type = RecordTypes::TYPED_EVENT;
+    std::copy(R.data().begin(), R.data().end(),
+              std::back_inserter(CurrentRecord.Data));
+    BuildingRecord = true;
+  }
+  return Error::success();
+}
+
 Error TraceExpander::visit(CallArgRecord &R) {
   CurrentRecord.CallArgs.push_back(R.arg());
   CurrentRecord.Type = RecordTypes::ENTER_ARG;
diff --git a/lib/XRay/FDRTraceWriter.cpp b/lib/XRay/FDRTraceWriter.cpp
index 4f40593cba0..d5f96979986 100644
--- a/lib/XRay/FDRTraceWriter.cpp
+++ b/lib/XRay/FDRTraceWriter.cpp
@@ -102,6 +102,24 @@ Error FDRTraceWriter::visit(CustomEventRecord &R) {
   return Error::success();
 }
 
+Error FDRTraceWriter::visit(CustomEventRecordV5 &R) {
+  if (auto E = writeMetadata<5u>(OS, R.size(), R.delta()))
+    return E;
+  auto D = R.data();
+  ArrayRef<char> Bytes(D.data(), D.size());
+  OS.write(Bytes);
+  return Error::success();
+}
+
+Error FDRTraceWriter::visit(TypedEventRecord &R) {
+  if (auto E = writeMetadata<7u>(OS, R.size(), R.delta(), R.eventType()))
+    return E;
+  auto D = R.data();
+  ArrayRef<char> Bytes(D.data(), D.size());
+  OS.write(Bytes);
+  return Error::success();
+}
+
 Error FDRTraceWriter::visit(CallArgRecord &R) {
   return writeMetadata<6u>(OS, R.arg());
 }
diff --git a/lib/XRay/FileHeaderReader.cpp b/lib/XRay/FileHeaderReader.cpp
index 9dea217840b..0b3fb8b6f69 100644
--- a/lib/XRay/FileHeaderReader.cpp
+++ b/lib/XRay/FileHeaderReader.cpp
@@ -63,10 +63,6 @@ Expected<XRayFileHeader> readBinaryFormatHeader(DataExtractor &HeaderExtractor,
   // Manually advance the offset pointer 16 bytes, after getting a raw memcpy
   // from the underlying data.
   OffsetPtr += 16;
-  if (FileHeader.Version < 1 || FileHeader.Version > 4)
-    return createStringError(std::make_error_code(std::errc::invalid_argument),
-                             "Unsupported XRay file version: %d at offset %d",
-                             FileHeader.Version, OffsetPtr);
   return std::move(FileHeader);
 }
 
diff --git a/lib/XRay/RecordInitializer.cpp b/lib/XRay/RecordInitializer.cpp
index 2ebaa1cec26..cc9dd460949 100644
--- a/lib/XRay/RecordInitializer.cpp
+++ b/lib/XRay/RecordInitializer.cpp
@@ -151,6 +151,105 @@ Error RecordInitializer::visit(CustomEventRecord &R) {
   return Error::success();
 }
 
+Error RecordInitializer::visit(CustomEventRecordV5 &R) {
+  if (!E.isValidOffsetForDataOfSize(OffsetPtr,
+                                    MetadataRecord::kMetadataBodySize))
+    return createStringError(std::make_error_code(std::errc::bad_address),
+                             "Invalid offset for a custom event record (%d).",
+                             OffsetPtr);
+
+  auto BeginOffset = OffsetPtr;
+  auto PreReadOffset = OffsetPtr;
+
+  R.Size = E.getSigned(&OffsetPtr, sizeof(int32_t));
+  if (PreReadOffset == OffsetPtr)
+    return createStringError(
+        std::make_error_code(std::errc::invalid_argument),
+        "Cannot read a custom event record size field offset %d.", OffsetPtr);
+
+  PreReadOffset = OffsetPtr;
+  R.Delta = E.getSigned(&OffsetPtr, sizeof(int32_t));
+  if (PreReadOffset == OffsetPtr)
+    return createStringError(
+        std::make_error_code(std::errc::invalid_argument),
+        "Cannot read a custom event record TSC delta field at offset %d.",
+        OffsetPtr);
+
+  assert(OffsetPtr > BeginOffset &&
+         OffsetPtr - BeginOffset <= MetadataRecord::kMetadataBodySize);
+  OffsetPtr += MetadataRecord::kMetadataBodySize - (OffsetPtr - BeginOffset);
+
+  // Next we read in a fixed chunk of data from the given offset.
+  if (!E.isValidOffsetForDataOfSize(OffsetPtr, R.Size))
+    return createStringError(
+        std::make_error_code(std::errc::bad_address),
+        "Cannot read %d bytes of custom event data from offset %d.", R.Size,
+        OffsetPtr);
+
+  std::vector<uint8_t> Buffer;
+  Buffer.resize(R.Size);
+  if (E.getU8(&OffsetPtr, Buffer.data(), R.Size) != Buffer.data())
+    return createStringError(
+        std::make_error_code(std::errc::invalid_argument),
+        "Failed reading data into buffer of size %d at offset %d.", R.Size,
+        OffsetPtr);
+  R.Data.assign(Buffer.begin(), Buffer.end());
+  return Error::success();
+}
+
+Error RecordInitializer::visit(TypedEventRecord &R) {
+  if (!E.isValidOffsetForDataOfSize(OffsetPtr,
+                                    MetadataRecord::kMetadataBodySize))
+    return createStringError(std::make_error_code(std::errc::bad_address),
+                             "Invalid offset for a typed event record (%d).",
+                             OffsetPtr);
+
+  auto BeginOffset = OffsetPtr;
+  auto PreReadOffset = OffsetPtr;
+
+  R.Size = E.getSigned(&OffsetPtr, sizeof(int32_t));
+  if (PreReadOffset == OffsetPtr)
+    return createStringError(
+        std::make_error_code(std::errc::invalid_argument),
+        "Cannot read a typed event record size field offset %d.", OffsetPtr);
+
+  PreReadOffset = OffsetPtr;
+  R.Delta = E.getSigned(&OffsetPtr, sizeof(int32_t));
+  if (PreReadOffset == OffsetPtr)
+    return createStringError(
+        std::make_error_code(std::errc::invalid_argument),
+        "Cannot read a typed event record TSC delta field at offset %d.",
+        OffsetPtr);
+
+  PreReadOffset = OffsetPtr;
+  R.EventType = E.getU16(&OffsetPtr);
+  if (PreReadOffset == OffsetPtr)
+    return createStringError(
+        std::make_error_code(std::errc::invalid_argument),
+        "Cannot read a typed event record type field at offset %d.", OffsetPtr);
+
+  assert(OffsetPtr > BeginOffset &&
+         OffsetPtr - BeginOffset <= MetadataRecord::kMetadataBodySize);
+  OffsetPtr += MetadataRecord::kMetadataBodySize - (OffsetPtr - BeginOffset);
+
+  // Next we read in a fixed chunk of data from the given offset.
+  if (!E.isValidOffsetForDataOfSize(OffsetPtr, R.Size))
+    return createStringError(
+        std::make_error_code(std::errc::bad_address),
+        "Cannot read %d bytes of custom event data from offset %d.", R.Size,
+        OffsetPtr);
+
+  std::vector<uint8_t> Buffer;
+  Buffer.resize(R.Size);
+  if (E.getU8(&OffsetPtr, Buffer.data(), R.Size) != Buffer.data())
+    return createStringError(
+        std::make_error_code(std::errc::invalid_argument),
+        "Failed reading data into buffer of size %d at offset %d.", R.Size,
+        OffsetPtr);
+  R.Data.assign(Buffer.begin(), Buffer.end());
+  return Error::success();
+}
+
 Error RecordInitializer::visit(CallArgRecord &R) {
   if (!E.isValidOffsetForDataOfSize(OffsetPtr,
                                     MetadataRecord::kMetadataBodySize))
diff --git a/lib/XRay/RecordPrinter.cpp b/lib/XRay/RecordPrinter.cpp
index 61a292cef85..71ea7d0e969 100644
--- a/lib/XRay/RecordPrinter.cpp
+++ b/lib/XRay/RecordPrinter.cpp
@@ -42,6 +42,21 @@ Error RecordPrinter::visit(CustomEventRecord &R) {
   return Error::success();
 }
 
+Error RecordPrinter::visit(CustomEventRecordV5 &R) {
+  OS << formatv("<Custom Event: delta = +{0}, size = {1}, data = '{2}'>",
+                R.delta(), R.size(), R.data())
+     << Delim;
+  return Error::success();
+}
+
+Error RecordPrinter::visit(TypedEventRecord &R) {
+  OS << formatv(
+            "<Typed Event: delta = +{0}, type = {1}, size = {2}, data = '{3}'",
+            R.delta(), R.eventType(), R.size(), R.data())
+     << Delim;
+  return Error::success();
+}
+
 Error RecordPrinter::visit(CallArgRecord &R) {
   OS << formatv("<Call Argument: data = {0} (hex = {0:x})>", R.arg()) << Delim;
   return Error::success();
diff --git a/lib/XRay/Trace.cpp b/lib/XRay/Trace.cpp
index 37cd147078e..4f28f3f754c 100644
--- a/lib/XRay/Trace.cpp
+++ b/lib/XRay/Trace.cpp
@@ -247,6 +247,17 @@ Error loadNaiveFormatLog(StringRef Data, bool IsLittleEndian,
 /// ThreadBuffer: BufferExtents NewBuffer WallClockTime Pid NewCPUId
 ///               FunctionSequence
 /// EOB: *deprecated*
+///
+/// In Version 4, we make the following changes:
+///
+/// CustomEventRecord now includes the CPU data.
+///
+/// In Version 5, we make the following changes:
+///
+/// CustomEventRecord and TypedEventRecord now use TSC delta encoding similar to
+/// what FunctionRecord instances use, and we no longer need to include the CPU
+/// id in the CustomEventRecord.
+///
 Error loadFDRLog(StringRef Data, bool IsLittleEndian,
                  XRayFileHeader &FileHeader, std::vector<XRayRecord> &Records) {
 
@@ -435,7 +446,7 @@ Expected<Trace> llvm::xray::loadTrace(const DataExtractor &DE, bool Sort) {
     }
     break;
   case FLIGHT_DATA_RECORDER_FORMAT:
-    if (Version >= 1 && Version <= 4) {
+    if (Version >= 1 && Version <= 5) {
       if (auto E = loadFDRLog(DE.getData(), DE.isLittleEndian(), T.FileHeader,
                               T.Records))
         return std::move(E);
diff --git a/unittests/XRay/FDRRecordsTest.cpp b/unittests/XRay/FDRRecordsTest.cpp
index 1cce1c2b2c1..86b478a5a45 100644
--- a/unittests/XRay/FDRRecordsTest.cpp
+++ b/unittests/XRay/FDRRecordsTest.cpp
@@ -34,6 +34,8 @@ TEST(XRayFDRTest, BuilderAndBlockIndexer) {
                     .add<PIDRecord>(1)
                     .add<FunctionRecord>(RecordTypes::ENTER, 1, 1)
                     .add<FunctionRecord>(RecordTypes::EXIT, 1, 100)
+                    .add<CustomEventRecordV5>(1, 4, "XRAY")
+                    .add<TypedEventRecord>(1, 4, 2, "XRAY")
                     .consume();
   auto Block1 = LogBuilder()
                     .add<BufferExtents>(100)
@@ -42,6 +44,8 @@ TEST(XRayFDRTest, BuilderAndBlockIndexer) {
                     .add<PIDRecord>(1)
                     .add<FunctionRecord>(RecordTypes::ENTER, 1, 1)
                     .add<FunctionRecord>(RecordTypes::EXIT, 1, 100)
+                    .add<CustomEventRecordV5>(1, 4, "XRAY")
+                    .add<TypedEventRecord>(1, 4, 2, "XRAY")
                     .consume();
   auto Block2 = LogBuilder()
                     .add<BufferExtents>(100)
@@ -50,6 +54,8 @@ TEST(XRayFDRTest, BuilderAndBlockIndexer) {
                     .add<PIDRecord>(1)
                     .add<FunctionRecord>(RecordTypes::ENTER, 1, 1)
                     .add<FunctionRecord>(RecordTypes::EXIT, 1, 100)
+                    .add<CustomEventRecordV5>(1, 4, "XRAY")
+                    .add<TypedEventRecord>(1, 4, 2, "XRAY")
                     .consume();
   BlockIndexer::Index Index;
   BlockIndexer Indexer(Index);
@@ -92,6 +98,8 @@ TEST(XRayFDRTest, IndexAndVerifyBlocks) {
                     .add<NewCPUIDRecord>(1, 2)
                     .add<FunctionRecord>(RecordTypes::ENTER, 1, 1)
                     .add<FunctionRecord>(RecordTypes::EXIT, 1, 100)
+                    .add<CustomEventRecordV5>(1, 4, "XRAY")
+                    .add<TypedEventRecord>(1, 4, 2, "XRAY")
                     .consume();
   auto Block1 = LogBuilder()
                     .add<BufferExtents>(64)
@@ -101,6 +109,8 @@ TEST(XRayFDRTest, IndexAndVerifyBlocks) {
                     .add<NewCPUIDRecord>(1, 2)
                     .add<FunctionRecord>(RecordTypes::ENTER, 1, 1)
                     .add<FunctionRecord>(RecordTypes::EXIT, 1, 100)
+                    .add<CustomEventRecordV5>(1, 4, "XRAY")
+                    .add<TypedEventRecord>(1, 4, 2, "XRAY")
                     .consume();
   auto Block2 = LogBuilder()
                     .add<BufferExtents>(64)
@@ -110,6 +120,8 @@ TEST(XRayFDRTest, IndexAndVerifyBlocks) {
                     .add<NewCPUIDRecord>(1, 2)
                     .add<FunctionRecord>(RecordTypes::ENTER, 1, 1)
                     .add<FunctionRecord>(RecordTypes::EXIT, 1, 100)
+                    .add<CustomEventRecordV5>(1, 4, "XRAY")
+                    .add<TypedEventRecord>(1, 4, 2, "XRAY")
                     .consume();
 
   // First, index the records in different blocks.
-- 
GitLab


From 2e8ff631d6999258f3503c72ef8d9d69c7bd35b8 Mon Sep 17 00:00:00 2001
From: Max Kazantsev <max.kazantsev@azul.com>
Date: Wed, 7 Nov 2018 05:58:10 +0000
Subject: [PATCH 1048/1116] [NFC] Add missing test case, some test renaming

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346295 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../LoopSimplifyCFG/constant-fold-branch.ll   | 47 +++++++++++++++++--
 1 file changed, 43 insertions(+), 4 deletions(-)

diff --git a/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll b/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
index 69d79d5a6f1..44f1c0bcd88 100644
--- a/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
+++ b/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
@@ -42,6 +42,45 @@ exit:
   ret i32 %i.2
 }
 
+; Make sure that we can eliminate a provably dead backedge with switch.
+define i32 @dead_backedge_test_switch_loop(i32 %end) {
+; CHECK-LABEL: @dead_backedge_test_switch_loop(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_BE:%.*]], [[HEADER_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    [[I_1:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[I_1]], 100
+; CHECK-NEXT:    br i1 [[CMP1]], label [[HEADER_BACKEDGE]], label [[DEAD_BACKEDGE:%.*]]
+; CHECK:       header.backedge:
+; CHECK-NEXT:    [[I_BE]] = phi i32 [ [[I_1]], [[HEADER]] ], [ [[I_2:%.*]], [[DEAD_BACKEDGE]] ]
+; CHECK-NEXT:    br label [[HEADER]]
+; CHECK:       dead_backedge:
+; CHECK-NEXT:    [[I_2]] = add i32 [[I_1]], 10
+; CHECK-NEXT:    switch i32 1, label [[EXIT:%.*]] [
+; CHECK-NEXT:    i32 0, label [[HEADER_BACKEDGE]]
+; CHECK-NEXT:    ]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_2_LCSSA:%.*]] = phi i32 [ [[I_2]], [[DEAD_BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_2_LCSSA]]
+;
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.1, %header], [%i.2, %dead_backedge]
+  %i.1 = add i32 %i, 1
+  %cmp1 = icmp slt i32 %i.1, 100
+  br i1 %cmp1, label %header, label %dead_backedge
+
+dead_backedge:
+  %i.2 = add i32 %i.1, 10
+  switch i32 1, label %exit [i32 0, label %header]
+
+exit:
+  ret i32 %i.2
+}
+
 ; Check that we can eliminate a triangle.
 define i32 @dead_block_test_branch_loop(i32 %end) {
 ; CHECK-LABEL: @dead_block_test_branch_loop(
@@ -660,8 +699,8 @@ exit:
 }
 
 ; Check that when the block is not actually dead, we don't remove it.
-define i32 @no_live_block_test_branch_loop(i1 %c, i32 %end) {
-; CHECK-LABEL: @no_live_block_test_branch_loop(
+define i32 @live_block_test_branch_loop(i1 %c, i32 %end) {
+; CHECK-LABEL: @live_block_test_branch_loop(
 ; CHECK-NEXT:  preheader:
 ; CHECK-NEXT:    br label [[HEADER:%.*]]
 ; CHECK:       header:
@@ -705,8 +744,8 @@ exit:
   ret i32 %i.inc
 }
 
-define i32 @no_live_block_test_switch_loop(i1 %c, i32 %end) {
-; CHECK-LABEL: @no_live_block_test_switch_loop(
+define i32 @live_block_test_switch_loop(i1 %c, i32 %end) {
+; CHECK-LABEL: @live_block_test_switch_loop(
 ; CHECK-NEXT:  preheader:
 ; CHECK-NEXT:    br label [[HEADER:%.*]]
 ; CHECK:       header:
-- 
GitLab


From 2c10712291f9d0c185ce8082270187d4f72c4c56 Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze@braunis.de>
Date: Wed, 7 Nov 2018 06:57:00 +0000
Subject: [PATCH 1049/1116] RegAllocFast: Refactor PhysRegState usage; NFC

This is in preparation of https://reviews.llvm.org/D52010.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346296 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/RegAllocFast.cpp | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp
index 242e952bb24..1cd5caba67b 100644
--- a/lib/CodeGen/RegAllocFast.cpp
+++ b/lib/CodeGen/RegAllocFast.cpp
@@ -133,6 +133,8 @@ namespace {
     /// cannot be allocated.
     RegUnitSet UsedInInstr;
 
+    void setPhysRegState(MCPhysReg PhysReg, unsigned NewState);
+
     /// Mark a physreg as used in this instruction.
     void markRegUsedInInstr(MCPhysReg PhysReg) {
       for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units)
@@ -229,6 +231,10 @@ char RegAllocFast::ID = 0;
 INITIALIZE_PASS(RegAllocFast, "regallocfast", "Fast Register Allocator", false,
                 false)
 
+void RegAllocFast::setPhysRegState(MCPhysReg PhysReg, unsigned NewState) {
+  PhysRegState[PhysReg] = NewState;
+}
+
 /// This allocates space for the specified virtual register to be held on the
 /// stack.
 int RegAllocFast::getStackSpaceFor(unsigned VirtReg) {
@@ -328,7 +334,7 @@ void RegAllocFast::killVirtReg(LiveRegMap::iterator LRI) {
   addKillFlag(*LRI);
   assert(PhysRegState[LRI->PhysReg] == LRI->VirtReg &&
          "Broken RegState mapping");
-  PhysRegState[LRI->PhysReg] = regFree;
+  setPhysRegState(LRI->PhysReg, regFree);
   // Erase from LiveVirtRegs unless we're spilling in bulk.
   if (!isBulkSpilling)
     LiveVirtRegs.erase(LRI);
@@ -438,12 +444,12 @@ void RegAllocFast::usePhysReg(MachineOperand &MO) {
     case regFree:
       if (TRI->isSuperRegister(PhysReg, Alias)) {
         // Leave the superregister in the working set.
-        PhysRegState[Alias] = regFree;
+        setPhysRegState(Alias, regFree);
         MO.getParent()->addRegisterKilled(Alias, TRI, true);
         return;
       }
       // Some other alias was in the working set - clear it.
-      PhysRegState[Alias] = regDisabled;
+      setPhysRegState(Alias, regDisabled);
       break;
     default:
       llvm_unreachable("Instruction uses an alias of an allocated register");
@@ -451,7 +457,7 @@ void RegAllocFast::usePhysReg(MachineOperand &MO) {
   }
 
   // All aliases are disabled, bring register into working set.
-  PhysRegState[PhysReg] = regFree;
+  setPhysRegState(PhysReg, regFree);
   MO.setIsKill();
 }
 
@@ -469,12 +475,12 @@ void RegAllocFast::definePhysReg(MachineBasicBlock::iterator MI,
     LLVM_FALLTHROUGH;
   case regFree:
   case regReserved:
-    PhysRegState[PhysReg] = NewState;
+    setPhysRegState(PhysReg, NewState);
     return;
   }
 
   // This is a disabled register, disable all aliases.
-  PhysRegState[PhysReg] = NewState;
+  setPhysRegState(PhysReg, NewState);
   for (MCRegAliasIterator AI(PhysReg, TRI, false); AI.isValid(); ++AI) {
     MCPhysReg Alias = *AI;
     switch (unsigned VirtReg = PhysRegState[Alias]) {
@@ -485,7 +491,7 @@ void RegAllocFast::definePhysReg(MachineBasicBlock::iterator MI,
       LLVM_FALLTHROUGH;
     case regFree:
     case regReserved:
-      PhysRegState[Alias] = regDisabled;
+      setPhysRegState(Alias, regDisabled);
       if (TRI->isSuperRegister(PhysReg, Alias))
         return;
       break;
@@ -547,11 +553,13 @@ unsigned RegAllocFast::calcSpillCost(MCPhysReg PhysReg) const {
 /// proper container for VirtReg now.  The physical register must not be used
 /// for anything else when this is called.
 void RegAllocFast::assignVirtToPhysReg(LiveReg &LR, MCPhysReg PhysReg) {
-  LLVM_DEBUG(dbgs() << "Assigning " << printReg(LR.VirtReg, TRI) << " to "
+  unsigned VirtReg = LR.VirtReg;
+  LLVM_DEBUG(dbgs() << "Assigning " << printReg(VirtReg, TRI) << " to "
                     << printReg(PhysReg, TRI) << "\n");
-  PhysRegState[PhysReg] = LR.VirtReg;
-  assert(!LR.PhysReg && "Already assigned a physreg");
+  assert(LR.PhysReg == 0 && "Already assigned a physreg");
+  assert(PhysReg != 0 && "Trying to assign no register");
   LR.PhysReg = PhysReg;
+  setPhysRegState(PhysReg, VirtReg);
 }
 
 RegAllocFast::LiveRegMap::iterator
-- 
GitLab


From b8a4e3806cc71b7f7a6052eda78e6fd5079d55c1 Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze@braunis.de>
Date: Wed, 7 Nov 2018 06:57:02 +0000
Subject: [PATCH 1050/1116] RegAllocFast: Further cleanups; NFC

This is in preparation of https://reviews.llvm.org/D52010.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346297 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/RegAllocFast.cpp | 66 +++++++++++++++++++-----------------
 1 file changed, 35 insertions(+), 31 deletions(-)

diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp
index 1cd5caba67b..e849bcec199 100644
--- a/lib/CodeGen/RegAllocFast.cpp
+++ b/lib/CodeGen/RegAllocFast.cpp
@@ -262,7 +262,7 @@ void RegAllocFast::spill(MachineBasicBlock::iterator Before, unsigned VirtReg,
   LLVM_DEBUG(dbgs() << "Spilling " << printReg(VirtReg, TRI)
                     << " in " << printReg(AssignedReg, TRI));
   int FI = getStackSpaceFor(VirtReg);
-  LLVM_DEBUG(dbgs() << " to stack slot #" << FI << "\n");
+  LLVM_DEBUG(dbgs() << " to stack slot #" << FI << '\n');
 
   const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
   TII->storeRegToStackSlot(*MBB, Before, AssignedReg, Kill, FI, &RC, TRI);
@@ -288,7 +288,7 @@ void RegAllocFast::spill(MachineBasicBlock::iterator Before, unsigned VirtReg,
 void RegAllocFast::reload(MachineBasicBlock::iterator Before, unsigned VirtReg,
                           MCPhysReg PhysReg) {
   LLVM_DEBUG(dbgs() << "Reloading " << printReg(VirtReg, TRI) << " into "
-                    << printReg(PhysReg, TRI) << "\n");
+                    << printReg(PhysReg, TRI) << '\n');
   int FI = getStackSpaceFor(VirtReg);
   const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
   TII->loadRegFromStackSlot(*MBB, Before, PhysReg, FI, &RC, TRI);
@@ -555,7 +555,7 @@ unsigned RegAllocFast::calcSpillCost(MCPhysReg PhysReg) const {
 void RegAllocFast::assignVirtToPhysReg(LiveReg &LR, MCPhysReg PhysReg) {
   unsigned VirtReg = LR.VirtReg;
   LLVM_DEBUG(dbgs() << "Assigning " << printReg(VirtReg, TRI) << " to "
-                    << printReg(PhysReg, TRI) << "\n");
+                    << printReg(PhysReg, TRI) << '\n');
   assert(LR.PhysReg == 0 && "Already assigned a physreg");
   assert(PhysReg != 0 && "Trying to assign no register");
   LR.PhysReg = PhysReg;
@@ -578,8 +578,11 @@ RegAllocFast::LiveRegMap::iterator RegAllocFast::allocVirtReg(MachineInstr &MI,
   assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
          "Can only allocate virtual registers");
 
-  // Take hint when possible.
   const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
+  LLVM_DEBUG(dbgs() << "Search register for " << printReg(VirtReg)
+                    << " in class " << TRI->getRegClassName(&RC) << '\n');
+
+  // Take hint when possible.
   if (TargetRegisterInfo::isPhysicalRegister(Hint) &&
       MRI->isAllocatable(Hint) && RC.contains(Hint)) {
     // Ignore the hint if we would have to spill a dirty register.
@@ -594,8 +597,8 @@ RegAllocFast::LiveRegMap::iterator RegAllocFast::allocVirtReg(MachineInstr &MI,
   }
 
   // First try to find a completely free register.
-  ArrayRef<MCPhysReg> AO = RegClassInfo.getOrder(&RC);
-  for (MCPhysReg PhysReg : AO) {
+  ArrayRef<MCPhysReg> AllocationOrder = RegClassInfo.getOrder(&RC);
+  for (MCPhysReg PhysReg : AllocationOrder) {
     if (PhysRegState[PhysReg] == regFree && !isRegUsedInInstr(PhysReg)) {
       assignVirtToPhysReg(*LRI, PhysReg);
       return LRI;
@@ -603,38 +606,39 @@ RegAllocFast::LiveRegMap::iterator RegAllocFast::allocVirtReg(MachineInstr &MI,
   }
 
   LLVM_DEBUG(dbgs() << "Allocating " << printReg(VirtReg) << " from "
-                    << TRI->getRegClassName(&RC) << "\n");
+                    << TRI->getRegClassName(&RC) << '\n');
 
   unsigned BestReg = 0;
   unsigned BestCost = spillImpossible;
-  for (MCPhysReg PhysReg : AO) {
+  for (MCPhysReg PhysReg : AllocationOrder) {
+    LLVM_DEBUG(dbgs() << "\tRegister: " << printReg(PhysReg, TRI) << ' ');
     unsigned Cost = calcSpillCost(PhysReg);
-    LLVM_DEBUG(dbgs() << "\tRegister: " << printReg(PhysReg, TRI) << "\n");
-    LLVM_DEBUG(dbgs() << "\tCost: " << Cost << "\n");
-    LLVM_DEBUG(dbgs() << "\tBestCost: " << BestCost << "\n");
+    LLVM_DEBUG(dbgs() << "Cost: " << Cost << " BestCost: " << BestCost << '\n');
     // Cost is 0 when all aliases are already disabled.
     if (Cost == 0) {
       assignVirtToPhysReg(*LRI, PhysReg);
       return LRI;
     }
-    if (Cost < BestCost)
-      BestReg = PhysReg, BestCost = Cost;
+    if (Cost < BestCost) {
+      BestReg = PhysReg;
+      BestCost = Cost;
+    }
   }
 
-  if (BestReg) {
-    definePhysReg(MI, BestReg, regFree);
-    // definePhysReg may kill virtual registers and modify LiveVirtRegs.
-    // That invalidates LRI, so run a new lookup for VirtReg.
-    return assignVirtToPhysReg(VirtReg, BestReg);
+  if (!BestReg) {
+    // Nothing we can do. Report an error and keep going with a bad allocation.
+    if (MI.isInlineAsm())
+      MI.emitError("inline assembly requires more registers than available");
+    else
+      MI.emitError("ran out of registers during register allocation");
+    definePhysReg(MI, *AllocationOrder.begin(), regFree);
+    return assignVirtToPhysReg(VirtReg, *AllocationOrder.begin());
   }
 
-  // Nothing we can do. Report an error and keep going with a bad allocation.
-  if (MI.isInlineAsm())
-    MI.emitError("inline assembly requires more registers than available");
-  else
-    MI.emitError("ran out of registers during register allocation");
-  definePhysReg(MI, *AO.begin(), regFree);
-  return assignVirtToPhysReg(VirtReg, *AO.begin());
+  definePhysReg(MI, BestReg, regFree);
+  // definePhysReg may kill virtual registers and modify LiveVirtRegs.
+  // That invalidates LRI, so run a new lookup for VirtReg.
+  return assignVirtToPhysReg(VirtReg, BestReg);
 }
 
 /// Allocates a register for VirtReg and mark it as dirty.
@@ -687,16 +691,16 @@ RegAllocFast::LiveRegMap::iterator RegAllocFast::reloadVirtReg(MachineInstr &MI,
     reload(MI, VirtReg, LRI->PhysReg);
   } else if (LRI->Dirty) {
     if (isLastUseOfLocalReg(MO)) {
-      LLVM_DEBUG(dbgs() << "Killing last use: " << MO << "\n");
+      LLVM_DEBUG(dbgs() << "Killing last use: " << MO << '\n');
       if (MO.isUse())
         MO.setIsKill();
       else
         MO.setIsDead();
     } else if (MO.isKill()) {
-      LLVM_DEBUG(dbgs() << "Clearing dubious kill: " << MO << "\n");
+      LLVM_DEBUG(dbgs() << "Clearing dubious kill: " << MO << '\n');
       MO.setIsKill(false);
     } else if (MO.isDead()) {
-      LLVM_DEBUG(dbgs() << "Clearing dubious dead: " << MO << "\n");
+      LLVM_DEBUG(dbgs() << "Clearing dubious dead: " << MO << '\n');
       MO.setIsDead(false);
     }
   } else if (MO.isKill()) {
@@ -704,10 +708,10 @@ RegAllocFast::LiveRegMap::iterator RegAllocFast::reloadVirtReg(MachineInstr &MI,
     // register would be killed immediately, and there might be a second use:
     //   %foo = OR killed %x, %x
     // This would cause a second reload of %x into a different register.
-    LLVM_DEBUG(dbgs() << "Clearing clean kill: " << MO << "\n");
+    LLVM_DEBUG(dbgs() << "Clearing clean kill: " << MO << '\n');
     MO.setIsKill(false);
   } else if (MO.isDead()) {
-    LLVM_DEBUG(dbgs() << "Clearing clean dead: " << MO << "\n");
+    LLVM_DEBUG(dbgs() << "Clearing clean dead: " << MO << '\n');
     MO.setIsDead(false);
   }
   assert(LRI->PhysReg && "Register not assigned");
@@ -800,7 +804,7 @@ void RegAllocFast::handleThroughOperands(MachineInstr &MI,
       // Note: we don't update the def operand yet. That would cause the normal
       // def-scan to attempt spilling.
     } else if (MO.getSubReg() && MI.readsVirtualRegister(Reg)) {
-      LLVM_DEBUG(dbgs() << "Partial redefine: " << MO << "\n");
+      LLVM_DEBUG(dbgs() << "Partial redefine: " << MO << '\n');
       // Reload the register, but don't assign to the operand just yet.
       // That would confuse the later phys-def processing pass.
       LiveRegMap::iterator LRI = reloadVirtReg(MI, I, Reg, 0);
-- 
GitLab


From b7a96d6f835679d8a391c77fcd63d7a827717e50 Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze@braunis.de>
Date: Wed, 7 Nov 2018 06:57:03 +0000
Subject: [PATCH 1051/1116] RegAllocFast: Leave unassigned virtreg entries in
 map

Set `LiveReg::PhysReg` to zero when freeing a register instead of
removing it from the entry from `LiveRegMap`. This way no iterators get
invalidated and we can avoid passing around and updating iterators all
over the place.

This does not change any allocator decisions. It is not completely NFC
because the arbitrary iteration order through `LiveRegMap` in
`spillAll()` changes so we may get a different order in those spill
sequences (the amount of spills does not change).

This is in preparation of https://reviews.llvm.org/D52010.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346298 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/RegAllocFast.cpp                  | 167 ++++-----
 .../AMDGPU/control-flow-fastregalloc.ll       |  33 +-
 .../AMDGPU/partial-sgpr-to-vgpr-spills.ll     | 342 ++++++++----------
 test/CodeGen/AMDGPU/spill-m0.ll               |  24 +-
 test/CodeGen/Mips/atomic.ll                   |  78 ++--
 test/CodeGen/Mips/atomic64.ll                 |   2 +-
 test/CodeGen/Mips/atomicCmpSwapPW.ll          |   6 +-
 test/CodeGen/X86/atomic32.ll                  |  28 +-
 test/CodeGen/X86/avx-load-store.ll            |  32 +-
 test/CodeGen/X86/avx512-mask-zext-bugfix.ll   |  42 +--
 test/CodeGen/X86/pr30430.ll                   |   4 +-
 test/CodeGen/X86/pr32284.ll                   |   2 +-
 test/CodeGen/X86/pr32345.ll                   |   2 +-
 test/CodeGen/X86/pr34592.ll                   |   6 +-
 test/CodeGen/X86/pr34653.ll                   |  30 +-
 15 files changed, 374 insertions(+), 424 deletions(-)

diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp
index e849bcec199..ea7f247214d 100644
--- a/lib/CodeGen/RegAllocFast.cpp
+++ b/lib/CodeGen/RegAllocFast.cpp
@@ -149,10 +149,6 @@ namespace {
       return false;
     }
 
-    /// This flag is set when LiveRegMap will be cleared completely after
-    /// spilling all live registers. LiveRegMap entries should not be erased.
-    bool isBulkSpilling = false;
-
     enum : unsigned {
       spillClean = 50,
       spillDirty = 100,
@@ -186,9 +182,9 @@ namespace {
     bool isLastUseOfLocalReg(const MachineOperand &MO) const;
 
     void addKillFlag(const LiveReg &LRI);
-    void killVirtReg(LiveRegMap::iterator LRI);
+    void killVirtReg(LiveReg &LR);
     void killVirtReg(unsigned VirtReg);
-    void spillVirtReg(MachineBasicBlock::iterator MI, LiveRegMap::iterator);
+    void spillVirtReg(MachineBasicBlock::iterator MI, LiveReg &LR);
     void spillVirtReg(MachineBasicBlock::iterator MI, unsigned VirtReg);
 
     void usePhysReg(MachineOperand &MO);
@@ -205,13 +201,11 @@ namespace {
       return LiveVirtRegs.find(TargetRegisterInfo::virtReg2Index(VirtReg));
     }
 
-    LiveRegMap::iterator assignVirtToPhysReg(unsigned VirtReg, MCPhysReg PhysReg);
-    LiveRegMap::iterator allocVirtReg(MachineInstr &MI, LiveRegMap::iterator,
-                                      unsigned Hint);
-    LiveRegMap::iterator defineVirtReg(MachineInstr &MI, unsigned OpNum,
-                                       unsigned VirtReg, unsigned Hint);
-    LiveRegMap::iterator reloadVirtReg(MachineInstr &MI, unsigned OpNum,
-                                       unsigned VirtReg, unsigned Hint);
+    void allocVirtReg(MachineInstr &MI, LiveReg &LR, unsigned Hint);
+    MCPhysReg defineVirtReg(MachineInstr &MI, unsigned OpNum, unsigned VirtReg,
+                            unsigned Hint);
+    LiveReg &reloadVirtReg(MachineInstr &MI, unsigned OpNum, unsigned VirtReg,
+                           unsigned Hint);
     void spillAll(MachineBasicBlock::iterator MI);
     bool setPhysReg(MachineInstr &MI, unsigned OpNum, MCPhysReg PhysReg);
 
@@ -330,14 +324,12 @@ void RegAllocFast::addKillFlag(const LiveReg &LR) {
 }
 
 /// Mark virtreg as no longer available.
-void RegAllocFast::killVirtReg(LiveRegMap::iterator LRI) {
-  addKillFlag(*LRI);
-  assert(PhysRegState[LRI->PhysReg] == LRI->VirtReg &&
+void RegAllocFast::killVirtReg(LiveReg &LR) {
+  addKillFlag(LR);
+  assert(PhysRegState[LR.PhysReg] == LR.VirtReg &&
          "Broken RegState mapping");
-  setPhysRegState(LRI->PhysReg, regFree);
-  // Erase from LiveVirtRegs unless we're spilling in bulk.
-  if (!isBulkSpilling)
-    LiveVirtRegs.erase(LRI);
+  setPhysRegState(LR.PhysReg, regFree);
+  LR.PhysReg = 0;
 }
 
 /// Mark virtreg as no longer available.
@@ -345,8 +337,8 @@ void RegAllocFast::killVirtReg(unsigned VirtReg) {
   assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
          "killVirtReg needs a virtual register");
   LiveRegMap::iterator LRI = findLiveVirtReg(VirtReg);
-  if (LRI != LiveVirtRegs.end())
-    killVirtReg(LRI);
+  if (LRI != LiveVirtRegs.end() && LRI->PhysReg)
+    killVirtReg(*LRI);
 }
 
 /// This method spills the value specified by VirtReg into the corresponding
@@ -356,15 +348,14 @@ void RegAllocFast::spillVirtReg(MachineBasicBlock::iterator MI,
   assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
          "Spilling a physical register is illegal!");
   LiveRegMap::iterator LRI = findLiveVirtReg(VirtReg);
-  assert(LRI != LiveVirtRegs.end() && "Spilling unmapped virtual register");
-  spillVirtReg(MI, LRI);
+  assert(LRI != LiveVirtRegs.end() && LRI->PhysReg &&
+         "Spilling unmapped virtual register");
+  spillVirtReg(MI, *LRI);
 }
 
 /// Do the actual work of spilling.
-void RegAllocFast::spillVirtReg(MachineBasicBlock::iterator MI,
-                                LiveRegMap::iterator LRI) {
-  LiveReg &LR = *LRI;
-  assert(PhysRegState[LR.PhysReg] == LRI->VirtReg && "Broken RegState mapping");
+void RegAllocFast::spillVirtReg(MachineBasicBlock::iterator MI, LiveReg &LR) {
+  assert(PhysRegState[LR.PhysReg] == LR.VirtReg && "Broken RegState mapping");
 
   if (LR.Dirty) {
     // If this physreg is used by the instruction, we want to kill it on the
@@ -372,25 +363,25 @@ void RegAllocFast::spillVirtReg(MachineBasicBlock::iterator MI,
     bool SpillKill = MachineBasicBlock::iterator(LR.LastUse) != MI;
     LR.Dirty = false;
 
-    spill(MI, LRI->VirtReg, LR.PhysReg, SpillKill);
+    spill(MI, LR.VirtReg, LR.PhysReg, SpillKill);
 
     if (SpillKill)
       LR.LastUse = nullptr; // Don't kill register again
   }
-  killVirtReg(LRI);
+  killVirtReg(LR);
 }
 
 /// Spill all dirty virtregs without killing them.
 void RegAllocFast::spillAll(MachineBasicBlock::iterator MI) {
   if (LiveVirtRegs.empty()) return;
-  isBulkSpilling = true;
   // The LiveRegMap is keyed by an unsigned (the virtreg number), so the order
   // of spilling here is deterministic, if arbitrary.
-  for (LiveRegMap::iterator I = LiveVirtRegs.begin(), E = LiveVirtRegs.end();
-       I != E; ++I)
-    spillVirtReg(MI, I);
+  for (LiveReg &LR : LiveVirtRegs) {
+    if (!LR.PhysReg)
+      continue;
+    spillVirtReg(MI, LR);
+  }
   LiveVirtRegs.clear();
-  isBulkSpilling = false;
 }
 
 /// Handle the direct use of a physical register.  Check that the register is
@@ -519,9 +510,10 @@ unsigned RegAllocFast::calcSpillCost(MCPhysReg PhysReg) const {
                       << printReg(PhysReg, TRI) << " is reserved already.\n");
     return spillImpossible;
   default: {
-    LiveRegMap::const_iterator I = findLiveVirtReg(VirtReg);
-    assert(I != LiveVirtRegs.end() && "Missing VirtReg entry");
-    return I->Dirty ? spillDirty : spillClean;
+    LiveRegMap::const_iterator LRI = findLiveVirtReg(VirtReg);
+    assert(LRI != LiveVirtRegs.end() && LRI->PhysReg &&
+           "Missing VirtReg entry");
+    return LRI->Dirty ? spillDirty : spillClean;
   }
   }
 
@@ -539,9 +531,10 @@ unsigned RegAllocFast::calcSpillCost(MCPhysReg PhysReg) const {
     case regReserved:
       return spillImpossible;
     default: {
-      LiveRegMap::const_iterator I = findLiveVirtReg(VirtReg);
-      assert(I != LiveVirtRegs.end() && "Missing VirtReg entry");
-      Cost += I->Dirty ? spillDirty : spillClean;
+      LiveRegMap::const_iterator LRI = findLiveVirtReg(VirtReg);
+      assert(LRI != LiveVirtRegs.end() && LRI->PhysReg &&
+             "Missing VirtReg entry");
+      Cost += LRI->Dirty ? spillDirty : spillClean;
       break;
     }
     }
@@ -562,18 +555,9 @@ void RegAllocFast::assignVirtToPhysReg(LiveReg &LR, MCPhysReg PhysReg) {
   setPhysRegState(PhysReg, VirtReg);
 }
 
-RegAllocFast::LiveRegMap::iterator
-RegAllocFast::assignVirtToPhysReg(unsigned VirtReg, MCPhysReg PhysReg) {
-  LiveRegMap::iterator LRI = findLiveVirtReg(VirtReg);
-  assert(LRI != LiveVirtRegs.end() && "VirtReg disappeared");
-  assignVirtToPhysReg(*LRI, PhysReg);
-  return LRI;
-}
-
 /// Allocates a physical register for VirtReg.
-RegAllocFast::LiveRegMap::iterator RegAllocFast::allocVirtReg(MachineInstr &MI,
-    LiveRegMap::iterator LRI, unsigned Hint) {
-  const unsigned VirtReg = LRI->VirtReg;
+void RegAllocFast::allocVirtReg(MachineInstr &MI, LiveReg &LR, unsigned Hint) {
+  const unsigned VirtReg = LR.VirtReg;
 
   assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
          "Can only allocate virtual registers");
@@ -590,9 +574,8 @@ RegAllocFast::LiveRegMap::iterator RegAllocFast::allocVirtReg(MachineInstr &MI,
     if (Cost < spillDirty) {
       if (Cost)
         definePhysReg(MI, Hint, regFree);
-      // definePhysReg may kill virtual registers and modify LiveVirtRegs.
-      // That invalidates LRI, so run a new lookup for VirtReg.
-      return assignVirtToPhysReg(VirtReg, Hint);
+      assignVirtToPhysReg(LR, Hint);
+      return;
     }
   }
 
@@ -600,8 +583,8 @@ RegAllocFast::LiveRegMap::iterator RegAllocFast::allocVirtReg(MachineInstr &MI,
   ArrayRef<MCPhysReg> AllocationOrder = RegClassInfo.getOrder(&RC);
   for (MCPhysReg PhysReg : AllocationOrder) {
     if (PhysRegState[PhysReg] == regFree && !isRegUsedInInstr(PhysReg)) {
-      assignVirtToPhysReg(*LRI, PhysReg);
-      return LRI;
+      assignVirtToPhysReg(LR, PhysReg);
+      return;
     }
   }
 
@@ -616,8 +599,8 @@ RegAllocFast::LiveRegMap::iterator RegAllocFast::allocVirtReg(MachineInstr &MI,
     LLVM_DEBUG(dbgs() << "Cost: " << Cost << " BestCost: " << BestCost << '\n');
     // Cost is 0 when all aliases are already disabled.
     if (Cost == 0) {
-      assignVirtToPhysReg(*LRI, PhysReg);
-      return LRI;
+      assignVirtToPhysReg(LR, PhysReg);
+      return;
     }
     if (Cost < BestCost) {
       BestReg = PhysReg;
@@ -632,26 +615,23 @@ RegAllocFast::LiveRegMap::iterator RegAllocFast::allocVirtReg(MachineInstr &MI,
     else
       MI.emitError("ran out of registers during register allocation");
     definePhysReg(MI, *AllocationOrder.begin(), regFree);
-    return assignVirtToPhysReg(VirtReg, *AllocationOrder.begin());
+    assignVirtToPhysReg(LR, *AllocationOrder.begin());
+    return;
   }
 
   definePhysReg(MI, BestReg, regFree);
-  // definePhysReg may kill virtual registers and modify LiveVirtRegs.
-  // That invalidates LRI, so run a new lookup for VirtReg.
-  return assignVirtToPhysReg(VirtReg, BestReg);
+  assignVirtToPhysReg(LR, BestReg);
 }
 
 /// Allocates a register for VirtReg and mark it as dirty.
-RegAllocFast::LiveRegMap::iterator RegAllocFast::defineVirtReg(MachineInstr &MI,
-                                                               unsigned OpNum,
-                                                               unsigned VirtReg,
-                                                               unsigned Hint) {
+MCPhysReg RegAllocFast::defineVirtReg(MachineInstr &MI, unsigned OpNum,
+                                      unsigned VirtReg, unsigned Hint) {
   assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
          "Not a virtual register");
   LiveRegMap::iterator LRI;
   bool New;
   std::tie(LRI, New) = LiveVirtRegs.insert(LiveReg(VirtReg));
-  if (New) {
+  if (!LRI->PhysReg) {
     // If there is no hint, peek at the only use of this register.
     if ((!Hint || !TargetRegisterInfo::isPhysicalRegister(Hint)) &&
         MRI->hasOneNonDBGUse(VirtReg)) {
@@ -660,7 +640,7 @@ RegAllocFast::LiveRegMap::iterator RegAllocFast::defineVirtReg(MachineInstr &MI,
       if (UseMI.isCopyLike())
         Hint = UseMI.getOperand(0).getReg();
     }
-    LRI = allocVirtReg(MI, LRI, Hint);
+    allocVirtReg(MI, *LRI, Hint);
   } else if (LRI->LastUse) {
     // Redefining a live register - kill at the last use, unless it is this
     // instruction defining VirtReg multiple times.
@@ -672,22 +652,22 @@ RegAllocFast::LiveRegMap::iterator RegAllocFast::defineVirtReg(MachineInstr &MI,
   LRI->LastOpNum = OpNum;
   LRI->Dirty = true;
   markRegUsedInInstr(LRI->PhysReg);
-  return LRI;
+  return LRI->PhysReg;
 }
 
 /// Make sure VirtReg is available in a physreg and return it.
-RegAllocFast::LiveRegMap::iterator RegAllocFast::reloadVirtReg(MachineInstr &MI,
-                                                               unsigned OpNum,
-                                                               unsigned VirtReg,
-                                                               unsigned Hint) {
+RegAllocFast::LiveReg &RegAllocFast::reloadVirtReg(MachineInstr &MI,
+                                                   unsigned OpNum,
+                                                   unsigned VirtReg,
+                                                   unsigned Hint) {
   assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
          "Not a virtual register");
   LiveRegMap::iterator LRI;
   bool New;
   std::tie(LRI, New) = LiveVirtRegs.insert(LiveReg(VirtReg));
   MachineOperand &MO = MI.getOperand(OpNum);
-  if (New) {
-    LRI = allocVirtReg(MI, LRI, Hint);
+  if (!LRI->PhysReg) {
+    allocVirtReg(MI, *LRI, Hint);
     reload(MI, VirtReg, LRI->PhysReg);
   } else if (LRI->Dirty) {
     if (isLastUseOfLocalReg(MO)) {
@@ -718,7 +698,7 @@ RegAllocFast::LiveRegMap::iterator RegAllocFast::reloadVirtReg(MachineInstr &MI,
   LRI->LastUse = &MI;
   LRI->LastOpNum = OpNum;
   markRegUsedInInstr(LRI->PhysReg);
-  return LRI;
+  return *LRI;
 }
 
 /// Changes operand OpNum in MI the refer the PhysReg, considering subregs. This
@@ -798,8 +778,8 @@ void RegAllocFast::handleThroughOperands(MachineInstr &MI,
       LLVM_DEBUG(dbgs() << "Operand " << I << "(" << MO
                         << ") is tied to operand " << MI.findTiedOperandIdx(I)
                         << ".\n");
-      LiveRegMap::iterator LRI = reloadVirtReg(MI, I, Reg, 0);
-      MCPhysReg PhysReg = LRI->PhysReg;
+      LiveReg &LR = reloadVirtReg(MI, I, Reg, 0);
+      MCPhysReg PhysReg = LR.PhysReg;
       setPhysReg(MI, I, PhysReg);
       // Note: we don't update the def operand yet. That would cause the normal
       // def-scan to attempt spilling.
@@ -807,8 +787,8 @@ void RegAllocFast::handleThroughOperands(MachineInstr &MI,
       LLVM_DEBUG(dbgs() << "Partial redefine: " << MO << '\n');
       // Reload the register, but don't assign to the operand just yet.
       // That would confuse the later phys-def processing pass.
-      LiveRegMap::iterator LRI = reloadVirtReg(MI, I, Reg, 0);
-      PartialDefs.push_back(LRI->PhysReg);
+      LiveReg &LR = reloadVirtReg(MI, I, Reg, 0);
+      PartialDefs.push_back(LR.PhysReg);
     }
   }
 
@@ -821,8 +801,7 @@ void RegAllocFast::handleThroughOperands(MachineInstr &MI,
     if (!MO.isEarlyClobber())
       continue;
     // Note: defineVirtReg may invalidate MO.
-    LiveRegMap::iterator LRI = defineVirtReg(MI, I, Reg, 0);
-    MCPhysReg PhysReg = LRI->PhysReg;
+    MCPhysReg PhysReg = defineVirtReg(MI, I, Reg, 0);
     if (setPhysReg(MI, I, PhysReg))
       VirtDead.push_back(Reg);
   }
@@ -856,11 +835,12 @@ void RegAllocFast::dumpState() {
       break;
     default: {
       dbgs() << '=' << printReg(PhysRegState[Reg]);
-      LiveRegMap::iterator I = findLiveVirtReg(PhysRegState[Reg]);
-      assert(I != LiveVirtRegs.end() && "Missing VirtReg entry");
-      if (I->Dirty)
+      LiveRegMap::iterator LRI = findLiveVirtReg(PhysRegState[Reg]);
+      assert(LRI != LiveVirtRegs.end() && LRI->PhysReg &&
+             "Missing VirtReg entry");
+      if (LRI->Dirty)
         dbgs() << "*";
-      assert(I->PhysReg == Reg && "Bad inverse map");
+      assert(LRI->PhysReg == Reg && "Bad inverse map");
       break;
     }
     }
@@ -869,6 +849,8 @@ void RegAllocFast::dumpState() {
   // Check that LiveVirtRegs is the inverse.
   for (LiveRegMap::iterator i = LiveVirtRegs.begin(),
        e = LiveVirtRegs.end(); i != e; ++i) {
+    if (!i->PhysReg)
+      continue;
     assert(TargetRegisterInfo::isVirtualRegister(i->VirtReg) &&
            "Bad map key");
     assert(TargetRegisterInfo::isPhysicalRegister(i->PhysReg) &&
@@ -916,7 +898,7 @@ void RegAllocFast::allocateBasicBlock(MachineBasicBlock &MBB) {
       // See if this virtual register has already been allocated to a physical
       // register or spilled to a stack slot.
       LiveRegMap::iterator LRI = findLiveVirtReg(Reg);
-      if (LRI != LiveVirtRegs.end())
+      if (LRI != LiveVirtRegs.end() && LRI->PhysReg)
         setPhysReg(*DebugMI, 0, LRI->PhysReg);
       else {
         int SS = StackSlotForVirtReg[Reg];
@@ -1026,11 +1008,11 @@ void RegAllocFast::allocateBasicBlock(MachineBasicBlock &MBB) {
       unsigned Reg = MO.getReg();
       if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue;
       if (MO.isUse()) {
-        LiveRegMap::iterator LRI = reloadVirtReg(MI, I, Reg, CopyDstReg);
-        MCPhysReg PhysReg = LRI->PhysReg;
+        LiveReg &LR = reloadVirtReg(MI, I, Reg, CopyDstReg);
+        MCPhysReg PhysReg = LR.PhysReg;
         CopySrcReg = (CopySrcReg == Reg || CopySrcReg == PhysReg) ? PhysReg : 0;
         if (setPhysReg(MI, I, PhysReg))
-          killVirtReg(LRI);
+          killVirtReg(LR);
       }
     }
 
@@ -1074,8 +1056,7 @@ void RegAllocFast::allocateBasicBlock(MachineBasicBlock &MBB) {
         definePhysReg(MI, Reg, MO.isDead() ? regFree : regReserved);
         continue;
       }
-      LiveRegMap::iterator LRI = defineVirtReg(MI, I, Reg, CopySrcReg);
-      MCPhysReg PhysReg = LRI->PhysReg;
+      MCPhysReg PhysReg = defineVirtReg(MI, I, Reg, CopySrcReg);
       if (setPhysReg(MI, I, PhysReg)) {
         VirtDead.push_back(Reg);
         CopyDstReg = 0; // cancel coalescing;
diff --git a/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
index d19072a6c4e..41ecdd403d7 100644
--- a/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
+++ b/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
@@ -21,18 +21,17 @@
 ; GCN: s_mov_b64 s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, exec
 ; GCN: s_and_b64 s{{\[}}[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, [[CMP0]]
 
+; Spill load
+; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill
+
 ; Spill saved exec
 ; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]]
 ; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]]
 
-
 ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:4 ; 4-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:20 ; 4-byte Folded Spill
 ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:8 ; 4-byte Folded Spill
-
-; Spill load
-; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:24 ; 4-byte Folded Spill
 
 ; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}
 
@@ -57,11 +56,11 @@
 
 
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:20 ; 4-byte Folded Reload
 ; VMEM: s_waitcnt vmcnt(0)
 ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]]
 
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:8 ; 4-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:24 ; 4-byte Folded Reload
 ; VMEM: s_waitcnt vmcnt(0)
 ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]]
 
@@ -103,7 +102,7 @@ endif:
 ; GCN: s_and_b64 s{{\[}}[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]{{\]}}, s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, [[CMP0]]
 
 ; Spill load
-; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:4 ; 4-byte Folded Spill
+; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill
 
 ; Spill saved exec
 ; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]]
@@ -111,9 +110,9 @@ endif:
 
 
 ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:20 ; 4-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:24 ; 4-byte Folded Spill
 ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:24 ; 4-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:28 ; 4-byte Folded Spill
 
 ; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}
 
@@ -122,7 +121,7 @@ endif:
 
 
 ; GCN: [[LOOP:BB[0-9]+_[0-9]+]]:
-; GCN: buffer_load_dword v[[VAL_LOOP_RELOAD:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload
+; GCN: buffer_load_dword v[[VAL_LOOP_RELOAD:[0-9]+]], off, s[0:3], s7 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
 ; GCN: v_subrev_i32_e32 [[VAL_LOOP:v[0-9]+]], vcc, v{{[0-9]+}}, v[[VAL_LOOP_RELOAD]]
 ; GCN: v_cmp_ne_u32_e32 vcc,
 ; GCN: s_and_b64 vcc, exec, vcc
@@ -134,11 +133,11 @@ endif:
 ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]]
 ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]
 
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:20 ; 4-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:24 ; 4-byte Folded Reload
 ; VMEM: s_waitcnt vmcnt(0)
 ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]]
 
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:24 ; 4-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:28 ; 4-byte Folded Reload
 ; VMEM: s_waitcnt vmcnt(0)
 ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]]
 
@@ -182,7 +181,7 @@ end:
 ; GCN: s_xor_b64 s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}
 
 ; Spill load
-; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:4 ; 4-byte Folded Spill
+; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill
 
 ; Spill saved exec
 ; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]]
@@ -237,13 +236,13 @@ end:
 
 ; GCN: BB{{[0-9]+}}_2: ; %if
 ; GCN: ds_read_b32
-; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload
+; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
 ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, v{{[0-9]+}}, v[[LOAD0_RELOAD]]
 ; GCN: buffer_store_dword [[ADD]], off, s[0:3], s7 offset:[[RESULT_OFFSET]] ; 4-byte Folded Spill
 ; GCN-NEXT: s_branch [[ENDIF:BB[0-9]+_[0-9]+]]
 
 ; GCN: [[ELSE]]: ; %else
-; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload
+; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
 ; GCN: v_subrev_i32_e32 [[SUB:v[0-9]+]], vcc, v{{[0-9]+}}, v[[LOAD0_RELOAD]]
 ; GCN: buffer_store_dword [[ADD]], off, s[0:3], s7 offset:[[FLOW_RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill
 ; GCN-NEXT: s_branch [[FLOW]]
diff --git a/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll b/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll
index d31d636cc41..a38bacd97a6 100644
--- a/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll
+++ b/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll
@@ -82,95 +82,95 @@
 ; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 47
 
 ; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}}
-; GCN: v_writelane_b32 v0, s[[TMP_LO]], 48
-; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 49
-; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 50
-; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 51
-; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 52
-; GCN-NEXT: v_writelane_b32 v0, s9, 53
-; GCN-NEXT: v_writelane_b32 v0, s10, 54
-; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 55
-
-; GCN-NEXT: v_writelane_b32 v0, s84, 56
-; GCN-NEXT: v_writelane_b32 v0, s85, 57
-; GCN-NEXT: v_writelane_b32 v0, s86, 58
-; GCN-NEXT: v_writelane_b32 v0, s87, 59
-; GCN-NEXT: v_writelane_b32 v0, s88, 60
-; GCN-NEXT: v_writelane_b32 v0, s89, 61
-; GCN-NEXT: v_writelane_b32 v0, s90, 62
-; GCN-NEXT: v_writelane_b32 v0, s91, 63
-; GCN-NEXT: v_writelane_b32 v1, s12, 0
-; GCN-NEXT: v_writelane_b32 v1, s13, 1
-; GCN-NEXT: v_writelane_b32 v1, s14, 2
-; GCN-NEXT: v_writelane_b32 v1, s15, 3
-; GCN-NEXT: v_writelane_b32 v1, s16, 4
-; GCN-NEXT: v_writelane_b32 v1, s17, 5
-; GCN-NEXT: v_writelane_b32 v1, s18, 6
-; GCN-NEXT: v_writelane_b32 v1, s19, 7
-; GCN-NEXT: v_writelane_b32 v1, s20, 8
-; GCN-NEXT: v_writelane_b32 v1, s21, 9
-; GCN-NEXT: v_writelane_b32 v1, s22, 10
-; GCN-NEXT: v_writelane_b32 v1, s23, 11
-; GCN-NEXT: v_writelane_b32 v1, s24, 12
-; GCN-NEXT: v_writelane_b32 v1, s25, 13
-; GCN-NEXT: v_writelane_b32 v1, s26, 14
-; GCN-NEXT: v_writelane_b32 v1, s27, 15
-; GCN-NEXT: v_writelane_b32 v1, s28, 16
-; GCN-NEXT: v_writelane_b32 v1, s29, 17
-; GCN-NEXT: v_writelane_b32 v1, s30, 18
-; GCN-NEXT: v_writelane_b32 v1, s31, 19
-; GCN-NEXT: v_writelane_b32 v1, s32, 20
-; GCN-NEXT: v_writelane_b32 v1, s33, 21
-; GCN-NEXT: v_writelane_b32 v1, s34, 22
-; GCN-NEXT: v_writelane_b32 v1, s35, 23
-; GCN-NEXT: v_writelane_b32 v1, s36, 24
-; GCN-NEXT: v_writelane_b32 v1, s37, 25
-; GCN-NEXT: v_writelane_b32 v1, s38, 26
-; GCN-NEXT: v_writelane_b32 v1, s39, 27
-; GCN-NEXT: v_writelane_b32 v1, s40, 28
-; GCN-NEXT: v_writelane_b32 v1, s41, 29
-; GCN-NEXT: v_writelane_b32 v1, s42, 30
-; GCN-NEXT: v_writelane_b32 v1, s43, 31
-; GCN-NEXT: v_writelane_b32 v1, s44, 32
-; GCN-NEXT: v_writelane_b32 v1, s45, 33
-; GCN-NEXT: v_writelane_b32 v1, s46, 34
-; GCN-NEXT: v_writelane_b32 v1, s47, 35
-; GCN-NEXT: v_writelane_b32 v1, s48, 36
-; GCN-NEXT: v_writelane_b32 v1, s49, 37
-; GCN-NEXT: v_writelane_b32 v1, s50, 38
-; GCN-NEXT: v_writelane_b32 v1, s51, 39
-; GCN-NEXT: v_writelane_b32 v1, s52, 40
-; GCN-NEXT: v_writelane_b32 v1, s53, 41
-; GCN-NEXT: v_writelane_b32 v1, s54, 42
-; GCN-NEXT: v_writelane_b32 v1, s55, 43
-; GCN-NEXT: v_writelane_b32 v1, s56, 44
-; GCN-NEXT: v_writelane_b32 v1, s57, 45
-; GCN-NEXT: v_writelane_b32 v1, s58, 46
-; GCN-NEXT: v_writelane_b32 v1, s59, 47
-; GCN-NEXT: v_writelane_b32 v1, s60, 48
-; GCN-NEXT: v_writelane_b32 v1, s61, 49
-; GCN-NEXT: v_writelane_b32 v1, s62, 50
-; GCN-NEXT: v_writelane_b32 v1, s63, 51
-; GCN-NEXT: v_writelane_b32 v1, s64, 52
-; GCN-NEXT: v_writelane_b32 v1, s65, 53
-; GCN-NEXT: v_writelane_b32 v1, s66, 54
-; GCN-NEXT: v_writelane_b32 v1, s67, 55
-; GCN-NEXT: v_writelane_b32 v1, s68, 56
-; GCN-NEXT: v_writelane_b32 v1, s69, 57
-; GCN-NEXT: v_writelane_b32 v1, s70, 58
-; GCN-NEXT: v_writelane_b32 v1, s71, 59
-; GCN-NEXT: v_writelane_b32 v1, s72, 60
-; GCN-NEXT: v_writelane_b32 v1, s73, 61
-; GCN-NEXT: v_writelane_b32 v1, s74, 62
-; GCN-NEXT: v_writelane_b32 v1, s75, 63
-; GCN-NEXT: v_writelane_b32 v2, s76, 0
-; GCN-NEXT: v_writelane_b32 v2, s77, 1
-; GCN-NEXT: v_writelane_b32 v2, s78, 2
-; GCN-NEXT: v_writelane_b32 v2, s79, 3
-; GCN-NEXT: v_writelane_b32 v2, s80, 4
-; GCN-NEXT: v_writelane_b32 v2, s81, 5
-; GCN-NEXT: v_writelane_b32 v2, s82, 6
-; GCN-NEXT: v_writelane_b32 v2, s83, 7
+; GCN: v_writelane_b32 v0, s12, 48
+; GCN-NEXT: v_writelane_b32 v0, s13, 49
+; GCN-NEXT: v_writelane_b32 v0, s14, 50
+; GCN-NEXT: v_writelane_b32 v0, s15, 51
+; GCN-NEXT: v_writelane_b32 v0, s16, 52
+; GCN-NEXT: v_writelane_b32 v0, s17, 53
+; GCN-NEXT: v_writelane_b32 v0, s18, 54
+; GCN-NEXT: v_writelane_b32 v0, s19, 55
+
+; GCN-NEXT: v_writelane_b32 v0, s20, 56
+; GCN-NEXT: v_writelane_b32 v0, s21, 57
+; GCN-NEXT: v_writelane_b32 v0, s22, 58
+; GCN-NEXT: v_writelane_b32 v0, s23, 59
+; GCN-NEXT: v_writelane_b32 v0, s24, 60
+; GCN-NEXT: v_writelane_b32 v0, s25, 61
+; GCN-NEXT: v_writelane_b32 v0, s26, 62
+; GCN-NEXT: v_writelane_b32 v0, s27, 63
+; GCN-NEXT: v_writelane_b32 v1, s28, 0
+; GCN-NEXT: v_writelane_b32 v1, s29, 1
+; GCN-NEXT: v_writelane_b32 v1, s30, 2
+; GCN-NEXT: v_writelane_b32 v1, s31, 3
+; GCN-NEXT: v_writelane_b32 v1, s32, 4
+; GCN-NEXT: v_writelane_b32 v1, s33, 5
+; GCN-NEXT: v_writelane_b32 v1, s34, 6
+; GCN-NEXT: v_writelane_b32 v1, s35, 7
+; GCN-NEXT: v_writelane_b32 v1, s36, 8
+; GCN-NEXT: v_writelane_b32 v1, s37, 9
+; GCN-NEXT: v_writelane_b32 v1, s38, 10
+; GCN-NEXT: v_writelane_b32 v1, s39, 11
+; GCN-NEXT: v_writelane_b32 v1, s40, 12
+; GCN-NEXT: v_writelane_b32 v1, s41, 13
+; GCN-NEXT: v_writelane_b32 v1, s42, 14
+; GCN-NEXT: v_writelane_b32 v1, s43, 15
+; GCN-NEXT: v_writelane_b32 v1, s44, 16
+; GCN-NEXT: v_writelane_b32 v1, s45, 17
+; GCN-NEXT: v_writelane_b32 v1, s46, 18
+; GCN-NEXT: v_writelane_b32 v1, s47, 19
+; GCN-NEXT: v_writelane_b32 v1, s48, 20
+; GCN-NEXT: v_writelane_b32 v1, s49, 21
+; GCN-NEXT: v_writelane_b32 v1, s50, 22
+; GCN-NEXT: v_writelane_b32 v1, s51, 23
+; GCN-NEXT: v_writelane_b32 v1, s52, 24
+; GCN-NEXT: v_writelane_b32 v1, s53, 25
+; GCN-NEXT: v_writelane_b32 v1, s54, 26
+; GCN-NEXT: v_writelane_b32 v1, s55, 27
+; GCN-NEXT: v_writelane_b32 v1, s56, 28
+; GCN-NEXT: v_writelane_b32 v1, s57, 29
+; GCN-NEXT: v_writelane_b32 v1, s58, 30
+; GCN-NEXT: v_writelane_b32 v1, s59, 31
+; GCN-NEXT: v_writelane_b32 v1, s60, 32
+; GCN-NEXT: v_writelane_b32 v1, s61, 33
+; GCN-NEXT: v_writelane_b32 v1, s62, 34
+; GCN-NEXT: v_writelane_b32 v1, s63, 35
+; GCN-NEXT: v_writelane_b32 v1, s64, 36
+; GCN-NEXT: v_writelane_b32 v1, s65, 37
+; GCN-NEXT: v_writelane_b32 v1, s66, 38
+; GCN-NEXT: v_writelane_b32 v1, s67, 39
+; GCN-NEXT: v_writelane_b32 v1, s68, 40
+; GCN-NEXT: v_writelane_b32 v1, s69, 41
+; GCN-NEXT: v_writelane_b32 v1, s70, 42
+; GCN-NEXT: v_writelane_b32 v1, s71, 43
+; GCN-NEXT: v_writelane_b32 v1, s72, 44
+; GCN-NEXT: v_writelane_b32 v1, s73, 45
+; GCN-NEXT: v_writelane_b32 v1, s74, 46
+; GCN-NEXT: v_writelane_b32 v1, s75, 47
+; GCN-NEXT: v_writelane_b32 v1, s76, 48
+; GCN-NEXT: v_writelane_b32 v1, s77, 49
+; GCN-NEXT: v_writelane_b32 v1, s78, 50
+; GCN-NEXT: v_writelane_b32 v1, s79, 51
+; GCN-NEXT: v_writelane_b32 v1, s80, 52
+; GCN-NEXT: v_writelane_b32 v1, s81, 53
+; GCN-NEXT: v_writelane_b32 v1, s82, 54
+; GCN-NEXT: v_writelane_b32 v1, s83, 55
+; GCN-NEXT: v_writelane_b32 v1, s84, 56
+; GCN-NEXT: v_writelane_b32 v1, s85, 57
+; GCN-NEXT: v_writelane_b32 v1, s86, 58
+; GCN-NEXT: v_writelane_b32 v1, s87, 59
+; GCN-NEXT: v_writelane_b32 v1, s88, 60
+; GCN-NEXT: v_writelane_b32 v1, s89, 61
+; GCN-NEXT: v_writelane_b32 v1, s90, 62
+; GCN-NEXT: v_writelane_b32 v1, s91, 63
+; GCN-NEXT: v_writelane_b32 v2, s4, 0
+; GCN-NEXT: v_writelane_b32 v2, s5, 1
+; GCN-NEXT: v_writelane_b32 v2, s6, 2
+; GCN-NEXT: v_writelane_b32 v2, s7, 3
+; GCN-NEXT: v_writelane_b32 v2, s8, 4
+; GCN-NEXT: v_writelane_b32 v2, s9, 5
+; GCN-NEXT: v_writelane_b32 v2, s10, 6
+; GCN-NEXT: v_writelane_b32 v2, s11, 7
 ; GCN: s_cbranch_scc1
 
 
@@ -184,6 +184,25 @@
 ; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v0, 7
 ; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
 
+; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v0, 48
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 49
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 50
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 51
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 52
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 53
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 54
+; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v0, 55
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
+
+; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v0, 56
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 57
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 58
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 59
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 60
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 61
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 62
+; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v0, 63
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
 
 ; GCN: v_readlane_b32 s[[USE_TMP_LO]], v1, 0
 ; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 1
@@ -265,26 +284,6 @@
 ; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI]], v1, 63
 ; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
 
-; GCN: v_readlane_b32 s{{[0-9]+}}, v2, 0
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 1
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 2
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 3
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 4
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 5
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 6
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 7
-; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
-
-; GCN: v_readlane_b32 s{{[0-9]+}}, v0, 56
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 57
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 58
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 59
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 60
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 61
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 62
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 63
-; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
-
 ; GCN: v_readlane_b32 s{{[0-9]+}}, v0, 8
 ; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 9
 ; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 10
@@ -335,14 +334,14 @@
 ; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 47
 ; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
 
-; GCN: v_readlane_b32 s{{[0-9]+}}, v0, 48
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 49
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 50
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 51
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 52
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 53
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 54
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 55
+; GCN: v_readlane_b32 s{{[0-9]+}}, v2, 0
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 1
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 2
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 3
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 4
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 5
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 6
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 7
 ; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
 define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(i32 addrspace(1)* %out, i32 %in) #0 {
   %wide.sgpr0 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
@@ -396,39 +395,39 @@ ret:
 ; GCN: def s[4:19]
 ; GCN: def s[20:35]
 
-; GCN: v_writelane_b32 v0, s4, 50
-; GCN-NEXT: v_writelane_b32 v0, s5, 51
-; GCN-NEXT: v_writelane_b32 v0, s6, 52
-; GCN-NEXT: v_writelane_b32 v0, s7, 53
-; GCN-NEXT: v_writelane_b32 v0, s8, 54
-; GCN-NEXT: v_writelane_b32 v0, s9, 55
-; GCN-NEXT: v_writelane_b32 v0, s10, 56
-; GCN-NEXT: v_writelane_b32 v0, s11, 57
-; GCN-NEXT: v_writelane_b32 v0, s12, 58
-; GCN-NEXT: v_writelane_b32 v0, s13, 59
-; GCN-NEXT: v_writelane_b32 v0, s14, 60
-; GCN-NEXT: v_writelane_b32 v0, s15, 61
-; GCN-NEXT: v_writelane_b32 v0, s16, 62
-; GCN-NEXT: v_writelane_b32 v0, s17, 63
-; GCN-NEXT: v_writelane_b32 v1, s18, 0
-; GCN-NEXT: v_writelane_b32 v1, s19, 1
-
-; GCN: v_readlane_b32 s4, v0, 50
-; GCN-NEXT: v_readlane_b32 s5, v0, 51
-; GCN-NEXT: v_readlane_b32 s6, v0, 52
-; GCN-NEXT: v_readlane_b32 s7, v0, 53
-; GCN-NEXT: v_readlane_b32 s8, v0, 54
-; GCN-NEXT: v_readlane_b32 s9, v0, 55
-; GCN-NEXT: v_readlane_b32 s10, v0, 56
-; GCN-NEXT: v_readlane_b32 s11, v0, 57
-; GCN-NEXT: v_readlane_b32 s12, v0, 58
-; GCN-NEXT: v_readlane_b32 s13, v0, 59
-; GCN-NEXT: v_readlane_b32 s14, v0, 60
-; GCN-NEXT: v_readlane_b32 s15, v0, 61
-; GCN-NEXT: v_readlane_b32 s16, v0, 62
-; GCN-NEXT: v_readlane_b32 s17, v0, 63
-; GCN-NEXT: v_readlane_b32 s18, v1, 0
-; GCN-NEXT: v_readlane_b32 s19, v1, 1
+; GCN: v_writelane_b32 v0, s4, 48
+; GCN-NEXT: v_writelane_b32 v0, s5, 49
+; GCN-NEXT: v_writelane_b32 v0, s6, 50
+; GCN-NEXT: v_writelane_b32 v0, s7, 51
+; GCN-NEXT: v_writelane_b32 v0, s8, 52
+; GCN-NEXT: v_writelane_b32 v0, s9, 53
+; GCN-NEXT: v_writelane_b32 v0, s10, 54
+; GCN-NEXT: v_writelane_b32 v0, s11, 55
+; GCN-NEXT: v_writelane_b32 v0, s12, 56
+; GCN-NEXT: v_writelane_b32 v0, s13, 57
+; GCN-NEXT: v_writelane_b32 v0, s14, 58
+; GCN-NEXT: v_writelane_b32 v0, s15, 59
+; GCN-NEXT: v_writelane_b32 v0, s16, 60
+; GCN-NEXT: v_writelane_b32 v0, s17, 61
+; GCN-NEXT: v_writelane_b32 v0, s18, 62
+; GCN-NEXT: v_writelane_b32 v0, s19, 63
+
+; GCN: v_readlane_b32 s4, v0, 48
+; GCN-NEXT: v_readlane_b32 s5, v0, 49
+; GCN-NEXT: v_readlane_b32 s6, v0, 50
+; GCN-NEXT: v_readlane_b32 s7, v0, 51
+; GCN-NEXT: v_readlane_b32 s8, v0, 52
+; GCN-NEXT: v_readlane_b32 s9, v0, 53
+; GCN-NEXT: v_readlane_b32 s10, v0, 54
+; GCN-NEXT: v_readlane_b32 s11, v0, 55
+; GCN-NEXT: v_readlane_b32 s12, v0, 56
+; GCN-NEXT: v_readlane_b32 s13, v0, 57
+; GCN-NEXT: v_readlane_b32 s14, v0, 58
+; GCN-NEXT: v_readlane_b32 s15, v0, 59
+; GCN-NEXT: v_readlane_b32 s16, v0, 60
+; GCN-NEXT: v_readlane_b32 s17, v0, 61
+; GCN-NEXT: v_readlane_b32 s18, v0, 62
+; GCN-NEXT: v_readlane_b32 s19, v0, 63
 define amdgpu_kernel void @split_sgpr_spill_2_vgprs(i32 addrspace(1)* %out, i32 %in) #1 {
   %wide.sgpr0 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
   %wide.sgpr1 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
@@ -493,8 +492,8 @@ ret:
 ; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 31
 
 ; GCN: def s[0:1]
-; GCN:      v_writelane_b32 v23, s0, 32
-; GCN-NEXT: v_writelane_b32 v23, s1, 33
+; GCN:      v_writelane_b32 v23, s20, 32
+; GCN-NEXT: v_writelane_b32 v23, s21, 33
 
 ; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 34
 ; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 35
@@ -513,20 +512,6 @@ ret:
 ; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 48
 ; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 49
 
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
 ; GCN: s_cbranch_scc1
@@ -551,7 +536,9 @@ ret:
 ; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
 
 
-; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v23, 34
+; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v23, 32
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 33
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 34
 ; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 35
 ; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 36
 ; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 37
@@ -564,9 +551,7 @@ ret:
 ; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 44
 ; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 45
 ; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 46
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 47
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 48
-; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v23, 49
+; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v23, 47
 ; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
 
 ; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v23, 16
@@ -587,25 +572,10 @@ ret:
 ; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v23, 31
 ; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
 
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
 ; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
 ; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
 
-; GCN: v_readlane_b32 s0, v23, 32
-; GCN: v_readlane_b32 s1, v23, 33
+; GCN: v_readfirstlane_b32 s1, v0
 ; GCN: ;;#ASMSTART
 ; GCN: ; use s[0:1]
 define amdgpu_kernel void @no_vgprs_last_sgpr_spill(i32 addrspace(1)* %out, i32 %in) #1 {
diff --git a/test/CodeGen/AMDGPU/spill-m0.ll b/test/CodeGen/AMDGPU/spill-m0.ll
index ab54f9096cf..509b7a2dd68 100644
--- a/test/CodeGen/AMDGPU/spill-m0.ll
+++ b/test/CodeGen/AMDGPU/spill-m0.ll
@@ -13,29 +13,29 @@
 ; GCN-DAG: s_cmp_lg_u32
 
 ; TOVGPR-DAG: s_mov_b32 [[M0_COPY:s[0-9]+]], m0
-; TOVGPR: v_writelane_b32 [[SPILL_VREG:v[0-9]+]], [[M0_COPY]], 0
+; TOVGPR: v_writelane_b32 [[SPILL_VREG:v[0-9]+]], [[M0_COPY]], 2
 
 ; TOVMEM-DAG: s_mov_b32 [[M0_COPY:s[0-9]+]], m0
 ; TOVMEM-DAG: v_mov_b32_e32 [[SPILL_VREG:v[0-9]+]], [[M0_COPY]]
-; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4 ; 4-byte Folded Spill
+; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:12 ; 4-byte Folded Spill
 
 ; TOSMEM-DAG: s_mov_b32 [[M0_COPY:s[0-9]+]], m0
-; TOSMEM: s_add_u32 m0, s3, 0x100{{$}}
+; TOSMEM: s_add_u32 m0, s3, 0x300{{$}}
 ; TOSMEM-NOT: [[M0_COPY]]
 ; TOSMEM: s_buffer_store_dword [[M0_COPY]], s{{\[}}[[LO]]:[[HI]]], m0 ; 4-byte Folded Spill
 
 ; GCN: s_cbranch_scc1 [[ENDIF:BB[0-9]+_[0-9]+]]
 
 ; GCN: [[ENDIF]]:
-; TOVGPR: v_readlane_b32 [[M0_RESTORE:s[0-9]+]], [[SPILL_VREG]], 0
+; TOVGPR: v_readlane_b32 [[M0_RESTORE:s[0-9]+]], [[SPILL_VREG]], 2
 ; TOVGPR: s_mov_b32 m0, [[M0_RESTORE]]
 
-; TOVMEM: buffer_load_dword [[RELOAD_VREG:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4 ; 4-byte Folded Reload
+; TOVMEM: buffer_load_dword [[RELOAD_VREG:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:12 ; 4-byte Folded Reload
 ; TOVMEM: s_waitcnt vmcnt(0)
 ; TOVMEM: v_readfirstlane_b32 [[M0_RESTORE:s[0-9]+]], [[RELOAD_VREG]]
 ; TOVMEM: s_mov_b32 m0, [[M0_RESTORE]]
 
-; TOSMEM: s_add_u32 m0, s3, 0x100{{$}}
+; TOSMEM: s_add_u32 m0, s3, 0x300{{$}}
 ; TOSMEM: s_buffer_load_dword [[M0_RESTORE:s[0-9]+]], s{{\[}}[[LO]]:[[HI]]], m0 ; 4-byte Folded Reload
 ; TOSMEM-NOT: [[M0_RESTORE]]
 ; TOSMEM: s_mov_b32 m0, [[M0_RESTORE]]
@@ -80,7 +80,7 @@ endif:
 ; TOSMEM: s_branch
 
 ; TOSMEM: BB{{[0-9]+_[0-9]+}}:
-; TOSMEM: s_add_u32 m0, s7, 0x400
+; TOSMEM: s_add_u32 m0, s7, 0x500
 ; TOSMEM-NEXT: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Reload
 
 
@@ -162,17 +162,17 @@ endif:
 ; FIXME: RegScavenger::isRegUsed() always returns true if m0 is reserved, so we have to save and restore it
 ; FIXME-TOSMEM-NOT: m0
 ; TOSMEM: s_add_u32 m0, s3, 0x100
-; TOSMEM: s_buffer_store_dwordx2 [[REG]], s[88:91], m0 ; 8-byte Folded Spill
-; FIXME-TOSMEM-NOT: m0
-; TOSMEM: s_add_u32 m0, s3, 0x300
 ; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s[88:91], m0 ; 4-byte Folded Spill
 ; FIXME-TOSMEM-NOT: m0
+; TOSMEM: s_add_u32 m0, s3, 0x200
+; TOSMEM: s_buffer_store_dwordx2 [[REG]], s[88:91], m0 ; 8-byte Folded Spill
+; FIXME-TOSMEM-NOT: m0
 ; TOSMEM: s_cbranch_scc1
 
 ; TOSMEM: s_mov_b32 m0, -1
 
 ; TOSMEM: s_mov_b32 s0, m0
-; TOSMEM: s_add_u32 m0, s3, 0x100
+; TOSMEM: s_add_u32 m0, s3, 0x200
 ; TOSMEM: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[88:91], m0 ; 8-byte Folded Reload
 ; TOSMEM: s_mov_b32 m0, s0
 ; TOSMEM: s_waitcnt lgkmcnt(0)
@@ -180,7 +180,7 @@ endif:
 ; TOSMEM: ds_write_b64
 
 ; FIXME-TOSMEM-NOT: m0
-; TOSMEM: s_add_u32 m0, s3, 0x300
+; TOSMEM: s_add_u32 m0, s3, 0x100
 ; TOSMEM: s_buffer_load_dword s0, s[88:91], m0 ; 4-byte Folded Reload
 ; FIXME-TOSMEM-NOT: m0
 ; TOSMEM: s_waitcnt lgkmcnt(0)
diff --git a/test/CodeGen/Mips/atomic.ll b/test/CodeGen/Mips/atomic.ll
index b58338aa6fd..3d516ea2638 100644
--- a/test/CodeGen/Mips/atomic.ll
+++ b/test/CodeGen/Mips/atomic.ll
@@ -2038,10 +2038,10 @@ define i32 @AtomicCmpSwap32(i32 signext %oldval, i32 signext %newval) nounwind {
 ; MIPS32R6O0-NEXT:    beqzc $7, $BB7_1
 ; MIPS32R6O0-NEXT:  $BB7_3: # %entry
 ; MIPS32R6O0-NEXT:    move $2, $6
+; MIPS32R6O0-NEXT:    sw $25, 12($sp) # 4-byte Folded Spill
+; MIPS32R6O0-NEXT:    sw $1, 8($sp) # 4-byte Folded Spill
 ; MIPS32R6O0-NEXT:    sw $6, 16($sp) # 4-byte Folded Spill
-; MIPS32R6O0-NEXT:    sw $1, 12($sp) # 4-byte Folded Spill
-; MIPS32R6O0-NEXT:    sw $3, 8($sp) # 4-byte Folded Spill
-; MIPS32R6O0-NEXT:    sw $25, 4($sp) # 4-byte Folded Spill
+; MIPS32R6O0-NEXT:    sw $3, 4($sp) # 4-byte Folded Spill
 ; MIPS32R6O0-NEXT:    addiu $sp, $sp, 24
 ; MIPS32R6O0-NEXT:    jrc $ra
 ;
@@ -4550,11 +4550,11 @@ define signext i8 @AtomicCmpSwap8(i8 signext %oldval, i8 signext %newval) nounwi
 ; MIPS32R6O0-NEXT:    srlv $8, $10, $2
 ; MIPS32R6O0-NEXT:    seb $8, $8
 ; MIPS32R6O0-NEXT:  # %bb.4: # %entry
-; MIPS32R6O0-NEXT:    sw $1, 12($sp) # 4-byte Folded Spill
-; MIPS32R6O0-NEXT:    sw $8, 8($sp) # 4-byte Folded Spill
-; MIPS32R6O0-NEXT:    sw $25, 4($sp) # 4-byte Folded Spill
+; MIPS32R6O0-NEXT:    sw $25, 12($sp) # 4-byte Folded Spill
+; MIPS32R6O0-NEXT:    sw $1, 8($sp) # 4-byte Folded Spill
+; MIPS32R6O0-NEXT:    sw $8, 4($sp) # 4-byte Folded Spill
 ; MIPS32R6O0-NEXT:  # %bb.5: # %entry
-; MIPS32R6O0-NEXT:    lw $2, 8($sp) # 4-byte Folded Reload
+; MIPS32R6O0-NEXT:    lw $2, 4($sp) # 4-byte Folded Reload
 ; MIPS32R6O0-NEXT:    addiu $sp, $sp, 16
 ; MIPS32R6O0-NEXT:    jrc $ra
 ;
@@ -5127,14 +5127,14 @@ define i1 @AtomicCmpSwapRes8(i8* %ptr, i8 signext %oldval, i8 signext %newval) n
 ; MIPS32R6O0-NEXT:    srlv $11, $13, $4
 ; MIPS32R6O0-NEXT:    seb $11, $11
 ; MIPS32R6O0-NEXT:  # %bb.4: # %entry
-; MIPS32R6O0-NEXT:    sw $11, 20($sp) # 4-byte Folded Spill
-; MIPS32R6O0-NEXT:    sw $5, 16($sp) # 4-byte Folded Spill
-; MIPS32R6O0-NEXT:    sw $3, 12($sp) # 4-byte Folded Spill
-; MIPS32R6O0-NEXT:    sw $1, 8($sp) # 4-byte Folded Spill
-; MIPS32R6O0-NEXT:    sw $2, 4($sp) # 4-byte Folded Spill
+; MIPS32R6O0-NEXT:    sw $5, 20($sp) # 4-byte Folded Spill
+; MIPS32R6O0-NEXT:    sw $1, 16($sp) # 4-byte Folded Spill
+; MIPS32R6O0-NEXT:    sw $2, 12($sp) # 4-byte Folded Spill
+; MIPS32R6O0-NEXT:    sw $3, 8($sp) # 4-byte Folded Spill
+; MIPS32R6O0-NEXT:    sw $11, 4($sp) # 4-byte Folded Spill
 ; MIPS32R6O0-NEXT:  # %bb.5: # %entry
-; MIPS32R6O0-NEXT:    lw $1, 20($sp) # 4-byte Folded Reload
-; MIPS32R6O0-NEXT:    lw $2, 16($sp) # 4-byte Folded Reload
+; MIPS32R6O0-NEXT:    lw $1, 4($sp) # 4-byte Folded Reload
+; MIPS32R6O0-NEXT:    lw $2, 20($sp) # 4-byte Folded Reload
 ; MIPS32R6O0-NEXT:    xor $1, $1, $2
 ; MIPS32R6O0-NEXT:    sltiu $2, $1, 1
 ; MIPS32R6O0-NEXT:    addiu $sp, $sp, 24
@@ -5282,7 +5282,7 @@ define i1 @AtomicCmpSwapRes8(i8* %ptr, i8 signext %oldval, i8 signext %newval) n
 ;
 ; MIPS64R6O0-LABEL: AtomicCmpSwapRes8:
 ; MIPS64R6O0:       # %bb.0: # %entry
-; MIPS64R6O0-NEXT:    daddiu $sp, $sp, -16
+; MIPS64R6O0-NEXT:    daddiu $sp, $sp, -32
 ; MIPS64R6O0-NEXT:    move $1, $6
 ; MIPS64R6O0-NEXT:    move $2, $5
 ; MIPS64R6O0-NEXT:    move $5, $4
@@ -5313,15 +5313,15 @@ define i1 @AtomicCmpSwapRes8(i8* %ptr, i8 signext %oldval, i8 signext %newval) n
 ; MIPS64R6O0-NEXT:    srlv $10, $12, $3
 ; MIPS64R6O0-NEXT:    seb $10, $10
 ; MIPS64R6O0-NEXT:  # %bb.4: # %entry
-; MIPS64R6O0-NEXT:    sd $5, 8($sp) # 8-byte Folded Spill
-; MIPS64R6O0-NEXT:    sw $10, 4($sp) # 4-byte Folded Spill
-; MIPS64R6O0-NEXT:    sw $2, 0($sp) # 4-byte Folded Spill
+; MIPS64R6O0-NEXT:    sw $2, 28($sp) # 4-byte Folded Spill
+; MIPS64R6O0-NEXT:    sd $5, 16($sp) # 8-byte Folded Spill
+; MIPS64R6O0-NEXT:    sw $10, 12($sp) # 4-byte Folded Spill
 ; MIPS64R6O0-NEXT:  # %bb.5: # %entry
-; MIPS64R6O0-NEXT:    lw $1, 4($sp) # 4-byte Folded Reload
-; MIPS64R6O0-NEXT:    lw $2, 0($sp) # 4-byte Folded Reload
+; MIPS64R6O0-NEXT:    lw $1, 12($sp) # 4-byte Folded Reload
+; MIPS64R6O0-NEXT:    lw $2, 28($sp) # 4-byte Folded Reload
 ; MIPS64R6O0-NEXT:    xor $1, $1, $2
 ; MIPS64R6O0-NEXT:    sltiu $2, $1, 1
-; MIPS64R6O0-NEXT:    daddiu $sp, $sp, 16
+; MIPS64R6O0-NEXT:    daddiu $sp, $sp, 32
 ; MIPS64R6O0-NEXT:    jrc $ra
 ;
 ; MM32-LABEL: AtomicCmpSwapRes8:
@@ -6233,20 +6233,20 @@ define {i16, i1} @foo(i16* %addr, i16 %l, i16 %r, i16 %new) {
 ; MIPS32R6O0-NEXT:    srlv $12, $14, $4
 ; MIPS32R6O0-NEXT:    seh $12, $12
 ; MIPS32R6O0-NEXT:  # %bb.4:
-; MIPS32R6O0-NEXT:    sw $12, 20($sp) # 4-byte Folded Spill
-; MIPS32R6O0-NEXT:    sw $3, 16($sp) # 4-byte Folded Spill
-; MIPS32R6O0-NEXT:    sw $8, 12($sp) # 4-byte Folded Spill
-; MIPS32R6O0-NEXT:    sw $5, 8($sp) # 4-byte Folded Spill
-; MIPS32R6O0-NEXT:    sw $1, 4($sp) # 4-byte Folded Spill
-; MIPS32R6O0-NEXT:    sw $2, 0($sp) # 4-byte Folded Spill
+; MIPS32R6O0-NEXT:    sw $1, 20($sp) # 4-byte Folded Spill
+; MIPS32R6O0-NEXT:    sw $2, 16($sp) # 4-byte Folded Spill
+; MIPS32R6O0-NEXT:    sw $3, 12($sp) # 4-byte Folded Spill
+; MIPS32R6O0-NEXT:    sw $8, 8($sp) # 4-byte Folded Spill
+; MIPS32R6O0-NEXT:    sw $5, 4($sp) # 4-byte Folded Spill
+; MIPS32R6O0-NEXT:    sw $12, 0($sp) # 4-byte Folded Spill
 ; MIPS32R6O0-NEXT:  # %bb.5:
-; MIPS32R6O0-NEXT:    lw $1, 8($sp) # 4-byte Folded Reload
+; MIPS32R6O0-NEXT:    lw $1, 4($sp) # 4-byte Folded Reload
 ; MIPS32R6O0-NEXT:    seh $2, $1
-; MIPS32R6O0-NEXT:    lw $3, 20($sp) # 4-byte Folded Reload
+; MIPS32R6O0-NEXT:    lw $3, 0($sp) # 4-byte Folded Reload
 ; MIPS32R6O0-NEXT:    xor $2, $3, $2
 ; MIPS32R6O0-NEXT:    sltiu $3, $2, 1
 ; MIPS32R6O0-NEXT:    sync
-; MIPS32R6O0-NEXT:    lw $2, 20($sp) # 4-byte Folded Reload
+; MIPS32R6O0-NEXT:    lw $2, 0($sp) # 4-byte Folded Reload
 ; MIPS32R6O0-NEXT:    addiu $sp, $sp, 24
 ; MIPS32R6O0-NEXT:    jrc $ra
 ;
@@ -6449,17 +6449,17 @@ define {i16, i1} @foo(i16* %addr, i16 %l, i16 %r, i16 %new) {
 ; MIPS64R6O0-NEXT:    srlv $11, $13, $3
 ; MIPS64R6O0-NEXT:    seh $11, $11
 ; MIPS64R6O0-NEXT:  # %bb.4:
-; MIPS64R6O0-NEXT:    sw $2, 12($sp) # 4-byte Folded Spill
-; MIPS64R6O0-NEXT:    sw $11, 8($sp) # 4-byte Folded Spill
-; MIPS64R6O0-NEXT:    sd $5, 0($sp) # 8-byte Folded Spill
+; MIPS64R6O0-NEXT:    sd $5, 8($sp) # 8-byte Folded Spill
+; MIPS64R6O0-NEXT:    sw $2, 4($sp) # 4-byte Folded Spill
+; MIPS64R6O0-NEXT:    sw $11, 0($sp) # 4-byte Folded Spill
 ; MIPS64R6O0-NEXT:  # %bb.5:
-; MIPS64R6O0-NEXT:    lw $1, 12($sp) # 4-byte Folded Reload
+; MIPS64R6O0-NEXT:    lw $1, 4($sp) # 4-byte Folded Reload
 ; MIPS64R6O0-NEXT:    seh $2, $1
-; MIPS64R6O0-NEXT:    lw $3, 8($sp) # 4-byte Folded Reload
+; MIPS64R6O0-NEXT:    lw $3, 0($sp) # 4-byte Folded Reload
 ; MIPS64R6O0-NEXT:    xor $2, $3, $2
 ; MIPS64R6O0-NEXT:    sltiu $3, $2, 1
 ; MIPS64R6O0-NEXT:    sync
-; MIPS64R6O0-NEXT:    lw $2, 8($sp) # 4-byte Folded Reload
+; MIPS64R6O0-NEXT:    lw $2, 0($sp) # 4-byte Folded Reload
 ; MIPS64R6O0-NEXT:    daddiu $sp, $sp, 16
 ; MIPS64R6O0-NEXT:    jrc $ra
 ;
@@ -7016,8 +7016,8 @@ define i32 @zeroreg() nounwind {
 ; MIPS32O0-NEXT:    xor $2, $5, $2
 ; MIPS32O0-NEXT:    sltiu $2, $2, 1
 ; MIPS32O0-NEXT:    andi $2, $2, 1
-; MIPS32O0-NEXT:    sw $3, 8($sp) # 4-byte Folded Spill
 ; MIPS32O0-NEXT:    sw $5, 12($sp) # 4-byte Folded Spill
+; MIPS32O0-NEXT:    sw $3, 8($sp) # 4-byte Folded Spill
 ; MIPS32O0-NEXT:    sw $1, 4($sp) # 4-byte Folded Spill
 ; MIPS32O0-NEXT:    addiu $sp, $sp, 16
 ; MIPS32O0-NEXT:    jr $ra
@@ -7099,8 +7099,8 @@ define i32 @zeroreg() nounwind {
 ; MIPS32R6O0-NEXT:    xor $1, $5, $1
 ; MIPS32R6O0-NEXT:    sltiu $2, $1, 1
 ; MIPS32R6O0-NEXT:    sync
-; MIPS32R6O0-NEXT:    sw $3, 0($sp) # 4-byte Folded Spill
 ; MIPS32R6O0-NEXT:    sw $5, 4($sp) # 4-byte Folded Spill
+; MIPS32R6O0-NEXT:    sw $3, 0($sp) # 4-byte Folded Spill
 ; MIPS32R6O0-NEXT:    addiu $sp, $sp, 8
 ; MIPS32R6O0-NEXT:    jrc $ra
 ;
@@ -7234,8 +7234,8 @@ define i32 @zeroreg() nounwind {
 ; MIPS64R6O0-NEXT:    xor $2, $6, $3
 ; MIPS64R6O0-NEXT:    sltiu $2, $2, 1
 ; MIPS64R6O0-NEXT:    sync
-; MIPS64R6O0-NEXT:    sw $4, 8($sp) # 4-byte Folded Spill
 ; MIPS64R6O0-NEXT:    sw $6, 12($sp) # 4-byte Folded Spill
+; MIPS64R6O0-NEXT:    sw $4, 8($sp) # 4-byte Folded Spill
 ; MIPS64R6O0-NEXT:    daddiu $sp, $sp, 16
 ; MIPS64R6O0-NEXT:    jrc $ra
 ;
diff --git a/test/CodeGen/Mips/atomic64.ll b/test/CodeGen/Mips/atomic64.ll
index aa8442d488b..8e5002b38b0 100644
--- a/test/CodeGen/Mips/atomic64.ll
+++ b/test/CodeGen/Mips/atomic64.ll
@@ -1289,8 +1289,8 @@ define i64 @AtomicCmpSwap64(i64 signext %oldval, i64 signext %newval) nounwind {
 ; MIPS64R6O0-NEXT:  .LBB7_3: # %entry
 ; MIPS64R6O0-NEXT:    sd $2, 24($sp) # 8-byte Folded Spill
 ; MIPS64R6O0-NEXT:    move $2, $6
-; MIPS64R6O0-NEXT:    sd $6, 32($sp) # 8-byte Folded Spill
 ; MIPS64R6O0-NEXT:    sd $25, 16($sp) # 8-byte Folded Spill
+; MIPS64R6O0-NEXT:    sd $6, 32($sp) # 8-byte Folded Spill
 ; MIPS64R6O0-NEXT:    sd $3, 8($sp) # 8-byte Folded Spill
 ; MIPS64R6O0-NEXT:    daddiu $sp, $sp, 48
 ; MIPS64R6O0-NEXT:    jrc $ra
diff --git a/test/CodeGen/Mips/atomicCmpSwapPW.ll b/test/CodeGen/Mips/atomicCmpSwapPW.ll
index 10610e34e71..973f3a5bf0b 100644
--- a/test/CodeGen/Mips/atomicCmpSwapPW.ll
+++ b/test/CodeGen/Mips/atomicCmpSwapPW.ll
@@ -32,10 +32,10 @@ define void @foo(i32 %new, i32 %old) {
 ; O32-NEXT:    nop
 ; O32-NEXT:  $BB0_3: # %entry
 ; O32-NEXT:    sync
+; O32-NEXT:    sw $1, 8($sp) # 4-byte Folded Spill
+; O32-NEXT:    sw $2, 4($sp) # 4-byte Folded Spill
 ; O32-NEXT:    sw $7, 12($sp) # 4-byte Folded Spill
-; O32-NEXT:    sw $6, 8($sp) # 4-byte Folded Spill
-; O32-NEXT:    sw $1, 4($sp) # 4-byte Folded Spill
-; O32-NEXT:    sw $2, 0($sp) # 4-byte Folded Spill
+; O32-NEXT:    sw $6, 0($sp) # 4-byte Folded Spill
 ; O32-NEXT:    addiu $sp, $sp, 16
 ; O32-NEXT:    jr $ra
 ; O32-NEXT:    nop
diff --git a/test/CodeGen/X86/atomic32.ll b/test/CodeGen/X86/atomic32.ll
index 5e78444eea7..519b169c0f9 100644
--- a/test/CodeGen/X86/atomic32.ll
+++ b/test/CodeGen/X86/atomic32.ll
@@ -71,8 +71,8 @@ define void @atomic_fetch_and32() nounwind {
 ; X64-NEXT:    sete %dl
 ; X64-NEXT:    testb $1, %dl
 ; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    jne .LBB2_2
 ; X64-NEXT:    jmp .LBB2_1
 ; X64-NEXT:  .LBB2_2: # %atomicrmw.end
@@ -95,8 +95,8 @@ define void @atomic_fetch_and32() nounwind {
 ; X86-NEXT:    sete %dl
 ; X86-NEXT:    testb $1, %dl
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    jne .LBB2_2
 ; X86-NEXT:    jmp .LBB2_1
 ; X86-NEXT:  .LBB2_2: # %atomicrmw.end
@@ -125,8 +125,8 @@ define void @atomic_fetch_or32() nounwind {
 ; X64-NEXT:    sete %dl
 ; X64-NEXT:    testb $1, %dl
 ; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    jne .LBB3_2
 ; X64-NEXT:    jmp .LBB3_1
 ; X64-NEXT:  .LBB3_2: # %atomicrmw.end
@@ -149,8 +149,8 @@ define void @atomic_fetch_or32() nounwind {
 ; X86-NEXT:    sete %dl
 ; X86-NEXT:    testb $1, %dl
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    jne .LBB3_2
 ; X86-NEXT:    jmp .LBB3_1
 ; X86-NEXT:  .LBB3_2: # %atomicrmw.end
@@ -179,8 +179,8 @@ define void @atomic_fetch_xor32() nounwind {
 ; X64-NEXT:    sete %dl
 ; X64-NEXT:    testb $1, %dl
 ; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    jne .LBB4_2
 ; X64-NEXT:    jmp .LBB4_1
 ; X64-NEXT:  .LBB4_2: # %atomicrmw.end
@@ -203,8 +203,8 @@ define void @atomic_fetch_xor32() nounwind {
 ; X86-NEXT:    sete %dl
 ; X86-NEXT:    testb $1, %dl
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    jne .LBB4_2
 ; X86-NEXT:    jmp .LBB4_1
 ; X86-NEXT:  .LBB4_2: # %atomicrmw.end
@@ -285,8 +285,8 @@ define void @atomic_fetch_max32(i32 %x) nounwind {
 ; X64-NEXT:    lock cmpxchgl %edx, {{.*}}(%rip)
 ; X64-NEXT:    sete %sil
 ; X64-NEXT:    testb $1, %sil
-; X64-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    jne .LBB6_2
 ; X64-NEXT:    jmp .LBB6_1
 ; X64-NEXT:  .LBB6_2: # %atomicrmw.end
@@ -310,8 +310,8 @@ define void @atomic_fetch_max32(i32 %x) nounwind {
 ; X86-CMOV-NEXT:    lock cmpxchgl %edx, sc32
 ; X86-CMOV-NEXT:    sete %bl
 ; X86-CMOV-NEXT:    testb $1, %bl
-; X86-CMOV-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X86-CMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-CMOV-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X86-CMOV-NEXT:    jne .LBB6_2
 ; X86-CMOV-NEXT:    jmp .LBB6_1
 ; X86-CMOV-NEXT:  .LBB6_2: # %atomicrmw.end
@@ -381,8 +381,8 @@ define void @atomic_fetch_min32(i32 %x) nounwind {
 ; X64-NEXT:    lock cmpxchgl %edx, {{.*}}(%rip)
 ; X64-NEXT:    sete %sil
 ; X64-NEXT:    testb $1, %sil
-; X64-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    jne .LBB7_2
 ; X64-NEXT:    jmp .LBB7_1
 ; X64-NEXT:  .LBB7_2: # %atomicrmw.end
@@ -406,8 +406,8 @@ define void @atomic_fetch_min32(i32 %x) nounwind {
 ; X86-CMOV-NEXT:    lock cmpxchgl %edx, sc32
 ; X86-CMOV-NEXT:    sete %bl
 ; X86-CMOV-NEXT:    testb $1, %bl
-; X86-CMOV-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X86-CMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-CMOV-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X86-CMOV-NEXT:    jne .LBB7_2
 ; X86-CMOV-NEXT:    jmp .LBB7_1
 ; X86-CMOV-NEXT:  .LBB7_2: # %atomicrmw.end
@@ -477,8 +477,8 @@ define void @atomic_fetch_umax32(i32 %x) nounwind {
 ; X64-NEXT:    lock cmpxchgl %edx, {{.*}}(%rip)
 ; X64-NEXT:    sete %sil
 ; X64-NEXT:    testb $1, %sil
-; X64-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    jne .LBB8_2
 ; X64-NEXT:    jmp .LBB8_1
 ; X64-NEXT:  .LBB8_2: # %atomicrmw.end
@@ -502,8 +502,8 @@ define void @atomic_fetch_umax32(i32 %x) nounwind {
 ; X86-CMOV-NEXT:    lock cmpxchgl %edx, sc32
 ; X86-CMOV-NEXT:    sete %bl
 ; X86-CMOV-NEXT:    testb $1, %bl
-; X86-CMOV-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X86-CMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-CMOV-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X86-CMOV-NEXT:    jne .LBB8_2
 ; X86-CMOV-NEXT:    jmp .LBB8_1
 ; X86-CMOV-NEXT:  .LBB8_2: # %atomicrmw.end
@@ -573,8 +573,8 @@ define void @atomic_fetch_umin32(i32 %x) nounwind {
 ; X64-NEXT:    lock cmpxchgl %edx, {{.*}}(%rip)
 ; X64-NEXT:    sete %sil
 ; X64-NEXT:    testb $1, %sil
-; X64-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    jne .LBB9_2
 ; X64-NEXT:    jmp .LBB9_1
 ; X64-NEXT:  .LBB9_2: # %atomicrmw.end
@@ -598,8 +598,8 @@ define void @atomic_fetch_umin32(i32 %x) nounwind {
 ; X86-CMOV-NEXT:    lock cmpxchgl %edx, sc32
 ; X86-CMOV-NEXT:    sete %bl
 ; X86-CMOV-NEXT:    testb $1, %bl
-; X86-CMOV-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X86-CMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-CMOV-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X86-CMOV-NEXT:    jne .LBB9_2
 ; X86-CMOV-NEXT:    jmp .LBB9_1
 ; X86-CMOV-NEXT:  .LBB9_2: # %atomicrmw.end
diff --git a/test/CodeGen/X86/avx-load-store.ll b/test/CodeGen/X86/avx-load-store.ll
index d55bbac5dc1..ea42aa34d8c 100644
--- a/test/CodeGen/X86/avx-load-store.ll
+++ b/test/CodeGen/X86/avx-load-store.ll
@@ -13,15 +13,15 @@ define void @test_256_load(double* nocapture %d, float* nocapture %f, <4 x i64>*
 ; CHECK-NEXT:    movq %rsi, %r15
 ; CHECK-NEXT:    movq %rdi, %rbx
 ; CHECK-NEXT:    vmovaps (%rdi), %ymm0
-; CHECK-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; CHECK-NEXT:    vmovaps (%rsi), %ymm1
-; CHECK-NEXT:    vmovups %ymm1, {{[0-9]+}}(%rsp) # 32-byte Spill
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; CHECK-NEXT:    vmovaps (%rdx), %ymm2
 ; CHECK-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
 ; CHECK-NEXT:    callq dummy
-; CHECK-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; CHECK-NEXT:    vmovaps %ymm0, (%rbx)
-; CHECK-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; CHECK-NEXT:    vmovaps %ymm0, (%r15)
 ; CHECK-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
 ; CHECK-NEXT:    vmovaps %ymm0, (%r14)
@@ -38,21 +38,21 @@ define void @test_256_load(double* nocapture %d, float* nocapture %f, <4 x i64>*
 ; CHECK_O0-NEXT:    vmovapd (%rdi), %ymm0
 ; CHECK_O0-NEXT:    vmovaps (%rsi), %ymm1
 ; CHECK_O0-NEXT:    vmovdqa (%rdx), %ymm2
-; CHECK_O0-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
-; CHECK_O0-NEXT:    vmovups %ymm1, {{[0-9]+}}(%rsp) # 32-byte Spill
-; CHECK_O0-NEXT:    vmovups %ymm2, {{[0-9]+}}(%rsp) # 32-byte Spill
-; CHECK_O0-NEXT:    movq %rsi, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK_O0-NEXT:    movq %rdi, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK_O0-NEXT:    movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK_O0-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK_O0-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK_O0-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK_O0-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK_O0-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK_O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK_O0-NEXT:    callq dummy
-; CHECK_O0-NEXT:    movq {{[0-9]+}}(%rsp), %rdx # 8-byte Reload
-; CHECK_O0-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; CHECK_O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; CHECK_O0-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; CHECK_O0-NEXT:    vmovapd %ymm0, (%rdx)
-; CHECK_O0-NEXT:    movq {{[0-9]+}}(%rsp), %rsi # 8-byte Reload
-; CHECK_O0-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm1 # 32-byte Reload
+; CHECK_O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; CHECK_O0-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; CHECK_O0-NEXT:    vmovaps %ymm1, (%rsi)
-; CHECK_O0-NEXT:    movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload
-; CHECK_O0-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm2 # 32-byte Reload
+; CHECK_O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; CHECK_O0-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; CHECK_O0-NEXT:    vmovdqa %ymm2, (%rdi)
 ; CHECK_O0-NEXT:    addq $152, %rsp
 ; CHECK_O0-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/avx512-mask-zext-bugfix.ll b/test/CodeGen/X86/avx512-mask-zext-bugfix.ll
index 4d939bd5b8c..fed87ebf6eb 100755
--- a/test/CodeGen/X86/avx512-mask-zext-bugfix.ll
+++ b/test/CodeGen/X86/avx512-mask-zext-bugfix.ll
@@ -17,50 +17,50 @@ declare i32 @check_mask16(i16 zeroext %res_mask, i16 zeroext %exp_mask, i8* %fna
 define void @test_xmm(i32 %shift, i32 %mulp, <2 x i64> %a,i8* %arraydecay,i8* %fname){
 ; CHECK-LABEL: test_xmm:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    subq $56, %rsp
-; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    subq $72, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 80
 ; CHECK-NEXT:    vpmovw2m %xmm0, %k0
 ; CHECK-NEXT:    movl $2, %esi
 ; CHECK-NEXT:    movl $8, %eax
 ; CHECK-NEXT:    movq %rdx, %rdi
-; CHECK-NEXT:    movq %rdx, {{[0-9]+}}(%rsp) ## 8-byte Spill
+; CHECK-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; CHECK-NEXT:    movl %eax, %edx
-; CHECK-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
-; CHECK-NEXT:    movq %rcx, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; CHECK-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%rsp) ## 16-byte Spill
+; CHECK-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; CHECK-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; CHECK-NEXT:    callq _calc_expected_mask_val
 ; CHECK-NEXT:    movl %eax, %edx
 ; CHECK-NEXT:    movw %dx, %r8w
 ; CHECK-NEXT:    movzwl %r8w, %esi
-; CHECK-NEXT:    kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; CHECK-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload
 ; CHECK-NEXT:    kmovb %k0, %edi
-; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rdx ## 8-byte Reload
-; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload
+; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx ## 8-byte Reload
+; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload
 ; CHECK-NEXT:    callq _check_mask16
-; CHECK-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Reload
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
 ; CHECK-NEXT:    vpmovd2m %xmm0, %k0
 ; CHECK-NEXT:    kmovq %k0, %k1
 ; CHECK-NEXT:    kmovd %k0, %esi
 ; CHECK-NEXT:    movb %sil, %r9b
 ; CHECK-NEXT:    movzbl %r9b, %esi
 ; CHECK-NEXT:    movw %si, %r8w
-; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rdi ## 8-byte Reload
+; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Reload
 ; CHECK-NEXT:    movl $4, %esi
-; CHECK-NEXT:    movl %esi, {{[0-9]+}}(%rsp) ## 4-byte Spill
-; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %edx ## 4-byte Reload
-; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill
-; CHECK-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
-; CHECK-NEXT:    movw %r8w, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx ## 4-byte Reload
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; CHECK-NEXT:    movw %r8w, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; CHECK-NEXT:    callq _calc_expected_mask_val
 ; CHECK-NEXT:    movw %ax, %r8w
-; CHECK-NEXT:    movw {{[0-9]+}}(%rsp), %r10w ## 2-byte Reload
+; CHECK-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %r10w ## 2-byte Reload
 ; CHECK-NEXT:    movzwl %r10w, %edi
 ; CHECK-NEXT:    movzwl %r8w, %esi
-; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rdx ## 8-byte Reload
-; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload
+; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx ## 8-byte Reload
+; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload
 ; CHECK-NEXT:    callq _check_mask16
-; CHECK-NEXT:    movl %eax, (%rsp) ## 4-byte Spill
-; CHECK-NEXT:    addq $56, %rsp
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT:    addq $72, %rsp
 ; CHECK-NEXT:    retq
   %d2 = bitcast <2 x i64> %a to <8 x i16>
   %m2 = call i8 @llvm.x86.avx512.cvtw2mask.128(<8 x i16> %d2)
diff --git a/test/CodeGen/X86/pr30430.ll b/test/CodeGen/X86/pr30430.ll
index 94deca3a292..a81e26c51a1 100644
--- a/test/CodeGen/X86/pr30430.ll
+++ b/test/CodeGen/X86/pr30430.ll
@@ -116,14 +116,14 @@ define <16 x float> @makefloat(float %f1, float %f2, float %f3, float %f4, float
 ; CHECK-NEXT:    vinsertf64x4 $1, %ymm1, %zmm24, %zmm24
 ; CHECK-NEXT:    vmovaps %zmm24, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    vmovaps {{[0-9]+}}(%rsp), %zmm0
-; CHECK-NEXT:    vmovss %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT:    vmovss %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT:    vmovss %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT:    vmovss %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT:    vmovss %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT:    vmovss %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT:    vmovss %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    vmovss %xmm14, (%rsp) # 4-byte Spill
+; CHECK-NEXT:    vmovss %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    vmovss %xmm15, (%rsp) # 4-byte Spill
 ; CHECK-NEXT:    movq %rbp, %rsp
 ; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    .cfi_def_cfa %rsp, 8
diff --git a/test/CodeGen/X86/pr32284.ll b/test/CodeGen/X86/pr32284.ll
index ab6680cf45a..3998fcec9c7 100644
--- a/test/CodeGen/X86/pr32284.ll
+++ b/test/CodeGen/X86/pr32284.ll
@@ -222,8 +222,8 @@ define void @f1() {
 ; 686-O0-NEXT:    movl %ebp, _ZN8struct_210member_2_0E
 ; 686-O0-NEXT:    movl $0, _ZN8struct_210member_2_0E+4
 ; 686-O0-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; 686-O0-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; 686-O0-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; 686-O0-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; 686-O0-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; 686-O0-NEXT:    movl %edi, (%esp) # 4-byte Spill
 ; 686-O0-NEXT:    addl $24, %esp
diff --git a/test/CodeGen/X86/pr32345.ll b/test/CodeGen/X86/pr32345.ll
index 3a2db27727a..65fcf055f28 100644
--- a/test/CodeGen/X86/pr32345.ll
+++ b/test/CodeGen/X86/pr32345.ll
@@ -77,8 +77,8 @@ define void @foo() {
 ; 6860-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; 6860-NEXT:    shrdl %cl, %edi, %esi
 ; 6860-NEXT:    testb $32, %bl
-; 6860-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; 6860-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; 6860-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; 6860-NEXT:    jne .LBB0_2
 ; 6860-NEXT:  # %bb.1: # %bb
 ; 6860-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
diff --git a/test/CodeGen/X86/pr34592.ll b/test/CodeGen/X86/pr34592.ll
index 34e80fb23c4..b010429d973 100644
--- a/test/CodeGen/X86/pr34592.ll
+++ b/test/CodeGen/X86/pr34592.ll
@@ -53,12 +53,12 @@ define <16 x i64> @pluto(<16 x i64> %arg, <16 x i64> %arg1, <16 x i64> %arg2, <1
 ; CHECK-NEXT:    vmovaps %ymm5, %ymm1
 ; CHECK-NEXT:    vmovaps %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; CHECK-NEXT:    vmovaps %ymm9, %ymm3
-; CHECK-NEXT:    vmovaps %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; CHECK-NEXT:    vmovaps %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; CHECK-NEXT:    vmovaps %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; CHECK-NEXT:    vmovaps %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; CHECK-NEXT:    vmovaps %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; CHECK-NEXT:    vmovaps %ymm14, (%rsp) # 32-byte Spill
+; CHECK-NEXT:    vmovaps %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    vmovaps %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    vmovaps %ymm4, (%rsp) # 32-byte Spill
 ; CHECK-NEXT:    movq %rbp, %rsp
 ; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    .cfi_def_cfa %rsp, 8
diff --git a/test/CodeGen/X86/pr34653.ll b/test/CodeGen/X86/pr34653.ll
index 54d2e714635..3578806596f 100644
--- a/test/CodeGen/X86/pr34653.ll
+++ b/test/CodeGen/X86/pr34653.ll
@@ -130,23 +130,12 @@ define void @pr34653() {
 ; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovsd %xmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovsd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovsd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovsd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovsd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovsd %xmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovsd %xmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovsd %xmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -155,12 +144,23 @@ define void @pr34653() {
 ; CHECK-NEXT:    vmovsd %xmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovsd %xmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovsd %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovsd %xmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovsd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovsd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovsd %xmm7, (%rsp) # 8-byte Spill
 ; CHECK-NEXT:    movq %rbp, %rsp
 ; CHECK-NEXT:    popq %rbp
-- 
GitLab


From 1f633af49eb599d342789b74c2b463b700f0c04d Mon Sep 17 00:00:00 2001
From: Kristof Beyls <kristof.beyls@arm.com>
Date: Wed, 7 Nov 2018 08:49:36 +0000
Subject: [PATCH 1052/1116] Introduce bug life cycle documentation.

Document what is expected during:
* triaging
* actively working on a bug
* closing/resolving

Also document how we maintain:
* product/component breakdown
* default-cc lists per component

Differential Revision: https://reviews.llvm.org/D53691


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346299 91177308-0d34-0410-b5e6-96231b3b80d8
---
 docs/BugLifeCycle.rst | 140 ++++++++++++++++++++++++++++++++++++++++++
 docs/Phabricator.rst  |   6 ++
 docs/index.rst        |   4 ++
 3 files changed, 150 insertions(+)
 create mode 100644 docs/BugLifeCycle.rst

diff --git a/docs/BugLifeCycle.rst b/docs/BugLifeCycle.rst
new file mode 100644
index 00000000000..c74aa1d3a62
--- /dev/null
+++ b/docs/BugLifeCycle.rst
@@ -0,0 +1,140 @@
+===================
+LLVM Bug Life Cycle
+===================
+
+.. contents::
+   :local:
+
+
+
+Introduction - Achieving consistency in how we deal with bug reports
+====================================================================
+
+We aim to achieve a basic level of consistency in how reported bugs evolve from
+being reported, to being worked on, and finally getting closed out. The
+consistency helps reporters, developers and others to gain a better
+understanding of what a particular bug state actually means and what to expect
+might happen next.
+
+At the same time, we aim to not over-specify the life cycle of bugs in the
+`the LLVM Bug Tracking System <https://bugs.llvm.org/enter_bug.cgi>`_, as the
+overall goal is to make it easier to work with and understand the bug reports.
+
+The main parts of the life cycle documented here are:
+
+#. `Reporting`_
+#. `Triaging`_
+#. `Actively working on fixing`_
+#. `Closing`_
+
+Furthermore, some of the metadata in the bug tracker, such as who to notify on
+newly reported bugs or what the breakdown into products & components is we use,
+needs to be maintained. See the following for details:
+
+#. `Maintenance of Bug products/component metadata`_
+#. `Maintenance of cc-by-default settings`_
+
+
+.. _Reporting:
+
+Reporting bugs
+==============
+
+See :doc:`HowToSubmitABug` on further details on how to submit good bug reports.
+
+Make sure that you have one or more people on cc on the bug report that you
+think will react to it. We aim to automatically add specific people on cc for
+most products/components, but may not always succeed in doing so.
+
+If you know the area of LLVM code the root cause of the bug is in, good
+candidates to add as cc may be the same people you'd ask for a code review in
+that area. See :ref:`finding-potential-reviewers` for more details.
+
+
+.. _Triaging:
+
+Triaging bugs
+=============
+
+Bugs with status NEW indicate that they still need to be triaged.
+When triage is complete, the status of the bug is moved to CONFIRMED.
+
+The goal of triaging a bug is to make sure a newly reported bug ends up in a
+good, actionable, state. Try to answer the following questions while triaging.
+
+* Is the reported behavior actually wrong?
+
+  * E.g. does a miscompile example depend on undefined behavior?
+
+* Can you easily reproduce the bug?
+
+  * If not, are there reasonable excuses why it cannot easily be reproduced?
+
+* Is it related to an already reported bug?
+
+  * Use the "See also"/"depends on"/"blocks" fields if so.
+  * Close it as a duplicate if so, pointing to the issue it duplicates.
+
+* Are the following fields filled in correctly?
+
+  * Product
+  * Component
+  * Title
+
+* CC others not already cc’ed that you happen to know would be good to pull in.
+* Add the "beginner" keyword if you think this would be a good bug to be fixed
+  by someone new to LLVM.
+
+.. _Actively working on fixing:
+
+Actively working on fixing bugs
+===============================
+
+Please remember to assign the bug to yourself if you're actively working on
+fixing it and to unassign it when you're no longer actively working on it.  You
+unassign a bug by setting the Assignee field to "unassignedbugs@nondot.org".
+
+.. _Closing:
+
+Resolving/Closing bugs
+======================
+
+For simplicity, we only have 1 status for all resolved or closed bugs:
+RESOLVED.
+
+Resolving bugs is good! Make sure to properly record the reason for resolving.
+Examples of reasons for resolving are:
+
+* Revision NNNNNN fixed the bug.
+* The bug cannot be reproduced with revision NNNNNN.
+* The circumstances for the bug don't apply anymore.
+* There is a sound reason for not fixing it (WONTFIX).
+* There is a specific and plausible reason to think that a given bug is
+  otherwise inapplicable or obsolete.
+
+  * One example is an old open bug that doesn't contain enough information to
+    clearly understand the problem being reported (e.g. not reproducible). It is
+    fine to resolve such a bug e.g. with resolution WORKSFORME and leaving a
+    comment to encourage the reporter to reopen the bug with more information
+    if it's still reproducable on their end.
+
+If a bug is resolved, please fill in the revision number it was fixed in in the
+"Fixed by Commit(s)" field.
+
+
+.. _Maintenance of Bug products/component metadata:
+
+Maintenance of products/components metadata
+===========================================
+
+Please raise a bug against "Bugzilla Admin"/"Products" to request any changes
+to be made to the breakdown of products & components modeled in Bugzilla.
+
+
+.. _Maintenance of cc-by-default settings:
+
+Maintenance of cc-by-default settings
+=====================================
+
+Please raise a bug against "Bugzilla Admin"/"Products" to request any changes
+to be made to the cc-by-default settings for specific components.
diff --git a/docs/Phabricator.rst b/docs/Phabricator.rst
index 53cb3b5980a..640e1611da6 100644
--- a/docs/Phabricator.rst
+++ b/docs/Phabricator.rst
@@ -94,6 +94,12 @@ them to participate. Many people will see the email notification on cfe-commits
 or llvm-commits, and if the subject line suggests the patch is something they
 should look at, they will.
 
+
+.. _finding-potential-reviewers:
+
+Finding potential reviewers
+---------------------------
+
 Here are a couple of ways to pick the initial reviewer(s):
 
 * Use ``svn blame`` and the commit log to find names of people who have
diff --git a/docs/index.rst b/docs/index.rst
index de9218e6f4c..df70de095bd 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -454,6 +454,7 @@ Information about LLVM's development process.
    Packaging
    ReleaseProcess
    Phabricator
+   BugLifeCycle
 
 :doc:`Contributing`
    An overview on how to contribute to LLVM.
@@ -484,6 +485,9 @@ Information about LLVM's development process.
    Describes how to use the Phabricator code review tool hosted on
    http://reviews.llvm.org/ and its command line interface, Arcanist.
 
+:doc:`BugLifeCycle`
+   Describes how bugs are reported, triaged and closed.
+
 Community
 =========
 
-- 
GitLab


From 20a6832a83a4427fd87a9ea942b177153455c233 Mon Sep 17 00:00:00 2001
From: Dean Michael Berris <dberris@google.com>
Date: Wed, 7 Nov 2018 11:44:00 +0000
Subject: [PATCH 1053/1116] [XRay] Use explicit string conversion

Instead of using std::copy(...), use a conversion to string instead from
StringRef to std::string.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346304 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/XRay/FDRTraceExpander.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lib/XRay/FDRTraceExpander.cpp b/lib/XRay/FDRTraceExpander.cpp
index e67f4b5d89f..6fe3f86c3da 100644
--- a/lib/XRay/FDRTraceExpander.cpp
+++ b/lib/XRay/FDRTraceExpander.cpp
@@ -45,8 +45,7 @@ Error TraceExpander::visit(CustomEventRecord &R) {
     CurrentRecord.PId = PID;
     CurrentRecord.TId = TID;
     CurrentRecord.Type = RecordTypes::CUSTOM_EVENT;
-    std::copy(R.data().begin(), R.data().end(),
-              std::back_inserter(CurrentRecord.Data));
+    CurrentRecord.Data = R.data();
     BuildingRecord = true;
   }
   return Error::success();
-- 
GitLab


From d22ff2569080badbdbeced79d2787cf98cda873d Mon Sep 17 00:00:00 2001
From: Petar Avramovic <Petar.Avramovic@rt-rk.com>
Date: Wed, 7 Nov 2018 11:45:43 +0000
Subject: [PATCH 1054/1116] [MIPS GlobalISel] Set operand order for G_MERGE and
 G_UNMERGE

Set operands order for G_MERGE_VALUES and G_UNMERGE_VALUES so
that least significant bits always go first, regardless of endianness.

Differential Revision: https://reviews.llvm.org/D54098


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346305 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/Mips/MipsCallLowering.cpp          |  8 ++---
 lib/Target/Mips/MipsCallLowering.h            |  2 +-
 lib/Target/Mips/MipsLegalizerInfo.cpp         |  8 ++---
 .../GlobalISel/irtranslator/split_args.ll     | 30 +++++++++----------
 .../CodeGen/Mips/GlobalISel/legalizer/add.mir | 10 +++----
 .../Mips/GlobalISel/legalizer/constants.mir   |  4 +--
 6 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/lib/Target/Mips/MipsCallLowering.cpp b/lib/Target/Mips/MipsCallLowering.cpp
index 4d070f9f523..c550fadf663 100644
--- a/lib/Target/Mips/MipsCallLowering.cpp
+++ b/lib/Target/Mips/MipsCallLowering.cpp
@@ -45,9 +45,9 @@ bool MipsCallLowering::MipsHandler::assignVRegs(ArrayRef<unsigned> VRegs,
   return true;
 }
 
-void MipsCallLowering::MipsHandler::setMostSignificantFirst(
+void MipsCallLowering::MipsHandler::setLeastSignificantFirst(
     SmallVectorImpl<unsigned> &VRegs) {
-  if (MIRBuilder.getMF().getDataLayout().isLittleEndian())
+  if (!MIRBuilder.getMF().getDataLayout().isLittleEndian())
     std::reverse(VRegs.begin(), VRegs.end());
 }
 
@@ -181,7 +181,7 @@ bool IncomingValueHandler::handleSplit(SmallVectorImpl<unsigned> &VRegs,
                                        unsigned ArgsReg) {
   if (!assignVRegs(VRegs, ArgLocs, ArgLocsStartIndex))
     return false;
-  setMostSignificantFirst(VRegs);
+  setLeastSignificantFirst(VRegs);
   MIRBuilder.buildMerge(ArgsReg, VRegs);
   return true;
 }
@@ -283,7 +283,7 @@ bool OutgoingValueHandler::handleSplit(SmallVectorImpl<unsigned> &VRegs,
                                        unsigned ArgLocsStartIndex,
                                        unsigned ArgsReg) {
   MIRBuilder.buildUnmerge(VRegs, ArgsReg);
-  setMostSignificantFirst(VRegs);
+  setLeastSignificantFirst(VRegs);
   if (!assignVRegs(VRegs, ArgLocs, ArgLocsStartIndex))
     return false;
 
diff --git a/lib/Target/Mips/MipsCallLowering.h b/lib/Target/Mips/MipsCallLowering.h
index a0d4464e2c0..9916b04ef50 100644
--- a/lib/Target/Mips/MipsCallLowering.h
+++ b/lib/Target/Mips/MipsCallLowering.h
@@ -38,7 +38,7 @@ public:
     bool assignVRegs(ArrayRef<unsigned> VRegs, ArrayRef<CCValAssign> ArgLocs,
                      unsigned Index);
 
-    void setMostSignificantFirst(SmallVectorImpl<unsigned> &VRegs);
+    void setLeastSignificantFirst(SmallVectorImpl<unsigned> &VRegs);
 
     MachineIRBuilder &MIRBuilder;
     MachineRegisterInfo &MRI;
diff --git a/lib/Target/Mips/MipsLegalizerInfo.cpp b/lib/Target/Mips/MipsLegalizerInfo.cpp
index 525f2143190..02701f31e32 100644
--- a/lib/Target/Mips/MipsLegalizerInfo.cpp
+++ b/lib/Target/Mips/MipsLegalizerInfo.cpp
@@ -80,15 +80,15 @@ bool MipsLegalizerInfo::legalizeCustom(MachineInstr &MI,
     unsigned Carry = MRI.createGenericVirtualRegister(sHalf);
     unsigned TmpResHigh = MRI.createGenericVirtualRegister(sHalf);
 
-    MIRBuilder.buildUnmerge({RHSHigh, RHSLow}, MI.getOperand(2).getReg());
-    MIRBuilder.buildUnmerge({LHSHigh, LHSLow}, MI.getOperand(1).getReg());
+    MIRBuilder.buildUnmerge({RHSLow, RHSHigh}, MI.getOperand(2).getReg());
+    MIRBuilder.buildUnmerge({LHSLow, LHSHigh}, MI.getOperand(1).getReg());
 
     MIRBuilder.buildAdd(TmpResHigh, LHSHigh, RHSHigh);
     MIRBuilder.buildAdd(ResLow, LHSLow, RHSLow);
     MIRBuilder.buildICmp(CmpInst::ICMP_ULT, Carry, ResLow, LHSLow);
     MIRBuilder.buildAdd(ResHigh, TmpResHigh, Carry);
 
-    MIRBuilder.buildMerge(MI.getOperand(0).getReg(), {ResHigh, ResLow});
+    MIRBuilder.buildMerge(MI.getOperand(0).getReg(), {ResLow, ResHigh});
 
     MI.eraseFromParent();
     break;
@@ -109,7 +109,7 @@ bool MipsLegalizerInfo::legalizeCustom(MachineInstr &MI,
         ResHigh, *ConstantInt::get(MI.getMF()->getFunction().getContext(),
                                    CImmValue.lshr(Size / 2).trunc(Size / 2)));
 
-    MIRBuilder.buildMerge(MI.getOperand(0).getReg(), {ResHigh, ResLow});
+    MIRBuilder.buildMerge(MI.getOperand(0).getReg(), {ResLow, ResHigh});
 
     MI.eraseFromParent();
     break;
diff --git a/test/CodeGen/Mips/GlobalISel/irtranslator/split_args.ll b/test/CodeGen/Mips/GlobalISel/irtranslator/split_args.ll
index f51b72060de..13ffd24bcb9 100644
--- a/test/CodeGen/Mips/GlobalISel/irtranslator/split_args.ll
+++ b/test/CodeGen/Mips/GlobalISel/irtranslator/split_args.ll
@@ -6,10 +6,10 @@ define i64 @i64_reg(i64 %a) {
   ; MIPS32:   liveins: $a0, $a1
   ; MIPS32:   [[COPY:%[0-9]+]]:_(s32) = COPY $a0
   ; MIPS32:   [[COPY1:%[0-9]+]]:_(s32) = COPY $a1
-  ; MIPS32:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY1]](s32), [[COPY]](s32)
+  ; MIPS32:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; MIPS32:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](s64)
-  ; MIPS32:   $v0 = COPY [[UV1]](s32)
-  ; MIPS32:   $v1 = COPY [[UV]](s32)
+  ; MIPS32:   $v0 = COPY [[UV]](s32)
+  ; MIPS32:   $v1 = COPY [[UV1]](s32)
   ; MIPS32:   RetRA implicit $v0, implicit $v1
 entry:
   ret i64 %a
@@ -30,10 +30,10 @@ define i64 @i64_stack(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i64 %a) {
   ; MIPS32:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p0) :: (load 4 from %fixed-stack.[[STACK1]], align 0)
   ; MIPS32:   [[FRAME_INDEX1:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.0
   ; MIPS32:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p0) :: (load 4 from %fixed-stack.[[STACK0]], align 0)
-  ; MIPS32:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD1]](s32), [[LOAD]](s32)
+  ; MIPS32:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32)
   ; MIPS32:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](s64)
-  ; MIPS32:   $v0 = COPY [[UV1]](s32)
-  ; MIPS32:   $v1 = COPY [[UV]](s32)
+  ; MIPS32:   $v0 = COPY [[UV]](s32)
+  ; MIPS32:   $v1 = COPY [[UV1]](s32)
   ; MIPS32:   RetRA implicit $v0, implicit $v1
 entry:
   ret i64 %a
@@ -46,10 +46,10 @@ define i64 @i64_reg_allign(i32 %a0, i64 %a) {
   ; MIPS32:   [[COPY:%[0-9]+]]:_(s32) = COPY $a0
   ; MIPS32:   [[COPY1:%[0-9]+]]:_(s32) = COPY $a2
   ; MIPS32:   [[COPY2:%[0-9]+]]:_(s32) = COPY $a3
-  ; MIPS32:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY1]](s32)
+  ; MIPS32:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
   ; MIPS32:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](s64)
-  ; MIPS32:   $v0 = COPY [[UV1]](s32)
-  ; MIPS32:   $v1 = COPY [[UV]](s32)
+  ; MIPS32:   $v0 = COPY [[UV]](s32)
+  ; MIPS32:   $v1 = COPY [[UV1]](s32)
   ; MIPS32:   RetRA implicit $v0, implicit $v1
 entry:
   ret i64 %a
@@ -73,10 +73,10 @@ define i64 @i64_stack_allign(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %s16, i64 %
   ; MIPS32:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p0) :: (load 4 from %fixed-stack.[[STACK1]], align 0)
   ; MIPS32:   [[FRAME_INDEX2:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.0
   ; MIPS32:   [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX2]](p0) :: (load 4 from %fixed-stack.[[STACK0]], align 0)
-  ; MIPS32:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD2]](s32), [[LOAD1]](s32)
+  ; MIPS32:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD1]](s32), [[LOAD2]](s32)
   ; MIPS32:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](s64)
-  ; MIPS32:   $v0 = COPY [[UV1]](s32)
-  ; MIPS32:   $v1 = COPY [[UV]](s32)
+  ; MIPS32:   $v0 = COPY [[UV]](s32)
+  ; MIPS32:   $v1 = COPY [[UV1]](s32)
   ; MIPS32:   RetRA implicit $v0, implicit $v1
 entry:
   ret i64 %a
@@ -96,10 +96,10 @@ define i64 @i64_reg_stack(i32 %a0, i32 %a1, i32 %a2, i64 %a) {
   ; MIPS32:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p0) :: (load 4 from %fixed-stack.[[STACK1]], align 0)
   ; MIPS32:   [[FRAME_INDEX1:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.0
   ; MIPS32:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p0) :: (load 4 from %fixed-stack.[[STACK0]], align 0)
-  ; MIPS32:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD1]](s32), [[LOAD]](s32)
+  ; MIPS32:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32)
   ; MIPS32:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](s64)
-  ; MIPS32:   $v0 = COPY [[UV1]](s32)
-  ; MIPS32:   $v1 = COPY [[UV]](s32)
+  ; MIPS32:   $v0 = COPY [[UV]](s32)
+  ; MIPS32:   $v1 = COPY [[UV1]](s32)
   ; MIPS32:   RetRA implicit $v0, implicit $v1
 entry:
   ret i64 %a
diff --git a/test/CodeGen/Mips/GlobalISel/legalizer/add.mir b/test/CodeGen/Mips/GlobalISel/legalizer/add.mir
index efd071636b5..ff9ae06a937 100644
--- a/test/CodeGen/Mips/GlobalISel/legalizer/add.mir
+++ b/test/CodeGen/Mips/GlobalISel/legalizer/add.mir
@@ -226,12 +226,12 @@ body:             |
     ; MIPS32: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1
     ; MIPS32: [[COPY2:%[0-9]+]]:_(s32) = COPY $a2
     ; MIPS32: [[COPY3:%[0-9]+]]:_(s32) = COPY $a3
-    ; MIPS32: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY3]], [[COPY1]]
-    ; MIPS32: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[COPY2]], [[COPY]]
-    ; MIPS32: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD1]](s32), [[COPY2]]
+    ; MIPS32: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY2]], [[COPY]]
+    ; MIPS32: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[COPY3]], [[COPY1]]
+    ; MIPS32: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD1]](s32), [[COPY3]]
     ; MIPS32: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[ICMP]]
-    ; MIPS32: $v0 = COPY [[ADD1]](s32)
-    ; MIPS32: $v1 = COPY [[ADD2]](s32)
+    ; MIPS32: $v0 = COPY [[ADD2]](s32)
+    ; MIPS32: $v1 = COPY [[ADD1]](s32)
     ; MIPS32: RetRA implicit $v0, implicit $v1
     %2:_(s32) = COPY $a0
     %3:_(s32) = COPY $a1
diff --git a/test/CodeGen/Mips/GlobalISel/legalizer/constants.mir b/test/CodeGen/Mips/GlobalISel/legalizer/constants.mir
index 4ed50f2d7ef..d223411c58a 100644
--- a/test/CodeGen/Mips/GlobalISel/legalizer/constants.mir
+++ b/test/CodeGen/Mips/GlobalISel/legalizer/constants.mir
@@ -21,8 +21,8 @@ body:             |
     ; MIPS32-LABEL: name: any_i64
     ; MIPS32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; MIPS32: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648
-    ; MIPS32: $v0 = COPY [[C]](s32)
-    ; MIPS32: $v1 = COPY [[C1]](s32)
+    ; MIPS32: $v0 = COPY [[C1]](s32)
+    ; MIPS32: $v1 = COPY [[C]](s32)
     ; MIPS32: RetRA implicit $v0, implicit $v1
     %0:_(s64) = G_CONSTANT i64 -9223372036854775808
     %1:_(s32), %2:_(s32) = G_UNMERGE_VALUES %0(s64)
-- 
GitLab


From b44657a6b76bc280752fcd0e011d1ec5f0815055 Mon Sep 17 00:00:00 2001
From: Dean Michael Berris <dberris@google.com>
Date: Wed, 7 Nov 2018 11:52:22 +0000
Subject: [PATCH 1055/1116] [XRay] Clean up more std::copy(...)'s

Update a couple more places to use conversion from StringRef to string.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346306 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/XRay/FDRTraceExpander.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/lib/XRay/FDRTraceExpander.cpp b/lib/XRay/FDRTraceExpander.cpp
index 6fe3f86c3da..a6e1521da87 100644
--- a/lib/XRay/FDRTraceExpander.cpp
+++ b/lib/XRay/FDRTraceExpander.cpp
@@ -60,8 +60,7 @@ Error TraceExpander::visit(CustomEventRecordV5 &R) {
     CurrentRecord.PId = PID;
     CurrentRecord.TId = TID;
     CurrentRecord.Type = RecordTypes::CUSTOM_EVENT;
-    std::copy(R.data().begin(), R.data().end(),
-              std::back_inserter(CurrentRecord.Data));
+    CurrentRecord.Data = R.data();
     BuildingRecord = true;
   }
   return Error::success();
@@ -77,8 +76,7 @@ Error TraceExpander::visit(TypedEventRecord &R) {
     CurrentRecord.TId = TID;
     CurrentRecord.RecordType = R.eventType();
     CurrentRecord.Type = RecordTypes::TYPED_EVENT;
-    std::copy(R.data().begin(), R.data().end(),
-              std::back_inserter(CurrentRecord.Data));
+    CurrentRecord.Data = R.data();
     BuildingRecord = true;
   }
   return Error::success();
-- 
GitLab


From fff37cd57e7b9e62bce1cedfc80b288ecff68790 Mon Sep 17 00:00:00 2001
From: Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>
Date: Wed, 7 Nov 2018 12:26:00 +0000
Subject: [PATCH 1056/1116] [X86][FixupLEA] Avoid checking target features for
 every single processed instruction. NFCI

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346309 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86FixupLEAs.cpp | 42 +++++++++++++++++++--------------
 1 file changed, 24 insertions(+), 18 deletions(-)

diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp
index da5f1695750..ad42cb87804 100644
--- a/lib/Target/X86/X86FixupLEAs.cpp
+++ b/lib/Target/X86/X86FixupLEAs.cpp
@@ -39,8 +39,8 @@ class FixupLEAPass : public MachineFunctionPass {
   /// Loop over all of the instructions in the basic block
   /// replacing applicable instructions with LEA instructions,
   /// where appropriate.
-  bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI);
-
+  bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI,
+                         bool IsSlowLEA, bool IsSlow3OpsLEA);
 
   /// Given a machine register, look for the instruction
   /// which writes it in the current basic block. If found,
@@ -192,8 +192,11 @@ bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) {
 
   MF = &Func;
   const X86Subtarget &ST = Func.getSubtarget<X86Subtarget>();
+  bool IsSlowLEA = ST.slowLEA();
+  bool IsSlow3OpsLEA = ST.slow3OpsLEA();
+
   OptIncDec = !ST.slowIncDec() || Func.getFunction().optForMinSize();
-  OptLEA = ST.LEAusesAG() || ST.slowLEA() || ST.slow3OpsLEA();
+  OptLEA = ST.LEAusesAG() || IsSlowLEA || IsSlow3OpsLEA;
 
   if (!OptLEA && !OptIncDec)
     return false;
@@ -204,7 +207,7 @@ bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) {
   LLVM_DEBUG(dbgs() << "Start X86FixupLEAs\n";);
   // Process all basic blocks.
   for (MachineFunction::iterator I = Func.begin(), E = Func.end(); I != E; ++I)
-    processBasicBlock(Func, I);
+    processBasicBlock(Func, I, IsSlowLEA, IsSlow3OpsLEA);
   LLVM_DEBUG(dbgs() << "End X86FixupLEAs\n";);
 
   return true;
@@ -280,8 +283,9 @@ static inline bool isInefficientLEAReg(unsigned int Reg) {
 static inline bool isRegOperand(const MachineOperand &Op) {
   return Op.isReg() && Op.getReg() != X86::NoRegister;
 }
-/// hasIneffecientLEARegs - LEA that uses base and index registers
-/// where the base is EBP, RBP, or R13
+
+/// Returns true if this LEA uses base an index registers, and the base register
+/// is known to be inefficient for the subtarget.
 // TODO: use a variant scheduling class to model the latency profile
 // of LEA instructions, and implement this logic as a scheduling predicate.
 static inline bool hasInefficientLEABaseReg(const MachineOperand &Base,
@@ -566,26 +570,28 @@ FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI,
 }
 
 bool FixupLEAPass::processBasicBlock(MachineFunction &MF,
-                                     MachineFunction::iterator MFI) {
-
+                                     MachineFunction::iterator MFI,
+                                     bool IsSlowLEA, bool IsSlow3OpsLEA) {
   for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I) {
     if (OptIncDec)
       if (fixupIncDec(I, MFI))
         continue;
 
     if (OptLEA) {
-      if (MF.getSubtarget<X86Subtarget>().slowLEA())
+      if (IsSlowLEA) {
         processInstructionForSlowLEA(I, MFI);
-
-      else {
-        if (MF.getSubtarget<X86Subtarget>().slow3OpsLEA()) {
-          if (auto *NewMI = processInstrForSlow3OpLEA(*I, MFI)) {
-            MFI->erase(I);
-            I = NewMI;
-          }
-        } else
-          processInstruction(I, MFI);
+        continue;
+      }
+      
+      if (IsSlow3OpsLEA) {
+        if (auto *NewMI = processInstrForSlow3OpLEA(*I, MFI)) {
+          MFI->erase(I);
+          I = NewMI;
+        }
+        continue;
       }
+
+      processInstruction(I, MFI);
     }
   }
   return false;
-- 
GitLab


From bb746321447cbfed1f650392a7506371a165a793 Mon Sep 17 00:00:00 2001
From: Calixte Denizet <cdenizet@mozilla.com>
Date: Wed, 7 Nov 2018 13:49:17 +0000
Subject: [PATCH 1057/1116] [GCOV] Flush counters before to avoid counting the
 execution before fork twice and for exec** functions we must flush before the
 call

Summary:
This is replacement for patch in https://reviews.llvm.org/D49460.
When we fork, the counters are duplicate as they're and so the values are finally wrong when writing gcda for parent and child.
So just before to fork, we flush the counters and so the parent and the child have new counters set to zero.
For exec** functions, we need to flush before the call to have some data.

Reviewers: vsk, davidxl, marco-c

Reviewed By: marco-c

Subscribers: llvm-commits, sylvestre.ledru, marco-c

Differential Revision: https://reviews.llvm.org/D53593

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346313 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/TargetLibraryInfo.def   | 27 ++++++++++++++
 lib/Analysis/TargetLibraryInfo.cpp            | 24 +++++++++++++
 .../Instrumentation/GCOVProfiling.cpp         | 36 +++++++++++++++++++
 3 files changed, 87 insertions(+)

diff --git a/include/llvm/Analysis/TargetLibraryInfo.def b/include/llvm/Analysis/TargetLibraryInfo.def
index f94debba9c5..518a85ee1a0 100644
--- a/include/llvm/Analysis/TargetLibraryInfo.def
+++ b/include/llvm/Analysis/TargetLibraryInfo.def
@@ -565,6 +565,30 @@ TLI_DEFINE_STRING_INTERNAL("cosl")
 /// char *ctermid(char *s);
 TLI_DEFINE_ENUM_INTERNAL(ctermid)
 TLI_DEFINE_STRING_INTERNAL("ctermid")
+/// int execl(const char *path, const char *arg, ...);
+TLI_DEFINE_ENUM_INTERNAL(execl)
+TLI_DEFINE_STRING_INTERNAL("execl")
+/// int execle(const char *file, const char *arg, ..., char * const envp[]);
+TLI_DEFINE_ENUM_INTERNAL(execle)
+TLI_DEFINE_STRING_INTERNAL("execle")
+/// int execlp(const char *file, const char *arg, ...);
+TLI_DEFINE_ENUM_INTERNAL(execlp)
+TLI_DEFINE_STRING_INTERNAL("execlp")
+/// int execv(const char *path, char *const argv[]);
+TLI_DEFINE_ENUM_INTERNAL(execv)
+TLI_DEFINE_STRING_INTERNAL("execv")
+/// int execvP(const char *file, const char *search_path, char *const argv[]);
+TLI_DEFINE_ENUM_INTERNAL(execvP)
+TLI_DEFINE_STRING_INTERNAL("execvP")
+/// int execve(const char *filename, char *const argv[], char *const envp[]);
+TLI_DEFINE_ENUM_INTERNAL(execve)
+TLI_DEFINE_STRING_INTERNAL("execve")
+/// int execvp(const char *file, char *const argv[]);
+TLI_DEFINE_ENUM_INTERNAL(execvp)
+TLI_DEFINE_STRING_INTERNAL("execvp")
+/// int execvpe(const char *file, char *const argv[], char *const envp[]);
+TLI_DEFINE_ENUM_INTERNAL(execvpe)
+TLI_DEFINE_STRING_INTERNAL("execvpe")
 /// double exp(double x);
 TLI_DEFINE_ENUM_INTERNAL(exp)
 TLI_DEFINE_STRING_INTERNAL("exp")
@@ -709,6 +733,9 @@ TLI_DEFINE_STRING_INTERNAL("fopen")
 /// FILE *fopen64(const char *filename, const char *opentype)
 TLI_DEFINE_ENUM_INTERNAL(fopen64)
 TLI_DEFINE_STRING_INTERNAL("fopen64")
+/// int fork();
+TLI_DEFINE_ENUM_INTERNAL(fork)
+TLI_DEFINE_STRING_INTERNAL("fork")
 /// int fprintf(FILE *stream, const char *format, ...);
 TLI_DEFINE_ENUM_INTERNAL(fprintf)
 TLI_DEFINE_STRING_INTERNAL("fprintf")
diff --git a/lib/Analysis/TargetLibraryInfo.cpp b/lib/Analysis/TargetLibraryInfo.cpp
index b3cd40e098e..e6c6b4a76d6 100644
--- a/lib/Analysis/TargetLibraryInfo.cpp
+++ b/lib/Analysis/TargetLibraryInfo.cpp
@@ -613,6 +613,28 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
   unsigned NumParams = FTy.getNumParams();
 
   switch (F) {
+  case LibFunc_execl:
+  case LibFunc_execlp:
+    return (NumParams >= 2 && FTy.getParamType(0)->isPointerTy() &&
+            FTy.getParamType(1)->isPointerTy() &&
+            FTy.getReturnType()->isIntegerTy(32));
+  case LibFunc_execle:
+    return (NumParams >= 3 && FTy.getParamType(0)->isPointerTy() &&
+            FTy.getParamType(1)->isPointerTy() &&
+            FTy.getParamType(NumParams - 1)->isPointerTy() &&
+            FTy.getReturnType()->isIntegerTy(32));
+  case LibFunc_execv:
+  case LibFunc_execvp:
+    return (NumParams == 2 && FTy.getParamType(0)->isPointerTy() &&
+            FTy.getParamType(1)->isPointerTy() &&
+            FTy.getReturnType()->isIntegerTy(32));
+  case LibFunc_execvP:
+  case LibFunc_execvpe:
+  case LibFunc_execve:
+    return (NumParams == 3 && FTy.getParamType(0)->isPointerTy() &&
+            FTy.getParamType(1)->isPointerTy() &&
+            FTy.getParamType(2)->isPointerTy() &&
+            FTy.getReturnType()->isIntegerTy(32));
   case LibFunc_strlen:
     return (NumParams == 1 && FTy.getParamType(0)->isPointerTy() &&
             FTy.getReturnType()->isIntegerTy());
@@ -863,6 +885,8 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
     return (NumParams == 2 && FTy.getReturnType()->isPointerTy() &&
             FTy.getParamType(0)->isPointerTy() &&
             FTy.getParamType(1)->isPointerTy());
+  case LibFunc_fork:
+    return (NumParams == 0 && FTy.getReturnType()->isIntegerTy(32));
   case LibFunc_fdopen:
     return (NumParams == 2 && FTy.getReturnType()->isPointerTy() &&
             FTy.getParamType(1)->isPointerTy());
diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index 01938a0f357..084e6b7e436 100644
--- a/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -109,6 +109,8 @@ private:
   insertCounterWriteout(ArrayRef<std::pair<GlobalVariable *, MDNode *>>);
   Function *insertFlush(ArrayRef<std::pair<GlobalVariable *, MDNode *>>);
 
+  void AddFlushBeforeForkAndExec();
+
   enum class GCovFileType { GCNO, GCDA };
   std::string mangleName(const DICompileUnit *CU, GCovFileType FileType);
 
@@ -468,6 +470,8 @@ bool GCOVProfiler::runOnModule(Module &M, const TargetLibraryInfo &TLI) {
   this->TLI = &TLI;
   Ctx = &M.getContext();
 
+  AddFlushBeforeForkAndExec();
+
   if (Options.EmitNotes) emitProfileNotes();
   if (Options.EmitData) return emitProfileArcs();
   return false;
@@ -524,6 +528,38 @@ static bool shouldKeepInEntry(BasicBlock::iterator It) {
 	return false;
 }
 
+void GCOVProfiler::AddFlushBeforeForkAndExec() {
+  SmallVector<Instruction *, 2> ForkAndExecs;
+  for (auto &F : M->functions()) {
+    for (auto &I : instructions(F)) {
+      if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+        if (Function *Callee = CI->getCalledFunction()) {
+          LibFunc LF;
+          if (TLI->getLibFunc(*Callee, LF) &&
+              (LF == LibFunc_fork || LF == LibFunc_execl ||
+               LF == LibFunc_execle || LF == LibFunc_execlp ||
+               LF == LibFunc_execv || LF == LibFunc_execvp ||
+               LF == LibFunc_execve || LF == LibFunc_execvpe ||
+               LF == LibFunc_execvP)) {
+            ForkAndExecs.push_back(&I);
+          }
+        }
+      }
+    }
+  }
+
+  // We need to split the block after the fork/exec call
+  // because else the counters for the lines after will be
+  // the same as before the call.
+  for (auto I : ForkAndExecs) {
+    IRBuilder<> Builder(I);
+    FunctionType *FTy = FunctionType::get(Builder.getVoidTy(), {}, false);
+    Constant *GCOVFlush = M->getOrInsertFunction("__gcov_flush", FTy);
+    Builder.CreateCall(GCOVFlush);
+    I->getParent()->splitBasicBlock(I);
+  }
+}
+
 void GCOVProfiler::emitProfileNotes() {
   NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu");
   if (!CU_Nodes) return;
-- 
GitLab


From 238b8867a8e7a18e9cdf1c72b049d8bed200ee7a Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Wed, 7 Nov 2018 14:12:41 +0000
Subject: [PATCH 1058/1116] [InstCombine] do not shrink switch conditions to
 illegal types (PR29009)

This patch makes shrinking switch conditions less aggressive which was introduced by:
rL274233

Note that we have 2 new bugs to track potential follow-ups that might have solved PR29009
in different ways:
https://bugs.llvm.org/show_bug.cgi?id=39569
https://bugs.llvm.org/show_bug.cgi?id=39578

Patch by:
@dendibakh (Denis Bakhvalov)

Differential Revision: https://reviews.llvm.org/D54115


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346315 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstCombine/InstructionCombining.cpp      |  8 ++-
 test/Transforms/InstCombine/narrow-switch.ll  | 68 +++++++++++++++++--
 2 files changed, 66 insertions(+), 10 deletions(-)

diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
index fd64cc58a1d..3e72b1da779 100644
--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -2510,9 +2510,11 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) {
   unsigned NewWidth = Known.getBitWidth() - std::max(LeadingKnownZeros, LeadingKnownOnes);
 
   // Shrink the condition operand if the new type is smaller than the old type.
-  // This may produce a non-standard type for the switch, but that's ok because
-  // the backend should extend back to a legal type for the target.
-  if (NewWidth > 0 && NewWidth < Known.getBitWidth()) {
+  // But do not shrink to a non-standard type, because backend can't generate 
+  // good code for that yet.
+  // TODO: We can make it agressive again after fixing PR39569.
+  if (NewWidth > 0 && NewWidth < Known.getBitWidth() &&
+      shouldChangeType(Known.getBitWidth(), NewWidth)) {
     IntegerType *Ty = IntegerType::get(SI.getContext(), NewWidth);
     Builder.SetInsertPoint(&SI);
     Value *NewCond = Builder.CreateTrunc(Cond, Ty, "trunc");
diff --git a/test/Transforms/InstCombine/narrow-switch.ll b/test/Transforms/InstCombine/narrow-switch.ll
index 474bd820c8f..a8fa3e528db 100644
--- a/test/Transforms/InstCombine/narrow-switch.ll
+++ b/test/Transforms/InstCombine/narrow-switch.ll
@@ -3,9 +3,6 @@
 ; RUN: opt < %s -instcombine -S -data-layout=n32    | FileCheck %s --check-prefix=ALL --check-prefix=CHECK32
 ; RUN: opt < %s -instcombine -S -data-layout=n32:64 | FileCheck %s --check-prefix=ALL --check-prefix=CHECK64
 
-; In all cases, the data-layout is irrelevant. We should shrink as much as possible in InstCombine
-; and allow the backend to expand as much as needed to ensure optimal codegen for any target.
-
 define i32 @positive1(i64 %a) {
 ; ALL-LABEL: @positive1(
 ; ALL:         switch i32
@@ -102,13 +99,19 @@ return:
 ; Make sure to avoid assertion crashes and use the type before
 ; truncation to generate the sub constant expressions that leads
 ; to the recomputed condition.
+; We allow to truncate from i64 to i59 if in 32-bit mode,
+; because both are illegal.
 
 define void @trunc64to59(i64 %a) {
 ; ALL-LABEL: @trunc64to59(
-; ALL:         switch i59
-; ALL-NEXT:    i59 0, label %sw.bb1
-; ALL-NEXT:    i59 18717182647723699, label %sw.bb2
-; ALL-NEXT:    ]
+; ALL-CHECK32:         switch i59
+; ALL-CHECK32-NEXT:    i59 0, label %sw.bb1
+; ALL-CHECK32-NEXT:    i59 18717182647723699, label %sw.bb2
+; ALL-CHECK32-NEXT:    ]
+; ALL-CHECK64:         switch i64
+; ALL-CHECK64-NEXT:    i64 0, label %sw.bb1
+; ALL-CHECK64-NEXT:    i64 18717182647723699, label %sw.bb2
+; ALL-CHECK64-NEXT:    ]
 ;
 entry:
   %tmp0 = and i64 %a, 15
@@ -206,3 +209,54 @@ return:                                           ; preds = %sw.epilog, %sw.bb2,
   ret i32 %rval
 }
 
+; https://llvm.org/bugs/show_bug.cgi?id=29009
+
+@a = global i32 0, align 4
+@njob = global i32 0, align 4
+
+declare i32 @goo()
+
+; Make sure we do not shrink to illegal types (i3 in this case)
+; if original type is legal (i32 in this case)
+
+define void @PR29009() {
+; ALL-LABEL: @PR29009(
+; ALL:         switch i32
+; ALL-NEXT:    i32 0, label
+; ALL-NEXT:    i32 3, label
+; ALL-NEXT:    ]
+;
+  br label %1
+
+; <label>:1:                                      ; preds = %10, %0
+  %2 = load volatile i32, i32* @njob, align 4
+  %3 = icmp ne i32 %2, 0
+  br i1 %3, label %4, label %11
+
+; <label>:4:                                      ; preds = %1
+  %5 = call i32 @goo()
+  %6 = and i32 %5, 7
+  switch i32 %6, label %7 [
+    i32 0, label %8
+    i32 3, label %9
+  ]
+
+; <label>:7:                                      ; preds = %4
+  store i32 6, i32* @a, align 4
+  br label %10
+
+; <label>:8:                                      ; preds = %4
+  store i32 1, i32* @a, align 4
+  br label %10
+
+; <label>:9:                                      ; preds = %4
+  store i32 2, i32* @a, align 4
+  br label %10
+
+; <label>:10:                                     ; preds = %13, %12, %11, %10, %9, %8, %7
+  br label %1
+
+; <label>:11:                                     ; preds = %1
+  ret void
+}
+
-- 
GitLab


From e9e41a0702020bd3b4c5136edac6a7f54f45794b Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Wed, 7 Nov 2018 14:35:36 +0000
Subject: [PATCH 1059/1116] fix typos aggressively; NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346316 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/AMDGPUInline.cpp                  | 2 +-
 lib/Target/X86/X86SelectionDAGInfo.cpp              | 2 +-
 lib/Transforms/InstCombine/InstructionCombining.cpp | 2 +-
 lib/Transforms/Scalar/CallSiteSplitting.cpp         | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/Target/AMDGPU/AMDGPUInline.cpp b/lib/Target/AMDGPU/AMDGPUInline.cpp
index a5f9a85f50d..945c9acd379 100644
--- a/lib/Target/AMDGPU/AMDGPUInline.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInline.cpp
@@ -44,7 +44,7 @@ ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(2200),
               cl::desc("Cost of alloca argument"));
 
 // If the amount of scratch memory to eliminate exceeds our ability to allocate
-// it into registers we gain nothing by agressively inlining functions for that
+// it into registers we gain nothing by aggressively inlining functions for that
 // heuristic.
 static cl::opt<unsigned>
 ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256),
diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp
index a71e5d39595..008a9ec2ba3 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -250,7 +250,7 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
 
     if (Repeats.BytesLeft() > 0 &&
         DAG.getMachineFunction().getFunction().optForMinSize()) {
-      // When agressively optimizing for size, avoid generating the code to
+      // When aggressively optimizing for size, avoid generating the code to
       // handle BytesLeft.
       Repeats.AVT = MVT::i8;
     }
diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
index 3e72b1da779..a3962a04b50 100644
--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -2512,7 +2512,7 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) {
   // Shrink the condition operand if the new type is smaller than the old type.
   // But do not shrink to a non-standard type, because backend can't generate 
   // good code for that yet.
-  // TODO: We can make it agressive again after fixing PR39569.
+  // TODO: We can make it aggressive again after fixing PR39569.
   if (NewWidth > 0 && NewWidth < Known.getBitWidth() &&
       shouldChangeType(Known.getBitWidth(), NewWidth)) {
     IntegerType *Ty = IntegerType::get(SI.getContext(), NewWidth);
diff --git a/lib/Transforms/Scalar/CallSiteSplitting.cpp b/lib/Transforms/Scalar/CallSiteSplitting.cpp
index bac6ef99f03..b9e8e3424cc 100644
--- a/lib/Transforms/Scalar/CallSiteSplitting.cpp
+++ b/lib/Transforms/Scalar/CallSiteSplitting.cpp
@@ -197,7 +197,7 @@ static bool canSplitCallSite(CallSite CS, TargetTransformInfo &TTI) {
       isa<IndirectBrInst>(Preds[1]->getTerminator()))
     return false;
 
-  // BasicBlock::canSplitPredecessors is more agressive, so checking for
+  // BasicBlock::canSplitPredecessors is more aggressive, so checking for
   // BasicBlock::isEHPad as well.
   if (!CallSiteBB->canSplitPredecessors() || CallSiteBB->isEHPad())
     return false;
-- 
GitLab


From a9c6bcdb5375c719246414fa5420c3c2f75be1cb Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Wed, 7 Nov 2018 14:44:09 +0000
Subject: [PATCH 1060/1116] [InstCombine] add FMF to fcmp to show failure to
 propagate; NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346317 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Transforms/InstCombine/fcmp.ll | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/test/Transforms/InstCombine/fcmp.ll b/test/Transforms/InstCombine/fcmp.ll
index 919659e45f4..dc91ee9552b 100644
--- a/test/Transforms/InstCombine/fcmp.ll
+++ b/test/Transforms/InstCombine/fcmp.ll
@@ -193,7 +193,7 @@ define <2 x i1> @fabs_ole(<2 x float> %a) {
 ; CHECK-NEXT:    ret <2 x i1> [[CMP]]
 ;
   %call = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
-  %cmp = fcmp ole <2 x float> %call, zeroinitializer
+  %cmp = fcmp ninf ole <2 x float> %call, zeroinitializer
   ret <2 x i1> %cmp
 }
 
@@ -203,7 +203,7 @@ define i1 @fabs_ogt(double %a) {
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %call = call double @llvm.fabs.f64(double %a)
-  %cmp = fcmp ogt double %call, 0.0
+  %cmp = fcmp reassoc ogt double %call, 0.0
   ret i1 %cmp
 }
 
@@ -213,7 +213,7 @@ define i1 @fabs_oge(double %a) {
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %call = call double @llvm.fabs.f64(double %a)
-  %cmp = fcmp oge double %call, 0.0
+  %cmp = fcmp afn oge double %call, 0.0
   ret i1 %cmp
 }
 
@@ -223,7 +223,7 @@ define i1 @fabs_une(half %a) {
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %call = call half @llvm.fabs.f16(half %a)
-  %cmp = fcmp une half %call, 0.0
+  %cmp = fcmp ninf une half %call, 0.0
   ret i1 %cmp
 }
 
@@ -233,7 +233,7 @@ define i1 @fabs_oeq(double %a) {
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %call = call double @llvm.fabs.f64(double %a)
-  %cmp = fcmp oeq double %call, 0.0
+  %cmp = fcmp ninf reassoc oeq double %call, 0.0
   ret i1 %cmp
 }
 
@@ -243,7 +243,7 @@ define i1 @fabs_one(double %a) {
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %call = call double @llvm.fabs.f64(double %a)
-  %cmp = fcmp one double %call, 0.0
+  %cmp = fcmp fast one double %call, 0.0
   ret i1 %cmp
 }
 
@@ -253,7 +253,7 @@ define <2 x i1> @fabs_ueq(<2 x float> %a) {
 ; CHECK-NEXT:    ret <2 x i1> [[CMP]]
 ;
   %call = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
-  %cmp = fcmp ueq <2 x float> %call, zeroinitializer
+  %cmp = fcmp arcp ueq <2 x float> %call, zeroinitializer
   ret <2 x i1> %cmp
 }
 
-- 
GitLab


From 5cd01d9130d2d42b192dfefa5e49586219739181 Mon Sep 17 00:00:00 2001
From: Calixte Denizet <cdenizet@mozilla.com>
Date: Wed, 7 Nov 2018 14:46:26 +0000
Subject: [PATCH 1061/1116] Fix unit tests after patch
 https://reviews.llvm.org/rL346313

Summary: Tests are broken so fix them.

Reviewers: marco-c

Reviewed By: marco-c

Subscribers: sylvestre.ledru, llvm-commits

Differential Revision: https://reviews.llvm.org/D54208

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346318 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Analysis/TargetLibraryInfo.cpp           |   6 +-
 unittests/Analysis/TargetLibraryInfoTest.cpp | 939 ++++++++++---------
 2 files changed, 475 insertions(+), 470 deletions(-)

diff --git a/lib/Analysis/TargetLibraryInfo.cpp b/lib/Analysis/TargetLibraryInfo.cpp
index e6c6b4a76d6..4643f75da42 100644
--- a/lib/Analysis/TargetLibraryInfo.cpp
+++ b/lib/Analysis/TargetLibraryInfo.cpp
@@ -615,13 +615,9 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
   switch (F) {
   case LibFunc_execl:
   case LibFunc_execlp:
-    return (NumParams >= 2 && FTy.getParamType(0)->isPointerTy() &&
-            FTy.getParamType(1)->isPointerTy() &&
-            FTy.getReturnType()->isIntegerTy(32));
   case LibFunc_execle:
-    return (NumParams >= 3 && FTy.getParamType(0)->isPointerTy() &&
+    return (NumParams >= 2 && FTy.getParamType(0)->isPointerTy() &&
             FTy.getParamType(1)->isPointerTy() &&
-            FTy.getParamType(NumParams - 1)->isPointerTy() &&
             FTy.getReturnType()->isIntegerTy(32));
   case LibFunc_execv:
   case LibFunc_execvp:
diff --git a/unittests/Analysis/TargetLibraryInfoTest.cpp b/unittests/Analysis/TargetLibraryInfoTest.cpp
index ec0f89a7e50..482d9d8d7c0 100644
--- a/unittests/Analysis/TargetLibraryInfoTest.cpp
+++ b/unittests/Analysis/TargetLibraryInfoTest.cpp
@@ -76,471 +76,480 @@ TEST_F(TargetLibraryInfoTest, InvalidProto) {
 // Check that we do accept know-correct prototypes.
 TEST_F(TargetLibraryInfoTest, ValidProto) {
   parseAssembly(
-    // These functions use a 64-bit size_t; use the appropriate datalayout.
-    "target datalayout = \"p:64:64:64\"\n"
-
-    // Struct pointers are replaced with an opaque pointer.
-    "%struct = type opaque\n"
-
-    // These functions were extracted as-is from the OS X headers.
-    "declare double @__cospi(double)\n"
-    "declare float @__cospif(float)\n"
-    "declare { double, double } @__sincospi_stret(double)\n"
-    "declare <2 x float> @__sincospif_stret(float)\n"
-    "declare double @__sinpi(double)\n"
-    "declare float @__sinpif(float)\n"
-    "declare i32 @abs(i32)\n"
-    "declare i32 @access(i8*, i32)\n"
-    "declare double @acos(double)\n"
-    "declare float @acosf(float)\n"
-    "declare double @acosh(double)\n"
-    "declare float @acoshf(float)\n"
-    "declare x86_fp80 @acoshl(x86_fp80)\n"
-    "declare x86_fp80 @acosl(x86_fp80)\n"
-    "declare double @asin(double)\n"
-    "declare float @asinf(float)\n"
-    "declare double @asinh(double)\n"
-    "declare float @asinhf(float)\n"
-    "declare x86_fp80 @asinhl(x86_fp80)\n"
-    "declare x86_fp80 @asinl(x86_fp80)\n"
-    "declare double @atan(double)\n"
-    "declare double @atan2(double, double)\n"
-    "declare float @atan2f(float, float)\n"
-    "declare x86_fp80 @atan2l(x86_fp80, x86_fp80)\n"
-    "declare float @atanf(float)\n"
-    "declare double @atanh(double)\n"
-    "declare float @atanhf(float)\n"
-    "declare x86_fp80 @atanhl(x86_fp80)\n"
-    "declare x86_fp80 @atanl(x86_fp80)\n"
-    "declare double @atof(i8*)\n"
-    "declare i32 @atoi(i8*)\n"
-    "declare i64 @atol(i8*)\n"
-    "declare i64 @atoll(i8*)\n"
-    "declare i32 @bcmp(i8*, i8*, i64)\n"
-    "declare void @bcopy(i8*, i8*, i64)\n"
-    "declare void @bzero(i8*, i64)\n"
-    "declare i8* @calloc(i64, i64)\n"
-    "declare double @cbrt(double)\n"
-    "declare float @cbrtf(float)\n"
-    "declare x86_fp80 @cbrtl(x86_fp80)\n"
-    "declare double @ceil(double)\n"
-    "declare float @ceilf(float)\n"
-    "declare x86_fp80 @ceill(x86_fp80)\n"
-    "declare i32 @chown(i8*, i32, i32)\n"
-    "declare void @clearerr(%struct*)\n"
-    "declare double @copysign(double, double)\n"
-    "declare float @copysignf(float, float)\n"
-    "declare x86_fp80 @copysignl(x86_fp80, x86_fp80)\n"
-    "declare double @cabs([2 x double])\n"
-    "declare float @cabsf([2 x float])\n"
-    "declare x86_fp80 @cabsl([2 x x86_fp80])\n"
-    "declare double @cos(double)\n"
-    "declare float @cosf(float)\n"
-    "declare double @cosh(double)\n"
-    "declare float @coshf(float)\n"
-    "declare x86_fp80 @coshl(x86_fp80)\n"
-    "declare x86_fp80 @cosl(x86_fp80)\n"
-    "declare i8* @ctermid(i8*)\n"
-    "declare double @exp(double)\n"
-    "declare double @exp2(double)\n"
-    "declare float @exp2f(float)\n"
-    "declare x86_fp80 @exp2l(x86_fp80)\n"
-    "declare float @expf(float)\n"
-    "declare x86_fp80 @expl(x86_fp80)\n"
-    "declare double @expm1(double)\n"
-    "declare float @expm1f(float)\n"
-    "declare x86_fp80 @expm1l(x86_fp80)\n"
-    "declare double @fabs(double)\n"
-    "declare float @fabsf(float)\n"
-    "declare x86_fp80 @fabsl(x86_fp80)\n"
-    "declare i32 @fclose(%struct*)\n"
-    "declare i32 @feof(%struct*)\n"
-    "declare i32 @ferror(%struct*)\n"
-    "declare i32 @fflush(%struct*)\n"
-    "declare i32 @ffs(i32)\n"
-    "declare i32 @ffsl(i64)\n"
-    "declare i32 @ffsll(i64)\n"
-    "declare i32 @fgetc(%struct*)\n"
-    "declare i32 @fgetc_unlocked(%struct*)\n"
-    "declare i32 @fgetpos(%struct*, i64*)\n"
-    "declare i8* @fgets(i8*, i32, %struct*)\n"
-    "declare i8* @fgets_unlocked(i8*, i32, %struct*)\n"
-    "declare i32 @fileno(%struct*)\n"
-    "declare void @flockfile(%struct*)\n"
-    "declare double @floor(double)\n"
-    "declare float @floorf(float)\n"
-    "declare x86_fp80 @floorl(x86_fp80)\n"
-    "declare i32 @fls(i32)\n"
-    "declare i32 @flsl(i64)\n"
-    "declare i32 @flsll(i64)\n"
-    "declare double @fmax(double, double)\n"
-    "declare float @fmaxf(float, float)\n"
-    "declare x86_fp80 @fmaxl(x86_fp80, x86_fp80)\n"
-    "declare double @fmin(double, double)\n"
-    "declare float @fminf(float, float)\n"
-    "declare x86_fp80 @fminl(x86_fp80, x86_fp80)\n"
-    "declare double @fmod(double, double)\n"
-    "declare float @fmodf(float, float)\n"
-    "declare x86_fp80 @fmodl(x86_fp80, x86_fp80)\n"
-    "declare i32 @fprintf(%struct*, i8*, ...)\n"
-    "declare i32 @fputc(i32, %struct*)\n"
-    "declare i32 @fputc_unlocked(i32, %struct*)\n"
-    "declare i64 @fread(i8*, i64, i64, %struct*)\n"
-    "declare i64 @fread_unlocked(i8*, i64, i64, %struct*)\n"
-    "declare void @free(i8*)\n"
-    "declare double @frexp(double, i32*)\n"
-    "declare float @frexpf(float, i32*)\n"
-    "declare x86_fp80 @frexpl(x86_fp80, i32*)\n"
-    "declare i32 @fscanf(%struct*, i8*, ...)\n"
-    "declare i32 @fseek(%struct*, i64, i32)\n"
-    "declare i32 @fseeko(%struct*, i64, i32)\n"
-    "declare i32 @fsetpos(%struct*, i64*)\n"
-    "declare i32 @fstatvfs(i32, %struct*)\n"
-    "declare i64 @ftell(%struct*)\n"
-    "declare i64 @ftello(%struct*)\n"
-    "declare i32 @ftrylockfile(%struct*)\n"
-    "declare void @funlockfile(%struct*)\n"
-    "declare i32 @getc(%struct*)\n"
-    "declare i32 @getc_unlocked(%struct*)\n"
-    "declare i32 @getchar()\n"
-    "declare i32 @getchar_unlocked()\n"
-    "declare i8* @getenv(i8*)\n"
-    "declare i32 @getitimer(i32, %struct*)\n"
-    "declare i32 @getlogin_r(i8*, i64)\n"
-    "declare %struct* @getpwnam(i8*)\n"
-    "declare i8* @gets(i8*)\n"
-    "declare i32 @gettimeofday(%struct*, i8*)\n"
-    "declare i32 @_Z7isasciii(i32)\n"
-    "declare i32 @_Z7isdigiti(i32)\n"
-    "declare i64 @labs(i64)\n"
-    "declare double @ldexp(double, i32)\n"
-    "declare float @ldexpf(float, i32)\n"
-    "declare x86_fp80 @ldexpl(x86_fp80, i32)\n"
-    "declare i64 @llabs(i64)\n"
-    "declare double @log(double)\n"
-    "declare double @log10(double)\n"
-    "declare float @log10f(float)\n"
-    "declare x86_fp80 @log10l(x86_fp80)\n"
-    "declare double @log1p(double)\n"
-    "declare float @log1pf(float)\n"
-    "declare x86_fp80 @log1pl(x86_fp80)\n"
-    "declare double @log2(double)\n"
-    "declare float @log2f(float)\n"
-    "declare x86_fp80 @log2l(x86_fp80)\n"
-    "declare double @logb(double)\n"
-    "declare float @logbf(float)\n"
-    "declare x86_fp80 @logbl(x86_fp80)\n"
-    "declare float @logf(float)\n"
-    "declare x86_fp80 @logl(x86_fp80)\n"
-    "declare i8* @malloc(i64)\n"
-    "declare i8* @memccpy(i8*, i8*, i32, i64)\n"
-    "declare i8* @memchr(i8*, i32, i64)\n"
-    "declare i32 @memcmp(i8*, i8*, i64)\n"
-    "declare i8* @memcpy(i8*, i8*, i64)\n"
-    "declare i8* @memmove(i8*, i8*, i64)\n"
-    "declare i8* @memset(i8*, i32, i64)\n"
-    "declare void @memset_pattern16(i8*, i8*, i64)\n"
-    "declare i32 @mkdir(i8*, i16)\n"
-    "declare double @modf(double, double*)\n"
-    "declare float @modff(float, float*)\n"
-    "declare x86_fp80 @modfl(x86_fp80, x86_fp80*)\n"
-    "declare double @nearbyint(double)\n"
-    "declare float @nearbyintf(float)\n"
-    "declare x86_fp80 @nearbyintl(x86_fp80)\n"
-    "declare i32 @pclose(%struct*)\n"
-    "declare void @perror(i8*)\n"
-    "declare i32 @posix_memalign(i8**, i64, i64)\n"
-    "declare double @pow(double, double)\n"
-    "declare float @powf(float, float)\n"
-    "declare x86_fp80 @powl(x86_fp80, x86_fp80)\n"
-    "declare i32 @printf(i8*, ...)\n"
-    "declare i32 @putc(i32, %struct*)\n"
-    "declare i32 @putc_unlocked(i32, %struct*)\n"
-    "declare i32 @putchar(i32)\n"
-    "declare i32 @putchar_unlocked(i32)\n"
-    "declare i32 @puts(i8*)\n"
-    "declare void @qsort(i8*, i64, i64, i32 (i8*, i8*)*)\n"
-    "declare i64 @readlink(i8*, i8*, i64)\n"
-    "declare i8* @realloc(i8*, i64)\n"
-    "declare i8* @reallocf(i8*, i64)\n"
-    "declare i32 @remove(i8*)\n"
-    "declare i32 @rename(i8*, i8*)\n"
-    "declare void @rewind(%struct*)\n"
-    "declare double @rint(double)\n"
-    "declare float @rintf(float)\n"
-    "declare x86_fp80 @rintl(x86_fp80)\n"
-    "declare i32 @rmdir(i8*)\n"
-    "declare double @round(double)\n"
-    "declare float @roundf(float)\n"
-    "declare x86_fp80 @roundl(x86_fp80)\n"
-    "declare i32 @scanf(i8*, ...)\n"
-    "declare void @setbuf(%struct*, i8*)\n"
-    "declare i32 @setitimer(i32, %struct*, %struct*)\n"
-    "declare i32 @setvbuf(%struct*, i8*, i32, i64)\n"
-    "declare double @sin(double)\n"
-    "declare float @sinf(float)\n"
-    "declare double @sinh(double)\n"
-    "declare float @sinhf(float)\n"
-    "declare x86_fp80 @sinhl(x86_fp80)\n"
-    "declare x86_fp80 @sinl(x86_fp80)\n"
-    "declare i32 @snprintf(i8*, i64, i8*, ...)\n"
-    "declare i32 @sprintf(i8*, i8*, ...)\n"
-    "declare double @sqrt(double)\n"
-    "declare float @sqrtf(float)\n"
-    "declare x86_fp80 @sqrtl(x86_fp80)\n"
-    "declare i32 @sscanf(i8*, i8*, ...)\n"
-    "declare i32 @statvfs(i8*, %struct*)\n"
-    "declare i8* @stpcpy(i8*, i8*)\n"
-    "declare i8* @stpncpy(i8*, i8*, i64)\n"
-    "declare i32 @strcasecmp(i8*, i8*)\n"
-    "declare i8* @strcat(i8*, i8*)\n"
-    "declare i8* @strchr(i8*, i32)\n"
-    "declare i32 @strcmp(i8*, i8*)\n"
-    "declare i32 @strcoll(i8*, i8*)\n"
-    "declare i8* @strcpy(i8*, i8*)\n"
-    "declare i64 @strcspn(i8*, i8*)\n"
-    "declare i8* @strdup(i8*)\n"
-    "declare i64 @strlen(i8*)\n"
-    "declare i32 @strncasecmp(i8*, i8*, i64)\n"
-    "declare i8* @strncat(i8*, i8*, i64)\n"
-    "declare i32 @strncmp(i8*, i8*, i64)\n"
-    "declare i8* @strncpy(i8*, i8*, i64)\n"
-    "declare i8* @strndup(i8*, i64)\n"
-    "declare i64 @strnlen(i8*, i64)\n"
-    "declare i8* @strpbrk(i8*, i8*)\n"
-    "declare i8* @strrchr(i8*, i32)\n"
-    "declare i64 @strspn(i8*, i8*)\n"
-    "declare i8* @strstr(i8*, i8*)\n"
-    "declare i8* @strtok(i8*, i8*)\n"
-    "declare i8* @strtok_r(i8*, i8*, i8**)\n"
-    "declare i64 @strtol(i8*, i8**, i32)\n"
-    "declare x86_fp80 @strtold(i8*, i8**)\n"
-    "declare i64 @strtoll(i8*, i8**, i32)\n"
-    "declare i64 @strtoul(i8*, i8**, i32)\n"
-    "declare i64 @strtoull(i8*, i8**, i32)\n"
-    "declare i64 @strxfrm(i8*, i8*, i64)\n"
-    "declare double @tan(double)\n"
-    "declare float @tanf(float)\n"
-    "declare double @tanh(double)\n"
-    "declare float @tanhf(float)\n"
-    "declare x86_fp80 @tanhl(x86_fp80)\n"
-    "declare x86_fp80 @tanl(x86_fp80)\n"
-    "declare i64 @times(%struct*)\n"
-    "declare %struct* @tmpfile()\n"
-    "declare i32 @_Z7toasciii(i32)\n"
-    "declare double @trunc(double)\n"
-    "declare float @truncf(float)\n"
-    "declare x86_fp80 @truncl(x86_fp80)\n"
-    "declare i32 @uname(%struct*)\n"
-    "declare i32 @ungetc(i32, %struct*)\n"
-    "declare i32 @unlink(i8*)\n"
-    "declare i32 @utime(i8*, %struct*)\n"
-    "declare i32 @utimes(i8*, %struct*)\n"
-    "declare i8* @valloc(i64)\n"
-    "declare i32 @vfprintf(%struct*, i8*, %struct*)\n"
-    "declare i32 @vfscanf(%struct*, i8*, %struct*)\n"
-    "declare i32 @vprintf(i8*, %struct*)\n"
-    "declare i32 @vscanf(i8*, %struct*)\n"
-    "declare i32 @vsnprintf(i8*, i64, i8*, %struct*)\n"
-    "declare i32 @vsprintf(i8*, i8*, %struct*)\n"
-    "declare i32 @vsscanf(i8*, i8*, %struct*)\n"
-    "declare i64 @wcslen(i32*)\n"
-
-    // These functions were also extracted from the OS X headers, but they are
-    // available with a special name on darwin.
-    // This test uses the default TLI name instead.
-    "declare i32 @chmod(i8*, i16)\n"
-    "declare i32 @closedir(%struct*)\n"
-    "declare %struct* @fdopen(i32, i8*)\n"
-    "declare %struct* @fopen(i8*, i8*)\n"
-    "declare i32 @fputs(i8*, %struct*)\n"
-    "declare i32 @fputs_unlocked(i8*, %struct*)\n"
-    "declare i32 @fstat(i32, %struct*)\n"
-    "declare i64 @fwrite(i8*, i64, i64, %struct*)\n"
-    "declare i64 @fwrite_unlocked(i8*, i64, i64, %struct*)\n"
-    "declare i32 @lchown(i8*, i32, i32)\n"
-    "declare i32 @lstat(i8*, %struct*)\n"
-    "declare i64 @mktime(%struct*)\n"
-    "declare i32 @open(i8*, i32, ...)\n"
-    "declare %struct* @opendir(i8*)\n"
-    "declare %struct* @popen(i8*, i8*)\n"
-    "declare i64 @pread(i32, i8*, i64, i64)\n"
-    "declare i64 @pwrite(i32, i8*, i64, i64)\n"
-    "declare i64 @read(i32, i8*, i64)\n"
-    "declare i8* @realpath(i8*, i8*)\n"
-    "declare i32 @stat(i8*, %struct*)\n"
-    "declare double @strtod(i8*, i8**)\n"
-    "declare float @strtof(i8*, i8**)\n"
-    "declare i32 @system(i8*)\n"
-    "declare i32 @unsetenv(i8*)\n"
-    "declare i64 @write(i32, i8*, i64)\n"
-
-    // These functions are available on Linux but not Darwin; they only differ
-    // from their non-64 counterparts in the struct type.
-    // Use the same prototype as the non-64 variant.
-    "declare %struct* @fopen64(i8*, i8*)\n"
-    "declare i32 @fstat64(i32, %struct*)\n"
-    "declare i32 @fstatvfs64(i32, %struct*)\n"
-    "declare i32 @lstat64(i8*, %struct*)\n"
-    "declare i32 @open64(i8*, i32, ...)\n"
-    "declare i32 @stat64(i8*, %struct*)\n"
-    "declare i32 @statvfs64(i8*, %struct*)\n"
-    "declare %struct* @tmpfile64()\n"
-
-    // These functions are also -64 variants, but do differ in the type of the
-    // off_t (vs off64_t) parameter.  The non-64 variants declared above used
-    // a 64-bit off_t, so, in practice, they are also equivalent.
-    "declare i32 @fseeko64(%struct*, i64, i32)\n"
-    "declare i64 @ftello64(%struct*)\n"
-
-    "declare void @_ZdaPv(i8*)\n"
-    "declare void @_ZdaPvRKSt9nothrow_t(i8*, %struct*)\n"
-    "declare void @_ZdaPvSt11align_val_t(i8*, i64)\n"
-    "declare void @_ZdaPvSt11align_val_tRKSt9nothrow_t(i8*, i64, %struct*)\n"
-    "declare void @_ZdaPvj(i8*, i32)\n"
-    "declare void @_ZdaPvm(i8*, i64)\n"
-    "declare void @_ZdlPv(i8*)\n"
-    "declare void @_ZdlPvRKSt9nothrow_t(i8*, %struct*)\n"
-    "declare void @_ZdlPvSt11align_val_t(i8*, i64)\n"
-    "declare void @_ZdlPvSt11align_val_tRKSt9nothrow_t(i8*, i64, %struct*)\n"
-    "declare void @_ZdlPvj(i8*, i32)\n"
-    "declare void @_ZdlPvm(i8*, i64)\n"
-    "declare i8* @_Znaj(i32)\n"
-    "declare i8* @_ZnajRKSt9nothrow_t(i32, %struct*)\n"
-    "declare i8* @_ZnajSt11align_val_t(i32, i32)\n"
-    "declare i8* @_ZnajSt11align_val_tRKSt9nothrow_t(i32, i32, %struct*)\n"
-    "declare i8* @_Znam(i64)\n"
-    "declare i8* @_ZnamRKSt9nothrow_t(i64, %struct*)\n"
-    "declare i8* @_ZnamSt11align_val_t(i64, i64)\n"
-    "declare i8* @_ZnamSt11align_val_tRKSt9nothrow_t(i64, i64, %struct*)\n"
-    "declare i8* @_Znwj(i32)\n"
-    "declare i8* @_ZnwjRKSt9nothrow_t(i32, %struct*)\n"
-    "declare i8* @_ZnwjSt11align_val_t(i32, i32)\n"
-    "declare i8* @_ZnwjSt11align_val_tRKSt9nothrow_t(i32, i32, %struct*)\n"
-    "declare i8* @_Znwm(i64)\n"
-    "declare i8* @_ZnwmRKSt9nothrow_t(i64, %struct*)\n"
-    "declare i8* @_ZnwmSt11align_val_t(i64, i64)\n"
-    "declare i8* @_ZnwmSt11align_val_tRKSt9nothrow_t(i64, i64, %struct*)\n"
-
-    "declare void @\"??3@YAXPEAX@Z\"(i8*)\n"
-    "declare void @\"??3@YAXPEAXAEBUnothrow_t@std@@@Z\"(i8*, %struct*)\n"
-    "declare void @\"??3@YAXPEAX_K@Z\"(i8*, i64)\n"
-    "declare void @\"??_V@YAXPEAX@Z\"(i8*)\n"
-    "declare void @\"??_V@YAXPEAXAEBUnothrow_t@std@@@Z\"(i8*, %struct*)\n"
-    "declare void @\"??_V@YAXPEAX_K@Z\"(i8*, i64)\n"
-    "declare i8* @\"??2@YAPAXI@Z\"(i32)\n"
-    "declare i8* @\"??2@YAPAXIABUnothrow_t@std@@@Z\"(i32, %struct*)\n"
-    "declare i8* @\"??2@YAPEAX_K@Z\"(i64)\n"
-    "declare i8* @\"??2@YAPEAX_KAEBUnothrow_t@std@@@Z\"(i64, %struct*)\n"
-    "declare i8* @\"??_U@YAPAXI@Z\"(i32)\n"
-    "declare i8* @\"??_U@YAPAXIABUnothrow_t@std@@@Z\"(i32, %struct*)\n"
-    "declare i8* @\"??_U@YAPEAX_K@Z\"(i64)\n"
-    "declare i8* @\"??_U@YAPEAX_KAEBUnothrow_t@std@@@Z\"(i64, %struct*)\n"
-
-    "declare void @\"??3@YAXPAX@Z\"(i8*)\n"
-    "declare void @\"??3@YAXPAXABUnothrow_t@std@@@Z\"(i8*, %struct*)\n"
-    "declare void @\"??3@YAXPAXI@Z\"(i8*, i32)\n"
-    "declare void @\"??_V@YAXPAX@Z\"(i8*)\n"
-    "declare void @\"??_V@YAXPAXABUnothrow_t@std@@@Z\"(i8*, %struct*)\n"
-    "declare void @\"??_V@YAXPAXI@Z\"(i8*, i32)\n"
-
-    // These other functions were derived from the .def C declaration.
-    "declare i32 @__cxa_atexit(void (i8*)*, i8*, i8*)\n"
-    "declare void @__cxa_guard_abort(%struct*)\n"
-    "declare i32 @__cxa_guard_acquire(%struct*)\n"
-    "declare void @__cxa_guard_release(%struct*)\n"
-
-    "declare i32 @__nvvm_reflect(i8*)\n"
-
-    "declare i8* @__memcpy_chk(i8*, i8*, i64, i64)\n"
-    "declare i8* @__memmove_chk(i8*, i8*, i64, i64)\n"
-    "declare i8* @__memset_chk(i8*, i32, i64, i64)\n"
-    "declare i8* @__stpcpy_chk(i8*, i8*, i64)\n"
-    "declare i8* @__stpncpy_chk(i8*, i8*, i64, i64)\n"
-    "declare i8* @__strcpy_chk(i8*, i8*, i64)\n"
-    "declare i8* @__strncpy_chk(i8*, i8*, i64, i64)\n"
-
-    "declare i8* @memalign(i64, i64)\n"
-    "declare i8* @mempcpy(i8*, i8*, i64)\n"
-    "declare i8* @memrchr(i8*, i32, i64)\n"
-
-    // These are similar to the FILE* fgetc/fputc.
-    "declare i32 @_IO_getc(%struct*)\n"
-    "declare i32 @_IO_putc(i32, %struct*)\n"
-
-    "declare i32 @__isoc99_scanf(i8*, ...)\n"
-    "declare i32 @__isoc99_sscanf(i8*, i8*, ...)\n"
-    "declare i8* @__strdup(i8*)\n"
-    "declare i8* @__strndup(i8*, i64)\n"
-    "declare i8* @__strtok_r(i8*, i8*, i8**)\n"
-
-    "declare double @__sqrt_finite(double)\n"
-    "declare float @__sqrtf_finite(float)\n"
-    "declare x86_fp80 @__sqrtl_finite(x86_fp80)\n"
-    "declare double @exp10(double)\n"
-    "declare float @exp10f(float)\n"
-    "declare x86_fp80 @exp10l(x86_fp80)\n"
-
-    // These printf variants have the same prototype as the non-'i' versions.
-    "declare i32 @fiprintf(%struct*, i8*, ...)\n"
-    "declare i32 @iprintf(i8*, ...)\n"
-    "declare i32 @siprintf(i8*, i8*, ...)\n"
-
-    "declare i32 @htonl(i32)\n"
-    "declare i16 @htons(i16)\n"
-    "declare i32 @ntohl(i32)\n"
-    "declare i16 @ntohs(i16)\n"
-
-    "declare i32 @isascii(i32)\n"
-    "declare i32 @isdigit(i32)\n"
-    "declare i32 @toascii(i32)\n"
-
-    // These functions were extracted from math-finite.h which provides
-    // functions similar to those in math.h, but optimized for handling
-    // finite values only.
-    "declare double @__acos_finite(double)\n"
-    "declare float @__acosf_finite(float)\n"
-    "declare x86_fp80 @__acosl_finite(x86_fp80)\n"
-    "declare double @__acosh_finite(double)\n"
-    "declare float @__acoshf_finite(float)\n"
-    "declare x86_fp80 @__acoshl_finite(x86_fp80)\n"
-    "declare double @__asin_finite(double)\n"
-    "declare float @__asinf_finite(float)\n"
-    "declare x86_fp80 @__asinl_finite(x86_fp80)\n"
-    "declare double @__atan2_finite(double, double)\n"
-    "declare float @__atan2f_finite(float, float)\n"
-    "declare x86_fp80 @__atan2l_finite(x86_fp80, x86_fp80)\n"
-    "declare double @__atanh_finite(double)\n"
-    "declare float @__atanhf_finite(float)\n"
-    "declare x86_fp80 @__atanhl_finite(x86_fp80)\n"
-    "declare double @__cosh_finite(double)\n"
-    "declare float @__coshf_finite(float)\n"
-    "declare x86_fp80 @__coshl_finite(x86_fp80)\n"
-    "declare double @__exp10_finite(double)\n"
-    "declare float @__exp10f_finite(float)\n"
-    "declare x86_fp80 @__exp10l_finite(x86_fp80)\n"
-    "declare double @__exp2_finite(double)\n"
-    "declare float @__exp2f_finite(float)\n"
-    "declare x86_fp80 @__exp2l_finite(x86_fp80)\n"
-    "declare double @__exp_finite(double)\n"
-    "declare float @__expf_finite(float)\n"
-    "declare x86_fp80 @__expl_finite(x86_fp80)\n"     
-    "declare double @__log10_finite(double)\n"
-    "declare float @__log10f_finite(float)\n"
-    "declare x86_fp80 @__log10l_finite(x86_fp80)\n"
-    "declare double @__log2_finite(double)\n"
-    "declare float @__log2f_finite(float)\n"
-    "declare x86_fp80 @__log2l_finite(x86_fp80)\n"
-    "declare double @__log_finite(double)\n"
-    "declare float @__logf_finite(float)\n"
-    "declare x86_fp80 @__logl_finite(x86_fp80)\n"
-    "declare double @__pow_finite(double, double)\n"
-    "declare float @__powf_finite(float, float)\n"
-    "declare x86_fp80 @__powl_finite(x86_fp80, x86_fp80)\n"
-    "declare double @__sinh_finite(double)\n"
-    "declare float @__sinhf_finite(float)\n"
-    "declare x86_fp80 @__sinhl_finite(x86_fp80)\n"
-    );
+      // These functions use a 64-bit size_t; use the appropriate datalayout.
+      "target datalayout = \"p:64:64:64\"\n"
+
+      // Struct pointers are replaced with an opaque pointer.
+      "%struct = type opaque\n"
+
+      // These functions were extracted as-is from the OS X headers.
+      "declare double @__cospi(double)\n"
+      "declare float @__cospif(float)\n"
+      "declare { double, double } @__sincospi_stret(double)\n"
+      "declare <2 x float> @__sincospif_stret(float)\n"
+      "declare double @__sinpi(double)\n"
+      "declare float @__sinpif(float)\n"
+      "declare i32 @abs(i32)\n"
+      "declare i32 @access(i8*, i32)\n"
+      "declare double @acos(double)\n"
+      "declare float @acosf(float)\n"
+      "declare double @acosh(double)\n"
+      "declare float @acoshf(float)\n"
+      "declare x86_fp80 @acoshl(x86_fp80)\n"
+      "declare x86_fp80 @acosl(x86_fp80)\n"
+      "declare double @asin(double)\n"
+      "declare float @asinf(float)\n"
+      "declare double @asinh(double)\n"
+      "declare float @asinhf(float)\n"
+      "declare x86_fp80 @asinhl(x86_fp80)\n"
+      "declare x86_fp80 @asinl(x86_fp80)\n"
+      "declare double @atan(double)\n"
+      "declare double @atan2(double, double)\n"
+      "declare float @atan2f(float, float)\n"
+      "declare x86_fp80 @atan2l(x86_fp80, x86_fp80)\n"
+      "declare float @atanf(float)\n"
+      "declare double @atanh(double)\n"
+      "declare float @atanhf(float)\n"
+      "declare x86_fp80 @atanhl(x86_fp80)\n"
+      "declare x86_fp80 @atanl(x86_fp80)\n"
+      "declare double @atof(i8*)\n"
+      "declare i32 @atoi(i8*)\n"
+      "declare i64 @atol(i8*)\n"
+      "declare i64 @atoll(i8*)\n"
+      "declare i32 @bcmp(i8*, i8*, i64)\n"
+      "declare void @bcopy(i8*, i8*, i64)\n"
+      "declare void @bzero(i8*, i64)\n"
+      "declare i8* @calloc(i64, i64)\n"
+      "declare double @cbrt(double)\n"
+      "declare float @cbrtf(float)\n"
+      "declare x86_fp80 @cbrtl(x86_fp80)\n"
+      "declare double @ceil(double)\n"
+      "declare float @ceilf(float)\n"
+      "declare x86_fp80 @ceill(x86_fp80)\n"
+      "declare i32 @chown(i8*, i32, i32)\n"
+      "declare void @clearerr(%struct*)\n"
+      "declare double @copysign(double, double)\n"
+      "declare float @copysignf(float, float)\n"
+      "declare x86_fp80 @copysignl(x86_fp80, x86_fp80)\n"
+      "declare double @cabs([2 x double])\n"
+      "declare float @cabsf([2 x float])\n"
+      "declare x86_fp80 @cabsl([2 x x86_fp80])\n"
+      "declare double @cos(double)\n"
+      "declare float @cosf(float)\n"
+      "declare double @cosh(double)\n"
+      "declare float @coshf(float)\n"
+      "declare x86_fp80 @coshl(x86_fp80)\n"
+      "declare x86_fp80 @cosl(x86_fp80)\n"
+      "declare i8* @ctermid(i8*)\n"
+      "declare double @exp(double)\n"
+      "declare double @exp2(double)\n"
+      "declare float @exp2f(float)\n"
+      "declare x86_fp80 @exp2l(x86_fp80)\n"
+      "declare float @expf(float)\n"
+      "declare x86_fp80 @expl(x86_fp80)\n"
+      "declare double @expm1(double)\n"
+      "declare float @expm1f(float)\n"
+      "declare x86_fp80 @expm1l(x86_fp80)\n"
+      "declare double @fabs(double)\n"
+      "declare float @fabsf(float)\n"
+      "declare x86_fp80 @fabsl(x86_fp80)\n"
+      "declare i32 @fclose(%struct*)\n"
+      "declare i32 @feof(%struct*)\n"
+      "declare i32 @ferror(%struct*)\n"
+      "declare i32 @fflush(%struct*)\n"
+      "declare i32 @ffs(i32)\n"
+      "declare i32 @ffsl(i64)\n"
+      "declare i32 @ffsll(i64)\n"
+      "declare i32 @fgetc(%struct*)\n"
+      "declare i32 @fgetc_unlocked(%struct*)\n"
+      "declare i32 @fgetpos(%struct*, i64*)\n"
+      "declare i8* @fgets(i8*, i32, %struct*)\n"
+      "declare i8* @fgets_unlocked(i8*, i32, %struct*)\n"
+      "declare i32 @fileno(%struct*)\n"
+      "declare void @flockfile(%struct*)\n"
+      "declare double @floor(double)\n"
+      "declare float @floorf(float)\n"
+      "declare x86_fp80 @floorl(x86_fp80)\n"
+      "declare i32 @fls(i32)\n"
+      "declare i32 @flsl(i64)\n"
+      "declare i32 @flsll(i64)\n"
+      "declare double @fmax(double, double)\n"
+      "declare float @fmaxf(float, float)\n"
+      "declare x86_fp80 @fmaxl(x86_fp80, x86_fp80)\n"
+      "declare double @fmin(double, double)\n"
+      "declare float @fminf(float, float)\n"
+      "declare x86_fp80 @fminl(x86_fp80, x86_fp80)\n"
+      "declare double @fmod(double, double)\n"
+      "declare float @fmodf(float, float)\n"
+      "declare x86_fp80 @fmodl(x86_fp80, x86_fp80)\n"
+      "declare i32 @fprintf(%struct*, i8*, ...)\n"
+      "declare i32 @fputc(i32, %struct*)\n"
+      "declare i32 @fputc_unlocked(i32, %struct*)\n"
+      "declare i64 @fread(i8*, i64, i64, %struct*)\n"
+      "declare i64 @fread_unlocked(i8*, i64, i64, %struct*)\n"
+      "declare void @free(i8*)\n"
+      "declare double @frexp(double, i32*)\n"
+      "declare float @frexpf(float, i32*)\n"
+      "declare x86_fp80 @frexpl(x86_fp80, i32*)\n"
+      "declare i32 @fscanf(%struct*, i8*, ...)\n"
+      "declare i32 @fseek(%struct*, i64, i32)\n"
+      "declare i32 @fseeko(%struct*, i64, i32)\n"
+      "declare i32 @fsetpos(%struct*, i64*)\n"
+      "declare i32 @fstatvfs(i32, %struct*)\n"
+      "declare i64 @ftell(%struct*)\n"
+      "declare i64 @ftello(%struct*)\n"
+      "declare i32 @ftrylockfile(%struct*)\n"
+      "declare void @funlockfile(%struct*)\n"
+      "declare i32 @getc(%struct*)\n"
+      "declare i32 @getc_unlocked(%struct*)\n"
+      "declare i32 @getchar()\n"
+      "declare i32 @getchar_unlocked()\n"
+      "declare i8* @getenv(i8*)\n"
+      "declare i32 @getitimer(i32, %struct*)\n"
+      "declare i32 @getlogin_r(i8*, i64)\n"
+      "declare %struct* @getpwnam(i8*)\n"
+      "declare i8* @gets(i8*)\n"
+      "declare i32 @gettimeofday(%struct*, i8*)\n"
+      "declare i32 @_Z7isasciii(i32)\n"
+      "declare i32 @_Z7isdigiti(i32)\n"
+      "declare i64 @labs(i64)\n"
+      "declare double @ldexp(double, i32)\n"
+      "declare float @ldexpf(float, i32)\n"
+      "declare x86_fp80 @ldexpl(x86_fp80, i32)\n"
+      "declare i64 @llabs(i64)\n"
+      "declare double @log(double)\n"
+      "declare double @log10(double)\n"
+      "declare float @log10f(float)\n"
+      "declare x86_fp80 @log10l(x86_fp80)\n"
+      "declare double @log1p(double)\n"
+      "declare float @log1pf(float)\n"
+      "declare x86_fp80 @log1pl(x86_fp80)\n"
+      "declare double @log2(double)\n"
+      "declare float @log2f(float)\n"
+      "declare x86_fp80 @log2l(x86_fp80)\n"
+      "declare double @logb(double)\n"
+      "declare float @logbf(float)\n"
+      "declare x86_fp80 @logbl(x86_fp80)\n"
+      "declare float @logf(float)\n"
+      "declare x86_fp80 @logl(x86_fp80)\n"
+      "declare i8* @malloc(i64)\n"
+      "declare i8* @memccpy(i8*, i8*, i32, i64)\n"
+      "declare i8* @memchr(i8*, i32, i64)\n"
+      "declare i32 @memcmp(i8*, i8*, i64)\n"
+      "declare i8* @memcpy(i8*, i8*, i64)\n"
+      "declare i8* @memmove(i8*, i8*, i64)\n"
+      "declare i8* @memset(i8*, i32, i64)\n"
+      "declare void @memset_pattern16(i8*, i8*, i64)\n"
+      "declare i32 @mkdir(i8*, i16)\n"
+      "declare double @modf(double, double*)\n"
+      "declare float @modff(float, float*)\n"
+      "declare x86_fp80 @modfl(x86_fp80, x86_fp80*)\n"
+      "declare double @nearbyint(double)\n"
+      "declare float @nearbyintf(float)\n"
+      "declare x86_fp80 @nearbyintl(x86_fp80)\n"
+      "declare i32 @pclose(%struct*)\n"
+      "declare void @perror(i8*)\n"
+      "declare i32 @posix_memalign(i8**, i64, i64)\n"
+      "declare double @pow(double, double)\n"
+      "declare float @powf(float, float)\n"
+      "declare x86_fp80 @powl(x86_fp80, x86_fp80)\n"
+      "declare i32 @printf(i8*, ...)\n"
+      "declare i32 @putc(i32, %struct*)\n"
+      "declare i32 @putc_unlocked(i32, %struct*)\n"
+      "declare i32 @putchar(i32)\n"
+      "declare i32 @putchar_unlocked(i32)\n"
+      "declare i32 @puts(i8*)\n"
+      "declare void @qsort(i8*, i64, i64, i32 (i8*, i8*)*)\n"
+      "declare i64 @readlink(i8*, i8*, i64)\n"
+      "declare i8* @realloc(i8*, i64)\n"
+      "declare i8* @reallocf(i8*, i64)\n"
+      "declare i32 @remove(i8*)\n"
+      "declare i32 @rename(i8*, i8*)\n"
+      "declare void @rewind(%struct*)\n"
+      "declare double @rint(double)\n"
+      "declare float @rintf(float)\n"
+      "declare x86_fp80 @rintl(x86_fp80)\n"
+      "declare i32 @rmdir(i8*)\n"
+      "declare double @round(double)\n"
+      "declare float @roundf(float)\n"
+      "declare x86_fp80 @roundl(x86_fp80)\n"
+      "declare i32 @scanf(i8*, ...)\n"
+      "declare void @setbuf(%struct*, i8*)\n"
+      "declare i32 @setitimer(i32, %struct*, %struct*)\n"
+      "declare i32 @setvbuf(%struct*, i8*, i32, i64)\n"
+      "declare double @sin(double)\n"
+      "declare float @sinf(float)\n"
+      "declare double @sinh(double)\n"
+      "declare float @sinhf(float)\n"
+      "declare x86_fp80 @sinhl(x86_fp80)\n"
+      "declare x86_fp80 @sinl(x86_fp80)\n"
+      "declare i32 @snprintf(i8*, i64, i8*, ...)\n"
+      "declare i32 @sprintf(i8*, i8*, ...)\n"
+      "declare double @sqrt(double)\n"
+      "declare float @sqrtf(float)\n"
+      "declare x86_fp80 @sqrtl(x86_fp80)\n"
+      "declare i32 @sscanf(i8*, i8*, ...)\n"
+      "declare i32 @statvfs(i8*, %struct*)\n"
+      "declare i8* @stpcpy(i8*, i8*)\n"
+      "declare i8* @stpncpy(i8*, i8*, i64)\n"
+      "declare i32 @strcasecmp(i8*, i8*)\n"
+      "declare i8* @strcat(i8*, i8*)\n"
+      "declare i8* @strchr(i8*, i32)\n"
+      "declare i32 @strcmp(i8*, i8*)\n"
+      "declare i32 @strcoll(i8*, i8*)\n"
+      "declare i8* @strcpy(i8*, i8*)\n"
+      "declare i64 @strcspn(i8*, i8*)\n"
+      "declare i8* @strdup(i8*)\n"
+      "declare i64 @strlen(i8*)\n"
+      "declare i32 @strncasecmp(i8*, i8*, i64)\n"
+      "declare i8* @strncat(i8*, i8*, i64)\n"
+      "declare i32 @strncmp(i8*, i8*, i64)\n"
+      "declare i8* @strncpy(i8*, i8*, i64)\n"
+      "declare i8* @strndup(i8*, i64)\n"
+      "declare i64 @strnlen(i8*, i64)\n"
+      "declare i8* @strpbrk(i8*, i8*)\n"
+      "declare i8* @strrchr(i8*, i32)\n"
+      "declare i64 @strspn(i8*, i8*)\n"
+      "declare i8* @strstr(i8*, i8*)\n"
+      "declare i8* @strtok(i8*, i8*)\n"
+      "declare i8* @strtok_r(i8*, i8*, i8**)\n"
+      "declare i64 @strtol(i8*, i8**, i32)\n"
+      "declare x86_fp80 @strtold(i8*, i8**)\n"
+      "declare i64 @strtoll(i8*, i8**, i32)\n"
+      "declare i64 @strtoul(i8*, i8**, i32)\n"
+      "declare i64 @strtoull(i8*, i8**, i32)\n"
+      "declare i64 @strxfrm(i8*, i8*, i64)\n"
+      "declare double @tan(double)\n"
+      "declare float @tanf(float)\n"
+      "declare double @tanh(double)\n"
+      "declare float @tanhf(float)\n"
+      "declare x86_fp80 @tanhl(x86_fp80)\n"
+      "declare x86_fp80 @tanl(x86_fp80)\n"
+      "declare i64 @times(%struct*)\n"
+      "declare %struct* @tmpfile()\n"
+      "declare i32 @_Z7toasciii(i32)\n"
+      "declare double @trunc(double)\n"
+      "declare float @truncf(float)\n"
+      "declare x86_fp80 @truncl(x86_fp80)\n"
+      "declare i32 @uname(%struct*)\n"
+      "declare i32 @ungetc(i32, %struct*)\n"
+      "declare i32 @unlink(i8*)\n"
+      "declare i32 @utime(i8*, %struct*)\n"
+      "declare i32 @utimes(i8*, %struct*)\n"
+      "declare i8* @valloc(i64)\n"
+      "declare i32 @vfprintf(%struct*, i8*, %struct*)\n"
+      "declare i32 @vfscanf(%struct*, i8*, %struct*)\n"
+      "declare i32 @vprintf(i8*, %struct*)\n"
+      "declare i32 @vscanf(i8*, %struct*)\n"
+      "declare i32 @vsnprintf(i8*, i64, i8*, %struct*)\n"
+      "declare i32 @vsprintf(i8*, i8*, %struct*)\n"
+      "declare i32 @vsscanf(i8*, i8*, %struct*)\n"
+      "declare i64 @wcslen(i32*)\n"
+      "declare i32 @fork()\n"
+      "declare i32 @execl(i8*, i8*, ...)\n"
+      "declare i32 @execle(i8*, i8*, ...)\n"
+      "declare i32 @execlp(i8*, i8*, ...)\n"
+      "declare i32 @execv(i8*, i8**)\n"
+      "declare i32 @execvP(i8*, i8*, i8**)\n"
+      "declare i32 @execve(i8*, i8**, i8**)\n"
+      "declare i32 @execvp(i8*, i8**)\n"
+      "declare i32 @execvpe(i8*, i8**, i8**)\n"
+
+      // These functions were also extracted from the OS X headers, but they are
+      // available with a special name on darwin.
+      // This test uses the default TLI name instead.
+      "declare i32 @chmod(i8*, i16)\n"
+      "declare i32 @closedir(%struct*)\n"
+      "declare %struct* @fdopen(i32, i8*)\n"
+      "declare %struct* @fopen(i8*, i8*)\n"
+      "declare i32 @fputs(i8*, %struct*)\n"
+      "declare i32 @fputs_unlocked(i8*, %struct*)\n"
+      "declare i32 @fstat(i32, %struct*)\n"
+      "declare i64 @fwrite(i8*, i64, i64, %struct*)\n"
+      "declare i64 @fwrite_unlocked(i8*, i64, i64, %struct*)\n"
+      "declare i32 @lchown(i8*, i32, i32)\n"
+      "declare i32 @lstat(i8*, %struct*)\n"
+      "declare i64 @mktime(%struct*)\n"
+      "declare i32 @open(i8*, i32, ...)\n"
+      "declare %struct* @opendir(i8*)\n"
+      "declare %struct* @popen(i8*, i8*)\n"
+      "declare i64 @pread(i32, i8*, i64, i64)\n"
+      "declare i64 @pwrite(i32, i8*, i64, i64)\n"
+      "declare i64 @read(i32, i8*, i64)\n"
+      "declare i8* @realpath(i8*, i8*)\n"
+      "declare i32 @stat(i8*, %struct*)\n"
+      "declare double @strtod(i8*, i8**)\n"
+      "declare float @strtof(i8*, i8**)\n"
+      "declare i32 @system(i8*)\n"
+      "declare i32 @unsetenv(i8*)\n"
+      "declare i64 @write(i32, i8*, i64)\n"
+
+      // These functions are available on Linux but not Darwin; they only differ
+      // from their non-64 counterparts in the struct type.
+      // Use the same prototype as the non-64 variant.
+      "declare %struct* @fopen64(i8*, i8*)\n"
+      "declare i32 @fstat64(i32, %struct*)\n"
+      "declare i32 @fstatvfs64(i32, %struct*)\n"
+      "declare i32 @lstat64(i8*, %struct*)\n"
+      "declare i32 @open64(i8*, i32, ...)\n"
+      "declare i32 @stat64(i8*, %struct*)\n"
+      "declare i32 @statvfs64(i8*, %struct*)\n"
+      "declare %struct* @tmpfile64()\n"
+
+      // These functions are also -64 variants, but do differ in the type of the
+      // off_t (vs off64_t) parameter.  The non-64 variants declared above used
+      // a 64-bit off_t, so, in practice, they are also equivalent.
+      "declare i32 @fseeko64(%struct*, i64, i32)\n"
+      "declare i64 @ftello64(%struct*)\n"
+
+      "declare void @_ZdaPv(i8*)\n"
+      "declare void @_ZdaPvRKSt9nothrow_t(i8*, %struct*)\n"
+      "declare void @_ZdaPvSt11align_val_t(i8*, i64)\n"
+      "declare void @_ZdaPvSt11align_val_tRKSt9nothrow_t(i8*, i64, %struct*)\n"
+      "declare void @_ZdaPvj(i8*, i32)\n"
+      "declare void @_ZdaPvm(i8*, i64)\n"
+      "declare void @_ZdlPv(i8*)\n"
+      "declare void @_ZdlPvRKSt9nothrow_t(i8*, %struct*)\n"
+      "declare void @_ZdlPvSt11align_val_t(i8*, i64)\n"
+      "declare void @_ZdlPvSt11align_val_tRKSt9nothrow_t(i8*, i64, %struct*)\n"
+      "declare void @_ZdlPvj(i8*, i32)\n"
+      "declare void @_ZdlPvm(i8*, i64)\n"
+      "declare i8* @_Znaj(i32)\n"
+      "declare i8* @_ZnajRKSt9nothrow_t(i32, %struct*)\n"
+      "declare i8* @_ZnajSt11align_val_t(i32, i32)\n"
+      "declare i8* @_ZnajSt11align_val_tRKSt9nothrow_t(i32, i32, %struct*)\n"
+      "declare i8* @_Znam(i64)\n"
+      "declare i8* @_ZnamRKSt9nothrow_t(i64, %struct*)\n"
+      "declare i8* @_ZnamSt11align_val_t(i64, i64)\n"
+      "declare i8* @_ZnamSt11align_val_tRKSt9nothrow_t(i64, i64, %struct*)\n"
+      "declare i8* @_Znwj(i32)\n"
+      "declare i8* @_ZnwjRKSt9nothrow_t(i32, %struct*)\n"
+      "declare i8* @_ZnwjSt11align_val_t(i32, i32)\n"
+      "declare i8* @_ZnwjSt11align_val_tRKSt9nothrow_t(i32, i32, %struct*)\n"
+      "declare i8* @_Znwm(i64)\n"
+      "declare i8* @_ZnwmRKSt9nothrow_t(i64, %struct*)\n"
+      "declare i8* @_ZnwmSt11align_val_t(i64, i64)\n"
+      "declare i8* @_ZnwmSt11align_val_tRKSt9nothrow_t(i64, i64, %struct*)\n"
+
+      "declare void @\"??3@YAXPEAX@Z\"(i8*)\n"
+      "declare void @\"??3@YAXPEAXAEBUnothrow_t@std@@@Z\"(i8*, %struct*)\n"
+      "declare void @\"??3@YAXPEAX_K@Z\"(i8*, i64)\n"
+      "declare void @\"??_V@YAXPEAX@Z\"(i8*)\n"
+      "declare void @\"??_V@YAXPEAXAEBUnothrow_t@std@@@Z\"(i8*, %struct*)\n"
+      "declare void @\"??_V@YAXPEAX_K@Z\"(i8*, i64)\n"
+      "declare i8* @\"??2@YAPAXI@Z\"(i32)\n"
+      "declare i8* @\"??2@YAPAXIABUnothrow_t@std@@@Z\"(i32, %struct*)\n"
+      "declare i8* @\"??2@YAPEAX_K@Z\"(i64)\n"
+      "declare i8* @\"??2@YAPEAX_KAEBUnothrow_t@std@@@Z\"(i64, %struct*)\n"
+      "declare i8* @\"??_U@YAPAXI@Z\"(i32)\n"
+      "declare i8* @\"??_U@YAPAXIABUnothrow_t@std@@@Z\"(i32, %struct*)\n"
+      "declare i8* @\"??_U@YAPEAX_K@Z\"(i64)\n"
+      "declare i8* @\"??_U@YAPEAX_KAEBUnothrow_t@std@@@Z\"(i64, %struct*)\n"
+
+      "declare void @\"??3@YAXPAX@Z\"(i8*)\n"
+      "declare void @\"??3@YAXPAXABUnothrow_t@std@@@Z\"(i8*, %struct*)\n"
+      "declare void @\"??3@YAXPAXI@Z\"(i8*, i32)\n"
+      "declare void @\"??_V@YAXPAX@Z\"(i8*)\n"
+      "declare void @\"??_V@YAXPAXABUnothrow_t@std@@@Z\"(i8*, %struct*)\n"
+      "declare void @\"??_V@YAXPAXI@Z\"(i8*, i32)\n"
+
+      // These other functions were derived from the .def C declaration.
+      "declare i32 @__cxa_atexit(void (i8*)*, i8*, i8*)\n"
+      "declare void @__cxa_guard_abort(%struct*)\n"
+      "declare i32 @__cxa_guard_acquire(%struct*)\n"
+      "declare void @__cxa_guard_release(%struct*)\n"
+
+      "declare i32 @__nvvm_reflect(i8*)\n"
+
+      "declare i8* @__memcpy_chk(i8*, i8*, i64, i64)\n"
+      "declare i8* @__memmove_chk(i8*, i8*, i64, i64)\n"
+      "declare i8* @__memset_chk(i8*, i32, i64, i64)\n"
+      "declare i8* @__stpcpy_chk(i8*, i8*, i64)\n"
+      "declare i8* @__stpncpy_chk(i8*, i8*, i64, i64)\n"
+      "declare i8* @__strcpy_chk(i8*, i8*, i64)\n"
+      "declare i8* @__strncpy_chk(i8*, i8*, i64, i64)\n"
+
+      "declare i8* @memalign(i64, i64)\n"
+      "declare i8* @mempcpy(i8*, i8*, i64)\n"
+      "declare i8* @memrchr(i8*, i32, i64)\n"
+
+      // These are similar to the FILE* fgetc/fputc.
+      "declare i32 @_IO_getc(%struct*)\n"
+      "declare i32 @_IO_putc(i32, %struct*)\n"
+
+      "declare i32 @__isoc99_scanf(i8*, ...)\n"
+      "declare i32 @__isoc99_sscanf(i8*, i8*, ...)\n"
+      "declare i8* @__strdup(i8*)\n"
+      "declare i8* @__strndup(i8*, i64)\n"
+      "declare i8* @__strtok_r(i8*, i8*, i8**)\n"
+
+      "declare double @__sqrt_finite(double)\n"
+      "declare float @__sqrtf_finite(float)\n"
+      "declare x86_fp80 @__sqrtl_finite(x86_fp80)\n"
+      "declare double @exp10(double)\n"
+      "declare float @exp10f(float)\n"
+      "declare x86_fp80 @exp10l(x86_fp80)\n"
+
+      // These printf variants have the same prototype as the non-'i' versions.
+      "declare i32 @fiprintf(%struct*, i8*, ...)\n"
+      "declare i32 @iprintf(i8*, ...)\n"
+      "declare i32 @siprintf(i8*, i8*, ...)\n"
+
+      "declare i32 @htonl(i32)\n"
+      "declare i16 @htons(i16)\n"
+      "declare i32 @ntohl(i32)\n"
+      "declare i16 @ntohs(i16)\n"
+
+      "declare i32 @isascii(i32)\n"
+      "declare i32 @isdigit(i32)\n"
+      "declare i32 @toascii(i32)\n"
+
+      // These functions were extracted from math-finite.h which provides
+      // functions similar to those in math.h, but optimized for handling
+      // finite values only.
+      "declare double @__acos_finite(double)\n"
+      "declare float @__acosf_finite(float)\n"
+      "declare x86_fp80 @__acosl_finite(x86_fp80)\n"
+      "declare double @__acosh_finite(double)\n"
+      "declare float @__acoshf_finite(float)\n"
+      "declare x86_fp80 @__acoshl_finite(x86_fp80)\n"
+      "declare double @__asin_finite(double)\n"
+      "declare float @__asinf_finite(float)\n"
+      "declare x86_fp80 @__asinl_finite(x86_fp80)\n"
+      "declare double @__atan2_finite(double, double)\n"
+      "declare float @__atan2f_finite(float, float)\n"
+      "declare x86_fp80 @__atan2l_finite(x86_fp80, x86_fp80)\n"
+      "declare double @__atanh_finite(double)\n"
+      "declare float @__atanhf_finite(float)\n"
+      "declare x86_fp80 @__atanhl_finite(x86_fp80)\n"
+      "declare double @__cosh_finite(double)\n"
+      "declare float @__coshf_finite(float)\n"
+      "declare x86_fp80 @__coshl_finite(x86_fp80)\n"
+      "declare double @__exp10_finite(double)\n"
+      "declare float @__exp10f_finite(float)\n"
+      "declare x86_fp80 @__exp10l_finite(x86_fp80)\n"
+      "declare double @__exp2_finite(double)\n"
+      "declare float @__exp2f_finite(float)\n"
+      "declare x86_fp80 @__exp2l_finite(x86_fp80)\n"
+      "declare double @__exp_finite(double)\n"
+      "declare float @__expf_finite(float)\n"
+      "declare x86_fp80 @__expl_finite(x86_fp80)\n"
+      "declare double @__log10_finite(double)\n"
+      "declare float @__log10f_finite(float)\n"
+      "declare x86_fp80 @__log10l_finite(x86_fp80)\n"
+      "declare double @__log2_finite(double)\n"
+      "declare float @__log2f_finite(float)\n"
+      "declare x86_fp80 @__log2l_finite(x86_fp80)\n"
+      "declare double @__log_finite(double)\n"
+      "declare float @__logf_finite(float)\n"
+      "declare x86_fp80 @__logl_finite(x86_fp80)\n"
+      "declare double @__pow_finite(double, double)\n"
+      "declare float @__powf_finite(float, float)\n"
+      "declare x86_fp80 @__powl_finite(x86_fp80, x86_fp80)\n"
+      "declare double @__sinh_finite(double)\n"
+      "declare float @__sinhf_finite(float)\n"
+      "declare x86_fp80 @__sinhl_finite(x86_fp80)\n"
+      );
 
   for (unsigned FI = 0; FI != LibFunc::NumLibFuncs; ++FI) {
     LibFunc LF = (LibFunc)FI;
-- 
GitLab


From 0d2cd950387e617d77adedfdcde4ca14a11ba2f4 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Wed, 7 Nov 2018 15:01:09 +0000
Subject: [PATCH 1062/1116] [InstCombine] add test for fcmp+fabs; NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346320 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Transforms/InstCombine/fcmp.ll | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/test/Transforms/InstCombine/fcmp.ll b/test/Transforms/InstCombine/fcmp.ll
index dc91ee9552b..e14a56bbcb2 100644
--- a/test/Transforms/InstCombine/fcmp.ll
+++ b/test/Transforms/InstCombine/fcmp.ll
@@ -217,6 +217,26 @@ define i1 @fabs_oge(double %a) {
   ret i1 %cmp
 }
 
+define i1 @fabs_ult(double %a) {
+; CHECK-LABEL: @fabs_ult(
+; CHECK-NEXT:    [[CALL:%.*]] = call double @llvm.fabs.f64(double [[A:%.*]])
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp reassoc arcp ult double [[CALL]], 0.000000e+00
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = call double @llvm.fabs.f64(double %a)
+  %cmp = fcmp reassoc arcp ult double %call, 0.0
+  ret i1 %cmp
+}
+
+define <2 x i1> @fabs_ult_nnan(<2 x float> %a) {
+; CHECK-LABEL: @fabs_ult_nnan(
+; CHECK-NEXT:    ret <2 x i1> zeroinitializer
+;
+  %call = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
+  %cmp = fcmp nnan reassoc arcp ult <2 x float> %call, zeroinitializer
+  ret <2 x i1> %cmp
+}
+
 define i1 @fabs_une(half %a) {
 ; CHECK-LABEL: @fabs_une(
 ; CHECK-NEXT:    [[CMP:%.*]] = fcmp une half [[A:%.*]], 0xH0000
-- 
GitLab


From b9f324c2611a50adc3e25d1ad7d3b8c9bf8a4b2e Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Wed, 7 Nov 2018 15:11:32 +0000
Subject: [PATCH 1063/1116] [InstCombine] add fold for fabs(X) u< 0.0

The sibling fold for 'oge' --> 'ord' was already here,
but this half was missing.

The result of fabs() must be positive or nan, so asking
if the result is negative or nan is the same as asking
if the result is nan.

This is another step towards fixing:
https://bugs.llvm.org/show_bug.cgi?id=39475


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346321 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/InstCombine/InstCombineCompares.cpp | 5 +++++
 test/Transforms/InstCombine/fcmp.ll                | 3 +--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index c6dbfd92844..d5164222321 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -5311,6 +5311,11 @@ static Instruction *foldFabsWithFcmpZero(FCmpInst &I) {
     assert(!I.hasNoNaNs() && "fcmp should have simplified");
     return new FCmpInst(FCmpInst::FCMP_ORD, X, I.getOperand(1));
 
+  case FCmpInst::FCMP_ULT:
+    // fabs(X) u< 0.0 --> isnan(X)
+    assert(!I.hasNoNaNs() && "fcmp should have simplified");
+    return new FCmpInst(FCmpInst::FCMP_UNO, X, I.getOperand(1));
+
   case FCmpInst::FCMP_OEQ:
   case FCmpInst::FCMP_UEQ:
   case FCmpInst::FCMP_ONE:
diff --git a/test/Transforms/InstCombine/fcmp.ll b/test/Transforms/InstCombine/fcmp.ll
index e14a56bbcb2..c49ed262ab8 100644
--- a/test/Transforms/InstCombine/fcmp.ll
+++ b/test/Transforms/InstCombine/fcmp.ll
@@ -219,8 +219,7 @@ define i1 @fabs_oge(double %a) {
 
 define i1 @fabs_ult(double %a) {
 ; CHECK-LABEL: @fabs_ult(
-; CHECK-NEXT:    [[CALL:%.*]] = call double @llvm.fabs.f64(double [[A:%.*]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp reassoc arcp ult double [[CALL]], 0.000000e+00
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp uno double [[A:%.*]], 0.000000e+00
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %call = call double @llvm.fabs.f64(double %a)
-- 
GitLab


From 3125e35140679e85021058b473589c8a02999018 Mon Sep 17 00:00:00 2001
From: James Y Knight <jyknight@google.com>
Date: Wed, 7 Nov 2018 15:24:12 +0000
Subject: [PATCH 1064/1116] Add support for llvm.is.constant intrinsic (PR4898)

This adds the llvm-side support for post-inlining evaluation of the
__builtin_constant_p GCC intrinsic.

Also fixed SCCPSolver::visitCallSite to not blow up when seeing a call
to a function where canConstantFoldTo returns true, and one of the
arguments is a struct.

Updated from patch initially by Janusz Sobczak.

Differential Revision: https://reviews.llvm.org/D4276

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346322 91177308-0d34-0410-b5e6-96231b3b80d8
---
 docs/LangRef.rst                              |  45 +++++++
 .../llvm/Analysis/TargetTransformInfoImpl.h   |   1 +
 include/llvm/IR/Intrinsics.td                 |   4 +
 lib/Analysis/ConstantFolding.cpp              |  22 ++++
 lib/CodeGen/CodeGenPrepare.cpp                |  43 ++++---
 lib/CodeGen/GlobalISel/IRTranslator.cpp       |   5 +
 lib/CodeGen/SelectionDAG/FastISel.cpp         |   8 ++
 .../SelectionDAG/SelectionDAGBuilder.cpp      |   7 ++
 lib/Transforms/Scalar/SCCP.cpp                |   2 +
 test/CodeGen/Generic/is-constant.ll           | 114 ++++++++++++++++++
 test/CodeGen/X86/is-constant.ll               |  50 ++++++++
 test/Transforms/SCCP/ipsccp-basic.ll          |  13 ++
 12 files changed, 300 insertions(+), 14 deletions(-)
 create mode 100644 test/CodeGen/Generic/is-constant.ll
 create mode 100644 test/CodeGen/X86/is-constant.ll

diff --git a/docs/LangRef.rst b/docs/LangRef.rst
index 0a22ced9850..06e092fb9fc 100644
--- a/docs/LangRef.rst
+++ b/docs/LangRef.rst
@@ -15417,6 +15417,51 @@ Semantics:
 This intrinsic actually does nothing, but optimizers must assume that it
 has externally observable side effects.
 
+'``llvm.is.constant.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use llvm.is.constant with any argument type.
+
+::
+
+      declare i1 @llvm.is.constant.i32(i32 %operand) nounwind readnone
+      declare i1 @llvm.is.constant.f32(float %operand) nounwind readnone
+      declare i1 @llvm.is.constant.TYPENAME(TYPE %operand) nounwind readnone
+
+Overview:
+"""""""""
+
+The '``llvm.is.constant``' intrinsic will return true if the argument
+is known to be a manifest compile-time constant. It is guaranteed to
+fold to either true or false before generating machine code.
+
+Semantics:
+""""""""""
+
+This intrinsic generates no code. If its argument is known to be a
+manifest compile-time constant value, then the intrinsic will be
+converted to a constant true value. Otherwise, it will be converted to
+a constant false value.
+
+In particular, note that if the argument is a constant expression
+which refers to a global (the address of which _is_ a constant, but
+not manifest during the compile), then the intrinsic evaluates to
+false.
+
+The result also intentionally depends on the result of optimization
+passes -- e.g., the result can change depending on whether a
+function gets inlined or not. A function's parameters are
+obviously not constant. However, a call like
+``llvm.is.constant.i32(i32 %param)`` *can* return true after the
+function is inlined, if the value passed to the function parameter was
+a constant.
+
+On the other hand, if constant folding is not run, it will never
+evaluate to true, even in simple cases.
+
 Stack Map Intrinsics
 --------------------
 
diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h
index 05e93099e12..5e79c5cdfe0 100644
--- a/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -160,6 +160,7 @@ public:
     case Intrinsic::invariant_end:
     case Intrinsic::launder_invariant_group:
     case Intrinsic::strip_invariant_group:
+    case Intrinsic::is_constant:
     case Intrinsic::lifetime_start:
     case Intrinsic::lifetime_end:
     case Intrinsic::objectsize:
diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td
index 1c6f81c07db..04de1ca63a2 100644
--- a/include/llvm/IR/Intrinsics.td
+++ b/include/llvm/IR/Intrinsics.td
@@ -900,6 +900,10 @@ def int_convert_from_fp16 : Intrinsic<[llvm_anyfloat_ty], [llvm_i16_ty]>;
 def int_clear_cache : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty],
                                 [], "llvm.clear_cache">;
 
+// Intrinsic to detect whether its argument is a constant.
+def int_is_constant : Intrinsic<[llvm_i1_ty], [llvm_any_ty], [IntrNoMem], "llvm.is.constant">;
+
+
 //===-------------------------- Masked Intrinsics -------------------------===//
 //
 def int_masked_store : Intrinsic<[], [llvm_anyvector_ty,
diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp
index 9ae8f1728c2..92b05559137 100644
--- a/lib/Analysis/ConstantFolding.cpp
+++ b/lib/Analysis/ConstantFolding.cpp
@@ -1426,6 +1426,7 @@ bool llvm::canConstantFoldCallTo(ImmutableCallSite CS, const Function *F) {
   case Intrinsic::x86_avx512_vcvtsd2usi64:
   case Intrinsic::x86_avx512_cvttsd2usi:
   case Intrinsic::x86_avx512_cvttsd2usi64:
+  case Intrinsic::is_constant:
     return true;
   default:
     return false;
@@ -1600,11 +1601,32 @@ double getValueAsDouble(ConstantFP *Op) {
   return APF.convertToDouble();
 }
 
+static bool isManifestConstant(const Constant *c) {
+  if (isa<ConstantData>(c)) {
+    return true;
+  } else if (isa<ConstantAggregate>(c) || isa<ConstantExpr>(c)) {
+    for (const Value *subc : c->operand_values()) {
+      if (!isManifestConstant(cast<Constant>(subc)))
+        return false;
+    }
+    return true;
+  }
+  return false;
+}
+
 Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty,
                                  ArrayRef<Constant *> Operands,
                                  const TargetLibraryInfo *TLI,
                                  ImmutableCallSite CS) {
   if (Operands.size() == 1) {
+    if (IntrinsicID == Intrinsic::is_constant) {
+      // We know we have a "Constant" argument. But we want to only
+      // return true for manifest constants, not those that depend on
+      // constants with unknowable values, e.g. GlobalValue or BlockAddress.
+      if (isManifestConstant(Operands[0]))
+        return ConstantInt::getTrue(Ty->getContext());
+      return nullptr;
+    }
     if (isa<UndefValue>(Operands[0])) {
       // cosine(arg) is between -1 and 1. cosine(invalid arg) is NaN
       if (IntrinsicID == Intrinsic::cos)
diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index 6e73f7d773b..651873bb911 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -321,6 +321,24 @@ class TypePromotionTransaction;
     }
 
   private:
+    template <typename F>
+    void resetIteratorIfInvalidatedWhileCalling(BasicBlock *BB, F f) {
+      // Substituting can cause recursive simplifications, which can invalidate
+      // our iterator.  Use a WeakTrackingVH to hold onto it in case this
+      // happens.
+      Value *CurValue = &*CurInstIterator;
+      WeakTrackingVH IterHandle(CurValue);
+
+      f();
+
+      // If the iterator instruction was recursively deleted, start over at the
+      // start of the block.
+      if (IterHandle != CurValue) {
+        CurInstIterator = BB->begin();
+        SunkAddrs.clear();
+      }
+    }
+
     bool eliminateFallThrough(Function &F);
     bool eliminateMostlyEmptyBlocks(Function &F);
     BasicBlock *findDestBlockOfMergeableEmptyBlock(BasicBlock *BB);
@@ -1690,21 +1708,18 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
       // Lower all uses of llvm.objectsize.*
       ConstantInt *RetVal =
           lowerObjectSizeCall(II, *DL, TLInfo, /*MustSucceed=*/true);
-      // Substituting this can cause recursive simplifications, which can
-      // invalidate our iterator.  Use a WeakTrackingVH to hold onto it in case
-      // this
-      // happens.
-      Value *CurValue = &*CurInstIterator;
-      WeakTrackingVH IterHandle(CurValue);
 
-      replaceAndRecursivelySimplify(CI, RetVal, TLInfo, nullptr);
-
-      // If the iterator instruction was recursively deleted, start over at the
-      // start of the block.
-      if (IterHandle != CurValue) {
-        CurInstIterator = BB->begin();
-        SunkAddrs.clear();
-      }
+      resetIteratorIfInvalidatedWhileCalling(BB, [&]() {
+        replaceAndRecursivelySimplify(CI, RetVal, TLInfo, nullptr);
+      });
+      return true;
+    }
+    case Intrinsic::is_constant: {
+      // If is_constant hasn't folded away yet, lower it to false now.
+      Constant *RetVal = ConstantInt::get(II->getType(), 0);
+      resetIteratorIfInvalidatedWhileCalling(BB, [&]() {
+        replaceAndRecursivelySimplify(CI, RetVal, TLInfo, nullptr);
+      });
       return true;
     }
     case Intrinsic::aarch64_stlxr:
diff --git a/lib/CodeGen/GlobalISel/IRTranslator.cpp b/lib/CodeGen/GlobalISel/IRTranslator.cpp
index aa5022cd397..ef090777726 100644
--- a/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -947,6 +947,11 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
     MIRBuilder.buildConstant(getOrCreateVReg(CI), Min->isZero() ? -1ULL : 0);
     return true;
   }
+  case Intrinsic::is_constant:
+    // If this wasn't constant-folded away by now, then it's not a
+    // constant.
+    MIRBuilder.buildConstant(getOrCreateVReg(CI), 0);
+    return true;
   case Intrinsic::stackguard:
     getStackGuard(getOrCreateVReg(CI), MIRBuilder);
     return true;
diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp
index 035844294f4..d5f066c2423 100644
--- a/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -1450,6 +1450,14 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
     updateValueMap(II, ResultReg);
     return true;
   }
+  case Intrinsic::is_constant: {
+    Constant *ResCI = ConstantInt::get(II->getType(), 0);
+    unsigned ResultReg = getRegForValue(ResCI);
+    if (!ResultReg)
+      return false;
+    updateValueMap(II, ResultReg);
+    return true;
+  }
   case Intrinsic::launder_invariant_group:
   case Intrinsic::strip_invariant_group:
   case Intrinsic::expect: {
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 55ca5eb7c4e..bf24d7f7562 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5897,6 +5897,13 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     setValue(&I, Res);
     return nullptr;
   }
+
+  case Intrinsic::is_constant:
+    // If this wasn't constant-folded away by now, then it's not a
+    // constant.
+    setValue(&I, DAG.getConstant(0, sdl, MVT::i1));
+    return nullptr;
+
   case Intrinsic::annotation:
   case Intrinsic::ptr_annotation:
   case Intrinsic::launder_invariant_group:
diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp
index b7340f294fd..1f98128f923 100644
--- a/lib/Transforms/Scalar/SCCP.cpp
+++ b/lib/Transforms/Scalar/SCCP.cpp
@@ -1230,6 +1230,8 @@ CallOverdefined:
       SmallVector<Constant*, 8> Operands;
       for (CallSite::arg_iterator AI = CS.arg_begin(), E = CS.arg_end();
            AI != E; ++AI) {
+        if (AI->get()->getType()->isStructTy())
+          return markOverdefined(I); // Can't handle struct args.
         LatticeVal State = getValueState(*AI);
 
         if (State.isUnknown())
diff --git a/test/CodeGen/Generic/is-constant.ll b/test/CodeGen/Generic/is-constant.ll
new file mode 100644
index 00000000000..baeedc4c9b0
--- /dev/null
+++ b/test/CodeGen/Generic/is-constant.ll
@@ -0,0 +1,114 @@
+; RUN: opt -O2 -S < %s  | FileCheck %s
+; RUN: llc -o /dev/null 2>&1 < %s
+; RUN: llc -O0 -o /dev/null 2>&1 < %s
+
+;; The llc runs above are just to ensure it doesn't blow up upon
+;; seeing an is_constant intrinsic.
+
+declare i1 @llvm.is.constant.i32(i32 %a)
+declare i1 @llvm.is.constant.i64(i64 %a)
+declare i1 @llvm.is.constant.i256(i256 %a)
+declare i1 @llvm.is.constant.v2i64(<2 x i64> %a)
+declare i1 @llvm.is.constant.f32(float %a)
+declare i1 @llvm.is.constant.sl_i32i32s({i32, i32} %a)
+declare i1 @llvm.is.constant.a2i32([2 x i32] %a)
+declare i1 @llvm.is.constant.p0i64(i64* %a)
+
+;; Basic test that optimization folds away the is.constant when given
+;; a constant.
+define i1 @test_constant() #0 {
+; CHECK-LABEL: @test_constant(
+; CHECK-NOT: llvm.is.constant
+; CHECK: ret i1 true
+%y = call i1 @llvm.is.constant.i32(i32 44)
+  ret i1 %y
+}
+
+;; And test that the intrinsic sticks around when given a
+;; non-constant.
+define i1 @test_nonconstant(i32 %x) #0 {
+; CHECK-LABEL: @test_nonconstant(
+; CHECK: @llvm.is.constant
+  %y = call i1 @llvm.is.constant.i32(i32 %x)
+  ret i1 %y
+}
+
+;; Ensure that nested is.constants fold.
+define i32 @test_nested() #0 {
+; CHECK-LABEL: @test_nested(
+; CHECK-NOT: llvm.is.constant
+; CHECK: ret i32 13
+  %val1 = call i1 @llvm.is.constant.i32(i32 27)
+  %val2 = zext i1 %val1 to i32
+  %val3 = add i32 %val2, 12
+  %1 = call i1 @llvm.is.constant.i32(i32 %val3)
+  %2 = zext i1 %1 to i32
+  %3 = add i32 %2, 12
+  ret i32 %3
+}
+
+@G = global [2 x i64] zeroinitializer
+define i1 @test_global() #0 {
+; CHECK-LABEL: @test_global(
+; CHECK: llvm.is.constant
+  %ret = call i1 @llvm.is.constant.p0i64(i64* getelementptr ([2 x i64], [2 x i64]* @G, i32 0, i32 0))
+  ret i1 %ret
+}
+
+define i1 @test_diff() #0 {
+; CHECK-LABEL: @test_diff(
+  %ret = call i1 @llvm.is.constant.i64(i64 sub (
+      i64 ptrtoint (i64* getelementptr inbounds ([2 x i64], [2 x i64]* @G, i64 0, i64 1) to i64),
+      i64 ptrtoint ([2 x i64]* @G to i64)))
+  ret i1 %ret
+}
+
+define i1 @test_various_types(i256 %int, float %float, <2 x i64> %vec, {i32, i32} %struct, [2 x i32] %arr, i64* %ptr) #0 {
+; CHECK-LABEL: @test_various_types(
+; CHECK: llvm.is.constant
+; CHECK: llvm.is.constant
+; CHECK: llvm.is.constant
+; CHECK: llvm.is.constant
+; CHECK: llvm.is.constant
+; CHECK: llvm.is.constant
+; CHECK-NOT: llvm.is.constant
+  %v1 = call i1 @llvm.is.constant.i256(i256 %int)
+  %v2 = call i1 @llvm.is.constant.f32(float %float)
+  %v3 = call i1 @llvm.is.constant.v2i64(<2 x i64> %vec)
+  %v4 = call i1 @llvm.is.constant.sl_i32i32s({i32, i32} %struct)
+  %v5 = call i1 @llvm.is.constant.a2i32([2 x i32] %arr)
+  %v6 = call i1 @llvm.is.constant.p0i64(i64* %ptr)
+
+  %c1 = call i1 @llvm.is.constant.i256(i256 -1)
+  %c2 = call i1 @llvm.is.constant.f32(float 17.0)
+  %c3 = call i1 @llvm.is.constant.v2i64(<2 x i64> <i64 -1, i64 44>)
+  %c4 = call i1 @llvm.is.constant.sl_i32i32s({i32, i32} {i32 -1, i32 32})
+  %c5 = call i1 @llvm.is.constant.a2i32([2 x i32] [i32 -1, i32 32])
+  %c6 = call i1 @llvm.is.constant.p0i64(i64* inttoptr (i32 42 to i64*))
+
+  %x1 = add i1 %v1, %c1
+  %x2 = add i1 %v2, %c2
+  %x3 = add i1 %v3, %c3
+  %x4 = add i1 %v4, %c4
+  %x5 = add i1 %v5, %c5
+  %x6 = add i1 %v6, %c6
+
+  %res2 = add i1 %x1, %x2
+  %res3 = add i1 %res2, %x3
+  %res4 = add i1 %res3, %x4
+  %res5 = add i1 %res4, %x5
+  %res6 = add i1 %res5, %x6
+
+  ret i1 %res6
+}
+
+define i1 @test_various_types2() #0 {
+; CHECK-LABEL: @test_various_types2(
+; CHECK: ret i1 false
+  %r = call i1 @test_various_types(i256 -1, float 22.0, <2 x i64> <i64 -1, i64 44>,
+                     {i32, i32} {i32 -1, i32 55}, [2 x i32] [i32 -1, i32 55],
+		     i64* inttoptr (i64 42 to i64*))
+  ret i1 %r
+}
+
+attributes #0 = { nounwind uwtable }
diff --git a/test/CodeGen/X86/is-constant.ll b/test/CodeGen/X86/is-constant.ll
new file mode 100644
index 00000000000..d02bbae2085
--- /dev/null
+++ b/test/CodeGen/X86/is-constant.ll
@@ -0,0 +1,50 @@
+; RUN: llc -O2 < %s | FileCheck %s --check-prefix=CHECK-O2 --check-prefix=CHECK
+; RUN: llc -O0 -fast-isel < %s | FileCheck %s --check-prefix=CHECK-O0 --check-prefix=CHECK
+; RUN: llc -O0 -fast-isel=0 < %s | FileCheck %s --check-prefix=CHECK-O0 --check-prefix=CHECK
+; RUN: llc -O0 -global-isel < %s | FileCheck %s --check-prefix=CHECK-O0 --check-prefix=CHECK
+
+;; Ensure that an unfoldable is.constant gets lowered reasonably in
+;; optimized codegen, in particular, that the "true" branch is
+;; eliminated.
+;;
+;; This isn't asserting any specific output from non-optimized runs,
+;; (e.g., currently the not-taken branch does not get eliminated). But
+;; it does ensure that lowering succeeds in all 3 codegen paths.
+
+target triple = "x86_64-unknown-linux-gnu"
+
+declare i1 @llvm.is.constant.i32(i32 %a) nounwind readnone
+declare i1 @llvm.is.constant.i64(i64 %a) nounwind readnone
+declare i64 @llvm.objectsize.i64.p0i8(i8*, i1, i1) nounwind readnone
+
+declare i32 @subfun_1()
+declare i32 @subfun_2()
+
+define i32 @test_branch(i32 %in) nounwind {
+; CHECK-LABEL:    test_branch:
+; CHECK-O2:       %bb.0:
+; CHECK-O2-NEXT:  jmp subfun_2
+  %v = call i1 @llvm.is.constant.i32(i32 %in)
+  br i1 %v, label %True, label %False
+
+True:
+  %call1 = tail call i32 @subfun_1()
+  ret i32 %call1
+
+False:
+  %call2 = tail call i32 @subfun_2()
+  ret i32 %call2
+}
+
+;; llvm.objectsize is another tricky case which gets folded to -1 very
+;; late in the game. We'd like to ensure that llvm.is.constant of
+;; llvm.objectsize is true.
+define i1 @test_objectsize(i8* %obj) nounwind {
+; CHECK-LABEL:    test_objectsize:
+; CHECK-O2:       %bb.0:
+; CHECK-O2:       movb $1, %al
+; CHECK-O2-NEXT:  retq
+  %os = call i64 @llvm.objectsize.i64.p0i8(i8* %obj, i1 false, i1 false)
+  %v = call i1 @llvm.is.constant.i64(i64 %os)
+  ret i1 %v
+}
diff --git a/test/Transforms/SCCP/ipsccp-basic.ll b/test/Transforms/SCCP/ipsccp-basic.ll
index ae08b4823c9..b1660b54565 100644
--- a/test/Transforms/SCCP/ipsccp-basic.ll
+++ b/test/Transforms/SCCP/ipsccp-basic.ll
@@ -258,3 +258,16 @@ define i64 @test11b() {
 }
 
 declare i64 @llvm.ctpop.i64(i64)
+
+;;======================== test12
+;; Ensure that a struct as an arg to a potentially constant-foldable
+;; function does not crash SCCP (for now it'll just ignores it)
+
+define i1 @test12() {
+  %c = call i1 @llvm.is.constant.sl_i32i32s({i32, i32} {i32 -1, i32 32})
+  ret i1 %c
+; CHECK-LABEL: define i1 @test12
+; CHECK: ret i1 %c
+}
+
+declare i1 @llvm.is.constant.sl_i32i32s({i32, i32} %a)
-- 
GitLab


From 8c397800298bca7d37ab0279c20c848f8a435514 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Wed, 7 Nov 2018 15:27:02 +0000
Subject: [PATCH 1065/1116] [InstCombine] add tests for more fcmp+fabs preds;
 NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346323 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Transforms/InstCombine/fcmp.ll | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/test/Transforms/InstCombine/fcmp.ll b/test/Transforms/InstCombine/fcmp.ll
index c49ed262ab8..de93cff5bec 100644
--- a/test/Transforms/InstCombine/fcmp.ll
+++ b/test/Transforms/InstCombine/fcmp.ll
@@ -197,6 +197,17 @@ define <2 x i1> @fabs_ole(<2 x float> %a) {
   ret <2 x i1> %cmp
 }
 
+define <2 x i1> @fabs_ule(<2 x float> %a) {
+; CHECK-LABEL: @fabs_ule(
+; CHECK-NEXT:    [[CALL:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[A:%.*]])
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ninf arcp ule <2 x float> [[CALL]], zeroinitializer
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
+;
+  %call = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
+  %cmp = fcmp ninf arcp ule <2 x float> %call, zeroinitializer
+  ret <2 x i1> %cmp
+}
+
 define i1 @fabs_ogt(double %a) {
 ; CHECK-LABEL: @fabs_ogt(
 ; CHECK-NEXT:    [[CMP:%.*]] = fcmp one double [[A:%.*]], 0.000000e+00
@@ -207,6 +218,17 @@ define i1 @fabs_ogt(double %a) {
   ret i1 %cmp
 }
 
+define i1 @fabs_ugt(double %a) {
+; CHECK-LABEL: @fabs_ugt(
+; CHECK-NEXT:    [[CALL:%.*]] = call double @llvm.fabs.f64(double [[A:%.*]])
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp reassoc ninf ugt double [[CALL]], 0.000000e+00
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = call double @llvm.fabs.f64(double %a)
+  %cmp = fcmp ninf reassoc ugt double %call, 0.0
+  ret i1 %cmp
+}
+
 define i1 @fabs_oge(double %a) {
 ; CHECK-LABEL: @fabs_oge(
 ; CHECK-NEXT:    [[CMP:%.*]] = fcmp ord double [[A:%.*]], 0.000000e+00
-- 
GitLab


From b3939a8dff7440f7a0088f4b4c20ecb1a7e43aaf Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Wed, 7 Nov 2018 15:33:03 +0000
Subject: [PATCH 1066/1116] [InstCombine] add folds for fcmp Pred fabs(X), 0.0

Similar to rL346321, we had folds for the ordered
versions of these compares already, so add the
unordered siblings for completeness.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346324 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/InstCombine/InstCombineCompares.cpp | 8 ++++++++
 test/Transforms/InstCombine/fcmp.ll                | 6 ++----
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index d5164222321..be26d5e8124 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -5302,10 +5302,18 @@ static Instruction *foldFabsWithFcmpZero(FCmpInst &I) {
     // fabs(X) > 0.0 --> X != 0.0
     return new FCmpInst(FCmpInst::FCMP_ONE, X, I.getOperand(1));
 
+  case FCmpInst::FCMP_UGT:
+    // fabs(X) u> 0.0 --> X u!= 0.0
+    return new FCmpInst(FCmpInst::FCMP_UNE, X, I.getOperand(1));
+
   case FCmpInst::FCMP_OLE:
     // fabs(X) <= 0.0 --> X == 0.0
     return new FCmpInst(FCmpInst::FCMP_OEQ, X, I.getOperand(1));
 
+  case FCmpInst::FCMP_ULE:
+    // fabs(X) u<= 0.0 --> X u== 0.0
+    return new FCmpInst(FCmpInst::FCMP_UEQ, X, I.getOperand(1));
+
   case FCmpInst::FCMP_OGE:
     // fabs(X) >= 0.0 --> !isnan(X)
     assert(!I.hasNoNaNs() && "fcmp should have simplified");
diff --git a/test/Transforms/InstCombine/fcmp.ll b/test/Transforms/InstCombine/fcmp.ll
index de93cff5bec..5aea40f0918 100644
--- a/test/Transforms/InstCombine/fcmp.ll
+++ b/test/Transforms/InstCombine/fcmp.ll
@@ -199,8 +199,7 @@ define <2 x i1> @fabs_ole(<2 x float> %a) {
 
 define <2 x i1> @fabs_ule(<2 x float> %a) {
 ; CHECK-LABEL: @fabs_ule(
-; CHECK-NEXT:    [[CALL:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[A:%.*]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ninf arcp ule <2 x float> [[CALL]], zeroinitializer
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ueq <2 x float> [[A:%.*]], zeroinitializer
 ; CHECK-NEXT:    ret <2 x i1> [[CMP]]
 ;
   %call = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
@@ -220,8 +219,7 @@ define i1 @fabs_ogt(double %a) {
 
 define i1 @fabs_ugt(double %a) {
 ; CHECK-LABEL: @fabs_ugt(
-; CHECK-NEXT:    [[CALL:%.*]] = call double @llvm.fabs.f64(double [[A:%.*]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp reassoc ninf ugt double [[CALL]], 0.000000e+00
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp une double [[A:%.*]], 0.000000e+00
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %call = call double @llvm.fabs.f64(double %a)
-- 
GitLab


From 772306b17d2e2109ac82979a40c6f7a6061eb772 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Wed, 7 Nov 2018 15:36:23 +0000
Subject: [PATCH 1067/1116] [InstCombine] add tests for isnan(fabs(X)); NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346325 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Transforms/InstCombine/fcmp.ll | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/test/Transforms/InstCombine/fcmp.ll b/test/Transforms/InstCombine/fcmp.ll
index 5aea40f0918..b8cf6e42b2f 100644
--- a/test/Transforms/InstCombine/fcmp.ll
+++ b/test/Transforms/InstCombine/fcmp.ll
@@ -296,6 +296,28 @@ define <2 x i1> @fabs_ueq(<2 x float> %a) {
   ret <2 x i1> %cmp
 }
 
+define <2 x i1> @fabs_ord(<2 x float> %a) {
+; CHECK-LABEL: @fabs_ord(
+; CHECK-NEXT:    [[CALL:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[A:%.*]])
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp arcp ord <2 x float> [[CALL]], zeroinitializer
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
+;
+  %call = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
+  %cmp = fcmp arcp ord <2 x float> %call, zeroinitializer
+  ret <2 x i1> %cmp
+}
+
+define <2 x i1> @fabs_uno(<2 x float> %a) {
+; CHECK-LABEL: @fabs_uno(
+; CHECK-NEXT:    [[CALL:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[A:%.*]])
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp arcp uno <2 x float> [[CALL]], zeroinitializer
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
+;
+  %call = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
+  %cmp = fcmp arcp uno <2 x float> %call, zeroinitializer
+  ret <2 x i1> %cmp
+}
+
 ; Don't crash.
 define i32 @test17(double %a, double (double)* %p) {
 ; CHECK-LABEL: @test17(
-- 
GitLab


From 3bfaa4d7be433f0ba92ba8f664a0af45afd016db Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Wed, 7 Nov 2018 15:44:26 +0000
Subject: [PATCH 1068/1116] [InstCombine] peek through fabs() when checking
 isnan()

That should be the end of the missing cases for this fold.
See earlier patches in this series:
rL346321
rL346324


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346327 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/InstCombine/InstCombineCompares.cpp | 7 ++++++-
 test/Transforms/InstCombine/fcmp.ll                | 6 ++----
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index be26d5e8124..7a56313000d 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -5328,8 +5328,13 @@ static Instruction *foldFabsWithFcmpZero(FCmpInst &I) {
   case FCmpInst::FCMP_UEQ:
   case FCmpInst::FCMP_ONE:
   case FCmpInst::FCMP_UNE:
-    // fabs(X) == 0.0 --> X == 0.0
+  case FCmpInst::FCMP_ORD:
+  case FCmpInst::FCMP_UNO:
+    // Look through the fabs() because it doesn't change anything but the sign.
+    // fabs(X) == 0.0 --> X == 0.0,
     // fabs(X) != 0.0 --> X != 0.0
+    // isnan(fabs(X)) --> isnan(X)
+    // !isnan(fabs(X) --> !isnan(X)
     return new FCmpInst(I.getPredicate(), X, I.getOperand(1));
 
   default:
diff --git a/test/Transforms/InstCombine/fcmp.ll b/test/Transforms/InstCombine/fcmp.ll
index b8cf6e42b2f..15d9368f49e 100644
--- a/test/Transforms/InstCombine/fcmp.ll
+++ b/test/Transforms/InstCombine/fcmp.ll
@@ -298,8 +298,7 @@ define <2 x i1> @fabs_ueq(<2 x float> %a) {
 
 define <2 x i1> @fabs_ord(<2 x float> %a) {
 ; CHECK-LABEL: @fabs_ord(
-; CHECK-NEXT:    [[CALL:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[A:%.*]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp arcp ord <2 x float> [[CALL]], zeroinitializer
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ord <2 x float> [[A:%.*]], zeroinitializer
 ; CHECK-NEXT:    ret <2 x i1> [[CMP]]
 ;
   %call = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
@@ -309,8 +308,7 @@ define <2 x i1> @fabs_ord(<2 x float> %a) {
 
 define <2 x i1> @fabs_uno(<2 x float> %a) {
 ; CHECK-LABEL: @fabs_uno(
-; CHECK-NEXT:    [[CALL:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[A:%.*]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp arcp uno <2 x float> [[CALL]], zeroinitializer
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp uno <2 x float> [[A:%.*]], zeroinitializer
 ; CHECK-NEXT:    ret <2 x i1> [[CMP]]
 ;
   %call = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
-- 
GitLab


From caeba011f77c2e33e3b7e8dae892fae6d93cb80c Mon Sep 17 00:00:00 2001
From: Clement Courbet <courbet@google.com>
Date: Wed, 7 Nov 2018 15:46:45 +0000
Subject: [PATCH 1069/1116] [llvm-exegesis] Increasing wrapping limit.

Summary: Fixes PR39097.

Reviewers: gchatelet

Subscribers: llvm-commits, tschuett

Differential Revision: https://reviews.llvm.org/D54151

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346328 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-exegesis/lib/BenchmarkResult.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/llvm-exegesis/lib/BenchmarkResult.cpp b/tools/llvm-exegesis/lib/BenchmarkResult.cpp
index 4b91c6c3b3c..0507ae8959d 100644
--- a/tools/llvm-exegesis/lib/BenchmarkResult.cpp
+++ b/tools/llvm-exegesis/lib/BenchmarkResult.cpp
@@ -344,7 +344,7 @@ InstructionBenchmark::readYamls(const LLVMState &State,
 
 void InstructionBenchmark::writeYamlTo(const LLVMState &State,
                                        llvm::raw_ostream &OS) {
-  llvm::yaml::Output Yout(OS);
+  llvm::yaml::Output Yout(OS, nullptr /*Ctx*/, 200 /*WrapColumn*/);
   YamlContext Context(State);
   Yout.beginDocuments();
   llvm::yaml::yamlize(Yout, *this, /*unused*/ true, Context);
-- 
GitLab


From 523f4005eb0735d5ae01a48101ec9b37764fb4c0 Mon Sep 17 00:00:00 2001
From: Clement Courbet <courbet@google.com>
Date: Wed, 7 Nov 2018 16:14:55 +0000
Subject: [PATCH 1070/1116] [llvm-exegesis] Correclty handle all X86 memory
 encoding formats.

Summary:
Add unit tests to check the support for each supported format to avoid
regressions such as the one in PR36906.

Reviewers: gchatelet

Subscribers: tschuett, lebedev.ri, llvm-commits

Differential Revision: https://reviews.llvm.org/D54144

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346330 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/tools/llvm-exegesis/X86/uops-ADD32mi8.s  |   6 +
 test/tools/llvm-exegesis/X86/uops-ADD32mr.s   |   6 +
 test/tools/llvm-exegesis/X86/uops-ADD32rm.s   |   6 +
 test/tools/llvm-exegesis/X86/uops-BEXTR32rm.s |   6 +
 test/tools/llvm-exegesis/X86/uops-BSF16rm.s   |   6 +
 test/tools/llvm-exegesis/X86/uops-BTR64mr.s   |   6 +
 .../llvm-exegesis/X86/uops-VFMADDSS4rm.s      |   6 +
 tools/llvm-exegesis/lib/X86/Target.cpp        | 204 ++++++++++++------
 8 files changed, 183 insertions(+), 63 deletions(-)
 create mode 100644 test/tools/llvm-exegesis/X86/uops-ADD32mi8.s
 create mode 100644 test/tools/llvm-exegesis/X86/uops-ADD32mr.s
 create mode 100644 test/tools/llvm-exegesis/X86/uops-ADD32rm.s
 create mode 100644 test/tools/llvm-exegesis/X86/uops-BEXTR32rm.s
 create mode 100644 test/tools/llvm-exegesis/X86/uops-BSF16rm.s
 create mode 100644 test/tools/llvm-exegesis/X86/uops-BTR64mr.s
 create mode 100644 test/tools/llvm-exegesis/X86/uops-VFMADDSS4rm.s

diff --git a/test/tools/llvm-exegesis/X86/uops-ADD32mi8.s b/test/tools/llvm-exegesis/X86/uops-ADD32mi8.s
new file mode 100644
index 00000000000..e3b3b80efc2
--- /dev/null
+++ b/test/tools/llvm-exegesis/X86/uops-ADD32mi8.s
@@ -0,0 +1,6 @@
+# RUN: llvm-exegesis -mode=uops -opcode-name=ADD32mi8 | FileCheck %s
+
+CHECK:      mode:            uops
+CHECK-NEXT: key:
+CHECK-NEXT:   instructions:
+CHECK-NEXT:     ADD32mi8
diff --git a/test/tools/llvm-exegesis/X86/uops-ADD32mr.s b/test/tools/llvm-exegesis/X86/uops-ADD32mr.s
new file mode 100644
index 00000000000..80ecb3033b0
--- /dev/null
+++ b/test/tools/llvm-exegesis/X86/uops-ADD32mr.s
@@ -0,0 +1,6 @@
+# RUN: llvm-exegesis -mode=uops -opcode-name=ADD32mr | FileCheck %s
+
+CHECK:      mode:            uops
+CHECK-NEXT: key:
+CHECK-NEXT:   instructions:
+CHECK-NEXT:     ADD32mr
diff --git a/test/tools/llvm-exegesis/X86/uops-ADD32rm.s b/test/tools/llvm-exegesis/X86/uops-ADD32rm.s
new file mode 100644
index 00000000000..0e6bdb587b9
--- /dev/null
+++ b/test/tools/llvm-exegesis/X86/uops-ADD32rm.s
@@ -0,0 +1,6 @@
+# RUN: llvm-exegesis -mode=uops -opcode-name=ADD32rm | FileCheck %s
+
+CHECK:      mode:            uops
+CHECK-NEXT: key:
+CHECK-NEXT:   instructions:
+CHECK-NEXT:     ADD32rm
diff --git a/test/tools/llvm-exegesis/X86/uops-BEXTR32rm.s b/test/tools/llvm-exegesis/X86/uops-BEXTR32rm.s
new file mode 100644
index 00000000000..c4d2c7d840f
--- /dev/null
+++ b/test/tools/llvm-exegesis/X86/uops-BEXTR32rm.s
@@ -0,0 +1,6 @@
+# RUN: llvm-exegesis -mode=uops -opcode-name=BEXTR32rm | FileCheck %s
+
+CHECK:      mode:            uops
+CHECK-NEXT: key:
+CHECK-NEXT:   instructions:
+CHECK-NEXT:     BEXTR32rm
diff --git a/test/tools/llvm-exegesis/X86/uops-BSF16rm.s b/test/tools/llvm-exegesis/X86/uops-BSF16rm.s
new file mode 100644
index 00000000000..9cb278dc052
--- /dev/null
+++ b/test/tools/llvm-exegesis/X86/uops-BSF16rm.s
@@ -0,0 +1,6 @@
+# RUN: llvm-exegesis -mode=uops -opcode-name=BSF16rm | FileCheck %s
+
+CHECK:      mode:            uops
+CHECK-NEXT: key:
+CHECK-NEXT:   instructions:
+CHECK-NEXT:     BSF16rm
diff --git a/test/tools/llvm-exegesis/X86/uops-BTR64mr.s b/test/tools/llvm-exegesis/X86/uops-BTR64mr.s
new file mode 100644
index 00000000000..6d4544b5c52
--- /dev/null
+++ b/test/tools/llvm-exegesis/X86/uops-BTR64mr.s
@@ -0,0 +1,6 @@
+# RUN: llvm-exegesis -mode=uops -opcode-name=BTR64mr | FileCheck %s
+
+CHECK:      mode:            uops
+CHECK-NEXT: key:
+CHECK-NEXT:   instructions:
+CHECK-NEXT:     BTR64mr
diff --git a/test/tools/llvm-exegesis/X86/uops-VFMADDSS4rm.s b/test/tools/llvm-exegesis/X86/uops-VFMADDSS4rm.s
new file mode 100644
index 00000000000..c323395ef5b
--- /dev/null
+++ b/test/tools/llvm-exegesis/X86/uops-VFMADDSS4rm.s
@@ -0,0 +1,6 @@
+# RUN: llvm-exegesis -mode=uops -opcode-name=VFMADDSS4rm | FileCheck %s
+
+CHECK:      mode:            uops
+CHECK-NEXT: key:
+CHECK-NEXT:   instructions:
+CHECK-NEXT:     VFMADDSS4rm
diff --git a/tools/llvm-exegesis/lib/X86/Target.cpp b/tools/llvm-exegesis/lib/X86/Target.cpp
index b74d5dcde9f..7f26adbe237 100644
--- a/tools/llvm-exegesis/lib/X86/Target.cpp
+++ b/tools/llvm-exegesis/lib/X86/Target.cpp
@@ -22,55 +22,124 @@ namespace exegesis {
 
 namespace {
 
-// A chunk of instruction's operands that represents a single memory access.
-struct MemoryOperandRange {
-  MemoryOperandRange(llvm::ArrayRef<Operand> Operands) : Ops(Operands) {}
-
-  // Setup InstructionTemplate so the memory access represented by this object
-  // points to [reg] + offset.
-  void fillOrDie(InstructionTemplate &IT, unsigned Reg, unsigned Offset) {
-    switch (Ops.size()) {
-    case 5:
-      IT.getValueFor(Ops[0]) = llvm::MCOperand::createReg(Reg);    // BaseReg
-      IT.getValueFor(Ops[1]) = llvm::MCOperand::createImm(1);      // ScaleAmt
-      IT.getValueFor(Ops[2]) = llvm::MCOperand::createReg(0);      // IndexReg
-      IT.getValueFor(Ops[3]) = llvm::MCOperand::createImm(Offset); // Disp
-      IT.getValueFor(Ops[4]) = llvm::MCOperand::createReg(0);      // Segment
-      break;
-    default:
-      llvm::errs() << Ops.size() << "-op are not handled right now ("
-                   << IT.Instr.Name << ")\n";
-      llvm_unreachable("Invalid memory configuration");
-    }
-  }
-
-  // Returns whether Range can be filled.
-  static bool isValid(const MemoryOperandRange &Range) {
-    return Range.Ops.size() == 5;
-  }
-
-  // Returns whether Op is a valid memory operand.
-  static bool isMemoryOperand(const Operand &Op) {
-    return Op.isMemory() && Op.isExplicit();
-  }
-
-  llvm::ArrayRef<Operand> Ops;
-};
-
-// X86 memory access involve non constant number of operands, this function
-// extracts contiguous memory operands into MemoryOperandRange so it's easier to
-// check and fill.
-static std::vector<MemoryOperandRange>
-getMemoryOperandRanges(llvm::ArrayRef<Operand> Operands) {
-  std::vector<MemoryOperandRange> Result;
-  while (!Operands.empty()) {
-    Operands = Operands.drop_until(MemoryOperandRange::isMemoryOperand);
-    auto MemoryOps = Operands.take_while(MemoryOperandRange::isMemoryOperand);
-    if (!MemoryOps.empty())
-      Result.push_back(MemoryOps);
-    Operands = Operands.drop_front(MemoryOps.size());
+// Returns an error if we cannot handle the memory references in this
+// instruction.
+Error isInvalidMemoryInstr(const Instruction &Instr) {
+  switch (Instr.Description->TSFlags & X86II::FormMask) {
+  default:
+    llvm_unreachable("Unknown FormMask value");
+  // These have no memory access.
+  case X86II::Pseudo:
+  case X86II::RawFrm:
+  case X86II::MRMDestReg:
+  case X86II::MRMSrcReg:
+  case X86II::MRMSrcReg4VOp3:
+  case X86II::MRMSrcRegOp4:
+  case X86II::MRMXr:
+  case X86II::MRM0r:
+  case X86II::MRM1r:
+  case X86II::MRM2r:
+  case X86II::MRM3r:
+  case X86II::MRM4r:
+  case X86II::MRM5r:
+  case X86II::MRM6r:
+  case X86II::MRM7r:
+  case X86II::MRM_C0:
+  case X86II::MRM_C1:
+  case X86II::MRM_C2:
+  case X86II::MRM_C3:
+  case X86II::MRM_C4:
+  case X86II::MRM_C5:
+  case X86II::MRM_C6:
+  case X86II::MRM_C7:
+  case X86II::MRM_C8:
+  case X86II::MRM_C9:
+  case X86II::MRM_CA:
+  case X86II::MRM_CB:
+  case X86II::MRM_CC:
+  case X86II::MRM_CD:
+  case X86II::MRM_CE:
+  case X86II::MRM_CF:
+  case X86II::MRM_D0:
+  case X86II::MRM_D1:
+  case X86II::MRM_D2:
+  case X86II::MRM_D3:
+  case X86II::MRM_D4:
+  case X86II::MRM_D5:
+  case X86II::MRM_D6:
+  case X86II::MRM_D7:
+  case X86II::MRM_D8:
+  case X86II::MRM_D9:
+  case X86II::MRM_DA:
+  case X86II::MRM_DB:
+  case X86II::MRM_DC:
+  case X86II::MRM_DD:
+  case X86II::MRM_DE:
+  case X86II::MRM_DF:
+  case X86II::MRM_E0:
+  case X86II::MRM_E1:
+  case X86II::MRM_E2:
+  case X86II::MRM_E3:
+  case X86II::MRM_E4:
+  case X86II::MRM_E5:
+  case X86II::MRM_E6:
+  case X86II::MRM_E7:
+  case X86II::MRM_E8:
+  case X86II::MRM_E9:
+  case X86II::MRM_EA:
+  case X86II::MRM_EB:
+  case X86II::MRM_EC:
+  case X86II::MRM_ED:
+  case X86II::MRM_EE:
+  case X86II::MRM_EF:
+  case X86II::MRM_F0:
+  case X86II::MRM_F1:
+  case X86II::MRM_F2:
+  case X86II::MRM_F3:
+  case X86II::MRM_F4:
+  case X86II::MRM_F5:
+  case X86II::MRM_F6:
+  case X86II::MRM_F7:
+  case X86II::MRM_F8:
+  case X86II::MRM_F9:
+  case X86II::MRM_FA:
+  case X86II::MRM_FB:
+  case X86II::MRM_FC:
+  case X86II::MRM_FD:
+  case X86II::MRM_FE:
+  case X86II::MRM_FF:
+  case X86II::RawFrmImm8:
+    return Error::success();
+  case X86II::AddRegFrm:
+    return (Instr.Description->Opcode == X86::POP16r || Instr.Description->Opcode == X86::POP32r ||
+            Instr.Description->Opcode == X86::PUSH16r || Instr.Description->Opcode == X86::PUSH32r)
+               ? make_error<BenchmarkFailure>(
+                     "unsupported opcode: unsupported memory access")
+               : Error::success();
+  // These access memory and are handled.
+  case X86II::MRMDestMem:
+  case X86II::MRMSrcMem:
+  case X86II::MRMSrcMem4VOp3:
+  case X86II::MRMSrcMemOp4:
+  case X86II::MRMXm:
+  case X86II::MRM0m:
+  case X86II::MRM1m:
+  case X86II::MRM2m:
+  case X86II::MRM3m:
+  case X86II::MRM4m:
+  case X86II::MRM5m:
+  case X86II::MRM6m:
+  case X86II::MRM7m:
+    return Error::success();
+  // These access memory and are not handled yet.
+  case X86II::RawFrmImm16:
+  case X86II::RawFrmMemOffs:
+  case X86II::RawFrmSrc:
+  case X86II::RawFrmDst:
+  case X86II::RawFrmDstSrc:
+    return make_error<BenchmarkFailure>(
+        "unsupported opcode: non uniform memory access");
   }
-  return Result;
 }
 
 static llvm::Error IsInvalidOpcode(const Instruction &Instr) {
@@ -82,23 +151,14 @@ static llvm::Error IsInvalidOpcode(const Instruction &Instr) {
       OpcodeName.startswith("ADJCALLSTACK"))
     return llvm::make_error<BenchmarkFailure>(
         "unsupported opcode: Push/Pop/AdjCallStack");
-  const bool ValidMemoryOperands = llvm::all_of(
-      getMemoryOperandRanges(Instr.Operands), MemoryOperandRange::isValid);
-  if (!ValidMemoryOperands)
-    return llvm::make_error<BenchmarkFailure>(
-        "unsupported opcode: non uniform memory access");
+  if (llvm::Error Error = isInvalidMemoryInstr(Instr))
+    return std::move(Error);
   // We do not handle instructions with OPERAND_PCREL.
   for (const Operand &Op : Instr.Operands)
     if (Op.isExplicit() &&
         Op.getExplicitOperandInfo().OperandType == llvm::MCOI::OPERAND_PCREL)
       return llvm::make_error<BenchmarkFailure>(
           "unsupported opcode: PC relative operand");
-  for (const Operand &Op : Instr.Operands)
-    if (Op.isReg() && Op.isExplicit() &&
-        Op.getExplicitOperandInfo().RegClass ==
-            llvm::X86::SEGMENT_REGRegClassID)
-      return llvm::make_error<BenchmarkFailure>(
-          "unsupported opcode: access segment memory");
   // We do not handle second-form X87 instructions. We only handle first-form
   // ones (_Fp), see comment in X86InstrFPStack.td.
   for (const Operand &Op : Instr.Operands)
@@ -357,10 +417,28 @@ private:
 
   void fillMemoryOperands(InstructionTemplate &IT, unsigned Reg,
                           unsigned Offset) const override {
-    // FIXME: For instructions that read AND write to memory, we use the same
-    // value for input and output.
-    for (auto &MemoryRange : getMemoryOperandRanges(IT.Instr.Operands))
-      MemoryRange.fillOrDie(IT, Reg, Offset);
+    assert(!isInvalidMemoryInstr(IT.Instr) &&
+           "fillMemoryOperands requires a valid memory instruction");
+    int MemOpIdx = X86II::getMemoryOperandNo(IT.Instr.Description->TSFlags);
+    assert(MemOpIdx >= 0 && "invalid memory operand index");
+    // getMemoryOperandNo() ignores tied operands, so we have to add them back.
+    for (unsigned I = 0; I <= static_cast<unsigned>(MemOpIdx); ++I) {
+      const auto &Op = IT.Instr.Operands[I];
+      if (Op.isTied() && Op.getTiedToIndex() < I) {
+        ++MemOpIdx;
+      }
+    }
+    // Now fill in the memory operands.
+    const auto SetOp = [&IT](int OpIdx, const MCOperand &OpVal) {
+      const auto Op = IT.Instr.Operands[OpIdx];
+      assert(Op.isMemory() && Op.isExplicit() && "invalid memory pattern");
+      IT.getValueFor(Op) = OpVal;
+    };
+    SetOp(MemOpIdx + 0, MCOperand::createReg(Reg));    // BaseReg
+    SetOp(MemOpIdx + 1, MCOperand::createImm(1));      // ScaleAmt
+    SetOp(MemOpIdx + 2, MCOperand::createReg(0));      // IndexReg
+    SetOp(MemOpIdx + 3, MCOperand::createImm(Offset)); // Disp
+    SetOp(MemOpIdx + 4, MCOperand::createReg(0));      // Segment
   }
 
   std::vector<llvm::MCInst> setRegTo(const llvm::MCSubtargetInfo &STI,
-- 
GitLab


From 7a2b35fa201a092c55af833fcab13fab2546e299 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Wed, 7 Nov 2018 16:15:01 +0000
Subject: [PATCH 1071/1116] [InstCombine] propagate FMF for fcmp+fabs folds

By morphing the instruction rather than deleting and creating a new one,
we retain fast-math-flags and potentially other metadata (profile info?).


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346331 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstCombine/InstCombineCompares.cpp       | 20 ++++++++++------
 test/Transforms/InstCombine/fcmp.ll           | 24 +++++++++----------
 2 files changed, 25 insertions(+), 19 deletions(-)

diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 7a56313000d..2ba1174517f 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -5291,6 +5291,12 @@ static Instruction *foldFabsWithFcmpZero(FCmpInst &I) {
       !match(I.getOperand(1), m_PosZeroFP()))
     return nullptr;
 
+  auto replacePredAndOp0 = [](FCmpInst *I, FCmpInst::Predicate P, Value *X) {
+    I->setPredicate(P);
+    I->setOperand(0, X);
+    return I;
+  };
+
   switch (I.getPredicate()) {
   case FCmpInst::FCMP_UGE:
   case FCmpInst::FCMP_OLT:
@@ -5300,29 +5306,29 @@ static Instruction *foldFabsWithFcmpZero(FCmpInst &I) {
 
   case FCmpInst::FCMP_OGT:
     // fabs(X) > 0.0 --> X != 0.0
-    return new FCmpInst(FCmpInst::FCMP_ONE, X, I.getOperand(1));
+    return replacePredAndOp0(&I, FCmpInst::FCMP_ONE, X);
 
   case FCmpInst::FCMP_UGT:
     // fabs(X) u> 0.0 --> X u!= 0.0
-    return new FCmpInst(FCmpInst::FCMP_UNE, X, I.getOperand(1));
+    return replacePredAndOp0(&I, FCmpInst::FCMP_UNE, X);
 
   case FCmpInst::FCMP_OLE:
     // fabs(X) <= 0.0 --> X == 0.0
-    return new FCmpInst(FCmpInst::FCMP_OEQ, X, I.getOperand(1));
+    return replacePredAndOp0(&I, FCmpInst::FCMP_OEQ, X);
 
   case FCmpInst::FCMP_ULE:
     // fabs(X) u<= 0.0 --> X u== 0.0
-    return new FCmpInst(FCmpInst::FCMP_UEQ, X, I.getOperand(1));
+    return replacePredAndOp0(&I, FCmpInst::FCMP_UEQ, X);
 
   case FCmpInst::FCMP_OGE:
     // fabs(X) >= 0.0 --> !isnan(X)
     assert(!I.hasNoNaNs() && "fcmp should have simplified");
-    return new FCmpInst(FCmpInst::FCMP_ORD, X, I.getOperand(1));
+    return replacePredAndOp0(&I, FCmpInst::FCMP_ORD, X);
 
   case FCmpInst::FCMP_ULT:
     // fabs(X) u< 0.0 --> isnan(X)
     assert(!I.hasNoNaNs() && "fcmp should have simplified");
-    return new FCmpInst(FCmpInst::FCMP_UNO, X, I.getOperand(1));
+    return replacePredAndOp0(&I, FCmpInst::FCMP_UNO, X);
 
   case FCmpInst::FCMP_OEQ:
   case FCmpInst::FCMP_UEQ:
@@ -5335,7 +5341,7 @@ static Instruction *foldFabsWithFcmpZero(FCmpInst &I) {
     // fabs(X) != 0.0 --> X != 0.0
     // isnan(fabs(X)) --> isnan(X)
     // !isnan(fabs(X) --> !isnan(X)
-    return new FCmpInst(I.getPredicate(), X, I.getOperand(1));
+    return replacePredAndOp0(&I, I.getPredicate(), X);
 
   default:
     return nullptr;
diff --git a/test/Transforms/InstCombine/fcmp.ll b/test/Transforms/InstCombine/fcmp.ll
index 15d9368f49e..be7aedc7c60 100644
--- a/test/Transforms/InstCombine/fcmp.ll
+++ b/test/Transforms/InstCombine/fcmp.ll
@@ -189,7 +189,7 @@ define i1 @fabs_olt(half %a) {
 
 define <2 x i1> @fabs_ole(<2 x float> %a) {
 ; CHECK-LABEL: @fabs_ole(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq <2 x float> [[A:%.*]], zeroinitializer
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ninf oeq <2 x float> [[A:%.*]], zeroinitializer
 ; CHECK-NEXT:    ret <2 x i1> [[CMP]]
 ;
   %call = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
@@ -199,7 +199,7 @@ define <2 x i1> @fabs_ole(<2 x float> %a) {
 
 define <2 x i1> @fabs_ule(<2 x float> %a) {
 ; CHECK-LABEL: @fabs_ule(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ueq <2 x float> [[A:%.*]], zeroinitializer
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ninf arcp ueq <2 x float> [[A:%.*]], zeroinitializer
 ; CHECK-NEXT:    ret <2 x i1> [[CMP]]
 ;
   %call = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
@@ -209,7 +209,7 @@ define <2 x i1> @fabs_ule(<2 x float> %a) {
 
 define i1 @fabs_ogt(double %a) {
 ; CHECK-LABEL: @fabs_ogt(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp one double [[A:%.*]], 0.000000e+00
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp reassoc one double [[A:%.*]], 0.000000e+00
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %call = call double @llvm.fabs.f64(double %a)
@@ -219,7 +219,7 @@ define i1 @fabs_ogt(double %a) {
 
 define i1 @fabs_ugt(double %a) {
 ; CHECK-LABEL: @fabs_ugt(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp une double [[A:%.*]], 0.000000e+00
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp reassoc ninf une double [[A:%.*]], 0.000000e+00
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %call = call double @llvm.fabs.f64(double %a)
@@ -229,7 +229,7 @@ define i1 @fabs_ugt(double %a) {
 
 define i1 @fabs_oge(double %a) {
 ; CHECK-LABEL: @fabs_oge(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ord double [[A:%.*]], 0.000000e+00
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp afn ord double [[A:%.*]], 0.000000e+00
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %call = call double @llvm.fabs.f64(double %a)
@@ -239,7 +239,7 @@ define i1 @fabs_oge(double %a) {
 
 define i1 @fabs_ult(double %a) {
 ; CHECK-LABEL: @fabs_ult(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp uno double [[A:%.*]], 0.000000e+00
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp reassoc arcp uno double [[A:%.*]], 0.000000e+00
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %call = call double @llvm.fabs.f64(double %a)
@@ -258,7 +258,7 @@ define <2 x i1> @fabs_ult_nnan(<2 x float> %a) {
 
 define i1 @fabs_une(half %a) {
 ; CHECK-LABEL: @fabs_une(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp une half [[A:%.*]], 0xH0000
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ninf une half [[A:%.*]], 0xH0000
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %call = call half @llvm.fabs.f16(half %a)
@@ -268,7 +268,7 @@ define i1 @fabs_une(half %a) {
 
 define i1 @fabs_oeq(double %a) {
 ; CHECK-LABEL: @fabs_oeq(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq double [[A:%.*]], 0.000000e+00
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp reassoc ninf oeq double [[A:%.*]], 0.000000e+00
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %call = call double @llvm.fabs.f64(double %a)
@@ -278,7 +278,7 @@ define i1 @fabs_oeq(double %a) {
 
 define i1 @fabs_one(double %a) {
 ; CHECK-LABEL: @fabs_one(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp one double [[A:%.*]], 0.000000e+00
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp fast one double [[A:%.*]], 0.000000e+00
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %call = call double @llvm.fabs.f64(double %a)
@@ -288,7 +288,7 @@ define i1 @fabs_one(double %a) {
 
 define <2 x i1> @fabs_ueq(<2 x float> %a) {
 ; CHECK-LABEL: @fabs_ueq(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ueq <2 x float> [[A:%.*]], zeroinitializer
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp arcp ueq <2 x float> [[A:%.*]], zeroinitializer
 ; CHECK-NEXT:    ret <2 x i1> [[CMP]]
 ;
   %call = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
@@ -298,7 +298,7 @@ define <2 x i1> @fabs_ueq(<2 x float> %a) {
 
 define <2 x i1> @fabs_ord(<2 x float> %a) {
 ; CHECK-LABEL: @fabs_ord(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ord <2 x float> [[A:%.*]], zeroinitializer
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp arcp ord <2 x float> [[A:%.*]], zeroinitializer
 ; CHECK-NEXT:    ret <2 x i1> [[CMP]]
 ;
   %call = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
@@ -308,7 +308,7 @@ define <2 x i1> @fabs_ord(<2 x float> %a) {
 
 define <2 x i1> @fabs_uno(<2 x float> %a) {
 ; CHECK-LABEL: @fabs_uno(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp uno <2 x float> [[A:%.*]], zeroinitializer
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp arcp uno <2 x float> [[A:%.*]], zeroinitializer
 ; CHECK-NEXT:    ret <2 x i1> [[CMP]]
 ;
   %call = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
-- 
GitLab


From 7c7c0a2cd982d34de3e49dc0ef7a459e1262f306 Mon Sep 17 00:00:00 2001
From: Serge Guelton <sguelton@quarkslab.com>
Date: Wed, 7 Nov 2018 16:17:30 +0000
Subject: [PATCH 1072/1116] Fix ignorded type qualifier warning [NFC]

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346332 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/TargetLoweringBase.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp
index 30887e2d5f8..166ff18e775 100644
--- a/lib/CodeGen/TargetLoweringBase.cpp
+++ b/lib/CodeGen/TargetLoweringBase.cpp
@@ -1104,7 +1104,7 @@ void TargetLoweringBase::computeRegisterProperties(
       LegalIntReg = IntReg;
     } else {
       RegisterTypeForVT[IntReg] = TransformToType[IntReg] =
-        (const MVT::SimpleValueType)LegalIntReg;
+        (MVT::SimpleValueType)LegalIntReg;
       ValueTypeActions.setTypeAction(IVT, TypePromoteInteger);
     }
   }
-- 
GitLab


From 7fe3470de009c438bb6f9aa4fb63b43ade978ba9 Mon Sep 17 00:00:00 2001
From: Clement Courbet <courbet@google.com>
Date: Wed, 7 Nov 2018 16:52:50 +0000
Subject: [PATCH 1073/1116] [llvm-exegesis] Remove superfluous move.

/Users/buildslave/as-bldslv9_new/lld-x86_64-darwin13/llvm.src/tools/llvm-exegesis/lib/X86/Target.cpp:155:12: error: moving a local object in a return statement prevents copy elision [-Werror,-Wpessimizing-move]
    return std::move(Error);
           ^
/Users/buildslave/as-bldslv9_new/lld-x86_64-darwin13/llvm.src/tools/llvm-exegesis/lib/X86/Target.cpp:155:12: note: remove std::move call here
    return std::move(Error);
           ^~~~~~~~~~     ~

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346333 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-exegesis/lib/X86/Target.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/llvm-exegesis/lib/X86/Target.cpp b/tools/llvm-exegesis/lib/X86/Target.cpp
index 7f26adbe237..618e4d77db4 100644
--- a/tools/llvm-exegesis/lib/X86/Target.cpp
+++ b/tools/llvm-exegesis/lib/X86/Target.cpp
@@ -152,7 +152,7 @@ static llvm::Error IsInvalidOpcode(const Instruction &Instr) {
     return llvm::make_error<BenchmarkFailure>(
         "unsupported opcode: Push/Pop/AdjCallStack");
   if (llvm::Error Error = isInvalidMemoryInstr(Instr))
-    return std::move(Error);
+    return Error;
   // We do not handle instructions with OPERAND_PCREL.
   for (const Operand &Op : Instr.Operands)
     if (Op.isExplicit() &&
-- 
GitLab


From bf862d6543990dcbf284d74c04d6f7200021923b Mon Sep 17 00:00:00 2001
From: James Y Knight <jyknight@google.com>
Date: Wed, 7 Nov 2018 17:01:47 +0000
Subject: [PATCH 1074/1116] Workaround PPC backend bug in test for r346322.

It seems that the PPC backend croaks when lowering a call to a
function with an argument of type [2 x i32].

Just modify the type slightly to avoid this -- I wasn't actually
intending to stress test the backend...

llvm/lib/Target/PowerPC/PPCISelLowering.cpp:6172: llvm::SDValue llvm::PPCTargetLowering::LowerCall_64SVR4(...): Assertion `(!HasParameterArea || NumBytesActuallyUsed == ArgOffset) && "mismatch in size of parameter area"' failed.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346334 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/Generic/is-constant.ll | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/test/CodeGen/Generic/is-constant.ll b/test/CodeGen/Generic/is-constant.ll
index baeedc4c9b0..2e1f4261d6a 100644
--- a/test/CodeGen/Generic/is-constant.ll
+++ b/test/CodeGen/Generic/is-constant.ll
@@ -11,7 +11,7 @@ declare i1 @llvm.is.constant.i256(i256 %a)
 declare i1 @llvm.is.constant.v2i64(<2 x i64> %a)
 declare i1 @llvm.is.constant.f32(float %a)
 declare i1 @llvm.is.constant.sl_i32i32s({i32, i32} %a)
-declare i1 @llvm.is.constant.a2i32([2 x i32] %a)
+declare i1 @llvm.is.constant.a2i64([2 x i64] %a)
 declare i1 @llvm.is.constant.p0i64(i64* %a)
 
 ;; Basic test that optimization folds away the is.constant when given
@@ -63,7 +63,7 @@ define i1 @test_diff() #0 {
   ret i1 %ret
 }
 
-define i1 @test_various_types(i256 %int, float %float, <2 x i64> %vec, {i32, i32} %struct, [2 x i32] %arr, i64* %ptr) #0 {
+define i1 @test_various_types(i256 %int, float %float, <2 x i64> %vec, {i32, i32} %struct, [2 x i64] %arr, i64* %ptr) #0 {
 ; CHECK-LABEL: @test_various_types(
 ; CHECK: llvm.is.constant
 ; CHECK: llvm.is.constant
@@ -76,14 +76,14 @@ define i1 @test_various_types(i256 %int, float %float, <2 x i64> %vec, {i32, i32
   %v2 = call i1 @llvm.is.constant.f32(float %float)
   %v3 = call i1 @llvm.is.constant.v2i64(<2 x i64> %vec)
   %v4 = call i1 @llvm.is.constant.sl_i32i32s({i32, i32} %struct)
-  %v5 = call i1 @llvm.is.constant.a2i32([2 x i32] %arr)
+  %v5 = call i1 @llvm.is.constant.a2i64([2 x i64] %arr)
   %v6 = call i1 @llvm.is.constant.p0i64(i64* %ptr)
 
   %c1 = call i1 @llvm.is.constant.i256(i256 -1)
   %c2 = call i1 @llvm.is.constant.f32(float 17.0)
   %c3 = call i1 @llvm.is.constant.v2i64(<2 x i64> <i64 -1, i64 44>)
   %c4 = call i1 @llvm.is.constant.sl_i32i32s({i32, i32} {i32 -1, i32 32})
-  %c5 = call i1 @llvm.is.constant.a2i32([2 x i32] [i32 -1, i32 32])
+  %c5 = call i1 @llvm.is.constant.a2i64([2 x i64] [i64 -1, i64 32])
   %c6 = call i1 @llvm.is.constant.p0i64(i64* inttoptr (i32 42 to i64*))
 
   %x1 = add i1 %v1, %c1
@@ -106,7 +106,7 @@ define i1 @test_various_types2() #0 {
 ; CHECK-LABEL: @test_various_types2(
 ; CHECK: ret i1 false
   %r = call i1 @test_various_types(i256 -1, float 22.0, <2 x i64> <i64 -1, i64 44>,
-                     {i32, i32} {i32 -1, i32 55}, [2 x i32] [i32 -1, i32 55],
+                     {i32, i32} {i32 -1, i32 55}, [2 x i64] [i64 -1, i64 55],
 		     i64* inttoptr (i64 42 to i64*))
   ret i1 %r
 }
-- 
GitLab


From bdee2b6fc17530d4218e7139a23131dfae7a9ccd Mon Sep 17 00:00:00 2001
From: Florian Hahn <florian.hahn@arm.com>
Date: Wed, 7 Nov 2018 17:20:07 +0000
Subject: [PATCH 1075/1116] [NewGVN] Make sure we do not add a user to itself.

If we simplify an instruction to itself, we do not need to add a user to
itself. For congruence classes with a defining expression, we already
use a similar logic.

Fixes PR38259.

Reviewers: davide, efriedma, mcrosier

Reviewed By: davide

Differential Revision: https://reviews.llvm.org/D51168


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346335 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/NewGVN.cpp       | 10 +++++++---
 test/Transforms/NewGVN/simp-to-self.ll | 27 ++++++++++++++++++++++++++
 2 files changed, 34 insertions(+), 3 deletions(-)
 create mode 100644 test/Transforms/NewGVN/simp-to-self.ll

diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp
index cd57ebd0c6f..9803bcb485d 100644
--- a/lib/Transforms/Scalar/NewGVN.cpp
+++ b/lib/Transforms/Scalar/NewGVN.cpp
@@ -1086,9 +1086,13 @@ const Expression *NewGVN::checkSimplificationResults(Expression *E,
   CongruenceClass *CC = ValueToClass.lookup(V);
   if (CC) {
     if (CC->getLeader() && CC->getLeader() != I) {
-      // Don't add temporary instructions to the user lists.
-      if (!AllTempInstructions.count(I))
-        addAdditionalUsers(V, I);
+      // If we simplified to something else, we need to communicate
+      // that we're users of the value we simplified to.
+      if (I != V) {
+        // Don't add temporary instructions to the user lists.
+        if (!AllTempInstructions.count(I))
+          addAdditionalUsers(V, I);
+      }
       return createVariableOrConstant(CC->getLeader());
     }
     if (CC->getDefiningExpr()) {
diff --git a/test/Transforms/NewGVN/simp-to-self.ll b/test/Transforms/NewGVN/simp-to-self.ll
new file mode 100644
index 00000000000..ca46af76849
--- /dev/null
+++ b/test/Transforms/NewGVN/simp-to-self.ll
@@ -0,0 +1,27 @@
+; RUN: opt -S < %s -newgvn | FileCheck %s
+
+; CHECK-LABEL: for.cond:
+; CHECK-NEXT:    %lv = load i32, i32* bitcast (i64* @a to i32*)
+; CHECK-NEXT:    %bf.clear = and i32 %lv, -131072
+; CHECK-NEXT:    %bf.set = or i32 1, %bf.clear
+; CHECK-NEXT:    br i1 %bc, label %for.cond, label %exit
+@a = external global i64
+
+define void @fn1(i1 %bc) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.cond1.1, %entry
+  %tmp = phi i1 [ undef, %entry ], [ 1, %for.cond ]
+  %conv = zext i1 %tmp to i32
+  %lv = load i32, i32* bitcast (i64* @a to i32*)
+  %bf.clear = and i32 %lv, -131072
+  %bf.set = or i32 %conv, %bf.clear
+  %bf.clear.1 = and i32 %bf.set, -131072
+  %bf.set.1 = or i32 1, %bf.clear.1
+  br i1 %bc, label %for.cond, label %exit
+
+exit:                              ; preds = %for.cond1
+  store i32 %bf.set.1, i32* bitcast (i64* @a to i32*)
+  ret void
+}
-- 
GitLab


From f43a1f459d5adebb9a9abd701f32482397b94b4d Mon Sep 17 00:00:00 2001
From: Than McIntosh <thanm@google.com>
Date: Wed, 7 Nov 2018 17:41:57 +0000
Subject: [PATCH 1076/1116] [X86] improve split-stack machine BB placement

Summary:
The conditional branch created to support -fsplit-stack for X86 is
left unbiased/unhinted, resulting in less than ideal block placement:
the __morestack call block is kept on the main hot path. Bias the
branch to insure that the stack allocation block is treated as a
"cold" block during machine basic block placement.

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D54123

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346336 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86FrameLowering.cpp          |   4 +-
 test/CodeGen/X86/segmented-stacks-dynamic.ll |  36 +++---
 test/CodeGen/X86/segmented-stacks.ll         | 110 ++++++++++---------
 test/CodeGen/X86/x86-shrink-wrap-unwind.ll   |  15 ++-
 4 files changed, 83 insertions(+), 82 deletions(-)

diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index 67ec867b562..e40b0f81e33 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -2471,8 +2471,8 @@ void X86FrameLowering::adjustForSegmentedStacks(
 
   allocMBB->addSuccessor(&PrologueMBB);
 
-  checkMBB->addSuccessor(allocMBB);
-  checkMBB->addSuccessor(&PrologueMBB);
+  checkMBB->addSuccessor(allocMBB, BranchProbability::getZero());
+  checkMBB->addSuccessor(&PrologueMBB, BranchProbability::getOne());
 
 #ifdef EXPENSIVE_CHECKS
   MF.verify();
diff --git a/test/CodeGen/X86/segmented-stacks-dynamic.ll b/test/CodeGen/X86/segmented-stacks-dynamic.ll
index e34ba5412f0..bce51d6bc23 100644
--- a/test/CodeGen/X86/segmented-stacks-dynamic.ll
+++ b/test/CodeGen/X86/segmented-stacks-dynamic.ll
@@ -25,12 +25,7 @@ false:
 ; X32-LABEL:      test_basic:
 
 ; X32:      cmpl %gs:48, %esp
-; X32-NEXT: ja      .LBB0_2
-
-; X32:      pushl $4
-; X32-NEXT: pushl $12
-; X32-NEXT: calll __morestack
-; X32-NEXT: ret
+; X32-NEXT: jbe	.LBB0_1
 
 ; X32:      movl %esp, %eax
 ; X32:      subl %ecx, %eax
@@ -43,15 +38,15 @@ false:
 ; X32-NEXT: calll __morestack_allocate_stack_space
 ; X32-NEXT: addl $16, %esp
 
+; X32:      pushl $4
+; X32-NEXT: pushl $12
+; X32-NEXT: calll __morestack
+; X32-NEXT: ret
+
 ; X64-LABEL:      test_basic:
 
 ; X64:      cmpq %fs:112, %rsp
-; X64-NEXT: ja      .LBB0_2
-
-; X64:      movabsq $24, %r10
-; X64-NEXT: movabsq $0, %r11
-; X64-NEXT: callq __morestack
-; X64-NEXT: ret
+; X64-NEXT: jbe      .LBB0_1
 
 ; X64:      movq %rsp, %[[RDI:rdi|rax]]
 ; X64:      subq %{{.*}}, %[[RDI]]
@@ -63,15 +58,15 @@ false:
 ; X64-NEXT: callq __morestack_allocate_stack_space
 ; X64:      movq %rax, %rdi
 
+; X64:      movabsq $24, %r10
+; X64-NEXT: movabsq $0, %r11
+; X64-NEXT: callq __morestack
+; X64-NEXT: ret
+
 ; X32ABI-LABEL:      test_basic:
 
 ; X32ABI:      cmpl %fs:64, %esp
-; X32ABI-NEXT: ja      .LBB0_2
-
-; X32ABI:      movl $24, %r10d
-; X32ABI-NEXT: movl $0, %r11d
-; X32ABI-NEXT: callq __morestack
-; X32ABI-NEXT: ret
+; X32ABI-NEXT: jbe      .LBB0_1
 
 ; X32ABI:      movl %esp, %[[EDI:edi|eax]]
 ; X32ABI:      subl %{{.*}}, %[[EDI]]
@@ -83,6 +78,11 @@ false:
 ; X32ABI-NEXT: callq __morestack_allocate_stack_space
 ; X32ABI:      movl %eax, %edi
 
+; X32ABI:      movl $24, %r10d
+; X32ABI-NEXT: movl $0, %r11d
+; X32ABI-NEXT: callq __morestack
+; X32ABI-NEXT: ret
+
 }
 
 attributes #0 = { "split-stack" }
diff --git a/test/CodeGen/X86/segmented-stacks.ll b/test/CodeGen/X86/segmented-stacks.ll
index 588262bbb39..fac9a33394b 100644
--- a/test/CodeGen/X86/segmented-stacks.ll
+++ b/test/CodeGen/X86/segmented-stacks.ll
@@ -41,7 +41,7 @@ define void @test_basic() #0 {
 ; X32-Linux-LABEL:       test_basic:
 
 ; X32-Linux:       cmpl %gs:48, %esp
-; X32-Linux-NEXT:  ja      .LBB0_2
+; X32-Linux-NEXT:  jbe	.LBB0_1
 
 ; X32-Linux:       pushl $0
 ; X32-Linux-NEXT:  pushl $44
@@ -51,7 +51,7 @@ define void @test_basic() #0 {
 ; X64-Linux-LABEL:       test_basic:
 
 ; X64-Linux:       cmpq %fs:112, %rsp
-; X64-Linux-NEXT:  ja      .LBB0_2
+; X64-Linux-NEXT:  jbe	.LBB0_1
 
 ; X64-Linux:       movabsq $40, %r10
 ; X64-Linux-NEXT:  movabsq $0, %r11
@@ -61,7 +61,7 @@ define void @test_basic() #0 {
 ; X64-Linux-Large-LABEL:       test_basic:
 
 ; X64-Linux-Large:       cmpq %fs:112, %rsp
-; X64-Linux-Large-NEXT:  ja      .LBB0_2
+; X64-Linux-Large-NEXT:  jbe	.LBB0_1
 
 ; X64-Linux-Large:       movabsq $40, %r10
 ; X64-Linux-Large-NEXT:  movabsq $0, %r11
@@ -71,7 +71,7 @@ define void @test_basic() #0 {
 ; X32ABI-LABEL:       test_basic:
 
 ; X32ABI:       cmpl %fs:64, %esp
-; X32ABI-NEXT:  ja      .LBB0_2
+; X32ABI-NEXT:  jbe	.LBB0_1
 
 ; X32ABI:       movl $40, %r10d
 ; X32ABI-NEXT:  movl $0, %r11d
@@ -82,7 +82,7 @@ define void @test_basic() #0 {
 
 ; X32-Darwin:      movl $432, %ecx
 ; X32-Darwin-NEXT: cmpl %gs:(%ecx), %esp
-; X32-Darwin-NEXT: ja      LBB0_2
+; X32-Darwin-NEXT: jbe	LBB0_1
 
 ; X32-Darwin:      pushl $0
 ; X32-Darwin-NEXT: pushl $60
@@ -92,7 +92,7 @@ define void @test_basic() #0 {
 ; X64-Darwin-LABEL:      test_basic:
 
 ; X64-Darwin:      cmpq %gs:816, %rsp
-; X64-Darwin-NEXT: ja      LBB0_2
+; X64-Darwin-NEXT: jbe	LBB0_1
 
 ; X64-Darwin:      movabsq $40, %r10
 ; X64-Darwin-NEXT: movabsq $0, %r11
@@ -102,7 +102,7 @@ define void @test_basic() #0 {
 ; X32-MinGW-LABEL:       test_basic:
 
 ; X32-MinGW:       cmpl %fs:20, %esp
-; X32-MinGW-NEXT:  ja      LBB0_2
+; X32-MinGW-NEXT:  jbe      LBB0_1
 
 ; X32-MinGW:       pushl $0
 ; X32-MinGW-NEXT:  pushl $40
@@ -112,7 +112,7 @@ define void @test_basic() #0 {
 ; X64-MinGW-LABEL:       test_basic:
 
 ; X64-MinGW:       cmpq %gs:40, %rsp
-; X64-MinGW-NEXT:  ja      .LBB0_2
+; X64-MinGW-NEXT:  jbe      .LBB0_1
 
 ; X64-MinGW:       movabsq $72, %r10
 ; X64-MinGW-NEXT:  movabsq $32, %r11
@@ -122,7 +122,7 @@ define void @test_basic() #0 {
 ; X64-FreeBSD-LABEL:       test_basic:
 
 ; X64-FreeBSD:       cmpq %fs:24, %rsp
-; X64-FreeBSD-NEXT:  ja      .LBB0_2
+; X64-FreeBSD-NEXT:  jbe      .LBB0_1
 
 ; X64-FreeBSD:       movabsq $40, %r10
 ; X64-FreeBSD-NEXT:  movabsq $0, %r11
@@ -132,7 +132,7 @@ define void @test_basic() #0 {
 ; X32-DFlyBSD-LABEL:       test_basic:
 
 ; X32-DFlyBSD:       cmpl %fs:16, %esp
-; X32-DFlyBSD-NEXT:  ja      .LBB0_2
+; X32-DFlyBSD-NEXT:  jbe      .LBB0_1
 
 ; X32-DFlyBSD:       pushl $0
 ; X32-DFlyBSD-NEXT:  pushl $40
@@ -142,7 +142,7 @@ define void @test_basic() #0 {
 ; X64-DFlyBSD-LABEL:       test_basic:
 
 ; X64-DFlyBSD:       cmpq %fs:32, %rsp
-; X64-DFlyBSD-NEXT:  ja      .LBB0_2
+; X64-DFlyBSD-NEXT:  jbe      .LBB0_1
 
 ; X64-DFlyBSD:       movabsq $40, %r10
 ; X64-DFlyBSD-NEXT:  movabsq $0, %r11
@@ -159,7 +159,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) #0 {
        ret i32 %result
 
 ; X32-Linux:       cmpl %gs:48, %esp
-; X32-Linux-NEXT:  ja      .LBB1_2
+; X32-Linux-NEXT:  jbe	.LBB1_1
 
 ; X32-Linux:       pushl $4
 ; X32-Linux-NEXT:  pushl $44
@@ -167,7 +167,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) #0 {
 ; X32-Linux-NEXT:  ret
 
 ; X64-Linux:       cmpq %fs:112, %rsp
-; X64-Linux-NEXT:  ja      .LBB1_2
+; X64-Linux-NEXT:  jbe	.LBB1_1
 
 ; X64-Linux:       movq %r10, %rax
 ; X64-Linux-NEXT:  movabsq $56, %r10
@@ -177,7 +177,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) #0 {
 ; X64-Linux-NEXT:  movq %rax, %r10
 
 ; X32ABI:       cmpl %fs:64, %esp
-; X32ABI-NEXT:  ja      .LBB1_2
+; X32ABI-NEXT:  jbe	.LBB1_1
 
 ; X32ABI:       movl %r10d, %eax
 ; X32ABI-NEXT:  movl $56, %r10d
@@ -188,7 +188,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) #0 {
 
 ; X32-Darwin:      movl $432, %edx
 ; X32-Darwin-NEXT: cmpl %gs:(%edx), %esp
-; X32-Darwin-NEXT: ja      LBB1_2
+; X32-Darwin-NEXT: jbe	LBB1_1
 
 ; X32-Darwin:      pushl $4
 ; X32-Darwin-NEXT: pushl $60
@@ -196,7 +196,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) #0 {
 ; X32-Darwin-NEXT: ret
 
 ; X64-Darwin:      cmpq %gs:816, %rsp
-; X64-Darwin-NEXT: ja      LBB1_2
+; X64-Darwin-NEXT: jbe	LBB1_1
 
 ; X64-Darwin:      movq %r10, %rax
 ; X64-Darwin-NEXT: movabsq $56, %r10
@@ -206,7 +206,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) #0 {
 ; X64-Darwin-NEXT: movq %rax, %r10
 
 ; X32-MinGW:       cmpl %fs:20, %esp
-; X32-MinGW-NEXT:  ja      LBB1_2
+; X32-MinGW-NEXT:  jbe      LBB1_1
 
 ; X32-MinGW:       pushl $4
 ; X32-MinGW-NEXT:  pushl $44
@@ -215,7 +215,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) #0 {
 
 ; X64-MinGW-LABEL: test_nested:
 ; X64-MinGW:       cmpq %gs:40, %rsp
-; X64-MinGW-NEXT:  ja      .LBB1_2
+; X64-MinGW-NEXT:  jbe      .LBB1_1
 
 ; X64-MinGW:       movq %r10, %rax
 ; X64-MinGW-NEXT:  movabsq $88, %r10
@@ -225,7 +225,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) #0 {
 ; X64-MinGW-NEXT:  movq %rax, %r10
 
 ; X64-FreeBSD:       cmpq %fs:24, %rsp
-; X64-FreeBSD-NEXT:  ja      .LBB1_2
+; X64-FreeBSD-NEXT:  jbe      .LBB1_1
 
 ; X64-FreeBSD:       movq %r10, %rax
 ; X64-FreeBSD-NEXT:  movabsq $56, %r10
@@ -235,7 +235,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) #0 {
 ; X64-FreeBSD-NEXT:  movq %rax, %r10
 
 ; X32-DFlyBSD:       cmpl %fs:16, %esp
-; X32-DFlyBSD-NEXT:  ja      .LBB1_2
+; X32-DFlyBSD-NEXT:  jbe      .LBB1_1
 
 ; X32-DFlyBSD:       pushl $4
 ; X32-DFlyBSD-NEXT:  pushl $44
@@ -243,7 +243,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) #0 {
 ; X32-DFlyBSD-NEXT:  ret
 
 ; X64-DFlyBSD:       cmpq %fs:32, %rsp
-; X64-DFlyBSD-NEXT:  ja      .LBB1_2
+; X64-DFlyBSD-NEXT:  jbe      .LBB1_1
 
 ; X64-DFlyBSD:       movq %r10, %rax
 ; X64-DFlyBSD-NEXT:  movabsq $56, %r10
@@ -256,12 +256,14 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) #0 {
 
 define void @test_large() #0 {
         %mem = alloca i32, i32 10000
-        call void @dummy_use (i32* %mem, i32 0)
+        call void @dummy_use (i32* %mem, i32 3)
         ret void
 
+; X32-Linux-LABEL:       test_large:
+
 ; X32-Linux:       leal -40012(%esp), %ecx
 ; X32-Linux-NEXT:  cmpl %gs:48, %ecx
-; X32-Linux-NEXT:  ja      .LBB2_2
+; X32-Linux-NEXT:  jbe	.LBB2_1
 
 ; X32-Linux:       pushl $0
 ; X32-Linux-NEXT:  pushl $40012
@@ -270,7 +272,7 @@ define void @test_large() #0 {
 
 ; X64-Linux:       leaq -40008(%rsp), %r11
 ; X64-Linux-NEXT:  cmpq %fs:112, %r11
-; X64-Linux-NEXT:  ja      .LBB2_2
+; X64-Linux-NEXT:  jbe	.LBB2_1
 
 ; X64-Linux:       movabsq $40008, %r10
 ; X64-Linux-NEXT:  movabsq $0, %r11
@@ -279,7 +281,7 @@ define void @test_large() #0 {
 
 ; X32ABI:       leal -40008(%rsp), %r11d
 ; X32ABI-NEXT:  cmpl %fs:64, %r11d
-; X32ABI-NEXT:  ja      .LBB2_2
+; X32ABI-NEXT:  jbe	.LBB2_1
 
 ; X32ABI:       movl $40008, %r10d
 ; X32ABI-NEXT:  movl $0, %r11d
@@ -289,7 +291,7 @@ define void @test_large() #0 {
 ; X32-Darwin:      leal -40012(%esp), %ecx
 ; X32-Darwin-NEXT: movl $432, %eax
 ; X32-Darwin-NEXT: cmpl %gs:(%eax), %ecx
-; X32-Darwin-NEXT: ja      LBB2_2
+; X32-Darwin-NEXT: jbe	LBB2_1
 
 ; X32-Darwin:      pushl $0
 ; X32-Darwin-NEXT: pushl $40012
@@ -298,7 +300,7 @@ define void @test_large() #0 {
 
 ; X64-Darwin:      leaq -40008(%rsp), %r11
 ; X64-Darwin-NEXT: cmpq %gs:816, %r11
-; X64-Darwin-NEXT: ja      LBB2_2
+; X64-Darwin-NEXT: jbe      LBB2_1
 
 ; X64-Darwin:      movabsq $40008, %r10
 ; X64-Darwin-NEXT: movabsq $0, %r11
@@ -307,7 +309,7 @@ define void @test_large() #0 {
 
 ; X32-MinGW:       leal -40000(%esp), %ecx
 ; X32-MinGW-NEXT:  cmpl %fs:20, %ecx
-; X32-MinGW-NEXT:  ja      LBB2_2
+; X32-MinGW-NEXT:  jbe      LBB2_1
 
 ; X32-MinGW:       pushl $0
 ; X32-MinGW-NEXT:  pushl $40000
@@ -317,7 +319,7 @@ define void @test_large() #0 {
 ; X64-MinGW-LABEL: test_large:
 ; X64-MinGW:       leaq -40040(%rsp), %r11
 ; X64-MinGW-NEXT:  cmpq %gs:40, %r11
-; X64-MinGW-NEXT:  ja      .LBB2_2
+; X64-MinGW-NEXT:  jbe      .LBB2_1
 
 ; X64-MinGW:       movabsq $40040, %r10
 ; X64-MinGW-NEXT:  movabsq $32, %r11
@@ -326,7 +328,7 @@ define void @test_large() #0 {
 
 ; X64-FreeBSD:       leaq -40008(%rsp), %r11
 ; X64-FreeBSD-NEXT:  cmpq %fs:24, %r11
-; X64-FreeBSD-NEXT:  ja      .LBB2_2
+; X64-FreeBSD-NEXT:  jbe      .LBB2_1
 
 ; X64-FreeBSD:       movabsq $40008, %r10
 ; X64-FreeBSD-NEXT:  movabsq $0, %r11
@@ -335,7 +337,7 @@ define void @test_large() #0 {
 
 ; X32-DFlyBSD:       leal -40000(%esp), %ecx
 ; X32-DFlyBSD-NEXT:  cmpl %fs:16, %ecx
-; X32-DFlyBSD-NEXT:  ja      .LBB2_2
+; X32-DFlyBSD-NEXT:  jbe      .LBB2_1
 
 ; X32-DFlyBSD:       pushl $0
 ; X32-DFlyBSD-NEXT:  pushl $40000
@@ -344,7 +346,7 @@ define void @test_large() #0 {
 
 ; X64-DFlyBSD:       leaq -40008(%rsp), %r11
 ; X64-DFlyBSD-NEXT:  cmpq %fs:32, %r11
-; X64-DFlyBSD-NEXT:  ja      .LBB2_2
+; X64-DFlyBSD-NEXT:  jbe      .LBB2_1
 
 ; X64-DFlyBSD:       movabsq $40008, %r10
 ; X64-DFlyBSD-NEXT:  movabsq $0, %r11
@@ -361,7 +363,7 @@ define fastcc void @test_fastcc() #0 {
 ; X32-Linux-LABEL:       test_fastcc:
 
 ; X32-Linux:       cmpl %gs:48, %esp
-; X32-Linux-NEXT:  ja      .LBB3_2
+; X32-Linux-NEXT:  jbe	.LBB3_1
 
 ; X32-Linux:       pushl $0
 ; X32-Linux-NEXT:  pushl $44
@@ -371,7 +373,7 @@ define fastcc void @test_fastcc() #0 {
 ; X64-Linux-LABEL:       test_fastcc:
 
 ; X64-Linux:       cmpq %fs:112, %rsp
-; X64-Linux-NEXT:  ja      .LBB3_2
+; X64-Linux-NEXT:  jbe	.LBB3_1
 
 ; X64-Linux:       movabsq $40, %r10
 ; X64-Linux-NEXT:  movabsq $0, %r11
@@ -381,7 +383,7 @@ define fastcc void @test_fastcc() #0 {
 ; X32ABI-LABEL:       test_fastcc:
 
 ; X32ABI:       cmpl %fs:64, %esp
-; X32ABI-NEXT:  ja      .LBB3_2
+; X32ABI-NEXT:  jbe	.LBB3_1
 
 ; X32ABI:       movl $40, %r10d
 ; X32ABI-NEXT:  movl $0, %r11d
@@ -392,7 +394,7 @@ define fastcc void @test_fastcc() #0 {
 
 ; X32-Darwin:      movl $432, %eax
 ; X32-Darwin-NEXT: cmpl %gs:(%eax), %esp
-; X32-Darwin-NEXT: ja      LBB3_2
+; X32-Darwin-NEXT: jbe	LBB3_1
 
 ; X32-Darwin:      pushl $0
 ; X32-Darwin-NEXT: pushl $60
@@ -402,7 +404,7 @@ define fastcc void @test_fastcc() #0 {
 ; X64-Darwin-LABEL:      test_fastcc:
 
 ; X64-Darwin:      cmpq %gs:816, %rsp
-; X64-Darwin-NEXT: ja      LBB3_2
+; X64-Darwin-NEXT: jbe	LBB3_1
 
 ; X64-Darwin:      movabsq $40, %r10
 ; X64-Darwin-NEXT: movabsq $0, %r11
@@ -412,7 +414,7 @@ define fastcc void @test_fastcc() #0 {
 ; X32-MinGW-LABEL:       test_fastcc:
 
 ; X32-MinGW:       cmpl %fs:20, %esp
-; X32-MinGW-NEXT:  ja      LBB3_2
+; X32-MinGW-NEXT:  jbe      LBB3_1
 
 ; X32-MinGW:       pushl $0
 ; X32-MinGW-NEXT:  pushl $40
@@ -422,7 +424,7 @@ define fastcc void @test_fastcc() #0 {
 ; X64-MinGW-LABEL:       test_fastcc:
 
 ; X64-MinGW:       cmpq %gs:40, %rsp
-; X64-MinGW-NEXT:  ja      .LBB3_2
+; X64-MinGW-NEXT:  jbe      .LBB3_1
 
 ; X64-MinGW:       movabsq $72, %r10
 ; X64-MinGW-NEXT:  movabsq $32, %r11
@@ -432,7 +434,7 @@ define fastcc void @test_fastcc() #0 {
 ; X64-FreeBSD-LABEL:       test_fastcc:
 
 ; X64-FreeBSD:       cmpq %fs:24, %rsp
-; X64-FreeBSD-NEXT:  ja      .LBB3_2
+; X64-FreeBSD-NEXT:  jbe    .LBB3_1
 
 ; X64-FreeBSD:       movabsq $40, %r10
 ; X64-FreeBSD-NEXT:  movabsq $0, %r11
@@ -442,7 +444,7 @@ define fastcc void @test_fastcc() #0 {
 ; X32-DFlyBSD-LABEL:       test_fastcc:
 
 ; X32-DFlyBSD:       cmpl %fs:16, %esp
-; X32-DFlyBSD-NEXT:  ja      .LBB3_2
+; X32-DFlyBSD-NEXT:  jbe     .LBB3_1
 
 ; X32-DFlyBSD:       pushl $0
 ; X32-DFlyBSD-NEXT:  pushl $40
@@ -452,7 +454,7 @@ define fastcc void @test_fastcc() #0 {
 ; X64-DFlyBSD-LABEL:       test_fastcc:
 
 ; X64-DFlyBSD:       cmpq %fs:32, %rsp
-; X64-DFlyBSD-NEXT:  ja      .LBB3_2
+; X64-DFlyBSD-NEXT:  jbe      .LBB3_1
 
 ; X64-DFlyBSD:       movabsq $40, %r10
 ; X64-DFlyBSD-NEXT:  movabsq $0, %r11
@@ -463,14 +465,14 @@ define fastcc void @test_fastcc() #0 {
 
 define fastcc void @test_fastcc_large() #0 {
         %mem = alloca i32, i32 10000
-        call void @dummy_use (i32* %mem, i32 0)
+        call void @dummy_use (i32* %mem, i32 3)
         ret void
 
 ; X32-Linux-LABEL:       test_fastcc_large:
 
 ; X32-Linux:       leal -40012(%esp), %eax
 ; X32-Linux-NEXT:  cmpl %gs:48, %eax
-; X32-Linux-NEXT:  ja      .LBB4_2
+; X32-Linux-NEXT:  jbe	.LBB4_1
 
 ; X32-Linux:       pushl $0
 ; X32-Linux-NEXT:  pushl $40012
@@ -481,7 +483,7 @@ define fastcc void @test_fastcc_large() #0 {
 
 ; X64-Linux:       leaq -40008(%rsp), %r11
 ; X64-Linux-NEXT:  cmpq %fs:112, %r11
-; X64-Linux-NEXT:  ja      .LBB4_2
+; X64-Linux-NEXT:  jbe	.LBB4_1
 
 ; X64-Linux:       movabsq $40008, %r10
 ; X64-Linux-NEXT:  movabsq $0, %r11
@@ -492,7 +494,7 @@ define fastcc void @test_fastcc_large() #0 {
 
 ; X32ABI:       leal -40008(%rsp), %r11d
 ; X32ABI-NEXT:  cmpl %fs:64, %r11d
-; X32ABI-NEXT:  ja      .LBB4_2
+; X32ABI-NEXT:  jbe	.LBB4_1
 
 ; X32ABI:       movl $40008, %r10d
 ; X32ABI-NEXT:  movl $0, %r11d
@@ -504,7 +506,7 @@ define fastcc void @test_fastcc_large() #0 {
 ; X32-Darwin:      leal -40012(%esp), %eax
 ; X32-Darwin-NEXT: movl $432, %ecx
 ; X32-Darwin-NEXT: cmpl %gs:(%ecx), %eax
-; X32-Darwin-NEXT: ja      LBB4_2
+; X32-Darwin-NEXT: jbe	LBB4_1
 
 ; X32-Darwin:      pushl $0
 ; X32-Darwin-NEXT: pushl $40012
@@ -515,7 +517,7 @@ define fastcc void @test_fastcc_large() #0 {
 
 ; X64-Darwin:      leaq -40008(%rsp), %r11
 ; X64-Darwin-NEXT: cmpq %gs:816, %r11
-; X64-Darwin-NEXT: ja      LBB4_2
+; X64-Darwin-NEXT: jbe	LBB4_1
 
 ; X64-Darwin:      movabsq $40008, %r10
 ; X64-Darwin-NEXT: movabsq $0, %r11
@@ -526,7 +528,7 @@ define fastcc void @test_fastcc_large() #0 {
 
 ; X32-MinGW:       leal -40000(%esp), %eax
 ; X32-MinGW-NEXT:  cmpl %fs:20, %eax
-; X32-MinGW-NEXT:  ja      LBB4_2
+; X32-MinGW-NEXT:  jbe      LBB4_1
 
 ; X32-MinGW:       pushl $0
 ; X32-MinGW-NEXT:  pushl $40000
@@ -537,7 +539,7 @@ define fastcc void @test_fastcc_large() #0 {
 
 ; X64-MinGW:       leaq -40040(%rsp), %r11
 ; X64-MinGW-NEXT:  cmpq %gs:40, %r11
-; X64-MinGW-NEXT:  ja      .LBB4_2
+; X64-MinGW-NEXT:  jbe      .LBB4_1
 
 ; X64-MinGW:       movabsq $40040, %r10
 ; X64-MinGW-NEXT:  movabsq $32, %r11
@@ -548,7 +550,7 @@ define fastcc void @test_fastcc_large() #0 {
 
 ; X64-FreeBSD:       leaq -40008(%rsp), %r11
 ; X64-FreeBSD-NEXT:  cmpq %fs:24, %r11
-; X64-FreeBSD-NEXT:  ja      .LBB4_2
+; X64-FreeBSD-NEXT:  jbe     .LBB4_1
 
 ; X64-FreeBSD:       movabsq $40008, %r10
 ; X64-FreeBSD-NEXT:  movabsq $0, %r11
@@ -559,7 +561,7 @@ define fastcc void @test_fastcc_large() #0 {
 
 ; X32-DFlyBSD:       leal -40000(%esp), %eax
 ; X32-DFlyBSD-NEXT:  cmpl %fs:16, %eax
-; X32-DFlyBSD-NEXT:  ja      .LBB4_2
+; X32-DFlyBSD-NEXT:  jbe      .LBB4_1
 
 ; X32-DFlyBSD:       pushl $0
 ; X32-DFlyBSD-NEXT:  pushl $40000
@@ -570,7 +572,7 @@ define fastcc void @test_fastcc_large() #0 {
 
 ; X64-DFlyBSD:       leaq -40008(%rsp), %r11
 ; X64-DFlyBSD-NEXT:  cmpq %fs:32, %r11
-; X64-DFlyBSD-NEXT:  ja      .LBB4_2
+; X64-DFlyBSD-NEXT:  jbe      .LBB4_1
 
 ; X64-DFlyBSD:       movabsq $40008, %r10
 ; X64-DFlyBSD-NEXT:  movabsq $0, %r11
@@ -593,7 +595,7 @@ define fastcc void @test_fastcc_large_with_ecx_arg(i32 %a) #0 {
 ; X32-Darwin-NEXT: movl $432, %ecx
 ; X32-Darwin-NEXT: cmpl %gs:(%ecx), %eax
 ; X32-Darwin-NEXT: popl %ecx
-; X32-Darwin-NEXT: ja      LBB5_2
+; X32-Darwin-NEXT: jbe	LBB5_1
 
 ; X32-Darwin:      pushl $0
 ; X32-Darwin-NEXT: pushl $40012
diff --git a/test/CodeGen/X86/x86-shrink-wrap-unwind.ll b/test/CodeGen/X86/x86-shrink-wrap-unwind.ll
index f1f81da926d..75dcbfd8471 100644
--- a/test/CodeGen/X86/x86-shrink-wrap-unwind.ll
+++ b/test/CodeGen/X86/x86-shrink-wrap-unwind.ll
@@ -160,14 +160,7 @@ attributes #2 = { "no-frame-pointer-elim"="false" nounwind }
 ;
 ; CHECK-LABEL: segmentedStack:
 ; CHECK: cmpq
-; CHECK-NEXT: ja [[ENTRY_LABEL:LBB[0-9_]+]]
-;
-; CHECK: callq ___morestack
-; CHECK-NEXT: retq
-;
-; CHECK: [[ENTRY_LABEL]]:
-; Prologue
-; CHECK: push
+; CHECK-NEXT: jbe [[ENTRY_LABEL:LBB[0-9_]+]]
 ;
 ; In PR26107, we use to drop these two basic blocks, because
 ; the segmentedStack entry block was jumping directly to
@@ -186,6 +179,12 @@ attributes #2 = { "no-frame-pointer-elim"="false" nounwind }
 ;
 ; CHECK: [[STRINGS_EQUAL]]
 ; CHECK: popq
+;
+; CHECK: [[ENTRY_LABEL]]:
+; CHECK: callq ___morestack
+; CHECK-NEXT: retq
+;
+
 define zeroext i1 @segmentedStack(i8* readonly %vk1, i8* readonly %vk2, i64 %key_size) #5 {
 entry:
   %cmp.i = icmp eq i8* %vk1, null
-- 
GitLab


From 06a55fc2f4556b1e47801514ce500dc316d8e38c Mon Sep 17 00:00:00 2001
From: Mandeep Singh Grang <mgrang@codeaurora.org>
Date: Wed, 7 Nov 2018 18:26:24 +0000
Subject: [PATCH 1077/1116] [LoopSink] Do not sink instructions into non-cold
 blocks

Summary: This fixes PR39570.

Reviewers: danielcdh, rnk, bkramer

Reviewed By: rnk

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D54181

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346337 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/LoopSink.cpp       |   7 ++
 test/Transforms/LICM/loopsink-pr39570.ll | 112 +++++++++++++++++++++++
 2 files changed, 119 insertions(+)
 create mode 100644 test/Transforms/LICM/loopsink-pr39570.ll

diff --git a/lib/Transforms/Scalar/LoopSink.cpp b/lib/Transforms/Scalar/LoopSink.cpp
index db502e1c5db..ce6ecea2dc2 100644
--- a/lib/Transforms/Scalar/LoopSink.cpp
+++ b/lib/Transforms/Scalar/LoopSink.cpp
@@ -202,6 +202,13 @@ static bool sinkInstruction(Loop &L, Instruction &I,
   if (BBsToSinkInto.empty())
     return false;
 
+  // Return if any of the candidate blocks to sink into is non-cold.
+  if (BBsToSinkInto.size() > 1) {
+    for (auto *BB : BBsToSinkInto)
+      if (!LoopBlockNumber.count(BB))
+        return false;
+  }
+
   // Copy the final BBs into a vector and sort them using the total ordering
   // of the loop block numbers as iterating the set doesn't give a useful
   // order. No need to stable sort as the block numbers are a total ordering.
diff --git a/test/Transforms/LICM/loopsink-pr39570.ll b/test/Transforms/LICM/loopsink-pr39570.ll
new file mode 100644
index 00000000000..65d3e1f5139
--- /dev/null
+++ b/test/Transforms/LICM/loopsink-pr39570.ll
@@ -0,0 +1,112 @@
+; RUN: opt -S -loop-sink < %s | FileCheck %s
+
+; CHECK: pr39570
+; Make sure not to assert.
+
+%0 = type { i32, %1*, %2, %6*, %33* }
+%1 = type { i32 (...)** }
+%2 = type { %3* }
+%3 = type { %4, i32, %5* }
+%4 = type { i32 (...)**, i32 }
+%5 = type opaque
+%6 = type { %7, %1*, %31*, i8, %2, %32* }
+%7 = type <{ %8, %9*, %10, i32, %33*, %33*, %33*, %27, %28, i16 }>
+%8 = type { i32 (...)** }
+%9 = type opaque
+%10 = type { %11, %16, %18, %19 }
+%11 = type { %12*, i32, i32, %13* }
+%12 = type { i32 (...)** }
+%13 = type { %14*, %14* }
+%14 = type { %15, i32 }
+%15 = type { %12*, i32, i32, i16* }
+%16 = type { %12*, i32, i32, %17* }
+%17 = type { %13, %14* }
+%18 = type { %12*, i32, i32, %14** }
+%19 = type { %20, %21, %12*, float, i32, i32, %22, %22, %24, i32, i32 }
+%20 = type { i8 }
+%21 = type { i8 }
+%22 = type { %12*, %23*, %23* }
+%23 = type opaque
+%24 = type { %12*, i32, i32, %25* }
+%25 = type { %12*, i32, i32, %26* }
+%26 = type opaque
+%27 = type { %33* }
+%28 = type { %29, i32, i32, %14* }
+%29 = type { %30 }
+%30 = type { i32 (...)** }
+%31 = type opaque
+%32 = type { i32 (...)** }
+%33 = type <{ %8, %9*, %10, i32, %33*, %33*, %33*, %27, %28, i16, [2 x i8] }>
+
+define dso_local void @pr39570() local_unnamed_addr align 2 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) !prof !1 {
+  br i1 undef, label %8, label %1, !prof !2
+
+; <label>:1:                                      ; preds = %0
+  %2 = load %0*, %0** undef, align 4
+  br label %3
+
+; <label>:3:                                      ; preds = %7, %1
+  %4 = getelementptr inbounds %0, %0* %2, i32 undef, i32 0
+  br label %5
+
+; <label>:5:                                      ; preds = %3
+  %6 = getelementptr inbounds %0, %0* %2, i32 undef, i32 4
+  br i1 undef, label %18, label %7, !prof !3
+
+; <label>:7:                                      ; preds = %5
+  br label %3
+
+; <label>:8:                                      ; preds = %0
+  invoke void @baz()
+          to label %9 unwind label %12
+
+; <label>:9:                                      ; preds = %8
+  invoke void @bar()
+          to label %17 unwind label %10
+
+; <label>:10:                                     ; preds = %9
+  %11 = landingpad { i8*, i32 }
+          catch i8* null
+  unreachable
+
+; <label>:12:                                     ; preds = %8
+  %13 = landingpad { i8*, i32 }
+          cleanup
+  invoke void @bar()
+          to label %16 unwind label %14
+
+; <label>:14:                                     ; preds = %12
+  %15 = landingpad { i8*, i32 }
+          catch i8* null
+  unreachable
+
+; <label>:16:                                     ; preds = %12
+  resume { i8*, i32 } %13
+
+; <label>:17:                                     ; preds = %9
+  br label %18
+
+; <label>:18:                                     ; preds = %17, %5
+  invoke void @baz()
+          to label %19 unwind label %20
+
+; <label>:19:                                     ; preds = %18
+  invoke void @bar()
+          to label %22 unwind label %20
+
+; <label>:20:                                     ; preds = %19
+  %21 = landingpad { i8*, i32 }
+          catch i8* null
+  unreachable
+
+; <label>:22:                                     ; preds = %19
+  ret void
+}
+
+declare dso_local i32 @__gxx_personality_v0(...)
+declare dso_local void @bar() local_unnamed_addr
+declare dso_local void @baz() local_unnamed_addr align 2
+
+!1 = !{!"function_entry_count", i64 0}
+!2 = !{!"branch_weights", i32 1, i32 3215551}
+!3 = !{!"branch_weights", i32 3215551, i32 1}
-- 
GitLab


From fc6b6a704eea68966e85f70583cb4a78c74a4416 Mon Sep 17 00:00:00 2001
From: Jessica Paquette <jpaquette@apple.com>
Date: Wed, 7 Nov 2018 18:36:43 +0000
Subject: [PATCH 1078/1116] [MachineOutliner] Don't store outlined function
 numberings on OutlinedFunction

NFC-ish. This doesn't change the behaviour of the outliner, but does make sure
that you won't end up with say

OUTLINED_FUNCTION_2:
...
ret

OUTLINED_FUNCTION_248:
...
ret

as the only outlined functions in your module. Those should really be

OUTLINED_FUNCTION_0:
...
ret

OUTLINED_FUNCTION_1:
...
ret

If we produce outlined functions, they probably should have sequential numbers
attached to them. This makes it a bit easier+stable to write outliner tests.

The point of this is to move towards a bit more stability in outlined function
names. By doing this, we at least don't rely on the traversal order of the
suffix tree. Instead, we rely on the order of the candidate list, which is
*far* more consistent. The candidate list is ordered by the end indices of
candidates, so we're more likely to get a stable ordering. This is still
susceptible to changes in the cost model though (like, if we suddenly find new
candidates, for example).

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346340 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/MachineOutliner.h |  3 ---
 lib/CodeGen/MachineOutliner.cpp        | 18 +++++++++++++-----
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/include/llvm/CodeGen/MachineOutliner.h b/include/llvm/CodeGen/MachineOutliner.h
index 95bfc24b57f..eaa741353ab 100644
--- a/include/llvm/CodeGen/MachineOutliner.h
+++ b/include/llvm/CodeGen/MachineOutliner.h
@@ -169,9 +169,6 @@ public:
   /// This is initialized after we go through and create the actual function.
   MachineFunction *MF = nullptr;
 
-  /// A number assigned to this function which appears at the end of its name.
-  unsigned Name;
-
   /// The sequence of integers corresponding to the instructions in this
   /// function.
   std::vector<unsigned> Sequence;
diff --git a/lib/CodeGen/MachineOutliner.cpp b/lib/CodeGen/MachineOutliner.cpp
index bccf9523312..1b2b448ebed 100644
--- a/lib/CodeGen/MachineOutliner.cpp
+++ b/lib/CodeGen/MachineOutliner.cpp
@@ -801,7 +801,8 @@ struct MachineOutliner : public ModulePass {
 
   /// Creates a function for \p OF and inserts it into the module.
   MachineFunction *createOutlinedFunction(Module &M, const OutlinedFunction &OF,
-                                          InstructionMapper &Mapper);
+                                          InstructionMapper &Mapper,
+                                          unsigned Name);
 
   /// Find potential outlining candidates and store them in \p CandidateList.
   ///
@@ -1035,7 +1036,6 @@ unsigned MachineOutliner::findCandidates(
     for (unsigned i = StartIdx; i < StartIdx + StringLen; i++)
       Seq.push_back(ST.Str[i]);
     OF.Sequence = Seq;
-    OF.Name = FunctionList.size();
 
     // Is it better to outline this candidate than not?
     if (OF.getBenefit() < 1) {
@@ -1190,13 +1190,16 @@ unsigned MachineOutliner::buildCandidateList(
 
 MachineFunction *
 MachineOutliner::createOutlinedFunction(Module &M, const OutlinedFunction &OF,
-                                        InstructionMapper &Mapper) {
+                                        InstructionMapper &Mapper,
+                                        unsigned Name) {
 
   // Create the function name. This should be unique. For now, just hash the
   // module name and include it in the function name plus the number of this
   // function.
   std::ostringstream NameStream;
-  NameStream << "OUTLINED_FUNCTION_" << OF.Name;
+  // FIXME: We should have a better naming scheme. This should be stable,
+  // regardless of changes to the outliner's cost model/traversal order.
+  NameStream << "OUTLINED_FUNCTION_" << Name;
 
   // Create the function using an IR-level function.
   LLVMContext &C = M.getContext();
@@ -1295,6 +1298,10 @@ bool MachineOutliner::outline(
     std::vector<OutlinedFunction> &FunctionList, InstructionMapper &Mapper) {
 
   bool OutlinedSomething = false;
+
+  // Number to append to the current outlined function.
+  unsigned OutlinedFunctionNum = 0;
+
   // Replace the candidates with calls to their respective outlined functions.
   for (const std::shared_ptr<Candidate> &Cptr : CandidateList) {
     Candidate &C = *Cptr;
@@ -1311,9 +1318,10 @@ bool MachineOutliner::outline(
 
     // Does this candidate have a function yet?
     if (!OF.MF) {
-      OF.MF = createOutlinedFunction(M, OF, Mapper);
+      OF.MF = createOutlinedFunction(M, OF, Mapper, OutlinedFunctionNum);
       emitOutlinedFunctionRemark(OF);
       FunctionsCreated++;
+      OutlinedFunctionNum++; // Created a function, move to the next name.
     }
 
     MachineFunction *MF = OF.MF;
-- 
GitLab


From 0bb3a2c11fb93af9f68cf0a6ae6d7e149b54385e Mon Sep 17 00:00:00 2001
From: Petr Hosek <phosek@chromium.org>
Date: Wed, 7 Nov 2018 18:36:50 +0000
Subject: [PATCH 1079/1116] [llvm-mt] Accept and ignore notify_update flag

This flag is being set by CMake when invoking mt.

Differential Revision: https://reviews.llvm.org/D54196

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346341 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-mt/Opts.td | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/llvm-mt/Opts.td b/tools/llvm-mt/Opts.td
index 6dc3eea524e..da5b2c992ee 100644
--- a/tools/llvm-mt/Opts.td
+++ b/tools/llvm-mt/Opts.td
@@ -23,6 +23,7 @@ def validate_file_hashes : Joined<["/", "-"], "validate_file_hashes:">, HelpText
 def canonicalize : Flag<["/", "-"], "canonicalize:">, HelpText<"Not supported">, Group<unsupported>;
 def check_for_duplicates : Flag<["/", "-"], "check_for_duplicates:">, HelpText<"Not supported">, Group<unsupported>;
 def make_cdfs : Flag<["/", "-"], "makecdfs:">, HelpText<"Not supported">, Group<unsupported>;
+def notify_update : Flag<["/", "-"], "notify_update">, HelpText<"Not supported">, Group<unsupported>;
 def verbose : Flag<["/", "-"], "verbose">, HelpText<"Not supported">, Group<unsupported>;
 def help : Flag<["/", "-"], "?">;
 def help_long : Flag<["/", "-"], "help">, Alias<help>;
-- 
GitLab


From 1969d2694f73f129f50ffebb1acea59558002407 Mon Sep 17 00:00:00 2001
From: Matt Davis <Matthew.Davis@sony.com>
Date: Wed, 7 Nov 2018 19:20:04 +0000
Subject: [PATCH 1080/1116] [llvm-mca] Move the AssembleInput logic into its
 own class.

Summary:
This patch introduces a CodeRegionGenerator class which is responsible for parsing some type of input and creating a 'CodeRegions' instance for use by llvm-mca.  In the future, we will also have a CodeRegionGenerator subclass for converting an input object file into CodeRegions.  For now, we only have the subclass for converting input assembly into CodeRegions.

This is mostly a NFC patch, as the logic remains close to the original, but now encapsulated in its own class and moved outside of llvm-mca.cpp.

Reviewers: andreadb, courbet, RKSimon

Reviewed By: andreadb

Subscribers: mgorny, tschuett, gbedwell, llvm-commits

Differential Revision: https://reviews.llvm.org/D54179

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346344 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-mca/CMakeLists.txt          |   1 +
 tools/llvm-mca/CodeRegion.h            |   1 +
 tools/llvm-mca/CodeRegionGenerator.cpp | 137 +++++++++++++++++++++++++
 tools/llvm-mca/CodeRegionGenerator.h   |  70 +++++++++++++
 tools/llvm-mca/llvm-mca.cpp            | 112 ++------------------
 5 files changed, 218 insertions(+), 103 deletions(-)
 create mode 100644 tools/llvm-mca/CodeRegionGenerator.cpp
 create mode 100644 tools/llvm-mca/CodeRegionGenerator.h

diff --git a/tools/llvm-mca/CMakeLists.txt b/tools/llvm-mca/CMakeLists.txt
index fead673ef69..4339d48d461 100644
--- a/tools/llvm-mca/CMakeLists.txt
+++ b/tools/llvm-mca/CMakeLists.txt
@@ -14,6 +14,7 @@ set(LLVM_LINK_COMPONENTS
 add_llvm_tool(llvm-mca
   llvm-mca.cpp
   CodeRegion.cpp
+  CodeRegionGenerator.cpp
   PipelinePrinter.cpp
   Views/DispatchStatistics.cpp
   Views/InstructionInfoView.cpp
diff --git a/tools/llvm-mca/CodeRegion.h b/tools/llvm-mca/CodeRegion.h
index 6ca2bd15128..867aa18bb4f 100644
--- a/tools/llvm-mca/CodeRegion.h
+++ b/tools/llvm-mca/CodeRegion.h
@@ -106,6 +106,7 @@ public:
   void beginRegion(llvm::StringRef Description, llvm::SMLoc Loc);
   void endRegion(llvm::SMLoc Loc);
   void addInstruction(const llvm::MCInst &Instruction);
+  llvm::SourceMgr &getSourceMgr() const { return SM; }
 
   CodeRegions(llvm::SourceMgr &S) : SM(S) {
     // Create a default region for the input code sequence.
diff --git a/tools/llvm-mca/CodeRegionGenerator.cpp b/tools/llvm-mca/CodeRegionGenerator.cpp
new file mode 100644
index 00000000000..5bd37adeeae
--- /dev/null
+++ b/tools/llvm-mca/CodeRegionGenerator.cpp
@@ -0,0 +1,137 @@
+//===----------------------- CodeRegionGenerator.cpp ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines classes responsible for generating llvm-mca
+/// CodeRegions from various types of input. llvm-mca only analyzes CodeRegions,
+/// so the classes here provide the input-to-CodeRegions translation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CodeRegionGenerator.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/SMLoc.h"
+#include <memory>
+
+namespace llvm {
+namespace mca {
+
+// This virtual dtor serves as the anchor for the CodeRegionGenerator class.
+CodeRegionGenerator::~CodeRegionGenerator() {}
+
+// A comment consumer that parses strings.  The only valid tokens are strings.
+class MCACommentConsumer : public AsmCommentConsumer {
+public:
+  CodeRegions &Regions;
+
+  MCACommentConsumer(CodeRegions &R) : Regions(R) {}
+  void HandleComment(SMLoc Loc, StringRef CommentText) override;
+};
+
+// This class provides the callbacks that occur when parsing input assembly.
+class MCStreamerWrapper final : public MCStreamer {
+  CodeRegions &Regions;
+
+public:
+  MCStreamerWrapper(MCContext &Context, mca::CodeRegions &R)
+      : MCStreamer(Context), Regions(R) {}
+
+  // We only want to intercept the emission of new instructions.
+  virtual void EmitInstruction(const MCInst &Inst,
+                               const MCSubtargetInfo & /* unused */,
+                               bool /* unused */) override {
+    Regions.addInstruction(Inst);
+  }
+
+  bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override {
+    return true;
+  }
+
+  void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+                        unsigned ByteAlignment) override {}
+  void EmitZerofill(MCSection *Section, MCSymbol *Symbol = nullptr,
+                    uint64_t Size = 0, unsigned ByteAlignment = 0,
+                    SMLoc Loc = SMLoc()) override {}
+  void EmitGPRel32Value(const MCExpr *Value) override {}
+  void BeginCOFFSymbolDef(const MCSymbol *Symbol) override {}
+  void EmitCOFFSymbolStorageClass(int StorageClass) override {}
+  void EmitCOFFSymbolType(int Type) override {}
+  void EndCOFFSymbolDef() override {}
+
+  ArrayRef<MCInst> GetInstructionSequence(unsigned Index) const {
+    return Regions.getInstructionSequence(Index);
+  }
+};
+
+void MCACommentConsumer::HandleComment(SMLoc Loc, StringRef CommentText) {
+  // Skip empty comments.
+  StringRef Comment(CommentText);
+  if (Comment.empty())
+    return;
+
+  // Skip spaces and tabs.
+  unsigned Position = Comment.find_first_not_of(" \t");
+  if (Position >= Comment.size())
+    // We reached the end of the comment. Bail out.
+    return;
+
+  Comment = Comment.drop_front(Position);
+  if (Comment.consume_front("LLVM-MCA-END")) {
+    Regions.endRegion(Loc);
+    return;
+  }
+
+  // Try to parse the LLVM-MCA-BEGIN comment.
+  if (!Comment.consume_front("LLVM-MCA-BEGIN"))
+    return;
+
+  // Skip spaces and tabs.
+  Position = Comment.find_first_not_of(" \t");
+  if (Position < Comment.size())
+    Comment = Comment.drop_front(Position);
+  // Use the rest of the string as a descriptor for this code snippet.
+  Regions.beginRegion(Comment, Loc);
+}
+
+Expected<const CodeRegions &> AsmCodeRegionGenerator::parseCodeRegions() {
+  MCTargetOptions Opts;
+  Opts.PreserveAsmComments = false;
+  MCStreamerWrapper Str(Ctx, Regions);
+
+  // Create a MCAsmParser and setup the lexer to recognize llvm-mca ASM
+  // comments.
+  std::unique_ptr<MCAsmParser> Parser(
+      createMCAsmParser(Regions.getSourceMgr(), Ctx, Str, MAI));
+  MCAsmLexer &Lexer = Parser->getLexer();
+  MCACommentConsumer CC(Regions);
+  Lexer.setCommentConsumer(&CC);
+
+  // Create a target-specific parser and perform the parse.
+  std::unique_ptr<MCTargetAsmParser> TAP(
+      TheTarget.createMCAsmParser(STI, *Parser, MCII, Opts));
+  if (!TAP)
+    return make_error<StringError>(
+        "This target does not support assembly parsing.",
+        inconvertibleErrorCode());
+  Parser->setTargetParser(*TAP);
+  Parser->Run(false);
+
+  // Get the assembler dialect from the input.  llvm-mca will use this as the
+  // default dialect when printing reports.
+  AssemblerDialect = Parser->getAssemblerDialect();
+  return Regions;
+}
+
+} // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/CodeRegionGenerator.h b/tools/llvm-mca/CodeRegionGenerator.h
new file mode 100644
index 00000000000..892cafb9268
--- /dev/null
+++ b/tools/llvm-mca/CodeRegionGenerator.h
@@ -0,0 +1,70 @@
+//===----------------------- CodeRegionGenerator.h --------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file declares classes responsible for generating llvm-mca
+/// CodeRegions from various types of input. llvm-mca only analyzes CodeRegions,
+/// so the classes here provide the input-to-CodeRegions translation.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_CODEREGION_GENERATOR_H
+#define LLVM_TOOLS_LLVM_MCA_CODEREGION_GENERATOR_H
+
+#include "CodeRegion.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
+#include <memory>
+
+namespace llvm {
+namespace mca {
+
+/// This class is responsible for parsing the input given to the llvm-mca
+/// driver, and converting that into a CodeRegions instance.
+class CodeRegionGenerator {
+protected:
+  CodeRegions Regions;
+  CodeRegionGenerator(const CodeRegionGenerator &) = delete;
+  CodeRegionGenerator &operator=(const CodeRegionGenerator &) = delete;
+
+public:
+  CodeRegionGenerator(SourceMgr &SM) : Regions(SM) {}
+  virtual ~CodeRegionGenerator();
+  virtual Expected<const CodeRegions &> parseCodeRegions() = 0;
+};
+
+/// This class is responsible for parsing input ASM and generating
+/// a CodeRegions instance.
+class AsmCodeRegionGenerator final : public CodeRegionGenerator {
+  const Target &TheTarget;
+  MCContext &Ctx;
+  const MCAsmInfo &MAI;
+  const MCSubtargetInfo &STI;
+  const MCInstrInfo &MCII;
+  unsigned AssemblerDialect; // This is set during parsing.
+
+public:
+  AsmCodeRegionGenerator(const Target &T, SourceMgr &SM, MCContext &C,
+                         const MCAsmInfo &A, const MCSubtargetInfo &S,
+                         const MCInstrInfo &I)
+      : CodeRegionGenerator(SM), TheTarget(T), Ctx(C), MAI(A), STI(S), MCII(I),
+        AssemblerDialect(0) {}
+
+  unsigned getAssemblerDialect() const { return AssemblerDialect; }
+  Expected<const CodeRegions &> parseCodeRegions() override;
+};
+
+} // namespace mca
+} // namespace llvm
+
+#endif // LLVM_TOOLS_LLVM_MCA_CODEREGION_GENERATOR_H
diff --git a/tools/llvm-mca/llvm-mca.cpp b/tools/llvm-mca/llvm-mca.cpp
index b3a4c495d7e..3a066f713bc 100644
--- a/tools/llvm-mca/llvm-mca.cpp
+++ b/tools/llvm-mca/llvm-mca.cpp
@@ -22,6 +22,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "CodeRegion.h"
+#include "CodeRegionGenerator.h"
 #include "PipelinePrinter.h"
 #include "Stages/FetchStage.h"
 #include "Stages/InstructionTables.h"
@@ -39,9 +40,7 @@
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCObjectFileInfo.h"
-#include "llvm/MC/MCParser/MCTargetAsmParser.h"
 #include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/MC/MCStreamer.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/ErrorOr.h"
@@ -199,59 +198,6 @@ const Target *getTarget(const char *ProgName) {
   return TheTarget;
 }
 
-// A comment consumer that parses strings.
-// The only valid tokens are strings.
-class MCACommentConsumer : public AsmCommentConsumer {
-public:
-  mca::CodeRegions &Regions;
-
-  MCACommentConsumer(mca::CodeRegions &R) : Regions(R) {}
-  void HandleComment(SMLoc Loc, StringRef CommentText) override {
-    // Skip empty comments.
-    StringRef Comment(CommentText);
-    if (Comment.empty())
-      return;
-
-    // Skip spaces and tabs
-    unsigned Position = Comment.find_first_not_of(" \t");
-    if (Position >= Comment.size())
-      // We reached the end of the comment. Bail out.
-      return;
-
-    Comment = Comment.drop_front(Position);
-    if (Comment.consume_front("LLVM-MCA-END")) {
-      Regions.endRegion(Loc);
-      return;
-    }
-
-    // Now try to parse string LLVM-MCA-BEGIN
-    if (!Comment.consume_front("LLVM-MCA-BEGIN"))
-      return;
-
-    // Skip spaces and tabs
-    Position = Comment.find_first_not_of(" \t");
-    if (Position < Comment.size())
-      Comment = Comment.drop_front(Position);
-    // Use the rest of the string as a descriptor for this code snippet.
-    Regions.beginRegion(Comment, Loc);
-  }
-};
-
-int AssembleInput(MCAsmParser &Parser, const Target *TheTarget,
-                  MCSubtargetInfo &STI, MCInstrInfo &MCII,
-                  MCTargetOptions &MCOptions) {
-  std::unique_ptr<MCTargetAsmParser> TAP(
-      TheTarget->createMCAsmParser(STI, Parser, MCII, MCOptions));
-
-  if (!TAP) {
-    WithColor::error() << "this target does not support assembly parsing.\n";
-    return 1;
-  }
-
-  Parser.setTargetParser(*TAP);
-  return Parser.Run(false);
-}
-
 ErrorOr<std::unique_ptr<ToolOutputFile>> getOutputStream() {
   if (OutputFilename == "")
     OutputFilename = "-";
@@ -262,40 +208,6 @@ ErrorOr<std::unique_ptr<ToolOutputFile>> getOutputStream() {
     return std::move(Out);
   return EC;
 }
-
-class MCStreamerWrapper final : public MCStreamer {
-  mca::CodeRegions &Regions;
-
-public:
-  MCStreamerWrapper(MCContext &Context, mca::CodeRegions &R)
-      : MCStreamer(Context), Regions(R) {}
-
-  // We only want to intercept the emission of new instructions.
-  virtual void EmitInstruction(const MCInst &Inst,
-                               const MCSubtargetInfo & /* unused */,
-                               bool /* unused */) override {
-    Regions.addInstruction(Inst);
-  }
-
-  bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override {
-    return true;
-  }
-
-  void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
-                        unsigned ByteAlignment) override {}
-  void EmitZerofill(MCSection *Section, MCSymbol *Symbol = nullptr,
-                    uint64_t Size = 0, unsigned ByteAlignment = 0,
-                    SMLoc Loc = SMLoc()) override {}
-  void EmitGPRel32Value(const MCExpr *Value) override {}
-  void BeginCOFFSymbolDef(const MCSymbol *Symbol) override {}
-  void EmitCOFFSymbolStorageClass(int StorageClass) override {}
-  void EmitCOFFSymbolType(int Type) override {}
-  void EndCOFFSymbolDef() override {}
-
-  ArrayRef<MCInst> GetInstructionSequence(unsigned Index) const {
-    return Regions.getInstructionSequence(Index);
-  }
-};
 } // end of anonymous namespace
 
 static void processOptionImpl(cl::opt<bool> &O, const cl::opt<bool> &Default) {
@@ -352,9 +264,6 @@ int main(int argc, char **argv) {
   cl::ParseCommandLineOptions(argc, argv,
                               "llvm machine code performance analyzer.\n");
 
-  MCTargetOptions MCOptions;
-  MCOptions.PreserveAsmComments = false;
-
   // Get the target from the triple. If a triple is not specified, then select
   // the default triple for the host. If the triple doesn't correspond to any
   // registered target, then exit with an error message.
@@ -394,9 +303,6 @@ int main(int argc, char **argv) {
 
   std::unique_ptr<buffer_ostream> BOS;
 
-  mca::CodeRegions Regions(SrcMgr);
-  MCStreamerWrapper Str(Ctx, Regions);
-
   std::unique_ptr<MCInstrInfo> MCII(TheTarget->createMCInstrInfo());
 
   std::unique_ptr<MCInstrAnalysis> MCIA(
@@ -429,14 +335,14 @@ int main(int argc, char **argv) {
     return 1;
   }
 
-  std::unique_ptr<MCAsmParser> P(createMCAsmParser(SrcMgr, Ctx, Str, *MAI));
-  MCAsmLexer &Lexer = P->getLexer();
-  MCACommentConsumer CC(Regions);
-  Lexer.setCommentConsumer(&CC);
-
-  if (AssembleInput(*P, TheTarget, *STI, *MCII, MCOptions))
+  // Parse the input and create CodeRegions that llvm-mca can analyze.
+  mca::AsmCodeRegionGenerator CRG(*TheTarget, SrcMgr, Ctx, *MAI, *STI, *MCII);
+  Expected<const mca::CodeRegions &> RegionsOrErr = CRG.parseCodeRegions();
+  if (auto Err = RegionsOrErr.takeError()) {
+    WithColor::error() << Err << "\n";
     return 1;
-
+  }
+  const mca::CodeRegions &Regions = *RegionsOrErr;
   if (Regions.empty()) {
     WithColor::error() << "no assembly instructions found.\n";
     return 1;
@@ -449,7 +355,7 @@ int main(int argc, char **argv) {
     return 1;
   }
 
-  unsigned AssemblerDialect = P->getAssemblerDialect();
+  unsigned AssemblerDialect = CRG.getAssemblerDialect();
   if (OutputAsmVariant >= 0)
     AssemblerDialect = static_cast<unsigned>(OutputAsmVariant);
   std::unique_ptr<MCInstPrinter> IP(TheTarget->createMCInstPrinter(
-- 
GitLab


From e8113b8c3036169d04ee31402016a0b8437e7cf3 Mon Sep 17 00:00:00 2001
From: Jessica Paquette <jpaquette@apple.com>
Date: Wed, 7 Nov 2018 19:20:55 +0000
Subject: [PATCH 1081/1116] [MachineOutliner][NFC] Traverse suffix tree using a
 RepeatedSubstring iterator

This takes the traversal methods introduced in r346269 and adapts them
into an iterator. This allows the outliner to iterate over repeated substrings
within the suffix tree directly without having to initially find all of the
substrings and then iterate over them after you've found them.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346345 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/MachineOutliner.cpp | 164 +++++++++++++++++++++-----------
 1 file changed, 111 insertions(+), 53 deletions(-)

diff --git a/lib/CodeGen/MachineOutliner.cpp b/lib/CodeGen/MachineOutliner.cpp
index 1b2b448ebed..458d949136c 100644
--- a/lib/CodeGen/MachineOutliner.cpp
+++ b/lib/CodeGen/MachineOutliner.cpp
@@ -479,44 +479,6 @@ private:
     return SuffixesToAdd;
   }
 
-  /// Helper function for findRepeatedSubstrings.
-  /// Traverses the suffix tree that finds all nodes associated with a repeated
-  /// substring. That is, all internal non-root nodes. If the given node has
-  /// more than one leaf child, store the repeated strings in Substrings.
-  void
-  findRepeatedSubstringsHelper(SuffixTreeNode &Curr,
-                               std::vector<RepeatedSubstring> &Substrings,
-                               const unsigned MinLength = 1) {
-  assert(!Curr.isLeaf() && "Visited a leaf?");
-  std::vector<SuffixTreeNode *> LeafChildren;
-  unsigned Length = Curr.ConcatLen;
-
-  for (auto &ChildPair : Curr.Children) {
-    if (!ChildPair.second->isLeaf())
-      findRepeatedSubstringsHelper(*ChildPair.second, Substrings, MinLength);
-    else if (Length >= MinLength)
-      LeafChildren.push_back(ChildPair.second);
-  }
-
-  // The root node never has repeats. Quit here.
-  if (Curr.isRoot())
-    return;
-
-  // If there are no occurrences of the minimum length, then quit.
-  if (LeafChildren.empty() || LeafChildren.size() < 2)
-    return;
-
-  // We have a node associated with a repeated substring. Store that in
-  // Substrings and move on.
-  RepeatedSubstring RS;
-  RS.Length = Length;
-
-  // Each occurrence starts at a suffix given by a leaf child.
-  for (SuffixTreeNode *Leaf : LeafChildren)
-    RS.StartIndices.push_back(Leaf->SuffixIdx);
-  Substrings.push_back(RS);
-}
-
 public:
   /// Construct a suffix tree from a sequence of unsigned integers.
   ///
@@ -545,14 +507,115 @@ public:
     setSuffixIndices(*Root, 0);
   }
 
-  /// Finds all repeated substrings with an optionally-provided minimum length
-  /// and stores them in \p Substrings.
-  /// If \p MinLength is provided, only return those with a given minimum
-  /// length.
-  void findRepeatedSubstrings(std::vector<RepeatedSubstring> &Substrings,
-                              const unsigned MinLength = 1) {
-    findRepeatedSubstringsHelper(*Root, Substrings, MinLength);
-  }
+
+  /// Iterator for finding all repeated substrings in the suffix tree.
+  struct RepeatedSubstringIterator {
+    private:
+    /// The current node we're visiting.
+    SuffixTreeNode *N = nullptr;
+
+    /// The repeated substring associated with this node.
+    RepeatedSubstring RS;
+
+    /// The nodes left to visit.
+    std::vector<SuffixTreeNode *> ToVisit;
+
+    /// The minimum length of a repeated substring to find.
+    /// Since we're outlining, we want at least two instructions in the range.
+    /// FIXME: This may not be true for targets like X86 which support many
+    /// instruction lengths.
+    const unsigned MinLength = 2;
+
+    /// Move the iterator to the next repeated substring.
+    void advance() {
+      // Clear the current state. If we're at the end of the range, then this
+      // is the state we want to be in.
+      RS = RepeatedSubstring();
+      N = nullptr;
+
+      // Continue visiting nodes until we find one which repeats more than once.
+      while (!ToVisit.empty()) {
+        SuffixTreeNode *Curr = ToVisit.back();
+        ToVisit.pop_back();
+
+        // Keep track of the length of the string associated with the node. If
+        // it's too short, we'll quit.
+        unsigned Length = Curr->ConcatLen;
+
+        // Each leaf node represents a repeat of a string.
+        std::vector<SuffixTreeNode *> LeafChildren;
+
+        // Iterate over each child, saving internal nodes for visiting, and
+        // leaf nodes in LeafChildren. Internal nodes represent individual
+        // strings, which may repeat.
+        for (auto &ChildPair : Curr->Children) {
+          // Save all of this node's children for processing.
+          if (!ChildPair.second->isLeaf())
+            ToVisit.push_back(ChildPair.second);
+
+          // It's not an internal node, so it must be a leaf. If we have a
+          // long enough string, then save the leaf children.
+          else if (Length >= MinLength)
+            LeafChildren.push_back(ChildPair.second);
+        }
+
+        // The root never represents a repeated substring. If we're looking at
+        // that, then skip it.
+        if (Curr->isRoot())
+          continue;
+
+        // Do we have any repeated substrings?
+        if (LeafChildren.size() >= 2) {
+          // Yes. Update the state to reflect this, and then bail out.
+          N = Curr;
+          RS.Length = Length;
+          for (SuffixTreeNode *Leaf : LeafChildren)
+            RS.StartIndices.push_back(Leaf->SuffixIdx);
+          break;
+        }
+      }
+
+      // At this point, either NewRS is an empty RepeatedSubstring, or it was
+      // set in the above loop. Similarly, N is either nullptr, or the node
+      // associated with NewRS.
+    }
+
+  public:
+    /// Return the current repeated substring.
+    RepeatedSubstring &operator*() { return RS; }
+
+    RepeatedSubstringIterator &operator++() {
+      advance();
+      return *this;
+    }
+
+    RepeatedSubstringIterator operator++(int I) {
+      RepeatedSubstringIterator It(*this);
+      advance();
+      return It;
+    }
+
+    bool operator==(const RepeatedSubstringIterator &Other) {
+      return N == Other.N;
+    }
+    bool operator!=(const RepeatedSubstringIterator &Other) {
+      return !(*this == Other);
+    }
+
+    RepeatedSubstringIterator(SuffixTreeNode *N) : N(N) {
+      // Do we have a non-null node?
+      if (N) {
+        // Yes. At the first step, we need to visit all of N's children.
+        // Note: This means that we visit N last.
+        ToVisit.push_back(N);
+        advance();
+      }
+    }
+};
+
+  typedef RepeatedSubstringIterator iterator;
+  iterator begin() { return iterator(Root); }
+  iterator end() { return iterator(nullptr); }
 };
 
 /// Maps \p MachineInstrs to unsigned integers and stores the mappings.
@@ -963,13 +1026,8 @@ unsigned MachineOutliner::findCandidates(
 
   // First, find dall of the repeated substrings in the tree of minimum length
   // 2.
-  // FIXME: 2 is an approximation which isn't necessarily true for, say, X86.
-  // If we factor in instruction lengths, we need more information than this.
-  // FIXME: It'd be nice if we could just have a repeated substring iterator.
-  std::vector<SuffixTree::RepeatedSubstring> RepeatedSubstrings;
-  ST.findRepeatedSubstrings(RepeatedSubstrings, 2);
-
-  for (SuffixTree::RepeatedSubstring &RS : RepeatedSubstrings) {
+  for (auto It = ST.begin(), Et = ST.end(); It != Et; ++It) {
+    SuffixTree::RepeatedSubstring RS = *It;
     std::vector<Candidate> CandidatesForRepeatedSeq;
     unsigned StringLen = RS.Length;
     for (const unsigned &StartIdx : RS.StartIndices) {
-- 
GitLab


From 8d33018536a1efe881a1ef16eae4c0b1c66d0902 Mon Sep 17 00:00:00 2001
From: Martin Elshuber <martin.elshuber@theobroma-systems.com>
Date: Wed, 7 Nov 2018 19:35:04 +0000
Subject: [PATCH 1082/1116] [Support] Fix line width to 80

Test commit


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346348 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Support/ErrorHandling.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/llvm/Support/ErrorHandling.h b/include/llvm/Support/ErrorHandling.h
index 39cbfed2436..fec39e59a71 100644
--- a/include/llvm/Support/ErrorHandling.h
+++ b/include/llvm/Support/ErrorHandling.h
@@ -112,8 +112,8 @@ void install_out_of_memory_new_handler();
 /// in the unwind chain.
 ///
 /// If no error handler is installed (default), then a bad_alloc exception
-/// is thrown, if LLVM is compiled with exception support, otherwise an assertion
-/// is called.
+/// is thrown, if LLVM is compiled with exception support, otherwise an
+/// assertion is called.
 void report_bad_alloc_error(const char *Reason, bool GenCrashDiag = true);
 
 /// This function calls abort(), and prints the optional message to stderr.
-- 
GitLab


From 24b118cefd35887c024042b75fc2d91af7533ec4 Mon Sep 17 00:00:00 2001
From: Jessica Paquette <jpaquette@apple.com>
Date: Wed, 7 Nov 2018 19:56:13 +0000
Subject: [PATCH 1083/1116] [MachineOutliner][NFC] Remove Parent field from
 SuffixTreeNode

This is only used for calculating ConcatLen. This isn't necessary,
since it's easily derived from the traversal setting suffix indices.

Remove that. Rename CurrIdx to CurrNodeLen to better describe what's
going on.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346349 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/MachineOutliner.cpp | 42 +++++++++++----------------------
 1 file changed, 14 insertions(+), 28 deletions(-)

diff --git a/lib/CodeGen/MachineOutliner.cpp b/lib/CodeGen/MachineOutliner.cpp
index 458d949136c..56b3fe202f0 100644
--- a/lib/CodeGen/MachineOutliner.cpp
+++ b/lib/CodeGen/MachineOutliner.cpp
@@ -164,9 +164,6 @@ struct SuffixTreeNode {
   /// construction algorithm O(N^2) rather than O(N).
   SuffixTreeNode *Link = nullptr;
 
-  /// The parent of this node. Every node except for the root has a parent.
-  SuffixTreeNode *Parent = nullptr;
-
   /// The length of the string formed by concatenating the edge labels from the
   /// root to this node.
   unsigned ConcatLen = 0;
@@ -191,9 +188,8 @@ struct SuffixTreeNode {
     return *EndIdx - StartIdx + 1;
   }
 
-  SuffixTreeNode(unsigned StartIdx, unsigned *EndIdx, SuffixTreeNode *Link,
-                 SuffixTreeNode *Parent)
-      : StartIdx(StartIdx), EndIdx(EndIdx), Link(Link), Parent(Parent) {}
+  SuffixTreeNode(unsigned StartIdx, unsigned *EndIdx, SuffixTreeNode *Link)
+      : StartIdx(StartIdx), EndIdx(EndIdx), Link(Link) {}
 
   SuffixTreeNode() {}
 };
@@ -286,7 +282,7 @@ private:
     assert(StartIdx <= LeafEndIdx && "String can't start after it ends!");
 
     SuffixTreeNode *N = new (NodeAllocator.Allocate())
-        SuffixTreeNode(StartIdx, &LeafEndIdx, nullptr, &Parent);
+        SuffixTreeNode(StartIdx, &LeafEndIdx, nullptr);
     Parent.Children[Edge] = N;
 
     return N;
@@ -309,7 +305,7 @@ private:
 
     unsigned *E = new (InternalEndIdxAllocator) unsigned(EndIdx);
     SuffixTreeNode *N = new (NodeAllocator.Allocate())
-        SuffixTreeNode(StartIdx, E, Root, Parent);
+        SuffixTreeNode(StartIdx, E, Root);
     if (Parent)
       Parent->Children[Edge] = N;
 
@@ -320,33 +316,24 @@ private:
   /// respective suffixes.
   ///
   /// \param[in] CurrNode The node currently being visited.
-  /// \param CurrIdx The current index of the string being visited.
-  void setSuffixIndices(SuffixTreeNode &CurrNode, unsigned CurrIdx) {
+  /// \param CurrNodeLen The concatenation of all node sizes from the root to
+  /// this node. Used to produce suffix indices.
+  void setSuffixIndices(SuffixTreeNode &CurrNode, unsigned CurrNodeLen) {
 
     bool IsLeaf = CurrNode.Children.size() == 0 && !CurrNode.isRoot();
 
-    // Store the length of the concatenation of all strings from the root to
-    // this node.
-    if (!CurrNode.isRoot()) {
-      if (CurrNode.ConcatLen == 0)
-        CurrNode.ConcatLen = CurrNode.size();
-
-      if (CurrNode.Parent)
-        CurrNode.ConcatLen += CurrNode.Parent->ConcatLen;
-    }
-
+    // Store the concatenation of lengths down from the root.
+    CurrNode.ConcatLen = CurrNodeLen;
     // Traverse the tree depth-first.
     for (auto &ChildPair : CurrNode.Children) {
       assert(ChildPair.second && "Node had a null child!");
-      setSuffixIndices(*ChildPair.second, CurrIdx + ChildPair.second->size());
+      setSuffixIndices(*ChildPair.second,
+                       CurrNodeLen + ChildPair.second->size());
     }
 
-    // Is this node a leaf?
-    if (IsLeaf) {
-      // If yes, give it a suffix index and bump its parent's occurrence count.
-      CurrNode.SuffixIdx = Str.size() - CurrIdx;
-      assert(CurrNode.Parent && "CurrNode had no parent!");
-    }
+    // Is this node a leaf? If it is, give it a suffix index.
+    if (IsLeaf)
+      CurrNode.SuffixIdx = Str.size() - CurrNodeLen;
   }
 
   /// Construct the suffix tree for the prefix of the input ending at
@@ -451,7 +438,6 @@ private:
         // Make the old node a child of the split node and update its start
         // index. This is the node n from the diagram.
         NextNode->StartIdx += Active.Len;
-        NextNode->Parent = SplitNode;
         SplitNode->Children[Str[NextNode->StartIdx]] = NextNode;
 
         // SplitNode is an internal node, update the suffix link.
-- 
GitLab


From 3a7cf3b32fb1dab03835b36cbc5c9adc07c6bab7 Mon Sep 17 00:00:00 2001
From: Fedor Sergeev <fedor.sergeev@azul.com>
Date: Wed, 7 Nov 2018 20:05:11 +0000
Subject: [PATCH 1084/1116] [SimpleLoopUnswitch] partial unswitch needs to be
 careful when replacing invariants with constants

When partial unswitch operates on multiple conditions at once, .e.g:
   if (Cond1 || Cond2 || NonInv) ...

it should infer (and replace) values for individual conditions only on one
side of unswitch and not another.

More precisely only these derivations hold true:
   (Cond1 || Cond2) == false  =>  Cond1 == Cond2 == false
   (Cond1 && Cond2) == true   =>  Cond1 == Cond2 == true

By the way we organize unswitching it means only replacing on "continue" blocks
and never on "unswitched" ones. Since trivial unswitch does not have "unswitched"
blocks it does not have this problem.

Fixes PR 39568.

Reviewers: chandlerc, asbirlea
Differential Revision: https://reviews.llvm.org/D54211

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346350 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/SimpleLoopUnswitch.cpp  |  15 ++-
 .../SimpleLoopUnswitch/nontrivial-unswitch.ll | 109 ++++++++++++++++--
 2 files changed, 112 insertions(+), 12 deletions(-)

diff --git a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 81fba5d15ee..368f0925aba 100644
--- a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -2044,6 +2044,18 @@ static void unswitchNontrivialInvariants(
     assert(UnswitchedSuccBBs.size() == 1 &&
            "Only one possible unswitched block for a branch!");
     BasicBlock *ClonedPH = ClonedPHs.begin()->second;
+
+    // When considering multiple partially-unswitched invariants
+    // we cant just go replace them with constants in both branches.
+    //
+    // For 'AND' we infer that true branch ("continue") means true
+    // for each invariant operand.
+    // For 'OR' we can infer that false branch ("continue") means false
+    // for each invariant operand.
+    // So it happens that for multiple-partial case we dont replace
+    // in the unswitched branch.
+    bool ReplaceUnswitched = FullUnswitch || (Invariants.size() == 1);
+
     ConstantInt *UnswitchedReplacement =
         Direction ? ConstantInt::getTrue(BI->getContext())
                   : ConstantInt::getFalse(BI->getContext());
@@ -2063,7 +2075,8 @@ static void unswitchNontrivialInvariants(
         // unswitched if in the cloned blocks.
         if (DT.dominates(LoopPH, UserI->getParent()))
           U->set(ContinueReplacement);
-        else if (DT.dominates(ClonedPH, UserI->getParent()))
+        else if (ReplaceUnswitched &&
+                 DT.dominates(ClonedPH, UserI->getParent()))
           U->set(UnswitchedReplacement);
       }
   }
diff --git a/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch.ll b/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch.ll
index fc8cd5be25c..367d6fe28e9 100644
--- a/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch.ll
+++ b/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch.ll
@@ -2796,10 +2796,10 @@ loop_begin:
 ; CHECK:       loop_begin.us:
 ; CHECK-NEXT:    %[[V1_US:.*]] = load i1, i1* %ptr1
 ; CHECK-NEXT:    %[[V2_US:.*]] = load i1, i1* %ptr2
-; CHECK-NEXT:    %[[AND1_US:.*]] = and i1 %[[V1_US]], false
+; CHECK-NEXT:    %[[AND1_US:.*]] = and i1 %[[V1_US]], %cond1
 ; CHECK-NEXT:    %[[OR1_US:.*]] = or i1 %[[V2_US]], %cond2
 ; CHECK-NEXT:    %[[AND2_US:.*]] = and i1 %[[AND1_US]], %[[OR1_US]]
-; CHECK-NEXT:    %[[AND3_US:.*]] = and i1 %[[AND2_US]], false
+; CHECK-NEXT:    %[[AND3_US:.*]] = and i1 %[[AND2_US]], %cond3
 ; CHECK-NEXT:    br label %loop_b.us
 ;
 ; CHECK:       loop_b.us:
@@ -2857,12 +2857,99 @@ loop_exit:
 ; CHECK-NEXT:    ret
 }
 
-; Non-trivial unswitching of a switch.
-define i32 @test27(i1* %ptr, i32 %cond) {
+; Non-trivial partial loop unswitching of multiple invariant inputs to an `or`
+; chain. Basically an inverted version of corresponding `and` test (test26).
+define i32 @test27(i1* %ptr1, i1* %ptr2, i1* %ptr3, i1 %cond1, i1 %cond2, i1 %cond3) {
 ; CHECK-LABEL: @test27(
 entry:
   br label %loop_begin
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %[[INV_OR:.*]] = or i1 %cond3, %cond1
+; CHECK-NEXT:    br i1 %[[INV_OR]], label %entry.split.us, label %entry.split
+
+loop_begin:
+  %v1 = load i1, i1* %ptr1
+  %v2 = load i1, i1* %ptr2
+  %cond_or1 = or i1 %v1, %cond1
+  %cond_and1 = and i1 %v2, %cond2
+  %cond_or2 = or i1 %cond_or1, %cond_and1
+  %cond_or3 = or i1 %cond_or2, %cond3
+  br i1 %cond_or3, label %loop_b, label %loop_a
+; The 'loop_b' unswitched loop.
+;
+; CHECK:       entry.split.us:
+; CHECK-NEXT:    br label %loop_begin.us
+;
+; CHECK:       loop_begin.us:
+; CHECK-NEXT:    %[[V1_US:.*]] = load i1, i1* %ptr1
+; CHECK-NEXT:    %[[V2_US:.*]] = load i1, i1* %ptr2
+; CHECK-NEXT:    %[[OR1_US:.*]] = or i1 %[[V1_US]], %cond1
+; CHECK-NEXT:    %[[AND1_US:.*]] = and i1 %[[V2_US]], %cond2
+; CHECK-NEXT:    %[[OR2_US:.*]] = or i1 %[[OR1_US]], %[[AND1_US]]
+; CHECK-NEXT:    %[[OR3_US:.*]] = or i1 %[[OR2_US]], %cond3
+; CHECK-NEXT:    br label %loop_b.us
+;
+; CHECK:       loop_b.us:
+; CHECK-NEXT:    call i32 @b()
+; CHECK-NEXT:    br label %latch.us
+;
+; CHECK:       latch.us:
+; CHECK-NEXT:    %[[V3_US:.*]] = load i1, i1* %ptr3
+; CHECK-NEXT:    br i1 %[[V3_US]], label %loop_begin.us, label %loop_exit.split.us
+;
+; CHECK:       loop_exit.split.us:
+; CHECK-NEXT:    br label %loop_exit
+
+; The original loop.
+;
+; CHECK:       entry.split:
+; CHECK-NEXT:    br label %loop_begin
+;
+; CHECK:       loop_begin:
+; CHECK-NEXT:    %[[V1:.*]] = load i1, i1* %ptr1
+; CHECK-NEXT:    %[[V2:.*]] = load i1, i1* %ptr2
+; CHECK-NEXT:    %[[OR1:.*]] = or i1 %[[V1]], false
+; CHECK-NEXT:    %[[AND1:.*]] = and i1 %[[V2]], %cond2
+; CHECK-NEXT:    %[[OR2:.*]] = or i1 %[[OR1]], %[[AND1]]
+; CHECK-NEXT:    %[[OR3:.*]] = or i1 %[[OR2]], false
+; CHECK-NEXT:    br i1 %[[OR3]], label %loop_b, label %loop_a
+
+loop_a:
+  call i32 @a()
+  br label %latch
+; CHECK:       loop_a:
+; CHECK-NEXT:    call i32 @a()
+; CHECK-NEXT:    br label %latch
+
+loop_b:
+  call i32 @b()
+  br label %latch
+; CHECK:       loop_b:
+; CHECK-NEXT:    call i32 @b()
+; CHECK-NEXT:    br label %latch
+
+latch:
+  %v3 = load i1, i1* %ptr3
+  br i1 %v3, label %loop_begin, label %loop_exit
+; CHECK:       latch:
+; CHECK-NEXT:    %[[V3:.*]] = load i1, i1* %ptr3
+; CHECK-NEXT:    br i1 %[[V3]], label %loop_begin, label %loop_exit.split
+
+loop_exit:
+  ret i32 0
+; CHECK:       loop_exit.split:
+; CHECK-NEXT:    br label %loop_exit
+;
+; CHECK:       loop_exit:
+; CHECK-NEXT:    ret
+}
+
+; Non-trivial unswitching of a switch.
+define i32 @test28(i1* %ptr, i32 %cond) {
+; CHECK-LABEL: @test28(
+entry:
+  br label %loop_begin
+; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    switch i32 %cond, label %[[ENTRY_SPLIT_LATCH:.*]] [
 ; CHECK-NEXT:      i32 0, label %[[ENTRY_SPLIT_A:.*]]
 ; CHECK-NEXT:      i32 1, label %[[ENTRY_SPLIT_B:.*]]
@@ -2970,8 +3057,8 @@ loop_exit:
 ; can introduce multiple edges to successors. These need lots of special case
 ; handling as they get collapsed in many cases (domtree, the unswitch itself)
 ; but not in all cases (the PHI node operands).
-define i32 @test28(i32 %arg) {
-; CHECK-LABEL: @test28(
+define i32 @test29(i32 %arg) {
+; CHECK-LABEL: @test29(
 entry:
   br label %header
 ; CHECK-NEXT:  entry:
@@ -3149,12 +3236,12 @@ exit:
 ; CHECK-NEXT:    ret i32 %[[EXIT_PHI2]]
 }
 
-; Similar to @test28 but designed to have one of the duplicate edges be
+; Similar to @test29 but designed to have one of the duplicate edges be
 ; a loop exit edge as those can in some cases be special. Among other things,
 ; this includes an LCSSA phi with multiple entries despite being a dedicated
 ; exit block.
-define i32 @test29(i32 %arg) {
-; CHECK-LABEL: define i32 @test29(
+define i32 @test30(i32 %arg) {
+; CHECK-LABEL: define i32 @test30(
 entry:
   br label %header
 ; CHECK-NEXT:  entry:
@@ -3946,8 +4033,8 @@ exit:
 ; viable for unswitching the inner-most loop. This lets us check that the
 ; unswitching doesn't end up cycling infinitely even when the cycle is
 ; indirect and due to revisiting a loop after cloning.
-define void @test30(i32 %arg) {
-; CHECK-LABEL: define void @test30(
+define void @test31(i32 %arg) {
+; CHECK-LABEL: define void @test31(
 entry:
   br label %outer.header
 ; CHECK-NEXT:  entry:
-- 
GitLab


From 0cb12ca8f662a1053b1c31ab35dab26f4ec2582c Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 7 Nov 2018 20:26:42 +0000
Subject: [PATCH 1085/1116] Allow subclassing ExternalAA

This allows testing AMDGPU alias analysis like any
other alias analysis pass. This fixes the existing
test pointlessly running opt -O3 when it really
just wants to run the one analysis.

Before there was no way to test this using -aa-eval
with opt, since the default constructed pass
is run. The wrapper subclass allows the
default constructor to pass the necessary callback.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346353 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/AliasAnalysis.h        | 23 ++++++++++++++++++++
 lib/Analysis/AliasAnalysis.cpp               | 22 -------------------
 lib/Target/AMDGPU/AMDGPU.h                   |  2 ++
 lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp    |  8 +++++++
 lib/Target/AMDGPU/AMDGPUAliasAnalysis.h      | 13 +++++++++++
 lib/Target/AMDGPU/AMDGPUTargetMachine.cpp    |  8 +------
 test/CodeGen/AMDGPU/amdgpu-alias-analysis.ll |  4 ++--
 7 files changed, 49 insertions(+), 31 deletions(-)

diff --git a/include/llvm/Analysis/AliasAnalysis.h b/include/llvm/Analysis/AliasAnalysis.h
index 88a70f4fe59..2efcd9dafa1 100644
--- a/include/llvm/Analysis/AliasAnalysis.h
+++ b/include/llvm/Analysis/AliasAnalysis.h
@@ -1074,6 +1074,29 @@ public:
   void getAnalysisUsage(AnalysisUsage &AU) const override;
 };
 
+/// A wrapper pass for external alias analyses. This just squirrels away the
+/// callback used to run any analyses and register their results.
+struct ExternalAAWrapperPass : ImmutablePass {
+  using CallbackT = std::function<void(Pass &, Function &, AAResults &)>;
+
+  CallbackT CB;
+
+  static char ID;
+
+  ExternalAAWrapperPass() : ImmutablePass(ID) {
+    initializeExternalAAWrapperPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  explicit ExternalAAWrapperPass(CallbackT CB)
+      : ImmutablePass(ID), CB(std::move(CB)) {
+    initializeExternalAAWrapperPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesAll();
+  }
+};
+
 FunctionPass *createAAResultsWrapperPass();
 
 /// A wrapper pass around a callback which can be used to populate the
diff --git a/lib/Analysis/AliasAnalysis.cpp b/lib/Analysis/AliasAnalysis.cpp
index 937437791d1..8ed48390818 100644
--- a/lib/Analysis/AliasAnalysis.cpp
+++ b/lib/Analysis/AliasAnalysis.cpp
@@ -640,28 +640,6 @@ AnalysisKey AAManager::Key;
 
 namespace {
 
-/// A wrapper pass for external alias analyses. This just squirrels away the
-/// callback used to run any analyses and register their results.
-struct ExternalAAWrapperPass : ImmutablePass {
-  using CallbackT = std::function<void(Pass &, Function &, AAResults &)>;
-
-  CallbackT CB;
-
-  static char ID;
-
-  ExternalAAWrapperPass() : ImmutablePass(ID) {
-    initializeExternalAAWrapperPassPass(*PassRegistry::getPassRegistry());
-  }
-
-  explicit ExternalAAWrapperPass(CallbackT CB)
-      : ImmutablePass(ID), CB(std::move(CB)) {
-    initializeExternalAAWrapperPassPass(*PassRegistry::getPassRegistry());
-  }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.setPreservesAll();
-  }
-};
 
 } // end anonymous namespace
 
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
index 457ec9f9a95..07e5d97dff9 100644
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@@ -198,6 +198,8 @@ extern char &AMDGPUUnifyDivergentExitNodesID;
 
 ImmutablePass *createAMDGPUAAWrapperPass();
 void initializeAMDGPUAAWrapperPassPass(PassRegistry&);
+ImmutablePass *createAMDGPUExternalAAWrapperPass();
+void initializeAMDGPUExternalAAWrapperPass(PassRegistry&);
 
 void initializeAMDGPUArgumentUsageInfoPass(PassRegistry &);
 
diff --git a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
index 81df0c628a2..73709ba1364 100644
--- a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
@@ -34,14 +34,22 @@ using namespace llvm;
 
 // Register this pass...
 char AMDGPUAAWrapperPass::ID = 0;
+char AMDGPUExternalAAWrapper::ID = 0;
 
 INITIALIZE_PASS(AMDGPUAAWrapperPass, "amdgpu-aa",
                 "AMDGPU Address space based Alias Analysis", false, true)
 
+INITIALIZE_PASS(AMDGPUExternalAAWrapper, "amdgpu-aa-wrapper",
+                "AMDGPU Address space based Alias Analysis Wrapper", false, true)
+
 ImmutablePass *llvm::createAMDGPUAAWrapperPass() {
   return new AMDGPUAAWrapperPass();
 }
 
+ImmutablePass *llvm::createAMDGPUExternalAAWrapperPass() {
+  return new AMDGPUExternalAAWrapper();
+}
+
 void AMDGPUAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesAll();
 }
diff --git a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
index 9a507d004d0..d76c9fc4819 100644
--- a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
+++ b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
@@ -96,6 +96,19 @@ public:
   void getAnalysisUsage(AnalysisUsage &AU) const override;
 };
 
+// Wrapper around ExternalAAWrapperPass so that the default constructor gets the
+// callback.
+class AMDGPUExternalAAWrapper : public ExternalAAWrapperPass {
+public:
+  static char ID;
+
+  AMDGPUExternalAAWrapper() : ExternalAAWrapperPass(
+    [](Pass &P, Function &, AAResults &AAR) {
+      if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
+        AAR.addAAResult(WrapperPass->getResult());
+    }) {}
+};
+
 } // end namespace llvm
 
 #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUALIASANALYSIS_H
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 48cde90a972..403dace533a 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -192,6 +192,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
   initializeSIFormMemoryClausesPass(*PR);
   initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
   initializeAMDGPUAAWrapperPassPass(*PR);
+  initializeAMDGPUExternalAAWrapperPass(*PR);
   initializeAMDGPUUseNativeCallsPass(*PR);
   initializeAMDGPUSimplifyLibCallsPass(*PR);
   initializeAMDGPUInlinerPass(*PR);
@@ -340,13 +341,6 @@ StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
     FSAttr.getValueAsString();
 }
 
-static ImmutablePass *createAMDGPUExternalAAWrapperPass() {
-  return createExternalAAWrapperPass([](Pass &P, Function &, AAResults &AAR) {
-      if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
-        AAR.addAAResult(WrapperPass->getResult());
-      });
-}
-
 /// Predicate for Internalize pass.
 static bool mustPreserveGV(const GlobalValue &GV) {
   if (const Function *F = dyn_cast<Function>(&GV))
diff --git a/test/CodeGen/AMDGPU/amdgpu-alias-analysis.ll b/test/CodeGen/AMDGPU/amdgpu-alias-analysis.ll
index 08e7883023d..3e09618bc28 100644
--- a/test/CodeGen/AMDGPU/amdgpu-alias-analysis.ll
+++ b/test/CodeGen/AMDGPU/amdgpu-alias-analysis.ll
@@ -1,5 +1,5 @@
-; RUN: opt -mtriple=amdgcn-- -O3 -aa-eval -print-all-alias-modref-info -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt -mtriple=r600-- -O3 -aa-eval -print-all-alias-modref-info -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt -mtriple=amdgcn-- -aa-eval -amdgpu-aa -amdgpu-aa-wrapper -disable-basicaa  -print-all-alias-modref-info -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt -mtriple=r600-- -aa-eval -amdgpu-aa -amdgpu-aa-wrapper -disable-basicaa  -print-all-alias-modref-info -disable-output < %s 2>&1 | FileCheck %s
 
 ; CHECK: NoAlias:      i8 addrspace(1)* %p1, i8 addrspace(5)* %p
 
-- 
GitLab


From ba9a1e1d784d3959eba0ab73e7cd0e17bded809a Mon Sep 17 00:00:00 2001
From: Konstantin Zhuravlyov <kzhuravl_dev@outlook.com>
Date: Wed, 7 Nov 2018 20:54:16 +0000
Subject: [PATCH 1086/1116] AMDGPU/Docs: Add product names for Vega20

Differential Revision: https://reviews.llvm.org/D54178


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346354 91177308-0d34-0410-b5e6-96231b3b80d8
---
 docs/AMDGPUUsage.rst | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/docs/AMDGPUUsage.rst b/docs/AMDGPUUsage.rst
index b4cab62ccf4..03685f9e352 100644
--- a/docs/AMDGPUUsage.rst
+++ b/docs/AMDGPUUsage.rst
@@ -205,13 +205,10 @@ names from both the *Processor* and *Alternative Processor* can be used.
                                                                        .. TODO
                                                                           Add product
                                                                           names.
-     ``gfx906``                  ``amdgcn``   dGPU  - xnack            *TBA*
-                                                      [off]
+     ``gfx906``                  ``amdgcn``   dGPU  - xnack            - Radeon Instinct MI50
+                                                      [off]            - Radeon Instinct MI60
                                                       sram-ecc
                                                       [on]
-                                                                       .. TODO
-                                                                          Add product
-                                                                          names.
      ``gfx909``                  ``amdgcn``   APU   - xnack            *TBA* (Raven Ridge 2)
                                                       [on]
                                                                        .. TODO
-- 
GitLab


From 7c67a1fab15b92f9437e51e3fb91cc435d6d2268 Mon Sep 17 00:00:00 2001
From: Eli Friedman <efriedma@codeaurora.org>
Date: Wed, 7 Nov 2018 21:08:13 +0000
Subject: [PATCH 1087/1116] [ARM] Fix CPSR liveness in tMOVCCr_pseudo lowering.

The lowering was missing live-ins in certain cases, like a sequence of
multiple tMOVCCr_pseudo instructions.  This would lead to a verifier
failure, and on pre-v6 Thumb CPSR would be incorrectly clobbered.

For reasons I don't completely understand, it's hard to get a sequence
of multiple tMOVCCr_pseudo instructions; the issue only seems to show up
with 64-bit comparisons where the result is zero-extended. I added some
extra testcases in case that changes in the future. Probably some
optimization opportunities here if anyone is interested. (@test_slt_not
is the case that was getting miscompiled.)

The code to check the liveness of CPSR was stolen from
X86ISelLowering.cpp; maybe it could be refactored into common helper,
but I have no idea where to put it.

Differential Revision: https://reviews.llvm.org/D54192


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346355 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/ARM/ARMISelLowering.cpp |  44 ++++++
 test/CodeGen/ARM/wide-compares.ll  | 226 ++++++++++++++++++++++++++++-
 test/CodeGen/Thumb/select.ll       |  25 +++-
 3 files changed, 290 insertions(+), 5 deletions(-)

diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 0f68fb0287c..56d2e510cb7 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -9160,6 +9160,42 @@ ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
   return ContBB;
 }
 
+// The CPSR operand of SelectItr might be missing a kill marker
+// because there were multiple uses of CPSR, and ISel didn't know
+// which to mark. Figure out whether SelectItr should have had a
+// kill marker, and set it if it should. Returns the correct kill
+// marker value.
+static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr,
+                                   MachineBasicBlock* BB,
+                                   const TargetRegisterInfo* TRI) {
+  // Scan forward through BB for a use/def of CPSR.
+  MachineBasicBlock::iterator miI(std::next(SelectItr));
+  for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
+    const MachineInstr& mi = *miI;
+    if (mi.readsRegister(ARM::CPSR))
+      return false;
+    if (mi.definesRegister(ARM::CPSR))
+      break; // Should have kill-flag - update below.
+  }
+
+  // If we hit the end of the block, check whether CPSR is live into a
+  // successor.
+  if (miI == BB->end()) {
+    for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
+                                          sEnd = BB->succ_end();
+         sItr != sEnd; ++sItr) {
+      MachineBasicBlock* succ = *sItr;
+      if (succ->isLiveIn(ARM::CPSR))
+        return false;
+    }
+  }
+
+  // We found a def, or hit the end of the basic block and CPSR wasn't live
+  // out. SelectMI should have a kill flag on CPSR.
+  SelectItr->addRegisterKilled(ARM::CPSR, TRI);
+  return true;
+}
+
 MachineBasicBlock *
 ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                                MachineBasicBlock *BB) const {
@@ -9259,6 +9295,14 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     F->insert(It, copy0MBB);
     F->insert(It, sinkMBB);
 
+    // Check whether CPSR is live past the tMOVCCr_pseudo.
+    const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
+    if (!MI.killsRegister(ARM::CPSR) &&
+        !checkAndUpdateCPSRKill(MI, thisMBB, TRI)) {
+      copy0MBB->addLiveIn(ARM::CPSR);
+      sinkMBB->addLiveIn(ARM::CPSR);
+    }
+
     // Transfer the remainder of BB and its successor edges to sinkMBB.
     sinkMBB->splice(sinkMBB->begin(), BB,
                     std::next(MachineBasicBlock::iterator(MI)), BB->end());
diff --git a/test/CodeGen/ARM/wide-compares.ll b/test/CodeGen/ARM/wide-compares.ll
index 9b22f5fedfe..6584f0c7616 100644
--- a/test/CodeGen/ARM/wide-compares.ll
+++ b/test/CodeGen/ARM/wide-compares.ll
@@ -1,7 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=armv7-unknown-linux < %s | FileCheck --check-prefix=CHECK-ARM %s
-; RUN: llc -mtriple=thumbv6-unknown-linux < %s | FileCheck --check-prefix=CHECK-THUMB1 %s
-; RUN: llc -mtriple=thumbv7-unknown-linux < %s | FileCheck --check-prefix=CHECK-THUMB2 %s
+; RUN: llc -mtriple=armv7-unknown-linux < %s -verify-machineinstrs | FileCheck --check-prefix=CHECK-ARM %s
+; RUN: llc -mtriple=thumb-eabi < %s  -verify-machineinstrs | FileCheck --check-prefix=CHECK-THUMB1-NOMOV %s
+; RUN: llc -mtriple=thumbv6-unknown-linux < %s -verify-machineinstrs | FileCheck --check-prefix=CHECK-THUMB1 %s
+; RUN: llc -mtriple=thumbv7-unknown-linux < %s -verify-machineinstrs | FileCheck --check-prefix=CHECK-THUMB2 %s
 
 define i32 @test_slt1(i64 %a, i64 %b) {
 ; CHECK-ARM-LABEL: test_slt1:
@@ -13,6 +14,18 @@ define i32 @test_slt1(i64 %a, i64 %b) {
 ; CHECK-ARM-NEXT:    mov r0, r12
 ; CHECK-ARM-NEXT:    bx lr
 ;
+; CHECK-THUMB1-NOMOV-LABEL: test_slt1:
+; CHECK-THUMB1-NOMOV:       @ %bb.0: @ %entry
+; CHECK-THUMB1-NOMOV-NEXT:    subs r0, r0, r2
+; CHECK-THUMB1-NOMOV-NEXT:    sbcs r1, r3
+; CHECK-THUMB1-NOMOV-NEXT:    bge .LBB0_2
+; CHECK-THUMB1-NOMOV-NEXT:  @ %bb.1: @ %bb1
+; CHECK-THUMB1-NOMOV-NEXT:    movs r0, #1
+; CHECK-THUMB1-NOMOV-NEXT:    bx lr
+; CHECK-THUMB1-NOMOV-NEXT:  .LBB0_2: @ %bb2
+; CHECK-THUMB1-NOMOV-NEXT:    movs r0, #2
+; CHECK-THUMB1-NOMOV-NEXT:    bx lr
+;
 ; CHECK-THUMB1-LABEL: test_slt1:
 ; CHECK-THUMB1:       @ %bb.0: @ %entry
 ; CHECK-THUMB1-NEXT:    subs r0, r0, r2
@@ -57,6 +70,23 @@ define void @test_slt2(i64 %a, i64 %b) {
 ; CHECK-ARM-NEXT:    bl g
 ; CHECK-ARM-NEXT:    pop {r11, pc}
 ;
+; CHECK-THUMB1-NOMOV-LABEL: test_slt2:
+; CHECK-THUMB1-NOMOV:       @ %bb.0: @ %entry
+; CHECK-THUMB1-NOMOV-NEXT:    .save {r7, lr}
+; CHECK-THUMB1-NOMOV-NEXT:    push {r7, lr}
+; CHECK-THUMB1-NOMOV-NEXT:    subs r0, r0, r2
+; CHECK-THUMB1-NOMOV-NEXT:    sbcs r1, r3
+; CHECK-THUMB1-NOMOV-NEXT:    bge .LBB1_2
+; CHECK-THUMB1-NOMOV-NEXT:  @ %bb.1: @ %bb1
+; CHECK-THUMB1-NOMOV-NEXT:    bl f
+; CHECK-THUMB1-NOMOV-NEXT:    b .LBB1_3
+; CHECK-THUMB1-NOMOV-NEXT:  .LBB1_2: @ %bb2
+; CHECK-THUMB1-NOMOV-NEXT:    bl g
+; CHECK-THUMB1-NOMOV-NEXT:  .LBB1_3: @ %bb1
+; CHECK-THUMB1-NOMOV-NEXT:    pop {r7}
+; CHECK-THUMB1-NOMOV-NEXT:    pop {r0}
+; CHECK-THUMB1-NOMOV-NEXT:    bx r0
+;
 ; CHECK-THUMB1-LABEL: test_slt2:
 ; CHECK-THUMB1:       @ %bb.0: @ %entry
 ; CHECK-THUMB1-NEXT:    push {r7, lr}
@@ -95,3 +125,193 @@ bb2:
 
 declare void @f()
 declare void @g()
+
+define i64 @test_slt_select(i64 %c, i64 %d, i64 %a, i64 %b) {
+; CHECK-ARM-LABEL: test_slt_select:
+; CHECK-ARM:       @ %bb.0: @ %entry
+; CHECK-ARM-NEXT:    push {r4, r5, r6, r7, r11, lr}
+; CHECK-ARM-NEXT:    ldr r12, [sp, #32]
+; CHECK-ARM-NEXT:    mov r6, #0
+; CHECK-ARM-NEXT:    ldr lr, [sp, #24]
+; CHECK-ARM-NEXT:    ldr r7, [sp, #36]
+; CHECK-ARM-NEXT:    ldr r5, [sp, #28]
+; CHECK-ARM-NEXT:    subs r4, lr, r12
+; CHECK-ARM-NEXT:    sbcs r7, r5, r7
+; CHECK-ARM-NEXT:    movwlo r6, #1
+; CHECK-ARM-NEXT:    cmp r6, #0
+; CHECK-ARM-NEXT:    moveq r0, r2
+; CHECK-ARM-NEXT:    moveq r1, r3
+; CHECK-ARM-NEXT:    pop {r4, r5, r6, r7, r11, pc}
+;
+; CHECK-THUMB1-NOMOV-LABEL: test_slt_select:
+; CHECK-THUMB1-NOMOV:       @ %bb.0: @ %entry
+; CHECK-THUMB1-NOMOV-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-THUMB1-NOMOV-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-THUMB1-NOMOV-NEXT:    .pad #4
+; CHECK-THUMB1-NOMOV-NEXT:    sub sp, #4
+; CHECK-THUMB1-NOMOV-NEXT:    ldr r4, [sp, #36]
+; CHECK-THUMB1-NOMOV-NEXT:    ldr r5, [sp, #28]
+; CHECK-THUMB1-NOMOV-NEXT:    ldr r6, [sp, #32]
+; CHECK-THUMB1-NOMOV-NEXT:    ldr r7, [sp, #24]
+; CHECK-THUMB1-NOMOV-NEXT:    subs r6, r7, r6
+; CHECK-THUMB1-NOMOV-NEXT:    sbcs r5, r4
+; CHECK-THUMB1-NOMOV-NEXT:    blo .LBB2_2
+; CHECK-THUMB1-NOMOV-NEXT:  @ %bb.1: @ %entry
+; CHECK-THUMB1-NOMOV-NEXT:    movs r4, #0
+; CHECK-THUMB1-NOMOV-NEXT:    cmp r4, #0
+; CHECK-THUMB1-NOMOV-NEXT:    beq .LBB2_3
+; CHECK-THUMB1-NOMOV-NEXT:    b .LBB2_4
+; CHECK-THUMB1-NOMOV-NEXT:  .LBB2_2:
+; CHECK-THUMB1-NOMOV-NEXT:    movs r4, #1
+; CHECK-THUMB1-NOMOV-NEXT:    cmp r4, #0
+; CHECK-THUMB1-NOMOV-NEXT:    bne .LBB2_4
+; CHECK-THUMB1-NOMOV-NEXT:  .LBB2_3: @ %entry
+; CHECK-THUMB1-NOMOV-NEXT:    movs r0, r2
+; CHECK-THUMB1-NOMOV-NEXT:  .LBB2_4: @ %entry
+; CHECK-THUMB1-NOMOV-NEXT:    cmp r4, #0
+; CHECK-THUMB1-NOMOV-NEXT:    bne .LBB2_6
+; CHECK-THUMB1-NOMOV-NEXT:  @ %bb.5: @ %entry
+; CHECK-THUMB1-NOMOV-NEXT:    movs r1, r3
+; CHECK-THUMB1-NOMOV-NEXT:  .LBB2_6: @ %entry
+; CHECK-THUMB1-NOMOV-NEXT:    add sp, #4
+; CHECK-THUMB1-NOMOV-NEXT:    pop {r4, r5, r6, r7}
+; CHECK-THUMB1-NOMOV-NEXT:    pop {r2}
+; CHECK-THUMB1-NOMOV-NEXT:    bx r2
+;
+; CHECK-THUMB1-LABEL: test_slt_select:
+; CHECK-THUMB1:       @ %bb.0: @ %entry
+; CHECK-THUMB1-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-THUMB1-NEXT:    sub sp, #4
+; CHECK-THUMB1-NEXT:    ldr r4, [sp, #36]
+; CHECK-THUMB1-NEXT:    ldr r5, [sp, #28]
+; CHECK-THUMB1-NEXT:    ldr r6, [sp, #32]
+; CHECK-THUMB1-NEXT:    ldr r7, [sp, #24]
+; CHECK-THUMB1-NEXT:    subs r6, r7, r6
+; CHECK-THUMB1-NEXT:    sbcs r5, r4
+; CHECK-THUMB1-NEXT:    blo .LBB2_2
+; CHECK-THUMB1-NEXT:  @ %bb.1: @ %entry
+; CHECK-THUMB1-NEXT:    movs r4, #0
+; CHECK-THUMB1-NEXT:    cmp r4, #0
+; CHECK-THUMB1-NEXT:    beq .LBB2_3
+; CHECK-THUMB1-NEXT:    b .LBB2_4
+; CHECK-THUMB1-NEXT:  .LBB2_2:
+; CHECK-THUMB1-NEXT:    movs r4, #1
+; CHECK-THUMB1-NEXT:    cmp r4, #0
+; CHECK-THUMB1-NEXT:    bne .LBB2_4
+; CHECK-THUMB1-NEXT:  .LBB2_3: @ %entry
+; CHECK-THUMB1-NEXT:    mov r0, r2
+; CHECK-THUMB1-NEXT:  .LBB2_4: @ %entry
+; CHECK-THUMB1-NEXT:    cmp r4, #0
+; CHECK-THUMB1-NEXT:    beq .LBB2_6
+; CHECK-THUMB1-NEXT:  @ %bb.5: @ %entry
+; CHECK-THUMB1-NEXT:    add sp, #4
+; CHECK-THUMB1-NEXT:    pop {r4, r5, r6, r7, pc}
+; CHECK-THUMB1-NEXT:  .LBB2_6: @ %entry
+; CHECK-THUMB1-NEXT:    mov r1, r3
+; CHECK-THUMB1-NEXT:    add sp, #4
+; CHECK-THUMB1-NEXT:    pop {r4, r5, r6, r7, pc}
+;
+; CHECK-THUMB2-LABEL: test_slt_select:
+; CHECK-THUMB2:       @ %bb.0: @ %entry
+; CHECK-THUMB2-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-THUMB2-NEXT:    sub sp, #4
+; CHECK-THUMB2-NEXT:    ldrd r12, r7, [sp, #32]
+; CHECK-THUMB2-NEXT:    movs r6, #0
+; CHECK-THUMB2-NEXT:    ldrd lr, r5, [sp, #24]
+; CHECK-THUMB2-NEXT:    subs.w r4, lr, r12
+; CHECK-THUMB2-NEXT:    sbcs.w r7, r5, r7
+; CHECK-THUMB2-NEXT:    it lo
+; CHECK-THUMB2-NEXT:    movlo r6, #1
+; CHECK-THUMB2-NEXT:    cmp r6, #0
+; CHECK-THUMB2-NEXT:    itt eq
+; CHECK-THUMB2-NEXT:    moveq r0, r2
+; CHECK-THUMB2-NEXT:    moveq r1, r3
+; CHECK-THUMB2-NEXT:    add sp, #4
+; CHECK-THUMB2-NEXT:    pop {r4, r5, r6, r7, pc}
+entry:
+    %cmp = icmp ult i64 %a, %b
+    %r1 = select i1 %cmp, i64 %c, i64 %d
+    ret i64 %r1
+}
+
+define {i32, i32} @test_slt_not(i32 %c, i32 %d, i64 %a, i64 %b) {
+; CHECK-ARM-LABEL: test_slt_not:
+; CHECK-ARM:       @ %bb.0: @ %entry
+; CHECK-ARM-NEXT:    ldr r12, [sp]
+; CHECK-ARM-NEXT:    mov r1, #0
+; CHECK-ARM-NEXT:    ldr r0, [sp, #4]
+; CHECK-ARM-NEXT:    subs r2, r2, r12
+; CHECK-ARM-NEXT:    sbcs r0, r3, r0
+; CHECK-ARM-NEXT:    mov r0, #0
+; CHECK-ARM-NEXT:    movwge r1, #1
+; CHECK-ARM-NEXT:    movwlt r0, #1
+; CHECK-ARM-NEXT:    bx lr
+;
+; CHECK-THUMB1-NOMOV-LABEL: test_slt_not:
+; CHECK-THUMB1-NOMOV:       @ %bb.0: @ %entry
+; CHECK-THUMB1-NOMOV-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-THUMB1-NOMOV-NEXT:    push {r4, r5, r7, lr}
+; CHECK-THUMB1-NOMOV-NEXT:    movs r1, #1
+; CHECK-THUMB1-NOMOV-NEXT:    movs r4, #0
+; CHECK-THUMB1-NOMOV-NEXT:    ldr r0, [sp, #20]
+; CHECK-THUMB1-NOMOV-NEXT:    ldr r5, [sp, #16]
+; CHECK-THUMB1-NOMOV-NEXT:    subs r2, r2, r5
+; CHECK-THUMB1-NOMOV-NEXT:    sbcs r3, r0
+; CHECK-THUMB1-NOMOV-NEXT:    push {r1}
+; CHECK-THUMB1-NOMOV-NEXT:    pop {r0}
+; CHECK-THUMB1-NOMOV-NEXT:    blt .LBB3_2
+; CHECK-THUMB1-NOMOV-NEXT:  @ %bb.1: @ %entry
+; CHECK-THUMB1-NOMOV-NEXT:    push {r4}
+; CHECK-THUMB1-NOMOV-NEXT:    pop {r0}
+; CHECK-THUMB1-NOMOV-NEXT:  .LBB3_2: @ %entry
+; CHECK-THUMB1-NOMOV-NEXT:    bge .LBB3_4
+; CHECK-THUMB1-NOMOV-NEXT:  @ %bb.3: @ %entry
+; CHECK-THUMB1-NOMOV-NEXT:    movs r1, r4
+; CHECK-THUMB1-NOMOV-NEXT:  .LBB3_4: @ %entry
+; CHECK-THUMB1-NOMOV-NEXT:    pop {r4, r5, r7}
+; CHECK-THUMB1-NOMOV-NEXT:    pop {r2}
+; CHECK-THUMB1-NOMOV-NEXT:    bx r2
+;
+; CHECK-THUMB1-LABEL: test_slt_not:
+; CHECK-THUMB1:       @ %bb.0: @ %entry
+; CHECK-THUMB1-NEXT:    push {r4, r5, r7, lr}
+; CHECK-THUMB1-NEXT:    movs r1, #1
+; CHECK-THUMB1-NEXT:    movs r4, #0
+; CHECK-THUMB1-NEXT:    ldr r0, [sp, #20]
+; CHECK-THUMB1-NEXT:    ldr r5, [sp, #16]
+; CHECK-THUMB1-NEXT:    subs r2, r2, r5
+; CHECK-THUMB1-NEXT:    sbcs r3, r0
+; CHECK-THUMB1-NEXT:    mov r0, r1
+; CHECK-THUMB1-NEXT:    bge .LBB3_3
+; CHECK-THUMB1-NEXT:  @ %bb.1: @ %entry
+; CHECK-THUMB1-NEXT:    blt .LBB3_4
+; CHECK-THUMB1-NEXT:  .LBB3_2: @ %entry
+; CHECK-THUMB1-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-THUMB1-NEXT:  .LBB3_3: @ %entry
+; CHECK-THUMB1-NEXT:    mov r0, r4
+; CHECK-THUMB1-NEXT:    bge .LBB3_2
+; CHECK-THUMB1-NEXT:  .LBB3_4: @ %entry
+; CHECK-THUMB1-NEXT:    mov r1, r4
+; CHECK-THUMB1-NEXT:    pop {r4, r5, r7, pc}
+;
+; CHECK-THUMB2-LABEL: test_slt_not:
+; CHECK-THUMB2:       @ %bb.0: @ %entry
+; CHECK-THUMB2-NEXT:    ldr.w r12, [sp]
+; CHECK-THUMB2-NEXT:    movs r1, #0
+; CHECK-THUMB2-NEXT:    ldr r0, [sp, #4]
+; CHECK-THUMB2-NEXT:    subs.w r2, r2, r12
+; CHECK-THUMB2-NEXT:    sbcs.w r0, r3, r0
+; CHECK-THUMB2-NEXT:    mov.w r0, #0
+; CHECK-THUMB2-NEXT:    ite lt
+; CHECK-THUMB2-NEXT:    movlt r0, #1
+; CHECK-THUMB2-NEXT:    movge r1, #1
+; CHECK-THUMB2-NEXT:    bx lr
+entry:
+    %cmp = icmp slt i64 %a, %b
+    %not = xor i1 %cmp, true
+    %r1 = zext i1 %cmp to i32
+    %r2 = zext i1 %not to i32
+    %z = insertvalue { i32, i32 } undef, i32 %r1, 0
+    %z2 = insertvalue { i32, i32 } %z, i32 %r2, 1
+    ret { i32, i32 } %z2
+}
diff --git a/test/CodeGen/Thumb/select.ll b/test/CodeGen/Thumb/select.ll
index 41ace62de53..36f16ad44a0 100644
--- a/test/CodeGen/Thumb/select.ll
+++ b/test/CodeGen/Thumb/select.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=thumb-apple-darwin | FileCheck %s
-; RUN: llc < %s -mtriple=thumb-pc-linux-gnueabi | FileCheck -check-prefix=CHECK-EABI %s
+; RUN: llc < %s -mtriple=thumb-apple-darwin -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=thumb-pc-linux-gnueabi -verify-machineinstrs | FileCheck -check-prefix=CHECK-EABI %s
 
 define i32 @f1(i32 %a.s) {
 entry:
@@ -80,3 +80,24 @@ define double @f7(double %a, double %b) {
 ; CHECK-EABI: __aeabi_dcmplt
 ; CHECK-EABI: {{bne|beq}}
 ; CHECK-EABI: {{bne|beq}}
+
+define {i32, i32} @f8(i32 %a, i32 %b, i32 %c, i32 %d) {
+entry:
+    %cmp = icmp slt i32 %a, %b
+    %r1 = select i1 %cmp, i32 %c, i32 %a
+    %r2 = select i1 %cmp, i32 %d, i32 %b
+    %z = insertvalue { i32, i32 } undef, i32 %r1, 0
+    %z2 = insertvalue { i32, i32 } %z, i32 %r2, 1
+    ret { i32, i32 } %z2
+}
+
+; CHECK-LABEL: f8:
+; CHECK: cmp r0, r1
+; CHECK: blt
+; CHECK: movs
+; CHECK: cmp r0, r1
+; CHECK: blt
+; CHECK: movs
+; CHECK: movs
+; CHECK: movs
+; CHECK: bx lr
-- 
GitLab


From 75bd5e318c35d206e9d607614479fd1385abd3eb Mon Sep 17 00:00:00 2001
From: Konstantin Zhuravlyov <kzhuravl_dev@outlook.com>
Date: Wed, 7 Nov 2018 21:21:32 +0000
Subject: [PATCH 1088/1116] AMDGPU/NFC: Split MUBUF_Pseudo_Atomics into
 RTN/NO_RTN multiclasses

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346357 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/BUFInstructions.td | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td
index 18a59729faa..e48b73b0f1e 100644
--- a/lib/Target/AMDGPU/BUFInstructions.td
+++ b/lib/Target/AMDGPU/BUFInstructions.td
@@ -656,11 +656,10 @@ class MUBUF_AtomicRet_Pseudo<string opName, int addrKind,
   let AsmMatchConverter = "cvtMubufAtomicReturn";
 }
 
-multiclass MUBUF_Pseudo_Atomics <string opName,
-                                 RegisterClass vdataClass,
-                                 ValueType vdataType,
-                                 SDPatternOperator atomic> {
-
+multiclass MUBUF_Pseudo_Atomics_NO_RTN <string opName,
+                                        RegisterClass vdataClass,
+                                        ValueType vdataType,
+                                        SDPatternOperator atomic> {
   def _OFFSET : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass>,
                 MUBUFAddr64Table <0, NAME>;
   def _ADDR64 : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass>,
@@ -668,7 +667,12 @@ multiclass MUBUF_Pseudo_Atomics <string opName,
   def _OFFEN  : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.OffEn,  vdataClass>;
   def _IDXEN  : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.IdxEn,  vdataClass>;
   def _BOTHEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+}
 
+multiclass MUBUF_Pseudo_Atomics_RTN <string opName,
+                                     RegisterClass vdataClass,
+                                     ValueType vdataType,
+                                     SDPatternOperator atomic> {
   def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
     [(set vdataType:$vdata,
      (atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$slc),
@@ -686,6 +690,13 @@ multiclass MUBUF_Pseudo_Atomics <string opName,
   def _BOTHEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
 }
 
+multiclass MUBUF_Pseudo_Atomics <string opName,
+                                 RegisterClass vdataClass,
+                                 ValueType vdataType,
+                                 SDPatternOperator atomic> :
+  MUBUF_Pseudo_Atomics_NO_RTN<opName, vdataClass, vdataType, atomic>,
+  MUBUF_Pseudo_Atomics_RTN<opName, vdataClass, vdataType, atomic>;
+
 
 //===----------------------------------------------------------------------===//
 // MUBUF Instructions
-- 
GitLab


From ed3959a17866fd3ce9742184ae8fa80f5bcd52aa Mon Sep 17 00:00:00 2001
From: Eli Friedman <efriedma@codeaurora.org>
Date: Wed, 7 Nov 2018 21:31:14 +0000
Subject: [PATCH 1089/1116] [AArch64] [Windows] Trap after noreturn calls.

Like the comment says, this isn't the most efficient fix in terms of
codesize, but it works.

Differential Revision: https://reviews.llvm.org/D54129


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346358 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64TargetMachine.cpp | 10 ++++++++++
 test/CodeGen/AArch64/windows-trap.ll        | 17 +++++++++++++++++
 2 files changed, 27 insertions(+)
 create mode 100644 test/CodeGen/AArch64/windows-trap.ll

diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp
index fe2eea65ffe..5168c0c67da 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -263,6 +263,16 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT,
     this->Options.NoTrapAfterNoreturn = true;
   }
 
+  if (TT.isOSWindows()) {
+    // Unwinding can get confused if the last instruction in an
+    // exception-handling region (function, funclet, try block, etc.)
+    // is a call.
+    //
+    // FIXME: We could elide the trap if the next instruction would be in
+    // the same region anyway.
+    this->Options.TrapUnreachable = true;
+  }
+
   // Enable GlobalISel at or below EnableGlobalISelAt0.
   if (getOptLevel() <= EnableGlobalISelAtO)
     setGlobalISel(true);
diff --git a/test/CodeGen/AArch64/windows-trap.ll b/test/CodeGen/AArch64/windows-trap.ll
new file mode 100644
index 00000000000..5cf0ece48e9
--- /dev/null
+++ b/test/CodeGen/AArch64/windows-trap.ll
@@ -0,0 +1,17 @@
+; RUN: llc -mtriple=aarch64-win32 %s -o - | FileCheck %s
+
+declare void @callee() noreturn
+
+; Make sure the call isn't the last instruction in the function; if it is,
+; unwinding may break.
+;
+; (The instruction after the call doesn't have to be anything in particular,
+; but trapping has the nice side-effect of catching bugs.)
+
+define void @test_unreachable() {
+; CHECK-LABEL: test_unreachable:
+; CHECK: bl      callee
+; CHECK-NEXT: brk #0x1
+  call void @callee() noreturn
+  unreachable
+}
-- 
GitLab


From 1b8bc9b6d8d3562655eac93ba3c6169515c662de Mon Sep 17 00:00:00 2001
From: Adrian Prantl <aprantl@apple.com>
Date: Wed, 7 Nov 2018 21:34:33 +0000
Subject: [PATCH 1090/1116] Fix spelling error

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346359 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/tools/dsymutil/X86/dummy-debug-map.map | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/tools/dsymutil/X86/dummy-debug-map.map b/test/tools/dsymutil/X86/dummy-debug-map.map
index f9bc7b09985..aa000182e47 100644
--- a/test/tools/dsymutil/X86/dummy-debug-map.map
+++ b/test/tools/dsymutil/X86/dummy-debug-map.map
@@ -1,6 +1,6 @@
 # This is a dummy debug map used for some tests where the contents of the
 # map are just an implementation detail. The tests wanting to use that file
-# should put all there object files in an explicitely named sub-directory
+# should put all their object files in an explicitely named sub-directory
 # of Inputs, and they should be named 1.o, 2.o, ...
 # As not finding an object file or symbols isn't a fatal error for dsymutil,
 # you can extend this file with as much object files and symbols as needed.
-- 
GitLab


From 0bb019fff933187a61b0e83f729c590c19df3ac6 Mon Sep 17 00:00:00 2001
From: Paul Robinson <paul.robinson@sony.com>
Date: Wed, 7 Nov 2018 21:39:09 +0000
Subject: [PATCH 1091/1116] [DWARFv5] Read and dump multiple .debug_info
 sections. Type units go in .debug_info comdats, not .debug_types, in v5.

Differential Revision: https://reviews.llvm.org/D53907

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346360 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/DebugInfo/DWARF/DWARFObject.h |  6 +-
 lib/DebugInfo/DWARF/DWARFContext.cpp       | 86 +++++++++++++++-------
 lib/DebugInfo/DWARF/DWARFVerifier.cpp      | 13 ++--
 test/DebugInfo/X86/dwarfdump-header.s      | 60 ++++++++-------
 4 files changed, 102 insertions(+), 63 deletions(-)

diff --git a/include/llvm/DebugInfo/DWARF/DWARFObject.h b/include/llvm/DebugInfo/DWARF/DWARFObject.h
index 8e582da3172..5a808b0ec6a 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFObject.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFObject.h
@@ -33,7 +33,8 @@ public:
   virtual ArrayRef<SectionName> getSectionNames() const { return {}; }
   virtual bool isLittleEndian() const = 0;
   virtual uint8_t getAddressSize() const { llvm_unreachable("unimplemented"); }
-  virtual const DWARFSection &getInfoSection() const { return Dummy; }
+  virtual void
+  forEachInfoSections(function_ref<void(const DWARFSection &)> F) const {}
   virtual void
   forEachTypesSections(function_ref<void(const DWARFSection &)> F) const {}
   virtual StringRef getAbbrevSection() const { return ""; }
@@ -53,7 +54,8 @@ public:
   virtual StringRef getGnuPubNamesSection() const { return ""; }
   virtual StringRef getGnuPubTypesSection() const { return ""; }
   virtual const DWARFSection &getStringOffsetSection() const { return Dummy; }
-  virtual const DWARFSection &getInfoDWOSection() const { return Dummy; }
+  virtual void
+  forEachInfoDWOSections(function_ref<void(const DWARFSection &)> F) const {}
   virtual void
   forEachTypesDWOSections(function_ref<void(const DWARFSection &)> F) const {}
   virtual StringRef getAbbrevDWOSection() const { return ""; }
diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp
index 00e37d7b7c5..99cf9b985c0 100644
--- a/lib/DebugInfo/DWARF/DWARFContext.cpp
+++ b/lib/DebugInfo/DWARF/DWARFContext.cpp
@@ -351,20 +351,22 @@ void DWARFContext::dump(
                  DObj->getAbbrevDWOSection()))
     getDebugAbbrevDWO()->dump(OS);
 
-  auto dumpDebugInfo = [&](unit_iterator_range Units) {
-    if (DumpOffset)
-      getDIEForOffset(DumpOffset.getValue())
-          .dump(OS, 0, DumpOpts.noImplicitRecursion());
+  auto dumpDebugInfo = [&](const char *Name, unit_iterator_range Units) {
+    OS << '\n' << Name << " contents:\n";
+    if (DumpOffset = DumpOffsets[DIDT_ID_DebugInfo])
+      for (const auto &U : Units)
+        U->getDIEForOffset(DumpOffset.getValue())
+            .dump(OS, 0, DumpOpts.noImplicitRecursion());
     else
       for (const auto &U : Units)
         U->dump(OS, DumpOpts);
   };
-  if (shouldDump(Explicit, ".debug_info", DIDT_ID_DebugInfo,
-                 DObj->getInfoSection().Data))
-    dumpDebugInfo(info_section_units());
-  if (shouldDump(ExplicitDWO, ".debug_info.dwo", DIDT_ID_DebugInfo,
-                 DObj->getInfoDWOSection().Data))
-    dumpDebugInfo(dwo_info_section_units());
+  if ((DumpType & DIDT_DebugInfo)) {
+    if (Explicit || getNumCompileUnits())
+      dumpDebugInfo(".debug_info", info_section_units());
+    if (ExplicitDWO || getNumDWOCompileUnits())
+      dumpDebugInfo(".debug_info.dwo", dwo_info_section_units());
+  }
 
   auto dumpDebugType = [&](const char *Name, unit_iterator_range Units) {
     OS << '\n' << Name << " contents:\n";
@@ -872,7 +874,9 @@ Expected<const DWARFDebugLine::LineTable *> DWARFContext::getLineTableForUnit(
 void DWARFContext::parseNormalUnits() {
   if (!NormalUnits.empty())
     return;
-  NormalUnits.addUnitsForSection(*this, DObj->getInfoSection(), DW_SECT_INFO);
+  DObj->forEachInfoSections([&](const DWARFSection &S) {
+    NormalUnits.addUnitsForSection(*this, S, DW_SECT_INFO);
+  });
   NormalUnits.finishedInfoUnits();
   DObj->forEachTypesSections([&](const DWARFSection &S) {
     NormalUnits.addUnitsForSection(*this, S, DW_SECT_TYPES);
@@ -882,8 +886,9 @@ void DWARFContext::parseNormalUnits() {
 void DWARFContext::parseDWOUnits(bool Lazy) {
   if (!DWOUnits.empty())
     return;
-  DWOUnits.addUnitsForDWOSection(*this, DObj->getInfoDWOSection(), DW_SECT_INFO,
-                                 Lazy);
+  DObj->forEachInfoDWOSections([&](const DWARFSection &S) {
+    DWOUnits.addUnitsForDWOSection(*this, S, DW_SECT_INFO, Lazy);
+  });
   DWOUnits.finishedInfoUnits();
   DObj->forEachTypesDWOSections([&](const DWARFSection &S) {
     DWOUnits.addUnitsForDWOSection(*this, S, DW_SECT_TYPES, Lazy);
@@ -1235,20 +1240,20 @@ class DWARFObjInMemory final : public DWARFObject {
   const object::ObjectFile *Obj = nullptr;
   std::vector<SectionName> SectionNames;
 
-  using TypeSectionMap = MapVector<object::SectionRef, DWARFSectionMap,
+  using InfoSectionMap = MapVector<object::SectionRef, DWARFSectionMap,
                                    std::map<object::SectionRef, unsigned>>;
 
-  TypeSectionMap TypesSections;
-  TypeSectionMap TypesDWOSections;
+  InfoSectionMap InfoSections;
+  InfoSectionMap TypesSections;
+  InfoSectionMap InfoDWOSections;
+  InfoSectionMap TypesDWOSections;
 
-  DWARFSectionMap InfoSection;
   DWARFSectionMap LocSection;
   DWARFSectionMap LocListsSection;
   DWARFSectionMap LineSection;
   DWARFSectionMap RangeSection;
   DWARFSectionMap RnglistsSection;
   DWARFSectionMap StringOffsetSection;
-  DWARFSectionMap InfoDWOSection;
   DWARFSectionMap LineDWOSection;
   DWARFSectionMap LocDWOSection;
   DWARFSectionMap StringOffsetDWOSection;
@@ -1263,14 +1268,12 @@ class DWARFObjInMemory final : public DWARFObject {
 
   DWARFSectionMap *mapNameToDWARFSection(StringRef Name) {
     return StringSwitch<DWARFSectionMap *>(Name)
-        .Case("debug_info", &InfoSection)
         .Case("debug_loc", &LocSection)
         .Case("debug_loclists", &LocListsSection)
         .Case("debug_line", &LineSection)
         .Case("debug_str_offsets", &StringOffsetSection)
         .Case("debug_ranges", &RangeSection)
         .Case("debug_rnglists", &RnglistsSection)
-        .Case("debug_info.dwo", &InfoDWOSection)
         .Case("debug_loc.dwo", &LocDWOSection)
         .Case("debug_line.dwo", &LineDWOSection)
         .Case("debug_names", &DebugNamesSection)
@@ -1359,6 +1362,16 @@ public:
     for (const auto &SecIt : Sections) {
       if (StringRef *SectionData = mapSectionToMember(SecIt.first()))
         *SectionData = SecIt.second->getBuffer();
+      else if (SecIt.first() == "debug_info")
+        // Find debug_info and debug_types data by section rather than name as
+        // there are multiple, comdat grouped, of these sections.
+        InfoSections[SectionRef()].Data = SecIt.second->getBuffer();
+      else if (SecIt.first() == "debug_info.dwo")
+        InfoDWOSections[SectionRef()].Data = SecIt.second->getBuffer();
+      else if (SecIt.first() == "debug_types")
+        TypesSections[SectionRef()].Data = SecIt.second->getBuffer();
+      else if (SecIt.first() == "debug_types.dwo")
+        TypesDWOSections[SectionRef()].Data = SecIt.second->getBuffer();
     }
   }
   DWARFObjInMemory(const object::ObjectFile &Obj, const LoadedObjectInfo *L,
@@ -1413,9 +1426,13 @@ public:
           // FIXME: Use the other dwo range section when we emit it.
           RangeDWOSection.Data = Data;
         }
+      } else if (Name == "debug_info") {
+        // Find debug_info and debug_types data by section rather than name as
+        // there are multiple, comdat grouped, of these sections.
+        InfoSections[Section].Data = Data;
+      } else if (Name == "debug_info.dwo") {
+        InfoDWOSections[Section].Data = Data;
       } else if (Name == "debug_types") {
-        // Find debug_types data by section rather than name as there are
-        // multiple, comdat grouped, debug_types sections.
         TypesSections[Section].Data = Data;
       } else if (Name == "debug_types.dwo") {
         TypesDWOSections[Section].Data = Data;
@@ -1450,9 +1467,16 @@ public:
       DWARFSectionMap *Sec = mapNameToDWARFSection(RelSecName);
       RelocAddrMap *Map = Sec ? &Sec->Relocs : nullptr;
       if (!Map) {
-        // Find debug_types relocs by section rather than name as there are
-        // multiple, comdat grouped, debug_types sections.
-        if (RelSecName == "debug_types")
+        // Find debug_info and debug_types relocs by section rather than name
+        // as there are multiple, comdat grouped, of these sections.
+        if (RelSecName == "debug_info")
+          Map = &static_cast<DWARFSectionMap &>(InfoSections[*RelocatedSection])
+                     .Relocs;
+        else if (RelSecName == "debug_info.dwo")
+          Map = &static_cast<DWARFSectionMap &>(
+                     InfoDWOSections[*RelocatedSection])
+                     .Relocs;
+        else if (RelSecName == "debug_types")
           Map =
               &static_cast<DWARFSectionMap &>(TypesSections[*RelocatedSection])
                    .Relocs;
@@ -1550,8 +1574,10 @@ public:
   StringRef getLineStringSection() const override { return LineStringSection; }
 
   // Sections for DWARF5 split dwarf proposal.
-  const DWARFSection &getInfoDWOSection() const override {
-    return InfoDWOSection;
+  void forEachInfoDWOSections(
+      function_ref<void(const DWARFSection &)> F) const override {
+    for (auto &P : InfoDWOSections)
+      F(P.second);
   }
   void forEachTypesDWOSections(
       function_ref<void(const DWARFSection &)> F) const override {
@@ -1598,7 +1624,11 @@ public:
 
   StringRef getFileName() const override { return FileName; }
   uint8_t getAddressSize() const override { return AddressSize; }
-  const DWARFSection &getInfoSection() const override { return InfoSection; }
+  void forEachInfoSections(
+      function_ref<void(const DWARFSection &)> F) const override {
+    for (auto &P : InfoSections)
+      F(P.second);
+  }
   void forEachTypesSections(
       function_ref<void(const DWARFSection &)> F) const override {
     for (auto &P : TypesSections)
diff --git a/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/lib/DebugInfo/DWARF/DWARFVerifier.cpp
index 128bd0651ba..f8370178b62 100644
--- a/lib/DebugInfo/DWARF/DWARFVerifier.cpp
+++ b/lib/DebugInfo/DWARF/DWARFVerifier.cpp
@@ -364,15 +364,18 @@ unsigned DWARFVerifier::verifyUnitSection(const DWARFSection &S,
 
 bool DWARFVerifier::handleDebugInfo() {
   const DWARFObject &DObj = DCtx.getDWARFObj();
+  unsigned NumErrors = 0;
 
   OS << "Verifying .debug_info Unit Header Chain...\n";
-  unsigned result = verifyUnitSection(DObj.getInfoSection(), DW_SECT_INFO);
+  DObj.forEachInfoSections([&](const DWARFSection &S) {
+    NumErrors += verifyUnitSection(S, DW_SECT_INFO);
+  });
 
   OS << "Verifying .debug_types Unit Header Chain...\n";
   DObj.forEachTypesSections([&](const DWARFSection &S) {
-    result += verifyUnitSection(S, DW_SECT_TYPES);
+    NumErrors += verifyUnitSection(S, DW_SECT_TYPES);
   });
-  return result == 0;
+  return NumErrors == 0;
 }
 
 unsigned DWARFVerifier::verifyDieRanges(const DWARFDie &Die,
@@ -551,6 +554,7 @@ unsigned DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die,
 unsigned DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die,
                                             DWARFAttribute &AttrValue) {
   const DWARFObject &DObj = DCtx.getDWARFObj();
+  auto DieCU = Die.getDwarfUnit();
   unsigned NumErrors = 0;
   const auto Form = AttrValue.Value.getForm();
   switch (Form) {
@@ -563,7 +567,6 @@ unsigned DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die,
     Optional<uint64_t> RefVal = AttrValue.Value.getAsReference();
     assert(RefVal);
     if (RefVal) {
-      auto DieCU = Die.getDwarfUnit();
       auto CUSize = DieCU->getNextUnitOffset() - DieCU->getOffset();
       auto CUOffset = AttrValue.Value.getRawUValue();
       if (CUOffset >= CUSize) {
@@ -588,7 +591,7 @@ unsigned DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die,
     Optional<uint64_t> RefVal = AttrValue.Value.getAsReference();
     assert(RefVal);
     if (RefVal) {
-      if (*RefVal >= DObj.getInfoSection().Data.size()) {
+      if (*RefVal >= DieCU->getInfoSection().Data.size()) {
         ++NumErrors;
         error() << "DW_FORM_ref_addr offset beyond .debug_info "
                    "bounds:\n";
diff --git a/test/DebugInfo/X86/dwarfdump-header.s b/test/DebugInfo/X86/dwarfdump-header.s
index 7daba5f6961..daf03614f9d 100644
--- a/test/DebugInfo/X86/dwarfdump-header.s
+++ b/test/DebugInfo/X86/dwarfdump-header.s
@@ -152,35 +152,13 @@ CU_split_5_end:
 # CHECK-NEXT: DW_AT_producer {{.*}} "Handmade DWO producer"
 # CHECK-NEXT: DW_AT_name {{.*}} "V5_dwo_compile_unit"
 
-        .section .debug_types,"",@progbits
-# CHECK-LABEL: .debug_types contents:
-
-# DWARF v4 Type unit header. Normal/split are identical so we do only one.
-TU_4_start:
-        .long  TU_4_end-TU_4_version  # Length of Unit
-TU_4_version:
-        .short 4               # DWARF version number
-        .long .debug_abbrev    # Offset Into Abbrev. Section
-        .byte 8                # Address Size (in bytes)
-        .quad 0x0011223344556677 # Type Signature
-        .long TU_4_type-TU_4_start # Type offset
-# The type-unit DIE, which has a name.
-        .byte 2
-        .long str_TU_4
-# The type DIE, which has a name.
-TU_4_type:
-        .byte 3
-        .long str_TU_4
-        .byte 0 # NULL
-        .byte 0 # NULL
-TU_4_end:
-
-# CHECK: 0x00000000: Type Unit: length = 0x0000001f version = 0x0004 abbr_offset = 0x0000 addr_size = 0x08 name = 'V4_type_unit' type_signature = 0x0011223344556677 type_offset = 0x001c (next unit at 0x00000023)
-# CHECK: 0x00000017: DW_TAG_type_unit
+# Now a DWARF v5 type unit, which goes in a .debug_info.dwo comdat.
+# Note there will not be another ".debug_info.dwo contents:" line, even though
+# there is a separate ELF section header; it's dumped along with the previous
+# unit as if they were in a single section.
 
-        .section .debug_types.dwo,"",@progbits
-# FIXME: DWARF v5 wants type units in .debug_info[.dwo] not .debug_types[.dwo].
-# CHECK: .debug_types.dwo contents:
+        .section .debug_info.dwo,"G",@progbits,5555,comdat
+# CHECK-NOT: .debug_info.dwo
 
 # DWARF v5 split type unit header.
 TU_split_5_start:
@@ -206,6 +184,32 @@ TU_split_5_end:
 # CHECK: 0x00000000: Type Unit: length = 0x00000020 version = 0x0005 unit_type = DW_UT_split_type abbr_offset = 0x0000 addr_size = 0x08 name = 'V5_split_type_unit' type_signature = 0x8899aabbccddeeff type_offset = 0x001d (next unit at 0x00000024)
 # CHECK: 0x00000018: DW_TAG_type_unit
 
+        .section .debug_types,"",@progbits
+# CHECK-LABEL: .debug_types contents:
+
+# DWARF v4 Type unit header. Normal/split are identical so we do only one.
+TU_4_start:
+        .long  TU_4_end-TU_4_version  # Length of Unit
+TU_4_version:
+        .short 4               # DWARF version number
+        .long .debug_abbrev    # Offset Into Abbrev. Section
+        .byte 8                # Address Size (in bytes)
+        .quad 0x0011223344556677 # Type Signature
+        .long TU_4_type-TU_4_start # Type offset
+# The type-unit DIE, which has a name.
+        .byte 2
+        .long str_TU_4
+# The type DIE, which has a name.
+TU_4_type:
+        .byte 3
+        .long str_TU_4
+        .byte 0 # NULL
+        .byte 0 # NULL
+TU_4_end:
+
+# CHECK: 0x00000000: Type Unit: length = 0x0000001f version = 0x0004 abbr_offset = 0x0000 addr_size = 0x08 name = 'V4_type_unit' type_signature = 0x0011223344556677 type_offset = 0x001c (next unit at 0x00000023)
+# CHECK: 0x00000017: DW_TAG_type_unit
+
         .section .debug_line,"",@progbits
 # CHECK-LABEL: .debug_line contents:
 
-- 
GitLab


From 0e1b12743fb6da1836bb9455c605c8eb4b21afb4 Mon Sep 17 00:00:00 2001
From: Konstantin Zhuravlyov <kzhuravl_dev@outlook.com>
Date: Wed, 7 Nov 2018 21:42:13 +0000
Subject: [PATCH 1092/1116] AMDGPU/NFC: Split FLAT_Global_Atomic_Pseudo into
 RTN/NO_RTN multiclasses

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346361 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/FLATInstructions.td | 41 ++++++++++++++++++++-------
 1 file changed, 30 insertions(+), 11 deletions(-)

diff --git a/lib/Target/AMDGPU/FLATInstructions.td b/lib/Target/AMDGPU/FLATInstructions.td
index 43130dfcae9..18e8b8a1c2d 100644
--- a/lib/Target/AMDGPU/FLATInstructions.td
+++ b/lib/Target/AMDGPU/FLATInstructions.td
@@ -275,7 +275,7 @@ multiclass FLAT_Atomic_Pseudo<
        AtomicNoRet <opName, 1>;
 }
 
-multiclass FLAT_Global_Atomic_Pseudo<
+multiclass FLAT_Global_Atomic_Pseudo_NO_RTN<
   string opName,
   RegisterClass vdst_rc,
   ValueType vt,
@@ -292,16 +292,6 @@ multiclass FLAT_Global_Atomic_Pseudo<
     let PseudoInstr = NAME;
   }
 
-  def _RTN : FLAT_AtomicRet_Pseudo <opName,
-    (outs vdst_rc:$vdst),
-      (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, SLC:$slc),
-    " $vdst, $vaddr, $vdata, off$offset glc$slc",
-    [(set vt:$vdst,
-      (atomic (FLATSignedAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>,
-      AtomicNoRet <opName, 1> {
-    let has_saddr = 1;
-  }
-
   def _SADDR : FLAT_AtomicNoRet_Pseudo <opName,
     (outs),
     (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, SLC:$slc),
@@ -311,6 +301,25 @@ multiclass FLAT_Global_Atomic_Pseudo<
     let enabled_saddr = 1;
     let PseudoInstr = NAME#"_SADDR";
   }
+}
+
+multiclass FLAT_Global_Atomic_Pseudo_RTN<
+  string opName,
+  RegisterClass vdst_rc,
+  ValueType vt,
+  SDPatternOperator atomic = null_frag,
+  ValueType data_vt = vt,
+  RegisterClass data_rc = vdst_rc> {
+
+  def _RTN : FLAT_AtomicRet_Pseudo <opName,
+    (outs vdst_rc:$vdst),
+      (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, SLC:$slc),
+    " $vdst, $vaddr, $vdata, off$offset glc$slc",
+    [(set vt:$vdst,
+      (atomic (FLATSignedAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>,
+      AtomicNoRet <opName, 1> {
+    let has_saddr = 1;
+  }
 
   def _SADDR_RTN : FLAT_AtomicRet_Pseudo <opName,
     (outs vdst_rc:$vdst),
@@ -323,6 +332,16 @@ multiclass FLAT_Global_Atomic_Pseudo<
   }
 }
 
+multiclass FLAT_Global_Atomic_Pseudo<
+  string opName,
+  RegisterClass vdst_rc,
+  ValueType vt,
+  SDPatternOperator atomic = null_frag,
+  ValueType data_vt = vt,
+  RegisterClass data_rc = vdst_rc> :
+    FLAT_Global_Atomic_Pseudo_NO_RTN<opName, vdst_rc, vt, atomic, data_vt, data_rc>,
+    FLAT_Global_Atomic_Pseudo_RTN<opName, vdst_rc, vt, atomic, data_vt, data_rc>;
+
 class flat_binary_atomic_op<SDNode atomic_op> : PatFrag<
   (ops node:$ptr, node:$value),
   (atomic_op node:$ptr, node:$value),
-- 
GitLab


From 6747ae50adc5830adedc28f84ba8d690c8e9e9a7 Mon Sep 17 00:00:00 2001
From: Nicolai Haehnle <nhaehnle@gmail.com>
Date: Wed, 7 Nov 2018 21:53:29 +0000
Subject: [PATCH 1093/1116] AMDGPU/InsertWaitcnts: Remove kill-related logic

Summary:
This is not needed, because we don't actually insert relevant branches
for KILLs that late in the compilation flow.

Besides, this was always checking for the wrong kill opcode anyway...

Reviewers: msearles, rampitec, scott.linder, kanarayan

Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits

Differential Revision: https://reviews.llvm.org/D54085

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346362 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 102 +------------------------
 1 file changed, 1 insertion(+), 101 deletions(-)

diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 819b1b9fcd7..a785461b13d 100644
--- a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -382,8 +382,6 @@ private:
 
   DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;
 
-  std::vector<std::unique_ptr<BlockWaitcntBrackets>> KillWaitBrackets;
-
   // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0
   // because of amdgpu-waitcnt-forcezero flag
   bool ForceEmitZeroWaitcnts;
@@ -410,13 +408,6 @@ public:
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 
-  void addKillWaitBracket(BlockWaitcntBrackets *Bracket) {
-    // The waitcnt information is copied because it changes as the block is
-    // traversed.
-    KillWaitBrackets.push_back(
-        llvm::make_unique<BlockWaitcntBrackets>(*Bracket));
-  }
-
   bool isForceEmitWaitcnt() const {
     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
          T = (enum InstCounterType)(T + 1))
@@ -1425,24 +1416,6 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
     MixedExpTypes |= PredScoreBrackets->mixedExpTypes();
   }
 
-  // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
-  // Also handle kills for exit block.
-  if (Block.succ_empty() && !KillWaitBrackets.empty()) {
-    for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
-      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
-           T = (enum InstCounterType)(T + 1)) {
-        int Span = KillWaitBrackets[I]->getScoreUB(T) -
-                   KillWaitBrackets[I]->getScoreLB(T);
-        MaxPending[T] = std::max(MaxPending[T], Span);
-        Span = KillWaitBrackets[I]->pendingFlat(T) -
-               KillWaitBrackets[I]->getScoreLB(T);
-        MaxFlat[T] = std::max(MaxFlat[T], Span);
-      }
-
-      MixedExpTypes |= KillWaitBrackets[I]->mixedExpTypes();
-    }
-  }
-
   // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK.
   for (MachineBasicBlock *Pred : Block.predecessors()) {
     BlockWaitcntBrackets *PredScoreBrackets =
@@ -1460,18 +1433,6 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
     MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
   }
 
-  // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
-  if (Block.succ_empty() && !KillWaitBrackets.empty()) {
-    for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
-      int GDSSpan = KillWaitBrackets[I]->getEventUB(GDS_GPR_LOCK) -
-                    KillWaitBrackets[I]->getScoreLB(EXP_CNT);
-      MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
-      int EXPSpan = KillWaitBrackets[I]->getEventUB(EXP_GPR_LOCK) -
-                    KillWaitBrackets[I]->getScoreLB(EXP_CNT);
-      MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
-    }
-  }
-
 #if 0
   // LC does not (unlike) add a waitcnt at beginning. Leaving it as marker.
   // TODO: how does LC distinguish between function entry and main entry?
@@ -1551,60 +1512,6 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
     }
   }
 
-  // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
-  // Set the register scoreboard.
-  if (Block.succ_empty() && !KillWaitBrackets.empty()) {
-    for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
-      // Now merge the gpr_reg_score information.
-      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
-           T = (enum InstCounterType)(T + 1)) {
-        int PredLB = KillWaitBrackets[I]->getScoreLB(T);
-        int PredUB = KillWaitBrackets[I]->getScoreUB(T);
-        if (PredLB < PredUB) {
-          int PredScale = MaxPending[T] - PredUB;
-          // Merge vgpr scores.
-          for (int J = 0; J <= KillWaitBrackets[I]->getMaxVGPR(); J++) {
-            int PredRegScore = KillWaitBrackets[I]->getRegScore(J, T);
-            if (PredRegScore <= PredLB)
-              continue;
-            int NewRegScore = PredScale + PredRegScore;
-            ScoreBrackets->setRegScore(
-                J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
-          }
-          // Also need to merge sgpr scores for lgkm_cnt.
-          if (T == LGKM_CNT) {
-            for (int J = 0; J <= KillWaitBrackets[I]->getMaxSGPR(); J++) {
-              int PredRegScore =
-                  KillWaitBrackets[I]->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
-              if (PredRegScore <= PredLB)
-                continue;
-              int NewRegScore = PredScale + PredRegScore;
-              ScoreBrackets->setRegScore(
-                  J + NUM_ALL_VGPRS, LGKM_CNT,
-                  std::max(
-                      ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
-                      NewRegScore));
-            }
-          }
-        }
-      }
-
-      // Also merge the WaitEvent information.
-      ForAllWaitEventType(W) {
-        enum InstCounterType T = KillWaitBrackets[I]->eventCounter(W);
-        int PredEventUB = KillWaitBrackets[I]->getEventUB(W);
-        if (PredEventUB > KillWaitBrackets[I]->getScoreLB(T)) {
-          int NewEventUB =
-              MaxPending[T] + PredEventUB - KillWaitBrackets[I]->getScoreUB(T);
-          if (NewEventUB > 0) {
-            ScoreBrackets->setEventUB(
-                W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
-          }
-        }
-      }
-    }
-  }
-
   // Special case handling of GDS_GPR_LOCK and EXP_GPR_LOCK. Merge this for the
   // sequencing predecessors, because changes to EXEC require waitcnts due to
   // the delayed nature of these operations.
@@ -1701,13 +1608,6 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
       continue;
     }
 
-    // Kill instructions generate a conditional branch to the endmain block.
-    // Merge the current waitcnt state into the endmain block information.
-    // TODO: Are there other flavors of KILL instruction?
-    if (Inst.getOpcode() == AMDGPU::KILL) {
-      addKillWaitBracket(ScoreBrackets);
-    }
-
     bool VCCZBugWorkAround = false;
     if (readsVCCZ(Inst) &&
         (!VCCZBugHandledSet.count(&Inst))) {
@@ -1871,7 +1771,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
   LoopWaitcntDataMap.clear();
   BlockWaitcntProcessedSet.clear();
 
-  // Walk over the blocks in reverse post-dominator order, inserting
+  // Walk over the blocks in reverse post order, inserting
   // s_waitcnt where needed.
   ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
   bool Modified = false;
-- 
GitLab


From d3698200d1f0d17ad23f0e50ed5d3fa71985188c Mon Sep 17 00:00:00 2001
From: Nicolai Haehnle <nhaehnle@gmail.com>
Date: Wed, 7 Nov 2018 21:53:36 +0000
Subject: [PATCH 1094/1116] AMDGPU/InsertWaitcnts: Cleanup some old cruft
 (NFCI)

Summary: Remove redundant logic and simplify control flow.

Reviewers: msearles, rampitec, scott.linder, kanarayan

Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits

Differential Revision: https://reviews.llvm.org/D54086

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346363 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 162 +++++++++++--------------
 1 file changed, 71 insertions(+), 91 deletions(-)

diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index a785461b13d..eb39984f795 100644
--- a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -880,24 +880,14 @@ void SIInsertWaitcnts::generateWaitcntInstBefore(
   // Start with an assumption that there is no need to emit.
   unsigned int EmitWaitcnt = 0;
 
-  // No need to wait before phi. If a phi-move exists, then the wait should
-  // has been inserted before the move. If a phi-move does not exist, then
-  // wait should be inserted before the real use. The same is true for
-  // sc-merge. It is not a coincident that all these cases correspond to the
-  // instructions that are skipped in the assembling loop.
-  bool NeedLineMapping = false; // TODO: Check on this.
-
   // ForceEmitZeroWaitcnt: force a single s_waitcnt 0 due to hw bug
   bool ForceEmitZeroWaitcnt = false;
 
   setForceEmitWaitcnt();
   bool IsForceEmitWaitcnt = isForceEmitWaitcnt();
 
-  if (MI.isDebugInstr() &&
-      // TODO: any other opcode?
-      !NeedLineMapping) {
+  if (MI.isDebugInstr())
     return;
-  }
 
   // See if an s_waitcnt is forced at block entry, or is needed at
   // program end.
@@ -1141,7 +1131,6 @@ void SIInsertWaitcnts::generateWaitcntInstBefore(
   if (EmitWaitcnt || IsForceEmitWaitcnt) {
     int CntVal[NUM_INST_CNTS];
 
-    bool UseDefaultWaitcntStrategy = true;
     if (ForceEmitZeroWaitcnt || ForceEmitZeroWaitcnts) {
       // Force all waitcnts to 0.
       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
@@ -1151,10 +1140,7 @@ void SIInsertWaitcnts::generateWaitcntInstBefore(
       CntVal[VM_CNT] = 0;
       CntVal[EXP_CNT] = 0;
       CntVal[LGKM_CNT] = 0;
-      UseDefaultWaitcntStrategy = false;
-    }
-
-    if (UseDefaultWaitcntStrategy) {
+    } else {
       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
            T = (enum InstCounterType)(T + 1)) {
         if (EmitWaitcnt & CNT_MASK(T)) {
@@ -1178,95 +1164,89 @@ void SIInsertWaitcnts::generateWaitcntInstBefore(
       }
     }
 
-    // If we are not waiting on any counter we can skip the wait altogether.
-    if (EmitWaitcnt != 0 || IsForceEmitWaitcnt) {
-      MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
-      int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
-      if (!OldWaitcnt ||
-          (AMDGPU::decodeVmcnt(IV, Imm) !=
-                          (CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) ||
-          (AMDGPU::decodeExpcnt(IV, Imm) !=
-           (CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) ||
-          (AMDGPU::decodeLgkmcnt(IV, Imm) !=
-           (CntVal[LGKM_CNT] & AMDGPU::getLgkmcntBitMask(IV)))) {
-        MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
-        if (ContainingLoop) {
-          MachineBasicBlock *TBB = ContainingLoop->getHeader();
-          BlockWaitcntBrackets *ScoreBracket =
-              BlockWaitcntBracketsMap[TBB].get();
-          if (!ScoreBracket) {
-            assert(!BlockVisitedSet.count(TBB));
-            BlockWaitcntBracketsMap[TBB] =
-                llvm::make_unique<BlockWaitcntBrackets>(ST);
-            ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
-          }
-          ScoreBracket->setRevisitLoop(true);
-          LLVM_DEBUG(dbgs()
-                         << "set-revisit2: Block"
-                         << ContainingLoop->getHeader()->getNumber() << '\n';);
+    MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
+    int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
+    if (!OldWaitcnt ||
+        (AMDGPU::decodeVmcnt(IV, Imm) !=
+         (CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) ||
+        (AMDGPU::decodeExpcnt(IV, Imm) !=
+         (CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) ||
+        (AMDGPU::decodeLgkmcnt(IV, Imm) !=
+         (CntVal[LGKM_CNT] & AMDGPU::getLgkmcntBitMask(IV)))) {
+      MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
+      if (ContainingLoop) {
+        MachineBasicBlock *TBB = ContainingLoop->getHeader();
+        BlockWaitcntBrackets *ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
+        if (!ScoreBracket) {
+          assert(!BlockVisitedSet.count(TBB));
+          BlockWaitcntBracketsMap[TBB] =
+              llvm::make_unique<BlockWaitcntBrackets>(ST);
+          ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
         }
+        ScoreBracket->setRevisitLoop(true);
+        LLVM_DEBUG(dbgs() << "set-revisit2: Block"
+                          << ContainingLoop->getHeader()->getNumber() << '\n';);
       }
+    }
 
-      // Update an existing waitcount, or make a new one.
-      unsigned Enc = AMDGPU::encodeWaitcnt(IV,
+    // Update an existing waitcount, or make a new one.
+    unsigned Enc = AMDGPU::encodeWaitcnt(IV,
                       ForceEmitWaitcnt[VM_CNT] ? 0 : CntVal[VM_CNT],
                       ForceEmitWaitcnt[EXP_CNT] ? 0 : CntVal[EXP_CNT],
                       ForceEmitWaitcnt[LGKM_CNT] ? 0 : CntVal[LGKM_CNT]);
-      // We don't remove waitcnts that existed prior to the waitcnt
-      // pass. Check if the waitcnt to-be-inserted can be avoided
-      // or if the prev waitcnt can be updated.
-      bool insertSWaitInst = true;
-      for (MachineBasicBlock::iterator I = MI.getIterator(),
-                                       B = MI.getParent()->begin();
-           insertSWaitInst && I != B; --I) {
-        if (I == MI.getIterator())
-          continue;
+    // We don't remove waitcnts that existed prior to the waitcnt
+    // pass. Check if the waitcnt to-be-inserted can be avoided
+    // or if the prev waitcnt can be updated.
+    bool insertSWaitInst = true;
+    for (MachineBasicBlock::iterator I = MI.getIterator(),
+                                     B = MI.getParent()->begin();
+         insertSWaitInst && I != B; --I) {
+      if (I == MI.getIterator())
+        continue;
 
-        switch (I->getOpcode()) {
-        case AMDGPU::S_WAITCNT:
-          if (isWaitcntStronger(I->getOperand(0).getImm(), Enc))
-            insertSWaitInst = false;
-          else if (!OldWaitcnt) {
-            OldWaitcnt = &*I;
-            Enc = combineWaitcnt(I->getOperand(0).getImm(), Enc);
-          }
-          break;
-        // TODO: skip over instructions which never require wait.
+      switch (I->getOpcode()) {
+      case AMDGPU::S_WAITCNT:
+        if (isWaitcntStronger(I->getOperand(0).getImm(), Enc))
+          insertSWaitInst = false;
+        else if (!OldWaitcnt) {
+          OldWaitcnt = &*I;
+          Enc = combineWaitcnt(I->getOperand(0).getImm(), Enc);
         }
         break;
+        // TODO: skip over instructions which never require wait.
       }
-      if (insertSWaitInst) {
-        if (OldWaitcnt && OldWaitcnt->getOpcode() == AMDGPU::S_WAITCNT) {
-          if (ForceEmitZeroWaitcnts)
-            LLVM_DEBUG(
-                dbgs()
-                << "Force emit s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)\n");
-          if (IsForceEmitWaitcnt)
-            LLVM_DEBUG(dbgs()
-                       << "Force emit a s_waitcnt due to debug counter\n");
-
-          OldWaitcnt->getOperand(0).setImm(Enc);
-          if (!OldWaitcnt->getParent())
-            MI.getParent()->insert(MI, OldWaitcnt);
-
-          LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
-                            << "Old Instr: " << MI << '\n'
-                            << "New Instr: " << *OldWaitcnt << '\n');
-        } else {
-            auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
-                               MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+      break;
+    }
+    if (insertSWaitInst) {
+      if (OldWaitcnt) {
+        assert(OldWaitcnt->getOpcode() == AMDGPU::S_WAITCNT);
+        if (ForceEmitZeroWaitcnts)
+          LLVM_DEBUG(dbgs()
+                     << "Force emit s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)\n");
+        if (IsForceEmitWaitcnt)
+          LLVM_DEBUG(dbgs() << "Force emit a s_waitcnt due to debug counter\n");
+
+        OldWaitcnt->getOperand(0).setImm(Enc);
+        if (!OldWaitcnt->getParent())
+          MI.getParent()->insert(MI, OldWaitcnt);
+
+        LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
+                          << "Old Instr: " << MI << '\n'
+                          << "New Instr: " << *OldWaitcnt << '\n');
+      } else {
+        auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
+                                 MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
                              .addImm(Enc);
-            TrackedWaitcntSet.insert(SWaitInst);
+        TrackedWaitcntSet.insert(SWaitInst);
 
-            LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
-                              << "Old Instr: " << MI << '\n'
-                              << "New Instr: " << *SWaitInst << '\n');
-        }
+        LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
+                          << "Old Instr: " << MI << '\n'
+                          << "New Instr: " << *SWaitInst << '\n');
       }
+    }
 
-      if (CntVal[EXP_CNT] == 0) {
-        ScoreBrackets->setMixedExpTypes(false);
-      }
+    if (CntVal[EXP_CNT] == 0) {
+      ScoreBrackets->setMixedExpTypes(false);
     }
   }
 }
-- 
GitLab


From 69f971eb1814487fc23ee092a69532a8d152c80d Mon Sep 17 00:00:00 2001
From: Nicolai Haehnle <nhaehnle@gmail.com>
Date: Wed, 7 Nov 2018 21:53:43 +0000
Subject: [PATCH 1095/1116] Revert "AMDGPU: Divergence-driven selection of
 scalar buffer load intrinsics"

This reverts commit r344696 for now (except for some test additions).

See https://bugs.freedesktop.org/show_bug.cgi?id=108611.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346364 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/SIISelLowering.cpp       | 107 ++++--------
 lib/Target/AMDGPU/SIISelLowering.h         |   4 +-
 lib/Target/AMDGPU/SIInstrInfo.cpp          | 185 ++++++++++++++++++++-
 lib/Target/AMDGPU/SIInstrInfo.h            |   2 +
 lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp |   7 +-
 lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h   |   5 +-
 test/CodeGen/AMDGPU/smrd-fold-offset.mir   |   8 +-
 test/CodeGen/AMDGPU/smrd.ll                |  50 +++---
 8 files changed, 242 insertions(+), 126 deletions(-)

diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 672784a9873..254f1362f1f 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4847,70 +4847,6 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
   return SDValue(NewNode, 0);
 }
 
-SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
-                                       SDValue Offset, SDValue GLC,
-                                       SelectionDAG &DAG) const {
-  MachineFunction &MF = DAG.getMachineFunction();
-  MachineMemOperand *MMO = MF.getMachineMemOperand(
-      MachinePointerInfo(),
-      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
-          MachineMemOperand::MOInvariant,
-      VT.getStoreSize(), VT.getStoreSize());
-
-  if (!Offset->isDivergent()) {
-    SDValue Ops[] = {
-        Rsrc,
-        Offset, // Offset
-        GLC     // glc
-    };
-    return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
-                                   DAG.getVTList(VT), Ops, VT, MMO);
-  }
-
-  // We have a divergent offset. Emit a MUBUF buffer load instead. We can
-  // assume that the buffer is unswizzled.
-  SmallVector<SDValue, 4> Loads;
-  unsigned NumLoads = 1;
-  MVT LoadVT = VT.getSimpleVT();
-
-  assert(LoadVT == MVT::i32 || LoadVT == MVT::v2i32 || LoadVT == MVT::v4i32 ||
-         LoadVT == MVT::v8i32 || LoadVT == MVT::v16i32);
-
-  if (VT == MVT::v8i32 || VT == MVT::v16i32) {
-    NumLoads = VT == MVT::v16i32 ? 4 : 2;
-    LoadVT = MVT::v4i32;
-  }
-
-  SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
-  unsigned CachePolicy = cast<ConstantSDNode>(GLC)->getZExtValue();
-  SDValue Ops[] = {
-      DAG.getEntryNode(),                         // Chain
-      Rsrc,                                       // rsrc
-      DAG.getConstant(0, DL, MVT::i32),           // vindex
-      {},                                         // voffset
-      {},                                         // soffset
-      {},                                         // offset
-      DAG.getConstant(CachePolicy, DL, MVT::i32), // cachepolicy
-      DAG.getConstant(0, DL, MVT::i1),            // idxen
-  };
-
-  // Use the alignment to ensure that the required offsets will fit into the
-  // immediate offsets.
-  setBufferOffsets(Offset, DAG, &Ops[3], NumLoads > 1 ? 16 * NumLoads : 4);
-
-  uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue();
-  for (unsigned i = 0; i < NumLoads; ++i) {
-    Ops[5] = DAG.getConstant(InstOffset + 16 * i, DL, MVT::i32);
-    Loads.push_back(DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList,
-                                            Ops, LoadVT, MMO));
-  }
-
-  if (VT == MVT::v8i32 || VT == MVT::v16i32)
-    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
-
-  return Loads[0];
-}
-
 SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                   SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
@@ -5065,15 +5001,38 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                           SDLoc(DAG.getEntryNode()),
                           MFI->getArgInfo().WorkItemIDZ);
   case AMDGPUIntrinsic::SI_load_const: {
-    SDValue Load =
-        lowerSBuffer(MVT::i32, DL, Op.getOperand(1), Op.getOperand(2),
-                     DAG.getTargetConstant(0, DL, MVT::i1), DAG);
+    SDValue Ops[] = {
+      Op.getOperand(1),   // Ptr
+      Op.getOperand(2),   // Offset
+      DAG.getTargetConstant(0, DL, MVT::i1) // glc
+    };
+
+    MachineMemOperand *MMO = MF.getMachineMemOperand(
+        MachinePointerInfo(),
+        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+            MachineMemOperand::MOInvariant,
+        VT.getStoreSize(), 4);
+    SDVTList VTList = DAG.getVTList(MVT::i32);
+    SDValue Load = DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
+                                           VTList, Ops, MVT::i32, MMO);
+
     return DAG.getNode(ISD::BITCAST, DL, MVT::f32, Load);
   }
   case Intrinsic::amdgcn_s_buffer_load: {
     unsigned Cache = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
-    return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
-                        DAG.getTargetConstant(Cache & 1, DL, MVT::i1), DAG);
+    SDValue Ops[] = {
+      Op.getOperand(1), // Ptr
+      Op.getOperand(2), // Offset
+      DAG.getTargetConstant(Cache & 1, DL, MVT::i1) // glc
+    };
+
+    MachineMemOperand *MMO = MF.getMachineMemOperand(
+        MachinePointerInfo(),
+        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+            MachineMemOperand::MOInvariant,
+        VT.getStoreSize(), VT.getStoreSize());
+    return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
+                                   Op->getVTList(), Ops, VT, MMO);
   }
   case Intrinsic::amdgcn_fdiv_fast:
     return lowerFDIV_FAST(Op, DAG);
@@ -6108,13 +6067,13 @@ std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
 // three offsets (voffset, soffset and instoffset) into the SDValue[3] array
 // pointed to by Offsets.
 void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
-                                        SelectionDAG &DAG, SDValue *Offsets,
-                                        unsigned Align) const {
+                                        SelectionDAG &DAG,
+                                        SDValue *Offsets) const {
   SDLoc DL(CombinedOffset);
   if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
     uint32_t Imm = C->getZExtValue();
     uint32_t SOffset, ImmOffset;
-    if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, Align)) {
+    if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget)) {
       Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
       Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
       Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
@@ -6126,8 +6085,8 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
     SDValue N1 = CombinedOffset.getOperand(1);
     uint32_t SOffset, ImmOffset;
     int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
-    if (Offset >= 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
-                                                Subtarget, Align)) {
+    if (Offset >= 0
+        && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset, Subtarget)) {
       Offsets[0] = N0;
       Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
       Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index d12c3ae4dba..73fa05ea58f 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -60,8 +60,6 @@ private:
                                  MVT VT, unsigned Offset) const;
   SDValue lowerImage(SDValue Op, const AMDGPU::ImageDimIntrinsicInfo *Intr,
                      SelectionDAG &DAG) const;
-  SDValue lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, SDValue Offset,
-                       SDValue GLC, SelectionDAG &DAG) const;
 
   SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
@@ -192,7 +190,7 @@ private:
   // three offsets (voffset, soffset and instoffset) into the SDValue[3] array
   // pointed to by Offsets.
   void setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG,
-                        SDValue *Offsets, unsigned Align = 4) const;
+                        SDValue *Offsets) const;
 
 public:
   SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI);
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index 4dd06df1233..562428ef37c 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3558,13 +3558,8 @@ void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
   // pointer value is uniform.
   MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
   if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
-    unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
-    SBase->setReg(SGPR);
-  }
-  MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff);
-  if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
-    unsigned SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
-    SOff->setReg(SGPR);
+      unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
+      SBase->setReg(SGPR);
   }
 }
 
@@ -4193,6 +4188,115 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
       Inst.eraseFromParent();
       continue;
+
+    case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
+    case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
+    case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
+    case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
+    case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR: {
+      unsigned VDst;
+      unsigned NewOpcode;
+
+      switch(Opcode) {
+      case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
+        NewOpcode = AMDGPU::BUFFER_LOAD_DWORD_OFFEN;
+        VDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+        break;
+      case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
+        NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
+        VDst = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+        break;
+      case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
+        NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
+        VDst = MRI.createVirtualRegister(&AMDGPU::VReg_128RegClass);
+        break;
+      case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
+      case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR:
+        splitScalarBuffer(Worklist, Inst);
+        Inst.eraseFromParent();
+        continue;
+      }
+
+      const MachineOperand *VAddr = getNamedOperand(Inst, AMDGPU::OpName::soff);
+      auto Add = MRI.getUniqueVRegDef(VAddr->getReg());
+      unsigned Offset = 0;
+
+      // FIXME: This isn't safe because the addressing mode doesn't work
+      // correctly if vaddr is negative.
+      //
+      // FIXME: Should probably be done somewhere else, maybe SIFoldOperands.
+      //
+      // See if we can extract an immediate offset by recognizing one of these:
+      //   V_ADD_I32_e32 dst, imm, src1
+      //   V_ADD_I32_e32 dst, (S_MOV_B32 imm), src1
+      // V_ADD will be removed by "Remove dead machine instructions".
+      if (Add &&
+          (Add->getOpcode() == AMDGPU::V_ADD_I32_e32 ||
+           Add->getOpcode() == AMDGPU::V_ADD_U32_e32 ||
+           Add->getOpcode() == AMDGPU::V_ADD_U32_e64)) {
+        static const unsigned SrcNames[2] = {
+          AMDGPU::OpName::src0,
+          AMDGPU::OpName::src1,
+        };
+
+        // Find a literal offset in one of source operands.
+        for (int i = 0; i < 2; i++) {
+          const MachineOperand *Src =
+            getNamedOperand(*Add, SrcNames[i]);
+
+          if (Src->isReg()) {
+            MachineInstr *Def = MRI.getUniqueVRegDef(Src->getReg());
+            if (Def) {
+              if (Def->isMoveImmediate())
+                Src = &Def->getOperand(1);
+              else if (Def->isCopy()) {
+                auto Mov = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
+                if (Mov && Mov->isMoveImmediate()) {
+                  Src = &Mov->getOperand(1);
+                }
+              }
+            }
+          }
+
+          if (Src) {
+            if (Src->isImm())
+              Offset = Src->getImm();
+            else if (Src->isCImm())
+              Offset = Src->getCImm()->getZExtValue();
+          }
+
+          if (Offset && isLegalMUBUFImmOffset(Offset)) {
+            VAddr = getNamedOperand(*Add, SrcNames[!i]);
+            break;
+          }
+
+          Offset = 0;
+        }
+      }
+
+      MachineInstr *NewInstr =
+          BuildMI(*MBB, Inst, Inst.getDebugLoc(),
+                  get(NewOpcode), VDst)
+              .add(*VAddr)                                        // vaddr
+              .add(*getNamedOperand(Inst, AMDGPU::OpName::sbase)) // srsrc
+              .addImm(0)                                          // soffset
+              .addImm(Offset)                                     // offset
+              .addImm(getNamedOperand(Inst, AMDGPU::OpName::glc)->getImm())
+              .addImm(0) // slc
+              .addImm(0) // tfe
+              .cloneMemRefs(Inst)
+              .getInstr();
+
+      MRI.replaceRegWith(getNamedOperand(Inst, AMDGPU::OpName::sdst)->getReg(),
+                         VDst);
+      addUsersToMoveToVALUWorklist(VDst, MRI, Worklist);
+      Inst.eraseFromParent();
+
+      // Legalize all operands other than the offset. Notably, convert the srsrc
+      // into SGPRs using v_readfirstlane if needed.
+      legalizeOperands(*NewInstr, MDT);
+      continue;
+    }
     }
 
     if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
@@ -4674,6 +4778,73 @@ void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
 }
 
+void SIInstrInfo::splitScalarBuffer(SetVectorType &Worklist,
+                                    MachineInstr &Inst) const {
+  MachineBasicBlock &MBB = *Inst.getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+  MachineBasicBlock::iterator MII = Inst;
+  auto &DL = Inst.getDebugLoc();
+
+  MachineOperand &Dest = *getNamedOperand(Inst, AMDGPU::OpName::sdst);;
+  MachineOperand &Rsrc = *getNamedOperand(Inst, AMDGPU::OpName::sbase);
+  MachineOperand &Offset = *getNamedOperand(Inst, AMDGPU::OpName::soff);
+  MachineOperand &Glc = *getNamedOperand(Inst, AMDGPU::OpName::glc);
+
+  unsigned Opcode = Inst.getOpcode();
+  unsigned NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
+  unsigned Count = 0;
+  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
+  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
+
+  switch(Opcode) {
+  default:
+    return;
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
+    Count = 2;
+    break;
+  case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR:
+    Count = 4;
+    break;
+  }
+
+  // FIXME: Should also attempt to build VAddr and Offset like the non-split
+  // case (see call site for this function)
+
+  // Create a vector of result registers
+  SmallVector<unsigned, 8> ResultRegs;
+  for (unsigned i = 0; i < Count ; ++i) {
+    unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_128RegClass);
+    MachineInstr &NewMI = *BuildMI(MBB, MII, DL, get(NewOpcode), ResultReg)
+      .addReg(Offset.getReg())  // offset
+      .addReg(Rsrc.getReg())    // rsrc
+      .addImm(0)                // soffset
+      .addImm(i << 4)           // inst_offset
+      .addImm(Glc.getImm())     // glc
+      .addImm(0)                // slc
+      .addImm(0)                // tfe
+      .addMemOperand(*Inst.memoperands_begin());
+    // Extract the 4 32 bit sub-registers from the result to add into the final REG_SEQUENCE
+    auto &NewDestOp = NewMI.getOperand(0);
+    for (unsigned i = 0 ; i < 4 ; i++)
+      ResultRegs.push_back(buildExtractSubReg(MII, MRI, NewDestOp, &AMDGPU::VReg_128RegClass,
+                                              RI.getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass));
+  }
+  // Create a new combined result to replace original with
+  unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
+  MachineInstrBuilder CombinedResBuilder = BuildMI(MBB, MII, DL,
+                                  get(TargetOpcode::REG_SEQUENCE), FullDestReg);
+
+  for (unsigned i = 0 ; i < Count * 4 ; ++i) {
+    CombinedResBuilder
+      .addReg(ResultRegs[i])
+      .addImm(RI.getSubRegFromChannel(i));
+  }
+
+  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
+  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
+}
+
 void SIInstrInfo::addUsersToMoveToVALUWorklist(
   unsigned DstReg,
   MachineRegisterInfo &MRI,
diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h
index 34cac88cbf1..2f51b199950 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@@ -103,6 +103,8 @@ private:
                             MachineInstr &Inst) const;
   void splitScalar64BitBFE(SetVectorType &Worklist,
                            MachineInstr &Inst) const;
+  void splitScalarBuffer(SetVectorType &Worklist,
+                         MachineInstr &Inst) const;
   void movePackToVALU(SetVectorType &Worklist,
                       MachineRegisterInfo &MRI,
                       MachineInstr &Inst) const;
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 9d567579d71..634ec8fcc3d 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -894,12 +894,9 @@ bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) {
 // Given Imm, split it into the values to put into the SOffset and ImmOffset
 // fields in an MUBUF instruction. Return false if it is not possible (due to a
 // hardware bug needing a workaround).
-//
-// The required alignment ensures that individual address components remain
-// aligned if they are aligned to begin with. It also ensures that additional
-// offsets within the given alignment can be added to the resulting ImmOffset.
 bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
-                      const GCNSubtarget *Subtarget, uint32_t Align) {
+                      const GCNSubtarget *Subtarget) {
+  const uint32_t Align = 4;
   const uint32_t MaxImm = alignDown(4095, Align);
   uint32_t Overflow = 0;
 
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index af5ab9bf269..d45f4249869 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -441,8 +441,11 @@ int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
 /// not the encoded offset.
 bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
 
+// Given Imm, split it into the values to put into the SOffset and ImmOffset
+// fields in an MUBUF instruction. Return false if it is not possible (due to a
+// hardware bug needing a workaround).
 bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
-                      const GCNSubtarget *Subtarget, uint32_t Align = 4);
+                      const GCNSubtarget *Subtarget);
 
 /// \returns true if the intrinsic is divergent
 bool isIntrinsicSourceOfDivergence(unsigned IntrID);
diff --git a/test/CodeGen/AMDGPU/smrd-fold-offset.mir b/test/CodeGen/AMDGPU/smrd-fold-offset.mir
index 10601ccaeb7..44954f06523 100644
--- a/test/CodeGen/AMDGPU/smrd-fold-offset.mir
+++ b/test/CodeGen/AMDGPU/smrd-fold-offset.mir
@@ -1,8 +1,6 @@
 # RUN: llc -march=amdgcn -run-pass si-fix-sgpr-copies -o - %s | FileCheck -check-prefix=GCN %s
 
-# GCN-LABEL: name: smrd_vgpr_offset_imm
-# GCN: V_READFIRSTLANE_B32
-# GCN: S_BUFFER_LOAD_DWORD_SGPR
+# GCN: BUFFER_LOAD_DWORD_OFFEN %{{[0-9]+}}, killed %{{[0-9]+}}, 0, 4095
 ---
 name:            smrd_vgpr_offset_imm
 body:             |
@@ -24,9 +22,7 @@ body:             |
     SI_RETURN_TO_EPILOG $vgpr0
 ...
 
-# GCN-LABEL: name: smrd_vgpr_offset_imm_add_u32
-# GCN: V_READFIRSTLANE_B32
-# GCN: S_BUFFER_LOAD_DWORD_SGPR
+# GCN: BUFFER_LOAD_DWORD_OFFEN %{{[0-9]+}}, killed %{{[0-9]+}}, 0, 4095
 ---
 name:            smrd_vgpr_offset_imm_add_u32
 body:             |
diff --git a/test/CodeGen/AMDGPU/smrd.ll b/test/CodeGen/AMDGPU/smrd.ll
index f453cfdbd1f..c87145a1a5b 100644
--- a/test/CodeGen/AMDGPU/smrd.ll
+++ b/test/CodeGen/AMDGPU/smrd.ll
@@ -292,19 +292,18 @@ main_body:
 
 ; GCN-LABEL: {{^}}smrd_vgpr_offset_imm:
 ; GCN-NEXT: %bb.
-; GCN-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen offset:4092 ;
+; GCN-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen offset:4095 ;
 define amdgpu_ps float @smrd_vgpr_offset_imm(<4 x i32> inreg %desc, i32 %offset) #0 {
 main_body:
-  %off = add i32 %offset, 4092
+  %off = add i32 %offset, 4095
   %r = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %off)
   ret float %r
 }
 
 ; GCN-LABEL: {{^}}smrd_vgpr_offset_imm_too_large:
 ; GCN-NEXT: %bb.
-; SICI-NEXT: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}0x1000, v0
-; SICI-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen ;
-; VIGFX9-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 4 offen offset:4092 ;
+; GCN-NEXT: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}0x1000, v0
+; GCN-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen ;
 define amdgpu_ps float @smrd_vgpr_offset_imm_too_large(<4 x i32> inreg %desc, i32 %offset) #0 {
 main_body:
   %off = add i32 %offset, 4096
@@ -511,15 +510,12 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}smrd_load_nonconst4:
-; SICI: v_add_i32_e32 v{{[0-9]+}}, vcc, 0xff8, v0 ;
-; SICI-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
-; SICI-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
-; SICI-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
-; SICI-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
-; VIGFX9-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 56 offen offset:4032 ;
-; VIGFX9-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 56 offen offset:4048 ;
-; VIGFX9-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 56 offen offset:4064 ;
-; VIGFX9-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 56 offen offset:4080 ;
+; SICIVI: v_add_{{i32|u32}}_e32 v{{[0-9]+}}, vcc, 0xff8, v0 ;
+; GFX9: v_add_u32_e32 v{{[0-9]+}}, 0xff8, v0 ;
+; GCN-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
+; GCN-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
+; GCN-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
+; GCN-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
 ; GCN: ; return to shader part epilog
 define amdgpu_ps <16 x float> @smrd_load_nonconst4(<4 x i32> inreg %rsrc, i32 %off) #0 {
 main_body:
@@ -530,16 +526,12 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}smrd_load_nonconst5:
-; SICI: v_add_i32_e32 v{{[0-9]+}}, vcc, 0x1004, v0
-; SICI-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
-; SICI-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
-; SICI-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
-; SICI-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
-; VIGFX9: s_movk_i32 s4, 0xfc0
-; VIGFX9-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], s4 offen offset:68 ;
-; VIGFX9-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], s4 offen offset:84 ;
-; VIGFX9-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], s4 offen offset:100 ;
-; VIGFX9-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], s4 offen offset:116 ;
+; SICIVI: v_add_{{i32|u32}}_e32 v{{[0-9]+}}, vcc, 0x1004, v0
+; GFX9: v_add_u32_e32 v{{[0-9]+}}, 0x1004, v0
+; GCN-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
+; GCN-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
+; GCN-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
+; GCN-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
 ; GCN: ; return to shader part epilog
 define amdgpu_ps <16 x float> @smrd_load_nonconst5(<4 x i32> inreg %rsrc, i32 %off) #0 {
 main_body:
@@ -567,10 +559,9 @@ main_body:
 
 ; GCN-LABEL: {{^}}smrd_uniform_loop:
 ;
-; TODO: we should keep the loop counter in an SGPR
+; TODO: this should use an s_buffer_load
 ;
-; GCN: v_readfirstlane_b32
-; GCN: s_buffer_load_dword
+; GCN: buffer_load_dword
 define amdgpu_ps float @smrd_uniform_loop(<4 x i32> inreg %desc, i32 %bound) #0 {
 main_body:
   br label %loop
@@ -594,10 +585,9 @@ exit:
 ; (this test differs from smrd_uniform_loop by the more complex structure of phis,
 ; which used to confuse the DivergenceAnalysis after structurization)
 ;
-; TODO: we should keep the loop counter in an SGPR
+; TODO: we should keep the loop counter in an SGPR and use an S_BUFFER_LOAD
 ;
-; GCN: v_readfirstlane_b32
-; GCN: s_buffer_load_dword
+; GCN: buffer_load_dword
 define amdgpu_ps float @smrd_uniform_loop2(<4 x i32> inreg %desc, i32 %bound, i32 %bound.a) #0 {
 main_body:
   br label %loop
-- 
GitLab


From 9a031b8a93abc05c7fdac300da593630e2e1da46 Mon Sep 17 00:00:00 2001
From: Jorge Gorbe Moya <jgorbe@google.com>
Date: Wed, 7 Nov 2018 22:30:01 +0000
Subject: [PATCH 1096/1116] Add parentheses to silence warning.

DWARFContext.cpp:356:20: error: using the result of an assignment as a condition without parentheses [-Werror,-Wparentheses]

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346365 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/DebugInfo/DWARF/DWARFContext.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp
index 99cf9b985c0..7ab54de6bc4 100644
--- a/lib/DebugInfo/DWARF/DWARFContext.cpp
+++ b/lib/DebugInfo/DWARF/DWARFContext.cpp
@@ -353,7 +353,7 @@ void DWARFContext::dump(
 
   auto dumpDebugInfo = [&](const char *Name, unit_iterator_range Units) {
     OS << '\n' << Name << " contents:\n";
-    if (DumpOffset = DumpOffsets[DIDT_ID_DebugInfo])
+    if ((DumpOffset = DumpOffsets[DIDT_ID_DebugInfo]))
       for (const auto &U : Units)
         U->getDIEForOffset(DumpOffset.getValue())
             .dump(OS, 0, DumpOpts.noImplicitRecursion());
-- 
GitLab


From 6b181a92166280ec9d751b437ed9fb7143ecd5c3 Mon Sep 17 00:00:00 2001
From: Eli Friedman <efriedma@codeaurora.org>
Date: Wed, 7 Nov 2018 22:30:56 +0000
Subject: [PATCH 1097/1116] [AArch64] [Windows] Address post-commit review
 comment on r346358.

In this context, usesWindowsCFI() is basically the same thing as
isOSWindows(), but it makes the relevant property of the target
more explicit.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346366 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64TargetMachine.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp
index 5168c0c67da..2f3f87d02b7 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -30,6 +30,7 @@
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/Function.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCTargetOptions.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CodeGen.h"
@@ -263,7 +264,7 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT,
     this->Options.NoTrapAfterNoreturn = true;
   }
 
-  if (TT.isOSWindows()) {
+  if (getMCAsmInfo()->usesWindowsCFI()) {
     // Unwinding can get confused if the last instruction in an
     // exception-handling region (function, funclet, try block, etc.)
     // is a call.
-- 
GitLab


From ba1ac60e22894fe10717eb1e73a2b52b2cfeb943 Mon Sep 17 00:00:00 2001
From: Nathan Lanza <nathan@lanza.io>
Date: Wed, 7 Nov 2018 23:22:09 +0000
Subject: [PATCH 1098/1116] Reorder FindPythonInterp so that config-ix can use
 PYTHON_EXECUTABLE

Summary:
Code in config-ix tries to call `PYTHON_EXECUTABLE` to search for some
python modules but that variable isn't set until the moved chunk of
code that finds Python is called.

Reorder it so CMake can use PYTHON_EXECUTABLE

Subscribers: mgorny, llvm-commits

Differential Revision: https://reviews.llvm.org/D52763

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346367 91177308-0d34-0410-b5e6-96231b3b80d8
---
 CMakeLists.txt | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6d767208edb..4591ce56b1f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -572,6 +572,22 @@ mark_as_advanced(LLVM_TARGET_TRIPLE_ENV)
 set(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR OFF CACHE BOOL
   "Enable per-target runtimes directory")
 
+# Verify that we can find a Python 2 interpreter.  Python 3 is unsupported.
+# FIXME: We should support systems with only Python 3, but that requires work
+# on LLDB.
+set(Python_ADDITIONAL_VERSIONS 2.7)
+include(FindPythonInterp)
+if( NOT PYTHONINTERP_FOUND )
+  message(FATAL_ERROR
+"Unable to find Python interpreter, required for builds and testing.
+
+Please install Python or specify the PYTHON_EXECUTABLE CMake variable.")
+endif()
+
+if( ${PYTHON_VERSION_STRING} VERSION_LESS 2.7 )
+  message(FATAL_ERROR "Python 2.7 or newer is required")
+endif()
+
 # All options referred to from HandleLLVMOptions have to be specified
 # BEFORE this include, otherwise options will not be correctly set on
 # first cmake run
@@ -591,22 +607,6 @@ message(STATUS "LLVM default target triple: ${LLVM_DEFAULT_TARGET_TRIPLE}")
 
 include(HandleLLVMOptions)
 
-# Verify that we can find a Python 2 interpreter.  Python 3 is unsupported.
-# FIXME: We should support systems with only Python 3, but that requires work
-# on LLDB.
-set(Python_ADDITIONAL_VERSIONS 2.7)
-include(FindPythonInterp)
-if( NOT PYTHONINTERP_FOUND )
-  message(FATAL_ERROR
-"Unable to find Python interpreter, required for builds and testing.
-
-Please install Python or specify the PYTHON_EXECUTABLE CMake variable.")
-endif()
-
-if( ${PYTHON_VERSION_STRING} VERSION_LESS 2.7 )
-  message(FATAL_ERROR "Python 2.7 or newer is required")
-endif()
-
 ######
 # LLVMBuild Integration
 #
-- 
GitLab


From bcb50253e9128093b09314e7cb941ab7cad32908 Mon Sep 17 00:00:00 2001
From: Daniel Sanders <daniel_l_sanders@apple.com>
Date: Wed, 7 Nov 2018 23:33:55 +0000
Subject: [PATCH 1099/1116] Add 'REQUIRES: default_triple' to
 test/CodeGen/MIR/X86/zero-probability.mir

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346368 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/MIR/X86/zero-probability.mir | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/CodeGen/MIR/X86/zero-probability.mir b/test/CodeGen/MIR/X86/zero-probability.mir
index c6863dfbbda..a0200229901 100644
--- a/test/CodeGen/MIR/X86/zero-probability.mir
+++ b/test/CodeGen/MIR/X86/zero-probability.mir
@@ -1,5 +1,6 @@
 # RUN: llc -run-pass=none -o /dev/null %s
 # REQUIRES: asserts
+# REQUIRES: default_triple
 # Makes sure that having a probability of 0x00000000 to branch to a successor
 # doesn't hit an APInt assert in the MIParser.
 
-- 
GitLab


From d079edbd8df3a1a0269e95409e92316fefe2ac19 Mon Sep 17 00:00:00 2001
From: Rong Xu <xur@google.com>
Date: Wed, 7 Nov 2018 23:51:20 +0000
Subject: [PATCH 1100/1116] [PGO] Exit early if all count values are zero

If all the edge counts for a function are zero, skip count population and
annotation, as nothing will happen. This can save some compile time.

Differential Revision: https://reviews.llvm.org/D54212


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346370 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../Instrumentation/PGOInstrumentation.cpp    | 15 ++++++++--
 .../PGOProfile/Inputs/func_entry.proftext     | 17 +++++++++++
 test/Transforms/PGOProfile/func_entry.ll      | 29 +++++++++++++++++++
 3 files changed, 58 insertions(+), 3 deletions(-)
 create mode 100644 test/Transforms/PGOProfile/Inputs/func_entry.proftext
 create mode 100644 test/Transforms/PGOProfile/func_entry.ll

diff --git a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index 4790c9e5cfe..876ae23dfd2 100644
--- a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -859,7 +859,7 @@ public:
         FreqAttr(FFA_Normal) {}
 
   // Read counts for the instrumented BB from profile.
-  bool readCounters(IndexedInstrProfReader *PGOReader);
+  bool readCounters(IndexedInstrProfReader *PGOReader, bool &AllZeros);
 
   // Populate the counts for all BBs.
   void populateCounters();
@@ -904,6 +904,7 @@ public:
     FuncInfo.dumpInfo(Str);
   }
 
+  uint64_t getProgramMaxCount() const { return ProgramMaxCount; }
 private:
   Function &F;
   Module *M;
@@ -1013,7 +1014,7 @@ void PGOUseFunc::setEdgeCount(DirectEdges &Edges, uint64_t Value) {
 // Read the profile from ProfileFileName and assign the value to the
 // instrumented BB and the edges. This function also updates ProgramMaxCount.
 // Return true if the profile are successfully read, and false on errors.
-bool PGOUseFunc::readCounters(IndexedInstrProfReader *PGOReader) {
+bool PGOUseFunc::readCounters(IndexedInstrProfReader *PGOReader, bool &AllZeros) {
   auto &Ctx = M->getContext();
   Expected<InstrProfRecord> Result =
       PGOReader->getInstrProfRecord(FuncInfo.FuncName, FuncInfo.FunctionHash);
@@ -1053,6 +1054,7 @@ bool PGOUseFunc::readCounters(IndexedInstrProfReader *PGOReader) {
     LLVM_DEBUG(dbgs() << "  " << I << ": " << CountFromProfile[I] << "\n");
     ValueSum += CountFromProfile[I];
   }
+  AllZeros = (ValueSum == 0);
 
   LLVM_DEBUG(dbgs() << "SUM =  " << ValueSum << "\n");
 
@@ -1477,8 +1479,15 @@ static bool annotateAllFunctions(
     // later in getInstrBB() to avoid invalidating it.
     SplitIndirectBrCriticalEdges(F, BPI, BFI);
     PGOUseFunc Func(F, &M, ComdatMembers, BPI, BFI);
-    if (!Func.readCounters(PGOReader.get()))
+    bool AllZeros = false;
+    if (!Func.readCounters(PGOReader.get(), AllZeros))
       continue;
+    if (AllZeros) {
+      F.setEntryCount(ProfileCount(0, Function::PCT_Real));
+      if (Func.getProgramMaxCount() != 0)
+        ColdFunctions.push_back(&F);
+      continue;
+    }
     Func.populateCounters();
     Func.setBranchWeights();
     Func.annotateValueSites();
diff --git a/test/Transforms/PGOProfile/Inputs/func_entry.proftext b/test/Transforms/PGOProfile/Inputs/func_entry.proftext
new file mode 100644
index 00000000000..2dc2c2ec9f3
--- /dev/null
+++ b/test/Transforms/PGOProfile/Inputs/func_entry.proftext
@@ -0,0 +1,17 @@
+# IR level Instrumentation Flag
+:ir
+foo
+# Func Hash:
+12884901887
+# Num Counters:
+1
+# Counter Values:
+9999
+
+bar
+# Func Hash:
+12884901887
+# Num Counters:
+1
+# Counter Values:
+0
diff --git a/test/Transforms/PGOProfile/func_entry.ll b/test/Transforms/PGOProfile/func_entry.ll
new file mode 100644
index 00000000000..dac996e35cb
--- /dev/null
+++ b/test/Transforms/PGOProfile/func_entry.ll
@@ -0,0 +1,29 @@
+; RUN: llvm-profdata merge %S/Inputs/func_entry.proftext -o %t.profdata
+; RUN: opt < %s -passes=pgo-instr-use -pgo-test-profile-file=%t.profdata -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@s = common dso_local local_unnamed_addr global i32 0, align 4
+
+define void @bar() {
+; CHECK-LABEL: @bar
+; CHECK-SAME: !prof ![[FUNC_ENTRY_COUNT_ZERO:[0-9]+]]
+
+entry:
+  store i32 1, i32* @s, align 4
+  ret void
+}
+
+define void @foo() {
+; CHECK-LABEL: @foo
+; CHECK-SAME: !prof ![[FUNC_ENTRY_COUNT_NON_ZERO:[0-9]+]]
+entry:
+  %0 = load i32, i32* @s, align 4
+  %add = add nsw i32 %0, 4
+  store i32 %add, i32* @s, align 4
+  ret void
+}
+
+; CHECK-DAG: ![[FUNC_ENTRY_COUNT_ZERO]] = !{!"function_entry_count", i64 0}
+; CHECK-DAG: ![[FUNC_ENTRY_COUNT_NON_ZERO]] = !{!"function_entry_count", i64 9999}
-- 
GitLab


From 8e906179e358b0eaab378f5f93e293410c6e72e8 Mon Sep 17 00:00:00 2001
From: Jordan Rupprecht <rupprecht@google.com>
Date: Wed, 7 Nov 2018 23:53:50 +0000
Subject: [PATCH 1101/1116] [llvm-readobj] Implement LLVM style printer for
 --notes

Summary:
Port the GNU style printNotes method to the LLVMStyle subclass.

This is basically just a heavy refactor so that the note parsing/formatting logic from the GNUStyle::printNotes can be shared with LLVMStyle::printNotes.

Reviewers: MaskRay

Reviewed By: MaskRay

Subscribers: dschuff, fedor.sergeev, llvm-commits

Differential Revision: https://reviews.llvm.org/D54220

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346371 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/tools/llvm-readobj/gnu-notes.test       |  62 +++-
 test/tools/llvm-readobj/note-gnu-property.s  |  62 +++-
 test/tools/llvm-readobj/note-gnu-property2.s |  26 +-
 tools/llvm-readobj/ELFDumper.cpp             | 312 +++++++++++++------
 4 files changed, 341 insertions(+), 121 deletions(-)

diff --git a/test/tools/llvm-readobj/gnu-notes.test b/test/tools/llvm-readobj/gnu-notes.test
index 1a9c7e304b1..21078231f19 100644
--- a/test/tools/llvm-readobj/gnu-notes.test
+++ b/test/tools/llvm-readobj/gnu-notes.test
@@ -1,15 +1,55 @@
 # RUN: yaml2obj %s > %t.so
-# RUN: llvm-readobj -elf-output-style GNU --notes %t.so | FileCheck %s
+# RUN: llvm-readobj -elf-output-style GNU --notes %t.so | FileCheck %s --check-prefix=GNU
+# RUN: llvm-readobj -elf-output-style LLVM --notes %t.so | FileCheck %s --check-prefix=LLVM
 
-# CHECK: Displaying notes found at file offset 0x00000300 with length 0x00000020:
-# CHECK:   Owner                 Data size       Description
-# CHECK:   GNU                  0x00000010       NT_GNU_BUILD_ID (unique build ID bitstring)
-# CHECK:     Build ID: 4fcb712aa6387724a9f465a32cd8c14b
+# GNU:      Displaying notes found at file offset 0x00000340 with length 0x00000020:
+# GNU-NEXT:   Owner                 Data size       Description
+# GNU-NEXT:   GNU                   0x00000010      NT_GNU_ABI_TAG (ABI version tag)
+# GNU-NEXT:     OS: Linux, ABI: 2.6.32
 
-# CHECK: Displaying notes found at file offset 0x0000036c with length 0x0000001c:
-# CHECK:   Owner                 Data size       Description
-# CHECK:   GNU                  0x00000009       NT_GNU_GOLD_VERSION (gold version)
-# CHECK:     Version: gold 1.11
+# GNU:      Displaying notes found at file offset 0x00000360 with length 0x00000020:
+# GNU-NEXT:   Owner                 Data size       Description
+# GNU-NEXT:   GNU                  0x00000010       NT_GNU_BUILD_ID (unique build ID bitstring)
+# GNU-NEXT:     Build ID: 4fcb712aa6387724a9f465a32cd8c14b
+
+# GNU:      Displaying notes found at file offset 0x000003cc with length 0x0000001c:
+# GNU-NEXT:   Owner                 Data size       Description
+# GNU-NEXT:   GNU                  0x00000009       NT_GNU_GOLD_VERSION (gold version)
+# GNU-NEXT:     Version: gold 1.11
+
+# LLVM:      Notes [
+# LLVM-NEXT:   NoteSection {
+# LLVM-NEXT:     Offset: 0x340
+# LLVM-NEXT:     Size: 0x20
+# LLVM-NEXT:     Note {
+# LLVM-NEXT:       Owner: GNU
+# LLVM-NEXT:       Data size: 0x10
+# LLVM-NEXT:       Type: NT_GNU_ABI_TAG (ABI version tag)
+# LLVM-NEXT:       OS: Linux
+# LLVM-NEXT:       ABI: 2.6.32
+# LLVM-NEXT:     }
+# LLVM-NEXT:   }
+# LLVM-NEXT:   NoteSection {
+# LLVM-NEXT:     Offset: 0x360
+# LLVM-NEXT:     Size: 0x20
+# LLVM-NEXT:     Note {
+# LLVM-NEXT:       Owner: GNU
+# LLVM-NEXT:       Data size: 0x10
+# LLVM-NEXT:       Type: NT_GNU_BUILD_ID (unique build ID bitstring)
+# LLVM-NEXT:       Build ID: 4fcb712aa6387724a9f465a32cd8c14b
+# LLVM-NEXT:     }
+# LLVM-NEXT:   }
+# LLVM-NEXT:   NoteSection {
+# LLVM-NEXT:     Offset: 0x3CC
+# LLVM-NEXT:     Size: 0x1C
+# LLVM-NEXT:     Note {
+# LLVM-NEXT:       Owner: GNU
+# LLVM-NEXT:       Data size: 0x9
+# LLVM-NEXT:       Type: NT_GNU_GOLD_VERSION (gold version)
+# LLVM-NEXT:       Version: gold 1.11
+# LLVM-NEXT:     }
+# LLVM-NEXT:   }
+# LLVM-NEXT: ]
 
 --- !ELF
 FileHeader:
@@ -18,6 +58,10 @@ FileHeader:
   Type:            ET_EXEC
   Machine:         EM_X86_64
 Sections:
+  - Name:            .note.ABI-tag
+    Type:            SHT_NOTE
+    AddressAlign:    0x0000000000000004
+    Content:         040000001000000001000000474E550000000000020000000600000020000000
   - Name:            .note.gnu.build-id
     Type:            SHT_NOTE
     Flags:           [ SHF_ALLOC ]
diff --git a/test/tools/llvm-readobj/note-gnu-property.s b/test/tools/llvm-readobj/note-gnu-property.s
index f0a9b131ed5..d513a3e460c 100644
--- a/test/tools/llvm-readobj/note-gnu-property.s
+++ b/test/tools/llvm-readobj/note-gnu-property.s
@@ -1,23 +1,51 @@
 // REQUIRES: x86-registered-target
 // RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o %t
-// RUN: llvm-readobj -elf-output-style GNU --notes %t | FileCheck %s
+// RUN: llvm-readobj -elf-output-style GNU --notes %t | FileCheck %s --check-prefix=GNU
+// RUN: llvm-readobj -elf-output-style LLVM --notes %t | FileCheck %s --check-prefix=LLVM
 
-// CHECK:      Displaying notes found at file offset 0x00000040 with length 0x000000b8:
-// CHECK-NEXT:   Owner                 Data size       Description
-// CHECK-NEXT:   GNU                   0x000000a8      NT_GNU_PROPERTY_TYPE_0 (property note)
-// CHECK-NEXT:     Properties:  stack size: 0x100
-// CHECK-NEXT:     stack size: 0x100
-// CHECK-NEXT:     no copy on protected
-// CHECK-NEXT:     X86 features: SHSTK
-// CHECK-NEXT:     X86 features: IBT, SHSTK
-// CHECK-NEXT:     X86 features: none
-// CHECK-NEXT:     <application-specific type 0xfefefefe>
-// CHECK-NEXT:     stack size: <corrupt length: 0x0>
-// CHECK-NEXT:     stack size: <corrupt length: 0x4> 
-// CHECK-NEXT:     no copy on protected <corrupt length: 0x1>
-// CHECK-NEXT:     X86 features: <corrupt length: 0x0>
-// CHECK-NEXT:     X86 features: IBT, <unknown flags: 0xf000f000f000f000>
-// CHECK-NEXT:     <corrupt type (0x2) datasz: 0x1>
+// GNU:      Displaying notes found at file offset 0x00000040 with length 0x000000b8:
+// GNU-NEXT:   Owner                 Data size       Description
+// GNU-NEXT:   GNU                   0x000000a8      NT_GNU_PROPERTY_TYPE_0 (property note)
+// GNU-NEXT:     Properties:  stack size: 0x100
+// GNU-NEXT:     stack size: 0x100
+// GNU-NEXT:     no copy on protected
+// GNU-NEXT:     X86 features: SHSTK
+// GNU-NEXT:     X86 features: IBT, SHSTK
+// GNU-NEXT:     X86 features: none
+// GNU-NEXT:     <application-specific type 0xfefefefe>
+// GNU-NEXT:     stack size: <corrupt length: 0x0>
+// GNU-NEXT:     stack size: <corrupt length: 0x4>
+// GNU-NEXT:     no copy on protected <corrupt length: 0x1>
+// GNU-NEXT:     X86 features: <corrupt length: 0x0>
+// GNU-NEXT:     X86 features: IBT, <unknown flags: 0xf000f000f000f000>
+// GNU-NEXT:     <corrupt type (0x2) datasz: 0x1>
+
+// LLVM:      Notes [
+// LLVM-NEXT:   NoteSection {
+// LLVM-NEXT:     Offset: 0x40
+// LLVM-NEXT:     Size: 0xB8
+// LLVM-NEXT:     Note {
+// LLVM-NEXT:       Owner: GNU
+// LLVM-NEXT:       Data size: 0xA8
+// LLVM-NEXT:       Type: NT_GNU_PROPERTY_TYPE_0 (property note)
+// LLVM-NEXT:       Property [
+// LLVM-NEXT:         stack size: 0x100
+// LLVM-NEXT:         stack size: 0x100
+// LLVM-NEXT:         no copy on protected
+// LLVM-NEXT:         X86 features: SHSTK
+// LLVM-NEXT:         X86 features: IBT, SHSTK
+// LLVM-NEXT:         X86 features: none
+// LLVM-NEXT:         <application-specific type 0xfefefefe>
+// LLVM-NEXT:         stack size: <corrupt length: 0x0>
+// LLVM-NEXT:         stack size: <corrupt length: 0x4>
+// LLVM-NEXT:         no copy on protected <corrupt length: 0x1>
+// LLVM-NEXT:         X86 features: <corrupt length: 0x0>
+// LLVM-NEXT:         X86 features: IBT, <unknown flags: 0xf000f000f000f000>
+// LLVM-NEXT:         <corrupt type (0x2) datasz: 0x1>
+// LLVM-NEXT:       ]
+// LLVM-NEXT:     }
+// LLVM-NEXT:   }
+// LLVM-NEXT: ]
 
 .section ".note.gnu.property", "a"
 .align 4 
diff --git a/test/tools/llvm-readobj/note-gnu-property2.s b/test/tools/llvm-readobj/note-gnu-property2.s
index a7eca87eb3c..473e0a24a7e 100644
--- a/test/tools/llvm-readobj/note-gnu-property2.s
+++ b/test/tools/llvm-readobj/note-gnu-property2.s
@@ -1,11 +1,27 @@
 // REQUIRES: x86-registered-target
 // RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o %t
-// RUN: llvm-readobj -elf-output-style GNU --notes %t | FileCheck %s
+// RUN: llvm-readobj -elf-output-style GNU --notes %t | FileCheck %s --check-prefix=GNU
+// RUN: llvm-readobj -elf-output-style LLVM --notes %t | FileCheck %s --check-prefix=LLVM
 
-// CHECK:      Displaying notes found at file offset 0x00000040 with length 0x00000014:
-// CHECK-NEXT:   Owner                 Data size       Description
-// CHECK-NEXT:   GNU                   0x00000004      NT_GNU_PROPERTY_TYPE_0 (property note)
-// CHECK-NEXT:     Properties:  <corrupted GNU_PROPERTY_TYPE_0>
+// GNU:      Displaying notes found at file offset 0x00000040 with length 0x00000014:
+// GNU-NEXT:   Owner                 Data size       Description
+// GNU-NEXT:   GNU                   0x00000004      NT_GNU_PROPERTY_TYPE_0 (property note)
+// GNU-NEXT:     Properties:  <corrupted GNU_PROPERTY_TYPE_0>
+
+// LLVM:      Notes [
+// LLVM-NEXT:   NoteSection {
+// LLVM-NEXT:     Offset: 0x40
+// LLVM-NEXT:     Size: 0x14
+// LLVM-NEXT:     Note {
+// LLVM-NEXT:       Owner: GNU
+// LLVM-NEXT:       Data size: 0x4
+// LLVM-NEXT:       Type: NT_GNU_PROPERTY_TYPE_0 (property note)
+// LLVM-NEXT:       Property [
+// LLVM-NEXT:         <corrupted GNU_PROPERTY_TYPE_0>
+// LLVM-NEXT:       ]
+// LLVM-NEXT:     }
+// LLVM-NEXT:   }
+// LLVM-NEXT: ]
 
 // Section below is broken, check we report that.
 
diff --git a/tools/llvm-readobj/ELFDumper.cpp b/tools/llvm-readobj/ELFDumper.cpp
index ae9da9ace22..c91d2c548bf 100644
--- a/tools/llvm-readobj/ELFDumper.cpp
+++ b/tools/llvm-readobj/ELFDumper.cpp
@@ -22,9 +22,9 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/PointerIntPair.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
@@ -43,6 +43,7 @@
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Format.h"
+#include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/LEB128.h"
 #include "llvm/Support/MathExtras.h"
@@ -3654,40 +3655,41 @@ static std::string getAMDGPUNoteTypeName(const uint32_t NT) {
 }
 
 template <typename ELFT>
-static void printGNUProperty(raw_ostream &OS, uint32_t Type, uint32_t DataSize,
-                             ArrayRef<uint8_t> Data) {
+static std::string getGNUProperty(uint32_t Type, uint32_t DataSize,
+                                  ArrayRef<uint8_t> Data) {
+  std::string str;
+  raw_string_ostream OS(str);
   switch (Type) {
   default:
-    OS << format("    <application-specific type 0x%x>\n", Type);
-    return;
+    OS << format("<application-specific type 0x%x>", Type);
+    return OS.str();
   case GNU_PROPERTY_STACK_SIZE: {
-    OS << "    stack size: ";
+    OS << "stack size: ";
     if (DataSize == sizeof(typename ELFT::uint))
-      OS << format("0x%llx\n",
-                   (uint64_t)(*(const typename ELFT::Addr *)Data.data()));
+      OS << formatv("{0:x}",
+                    (uint64_t)(*(const typename ELFT::Addr *)Data.data()));
     else
-      OS << format("<corrupt length: 0x%x>\n", DataSize);
-    break;
+      OS << format("<corrupt length: 0x%x>", DataSize);
+    return OS.str();
   }
   case GNU_PROPERTY_NO_COPY_ON_PROTECTED:
-    OS << "    no copy on protected";
+    OS << "no copy on protected";
     if (DataSize)
       OS << format(" <corrupt length: 0x%x>", DataSize);
-    OS << "\n";
-    break;
+    return OS.str();
   case GNU_PROPERTY_X86_FEATURE_1_AND:
-    OS << "    X86 features: ";
+    OS << "X86 features: ";
     if (DataSize != 4 && DataSize != 8) {
-      OS << format("<corrupt length: 0x%x>\n", DataSize);
-      break;
+      OS << format("<corrupt length: 0x%x>", DataSize);
+      return OS.str();
     }
     uint64_t CFProtection =
         (DataSize == 4)
             ? support::endian::read32<ELFT::TargetEndianness>(Data.data())
             : support::endian::read64<ELFT::TargetEndianness>(Data.data());
     if (CFProtection == 0) {
-      OS << "none\n";
-      break;
+      OS << "none";
+      return OS.str();
     }
     if (CFProtection & GNU_PROPERTY_X86_FEATURE_1_IBT) {
       OS << "IBT";
@@ -3703,105 +3705,144 @@ static void printGNUProperty(raw_ostream &OS, uint32_t Type, uint32_t DataSize,
     }
     if (CFProtection)
       OS << format("<unknown flags: 0x%llx>", CFProtection);
-    OS << "\n";
-    break;
+    return OS.str();
   }
 }
 
 template <typename ELFT>
-static void printGNUNote(raw_ostream &OS, uint32_t NoteType,
-                         ArrayRef<typename ELFT::Word> Words, size_t Size) {
+static SmallVector<std::string, 4>
+getGNUPropertyList(ArrayRef<typename ELFT::Word> Words) {
   using Elf_Word = typename ELFT::Word;
 
+  SmallVector<std::string, 4> Properties;
+  ArrayRef<uint8_t> Arr(reinterpret_cast<const uint8_t *>(Words.data()),
+                        Words.size());
+  while (Arr.size() >= 8) {
+    uint32_t Type = *reinterpret_cast<const Elf_Word *>(Arr.data());
+    uint32_t DataSize = *reinterpret_cast<const Elf_Word *>(Arr.data() + 4);
+    Arr = Arr.drop_front(8);
+
+    // Take padding size into account if present.
+    uint64_t PaddedSize = alignTo(DataSize, sizeof(typename ELFT::uint));
+    std::string str;
+    raw_string_ostream OS(str);
+    if (Arr.size() < PaddedSize) {
+      OS << format("<corrupt type (0x%x) datasz: 0x%x>", Type, DataSize);
+      Properties.push_back(OS.str());
+      break;
+    }
+    Properties.push_back(
+        getGNUProperty<ELFT>(Type, DataSize, Arr.take_front(PaddedSize)));
+    Arr = Arr.drop_front(PaddedSize);
+  }
+
+  if (!Arr.empty())
+    Properties.push_back("<corrupted GNU_PROPERTY_TYPE_0>");
+
+  return Properties;
+}
+
+struct GNUAbiTag {
+  std::string OSName;
+  std::string ABI;
+  bool IsValid;
+};
+
+template <typename ELFT>
+static GNUAbiTag getGNUAbiTag(ArrayRef<typename ELFT::Word> Words) {
+  if (Words.size() < 4)
+    return {"", "", /*IsValid=*/false};
+
+  static const char *OSNames[] = {
+      "Linux", "Hurd", "Solaris", "FreeBSD", "NetBSD", "Syllable", "NaCl",
+  };
+  StringRef OSName = "Unknown";
+  if (Words[0] < array_lengthof(OSNames))
+    OSName = OSNames[Words[0]];
+  uint32_t Major = Words[1], Minor = Words[2], Patch = Words[3];
+  std::string str;
+  raw_string_ostream ABI(str);
+  ABI << Major << "." << Minor << "." << Patch;
+  return {OSName, ABI.str(), /*IsValid=*/true};
+}
+
+template <typename ELFT>
+static std::string getGNUBuildId(ArrayRef<typename ELFT::Word> Words) {
+  std::string str;
+  raw_string_ostream OS(str);
+  ArrayRef<uint8_t> ID(reinterpret_cast<const uint8_t *>(Words.data()),
+                       Words.size());
+  for (const auto &B : ID)
+    OS << format_hex_no_prefix(B, 2);
+  return OS.str();
+}
+
+template <typename ELFT>
+static StringRef getGNUGoldVersion(ArrayRef<typename ELFT::Word> Words) {
+  return StringRef(reinterpret_cast<const char *>(Words.data()), Words.size());
+}
+
+template <typename ELFT>
+static void printGNUNote(raw_ostream &OS, uint32_t NoteType,
+                         ArrayRef<typename ELFT::Word> Words) {
   switch (NoteType) {
   default:
     return;
   case ELF::NT_GNU_ABI_TAG: {
-    static const char *OSNames[] = {
-        "Linux", "Hurd", "Solaris", "FreeBSD", "NetBSD", "Syllable", "NaCl",
-    };
-
-    StringRef OSName = "Unknown";
-    if (Words[0] < array_lengthof(OSNames))
-      OSName = OSNames[Words[0]];
-    uint32_t Major = Words[1], Minor = Words[2], Patch = Words[3];
-
-    if (Words.size() < 4)
+    const GNUAbiTag &AbiTag = getGNUAbiTag<ELFT>(Words);
+    if (!AbiTag.IsValid)
       OS << "    <corrupt GNU_ABI_TAG>";
     else
-      OS << "    OS: " << OSName << ", ABI: " << Major << "." << Minor << "."
-         << Patch;
+      OS << "    OS: " << AbiTag.OSName << ", ABI: " << AbiTag.ABI;
     break;
   }
   case ELF::NT_GNU_BUILD_ID: {
-    OS << "    Build ID: ";
-    ArrayRef<uint8_t> ID(reinterpret_cast<const uint8_t *>(Words.data()), Size);
-    for (const auto &B : ID)
-      OS << format_hex_no_prefix(B, 2);
+    OS << "    Build ID: " << getGNUBuildId<ELFT>(Words);
     break;
   }
   case ELF::NT_GNU_GOLD_VERSION:
-    OS << "    Version: "
-       << StringRef(reinterpret_cast<const char *>(Words.data()), Size);
+    OS << "    Version: " << getGNUGoldVersion<ELFT>(Words);
     break;
   case ELF::NT_GNU_PROPERTY_TYPE_0:
     OS << "    Properties:";
-
-    ArrayRef<uint8_t> Arr(reinterpret_cast<const uint8_t *>(Words.data()),
-                          Size);
-    while (Arr.size() >= 8) {
-      uint32_t Type = *reinterpret_cast<const Elf_Word *>(Arr.data());
-      uint32_t DataSize = *reinterpret_cast<const Elf_Word *>(Arr.data() + 4);
-      Arr = Arr.drop_front(8);
-
-      // Take padding size into account if present.
-      uint64_t PaddedSize = alignTo(DataSize, sizeof(typename ELFT::uint));
-      if (Arr.size() < PaddedSize) {
-        OS << format("    <corrupt type (0x%x) datasz: 0x%x>\n", Type,
-                     DataSize);
-        break;
-      }
-      printGNUProperty<ELFT>(OS, Type, DataSize, Arr.take_front(PaddedSize));
-      Arr = Arr.drop_front(PaddedSize);
-    }
-
-    if (!Arr.empty())
-      OS << "    <corrupted GNU_PROPERTY_TYPE_0>";
+    for (const auto &Property : getGNUPropertyList<ELFT>(Words))
+      OS << "    " << Property << "\n";
     break;
   }
   OS << '\n';
 }
 
+struct AMDGPUNote {
+  std::string type;
+  std::string value;
+};
+
 template <typename ELFT>
-static void printAMDGPUNote(raw_ostream &OS, uint32_t NoteType,
-                            ArrayRef<typename ELFT::Word> Words, size_t Size) {
+static AMDGPUNote getAMDGPUNote(uint32_t NoteType,
+                                ArrayRef<typename ELFT::Word> Words) {
   switch (NoteType) {
   default:
-    return;
-    case ELF::NT_AMD_AMDGPU_HSA_METADATA:
-      OS << "    HSA Metadata:\n"
-         << StringRef(reinterpret_cast<const char *>(Words.data()), Size);
-      break;
-    case ELF::NT_AMD_AMDGPU_ISA:
-      OS << "    ISA Version:\n"
-         << "        "
-         << StringRef(reinterpret_cast<const char *>(Words.data()), Size);
-      break;
-    case ELF::NT_AMD_AMDGPU_PAL_METADATA:
-      const uint32_t *PALMetadataBegin = reinterpret_cast<const uint32_t *>(Words.data());
-      const uint32_t *PALMetadataEnd = PALMetadataBegin + Size;
-      std::vector<uint32_t> PALMetadata(PALMetadataBegin, PALMetadataEnd);
-      std::string PALMetadataString;
-      auto Error = AMDGPU::PALMD::toString(PALMetadata, PALMetadataString);
-      OS << "    PAL Metadata:\n";
-      if (Error) {
-        OS << "        Invalid";
-        return;
-      }
-      OS << PALMetadataString;
-      break;
+    return {"", ""};
+  case ELF::NT_AMD_AMDGPU_HSA_METADATA:
+    return {"HSA Metadata",
+            std::string(reinterpret_cast<const char *>(Words.data()),
+                        Words.size())};
+  case ELF::NT_AMD_AMDGPU_ISA:
+    return {"ISA Version",
+            std::string(reinterpret_cast<const char *>(Words.data()),
+                        Words.size())};
+  case ELF::NT_AMD_AMDGPU_PAL_METADATA:
+    const uint32_t *PALMetadataBegin =
+        reinterpret_cast<const uint32_t *>(Words.data());
+    const uint32_t *PALMetadataEnd = PALMetadataBegin + Words.size();
+    std::vector<uint32_t> PALMetadata(PALMetadataBegin, PALMetadataEnd);
+    std::string PALMetadataString;
+    auto Error = AMDGPU::PALMD::toString(PALMetadata, PALMetadataString);
+    if (Error) {
+      return {"PAL Metadata", "Invalid"};
+    }
+    return {"PAL Metadata", PALMetadataString};
   }
-  OS.flush();
 }
 
 template <class ELFT>
@@ -3826,12 +3867,14 @@ void GNUStyle<ELFT>::printNotes(const ELFFile<ELFT> *Obj) {
 
     if (Name == "GNU") {
       OS << getGNUNoteTypeName(Type) << '\n';
-      printGNUNote<ELFT>(OS, Type, Descriptor, Descriptor.size());
+      printGNUNote<ELFT>(OS, Type, Descriptor);
     } else if (Name == "FreeBSD") {
       OS << getFreeBSDNoteTypeName(Type) << '\n';
     } else if (Name == "AMD") {
       OS << getAMDGPUNoteTypeName(Type) << '\n';
-      printAMDGPUNote<ELFT>(OS, Type, Descriptor, Descriptor.size());
+      const AMDGPUNote N = getAMDGPUNote<ELFT>(Type, Descriptor);
+      if (!N.type.empty())
+        OS << "    " << N.type << ":\n        " << N.value << '\n';
     } else {
       OS << "Unknown note type: (" << format_hex(Type, 10) << ')';
     }
@@ -4435,9 +4478,98 @@ void LLVMStyle<ELFT>::printAddrsig(const ELFFile<ELFT> *Obj) {
   }
 }
 
+template <typename ELFT>
+static void printGNUNoteLLVMStyle(uint32_t NoteType,
+                                  ArrayRef<typename ELFT::Word> Words,
+                                  ScopedPrinter &W) {
+  switch (NoteType) {
+  default:
+    return;
+  case ELF::NT_GNU_ABI_TAG: {
+    const GNUAbiTag &AbiTag = getGNUAbiTag<ELFT>(Words);
+    if (!AbiTag.IsValid) {
+      W.printString("ABI", "<corrupt GNU_ABI_TAG>");
+    } else {
+      W.printString("OS", AbiTag.OSName);
+      W.printString("ABI", AbiTag.ABI);
+    }
+    break;
+  }
+  case ELF::NT_GNU_BUILD_ID: {
+    W.printString("Build ID", getGNUBuildId<ELFT>(Words));
+    break;
+  }
+  case ELF::NT_GNU_GOLD_VERSION:
+    W.printString("Version", getGNUGoldVersion<ELFT>(Words));
+    break;
+  case ELF::NT_GNU_PROPERTY_TYPE_0:
+    ListScope D(W, "Property");
+    for (const auto &Property : getGNUPropertyList<ELFT>(Words))
+      W.printString(Property);
+    break;
+  }
+}
+
 template <class ELFT>
 void LLVMStyle<ELFT>::printNotes(const ELFFile<ELFT> *Obj) {
-  W.startLine() << "printNotes not implemented!\n";
+  ListScope L(W, "Notes");
+  const Elf_Ehdr *e = Obj->getHeader();
+  bool IsCore = e->e_type == ELF::ET_CORE;
+
+  auto PrintHeader = [&](const typename ELFT::Off Offset,
+                         const typename ELFT::Addr Size) {
+    W.printHex("Offset", Offset);
+    W.printHex("Size", Size);
+  };
+
+  auto ProcessNote = [&](const Elf_Note &Note) {
+    DictScope D2(W, "Note");
+    StringRef Name = Note.getName();
+    ArrayRef<Elf_Word> Descriptor = Note.getDesc();
+    Elf_Word Type = Note.getType();
+
+    W.printString("Owner", Name);
+    W.printHex("Data size", Descriptor.size());
+    if (Name == "GNU") {
+      W.printString("Type", getGNUNoteTypeName(Type));
+      printGNUNoteLLVMStyle<ELFT>(Type, Descriptor, W);
+    } else if (Name == "FreeBSD") {
+      W.printString("Type", getFreeBSDNoteTypeName(Type));
+    } else if (Name == "AMD") {
+      W.printString("Type", getAMDGPUNoteTypeName(Type));
+      const AMDGPUNote N = getAMDGPUNote<ELFT>(Type, Descriptor);
+      if (!N.type.empty())
+        W.printString(N.type, N.value);
+    } else {
+      W.getOStream() << "Unknown note type: (" << format_hex(Type, 10) << ')';
+    }
+  };
+
+  if (IsCore) {
+    for (const auto &P : unwrapOrError(Obj->program_headers())) {
+      if (P.p_type != PT_NOTE)
+        continue;
+      DictScope D(W, "NoteSection");
+      PrintHeader(P.p_offset, P.p_filesz);
+      Error Err = Error::success();
+      for (const auto &Note : Obj->notes(P, Err))
+        ProcessNote(Note);
+      if (Err)
+        error(std::move(Err));
+    }
+  } else {
+    for (const auto &S : unwrapOrError(Obj->sections())) {
+      if (S.sh_type != SHT_NOTE)
+        continue;
+      DictScope D(W, "NoteSection");
+      PrintHeader(S.sh_offset, S.sh_size);
+      Error Err = Error::success();
+      for (const auto &Note : Obj->notes(S, Err))
+        ProcessNote(Note);
+      if (Err)
+        error(std::move(Err));
+    }
+  }
 }
 
 template <class ELFT>
-- 
GitLab


From 33443e8e9470f2ef09ac18b4c6de3efdc561bdae Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Thu, 8 Nov 2018 00:01:32 +0000
Subject: [PATCH 1102/1116] Extend virtual file system with `isLocal` method

Expose the `llvm::sys::fs::is_local` function through the VFS.

Differential revision: https://reviews.llvm.org/D54127

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346372 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Support/VirtualFileSystem.h    |  6 ++++-
 lib/Support/VirtualFileSystem.cpp           | 25 +++++++++++++++++++++
 unittests/Support/VirtualFileSystemTest.cpp | 11 +++++++++
 3 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/include/llvm/Support/VirtualFileSystem.h b/include/llvm/Support/VirtualFileSystem.h
index 282893e21dc..b3326bbbe48 100644
--- a/include/llvm/Support/VirtualFileSystem.h
+++ b/include/llvm/Support/VirtualFileSystem.h
@@ -279,6 +279,9 @@ public:
   /// Check whether a file exists. Provided for convenience.
   bool exists(const Twine &Path);
 
+  /// Is the file mounted on a local filesystem?
+  virtual std::error_code isLocal(const Twine &Path, bool &Result);
+
   /// Make \a Path an absolute path.
   ///
   /// Makes \a Path absolute using the current directory if it is not already.
@@ -326,6 +329,7 @@ public:
   directory_iterator dir_begin(const Twine &Dir, std::error_code &EC) override;
   llvm::ErrorOr<std::string> getCurrentWorkingDirectory() const override;
   std::error_code setCurrentWorkingDirectory(const Twine &Path) override;
+  std::error_code isLocal(const Twine &Path, bool &Result) override;
   std::error_code getRealPath(const Twine &Path,
                               SmallVectorImpl<char> &Output) const override;
 
@@ -463,7 +467,7 @@ public:
   /// system.
   std::error_code getRealPath(const Twine &Path,
                               SmallVectorImpl<char> &Output) const override;
-
+  std::error_code isLocal(const Twine &Path, bool &Result) override;
   std::error_code setCurrentWorkingDirectory(const Twine &Path) override;
 };
 
diff --git a/lib/Support/VirtualFileSystem.cpp b/lib/Support/VirtualFileSystem.cpp
index 9440eacaa89..e8b0435b9cd 100644
--- a/lib/Support/VirtualFileSystem.cpp
+++ b/lib/Support/VirtualFileSystem.cpp
@@ -136,6 +136,10 @@ std::error_code FileSystem::getRealPath(const Twine &Path,
   return errc::operation_not_permitted;
 }
 
+std::error_code FileSystem::isLocal(const Twine &Path, bool &Result) {
+  return errc::operation_not_permitted;
+}
+
 bool FileSystem::exists(const Twine &Path) {
   auto Status = status(Path);
   return Status && Status->exists();
@@ -233,6 +237,7 @@ public:
 
   llvm::ErrorOr<std::string> getCurrentWorkingDirectory() const override;
   std::error_code setCurrentWorkingDirectory(const Twine &Path) override;
+  std::error_code isLocal(const Twine &Path, bool &Result) override;
   std::error_code getRealPath(const Twine &Path,
                               SmallVectorImpl<char> &Output) const override;
 
@@ -288,6 +293,10 @@ std::error_code RealFileSystem::setCurrentWorkingDirectory(const Twine &Path) {
   return std::error_code();
 }
 
+std::error_code RealFileSystem::isLocal(const Twine &Path, bool &Result) {
+  return llvm::sys::fs::is_local(Path, Result);
+}
+
 std::error_code
 RealFileSystem::getRealPath(const Twine &Path,
                             SmallVectorImpl<char> &Output) const {
@@ -377,6 +386,13 @@ OverlayFileSystem::setCurrentWorkingDirectory(const Twine &Path) {
   return {};
 }
 
+std::error_code OverlayFileSystem::isLocal(const Twine &Path, bool &Result) {
+  for (auto &FS : FSList)
+    if (FS->exists(Path))
+      return FS->isLocal(Path, Result);
+  return errc::no_such_file_or_directory;
+}
+
 std::error_code
 OverlayFileSystem::getRealPath(const Twine &Path,
                                SmallVectorImpl<char> &Output) const {
@@ -913,6 +929,11 @@ InMemoryFileSystem::getRealPath(const Twine &Path,
   return {};
 }
 
+std::error_code InMemoryFileSystem::isLocal(const Twine &Path, bool &Result) {
+  Result = false;
+  return {};
+}
+
 } // namespace vfs
 } // namespace llvm
 
@@ -1170,6 +1191,10 @@ public:
     return ExternalFS->setCurrentWorkingDirectory(Path);
   }
 
+  std::error_code isLocal(const Twine &Path, bool &Result) override {
+    return ExternalFS->isLocal(Path, Result);
+  }
+
   directory_iterator dir_begin(const Twine &Dir, std::error_code &EC) override {
     ErrorOr<Entry *> E = lookupPath(Dir);
     if (!E) {
diff --git a/unittests/Support/VirtualFileSystemTest.cpp b/unittests/Support/VirtualFileSystemTest.cpp
index d5c01141bba..466cd117a50 100644
--- a/unittests/Support/VirtualFileSystemTest.cpp
+++ b/unittests/Support/VirtualFileSystemTest.cpp
@@ -885,6 +885,17 @@ TEST_F(InMemoryFileSystemTest, WorkingDirectory) {
             getPosixPath(NormalizedFS.getCurrentWorkingDirectory().get()));
 }
 
+TEST_F(InMemoryFileSystemTest, IsLocal) {
+  FS.setCurrentWorkingDirectory("/b");
+  FS.addFile("c", 0, MemoryBuffer::getMemBuffer(""));
+
+  std::error_code EC;
+  bool IsLocal = true;
+  EC = FS.isLocal("c", IsLocal);
+  ASSERT_FALSE(EC);
+  ASSERT_FALSE(IsLocal);
+}
+
 #if !defined(_WIN32)
 TEST_F(InMemoryFileSystemTest, GetRealPath) {
   SmallString<16> Path;
-- 
GitLab


From de78771378d907c9d6da92c4604e517e85d32d7b Mon Sep 17 00:00:00 2001
From: Jessica Paquette <jpaquette@apple.com>
Date: Thu, 8 Nov 2018 00:02:11 +0000
Subject: [PATCH 1103/1116] [MachineOutliner][NFC] Don't map MBBs that don't
 contain legal instructions

I noticed that there are lots of basic blocks that don't have enough legal
instructions in them to warrant outlining. We can skip mapping these entirely.

In sqlite3, compiled for AArch64 at -Oz, this results in a 10% reduction of
the total nodes in the suffix tree. These nodes can never be part of a
repeated substring, and so they don't impact the result at all.

Before this, there were 62128 nodes in the tree for sqlite3. After this, there
are 56457 nodes.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346373 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/MachineOutliner.cpp | 65 ++++++++++++++++++++++++---------
 1 file changed, 47 insertions(+), 18 deletions(-)

diff --git a/lib/CodeGen/MachineOutliner.cpp b/lib/CodeGen/MachineOutliner.cpp
index 56b3fe202f0..5032b70e872 100644
--- a/lib/CodeGen/MachineOutliner.cpp
+++ b/lib/CodeGen/MachineOutliner.cpp
@@ -640,18 +640,24 @@ struct InstructionMapper {
 
   /// Maps \p *It to a legal integer.
   ///
-  /// Updates \p InstrList, \p UnsignedVec, \p InstructionIntegerMap,
-  /// \p IntegerInstructionMap, and \p LegalInstrNumber.
+  /// Updates \p InstrListForMBB, \p UnsignedVecForMBB, \p
+  /// InstructionIntegerMap, \p IntegerInstructionMap, and \p LegalInstrNumber.
   ///
   /// \returns The integer that \p *It was mapped to.
-  unsigned mapToLegalUnsigned(MachineBasicBlock::iterator &It) {
+  unsigned mapToLegalUnsigned(
+      MachineBasicBlock::iterator &It, unsigned &NumLegalInBlock,
+      std::vector<unsigned> &UnsignedVecForMBB,
+      std::vector<MachineBasicBlock::iterator> &InstrListForMBB) {
     // We added something legal, so we should unset the AddedLegalLastTime
     // flag.
     AddedIllegalLastTime = false;
 
+    // Keep track of the number of legal instructions we insert.
+    NumLegalInBlock++;
+
     // Get the integer for this instruction or give it the current
     // LegalInstrNumber.
-    InstrList.push_back(It);
+    InstrListForMBB.push_back(It);
     MachineInstr &MI = *It;
     bool WasInserted;
     DenseMap<MachineInstr *, unsigned, MachineInstrExpressionTrait>::iterator
@@ -666,7 +672,7 @@ struct InstructionMapper {
       IntegerInstructionMap.insert(std::make_pair(MINumber, &MI));
     }
 
-    UnsignedVec.push_back(MINumber);
+    UnsignedVecForMBB.push_back(MINumber);
 
     // Make sure we don't overflow or use any integers reserved by the DenseMap.
     if (LegalInstrNumber >= IllegalInstrNumber)
@@ -682,10 +688,13 @@ struct InstructionMapper {
 
   /// Maps \p *It to an illegal integer.
   ///
-  /// Updates \p InstrList, \p UnsignedVec, and \p IllegalInstrNumber.
+  /// Updates \p InstrListForMBB, \p UnsignedVecForMBB, and \p
+  /// IllegalInstrNumber.
   ///
   /// \returns The integer that \p *It was mapped to.
-  unsigned mapToIllegalUnsigned(MachineBasicBlock::iterator &It) {
+  unsigned mapToIllegalUnsigned(
+      MachineBasicBlock::iterator &It, std::vector<unsigned> &UnsignedVecForMBB,
+      std::vector<MachineBasicBlock::iterator> &InstrListForMBB) {
     // Only add one illegal number per range of legal numbers.
     if (AddedIllegalLastTime)
       return IllegalInstrNumber;
@@ -694,8 +703,8 @@ struct InstructionMapper {
     AddedIllegalLastTime = true;
     unsigned MINumber = IllegalInstrNumber;
 
-    InstrList.push_back(It);
-    UnsignedVec.push_back(IllegalInstrNumber);
+    InstrListForMBB.push_back(It);
+    UnsignedVecForMBB.push_back(IllegalInstrNumber);
     IllegalInstrNumber--;
 
     assert(LegalInstrNumber < IllegalInstrNumber &&
@@ -724,22 +733,34 @@ struct InstructionMapper {
                             const TargetInstrInfo &TII) {
     unsigned Flags = TII.getMachineOutlinerMBBFlags(MBB);
     MachineBasicBlock::iterator It = MBB.begin();
+
+    // The number of instructions in this block that will be considered for
+    // outlining.
+    unsigned NumLegalInBlock = 0;
+
+    // FIXME: Should this all just be handled in the target, rather than using
+    // repeated calls to getOutliningType?
+    std::vector<unsigned> UnsignedVecForMBB;
+    std::vector<MachineBasicBlock::iterator> InstrListForMBB;
+
     for (MachineBasicBlock::iterator Et = MBB.end(); It != Et; It++) {
       // Keep track of where this instruction is in the module.
       switch (TII.getOutliningType(It, Flags)) {
       case InstrType::Illegal:
-        mapToIllegalUnsigned(It);
+        mapToIllegalUnsigned(It, UnsignedVecForMBB, InstrListForMBB);
         break;
 
       case InstrType::Legal:
-        mapToLegalUnsigned(It);
+        mapToLegalUnsigned(It, NumLegalInBlock, UnsignedVecForMBB,
+                           InstrListForMBB);
         break;
 
       case InstrType::LegalTerminator:
-        mapToLegalUnsigned(It);
+        mapToLegalUnsigned(It, NumLegalInBlock, UnsignedVecForMBB,
+                           InstrListForMBB);
         // The instruction also acts as a terminator, so we have to record that
         // in the string.
-        mapToIllegalUnsigned(It);
+        mapToIllegalUnsigned(It, UnsignedVecForMBB, InstrListForMBB);
         break;
 
       case InstrType::Invisible:
@@ -750,11 +771,19 @@ struct InstructionMapper {
       }
     }
 
-    // After we're done every insertion, uniquely terminate this part of the
-    // "string". This makes sure we won't match across basic block or function
-    // boundaries since the "end" is encoded uniquely and thus appears in no
-    // repeated substring.
-    mapToIllegalUnsigned(It);
+    // Are there enough legal instructions in the block for outlining to be
+    // possible?
+    if (NumLegalInBlock > 1) {
+      // After we're done every insertion, uniquely terminate this part of the
+      // "string". This makes sure we won't match across basic block or function
+      // boundaries since the "end" is encoded uniquely and thus appears in no
+      // repeated substring.
+      mapToIllegalUnsigned(It, UnsignedVecForMBB, InstrListForMBB);
+      InstrList.insert(InstrList.end(), InstrListForMBB.begin(),
+                       InstrListForMBB.end());
+      UnsignedVec.insert(UnsignedVec.end(), UnsignedVecForMBB.begin(),
+                         UnsignedVecForMBB.end());
+    }
   }
 
   InstructionMapper() {
-- 
GitLab


From 5fedd08c092551ba031fa657290f3f34506555e2 Mon Sep 17 00:00:00 2001
From: Anton Korobeynikov <anton@korobeynikov.info>
Date: Thu, 8 Nov 2018 00:03:45 +0000
Subject: [PATCH 1104/1116] [MSP430] Add MC layer

Summary:
This change implements assembler parser, code emitter, ELF object writer
and disassembler for the MSP430 ISA.  Also, more instruction forms are added
to the target description.

Reviewers: asl

Reviewed By: asl

Subscribers: pftbest, krisb, mgorny, llvm-commits

Differential Revision: https://reviews.llvm.org/D53661

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346374 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/BinaryFormat/ELF.h               |   34 +
 .../llvm/BinaryFormat/ELFRelocs/MSP430.def    |   16 +
 include/llvm/Object/ELFObjectFile.h           |    4 +
 include/llvm/module.modulemap                 |    1 +
 lib/Object/ELF.cpp                            |    7 +
 lib/Target/MSP430/AsmParser/CMakeLists.txt    |    3 +
 lib/Target/MSP430/AsmParser/LLVMBuild.txt     |   23 +
 .../MSP430/AsmParser/MSP430AsmParser.cpp      |  562 +++++++
 lib/Target/MSP430/CMakeLists.txt              |    5 +
 lib/Target/MSP430/Disassembler/CMakeLists.txt |    3 +
 lib/Target/MSP430/Disassembler/LLVMBuild.txt  |   23 +
 .../Disassembler/MSP430Disassembler.cpp       |  375 +++++
 .../MSP430/InstPrinter/MSP430InstPrinter.cpp  |   36 +-
 .../MSP430/InstPrinter/MSP430InstPrinter.h    |    7 +
 lib/Target/MSP430/LLVMBuild.txt               |    4 +-
 lib/Target/MSP430/MCTargetDesc/CMakeLists.txt |    6 +-
 .../MSP430/MCTargetDesc/MSP430AsmBackend.cpp  |  178 ++
 .../MCTargetDesc/MSP430ELFObjectWriter.cpp    |   59 +
 .../MSP430/MCTargetDesc/MSP430ELFStreamer.cpp |   81 +
 .../MSP430/MCTargetDesc/MSP430FixupKinds.h    |   53 +
 .../MCTargetDesc/MSP430MCCodeEmitter.cpp      |  212 +++
 .../MCTargetDesc/MSP430MCTargetDesc.cpp       |   27 +-
 .../MSP430/MCTargetDesc/MSP430MCTargetDesc.h  |   27 +
 lib/Target/MSP430/MSP430.h                    |    2 +
 lib/Target/MSP430/MSP430.td                   |   18 +
 lib/Target/MSP430/MSP430ISelDAGToDAG.cpp      |   32 +-
 lib/Target/MSP430/MSP430ISelLowering.cpp      |   91 +-
 lib/Target/MSP430/MSP430ISelLowering.h        |    8 +-
 lib/Target/MSP430/MSP430InstrFormats.td       |  422 +++--
 lib/Target/MSP430/MSP430InstrInfo.cpp         |   45 +-
 lib/Target/MSP430/MSP430InstrInfo.h           |   16 -
 lib/Target/MSP430/MSP430InstrInfo.td          | 1427 +++++++----------
 lib/Target/MSP430/MSP430MCInstLower.cpp       |    3 +
 lib/Target/MSP430/MSP430RegisterInfo.td       |   29 +-
 test/CodeGen/MSP430/AddrMode-bis-rx.ll        |   10 +-
 test/CodeGen/MSP430/AddrMode-bis-xr.ll        |   10 +-
 test/CodeGen/MSP430/AddrMode-mov-rx.ll        |   10 +-
 test/CodeGen/MSP430/AddrMode-mov-xr.ll        |   10 +-
 test/CodeGen/MSP430/Inst16mi.ll               |   10 +-
 test/CodeGen/MSP430/Inst16mm.ll               |   14 +-
 test/CodeGen/MSP430/Inst16mr.ll               |   12 +-
 test/CodeGen/MSP430/Inst16ri.ll               |   10 +-
 test/CodeGen/MSP430/Inst16rm.ll               |   10 +-
 test/CodeGen/MSP430/Inst16rr.ll               |   12 +-
 test/CodeGen/MSP430/Inst8mi.ll                |    2 +-
 test/CodeGen/MSP430/Inst8ri.ll                |    2 +-
 test/CodeGen/MSP430/Inst8rr.ll                |    8 +-
 test/CodeGen/MSP430/asm-clobbers.ll           |    4 +-
 test/CodeGen/MSP430/bit.ll                    |   16 +-
 test/CodeGen/MSP430/byval.ll                  |    8 +-
 test/CodeGen/MSP430/cc_args.ll                |  104 +-
 test/CodeGen/MSP430/cc_ret.ll                 |   28 +-
 test/CodeGen/MSP430/fp.ll                     |   10 +-
 test/CodeGen/MSP430/jumptable.ll              |    8 +-
 test/CodeGen/MSP430/memset.ll                 |    6 +-
 test/CodeGen/MSP430/misched-msp430.ll         |    2 +-
 test/CodeGen/MSP430/postinc.ll                |   10 +-
 test/CodeGen/MSP430/select-use-sr.ll          |    4 +-
 test/CodeGen/MSP430/setcc.ll                  |   56 +-
 test/CodeGen/MSP430/shifts.ll                 |    8 +-
 test/CodeGen/MSP430/struct-return.ll          |   16 +-
 test/CodeGen/MSP430/struct_layout.ll          |    8 +-
 .../MSP430/transient-stack-alignment.ll       |    6 +-
 test/CodeGen/MSP430/vararg.ll                 |   20 +-
 test/MC/Disassembler/MSP430/lit.local.cfg     |    3 +
 test/MC/Disassembler/MSP430/msp430.txt        |   27 +
 test/MC/MSP430/addrmode.s                     |  110 ++
 test/MC/MSP430/altreg.s                       |    7 +
 test/MC/MSP430/const.s                        |   10 +
 test/MC/MSP430/invalid.s                      |   19 +
 test/MC/MSP430/lit.local.cfg                  |    3 +
 test/MC/MSP430/opcode.s                       |  163 ++
 test/MC/MSP430/reloc.s                        |   22 +
 73 files changed, 3328 insertions(+), 1309 deletions(-)
 create mode 100644 include/llvm/BinaryFormat/ELFRelocs/MSP430.def
 create mode 100644 lib/Target/MSP430/AsmParser/CMakeLists.txt
 create mode 100644 lib/Target/MSP430/AsmParser/LLVMBuild.txt
 create mode 100644 lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
 create mode 100644 lib/Target/MSP430/Disassembler/CMakeLists.txt
 create mode 100644 lib/Target/MSP430/Disassembler/LLVMBuild.txt
 create mode 100644 lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp
 create mode 100644 lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp
 create mode 100644 lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp
 create mode 100644 lib/Target/MSP430/MCTargetDesc/MSP430ELFStreamer.cpp
 create mode 100644 lib/Target/MSP430/MCTargetDesc/MSP430FixupKinds.h
 create mode 100644 lib/Target/MSP430/MCTargetDesc/MSP430MCCodeEmitter.cpp
 create mode 100644 test/MC/Disassembler/MSP430/lit.local.cfg
 create mode 100644 test/MC/Disassembler/MSP430/msp430.txt
 create mode 100644 test/MC/MSP430/addrmode.s
 create mode 100644 test/MC/MSP430/altreg.s
 create mode 100644 test/MC/MSP430/const.s
 create mode 100644 test/MC/MSP430/invalid.s
 create mode 100644 test/MC/MSP430/lit.local.cfg
 create mode 100644 test/MC/MSP430/opcode.s
 create mode 100644 test/MC/MSP430/reloc.s

diff --git a/include/llvm/BinaryFormat/ELF.h b/include/llvm/BinaryFormat/ELF.h
index a42186bc270..ebbf830a60e 100644
--- a/include/llvm/BinaryFormat/ELF.h
+++ b/include/llvm/BinaryFormat/ELF.h
@@ -729,6 +729,38 @@ enum {
 #include "ELFRelocs/BPF.def"
 };
 
+// MSP430 specific e_flags
+enum : unsigned {
+  EF_MSP430_MACH_MSP430x11 = 11,
+  EF_MSP430_MACH_MSP430x11x1 = 110,
+  EF_MSP430_MACH_MSP430x12 = 12,
+  EF_MSP430_MACH_MSP430x13 = 13,
+  EF_MSP430_MACH_MSP430x14 = 14,
+  EF_MSP430_MACH_MSP430x15 = 15,
+  EF_MSP430_MACH_MSP430x16 = 16,
+  EF_MSP430_MACH_MSP430x20 = 20,
+  EF_MSP430_MACH_MSP430x22 = 22,
+  EF_MSP430_MACH_MSP430x23 = 23,
+  EF_MSP430_MACH_MSP430x24 = 24,
+  EF_MSP430_MACH_MSP430x26 = 26,
+  EF_MSP430_MACH_MSP430x31 = 31,
+  EF_MSP430_MACH_MSP430x32 = 32,
+  EF_MSP430_MACH_MSP430x33 = 33,
+  EF_MSP430_MACH_MSP430x41 = 41,
+  EF_MSP430_MACH_MSP430x42 = 42,
+  EF_MSP430_MACH_MSP430x43 = 43,
+  EF_MSP430_MACH_MSP430x44 = 44,
+  EF_MSP430_MACH_MSP430X = 45,
+  EF_MSP430_MACH_MSP430x46 = 46,
+  EF_MSP430_MACH_MSP430x47 = 47,
+  EF_MSP430_MACH_MSP430x54 = 54,
+};
+
+// ELF Relocation types for MSP430
+enum {
+#include "ELFRelocs/MSP430.def"
+};
+
 #undef ELF_RELOC
 
 // Section header.
@@ -833,6 +865,8 @@ enum : unsigned {
   SHT_MIPS_DWARF = 0x7000001e,          // DWARF debugging section.
   SHT_MIPS_ABIFLAGS = 0x7000002a,       // ABI information.
 
+  SHT_MSP430_ATTRIBUTES = 0x70000003U,
+
   SHT_HIPROC = 0x7fffffff,              // Highest processor arch-specific type.
   SHT_LOUSER = 0x80000000,              // Lowest type reserved for applications.
   SHT_HIUSER = 0xffffffff               // Highest type reserved for applications.
diff --git a/include/llvm/BinaryFormat/ELFRelocs/MSP430.def b/include/llvm/BinaryFormat/ELFRelocs/MSP430.def
new file mode 100644
index 00000000000..96990abf2db
--- /dev/null
+++ b/include/llvm/BinaryFormat/ELFRelocs/MSP430.def
@@ -0,0 +1,16 @@
+
+#ifndef ELF_RELOC
+#error "ELF_RELOC must be defined"
+#endif
+
+ELF_RELOC(R_MSP430_NONE,               0)
+ELF_RELOC(R_MSP430_32,                 1)
+ELF_RELOC(R_MSP430_10_PCREL,           2)
+ELF_RELOC(R_MSP430_16,                 3)
+ELF_RELOC(R_MSP430_16_PCREL,           4)
+ELF_RELOC(R_MSP430_16_BYTE,            5)
+ELF_RELOC(R_MSP430_16_PCREL_BYTE,      6)
+ELF_RELOC(R_MSP430_2X_PCREL,           7)
+ELF_RELOC(R_MSP430_RL_PCREL,           8)
+ELF_RELOC(R_MSP430_8,                  9)
+ELF_RELOC(R_MSP430_SYM_DIFF,           10)
diff --git a/include/llvm/Object/ELFObjectFile.h b/include/llvm/Object/ELFObjectFile.h
index 54907cbca4c..dff08607839 100644
--- a/include/llvm/Object/ELFObjectFile.h
+++ b/include/llvm/Object/ELFObjectFile.h
@@ -1021,6 +1021,8 @@ StringRef ELFObjectFile<ELFT>::getFileFormatName() const {
       return "ELF32-lanai";
     case ELF::EM_MIPS:
       return "ELF32-mips";
+    case ELF::EM_MSP430:
+      return "ELF32-msp430";
     case ELF::EM_PPC:
       return "ELF32-ppc";
     case ELF::EM_RISCV:
@@ -1091,6 +1093,8 @@ template <class ELFT> Triple::ArchType ELFObjectFile<ELFT>::getArch() const {
     default:
       report_fatal_error("Invalid ELFCLASS!");
     }
+  case ELF::EM_MSP430:
+    return Triple::msp430;
   case ELF::EM_PPC:
     return Triple::ppc;
   case ELF::EM_PPC64:
diff --git a/include/llvm/module.modulemap b/include/llvm/module.modulemap
index 138eb06078d..c918eff2b97 100644
--- a/include/llvm/module.modulemap
+++ b/include/llvm/module.modulemap
@@ -52,6 +52,7 @@ module LLVM_BinaryFormat {
     textual header "BinaryFormat/ELFRelocs/i386.def"
     textual header "BinaryFormat/ELFRelocs/Lanai.def"
     textual header "BinaryFormat/ELFRelocs/Mips.def"
+    textual header "BinaryFormat/ELFRelocs/MSP430.def"
     textual header "BinaryFormat/ELFRelocs/PowerPC64.def"
     textual header "BinaryFormat/ELFRelocs/PowerPC.def"
     textual header "BinaryFormat/ELFRelocs/RISCV.def"
diff --git a/lib/Object/ELF.cpp b/lib/Object/ELF.cpp
index da56d97c4bc..2edab0b1373 100644
--- a/lib/Object/ELF.cpp
+++ b/lib/Object/ELF.cpp
@@ -139,6 +139,13 @@ StringRef llvm::object::getELFRelocationTypeName(uint32_t Machine,
       break;
     }
     break;
+  case ELF::EM_MSP430:
+    switch (Type) {
+#include "llvm/BinaryFormat/ELFRelocs/MSP430.def"
+    default:
+      break;
+    }
+    break;
   default:
     break;
   }
diff --git a/lib/Target/MSP430/AsmParser/CMakeLists.txt b/lib/Target/MSP430/AsmParser/CMakeLists.txt
new file mode 100644
index 00000000000..bb484898afa
--- /dev/null
+++ b/lib/Target/MSP430/AsmParser/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_llvm_library(LLVMMSP430AsmParser
+  MSP430AsmParser.cpp
+)
diff --git a/lib/Target/MSP430/AsmParser/LLVMBuild.txt b/lib/Target/MSP430/AsmParser/LLVMBuild.txt
new file mode 100644
index 00000000000..58f67c07db1
--- /dev/null
+++ b/lib/Target/MSP430/AsmParser/LLVMBuild.txt
@@ -0,0 +1,23 @@
+;===- lib/Target/MSP430/AsmParser/LLVMBuild.txt ----------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = MSP430AsmParser
+parent = MSP430
+required_libraries = MC MCParser MSP430Desc MSP430Info Support
+add_to_library_groups = MSP430
diff --git a/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp b/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
new file mode 100644
index 00000000000..3f7d1860e9a
--- /dev/null
+++ b/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
@@ -0,0 +1,562 @@
+//===- MSP430AsmParser.cpp - Parse MSP430 assembly to MCInst instructions -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430.h"
+#include "MSP430RegisterInfo.h"
+#include "MCTargetDesc/MSP430MCTargetDesc.h"
+
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define DEBUG_TYPE "msp430-asm-parser"
+
+namespace llvm {
+
+/// Parses MSP430 assembly from a stream.
+class MSP430AsmParser : public MCTargetAsmParser {
+  const MCSubtargetInfo &STI;
+  MCAsmParser &Parser;
+  const MCRegisterInfo *MRI;
+
+  bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                               OperandVector &Operands, MCStreamer &Out,
+                               uint64_t &ErrorInfo,
+                               bool MatchingInlineAsm) override;
+
+  bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+
+  bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                        SMLoc NameLoc, OperandVector &Operands) override;
+
+  bool ParseDirective(AsmToken DirectiveID) override;
+
+  unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
+                                      unsigned Kind) override;
+
+  bool parseJccInstruction(ParseInstructionInfo &Info, StringRef Name,
+                           SMLoc NameLoc, OperandVector &Operands);
+
+  bool ParseOperand(OperandVector &Operands);
+
+  bool ParseLiteralValues(unsigned Size, SMLoc L);
+
+  MCAsmParser &getParser() const { return Parser; }
+  MCAsmLexer &getLexer() const { return Parser.getLexer(); }
+
+  /// @name Auto-generated Matcher Functions
+  /// {
+
+#define GET_ASSEMBLER_HEADER
+#include "MSP430GenAsmMatcher.inc"
+
+  /// }
+
+public:
+  MSP430AsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser,
+                  const MCInstrInfo &MII, const MCTargetOptions &Options)
+      : MCTargetAsmParser(Options, STI, MII), STI(STI), Parser(Parser) {
+    MCAsmParserExtension::Initialize(Parser);
+    MRI = getContext().getRegisterInfo();
+
+    setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+  }
+};
+
+/// A parsed MSP430 assembly operand.
+class MSP430Operand : public MCParsedAsmOperand {
+  typedef MCParsedAsmOperand Base;
+
+  enum KindTy {
+    k_Imm,
+    k_Reg,
+    k_Tok,
+    k_Mem,
+    k_IndReg,
+    k_PostIndReg
+  } Kind;
+
+  struct Memory {
+    unsigned Reg;
+    const MCExpr *Offset;
+  };
+  union {
+    const MCExpr *Imm;
+    unsigned      Reg;
+    StringRef     Tok;
+    Memory        Mem;
+  };
+
+  SMLoc Start, End;
+
+public:
+  MSP430Operand(StringRef Tok, SMLoc const &S)
+      : Base(), Kind(k_Tok), Tok(Tok), Start(S), End(S) {}
+  MSP430Operand(KindTy Kind, unsigned Reg, SMLoc const &S, SMLoc const &E)
+      : Base(), Kind(Kind), Reg(Reg), Start(S), End(E) {}
+  MSP430Operand(MCExpr const *Imm, SMLoc const &S, SMLoc const &E)
+      : Base(), Kind(k_Imm), Imm(Imm), Start(S), End(E) {}
+  MSP430Operand(unsigned Reg, MCExpr const *Expr, SMLoc const &S, SMLoc const &E)
+      : Base(), Kind(k_Mem), Mem({Reg, Expr}), Start(S), End(E) {}
+
+  void addRegOperands(MCInst &Inst, unsigned N) const {
+    assert((Kind == k_Reg || Kind == k_IndReg || Kind == k_PostIndReg) &&
+        "Unexpected operand kind");
+    assert(N == 1 && "Invalid number of operands!");
+
+    Inst.addOperand(MCOperand::createReg(Reg));
+  }
+
+  void addExprOperand(MCInst &Inst, const MCExpr *Expr) const {
+    // Add as immediate when possible
+    if (!Expr)
+      Inst.addOperand(MCOperand::createImm(0));
+    else if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
+      Inst.addOperand(MCOperand::createImm(CE->getValue()));
+    else
+      Inst.addOperand(MCOperand::createExpr(Expr));
+  }
+
+  void addImmOperands(MCInst &Inst, unsigned N) const {
+    assert(Kind == k_Imm && "Unexpected operand kind");
+    assert(N == 1 && "Invalid number of operands!");
+
+    addExprOperand(Inst, Imm);
+  }
+
+  void addMemOperands(MCInst &Inst, unsigned N) const {
+    assert(Kind == k_Mem && "Unexpected operand kind");
+    assert(N == 2 && "Invalid number of operands");
+
+    Inst.addOperand(MCOperand::createReg(Mem.Reg));
+    addExprOperand(Inst, Mem.Offset);
+  }
+
+  bool isReg() const        { return Kind == k_Reg; }
+  bool isImm() const        { return Kind == k_Imm; }
+  bool isToken() const      { return Kind == k_Tok; }
+  bool isMem() const        { return Kind == k_Mem; }
+  bool isIndReg() const     { return Kind == k_IndReg; }
+  bool isPostIndReg() const { return Kind == k_PostIndReg; }
+
+  bool isCGImm() const {
+    if (Kind != k_Imm)
+      return false;
+
+    int64_t Val;
+    if (!Imm->evaluateAsAbsolute(Val))
+      return false;
+    
+    if (Val == 0 || Val == 1 || Val == 2 || Val == 4 || Val == 8 || Val == -1)
+      return true;
+
+    return false;
+  }
+
+  StringRef getToken() const {
+    assert(Kind == k_Tok && "Invalid access!");
+    return Tok;
+  }
+
+  unsigned getReg() const {
+    assert(Kind == k_Reg && "Invalid access!");
+    return Reg;
+  }
+
+  void setReg(unsigned RegNo) {
+    assert(Kind == k_Reg && "Invalid access!");
+    Reg = RegNo;
+  }
+
+  static std::unique_ptr<MSP430Operand> CreateToken(StringRef Str, SMLoc S) {
+    return make_unique<MSP430Operand>(Str, S);
+  }
+
+  static std::unique_ptr<MSP430Operand> CreateReg(unsigned RegNum, SMLoc S,
+                                                  SMLoc E) {
+    return make_unique<MSP430Operand>(k_Reg, RegNum, S, E);
+  }
+
+  static std::unique_ptr<MSP430Operand> CreateImm(const MCExpr *Val, SMLoc S,
+                                                  SMLoc E) {
+    return make_unique<MSP430Operand>(Val, S, E);
+  }
+
+  static std::unique_ptr<MSP430Operand> CreateMem(unsigned RegNum,
+                                                  const MCExpr *Val,
+                                                  SMLoc S, SMLoc E) {
+    return make_unique<MSP430Operand>(RegNum, Val, S, E);
+  }
+
+  static std::unique_ptr<MSP430Operand> CreateIndReg(unsigned RegNum, SMLoc S,
+                                                  SMLoc E) {
+    return make_unique<MSP430Operand>(k_IndReg, RegNum, S, E);
+  }
+
+  static std::unique_ptr<MSP430Operand> CreatePostIndReg(unsigned RegNum, SMLoc S,
+                                                  SMLoc E) {
+    return make_unique<MSP430Operand>(k_PostIndReg, RegNum, S, E);
+  }
+
+  SMLoc getStartLoc() const { return Start; }
+  SMLoc getEndLoc() const { return End; }
+
+  virtual void print(raw_ostream &O) const {
+    switch (Kind) {
+    case k_Tok:
+      O << "Token " << Tok;
+      break;
+    case k_Reg:
+      O << "Register " << Reg;
+      break;
+    case k_Imm:
+      O << "Immediate " << *Imm;
+      break;
+    case k_Mem:
+      O << "Memory ";
+      O << *Mem.Offset << "(" << Reg << ")";
+      break;
+    case k_IndReg:
+      O << "RegInd " << Reg;
+      break;
+    case k_PostIndReg:
+      O << "PostInc " << Reg;
+      break;
+    }
+  }
+};
+
+bool MSP430AsmParser::MatchAndEmitInstruction(SMLoc Loc, unsigned &Opcode,
+                                              OperandVector &Operands,
+                                              MCStreamer &Out,
+                                              uint64_t &ErrorInfo,
+                                              bool MatchingInlineAsm) {
+  MCInst Inst;
+  unsigned MatchResult =
+      MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm);
+
+  switch (MatchResult) {
+  case Match_Success:
+    Inst.setLoc(Loc);
+    Out.EmitInstruction(Inst, STI);
+    return false;
+  case Match_MnemonicFail:
+    return Error(Loc, "invalid instruction mnemonic");
+  case Match_InvalidOperand: {
+    SMLoc ErrorLoc = Loc;
+    if (ErrorInfo != ~0U) {
+      if (ErrorInfo >= Operands.size())
+        return Error(ErrorLoc, "too few operands for instruction");
+
+      ErrorLoc = ((MSP430Operand &)*Operands[ErrorInfo]).getStartLoc();
+      if (ErrorLoc == SMLoc())
+        ErrorLoc = Loc;
+    }
+    return Error(ErrorLoc, "invalid operand for instruction");
+  }
+  default:
+    return true;
+  }
+}
+
+// Auto-generated by TableGen
+static unsigned MatchRegisterName(StringRef Name);
+static unsigned MatchRegisterAltName(StringRef Name);
+
+bool MSP430AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+                                    SMLoc &EndLoc) {
+  if (getLexer().getKind() == AsmToken::Identifier) {
+    auto Name = getLexer().getTok().getIdentifier().lower();
+    RegNo = MatchRegisterName(Name);
+    if (RegNo == MSP430::NoRegister) {
+      RegNo = MatchRegisterAltName(Name);
+      if (RegNo == MSP430::NoRegister)
+        return true;
+    }
+
+    AsmToken const &T = getParser().getTok();
+    StartLoc = T.getLoc();
+    EndLoc = T.getEndLoc();
+    getLexer().Lex(); // eat register token
+
+    return false;
+  }
+
+  return Error(StartLoc, "invalid register name");
+}
+
+bool MSP430AsmParser::parseJccInstruction(ParseInstructionInfo &Info,
+                                          StringRef Name, SMLoc NameLoc,
+                                          OperandVector &Operands) {
+  if (!Name.startswith_lower("j"))
+    return true;
+
+  auto CC = Name.drop_front().lower();
+  unsigned CondCode;
+  if (CC == "ne" || CC == "nz")
+    CondCode = MSP430CC::COND_NE;
+  else if (CC == "eq" || CC == "z")
+    CondCode = MSP430CC::COND_E;
+  else if (CC == "lo" || CC == "nc")
+    CondCode = MSP430CC::COND_LO;
+  else if (CC == "hs" || CC == "c")
+    CondCode = MSP430CC::COND_HS;
+  else if (CC == "n")
+    CondCode = MSP430CC::COND_N;
+  else if (CC == "ge")
+    CondCode = MSP430CC::COND_GE;
+  else if (CC == "l")
+    CondCode = MSP430CC::COND_L;
+  else if (CC == "mp")
+    CondCode = MSP430CC::COND_NONE;
+  else
+    return Error(NameLoc, "unknown instruction");
+
+  if (CondCode == (unsigned)MSP430CC::COND_NONE)
+    Operands.push_back(MSP430Operand::CreateToken("jmp", NameLoc));
+  else {
+    Operands.push_back(MSP430Operand::CreateToken("j", NameLoc));
+    const MCExpr *CCode = MCConstantExpr::create(CondCode, getContext());
+    Operands.push_back(MSP430Operand::CreateImm(CCode, SMLoc(), SMLoc()));
+  }
+
+  // Skip optional '$' sign.
+  if (getLexer().getKind() == AsmToken::Dollar)
+    getLexer().Lex(); // Eat '$'
+
+  const MCExpr *Val;
+  SMLoc ExprLoc = getLexer().getLoc();
+  if (getParser().parseExpression(Val))
+    return Error(ExprLoc, "expected expression operand");
+
+  int64_t Res;
+  if (Val->evaluateAsAbsolute(Res))
+    if (Res < -512 || Res > 511)
+      return Error(ExprLoc, "invalid jump offset");
+
+  Operands.push_back(MSP430Operand::CreateImm(Val, ExprLoc,
+    getLexer().getLoc()));
+
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    SMLoc Loc = getLexer().getLoc();
+    getParser().eatToEndOfStatement();
+    return Error(Loc, "unexpected token");
+  }
+
+  getParser().Lex(); // Consume the EndOfStatement.
+  return false;
+}
+
+bool MSP430AsmParser::ParseInstruction(ParseInstructionInfo &Info,
+                                       StringRef Name, SMLoc NameLoc,
+                                       OperandVector &Operands) {
+  // Drop .w suffix
+  if (Name.endswith_lower(".w"))
+    Name = Name.drop_back(2);
+
+  if (!parseJccInstruction(Info, Name, NameLoc, Operands))
+    return false;
+
+  // First operand is instruction mnemonic
+  Operands.push_back(MSP430Operand::CreateToken(Name, NameLoc));
+
+  // If there are no more operands, then finish
+  if (getLexer().is(AsmToken::EndOfStatement))
+    return false;
+
+  // Parse first operand
+  if (ParseOperand(Operands))
+    return true;
+
+  // Parse second operand if any
+  if (getLexer().is(AsmToken::Comma)) {
+    getLexer().Lex(); // Eat ','
+    if (ParseOperand(Operands))
+      return true;
+  }
+
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    SMLoc Loc = getLexer().getLoc();
+    getParser().eatToEndOfStatement();
+    return Error(Loc, "unexpected token");
+  }
+
+  getParser().Lex(); // Consume the EndOfStatement.
+  return false;
+}
+
+bool MSP430AsmParser::ParseDirective(AsmToken DirectiveID) {
+  StringRef IDVal = DirectiveID.getIdentifier();
+  if (IDVal.lower() == ".long") {
+    ParseLiteralValues(4, DirectiveID.getLoc());
+  } else if (IDVal.lower() == ".word" || IDVal.lower() == ".short") {
+    ParseLiteralValues(2, DirectiveID.getLoc());
+  } else if (IDVal.lower() == ".byte") {
+    ParseLiteralValues(1, DirectiveID.getLoc());
+  }
+  return true;
+}
+
+bool MSP430AsmParser::ParseOperand(OperandVector &Operands) {
+  switch (getLexer().getKind()) {
+    default: return true;
+    case AsmToken::Identifier: {
+      // try rN
+      unsigned RegNo;
+      SMLoc StartLoc, EndLoc;
+      if (!ParseRegister(RegNo, StartLoc, EndLoc)) {
+        Operands.push_back(MSP430Operand::CreateReg(RegNo, StartLoc, EndLoc));
+        return false;
+      }
+      LLVM_FALLTHROUGH;
+    }
+    case AsmToken::Integer:
+    case AsmToken::Plus:
+    case AsmToken::Minus: {
+      SMLoc StartLoc = getParser().getTok().getLoc();
+      const MCExpr *Val;
+      // Try constexpr[(rN)]
+      if (!getParser().parseExpression(Val)) {
+        unsigned RegNo = MSP430::PC;
+        SMLoc EndLoc = getParser().getTok().getLoc();
+        // Try (rN)
+        if (getLexer().getKind() == AsmToken::LParen) {
+          getLexer().Lex(); // Eat '('
+          SMLoc RegStartLoc;
+          if (ParseRegister(RegNo, RegStartLoc, EndLoc))
+            return true;
+          if (getLexer().getKind() != AsmToken::RParen)
+            return true;
+          EndLoc = getParser().getTok().getEndLoc();
+          getLexer().Lex(); // Eat ')'
+        }
+        Operands.push_back(MSP430Operand::CreateMem(RegNo, Val, StartLoc,
+          EndLoc));
+        return false;
+      }
+      return true;
+    }
+    case AsmToken::Amp: {
+      // Try &constexpr
+      SMLoc StartLoc = getParser().getTok().getLoc();
+      getLexer().Lex(); // Eat '&'
+      const MCExpr *Val;
+      if (!getParser().parseExpression(Val)) {
+        SMLoc EndLoc = getParser().getTok().getLoc();
+        Operands.push_back(MSP430Operand::CreateMem(MSP430::SR, Val, StartLoc,
+          EndLoc));
+        return false;
+      }
+      return true;
+    }
+    case AsmToken::At: {
+      // Try @rN[+]
+      SMLoc StartLoc = getParser().getTok().getLoc();
+      getLexer().Lex(); // Eat '@'
+      unsigned RegNo;
+      SMLoc RegStartLoc, EndLoc;
+      if (ParseRegister(RegNo, RegStartLoc, EndLoc))
+        return true;
+      if (getLexer().getKind() == AsmToken::Plus) {
+        Operands.push_back(MSP430Operand::CreatePostIndReg(RegNo, StartLoc, EndLoc));
+        getLexer().Lex(); // Eat '+'
+        return false;
+      }
+      Operands.push_back(MSP430Operand::CreateIndReg(RegNo, StartLoc, EndLoc));
+      return false;
+    }
+    case AsmToken::Hash:
+      // Try #constexpr
+      SMLoc StartLoc = getParser().getTok().getLoc();
+      getLexer().Lex(); // Eat '#'
+      const MCExpr *Val;
+      if (!getParser().parseExpression(Val)) {
+        SMLoc EndLoc = getParser().getTok().getLoc();
+        Operands.push_back(MSP430Operand::CreateImm(Val, StartLoc, EndLoc));
+        return false;
+      }
+      return true;
+  }
+}
+
+bool MSP430AsmParser::ParseLiteralValues(unsigned Size, SMLoc L) {
+  auto parseOne = [&]() -> bool {
+    const MCExpr *Value;
+    if (getParser().parseExpression(Value))
+      return true;
+    getParser().getStreamer().EmitValue(Value, Size, L);
+    return false;
+  };
+  return (parseMany(parseOne));
+}
+
+extern "C" void LLVMInitializeMSP430AsmParser() {
+  RegisterMCAsmParser<MSP430AsmParser> X(getTheMSP430Target());
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#include "MSP430GenAsmMatcher.inc"
+
+static unsigned convertGR16ToGR8(unsigned Reg) {
+  switch (Reg) {
+  default:
+    llvm_unreachable("Unknown GR16 register");
+  case MSP430::PC:  return MSP430::PCB;
+  case MSP430::SP:  return MSP430::SPB;
+  case MSP430::SR:  return MSP430::SRB;
+  case MSP430::CG:  return MSP430::CGB;
+  case MSP430::FP:  return MSP430::FPB;
+  case MSP430::R5:  return MSP430::R5B;
+  case MSP430::R6:  return MSP430::R6B;
+  case MSP430::R7:  return MSP430::R7B;
+  case MSP430::R8:  return MSP430::R8B;
+  case MSP430::R9:  return MSP430::R9B;
+  case MSP430::R10: return MSP430::R10B;
+  case MSP430::R11: return MSP430::R11B;
+  case MSP430::R12: return MSP430::R12B;
+  case MSP430::R13: return MSP430::R13B;
+  case MSP430::R14: return MSP430::R14B;
+  case MSP430::R15: return MSP430::R15B;
+  }
+}
+
+unsigned MSP430AsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
+                                                     unsigned Kind) {
+  MSP430Operand &Op = static_cast<MSP430Operand &>(AsmOp);
+
+  if (!Op.isReg())
+    return Match_InvalidOperand;
+
+  unsigned Reg = Op.getReg();
+  bool isGR16 =
+      MSP430MCRegisterClasses[MSP430::GR16RegClassID].contains(Reg);
+
+  if (isGR16 && (Kind == MCK_GR8)) {
+    Op.setReg(convertGR16ToGR8(Reg));
+    return Match_Success;
+  }
+
+  return Match_InvalidOperand;
+}
+
+} // end of namespace llvm
diff --git a/lib/Target/MSP430/CMakeLists.txt b/lib/Target/MSP430/CMakeLists.txt
index 3facfd526a5..2a0848fb308 100644
--- a/lib/Target/MSP430/CMakeLists.txt
+++ b/lib/Target/MSP430/CMakeLists.txt
@@ -1,9 +1,12 @@
 set(LLVM_TARGET_DEFINITIONS MSP430.td)
 
+tablegen(LLVM MSP430GenAsmMatcher.inc -gen-asm-matcher)
 tablegen(LLVM MSP430GenAsmWriter.inc -gen-asm-writer)
 tablegen(LLVM MSP430GenCallingConv.inc -gen-callingconv)
 tablegen(LLVM MSP430GenDAGISel.inc -gen-dag-isel)
+tablegen(LLVM MSP430GenDisassemblerTables.inc -gen-disassembler)
 tablegen(LLVM MSP430GenInstrInfo.inc -gen-instr-info)
+tablegen(LLVM MSP430GenMCCodeEmitter.inc -gen-emitter)
 tablegen(LLVM MSP430GenRegisterInfo.inc -gen-register-info)
 tablegen(LLVM MSP430GenSubtargetInfo.inc -gen-subtarget)
 
@@ -26,3 +29,5 @@ add_llvm_target(MSP430CodeGen
 add_subdirectory(InstPrinter)
 add_subdirectory(MCTargetDesc)
 add_subdirectory(TargetInfo)
+add_subdirectory(AsmParser)
+add_subdirectory(Disassembler)
diff --git a/lib/Target/MSP430/Disassembler/CMakeLists.txt b/lib/Target/MSP430/Disassembler/CMakeLists.txt
new file mode 100644
index 00000000000..bc33b906772
--- /dev/null
+++ b/lib/Target/MSP430/Disassembler/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_llvm_library(LLVMMSP430Disassembler
+  MSP430Disassembler.cpp
+  )
diff --git a/lib/Target/MSP430/Disassembler/LLVMBuild.txt b/lib/Target/MSP430/Disassembler/LLVMBuild.txt
new file mode 100644
index 00000000000..8af9cd9c222
--- /dev/null
+++ b/lib/Target/MSP430/Disassembler/LLVMBuild.txt
@@ -0,0 +1,23 @@
+;====- lib/Target/MSP430/Disassembler/LLVMBuild.txt ------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = MSP430Disassembler
+parent = MSP430
+required_libraries = MCDisassembler MSP430Info Support
+add_to_library_groups = MSP430
diff --git a/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp b/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp
new file mode 100644
index 00000000000..2a66b4ed7f2
--- /dev/null
+++ b/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp
@@ -0,0 +1,375 @@
+//===-- MSP430Disassembler.cpp - Disassembler for MSP430 ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MSP430Disassembler class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430.h"
+#include "MCTargetDesc/MSP430MCTargetDesc.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "msp430-disassembler"
+
+typedef MCDisassembler::DecodeStatus DecodeStatus;
+
+namespace {
+class MSP430Disassembler : public MCDisassembler {
+  DecodeStatus getInstructionI(MCInst &MI, uint64_t &Size,
+                               ArrayRef<uint8_t> Bytes, uint64_t Address,
+                               raw_ostream &VStream,
+                               raw_ostream &CStream) const;
+
+  DecodeStatus getInstructionII(MCInst &MI, uint64_t &Size,
+                                ArrayRef<uint8_t> Bytes, uint64_t Address,
+                                raw_ostream &VStream,
+                                raw_ostream &CStream) const;
+
+  DecodeStatus getInstructionCJ(MCInst &MI, uint64_t &Size,
+                                ArrayRef<uint8_t> Bytes, uint64_t Address,
+                                raw_ostream &VStream,
+                                raw_ostream &CStream) const;
+
+public:
+  MSP430Disassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
+      : MCDisassembler(STI, Ctx) {}
+
+  DecodeStatus getInstruction(MCInst &MI, uint64_t &Size,
+                              ArrayRef<uint8_t> Bytes, uint64_t Address,
+                              raw_ostream &VStream,
+                              raw_ostream &CStream) const override;
+};
+} // end anonymous namespace
+
+static MCDisassembler *createMSP430Disassembler(const Target &T,
+                                                const MCSubtargetInfo &STI,
+                                                MCContext &Ctx) {
+  return new MSP430Disassembler(STI, Ctx);
+}
+
+extern "C" void LLVMInitializeMSP430Disassembler() {
+  TargetRegistry::RegisterMCDisassembler(getTheMSP430Target(),
+                                         createMSP430Disassembler);
+}
+
+static const unsigned GR8DecoderTable[] = {
+  MSP430::PCB,  MSP430::SPB,  MSP430::SRB,  MSP430::CGB,
+  MSP430::FPB,  MSP430::R5B,  MSP430::R6B,  MSP430::R7B,
+  MSP430::R8B,  MSP430::R9B,  MSP430::R10B, MSP430::R11B,
+  MSP430::R12B, MSP430::R13B, MSP430::R14B, MSP430::R15B
+};
+
+static DecodeStatus DecodeGR8RegisterClass(MCInst &MI, uint64_t RegNo,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  if (RegNo > 15)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = GR8DecoderTable[RegNo];
+  MI.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static const unsigned GR16DecoderTable[] = {
+  MSP430::PC,  MSP430::SP,  MSP430::SR,  MSP430::CG,
+  MSP430::FP,  MSP430::R5,  MSP430::R6,  MSP430::R7,
+  MSP430::R8,  MSP430::R9,  MSP430::R10, MSP430::R11,
+  MSP430::R12, MSP430::R13, MSP430::R14, MSP430::R15
+};
+
+static DecodeStatus DecodeGR16RegisterClass(MCInst &MI, uint64_t RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  if (RegNo > 15)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = GR16DecoderTable[RegNo];
+  MI.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeCGImm(MCInst &MI, uint64_t Bits, uint64_t Address,
+                                const void *Decoder);
+
+static DecodeStatus DecodeMemOperand(MCInst &MI, uint64_t Bits,
+                                     uint64_t Address,
+                                     const void *Decoder);
+
+#include "MSP430GenDisassemblerTables.inc"
+
+static DecodeStatus DecodeCGImm(MCInst &MI, uint64_t Bits, uint64_t Address,
+                                const void *Decoder) {
+  int64_t Imm;
+  switch (Bits) {
+  default:
+    llvm_unreachable("Invalid immediate value");
+  case 0x22: Imm =  4; break;
+  case 0x32: Imm =  8; break;
+  case 0x03: Imm =  0; break;
+  case 0x13: Imm =  1; break;
+  case 0x23: Imm =  2; break;
+  case 0x33: Imm = -1; break;
+  }
+  MI.addOperand(MCOperand::createImm(Imm));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMemOperand(MCInst &MI, uint64_t Bits,
+                                     uint64_t Address,
+                                     const void *Decoder) {
+  unsigned Reg = Bits & 15;
+  unsigned Imm = Bits >> 4;
+
+  if (DecodeGR16RegisterClass(MI, Reg, Address, Decoder) !=
+      MCDisassembler::Success)
+    return MCDisassembler::Fail;
+  
+  MI.addOperand(MCOperand::createImm((int16_t)Imm));
+  return MCDisassembler::Success;
+}
+
+enum AddrMode {
+  amInvalid = 0,
+  amRegister,
+  amIndexed,
+  amIndirect,
+  amIndirectPost,
+  amSymbolic,
+  amImmediate,
+  amAbsolute,
+  amConstant
+};
+
+static AddrMode DecodeSrcAddrMode(unsigned Rs, unsigned As) {
+  switch (Rs) {
+  case 0:
+    if (As == 1) return amSymbolic;
+    if (As == 2) return amInvalid;
+    if (As == 3) return amImmediate;
+    break;
+  case 2:
+    if (As == 1) return amAbsolute;
+    if (As == 2) return amConstant;
+    if (As == 3) return amConstant;
+    break;
+  case 3:
+    return amConstant;
+  default:
+    break;
+  }
+  switch (As) {
+  case 0: return amRegister;
+  case 1: return amIndexed;
+  case 2: return amIndirect;
+  case 3: return amIndirectPost;
+  default:
+    llvm_unreachable("As out of range");
+  }
+}
+
+static AddrMode DecodeSrcAddrModeI(unsigned Insn) {
+  unsigned Rs = fieldFromInstruction(Insn, 8, 4);
+  unsigned As = fieldFromInstruction(Insn, 4, 2);
+  return DecodeSrcAddrMode(Rs, As);
+}
+
+static AddrMode DecodeSrcAddrModeII(unsigned Insn) {
+  unsigned Rs = fieldFromInstruction(Insn, 0, 4);
+  unsigned As = fieldFromInstruction(Insn, 4, 2);
+  return DecodeSrcAddrMode(Rs, As);
+}
+
+static AddrMode DecodeDstAddrMode(unsigned Insn) {
+  unsigned Rd = fieldFromInstruction(Insn, 0, 4);
+  unsigned Ad = fieldFromInstruction(Insn, 7, 1);
+  switch (Rd) {
+  case 0: return Ad ? amSymbolic : amRegister;
+  case 2: return Ad ? amAbsolute : amRegister;
+  default:
+    break;
+  }
+  return Ad ? amIndexed : amRegister;
+}
+
+static const uint8_t *getDecoderTable(AddrMode SrcAM, unsigned Words) {
+  assert(0 < Words && Words < 4 && "Incorrect number of words");
+  switch (SrcAM) {
+  default:
+    llvm_unreachable("Invalid addressing mode");
+  case amRegister:
+    assert(Words < 3 && "Incorrect number of words");
+    return Words == 2 ? DecoderTableAlpha32 : DecoderTableAlpha16;
+  case amConstant:
+    assert(Words < 3 && "Incorrect number of words");
+    return Words == 2 ? DecoderTableBeta32 : DecoderTableBeta16;
+  case amIndexed:
+  case amSymbolic:
+  case amImmediate:
+  case amAbsolute:
+    assert(Words > 1 && "Incorrect number of words");
+    return Words == 2 ? DecoderTableGamma32 : DecoderTableGamma48;
+  case amIndirect:
+  case amIndirectPost:
+    assert(Words < 3 && "Incorrect number of words");
+    return Words == 2 ? DecoderTableDelta32 : DecoderTableDelta16;
+  }
+}
+
+DecodeStatus MSP430Disassembler::getInstructionI(MCInst &MI, uint64_t &Size,
+                                                 ArrayRef<uint8_t> Bytes,
+                                                 uint64_t Address,
+                                                 raw_ostream &VStream,
+                                                 raw_ostream &CStream) const {
+  uint64_t Insn = support::endian::read16le(Bytes.data());
+  AddrMode SrcAM = DecodeSrcAddrModeI(Insn);
+  AddrMode DstAM = DecodeDstAddrMode(Insn);
+  if (SrcAM == amInvalid || DstAM == amInvalid) {
+    Size = 2; // skip one word and let disassembler to try further
+    return MCDisassembler::Fail;
+  }
+
+  unsigned Words = 1;
+  switch (SrcAM) {
+  case amIndexed:
+  case amSymbolic:
+  case amImmediate:
+  case amAbsolute:
+    Insn |= (uint64_t)support::endian::read16le(Bytes.data() + 2) << 16;
+    ++Words;
+    break;
+  default:
+    break;
+  }
+  switch (DstAM) {
+  case amIndexed:
+  case amSymbolic:
+  case amAbsolute:
+    Insn |= (uint64_t)support::endian::read16le(Bytes.data() + Words * 2)
+        << (Words * 16);
+    ++Words;
+    break;
+  default:
+    break;
+  }
+
+  DecodeStatus Result = decodeInstruction(getDecoderTable(SrcAM, Words), MI,
+                                          Insn, Address, this, STI);
+  if (Result != MCDisassembler::Fail) {
+    Size = Words * 2;
+    return Result;
+  }
+
+  Size = 2;
+  return DecodeStatus::Fail;
+}
+
+DecodeStatus MSP430Disassembler::getInstructionII(MCInst &MI, uint64_t &Size,
+                                                  ArrayRef<uint8_t> Bytes,
+                                                  uint64_t Address,
+                                                  raw_ostream &VStream,
+                                                  raw_ostream &CStream) const {
+  uint64_t Insn = support::endian::read16le(Bytes.data());
+  AddrMode SrcAM = DecodeSrcAddrModeII(Insn);
+  if (SrcAM == amInvalid) {
+    Size = 2; // skip one word and let disassembler to try further
+    return MCDisassembler::Fail;
+  }
+
+  unsigned Words = 1;
+  switch (SrcAM) {
+  case amIndexed:
+  case amSymbolic:
+  case amImmediate:
+  case amAbsolute:
+    Insn |= (uint64_t)support::endian::read16le(Bytes.data() + 2) << 16;
+    ++Words;
+    break;
+  default:
+    break;
+  }
+
+  const uint8_t *DecoderTable = Words == 2 ? DecoderTable32 : DecoderTable16;
+  DecodeStatus Result = decodeInstruction(DecoderTable, MI, Insn, Address,
+                                          this, STI);
+  if (Result != MCDisassembler::Fail) {
+    Size = Words * 2;
+    return Result;
+  }
+
+  Size = 2;
+  return DecodeStatus::Fail;
+}
+
+static MSP430CC::CondCodes getCondCode(unsigned Cond) {
+  switch (Cond) {
+  case 0: return MSP430CC::COND_NE;
+  case 1: return MSP430CC::COND_E;
+  case 2: return MSP430CC::COND_LO;
+  case 3: return MSP430CC::COND_HS;
+  case 4: return MSP430CC::COND_N;
+  case 5: return MSP430CC::COND_GE;
+  case 6: return MSP430CC::COND_L;
+  case 7: return MSP430CC::COND_NONE;
+  default:
+    llvm_unreachable("Cond out of range");
+  }
+}
+
+DecodeStatus MSP430Disassembler::getInstructionCJ(MCInst &MI, uint64_t &Size,
+                                                  ArrayRef<uint8_t> Bytes,
+                                                  uint64_t Address,
+                                                  raw_ostream &VStream,
+                                                  raw_ostream &CStream) const {
+  uint64_t Insn = support::endian::read16le(Bytes.data());
+  unsigned Cond = fieldFromInstruction(Insn, 10, 3);
+  unsigned Offset = fieldFromInstruction(Insn, 0, 10);
+
+  MI.addOperand(MCOperand::createImm(SignExtend32(Offset, 10)));
+
+  if (Cond == 7)
+    MI.setOpcode(MSP430::JMP);
+  else {
+    MI.setOpcode(MSP430::JCC);
+    MI.addOperand(MCOperand::createImm(getCondCode(Cond)));
+  }
+
+  Size = 2;
+  return DecodeStatus::Success;
+}
+
+DecodeStatus MSP430Disassembler::getInstruction(MCInst &MI, uint64_t &Size,
+                                                ArrayRef<uint8_t> Bytes,
+                                                uint64_t Address,
+                                                raw_ostream &VStream,
+                                                raw_ostream &CStream) const {
+  if (Bytes.size() < 2) {
+    Size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  uint64_t Insn = support::endian::read16le(Bytes.data());
+  unsigned Opc = fieldFromInstruction(Insn, 13, 3);
+  switch (Opc) {
+  case 0:
+    return getInstructionII(MI, Size, Bytes, Address, VStream, CStream);
+  case 1:
+    return getInstructionCJ(MI, Size, Bytes, Address, VStream, CStream);
+  default:
+    return getInstructionI(MI, Size, Bytes, Address, VStream, CStream);
+  }
+}
diff --git a/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp b/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp
index be6d1a84a37..4d62547bc65 100644
--- a/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp
+++ b/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp
@@ -16,28 +16,34 @@
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormattedStream.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "asm-printer"
 
-
 // Include the auto-generated portion of the assembly writer.
+#define PRINT_ALIAS_INSTR
 #include "MSP430GenAsmWriter.inc"
 
 void MSP430InstPrinter::printInst(const MCInst *MI, raw_ostream &O,
                                   StringRef Annot, const MCSubtargetInfo &STI) {
-  printInstruction(MI, O);
+  if (!printAliasInstr(MI, O))
+    printInstruction(MI, O);
   printAnnotation(O, Annot);
 }
 
 void MSP430InstPrinter::printPCRelImmOperand(const MCInst *MI, unsigned OpNo,
                                              raw_ostream &O) {
   const MCOperand &Op = MI->getOperand(OpNo);
-  if (Op.isImm())
-    O << Op.getImm();
-  else {
+  if (Op.isImm()) {
+    int64_t Imm = Op.getImm() * 2 + 2;
+    O << "$";
+    if (Imm >= 0)
+      O << '+';
+    O << Imm;
+  } else {
     assert(Op.isExpr() && "unknown pcrel immediate operand");
     Op.getExpr()->print(O, &MAI);
   }
@@ -72,7 +78,7 @@ void MSP430InstPrinter::printSrcMemOperand(const MCInst *MI, unsigned OpNo,
   // vs
   //   mov.w glb(r1), r2
   // Otherwise (!) msp430-as will silently miscompile the output :(
-  if (!Base.getReg())
+  if (Base.getReg() == MSP430::SR)
     O << '&';
 
   if (Disp.isExpr())
@@ -83,10 +89,23 @@ void MSP430InstPrinter::printSrcMemOperand(const MCInst *MI, unsigned OpNo,
   }
 
   // Print register base field
-  if (Base.getReg())
+  if ((Base.getReg() != MSP430::SR) &&
+      (Base.getReg() != MSP430::PC))
     O << '(' << getRegisterName(Base.getReg()) << ')';
 }
 
+void MSP430InstPrinter::printIndRegOperand(const MCInst *MI, unsigned OpNo,
+                                           raw_ostream &O) {
+  const MCOperand &Base = MI->getOperand(OpNo);
+  O << "@" << getRegisterName(Base.getReg());
+}
+
+void MSP430InstPrinter::printPostIndRegOperand(const MCInst *MI, unsigned OpNo,
+                                               raw_ostream &O) {
+  const MCOperand &Base = MI->getOperand(OpNo);
+  O << "@" << getRegisterName(Base.getReg()) << "+";
+}
+
 void MSP430InstPrinter::printCCOperand(const MCInst *MI, unsigned OpNo,
                                        raw_ostream &O) {
   unsigned CC = MI->getOperand(OpNo).getImm();
@@ -112,5 +131,8 @@ void MSP430InstPrinter::printCCOperand(const MCInst *MI, unsigned OpNo,
   case MSP430CC::COND_L:
    O << 'l';
    break;
+  case MSP430CC::COND_N:
+   O << 'n';
+   break;
   }
 }
diff --git a/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h b/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h
index 72afec18bec..cd02c4fa645 100644
--- a/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h
+++ b/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h
@@ -28,13 +28,20 @@ namespace llvm {
 
     // Autogenerated by tblgen.
     void printInstruction(const MCInst *MI, raw_ostream &O);
+    bool printAliasInstr(const MCInst *MI, raw_ostream &O);
+    void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+                                 unsigned PrintMethodIdx, raw_ostream &O);
     static const char *getRegisterName(unsigned RegNo);
 
+private:
     void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O,
                       const char *Modifier = nullptr);
     void printPCRelImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
     void printSrcMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O,
                             const char *Modifier = nullptr);
+    void printIndRegOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+    void printPostIndRegOperand(const MCInst *MI, unsigned OpNo,
+                                raw_ostream &O);
     void printCCOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
 
   };
diff --git a/lib/Target/MSP430/LLVMBuild.txt b/lib/Target/MSP430/LLVMBuild.txt
index 51d9702ac56..0cbd1851777 100644
--- a/lib/Target/MSP430/LLVMBuild.txt
+++ b/lib/Target/MSP430/LLVMBuild.txt
@@ -16,13 +16,15 @@
 ;===------------------------------------------------------------------------===;
 
 [common]
-subdirectories = InstPrinter MCTargetDesc TargetInfo
+subdirectories = AsmParser Disassembler InstPrinter MCTargetDesc TargetInfo
 
 [component_0]
 type = TargetGroup
 name = MSP430
 parent = Target
+has_asmparser = 1
 has_asmprinter = 1
+has_disassembler = 1
 
 [component_1]
 type = Library
diff --git a/lib/Target/MSP430/MCTargetDesc/CMakeLists.txt b/lib/Target/MSP430/MCTargetDesc/CMakeLists.txt
index 0f3ebd30392..a2f468779f5 100644
--- a/lib/Target/MSP430/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/MSP430/MCTargetDesc/CMakeLists.txt
@@ -1,4 +1,8 @@
 add_llvm_library(LLVMMSP430Desc
-  MSP430MCTargetDesc.cpp
+  MSP430AsmBackend.cpp
+  MSP430ELFObjectWriter.cpp
+  MSP430ELFStreamer.cpp
   MSP430MCAsmInfo.cpp
+  MSP430MCCodeEmitter.cpp
+  MSP430MCTargetDesc.cpp
   )
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp
new file mode 100644
index 00000000000..bd69a9d8d79
--- /dev/null
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp
@@ -0,0 +1,178 @@
+//===-- MSP430AsmBackend.cpp - MSP430 Assembler Backend -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/MSP430FixupKinds.h"
+#include "MCTargetDesc/MSP430MCTargetDesc.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+class MSP430AsmBackend : public MCAsmBackend {
+  uint8_t OSABI;
+
+  uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
+                            MCContext &Ctx) const;
+
+public:
+  MSP430AsmBackend(const MCSubtargetInfo &STI, uint8_t OSABI)
+      : MCAsmBackend(support::little), OSABI(OSABI) {}
+  ~MSP430AsmBackend() override {}
+
+  void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+                  const MCValue &Target, MutableArrayRef<char> Data,
+                  uint64_t Value, bool IsResolved,
+                  const MCSubtargetInfo *STI) const override;
+
+  std::unique_ptr<MCObjectTargetWriter>
+  createObjectTargetWriter() const override {
+    return createMSP430ELFObjectWriter(OSABI);
+  }
+
+  bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+                            const MCRelaxableFragment *DF,
+                            const MCAsmLayout &Layout) const override {
+    return false;
+  }
+
+  bool fixupNeedsRelaxationAdvanced(const MCFixup &Fixup, bool Resolved,
+                                    uint64_t Value,
+                                    const MCRelaxableFragment *DF,
+                                    const MCAsmLayout &Layout,
+                                    const bool WasForced) const override {
+    return false;
+  }
+
+  unsigned getNumFixupKinds() const override {
+    return MSP430::NumTargetFixupKinds;
+  }
+
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override {
+    const static MCFixupKindInfo Infos[MSP430::NumTargetFixupKinds] = {
+      // This table must be in the same order of enum in MSP430FixupKinds.h.
+      //
+      // name            offset bits flags
+      {"fixup_32",            0, 32, 0},
+      {"fixup_10_pcrel",      0, 10, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_16",            0, 16, 0},
+      {"fixup_16_pcrel",      0, 16, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_16_byte",       0, 16, 0},
+      {"fixup_16_pcrel_byte", 0, 16, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_2x_pcrel",      0, 10, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_rl_pcrel",      0, 16, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_8",             0,  8, 0},
+      {"fixup_sym_diff",      0, 32, 0},
+    };
+    static_assert((array_lengthof(Infos)) == MSP430::NumTargetFixupKinds,
+                  "Not all fixup kinds added to Infos array");
+  
+    if (Kind < FirstTargetFixupKind)
+      return MCAsmBackend::getFixupKindInfo(Kind);
+  
+    return Infos[Kind - FirstTargetFixupKind];
+  }
+
+  bool mayNeedRelaxation(const MCInst &Inst,
+                         const MCSubtargetInfo &STI) const override {
+    return false;
+  }
+
+  void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+                        MCInst &Res) const override {}
+
+  bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
+};
+
+uint64_t MSP430AsmBackend::adjustFixupValue(const MCFixup &Fixup,
+                                            uint64_t Value,
+                                            MCContext &Ctx) const {
+  unsigned Kind = Fixup.getKind();
+  switch (Kind) {
+  case MSP430::fixup_10_pcrel: {
+    if (Value & 0x1)
+      Ctx.reportError(Fixup.getLoc(), "fixup value must be 2-byte aligned");
+
+    // Offset is signed
+    int16_t Offset = Value;
+    // Jumps are in words
+    Offset >>= 1;
+    // PC points to the next instruction so decrement by one
+    --Offset;
+
+    if (Offset < -512 || Offset > 511)
+      Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
+
+    // Mask 10 bits
+    Offset &= 0x3ff;
+
+    return Offset;
+  }
+  default:
+    return Value;
+  }
+}
+
+void MSP430AsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+                                  const MCValue &Target,
+                                  MutableArrayRef<char> Data,
+                                  uint64_t Value, bool IsResolved,
+                                  const MCSubtargetInfo *STI) const {
+  Value = adjustFixupValue(Fixup, Value, Asm.getContext());
+  MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind());
+  if (!Value)
+    return; // Doesn't change encoding.
+
+  // Shift the value into position.
+  Value <<= Info.TargetOffset;
+
+  unsigned Offset = Fixup.getOffset();
+  unsigned NumBytes = alignTo(Info.TargetSize + Info.TargetOffset, 8) / 8;
+
+  assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!");
+
+  // For each byte of the fragment that the fixup touches, mask in the
+  // bits from the fixup value.
+  for (unsigned i = 0; i != NumBytes; ++i) {
+    Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
+  }
+}
+
+bool MSP430AsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
+  if ((Count % 2) != 0)
+    return false;
+
+  // The canonical nop on MSP430 is mov #0, r3
+  uint64_t NopCount = Count / 2;
+  while (NopCount--)
+    OS.write("\x03\x43", 2);
+
+  return true;
+}
+
+} // end anonymous namespace
+
+MCAsmBackend *llvm::createMSP430MCAsmBackend(const Target &T,
+                                             const MCSubtargetInfo &STI,
+                                             const MCRegisterInfo &MRI,
+                                             const MCTargetOptions &Options) {
+  return new MSP430AsmBackend(STI, ELF::ELFOSABI_STANDALONE);
+}
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp
new file mode 100644
index 00000000000..30d077b5b58
--- /dev/null
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp
@@ -0,0 +1,59 @@
+//===-- MSP430ELFObjectWriter.cpp - MSP430 ELF Writer ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/MSP430FixupKinds.h"
+#include "MCTargetDesc/MSP430MCTargetDesc.h"
+
+#include "MCTargetDesc/MSP430MCTargetDesc.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+namespace {
+class MSP430ELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+  MSP430ELFObjectWriter(uint8_t OSABI)
+    : MCELFObjectTargetWriter(false, OSABI, ELF::EM_MSP430,
+                              /*HasRelocationAddend*/ true) {}
+
+  ~MSP430ELFObjectWriter() override {}
+
+protected:
+  unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+                        const MCFixup &Fixup, bool IsPCRel) const override {
+    // Translate fixup kind to ELF relocation type.
+    switch ((unsigned)Fixup.getKind()) {
+    case FK_Data_1:                   return ELF::R_MSP430_8;
+    case FK_Data_2:                   return ELF::R_MSP430_16;
+    case FK_Data_4:                   return ELF::R_MSP430_32;
+    case MSP430::fixup_32:            return ELF::R_MSP430_32;
+    case MSP430::fixup_10_pcrel:      return ELF::R_MSP430_10_PCREL;
+    case MSP430::fixup_16:            return ELF::R_MSP430_16;
+    case MSP430::fixup_16_pcrel:      return ELF::R_MSP430_16_PCREL;
+    case MSP430::fixup_16_byte:       return ELF::R_MSP430_16_BYTE;
+    case MSP430::fixup_16_pcrel_byte: return ELF::R_MSP430_16_PCREL_BYTE;
+    case MSP430::fixup_2x_pcrel:      return ELF::R_MSP430_2X_PCREL;
+    case MSP430::fixup_rl_pcrel:      return ELF::R_MSP430_RL_PCREL;
+    case MSP430::fixup_8:             return ELF::R_MSP430_8;
+    case MSP430::fixup_sym_diff:      return ELF::R_MSP430_SYM_DIFF;
+    default:
+      llvm_unreachable("Invalid fixup kind");
+    }
+  }
+};
+} // end of anonymous namespace
+
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createMSP430ELFObjectWriter(uint8_t OSABI) {
+  return llvm::make_unique<MSP430ELFObjectWriter>(OSABI);
+}
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430ELFStreamer.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430ELFStreamer.cpp
new file mode 100644
index 00000000000..9449cb27802
--- /dev/null
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430ELFStreamer.cpp
@@ -0,0 +1,81 @@
+//===-- MSP430ELFStreamer.cpp - MSP430 ELF Target Streamer Methods --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides MSP430 specific target streamer methods.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430MCTargetDesc.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+
+using namespace llvm;
+
+namespace llvm {
+
+class MSP430TargetELFStreamer : public MCTargetStreamer {
+public:
+  MCELFStreamer &getStreamer();
+  MSP430TargetELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI);
+};
+
+// This part is for ELF object output.
+MSP430TargetELFStreamer::MSP430TargetELFStreamer(MCStreamer &S,
+                                                 const MCSubtargetInfo &STI)
+    : MCTargetStreamer(S) {
+  MCAssembler &MCA = getStreamer().getAssembler();
+  unsigned EFlags = MCA.getELFHeaderEFlags();
+  MCA.setELFHeaderEFlags(EFlags);
+
+  // Emit build attributes section according to
+  // MSP430 EABI (slaa534.pdf, part 13).
+  MCSection *AttributeSection = getStreamer().getContext().getELFSection(
+      ".MSP430.attributes", ELF::SHT_MSP430_ATTRIBUTES, 0);
+  Streamer.SwitchSection(AttributeSection);
+
+  // Format version.
+  Streamer.EmitIntValue(0x41, 1);
+  // Subsection length.
+  Streamer.EmitIntValue(22, 4);
+  // Vendor name string, zero-terminated.
+  Streamer.EmitBytes("mspabi");
+  Streamer.EmitIntValue(0, 1);
+
+  // Attribute vector scope tag. 1 stands for the entire file.
+  Streamer.EmitIntValue(1, 1);
+  // Attribute vector length.
+  Streamer.EmitIntValue(11, 4);
+  // OFBA_MSPABI_Tag_ISA(4) = 1, MSP430
+  Streamer.EmitIntValue(4, 1);
+  Streamer.EmitIntValue(1, 1);
+  // OFBA_MSPABI_Tag_Code_Model(6) = 1, Small
+  Streamer.EmitIntValue(6, 1);
+  Streamer.EmitIntValue(1, 1);
+  // OFBA_MSPABI_Tag_Data_Model(8) = 1, Small
+  Streamer.EmitIntValue(8, 1);
+  Streamer.EmitIntValue(1, 1);
+}
+
+MCELFStreamer &MSP430TargetELFStreamer::getStreamer() {
+  return static_cast<MCELFStreamer &>(Streamer);
+}
+
+MCTargetStreamer *
+createMSP430ObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
+  const Triple &TT = STI.getTargetTriple();
+  if (TT.isOSBinFormatELF())
+    return new MSP430TargetELFStreamer(S, STI);
+  return nullptr;
+}
+
+} // namespace llvm
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430FixupKinds.h b/lib/Target/MSP430/MCTargetDesc/MSP430FixupKinds.h
new file mode 100644
index 00000000000..1eb6a275942
--- /dev/null
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430FixupKinds.h
@@ -0,0 +1,53 @@
+//===-- MSP430FixupKinds.h - MSP430 Specific Fixup Entries ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MSP430_MCTARGETDESC_MSP430FIXUPKINDS_H
+#define LLVM_LIB_TARGET_MSP430_MCTARGETDESC_MSP430FIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+#undef MSP430
+
+namespace llvm {
+namespace MSP430 {
+
+// This table must be in the same order of
+// MCFixupKindInfo Infos[MSP430::NumTargetFixupKinds]
+// in MSP430AsmBackend.cpp.
+//
+enum Fixups {
+  // A 32 bit absolute fixup.
+  fixup_32 = FirstTargetFixupKind,
+  // A 10 bit PC relative fixup.
+  fixup_10_pcrel,
+  // A 16 bit absolute fixup.
+  fixup_16,
+  // A 16 bit PC relative fixup.
+  fixup_16_pcrel,
+  // A 16 bit absolute fixup for byte operations.
+  fixup_16_byte,
+  // A 16 bit PC relative fixup for command address.
+  fixup_16_pcrel_byte,
+  // A 10 bit PC relative fixup for complicated polymorphs.
+  fixup_2x_pcrel,
+  // A 16 bit relaxable fixup.
+  fixup_rl_pcrel,
+  // A 8 bit absolute fixup.
+  fixup_8,
+  // A 32 bit symbol difference fixup.
+  fixup_sym_diff,
+
+  // Marker
+  LastTargetFixupKind,
+  NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+} // end namespace MSP430
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCCodeEmitter.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430MCCodeEmitter.cpp
new file mode 100644
index 00000000000..ba9f7d7a9a5
--- /dev/null
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCCodeEmitter.cpp
@@ -0,0 +1,212 @@
+//===-- MSP430MCCodeEmitter.cpp - Convert MSP430 code to machine code -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MSP430MCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430.h"
+#include "MCTargetDesc/MSP430MCTargetDesc.h"
+#include "MCTargetDesc/MSP430FixupKinds.h"
+
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/EndianStream.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "mccodeemitter"
+
+namespace llvm {
+
+class MSP430MCCodeEmitter : public MCCodeEmitter {
+  MCContext &Ctx;
+  MCInstrInfo const &MCII;
+
+  // Offset keeps track of current word number being emitted
+  // inside a particular instruction.
+  mutable unsigned Offset;
+
+  /// TableGen'erated function for getting the binary encoding for an
+  /// instruction.
+  uint64_t getBinaryCodeForInstr(const MCInst &MI,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  /// Returns the binary encoding of operands.
+  ///
+  /// If an operand requires relocation, the relocation is recorded
+  /// and zero is returned.
+  unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+
+  unsigned getMemOpValue(const MCInst &MI, unsigned Op,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const;
+
+  unsigned getPCRelImmOpValue(const MCInst &MI, unsigned Op,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const;
+
+  unsigned getCGImmOpValue(const MCInst &MI, unsigned Op,
+                           SmallVectorImpl<MCFixup> &Fixups,
+                           const MCSubtargetInfo &STI) const;
+
+  unsigned getCCOpValue(const MCInst &MI, unsigned Op,
+                        SmallVectorImpl<MCFixup> &Fixups,
+                        const MCSubtargetInfo &STI) const;
+
+public:
+  MSP430MCCodeEmitter(MCContext &ctx, MCInstrInfo const &MCII)
+      : Ctx(ctx), MCII(MCII) {}
+
+  void encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const override;
+};
+
+void MSP430MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+  // Get byte count of instruction.
+  unsigned Size = Desc.getSize();
+
+  // Initialize fixup offset
+  Offset = 2;
+
+  uint64_t BinaryOpCode = getBinaryCodeForInstr(MI, Fixups, STI);
+  const uint16_t *Words = reinterpret_cast<uint16_t const *>(&BinaryOpCode);
+  size_t WordCount = Size / 2;
+
+  for (size_t i = 0; i < WordCount; ++i) {
+    uint16_t Word = Words[i];
+    support::endian::write(OS, Word, support::little);
+  }
+}
+
+unsigned MSP430MCCodeEmitter::getMachineOpValue(const MCInst &MI,
+                                                const MCOperand &MO,
+                                                SmallVectorImpl<MCFixup> &Fixups,
+                                                const MCSubtargetInfo &STI) const {
+  if (MO.isReg())
+    return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg());
+
+  if (MO.isImm()) {
+    Offset += 2;
+    return MO.getImm();
+  }
+
+  assert(MO.isExpr() && "Expected expr operand");
+  Fixups.push_back(MCFixup::create(Offset, MO.getExpr(),
+      static_cast<MCFixupKind>(MSP430::fixup_16_byte), MI.getLoc()));
+  Offset += 2;
+  return 0;
+}
+
+unsigned MSP430MCCodeEmitter::getMemOpValue(const MCInst &MI, unsigned Op,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  const MCOperand &MO1 = MI.getOperand(Op);
+  assert(MO1.isReg() && "Register operand expected");
+  unsigned Reg = Ctx.getRegisterInfo()->getEncodingValue(MO1.getReg());
+
+  const MCOperand &MO2 = MI.getOperand(Op + 1);
+  if (MO2.isImm()) {
+    Offset += 2;
+    return (MO2.getImm() << 4) | Reg;
+  }
+
+  assert(MO2.isExpr() && "Expr operand expected");
+  MSP430::Fixups FixupKind;
+  switch (Reg) {
+  case 0:
+    FixupKind = MSP430::fixup_16_pcrel_byte;
+    break;
+  case 2:
+    FixupKind = MSP430::fixup_16_byte;
+    break;
+  default:
+    FixupKind = MSP430::fixup_16_byte;
+    break;
+  }
+  Fixups.push_back(MCFixup::create(Offset, MO2.getExpr(),
+    static_cast<MCFixupKind>(FixupKind), MI.getLoc()));
+  Offset += 2;
+  return Reg;
+}
+
+unsigned MSP430MCCodeEmitter::getPCRelImmOpValue(const MCInst &MI, unsigned Op,
+                                                 SmallVectorImpl<MCFixup> &Fixups,
+                                                 const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(Op);
+  if (MO.isImm())
+    return MO.getImm();
+
+  assert(MO.isExpr() && "Expr operand expected");
+  Fixups.push_back(MCFixup::create(0, MO.getExpr(),
+    static_cast<MCFixupKind>(MSP430::fixup_10_pcrel), MI.getLoc()));
+  return 0;
+}
+
+unsigned MSP430MCCodeEmitter::getCGImmOpValue(const MCInst &MI, unsigned Op,
+                                              SmallVectorImpl<MCFixup> &Fixups,
+                                              const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(Op);
+  assert(MO.isImm() && "Expr operand expected");
+  
+  int64_t Imm = MO.getImm();
+  switch (Imm) {
+  default:
+    llvm_unreachable("Invalid immediate value");
+  case 4:  return 0x22;
+  case 8:  return 0x32;
+  case 0:  return 0x03;
+  case 1:  return 0x13;
+  case 2:  return 0x23;
+  case -1: return 0x33;
+  }
+}
+
+unsigned MSP430MCCodeEmitter::getCCOpValue(const MCInst &MI, unsigned Op,
+                                           SmallVectorImpl<MCFixup> &Fixups,
+                                           const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(Op);
+  assert(MO.isImm() && "Immediate operand expected");
+  switch (MO.getImm()) {
+  case MSP430CC::COND_NE: return 0;
+  case MSP430CC::COND_E:  return 1;
+  case MSP430CC::COND_LO: return 2;
+  case MSP430CC::COND_HS: return 3;
+  case MSP430CC::COND_N:  return 4;
+  case MSP430CC::COND_GE: return 5;
+  case MSP430CC::COND_L:  return 6;
+  default:
+    llvm_unreachable("Unknown condition code");
+  }
+}
+
+MCCodeEmitter *createMSP430MCCodeEmitter(const MCInstrInfo &MCII,
+                                         const MCRegisterInfo &MRI,
+                                         MCContext &Ctx) {
+  return new MSP430MCCodeEmitter(Ctx, MCII);
+}
+
+#include "MSP430GenMCCodeEmitter.inc"
+
+} // end of namespace llvm
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
index 8c715500f38..b21145d3904 100644
--- a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
@@ -58,22 +58,15 @@ static MCInstPrinter *createMSP430MCInstPrinter(const Triple &T,
 }
 
 extern "C" void LLVMInitializeMSP430TargetMC() {
-  // Register the MC asm info.
-  RegisterMCAsmInfo<MSP430MCAsmInfo> X(getTheMSP430Target());
+  Target &T = getTheMSP430Target();
 
-  // Register the MC instruction info.
-  TargetRegistry::RegisterMCInstrInfo(getTheMSP430Target(),
-                                      createMSP430MCInstrInfo);
-
-  // Register the MC register info.
-  TargetRegistry::RegisterMCRegInfo(getTheMSP430Target(),
-                                    createMSP430MCRegisterInfo);
-
-  // Register the MC subtarget info.
-  TargetRegistry::RegisterMCSubtargetInfo(getTheMSP430Target(),
-                                          createMSP430MCSubtargetInfo);
-
-  // Register the MCInstPrinter.
-  TargetRegistry::RegisterMCInstPrinter(getTheMSP430Target(),
-                                        createMSP430MCInstPrinter);
+  RegisterMCAsmInfo<MSP430MCAsmInfo> X(T);
+  TargetRegistry::RegisterMCInstrInfo(T, createMSP430MCInstrInfo);
+  TargetRegistry::RegisterMCRegInfo(T, createMSP430MCRegisterInfo);
+  TargetRegistry::RegisterMCSubtargetInfo(T, createMSP430MCSubtargetInfo);
+  TargetRegistry::RegisterMCInstPrinter(T, createMSP430MCInstPrinter);
+  TargetRegistry::RegisterMCCodeEmitter(T, createMSP430MCCodeEmitter);
+  TargetRegistry::RegisterMCAsmBackend(T, createMSP430MCAsmBackend);
+  TargetRegistry::RegisterObjectTargetStreamer(
+      T, createMSP430ObjectTargetStreamer);
 }
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h
index b901c5f0979..e484c79c9ee 100644
--- a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h
@@ -15,12 +15,39 @@
 #define LLVM_LIB_TARGET_MSP430_MCTARGETDESC_MSP430MCTARGETDESC_H
 
 #include "llvm/Support/DataTypes.h"
+#include <memory>
 
 namespace llvm {
 class Target;
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCInstrInfo;
+class MCSubtargetInfo;
+class MCRegisterInfo;
+class MCContext;
+class MCTargetOptions;
+class MCObjectTargetWriter;
+class MCStreamer;
+class MCTargetStreamer;
 
 Target &getTheMSP430Target();
 
+/// Creates a machine code emitter for MSP430.
+MCCodeEmitter *createMSP430MCCodeEmitter(const MCInstrInfo &MCII,
+                                         const MCRegisterInfo &MRI,
+                                         MCContext &Ctx);
+
+MCAsmBackend *createMSP430MCAsmBackend(const Target &T,
+                                       const MCSubtargetInfo &STI,
+                                       const MCRegisterInfo &MRI,
+                                       const MCTargetOptions &Options);
+
+MCTargetStreamer *
+createMSP430ObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI);
+
+std::unique_ptr<MCObjectTargetWriter>
+createMSP430ELFObjectWriter(uint8_t OSABI);
+
 } // End llvm namespace
 
 // Defines symbolic names for MSP430 registers.
diff --git a/lib/Target/MSP430/MSP430.h b/lib/Target/MSP430/MSP430.h
index 796f2523312..7a5314a1084 100644
--- a/lib/Target/MSP430/MSP430.h
+++ b/lib/Target/MSP430/MSP430.h
@@ -27,6 +27,8 @@ namespace MSP430CC {
     COND_LO = 3,  // aka COND_NC
     COND_GE = 4,
     COND_L  = 5,
+    COND_N  = 6,  // jump if negative
+    COND_NONE,    // unconditional
 
     COND_INVALID = -1
   };
diff --git a/lib/Target/MSP430/MSP430.td b/lib/Target/MSP430/MSP430.td
index 203864dd406..8fa99dc13dd 100644
--- a/lib/Target/MSP430/MSP430.td
+++ b/lib/Target/MSP430/MSP430.td
@@ -64,11 +64,29 @@ include "MSP430InstrInfo.td"
 
 def MSP430InstrInfo : InstrInfo;
 
+//===---------------------------------------------------------------------===//
+// Assembly Printers
+//===---------------------------------------------------------------------===//
+
+def MSP430AsmWriter : AsmWriter {
+  string AsmWriterClassName = "InstPrinter";
+}
+
+//===---------------------------------------------------------------------===//
+// Assembly Parsers
+//===---------------------------------------------------------------------===//
+
+def MSP430AsmParser : AsmParser {
+  let AllowDuplicateRegisterNames = 1;
+  let ShouldEmitMatchRegisterAltName = 1;
+}
+
 //===----------------------------------------------------------------------===//
 // Target Declaration
 //===----------------------------------------------------------------------===//
 
 def MSP430 : Target {
   let InstructionSet = MSP430InstrInfo;
+  let AssemblyParsers = [MSP430AsmParser];
 }
 
diff --git a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
index b196c013902..7a1998ad355 100644
--- a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
+++ b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
@@ -98,6 +98,7 @@ namespace {
     MSP430DAGToDAGISel(MSP430TargetMachine &TM, CodeGenOpt::Level OptLevel)
         : SelectionDAGISel(TM, OptLevel) {}
 
+  private:
     StringRef getPassName() const override {
       return "MSP430 DAG->DAG Pattern Instruction Selection";
     }
@@ -112,8 +113,9 @@ namespace {
     // Include the pieces autogenerated from the target description.
   #include "MSP430GenDAGISel.inc"
 
-  private:
+    // Main method to transform nodes into machine nodes.
     void Select(SDNode *N) override;
+
     bool tryIndexedLoad(SDNode *Op);
     bool tryIndexedBinOp(SDNode *Op, SDValue N1, SDValue N2, unsigned Opc8,
                          unsigned Opc16);
@@ -250,11 +252,9 @@ bool MSP430DAGToDAGISel::SelectAddr(SDValue N,
   if (MatchAddress(N, AM))
     return false;
 
-  EVT VT = N.getValueType();
-  if (AM.BaseType == MSP430ISelAddressMode::RegBase) {
+  if (AM.BaseType == MSP430ISelAddressMode::RegBase)
     if (!AM.Base.Reg.getNode())
-      AM.Base.Reg = CurDAG->getRegister(0, VT);
-  }
+      AM.Base.Reg = CurDAG->getRegister(MSP430::SR, MVT::i16);
 
   Base = (AM.BaseType == MSP430ISelAddressMode::FrameIndexBase)
              ? CurDAG->getTargetFrameIndex(
@@ -336,10 +336,10 @@ bool MSP430DAGToDAGISel::tryIndexedLoad(SDNode *N) {
   unsigned Opcode = 0;
   switch (VT.SimpleTy) {
   case MVT::i8:
-    Opcode = MSP430::MOV8rm_POST;
+    Opcode = MSP430::MOV8rp;
     break;
   case MVT::i16:
-    Opcode = MSP430::MOV16rm_POST;
+    Opcode = MSP430::MOV16rp;
     break;
   default:
     return false;
@@ -412,47 +412,47 @@ void MSP430DAGToDAGISel::Select(SDNode *Node) {
     break;
   case ISD::ADD:
     if (tryIndexedBinOp(Node, Node->getOperand(0), Node->getOperand(1),
-                        MSP430::ADD8rm_POST, MSP430::ADD16rm_POST))
+                        MSP430::ADD8rp, MSP430::ADD16rp))
       return;
     else if (tryIndexedBinOp(Node, Node->getOperand(1), Node->getOperand(0),
-                             MSP430::ADD8rm_POST, MSP430::ADD16rm_POST))
+                             MSP430::ADD8rp, MSP430::ADD16rp))
       return;
 
     // Other cases are autogenerated.
     break;
   case ISD::SUB:
     if (tryIndexedBinOp(Node, Node->getOperand(0), Node->getOperand(1),
-                        MSP430::SUB8rm_POST, MSP430::SUB16rm_POST))
+                        MSP430::SUB8rp, MSP430::SUB16rp))
       return;
 
     // Other cases are autogenerated.
     break;
   case ISD::AND:
     if (tryIndexedBinOp(Node, Node->getOperand(0), Node->getOperand(1),
-                        MSP430::AND8rm_POST, MSP430::AND16rm_POST))
+                        MSP430::AND8rp, MSP430::AND16rp))
       return;
     else if (tryIndexedBinOp(Node, Node->getOperand(1), Node->getOperand(0),
-                             MSP430::AND8rm_POST, MSP430::AND16rm_POST))
+                             MSP430::AND8rp, MSP430::AND16rp))
       return;
 
     // Other cases are autogenerated.
     break;
   case ISD::OR:
     if (tryIndexedBinOp(Node, Node->getOperand(0), Node->getOperand(1),
-                        MSP430::OR8rm_POST, MSP430::OR16rm_POST))
+                        MSP430::BIS8rp, MSP430::BIS16rp))
       return;
     else if (tryIndexedBinOp(Node, Node->getOperand(1), Node->getOperand(0),
-                             MSP430::OR8rm_POST, MSP430::OR16rm_POST))
+                             MSP430::BIS8rp, MSP430::BIS16rp))
       return;
 
     // Other cases are autogenerated.
     break;
   case ISD::XOR:
     if (tryIndexedBinOp(Node, Node->getOperand(0), Node->getOperand(1),
-                        MSP430::XOR8rm_POST, MSP430::XOR16rm_POST))
+                        MSP430::XOR8rp, MSP430::XOR16rp))
       return;
     else if (tryIndexedBinOp(Node, Node->getOperand(1), Node->getOperand(0),
-                             MSP430::XOR8rm_POST, MSP430::XOR16rm_POST))
+                             MSP430::XOR8rp, MSP430::XOR16rp))
       return;
 
     // Other cases are autogenerated.
diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp
index f5b2bda5d1e..ac93d7efc2b 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -940,18 +940,7 @@ SDValue MSP430TargetLowering::LowerShifts(SDValue Op,
 
   // Expand non-constant shifts to loops:
   if (!isa<ConstantSDNode>(N->getOperand(1)))
-    switch (Opc) {
-    default: llvm_unreachable("Invalid shift opcode!");
-    case ISD::SHL:
-      return DAG.getNode(MSP430ISD::SHL, dl,
-                         VT, N->getOperand(0), N->getOperand(1));
-    case ISD::SRA:
-      return DAG.getNode(MSP430ISD::SRA, dl,
-                         VT, N->getOperand(0), N->getOperand(1));
-    case ISD::SRL:
-      return DAG.getNode(MSP430ISD::SRL, dl,
-                         VT, N->getOperand(0), N->getOperand(1));
-    }
+    return Op;
 
   uint64_t ShiftAmount = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
 
@@ -963,7 +952,7 @@ SDValue MSP430TargetLowering::LowerShifts(SDValue Op,
   if (Opc == ISD::SRL && ShiftAmount) {
     // Emit a special goodness here:
     // srl A, 1 => clrc; rrc A
-    Victim = DAG.getNode(MSP430ISD::RRC, dl, VT, Victim);
+    Victim = DAG.getNode(MSP430ISD::RRCL, dl, VT, Victim);
     ShiftAmount -= 1;
   }
 
@@ -1342,15 +1331,14 @@ const char *MSP430TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case MSP430ISD::RRA:                return "MSP430ISD::RRA";
   case MSP430ISD::RLA:                return "MSP430ISD::RLA";
   case MSP430ISD::RRC:                return "MSP430ISD::RRC";
+  case MSP430ISD::RRCL:               return "MSP430ISD::RRCL";
   case MSP430ISD::CALL:               return "MSP430ISD::CALL";
   case MSP430ISD::Wrapper:            return "MSP430ISD::Wrapper";
   case MSP430ISD::BR_CC:              return "MSP430ISD::BR_CC";
   case MSP430ISD::CMP:                return "MSP430ISD::CMP";
   case MSP430ISD::SETCC:              return "MSP430ISD::SETCC";
   case MSP430ISD::SELECT_CC:          return "MSP430ISD::SELECT_CC";
-  case MSP430ISD::SHL:                return "MSP430ISD::SHL";
-  case MSP430ISD::SRA:                return "MSP430ISD::SRA";
-  case MSP430ISD::SRL:                return "MSP430ISD::SRL";
+  case MSP430ISD::DADD:               return "MSP430ISD::DADD";
   }
   return nullptr;
 }
@@ -1397,33 +1385,49 @@ MSP430TargetLowering::EmitShiftInstr(MachineInstr &MI,
   const TargetInstrInfo &TII = *F->getSubtarget().getInstrInfo();
 
   unsigned Opc;
+  bool ClearCarry = false;
   const TargetRegisterClass * RC;
   switch (MI.getOpcode()) {
   default: llvm_unreachable("Invalid shift opcode!");
   case MSP430::Shl8:
-   Opc = MSP430::SHL8r1;
-   RC = &MSP430::GR8RegClass;
-   break;
+    Opc = MSP430::ADD8rr;
+    RC = &MSP430::GR8RegClass;
+    break;
   case MSP430::Shl16:
-   Opc = MSP430::SHL16r1;
-   RC = &MSP430::GR16RegClass;
-   break;
+    Opc = MSP430::ADD16rr;
+    RC = &MSP430::GR16RegClass;
+    break;
   case MSP430::Sra8:
-   Opc = MSP430::SAR8r1;
-   RC = &MSP430::GR8RegClass;
-   break;
+    Opc = MSP430::RRA8r;
+    RC = &MSP430::GR8RegClass;
+    break;
   case MSP430::Sra16:
-   Opc = MSP430::SAR16r1;
-   RC = &MSP430::GR16RegClass;
-   break;
+    Opc = MSP430::RRA16r;
+    RC = &MSP430::GR16RegClass;
+    break;
   case MSP430::Srl8:
-   Opc = MSP430::SAR8r1c;
-   RC = &MSP430::GR8RegClass;
-   break;
+    ClearCarry = true;
+    Opc = MSP430::RRC8r;
+    RC = &MSP430::GR8RegClass;
+    break;
   case MSP430::Srl16:
-   Opc = MSP430::SAR16r1c;
-   RC = &MSP430::GR16RegClass;
-   break;
+    ClearCarry = true;
+    Opc = MSP430::RRC16r;
+    RC = &MSP430::GR16RegClass;
+    break;
+  case MSP430::Rrcl8:
+  case MSP430::Rrcl16: {
+    BuildMI(*BB, MI, dl, TII.get(MSP430::BIC16rc), MSP430::SR)
+      .addReg(MSP430::SR).addImm(1);
+    unsigned SrcReg = MI.getOperand(1).getReg();
+    unsigned DstReg = MI.getOperand(0).getReg();
+    unsigned RrcOpc = MI.getOpcode() == MSP430::Rrcl16
+                    ? MSP430::RRC16r : MSP430::RRC8r;
+    BuildMI(*BB, MI, dl, TII.get(RrcOpc), DstReg)
+      .addReg(SrcReg);
+    MI.eraseFromParent(); // The pseudo instruction is gone now.
+    return BB;
+  }
   }
 
   const BasicBlock *LLVM_BB = BB->getBasicBlock();
@@ -1476,8 +1480,16 @@ MSP430TargetLowering::EmitShiftInstr(MachineInstr &MI,
   BuildMI(LoopBB, dl, TII.get(MSP430::PHI), ShiftAmtReg)
     .addReg(ShiftAmtSrcReg).addMBB(BB)
     .addReg(ShiftAmtReg2).addMBB(LoopBB);
-  BuildMI(LoopBB, dl, TII.get(Opc), ShiftReg2)
-    .addReg(ShiftReg);
+  if (ClearCarry)
+    BuildMI(LoopBB, dl, TII.get(MSP430::BIC16rc), MSP430::SR)
+      .addReg(MSP430::SR).addImm(1);
+  if (Opc == MSP430::ADD8rr || Opc == MSP430::ADD16rr)
+    BuildMI(LoopBB, dl, TII.get(Opc), ShiftReg2)
+      .addReg(ShiftReg)
+      .addReg(ShiftReg);
+  else
+    BuildMI(LoopBB, dl, TII.get(Opc), ShiftReg2)
+      .addReg(ShiftReg);
   BuildMI(LoopBB, dl, TII.get(MSP430::SUB8ri), ShiftAmtReg2)
     .addReg(ShiftAmtReg).addImm(1);
   BuildMI(LoopBB, dl, TII.get(MSP430::JCC))
@@ -1499,9 +1511,10 @@ MSP430TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                                   MachineBasicBlock *BB) const {
   unsigned Opc = MI.getOpcode();
 
-  if (Opc == MSP430::Shl8 || Opc == MSP430::Shl16 ||
-      Opc == MSP430::Sra8 || Opc == MSP430::Sra16 ||
-      Opc == MSP430::Srl8 || Opc == MSP430::Srl16)
+  if (Opc == MSP430::Shl8  || Opc == MSP430::Shl16 ||
+      Opc == MSP430::Sra8  || Opc == MSP430::Sra16 ||
+      Opc == MSP430::Srl8  || Opc == MSP430::Srl16 ||
+      Opc == MSP430::Rrcl8 || Opc == MSP430::Rrcl16)
     return EmitShiftInstr(MI, BB);
 
   const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
diff --git a/lib/Target/MSP430/MSP430ISelLowering.h b/lib/Target/MSP430/MSP430ISelLowering.h
index 842d03df32f..731bc140671 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.h
+++ b/lib/Target/MSP430/MSP430ISelLowering.h
@@ -36,6 +36,9 @@ namespace llvm {
       /// Y = RRC X, rotate right via carry
       RRC,
 
+      /// Rotate right via carry, carry gets cleared beforehand by clrc
+      RRCL,
+
       /// CALL - These operations represent an abstract call
       /// instruction, which includes a bunch of information.
       CALL,
@@ -61,8 +64,9 @@ namespace llvm {
       /// is condition code and operand 4 is flag operand.
       SELECT_CC,
 
-      /// SHL, SRA, SRL - Non-constant shifts.
-      SHL, SRA, SRL
+      /// DADD - Decimal addition with carry
+      /// TODO Nothing generates a node of this type yet.
+      DADD,
     };
   }
 
diff --git a/lib/Target/MSP430/MSP430InstrFormats.td b/lib/Target/MSP430/MSP430InstrFormats.td
index a9e87dad0cd..e2e4503db20 100644
--- a/lib/Target/MSP430/MSP430InstrFormats.td
+++ b/lib/Target/MSP430/MSP430InstrFormats.td
@@ -11,201 +11,431 @@
 //  Describe MSP430 instructions format here
 //
 
-// Format specifies the encoding used by the instruction.  This is part of the
-// ad-hoc solution used to emit machine instruction encodings by our machine
-// code emitter.
-class Format<bits<2> val> {
-  bits<2> Value = val;
-}
-
-def PseudoFrm   : Format<0>;
-def SingleOpFrm : Format<1>;
-def DoubleOpFrm : Format<2>;
-def CondJumpFrm : Format<3>;
-
 class SourceMode<bits<2> val> {
   bits<2> Value = val;
 }
 
-def SrcReg      : SourceMode<0>;
-def SrcMem      : SourceMode<1>;
-def SrcIndReg   : SourceMode<2>;
-def SrcPostInc  : SourceMode<3>;
-def SrcImm      : SourceMode<3>;
+def SrcReg      : SourceMode<0>; // r
+def SrcMem      : SourceMode<1>; // m
+def SrcIndReg   : SourceMode<2>; // n
+def SrcPostInc  : SourceMode<3>; // p
+def SrcImm      : SourceMode<3>; // i
+//  SrcCGImm    : SourceMode< >; // c
 
 class DestMode<bit val> {
   bit Value = val;
 }
 
-def DstReg      : DestMode<0>;
-def DstMem      : DestMode<1>;
-
-class SizeVal<bits<3> val> {
-  bits<3> Value = val;
-}
-
-def SizeUnknown : SizeVal<0>; // Unknown / unset size
-def SizeSpecial : SizeVal<1>; // Special instruction, e.g. pseudo
-def Size2Bytes  : SizeVal<2>;
-def Size4Bytes  : SizeVal<3>;
-def Size6Bytes  : SizeVal<4>;
+def DstReg      : DestMode<0>;   // r
+def DstMem      : DestMode<1>;   // m
 
 // Generic MSP430 Format
-class MSP430Inst<dag outs, dag ins, SizeVal sz, Format f,
-                 string asmstr> : Instruction {
-  field bits<16> Inst;
+class MSP430Inst<dag outs, dag ins, int size, string asmstr> : Instruction {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
 
   let Namespace = "MSP430";
 
   dag OutOperandList = outs;
   dag InOperandList  = ins;
 
-  Format Form = f;
-  SizeVal Sz = sz;
-
-  // Define how we want to layout our TargetSpecific information field... This
-  // should be kept up-to-date with the fields in the MSP430InstrInfo.h file.
-  let TSFlags{1-0} = Form.Value;
-  let TSFlags{4-2} = Sz.Value;
-
-  let AsmString   = asmstr;
+  let AsmString = asmstr;
+  let Size = size;
 }
 
-// FIXME: Create different classes for different addressing modes.
-
 // MSP430 Double Operand (Format I) Instructions
-class IForm<bits<4> opcode, DestMode dest, bit bw, SourceMode src, SizeVal sz,
+class IForm<bits<4> opcode, DestMode ad, bit bw, SourceMode as, int size,
             dag outs, dag ins, string asmstr, list<dag> pattern>
-  : MSP430Inst<outs, ins, sz, DoubleOpFrm, asmstr> {
+  : MSP430Inst<outs, ins, size, asmstr> {
   let Pattern = pattern;
 
-  DestMode ad = dest;
-  SourceMode as = src;
-  
-  let Inst{12-15} = opcode;
+  bits<4> rs;
+  bits<4> rd;
+
+  let Inst{15-12} = opcode;
+  let Inst{11-8}  = rs;
   let Inst{7}     = ad.Value;
   let Inst{6}     = bw;
-  let Inst{4-5}   = as.Value;
+  let Inst{5-4}   = as.Value;
+  let Inst{3-0}   = rd;
 }
 
 // 8 bit IForm instructions
-class IForm8<bits<4> opcode, DestMode dest, SourceMode src, SizeVal sz,
+class IForm8<bits<4> opcode, DestMode dest, SourceMode src, int size,
              dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IForm<opcode, dest, 1, src, sz, outs, ins, asmstr, pattern>;
+  : IForm<opcode, dest, 1, src, size, outs, ins, asmstr, pattern>;
 
 class I8rr<bits<4> opcode,
            dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IForm8<opcode, DstReg, SrcReg, Size2Bytes, outs, ins, asmstr, pattern>;
+  : IForm8<opcode, DstReg, SrcReg, 2, outs, ins, asmstr, pattern> {
+  let DecoderNamespace = "Alpha";
+}
 
 class I8ri<bits<4> opcode,
            dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IForm8<opcode, DstReg, SrcImm, Size4Bytes, outs, ins, asmstr, pattern>;
+  : IForm8<opcode, DstReg, SrcImm, 4, outs, ins, asmstr, pattern> {
+  let DecoderNamespace = "Gamma";
+  bits<16> imm;
+  let Inst{31-16} = imm;
+  let rs = 0b0000;
+}
+
+class I8rc<bits<4> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : MSP430Inst<outs, ins, 2, asmstr> {
+  let DecoderNamespace = "Beta";
+  let Pattern = pattern;
+
+  bits<6> imm;
+  bits<4> rd;
+
+  let Inst{15-12} = opcode;
+  let Inst{11-8}  = imm{3-0};
+  let Inst{7}     = DstReg.Value;
+  let Inst{6}     = 1;
+  let Inst{5-4}   = imm{5-4};
+  let Inst{3-0}   = rd;
+}
 
 class I8rm<bits<4> opcode,
            dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IForm8<opcode, DstReg, SrcMem, Size4Bytes, outs, ins, asmstr, pattern>;
+  : IForm8<opcode, DstReg, SrcMem, 4, outs, ins, asmstr, pattern> {
+  let DecoderNamespace = "Gamma";
+  bits<20> src;
+  let rs = src{3-0};
+  let Inst{31-16} = src{19-4};
+}
+
+class I8rn<bits<4> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm8<opcode, DstReg, SrcIndReg, 2, outs, ins, asmstr, pattern> {
+  let DecoderNamespace = "Delta";
+}
+
+class I8rp<bits<4> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm8<opcode, DstReg, SrcPostInc, 2, outs, ins, asmstr, pattern> {
+  let DecoderNamespace = "Delta";
+}
 
 class I8mr<bits<4> opcode,
            dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IForm8<opcode, DstMem, SrcReg, Size4Bytes, outs, ins, asmstr, pattern>;
+  : IForm8<opcode, DstMem, SrcReg, 4, outs, ins, asmstr, pattern> {
+  let DecoderNamespace = "Alpha";
+  bits<20> dst;
+  let rd = dst{3-0};
+  let Inst{31-16} = dst{19-4};
+}
 
 class I8mi<bits<4> opcode,
            dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IForm8<opcode, DstMem, SrcImm, Size6Bytes, outs, ins, asmstr, pattern>;
+  : IForm8<opcode, DstMem, SrcImm, 6, outs, ins, asmstr, pattern> {
+  let DecoderNamespace = "Gamma";
+  bits<16> imm;
+  bits<20> dst;
+  let rs = 0b0000;
+  let Inst{31-16} = imm;
+  let rd = dst{3-0};
+  let Inst{47-32} = dst{19-4};
+}
+
+class I8mc<bits<4> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : MSP430Inst<outs, ins, 4, asmstr> {
+  let DecoderNamespace = "Beta";
+  let Pattern = pattern;
+
+  bits<6> imm;
+  bits<20> dst;
+
+  let Inst{31-16} = dst{19-4};
+  let Inst{15-12} = opcode;
+  let Inst{11-8}  = imm{3-0};
+  let Inst{7}     = DstMem.Value;
+  let Inst{6}     = 1;
+  let Inst{5-4}   = imm{5-4};
+  let Inst{3-0}   = dst{3-0};
+}
 
 class I8mm<bits<4> opcode,
            dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IForm8<opcode, DstMem, SrcMem, Size6Bytes, outs, ins, asmstr, pattern>;
+  : IForm8<opcode, DstMem, SrcMem, 6, outs, ins, asmstr, pattern> {
+  let DecoderNamespace = "Gamma";
+  bits<20> src;
+  bits<20> dst;
+  let rs = src{3-0};
+  let Inst{31-16} = src{19-4};
+  let rd = dst{3-0};
+  let Inst{47-32} = dst{19-4};
+}
+
+class I8mn<bits<4> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm8<opcode, DstMem, SrcIndReg, 4, outs, ins, asmstr, pattern> {
+  let DecoderNamespace = "Delta";
+  bits<20> dst;
+  let rd = dst{3-0};
+  let Inst{31-16} = dst{19-4};
+}
+
+class I8mp<bits<4> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm8<opcode, DstMem, SrcPostInc, 4, outs, ins, asmstr, pattern> {
+  let DecoderNamespace = "Delta";
+  bits<20> dst;
+  let rd = dst{3-0};
+  let Inst{31-16} = dst{19-4};
+}
 
 // 16 bit IForm instructions
-class IForm16<bits<4> opcode, DestMode dest, SourceMode src, SizeVal sz,
+class IForm16<bits<4> opcode, DestMode dest, SourceMode src, int size,
               dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IForm<opcode, dest, 0, src, sz, outs, ins, asmstr, pattern>;
+  : IForm<opcode, dest, 0, src, size, outs, ins, asmstr, pattern>;
 
 class I16rr<bits<4> opcode,
             dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IForm16<opcode, DstReg, SrcReg, Size2Bytes, outs, ins, asmstr, pattern>;
+  : IForm16<opcode, DstReg, SrcReg, 2, outs, ins, asmstr, pattern> {
+  let DecoderNamespace = "Alpha";
+}
 
 class I16ri<bits<4> opcode,
             dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IForm16<opcode, DstReg, SrcImm, Size4Bytes, outs, ins, asmstr, pattern>;
+  : IForm16<opcode, DstReg, SrcImm, 4, outs, ins, asmstr, pattern> {
+  let DecoderNamespace = "Gamma";
+  bits<16> imm;
+  let Inst{31-16} = imm;
+  let rs = 0b0000;
+}
+
+class I16rc<bits<4> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : MSP430Inst<outs, ins, 2, asmstr> {
+  let DecoderNamespace = "Beta";
+  let Pattern = pattern;
+
+  bits<6> imm;
+  bits<4> rd;
+
+  let Inst{15-12} = opcode;
+  let Inst{11-8}  = imm{3-0};
+  let Inst{7}     = DstReg.Value;
+  let Inst{6}     = 0;
+  let Inst{5-4}   = imm{5-4};
+  let Inst{3-0}   = rd;
+}
 
 class I16rm<bits<4> opcode,
             dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IForm16<opcode, DstReg, SrcMem, Size4Bytes, outs, ins, asmstr, pattern>;
+  : IForm16<opcode, DstReg, SrcMem, 4, outs, ins, asmstr, pattern> {
+  let DecoderNamespace = "Gamma";
+  bits<20> src;
+  let rs = src{3-0};
+  let Inst{31-16} = src{19-4};
+}
+
+class I16rn<bits<4> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm16<opcode, DstReg, SrcIndReg, 2, outs, ins, asmstr, pattern> {
+  let DecoderNamespace = "Delta";
+}
+
+class I16rp<bits<4> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm16<opcode, DstReg, SrcPostInc, 2, outs, ins, asmstr, pattern> {
+  let DecoderNamespace = "Delta";
+}
 
 class I16mr<bits<4> opcode,
             dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IForm16<opcode, DstMem, SrcReg, Size4Bytes, outs, ins, asmstr, pattern>;
+  : IForm16<opcode, DstMem, SrcReg, 4, outs, ins, asmstr, pattern> {
+  let DecoderNamespace = "Alpha";
+  bits<20> dst;
+  let rd = dst{3-0};
+  let Inst{31-16} = dst{19-4};
+}
 
 class I16mi<bits<4> opcode,
             dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IForm16<opcode, DstMem, SrcImm, Size6Bytes, outs, ins, asmstr, pattern>;
+  : IForm16<opcode, DstMem, SrcImm, 6, outs, ins, asmstr, pattern> {
+  let DecoderNamespace = "Gamma";
+  bits<16> imm;
+  bits<20> dst;
+  let Inst{31-16} = imm;
+  let rs = 0b0000;
+  let rd = dst{3-0};
+  let Inst{47-32} = dst{19-4};
+}
+
+class I16mc<bits<4> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : MSP430Inst<outs, ins, 4, asmstr> {
+  let DecoderNamespace = "Beta";
+  let Pattern = pattern;
+
+  bits<6> imm;
+  bits<20> dst;
+
+  let Inst{31-16} = dst{19-4};
+  let Inst{15-12} = opcode;
+  let Inst{11-8}  = imm{3-0};
+  let Inst{7}     = DstMem.Value;
+  let Inst{6}     = 0;
+  let Inst{5-4}   = imm{5-4};
+  let Inst{3-0}   = dst{3-0};
+}
 
 class I16mm<bits<4> opcode,
             dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IForm16<opcode, DstMem, SrcMem, Size6Bytes, outs, ins, asmstr, pattern>;
+  : IForm16<opcode, DstMem, SrcMem, 6, outs, ins, asmstr, pattern> {
+  let DecoderNamespace = "Gamma";
+  bits<20> src;
+  bits<20> dst;
+  let rs = src{3-0};
+  let Inst{31-16} = src{19-4};
+  let rd = dst{3-0};
+  let Inst{47-32} = dst{19-4};
+}
+
+class I16mn<bits<4> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm16<opcode, DstMem, SrcIndReg, 4, outs, ins, asmstr, pattern> {
+  let DecoderNamespace = "Delta";
+  bits<20> dst;
+  let rd = dst{3-0};
+  let Inst{31-16} = dst{19-4};
+}
+
+class I16mp<bits<4> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm16<opcode, DstMem, SrcPostInc, 4, outs, ins, asmstr, pattern> {
+  let DecoderNamespace = "Delta";
+  bits<20> dst;
+  let rd = dst{3-0};
+  let Inst{31-16} = dst{19-4};
+}
 
 // MSP430 Single Operand (Format II) Instructions
-class IIForm<bits<9> opcode, bit bw, SourceMode src, SizeVal sz,
+class IIForm<bits<3> opcode, bit bw, SourceMode as, int size,
              dag outs, dag ins, string asmstr, list<dag> pattern>
-  : MSP430Inst<outs, ins, sz, SingleOpFrm, asmstr> {
+  : MSP430Inst<outs, ins, size, asmstr> {
   let Pattern = pattern;
-  
-  SourceMode as = src;
 
-  let Inst{7-15} = opcode;
-  let Inst{6}    = bw;
-  let Inst{4-5}  = as.Value;
+  bits<4> rs;
+
+  let Inst{15-10} = 0b000100;
+  let Inst{9-7}   = opcode;
+  let Inst{6}     = bw;
+  let Inst{5-4}   = as.Value;
+  let Inst{3-0}   = rs;
 }
 
 // 8 bit IIForm instructions
-class IIForm8<bits<9> opcode, SourceMode src, SizeVal sz,
+class IIForm8<bits<3> opcode, SourceMode src, int size,
               dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IIForm<opcode, 1, src, sz, outs, ins, asmstr, pattern>;
+  : IIForm<opcode, 1, src, size, outs, ins, asmstr, pattern>;
+
+class II8r<bits<3> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IIForm8<opcode, SrcReg, 2, outs, ins, asmstr, pattern>;
 
-class II8r<bits<9> opcode,
+class II8m<bits<3> opcode,
            dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IIForm8<opcode, SrcReg, Size2Bytes, outs, ins, asmstr, pattern>;
+  : IIForm8<opcode, SrcMem, 4, outs, ins, asmstr, pattern> {
+  bits<20> src;
+  let rs = src{3-0};
+  let Inst{31-16} = src{19-4};
+}
 
-class II8m<bits<9> opcode,
+class II8i<bits<3> opcode,
            dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IIForm8<opcode, SrcMem, Size4Bytes, outs, ins, asmstr, pattern>;
+  : IIForm8<opcode, SrcImm, 4, outs, ins, asmstr, pattern> {
+  bits<16> imm;
+  let rs = 0b0000;
+  let Inst{31-16} = imm;
+}
 
-class II8i<bits<9> opcode,
+class II8c<bits<3> opcode,
            dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IIForm8<opcode, SrcImm, Size4Bytes, outs, ins, asmstr, pattern>;
+  : MSP430Inst<outs, ins, 2, asmstr> {
+  let Pattern = pattern;
+
+  bits<6> imm;
+
+  let Inst{15-10} = 0b000100;
+  let Inst{9-7}   = opcode;
+  let Inst{6}     = 1;
+  let Inst{5-0}   = imm;
+}
+
+class II8n<bits<3> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IIForm8<opcode, SrcIndReg, 2, outs, ins, asmstr, pattern>;
+
+class II8p<bits<3> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IIForm8<opcode, SrcPostInc, 2, outs, ins, asmstr, pattern>;
 
 // 16 bit IIForm instructions
-class IIForm16<bits<9> opcode, SourceMode src, SizeVal sz,
+class IIForm16<bits<3> opcode, SourceMode src, int size,
                dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IIForm<opcode, 0, src, sz, outs, ins, asmstr, pattern>;
+  : IIForm<opcode, 0, src, size, outs, ins, asmstr, pattern>;
 
-class II16r<bits<9> opcode,
+class II16r<bits<3> opcode,
             dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IIForm16<opcode, SrcReg, Size2Bytes, outs, ins, asmstr, pattern>;
+  : IIForm16<opcode, SrcReg, 2, outs, ins, asmstr, pattern>;
 
-class II16m<bits<9> opcode,
+class II16m<bits<3> opcode,
             dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IIForm16<opcode, SrcMem, Size4Bytes, outs, ins, asmstr, pattern>;
+  : IIForm16<opcode, SrcMem, 4, outs, ins, asmstr, pattern> {
+  bits<20> src;
+  let rs = src{3-0};
+  let Inst{31-16} = src{19-4};
+}
 
-class II16i<bits<9> opcode,
+class II16i<bits<3> opcode,
             dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IIForm16<opcode, SrcImm, Size4Bytes, outs, ins, asmstr, pattern>;
+  : IIForm16<opcode, SrcImm, 4, outs, ins, asmstr, pattern> {
+  bits<16> imm;
+  let rs = 0b0000;
+  let Inst{31-16} = imm;
+}
+
+class II16c<bits<3> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : MSP430Inst<outs, ins, 2, asmstr> {
+  let Pattern = pattern;
+
+  bits<6> imm;
+
+  let Inst{15-10} = 0b000100;
+  let Inst{9-7}   = opcode;
+  let Inst{6}     = 0;
+  let Inst{5-0}   = imm;
+}
+
+class II16n<bits<3> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IIForm16<opcode, SrcIndReg, 2, outs, ins, asmstr, pattern>;
+
+class II16p<bits<3> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IIForm16<opcode, SrcPostInc, 2, outs, ins, asmstr, pattern>;
 
 // MSP430 Conditional Jumps Instructions
-class CJForm<bits<3> opcode, bits<3> cond,
-             dag outs, dag ins, string asmstr, list<dag> pattern>
-  : MSP430Inst<outs, ins, Size2Bytes, CondJumpFrm, asmstr> {
+class CJForm<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : MSP430Inst<outs, ins, 2, asmstr> {
   let Pattern = pattern;
   
-  let Inst{13-15} = opcode;
-  let Inst{10-12} = cond;
+  bits<3> cond;
+  bits<10> dst;
+
+  let Inst{15-13} = 0b001;
+  let Inst{12-10} = cond;
+  let Inst{9-0} = dst;
 }
 
 // Pseudo instructions
 class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
-  : MSP430Inst<outs, ins, SizeSpecial, PseudoFrm, asmstr> {
+  : MSP430Inst<outs, ins, 0, asmstr> {
   let Pattern = pattern;
-  let Inst{15-0} = 0;
 }
diff --git a/lib/Target/MSP430/MSP430InstrInfo.cpp b/lib/Target/MSP430/MSP430InstrInfo.cpp
index dd1b30a3e47..c136933a51b 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.cpp
+++ b/lib/Target/MSP430/MSP430InstrInfo.cpp
@@ -301,35 +301,20 @@ unsigned MSP430InstrInfo::insertBranch(MachineBasicBlock &MBB,
 unsigned MSP430InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
   const MCInstrDesc &Desc = MI.getDesc();
 
-  switch (Desc.TSFlags & MSP430II::SizeMask) {
-  default:
-    switch (Desc.getOpcode()) {
-    default: llvm_unreachable("Unknown instruction size!");
-    case TargetOpcode::CFI_INSTRUCTION:
-    case TargetOpcode::EH_LABEL:
-    case TargetOpcode::IMPLICIT_DEF:
-    case TargetOpcode::KILL:
-    case TargetOpcode::DBG_VALUE:
-      return 0;
-    case TargetOpcode::INLINEASM: {
-      const MachineFunction *MF = MI.getParent()->getParent();
-      const TargetInstrInfo &TII = *MF->getSubtarget().getInstrInfo();
-      return TII.getInlineAsmLength(MI.getOperand(0).getSymbolName(),
-                                    *MF->getTarget().getMCAsmInfo());
-    }
-    }
-  case MSP430II::SizeSpecial:
-    switch (MI.getOpcode()) {
-    default: llvm_unreachable("Unknown instruction size!");
-    case MSP430::SAR8r1c:
-    case MSP430::SAR16r1c:
-      return 4;
-    }
-  case MSP430II::Size2Bytes:
-    return 2;
-  case MSP430II::Size4Bytes:
-    return 4;
-  case MSP430II::Size6Bytes:
-    return 6;
+  switch (Desc.getOpcode()) {
+  case TargetOpcode::CFI_INSTRUCTION:
+  case TargetOpcode::EH_LABEL:
+  case TargetOpcode::IMPLICIT_DEF:
+  case TargetOpcode::KILL:
+  case TargetOpcode::DBG_VALUE:
+    return 0;
+  case TargetOpcode::INLINEASM: {
+    const MachineFunction *MF = MI.getParent()->getParent();
+    const TargetInstrInfo &TII = *MF->getSubtarget().getInstrInfo();
+    return TII.getInlineAsmLength(MI.getOperand(0).getSymbolName(),
+                                  *MF->getTarget().getMCAsmInfo());
   }
+  }
+
+  return Desc.getSize();
 }
diff --git a/lib/Target/MSP430/MSP430InstrInfo.h b/lib/Target/MSP430/MSP430InstrInfo.h
index 45357f54c9c..fee3bea9b8d 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.h
+++ b/lib/Target/MSP430/MSP430InstrInfo.h
@@ -24,22 +24,6 @@ namespace llvm {
 
 class MSP430Subtarget;
 
-/// MSP430II - This namespace holds all of the target specific flags that
-/// instruction info tracks.
-///
-namespace MSP430II {
-  enum {
-    SizeShift   = 2,
-    SizeMask    = 7 << SizeShift,
-
-    SizeUnknown = 0 << SizeShift,
-    SizeSpecial = 1 << SizeShift,
-    Size2Bytes  = 2 << SizeShift,
-    Size4Bytes  = 3 << SizeShift,
-    Size6Bytes  = 4 << SizeShift
-  };
-}
-
 class MSP430InstrInfo : public MSP430GenInstrInfo {
   const MSP430RegisterInfo RI;
   virtual void anchor();
diff --git a/lib/Target/MSP430/MSP430InstrInfo.td b/lib/Target/MSP430/MSP430InstrInfo.td
index cec43040f60..3ed17374a2d 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.td
+++ b/lib/Target/MSP430/MSP430InstrInfo.td
@@ -34,8 +34,9 @@ def SDT_MSP430BrCC         : SDTypeProfile<0, 2, [SDTCisVT<0, OtherVT>,
 def SDT_MSP430SelectCC     : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>,
                                                   SDTCisSameAs<1, 2>, 
                                                   SDTCisVT<3, i8>]>;
-def SDT_MSP430Shift        : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>,
-                                                  SDTCisI8<2>]>;
+def SDT_MSP430DAdd         : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>,
+                                                  SDTCisSameAs<0, 2>,
+                                                  SDTCisInt<0>]>;
 
 //===----------------------------------------------------------------------===//
 // MSP430 Specific Node Definitions.
@@ -48,6 +49,7 @@ def MSP430retiflag : SDNode<"MSP430ISD::RETI_FLAG", SDTNone,
 def MSP430rra     : SDNode<"MSP430ISD::RRA", SDTIntUnaryOp, []>;
 def MSP430rla     : SDNode<"MSP430ISD::RLA", SDTIntUnaryOp, []>;
 def MSP430rrc     : SDNode<"MSP430ISD::RRC", SDTIntUnaryOp, []>;
+def MSP430rrcl    : SDNode<"MSP430ISD::RRCL", SDTIntUnaryOp, []>;
 
 def MSP430call    : SDNode<"MSP430ISD::CALL", SDT_MSP430Call,
                      [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, SDNPVariadic]>;
@@ -63,33 +65,88 @@ def MSP430brcc    : SDNode<"MSP430ISD::BR_CC", SDT_MSP430BrCC,
                             [SDNPHasChain, SDNPInGlue]>;
 def MSP430selectcc: SDNode<"MSP430ISD::SELECT_CC", SDT_MSP430SelectCC,
                             [SDNPInGlue]>;
-def MSP430shl     : SDNode<"MSP430ISD::SHL", SDT_MSP430Shift, []>;
-def MSP430sra     : SDNode<"MSP430ISD::SRA", SDT_MSP430Shift, []>;
-def MSP430srl     : SDNode<"MSP430ISD::SRL", SDT_MSP430Shift, []>;
+def MSP430dadd    : SDNode<"MSP430ISD::DADD", SDT_MSP430DAdd, []>;
 
 //===----------------------------------------------------------------------===//
 // MSP430 Operand Definitions.
 //===----------------------------------------------------------------------===//
 
+def MemAsmOperand : AsmOperandClass {
+  let Name = "Mem";
+}
+
 // Address operands
 def memsrc : Operand<i16> {
   let PrintMethod = "printSrcMemOperand";
   let MIOperandInfo = (ops GR16, i16imm);
+  let ParserMatchClass = MemAsmOperand;
+  let EncoderMethod = "getMemOpValue";
+  let DecoderMethod = "DecodeMemOperand";
 }
 
 def memdst : Operand<i16> {
   let PrintMethod = "printSrcMemOperand";
   let MIOperandInfo = (ops GR16, i16imm);
+  let ParserMatchClass = MemAsmOperand;
+  let EncoderMethod = "getMemOpValue";
+  let DecoderMethod = "DecodeMemOperand";
+}
+
+def IndRegAsmOperand : AsmOperandClass {
+  let Name = "IndReg";
+  let RenderMethod = "addRegOperands";
+}
+
+def indreg : Operand<i16> {
+  let PrintMethod = "printIndRegOperand";
+  let MIOperandInfo = (ops GR16);
+  let ParserMatchClass = IndRegAsmOperand;
+  let DecoderMethod = "DecodeGR16RegisterClass";
+}
+
+def PostIndRegAsmOperand : AsmOperandClass {
+  let Name = "PostIndReg";
+  let RenderMethod = "addRegOperands";
+}
+
+def postreg : Operand<i16> {
+  let PrintMethod = "printPostIndRegOperand";
+  let MIOperandInfo = (ops GR16);
+  let ParserMatchClass = PostIndRegAsmOperand;
+  let DecoderMethod = "DecodeGR16RegisterClass";
 }
 
 // Short jump targets have OtherVT type and are printed as pcrel imm values.
 def jmptarget : Operand<OtherVT> {
   let PrintMethod = "printPCRelImmOperand";
+  let EncoderMethod = "getPCRelImmOpValue";
 }
 
 // Operand for printing out a condition code.
 def cc : Operand<i8> {
   let PrintMethod = "printCCOperand";
+  let EncoderMethod = "getCCOpValue";
+}
+
+def CGImmAsmOperand : AsmOperandClass {
+  let Name = "CGImm";
+  let RenderMethod = "addImmOperands";
+}
+
+def cg8imm : Operand<i8>,
+             ImmLeaf<i8, [{return Imm == 0 || Imm == 1 || Imm == 2 ||
+                                  Imm == 4 || Imm == 8 || Imm == -1;}]> {
+  let ParserMatchClass = CGImmAsmOperand;
+  let EncoderMethod = "getCGImmOpValue";
+  let DecoderMethod = "DecodeCGImm";
+}
+
+def cg16imm : Operand<i16>,
+              ImmLeaf<i16, [{return Imm == 0 || Imm == 1 || Imm == 2 ||
+                                    Imm == 4 || Imm == 8 || Imm == -1;}]> {
+  let ParserMatchClass = CGImmAsmOperand;
+  let EncoderMethod = "getCGImmOpValue";
+  let DecoderMethod = "DecodeCGImm";
 }
 
 //===----------------------------------------------------------------------===//
@@ -102,6 +159,7 @@ def addr : ComplexPattern<iPTR, 2, "SelectAddr", [], []>;
 // Pattern Fragments
 def zextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (zextloadi8 node:$ptr))>;
 def  extloadi16i8 : PatFrag<(ops node:$ptr), (i16 ( extloadi8 node:$ptr))>;
+def bic : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, (not node:$rhs))>;
 def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{
   return N->hasOneUse();
 }]>;
@@ -113,21 +171,21 @@ def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{
 // pointer before prolog-epilog rewriting occurs.
 // Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
 // sub / add which can clobber SR.
-let Defs = [SP, SR], Uses = [SP] in {
+let isCodeGenOnly = 1, Defs = [SP, SR], Uses = [SP] in {
 def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i16imm:$amt1, i16imm:$amt2),
-                              "#ADJCALLSTACKDOWN",
+                              "#ADJCALLSTACKDOWN $amt1 $amt2",
                               [(MSP430callseq_start timm:$amt1, timm:$amt2)]>;
 def ADJCALLSTACKUP   : Pseudo<(outs), (ins i16imm:$amt1, i16imm:$amt2),
-                              "#ADJCALLSTACKUP",
+                              "#ADJCALLSTACKUP $amt1 $amt2",
                               [(MSP430callseq_end timm:$amt1, timm:$amt2)]>;
 }
 
-let Defs = [SR], Uses = [SP] in {
+let isCodeGenOnly = 1, Defs = [SR], Uses = [SP] in {
 def ADDframe : Pseudo<(outs GR16:$dst), (ins i16imm:$base, i16imm:$offset),
                       "# ADDframe PSEUDO", []>;
 }
 
-let usesCustomInserter = 1 in {
+let isCodeGenOnly = 1, usesCustomInserter = 1 in {
   let Uses = [SR] in {
   def Select8  : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$src2, i8imm:$cc),
                         "# Select8 PSEUDO",
@@ -141,38 +199,45 @@ let usesCustomInserter = 1 in {
   let Defs = [SR] in {
   def Shl8     : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$cnt),
                         "# Shl8 PSEUDO",
-                        [(set GR8:$dst, (MSP430shl GR8:$src, GR8:$cnt))]>;
+                        [(set GR8:$dst, (shl GR8:$src, GR8:$cnt))]>;
   def Shl16    : Pseudo<(outs GR16:$dst), (ins GR16:$src, GR8:$cnt),
                         "# Shl16 PSEUDO",
-                        [(set GR16:$dst, (MSP430shl GR16:$src, GR8:$cnt))]>;
+                        [(set GR16:$dst, (shl GR16:$src, GR8:$cnt))]>;
   def Sra8     : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$cnt),
                         "# Sra8 PSEUDO",
-                        [(set GR8:$dst, (MSP430sra GR8:$src, GR8:$cnt))]>;
+                        [(set GR8:$dst, (sra GR8:$src, GR8:$cnt))]>;
   def Sra16    : Pseudo<(outs GR16:$dst), (ins GR16:$src, GR8:$cnt),
                         "# Sra16 PSEUDO",
-                        [(set GR16:$dst, (MSP430sra GR16:$src, GR8:$cnt))]>;
+                        [(set GR16:$dst, (sra GR16:$src, GR8:$cnt))]>;
   def Srl8     : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$cnt),
                         "# Srl8 PSEUDO",
-                        [(set GR8:$dst, (MSP430srl GR8:$src, GR8:$cnt))]>;
+                        [(set GR8:$dst, (srl GR8:$src, GR8:$cnt))]>;
   def Srl16    : Pseudo<(outs GR16:$dst), (ins GR16:$src, GR8:$cnt),
                         "# Srl16 PSEUDO",
-                        [(set GR16:$dst, (MSP430srl GR16:$src, GR8:$cnt))]>;
-
+                        [(set GR16:$dst, (srl GR16:$src, GR8:$cnt))]>;
+  def Rrcl8    : Pseudo<(outs GR8:$dst), (ins GR8:$src), "",
+                        [(set GR8:$dst, (MSP430rrcl GR8:$src))]>;
+  def Rrcl16   : Pseudo<(outs GR16:$dst), (ins GR16:$src), "",
+                        [(set GR16:$dst, (MSP430rrcl GR16:$src))]>;
   }
 }
 
-let hasSideEffects = 0 in
-def NOP : Pseudo<(outs), (ins), "nop", []>;
-
 //===----------------------------------------------------------------------===//
 //  Control Flow Instructions...
 //
 
 // FIXME: Provide proper encoding!
 let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
-  def RET  : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
-                     (outs), (ins), "ret",  [(MSP430retflag)]>;
-  def RETI : II16r<0x0, (outs), (ins), "reti", [(MSP430retiflag)]>;
+  def RET  : IForm16<0b0100, DstReg, SrcPostInc, 2,
+                     (outs), (ins), "ret",  [(MSP430retflag)]> {
+    let DecoderNamespace = "Delta";
+    let rs = 1;
+    let rd = 0;
+  }
+  def RETI : IIForm16<0b110, SrcReg, 2,
+                      (outs), (ins), "reti", [(MSP430retiflag)]> {
+    let rs = 0;
+  }
 }
 
 let isBranch = 1, isTerminator = 1 in {
@@ -182,64 +247,69 @@ let isBranch = 1, isTerminator = 1 in {
 // Direct branch
 let isBarrier = 1 in {
   // Short branch
-  def JMP : CJForm<0, 0, (outs), (ins jmptarget:$dst),
+  def JMP : CJForm<(outs), (ins jmptarget:$dst),
                    "jmp\t$dst",
-                   [(br bb:$dst)]>;
-  let isIndirectBranch = 1 in {
+                   [(br bb:$dst)]> {
+    let cond = 0b111;
+  }
+  let isIndirectBranch = 1, rd = 0 in {
     // Long branches
-    def Bi  : I16ri<0, (outs), (ins i16imm:$brdst),
-                    "br\t$brdst",
-                    [(brind tblockaddress:$brdst)]>;
-    def Br  : I16rr<0, (outs), (ins GR16:$brdst),
-                    "br\t$brdst",
-                    [(brind GR16:$brdst)]>;
-    def Bm  : I16rm<0, (outs), (ins memsrc:$brdst),
-                    "br\t$brdst",
-                    [(brind (load addr:$brdst))]>;
+    def Bi  : I16ri<0b0100, (outs), (ins i16imm:$imm),
+                    "br\t$imm",
+                    [(brind tblockaddress:$imm)]>;
+    def Br  : I16rr<0b0100, (outs), (ins GR16:$rs),
+                    "br\t$rs",
+                    [(brind GR16:$rs)]>;
+    def Bm  : I16rm<0b0100, (outs), (ins memsrc:$src),
+                    "br\t$src",
+                    [(brind (load addr:$src))]>;
   }
 }
 
 // Conditional branches
 let Uses = [SR] in
-  def JCC : CJForm<0, 0,
-                   (outs), (ins jmptarget:$dst, cc:$cc),
-                   "j$cc\t$dst",
-                   [(MSP430brcc bb:$dst, imm:$cc)]>;
+  def JCC : CJForm<(outs), (ins jmptarget:$dst, cc:$cond),
+                   "j$cond\t$dst",
+                   [(MSP430brcc bb:$dst, imm:$cond)]>;
 } // isBranch, isTerminator
 
 //===----------------------------------------------------------------------===//
 //  Call Instructions...
 //
-let isCall = 1 in
-  // All calls clobber the non-callee saved registers. SPW is marked as
-  // a use to prevent stack-pointer assignments that appear immediately
-  // before calls from potentially appearing dead. Uses for argument
-  // registers are added manually.
-  let Defs = [R11, R12, R13, R14, R15, SR],
-      Uses = [SP] in {
-    def CALLi     : II16i<0x0,
-                          (outs), (ins i16imm:$dst),
-                          "call\t$dst", [(MSP430call imm:$dst)]>;
-    def CALLr     : II16r<0x0,
-                          (outs), (ins GR16:$dst),
-                          "call\t$dst", [(MSP430call GR16:$dst)]>;
-    def CALLm     : II16m<0x0,
-                          (outs), (ins memsrc:$dst),
-                          "call\t${dst:mem}", [(MSP430call (load addr:$dst))]>;
-  }
-
+// All calls clobber the non-callee saved registers. SPW is marked as
+// a use to prevent stack-pointer assignments that appear immediately
+// before calls from potentially appearing dead. Uses for argument
+// registers are added manually.
+let isCall = 1,
+    Defs = [R11, R12, R13, R14, R15, SR],
+    Uses = [SP] in {
+  def CALLi     : II16i<0b101,
+                        (outs), (ins i16imm:$imm),
+                        "call\t$imm", [(MSP430call imm:$imm)]>;
+  def CALLr     : II16r<0b101,
+                        (outs), (ins GR16:$rs),
+                        "call\t$rs", [(MSP430call GR16:$rs)]>;
+  def CALLm     : II16m<0b101,
+                        (outs), (ins memsrc:$src),
+                        "call\t$src", [(MSP430call (load addr:$src))]>;
+}
 
 //===----------------------------------------------------------------------===//
 //  Miscellaneous Instructions...
 //
-let Defs = [SP], Uses = [SP], hasSideEffects=0 in {
+let Defs = [SP], Uses = [SP], hasSideEffects = 0 in {
 let mayLoad = 1 in
-def POP16r   : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
-                       (outs GR16:$reg), (ins), "pop.w\t$reg", []>;
+def POP16r   : IForm16<0b0100, DstReg, SrcPostInc, 2,
+                       (outs GR16:$rd), (ins), "pop\t$rd", []> {
+  let DecoderNamespace = "Delta";
+  let rs = 1;
+}
 
 let mayStore = 1 in
-def PUSH16r  : II16r<0x0,
-                     (outs), (ins GR16:$reg), "push.w\t$reg",[]>;
+def PUSH8r :  II8r<0b100, (outs), (ins GR8:$rs), "push.b\t$rs", []>;
+def PUSH16r : II16r<0b100, (outs), (ins GR16:$rs), "push\t$rs", []>;
+def PUSH16c : II16c<0b100, (outs), (ins cg16imm:$imm), "push\t$imm", []>;
+def PUSH16i : II16i<0b100, (outs), (ins i16imm:$imm), "push\t$imm", []>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -247,55 +317,73 @@ def PUSH16r  : II16r<0x0,
 
 // FIXME: Provide proper encoding!
 let hasSideEffects = 0 in {
-def MOV8rr  : I8rr<0x0,
-                   (outs GR8:$dst), (ins GR8:$src),
-                   "mov.b\t{$src, $dst}",
+def MOV8rr  : I8rr<0b0100,
+                   (outs GR8:$rd), (ins GR8:$rs),
+                   "mov.b\t{$rs, $rd}",
                    []>;
-def MOV16rr : I16rr<0x0,
-                    (outs GR16:$dst), (ins GR16:$src),
-                    "mov.w\t{$src, $dst}",
+def MOV16rr : I16rr<0b0100,
+                    (outs GR16:$rd), (ins GR16:$rs),
+                    "mov\t{$rs, $rd}",
                     []>;
 }
 
 // FIXME: Provide proper encoding!
 let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
-def MOV8ri  : I8ri<0x0,
-                   (outs GR8:$dst), (ins i8imm:$src),
-                   "mov.b\t{$src, $dst}",
-                   [(set GR8:$dst, imm:$src)]>;
-def MOV16ri : I16ri<0x0,
-                    (outs GR16:$dst), (ins i16imm:$src),
-                    "mov.w\t{$src, $dst}",
-                    [(set GR16:$dst, imm:$src)]>;
+def MOV8rc : I8rc<0b0100,
+                   (outs GR8:$rd), (ins cg8imm:$imm),
+                   "mov.b\t$imm, $rd",
+                   [(set GR8:$rd, cg8imm:$imm)]>;
+def MOV16rc : I16rc<0b0100,
+                    (outs GR16:$rd), (ins cg16imm:$imm),
+                    "mov\t$imm, $rd",
+                    [(set GR16:$rd, cg16imm:$imm)]>;
+def MOV8ri  : I8ri<0b0100,
+                   (outs GR8:$rd), (ins i8imm:$imm),
+                   "mov.b\t{$imm, $rd}",
+                   [(set GR8:$rd, imm:$imm)]>;
+def MOV16ri : I16ri<0b0100,
+                    (outs GR16:$rd), (ins i16imm:$imm),
+                    "mov\t{$imm, $rd}",
+                    [(set GR16:$rd, imm:$imm)]>;
 }
 
 let canFoldAsLoad = 1, isReMaterializable = 1 in {
-def MOV8rm  : I8rm<0x0,
-                   (outs GR8:$dst), (ins memsrc:$src),
-                   "mov.b\t{$src, $dst}",
-                   [(set GR8:$dst, (load addr:$src))]>;
-def MOV16rm : I16rm<0x0,
-                    (outs GR16:$dst), (ins memsrc:$src),
-                    "mov.w\t{$src, $dst}",
-                    [(set GR16:$dst, (load addr:$src))]>;
-}
-
-def MOVZX16rr8 : I8rr<0x0,
-                      (outs GR16:$dst), (ins GR8:$src),
-                      "mov.b\t{$src, $dst}",
-                      [(set GR16:$dst, (zext GR8:$src))]>;
-def MOVZX16rm8 : I8rm<0x0,
-                      (outs GR16:$dst), (ins memsrc:$src),
-                      "mov.b\t{$src, $dst}",
-                      [(set GR16:$dst, (zextloadi16i8 addr:$src))]>;
-
-let mayLoad = 1, hasExtraDefRegAllocReq = 1, Constraints = "$base = $base_wb" in {
-def MOV8rm_POST  : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
-                         (outs GR8:$dst, GR16:$base_wb), (ins GR16:$base),
-                         "mov.b\t{@$base+, $dst}", []>;
-def MOV16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
-                           (outs GR16:$dst, GR16:$base_wb), (ins GR16:$base),
-                           "mov.w\t{@$base+, $dst}", []>;
+def MOV8rm  : I8rm<0b0100,
+                   (outs GR8:$rd), (ins memsrc:$src),
+                   "mov.b\t{$src, $rd}",
+                   [(set GR8:$rd, (load addr:$src))]>;
+def MOV16rm : I16rm<0b0100,
+                    (outs GR16:$rd), (ins memsrc:$src),
+                    "mov\t{$src, $rd}",
+                    [(set GR16:$rd, (load addr:$src))]>;
+def MOV8rn  : I8rn<0b0100,
+                   (outs GR8:$rd), (ins indreg:$rs),
+                   "mov.b\t{$rs, $rd}",
+                   [(set GR8:$rd, (load addr:$rs))]>;
+def MOV16rn : I16rn<0b0100,
+                    (outs GR16:$rd), (ins indreg:$rs),
+                    "mov\t{$rs, $rd}",
+                    [(set GR16:$rd, (load addr:$rs))]>;
+}
+
+let isCodeGenOnly = 1 in {
+def MOVZX16rr8 : I8rr<0b0100,
+                      (outs GR16:$rd), (ins GR8:$rs),
+                      "mov.b\t{$rs, $rd}",
+                      [(set GR16:$rd, (zext GR8:$rs))]>;
+def MOVZX16rm8 : I8rm<0b0100,
+                      (outs GR16:$rd), (ins memsrc:$src),
+                      "mov.b\t{$src, $rd}",
+                      [(set GR16:$rd, (zextloadi16i8 addr:$src))]>;
+}
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1, Constraints = "$rs = $wb" in {
+def MOV8rp  : I8rp<0b0100,
+                   (outs GR8:$rd, GR16:$wb), (ins postreg:$rs),
+                   "mov.b\t{$rs, $rd}", []>;
+def MOV16rp : I16rp<0b0100,
+                    (outs GR16:$rd, GR16:$wb), (ins postreg:$rs),
+                    "mov\t{$rs, $rd}", []>;
 }
 
 // Any instruction that defines a 8-bit result leaves the high half of the
@@ -313,821 +401,450 @@ def def8 : PatLeaf<(i8 GR8:$src), [{
 def : Pat<(i16 (zext def8:$src)),
           (SUBREG_TO_REG (i16 0), GR8:$src, subreg_8bit)>;
 
-def MOV8mi  : I8mi<0x0,
-                   (outs), (ins memdst:$dst, i8imm:$src),
-                   "mov.b\t{$src, $dst}",
-                   [(store (i8 imm:$src), addr:$dst)]>;
-def MOV16mi : I16mi<0x0,
-                    (outs), (ins memdst:$dst, i16imm:$src),
-                    "mov.w\t{$src, $dst}",
-                    [(store (i16 imm:$src), addr:$dst)]>;
-
-def MOV8mr  : I8mr<0x0,
-                   (outs), (ins memdst:$dst, GR8:$src),
-                   "mov.b\t{$src, $dst}",
-                   [(store GR8:$src, addr:$dst)]>;
-def MOV16mr : I16mr<0x0,
-                    (outs), (ins memdst:$dst, GR16:$src),
-                    "mov.w\t{$src, $dst}",
-                    [(store GR16:$src, addr:$dst)]>;
-
-def MOV8mm  : I8mm<0x0,
+def MOV8mc  : I8mc<0b0100,
+                   (outs), (ins memdst:$dst, cg8imm:$imm),
+                   "mov.b\t{$imm, $dst}",
+                   [(store (i8 cg8imm:$imm), addr:$dst)]>;
+def MOV16mc : I16mc<0b0100,
+                    (outs), (ins memdst:$dst, cg16imm:$imm),
+                    "mov\t{$imm, $dst}",
+                    [(store (i16 cg16imm:$imm), addr:$dst)]>;
+
+def MOV8mi  : I8mi<0b0100,
+                   (outs), (ins memdst:$dst, i8imm:$imm),
+                   "mov.b\t{$imm, $dst}",
+                   [(store (i8 imm:$imm), addr:$dst)]>;
+def MOV16mi : I16mi<0b0100,
+                    (outs), (ins memdst:$dst, i16imm:$imm),
+                    "mov\t{$imm, $dst}",
+                    [(store (i16 imm:$imm), addr:$dst)]>;
+
+def MOV8mr  : I8mr<0b0100,
+                   (outs), (ins memdst:$dst, GR8:$rs),
+                   "mov.b\t{$rs, $dst}",
+                   [(store GR8:$rs, addr:$dst)]>;
+def MOV16mr : I16mr<0b0100,
+                    (outs), (ins memdst:$dst, GR16:$rs),
+                    "mov\t{$rs, $dst}",
+                    [(store GR16:$rs, addr:$dst)]>;
+
+def MOV8mm  : I8mm<0b0100,
                    (outs), (ins memdst:$dst, memsrc:$src),
                    "mov.b\t{$src, $dst}",
                    [(store (i8 (load addr:$src)), addr:$dst)]>;
-def MOV16mm : I16mm<0x0,
+def MOV16mm : I16mm<0b0100,
                     (outs), (ins memdst:$dst, memsrc:$src),
-                    "mov.w\t{$src, $dst}",
+                    "mov\t{$src, $dst}",
                     [(store (i16 (load addr:$src)), addr:$dst)]>;
 
 //===----------------------------------------------------------------------===//
 // Arithmetic Instructions
 
-let Constraints = "$src = $dst" in {
-
-let Defs = [SR] in {
-
-let isCommutable = 1 in { // X = ADD Y, Z  == X = ADD Z, Y
-
-def ADD8rr  : I8rr<0x0,
-                   (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
-                   "add.b\t{$src2, $dst}",
-                   [(set GR8:$dst, (add GR8:$src, GR8:$src2)),
-                    (implicit SR)]>;
-def ADD16rr : I16rr<0x0,
-                    (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
-                    "add.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (add GR16:$src, GR16:$src2)),
-                     (implicit SR)]>;
-}
-
-def ADD8rm  : I8rm<0x0,
-                   (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
-                   "add.b\t{$src2, $dst}",
-                   [(set GR8:$dst, (add GR8:$src, (load addr:$src2))),
-                    (implicit SR)]>;
-def ADD16rm : I16rm<0x0,
-                    (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
-                    "add.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (add GR16:$src, (load addr:$src2))),
-                     (implicit SR)]>;
-
-let mayLoad = 1, hasExtraDefRegAllocReq = 1, 
-Constraints = "$base = $base_wb, $src = $dst" in {
-def ADD8rm_POST : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
-                         (outs GR8:$dst, GR16:$base_wb),
-                         (ins GR8:$src, GR16:$base),
-                         "add.b\t{@$base+, $dst}", []>;
-def ADD16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
-                           (outs GR16:$dst, GR16:$base_wb),
-                           (ins GR16:$src, GR16:$base),
-                          "add.w\t{@$base+, $dst}", []>;
-}
-
-
-def ADD8ri  : I8ri<0x0,
-                   (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
-                   "add.b\t{$src2, $dst}",
-                   [(set GR8:$dst, (add GR8:$src, imm:$src2)),
-                    (implicit SR)]>;
-def ADD16ri : I16ri<0x0,
-                    (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
-                    "add.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (add GR16:$src, imm:$src2)),
-                     (implicit SR)]>;
-
-let Constraints = "" in {
-def ADD8mr  : I8mr<0x0,
-                   (outs), (ins memdst:$dst, GR8:$src),
-                   "add.b\t{$src, $dst}",
-                   [(store (add (load addr:$dst), GR8:$src), addr:$dst),
-                    (implicit SR)]>;
-def ADD16mr : I16mr<0x0,
-                    (outs), (ins memdst:$dst, GR16:$src),
-                    "add.w\t{$src, $dst}",
-                    [(store (add (load addr:$dst), GR16:$src), addr:$dst),
-                     (implicit SR)]>;
-
-def ADD8mi  : I8mi<0x0,
-                   (outs), (ins memdst:$dst, i8imm:$src),
-                   "add.b\t{$src, $dst}",
-                   [(store (add (load addr:$dst), (i8 imm:$src)), addr:$dst),
-                    (implicit SR)]>;
-def ADD16mi : I16mi<0x0,
-                    (outs), (ins memdst:$dst, i16imm:$src),
-                    "add.w\t{$src, $dst}",
-                    [(store (add (load addr:$dst), (i16 imm:$src)), addr:$dst),
-                     (implicit SR)]>;
-
-def ADD8mm  : I8mm<0x0,
-                   (outs), (ins memdst:$dst, memsrc:$src),
-                   "add.b\t{$src, $dst}",
-                   [(store (add (load addr:$dst), 
-                                (i8 (load addr:$src))), addr:$dst),
-                    (implicit SR)]>;
-def ADD16mm : I16mm<0x0,
-                    (outs), (ins memdst:$dst, memsrc:$src),
-                    "add.w\t{$src, $dst}",
-                    [(store (add (load addr:$dst), 
-                                  (i16 (load addr:$src))), addr:$dst),
-                     (implicit SR)]>;
-}
-
-let Uses = [SR] in {
-
-let isCommutable = 1 in { // X = ADDC Y, Z  == X = ADDC Z, Y
-def ADC8rr  : I8rr<0x0,
-                   (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
-                   "addc.b\t{$src2, $dst}",
-                   [(set GR8:$dst, (adde GR8:$src, GR8:$src2)),
-                    (implicit SR)]>;
-def ADC16rr : I16rr<0x0,
-                    (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
-                    "addc.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (adde GR16:$src, GR16:$src2)),
-                     (implicit SR)]>;
-} // isCommutable
-
-def ADC8ri  : I8ri<0x0,
-                   (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
-                   "addc.b\t{$src2, $dst}",
-                   [(set GR8:$dst, (adde GR8:$src, imm:$src2)),
-                    (implicit SR)]>;
-def ADC16ri : I16ri<0x0,
-                    (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
-                    "addc.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (adde GR16:$src, imm:$src2)),
-                     (implicit SR)]>;
-
-def ADC8rm  : I8rm<0x0,
-                   (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
-                   "addc.b\t{$src2, $dst}",
-                   [(set GR8:$dst, (adde GR8:$src, (load addr:$src2))),
-                    (implicit SR)]>;
-def ADC16rm : I16rm<0x0,
-                    (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
-                    "addc.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (adde GR16:$src, (load addr:$src2))),
-                     (implicit SR)]>;
-
-let Constraints = "" in {
-def ADC8mr  : I8mr<0x0,
-                   (outs), (ins memdst:$dst, GR8:$src),
-                   "addc.b\t{$src, $dst}",
-                   [(store (adde (load addr:$dst), GR8:$src), addr:$dst),
-                    (implicit SR)]>;
-def ADC16mr : I16mr<0x0,
-                    (outs), (ins memdst:$dst, GR16:$src),
-                    "addc.w\t{$src, $dst}",
-                    [(store (adde (load addr:$dst), GR16:$src), addr:$dst),
-                     (implicit SR)]>;
-
-def ADC8mi  : I8mi<0x0,
-                   (outs), (ins memdst:$dst, i8imm:$src),
-                   "addc.b\t{$src, $dst}",
-                   [(store (adde (load addr:$dst), (i8 imm:$src)), addr:$dst),
-                    (implicit SR)]>;
-def ADC16mi : I16mi<0x0,
-                    (outs), (ins memdst:$dst, i16imm:$src),
-                    "addc.w\t{$src, $dst}",
-                    [(store (adde (load addr:$dst), (i16 imm:$src)), addr:$dst),
-                     (implicit SR)]>;
-
-def ADC8mm  : I8mm<0x0,
-                   (outs), (ins memdst:$dst, memsrc:$src),
-                   "addc.b\t{$src, $dst}",
-                   [(store (adde (load addr:$dst), 
-                                 (i8 (load addr:$src))), addr:$dst),
-                    (implicit SR)]>;
-def ADC16mm : I8mm<0x0,
-                   (outs), (ins memdst:$dst, memsrc:$src),
-                   "addc.w\t{$src, $dst}",
-                   [(store (adde (load addr:$dst), 
-                                 (i16 (load addr:$src))), addr:$dst),
-                    (implicit SR)]>;
-}
-
-} // Uses = [SR]
-
-let isCommutable = 1 in { // X = AND Y, Z  == X = AND Z, Y
-def AND8rr  : I8rr<0x0,
-                   (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
-                   "and.b\t{$src2, $dst}",
-                   [(set GR8:$dst, (and GR8:$src, GR8:$src2)),
-                    (implicit SR)]>;
-def AND16rr : I16rr<0x0,
-                    (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
-                    "and.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (and GR16:$src, GR16:$src2)),
-                     (implicit SR)]>;
-}
-
-def AND8ri  : I8ri<0x0,
-                   (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
-                   "and.b\t{$src2, $dst}",
-                   [(set GR8:$dst, (and GR8:$src, imm:$src2)),
+multiclass Arith<bits<4> opcode, string asmstring, SDNode node,
+                 bit commutes, list<Register> uses> {
+  let Defs = [SR], Uses = uses in {
+  let Constraints = "$src2 = $rd" in {
+  let isCommutable = commutes in {
+  def 8rr : I8rr<opcode, (outs GR8:$rd), (ins GR8:$src2, GR8:$rs),
+                 !strconcat(asmstring, ".b\t$rs, $rd"),
+                 [(set GR8:$rd, (node GR8:$src2, GR8:$rs)),
+                  (implicit SR)]>;
+  def 16rr : I16rr<opcode, (outs GR16:$rd), (ins GR16:$src2, GR16:$rs),
+                   !strconcat(asmstring, "\t$rs, $rd"),
+                   [(set GR16:$rd, (node GR16:$src2, GR16:$rs)),
                     (implicit SR)]>;
-def AND16ri : I16ri<0x0,
-                    (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
-                    "and.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (and GR16:$src, imm:$src2)),
-                     (implicit SR)]>;
-
-def AND8rm  : I8rm<0x0,
-                   (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
-                   "and.b\t{$src2, $dst}",
-                   [(set GR8:$dst, (and GR8:$src, (load addr:$src2))),
-                    (implicit SR)]>;
-def AND16rm : I16rm<0x0,
-                    (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
-                    "and.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (and GR16:$src, (load addr:$src2))),
-                     (implicit SR)]>;
-
-let mayLoad = 1, hasExtraDefRegAllocReq = 1, 
-Constraints = "$base = $base_wb, $src = $dst" in {
-def AND8rm_POST : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
-                         (outs GR8:$dst, GR16:$base_wb),
-                         (ins GR8:$src, GR16:$base),
-                         "and.b\t{@$base+, $dst}", []>;
-def AND16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
-                           (outs GR16:$dst, GR16:$base_wb),
-                           (ins GR16:$src, GR16:$base),
-                           "and.w\t{@$base+, $dst}", []>;
-}
-
-let Constraints = "" in {
-def AND8mr  : I8mr<0x0,
-                   (outs), (ins memdst:$dst, GR8:$src),
-                   "and.b\t{$src, $dst}",
-                   [(store (and (load addr:$dst), GR8:$src), addr:$dst),
-                    (implicit SR)]>;
-def AND16mr : I16mr<0x0,
-                    (outs), (ins memdst:$dst, GR16:$src),
-                    "and.w\t{$src, $dst}",
-                    [(store (and (load addr:$dst), GR16:$src), addr:$dst),
-                     (implicit SR)]>;
-
-def AND8mi  : I8mi<0x0,
-                   (outs), (ins memdst:$dst, i8imm:$src),
-                   "and.b\t{$src, $dst}",
-                   [(store (and (load addr:$dst), (i8 imm:$src)), addr:$dst),
-                    (implicit SR)]>;
-def AND16mi : I16mi<0x0,
-                    (outs), (ins memdst:$dst, i16imm:$src),
-                    "and.w\t{$src, $dst}",
-                    [(store (and (load addr:$dst), (i16 imm:$src)), addr:$dst),
-                     (implicit SR)]>;
-
-def AND8mm  : I8mm<0x0,
-                   (outs), (ins memdst:$dst, memsrc:$src),
-                   "and.b\t{$src, $dst}",
-                   [(store (and (load addr:$dst), 
-                                (i8 (load addr:$src))), addr:$dst),
-                    (implicit SR)]>;
-def AND16mm : I16mm<0x0,
-                    (outs), (ins memdst:$dst, memsrc:$src),
-                    "and.w\t{$src, $dst}",
-                    [(store (and (load addr:$dst), 
+  }
+  def 8rm : I8rm<opcode, (outs GR8:$rd), (ins GR8:$src2, memsrc:$src),
+                 !strconcat(asmstring, ".b\t$src, $rd"),
+                 [(set GR8:$rd, (node GR8:$src2, (load addr:$src))),
+                  (implicit SR)]>;
+  def 16rm : I16rm<opcode, (outs GR16:$rd), (ins GR16:$src2, memsrc:$src),
+                   !strconcat(asmstring, "\t$src, $rd"),
+                   [(set GR16:$rd, (node GR16:$src2, (load addr:$src))),
+                    (implicit SR)]>;
+  def 8rn : I8rn<opcode, (outs GR8:$rd), (ins GR8:$src2, indreg:$rs),
+                 !strconcat(asmstring, ".b\t$rs, $rd"), []>;
+  def 16rn : I16rn<opcode, (outs GR16:$rd), (ins GR16:$src2, indreg:$rs),
+                   !strconcat(asmstring, "\t$rs, $rd"), []>;
+  let mayLoad = 1,
+      hasExtraDefRegAllocReq = 1,
+      Constraints = "$rs = $wb, $src2 = $rd" in {
+  def 8rp : I8rp<opcode, (outs GR8:$rd, GR16:$wb), (ins GR8:$src2, postreg:$rs),
+                 !strconcat(asmstring, ".b\t$rs, $rd"), []>;
+  def 16rp : I16rp<opcode, (outs GR16:$rd, GR16:$wb), (ins GR16:$src2, postreg:$rs),
+                   !strconcat(asmstring, "\t$rs, $rd"), []>;
+  }
+  def 8rc : I8rc<opcode, (outs GR8:$rd), (ins GR8:$src2, cg8imm:$imm),
+                 !strconcat(asmstring, ".b\t$imm, $rd"),
+                 [(set GR8:$rd, (node GR8:$src2, cg8imm:$imm)),
+                  (implicit SR)]>;
+  def 16rc : I16rc<opcode, (outs GR16:$rd), (ins GR16:$src2, cg16imm:$imm),
+                 !strconcat(asmstring, "\t$imm, $rd"),
+                 [(set GR16:$rd, (node GR16:$src2, cg16imm:$imm)),
+                  (implicit SR)]>;
+  def 8ri : I8ri<opcode, (outs GR8:$rd), (ins GR8:$src2, i8imm:$imm),
+                 !strconcat(asmstring, ".b\t$imm, $rd"),
+                 [(set GR8:$rd, (node GR8:$src2, imm:$imm)),
+                  (implicit SR)]>;
+  def 16ri : I16ri<opcode, (outs GR16:$rd), (ins GR16:$src2, i16imm:$imm),
+                 !strconcat(asmstring, "\t$imm, $rd"),
+                 [(set GR16:$rd, (node GR16:$src2, imm:$imm)),
+                  (implicit SR)]>;
+  }
+  def 8mr : I8mr<opcode, (outs), (ins memdst:$dst, GR8:$rs),
+                 !strconcat(asmstring, ".b\t$rs, $dst"),
+                 [(store (node (load addr:$dst), GR8:$rs), addr:$dst),
+                  (implicit SR)]>;
+  def 16mr : I16mr<opcode, (outs), (ins memdst:$dst, GR16:$rs),
+                   !strconcat(asmstring, "\t$rs, $dst"),
+                   [(store (node (load addr:$dst), GR16:$rs), addr:$dst),
+                    (implicit SR)]>;
+  def 8mc : I8mc<opcode, (outs), (ins memdst:$dst, cg8imm:$imm),
+                 !strconcat(asmstring, ".b\t$imm, $dst"),
+                 [(store (node (load addr:$dst), (i8 cg8imm:$imm)), addr:$dst),
+                  (implicit SR)]>;
+  def 16mc : I16mc<opcode, (outs), (ins memdst:$dst, cg16imm:$imm),
+                   !strconcat(asmstring, "\t$imm, $dst"),
+                   [(store (node (load addr:$dst), (i16 cg16imm:$imm)), addr:$dst),
+                    (implicit SR)]>;
+  def 8mi : I8mi<opcode, (outs), (ins memdst:$dst, i8imm:$imm),
+                 !strconcat(asmstring, ".b\t$imm, $dst"),
+                 [(store (node (load addr:$dst), (i8 imm:$imm)), addr:$dst),
+                  (implicit SR)]>;
+  def 16mi : I16mi<opcode, (outs), (ins memdst:$dst, i16imm:$imm),
+                   !strconcat(asmstring, "\t$imm, $dst"),
+                   [(store (node (load addr:$dst), (i16 imm:$imm)), addr:$dst),
+                    (implicit SR)]>;
+  def 8mm : I8mm<opcode, (outs), (ins memdst:$dst, memsrc:$src),
+                 !strconcat(asmstring, ".b\t$src, $dst"),
+                 [(store (node (load addr:$dst), 
+                               (i8 (load addr:$src))), addr:$dst),
+                  (implicit SR)]>;
+  def 16mm : I16mm<opcode, (outs), (ins memdst:$dst, memsrc:$src),
+                   !strconcat(asmstring, "\t$src, $dst"),
+                   [(store (node (load addr:$dst), 
                                  (i16 (load addr:$src))), addr:$dst),
-                     (implicit SR)]>;
-}
-
-let isCommutable = 1 in { // X = OR Y, Z  == X = OR Z, Y
-def OR8rr  : I8rr<0x0,
-                  (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
-                  "bis.b\t{$src2, $dst}",
-                  [(set GR8:$dst, (or GR8:$src, GR8:$src2))]>;
-def OR16rr : I16rr<0x0,
-                   (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
-                   "bis.w\t{$src2, $dst}",
-                   [(set GR16:$dst, (or GR16:$src, GR16:$src2))]>;
-}
-
-def OR8ri  : I8ri<0x0,
-                  (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
-                  "bis.b\t{$src2, $dst}",
-                  [(set GR8:$dst, (or GR8:$src, imm:$src2))]>;
-def OR16ri : I16ri<0x0,
-                   (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
-                   "bis.w\t{$src2, $dst}",
-                   [(set GR16:$dst, (or GR16:$src, imm:$src2))]>;
-
-def OR8rm  : I8rm<0x0,
-                  (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
-                  "bis.b\t{$src2, $dst}",
-                  [(set GR8:$dst, (or GR8:$src, (load addr:$src2)))]>;
-def OR16rm : I16rm<0x0,
-                   (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
-                   "bis.w\t{$src2, $dst}",
-                   [(set GR16:$dst, (or GR16:$src, (load addr:$src2)))]>;
-
-let mayLoad = 1, hasExtraDefRegAllocReq = 1, 
-Constraints = "$base = $base_wb, $src = $dst" in {
-def OR8rm_POST : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
-                        (outs GR8:$dst, GR16:$base_wb),
-                        (ins GR8:$src, GR16:$base),
-                        "bis.b\t{@$base+, $dst}", []>;
-def OR16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
-                          (outs GR16:$dst, GR16:$base_wb),
-                          (ins GR16:$src, GR16:$base),
-                          "bis.w\t{@$base+, $dst}", []>;
-}
-
-let Constraints = "" in {
-def OR8mr  : I8mr<0x0,
-                  (outs), (ins memdst:$dst, GR8:$src),
-                  "bis.b\t{$src, $dst}",
-                  [(store (or (load addr:$dst), GR8:$src), addr:$dst)]>;
-def OR16mr : I16mr<0x0,
-                   (outs), (ins memdst:$dst, GR16:$src),
-                   "bis.w\t{$src, $dst}",
-                   [(store (or (load addr:$dst), GR16:$src), addr:$dst)]>;
-
-def OR8mi  : I8mi<0x0, 
-                  (outs), (ins memdst:$dst, i8imm:$src),
-                  "bis.b\t{$src, $dst}",
-                  [(store (or (load addr:$dst), (i8 imm:$src)), addr:$dst)]>;
-def OR16mi : I16mi<0x0,
-                   (outs), (ins memdst:$dst, i16imm:$src),
-                   "bis.w\t{$src, $dst}",
-                   [(store (or (load addr:$dst), (i16 imm:$src)), addr:$dst)]>;
-
-def OR8mm  : I8mm<0x0,
-                  (outs), (ins memdst:$dst, memsrc:$src),
-                  "bis.b\t{$src, $dst}",
-                  [(store (or (i8 (load addr:$dst)),
-                              (i8 (load addr:$src))), addr:$dst)]>;
-def OR16mm : I16mm<0x0,
-                   (outs), (ins memdst:$dst, memsrc:$src),
-                   "bis.w\t{$src, $dst}",
-                   [(store (or (i16 (load addr:$dst)),
-                               (i16 (load addr:$src))), addr:$dst)]>;
-}
-
-// bic does not modify condition codes
-def BIC8rr :  I8rr<0x0,
-                   (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
-                   "bic.b\t{$src2, $dst}",
-                   [(set GR8:$dst, (and GR8:$src, (not GR8:$src2)))]>;
-def BIC16rr : I16rr<0x0,
-                    (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
-                    "bic.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (and GR16:$src, (not GR16:$src2)))]>;
-
-def BIC8rm :  I8rm<0x0,
-                   (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
-                   "bic.b\t{$src2, $dst}",
-                    [(set GR8:$dst, (and GR8:$src, (not (i8 (load addr:$src2)))))]>;
-def BIC16rm : I16rm<0x0,
-                    (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
-                    "bic.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (and GR16:$src, (not (i16 (load addr:$src2)))))]>;
-
-let Constraints = "" in {
-def BIC8mr :  I8mr<0x0,
-                   (outs), (ins memdst:$dst, GR8:$src),
-                   "bic.b\t{$src, $dst}",
-                   [(store (and (load addr:$dst), (not GR8:$src)), addr:$dst)]>;
-def BIC16mr : I16mr<0x0,
-                    (outs), (ins memdst:$dst, GR16:$src),
-                    "bic.w\t{$src, $dst}",
-                    [(store (and (load addr:$dst), (not GR16:$src)), addr:$dst)]>;
-
-def BIC8mm :  I8mm<0x0,
-                   (outs), (ins memdst:$dst, memsrc:$src),
-                   "bic.b\t{$src, $dst}",
-                   [(store (and (load addr:$dst),
-                                (not (i8 (load addr:$src)))), addr:$dst)]>;
-def BIC16mm : I16mm<0x0,
-                    (outs), (ins memdst:$dst, memsrc:$src),
-                    "bic.w\t{$src, $dst}",
-                    [(store (and (load addr:$dst),
-                                 (not (i16 (load addr:$src)))), addr:$dst)]>;
-}
-
-let isCommutable = 1 in { // X = XOR Y, Z  == X = XOR Z, Y
-def XOR8rr  : I8rr<0x0,
-                   (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
-                   "xor.b\t{$src2, $dst}",
-                   [(set GR8:$dst, (xor GR8:$src, GR8:$src2)),
-                    (implicit SR)]>;
-def XOR16rr : I16rr<0x0,
-                    (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
-                    "xor.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (xor GR16:$src, GR16:$src2)),
-                     (implicit SR)]>;
-}
-
-def XOR8ri  : I8ri<0x0,
-                   (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
-                   "xor.b\t{$src2, $dst}",
-                   [(set GR8:$dst, (xor GR8:$src, imm:$src2)),
-                    (implicit SR)]>;
-def XOR16ri : I16ri<0x0,
-                    (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
-                    "xor.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (xor GR16:$src, imm:$src2)),
-                     (implicit SR)]>;
-
-def XOR8rm  : I8rm<0x0,
-                   (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
-                   "xor.b\t{$src2, $dst}",
-                   [(set GR8:$dst, (xor GR8:$src, (load addr:$src2))),
-                    (implicit SR)]>;
-def XOR16rm : I16rm<0x0,
-                    (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
-                    "xor.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (xor GR16:$src, (load addr:$src2))),
-                     (implicit SR)]>;
-
-let mayLoad = 1, hasExtraDefRegAllocReq = 1, 
-Constraints = "$base = $base_wb, $src = $dst" in {
-def XOR8rm_POST : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
-                         (outs GR8:$dst, GR16:$base_wb),
-                         (ins GR8:$src, GR16:$base),
-                         "xor.b\t{@$base+, $dst}", []>;
-def XOR16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
-                           (outs GR16:$dst, GR16:$base_wb),
-                           (ins GR16:$src, GR16:$base),
-                           "xor.w\t{@$base+, $dst}", []>;
-}
-
-let Constraints = "" in {
-def XOR8mr  : I8mr<0x0,
-                   (outs), (ins memdst:$dst, GR8:$src),
-                   "xor.b\t{$src, $dst}",
-                   [(store (xor (load addr:$dst), GR8:$src), addr:$dst),
-                    (implicit SR)]>;
-def XOR16mr : I16mr<0x0,
-                    (outs), (ins memdst:$dst, GR16:$src),
-                    "xor.w\t{$src, $dst}",
-                    [(store (xor (load addr:$dst), GR16:$src), addr:$dst),
-                     (implicit SR)]>;
-
-def XOR8mi  : I8mi<0x0,
-                   (outs), (ins memdst:$dst, i8imm:$src),
-                   "xor.b\t{$src, $dst}",
-                   [(store (xor (load addr:$dst), (i8 imm:$src)), addr:$dst),
-                    (implicit SR)]>;
-def XOR16mi : I16mi<0x0,
-                    (outs), (ins memdst:$dst, i16imm:$src),
-                    "xor.w\t{$src, $dst}",
-                    [(store (xor (load addr:$dst), (i16 imm:$src)), addr:$dst),
-                     (implicit SR)]>;
-
-def XOR8mm  : I8mm<0x0,
-                   (outs), (ins memdst:$dst, memsrc:$src),
-                   "xor.b\t{$src, $dst}",
-                   [(store (xor (load addr:$dst), (i8 (load addr:$src))), addr:$dst),
                     (implicit SR)]>;
-def XOR16mm : I16mm<0x0,
-                    (outs), (ins memdst:$dst, memsrc:$src),
-                    "xor.w\t{$src, $dst}",
-                    [(store (xor (load addr:$dst), (i16 (load addr:$src))), addr:$dst),
-                     (implicit SR)]>;
-}
-
-
-def SUB8rr  : I8rr<0x0,
-                   (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
-                   "sub.b\t{$src2, $dst}",
-                   [(set GR8:$dst, (sub GR8:$src, GR8:$src2)),
-                    (implicit SR)]>;
-def SUB16rr : I16rr<0x0,
-                    (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
-                    "sub.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (sub GR16:$src, GR16:$src2)),
-                     (implicit SR)]>;
-
-def SUB8ri  : I8ri<0x0,
-                   (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
-                   "sub.b\t{$src2, $dst}",
-                   [(set GR8:$dst, (sub GR8:$src, imm:$src2)),
-                    (implicit SR)]>;
-def SUB16ri : I16ri<0x0,
-                    (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
-                    "sub.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (sub GR16:$src, imm:$src2)),
-                     (implicit SR)]>;
-
-def SUB8rm  : I8rm<0x0,
-                   (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
-                   "sub.b\t{$src2, $dst}",
-                   [(set GR8:$dst, (sub GR8:$src, (load addr:$src2))),
-                    (implicit SR)]>;
-def SUB16rm : I16rm<0x0,
-                    (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
-                    "sub.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (sub GR16:$src, (load addr:$src2))),
-                     (implicit SR)]>;
-
-let mayLoad = 1, hasExtraDefRegAllocReq = 1, 
-Constraints = "$base = $base_wb, $src = $dst" in {
-def SUB8rm_POST : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
-                         (outs GR8:$dst, GR16:$base_wb),
-                         (ins GR8:$src, GR16:$base),
-                         "sub.b\t{@$base+, $dst}", []>;
-def SUB16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
-                          (outs GR16:$dst, GR16:$base_wb),
-                          (ins GR16:$src, GR16:$base),
-                          "sub.w\t{@$base+, $dst}", []>;
+  def 8mn : I8mn<opcode, (outs), (ins memdst:$dst, indreg:$rs),
+                 !strconcat(asmstring, ".b\t$rs, $dst"), []>;
+  def 16mn : I16mn<opcode, (outs), (ins memdst:$dst, indreg:$rs),
+                   !strconcat(asmstring, "\t$rs, $dst"), []>;
+  def 8mp : I8mp<opcode, (outs), (ins memdst:$dst, postreg:$rs),
+                 !strconcat(asmstring, ".b\t$rs, $dst"), []>;
+  def 16mp : I16mp<opcode, (outs), (ins memdst:$dst, postreg:$rs),
+                   !strconcat(asmstring, "\t$rs, $dst"), []>;
+  }
 }
 
-let Constraints = "" in {
-def SUB8mr  : I8mr<0x0,
-                   (outs), (ins memdst:$dst, GR8:$src),
-                   "sub.b\t{$src, $dst}",
-                   [(store (sub (load addr:$dst), GR8:$src), addr:$dst),
-                    (implicit SR)]>;
-def SUB16mr : I16mr<0x0,
-                    (outs), (ins memdst:$dst, GR16:$src),
-                    "sub.w\t{$src, $dst}",
-                    [(store (sub (load addr:$dst), GR16:$src), addr:$dst),
-                     (implicit SR)]>;
+defm ADD  : Arith<0b0101, "add",  add,  1, []>;
+defm ADDC : Arith<0b0110, "addc", adde, 1, [SR]>;
+defm AND  : Arith<0b1111, "and",  and,  1, []>;
+defm BIS  : Arith<0b1101, "bis",  or,   1, []>;
+defm BIC  : Arith<0b1100, "bic",  bic,  0, []>;
+defm XOR  : Arith<0b1110, "xor",  xor,  1, []>;
+defm SUB  : Arith<0b1000, "sub",  sub,  0, []>;
+defm SUBC : Arith<0b0111, "subc", sube, 0, [SR]>;
+defm DADD : Arith<0b1010, "dadd", MSP430dadd, 1, [SR]>;
+
+def ADC8r   : InstAlias<"adc.b\t$dst",  (ADDC8rc   GR8:$dst,     0)>;
+def ADC16r  : InstAlias<"adc\t$dst",    (ADDC16rc  GR16:$dst,    0)>;
+def ADC8m   : InstAlias<"adc.b\t$dst",  (ADDC8mc   memdst:$dst,  0)>;
+def ADC16m  : InstAlias<"adc\t$dst",    (ADDC16mc  memdst:$dst,  0)>;
+
+def DADC8r  : InstAlias<"dadc.b\t$dst", (DADD8rc   GR8:$dst,     0)>;
+def DADC16r : InstAlias<"dadc\t$dst",   (DADD16rc  GR16:$dst,    0)>;
+def DADC8m  : InstAlias<"dadc.b\t$dst", (DADD8mc   memdst:$dst,  0)>;
+def DADC16m : InstAlias<"dadc\t$dst",   (DADD16mc  memdst:$dst,  0)>;
+
+def DEC8r   : InstAlias<"dec.b\t$dst",  (SUB8rc    GR8:$dst,     1)>;
+def DEC16r  : InstAlias<"dec\t$dst",    (SUB16rc   GR16:$dst,    1)>;
+def DEC8m   : InstAlias<"dec.b\t$dst",  (SUB8mc    memdst:$dst,  1)>;
+def DEC16m  : InstAlias<"dec\t$dst",    (SUB16mc   memdst:$dst,  1)>;
+
+def DECD8r  : InstAlias<"decd.b\t$dst", (SUB8rc    GR8:$dst,     2)>;
+def DECD16r : InstAlias<"decd\t$dst",   (SUB16rc   GR16:$dst,    2)>;
+def DECD8m  : InstAlias<"decd.b\t$dst", (SUB8mc    memdst:$dst,  2)>;
+def DECD16m : InstAlias<"decd\t$dst",   (SUB16mc   memdst:$dst,  2)>;
+
+def INC8r   : InstAlias<"inc.b\t$dst",  (ADD8rc    GR8:$dst,     1)>;
+def INC16r  : InstAlias<"inc\t$dst",    (ADD16rc   GR16:$dst,    1)>;
+def INC8m   : InstAlias<"inc.b\t$dst",  (ADD8mc    memdst:$dst,  1)>;
+def INC16m  : InstAlias<"inc\t$dst",    (ADD16mc   memdst:$dst,  1)>;
+
+def INCD8r  : InstAlias<"incd.b\t$dst", (ADD8rc    GR8:$dst,     2)>;
+def INCD16r : InstAlias<"incd\t$dst",   (ADD16rc   GR16:$dst,    2)>;
+def INCD8m  : InstAlias<"incd.b\t$dst", (ADD8mc    memdst:$dst,  2)>;
+def INCD16m : InstAlias<"incd\t$dst",   (ADD16mc   memdst:$dst,  2)>;
+
+def SBC8r   : InstAlias<"sbc.b\t$dst",  (SUBC8rc   GR8:$dst,     0)>;
+def SBC16r  : InstAlias<"sbc\t$dst",    (SUBC16rc  GR16:$dst,    0)>;
+def SBC8m   : InstAlias<"sbc.b\t$dst",  (SUBC8mc   memdst:$dst,  0)>;
+def SBC16m  : InstAlias<"sbc\t$dst",    (SUBC16mc  memdst:$dst,  0)>;
+
+def INV8r   : InstAlias<"inv.b\t$dst",  (XOR8rc    GR8:$dst,    -1)>;
+def INV16r  : InstAlias<"inv\t$dst",    (XOR16rc   GR16:$dst,   -1)>;
+def INV8m   : InstAlias<"inv.b\t$dst",  (XOR8mc    memdst:$dst, -1)>;
+def INV16m  : InstAlias<"inv\t$dst",    (XOR16mc   memdst:$dst, -1)>;
+
+// printAliasInstr() doesn't check $dst operands are actually equal
+// for RLA and RLC aliases below, so disable printing aliases.
+
+def RLA8r   : InstAlias<"rla.b\t$dst",  (ADD8rr    GR8:$dst,     GR8:$dst),    0>;
+def RLA16r  : InstAlias<"rla\t$dst",    (ADD16rr   GR16:$dst,    GR16:$dst),   0>;
+def RLA8m   : InstAlias<"rla.b\t$dst",  (ADD8mm    memdst:$dst,  memdst:$dst), 0>;
+def RLA16m  : InstAlias<"rla\t$dst",    (ADD16mm   memdst:$dst,  memdst:$dst), 0>;
+
+def RLC8r   : InstAlias<"rlc.b\t$dst",  (ADDC8rr   GR8:$dst,     GR8:$dst),    0>;
+def RLC16r  : InstAlias<"rlc\t$dst",    (ADDC16rr  GR16:$dst,    GR16:$dst),   0>;
+def RLC8m   : InstAlias<"rlc.b\t$dst",  (ADDC8mm   memdst:$dst,  memdst:$dst), 0>;
+def RLC16m  : InstAlias<"rlc\t$dst",    (ADDC16mm  memdst:$dst,  memdst:$dst), 0>;
+
+def DINT : InstAlias<"dint", (BIC16rc SR, 8)>;
+def EINT : InstAlias<"eint", (BIS16rc SR, 8)>;
+
+def NOP  : InstAlias<"nop",  (MOV16rc CG, 0)>;
+
+def CLR8r   : InstAlias<"clr.b\t$dst",  (MOV8rc    GR8:$dst,     0)>;
+def CLR16r  : InstAlias<"clr\t$dst",    (MOV16rc   GR16:$dst,    0)>;
+def CLR8m   : InstAlias<"clr.b\t$dst",  (MOV8mc    memdst:$dst,  0)>;
+def CLR16m  : InstAlias<"clr\t$dst",    (MOV16mc   memdst:$dst,  0)>;
+
+def CLRC : InstAlias<"clrc", (BIC16rc SR, 1)>;
+def CLRN : InstAlias<"clrn", (BIC16rc SR, 4)>;
+def CLRZ : InstAlias<"clrz", (BIC16rc SR, 2)>;
+def SETC : InstAlias<"setc", (BIS16rc SR, 1)>;
+def SETN : InstAlias<"setn", (BIS16rc SR, 4)>;
+def SETZ : InstAlias<"setz", (BIS16rc SR, 2)>;
+
+def : Pat<(MSP430rla GR8:$dst),  (ADD8rr  $dst, $dst)>;
+def : Pat<(MSP430rla GR16:$dst), (ADD16rr $dst, $dst)>;
+
+let Constraints = "$rs = $rd" in {
 
-def SUB8mi  : I8mi<0x0,
-                   (outs), (ins memdst:$dst, i8imm:$src),
-                   "sub.b\t{$src, $dst}",
-                   [(store (sub (load addr:$dst), (i8 imm:$src)), addr:$dst),
-                    (implicit SR)]>;
-def SUB16mi : I16mi<0x0,
-                    (outs), (ins memdst:$dst, i16imm:$src),
-                    "sub.w\t{$src, $dst}",
-                    [(store (sub (load addr:$dst), (i16 imm:$src)), addr:$dst),
-                     (implicit SR)]>;
+let Defs = [SR] in {
 
-def SUB8mm  : I8mm<0x0,
-                   (outs), (ins memdst:$dst, memsrc:$src),
-                   "sub.b\t{$src, $dst}",
-                   [(store (sub (load addr:$dst), 
-                                (i8 (load addr:$src))), addr:$dst),
+// FIXME: memory variant!
+def RRA8r :   II8r<0b010,
+                   (outs GR8:$rd), (ins GR8:$rs),
+                   "rra.b\t$rd",
+                   [(set GR8:$rd, (MSP430rra GR8:$rs)),
                     (implicit SR)]>;
-def SUB16mm : I16mm<0x0,
-                    (outs), (ins memdst:$dst, memsrc:$src),
-                    "sub.w\t{$src, $dst}",
-                    [(store (sub (load addr:$dst), 
-                                 (i16 (load addr:$src))), addr:$dst),
+def RRA16r : II16r<0b010,
+                    (outs GR16:$rd), (ins GR16:$rs),
+                    "rra\t$rd",
+                    [(set GR16:$rd, (MSP430rra GR16:$rs)),
                      (implicit SR)]>;
-}
 
 let Uses = [SR] in {
-def SBC8rr  : I8rr<0x0,
-                   (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
-                   "subc.b\t{$src2, $dst}",
-                   [(set GR8:$dst, (sube GR8:$src, GR8:$src2)),
-                    (implicit SR)]>;
-def SBC16rr : I16rr<0x0,
-                    (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
-                    "subc.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (sube GR16:$src, GR16:$src2)),
-                     (implicit SR)]>;
-
-def SBC8ri  : I8ri<0x0,
-                   (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
-                   "subc.b\t{$src2, $dst}",
-                   [(set GR8:$dst, (sube GR8:$src, imm:$src2)),
+def RRC8r :   II8r<0b000,
+                   (outs GR8:$rd), (ins GR8:$rs),
+                   "rrc.b\t$rd",
+                   [(set GR8:$rd, (MSP430rrc GR8:$rs)),
                     (implicit SR)]>;
-def SBC16ri : I16ri<0x0,
-                    (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
-                    "subc.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (sube GR16:$src, imm:$src2)),
-                     (implicit SR)]>;
-
-def SBC8rm  : I8rm<0x0,
-                   (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
-                   "subc.b\t{$src2, $dst}",
-                   [(set GR8:$dst, (sube GR8:$src, (load addr:$src2))),
+def RRC16r : II16r<0b000,
+                   (outs GR16:$rd), (ins GR16:$rs),
+                   "rrc\t$rd",
+                   [(set GR16:$rd, (MSP430rrc GR16:$rs)),
                     (implicit SR)]>;
-def SBC16rm : I16rm<0x0,
-                    (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
-                    "subc.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (sube GR16:$src, (load addr:$src2))),
-                     (implicit SR)]>;
-
-let Constraints = "" in {
-def SBC8mr  : I8mr<0x0,
-                   (outs), (ins memdst:$dst, GR8:$src),
-                   "subc.b\t{$src, $dst}",
-                  [(store (sube (load addr:$dst), GR8:$src), addr:$dst),
-                   (implicit SR)]>;
-def SBC16mr : I16mr<0x0,
-                    (outs), (ins memdst:$dst, GR16:$src),
-                    "subc.w\t{$src, $dst}",
-                    [(store (sube (load addr:$dst), GR16:$src), addr:$dst),
-                     (implicit SR)]>;
-
-def SBC8mi  : I8mi<0x0,
-                   (outs), (ins memdst:$dst, i8imm:$src),
-                   "subc.b\t{$src, $dst}",
-                   [(store (sube (load addr:$dst), (i8 imm:$src)), addr:$dst),
-                    (implicit SR)]>;
-def SBC16mi : I16mi<0x0,
-                    (outs), (ins memdst:$dst, i16imm:$src),
-                    "subc.w\t{$src, $dst}",
-                    [(store (sube (load addr:$dst), (i16 imm:$src)), addr:$dst),
-                     (implicit SR)]>;
-
-def SBC8mm  : I8mm<0x0,
-                   (outs), (ins memdst:$dst, memsrc:$src),
-                   "subc.b\t{$src, $dst}",
-                   [(store (sube (load addr:$dst),
-                                 (i8 (load addr:$src))), addr:$dst),
-                    (implicit SR)]>;
-def SBC16mm : I16mm<0x0,
-                    (outs), (ins memdst:$dst, memsrc:$src),
-                    "subc.w\t{$src, $dst}",
-                    [(store (sube (load addr:$dst),
-                            (i16 (load addr:$src))), addr:$dst),
-                     (implicit SR)]>;
 }
 
-} // Uses = [SR]
-
-// FIXME: memory variant!
-def SAR8r1  : II8r<0x0,
-                   (outs GR8:$dst), (ins GR8:$src),
-                   "rra.b\t$dst",
-                   [(set GR8:$dst, (MSP430rra GR8:$src)),
-                    (implicit SR)]>;
-def SAR16r1 : II16r<0x0,
-                    (outs GR16:$dst), (ins GR16:$src),
-                    "rra.w\t$dst",
-                    [(set GR16:$dst, (MSP430rra GR16:$src)),
-                     (implicit SR)]>;
-
-def SHL8r1  : I8rr<0x0,
-                   (outs GR8:$dst), (ins GR8:$src),
-                   "rla.b\t$dst",
-                   [(set GR8:$dst, (MSP430rla GR8:$src)),
-                    (implicit SR)]>;
-def SHL16r1 : I16rr<0x0,
-                    (outs GR16:$dst), (ins GR16:$src),
-                    "rla.w\t$dst",
-                    [(set GR16:$dst, (MSP430rla GR16:$src)),
-                     (implicit SR)]>;
-
-def SAR8r1c  : Pseudo<(outs GR8:$dst), (ins GR8:$src),
-                      "clrc\n\t"
-                      "rrc.b\t$dst",
-                      [(set GR8:$dst, (MSP430rrc GR8:$src)),
-                       (implicit SR)]>;
-def SAR16r1c : Pseudo<(outs GR16:$dst), (ins GR16:$src),
-                      "clrc\n\t"
-                      "rrc.w\t$dst",
-                      [(set GR16:$dst, (MSP430rrc GR16:$src)),
-                       (implicit SR)]>;
-
 // FIXME: Memory sext's ?
-def SEXT16r : II16r<0x0,
-                    (outs GR16:$dst), (ins GR16:$src),
-                    "sxt\t$dst",
-                    [(set GR16:$dst, (sext_inreg GR16:$src, i8)),
+def SEXT16r : II16r<0b011,
+                    (outs GR16:$rd), (ins GR16:$rs),
+                    "sxt\t$rd",
+                    [(set GR16:$rd, (sext_inreg GR16:$rs, i8)),
                      (implicit SR)]>;
 
 } // Defs = [SR]
 
-def ZEXT16r : I8rr<0x0,
-                   (outs GR16:$dst), (ins GR16:$src),
-                   "mov.b\t{$src, $dst}",
-                   [(set GR16:$dst, (zext (trunc GR16:$src)))]>;
+let isCodeGenOnly = 1 in
+def ZEXT16r : I8rr<0b0100,
+                   (outs GR16:$rd), (ins GR16:$rs),
+                   "mov.b\t{$rs, $rd}",
+                   [(set GR16:$rd, (zext (trunc GR16:$rs)))]>;
 
 // FIXME: Memory bitswaps?
-def SWPB16r : II16r<0x0,
-                    (outs GR16:$dst), (ins GR16:$src),
-                    "swpb\t$dst",
-                    [(set GR16:$dst, (bswap GR16:$src))]>;
+def SWPB16r : II16r<0b001,
+                    (outs GR16:$rd), (ins GR16:$rs),
+                    "swpb\t$rd",
+                    [(set GR16:$rd, (bswap GR16:$rs))]>;
 
 } // Constraints = "$src = $dst"
 
 // Integer comparisons
 let Defs = [SR] in {
-def CMP8rr  : I8rr<0x0,
-                   (outs), (ins GR8:$src, GR8:$src2),
-                   "cmp.b\t{$src2, $src}",
-                   [(MSP430cmp GR8:$src, GR8:$src2), (implicit SR)]>;
-def CMP16rr : I16rr<0x0,
-                    (outs), (ins GR16:$src, GR16:$src2),
-                    "cmp.w\t{$src2, $src}",
-                    [(MSP430cmp GR16:$src, GR16:$src2), (implicit SR)]>;
-
-def CMP8ri  : I8ri<0x0,
-                   (outs), (ins GR8:$src, i8imm:$src2),
-                   "cmp.b\t{$src2, $src}",
-                   [(MSP430cmp GR8:$src, imm:$src2), (implicit SR)]>;
-def CMP16ri : I16ri<0x0,
-                    (outs), (ins GR16:$src, i16imm:$src2),
-                    "cmp.w\t{$src2, $src}",
-                    [(MSP430cmp GR16:$src, imm:$src2), (implicit SR)]>;
-
-def CMP8mi  : I8mi<0x0,
-                   (outs), (ins memsrc:$src, i8imm:$src2),
-                   "cmp.b\t{$src2, $src}",
-                   [(MSP430cmp (load addr:$src),
-                               (i8 imm:$src2)), (implicit SR)]>;
-def CMP16mi : I16mi<0x0,
-                    (outs), (ins memsrc:$src, i16imm:$src2),
-                    "cmp.w\t{$src2, $src}",
-                     [(MSP430cmp (load addr:$src),
-                                 (i16 imm:$src2)), (implicit SR)]>;
-
-def CMP8rm  : I8rm<0x0,
-                   (outs), (ins GR8:$src, memsrc:$src2),
-                   "cmp.b\t{$src2, $src}",
-                   [(MSP430cmp GR8:$src, (load addr:$src2)), 
-                    (implicit SR)]>;
-def CMP16rm : I16rm<0x0,
-                    (outs), (ins GR16:$src, memsrc:$src2),
-                    "cmp.w\t{$src2, $src}",
-                    [(MSP430cmp GR16:$src, (load addr:$src2)),
-                     (implicit SR)]>;
-
-def CMP8mr  : I8mr<0x0,
-                   (outs), (ins memsrc:$src, GR8:$src2),
-                   "cmp.b\t{$src2, $src}",
-                   [(MSP430cmp (load addr:$src), GR8:$src2),
-                    (implicit SR)]>;
-def CMP16mr : I16mr<0x0,
-                    (outs), (ins memsrc:$src, GR16:$src2),
-                    "cmp.w\t{$src2, $src}",
-                    [(MSP430cmp (load addr:$src), GR16:$src2), 
+def CMP8rr  : I8rr<0b1001,
+                   (outs), (ins GR8:$rd, GR8:$rs),
+                   "cmp.b\t$rs, $rd",
+                   [(MSP430cmp GR8:$rd, GR8:$rs), (implicit SR)]>;
+def CMP16rr : I16rr<0b1001,
+                    (outs), (ins GR16:$rd, GR16:$rs),
+                    "cmp\t$rs, $rd",
+                    [(MSP430cmp GR16:$rd, GR16:$rs), (implicit SR)]>;
+
+def CMP8rc  : I8rc<0b1001,
+                   (outs), (ins GR8:$rd, cg8imm:$imm),
+                   "cmp.b\t$imm, $rd",
+                   [(MSP430cmp GR8:$rd, cg8imm:$imm), (implicit SR)]>;
+def CMP16rc : I16rc<0b1001,
+                    (outs), (ins GR16:$rd, cg16imm:$imm),
+                    "cmp\t$imm, $rd",
+                    [(MSP430cmp GR16:$rd, cg16imm:$imm), (implicit SR)]>;
+
+def CMP8ri  : I8ri<0b1001,
+                   (outs), (ins GR8:$rd, i8imm:$imm),
+                   "cmp.b\t$imm, $rd",
+                   [(MSP430cmp GR8:$rd, imm:$imm), (implicit SR)]>;
+def CMP16ri : I16ri<0b1001,
+                    (outs), (ins GR16:$rd, i16imm:$imm),
+                    "cmp\t$imm, $rd",
+                    [(MSP430cmp GR16:$rd, imm:$imm), (implicit SR)]>;
+
+def CMP8mc  : I8mc<0b1001,
+                   (outs), (ins memsrc:$dst, cg8imm:$imm),
+                   "cmp.b\t$imm, $dst",
+                   [(MSP430cmp (load addr:$dst), (i8 cg8imm:$imm)),
+                    (implicit SR)]>;
+def CMP16mc : I16mc<0b1001,
+                    (outs), (ins memsrc:$dst, cg16imm:$imm),
+                    "cmp\t$imm, $dst",
+                    [(MSP430cmp (load addr:$dst), (i16 cg16imm:$imm)),
+                     (implicit SR)]>;
+
+def CMP8mi  : I8mi<0b1001,
+                   (outs), (ins memsrc:$dst, i8imm:$imm),
+                   "cmp.b\t$imm, $dst",
+                   [(MSP430cmp (load addr:$dst),
+                               (i8 imm:$imm)), (implicit SR)]>;
+def CMP16mi : I16mi<0b1001,
+                    (outs), (ins memsrc:$dst, i16imm:$imm),
+                    "cmp\t$imm, $dst",
+                     [(MSP430cmp (load addr:$dst),
+                                 (i16 imm:$imm)), (implicit SR)]>;
+
+def CMP8rm  : I8rm<0b1001,
+                   (outs), (ins GR8:$rd, memsrc:$src),
+                   "cmp.b\t$src, $rd",
+                   [(MSP430cmp GR8:$rd, (load addr:$src)), 
+                    (implicit SR)]>;
+def CMP16rm : I16rm<0b1001,
+                    (outs), (ins GR16:$rd, memsrc:$src),
+                    "cmp\t$src, $rd",
+                    [(MSP430cmp GR16:$rd, (load addr:$src)),
+                     (implicit SR)]>;
+
+def CMP8mr  : I8mr<0b1001,
+                   (outs), (ins memsrc:$dst, GR8:$rs),
+                   "cmp.b\t$rs, $dst",
+                   [(MSP430cmp (load addr:$dst), GR8:$rs),
+                    (implicit SR)]>;
+def CMP16mr : I16mr<0b1001,
+                    (outs), (ins memsrc:$dst, GR16:$rs),
+                    "cmp\t$rs, $dst",
+                    [(MSP430cmp (load addr:$dst), GR16:$rs), 
                      (implicit SR)]>;
 
-
 // BIT TESTS, just sets condition codes
 // Note that the C condition is set differently than when using CMP.
 let isCommutable = 1 in {
-def BIT8rr  : I8rr<0x0,
-                   (outs), (ins GR8:$src, GR8:$src2),
-                   "bit.b\t{$src2, $src}",
-                   [(MSP430cmp (and_su GR8:$src, GR8:$src2), 0),
+def BIT8rr  : I8rr<0b1011,
+                   (outs), (ins GR8:$rd, GR8:$rs),
+                   "bit.b\t$rs, $rd",
+                   [(MSP430cmp (and_su GR8:$rd, GR8:$rs), 0),
                     (implicit SR)]>;
-def BIT16rr : I16rr<0x0,
-                    (outs), (ins GR16:$src, GR16:$src2),
-                    "bit.w\t{$src2, $src}",
-                    [(MSP430cmp (and_su GR16:$src, GR16:$src2), 0),
+def BIT16rr : I16rr<0b1011,
+                    (outs), (ins GR16:$rd, GR16:$rs),
+                    "bit\t$rs, $rd",
+                    [(MSP430cmp (and_su GR16:$rd, GR16:$rs), 0),
                      (implicit SR)]>;
 }
-def BIT8ri  : I8ri<0x0,
-                   (outs), (ins GR8:$src, i8imm:$src2),
-                   "bit.b\t{$src2, $src}",
-                   [(MSP430cmp (and_su GR8:$src, imm:$src2), 0),
+def BIT8rc  : I8rc<0b1011,
+                   (outs), (ins GR8:$rd, cg8imm:$imm),
+                   "bit.b\t$imm, $rd",
+                   [(MSP430cmp (and_su GR8:$rd, cg8imm:$imm), 0),
+                    (implicit SR)]>;
+def BIT16rc : I16rc<0b1011,
+                    (outs), (ins GR16:$rd, cg16imm:$imm),
+                    "bit\t$imm, $rd",
+                    [(MSP430cmp (and_su GR16:$rd, cg16imm:$imm), 0),
+                     (implicit SR)]>;
+
+def BIT8ri  : I8ri<0b1011,
+                   (outs), (ins GR8:$rd, i8imm:$imm),
+                   "bit.b\t$imm, $rd",
+                   [(MSP430cmp (and_su GR8:$rd, imm:$imm), 0),
                     (implicit SR)]>;
-def BIT16ri : I16ri<0x0,
-                    (outs), (ins GR16:$src, i16imm:$src2),
-                    "bit.w\t{$src2, $src}",
-                    [(MSP430cmp (and_su GR16:$src, imm:$src2), 0),
+def BIT16ri : I16ri<0b1011,
+                    (outs), (ins GR16:$rd, i16imm:$imm),
+                    "bit\t$imm, $rd",
+                    [(MSP430cmp (and_su GR16:$rd, imm:$imm), 0),
                      (implicit SR)]>;
 
-def BIT8rm  : I8rm<0x0,
-                   (outs), (ins GR8:$src, memdst:$src2),
-                   "bit.b\t{$src2, $src}",
-                   [(MSP430cmp (and_su GR8:$src,  (load addr:$src2)), 0),
+def BIT8rm  : I8rm<0b1011,
+                   (outs), (ins GR8:$rd, memdst:$src),
+                   "bit.b\t$src, $rd",
+                   [(MSP430cmp (and_su GR8:$rd,  (load addr:$src)), 0),
                     (implicit SR)]>;
-def BIT16rm : I16rm<0x0,
-                    (outs), (ins GR16:$src, memdst:$src2),
-                    "bit.w\t{$src2, $src}",
-                    [(MSP430cmp (and_su GR16:$src,  (load addr:$src2)), 0),
+def BIT16rm : I16rm<0b1011,
+                    (outs), (ins GR16:$rd, memdst:$src),
+                    "bit\t$src, $rd",
+                    [(MSP430cmp (and_su GR16:$rd,  (load addr:$src)), 0),
                      (implicit SR)]>;
 
-def BIT8mr  : I8mr<0x0,
-                  (outs), (ins memsrc:$src, GR8:$src2),
-                  "bit.b\t{$src2, $src}",
-                  [(MSP430cmp (and_su (load addr:$src), GR8:$src2), 0),
+def BIT8mr  : I8mr<0b1011,
+                  (outs), (ins memsrc:$dst, GR8:$rs),
+                  "bit.b\t$rs, $dst",
+                  [(MSP430cmp (and_su (load addr:$dst), GR8:$rs), 0),
                    (implicit SR)]>;
-def BIT16mr : I16mr<0x0,
-                    (outs), (ins memsrc:$src, GR16:$src2),
-                    "bit.w\t{$src2, $src}",
-                    [(MSP430cmp (and_su (load addr:$src), GR16:$src2), 0),
+def BIT16mr : I16mr<0b1011,
+                    (outs), (ins memsrc:$dst, GR16:$rs),
+                    "bit\t$rs, $dst",
+                    [(MSP430cmp (and_su (load addr:$dst), GR16:$rs), 0),
+                     (implicit SR)]>;
+
+def BIT8mc  : I8mc<0b1011,
+                   (outs), (ins memsrc:$dst, cg8imm:$imm),
+                   "bit.b\t$imm, $dst",
+                   [(MSP430cmp (and_su (load addr:$dst), (i8 cg8imm:$imm)), 0),
+                    (implicit SR)]>;
+def BIT16mc : I16mc<0b1011,
+                    (outs), (ins memsrc:$dst, i16imm:$imm),
+                    "bit\t$imm, $dst",
+                    [(MSP430cmp (and_su (load addr:$dst), (i16 cg16imm:$imm)), 0),
                      (implicit SR)]>;
 
-def BIT8mi  : I8mi<0x0,
-                   (outs), (ins memsrc:$src, i8imm:$src2),
-                   "bit.b\t{$src2, $src}",
-                   [(MSP430cmp (and_su (load addr:$src), (i8 imm:$src2)), 0),
+def BIT8mi  : I8mi<0b1011,
+                   (outs), (ins memsrc:$dst, i8imm:$imm),
+                   "bit.b\t$imm, $dst",
+                   [(MSP430cmp (and_su (load addr:$dst), (i8 imm:$imm)), 0),
                     (implicit SR)]>;
-def BIT16mi : I16mi<0x0,
-                    (outs), (ins memsrc:$src, i16imm:$src2),
-                    "bit.w\t{$src2, $src}",
-                    [(MSP430cmp (and_su (load addr:$src), (i16 imm:$src2)), 0),
+def BIT16mi : I16mi<0b1011,
+                    (outs), (ins memsrc:$dst, i16imm:$imm),
+                    "bit\t$imm, $dst",
+                    [(MSP430cmp (and_su (load addr:$dst), (i16 imm:$imm)), 0),
                      (implicit SR)]>;
 
-def BIT8mm  : I8mm<0x0,
-                   (outs), (ins memsrc:$src, memsrc:$src2),
-                   "bit.b\t{$src2, $src}",
-                   [(MSP430cmp (and_su (i8 (load addr:$src)),
-                                       (load addr:$src2)),
+def BIT8mm  : I8mm<0b1011,
+                   (outs), (ins memsrc:$dst, memsrc:$src),
+                   "bit.b\t$src, $dst",
+                   [(MSP430cmp (and_su (i8 (load addr:$dst)),
+                                       (load addr:$src)),
                                  0),
                       (implicit SR)]>;
-def BIT16mm : I16mm<0x0,
-                    (outs), (ins memsrc:$src, memsrc:$src2),
-                    "bit.w\t{$src2, $src}",
-                    [(MSP430cmp (and_su (i16 (load addr:$src)),
-                                        (load addr:$src2)),
+def BIT16mm : I16mm<0b1011,
+                    (outs), (ins memsrc:$dst, memsrc:$src),
+                    "bit\t$src, $dst",
+                    [(MSP430cmp (and_su (i16 (load addr:$dst)),
+                                        (load addr:$src)),
                                  0),
                      (implicit SR)]>;
 } // Defs = [SR]
 
+def TST8r   : InstAlias<"tst.b\t$dst",  (CMP8rc    GR8:$dst,     0)>;
+def TST16r  : InstAlias<"tst\t$dst",    (CMP16rc   GR16:$dst,    0)>;
+def TST8m   : InstAlias<"tst.b\t$dst",  (CMP8mc    memdst:$dst,  0)>;
+def TST16m  : InstAlias<"tst\t$dst",    (CMP16mc   memdst:$dst,  0)>;
+
 //===----------------------------------------------------------------------===//
 // Non-Instruction Patterns
 
diff --git a/lib/Target/MSP430/MSP430MCInstLower.cpp b/lib/Target/MSP430/MSP430MCInstLower.cpp
index e7716382b22..860c0006f78 100644
--- a/lib/Target/MSP430/MSP430MCInstLower.cpp
+++ b/lib/Target/MSP430/MSP430MCInstLower.cpp
@@ -110,6 +110,9 @@ LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const {
   return MCOperand::createExpr(Expr);
 }
 
+#define GET_REGINFO_ENUM
+#include "MSP430GenRegisterInfo.inc"
+
 void MSP430MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
   OutMI.setOpcode(MI->getOpcode());
 
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.td b/lib/Target/MSP430/MSP430RegisterInfo.td
index b5a6ed0f0a5..1e86bdf34a0 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.td
+++ b/lib/Target/MSP430/MSP430RegisterInfo.td
@@ -11,26 +11,31 @@
 //  Declarations that describe the MSP430 register file
 //===----------------------------------------------------------------------===//
 
-class MSP430Reg<bits<4> num, string n> : Register<n> {
+class MSP430Reg<bits<4> num, string n, list<string> alt = []> : Register<n> {
   field bits<4> Num = num;
   let Namespace = "MSP430";
+  let HWEncoding{3-0} = num;
+  let AltNames = alt;
 }
 
-class MSP430RegWithSubregs<bits<4> num, string n, list<Register> subregs> 
+class MSP430RegWithSubregs<bits<4> num, string n, list<Register> subregs,
+                           list<string> alt = []> 
   : RegisterWithSubRegs<n, subregs> {
   field bits<4> Num = num;
   let Namespace = "MSP430";
+  let HWEncoding{3-0} = num;
+  let AltNames = alt;
 }
 
 //===----------------------------------------------------------------------===//
 //  Registers
 //===----------------------------------------------------------------------===//
 
-def PCB  : MSP430Reg<0,  "r0">;
-def SPB  : MSP430Reg<1,  "r1">;
-def SRB  : MSP430Reg<2,  "r2">;
-def CGB  : MSP430Reg<3,  "r3">;
-def FPB  : MSP430Reg<4,  "r4">;
+def PCB  : MSP430Reg<0,  "r0", ["pc"]>;
+def SPB  : MSP430Reg<1,  "r1", ["sp"]>;
+def SRB  : MSP430Reg<2,  "r2", ["sr"]>;
+def CGB  : MSP430Reg<3,  "r3", ["cg"]>;
+def FPB  : MSP430Reg<4,  "r4", ["fp"]>;
 def R5B  : MSP430Reg<5,  "r5">;
 def R6B  : MSP430Reg<6,  "r6">;
 def R7B  : MSP430Reg<7,  "r7">;
@@ -46,11 +51,11 @@ def R15B : MSP430Reg<15, "r15">;
 def subreg_8bit : SubRegIndex<8> { let Namespace = "MSP430"; }
 
 let SubRegIndices = [subreg_8bit] in {
-def PC  : MSP430RegWithSubregs<0,  "r0",  [PCB]>;
-def SP  : MSP430RegWithSubregs<1,  "r1",  [SPB]>;
-def SR  : MSP430RegWithSubregs<2,  "r2",  [SRB]>;
-def CG  : MSP430RegWithSubregs<3,  "r3",  [CGB]>;
-def FP  : MSP430RegWithSubregs<4,  "r4",  [FPB]>;
+def PC  : MSP430RegWithSubregs<0,  "r0",  [PCB], ["pc"]>;
+def SP  : MSP430RegWithSubregs<1,  "r1",  [SPB], ["sp"]>;
+def SR  : MSP430RegWithSubregs<2,  "r2",  [SRB], ["sr"]>;
+def CG  : MSP430RegWithSubregs<3,  "r3",  [CGB], ["cg"]>;
+def FP  : MSP430RegWithSubregs<4,  "r4",  [FPB], ["fp"]>;
 def R5  : MSP430RegWithSubregs<5,  "r5",  [R5B]>;
 def R6  : MSP430RegWithSubregs<6,  "r6",  [R6B]>;
 def R7  : MSP430RegWithSubregs<7,  "r7",  [R7B]>;
diff --git a/test/CodeGen/MSP430/AddrMode-bis-rx.ll b/test/CodeGen/MSP430/AddrMode-bis-rx.ll
index f4cb30f2d01..948b67eb66c 100644
--- a/test/CodeGen/MSP430/AddrMode-bis-rx.ll
+++ b/test/CodeGen/MSP430/AddrMode-bis-rx.ll
@@ -8,7 +8,7 @@ define i16 @am1(i16 %x, i16* %a) nounwind {
 	ret i16 %2
 }
 ; CHECK-LABEL: am1:
-; CHECK:		bis.w	0(r13), r12
+; CHECK:		bis	0(r13), r12
 
 @foo = external global i16
 
@@ -18,7 +18,7 @@ define i16 @am2(i16 %x) nounwind {
 	ret i16 %2
 }
 ; CHECK-LABEL: am2:
-; CHECK:		bis.w	&foo, r12
+; CHECK:		bis	&foo, r12
 
 @bar = internal constant [2 x i8] [ i8 32, i8 64 ]
 
@@ -37,7 +37,7 @@ define i16 @am4(i16 %x) nounwind {
 	ret i16 %2
 }
 ; CHECK-LABEL: am4:
-; CHECK:		bis.w	&32, r12
+; CHECK:		bis	&32, r12
 
 define i16 @am5(i16 %x, i16* %a) nounwind {
 	%1 = getelementptr i16, i16* %a, i16 2
@@ -46,7 +46,7 @@ define i16 @am5(i16 %x, i16* %a) nounwind {
 	ret i16 %3
 }
 ; CHECK-LABEL: am5:
-; CHECK:		bis.w	4(r13), r12
+; CHECK:		bis	4(r13), r12
 
 %S = type { i16, i16 }
 @baz = common global %S zeroinitializer, align 1
@@ -57,7 +57,7 @@ define i16 @am6(i16 %x) nounwind {
 	ret i16 %2
 }
 ; CHECK-LABEL: am6:
-; CHECK:		bis.w	&baz+2, r12
+; CHECK:		bis	&baz+2, r12
 
 %T = type { i16, [2 x i8] }
 @duh = internal constant %T { i16 16, [2 x i8][i8 32, i8 64 ] }
diff --git a/test/CodeGen/MSP430/AddrMode-bis-xr.ll b/test/CodeGen/MSP430/AddrMode-bis-xr.ll
index 1e150f38206..6d3a497386d 100644
--- a/test/CodeGen/MSP430/AddrMode-bis-xr.ll
+++ b/test/CodeGen/MSP430/AddrMode-bis-xr.ll
@@ -9,7 +9,7 @@ define void @am1(i16* %a, i16 %x) nounwind {
 	ret void
 }
 ; CHECK-LABEL: am1:
-; CHECK:		bis.w	r13, 0(r12)
+; CHECK:		bis	r13, 0(r12)
 
 @foo = external global i16
 
@@ -20,7 +20,7 @@ define void @am2(i16 %x) nounwind {
 	ret void
 }
 ; CHECK-LABEL: am2:
-; CHECK:		bis.w	r12, &foo
+; CHECK:		bis	r12, &foo
 
 @bar = external global [2 x i8]
 
@@ -41,7 +41,7 @@ define void @am4(i16 %x) nounwind {
 	ret void
 }
 ; CHECK-LABEL: am4:
-; CHECK:		bis.w	r12, &32
+; CHECK:		bis	r12, &32
 
 define void @am5(i16* %a, i16 %x) readonly {
 	%1 = getelementptr inbounds i16, i16* %a, i16 2
@@ -51,7 +51,7 @@ define void @am5(i16* %a, i16 %x) readonly {
 	ret void
 }
 ; CHECK-LABEL: am5:
-; CHECK:		bis.w	r13, 4(r12)
+; CHECK:		bis	r13, 4(r12)
 
 %S = type { i16, i16 }
 @baz = common global %S zeroinitializer
@@ -63,7 +63,7 @@ define void @am6(i16 %x) nounwind {
 	ret void
 }
 ; CHECK-LABEL: am6:
-; CHECK:		bis.w	r12, &baz+2
+; CHECK:		bis	r12, &baz+2
 
 %T = type { i16, [2 x i8] }
 @duh = external global %T
diff --git a/test/CodeGen/MSP430/AddrMode-mov-rx.ll b/test/CodeGen/MSP430/AddrMode-mov-rx.ll
index 808aca0ea10..0605e8e86ce 100644
--- a/test/CodeGen/MSP430/AddrMode-mov-rx.ll
+++ b/test/CodeGen/MSP430/AddrMode-mov-rx.ll
@@ -7,7 +7,7 @@ define i16 @am1(i16* %a) nounwind {
 	ret i16 %1
 }
 ; CHECK-LABEL: am1:
-; CHECK:		mov.w	0(r12), r12
+; CHECK:		mov	0(r12), r12
 
 @foo = external global i16
 
@@ -16,7 +16,7 @@ define i16 @am2() nounwind {
 	ret i16 %1
 }
 ; CHECK-LABEL: am2:
-; CHECK:		mov.w	&foo, r12
+; CHECK:		mov	&foo, r12
 
 @bar = internal constant [2 x i8] [ i8 32, i8 64 ]
 
@@ -33,7 +33,7 @@ define i16 @am4() nounwind {
 	ret i16 %1
 }
 ; CHECK-LABEL: am4:
-; CHECK:		mov.w	&32, r12
+; CHECK:		mov	&32, r12
 
 define i16 @am5(i16* %a) nounwind {
 	%1 = getelementptr i16, i16* %a, i16 2
@@ -41,7 +41,7 @@ define i16 @am5(i16* %a) nounwind {
 	ret i16 %2
 }
 ; CHECK-LABEL: am5:
-; CHECK:		mov.w	4(r12), r12
+; CHECK:		mov	4(r12), r12
 
 %S = type { i16, i16 }
 @baz = common global %S zeroinitializer, align 1
@@ -51,7 +51,7 @@ define i16 @am6() nounwind {
 	ret i16 %1
 }
 ; CHECK-LABEL: am6:
-; CHECK:		mov.w	&baz+2, r12
+; CHECK:		mov	&baz+2, r12
 
 %T = type { i16, [2 x i8] }
 @duh = internal constant %T { i16 16, [2 x i8][i8 32, i8 64 ] }
diff --git a/test/CodeGen/MSP430/AddrMode-mov-xr.ll b/test/CodeGen/MSP430/AddrMode-mov-xr.ll
index c336289a60d..acc0b825711 100644
--- a/test/CodeGen/MSP430/AddrMode-mov-xr.ll
+++ b/test/CodeGen/MSP430/AddrMode-mov-xr.ll
@@ -7,7 +7,7 @@ define void @am1(i16* %a, i16 %b) nounwind {
 	ret void
 }
 ; CHECK-LABEL: am1:
-; CHECK:		mov.w	r13, 0(r12)
+; CHECK:		mov	r13, 0(r12)
 
 @foo = external global i16
 
@@ -16,7 +16,7 @@ define void @am2(i16 %a) nounwind {
 	ret void
 }
 ; CHECK-LABEL: am2:
-; CHECK:		mov.w	r12, &foo
+; CHECK:		mov	r12, &foo
 
 @bar = external global [2 x i8]
 
@@ -33,7 +33,7 @@ define void @am4(i16 %a) nounwind {
 	ret void
 }
 ; CHECK-LABEL: am4:
-; CHECK:		mov.w	r12, &32
+; CHECK:		mov	r12, &32
 
 define void @am5(i16* nocapture %p, i16 %a) nounwind readonly {
 	%1 = getelementptr inbounds i16, i16* %p, i16 2
@@ -41,7 +41,7 @@ define void @am5(i16* nocapture %p, i16 %a) nounwind readonly {
 	ret void
 }
 ; CHECK-LABEL: am5:
-; CHECK:		mov.w	r13, 4(r12)
+; CHECK:		mov	r13, 4(r12)
 
 %S = type { i16, i16 }
 @baz = common global %S zeroinitializer, align 1
@@ -51,7 +51,7 @@ define void @am6(i16 %a) nounwind {
 	ret void
 }
 ; CHECK-LABEL: am6:
-; CHECK:		mov.w	r12, &baz+2
+; CHECK:		mov	r12, &baz+2
 
 %T = type { i16, [2 x i8] }
 @duh = external global %T
diff --git a/test/CodeGen/MSP430/Inst16mi.ll b/test/CodeGen/MSP430/Inst16mi.ll
index 38c16f2ba23..bb99e28a1ba 100644
--- a/test/CodeGen/MSP430/Inst16mi.ll
+++ b/test/CodeGen/MSP430/Inst16mi.ll
@@ -6,14 +6,14 @@ target triple = "msp430-generic-generic"
 
 define void @mov() nounwind {
 ; CHECK-LABEL: mov:
-; CHECK: mov.w	#2, &foo
+; CHECK: mov	#2, &foo
 	store i16 2, i16 * @foo
 	ret void
 }
 
 define void @add() nounwind {
 ; CHECK-LABEL: add:
-; CHECK: add.w	#2, &foo
+; CHECK: incd	&foo
 	%1 = load i16, i16* @foo
 	%2 = add i16 %1, 2
 	store i16 %2, i16 * @foo
@@ -22,7 +22,7 @@ define void @add() nounwind {
 
 define void @and() nounwind {
 ; CHECK-LABEL: and:
-; CHECK: and.w	#2, &foo
+; CHECK: and	#2, &foo
 	%1 = load i16, i16* @foo
 	%2 = and i16 %1, 2
 	store i16 %2, i16 * @foo
@@ -31,7 +31,7 @@ define void @and() nounwind {
 
 define void @bis() nounwind {
 ; CHECK-LABEL: bis:
-; CHECK: bis.w	#2, &foo
+; CHECK: bis	#2, &foo
 	%1 = load i16, i16* @foo
 	%2 = or i16 %1, 2
 	store i16 %2, i16 * @foo
@@ -40,7 +40,7 @@ define void @bis() nounwind {
 
 define void @xor() nounwind {
 ; CHECK-LABEL: xor:
-; CHECK: xor.w	#2, &foo
+; CHECK: xor	#2, &foo
 	%1 = load i16, i16* @foo
 	%2 = xor i16 %1, 2
 	store i16 %2, i16 * @foo
diff --git a/test/CodeGen/MSP430/Inst16mm.ll b/test/CodeGen/MSP430/Inst16mm.ll
index 14a799b9171..21fab42fd59 100644
--- a/test/CodeGen/MSP430/Inst16mm.ll
+++ b/test/CodeGen/MSP430/Inst16mm.ll
@@ -6,7 +6,7 @@ target triple = "msp430-generic-generic"
 
 define void @mov() nounwind {
 ; CHECK-LABEL: mov:
-; CHECK: mov.w	&bar, &foo
+; CHECK: mov	&bar, &foo
         %1 = load i16, i16* @bar
         store i16 %1, i16* @foo
         ret void
@@ -14,7 +14,7 @@ define void @mov() nounwind {
 
 define void @add() nounwind {
 ; CHECK-LABEL: add:
-; CHECK: add.w	&bar, &foo
+; CHECK: add	&bar, &foo
 	%1 = load i16, i16* @bar
 	%2 = load i16, i16* @foo
 	%3 = add i16 %2, %1
@@ -24,7 +24,7 @@ define void @add() nounwind {
 
 define void @and() nounwind {
 ; CHECK-LABEL: and:
-; CHECK: and.w	&bar, &foo
+; CHECK: and	&bar, &foo
 	%1 = load i16, i16* @bar
 	%2 = load i16, i16* @foo
 	%3 = and i16 %2, %1
@@ -34,7 +34,7 @@ define void @and() nounwind {
 
 define void @bis() nounwind {
 ; CHECK-LABEL: bis:
-; CHECK: bis.w	&bar, &foo
+; CHECK: bis	&bar, &foo
 	%1 = load i16, i16* @bar
 	%2 = load i16, i16* @foo
 	%3 = or i16 %2, %1
@@ -44,7 +44,7 @@ define void @bis() nounwind {
 
 define void @xor() nounwind {
 ; CHECK-LABEL: xor:
-; CHECK: xor.w	&bar, &foo
+; CHECK: xor	&bar, &foo
 	%1 = load i16, i16* @bar
 	%2 = load i16, i16* @foo
 	%3 = xor i16 %2, %1
@@ -64,6 +64,6 @@ entry:
  %0 = load i16, i16* %retval                          ; <i16> [#uses=1]
  ret i16 %0
 ; CHECK-LABEL: mov2:
-; CHECK-DAG:	mov.w	2(r1), 6(r1)
-; CHECK-DAG:	mov.w	0(r1), 4(r1)
+; CHECK-DAG:	mov	2(r1), 6(r1)
+; CHECK-DAG:	mov	0(r1), 4(r1)
 }
diff --git a/test/CodeGen/MSP430/Inst16mr.ll b/test/CodeGen/MSP430/Inst16mr.ll
index 847c093f408..e3f23d9c562 100644
--- a/test/CodeGen/MSP430/Inst16mr.ll
+++ b/test/CodeGen/MSP430/Inst16mr.ll
@@ -5,14 +5,14 @@ target triple = "msp430-generic-generic"
 
 define void @mov(i16 %a) nounwind {
 ; CHECK-LABEL: mov:
-; CHECK: mov.w	r12, &foo
+; CHECK: mov	r12, &foo
 	store i16 %a, i16* @foo
 	ret void
 }
 
 define void @add(i16 %a) nounwind {
 ; CHECK-LABEL: add:
-; CHECK: add.w	r12, &foo
+; CHECK: add	r12, &foo
 	%1 = load i16, i16* @foo
 	%2 = add i16 %a, %1
 	store i16 %2, i16* @foo
@@ -21,7 +21,7 @@ define void @add(i16 %a) nounwind {
 
 define void @and(i16 %a) nounwind {
 ; CHECK-LABEL: and:
-; CHECK: and.w	r12, &foo
+; CHECK: and	r12, &foo
 	%1 = load i16, i16* @foo
 	%2 = and i16 %a, %1
 	store i16 %2, i16* @foo
@@ -30,7 +30,7 @@ define void @and(i16 %a) nounwind {
 
 define void @bis(i16 %a) nounwind {
 ; CHECK-LABEL: bis:
-; CHECK: bis.w	r12, &foo
+; CHECK: bis	r12, &foo
 	%1 = load i16, i16* @foo
 	%2 = or i16 %a, %1
 	store i16 %2, i16* @foo
@@ -39,7 +39,7 @@ define void @bis(i16 %a) nounwind {
 
 define void @bic(i16 zeroext %m) nounwind {
 ; CHECK-LABEL: bic:
-; CHECK: bic.w   r12, &foo
+; CHECK: bic   r12, &foo
         %1 = xor i16 %m, -1
         %2 = load i16, i16* @foo
         %3 = and i16 %2, %1
@@ -49,7 +49,7 @@ define void @bic(i16 zeroext %m) nounwind {
 
 define void @xor(i16 %a) nounwind {
 ; CHECK-LABEL: xor:
-; CHECK: xor.w	r12, &foo
+; CHECK: xor	r12, &foo
 	%1 = load i16, i16* @foo
 	%2 = xor i16 %a, %1
 	store i16 %2, i16* @foo
diff --git a/test/CodeGen/MSP430/Inst16ri.ll b/test/CodeGen/MSP430/Inst16ri.ll
index 3a4bb6a93d9..58b2791194a 100644
--- a/test/CodeGen/MSP430/Inst16ri.ll
+++ b/test/CodeGen/MSP430/Inst16ri.ll
@@ -4,34 +4,34 @@ target triple = "msp430-generic-generic"
 
 define i16 @mov() nounwind {
 ; CHECK-LABEL: mov:
-; CHECK: mov.w	#1, r12
+; CHECK: mov	#1, r12
 	ret i16 1
 }
 
 define i16 @add(i16 %a, i16 %b) nounwind {
 ; CHECK-LABEL: add:
-; CHECK: add.w	#1, r12
+; CHECK: inc	r12
 	%1 = add i16 %a, 1
 	ret i16 %1
 }
 
 define i16 @and(i16 %a, i16 %b) nounwind {
 ; CHECK-LABEL: and:
-; CHECK: and.w	#1, r12
+; CHECK: and	#1, r12
 	%1 = and i16 %a, 1
 	ret i16 %1
 }
 
 define i16 @bis(i16 %a, i16 %b) nounwind {
 ; CHECK-LABEL: bis:
-; CHECK: bis.w	#1, r12
+; CHECK: bis	#1, r12
 	%1 = or i16 %a, 1
 	ret i16 %1
 }
 
 define i16 @xor(i16 %a, i16 %b) nounwind {
 ; CHECK-LABEL: xor:
-; CHECK: xor.w	#1, r12
+; CHECK: xor	#1, r12
 	%1 = xor i16 %a, 1
 	ret i16 %1
 }
diff --git a/test/CodeGen/MSP430/Inst16rm.ll b/test/CodeGen/MSP430/Inst16rm.ll
index 44b8f39d8fa..8a3cd0a46fb 100644
--- a/test/CodeGen/MSP430/Inst16rm.ll
+++ b/test/CodeGen/MSP430/Inst16rm.ll
@@ -5,7 +5,7 @@ target triple = "msp430-generic-generic"
 
 define i16 @add(i16 %a) nounwind {
 ; CHECK-LABEL: add:
-; CHECK: add.w	&foo, r12
+; CHECK: add	&foo, r12
 	%1 = load i16, i16* @foo
 	%2 = add i16 %a, %1
 	ret i16 %2
@@ -13,7 +13,7 @@ define i16 @add(i16 %a) nounwind {
 
 define i16 @and(i16 %a) nounwind {
 ; CHECK-LABEL: and:
-; CHECK: and.w	&foo, r12
+; CHECK: and	&foo, r12
 	%1 = load i16, i16* @foo
 	%2 = and i16 %a, %1
 	ret i16 %2
@@ -21,7 +21,7 @@ define i16 @and(i16 %a) nounwind {
 
 define i16 @bis(i16 %a) nounwind {
 ; CHECK-LABEL: bis:
-; CHECK: bis.w	&foo, r12
+; CHECK: bis	&foo, r12
 	%1 = load i16, i16* @foo
 	%2 = or i16 %a, %1
 	ret i16 %2
@@ -29,7 +29,7 @@ define i16 @bis(i16 %a) nounwind {
 
 define i16  @bic(i16 %a) nounwind {
 ; CHECK-LABEL: bic:
-; CHECK: bic.w	&foo, r12
+; CHECK: bic	&foo, r12
         %1 = load i16, i16* @foo
         %2 = xor i16 %1, -1
         %3 = and i16 %a, %2
@@ -38,7 +38,7 @@ define i16  @bic(i16 %a) nounwind {
 
 define i16 @xor(i16 %a) nounwind {
 ; CHECK-LABEL: xor:
-; CHECK: xor.w	&foo, r12
+; CHECK: xor	&foo, r12
 	%1 = load i16, i16* @foo
 	%2 = xor i16 %a, %1
 	ret i16 %2
diff --git a/test/CodeGen/MSP430/Inst16rr.ll b/test/CodeGen/MSP430/Inst16rr.ll
index 75440ca2b40..124d42113a2 100644
--- a/test/CodeGen/MSP430/Inst16rr.ll
+++ b/test/CodeGen/MSP430/Inst16rr.ll
@@ -4,34 +4,34 @@ target triple = "msp430-generic-generic"
 
 define i16 @mov(i16 %a, i16 %b) nounwind {
 ; CHECK-LABEL: mov:
-; CHECK: mov.w	r13, r12
+; CHECK: mov	r13, r12
 	ret i16 %b
 }
 
 define i16 @add(i16 %a, i16 %b) nounwind {
 ; CHECK-LABEL: add:
-; CHECK: add.w	r13, r12
+; CHECK: add	r13, r12
 	%1 = add i16 %a, %b
 	ret i16 %1
 }
 
 define i16 @and(i16 %a, i16 %b) nounwind {
 ; CHECK-LABEL: and:
-; CHECK: and.w	r13, r12
+; CHECK: and	r13, r12
 	%1 = and i16 %a, %b
 	ret i16 %1
 }
 
 define i16 @bis(i16 %a, i16 %b) nounwind {
 ; CHECK-LABEL: bis:
-; CHECK: bis.w	r13, r12
+; CHECK: bis	r13, r12
 	%1 = or i16 %a, %b
 	ret i16 %1
 }
 
 define i16 @bic(i16 %a, i16 %b) nounwind {
 ; CHECK-LABEL: bic:
-; CHECK: bic.w	r13, r12
+; CHECK: bic	r13, r12
         %1 = xor i16 %b, -1
         %2 = and i16 %a, %1
         ret i16 %2
@@ -39,7 +39,7 @@ define i16 @bic(i16 %a, i16 %b) nounwind {
 
 define i16 @xor(i16 %a, i16 %b) nounwind {
 ; CHECK-LABEL: xor:
-; CHECK: xor.w	r13, r12
+; CHECK: xor	r13, r12
 	%1 = xor i16 %a, %b
 	ret i16 %1
 }
diff --git a/test/CodeGen/MSP430/Inst8mi.ll b/test/CodeGen/MSP430/Inst8mi.ll
index ff22d7e1eb3..36eb3f91f84 100644
--- a/test/CodeGen/MSP430/Inst8mi.ll
+++ b/test/CodeGen/MSP430/Inst8mi.ll
@@ -12,7 +12,7 @@ define void @mov() nounwind {
 
 define void @add() nounwind {
 ; CHECK-LABEL: add:
-; CHECK: add.b	#2, &foo
+; CHECK: incd.b	&foo
 	%1 = load i8, i8* @foo
 	%2 = add i8 %1, 2
 	store i8 %2, i8 * @foo
diff --git a/test/CodeGen/MSP430/Inst8ri.ll b/test/CodeGen/MSP430/Inst8ri.ll
index 0e50f17f2a5..ff3dee8bfb9 100644
--- a/test/CodeGen/MSP430/Inst8ri.ll
+++ b/test/CodeGen/MSP430/Inst8ri.ll
@@ -10,7 +10,7 @@ define i8 @mov() nounwind {
 
 define i8 @add(i8 %a, i8 %b) nounwind {
 ; CHECK-LABEL: add:
-; CHECK: add.b	#1, r12
+; CHECK: inc.b	r12
 	%1 = add i8 %a, 1
 	ret i8 %1
 }
diff --git a/test/CodeGen/MSP430/Inst8rr.ll b/test/CodeGen/MSP430/Inst8rr.ll
index f37bc32a28f..20c4fa5aacf 100644
--- a/test/CodeGen/MSP430/Inst8rr.ll
+++ b/test/CodeGen/MSP430/Inst8rr.ll
@@ -4,7 +4,7 @@ target triple = "msp430-generic-generic"
 
 define i8 @mov(i8 %a, i8 %b) nounwind {
 ; CHECK-LABEL: mov:
-; CHECK: mov.{{[bw]}} r13, r12
+; CHECK: mov r13, r12
 	ret i8 %b
 }
 
@@ -17,14 +17,14 @@ define i8 @add(i8 %a, i8 %b) nounwind {
 
 define i8 @and(i8 %a, i8 %b) nounwind {
 ; CHECK-LABEL: and:
-; CHECK: and.w	r13, r12
+; CHECK: and	r13, r12
 	%1 = and i8 %a, %b
 	ret i8 %1
 }
 
 define i8 @bis(i8 %a, i8 %b) nounwind {
 ; CHECK-LABEL: bis:
-; CHECK: bis.w	r13, r12
+; CHECK: bis	r13, r12
 	%1 = or i8 %a, %b
 	ret i8 %1
 }
@@ -39,7 +39,7 @@ define i8 @bic(i8 %a, i8 %b) nounwind {
 
 define i8 @xor(i8 %a, i8 %b) nounwind {
 ; CHECK-LABEL: xor:
-; CHECK: xor.w	r13, r12
+; CHECK: xor	r13, r12
 	%1 = xor i8 %a, %b
 	ret i8 %1
 }
diff --git a/test/CodeGen/MSP430/asm-clobbers.ll b/test/CodeGen/MSP430/asm-clobbers.ll
index 216a3fe4018..0a0335057f1 100644
--- a/test/CodeGen/MSP430/asm-clobbers.ll
+++ b/test/CodeGen/MSP430/asm-clobbers.ll
@@ -6,8 +6,8 @@ target triple = "msp430---elf"
 define void @test() {
 entry:
 ; CHECK-LABEL: test:
-; CHECK: push.w r10
+; CHECK: push r10
   call void asm sideeffect "", "~{r10}"()
-; CHECK: pop.w r10
+; CHECK: pop r10
   ret void
 }
diff --git a/test/CodeGen/MSP430/bit.ll b/test/CodeGen/MSP430/bit.ll
index 172822fbb5f..a4b781243b4 100644
--- a/test/CodeGen/MSP430/bit.ll
+++ b/test/CodeGen/MSP430/bit.ll
@@ -93,7 +93,7 @@ define i16 @bitwrr(i16 %a, i16 %b) nounwind {
 	ret i16 %t3
 }
 ; CHECK-LABEL: bitwrr:
-; CHECK: bit.w	r13, r12
+; CHECK: bit	r13, r12
 
 define i16 @bitwri(i16 %a) nounwind {
 	%t1 = and i16 %a, 4080
@@ -102,7 +102,7 @@ define i16 @bitwri(i16 %a) nounwind {
 	ret i16 %t3
 }
 ; CHECK-LABEL: bitwri:
-; CHECK: bit.w	#4080, r12
+; CHECK: bit	#4080, r12
 
 define i16 @bitwir(i16 %a) nounwind {
 	%t1 = and i16 4080, %a
@@ -111,7 +111,7 @@ define i16 @bitwir(i16 %a) nounwind {
 	ret i16 %t3
 }
 ; CHECK-LABEL: bitwir:
-; CHECK: bit.w	#4080, r12
+; CHECK: bit	#4080, r12
 
 define i16 @bitwmi() nounwind {
 	%t1 = load i16, i16* @foo16
@@ -121,7 +121,7 @@ define i16 @bitwmi() nounwind {
 	ret i16 %t4
 }
 ; CHECK-LABEL: bitwmi:
-; CHECK: bit.w	#4080, &foo16
+; CHECK: bit	#4080, &foo16
 
 define i16 @bitwim() nounwind {
 	%t1 = load i16, i16* @foo16
@@ -131,7 +131,7 @@ define i16 @bitwim() nounwind {
 	ret i16 %t4
 }
 ; CHECK-LABEL: bitwim:
-; CHECK: bit.w	#4080, &foo16
+; CHECK: bit	#4080, &foo16
 
 define i16 @bitwrm(i16 %a) nounwind {
 	%t1 = load i16, i16* @foo16
@@ -141,7 +141,7 @@ define i16 @bitwrm(i16 %a) nounwind {
 	ret i16 %t4
 }
 ; CHECK-LABEL: bitwrm:
-; CHECK: bit.w	&foo16, r12
+; CHECK: bit	&foo16, r12
 
 define i16 @bitwmr(i16 %a) nounwind {
 	%t1 = load i16, i16* @foo16
@@ -151,7 +151,7 @@ define i16 @bitwmr(i16 %a) nounwind {
 	ret i16 %t4
 }
 ; CHECK-LABEL: bitwmr:
-; CHECK: bit.w	r12, &foo16
+; CHECK: bit	r12, &foo16
 
 define i16 @bitwmm() nounwind {
 	%t1 = load i16, i16* @foo16
@@ -162,5 +162,5 @@ define i16 @bitwmm() nounwind {
 	ret i16 %t5
 }
 ; CHECK-LABEL: bitwmm:
-; CHECK: bit.w	&bar16, &foo16
+; CHECK: bit	&bar16, &foo16
 
diff --git a/test/CodeGen/MSP430/byval.ll b/test/CodeGen/MSP430/byval.ll
index 401896b43c2..838e883d4be 100644
--- a/test/CodeGen/MSP430/byval.ll
+++ b/test/CodeGen/MSP430/byval.ll
@@ -9,7 +9,7 @@ target triple = "msp430---elf"
 define i16 @callee(%struct.Foo* byval %f) nounwind {
 entry:
 ; CHECK-LABEL: callee:
-; CHECK: mov.w 2(r1), r12
+; CHECK: mov 2(r1), r12
   %0 = getelementptr inbounds %struct.Foo, %struct.Foo* %f, i32 0, i32 0
   %1 = load i16, i16* %0, align 2
   ret i16 %1
@@ -18,9 +18,9 @@ entry:
 define void @caller() nounwind {
 entry:
 ; CHECK-LABEL: caller:
-; CHECK: mov.w &foo+4, 4(r1)
-; CHECK-NEXT: mov.w &foo+2, 2(r1)
-; CHECK-NEXT: mov.w &foo, 0(r1)
+; CHECK: mov &foo+4, 4(r1)
+; CHECK-NEXT: mov &foo+2, 2(r1)
+; CHECK-NEXT: mov &foo, 0(r1)
   %call = call i16 @callee(%struct.Foo* byval @foo)
   ret void
 }
diff --git a/test/CodeGen/MSP430/cc_args.ll b/test/CodeGen/MSP430/cc_args.ll
index 70ac901f7e4..eb7e470a9b6 100644
--- a/test/CodeGen/MSP430/cc_args.ll
+++ b/test/CodeGen/MSP430/cc_args.ll
@@ -7,50 +7,50 @@ define void @test() #0 {
 entry:
 ; CHECK: test:
 
-; CHECK: mov.w #1, r12
+; CHECK: mov #1, r12
 ; CHECK: call #f_i16
   call void @f_i16(i16 1)
 
-; CHECK: mov.w #772, r12
-; CHECK: mov.w #258, r13
+; CHECK: mov #772, r12
+; CHECK: mov #258, r13
 ; CHECK: call #f_i32
   call void @f_i32(i32 16909060)
 
-; CHECK: mov.w #1800, r12
-; CHECK: mov.w #1286, r13
-; CHECK: mov.w #772, r14
-; CHECK: mov.w #258, r15
+; CHECK: mov #1800, r12
+; CHECK: mov #1286, r13
+; CHECK: mov #772, r14
+; CHECK: mov #258, r15
 ; CHECK: call #f_i64
   call void @f_i64(i64 72623859790382856)
 
-; CHECK: mov.w #772, r12
-; CHECK: mov.w #258, r13
-; CHECK: mov.w #1800, r14
-; CHECK: mov.w #1286, r15
+; CHECK: mov #772, r12
+; CHECK: mov #258, r13
+; CHECK: mov #1800, r14
+; CHECK: mov #1286, r15
 ; CHECK: call #f_i32_i32
   call void @f_i32_i32(i32 16909060, i32 84281096)
 
-; CHECK: mov.w #1, r12
-; CHECK: mov.w #772, r13
-; CHECK: mov.w #258, r14
-; CHECK: mov.w #2, r15
+; CHECK: mov #1, r12
+; CHECK: mov #772, r13
+; CHECK: mov #258, r14
+; CHECK: mov #2, r15
 ; CHECK: call #f_i16_i32_i16
   call void @f_i16_i32_i16(i16 1, i32 16909060, i16 2)
 
-; CHECK: mov.w #1286, 0(r1)
-; CHECK: mov.w #1, r12
-; CHECK: mov.w #772, r13
-; CHECK: mov.w #258, r14
-; CHECK: mov.w #1800, r15
+; CHECK: mov #1286, 0(r1)
+; CHECK: mov #1, r12
+; CHECK: mov #772, r13
+; CHECK: mov #258, r14
+; CHECK: mov #1800, r15
 ; CHECK: call #f_i16_i32_i32
   call void @f_i16_i32_i32(i16 1, i32 16909060, i32 84281096)
 
-; CHECK: mov.w #258, 6(r1)
-; CHECK: mov.w #772, 4(r1)
-; CHECK: mov.w #1286, 2(r1)
-; CHECK: mov.w #1800, 0(r1)
-; CHECK: mov.w #1, r12
-; CHECK: mov.w #2, r13
+; CHECK: mov #258, 6(r1)
+; CHECK: mov #772, 4(r1)
+; CHECK: mov #1286, 2(r1)
+; CHECK: mov #1800, 0(r1)
+; CHECK: mov #1, r12
+; CHECK: mov #2, r13
 ; CHECK: call #f_i16_i64_i16
   call void @f_i16_i64_i16(i16 1, i64 72623859790382856, i16 2)
 
@@ -63,75 +63,75 @@ entry:
 
 define void @f_i16(i16 %a) #0 {
 ; CHECK: f_i16:
-; CHECK: mov.w r12, &g_i16
+; CHECK: mov r12, &g_i16
   store volatile i16 %a, i16* @g_i16, align 2
   ret void
 }
 
 define void @f_i32(i32 %a) #0 {
 ; CHECK: f_i32:
-; CHECK: mov.w r13, &g_i32+2
-; CHECK: mov.w r12, &g_i32
+; CHECK: mov r13, &g_i32+2
+; CHECK: mov r12, &g_i32
   store volatile i32 %a, i32* @g_i32, align 2
   ret void
 }
 
 define void @f_i64(i64 %a) #0 {
 ; CHECK: f_i64:
-; CHECK: mov.w r15, &g_i64+6
-; CHECK: mov.w r14, &g_i64+4
-; CHECK: mov.w r13, &g_i64+2
-; CHECK: mov.w r12, &g_i64
+; CHECK: mov r15, &g_i64+6
+; CHECK: mov r14, &g_i64+4
+; CHECK: mov r13, &g_i64+2
+; CHECK: mov r12, &g_i64
   store volatile i64 %a, i64* @g_i64, align 2
   ret void
 }
 
 define void @f_i32_i32(i32 %a, i32 %b) #0 {
 ; CHECK: f_i32_i32:
-; CHECK: mov.w r13, &g_i32+2
-; CHECK: mov.w r12, &g_i32
+; CHECK: mov r13, &g_i32+2
+; CHECK: mov r12, &g_i32
   store volatile i32 %a, i32* @g_i32, align 2
-; CHECK: mov.w r15, &g_i32+2
-; CHECK: mov.w r14, &g_i32
+; CHECK: mov r15, &g_i32+2
+; CHECK: mov r14, &g_i32
   store volatile i32 %b, i32* @g_i32, align 2
   ret void
 }
 
 define void @f_i16_i32_i32(i16 %a, i32 %b, i32 %c) #0 {
 ; CHECK: f_i16_i32_i32:
-; CHECK: mov.w r12, &g_i16
+; CHECK: mov r12, &g_i16
   store volatile i16 %a, i16* @g_i16, align 2
-; CHECK: mov.w r14, &g_i32+2
-; CHECK: mov.w r13, &g_i32
+; CHECK: mov r14, &g_i32+2
+; CHECK: mov r13, &g_i32
   store volatile i32 %b, i32* @g_i32, align 2
-; CHECK: mov.w r15, &g_i32
-; CHECK: mov.w 4(r4), &g_i32+2
+; CHECK: mov r15, &g_i32
+; CHECK: mov 4(r4), &g_i32+2
   store volatile i32 %c, i32* @g_i32, align 2
   ret void
 }
 
 define void @f_i16_i32_i16(i16 %a, i32 %b, i16 %c) #0 {
 ; CHECK: f_i16_i32_i16:
-; CHECK: mov.w r12, &g_i16
+; CHECK: mov r12, &g_i16
   store volatile i16 %a, i16* @g_i16, align 2
-; CHECK: mov.w r14, &g_i32+2
-; CHECK: mov.w r13, &g_i32
+; CHECK: mov r14, &g_i32+2
+; CHECK: mov r13, &g_i32
   store volatile i32 %b, i32* @g_i32, align 2
-; CHECK: mov.w r15, &g_i16
+; CHECK: mov r15, &g_i16
   store volatile i16 %c, i16* @g_i16, align 2
   ret void
 }
 
 define void @f_i16_i64_i16(i16 %a, i64 %b, i16 %c) #0 {
 ; CHECK: f_i16_i64_i16:
-; CHECK: mov.w r12, &g_i16
+; CHECK: mov r12, &g_i16
   store volatile i16 %a, i16* @g_i16, align 2
-;CHECK: mov.w 10(r4), &g_i64+6
-;CHECK: mov.w 8(r4), &g_i64+4
-;CHECK: mov.w 6(r4), &g_i64+2
-;CHECK: mov.w 4(r4), &g_i64
+;CHECK: mov 10(r4), &g_i64+6
+;CHECK: mov 8(r4), &g_i64+4
+;CHECK: mov 6(r4), &g_i64+2
+;CHECK: mov 4(r4), &g_i64
   store volatile i64 %b, i64* @g_i64, align 2
-;CHECK: mov.w r13, &g_i16
+;CHECK: mov r13, &g_i16
   store volatile i16 %c, i16* @g_i16, align 2
   ret void
 }
diff --git a/test/CodeGen/MSP430/cc_ret.ll b/test/CodeGen/MSP430/cc_ret.ll
index 937db6dbf3b..b4bb0554208 100644
--- a/test/CodeGen/MSP430/cc_ret.ll
+++ b/test/CodeGen/MSP430/cc_ret.ll
@@ -8,21 +8,21 @@ entry:
 ; CHECK: test:
 
 ; CHECK: call #f_i16
-; CHECK: mov.w r12, &g_i16
+; CHECK: mov r12, &g_i16
   %0 = call i16 @f_i16()
   store volatile i16 %0, i16* @g_i16
 
 ; CHECK: call #f_i32
-; CHECK: mov.w r13, &g_i32+2
-; CHECK: mov.w r12, &g_i32
+; CHECK: mov r13, &g_i32+2
+; CHECK: mov r12, &g_i32
   %1 = call i32 @f_i32()
   store volatile i32 %1, i32* @g_i32
 
 ; CHECK: call #f_i64
-; CHECK: mov.w r15, &g_i64+6
-; CHECK: mov.w r14, &g_i64+4
-; CHECK: mov.w r13, &g_i64+2
-; CHECK: mov.w r12, &g_i64
+; CHECK: mov r15, &g_i64+6
+; CHECK: mov r14, &g_i64+4
+; CHECK: mov r13, &g_i64+2
+; CHECK: mov r12, &g_i64
   %2 = call i64 @f_i64()
   store volatile i64 %2, i64* @g_i64
 
@@ -35,25 +35,25 @@ entry:
 
 define i16 @f_i16() #0 {
 ; CHECK: f_i16:
-; CHECK: mov.w #1, r12
+; CHECK: mov #1, r12
 ; CHECK: ret
   ret i16 1
 }
 
 define i32 @f_i32() #0 {
 ; CHECK: f_i32:
-; CHECK: mov.w #772, r12
-; CHECK: mov.w #258, r13
+; CHECK: mov #772, r12
+; CHECK: mov #258, r13
 ; CHECK: ret
   ret i32 16909060
 }
 
 define i64 @f_i64() #0 {
 ; CHECK: f_i64:
-; CHECK: mov.w #1800, r12
-; CHECK: mov.w #1286, r13
-; CHECK: mov.w #772, r14
-; CHECK: mov.w #258, r15
+; CHECK: mov #1800, r12
+; CHECK: mov #1286, r13
+; CHECK: mov #772, r14
+; CHECK: mov #258, r15
 ; CHECK: ret
   ret i64 72623859790382856
 }
diff --git a/test/CodeGen/MSP430/fp.ll b/test/CodeGen/MSP430/fp.ll
index 2559e23ae1f..87c4055829c 100644
--- a/test/CodeGen/MSP430/fp.ll
+++ b/test/CodeGen/MSP430/fp.ll
@@ -6,13 +6,13 @@ target triple = "msp430---elf"
 define void @fp() nounwind {
 entry:
 ; CHECK-LABEL: fp:
-; CHECK: push.w r4
-; CHECK: mov.w r1, r4
-; CHECK: sub.w #2, r1
+; CHECK: push r4
+; CHECK: mov r1, r4
+; CHECK: sub #2, r1
   %i = alloca i16, align 2
-; CHECK: mov.w #0, -2(r4)
+; CHECK: clr -2(r4)
   store i16 0, i16* %i, align 2
-; CHECK: pop.w r4
+; CHECK: pop r4
   ret void
 }
 
diff --git a/test/CodeGen/MSP430/jumptable.ll b/test/CodeGen/MSP430/jumptable.ll
index 49f23166a0a..6121f7ebed6 100644
--- a/test/CodeGen/MSP430/jumptable.ll
+++ b/test/CodeGen/MSP430/jumptable.ll
@@ -7,15 +7,15 @@ target triple = "msp430---elf"
 define i16 @test(i16 %i) #0 {
 entry:
 ; CHECK-LABEL: test:
-; CHECK:      sub.w   #4, r1
-; CHECK-NEXT: mov.w   r12, 0(r1)
-; CHECK-NEXT: cmp.w   #4, r12
+; CHECK:      sub   #4, r1
+; CHECK-NEXT: mov   r12, 0(r1)
+; CHECK-NEXT: cmp   #4, r12
 ; CHECK-NEXT: jhs     .LBB0_3
   %retval = alloca i16, align 2
   %i.addr = alloca i16, align 2
   store i16 %i, i16* %i.addr, align 2
   %0 = load i16, i16* %i.addr, align 2
-; CHECK:      rla.w r12
+; CHECK:      add   r12, r12
 ; CHECK-NEXT: br .LJTI0_0(r12)
   switch i16 %0, label %sw.default [
     i16 0, label %sw.bb
diff --git a/test/CodeGen/MSP430/memset.ll b/test/CodeGen/MSP430/memset.ll
index 10b506c60d9..0f83b607820 100644
--- a/test/CodeGen/MSP430/memset.ll
+++ b/test/CodeGen/MSP430/memset.ll
@@ -9,9 +9,9 @@ define void @test() nounwind {
 entry:
 ; CHECK-LABEL: test:
   %0 = load i8*, i8** @buf, align 2
-; CHECK: mov.w &buf, r12
-; CHECK-NEXT: mov.w #5, r13
-; CHECK-NEXT: mov.w #128, r14
+; CHECK: mov &buf, r12
+; CHECK-NEXT: mov #5, r13
+; CHECK-NEXT: mov #128, r14
 ; CHECK-NEXT: call #memset
   call void @llvm.memset.p0i8.i16(i8* %0, i8 5, i16 128, i1 false)
   ret void
diff --git a/test/CodeGen/MSP430/misched-msp430.ll b/test/CodeGen/MSP430/misched-msp430.ll
index 3d18fa005a6..f44f10ccd3e 100644
--- a/test/CodeGen/MSP430/misched-msp430.ll
+++ b/test/CodeGen/MSP430/misched-msp430.ll
@@ -10,7 +10,7 @@ target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16"
 ; only verifies that the code generator ran successfully.
 ;
 ; CHECK-LABEL: @f
-; CHECK: mov.w &y, &x
+; CHECK: mov &y, &x
 ; CHECK: ret
 define void @f() {
 entry:
diff --git a/test/CodeGen/MSP430/postinc.ll b/test/CodeGen/MSP430/postinc.ll
index 75a927f33fc..20ee8fb3c85 100644
--- a/test/CodeGen/MSP430/postinc.ll
+++ b/test/CodeGen/MSP430/postinc.ll
@@ -12,7 +12,7 @@ for.body:                                         ; preds = %for.body, %entry
   %sum.09 = phi i16 [ 0, %entry ], [ %add, %for.body ] ; <i16> [#uses=1]
   %arrayidx = getelementptr i16, i16* %a, i16 %i.010   ; <i16*> [#uses=1]
 ; CHECK-LABEL: add:
-; CHECK: add.w @r{{[0-9]+}}+, r{{[0-9]+}}
+; CHECK: add @r{{[0-9]+}}+, r{{[0-9]+}}
   %tmp4 = load i16, i16* %arrayidx                     ; <i16> [#uses=1]
   %add = add i16 %tmp4, %sum.09                   ; <i16> [#uses=2]
   %inc = add i16 %i.010, 1                        ; <i16> [#uses=2]
@@ -34,7 +34,7 @@ for.body:                                         ; preds = %for.body, %entry
   %sum.09 = phi i16 [ 0, %entry ], [ %add, %for.body ] ; <i16> [#uses=1]
   %arrayidx = getelementptr i16, i16* %a, i16 %i.010   ; <i16*> [#uses=1]
 ; CHECK-LABEL: sub:
-; CHECK: sub.w @r{{[0-9]+}}+, r{{[0-9]+}}
+; CHECK: sub @r{{[0-9]+}}+, r{{[0-9]+}}
   %tmp4 = load i16, i16* %arrayidx                     ; <i16> [#uses=1]
   %add = sub i16 %tmp4, %sum.09                   ; <i16> [#uses=2]
   %inc = add i16 %i.010, 1                        ; <i16> [#uses=2]
@@ -56,7 +56,7 @@ for.body:                                         ; preds = %for.body, %entry
   %sum.09 = phi i16 [ 0, %entry ], [ %add, %for.body ] ; <i16> [#uses=1]
   %arrayidx = getelementptr i16, i16* %a, i16 %i.010   ; <i16*> [#uses=1]
 ; CHECK-LABEL: or:
-; CHECK: bis.w @r{{[0-9]+}}+, r{{[0-9]+}}
+; CHECK: bis @r{{[0-9]+}}+, r{{[0-9]+}}
   %tmp4 = load i16, i16* %arrayidx                     ; <i16> [#uses=1]
   %add = or i16 %tmp4, %sum.09                   ; <i16> [#uses=2]
   %inc = add i16 %i.010, 1                        ; <i16> [#uses=2]
@@ -78,7 +78,7 @@ for.body:                                         ; preds = %for.body, %entry
   %sum.09 = phi i16 [ 0, %entry ], [ %add, %for.body ] ; <i16> [#uses=1]
   %arrayidx = getelementptr i16, i16* %a, i16 %i.010   ; <i16*> [#uses=1]
 ; CHECK-LABEL: xor:
-; CHECK: xor.w @r{{[0-9]+}}+, r{{[0-9]+}}
+; CHECK: xor @r{{[0-9]+}}+, r{{[0-9]+}}
   %tmp4 = load i16, i16* %arrayidx                     ; <i16> [#uses=1]
   %add = xor i16 %tmp4, %sum.09                   ; <i16> [#uses=2]
   %inc = add i16 %i.010, 1                        ; <i16> [#uses=2]
@@ -100,7 +100,7 @@ for.body:                                         ; preds = %for.body, %entry
   %sum.09 = phi i16 [ 0, %entry ], [ %add, %for.body ] ; <i16> [#uses=1]
   %arrayidx = getelementptr i16, i16* %a, i16 %i.010   ; <i16*> [#uses=1]
 ; CHECK-LABEL: and:
-; CHECK: and.w @r{{[0-9]+}}+, r{{[0-9]+}}
+; CHECK: and @r{{[0-9]+}}+, r{{[0-9]+}}
   %tmp4 = load i16, i16* %arrayidx                     ; <i16> [#uses=1]
   %add = and i16 %tmp4, %sum.09                   ; <i16> [#uses=2]
   %inc = add i16 %i.010, 1                        ; <i16> [#uses=2]
diff --git a/test/CodeGen/MSP430/select-use-sr.ll b/test/CodeGen/MSP430/select-use-sr.ll
index 3f67fb85f79..159fc93db5a 100644
--- a/test/CodeGen/MSP430/select-use-sr.ll
+++ b/test/CodeGen/MSP430/select-use-sr.ll
@@ -6,8 +6,8 @@ target triple = "msp430"
 ; Test that CMP instruction is not removed by MachineCSE.
 ;
 ; CHECK-LABEL: @f
-; CHECK: cmp.w r15, r13
-; CHECK: cmp.w r15, r13
+; CHECK: cmp r15, r13
+; CHECK: cmp r15, r13
 ; CHECK-NEXT: jeq .LBB0_2
 define i16 @f(i16, i16, i16, i16) {
 entry:
diff --git a/test/CodeGen/MSP430/setcc.ll b/test/CodeGen/MSP430/setcc.ll
index 6e2ec8ea3ea..52baf642903 100644
--- a/test/CodeGen/MSP430/setcc.ll
+++ b/test/CodeGen/MSP430/setcc.ll
@@ -9,10 +9,10 @@ define i16 @sccweqand(i16 %a, i16 %b) nounwind {
 	ret i16 %t3
 }
 ; CHECK-LABEL: sccweqand:
-; CHECK:	bit.w	r13, r12
-; CHECK:	mov.w	r2, r12
-; CHECK:	rra.w   r12
-; CHECK:	and.w	#1, r12
+; CHECK:	bit	r13, r12
+; CHECK:	mov	r2, r12
+; CHECK:	rra   r12
+; CHECK:	and	#1, r12
 
 define i16 @sccwneand(i16 %a, i16 %b) nounwind {
 	%t1 = and i16 %a, %b
@@ -21,9 +21,9 @@ define i16 @sccwneand(i16 %a, i16 %b) nounwind {
 	ret i16 %t3
 }
 ; CHECK-LABEL: sccwneand:
-; CHECK: 	bit.w	r13, r12
-; CHECK:	mov.w	r2, r12
-; CHECK:	and.w	#1, r12
+; CHECK: 	bit	r13, r12
+; CHECK:	mov	r2, r12
+; CHECK:	and	#1, r12
 
 define i16 @sccwne(i16 %a, i16 %b) nounwind {
 	%t1 = icmp ne i16 %a, %b
@@ -31,11 +31,11 @@ define i16 @sccwne(i16 %a, i16 %b) nounwind {
 	ret i16 %t2
 }
 ; CHECK-LABEL:sccwne:
-; CHECK:	cmp.w	r13, r12
-; CHECK:	mov.w	r2, r13
-; CHECK:	rra.w	r13
-; CHECK:	mov.w	#1, r12
-; CHECK:	bic.w	r13, r12
+; CHECK:	cmp	r13, r12
+; CHECK:	mov	r2, r13
+; CHECK:	rra	r13
+; CHECK:	mov	#1, r12
+; CHECK:	bic	r13, r12
 
 define i16 @sccweq(i16 %a, i16 %b) nounwind {
 	%t1 = icmp eq i16 %a, %b
@@ -43,10 +43,10 @@ define i16 @sccweq(i16 %a, i16 %b) nounwind {
 	ret i16 %t2
 }
 ; CHECK-LABEL:sccweq:
-; CHECK:	cmp.w	r13, r12
-; CHECK:	mov.w	r2, r12
-; CHECK:	rra.w	r12
-; CHECK:	and.w	#1, r12
+; CHECK:	cmp	r13, r12
+; CHECK:	mov	r2, r12
+; CHECK:	rra	r12
+; CHECK:	and	#1, r12
 
 define i16 @sccwugt(i16 %a, i16 %b) nounwind {
 	%t1 = icmp ugt i16 %a, %b
@@ -54,9 +54,9 @@ define i16 @sccwugt(i16 %a, i16 %b) nounwind {
 	ret i16 %t2
 }
 ; CHECK-LABEL:sccwugt:
-; CHECK:	cmp.w	r12, r13
-; CHECK:	mov.w	#1, r12
-; CHECK:	bic.w	r2, r12
+; CHECK:	cmp	r12, r13
+; CHECK:	mov	#1, r12
+; CHECK:	bic	r2, r12
 
 define i16 @sccwuge(i16 %a, i16 %b) nounwind {
 	%t1 = icmp uge i16 %a, %b
@@ -64,9 +64,9 @@ define i16 @sccwuge(i16 %a, i16 %b) nounwind {
 	ret i16 %t2
 }
 ; CHECK-LABEL:sccwuge:
-; CHECK:	cmp.w	r13, r12
-; CHECK:	mov.w	r2, r12
-; CHECK:	and.w	#1, r12
+; CHECK:	cmp	r13, r12
+; CHECK:	mov	r2, r12
+; CHECK:	and	#1, r12
 
 define i16 @sccwult(i16 %a, i16 %b) nounwind {
 	%t1 = icmp ult i16 %a, %b
@@ -74,9 +74,9 @@ define i16 @sccwult(i16 %a, i16 %b) nounwind {
 	ret i16 %t2
 }
 ; CHECK-LABEL:sccwult:
-; CHECK:	cmp.w	r13, r12
-; CHECK:	mov.w	#1, r12
-; CHECK:	bic.w	r2, r12
+; CHECK:	cmp	r13, r12
+; CHECK:	mov	#1, r12
+; CHECK:	bic	r2, r12
 
 define i16 @sccwule(i16 %a, i16 %b) nounwind {
 	%t1 = icmp ule i16 %a, %b
@@ -84,9 +84,9 @@ define i16 @sccwule(i16 %a, i16 %b) nounwind {
 	ret i16 %t2
 }
 ; CHECK-LABEL:sccwule:
-; CHECK:	cmp.w	r12, r13
-; CHECK:	mov.w	r2, r12
-; CHECK:	and.w	#1, r12
+; CHECK:	cmp	r12, r13
+; CHECK:	mov	r2, r12
+; CHECK:	and	#1, r12
 
 define i16 @sccwsgt(i16 %a, i16 %b) nounwind {
 	%t1 = icmp sgt i16 %a, %b
diff --git a/test/CodeGen/MSP430/shifts.ll b/test/CodeGen/MSP430/shifts.ll
index 22ae59ef4b0..6d4050f42be 100644
--- a/test/CodeGen/MSP430/shifts.ll
+++ b/test/CodeGen/MSP430/shifts.ll
@@ -21,7 +21,7 @@ entry:
 define zeroext i8 @shl8(i8 zeroext %a, i8 zeroext %cnt) nounwind readnone {
 entry:
 ; CHECK: shl8
-; CHECK: rla.b
+; CHECK: add.b
   %shl = shl i8 %a, %cnt
   ret i8 %shl
 }
@@ -29,7 +29,7 @@ entry:
 define zeroext i16 @lshr16(i16 zeroext %a, i16 zeroext %cnt) nounwind readnone {
 entry:
 ; CHECK-LABEL: lshr16:
-; CHECK: rrc.w
+; CHECK: rrc
   %shr = lshr i16 %a, %cnt
   ret i16 %shr
 }
@@ -37,7 +37,7 @@ entry:
 define signext i16 @ashr16(i16 signext %a, i16 zeroext %cnt) nounwind readnone {
 entry:
 ; CHECK-LABEL: ashr16:
-; CHECK: rra.w
+; CHECK: rra
   %shr = ashr i16 %a, %cnt
   ret i16 %shr
 }
@@ -45,7 +45,7 @@ entry:
 define zeroext i16 @shl16(i16 zeroext %a, i16 zeroext %cnt) nounwind readnone {
 entry:
 ; CHECK-LABEL: shl16:
-; CHECK: rla.w
+; CHECK: add
   %shl = shl i16 %a, %cnt
   ret i16 %shl
 }
diff --git a/test/CodeGen/MSP430/struct-return.ll b/test/CodeGen/MSP430/struct-return.ll
index c28bf06af43..a52ea1b702a 100644
--- a/test/CodeGen/MSP430/struct-return.ll
+++ b/test/CodeGen/MSP430/struct-return.ll
@@ -9,14 +9,14 @@ target triple = "msp430---elf"
 
 define %s @fred() #0 {
 ; CHECK-LABEL: fred:
-; CHECK: mov.w	#2314, 14(r12)
-; CHECK: mov.w	#2828, 12(r12)
-; CHECK: mov.w	#3342, 10(r12)
-; CHECK: mov.w	#3840, 8(r12)
-; CHECK: mov.w	#258, 6(r12)
-; CHECK: mov.w	#772, 4(r12)
-; CHECK: mov.w	#1286, 2(r12)
-; CHECK: mov.w	#1800, 0(r12)
+; CHECK: mov	#2314, 14(r12)
+; CHECK: mov	#2828, 12(r12)
+; CHECK: mov	#3342, 10(r12)
+; CHECK: mov	#3840, 8(r12)
+; CHECK: mov	#258, 6(r12)
+; CHECK: mov	#772, 4(r12)
+; CHECK: mov	#1286, 2(r12)
+; CHECK: mov	#1800, 0(r12)
   ret %s {i64 72623859790382856, i64 651345242494996224} 
 }
 
diff --git a/test/CodeGen/MSP430/struct_layout.ll b/test/CodeGen/MSP430/struct_layout.ll
index 60ae9f09b4e..4c5a131acca 100644
--- a/test/CodeGen/MSP430/struct_layout.ll
+++ b/test/CodeGen/MSP430/struct_layout.ll
@@ -5,7 +5,7 @@ target triple = "msp430"
 %struct.X = type { i8 }
 
 ; CHECK-LABEL: @foo
-; CHECK: sub.w   #4, r1
+; CHECK: sub   #4, r1
 ; CHECK: mov.b   #1, 3(r1)
 define void @foo() {
   %1 = alloca %struct.X
@@ -21,7 +21,7 @@ define void @foo() {
 }
 
 ; CHECK-LABEL: @bar
-; CHECK: sub.w   #4, r1
+; CHECK: sub   #4, r1
 ; CHECK: mov.b   #1, 3(r1)
 define void @bar() {
   %1 = alloca [3 x %struct.X]
@@ -40,8 +40,8 @@ define void @bar() {
 %struct.Y = type { i8, i16 }
 
 ; CHECK-LABEL: @baz
-; CHECK: sub.w   #8, r1
-; CHECK: mov.w   #2, 6(r1)
+; CHECK: sub   #8, r1
+; CHECK: mov   #2, 6(r1)
 define void @baz() {
   %1 = alloca %struct.Y, align 2
   %2 = alloca %struct.Y, align 2
diff --git a/test/CodeGen/MSP430/transient-stack-alignment.ll b/test/CodeGen/MSP430/transient-stack-alignment.ll
index cca83509cf4..a2ddf8a0b08 100644
--- a/test/CodeGen/MSP430/transient-stack-alignment.ll
+++ b/test/CodeGen/MSP430/transient-stack-alignment.ll
@@ -5,11 +5,11 @@ target triple = "msp430---elf"
 
 define void @test() #0 {
 ; CHECK-LABEL: test:
-; CHECK: sub.w #2, r1
+; CHECK: sub #2, r1
   %1 = alloca i8, align 1
-; CHECK-NEXT: mov.b #0, 1(r1)
+; CHECK-NEXT: clr.b 1(r1)
   store i8 0, i8* %1, align 1
-; CHECK-NEXT: add.w #2, r1
+; CHECK-NEXT: add #2, r1
 ; CHECK-NEXT: ret
   ret void
 }
diff --git a/test/CodeGen/MSP430/vararg.ll b/test/CodeGen/MSP430/vararg.ll
index 3501861f575..edb61d2221e 100644
--- a/test/CodeGen/MSP430/vararg.ll
+++ b/test/CodeGen/MSP430/vararg.ll
@@ -10,12 +10,12 @@ declare void @llvm.va_copy(i8*, i8*) nounwind
 define void @va_start(i16 %a, ...) nounwind {
 entry:
 ; CHECK-LABEL: va_start:
-; CHECK: sub.w #2, r1
+; CHECK: sub #2, r1
   %vl = alloca i8*, align 2
   %vl1 = bitcast i8** %vl to i8*
-; CHECK-NEXT: mov.w r1, [[REG:r[0-9]+]]
-; CHECK-NEXT: add.w #6, [[REG]]
-; CHECK-NEXT: mov.w [[REG]], 0(r1)
+; CHECK-NEXT: mov r1, [[REG:r[0-9]+]]
+; CHECK-NEXT: add #6, [[REG]]
+; CHECK-NEXT: mov [[REG]], 0(r1)
   call void @llvm.va_start(i8* %vl1)
   call void @llvm.va_end(i8* %vl1)
   ret void
@@ -26,11 +26,11 @@ entry:
 ; CHECK-LABEL: va_arg:
   %vl.addr = alloca i8*, align 2
   store i8* %vl, i8** %vl.addr, align 2
-; CHECK: mov.w r12, [[REG:r[0-9]+]]
-; CHECK-NEXT: add.w #2, [[REG]]
-; CHECK-NEXT: mov.w [[REG]], 0(r1)
+; CHECK: mov r12, [[REG:r[0-9]+]]
+; CHECK-NEXT: incd [[REG]]
+; CHECK-NEXT: mov [[REG]], 0(r1)
   %0 = va_arg i8** %vl.addr, i16
-; CHECK-NEXT: mov.w 0(r12), r12
+; CHECK-NEXT: mov 0(r12), r12
   ret i16 %0
 }
 
@@ -39,11 +39,11 @@ entry:
 ; CHECK-LABEL: va_copy:
   %vl.addr = alloca i8*, align 2
   %vl2 = alloca i8*, align 2
-; CHECK-DAG: mov.w r12, 2(r1)
+; CHECK-DAG: mov r12, 2(r1)
   store i8* %vl, i8** %vl.addr, align 2
   %0 = bitcast i8** %vl2 to i8*
   %1 = bitcast i8** %vl.addr to i8*
-; CHECK-DAG: mov.w r12, 0(r1)
+; CHECK-DAG: mov r12, 0(r1)
   call void @llvm.va_copy(i8* %0, i8* %1)
   ret void
 }
diff --git a/test/MC/Disassembler/MSP430/lit.local.cfg b/test/MC/Disassembler/MSP430/lit.local.cfg
new file mode 100644
index 00000000000..b1cf1fbd21d
--- /dev/null
+++ b/test/MC/Disassembler/MSP430/lit.local.cfg
@@ -0,0 +1,3 @@
+if not 'MSP430' in config.root.targets:
+    config.unsupported = True
+
diff --git a/test/MC/Disassembler/MSP430/msp430.txt b/test/MC/Disassembler/MSP430/msp430.txt
new file mode 100644
index 00000000000..c7d6ff576da
--- /dev/null
+++ b/test/MC/Disassembler/MSP430/msp430.txt
@@ -0,0 +1,27 @@
+# RUN: llvm-mc -disassemble %s -triple=msp430 | FileCheck %s
+0x0f 0x47                     # CHECK: mov r7, r15
+0x2f 0x48                     # CHECK: mov @r8, r15
+0x3f 0x48                     # CHECK: mov @r8+, r15
+0x0f 0x43                     # CHECK: clr r15
+0x08 0x57                     # CHECK: add r7, r8
+0x28 0x57                     # CHECK: add @r7, r8
+0x38 0x57                     # CHECK: add @r7+, r8
+0x87 0x12                     # CHECK: call r7
+0x00 0x47                     # CHECK: br r7
+0x39 0xb2                     # CHECK: bit #8, r9
+
+0xfe 0x3f                     # CHECK: jmp $-2
+0xfe 0x23                     # CHECK: jne $-2
+
+0x3f 0x40 0x2a 0x00           # CHECK: mov #42, r15
+0x1f 0x48 0x2a 0x00           # CHECK: mov 42(r8), r15
+0x1f 0x42 0x2a 0x00           # CHECK: mov &42, r15
+0x1f 0x40 0x2a 0x00           # CHECK: mov 42, r15
+0xb0 0x12 0x81 0x01           # CHECK: call #385
+0x97 0x12 0x06 0x00           # CHECK: call 6(r7)
+0xa7 0xb2 0x02 0x00           # CHECK: bit #34, 2(r7)
+0xa9 0x57 0x08 0x00           # CHECK: add @r7, 8(r9)
+0xb7 0xe7 0xfe 0xff           # CHECK: xor @r7+, -2(r7)
+
+0xbf 0x40 0x2a 0x00 0x0c 0x00 # CHECK: mov #42, 12(r15)
+0x9a 0xb9 0x10 0x00 0x08 0x00 # CHECK: bit 16(r9), 8(r10)
diff --git a/test/MC/MSP430/addrmode.s b/test/MC/MSP430/addrmode.s
new file mode 100644
index 00000000000..46051c00fed
--- /dev/null
+++ b/test/MC/MSP430/addrmode.s
@@ -0,0 +1,110 @@
+; RUN: llvm-mc -triple msp430 -show-encoding < %s | FileCheck %s
+
+foo:
+  mov r8, r15
+  mov disp+2(r8), r15
+  mov disp+2, r15
+  mov &disp+2, r15
+  mov @r8, r15
+  mov @r8+, r15
+  mov #disp+2, r15
+
+; CHECK: mov r8, r15           ; encoding: [0x0f,0x48]
+; CHECK: mov disp+2(r8), r15   ; encoding: [0x1f,0x48,A,A]
+; CHECK: mov disp+2, r15       ; encoding: [0x1f,0x40,A,A]
+; CHECK: mov &disp+2, r15      ; encoding: [0x1f,0x42,A,A]
+; CHECK: mov @r8, r15          ; encoding: [0x2f,0x48]
+; CHECK: mov @r8+, r15         ; encoding: [0x3f,0x48]
+; CHECK: mov #disp+2, r15      ; encoding: [0x3f,0x40,A,A]
+
+  mov #42, r15
+  mov #42, 12(r15)
+  mov #42, &disp
+  mov disp, disp+2
+
+; CHECK: mov #42, r15          ; encoding: [0x3f,0x40,0x2a,0x00]
+; CHECK: mov #42, 12(r15)      ; encoding: [0xbf,0x40,0x2a,0x00,0x0c,0x00]
+; CHECK: mov #42, &disp        ; encoding: [0xb2,0x40,0x2a,0x00,A,A]
+; CHECK: mov disp, disp+2      ; encoding: [0x90,0x40,A,A,B,B]
+
+  add r7, r8
+  add 6(r7), r8
+  add &disp, r8
+  add disp, r8
+  add @r9, r8
+  add @r9+, r8
+  add #42, r8
+
+; CHECK: add r7, r8            ; encoding: [0x08,0x57]
+; CHECK: add 6(r7), r8         ; encoding: [0x18,0x57,0x06,0x00]
+; CHECK: add &disp, r8         ; encoding: [0x18,0x52,A,A]
+; CHECK: add disp, r8          ; encoding: [0x18,0x50,A,A]
+; CHECK: add @r9, r8           ; encoding: [0x28,0x59]
+; CHECK: add @r9+, r8          ; encoding: [0x38,0x59]
+; CHECK: add #42, r8           ; encoding: [0x38,0x50,0x2a,0x00]
+
+  add r7, 6(r5)
+  add 6(r7), 6(r5)
+  add &disp, 6(r5)
+  add disp, 6(r5)
+  add @r9, 6(r5)
+  add @r9+, 6(r5)
+  add #42, 6(r5)
+
+; CHECK: add r7, 6(r5)         ; encoding: [0x85,0x57,0x06,0x00]
+; CHECK: add 6(r7), 6(r5)      ; encoding: [0x95,0x57,0x06,0x00,0x06,0x00]
+; CHECK: add &disp, 6(r5)      ; encoding: [0x95,0x52,A,A,0x06,0x00]
+; CHECK: add disp, 6(r5)       ; encoding: [0x95,0x50,A,A,0x06,0x00]
+; CHECK: add @r9, 6(r5)        ; encoding: [0xa5,0x59,0x06,0x00]
+; CHECK: add @r9+, 6(r5)       ; encoding: [0xb5,0x59,0x06,0x00]
+; CHECK: add #42, 6(r5)        ; encoding: [0xb5,0x50,0x2a,0x00,0x06,0x00]
+
+  add r7, &disp
+  add 6(r7), &disp
+  add &disp, &disp
+  add disp, &disp
+  add @r9, &disp
+  add @r9+, &disp
+  add #42, &disp
+
+; CHECK: add r7, &disp         ; encoding: [0x82,0x57,A,A]
+; CHECK: add 6(r7), &disp      ; encoding: [0x92,0x57,0x06,0x00,A,A]
+; CHECK: add &disp, &disp      ; encoding: [0x92,0x52,A,A,B,B]
+; CHECK: add disp, &disp       ; encoding: [0x92,0x50,A,A,B,B]
+; CHECK: add @r9, &disp        ; encoding: [0xa2,0x59,A,A]
+; CHECK: add @r9+, &disp       ; encoding: [0xb2,0x59,A,A]
+; CHECK: add #42, &disp        ; encoding: [0xb2,0x50,0x2a,0x00,A,A]
+
+  add r7, disp
+  add 6(r7), disp
+  add &disp, disp
+  add disp, disp
+  add @r9, disp
+  add @r9+, disp
+  add #42, disp
+
+; CHECK: add r7, disp          ; encoding: [0x80,0x57,A,A]
+; CHECK: add 6(r7), disp       ; encoding: [0x90,0x57,0x06,0x00,A,A]
+; CHECK: add &disp, disp       ; encoding: [0x90,0x52,A,A,B,B]
+; CHECK: add disp, disp        ; encoding: [0x90,0x50,A,A,B,B]
+; CHECK: add @r9, disp         ; encoding: [0xa0,0x59,A,A]
+; CHECK: add @r9+, disp        ; encoding: [0xb0,0x59,A,A]
+; CHECK: add #42, disp         ; encoding: [0xb0,0x50,0x2a,0x00,A,A]
+
+  call r7
+  call 6(r7)
+  call disp+6(r7)
+  call &disp
+  call disp
+  call #disp
+
+; CHECK: call r7               ; encoding: [0x87,0x12]
+; CHECK: call 6(r7)            ; encoding: [0x97,0x12,0x06,0x00]
+; CHECK: call disp+6(r7)       ; encoding: [0x97,0x12,A,A]
+; CHECK: call &disp            ; encoding: [0x92,0x12,A,A]
+; CHECK: call disp             ; encoding: [0x90,0x12,A,A]
+; CHECK: call #disp            ; encoding: [0xb0,0x12,A,A]
+
+disp:
+  .word 0xcafe
+  .word 0xbabe
diff --git a/test/MC/MSP430/altreg.s b/test/MC/MSP430/altreg.s
new file mode 100644
index 00000000000..fe1e3a43772
--- /dev/null
+++ b/test/MC/MSP430/altreg.s
@@ -0,0 +1,7 @@
+; RUN: llvm-mc -triple msp430 -show-encoding < %s | FileCheck %s
+  mov pc, r0 ; CHECK: mov r0, r0
+  mov sp, r1 ; CHECK: mov r1, r1
+  mov sr, r2 ; CHECK: mov r2, r2
+  mov cg, r3 ; CHECK: mov r3, r3
+  mov fp, r4 ; CHECK: mov r4, r4
+  bic #8, SR ; CHECK: dint
diff --git a/test/MC/MSP430/const.s b/test/MC/MSP430/const.s
new file mode 100644
index 00000000000..f5cca109a50
--- /dev/null
+++ b/test/MC/MSP430/const.s
@@ -0,0 +1,10 @@
+; RUN: llvm-mc -triple msp430 -show-encoding < %s | FileCheck %s
+  mov #4, r15 ; CHECK: mov #4, r15 ; encoding: [0x2f,0x42]
+  mov #8, r15 ; CHECK: mov #8, r15 ; encoding: [0x3f,0x42]
+  mov #0, r15 ; CHECK: clr r15     ; encoding: [0x0f,0x43]
+  mov #1, r15 ; CHECK: mov #1, r15 ; encoding: [0x1f,0x43]
+  mov #2, r15 ; CHECK: mov #2, r15 ; encoding: [0x2f,0x43]
+  mov #-1, r7 ; CHECK: mov #-1, r7 ; encoding: [0x37,0x43]
+
+  push #8     ; CHECK: push #8     ; encoding: [0x32,0x12]
+  push #42    ; CHECK: push #42    ; encoding: [0x30,0x12,0x2a,0x00]
diff --git a/test/MC/MSP430/invalid.s b/test/MC/MSP430/invalid.s
new file mode 100644
index 00000000000..2815b520dd5
--- /dev/null
+++ b/test/MC/MSP430/invalid.s
@@ -0,0 +1,19 @@
+; RUN: not llvm-mc -triple msp430 < %s 2>&1 | FileCheck %s
+foo:
+  ;; invalid operand count
+  mov    r7        ; CHECK: :[[@LINE]]:3: error: too few operands for instruction
+
+  ;; invalid destination addressing modes
+  mov    r7, @r15  ; CHECK: :[[@LINE]]:14: error: invalid operand for instruction
+  mov    r7, @r15+ ; CHECK: :[[@LINE]]:14: error: invalid operand for instruction
+  mov    r7, #0    ; CHECK: :[[@LINE]]:14: error: invalid operand for instruction
+  mov    r7, #123  ; CHECK: :[[@LINE]]:14: error: invalid operand for instruction
+
+  ;; invalid byte instructions
+  swpb.b r7        ; CHECK: :[[@LINE]]:3: error: invalid instruction mnemonic
+  sxt.b  r7        ; CHECK: :[[@LINE]]:3: error: invalid instruction mnemonic
+  call.b r7        ; CHECK: :[[@LINE]]:3: error: invalid instruction mnemonic
+
+  ;; invalid conditional jump offsets
+  jmp    -513      ; CHECK: :[[@LINE]]:10: error: invalid jump offset
+  jmp    512       ; CHECK: :[[@LINE]]:10: error: invalid jump offset
diff --git a/test/MC/MSP430/lit.local.cfg b/test/MC/MSP430/lit.local.cfg
new file mode 100644
index 00000000000..b1cf1fbd21d
--- /dev/null
+++ b/test/MC/MSP430/lit.local.cfg
@@ -0,0 +1,3 @@
+if not 'MSP430' in config.root.targets:
+    config.unsupported = True
+
diff --git a/test/MC/MSP430/opcode.s b/test/MC/MSP430/opcode.s
new file mode 100644
index 00000000000..14655fe091f
--- /dev/null
+++ b/test/MC/MSP430/opcode.s
@@ -0,0 +1,163 @@
+; RUN: llvm-mc -triple msp430 -show-encoding %s \
+; RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
+
+; RUN: llvm-mc -triple msp430 -filetype=obj %s \
+; RUN:     | llvm-objdump -d - | FileCheck -check-prefix=CHECK-INST %s
+
+  ;; IForm8 instructions
+  mov.b  r7, r8 ; CHECK-INST: mov.b  r7, r8
+                ; CHECK: encoding: [0x48,0x47]
+  add.b  r7, r8 ; CHECK-INST: add.b  r7, r8
+                ; CHECK: encoding: [0x48,0x57]
+  addc.b r7, r8 ; CHECK-INST: addc.b r7, r8
+                ; CHECK: encoding: [0x48,0x67]
+  subc.b r7, r8 ; CHECK-INST: subc.b r7, r8
+                ; CHECK: encoding: [0x48,0x77]
+  sub.b  r7, r8 ; CHECK-INST: sub.b  r7, r8
+                ; CHECK: encoding: [0x48,0x87]
+  cmp.b  r7, r8 ; CHECK-INST: cmp.b  r7, r8
+                ; CHECK: encoding: [0x48,0x97]
+  dadd.b r7, r8 ; CHECK-INST: dadd.b r7, r8
+                ; CHECK: encoding: [0x48,0xa7]
+  bit.b  r7, r8 ; CHECK-INST: bit.b  r7, r8
+                ; CHECK: encoding: [0x48,0xb7]
+  bic.b  r7, r8 ; CHECK-INST: bic.b  r7, r8
+                ; CHECK: encoding: [0x48,0xc7]
+  bis.b  r7, r8 ; CHECK-INST: bis.b  r7, r8
+                ; CHECK: encoding: [0x48,0xd7]
+  xor.b  r7, r8 ; CHECK-INST: xor.b  r7, r8
+                ; CHECK: encoding: [0x48,0xe7]
+  and.b  r7, r8 ; CHECK-INST: and.b  r7, r8
+                ; CHECK: encoding: [0x48,0xf7]
+
+  ;; IForm16 instructions
+  mov    r7, r8 ; CHECK-INST: mov    r7, r8
+                ; CHECK: encoding: [0x08,0x47]
+  add    r7, r8 ; CHECK-INST: add    r7, r8
+                ; CHECK: encoding: [0x08,0x57]
+  addc   r7, r8 ; CHECK-INST: addc   r7, r8
+                ; CHECK: encoding: [0x08,0x67]
+  subc   r7, r8 ; CHECK-INST: subc   r7, r8
+                ; CHECK: encoding: [0x08,0x77]
+  sub    r7, r8 ; CHECK-INST: sub    r7, r8
+                ; CHECK: encoding: [0x08,0x87]
+  cmp    r7, r8 ; CHECK-INST: cmp    r7, r8
+                ; CHECK: encoding: [0x08,0x97]
+  dadd   r7, r8 ; CHECK-INST: dadd   r7, r8
+                ; CHECK: encoding: [0x08,0xa7]
+  bit    r7, r8 ; CHECK-INST: bit    r7, r8
+                ; CHECK: encoding: [0x08,0xb7]
+  bic    r7, r8 ; CHECK-INST: bic    r7, r8
+                ; CHECK: encoding: [0x08,0xc7]
+  bis    r7, r8 ; CHECK-INST: bis    r7, r8
+                ; CHECK: encoding: [0x08,0xd7]
+  xor    r7, r8 ; CHECK-INST: xor    r7, r8
+                ; CHECK: encoding: [0x08,0xe7]
+  and    r7, r8 ; CHECK-INST: and    r7, r8
+                ; CHECK: encoding: [0x08,0xf7]
+
+  ;; IIForm8 instructions
+  rrc.b  r7     ; CHECK-INST: rrc.b  r7    
+                ; CHECK: encoding: [0x47,0x10]
+  rra.b  r7     ; CHECK-INST: rra.b  r7    
+                ; CHECK: encoding: [0x47,0x11]
+  push.b r7     ; CHECK-INST: push.b r7    
+                ; CHECK: encoding: [0x47,0x12]
+
+  ;; IIForm16 instructions
+  rrc    r7     ; CHECK-INST: rrc    r7    
+                ; CHECK: encoding: [0x07,0x10]
+  swpb   r7     ; CHECK-INST: swpb   r7    
+                ; CHECK: encoding: [0x87,0x10]
+  rra    r7     ; CHECK-INST: rra    r7    
+                ; CHECK: encoding: [0x07,0x11]
+  sxt    r7     ; CHECK-INST: sxt    r7    
+                ; CHECK: encoding: [0x87,0x11]
+  push   r7     ; CHECK-INST: push   r7    
+                ; CHECK: encoding: [0x07,0x12]
+  call   r7     ; CHECK-INST: call   r7    
+                ; CHECK: encoding: [0x87,0x12]
+  reti          ; CHECK-INST: reti         
+                ; CHECK: encoding: [0x00,0x13]
+
+  ;; CJForm instructions
+  jnz    -2     ; CHECK-INST: jne    $-2
+                ; CHECK: encoding: [0xfe,0x23]
+  jne    -2     ; CHECK-INST: jne    $-2
+                ; CHECK: encoding: [0xfe,0x23]
+  jeq    -2     ; CHECK-INST: jeq    $-2
+                ; CHECK: encoding: [0xfe,0x27]
+  jz     -2     ; CHECK-INST: jeq    $-2
+                ; CHECK: encoding: [0xfe,0x27]
+  jnc    -2     ; CHECK-INST: jlo    $-2
+                ; CHECK: encoding: [0xfe,0x2b]
+  jlo    -2     ; CHECK-INST: jlo    $-2
+                ; CHECK: encoding: [0xfe,0x2b]
+  jc     -2     ; CHECK-INST: jhs    $-2
+                ; CHECK: encoding: [0xfe,0x2f]
+  jhs    -2     ; CHECK-INST: jhs    $-2
+                ; CHECK: encoding: [0xfe,0x2f]
+  jn     -2     ; CHECK-INST: jn     $-2
+                ; CHECK: encoding: [0xfe,0x33]
+  jge    -2     ; CHECK-INST: jge    $-2
+                ; CHECK: encoding: [0xfe,0x37]
+  jl     -2     ; CHECK-INST: jl     $-2
+                ; CHECK: encoding: [0xfe,0x3b]
+  jmp    $-2    ; CHECK-INST: jmp    $-2
+                ; CHECK: encoding: [0xfe,0x3f]
+
+  ;; Emulated arithmetic instructions
+  adc    r7     ; CHECK-INST: adc    r7
+                ; CHECK: encoding: [0x07,0x63]
+  dadc   r7     ; CHECK-INST: dadc   r7
+                ; CHECK: encoding: [0x07,0xa3]
+  dec    r7     ; CHECK-INST: dec    r7
+                ; CHECK: encoding: [0x17,0x83]
+  decd   r7     ; CHECK-INST: decd   r7
+                ; CHECK: encoding: [0x27,0x83]
+  inc    r7     ; CHECK-INST: inc    r7
+                ; CHECK: encoding: [0x17,0x53]
+  incd   r7     ; CHECK-INST: incd   r7
+                ; CHECK: encoding: [0x27,0x53]
+  sbc    r7     ; CHECK-INST: sbc    r7
+                ; CHECK: encoding: [0x07,0x73]
+
+  ;; Emulated logical instructions
+  inv    r7     ; CHECK-INST: inv    r7
+                ; CHECK: encoding: [0x37,0xe3]
+  rla    r7     ; CHECK-INST: add    r7, r7
+                ; CHECK: encoding: [0x07,0x57]
+  rlc    r7     ; CHECK-INST: addc   r7, r7
+                ; CHECK: encoding: [0x07,0x67]
+
+  ;; Emulated program flow control instructions
+  br     r7     ; CHECK-INST: br     r7    
+                ; CHECK: encoding: [0x00,0x47]
+  dint          ; CHECK-INST: dint
+                ; CHECK: encoding: [0x32,0xc2]
+  eint          ; CHECK-INST: eint
+                ; CHECK: encoding: [0x32,0xd2]
+  nop           ; CHECK-INST: nop
+                ; CHECK: encoding: [0x03,0x43]
+  ret           ; CHECK-INST: ret          
+                ; CHECK: encoding: [0x30,0x41]
+
+  ;; Emulated data instruction
+  clr    r7     ; CHECK-INST: clr    r7
+                ; CHECK: encoding: [0x07,0x43]
+  clrc          ; CHECK-INST: clrc
+                ; CHECK: encoding: [0x12,0xc3]
+  clrn          ; CHECK-INST: clrn
+                ; CHECK: encoding: [0x22,0xc2]
+  clrz          ; CHECK-INST: clrz
+                ; CHECK: encoding: [0x22,0xc3]
+  pop    r7     ; CHECK-INST: pop    r7
+                ; CHECK: encoding: [0x37,0x41]
+  setc          ; CHECK-INST: setc
+                ; CHECK: encoding: [0x12,0xd3]
+  setn          ; CHECK-INST: setn
+                ; CHECK: encoding: [0x22,0xd2]
+  setz          ; CHECK-INST: setz
+                ; CHECK: encoding: [0x22,0xd3]
+  tst    r7     ; CHECK-INST: tst    r7
+                ; CHECK: encoding: [0x07,0x93]
diff --git a/test/MC/MSP430/reloc.s b/test/MC/MSP430/reloc.s
new file mode 100644
index 00000000000..42dd64a43c5
--- /dev/null
+++ b/test/MC/MSP430/reloc.s
@@ -0,0 +1,22 @@
+; RUN: llvm-mc -triple msp430 -show-encoding < %s | FileCheck %s
+
+         mov    disp+2(r8), r15
+; CHECK: mov    disp+2(r8), r15 ; encoding: [0x1f,0x48,A,A]
+; CHECK:                        ;   fixup A - offset: 2, value: disp+2, kind: fixup_16_byte
+
+         mov    disp+2, r15
+; CHECK: mov    disp+2, r15     ; encoding: [0x1f,0x40,A,A]
+; CHECK:                        ;   fixup A - offset: 2, value: disp+2, kind: fixup_16_pcrel_byte
+
+         mov    &disp+2, r15
+; CHECK: mov    &disp+2, r15    ; encoding: [0x1f,0x42,A,A]
+; CHECK:                        ;   fixup A - offset: 2, value: disp+2, kind: fixup_16
+
+         mov    disp, disp+2
+; CHECK: mov    disp, disp+2    ; encoding: [0x90,0x40,A,A,B,B]
+; CHECK:                        ;   fixup A - offset: 2, value: disp, kind: fixup_16_pcrel_byte
+; CHECK:                        ;   fixup B - offset: 4, value: disp+2, kind: fixup_16_pcrel_byte
+
+         jmp    foo
+; CHECK: jmp    foo             ; encoding: [A,0b001111AA]
+; CHECK:                        ;   fixup A - offset: 0, value: foo, kind: fixup_10_pcrel
-- 
GitLab


From 0db0961141df5199a54450aa0e2d9fd9f11cefe3 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Date: Thu, 8 Nov 2018 00:16:23 +0000
Subject: [PATCH 1105/1116] [AMDGPU] Extend promote alloca vectorization

Promote alloca can vectorize a small array by bitcasting it to a
vector type. Extend vectorization for the case when alloca is
already a vector type. We still want to replace GEPs with an
insert/extract element instructions in this case.

Differential Revision: https://reviews.llvm.org/D54219

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346376 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp     |  24 ++-
 .../AMDGPU/promote-alloca-vector-to-vector.ll | 189 ++++++++++++++++++
 2 files changed, 209 insertions(+), 4 deletions(-)
 create mode 100644 test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll

diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index ec7ea2baec0..5d087c09918 100644
--- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -328,6 +328,10 @@ static bool canVectorizeInst(Instruction *Inst, User *User) {
     // Currently only handle the case where the Pointer Operand is a GEP.
     // Also we could not vectorize volatile or atomic loads.
     LoadInst *LI = cast<LoadInst>(Inst);
+    if (isa<AllocaInst>(User) &&
+        LI->getPointerOperandType() == User->getType() &&
+        isa<VectorType>(LI->getType()))
+      return true;
     return isa<GetElementPtrInst>(LI->getPointerOperand()) && LI->isSimple();
   }
   case Instruction::BitCast:
@@ -337,6 +341,10 @@ static bool canVectorizeInst(Instruction *Inst, User *User) {
     // since it should be canonical form, the User should be a GEP.
     // Also we could not vectorize volatile or atomic stores.
     StoreInst *SI = cast<StoreInst>(Inst);
+    if (isa<AllocaInst>(User) &&
+        SI->getPointerOperandType() == User->getType() &&
+        isa<VectorType>(SI->getValueOperand()->getType()))
+      return true;
     return (SI->getPointerOperand() == User) && isa<GetElementPtrInst>(User) && SI->isSimple();
   }
   default:
@@ -351,7 +359,8 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
     return false;
   }
 
-  ArrayType *AllocaTy = dyn_cast<ArrayType>(Alloca->getAllocatedType());
+  Type *AT = Alloca->getAllocatedType();
+  SequentialType *AllocaTy = dyn_cast<SequentialType>(AT);
 
   LLVM_DEBUG(dbgs() << "Alloca candidate for vectorization\n");
 
@@ -398,7 +407,9 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
     }
   }
 
-  VectorType *VectorTy = arrayTypeToVecType(AllocaTy);
+  VectorType *VectorTy = dyn_cast<VectorType>(AllocaTy);
+  if (!VectorTy)
+    VectorTy = arrayTypeToVecType(cast<ArrayType>(AllocaTy));
 
   LLVM_DEBUG(dbgs() << "  Converting alloca to vector " << *AllocaTy << " -> "
                     << *VectorTy << '\n');
@@ -408,6 +419,9 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
     IRBuilder<> Builder(Inst);
     switch (Inst->getOpcode()) {
     case Instruction::Load: {
+      if (Inst->getType() == AT)
+        break;
+
       Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
       Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand();
       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
@@ -420,9 +434,11 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
       break;
     }
     case Instruction::Store: {
-      Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
-
       StoreInst *SI = cast<StoreInst>(Inst);
+      if (SI->getValueOperand()->getType() == AT)
+        break;
+
+      Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
       Value *Ptr = SI->getPointerOperand();
       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
       Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
diff --git a/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll b/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
new file mode 100644
index 00000000000..80112160412
--- /dev/null
+++ b/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
@@ -0,0 +1,189 @@
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; RUN: opt -S -mtriple=amdgcn-- -data-layout=A5 -mcpu=fiji -amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s
+
+; GCN-LABEL: {{^}}float4_alloca_store4:
+; OPT-LABEL: define amdgpu_kernel void @float4_alloca_store4
+
+; GFX-NOT: buffer_
+; GCN:  v_readfirstlane_b32
+; GFX8: v_movrels_b32
+; GFX9: s_set_gpr_idx_on
+; GFX9: s_set_gpr_idx_off
+
+; OPT:  %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2
+; OPT:  store <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>, <4 x float> addrspace(5)* %alloca, align 4
+; OPT:  %0 = load <4 x float>, <4 x float> addrspace(5)* %alloca
+; OPT:  %1 = extractelement <4 x float> %0, i32 %sel2
+; OPT:  store float %1, float addrspace(1)* %out, align 4
+
+define amdgpu_kernel void @float4_alloca_store4(float addrspace(1)* %out, float addrspace(3)* %dummy_lds) {
+entry:
+  %alloca = alloca <4 x float>, align 16, addrspace(5)
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %c1 = icmp uge i32 %x, 3
+  %c2 = icmp uge i32 %y, 3
+  %sel1 = select i1 %c1, i32 1, i32 2
+  %sel2 = select i1 %c2, i32 0, i32 %sel1
+  %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2
+  store <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, <4 x float> addrspace(5)* %alloca, align 4
+  %load = load float, float addrspace(5)* %gep, align 4
+  store float %load, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}float4_alloca_load4:
+; OPT-LABEL: define amdgpu_kernel void @float4_alloca_load4
+
+; GFX-NOT: buffer_
+; GCN:  v_readfirstlane_b32
+; GFX8: v_movreld_b32
+; GFX9: s_set_gpr_idx_on
+; GFX9: s_set_gpr_idx_off
+
+; OPT: %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2
+; OPT: %0 = load <4 x float>, <4 x float> addrspace(5)* %alloca
+; OPT: %1 = insertelement <4 x float> %0, float 1.000000e+00, i32 %sel2
+; OPT: store <4 x float> %1, <4 x float> addrspace(5)* %alloca
+; OPT: %load = load <4 x float>, <4 x float> addrspace(5)* %alloca, align 4
+; OPT:  store <4 x float> %load, <4 x float> addrspace(1)* %out, align 4
+
+define amdgpu_kernel void @float4_alloca_load4(<4 x float> addrspace(1)* %out, float addrspace(3)* %dummy_lds) {
+entry:
+  %alloca = alloca <4 x float>, align 16, addrspace(5)
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %c1 = icmp uge i32 %x, 3
+  %c2 = icmp uge i32 %y, 3
+  %sel1 = select i1 %c1, i32 1, i32 2
+  %sel2 = select i1 %c2, i32 0, i32 %sel1
+  %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2
+  store float 1.0, float addrspace(5)* %gep, align 4
+  %load = load <4 x float>, <4 x float> addrspace(5)* %alloca, align 4
+  store <4 x float> %load, <4 x float> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}half4_alloca_store4:
+; OPT-LABEL: define amdgpu_kernel void @half4_alloca_store4
+
+; GFX-NOT: buffer_
+; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0x44004200
+; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x40003c00
+; GCN:     v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s{{\[}}[[SL]]:[[SH]]]
+
+; OPT: %gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(5)* %alloca, i32 0, i32 %sel2
+; OPT: store <4 x half> <half 0xH3C00, half 0xH4000, half 0xH4200, half 0xH4400>, <4 x half> addrspace(5)* %alloca, align 2
+; OPT: %0 = load <4 x half>, <4 x half> addrspace(5)* %alloca
+; OPT: %1 = extractelement <4 x half> %0, i32 %sel2
+; OPT: store half %1, half addrspace(1)* %out, align 2
+
+define amdgpu_kernel void @half4_alloca_store4(half addrspace(1)* %out, half addrspace(3)* %dummy_lds) {
+entry:
+  %alloca = alloca <4 x half>, align 16, addrspace(5)
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %c1 = icmp uge i32 %x, 3
+  %c2 = icmp uge i32 %y, 3
+  %sel1 = select i1 %c1, i32 1, i32 2
+  %sel2 = select i1 %c2, i32 0, i32 %sel1
+  %gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(5)* %alloca, i32 0, i32 %sel2
+  store <4 x half> <half 1.0, half 2.0, half 3.0, half 4.0>, <4 x half> addrspace(5)* %alloca, align 2
+  %load = load half, half addrspace(5)* %gep, align 2
+  store half %load, half addrspace(1)* %out, align 2
+  ret void
+}
+
+; GCN-LABEL: {{^}}half4_alloca_load4:
+; OPT-LABEL: define amdgpu_kernel void @half4_alloca_load4
+
+; GFX-NOT: buffer_
+; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0
+; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0xffff
+
+; OPT: %gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(5)* %alloca, i32 0, i32 %sel2
+; OPT: %0 = load <4 x half>, <4 x half> addrspace(5)* %alloca
+; OPT: %1 = insertelement <4 x half> %0, half 0xH3C00, i32 %sel2
+; OPT: store <4 x half> %1, <4 x half> addrspace(5)* %alloca
+; OPT: %load = load <4 x half>, <4 x half> addrspace(5)* %alloca, align 2
+; OPT: store <4 x half> %load, <4 x half> addrspace(1)* %out, align 2
+
+define amdgpu_kernel void @half4_alloca_load4(<4 x half> addrspace(1)* %out, half addrspace(3)* %dummy_lds) {
+entry:
+  %alloca = alloca <4 x half>, align 16, addrspace(5)
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %c1 = icmp uge i32 %x, 3
+  %c2 = icmp uge i32 %y, 3
+  %sel1 = select i1 %c1, i32 1, i32 2
+  %sel2 = select i1 %c2, i32 0, i32 %sel1
+  %gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(5)* %alloca, i32 0, i32 %sel2
+  store half 1.0, half addrspace(5)* %gep, align 4
+  %load = load <4 x half>, <4 x half> addrspace(5)* %alloca, align 2
+  store <4 x half> %load, <4 x half> addrspace(1)* %out, align 2
+  ret void
+}
+
+; GCN-LABEL: {{^}}short4_alloca_store4:
+; OPT-LABEL: define amdgpu_kernel void @short4_alloca_store4
+
+; GFX-NOT: buffer_
+; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0x40003
+; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x20001
+; GCN:     v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s{{\[}}[[SL]]:[[SH]]]
+
+; OPT: %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(5)* %alloca, i32 0, i32 %sel2
+; OPT: store <4 x i16> <i16 1, i16 2, i16 3, i16 4>, <4 x i16> addrspace(5)* %alloca, align 2
+; OPT: %0 = load <4 x i16>, <4 x i16> addrspace(5)* %alloca
+; OPT: %1 = extractelement <4 x i16> %0, i32 %sel2
+; OPT: store i16 %1, i16 addrspace(1)* %out, align 2
+
+define amdgpu_kernel void @short4_alloca_store4(i16 addrspace(1)* %out, i16 addrspace(3)* %dummy_lds) {
+entry:
+  %alloca = alloca <4 x i16>, align 16, addrspace(5)
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %c1 = icmp uge i32 %x, 3
+  %c2 = icmp uge i32 %y, 3
+  %sel1 = select i1 %c1, i32 1, i32 2
+  %sel2 = select i1 %c2, i32 0, i32 %sel1
+  %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(5)* %alloca, i32 0, i32 %sel2
+  store <4 x i16> <i16 1, i16 2, i16 3, i16 4>, <4 x i16> addrspace(5)* %alloca, align 2
+  %load = load i16, i16 addrspace(5)* %gep, align 2
+  store i16 %load, i16 addrspace(1)* %out, align 2
+  ret void
+}
+
+; GCN-LABEL: {{^}}short4_alloca_load4:
+; OPT-LABEL: define amdgpu_kernel void @short4_alloca_load4
+
+; GFX-NOT: buffer_
+; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0
+; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0xffff
+
+; OPT: %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(5)* %alloca, i32 0, i32 %sel2
+; OPT: %0 = load <4 x i16>, <4 x i16> addrspace(5)* %alloca
+; OPT: %1 = insertelement <4 x i16> %0, i16 1, i32 %sel2
+; OPT: store <4 x i16> %1, <4 x i16> addrspace(5)* %alloca
+; OPT: %load = load <4 x i16>, <4 x i16> addrspace(5)* %alloca, align 2
+; OPT: store <4 x i16> %load, <4 x i16> addrspace(1)* %out, align 2
+
+define amdgpu_kernel void @short4_alloca_load4(<4 x i16> addrspace(1)* %out, i16 addrspace(3)* %dummy_lds) {
+entry:
+  %alloca = alloca <4 x i16>, align 16, addrspace(5)
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %c1 = icmp uge i32 %x, 3
+  %c2 = icmp uge i32 %y, 3
+  %sel1 = select i1 %c1, i32 1, i32 2
+  %sel2 = select i1 %c2, i32 0, i32 %sel1
+  %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(5)* %alloca, i32 0, i32 %sel2
+  store i16 1, i16 addrspace(5)* %gep, align 4
+  %load = load <4 x i16>, <4 x i16> addrspace(5)* %alloca, align 2
+  store <4 x i16> %load, <4 x i16> addrspace(1)* %out, align 2
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare i32 @llvm.amdgcn.workitem.id.y()
-- 
GitLab


From fb2083df4e3fd735e109e270b289cabcd4fcec3e Mon Sep 17 00:00:00 2001
From: Shoaib Meenai <smeenai@fb.com>
Date: Thu, 8 Nov 2018 00:18:12 +0000
Subject: [PATCH 1106/1116] [cmake] Set CMP0075 to NEW

Make the check_include_file* macros honor CMAKE_REQUIRED_LIBRARIES. This
shouldn't cause any of the configuration checks to give different
results (and I did clean configures before and after this change and
confirmed that the resulting CMake caches were identical, though of
course that's just one machine). This suppresses a warning when building
with CMake 3.12 or later.

This doesn't suppress the warning in clang, because clang does its own
cmake_minimum_required call even when being built in-tree, and that
resets all policy settings. I'll address that separately.

Differential Revision: https://reviews.llvm.org/D54236

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346377 91177308-0d34-0410-b5e6-96231b3b80d8
---
 CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4591ce56b1f..c17b2f31ed8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,6 +7,10 @@ if(POLICY CMP0068)
   set(CMAKE_BUILD_WITH_INSTALL_NAME_DIR ON)
 endif()
 
+if(POLICY CMP0075)
+  cmake_policy(SET CMP0075 NEW)
+endif()
+
 if(NOT DEFINED LLVM_VERSION_MAJOR)
   set(LLVM_VERSION_MAJOR 8)
 endif()
-- 
GitLab


From 0157cb57214a58c395d97d777dba100732892e77 Mon Sep 17 00:00:00 2001
From: Jessica Paquette <jpaquette@apple.com>
Date: Thu, 8 Nov 2018 00:33:38 +0000
Subject: [PATCH 1107/1116] [MachineOutliner][NFC] Only map blocks which have
 adjacent legal instructions

If a block doesn't have any ranges of adjacent legal instructions, then it
can't have outlining candidates. There's no point in mapping legal isntructions
in situations like this.

I noticed this reduces the size of the suffix tree in sqlite3 for AArch64 at
-Oz by about 3%.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346379 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/MachineOutliner.cpp | 50 ++++++++++++++++++++++++---------
 1 file changed, 36 insertions(+), 14 deletions(-)

diff --git a/lib/CodeGen/MachineOutliner.cpp b/lib/CodeGen/MachineOutliner.cpp
index 5032b70e872..c69bfecd8dc 100644
--- a/lib/CodeGen/MachineOutliner.cpp
+++ b/lib/CodeGen/MachineOutliner.cpp
@@ -640,18 +640,26 @@ struct InstructionMapper {
 
   /// Maps \p *It to a legal integer.
   ///
-  /// Updates \p InstrListForMBB, \p UnsignedVecForMBB, \p
-  /// InstructionIntegerMap, \p IntegerInstructionMap, and \p LegalInstrNumber.
+  /// Updates \p CanOutlineWithPrevInstr, \p HaveLegalRange, \p InstrListForMBB,
+  /// \p UnsignedVecForMBB, \p InstructionIntegerMap, \p IntegerInstructionMap,
+  /// and \p LegalInstrNumber.
   ///
   /// \returns The integer that \p *It was mapped to.
   unsigned mapToLegalUnsigned(
-      MachineBasicBlock::iterator &It, unsigned &NumLegalInBlock,
+      MachineBasicBlock::iterator &It, bool &CanOutlineWithPrevInstr,
+      bool &HaveLegalRange, unsigned &NumLegalInBlock,
       std::vector<unsigned> &UnsignedVecForMBB,
       std::vector<MachineBasicBlock::iterator> &InstrListForMBB) {
     // We added something legal, so we should unset the AddedLegalLastTime
     // flag.
     AddedIllegalLastTime = false;
 
+    // If we have at least two adjacent legal instructions (which may have
+    // invisible instructions in between), remember that.
+    if (CanOutlineWithPrevInstr)
+      HaveLegalRange = true;
+    CanOutlineWithPrevInstr = true;
+
     // Keep track of the number of legal instructions we insert.
     NumLegalInBlock++;
 
@@ -692,9 +700,12 @@ struct InstructionMapper {
   /// IllegalInstrNumber.
   ///
   /// \returns The integer that \p *It was mapped to.
-  unsigned mapToIllegalUnsigned(
-      MachineBasicBlock::iterator &It, std::vector<unsigned> &UnsignedVecForMBB,
-      std::vector<MachineBasicBlock::iterator> &InstrListForMBB) {
+  unsigned mapToIllegalUnsigned(MachineBasicBlock::iterator &It,
+  bool &CanOutlineWithPrevInstr, std::vector<unsigned> &UnsignedVecForMBB,
+  std::vector<MachineBasicBlock::iterator> &InstrListForMBB) {
+    // Can't outline an illegal instruction. Set the flag.
+    CanOutlineWithPrevInstr = false;
+
     // Only add one illegal number per range of legal numbers.
     if (AddedIllegalLastTime)
       return IllegalInstrNumber;
@@ -738,6 +749,14 @@ struct InstructionMapper {
     // outlining.
     unsigned NumLegalInBlock = 0;
 
+    // True if we have at least two legal instructions which aren't separated
+    // by an illegal instruction.
+    bool HaveLegalRange = false;
+
+    // True if we can perform outlining given the last mapped (non-invisible)
+    // instruction. This lets us know if we have a legal range.
+    bool CanOutlineWithPrevInstr = false;
+
     // FIXME: Should this all just be handled in the target, rather than using
     // repeated calls to getOutliningType?
     std::vector<unsigned> UnsignedVecForMBB;
@@ -747,20 +766,22 @@ struct InstructionMapper {
       // Keep track of where this instruction is in the module.
       switch (TII.getOutliningType(It, Flags)) {
       case InstrType::Illegal:
-        mapToIllegalUnsigned(It, UnsignedVecForMBB, InstrListForMBB);
+        mapToIllegalUnsigned(It, CanOutlineWithPrevInstr,
+                             UnsignedVecForMBB, InstrListForMBB);
         break;
 
       case InstrType::Legal:
-        mapToLegalUnsigned(It, NumLegalInBlock, UnsignedVecForMBB,
-                           InstrListForMBB);
+        mapToLegalUnsigned(It, CanOutlineWithPrevInstr, HaveLegalRange,
+                           NumLegalInBlock, UnsignedVecForMBB, InstrListForMBB);
         break;
 
       case InstrType::LegalTerminator:
-        mapToLegalUnsigned(It, NumLegalInBlock, UnsignedVecForMBB,
-                           InstrListForMBB);
+        mapToLegalUnsigned(It, CanOutlineWithPrevInstr, HaveLegalRange,
+                           NumLegalInBlock, UnsignedVecForMBB, InstrListForMBB);
         // The instruction also acts as a terminator, so we have to record that
         // in the string.
-        mapToIllegalUnsigned(It, UnsignedVecForMBB, InstrListForMBB);
+        mapToIllegalUnsigned(It, CanOutlineWithPrevInstr, UnsignedVecForMBB,
+        InstrListForMBB);
         break;
 
       case InstrType::Invisible:
@@ -773,12 +794,13 @@ struct InstructionMapper {
 
     // Are there enough legal instructions in the block for outlining to be
     // possible?
-    if (NumLegalInBlock > 1) {
+    if (HaveLegalRange) {
       // After we're done every insertion, uniquely terminate this part of the
       // "string". This makes sure we won't match across basic block or function
       // boundaries since the "end" is encoded uniquely and thus appears in no
       // repeated substring.
-      mapToIllegalUnsigned(It, UnsignedVecForMBB, InstrListForMBB);
+      mapToIllegalUnsigned(It, CanOutlineWithPrevInstr, UnsignedVecForMBB,
+      InstrListForMBB);
       InstrList.insert(InstrList.end(), InstrListForMBB.begin(),
                        InstrListForMBB.end());
       UnsignedVec.insert(UnsignedVec.end(), UnsignedVecForMBB.begin(),
-- 
GitLab


From ab9a477d3fe96b9fa6e75445c0ef54c13027d43b Mon Sep 17 00:00:00 2001
From: David Blaikie <dblaikie@gmail.com>
Date: Thu, 8 Nov 2018 00:35:54 +0000
Subject: [PATCH 1108/1116] NFC: DebugInfo: Track the origin CU rather than
 just the base address for range lists

Turns out knowing more than just the base address might be useful -
specifically a future change to respect a DICompileUnit flag for the use
of base address specifiers in DWARF < 5.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346380 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp |  3 +--
 lib/CodeGen/AsmPrinter/DwarfDebug.cpp       |  3 ++-
 lib/CodeGen/AsmPrinter/DwarfFile.cpp        |  7 +++----
 lib/CodeGen/AsmPrinter/DwarfFile.h          | 11 +++++------
 4 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index a32cd8bc904..d93c7f6c845 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -430,8 +430,7 @@ void DwarfCompileUnit::addScopeRangeList(DIE &ScopeDIE,
   // Add the range list to the set of ranges to be emitted.
   auto IndexAndList =
       (DD->getDwarfVersion() < 5 && Skeleton ? Skeleton->DU : DU)
-          ->addRange((Skeleton ? Skeleton->BaseAddress : BaseAddress),
-                     std::move(Range));
+          ->addRange(*(Skeleton ? Skeleton : this), std::move(Range));
 
   uint32_t Index = IndexAndList.first;
   auto &List = *IndexAndList.second;
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 2807734969a..070b8fe4ec1 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -2275,7 +2275,8 @@ static void emitRangeList(DwarfDebug &DD, AsmPrinter *Asm,
   for (const RangeSpan &Range : List.getRanges())
     SectionRanges[&Range.getStart()->getSection()].push_back(&Range);
 
-  const MCSymbol *CUBase = List.getBaseAddress();
+  const DwarfCompileUnit &CU = List.getCU();
+  const MCSymbol *CUBase = CU.getBaseAddress();
   bool BaseIsSet = false;
   for (const auto &P : SectionRanges) {
     // Don't bother with a base address entry if there's only one range in
diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.cpp b/lib/CodeGen/AsmPrinter/DwarfFile.cpp
index 1e5b7f18958..4e410bb49be 100644
--- a/lib/CodeGen/AsmPrinter/DwarfFile.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfFile.cpp
@@ -111,9 +111,8 @@ void DwarfFile::addScopeLabel(LexicalScope *LS, DbgLabel *Label) {
 }
 
 std::pair<uint32_t, RangeSpanList *>
-DwarfFile::addRange(const MCSymbol *&CUBaseAddress,
-                    SmallVector<RangeSpan, 2> R) {
-  CURangeLists.push_back(RangeSpanList(Asm->createTempSymbol("debug_ranges"),
-                                       CUBaseAddress, std::move(R)));
+DwarfFile::addRange(const DwarfCompileUnit &CU, SmallVector<RangeSpan, 2> R) {
+  CURangeLists.push_back(
+      RangeSpanList(Asm->createTempSymbol("debug_ranges"), CU, std::move(R)));
   return std::make_pair(CURangeLists.size() - 1, &CURangeLists.back());
 }
diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.h b/lib/CodeGen/AsmPrinter/DwarfFile.h
index 1e5c99e26eb..51acca8c1e5 100644
--- a/lib/CodeGen/AsmPrinter/DwarfFile.h
+++ b/lib/CodeGen/AsmPrinter/DwarfFile.h
@@ -48,17 +48,16 @@ class RangeSpanList {
 private:
   // Index for locating within the debug_range section this particular span.
   MCSymbol *RangeSym;
-  const MCSymbol **CUBaseAddress;
+  const DwarfCompileUnit *CU;
   // List of ranges.
   SmallVector<RangeSpan, 2> Ranges;
 
 public:
-  RangeSpanList(MCSymbol *Sym, const MCSymbol *&CUBaseAddress,
+  RangeSpanList(MCSymbol *Sym, const DwarfCompileUnit &CU,
                 SmallVector<RangeSpan, 2> Ranges)
-      : RangeSym(Sym), CUBaseAddress(&CUBaseAddress),
-        Ranges(std::move(Ranges)) {}
+      : RangeSym(Sym), CU(&CU), Ranges(std::move(Ranges)) {}
   MCSymbol *getSym() const { return RangeSym; }
-  const MCSymbol *&getBaseAddress() const { return *CUBaseAddress; }
+  const DwarfCompileUnit &getCU() const { return *CU; }
   const SmallVectorImpl<RangeSpan> &getRanges() const { return Ranges; }
   void addRange(RangeSpan Range) { Ranges.push_back(Range); }
 };
@@ -123,7 +122,7 @@ public:
     return CUs;
   }
 
-  std::pair<uint32_t, RangeSpanList *> addRange(const MCSymbol *&CUBaseAddress,
+  std::pair<uint32_t, RangeSpanList *> addRange(const DwarfCompileUnit &CU,
                                                 SmallVector<RangeSpan, 2> R);
 
   /// getRangeLists - Get the vector of range lists.
-- 
GitLab


From 654fe49fb85ca70b1497578ff7adabd7e582ac92 Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Thu, 8 Nov 2018 00:57:33 +0000
Subject: [PATCH 1109/1116] [sancov] Put .SCOV* sections into the right comdat
 groups on COFF

Avoids linker errors about relocations against discarded sections.

This was uncovered during the Chromium clang roll here:
https://chromium-review.googlesource.com/c/chromium/src/+/1321863#message-717516acfcf829176f6a2f50980f7a4bdd66469a

After this change, Chromium's libGLESv2 links successfully for me.

Reviewers: metzman, hans, morehouse

Differential Revision: https://reviews.llvm.org/D54232

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346381 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Transforms/Instrumentation.h     |  4 +-
 .../Instrumentation/HWAddressSanitizer.cpp    |  3 +-
 .../Instrumentation/Instrumentation.cpp       | 21 ++++-
 .../Instrumentation/SanitizerCoverage.cpp     |  5 +-
 .../SanitizerCoverage/coff-comdat.ll          | 85 +++++++++++++++++++
 5 files changed, 110 insertions(+), 8 deletions(-)
 create mode 100644 test/Instrumentation/SanitizerCoverage/coff-comdat.ll

diff --git a/include/llvm/Transforms/Instrumentation.h b/include/llvm/Transforms/Instrumentation.h
index 0011a5b3c51..81ed0a2237e 100644
--- a/include/llvm/Transforms/Instrumentation.h
+++ b/include/llvm/Transforms/Instrumentation.h
@@ -24,6 +24,7 @@
 
 namespace llvm {
 
+class Triple;
 class FunctionPass;
 class ModulePass;
 class OptimizationRemarkEmitter;
@@ -45,7 +46,8 @@ GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str,
 // Returns F.getComdat() if it exists.
 // Otherwise creates a new comdat, sets F's comdat, and returns it.
 // Returns nullptr on failure.
-Comdat *GetOrCreateFunctionComdat(Function &F, const std::string &ModuleId);
+Comdat *GetOrCreateFunctionComdat(Function &F, Triple &T,
+                                  const std::string &ModuleId);
 
 // Insert GCOV profiling instrumentation
 struct GCOVOptions {
diff --git a/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index 34a66296f6f..91021604169 100644
--- a/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -743,7 +743,8 @@ void HWAddressSanitizer::createFrameGlobal(Function &F,
   appendToCompilerUsed(M, GV);
   // Put GV into the F's Comadat so that if F is deleted GV can be deleted too.
   if (&F != HwasanCtorFunction)
-    if (auto Comdat = GetOrCreateFunctionComdat(F, CurModuleUniqueId))
+    if (auto Comdat =
+            GetOrCreateFunctionComdat(F, TargetTriple, CurModuleUniqueId))
       GV->setComdat(Comdat);
 }
 
diff --git a/lib/Transforms/Instrumentation/Instrumentation.cpp b/lib/Transforms/Instrumentation/Instrumentation.cpp
index 16976ef90ce..eb6a3730ad9 100644
--- a/lib/Transforms/Instrumentation/Instrumentation.cpp
+++ b/lib/Transforms/Instrumentation/Instrumentation.cpp
@@ -14,6 +14,7 @@
 
 #include "llvm/Transforms/Instrumentation.h"
 #include "llvm-c/Initialization.h"
+#include "llvm/ADT/Triple.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
 #include "llvm/InitializePasses.h"
@@ -70,19 +71,31 @@ GlobalVariable *llvm::createPrivateGlobalForString(Module &M, StringRef Str,
   return GV;
 }
 
-Comdat *llvm::GetOrCreateFunctionComdat(Function &F,
+Comdat *llvm::GetOrCreateFunctionComdat(Function &F, Triple &T,
                                         const std::string &ModuleId) {
   if (auto Comdat = F.getComdat()) return Comdat;
   assert(F.hasName());
   Module *M = F.getParent();
   std::string Name = F.getName();
-  if (F.hasLocalLinkage()) {
+
+  // Make a unique comdat name for internal linkage things on ELF. On COFF, the
+  // name of the comdat group identifies the leader symbol of the comdat group.
+  // The linkage of the leader symbol is considered during comdat resolution,
+  // and internal symbols with the same name from different objects will not be
+  // merged.
+  if (T.isOSBinFormatELF() && F.hasLocalLinkage()) {
     if (ModuleId.empty())
       return nullptr;
     Name += ModuleId;
   }
-  F.setComdat(M->getOrInsertComdat(Name));
-  return F.getComdat();
+
+  // Make a new comdat for the function. Use the "no duplicates" selection kind
+  // for non-weak symbols if the object file format supports it.
+  Comdat *C = M->getOrInsertComdat(Name);
+  if (T.isOSBinFormatCOFF() && !F.isWeakForLinker())
+    C->setSelectionKind(Comdat::NoDuplicates);
+  F.setComdat(C);
+  return C;
 }
 
 /// initializeInstrumentation - Initialize all passes in the TransformUtils
diff --git a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
index 074ae1347f1..7f683ad089f 100644
--- a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
+++ b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -577,8 +577,9 @@ GlobalVariable *SanitizerCoverageModule::CreateFunctionLocalArrayInSection(
       *CurModule, ArrayTy, false, GlobalVariable::PrivateLinkage,
       Constant::getNullValue(ArrayTy), "__sancov_gen_");
 
-  if (TargetTriple.isOSBinFormatELF())
-    if (auto Comdat = GetOrCreateFunctionComdat(F, CurModuleUniqueId))
+  if (TargetTriple.supportsCOMDAT())
+    if (auto Comdat =
+            GetOrCreateFunctionComdat(F, TargetTriple, CurModuleUniqueId))
       Array->setComdat(Comdat);
   Array->setSection(getSectionName(Section));
   Array->setAlignment(Ty->isPointerTy() ? DL->getPointerSize()
diff --git a/test/Instrumentation/SanitizerCoverage/coff-comdat.ll b/test/Instrumentation/SanitizerCoverage/coff-comdat.ll
new file mode 100644
index 00000000000..61a9dcd92de
--- /dev/null
+++ b/test/Instrumentation/SanitizerCoverage/coff-comdat.ll
@@ -0,0 +1,85 @@
+; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-inline-8bit-counters=1 -sanitizer-coverage-pc-table=1 -S | FileCheck %s
+
+; Make sure we use the right comdat groups for COFF to avoid relocations
+; against discarded sections. Internal linkage functions are also different from
+; ELF. We don't add a module unique identifier.
+
+; Test based on this source:
+; int baz(int);
+; static int __attribute__((noinline)) bar(int x) {
+;   if (x)
+;     return baz(x);
+;   return 0;
+; }
+; int foo(int x) {
+;   if (baz(0))
+;     x = bar(x);
+;   return x;
+; }
+
+; Both new comdats should no duplicates on COFF.
+
+; CHECK: $foo = comdat noduplicates
+; CHECK: $bar = comdat noduplicates
+
+; Tables for 'foo' should be in the 'foo' comdat.
+
+; CHECK: @__sancov_gen_{{.*}} = private global [1 x i8] zeroinitializer, section ".SCOV$CM", comdat($foo), align 1
+
+; CHECK: @__sancov_gen_{{.*}} = private constant [2 x i64*]
+; CHECK-SAME: [i64* bitcast (i32 (i32)* @foo to i64*), i64* inttoptr (i64 1 to i64*)],
+; CHECK-SAME: section ".SCOVP$M", comdat($foo), align 8
+
+; Tables for 'bar' should be in the 'bar' comdat.
+
+; CHECK: @__sancov_gen_{{.*}} = private global [1 x i8] zeroinitializer, section ".SCOV$CM", comdat($bar), align 1
+
+; CHECK: @__sancov_gen_{{.*}} = private constant [2 x i64*]
+; CHECK-SAME: [i64* bitcast (i32 (i32)* @bar to i64*), i64* inttoptr (i64 1 to i64*)],
+; CHECK-SAME: section ".SCOVP$M", comdat($bar), align 8
+
+; 'foo' and 'bar' should be in their new comdat groups.
+
+; CHECK: define dso_local i32 @foo(i32 %x){{.*}} comdat {
+; CHECK: define internal fastcc i32 @bar(i32 %x){{.*}} comdat {
+
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc19.14.26433"
+
+; Function Attrs: nounwind uwtable
+define dso_local i32 @foo(i32 %x) local_unnamed_addr #0 {
+entry:
+  %call = tail call i32 @baz(i32 0) #3
+  %tobool = icmp eq i32 %call, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %call1 = tail call fastcc i32 @bar(i32 %x)
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  %x.addr.0 = phi i32 [ %call1, %if.then ], [ %x, %entry ]
+  ret i32 %x.addr.0
+}
+
+declare dso_local i32 @baz(i32) local_unnamed_addr #1
+
+; Function Attrs: noinline nounwind uwtable
+define internal fastcc i32 @bar(i32 %x) unnamed_addr #2 {
+entry:
+  %tobool = icmp eq i32 %x, 0
+  br i1 %tobool, label %return, label %if.then
+
+if.then:                                          ; preds = %entry
+  %call = tail call i32 @baz(i32 %x) #3
+  br label %return
+
+return:                                           ; preds = %entry, %if.then
+  %retval.0 = phi i32 [ %call, %if.then ], [ 0, %entry ]
+  ret i32 %retval.0
+}
+
+attributes #0 = { nounwind uwtable }
+attributes #1 = { "asdf" }
+attributes #2 = { noinline nounwind uwtable }
+attributes #3 = { nounwind }
-- 
GitLab


From c1873bf60aa20b3873d859c99ecaf5967c20ba3f Mon Sep 17 00:00:00 2001
From: Nathan Lanza <nathan@lanza.io>
Date: Thu, 8 Nov 2018 01:10:24 +0000
Subject: [PATCH 1110/1116] Revert "Reorder FindPythonInterp so that config-ix
 can use PYTHON_EXECUTABLE"

This reverts commit rL346367 due to test error in compiler-rt.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346383 91177308-0d34-0410-b5e6-96231b3b80d8
---
 CMakeLists.txt | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c17b2f31ed8..4ff0e6a90e5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -576,22 +576,6 @@ mark_as_advanced(LLVM_TARGET_TRIPLE_ENV)
 set(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR OFF CACHE BOOL
   "Enable per-target runtimes directory")
 
-# Verify that we can find a Python 2 interpreter.  Python 3 is unsupported.
-# FIXME: We should support systems with only Python 3, but that requires work
-# on LLDB.
-set(Python_ADDITIONAL_VERSIONS 2.7)
-include(FindPythonInterp)
-if( NOT PYTHONINTERP_FOUND )
-  message(FATAL_ERROR
-"Unable to find Python interpreter, required for builds and testing.
-
-Please install Python or specify the PYTHON_EXECUTABLE CMake variable.")
-endif()
-
-if( ${PYTHON_VERSION_STRING} VERSION_LESS 2.7 )
-  message(FATAL_ERROR "Python 2.7 or newer is required")
-endif()
-
 # All options referred to from HandleLLVMOptions have to be specified
 # BEFORE this include, otherwise options will not be correctly set on
 # first cmake run
@@ -611,6 +595,22 @@ message(STATUS "LLVM default target triple: ${LLVM_DEFAULT_TARGET_TRIPLE}")
 
 include(HandleLLVMOptions)
 
+# Verify that we can find a Python 2 interpreter.  Python 3 is unsupported.
+# FIXME: We should support systems with only Python 3, but that requires work
+# on LLDB.
+set(Python_ADDITIONAL_VERSIONS 2.7)
+include(FindPythonInterp)
+if( NOT PYTHONINTERP_FOUND )
+  message(FATAL_ERROR
+"Unable to find Python interpreter, required for builds and testing.
+
+Please install Python or specify the PYTHON_EXECUTABLE CMake variable.")
+endif()
+
+if( ${PYTHON_VERSION_STRING} VERSION_LESS 2.7 )
+  message(FATAL_ERROR "Python 2.7 or newer is required")
+endif()
+
 ######
 # LLVMBuild Integration
 #
-- 
GitLab


From 8319d448af67c7be68b4ada5ebc842519d7b8636 Mon Sep 17 00:00:00 2001
From: Thomas Lively <tlively@google.com>
Date: Thu, 8 Nov 2018 02:35:28 +0000
Subject: [PATCH 1111/1116] [WebAssembly] Add V128 to
 WebAssemblyInstrInfo::copyPhysReg

Reviewers: aheejin, dschuff

Subscribers: sbc100, jgravelle-google, sunfish, llvm-commits

Differential Revision: https://reviews.llvm.org/D53872

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346384 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../WebAssembly/WebAssemblyInstrInfo.cpp      |  2 +
 test/CodeGen/WebAssembly/regcopy.mir          | 80 +++++++++++++++++++
 2 files changed, 82 insertions(+)
 create mode 100644 test/CodeGen/WebAssembly/regcopy.mir

diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
index a1e0516a53b..5efff32d616 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
@@ -70,6 +70,8 @@ void WebAssemblyInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     CopyOpcode = WebAssembly::COPY_F32;
   else if (RC == &WebAssembly::F64RegClass)
     CopyOpcode = WebAssembly::COPY_F64;
+  else if (RC == &WebAssembly::V128RegClass)
+    CopyOpcode = WebAssembly::COPY_V128;
   else
     llvm_unreachable("Unexpected register class");
 
diff --git a/test/CodeGen/WebAssembly/regcopy.mir b/test/CodeGen/WebAssembly/regcopy.mir
new file mode 100644
index 00000000000..5115cde6d24
--- /dev/null
+++ b/test/CodeGen/WebAssembly/regcopy.mir
@@ -0,0 +1,80 @@
+# RUN: llc %s -o - -run-pass=postrapseudos | FileCheck %s
+--- |
+  target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+  target triple = "wasm32-unknown-unknown"
+
+  define void @copy_i32() {
+    ret void
+  }
+
+  define void @copy_i64() {
+    ret void
+  }
+
+  define void @copy_f32() {
+    ret void
+  }
+
+  define void @copy_f64() {
+    ret void
+  }
+
+  define void @copy_v128() {
+    ret void
+  }
+...
+---
+name: copy_i32
+# CHECK-LABEL: copy_i32
+body:             |
+  ; CHECK-LABEL: bb.0:
+  ; CHECK-NEXT: %0:i32 = COPY_I32 %1:i32
+  ; CHECK-NEXT: RETURN_VOID
+  bb.0:
+    %0:i32 = COPY %1:i32
+    RETURN_VOID implicit-def $arguments
+...
+---
+name: copy_i64
+# CHECK-LABEL: copy_i64
+body:             |
+  ; CHECK-LABEL: bb.0:
+  ; CHECK-NEXT: %0:i64 = COPY_I64 %1:i64
+  ; CHECK-NEXT: RETURN_VOID
+  bb.0:
+    %0:i64 = COPY %1:i64
+    RETURN_VOID implicit-def $arguments
+...
+---
+name: copy_f32
+# CHECK-LABEL: copy_f32
+body:             |
+  ; CHECK-LABEL: bb.0:
+  ; CHECK-NEXT: %0:f32 = COPY_F32 %1:f32
+  ; CHECK-NEXT: RETURN_VOID
+  bb.0:
+    %0:f32 = COPY %1:f32
+    RETURN_VOID implicit-def $arguments
+...
+---
+name: copy_f64
+# CHECK-LABEL: copy_f64
+body:             |
+  ; CHECK-LABEL: bb.0:
+  ; CHECK-NEXT: %0:f64 = COPY_F64 %1:f64
+  ; CHECK-NEXT: RETURN_VOID
+  bb.0:
+    %0:f64 = COPY %1:f64
+    RETURN_VOID implicit-def $arguments
+...
+---
+name: copy_v128
+# CHECK-LABEL: copy_v128
+body:             |
+  ; CHECK-LABEL: bb.0:
+  ; CHECK-NEXT: %0:v128 = COPY_V128 %1:v128
+  ; CHECK-NEXT: RETURN_VOID
+  bb.0:
+    %0:v128 = COPY %1:v128
+    RETURN_VOID implicit-def $arguments
+...
-- 
GitLab


From 17045f041878224f62685b0bdba357e16d2e13bc Mon Sep 17 00:00:00 2001
From: whitequark <whitequark@whitequark.org>
Date: Thu, 8 Nov 2018 03:57:55 +0000
Subject: [PATCH 1112/1116] [MergeFuncs] Call removeUsers() prior to
 unnamed_addr RAUW

Summary:
For unnamed_addr functions we RAUW instead of only replacing direct callers. However, functions in which replacements were performed currently are not added back to the worklist, resulting in missed merging opportunities.

Fix this by calling removeUsers() prior to RAUW.

Reviewers: jfb, whitequark

Reviewed By: whitequark

Subscribers: rkruppe, llvm-commits

Differential Revision: https://reviews.llvm.org/D53262

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346385 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/IPO/MergeFunctions.cpp         |  1 +
 .../MergeFunc/unnamed-addr-reprocessing.ll    | 35 +++++++++++++++++++
 2 files changed, 36 insertions(+)
 create mode 100644 test/Transforms/MergeFunc/unnamed-addr-reprocessing.ll

diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp
index e8056e6cc61..49b3eacbaff 100644
--- a/lib/Transforms/IPO/MergeFunctions.cpp
+++ b/lib/Transforms/IPO/MergeFunctions.cpp
@@ -770,6 +770,7 @@ void MergeFunctions::mergeTwoFunctions(Function *F, Function *G) {
         GlobalNumbers.erase(G);
         // If G's address is not significant, replace it entirely.
         Constant *BitcastF = ConstantExpr::getBitCast(F, G->getType());
+        removeUsers(G);
         G->replaceAllUsesWith(BitcastF);
       } else {
         // Redirect direct callers of G to F. (See note on MergeFunctionsPDI
diff --git a/test/Transforms/MergeFunc/unnamed-addr-reprocessing.ll b/test/Transforms/MergeFunc/unnamed-addr-reprocessing.ll
new file mode 100644
index 00000000000..5902edc0e88
--- /dev/null
+++ b/test/Transforms/MergeFunc/unnamed-addr-reprocessing.ll
@@ -0,0 +1,35 @@
+; RUN: opt -S -mergefunc < %s | FileCheck %s
+
+; After test3 and test4 have been merged, we should detect that
+; test1 and test2 can also be merged.
+
+; CHECK: define void @test4() unnamed_addr
+; CHECK-NEXT: tail call void @test3()
+; CHECK: define void @test2() unnamed_addr
+; CHECK-NEXT: tail call void @test1()
+
+declare void @dummy()
+  
+define void @test1() unnamed_addr {
+    call void @test3()
+    call void @test3()
+    ret void
+}
+
+define void @test2() unnamed_addr {
+    call void @test4()
+    call void @test4()
+    ret void
+}
+
+define void @test3() unnamed_addr {
+    call void @dummy()
+    call void @dummy()
+    ret void
+}
+
+define void @test4() unnamed_addr {
+    call void @dummy()
+    call void @dummy()
+    ret void
+}
-- 
GitLab


From 19a9621221e759cada6a36f1f8b26840e5f23037 Mon Sep 17 00:00:00 2001
From: whitequark <whitequark@whitequark.org>
Date: Thu, 8 Nov 2018 03:58:01 +0000
Subject: [PATCH 1113/1116] [MergeFuncs] Improve ordering of equal functions

Summary:
MergeFunctions currently tries to process strong functions before
weak functions, because weak functions can simply call strong
functions, while a strong/weak function cannot call a weak function
(a backing strong function is needed).

This patch additionally tries to process external functions before
local functions, because we definitely have to keep the external
function, but may be able to drop the local one (and definitely
can if it is also unnamed_addr).

Unfortunately, this exposes an existing bug in the implementation:
The FnTree and FNodesInTree structures can currently go out of
sync in the case where two weak functions are merged, because the
function in FnTree/FNodesInTree is RAUWed. This leaves it behind in
FnTree (this is intended, as it is the strong backing function which
should be used for further merges), while it is replaced in
FNodesInTree (this is not intended).

This is fixed by switching FNodesInTree from using a ValueMap to
using a DenseMap of AssertingVH.

This exposes another minor issue: Currently FNodesInTree is not
cleared after MergeFunctions finishes running. Currently, this is
potentially dangerous (e.g. if something else wants to RAUW a function
with a non-function), but at the very least it is unnecessary/inefficient.
After the change to use AssertingVH it becomes more problematic,
because there are certainly passes that remove functions.

This issue is fixed by clearing FNodesInTree at the end of the pass.

Reviewers: jfb, whitequark

Reviewed By: whitequark

Subscribers: rkruppe, llvm-commits

Differential Revision: https://reviews.llvm.org/D53271

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346386 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/IPO/MergeFunctions.cpp         | 30 +++++++---
 .../MergeFunc/external-before-local.ll        | 55 +++++++++++++++++++
 2 files changed, 76 insertions(+), 9 deletions(-)
 create mode 100644 test/Transforms/MergeFunc/external-before-local.ll

diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp
index 49b3eacbaff..4c51cd131a1 100644
--- a/lib/Transforms/IPO/MergeFunctions.cpp
+++ b/lib/Transforms/IPO/MergeFunctions.cpp
@@ -284,7 +284,7 @@ private:
   // modified, i.e. in insert(), remove(), and replaceFunctionInTree(), to avoid
   // dangling iterators into FnTree. The invariant that preserves this is that
   // there is exactly one mapping F -> FN for each FunctionNode FN in FnTree.
-  ValueMap<Function*, FnTreeType::iterator> FNodesInTree;
+  DenseMap<AssertingVH<Function>, FnTreeType::iterator> FNodesInTree;
 };
 
 } // end anonymous namespace
@@ -425,6 +425,7 @@ bool MergeFunctions::runOnModule(Module &M) {
   } while (!Deferred.empty());
 
   FnTree.clear();
+  FNodesInTree.clear();
   GlobalNumbers.clear();
 
   return Changed;
@@ -817,6 +818,24 @@ void MergeFunctions::replaceFunctionInTree(const FunctionNode &FN,
   FN.replaceBy(G);
 }
 
+// Ordering for functions that are equal under FunctionComparator
+static bool isFuncOrderCorrect(const Function *F, const Function *G) {
+  if (F->isInterposable() != G->isInterposable()) {
+    // Strong before weak, because the weak function may call the strong
+    // one, but not the other way around.
+    return !F->isInterposable();
+  }
+  if (F->hasLocalLinkage() != G->hasLocalLinkage()) {
+    // External before local, because we definitely have to keep the external
+    // function, but may be able to drop the local one.
+    return !F->hasLocalLinkage();
+  }
+  // Impose a total order (by name) on the replacement of functions. This is
+  // important when operating on more than one module independently to prevent
+  // cycles of thunks calling each other when the modules are linked together.
+  return F->getName() <= G->getName();
+}
+
 // Insert a ComparableFunction into the FnTree, or merge it away if equal to one
 // that was already inserted.
 bool MergeFunctions::insert(Function *NewFunction) {
@@ -833,14 +852,7 @@ bool MergeFunctions::insert(Function *NewFunction) {
 
   const FunctionNode &OldF = *Result.first;
 
-  // Impose a total order (by name) on the replacement of functions. This is
-  // important when operating on more than one module independently to prevent
-  // cycles of thunks calling each other when the modules are linked together.
-  //
-  // First of all, we process strong functions before weak functions.
-  if ((OldF.getFunc()->isInterposable() && !NewFunction->isInterposable()) ||
-     (OldF.getFunc()->isInterposable() == NewFunction->isInterposable() &&
-       OldF.getFunc()->getName() > NewFunction->getName())) {
+  if (!isFuncOrderCorrect(OldF.getFunc(), NewFunction)) {
     // Swap the two functions.
     Function *F = OldF.getFunc();
     replaceFunctionInTree(*Result.first, NewFunction);
diff --git a/test/Transforms/MergeFunc/external-before-local.ll b/test/Transforms/MergeFunc/external-before-local.ll
new file mode 100644
index 00000000000..7dcdb0153df
--- /dev/null
+++ b/test/Transforms/MergeFunc/external-before-local.ll
@@ -0,0 +1,55 @@
+; RUN: opt -S -mergefunc < %s | FileCheck %s
+
+; We should normalize to test2 rather than test1,
+; because it allows us to drop test1 entirely
+
+; CHECK-NOT: define internal void @test1() unnamed_addr
+; CHECK: define void @test3() unnamed_addr
+; CHECK-NEXT: call void @test2()
+; CHECK-NEXT: call void @test2()
+  
+declare void @dummy()
+
+define internal void @test1() unnamed_addr {
+    call void @dummy()
+    call void @dummy()
+    ret void
+}
+
+define void @test2() unnamed_addr {
+    call void @dummy()
+    call void @dummy()
+    ret void
+}
+
+define void @test3() unnamed_addr {
+    call void @test1()
+    call void @test2()
+    ret void
+}
+
+; We should normalize to the existing test6 rather than
+; to a new anonymous strong backing function
+
+; CHECK: define weak void @test5()
+; CHECK-NEXT: tail call void @test6()
+; CHECK: define weak void @test4()
+; CHECK-NEXT: tail call void @test6()
+
+declare void @dummy2()
+  
+define weak void @test4() {
+    call void @dummy2()
+    call void @dummy2()
+    ret void
+}
+define weak void @test5() {
+    call void @dummy2()
+    call void @dummy2()
+    ret void
+}
+define void @test6() {
+    call void @dummy2()
+    call void @dummy2()
+    ret void
+}
-- 
GitLab


From 9ac4c44c949de9dbe4dc01b1d52885cc066c073b Mon Sep 17 00:00:00 2001
From: whitequark <whitequark@whitequark.org>
Date: Thu, 8 Nov 2018 04:00:18 +0000
Subject: [PATCH 1114/1116] [OCaml] Fix incorrect use of CAMLlocal in nested
 blocks

Summary:
The OCaml manual states:

> Local variables of type value must be declared with one of the
> CAMLlocal macros. [...] These macros must be used at the beginning
> of the function, not in a nested block.

This patch moves several instances of CAMLlocal macros from nested
blocks to the function beginning.

Reviewers: whitequark

Reviewed By: whitequark

Subscribers: CodaFi, llvm-commits

Differential Revision: https://reviews.llvm.org/D53841

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346387 91177308-0d34-0410-b5e6-96231b3b80d8
---
 bindings/ocaml/llvm/llvm_ocaml.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/bindings/ocaml/llvm/llvm_ocaml.c b/bindings/ocaml/llvm/llvm_ocaml.c
index c637941d81d..cdf6c6a1206 100644
--- a/bindings/ocaml/llvm/llvm_ocaml.c
+++ b/bindings/ocaml/llvm/llvm_ocaml.c
@@ -483,9 +483,9 @@ CAMLprim value llvm_struct_set_body(LLVMTypeRef Ty,
 CAMLprim value llvm_struct_name(LLVMTypeRef Ty)
 {
   CAMLparam0();
+  CAMLlocal1(result);
   const char *C = LLVMGetStructName(Ty);
   if (C) {
-    CAMLlocal1(result);
     result = caml_alloc_small(1, 0);
     Store_field(result, 0, caml_copy_string(C));
     CAMLreturn(result);
@@ -636,6 +636,7 @@ enum ValueKind {
 
 CAMLprim value llvm_classify_value(LLVMValueRef Val) {
   CAMLparam0();
+  CAMLlocal1(result);
   if (!Val)
     CAMLreturn(Val_int(NullValue));
   if (LLVMIsAConstant(Val)) {
@@ -652,7 +653,6 @@ CAMLprim value llvm_classify_value(LLVMValueRef Val) {
     DEFINE_CASE(Val, ConstantVector);
   }
   if (LLVMIsAInstruction(Val)) {
-    CAMLlocal1(result);
     result = caml_alloc_small(1, 0);
     Store_field(result, 0, Val_int(LLVMGetInstructionOpcode(Val)));
     CAMLreturn(result);
@@ -822,12 +822,11 @@ CAMLprim LLVMValueRef llvm_mdnull(LLVMContextRef C) {
 /* llvalue -> string option */
 CAMLprim value llvm_get_mdstring(LLVMValueRef V) {
   CAMLparam0();
+  CAMLlocal2(Option, Str);
   const char *S;
   unsigned Len;
 
   if ((S = LLVMGetMDString(V, &Len))) {
-    CAMLlocal2(Option, Str);
-
     Str = caml_alloc_string(Len);
     memcpy(String_val(Str), S, Len);
     Option = alloc(1,0);
-- 
GitLab


From 2c000da033ca03bf8ac5aa1c1b8a5fd5bcc5ecf5 Mon Sep 17 00:00:00 2001
From: whitequark <whitequark@whitequark.org>
Date: Thu, 8 Nov 2018 04:04:04 +0000
Subject: [PATCH 1115/1116] [bindings/go] Add Go bindings to LLVMGetIndices

Summary: This instruction is useful for inspecting extractvalue/insertvalue in IR. Unlike most other operations, indices cannot be inspected using the generic Value.Opcode() function so a specialized function needs to be added.

Reviewers: whitequark, pcc

Reviewed By: whitequark

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D53883

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346388 91177308-0d34-0410-b5e6-96231b3b80d8
---
 bindings/go/llvm/ir.go | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/bindings/go/llvm/ir.go b/bindings/go/llvm/ir.go
index cad21814cd7..1872a2ffe51 100644
--- a/bindings/go/llvm/ir.go
+++ b/bindings/go/llvm/ir.go
@@ -1258,6 +1258,19 @@ func InlineAsm(t Type, asmString, constraints string, hasSideEffects, isAlignSta
 	return
 }
 
+// Operations on aggregates
+func (v Value) Indices() []uint32 {
+	num := C.LLVMGetNumIndices(v.C)
+	indicesPtr := C.LLVMGetIndices(v.C)
+	// https://github.com/golang/go/wiki/cgo#turning-c-arrays-into-go-slices
+	rawIndices := (*[1 << 30]C.uint)(unsafe.Pointer(indicesPtr))[:num:num]
+	indices := make([]uint32, num)
+	for i := range indices {
+		indices[i] = uint32(rawIndices[i])
+	}
+	return indices
+}
+
 //-------------------------------------------------------------------------
 // llvm.Builder
 //-------------------------------------------------------------------------
-- 
GitLab


From b4ace5f3454131a3070ef7c11e19e42fc9a80b4e Mon Sep 17 00:00:00 2001
From: Max Kazantsev <max.kazantsev@azul.com>
Date: Thu, 8 Nov 2018 05:07:58 +0000
Subject: [PATCH 1116/1116] [SCEV][NFC] Verify IR in
 isLoop[Entry,Backedge]GuardedByCond

We have a lot of various bugs that are caused by misuse of SCEV (in particular in LV),
all of them can simply be described as "we ask SCEV to prove some fact on invalid IR".
Some of examples of those are PR36311, PR37221, PR39160.

The problem is that these failues manifest differently (what we saw was failure of various
asserts across SCEV, but there can also be miscompiles). This patch adds an assert into two
SCEV methods that strongly rely on correctness of the IR and are involved in known failues.
This will at least allow us to have a clear indication of what was wrong in this case.

This patch also fixes a unit test with incorrect IR that fails this verification.

Differential Revision: https://reviews.llvm.org/D52930
Reviewed By: fhahn


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346389 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Analysis/ScalarEvolution.cpp           | 15 +++++++++++++++
 unittests/Analysis/ScalarEvolutionTest.cpp |  2 +-
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index 77f2467d72d..e5134f2eeda 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -112,6 +112,7 @@
 #include "llvm/IR/Use.h"
 #include "llvm/IR/User.h"
 #include "llvm/IR/Value.h"
+#include "llvm/IR/Verifier.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
@@ -162,6 +163,11 @@ static cl::opt<bool>
                   cl::desc("Verify no dangling value in ScalarEvolution's "
                            "ExprValueMap (slow)"));
 
+static cl::opt<bool> VerifyIR(
+    "scev-verify-ir", cl::Hidden,
+    cl::desc("Verify IR correctness when making sensitive SCEV queries (slow)"),
+    cl::init(false));
+
 static cl::opt<unsigned> MulOpsInlineThreshold(
     "scev-mulops-inline-threshold", cl::Hidden,
     cl::desc("Threshold for inlining multiplication operands into a SCEV"),
@@ -9370,6 +9376,11 @@ ScalarEvolution::isLoopBackedgeGuardedByCond(const Loop *L,
   // (interprocedural conditions notwithstanding).
   if (!L) return true;
 
+  if (VerifyIR)
+    assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()) &&
+           "This cannot be done on broken IR!");
+
+
   if (isKnownViaNonRecursiveReasoning(Pred, LHS, RHS))
     return true;
 
@@ -9475,6 +9486,10 @@ ScalarEvolution::isLoopEntryGuardedByCond(const Loop *L,
   // (interprocedural conditions notwithstanding).
   if (!L) return false;
 
+  if (VerifyIR)
+    assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()) &&
+           "This cannot be done on broken IR!");
+
   // Both LHS and RHS must be available at loop entry.
   assert(isAvailableAtLoopEntry(LHS, L) &&
          "LHS is not available at Loop Entry");
diff --git a/unittests/Analysis/ScalarEvolutionTest.cpp b/unittests/Analysis/ScalarEvolutionTest.cpp
index 98fc44e4923..3da0614bb71 100644
--- a/unittests/Analysis/ScalarEvolutionTest.cpp
+++ b/unittests/Analysis/ScalarEvolutionTest.cpp
@@ -701,7 +701,7 @@ TEST_F(ScalarEvolutionsTest, SCEVZeroExtendExpr) {
     PN->addIncoming(Dec, IncBB);
     BranchInst::Create(CondBB, IncBB);
 
-    Accum = GetElementPtrInst::Create(I8Ty, Accum, Dec, "gep", EndBB);
+    Accum = GetElementPtrInst::Create(I8Ty, Accum, PN, "gep", EndBB);
 
     PrevBB = CondBB;
     CondBB = NextBB;
-- 
GitLab